From cc621e1829f61df96473cd04fb8ba27a65c99e13 Mon Sep 17 00:00:00 2001
From: Jeremy Andrews
Date: Sun, 26 Sep 2021 12:50:43 -0500
Subject: Issue #3003 - Move libaom to libs/

---
 libs/libaom/src/.clang-format | 148 +
 libs/libaom/src/.cmake-format.py | 102 +
 libs/libaom/src/.gitattributes | 18 +
 libs/libaom/src/.mailmap | 91 +
 libs/libaom/src/AUTHORS | 260 +
 libs/libaom/src/CHANGELOG | 51 +
 libs/libaom/src/CMakeLists.txt | 768 ++
 libs/libaom/src/LICENSE | 27 +
 libs/libaom/src/PATENTS | 108 +
 libs/libaom/src/README.md | 665 +
 libs/libaom/src/Sample.cfg | 35 +
 libs/libaom/src/aom/aom.h | 109 +
 libs/libaom/src/aom/aom_codec.h | 478 +
 libs/libaom/src/aom/aom_decoder.h | 257 +
 libs/libaom/src/aom/aom_encoder.h | 1136 ++
 libs/libaom/src/aom/aom_frame_buffer.h | 84 +
 libs/libaom/src/aom/aom_image.h | 430 +
 libs/libaom/src/aom/aom_integer.h | 103 +
 libs/libaom/src/aom/aomcx.h | 1774 +++
 libs/libaom/src/aom/aomdx.h | 397 +
 libs/libaom/src/aom/exports_com | 41 +
 libs/libaom/src/aom/exports_dec | 8 +
 libs/libaom/src/aom/exports_enc | 17 +
 libs/libaom/src/aom/exports_test | 4 +
 libs/libaom/src/aom/internal/aom_codec_internal.h | 381 +
 libs/libaom/src/aom/internal/aom_image_internal.h | 93 +
 libs/libaom/src/aom/src/aom_codec.c | 157 +
 libs/libaom/src/aom/src/aom_decoder.c | 137 +
 libs/libaom/src/aom/src/aom_encoder.c | 302 +
 libs/libaom/src/aom/src/aom_image.c | 395 +
 libs/libaom/src/aom/src/aom_integer.c | 105 +
 libs/libaom/src/aom_dsp/aom_convolve.c | 239 +
 libs/libaom/src/aom_dsp/aom_dsp.cmake | 422 +
 libs/libaom/src/aom_dsp/aom_dsp_common.h | 101 +
 libs/libaom/src/aom_dsp/aom_dsp_rtcd.c | 18 +
 libs/libaom/src/aom_dsp/aom_dsp_rtcd_defs.pl | 1785 +++
 libs/libaom/src/aom_dsp/aom_filter.h | 56 +
 libs/libaom/src/aom_dsp/aom_simd.h | 38 +
 libs/libaom/src/aom_dsp/aom_simd_inline.h | 21 +
 libs/libaom/src/aom_dsp/arm/avg_neon.c | 74 +
 libs/libaom/src/aom_dsp/arm/blend_a64_mask_neon.c | 451 +
 libs/libaom/src/aom_dsp/arm/fwd_txfm_neon.c | 316 +
 libs/libaom/src/aom_dsp/arm/hadamard_neon.c | 183 +
 libs/libaom/src/aom_dsp/arm/intrapred_neon.c | 590 +
 libs/libaom/src/aom_dsp/arm/loopfilter_neon.c | 927 ++
 libs/libaom/src/aom_dsp/arm/sad4d_neon.c | 226 +
 libs/libaom/src/aom_dsp/arm/sad_neon.c | 224 +
 libs/libaom/src/aom_dsp/arm/sse_neon.c | 487 +
 libs/libaom/src/aom_dsp/arm/subpel_variance_neon.c | 131 +
 libs/libaom/src/aom_dsp/arm/subtract_neon.c | 81 +
 libs/libaom/src/aom_dsp/arm/sum_neon.h | 37 +
 libs/libaom/src/aom_dsp/arm/variance_neon.c | 401 +
 libs/libaom/src/aom_dsp/avg.c | 486 +
 libs/libaom/src/aom_dsp/binary_codes_reader.c | 56 +
 libs/libaom/src/aom_dsp/binary_codes_reader.h | 44 +
 libs/libaom/src/aom_dsp/binary_codes_writer.c | 138 +
 libs/libaom/src/aom_dsp/binary_codes_writer.h | 65 +
 libs/libaom/src/aom_dsp/bitreader.c | 41 +
 libs/libaom/src/aom_dsp/bitreader.h | 228 +
 libs/libaom/src/aom_dsp/bitreader_buffer.c | 116 +
 libs/libaom/src/aom_dsp/bitreader_buffer.h | 53 +
 libs/libaom/src/aom_dsp/bitwriter.c | 31 +
 libs/libaom/src/aom_dsp/bitwriter.h | 118 +
 libs/libaom/src/aom_dsp/bitwriter_buffer.c | 141 +
 libs/libaom/src/aom_dsp/bitwriter_buffer.h | 55 +
 libs/libaom/src/aom_dsp/blend.h | 45 +
 libs/libaom/src/aom_dsp/blend_a64_hmask.c | 71 +
 libs/libaom/src/aom_dsp/blend_a64_mask.c | 349 +
 libs/libaom/src/aom_dsp/blend_a64_vmask.c | 73 +
 libs/libaom/src/aom_dsp/blk_sse_sum.c | 26 +
 libs/libaom/src/aom_dsp/entcode.c | 49 +
 libs/libaom/src/aom_dsp/entcode.h | 41 +
 libs/libaom/src/aom_dsp/entdec.c | 247 +
 libs/libaom/src/aom_dsp/entdec.h | 81 +
 libs/libaom/src/aom_dsp/entenc.c | 423 +
 libs/libaom/src/aom_dsp/entenc.h | 85 +
 libs/libaom/src/aom_dsp/fastssim.c | 487 +
 libs/libaom/src/aom_dsp/fft.c | 219 +
 libs/libaom/src/aom_dsp/fft_common.h | 1050 ++
 libs/libaom/src/aom_dsp/fwd_txfm.c | 229 +
 libs/libaom/src/aom_dsp/grain_synthesis.c | 1408 ++
 libs/libaom/src/aom_dsp/grain_synthesis.h | 192 +
 libs/libaom/src/aom_dsp/grain_table.c | 334 +
 libs/libaom/src/aom_dsp/grain_table.h | 102 +
 libs/libaom/src/aom_dsp/intrapred.c | 792 ++
 libs/libaom/src/aom_dsp/intrapred_common.h | 47 +
 libs/libaom/src/aom_dsp/loopfilter.c | 929 ++
 .../src/aom_dsp/mips/aom_convolve8_horiz_msa.c | 693 +
 .../src/aom_dsp/mips/aom_convolve8_vert_msa.c | 699 +
 .../src/aom_dsp/mips/aom_convolve_copy_msa.c | 248 +
 libs/libaom/src/aom_dsp/mips/aom_convolve_msa.h | 79 +
 libs/libaom/src/aom_dsp/mips/common_dspr2.c | 31 +
 libs/libaom/src/aom_dsp/mips/common_dspr2.h | 51 +
 libs/libaom/src/aom_dsp/mips/convolve2_dspr2.c | 1031 ++
 .../src/aom_dsp/mips/convolve2_horiz_dspr2.c | 681 +
 .../libaom/src/aom_dsp/mips/convolve2_vert_dspr2.c | 237 +
 libs/libaom/src/aom_dsp/mips/convolve8_dspr2.c | 222 +
 .../src/aom_dsp/mips/convolve8_horiz_dspr2.c | 879 ++
 .../libaom/src/aom_dsp/mips/convolve8_vert_dspr2.c | 361 +
 .../src/aom_dsp/mips/convolve_common_dspr2.h | 48 +
 libs/libaom/src/aom_dsp/mips/intrapred16_dspr2.c | 327 +
 libs/libaom/src/aom_dsp/mips/intrapred4_dspr2.c | 82 +
 libs/libaom/src/aom_dsp/mips/intrapred8_dspr2.c | 150 +
 libs/libaom/src/aom_dsp/mips/intrapred_msa.c | 550 +
 libs/libaom/src/aom_dsp/mips/loopfilter_16_msa.c | 1488 +++
 libs/libaom/src/aom_dsp/mips/loopfilter_4_msa.c | 147 +
 libs/libaom/src/aom_dsp/mips/loopfilter_8_msa.c | 333 +
 .../src/aom_dsp/mips/loopfilter_filters_dspr2.c | 328 +
 .../src/aom_dsp/mips/loopfilter_filters_dspr2.h | 736 ++
 .../src/aom_dsp/mips/loopfilter_macros_dspr2.h | 437 +
 .../src/aom_dsp/mips/loopfilter_masks_dspr2.h | 357 +
 libs/libaom/src/aom_dsp/mips/loopfilter_mb_dspr2.c | 590 +
 .../src/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c | 734 ++
 .../src/aom_dsp/mips/loopfilter_mb_vert_dspr2.c | 758 ++
 libs/libaom/src/aom_dsp/mips/loopfilter_msa.h | 251 +
 libs/libaom/src/aom_dsp/mips/macros_msa.h | 2058 +++
 libs/libaom/src/aom_dsp/mips/sad_msa.c | 800 ++
 .../src/aom_dsp/mips/sub_pixel_variance_msa.c | 1792 +++
 libs/libaom/src/aom_dsp/mips/subtract_msa.c | 266 +
 libs/libaom/src/aom_dsp/mips/variance_msa.c | 633 +
 libs/libaom/src/aom_dsp/noise_model.c | 1654 +++
 libs/libaom/src/aom_dsp/noise_model.h | 323 +
 libs/libaom/src/aom_dsp/noise_util.c | 223 +
 libs/libaom/src/aom_dsp/noise_util.h | 68 +
 libs/libaom/src/aom_dsp/prob.h | 670 +
 libs/libaom/src/aom_dsp/psnr.c | 439 +
 libs/libaom/src/aom_dsp/psnr.h | 93 +
 libs/libaom/src/aom_dsp/psnrhvs.c | 277 +
 libs/libaom/src/aom_dsp/quantize.c | 466 +
 libs/libaom/src/aom_dsp/quantize.h | 124 +
 libs/libaom/src/aom_dsp/recenter.h | 61 +
 libs/libaom/src/aom_dsp/sad.c | 319 +
 libs/libaom/src/aom_dsp/sad_av1.c | 264 +
 libs/libaom/src/aom_dsp/simd/v128_intrinsics.h | 346 +
 libs/libaom/src/aom_dsp/simd/v128_intrinsics_arm.h | 973 ++
 libs/libaom/src/aom_dsp/simd/v128_intrinsics_c.h | 903 ++
 libs/libaom/src/aom_dsp/simd/v128_intrinsics_x86.h | 657 +
 libs/libaom/src/aom_dsp/simd/v256_intrinsics.h | 377 +
 libs/libaom/src/aom_dsp/simd/v256_intrinsics_arm.h | 17 +
 libs/libaom/src/aom_dsp/simd/v256_intrinsics_c.h | 968 ++
 .../libaom/src/aom_dsp/simd/v256_intrinsics_v128.h | 876 ++
 libs/libaom/src/aom_dsp/simd/v256_intrinsics_x86.h | 750 ++
 libs/libaom/src/aom_dsp/simd/v64_intrinsics.h | 234 +
 libs/libaom/src/aom_dsp/simd/v64_intrinsics_arm.h | 684 +
 libs/libaom/src/aom_dsp/simd/v64_intrinsics_c.h | 982 ++
 libs/libaom/src/aom_dsp/simd/v64_intrinsics_x86.h | 491 +
 libs/libaom/src/aom_dsp/sse.c | 54 +
 libs/libaom/src/aom_dsp/ssim.c | 441 +
 libs/libaom/src/aom_dsp/ssim.h | 87 +
 libs/libaom/src/aom_dsp/subtract.c | 55 +
 libs/libaom/src/aom_dsp/sum_squares.c | 73 +
 libs/libaom/src/aom_dsp/txfm_common.h | 91 +
 libs/libaom/src/aom_dsp/variance.c | 1483 +++
 libs/libaom/src/aom_dsp/variance.h | 129 +
 libs/libaom/src/aom_dsp/vmaf.c | 159 +
 libs/libaom/src/aom_dsp/vmaf.h | 27 +
 .../src/aom_dsp/x86/adaptive_quantize_avx2.c | 244 +
 .../src/aom_dsp/x86/adaptive_quantize_sse2.c | 633 +
 libs/libaom/src/aom_dsp/x86/aom_asm_stubs.c | 95 +
 .../src/aom_dsp/x86/aom_convolve_copy_sse2.asm | 297 +
 .../src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm | 613 +
 .../x86/aom_high_subpixel_bilinear_sse2.asm | 367 +
 .../src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c | 1441 +++
 .../src/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c | 569 +
 .../src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c | 770 ++
 .../src/aom_dsp/x86/aom_subpixel_8t_sse2.asm | 615 +
 .../src/aom_dsp/x86/aom_subpixel_8t_ssse3.asm | 870 ++
 .../src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm | 295 +
 .../aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm | 267 +
 libs/libaom/src/aom_dsp/x86/avg_intrin_avx2.c | 504 +
 libs/libaom/src/aom_dsp/x86/avg_intrin_sse2.c | 512 +
 .../src/aom_dsp/x86/bitdepth_conversion_avx2.h | 32 +
 .../src/aom_dsp/x86/bitdepth_conversion_sse2.h | 35 +
 libs/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c | 36 +
 libs/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c | 1374 ++
 libs/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c | 1560 +++
 libs/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c | 285 +
 libs/libaom/src/aom_dsp/x86/blend_mask_sse4.h | 237 +
 libs/libaom/src/aom_dsp/x86/blend_sse4.h | 191 +
 libs/libaom/src/aom_dsp/x86/blk_sse_sum_avx2.c | 185 +
 libs/libaom/src/aom_dsp/x86/blk_sse_sum_sse2.c | 138 +
 libs/libaom/src/aom_dsp/x86/common_avx2.h | 147 +
 libs/libaom/src/aom_dsp/x86/convolve.h | 203 +
 libs/libaom/src/aom_dsp/x86/convolve_avx2.h | 463 +
 .../src/aom_dsp/x86/convolve_common_intrin.h | 31 +
 libs/libaom/src/aom_dsp/x86/convolve_sse2.h | 121 +
 libs/libaom/src/aom_dsp/x86/convolve_sse4_1.h | 53 +
 libs/libaom/src/aom_dsp/x86/fft_avx2.c | 74 +
 libs/libaom/src/aom_dsp/x86/fft_sse2.c | 167 +
 libs/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h | 544 +
 libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c | 39 +
 libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h | 160 +
 .../src/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm | 379 +
 .../aom_dsp/x86/highbd_adaptive_quantize_avx2.c | 457 +
 .../aom_dsp/x86/highbd_adaptive_quantize_sse2.c | 732 ++
 libs/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c | 1323 ++
 libs/libaom/src/aom_dsp/x86/highbd_convolve_sse2.c | 351 +
 .../libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c | 251 +
 .../src/aom_dsp/x86/highbd_intrapred_asm_sse2.asm | 259 +
 .../libaom/src/aom_dsp/x86/highbd_intrapred_sse2.c | 984 ++
 .../src/aom_dsp/x86/highbd_loopfilter_avx2.c | 66 +
 .../src/aom_dsp/x86/highbd_loopfilter_sse2.c | 1698 +++
 .../src/aom_dsp/x86/highbd_quantize_intrin_avx2.c | 160 +
 .../src/aom_dsp/x86/highbd_quantize_intrin_sse2.c | 206 +
 libs/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm | 296 +
 libs/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm | 442 +
 .../x86/highbd_subpel_variance_impl_sse2.asm | 1024 ++
 libs/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c | 267 +
 libs/libaom/src/aom_dsp/x86/highbd_variance_avx2.c | 140 +
 .../src/aom_dsp/x86/highbd_variance_impl_sse2.asm | 318 +
 libs/libaom/src/aom_dsp/x86/highbd_variance_sse2.c | 842 ++
 libs/libaom/src/aom_dsp/x86/highbd_variance_sse4.c | 216 +
 libs/libaom/src/aom_dsp/x86/intrapred_asm_sse2.asm | 608 +
 libs/libaom/src/aom_dsp/x86/intrapred_avx2.c | 4895 +++++++
 libs/libaom/src/aom_dsp/x86/intrapred_sse2.c | 1411 ++
 libs/libaom/src/aom_dsp/x86/intrapred_ssse3.c | 1695 +++
 libs/libaom/src/aom_dsp/x86/intrapred_x86.h | 38 +
 libs/libaom/src/aom_dsp/x86/inv_wht_sse2.asm | 107 +
 libs/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c | 238 +
 libs/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c | 192 +
 libs/libaom/src/aom_dsp/x86/loopfilter_sse2.c | 2100 +++
 libs/libaom/src/aom_dsp/x86/lpf_common_sse2.h | 495 +
 libs/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c | 266 +
 .../src/aom_dsp/x86/masked_sad_intrin_avx2.c | 389 +
 .../src/aom_dsp/x86/masked_sad_intrin_ssse3.c | 402 +
 .../src/aom_dsp/x86/masked_sad_intrin_ssse3.h | 33 +
 .../src/aom_dsp/x86/masked_variance_intrin_ssse3.c | 1067 ++
 .../src/aom_dsp/x86/masked_variance_intrin_ssse3.h | 92 +
 libs/libaom/src/aom_dsp/x86/mem_sse2.h | 42 +
 libs/libaom/src/aom_dsp/x86/obmc_intrinsic_sse4.h | 58 +
 libs/libaom/src/aom_dsp/x86/obmc_intrinsic_ssse3.h | 54 +
 libs/libaom/src/aom_dsp/x86/obmc_sad_avx2.c | 270 +
 libs/libaom/src/aom_dsp/x86/obmc_sad_sse4.c | 268 +
 libs/libaom/src/aom_dsp/x86/obmc_variance_avx2.c | 190 +
 libs/libaom/src/aom_dsp/x86/obmc_variance_sse4.c | 381 +
 .../libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm | 464 +
 libs/libaom/src/aom_dsp/x86/quantize_sse2.c | 125 +
 libs/libaom/src/aom_dsp/x86/quantize_ssse3.c | 192 +
 .../src/aom_dsp/x86/quantize_ssse3_x86_64.asm | 302 +
 libs/libaom/src/aom_dsp/x86/quantize_x86.h | 202 +
 libs/libaom/src/aom_dsp/x86/sad4d_avx2.c | 106 +
 libs/libaom/src/aom_dsp/x86/sad4d_sse2.asm | 428 +
 libs/libaom/src/aom_dsp/x86/sad_avx2.c | 189 +
 libs/libaom/src/aom_dsp/x86/sad_highbd_avx2.c | 699 +
 libs/libaom/src/aom_dsp/x86/sad_impl_avx2.c | 159 +
 libs/libaom/src/aom_dsp/x86/sad_sse2.asm | 353 +
 libs/libaom/src/aom_dsp/x86/sse_avx2.c | 384 +
 libs/libaom/src/aom_dsp/x86/sse_sse4.c | 353 +
 libs/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm | 222 +
 .../src/aom_dsp/x86/subpel_variance_sse2.asm | 1470 +++
 libs/libaom/src/aom_dsp/x86/subtract_avx2.c | 108 +
 libs/libaom/src/aom_dsp/x86/subtract_sse2.asm | 146 +
 libs/libaom/src/aom_dsp/x86/sum_squares_avx2.c | 248 +
 libs/libaom/src/aom_dsp/x86/sum_squares_sse2.c | 366 +
 libs/libaom/src/aom_dsp/x86/sum_squares_sse2.h | 22 +
 libs/libaom/src/aom_dsp/x86/synonyms.h | 118 +
 libs/libaom/src/aom_dsp/x86/synonyms_avx2.h | 79 +
 libs/libaom/src/aom_dsp/x86/transpose_sse2.h | 420 +
 libs/libaom/src/aom_dsp/x86/txfm_common_avx2.h | 360 +
 libs/libaom/src/aom_dsp/x86/txfm_common_sse2.h | 33 +
 libs/libaom/src/aom_dsp/x86/variance_avx2.c | 526 +
 libs/libaom/src/aom_dsp/x86/variance_impl_avx2.c | 814 ++
 libs/libaom/src/aom_dsp/x86/variance_impl_ssse3.c | 129 +
 libs/libaom/src/aom_dsp/x86/variance_sse2.c | 757 ++
 libs/libaom/src/aom_mem/aom_mem.c | 84 +
 libs/libaom/src/aom_mem/aom_mem.cmake | 29 +
 libs/libaom/src/aom_mem/aom_mem.h | 74 +
 libs/libaom/src/aom_mem/include/aom_mem_intrnl.h | 29 +
 libs/libaom/src/aom_ports/aom_once.h | 104 +
 libs/libaom/src/aom_ports/aom_ports.cmake | 92 +
 libs/libaom/src/aom_ports/aom_timer.h | 111 +
 libs/libaom/src/aom_ports/arm.h | 41 +
 libs/libaom/src/aom_ports/arm_cpudetect.c | 150 +
 libs/libaom/src/aom_ports/bitops.h | 78 +
 libs/libaom/src/aom_ports/emmintrin_compat.h | 56 +
 libs/libaom/src/aom_ports/emms.asm | 41 +
 libs/libaom/src/aom_ports/mem.h | 99 +
 libs/libaom/src/aom_ports/mem_ops.h | 228 +
 libs/libaom/src/aom_ports/mem_ops_aligned.h | 173 +
 libs/libaom/src/aom_ports/msvc.h | 75 +
 libs/libaom/src/aom_ports/ppc.h | 30 +
 libs/libaom/src/aom_ports/ppc_cpudetect.c | 82 +
 libs/libaom/src/aom_ports/sanitizer.h | 38 +
 libs/libaom/src/aom_ports/system_state.h | 23 +
 libs/libaom/src/aom_ports/x86.h | 375 +
 libs/libaom/src/aom_ports/x86_abi_support.asm | 402 +
 libs/libaom/src/aom_scale/aom_scale.cmake | 45 +
 libs/libaom/src/aom_scale/aom_scale.h | 23 +
 libs/libaom/src/aom_scale/aom_scale_rtcd.c | 18 +
 libs/libaom/src/aom_scale/aom_scale_rtcd.pl | 55 +
 libs/libaom/src/aom_scale/generic/aom_scale.c | 506 +
 libs/libaom/src/aom_scale/generic/gen_scalers.c | 201 +
 libs/libaom/src/aom_scale/generic/yv12config.c | 269 +
 libs/libaom/src/aom_scale/generic/yv12extend.c | 477 +
 .../src/aom_scale/mips/dspr2/yv12extend_dspr2.c | 142 +
 libs/libaom/src/aom_scale/yv12config.h | 159 +
 libs/libaom/src/aom_util/aom_thread.c | 212 +
 libs/libaom/src/aom_util/aom_thread.h | 364 +
 libs/libaom/src/aom_util/aom_util.cmake | 31 +
 libs/libaom/src/aom_util/debug_util.c | 279 +
 libs/libaom/src/aom_util/debug_util.h | 69 +
 libs/libaom/src/aom_util/endian_inl.h | 122 +
 libs/libaom/src/apps/aomdec.c | 1024 ++
 libs/libaom/src/apps/aomenc.c | 2752 ++++
 libs/libaom/src/apps/aomenc.h | 65 +
 libs/libaom/src/av1/av1.cmake | 580 +
 libs/libaom/src/av1/av1_cx_iface.c | 2936 +++++
 libs/libaom/src/av1/av1_dx_iface.c | 1397 ++
 libs/libaom/src/av1/av1_iface_common.h | 144 +
 libs/libaom/src/av1/common/alloccommon.c | 309 +
 libs/libaom/src/av1/common/alloccommon.h | 56 +
 libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.c | 4271 ++++++
 libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.h | 154 +
 libs/libaom/src/av1/common/arm/av1_txfm_neon.c | 30 +
 .../src/av1/common/arm/blend_a64_hmask_neon.c | 134 +
 .../src/av1/common/arm/blend_a64_vmask_neon.c | 141 +
 libs/libaom/src/av1/common/arm/cfl_neon.c | 588 +
 libs/libaom/src/av1/common/arm/convolve_neon.c | 1593 +++
 libs/libaom/src/av1/common/arm/convolve_neon.h | 228 +
 libs/libaom/src/av1/common/arm/jnt_convolve_neon.c | 1739 +++
 libs/libaom/src/av1/common/arm/mem_neon.h | 539 +
 libs/libaom/src/av1/common/arm/reconinter_neon.c | 86 +
 libs/libaom/src/av1/common/arm/selfguided_neon.c | 1590 +++
 libs/libaom/src/av1/common/arm/transpose_neon.h | 602 +
 libs/libaom/src/av1/common/arm/warp_plane_neon.c | 714 +
 .../src/av1/common/arm/wiener_convolve_neon.c | 530 +
 libs/libaom/src/av1/common/av1_common_int.h | 1557 +++
 libs/libaom/src/av1/common/av1_inv_txfm1d.c | 1841 +++
 libs/libaom/src/av1/common/av1_inv_txfm1d.h | 61 +
 libs/libaom/src/av1/common/av1_inv_txfm1d_cfg.h | 47 +
 libs/libaom/src/av1/common/av1_inv_txfm2d.c | 504 +
 libs/libaom/src/av1/common/av1_loopfilter.c | 790 ++
 libs/libaom/src/av1/common/av1_loopfilter.h | 208 +
 libs/libaom/src/av1/common/av1_rtcd.c | 22 +
 libs/libaom/src/av1/common/av1_rtcd_defs.pl | 496 +
 libs/libaom/src/av1/common/av1_txfm.c | 161 +
 libs/libaom/src/av1/common/av1_txfm.h | 234 +
 libs/libaom/src/av1/common/blockd.c | 102 +
 libs/libaom/src/av1/common/blockd.h | 1296 ++
 libs/libaom/src/av1/common/cdef.c | 388 +
 libs/libaom/src/av1/common/cdef.h | 52 +
 libs/libaom/src/av1/common/cdef_block.c | 253 +
 libs/libaom/src/av1/common/cdef_block.h | 58 +
 libs/libaom/src/av1/common/cdef_block_avx2.c | 14 +
 libs/libaom/src/av1/common/cdef_block_neon.c | 14 +
 libs/libaom/src/av1/common/cdef_block_simd.h | 915 ++
 libs/libaom/src/av1/common/cdef_block_sse2.c | 14 +
 libs/libaom/src/av1/common/cdef_block_sse4.c | 14 +
 libs/libaom/src/av1/common/cdef_block_ssse3.c | 14 +
 libs/libaom/src/av1/common/cfl.c | 436 +
 libs/libaom/src/av1/common/cfl.h | 288 +
 libs/libaom/src/av1/common/common.h | 63 +
 libs/libaom/src/av1/common/common_data.h | 446 +
 libs/libaom/src/av1/common/convolve.c | 1274 ++
 libs/libaom/src/av1/common/convolve.h | 136 +
 libs/libaom/src/av1/common/debugmodes.c | 113 +
 libs/libaom/src/av1/common/entropy.c | 179 +
 libs/libaom/src/av1/common/entropy.h | 181 +
 libs/libaom/src/av1/common/entropymode.c | 1103 ++
 libs/libaom/src/av1/common/entropymode.h | 212 +
 libs/libaom/src/av1/common/entropymv.c | 67 +
 libs/libaom/src/av1/common/entropymv.h | 104 +
 libs/libaom/src/av1/common/enums.h | 678 +
 libs/libaom/src/av1/common/filter.h | 279 +
 libs/libaom/src/av1/common/frame_buffers.c | 98 +
 libs/libaom/src/av1/common/frame_buffers.h | 60 +
 libs/libaom/src/av1/common/idct.c | 322 +
 libs/libaom/src/av1/common/idct.h | 51 +
 libs/libaom/src/av1/common/loopfiltermask.c | 1458 +++
 libs/libaom/src/av1/common/mv.h | 354 +
 libs/libaom/src/av1/common/mvref_common.c | 1511 +++
 libs/libaom/src/av1/common/mvref_common.h | 341 +
 libs/libaom/src/av1/common/obmc.h | 89 +
 libs/libaom/src/av1/common/obu_util.c | 154 +
 libs/libaom/src/av1/common/obu_util.h | 47 +
 libs/libaom/src/av1/common/odintrin.c | 541 +
 libs/libaom/src/av1/common/odintrin.h | 96 +
 libs/libaom/src/av1/common/ppc/cfl_ppc.c | 152 +
 libs/libaom/src/av1/common/pred_common.c | 501 +
 libs/libaom/src/av1/common/pred_common.h | 374 +
 libs/libaom/src/av1/common/quant_common.c | 12875 +++++++++++++++++++
 libs/libaom/src/av1/common/quant_common.h | 83 +
 libs/libaom/src/av1/common/reconinter.c | 1426 ++
 libs/libaom/src/av1/common/reconinter.h | 414 +
 libs/libaom/src/av1/common/reconintra.c | 1704 +++
 libs/libaom/src/av1/common/reconintra.h | 151 +
 libs/libaom/src/av1/common/resize.c | 1455 +++
 libs/libaom/src/av1/common/resize.h | 115 +
 libs/libaom/src/av1/common/restoration.c | 1566 +++
 libs/libaom/src/av1/common/restoration.h | 380 +
 libs/libaom/src/av1/common/scale.c | 128 +
 libs/libaom/src/av1/common/scale.h | 69 +
 libs/libaom/src/av1/common/scan.c | 2048 +++
 libs/libaom/src/av1/common/scan.h | 55 +
 libs/libaom/src/av1/common/seg_common.c | 91 +
 libs/libaom/src/av1/common/seg_common.h | 104 +
 libs/libaom/src/av1/common/thread_common.c | 930 ++
 libs/libaom/src/av1/common/thread_common.h | 122 +
 libs/libaom/src/av1/common/tile_common.c | 239 +
 libs/libaom/src/av1/common/tile_common.h | 75 +
 libs/libaom/src/av1/common/timing.c | 92 +
 libs/libaom/src/av1/common/timing.h | 55 +
 libs/libaom/src/av1/common/token_cdfs.h | 3555 +++++
 libs/libaom/src/av1/common/txb_common.c | 458 +
 libs/libaom/src/av1/common/txb_common.h | 442 +
 libs/libaom/src/av1/common/warped_motion.c | 1073 ++
 libs/libaom/src/av1/common/warped_motion.h | 186 +
 .../av1/common/x86/av1_convolve_horiz_rs_sse4.c | 228 +
 .../src/av1/common/x86/av1_convolve_scale_sse4.c | 498 +
 libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c | 1949 +++
 libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.h | 71 +
 .../libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c | 2956 +++++
 .../libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h | 232 +
 libs/libaom/src/av1/common/x86/av1_txfm_sse2.h | 317 +
 libs/libaom/src/av1/common/x86/av1_txfm_sse4.c | 21 +
 libs/libaom/src/av1/common/x86/av1_txfm_sse4.h | 72 +
 libs/libaom/src/av1/common/x86/cfl_avx2.c | 495 +
 libs/libaom/src/av1/common/x86/cfl_simd.h | 246 +
 libs/libaom/src/av1/common/x86/cfl_sse2.c | 89 +
 libs/libaom/src/av1/common/x86/cfl_ssse3.c | 397 +
 libs/libaom/src/av1/common/x86/convolve_2d_avx2.c | 317 +
 libs/libaom/src/av1/common/x86/convolve_2d_sse2.c | 471 +
 libs/libaom/src/av1/common/x86/convolve_avx2.c | 439 +
 libs/libaom/src/av1/common/x86/convolve_sse2.c | 338 +
 libs/libaom/src/av1/common/x86/filterintra_sse4.c | 71 +
 .../src/av1/common/x86/highbd_convolve_2d_avx2.c | 326 +
 .../src/av1/common/x86/highbd_convolve_2d_sse2.c | 191 +
 .../src/av1/common/x86/highbd_convolve_2d_sse4.c | 425 +
 .../src/av1/common/x86/highbd_convolve_2d_ssse3.c | 217 +
 .../src/av1/common/x86/highbd_inv_txfm_avx2.c | 4246 ++++++
 .../src/av1/common/x86/highbd_inv_txfm_sse4.c | 5821 +++++++++
 .../src/av1/common/x86/highbd_jnt_convolve_avx2.c | 859 ++
 .../src/av1/common/x86/highbd_jnt_convolve_sse4.c | 387 +
 .../src/av1/common/x86/highbd_txfm_utility_sse4.h | 132 +
 .../src/av1/common/x86/highbd_warp_plane_sse4.c | 632 +
 .../av1/common/x86/highbd_wiener_convolve_avx2.c | 245 +
 .../av1/common/x86/highbd_wiener_convolve_ssse3.c | 202 +
 libs/libaom/src/av1/common/x86/intra_edge_sse4.c | 318 +
 libs/libaom/src/av1/common/x86/jnt_convolve_avx2.c | 917 ++
 libs/libaom/src/av1/common/x86/jnt_convolve_sse2.c | 615 +
 .../libaom/src/av1/common/x86/jnt_convolve_ssse3.c | 231 +
 libs/libaom/src/av1/common/x86/reconinter_avx2.c | 620 +
 libs/libaom/src/av1/common/x86/reconinter_sse4.c | 153 +
 libs/libaom/src/av1/common/x86/reconinter_ssse3.c | 116 +
 libs/libaom/src/av1/common/x86/selfguided_avx2.c | 724 ++
 libs/libaom/src/av1/common/x86/selfguided_sse4.c | 662 +
 libs/libaom/src/av1/common/x86/warp_plane_avx2.c | 1318 ++
 libs/libaom/src/av1/common/x86/warp_plane_sse2.c | 88 +
 libs/libaom/src/av1/common/x86/warp_plane_sse4.c | 963 +
 .../src/av1/common/x86/wiener_convolve_avx2.c | 242 +
 .../src/av1/common/x86/wiener_convolve_sse2.c | 199 +
 libs/libaom/src/av1/decoder/accounting.c | 138 +
 libs/libaom/src/av1/decoder/accounting.h | 82 +
 libs/libaom/src/av1/decoder/decodeframe.c | 5326 ++++++++
 libs/libaom/src/av1/decoder/decodeframe.h | 87 +
 libs/libaom/src/av1/decoder/decodemv.c | 1575 +++
 libs/libaom/src/av1/decoder/decodemv.h | 33 +
 libs/libaom/src/av1/decoder/decoder.c | 539 +
 libs/libaom/src/av1/decoder/decoder.h | 331 +
 libs/libaom/src/av1/decoder/decodetxb.c | 379 +
 libs/libaom/src/av1/decoder/decodetxb.h | 32 +
 libs/libaom/src/av1/decoder/detokenize.c | 78 +
 libs/libaom/src/av1/decoder/detokenize.h | 29 +
 libs/libaom/src/av1/decoder/dthread.h | 51 +
 libs/libaom/src/av1/decoder/inspection.c | 154 +
 libs/libaom/src/av1/decoder/inspection.h | 91 +
 libs/libaom/src/av1/decoder/obu.c | 1085 ++
 libs/libaom/src/av1/decoder/obu.h | 31 +
 libs/libaom/src/av1/encoder/aq_complexity.c | 185 +
 libs/libaom/src/av1/encoder/aq_complexity.h | 37 +
 libs/libaom/src/av1/encoder/aq_cyclicrefresh.c | 501 +
 libs/libaom/src/av1/encoder/aq_cyclicrefresh.h | 132 +
 libs/libaom/src/av1/encoder/aq_variance.c | 205 +
 libs/libaom/src/av1/encoder/aq_variance.h | 33 +
 .../src/av1/encoder/arm/neon/av1_error_neon.c | 85 +
 .../src/av1/encoder/arm/neon/quantize_neon.c | 215 +
 libs/libaom/src/av1/encoder/av1_fwd_txfm1d.c | 1885 +++
 libs/libaom/src/av1/encoder/av1_fwd_txfm1d.h | 49 +
 libs/libaom/src/av1/encoder/av1_fwd_txfm1d_cfg.h | 19 +
 libs/libaom/src/av1/encoder/av1_fwd_txfm2d.c | 429 +
 libs/libaom/src/av1/encoder/av1_multi_thread.c | 70 +
 libs/libaom/src/av1/encoder/av1_multi_thread.h | 21 +
 libs/libaom/src/av1/encoder/av1_quantize.c | 789 ++
 libs/libaom/src/av1/encoder/av1_quantize.h | 163 +
 libs/libaom/src/av1/encoder/bitstream.c | 3925 ++++++
 libs/libaom/src/av1/encoder/bitstream.h | 48 +
 libs/libaom/src/av1/encoder/block.h | 575 +
 libs/libaom/src/av1/encoder/blockiness.c | 142 +
 libs/libaom/src/av1/encoder/cnn.c | 1144 ++
 libs/libaom/src/av1/encoder/cnn.h | 197 +
 libs/libaom/src/av1/encoder/compound_type.c | 1508 +++
 libs/libaom/src/av1/encoder/compound_type.h | 48 +
 libs/libaom/src/av1/encoder/context_tree.c | 268 +
 libs/libaom/src/av1/encoder/context_tree.h | 97 +
 libs/libaom/src/av1/encoder/corner_detect.c | 37 +
 libs/libaom/src/av1/encoder/corner_detect.h | 22 +
 libs/libaom/src/av1/encoder/corner_match.c | 194 +
 libs/libaom/src/av1/encoder/corner_match.h | 33 +
 libs/libaom/src/av1/encoder/cost.c | 46 +
 libs/libaom/src/av1/encoder/cost.h | 51 +
 libs/libaom/src/av1/encoder/dwt.c | 155 +
 libs/libaom/src/av1/encoder/dwt.h | 25 +
 libs/libaom/src/av1/encoder/enc_enums.h | 255 +
 libs/libaom/src/av1/encoder/encode_strategy.c | 1322 ++
 libs/libaom/src/av1/encoder/encode_strategy.h | 64 +
 libs/libaom/src/av1/encoder/encodeframe.c | 6475 ++++++++++
 libs/libaom/src/av1/encoder/encodeframe.h | 49 +
 libs/libaom/src/av1/encoder/encodemb.c | 805 ++
 libs/libaom/src/av1/encoder/encodemb.h | 145 +
 libs/libaom/src/av1/encoder/encodemv.c | 270 +
 libs/libaom/src/av1/encoder/encodemv.h | 76 +
 libs/libaom/src/av1/encoder/encoder.c | 7187 +++++++++++
 libs/libaom/src/av1/encoder/encoder.h | 1965 +++
 libs/libaom/src/av1/encoder/encodetxb.c | 2261 ++++
 libs/libaom/src/av1/encoder/encodetxb.h | 101 +
 libs/libaom/src/av1/encoder/ethread.c | 729 ++
 libs/libaom/src/av1/encoder/ethread.h | 54 +
 libs/libaom/src/av1/encoder/extend.c | 151 +
 libs/libaom/src/av1/encoder/extend.h | 29 +
 libs/libaom/src/av1/encoder/firstpass.c | 1065 ++
 libs/libaom/src/av1/encoder/firstpass.h | 196 +
 libs/libaom/src/av1/encoder/global_motion.c | 1014 +
 libs/libaom/src/av1/encoder/global_motion.h | 101 +
 libs/libaom/src/av1/encoder/gop_structure.c | 311 +
 libs/libaom/src/av1/encoder/gop_structure.h | 43 +
 libs/libaom/src/av1/encoder/grain_test_vectors.h | 781 ++
 libs/libaom/src/av1/encoder/hash.c | 125 +
 libs/libaom/src/av1/encoder/hash.h | 53 +
 libs/libaom/src/av1/encoder/hash_motion.c | 491 +
 libs/libaom/src/av1/encoder/hash_motion.h | 101 +
 libs/libaom/src/av1/encoder/hybrid_fwd_txfm.c | 308 +
 libs/libaom/src/av1/encoder/hybrid_fwd_txfm.h | 31 +
 libs/libaom/src/av1/encoder/interp_search.c | 753 ++
 libs/libaom/src/av1/encoder/interp_search.h | 85 +
 libs/libaom/src/av1/encoder/intra_mode_search.c | 2132 +++
 libs/libaom/src/av1/encoder/intra_mode_search.h | 63 +
 libs/libaom/src/av1/encoder/k_means_template.h | 123 +
 libs/libaom/src/av1/encoder/level.c | 1184 ++
 libs/libaom/src/av1/encoder/level.h | 211 +
 libs/libaom/src/av1/encoder/lookahead.c | 205 +
 libs/libaom/src/av1/encoder/lookahead.h | 122 +
 libs/libaom/src/av1/encoder/mathutils.h | 359 +
 libs/libaom/src/av1/encoder/mcomp.c | 3391 +++++
 libs/libaom/src/av1/encoder/mcomp.h | 329 +
 libs/libaom/src/av1/encoder/mips/msa/error_msa.c | 109 +
 libs/libaom/src/av1/encoder/mips/msa/fdct4x4_msa.c | 46 +
 .../src/av1/encoder/mips/msa/temporal_filter_msa.c | 286 +
 libs/libaom/src/av1/encoder/misc_model_weights.h | 696 +
 libs/libaom/src/av1/encoder/ml.c | 156 +
 libs/libaom/src/av1/encoder/ml.h | 82 +
 .../src/av1/encoder/mode_prune_model_weights.h | 185 +
 libs/libaom/src/av1/encoder/model_rd.h | 275 +
 libs/libaom/src/av1/encoder/motion_search_facade.c | 861 ++
 libs/libaom/src/av1/encoder/motion_search_facade.h | 76 +
 libs/libaom/src/av1/encoder/mv_prec.c | 430 +
 libs/libaom/src/av1/encoder/mv_prec.h | 48 +
 libs/libaom/src/av1/encoder/nonrd_pickmode.c | 2182 ++++
 libs/libaom/src/av1/encoder/palette.c | 154 +
 libs/libaom/src/av1/encoder/palette.h | 96 +
 .../libaom/src/av1/encoder/partition_cnn_weights.h | 2139 +++
 .../src/av1/encoder/partition_model_weights.h | 5646 ++++++++
 libs/libaom/src/av1/encoder/partition_strategy.c | 1288 ++
 libs/libaom/src/av1/encoder/partition_strategy.h | 222 +
 libs/libaom/src/av1/encoder/pass2_strategy.c | 2895 +++++
 libs/libaom/src/av1/encoder/pass2_strategy.h | 76 +
 libs/libaom/src/av1/encoder/pickcdef.c | 587 +
 libs/libaom/src/av1/encoder/picklpf.c | 285 +
 libs/libaom/src/av1/encoder/picklpf.h | 30 +
 libs/libaom/src/av1/encoder/pickrst.c | 1768 +++
 libs/libaom/src/av1/encoder/pickrst.h | 66 +
 libs/libaom/src/av1/encoder/pustats.h | 198 +
 libs/libaom/src/av1/encoder/random.h | 29 +
 libs/libaom/src/av1/encoder/ransac.c | 820 ++
 libs/libaom/src/av1/encoder/ransac.h | 31 +
 libs/libaom/src/av1/encoder/ratectrl.c | 2117 +++
 libs/libaom/src/av1/encoder/ratectrl.h | 324 +
 libs/libaom/src/av1/encoder/rd.c | 1332 ++
 libs/libaom/src/av1/encoder/rd.h | 370 +
 libs/libaom/src/av1/encoder/rdopt.c | 5505 ++++++++
 libs/libaom/src/av1/encoder/rdopt.h | 244 +
 libs/libaom/src/av1/encoder/rdopt_data_defs.h | 294 +
 libs/libaom/src/av1/encoder/rdopt_utils.h | 652 +
 libs/libaom/src/av1/encoder/reconinter_enc.c | 407 +
 libs/libaom/src/av1/encoder/reconinter_enc.h | 72 +
 libs/libaom/src/av1/encoder/segmentation.c | 251 +
 libs/libaom/src/av1/encoder/segmentation.h | 38 +
 libs/libaom/src/av1/encoder/speed_features.c | 1322 ++
 libs/libaom/src/av1/encoder/speed_features.h | 1034 ++
 libs/libaom/src/av1/encoder/svc_layercontext.c | 288 +
 libs/libaom/src/av1/encoder/svc_layercontext.h | 99 +
 libs/libaom/src/av1/encoder/temporal_filter.c | 1338 ++
 libs/libaom/src/av1/encoder/temporal_filter.h | 87 +
 libs/libaom/src/av1/encoder/tokenize.c | 242 +
 libs/libaom/src/av1/encoder/tokenize.h | 71 +
 libs/libaom/src/av1/encoder/tpl_model.c | 1189 ++
 libs/libaom/src/av1/encoder/tpl_model.h | 47 +
 libs/libaom/src/av1/encoder/tune_vmaf.c | 794 ++
 libs/libaom/src/av1/encoder/tune_vmaf.h | 32 +
 .../src/av1/encoder/tx_prune_model_weights.h | 3320 +++++
 libs/libaom/src/av1/encoder/tx_search.c | 3602 ++++++
 libs/libaom/src/av1/encoder/tx_search.h | 79 +
 .../src/av1/encoder/use_flat_gop_model_params.h | 233 +
 libs/libaom/src/av1/encoder/var_based_part.c | 1006 ++
 libs/libaom/src/av1/encoder/var_based_part.h | 45 +
 libs/libaom/src/av1/encoder/wedge_utils.c | 125 +
 .../src/av1/encoder/x86/av1_fwd_txfm1d_sse4.c | 1417 ++
 .../src/av1/encoder/x86/av1_fwd_txfm2d_avx2.c | 2814 ++++
 .../src/av1/encoder/x86/av1_fwd_txfm2d_sse4.c | 364 +
 .../libaom/src/av1/encoder/x86/av1_fwd_txfm_avx2.h | 96 +
 .../libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.c | 2891 +++++
 .../libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.h | 119 +
 .../src/av1/encoder/x86/av1_highbd_quantize_avx2.c | 137 +
 .../src/av1/encoder/x86/av1_highbd_quantize_sse4.c | 195 +
 .../libaom/src/av1/encoder/x86/av1_quantize_avx2.c | 445 +
 .../libaom/src/av1/encoder/x86/av1_quantize_sse2.c | 189 +
 .../av1/encoder/x86/av1_quantize_ssse3_x86_64.asm | 204 +
 .../src/av1/encoder/x86/av1_ssim_opt_x86_64.asm | 222 +
 libs/libaom/src/av1/encoder/x86/av1_txfm1d_sse4.h | 144 +
 .../libaom/src/av1/encoder/x86/corner_match_avx2.c | 81 +
 .../libaom/src/av1/encoder/x86/corner_match_sse4.c | 105 +
 libs/libaom/src/av1/encoder/x86/dct_sse2.asm | 82 +
 libs/libaom/src/av1/encoder/x86/encodetxb_avx2.c | 122 +
 libs/libaom/src/av1/encoder/x86/encodetxb_sse2.c | 505 +
 libs/libaom/src/av1/encoder/x86/encodetxb_sse4.c | 84 +
 .../libaom/src/av1/encoder/x86/error_intrin_avx2.c | 141 +
 libs/libaom/src/av1/encoder/x86/error_sse2.asm | 88 +
 libs/libaom/src/av1/encoder/x86/hash_sse42.c | 51 +
 .../encoder/x86/highbd_block_error_intrin_avx2.c | 63 +
 .../encoder/x86/highbd_block_error_intrin_sse2.c | 73 +
 .../src/av1/encoder/x86/highbd_fwd_txfm_avx2.c | 3167 +++++
 .../src/av1/encoder/x86/highbd_fwd_txfm_sse4.c | 2604 ++++
 libs/libaom/src/av1/encoder/x86/ml_sse3.c | 244 +
 libs/libaom/src/av1/encoder/x86/pickrst_avx2.c | 1084 ++
 libs/libaom/src/av1/encoder/x86/pickrst_sse4.c | 833 ++
 libs/libaom/src/av1/encoder/x86/rdopt_avx2.c | 256 +
 libs/libaom/src/av1/encoder/x86/rdopt_sse4.c | 275 +
 .../src/av1/encoder/x86/temporal_filter_avx2.c | 284 +
 .../av1/encoder/x86/temporal_filter_constants.h | 407 +
 .../src/av1/encoder/x86/temporal_filter_sse2.c | 262 +
 .../src/av1/encoder/x86/temporal_filter_sse4.c | 2044 +++
 libs/libaom/src/av1/encoder/x86/wedge_utils_avx2.c | 215 +
 libs/libaom/src/av1/encoder/x86/wedge_utils_sse2.c | 254 +
 libs/libaom/src/av1/exports_com | 2 +
 libs/libaom/src/av1/exports_dec | 3 +
 libs/libaom/src/av1/exports_enc | 2 +
 libs/libaom/src/av1/exports_ident | 2 +
 libs/libaom/src/av1/exports_test | 2 +
 libs/libaom/src/build/.gitattributes | 2 +
 libs/libaom/src/build/cmake/aom_config.c.template | 13 +
 .../src/build/cmake/aom_config_defaults.cmake | 193 +
 libs/libaom/src/build/cmake/aom_configure.cmake | 399 +
 .../src/build/cmake/aom_experiment_deps.cmake | 28 +
 libs/libaom/src/build/cmake/aom_install.cmake | 96 +
 libs/libaom/src/build/cmake/aom_optimization.cmake | 240 +
 libs/libaom/src/build/cmake/compiler_flags.cmake | 373 +
 libs/libaom/src/build/cmake/compiler_tests.cmake | 179 +
 libs/libaom/src/build/cmake/cpu.cmake | 82 +
 libs/libaom/src/build/cmake/dist.cmake | 64 +
 libs/libaom/src/build/cmake/exports.cmake | 74 +
 libs/libaom/src/build/cmake/exports_sources.cmake | 35 +
 .../cmake/generate_aom_config_templates.cmake | 92 +
 libs/libaom/src/build/cmake/generate_exports.cmake | 66 +
 libs/libaom/src/build/cmake/ios-Info.plist | 37 +
 libs/libaom/src/build/cmake/iosbuild.sh | 384 +
 libs/libaom/src/build/cmake/msvc_runtime.cmake | 37 +
 libs/libaom/src/build/cmake/pkg_config.cmake | 62 +
 libs/libaom/src/build/cmake/rtcd.pl | 467 +
 libs/libaom/src/build/cmake/sanitizers.cmake | 46 +
 .../build/cmake/toolchains/arm-ios-common.cmake | 26 +
 .../cmake/toolchains/arm64-android-clang.cmake | 48 +
 .../src/build/cmake/toolchains/arm64-ios.cmake | 23 +
 .../build/cmake/toolchains/arm64-linux-gcc.cmake | 36 +
 .../build/cmake/toolchains/arm64-mingw-gcc.cmake | 29 +
 .../src/build/cmake/toolchains/armv7-ios.cmake | 31 +
 .../build/cmake/toolchains/armv7-linux-gcc.cmake | 40 +
 .../build/cmake/toolchains/armv7-mingw-gcc.cmake | 29 +
 .../src/build/cmake/toolchains/armv7s-ios.cmake | 31 +
 .../cmake/toolchains/ios-simulator-common.cmake | 23 +
 .../build/cmake/toolchains/mips32-linux-gcc.cmake | 77 +
 .../build/cmake/toolchains/mips64-linux-gcc.cmake | 54 +
 .../src/build/cmake/toolchains/ppc-linux-gcc.cmake | 29 +
 .../build/cmake/toolchains/x86-ios-simulator.cmake | 28 +
 .../src/build/cmake/toolchains/x86-linux.cmake | 19 +
 .../src/build/cmake/toolchains/x86-macos.cmake | 18 +
 .../src/build/cmake/toolchains/x86-mingw-gcc.cmake | 28 +
 .../cmake/toolchains/x86_64-ios-simulator.cmake | 25 +
 .../build/cmake/toolchains/x86_64-mingw-gcc.cmake | 26 +
 libs/libaom/src/build/cmake/util.cmake | 172 +
 libs/libaom/src/build/cmake/version.cmake | 65 +
 libs/libaom/src/build/cmake/version.pl | 112 +
 libs/libaom/src/codereview.settings | 4 +
 libs/libaom/src/common/args.c | 343 +
 libs/libaom/src/common/args.h | 71 +
 libs/libaom/src/common/av1_config.c | 511 +
 libs/libaom/src/common/av1_config.h | 86 +
 libs/libaom/src/common/ivfdec.c | 110 +
 libs/libaom/src/common/ivfdec.h | 29 +
 libs/libaom/src/common/ivfenc.c | 52 +
 libs/libaom/src/common/ivfenc.h | 34 +
 libs/libaom/src/common/md5_utils.c | 249 +
 libs/libaom/src/common/md5_utils.h | 49 +
 libs/libaom/src/common/obudec.c | 486 +
 libs/libaom/src/common/obudec.h | 48 +
 libs/libaom/src/common/rawenc.c | 96 +
 libs/libaom/src/common/rawenc.h | 32 +
 libs/libaom/src/common/tools_common.c | 508 +
 libs/libaom/src/common/tools_common.h | 264 +
 libs/libaom/src/common/video_common.h | 25 +
 libs/libaom/src/common/video_reader.c | 127 +
 libs/libaom/src/common/video_reader.h | 60 +
 libs/libaom/src/common/video_writer.c | 83 +
 libs/libaom/src/common/video_writer.h | 47 +
 libs/libaom/src/common/warnings.c | 97 +
 libs/libaom/src/common/warnings.h | 34 +
 libs/libaom/src/common/webmdec.cc | 248 +
 libs/libaom/src/common/webmdec.h | 71 +
 libs/libaom/src/common/webmenc.cc | 173 +
 libs/libaom/src/common/webmenc.h | 60 +
 libs/libaom/src/common/y4menc.c | 107 +
 libs/libaom/src/common/y4menc.h | 39 +
 libs/libaom/src/common/y4minput.c | 1153 ++
 libs/libaom/src/common/y4minput.h | 75 +
 libs/libaom/src/docs.cmake | 257 +
 libs/libaom/src/examples/analyzer.cc | 723 ++
 libs/libaom/src/examples/aom_cx_set_ref.c | 383 +
 libs/libaom/src/examples/av1_dec_fuzzer.cc | 67 +
 libs/libaom/src/examples/build_av1_dec_fuzzer.sh | 70 +
 libs/libaom/src/examples/decode_to_md5.c | 131 +
 libs/libaom/src/examples/decode_with_drops.c | 146 +
 libs/libaom/src/examples/encoder_util.c | 136 +
 libs/libaom/src/examples/encoder_util.h | 33 +
 libs/libaom/src/examples/inspect.c | 958 ++
 .../src/examples/lightfield_bitstream_parsing.c | 414 +
 libs/libaom/src/examples/lightfield_decoder.c | 364 +
 libs/libaom/src/examples/lightfield_encoder.c | 522 +
 .../src/examples/lightfield_tile_list_decoder.c | 227 +
 libs/libaom/src/examples/lossless_encoder.c | 138 +
 libs/libaom/src/examples/noise_model.c | 432 +
 libs/libaom/src/examples/resize_util.c | 125 +
 libs/libaom/src/examples/scalable_decoder.c | 185 +
 libs/libaom/src/examples/scalable_encoder.c | 289 +
 libs/libaom/src/examples/set_maps.c | 208 +
 libs/libaom/src/examples/simple_decoder.c | 146 +
 libs/libaom/src/examples/simple_encoder.c | 249 +
 libs/libaom/src/examples/svc_encoder_rtc.c | 907 ++
 libs/libaom/src/examples/twopass_encoder.c | 250 +
 libs/libaom/src/keywords.dox | 51 +
 libs/libaom/src/libs.doxy_template | 1260 ++
 libs/libaom/src/mainpage.dox | 52 +
 libs/libaom/src/stats/aomstats.c | 106 +
 libs/libaom/src/stats/aomstats.h | 44 +
 libs/libaom/src/stats/rate_hist.c | 271 +
 libs/libaom/src/stats/rate_hist.h | 41 +
 libs/libaom/src/test/accounting_test.cc | 75 +
 libs/libaom/src/test/acm_random.h | 85 +
 libs/libaom/src/test/active_map_test.cc | 103 +
 libs/libaom/src/test/altref_test.cc | 98 +
 libs/libaom/src/test/aom_integer_test.cc | 177 +
 libs/libaom/src/test/aomcx_set_ref.sh | 58 +
 libs/libaom/src/test/aomdec.sh | 147 +
 libs/libaom/src/test/aomenc.sh | 269 +
 libs/libaom/src/test/aq_segment_test.cc | 97 +
 libs/libaom/src/test/arf_freq_test.cc | 225 +
 libs/libaom/src/test/av1_common_int_test.cc | 22 +
 libs/libaom/src/test/av1_config_test.cc | 164 +
 libs/libaom/src/test/av1_convolve_2d_test.cc | 261 +
 libs/libaom/src/test/av1_convolve_2d_test_util.cc | 708 +
 libs/libaom/src/test/av1_convolve_2d_test_util.h | 120 +
 libs/libaom/src/test/av1_convolve_scale_test.cc | 532 +
 .../src/test/av1_encoder_parms_get_to_decoder.cc | 160 +
 libs/libaom/src/test/av1_ext_tile_test.cc | 215 +
 libs/libaom/src/test/av1_fwd_txfm1d_test.cc | 105 +
 libs/libaom/src/test/av1_fwd_txfm2d_test.cc | 583 +
 libs/libaom/src/test/av1_highbd_iht_test.cc | 362 +
 .../src/test/av1_horz_only_frame_superres_test.cc | 365 +
 libs/libaom/src/test/av1_inv_txfm1d_test.cc | 157 +
 libs/libaom/src/test/av1_inv_txfm2d_test.cc | 422 +
 libs/libaom/src/test/av1_nn_predict_test.cc | 217 +
 libs/libaom/src/test/av1_quantize_test.cc | 239 +
 libs/libaom/src/test/av1_round_shift_array_test.cc | 130 +
 libs/libaom/src/test/av1_txfm_test.cc | 371 +
 libs/libaom/src/test/av1_txfm_test.h | 135 +
 libs/libaom/src/test/av1_wedge_utils_test.cc | 391 +
 libs/libaom/src/test/avg_test.cc | 291 +
 libs/libaom/src/test/best_encode.sh | 103 +
 libs/libaom/src/test/binary_codes_test.cc | 83 +
 libs/libaom/src/test/blend_a64_mask_1d_test.cc | 340 +
 libs/libaom/src/test/blend_a64_mask_test.cc | 620 +
 libs/libaom/src/test/blockd_test.cc | 122 +
 libs/libaom/src/test/boolcoder_test.cc | 173 +
 libs/libaom/src/test/borders_test.cc | 85 +
 libs/libaom/src/test/cdef_test.cc | 426 +
 libs/libaom/src/test/cfl_test.cc | 585 +
 libs/libaom/src/test/clear_system_state.h | 31 +
 libs/libaom/src/test/cnn_test.cc | 2496 ++++
 libs/libaom/src/test/codec_factory.h | 173 +
 libs/libaom/src/test/coding_path_sync.cc | 206 +
 libs/libaom/src/test/comp_avg_pred_test.cc | 80 +
 libs/libaom/src/test/comp_avg_pred_test.h | 569 +
 libs/libaom/src/test/comp_mask_variance_test.cc | 577 +
 libs/libaom/src/test/convolve_round_test.cc | 184 +
 libs/libaom/src/test/convolve_test.cc | 885 ++
 libs/libaom/src/test/corner_match_test.cc | 144 +
 libs/libaom/src/test/cpu_speed_test.cc | 180 +
 libs/libaom/src/test/datarate_test.cc | 373 +
 libs/libaom/src/test/datarate_test.h | 157 +
 libs/libaom/src/test/decode_api_test.cc | 55 +
 libs/libaom/src/test/decode_multithreaded_test.cc | 185 +
 libs/libaom/src/test/decode_perf_test.cc | 247 +
 libs/libaom/src/test/decode_test_driver.cc | 114 +
 libs/libaom/src/test/decode_test_driver.h | 165 +
 libs/libaom/src/test/decode_to_md5.sh | 77 +
 libs/libaom/src/test/decode_with_drops.sh | 68 +
 libs/libaom/src/test/divu_small_test.cc | 41 +
 libs/libaom/src/test/dr_prediction_test.cc | 474 +
 libs/libaom/src/test/dump_obu.sh | 70 +
 libs/libaom/src/test/ec_test.cc | 160 +
 libs/libaom/src/test/edge_detect_test.cc | 409 +
 libs/libaom/src/test/encode_api_test.cc | 73 +
 libs/libaom/src/test/encode_perf_test.cc | 184 +
 libs/libaom/src/test/encode_test_driver.cc | 297 +
 libs/libaom/src/test/encode_test_driver.h | 265 +
 libs/libaom/src/test/encodetxb_test.cc | 263 +
 libs/libaom/src/test/end_to_end_test.cc | 211 +
 libs/libaom/src/test/error_block_test.cc | 289 +
 libs/libaom/src/test/error_resilience_test.cc | 459 +
 libs/libaom/src/test/ethread_test.cc | 275 +
 libs/libaom/src/test/examples.sh | 29 +
 libs/libaom/src/test/external_frame_buffer_test.cc | 540 +
 libs/libaom/src/test/fdct4x4_test.cc | 124 +
 libs/libaom/src/test/fft_test.cc | 257 +
 libs/libaom/src/test/film_grain_table_test.cc | 250 +
 libs/libaom/src/test/filterintra_test.cc | 136 +
 libs/libaom/src/test/frame_error_test.cc | 164 +
 libs/libaom/src/test/frame_size_tests.cc | 78 +
 libs/libaom/src/test/function_equivalence_test.h | 71 +
 libs/libaom/src/test/fwd_kf_test.cc | 116 +
 libs/libaom/src/test/fwht4x4_test.cc | 100 +
 libs/libaom/src/test/gf_pyr_height_test.cc | 156 +
 libs/libaom/src/test/gviz_api.py | 1087 ++
 libs/libaom/src/test/hadamard_test.cc | 261 +
 libs/libaom/src/test/hash_test.cc | 134 +
 libs/libaom/src/test/hbd_metrics_test.cc | 240 +
 libs/libaom/src/test/hiprec_convolve_test.cc | 68 +
 libs/libaom/src/test/hiprec_convolve_test_util.cc | 350 +
 libs/libaom/src/test/hiprec_convolve_test_util.h | 95 +
 libs/libaom/src/test/horver_correlation_test.cc | 148 +
 libs/libaom/src/test/horz_superres_test.cc | 406 +
 libs/libaom/src/test/i420_video_source.h | 34 +
 libs/libaom/src/test/intra_edge_test.cc | 337 +
 libs/libaom/src/test/intrabc_test.cc | 170 +
 libs/libaom/src/test/intrapred_test.cc | 273 +
 libs/libaom/src/test/invalid_file_test.cc | 159 +
 libs/libaom/src/test/ivf_video_source.h | 114 +
 libs/libaom/src/test/level_test.cc | 157 +
 libs/libaom/src/test/lightfield_test.sh | 115 +
 libs/libaom/src/test/log2_test.cc | 50 +
 libs/libaom/src/test/lossless_test.cc | 126 +
 libs/libaom/src/test/lpf_test.cc | 645 +
 libs/libaom/src/test/masked_sad_test.cc | 495 +
 libs/libaom/src/test/masked_variance_test.cc | 514 +
 libs/libaom/src/test/md5_helper.h | 76 +
 libs/libaom/src/test/metadata_test.cc | 337 +
 libs/libaom/src/test/metrics_template.html | 422 +
 libs/libaom/src/test/monochrome_test.cc | 130 +
 libs/libaom/src/test/motion_vector_test.cc | 107 +
 libs/libaom/src/test/noise_model_test.cc | 1343 ++
 libs/libaom/src/test/obmc_sad_test.cc | 268 +
 libs/libaom/src/test/obmc_variance_test.cc | 397 +
 libs/libaom/src/test/pickrst_test.cc | 534 +
 libs/libaom/src/test/qm_test.cc | 81 +
 libs/libaom/src/test/quantize_func_test.cc | 547 +
 libs/libaom/src/test/reconinter_test.cc | 259 +
 libs/libaom/src/test/register_state_check.h | 148 +
 libs/libaom/src/test/resize_test.cc | 644 +
 libs/libaom/src/test/rt_end_to_end_test.cc | 174 +
 libs/libaom/src/test/run_encodes.sh | 39 +
 libs/libaom/src/test/sad_test.cc | 1981 +++
 libs/libaom/src/test/sb_multipass_test.cc | 153 +
 libs/libaom/src/test/scalability_test.cc | 81 +
 libs/libaom/src/test/scan_test.cc | 133 +
 libs/libaom/src/test/segment_binarization_sync.cc | 61 +
 libs/libaom/src/test/selfguided_filter_test.cc | 420 +
 libs/libaom/src/test/set_maps.sh | 52 +
 libs/libaom/src/test/simd_avx2_test.cc | 15 +
 libs/libaom/src/test/simd_cmp_avx2.cc | 15 +
 libs/libaom/src/test/simd_cmp_impl.h | 2171 ++++
 libs/libaom/src/test/simd_cmp_neon.cc | 17 +
 libs/libaom/src/test/simd_cmp_sse2.cc | 18 +
 libs/libaom/src/test/simd_cmp_sse4.cc | 18 +
 libs/libaom/src/test/simd_cmp_ssse3.cc | 18 +
 libs/libaom/src/test/simd_impl.h | 1143 ++
 libs/libaom/src/test/simd_neon_test.cc | 17 +
 libs/libaom/src/test/simd_sse2_test.cc | 18 +
 libs/libaom/src/test/simd_sse4_test.cc | 18 +
 libs/libaom/src/test/simd_ssse3_test.cc | 18 +
 libs/libaom/src/test/simple_decoder.sh | 58 +
 libs/libaom/src/test/simple_encoder.sh | 53 +
 libs/libaom/src/test/subtract_test.cc | 252 +
 libs/libaom/src/test/sum_squares_test.cc | 839 ++
 libs/libaom/src/test/superframe_test.cc | 110 +
 libs/libaom/src/test/svc_datarate_test.cc | 609 +
 .../src/test/temporal_filter_planewise_test.cc | 242 +
 libs/libaom/src/test/temporal_filter_yuv_test.cc | 841 ++
 libs/libaom/src/test/test-data.sha1 | 559 +
 libs/libaom/src/test/test.cmake | 471 +
 .../src/test/test_data_download_worker.cmake | 46 +
 libs/libaom/src/test/test_data_util.cmake | 650 +
 libs/libaom/src/test/test_intra_pred_speed.cc | 1467 +++
 libs/libaom/src/test/test_libaom.cc | 74 +
 libs/libaom/src/test/test_runner.cmake | 28 +
 libs/libaom/src/test/test_vector_test.cc | 174 +
 libs/libaom/src/test/test_vectors.cc | 263 +
 libs/libaom/src/test/test_vectors.h | 26 +
 libs/libaom/src/test/tile_independence_test.cc | 173 +
 libs/libaom/src/test/time_stamp_test.cc | 105 +
 libs/libaom/src/test/tools_common.sh | 477 +
 libs/libaom/src/test/transform_test_base.h | 345 +
 libs/libaom/src/test/twopass_encoder.sh | 54 +
 libs/libaom/src/test/util.h | 53 +
 libs/libaom/src/test/variance_test.cc | 2410 ++++
 libs/libaom/src/test/video_source.h | 259 +
 libs/libaom/src/test/visual_metrics.py | 466 +
 libs/libaom/src/test/warp_filter_test.cc | 67 +
 libs/libaom/src/test/warp_filter_test_util.cc | 483 +
 libs/libaom/src/test/warp_filter_test_util.h | 107 +
 libs/libaom/src/test/webm_video_source.h | 96 +
 libs/libaom/src/test/wiener_test.cc | 587 +
 libs/libaom/src/test/y4m_test.cc | 180 +
 libs/libaom/src/test/y4m_video_source.h | 125 +
 libs/libaom/src/test/yuv_video_source.h | 123 +
 libs/libaom/src/third_party/fastfeat/LICENSE | 30 +
 libs/libaom/src/third_party/fastfeat/README.libaom | 40 +
 libs/libaom/src/third_party/fastfeat/fast.c | 22 +
 libs/libaom/src/third_party/fastfeat/fast.h | 20 +
 libs/libaom/src/third_party/fastfeat/fast_9.c | 5911 +++++++++
 libs/libaom/src/third_party/fastfeat/nonmax.c | 121 +
 .../src/third_party/googletest/README.libaom | 17 +
 .../third_party/googletest/src/googletest/CHANGES | 157 +
 .../googletest/src/googletest/CMakeLists.txt | 331 +
 .../third_party/googletest/src/googletest/CONTRIBUTORS | 37 +
 .../third_party/googletest/src/googletest/LICENSE | 28 +
 .../googletest/src/googletest/README.md | 341 +
 .../src/googletest/cmake/Config.cmake.in | 9 +
 .../googletest/src/googletest/cmake/gtest.pc.in | 9 +
 .../src/googletest/cmake/gtest_main.pc.in | 10 +
 .../src/googletest/cmake/internal_utils.cmake | 318 +
 .../googletest/include/gtest/gtest-death-test.h | 342 +
 .../src/googletest/include/gtest/gtest-matchers.h | 769 ++
 .../src/googletest/include/gtest/gtest-message.h | 217 +
 .../googletest/include/gtest/gtest-param-test.h | 507 +
 .../src/googletest/include/gtest/gtest-printers.h | 925 ++
 .../src/googletest/include/gtest/gtest-spi.h | 245 +
 .../src/googletest/include/gtest/gtest-test-part.h | 183 +
 .../googletest/include/gtest/gtest-typed-test.h | 337 +
 .../src/googletest/include/gtest/gtest.h | 2454 ++++
 .../src/googletest/include/gtest/gtest_pred_impl.h | 277 +
 .../src/googletest/include/gtest/gtest_prod.h | 61 +
 .../include/gtest/internal/custom/README.md | 56 +
 .../include/gtest/internal/custom/gtest-port.h | 37 +
 .../include/gtest/internal/custom/gtest-printers.h | 42 +
 .../include/gtest/internal/custom/gtest.h | 37 +
 .../gtest/internal/gtest-death-test-internal.h | 301 +
 .../include/gtest/internal/gtest-filepath.h | 208 +
 .../include/gtest/internal/gtest-internal.h | 1441 +++
 .../include/gtest/internal/gtest-param-util.h | 922 ++
 .../include/gtest/internal/gtest-port-arch.h | 111 +
 .../googletest/include/gtest/internal/gtest-port.h | 2232 ++++
 .../include/gtest/internal/gtest-string.h | 171 +
 .../include/gtest/internal/gtest-type-util.h | 184 +
 .../googletest/src/googletest/src/gtest-all.cc | 48 +
 .../src/googletest/src/gtest-death-test.cc | 1614 +++
 .../src/googletest/src/gtest-filepath.cc | 377 +
 .../src/googletest/src/gtest-internal-inl.h | 1213 ++
 .../src/googletest/src/gtest-matchers.cc | 97 +
 .../googletest/src/googletest/src/gtest-port.cc | 1361 ++
 .../src/googletest/src/gtest-printers.cc | 400 +
 .../src/googletest/src/gtest-test-part.cc | 107 +
 .../src/googletest/src/gtest-typed-test.cc | 117 +
 .../googletest/src/googletest/src/gtest.cc | 6240 +++++++++
 .../googletest/src/googletest/src/gtest_main.cc | 52 +
 libs/libaom/src/third_party/libwebm/AUTHORS.TXT | 4 +
 libs/libaom/src/third_party/libwebm/Android.mk | 17 +
 libs/libaom/src/third_party/libwebm/LICENSE.TXT | 30 +
 libs/libaom/src/third_party/libwebm/PATENTS.TXT | 23 +
 libs/libaom/src/third_party/libwebm/README.libaom | 20 +
 .../src/third_party/libwebm/common/file_util.cc | 93 +
 .../src/third_party/libwebm/common/file_util.h | 44 +
 .../src/third_party/libwebm/common/hdr_util.cc | 220 +
 .../src/third_party/libwebm/common/hdr_util.h | 71 +
 .../src/third_party/libwebm/common/webmids.h | 193 +
 .../src/third_party/libwebm/mkvmuxer/mkvmuxer.cc | 4221 ++++++
 .../src/third_party/libwebm/mkvmuxer/mkvmuxer.h | 1924 +++
 .../third_party/libwebm/mkvmuxer/mkvmuxertypes.h | 28 +
 .../third_party/libwebm/mkvmuxer/mkvmuxerutil.cc | 743 ++
 .../third_party/libwebm/mkvmuxer/mkvmuxerutil.h | 115 +
 .../src/third_party/libwebm/mkvmuxer/mkvwriter.cc | 92 +
 .../src/third_party/libwebm/mkvmuxer/mkvwriter.h | 51 +
 .../src/third_party/libwebm/mkvparser/mkvparser.cc | 8071 ++++++++++++
 .../src/third_party/libwebm/mkvparser/mkvparser.h | 1147 ++
 .../src/third_party/libwebm/mkvparser/mkvreader.cc | 135 +
 .../src/third_party/libwebm/mkvparser/mkvreader.h | 45 +
 libs/libaom/src/third_party/libyuv/README.libaom | 15 +
 .../third_party/libyuv/include/libyuv/basic_types.h | 119 +
 .../src/third_party/libyuv/include/libyuv/compare.h | 79 +
 .../src/third_party/libyuv/include/libyuv/convert.h | 246 +
 .../libyuv/include/libyuv/convert_argb.h | 232 +
 .../libyuv/include/libyuv/convert_from.h | 182 +
 .../libyuv/include/libyuv/convert_from_argb.h | 191 +
 .../src/third_party/libyuv/include/libyuv/cpu_id.h | 82 +
 .../libyuv/include/libyuv/mjpeg_decoder.h | 193 +
 .../libyuv/include/libyuv/planar_functions.h | 454 +
 .../src/third_party/libyuv/include/libyuv/rotate.h | 118 +
 .../libyuv/include/libyuv/rotate_argb.h | 34 +
 .../third_party/libyuv/include/libyuv/rotate_row.h | 139 +
 .../src/third_party/libyuv/include/libyuv/row.h | 1857 +++
 .../src/third_party/libyuv/include/libyuv/scale.h | 104 +
 .../third_party/libyuv/include/libyuv/scale_argb.h | 58 +
 .../third_party/libyuv/include/libyuv/scale_row.h | 479 +
 .../third_party/libyuv/include/libyuv/version.h | 17 +
 .../libyuv/include/libyuv/video_common.h | 183 +
 .../src/third_party/libyuv/source/compare.cc | 373 +
 .../third_party/libyuv/source/compare_common.cc | 42 +
 .../src/third_party/libyuv/source/compare_gcc.cc | 152 +
 .../src/third_party/libyuv/source/compare_neon.cc | 65 +
 .../third_party/libyuv/source/compare_neon64.cc | 63 +
 .../src/third_party/libyuv/source/compare_win.cc | 229 +
 .../src/third_party/libyuv/source/convert.cc | 1389 ++
 .../src/third_party/libyuv/source/convert_argb.cc | 1155 ++
 .../src/third_party/libyuv/source/convert_from.cc | 1348 ++
 .../third_party/libyuv/source/convert_from_argb.cc | 1301 ++
 .../src/third_party/libyuv/source/convert_jpeg.cc | 392 +
 .../third_party/libyuv/source/convert_to_argb.cc | 306 +
 .../third_party/libyuv/source/convert_to_i420.cc | 339 +
 .../libaom/src/third_party/libyuv/source/cpu_id.cc | 307 +
 .../src/third_party/libyuv/source/mjpeg_decoder.cc | 572 +
 .../third_party/libyuv/source/mjpeg_validate.cc | 101 +
 .../third_party/libyuv/source/planar_functions.cc | 2555 ++++
 .../libaom/src/third_party/libyuv/source/rotate.cc | 496 +
 .../src/third_party/libyuv/source/rotate_any.cc | 55 +
 .../src/third_party/libyuv/source/rotate_argb.cc | 205 +
 .../src/third_party/libyuv/source/rotate_common.cc | 92 +
 .../src/third_party/libyuv/source/rotate_gcc.cc | 493 +
 .../src/third_party/libyuv/source/rotate_mips.cc | 484 +
 .../src/third_party/libyuv/source/rotate_neon.cc | 535 +
 .../src/third_party/libyuv/source/rotate_neon64.cc | 543 +
 .../src/third_party/libyuv/source/rotate_win.cc | 248 +
 .../src/third_party/libyuv/source/row_any.cc | 680 +
 .../src/third_party/libyuv/source/row_common.cc | 2576 ++++
 .../src/third_party/libyuv/source/row_gcc.cc | 5475 ++++++++
 .../src/third_party/libyuv/source/row_mips.cc | 911 ++
 .../src/third_party/libyuv/source/row_neon.cc | 3084 +++++
 .../src/third_party/libyuv/source/row_neon64.cc | 3087 +++++
 .../src/third_party/libyuv/source/row_win.cc | 6331 +++++++++
 .../src/third_party/libyuv/source/row_x86.asm | 146 +
 libs/libaom/src/third_party/libyuv/source/scale.cc | 1689 +++
 .../src/third_party/libyuv/source/scale_any.cc | 200 +
 .../src/third_party/libyuv/source/scale_argb.cc | 853 ++
 .../src/third_party/libyuv/source/scale_common.cc | 1137 ++
 .../src/third_party/libyuv/source/scale_gcc.cc | 1089 ++
 .../src/third_party/libyuv/source/scale_mips.cc | 654 +
 .../src/third_party/libyuv/source/scale_neon.cc | 1037 ++
 .../src/third_party/libyuv/source/scale_neon64.cc | 1042 ++
 .../src/third_party/libyuv/source/scale_win.cc | 1354 ++
 .../src/third_party/libyuv/source/video_common.cc | 64 +
 .../src/third_party/libyuv/source/x86inc.asm | 1136 ++
 libs/libaom/src/third_party/vector/LICENSE | 19 +
 libs/libaom/src/third_party/vector/README.libaom | 16 +
 libs/libaom/src/third_party/vector/vector.c | 540 +
 libs/libaom/src/third_party/vector/vector.h | 138 +
 libs/libaom/src/third_party/x86inc/LICENSE | 18 +
 libs/libaom/src/third_party/x86inc/README.libaom | 20 +
 libs/libaom/src/third_party/x86inc/x86inc.asm | 1649 +++
 libs/libaom/src/tools/aggregate_entropy_stats.py | 39 +
 libs/libaom/src/tools/aom_entropy_optimizer.c | 761 ++
 libs/libaom/src/tools/cpplint.py | 4756 +++++++
 libs/libaom/src/tools/diff.py | 132 +
 libs/libaom/src/tools/dump_obu.cc | 164 +
 libs/libaom/src/tools/gen_authors.sh | 10 +
 libs/libaom/src/tools/gen_constrained_tokenset.py | 120 +
 libs/libaom/src/tools/inspect-cli.js | 39 +
 libs/libaom/src/tools/inspect-post.js | 1 +
 libs/libaom/src/tools/intersect-diffs.py | 78 +
 libs/libaom/src/tools/lint-hunks.py | 146 +
 libs/libaom/src/tools/obu_parser.cc | 190 +
 libs/libaom/src/tools/obu_parser.h | 27 +
 .../src/tools/txfm_analyzer/txfm_gen_code.cc | 580 +
 libs/libaom/src/tools/txfm_analyzer/txfm_graph.cc | 943 ++
 libs/libaom/src/tools/txfm_analyzer/txfm_graph.h | 160 +
 libs/libaom/src/tools/wrap-commit-msg.py | 72 +
 libs/libaom/src/usage.dox | 109 +
 libs/libaom/src/usage_cx.dox | 9 +
 libs/libaom/src/usage_dx.dox | 22 +
 1093 files changed, 527771 insertions(+)
 create mode 100644 libs/libaom/src/.clang-format
 create mode 100644 libs/libaom/src/.cmake-format.py
 create mode 100644 libs/libaom/src/.gitattributes
 create mode 100644 libs/libaom/src/.mailmap
 create mode 100644 libs/libaom/src/AUTHORS
 create mode 100644 libs/libaom/src/CHANGELOG
 create mode 100644 libs/libaom/src/CMakeLists.txt
 create mode 100644 libs/libaom/src/LICENSE
 create mode 100644 libs/libaom/src/PATENTS
 create mode 100644 libs/libaom/src/README.md
 create mode 100644 libs/libaom/src/Sample.cfg
 create mode 100644 libs/libaom/src/aom/aom.h
 create mode 100644 libs/libaom/src/aom/aom_codec.h
 create mode 100644 libs/libaom/src/aom/aom_decoder.h
 create mode 100644 libs/libaom/src/aom/aom_encoder.h
 create mode 100644 libs/libaom/src/aom/aom_frame_buffer.h
 create mode 100644 libs/libaom/src/aom/aom_image.h
 create mode 100644 libs/libaom/src/aom/aom_integer.h
 create mode 100644 libs/libaom/src/aom/aomcx.h
 create mode 100644 libs/libaom/src/aom/aomdx.h
 create mode 100644 libs/libaom/src/aom/exports_com
 create mode 100644 libs/libaom/src/aom/exports_dec
 create mode 100644 libs/libaom/src/aom/exports_enc
 create mode 100644 libs/libaom/src/aom/exports_test
 create mode 100644 libs/libaom/src/aom/internal/aom_codec_internal.h
 create mode 100644 libs/libaom/src/aom/internal/aom_image_internal.h
 create mode 100644 libs/libaom/src/aom/src/aom_codec.c
 create mode 100644 libs/libaom/src/aom/src/aom_decoder.c
 create mode 100644 libs/libaom/src/aom/src/aom_encoder.c
 create mode 100644 libs/libaom/src/aom/src/aom_image.c
 create mode 100644 libs/libaom/src/aom/src/aom_integer.c
 create mode 100644 libs/libaom/src/aom_dsp/aom_convolve.c
 create mode 100644 libs/libaom/src/aom_dsp/aom_dsp.cmake
 create mode 100644 libs/libaom/src/aom_dsp/aom_dsp_common.h
 create mode 100644 libs/libaom/src/aom_dsp/aom_dsp_rtcd.c
 create mode 100644 libs/libaom/src/aom_dsp/aom_dsp_rtcd_defs.pl
 create mode 100644 libs/libaom/src/aom_dsp/aom_filter.h
 create mode 100644 libs/libaom/src/aom_dsp/aom_simd.h
 create mode 100644 libs/libaom/src/aom_dsp/aom_simd_inline.h
 create mode 100644 libs/libaom/src/aom_dsp/arm/avg_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/blend_a64_mask_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/fwd_txfm_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/hadamard_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/intrapred_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/loopfilter_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/sad4d_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/sad_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/sse_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/subpel_variance_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/subtract_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/sum_neon.h
 create mode 100644 libs/libaom/src/aom_dsp/arm/variance_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/avg.c
 create mode 100644 libs/libaom/src/aom_dsp/binary_codes_reader.c
 create mode 100644 libs/libaom/src/aom_dsp/binary_codes_reader.h
 create mode 100644 libs/libaom/src/aom_dsp/binary_codes_writer.c
 create mode 100644 libs/libaom/src/aom_dsp/binary_codes_writer.h
 create mode 100644 libs/libaom/src/aom_dsp/bitreader.c
 create mode 100644 libs/libaom/src/aom_dsp/bitreader.h
 create mode 100644 libs/libaom/src/aom_dsp/bitreader_buffer.c
 create mode 100644 libs/libaom/src/aom_dsp/bitreader_buffer.h
 create mode 100644 libs/libaom/src/aom_dsp/bitwriter.c
 create mode 100644 libs/libaom/src/aom_dsp/bitwriter.h
 create mode 100644 libs/libaom/src/aom_dsp/bitwriter_buffer.c
 create mode 100644 libs/libaom/src/aom_dsp/bitwriter_buffer.h
 create mode 100644 libs/libaom/src/aom_dsp/blend.h
 create mode 100644 libs/libaom/src/aom_dsp/blend_a64_hmask.c
 create mode 100644 libs/libaom/src/aom_dsp/blend_a64_mask.c
 create mode 100644 libs/libaom/src/aom_dsp/blend_a64_vmask.c
 create mode 100644 libs/libaom/src/aom_dsp/blk_sse_sum.c
 create mode 100644 libs/libaom/src/aom_dsp/entcode.c
 create mode 100644 libs/libaom/src/aom_dsp/entcode.h
 create mode 100644 libs/libaom/src/aom_dsp/entdec.c
 create mode 100644 libs/libaom/src/aom_dsp/entdec.h
 create mode 100644 libs/libaom/src/aom_dsp/entenc.c
 create mode 100644 libs/libaom/src/aom_dsp/entenc.h
 create mode 100644 libs/libaom/src/aom_dsp/fastssim.c
 create mode 100644 libs/libaom/src/aom_dsp/fft.c
 create mode 100644 libs/libaom/src/aom_dsp/fft_common.h
 create mode 100644 libs/libaom/src/aom_dsp/fwd_txfm.c
 create mode 100644 libs/libaom/src/aom_dsp/grain_synthesis.c
 create mode 100644 libs/libaom/src/aom_dsp/grain_synthesis.h
libs/libaom/src/aom_dsp/grain_table.c create mode 100644 libs/libaom/src/aom_dsp/grain_table.h create mode 100644 libs/libaom/src/aom_dsp/intrapred.c create mode 100644 libs/libaom/src/aom_dsp/intrapred_common.h create mode 100644 libs/libaom/src/aom_dsp/loopfilter.c create mode 100644 libs/libaom/src/aom_dsp/mips/aom_convolve8_horiz_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/aom_convolve8_vert_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/aom_convolve_copy_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/aom_convolve_msa.h create mode 100644 libs/libaom/src/aom_dsp/mips/common_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/common_dspr2.h create mode 100644 libs/libaom/src/aom_dsp/mips/convolve2_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/convolve2_horiz_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/convolve2_vert_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/convolve8_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/convolve8_horiz_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/convolve8_vert_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/convolve_common_dspr2.h create mode 100644 libs/libaom/src/aom_dsp/mips/intrapred16_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/intrapred4_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/intrapred8_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/intrapred_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_16_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_4_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_8_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_filters_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_filters_dspr2.h create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_macros_dspr2.h create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_masks_dspr2.h create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_mb_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_mb_vert_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_msa.h create mode 100644 libs/libaom/src/aom_dsp/mips/macros_msa.h create mode 100644 libs/libaom/src/aom_dsp/mips/sad_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/sub_pixel_variance_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/subtract_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/variance_msa.c create mode 100644 libs/libaom/src/aom_dsp/noise_model.c create mode 100644 libs/libaom/src/aom_dsp/noise_model.h create mode 100644 libs/libaom/src/aom_dsp/noise_util.c create mode 100644 libs/libaom/src/aom_dsp/noise_util.h create mode 100644 libs/libaom/src/aom_dsp/prob.h create mode 100644 libs/libaom/src/aom_dsp/psnr.c create mode 100644 libs/libaom/src/aom_dsp/psnr.h create mode 100644 libs/libaom/src/aom_dsp/psnrhvs.c create mode 100644 libs/libaom/src/aom_dsp/quantize.c create mode 100644 libs/libaom/src/aom_dsp/quantize.h create mode 100644 libs/libaom/src/aom_dsp/recenter.h create mode 100644 libs/libaom/src/aom_dsp/sad.c create mode 100644 libs/libaom/src/aom_dsp/sad_av1.c create mode 100644 libs/libaom/src/aom_dsp/simd/v128_intrinsics.h create mode 100644 libs/libaom/src/aom_dsp/simd/v128_intrinsics_arm.h create mode 100644 libs/libaom/src/aom_dsp/simd/v128_intrinsics_c.h create mode 100644 libs/libaom/src/aom_dsp/simd/v128_intrinsics_x86.h create mode 100644 
libs/libaom/src/aom_dsp/simd/v256_intrinsics.h create mode 100644 libs/libaom/src/aom_dsp/simd/v256_intrinsics_arm.h create mode 100644 libs/libaom/src/aom_dsp/simd/v256_intrinsics_c.h create mode 100644 libs/libaom/src/aom_dsp/simd/v256_intrinsics_v128.h create mode 100644 libs/libaom/src/aom_dsp/simd/v256_intrinsics_x86.h create mode 100644 libs/libaom/src/aom_dsp/simd/v64_intrinsics.h create mode 100644 libs/libaom/src/aom_dsp/simd/v64_intrinsics_arm.h create mode 100644 libs/libaom/src/aom_dsp/simd/v64_intrinsics_c.h create mode 100644 libs/libaom/src/aom_dsp/simd/v64_intrinsics_x86.h create mode 100644 libs/libaom/src/aom_dsp/sse.c create mode 100644 libs/libaom/src/aom_dsp/ssim.c create mode 100644 libs/libaom/src/aom_dsp/ssim.h create mode 100644 libs/libaom/src/aom_dsp/subtract.c create mode 100644 libs/libaom/src/aom_dsp/sum_squares.c create mode 100644 libs/libaom/src/aom_dsp/txfm_common.h create mode 100644 libs/libaom/src/aom_dsp/variance.c create mode 100644 libs/libaom/src/aom_dsp/variance.h create mode 100644 libs/libaom/src/aom_dsp/vmaf.c create mode 100644 libs/libaom/src/aom_dsp/vmaf.h create mode 100644 libs/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/aom_asm_stubs.c create mode 100644 libs/libaom/src/aom_dsp/x86/aom_convolve_copy_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_ssse3.asm create mode 100644 libs/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm create mode 100644 libs/libaom/src/aom_dsp/x86/avg_intrin_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/avg_intrin_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/bitdepth_conversion_avx2.h create mode 100644 libs/libaom/src/aom_dsp/x86/bitdepth_conversion_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c create mode 100644 libs/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c create mode 100644 libs/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c create mode 100644 libs/libaom/src/aom_dsp/x86/blend_mask_sse4.h create mode 100644 libs/libaom/src/aom_dsp/x86/blend_sse4.h create mode 100644 libs/libaom/src/aom_dsp/x86/blk_sse_sum_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/blk_sse_sum_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/common_avx2.h create mode 100644 libs/libaom/src/aom_dsp/x86/convolve.h create mode 100644 libs/libaom/src/aom_dsp/x86/convolve_avx2.h create mode 100644 libs/libaom/src/aom_dsp/x86/convolve_common_intrin.h create mode 100644 libs/libaom/src/aom_dsp/x86/convolve_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/convolve_sse4_1.h create mode 100644 libs/libaom/src/aom_dsp/x86/fft_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/fft_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c create mode 
100644 libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_convolve_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_intrapred_asm_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_intrapred_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_loopfilter_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_quantize_intrin_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_variance_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_variance_impl_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_variance_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_variance_sse4.c create mode 100644 libs/libaom/src/aom_dsp/x86/intrapred_asm_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/intrapred_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/intrapred_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/intrapred_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/intrapred_x86.h create mode 100644 libs/libaom/src/aom_dsp/x86/inv_wht_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/loopfilter_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/lpf_common_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.h create mode 100644 libs/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.h create mode 100644 libs/libaom/src/aom_dsp/x86/mem_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/obmc_intrinsic_sse4.h create mode 100644 libs/libaom/src/aom_dsp/x86/obmc_intrinsic_ssse3.h create mode 100644 libs/libaom/src/aom_dsp/x86/obmc_sad_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/obmc_sad_sse4.c create mode 100644 libs/libaom/src/aom_dsp/x86/obmc_variance_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/obmc_variance_sse4.c create mode 100644 libs/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm create mode 100644 libs/libaom/src/aom_dsp/x86/quantize_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/quantize_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm create mode 100644 libs/libaom/src/aom_dsp/x86/quantize_x86.h create mode 100644 libs/libaom/src/aom_dsp/x86/sad4d_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/sad4d_sse2.asm create mode 100644 
libs/libaom/src/aom_dsp/x86/sad_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/sad_highbd_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/sad_impl_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/sad_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/sse_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/sse_sse4.c create mode 100644 libs/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm create mode 100644 libs/libaom/src/aom_dsp/x86/subpel_variance_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/subtract_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/subtract_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/sum_squares_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/sum_squares_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/sum_squares_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/synonyms.h create mode 100644 libs/libaom/src/aom_dsp/x86/synonyms_avx2.h create mode 100644 libs/libaom/src/aom_dsp/x86/transpose_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/txfm_common_avx2.h create mode 100644 libs/libaom/src/aom_dsp/x86/txfm_common_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/variance_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/variance_impl_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/variance_impl_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/variance_sse2.c create mode 100644 libs/libaom/src/aom_mem/aom_mem.c create mode 100644 libs/libaom/src/aom_mem/aom_mem.cmake create mode 100644 libs/libaom/src/aom_mem/aom_mem.h create mode 100644 libs/libaom/src/aom_mem/include/aom_mem_intrnl.h create mode 100644 libs/libaom/src/aom_ports/aom_once.h create mode 100644 libs/libaom/src/aom_ports/aom_ports.cmake create mode 100644 libs/libaom/src/aom_ports/aom_timer.h create mode 100644 libs/libaom/src/aom_ports/arm.h create mode 100644 libs/libaom/src/aom_ports/arm_cpudetect.c create mode 100644 libs/libaom/src/aom_ports/bitops.h create mode 100644 libs/libaom/src/aom_ports/emmintrin_compat.h create mode 100644 libs/libaom/src/aom_ports/emms.asm create mode 100644 libs/libaom/src/aom_ports/mem.h create mode 100644 libs/libaom/src/aom_ports/mem_ops.h create mode 100644 libs/libaom/src/aom_ports/mem_ops_aligned.h create mode 100644 libs/libaom/src/aom_ports/msvc.h create mode 100644 libs/libaom/src/aom_ports/ppc.h create mode 100644 libs/libaom/src/aom_ports/ppc_cpudetect.c create mode 100644 libs/libaom/src/aom_ports/sanitizer.h create mode 100644 libs/libaom/src/aom_ports/system_state.h create mode 100644 libs/libaom/src/aom_ports/x86.h create mode 100644 libs/libaom/src/aom_ports/x86_abi_support.asm create mode 100644 libs/libaom/src/aom_scale/aom_scale.cmake create mode 100644 libs/libaom/src/aom_scale/aom_scale.h create mode 100644 libs/libaom/src/aom_scale/aom_scale_rtcd.c create mode 100644 libs/libaom/src/aom_scale/aom_scale_rtcd.pl create mode 100644 libs/libaom/src/aom_scale/generic/aom_scale.c create mode 100644 libs/libaom/src/aom_scale/generic/gen_scalers.c create mode 100644 libs/libaom/src/aom_scale/generic/yv12config.c create mode 100644 libs/libaom/src/aom_scale/generic/yv12extend.c create mode 100644 libs/libaom/src/aom_scale/mips/dspr2/yv12extend_dspr2.c create mode 100644 libs/libaom/src/aom_scale/yv12config.h create mode 100644 libs/libaom/src/aom_util/aom_thread.c create mode 100644 libs/libaom/src/aom_util/aom_thread.h create mode 100644 libs/libaom/src/aom_util/aom_util.cmake create mode 100644 libs/libaom/src/aom_util/debug_util.c create mode 100644 
libs/libaom/src/aom_util/debug_util.h create mode 100644 libs/libaom/src/aom_util/endian_inl.h create mode 100644 libs/libaom/src/apps/aomdec.c create mode 100644 libs/libaom/src/apps/aomenc.c create mode 100644 libs/libaom/src/apps/aomenc.h create mode 100644 libs/libaom/src/av1/av1.cmake create mode 100644 libs/libaom/src/av1/av1_cx_iface.c create mode 100644 libs/libaom/src/av1/av1_dx_iface.c create mode 100644 libs/libaom/src/av1/av1_iface_common.h create mode 100644 libs/libaom/src/av1/common/alloccommon.c create mode 100644 libs/libaom/src/av1/common/alloccommon.h create mode 100644 libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.c create mode 100644 libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.h create mode 100644 libs/libaom/src/av1/common/arm/av1_txfm_neon.c create mode 100644 libs/libaom/src/av1/common/arm/blend_a64_hmask_neon.c create mode 100644 libs/libaom/src/av1/common/arm/blend_a64_vmask_neon.c create mode 100644 libs/libaom/src/av1/common/arm/cfl_neon.c create mode 100644 libs/libaom/src/av1/common/arm/convolve_neon.c create mode 100644 libs/libaom/src/av1/common/arm/convolve_neon.h create mode 100644 libs/libaom/src/av1/common/arm/jnt_convolve_neon.c create mode 100644 libs/libaom/src/av1/common/arm/mem_neon.h create mode 100644 libs/libaom/src/av1/common/arm/reconinter_neon.c create mode 100644 libs/libaom/src/av1/common/arm/selfguided_neon.c create mode 100644 libs/libaom/src/av1/common/arm/transpose_neon.h create mode 100644 libs/libaom/src/av1/common/arm/warp_plane_neon.c create mode 100644 libs/libaom/src/av1/common/arm/wiener_convolve_neon.c create mode 100644 libs/libaom/src/av1/common/av1_common_int.h create mode 100644 libs/libaom/src/av1/common/av1_inv_txfm1d.c create mode 100644 libs/libaom/src/av1/common/av1_inv_txfm1d.h create mode 100644 libs/libaom/src/av1/common/av1_inv_txfm1d_cfg.h create mode 100644 libs/libaom/src/av1/common/av1_inv_txfm2d.c create mode 100644 libs/libaom/src/av1/common/av1_loopfilter.c create mode 100644 libs/libaom/src/av1/common/av1_loopfilter.h create mode 100644 libs/libaom/src/av1/common/av1_rtcd.c create mode 100644 libs/libaom/src/av1/common/av1_rtcd_defs.pl create mode 100644 libs/libaom/src/av1/common/av1_txfm.c create mode 100644 libs/libaom/src/av1/common/av1_txfm.h create mode 100644 libs/libaom/src/av1/common/blockd.c create mode 100644 libs/libaom/src/av1/common/blockd.h create mode 100644 libs/libaom/src/av1/common/cdef.c create mode 100644 libs/libaom/src/av1/common/cdef.h create mode 100644 libs/libaom/src/av1/common/cdef_block.c create mode 100644 libs/libaom/src/av1/common/cdef_block.h create mode 100644 libs/libaom/src/av1/common/cdef_block_avx2.c create mode 100644 libs/libaom/src/av1/common/cdef_block_neon.c create mode 100644 libs/libaom/src/av1/common/cdef_block_simd.h create mode 100644 libs/libaom/src/av1/common/cdef_block_sse2.c create mode 100644 libs/libaom/src/av1/common/cdef_block_sse4.c create mode 100644 libs/libaom/src/av1/common/cdef_block_ssse3.c create mode 100644 libs/libaom/src/av1/common/cfl.c create mode 100644 libs/libaom/src/av1/common/cfl.h create mode 100644 libs/libaom/src/av1/common/common.h create mode 100644 libs/libaom/src/av1/common/common_data.h create mode 100644 libs/libaom/src/av1/common/convolve.c create mode 100644 libs/libaom/src/av1/common/convolve.h create mode 100644 libs/libaom/src/av1/common/debugmodes.c create mode 100644 libs/libaom/src/av1/common/entropy.c create mode 100644 libs/libaom/src/av1/common/entropy.h create mode 100644 
libs/libaom/src/av1/common/entropymode.c create mode 100644 libs/libaom/src/av1/common/entropymode.h create mode 100644 libs/libaom/src/av1/common/entropymv.c create mode 100644 libs/libaom/src/av1/common/entropymv.h create mode 100644 libs/libaom/src/av1/common/enums.h create mode 100644 libs/libaom/src/av1/common/filter.h create mode 100644 libs/libaom/src/av1/common/frame_buffers.c create mode 100644 libs/libaom/src/av1/common/frame_buffers.h create mode 100644 libs/libaom/src/av1/common/idct.c create mode 100644 libs/libaom/src/av1/common/idct.h create mode 100644 libs/libaom/src/av1/common/loopfiltermask.c create mode 100644 libs/libaom/src/av1/common/mv.h create mode 100644 libs/libaom/src/av1/common/mvref_common.c create mode 100644 libs/libaom/src/av1/common/mvref_common.h create mode 100644 libs/libaom/src/av1/common/obmc.h create mode 100644 libs/libaom/src/av1/common/obu_util.c create mode 100644 libs/libaom/src/av1/common/obu_util.h create mode 100644 libs/libaom/src/av1/common/odintrin.c create mode 100644 libs/libaom/src/av1/common/odintrin.h create mode 100644 libs/libaom/src/av1/common/ppc/cfl_ppc.c create mode 100644 libs/libaom/src/av1/common/pred_common.c create mode 100644 libs/libaom/src/av1/common/pred_common.h create mode 100644 libs/libaom/src/av1/common/quant_common.c create mode 100644 libs/libaom/src/av1/common/quant_common.h create mode 100644 libs/libaom/src/av1/common/reconinter.c create mode 100644 libs/libaom/src/av1/common/reconinter.h create mode 100644 libs/libaom/src/av1/common/reconintra.c create mode 100644 libs/libaom/src/av1/common/reconintra.h create mode 100644 libs/libaom/src/av1/common/resize.c create mode 100644 libs/libaom/src/av1/common/resize.h create mode 100644 libs/libaom/src/av1/common/restoration.c create mode 100644 libs/libaom/src/av1/common/restoration.h create mode 100644 libs/libaom/src/av1/common/scale.c create mode 100644 libs/libaom/src/av1/common/scale.h create mode 100644 libs/libaom/src/av1/common/scan.c create mode 100644 libs/libaom/src/av1/common/scan.h create mode 100644 libs/libaom/src/av1/common/seg_common.c create mode 100644 libs/libaom/src/av1/common/seg_common.h create mode 100644 libs/libaom/src/av1/common/thread_common.c create mode 100644 libs/libaom/src/av1/common/thread_common.h create mode 100644 libs/libaom/src/av1/common/tile_common.c create mode 100644 libs/libaom/src/av1/common/tile_common.h create mode 100644 libs/libaom/src/av1/common/timing.c create mode 100644 libs/libaom/src/av1/common/timing.h create mode 100644 libs/libaom/src/av1/common/token_cdfs.h create mode 100644 libs/libaom/src/av1/common/txb_common.c create mode 100644 libs/libaom/src/av1/common/txb_common.h create mode 100644 libs/libaom/src/av1/common/warped_motion.c create mode 100644 libs/libaom/src/av1/common/warped_motion.h create mode 100644 libs/libaom/src/av1/common/x86/av1_convolve_horiz_rs_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.h create mode 100644 libs/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c create mode 100644 libs/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h create mode 100644 libs/libaom/src/av1/common/x86/av1_txfm_sse2.h create mode 100644 libs/libaom/src/av1/common/x86/av1_txfm_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/av1_txfm_sse4.h create mode 100644 libs/libaom/src/av1/common/x86/cfl_avx2.c create mode 100644 
libs/libaom/src/av1/common/x86/cfl_simd.h create mode 100644 libs/libaom/src/av1/common/x86/cfl_sse2.c create mode 100644 libs/libaom/src/av1/common/x86/cfl_ssse3.c create mode 100644 libs/libaom/src/av1/common/x86/convolve_2d_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/convolve_2d_sse2.c create mode 100644 libs/libaom/src/av1/common/x86/convolve_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/convolve_sse2.c create mode 100644 libs/libaom/src/av1/common/x86/filterintra_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h create mode 100644 libs/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_wiener_convolve_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_wiener_convolve_ssse3.c create mode 100644 libs/libaom/src/av1/common/x86/intra_edge_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/jnt_convolve_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/jnt_convolve_sse2.c create mode 100644 libs/libaom/src/av1/common/x86/jnt_convolve_ssse3.c create mode 100644 libs/libaom/src/av1/common/x86/reconinter_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/reconinter_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/reconinter_ssse3.c create mode 100644 libs/libaom/src/av1/common/x86/selfguided_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/selfguided_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/warp_plane_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/warp_plane_sse2.c create mode 100644 libs/libaom/src/av1/common/x86/warp_plane_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/wiener_convolve_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/wiener_convolve_sse2.c create mode 100644 libs/libaom/src/av1/decoder/accounting.c create mode 100644 libs/libaom/src/av1/decoder/accounting.h create mode 100644 libs/libaom/src/av1/decoder/decodeframe.c create mode 100644 libs/libaom/src/av1/decoder/decodeframe.h create mode 100644 libs/libaom/src/av1/decoder/decodemv.c create mode 100644 libs/libaom/src/av1/decoder/decodemv.h create mode 100644 libs/libaom/src/av1/decoder/decoder.c create mode 100644 libs/libaom/src/av1/decoder/decoder.h create mode 100644 libs/libaom/src/av1/decoder/decodetxb.c create mode 100644 libs/libaom/src/av1/decoder/decodetxb.h create mode 100644 libs/libaom/src/av1/decoder/detokenize.c create mode 100644 libs/libaom/src/av1/decoder/detokenize.h create mode 100644 libs/libaom/src/av1/decoder/dthread.h create mode 100644 libs/libaom/src/av1/decoder/inspection.c create mode 100644 libs/libaom/src/av1/decoder/inspection.h create mode 100644 libs/libaom/src/av1/decoder/obu.c create mode 100644 libs/libaom/src/av1/decoder/obu.h create mode 100644 libs/libaom/src/av1/encoder/aq_complexity.c create mode 100644 libs/libaom/src/av1/encoder/aq_complexity.h create mode 100644 
libs/libaom/src/av1/encoder/aq_cyclicrefresh.c create mode 100644 libs/libaom/src/av1/encoder/aq_cyclicrefresh.h create mode 100644 libs/libaom/src/av1/encoder/aq_variance.c create mode 100644 libs/libaom/src/av1/encoder/aq_variance.h create mode 100644 libs/libaom/src/av1/encoder/arm/neon/av1_error_neon.c create mode 100644 libs/libaom/src/av1/encoder/arm/neon/quantize_neon.c create mode 100644 libs/libaom/src/av1/encoder/av1_fwd_txfm1d.c create mode 100644 libs/libaom/src/av1/encoder/av1_fwd_txfm1d.h create mode 100644 libs/libaom/src/av1/encoder/av1_fwd_txfm1d_cfg.h create mode 100644 libs/libaom/src/av1/encoder/av1_fwd_txfm2d.c create mode 100644 libs/libaom/src/av1/encoder/av1_multi_thread.c create mode 100644 libs/libaom/src/av1/encoder/av1_multi_thread.h create mode 100644 libs/libaom/src/av1/encoder/av1_quantize.c create mode 100644 libs/libaom/src/av1/encoder/av1_quantize.h create mode 100644 libs/libaom/src/av1/encoder/bitstream.c create mode 100644 libs/libaom/src/av1/encoder/bitstream.h create mode 100644 libs/libaom/src/av1/encoder/block.h create mode 100644 libs/libaom/src/av1/encoder/blockiness.c create mode 100644 libs/libaom/src/av1/encoder/cnn.c create mode 100644 libs/libaom/src/av1/encoder/cnn.h create mode 100644 libs/libaom/src/av1/encoder/compound_type.c create mode 100644 libs/libaom/src/av1/encoder/compound_type.h create mode 100644 libs/libaom/src/av1/encoder/context_tree.c create mode 100644 libs/libaom/src/av1/encoder/context_tree.h create mode 100644 libs/libaom/src/av1/encoder/corner_detect.c create mode 100644 libs/libaom/src/av1/encoder/corner_detect.h create mode 100644 libs/libaom/src/av1/encoder/corner_match.c create mode 100644 libs/libaom/src/av1/encoder/corner_match.h create mode 100644 libs/libaom/src/av1/encoder/cost.c create mode 100644 libs/libaom/src/av1/encoder/cost.h create mode 100644 libs/libaom/src/av1/encoder/dwt.c create mode 100644 libs/libaom/src/av1/encoder/dwt.h create mode 100644 libs/libaom/src/av1/encoder/enc_enums.h create mode 100644 libs/libaom/src/av1/encoder/encode_strategy.c create mode 100644 libs/libaom/src/av1/encoder/encode_strategy.h create mode 100644 libs/libaom/src/av1/encoder/encodeframe.c create mode 100644 libs/libaom/src/av1/encoder/encodeframe.h create mode 100644 libs/libaom/src/av1/encoder/encodemb.c create mode 100644 libs/libaom/src/av1/encoder/encodemb.h create mode 100644 libs/libaom/src/av1/encoder/encodemv.c create mode 100644 libs/libaom/src/av1/encoder/encodemv.h create mode 100644 libs/libaom/src/av1/encoder/encoder.c create mode 100644 libs/libaom/src/av1/encoder/encoder.h create mode 100644 libs/libaom/src/av1/encoder/encodetxb.c create mode 100644 libs/libaom/src/av1/encoder/encodetxb.h create mode 100644 libs/libaom/src/av1/encoder/ethread.c create mode 100644 libs/libaom/src/av1/encoder/ethread.h create mode 100644 libs/libaom/src/av1/encoder/extend.c create mode 100644 libs/libaom/src/av1/encoder/extend.h create mode 100644 libs/libaom/src/av1/encoder/firstpass.c create mode 100644 libs/libaom/src/av1/encoder/firstpass.h create mode 100644 libs/libaom/src/av1/encoder/global_motion.c create mode 100644 libs/libaom/src/av1/encoder/global_motion.h create mode 100644 libs/libaom/src/av1/encoder/gop_structure.c create mode 100644 libs/libaom/src/av1/encoder/gop_structure.h create mode 100644 libs/libaom/src/av1/encoder/grain_test_vectors.h create mode 100644 libs/libaom/src/av1/encoder/hash.c create mode 100644 libs/libaom/src/av1/encoder/hash.h create mode 100644 
libs/libaom/src/av1/encoder/hash_motion.c create mode 100644 libs/libaom/src/av1/encoder/hash_motion.h create mode 100644 libs/libaom/src/av1/encoder/hybrid_fwd_txfm.c create mode 100644 libs/libaom/src/av1/encoder/hybrid_fwd_txfm.h create mode 100644 libs/libaom/src/av1/encoder/interp_search.c create mode 100644 libs/libaom/src/av1/encoder/interp_search.h create mode 100644 libs/libaom/src/av1/encoder/intra_mode_search.c create mode 100644 libs/libaom/src/av1/encoder/intra_mode_search.h create mode 100644 libs/libaom/src/av1/encoder/k_means_template.h create mode 100644 libs/libaom/src/av1/encoder/level.c create mode 100644 libs/libaom/src/av1/encoder/level.h create mode 100644 libs/libaom/src/av1/encoder/lookahead.c create mode 100644 libs/libaom/src/av1/encoder/lookahead.h create mode 100644 libs/libaom/src/av1/encoder/mathutils.h create mode 100644 libs/libaom/src/av1/encoder/mcomp.c create mode 100644 libs/libaom/src/av1/encoder/mcomp.h create mode 100644 libs/libaom/src/av1/encoder/mips/msa/error_msa.c create mode 100644 libs/libaom/src/av1/encoder/mips/msa/fdct4x4_msa.c create mode 100644 libs/libaom/src/av1/encoder/mips/msa/temporal_filter_msa.c create mode 100644 libs/libaom/src/av1/encoder/misc_model_weights.h create mode 100644 libs/libaom/src/av1/encoder/ml.c create mode 100644 libs/libaom/src/av1/encoder/ml.h create mode 100644 libs/libaom/src/av1/encoder/mode_prune_model_weights.h create mode 100644 libs/libaom/src/av1/encoder/model_rd.h create mode 100644 libs/libaom/src/av1/encoder/motion_search_facade.c create mode 100644 libs/libaom/src/av1/encoder/motion_search_facade.h create mode 100644 libs/libaom/src/av1/encoder/mv_prec.c create mode 100644 libs/libaom/src/av1/encoder/mv_prec.h create mode 100644 libs/libaom/src/av1/encoder/nonrd_pickmode.c create mode 100644 libs/libaom/src/av1/encoder/palette.c create mode 100644 libs/libaom/src/av1/encoder/palette.h create mode 100644 libs/libaom/src/av1/encoder/partition_cnn_weights.h create mode 100644 libs/libaom/src/av1/encoder/partition_model_weights.h create mode 100644 libs/libaom/src/av1/encoder/partition_strategy.c create mode 100644 libs/libaom/src/av1/encoder/partition_strategy.h create mode 100644 libs/libaom/src/av1/encoder/pass2_strategy.c create mode 100644 libs/libaom/src/av1/encoder/pass2_strategy.h create mode 100644 libs/libaom/src/av1/encoder/pickcdef.c create mode 100644 libs/libaom/src/av1/encoder/picklpf.c create mode 100644 libs/libaom/src/av1/encoder/picklpf.h create mode 100644 libs/libaom/src/av1/encoder/pickrst.c create mode 100644 libs/libaom/src/av1/encoder/pickrst.h create mode 100644 libs/libaom/src/av1/encoder/pustats.h create mode 100644 libs/libaom/src/av1/encoder/random.h create mode 100644 libs/libaom/src/av1/encoder/ransac.c create mode 100644 libs/libaom/src/av1/encoder/ransac.h create mode 100644 libs/libaom/src/av1/encoder/ratectrl.c create mode 100644 libs/libaom/src/av1/encoder/ratectrl.h create mode 100644 libs/libaom/src/av1/encoder/rd.c create mode 100644 libs/libaom/src/av1/encoder/rd.h create mode 100644 libs/libaom/src/av1/encoder/rdopt.c create mode 100644 libs/libaom/src/av1/encoder/rdopt.h create mode 100644 libs/libaom/src/av1/encoder/rdopt_data_defs.h create mode 100644 libs/libaom/src/av1/encoder/rdopt_utils.h create mode 100644 libs/libaom/src/av1/encoder/reconinter_enc.c create mode 100644 libs/libaom/src/av1/encoder/reconinter_enc.h create mode 100644 libs/libaom/src/av1/encoder/segmentation.c create mode 100644 libs/libaom/src/av1/encoder/segmentation.h create mode 100644 
libs/libaom/src/av1/encoder/speed_features.c create mode 100644 libs/libaom/src/av1/encoder/speed_features.h create mode 100644 libs/libaom/src/av1/encoder/svc_layercontext.c create mode 100644 libs/libaom/src/av1/encoder/svc_layercontext.h create mode 100644 libs/libaom/src/av1/encoder/temporal_filter.c create mode 100644 libs/libaom/src/av1/encoder/temporal_filter.h create mode 100644 libs/libaom/src/av1/encoder/tokenize.c create mode 100644 libs/libaom/src/av1/encoder/tokenize.h create mode 100644 libs/libaom/src/av1/encoder/tpl_model.c create mode 100644 libs/libaom/src/av1/encoder/tpl_model.h create mode 100644 libs/libaom/src/av1/encoder/tune_vmaf.c create mode 100644 libs/libaom/src/av1/encoder/tune_vmaf.h create mode 100644 libs/libaom/src/av1/encoder/tx_prune_model_weights.h create mode 100644 libs/libaom/src/av1/encoder/tx_search.c create mode 100644 libs/libaom/src/av1/encoder/tx_search.h create mode 100644 libs/libaom/src/av1/encoder/use_flat_gop_model_params.h create mode 100644 libs/libaom/src/av1/encoder/var_based_part.c create mode 100644 libs/libaom/src/av1/encoder/var_based_part.h create mode 100644 libs/libaom/src/av1/encoder/wedge_utils.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_fwd_txfm1d_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_avx2.h create mode 100644 libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.h create mode 100644 libs/libaom/src/av1/encoder/x86/av1_highbd_quantize_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_highbd_quantize_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_quantize_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_quantize_sse2.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm create mode 100644 libs/libaom/src/av1/encoder/x86/av1_ssim_opt_x86_64.asm create mode 100644 libs/libaom/src/av1/encoder/x86/av1_txfm1d_sse4.h create mode 100644 libs/libaom/src/av1/encoder/x86/corner_match_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/corner_match_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/dct_sse2.asm create mode 100644 libs/libaom/src/av1/encoder/x86/encodetxb_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/encodetxb_sse2.c create mode 100644 libs/libaom/src/av1/encoder/x86/encodetxb_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/error_intrin_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/error_sse2.asm create mode 100644 libs/libaom/src/av1/encoder/x86/hash_sse42.c create mode 100644 libs/libaom/src/av1/encoder/x86/highbd_block_error_intrin_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/highbd_block_error_intrin_sse2.c create mode 100644 libs/libaom/src/av1/encoder/x86/highbd_fwd_txfm_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/highbd_fwd_txfm_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/ml_sse3.c create mode 100644 libs/libaom/src/av1/encoder/x86/pickrst_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/pickrst_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/rdopt_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/rdopt_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/temporal_filter_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/temporal_filter_constants.h create mode 100644 
libs/libaom/src/av1/encoder/x86/temporal_filter_sse2.c create mode 100644 libs/libaom/src/av1/encoder/x86/temporal_filter_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/wedge_utils_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/wedge_utils_sse2.c create mode 100644 libs/libaom/src/av1/exports_com create mode 100644 libs/libaom/src/av1/exports_dec create mode 100644 libs/libaom/src/av1/exports_enc create mode 100644 libs/libaom/src/av1/exports_ident create mode 100644 libs/libaom/src/av1/exports_test create mode 100644 libs/libaom/src/build/.gitattributes create mode 100644 libs/libaom/src/build/cmake/aom_config.c.template create mode 100644 libs/libaom/src/build/cmake/aom_config_defaults.cmake create mode 100644 libs/libaom/src/build/cmake/aom_configure.cmake create mode 100644 libs/libaom/src/build/cmake/aom_experiment_deps.cmake create mode 100644 libs/libaom/src/build/cmake/aom_install.cmake create mode 100644 libs/libaom/src/build/cmake/aom_optimization.cmake create mode 100644 libs/libaom/src/build/cmake/compiler_flags.cmake create mode 100644 libs/libaom/src/build/cmake/compiler_tests.cmake create mode 100644 libs/libaom/src/build/cmake/cpu.cmake create mode 100644 libs/libaom/src/build/cmake/dist.cmake create mode 100644 libs/libaom/src/build/cmake/exports.cmake create mode 100644 libs/libaom/src/build/cmake/exports_sources.cmake create mode 100644 libs/libaom/src/build/cmake/generate_aom_config_templates.cmake create mode 100644 libs/libaom/src/build/cmake/generate_exports.cmake create mode 100644 libs/libaom/src/build/cmake/ios-Info.plist create mode 100644 libs/libaom/src/build/cmake/iosbuild.sh create mode 100644 libs/libaom/src/build/cmake/msvc_runtime.cmake create mode 100644 libs/libaom/src/build/cmake/pkg_config.cmake create mode 100644 libs/libaom/src/build/cmake/rtcd.pl create mode 100644 libs/libaom/src/build/cmake/sanitizers.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/arm-ios-common.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/arm64-android-clang.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/arm64-ios.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/arm64-linux-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/arm64-mingw-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/armv7-ios.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/armv7-linux-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/armv7-mingw-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/armv7s-ios.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/ios-simulator-common.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/mips32-linux-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/mips64-linux-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/ppc-linux-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/x86-ios-simulator.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/x86-linux.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/x86-macos.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/x86-mingw-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/x86_64-ios-simulator.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/x86_64-mingw-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/util.cmake create mode 100644 libs/libaom/src/build/cmake/version.cmake create mode 100644 
libs/libaom/src/build/cmake/version.pl create mode 100644 libs/libaom/src/codereview.settings create mode 100644 libs/libaom/src/common/args.c create mode 100644 libs/libaom/src/common/args.h create mode 100644 libs/libaom/src/common/av1_config.c create mode 100644 libs/libaom/src/common/av1_config.h create mode 100644 libs/libaom/src/common/ivfdec.c create mode 100644 libs/libaom/src/common/ivfdec.h create mode 100644 libs/libaom/src/common/ivfenc.c create mode 100644 libs/libaom/src/common/ivfenc.h create mode 100644 libs/libaom/src/common/md5_utils.c create mode 100644 libs/libaom/src/common/md5_utils.h create mode 100644 libs/libaom/src/common/obudec.c create mode 100644 libs/libaom/src/common/obudec.h create mode 100644 libs/libaom/src/common/rawenc.c create mode 100644 libs/libaom/src/common/rawenc.h create mode 100644 libs/libaom/src/common/tools_common.c create mode 100644 libs/libaom/src/common/tools_common.h create mode 100644 libs/libaom/src/common/video_common.h create mode 100644 libs/libaom/src/common/video_reader.c create mode 100644 libs/libaom/src/common/video_reader.h create mode 100644 libs/libaom/src/common/video_writer.c create mode 100644 libs/libaom/src/common/video_writer.h create mode 100644 libs/libaom/src/common/warnings.c create mode 100644 libs/libaom/src/common/warnings.h create mode 100644 libs/libaom/src/common/webmdec.cc create mode 100644 libs/libaom/src/common/webmdec.h create mode 100644 libs/libaom/src/common/webmenc.cc create mode 100644 libs/libaom/src/common/webmenc.h create mode 100644 libs/libaom/src/common/y4menc.c create mode 100644 libs/libaom/src/common/y4menc.h create mode 100644 libs/libaom/src/common/y4minput.c create mode 100644 libs/libaom/src/common/y4minput.h create mode 100644 libs/libaom/src/docs.cmake create mode 100644 libs/libaom/src/examples/analyzer.cc create mode 100644 libs/libaom/src/examples/aom_cx_set_ref.c create mode 100644 libs/libaom/src/examples/av1_dec_fuzzer.cc create mode 100644 libs/libaom/src/examples/build_av1_dec_fuzzer.sh create mode 100644 libs/libaom/src/examples/decode_to_md5.c create mode 100644 libs/libaom/src/examples/decode_with_drops.c create mode 100644 libs/libaom/src/examples/encoder_util.c create mode 100644 libs/libaom/src/examples/encoder_util.h create mode 100644 libs/libaom/src/examples/inspect.c create mode 100644 libs/libaom/src/examples/lightfield_bitstream_parsing.c create mode 100644 libs/libaom/src/examples/lightfield_decoder.c create mode 100644 libs/libaom/src/examples/lightfield_encoder.c create mode 100644 libs/libaom/src/examples/lightfield_tile_list_decoder.c create mode 100644 libs/libaom/src/examples/lossless_encoder.c create mode 100644 libs/libaom/src/examples/noise_model.c create mode 100644 libs/libaom/src/examples/resize_util.c create mode 100644 libs/libaom/src/examples/scalable_decoder.c create mode 100644 libs/libaom/src/examples/scalable_encoder.c create mode 100644 libs/libaom/src/examples/set_maps.c create mode 100644 libs/libaom/src/examples/simple_decoder.c create mode 100644 libs/libaom/src/examples/simple_encoder.c create mode 100644 libs/libaom/src/examples/svc_encoder_rtc.c create mode 100644 libs/libaom/src/examples/twopass_encoder.c create mode 100644 libs/libaom/src/keywords.dox create mode 100644 libs/libaom/src/libs.doxy_template create mode 100644 libs/libaom/src/mainpage.dox create mode 100644 libs/libaom/src/stats/aomstats.c create mode 100644 libs/libaom/src/stats/aomstats.h create mode 100644 libs/libaom/src/stats/rate_hist.c create mode 100644 
libs/libaom/src/stats/rate_hist.h create mode 100644 libs/libaom/src/test/accounting_test.cc create mode 100644 libs/libaom/src/test/acm_random.h create mode 100644 libs/libaom/src/test/active_map_test.cc create mode 100644 libs/libaom/src/test/altref_test.cc create mode 100644 libs/libaom/src/test/aom_integer_test.cc create mode 100644 libs/libaom/src/test/aomcx_set_ref.sh create mode 100644 libs/libaom/src/test/aomdec.sh create mode 100644 libs/libaom/src/test/aomenc.sh create mode 100644 libs/libaom/src/test/aq_segment_test.cc create mode 100644 libs/libaom/src/test/arf_freq_test.cc create mode 100644 libs/libaom/src/test/av1_common_int_test.cc create mode 100644 libs/libaom/src/test/av1_config_test.cc create mode 100644 libs/libaom/src/test/av1_convolve_2d_test.cc create mode 100644 libs/libaom/src/test/av1_convolve_2d_test_util.cc create mode 100644 libs/libaom/src/test/av1_convolve_2d_test_util.h create mode 100644 libs/libaom/src/test/av1_convolve_scale_test.cc create mode 100644 libs/libaom/src/test/av1_encoder_parms_get_to_decoder.cc create mode 100644 libs/libaom/src/test/av1_ext_tile_test.cc create mode 100644 libs/libaom/src/test/av1_fwd_txfm1d_test.cc create mode 100644 libs/libaom/src/test/av1_fwd_txfm2d_test.cc create mode 100644 libs/libaom/src/test/av1_highbd_iht_test.cc create mode 100644 libs/libaom/src/test/av1_horz_only_frame_superres_test.cc create mode 100644 libs/libaom/src/test/av1_inv_txfm1d_test.cc create mode 100644 libs/libaom/src/test/av1_inv_txfm2d_test.cc create mode 100644 libs/libaom/src/test/av1_nn_predict_test.cc create mode 100644 libs/libaom/src/test/av1_quantize_test.cc create mode 100644 libs/libaom/src/test/av1_round_shift_array_test.cc create mode 100644 libs/libaom/src/test/av1_txfm_test.cc create mode 100644 libs/libaom/src/test/av1_txfm_test.h create mode 100644 libs/libaom/src/test/av1_wedge_utils_test.cc create mode 100644 libs/libaom/src/test/avg_test.cc create mode 100644 libs/libaom/src/test/best_encode.sh create mode 100644 libs/libaom/src/test/binary_codes_test.cc create mode 100644 libs/libaom/src/test/blend_a64_mask_1d_test.cc create mode 100644 libs/libaom/src/test/blend_a64_mask_test.cc create mode 100644 libs/libaom/src/test/blockd_test.cc create mode 100644 libs/libaom/src/test/boolcoder_test.cc create mode 100644 libs/libaom/src/test/borders_test.cc create mode 100644 libs/libaom/src/test/cdef_test.cc create mode 100644 libs/libaom/src/test/cfl_test.cc create mode 100644 libs/libaom/src/test/clear_system_state.h create mode 100644 libs/libaom/src/test/cnn_test.cc create mode 100644 libs/libaom/src/test/codec_factory.h create mode 100644 libs/libaom/src/test/coding_path_sync.cc create mode 100644 libs/libaom/src/test/comp_avg_pred_test.cc create mode 100644 libs/libaom/src/test/comp_avg_pred_test.h create mode 100644 libs/libaom/src/test/comp_mask_variance_test.cc create mode 100644 libs/libaom/src/test/convolve_round_test.cc create mode 100644 libs/libaom/src/test/convolve_test.cc create mode 100644 libs/libaom/src/test/corner_match_test.cc create mode 100644 libs/libaom/src/test/cpu_speed_test.cc create mode 100644 libs/libaom/src/test/datarate_test.cc create mode 100644 libs/libaom/src/test/datarate_test.h create mode 100644 libs/libaom/src/test/decode_api_test.cc create mode 100644 libs/libaom/src/test/decode_multithreaded_test.cc create mode 100644 libs/libaom/src/test/decode_perf_test.cc create mode 100644 libs/libaom/src/test/decode_test_driver.cc create mode 100644 libs/libaom/src/test/decode_test_driver.h create mode 100644 
libs/libaom/src/test/decode_to_md5.sh create mode 100644 libs/libaom/src/test/decode_with_drops.sh create mode 100644 libs/libaom/src/test/divu_small_test.cc create mode 100644 libs/libaom/src/test/dr_prediction_test.cc create mode 100644 libs/libaom/src/test/dump_obu.sh create mode 100644 libs/libaom/src/test/ec_test.cc create mode 100644 libs/libaom/src/test/edge_detect_test.cc create mode 100644 libs/libaom/src/test/encode_api_test.cc create mode 100644 libs/libaom/src/test/encode_perf_test.cc create mode 100644 libs/libaom/src/test/encode_test_driver.cc create mode 100644 libs/libaom/src/test/encode_test_driver.h create mode 100644 libs/libaom/src/test/encodetxb_test.cc create mode 100644 libs/libaom/src/test/end_to_end_test.cc create mode 100644 libs/libaom/src/test/error_block_test.cc create mode 100644 libs/libaom/src/test/error_resilience_test.cc create mode 100644 libs/libaom/src/test/ethread_test.cc create mode 100644 libs/libaom/src/test/examples.sh create mode 100644 libs/libaom/src/test/external_frame_buffer_test.cc create mode 100644 libs/libaom/src/test/fdct4x4_test.cc create mode 100644 libs/libaom/src/test/fft_test.cc create mode 100644 libs/libaom/src/test/film_grain_table_test.cc create mode 100644 libs/libaom/src/test/filterintra_test.cc create mode 100644 libs/libaom/src/test/frame_error_test.cc create mode 100644 libs/libaom/src/test/frame_size_tests.cc create mode 100644 libs/libaom/src/test/function_equivalence_test.h create mode 100644 libs/libaom/src/test/fwd_kf_test.cc create mode 100644 libs/libaom/src/test/fwht4x4_test.cc create mode 100644 libs/libaom/src/test/gf_pyr_height_test.cc create mode 100644 libs/libaom/src/test/gviz_api.py create mode 100644 libs/libaom/src/test/hadamard_test.cc create mode 100644 libs/libaom/src/test/hash_test.cc create mode 100644 libs/libaom/src/test/hbd_metrics_test.cc create mode 100644 libs/libaom/src/test/hiprec_convolve_test.cc create mode 100644 libs/libaom/src/test/hiprec_convolve_test_util.cc create mode 100644 libs/libaom/src/test/hiprec_convolve_test_util.h create mode 100644 libs/libaom/src/test/horver_correlation_test.cc create mode 100644 libs/libaom/src/test/horz_superres_test.cc create mode 100644 libs/libaom/src/test/i420_video_source.h create mode 100644 libs/libaom/src/test/intra_edge_test.cc create mode 100644 libs/libaom/src/test/intrabc_test.cc create mode 100644 libs/libaom/src/test/intrapred_test.cc create mode 100644 libs/libaom/src/test/invalid_file_test.cc create mode 100644 libs/libaom/src/test/ivf_video_source.h create mode 100644 libs/libaom/src/test/level_test.cc create mode 100644 libs/libaom/src/test/lightfield_test.sh create mode 100644 libs/libaom/src/test/log2_test.cc create mode 100644 libs/libaom/src/test/lossless_test.cc create mode 100644 libs/libaom/src/test/lpf_test.cc create mode 100644 libs/libaom/src/test/masked_sad_test.cc create mode 100644 libs/libaom/src/test/masked_variance_test.cc create mode 100644 libs/libaom/src/test/md5_helper.h create mode 100644 libs/libaom/src/test/metadata_test.cc create mode 100644 libs/libaom/src/test/metrics_template.html create mode 100644 libs/libaom/src/test/monochrome_test.cc create mode 100644 libs/libaom/src/test/motion_vector_test.cc create mode 100644 libs/libaom/src/test/noise_model_test.cc create mode 100644 libs/libaom/src/test/obmc_sad_test.cc create mode 100644 libs/libaom/src/test/obmc_variance_test.cc create mode 100644 libs/libaom/src/test/pickrst_test.cc create mode 100644 libs/libaom/src/test/qm_test.cc create mode 100644 
libs/libaom/src/test/quantize_func_test.cc create mode 100644 libs/libaom/src/test/reconinter_test.cc create mode 100644 libs/libaom/src/test/register_state_check.h create mode 100644 libs/libaom/src/test/resize_test.cc create mode 100644 libs/libaom/src/test/rt_end_to_end_test.cc create mode 100644 libs/libaom/src/test/run_encodes.sh create mode 100644 libs/libaom/src/test/sad_test.cc create mode 100644 libs/libaom/src/test/sb_multipass_test.cc create mode 100644 libs/libaom/src/test/scalability_test.cc create mode 100644 libs/libaom/src/test/scan_test.cc create mode 100644 libs/libaom/src/test/segment_binarization_sync.cc create mode 100644 libs/libaom/src/test/selfguided_filter_test.cc create mode 100644 libs/libaom/src/test/set_maps.sh create mode 100644 libs/libaom/src/test/simd_avx2_test.cc create mode 100644 libs/libaom/src/test/simd_cmp_avx2.cc create mode 100644 libs/libaom/src/test/simd_cmp_impl.h create mode 100644 libs/libaom/src/test/simd_cmp_neon.cc create mode 100644 libs/libaom/src/test/simd_cmp_sse2.cc create mode 100644 libs/libaom/src/test/simd_cmp_sse4.cc create mode 100644 libs/libaom/src/test/simd_cmp_ssse3.cc create mode 100644 libs/libaom/src/test/simd_impl.h create mode 100644 libs/libaom/src/test/simd_neon_test.cc create mode 100644 libs/libaom/src/test/simd_sse2_test.cc create mode 100644 libs/libaom/src/test/simd_sse4_test.cc create mode 100644 libs/libaom/src/test/simd_ssse3_test.cc create mode 100644 libs/libaom/src/test/simple_decoder.sh create mode 100644 libs/libaom/src/test/simple_encoder.sh create mode 100644 libs/libaom/src/test/subtract_test.cc create mode 100644 libs/libaom/src/test/sum_squares_test.cc create mode 100644 libs/libaom/src/test/superframe_test.cc create mode 100644 libs/libaom/src/test/svc_datarate_test.cc create mode 100644 libs/libaom/src/test/temporal_filter_planewise_test.cc create mode 100644 libs/libaom/src/test/temporal_filter_yuv_test.cc create mode 100644 libs/libaom/src/test/test-data.sha1 create mode 100644 libs/libaom/src/test/test.cmake create mode 100644 libs/libaom/src/test/test_data_download_worker.cmake create mode 100644 libs/libaom/src/test/test_data_util.cmake create mode 100644 libs/libaom/src/test/test_intra_pred_speed.cc create mode 100644 libs/libaom/src/test/test_libaom.cc create mode 100644 libs/libaom/src/test/test_runner.cmake create mode 100644 libs/libaom/src/test/test_vector_test.cc create mode 100644 libs/libaom/src/test/test_vectors.cc create mode 100644 libs/libaom/src/test/test_vectors.h create mode 100644 libs/libaom/src/test/tile_independence_test.cc create mode 100644 libs/libaom/src/test/time_stamp_test.cc create mode 100644 libs/libaom/src/test/tools_common.sh create mode 100644 libs/libaom/src/test/transform_test_base.h create mode 100644 libs/libaom/src/test/twopass_encoder.sh create mode 100644 libs/libaom/src/test/util.h create mode 100644 libs/libaom/src/test/variance_test.cc create mode 100644 libs/libaom/src/test/video_source.h create mode 100644 libs/libaom/src/test/visual_metrics.py create mode 100644 libs/libaom/src/test/warp_filter_test.cc create mode 100644 libs/libaom/src/test/warp_filter_test_util.cc create mode 100644 libs/libaom/src/test/warp_filter_test_util.h create mode 100644 libs/libaom/src/test/webm_video_source.h create mode 100644 libs/libaom/src/test/wiener_test.cc create mode 100644 libs/libaom/src/test/y4m_test.cc create mode 100644 libs/libaom/src/test/y4m_video_source.h create mode 100644 libs/libaom/src/test/yuv_video_source.h create mode 100644 
libs/libaom/src/third_party/fastfeat/LICENSE create mode 100644 libs/libaom/src/third_party/fastfeat/README.libaom create mode 100644 libs/libaom/src/third_party/fastfeat/fast.c create mode 100644 libs/libaom/src/third_party/fastfeat/fast.h create mode 100644 libs/libaom/src/third_party/fastfeat/fast_9.c create mode 100644 libs/libaom/src/third_party/fastfeat/nonmax.c create mode 100644 libs/libaom/src/third_party/googletest/README.libaom create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/CHANGES create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/CMakeLists.txt create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/CONTRIBUTORS create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/LICENSE create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/README.md create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/cmake/Config.cmake.in create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/cmake/gtest.pc.in create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/cmake/gtest_main.pc.in create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/cmake/internal_utils.cmake create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-message.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-printers.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-spi.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_prod.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h create mode 100644 
libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-all.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-death-test.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-filepath.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-internal-inl.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-matchers.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-port.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-printers.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-test-part.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-typed-test.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest_main.cc create mode 100644 libs/libaom/src/third_party/libwebm/AUTHORS.TXT create mode 100644 libs/libaom/src/third_party/libwebm/Android.mk create mode 100644 libs/libaom/src/third_party/libwebm/LICENSE.TXT create mode 100644 libs/libaom/src/third_party/libwebm/PATENTS.TXT create mode 100644 libs/libaom/src/third_party/libwebm/README.libaom create mode 100644 libs/libaom/src/third_party/libwebm/common/file_util.cc create mode 100644 libs/libaom/src/third_party/libwebm/common/file_util.h create mode 100644 libs/libaom/src/third_party/libwebm/common/hdr_util.cc create mode 100644 libs/libaom/src/third_party/libwebm/common/hdr_util.h create mode 100644 libs/libaom/src/third_party/libwebm/common/webmids.h create mode 100644 libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.cc create mode 100644 libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.h create mode 100644 libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxertypes.h create mode 100644 libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc create mode 100644 libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.h create mode 100644 libs/libaom/src/third_party/libwebm/mkvmuxer/mkvwriter.cc create mode 100644 libs/libaom/src/third_party/libwebm/mkvmuxer/mkvwriter.h create mode 100644 libs/libaom/src/third_party/libwebm/mkvparser/mkvparser.cc create mode 100644 libs/libaom/src/third_party/libwebm/mkvparser/mkvparser.h create mode 100644 libs/libaom/src/third_party/libwebm/mkvparser/mkvreader.cc create mode 100644 libs/libaom/src/third_party/libwebm/mkvparser/mkvreader.h create mode 100644 libs/libaom/src/third_party/libyuv/README.libaom create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/basic_types.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/compare.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/convert.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/convert_argb.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/convert_from.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/convert_from_argb.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/cpu_id.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/mjpeg_decoder.h create mode 100644 
libs/libaom/src/third_party/libyuv/include/libyuv/planar_functions.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/rotate.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/rotate_argb.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/rotate_row.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/row.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/scale.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/scale_argb.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/scale_row.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/version.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/video_common.h create mode 100644 libs/libaom/src/third_party/libyuv/source/compare.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/compare_common.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/compare_gcc.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/compare_neon.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/compare_neon64.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/compare_win.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/convert.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/convert_argb.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/convert_from.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/convert_from_argb.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/convert_jpeg.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/convert_to_argb.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/convert_to_i420.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/cpu_id.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/mjpeg_decoder.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/mjpeg_validate.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/planar_functions.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate_any.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate_argb.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate_common.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate_gcc.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate_mips.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate_neon.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate_neon64.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate_win.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/row_any.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/row_common.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/row_gcc.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/row_mips.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/row_neon.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/row_neon64.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/row_win.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/row_x86.asm create mode 100644 libs/libaom/src/third_party/libyuv/source/scale.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/scale_any.cc create mode 100644 
libs/libaom/src/third_party/libyuv/source/scale_argb.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/scale_common.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/scale_gcc.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/scale_mips.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/scale_neon.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/scale_neon64.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/scale_win.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/video_common.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/x86inc.asm create mode 100644 libs/libaom/src/third_party/vector/LICENSE create mode 100644 libs/libaom/src/third_party/vector/README.libaom create mode 100644 libs/libaom/src/third_party/vector/vector.c create mode 100644 libs/libaom/src/third_party/vector/vector.h create mode 100644 libs/libaom/src/third_party/x86inc/LICENSE create mode 100644 libs/libaom/src/third_party/x86inc/README.libaom create mode 100644 libs/libaom/src/third_party/x86inc/x86inc.asm create mode 100644 libs/libaom/src/tools/aggregate_entropy_stats.py create mode 100644 libs/libaom/src/tools/aom_entropy_optimizer.c create mode 100644 libs/libaom/src/tools/cpplint.py create mode 100644 libs/libaom/src/tools/diff.py create mode 100644 libs/libaom/src/tools/dump_obu.cc create mode 100644 libs/libaom/src/tools/gen_authors.sh create mode 100644 libs/libaom/src/tools/gen_constrained_tokenset.py create mode 100644 libs/libaom/src/tools/inspect-cli.js create mode 100644 libs/libaom/src/tools/inspect-post.js create mode 100644 libs/libaom/src/tools/intersect-diffs.py create mode 100644 libs/libaom/src/tools/lint-hunks.py create mode 100644 libs/libaom/src/tools/obu_parser.cc create mode 100644 libs/libaom/src/tools/obu_parser.h create mode 100644 libs/libaom/src/tools/txfm_analyzer/txfm_gen_code.cc create mode 100644 libs/libaom/src/tools/txfm_analyzer/txfm_graph.cc create mode 100644 libs/libaom/src/tools/txfm_analyzer/txfm_graph.h create mode 100644 libs/libaom/src/tools/wrap-commit-msg.py create mode 100644 libs/libaom/src/usage.dox create mode 100644 libs/libaom/src/usage_cx.dox create mode 100644 libs/libaom/src/usage_dx.dox diff --git a/libs/libaom/src/.clang-format b/libs/libaom/src/.clang-format new file mode 100644 index 000000000..a37882007 --- /dev/null +++ b/libs/libaom/src/.clang-format @@ -0,0 +1,148 @@ +--- +Language: Cpp +# BasedOnStyle: Google +# Generated with clang-format 7.0.1 +AccessModifierOffset: -1 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: true +AllowShortIfStatementsOnASingleLine: true +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: true +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + 
SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^<ext/.*\.h>' + Priority: 2 + - Regex: '^<.*\.h>' + Priority: 1 + - Regex: '^<.*' + Priority: 2 + - Regex: '.*' + Priority: 3 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IndentCaseLabels: true +IndentPPDirectives: None +IndentWidth: 2 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Never +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Right +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' + BasedOnStyle: google + - Language: TextProto + Delimiters: + - pb + - PB + - proto + - PROTO + EnclosingFunctions: + - EqualsProto + - EquivToProto + - PARSE_PARTIAL_TEXT_PROTO + - PARSE_TEST_PROTO + - PARSE_TEXT_PROTO + - ParseTextOrDie + - ParseTextProtoOrDie + CanonicalDelimiter: '' + BasedOnStyle: google +ReflowComments: true +SortIncludes: false +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Auto +TabWidth: 8 +UseTab: Never +... 
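+# Usage sketch (an assumption, not part of the upstream file: a clang-format
+# 7.x binary, matching the "Generated with" note above, is on PATH). The
+# -style=file option makes clang-format walk up from the file being formatted
+# until it finds this configuration, so sources anywhere in the tree can be
+# formatted in place, e.g.:
+#   $ clang-format -i -style=file aom_dsp/aom_convolve.c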
+ diff --git a/libs/libaom/src/.cmake-format.py b/libs/libaom/src/.cmake-format.py new file mode 100644 index 000000000..7b0e4f08d --- /dev/null +++ b/libs/libaom/src/.cmake-format.py @@ -0,0 +1,102 @@ +# Generated with cmake-format 0.5.1 +# How wide to allow formatted cmake files +line_width = 80 + +# How many spaces to tab for indent +tab_size = 2 + +# If arglists are longer than this, break them always +max_subargs_per_line = 10 + +# If true, separate flow control names from their parentheses with a space +separate_ctrl_name_with_space = False + +# If true, separate function names from parentheses with a space +separate_fn_name_with_space = False + +# If a statement is wrapped to more than one line, then dangle the closing +# parenthesis on its own line +dangle_parens = False + +# What character to use for bulleted lists +bullet_char = '*' + +# What character to use as punctuation after numerals in an enumerated list +enum_char = '.' + +# What style line endings to use in the output. +line_ending = u'unix' + +# Format command names consistently as 'lower' or 'upper' case +command_case = u'lower' + +# Format keywords consistently as 'lower' or 'upper' case +keyword_case = u'unchanged' + +# Specify structure for custom cmake functions +additional_commands = { + "foo": { + "flags": [ + "BAR", + "BAZ" + ], + "kwargs": { + "HEADERS": "*", + "DEPENDS": "*", + "SOURCES": "*" + } + } +} + +# A list of command names which should always be wrapped +always_wrap = [] + +# Specify the order of wrapping algorithms during successive reflow attempts +algorithm_order = [0, 1, 2, 3, 4] + +# If true, the argument lists which are known to be sortable will be sorted +# lexicographically +autosort = False + +# enable comment markup parsing and reflow +enable_markup = True + +# If comment markup is enabled, don't reflow the first comment block in +# each listfile. Use this to preserve formatting of your +# copyright/license statements. +first_comment_is_literal = False + +# If comment markup is enabled, don't reflow any comment block which matches this +# (regex) pattern. Default is `None` (disabled). +literal_comment_pattern = None + +# Regular expression to match preformat fences in comments +# default=r'^\s*([`~]{3}[`~]*)(.*)$' +fence_pattern = u'^\\s*([`~]{3}[`~]*)(.*)$' + +# Regular expression to match rulers in comments +# default=r'^\s*[^\w\s]{3}.*[^\w\s]{3}$' +ruler_pattern = u'^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$' + +# If true, emit the unicode byte-order mark (BOM) at the start of the file +emit_byteorder_mark = False + +# If a comment line starts with at least this many consecutive hash characters, +# then don't lstrip() them off. This allows for lazy hash rulers where the first +# hash char is not separated by space +hashruler_min_length = 10 + +# If true, then insert a space between the first hash char and remaining hash +# chars in a hash ruler, and normalize its length to fill the column +canonicalize_hashrulers = True + +# Specify the encoding of the input file. Defaults to utf-8. +input_encoding = u'utf-8' + +# Specify the encoding of the output file. Defaults to utf-8. Note that cmake +# only claims to support utf-8 so be careful when using anything else +output_encoding = u'utf-8' + +# A dictionary containing any per-command configuration overrides. Currently +# only `command_case` is supported. 
+per_command = {} diff --git a/libs/libaom/src/.gitattributes b/libs/libaom/src/.gitattributes new file mode 100644 index 000000000..ffc6912a9 --- /dev/null +++ b/libs/libaom/src/.gitattributes @@ -0,0 +1,18 @@ +*.[chs] filter=fixtabswsp +*.[ch]pp filter=fixtabswsp +*.[ch]xx filter=fixtabswsp +*.asm filter=fixtabswsp +*.php filter=fixtabswsp +*.pl filter=fixtabswsp +*.sh filter=fixtabswsp +*.txt filter=fixwsp +[Mm]akefile filter=fixwsp +*.mk filter=fixwsp +*.rc -crlf +*.ds[pw] -crlf +*.bat -crlf +*.mmp -crlf +*.dpj -crlf +*.pjt -crlf +*.vcp -crlf +*.inf -crlf diff --git a/libs/libaom/src/.mailmap b/libs/libaom/src/.mailmap new file mode 100644 index 000000000..30fae4de7 --- /dev/null +++ b/libs/libaom/src/.mailmap @@ -0,0 +1,91 @@ +Adrian Grange +Aℓex Converse +Aℓex Converse +Alexis Ballier +Alpha Lam +Andrey Norkin +Angie Chiang +Arild Fuldseth +Arild Fuldseth +Bohan Li +Changjun Yang +Chi Yo Tsai +Chi Yo Tsai +Chm +Damon Shen +Daniele Castagna +Deb Mukherjee +Elliott Karpilovsky +Emil Keyder +Erik Niemeyer +Frederic Barbier +Fyodor Kyslov +Grant Hsu +Guillaume Martres +Guillaume Martres +Guillaume Martres +Guillaume Martres +Hangyu Kuang +Hui Su +Iole Moccagatta +Jacky Chen +James Zern +Jean-Marc Valin +Jim Bankoski +Johann Koenig +Johann Koenig +Johann Koenig +Johann Koenig +John Koleszar +Joshua Litt +Lokeshwar Reddy B +Logan Goldberg +Luc Trudeau +Luc Trudeau +Marco Paniconi +Marco Paniconi +Michael Bebenita +Michael Horowitz +Mingliang Chen +Monty Montgomery +Nathan E. Egge +Nathan E. Egge +Pascal Massimino +Pascal Massimino +Paul Wilkins +Peng Bin +Peng Bin +Peter de Rivaz +Ralph Giles +Ralph Giles +Remya Prakasan +Roger Zhou +Ronald S. Bultje +Ryan Lei +Ryan Lei +Ryan Lei +Sachin Kumar Garg +Sai Deng +Sami Pietilä +Sarah Parker +Tamar Levy +Tamar Levy +Tero Rintaluoma +Thomas Davies Thomas +Timothy B. Terriberry +Timothy B. Terriberry +Timothy B. Terriberry Tim Terriberry +Tom Finegan +Tom Finegan +Tristan Matthews +Venkat Sanampudi +Wei-Ting Lin +Wei-Ting Lin +Wenyao Liu +Yaowu Xu +Yaowu Xu +Yaowu Xu +Yaowu Xu +Yaowu Xu +Zhipin Deng +Zoe Liu diff --git a/libs/libaom/src/AUTHORS b/libs/libaom/src/AUTHORS new file mode 100644 index 000000000..f61026fc0 --- /dev/null +++ b/libs/libaom/src/AUTHORS @@ -0,0 +1,260 @@ +# This file is automatically generated from the git commit history +# by tools/gen_authors.sh. 
+ +Aamir Anis +Aaron Watry +Aasaipriya +Abo Talib Mahfoodh +Adrian Grange +Ahmad Sharif +Akshata Jadhav +Alexander Bokov +Alexander Voronov +Aℓex Converse +Alexis Ballier +Alok Ahuja +Alpha Lam +A.Mahfoodh +Ami Fischman +Andoni Morales Alastruey +Andres Mejia +Andrew Russell +Andrey Norkin +Angie Chiang +Aniket Dhok +Ankur Saxena +Arild Fuldseth +Aron Rosenberg +Attila Nagy +Bohan Li +Brennan Shacklett +Brion Vibber +Bruno Berthier +Changjun Yang +Charles 'Buck' Krasic +Cheng Chen +Cherma Rajan A +Chi Yo Tsai +Chm +Christian Duvivier +Cyril Concolato +Dake He +Damon Shen +Dandan Ding +Daniele Castagna +Daniel Kang +Daniel Max Valenzuela +Danil Chapovalov +David Barker +David Major +David Michael Barr +David Turner +Deb Mukherjee +Deepa K G +Deng +Di Chen +Dim Temp +Dmitry Kovalev +Dominic Symes +Dragan Mrdjan +Ed Baker +Edward Hervey +Ehsan Akhgari +Elliott Karpilovsky +Emil Keyder +Erik Niemeyer +Fabio Pedretti +Fangwen Fu +Fergus Simpson +Frank Bossen +Frank Galligan +Frederic Barbier +Fredrik Söderquist +Fritz Koenig +Fyodor Kyslov +Gaute Strokkenes +Geza Lore +Ghislain MARY +Giuseppe Scrivano +Gordana Cmiljanovic +Grant Hsu +Guillaume Martres +Guillermo Ballester Valor +Hamsalekha S +Hangyu Kuang +Hanno Böck +Harish Mahendrakar +Henrik Lundin +Hien Ho +Hui Su +Ilie Halip +Ilya Brailovskiy +Imdad Sardharwalla +iole moccagatta +Ivan Krasin +Ivan Maltz +Jacek Caban +Jack Haughton +Jacky Chen +James Berry +James Yu +James Zern +Jan Gerber +Jan Kratochvil +Janne Salonen +Jayasanker J +Jean-Marc Valin +Jean-Yves Avenard +Jeff Faust +Jeff Muizelaar +Jeff Petkau +Jerome Jiang +Jia Jia +Jian Zhou +Jim Bankoski +Jingning Han +Joe Young +Joey Parrish +Johann Koenig +John Koleszar +Johnny Klonaris +John Stark +Jonathan Matthews +Joshua Bleecher Snyder +Joshua Litt +Julia Robson +Justin Clift +Justin Lebar +Katsuhisa Yuasa +KO Myung-Hun +Krishna Malladi +Kyle Siefring +Larisa Markeeva +Lawrence Velázquez +Lester Lu +Linfeng Zhang +Logan Goldberg +Lokeshwar Reddy B +Lou Quillio +Luca Barbato +Luc Trudeau +Makoto Kato +Mans Rullgard +Marco Paniconi +Mark Mentovai +Martin Ettl +Martin Storsjo +Matthew Heaney +Matthieu Vaudano +Mattias Hansson +Maxym Dmytrychenko +Michael Bebenita +Michael Horowitz +Michael Kohler +Michelle Findlay-Olynyk +Mike Frysinger +Mike Hommey +Mikhal Shemer +Minghai Shang +Mingliang Chen +Mirko Bonadei +Monty Montgomery +Morton Jonuschat +Mufaddal Chakera +Nathan E. Egge +Neil Birkbeck +Nico Weber +Nithya V S +Ola Hugosson +Oleg Nalivayko +Parag Salasakar +Pascal Massimino +Patrik Westin +Paul Wilkins +Pavel Frolov +Pavol Rusnak +Paweł Hajdan +Peng Bin +Pengchong Jin +Peter Boström +Peter de Rivaz +Philip Jägenstedt +Priit Laes +Rafael Ávila de Espíndola +Rafaël Carré +Ralph Giles +Ranjit Kumar Tulabandu +Ravi Chaudhary +Remya Prakasan +Remy Foray +Rob Bradford +Robert-André Mauchin +RogerZhou +Rohit Athavale +Ronald S. Bultje +Rostislav Pehlivanov +Ruiling Song +Rui Ueyama +Rupert Swarbrick +Ryan Lei +Ryan Overbeck +Sachin Kumar Garg +Sai Deng +Sami Pietilä +Sarah Parker +Sasi Inguva +Satish Kumar Suman +Scott Graham +Scott LaVarnway +Sean DuBois +Sean McGovern +Sean Purser-Haskell +Sebastien Alaiwan +Sergey Kolomenkin +Sergey Ulanov +Shimon Doodkin +Shunyao Li +SmilingWolf +Soo-Chul Han +Stanislav Vitvitskyy +Stefan Holmer +Steinar Midtskogen +Suman Sunkara +Taekhyun Kim +Takanori MATSUURA +Tamar Levy +Tao Bai +Tarek AMARA +Tero Rintaluoma +Thijs Vermeir +Thomas Daede +Thomas Davies Thomas +Tim Kopp +Timothy B. 
Terriberry +Timo Witte +Todd Nguyen +Tom Anderson +Tom Finegan +Tristan Matthews +Umang Saini +Urvang Joshi +Venkat Sanampudi +Victoria Zhislina +Vignesh Venkatasubramanian +Vishesh +Wan-Teh Chang +Wei-Ting Lin +Wenyao Liu +Xing Jin +Xin Zhao +Yaowu Xu +Yaowu Xu +Yi Luo +Yongzhe Wang +Yue Chen +Yunqing Wang +Yury Gitman +Yushin Cho +Zhijie Yang +zhipin deng +Zoe Liu diff --git a/libs/libaom/src/CHANGELOG b/libs/libaom/src/CHANGELOG new file mode 100644 index 000000000..11da097af --- /dev/null +++ b/libs/libaom/src/CHANGELOG @@ -0,0 +1,51 @@ +2021-02-09 v2.0.2 + This release includes several bug fixes. + + - Bug fixes: + Issue 2643: Modify the assertion in temporal filter intrinsics. + + Issue 2648: Fix unit test ThreadTestLarge.EncoderResultTest/49 + assertion failure. + + Issue 2869: Add -Wimplicit-function-declaration as C flag only. + + Issue 2878: Avoid memset in the av1_filter_intra_predictor module + functions. + + Issue 2903: Fix a typo bug in apply_temporal_filter_planewise. + + Call av1_setup_frame_size() when dropping a frame in the + encode_frame_to_data_rate() function in av1/encoder/encoder.c. + +2020-11-25 v2.0.1 + This release includes two bug fixes. + + - Bug fixes: + Issue 2723: Fix crash in chroma_check() when generating a monochrome + encoded stream in real-time mode. + + Issue 2833: Fix crash on some input when reduced still picture header is + used in real-time mode and speed >=7. + +2020-05-07 v2.0.0 "Applejack" + First official release of libaom. + This release includes new real-time mode and SVC support. + + - Upgrading: + AOM_SET_POSTPROC, AOM_CODEC_CAP_POSTPROC and AOM_CODEC_USE_POSTPROC are + removed. + + AOM_SET_DBG_* is removed. + + Multi-resolution encoding is removed. + + put_frame and put_slice callbacks are removed. + + - Enhancements: + Full-sweep document update for codec controls. + +2018-06-28 v1.0.0 + AOMedia Codec Workgroup Approved version 1.0 + +2016-04-07 v0.1.0 "AOMedia Codec 1" + This release is the first Alliance for Open Media codec. diff --git a/libs/libaom/src/CMakeLists.txt b/libs/libaom/src/CMakeLists.txt new file mode 100644 index 000000000..84c8995a5 --- /dev/null +++ b/libs/libaom/src/CMakeLists.txt @@ -0,0 +1,768 @@ +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +cmake_minimum_required(VERSION 3.5) +project(AOM C CXX) + +if(NOT EMSCRIPTEN) + if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set(CMAKE_BUILD_TYPE + "Release" + CACHE STRING "Build type: Debug, Release, RelWithDebInfo or MinSizeRel" + FORCE) + endif() +endif() + +set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") +set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}") + +if("${AOM_ROOT}" STREQUAL "${AOM_CONFIG_DIR}") + message( + FATAL_ERROR "Building from within the aom source tree is not supported.\n" + "Hint: Run these commands\n" + "$ rm -rf CMakeCache.txt CMakeFiles\n" + "$ mkdir -p ../aom_build\n" "$ cd ../aom_build\n" + "And re-run CMake from the aom_build directory.") +endif() + +# Updating version info. 
+# https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info +set(SO_VERSION 2) +set(SO_FILE_VERSION 2.0.2) + +include("${AOM_ROOT}/build/cmake/aom_configure.cmake") +include("${AOM_ROOT}/aom_dsp/aom_dsp.cmake") +include("${AOM_ROOT}/aom_mem/aom_mem.cmake") +include("${AOM_ROOT}/aom_ports/aom_ports.cmake") +include("${AOM_ROOT}/aom_scale/aom_scale.cmake") +include("${AOM_ROOT}/aom_util/aom_util.cmake") +include("${AOM_ROOT}/av1/av1.cmake") +include("${AOM_ROOT}/build/cmake/aom_install.cmake") +include("${AOM_ROOT}/build/cmake/sanitizers.cmake") +include("${AOM_ROOT}/build/cmake/util.cmake") +include("${AOM_ROOT}/test/test.cmake") + +list(APPEND AOM_RTCD_SOURCES + "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h" + "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h" + "${AOM_CONFIG_DIR}/config/av1_rtcd.h" + "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl" + "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c" + "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl" + "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c" + "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl" + "${AOM_ROOT}/av1/common/av1_rtcd.c" + "${AOM_ROOT}/build/cmake/rtcd.pl") + +list(APPEND AOM_LIBWEBM_SOURCES + "${AOM_ROOT}/third_party/libwebm/common/hdr_util.cc" + "${AOM_ROOT}/third_party/libwebm/common/hdr_util.h" + "${AOM_ROOT}/third_party/libwebm/common/webmids.h" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxer.cc" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxer.h" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxertypes.h" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxerutil.h" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvwriter.cc" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvwriter.h" + "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvparser.cc" + "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvparser.h" + "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvreader.cc" + "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvreader.h") + +list(APPEND AOM_LIBYUV_SOURCES + "${AOM_ROOT}/third_party/libyuv/include/libyuv/basic_types.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert_argb.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert_from.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/cpu_id.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/planar_functions.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/rotate.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/row.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale_row.h" + "${AOM_ROOT}/third_party/libyuv/source/cpu_id.cc" + "${AOM_ROOT}/third_party/libyuv/source/planar_functions.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_any.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_common.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_gcc.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_mips.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_neon.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_neon64.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_win.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_any.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_common.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_gcc.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_mips.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_neon.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_neon64.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_win.cc") + 
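+
+# A worked reading of the versioning scheme above, assuming a non-MSVC shared
+# build (this chain is inferred from SO_VERSION=2, SO_FILE_VERSION=2.0.2 and
+# the libtool-style comments near the shared-library setup further down, not
+# from an actual build):
+#   libaom.so        -> libaom.so.2        (linker name)
+#   libaom.so.2      -> libaom.so.2.0.2    (soname recorded by dependents)
+#   libaom.so.2.0.2                        (the actual DSO for release 2.0.2)
+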
+list(APPEND AOM_SOURCES + "${AOM_CONFIG_DIR}/config/aom_config.c" + "${AOM_CONFIG_DIR}/config/aom_config.h" + "${AOM_ROOT}/aom/aom.h" + "${AOM_ROOT}/aom/aom_codec.h" + "${AOM_ROOT}/aom/aom_decoder.h" + "${AOM_ROOT}/aom/aom_encoder.h" + "${AOM_ROOT}/aom/aom_frame_buffer.h" + "${AOM_ROOT}/aom/aom_image.h" + "${AOM_ROOT}/aom/aom_integer.h" + "${AOM_ROOT}/aom/aomcx.h" + "${AOM_ROOT}/aom/aomdx.h" + "${AOM_ROOT}/aom/internal/aom_codec_internal.h" + "${AOM_ROOT}/aom/internal/aom_image_internal.h" + "${AOM_ROOT}/aom/src/aom_codec.c" + "${AOM_ROOT}/aom/src/aom_decoder.c" + "${AOM_ROOT}/aom/src/aom_encoder.c" + "${AOM_ROOT}/aom/src/aom_image.c" + "${AOM_ROOT}/aom/src/aom_integer.c") + +list(APPEND AOM_COMMON_APP_UTIL_SOURCES + "${AOM_ROOT}/common/args.c" + "${AOM_ROOT}/common/args.h" + "${AOM_ROOT}/common/av1_config.c" + "${AOM_ROOT}/common/av1_config.h" + "${AOM_ROOT}/common/md5_utils.c" + "${AOM_ROOT}/common/md5_utils.h" + "${AOM_ROOT}/common/tools_common.c" + "${AOM_ROOT}/common/tools_common.h" + "${AOM_ROOT}/common/video_common.h" + "${AOM_ROOT}/common/rawenc.c" + "${AOM_ROOT}/common/rawenc.h" + "${AOM_ROOT}/common/y4menc.c" + "${AOM_ROOT}/common/y4menc.h") + +list(APPEND AOM_DECODER_APP_UTIL_SOURCES "${AOM_ROOT}/common/ivfdec.c" + "${AOM_ROOT}/common/ivfdec.h" "${AOM_ROOT}/common/obudec.c" + "${AOM_ROOT}/common/obudec.h" "${AOM_ROOT}/common/video_reader.c" + "${AOM_ROOT}/common/video_reader.h") + +list(APPEND AOM_ENCODER_APP_UTIL_SOURCES + "${AOM_ROOT}/common/ivfenc.c" + "${AOM_ROOT}/common/ivfenc.h" + "${AOM_ROOT}/common/video_writer.c" + "${AOM_ROOT}/common/video_writer.h" + "${AOM_ROOT}/common/warnings.c" + "${AOM_ROOT}/common/warnings.h" + "${AOM_ROOT}/common/y4minput.c" + "${AOM_ROOT}/common/y4minput.h" + "${AOM_ROOT}/examples/encoder_util.h" + "${AOM_ROOT}/examples/encoder_util.c") + +list(APPEND AOM_ENCODER_STATS_SOURCES "${AOM_ROOT}/stats/aomstats.c" + "${AOM_ROOT}/stats/aomstats.h" "${AOM_ROOT}/stats/rate_hist.c" + "${AOM_ROOT}/stats/rate_hist.h") + +list(APPEND AOM_VERSION_SOURCES "${AOM_CONFIG_DIR}/config/aom_version.h") + +list(APPEND AOM_WEBM_DECODER_SOURCES "${AOM_ROOT}/common/webmdec.cc" + "${AOM_ROOT}/common/webmdec.h") + +list(APPEND AOM_WEBM_ENCODER_SOURCES "${AOM_ROOT}/common/webmenc.cc" + "${AOM_ROOT}/common/webmenc.h") + +include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR} ${AOM_ROOT}/apps + ${AOM_ROOT}/common ${AOM_ROOT}/examples ${AOM_ROOT}/stats) + +# Targets +add_library(aom_version ${AOM_VERSION_SOURCES}) +add_dummy_source_file_to_target(aom_version c) +add_custom_command(OUTPUT "${AOM_CONFIG_DIR}/config/aom_version.h" + COMMAND ${CMAKE_COMMAND} ARGS + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} + -DAOM_ROOT=${AOM_ROOT} + -DGIT_EXECUTABLE=${GIT_EXECUTABLE} + -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P + "${AOM_ROOT}/build/cmake/version.cmake" + COMMENT "Writing aom_version.h" + VERBATIM) + +add_custom_target(aom_version_check + COMMAND ${CMAKE_COMMAND} + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} + -DAOM_ROOT=${AOM_ROOT} + -DGIT_EXECUTABLE=${GIT_EXECUTABLE} + -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P + "${AOM_ROOT}/build/cmake/version.cmake" + COMMENT "Updating version info if necessary." + VERBATIM) + +if(BUILD_SHARED_LIBS AND NOT MSVC) + # Generate version file immediately for non-MSVC shared builds: The version + # string is needed for the aom target. 
+ execute_process(COMMAND ${CMAKE_COMMAND} + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} + -DAOM_ROOT=${AOM_ROOT} + -DGIT_EXECUTABLE=${GIT_EXECUTABLE} + -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P + "${AOM_ROOT}/build/cmake/version.cmake") +endif() + +add_dependencies(aom_version aom_version_check) + +# TODO(tomfinegan): Move rtcd target setup where it belongs for each rtcd +# source. +add_rtcd_build_step("${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl" + "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h" + "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c" "aom_dsp_rtcd") +add_rtcd_build_step("${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl" + "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h" + "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c" "aom_scale_rtcd") +add_rtcd_build_step("${AOM_ROOT}/av1/common/av1_rtcd_defs.pl" + "${AOM_CONFIG_DIR}/config/av1_rtcd.h" + "${AOM_ROOT}/av1/common/av1_rtcd.c" "av1_rtcd") + +add_library(aom_rtcd OBJECT ${AOM_RTCD_SOURCES}) +add_dependencies(aom_rtcd aom_version) + +if(ENABLE_EXAMPLES) + add_library(aom_encoder_stats OBJECT ${AOM_ENCODER_STATS_SOURCES}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_encoder_stats) +endif() + +add_library(aom ${AOM_SOURCES} $<TARGET_OBJECTS:aom_rtcd>) +if(BUILD_SHARED_LIBS) + add_library(aom_static STATIC ${AOM_SOURCES} $<TARGET_OBJECTS:aom_rtcd>) + set_target_properties(aom_static PROPERTIES OUTPUT_NAME aom) + + if(NOT MSVC) + # Extract version string and set VERSION/SOVERSION for the aom target. + extract_version_string("${AOM_CONFIG_DIR}/config/aom_version.h" + aom_version_triple) + + # Strip any trailing version information, if present. + string(FIND "${aom_version_triple}" "-" dash_pos) + if(NOT dash_pos EQUAL -1) + string(SUBSTRING "${aom_version_triple}" 0 ${dash_pos} aom_version_triple) + endif() + + # cmake-format: off + # VERSION is embedded in the .so file name. + # libaom.so -> libaom.so.SOVERSION + # libaom.so.SOVERSION -> libaom.so.VERSION + # libaom.so.VERSION + # cmake-format: on + set_target_properties(aom PROPERTIES SOVERSION ${SO_VERSION}) + set_target_properties(aom PROPERTIES VERSION ${SO_FILE_VERSION}) + endif() +endif() + +if(NOT MSVC AND NOT APPLE) + target_link_libraries(aom ${AOM_LIB_LINK_TYPE} m) + if(BUILD_SHARED_LIBS) + target_link_libraries(aom_static ${AOM_LIB_LINK_TYPE} m) + endif() +endif() + +# List of object and static library targets. +set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_rtcd aom_mem aom_scale aom) +if(BUILD_SHARED_LIBS) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_static) +endif() + +# Setup dependencies. +setup_aom_dsp_targets() +setup_aom_mem_targets() +setup_aom_ports_targets() +setup_aom_util_targets() +setup_aom_scale_targets() +setup_av1_targets() + +# Make all library targets depend on aom_rtcd to make sure it builds first. +foreach(aom_lib ${AOM_LIB_TARGETS}) + if(NOT "${aom_lib}" STREQUAL "aom_rtcd") + add_dependencies(${aom_lib} aom_rtcd) + endif() +endforeach() + +# Generate C/C++ stub files containing the function usage_exit(). Users of the +# aom_common_app_util library must define this function. This is a convenience +# to allow omission of the function from applications that might want to use +# other pieces of the util support without defining usage_exit(). +file(WRITE "${AOM_GEN_SRC_DIR}/usage_exit.c" "void usage_exit(void) {}") +file(WRITE "${AOM_GEN_SRC_DIR}/usage_exit.cc" + "extern \"C\" void usage_exit(void) {}") + +# +# Application and application support targets. 
+# +if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS) + add_library(aom_common_app_util OBJECT ${AOM_COMMON_APP_UTIL_SOURCES}) + if(CONFIG_AV1_DECODER) + add_library(aom_decoder_app_util OBJECT ${AOM_DECODER_APP_UTIL_SOURCES}) + # obudec depends on internal headers that require *rtcd.h + add_dependencies(aom_decoder_app_util aom_rtcd) + endif() + if(CONFIG_AV1_ENCODER) + add_library(aom_encoder_app_util OBJECT ${AOM_ENCODER_APP_UTIL_SOURCES}) + endif() +endif() + +if((CONFIG_AV1_DECODER OR CONFIG_AV1_ENCODER) AND ENABLE_EXAMPLES) + add_executable(resize_util "${AOM_ROOT}/examples/resize_util.c" + $<TARGET_OBJECTS:aom_common_app_util>) + list(APPEND AOM_APP_TARGETS resize_util) +endif() + +if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES) + add_executable(aomdec "${AOM_ROOT}/apps/aomdec.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + add_executable(decode_to_md5 "${AOM_ROOT}/examples/decode_to_md5.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + add_executable(decode_with_drops "${AOM_ROOT}/examples/decode_with_drops.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + add_executable(simple_decoder "${AOM_ROOT}/examples/simple_decoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + add_executable(scalable_decoder "${AOM_ROOT}/examples/scalable_decoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + + if(CONFIG_ANALYZER) + add_executable(analyzer "${AOM_ROOT}/examples/analyzer.cc" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + target_link_libraries(analyzer ${AOM_LIB_LINK_TYPE} ${wxWidgets_LIBRARIES}) + list(APPEND AOM_APP_TARGETS analyzer) + list(APPEND AOM_DECODER_EXAMPLE_TARGETS analyzer) + endif() + + if(CONFIG_INSPECTION) + add_executable(inspect "${AOM_ROOT}/examples/inspect.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + list(APPEND AOM_DECODER_EXAMPLE_TARGETS inspect) + + if(EMSCRIPTEN) + add_preproc_definition(_POSIX_SOURCE) + append_link_flag_to_target("inspect" "--emrun") + append_link_flag_to_target("inspect" "-s USE_PTHREADS=0") + append_link_flag_to_target("inspect" "-s WASM=1") + append_link_flag_to_target("inspect" "-s MODULARIZE=1") + append_link_flag_to_target("inspect" "-s ALLOW_MEMORY_GROWTH=1") + append_link_flag_to_target( + "inspect" "-s \'EXTRA_EXPORTED_RUNTIME_METHODS=[\"UTF8ToString\"]\'") + append_link_flag_to_target("inspect" + "-s EXPORT_NAME=\"\'DecoderModule\'\"") + append_link_flag_to_target("inspect" "--memory-init-file 0") + + if("${CMAKE_BUILD_TYPE}" STREQUAL "") + + # Default to -O3 when no build type is specified. + append_compiler_flag("-O3") + endif() + + em_link_post_js(inspect "${AOM_ROOT}/tools/inspect-post.js") + endif() + endif() + + # Maintain a list of decoder example targets. + list(APPEND AOM_DECODER_EXAMPLE_TARGETS aomdec decode_to_md5 decode_with_drops + scalable_decoder simple_decoder) + + # Add decoder examples to the app targets list. + list(APPEND AOM_APP_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS}) +endif() + +if(CONFIG_AV1_ENCODER) + if(ENABLE_EXAMPLES) + add_executable(aomenc "${AOM_ROOT}/apps/aomenc.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util> + $<TARGET_OBJECTS:aom_encoder_stats>) + add_executable(lossless_encoder "${AOM_ROOT}/examples/lossless_encoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + add_executable(set_maps "${AOM_ROOT}/examples/set_maps.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + add_executable(simple_encoder "${AOM_ROOT}/examples/simple_encoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + add_executable(twopass_encoder "${AOM_ROOT}/examples/twopass_encoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + add_executable(noise_model "${AOM_ROOT}/examples/noise_model.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + add_executable(scalable_encoder "${AOM_ROOT}/examples/scalable_encoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + + add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + + # Maintain a list of encoder example targets. 
+ list(APPEND AOM_ENCODER_EXAMPLE_TARGETS aomenc lossless_encoder noise_model + set_maps simple_encoder scalable_encoder twopass_encoder + svc_encoder_rtc) + endif() + + if(ENABLE_TOOLS) + if(CONFIG_ENTROPY_STATS AND NOT BUILD_SHARED_LIBS) + + # TODO(tomfinegan): Sort out why a simple link command with + # aom_entropy_optimizer.c won't work on macos, but dragging in all the + # helper machinery allows the link to succeed. + add_executable(aom_entropy_optimizer + "${AOM_GEN_SRC_DIR}/usage_exit.c" + "${AOM_ROOT}/tools/aom_entropy_optimizer.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + + # Maintain a list of encoder tool targets. + list(APPEND AOM_ENCODER_TOOL_TARGETS aom_entropy_optimizer) + endif() + endif() + + # Add encoder examples and tools to the targets list. + list(APPEND AOM_APP_TARGETS ${AOM_ENCODER_EXAMPLE_TARGETS} + ${AOM_ENCODER_TOOL_TARGETS}) + + if(CONFIG_TUNE_VMAF) + find_library(VMAF libvmaf.a vmaf) + if(NOT VMAF) + message(FATAL_ERROR "VMAF library not found.") + endif() + message("-- Found VMAF library: " ${VMAF}) + set_target_properties(aom PROPERTIES LINKER_LANGUAGE CXX) + if(BUILD_SHARED_LIBS) + set_target_properties(aom_static PROPERTIES LINKER_LANGUAGE CXX) + endif() + target_link_libraries(aom PRIVATE ${VMAF}) + endif() +endif() + +if(ENABLE_EXAMPLES) + + # Maintain a separate variable listing only the examples to facilitate + # installation of example programs into an examples sub directory of + # $AOM_DIST_DIR/bin when building the dist target. + list(APPEND AOM_EXAMPLE_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS} + ${AOM_ENCODER_EXAMPLE_TARGETS}) +endif() + +if(ENABLE_TOOLS) + if(CONFIG_AV1_DECODER) + add_executable(dump_obu "${AOM_GEN_SRC_DIR}/usage_exit.cc" + "${AOM_ROOT}/tools/dump_obu.cc" + "${AOM_ROOT}/tools/obu_parser.cc" + "${AOM_ROOT}/tools/obu_parser.h" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + + list(APPEND AOM_TOOL_TARGETS dump_obu) + list(APPEND AOM_APP_TARGETS dump_obu) + + # Maintain a separate variable listing only the examples to facilitate + # installation of example programs into a tools sub directory of + # $AOM_DIST_DIR/bin when building the dist target. 
+ list(APPEND AOM_TOOL_TARGETS ${AOM_DECODER_TOOL_TARGETS} + ${AOM_ENCODER_TOOL_TARGETS}) + endif() +endif() + +if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER) + add_executable(aom_cx_set_ref "${AOM_ROOT}/examples/aom_cx_set_ref.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + list(APPEND AOM_EXAMPLE_TARGETS aom_cx_set_ref) + list(APPEND AOM_APP_TARGETS aom_cx_set_ref) +endif() + +if(ENABLE_EXAMPLES AND CONFIG_AV1_ENCODER) + add_executable(lightfield_encoder "${AOM_ROOT}/examples/lightfield_encoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + list(APPEND AOM_EXAMPLE_TARGETS lightfield_encoder) + list(APPEND AOM_APP_TARGETS lightfield_encoder) +endif() + +if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER) + add_executable(lightfield_tile_list_decoder + "${AOM_ROOT}/examples/lightfield_tile_list_decoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + list(APPEND AOM_EXAMPLE_TARGETS lightfield_tile_list_decoder) + list(APPEND AOM_APP_TARGETS lightfield_tile_list_decoder) +endif() + +if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER) + add_executable(lightfield_decoder "${AOM_ROOT}/examples/lightfield_decoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + list(APPEND AOM_EXAMPLE_TARGETS lightfield_decoder) + list(APPEND AOM_APP_TARGETS lightfield_decoder) +endif() + +if(ENABLE_EXAMPLES AND CONFIG_AV1_ENCODER AND CONFIG_AV1_DECODER) + add_executable(lightfield_bitstream_parsing + "${AOM_ROOT}/examples/lightfield_bitstream_parsing.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + list(APPEND AOM_EXAMPLE_TARGETS lightfield_bitstream_parsing) + list(APPEND AOM_APP_TARGETS lightfield_bitstream_parsing) +endif() + +foreach(aom_app ${AOM_APP_TARGETS}) + target_link_libraries(${aom_app} ${AOM_LIB_LINK_TYPE} aom) +endforeach() + +if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS) + if(CONFIG_LIBYUV) + add_library(yuv OBJECT ${AOM_LIBYUV_SOURCES}) + if(NOT MSVC) + target_compile_options(yuv PRIVATE -Wno-unused-parameter) + endif() + include_directories("${AOM_ROOT}/third_party/libyuv/include") + + # Add to existing targets. + foreach(aom_app ${AOM_APP_TARGETS}) + target_sources(${aom_app} PRIVATE $<TARGET_OBJECTS:yuv>) + set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX) + endforeach() + endif() + + if(CONFIG_WEBM_IO) + add_library(webm OBJECT ${AOM_LIBWEBM_SOURCES}) + include_directories("${AOM_ROOT}/third_party/libwebm") + target_compile_definitions(webm PRIVATE __STDC_CONSTANT_MACROS) + target_compile_definitions(webm PRIVATE __STDC_LIMIT_MACROS) + + if(NOT MSVC) + target_compile_options(webm PRIVATE -Wno-shadow) + endif() + + # Add to existing targets. + if(CONFIG_AV1_DECODER) + target_sources(aom_decoder_app_util PRIVATE ${AOM_WEBM_DECODER_SOURCES}) + endif() + + if(CONFIG_AV1_ENCODER) + target_sources(aom_encoder_app_util PRIVATE ${AOM_WEBM_ENCODER_SOURCES}) + endif() + + foreach(aom_app ${AOM_APP_TARGETS}) + target_sources(${aom_app} PRIVATE $<TARGET_OBJECTS:webm>) + set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX) + endforeach() + endif() +endif() + +if(ENABLE_TESTS) + + # Create test_libaom target and the targets it depends on. + setup_aom_test_targets() +endif() + +if(HAVE_PTHREAD_H AND CONFIG_MULTITHREAD) + find_package(Threads) + target_link_libraries(aom ${AOM_LIB_LINK_TYPE} Threads::Threads) + if(BUILD_SHARED_LIBS) + target_link_libraries(aom_static ${AOM_LIB_LINK_TYPE} Threads::Threads) + endif() +endif() + +if(XCODE) + + # TODO(tomfinegan): Make sure target has no C++ files before doing this as + # it's not necessary in that case. + if(CONFIG_LIBYUV OR CONFIG_WEBM_IO) + + # The Xcode generator does not obey LINKER_LANGUAGE. 
Because of this issue, + # anything that looks like a C++ file needs to be in any target that Xcode + # will link when the target contains a C++ dependency. Without this, Xcode + # will try to link with the C linker, which always ends badly when a + # dependency actually includes C++. + + # Note: LINKER_LANGUAGE is explicitly set to C++ for all targets touched + # here; it really is the Xcode generator's fault, or just a deficiency in + # Xcode itself. + foreach(aom_app ${AOM_APP_TARGETS}) + add_dummy_source_file_to_target("${aom_app}" "cc") + endforeach() + endif() +endif() + +if(ENABLE_EXAMPLES AND "${CMAKE_GENERATOR}" MATCHES "Makefiles$") + + # For historical purposes place the example binaries in the example directory. + file(MAKE_DIRECTORY "${AOM_CONFIG_DIR}/examples") + + foreach(target ${AOM_EXAMPLE_TARGETS}) + if(NOT "${target}" MATCHES "aomdec\|aomenc") + set_target_properties(${target} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "${AOM_CONFIG_DIR}/examples") + endif() + endforeach() + + if(ENABLE_TOOLS AND AOM_TOOL_TARGETS) + + # The same expectation is true for tool targets. + file(MAKE_DIRECTORY "${AOM_CONFIG_DIR}/tools") + set_target_properties(${AOM_TOOL_TARGETS} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "${AOM_CONFIG_DIR}/tools") + endif() +endif() + +if(BUILD_SHARED_LIBS) + include("${AOM_ROOT}/build/cmake/exports.cmake") + setup_exports_target() +endif() + +# Handle user supplied compile and link flags last to ensure they're obeyed. +set_user_flags() + +# Aomedia documentation rule. +if(ENABLE_DOCS) + include(FindDoxygen) + if(DOXYGEN_FOUND) + include("${AOM_ROOT}/docs.cmake") + setup_documentation_targets() + else() + message("--- Cannot find doxygen, ENABLE_DOCS turned off.") + set(ENABLE_DOCS OFF) + endif() +endif() + +# Aomedia dist rule. +if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES) + list(APPEND AOM_DIST_APPS $<TARGET_FILE:aomdec>) +endif() +if(CONFIG_AV1_ENCODER AND ENABLE_EXAMPLES) + list(APPEND AOM_DIST_APPS $<TARGET_FILE:aomenc>) +endif() + +if(ENABLE_EXAMPLES) + foreach(example ${AOM_EXAMPLE_TARGETS}) + list(APPEND AOM_DIST_EXAMPLES $<TARGET_FILE:${example}>) + endforeach() +endif() + +if(ENABLE_TOOLS) + foreach(tool ${AOM_TOOL_TARGETS}) + list(APPEND AOM_DIST_TOOLS $<TARGET_FILE:${tool}>) + endforeach() +endif() + +if(NOT AOM_DIST_DIR) + set(AOM_DIST_DIR "${AOM_CONFIG_DIR}/dist") +endif() + +add_custom_target(dist + COMMAND ${CMAKE_COMMAND} + -DAOM_ROOT=${AOM_ROOT} + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} + -DAOM_DIST_DIR=${AOM_DIST_DIR} + -DAOM_DIST_APPS="${AOM_DIST_APPS}" + -DAOM_DIST_EXAMPLES="${AOM_DIST_EXAMPLES}" + -DAOM_DIST_TOOLS="${AOM_DIST_TOOLS}" + -DAOM_DIST_INCLUDES="${AOM_INSTALL_INCS}" + -DAOM_DIST_LIBS=$<TARGET_FILE:aom> + -DENABLE_DOCS=${ENABLE_DOCS} -P + "${AOM_ROOT}/build/cmake/dist.cmake" + DEPENDS ${AOM_INSTALL_BINS} ${AOM_INSTALL_LIBS} + ${AOM_INSTALL_INCS} ${AOM_EXAMPLE_TARGETS} + ${AOM_TOOL_TARGETS}) + +if(ENABLE_DOCS) + add_dependencies(dist docs) +endif() + +# Collect all variables containing libaom source files. +get_cmake_property(all_cmake_vars VARIABLES) +foreach(var ${all_cmake_vars}) + if("${var}" MATCHES "SOURCES$\|_INTRIN_\|_ASM_" + AND NOT "${var}" MATCHES "_APP_\|DOXYGEN\|LIBWEBM\|LIBYUV\|_PKG_\|TEST") + list(APPEND aom_source_vars ${var}) + endif() +endforeach() + +# Libaom_srcs.txt generation. +set(libaom_srcs_txt_file "${AOM_CONFIG_DIR}/libaom_srcs.txt") +file(WRITE "${libaom_srcs_txt_file}" "# This file is generated. DO NOT EDIT.\n") + +# Static source file list first. 
+foreach(aom_source_var ${aom_source_vars}) + foreach(file ${${aom_source_var}}) + if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}") + string(REPLACE "${AOM_ROOT}/" "" file "${file}") + file(APPEND "${libaom_srcs_txt_file}" "${file}\n") + endif() + endforeach() +endforeach() + +file(APPEND "${libaom_srcs_txt_file}" + "# Files below this line are generated by the libaom build system.\n") +foreach(aom_source_var ${aom_source_vars}) + foreach(file ${${aom_source_var}}) + if("${file}" MATCHES "${AOM_CONFIG_DIR}") + string(REPLACE "${AOM_CONFIG_DIR}/" "" file "${file}") + file(APPEND "${libaom_srcs_txt_file}" "${file}\n") + endif() + endforeach() +endforeach() + +# Libaom_srcs.gni generation. +set(libaom_srcs_gni_file "${AOM_CONFIG_DIR}/libaom_srcs.gni") +file(WRITE "${libaom_srcs_gni_file}" "# This file is generated. DO NOT EDIT.\n") + +foreach(aom_source_var ${aom_source_vars}) + if("${${aom_source_var}}" MATCHES "${AOM_ROOT}") + string(TOLOWER ${aom_source_var} aom_source_var_lowercase) + file(APPEND "${libaom_srcs_gni_file}" "\n${aom_source_var_lowercase} = [\n") + endif() + + foreach(file ${${aom_source_var}}) + if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}") + string(REPLACE "${AOM_ROOT}" "//third_party/libaom/source/libaom" file + "${file}") + file(APPEND "${libaom_srcs_gni_file}" " \"${file}\",\n") + endif() + endforeach() + + if("${${aom_source_var}}" MATCHES "${AOM_ROOT}") + file(APPEND "${libaom_srcs_gni_file}" "]\n") + endif() +endforeach() + +file(APPEND "${libaom_srcs_gni_file}" + "\n# Files below this line are generated by the libaom build system.\n") + +foreach(aom_source_var ${aom_source_vars}) + if("${${aom_source_var}}" MATCHES "${AOM_CONFIG_DIR}") + string(TOLOWER ${aom_source_var} aom_source_var_lowercase) + file(APPEND "${libaom_srcs_gni_file}" + "\n${aom_source_var_lowercase}_gen = [\n") + endif() + foreach(file ${${aom_source_var}}) + if(NOT "${file}" MATCHES "${AOM_ROOT}") + string(REPLACE "${AOM_CONFIG_DIR}" "//third_party/libaom/source/libaom" + file "${file}") + file(APPEND "${libaom_srcs_gni_file}" " \"${file}\",\n") + endif() + endforeach() + + if("${${aom_source_var}}" MATCHES "${AOM_CONFIG_DIR}") + file(APPEND "${libaom_srcs_gni_file}" "]\n") + endif() +endforeach() + +# Generate aom.pc and setup install rule. +setup_aom_install_targets() diff --git a/libs/libaom/src/LICENSE b/libs/libaom/src/LICENSE new file mode 100644 index 000000000..fc340c376 --- /dev/null +++ b/libs/libaom/src/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2016, Alliance for Open Media. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + diff --git a/libs/libaom/src/PATENTS b/libs/libaom/src/PATENTS new file mode 100644 index 000000000..493f61637 --- /dev/null +++ b/libs/libaom/src/PATENTS @@ -0,0 +1,108 @@ +Alliance for Open Media Patent License 1.0 + +1. License Terms. + +1.1. Patent License. Subject to the terms and conditions of this License, each + Licensor, on behalf of itself and successors in interest and assigns, + grants Licensee a non-sublicensable, perpetual, worldwide, non-exclusive, + no-charge, royalty-free, irrevocable (except as expressly stated in this + License) patent license to its Necessary Claims to make, use, sell, offer + for sale, import or distribute any Implementation. + +1.2. Conditions. + +1.2.1. Availability. As a condition to the grant of rights to Licensee to make, + sell, offer for sale, import or distribute an Implementation under + Section 1.1, Licensee must make its Necessary Claims available under + this License, and must reproduce this License with any Implementation + as follows: + + a. For distribution in source code, by including this License in the + root directory of the source code with its Implementation. + + b. For distribution in any other form (including binary, object form, + and/or hardware description code (e.g., HDL, RTL, Gate Level Netlist, + GDSII, etc.)), by including this License in the documentation, legal + notices, and/or other written materials provided with the + Implementation. + +1.2.2. Additional Conditions. This license is directly from Licensor to + Licensee. Licensee acknowledges as a condition of benefiting from it + that no rights from Licensor are received from suppliers, distributors, + or otherwise in connection with this License. + +1.3. Defensive Termination. If any Licensee, its Affiliates, or its agents + initiates patent litigation or files, maintains, or voluntarily + participates in a lawsuit against another entity or any person asserting + that any Implementation infringes Necessary Claims, any patent licenses + granted under this License directly to the Licensee are immediately + terminated as of the date of the initiation of action unless 1) that suit + was in response to a corresponding suit regarding an Implementation first + brought against an initiating entity, or 2) that suit was brought to + enforce the terms of this License (including intervention in a third-party + action by a Licensee). + +1.4. Disclaimers. The Reference Implementation and Specification are provided + "AS IS" and without warranty. The entire risk as to implementing or + otherwise using the Reference Implementation or Specification is assumed + by the implementer and user. Licensor expressly disclaims any warranties + (express, implied, or otherwise), including implied warranties of + merchantability, non-infringement, fitness for a particular purpose, or + title, related to the material. 
IN NO EVENT WILL LICENSOR BE LIABLE TO + ANY OTHER PARTY FOR LOST PROFITS OR ANY FORM OF INDIRECT, SPECIAL, + INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF + ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH + OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR + NOT THE OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +2. Definitions. + +2.1. Affiliate. "Affiliate" means an entity that directly or indirectly + Controls, is Controlled by, or is under common Control of that party. + +2.2. Control. "Control" means direct or indirect control of more than 50% of + the voting power to elect directors of that corporation, or for any other + entity, the power to direct management of such entity. + +2.3. Decoder. "Decoder" means any decoder that conforms fully with all + non-optional portions of the Specification. + +2.4. Encoder. "Encoder" means any encoder that produces a bitstream that can + be decoded by a Decoder only to the extent it produces such a bitstream. + +2.5. Final Deliverable. "Final Deliverable" means the final version of a + deliverable approved by the Alliance for Open Media as a Final + Deliverable. + +2.6. Implementation. "Implementation" means any implementation, including the + Reference Implementation, that is an Encoder and/or a Decoder. An + Implementation also includes components of an Implementation only to the + extent they are used as part of an Implementation. + +2.7. License. "License" means this license. + +2.8. Licensee. "Licensee" means any person or entity who exercises patent + rights granted under this License. + +2.9. Licensor. "Licensor" means (i) any Licensee that makes, sells, offers + for sale, imports or distributes any Implementation, or (ii) a person + or entity that has a licensing obligation to the Implementation as a + result of its membership and/or participation in the Alliance for Open + Media working group that developed the Specification. + +2.10. Necessary Claims. "Necessary Claims" means all claims of patents or + patent applications, (a) that currently or at any time in the future, + are owned or controlled by the Licensor, and (b) (i) would be an + Essential Claim as defined by the W3C Policy as of February 5, 2004 + (https://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential) + as if the Specification was a W3C Recommendation; or (ii) are infringed + by the Reference Implementation. + +2.11. Reference Implementation. "Reference Implementation" means an Encoder + and/or Decoder released by the Alliance for Open Media as a Final + Deliverable. + +2.12. Specification. "Specification" means the specification designated by + the Alliance for Open Media as a Final Deliverable for which this + License was issued. + diff --git a/libs/libaom/src/README.md b/libs/libaom/src/README.md new file mode 100644 index 000000000..cf057ae6c --- /dev/null +++ b/libs/libaom/src/README.md @@ -0,0 +1,665 @@ +# AV1 Codec Library + +## Contents +1. 
[Building the library and applications](#building-the-library-and-applications) + - [Prerequisites](#prerequisites) + - [Get the code](#get-the-code) + - [Basics](#basic-build) + - [Configuration options](#configuration-options) + - [Dylib builds](#dylib-builds) + - [Debugging](#debugging) + - [Cross compiling](#cross-compiling) + - [Sanitizer support](#sanitizers) + - [MSVC builds](#microsoft-visual-studio-builds) + - [Xcode builds](#xcode-builds) + - [Emscripten builds](#emscripten-builds) + - [Extra Build Flags](#extra-build-flags) + - [Build with VMAF support](#build-with-vmaf) +2. [Testing the library](#testing-the-av1-codec) + - [Basics](#testing-basics) + - [Unit tests](#1_unit-tests) + - [Example tests](#2_example-tests) + - [Encoder tests](#3_encoder-tests) + - [IDE hosted tests](#ide-hosted-tests) + - [Downloading test data](#downloading-the-test-data) + - [Adding a new test data file](#adding-a-new-test-data-file) + - [Additional test data](#additional-test-data) + - [Sharded testing](#sharded-testing) + - [Running tests directly](#1_running-test_libaom-directly) + - [Running tests via CMake](#2_running-the-tests-via-the-cmake-build) +3. [Coding style](#coding-style) +4. [Submitting patches](#submitting-patches) + - [Login cookie](#login-cookie) + - [Contributor agreement](#contributor-agreement) + - [Testing your code](#testing-your-code) + - [Commit message hook](#commit-message-hook) + - [Upload your change](#upload-your-change) + - [Incorporating Reviewer Comments](#incorporating-reviewer-comments) + - [Submitting your change](#submitting-your-change) + - [Viewing change status](#viewing-the-status-of-uploaded-changes) +5. [Support](#support) +6. [Bug reports](#bug-reports) + +## Building the library and applications + +### Prerequisites + + 1. [CMake](https://cmake.org) version 3.5 or higher. + 2. [Git](https://git-scm.com/). + 3. [Perl](https://www.perl.org/). + 4. For x86 targets, [yasm](http://yasm.tortall.net/), which is preferred, or a + recent version of [nasm](http://www.nasm.us/). If you download yasm with + the intention to work with Visual Studio, please download win32.exe or + win64.exe and rename it to yasm.exe. DO NOT download or use vsyasm.exe. + 5. Building the documentation requires [doxygen](http://doxygen.org). + 6. Building the unit tests requires [Python](https://www.python.org/). + 7. Emscripten builds require the portable + [EMSDK](https://kripken.github.io/emscripten-site/index.html). + +### Get the code + +The AV1 library source code is stored in the Alliance for Open Media Git +repository: + +~~~ + $ git clone https://aomedia.googlesource.com/aom + # By default, the above command stores the source in the aom directory: + $ cd aom +~~~ + +### Basic build + +CMake replaces the configure step typical of many projects. Running CMake will +produce configuration and build files for the currently selected CMake +generator. For most systems the default generator is Unix Makefiles. The basic +form of a makefile build is the following: + +~~~ + $ cmake path/to/aom + $ make +~~~ + +The above will generate a makefile build that produces the AV1 library and +applications for the current host system after the make step completes +successfully. The compiler chosen varies by host platform, but a general rule +applies: On systems where cc and c++ are present in $PATH at the time CMake is +run the generated build will use cc and c++ by default. + +### Configuration options + +The AV1 codec library has a great many configuration options. 
These come in two +varieties: + + 1. Build system configuration options. These have the form `ENABLE_FEATURE`. + 2. AV1 codec configuration options. These have the form `CONFIG_FEATURE`. + +Both types of options are set at the time CMake is run. The following example +enables ccache and disables the AV1 encoder: + +~~~ + $ cmake path/to/aom -DENABLE_CCACHE=1 -DCONFIG_AV1_ENCODER=0 + $ make +~~~ + +The available configuration options are too numerous to list here. Build system +configuration options can be found at the top of the CMakeLists.txt file found +in the root of the AV1 repository, and AV1 codec configuration options can +currently be found in the file `build/cmake/aom_config_defaults.cmake`. + +### Dylib builds + +A dylib (shared object) build of the AV1 codec library can be enabled via the +CMake built in variable `BUILD_SHARED_LIBS`: + +~~~ + $ cmake path/to/aom -DBUILD_SHARED_LIBS=1 + $ make +~~~ + +This is currently only supported on non-Windows targets. + +### Debugging + +Depending on the generator used, there are multiple ways of going about +debugging AV1 components. For single configuration generators like the Unix +Makefiles generator, setting `CMAKE_BUILD_TYPE` to Debug is sufficient: + +~~~ + $ cmake path/to/aom -DCMAKE_BUILD_TYPE=Debug +~~~ + +For Xcode, mainly because configuration controls for Xcode builds are buried two +configuration windows deep and must be set for each subproject within the Xcode +IDE individually, `CMAKE_CONFIGURATION_TYPES` should be set to Debug: + +~~~ + $ cmake path/to/aom -G Xcode -DCMAKE_CONFIGURATION_TYPES=Debug +~~~ + +For Visual Studio the in-IDE configuration controls should be used. Simply set +the IDE project configuration to Debug to allow for stepping through the code. + +In addition to the above, it can sometimes be useful to debug only C and C++ +code. To disable all assembly code and intrinsics set `AOM_TARGET_CPU` to +generic at generation time: + +~~~ + $ cmake path/to/aom -DAOM_TARGET_CPU=generic +~~~ + +### Cross compiling + +For the purposes of building the AV1 codec and applications and relative to the +scope of this guide, all builds for architectures differing from the native host +architecture will be considered cross compiles. The AV1 CMake build handles +cross compiling via the use of toolchain files included in the AV1 repository. +The toolchain files available at the time of this writing are: + + - arm64-ios.cmake + - arm64-linux-gcc.cmake + - arm64-mingw-gcc.cmake + - armv7-ios.cmake + - armv7-linux-gcc.cmake + - armv7-mingw-gcc.cmake + - armv7s-ios.cmake + - mips32-linux-gcc.cmake + - mips64-linux-gcc.cmake + - x86-ios-simulator.cmake + - x86-linux.cmake + - x86-macos.cmake + - x86-mingw-gcc.cmake + - x86\_64-ios-simulator.cmake + - x86\_64-mingw-gcc.cmake + +The following example demonstrates use of the x86-macos.cmake toolchain file on +an x86\_64 macOS host: + +~~~ + $ cmake path/to/aom \ + -DCMAKE_TOOLCHAIN_FILE=path/to/aom/build/cmake/toolchains/x86-macos.cmake + $ make +~~~ + +To build for an unlisted target, creation of a new toolchain file is the best +solution. The existing toolchain files can be used as a starting point for a new +toolchain file since each one exposes the basic requirements for toolchain files +as used in the AV1 codec build.
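+
+To make those requirements concrete, the sketch below shows the general shape of
+such a file. It is illustrative only: the file name and cross-compiler names are
+hypothetical, and the real files under `build/cmake/toolchains/` set additional
+variables the AV1 build expects, so start from one of those rather than from
+this sketch.
+
+~~~
+ # hypothetical-arm64-linux-gcc.cmake -- a minimal, illustrative sketch only.
+ # Pin the target system and processor so CMake enters cross-compile mode.
+ set(CMAKE_SYSTEM_NAME Linux)
+ set(CMAKE_SYSTEM_PROCESSOR aarch64)
+ # Point CMake at the cross tools (these tool names are assumptions).
+ set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
+ set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+~~~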
+ +As a temporary workaround, an unoptimized AV1 configuration that builds only C +and C++ sources can be produced using the following commands: + +~~~ + $ cmake path/to/aom -DAOM_TARGET_CPU=generic + $ make +~~~ + +In addition to the above, it's important to note that the toolchain files +suffixed with gcc behave differently than the others. These toolchain files +attempt to obey the $CROSS environment variable. + +### Sanitizers + +Sanitizer integration is built into the CMake build system. To enable a +sanitizer, add `-DSANITIZE=<type>` to the CMake command line. For example, to +enable address sanitizer: + +~~~ + $ cmake path/to/aom -DSANITIZE=address + $ make +~~~ + +Sanitizers available vary by platform, target, and compiler. Consult your +compiler documentation to determine which, if any, are available. + +### Microsoft Visual Studio builds + +Building the AV1 codec library in Microsoft Visual Studio is supported. Visual +Studio 2017 (15.0) or later is required. The following example demonstrates +generating projects and a solution for the Microsoft IDE: + +~~~ + # This does not require a bash shell; Command Prompt (cmd.exe) is fine. + # This assumes the build host is a Windows x64 computer. + + # To build with Visual Studio 2019 for the x64 target: + $ cmake path/to/aom -G "Visual Studio 16 2019" + $ cmake --build . + + # To build with Visual Studio 2019 for the 32-bit x86 target: + $ cmake path/to/aom -G "Visual Studio 16 2019" -A Win32 + $ cmake --build . + + # To build with Visual Studio 2017 for the x64 target: + $ cmake path/to/aom -G "Visual Studio 15 2017" -T host=x64 -A x64 + $ cmake --build . + + # To build with Visual Studio 2017 for the 32-bit x86 target: + $ cmake path/to/aom -G "Visual Studio 15 2017" -T host=x64 + $ cmake --build . +~~~ + +NOTE: The build system targets Windows 7 or later by compiling files with +`-D_WIN32_WINNT=0x0601`. + +### Xcode builds + +Building the AV1 codec library in Xcode is supported. The following example +demonstrates generating an Xcode project: + +~~~ + $ cmake path/to/aom -G Xcode +~~~ + +### Emscripten builds + +Building the AV1 codec library with Emscripten is supported. Typically this is +used to hook into the AOMAnalyzer GUI application. These instructions focus on +using the inspector with AOMAnalyzer, but all tools can be built with +Emscripten. + +It is assumed here that you have already downloaded and installed the EMSDK, +installed and activated at least one toolchain, and set up your environment +appropriately using the emsdk\_env script. + +1. Download [AOMAnalyzer](https://people.xiph.org/~mbebenita/analyzer/). + +2. Configure the build: + +~~~ + $ cmake path/to/aom \ + -DENABLE_CCACHE=1 \ + -DAOM_TARGET_CPU=generic \ + -DENABLE_DOCS=0 \ + -DENABLE_TESTS=0 \ + -DCONFIG_ACCOUNTING=1 \ + -DCONFIG_INSPECTION=1 \ + -DCONFIG_MULTITHREAD=0 \ + -DCONFIG_RUNTIME_CPU_DETECT=0 \ + -DCONFIG_WEBM_IO=0 \ + -DCMAKE_TOOLCHAIN_FILE=path/to/emsdk-portable/.../Emscripten.cmake +~~~ + +3. Build it: run make if that's your generator of choice: + +~~~ + $ make inspect +~~~ + +4. Run the analyzer: + +~~~ + # inspect.js is in the examples sub directory of the directory in which you + # executed cmake. + $ path/to/AOMAnalyzer path/to/examples/inspect.js path/to/av1/input/file +~~~ + +### Extra build flags + +Three variables allow for passing of additional flags to the build system.
+ +- AOM\_EXTRA\_C\_FLAGS +- AOM\_EXTRA\_CXX\_FLAGS +- AOM\_EXTRA\_EXE\_LINKER\_FLAGS + +The build system attempts to ensure the flags passed through the above variables +are passed to the tools last, in order to allow them to override default +behavior. These flags can be used, for example, to enable asserts in a release +build: + +~~~ + $ cmake path/to/aom \ + -DCMAKE_BUILD_TYPE=Release \ + -DAOM_EXTRA_C_FLAGS=-UNDEBUG \ + -DAOM_EXTRA_CXX_FLAGS=-UNDEBUG +~~~ + +### Build with VMAF support + +After installing +[libvmaf.a](https://github.com/Netflix/vmaf/blob/master/resource/doc/libvmaf.md), +you can use it with the encoder: + +~~~ + $ cmake path/to/aom -DCONFIG_TUNE_VMAF=1 +~~~ + +Please note that the default VMAF model +("/usr/local/share/model/vmaf_v0.6.1.pkl") +will be used unless you set the following flag when running the encoder: + +~~~ + # --vmaf-model-path=path/to/model +~~~ + +## Testing the AV1 codec + +### Testing basics + +There are several methods of testing the AV1 codec. All of these methods require +the presence of the AV1 source code and a working build of the AV1 library and +applications. + +#### 1. Unit tests: + +The unit tests can be run at build time: + +~~~ + # Before running the make command the LIBAOM_TEST_DATA_PATH environment + # variable should be set to avoid downloading the test files to the + # cmake build configuration directory. + $ cmake path/to/aom + # Note: The AV1 CMake build creates many test targets. Running make + # with multiple jobs will speed up the test run significantly. + $ make runtests +~~~ + +#### 2. Example tests: + +The example tests require a bash shell and can be run in the following manner: + +~~~ + # See the note about LIBAOM_TEST_DATA_PATH above. + $ cmake path/to/aom + $ make + # It's best to build the testdata target using many make jobs. + # Running it like this will verify and download (if necessary) + # one at a time, which takes a while. + $ make testdata + $ path/to/aom/test/examples.sh --bin-path examples +~~~ + +#### 3. Encoder tests: + +When making a change to the encoder, run the encoder tests to confirm that your +change has a positive or negligible impact on encode quality. When running these +tests, the build configuration should be changed to enable internal encoder +statistics: + +~~~ + $ cmake path/to/aom -DCONFIG_INTERNAL_STATS=1 + $ make +~~~ + +The repository contains scripts intended to make running these tests as simple +as possible. The following example demonstrates creating a set of baseline clips +for comparison to results produced after making your change to libaom: + +~~~ + # This will encode all Y4M files in the current directory using the + # settings specified to create the encoder baseline statistical data: + $ cd path/to/test/inputs + # This command line assumes that run_encodes.sh, its helper script + # best_encode.sh, and the aomenc you intend to test are all within a + # directory in your PATH. + $ run_encodes.sh 200 500 50 baseline +~~~ + +After making your change and creating the baseline clips, you'll need to run +encodes that include your change(s) to confirm that things are working as +intended: + +~~~ + # This will encode all Y4M files in the current directory using the + # settings specified to create the statistical data for your change: + $ cd path/to/test/inputs + # This command line assumes that run_encodes.sh, its helper script + # best_encode.sh, and the aomenc you intend to test are all within a + # directory in your PATH.
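+ # Note: the numeric arguments match the baseline run above; only the final
+ # label differs, which keeps the two statistical data sets directly
+ # comparable.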
+ $ run_encodes.sh 200 500 50 mytweak +~~~ + +After creating both data sets, you can use `test/visual_metrics.py` to generate a +report that can be viewed in a web browser: + +~~~ + $ visual_metrics.py metrics_template.html "*stt" baseline mytweak \ + > mytweak.html +~~~ + +You can view the report by opening mytweak.html in a web browser. + + +### IDE hosted tests + +By default, the generated project files created by CMake will not include the +runtests and testdata rules when generating for IDEs like Microsoft Visual +Studio and Xcode. This is done to avoid intolerably long build cycles in the +IDEs; IDE behavior is to build all targets when selecting the build project +options in MSVS and Xcode. To enable the test rules in IDEs, the +`ENABLE_IDE_TEST_HOSTING` variable must be enabled at CMake generation time: + +~~~ + # This example uses Xcode. To get a list of the generators + # available, run cmake with the -G argument missing its + # value. + $ cmake path/to/aom -DENABLE_IDE_TEST_HOSTING=1 -G Xcode +~~~ + +### Downloading the test data + +The fastest and easiest way to obtain the test data is to use CMake to generate +a build using the Unix Makefiles generator, and then to build only the testdata +rule: + +~~~ + $ cmake path/to/aom -G "Unix Makefiles" + # 28 is used because there are 28 test files as of this writing. + $ make -j28 testdata +~~~ + +The above make command will only download and verify the test data. + +### Adding a new test data file + +First, add the new test data file to the `aom-test-data` bucket of the +`aomedia-testing` project on Google Cloud Platform. You may need to ask someone +with the necessary access permissions to do this for you. + +NOTE: When a new test data file is added to the `aom-test-data` bucket, its +"Public access" is initially "Not public". We need to change its +"Public access" to "Public" by using the following +[`gsutil`](https://cloud.google.com/storage/docs/gsutil_install) command: +~~~ + $ gsutil acl ch -g all:R gs://aom-test-data/test-data-file-name +~~~ +This command grants the `AllUsers` group READ access to the file named +"test-data-file-name" in the `aom-test-data` bucket. + +Once the new test data file has been added to `aom-test-data`, create a CL to +add the name of the new test data file to `test/test_data_util.cmake` and add +the SHA1 checksum of the new test data file to `test/test-data.sha1`. (The SHA1 +checksum of a file can be calculated by running the `sha1sum` command on the +file.) + +### Additional test data + +The test data mentioned above is strictly intended for unit testing. + +Additional input data for testing the encoder can be obtained from: +https://media.xiph.org/video/derf/ + +### Sharded testing + +The AV1 codec library unit tests are built upon gtest, which supports sharding of +test jobs. Sharded test runs can be achieved in a couple of ways. + +#### 1. Running test\_libaom directly: + +~~~ + # Set the environment variable GTEST_TOTAL_SHARDS to control the number of + # shards. + $ export GTEST_TOTAL_SHARDS=10 + # (GTEST shard indexing is 0 based). + $ seq 0 $(( $GTEST_TOTAL_SHARDS - 1 )) \ + | xargs -n 1 -P 0 -I{} env GTEST_SHARD_INDEX={} ./test_libaom +~~~ + +To create a test shard for each CPU core available on the current system, set +`GTEST_TOTAL_SHARDS` to the number of CPU cores on your system minus one. + +#### 2. Running the tests via the CMake build: + +~~~ + # For IDE based builds, ENABLE_IDE_TEST_HOSTING must be enabled. See + # the IDE hosted tests section above for more information.
If the IDE + # supports building targets concurrently, tests will be sharded by default. + + # For make and ninja builds the -j parameter controls the number of shards + # at test run time. This example will run the tests using 10 shards via + # make. + $ make -j10 runtests +~~~ + +The maximum number of test targets that can run concurrently is determined by +the number of CPUs on the system where the build is configured as detected by +CMake. A system with 24 cores can run 24 test shards using a value of 24 with +the `-j` parameter. When CMake is unable to detect the number of cores, 10 shards +is the default maximum value. + +## Coding style + +We are using the Google C Coding Style defined by the +[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). + +The coding style used by this project is enforced with clang-format using the +configuration contained in the +[.clang-format](https://chromium.googlesource.com/webm/aom/+/master/.clang-format) +file in the root of the repository. + +You can download clang-format using your system's package manager, or directly +from [llvm.org](http://llvm.org/releases/download.html). You can also view the +[documentation](https://clang.llvm.org/docs/ClangFormat.html) on llvm.org. +Output from clang-format varies by clang-format version; for best results, your +version should match the one used on Jenkins. You can find the clang-format +version by reading the comment in the `.clang-format` file linked above. + +Before pushing changes for review, you can format your code with: + +~~~ + # Apply clang-format to modified .c, .h and .cc files + $ clang-format -i --style=file \ + $(git diff --name-only --diff-filter=ACMR '*.[hc]' '*.cc') +~~~ + +Check the .clang-format file for the version used to generate it if there is any +difference between your local formatting and the review system. + +Some Git installations have clang-format integration. Here are some examples: + +~~~ + # Apply clang-format to all staged changes: + $ git clang-format + + # Clang format all staged and unstaged changes: + $ git clang-format -f + + # Clang format all staged and unstaged changes interactively: + $ git clang-format -f -p +~~~ + +## Submitting patches + +We manage the submission of patches using the +[Gerrit](https://www.gerritcodereview.com/) code review tool. This tool +implements a workflow on top of the Git version control system to ensure that +all changes get peer reviewed and tested prior to their distribution. + +### Login cookie + +Browse to [AOMedia Git index](https://aomedia.googlesource.com/) and log in with +your account (Gmail credentials, for example). Next, follow the +`Generate Password` link at the top of the page. You’ll be given +instructions for creating a cookie to use with our Git repos. + +### Contributor agreement + +You will be required to execute a +[contributor agreement](http://aomedia.org/license) to ensure that the AOMedia +Project has the right to distribute your changes. + +### Testing your code + +The testing basics are covered in the [testing section](#testing-the-av1-codec) +above. + +In addition to the local tests, many more (e.g. asan, tsan, valgrind) will run +through Jenkins instances upon upload to gerrit. + +### Commit message hook + +Gerrit requires that each submission include a unique Change-Id. You can assign +one manually using git commit --amend, but it’s easier to automate it with the +commit-msg hook provided by Gerrit. + +Copy commit-msg to the `.git/hooks` directory of your local repo.
Here's an +example: + +~~~ + $ curl -Lo aom/.git/hooks/commit-msg https://chromium-review.googlesource.com/tools/hooks/commit-msg + + # Next, ensure that the downloaded commit-msg script is executable: + $ chmod u+x aom/.git/hooks/commit-msg +~~~ + +See the Gerrit +[documentation](https://gerrit-review.googlesource.com/Documentation/user-changeid.html) +for more information. + +### Upload your change + +The command line to upload your patch looks like this: + +~~~ + $ git push https://aomedia-review.googlesource.com/aom HEAD:refs/for/master +~~~ + +### Incorporating reviewer comments + +If you previously uploaded a change to Gerrit and the Approver has asked for +changes, follow these steps: + +1. Edit the files to make the changes the reviewer has requested. +2. Recommit your edits using the --amend flag, for example: + +~~~ + $ git commit -a --amend +~~~ + +3. Use the same git push command as above to upload to Gerrit again for another + review cycle. + +In general, you should not rebase your changes when doing updates in response to +review. Doing so can make it harder to follow the evolution of your change in +the diff view. + +### Submitting your change + +Once your change has been Approved and Verified, you can “submit” it through the +Gerrit UI. This will usually automatically rebase your change onto the branch +specified. + +Sometimes this can’t be done automatically. If you run into this problem, you +must rebase your changes manually: + +~~~ + $ git fetch + $ git rebase origin/branchname +~~~ + +If there are any conflicts, resolve them as you normally would with Git. When +you’re done, reupload your change. + +### Viewing the status of uploaded changes + +To check the status of a change that you uploaded, open +[Gerrit](https://aomedia-review.googlesource.com/), sign in, and click My > +Changes. + +## Support + +This library is an open source project supported by its community. Please +email aomediacodec@jointdevelopment.kavi.com for help. + +## Bug reports + +Bug reports can be filed in the Alliance for Open Media +[issue tracker](https://bugs.chromium.org/p/aomedia/issues/list). diff --git a/libs/libaom/src/Sample.cfg b/libs/libaom/src/Sample.cfg new file mode 100644 index 000000000..d5dbe6641 --- /dev/null +++ b/libs/libaom/src/Sample.cfg @@ -0,0 +1,35 @@ +#sample config file +super_block_size = 128 # super block size.
0, 64 or 128 +max_partition_size = 128 # max partition size(8, 16, 32, 64, 128) +min_partition_size = 4 # min partition size(4, 8, 16, 32, 64) +disable_rect_partition_type = 0 # disable rectangle partition type +disable_ab_partition_type = 0 # disable AB partition type +disable_1to4_partition_type = 0 # disable 1 to 4 and 4 to 1 partition type +disable_intra_angle_delta = 0 # disable intra angle delta +disable_paeth_intra = 0 # disable paeth intra +disable_smooth_intra = 0 # disable intra smooth mode +disable_intra_edge_filter = 0 # disable intra edge filter +disable_filter_intra = 0 # disable filter intra +disable_intrabc = 0 # disable Intra Block Copy +disable_cfl = 0 # disable chroma from luma prediction +disable_palette = 0 # disable Palette +disable_flip_idtx = 0 # disable flip and identity transform +disable_tx_64x64 = 0 # disable 64x64 transform +reduced_tx_type_set = 0 # use reduced transform type set +reduced_reference_set = 0 # use reduced reference frame set +disable_obmc = 0 # disable OBMC +disable_warp_motion = 0 # disable Warped Motion +disable_global_motion = 0 # disable global motion +disable_ref_frame_mv = 0 # disable ref mv +disable_dual_filter = 0 # disable dual interpolation filter +disable_one_sided_comp = 0 # disable one sided compound mode +disable_masked_comp = 0 # disable masked compound prediction +disable_diff_wtd_comp = 0 # disable difference weighted compound mode +disable_inter_inter_wedge = 0 # disable inter/inter wedge comp +disable_dist_wtd_comp = 0 # disable distant weighted compound mode +disable_inter_intra_comp = 0 # disable inter/intra compound mode. +disable_inter_intra_wedge = 0 # disable inter/intra wedge comp +disable_smooth_inter_intra = 0 # disable smooth inter/intra +disable_cdef = 0 # disable CDEF filter +disable_lr = 0 # disable Loop Restoration Filter +disable_trellis_quant = 0 # disable trellis quantization \ No newline at end of file diff --git a/libs/libaom/src/aom/aom.h b/libs/libaom/src/aom/aom.h new file mode 100644 index 000000000..c591dc9a4 --- /dev/null +++ b/libs/libaom/src/aom/aom.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\defgroup aom AOM + * \ingroup codecs + * AOM is aom's newest video compression algorithm that uses motion + * compensated prediction, Discrete Cosine Transform (DCT) coding of the + * prediction error signal and context dependent entropy coding techniques + * based on arithmetic principles. It features: + * - YUV 4:2:0 image format + * - Macro-block based coding (16x16 luma plus two 8x8 chroma) + * - 1/4 (1/8) pixel accuracy motion compensated prediction + * - 4x4 DCT transform + * - 128 level linear quantizer + * - In loop deblocking filter + * - Context-based entropy coding + * + * @{ + */ +/*!\file + * \brief Provides controls common to both the AOM encoder and decoder. 
+ */ +#ifndef AOM_AOM_AOM_H_ +#define AOM_AOM_AOM_H_ + +#include "aom/aom_codec.h" +#include "aom/aom_image.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Control functions + * + * The set of macros defines the control functions of the AOM interface + */ +enum aom_com_control_id { + /* TODO(https://crbug.com/aomedia/2671): The encoder overlaps the range of + * these values for its control ids, see the NOTEs in aom/aomcx.h. These + * should be migrated to something like the AOM_DECODER_CTRL_ID_START range + * next time we're ready to break the ABI. + */ + AV1_GET_REFERENCE = 128, /**< get a pointer to a reference frame, + av1_ref_frame_t* parameter */ + AV1_SET_REFERENCE = 129, /**< write a frame into a reference buffer, + av1_ref_frame_t* parameter */ + AV1_COPY_REFERENCE = 130, /**< get a copy of reference frame from the decoder, + av1_ref_frame_t* parameter */ + AOM_COMMON_CTRL_ID_MAX, + + AV1_GET_NEW_FRAME_IMAGE = + 192, /**< get a pointer to the new frame, aom_image_t* parameter */ + AV1_COPY_NEW_FRAME_IMAGE = 193, /**< copy the new frame to an external buffer, + aom_image_t* parameter */ + + AOM_DECODER_CTRL_ID_START = 256 +}; + +/*!\brief AV1 specific reference frame data struct + * + * Define the data struct to access av1 reference frames. + */ +typedef struct av1_ref_frame { + int idx; /**< frame index to get (input) */ + int use_external_ref; /**< Directly use external ref buffer(decoder only) */ + aom_image_t img; /**< img structure to populate (output) */ +} av1_ref_frame_t; + +/*!\cond */ +/*!\brief aom decoder control function parameter type + * + * Defines the data type that each AOM decoder control function requires. + * + * \note For each control ID "X", a macro-define of + * AOM_CTRL_X is provided. It is used at compile time to determine + * if the control ID is supported by the libaom library available, + * when the libaom version cannot be controlled. + */ +AOM_CTRL_USE_TYPE(AV1_GET_REFERENCE, av1_ref_frame_t *) +#define AOM_CTRL_AV1_GET_REFERENCE + +AOM_CTRL_USE_TYPE(AV1_SET_REFERENCE, av1_ref_frame_t *) +#define AOM_CTRL_AV1_SET_REFERENCE + +AOM_CTRL_USE_TYPE(AV1_COPY_REFERENCE, av1_ref_frame_t *) +#define AOM_CTRL_AV1_COPY_REFERENCE + +AOM_CTRL_USE_TYPE(AV1_GET_NEW_FRAME_IMAGE, aom_image_t *) +#define AOM_CTRL_AV1_GET_NEW_FRAME_IMAGE + +AOM_CTRL_USE_TYPE(AV1_COPY_NEW_FRAME_IMAGE, aom_image_t *) +#define AOM_CTRL_AV1_COPY_NEW_FRAME_IMAGE + +/*!\endcond */ +/*! @} - end defgroup aom */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOM_H_ diff --git a/libs/libaom/src/aom/aom_codec.h b/libs/libaom/src/aom/aom_codec.h new file mode 100644 index 000000000..75f6a1af2 --- /dev/null +++ b/libs/libaom/src/aom/aom_codec.h @@ -0,0 +1,478 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\defgroup codec Common Algorithm Interface + * This abstraction allows applications to easily support multiple video + * formats with minimal code duplication. This section describes the interface + * common to all codecs (both encoders and decoders).
+ * @{ + */ + +/*!\file + * \brief Describes the codec algorithm interface to applications. + * + * This file describes the interface between an application and a + * video codec algorithm. + * + * An application instantiates a specific codec instance by using + * aom_codec_init() and a pointer to the algorithm's interface structure: + *     <pre>
+ *     my_app.c:
+ *       extern aom_codec_iface_t my_codec;
+ *       {
+ *           aom_codec_ctx_t algo;
+ *           res = aom_codec_init(&algo, &my_codec);
+ *       }
+ *     </pre>
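+ *
+ *     If initialization fails, res will hold a value other than
+ *     #AOM_CODEC_OK. A minimal check, as a sketch only (the printf call and
+ *     message are illustrative, not part of the API; the error-to-string
+ *     helper is declared later in this file):
+ *     <pre>
+ *       if (res != AOM_CODEC_OK)
+ *           printf("init failed: %s\n", aom_codec_err_to_string(res));
+ *     </pre>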
+ * + * Once initialized, the instance is managed using other functions from + * the aom_codec_* family. + */ +#ifndef AOM_AOM_AOM_CODEC_H_ +#define AOM_AOM_AOM_CODEC_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom/aom_image.h" +#include "aom/aom_integer.h" + +/*!\brief Decorator indicating a function is deprecated */ +#ifndef AOM_DEPRECATED +#if defined(__GNUC__) && __GNUC__ +#define AOM_DEPRECATED __attribute__((deprecated)) +#elif defined(_MSC_VER) +#define AOM_DEPRECATED +#else +#define AOM_DEPRECATED +#endif +#endif /* AOM_DEPRECATED */ + +#ifndef AOM_DECLSPEC_DEPRECATED +#if defined(__GNUC__) && __GNUC__ +#define AOM_DECLSPEC_DEPRECATED /**< \copydoc #AOM_DEPRECATED */ +#elif defined(_MSC_VER) +/*!\brief \copydoc #AOM_DEPRECATED */ +#define AOM_DECLSPEC_DEPRECATED __declspec(deprecated) +#else +#define AOM_DECLSPEC_DEPRECATED /**< \copydoc #AOM_DEPRECATED */ +#endif +#endif /* AOM_DECLSPEC_DEPRECATED */ + +/*!\brief Decorator indicating a function is potentially unused */ +#ifdef AOM_UNUSED +#elif defined(__GNUC__) || defined(__clang__) +#define AOM_UNUSED __attribute__((unused)) +#else +#define AOM_UNUSED +#endif + +/*!\brief Decorator indicating that given struct/union/enum is packed */ +#ifndef ATTRIBUTE_PACKED +#if defined(__GNUC__) && __GNUC__ +#define ATTRIBUTE_PACKED __attribute__((packed)) +#elif defined(_MSC_VER) +#define ATTRIBUTE_PACKED +#else +#define ATTRIBUTE_PACKED +#endif +#endif /* ATTRIBUTE_PACKED */ + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define AOM_CODEC_ABI_VERSION (5 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/ + +/*!\brief Algorithm return codes */ +typedef enum { + /*!\brief Operation completed without error */ + AOM_CODEC_OK, + + /*!\brief Unspecified error */ + AOM_CODEC_ERROR, + + /*!\brief Memory operation failed */ + AOM_CODEC_MEM_ERROR, + + /*!\brief ABI version mismatch */ + AOM_CODEC_ABI_MISMATCH, + + /*!\brief Algorithm does not have required capability */ + AOM_CODEC_INCAPABLE, + + /*!\brief The given bitstream is not supported. + * + * The bitstream was unable to be parsed at the highest level. The decoder + * is unable to proceed. This error \ref SHOULD be treated as fatal to the + * stream. */ + AOM_CODEC_UNSUP_BITSTREAM, + + /*!\brief Encoded bitstream uses an unsupported feature + * + * The decoder does not implement a feature required by the encoder. This + * return code should only be used for features that prevent future + * pictures from being properly decoded. This error \ref MAY be treated as + * fatal to the stream or \ref MAY be treated as fatal to the current GOP. + */ + AOM_CODEC_UNSUP_FEATURE, + + /*!\brief The coded data for this stream is corrupt or incomplete + * + * There was a problem decoding the current frame. This return code + * should only be used for failures that prevent future pictures from + * being properly decoded. This error \ref MAY be treated as fatal to the + * stream or \ref MAY be treated as fatal to the current GOP. If decoding + * is continued for the current GOP, artifacts may be present. + */ + AOM_CODEC_CORRUPT_FRAME, + + /*!\brief An application-supplied parameter is not valid. + * + */ + AOM_CODEC_INVALID_PARAM, + + /*!\brief An iterator reached the end of list. + * + */ + AOM_CODEC_LIST_END + +} aom_codec_err_t; + +/*! 
\brief Codec capabilities bitfield + * + * Each codec advertises the capabilities it supports as part of its + * ::aom_codec_iface_t interface structure. Capabilities are extra interfaces + * or functionality, and are not required to be supported. + * + * The available flags are specified by AOM_CODEC_CAP_* defines. + */ +typedef long aom_codec_caps_t; +#define AOM_CODEC_CAP_DECODER 0x1 /**< Is a decoder */ +#define AOM_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */ + +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow for + * proper memory allocation. + * + * The available flags are specified by AOM_CODEC_USE_* defines. + */ +typedef long aom_codec_flags_t; + +/*!\brief Time Stamp Type + * + * An integer, which when multiplied by the stream's time base, provides + * the absolute time of a sample. + */ +typedef int64_t aom_codec_pts_t; + +/*!\brief Codec interface structure. + * + * Contains function pointers and other data private to the codec + * implementation. This structure is opaque to the application. Common + * functions used with this structure: + * - aom_codec_iface_name: get the name of the codec + * - aom_codec_get_caps: returns the capabilities of the codec (see + * aom_encoder.h for more details) + * - aom_codec_enc_config_default: generate the default config to use + * when initializing the encoder + * - aom_codec_dec_init, aom_codec_enc_init: initialize the codec context + * structure (see documentation on aom_codec_ctx for more information). + */ +typedef const struct aom_codec_iface aom_codec_iface_t; + +/*!\brief Codec private data structure. + * + * Contains data private to the codec implementation. This structure is opaque + * to the application. + */ +typedef struct aom_codec_priv aom_codec_priv_t; + +/*!\brief Iterator + * + * Opaque storage used for iterating over lists. + */ +typedef const void *aom_codec_iter_t; + +/*!\brief Codec context structure + * + * All codecs \ref MUST support this context structure fully. In general, + * this data should be considered private to the codec algorithm, and + * not be manipulated or examined by the calling application. Applications + * may reference the 'name' member to get a printable description of the + * algorithm. + */ +typedef struct aom_codec_ctx { + const char *name; /**< Printable interface name */ + aom_codec_iface_t *iface; /**< Interface pointers */ + aom_codec_err_t err; /**< Last returned error */ + const char *err_detail; /**< Detailed info, if available */ + aom_codec_flags_t init_flags; /**< Flags passed at init time */ + union { + /**< Decoder Configuration Pointer */ + const struct aom_codec_dec_cfg *dec; + /**< Encoder Configuration Pointer */ + const struct aom_codec_enc_cfg *enc; + const void *raw; + } config; /**< Configuration pointer aliasing union */ + aom_codec_priv_t *priv; /**< Algorithm private storage */ +} aom_codec_ctx_t; + +/*!\brief Bit depth for codec + * * + * This enumeration determines the bit depth of the codec. + */ +typedef enum aom_bit_depth { + AOM_BITS_8 = 8, /**< 8 bits */ + AOM_BITS_10 = 10, /**< 10 bits */ + AOM_BITS_12 = 12, /**< 12 bits */ +} aom_bit_depth_t; + +/*!\brief Superblock size selection. + * + * Defines the superblock size used for encoding. The superblock size can + * either be fixed at 64x64 or 128x128 pixels, or it can be dynamically + * selected by the encoder for each frame. + */ +typedef enum aom_superblock_size { + AOM_SUPERBLOCK_SIZE_64X64, /**< Always use 64x64 superblocks. 
*/ + AOM_SUPERBLOCK_SIZE_128X128, /**< Always use 128x128 superblocks. */ + AOM_SUPERBLOCK_SIZE_DYNAMIC /**< Select superblock size dynamically. */ +} aom_superblock_size_t; + +/* + * Library Version Number Interface + * + * For example, see the following sample return values: + * aom_codec_version() (1<<16 | 2<<8 | 3) + * aom_codec_version_str() "v1.2.3-rc1-16-gec6a1ba" + * aom_codec_version_extra_str() "rc1-16-gec6a1ba" + */ + +/*!\brief Return the version information (as an integer) + * + * Returns a packed encoding of the library version number. This will only + * include + * the major.minor.patch component of the version number. Note that this encoded + * value should be accessed through the macros provided, as the encoding may + * change + * in the future. + * + */ +int aom_codec_version(void); + +/*!\brief Return the version major number */ +#define aom_codec_version_major() ((aom_codec_version() >> 16) & 0xff) + +/*!\brief Return the version minor number */ +#define aom_codec_version_minor() ((aom_codec_version() >> 8) & 0xff) + +/*!\brief Return the version patch number */ +#define aom_codec_version_patch() ((aom_codec_version() >> 0) & 0xff) + +/*!\brief Return the version information (as a string) + * + * Returns a printable string containing the full library version number. This + * may + * contain additional text following the three digit version number, as to + * indicate + * release candidates, prerelease versions, etc. + * + */ +const char *aom_codec_version_str(void); + +/*!\brief Return the version information (as a string) + * + * Returns a printable "extra string". This is the component of the string + * returned + * by aom_codec_version_str() following the three digit version number. + * + */ +const char *aom_codec_version_extra_str(void); + +/*!\brief Return the build configuration + * + * Returns a printable string containing an encoded version of the build + * configuration. This may be useful to aom support. + * + */ +const char *aom_codec_build_config(void); + +/*!\brief Return the name for a given interface + * + * Returns a human readable string for name of the given codec interface. + * + * \param[in] iface Interface pointer + * + */ +const char *aom_codec_iface_name(aom_codec_iface_t *iface); + +/*!\brief Convert error number to printable string + * + * Returns a human readable string for the last error returned by the + * algorithm. The returned error will be one line and will not contain + * any newline characters. + * + * + * \param[in] err Error number. + * + */ +const char *aom_codec_err_to_string(aom_codec_err_t err); + +/*!\brief Retrieve error synopsis for codec context + * + * Returns a human readable string for the last error returned by the + * algorithm. The returned error will be one line and will not contain + * any newline characters. + * + * + * \param[in] ctx Pointer to this instance's context. + * + */ +const char *aom_codec_error(aom_codec_ctx_t *ctx); + +/*!\brief Retrieve detailed error information for codec context + * + * Returns a human readable string providing detailed information about + * the last error. + * + * \param[in] ctx Pointer to this instance's context. + * + * \retval NULL + * No detailed information is available. + */ +const char *aom_codec_error_detail(aom_codec_ctx_t *ctx); + +/* REQUIRED FUNCTIONS + * + * The following functions are required to be implemented for all codecs. + * They represent the base case functionality expected of all codecs. 
+ */ + +/*!\brief Destroy a codec instance + * + * Destroys a codec context, freeing any associated memory buffers. + * + * \param[in] ctx Pointer to this instance's context + * + * \retval #AOM_CODEC_OK + * The codec algorithm initialized. + * \retval #AOM_CODEC_MEM_ERROR + * Memory allocation failed. + */ +aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx); + +/*!\brief Get the capabilities of an algorithm. + * + * Retrieves the capabilities bitfield from the algorithm's interface. + * + * \param[in] iface Pointer to the algorithm interface + * + */ +aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface); + +/*!\name Codec Control + * + * The aom_codec_control function exchanges algorithm specific data with the + * codec instance. Additionally, the macro AOM_CODEC_CONTROL_TYPECHECKED is + * provided, which will type-check the parameter against the control ID before + * calling aom_codec_control - note that this macro requires the control ID + * to be directly encoded in it, e.g., + * AOM_CODEC_CONTROL_TYPECHECKED(&ctx, AOME_SET_CPUUSED, 8). + * + * The codec control IDs can be found in aom.h, aomcx.h, and aomdx.h + * (defined as aom_com_control_id, aome_enc_control_id, and aom_dec_control_id). + * @{ + */ +/*!\brief Algorithm Control + * + * aom_codec_control takes a context, a control ID, and a third parameter + * (with varying type). If the context is non-null and an error occurs, + * ctx->err will be set to the same value as the return value. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] ctrl_id Algorithm specific control identifier + * + * \retval #AOM_CODEC_OK + * The control request was processed. + * \retval #AOM_CODEC_ERROR + * The control request was not processed. + * \retval #AOM_CODEC_INVALID_PARAM + * The data was not valid. + */ +aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...); + +/*!\brief aom_codec_control wrapper macro (adds type-checking, less flexible) + * + * This macro allows for type safe conversions across the variadic parameter + * to aom_codec_control(). However, it requires the explicit control ID + * be passed in (it cannot be passed in via a variable) -- otherwise a compiler + * error will occur. After the type checking, it calls aom_codec_control. + */ +#define AOM_CODEC_CONTROL_TYPECHECKED(ctx, id, data) \ + aom_codec_control_typechecked_##id(ctx, id, data) /**<\hideinitializer*/ + +/*!\brief Creates typechecking mechanisms for aom_codec_control + * + * It defines a static function with the correctly typed arguments as a wrapper + * to the type-unsafe aom_codec_control function. It also creates a typedef + * for each type. + */ +#define AOM_CTRL_USE_TYPE(id, typ) \ + static aom_codec_err_t aom_codec_control_typechecked_##id( \ + aom_codec_ctx_t *, int, typ) AOM_UNUSED; \ + static aom_codec_err_t aom_codec_control_typechecked_##id( \ + aom_codec_ctx_t *ctx, int ctrl, typ data) { \ + return aom_codec_control(ctx, ctrl, data); \ + } /**<\hideinitializer*/ \ + typedef typ aom_codec_control_type_##id; +/*!@} end Codec Control group */ + +/*!\brief OBU types. */ +typedef enum ATTRIBUTE_PACKED { + OBU_SEQUENCE_HEADER = 1, + OBU_TEMPORAL_DELIMITER = 2, + OBU_FRAME_HEADER = 3, + OBU_TILE_GROUP = 4, + OBU_METADATA = 5, + OBU_FRAME = 6, + OBU_REDUNDANT_FRAME_HEADER = 7, + OBU_TILE_LIST = 8, + OBU_PADDING = 15, +} OBU_TYPE; + +/*!\brief OBU metadata types. 
*/ +typedef enum { + OBU_METADATA_TYPE_AOM_RESERVED_0 = 0, + OBU_METADATA_TYPE_HDR_CLL = 1, + OBU_METADATA_TYPE_HDR_MDCV = 2, + OBU_METADATA_TYPE_SCALABILITY = 3, + OBU_METADATA_TYPE_ITUT_T35 = 4, + OBU_METADATA_TYPE_TIMECODE = 5, +} OBU_METADATA_TYPE; + +/*!\brief Returns string representation of OBU_TYPE. + * + * \param[in] type The OBU_TYPE to convert to string. + */ +const char *aom_obu_type_to_string(OBU_TYPE type); + +/*!@} - end defgroup codec*/ +#ifdef __cplusplus +} +#endif +#endif // AOM_AOM_AOM_CODEC_H_ diff --git a/libs/libaom/src/aom/aom_decoder.h b/libs/libaom/src/aom/aom_decoder.h new file mode 100644 index 000000000..5ce7c7b10 --- /dev/null +++ b/libs/libaom/src/aom/aom_decoder.h @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AOM_AOM_DECODER_H_ +#define AOM_AOM_AOM_DECODER_H_ + +/*!\defgroup decoder Decoder Algorithm Interface + * \ingroup codec + * This abstraction allows applications using this decoder to easily support + * multiple video formats with minimal code duplication. This section describes + * the interface common to all decoders. + * @{ + */ + +/*!\file + * \brief Describes the decoder algorithm interface to applications. + * + * This file describes the interface between an application and a + * video decoder algorithm. + * + */ +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom/aom_codec.h" +#include "aom/aom_frame_buffer.h" + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define AOM_DECODER_ABI_VERSION \ + (6 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/ + +/*! \brief Decoder capabilities bitfield + * + * Each decoder advertises the capabilities it supports as part of its + * ::aom_codec_iface_t interface structure. Capabilities are extra interfaces + * or functionality, and are not required to be supported by a decoder. + * + * The available flags are specified by AOM_CODEC_CAP_* defines. + */ +/*!brief Can support external frame buffers */ +#define AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x200000 + +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow for + * proper memory allocation. + * + * The available flags are specified by AOM_CODEC_USE_* defines. + */ + +/*!\brief Stream properties + * + * This structure is used to query or set properties of the decoded + * stream. 
+ */ +typedef struct aom_codec_stream_info { + unsigned int w; /**< Width (or 0 for unknown/default) */ + unsigned int h; /**< Height (or 0 for unknown/default) */ + unsigned int is_kf; /**< Current frame is a keyframe */ + unsigned int number_spatial_layers; /**< Number of spatial layers */ + unsigned int number_temporal_layers; /**< Number of temporal layers */ + unsigned int is_annexb; /**< Is Bitstream in Annex-B format */ +} aom_codec_stream_info_t; + +/* REQUIRED FUNCTIONS + * + * The following functions are required to be implemented for all decoders. + * They represent the base case functionality expected of all decoders. + */ + +/*!\brief Initialization Configurations + * + * This structure is used to pass init time configuration options to the + * decoder. + */ +typedef struct aom_codec_dec_cfg { + unsigned int threads; /**< Maximum number of threads to use, default 1 */ + unsigned int w; /**< Width */ + unsigned int h; /**< Height */ + unsigned int allow_lowbitdepth; /**< Allow use of low-bitdepth coding path */ +} aom_codec_dec_cfg_t; /**< alias for struct aom_codec_dec_cfg */ + +/*!\brief Initialize a decoder instance + * + * Initializes a decoder context using the given interface. Applications + * should call the aom_codec_dec_init convenience macro instead of this + * function directly, to ensure that the ABI version number parameter + * is properly initialized. + * + * If the library was configured with cmake -DCONFIG_MULTITHREAD=0, this + * call is not thread safe and should be guarded with a lock if being used + * in a multithreaded context. + * + * \param[in] ctx Pointer to this instance's context. + * \param[in] iface Pointer to the algorithm interface to use. + * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] flags Bitfield of AOM_CODEC_USE_* flags + * \param[in] ver ABI version number. Must be set to + * AOM_DECODER_ABI_VERSION + * \retval #AOM_CODEC_OK + * The decoder algorithm initialized. + * \retval #AOM_CODEC_MEM_ERROR + * Memory allocation failed. + */ +aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx, + aom_codec_iface_t *iface, + const aom_codec_dec_cfg_t *cfg, + aom_codec_flags_t flags, int ver); + +/*!\brief Convenience macro for aom_codec_dec_init_ver() + * + * Ensures the ABI version parameter is properly set. + */ +#define aom_codec_dec_init(ctx, iface, cfg, flags) \ + aom_codec_dec_init_ver(ctx, iface, cfg, flags, AOM_DECODER_ABI_VERSION) + +/*!\brief Parse stream info from a buffer + * + * Performs high level parsing of the bitstream. Construction of a decoder + * context is not necessary. Can be used to determine if the bitstream is + * of the proper format, and to extract information from the stream. + * + * \param[in] iface Pointer to the algorithm interface + * \param[in] data Pointer to a block of data to parse + * \param[in] data_sz Size of the data buffer + * \param[in,out] si Pointer to stream info to update. The is_annexb + * member \ref MUST be properly initialized. This + * function sets the rest of the members. + * + * \retval #AOM_CODEC_OK + * Bitstream is parsable and stream information updated. + * \retval #AOM_CODEC_INVALID_PARAM + * One of the arguments is invalid, for example a NULL pointer. + * \retval #AOM_CODEC_UNSUP_BITSTREAM + * The decoder didn't recognize the coded data, or the + * buffer was too short. 
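+ *
+ * As a usage sketch only (the data buffer and error handling are the
+ * application's responsibility); note that is_annexb must be initialized
+ * before the call, as described above:
+ * <pre>
+ *     aom_codec_stream_info_t si;
+ *     si.is_annexb = 0;
+ *     if (aom_codec_peek_stream_info(iface, data, data_sz, &si) ==
+ *         AOM_CODEC_OK) {
+ *       // si.w, si.h and si.is_kf now describe the coded stream.
+ *     }
+ * </pre>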
+ */ +aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface, + const uint8_t *data, size_t data_sz, + aom_codec_stream_info_t *si); + +/*!\brief Return information about the current stream. + * + * Returns information about the stream that has been parsed during decoding. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] si Pointer to stream info to update. + * + * \retval #AOM_CODEC_OK + * Bitstream is parsable and stream information updated. + * \retval #AOM_CODEC_INVALID_PARAM + * One of the arguments is invalid, for example a NULL pointer. + * \retval #AOM_CODEC_UNSUP_BITSTREAM + * The decoder couldn't parse the submitted data. + */ +aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx, + aom_codec_stream_info_t *si); + +/*!\brief Decode data + * + * Processes a buffer of coded data. Encoded data \ref MUST be passed in DTS + * (decode time stamp) order. Frames produced will always be in PTS + * (presentation time stamp) order. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] data Pointer to this block of new coded data. + * \param[in] data_sz Size of the coded data, in bytes. + * \param[in] user_priv Application specific data to associate with + * this frame. + * + * \return Returns #AOM_CODEC_OK if the coded data was processed completely + * and future pictures can be decoded without error. Otherwise, + * see the descriptions of the other error codes in ::aom_codec_err_t + * for recoverability capabilities. + */ +aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data, + size_t data_sz, void *user_priv); + +/*!\brief Decoded frames iterator + * + * Iterates over a list of the frames available for display. The iterator + * storage should be initialized to NULL to start the iteration. Iteration is + * complete when this function returns NULL. + * + * The list of available frames becomes valid upon completion of the + * aom_codec_decode call, and remains valid until the next call to + * aom_codec_decode. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an image, if one is ready for display. Frames + * produced will always be in PTS (presentation time stamp) order. + */ +aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter); + +/*!\defgroup cap_external_frame_buffer External Frame Buffer Functions + * + * The following function is required to be implemented for all decoders + * that advertise the AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability. + * Calling this function for codecs that don't advertise this capability + * will result in an error code being returned, usually AOM_CODEC_INCAPABLE. + * @{ + */ + +/*!\brief Pass in external frame buffers for the decoder to use. + * + * Registers functions to be called when libaom needs a frame buffer + * to decode the current frame and a function to be called when libaom does + * not internally reference the frame buffer. This set function must + * be called before the first call to decode or libaom will assume the + * default behavior of allocating frame buffers internally. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb_get Pointer to the get callback function + * \param[in] cb_release Pointer to the release callback function + * \param[in] cb_priv Callback's private data + * + * \retval #AOM_CODEC_OK + * External frame buffers will be used by libaom. 
+ * \retval #AOM_CODEC_INVALID_PARAM + * One or more of the callbacks were NULL. + * \retval #AOM_CODEC_ERROR + * Decoder context not initialized. + * \retval #AOM_CODEC_INCAPABLE + * Algorithm not capable of using external frame buffers. + * + * \note + * When decoding AV1, the application may be required to pass in at least + * #AOM_MAXIMUM_WORK_BUFFERS external frame buffers. + */ +aom_codec_err_t aom_codec_set_frame_buffer_functions( + aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get, + aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); + +/*!@} - end defgroup cap_external_frame_buffer */ + +/*!@} - end defgroup decoder*/ +#ifdef __cplusplus +} +#endif +#endif // AOM_AOM_AOM_DECODER_H_ diff --git a/libs/libaom/src/aom/aom_encoder.h b/libs/libaom/src/aom/aom_encoder.h new file mode 100644 index 000000000..a494c17a4 --- /dev/null +++ b/libs/libaom/src/aom/aom_encoder.h @@ -0,0 +1,1136 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AOM_AOM_ENCODER_H_ +#define AOM_AOM_AOM_ENCODER_H_ + +/*!\defgroup encoder Encoder Algorithm Interface + * \ingroup codec + * This abstraction allows applications using this encoder to easily support + * multiple video formats with minimal code duplication. This section describes + * the interface common to all encoders. + * @{ + */ + +/*!\file + * \brief Describes the encoder algorithm interface to applications. + * + * This file describes the interface between an application and a + * video encoder algorithm. + * + */ +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom/aom_codec.h" + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define AOM_ENCODER_ABI_VERSION \ + (8 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/ + +/*! \brief Encoder capabilities bitfield + * + * Each encoder advertises the capabilities it supports as part of its + * ::aom_codec_iface_t interface structure. Capabilities are extra + * interfaces or functionality, and are not required to be supported + * by an encoder. + * + * The available flags are specified by AOM_CODEC_CAP_* defines. + */ +#define AOM_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */ + +/*! Can support input images at greater than 8 bitdepth. + */ +#define AOM_CODEC_CAP_HIGHBITDEPTH 0x40000 + +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow + * for proper memory allocation. + * + * The available flags are specified by AOM_CODEC_USE_* defines. + */ +#define AOM_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */ +/*!\brief Make the encoder output one partition at a time. 
*/ +#define AOM_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */ + +/*!\brief Generic fixed size buffer structure + * + * This structure is able to hold a reference to any fixed size buffer. + */ +typedef struct aom_fixed_buf { + void *buf; /**< Pointer to the data. Does NOT own the data! */ + size_t sz; /**< Length of the buffer, in chars */ +} aom_fixed_buf_t; /**< alias for struct aom_fixed_buf */ + +/*!\brief Compressed Frame Flags + * + * This type represents a bitfield containing information about a compressed + * frame that may be useful to an application. The most significant 16 bits + * can be used by an algorithm to provide additional detail, for example to + * support frame types that are codec specific (MPEG-1 D-frames for example) + */ +typedef uint32_t aom_codec_frame_flags_t; +#define AOM_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */ +/*!\brief frame can be dropped without affecting the stream (no future frame + * depends on this one) */ +#define AOM_FRAME_IS_DROPPABLE 0x2 +/*!\brief this is an INTRA_ONLY frame */ +#define AOM_FRAME_IS_INTRAONLY 0x10 +/*!\brief this is an S-frame */ +#define AOM_FRAME_IS_SWITCH 0x20 +/*!\brief this is an error-resilient frame */ +#define AOM_FRAME_IS_ERROR_RESILIENT 0x40 +/*!\brief this is a key-frame dependent recovery-point frame */ +#define AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT 0x80 + +/*!\brief Error Resilient flags + * + * These flags define which error resilient features to enable in the + * encoder. The flags are specified through the + * aom_codec_enc_cfg::g_error_resilient variable. + */ +typedef uint32_t aom_codec_er_flags_t; +/*!\brief Improve resiliency against losses of whole frames */ +#define AOM_ERROR_RESILIENT_DEFAULT 0x1 + +/*!\brief Encoder output packet variants + * + * This enumeration lists the different kinds of data packets that can be + * returned by calls to aom_codec_get_cx_data(). Algorithms \ref MAY + * extend this list to provide additional functionality. + */ +enum aom_codec_cx_pkt_kind { + AOM_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ + AOM_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ + AOM_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ + AOM_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ + AOM_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */ +}; + +/*!\brief Encoder output packet + * + * This structure contains the different kinds of output data the encoder + * may produce while compressing a frame. + */ +typedef struct aom_codec_cx_pkt { + enum aom_codec_cx_pkt_kind kind; /**< packet variant */ + union { + struct { + void *buf; /**< compressed data buffer */ + size_t sz; /**< length of compressed data */ + /*!\brief time stamp to show frame (in timebase units) */ + aom_codec_pts_t pts; + /*!\brief duration to show frame (in timebase units) */ + unsigned long duration; + aom_codec_frame_flags_t flags; /**< flags for this frame */ + /*!\brief the partition id defines the decoding order of the partitions. + * Only applicable when "output partition" mode is enabled. 
First + * partition has id 0.*/ + int partition_id; + /*!\brief size of the visible frame in this packet */ + size_t vis_frame_size; + } frame; /**< data for compressed frame packet */ + aom_fixed_buf_t twopass_stats; /**< data for two-pass packet */ + aom_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ + struct aom_psnr_pkt { + unsigned int samples[4]; /**< Number of samples, total/y/u/v */ + uint64_t sse[4]; /**< sum squared error, total/y/u/v */ + double psnr[4]; /**< PSNR, total/y/u/v */ + } psnr; /**< data for PSNR packet */ + aom_fixed_buf_t raw; /**< data for arbitrary packets */ + + /* This packet size is fixed to allow codecs to extend this + * interface without having to manage storage for raw packets, + * i.e., if it's smaller than 128 bytes, you can store in the + * packet list directly. + */ + char pad[128 - sizeof(enum aom_codec_cx_pkt_kind)]; /**< fixed sz */ + } data; /**< packet data */ +} aom_codec_cx_pkt_t; /**< alias for struct aom_codec_cx_pkt */ + +/*!\brief Rational Number + * + * This structure holds a fractional value. + */ +typedef struct aom_rational { + int num; /**< fraction numerator */ + int den; /**< fraction denominator */ +} aom_rational_t; /**< alias for struct aom_rational */ + +/*!\brief Multi-pass Encoding Pass */ +enum aom_enc_pass { + AOM_RC_ONE_PASS, /**< Single pass mode */ + AOM_RC_FIRST_PASS, /**< First pass of multi-pass mode */ + AOM_RC_LAST_PASS /**< Final pass of multi-pass mode */ +}; + +/*!\brief Rate control mode */ +enum aom_rc_mode { + AOM_VBR, /**< Variable Bit Rate (VBR) mode */ + AOM_CBR, /**< Constant Bit Rate (CBR) mode */ + AOM_CQ, /**< Constrained Quality (CQ) mode */ + AOM_Q, /**< Constant Quality (Q) mode */ +}; + +/*!\brief Keyframe placement mode. + * + * This enumeration determines whether keyframes are placed automatically by + * the encoder or whether this behavior is disabled. Older releases of this + * SDK were implemented such that AOM_KF_FIXED meant keyframes were disabled. + * This name is confusing for this behavior, so the new symbols to be used + * are AOM_KF_AUTO and AOM_KF_DISABLED. + */ +enum aom_kf_mode { + AOM_KF_FIXED, /**< deprecated, implies AOM_KF_DISABLED */ + AOM_KF_AUTO, /**< Encoder determines optimal placement automatically */ + AOM_KF_DISABLED = 0 /**< Encoder does not place keyframes. */ +}; + +/*!\brief Encoder Config Options + * + * This type allows to enumerate and control flags defined for encoder control + * via config file at runtime. 
+ */ +typedef struct cfg_options { + /*!\brief Indicate init by cfg file + * 0 or 1 + */ + unsigned int init_by_cfg_file; + /*!\brief Superblock size + * 0, 64 or 128 + */ + unsigned int super_block_size; + /*!\brief max partition size + * 8, 16, 32, 64, 128 + */ + unsigned int max_partition_size; + /*!\brief min partition size + * 8, 16, 32, 64, 128 + */ + unsigned int min_partition_size; + /*!\brief disable AB Shape partition type + * + */ + unsigned int disable_ab_partition_type; + /*!\brief disable rectangular partition type + * + */ + unsigned int disable_rect_partition_type; + /*!\brief disable 1:4/4:1 partition type + * + */ + unsigned int disable_1to4_partition_type; + /*!\brief disable flip and identity transform type + * + */ + unsigned int disable_flip_idtx; + /*!\brief disable CDEF filter + * + */ + unsigned int disable_cdef; + /*!\brief disable Loop Restoration Filter + * + */ + unsigned int disable_lr; + /*!\brief disable OBMC + * + */ + unsigned int disable_obmc; + /*!\brief disable Warped Motion + * + */ + unsigned int disable_warp_motion; + /*!\brief disable global motion + * + */ + unsigned int disable_global_motion; + /*!\brief disable dist weighted compound + * + */ + unsigned int disable_dist_wtd_comp; + /*!\brief disable diff weighted compound + * + */ + unsigned int disable_diff_wtd_comp; + /*!\brief disable inter/intra compound + * + */ + unsigned int disable_inter_intra_comp; + /*!\brief disable masked compound + * + */ + unsigned int disable_masked_comp; + /*!\brief disable one sided compound + * + */ + unsigned int disable_one_sided_comp; + /*!\brief disable Palette + * + */ + unsigned int disable_palette; + /*!\brief disable Intra Block Copy + * + */ + unsigned int disable_intrabc; + /*!\brief disable chroma from luma + * + */ + unsigned int disable_cfl; + /*!\brief disable intra smooth mode + * + */ + unsigned int disable_smooth_intra; + /*!\brief disable filter intra + * + */ + unsigned int disable_filter_intra; + /*!\brief disable dual filter + * + */ + unsigned int disable_dual_filter; + /*!\brief disable intra angle delta + * + */ + unsigned int disable_intra_angle_delta; + /*!\brief disable intra edge filter + * + */ + unsigned int disable_intra_edge_filter; + /*!\brief disable 64x64 transform + * + */ + unsigned int disable_tx_64x64; + /*!\brief disable smooth inter/intra + * + */ + unsigned int disable_smooth_inter_intra; + /*!\brief disable inter/inter wedge comp + * + */ + unsigned int disable_inter_inter_wedge; + /*!\brief disable inter/intra wedge comp + * + */ + unsigned int disable_inter_intra_wedge; + /*!\brief disable paeth intra + * + */ + unsigned int disable_paeth_intra; + /*!\brief disable trellis quantization + * + */ + unsigned int disable_trellis_quant; + /*!\brief disable ref frame MV + * + */ + unsigned int disable_ref_frame_mv; + /*!\brief use reduced reference frame set + * + */ + unsigned int reduced_reference_set; + /*!\brief use reduced transform type set + * + */ + unsigned int reduced_tx_type_set; +} cfg_options_t; + +/*!\brief Encoded Frame Flags + * + * This type indicates a bitfield to be passed to aom_codec_encode(), defining + * per-frame boolean values. By convention, bits common to all codecs will be + * named AOM_EFLAG_*, and bits specific to an algorithm will be named + * /algo/_eflag_*. The lower order 16 bits are reserved for common use. 
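+ *
+ * As an illustrative sketch (ctx, img and pts are assumed to be set up by
+ * the caller as in a normal encode loop), a frame can be forced to be a
+ * keyframe by passing #AOM_EFLAG_FORCE_KF to aom_codec_encode():
+ * \code
+ *   aom_enc_frame_flags_t flags = AOM_EFLAG_FORCE_KF;
+ *   aom_codec_err_t res = aom_codec_encode(&ctx, img, pts, 1, flags);
+ *   // res != AOM_CODEC_OK indicates the frame was not accepted.
+ * \endcode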
+ */ +typedef long aom_enc_frame_flags_t; +#define AOM_EFLAG_FORCE_KF (1 << 0) /**< Force this frame to be a keyframe */ + +/*!\brief Encoder configuration structure + * + * This structure contains the encoder settings that have common representations + * across all codecs. This doesn't imply that all codecs support all features, + * however. + */ +typedef struct aom_codec_enc_cfg { + /* + * generic settings (g) + */ + + /*!\brief Algorithm specific "usage" value + * + * Algorithms may define multiple values for usage, which may convey the + * intent of how the application intends to use the stream. If this value + * is non-zero, consult the documentation for the codec to determine its + * meaning. + */ + unsigned int g_usage; + + /*!\brief Maximum number of threads to use + * + * For multi-threaded implementations, use no more than this number of + * threads. The codec may use fewer threads than allowed. The value + * 0 is equivalent to the value 1. + */ + unsigned int g_threads; + + /*!\brief Bitstream profile to use + * + * Some codecs support a notion of multiple bitstream profiles. Typically + * this maps to a set of features that are turned on or off. Often the + * profile to use is determined by the features of the intended decoder. + * Consult the documentation for the codec to determine the valid values + * for this parameter, or set to zero for a sane default. + */ + unsigned int g_profile; /**< profile of bitstream to use */ + + /*!\brief Width of the frame + * + * This value identifies the presentation resolution of the frame, + * in pixels. Note that the frames passed as input to the encoder must + * have this resolution. Frames will be presented by the decoder in this + * resolution, independent of any spatial resampling the encoder may do. + */ + unsigned int g_w; + + /*!\brief Height of the frame + * + * This value identifies the presentation resolution of the frame, + * in pixels. Note that the frames passed as input to the encoder must + * have this resolution. Frames will be presented by the decoder in this + * resolution, independent of any spatial resampling the encoder may do. + */ + unsigned int g_h; + + /*!\brief Max number of frames to encode + * + */ + unsigned int g_limit; + + /*!\brief Forced maximum width of the frame + * + * If this value is non-zero then it is used to force the maximum frame + * width written in write_sequence_header(). + */ + unsigned int g_forced_max_frame_width; + + /*!\brief Forced maximum height of the frame + * + * If this value is non-zero then it is used to force the maximum frame + * height written in write_sequence_header(). + */ + unsigned int g_forced_max_frame_height; + + /*!\brief Bit-depth of the codec + * + * This value identifies the bit_depth of the codec, + * Only certain bit-depths are supported as identified in the + * aom_bit_depth_t enum. + */ + aom_bit_depth_t g_bit_depth; + + /*!\brief Bit-depth of the input frames + * + * This value identifies the bit_depth of the input frames in bits. + * Note that the frames passed as input to the encoder must have + * this bit-depth. + */ + unsigned int g_input_bit_depth; + + /*!\brief Stream timebase units + * + * Indicates the smallest interval of time, in seconds, used by the stream. + * For fixed frame rate material, or variable frame rate material where + * frames are timed at a multiple of a given clock (ex: video capture), + * the \ref RECOMMENDED method is to set the timebase to the reciprocal + * of the frame rate (ex: 1001/30000 for 29.970 Hz NTSC). 
This allows the + * pts to correspond to the frame number, which can be handy. For + * re-encoding video from containers with absolute time timestamps, the + * \ref RECOMMENDED method is to set the timebase to that of the parent + * container or multimedia framework (ex: 1/1000 for ms, as in FLV). + */ + struct aom_rational g_timebase; + + /*!\brief Enable error resilient modes. + * + * The error resilient bitfield indicates to the encoder which features + * it should enable to take measures for streaming over lossy or noisy + * links. + */ + aom_codec_er_flags_t g_error_resilient; + + /*!\brief Multi-pass Encoding Mode + * + * This value should be set to the current phase for multi-pass encoding. + * For single pass, set to #AOM_RC_ONE_PASS. + */ + enum aom_enc_pass g_pass; + + /*!\brief Allow lagged encoding + * + * If set, this value allows the encoder to consume a number of input + * frames before producing output frames. This allows the encoder to + * base decisions for the current frame on future frames. This does + * increase the latency of the encoding pipeline, so it is not appropriate + * in all situations (ex: realtime encoding). + * + * Note that this is a maximum value -- the encoder may produce frames + * sooner than the given limit. Set this value to 0 to disable this + * feature. + */ + unsigned int g_lag_in_frames; + + /* + * rate control settings (rc) + */ + + /*!\brief Temporal resampling configuration, if supported by the codec. + * + * Temporal resampling allows the codec to "drop" frames as a strategy to + * meet its target data rate. This can cause temporal discontinuities in + * the encoded video, which may appear as stuttering during playback. This + * trade-off is often acceptable, but for many applications is not. It can + * be disabled in these cases. + * + * Note that not all codecs support this feature. All aom AVx codecs do. + * For other codecs, consult the documentation for that algorithm. + * + * This threshold is described as a percentage of the target data buffer. + * When the data buffer falls below this percentage of fullness, a + * dropped frame is indicated. Set the threshold to zero (0) to disable + * this feature. + */ + unsigned int rc_dropframe_thresh; + + /*!\brief Mode for spatial resampling, if supported by the codec. + * + * Spatial resampling allows the codec to compress a lower resolution + * version of the frame, which is then upscaled by the decoder to the + * correct presentation resolution. This increases visual quality at + * low data rates, at the expense of CPU time on the encoder/decoder. + */ + unsigned int rc_resize_mode; + + /*!\brief Frame resize denominator. + * + * The denominator for resize to use, assuming 8 as the numerator. + * + * Valid denominators are 8 - 16 for now. + */ + unsigned int rc_resize_denominator; + + /*!\brief Keyframe resize denominator. + * + * The denominator for resize to use, assuming 8 as the numerator. + * + * Valid denominators are 8 - 16 for now. + */ + unsigned int rc_resize_kf_denominator; + + /*!\brief Frame super-resolution scaling mode. + * + * Similar to spatial resampling, frame super-resolution integrates + * upscaling after the encode/decode process. Taking control of upscaling and + * using restoration filters should allow it to outperform normal resizing. + * + * Valid values are 0 to 4 as defined in enum SUPERRES_MODE. + */ + unsigned int rc_superres_mode; + + /*!\brief Frame super-resolution denominator. + * + * The denominator for superres to use. 
If fixed it will only change if the + * cumulative scale change over resizing and superres is greater than 1/2; + * this forces superres to reduce scaling. + * + * Valid denominators are 8 to 16. + * + * Used only by SUPERRES_FIXED. + */ + unsigned int rc_superres_denominator; + + /*!\brief Keyframe super-resolution denominator. + * + * The denominator for superres to use. If fixed it will only change if the + * cumulative scale change over resizing and superres is greater than 1/2; + * this forces superres to reduce scaling. + * + * Valid denominators are 8 - 16 for now. + */ + unsigned int rc_superres_kf_denominator; + + /*!\brief Frame super-resolution q threshold. + * + * The q level threshold after which superres is used. + * Valid values are 1 to 63. + * + * Used only by SUPERRES_QTHRESH + */ + unsigned int rc_superres_qthresh; + + /*!\brief Keyframe super-resolution q threshold. + * + * The q level threshold after which superres is used for key frames. + * Valid values are 1 to 63. + * + * Used only by SUPERRES_QTHRESH + */ + unsigned int rc_superres_kf_qthresh; + + /*!\brief Rate control algorithm to use. + * + * Indicates whether the end usage of this stream is to be streamed over + * a bandwidth constrained link, indicating that Constant Bit Rate (CBR) + * mode should be used, or whether it will be played back on a high + * bandwidth link, as from a local disk, where higher variations in + * bitrate are acceptable. + */ + enum aom_rc_mode rc_end_usage; + + /*!\brief Two-pass stats buffer. + * + * A buffer containing all of the stats packets produced in the first + * pass, concatenated. + */ + aom_fixed_buf_t rc_twopass_stats_in; + + /*!\brief first pass mb stats buffer. + * + * A buffer containing all of the first pass mb stats packets produced + * in the first pass, concatenated. + */ + aom_fixed_buf_t rc_firstpass_mb_stats_in; + + /*!\brief Target data rate + * + * Target bandwidth to use for this stream, in kilobits per second. + */ + unsigned int rc_target_bitrate; + + /* + * quantizer settings + */ + + /*!\brief Minimum (Best Quality) Quantizer + * + * The quantizer is the most direct control over the quality of the + * encoded image. The range of valid values for the quantizer is codec + * specific. Consult the documentation for the codec to determine the + * values to use. To determine the range programmatically, call + * aom_codec_enc_config_default() with a usage value of 0. + */ + unsigned int rc_min_quantizer; + + /*!\brief Maximum (Worst Quality) Quantizer + * + * The quantizer is the most direct control over the quality of the + * encoded image. The range of valid values for the quantizer is codec + * specific. Consult the documentation for the codec to determine the + * values to use. To determine the range programmatically, call + * aom_codec_enc_config_default() with a usage value of 0. + */ + unsigned int rc_max_quantizer; + + /* + * bitrate tolerance + */ + + /*!\brief Rate control adaptation undershoot control + * + * This value, expressed as a percentage of the target bitrate, + * controls the maximum allowed adaptation speed of the codec. + * This factor controls the maximum amount of bits that can + * be subtracted from the target bitrate in order to compensate + * for prior overshoot. + * + * Valid values in the range 0-1000. + */ + unsigned int rc_undershoot_pct; + + /*!\brief Rate control adaptation overshoot control + * + * This value, expressed as a percentage of the target bitrate, + * controls the maximum allowed adaptation speed of the codec. 
+ * This factor controls the maximum amount of bits that can + * be added to the target bitrate in order to compensate for + * prior undershoot. + * + * Valid values in the range 0-1000. + */ + unsigned int rc_overshoot_pct; + + /* + * decoder buffer model parameters + */ + + /*!\brief Decoder Buffer Size + * + * This value indicates the amount of data that may be buffered by the + * decoding application. Note that this value is expressed in units of + * time (milliseconds). For example, a value of 5000 indicates that the + * client will buffer (at least) 5000ms worth of encoded data. Use the + * target bitrate (#rc_target_bitrate) to convert to bits/bytes, if + * necessary. + */ + unsigned int rc_buf_sz; + + /*!\brief Decoder Buffer Initial Size + * + * This value indicates the amount of data that will be buffered by the + * decoding application prior to beginning playback. This value is + * expressed in units of time (milliseconds). Use the target bitrate + * (#rc_target_bitrate) to convert to bits/bytes, if necessary. + */ + unsigned int rc_buf_initial_sz; + + /*!\brief Decoder Buffer Optimal Size + * + * This value indicates the amount of data that the encoder should try + * to maintain in the decoder's buffer. This value is expressed in units + * of time (milliseconds). Use the target bitrate (#rc_target_bitrate) + * to convert to bits/bytes, if necessary. + */ + unsigned int rc_buf_optimal_sz; + + /* + * 2 pass rate control parameters + */ + + /*!\brief Two-pass mode CBR/VBR bias + * + * Bias, expressed on a scale of 0 to 100, for determining target size + * for the current frame. The value 0 indicates the optimal CBR mode + * value should be used. The value 100 indicates the optimal VBR mode + * value should be used. Values in between indicate which way the + * encoder should "lean." + */ + unsigned int rc_2pass_vbr_bias_pct; + + /*!\brief Two-pass mode per-GOP minimum bitrate + * + * This value, expressed as a percentage of the target bitrate, indicates + * the minimum bitrate to be used for a single GOP (aka "section") + */ + unsigned int rc_2pass_vbr_minsection_pct; + + /*!\brief Two-pass mode per-GOP maximum bitrate + * + * This value, expressed as a percentage of the target bitrate, indicates + * the maximum bitrate to be used for a single GOP (aka "section") + */ + unsigned int rc_2pass_vbr_maxsection_pct; + + /* + * keyframing settings (kf) + */ + + /*!\brief Option to enable forward reference key frame + * + */ + int fwd_kf_enabled; + + /*!\brief Keyframe placement mode + * + * This value indicates whether the encoder should place keyframes at a + * fixed interval, or determine the optimal placement automatically + * (as governed by the #kf_min_dist and #kf_max_dist parameters) + */ + enum aom_kf_mode kf_mode; + + /*!\brief Keyframe minimum interval + * + * This value, expressed as a number of frames, prevents the encoder from + * placing a keyframe nearer than kf_min_dist to the previous keyframe. At + * least kf_min_dist frames non-keyframes will be coded before the next + * keyframe. Set kf_min_dist equal to kf_max_dist for a fixed interval. + */ + unsigned int kf_min_dist; + + /*!\brief Keyframe maximum interval + * + * This value, expressed as a number of frames, forces the encoder to code + * a keyframe if one has not been coded in the last kf_max_dist frames. + * A value of 0 implies all frames will be keyframes. Set kf_min_dist + * equal to kf_max_dist for a fixed interval. 
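+ *
+ * For example, a fixed two-second keyframe interval at 30 fps could be
+ * requested as follows (an illustrative sketch; cfg is assumed to be an
+ * aom_codec_enc_cfg_t obtained from aom_codec_enc_config_default()):
+ * \code
+ *   cfg.kf_mode = AOM_KF_AUTO;
+ *   cfg.kf_min_dist = 60;  // 2 seconds at 30 fps
+ *   cfg.kf_max_dist = 60;  // min == max gives a fixed interval
+ * \endcode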
+ */ + unsigned int kf_max_dist; + + /*!\brief sframe interval + * + * This value, expressed as a number of frames, forces the encoder to code + * an S-Frame every sframe_dist frames. + */ + unsigned int sframe_dist; + + /*!\brief sframe insertion mode + * + * This value must be set to 1 or 2, and tells the encoder how to insert + * S-Frames. It will only have an effect if sframe_dist != 0. + * + * If altref is enabled: + * - if sframe_mode == 1, the considered frame will be made into an + * S-Frame only if it is an altref frame + * - if sframe_mode == 2, the next altref frame will be made into an + * S-Frame. + * + * Otherwise: the considered frame will be made into an S-Frame. + */ + unsigned int sframe_mode; + + /*!\brief Tile coding mode + * + * This value indicates the tile coding mode. + * A value of 0 implies a normal non-large-scale tile coding. A value of 1 + * implies a large-scale tile coding. + */ + unsigned int large_scale_tile; + + /*!\brief Monochrome mode + * + * If this is nonzero, the encoder will generate a monochrome stream + * with no chroma planes. + */ + unsigned int monochrome; + + /*!\brief full_still_picture_hdr + * + * If this is nonzero, the encoder will generate a full header even for + * still picture encoding. if zero, a reduced header is used for still + * picture. This flag has no effect when a regular video with more than + * a single frame is encoded. + */ + unsigned int full_still_picture_hdr; + + /*!\brief Bitstream syntax mode + * + * This value indicates the bitstream syntax mode. + * A value of 0 indicates bitstream is saved as Section 5 bitstream. A value + * of 1 indicates the bitstream is saved in Annex-B format + */ + unsigned int save_as_annexb; + + /*!\brief Number of explicit tile widths specified + * + * This value indicates the number of tile widths specified + * A value of 0 implies no tile widths are specified. + * Tile widths are given in the array tile_widths[] + */ + int tile_width_count; + + /*!\brief Number of explicit tile heights specified + * + * This value indicates the number of tile heights specified + * A value of 0 implies no tile heights are specified. + * Tile heights are given in the array tile_heights[] + */ + int tile_height_count; + +/*!\brief Maximum number of tile widths in tile widths array + * + * This define gives the maximum number of elements in the tile_widths array. + */ +#define MAX_TILE_WIDTHS 64 // maximum tile width array length + + /*!\brief Array of specified tile widths + * + * This array specifies tile widths (and may be empty) + * The number of widths specified is given by tile_width_count + */ + int tile_widths[MAX_TILE_WIDTHS]; + +/*!\brief Maximum number of tile heights in tile heights array. + * + * This define gives the maximum number of elements in the tile_heights array. + */ +#define MAX_TILE_HEIGHTS 64 // maximum tile height array length + + /*!\brief Array of specified tile heights + * + * This array specifies tile heights (and may be empty) + * The number of heights specified is given by tile_height_count + */ + int tile_heights[MAX_TILE_HEIGHTS]; + + /*!\brief Whether encoder should use fixed QP offsets. + * + * If a value of 1 is provided, encoder will use fixed QP offsets for frames + * at different levels of the pyramid. + * - If 'fixed_qp_offsets' is also provided, encoder will use the given + * offsets + * - If not, encoder will select the fixed offsets based on the cq-level + * provided. 
+ *   If a value of 0 is provided and fixed_qp_offsets are not provided,
+ *   encoder will NOT use fixed QP offsets.
+ *   Note: This option is only relevant for --end-usage=q.
+ */
+  unsigned int use_fixed_qp_offsets;
+
+/*!\brief Number of fixed QP offsets
+ *
+ * This defines the number of elements in the fixed_qp_offsets array.
+ */
+#define FIXED_QP_OFFSET_COUNT 5
+
+  /*!\brief Array of fixed QP offsets
+   *
+   * This array specifies fixed QP offsets (range: 0 to 63) for frames at
+   * different levels of the pyramid. It is a comma-separated list of 5 values:
+   * - QP offset for keyframe
+   * - QP offset for ALTREF frame
+   * - QP offset for 1st level internal ARF
+   * - QP offset for 2nd level internal ARF
+   * - QP offset for 3rd level internal ARF
+   * Notes:
+   * - QP offset for leaf level frames is not explicitly specified. These
+   *   frames use the worst quality allowed (--cq-level).
+   * - This option is only relevant for --end-usage=q.
+   */
+  int fixed_qp_offsets[FIXED_QP_OFFSET_COUNT];
+
+  /*!\brief Options defined per config file
+   */
+  cfg_options_t encoder_cfg;
+} aom_codec_enc_cfg_t; /**< alias for struct aom_codec_enc_cfg */
+
+/*!\brief Initialize an encoder instance
+ *
+ * Initializes an encoder context using the given interface. Applications
+ * should call the aom_codec_enc_init convenience macro instead of this
+ * function directly, to ensure that the ABI version number parameter
+ * is properly initialized.
+ *
+ * If the library was configured with --disable-multithread, this call
+ * is not thread safe and should be guarded with a lock if being used
+ * in a multithreaded context.
+ *
+ * \param[in]    ctx     Pointer to this instance's context.
+ * \param[in]    iface   Pointer to the algorithm interface to use.
+ * \param[in]    cfg     Configuration to use, if known.
+ * \param[in]    flags   Bitfield of AOM_CODEC_USE_* flags
+ * \param[in]    ver     ABI version number. Must be set to
+ *                       AOM_ENCODER_ABI_VERSION
+ * \retval #AOM_CODEC_OK
+ *     The encoder algorithm was initialized.
+ * \retval #AOM_CODEC_MEM_ERROR
+ *     Memory allocation failed.
+ */
+aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
+                                       aom_codec_iface_t *iface,
+                                       const aom_codec_enc_cfg_t *cfg,
+                                       aom_codec_flags_t flags, int ver);
+
+/*!\brief Convenience macro for aom_codec_enc_init_ver()
+ *
+ * Ensures the ABI version parameter is properly set.
+ */
+#define aom_codec_enc_init(ctx, iface, cfg, flags) \
+  aom_codec_enc_init_ver(ctx, iface, cfg, flags, AOM_ENCODER_ABI_VERSION)
+
+/*!\brief Get the default configuration for a usage.
+ *
+ * Initializes an encoder configuration structure with default values. Supports
+ * the notion of "usages" so that an algorithm may offer different default
+ * settings depending on the user's intended goal. This function \ref SHOULD
+ * be called by all applications to initialize the configuration structure
+ * before specializing the configuration with application specific values.
+ *
+ * \param[in]    iface     Pointer to the algorithm interface to use.
+ * \param[out]   cfg       Configuration buffer to populate.
+ * \param[in]    usage     Algorithm specific usage value. For AV1, must be
+ *                         set to AOM_USAGE_GOOD_QUALITY (0) or
+ *                         AOM_USAGE_REALTIME (1).
+ *
+ * \retval #AOM_CODEC_OK
+ *     The configuration was populated.
+ * \retval #AOM_CODEC_INCAPABLE
+ *     Interface is not an encoder interface.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ *     A parameter was NULL, or the usage value was not recognized.
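+ *
+ * A minimal usage sketch (iface is assumed to be a valid encoder interface,
+ * e.g. the value returned by aom_codec_av1_cx()):
+ * \code
+ *   aom_codec_enc_cfg_t cfg;
+ *   aom_codec_ctx_t ctx;
+ *   if (aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY) !=
+ *       AOM_CODEC_OK)
+ *     return -1;  // error handling is caller-defined
+ *   cfg.g_w = 640;
+ *   cfg.g_h = 480;
+ *   cfg.g_timebase.num = 1;
+ *   cfg.g_timebase.den = 30;
+ *   cfg.rc_target_bitrate = 1000;  // kilobits per second
+ *   if (aom_codec_enc_init(&ctx, iface, &cfg, 0) != AOM_CODEC_OK)
+ *     return -1;
+ * \endcode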
+ */ +aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface, + aom_codec_enc_cfg_t *cfg, + unsigned int usage); + +/*!\brief Set or change configuration + * + * Reconfigures an encoder instance according to the given configuration. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cfg Configuration buffer to use + * + * \retval #AOM_CODEC_OK + * The configuration was populated. + * \retval #AOM_CODEC_INCAPABLE + * Interface is not an encoder interface. + * \retval #AOM_CODEC_INVALID_PARAM + * A parameter was NULL, or the usage value was not recognized. + */ +aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx, + const aom_codec_enc_cfg_t *cfg); + +/*!\brief Get global stream headers + * + * Retrieves a stream level global header packet, if supported by the codec. + * Calls to this function should be deferred until all configuration information + * has been passed to libaom. Otherwise the global header data may be + * invalidated by additional configuration changes. + * + * The AV1 implementation of this function returns an OBU. The OBU returned is + * in Low Overhead Bitstream Format. Specifically, the obu_has_size_field bit is + * set, and the buffer contains the obu_size field for the returned OBU. + * + * \param[in] ctx Pointer to this instance's context + * + * \retval NULL + * Encoder does not support global header, or an error occurred while + * generating the global header. + * + * \retval Non-NULL + * Pointer to buffer containing global header packet. The caller owns the + * memory associated with this buffer, and must free the 'buf' member of the + * aom_fixed_buf_t as well as the aom_fixed_buf_t pointer. Memory returned + * must be freed via call to free(). + */ +aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx); + +/*!\brief usage parameter analogous to AV1 GOOD QUALITY mode. */ +#define AOM_USAGE_GOOD_QUALITY (0) +/*!\brief usage parameter analogous to AV1 REALTIME mode. */ +#define AOM_USAGE_REALTIME (1) + +/*!\brief Encode a frame + * + * Encodes a video frame at the given "presentation time." The presentation + * time stamp (PTS) \ref MUST be strictly increasing. + * + * When the last frame has been passed to the encoder, this function should + * continue to be called, with the img parameter set to NULL. This will + * signal the end-of-stream condition to the encoder and allow it to encode + * any held buffers. Encoding is complete when aom_codec_encode() is called + * and aom_codec_get_cx_data() returns no data. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] img Image data to encode, NULL to flush. + * \param[in] pts Presentation time stamp, in timebase units. + * \param[in] duration Duration to show frame, in timebase units. + * \param[in] flags Flags to use for encoding this frame. + * + * \retval #AOM_CODEC_OK + * The configuration was populated. + * \retval #AOM_CODEC_INCAPABLE + * Interface is not an encoder interface. + * \retval #AOM_CODEC_INVALID_PARAM + * A parameter was NULL, the image format is unsupported, etc. + */ +aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img, + aom_codec_pts_t pts, unsigned long duration, + aom_enc_frame_flags_t flags); + +/*!\brief Set compressed data output buffer + * + * Sets the buffer that the codec should output the compressed data + * into. This call effectively sets the buffer pointer returned in the + * next AOM_CODEC_CX_FRAME_PKT packet. Subsequent packets will be + * appended into this buffer. 
The buffer is preserved across frames, + * so applications must periodically call this function after flushing + * the accumulated compressed data to disk or to the network to reset + * the pointer to the buffer's head. + * + * `pad_before` bytes will be skipped before writing the compressed + * data, and `pad_after` bytes will be appended to the packet. The size + * of the packet will be the sum of the size of the actual compressed + * data, pad_before, and pad_after. The padding bytes will be preserved + * (not overwritten). + * + * Note that calling this function does not guarantee that the returned + * compressed data will be placed into the specified buffer. In the + * event that the encoded data will not fit into the buffer provided, + * the returned packet \ref MAY point to an internal buffer, as it would + * if this call were never used. In this event, the output packet will + * NOT have any padding, and the application must free space and copy it + * to the proper place. This is of particular note in configurations + * that may output multiple packets for a single encoded frame (e.g., lagged + * encoding) or if the application does not reset the buffer periodically. + * + * Applications may restore the default behavior of the codec providing + * the compressed data buffer by calling this function with a NULL + * buffer. + * + * Applications \ref MUSTNOT call this function during iteration of + * aom_codec_get_cx_data(). + * + * \param[in] ctx Pointer to this instance's context + * \param[in] buf Buffer to store compressed data into + * \param[in] pad_before Bytes to skip before writing compressed data + * \param[in] pad_after Bytes to skip after writing compressed data + * + * \retval #AOM_CODEC_OK + * The buffer was set successfully. + * \retval #AOM_CODEC_INVALID_PARAM + * A parameter was NULL, the image format is unsupported, etc. + */ +aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx, + const aom_fixed_buf_t *buf, + unsigned int pad_before, + unsigned int pad_after); + +/*!\brief Encoded data iterator + * + * Iterates over a list of data packets to be passed from the encoder to the + * application. The different kinds of packets available are enumerated in + * #aom_codec_cx_pkt_kind. + * + * #AOM_CODEC_CX_FRAME_PKT packets should be passed to the application's + * muxer. Multiple compressed frames may be in the list. + * #AOM_CODEC_STATS_PKT packets should be appended to a global buffer. + * + * The application \ref MUST silently ignore any packet kinds that it does + * not recognize or support. + * + * The data buffers returned from this function are only guaranteed to be + * valid until the application makes another call to any aom_codec_* function. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an output data packet (compressed frame data, + * two-pass statistics, etc.) or NULL to signal end-of-list. + * + */ +const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx, + aom_codec_iter_t *iter); + +/*!\brief Get Preview Frame + * + * Returns an image that can be used as a preview. Shows the image as it would + * exist at the decompressor. The application \ref MUST NOT write into this + * image buffer. + * + * \param[in] ctx Pointer to this instance's context + * + * \return Returns a pointer to a preview image, or NULL if no image is + * available. 
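+ *
+ * An illustrative sketch of one encode iteration (ctx, img and pts are
+ * assumed to be initialized by the caller; write_packet() stands in for a
+ * hypothetical application muxer):
+ * \code
+ *   aom_codec_iter_t iter = NULL;
+ *   const aom_codec_cx_pkt_t *pkt;
+ *   if (aom_codec_encode(&ctx, img, pts, 1, 0) != AOM_CODEC_OK)
+ *     return -1;
+ *   while ((pkt = aom_codec_get_cx_data(&ctx, &iter)) != NULL) {
+ *     if (pkt->kind == AOM_CODEC_CX_FRAME_PKT)
+ *       write_packet(pkt->data.frame.buf, pkt->data.frame.sz);
+ *   }
+ *   const aom_image_t *preview = aom_codec_get_preview_frame(&ctx);
+ *   // preview is NULL when no image is available.
+ * \endcode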
+ *
+ */
+const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx);
+
+/*!@} - end defgroup encoder*/
+#ifdef __cplusplus
+}
+#endif
+#endif  // AOM_AOM_AOM_ENCODER_H_
diff --git a/libs/libaom/src/aom/aom_frame_buffer.h b/libs/libaom/src/aom/aom_frame_buffer.h
new file mode 100644
index 000000000..a715645a7
--- /dev/null
+++ b/libs/libaom/src/aom/aom_frame_buffer.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_AOM_FRAME_BUFFER_H_
+#define AOM_AOM_AOM_FRAME_BUFFER_H_
+
+/*!\file
+ * \brief Describes the decoder external frame buffer interface.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom/aom_integer.h"
+
+/*!\brief The maximum number of work buffers used by libaom.
+ *  Supports up to 8 threads decoding video in parallel.
+ *  Each thread will use one work buffer.
+ * TODO(hkuang): Add support to set number of worker threads dynamically.
+ */
+#define AOM_MAXIMUM_WORK_BUFFERS 8
+
+/*!\brief The maximum number of reference buffers that an AV1 encoder may use.
+ */
+#define AOM_MAXIMUM_REF_BUFFERS 8
+
+/*!\brief External frame buffer
+ *
+ * This structure holds allocated frame buffers used by the decoder.
+ */
+typedef struct aom_codec_frame_buffer {
+  uint8_t *data; /**< Pointer to the data buffer */
+  size_t size;   /**< Size of data in bytes */
+  void *priv;    /**< Frame's private data */
+} aom_codec_frame_buffer_t;
+
+/*!\brief get frame buffer callback prototype
+ *
+ * This callback is invoked by the decoder to retrieve data for the frame
+ * buffer in order for the decode call to complete. The callback must
+ * allocate at least min_size in bytes and assign it to fb->data. The callback
+ * must zero out all the data allocated. Then the callback must set fb->size
+ * to the allocated size. The application does not need to align the allocated
+ * data. The callback is triggered when the decoder needs a frame buffer to
+ * decode a compressed image into. This function may be called more than once
+ * for every call to aom_codec_decode. The application may set fb->priv to
+ * some data which will be passed back in the aom_image_t and the release
+ * function call. |fb| is guaranteed to not be NULL. On success the callback
+ * must return 0. On failure the callback must return a value less than 0.
+ *
+ * \param[in] priv         Callback's private data
+ * \param[in] min_size     Size in bytes needed by the buffer
+ * \param[in,out] fb       Pointer to aom_codec_frame_buffer_t
+ */
+typedef int (*aom_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size,
+                                            aom_codec_frame_buffer_t *fb);
+
+/*!\brief release frame buffer callback prototype
+ *
+ * This callback is invoked by the decoder when the frame buffer is not
+ * referenced by any other buffers. |fb| is guaranteed to not be NULL. On
+ * success the callback must return 0. On failure the callback must return
+ * a value less than 0.
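+ *
+ * An illustrative sketch of a matching callback pair (the my_* functions
+ * are hypothetical application code; calloc/free come from <stdlib.h>):
+ * \code
+ *   static int my_get_fb(void *priv, size_t min_size,
+ *                        aom_codec_frame_buffer_t *fb) {
+ *     (void)priv;
+ *     fb->data = (uint8_t *)calloc(min_size, 1);  // must be zeroed
+ *     if (!fb->data) return -1;
+ *     fb->size = min_size;
+ *     fb->priv = NULL;
+ *     return 0;
+ *   }
+ *   static int my_release_fb(void *priv, aom_codec_frame_buffer_t *fb) {
+ *     (void)priv;
+ *     free(fb->data);
+ *     fb->data = NULL;
+ *     return 0;
+ *   }
+ * \endcode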
+ * + * \param[in] priv Callback's private data + * \param[in] fb Pointer to aom_codec_frame_buffer_t + */ +typedef int (*aom_release_frame_buffer_cb_fn_t)(void *priv, + aom_codec_frame_buffer_t *fb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOM_FRAME_BUFFER_H_ diff --git a/libs/libaom/src/aom/aom_image.h b/libs/libaom/src/aom/aom_image.h new file mode 100644 index 000000000..bb6973f9c --- /dev/null +++ b/libs/libaom/src/aom/aom_image.h @@ -0,0 +1,430 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Describes the aom image descriptor and associated operations + * + */ +#ifndef AOM_AOM_AOM_IMAGE_H_ +#define AOM_AOM_AOM_IMAGE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom/aom_integer.h" + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define AOM_IMAGE_ABI_VERSION (9) /**<\hideinitializer*/ + +#define AOM_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */ +#define AOM_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */ +/** 0x400 used to signal alpha channel, skipping for backwards compatibility. */ +#define AOM_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. 
*/
+
+/*!\brief List of supported image formats */
+typedef enum aom_img_fmt {
+  AOM_IMG_FMT_NONE,
+  AOM_IMG_FMT_YV12 =
+      AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
+  AOM_IMG_FMT_I420 = AOM_IMG_FMT_PLANAR | 2,
+  AOM_IMG_FMT_AOMYV12 = AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP |
+                        3, /**< planar 4:2:0 format with aom color space */
+  AOM_IMG_FMT_AOMI420 = AOM_IMG_FMT_PLANAR | 4,
+  AOM_IMG_FMT_I422 = AOM_IMG_FMT_PLANAR | 5,
+  AOM_IMG_FMT_I444 = AOM_IMG_FMT_PLANAR | 6,
+  AOM_IMG_FMT_I42016 = AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH,
+  AOM_IMG_FMT_YV1216 = AOM_IMG_FMT_YV12 | AOM_IMG_FMT_HIGHBITDEPTH,
+  AOM_IMG_FMT_I42216 = AOM_IMG_FMT_I422 | AOM_IMG_FMT_HIGHBITDEPTH,
+  AOM_IMG_FMT_I44416 = AOM_IMG_FMT_I444 | AOM_IMG_FMT_HIGHBITDEPTH,
+} aom_img_fmt_t; /**< alias for enum aom_img_fmt */
+
+/*!\brief List of supported color primaries */
+typedef enum aom_color_primaries {
+  AOM_CICP_CP_RESERVED_0 = 0,   /**< For future use */
+  AOM_CICP_CP_BT_709 = 1,       /**< BT.709 */
+  AOM_CICP_CP_UNSPECIFIED = 2,  /**< Unspecified */
+  AOM_CICP_CP_RESERVED_3 = 3,   /**< For future use */
+  AOM_CICP_CP_BT_470_M = 4,     /**< BT.470 System M (historical) */
+  AOM_CICP_CP_BT_470_B_G = 5,   /**< BT.470 System B, G (historical) */
+  AOM_CICP_CP_BT_601 = 6,       /**< BT.601 */
+  AOM_CICP_CP_SMPTE_240 = 7,    /**< SMPTE 240 */
+  AOM_CICP_CP_GENERIC_FILM =
+      8, /**< Generic film (color filters using illuminant C) */
+  AOM_CICP_CP_BT_2020 = 9,      /**< BT.2020, BT.2100 */
+  AOM_CICP_CP_XYZ = 10,         /**< SMPTE 428 (CIE 1931 XYZ) */
+  AOM_CICP_CP_SMPTE_431 = 11,   /**< SMPTE RP 431-2 */
+  AOM_CICP_CP_SMPTE_432 = 12,   /**< SMPTE EG 432-1 */
+  AOM_CICP_CP_RESERVED_13 = 13, /**< For future use (values 13 - 21) */
+  AOM_CICP_CP_EBU_3213 = 22,    /**< EBU Tech. 3213-E */
+  AOM_CICP_CP_RESERVED_23 = 23  /**< For future use (values 23 - 255) */
+} aom_color_primaries_t; /**< alias for enum aom_color_primaries */
+
+/*!\brief List of supported transfer functions */
+typedef enum aom_transfer_characteristics {
+  AOM_CICP_TC_RESERVED_0 = 0,  /**< For future use */
+  AOM_CICP_TC_BT_709 = 1,      /**< BT.709 */
+  AOM_CICP_TC_UNSPECIFIED = 2, /**< Unspecified */
+  AOM_CICP_TC_RESERVED_3 = 3,  /**< For future use */
+  AOM_CICP_TC_BT_470_M = 4,    /**< BT.470 System M (historical) */
+  AOM_CICP_TC_BT_470_B_G = 5,  /**< BT.470 System B, G (historical) */
+  AOM_CICP_TC_BT_601 = 6,      /**< BT.601 */
+  AOM_CICP_TC_SMPTE_240 = 7,   /**< SMPTE 240 M */
+  AOM_CICP_TC_LINEAR = 8,      /**< Linear */
+  AOM_CICP_TC_LOG_100 = 9,     /**< Logarithmic (100 : 1 range) */
+  AOM_CICP_TC_LOG_100_SQRT10 =
+      10,                          /**< Logarithmic (100 * Sqrt(10) : 1 range) */
+  AOM_CICP_TC_IEC_61966 = 11,      /**< IEC 61966-2-4 */
+  AOM_CICP_TC_BT_1361 = 12,        /**< BT.1361 */
+  AOM_CICP_TC_SRGB = 13,           /**< sRGB or sYCC */
+  AOM_CICP_TC_BT_2020_10_BIT = 14, /**< BT.2020 10-bit systems */
+  AOM_CICP_TC_BT_2020_12_BIT = 15, /**< BT.2020 12-bit systems */
+  AOM_CICP_TC_SMPTE_2084 = 16,     /**< SMPTE ST 2084, ITU BT.2100 PQ */
+  AOM_CICP_TC_SMPTE_428 = 17,      /**< SMPTE ST 428 */
+  AOM_CICP_TC_HLG = 18,            /**< BT.2100 HLG, ARIB STD-B67 */
+  AOM_CICP_TC_RESERVED_19 = 19     /**< For future use (values 19-255) */
+} aom_transfer_characteristics_t; /**< alias for enum
+                                       aom_transfer_characteristics */
+
+/*!\brief List of supported matrix coefficients */
+typedef enum aom_matrix_coefficients {
+  AOM_CICP_MC_IDENTITY = 0,    /**< Identity matrix */
+  AOM_CICP_MC_BT_709 = 1,      /**< BT.709 */
+  AOM_CICP_MC_UNSPECIFIED = 2, /**< Unspecified */
+  AOM_CICP_MC_RESERVED_3 = 3,  /**< For future use */
+  AOM_CICP_MC_FCC = 4,         /**< US FCC 73.628 */
+  AOM_CICP_MC_BT_470_B_G = 5,  /**< BT.470 System B, G (historical) */
+  AOM_CICP_MC_BT_601 = 6,      /**< BT.601 */
+  AOM_CICP_MC_SMPTE_240 = 7,   /**< SMPTE 240 M */
+  AOM_CICP_MC_SMPTE_YCGCO = 8, /**< YCgCo */
+  AOM_CICP_MC_BT_2020_NCL =
+      9, /**< BT.2020 non-constant luminance, BT.2100 YCbCr */
+  AOM_CICP_MC_BT_2020_CL = 10, /**< BT.2020 constant luminance */
+  AOM_CICP_MC_SMPTE_2085 = 11, /**< SMPTE ST 2085 YDzDx */
+  AOM_CICP_MC_CHROMAT_NCL =
+      12, /**< Chromaticity-derived non-constant luminance */
+  AOM_CICP_MC_CHROMAT_CL = 13, /**< Chromaticity-derived constant luminance */
+  AOM_CICP_MC_ICTCP = 14,      /**< BT.2100 ICtCp */
+  AOM_CICP_MC_RESERVED_15 = 15 /**< For future use (values 15-255) */
+} aom_matrix_coefficients_t;
+
+/*!\brief List of supported color range */
+typedef enum aom_color_range {
+  AOM_CR_STUDIO_RANGE = 0, /**< Y [16..235], UV [16..240] */
+  AOM_CR_FULL_RANGE = 1    /**< YUV/RGB [0..255] */
+} aom_color_range_t; /**< alias for enum aom_color_range */
+
+/*!\brief List of chroma sample positions */
+typedef enum aom_chroma_sample_position {
+  AOM_CSP_UNKNOWN = 0,   /**< Unknown */
+  AOM_CSP_VERTICAL = 1,  /**< Horizontally co-located with luma(0, 0) sample,
+                              between two vertical samples */
+  AOM_CSP_COLOCATED = 2, /**< Co-located with luma(0, 0) sample */
+  AOM_CSP_RESERVED = 3   /**< Reserved value */
+} aom_chroma_sample_position_t; /**< alias for enum
+                                     aom_chroma_sample_position */
+
+/*!\brief List of insert flags for Metadata
+ *
+ * These flags control how the library treats metadata during encode.
+ *
+ * While encoding, when metadata is added to an aom_image via
+ * aom_img_add_metadata(), the flag passed along with the metadata will
+ * determine where the metadata OBU will be placed in the encoded OBU stream.
+ * Metadata will be emitted into the output stream within the next temporal
+ * unit if it satisfies the specified insertion flag.
+ *
+ * During decoding, when the library encounters a metadata OBU, it is always
+ * flagged as AOM_MIF_ANY_FRAME and emitted with the next output aom_image.
+ */
+typedef enum aom_metadata_insert_flags {
+  AOM_MIF_NON_KEY_FRAME = 0, /**< Adds metadata if it's not keyframe */
+  AOM_MIF_KEY_FRAME = 1,     /**< Adds metadata only if it's a keyframe */
+  AOM_MIF_ANY_FRAME = 2      /**< Adds metadata to any type of frame */
+} aom_metadata_insert_flags_t;
+
+/*!\brief Array of aom_metadata structs for an image. */
+typedef struct aom_metadata_array aom_metadata_array_t;
+
+/*!\brief Metadata payload.
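+ *
+ * An illustrative sketch of building and freeing a payload (the type value
+ * 4 corresponds to ITU-T T.35 metadata in the AV1 specification; the
+ * payload bytes here are placeholders):
+ * \code
+ *   const uint8_t payload[] = { 0xB5, 0x00, 0x3C };
+ *   aom_metadata_t *md =
+ *       aom_img_metadata_alloc(4, payload, sizeof(payload), AOM_MIF_ANY_FRAME);
+ *   if (md) aom_img_metadata_free(md);
+ * \endcode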
*/ +typedef struct aom_metadata { + uint32_t type; /**< Metadata type */ + uint8_t *payload; /**< Metadata payload data */ + size_t sz; /**< Metadata payload size */ + aom_metadata_insert_flags_t insert_flag; /**< Metadata insertion flag */ +} aom_metadata_t; + +/**\brief Image Descriptor */ +typedef struct aom_image { + aom_img_fmt_t fmt; /**< Image Format */ + aom_color_primaries_t cp; /**< CICP Color Primaries */ + aom_transfer_characteristics_t tc; /**< CICP Transfer Characteristics */ + aom_matrix_coefficients_t mc; /**< CICP Matrix Coefficients */ + int monochrome; /**< Whether image is monochrome */ + aom_chroma_sample_position_t csp; /**< chroma sample position */ + aom_color_range_t range; /**< Color Range */ + + /* Image storage dimensions */ + unsigned int w; /**< Stored image width */ + unsigned int h; /**< Stored image height */ + unsigned int bit_depth; /**< Stored image bit-depth */ + + /* Image display dimensions */ + unsigned int d_w; /**< Displayed image width */ + unsigned int d_h; /**< Displayed image height */ + + /* Image intended rendering dimensions */ + unsigned int r_w; /**< Intended rendering image width */ + unsigned int r_h; /**< Intended rendering image height */ + + /* Chroma subsampling info */ + unsigned int x_chroma_shift; /**< subsampling order, X */ + unsigned int y_chroma_shift; /**< subsampling order, Y */ + +/* Image data pointers. */ +#define AOM_PLANE_PACKED 0 /**< To be used for all packed formats */ +#define AOM_PLANE_Y 0 /**< Y (Luminance) plane */ +#define AOM_PLANE_U 1 /**< U (Chroma) plane */ +#define AOM_PLANE_V 2 /**< V (Chroma) plane */ + unsigned char *planes[3]; /**< pointer to the top left pixel for each plane */ + int stride[3]; /**< stride between rows for each plane */ + size_t sz; /**< data size */ + + int bps; /**< bits per sample (for packed formats) */ + + int temporal_id; /**< Temporal layer Id of image */ + int spatial_id; /**< Spatial layer Id of image */ + + /*!\brief The following member may be set by the application to associate + * data with this image. + */ + void *user_priv; + + /* The following members should be treated as private. */ + unsigned char *img_data; /**< private */ + int img_data_owner; /**< private */ + int self_allocd; /**< private */ + + aom_metadata_array_t + *metadata; /**< Metadata payloads associated with the image. */ + + void *fb_priv; /**< Frame buffer data associated with the image. */ +} aom_image_t; /**< alias for struct aom_image */ + +/*!\brief Open a descriptor, allocating storage for the underlying image + * + * Returns a descriptor for storing an image of the given format. The + * storage for the image is allocated on the heap. + * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of the image buffer and + * each row in the image (stride). + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align); + +/*!\brief Open a descriptor, using existing storage for the underlying image + * + * Returns a descriptor for storing an image of the given format. 
The + * storage for the image has been allocated elsewhere, and a descriptor is + * desired to "wrap" that storage. + * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of each row in the image + * (stride). + * \param[in] img_data Storage to use for the image + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, + unsigned int d_h, unsigned int align, + unsigned char *img_data); + +/*!\brief Open a descriptor, allocating storage for the underlying image with a + * border + * + * Returns a descriptor for storing an image of the given format and its + * borders. The storage for the image is allocated on the heap. + * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of the image buffer and + * each row in the image (stride). + * \param[in] size_align Alignment, in pixels, of the image width and height. + * \param[in] border A border that is padded on four sides of the image. + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align, + unsigned int size_align, + unsigned int border); + +/*!\brief Set the rectangle identifying the displayed portion of the image + * + * Updates the displayed rectangle (aka viewport) on the image surface to + * match the specified coordinates and size. + * + * \param[in] img Image descriptor + * \param[in] x leftmost column + * \param[in] y topmost row + * \param[in] w width + * \param[in] h height + * \param[in] border A border that is padded on four sides of the image. + * + * \return 0 if the requested rectangle is valid, nonzero otherwise. + */ +int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y, + unsigned int w, unsigned int h, unsigned int border); + +/*!\brief Flip the image vertically (top for bottom) + * + * Adjusts the image descriptor's pointers and strides to make the image + * be referenced upside-down. + * + * \param[in] img Image descriptor + */ +void aom_img_flip(aom_image_t *img); + +/*!\brief Close an image descriptor + * + * Frees all allocated storage associated with an image descriptor. + * + * \param[in] img Image descriptor + */ +void aom_img_free(aom_image_t *img); + +/*!\brief Get the width of a plane + * + * Get the width of a plane of an image + * + * \param[in] img Image descriptor + * \param[in] plane Plane index + */ +int aom_img_plane_width(const aom_image_t *img, int plane); + +/*!\brief Get the height of a plane + * + * Get the height of a plane of an image + * + * \param[in] img Image descriptor + * \param[in] plane Plane index + */ +int aom_img_plane_height(const aom_image_t *img, int plane); + +/*!\brief Add metadata to image. + * + * Adds metadata to aom_image_t. 
+ * Function makes a copy of the provided data parameter. + * Metadata insertion point is controlled by insert_flag. + * + * \param[in] img Image descriptor + * \param[in] type Metadata type + * \param[in] data Metadata contents + * \param[in] sz Metadata contents size + * \param[in] insert_flag Metadata insert flag + */ +int aom_img_add_metadata(aom_image_t *img, uint32_t type, const uint8_t *data, + size_t sz, aom_metadata_insert_flags_t insert_flag); + +/*!\brief Return a metadata payload stored within the image metadata array. + * + * Gets the metadata (aom_metadata_t) at the indicated index in the image + * metadata array. + * + * \param[in] img Pointer to image descriptor to get metadata from + * \param[in] index Metadata index to get from metadata array + * + * \return Returns a const pointer to the selected metadata, if img and/or index + * is invalid, it returns NULL. + */ +const aom_metadata_t *aom_img_get_metadata(const aom_image_t *img, + size_t index); + +/*!\brief Return the number of metadata blocks within the image. + * + * Gets the number of metadata blocks contained within the provided image + * metadata array. + * + * \param[in] img Pointer to image descriptor to get metadata number + * from. + * + * \return Returns the size of the metadata array. If img or metadata is NULL, + * it returns 0. + */ +size_t aom_img_num_metadata(const aom_image_t *img); + +/*!\brief Remove metadata from image. + * + * Removes all metadata in image metadata list and sets metadata list pointer + * to NULL. + * + * \param[in] img Image descriptor + */ +void aom_img_remove_metadata(aom_image_t *img); + +/*!\brief Allocate memory for aom_metadata struct. + * + * Allocates storage for the metadata payload, sets its type and copies the + * payload data into the aom_metadata struct. A metadata payload buffer of size + * sz is allocated and sz bytes are copied from data into the payload buffer. + * + * \param[in] type Metadata type + * \param[in] data Metadata data pointer + * \param[in] sz Metadata size + * \param[in] insert_flag Metadata insert flag + */ +aom_metadata_t *aom_img_metadata_alloc(uint32_t type, const uint8_t *data, + size_t sz, + aom_metadata_insert_flags_t insert_flag); + +/*!\brief Free metadata struct. + * + * Free metadata struct and its buffer. + * + * \param[in] metadata Metadata struct pointer + */ +void aom_img_metadata_free(aom_metadata_t *metadata); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOM_IMAGE_H_ diff --git a/libs/libaom/src/aom/aom_integer.h b/libs/libaom/src/aom/aom_integer.h new file mode 100644 index 000000000..113671e82 --- /dev/null +++ b/libs/libaom/src/aom/aom_integer.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#ifndef AOM_AOM_AOM_INTEGER_H_
+#define AOM_AOM_AOM_INTEGER_H_
+
+/* get ptrdiff_t, size_t, wchar_t, NULL */
+#include <stddef.h>
+
+#if defined(_MSC_VER)
+#define AOM_FORCE_INLINE __forceinline
+#define AOM_INLINE __inline
+#else
+#define AOM_FORCE_INLINE __inline__ __attribute__((always_inline))
+#define AOM_INLINE inline
+#endif
+
+#if defined(AOM_EMULATE_INTTYPES)
+typedef signed char int8_t;
+typedef signed short int16_t;
+typedef signed int int32_t;
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+
+#ifndef _UINTPTR_T_DEFINED
+typedef size_t uintptr_t;
+#endif
+
+#else
+
+/* Most platforms have the C99 standard integer types. */
+
+#if defined(__cplusplus)
+#if !defined(__STDC_FORMAT_MACROS)
+#define __STDC_FORMAT_MACROS
+#endif
+#if !defined(__STDC_LIMIT_MACROS)
+#define __STDC_LIMIT_MACROS
+#endif
+#endif  // __cplusplus
+
+#include <stdint.h>
+
+#endif
+
+/* VS2010 defines stdint.h, but not inttypes.h */
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#define PRId64 "I64d"
+#else
+#include <inttypes.h>
+#endif
+
+#if !defined(INT8_MAX)
+#define INT8_MAX 127
+#endif
+
+#if !defined(INT32_MAX)
+#define INT32_MAX 2147483647
+#endif
+
+#if !defined(INT32_MIN)
+#define INT32_MIN (-2147483647 - 1)
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif  // __cplusplus
+
+// Returns size of uint64_t when encoded using LEB128.
+size_t aom_uleb_size_in_bytes(uint64_t value);
+
+// Returns 0 on success, -1 on decode failure.
+// On success, 'value' stores the decoded LEB128 value and 'length' stores
+// the number of bytes decoded.
+int aom_uleb_decode(const uint8_t *buffer, size_t available, uint64_t *value,
+                    size_t *length);
+
+// Encodes LEB128 integer. Returns 0 when successful, and -1 upon failure.
+int aom_uleb_encode(uint64_t value, size_t available, uint8_t *coded_value,
+                    size_t *coded_size);
+
+// Encodes LEB128 integer to size specified. Returns 0 when successful, and -1
+// upon failure.
+// Note: This will write exactly pad_to_size bytes; if the value cannot be
+// encoded in this many bytes, then this will fail.
+int aom_uleb_encode_fixed_size(uint64_t value, size_t available,
+                               size_t pad_to_size, uint8_t *coded_value,
+                               size_t *coded_size);
+
+#if defined(__cplusplus)
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // AOM_AOM_AOM_INTEGER_H_
diff --git a/libs/libaom/src/aom/aomcx.h b/libs/libaom/src/aom/aomcx.h
new file mode 100644
index 000000000..051d33e7b
--- /dev/null
+++ b/libs/libaom/src/aom/aomcx.h
@@ -0,0 +1,1774 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_AOMCX_H_
+#define AOM_AOM_AOMCX_H_
+
+/*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder
+ * \ingroup aom
+ *
+ * @{
+ */
+#include "aom/aom.h"
+#include "aom/aom_encoder.h"
+
+/*!\file
+ * \brief Provides definitions for using the AOM or AV1 encoder algorithm
+ * within the aom Codec Interface.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\name Algorithm interface for AV1
+ *
+ * This interface provides the capability to encode raw AV1 streams.
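+ *
+ * A minimal sketch of obtaining the interface and opening an encoder (cfg
+ * is assumed to be a populated aom_codec_enc_cfg_t):
+ * \code
+ *   aom_codec_iface_t *iface = aom_codec_av1_cx();
+ *   aom_codec_ctx_t ctx;
+ *   aom_codec_err_t res = aom_codec_enc_init(&ctx, iface, &cfg, 0);
+ * \endcode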
+ * @{ + */ +extern aom_codec_iface_t aom_codec_av1_cx_algo; +extern aom_codec_iface_t *aom_codec_av1_cx(void); +/*!@} - end algorithm interface member group*/ + +/* + * Algorithm Flags + */ + +/*!\brief Don't reference the last frame + * + * When this flag is set, the encoder will not use the last frame as a + * predictor. When not set, the encoder will choose whether to use the + * last frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_LAST (1 << 16) +/*!\brief Don't reference the last2 frame + * + * When this flag is set, the encoder will not use the last2 frame as a + * predictor. When not set, the encoder will choose whether to use the + * last2 frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_LAST2 (1 << 17) +/*!\brief Don't reference the last3 frame + * + * When this flag is set, the encoder will not use the last3 frame as a + * predictor. When not set, the encoder will choose whether to use the + * last3 frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_LAST3 (1 << 18) +/*!\brief Don't reference the golden frame + * + * When this flag is set, the encoder will not use the golden frame as a + * predictor. When not set, the encoder will choose whether to use the + * golden frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_GF (1 << 19) + +/*!\brief Don't reference the alternate reference frame + * + * When this flag is set, the encoder will not use the alt ref frame as a + * predictor. When not set, the encoder will choose whether to use the + * alt ref frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_ARF (1 << 20) +/*!\brief Don't reference the bwd reference frame + * + * When this flag is set, the encoder will not use the bwd ref frame as a + * predictor. When not set, the encoder will choose whether to use the + * bwd ref frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_BWD (1 << 21) +/*!\brief Don't reference the alt2 reference frame + * + * When this flag is set, the encoder will not use the alt2 ref frame as a + * predictor. When not set, the encoder will choose whether to use the + * alt2 ref frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_ARF2 (1 << 22) + +/*!\brief Don't update the last frame + * + * When this flag is set, the encoder will not update the last frame with + * the contents of the current frame. + */ +#define AOM_EFLAG_NO_UPD_LAST (1 << 23) + +/*!\brief Don't update the golden frame + * + * When this flag is set, the encoder will not update the golden frame with + * the contents of the current frame. + */ +#define AOM_EFLAG_NO_UPD_GF (1 << 24) + +/*!\brief Don't update the alternate reference frame + * + * When this flag is set, the encoder will not update the alt ref frame with + * the contents of the current frame. + */ +#define AOM_EFLAG_NO_UPD_ARF (1 << 25) +/*!\brief Disable entropy update + * + * When this flag is set, the encoder will not update its internal entropy + * model based on the entropy of this frame. + */ +#define AOM_EFLAG_NO_UPD_ENTROPY (1 << 26) +/*!\brief Disable ref frame mvs + * + * When this flag is set, the encoder will not allow frames to + * be encoded using mfmv. + */ +#define AOM_EFLAG_NO_REF_FRAME_MVS (1 << 27) +/*!\brief Enable error resilient frame + * + * When this flag is set, the encoder will code frames as error + * resilient. + */ +#define AOM_EFLAG_ERROR_RESILIENT (1 << 28) +/*!\brief Enable s frame mode + * + * When this flag is set, the encoder will code frames as an + * s frame. 
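+ *
+ * Per-frame flags such as this one are OR-ed together and passed in the
+ * flags argument of aom_codec_encode(). A non-normative sketch (assumes an
+ * initialized encoder ctx, an input image img, and a timestamp pts):
+ * \code
+ * aom_codec_encode(&ctx, img, pts, 1,  // duration of 1 timebase unit
+ *                  AOM_EFLAG_SET_S_FRAME | AOM_EFLAG_ERROR_RESILIENT);
+ * \endcode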
+ */
+#define AOM_EFLAG_SET_S_FRAME (1 << 29)
+/*!\brief Force primary_ref_frame to PRIMARY_REF_NONE
+ *
+ * When this flag is set, the encoder will set a frame's primary_ref_frame
+ * to PRIMARY_REF_NONE
+ */
+#define AOM_EFLAG_SET_PRIMARY_REF_NONE (1 << 30)
+
+/*!\brief AVx encoder control functions
+ *
+ * This set of macros defines the control functions available for the AVx
+ * encoder interface.
+ *
+ * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...)
+ */
+enum aome_enc_control_id {
+ /*!\brief Codec control function to set which reference frame the encoder
+ * can use, int parameter.
+ */
+ AOME_USE_REFERENCE = 7,
+
+ /*!\brief Codec control function to pass an ROI map to encoder, aom_roi_map_t*
+ * parameter.
+ */
+ AOME_SET_ROI_MAP = 8,
+
+ /*!\brief Codec control function to pass an active map to encoder,
+ * aom_active_map_t* parameter.
+ */
+ AOME_SET_ACTIVEMAP = 9,
+
+ /* NOTE: enum 10 unused */
+
+ /*!\brief Codec control function to set encoder scaling mode,
+ * aom_scaling_mode_t* parameter.
+ */
+ AOME_SET_SCALEMODE = 11,
+
+ /*!\brief Codec control function to set encoder spatial layer id, unsigned int
+ * parameter.
+ */
+ AOME_SET_SPATIAL_LAYER_ID = 12,
+
+ /*!\brief Codec control function to set encoder internal speed settings,
+ * int parameter
+ *
+ * Changes to this value influence the complexity of algorithms used in the
+ * encoding process; values greater than 0 increase encoder speed at the
+ * expense of quality.
+ *
+ * Valid range: 0..8. 0 runs the slowest, and 8 runs the fastest;
+ * quality improves as speed decreases (since more compression
+ * possibilities are explored).
+ */
+ AOME_SET_CPUUSED = 13,
+
+ /*!\brief Codec control function to enable automatic creation and use of
+ * alt-ref frames, unsigned int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AOME_SET_ENABLEAUTOALTREF = 14,
+
+ /* NOTE: enum 15 unused */
+
+ /*!\brief Codec control function to set sharpness, unsigned int parameter.
+ */
+ AOME_SET_SHARPNESS = AOME_SET_ENABLEAUTOALTREF + 2, // 16
+
+ /*!\brief Codec control function to set the threshold for MBs treated static,
+ * unsigned int parameter
+ */
+ AOME_SET_STATIC_THRESHOLD = 17,
+
+ /* NOTE: enum 18 unused */
+
+ /*!\brief Codec control function to get last quantizer chosen by the encoder,
+ * int* parameter
+ *
+ * Return value uses internal quantizer scale defined by the codec.
+ */
+ AOME_GET_LAST_QUANTIZER = AOME_SET_STATIC_THRESHOLD + 2, // 19
+
+ /*!\brief Codec control function to get last quantizer chosen by the encoder,
+ * int* parameter
+ *
+ * Return value uses the 0..63 scale as used by the rc_*_quantizer config
+ * parameters.
+ */
+ AOME_GET_LAST_QUANTIZER_64 = 20,
+
+ /*!\brief Codec control function to set the maximum number of frames used to
+ * create an arf, unsigned int parameter
+ */
+ AOME_SET_ARNR_MAXFRAMES = 21,
+
+ /*!\brief Codec control function to set the filter strength for the arf,
+ * unsigned int parameter
+ */
+ AOME_SET_ARNR_STRENGTH = 22,
+
+ /* NOTE: enum 23 unused */
+
+ /*!\brief Codec control function to set visual tuning, aom_tune_metric (int)
+ * parameter
+ */
+ AOME_SET_TUNING = AOME_SET_ARNR_STRENGTH + 2, // 24
+
+ /*!\brief Codec control function to set constrained / constant quality level,
+ * unsigned int parameter
+ *
+ * Valid range: 0..63
+ *
+ * \attention For this value to be used aom_codec_enc_cfg_t::rc_end_usage
+ * must be set to #AOM_CQ or #AOM_Q.
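+ *
+ * For example (a non-normative sketch; cfg and ctx are set up as in
+ * aom_encoder.h, and 30 is a hypothetical level):
+ * \code
+ * cfg.rc_end_usage = AOM_Q;  // constant-quality mode
+ * aom_codec_enc_init(&ctx, aom_codec_av1_cx(), &cfg, 0);
+ * aom_codec_control(&ctx, AOME_SET_CQ_LEVEL, 30);
+ * \endcode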
+ */ + AOME_SET_CQ_LEVEL = 25, + + /*!\brief Codec control function to set max data rate for intra frames, + * unsigned int parameter + * + * This value controls additional clamping on the maximum size of a + * keyframe. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * unlimited, or no additional clamping beyond the codec's built-in + * algorithm. + * + * For example, to allocate no more than 4.5 frames worth of bitrate + * to a keyframe, set this to 450. + */ + AOME_SET_MAX_INTRA_BITRATE_PCT = 26, + + /*!\brief Codec control function to set number of spatial layers, int + * parameter + */ + AOME_SET_NUMBER_SPATIAL_LAYERS = 27, + + /*!\brief Codec control function to set max data rate for inter frames, + * unsigned int parameter + * + * This value controls additional clamping on the maximum size of an + * inter frame. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * unlimited, or no additional clamping beyond the codec's built-in + * algorithm. + * + * For example, to allow no more than 4.5 frames worth of bitrate + * to an inter frame, set this to 450. + */ + AV1E_SET_MAX_INTER_BITRATE_PCT = AOME_SET_MAX_INTRA_BITRATE_PCT + 2, // 28 + + /*!\brief Boost percentage for Golden Frame in CBR mode, unsigned int + * parameter + * + * This value controls the amount of boost given to Golden Frame in + * CBR mode. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * the feature is off, i.e., no golden frame boost in CBR mode and + * average bitrate target is used. + * + * For example, to allow 100% more bits, i.e, 2X, in a golden frame + * than average frame, set this to 100. + */ + AV1E_SET_GF_CBR_BOOST_PCT = 29, + + /* NOTE: enum 30 unused */ + + /*!\brief Codec control function to set lossless encoding mode, unsigned int + * parameter + * + * AV1 can operate in lossless encoding mode, in which the bitstream + * produced will be able to decode and reconstruct a perfect copy of + * input source. + * + * - 0 = normal coding mode, may be lossy (default) + * - 1 = lossless coding mode + */ + AV1E_SET_LOSSLESS = AV1E_SET_GF_CBR_BOOST_PCT + 2, // 31 + + /*!\brief Codec control function to enable the row based multi-threading + * of the encoder, unsigned int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ROW_MT = 32, + + /*!\brief Codec control function to set number of tile columns. unsigned int + * parameter + * + * In encoding and decoding, AV1 allows an input image frame be partitioned + * into separate vertical tile columns, which can be encoded or decoded + * independently. This enables easy implementation of parallel encoding and + * decoding. The parameter for this control describes the number of tile + * columns (in log2 units), which has a valid range of [0, 6]: + * \verbatim + 0 = 1 tile column + 1 = 2 tile columns + 2 = 4 tile columns + ..... + n = 2**n tile columns + \endverbatim + * By default, the value is 0, i.e. one single column tile for entire image. + */ + AV1E_SET_TILE_COLUMNS = 33, + + /*!\brief Codec control function to set number of tile rows, unsigned int + * parameter + * + * In encoding and decoding, AV1 allows an input image frame be partitioned + * into separate horizontal tile rows, which can be encoded or decoded + * independently. 
The parameter for this control describes the number of tile
+ * rows (in log2 units), which has a valid range of [0, 6]:
+ * \verbatim
+ 0 = 1 tile row
+ 1 = 2 tile rows
+ 2 = 4 tile rows
+ .....
+ n = 2**n tile rows
+ \endverbatim
+ * By default, the value is 0, i.e. a single tile row for the entire image.
+ */
+ AV1E_SET_TILE_ROWS = 34,
+
+ /*!\brief Codec control function to enable RDO modulated by frame temporal
+ * dependency, unsigned int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_TPL_MODEL = 35,
+
+ /*!\brief Codec control function to enable temporal filtering on key frame,
+ * unsigned int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_KEYFRAME_FILTERING = 36,
+
+ /*!\brief Codec control function to enable frame parallel decoding feature,
+ * unsigned int parameter
+ *
+ * AV1 has a bitstream feature to reduce decoding dependency between frames
+ * by turning off backward update of probability context used in encoding
+ * and decoding. This allows staged parallel processing of more than one
+ * video frame in the decoder. This control function provides a means to
+ * turn this feature on or off for bitstreams produced by the encoder.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_FRAME_PARALLEL_DECODING = 37,
+
+ /*!\brief Codec control function to enable error_resilient_mode, int parameter
+ *
+ * AV1 has a bitstream feature to guarantee parseability of a frame
+ * by turning on the error_resilient_decoding mode, even though the
+ * reference buffers are unreliable or not received.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_ERROR_RESILIENT_MODE = 38,
+
+ /*!\brief Codec control function to enable s_frame_mode, int parameter
+ *
+ * AV1 has a bitstream feature to designate certain frames as S-frames,
+ * from where we can switch to a different stream,
+ * even though the reference buffers may not be exactly identical.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_S_FRAME_MODE = 39,
+
+ /*!\brief Codec control function to set adaptive quantization mode, unsigned
+ * int parameter
+ *
+ * AV1 has a segment-based feature that allows the encoder to adaptively
+ * change the quantization parameter for each segment within a frame to
+ * improve the subjective quality. This control makes the encoder operate in
+ * one of the several supported AQ modes.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_AQ_MODE = 40,
+
+ /*!\brief Codec control function to enable/disable periodic Q boost, unsigned
+ * int parameter
+ *
+ * One AV1 encoder speed feature is to enable quality boost by lowering
+ * frame level Q periodically. This control function provides a means to
+ * turn on/off this feature.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_FRAME_PERIODIC_BOOST = 41,
+
+ /*!\brief Codec control function to set noise sensitivity, unsigned int
+ * parameter
+ *
+ * - 0 = disable (default)
+ * - 1 = enable (Y only)
+ */
+ AV1E_SET_NOISE_SENSITIVITY = 42,
+
+ /*!\brief Codec control function to set content type, aom_tune_content
+ * parameter
+ *
+ * - AOM_CONTENT_DEFAULT = Regular video content (default)
+ * - AOM_CONTENT_SCREEN = Screen capture content
+ */
+ AV1E_SET_TUNE_CONTENT = 43,
+
+ /*!\brief Codec control function to set CDF update mode, unsigned int
+ * parameter
+ *
+ * - 0: no update
+ * - 1: update on every frame (default)
+ * - 2: selectively update
+ */
+ AV1E_SET_CDF_UPDATE_MODE = 44,
+
+ /*!\brief Codec control function to set color space info, int parameter
+ *
+ * - 0 = For future use
+ * - 1 = BT.709
+ * - 2 = Unspecified (default)
+ * - 3 = For future use
+ * - 4 = BT.470 System M (historical)
+ * - 5 = BT.470 System B, G (historical)
+ * - 6 = BT.601
+ * - 7 = SMPTE 240
+ * - 8 = Generic film (color filters using illuminant C)
+ * - 9 = BT.2020, BT.2100
+ * - 10 = SMPTE 428 (CIE 1931 XYZ)
+ * - 11 = SMPTE RP 431-2
+ * - 12 = SMPTE EG 432-1
+ * - 13..21 = For future use
+ * - 22 = EBU Tech. 3213-E
+ * - 23 = For future use
+ */
+ AV1E_SET_COLOR_PRIMARIES = 45,
+
+ /*!\brief Codec control function to set transfer function info, int parameter
+ *
+ * - 0 = For future use
+ * - 1 = BT.709
+ * - 2 = Unspecified (default)
+ * - 3 = For future use
+ * - 4 = BT.470 System M (historical)
+ * - 5 = BT.470 System B, G (historical)
+ * - 6 = BT.601
+ * - 7 = SMPTE 240 M
+ * - 8 = Linear
+ * - 9 = Logarithmic (100 : 1 range)
+ * - 10 = Logarithmic (100 * Sqrt(10) : 1 range)
+ * - 11 = IEC 61966-2-4
+ * - 12 = BT.1361
+ * - 13 = sRGB or sYCC
+ * - 14 = BT.2020 10-bit systems
+ * - 15 = BT.2020 12-bit systems
+ * - 16 = SMPTE ST 2084, ITU BT.2100 PQ
+ * - 17 = SMPTE ST 428
+ * - 18 = BT.2100 HLG, ARIB STD-B67
+ * - 19 = For future use
+ */
+ AV1E_SET_TRANSFER_CHARACTERISTICS = 46,
+
+ /*!\brief Codec control function to set matrix coefficients info, int
+ * parameter
+ *
+ * - 0 = Identity matrix
+ * - 1 = BT.709
+ * - 2 = Unspecified (default)
+ * - 3 = For future use
+ * - 4 = US FCC 73.628
+ * - 5 = BT.470 System B, G (historical)
+ * - 6 = BT.601
+ * - 7 = SMPTE 240 M
+ * - 8 = YCgCo
+ * - 9 = BT.2020 non-constant luminance, BT.2100 YCbCr
+ * - 10 = BT.2020 constant luminance
+ * - 11 = SMPTE ST 2085 YDzDx
+ * - 12 = Chromaticity-derived non-constant luminance
+ * - 13 = Chromaticity-derived constant luminance
+ * - 14 = BT.2100 ICtCp
+ * - 15 = For future use
+ */
+ AV1E_SET_MATRIX_COEFFICIENTS = 47,
+
+ /*!\brief Codec control function to set chroma 4:2:0 sample position info,
+ * aom_chroma_sample_position_t parameter
+ *
+ * AOM_CSP_UNKNOWN is default
+ */
+ AV1E_SET_CHROMA_SAMPLE_POSITION = 48,
+
+ /*!\brief Codec control function to set minimum interval between GF/ARF
+ * frames, unsigned int parameter
+ *
+ * By default the value is set as 4.
+ */
+ AV1E_SET_MIN_GF_INTERVAL = 49,
+
+ /*!\brief Codec control function to set maximum interval between GF/ARF
+ * frames, unsigned int parameter
+ *
+ * By default the value is set as 16.
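+ *
+ * For example (a non-normative sketch, assuming an initialized encoder
+ * ctx), the GF/ARF interval can be bounded from both sides:
+ * \code
+ * aom_codec_control(&ctx, AV1E_SET_MIN_GF_INTERVAL, 4);
+ * aom_codec_control(&ctx, AV1E_SET_MAX_GF_INTERVAL, 16);
+ * \endcode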
+ */ + AV1E_SET_MAX_GF_INTERVAL = 50, + + /*!\brief Codec control function to get an active map back from the encoder, + aom_active_map_t* parameter + */ + AV1E_GET_ACTIVEMAP = 51, + + /*!\brief Codec control function to set color range bit, int parameter + * + * - 0 = Limited range, 16..235 or HBD equivalent (default) + * - 1 = Full range, 0..255 or HBD equivalent + */ + AV1E_SET_COLOR_RANGE = 52, + + /*!\brief Codec control function to set intended rendering image size, + * int32_t[2] parameter + * + * By default, this is identical to the image size in pixels. + */ + AV1E_SET_RENDER_SIZE = 53, + + /*!\brief Control to set target sequence level index for a certain operating + * point(OP), int parameter + * Possible values are in the form of "ABxy"(pad leading zeros if less than + * 4 digits). + * - AB: OP index. + * - xy: Target level index for the OP. Can be values 0~23(corresponding to + * level 2.0 ~ 7.3) or 24(keep level stats only for level monitoring) or + * 31(maximum level parameter, no level-based constraints). + * + * E.g.: + * - "0" means target level index 0 for the 0th OP; + * - "111" means target level index 11 for the 1st OP; + * - "1021" means target level index 21 for the 10th OP. + * + * If the target level is not specified for an OP, the maximum level parameter + * of 31 is used as default. + */ + AV1E_SET_TARGET_SEQ_LEVEL_IDX = 54, + + /*!\brief Codec control function to get sequence level index for each + * operating point. int* parameter. There can be at most 32 operating points. + * The results will be written into a provided integer array of sufficient + * size. + */ + AV1E_GET_SEQ_LEVEL_IDX = 55, + + /*!\brief Codec control function to set intended superblock size, unsigned int + * parameter + * + * By default, the superblock size is determined separately for each + * frame by the encoder. + */ + AV1E_SET_SUPERBLOCK_SIZE = 56, + + /*!\brief Codec control function to enable automatic set and use of + * bwd-pred frames, unsigned int parameter + * + * - 0 = disable (default) + * - 1 = enable + */ + AOME_SET_ENABLEAUTOBWDREF = 57, + + /*!\brief Codec control function to encode with CDEF, unsigned int parameter + * + * CDEF is the constrained directional enhancement filter which is an + * in-loop filter aiming to remove coding artifacts + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_CDEF = 58, + + /*!\brief Codec control function to encode with Loop Restoration Filter, + * unsigned int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_RESTORATION = 59, + + /*!\brief Codec control function to force video mode, unsigned int parameter + * + * - 0 = do not force video mode (default) + * - 1 = force video mode even for a single frame + */ + AV1E_SET_FORCE_VIDEO_MODE = 60, + + /*!\brief Codec control function to predict with OBMC mode, unsigned int + * parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_OBMC = 61, + + /*!\brief Codec control function to encode without trellis quantization, + * unsigned int parameter + * + * - 0 = apply trellis quantization (default) + * - 1 = do not apply trellis quantization + * - 2 = disable trellis quantization in rd search + * - 3 = disable trellis quantization in estimate yrd + */ + AV1E_SET_DISABLE_TRELLIS_QUANT = 62, + + /*!\brief Codec control function to encode with quantisation matrices, + * unsigned int parameter + * + * AOM can operate with default quantisation matrices dependent on + * quantisation level and block type. 
+ * + * - 0 = disable (default) + * - 1 = enable + */ + AV1E_SET_ENABLE_QM = 63, + + /*!\brief Codec control function to set the min quant matrix flatness, + * unsigned int parameter + * + * AOM can operate with different ranges of quantisation matrices. + * As quantisation levels increase, the matrices get flatter. This + * control sets the minimum level of flatness from which the matrices + * are determined. + * + * By default, the encoder sets this minimum at half the available + * range. + */ + AV1E_SET_QM_MIN = 64, + + /*!\brief Codec control function to set the max quant matrix flatness, + * unsigned int parameter + * + * AOM can operate with different ranges of quantisation matrices. + * As quantisation levels increase, the matrices get flatter. This + * control sets the maximum level of flatness possible. + * + * By default, the encoder sets this maximum at the top of the + * available range. + */ + AV1E_SET_QM_MAX = 65, + + /*!\brief Codec control function to set the min quant matrix flatness, + * unsigned int parameter + * + * AOM can operate with different ranges of quantisation matrices. + * As quantisation levels increase, the matrices get flatter. This + * control sets the flatness for luma (Y). + * + * By default, the encoder sets this minimum at half the available + * range. + */ + AV1E_SET_QM_Y = 66, + + /*!\brief Codec control function to set the min quant matrix flatness, + * unsigned int parameter + * + * AOM can operate with different ranges of quantisation matrices. + * As quantisation levels increase, the matrices get flatter. This + * control sets the flatness for chroma (U). + * + * By default, the encoder sets this minimum at half the available + * range. + */ + AV1E_SET_QM_U = 67, + + /*!\brief Codec control function to set the min quant matrix flatness, + * unsigned int parameter + * + * AOM can operate with different ranges of quantisation matrices. + * As quantisation levels increase, the matrices get flatter. This + * control sets the flatness for chrome (V). + * + * By default, the encoder sets this minimum at half the available + * range. + */ + AV1E_SET_QM_V = 68, + + /* NOTE: enum 69 unused */ + + /*!\brief Codec control function to set a maximum number of tile groups, + * unsigned int parameter + * + * This will set the maximum number of tile groups. This will be + * overridden if an MTU size is set. The default value is 1. + */ + AV1E_SET_NUM_TG = 70, + + /*!\brief Codec control function to set an MTU size for a tile group, unsigned + * int parameter + * + * This will set the maximum number of bytes in a tile group. This can be + * exceeded only if a single tile is larger than this amount. + * + * By default, the value is 0, in which case a fixed number of tile groups + * is used. + */ + AV1E_SET_MTU = 71, + + /* NOTE: enum 72 unused */ + + /*!\brief Codec control function to enable/disable rectangular partitions, int + * parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_RECT_PARTITIONS = 73, + + /*!\brief Codec control function to enable/disable AB partitions, int + * parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_AB_PARTITIONS = 74, + + /*!\brief Codec control function to enable/disable 1:4 and 4:1 partitions, int + * parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_1TO4_PARTITIONS = 75, + + /*!\brief Codec control function to set min partition size, int parameter + * + * min_partition_size is applied to both width and height of the partition. 
+ * i.e, both width and height of a partition can not be smaller than + * the min_partition_size, except the partition at the picture boundary. + * + * Valid values: [4, 8, 16, 32, 64, 128]. The default value is 4 for + * 4x4. + */ + AV1E_SET_MIN_PARTITION_SIZE = 76, + + /*!\brief Codec control function to set max partition size, int parameter + * + * max_partition_size is applied to both width and height of the partition. + * i.e, both width and height of a partition can not be larger than + * the max_partition_size. + * + * Valid values:[4, 8, 16, 32, 64, 128] The default value is 128 for + * 128x128. + */ + AV1E_SET_MAX_PARTITION_SIZE = 77, + + /*!\brief Codec control function to turn on / off intra edge filter + * at sequence level, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_INTRA_EDGE_FILTER = 78, + + /*!\brief Codec control function to turn on / off frame order hint (int + * parameter). Affects: joint compound mode, motion field motion vector, + * ref frame sign bias + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_ORDER_HINT = 79, + + /*!\brief Codec control function to turn on / off 64-length transforms, int + * parameter + * + * This will enable or disable usage of length 64 transforms in any + * direction. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_TX64 = 80, + + /*!\brief Codec control function to turn on / off flip and identity + * transforms, int parameter + * + * This will enable or disable usage of flip and identity transform + * types in any direction. If enabled, this includes: + * - FLIPADST_DCT + * - DCT_FLIPADST + * - FLIPADST_FLIPADST + * - ADST_FLIPADST + * - FLIPADST_ADST + * - IDTX + * - V_DCT + * - H_DCT + * - V_ADST + * - H_ADST + * - V_FLIPADST + * - H_FLIPADST + * + * Valid values: + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_FLIP_IDTX = 81, + + /* Note: enum value 82 unused */ + + /*!\brief Codec control function to turn on / off dist-wtd compound mode + * at sequence level, int parameter + * + * This will enable or disable distance-weighted compound mode. + * \attention If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced + * to 0. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_DIST_WTD_COMP = 83, + + /*!\brief Codec control function to turn on / off ref frame mvs (mfmv) usage + * at sequence level, int parameter + * + * \attention If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced + * to 0. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_REF_FRAME_MVS = 84, + + /*!\brief Codec control function to set temporal mv prediction + * enabling/disabling at frame level, int parameter + * + * \attention If AV1E_SET_ENABLE_REF_FRAME_MVS is 0, then this flag is + * forced to 0. 
+ * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ALLOW_REF_FRAME_MVS = 85, + + /*!\brief Codec control function to turn on / off dual interpolation filter + * for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable + */ + AV1E_SET_ENABLE_DUAL_FILTER = 86, + + /*!\brief Codec control function to turn on / off delta quantization in chroma + * planes usage for a sequence, int parameter + * + * - 0 = disable (default) + * - 1 = enable + */ + AV1E_SET_ENABLE_CHROMA_DELTAQ = 87, + + /*!\brief Codec control function to turn on / off masked compound usage + * (wedge and diff-wtd compound modes) for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_MASKED_COMP = 88, + + /*!\brief Codec control function to turn on / off one sided compound usage + * for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_ONESIDED_COMP = 89, + + /*!\brief Codec control function to turn on / off interintra compound + * for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_INTERINTRA_COMP = 90, + + /*!\brief Codec control function to turn on / off smooth inter-intra + * mode for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_SMOOTH_INTERINTRA = 91, + + /*!\brief Codec control function to turn on / off difference weighted + * compound, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_DIFF_WTD_COMP = 92, + + /*!\brief Codec control function to turn on / off interinter wedge + * compound, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_INTERINTER_WEDGE = 93, + + /*!\brief Codec control function to turn on / off interintra wedge + * compound, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_INTERINTRA_WEDGE = 94, + + /*!\brief Codec control function to turn on / off global motion usage + * for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_GLOBAL_MOTION = 95, + + /*!\brief Codec control function to turn on / off warped motion usage + * at sequence level, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_WARPED_MOTION = 96, + + /*!\brief Codec control function to turn on / off warped motion usage + * at frame level, int parameter + * + * \attention If AV1E_SET_ENABLE_WARPED_MOTION is 0, then this flag is + * forced to 0. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ALLOW_WARPED_MOTION = 97, + + /*!\brief Codec control function to turn on / off filter intra usage at + * sequence level, int parameter + * + * \attention If AV1E_SET_ENABLE_FILTER_INTRA is 0, then this flag is + * forced to 0. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_FILTER_INTRA = 98, + + /*!\brief Codec control function to turn on / off smooth intra modes usage, + * int parameter + * + * This will enable or disable usage of smooth, smooth_h and smooth_v intra + * modes. 
+ * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_SMOOTH_INTRA = 99, + + /*!\brief Codec control function to turn on / off Paeth intra mode usage, int + * parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_PAETH_INTRA = 100, + + /*!\brief Codec control function to turn on / off CFL uv intra mode usage, int + * parameter + * + * This will enable or disable usage of chroma-from-luma intra mode. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_CFL_INTRA = 101, + + /*!\brief Codec control function to turn on / off frame superresolution, int + * parameter + * + * \attention If AV1E_SET_ENABLE_SUPERRES is 0, then this flag is forced to 0. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_SUPERRES = 102, + + /*!\brief Codec control function to turn on / off overlay frames for + * filtered ALTREF frames, int parameter + * + * This will enable or disable coding of overlay frames for filtered ALTREF + * frames. When set to 0, overlay frames are not used but show existing frame + * is used to display the filtered ALTREF frame as is. As a result the decoded + * frame rate remains the same as the display frame rate. The default is 1. + */ + AV1E_SET_ENABLE_OVERLAY = 103, + + /*!\brief Codec control function to turn on/off palette mode, int parameter */ + AV1E_SET_ENABLE_PALETTE = 104, + + /*!\brief Codec control function to turn on/off intra block copy mode, int + parameter */ + AV1E_SET_ENABLE_INTRABC = 105, + + /*!\brief Codec control function to turn on/off intra angle delta, int + parameter */ + AV1E_SET_ENABLE_ANGLE_DELTA = 106, + + /*!\brief Codec control function to set the delta q mode, unsigned int + * parameter + * + * AV1 supports a delta q mode feature, that allows modulating q per + * superblock. + * + * - 0 = deltaq signaling off + * - 1 = use modulation to maximize objective quality (default) + * - 2 = use modulation to maximize perceptual quality + */ + AV1E_SET_DELTAQ_MODE = 107, + + /*!\brief Codec control function to turn on/off loopfilter modulation + * when delta q modulation is enabled, unsigned int parameter. + * + * \attention AV1 only supports loopfilter modulation when delta q + * modulation is enabled as well. + */ + AV1E_SET_DELTALF_MODE = 108, + + /*!\brief Codec control function to set the single tile decoding mode, + * unsigned int parameter + * + * \attention Only applicable if large scale tiling is on. + * + * - 0 = single tile decoding is off + * - 1 = single tile decoding is on (default) + */ + AV1E_SET_SINGLE_TILE_DECODING = 109, + + /*!\brief Codec control function to enable the extreme motion vector unit + * test, unsigned int parameter + * + * - 0 = off + * - 1 = MAX_EXTREME_MV + * - 2 = MIN_EXTREME_MV + * + * \note This is only used in motion vector unit test. + */ + AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST = 110, + + /*!\brief Codec control function to signal picture timing info in the + * bitstream, aom_timing_info_type_t parameter. Default is + * AOM_TIMING_UNSPECIFIED. 
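+ *
+ * For example (a non-normative sketch, assuming an initialized encoder
+ * ctx):
+ * \code
+ * aom_codec_control(&ctx, AV1E_SET_TIMING_INFO_TYPE, AOM_TIMING_EQUAL);
+ * \endcode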
+ */ + AV1E_SET_TIMING_INFO_TYPE = 111, + + /*!\brief Codec control function to add film grain parameters (one of several + * preset types) info in the bitstream, int parameter + * + Valid range: 0..16, 0 is unknown, 1..16 are test vectors + */ + AV1E_SET_FILM_GRAIN_TEST_VECTOR = 112, + + /*!\brief Codec control function to set the path to the film grain parameters, + * const char* parameter + */ + AV1E_SET_FILM_GRAIN_TABLE = 113, + + /*!\brief Sets the noise level, int parameter */ + AV1E_SET_DENOISE_NOISE_LEVEL = 114, + + /*!\brief Sets the denoisers block size, unsigned int parameter */ + AV1E_SET_DENOISE_BLOCK_SIZE = 115, + + /*!\brief Sets the chroma subsampling x value, unsigned int parameter */ + AV1E_SET_CHROMA_SUBSAMPLING_X = 116, + + /*!\brief Sets the chroma subsampling y value, unsigned int parameter */ + AV1E_SET_CHROMA_SUBSAMPLING_Y = 117, + + /*!\brief Control to use a reduced tx type set, int parameter */ + AV1E_SET_REDUCED_TX_TYPE_SET = 118, + + /*!\brief Control to use dct only for intra modes, int parameter */ + AV1E_SET_INTRA_DCT_ONLY = 119, + + /*!\brief Control to use dct only for inter modes, int parameter */ + AV1E_SET_INTER_DCT_ONLY = 120, + + /*!\brief Control to use default tx type only for intra modes, int parameter + */ + AV1E_SET_INTRA_DEFAULT_TX_ONLY = 121, + + /*!\brief Control to use adaptive quantize_b, int parameter */ + AV1E_SET_QUANT_B_ADAPT = 122, + + /*!\brief Control to select maximum height for the GF group pyramid structure, + * unsigned int parameter + * + * Valid range: 0..4 + */ + AV1E_SET_GF_MAX_PYRAMID_HEIGHT = 123, + + /*!\brief Control to select maximum reference frames allowed per frame, int + * parameter + * + * Valid range: 3..7 + */ + AV1E_SET_MAX_REFERENCE_FRAMES = 124, + + /*!\brief Control to use reduced set of single and compound references, int + parameter */ + AV1E_SET_REDUCED_REFERENCE_SET = 125, + + /* NOTE: enums 126-139 unused */ + /* NOTE: Need a gap in enum values to avoud conflict with 128, 129, 130 */ + + /*!\brief Control to set frequency of the cost updates for coefficients, + * unsigned int parameter + * + * - 0 = update at SB level (default) + * - 1 = update at SB row level in tile + * - 2 = update at tile level + * - 3 = turn off + */ + AV1E_SET_COEFF_COST_UPD_FREQ = 140, + + /*!\brief Control to set frequency of the cost updates for mode, unsigned int + * parameter + * + * - 0 = update at SB level (default) + * - 1 = update at SB row level in tile + * - 2 = update at tile level + * - 3 = turn off + */ + AV1E_SET_MODE_COST_UPD_FREQ = 141, + + /*!\brief Control to set frequency of the cost updates for motion vectors, + * unsigned int parameter + * + * - 0 = update at SB level (default) + * - 1 = update at SB row level in tile + * - 2 = update at tile level + * - 3 = turn off + */ + AV1E_SET_MV_COST_UPD_FREQ = 142, + + /*!\brief Control to set bit mask that specifies which tier each of the 32 + * possible operating points conforms to, unsigned int parameter + * + * - 0 = main tier (default) + * - 1 = high tier + */ + AV1E_SET_TIER_MASK = 143, + + /*!\brief Control to set minimum compression ratio, unsigned int parameter + * Take integer values. If non-zero, encoder will try to keep the compression + * ratio of each frame to be higher than the given value divided by 100. + * E.g. 850 means minimum compression ratio of 8.5. 
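+ *
+ * A non-normative sketch (assuming an initialized encoder ctx) requesting
+ * at least an 8.5:1 compression ratio per frame:
+ * \code
+ * aom_codec_control(&ctx, AV1E_SET_MIN_CR, 850);
+ * \endcode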
+ */ + AV1E_SET_MIN_CR = 144, + + /* NOTE: enums 145-149 unused */ + + /*!\brief Codec control function to set the layer id, aom_svc_layer_id_t* + * parameter + */ + AV1E_SET_SVC_LAYER_ID = 150, + + /*!\brief Codec control function to set SVC paramaeters, aom_svc_params_t* + * parameter + */ + AV1E_SET_SVC_PARAMS = 151, + + /*!\brief Codec control function to set reference frame config: + * the ref_idx and the refresh flags for each buffer slot. + * aom_svc_ref_frame_config_t* parameter + */ + AV1E_SET_SVC_REF_FRAME_CONFIG = 152, + + /*!\brief Codec control function to set the path to the VMAF model used when + * tuning the encoder for VMAF, const char* parameter + */ + AV1E_SET_VMAF_MODEL_PATH = 153, + + /*!\brief Codec control function to enable EXT_TILE_DEBUG in AV1 encoder, + * unsigned int parameter + * + * - 0 = disable (default) + * - 1 = enable + * + * \note This is only used in lightfield example test. + */ + AV1E_ENABLE_EXT_TILE_DEBUG = 154, + + /*!\brief Codec control function to enable the superblock multipass unit test + * in AV1 to ensure that the encoder does not leak state between different + * passes. unsigned int parameter. + * + * - 0 = disable (default) + * - 1 = enable + * + * \note This is only used in sb_multipass unit test. + */ + AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST = 155, + + /*!\brief Control to select minimum height for the GF group pyramid structure, + * unsigned int parameter + * + * Valid values: 0..4 + */ + AV1E_SET_GF_MIN_PYRAMID_HEIGHT = 156, +}; + +/*!\brief aom 1-D scaling mode + * + * This set of constants define 1-D aom scaling modes + */ +typedef enum aom_scaling_mode_1d { + AOME_NORMAL = 0, + AOME_FOURFIVE = 1, + AOME_THREEFIVE = 2, + AOME_ONETWO = 3 +} AOM_SCALING_MODE; + +/*!\brief Max number of segments + * + * This is the limit of number of segments allowed within a frame. + * + * Currently same as "MAX_SEGMENTS" in AV1, the maximum that AV1 supports. + * + */ +#define AOM_MAX_SEGMENTS 8 + +/*!\brief aom region of interest map + * + * These defines the data structures for the region of interest map + * + * TODO(yaowu): create a unit test for ROI map related APIs + * + */ +typedef struct aom_roi_map { + /*! An id between 0 and 7 for each 8x8 region within a frame. */ + unsigned char *roi_map; + unsigned int rows; /**< Number of rows. */ + unsigned int cols; /**< Number of columns. */ + int delta_q[AOM_MAX_SEGMENTS]; /**< Quantizer deltas. */ + int delta_lf[AOM_MAX_SEGMENTS]; /**< Loop filter deltas. */ + /*! Static breakout threshold for each segment. 
*/
+ unsigned int static_threshold[AOM_MAX_SEGMENTS];
+} aom_roi_map_t;
+
+/*!\brief aom active region map
+ *
+ * This defines the data structure for the active region map
+ *
+ */
+
+typedef struct aom_active_map {
+ /*!\brief specify on (1) or off (0) for each 16x16 region within a frame */
+ unsigned char *active_map;
+ unsigned int rows; /**< number of rows */
+ unsigned int cols; /**< number of cols */
+} aom_active_map_t;
+
+/*!\brief aom image scaling mode
+ *
+ * This defines the data structure for image scaling mode
+ *
+ */
+typedef struct aom_scaling_mode {
+ AOM_SCALING_MODE h_scaling_mode; /**< horizontal scaling mode */
+ AOM_SCALING_MODE v_scaling_mode; /**< vertical scaling mode */
+} aom_scaling_mode_t;
+
+/*!\brief AV1 encoder content type */
+typedef enum {
+ AOM_CONTENT_DEFAULT,
+ AOM_CONTENT_SCREEN,
+ AOM_CONTENT_INVALID
+} aom_tune_content;
+
+/*!\brief AV1 encoder timing info type signaling */
+typedef enum {
+ AOM_TIMING_UNSPECIFIED,
+ AOM_TIMING_EQUAL,
+ AOM_TIMING_DEC_MODEL
+} aom_timing_info_type_t;
+
+/*!\brief Model tuning parameters
+ *
+ * Changes the encoder to tune for certain types of input material.
+ *
+ */
+typedef enum {
+ AOM_TUNE_PSNR = 0,
+ AOM_TUNE_SSIM = 1,
+ /* NOTE: enums 2 and 3 unused */
+ AOM_TUNE_VMAF_WITH_PREPROCESSING = 4,
+ AOM_TUNE_VMAF_WITHOUT_PREPROCESSING = 5,
+ AOM_TUNE_VMAF_MAX_GAIN = 6
+} aom_tune_metric;
+
+#define AOM_MAX_LAYERS 32 /**< Max number of layers */
+#define AOM_MAX_SS_LAYERS 4 /**< Max number of spatial layers */
+#define AOM_MAX_TS_LAYERS 8 /**< Max number of temporal layers */
+
+/*!\brief Struct for spatial and temporal layer ID */
+typedef struct aom_svc_layer_id {
+ int spatial_layer_id; /**< Spatial layer ID */
+ int temporal_layer_id; /**< Temporal layer ID */
+} aom_svc_layer_id_t;
+
+/*!\brief Parameter type for SVC */
+typedef struct aom_svc_params {
+ int number_spatial_layers; /**< Number of spatial layers */
+ int number_temporal_layers; /**< Number of temporal layers */
+ int max_quantizers[AOM_MAX_LAYERS]; /**< Max Q for each layer */
+ int min_quantizers[AOM_MAX_LAYERS]; /**< Min Q for each layer */
+ int scaling_factor_num[AOM_MAX_SS_LAYERS]; /**< Scaling factor-numerator */
+ int scaling_factor_den[AOM_MAX_SS_LAYERS]; /**< Scaling factor-denominator */
+ /*! Target bitrate for each layer */
+ int layer_target_bitrate[AOM_MAX_LAYERS];
+ /*! Frame rate factor for each temporal layer */
+ int framerate_factor[AOM_MAX_TS_LAYERS];
+} aom_svc_params_t;
+
+/*!\brief Parameters for setting ref frame config */
+typedef struct aom_svc_ref_frame_config {
+ // 7 references: LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2),
+ // GOLDEN_FRAME(3), BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ int reference[7]; /**< Reference flag for each of the 7 references. */
+ /*! Buffer slot index for each of 7 references. */
+ int ref_idx[7];
+ int refresh[8]; /**< Refresh flag for each of the 8 slots. */
+} aom_svc_ref_frame_config_t;
+
+/*!\cond */
+/*!\brief Encoder control function parameter type
+ *
+ * Defines the data types that AOME/AV1E control functions take.
+ *
+ * \note Additional common controls are defined in aom.h.
+ *
+ * \note For each control ID "X", a macro-define of
+ * AOM_CTRL_X is provided. It is used at compile time to determine
+ * if the control ID is supported by the available libaom library,
+ * when the libaom version cannot be controlled.
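+ *
+ * For example (a non-normative sketch), code built against multiple libaom
+ * versions can guard a control on its macro:
+ * \code
+ * #ifdef AOM_CTRL_AV1E_SET_ROW_MT
+ *   aom_codec_control(&ctx, AV1E_SET_ROW_MT, 1);
+ * #endif
+ * \endcode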
+ */ +AOM_CTRL_USE_TYPE(AOME_USE_REFERENCE, int) +#define AOM_CTRL_AOME_USE_REFERENCE + +AOM_CTRL_USE_TYPE(AOME_SET_ROI_MAP, aom_roi_map_t *) +#define AOM_CTRL_AOME_SET_ROI_MAP + +AOM_CTRL_USE_TYPE(AOME_SET_ACTIVEMAP, aom_active_map_t *) +#define AOM_CTRL_AOME_SET_ACTIVEMAP + +AOM_CTRL_USE_TYPE(AOME_SET_SCALEMODE, aom_scaling_mode_t *) +#define AOM_CTRL_AOME_SET_SCALEMODE + +AOM_CTRL_USE_TYPE(AOME_SET_SPATIAL_LAYER_ID, unsigned int) +#define AOM_CTRL_AOME_SET_SPATIAL_LAYER_ID + +AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int) +#define AOM_CTRL_AOME_SET_CPUUSED + +AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOALTREF, unsigned int) +#define AOM_CTRL_AOME_SET_ENABLEAUTOALTREF + +AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOBWDREF, unsigned int) +#define AOM_CTRL_AOME_SET_ENABLEAUTOBWDREF + +AOM_CTRL_USE_TYPE(AOME_SET_SHARPNESS, unsigned int) +#define AOM_CTRL_AOME_SET_SHARPNESS + +AOM_CTRL_USE_TYPE(AOME_SET_STATIC_THRESHOLD, unsigned int) +#define AOM_CTRL_AOME_SET_STATIC_THRESHOLD + +AOM_CTRL_USE_TYPE(AOME_SET_ARNR_MAXFRAMES, unsigned int) +#define AOM_CTRL_AOME_SET_ARNR_MAXFRAMES + +AOM_CTRL_USE_TYPE(AOME_SET_ARNR_STRENGTH, unsigned int) +#define AOM_CTRL_AOME_SET_ARNR_STRENGTH + +AOM_CTRL_USE_TYPE(AOME_SET_TUNING, int) /* aom_tune_metric */ +#define AOM_CTRL_AOME_SET_TUNING + +AOM_CTRL_USE_TYPE(AOME_SET_CQ_LEVEL, unsigned int) +#define AOM_CTRL_AOME_SET_CQ_LEVEL + +AOM_CTRL_USE_TYPE(AV1E_SET_ROW_MT, unsigned int) +#define AOM_CTRL_AV1E_SET_ROW_MT + +AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, unsigned int) +#define AOM_CTRL_AV1E_SET_TILE_COLUMNS + +AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, unsigned int) +#define AOM_CTRL_AV1E_SET_TILE_ROWS + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TPL_MODEL, unsigned int) +#define AOM_CTRL_AV1E_SET_ENABLE_TPL_MODEL + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_KEYFRAME_FILTERING, unsigned int) +#define AOM_CTRL_AV1E_SET_ENABLE_KEYFRAME_FILTERING + +AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER, int *) +#define AOM_CTRL_AOME_GET_LAST_QUANTIZER + +AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER_64, int *) +#define AOM_CTRL_AOME_GET_LAST_QUANTIZER_64 + +AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTRA_BITRATE_PCT, unsigned int) +#define AOM_CTRL_AOME_SET_MAX_INTRA_BITRATE_PCT + +AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTER_BITRATE_PCT, unsigned int) +#define AOM_CTRL_AOME_SET_MAX_INTER_BITRATE_PCT + +AOM_CTRL_USE_TYPE(AOME_SET_NUMBER_SPATIAL_LAYERS, int) +#define AOME_CTRL_AOME_SET_NUMBER_SPATIAL_LAYERS + +AOM_CTRL_USE_TYPE(AV1E_SET_GF_CBR_BOOST_PCT, unsigned int) +#define AOM_CTRL_AV1E_SET_GF_CBR_BOOST_PCT + +AOM_CTRL_USE_TYPE(AV1E_SET_LOSSLESS, unsigned int) +#define AOM_CTRL_AV1E_SET_LOSSLESS + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CDEF, unsigned int) +#define AOM_CTRL_AV1E_SET_ENABLE_CDEF + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RESTORATION, unsigned int) +#define AOM_CTRL_AV1E_SET_ENABLE_RESTORATION + +AOM_CTRL_USE_TYPE(AV1E_SET_FORCE_VIDEO_MODE, unsigned int) +#define AOM_CTRL_AV1E_SET_FORCE_VIDEO_MODE + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_OBMC, unsigned int) +#define AOM_CTRL_AV1E_SET_ENABLE_OBMC + +AOM_CTRL_USE_TYPE(AV1E_SET_DISABLE_TRELLIS_QUANT, unsigned int) +#define AOM_CTRL_AV1E_SET_DISABLE_TRELLIS_QUANT + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_QM, unsigned int) +#define AOM_CTRL_AV1E_SET_ENABLE_QM + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIST_8X8, unsigned int) +#define AOM_CTRL_AV1E_SET_ENABLE_DIST_8X8 + +AOM_CTRL_USE_TYPE(AV1E_SET_QM_MIN, unsigned int) +#define AOM_CTRL_AV1E_SET_QM_MIN + +AOM_CTRL_USE_TYPE(AV1E_SET_QM_MAX, unsigned int) +#define AOM_CTRL_AV1E_SET_QM_MAX + +AOM_CTRL_USE_TYPE(AV1E_SET_QM_Y, unsigned int) +#define 
AOM_CTRL_AV1E_SET_QM_Y + +AOM_CTRL_USE_TYPE(AV1E_SET_QM_U, unsigned int) +#define AOM_CTRL_AV1E_SET_QM_U + +AOM_CTRL_USE_TYPE(AV1E_SET_QM_V, unsigned int) +#define AOM_CTRL_AV1E_SET_QM_V + +AOM_CTRL_USE_TYPE(AV1E_SET_NUM_TG, unsigned int) +#define AOM_CTRL_AV1E_SET_NUM_TG + +AOM_CTRL_USE_TYPE(AV1E_SET_MTU, unsigned int) +#define AOM_CTRL_AV1E_SET_MTU + +AOM_CTRL_USE_TYPE(AV1E_SET_TIMING_INFO_TYPE, int) /* aom_timing_info_type_t */ +#define AOM_CTRL_AV1E_SET_TIMING_INFO_TYPE + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RECT_PARTITIONS, int) +#define AOM_CTRL_AV1E_SET_ENABLE_RECT_PARTITIONS + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_AB_PARTITIONS, int) +#define AOM_CTRL_AV1E_SET_ENABLE_AB_PARTITIONS + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_1TO4_PARTITIONS, int) +#define AOM_CTRL_AV1E_SET_ENABLE_1TO4_PARTITIONS + +AOM_CTRL_USE_TYPE(AV1E_SET_MIN_PARTITION_SIZE, int) +#define AOM_CTRL_AV1E_SET_MIN_PARTITION_SIZE + +AOM_CTRL_USE_TYPE(AV1E_SET_MAX_PARTITION_SIZE, int) +#define AOM_CTRL_AV1E_SET_MAX_PARTITION_SIZE + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTRA_EDGE_FILTER, int) +#define AOM_CTRL_AV1E_SET_ENABLE_INTRA_EDGE_FILTER + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ORDER_HINT, int) +#define AOM_CTRL_AV1E_SET_ENABLE_ORDER_HINT + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TX64, int) +#define AOM_CTRL_AV1E_SET_ENABLE_TX64 + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_FLIP_IDTX, int) +#define AOM_CTRL_AV1E_SET_ENABLE_FLIP_IDTX + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIST_WTD_COMP, int) +#define AOM_CTRL_AV1E_SET_ENABLE_DIST_WTD_COMP + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_REF_FRAME_MVS, int) +#define AOM_CTRL_AV1E_SET_ENABLE_REF_FRAME_MVS + +AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_REF_FRAME_MVS, int) +#define AOM_CTRL_AV1E_SET_ALLOW_REF_FRAME_MVS + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DUAL_FILTER, int) +#define AOM_CTRL_AV1E_SET_ENABLE_DUAL_FILTER + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CHROMA_DELTAQ, int) +#define AOM_CTRL_AV1E_SET_ENABLE_CHROMA_DELTAQ + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_MASKED_COMP, int) +#define AOM_CTRL_AV1E_SET_ENABLE_MASKED_COMP + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ONESIDED_COMP, int) +#define AOM_CTRL_AV1E_SET_ENABLE_ONESIDED_COMP + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTRA_COMP, int) +#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTRA_COMP + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SMOOTH_INTERINTRA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_SMOOTH_INTERINTRA + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIFF_WTD_COMP, int) +#define AOM_CTRL_AV1E_SET_ENABLE_DIFF_WTD_COMP + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTER_WEDGE, int) +#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTER_WEDGE + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTRA_WEDGE, int) +#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTRA_WEDGE + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_GLOBAL_MOTION, int) +#define AOM_CTRL_AV1E_SET_ENABLE_GLOBAL_MOTION + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_WARPED_MOTION, int) +#define AOM_CTRL_AV1E_SET_ENABLE_WARPED_MOTION + +AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_WARPED_MOTION, int) +#define AOM_CTRL_AV1E_SET_ALLOW_WARPED_MOTION + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_FILTER_INTRA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_FILTER_INTRA + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SMOOTH_INTRA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_SMOOTH_INTRA + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_PAETH_INTRA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_PAETH_INTRA + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CFL_INTRA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_CFL_INTRA + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SUPERRES, int) +#define AOM_CTRL_AV1E_SET_ENABLE_SUPERRES + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_OVERLAY, int) +#define 
AOM_CTRL_AV1E_SET_ENABLE_OVERLAY + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_PALETTE, int) +#define AOM_CTRL_AV1E_SET_ENABLE_PALETTE + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTRABC, int) +#define AOM_CTRL_AV1E_SET_ENABLE_INTRABC + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ANGLE_DELTA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_ANGLE_DELTA + +AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PARALLEL_DECODING, unsigned int) +#define AOM_CTRL_AV1E_SET_FRAME_PARALLEL_DECODING + +AOM_CTRL_USE_TYPE(AV1E_SET_ERROR_RESILIENT_MODE, int) +#define AOM_CTRL_AV1E_SET_ERROR_RESILIENT_MODE + +AOM_CTRL_USE_TYPE(AV1E_SET_S_FRAME_MODE, int) +#define AOM_CTRL_AV1E_SET_S_FRAME_MODE + +AOM_CTRL_USE_TYPE(AV1E_SET_AQ_MODE, unsigned int) +#define AOM_CTRL_AV1E_SET_AQ_MODE + +AOM_CTRL_USE_TYPE(AV1E_SET_DELTAQ_MODE, unsigned int) +#define AOM_CTRL_AV1E_SET_DELTAQ_MODE + +AOM_CTRL_USE_TYPE(AV1E_SET_DELTALF_MODE, unsigned int) +#define AOM_CTRL_AV1E_SET_DELTALF_MODE + +AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PERIODIC_BOOST, unsigned int) +#define AOM_CTRL_AV1E_SET_FRAME_PERIODIC_BOOST + +AOM_CTRL_USE_TYPE(AV1E_SET_NOISE_SENSITIVITY, unsigned int) +#define AOM_CTRL_AV1E_SET_NOISE_SENSITIVITY + +AOM_CTRL_USE_TYPE(AV1E_SET_TUNE_CONTENT, int) /* aom_tune_content */ +#define AOM_CTRL_AV1E_SET_TUNE_CONTENT + +AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_PRIMARIES, int) +#define AOM_CTRL_AV1E_SET_COLOR_PRIMARIES + +AOM_CTRL_USE_TYPE(AV1E_SET_TRANSFER_CHARACTERISTICS, int) +#define AOM_CTRL_AV1E_SET_TRANSFER_CHARACTERISTICS + +AOM_CTRL_USE_TYPE(AV1E_SET_MATRIX_COEFFICIENTS, int) +#define AOM_CTRL_AV1E_SET_MATRIX_COEFFICIENTS + +AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SAMPLE_POSITION, int) +#define AOM_CTRL_AV1E_SET_CHROMA_SAMPLE_POSITION + +AOM_CTRL_USE_TYPE(AV1E_SET_MIN_GF_INTERVAL, unsigned int) +#define AOM_CTRL_AV1E_SET_MIN_GF_INTERVAL + +AOM_CTRL_USE_TYPE(AV1E_SET_MAX_GF_INTERVAL, unsigned int) +#define AOM_CTRL_AV1E_SET_MAX_GF_INTERVAL + +AOM_CTRL_USE_TYPE(AV1E_GET_ACTIVEMAP, aom_active_map_t *) +#define AOM_CTRL_AV1E_GET_ACTIVEMAP + +AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_RANGE, int) +#define AOM_CTRL_AV1E_SET_COLOR_RANGE + +#define AOM_CTRL_AV1E_SET_RENDER_SIZE +AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *) + +AOM_CTRL_USE_TYPE(AV1E_SET_SUPERBLOCK_SIZE, unsigned int) +#define AOM_CTRL_AV1E_SET_SUPERBLOCK_SIZE + +AOM_CTRL_USE_TYPE(AV1E_GET_SEQ_LEVEL_IDX, int *) +#define AOM_CTRL_AV1E_GET_SEQ_LEVEL_IDX + +AOM_CTRL_USE_TYPE(AV1E_SET_SINGLE_TILE_DECODING, unsigned int) +#define AOM_CTRL_AV1E_SET_SINGLE_TILE_DECODING + +AOM_CTRL_USE_TYPE(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int) +#define AOM_CTRL_AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST + +AOM_CTRL_USE_TYPE(AV1E_ENABLE_EXT_TILE_DEBUG, unsigned int) +#define AOM_CTRL_AV1E_ENABLE_EXT_TILE_DEBUG + +AOM_CTRL_USE_TYPE(AV1E_SET_VMAF_MODEL_PATH, const char *) +#define AOM_CTRL_AV1E_SET_VMAF_MODEL_PATH + +AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TEST_VECTOR, int) +#define AOM_CTRL_AV1E_SET_FILM_GRAIN_TEST_VECTOR + +AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TABLE, const char *) +#define AOM_CTRL_AV1E_SET_FILM_GRAIN_TABLE + +AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, unsigned int) +#define AOM_CTRL_AV1E_SET_CDF_UPDATE_MODE + +AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_NOISE_LEVEL, int) +#define AOM_CTRL_AV1E_SET_DENOISE_NOISE_LEVEL + +AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_BLOCK_SIZE, unsigned int) +#define AOM_CTRL_AV1E_SET_DENOISE_BLOCK_SIZE + +AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_X, unsigned int) +#define AOM_CTRL_AV1E_SET_CHROMA_SUBSAMPLING_X + +AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_Y, unsigned int) +#define 
AOM_CTRL_AV1E_SET_CHROMA_SUBSAMPLING_Y + +AOM_CTRL_USE_TYPE(AV1E_SET_REDUCED_TX_TYPE_SET, int) +#define AOM_CTRL_AV1E_SET_REDUCED_TX_TYPE_SET + +AOM_CTRL_USE_TYPE(AV1E_SET_INTRA_DCT_ONLY, int) +#define AOM_CTRL_AV1E_SET_INTRA_DCT_ONLY + +AOM_CTRL_USE_TYPE(AV1E_SET_INTER_DCT_ONLY, int) +#define AOM_CTRL_AV1E_SET_INTER_DCT_ONLY + +AOM_CTRL_USE_TYPE(AV1E_SET_INTRA_DEFAULT_TX_ONLY, int) +#define AOM_CTRL_AV1E_SET_INTRA_DEFAULT_TX_ONLY + +AOM_CTRL_USE_TYPE(AV1E_SET_QUANT_B_ADAPT, int) +#define AOM_CTRL_AV1E_SET_QUANT_B_ADAPT + +AOM_CTRL_USE_TYPE(AV1E_SET_GF_MIN_PYRAMID_HEIGHT, unsigned int) +#define AOM_CTRL_AV1E_SET_GF_MIN_PYRAMID_HEIGHT + +AOM_CTRL_USE_TYPE(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, unsigned int) +#define AOM_CTRL_AV1E_SET_GF_MAX_PYRAMID_HEIGHT + +AOM_CTRL_USE_TYPE(AV1E_SET_MAX_REFERENCE_FRAMES, int) +#define AOM_CTRL_AV1E_SET_MAX_REFERENCE_FRAMES + +AOM_CTRL_USE_TYPE(AV1E_SET_REDUCED_REFERENCE_SET, int) +#define AOM_CTRL_AV1E_SET_REDUCED_REFERENCE_SET + +AOM_CTRL_USE_TYPE(AV1E_SET_COEFF_COST_UPD_FREQ, unsigned int) +#define AOM_CTRL_AV1E_SET_COEFF_COST_UPD_FREQ + +AOM_CTRL_USE_TYPE(AV1E_SET_MODE_COST_UPD_FREQ, unsigned int) +#define AOM_CTRL_AV1E_SET_MODE_COST_UPD_FREQ + +AOM_CTRL_USE_TYPE(AV1E_SET_MV_COST_UPD_FREQ, unsigned int) +#define AOM_CTRL_AV1E_SET_MV_COST_UPD_FREQ + +AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_SEQ_LEVEL_IDX, int) +#define AOM_CTRL_AV1E_SET_TARGET_SEQ_LEVEL_IDX + +AOM_CTRL_USE_TYPE(AV1E_SET_TIER_MASK, unsigned int) +#define AOM_CTRL_AV1E_SET_TIER_MASK + +AOM_CTRL_USE_TYPE(AV1E_SET_MIN_CR, unsigned int) +#define AOM_CTRL_AV1E_SET_MIN_CR + +AOM_CTRL_USE_TYPE(AV1E_SET_SVC_LAYER_ID, aom_svc_layer_id_t *) +#define AOME_CTRL_AV1E_SET_SVC_LAYER_ID + +AOM_CTRL_USE_TYPE(AV1E_SET_SVC_PARAMS, aom_svc_params_t *) +#define AOME_CTRL_AV1E_SET_SVC_PARAMS + +AOM_CTRL_USE_TYPE(AV1E_SET_SVC_REF_FRAME_CONFIG, aom_svc_ref_frame_config_t *) +#define AOME_CTRL_AV1E_SET_SVC_REF_FRAME_CONFIG + +AOM_CTRL_USE_TYPE(AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, unsigned int) +#define AOM_CTRL_AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST + +/*!\endcond */ +/*! @} - end defgroup aom_encoder */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOMCX_H_ diff --git a/libs/libaom/src/aom/aomdx.h b/libs/libaom/src/aom/aomdx.h new file mode 100644 index 000000000..8cd5de395 --- /dev/null +++ b/libs/libaom/src/aom/aomdx.h @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\defgroup aom_decoder AOMedia AOM/AV1 Decoder + * \ingroup aom + * + * @{ + */ +/*!\file + * \brief Provides definitions for using AOM or AV1 within the aom Decoder + * interface. + */ +#ifndef AOM_AOM_AOMDX_H_ +#define AOM_AOM_AOMDX_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Include controls common to both the encoder and decoder */ +#include "aom/aom.h" + +/*!\name Algorithm interface for AV1 + * + * This interface provides the capability to decode AV1 streams. 
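+ *
+ * A minimal decode sketch (non-normative; buf/buf_size are hypothetical and
+ * hold one compressed temporal unit):
+ * \code
+ * aom_codec_ctx_t ctx;
+ * aom_codec_dec_init(&ctx, aom_codec_av1_dx(), NULL, 0);
+ * aom_codec_decode(&ctx, buf, buf_size, NULL);
+ * aom_codec_iter_t iter = NULL;
+ * aom_image_t *img = aom_codec_get_frame(&ctx, &iter);
+ * \endcode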
+ * @{ + */ +extern aom_codec_iface_t aom_codec_av1_dx_algo; +extern aom_codec_iface_t *aom_codec_av1_dx(void); +/*!@} - end algorithm interface member group*/ + +/** Data structure that stores bit accounting for debug + */ +typedef struct Accounting Accounting; + +#ifndef AOM_INSPECTION_H_ +/** Callback that inspects decoder frame data. + */ +typedef void (*aom_inspect_cb)(void *decoder, void *ctx); + +#endif + +/*!\brief Structure to hold inspection callback and context. + * + * Defines a structure to hold the inspection callback function and calling + * context. + */ +typedef struct aom_inspect_init { + /*! Inspection callback. */ + aom_inspect_cb inspect_cb; + + /*! Inspection context. */ + void *inspect_ctx; +} aom_inspect_init; + +/*!\brief Structure to collect a buffer index when inspecting. + * + * Defines a structure to hold the buffer and return an index + * when calling decode from inspect. This enables us to decode + * non showable sub frames. + */ +typedef struct { + /*! Pointer for new position in compressed buffer after decoding 1 OBU. */ + const unsigned char *buf; + /*! Index into reference buffer array to see result of decoding 1 OBU. */ + int idx; + /*! Is a show existing frame. */ + int show_existing; +} Av1DecodeReturn; + +/*!\brief Structure to hold a tile's start address and size in the bitstream. + * + * Defines a structure to hold a tile's start address and size in the bitstream. + */ +typedef struct aom_tile_data { + /*! Tile data size. */ + size_t coded_tile_data_size; + /*! Tile's start address. */ + const void *coded_tile_data; + /*! Extra size information. */ + size_t extra_size; +} aom_tile_data; + +/*!\brief Structure to hold the external reference frame pointer. + * + * Define a structure to hold the external reference frame pointer. + */ +typedef struct av1_ext_ref_frame { + /*! Start pointer of external references. */ + aom_image_t *img; + /*! Number of available external references. */ + int num; +} av1_ext_ref_frame_t; + +/*!\enum aom_dec_control_id + * \brief AOM decoder control functions + * + * This set of macros define the control functions available for the AOM + * decoder interface. + * + * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) + */ +enum aom_dec_control_id { + /*!\brief Codec control function to get info on which reference frames were + * updated by the last decode, int* parameter + */ + AOMD_GET_LAST_REF_UPDATES = AOM_DECODER_CTRL_ID_START, + + /*!\brief Codec control function to check if the indicated frame is + corrupted, int* parameter + */ + AOMD_GET_FRAME_CORRUPTED, + + /*!\brief Codec control function to get info on which reference frames were + * used by the last decode, int* parameter + */ + AOMD_GET_LAST_REF_USED, + + /*!\brief Codec control function to get the dimensions that the current + * frame is decoded at, int* parameter. This may be different to the + * intended display size for the frame as specified in the wrapper or frame + * header (see AV1D_GET_DISPLAY_SIZE). + */ + AV1D_GET_FRAME_SIZE, + + /*!\brief Codec control function to get the current frame's intended display + * dimensions (as specified in the wrapper or frame header), int* parameter. + * This may be different to the decoded dimensions of this frame (see + * AV1D_GET_FRAME_SIZE). 
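+ *
+ * For example (a non-normative sketch, after at least one frame has been
+ * decoded):
+ * \code
+ * int dims[2];  // receives width, height
+ * aom_codec_control(&ctx, AV1D_GET_DISPLAY_SIZE, dims);
+ * \endcode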
+ */
+  AV1D_GET_DISPLAY_SIZE,
+
+  /*!\brief Codec control function to get the bit depth of the stream,
+   * unsigned int* parameter
+   */
+  AV1D_GET_BIT_DEPTH,
+
+  /*!\brief Codec control function to get the image format of the stream,
+   * aom_img_fmt_t* parameter
+   */
+  AV1D_GET_IMG_FORMAT,
+
+  /*!\brief Codec control function to get the size of the tile, unsigned int*
+     parameter */
+  AV1D_GET_TILE_SIZE,
+
+  /*!\brief Codec control function to get the tile count in a tile list,
+   * unsigned int* parameter
+   */
+  AV1D_GET_TILE_COUNT,
+
+  /*!\brief Codec control function to set the byte alignment of the planes in
+   * the reference buffers, int parameter
+   *
+   * Valid values are powers of 2, from 32 to 1024. A value of 0 sets
+   * legacy alignment: the Y plane is aligned to 32 bytes, the U plane
+   * directly follows the Y plane, and the V plane directly follows the U
+   * plane. Default value is 0.
+   */
+  AV1_SET_BYTE_ALIGNMENT,
+
+  /*!\brief Codec control function to invert the tile decoding order so that
+   * tiles are decoded from right to left, int parameter
+   *
+   * The function is used in a test to confirm the decoding independence of
+   * tile columns. The function may also be used in applications where this
+   * decoding order is desired.
+   *
+   * TODO(yaowu): Rework the unit test that uses this control, and in a future
+   * release, this test-only control shall be removed.
+   */
+  AV1_INVERT_TILE_DECODE_ORDER,
+
+  /*!\brief Codec control function to set the skip loop filter flag, int
+   * parameter
+   *
+   * Valid values are integers. The decoder will skip the loop filter
+   * when its value is set to nonzero. If the loop filter is skipped, the
+   * decoder may accumulate decode artifacts. The default value is 0.
+   */
+  AV1_SET_SKIP_LOOP_FILTER,
+
+  /*!\brief Codec control function to retrieve a pointer to the Accounting
+   * struct, takes Accounting** as parameter
+   *
+   * If called before a frame has been decoded, this returns AOM_CODEC_ERROR.
+   * The caller should ensure that AOM_CODEC_OK is returned before attempting
+   * to dereference the Accounting pointer.
+   *
+   * \attention When compiled without --enable-accounting, this returns
+   * AOM_CODEC_INCAPABLE.
+   */
+  AV1_GET_ACCOUNTING,
+
+  /*!\brief Codec control function to get last decoded frame quantizer,
+   * int* parameter
+   *
+   * Returned value uses internal quantizer scale defined by the codec.
+   */
+  AOMD_GET_LAST_QUANTIZER,
+
+  /*!\brief Codec control function to set the range of tile decoding, int
+   * parameter
+   *
+   * A value greater than or equal to zero indicates that only the specific
+   * row/column is decoded. A value of -1 indicates that the whole row/column
+   * is decoded. As a special case, when both values are -1, the whole frame
+   * is decoded.
+   */
+  AV1_SET_DECODE_TILE_ROW,
+  AV1_SET_DECODE_TILE_COL,
+
+  /*!\brief Codec control function to set the tile coding mode, unsigned int
+   * parameter
+   *
+   * - 0 = tiles are coded in normal tile mode
+   * - 1 = tiles are coded in large-scale tile mode
+   */
+  AV1_SET_TILE_MODE,
+
+  /*!\brief Codec control function to get the frame header information of an
+   * encoded frame, aom_tile_data* parameter
+   */
+  AV1D_GET_FRAME_HEADER_INFO,
+
+  /*!\brief Codec control function to get the start address and size of a
+   * tile in the coded bitstream, aom_tile_data* parameter.
+   */
+  AV1D_GET_TILE_DATA,
+
+  /*!\brief Codec control function to set the external references' pointers in
+   * the decoder, av1_ext_ref_frame_t* parameter.
+   *
+   * This is used while decoding the tile list OBU in large-scale tile coding
+   * mode.
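+ *
+ * An illustrative sketch of the call (ext_imgs and num_ext are
+ * hypothetical, application-owned names):
+ *     <pre>
+ *     av1_ext_ref_frame_t refs = { ext_imgs, num_ext };
+ *     aom_codec_control(&ctx, AV1D_SET_EXT_REF_PTR, &refs);
+ *     </pre>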
+ */
+  AV1D_SET_EXT_REF_PTR,
+
+  /*!\brief Codec control function to enable the ext-tile software debug and
+   * testing code in the decoder, unsigned int parameter
+   */
+  AV1D_EXT_TILE_DEBUG,
+
+  /*!\brief Codec control function to enable the row based multi-threading of
+   * decoding, unsigned int parameter
+   *
+   * - 0 = disabled
+   * - 1 = enabled (default)
+   */
+  AV1D_SET_ROW_MT,
+
+  /*!\brief Codec control function to indicate whether the bitstream is in
+   * Annex B format, unsigned int parameter
+   */
+  AV1D_SET_IS_ANNEXB,
+
+  /*!\brief Codec control function to indicate which operating point to use,
+   * int parameter
+   *
+   * A scalable stream may define multiple operating points, each of which
+   * defines a set of temporal and spatial layers to be processed. The
+   * operating point index may take a value between 0 and
+   * operating_points_cnt_minus_1 (which is at most 31).
+   */
+  AV1D_SET_OPERATING_POINT,
+
+  /*!\brief Codec control function to indicate whether to output one frame per
+   * temporal unit (the default), or one frame per spatial layer, int parameter
+   *
+   * In a scalable stream, each temporal unit corresponds to a single "frame"
+   * of video, and within a temporal unit there may be multiple spatial layers
+   * with different versions of that frame.
+   * For video playback, only the highest-quality version (within the
+   * selected operating point) is needed, but for some use cases it is useful
+   * to have access to multiple versions of a frame when they are available.
+   */
+  AV1D_SET_OUTPUT_ALL_LAYERS,
+
+  /*!\brief Codec control function to set an aom_inspect_cb callback that is
+   * invoked each time a frame is decoded, aom_inspect_init* parameter
+   *
+   * \attention When compiled without --enable-inspection, this
+   * returns AOM_CODEC_INCAPABLE.
+   */
+  AV1_SET_INSPECTION_CALLBACK,
+
+  /*!\brief Codec control function to set the skip film grain flag, int
+   * parameter
+   *
+   * Valid values are integers. The decoder will skip the film grain when its
+   * value is set to nonzero. The default value is 0.
+   */
+  AV1D_SET_SKIP_FILM_GRAIN,
+
+  AOM_DECODER_CTRL_ID_MAX,
+};
+
+/*!\cond */
+/*!\brief AOM decoder control function parameter type
+ *
+ * Defines the data types that AOMD control functions take.
+ *
+ * \note Additional common controls are defined in aom.h.
+ *
+ * \note For each control ID "X", a macro-define of
+ * AOM_CTRL_X is provided. It is used at compile time to determine
+ * whether the control ID is supported by the libaom library being
+ * linked against, in cases where the libaom version cannot be
+ * controlled.
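+ * For example (a usage sketch, not part of this header), code that must
+ * build against multiple libaom versions can test for a control at
+ * compile time:
+ *     <pre>
+ *     #ifdef AOM_CTRL_AV1D_SET_ROW_MT
+ *     aom_codec_control(&ctx, AV1D_SET_ROW_MT, 1);
+ *     #endif
+ *     </pre>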
+ */ +AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_UPDATES, int *) +#define AOM_CTRL_AOMD_GET_LAST_REF_UPDATES + +AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_CORRUPTED, int *) +#define AOM_CTRL_AOMD_GET_FRAME_CORRUPTED + +AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_USED, int *) +#define AOM_CTRL_AOMD_GET_LAST_REF_USED + +AOM_CTRL_USE_TYPE(AOMD_GET_LAST_QUANTIZER, int *) +#define AOM_CTRL_AOMD_GET_LAST_QUANTIZER + +AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *) +#define AOM_CTRL_AV1D_GET_DISPLAY_SIZE + +AOM_CTRL_USE_TYPE(AV1D_GET_BIT_DEPTH, unsigned int *) +#define AOM_CTRL_AV1D_GET_BIT_DEPTH + +AOM_CTRL_USE_TYPE(AV1D_GET_IMG_FORMAT, aom_img_fmt_t *) +#define AOM_CTRL_AV1D_GET_IMG_FORMAT + +AOM_CTRL_USE_TYPE(AV1D_GET_TILE_SIZE, unsigned int *) +#define AOM_CTRL_AV1D_GET_TILE_SIZE + +AOM_CTRL_USE_TYPE(AV1D_GET_TILE_COUNT, unsigned int *) +#define AOM_CTRL_AV1D_GET_TILE_COUNT + +AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *) +#define AOM_CTRL_AV1D_GET_FRAME_SIZE + +AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int) +#define AOM_CTRL_AV1_INVERT_TILE_DECODE_ORDER + +AOM_CTRL_USE_TYPE(AV1_GET_ACCOUNTING, Accounting **) +#define AOM_CTRL_AV1_GET_ACCOUNTING + +AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_ROW, int) +#define AOM_CTRL_AV1_SET_DECODE_TILE_ROW + +AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_COL, int) +#define AOM_CTRL_AV1_SET_DECODE_TILE_COL + +AOM_CTRL_USE_TYPE(AV1_SET_TILE_MODE, unsigned int) +#define AOM_CTRL_AV1_SET_TILE_MODE + +AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_HEADER_INFO, aom_tile_data *) +#define AOM_CTRL_AV1D_GET_FRAME_HEADER_INFO + +AOM_CTRL_USE_TYPE(AV1D_GET_TILE_DATA, aom_tile_data *) +#define AOM_CTRL_AV1D_GET_TILE_DATA + +AOM_CTRL_USE_TYPE(AV1D_SET_EXT_REF_PTR, av1_ext_ref_frame_t *) +#define AOM_CTRL_AV1D_SET_EXT_REF_PTR + +AOM_CTRL_USE_TYPE(AV1D_EXT_TILE_DEBUG, unsigned int) +#define AOM_CTRL_AV1D_EXT_TILE_DEBUG + +AOM_CTRL_USE_TYPE(AV1D_SET_ROW_MT, unsigned int) +#define AOM_CTRL_AV1D_SET_ROW_MT + +AOM_CTRL_USE_TYPE(AV1D_SET_SKIP_FILM_GRAIN, int) +#define AOM_CTRL_AV1D_SET_SKIP_FILM_GRAIN + +AOM_CTRL_USE_TYPE(AV1D_SET_IS_ANNEXB, unsigned int) +#define AOM_CTRL_AV1D_SET_IS_ANNEXB + +AOM_CTRL_USE_TYPE(AV1D_SET_OPERATING_POINT, int) +#define AOM_CTRL_AV1D_SET_OPERATING_POINT + +AOM_CTRL_USE_TYPE(AV1D_SET_OUTPUT_ALL_LAYERS, int) +#define AOM_CTRL_AV1D_SET_OUTPUT_ALL_LAYERS + +AOM_CTRL_USE_TYPE(AV1_SET_INSPECTION_CALLBACK, aom_inspect_init *) +#define AOM_CTRL_AV1_SET_INSPECTION_CALLBACK +/*!\endcond */ +/*! 
@} - end defgroup aom_decoder */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOMDX_H_ diff --git a/libs/libaom/src/aom/exports_com b/libs/libaom/src/aom/exports_com new file mode 100644 index 000000000..6f796f5db --- /dev/null +++ b/libs/libaom/src/aom/exports_com @@ -0,0 +1,41 @@ +text aom_codec_build_config +text aom_codec_control +text aom_codec_destroy +text aom_codec_err_to_string +text aom_codec_error +text aom_codec_error_detail +text aom_codec_get_caps +text aom_codec_iface_name +text aom_codec_version +text aom_codec_version_extra_str +text aom_codec_version_str +text aom_free +text aom_img_add_metadata +text aom_img_alloc +text aom_img_alloc_with_border +text aom_img_flip +text aom_img_free +text aom_img_get_metadata +text aom_img_metadata_array_free +text aom_img_metadata_array_alloc +text aom_img_metadata_free +text aom_img_metadata_alloc +text aom_img_num_metadata +text aom_img_plane_height +text aom_img_plane_width +text aom_img_remove_metadata +text aom_img_set_rect +text aom_img_wrap +text aom_malloc +text aom_rb_bytes_read +text aom_rb_read_bit +text aom_rb_read_literal +text aom_rb_read_uvlc +text aom_uleb_decode +text aom_uleb_encode +text aom_uleb_encode_fixed_size +text aom_uleb_size_in_bytes +text aom_wb_bytes_written +text aom_wb_write_bit +text aom_wb_write_literal +text aom_wb_write_unsigned_literal diff --git a/libs/libaom/src/aom/exports_dec b/libs/libaom/src/aom/exports_dec new file mode 100644 index 000000000..ffff023dd --- /dev/null +++ b/libs/libaom/src/aom/exports_dec @@ -0,0 +1,8 @@ +text aom_codec_dec_init_ver +text aom_codec_decode +text aom_codec_get_frame +text aom_codec_get_stream_info +text aom_codec_peek_stream_info +text aom_codec_set_frame_buffer_functions +text aom_obu_type_to_string +text aom_read_obu_header diff --git a/libs/libaom/src/aom/exports_enc b/libs/libaom/src/aom/exports_enc new file mode 100644 index 000000000..1473d9d2b --- /dev/null +++ b/libs/libaom/src/aom/exports_enc @@ -0,0 +1,17 @@ +text aom_codec_enc_config_default +text aom_codec_enc_config_set +text aom_codec_enc_init_ver +text aom_codec_encode +text aom_codec_get_cx_data +text aom_codec_get_global_headers +text aom_codec_get_preview_frame +text aom_codec_set_cx_data_buf +text aom_film_grain_table_append +text aom_film_grain_table_free +text aom_film_grain_table_write +text aom_flat_block_finder_init +text aom_flat_block_finder_run +text aom_noise_model_init +text aom_noise_model_get_grain_parameters +text aom_noise_model_save_latest +text aom_noise_model_update diff --git a/libs/libaom/src/aom/exports_test b/libs/libaom/src/aom/exports_test new file mode 100644 index 000000000..452a532ce --- /dev/null +++ b/libs/libaom/src/aom/exports_test @@ -0,0 +1,4 @@ +text aom_copy_metadata_to_frame_buffer +text aom_dsp_rtcd +text aom_remove_metadata_from_frame_buffer +text aom_scale_rtcd diff --git a/libs/libaom/src/aom/internal/aom_codec_internal.h b/libs/libaom/src/aom/internal/aom_codec_internal.h new file mode 100644 index 000000000..efe09acc9 --- /dev/null +++ b/libs/libaom/src/aom/internal/aom_codec_internal.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes the decoder algorithm interface for algorithm
+ * implementations.
+ *
+ * This file defines the private structures and data types that are only
+ * relevant to implementing an algorithm, as opposed to using it.
+ *
+ * To create a decoder algorithm class, an interface structure is put
+ * into the global namespace:
+ *     <pre>
+ *     my_codec.c:
+ *       aom_codec_iface_t my_codec = {
+ *           "My Codec v1.0",
+ *           AOM_CODEC_ALG_ABI_VERSION,
+ *           ...
+ *       };
+ *     </pre>
+ *
+ * An application instantiates a specific decoder instance by using
+ * aom_codec_init() and a pointer to the algorithm's interface structure:
+ *     <pre>
+ *     my_app.c:
+ *       extern aom_codec_iface_t my_codec;
+ *       {
+ *           aom_codec_ctx_t algo;
+ *           res = aom_codec_init(&algo, &my_codec);
+ *       }
+ *     </pre>
+ *
+ * Once initialized, the instance is managed using other functions from
+ * the aom_codec_* family.
+ */
+#ifndef AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
+#define AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
+#include "../aom_decoder.h"
+#include "../aom_encoder.h"
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields in structures.
+ */
+#define AOM_CODEC_INTERNAL_ABI_VERSION (7) /**<\hideinitializer*/
+
+typedef struct aom_codec_alg_priv aom_codec_alg_priv_t;
+
+/*!\brief init function pointer prototype
+ *
+ * Performs algorithm-specific initialization of the decoder context. This
+ * function is called by the generic aom_codec_init() wrapper function, so
+ * plugins implementing this interface may trust the input parameters to be
+ * properly initialized.
+ *
+ * \param[in] ctx  Pointer to this instance's context
+ * \retval #AOM_CODEC_OK
+ *     The input stream was recognized and decoder initialized.
+ * \retval #AOM_CODEC_MEM_ERROR
+ *     Memory operation failed.
+ */
+typedef aom_codec_err_t (*aom_codec_init_fn_t)(aom_codec_ctx_t *ctx);
+
+/*!\brief destroy function pointer prototype
+ *
+ * Performs algorithm-specific destruction of the decoder context. This
+ * function is called by the generic aom_codec_destroy() wrapper function,
+ * so plugins implementing this interface may trust the input parameters
+ * to be properly initialized.
+ *
+ * \param[in] ctx  Pointer to this instance's context
+ * \retval #AOM_CODEC_OK
+ *     The codec instance was destroyed.
+ * \retval #AOM_CODEC_MEM_ERROR
+ *     Memory operation failed.
+ */
+typedef aom_codec_err_t (*aom_codec_destroy_fn_t)(aom_codec_alg_priv_t *ctx);
+
+/*!\brief parse stream info function pointer prototype
+ *
+ * Performs high level parsing of the bitstream. This function is called by the
+ * generic aom_codec_peek_stream_info() wrapper function, so plugins
+ * implementing this interface may trust the input parameters to be properly
+ * initialized.
+ *
+ * \param[in]     data     Pointer to a block of data to parse
+ * \param[in]     data_sz  Size of the data buffer
+ * \param[in,out] si       Pointer to stream info to update. The is_annexb
+ *                         member \ref MUST be properly initialized. This
+ *                         function sets the rest of the members.
+ *
+ * \retval #AOM_CODEC_OK
+ *     Bitstream is parsable and stream information updated
+ */
+typedef aom_codec_err_t (*aom_codec_peek_si_fn_t)(const uint8_t *data,
+                                                  size_t data_sz,
+                                                  aom_codec_stream_info_t *si);
+
+/*!\brief Return information about the current stream.
+ *
+ * Returns information about the stream that has been parsed during decoding.
+ *
+ * \param[in]     ctx  Pointer to this instance's context
+ * \param[in,out] si   Pointer to stream info to update
+ *
+ * \retval #AOM_CODEC_OK
+ *     Bitstream is parsable and stream information updated
+ */
+typedef aom_codec_err_t (*aom_codec_get_si_fn_t)(aom_codec_alg_priv_t *ctx,
+                                                 aom_codec_stream_info_t *si);
+
+/*!\brief control function pointer prototype
+ *
+ * This function is used to exchange algorithm specific data with the decoder
+ * instance. This can be used to implement features specific to a particular
+ * algorithm.
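+ *
+ * As a sketch of what an implementation might look like (the bit_depth
+ * field is hypothetical; real handlers unpack their own arguments from
+ * the va_list):
+ *     <pre>
+ *     static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx,
+ *                                               va_list args) {
+ *       unsigned int *const depth = va_arg(args, unsigned int *);
+ *       if (!depth) return AOM_CODEC_INVALID_PARAM;
+ *       *depth = ctx->bit_depth;  // hypothetical field of the private struct
+ *       return AOM_CODEC_OK;
+ *     }
+ *     </pre>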
+ *
+ * This function is called by the generic aom_codec_control() wrapper
+ * function, so plugins implementing this interface may trust the input
+ * parameters to be properly initialized. However, this interface does not
+ * provide type safety for the exchanged data or assign meanings to the
+ * control IDs. Those details should be specified in the algorithm's
+ * header file. In particular, the ctrl_id parameter is guaranteed to exist
+ * in the algorithm's control mapping table, and the data parameter may be
+ * NULL.
+ *
+ * \param[in]     ctx      Pointer to this instance's context
+ * \param[in]     ctrl_id  Algorithm specific control identifier
+ * \param[in,out] data     Data to exchange with algorithm instance.
+ *
+ * \retval #AOM_CODEC_OK
+ *     The internal state data was deserialized.
+ */
+typedef aom_codec_err_t (*aom_codec_control_fn_t)(aom_codec_alg_priv_t *ctx,
+                                                  va_list ap);
+
+/*!\brief control function pointer mapping
+ *
+ * This structure stores the mapping between control identifiers and
+ * implementing functions. Each algorithm provides a list of these
+ * mappings. This list is searched by the aom_codec_control() wrapper
+ * function to determine which function to invoke. The special
+ * value {0, NULL} is used to indicate end-of-list, and must be
+ * present. The special value {0, <non-null>} can be used as a catch-all
+ * mapping. This implies that ctrl_id values chosen by the algorithm
+ * \ref MUST be non-zero.
+ */
+typedef const struct aom_codec_ctrl_fn_map {
+  int ctrl_id;
+  aom_codec_control_fn_t fn;
+} aom_codec_ctrl_fn_map_t;
+
+/*!\brief decode data function pointer prototype
+ *
+ * Processes a buffer of coded data. This function is called by the generic
+ * aom_codec_decode() wrapper function, so plugins implementing this interface
+ * may trust the input parameters to be properly initialized.
+ *
+ * \param[in] ctx      Pointer to this instance's context
+ * \param[in] data     Pointer to this block of new coded data.
+ * \param[in] data_sz  Size of the coded data, in bytes.
+ *
+ * \return Returns #AOM_CODEC_OK if the coded data was processed completely
+ *         and future pictures can be decoded without error. Otherwise,
+ *         see the descriptions of the other error codes in ::aom_codec_err_t
+ *         for recoverability capabilities.
+ */
+typedef aom_codec_err_t (*aom_codec_decode_fn_t)(aom_codec_alg_priv_t *ctx,
+                                                 const uint8_t *data,
+                                                 size_t data_sz,
+                                                 void *user_priv);
+
+/*!\brief Decoded frames iterator
+ *
+ * Iterates over a list of the frames available for display. The iterator
+ * storage should be initialized to NULL to start the iteration. Iteration is
+ * complete when this function returns NULL.
+ *
+ * The list of available frames becomes valid upon completion of the
+ * aom_codec_decode call, and remains valid until the next call to
+ * aom_codec_decode.
+ *
+ * \param[in]     ctx   Pointer to this instance's context
+ * \param[in,out] iter  Iterator storage, initialized to NULL
+ *
+ * \return Returns a pointer to an image, if one is ready for display. Frames
+ *         produced will always be in PTS (presentation time stamp) order.
+ */
+typedef aom_image_t *(*aom_codec_get_frame_fn_t)(aom_codec_alg_priv_t *ctx,
+                                                 aom_codec_iter_t *iter);
+
+/*!\brief Pass in external frame buffers for the decoder to use.
+ *
+ * Registers functions to be called when libaom needs a frame buffer
+ * to decode the current frame and a function to be called when libaom does
+ * not internally reference the frame buffer.
This set function must
+ * be called before the first call to decode, or libaom will assume the
+ * default behavior of allocating frame buffers internally.
+ *
+ * \param[in] ctx         Pointer to this instance's context
+ * \param[in] cb_get      Pointer to the get callback function
+ * \param[in] cb_release  Pointer to the release callback function
+ * \param[in] cb_priv     Callback's private data
+ *
+ * \retval #AOM_CODEC_OK
+ *     External frame buffers will be used by libaom.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ *     One or more of the callbacks were NULL.
+ * \retval #AOM_CODEC_ERROR
+ *     Decoder context not initialized, or algorithm not capable of
+ *     using external frame buffers.
+ *
+ * \note
+ * When decoding AV1, the application may be required to pass in at least
+ * #AOM_MAXIMUM_WORK_BUFFERS external frame
+ * buffers.
+ */
+typedef aom_codec_err_t (*aom_codec_set_fb_fn_t)(
+    aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
+
+typedef aom_codec_err_t (*aom_codec_encode_fn_t)(aom_codec_alg_priv_t *ctx,
+                                                 const aom_image_t *img,
+                                                 aom_codec_pts_t pts,
+                                                 unsigned long duration,
+                                                 aom_enc_frame_flags_t flags);
+typedef const aom_codec_cx_pkt_t *(*aom_codec_get_cx_data_fn_t)(
+    aom_codec_alg_priv_t *ctx, aom_codec_iter_t *iter);
+
+typedef aom_codec_err_t (*aom_codec_enc_config_set_fn_t)(
+    aom_codec_alg_priv_t *ctx, const aom_codec_enc_cfg_t *cfg);
+typedef aom_fixed_buf_t *(*aom_codec_get_global_headers_fn_t)(
+    aom_codec_alg_priv_t *ctx);
+
+typedef aom_image_t *(*aom_codec_get_preview_frame_fn_t)(
+    aom_codec_alg_priv_t *ctx);
+
+/*!\brief Decoder algorithm interface
+ *
+ * All decoders \ref MUST expose a variable of this type.
+ */
+struct aom_codec_iface {
+  const char *name;                   /**< Identification String */
+  int abi_version;                    /**< Implemented ABI version */
+  aom_codec_caps_t caps;              /**< Decoder capabilities */
+  aom_codec_init_fn_t init;           /**< \copydoc ::aom_codec_init_fn_t */
+  aom_codec_destroy_fn_t destroy;     /**< \copydoc ::aom_codec_destroy_fn_t */
+  aom_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::aom_codec_ctrl_fn_map_t */
+  struct aom_codec_dec_iface {
+    aom_codec_peek_si_fn_t peek_si; /**< \copydoc ::aom_codec_peek_si_fn_t */
+    aom_codec_get_si_fn_t get_si;   /**< \copydoc ::aom_codec_get_si_fn_t */
+    aom_codec_decode_fn_t decode;   /**< \copydoc ::aom_codec_decode_fn_t */
+    aom_codec_get_frame_fn_t
+        get_frame;                   /**< \copydoc ::aom_codec_get_frame_fn_t */
+    aom_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::aom_codec_set_fb_fn_t */
+  } dec;
+  struct aom_codec_enc_iface {
+    int cfg_count;
+    const aom_codec_enc_cfg_t *cfgs; /**< \copydoc ::aom_codec_enc_cfg_t */
+    aom_codec_encode_fn_t encode;    /**< \copydoc ::aom_codec_encode_fn_t */
+    aom_codec_get_cx_data_fn_t
+        get_cx_data; /**< \copydoc ::aom_codec_get_cx_data_fn_t */
+    aom_codec_enc_config_set_fn_t
+        cfg_set; /**< \copydoc ::aom_codec_enc_config_set_fn_t */
+    aom_codec_get_global_headers_fn_t
+        get_glob_hdrs; /**< \copydoc ::aom_codec_get_global_headers_fn_t */
+    aom_codec_get_preview_frame_fn_t
+        get_preview; /**< \copydoc ::aom_codec_get_preview_frame_fn_t */
+  } enc;
+};
+
+/*!\brief Instance private storage
+ *
+ * This structure is allocated by the algorithm's init function. It can be
+ * extended in one of two ways. First, a second, algorithm specific structure
+ * can be allocated and the priv member pointed to it.
Alternatively, this
+ * structure can be made the first member of the algorithm specific structure,
+ * and the pointer cast to the proper type.
+ */
+struct aom_codec_priv {
+  const char *err_detail;
+  aom_codec_flags_t init_flags;
+  struct {
+    aom_fixed_buf_t cx_data_dst_buf;
+    unsigned int cx_data_pad_before;
+    unsigned int cx_data_pad_after;
+    aom_codec_cx_pkt_t cx_data_pkt;
+  } enc;
+};
+
+#define CAST(id, arg) va_arg((arg), aom_codec_control_type_##id)
+
+/* CODEC_INTERFACE convenience macro
+ *
+ * By convention, each codec interface is a struct with extern linkage, where
+ * the symbol is suffixed with _algo. A getter function is also defined to
+ * return a pointer to the struct, since in some cases it's easier to work
+ * with text symbols than data symbols (see issue #169). This function has
+ * the same name as the struct, less the _algo suffix. The CODEC_INTERFACE
+ * macro is provided to define this getter function automatically.
+ */
+#define CODEC_INTERFACE(id)                          \
+  aom_codec_iface_t *id(void) { return &id##_algo; } \
+  aom_codec_iface_t id##_algo
+
+/* Internal Utility Functions
+ *
+ * The following functions are intended to be used inside algorithms as
+ * utilities for manipulating aom_codec_* data structures.
+ */
+struct aom_codec_pkt_list {
+  unsigned int cnt;
+  unsigned int max;
+  struct aom_codec_cx_pkt pkts[1];
+};
+
+#define aom_codec_pkt_list_decl(n)     \
+  union {                              \
+    struct aom_codec_pkt_list head;    \
+    struct {                           \
+      struct aom_codec_pkt_list head;  \
+      struct aom_codec_cx_pkt pkts[n]; \
+    } alloc;                           \
+  }
+
+#define aom_codec_pkt_list_init(m) \
+  (m)->alloc.head.cnt = 0,         \
+  (m)->alloc.head.max = sizeof((m)->alloc.pkts) / sizeof((m)->alloc.pkts[0])
+
+int aom_codec_pkt_list_add(struct aom_codec_pkt_list *,
+                           const struct aom_codec_cx_pkt *);
+
+const aom_codec_cx_pkt_t *aom_codec_pkt_list_get(
+    struct aom_codec_pkt_list *list, aom_codec_iter_t *iter);
+
+#include <stdio.h>
+#include <setjmp.h>
+
+struct aom_internal_error_info {
+  aom_codec_err_t error_code;
+  int has_detail;
+  char detail[80];
+  int setjmp;  // Boolean: whether 'jmp' is valid.
+  jmp_buf jmp;
+};
+
+#define CLANG_ANALYZER_NORETURN
+#if defined(__has_feature)
+#if __has_feature(attribute_analyzer_noreturn)
+#undef CLANG_ANALYZER_NORETURN
+#define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
+#endif
+#endif
+
+void aom_internal_error(struct aom_internal_error_info *info,
+                        aom_codec_err_t error, const char *fmt,
+                        ...) CLANG_ANALYZER_NORETURN;
+
+void aom_merge_corrupted_flag(int *corrupted, int value);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
diff --git a/libs/libaom/src/aom/internal/aom_image_internal.h b/libs/libaom/src/aom/internal/aom_image_internal.h
new file mode 100644
index 000000000..7f2fd1891
--- /dev/null
+++ b/libs/libaom/src/aom/internal/aom_image_internal.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes the internal functions associated with the aom image
+ * descriptor.
+ * + */ +#ifndef AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_ +#define AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_ + +#include "aom/aom_image.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Array of aom_metadata structs for an image. */ +struct aom_metadata_array { + size_t sz; /* Number of metadata structs in the list */ + aom_metadata_t **metadata_array; /* Array of metadata structs */ +}; + +/*!\brief Alloc memory for aom_metadata_array struct. + * + * Allocate memory for aom_metadata_array struct. + * If sz is 0 the aom_metadata_array structs internal buffer list will be NULL, + * but the aom_metadata_array struct itself will still be allocated. + * Returns a pointer to the allocated struct or NULL on failure. + * + * \param[in] sz Size of internal metadata list buffer + */ +aom_metadata_array_t *aom_img_metadata_array_alloc(size_t sz); + +/*!\brief Free metadata array struct. + * + * Free metadata array struct and all metadata structs inside. + * + * \param[in] arr Metadata array struct pointer + */ +void aom_img_metadata_array_free(aom_metadata_array_t *arr); + +typedef void *(*aom_alloc_img_data_cb_fn_t)(void *priv, size_t size); + +/*!\brief Open a descriptor, allocating storage for the underlying image by + * using the provided callback function. + * + * Returns a descriptor for storing an image of the given format. The storage + * for the image is allocated by using the provided callback function. Unlike + * aom_img_alloc(), the returned descriptor does not own the storage for the + * image. The caller is responsible for freeing the storage for the image. + * + * Note: If the callback function is invoked and succeeds, + * aom_img_alloc_with_cb() is guaranteed to succeed. Therefore, if + * aom_img_alloc_with_cb() fails, the caller is assured that no storage was + * allocated. + * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of the image buffer and + * each row in the image (stride). + * \param[in] alloc_cb Callback function used to allocate storage for the + * image. + * \param[in] cb_priv The first argument ('priv') for the callback + * function. + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +aom_image_t *aom_img_alloc_with_cb(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align, + aom_alloc_img_data_cb_fn_t alloc_cb, + void *cb_priv); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_ diff --git a/libs/libaom/src/aom/src/aom_codec.c b/libs/libaom/src/aom/src/aom_codec.c new file mode 100644 index 000000000..196ab8354 --- /dev/null +++ b/libs/libaom/src/aom/src/aom_codec.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Provides the high level interface to wrap decoder algorithms.
+ *
+ */
+#include <stdarg.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
+#include "aom/aom_integer.h"
+#include "aom/internal/aom_codec_internal.h"
+
+#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
+
+int aom_codec_version(void) { return VERSION_PACKED; }
+
+const char *aom_codec_version_str(void) { return VERSION_STRING_NOSP; }
+
+const char *aom_codec_version_extra_str(void) { return VERSION_EXTRA; }
+
+const char *aom_codec_iface_name(aom_codec_iface_t *iface) {
+  return iface ? iface->name : "<invalid interface>";
+}
+
+const char *aom_codec_err_to_string(aom_codec_err_t err) {
+  switch (err) {
+    case AOM_CODEC_OK: return "Success";
+    case AOM_CODEC_ERROR: return "Unspecified internal error";
+    case AOM_CODEC_MEM_ERROR: return "Memory allocation error";
+    case AOM_CODEC_ABI_MISMATCH: return "ABI version mismatch";
+    case AOM_CODEC_INCAPABLE:
+      return "Codec does not implement requested capability";
+    case AOM_CODEC_UNSUP_BITSTREAM:
+      return "Bitstream not supported by this decoder";
+    case AOM_CODEC_UNSUP_FEATURE:
+      return "Bitstream required feature not supported by this decoder";
+    case AOM_CODEC_CORRUPT_FRAME: return "Corrupt frame detected";
+    case AOM_CODEC_INVALID_PARAM: return "Invalid parameter";
+    case AOM_CODEC_LIST_END: return "End of iterated list";
+  }
+
+  return "Unrecognized error code";
+}
+
+const char *aom_codec_error(aom_codec_ctx_t *ctx) {
+  return (ctx) ? aom_codec_err_to_string(ctx->err)
+               : aom_codec_err_to_string(AOM_CODEC_INVALID_PARAM);
+}
+
+const char *aom_codec_error_detail(aom_codec_ctx_t *ctx) {
+  if (ctx && ctx->err)
+    return ctx->priv ? ctx->priv->err_detail : ctx->err_detail;
+
+  return NULL;
+}
+
+aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx) {
+  aom_codec_err_t res;
+
+  if (!ctx)
+    res = AOM_CODEC_INVALID_PARAM;
+  else if (!ctx->iface || !ctx->priv)
+    res = AOM_CODEC_ERROR;
+  else {
+    ctx->iface->destroy((aom_codec_alg_priv_t *)ctx->priv);
+
+    ctx->iface = NULL;
+    ctx->name = NULL;
+    ctx->priv = NULL;
+    res = AOM_CODEC_OK;
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface) {
+  return (iface) ? iface->caps : 0;
+}
+
+aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) {
+  aom_codec_err_t res;
+
+  if (!ctx || !ctrl_id)
+    res = AOM_CODEC_INVALID_PARAM;
+  else if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps)
+    res = AOM_CODEC_ERROR;
+  else {
+    aom_codec_ctrl_fn_map_t *entry;
+
+    res = AOM_CODEC_ERROR;
+
+    for (entry = ctx->iface->ctrl_maps; entry && entry->fn; entry++) {
+      if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) {
+        va_list ap;
+
+        va_start(ap, ctrl_id);
+        res = entry->fn((aom_codec_alg_priv_t *)ctx->priv, ap);
+        va_end(ap);
+        break;
+      }
+    }
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+void aom_internal_error(struct aom_internal_error_info *info,
+                        aom_codec_err_t error, const char *fmt, ...)
{
+  va_list ap;
+
+  info->error_code = error;
+  info->has_detail = 0;
+
+  if (fmt) {
+    size_t sz = sizeof(info->detail);
+
+    info->has_detail = 1;
+    va_start(ap, fmt);
+    vsnprintf(info->detail, sz - 1, fmt, ap);
+    va_end(ap);
+    info->detail[sz - 1] = '\0';
+  }
+
+  if (info->setjmp) longjmp(info->jmp, info->error_code);
+}
+
+void aom_merge_corrupted_flag(int *corrupted, int value) {
+  *corrupted |= value;
+}
+
+const char *aom_obu_type_to_string(OBU_TYPE type) {
+  switch (type) {
+    case OBU_SEQUENCE_HEADER: return "OBU_SEQUENCE_HEADER";
+    case OBU_TEMPORAL_DELIMITER: return "OBU_TEMPORAL_DELIMITER";
+    case OBU_FRAME_HEADER: return "OBU_FRAME_HEADER";
+    case OBU_REDUNDANT_FRAME_HEADER: return "OBU_REDUNDANT_FRAME_HEADER";
+    case OBU_FRAME: return "OBU_FRAME";
+    case OBU_TILE_GROUP: return "OBU_TILE_GROUP";
+    case OBU_METADATA: return "OBU_METADATA";
+    case OBU_TILE_LIST: return "OBU_TILE_LIST";
+    case OBU_PADDING: return "OBU_PADDING";
+    default: break;
+  }
+  return "<Invalid OBU Type>";
+}
diff --git a/libs/libaom/src/aom/src/aom_decoder.c b/libs/libaom/src/aom/src/aom_decoder.c
new file mode 100644
index 000000000..49fff2635
--- /dev/null
+++ b/libs/libaom/src/aom/src/aom_decoder.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Provides the high level interface to wrap decoder algorithms.
+ *
+ */
+#include <string.h>
+#include "aom/internal/aom_codec_internal.h"
+
+#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
+
+static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) {
+  return (aom_codec_alg_priv_t *)ctx->priv;
+}
+
+aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
+                                       aom_codec_iface_t *iface,
+                                       const aom_codec_dec_cfg_t *cfg,
+                                       aom_codec_flags_t flags, int ver) {
+  aom_codec_err_t res;
+
+  if (ver != AOM_DECODER_ABI_VERSION)
+    res = AOM_CODEC_ABI_MISMATCH;
+  else if (!ctx || !iface)
+    res = AOM_CODEC_INVALID_PARAM;
+  else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION)
+    res = AOM_CODEC_ABI_MISMATCH;
+  else if (!(iface->caps & AOM_CODEC_CAP_DECODER))
+    res = AOM_CODEC_INCAPABLE;
+  else {
+    memset(ctx, 0, sizeof(*ctx));
+    ctx->iface = iface;
+    ctx->name = iface->name;
+    ctx->priv = NULL;
+    ctx->init_flags = flags;
+    ctx->config.dec = cfg;
+
+    res = ctx->iface->init(ctx);
+    if (res) {
+      ctx->err_detail = ctx->priv ?
ctx->priv->err_detail : NULL;
+      aom_codec_destroy(ctx);
+    }
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface,
+                                           const uint8_t *data, size_t data_sz,
+                                           aom_codec_stream_info_t *si) {
+  aom_codec_err_t res;
+
+  if (!iface || !data || !data_sz || !si) {
+    res = AOM_CODEC_INVALID_PARAM;
+  } else {
+    /* Set default/unknown values */
+    si->w = 0;
+    si->h = 0;
+
+    res = iface->dec.peek_si(data, data_sz, si);
+  }
+
+  return res;
+}
+
+aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
+                                          aom_codec_stream_info_t *si) {
+  aom_codec_err_t res;
+
+  if (!ctx || !si) {
+    res = AOM_CODEC_INVALID_PARAM;
+  } else if (!ctx->iface || !ctx->priv) {
+    res = AOM_CODEC_ERROR;
+  } else {
+    /* Set default/unknown values */
+    si->w = 0;
+    si->h = 0;
+
+    res = ctx->iface->dec.get_si(get_alg_priv(ctx), si);
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
+                                 size_t data_sz, void *user_priv) {
+  aom_codec_err_t res;
+
+  if (!ctx)
+    res = AOM_CODEC_INVALID_PARAM;
+  else if (!ctx->iface || !ctx->priv)
+    res = AOM_CODEC_ERROR;
+  else {
+    res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv);
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter) {
+  aom_image_t *img;
+
+  if (!ctx || !iter || !ctx->iface || !ctx->priv)
+    img = NULL;
+  else
+    img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter);
+
+  return img;
+}
+
+aom_codec_err_t aom_codec_set_frame_buffer_functions(
+    aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+  aom_codec_err_t res;
+
+  if (!ctx || !cb_get || !cb_release) {
+    res = AOM_CODEC_INVALID_PARAM;
+  } else if (!ctx->iface || !ctx->priv) {
+    res = AOM_CODEC_ERROR;
+  } else if (!(ctx->iface->caps & AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
+    res = AOM_CODEC_INCAPABLE;
+  } else {
+    res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release,
+                                    cb_priv);
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
diff --git a/libs/libaom/src/aom/src/aom_encoder.c b/libs/libaom/src/aom/src/aom_encoder.c
new file mode 100644
index 000000000..bb51c9388
--- /dev/null
+++ b/libs/libaom/src/aom/src/aom_encoder.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Provides the high level interface to wrap encoder algorithms.
+ *
+ */
+#include "config/aom_config.h"
+
+#if HAVE_FEXCEPT
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <fenv.h>
+#endif
+
+#include <limits.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/internal/aom_codec_internal.h"
+
+#define SAVE_STATUS(ctx, var) (ctx ?
(ctx->err = var) : var) + +static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) { + return (aom_codec_alg_priv_t *)ctx->priv; +} + +aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx, + aom_codec_iface_t *iface, + const aom_codec_enc_cfg_t *cfg, + aom_codec_flags_t flags, int ver) { + aom_codec_err_t res; + + if (ver != AOM_ENCODER_ABI_VERSION) + res = AOM_CODEC_ABI_MISMATCH; + else if (!ctx || !iface || !cfg) + res = AOM_CODEC_INVALID_PARAM; + else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION) + res = AOM_CODEC_ABI_MISMATCH; + else if (!(iface->caps & AOM_CODEC_CAP_ENCODER)) + res = AOM_CODEC_INCAPABLE; + else if ((flags & AOM_CODEC_USE_PSNR) && !(iface->caps & AOM_CODEC_CAP_PSNR)) + res = AOM_CODEC_INCAPABLE; + else { + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.enc = cfg; + res = ctx->iface->init(ctx); + + if (res) { + ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; + aom_codec_destroy(ctx); + } + } + + return SAVE_STATUS(ctx, res); +} + +aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface, + aom_codec_enc_cfg_t *cfg, + unsigned int usage) { + aom_codec_err_t res; + int i; + + if (!iface || !cfg) + res = AOM_CODEC_INVALID_PARAM; + else if (!(iface->caps & AOM_CODEC_CAP_ENCODER)) + res = AOM_CODEC_INCAPABLE; + else { + res = AOM_CODEC_INVALID_PARAM; + + for (i = 0; i < iface->enc.cfg_count; ++i) { + if (iface->enc.cfgs[i].g_usage == usage) { + *cfg = iface->enc.cfgs[i]; + res = AOM_CODEC_OK; + break; + } + } + } + /* default values */ + if (cfg) { + memset(&cfg->encoder_cfg, 0, sizeof(cfg->encoder_cfg)); + cfg->encoder_cfg.super_block_size = 0; // Dynamic + cfg->encoder_cfg.max_partition_size = 128; + cfg->encoder_cfg.min_partition_size = 4; + cfg->encoder_cfg.disable_trellis_quant = 3; + } + return res; +} + +#if ARCH_X86 || ARCH_X86_64 +/* On X86, disable the x87 unit's internal 80 bit precision for better + * consistency with the SSE unit's 64 bit precision. 
+ */ +#include "aom_ports/x86.h" +#define FLOATING_POINT_SET_PRECISION \ + unsigned short x87_orig_mode = x87_set_double_precision(); +#define FLOATING_POINT_RESTORE_PRECISION x87_set_control_word(x87_orig_mode); +#else +#define FLOATING_POINT_SET_PRECISION +#define FLOATING_POINT_RESTORE_PRECISION +#endif // ARCH_X86 || ARCH_X86_64 + +#if HAVE_FEXCEPT && CONFIG_DEBUG +#define FLOATING_POINT_SET_EXCEPTIONS \ + const int float_excepts = \ + feenableexcept(FE_DIVBYZERO | FE_UNDERFLOW | FE_OVERFLOW); +#define FLOATING_POINT_RESTORE_EXCEPTIONS \ + fedisableexcept(FE_ALL_EXCEPT); \ + feenableexcept(float_excepts); +#else +#define FLOATING_POINT_SET_EXCEPTIONS +#define FLOATING_POINT_RESTORE_EXCEPTIONS +#endif // HAVE_FEXCEPT && CONFIG_DEBUG + +/* clang-format off */ +#define FLOATING_POINT_INIT \ + do { \ + FLOATING_POINT_SET_PRECISION \ + FLOATING_POINT_SET_EXCEPTIONS + +#define FLOATING_POINT_RESTORE \ + FLOATING_POINT_RESTORE_EXCEPTIONS \ + FLOATING_POINT_RESTORE_PRECISION \ + } while (0); +/* clang-format on */ + +aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img, + aom_codec_pts_t pts, unsigned long duration, + aom_enc_frame_flags_t flags) { + aom_codec_err_t res = AOM_CODEC_OK; + + if (!ctx || (img && !duration)) + res = AOM_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv) + res = AOM_CODEC_ERROR; + else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) + res = AOM_CODEC_INCAPABLE; + else { + /* Execute in a normalized floating point environment, if the platform + * requires it. + */ + FLOATING_POINT_INIT + res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags); + FLOATING_POINT_RESTORE + } + + return SAVE_STATUS(ctx, res); +} + +const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx, + aom_codec_iter_t *iter) { + const aom_codec_cx_pkt_t *pkt = NULL; + + if (ctx) { + if (!iter) + ctx->err = AOM_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv) + ctx->err = AOM_CODEC_ERROR; + else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) + ctx->err = AOM_CODEC_INCAPABLE; + else + pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter); + } + + if (pkt && pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + // If the application has specified a destination area for the + // compressed data, and the codec has not placed the data there, + // and it fits, copy it. 
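+    // (Layout note: after the copy below, the packet points at dst_buf and
+    // its reported size covers [pad_before bytes][frame data][pad_after
+    // bytes].)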
+ aom_codec_priv_t *const priv = ctx->priv; + char *const dst_buf = (char *)priv->enc.cx_data_dst_buf.buf; + + if (dst_buf && pkt->data.raw.buf != dst_buf && + pkt->data.raw.sz + priv->enc.cx_data_pad_before + + priv->enc.cx_data_pad_after <= + priv->enc.cx_data_dst_buf.sz) { + aom_codec_cx_pkt_t *modified_pkt = &priv->enc.cx_data_pkt; + + memcpy(dst_buf + priv->enc.cx_data_pad_before, pkt->data.raw.buf, + pkt->data.raw.sz); + *modified_pkt = *pkt; + modified_pkt->data.raw.buf = dst_buf; + modified_pkt->data.raw.sz += + priv->enc.cx_data_pad_before + priv->enc.cx_data_pad_after; + pkt = modified_pkt; + } + + if (dst_buf == pkt->data.raw.buf) { + priv->enc.cx_data_dst_buf.buf = dst_buf + pkt->data.raw.sz; + priv->enc.cx_data_dst_buf.sz -= pkt->data.raw.sz; + } + } + + return pkt; +} + +aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx, + const aom_fixed_buf_t *buf, + unsigned int pad_before, + unsigned int pad_after) { + if (!ctx || !ctx->priv) return AOM_CODEC_INVALID_PARAM; + + if (buf) { + ctx->priv->enc.cx_data_dst_buf = *buf; + ctx->priv->enc.cx_data_pad_before = pad_before; + ctx->priv->enc.cx_data_pad_after = pad_after; + } else { + ctx->priv->enc.cx_data_dst_buf.buf = NULL; + ctx->priv->enc.cx_data_dst_buf.sz = 0; + ctx->priv->enc.cx_data_pad_before = 0; + ctx->priv->enc.cx_data_pad_after = 0; + } + + return AOM_CODEC_OK; +} + +const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx) { + aom_image_t *img = NULL; + + if (ctx) { + if (!ctx->iface || !ctx->priv) + ctx->err = AOM_CODEC_ERROR; + else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) + ctx->err = AOM_CODEC_INCAPABLE; + else if (!ctx->iface->enc.get_preview) + ctx->err = AOM_CODEC_INCAPABLE; + else + img = ctx->iface->enc.get_preview(get_alg_priv(ctx)); + } + + return img; +} + +aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx) { + aom_fixed_buf_t *buf = NULL; + + if (ctx) { + if (!ctx->iface || !ctx->priv) + ctx->err = AOM_CODEC_ERROR; + else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) + ctx->err = AOM_CODEC_INCAPABLE; + else if (!ctx->iface->enc.get_glob_hdrs) + ctx->err = AOM_CODEC_INCAPABLE; + else + buf = ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx)); + } + + return buf; +} + +aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx, + const aom_codec_enc_cfg_t *cfg) { + aom_codec_err_t res; + + if (!ctx || !ctx->iface || !ctx->priv || !cfg) + res = AOM_CODEC_INVALID_PARAM; + else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) + res = AOM_CODEC_INCAPABLE; + else + res = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg); + + return SAVE_STATUS(ctx, res); +} + +int aom_codec_pkt_list_add(struct aom_codec_pkt_list *list, + const struct aom_codec_cx_pkt *pkt) { + if (list->cnt < list->max) { + list->pkts[list->cnt++] = *pkt; + return 0; + } + + return 1; +} + +const aom_codec_cx_pkt_t *aom_codec_pkt_list_get( + struct aom_codec_pkt_list *list, aom_codec_iter_t *iter) { + const aom_codec_cx_pkt_t *pkt; + + if (!(*iter)) { + *iter = list->pkts; + } + + pkt = (const aom_codec_cx_pkt_t *)*iter; + + if ((size_t)(pkt - list->pkts) < list->cnt) + *iter = pkt + 1; + else + pkt = NULL; + + return pkt; +} diff --git a/libs/libaom/src/aom/src/aom_image.c b/libs/libaom/src/aom/src/aom_image.c new file mode 100644 index 000000000..cd0b5ed83 --- /dev/null +++ b/libs/libaom/src/aom/src/aom_image.c @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_image.h"
+#include "aom/aom_integer.h"
+#include "aom/internal/aom_image_internal.h"
+#include "aom_mem/aom_mem.h"
+
+static INLINE unsigned int align_image_dimension(unsigned int d,
+                                                 unsigned int subsampling,
+                                                 unsigned int size_align) {
+  unsigned int align;
+
+  align = (1 << subsampling) - 1;
+  align = (size_align - 1 > align) ? (size_align - 1) : align;
+  return ((d + align) & ~align);
+}
+
+static aom_image_t *img_alloc_helper(
+    aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, unsigned int d_h,
+    unsigned int buf_align, unsigned int stride_align, unsigned int size_align,
+    unsigned int border, unsigned char *img_data,
+    aom_alloc_img_data_cb_fn_t alloc_cb, void *cb_priv) {
+  /* NOTE: In this function, bit_depth is either 8 or 16 (if
+   * AOM_IMG_FMT_HIGHBITDEPTH is set), never 10 or 12.
+   */
+  unsigned int h, w, s, xcs, ycs, bps, bit_depth;
+  unsigned int stride_in_bytes;
+
+  /* Treat align==0 like align==1 */
+  if (!buf_align) buf_align = 1;
+
+  /* Validate alignment (must be power of 2) */
+  if (buf_align & (buf_align - 1)) goto fail;
+
+  /* Treat align==0 like align==1 */
+  if (!stride_align) stride_align = 1;
+
+  /* Validate alignment (must be power of 2) */
+  if (stride_align & (stride_align - 1)) goto fail;
+
+  /* Treat align==0 like align==1 */
+  if (!size_align) size_align = 1;
+
+  /* Validate alignment (must be power of 2) */
+  if (size_align & (size_align - 1)) goto fail;
+
+  /* Get sample size for this format */
+  switch (fmt) {
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_AOMI420:
+    case AOM_IMG_FMT_AOMYV12: bps = 12; break;
+    case AOM_IMG_FMT_I422: bps = 16; break;
+    case AOM_IMG_FMT_I444: bps = 24; break;
+    case AOM_IMG_FMT_YV1216:
+    case AOM_IMG_FMT_I42016: bps = 24; break;
+    case AOM_IMG_FMT_I42216: bps = 32; break;
+    case AOM_IMG_FMT_I44416: bps = 48; break;
+    default: bps = 16; break;
+  }
+
+  bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
+
+  /* Get chroma shift values for this format */
+  switch (fmt) {
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_AOMI420:
+    case AOM_IMG_FMT_AOMYV12:
+    case AOM_IMG_FMT_I422:
+    case AOM_IMG_FMT_I42016:
+    case AOM_IMG_FMT_YV1216:
+    case AOM_IMG_FMT_I42216: xcs = 1; break;
+    default: xcs = 0; break;
+  }
+
+  switch (fmt) {
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_AOMI420:
+    case AOM_IMG_FMT_AOMYV12:
+    case AOM_IMG_FMT_YV1216:
+    case AOM_IMG_FMT_I42016: ycs = 1; break;
+    default: ycs = 0; break;
+  }
+
+  /* Calculate storage sizes given the chroma subsampling */
+  w = align_image_dimension(d_w, xcs, size_align);
+  h = align_image_dimension(d_h, ycs, size_align);
+
+  s = (fmt & AOM_IMG_FMT_PLANAR) ?
w : bps * w / bit_depth; + s = (s + 2 * border + stride_align - 1) & ~(stride_align - 1); + stride_in_bytes = s * bit_depth / 8; + + /* Allocate the new image */ + if (!img) { + img = (aom_image_t *)calloc(1, sizeof(aom_image_t)); + + if (!img) goto fail; + + img->self_allocd = 1; + } else { + memset(img, 0, sizeof(aom_image_t)); + } + + img->img_data = img_data; + + if (!img_data) { + const uint64_t alloc_size = + (fmt & AOM_IMG_FMT_PLANAR) + ? (uint64_t)(h + 2 * border) * stride_in_bytes * bps / bit_depth + : (uint64_t)(h + 2 * border) * stride_in_bytes; + + if (alloc_size != (size_t)alloc_size) goto fail; + + if (alloc_cb) { + const size_t padded_alloc_size = (size_t)alloc_size + buf_align - 1; + img->img_data = (uint8_t *)alloc_cb(cb_priv, padded_alloc_size); + if (img->img_data) { + img->img_data = (uint8_t *)aom_align_addr(img->img_data, buf_align); + } + img->img_data_owner = 0; + } else { + img->img_data = (uint8_t *)aom_memalign(buf_align, (size_t)alloc_size); + img->img_data_owner = 1; + } + img->sz = (size_t)alloc_size; + } + + if (!img->img_data) goto fail; + + img->fmt = fmt; + img->bit_depth = bit_depth; + // aligned width and aligned height + img->w = w; + img->h = h; + img->x_chroma_shift = xcs; + img->y_chroma_shift = ycs; + img->bps = bps; + + /* Calculate strides */ + img->stride[AOM_PLANE_Y] = stride_in_bytes; + img->stride[AOM_PLANE_U] = img->stride[AOM_PLANE_V] = stride_in_bytes >> xcs; + + /* Default viewport to entire image. (This aom_img_set_rect call always + * succeeds.) */ + aom_img_set_rect(img, 0, 0, d_w, d_h, border); + return img; + +fail: + aom_img_free(img); + return NULL; +} + +aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align) { + return img_alloc_helper(img, fmt, d_w, d_h, align, align, 1, 0, NULL, NULL, + NULL); +} + +aom_image_t *aom_img_alloc_with_cb(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align, + aom_alloc_img_data_cb_fn_t alloc_cb, + void *cb_priv) { + return img_alloc_helper(img, fmt, d_w, d_h, align, align, 1, 0, NULL, + alloc_cb, cb_priv); +} + +aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, + unsigned int d_h, unsigned int stride_align, + unsigned char *img_data) { + /* Set buf_align = 1. It is ignored by img_alloc_helper because img_data is + * not NULL. */ + return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, 1, 0, img_data, + NULL, NULL); +} + +aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align, + unsigned int size_align, + unsigned int border) { + return img_alloc_helper(img, fmt, d_w, d_h, align, align, size_align, border, + NULL, NULL, NULL); +} + +int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y, + unsigned int w, unsigned int h, unsigned int border) { + unsigned char *data; + + if (x + w <= img->w && y + h <= img->h) { + img->d_w = w; + img->d_h = h; + + x += border; + y += border; + + /* Calculate plane pointers */ + if (!(img->fmt & AOM_IMG_FMT_PLANAR)) { + img->planes[AOM_PLANE_PACKED] = + img->img_data + x * img->bps / 8 + y * img->stride[AOM_PLANE_PACKED]; + } else { + const int bytes_per_sample = + (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1; + data = img->img_data; + + img->planes[AOM_PLANE_Y] = + data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y]; + data += (img->h + 2 * border) * img->stride[AOM_PLANE_Y]; + + unsigned int uv_border_h = border >> img->y_chroma_shift; + unsigned int uv_x = x >> img->x_chroma_shift; + unsigned int uv_y = y >> img->y_chroma_shift; + if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) { + img->planes[AOM_PLANE_U] = + data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; + data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) * + img->stride[AOM_PLANE_U]; + img->planes[AOM_PLANE_V] = + data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; + } else { + img->planes[AOM_PLANE_V] = + data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; + data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) * + img->stride[AOM_PLANE_V]; + img->planes[AOM_PLANE_U] = + data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; + } + } + return 0; + } + return -1; +} + +void aom_img_flip(aom_image_t *img) { + /* Note: In the calculation pointer adjustment calculation, we want the + * rhs to be promoted to a signed type. Section 6.3.1.8 of the ISO C99 + * standard indicates that if the adjustment parameter is unsigned, the + * stride parameter will be promoted to unsigned, causing errors when + * the lhs is a larger type than the rhs. + */ + img->planes[AOM_PLANE_Y] += (signed)(img->d_h - 1) * img->stride[AOM_PLANE_Y]; + img->stride[AOM_PLANE_Y] = -img->stride[AOM_PLANE_Y]; + + img->planes[AOM_PLANE_U] += (signed)((img->d_h >> img->y_chroma_shift) - 1) * + img->stride[AOM_PLANE_U]; + img->stride[AOM_PLANE_U] = -img->stride[AOM_PLANE_U]; + + img->planes[AOM_PLANE_V] += (signed)((img->d_h >> img->y_chroma_shift) - 1) * + img->stride[AOM_PLANE_V]; + img->stride[AOM_PLANE_V] = -img->stride[AOM_PLANE_V]; +} + +void aom_img_free(aom_image_t *img) { + if (img) { + aom_img_remove_metadata(img); + if (img->img_data && img->img_data_owner) aom_free(img->img_data); + + if (img->self_allocd) free(img); + } +} + +int aom_img_plane_width(const aom_image_t *img, int plane) { + if (plane > 0 && img->x_chroma_shift > 0) + return (img->d_w + 1) >> img->x_chroma_shift; + else + return img->d_w; +} + +int aom_img_plane_height(const aom_image_t *img, int plane) { + if (plane > 0 && img->y_chroma_shift > 0) + return (img->d_h + 1) >> img->y_chroma_shift; + else + return img->d_h; +} + +aom_metadata_t *aom_img_metadata_alloc( + uint32_t type, const uint8_t *data, size_t sz, + aom_metadata_insert_flags_t insert_flag) { + if (!data || sz == 0) return NULL; + aom_metadata_t *metadata = (aom_metadata_t *)malloc(sizeof(aom_metadata_t)); + if (!metadata) return NULL; + metadata->type = type; + metadata->payload = (uint8_t *)malloc(sz); + if (!metadata->payload) { + free(metadata); + return NULL; + } + memcpy(metadata->payload, data, sz); + metadata->sz = sz; + metadata->insert_flag = insert_flag; + return metadata; +} + +void aom_img_metadata_free(aom_metadata_t *metadata) { + if (metadata) { + if (metadata->payload) free(metadata->payload); + free(metadata); + } +} + +aom_metadata_array_t *aom_img_metadata_array_alloc(size_t sz) { + aom_metadata_array_t *arr = + (aom_metadata_array_t *)calloc(1, sizeof(aom_metadata_array_t)); + if (!arr) return NULL; + if (sz > 0) { + arr->metadata_array = + (aom_metadata_t **)calloc(sz, sizeof(aom_metadata_t *)); + if (!arr->metadata_array) { + aom_img_metadata_array_free(arr); + return NULL; + } + arr->sz = sz; + } + return arr; +} + +void 
aom_img_metadata_array_free(aom_metadata_array_t *arr) {
+  if (arr) {
+    if (arr->metadata_array) {
+      for (size_t i = 0; i < arr->sz; i++) {
+        aom_img_metadata_free(arr->metadata_array[i]);
+      }
+      free(arr->metadata_array);
+    }
+    free(arr);
+  }
+}
+
+int aom_img_add_metadata(aom_image_t *img, uint32_t type, const uint8_t *data,
+                         size_t sz, aom_metadata_insert_flags_t insert_flag) {
+  if (!img) return -1;
+  if (!img->metadata) {
+    img->metadata = aom_img_metadata_array_alloc(0);
+    if (!img->metadata) return -1;
+  }
+  aom_metadata_t *metadata =
+      aom_img_metadata_alloc(type, data, sz, insert_flag);
+  if (!metadata) goto fail;
+  if (!img->metadata->metadata_array) {
+    img->metadata->metadata_array =
+        (aom_metadata_t **)calloc(1, sizeof(metadata));
+    if (!img->metadata->metadata_array || img->metadata->sz != 0) {
+      aom_img_metadata_free(metadata);
+      goto fail;
+    }
+  } else {
+    // Grow through a temporary so the existing array is neither leaked nor
+    // dereferenced through a NULL pointer if realloc fails.
+    aom_metadata_t **new_array =
+        (aom_metadata_t **)realloc(img->metadata->metadata_array,
+                                   (img->metadata->sz + 1) * sizeof(metadata));
+    if (!new_array) {
+      aom_img_metadata_free(metadata);
+      goto fail;
+    }
+    img->metadata->metadata_array = new_array;
+  }
+  img->metadata->metadata_array[img->metadata->sz] = metadata;
+  img->metadata->sz++;
+  return 0;
+fail:
+  aom_img_metadata_array_free(img->metadata);
+  img->metadata = NULL;
+  return -1;
+}
+
+void aom_img_remove_metadata(aom_image_t *img) {
+  if (img && img->metadata) {
+    aom_img_metadata_array_free(img->metadata);
+    img->metadata = NULL;
+  }
+}
+
+const aom_metadata_t *aom_img_get_metadata(const aom_image_t *img,
+                                           size_t index) {
+  if (!img) return NULL;
+  const aom_metadata_array_t *array = img->metadata;
+  if (array && index < array->sz) {
+    return array->metadata_array[index];
+  }
+  return NULL;
+}
+
+size_t aom_img_num_metadata(const aom_image_t *img) {
+  if (!img || !img->metadata) return 0;
+  return img->metadata->sz;
+}
diff --git a/libs/libaom/src/aom/src/aom_integer.c b/libs/libaom/src/aom/src/aom_integer.c
new file mode 100644
index 000000000..7edfd0de8
--- /dev/null
+++ b/libs/libaom/src/aom/src/aom_integer.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+
+static const size_t kMaximumLeb128Size = 8;
+static const uint8_t kLeb128ByteMask = 0x7f;  // Binary: 01111111
+
+// Disallow values larger than 32-bits to ensure consistent behavior on 32 and
+// 64 bit targets: value is typically used to determine buffer allocation size
+// when decoded.
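For orientation before the helpers that follow: LEB128 stores a value in 7-bit groups, least-significant group first, with the high bit of each byte flagging continuation. A minimal round-trip sketch (illustrative only, not part of the patch; it uses the declarations from aom/aom_integer.h):

#include <stdint.h>
#include "aom/aom_integer.h"

// Encoding 0x100000 (1 MiB) emits three bytes, least-significant group first:
//   0x80 0x80 0x40  ->  0x00 | (0x00 << 7) | (0x40 << 14) == 0x100000
void leb128_example(void) {
  uint8_t buf[8];
  size_t coded_size = 0;
  aom_uleb_encode(0x100000, sizeof(buf), buf, &coded_size);  // coded_size == 3
  uint64_t value = 0;
  size_t length = 0;
  aom_uleb_decode(buf, coded_size, &value, &length);  // value == 0x100000
}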
+static const uint64_t kMaximumLeb128Value = UINT32_MAX; + +size_t aom_uleb_size_in_bytes(uint64_t value) { + size_t size = 0; + do { + ++size; + } while ((value >>= 7) != 0); + return size; +} + +int aom_uleb_decode(const uint8_t *buffer, size_t available, uint64_t *value, + size_t *length) { + if (buffer && value) { + *value = 0; + for (size_t i = 0; i < kMaximumLeb128Size && i < available; ++i) { + const uint8_t decoded_byte = *(buffer + i) & kLeb128ByteMask; + *value |= ((uint64_t)decoded_byte) << (i * 7); + if ((*(buffer + i) >> 7) == 0) { + if (length) { + *length = i + 1; + } + + // Fail on values larger than 32-bits to ensure consistent behavior on + // 32 and 64 bit targets: value is typically used to determine buffer + // allocation size. + if (*value > UINT32_MAX) return -1; + + return 0; + } + } + } + + // If we get here, either the buffer/value pointers were invalid, + // or we ran over the available space + return -1; +} + +int aom_uleb_encode(uint64_t value, size_t available, uint8_t *coded_value, + size_t *coded_size) { + const size_t leb_size = aom_uleb_size_in_bytes(value); + if (value > kMaximumLeb128Value || leb_size > kMaximumLeb128Size || + leb_size > available || !coded_value || !coded_size) { + return -1; + } + + for (size_t i = 0; i < leb_size; ++i) { + uint8_t byte = value & 0x7f; + value >>= 7; + + if (value != 0) byte |= 0x80; // Signal that more bytes follow. + + *(coded_value + i) = byte; + } + + *coded_size = leb_size; + return 0; +} + +int aom_uleb_encode_fixed_size(uint64_t value, size_t available, + size_t pad_to_size, uint8_t *coded_value, + size_t *coded_size) { + if (value > kMaximumLeb128Value || !coded_value || !coded_size || + available < pad_to_size || pad_to_size > kMaximumLeb128Size) { + return -1; + } + const uint64_t limit = 1ULL << (7 * pad_to_size); + if (value >= limit) { + // Can't encode 'value' within 'pad_to_size' bytes + return -1; + } + + for (size_t i = 0; i < pad_to_size; ++i) { + uint8_t byte = value & 0x7f; + value >>= 7; + + if (i < pad_to_size - 1) byte |= 0x80; // Signal that more bytes follow. + + *(coded_value + i) = byte; + } + + assert(value == 0); + + *coded_size = pad_to_size; + return 0; +} diff --git a/libs/libaom/src/aom_dsp/aom_convolve.c b/libs/libaom/src/aom_dsp/aom_convolve.c new file mode 100644 index 000000000..7879b88f6 --- /dev/null +++ b/libs/libaom/src/aom_dsp/aom_convolve.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+  return sum;
+}
+
+static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
+                                      const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
+  return sum;
+}
+
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *x_filters, int x0_q4,
+                           int x_step_q4, int w, int h) {
+  src -= SUBPEL_TAPS / 2 - 1;
+  for (int y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (int x = 0; x < w; ++x) {
+      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+      const int sum = horz_scalar_product(src_x, x_filter);
+      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel *y_filters, int y0_q4,
+                          int y_step_q4, int w, int h) {
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (int x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (int y = 0; y < h; ++y) {
+      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
+      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+  // NOTE: This assumes that the filter table is 256-byte aligned.
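+  // Descriptive comment (added for clarity, not upstream): the table holds
+  // SUBPEL_SHIFTS == 16 kernels of SUBPEL_TAPS == 8 int16_t taps each, i.e.
+  // 16 * 8 * sizeof(int16_t) == 256 bytes, so masking off the low eight
+  // address bits of any kernel pointer inside the table recovers its base.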
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); +} + +void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, + w, h); +} + +void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, + w, h); +} + +void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int filter_x_stride, const int16_t *filter_y, + int filter_y_stride, int w, int h) { + int r; + + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + + for (r = h; r > 0; --r) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE int highbd_vert_scalar_product(const uint16_t *a, + ptrdiff_t a_stride, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; + return sum; +} + +static INLINE int highbd_horz_scalar_product(const uint16_t *a, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; + return sum; +} + +static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int bd) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= SUBPEL_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (int x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + const int sum = highbd_horz_scalar_product(src_x, x_filter); + dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int bd) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (int x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (int y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter); + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +void aom_highbd_convolve8_horiz_c(const 
uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; + (void)y_step_q4; + + highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; + (void)x_step_q4; + + highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, bd); +} + +void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h, int bd) { + int r; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + for (r = h; r > 0; --r) { + memcpy(dst, src, w * sizeof(uint16_t)); + src += src_stride; + dst += dst_stride; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/aom_dsp.cmake b/libs/libaom/src/aom_dsp/aom_dsp.cmake new file mode 100644 index 000000000..f1b61f010 --- /dev/null +++ b/libs/libaom/src/aom_dsp/aom_dsp.cmake @@ -0,0 +1,422 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. 
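The first thing in this file is CMake's include-once guard; a generic sketch of the idiom (guard variable name hypothetical):

# Pattern: guard a .cmake module against repeated inclusion.
if(MY_MODULE_CMAKE_)
  return()
endif()
set(MY_MODULE_CMAKE_ 1)

The real guard below uses the variable AOM_AOM_DSP_AOM_DSP_CMAKE_.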
+# +if(AOM_AOM_DSP_AOM_DSP_CMAKE_) + return() +endif() # AOM_AOM_DSP_AOM_DSP_CMAKE_ +set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1) + +list(APPEND AOM_DSP_COMMON_SOURCES + "${AOM_ROOT}/aom_dsp/aom_convolve.c" + "${AOM_ROOT}/aom_dsp/aom_dsp_common.h" + "${AOM_ROOT}/aom_dsp/aom_filter.h" + "${AOM_ROOT}/aom_dsp/aom_simd.h" + "${AOM_ROOT}/aom_dsp/aom_simd_inline.h" + "${AOM_ROOT}/aom_dsp/bitreader_buffer.c" + "${AOM_ROOT}/aom_dsp/bitreader_buffer.h" + "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c" + "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h" + "${AOM_ROOT}/aom_dsp/blend.h" + "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c" + "${AOM_ROOT}/aom_dsp/blend_a64_mask.c" + "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c" + "${AOM_ROOT}/aom_dsp/entcode.c" + "${AOM_ROOT}/aom_dsp/entcode.h" + "${AOM_ROOT}/aom_dsp/fft.c" + "${AOM_ROOT}/aom_dsp/fft_common.h" + "${AOM_ROOT}/aom_dsp/intrapred.c" + "${AOM_ROOT}/aom_dsp/intrapred_common.h" + "${AOM_ROOT}/aom_dsp/loopfilter.c" + "${AOM_ROOT}/aom_dsp/prob.h" + "${AOM_ROOT}/aom_dsp/recenter.h" + "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/subtract.c" + "${AOM_ROOT}/aom_dsp/txfm_common.h" + "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h" + "${AOM_ROOT}/aom_dsp/avg.c") + +list(APPEND AOM_DSP_COMMON_ASM_SSE2 + "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_asm_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/intrapred_asm_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm") + +list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c" + "${AOM_ROOT}/aom_dsp/x86/convolve.h" + "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_x86.h" + "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_sse2.h") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_DSP_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c") +endif() + +list(APPEND AOM_DSP_COMMON_ASM_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm") + +list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_DSP_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c") +endif() + +list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1 + 
"${AOM_ROOT}/aom_dsp/x86/blend_mask_sse4.h" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c") + +list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/common_avx2.h" + "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h" + "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h" + "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/avg_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_avx2.h") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_DSP_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c") +endif() + +list(APPEND AOM_DSP_COMMON_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c" + "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" + "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c" + "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c") + +list(APPEND AOM_DSP_COMMON_INTRIN_DSPR2 + "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h") + +list(APPEND AOM_DSP_COMMON_INTRIN_MSA + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_horiz_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h" + "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c" + "${AOM_ROOT}/aom_dsp/mips/macros_msa.h") + +if(CONFIG_AV1_DECODER) + list(APPEND AOM_DSP_DECODER_SOURCES + "${AOM_ROOT}/aom_dsp/binary_codes_reader.c" + "${AOM_ROOT}/aom_dsp/binary_codes_reader.h" + "${AOM_ROOT}/aom_dsp/bitreader.c" + "${AOM_ROOT}/aom_dsp/bitreader.h" "${AOM_ROOT}/aom_dsp/entdec.c" + "${AOM_ROOT}/aom_dsp/entdec.h" + "${AOM_ROOT}/aom_dsp/grain_synthesis.c" + "${AOM_ROOT}/aom_dsp/grain_synthesis.h") +endif() + +if(CONFIG_AV1_ENCODER) + list(APPEND AOM_DSP_ENCODER_SOURCES + "${AOM_ROOT}/aom_dsp/binary_codes_writer.c" + "${AOM_ROOT}/aom_dsp/binary_codes_writer.h" + "${AOM_ROOT}/aom_dsp/bitwriter.c" + "${AOM_ROOT}/aom_dsp/bitwriter.h" + "${AOM_ROOT}/aom_dsp/blk_sse_sum.c" + "${AOM_ROOT}/aom_dsp/entenc.c" + "${AOM_ROOT}/aom_dsp/entenc.h" + "${AOM_ROOT}/aom_dsp/fwd_txfm.c" + "${AOM_ROOT}/aom_dsp/grain_table.c" + "${AOM_ROOT}/aom_dsp/grain_table.h" + "${AOM_ROOT}/aom_dsp/noise_model.c" + "${AOM_ROOT}/aom_dsp/noise_model.h" + "${AOM_ROOT}/aom_dsp/noise_util.c" + "${AOM_ROOT}/aom_dsp/noise_util.h" + "${AOM_ROOT}/aom_dsp/psnr.c" + "${AOM_ROOT}/aom_dsp/psnr.h" + "${AOM_ROOT}/aom_dsp/quantize.c" + "${AOM_ROOT}/aom_dsp/quantize.h" + "${AOM_ROOT}/aom_dsp/sad.c" + "${AOM_ROOT}/aom_dsp/sse.c" + "${AOM_ROOT}/aom_dsp/sad_av1.c" + "${AOM_ROOT}/aom_dsp/sum_squares.c" + "${AOM_ROOT}/aom_dsp/variance.c" + 
"${AOM_ROOT}/aom_dsp/variance.h") + + list(APPEND AOM_DSP_ENCODER_ASM_SSE2 + "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm") + + list(APPEND AOM_DSP_ENCODER_ASM_SSE2_X86_64 + "${AOM_ROOT}/aom_dsp/x86/ssim_sse2_x86_64.asm") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h" + "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c") + if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c") + endif() + + list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64 + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm" + "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm") + + list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sse_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c") + + list(APPEND AOM_DSP_ENCODER_AVX_ASM_X86_64 + "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h" + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/masked_sad4d_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h" + "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c") + + if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE4_1 + 
"${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c") + endif() + + list(APPEND AOM_DSP_ENCODER_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c" + "${AOM_ROOT}/aom_dsp/arm/sad_neon.c" + "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c" + "${AOM_ROOT}/aom_dsp/arm/variance_neon.c" + "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c" + "${AOM_ROOT}/aom_dsp/arm/avg_neon.c" + "${AOM_ROOT}/aom_dsp/arm/sse_neon.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/sad_msa.c" + "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c" + "${AOM_ROOT}/aom_dsp/mips/variance_msa.c" + "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c") + + if(CONFIG_INTERNAL_STATS) + list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/fastssim.c" + "${AOM_ROOT}/aom_dsp/psnrhvs.c" "${AOM_ROOT}/aom_dsp/ssim.c" + "${AOM_ROOT}/aom_dsp/ssim.h") + endif() + + if(CONFIG_TUNE_VMAF) + list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/vmaf.c" + "${AOM_ROOT}/aom_dsp/vmaf.h") + endif() +endif() + +# Creates aom_dsp build targets. Must not be called until after libaom target +# has been created. +function(setup_aom_dsp_targets) + add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES}) + list(APPEND AOM_LIB_TARGETS aom_dsp_common) + create_dummy_source_file("aom_av1" "c" "dummy_source_file") + add_library(aom_dsp OBJECT "${dummy_source_file}") + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + list(APPEND AOM_LIB_TARGETS aom_dsp) + + # Not all generators support libraries consisting only of object files. Add a + # dummy source file to the aom_dsp target. + add_dummy_source_file_to_target("aom_dsp" "c") + + if(CONFIG_AV1_DECODER) + add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES}) + list(APPEND AOM_LIB_TARGETS aom_dsp_decoder) + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + endif() + + if(CONFIG_AV1_ENCODER) + add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES}) + list(APPEND AOM_LIB_TARGETS aom_dsp_encoder) + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + endif() + + if(HAVE_SSE2) + add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2") + add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SSE2") + + if(CONFIG_AV1_ENCODER) + if("${AOM_TARGET_CPU}" STREQUAL "x86_64") + list(APPEND AOM_DSP_ENCODER_ASM_SSE2 ${AOM_DSP_ENCODER_ASM_SSE2_X86_64}) + endif() + add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2") + add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSE2") + endif() + endif() + + if(HAVE_SSSE3) + add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3") + add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SSSE3") + + if(CONFIG_AV1_ENCODER) + if("${AOM_TARGET_CPU}" STREQUAL "x86_64") + list(APPEND AOM_DSP_ENCODER_ASM_SSSE3 + ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64}) + endif() + add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3") + add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSSE3") + endif() + endif() + + if(HAVE_SSE4_1) + add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SSE4_1") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSE4_1") + endif() + endif() + + if(HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL 
"x86_64") + if(CONFIG_AV1_ENCODER) + add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64") + endif() + endif() + + if(HAVE_AVX2) + add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_AVX2") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_AVX2") + endif() + endif() + + if(HAVE_NEON) + add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" + "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" + "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_NEON") + endif() + endif() + + if(HAVE_DSPR2) + add_intrinsics_object_library("" "dspr2" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_DSPR2") + endif() + + if(HAVE_MSA) + add_intrinsics_object_library("" "msa" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_MSA") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("" "msa" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_MSA") + endif() + endif() + + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + + # Pass the new lib targets up to the parent scope instance of + # $AOM_LIB_TARGETS. + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) +endfunction() diff --git a/libs/libaom/src/aom_dsp/aom_dsp_common.h b/libs/libaom/src/aom_dsp/aom_dsp_common.h new file mode 100644 index 000000000..150d35dd1 --- /dev/null +++ b/libs/libaom/src/aom_dsp/aom_dsp_common.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_AOM_DSP_COMMON_H_ +#define AOM_AOM_DSP_AOM_DSP_COMMON_H_ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef MAX_SB_SIZE +#define MAX_SB_SIZE 128 +#endif // ndef MAX_SB_SIZE + +#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y)) +#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y)) +#define AOMSIGN(x) ((x) < 0 ? -1 : 0) + +#define NELEMENTS(x) (int)(sizeof(x) / sizeof(x[0])) + +#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b') + +#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0) + +/* Left shifting a negative value became undefined behavior in C99 (downgraded + from merely implementation-defined in C89). This should still compile to the + correct thing on any two's-complement machine, but avoid ubsan warnings.*/ +#define AOM_SIGNED_SHL(x, shift) ((x) * (((x)*0 + 1) << (shift))) + +// These can be used to give a hint about branch outcomes. +// This can have an effect, even if your target processor has a +// good branch predictor, as these hints can affect basic block +// ordering by the compiler. +#ifdef __GNUC__ +#define LIKELY(v) __builtin_expect(v, 1) +#define UNLIKELY(v) __builtin_expect(v, 0) +#else +#define LIKELY(v) (v) +#define UNLIKELY(v) (v) +#endif + +typedef uint8_t qm_val_t; +#define AOM_QM_BITS 5 + +// Note: +// tran_low_t is the datatype used for final transform coefficients. 
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+
+static INLINE uint8_t clip_pixel(int val) {
+  return (val > 255) ? 255 : (val < 0) ? 0 : val;
+}
+
+static INLINE int clamp(int value, int low, int high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE double fclamp(double value, double low, double high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
+  switch (bd) {
+    case 8:
+    default: return (uint16_t)clamp(val, 0, 255);
+    case 10: return (uint16_t)clamp(val, 0, 1023);
+    case 12: return (uint16_t)clamp(val, 0, 4095);
+  }
+}
+
+// The result of this branchless code is equivalent to (value < 0 ? 0 : value)
+// or max(0, value) and might be faster in some cases.
+// Care should be taken, since right-shifting a negative value of signed type
+// is implementation-defined in the C standards (most compilers produce the
+// arithmetic shift this code relies on).
+static INLINE unsigned int negative_to_zero(int value) {
+  return value & ~(value >> (sizeof(value) * 8 - 1));
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_AOM_DSP_COMMON_H_
diff --git a/libs/libaom/src/aom_dsp/aom_dsp_rtcd.c b/libs/libaom/src/aom_dsp/aom_dsp_rtcd.c
new file mode 100644
index 000000000..1514bd64e
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/aom_dsp_rtcd.c
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/aom_config.h"
+
+#define RTCD_C
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/aom_once.h"
+
+void aom_dsp_rtcd() { aom_once(setup_rtcd_internal); }
diff --git a/libs/libaom/src/aom_dsp/aom_dsp_rtcd_defs.pl b/libs/libaom/src/aom_dsp/aom_dsp_rtcd_defs.pl
new file mode 100644
index 000000000..b7d5a41ba
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -0,0 +1,1785 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
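A note on the two helpers used throughout this file: add_proto declares a function signature and its plain-C fallback, while specialize lists the SIMD flavors for which hand-optimized versions exist. Roughly (hypothetical function name for illustration):

# add_proto qw/void aom_foo/, "uint8_t *dst, int stride";
# specialize qw/aom_foo sse2 neon/;

This expands in the generated aom_dsp_rtcd.h to declarations for aom_foo_c, aom_foo_sse2, and aom_foo_neon, plus an RTCD_EXTERN function pointer named aom_foo; setup_rtcd_internal() (run once via aom_dsp_rtcd(), see aom_dsp_rtcd.c above) points it at the best variant the CPU supports.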
+##
+sub aom_dsp_forward_decls() {
+print <<EOF
+/*
+ * DSP
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+EOF
+}
+forward_decls qw/aom_dsp_forward_decls/;
+
+# optimizations which depend on multiple features
+$avx2_ssse3 = '';
+if ((aom_config("HAVE_AVX2") eq "yes") && (aom_config("HAVE_SSSE3") eq "yes")) {
+  $avx2_ssse3 = 'avx2';
+}
+
+# functions that are 64 bit only.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
+if ($opts{arch} eq "x86_64") {
+  $mmx_x86_64 = 'mmx';
+  $sse2_x86_64 = 'sse2';
+  $ssse3_x86_64 = 'ssse3';
+  $avx_x86_64 = 'avx';
+  $avx2_x86_64 = 'avx2';
+}
+
+@block_widths = (4, 8, 16, 32, 64, 128);
+
+@block_sizes = ();
+foreach $w (@block_widths) {
+  foreach $h (@block_widths) {
+    push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w);
+  }
+}
+push @block_sizes, [4, 16];
+push @block_sizes, [16, 4];
+push @block_sizes, [8, 32];
+push @block_sizes, [32, 8];
+push @block_sizes, [16, 64];
+push @block_sizes, [64, 16];
+
+@tx_dims = (2, 4, 8, 16, 32, 64);
+@tx_sizes = ();
+foreach $w (@tx_dims) {
+  push @tx_sizes, [$w, $w];
+  foreach $h (@tx_dims) {
+    push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 2*$h || $h == 2*$w));
+    push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 4*$h || $h == 4*$w));
+  }
+}
+
+@pred_names = qw/dc dc_top dc_left dc_128 v h paeth smooth smooth_v smooth_h/;
+
+#
+# Intra prediction
+#
+
+foreach (@tx_sizes) {
+  ($w, $h) = @$_;
+  foreach $pred_name (@pred_names) {
+    add_proto "void", "aom_${pred_name}_predictor_${w}x${h}",
+              "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+      add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}",
+                "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    }
+  }
+}
+
+specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_top_predictor_4x8 sse2/;
+specialize qw/aom_dc_top_predictor_4x16 sse2/;
+specialize qw/aom_dc_top_predictor_8x4 sse2/;
+specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_top_predictor_8x16 sse2/;
+specialize qw/aom_dc_top_predictor_8x32 sse2/;
+specialize qw/aom_dc_top_predictor_16x4 sse2/;
+specialize qw/aom_dc_top_predictor_16x8 sse2/;
+specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
+
+specialize qw/aom_dc_top_predictor_16x32 sse2/;
+specialize qw/aom_dc_top_predictor_16x64 sse2/;
+specialize qw/aom_dc_top_predictor_32x8 sse2/;
+specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_left_predictor_4x8 sse2/;
+specialize qw/aom_dc_left_predictor_4x16 sse2/;
+specialize qw/aom_dc_left_predictor_8x4 sse2/;
+specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_left_predictor_8x16 sse2/;
+specialize qw/aom_dc_left_predictor_8x32 sse2/;
+specialize qw/aom_dc_left_predictor_16x4 sse2/;
+specialize qw/aom_dc_left_predictor_16x8 sse2/;
+specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_left_predictor_16x32 sse2/;
+specialize qw/aom_dc_left_predictor_16x64 sse2/;
+specialize qw/aom_dc_left_predictor_32x8 sse2/;
+specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_128_predictor_4x8 sse2/;
+specialize qw/aom_dc_128_predictor_4x16 sse2/;
+specialize qw/aom_dc_128_predictor_8x4 sse2/;
+specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_128_predictor_8x16 sse2/;
+specialize qw/aom_dc_128_predictor_8x32 sse2/;
+specialize qw/aom_dc_128_predictor_16x4 sse2/;
+specialize qw/aom_dc_128_predictor_16x8 sse2/;
+specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_128_predictor_16x32 sse2/;
+specialize qw/aom_dc_128_predictor_16x64 sse2/;
+specialize qw/aom_dc_128_predictor_32x8 sse2/;
+specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;
+specialize
qw/aom_dc_128_predictor_64x64 sse2 avx2/; +specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/; +specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/; +specialize qw/aom_v_predictor_4x4 neon msa sse2/; +specialize qw/aom_v_predictor_4x8 sse2/; +specialize qw/aom_v_predictor_4x16 sse2/; +specialize qw/aom_v_predictor_8x4 sse2/; +specialize qw/aom_v_predictor_8x8 neon msa sse2/; +specialize qw/aom_v_predictor_8x16 sse2/; +specialize qw/aom_v_predictor_8x32 sse2/; +specialize qw/aom_v_predictor_16x4 sse2/; +specialize qw/aom_v_predictor_16x8 sse2/; +specialize qw/aom_v_predictor_16x16 neon msa sse2/; +specialize qw/aom_v_predictor_16x32 sse2/; +specialize qw/aom_v_predictor_16x64 sse2/; +specialize qw/aom_v_predictor_32x8 sse2/; +specialize qw/aom_v_predictor_32x16 sse2 avx2/; +specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/; +specialize qw/aom_v_predictor_32x64 sse2 avx2/; +specialize qw/aom_v_predictor_64x64 sse2 avx2/; +specialize qw/aom_v_predictor_64x32 sse2 avx2/; +specialize qw/aom_v_predictor_64x16 sse2 avx2/; +specialize qw/aom_h_predictor_4x8 sse2/; +specialize qw/aom_h_predictor_4x16 sse2/; +specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/; +specialize qw/aom_h_predictor_8x4 sse2/; +specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/; +specialize qw/aom_h_predictor_8x16 sse2/; +specialize qw/aom_h_predictor_8x32 sse2/; +specialize qw/aom_h_predictor_16x4 sse2/; +specialize qw/aom_h_predictor_16x8 sse2/; +specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/; +specialize qw/aom_h_predictor_16x32 sse2/; +specialize qw/aom_h_predictor_16x64 sse2/; +specialize qw/aom_h_predictor_32x8 sse2/; +specialize qw/aom_h_predictor_32x16 sse2/; +specialize qw/aom_h_predictor_32x32 neon msa sse2 avx2/; +specialize qw/aom_h_predictor_32x64 sse2/; +specialize qw/aom_h_predictor_64x64 sse2/; +specialize qw/aom_h_predictor_64x32 sse2/; +specialize qw/aom_h_predictor_64x16 sse2/; +specialize qw/aom_paeth_predictor_4x4 ssse3/; +specialize qw/aom_paeth_predictor_4x8 ssse3/; +specialize qw/aom_paeth_predictor_4x16 ssse3/; +specialize qw/aom_paeth_predictor_8x4 ssse3/; +specialize qw/aom_paeth_predictor_8x8 ssse3/; +specialize qw/aom_paeth_predictor_8x16 ssse3/; +specialize qw/aom_paeth_predictor_8x32 ssse3/; +specialize qw/aom_paeth_predictor_16x4 ssse3/; +specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/; +specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/; +specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/; +specialize qw/aom_paeth_predictor_16x64 ssse3 avx2/; +specialize qw/aom_paeth_predictor_32x8 ssse3/; +specialize qw/aom_paeth_predictor_32x16 ssse3 avx2/; +specialize qw/aom_paeth_predictor_32x32 ssse3 avx2/; +specialize qw/aom_paeth_predictor_32x64 ssse3 avx2/; +specialize qw/aom_paeth_predictor_64x32 ssse3 avx2/; +specialize qw/aom_paeth_predictor_64x64 ssse3 avx2/; +specialize qw/aom_paeth_predictor_64x16 ssse3 avx2/; +specialize qw/aom_paeth_predictor_16x8 ssse3/; +specialize qw/aom_paeth_predictor_16x16 ssse3/; +specialize qw/aom_paeth_predictor_16x32 ssse3/; +specialize qw/aom_paeth_predictor_32x16 ssse3/; +specialize qw/aom_paeth_predictor_32x32 ssse3/; +specialize qw/aom_smooth_predictor_4x4 ssse3/; +specialize qw/aom_smooth_predictor_4x8 ssse3/; +specialize qw/aom_smooth_predictor_4x16 ssse3/; +specialize qw/aom_smooth_predictor_8x4 ssse3/; +specialize qw/aom_smooth_predictor_8x8 ssse3/; +specialize qw/aom_smooth_predictor_8x16 ssse3/; +specialize qw/aom_smooth_predictor_8x32 ssse3/; +specialize qw/aom_smooth_predictor_16x4 ssse3/; +specialize 
qw/aom_smooth_predictor_16x8 ssse3/; +specialize qw/aom_smooth_predictor_16x16 ssse3/; +specialize qw/aom_smooth_predictor_16x32 ssse3/; +specialize qw/aom_smooth_predictor_16x64 ssse3/; +specialize qw/aom_smooth_predictor_32x8 ssse3/; +specialize qw/aom_smooth_predictor_32x16 ssse3/; +specialize qw/aom_smooth_predictor_32x32 ssse3/; +specialize qw/aom_smooth_predictor_32x64 ssse3/; +specialize qw/aom_smooth_predictor_64x64 ssse3/; +specialize qw/aom_smooth_predictor_64x32 ssse3/; +specialize qw/aom_smooth_predictor_64x16 ssse3/; + +specialize qw/aom_smooth_v_predictor_4x4 ssse3/; +specialize qw/aom_smooth_v_predictor_4x8 ssse3/; +specialize qw/aom_smooth_v_predictor_4x16 ssse3/; +specialize qw/aom_smooth_v_predictor_8x4 ssse3/; +specialize qw/aom_smooth_v_predictor_8x8 ssse3/; +specialize qw/aom_smooth_v_predictor_8x16 ssse3/; +specialize qw/aom_smooth_v_predictor_8x32 ssse3/; +specialize qw/aom_smooth_v_predictor_16x4 ssse3/; +specialize qw/aom_smooth_v_predictor_16x8 ssse3/; +specialize qw/aom_smooth_v_predictor_16x16 ssse3/; +specialize qw/aom_smooth_v_predictor_16x32 ssse3/; +specialize qw/aom_smooth_v_predictor_16x64 ssse3/; +specialize qw/aom_smooth_v_predictor_32x8 ssse3/; +specialize qw/aom_smooth_v_predictor_32x16 ssse3/; +specialize qw/aom_smooth_v_predictor_32x32 ssse3/; +specialize qw/aom_smooth_v_predictor_32x64 ssse3/; +specialize qw/aom_smooth_v_predictor_64x64 ssse3/; +specialize qw/aom_smooth_v_predictor_64x32 ssse3/; +specialize qw/aom_smooth_v_predictor_64x16 ssse3/; + +specialize qw/aom_smooth_h_predictor_4x4 ssse3/; +specialize qw/aom_smooth_h_predictor_4x8 ssse3/; +specialize qw/aom_smooth_h_predictor_4x16 ssse3/; +specialize qw/aom_smooth_h_predictor_8x4 ssse3/; +specialize qw/aom_smooth_h_predictor_8x8 ssse3/; +specialize qw/aom_smooth_h_predictor_8x16 ssse3/; +specialize qw/aom_smooth_h_predictor_8x32 ssse3/; +specialize qw/aom_smooth_h_predictor_16x4 ssse3/; +specialize qw/aom_smooth_h_predictor_16x8 ssse3/; +specialize qw/aom_smooth_h_predictor_16x16 ssse3/; +specialize qw/aom_smooth_h_predictor_16x32 ssse3/; +specialize qw/aom_smooth_h_predictor_16x64 ssse3/; +specialize qw/aom_smooth_h_predictor_32x8 ssse3/; +specialize qw/aom_smooth_h_predictor_32x16 ssse3/; +specialize qw/aom_smooth_h_predictor_32x32 ssse3/; +specialize qw/aom_smooth_h_predictor_32x64 ssse3/; +specialize qw/aom_smooth_h_predictor_64x64 ssse3/; +specialize qw/aom_smooth_h_predictor_64x32 ssse3/; +specialize qw/aom_smooth_h_predictor_64x16 ssse3/; + +# TODO(yunqingwang): optimize rectangular DC_PRED to replace division +# by multiply and shift. 
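The TODO above refers to the usual reciprocal-multiply trick. A sketch (illustrative only, not the library's implementation) for the 4x8 case, where DC_PRED averages 4 above plus 8 left pixels, i.e. divides by 12:

/* The sum of twelve 8-bit pixels is at most 12 * 255 == 3060, and
 * 2731 == ceil(2^15 / 12), so the identity
 *   (sum + 6) / 12 == ((sum + 6) * 2731) >> 15
 * holds exactly over the whole input range (it is exact for any
 * numerator below 8192), replacing the division with a multiply-shift. */
static int dc_avg_4x8(int sum_above_plus_left) {
  return ((sum_above_plus_left + 6) * 2731) >> 15;
}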
+specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/; +specialize qw/aom_dc_predictor_4x8 sse2/; +specialize qw/aom_dc_predictor_4x16 sse2/; +specialize qw/aom_dc_predictor_8x4 sse2/; +specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/; +specialize qw/aom_dc_predictor_8x16 sse2/; +specialize qw/aom_dc_predictor_8x32 sse2/; +specialize qw/aom_dc_predictor_16x4 sse2/; +specialize qw/aom_dc_predictor_16x8 sse2/; +specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/; +specialize qw/aom_dc_predictor_16x32 sse2/; +specialize qw/aom_dc_predictor_16x64 sse2/; +specialize qw/aom_dc_predictor_32x8 sse2/; +specialize qw/aom_dc_predictor_32x16 sse2 avx2/; +specialize qw/aom_dc_predictor_32x32 msa neon sse2 avx2/; +specialize qw/aom_dc_predictor_32x64 sse2 avx2/; +specialize qw/aom_dc_predictor_64x64 sse2 avx2/; +specialize qw/aom_dc_predictor_64x32 sse2 avx2/; +specialize qw/aom_dc_predictor_64x16 sse2 avx2/; +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + specialize qw/aom_highbd_v_predictor_4x4 sse2/; + specialize qw/aom_highbd_v_predictor_4x8 sse2/; + specialize qw/aom_highbd_v_predictor_8x4 sse2/; + specialize qw/aom_highbd_v_predictor_8x8 sse2/; + specialize qw/aom_highbd_v_predictor_8x16 sse2/; + specialize qw/aom_highbd_v_predictor_16x8 sse2/; + specialize qw/aom_highbd_v_predictor_16x16 sse2/; + specialize qw/aom_highbd_v_predictor_16x32 sse2/; + specialize qw/aom_highbd_v_predictor_32x16 sse2/; + specialize qw/aom_highbd_v_predictor_32x32 sse2/; + + # TODO(yunqingwang): optimize rectangular DC_PRED to replace division + # by multiply and shift. + specialize qw/aom_highbd_dc_predictor_4x4 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_predictor_8x4 sse2/;; + specialize qw/aom_highbd_dc_predictor_8x8 sse2 neon/;; + specialize qw/aom_highbd_dc_predictor_8x16 sse2/;; + specialize qw/aom_highbd_dc_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_predictor_16x16 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_predictor_32x16 sse2/; + specialize qw/aom_highbd_dc_predictor_32x32 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_64x64 neon/; + + specialize qw/aom_highbd_h_predictor_4x4 sse2/; + specialize qw/aom_highbd_h_predictor_4x8 sse2/; + specialize qw/aom_highbd_h_predictor_8x4 sse2/; + specialize qw/aom_highbd_h_predictor_8x8 sse2/; + specialize qw/aom_highbd_h_predictor_8x16 sse2/; + specialize qw/aom_highbd_h_predictor_16x8 sse2/; + specialize qw/aom_highbd_h_predictor_16x16 sse2/; + specialize qw/aom_highbd_h_predictor_16x32 sse2/; + specialize qw/aom_highbd_h_predictor_32x16 sse2/; + specialize qw/aom_highbd_h_predictor_32x32 sse2/; + specialize qw/aom_highbd_dc_left_predictor_4x4 sse2/; + specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/; + specialize qw/aom_highbd_dc_128_predictor_4x4 sse2/; + specialize qw/aom_highbd_dc_left_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_128_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_left_predictor_8x4 sse2/; + specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/; + specialize qw/aom_highbd_dc_128_predictor_8x4 sse2/; + specialize qw/aom_highbd_dc_left_predictor_8x8 sse2/; + specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/; + specialize qw/aom_highbd_dc_128_predictor_8x8 sse2/; + specialize qw/aom_highbd_dc_left_predictor_8x16 sse2/; + specialize qw/aom_highbd_dc_top_predictor_8x16 sse2/; + specialize qw/aom_highbd_dc_128_predictor_8x16 sse2/; + specialize 
qw/aom_highbd_dc_left_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_top_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_128_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_left_predictor_16x16 sse2/; + specialize qw/aom_highbd_dc_top_predictor_16x16 sse2/; + specialize qw/aom_highbd_dc_128_predictor_16x16 sse2/; + specialize qw/aom_highbd_dc_left_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_top_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_128_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_left_predictor_32x16 sse2/; + specialize qw/aom_highbd_dc_top_predictor_32x16 sse2/; + specialize qw/aom_highbd_dc_128_predictor_32x16 sse2/; + specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/; + specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/; + specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/; +} +# +# Sub Pixel Filters +# +add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h"; +add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; + +specialize qw/aom_convolve_copy sse2 /; +specialize qw/aom_convolve8_horiz sse2 ssse3/, "$avx2_ssse3"; +specialize qw/aom_convolve8_vert sse2 ssse3/, "$avx2_ssse3"; + +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd"; + specialize qw/aom_highbd_convolve_copy sse2 avx2/; + + add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd"; + specialize qw/aom_highbd_convolve8_horiz sse2 avx2/; + + add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd"; + specialize qw/aom_highbd_convolve8_vert sse2 avx2/; +} + +# +# Loopfilter +# +add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_vertical_14 sse2 neon/; + +add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_vertical_14_dual sse2/; + +add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_vertical_6 sse2 neon/; + +add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_vertical_8 sse2 neon/; + +add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const 
uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_vertical_8_dual sse2/; + +add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_vertical_4 sse2 neon/; + +add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_vertical_4_dual sse2/; + +add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_horizontal_14 sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_horizontal_14_dual sse2/; + +add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_horizontal_6 sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_horizontal_6_dual sse2/; + +add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_horizontal_8 sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_horizontal_8_dual sse2/; + +add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_horizontal_4 sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_horizontal_4_dual sse2/; + +add_proto qw/void aom_lpf_vertical_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_vertical_6_dual sse2/; + +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_vertical_14 sse2/; + + add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_vertical_14_dual sse2 avx2/; + + add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_vertical_8 sse2/; + + add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, 
int bd"; + specialize qw/aom_highbd_lpf_vertical_6 sse2/; + + add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_vertical_6_dual sse2/; + + add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/; + + add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_vertical_4 sse2/; + + add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/; + + add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_horizontal_14 sse2/; + + add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,int bd"; + specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/; + + add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_horizontal_6 sse2/; + + add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_horizontal_6_dual sse2/; + + add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_horizontal_8 sse2/; + + add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/; + + add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_horizontal_4 sse2/; + + add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/; +} + +# +# Encoder functions. 
+# + +# +# Forward transform +# +if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){ + add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/aom_fdct4x4 neon sse2/; + + add_proto qw/void aom_fdct4x4_lp/, "const int16_t *input, int16_t *output, int stride"; + specialize qw/aom_fdct4x4_lp neon sse2/; + + add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/aom_fdct8x8 neon sse2/, "$ssse3_x86_64"; + # High bit depth + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/aom_highbd_fdct8x8 sse2/; + } + # FFT/IFFT (float) only used for denoising (and noise power spectral density estimation) + add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output"; + + add_proto qw/void aom_fft4x4_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_fft4x4_float sse2/; + + add_proto qw/void aom_fft8x8_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_fft8x8_float avx2 sse2/; + + add_proto qw/void aom_fft16x16_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_fft16x16_float avx2 sse2/; + + add_proto qw/void aom_fft32x32_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_fft32x32_float avx2 sse2/; + + add_proto qw/void aom_ifft2x2_float/, "const float *input, float *temp, float *output"; + + add_proto qw/void aom_ifft4x4_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_ifft4x4_float sse2/; + + add_proto qw/void aom_ifft8x8_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_ifft8x8_float avx2 sse2/; + + add_proto qw/void aom_ifft16x16_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_ifft16x16_float avx2 sse2/; + + add_proto qw/void aom_ifft32x32_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_ifft32x32_float avx2 sse2/; +} # CONFIG_AV1_ENCODER + +# +# Quantization +# +if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64"; + + add_proto qw/void aom_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_adaptive sse2 avx2/; + + add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64"; + + add_proto qw/void aom_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t 
*round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_32x32_adaptive sse2/; + + add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_64x64 ssse3/; + + add_proto qw/void aom_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_64x64_adaptive sse2/; +} # CONFIG_AV1_ENCODER + +if (aom_config("CONFIG_AV1_ENCODER") eq "yes" && aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b sse2 avx2/; + + add_proto qw/void aom_highbd_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_adaptive sse2 avx2/; + + add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_32x32 sse2/; + + add_proto qw/void aom_highbd_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_32x32_adaptive sse2 avx2/; + + add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_64x64 sse2/; + + add_proto qw/void aom_highbd_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const 
int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_64x64_adaptive sse2/; +} # CONFIG_AV1_ENCODER + +# +# Alpha blending with mask +# +add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params"; +specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/; +add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh"; +add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; +add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; +specialize "aom_blend_a64_mask", qw/sse4_1 avx2/; +specialize "aom_blend_a64_hmask", qw/sse4_1 neon/; +specialize "aom_blend_a64_vmask", qw/sse4_1 neon/; + +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd"; + add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd"; + add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd"; + add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd"; + specialize "aom_highbd_blend_a64_mask", qw/sse4_1/; + specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/; + specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/; + specialize "aom_highbd_blend_a64_d16_mask", qw/sse4_1 avx2/; +} + +if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + # + # Block subtraction + # + add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; + specialize qw/aom_subtract_block neon msa sse2 avx2/; + + add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height"; + specialize qw/aom_sse sse4_1 avx2 neon/; + + add_proto qw/void/, "aom_get_blk_sse_sum", "const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum"; + specialize qw/aom_get_blk_sse_sum sse2 avx2/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; + specialize 
qw/aom_highbd_subtract_block sse2/; + + add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height"; + specialize qw/aom_highbd_sse sse4_1 avx2 neon/; + } + + if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + # + # Sum of Squares + # + add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height"; + specialize qw/aom_sum_squares_2d_i16 sse2 avx2/; + + add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N"; + specialize qw/aom_sum_squares_i16 sse2/; + + add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height"; + specialize qw/aom_var_2d_u8 sse2 avx2/; + + add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height"; + specialize qw/aom_var_2d_u16 sse2 avx2/; + } + + # + # Single block SAD / Single block Avg SAD + # + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + add_proto qw/unsigned int/, "aom_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param"; + } + + specialize qw/aom_sad128x128 avx2 sse2/; + specialize qw/aom_sad128x64 avx2 sse2/; + specialize qw/aom_sad64x128 avx2 sse2/; + specialize qw/aom_sad64x64 avx2 neon msa sse2/; + specialize qw/aom_sad64x32 avx2 msa sse2/; + specialize qw/aom_sad32x64 avx2 msa sse2/; + specialize qw/aom_sad32x32 avx2 neon msa sse2/; + specialize qw/aom_sad32x16 avx2 msa sse2/; + specialize qw/aom_sad16x32 msa sse2/; + specialize qw/aom_sad16x16 neon msa sse2/; + specialize qw/aom_sad16x8 neon msa sse2/; + specialize qw/aom_sad8x16 neon msa sse2/; + specialize qw/aom_sad8x8 neon msa sse2/; + specialize qw/aom_sad8x4 msa sse2/; + specialize qw/aom_sad4x8 msa sse2/; + specialize qw/aom_sad4x4 neon msa sse2/; + + specialize qw/aom_sad128x128_avg avx2 sse2/; + specialize qw/aom_sad128x64_avg avx2 sse2/; + specialize qw/aom_sad64x128_avg avx2 sse2/; + specialize qw/aom_sad64x64_avg avx2 msa sse2/; + specialize qw/aom_sad64x32_avg avx2 msa sse2/; + specialize qw/aom_sad32x64_avg avx2 msa sse2/; + specialize qw/aom_sad32x32_avg avx2 msa sse2/; + specialize qw/aom_sad32x16_avg avx2 msa sse2/; + specialize qw/aom_sad16x32_avg msa sse2/; + specialize qw/aom_sad16x16_avg msa sse2/; + specialize qw/aom_sad16x8_avg msa sse2/; + specialize qw/aom_sad8x16_avg msa sse2/; + specialize qw/aom_sad8x8_avg msa sse2/; + specialize qw/aom_sad8x4_avg msa sse2/; + specialize qw/aom_sad4x8_avg msa sse2/; + specialize qw/aom_sad4x4_avg msa sse2/; + + specialize qw/aom_sad4x16 sse2/; + specialize qw/aom_sad16x4 sse2/; + specialize qw/aom_sad8x32 sse2/; + specialize qw/aom_sad32x8 sse2/; + specialize qw/aom_sad16x64 sse2/; + specialize qw/aom_sad64x16 sse2/; + + specialize qw/aom_sad4x16_avg sse2/; + specialize qw/aom_sad16x4_avg sse2/; + specialize qw/aom_sad8x32_avg sse2/; + specialize qw/aom_sad32x8_avg sse2/; + specialize qw/aom_sad16x64_avg sse2/; + specialize qw/aom_sad64x16_avg sse2/; + + specialize qw/aom_dist_wtd_sad128x128_avg ssse3/; + specialize qw/aom_dist_wtd_sad128x64_avg ssse3/; + specialize qw/aom_dist_wtd_sad64x128_avg ssse3/; + specialize 
qw/aom_dist_wtd_sad64x64_avg ssse3/; + specialize qw/aom_dist_wtd_sad64x32_avg ssse3/; + specialize qw/aom_dist_wtd_sad32x64_avg ssse3/; + specialize qw/aom_dist_wtd_sad32x32_avg ssse3/; + specialize qw/aom_dist_wtd_sad32x16_avg ssse3/; + specialize qw/aom_dist_wtd_sad16x32_avg ssse3/; + specialize qw/aom_dist_wtd_sad16x16_avg ssse3/; + specialize qw/aom_dist_wtd_sad16x8_avg ssse3/; + specialize qw/aom_dist_wtd_sad8x16_avg ssse3/; + specialize qw/aom_dist_wtd_sad8x8_avg ssse3/; + specialize qw/aom_dist_wtd_sad8x4_avg ssse3/; + specialize qw/aom_dist_wtd_sad4x8_avg ssse3/; + specialize qw/aom_dist_wtd_sad4x4_avg ssse3/; + + specialize qw/aom_dist_wtd_sad4x16_avg ssse3/; + specialize qw/aom_dist_wtd_sad16x4_avg ssse3/; + specialize qw/aom_dist_wtd_sad8x32_avg ssse3/; + specialize qw/aom_dist_wtd_sad32x8_avg ssse3/; + specialize qw/aom_dist_wtd_sad16x64_avg ssse3/; + specialize qw/aom_dist_wtd_sad64x16_avg ssse3/; + + add_proto qw/unsigned int/, "aom_sad4xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + add_proto qw/unsigned int/, "aom_sad8xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + add_proto qw/unsigned int/, "aom_sad16xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + add_proto qw/unsigned int/, "aom_sad32xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + add_proto qw/unsigned int/, "aom_sad64xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + add_proto qw/unsigned int/, "aom_sad128xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + + specialize qw/aom_sad4xh sse2/; + specialize qw/aom_sad8xh sse2/; + specialize qw/aom_sad16xh sse2/; + specialize qw/aom_sad32xh sse2/; + specialize qw/aom_sad64xh sse2/; + specialize qw/aom_sad128xh sse2/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + if ($w != 128 && $h != 128 && $w != 4) { + specialize "aom_highbd_sad${w}x${h}", qw/sse2/; + specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/; + } + add_proto qw/unsigned int/, "aom_highbd_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param"; + } + specialize qw/aom_highbd_sad128x128 avx2/; + specialize qw/aom_highbd_sad128x64 avx2/; + specialize qw/aom_highbd_sad64x128 avx2/; + specialize qw/aom_highbd_sad64x64 avx2 sse2/; + specialize qw/aom_highbd_sad64x32 avx2 sse2/; + specialize qw/aom_highbd_sad32x64 avx2 sse2/; + specialize qw/aom_highbd_sad32x32 avx2 sse2/; + specialize qw/aom_highbd_sad32x16 avx2 sse2/; + specialize qw/aom_highbd_sad16x32 avx2 sse2/; + specialize qw/aom_highbd_sad16x16 avx2 sse2/; + specialize qw/aom_highbd_sad16x8 avx2 sse2/; + specialize qw/aom_highbd_sad8x4 sse2/; + specialize qw/aom_highbd_sad4x8 sse2/; + specialize qw/aom_highbd_sad4x4 sse2/; + + specialize qw/aom_highbd_sad128x128_avg avx2/; + specialize qw/aom_highbd_sad128x64_avg avx2/; + specialize qw/aom_highbd_sad64x128_avg avx2/; + specialize qw/aom_highbd_sad64x64_avg 
avx2 sse2/; + specialize qw/aom_highbd_sad64x32_avg avx2 sse2/; + specialize qw/aom_highbd_sad32x64_avg avx2 sse2/; + specialize qw/aom_highbd_sad32x32_avg avx2 sse2/; + specialize qw/aom_highbd_sad32x16_avg avx2 sse2/; + specialize qw/aom_highbd_sad16x32_avg avx2 sse2/; + specialize qw/aom_highbd_sad16x16_avg avx2 sse2/; + specialize qw/aom_highbd_sad16x8_avg avx2 sse2/; + specialize qw/aom_highbd_sad8x4_avg sse2/; + specialize qw/aom_highbd_sad4x8_avg sse2/; + specialize qw/aom_highbd_sad4x4_avg sse2/; + + specialize qw/aom_highbd_sad4x16 sse2/; + specialize qw/aom_highbd_sad16x4 avx2 sse2/; + specialize qw/aom_highbd_sad8x32 sse2/; + specialize qw/aom_highbd_sad32x8 avx2 sse2/; + specialize qw/aom_highbd_sad16x64 avx2 sse2/; + specialize qw/aom_highbd_sad64x16 avx2 sse2/; + + specialize qw/aom_highbd_sad4x16_avg sse2/; + specialize qw/aom_highbd_sad16x4_avg avx2 sse2/; + specialize qw/aom_highbd_sad8x32_avg sse2/; + specialize qw/aom_highbd_sad32x8_avg avx2 sse2/; + specialize qw/aom_highbd_sad16x64_avg avx2 sse2/; + specialize qw/aom_highbd_sad64x16_avg avx2 sse2/; + } + # + # Masked SAD + # + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask"; + specialize "aom_masked_sad${w}x${h}", qw/ssse3 avx2/; + } + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask"; + specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/; + } + } + + # + # OBMC SAD + # + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; + if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { + specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2/; + } + } + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; + if (! 
(($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { + specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/; + } + } + } + + # + # Multi-block SAD, comparing a reference to N independent blocks + # + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void/, "aom_sad${w}x${h}x4d_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, const uint8_t *second_pred, uint32_t *sad_array"; + add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[]"; + } + + specialize qw/aom_sad128x128x4d avx2 sse2/; + specialize qw/aom_sad128x64x4d avx2 sse2/; + specialize qw/aom_sad64x128x4d avx2 sse2/; + specialize qw/aom_sad64x64x4d avx2 neon msa sse2/; + specialize qw/aom_sad64x32x4d avx2 msa sse2/; + specialize qw/aom_sad64x16x4d avx2 sse2/; + specialize qw/aom_sad32x64x4d avx2 msa sse2/; + specialize qw/aom_sad32x32x4d avx2 neon msa sse2/; + specialize qw/aom_sad32x16x4d avx2 msa sse2/; + specialize qw/aom_sad32x8x4d avx2 sse2/; + specialize qw/aom_sad16x64x4d sse2/; + specialize qw/aom_sad16x32x4d msa sse2/; + specialize qw/aom_sad16x16x4d neon msa sse2/; + specialize qw/aom_sad16x8x4d msa sse2/; + + specialize qw/aom_sad8x16x4d msa sse2/; + specialize qw/aom_sad8x8x4d msa sse2/; + specialize qw/aom_sad8x4x4d msa sse2/; + specialize qw/aom_sad4x16x4d msa sse2/; + specialize qw/aom_sad4x8x4d msa sse2/; + specialize qw/aom_sad4x4x4d msa sse2/; + + specialize qw/aom_sad4x32x4d sse2/; + specialize qw/aom_sad4x16x4d sse2/; + specialize qw/aom_sad16x4x4d sse2/; + specialize qw/aom_sad8x32x4d sse2/; + specialize qw/aom_sad32x8x4d sse2/; + specialize qw/aom_sad64x16x4d sse2/; + + specialize qw/aom_sad128x128x4d_avg sse2/; + specialize qw/aom_sad128x64x4d_avg sse2/; + specialize qw/aom_sad64x128x4d_avg sse2/; + specialize qw/aom_sad64x64x4d_avg sse2/; + specialize qw/aom_sad64x32x4d_avg sse2/; + specialize qw/aom_sad64x16x4d_avg sse2/; + specialize qw/aom_sad32x64x4d_avg sse2/; + specialize qw/aom_sad32x32x4d_avg sse2/; + specialize qw/aom_sad32x16x4d_avg sse2/; + specialize qw/aom_sad32x8x4d_avg sse2/; + specialize qw/aom_sad16x64x4d_avg sse2/; + specialize qw/aom_sad16x32x4d_avg sse2/; + specialize qw/aom_sad16x16x4d_avg sse2/; + specialize qw/aom_sad16x8x4d_avg sse2/; + + specialize qw/aom_sad8x16x4d_avg sse2/; + specialize qw/aom_sad8x8x4d_avg sse2/; + specialize qw/aom_sad8x4x4d_avg sse2/; + specialize qw/aom_sad4x16x4d_avg sse2/; + specialize qw/aom_sad4x8x4d_avg sse2/; + specialize qw/aom_sad4x4x4d_avg sse2/; + + specialize qw/aom_sad4x32x4d_avg sse2/; + specialize qw/aom_sad4x16x4d_avg sse2/; + specialize qw/aom_sad16x4x4d_avg sse2/; + specialize qw/aom_sad8x32x4d_avg sse2/; + specialize qw/aom_sad32x8x4d_avg sse2/; + specialize qw/aom_sad64x16x4d_avg sse2/; + + specialize qw/aom_masked_sad128x128x4d ssse3/; + specialize qw/aom_masked_sad128x64x4d ssse3/; + specialize qw/aom_masked_sad64x128x4d ssse3/; + specialize qw/aom_masked_sad64x64x4d ssse3/; + specialize qw/aom_masked_sad64x32x4d ssse3/; + specialize qw/aom_masked_sad64x16x4d ssse3/; + specialize qw/aom_masked_sad32x64x4d ssse3/; + specialize qw/aom_masked_sad32x32x4d ssse3/; + specialize qw/aom_masked_sad32x16x4d ssse3/; + specialize qw/aom_masked_sad32x8x4d ssse3/; + specialize 
qw/aom_masked_sad16x64x4d ssse3/; + specialize qw/aom_masked_sad16x32x4d ssse3/; + specialize qw/aom_masked_sad16x16x4d ssse3/; + specialize qw/aom_masked_sad16x8x4d ssse3/; + + specialize qw/aom_masked_sad8x16x4d ssse3/; + specialize qw/aom_masked_sad8x8x4d ssse3/; + specialize qw/aom_masked_sad8x4x4d ssse3/; + specialize qw/aom_masked_sad4x16x4d ssse3/; + specialize qw/aom_masked_sad4x8x4d ssse3/; + specialize qw/aom_masked_sad4x4x4d ssse3/; + + specialize qw/aom_masked_sad4x32x4d ssse3/; + specialize qw/aom_masked_sad4x16x4d ssse3/; + specialize qw/aom_masked_sad16x4x4d ssse3/; + specialize qw/aom_masked_sad8x32x4d ssse3/; + specialize qw/aom_masked_sad32x8x4d ssse3/; + specialize qw/aom_masked_sad64x16x4d ssse3/; + # + # Multi-block SAD, comparing a reference to N independent blocks + # + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; + if ($w != 128 && $h != 128) { + specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/; + } + } + specialize qw/aom_highbd_sad128x128x4d avx2/; + specialize qw/aom_highbd_sad128x64x4d avx2/; + specialize qw/aom_highbd_sad64x128x4d avx2/; + specialize qw/aom_highbd_sad64x64x4d sse2 avx2/; + specialize qw/aom_highbd_sad64x32x4d sse2 avx2/; + specialize qw/aom_highbd_sad32x64x4d sse2 avx2/; + specialize qw/aom_highbd_sad32x32x4d sse2 avx2/; + specialize qw/aom_highbd_sad32x16x4d sse2 avx2/; + specialize qw/aom_highbd_sad16x32x4d sse2 avx2/; + specialize qw/aom_highbd_sad16x16x4d sse2 avx2/; + specialize qw/aom_highbd_sad16x8x4d sse2 avx2/; + specialize qw/aom_highbd_sad8x16x4d sse2/; + specialize qw/aom_highbd_sad8x8x4d sse2/; + specialize qw/aom_highbd_sad8x4x4d sse2/; + specialize qw/aom_highbd_sad4x8x4d sse2/; + specialize qw/aom_highbd_sad4x4x4d sse2/; + + specialize qw/aom_highbd_sad4x16x4d sse2/; + specialize qw/aom_highbd_sad16x4x4d avx2 sse2/; + specialize qw/aom_highbd_sad8x32x4d sse2/; + specialize qw/aom_highbd_sad32x8x4d avx2 sse2/; + specialize qw/aom_highbd_sad16x64x4d avx2 sse2/; + specialize qw/aom_highbd_sad64x16x4d avx2 sse2/; + } + # + # Avg + # + add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p"; + specialize qw/aom_avg_8x8 sse2 neon/; + + add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p"; + specialize qw/aom_avg_4x4 sse2 neon/; + + add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; + specialize qw/aom_minmax_8x8 sse2/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/unsigned int aom_highbd_avg_8x8/, "const uint8_t *, int p"; + add_proto qw/unsigned int aom_highbd_avg_4x4/, "const uint8_t *, int p"; + add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; + } + + add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height"; + specialize qw/aom_int_pro_row sse2/; + + add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width"; + specialize qw/aom_int_pro_col sse2/; + + add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl"; + # TODO(kyslov@) bring back SSE2 by extending it to 128 block size + #specialize qw/aom_vector_var sse2/; + + # + # Hadamard transform and SATD for implementing the temporal dependency model + # + add_proto qw/void aom_hadamard_8x8/, "const 
int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_hadamard_8x8 sse2 neon/; + + add_proto qw/void aom_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_hadamard_16x16 avx2 sse2 neon/; + + add_proto qw/void aom_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_hadamard_32x32 avx2 sse2/; + + add_proto qw/void aom_hadamard_lp_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; + specialize qw/aom_hadamard_lp_8x8 sse2 neon/; + + add_proto qw/void aom_hadamard_lp_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; + specialize qw/aom_hadamard_lp_16x16 avx2 neon/; + + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_highbd_hadamard_8x8 avx2/; + + add_proto qw/void aom_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_highbd_hadamard_16x16 avx2/; + + add_proto qw/void aom_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_highbd_hadamard_32x32 avx2/; + } + add_proto qw/int aom_satd/, "const tran_low_t *coeff, int length"; + specialize qw/aom_satd avx2/; + + add_proto qw/int aom_satd_lp/, "const int16_t *coeff, int length"; + specialize qw/aom_satd_lp avx2 neon/; + + + # + # Structured Similarity (SSIM) + # + if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") { + add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64"; + + add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64"; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + } + } +} # CONFIG_AV1_ENCODER + +if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + + # + # Specialty Variance + # + add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + specialize qw/aom_get16x16var neon msa/; + specialize qw/aom_get8x8var sse2 neon msa/; + + + add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + + specialize qw/aom_mse16x16 sse2 avx2 neon msa/; + specialize qw/aom_mse16x8 sse2 
msa/; + specialize qw/aom_mse8x16 sse2 msa/; + specialize qw/aom_mse8x8 sse2 msa/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach $bd (8, 10, 12) { + add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void/, "aom_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + + specialize "aom_highbd_${bd}_mse16x16", qw/sse2/; + specialize "aom_highbd_${bd}_mse8x8", qw/sse2/; + } + } + + # + # + # + add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search"; + specialize qw/aom_upsampled_pred sse2/; + + add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search"; + specialize qw/aom_comp_avg_upsampled_pred sse2/; + + add_proto qw/void aom_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search"; + specialize qw/aom_dist_wtd_comp_avg_upsampled_pred ssse3/; + + add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int subpel_search"; + specialize qw/aom_comp_mask_upsampled_pred sse2/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search"; + specialize qw/aom_highbd_upsampled_pred sse2/; + + add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search"; + specialize 
qw/aom_highbd_comp_avg_upsampled_pred sse2/; + + add_proto qw/void aom_highbd_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search"; + specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2/; + } + + # + # + # + add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *"; + add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; + + specialize qw/aom_get_mb_ss sse2 msa/; + specialize qw/aom_get4x4sse_cs neon msa/; + + # + # Variance / Subpixel Variance / Subpixel Avg Variance + # + add_proto qw/unsigned int/, "aom_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int/, "aom_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param"; + } + specialize qw/aom_variance128x128 sse2 avx2 neon /; + specialize qw/aom_variance128x64 sse2 avx2 /; + specialize qw/aom_variance64x128 sse2 avx2 /; + specialize qw/aom_variance64x64 sse2 avx2 neon msa/; + specialize qw/aom_variance64x32 sse2 avx2 neon msa/; + specialize qw/aom_variance32x64 sse2 avx2 neon msa/; + specialize qw/aom_variance32x32 sse2 avx2 neon msa/; + specialize qw/aom_variance32x16 sse2 avx2 msa/; + specialize qw/aom_variance16x32 sse2 avx2 msa/; + specialize qw/aom_variance16x16 sse2 avx2 neon msa/; + specialize qw/aom_variance16x8 sse2 avx2 neon msa/; + specialize qw/aom_variance8x16 sse2 neon msa/; + specialize qw/aom_variance8x8 sse2 neon msa/; + specialize qw/aom_variance8x4 sse2 msa/; + specialize qw/aom_variance4x8 sse2 msa/; + specialize qw/aom_variance4x4 sse2 msa/; + + specialize qw/aom_sub_pixel_variance128x128 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance128x64 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x128 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x32 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x64 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x16 avx2 msa sse2 ssse3/; + 
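+  #
+  # The sub-pixel variants in this run first apply a bilinear filter at
+  # (xoffset, yoffset), given in eighth-pel steps, and then measure variance
+  # against the reference. A sketch of the underlying reference computation
+  # (illustrative, mirroring the usual _c implementation: the returned value
+  # is the SSE minus sum^2 / (w * h), i.e. the unnormalized variance):
+  #
+  #   uint32_t variance(const uint8_t *src, int src_stride,
+  #                     const uint8_t *ref, int ref_stride,
+  #                     int w, int h, uint32_t *sse) {
+  #     int64_t sum = 0;
+  #     uint64_t sq = 0;
+  #     for (int i = 0; i < h; ++i)
+  #       for (int j = 0; j < w; ++j) {
+  #         const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
+  #         sum += d;
+  #         sq += d * d;
+  #       }
+  #     *sse = (uint32_t)sq;
+  #     return (uint32_t)(sq - (uint64_t)(((int64_t)sum * sum) / (w * h)));
+  #   }
+  #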
specialize qw/aom_sub_pixel_variance16x32 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x16 avx2 neon msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x8 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x8 neon msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x4 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/; + + specialize qw/aom_sub_pixel_avg_variance128x128 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance128x64 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x128 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x32 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x64 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x16 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; + + specialize qw/aom_variance4x16 sse2/; + specialize qw/aom_variance16x4 sse2 avx2/; + specialize qw/aom_variance8x32 sse2/; + specialize qw/aom_variance32x8 sse2 avx2/; + specialize qw/aom_variance16x64 sse2 avx2/; + specialize qw/aom_variance64x16 sse2 avx2/; + + specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x4 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x64 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/; + + specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x8 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4 ssse3/; + + specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4 ssse3/; + specialize 
qw/aom_dist_wtd_sub_pixel_avg_variance8x32 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 ssse3/; + + specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128 ssse3/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach $bd (8, 10, 12) { + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + if ($w != 128 && $h != 128 && $w != 4 && $h != 4) { + specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2"; + } + # TODO(david.barker): When ext-partition-types is enabled, we currently + # don't have vectorized 4x16 highbd variance functions + if ($w == 4 && $h == 4) { + specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1"; + } + if ($w != 128 && $h != 128 && $w != 4) { + specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/; + specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/; + } + if ($w == 4 && $h == 4) { + specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1"; + specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1"; + } + + add_proto qw/uint32_t/, "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param"; + } + } + } + # + # Masked Variance / Masked Subpixel Variance + # + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse"; + specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/; + } + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach $bd ("_8_", "_10_", "_12_") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse"; 
+ specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/; + } + } + } + + # + # OBMC Variance / OBMC Subpixel Variance + # + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2/; + specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/; + } + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach $bd ("_", "_10_", "_12_") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/; + } + } + } + + add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/; + + add_proto qw/uint32_t 
aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; + + # + # Comp Avg + # + add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; + + add_proto qw/void aom_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param"; + specialize qw/aom_dist_wtd_comp_avg_pred ssse3/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + + add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance128x128 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance128x64 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance64x128 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance64x64 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance64x32 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance32x64 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance32x32 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance32x16 sse2/; + + 
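+    # Note on the highbd variants in this block: the uint8_t pointers in the
+    # prototypes actually carry 16-bit samples. Under the usual libaom
+    # convention a caller wraps its uint16_t buffers with CONVERT_TO_BYTEPTR
+    # and the kernel unwraps them again; a hypothetical call (illustrative
+    # only, with made-up buffers):
+    #
+    #   uint16_t src[32 * 16], ref[32 * 16];  /* 12-bit samples */
+    #   unsigned int sse;
+    #   unsigned int var = aom_highbd_12_variance32x16(
+    #       CONVERT_TO_BYTEPTR(src), 32, CONVERT_TO_BYTEPTR(ref), 32, &sse);
+    #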
add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance16x32 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance16x16 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance16x8 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance8x16 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance8x8 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance128x128 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance128x64 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance64x128 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance64x64 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance64x32 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance32x64 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance32x32 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance32x16 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance16x32 sse2 avx2/; + + add_proto 
qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance16x16 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance16x8 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance8x16 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance8x8 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance128x128 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance128x64 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance64x128 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance64x64 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance64x32 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance32x64 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance32x32 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance32x16 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance16x32 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance16x16 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance16x8 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance8x16 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance8x8 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/void aom_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/void aom_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/void aom_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/unsigned int aom_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_mse16x16 sse2/; + + add_proto qw/unsigned int aom_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_mse8x8 sse2/; + + add_proto qw/unsigned int aom_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_mse16x16 sse2/; + + add_proto qw/unsigned int aom_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_mse8x8 sse2/; + + add_proto qw/unsigned int aom_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, 
const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_mse16x16 sse2/; + + add_proto qw/unsigned int aom_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_mse8x8 sse2/; + + add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; + + add_proto qw/void aom_highbd_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param"; + specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2/; + } + # + # Subpixel Variance + # + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2/; + + add_proto qw/uint32_t 
aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, 
uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const 
uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  }
+
+
+  add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+  specialize qw/aom_comp_mask_pred ssse3 avx2/;
+
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+    specialize qw/aom_highbd_comp_mask_pred sse2 avx2/;
+  }
+
+} # CONFIG_AV1_ENCODER
+
+1;
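[Editorial note] The add_proto/specialize pairs above are libaom's RTCD ("run-time CPU detection") definitions: add_proto declares the signature of a DSP function, and specialize names the SIMD flavors that exist for it (entries with no specialize line get only the portable C version). The C below is a hedged sketch of the kind of dispatch the RTCD generator derives from one such pair; the real aom_dsp_rtcd.h produced at build time differs in detail, and the HAS_SSE2 value and setup_rtcd_internal shape here are illustrative assumptions patterned on the generated code, not part of this patch:

#include <stdint.h>

/* Illustrative CPU-capability flag; the generated headers define the real ones. */
#define HAS_SSE2 0x02

/* The generator emits a _c prototype plus one prototype per specialized ISA... */
unsigned int aom_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride,
                                     const uint8_t *ref_ptr, int recon_stride,
                                     unsigned int *sse);
unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src_ptr,
                                        int source_stride,
                                        const uint8_t *ref_ptr,
                                        int recon_stride, unsigned int *sse);

/* ...plus a function pointer that is bound once, at startup, to the best
 * implementation the running CPU supports. */
unsigned int (*aom_highbd_8_mse16x16)(const uint8_t *src_ptr,
                                      int source_stride,
                                      const uint8_t *ref_ptr, int recon_stride,
                                      unsigned int *sse);

static void setup_rtcd_internal(int flags) {
  aom_highbd_8_mse16x16 = aom_highbd_8_mse16x16_c; /* portable fallback */
  if (flags & HAS_SSE2) aom_highbd_8_mse16x16 = aom_highbd_8_mse16x16_sse2;
}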
diff --git a/libs/libaom/src/aom_dsp/aom_filter.h b/libs/libaom/src/aom_dsp/aom_filter.h
new file mode 100644
index 000000000..00686ac38
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/aom_filter.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_FILTER_H_
+#define AOM_AOM_DSP_AOM_FILTER_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+
+#define SUBPEL_BITS 4
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
+#define SUBPEL_TAPS 8
+
+#define SCALE_SUBPEL_BITS 10
+#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
+#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
+#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
+#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)
+
+#define RS_SUBPEL_BITS 6
+#define RS_SUBPEL_MASK ((1 << RS_SUBPEL_BITS) - 1)
+#define RS_SCALE_SUBPEL_BITS 14
+#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
+#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
+#define RS_SCALE_EXTRA_OFF (1 << (RS_SCALE_EXTRA_BITS - 1))
+
+typedef int16_t InterpKernel[SUBPEL_TAPS];
+
+#define BIL_SUBPEL_BITS 3
+#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)
+
+// 2 tap bilinear filters
+static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
+  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_AOM_FILTER_H_
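[Editorial note] In aom_filter.h above, each row of bilinear_filters_2t sums to 128 = (1 << FILTER_BITS), so a filtered value is normalized with a rounding shift of FILTER_BITS. A minimal usage sketch, assuming the header's macros are in scope; bilinear_sample is a hypothetical helper for illustration, not part of the patch:

#include <stdint.h>

/* Filter one pixel pair with the 2-tap bilinear kernel selected by a
 * 3-bit subpel offset (0..7); the two taps always sum to 128. */
static uint8_t bilinear_sample(const uint8_t *src, int subpel) {
  const uint8_t *k = bilinear_filters_2t[subpel];
  const int acc = src[0] * k[0] + src[1] * k[1];
  return (uint8_t)((acc + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
}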
diff --git a/libs/libaom/src/aom_dsp/aom_simd.h b/libs/libaom/src/aom_dsp/aom_simd.h
new file mode 100644
index 000000000..ab950ca55
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/aom_simd.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_SIMD_H_
+#define AOM_AOM_DSP_AOM_SIMD_H_
+
+#include <stdint.h>
+
+#if defined(_WIN32)
+#include <intrin.h>
+#endif
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_simd_inline.h"
+
+#define SIMD_CHECK 1  // Sanity checks in C equivalents
+
+#if HAVE_NEON
+#include "simd/v256_intrinsics_arm.h"
+// VS compiling for 32 bit targets does not support vector types in
+// structs as arguments, which makes the v256 type of the intrinsics
+// hard to support, so optimizations for this target are disabled.
+#elif HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__))
+#include "simd/v256_intrinsics_x86.h"
+#else
+#include "simd/v256_intrinsics.h"
+#endif
+
+#endif  // AOM_AOM_DSP_AOM_SIMD_H_
diff --git a/libs/libaom/src/aom_dsp/aom_simd_inline.h b/libs/libaom/src/aom_dsp/aom_simd_inline.h
new file mode 100644
index 000000000..eb333f6f6
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/aom_simd_inline.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_SIMD_INLINE_H_
+#define AOM_AOM_DSP_AOM_SIMD_INLINE_H_
+
+#include "aom/aom_integer.h"
+
+#ifndef SIMD_INLINE
+#define SIMD_INLINE static AOM_FORCE_INLINE
+#endif
+
+#endif  // AOM_AOM_DSP_AOM_SIMD_INLINE_H_
diff --git a/libs/libaom/src/aom_dsp/arm/avg_neon.c b/libs/libaom/src/aom_dsp/arm/avg_neon.c
new file mode 100644
index 000000000..af3769edf
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/arm/avg_neon.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+unsigned int aom_avg_4x4_neon(const uint8_t *a, int a_stride) {
+  const uint8x16_t b = load_unaligned_u8q(a, a_stride);
+  const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
+#if defined(__aarch64__)
+  const uint32_t d = vaddlvq_u16(c);
+  return (d + 8) >> 4;
+#else
+  const uint32x2_t d = horizontal_add_u16x8(c);
+  return vget_lane_u32(vrshr_n_u32(d, 4), 0);
+#endif
+}
+
+unsigned int aom_avg_8x8_neon(const uint8_t *a, int a_stride) {
+  uint16x8_t sum;
+  uint32x2_t d;
+  uint8x8_t b = vld1_u8(a);
+  a += a_stride;
+  uint8x8_t c = vld1_u8(a);
+  a += a_stride;
+  sum = vaddl_u8(b, c);
+
+  for (int i = 0; i < 6; ++i) {
+    const uint8x8_t e = vld1_u8(a);
+    a += a_stride;
+    sum = vaddw_u8(sum, e);
+  }
+
+  d = horizontal_add_u16x8(sum);
+
+  return vget_lane_u32(vrshr_n_u32(d, 6), 0);
+}
+
+int aom_satd_lp_neon(const int16_t *coeff, int length) {
+  const int16x4_t zero = vdup_n_s16(0);
+  int32x4_t accum = vdupq_n_s32(0);
+
+  do {
+    const int16x8_t src0 = vld1q_s16(coeff);
+    const int16x8_t src8 = vld1q_s16(coeff + 8);
+    accum = vabal_s16(accum, vget_low_s16(src0), zero);
+    accum = vabal_s16(accum, vget_high_s16(src0), zero);
+    accum = vabal_s16(accum, vget_low_s16(src8), zero);
+    accum = vabal_s16(accum, vget_high_s16(src8), zero);
+    length -= 16;
+    coeff += 16;
+  } while (length != 0);
+
+  {
+    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+    const int64x2_t s0 = vpaddlq_s32(accum);  // cascading summation of 'accum'.
+    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+                                  vreinterpret_s32_s64(vget_high_s64(s0)));
+    const int satd = vget_lane_s32(s1, 0);
+    return satd;
+  }
+}
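[Editorial note] aom_avg_4x4_neon and aom_avg_8x8_neon above compute the rounded mean of a pixel block; the NEON rounding shift vrshr_n_u32(d, n) is (d + (1 << (n - 1))) >> n. A scalar restatement of the 8x8 arithmetic, useful for cross-checking the NEON path (avg_8x8_c is a hypothetical name, not part of the patch):

#include <stdint.h>

/* Scalar reference for the rounded 8x8 average computed by aom_avg_8x8_neon:
 * sum the 64 pixels, then apply the rounding shift (sum + 32) >> 6. */
static unsigned int avg_8x8_c(const uint8_t *a, int a_stride) {
  unsigned int sum = 0;
  for (int r = 0; r < 8; ++r, a += a_stride) {
    for (int c = 0; c < 8; ++c) sum += a[c];
  }
  return (sum + 32) >> 6;
}

The blend_a64_mask_neon.c file that follows implements AV1's alpha-64 mask blend, conceptually dst = (mask * src0 + (64 - mask) * src1 + 32) >> 6, applied to the intermediate compound-prediction buffer, with the round_offset/round_bits steps undoing the convolve-stage scaling.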
diff --git a/libs/libaom/src/aom_dsp/arm/blend_a64_mask_neon.c b/libs/libaom/src/aom_dsp/arm/blend_a64_mask_neon.c
new file mode 100644
index 000000000..e7f08a5fd
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/arm/blend_a64_mask_neon.c
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1,
+                            const int16x8_t v_maxval, int16x8_t *res) {
+  int32x4_t im_res_low, im_res_high;
+  const int16x8_t max_minus_mask = vsubq_s16(v_maxval, mask);
+
+  im_res_low = vmull_s16(vget_low_s16(mask), vget_low_s16(src_0));
+  im_res_low =
+      vmlal_s16(im_res_low, vget_low_s16(max_minus_mask), vget_low_s16(src_1));
+
+  im_res_high = vmull_s16(vget_high_s16(mask), vget_high_s16(src_0));
+  im_res_high = vmlal_s16(im_res_high, vget_high_s16(max_minus_mask),
+                          vget_high_s16(src_1));
+
+  *res = vcombine_s16(vshrn_n_s32(im_res_low, AOM_BLEND_A64_ROUND_BITS),
+                      vshrn_n_s32(im_res_high, AOM_BLEND_A64_ROUND_BITS));
+}
+
+static INLINE void blend_8x4(uint8_t *dst, uint32_t dst_stride,
+                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
+                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+                             int16x8_t mask0, int16x8_t mask1, int16x8_t mask2,
+                             int16x8_t mask3, const int16x8_t v_maxval,
+                             const uint16x8_t vec_round_offset,
+                             const int16x8_t vec_round_bits) {
+  int16x8_t src0_0, src0_1, src0_2, src0_3;
+  int16x8_t src1_0, src1_1, src1_2, src1_3;
+  int16x8_t im_res_0, im_res_1, im_res_2, im_res_3;
+
+  load_s16_8x4((int16_t *)src0, (int32_t)src0_stride, &src0_0, &src0_1,
+               &src0_2, &src0_3);
+  load_s16_8x4((int16_t *)src1, (int32_t)src1_stride, &src1_0, &src1_1,
+               &src1_2, &src1_3);
+
+  blend8x1(mask0, src0_0, src1_0, v_maxval, &im_res_0);
+  blend8x1(mask1, src0_1, src1_1, v_maxval, &im_res_1);
+  blend8x1(mask2, src0_2, src1_2, v_maxval, &im_res_2);
+  blend8x1(mask3, src0_3, src1_3, v_maxval, &im_res_3);
+
+  uint16x8_t im_res1_0 =
+      vqsubq_u16(vreinterpretq_u16_s16(im_res_0), vec_round_offset);
+  uint16x8_t im_res1_1 =
+      vqsubq_u16(vreinterpretq_u16_s16(im_res_1), vec_round_offset);
+  uint16x8_t im_res1_2 =
+      vqsubq_u16(vreinterpretq_u16_s16(im_res_2), vec_round_offset);
+  uint16x8_t im_res1_3 =
+      vqsubq_u16(vreinterpretq_u16_s16(im_res_3), vec_round_offset);
+
+  im_res_0 = vshlq_s16(vreinterpretq_s16_u16(im_res1_0), vec_round_bits);
+  im_res_1 = vshlq_s16(vreinterpretq_s16_u16(im_res1_1), vec_round_bits);
+  im_res_2 = vshlq_s16(vreinterpretq_s16_u16(im_res1_2), vec_round_bits);
+  im_res_3 = vshlq_s16(vreinterpretq_s16_u16(im_res1_3), vec_round_bits);
+
+  vst1_u8((dst + 0 * dst_stride), vqmovun_s16(im_res_0));
+  vst1_u8((dst + 1 * dst_stride), vqmovun_s16(im_res_1));
+  vst1_u8((dst + 2 * dst_stride), vqmovun_s16(im_res_2));
+  vst1_u8((dst + 3 * dst_stride), vqmovun_s16(im_res_3));
+}
+
+static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride,
+                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
+                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+                             int16x4_t mask0, int16x4_t mask1, int16x4_t mask2,
+                             int16x4_t mask3, const int16x8_t v_maxval,
+                             const uint16x8_t vec_round_offset,
+                             const int16x8_t vec_round_bits) {
+  int16x8_t src0_0, src0_1;
+  int16x8_t src1_0, src1_1;
+  uint64x2_t tu0 = vdupq_n_u64(0), tu1 = vdupq_n_u64(0), tu2 = vdupq_n_u64(0),
+             tu3 = vdupq_n_u64(0);
+  int16x8_t mask0_1, mask2_3;
+  int16x8_t res0, res1;
+
+  load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1);
+  load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3);
+
+  src0_0
= vreinterpretq_s16_u64(tu0); + src0_1 = vreinterpretq_s16_u64(tu1); + + src1_0 = vreinterpretq_s16_u64(tu2); + src1_1 = vreinterpretq_s16_u64(tu3); + + mask0_1 = vcombine_s16(mask0, mask1); + mask2_3 = vcombine_s16(mask2, mask3); + + blend8x1(mask0_1, src0_0, src1_0, v_maxval, &res0); + blend8x1(mask2_3, src0_1, src1_1, v_maxval, &res1); + + uint16x8_t im_res_0 = + vqsubq_u16(vreinterpretq_u16_s16(res0), vec_round_offset); + uint16x8_t im_res_1 = + vqsubq_u16(vreinterpretq_u16_s16(res1), vec_round_offset); + + src0_0 = vshlq_s16(vreinterpretq_s16_u16(im_res_0), vec_round_bits); + src0_1 = vshlq_s16(vreinterpretq_s16_u16(im_res_1), vec_round_bits); + + uint8x8_t res_0 = vqmovun_s16(src0_0); + uint8x8_t res_1 = vqmovun_s16(src0_1); + + vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), vreinterpret_u32_u8(res_0), + 0); + vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), vreinterpret_u32_u8(res_0), + 1); + vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), vreinterpret_u32_u8(res_1), + 0); + vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), vreinterpret_u32_u8(res_1), + 1); +} + +void aom_lowbd_blend_a64_d16_mask_neon( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + int i = 0; + const int bd = 8; + int w_tmp = w; + const uint8_t *mask_tmp = mask; + const CONV_BUF_TYPE *src0_tmp = src0; + const CONV_BUF_TYPE *src1_tmp = src1; + uint8_t *dst_tmp = dst; + + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + uint8x8_t s0, s1, s2, s3; + uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0), + tu3 = vdup_n_u32(0); + uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7; + int16x8_t mask0, mask1, mask2, mask3; + int16x8_t mask4, mask5, mask6, mask7; + int32x4_t m0_32, m1_32, m2_32, m3_32; + int32x4_t m4_32, m5_32, m6_32, m7_32; + uint8x8_t mask0_l, mask1_l, mask2_l, mask3_l; + uint8x8_t mask4_l, mask5_l, mask6_l, mask7_l; + int16x4_t mask0_low, mask1_low, mask2_low, mask3_low; + const uint16x4_t vec_zero = vdup_n_u16(0); + const uint16_t offset = round_offset - (1 << (round_bits - 1)); + const int16x8_t v_maxval = vdupq_n_s16(AOM_BLEND_A64_MAX_ALPHA); + const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits); + const uint16x8_t vec_offset = vdupq_n_u16(offset); + + if (subw == 0 && subh == 0) { + if (w_tmp > 7) { + do { + w_tmp = w; + do { + load_u8_8x4(mask_tmp, mask_stride, &s0, &s1, &s2, &s3); + + mask0 = vmovl_s8(vreinterpret_s8_u8(s0)); + mask1 = vmovl_s8(vreinterpret_s8_u8(s1)); + mask2 = vmovl_s8(vreinterpret_s8_u8(s2)); + mask3 = vmovl_s8(vreinterpret_s8_u8(s3)); + + blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0, mask1, mask2, mask3, v_maxval, + vec_offset, vec_round_bits); + + w_tmp -= 8; + mask_tmp += 8; + dst_tmp += 8; + src0_tmp += 8; + src1_tmp += 8; + } while (w_tmp > 7); + i += 4; + mask_tmp += (4 * mask_stride) - w; + dst_tmp += (4 * dst_stride) - w; + src0_tmp += (4 * 
src0_stride) - w; + src1_tmp += (4 * src1_stride) - w; + } while (i < h); + } else { + do { + load_unaligned_u8_4x4(mask_tmp, mask_stride, &tu0, &tu1); + + mask0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0))); + mask1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1))); + + mask0_low = vget_low_s16(mask0); + mask1_low = vget_high_s16(mask0); + mask2_low = vget_low_s16(mask1); + mask3_low = vget_high_s16(mask1); + + blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, + v_maxval, vec_offset, vec_round_bits); + + i += 4; + mask_tmp += (4 * mask_stride); + dst_tmp += (4 * dst_stride); + src0_tmp += (4 * src0_stride); + src1_tmp += (4 * src1_stride); + } while (i < h); + } + } else if (subw == 1 && subh == 1) { + if (w_tmp > 7) { + do { + w_tmp = w; + do { + load_u8_16x8(mask_tmp, mask_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + + mask0 = + vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t0), vget_low_u8(t1))); + mask1 = + vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t2), vget_low_u8(t3))); + mask2 = + vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t4), vget_low_u8(t5))); + mask3 = + vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t6), vget_low_u8(t7))); + + mask4 = vreinterpretq_s16_u16( + vaddl_u8(vget_high_u8(t0), vget_high_u8(t1))); + mask5 = vreinterpretq_s16_u16( + vaddl_u8(vget_high_u8(t2), vget_high_u8(t3))); + mask6 = vreinterpretq_s16_u16( + vaddl_u8(vget_high_u8(t4), vget_high_u8(t5))); + mask7 = vreinterpretq_s16_u16( + vaddl_u8(vget_high_u8(t6), vget_high_u8(t7))); + + m0_32 = vpaddlq_s16(mask0); + m1_32 = vpaddlq_s16(mask1); + m2_32 = vpaddlq_s16(mask2); + m3_32 = vpaddlq_s16(mask3); + + m4_32 = vpaddlq_s16(mask4); + m5_32 = vpaddlq_s16(mask5); + m6_32 = vpaddlq_s16(mask6); + m7_32 = vpaddlq_s16(mask7); + + mask0 = + vcombine_s16(vqrshrn_n_s32(m0_32, 2), vqrshrn_n_s32(m4_32, 2)); + mask1 = + vcombine_s16(vqrshrn_n_s32(m1_32, 2), vqrshrn_n_s32(m5_32, 2)); + mask2 = + vcombine_s16(vqrshrn_n_s32(m2_32, 2), vqrshrn_n_s32(m6_32, 2)); + mask3 = + vcombine_s16(vqrshrn_n_s32(m3_32, 2), vqrshrn_n_s32(m7_32, 2)); + + blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0, mask1, mask2, mask3, v_maxval, + vec_offset, vec_round_bits); + + w_tmp -= 8; + mask_tmp += 16; + dst_tmp += 8; + src0_tmp += 8; + src1_tmp += 8; + } while (w_tmp > 7); + i += 4; + mask_tmp += (8 * mask_stride) - (2 * w); + dst_tmp += (4 * dst_stride) - w; + src0_tmp += (4 * src0_stride) - w; + src1_tmp += (4 * src1_stride) - w; + } while (i < h); + } else { + do { + load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l, + &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l); + + mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l)); + mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l)); + mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l)); + mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l)); + + m0_32 = vpaddlq_s16(mask0); + m1_32 = vpaddlq_s16(mask1); + m2_32 = vpaddlq_s16(mask2); + m3_32 = vpaddlq_s16(mask3); + + mask0_low = vqrshrn_n_s32(m0_32, 2); + mask1_low = vqrshrn_n_s32(m1_32, 2); + mask2_low = vqrshrn_n_s32(m2_32, 2); + mask3_low = vqrshrn_n_s32(m3_32, 2); + + blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, + v_maxval, vec_offset, vec_round_bits); + + i += 4; + mask_tmp += (8 * mask_stride); + dst_tmp += (4 * dst_stride); + src0_tmp += (4 * src0_stride); + src1_tmp += (4 * src1_stride); + } 
while (i < h); + } + } else if (subw == 1 && subh == 0) { + if (w_tmp > 7) { + do { + w_tmp = w; + do { + load_u8_16x4(mask_tmp, mask_stride, &t0, &t1, &t2, &t3); + + mask0 = vreinterpretq_s16_u16(vcombine_u16( + vpaddl_u8(vget_low_u8(t0)), vpaddl_u8(vget_high_u8(t0)))); + mask1 = vreinterpretq_s16_u16(vcombine_u16( + vpaddl_u8(vget_low_u8(t1)), vpaddl_u8(vget_high_u8(t1)))); + mask2 = vreinterpretq_s16_u16(vcombine_u16( + vpaddl_u8(vget_low_u8(t2)), vpaddl_u8(vget_high_u8(t2)))); + mask3 = vreinterpretq_s16_u16(vcombine_u16( + vpaddl_u8(vget_low_u8(t3)), vpaddl_u8(vget_high_u8(t3)))); + + mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1)); + mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1)); + mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1)); + mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1)); + + blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0, mask1, mask2, mask3, v_maxval, + vec_offset, vec_round_bits); + w_tmp -= 8; + mask_tmp += 16; + dst_tmp += 8; + src0_tmp += 8; + src1_tmp += 8; + } while (w_tmp > 7); + i += 4; + mask_tmp += (4 * mask_stride) - (2 * w); + dst_tmp += (4 * dst_stride) - w; + src0_tmp += (4 * src0_stride) - w; + src1_tmp += (4 * src1_stride) - w; + } while (i < h); + } else { + do { + load_u8_8x4(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l, + &mask3_l); + + mask0 = + vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask0_l), vec_zero)); + mask1 = + vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask1_l), vec_zero)); + mask2 = + vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask2_l), vec_zero)); + mask3 = + vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask3_l), vec_zero)); + + mask0_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask0, 1))); + mask1_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask1, 1))); + mask2_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask2, 1))); + mask3_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask3, 1))); + + blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, + v_maxval, vec_offset, vec_round_bits); + + i += 4; + mask_tmp += (4 * mask_stride); + dst_tmp += (4 * dst_stride); + src0_tmp += (4 * src0_stride); + src1_tmp += (4 * src1_stride); + } while (i < h); + } + } else { + if (w_tmp > 7) { + do { + w_tmp = w; + do { + load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l, + &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l); + + mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l)); + mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l)); + mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l)); + mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l)); + + mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1)); + mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1)); + mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1)); + mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1)); + + blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0, mask1, mask2, mask3, v_maxval, + vec_offset, vec_round_bits); + + w_tmp -= 8; + mask_tmp += 8; + dst_tmp += 8; + src0_tmp += 8; + src1_tmp += 8; + } while (w_tmp > 7); + i += 4; + mask_tmp += (8 * mask_stride) - w; + dst_tmp += (4 * dst_stride) - w; + src0_tmp += (4 * src0_stride) - w; + src1_tmp += (4 * src1_stride) - w; + } while (i < h); + } else { + do { + load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &tu0, &tu1); + load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &tu2, + &tu3); + + s0 = vreinterpret_u8_u32(tu0); + s1 = vreinterpret_u8_u32(tu1); + s2 = vreinterpret_u8_u32(tu2); + s3 = vreinterpret_u8_u32(tu3); + + 
mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2));
+        mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3));
+
+        mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
+        mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
+
+        mask0_low = vget_low_s16(mask0);
+        mask1_low = vget_high_s16(mask0);
+        mask2_low = vget_low_s16(mask1);
+        mask3_low = vget_high_s16(mask1);
+
+        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
+                  v_maxval, vec_offset, vec_round_bits);
+
+        i += 4;
+        mask_tmp += (8 * mask_stride);
+        dst_tmp += (4 * dst_stride);
+        src0_tmp += (4 * src0_stride);
+        src1_tmp += (4 * src1_stride);
+      } while (i < h);
+    }
+  }
+}
diff --git a/libs/libaom/src/aom_dsp/arm/fwd_txfm_neon.c b/libs/libaom/src/aom_dsp/arm/fwd_txfm_neon.c
new file mode 100644
index 000000000..ce9352347
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/arm/fwd_txfm_neon.c
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+static void aom_fdct4x4_helper(const int16_t *input, int stride,
+                               int16x4_t *input_0, int16x4_t *input_1,
+                               int16x4_t *input_2, int16x4_t *input_3) {
+  *input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+  *input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+  *input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+  *input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+  // If the very first value != 0, then add 1.
+  if (input[0] != 0) {
+    const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
+    *input_0 = vadd_s16(*input_0, one);
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    const int16x8_t input_01 = vcombine_s16(*input_0, *input_1);
+    const int16x8_t input_32 = vcombine_s16(*input_3, *input_2);
+
+    // in_0 +/- in_3, in_1 +/- in_2
+    const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+    const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+    // step_0 +/- step_1, step_2 +/- step_3
+    const int16x4_t s_0 = vget_low_s16(s_01);
+    const int16x4_t s_1 = vget_high_s16(s_01);
+    const int16x4_t s_2 = vget_high_s16(s_32);
+    const int16x4_t s_3 = vget_low_s16(s_32);
+
+    // (s_0 +/- s_1) * cospi_16_64
+    // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
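+    // (Editorial note, not in the original file:) vaddl/vsubl below widen to
+    // 32 bits before the cospi_16_64 multiply so the 4-point DCT butterfly
+    // stays exact; out_0/out_2 come from (s_0 +/- s_1) * cospi_16_64, while
+    // out_1/out_3 rotate (s_2, s_3) by cospi_8_64/cospi_24_64, each product
+    // rounded back to 16 bits with vrshrn_n_s32(..., DCT_CONST_BITS).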
+ const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1); + const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1); + const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64); + const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64); + + // fdct_round_shift + int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS); + int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64); + const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64); + + const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64); + const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64); + + // fdct_round_shift + int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS); + int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS); + + transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3); + + *input_0 = out_0; + *input_1 = out_1; + *input_2 = out_2; + *input_3 = out_3; + } +} + +void aom_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + // input[M * stride] * 16 + int16x4_t input_0, input_1, input_2, input_3; + + aom_fdct4x4_helper(input, stride, &input_0, &input_1, &input_2, &input_3); + + // Not quite a rounding shift. Only add 1 despite shifting by 2. + const int16x8_t one = vdupq_n_s16(1); + int16x8_t out_01 = vcombine_s16(input_0, input_1); + int16x8_t out_23 = vcombine_s16(input_2, input_3); + out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2); + out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2); + store_s16q_to_tran_low(final_output + 0 * 8, out_01); + store_s16q_to_tran_low(final_output + 1 * 8, out_23); +} + +void aom_fdct4x4_lp_neon(const int16_t *input, int16_t *final_output, + int stride) { + // input[M * stride] * 16 + int16x4_t input_0, input_1, input_2, input_3; + + aom_fdct4x4_helper(input, stride, &input_0, &input_1, &input_2, &input_3); + + // Not quite a rounding shift. Only add 1 despite shifting by 2. 
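+  // (Editorial note, not in the original file:) the bias below yields
+  // (x + 1) >> 2 rather than the true rounding (x + 2) >> 2, presumably to
+  // keep the output bit-exact with the scalar C reference.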
+ const int16x8_t one = vdupq_n_s16(1); + int16x8_t out_01 = vcombine_s16(input_0, input_1); + int16x8_t out_23 = vcombine_s16(input_2, input_3); + out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2); + out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2); + vst1q_s16(final_output + 0 * 8, out_01); + vst1q_s16(final_output + 1 * 8, out_23); +} + +void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { + // stage 1 + int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); + int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); + int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); + int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); + int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); + int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); + int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); + int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); + for (int i = 0; i < 2; ++i) { + int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7; + const int16x8_t v_s0 = vaddq_s16(input_0, input_7); + const int16x8_t v_s1 = vaddq_s16(input_1, input_6); + const int16x8_t v_s2 = vaddq_s16(input_2, input_5); + const int16x8_t v_s3 = vaddq_s16(input_3, input_4); + const int16x8_t v_s4 = vsubq_s16(input_3, input_4); + const int16x8_t v_s5 = vsubq_s16(input_2, input_5); + const int16x8_t v_s6 = vsubq_s16(input_1, input_6); + const int16x8_t v_s7 = vsubq_s16(input_0, input_7); + // fdct4(step, step); + int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); + int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); + int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); + int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); + // fdct4(step, step); + int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); + int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); + v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); + v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 + out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 + out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 + out_6 = vcombine_s16(f, 
h); // 24 25 26 27 64 65 66 67 + } + // Stage 2 + v_x0 = vsubq_s16(v_s6, v_s5); + v_x1 = vaddq_s16(v_s6, v_s5); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x8_t ab = vcombine_s16(a, b); + const int16x8_t cd = vcombine_s16(c, d); + // Stage 3 + v_x0 = vaddq_s16(v_s4, ab); + v_x1 = vsubq_s16(v_s4, ab); + v_x2 = vsubq_s16(v_s7, cd); + v_x3 = vaddq_s16(v_s7, cd); + } + // Stage 4 + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); + v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); + v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); + v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); + v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); + v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); + v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); + v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); + v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); + v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); + v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 + out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 + out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 + out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 + } + // transpose 8x8 + { + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + // 04 05 06 07 44 45 46 47 + // 14 15 16 17 54 55 56 57 + // 24 25 26 27 64 65 66 67 + // 34 35 36 37 74 75 76 77 + const int32x4x2_t r02_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2)); + const int32x4x2_t r13_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3)); + const int32x4x2_t r46_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6)); + const int32x4x2_t r57_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7)); + const int16x8x2_t r01_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), + vreinterpretq_s16_s32(r13_s32.val[0])); + const int16x8x2_t 
r23_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), + vreinterpretq_s16_s32(r13_s32.val[1])); + const int16x8x2_t r45_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), + vreinterpretq_s16_s32(r57_s32.val[0])); + const int16x8x2_t r67_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), + vreinterpretq_s16_s32(r57_s32.val[1])); + input_0 = r01_s16.val[0]; + input_1 = r01_s16.val[1]; + input_2 = r23_s16.val[0]; + input_3 = r23_s16.val[1]; + input_4 = r45_s16.val[0]; + input_5 = r45_s16.val[1]; + input_6 = r67_s16.val[0]; + input_7 = r67_s16.val[1]; + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } // for + { + // from aom_dct_sse2.c + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15); + const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15); + const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15); + const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15); + const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15); + const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15); + const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15); + const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15); + input_0 = vhsubq_s16(input_0, sign_in0); + input_1 = vhsubq_s16(input_1, sign_in1); + input_2 = vhsubq_s16(input_2, sign_in2); + input_3 = vhsubq_s16(input_3, sign_in3); + input_4 = vhsubq_s16(input_4, sign_in4); + input_5 = vhsubq_s16(input_5, sign_in5); + input_6 = vhsubq_s16(input_6, sign_in6); + input_7 = vhsubq_s16(input_7, sign_in7); + // store results + vst1q_s16(&final_output[0 * 8], input_0); + vst1q_s16(&final_output[1 * 8], input_1); + vst1q_s16(&final_output[2 * 8], input_2); + vst1q_s16(&final_output[3 * 8], input_3); + vst1q_s16(&final_output[4 * 8], input_4); + vst1q_s16(&final_output[5 * 8], input_5); + vst1q_s16(&final_output[6 * 8], input_6); + vst1q_s16(&final_output[7 * 8], input_7); + } +} + +void aom_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { + int r; + int16x8_t sum = vld1q_s16(&input[0]); + for (r = 1; r < 8; ++r) { + const int16x8_t input_00 = vld1q_s16(&input[r * stride]); + sum = vaddq_s16(sum, input_00); + } + { + const int32x4_t a = vpaddlq_s16(sum); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); + output[1] = 0; + } +} diff --git a/libs/libaom/src/aom_dsp/arm/hadamard_neon.c b/libs/libaom/src/aom_dsp/arm/hadamard_neon.c new file mode 100644 index 000000000..929792ab3 --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/hadamard_neon.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "av1/common/arm/mem_neon.h" +#include "av1/common/arm/transpose_neon.h" + +static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + const int16x8_t b0 = vaddq_s16(*a0, *a1); + const int16x8_t b1 = vsubq_s16(*a0, *a1); + const int16x8_t b2 = vaddq_s16(*a2, *a3); + const int16x8_t b3 = vsubq_s16(*a2, *a3); + const int16x8_t b4 = vaddq_s16(*a4, *a5); + const int16x8_t b5 = vsubq_s16(*a4, *a5); + const int16x8_t b6 = vaddq_s16(*a6, *a7); + const int16x8_t b7 = vsubq_s16(*a6, *a7); + + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); + const int16x8_t c4 = vaddq_s16(b4, b6); + const int16x8_t c5 = vaddq_s16(b5, b7); + const int16x8_t c6 = vsubq_s16(b4, b6); + const int16x8_t c7 = vsubq_s16(b5, b7); + + *a0 = vaddq_s16(c0, c4); + *a1 = vsubq_s16(c2, c6); + *a2 = vsubq_s16(c0, c4); + *a3 = vaddq_s16(c2, c6); + *a4 = vaddq_s16(c3, c7); + *a5 = vsubq_s16(c3, c7); + *a6 = vsubq_s16(c1, c5); + *a7 = vaddq_s16(c1, c5); +} + +void aom_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int16x8_t a0 = vld1q_s16(src_diff); + int16x8_t a1 = vld1q_s16(src_diff + src_stride); + int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); + int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride); + int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride); + int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride); + int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride); + int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + // Skip the second transpose because it is not required. + + store_s16q_to_tran_low(coeff + 0, a0); + store_s16q_to_tran_low(coeff + 8, a1); + store_s16q_to_tran_low(coeff + 16, a2); + store_s16q_to_tran_low(coeff + 24, a3); + store_s16q_to_tran_low(coeff + 32, a4); + store_s16q_to_tran_low(coeff + 40, a5); + store_s16q_to_tran_low(coeff + 48, a6); + store_s16q_to_tran_low(coeff + 56, a7); +} + +void aom_hadamard_lp_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + int16x8_t a0 = vld1q_s16(src_diff); + int16x8_t a1 = vld1q_s16(src_diff + src_stride); + int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); + int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride); + int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride); + int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride); + int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride); + int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + // Skip the second transpose because it is not required. + + vst1q_s16(coeff + 0, a0); + vst1q_s16(coeff + 8, a1); + vst1q_s16(coeff + 16, a2); + vst1q_s16(coeff + 24, a3); + vst1q_s16(coeff + 32, a4); + vst1q_s16(coeff + 40, a5); + vst1q_s16(coeff + 48, a6); + vst1q_s16(coeff + 56, a7); +} + +void aom_hadamard_lp_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first.
*/ + aom_hadamard_lp_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, + coeff + 0); + /* Top right. */ + aom_hadamard_lp_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, + coeff + 64); + /* Bottom left. */ + aom_hadamard_lp_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, + coeff + 128); + /* Bottom right. */ + aom_hadamard_lp_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, + coeff + 192); + + for (int i = 0; i < 64; i += 8) { + const int16x8_t a0 = vld1q_s16(coeff + 0); + const int16x8_t a1 = vld1q_s16(coeff + 64); + const int16x8_t a2 = vld1q_s16(coeff + 128); + const int16x8_t a3 = vld1q_s16(coeff + 192); + + const int16x8_t b0 = vhaddq_s16(a0, a1); + const int16x8_t b1 = vhsubq_s16(a0, a1); + const int16x8_t b2 = vhaddq_s16(a2, a3); + const int16x8_t b3 = vhsubq_s16(a2, a3); + + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); + + vst1q_s16(coeff + 0, c0); + vst1q_s16(coeff + 64, c1); + vst1q_s16(coeff + 128, c2); + vst1q_s16(coeff + 192, c3); + + coeff += 8; + } +} + +void aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first. */ + aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); + /* Top right. */ + aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64); + /* Bottom left. */ + aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128); + /* Bottom right. */ + aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); + + for (int i = 0; i < 64; i += 8) { + const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0); + const int16x8_t a1 = load_tran_low_to_s16q(coeff + 64); + const int16x8_t a2 = load_tran_low_to_s16q(coeff + 128); + const int16x8_t a3 = load_tran_low_to_s16q(coeff + 192); + + const int16x8_t b0 = vhaddq_s16(a0, a1); + const int16x8_t b1 = vhsubq_s16(a0, a1); + const int16x8_t b2 = vhaddq_s16(a2, a3); + const int16x8_t b3 = vhsubq_s16(a2, a3); + + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); + + store_s16q_to_tran_low(coeff + 0, c0); + store_s16q_to_tran_low(coeff + 64, c1); + store_s16q_to_tran_low(coeff + 128, c2); + store_s16q_to_tran_low(coeff + 192, c3); + + coeff += 8; + } +} diff --git a/libs/libaom/src/aom_dsp/arm/intrapred_neon.c b/libs/libaom/src/aom_dsp/arm/intrapred_neon.c new file mode 100644 index 000000000..c85b1e910 --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/intrapred_neon.c @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" + +//------------------------------------------------------------------------------ +// DC 4x4 + +// 'do_above' and 'do_left' facilitate branch removal when inlined.
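+// The DC value is the rounded average of whichever neighbors are available:
+// with both edges, vrshrn_n_u16(sum, 3) computes (sum + 4) >> 3, a rounded
+// divide by the 8 contributing samples (e.g. above = {4,4,4,4} and
+// left = {8,8,8,8} give sum = 48 and dc0 = (48 + 4) >> 3 = 6); with a single
+// edge the shift drops to 2, and with neither edge the predictor falls back
+// to the 8-bit mid-grey 0x80. vld1_u8 reads 8 bytes although only 4
+// neighbors are needed; the surplus lanes are harmless because only lane 0
+// of dc0 is broadcast into the block.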
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x8_t A = vld1_u8(above); // top row + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top + const uint16x4_t p1 = vpadd_u16(p0, p0); + sum_top = vcombine_u16(p1, p1); + } + + if (do_left) { + const uint8x8_t L = vld1_u8(left); // left border + const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left + const uint16x4_t p1 = vpadd_u16(p0, p0); + sum_left = vcombine_u16(p1, p1); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 3); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 2); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 2); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x8_t dc = vdup_lane_u8(dc0, 0); + int i; + for (i = 0; i < 4; ++i) { + vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0); + } + } +} + +void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_4x4(dst, stride, above, left, 1, 1); +} + +void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + dc_4x4(dst, stride, NULL, left, 0, 1); +} + +void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + dc_4x4(dst, stride, above, NULL, 1, 0); +} + +void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + dc_4x4(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. 
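+// Same scheme as dc_4x4, but the 8 samples per edge need one extra
+// vpadd_u16 stage to fold the partial sums, and the rounding shift grows to
+// 4 (a rounded divide by 16) when both edges contribute, or 3 for one edge.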
+static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x8_t A = vld1_u8(above); // top row + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top + const uint16x4_t p1 = vpadd_u16(p0, p0); + const uint16x4_t p2 = vpadd_u16(p1, p1); + sum_top = vcombine_u16(p2, p2); + } + + if (do_left) { + const uint8x8_t L = vld1_u8(left); // left border + const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left + const uint16x4_t p1 = vpadd_u16(p0, p0); + const uint16x4_t p2 = vpadd_u16(p1, p1); + sum_left = vcombine_u16(p2, p2); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 4); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 3); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 3); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x8_t dc = vdup_lane_u8(dc0, 0); + int i; + for (i = 0; i < 8; ++i) { + vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc)); + } + } +} + +void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_8x8(dst, stride, above, left, 1, 1); +} + +void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + dc_8x8(dst, stride, NULL, left, 0, 1); +} + +void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + dc_8x8(dst, stride, above, NULL, 1, 0); +} + +void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + dc_8x8(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. 
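+// For 16-wide edges the row is loaded as a uint8x16_t and reduced with
+// vpaddlq_u8 plus a cross-half vadd_u16 before the pairwise cascade; the
+// rounding shift is 5 (divide by 32) with both edges, 4 with one.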
+static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x16_t A = vld1q_u8(above); // top row + const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top + const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + const uint16x4_t p2 = vpadd_u16(p1, p1); + const uint16x4_t p3 = vpadd_u16(p2, p2); + sum_top = vcombine_u16(p3, p3); + } + + if (do_left) { + const uint8x16_t L = vld1q_u8(left); // left row + const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left + const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + const uint16x4_t p2 = vpadd_u16(p1, p1); + const uint16x4_t p3 = vpadd_u16(p2, p2); + sum_left = vcombine_u16(p3, p3); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 5); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 4); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 4); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x16_t dc = vdupq_lane_u8(dc0, 0); + int i; + for (i = 0; i < 16; ++i) { + vst1q_u8(dst + i * stride, dc); + } + } +} + +void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_16x16(dst, stride, above, left, 1, 1); +} + +void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + dc_16x16(dst, stride, NULL, left, 0, 1); +} + +void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + dc_16x16(dst, stride, above, NULL, 1, 0); +} + +void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + dc_16x16(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. 
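+// The 32-wide case sums two 16-byte loads per edge (vaddq_u16 of their
+// vpaddlq_u8 results) before the same reduction; the rounding shift reaches
+// 6 (divide by 64) when both edges are present, 5 with a single edge.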
+static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x16_t A0 = vld1q_u8(above); // top row + const uint8x16_t A1 = vld1q_u8(above + 16); + const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top + const uint16x8_t p1 = vpaddlq_u8(A1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + const uint16x4_t p4 = vpadd_u16(p3, p3); + const uint16x4_t p5 = vpadd_u16(p4, p4); + sum_top = vcombine_u16(p5, p5); + } + + if (do_left) { + const uint8x16_t L0 = vld1q_u8(left); // left row + const uint8x16_t L1 = vld1q_u8(left + 16); + const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left + const uint16x8_t p1 = vpaddlq_u8(L1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + const uint16x4_t p4 = vpadd_u16(p3, p3); + const uint16x4_t p5 = vpadd_u16(p4, p4); + sum_left = vcombine_u16(p5, p5); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 6); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 5); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 5); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x16_t dc = vdupq_lane_u8(dc0, 0); + int i; + for (i = 0; i < 32; ++i) { + vst1q_u8(dst + i * stride, dc); + vst1q_u8(dst + i * stride + 16, dc); + } + } +} + +void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_32x32(dst, stride, above, left, 1, 1); +} + +void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + dc_32x32(dst, stride, NULL, left, 0, 1); +} + +void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + dc_32x32(dst, stride, above, NULL, 1, 0); +} + +void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + dc_32x32(dst, stride, NULL, NULL, 0, 0); +} + +// ----------------------------------------------------------------------------- + +void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t XABCD_u8 = vld1_u8(above - 1); + const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); + const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); + const uint32x2_t zero = vdup_n_u32(0); + const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); + const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL); + const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8)); + const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); + const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); + const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); + const uint8_t D = vget_lane_u8(XABCD_u8, 4); + const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); + const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); + const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); + const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); + const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); + const uint32x2_t r3 = vreinterpret_u32_u8(avg2); + const uint32x2_t r2 = 
vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); + const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); + const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); + vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); + vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); + vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); +} + +void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint32x2_t d0u32 = vdup_n_u32(0); + (void)left; + + d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); + for (i = 0; i < 4; i++, dst += stride) + vst1_lane_u32((uint32_t *)dst, d0u32, 0); +} + +void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x8_t d0u8 = vdup_n_u8(0); + (void)left; + + d0u8 = vld1_u8(above); + for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8); +} + +void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + (void)left; + + q0u8 = vld1q_u8(above); + for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8); +} + +void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)left; + + q0u8 = vld1q_u8(above); + q1u8 = vld1q_u8(above + 16); + for (i = 0; i < 32; i++, dst += stride) { + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + } +} + +void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint32x2_t d1u32 = vdup_n_u32(0); + (void)above; + + d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); + + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); +} + +void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint64x1_t d1u64 = vdup_n_u64(0); + (void)above; + + d1u64 = vld1_u64((const uint64_t *)left); + + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); + vst1_u8(dst, d0u8); +} + +void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j; + uint8x8_t d2u8 = vdup_n_u8(0); + 
uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + + q1u8 = vld1q_u8(left); + d2u8 = vget_low_u8(q1u8); + for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + dst += stride; + } +} + +void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint8x8_t d2u8 = vdup_n_u8(0); + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + + for (k = 0; k < 2; k++, left += 16) { + q1u8 = vld1q_u8(left); + d2u8 = vget_low_u8(q1u8); + for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + } + } +} + +static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + const uint16_t *above, + const uint16_t *left) { + assert(bw >= 4); + assert(IS_POWER_OF_TWO(bw)); + int expected_dc, sum = 0; + const int count = bw * 2; + uint32x4_t sum_q = vdupq_n_u32(0); + uint32x2_t sum_d; + uint16_t *dst_1; + if (bw >= 8) { + for (int i = 0; i < bw; i += 8) { + sum_q = vpadalq_u16(sum_q, vld1q_u16(above)); + sum_q = vpadalq_u16(sum_q, vld1q_u16(left)); + above += 8; + left += 8; + } + sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q)); + sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0); + expected_dc = (sum + (count >> 1)) / count; + const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc); + for (int r = 0; r < bw; r++) { + dst_1 = dst; + for (int i = 0; i < bw; i += 8) { + vst1q_u16(dst_1, dc); + dst_1 += 8; + } + dst += stride; + } + } else { // 4x4 + sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left)); + sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q)); + sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0); + expected_dc = (sum + (count >> 1)) / count; + const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc); + for (int r = 0; r < bw; r++) { + vst1_u16(dst, dc); + dst += stride; + } + } +} + +#define intra_pred_highbd_sized_neon(type, width) \ + void aom_highbd_##type##_predictor_##width##x##width##_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)bd; \ + highbd_##type##_predictor(dst, stride, width, above, 
left); \ + } + +#define intra_pred_square(type) \ + intra_pred_highbd_sized_neon(type, 4); \ + intra_pred_highbd_sized_neon(type, 8); \ + intra_pred_highbd_sized_neon(type, 16); \ + intra_pred_highbd_sized_neon(type, 32); \ + intra_pred_highbd_sized_neon(type, 64); + +intra_pred_square(dc); +#undef intra_pred_square diff --git a/libs/libaom/src/aom_dsp/arm/loopfilter_neon.c b/libs/libaom/src/aom_dsp/arm/loopfilter_neon.c new file mode 100644 index 000000000..aafac8966 --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/loopfilter_neon.c @@ -0,0 +1,927 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "av1/common/arm/mem_neon.h" +#include "av1/common/arm/transpose_neon.h" + +static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1, + uint8x8_t p0q0, const uint8_t blimit, + const uint8_t limit) { + // Calculate mask values for four samples + uint32x2x2_t p0q0_p1q1; + uint16x8_t temp_16x8; + uint16x4_t temp0_16x4, temp1_16x4; + uint8x8_t mask_8x8, temp_8x8; + const uint8x8_t limit_8x8 = vdup_n_u8(limit); + const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit); + + mask_8x8 = vabd_u8(p3q3, p2q2); + mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p2q2, p1q1)); + mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0)); + mask_8x8 = vcle_u8(mask_8x8, limit_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); + temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), + vreinterpret_u8_u32(p0q0_p1q1.val[1])); + temp_16x8 = vmovl_u8(temp_8x8); + temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); + temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); + temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); + temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); + temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); + + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + return mask_8x8; +} + +static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0, + const uint8_t blimit, const uint8_t limit) { + uint32x2x2_t p0q0_p1q1; + uint16x8_t temp_16x8; + uint16x4_t temp0_16x4, temp1_16x4; + const uint16x4_t blimit_16x4 = vdup_n_u16(blimit); + const uint8x8_t limit_8x8 = vdup_n_u8(limit); + uint8x8_t mask_8x8, temp_8x8; + + mask_8x8 = vabd_u8(p1q1, p0q0); + mask_8x8 = vcle_u8(mask_8x8, limit_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); + temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), + vreinterpret_u8_u32(p0q0_p1q1.val[1])); + temp_16x8 = vmovl_u8(temp_8x8); + temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); + temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); + temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); + temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); + temp_8x8 =
vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); + + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + return mask_8x8; +} + +static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2, + uint8x8_t p1q1, uint8x8_t p0q0) { + const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1 + uint8x8_t flat_8x8, temp_8x8; + + flat_8x8 = vabd_u8(p1q1, p0q0); + flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0)); + flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p3q3, p0q0)); + flat_8x8 = vcle_u8(flat_8x8, thresh_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8))); + flat_8x8 = vand_u8(flat_8x8, temp_8x8); + + return flat_8x8; +} + +static INLINE uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1, + uint8x8_t p0q0) { + const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1 + uint8x8_t flat_8x8, temp_8x8; + + flat_8x8 = vabd_u8(p1q1, p0q0); + flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0)); + flat_8x8 = vcle_u8(flat_8x8, thresh_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8))); + flat_8x8 = vand_u8(flat_8x8, temp_8x8); + + return flat_8x8; +} + +static INLINE uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1, + uint8x8_t p0q0, const uint8_t blimit, + const uint8_t limit) { + // Calculate mask3 values for four samples + uint32x2x2_t p0q0_p1q1; + uint16x8_t temp_16x8; + uint16x4_t temp0_16x4, temp1_16x4; + uint8x8_t mask_8x8, temp_8x8; + const uint8x8_t limit_8x8 = vdup_n_u8(limit); + const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit); + + mask_8x8 = vabd_u8(p2q2, p1q1); + mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0)); + mask_8x8 = vcle_u8(mask_8x8, limit_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); + temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), + vreinterpret_u8_u32(p0q0_p1q1.val[1])); + temp_16x8 = vmovl_u8(temp_8x8); + temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); + temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); + temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); + temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); + temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); + + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + return mask_8x8; +} + +static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4, + uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, + uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + uint16x8_t out; + uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4, + out_f14_pq5; + uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; + uint8x8_t out_f4_pq0, out_f4_pq1; + uint8x8_t mask_8x8, flat_8x8, flat2_8x8; + uint8x8_t q0p0, q1p1, q2p2; + + // Calculate filter masks + mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); + flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); + flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0); + { + // filter 4 + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + pq_s0 = 
veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + hev_8x8 = vmvn_s8(hev_8x8); + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + } + // reverse p and q + q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); + q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); + q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2))); + { + // filter 8 + uint16x8_t out_pq0, out_pq1, out_pq2; + out = vaddl_u8(*p3q3, *p2q2); + out = vaddw_u8(out, *p1q1); + out = vaddw_u8(out, *p0q0); + + out = vaddw_u8(out, q0p0); + out_pq1 = vaddw_u8(out, *p3q3); + out_pq2 = vaddw_u8(out_pq1, *p3q3); + out_pq2 = vaddw_u8(out_pq2, *p2q2); + out_pq1 = vaddw_u8(out_pq1, *p1q1); + out_pq1 = vaddw_u8(out_pq1, q1p1); + + out_pq0 = vaddw_u8(out, *p0q0); + out_pq0 = vaddw_u8(out_pq0, q1p1); + out_pq0 = vaddw_u8(out_pq0, q2p2); + + out_f7_pq0 = vrshrn_n_u16(out_pq0, 3); + out_f7_pq1 = vrshrn_n_u16(out_pq1, 3); + out_f7_pq2 = vrshrn_n_u16(out_pq2, 3); + } + { + // filter 14 + uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5; + uint16x8_t p6q6_2, p6q6_temp, qp_sum; + uint8x8_t qp_rev; + + out = vaddw_u8(out, *p4q4); + out = vaddw_u8(out, *p5q5); + out = vaddw_u8(out, *p6q6); + + out_pq5 = vaddw_u8(out, *p4q4); + out_pq4 = vaddw_u8(out_pq5, *p3q3); + out_pq3 = vaddw_u8(out_pq4, *p2q2); + + out_pq5 = vaddw_u8(out_pq5, *p5q5); + out_pq4 = vaddw_u8(out_pq4, *p5q5); + + out_pq0 = vaddw_u8(out, *p1q1); + out_pq1 = vaddw_u8(out_pq0, *p2q2); + out_pq2 = vaddw_u8(out_pq1, *p3q3); + + out_pq0 = vaddw_u8(out_pq0, *p0q0); + out_pq1 = vaddw_u8(out_pq1, *p0q0); + + out_pq1 = vaddw_u8(out_pq1, *p6q6); + p6q6_2 = vaddl_u8(*p6q6, *p6q6); + out_pq2 = vaddq_u16(out_pq2, p6q6_2); + p6q6_temp = vaddw_u8(p6q6_2, *p6q6); + out_pq3 = vaddq_u16(out_pq3, p6q6_temp); + p6q6_temp = vaddw_u8(p6q6_temp, *p6q6); + out_pq4 = vaddq_u16(out_pq4, p6q6_temp); + p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2); + out_pq5 = vaddq_u16(out_pq5, p6q6_temp); + + out_pq4 = vaddw_u8(out_pq4, q1p1); + 
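+ // out_pq0..out_pq5 accumulate the tap sums for output rows p0..p5 (and the
+ // mirrored q rows); the vrev64-swapped registers (q0p0, q1p1, q2p2, qp_rev)
+ // feed q-side taps into the p-side lanes of the packed p|q vectors before
+ // vrshrn_n_u16(out_pqN, 4) applies the rounded divide by 16.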
+ qp_sum = vaddl_u8(q2p2, q1p1); + out_pq3 = vaddq_u16(out_pq3, qp_sum); + + qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3))); + qp_sum = vaddw_u8(qp_sum, qp_rev); + out_pq2 = vaddq_u16(out_pq2, qp_sum); + + qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4))); + qp_sum = vaddw_u8(qp_sum, qp_rev); + out_pq1 = vaddq_u16(out_pq1, qp_sum); + + qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5))); + qp_sum = vaddw_u8(qp_sum, qp_rev); + out_pq0 = vaddq_u16(out_pq0, qp_sum); + + out_pq0 = vaddw_u8(out_pq0, q0p0); + + out_f14_pq0 = vrshrn_n_u16(out_pq0, 4); + out_f14_pq1 = vrshrn_n_u16(out_pq1, 4); + out_f14_pq2 = vrshrn_n_u16(out_pq2, 4); + out_f14_pq3 = vrshrn_n_u16(out_pq3, 4); + out_f14_pq4 = vrshrn_n_u16(out_pq4, 4); + out_f14_pq5 = vrshrn_n_u16(out_pq5, 4); + } + { + uint8x8_t filter4_cond, filter8_cond, filter14_cond; + filter8_cond = vand_u8(flat_8x8, mask_8x8); + filter4_cond = vmvn_u8(filter8_cond); + filter14_cond = vand_u8(filter8_cond, flat2_8x8); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter8 outputs + *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); + *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); + *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); + + // filter14 outputs + *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0); + *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1); + *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2); + *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3); + *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4); + *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5); + } +} + +static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, + uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + uint16x8_t out; + uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; + uint8x8_t out_f4_pq0, out_f4_pq1; + uint8x8_t mask_8x8, flat_8x8; + + // Calculate filter masks + mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); + flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); + { + // filter 4 + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + 
filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + hev_8x8 = vmvn_s8(hev_8x8); + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + } + { + // filter 8 + uint16x8_t out_pq0, out_pq1, out_pq2; + uint8x8_t q0p0, q1p1, q2p2; + + out = vaddl_u8(*p3q3, *p2q2); + out = vaddw_u8(out, *p1q1); + out = vaddw_u8(out, *p0q0); + + // reverse p and q + q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); + q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); + q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2))); + + out = vaddw_u8(out, q0p0); + out_pq1 = vaddw_u8(out, *p3q3); + out_pq2 = vaddw_u8(out_pq1, *p3q3); + out_pq2 = vaddw_u8(out_pq2, *p2q2); + out_pq1 = vaddw_u8(out_pq1, *p1q1); + out_pq1 = vaddw_u8(out_pq1, q1p1); + + out_pq0 = vaddw_u8(out, *p0q0); + out_pq0 = vaddw_u8(out_pq0, q1p1); + out_pq0 = vaddw_u8(out_pq0, q2p2); + + out_f7_pq0 = vrshrn_n_u16(out_pq0, 3); + out_f7_pq1 = vrshrn_n_u16(out_pq1, 3); + out_f7_pq2 = vrshrn_n_u16(out_pq2, 3); + } + { + uint8x8_t filter4_cond, filter8_cond; + filter8_cond = vand_u8(flat_8x8, mask_8x8); + filter4_cond = vmvn_u8(filter8_cond); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter8 outputs + *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); + *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); + *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); + } +} + +static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0, + const uint8_t blimit, const uint8_t limit, + const uint8_t thresh) { + uint16x8_t out; + uint8x8_t out_f6_pq0, out_f6_pq1; + uint8x8_t out_f4_pq0, out_f4_pq1; + uint8x8_t mask_8x8, flat_8x8; + + // Calculate filter masks + mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit); + flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0); + { + // filter 4 + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = 
vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vbic_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + } + { + // filter 6 + uint16x8_t out_pq0, out_pq1; + uint8x8_t pq_rev; + + out = vaddl_u8(*p0q0, *p1q1); + out = vaddq_u16(out, out); + out = vaddw_u8(out, *p2q2); + + pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); + out = vaddw_u8(out, pq_rev); + + out_pq0 = vaddw_u8(out, pq_rev); + pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); + out_pq0 = vaddw_u8(out_pq0, pq_rev); + + out_pq1 = vaddw_u8(out, *p2q2); + out_pq1 = vaddw_u8(out_pq1, *p2q2); + + out_f6_pq0 = vrshrn_n_u16(out_pq0, 3); + out_f6_pq1 = vrshrn_n_u16(out_pq1, 3); + } + { + uint8x8_t filter4_cond, filter6_cond; + filter6_cond = vand_u8(flat_8x8, mask_8x8); + filter4_cond = vmvn_u8(filter6_cond); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter6 outputs + *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0); + *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1); + } +} + +static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t mask_8x8, temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + // Calculate filter mask + mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 
= vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vbic_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); +} + +void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x16_t row0, row1, row2, row3; + uint8x8_t pxp3, p6p2, p5p1, p4p0; + uint8x8_t q0q4, q1q5, q2q6, q3qy; + uint32x2x2_t p6q6_p2q2, p5q5_p1q1, p4q4_p0q0, pxqx_p3q3; + uint32x2_t pq_rev; + uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6; + + // row0: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + // row1: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + // row2: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + // row3: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + load_u8_8x16(src - 8, stride, &row0, &row1, &row2, &row3); + + pxp3 = vget_low_u8(row0); + p6p2 = vget_low_u8(row1); + p5p1 = vget_low_u8(row2); + p4p0 = vget_low_u8(row3); + transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0); + + q0q4 = vget_high_u8(row0); + q1q5 = vget_high_u8(row1); + q2q6 = vget_high_u8(row2); + q3qy = vget_high_u8(row3); + transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy)); + pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q1q5)); + p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5p1), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q0q4)); + p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4p0), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q2q6)); + p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6p2), pq_rev); + + p0q0 = vreinterpret_u8_u32(p4q4_p0q0.val[1]); + p1q1 = vreinterpret_u8_u32(p5q5_p1q1.val[1]); + p2q2 = vreinterpret_u8_u32(p6q6_p2q2.val[1]); + p3q3 = vreinterpret_u8_u32(pxqx_p3q3.val[1]); + p4q4 = vreinterpret_u8_u32(p4q4_p0q0.val[0]); + p5q5 = vreinterpret_u8_u32(p5q5_p1q1.val[0]); + p6q6 = vreinterpret_u8_u32(p6q6_p2q2.val[0]); + + lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, + *thresh); + + pxqx_p3q3 = vtrn_u32(pxqx_p3q3.val[0], vreinterpret_u32_u8(p3q3)); + p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5q5), vreinterpret_u32_u8(p1q1)); + p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4q4), vreinterpret_u32_u8(p0q0)); + p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6q6), vreinterpret_u32_u8(p2q2)); + + pxqx_p3q3.val[1] = vrev64_u32(pxqx_p3q3.val[1]); + p5q5_p1q1.val[1] = vrev64_u32(p5q5_p1q1.val[1]); + p4q4_p0q0.val[1] = vrev64_u32(p4q4_p0q0.val[1]); + p6q6_p2q2.val[1] = vrev64_u32(p6q6_p2q2.val[1]); + + q0q4 = vreinterpret_u8_u32(p4q4_p0q0.val[1]); + q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]); + q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]); + q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]); + transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy); + + pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]); + p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]); + p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]); + 
p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]); + transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0); + + row0 = vcombine_u8(pxp3, q0q4); + row1 = vcombine_u8(p6p2, q1q5); + row2 = vcombine_u8(p5p1, q2q6); + row3 = vcombine_u8(p4p0, q3qy); + + store_u8_8x16(src - 8, stride, row0, row1, row2, row3); +} + +void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint32x2x2_t p2q2_p1q1, p3q3_p0q0; + uint32x2_t pq_rev; + uint8x8_t p3q0, p2q1, p1q2, p0q3; + uint8x8_t p0q0, p1q1, p2q2, p3q3; + + // row0: p3 p2 p1 p0 | q0 q1 q2 q3 + // row1: p3 p2 p1 p0 | q0 q1 q2 q3 + // row2: p3 p2 p1 p0 | q0 q1 q2 q3 + // row3: p3 p2 p1 p0 | q0 q1 q2 q3 + load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3); + + transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3)); + p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev); + + p0q0 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1])); + p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + p3q3 = vreinterpret_u8_u32(p3q3_p0q0.val[0]); + + lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0)); + p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q3), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev); + + p0q3 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1])); + p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]); + transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3); + + store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3); +} + +void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint32x2x2_t p2q2_p1q1, pxqy_p0q0; + uint32x2_t pq_rev; + uint8x8_t pxq0, p2q1, p1q2, p0qy; + uint8x8_t p0q0, p1q1, p2q2, pxqy; + + // row0: px p2 p1 p0 | q0 q1 q2 qy + // row1: px p2 p1 p0 | q0 q1 q2 qy + // row2: px p2 p1 p0 | q0 q1 q2 qy + // row3: px p2 p1 p0 | q0 q1 q2 qy + load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy); + + transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy)); + pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev); + + p0q0 = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1])); + p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + pxqy = vreinterpret_u8_u32(pxqy_p0q0.val[0]); + + lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0)); + pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxqy), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev); + + p0qy = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1])); + p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]); + transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy); + + store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy); +} + +void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, 
const uint8_t *thresh) { + uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0; + uint32x2_t pq_rev; + uint8x8_t UNINITIALIZED_IS_SAFE(p1p0), UNINITIALIZED_IS_SAFE(q0q1); + uint8x8_t p0q0, p1q1; + + // row0: p1 p0 | q0 q1 + // row1: p1 p0 | q0 q1 + // row2: p1 p0 | q0 q1 + // row3: p1 p0 | q0 q1 + load_unaligned_u8_4x4(src - 2, stride, (uint32x2_t *)&p1p0, + (uint32x2_t *)&q0q1); + + transpose_u8_4x4(&p1p0, &q0q1); + + p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1)); + + pq_rev = vrev64_u32(p1q0_p0q1.val[1]); + p1q1_p0q0 = vtrn_u32(p1q0_p0q1.val[0], pq_rev); + + p1q1 = vreinterpret_u8_u32(p1q1_p0q0.val[0]); + p0q0 = vreinterpret_u8_u32(p1q1_p0q0.val[1]); + + lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh); + + p1p0_q1q0 = vtrn_u32(vreinterpret_u32_u8(p1q1), vreinterpret_u32_u8(p0q0)); + + p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]); + q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1])); + + transpose_u8_4x4(&p1p0, &q0q1); + + store_unaligned_u8_4x1(src - 2, p1p0, 0); + store_unaligned_u8_4x1((src - 2) + 1 * stride, q0q1, 0); + store_unaligned_u8_4x1((src - 2) + 2 * stride, p1p0, 1); + store_unaligned_u8_4x1((src - 2) + 3 * stride, q0q1, 1); +} + +void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, UNINITIALIZED_IS_SAFE(p6q6); + + load_u8_4x1(src - 7 * stride, &p6q6, 0); + load_u8_4x1(src - 6 * stride, &p5q5, 0); + load_u8_4x1(src - 5 * stride, &p4q4, 0); + load_u8_4x1(src - 4 * stride, &p3q3, 0); + load_u8_4x1(src - 3 * stride, &p2q2, 0); + load_u8_4x1(src - 2 * stride, &p1q1, 0); + load_u8_4x1(src - 1 * stride, &p0q0, 0); + load_u8_4x1(src + 0 * stride, &p0q0, 1); + load_u8_4x1(src + 1 * stride, &p1q1, 1); + load_u8_4x1(src + 2 * stride, &p2q2, 1); + load_u8_4x1(src + 3 * stride, &p3q3, 1); + load_u8_4x1(src + 4 * stride, &p4q4, 1); + load_u8_4x1(src + 5 * stride, &p5q5, 1); + load_u8_4x1(src + 6 * stride, &p6q6, 1); + + lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, + *thresh); + + store_u8_4x1(src - 6 * stride, p5q5, 0); + store_u8_4x1(src - 5 * stride, p4q4, 0); + store_u8_4x1(src - 4 * stride, p3q3, 0); + store_u8_4x1(src - 3 * stride, p2q2, 0); + store_u8_4x1(src - 2 * stride, p1q1, 0); + store_u8_4x1(src - 1 * stride, p0q0, 0); + store_u8_4x1(src + 0 * stride, p0q0, 1); + store_u8_4x1(src + 1 * stride, p1q1, 1); + store_u8_4x1(src + 2 * stride, p2q2, 1); + store_u8_4x1(src + 3 * stride, p3q3, 1); + store_u8_4x1(src + 4 * stride, p4q4, 1); + store_u8_4x1(src + 5 * stride, p5q5, 1); +} + +void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, p1q1, p2q2, p3q3; + + p3q3 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 4 * stride))); + p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride))); + p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride), + vreinterpret_u32_u8(p0q0), 1)); + p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride), + vreinterpret_u32_u8(p1q1), 1)); + p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride), + vreinterpret_u32_u8(p2q2), 1)); + p3q3 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 3 * stride), + vreinterpret_u32_u8(p3q3), 1)); + + lpf_8_neon(&p3q3, 
&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + vst1_lane_u32((uint32_t *)(src - 4 * stride), vreinterpret_u32_u8(p3q3), 0); + vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0); + vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0); + vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0); + vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1); + vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1); + vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1); + vst1_lane_u32((uint32_t *)(src + 3 * stride), vreinterpret_u32_u8(p3q3), 1); +} + +void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, p1q1, p2q2; + + p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride))); + p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride), + vreinterpret_u32_u8(p0q0), 1)); + p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride), + vreinterpret_u32_u8(p1q1), 1)); + p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride), + vreinterpret_u32_u8(p2q2), 1)); + + lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0); + vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0); + vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0); + vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1); + vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1); + vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1); +} + +void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, UNINITIALIZED_IS_SAFE(p1q1); + + load_u8_4x1(src - 2 * stride, &p1q1, 0); + load_u8_4x1(src - 1 * stride, &p0q0, 0); + load_u8_4x1(src + 0 * stride, &p0q0, 1); + load_u8_4x1(src + 1 * stride, &p1q1, 1); + + lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh); + + store_u8_4x1(src - 2 * stride, p1q1, 0); + store_u8_4x1(src - 1 * stride, p0q0, 0); + store_u8_4x1(src + 0 * stride, p0q0, 1); + store_u8_4x1(src + 1 * stride, p1q1, 1); +} diff --git a/libs/libaom/src/aom_dsp/arm/sad4d_neon.c b/libs/libaom/src/aom_dsp/arm/sad4d_neon.c new file mode 100644 index 000000000..606950ab2 --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/sad4d_neon.c @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" + +static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16, +// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo +// and vec_sum_ref_hi. +static void sad_neon_64(const uint8x16_t vec_src_00, + const uint8x16_t vec_src_16, + const uint8x16_t vec_src_32, + const uint8x16_t vec_src_48, const uint8_t *ref, + uint16x8_t *vec_sum_ref_lo, + uint16x8_t *vec_sum_ref_hi) { + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); + const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); + + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32), + vget_low_u8(vec_ref_32)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32), + vget_high_u8(vec_ref_32)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48), + vget_low_u8(vec_ref_48)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48), + vget_high_u8(vec_ref_48)); +} + +// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16, +// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
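+// For reference, each vabal_u8() call in these helpers is a widening +// absolute-difference accumulation; an illustrative scalar equivalent (a +// sketch, not part of the build) is: +//   for (int k = 0; k < 8; ++k) sum[k] += abs(src[k] - ref[k]);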
+static void sad_neon_32(const uint8x16_t vec_src_00, + const uint8x16_t vec_src_16, const uint8_t *ref, + uint16x8_t *vec_sum_ref_lo, + uint16x8_t *vec_sum_ref_hi) { + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); +} + +void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 64; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_src_32 = vld1q_u8(src + 32); + const uint8x16_t vec_src_48 = vld1q_u8(src + 48); + + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0, + &vec_sum_ref0_lo, &vec_sum_ref0_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1, + &vec_sum_ref1_lo, &vec_sum_ref1_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2, + &vec_sum_ref2_lo, &vec_sum_ref2_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3, + &vec_sum_ref3_lo, &vec_sum_ref3_hi); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} + +void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 32; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + + sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo, + &vec_sum_ref0_hi); + sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo, + &vec_sum_ref1_hi); + sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo, + &vec_sum_ref2_hi); + sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo, + &vec_sum_ref3_hi); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 
+= ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} + +void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 16; ++i) { + const uint8x16_t vec_src = vld1q_u8(src); + const uint8x16_t vec_ref0 = vld1q_u8(ref0); + const uint8x16_t vec_ref1 = vld1q_u8(ref1); + const uint8x16_t vec_ref2 = vld1q_u8(ref2); + const uint8x16_t vec_ref3 = vld1q_u8(ref3); + + vec_sum_ref0_lo = + vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0)); + vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref0)); + vec_sum_ref1_lo = + vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1)); + vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref1)); + vec_sum_ref2_lo = + vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2)); + vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref2)); + vec_sum_ref3_lo = + vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3)); + vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref3)); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} diff --git a/libs/libaom/src/aom_dsp/arm/sad_neon.c b/libs/libaom/src/aom_dsp/arm/sad_neon.c new file mode 100644 index 000000000..a39de91d6 --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/sad_neon.c @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 15; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x2_t d1; + uint64x1_t d3; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 3; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + d1 = vpaddl_u16(vget_low_u16(q12)); + d3 = vpaddl_u32(d1); + + return vget_lane_u32(vreinterpret_u32_u64(d3), 0); +} + +unsigned int aom_sad16x8_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x16_t q0, q4; + uint16x8_t q12, q13; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); + + for (i = 0; i < 7; i++) { + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); + } + + q12 = vaddq_u16(q12, q13); + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} +static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { + const uint32x4_t a = vpaddlq_u16(vec_16x8); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + for (i = 0; i < 64; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_src_32 = vld1q_u8(src + 32); + const uint8x16_t vec_src_48 = vld1q_u8(src
+ 48); + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); + const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32), + vget_low_u8(vec_ref_32)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), + vget_high_u8(vec_ref_32)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), + vget_low_u8(vec_ref_48)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), + vget_high_u8(vec_ref_48)); + } + return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); +} + +unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + + for (i = 0; i < 32; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + } + return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); +} + +unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + + for (i = 0; i < 16; ++i) { + const uint8x16_t vec_src = vld1q_u8(src); + const uint8x16_t vec_ref = vld1q_u8(ref); + src += src_stride; + ref += ref_stride; + vec_accum_lo = + vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref)); + vec_accum_hi = + vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref)); + } + return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); +} + +unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum = vdupq_n_u16(0); + + for (i = 0; i < 8; ++i) { + const uint8x8_t vec_src = vld1_u8(src); + const uint8x8_t vec_ref = vld1_u8(ref); + src += src_stride; + ref += ref_stride; + vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); + } + return horizontal_add_16x8(vec_accum); +} diff --git a/libs/libaom/src/aom_dsp/arm/sse_neon.c b/libs/libaom/src/aom_dsp/arm/sse_neon.c new file mode 100644 index 000000000..06b81cc3d --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/sse_neon.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" + +static INLINE uint32_t sse_W16x1_neon(uint8x16_t q2, uint8x16_t q3) { + const uint16_t sse1 = 0; + const uint16x8_t q1 = vld1q_dup_u16(&sse1); + + uint32_t sse; + + uint8x16_t q4 = vabdq_u8(q2, q3); // diff = abs(a[x] - b[x]) + uint8x8_t d0 = vget_low_u8(q4); + uint8x8_t d1 = vget_high_u8(q4); + + uint16x8_t q6 = vmlal_u8(q1, d0, d0); + uint16x8_t q7 = vmlal_u8(q1, d1, d1); + + uint32x4_t q8 = vaddl_u16(vget_low_u16(q6), vget_high_u16(q6)); + uint32x4_t q9 = vaddl_u16(vget_low_u16(q7), vget_high_u16(q7)); + + uint32x2_t d4 = vadd_u32(vget_low_u32(q8), vget_high_u32(q8)); + uint32x2_t d5 = vadd_u32(vget_low_u32(q9), vget_high_u32(q9)); + + uint32x2_t d6 = vadd_u32(d4, d5); + + sse = vget_lane_u32(d6, 0); + sse += vget_lane_u32(d6, 1); + + return sse; +} + +int64_t aom_sse_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + const uint8x16_t q0 = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + }; + int addinc, x, y; + uint8x8_t d0, d1, d2, d3; + uint8_t dx; + uint8x16_t q2, q3, q4, q5; + uint32_t sse = 0; + uint8x8x2_t tmp, tmp2; + + switch (width) { + case 4: + for (y = 0; y < height; y += 4) { + d0 = vld1_u8(a); // load 4 data + a += a_stride; + d1 = vld1_u8(a); + a += a_stride; + d2 = vld1_u8(a); + a += a_stride; + d3 = vld1_u8(a); + a += a_stride; + tmp = vzip_u8(d0, d1); + tmp2 = vzip_u8(d2, d3); + q2 = vcombine_u8(tmp.val[0], tmp2.val[0]); // make a 16 data vector + + d0 = vld1_u8(b); + b += b_stride; + d1 = vld1_u8(b); + b += b_stride; + d2 = vld1_u8(b); + b += b_stride; + d3 = vld1_u8(b); + b += b_stride; + tmp = vzip_u8(d0, d1); + tmp2 = vzip_u8(d2, d3); + q3 = vcombine_u8(tmp.val[0], tmp2.val[0]); + + sse += sse_W16x1_neon(q2, q3); + } + break; + case 8: + for (y = 0; y < height; y += 2) { + d0 = vld1_u8(a); // load 8 data + d1 = vld1_u8(a + a_stride); + q2 = vcombine_u8(d0, d1); // make a 16 data vector + + d0 = vld1_u8(b); + d1 = vld1_u8(b + b_stride); + q3 = vcombine_u8(d0, d1); + + sse += sse_W16x1_neon(q2, q3); + + a += 2 * a_stride; + b += 2 * b_stride; + } + break; + case 16: + for (y = 0; y < height; y++) { + q2 = vld1q_u8(a); + q3 = vld1q_u8(b); + + sse += sse_W16x1_neon(q2, q3); + + a += a_stride; + b += b_stride; + } + break; + case 32: + for (y = 0; y < height; y++) { + q2 = vld1q_u8(a); + q3 = vld1q_u8(b); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 16); + q3 = vld1q_u8(b + 16); + + sse += sse_W16x1_neon(q2, q3); + + a += a_stride; + b += b_stride; + } + break; + case 64: + for (y = 0; y < height; y++) { + q2 = vld1q_u8(a); + q3 = vld1q_u8(b); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 16); + q3 = vld1q_u8(b + 16); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 32); + q3 = vld1q_u8(b + 32); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 48); + q3 = vld1q_u8(b + 48); + + sse += sse_W16x1_neon(q2, q3); + + a += a_stride; + b += b_stride; + } + break; + case 128: + for (y = 0; y < height; y++) { + q2 = vld1q_u8(a); + q3 = vld1q_u8(b); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 16); + q3 = vld1q_u8(b + 16); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 32); + q3 = vld1q_u8(b + 32); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 48); +
q3 = vld1q_u8(b + 48); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 64); + q3 = vld1q_u8(b + 64); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 80); + q3 = vld1q_u8(b + 80); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 96); + q3 = vld1q_u8(b + 96); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 112); + q3 = vld1q_u8(b + 112); + + sse += sse_W16x1_neon(q2, q3); + + a += a_stride; + b += b_stride; + } + break; + default: + for (y = 0; y < height; y++) { + x = width; + while (x > 0) { + addinc = width - x; + q2 = vld1q_u8(a + addinc); + q3 = vld1q_u8(b + addinc); + if (x < 16) { + dx = x; + q4 = vld1q_dup_u8(&dx); + q5 = vcltq_u8(q0, q4); + q2 = vandq_u8(q2, q5); + q3 = vandq_u8(q3, q5); + } + sse += sse_W16x1_neon(q2, q3); + x -= 16; + } + a += a_stride; + b += b_stride; + } + } + return (int64_t)sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE uint32_t highbd_sse_W8x1_neon(uint16x8_t q2, uint16x8_t q3) { + uint32_t sse; + const uint32_t sse1 = 0; + const uint32x4_t q1 = vld1q_dup_u32(&sse1); + + uint16x8_t q4 = vabdq_u16(q2, q3); // diff = abs(a[x] - b[x]) + uint16x4_t d0 = vget_low_u16(q4); + uint16x4_t d1 = vget_high_u16(q4); + + uint32x4_t q6 = vmlal_u16(q1, d0, d0); + uint32x4_t q7 = vmlal_u16(q1, d1, d1); + + uint32x2_t d4 = vadd_u32(vget_low_u32(q6), vget_high_u32(q6)); + uint32x2_t d5 = vadd_u32(vget_low_u32(q7), vget_high_u32(q7)); + + uint32x2_t d6 = vadd_u32(d4, d5); + + sse = vget_lane_u32(d6, 0); + sse += vget_lane_u32(d6, 1); + + return sse; +} + +int64_t aom_highbd_sse_neon(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + const uint16x8_t q0 = { 0, 1, 2, 3, 4, 5, 6, 7 }; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + int x, y; + int addinc; + uint16x4_t d0, d1, d2, d3; + uint16_t dx; + uint16x8_t q2, q3, q4, q5; + + switch (width) { + case 4: + for (y = 0; y < height; y += 2) { + d0 = vld1_u16(a); // load 4 data + a += a_stride; + d1 = vld1_u16(a); + a += a_stride; + + d2 = vld1_u16(b); + b += b_stride; + d3 = vld1_u16(b); + b += b_stride; + q2 = vcombine_u16(d0, d1); // make a 8 data vector + q3 = vcombine_u16(d2, d3); + + sse += highbd_sse_W8x1_neon(q2, q3); + } + break; + case 8: + for (y = 0; y < height; y++) { + q2 = vld1q_u16(a); + q3 = vld1q_u16(b); + + sse += highbd_sse_W8x1_neon(q2, q3); + + a += a_stride; + b += b_stride; + } + break; + case 16: + for (y = 0; y < height; y++) { + q2 = vld1q_u16(a); + q3 = vld1q_u16(b); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 8); + q3 = vld1q_u16(b + 8); + + sse += highbd_sse_W8x1_neon(q2, q3); + + a += a_stride; + b += b_stride; + } + break; + case 32: + for (y = 0; y < height; y++) { + q2 = vld1q_u16(a); + q3 = vld1q_u16(b); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 8); + q3 = vld1q_u16(b + 8); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 16); + q3 = vld1q_u16(b + 16); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 24); + q3 = vld1q_u16(b + 24); + + sse += highbd_sse_W8x1_neon(q2, q3); + + a += a_stride; + b += b_stride; + } + break; + case 64: + for (y = 0; y < height; y++) { + q2 = vld1q_u16(a); + q3 = vld1q_u16(b); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 8); + q3 = vld1q_u16(b + 8); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 16); + q3 = vld1q_u16(b + 16); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 24); + q3 = vld1q_u16(b + 24); + + sse 
+= highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 32); + q3 = vld1q_u16(b + 32); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 40); + q3 = vld1q_u16(b + 40); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 48); + q3 = vld1q_u16(b + 48); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 56); + q3 = vld1q_u16(b + 56); + + sse += highbd_sse_W8x1_neon(q2, q3); + + a += a_stride; + b += b_stride; + } + break; + case 128: + for (y = 0; y < height; y++) { + q2 = vld1q_u16(a); + q3 = vld1q_u16(b); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 8); + q3 = vld1q_u16(b + 8); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 16); + q3 = vld1q_u16(b + 16); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 24); + q3 = vld1q_u16(b + 24); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 32); + q3 = vld1q_u16(b + 32); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 40); + q3 = vld1q_u16(b + 40); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 48); + q3 = vld1q_u16(b + 48); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 56); + q3 = vld1q_u16(b + 56); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 64); + q3 = vld1q_u16(b + 64); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 72); + q3 = vld1q_u16(b + 72); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 80); + q3 = vld1q_u16(b + 80); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 88); + q3 = vld1q_u16(b + 88); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 96); + q3 = vld1q_u16(b + 96); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 104); + q3 = vld1q_u16(b + 104); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 112); + q3 = vld1q_u16(b + 112); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 120); + q3 = vld1q_u16(b + 120); + + sse += highbd_sse_W8x1_neon(q2, q3); + a += a_stride; + b += b_stride; + } + break; + default: + + for (y = 0; y < height; y++) { + x = width; + while (x > 0) { + addinc = width - x; + q2 = vld1q_u16(a + addinc); + q3 = vld1q_u16(b + addinc); + if (x < 8) { + dx = x; + q4 = vld1q_dup_u16(&dx); + q5 = vcltq_u16(q0, q4); + q2 = vandq_u16(q2, q5); + q3 = vandq_u16(q3, q5); + } + sse += highbd_sse_W8x1_neon(q2, q3); + x -= 8; + } + a += a_stride; + b += b_stride; + } + } + return (int64_t)sse; +} +#endif diff --git a/libs/libaom/src/aom_dsp/arm/subpel_variance_neon.c b/libs/libaom/src/aom_dsp/arm/subpel_variance_neon.c new file mode 100644 index 000000000..cf618eee7 --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/subpel_variance_neon.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/variance.h" + +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + const uint8x8_t f0 = vmov_n_u8(filter[0]); + const uint8x8_t f1 = vmov_n_u8(filter[1]); + unsigned int i; + for (i = 0; i < output_height; ++i) { + const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); + const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); + const uint16x8_t a = vmull_u8(src_0, f0); + const uint16x8_t b = vmlal_u8(a, src_1, f1); + const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); + vst1_u8(&output_ptr[0], out); + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + const uint8x8_t f0 = vmov_n_u8(filter[0]); + const uint8x8_t f1 = vmov_n_u8(filter[1]); + unsigned int i, j; + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 16) { + const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); + const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); + const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); + const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); + const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); + const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); + const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); + const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); + vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi)); + } + // Next row...
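+ // For reference, each output pixel above is the two-tap bilinear value + // (src_0 * filter[0] + src_1 * filter[1] + (1 << (FILTER_BITS - 1))) >> + // FILTER_BITS: vmull_u8/vmlal_u8 form the products, and the rounding + // narrowing shift vrshrn_n_u16 applies the rounding and scaling.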
+ src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); + DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); + + var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8, + bilinear_filters_2t[xoffset]); + var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8, + bilinear_filters_2t[yoffset]); + return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse); +} + +unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src, + int src_stride, int xoffset, + int yoffset, const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); + DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16, + bilinear_filters_2t[xoffset]); + var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16, + bilinear_filters_2t[yoffset]); + return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse); +} + +unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src, + int src_stride, int xoffset, + int yoffset, const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); + DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32, + bilinear_filters_2t[xoffset]); + var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32, + bilinear_filters_2t[yoffset]); + return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse); +} + +unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src, + int src_stride, int xoffset, + int yoffset, const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); + DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64, + bilinear_filters_2t[xoffset]); + var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64, + bilinear_filters_2t[yoffset]); + return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse); +} diff --git a/libs/libaom/src/aom_dsp/arm/subtract_neon.c b/libs/libaom/src/aom_dsp/arm/subtract_neon.c new file mode 100644 index 000000000..28f5ace8e --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/subtract_neon.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +void aom_subtract_block_neon(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + int r, c; + + if (cols > 16) { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; c += 32) { + const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); + const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); + const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); + const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); + const uint16x8_t v_diff_lo_00 = + vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00)); + const uint16x8_t v_diff_hi_00 = + vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00)); + const uint16x8_t v_diff_lo_16 = + vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16)); + const uint16x8_t v_diff_hi_16 = + vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); + } + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else if (cols > 8) { + for (r = 0; r < rows; ++r) { + const uint8x16_t v_src = vld1q_u8(&src[0]); + const uint8x16_t v_pred = vld1q_u8(&pred[0]); + const uint16x8_t v_diff_lo = + vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); + const uint16x8_t v_diff_hi = + vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else if (cols > 4) { + for (r = 0; r < rows; ++r) { + const uint8x8_t v_src = vld1_u8(&src[0]); + const uint8x8_t v_pred = vld1_u8(&pred[0]); + const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c]; + + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } +} diff --git a/libs/libaom/src/aom_dsp/arm/sum_neon.h b/libs/libaom/src/aom_dsp/arm/sum_neon.h new file mode 100644 index 000000000..809e51ce1 --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/sum_neon.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE uint32x2_t horizontal_add_u16x8(const uint16x8_t a) { + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), + vreinterpret_u32_u64(vget_high_u64(c))); +} diff --git a/libs/libaom/src/aom_dsp/arm/variance_neon.c b/libs/libaom/src/aom_dsp/arm/variance_neon.c new file mode 100644 index 000000000..d4107ce0d --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/variance_neon.c @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +// w * h must be less than 2048 or local variable v_sum may overflow.
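+// The variance functions below call this helper and then apply +// variance = sse - sum * sum / (w * h); since w * h is a power of two the +// division is a right shift, e.g. for 8x8 (w * h = 64 = 2^6): +//   return *sse - ((sum * sum) >> 6);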
+static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, uint32_t *sse, + int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = + vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff)); + v_sse_hi = + vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +void aom_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, unsigned int *sse, int *sum) { + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum); +} + +void aom_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, unsigned int *sse, int *sum) { + variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); +} + +unsigned int aom_variance8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); + return *sse - ((sum * sum) >> 6); +} + +unsigned int aom_variance16x16_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); + return *sse - (((unsigned int)((int64_t)sum * sum)) >> 8); +} + +unsigned int aom_variance32x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); +} + +unsigned int aom_variance32x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); + variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride, + 32, 32, &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); +} + +unsigned int aom_variance64x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, + 64, 16, &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); +} + +unsigned int aom_variance64x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + + variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, + 64, 16, &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), + b_stride, 64, 16, &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), + b_stride, 64, 16, 
&sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); +} + +unsigned int aom_variance128x128_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + sum1 = sse1 = 0; + for (int i = 0; i < 16; i++) { + variance_neon_w8(a + (8 * i * a_stride), a_stride, b + (8 * i * b_stride), + b_stride, 128, 8, &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + } + + *sse = sse1; + + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 14); +} + +unsigned int aom_variance16x8_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 4; i++) { + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int aom_variance8x16_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, unsigned int *sse) { + int i; + uint8x8_t d0u8, d2u8, d4u8, d6u8; + int16x4_t d22s16, d23s16, d24s16, d25s16; + uint32x2_t d0u32, d10u32; + int64x1_t 
d0s64, d1s64; + uint16x8_t q11u16, q12u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d2u8, d6u8); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int aom_mse16x16_neon(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + int64x1_t d0s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + q7s32 = vdupq_n_s32(0); + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { // mse16x16_neon_loop + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q7s32 = vmlal_s16(q7s32, d22s16, d22s16); + q8s32 = vmlal_s16(q8s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q7s32 = vmlal_s16(q7s32, d26s16, d26s16); + q8s32 = vmlal_s16(q8s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); 
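+ // The four 32-bit squared-error accumulators have been folded pairwise; + // the add and pairwise-long reductions below collapse them into the single + // 32-bit total that is both stored to *sse and returned.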
+ q10s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q10s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} + +unsigned int aom_get4x4sse_cs_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride) { + int16x4_t d22s16, d24s16, d26s16, d28s16; + int64x1_t d0s64; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d1u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d5u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d3u8 = vld1_u8(src_ptr); + d7u8 = vld1_u8(ref_ptr); + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d1u8, d5u8); + q13u16 = vsubl_u8(d2u8, d6u8); + q14u16 = vsubl_u8(d3u8, d7u8); + + d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); + d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); + d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); + d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); + + q7s32 = vmull_s16(d22s16, d22s16); + q8s32 = vmull_s16(d24s16, d24s16); + q9s32 = vmull_s16(d26s16, d26s16); + q10s32 = vmull_s16(d28s16, d28s16); + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q9s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q9s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} diff --git a/libs/libaom/src/aom_dsp/avg.c b/libs/libaom/src/aom_dsp/avg.c new file mode 100644 index 000000000..7386296fd --- /dev/null +++ b/libs/libaom/src/aom_dsp/avg.c @@ -0,0 +1,486 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdlib.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom_ports/mem.h" + +void aom_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, + int *min, int *max) { + int i, j; + *min = 255; + *max = 0; + for (i = 0; i < 8; ++i, s += p, d += dp) { + for (j = 0; j < 8; ++j) { + int diff = abs(s[j] - d[j]); + *min = diff < *min ? diff : *min; + *max = diff > *max ?
diff : *max; + } + } +} + +unsigned int aom_avg_4x4_c(const uint8_t *s, int p) { + int i, j; + int sum = 0; + for (i = 0; i < 4; ++i, s += p) + for (j = 0; j < 4; sum += s[j], ++j) { + } + + return (sum + 8) >> 4; +} + +unsigned int aom_avg_8x8_c(const uint8_t *s, int p) { + int i, j; + int sum = 0; + for (i = 0; i < 8; ++i, s += p) + for (j = 0; j < 8; sum += s[j], ++j) { + } + + return (sum + 32) >> 6; +} + +#if CONFIG_AV1_HIGHBITDEPTH +unsigned int aom_highbd_avg_8x8_c(const uint8_t *s8, int p) { + int i, j; + int sum = 0; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + for (i = 0; i < 8; ++i, s += p) + for (j = 0; j < 8; sum += s[j], ++j) { + } + + return (sum + 32) >> 6; +} + +unsigned int aom_highbd_avg_4x4_c(const uint8_t *s8, int p) { + int i, j; + int sum = 0; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + for (i = 0; i < 4; ++i, s += p) + for (j = 0; j < 4; sum += s[j], ++j) { + } + + return (sum + 8) >> 4; +} + +void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, + int dp, int *min, int *max) { + int i, j; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + const uint16_t *d = CONVERT_TO_SHORTPTR(d8); + *min = 255; + *max = 0; + for (i = 0; i < 8; ++i, s += p, d += dp) { + for (j = 0; j < 8; ++j) { + int diff = abs(s[j] - d[j]); + *min = diff < *min ? diff : *min; + *max = diff > *max ? diff : *max; + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// src_diff: first pass, 9 bit, dynamic range [-255, 255] +// second pass, 12 bit, dynamic range [-2040, 2040] +static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int16_t c0 = b0 + b2; + int16_t c1 = b1 + b3; + int16_t c2 = b0 - b2; + int16_t c3 = b1 - b3; + int16_t c4 = b4 + b6; + int16_t c5 = b5 + b7; + int16_t c6 = b4 - b6; + int16_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// The order of the output coeff of the hadamard is not important. For +// optimization purposes the final transpose may be skipped. 
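+// For reference, the 8x8 transform below is separable: hadamard_col8() is +// applied once along columns and once along rows, and each 1-D pass can grow +// magnitudes by up to 8x, taking 9-bit inputs to the 15-bit output range +// [-16320, 16320] noted in the per-stage comments.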
+void aom_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + int16_t buffer[64]; + int16_t buffer2[64]; + int16_t *tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit + // dynamic range [-255, 255] + tmp_buf += 8; + ++src_diff; + } + + tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit + // dynamic range [-2040, 2040] + // buffer2: 15 bit + // dynamic range [-16320, 16320] + ++tmp_buf; + } + + for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx]; +} + +void aom_hadamard_lp_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + int16_t buffer[64]; + int16_t buffer2[64]; + int16_t *tmp_buf = &buffer[0]; + for (int idx = 0; idx < 8; ++idx) { + hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit + // dynamic range [-255, 255] + tmp_buf += 8; + ++src_diff; + } + + tmp_buf = &buffer[0]; + for (int idx = 0; idx < 8; ++idx) { + hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit + // dynamic range [-2040, 2040] + // buffer2: 15 bit + // dynamic range [-16320, 16320] + ++tmp_buf; + } + + for (int idx = 0; idx < 64; ++idx) coeff[idx] = buffer2[idx]; +} + +// In place 16x16 2D Hadamard transform +void aom_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); + } + + // coeff: 15 bit, dynamic range [-16320, 16320] + for (idx = 0; idx < 64; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[64]; + tran_low_t a2 = coeff[128]; + tran_low_t a3 = coeff[192]; + + tran_low_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range + tran_low_t b2 = (a2 + a3) >> 1; // [-16320, 16320] + tran_low_t b3 = (a2 - a3) >> 1; + + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; + + ++coeff; + } +} + +void aom_hadamard_lp_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + for (int idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_hadamard_lp_8x8_c(src_ptr, src_stride, coeff + idx * 64); + } + + for (int idx = 0; idx < 64; ++idx) { + int16_t a0 = coeff[0]; + int16_t a1 = coeff[64]; + int16_t a2 = coeff[128]; + int16_t a3 = coeff[192]; + + int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] + int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range + int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320] + int16_t b3 = (a2 - a3) >> 1; + + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; + + ++coeff; + } +} + +void aom_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + aom_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); + } + + // coeff: 15 bit, dynamic range [-16320, 16320] + for (idx = 0; idx < 256; ++idx) { + 
tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[256]; + tran_low_t a2 = coeff[512]; + tran_low_t a3 = coeff[768]; + + tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b1 = (a0 - a1) >> 2; // b0-b3: 15 bit, dynamic range + tran_low_t b2 = (a2 + a3) >> 2; // [-16320, 16320] + tran_low_t b3 = (a2 - a3) >> 2; + + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[256] = b1 + b3; + coeff[512] = b0 - b2; + coeff[768] = b1 - b3; + + ++coeff; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void hadamard_highbd_col8_first_pass(const int16_t *src_diff, + ptrdiff_t src_stride, + int16_t *coeff) { + int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int16_t c0 = b0 + b2; + int16_t c1 = b1 + b3; + int16_t c2 = b0 - b2; + int16_t c3 = b1 - b3; + int16_t c4 = b4 + b6; + int16_t c5 = b5 + b7; + int16_t c6 = b4 - b6; + int16_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// src_diff: 16 bit, dynamic range [-32760, 32760] +// coeff: 19 bit +static void hadamard_highbd_col8_second_pass(const int16_t *src_diff, + ptrdiff_t src_stride, + int32_t *coeff) { + int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int32_t c0 = b0 + b2; + int32_t c1 = b1 + b3; + int32_t c2 = b0 - b2; + int32_t c3 = b1 - b3; + int32_t c4 = b4 + b6; + int32_t c5 = b5 + b7; + int32_t c6 = b4 - b6; + int32_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// The order of the output coeff of the hadamard is not important. For +// optimization purposes the final transpose may be skipped. 
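+//
+// Illustrative bound check for the high-bitdepth path (a sketch, not part of
+// the upstream file): 12-bit input gives residuals in [-4095, 4095], so the
+// first pass peaks at 8 * 4095 = 32760, which still fits int16_t, while the
+// second pass peaks at 8 * 32760 = 262080, which does not; that is why
+// hadamard_highbd_col8_second_pass() widens its output to int32_t.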
+void aom_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + int16_t buffer[64]; + int32_t buffer2[64]; + int16_t *tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + // src_diff: 13 bit + // buffer: 16 bit, dynamic range [-32760, 32760] + hadamard_highbd_col8_first_pass(src_diff, src_stride, tmp_buf); + tmp_buf += 8; + ++src_diff; + } + + tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + // buffer: 16 bit + // buffer2: 19 bit, dynamic range [-262080, 262080] + hadamard_highbd_col8_second_pass(tmp_buf, 8, buffer2 + 8 * idx); + ++tmp_buf; + } + + for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx]; +} + +// In place 16x16 2D Hadamard transform +void aom_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 13 bit, dynamic range [-4095, 4095] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); + } + + // coeff: 19 bit, dynamic range [-262080, 262080] + for (idx = 0; idx < 64; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[64]; + tran_low_t a2 = coeff[128]; + tran_low_t a3 = coeff[192]; + + tran_low_t b0 = (a0 + a1) >> 1; + tran_low_t b1 = (a0 - a1) >> 1; + tran_low_t b2 = (a2 + a3) >> 1; + tran_low_t b3 = (a2 - a3) >> 1; + + // new coeff dynamic range: 20 bit + coeff[0] = b0 + b2; + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; + + ++coeff; + } +} + +void aom_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 13 bit, dynamic range [-4095, 4095] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + aom_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); + } + + // coeff: 20 bit + for (idx = 0; idx < 256; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[256]; + tran_low_t a2 = coeff[512]; + tran_low_t a3 = coeff[768]; + + tran_low_t b0 = (a0 + a1) >> 2; + tran_low_t b1 = (a0 - a1) >> 2; + tran_low_t b2 = (a2 + a3) >> 2; + tran_low_t b3 = (a2 - a3) >> 2; + + // new coeff dynamic range: 20 bit + coeff[0] = b0 + b2; + coeff[256] = b1 + b3; + coeff[512] = b0 - b2; + coeff[768] = b1 - b3; + + ++coeff; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// coeff: 16 bits, dynamic range [-32640, 32640]. +// length: value range {16, 64, 256, 1024}. +int aom_satd_c(const tran_low_t *coeff, int length) { + int i; + int satd = 0; + for (i = 0; i < length; ++i) satd += abs(coeff[i]); + + // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] + return satd; +} + +int aom_satd_lp_c(const int16_t *coeff, int length) { + int satd = 0; + for (int i = 0; i < length; ++i) satd += abs(coeff[i]); + + // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] + return satd; +} + +// Integer projection onto row vectors. +// height: value range {16, 32, 64, 128}. +void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, + const int ref_stride, const int height) { + int idx; + const int norm_factor = height >> 1; + for (idx = 0; idx < 16; ++idx) { + int i; + hbuf[idx] = 0; + // hbuf[idx]: 14 bit, dynamic range [0, 32640]. + for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride]; + // hbuf[idx]: 9 bit, dynamic range [0, 1020]. + hbuf[idx] /= norm_factor; + ++ref; + } +} + +// width: value range {16, 32, 64, 128}. 
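+// aom_int_pro_col_c() sums `width` consecutive pixels from a single row.
+// Worst case (illustrative): width 128 of all-255 pixels gives
+// 128 * 255 = 32640, which still fits the int16_t return type. A hedged
+// usage note: together with aom_int_pro_row_c() above, these projections can
+// be compared with aom_vector_var_c() below as a cheap 1-D proxy for a full
+// 2-D SAD in coarse motion estimation.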
+int16_t aom_int_pro_col_c(const uint8_t *ref, const int width) { + int idx; + int16_t sum = 0; + // sum: 14 bit, dynamic range [0, 32640] + for (idx = 0; idx < width; ++idx) sum += ref[idx]; + return sum; +} + +// ref: [0 - 510] +// src: [0 - 510] +// bwl: {2, 3, 4, 5} +int aom_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) { + int i; + int width = 4 << bwl; + int sse = 0, mean = 0, var; + + for (i = 0; i < width; ++i) { + int diff = ref[i] - src[i]; // diff: dynamic range [-510, 510], 10 bits. + mean += diff; // mean: dynamic range 16 bits. + sse += diff * diff; // sse: dynamic range 26 bits. + } + + // (mean * mean): dynamic range 31 bits. + var = sse - ((mean * mean) >> (bwl + 2)); + return var; +} diff --git a/libs/libaom/src/aom_dsp/binary_codes_reader.c b/libs/libaom/src/aom_dsp/binary_codes_reader.c new file mode 100644 index 000000000..7cd903d82 --- /dev/null +++ b/libs/libaom/src/aom_dsp/binary_codes_reader.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/binary_codes_reader.h" +#include "aom_dsp/recenter.h" +#include "av1/common/common.h" + +uint16_t aom_read_primitive_quniform_(aom_reader *r, + uint16_t n ACCT_STR_PARAM) { + if (n <= 1) return 0; + const int l = get_msb(n) + 1; + const int m = (1 << l) - n; + const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME); + return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME); +} + +// Decode finite subexponential code that for a symbol v in [0, n-1] with +// parameter k +uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n, + uint16_t k ACCT_STR_PARAM) { + int i = 0; + int mk = 0; + + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + + if (n <= mk + 3 * a) { + return aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk; + } + + if (!aom_read_bit(r, ACCT_STR_NAME)) { + return aom_read_literal(r, b, ACCT_STR_NAME) + mk; + } + + i = i + 1; + mk += a; + } + + assert(0); + return 0; +} + +uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, + uint16_t ref ACCT_STR_PARAM) { + return inv_recenter_finite_nonneg( + n, ref, aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME)); +} diff --git a/libs/libaom/src/aom_dsp/binary_codes_reader.h b/libs/libaom/src/aom_dsp/binary_codes_reader.h new file mode 100644 index 000000000..d218f0619 --- /dev/null +++ b/libs/libaom/src/aom_dsp/binary_codes_reader.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AOM_DSP_BINARY_CODES_READER_H_
+#define AOM_AOM_DSP_BINARY_CODES_READER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitreader_buffer.h"
+
+#define aom_read_primitive_quniform(r, n, ACCT_STR_NAME) \
+  aom_read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME) \
+  aom_read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \
+  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
+
+uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM);
+uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
+                                       uint16_t k ACCT_STR_PARAM);
+uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
+                                          uint16_t ref ACCT_STR_PARAM);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_BINARY_CODES_READER_H_
diff --git a/libs/libaom/src/aom_dsp/binary_codes_writer.c b/libs/libaom/src/aom_dsp/binary_codes_writer.c
new file mode 100644
index 000000000..adf1c1304
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/binary_codes_writer.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/recenter.h"
+#include "aom_ports/bitops.h"
+#include "av1/common/common.h"
+
+// Codes a symbol v in [-2^mag_bits, 2^mag_bits].
+// mag_bits is number of bits for magnitude. The alphabet is of size
+// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
+// indicate 0 or non-zero, mag_bits bits are used to indicate magnitude
+// and 1 more bit for the sign if non-zero.
+void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
+                                   unsigned int abs_bits) {
+  if (v == 0) {
+    aom_write_bit(w, 0);
+  } else {
+    const int x = abs(v);
+    const int s = v < 0;
+    aom_write_bit(w, 1);
+    aom_write_bit(w, s);
+    aom_write_literal(w, x - 1, abs_bits);
+  }
+}
+
+int aom_count_primitive_symmetric(int16_t v, unsigned int abs_bits) {
+  return (v == 0 ? 1 : abs_bits + 2);
+}
+
+// Encodes a value v in [0, n-1] quasi-uniformly
+void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) {
+  if (n <= 1) return;
+  const int l = get_msb(n) + 1;
+  const int m = (1 << l) - n;
+  if (v < m) {
+    aom_write_literal(w, v, l - 1);
+  } else {
+    aom_write_literal(w, m + ((v - m) >> 1), l - 1);
+    aom_write_bit(w, (v - m) & 1);
+  }
+}
+
+int aom_count_primitive_quniform(uint16_t n, uint16_t v) {
+  if (n <= 1) return 0;
+  const int l = get_msb(n) + 1;
+  const int m = (1 << l) - n;
+  return v < m ? l - 1 : l;
+}
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
+                                   uint16_t v) {
+  int i = 0;
+  int mk = 0;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (n <= mk + 3 * a) {
+      aom_write_primitive_quniform(w, n - mk, v - mk);
+      break;
+    } else {
+      int t = (v >= mk + a);
+      aom_write_bit(w, t);
+      if (t) {
+        i = i + 1;
+        mk += a;
+      } else {
+        aom_write_literal(w, v - mk, b);
+        break;
+      }
+    }
+  }
+}
+
+int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) {
+  int count = 0;
+  int i = 0;
+  int mk = 0;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (n <= mk + 3 * a) {
+      count += aom_count_primitive_quniform(n - mk, v - mk);
+      break;
+    } else {
+      int t = (v >= mk + a);
+      count++;
+      if (t) {
+        i = i + 1;
+        mk += a;
+      } else {
+        count += b;
+        break;
+      }
+    }
+  }
+  return count;
+}
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+// based on a reference ref also in [0, n-1].
+// Recenters symbol around r first and then uses a finite subexponential code.
+void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
+                                      uint16_t ref, uint16_t v) {
+  aom_write_primitive_subexpfin(w, n, k, recenter_finite_nonneg(n, ref, v));
+}
+
+void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
+                                             uint16_t k, int16_t ref,
+                                             int16_t v) {
+  ref += n - 1;
+  v += n - 1;
+  const uint16_t scaled_n = (n << 1) - 1;
+  aom_write_primitive_refsubexpfin(w, scaled_n, k, ref, v);
+}
+
+int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
+                                     uint16_t v) {
+  return aom_count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v));
+}
+
+int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
+                                            int16_t v) {
+  ref += n - 1;
+  v += n - 1;
+  const uint16_t scaled_n = (n << 1) - 1;
+  return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v);
+}
diff --git a/libs/libaom/src/aom_dsp/binary_codes_writer.h b/libs/libaom/src/aom_dsp/binary_codes_writer.h
new file mode 100644
index 000000000..5ec866213
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/binary_codes_writer.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BINARY_CODES_WRITER_H_
+#define AOM_AOM_DSP_BINARY_CODES_WRITER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/bitwriter_buffer.h"
+
+// Codes a symbol v in [-2^mag_bits, 2^mag_bits]
+// mag_bits is number of bits for magnitude. The alphabet is of size
+// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
+// indicate 0 or non-zero, mag_bits bits are used to indicate magnitude
+// and 1 more bit for the sign if non-zero.
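+//
+// Worked example (illustrative, not from the upstream sources): with
+// mag_bits = 3 and v = -5 the writer emits "1" (non-zero), then "1"
+// (negative), then |v| - 1 = 4 as a 3-bit literal, i.e. mag_bits + 2 = 5 bits
+// in total, matching aom_count_primitive_symmetric(); v = 0 costs a single
+// "0" bit.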
+void aom_write_primitive_symmetric(aom_writer *w, int16_t v, + unsigned int mag_bits); + +// Encodes a value v in [0, n-1] quasi-uniformly +void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v); + +// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k +void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k, + uint16_t v); + +// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k +// based on a reference ref also in [0, n-1]. +void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k, + uint16_t ref, uint16_t v); + +// Finite subexponential code that codes a symbol v in [-(n-1), n-1] with +// parameter k based on a reference ref also in [-(n-1), n-1]. +void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n, + uint16_t k, int16_t ref, + int16_t v); + +// Functions that counts bits for the above primitives +int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits); +int aom_count_primitive_quniform(uint16_t n, uint16_t v); +int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v); +int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, + uint16_t v); +int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, + int16_t v); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_BINARY_CODES_WRITER_H_ diff --git a/libs/libaom/src/aom_dsp/bitreader.c b/libs/libaom/src/aom_dsp/bitreader.c new file mode 100644 index 000000000..4c70a9171 --- /dev/null +++ b/libs/libaom/src/aom_dsp/bitreader.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/bitreader.h" + +int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size) { + if (size && !buffer) { + return 1; + } + r->buffer_end = buffer + size; + r->buffer = buffer; + od_ec_dec_init(&r->ec, buffer, (uint32_t)size); +#if CONFIG_ACCOUNTING + r->accounting = NULL; +#endif + return 0; +} + +const uint8_t *aom_reader_find_begin(aom_reader *r) { return r->buffer; } + +const uint8_t *aom_reader_find_end(aom_reader *r) { return r->buffer_end; } + +uint32_t aom_reader_tell(const aom_reader *r) { return od_ec_dec_tell(&r->ec); } + +uint32_t aom_reader_tell_frac(const aom_reader *r) { + return od_ec_dec_tell_frac(&r->ec); +} + +int aom_reader_has_overflowed(const aom_reader *r) { + const uint32_t tell_bits = aom_reader_tell(r); + const uint32_t tell_bytes = (tell_bits + 7) >> 3; + return ((ptrdiff_t)tell_bytes > r->buffer_end - r->buffer); +} diff --git a/libs/libaom/src/aom_dsp/bitreader.h b/libs/libaom/src/aom_dsp/bitreader.h new file mode 100644 index 000000000..a8b3f55ef --- /dev/null +++ b/libs/libaom/src/aom_dsp/bitreader.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITREADER_H_
+#define AOM_AOM_DSP_BITREADER_H_
+
+#include <assert.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aomdx.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/entdec.h"
+#include "aom_dsp/prob.h"
+#include "av1/common/odintrin.h"
+
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#define ACCT_STR_NAME acct_str
+#define ACCT_STR_PARAM , const char *ACCT_STR_NAME
+#define ACCT_STR_ARG(s) , s
+#else
+#define ACCT_STR_PARAM
+#define ACCT_STR_ARG(s)
+#endif
+
+#define aom_read(r, prob, ACCT_STR_NAME) \
+  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_bit(r, ACCT_STR_NAME) \
+  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \
+  aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_literal(r, bits, ACCT_STR_NAME) \
+  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME) \
+  aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
+  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct aom_reader {
+  const uint8_t *buffer;
+  const uint8_t *buffer_end;
+  od_ec_dec ec;
+#if CONFIG_ACCOUNTING
+  Accounting *accounting;
+#endif
+  uint8_t allow_update_cdf;
+};
+
+typedef struct aom_reader aom_reader;
+
+int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size);
+
+const uint8_t *aom_reader_find_begin(aom_reader *r);
+
+const uint8_t *aom_reader_find_end(aom_reader *r);
+
+// Returns true if the bit reader has tried to decode more data from the buffer
+// than was actually provided.
+int aom_reader_has_overflowed(const aom_reader *r);
+
+// Returns the position in the bit reader in bits.
+uint32_t aom_reader_tell(const aom_reader *r);
+
+// Returns the position in the bit reader in 1/8th bits.
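+//
+// Accounting note (illustrative): the value comes from od_ec_dec_tell_frac(),
+// so aom_reader_tell_frac(r) / 8.0 is the number of bits the arithmetic
+// decoder has consumed, to sub-bit precision; the CONFIG_ACCOUNTING hooks
+// below charge the delta between successive calls to each decoded symbol.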
+uint32_t aom_reader_tell_frac(const aom_reader *r); + +#if CONFIG_ACCOUNTING +static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) { + if (r->accounting != NULL) { + uint32_t tell_frac; + tell_frac = aom_reader_tell_frac(r); + aom_accounting_record(r->accounting, ACCT_STR_NAME, + tell_frac - r->accounting->last_tell_frac); + r->accounting->last_tell_frac = tell_frac; + } +} + +static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) { + if (r->accounting != NULL) { + r->accounting->syms.num_multi_syms += !is_binary; + r->accounting->syms.num_binary_syms += !!is_binary; + } +} +#endif + +static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) { + int p = (0x7FFFFF - (prob << 15) + prob) >> 8; + int bit = od_ec_decode_bool_q15(&r->ec, p); + +#if CONFIG_BITSTREAM_DEBUG + { + int i; + int ref_bit, ref_nsymbs; + aom_cdf_prob ref_cdf[16]; + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = aom_bitstream_queue_get_frame_read(); + bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs); + if (ref_nsymbs != 2) { + fprintf(stderr, + "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs " + "%d queue_r %d\n", + frame_idx, 2, ref_nsymbs, queue_r); + assert(0); + } + if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) || + (ref_cdf[1] != 32767)) { + fprintf(stderr, + "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d", + frame_idx, p, 32767, ref_cdf[0]); + for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]); + fprintf(stderr, "} queue_r %d\n", queue_r); + assert(0); + } + if (bit != ref_bit) { + fprintf(stderr, + "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d " + "queue_r %d\n", + frame_idx, bit, ref_bit, queue_r); + assert(0); + } + } +#endif + +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); + aom_update_symb_counts(r, 1); +#endif + return bit; +} + +static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) { + int ret; + ret = aom_read(r, 128, NULL); // aom_prob_half +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); +#endif + return ret; +} + +static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) { + int literal = 0, bit; + + for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit; +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); +#endif + return literal; +} + +static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf, + int nsymbs ACCT_STR_PARAM) { + int symb; + assert(cdf != NULL); + symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs); + +#if CONFIG_BITSTREAM_DEBUG + { + int i; + int cdf_error = 0; + int ref_symb, ref_nsymbs; + aom_cdf_prob ref_cdf[16]; + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = aom_bitstream_queue_get_frame_read(); + bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs); + if (nsymbs != ref_nsymbs) { + fprintf(stderr, + "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d " + "queue_r %d\n", + frame_idx, nsymbs, ref_nsymbs, queue_r); + cdf_error = 0; + assert(0); + } else { + for (i = 0; i < nsymbs; ++i) + if (cdf[i] != ref_cdf[i]) cdf_error = 1; + } + if (cdf_error) { + fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx, + cdf[0]); + for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]); + fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]); + for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]); + 
fprintf(stderr, "} queue_r %d\n", queue_r);
+      assert(0);
+    }
+    if (symb != ref_symb) {
+      fprintf(
+          stderr,
+          "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
+          frame_idx, symb, ref_symb, queue_r);
+      assert(0);
+    }
+  }
+#endif
+
+#if CONFIG_ACCOUNTING
+  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+  aom_update_symb_counts(r, (nsymbs == 2));
+#endif
+  return symb;
+}
+
+static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
+                                   int nsymbs ACCT_STR_PARAM) {
+  int ret;
+  ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
+  if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs);
+  return ret;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_BITREADER_H_
diff --git a/libs/libaom/src/aom_dsp/bitreader_buffer.c b/libs/libaom/src/aom_dsp/bitreader_buffer.c
new file mode 100644
index 000000000..d79feea6a
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/bitreader_buffer.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/recenter.h"
+#include "aom_ports/bitops.h"
+
+size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb) {
+  return (rb->bit_offset + 7) >> 3;
+}
+
+int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
+  const uint32_t off = rb->bit_offset;
+  const uint32_t p = off >> 3;
+  const int q = 7 - (int)(off & 0x7);
+  if (rb->bit_buffer + p < rb->bit_buffer_end) {
+    const int bit = (rb->bit_buffer[p] >> q) & 1;
+    rb->bit_offset = off + 1;
+    return bit;
+  } else {
+    if (rb->error_handler) rb->error_handler(rb->error_handler_data);
+    return 0;
+  }
+}
+
+int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
+  assert(bits <= 31);
+  int value = 0, bit;
+  for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit;
+  return value;
+}
+
+uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb,
+                                      int bits) {
+  assert(bits <= 32);
+  uint32_t value = 0;
+  int bit;
+  for (bit = bits - 1; bit >= 0; bit--)
+    value |= (uint32_t)aom_rb_read_bit(rb) << bit;
+  return value;
+}
+
+int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
+  const int nbits = sizeof(unsigned) * 8 - bits - 1;
+  const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits;
+  return ((int)value) >> nbits;
+}
+
+uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) {
+  int leading_zeros = 0;
+  while (leading_zeros < 32 && !aom_rb_read_bit(rb)) ++leading_zeros;
+  // Maximum 32 bits.
+  if (leading_zeros == 32) return UINT32_MAX;
+  const uint32_t base = (1u << leading_zeros) - 1;
+  const uint32_t value = aom_rb_read_literal(rb, leading_zeros);
+  return base + value;
+}
+
+static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb,
+                                               uint16_t n) {
+  if (n <= 1) return 0;
+  const int l = get_msb(n) + 1;
+  const int m = (1 << l) - n;
+  const int v = aom_rb_read_literal(rb, l - 1);
+  return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb);
+}
+
+static uint16_t aom_rb_read_primitive_subexpfin(struct aom_read_bit_buffer *rb,
+                                                uint16_t n, uint16_t k) {
+  int i = 0;
+  int mk = 0;
+
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+
+    if (n <= mk + 3 * a) {
+      return aom_rb_read_primitive_quniform(rb, n - mk) + mk;
+    }
+
+    if (!aom_rb_read_bit(rb)) {
+      return aom_rb_read_literal(rb, b) + mk;
+    }
+
+    i = i + 1;
+    mk += a;
+  }
+
+  assert(0);
+  return 0;
+}
+
+static uint16_t aom_rb_read_primitive_refsubexpfin(
+    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, uint16_t ref) {
+  return inv_recenter_finite_nonneg(n, ref,
+                                    aom_rb_read_primitive_subexpfin(rb, n, k));
+}
+
+int16_t aom_rb_read_signed_primitive_refsubexpfin(
+    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) {
+  ref += n - 1;
+  const uint16_t scaled_n = (n << 1) - 1;
+  return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1;
+}
diff --git a/libs/libaom/src/aom_dsp/bitreader_buffer.h b/libs/libaom/src/aom_dsp/bitreader_buffer.h
new file mode 100644
index 000000000..359fbe519
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/bitreader_buffer.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITREADER_BUFFER_H_
+#define AOM_AOM_DSP_BITREADER_BUFFER_H_
+
+#include <limits.h>
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*aom_rb_error_handler)(void *data);
+
+struct aom_read_bit_buffer {
+  const uint8_t *bit_buffer;
+  const uint8_t *bit_buffer_end;
+  uint32_t bit_offset;
+
+  void *error_handler_data;
+  aom_rb_error_handler error_handler;
+};
+
+size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb);
+
+int aom_rb_read_bit(struct aom_read_bit_buffer *rb);
+
+int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits);
+
+uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits);
+
+int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);
+
+uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb);
+
+int16_t aom_rb_read_signed_primitive_refsubexpfin(
+    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_BITREADER_BUFFER_H_
diff --git a/libs/libaom/src/aom_dsp/bitwriter.c b/libs/libaom/src/aom_dsp/bitwriter.c
new file mode 100644
index 000000000..41fcc5175
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/bitwriter.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include "aom_dsp/bitwriter.h"
+
+void aom_start_encode(aom_writer *w, uint8_t *source) {
+  w->buffer = source;
+  w->pos = 0;
+  od_ec_enc_init(&w->ec, 62025);
+}
+
+int aom_stop_encode(aom_writer *w) {
+  int nb_bits;
+  uint32_t bytes;
+  unsigned char *data;
+  data = od_ec_enc_done(&w->ec, &bytes);
+  nb_bits = od_ec_enc_tell(&w->ec);
+  memcpy(w->buffer, data, bytes);
+  w->pos = bytes;
+  od_ec_enc_clear(&w->ec);
+  return nb_bits;
+}
diff --git a/libs/libaom/src/aom_dsp/bitwriter.h b/libs/libaom/src/aom_dsp/bitwriter.h
new file mode 100644
index 000000000..4e77a1794
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/bitwriter.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITWRITER_H_
+#define AOM_AOM_DSP_BITWRITER_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/entenc.h"
+#include "aom_dsp/prob.h"
+
+#if CONFIG_RD_DEBUG
+#include "av1/common/blockd.h"
+#include "av1/encoder/cost.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct aom_writer {
+  unsigned int pos;
+  uint8_t *buffer;
+  od_ec_enc ec;
+  uint8_t allow_update_cdf;
+};
+
+typedef struct aom_writer aom_writer;
+
+typedef struct TOKEN_STATS {
+  int cost;
+#if CONFIG_RD_DEBUG
+  int txb_coeff_cost_map[TXB_COEFF_COST_MAP_SIZE][TXB_COEFF_COST_MAP_SIZE];
+#endif
+} TOKEN_STATS;
+
+static INLINE void init_token_stats(TOKEN_STATS *token_stats) {
+#if CONFIG_RD_DEBUG
+  int r, c;
+  for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) {
+    for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
+      token_stats->txb_coeff_cost_map[r][c] = 0;
+    }
+  }
+#endif
+  token_stats->cost = 0;
+}
+
+void aom_start_encode(aom_writer *w, uint8_t *buffer);
+
+int aom_stop_encode(aom_writer *w);
+
+static INLINE void aom_write(aom_writer *w, int bit, int probability) {
+  int p = (0x7FFFFF - (probability << 15) + probability) >> 8;
+#if CONFIG_BITSTREAM_DEBUG
+  aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 };
+  /*int queue_r = 0;
+  int frame_idx_r = 0;
+  int queue_w = bitstream_queue_get_write();
+  int frame_idx_w = aom_bitstream_queue_get_frame_write();
+  if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
+    fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+            frame_idx_w, queue_w);
+  }*/
+  bitstream_queue_push(bit, cdf, 2);
+#endif
+
+  od_ec_encode_bool_q15(&w->ec, bit, p);
+}
+
+static INLINE void aom_write_bit(aom_writer *w, int bit) {
+  aom_write(w, bit, 128);  // aom_prob_half
+}
+
+static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
+  int bit;
+
+  for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit));
+}
+
+static INLINE void aom_write_cdf(aom_writer *w, int symb,
+                                 const aom_cdf_prob *cdf, int nsymbs) {
+#if CONFIG_BITSTREAM_DEBUG
+  /*int queue_r = 0;
+  int frame_idx_r = 0;
+  int queue_w = bitstream_queue_get_write();
+  int frame_idx_w = aom_bitstream_queue_get_frame_write();
+  if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
+    fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+            frame_idx_w, queue_w);
+  }*/
+  bitstream_queue_push(symb, cdf, nsymbs);
+#endif
+
+  od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs);
+}
+
+static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
+                                    int nsymbs) {
+  aom_write_cdf(w, symb, cdf, nsymbs);
+  if (w->allow_update_cdf) update_cdf(cdf, symb, nsymbs);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_BITWRITER_H_
diff --git a/libs/libaom/src/aom_dsp/bitwriter_buffer.c b/libs/libaom/src/aom_dsp/bitwriter_buffer.c
new file mode 100644
index 000000000..7d0ab9486
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/bitwriter_buffer.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/bitwriter_buffer.h"
+#include "aom_dsp/recenter.h"
+#include "aom_ports/bitops.h"
+
+int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb) {
+  return (wb->bit_offset % CHAR_BIT == 0);
+}
+
+uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) {
+  return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
+}
+
+void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) {
+  const int off = (int)wb->bit_offset;
+  const int p = off / CHAR_BIT;
+  const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+  if (q == CHAR_BIT - 1) {
+    // Zero next char and write bit
+    wb->bit_buffer[p] = bit << q;
+  } else {
+    wb->bit_buffer[p] &= ~(1 << q);
+    wb->bit_buffer[p] |= bit << q;
+  }
+  wb->bit_offset = off + 1;
+}
+
+void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit) {
+  // Do not zero bytes but overwrite existing values
+  const int off = (int)wb->bit_offset;
+  const int p = off / CHAR_BIT;
+  const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+  wb->bit_buffer[p] &= ~(1 << q);
+  wb->bit_buffer[p] |= bit << q;
+  wb->bit_offset = off + 1;
+}
+
+void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
+  assert(bits <= 31);
+  int bit;
+  for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
+}
+
+void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
+                                   uint32_t data, int bits) {
+  assert(bits <= 32);
+  int bit;
+  for (bit = bits - 1; bit >= 0; bit--)
+    aom_wb_write_bit(wb, (data >> bit) & 1);
+}
+
+void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
+                              int bits) {
+  int bit;
+  for (bit = bits - 1; bit >= 0; bit--)
+    aom_wb_overwrite_bit(wb, (data >> bit) & 1);
+}
+
+void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
+                                     int bits) {
+  aom_wb_write_literal(wb, data, bits + 1);
+}
+
+void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) {
+  int64_t shift_val = ++v;
+  int leading_zeroes = 1;
+
+  assert(shift_val > 0);
+
+  while (shift_val >>= 1) leading_zeroes += 2;
+
+  aom_wb_write_literal(wb, 0, leading_zeroes >> 1);
+  aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1);
+}
+
+static void wb_write_primitive_quniform(struct aom_write_bit_buffer *wb,
+                                        uint16_t n, uint16_t v) {
+  if (n <= 1) return;
+  const int l =
get_msb(n) + 1; + const int m = (1 << l) - n; + if (v < m) { + aom_wb_write_literal(wb, v, l - 1); + } else { + aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1); + aom_wb_write_bit(wb, (v - m) & 1); + } +} + +static void wb_write_primitive_subexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, uint16_t v) { + int i = 0; + int mk = 0; + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + if (n <= mk + 3 * a) { + wb_write_primitive_quniform(wb, n - mk, v - mk); + break; + } else { + int t = (v >= mk + a); + aom_wb_write_bit(wb, t); + if (t) { + i = i + 1; + mk += a; + } else { + aom_wb_write_literal(wb, v - mk, b); + break; + } + } + } +} + +static void wb_write_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + uint16_t ref, uint16_t v) { + wb_write_primitive_subexpfin(wb, n, k, recenter_finite_nonneg(n, ref, v)); +} + +void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + int16_t ref, int16_t v) { + ref += n - 1; + v += n - 1; + const uint16_t scaled_n = (n << 1) - 1; + wb_write_primitive_refsubexpfin(wb, scaled_n, k, ref, v); +} diff --git a/libs/libaom/src/aom_dsp/bitwriter_buffer.h b/libs/libaom/src/aom_dsp/bitwriter_buffer.h new file mode 100644 index 000000000..fd10e01bb --- /dev/null +++ b/libs/libaom/src/aom_dsp/bitwriter_buffer.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_BITWRITER_BUFFER_H_ +#define AOM_AOM_DSP_BITWRITER_BUFFER_H_ + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct aom_write_bit_buffer { + uint8_t *bit_buffer; + uint32_t bit_offset; +}; + +int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb); + +uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb); + +void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit); + +void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit); + +void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits); + +void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb, + uint32_t data, int bits); + +void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data, + int bits); + +void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data, + int bits); + +void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v); + +void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + int16_t ref, int16_t v); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_BITWRITER_BUFFER_H_ diff --git a/libs/libaom/src/aom_dsp/blend.h b/libs/libaom/src/aom_dsp/blend.h new file mode 100644 index 000000000..fd87dc181 --- /dev/null +++ b/libs/libaom/src/aom_dsp/blend.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_BLEND_H_ +#define AOM_AOM_DSP_BLEND_H_ + +#include "aom_ports/mem.h" + +// Various blending functions and macros. +// See also the aom_blend_* functions in aom_dsp_rtcd.h + +// Alpha blending with alpha values from the range [0, 64], where 64 +// means use the first input and 0 means use the second input. + +#define AOM_BLEND_A64_ROUND_BITS 6 +#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS) // 64 + +#define AOM_BLEND_A64(a, v0, v1) \ + ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \ + AOM_BLEND_A64_ROUND_BITS) + +// Alpha blending with alpha values from the range [0, 256], where 256 +// means use the first input and 0 means use the second input. +#define AOM_BLEND_A256_ROUND_BITS 8 +#define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS) // 256 + +#define AOM_BLEND_A256(a, v0, v1) \ + ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \ + AOM_BLEND_A256_ROUND_BITS) + +// Blending by averaging. +#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1) + +#define DIFF_FACTOR_LOG2 4 +#define DIFF_FACTOR (1 << DIFF_FACTOR_LOG2) + +#endif // AOM_AOM_DSP_BLEND_H_ diff --git a/libs/libaom/src/aom_dsp/blend_a64_hmask.c b/libs/libaom/src/aom_dsp/blend_a64_hmask.c new file mode 100644 index 000000000..e9e38ef96 --- /dev/null +++ b/libs/libaom/src/aom_dsp/blend_a64_hmask.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
+                           const uint8_t *src0, uint32_t src0_stride,
+                           const uint8_t *src1, uint32_t src1_stride,
+                           const uint8_t *mask, int w, int h) {
+  int i, j;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      dst[i * dst_stride + j] = AOM_BLEND_A64(
+          mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
+    }
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
+                                  const uint8_t *src0_8, uint32_t src0_stride,
+                                  const uint8_t *src1_8, uint32_t src1_stride,
+                                  const uint8_t *mask, int w, int h, int bd) {
+  int i, j;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+  (void)bd;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      dst[i * dst_stride + j] = AOM_BLEND_A64(
+          mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
+    }
+  }
+}
+#endif
diff --git a/libs/libaom/src/aom_dsp/blend_a64_mask.c b/libs/libaom/src/aom_dsp/blend_a64_mask.c
new file mode 100644
index 000000000..32f2dc6d8
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/blend_a64_mask.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// Blending with alpha mask. Mask values come from the range [0, 64],
+// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
+// be the same as dst, or dst can be different from both sources.
+
+// NOTE(david.barker): The input and output of aom_blend_a64_d16_mask_c() are
+// in a higher intermediate precision, and will later be rounded down to pixel
+// precision.
+// Thus, in order to avoid double-rounding, we want to use normal right shifts
+// within this function, not ROUND_POWER_OF_TWO.
+// This works because of the identity:
+// ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y+z)
+//
+// In contrast, the output of the non-d16 functions will not be further rounded,
+// so we *should* use ROUND_POWER_OF_TWO there.
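+//
+// Numeric check of the identity (illustrative): with x = 110, y = 2, z = 3,
+// ROUND_POWER_OF_TWO(110 >> 2, 3) = (27 + 4) >> 3 = 3 and
+// ROUND_POWER_OF_TWO(110, 2 + 3) = (110 + 16) >> 5 = 3 agree, while rounding
+// twice, ROUND_POWER_OF_TWO(ROUND_POWER_OF_TWO(110, 2), 3) = (28 + 4) >> 3
+// = 4, shows the double-rounding bias that the plain shifts avoid.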
+ +void aom_lowbd_blend_a64_d16_mask_c( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + int i, j; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + int32_t res; + const int m = mask[i * mask_stride + j]; + res = ((m * (int32_t)src0[i * src0_stride + j] + + (AOM_BLEND_A64_MAX_ALPHA - m) * + (int32_t)src1[i * src1_stride + j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + dst[i * dst_stride + j] = + clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); + } + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + int32_t res; + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + res = ((m * (int32_t)src0[i * src0_stride + j] + + (AOM_BLEND_A64_MAX_ALPHA - m) * + (int32_t)src1[i * src1_stride + j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + dst[i * dst_stride + j] = + clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); + } + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + int32_t res; + const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], + mask[i * mask_stride + (2 * j + 1)]); + res = ((m * (int32_t)src0[i * src0_stride + j] + + (AOM_BLEND_A64_MAX_ALPHA - m) * + (int32_t)src1[i * src1_stride + j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + dst[i * dst_stride + j] = + clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + int32_t res; + const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], + mask[(2 * i + 1) * mask_stride + j]); + res = ((int32_t)(m * (int32_t)src0[i * src0_stride + j] + + (AOM_BLEND_A64_MAX_ALPHA - m) * + (int32_t)src1[i * src1_stride + j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + dst[i * dst_stride + j] = + clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); + } + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_blend_a64_d16_mask_c( + uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params, const int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, 
src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + // excerpt from clip_pixel_highbd() + // set saturation_value to (1 << bd) - 1 + unsigned int saturation_value; + switch (bd) { + case 8: + default: saturation_value = 255; break; + case 10: saturation_value = 1023; break; + case 12: saturation_value = 4095; break; + } + + if (subw == 0 && subh == 0) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int32_t res; + const int m = mask[j]; + res = ((m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); + } + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else if (subw == 1 && subh == 1) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int32_t res; + const int m = ROUND_POWER_OF_TWO( + mask[2 * j] + mask[mask_stride + 2 * j] + mask[2 * j + 1] + + mask[mask_stride + 2 * j + 1], + 2); + res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS; + res -= round_offset; + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); + } + mask += 2 * mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else if (subw == 1 && subh == 0) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int32_t res; + const int m = AOM_BLEND_AVG(mask[2 * j], mask[2 * j + 1]); + res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS; + res -= round_offset; + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); + } + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int32_t res; + const int m = AOM_BLEND_AVG(mask[j], mask[mask_stride + j]); + res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS; + res -= round_offset; + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); + } + mask += 2 * mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// Blending with alpha mask. Mask values come from the range [0, 64], +// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can +// be the same as dst, or dst can be different from both sources. 
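+//
+// Worked example (illustrative): AOM_BLEND_A64(48, 200, 100) =
+// (48 * 200 + 16 * 100 + 32) >> 6 = 11232 >> 6 = 175, i.e. 48/64 = 3/4 of
+// the way from src1 toward src0; m = 64 selects src0 exactly and m = 0
+// selects src1 exactly.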
+ +void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subw, int subh) { + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = mask[i * mask_stride + j]; + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], + mask[i * mask_stride + (2 * j + 1)]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], + mask[(2 * i + 1) * mask_stride + j]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int w, int h, int subw, int subh, int bd) { + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + (void)bd; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = mask[i * mask_stride + j]; + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], + mask[i * mask_stride + (2 * j + 1)]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[(2 * i) 
* mask_stride + j], + mask[(2 * i + 1) * mask_stride + j]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/blend_a64_vmask.c b/libs/libaom/src/aom_dsp/blend_a64_vmask.c new file mode 100644 index 000000000..c938bb33a --- /dev/null +++ b/libs/libaom/src/aom_dsp/blend_a64_vmask.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "config/aom_dsp_rtcd.h" + +void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + for (i = 0; i < h; ++i) { + const int m = mask[i]; + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int w, int h, int bd) { + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + (void)bd; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + for (i = 0; i < h; ++i) { + const int m = mask[i]; + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } +} +#endif diff --git a/libs/libaom/src/aom_dsp/blk_sse_sum.c b/libs/libaom/src/aom_dsp/blk_sse_sum.c new file mode 100644 index 000000000..d76c3f87b --- /dev/null +++ b/libs/libaom/src/aom_dsp/blk_sse_sum.c @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "config/aom_dsp_rtcd.h" + +void aom_get_blk_sse_sum_c(const int16_t *data, int stride, int bw, int bh, + int *x_sum, int64_t *x2_sum) { + *x_sum = 0; + *x2_sum = 0; + for (int i = 0; i < bh; ++i) { + for (int j = 0; j < bw; ++j) { + const int val = data[j]; + *x_sum += val; + *x2_sum += val * val; + } + data += stride; + } +} diff --git a/libs/libaom/src/aom_dsp/entcode.c b/libs/libaom/src/aom_dsp/entcode.c new file mode 100644 index 000000000..aad96c6fc --- /dev/null +++ b/libs/libaom/src/aom_dsp/entcode.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/entcode.h" + +/*Given the current total integer number of bits used and the current value of + rng, computes the fraction number of bits used to OD_BITRES precision. + This is used by od_ec_enc_tell_frac() and od_ec_dec_tell_frac(). + nbits_total: The number of whole bits currently used, i.e., the value + returned by od_ec_enc_tell() or od_ec_dec_tell(). + rng: The current value of rng from either the encoder or decoder state. + Return: The number of bits scaled by 2**OD_BITRES. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng) { + uint32_t nbits; + int l; + int i; + /*To handle the non-integral number of bits still left in the encoder/decoder + state, we compute the worst-case number of bits of val that must be + encoded to ensure that the value is inside the range for any possible + subsequent bits. + The computation here is independent of val itself (the decoder does not + even track that value), even though the real number of bits used after + od_ec_enc_done() may be 1 smaller if rng is a power of two and the + corresponding trailing bits of val are all zeros. + If we did try to track that special case, then coding a value with a + probability of 1/(1 << n) might sometimes appear to use more than n bits. + This may help explain the surprising result that a newly initialized + encoder or decoder claims to have used 1 bit.*/ + nbits = nbits_total << OD_BITRES; + l = 0; + for (i = OD_BITRES; i-- > 0;) { + int b; + rng = rng * rng >> 15; + b = (int)(rng >> 16); + l = l << 1 | b; + rng >>= b; + } + return nbits - l; +} diff --git a/libs/libaom/src/aom_dsp/entcode.h b/libs/libaom/src/aom_dsp/entcode.h new file mode 100644 index 000000000..751887921 --- /dev/null +++ b/libs/libaom/src/aom_dsp/entcode.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_ENTCODE_H_ +#define AOM_AOM_DSP_ENTCODE_H_ + +#include <limits.h> +#include <stddef.h> +#include "av1/common/odintrin.h" +#include "aom_dsp/prob.h" + +#define EC_PROB_SHIFT 6 +#define EC_MIN_PROB 4 // must be <= (1<<EC_PROB_SHIFT)/16 + +/*OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic + on a larger type, you can speed up the decoder by using it here.*/ +typedef uint32_t od_ec_window; + +#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT) + +/*The resolution of fractional-precision bit usage measurements, i.e., + 3 => 1/8th bits.*/ +#define OD_BITRES (3) + +#define OD_ICDF AOM_ICDF + +/*See entcode.c for further documentation.*/ + +OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total, + uint32_t rng); + +#endif // AOM_AOM_DSP_ENTCODE_H_ diff --git a/libs/libaom/src/aom_dsp/entdec.c b/libs/libaom/src/aom_dsp/entdec.c new file mode 100644 index 000000000..da43e8a39 --- /dev/null +++ b/libs/libaom/src/aom_dsp/entdec.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include "aom_dsp/entdec.h" +#include "aom_dsp/prob.h" + +/*A range decoder. + This is an entropy decoder based upon \cite{Mar79}, which is itself a + rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}. + It is very similar to arithmetic encoding, except that encoding is done with + digits in any base, instead of with bits, and so it is faster when using + larger bases (i.e.: a byte). + The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$ + is the base, longer than the theoretical optimum, but to my knowledge there + is no published justification for this claim. + This only seems true when using near-infinite precision arithmetic so that + the process is carried out with no rounding errors. + + An excellent description of implementation details is available at + http://www.arturocampos.com/ac_range.html + A recent work \cite{MNW98} which proposes several changes to arithmetic + encoding for efficiency actually re-discovers many of the principles + behind range encoding, and presents a good theoretical analysis of them. + + End of stream is handled by writing out the smallest number of bits that + ensures that the stream will be correctly decoded regardless of the value of + any subsequent bits. + od_ec_dec_tell() can be used to determine how many bits were needed to decode + all the symbols thus far; other data can be packed in the remaining bits of + the input buffer. + @PHDTHESIS{Pas76, + author="Richard Clark Pasco", + title="Source coding algorithms for fast data compression", + school="Dept. of Electrical Engineering, Stanford University", + address="Stanford, CA", + month=May, + year=1976, + URL="http://www.richpasco.org/scaffdc.pdf" + } + @INPROCEEDINGS{Mar79, + author="Martin, G.N.N.", + title="Range encoding: an algorithm for removing redundancy from a digitised + message", + booktitle="Video & Data Recording Conference", + year=1979, + address="Southampton", + month=Jul, + URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz" + } + @ARTICLE{MNW98, + author="Alistair Moffat and Radford Neal and Ian H.
Witten", + title="Arithmetic Coding Revisited", + journal="{ACM} Transactions on Information Systems", + year=1998, + volume=16, + number=3, + pages="256--294", + month=Jul, + URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf" + }*/ + +/*This is meant to be a large, positive constant that can still be efficiently + loaded as an immediate (on platforms like ARM, for example). + Even relatively modest values like 100 would work fine.*/ +#define OD_EC_LOTS_OF_BITS (0x4000) + +/*The return value of od_ec_dec_tell does not change across an od_ec_dec_refill + call.*/ +static void od_ec_dec_refill(od_ec_dec *dec) { + int s; + od_ec_window dif; + int16_t cnt; + const unsigned char *bptr; + const unsigned char *end; + dif = dec->dif; + cnt = dec->cnt; + bptr = dec->bptr; + end = dec->end; + s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15); + for (; s >= 0 && bptr < end; s -= 8, bptr++) { + /*Each time a byte is inserted into the window (dif), bptr advances and cnt + is incremented by 8, so the total number of consumed bits (the return + value of od_ec_dec_tell) does not change.*/ + assert(s <= OD_EC_WINDOW_SIZE - 8); + dif ^= (od_ec_window)bptr[0] << s; + cnt += 8; + } + if (bptr >= end) { + /*We've reached the end of the buffer. It is perfectly valid for us to need + to fill the window with additional bits past the end of the buffer (and + this happens in normal operation). These bits should all just be taken + as zero. But we cannot increment bptr past 'end' (this is undefined + behavior), so we start to increment dec->tell_offs. We also don't want + to keep testing bptr against 'end', so we set cnt to OD_EC_LOTS_OF_BITS + and adjust dec->tell_offs so that the total number of unconsumed bits in + the window (dec->cnt - dec->tell_offs) does not change. This effectively + puts lots of zero bits into the window, and means we won't try to refill + it from the buffer for a very long time (at which point we'll put lots + of zero bits into the window again).*/ + dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt; + cnt = OD_EC_LOTS_OF_BITS; + } + dec->dif = dif; + dec->cnt = cnt; + dec->bptr = bptr; +} + +/*Takes updated dif and range values, renormalizes them so that + 32768 <= rng < 65536 (reading more bytes from the stream into dif if + necessary), and stores them back in the decoder context. + dif: The new value of dif. + rng: The new value of the range. + ret: The value to return. + Return: ret. + This allows the compiler to jump to this function via a tail-call.*/ +static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng, + int ret) { + int d; + assert(rng <= 65535U); + /*The number of leading zeros in the 16-bit binary representation of rng.*/ + d = 16 - OD_ILOG_NZ(rng); + /*d bits in dec->dif are consumed.*/ + dec->cnt -= d; + /*This is equivalent to shifting in 1's instead of 0's.*/ + dec->dif = ((dif + 1) << d) - 1; + dec->rng = rng << d; + if (dec->cnt < 0) od_ec_dec_refill(dec); + return ret; +} + +/*Initializes the decoder. + buf: The input buffer to use. + storage: The size in bytes of the input buffer.*/ +void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, + uint32_t storage) { + dec->buf = buf; + dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8); + dec->end = buf + storage; + dec->bptr = buf; + dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1; + dec->rng = 0x8000; + dec->cnt = -15; + od_ec_dec_refill(dec); +} + +/*Decode a single binary value. + f: The probability that the bit is one, scaled by 32768. 
+ Return: The value decoded (0 or 1).*/ +int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) { + od_ec_window dif; + od_ec_window vw; + unsigned r; + unsigned r_new; + unsigned v; + int ret; + assert(0 < f); + assert(f < 32768U); + dif = dec->dif; + r = dec->rng; + assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r); + assert(32768U <= r); + v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)); + v += EC_MIN_PROB; + vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); + ret = 1; + r_new = v; + if (dif >= vw) { + r_new = r - v; + dif -= vw; + ret = 0; + } + return od_ec_dec_normalize(dec, dif, r_new, ret); +} + +/*Decodes a symbol given an inverse cumulative distribution function (CDF) + table in Q15. + icdf: CDF_PROB_TOP minus the CDF, such that symbol s falls in the range + [s > 0 ? (CDF_PROB_TOP - icdf[s - 1]) : 0, CDF_PROB_TOP - icdf[s]). + The values must be monotonically non-increasing, and icdf[nsyms - 1] + must be 0. + nsyms: The number of symbols in the alphabet. + This should be at most 16. + Return: The decoded symbol s.*/ +int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) { + od_ec_window dif; + unsigned r; + unsigned c; + unsigned u; + unsigned v; + int ret; + (void)nsyms; + dif = dec->dif; + r = dec->rng; + const int N = nsyms - 1; + + assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r); + assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP)); + assert(32768U <= r); + assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0); + c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16)); + v = r; + ret = -1; + do { + u = v; + v = ((r >> 8) * (uint32_t)(icdf[++ret] >> EC_PROB_SHIFT) >> + (7 - EC_PROB_SHIFT - CDF_SHIFT)); + v += EC_MIN_PROB * (N - ret); + } while (c < v); + assert(v < u); + assert(u <= r); + r = u - v; + dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); + return od_ec_dec_normalize(dec, dif, r, ret); +} + +/*Returns the number of bits "used" by the decoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Return: The number of bits. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +int od_ec_dec_tell(const od_ec_dec *dec) { + /*There is a window of bits stored in dec->dif. The difference + (dec->bptr - dec->buf) tells us how many bytes have been read into this + window. The difference (dec->cnt - dec->tell_offs) tells us how many of + the bits in that window remain unconsumed.*/ + return (int)((dec->bptr - dec->buf) * 8 - dec->cnt + dec->tell_offs); +} + +/*Returns the number of bits "used" by the decoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Return: The number of bits scaled by 2**OD_BITRES. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) { + return od_ec_tell_frac(od_ec_dec_tell(dec), dec->rng); +} diff --git a/libs/libaom/src/aom_dsp/entdec.h b/libs/libaom/src/aom_dsp/entdec.h new file mode 100644 index 000000000..c74616777 --- /dev/null +++ b/libs/libaom/src/aom_dsp/entdec.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_ENTDEC_H_ +#define AOM_AOM_DSP_ENTDEC_H_ +#include <stddef.h> +#include "aom_dsp/entcode.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct od_ec_dec od_ec_dec; + +#if defined(OD_ACCOUNTING) && OD_ACCOUNTING +#define OD_ACC_STR , char *acc_str +#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb, str) +#else +#define OD_ACC_STR +#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb) +#endif + +/*The entropy decoder context.*/ +struct od_ec_dec { + /*The start of the current input buffer.*/ + const unsigned char *buf; + /*An offset used to keep track of tell after reaching the end of the stream. + This is constant throughout most of the decoding process, but becomes + important once we hit the end of the buffer and stop incrementing bptr + (and instead pretend cnt has lots of bits).*/ + int32_t tell_offs; + /*The end of the current input buffer.*/ + const unsigned char *end; + /*The read pointer for the entropy-coded bits.*/ + const unsigned char *bptr; + /*The difference between the high end of the current range, (low + rng), and + the coded value, minus 1. + This stores up to OD_EC_WINDOW_SIZE bits of that difference, but the + decoder only uses the top 16 bits of the window to decode the next symbol. + As we shift up during renormalization, if we don't have enough bits left in + the window to fill the top 16, we'll read in more bits of the coded + value.*/ + od_ec_window dif; + /*The number of values in the current range.*/ + uint16_t rng; + /*The number of bits of data in the current value.*/ + int16_t cnt; +}; + +/*See entdec.c for further documentation.*/ + +void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); + +OD_WARN_UNUSED_RESULT int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_q15(od_ec_dec *dec, + const uint16_t *cdf, int nsyms) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); + +OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) + OD_ARG_NONNULL(1); + +OD_WARN_UNUSED_RESULT int od_ec_dec_tell(const od_ec_dec *dec) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) + OD_ARG_NONNULL(1); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_ENTDEC_H_ diff --git a/libs/libaom/src/aom_dsp/entenc.c b/libs/libaom/src/aom_dsp/entenc.c new file mode 100644 index 000000000..2fd4493ea --- /dev/null +++ b/libs/libaom/src/aom_dsp/entenc.c @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include <assert.h> +#include "aom_dsp/entenc.h" +#include "aom_dsp/prob.h" + +#if OD_MEASURE_EC_OVERHEAD +#if !defined(M_LOG2E) +#define M_LOG2E (1.4426950408889634073599246810019) +#endif +#define OD_LOG2(x) (M_LOG2E * log(x)) +#endif // OD_MEASURE_EC_OVERHEAD + +/*A range encoder. + See entdec.c and the references for implementation details \cite{Mar79,MNW98}. + + @INPROCEEDINGS{Mar79, + author="Martin, G.N.N.", + title="Range encoding: an algorithm for removing redundancy from a digitised + message", + booktitle="Video \& Data Recording Conference", + year=1979, + address="Southampton", + month=Jul, + URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz" + } + @ARTICLE{MNW98, + author="Alistair Moffat and Radford Neal and Ian H. Witten", + title="Arithmetic Coding Revisited", + journal="{ACM} Transactions on Information Systems", + year=1998, + volume=16, + number=3, + pages="256--294", + month=Jul, + URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf" + }*/ + +/*Takes updated low and range values, renormalizes them so that + 32768 <= rng < 65536 (flushing bytes from low to the pre-carry buffer if + necessary), and stores them back in the encoder context. + low: The new value of low. + rng: The new value of the range.*/ +static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low, + unsigned rng) { + int d; + int c; + int s; + c = enc->cnt; + assert(rng <= 65535U); + /*The number of leading zeros in the 16-bit binary representation of rng.*/ + d = 16 - OD_ILOG_NZ(rng); + s = c + d; + /*TODO: Right now we flush every time we have at least one byte available. + Instead we should use an od_ec_window and flush right before we're about to + shift bits off the end of the window. + For a 32-bit window this is about the same amount of work, but for a 64-bit + window it should be a fair win.*/ + if (s >= 0) { + uint16_t *buf; + uint32_t storage; + uint32_t offs; + unsigned m; + buf = enc->precarry_buf; + storage = enc->precarry_storage; + offs = enc->offs; + if (offs + 2 > storage) { + storage = 2 * storage + 2; + buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage); + if (buf == NULL) { + enc->error = -1; + enc->offs = 0; + return; + } + enc->precarry_buf = buf; + enc->precarry_storage = storage; + } + c += 16; + m = (1 << c) - 1; + if (s >= 8) { + assert(offs < storage); + buf[offs++] = (uint16_t)(low >> c); + low &= m; + c -= 8; + m >>= 8; + } + assert(offs < storage); + buf[offs++] = (uint16_t)(low >> c); + s = c + d - 24; + low &= m; + enc->offs = offs; + } + enc->low = low << d; + enc->rng = rng << d; + enc->cnt = s; +} + +/*Initializes the encoder.
+ size: The initial size of the buffer, in bytes.*/ +void od_ec_enc_init(od_ec_enc *enc, uint32_t size) { + od_ec_enc_reset(enc); + enc->buf = (unsigned char *)malloc(sizeof(*enc->buf) * size); + enc->storage = size; + if (size > 0 && enc->buf == NULL) { + enc->storage = 0; + enc->error = -1; + } + enc->precarry_buf = (uint16_t *)malloc(sizeof(*enc->precarry_buf) * size); + enc->precarry_storage = size; + if (size > 0 && enc->precarry_buf == NULL) { + enc->precarry_storage = 0; + enc->error = -1; + } +} + +/*Reinitializes the encoder.*/ +void od_ec_enc_reset(od_ec_enc *enc) { + enc->offs = 0; + enc->low = 0; + enc->rng = 0x8000; + /*This is initialized to -9 so that it crosses zero after we've accumulated + one byte + one carry bit.*/ + enc->cnt = -9; + enc->error = 0; +#if OD_MEASURE_EC_OVERHEAD + enc->entropy = 0; + enc->nb_symbols = 0; +#endif +} + +/*Frees the buffers used by the encoder.*/ +void od_ec_enc_clear(od_ec_enc *enc) { + free(enc->precarry_buf); + free(enc->buf); +} + +/*Encodes a symbol given its frequency in Q15. + fl: CDF_PROB_TOP minus the cumulative frequency of all symbols that come + before the + one to be encoded. + fh: CDF_PROB_TOP minus the cumulative frequency of all symbols up to and + including + the one to be encoded.*/ +static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh, int s, + int nsyms) { + od_ec_window l; + unsigned r; + unsigned u; + unsigned v; + l = enc->low; + r = enc->rng; + assert(32768U <= r); + assert(fh <= fl); + assert(fl <= 32768U); + assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0); + const int N = nsyms - 1; + if (fl < CDF_PROB_TOP) { + u = ((r >> 8) * (uint32_t)(fl >> EC_PROB_SHIFT) >> + (7 - EC_PROB_SHIFT - CDF_SHIFT)) + + EC_MIN_PROB * (N - (s - 1)); + v = ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >> + (7 - EC_PROB_SHIFT - CDF_SHIFT)) + + EC_MIN_PROB * (N - (s + 0)); + l += r - u; + r = u - v; + } else { + r -= ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >> + (7 - EC_PROB_SHIFT - CDF_SHIFT)) + + EC_MIN_PROB * (N - (s + 0)); + } + od_ec_enc_normalize(enc, l, r); +#if OD_MEASURE_EC_OVERHEAD + enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / CDF_PROB_TOP); + enc->nb_symbols++; +#endif +} + +/*Encode a single binary value. + val: The value to encode (0 or 1). + f: The probability that the val is one, scaled by 32768.*/ +void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) { + od_ec_window l; + unsigned r; + unsigned v; + assert(0 < f); + assert(f < 32768U); + l = enc->low; + r = enc->rng; + assert(32768U <= r); + v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)); + v += EC_MIN_PROB; + if (val) l += r - v; + r = val ? v : r - v; + od_ec_enc_normalize(enc, l, r); +#if OD_MEASURE_EC_OVERHEAD + enc->entropy -= OD_LOG2((double)(val ? f : (32768 - f)) / 32768.); + enc->nb_symbols++; +#endif +} + +/*Encodes a symbol given a cumulative distribution function (CDF) table in Q15. + s: The index of the symbol to encode. + icdf: 32768 minus the CDF, such that symbol s falls in the range + [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]). + The values must be monotonically decreasing, and icdf[nsyms - 1] must + be 0. + nsyms: The number of symbols in the alphabet. + This should be at most 16.*/ +void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *icdf, + int nsyms) { + (void)nsyms; + assert(s >= 0); + assert(s < nsyms); + assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP)); + od_ec_encode_q15(enc, s > 0 ?
icdf[s - 1] : OD_ICDF(0), icdf[s], s, nsyms); +} + +/*Overwrites a few bits at the very start of an existing stream, after they + have already been encoded. + This makes it possible to have a few flags up front, where it is easy for + decoders to access them without parsing the whole stream, even if their + values are not determined until late in the encoding process, without having + to buffer all the intermediate symbols in the encoder. + In order for this to work, at least nbits bits must have already been encoded + using probabilities that are an exact power of two. + The encoder can verify the number of encoded bits is sufficient, but cannot + check this latter condition. + val: The bits to encode (in the least nbits significant bits). + They will be decoded in order from most-significant to least. + nbits: The number of bits to overwrite. + This must be no more than 8.*/ +void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) { + int shift; + unsigned mask; + assert(nbits >= 0); + assert(nbits <= 8); + assert(val < 1U << nbits); + shift = 8 - nbits; + mask = ((1U << nbits) - 1) << shift; + if (enc->offs > 0) { + /*The first byte has been finalized.*/ + enc->precarry_buf[0] = + (uint16_t)((enc->precarry_buf[0] & ~mask) | val << shift); + } else if (9 + enc->cnt + (enc->rng == 0x8000) > nbits) { + /*The first byte has yet to be output.*/ + enc->low = (enc->low & ~((od_ec_window)mask << (16 + enc->cnt))) | + (od_ec_window)val << (16 + enc->cnt + shift); + } else { + /*The encoder hasn't even encoded _nbits of data yet.*/ + enc->error = -1; + } +} + +#if OD_MEASURE_EC_OVERHEAD +#include +#endif + +/*Indicates that there are no more symbols to encode. + All remaining output bytes are flushed to the output buffer. + od_ec_enc_reset() should be called before using the encoder again. + bytes: Returns the size of the encoded data in the returned buffer. + Return: A pointer to the start of the final buffer, or NULL if there was an + encoding error.*/ +unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) { + unsigned char *out; + uint32_t storage; + uint16_t *buf; + uint32_t offs; + od_ec_window m; + od_ec_window e; + od_ec_window l; + int c; + int s; + if (enc->error) return NULL; +#if OD_MEASURE_EC_OVERHEAD + { + uint32_t tell; + /* Don't count the 1 bit we lose to raw bits as overhead. 
*/ + tell = od_ec_enc_tell(enc) - 1; + fprintf(stderr, "overhead: %f%%\n", + 100 * (tell - enc->entropy) / enc->entropy); + fprintf(stderr, "efficiency: %f bits/symbol\n", + (double)tell / enc->nb_symbols); + } +#endif + /*We output the minimum number of bits that ensures that the symbols encoded + thus far will be decoded correctly regardless of the bits that follow.*/ + l = enc->low; + c = enc->cnt; + s = 10; + m = 0x3FFF; + e = ((l + m) & ~m) | (m + 1); + s += c; + offs = enc->offs; + buf = enc->precarry_buf; + if (s > 0) { + unsigned n; + storage = enc->precarry_storage; + if (offs + ((s + 7) >> 3) > storage) { + storage = storage * 2 + ((s + 7) >> 3); + buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage); + if (buf == NULL) { + enc->error = -1; + return NULL; + } + enc->precarry_buf = buf; + enc->precarry_storage = storage; + } + n = (1 << (c + 16)) - 1; + do { + assert(offs < storage); + buf[offs++] = (uint16_t)(e >> (c + 16)); + e &= n; + s -= 8; + c -= 8; + n >>= 8; + } while (s > 0); + } + /*Make sure there's enough room for the entropy-coded bits.*/ + out = enc->buf; + storage = enc->storage; + c = OD_MAXI((s + 7) >> 3, 0); + if (offs + c > storage) { + storage = offs + c; + out = (unsigned char *)realloc(out, sizeof(*out) * storage); + if (out == NULL) { + enc->error = -1; + return NULL; + } + enc->buf = out; + enc->storage = storage; + } + *nbytes = offs; + /*Perform carry propagation.*/ + assert(offs <= storage); + out = out + storage - offs; + c = 0; + while (offs > 0) { + offs--; + c = buf[offs] + c; + out[offs] = (unsigned char)c; + c >>= 8; + } + /*Note: Unless there's an allocation error, if you keep encoding into the + current buffer and call this function again later, everything will work + just fine (you won't get a new packet out, but you will get a single + buffer with the new data appended to the old). + However, this function is O(N) where N is the amount of data coded so far, + so calling it more than once for a given packet is a bad idea.*/ + return out; +} + +/*Returns the number of bits "used" by the encoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Warning: The value returned by this function can decrease compared to an + earlier call, even after encoding more data, if there is an encoding error + (i.e., a failure to allocate enough space for the output buffer). + Return: The number of bits. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +int od_ec_enc_tell(const od_ec_enc *enc) { + /*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra + bit, which we reserve for terminating the stream.*/ + return (enc->cnt + 10) + enc->offs * 8; +} + +/*Returns the number of bits "used" by the encoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Warning: The value returned by this function can decrease compared to an + earlier call, even after encoding more data, if there is an encoding error + (i.e., a failure to allocate enough space for the output buffer). + Return: The number of bits scaled by 2**OD_BITRES. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) { + return od_ec_tell_frac(od_ec_enc_tell(enc), enc->rng); +} + +/*Saves an entropy coder checkpoint to dst.
+ This allows an encoder to reverse a series of entropy coder + decisions if it decides that the information would have been + better coded some other way.*/ +void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src) { + OD_COPY(dst, src, 1); +} + +/*Restores an entropy coder checkpoint saved by od_ec_enc_checkpoint. + This can only be used to restore from checkpoints earlier in the target + state's history: you cannot switch backwards and forwards or otherwise + switch to a state which isn't a causal ancestor of the current state. + Restore is also incompatible with patching the initial bits, as the + changes will remain in the restored version.*/ +void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) { + unsigned char *buf; + uint32_t storage; + uint16_t *precarry_buf; + uint32_t precarry_storage; + assert(dst->storage >= src->storage); + assert(dst->precarry_storage >= src->precarry_storage); + buf = dst->buf; + storage = dst->storage; + precarry_buf = dst->precarry_buf; + precarry_storage = dst->precarry_storage; + OD_COPY(dst, src, 1); + dst->buf = buf; + dst->storage = storage; + dst->precarry_buf = precarry_buf; + dst->precarry_storage = precarry_storage; +} diff --git a/libs/libaom/src/aom_dsp/entenc.h b/libs/libaom/src/aom_dsp/entenc.h new file mode 100644 index 000000000..3551d4250 --- /dev/null +++ b/libs/libaom/src/aom_dsp/entenc.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_ENTENC_H_ +#define AOM_AOM_DSP_ENTENC_H_ +#include <stddef.h> +#include "aom_dsp/entcode.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct od_ec_enc od_ec_enc; + +#define OD_MEASURE_EC_OVERHEAD (0) + +/*The entropy encoder context.*/ +struct od_ec_enc { + /*Buffered output.
+ This contains only the raw bits until the final call to od_ec_enc_done(), + where all the arithmetic-coded data gets prepended to it.*/ + unsigned char *buf; + /*The size of the buffer.*/ + uint32_t storage; + /*A buffer for output bytes with their associated carry flags.*/ + uint16_t *precarry_buf; + /*The size of the pre-carry buffer.*/ + uint32_t precarry_storage; + /*The offset at which the next entropy-coded byte will be written.*/ + uint32_t offs; + /*The low end of the current range.*/ + od_ec_window low; + /*The number of values in the current range.*/ + uint16_t rng; + /*The number of bits of data in the current value.*/ + int16_t cnt; + /*Nonzero if an error occurred.*/ + int error; +#if OD_MEASURE_EC_OVERHEAD + double entropy; + int nb_symbols; +#endif +}; + +/*See entenc.c for further documentation.*/ + +void od_ec_enc_init(od_ec_enc *enc, uint32_t size) OD_ARG_NONNULL(1); +void od_ec_enc_reset(od_ec_enc *enc) OD_ARG_NONNULL(1); +void od_ec_enc_clear(od_ec_enc *enc) OD_ARG_NONNULL(1); + +void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f_q15) + OD_ARG_NONNULL(1); +void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(3); + +void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) + OD_ARG_NONNULL(1); + +void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT unsigned char *od_ec_enc_done(od_ec_enc *enc, + uint32_t *nbytes) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); + +OD_WARN_UNUSED_RESULT int od_ec_enc_tell(const od_ec_enc *enc) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) + OD_ARG_NONNULL(1); + +void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src); +void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_ENTENC_H_ diff --git a/libs/libaom/src/aom_dsp/fastssim.c b/libs/libaom/src/aom_dsp/fastssim.c new file mode 100644 index 000000000..3804519b3 --- /dev/null +++ b/libs/libaom/src/aom_dsp/fastssim.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + * + * This code was originally written by: Nathan E. Egge, at the Daala + * project. + */ +#include <assert.h> +#include <math.h> +#include <stdlib.h> +#include <string.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/ssim.h" +#include "aom_ports/system_state.h" + +typedef struct fs_level fs_level; +typedef struct fs_ctx fs_ctx; + +#define SSIM_C1 (255 * 255 * 0.01 * 0.01) +#define SSIM_C2 (255 * 255 * 0.03 * 0.03) +#define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01) +#define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01) +#define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03) +#define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03) + +#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b)) +#define FS_MAXI(_a, _b) ((_a) > (_b) ?
(_a) : (_b)) + +struct fs_level { + uint32_t *im1; + uint32_t *im2; + double *ssim; + int w; + int h; +}; + +struct fs_ctx { + fs_level *level; + int nlevels; + unsigned *col_buf; +}; + +static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { + unsigned char *data; + size_t data_size; + int lw; + int lh; + int l; + lw = (_w + 1) >> 1; + lh = (_h + 1) >> 1; + data_size = + _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf); + for (l = 0; l < _nlevels; l++) { + size_t im_size; + size_t level_size; + im_size = lw * (size_t)lh; + level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); + level_size += sizeof(*_ctx->level[l].ssim) - 1; + level_size /= sizeof(*_ctx->level[l].ssim); + level_size += im_size; + level_size *= sizeof(*_ctx->level[l].ssim); + data_size += level_size; + lw = (lw + 1) >> 1; + lh = (lh + 1) >> 1; + } + data = (unsigned char *)malloc(data_size); + _ctx->level = (fs_level *)data; + _ctx->nlevels = _nlevels; + data += _nlevels * sizeof(*_ctx->level); + lw = (_w + 1) >> 1; + lh = (_h + 1) >> 1; + for (l = 0; l < _nlevels; l++) { + size_t im_size; + size_t level_size; + _ctx->level[l].w = lw; + _ctx->level[l].h = lh; + im_size = lw * (size_t)lh; + level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); + level_size += sizeof(*_ctx->level[l].ssim) - 1; + level_size /= sizeof(*_ctx->level[l].ssim); + level_size *= sizeof(*_ctx->level[l].ssim); + _ctx->level[l].im1 = (uint32_t *)data; + _ctx->level[l].im2 = _ctx->level[l].im1 + im_size; + data += level_size; + _ctx->level[l].ssim = (double *)data; + data += im_size * sizeof(*_ctx->level[l].ssim); + lw = (lw + 1) >> 1; + lh = (lh + 1) >> 1; + } + _ctx->col_buf = (unsigned *)data; +} + +static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); } + +static void fs_downsample_level(fs_ctx *_ctx, int _l) { + const uint32_t *src1; + const uint32_t *src2; + uint32_t *dst1; + uint32_t *dst2; + int w2; + int h2; + int w; + int h; + int i; + int j; + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + dst1 = _ctx->level[_l].im1; + dst2 = _ctx->level[_l].im2; + w2 = _ctx->level[_l - 1].w; + h2 = _ctx->level[_l - 1].h; + src1 = _ctx->level[_l - 1].im1; + src2 = _ctx->level[_l - 1].im2; + for (j = 0; j < h; j++) { + int j0offs; + int j1offs; + j0offs = 2 * j * w2; + j1offs = FS_MINI(2 * j + 1, h2) * w2; + for (i = 0; i < w; i++) { + int i0; + int i1; + i0 = 2 * i; + i1 = FS_MINI(i0 + 1, w2); + dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] + + src1[j1offs + i0] + src1[j1offs + i1]; + dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] + + src2[j1offs + i0] + src2[j1offs + i1]; + } + } +} + +static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1, + int _s1ystride, const uint8_t *_src2, + int _s2ystride, int _w, int _h, uint32_t shift, + int buf_is_hbd) { + uint32_t *dst1; + uint32_t *dst2; + int w; + int h; + int i; + int j; + w = _ctx->level[0].w; + h = _ctx->level[0].h; + dst1 = _ctx->level[0].im1; + dst2 = _ctx->level[0].im2; + for (j = 0; j < h; j++) { + int j0; + int j1; + j0 = 2 * j; + j1 = FS_MINI(j0 + 1, _h); + for (i = 0; i < w; i++) { + int i0; + int i1; + i0 = 2 * i; + i1 = FS_MINI(i0 + 1, _w); + if (!buf_is_hbd) { + dst1[j * w + i] = + _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] + + _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1]; + dst2[j * w + i] = + _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] + + _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1]; + } else { + uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1); + uint16_t *src2s = 
CONVERT_TO_SHORTPTR(_src2); + dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) + + (src1s[j0 * _s1ystride + i1] >> shift) + + (src1s[j1 * _s1ystride + i0] >> shift) + + (src1s[j1 * _s1ystride + i1] >> shift); + dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) + + (src2s[j0 * _s2ystride + i1] >> shift) + + (src2s[j1 * _s2ystride + i0] >> shift) + + (src2s[j1 * _s2ystride + i1] >> shift); + } + } + } +} + +static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { + unsigned *col_sums_x; + unsigned *col_sums_y; + uint32_t *im1; + uint32_t *im2; + double *ssim; + double c1; + int w; + int h; + int j0offs; + int j1offs; + int i; + int j; + double ssim_c1 = SSIM_C1; + + if (bit_depth == 10) ssim_c1 = SSIM_C1_10; + if (bit_depth == 12) ssim_c1 = SSIM_C1_12; + + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + col_sums_x = _ctx->col_buf; + col_sums_y = col_sums_x + w; + im1 = _ctx->level[_l].im1; + im2 = _ctx->level[_l].im2; + for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i]; + for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i]; + for (j = 1; j < 4; j++) { + j1offs = FS_MINI(j, h - 1) * w; + for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; + } + ssim = _ctx->level[_l].ssim; + c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l)); + for (j = 0; j < h; j++) { + unsigned mux; + unsigned muy; + int i0; + int i1; + mux = 5 * col_sums_x[0]; + muy = 5 * col_sums_y[0]; + for (i = 1; i < 4; i++) { + i1 = FS_MINI(i, w - 1); + mux += col_sums_x[i1]; + muy += col_sums_y[i1]; + } + for (i = 0; i < w; i++) { + ssim[j * w + i] *= (2 * mux * (double)muy + c1) / + (mux * (double)mux + muy * (double)muy + c1); + if (i + 1 < w) { + i0 = FS_MAXI(0, i - 4); + i1 = FS_MINI(i + 4, w - 1); + mux += col_sums_x[i1] - col_sums_x[i0]; + muy += col_sums_y[i1] - col_sums_y[i0]; + } + } + if (j + 1 < h) { + j0offs = FS_MAXI(0, j - 4) * w; + for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i]; + j1offs = FS_MINI(j + 4, h - 1) * w; + for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; + } + } +} + +#define FS_COL_SET(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] = gx * (double)gx; \ + col_sums_gy2[(_col)] = gy * (double)gy; \ + col_sums_gxgy[(_col)] = gx * (double)gy; \ + } while (0) + +#define FS_COL_ADD(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] += gx * (double)gx; \ + col_sums_gy2[(_col)] += gy * (double)gy; \ + col_sums_gxgy[(_col)] += gx * (double)gy; \ + } while (0) + +#define FS_COL_SUB(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] -= gx * (double)gx; \ + col_sums_gy2[(_col)] -= gy * (double)gy; \ + col_sums_gxgy[(_col)] -= gx * (double)gy; \ + } while (0) + +#define FS_COL_COPY(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \ + } while (0) + +#define FS_COL_HALVE(_col1,
_col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \ + } while (0) + +#define FS_COL_DOUBLE(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \ + } while (0) + +static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { + uint32_t *im1; + uint32_t *im2; + unsigned *gx_buf; + unsigned *gy_buf; + double *ssim; + double col_sums_gx2[8]; + double col_sums_gy2[8]; + double col_sums_gxgy[8]; + double c2; + int stride; + int w; + int h; + int i; + int j; + double ssim_c2 = SSIM_C2; + if (bit_depth == 10) ssim_c2 = SSIM_C2_10; + if (bit_depth == 12) ssim_c2 = SSIM_C2_12; + + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + im1 = _ctx->level[_l].im1; + im2 = _ctx->level[_l].im2; + ssim = _ctx->level[_l].ssim; + gx_buf = _ctx->col_buf; + stride = w + 8; + gy_buf = gx_buf + 8 * stride; + memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf)); + c2 = ssim_c2 * (1 << 4 * _l) * 16 * 104; + for (j = 0; j < h + 4; j++) { + if (j < h - 1) { + for (i = 0; i < w - 1; i++) { + unsigned g1; + unsigned g2; + unsigned gx; + unsigned gy; + g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]); + g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]); + gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); + g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]); + g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]); + gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); + gx_buf[(j & 7) * stride + i + 4] = gx; + gy_buf[(j & 7) * stride + i + 4] = gy; + } + } else { + memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf)); + memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf)); + } + if (j >= 4) { + int k; + col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0; + col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0; + col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] = + col_sums_gxgy[0] = 0; + for (i = 4; i < 8; i++) { + FS_COL_SET(i, -1, 0); + FS_COL_ADD(i, 0, 0); + for (k = 1; k < 8 - i; k++) { + FS_COL_DOUBLE(i, i); + FS_COL_ADD(i, -k - 1, 0); + FS_COL_ADD(i, k, 0); + } + } + for (i = 0; i < w; i++) { + double mugx2; + double mugy2; + double mugxgy; + mugx2 = col_sums_gx2[0]; + for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k]; + mugy2 = col_sums_gy2[0]; + for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k]; + mugxgy = col_sums_gxgy[0]; + for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k]; + ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2); + if (i + 1 < w) { + FS_COL_SET(0, -1, 1); + FS_COL_ADD(0, 0, 1); + FS_COL_SUB(2, -3, 2); + FS_COL_SUB(2, 2, 2); + FS_COL_HALVE(1, 2); + FS_COL_SUB(3, -4, 3); + FS_COL_SUB(3, 3, 3); + FS_COL_HALVE(2, 3); + FS_COL_COPY(3, 4); + FS_COL_DOUBLE(4, 5); + FS_COL_ADD(4, -4, 5); + FS_COL_ADD(4, 3, 5); + FS_COL_DOUBLE(5, 6); + FS_COL_ADD(5, -3, 6); + FS_COL_ADD(5, 2, 6); + FS_COL_DOUBLE(6, 7); + FS_COL_ADD(6, -2, 7); + FS_COL_ADD(6, 1, 7); + FS_COL_SET(7, -1, 8); + FS_COL_ADD(7, 0, 8); + } + } + } + } +} + +#define FS_NLEVELS (4) + +/*These weights were derived from the default weights found in Wang's original + Matlab implementation: {0.0448, 0.2856, 0.3001, 0.2363, 0.1333}.
+ We drop the finest scale and renormalize the rest to sum to 1.*/ + +static const double FS_WEIGHTS[FS_NLEVELS] = { + 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625 +}; + +static double fs_average(fs_ctx *_ctx, int _l) { + double *ssim; + double ret; + int w; + int h; + int i; + int j; + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + ssim = _ctx->level[_l].ssim; + ret = 0; + for (j = 0; j < h; j++) + for (i = 0; i < w; i++) ret += ssim[j * w + i]; + return pow(ret / (w * h), FS_WEIGHTS[_l]); +} + +static double convert_ssim_db(double _ssim, double _weight) { + assert(_weight >= _ssim); + if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB; + return 10 * (log10(_weight) - log10(_weight - _ssim)); +} + +static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst, + int _dystride, int _w, int _h, uint32_t _bd, + uint32_t _shift, int buf_is_hbd) { + fs_ctx ctx; + double ret; + int l; + ret = 1; + fs_ctx_init(&ctx, _w, _h, FS_NLEVELS); + fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _shift, + buf_is_hbd); + for (l = 0; l < FS_NLEVELS - 1; l++) { + fs_calc_structure(&ctx, l, _bd); + ret *= fs_average(&ctx, l); + fs_downsample_level(&ctx, l + 1); + } + fs_calc_structure(&ctx, l, _bd); + fs_apply_luminance(&ctx, l, _bd); + ret *= fs_average(&ctx, l); + fs_ctx_clear(&ctx); + return ret; +} + +double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *ssim_y, + double *ssim_u, double *ssim_v, uint32_t bd, + uint32_t in_bd) { + double ssimv; + uint32_t bd_shift = 0; + aom_clear_system_state(); + assert(bd >= in_bd); + assert(source->flags == dest->flags); + int buf_is_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH; + bd_shift = bd - in_bd; + + *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer, + dest->y_stride, source->y_crop_width, + source->y_crop_height, in_bd, bd_shift, buf_is_hbd); + *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, bd_shift, buf_is_hbd); + *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, bd_shift, buf_is_hbd); + ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v)); + return convert_ssim_db(ssimv, 1.0); +} diff --git a/libs/libaom/src/aom_dsp/fft.c b/libs/libaom/src/aom_dsp/fft.c new file mode 100644 index 000000000..0ba71cfb3 --- /dev/null +++ b/libs/libaom/src/aom_dsp/fft.c @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/fft_common.h" + +static INLINE void simple_transpose(const float *A, float *B, int n) { + for (int y = 0; y < n; y++) { + for (int x = 0; x < n; x++) { + B[y * n + x] = A[x * n + y]; + } + } +} + +// The 1d transform is real to complex and packs the complex results in +// a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real +// components, followed by the n/2 - 1 imaginary components). After the +// transform is done on the rows, the first n/2 + 1 columns are real, and +// the remaining are the imaginary components. After the transform on the +// columns, the region of [0, n/2]x[0, n/2] contains the real part of +// fft of the real columns. The real part of the 2d fft also includes the +// imaginary part of transformed imaginary columns. This function assembles +// the correct outputs while putting the real and imaginary components +// next to each other. +static INLINE void unpack_2d_output(const float *col_fft, float *output, + int n) { + for (int y = 0; y <= n / 2; ++y) { + const int y2 = y + n / 2; + const int y_extra = y2 > n / 2 && y2 < n; + + for (int x = 0; x <= n / 2; ++x) { + const int x2 = x + n / 2; + const int x_extra = x2 > n / 2 && x2 < n; + output[2 * (y * n + x)] = + col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); + output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) + + (x_extra ? col_fft[y * n + x2] : 0); + if (y_extra) { + output[2 * ((n - y) * n + x)] = + col_fft[y * n + x] + + (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); + output[2 * ((n - y) * n + x) + 1] = + -(y_extra ? col_fft[y2 * n + x] : 0) + + (x_extra ? col_fft[y * n + x2] : 0); + } + } + } +} + +void aom_fft_2d_gen(const float *input, float *temp, float *output, int n, + aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose, + aom_fft_unpack_func_t unpack, int vec_size) { + for (int x = 0; x < n; x += vec_size) { + tform(input + x, output + x, n); + } + transpose(output, temp, n); + + for (int x = 0; x < n; x += vec_size) { + tform(temp + x, output + x, n); + } + transpose(output, temp, n); + + unpack(temp, output, n); +} + +static INLINE void store_float(float *output, float input) { *output = input; } +static INLINE float add_float(float a, float b) { return a + b; } +static INLINE float sub_float(float a, float b) { return a - b; } +static INLINE float mul_float(float a, float b) { return a * b; } + +GEN_FFT_2(void, float, float, float, *, store_float); +GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float, + sub_float); +GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float); +GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float); +GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float); + +void aom_fft2x2_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_fft4x4_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_fft8x8_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_fft16x16_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 
16, aom_fft1d_16_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_fft32x32_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n, + aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi, + aom_fft_1d_func_t ifft_multi, + aom_fft_transpose_func_t transpose, int vec_size) { + // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft + // and get real outputs. + for (int y = 0; y <= n / 2; ++y) { + output[y * n] = input[2 * y * n]; + output[y * n + 1] = input[2 * (y * n + n / 2)]; + } + for (int y = n / 2 + 1; y < n; ++y) { + output[y * n] = input[2 * (y - n / 2) * n + 1]; + output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1]; + } + + for (int i = 0; i < 2; i += vec_size) { + ifft_multi(output + i, temp + i, n); + } + + // For the other columns, since we don't have a full ifft for complex inputs + // we have to split them into the real and imaginary counterparts. + // Pack the real component, then the imaginary components. + for (int y = 0; y < n; ++y) { + for (int x = 1; x < n / 2; ++x) { + output[y * n + (x + 1)] = input[2 * (y * n + x)]; + } + for (int x = 1; x < n / 2; ++x) { + output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1]; + } + } + for (int y = 2; y < vec_size; y++) { + fft_single(output + y, temp + y, n); + } + // This is the part that can be sped up with SIMD + for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) { + fft_multi(output + y, temp + y, n); + } + + // Put the 0 and n/2 th results in the correct place. + for (int x = 0; x < n; ++x) { + output[x] = temp[x * n]; + output[(n / 2) * n + x] = temp[x * n + 1]; + } + // This rearranges and transposes. + for (int y = 1; y < n / 2; ++y) { + // Fill in the real columns + for (int x = 0; x <= n / 2; ++x) { + output[x + y * n] = + temp[(y + 1) + x * n] + + ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0); + } + for (int x = n / 2 + 1; x < n; ++x) { + output[x + y * n] = temp[(y + 1) + (n - x) * n] - + temp[(y + n / 2) + ((n - x) + n / 2) * n]; + } + // Fill in the imag columns + for (int x = 0; x <= n / 2; ++x) { + output[x + (y + n / 2) * n] = + temp[(y + n / 2) + x * n] - + ((x > 0 && x < n / 2) ? 
temp[(y + 1) + (x + n / 2) * n] : 0);
+    }
+    for (int x = n / 2 + 1; x < n; ++x) {
+      output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
+                                    temp[(y + n / 2) + (n - x) * n];
+    }
+  }
+  for (int y = 0; y < n; y += vec_size) {
+    ifft_multi(output + y, temp + y, n);
+  }
+  transpose(temp, output, n);
+}
+
+GEN_IFFT_2(void, float, float, float, *, store_float);
+GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
+           sub_float);
+GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float,
+           sub_float, mul_float);
+GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float,
+            sub_float, mul_float);
+GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float,
+            sub_float, mul_float);
+
+void aom_ifft2x2_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float,
+                  aom_ifft1d_2_float, simple_transpose, 1);
+}
+
+void aom_ifft4x4_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float,
+                  aom_ifft1d_4_float, simple_transpose, 1);
+}
+
+void aom_ifft8x8_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float,
+                  aom_ifft1d_8_float, simple_transpose, 1);
+}
+
+void aom_ifft16x16_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+                  aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1);
+}
+
+void aom_ifft32x32_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+                  aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1);
+}
diff --git a/libs/libaom/src/aom_dsp/fft_common.h b/libs/libaom/src/aom_dsp/fft_common.h
new file mode 100644
index 000000000..5137331ae
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/fft_common.h
@@ -0,0 +1,1050 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FFT_COMMON_H_
+#define AOM_AOM_DSP_FFT_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief A function pointer for computing 1d fft and ifft.
+ *
+ * The function will point to an implementation for a specific transform size,
+ * and may perform the transforms using vectorized instructions.
+ *
+ * For a non-vectorized forward transform of size n, the input and output
+ * buffers will be size n. The output takes advantage of conjugate symmetry and
+ * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
+ * (r_{j}, i_{j}) is the complex output for index j.
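+ * For example, for n = 4 the packed forward output is [r_0, r_1, r_2, i_1]:
+ * n/2 + 1 = 3 real components followed by n/2 - 1 = 1 imaginary component.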
+ *
+ * An inverse transform will assume that the complex "input" is packed
+ * similarly. Its output will be real.
+ *
+ * Non-vectorized transforms (e.g., on a single row) would use a stride of 1.
+ *
+ * Vectorized implementations are parallelized along the columns so that the
+ * fft can be performed on multiple columns at a time. In such cases the data
+ * block for input and output is typically square (n x n) and the stride will
+ * correspond to the spacing between rows. At minimum, the input size must be
+ * n x simd_vector_length.
+ *
+ * \param[in]  input   Input buffer. See above for size restrictions.
+ * \param[out] output  Output buffer. See above for size restrictions.
+ * \param[in]  stride  The spacing, in number of elements, between rows (or,
+ *                     for a non-vectorized transform, between elements).
+ */
+typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
+                                  int stride);
+
+// Declare some of the forward non-vectorized transforms, which are used in
+// some of the vectorized implementations.
+void aom_fft1d_4_float(const float *input, float *output, int stride);
+void aom_fft1d_8_float(const float *input, float *output, int stride);
+void aom_fft1d_16_float(const float *input, float *output, int stride);
+void aom_fft1d_32_float(const float *input, float *output, int stride);
+
+/*!\brief Function pointer for transposing a matrix of floats.
+ *
+ * \param[in]  input   Input buffer (size n x n)
+ * \param[out] output  Output buffer (size n x n)
+ * \param[in]  n       Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
+                                         int n);
+
+/*!\brief Function pointer for re-arranging intermediate 2d transform results.
+ *
+ * After re-arrangement, the real and imaginary components will be packed
+ * tightly next to each other.
+ *
+ * \param[in]  input   Input buffer (size n x n)
+ * \param[out] output  Output buffer (size 2 x n x n)
+ * \param[in]  n       Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);
+
+/*!\brief Performs a 2d fft with the given functions.
+ *
+ * This generator function allows for multiple different implementations of 2d
+ * fft with different vector operations, without having to redefine the main
+ * body multiple times.
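+ *
+ * As a usage sketch: the plain C wrappers in fft.c (e.g. aom_fft8x8_float_c)
+ * call this generator with the scalar 1d transforms and vec_size = 1, so a
+ * caller only needs to size the buffers as described above:
+ *
+ *   float input[8 * 8];       // 8x8 block of samples, filled by the caller
+ *   float temp[8 * 8];        // scratch space
+ *   float output[2 * 8 * 8];  // receives interleaved real/imag results
+ *   aom_fft8x8_float_c(input, temp, output);
+ *   // output[2 * (y * 8 + x)] and output[2 * (y * 8 + x) + 1] now hold the
+ *   // real and imaginary parts of frequency bin (x, y).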
+ * + * \param[in] input Input buffer to run the transform on (size n x n) + * \param[out] temp Working buffer for computing the transform (size n x n) + * \param[out] output Output buffer (size 2 x n x n) + * \param[in] tform Forward transform function + * \param[in] transpose Transpose function (for n x n matrix) + * \param[in] unpack Unpack function used to massage outputs to correct form + * \param[in] vec_size Vector size (the transform is done vec_size units at + * a time) + */ +void aom_fft_2d_gen(const float *input, float *temp, float *output, int n, + aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose, + aom_fft_unpack_func_t unpack, int vec_size); + +/*!\brief Perform a 2d inverse fft with the given helper functions + * + * \param[in] input Input buffer to run the transform on (size 2 x n x n) + * \param[out] temp Working buffer for computations (size 2 x n x n) + * \param[out] output Output buffer (size n x n) + * \param[in] fft_single Forward transform function (non vectorized) + * \param[in] fft_multi Forward transform function (vectorized) + * \param[in] ifft_multi Inverse transform function (vectorized) + * \param[in] transpose Transpose function (for n x n matrix) + * \param[in] vec_size Vector size (the transform is done vec_size + * units at a time) + */ +void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n, + aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi, + aom_fft_1d_func_t ifft_multi, + aom_fft_transpose_func_t transpose, int vec_size); +#ifdef __cplusplus +} +#endif + +// The macros below define 1D fft/ifft for different data types and for +// different simd vector intrinsic types. + +#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store) \ + ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + store(output + 0 * stride, i0 + i1); \ + store(output + 1 * stride, i0 - i1); \ + } + +#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \ + ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC w0 = add(i0, i2); \ + const T_VEC w1 = sub(i0, i2); \ + const T_VEC w2 = add(i1, i3); \ + const T_VEC w3 = sub(i1, i3); \ + store(output + 0 * stride, add(w0, w2)); \ + store(output + 1 * stride, w1); \ + store(output + 2 * stride, sub(w0, w2)); \ + store(output + 3 * stride, sub(kWeight0, w3)); \ + } + +#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \ + ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC w0 = add(i0, i4); \ + const T_VEC w1 = sub(i0, i4); \ + const T_VEC w2 = add(i2, i6); \ + const T_VEC w3 = sub(i2, i6); \ + const T_VEC w4 = add(w0, w2); \ + const T_VEC w5 = sub(w0, w2); \ + const T_VEC w7 = add(i1, i5); \ + const T_VEC 
w8 = sub(i1, i5); \ + const T_VEC w9 = add(i3, i7); \ + const T_VEC w10 = sub(i3, i7); \ + const T_VEC w11 = add(w7, w9); \ + const T_VEC w12 = sub(w7, w9); \ + store(output + 0 * stride, add(w4, w11)); \ + store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10)))); \ + store(output + 2 * stride, w5); \ + store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10)))); \ + store(output + 4 * stride, sub(w4, w11)); \ + store(output + 5 * stride, \ + sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8)))); \ + store(output + 6 * stride, sub(kWeight0, w12)); \ + store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8)))); \ + } + +#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC kWeight3 = constant(0.92388f); \ + const T_VEC kWeight4 = constant(0.382683f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC i8 = load(input + 8 * stride); \ + const T_VEC i9 = load(input + 9 * stride); \ + const T_VEC i10 = load(input + 10 * stride); \ + const T_VEC i11 = load(input + 11 * stride); \ + const T_VEC i12 = load(input + 12 * stride); \ + const T_VEC i13 = load(input + 13 * stride); \ + const T_VEC i14 = load(input + 14 * stride); \ + const T_VEC i15 = load(input + 15 * stride); \ + const T_VEC w0 = add(i0, i8); \ + const T_VEC w1 = sub(i0, i8); \ + const T_VEC w2 = add(i4, i12); \ + const T_VEC w3 = sub(i4, i12); \ + const T_VEC w4 = add(w0, w2); \ + const T_VEC w5 = sub(w0, w2); \ + const T_VEC w7 = add(i2, i10); \ + const T_VEC w8 = sub(i2, i10); \ + const T_VEC w9 = add(i6, i14); \ + const T_VEC w10 = sub(i6, i14); \ + const T_VEC w11 = add(w7, w9); \ + const T_VEC w12 = sub(w7, w9); \ + const T_VEC w14 = add(w4, w11); \ + const T_VEC w15 = sub(w4, w11); \ + const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \ + sub(sub(kWeight0, w3), \ + mul(kWeight2, add(w10, w8))) }; \ + const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \ + sub(w3, mul(kWeight2, add(w10, w8))) }; \ + const T_VEC w19 = add(i1, i9); \ + const T_VEC w20 = sub(i1, i9); \ + const T_VEC w21 = add(i5, i13); \ + const T_VEC w22 = sub(i5, i13); \ + const T_VEC w23 = add(w19, w21); \ + const T_VEC w24 = sub(w19, w21); \ + const T_VEC w26 = add(i3, i11); \ + const T_VEC w27 = sub(i3, i11); \ + const T_VEC w28 = add(i7, i15); \ + const T_VEC w29 = sub(i7, i15); \ + const T_VEC w30 = add(w26, w28); \ + const T_VEC w31 = sub(w26, w28); \ + const T_VEC w33 = add(w23, w30); \ + const T_VEC w34 = sub(w23, w30); \ + const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \ + sub(sub(kWeight0, w22), \ + mul(kWeight2, add(w29, w27))) }; \ + const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \ + sub(w22, mul(kWeight2, add(w29, w27))) }; \ + store(output + 0 * stride, add(w14, w33)); \ + store(output + 1 * stride, \ + add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \ + store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31)))); \ + store(output + 3 * stride, \ + add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \ + 
store(output + 4 * stride, w15); \ + store(output + 5 * stride, \ + add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])), \ + mul(kWeight3, w37[1])))); \ + store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31)))); \ + store(output + 7 * stride, \ + add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])), \ + mul(kWeight4, w35[1])))); \ + store(output + 8 * stride, sub(w14, w33)); \ + store(output + 9 * stride, \ + add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \ + store(output + 10 * stride, \ + sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24)))); \ + store(output + 11 * stride, \ + add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \ + store(output + 12 * stride, sub(kWeight0, w34)); \ + store(output + 13 * stride, \ + sub(sub(kWeight0, w18[1]), \ + sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))); \ + store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24)))); \ + store(output + 15 * stride, \ + sub(sub(kWeight0, w16[1]), \ + sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))); \ + } + +#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC kWeight3 = constant(0.92388f); \ + const T_VEC kWeight4 = constant(0.382683f); \ + const T_VEC kWeight5 = constant(0.980785f); \ + const T_VEC kWeight6 = constant(0.19509f); \ + const T_VEC kWeight7 = constant(0.83147f); \ + const T_VEC kWeight8 = constant(0.55557f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC i8 = load(input + 8 * stride); \ + const T_VEC i9 = load(input + 9 * stride); \ + const T_VEC i10 = load(input + 10 * stride); \ + const T_VEC i11 = load(input + 11 * stride); \ + const T_VEC i12 = load(input + 12 * stride); \ + const T_VEC i13 = load(input + 13 * stride); \ + const T_VEC i14 = load(input + 14 * stride); \ + const T_VEC i15 = load(input + 15 * stride); \ + const T_VEC i16 = load(input + 16 * stride); \ + const T_VEC i17 = load(input + 17 * stride); \ + const T_VEC i18 = load(input + 18 * stride); \ + const T_VEC i19 = load(input + 19 * stride); \ + const T_VEC i20 = load(input + 20 * stride); \ + const T_VEC i21 = load(input + 21 * stride); \ + const T_VEC i22 = load(input + 22 * stride); \ + const T_VEC i23 = load(input + 23 * stride); \ + const T_VEC i24 = load(input + 24 * stride); \ + const T_VEC i25 = load(input + 25 * stride); \ + const T_VEC i26 = load(input + 26 * stride); \ + const T_VEC i27 = load(input + 27 * stride); \ + const T_VEC i28 = load(input + 28 * stride); \ + const T_VEC i29 = load(input + 29 * stride); \ + const T_VEC i30 = load(input + 30 * stride); \ + const T_VEC i31 = load(input + 31 * stride); \ + const T_VEC w0 = add(i0, i16); \ + const T_VEC w1 = sub(i0, i16); \ + const T_VEC w2 = add(i8, i24); \ + const T_VEC w3 = sub(i8, i24); \ + const T_VEC w4 = add(w0, w2); \ + const T_VEC w5 = sub(w0, w2); \ + const T_VEC w7 = add(i4, i20); \ + const T_VEC w8 = sub(i4, i20); \ + const T_VEC w9 = add(i12, i28); \ + const T_VEC w10 = sub(i12, i28); \ + const T_VEC w11 = add(w7, w9); \ + const T_VEC w12 = sub(w7, 
w9); \ + const T_VEC w14 = add(w4, w11); \ + const T_VEC w15 = sub(w4, w11); \ + const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \ + sub(sub(kWeight0, w3), \ + mul(kWeight2, add(w10, w8))) }; \ + const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \ + sub(w3, mul(kWeight2, add(w10, w8))) }; \ + const T_VEC w19 = add(i2, i18); \ + const T_VEC w20 = sub(i2, i18); \ + const T_VEC w21 = add(i10, i26); \ + const T_VEC w22 = sub(i10, i26); \ + const T_VEC w23 = add(w19, w21); \ + const T_VEC w24 = sub(w19, w21); \ + const T_VEC w26 = add(i6, i22); \ + const T_VEC w27 = sub(i6, i22); \ + const T_VEC w28 = add(i14, i30); \ + const T_VEC w29 = sub(i14, i30); \ + const T_VEC w30 = add(w26, w28); \ + const T_VEC w31 = sub(w26, w28); \ + const T_VEC w33 = add(w23, w30); \ + const T_VEC w34 = sub(w23, w30); \ + const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \ + sub(sub(kWeight0, w22), \ + mul(kWeight2, add(w29, w27))) }; \ + const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \ + sub(w22, mul(kWeight2, add(w29, w27))) }; \ + const T_VEC w38 = add(w14, w33); \ + const T_VEC w39 = sub(w14, w33); \ + const T_VEC w40[2] = { \ + add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))), \ + add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0]))) \ + }; \ + const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))), \ + sub(sub(kWeight0, w12), \ + mul(kWeight2, add(w31, w24))) }; \ + const T_VEC w42[2] = { \ + add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))), \ + add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0]))) \ + }; \ + const T_VEC w44[2] = { \ + add(w18[0], \ + sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \ + sub(sub(kWeight0, w18[1]), \ + sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))) \ + }; \ + const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))), \ + sub(w12, mul(kWeight2, add(w31, w24))) }; \ + const T_VEC w46[2] = { \ + add(w16[0], \ + sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \ + sub(sub(kWeight0, w16[1]), \ + sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))) \ + }; \ + const T_VEC w47 = add(i1, i17); \ + const T_VEC w48 = sub(i1, i17); \ + const T_VEC w49 = add(i9, i25); \ + const T_VEC w50 = sub(i9, i25); \ + const T_VEC w51 = add(w47, w49); \ + const T_VEC w52 = sub(w47, w49); \ + const T_VEC w54 = add(i5, i21); \ + const T_VEC w55 = sub(i5, i21); \ + const T_VEC w56 = add(i13, i29); \ + const T_VEC w57 = sub(i13, i29); \ + const T_VEC w58 = add(w54, w56); \ + const T_VEC w59 = sub(w54, w56); \ + const T_VEC w61 = add(w51, w58); \ + const T_VEC w62 = sub(w51, w58); \ + const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))), \ + sub(sub(kWeight0, w50), \ + mul(kWeight2, add(w57, w55))) }; \ + const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))), \ + sub(w50, mul(kWeight2, add(w57, w55))) }; \ + const T_VEC w66 = add(i3, i19); \ + const T_VEC w67 = sub(i3, i19); \ + const T_VEC w68 = add(i11, i27); \ + const T_VEC w69 = sub(i11, i27); \ + const T_VEC w70 = add(w66, w68); \ + const T_VEC w71 = sub(w66, w68); \ + const T_VEC w73 = add(i7, i23); \ + const T_VEC w74 = sub(i7, i23); \ + const T_VEC w75 = add(i15, i31); \ + const T_VEC w76 = sub(i15, i31); \ + const T_VEC w77 = add(w73, w75); \ + const T_VEC w78 = sub(w73, w75); \ + const T_VEC w80 = add(w70, w77); \ + const T_VEC w81 = sub(w70, w77); \ + const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))), \ + sub(sub(kWeight0, w69), \ + mul(kWeight2, add(w76, w74))) }; \ + const 
T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))), \ + sub(w69, mul(kWeight2, add(w76, w74))) }; \ + const T_VEC w85 = add(w61, w80); \ + const T_VEC w86 = sub(w61, w80); \ + const T_VEC w87[2] = { \ + add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))), \ + add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0]))) \ + }; \ + const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))), \ + sub(sub(kWeight0, w59), \ + mul(kWeight2, add(w78, w71))) }; \ + const T_VEC w89[2] = { \ + add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))), \ + add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0]))) \ + }; \ + const T_VEC w91[2] = { \ + add(w65[0], \ + sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \ + sub(sub(kWeight0, w65[1]), \ + sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1]))) \ + }; \ + const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))), \ + sub(w59, mul(kWeight2, add(w78, w71))) }; \ + const T_VEC w93[2] = { \ + add(w63[0], \ + sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \ + sub(sub(kWeight0, w63[1]), \ + sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1]))) \ + }; \ + store(output + 0 * stride, add(w38, w85)); \ + store(output + 1 * stride, \ + add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1])))); \ + store(output + 2 * stride, \ + add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1])))); \ + store(output + 3 * stride, \ + add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1])))); \ + store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81)))); \ + store(output + 5 * stride, \ + add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1])))); \ + store(output + 6 * stride, \ + add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1])))); \ + store(output + 7 * stride, \ + add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1])))); \ + store(output + 8 * stride, w39); \ + store(output + 9 * stride, \ + add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])), \ + mul(kWeight5, w93[1])))); \ + store(output + 10 * stride, \ + add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])), \ + mul(kWeight3, w92[1])))); \ + store(output + 11 * stride, \ + add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])), \ + mul(kWeight7, w91[1])))); \ + store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81)))); \ + store(output + 13 * stride, \ + add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])), \ + mul(kWeight8, w89[1])))); \ + store(output + 14 * stride, \ + add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])), \ + mul(kWeight4, w88[1])))); \ + store(output + 15 * stride, \ + add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])), \ + mul(kWeight6, w87[1])))); \ + store(output + 16 * stride, sub(w38, w85)); \ + store(output + 17 * stride, \ + add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0])))); \ + store(output + 18 * stride, \ + add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0])))); \ + store(output + 19 * stride, \ + add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0])))); \ + store(output + 20 * stride, \ + sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62)))); \ + store(output + 21 * stride, \ + add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0])))); \ + store(output + 22 * stride, \ + add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0])))); \ + store(output + 23 * stride, \ + add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0])))); \ + store(output + 24 * stride, sub(kWeight0, w86)); \ + store(output + 25 * stride, \ 
+ sub(sub(kWeight0, w46[1]), \ + sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1])))); \ + store(output + 26 * stride, \ + sub(sub(kWeight0, w45[1]), \ + sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1])))); \ + store(output + 27 * stride, \ + sub(sub(kWeight0, w44[1]), \ + sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1])))); \ + store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62)))); \ + store(output + 29 * stride, \ + sub(sub(kWeight0, w42[1]), \ + sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1])))); \ + store(output + 30 * stride, \ + sub(sub(kWeight0, w41[1]), \ + sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1])))); \ + store(output + 31 * stride, \ + sub(sub(kWeight0, w40[1]), \ + sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1])))); \ + } + +#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store) \ + ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + store(output + 0 * stride, i0 + i1); \ + store(output + 1 * stride, i0 - i1); \ + } + +#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \ + ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC w2 = add(i0, i2); \ + const T_VEC w3 = sub(i0, i2); \ + const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) }; \ + const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) }; \ + store(output + 0 * stride, add(w2, w4[0])); \ + store(output + 1 * stride, add(w3, w5[1])); \ + store(output + 2 * stride, sub(w2, w4[0])); \ + store(output + 3 * stride, sub(w3, w5[1])); \ + } + +#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC w6 = add(i0, i4); \ + const T_VEC w7 = sub(i0, i4); \ + const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) }; \ + const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) }; \ + const T_VEC w10[2] = { add(w6, w8[0]), w8[1] }; \ + const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) }; \ + const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) }; \ + const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] }; \ + const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) }; \ + const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) }; \ + const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) }; \ + const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) }; \ + const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) }; \ + const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) }; \ + const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) }; \ + const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) }; \ + store(output + 0 * stride, add(w10[0], w18[0])); \ + store(output + 1 * stride, \ + add(w12[0], mul(kWeight2, 
add(w20[0], w20[1])))); \ + store(output + 2 * stride, add(w11[0], w19[1])); \ + store(output + 3 * stride, \ + sub(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \ + store(output + 4 * stride, sub(w10[0], w18[0])); \ + store(output + 5 * stride, \ + add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])), \ + mul(kWeight2, w20[1])))); \ + store(output + 6 * stride, sub(w11[0], w19[1])); \ + store(output + 7 * stride, \ + add(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \ + } + +#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC kWeight3 = constant(0.92388f); \ + const T_VEC kWeight4 = constant(0.382683f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC i8 = load(input + 8 * stride); \ + const T_VEC i9 = load(input + 9 * stride); \ + const T_VEC i10 = load(input + 10 * stride); \ + const T_VEC i11 = load(input + 11 * stride); \ + const T_VEC i12 = load(input + 12 * stride); \ + const T_VEC i13 = load(input + 13 * stride); \ + const T_VEC i14 = load(input + 14 * stride); \ + const T_VEC i15 = load(input + 15 * stride); \ + const T_VEC w14 = add(i0, i8); \ + const T_VEC w15 = sub(i0, i8); \ + const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) }; \ + const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) }; \ + const T_VEC w18[2] = { add(w14, w16[0]), w16[1] }; \ + const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) }; \ + const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) }; \ + const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] }; \ + const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) }; \ + const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) }; \ + const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) }; \ + const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) }; \ + const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) }; \ + const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) }; \ + const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) }; \ + const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) }; \ + const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) }; \ + const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) }; \ + const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))), \ + add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \ + const T_VEC w33[2] = { add(w20[0], \ + sub(sub(kWeight0, mul(kWeight2, w28[0])), \ + mul(kWeight2, w28[1]))), \ + add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \ + const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) }; \ + const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) }; \ + const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \ + sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \ + const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \ + add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \ + const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) }; \ + const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) }; \ 
+ const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) }; \ + const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) }; \ + const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \ + const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \ + const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \ + const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \ + const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) }; \ + const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) }; \ + const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) }; \ + const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) }; \ + const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) }; \ + const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) }; \ + const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) }; \ + const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) }; \ + const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) }; \ + const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) }; \ + const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))), \ + add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \ + const T_VEC w57[2] = { add(w44[0], \ + sub(sub(kWeight0, mul(kWeight2, w52[0])), \ + mul(kWeight2, w52[1]))), \ + add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \ + const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) }; \ + const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) }; \ + const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \ + sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \ + const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \ + add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \ + store(output + 0 * stride, add(w30[0], w54[0])); \ + store(output + 1 * stride, \ + add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1])))); \ + store(output + 2 * stride, \ + add(w34[0], mul(kWeight2, add(w58[0], w58[1])))); \ + store(output + 3 * stride, \ + add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1])))); \ + store(output + 4 * stride, add(w31[0], w55[1])); \ + store(output + 5 * stride, \ + sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \ + store(output + 6 * stride, \ + sub(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \ + store(output + 7 * stride, \ + sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \ + store(output + 8 * stride, sub(w30[0], w54[0])); \ + store(output + 9 * stride, \ + add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])), \ + mul(kWeight4, w56[1])))); \ + store(output + 10 * stride, \ + add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])), \ + mul(kWeight2, w58[1])))); \ + store(output + 11 * stride, \ + add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])), \ + mul(kWeight3, w60[1])))); \ + store(output + 12 * stride, sub(w31[0], w55[1])); \ + store(output + 13 * stride, \ + add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \ + store(output + 14 * stride, \ + add(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \ + store(output + 15 * stride, \ + add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \ + } +#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC kWeight3 = constant(0.92388f); \ + const T_VEC kWeight4 
= constant(0.382683f); \ + const T_VEC kWeight5 = constant(0.980785f); \ + const T_VEC kWeight6 = constant(0.19509f); \ + const T_VEC kWeight7 = constant(0.83147f); \ + const T_VEC kWeight8 = constant(0.55557f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC i8 = load(input + 8 * stride); \ + const T_VEC i9 = load(input + 9 * stride); \ + const T_VEC i10 = load(input + 10 * stride); \ + const T_VEC i11 = load(input + 11 * stride); \ + const T_VEC i12 = load(input + 12 * stride); \ + const T_VEC i13 = load(input + 13 * stride); \ + const T_VEC i14 = load(input + 14 * stride); \ + const T_VEC i15 = load(input + 15 * stride); \ + const T_VEC i16 = load(input + 16 * stride); \ + const T_VEC i17 = load(input + 17 * stride); \ + const T_VEC i18 = load(input + 18 * stride); \ + const T_VEC i19 = load(input + 19 * stride); \ + const T_VEC i20 = load(input + 20 * stride); \ + const T_VEC i21 = load(input + 21 * stride); \ + const T_VEC i22 = load(input + 22 * stride); \ + const T_VEC i23 = load(input + 23 * stride); \ + const T_VEC i24 = load(input + 24 * stride); \ + const T_VEC i25 = load(input + 25 * stride); \ + const T_VEC i26 = load(input + 26 * stride); \ + const T_VEC i27 = load(input + 27 * stride); \ + const T_VEC i28 = load(input + 28 * stride); \ + const T_VEC i29 = load(input + 29 * stride); \ + const T_VEC i30 = load(input + 30 * stride); \ + const T_VEC i31 = load(input + 31 * stride); \ + const T_VEC w30 = add(i0, i16); \ + const T_VEC w31 = sub(i0, i16); \ + const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) }; \ + const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) }; \ + const T_VEC w34[2] = { add(w30, w32[0]), w32[1] }; \ + const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) }; \ + const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) }; \ + const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] }; \ + const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) }; \ + const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) }; \ + const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) }; \ + const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) }; \ + const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \ + const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \ + const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \ + const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \ + const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) }; \ + const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) }; \ + const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))), \ + add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) }; \ + const T_VEC w49[2] = { add(w36[0], \ + sub(sub(kWeight0, mul(kWeight2, w44[0])), \ + mul(kWeight2, w44[1]))), \ + add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) }; \ + const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) }; \ + const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) }; \ + const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \ + sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \ + const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \ + 
add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \ + const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) }; \ + const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) }; \ + const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) }; \ + const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) }; \ + const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) }; \ + const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) }; \ + const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) }; \ + const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) }; \ + const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) }; \ + const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) }; \ + const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) }; \ + const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) }; \ + const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) }; \ + const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) }; \ + const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) }; \ + const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) }; \ + const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) }; \ + const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) }; \ + const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))), \ + add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) }; \ + const T_VEC w73[2] = { add(w60[0], \ + sub(sub(kWeight0, mul(kWeight2, w68[0])), \ + mul(kWeight2, w68[1]))), \ + add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) }; \ + const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) }; \ + const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) }; \ + const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \ + sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \ + const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \ + add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \ + const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) }; \ + const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) }; \ + const T_VEC w80[2] = { \ + add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))), \ + add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0]))) \ + }; \ + const T_VEC w81[2] = { \ + add(w48[0], \ + sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))), \ + add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1]))) \ + }; \ + const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))), \ + add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) }; \ + const T_VEC w83[2] = { add(w50[0], \ + sub(sub(kWeight0, mul(kWeight2, w74[0])), \ + mul(kWeight2, w74[1]))), \ + add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) }; \ + const T_VEC w84[2] = { \ + add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))), \ + add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0]))) \ + }; \ + const T_VEC w85[2] = { \ + add(w52[0], \ + sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))), \ + add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1]))) \ + }; \ + const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) }; \ + const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) }; \ + const T_VEC w88[2] = { \ + sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \ + add(w49[1], \ + sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0]))) \ + }; \ + const T_VEC w89[2] = { \ + add(w49[0], sub(mul(kWeight4, w73[0]), 
mul(kWeight3, w73[1]))), \ + add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0]))) \ + }; \ + const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \ + sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \ + const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \ + add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \ + const T_VEC w92[2] = { \ + sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \ + add(w53[1], \ + sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0]))) \ + }; \ + const T_VEC w93[2] = { \ + add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \ + add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0]))) \ + }; \ + const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) }; \ + const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) }; \ + const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) }; \ + const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) }; \ + const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) }; \ + const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) }; \ + const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) }; \ + const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) }; \ + const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) }; \ + const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) }; \ + const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) }; \ + const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) }; \ + const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) }; \ + const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) }; \ + const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) }; \ + const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) }; \ + const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) }; \ + const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) }; \ + const T_VEC w112[2] = { \ + add(w100[0], mul(kWeight2, add(w108[0], w108[1]))), \ + add(w100[1], mul(kWeight2, sub(w108[1], w108[0]))) \ + }; \ + const T_VEC w113[2] = { \ + add(w100[0], \ + sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \ + add(w100[1], mul(kWeight2, sub(w108[0], w108[1]))) \ + }; \ + const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) }; \ + const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) }; \ + const T_VEC w116[2] = { \ + sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \ + sub(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \ + }; \ + const T_VEC w117[2] = { \ + add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \ + add(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \ + }; \ + const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) }; \ + const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) }; \ + const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) }; \ + const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) }; \ + const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) }; \ + const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) }; \ + const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) }; \ + const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) }; \ + const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) }; \ + const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) }; \ + const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) }; \ + const T_VEC w129[2] = { sub(i15, i1), 
sub(sub(kWeight0, i31), i17) }; \ + const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) }; \ + const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) }; \ + const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) }; \ + const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) }; \ + const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) }; \ + const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) }; \ + const T_VEC w136[2] = { \ + add(w124[0], mul(kWeight2, add(w132[0], w132[1]))), \ + add(w124[1], mul(kWeight2, sub(w132[1], w132[0]))) \ + }; \ + const T_VEC w137[2] = { \ + add(w124[0], \ + sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \ + add(w124[1], mul(kWeight2, sub(w132[0], w132[1]))) \ + }; \ + const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) }; \ + const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) }; \ + const T_VEC w140[2] = { \ + sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \ + sub(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \ + }; \ + const T_VEC w141[2] = { \ + add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \ + add(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \ + }; \ + const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) }; \ + const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) }; \ + const T_VEC w144[2] = { \ + add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))), \ + add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0]))) \ + }; \ + const T_VEC w145[2] = { \ + add(w112[0], \ + sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \ + add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1]))) \ + }; \ + const T_VEC w146[2] = { \ + add(w114[0], mul(kWeight2, add(w138[0], w138[1]))), \ + add(w114[1], mul(kWeight2, sub(w138[1], w138[0]))) \ + }; \ + const T_VEC w147[2] = { \ + add(w114[0], \ + sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \ + add(w114[1], mul(kWeight2, sub(w138[0], w138[1]))) \ + }; \ + const T_VEC w148[2] = { \ + add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))), \ + add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0]))) \ + }; \ + const T_VEC w149[2] = { \ + add(w116[0], \ + sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \ + add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1]))) \ + }; \ + const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) }; \ + const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) }; \ + const T_VEC w152[2] = { \ + sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \ + add(w113[1], \ + sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0]))) \ + }; \ + const T_VEC w153[2] = { \ + add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \ + add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0]))) \ + }; \ + const T_VEC w154[2] = { \ + sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \ + sub(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \ + }; \ + const T_VEC w155[2] = { \ + add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \ + add(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \ + }; \ + const T_VEC w156[2] = { \ + sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \ + add(w117[1], \ + sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0]))) \ + }; \ + const T_VEC w157[2] = { \ + add(w117[0], sub(mul(kWeight3, 
w141[0]), mul(kWeight4, w141[1]))), \ + add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0]))) \ + }; \ + store(output + 0 * stride, add(w78[0], w142[0])); \ + store(output + 1 * stride, \ + add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1])))); \ + store(output + 2 * stride, \ + add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1])))); \ + store(output + 3 * stride, \ + add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1])))); \ + store(output + 4 * stride, \ + add(w86[0], mul(kWeight2, add(w150[0], w150[1])))); \ + store(output + 5 * stride, \ + add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1])))); \ + store(output + 6 * stride, \ + add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1])))); \ + store(output + 7 * stride, \ + add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1])))); \ + store(output + 8 * stride, add(w79[0], w143[1])); \ + store(output + 9 * stride, \ + sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \ + store(output + 10 * stride, \ + sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \ + store(output + 11 * stride, \ + sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \ + store(output + 12 * stride, \ + sub(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \ + store(output + 13 * stride, \ + sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \ + store(output + 14 * stride, \ + sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \ + store(output + 15 * stride, \ + sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \ + store(output + 16 * stride, sub(w78[0], w142[0])); \ + store(output + 17 * stride, \ + add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])), \ + mul(kWeight6, w144[1])))); \ + store(output + 18 * stride, \ + add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])), \ + mul(kWeight4, w146[1])))); \ + store(output + 19 * stride, \ + add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])), \ + mul(kWeight8, w148[1])))); \ + store(output + 20 * stride, \ + add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])), \ + mul(kWeight2, w150[1])))); \ + store(output + 21 * stride, \ + add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])), \ + mul(kWeight7, w152[1])))); \ + store(output + 22 * stride, \ + add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])), \ + mul(kWeight3, w154[1])))); \ + store(output + 23 * stride, \ + add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])), \ + mul(kWeight5, w156[1])))); \ + store(output + 24 * stride, sub(w79[0], w143[1])); \ + store(output + 25 * stride, \ + add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \ + store(output + 26 * stride, \ + add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \ + store(output + 27 * stride, \ + add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \ + store(output + 28 * stride, \ + add(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \ + store(output + 29 * stride, \ + add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \ + store(output + 30 * stride, \ + add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \ + store(output + 31 * stride, \ + add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \ + } + +#endif // AOM_AOM_DSP_FFT_COMMON_H_ diff --git a/libs/libaom/src/aom_dsp/fwd_txfm.c b/libs/libaom/src/aom_dsp/fwd_txfm.c new file mode 100644 index 000000000..3d3044415 --- /dev/null +++ b/libs/libaom/src/aom_dsp/fwd_txfm.c @@ -0,0 +1,229 @@ +/* + * 
Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include "aom_dsp/txfm_common.h"
+#include "config/aom_dsp_rtcd.h"
+
+void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transpose the columns (that
+  // is, the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  // We need an intermediate buffer between passes.
+  tran_low_t intermediate[4 * 4];
+  const tran_low_t *in_low = NULL;
+  tran_low_t *out = intermediate;
+  // Do the two transform/transpose passes
+  for (int pass = 0; pass < 2; ++pass) {
+    tran_high_t in_high[4];    // canbe16
+    tran_high_t step[4];       // canbe16
+    tran_high_t temp1, temp2;  // needs32
+    for (int i = 0; i < 4; ++i) {
+      // Load inputs.
+      if (pass == 0) {
+        in_high[0] = input[0 * stride] * 16;
+        in_high[1] = input[1 * stride] * 16;
+        in_high[2] = input[2 * stride] * 16;
+        in_high[3] = input[3 * stride] * 16;
+        if (i == 0 && in_high[0]) {
+          ++in_high[0];
+        }
+      } else {
+        assert(in_low != NULL);
+        in_high[0] = in_low[0 * 4];
+        in_high[1] = in_low[1 * 4];
+        in_high[2] = in_low[2 * 4];
+        in_high[3] = in_low[3 * 4];
+        ++in_low;
+      }
+      // Transform.
+      step[0] = in_high[0] + in_high[3];
+      step[1] = in_high[1] + in_high[2];
+      step[2] = in_high[1] - in_high[2];
+      step[3] = in_high[0] - in_high[3];
+      temp1 = (step[0] + step[1]) * cospi_16_64;
+      temp2 = (step[0] - step[1]) * cospi_16_64;
+      out[0] = (tran_low_t)fdct_round_shift(temp1);
+      out[2] = (tran_low_t)fdct_round_shift(temp2);
+      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+      out[1] = (tran_low_t)fdct_round_shift(temp1);
+      out[3] = (tran_low_t)fdct_round_shift(temp2);
+      // Do next column (which is a transposed row in second/horizontal pass)
+      ++input;
+      out += 4;
+    }
+    // Setup in/out for next pass.
+    in_low = intermediate;
+    out = output;
+  }
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j)
+      output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+  }
+}
+
+void aom_fdct4x4_lp_c(const int16_t *input, int16_t *output, int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transpose the columns (that
+  // is, the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  // We need an intermediate buffer between passes.
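+  // This variant mirrors aom_fdct4x4_c above: the first pass pre-scales the
+  // inputs by 16 (bumping the very first sample by one when it is nonzero)
+  // and the final results are rounded with (x + 1) >> 2. The difference is
+  // that plain int16_t/int32_t storage is used instead of
+  // tran_low_t/tran_high_t.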
+ int16_t intermediate[4 * 4]; + const int16_t *in_low = NULL; + int16_t *out = intermediate; + // Do the two transform/transpose passes + for (int pass = 0; pass < 2; ++pass) { + int32_t in_high[4]; // canbe16 + int32_t step[4]; // canbe16 + int32_t temp1, temp2; // needs32 + for (int i = 0; i < 4; ++i) { + // Load inputs. + if (pass == 0) { + in_high[0] = input[0 * stride] * 16; + in_high[1] = input[1 * stride] * 16; + in_high[2] = input[2 * stride] * 16; + in_high[3] = input[3 * stride] * 16; + if (i == 0 && in_high[0]) { + ++in_high[0]; + } + } else { + assert(in_low != NULL); + in_high[0] = in_low[0 * 4]; + in_high[1] = in_low[1 * 4]; + in_high[2] = in_low[2 * 4]; + in_high[3] = in_low[3 * 4]; + ++in_low; + } + // Transform. + step[0] = in_high[0] + in_high[3]; + step[1] = in_high[1] + in_high[2]; + step[2] = in_high[1] - in_high[2]; + step[3] = in_high[0] - in_high[3]; + temp1 = (step[0] + step[1]) * (int32_t)cospi_16_64; + temp2 = (step[0] - step[1]) * (int32_t)cospi_16_64; + out[0] = (int16_t)fdct_round_shift(temp1); + out[2] = (int16_t)fdct_round_shift(temp2); + temp1 = step[2] * (int32_t)cospi_24_64 + step[3] * (int32_t)cospi_8_64; + temp2 = -step[2] * (int32_t)cospi_8_64 + step[3] * (int32_t)cospi_24_64; + out[1] = (int16_t)fdct_round_shift(temp1); + out[3] = (int16_t)fdct_round_shift(temp2); + // Do next column (which is a transposed row in second/horizontal pass) + ++input; + out += 4; + } + // Setup in/out for next pass. + in_low = intermediate; + out = output; + } + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) + output[j + i * 4] = (output[j + i * 4] + 1) >> 2; + } +} + +void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { + int i, j; + tran_low_t intermediate[64]; + int pass; + tran_low_t *output = intermediate; + const tran_low_t *in = NULL; + + // Transform columns + for (pass = 0; pass < 2; ++pass) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 + tran_high_t t0, t1, t2, t3; // needs32 + tran_high_t x0, x1, x2, x3; // canbe16 + + for (i = 0; i < 8; i++) { + // stage 1 + if (pass == 0) { + s0 = (input[0 * stride] + input[7 * stride]) * 4; + s1 = (input[1 * stride] + input[6 * stride]) * 4; + s2 = (input[2 * stride] + input[5 * stride]) * 4; + s3 = (input[3 * stride] + input[4 * stride]) * 4; + s4 = (input[3 * stride] - input[4 * stride]) * 4; + s5 = (input[2 * stride] - input[5 * stride]) * 4; + s6 = (input[1 * stride] - input[6 * stride]) * 4; + s7 = (input[0 * stride] - input[7 * stride]) * 4; + ++input; + } else { + s0 = in[0 * 8] + in[7 * 8]; + s1 = in[1 * 8] + in[6 * 8]; + s2 = in[2 * 8] + in[5 * 8]; + s3 = in[3 * 8] + in[4 * 8]; + s4 = in[3 * 8] - in[4 * 8]; + s5 = in[2 * 8] - in[5 * 8]; + s6 = in[1 * 8] - in[6 * 8]; + s7 = in[0 * 8] - in[7 * 8]; + ++in; + } + + // fdct4(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + output[0] = (tran_low_t)fdct_round_shift(t0); + output[2] = (tran_low_t)fdct_round_shift(t2); + output[4] = (tran_low_t)fdct_round_shift(t1); + output[6] = (tran_low_t)fdct_round_shift(t3); + + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = fdct_round_shift(t0); + t3 = fdct_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * 
cospi_12_64 + x1 * -cospi_20_64;
+      t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+      output[1] = (tran_low_t)fdct_round_shift(t0);
+      output[3] = (tran_low_t)fdct_round_shift(t2);
+      output[5] = (tran_low_t)fdct_round_shift(t1);
+      output[7] = (tran_low_t)fdct_round_shift(t3);
+      output += 8;
+    }
+    in = intermediate;
+    output = final_output;
+  }
+
+  // Rows
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+                          int stride) {
+  aom_fdct8x8_c(input, final_output, stride);
+}
+#endif
diff --git a/libs/libaom/src/aom_dsp/grain_synthesis.c b/libs/libaom/src/aom_dsp/grain_synthesis.c
new file mode 100644
index 000000000..626eb76af
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/grain_synthesis.c
@@ -0,0 +1,1408 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes film grain parameters and film grain synthesis
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "aom_dsp/grain_synthesis.h"
+#include "aom_mem/aom_mem.h"
+
+// Samples with Gaussian distribution in the range of [-2048, 2047] (12 bits)
+// with zero mean and standard deviation of about 512. The values should be
+// divided by 4 for the 10-bit range and by 16 for the 8-bit range.
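+// Concretely, generate_luma_grain_block() below shifts each selected sample
+// right (with rounding) by gauss_sec_shift = 12 - bit_depth +
+// grain_scale_shift, so with grain_scale_shift == 0 this is >> 4 (divide by
+// 16) for 8-bit video and >> 2 (divide by 4) for 10-bit video.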
+static const int gaussian_sequence[2048] = { + 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820, + 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800, + 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588, + -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368, + 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4, + 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396, + 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740, + 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292, + 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532, + 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704, + 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96, + -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244, + 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136, + 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676, + -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400, + -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844, + -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96, + -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356, + 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280, + 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808, + 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228, + -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136, + -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264, + -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388, + 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500, + 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384, + 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220, + -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148, + 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572, + -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516, + 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916, + -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492, + 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560, + -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108, + -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516, + -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88, + -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196, + -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864, + 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920, + 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564, + -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876, + -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244, + 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184, + 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364, + -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72, + 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24, + 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4, + -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120, + 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108, + -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296, + 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336, + -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164, + -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264, + 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536, + -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296, + -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696, + 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204, + 264, 880, 
528, -24, -184, 116, 448, -144, 828, 524, 212, + -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40, + 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384, + 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8, + 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704, + -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348, + -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592, + -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420, + 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220, + -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208, + -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544, + -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288, + -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240, + -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132, + 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16, + -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044, + -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732, + 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460, + -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52, + -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104, + -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460, + 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716, + -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960, + 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476, + 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692, + 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352, + -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144, + -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44, + 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356, + 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452, + -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552, + -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264, + -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448, + -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588, + 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464, + 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216, + 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132, + 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412, + 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48, + 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196, + 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48, + -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292, + 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32, + -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012, + -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120, + -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56, + 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416, + -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404, + -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92, + 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904, + 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728, + 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584, + 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48, + 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180, + 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528, + 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364, + -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260, + -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324, + -1268, 416, -324, -80, 920, 
160, 228, 724, 32, -516, 64, + 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120, + -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168, + -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888, + 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588, + -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484, + 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580, + 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392, + 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80, + -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688, + 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4, + -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300, + 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444, + 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192, + 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160, + 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188, + -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404, + -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400, + 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92, + -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824, + 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620, + 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720, + 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620, + -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508, + -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736, + 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836, + 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180, + 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140, + -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32, + -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916, + 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368, + -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380, + -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572, + -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864, + 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908, + -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84, + 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396, + -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360, + 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928, + -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288, + 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196, + 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504, + 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272, + 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344, + -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208, + -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156, + -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240, + -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432, + 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244, + 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584, + 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24, + 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300, + -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416, + 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380, + -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384, + 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88, + 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876, + -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320, + 
-672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88, + -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196, + -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120, + 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664, + -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0, + -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264, + -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288, + -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56, + 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148, + 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156, + -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144, + -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148, + 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944, + 428, -484 +}; + +static const int gauss_bits = 11; + +static int luma_subblock_size_y = 32; +static int luma_subblock_size_x = 32; + +static int chroma_subblock_size_y = 16; +static int chroma_subblock_size_x = 16; + +static const int min_luma_legal_range = 16; +static const int max_luma_legal_range = 235; + +static const int min_chroma_legal_range = 16; +static const int max_chroma_legal_range = 240; + +static int scaling_lut_y[256]; +static int scaling_lut_cb[256]; +static int scaling_lut_cr[256]; + +static int grain_min; +static int grain_max; + +static uint16_t random_register = 0; // random number generator register + +static void init_arrays(const aom_film_grain_t *params, int luma_stride, + int chroma_stride, int ***pred_pos_luma_p, + int ***pred_pos_chroma_p, int **luma_grain_block, + int **cb_grain_block, int **cr_grain_block, + int **y_line_buf, int **cb_line_buf, int **cr_line_buf, + int **y_col_buf, int **cb_col_buf, int **cr_col_buf, + int luma_grain_samples, int chroma_grain_samples, + int chroma_subsamp_y, int chroma_subsamp_x) { + memset(scaling_lut_y, 0, sizeof(*scaling_lut_y) * 256); + memset(scaling_lut_cb, 0, sizeof(*scaling_lut_cb) * 256); + memset(scaling_lut_cr, 0, sizeof(*scaling_lut_cr) * 256); + + int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (params->num_y_points > 0) ++num_pos_chroma; + + int **pred_pos_luma; + int **pred_pos_chroma; + + pred_pos_luma = (int **)aom_malloc(sizeof(*pred_pos_luma) * num_pos_luma); + + for (int row = 0; row < num_pos_luma; row++) { + pred_pos_luma[row] = (int *)aom_malloc(sizeof(**pred_pos_luma) * 3); + } + + pred_pos_chroma = + (int **)aom_malloc(sizeof(*pred_pos_chroma) * num_pos_chroma); + + for (int row = 0; row < num_pos_chroma; row++) { + pred_pos_chroma[row] = (int *)aom_malloc(sizeof(**pred_pos_chroma) * 3); + } + + int pos_ar_index = 0; + + for (int row = -params->ar_coeff_lag; row < 0; row++) { + for (int col = -params->ar_coeff_lag; col < params->ar_coeff_lag + 1; + col++) { + pred_pos_luma[pos_ar_index][0] = row; + pred_pos_luma[pos_ar_index][1] = col; + pred_pos_luma[pos_ar_index][2] = 0; + + pred_pos_chroma[pos_ar_index][0] = row; + pred_pos_chroma[pos_ar_index][1] = col; + pred_pos_chroma[pos_ar_index][2] = 0; + ++pos_ar_index; + } + } + + for (int col = -params->ar_coeff_lag; col < 0; col++) { + pred_pos_luma[pos_ar_index][0] = 0; + pred_pos_luma[pos_ar_index][1] = col; + pred_pos_luma[pos_ar_index][2] = 0; + + pred_pos_chroma[pos_ar_index][0] = 0; + pred_pos_chroma[pos_ar_index][1] = col; + pred_pos_chroma[pos_ar_index][2] = 0; + + ++pos_ar_index; + } + + if (params->num_y_points > 0) { + pred_pos_chroma[pos_ar_index][0] = 0; + pred_pos_chroma[pos_ar_index][1] = 0; + 
pred_pos_chroma[pos_ar_index][2] = 1; + } + + *pred_pos_luma_p = pred_pos_luma; + *pred_pos_chroma_p = pred_pos_chroma; + + *y_line_buf = (int *)aom_malloc(sizeof(**y_line_buf) * luma_stride * 2); + *cb_line_buf = (int *)aom_malloc(sizeof(**cb_line_buf) * chroma_stride * + (2 >> chroma_subsamp_y)); + *cr_line_buf = (int *)aom_malloc(sizeof(**cr_line_buf) * chroma_stride * + (2 >> chroma_subsamp_y)); + + *y_col_buf = + (int *)aom_malloc(sizeof(**y_col_buf) * (luma_subblock_size_y + 2) * 2); + *cb_col_buf = + (int *)aom_malloc(sizeof(**cb_col_buf) * + (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) * + (2 >> chroma_subsamp_x)); + *cr_col_buf = + (int *)aom_malloc(sizeof(**cr_col_buf) * + (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) * + (2 >> chroma_subsamp_x)); + + *luma_grain_block = + (int *)aom_malloc(sizeof(**luma_grain_block) * luma_grain_samples); + *cb_grain_block = + (int *)aom_malloc(sizeof(**cb_grain_block) * chroma_grain_samples); + *cr_grain_block = + (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples); +} + +static void dealloc_arrays(const aom_film_grain_t *params, int ***pred_pos_luma, + int ***pred_pos_chroma, int **luma_grain_block, + int **cb_grain_block, int **cr_grain_block, + int **y_line_buf, int **cb_line_buf, + int **cr_line_buf, int **y_col_buf, int **cb_col_buf, + int **cr_col_buf) { + int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (params->num_y_points > 0) ++num_pos_chroma; + + for (int row = 0; row < num_pos_luma; row++) { + aom_free((*pred_pos_luma)[row]); + } + aom_free(*pred_pos_luma); + + for (int row = 0; row < num_pos_chroma; row++) { + aom_free((*pred_pos_chroma)[row]); + } + aom_free((*pred_pos_chroma)); + + aom_free(*y_line_buf); + + aom_free(*cb_line_buf); + + aom_free(*cr_line_buf); + + aom_free(*y_col_buf); + + aom_free(*cb_col_buf); + + aom_free(*cr_col_buf); + + aom_free(*luma_grain_block); + + aom_free(*cb_grain_block); + + aom_free(*cr_grain_block); +} + +// get a number between 0 and 2^bits - 1 +static INLINE int get_random_number(int bits) { + uint16_t bit; + bit = ((random_register >> 0) ^ (random_register >> 1) ^ + (random_register >> 3) ^ (random_register >> 12)) & + 1; + random_register = (random_register >> 1) | (bit << 15); + return (random_register >> (16 - bits)) & ((1 << bits) - 1); +} + +static void init_random_generator(int luma_line, uint16_t seed) { + // same for the picture + + uint16_t msb = (seed >> 8) & 255; + uint16_t lsb = seed & 255; + + random_register = (msb << 8) + lsb; + + // changes for each row + int luma_num = luma_line >> 5; + + random_register ^= ((luma_num * 37 + 178) & 255) << 8; + random_register ^= ((luma_num * 173 + 105) & 255); +} + +// Return 0 for success, -1 for failure +static int generate_luma_grain_block( + const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block, + int luma_block_size_y, int luma_block_size_x, int luma_grain_stride, + int left_pad, int top_pad, int right_pad, int bottom_pad) { + if (params->num_y_points == 0) { + memset(luma_grain_block, 0, + sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride); + return 0; + } + + int bit_depth = params->bit_depth; + int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift; + + int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); + int rounding_offset = (1 << (params->ar_coeff_shift - 1)); + + for (int i = 0; i < luma_block_size_y; i++) + for (int j = 0; j < luma_block_size_x; j++) + luma_grain_block[i 
* luma_grain_stride + j] = + (gaussian_sequence[get_random_number(gauss_bits)] + + ((1 << gauss_sec_shift) >> 1)) >> + gauss_sec_shift; + + for (int i = top_pad; i < luma_block_size_y - bottom_pad; i++) + for (int j = left_pad; j < luma_block_size_x - right_pad; j++) { + int wsum = 0; + for (int pos = 0; pos < num_pos_luma; pos++) { + wsum = wsum + params->ar_coeffs_y[pos] * + luma_grain_block[(i + pred_pos_luma[pos][0]) * + luma_grain_stride + + j + pred_pos_luma[pos][1]]; + } + luma_grain_block[i * luma_grain_stride + j] = + clamp(luma_grain_block[i * luma_grain_stride + j] + + ((wsum + rounding_offset) >> params->ar_coeff_shift), + grain_min, grain_max); + } + return 0; +} + +// Return 0 for success, -1 for failure +static int generate_chroma_grain_blocks( + const aom_film_grain_t *params, + // int** pred_pos_luma, + int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block, + int *cr_grain_block, int luma_grain_stride, int chroma_block_size_y, + int chroma_block_size_x, int chroma_grain_stride, int left_pad, int top_pad, + int right_pad, int bottom_pad, int chroma_subsamp_y, int chroma_subsamp_x) { + int bit_depth = params->bit_depth; + int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift; + + int num_pos_chroma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); + if (params->num_y_points > 0) ++num_pos_chroma; + int rounding_offset = (1 << (params->ar_coeff_shift - 1)); + int chroma_grain_block_size = chroma_block_size_y * chroma_grain_stride; + + if (params->num_cb_points || params->chroma_scaling_from_luma) { + init_random_generator(7 << 5, params->random_seed); + + for (int i = 0; i < chroma_block_size_y; i++) + for (int j = 0; j < chroma_block_size_x; j++) + cb_grain_block[i * chroma_grain_stride + j] = + (gaussian_sequence[get_random_number(gauss_bits)] + + ((1 << gauss_sec_shift) >> 1)) >> + gauss_sec_shift; + } else { + memset(cb_grain_block, 0, + sizeof(*cb_grain_block) * chroma_grain_block_size); + } + + if (params->num_cr_points || params->chroma_scaling_from_luma) { + init_random_generator(11 << 5, params->random_seed); + + for (int i = 0; i < chroma_block_size_y; i++) + for (int j = 0; j < chroma_block_size_x; j++) + cr_grain_block[i * chroma_grain_stride + j] = + (gaussian_sequence[get_random_number(gauss_bits)] + + ((1 << gauss_sec_shift) >> 1)) >> + gauss_sec_shift; + } else { + memset(cr_grain_block, 0, + sizeof(*cr_grain_block) * chroma_grain_block_size); + } + + for (int i = top_pad; i < chroma_block_size_y - bottom_pad; i++) + for (int j = left_pad; j < chroma_block_size_x - right_pad; j++) { + int wsum_cb = 0; + int wsum_cr = 0; + for (int pos = 0; pos < num_pos_chroma; pos++) { + if (pred_pos_chroma[pos][2] == 0) { + wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * + cb_grain_block[(i + pred_pos_chroma[pos][0]) * + chroma_grain_stride + + j + pred_pos_chroma[pos][1]]; + wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * + cr_grain_block[(i + pred_pos_chroma[pos][0]) * + chroma_grain_stride + + j + pred_pos_chroma[pos][1]]; + } else if (pred_pos_chroma[pos][2] == 1) { + int av_luma = 0; + int luma_coord_y = ((i - top_pad) << chroma_subsamp_y) + top_pad; + int luma_coord_x = ((j - left_pad) << chroma_subsamp_x) + left_pad; + + for (int k = luma_coord_y; k < luma_coord_y + chroma_subsamp_y + 1; + k++) + for (int l = luma_coord_x; l < luma_coord_x + chroma_subsamp_x + 1; + l++) + av_luma += luma_grain_block[k * luma_grain_stride + l]; + + av_luma = + (av_luma + ((1 << (chroma_subsamp_y + chroma_subsamp_x)) >> 1)) >> + (chroma_subsamp_y + 
chroma_subsamp_x);
+
+          wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * av_luma;
+          wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * av_luma;
+        } else {
+          fprintf(
+              stderr,
+              "Grain synthesis: prediction between two chroma components is "
+              "not supported!");
+          return -1;
+        }
+      }
+      if (params->num_cb_points || params->chroma_scaling_from_luma)
+        cb_grain_block[i * chroma_grain_stride + j] =
+            clamp(cb_grain_block[i * chroma_grain_stride + j] +
+                      ((wsum_cb + rounding_offset) >> params->ar_coeff_shift),
+                  grain_min, grain_max);
+      if (params->num_cr_points || params->chroma_scaling_from_luma)
+        cr_grain_block[i * chroma_grain_stride + j] =
+            clamp(cr_grain_block[i * chroma_grain_stride + j] +
+                      ((wsum_cr + rounding_offset) >> params->ar_coeff_shift),
+                  grain_min, grain_max);
+    }
+  return 0;
+}
+
+static void init_scaling_function(const int scaling_points[][2], int num_points,
+                                  int scaling_lut[]) {
+  if (num_points == 0) return;
+
+  for (int i = 0; i < scaling_points[0][0]; i++)
+    scaling_lut[i] = scaling_points[0][1];
+
+  for (int point = 0; point < num_points - 1; point++) {
+    int delta_y = scaling_points[point + 1][1] - scaling_points[point][1];
+    int delta_x = scaling_points[point + 1][0] - scaling_points[point][0];
+
+    int64_t delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
+
+    for (int x = 0; x < delta_x; x++) {
+      scaling_lut[scaling_points[point][0] + x] =
+          scaling_points[point][1] + (int)((x * delta + 32768) >> 16);
+    }
+  }
+
+  for (int i = scaling_points[num_points - 1][0]; i < 256; i++)
+    scaling_lut[i] = scaling_points[num_points - 1][1];
+}
+
+// Function that extracts samples from a LUT (and interpolates intermediate
+// values for 10- and 12-bit video)
+static int scale_LUT(int *scaling_lut, int index, int bit_depth) {
+  int x = index >> (bit_depth - 8);
+
+  if (!(bit_depth - 8) || x == 255)
+    return scaling_lut[x];
+  else
+    return scaling_lut[x] + (((scaling_lut[x + 1] - scaling_lut[x]) *
+                                  (index & ((1 << (bit_depth - 8)) - 1)) +
+                              (1 << (bit_depth - 9))) >>
+                             (bit_depth - 8));
+}
+
+static void add_noise_to_block(const aom_film_grain_t *params, uint8_t *luma,
+                               uint8_t *cb, uint8_t *cr, int luma_stride,
+                               int chroma_stride, int *luma_grain,
+                               int *cb_grain, int *cr_grain,
+                               int luma_grain_stride, int chroma_grain_stride,
+                               int half_luma_height, int half_luma_width,
+                               int bit_depth, int chroma_subsamp_y,
+                               int chroma_subsamp_x, int mc_identity) {
+  int cb_mult = params->cb_mult - 128;            // fixed scale
+  int cb_luma_mult = params->cb_luma_mult - 128;  // fixed scale
+  int cb_offset = params->cb_offset - 256;
+
+  int cr_mult = params->cr_mult - 128;            // fixed scale
+  int cr_luma_mult = params->cr_luma_mult - 128;  // fixed scale
+  int cr_offset = params->cr_offset - 256;
+
+  int rounding_offset = (1 << (params->scaling_shift - 1));
+
+  int apply_y = params->num_y_points > 0 ? 1 : 0;
+  int apply_cb =
+      (params->num_cb_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0;
+  int apply_cr =
+      (params->num_cr_points > 0 || params->chroma_scaling_from_luma) ?
1 : 0; + + if (params->chroma_scaling_from_luma) { + cb_mult = 0; // fixed scale + cb_luma_mult = 64; // fixed scale + cb_offset = 0; + + cr_mult = 0; // fixed scale + cr_luma_mult = 64; // fixed scale + cr_offset = 0; + } + + int min_luma, max_luma, min_chroma, max_chroma; + + if (params->clip_to_restricted_range) { + min_luma = min_luma_legal_range; + max_luma = max_luma_legal_range; + + if (mc_identity) { + min_chroma = min_luma_legal_range; + max_chroma = max_luma_legal_range; + } else { + min_chroma = min_chroma_legal_range; + max_chroma = max_chroma_legal_range; + } + } else { + min_luma = min_chroma = 0; + max_luma = max_chroma = 255; + } + + for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) { + for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) { + int average_luma = 0; + if (chroma_subsamp_x) { + average_luma = (luma[(i << chroma_subsamp_y) * luma_stride + + (j << chroma_subsamp_x)] + + luma[(i << chroma_subsamp_y) * luma_stride + + (j << chroma_subsamp_x) + 1] + + 1) >> + 1; + } else { + average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j]; + } + + if (apply_cb) { + cb[i * chroma_stride + j] = clamp( + cb[i * chroma_stride + j] + + ((scale_LUT(scaling_lut_cb, + clamp(((average_luma * cb_luma_mult + + cb_mult * cb[i * chroma_stride + j]) >> + 6) + + cb_offset, + 0, (256 << (bit_depth - 8)) - 1), + 8) * + cb_grain[i * chroma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_chroma, max_chroma); + } + + if (apply_cr) { + cr[i * chroma_stride + j] = clamp( + cr[i * chroma_stride + j] + + ((scale_LUT(scaling_lut_cr, + clamp(((average_luma * cr_luma_mult + + cr_mult * cr[i * chroma_stride + j]) >> + 6) + + cr_offset, + 0, (256 << (bit_depth - 8)) - 1), + 8) * + cr_grain[i * chroma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_chroma, max_chroma); + } + } + } + + if (apply_y) { + for (int i = 0; i < (half_luma_height << 1); i++) { + for (int j = 0; j < (half_luma_width << 1); j++) { + luma[i * luma_stride + j] = + clamp(luma[i * luma_stride + j] + + ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], 8) * + luma_grain[i * luma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_luma, max_luma); + } + } + } +} + +static void add_noise_to_block_hbd( + const aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr, + int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain, + int *cr_grain, int luma_grain_stride, int chroma_grain_stride, + int half_luma_height, int half_luma_width, int bit_depth, + int chroma_subsamp_y, int chroma_subsamp_x, int mc_identity) { + int cb_mult = params->cb_mult - 128; // fixed scale + int cb_luma_mult = params->cb_luma_mult - 128; // fixed scale + // offset value depends on the bit depth + int cb_offset = (params->cb_offset << (bit_depth - 8)) - (1 << bit_depth); + + int cr_mult = params->cr_mult - 128; // fixed scale + int cr_luma_mult = params->cr_luma_mult - 128; // fixed scale + // offset value depends on the bit depth + int cr_offset = (params->cr_offset << (bit_depth - 8)) - (1 << bit_depth); + + int rounding_offset = (1 << (params->scaling_shift - 1)); + + int apply_y = params->num_y_points > 0 ? 1 : 0; + int apply_cb = + (params->num_cb_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1 + : 0; + int apply_cr = + (params->num_cr_points > 0 || params->chroma_scaling_from_luma) > 0 ? 
1 + : 0; + + if (params->chroma_scaling_from_luma) { + cb_mult = 0; // fixed scale + cb_luma_mult = 64; // fixed scale + cb_offset = 0; + + cr_mult = 0; // fixed scale + cr_luma_mult = 64; // fixed scale + cr_offset = 0; + } + + int min_luma, max_luma, min_chroma, max_chroma; + + if (params->clip_to_restricted_range) { + min_luma = min_luma_legal_range << (bit_depth - 8); + max_luma = max_luma_legal_range << (bit_depth - 8); + + if (mc_identity) { + min_chroma = min_luma_legal_range << (bit_depth - 8); + max_chroma = max_luma_legal_range << (bit_depth - 8); + } else { + min_chroma = min_chroma_legal_range << (bit_depth - 8); + max_chroma = max_chroma_legal_range << (bit_depth - 8); + } + } else { + min_luma = min_chroma = 0; + max_luma = max_chroma = (256 << (bit_depth - 8)) - 1; + } + + for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) { + for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) { + int average_luma = 0; + if (chroma_subsamp_x) { + average_luma = (luma[(i << chroma_subsamp_y) * luma_stride + + (j << chroma_subsamp_x)] + + luma[(i << chroma_subsamp_y) * luma_stride + + (j << chroma_subsamp_x) + 1] + + 1) >> + 1; + } else { + average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j]; + } + + if (apply_cb) { + cb[i * chroma_stride + j] = clamp( + cb[i * chroma_stride + j] + + ((scale_LUT(scaling_lut_cb, + clamp(((average_luma * cb_luma_mult + + cb_mult * cb[i * chroma_stride + j]) >> + 6) + + cb_offset, + 0, (256 << (bit_depth - 8)) - 1), + bit_depth) * + cb_grain[i * chroma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_chroma, max_chroma); + } + if (apply_cr) { + cr[i * chroma_stride + j] = clamp( + cr[i * chroma_stride + j] + + ((scale_LUT(scaling_lut_cr, + clamp(((average_luma * cr_luma_mult + + cr_mult * cr[i * chroma_stride + j]) >> + 6) + + cr_offset, + 0, (256 << (bit_depth - 8)) - 1), + bit_depth) * + cr_grain[i * chroma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_chroma, max_chroma); + } + } + } + + if (apply_y) { + for (int i = 0; i < (half_luma_height << 1); i++) { + for (int j = 0; j < (half_luma_width << 1); j++) { + luma[i * luma_stride + j] = + clamp(luma[i * luma_stride + j] + + ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], + bit_depth) * + luma_grain[i * luma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_luma, max_luma); + } + } + } +} + +static void copy_rect(uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int width, int height, + int use_high_bit_depth) { + int hbd_coeff = use_high_bit_depth ? 
2 : 1; + while (height) { + memcpy(dst, src, width * sizeof(uint8_t) * hbd_coeff); + src += src_stride; + dst += dst_stride; + --height; + } + return; +} + +static void copy_area(int *src, int src_stride, int *dst, int dst_stride, + int width, int height) { + while (height) { + memcpy(dst, src, width * sizeof(*src)); + src += src_stride; + dst += dst_stride; + --height; + } + return; +} + +static void extend_even(uint8_t *dst, int dst_stride, int width, int height, + int use_high_bit_depth) { + if ((width & 1) == 0 && (height & 1) == 0) return; + if (use_high_bit_depth) { + uint16_t *dst16 = (uint16_t *)dst; + int dst16_stride = dst_stride / 2; + if (width & 1) { + for (int i = 0; i < height; ++i) + dst16[i * dst16_stride + width] = dst16[i * dst16_stride + width - 1]; + } + width = (width + 1) & (~1); + if (height & 1) { + memcpy(&dst16[height * dst16_stride], &dst16[(height - 1) * dst16_stride], + sizeof(*dst16) * width); + } + } else { + if (width & 1) { + for (int i = 0; i < height; ++i) + dst[i * dst_stride + width] = dst[i * dst_stride + width - 1]; + } + width = (width + 1) & (~1); + if (height & 1) { + memcpy(&dst[height * dst_stride], &dst[(height - 1) * dst_stride], + sizeof(*dst) * width); + } + } +} + +static void ver_boundary_overlap(int *left_block, int left_stride, + int *right_block, int right_stride, + int *dst_block, int dst_stride, int width, + int height) { + if (width == 1) { + while (height) { + *dst_block = clamp((*left_block * 23 + *right_block * 22 + 16) >> 5, + grain_min, grain_max); + left_block += left_stride; + right_block += right_stride; + dst_block += dst_stride; + --height; + } + return; + } else if (width == 2) { + while (height) { + dst_block[0] = clamp((27 * left_block[0] + 17 * right_block[0] + 16) >> 5, + grain_min, grain_max); + dst_block[1] = clamp((17 * left_block[1] + 27 * right_block[1] + 16) >> 5, + grain_min, grain_max); + left_block += left_stride; + right_block += right_stride; + dst_block += dst_stride; + --height; + } + return; + } +} + +static void hor_boundary_overlap(int *top_block, int top_stride, + int *bottom_block, int bottom_stride, + int *dst_block, int dst_stride, int width, + int height) { + if (height == 1) { + while (width) { + *dst_block = clamp((*top_block * 23 + *bottom_block * 22 + 16) >> 5, + grain_min, grain_max); + ++top_block; + ++bottom_block; + ++dst_block; + --width; + } + return; + } else if (height == 2) { + while (width) { + dst_block[0] = clamp((27 * top_block[0] + 17 * bottom_block[0] + 16) >> 5, + grain_min, grain_max); + dst_block[dst_stride] = clamp((17 * top_block[top_stride] + + 27 * bottom_block[bottom_stride] + 16) >> + 5, + grain_min, grain_max); + ++top_block; + ++bottom_block; + ++dst_block; + --width; + } + return; + } +} + +int av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src, + aom_image_t *dst) { + uint8_t *luma, *cb, *cr; + int height, width, luma_stride, chroma_stride; + int use_high_bit_depth = 0; + int chroma_subsamp_x = 0; + int chroma_subsamp_y = 0; + int mc_identity = src->mc == AOM_CICP_MC_IDENTITY ? 
1 : 0; + + switch (src->fmt) { + case AOM_IMG_FMT_AOMI420: + case AOM_IMG_FMT_I420: + use_high_bit_depth = 0; + chroma_subsamp_x = 1; + chroma_subsamp_y = 1; + break; + case AOM_IMG_FMT_I42016: + use_high_bit_depth = 1; + chroma_subsamp_x = 1; + chroma_subsamp_y = 1; + break; + // case AOM_IMG_FMT_444A: + case AOM_IMG_FMT_I444: + use_high_bit_depth = 0; + chroma_subsamp_x = 0; + chroma_subsamp_y = 0; + break; + case AOM_IMG_FMT_I44416: + use_high_bit_depth = 1; + chroma_subsamp_x = 0; + chroma_subsamp_y = 0; + break; + case AOM_IMG_FMT_I422: + use_high_bit_depth = 0; + chroma_subsamp_x = 1; + chroma_subsamp_y = 0; + break; + case AOM_IMG_FMT_I42216: + use_high_bit_depth = 1; + chroma_subsamp_x = 1; + chroma_subsamp_y = 0; + break; + default: // unknown input format + fprintf(stderr, "Film grain error: input format is not supported!"); + return -1; + } + + assert(params->bit_depth == src->bit_depth); + + dst->fmt = src->fmt; + dst->bit_depth = src->bit_depth; + + dst->r_w = src->r_w; + dst->r_h = src->r_h; + dst->d_w = src->d_w; + dst->d_h = src->d_h; + + dst->cp = src->cp; + dst->tc = src->tc; + dst->mc = src->mc; + + dst->monochrome = src->monochrome; + dst->csp = src->csp; + dst->range = src->range; + + dst->x_chroma_shift = src->x_chroma_shift; + dst->y_chroma_shift = src->y_chroma_shift; + + dst->temporal_id = src->temporal_id; + dst->spatial_id = src->spatial_id; + + width = src->d_w % 2 ? src->d_w + 1 : src->d_w; + height = src->d_h % 2 ? src->d_h + 1 : src->d_h; + + copy_rect(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y], + dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w, + src->d_h, use_high_bit_depth); + // Note that dst is already assumed to be aligned to even. + extend_even(dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w, + src->d_h, use_high_bit_depth); + + if (!src->monochrome) { + copy_rect(src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U], + dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U], + width >> chroma_subsamp_x, height >> chroma_subsamp_y, + use_high_bit_depth); + + copy_rect(src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V], + dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V], + width >> chroma_subsamp_x, height >> chroma_subsamp_y, + use_high_bit_depth); + } + + luma = dst->planes[AOM_PLANE_Y]; + cb = dst->planes[AOM_PLANE_U]; + cr = dst->planes[AOM_PLANE_V]; + + // luma and chroma strides in samples + luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth; + chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth; + + return av1_add_film_grain_run( + params, luma, cb, cr, height, width, luma_stride, chroma_stride, + use_high_bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); +} + +int av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma, + uint8_t *cb, uint8_t *cr, int height, int width, + int luma_stride, int chroma_stride, + int use_high_bit_depth, int chroma_subsamp_y, + int chroma_subsamp_x, int mc_identity) { + int **pred_pos_luma; + int **pred_pos_chroma; + int *luma_grain_block; + int *cb_grain_block; + int *cr_grain_block; + + int *y_line_buf; + int *cb_line_buf; + int *cr_line_buf; + + int *y_col_buf; + int *cb_col_buf; + int *cr_col_buf; + + random_register = params->random_seed; + + int left_pad = 3; + int right_pad = 3; // padding to offset for AR coefficients + int top_pad = 3; + int bottom_pad = 0; + + int ar_padding = 3; // maximum lag used for stabilization of AR coefficients + + luma_subblock_size_y = 32; + luma_subblock_size_x = 32; + + chroma_subblock_size_y = 
luma_subblock_size_y >> chroma_subsamp_y; + chroma_subblock_size_x = luma_subblock_size_x >> chroma_subsamp_x; + + // Initial padding is only needed for generation of + // film grain templates (to stabilize the AR process) + // Only a 64x64 luma and 32x32 chroma part of a template + // is used later for adding grain, padding can be discarded + + int luma_block_size_y = + top_pad + 2 * ar_padding + luma_subblock_size_y * 2 + bottom_pad; + int luma_block_size_x = left_pad + 2 * ar_padding + luma_subblock_size_x * 2 + + 2 * ar_padding + right_pad; + + int chroma_block_size_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding + + chroma_subblock_size_y * 2 + bottom_pad; + int chroma_block_size_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding + + chroma_subblock_size_x * 2 + + (2 >> chroma_subsamp_x) * ar_padding + right_pad; + + int luma_grain_stride = luma_block_size_x; + int chroma_grain_stride = chroma_block_size_x; + + int overlap = params->overlap_flag; + int bit_depth = params->bit_depth; + + const int grain_center = 128 << (bit_depth - 8); + grain_min = 0 - grain_center; + grain_max = grain_center - 1; + + init_arrays(params, luma_stride, chroma_stride, &pred_pos_luma, + &pred_pos_chroma, &luma_grain_block, &cb_grain_block, + &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf, + &y_col_buf, &cb_col_buf, &cr_col_buf, + luma_block_size_y * luma_block_size_x, + chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y, + chroma_subsamp_x); + + if (generate_luma_grain_block(params, pred_pos_luma, luma_grain_block, + luma_block_size_y, luma_block_size_x, + luma_grain_stride, left_pad, top_pad, right_pad, + bottom_pad)) + return -1; + + if (generate_chroma_grain_blocks( + params, + // pred_pos_luma, + pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block, + luma_grain_stride, chroma_block_size_y, chroma_block_size_x, + chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad, + chroma_subsamp_y, chroma_subsamp_x)) + return -1; + + init_scaling_function(params->scaling_points_y, params->num_y_points, + scaling_lut_y); + + if (params->chroma_scaling_from_luma) { + memcpy(scaling_lut_cb, scaling_lut_y, sizeof(*scaling_lut_y) * 256); + memcpy(scaling_lut_cr, scaling_lut_y, sizeof(*scaling_lut_y) * 256); + } else { + init_scaling_function(params->scaling_points_cb, params->num_cb_points, + scaling_lut_cb); + init_scaling_function(params->scaling_points_cr, params->num_cr_points, + scaling_lut_cr); + } + for (int y = 0; y < height / 2; y += (luma_subblock_size_y >> 1)) { + init_random_generator(y * 2, params->random_seed); + + for (int x = 0; x < width / 2; x += (luma_subblock_size_x >> 1)) { + int offset_y = get_random_number(8); + int offset_x = (offset_y >> 4) & 15; + offset_y &= 15; + + int luma_offset_y = left_pad + 2 * ar_padding + (offset_y << 1); + int luma_offset_x = top_pad + 2 * ar_padding + (offset_x << 1); + + int chroma_offset_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding + + offset_y * (2 >> chroma_subsamp_y); + int chroma_offset_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding + + offset_x * (2 >> chroma_subsamp_x); + + if (overlap && x) { + ver_boundary_overlap( + y_col_buf, 2, + luma_grain_block + luma_offset_y * luma_grain_stride + + luma_offset_x, + luma_grain_stride, y_col_buf, 2, 2, + AOMMIN(luma_subblock_size_y + 2, height - (y << 1))); + + ver_boundary_overlap( + cb_col_buf, 2 >> chroma_subsamp_x, + cb_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x, + chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x, + 2 >> 
chroma_subsamp_x, + AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), + (height - (y << 1)) >> chroma_subsamp_y)); + + ver_boundary_overlap( + cr_col_buf, 2 >> chroma_subsamp_x, + cr_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x, + chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_x, + AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), + (height - (y << 1)) >> chroma_subsamp_y)); + + int i = y ? 1 : 0; + + if (use_high_bit_depth) { + add_noise_to_block_hbd( + params, + (uint16_t *)luma + ((y + i) << 1) * luma_stride + (x << 1), + (uint16_t *)cb + + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + (x << (1 - chroma_subsamp_x)), + (uint16_t *)cr + + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + (x << (1 - chroma_subsamp_x)), + luma_stride, chroma_stride, y_col_buf + i * 4, + cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), + cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), + 2, (2 - chroma_subsamp_x), + AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1, + bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); + } else { + add_noise_to_block( + params, luma + ((y + i) << 1) * luma_stride + (x << 1), + cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + (x << (1 - chroma_subsamp_x)), + cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + (x << (1 - chroma_subsamp_x)), + luma_stride, chroma_stride, y_col_buf + i * 4, + cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), + cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), + 2, (2 - chroma_subsamp_x), + AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1, + bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); + } + } + + if (overlap && y) { + if (x) { + hor_boundary_overlap(y_line_buf + (x << 1), luma_stride, y_col_buf, 2, + y_line_buf + (x << 1), luma_stride, 2, 2); + + hor_boundary_overlap(cb_line_buf + x * (2 >> chroma_subsamp_x), + chroma_stride, cb_col_buf, 2 >> chroma_subsamp_x, + cb_line_buf + x * (2 >> chroma_subsamp_x), + chroma_stride, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_y); + + hor_boundary_overlap(cr_line_buf + x * (2 >> chroma_subsamp_x), + chroma_stride, cr_col_buf, 2 >> chroma_subsamp_x, + cr_line_buf + x * (2 >> chroma_subsamp_x), + chroma_stride, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_y); + } + + hor_boundary_overlap( + y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, + luma_grain_block + luma_offset_y * luma_grain_stride + + luma_offset_x + (x ? 2 : 0), + luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, + AOMMIN(luma_subblock_size_x - ((x ? 1 : 0) << 1), + width - ((x ? x + 1 : 0) << 1)), + 2); + + hor_boundary_overlap( + cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + cb_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), + chroma_grain_stride, + cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + AOMMIN(chroma_subblock_size_x - + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), + (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x), + 2 >> chroma_subsamp_y); + + hor_boundary_overlap( + cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + cr_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), + chroma_grain_stride, + cr_line_buf + ((x ? 
x + 1 : 0) << (1 - chroma_subsamp_x)),
+            chroma_stride,
+            AOMMIN(chroma_subblock_size_x -
+                       ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+                   (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x),
+            2 >> chroma_subsamp_y);
+
+        if (use_high_bit_depth) {
+          add_noise_to_block_hbd(
+              params, (uint16_t *)luma + (y << 1) * luma_stride + (x << 1),
+              (uint16_t *)cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << ((1 - chroma_subsamp_x))),
+              (uint16_t *)cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << ((1 - chroma_subsamp_x))),
+              luma_stride, chroma_stride, y_line_buf + (x << 1),
+              cb_line_buf + (x << (1 - chroma_subsamp_x)),
+              cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
+              chroma_stride, 1,
+              AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
+              chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+        } else {
+          add_noise_to_block(
+              params, luma + (y << 1) * luma_stride + (x << 1),
+              cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << ((1 - chroma_subsamp_x))),
+              cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << ((1 - chroma_subsamp_x))),
+              luma_stride, chroma_stride, y_line_buf + (x << 1),
+              cb_line_buf + (x << (1 - chroma_subsamp_x)),
+              cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
+              chroma_stride, 1,
+              AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
+              chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+        }
+      }
+
+      int i = overlap && y ? 1 : 0;
+      int j = overlap && x ? 1 : 0;
+
+      if (use_high_bit_depth) {
+        add_noise_to_block_hbd(
+            params,
+            (uint16_t *)luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
+            (uint16_t *)cb +
+                ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                ((x + j) << (1 - chroma_subsamp_x)),
+            (uint16_t *)cr +
+                ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                ((x + j) << (1 - chroma_subsamp_x)),
+            luma_stride, chroma_stride,
+            luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
+                luma_offset_x + (j << 1),
+            cb_grain_block +
+                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+                    chroma_grain_stride +
+                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+            cr_grain_block +
+                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+                    chroma_grain_stride +
+                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+            luma_grain_stride, chroma_grain_stride,
+            AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
+            AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
+            chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+      } else {
+        add_noise_to_block(
+            params, luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
+            cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                ((x + j) << (1 - chroma_subsamp_x)),
+            cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                ((x + j) << (1 - chroma_subsamp_x)),
+            luma_stride, chroma_stride,
+            luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
+                luma_offset_x + (j << 1),
+            cb_grain_block +
+                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+                    chroma_grain_stride +
+                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+            cr_grain_block +
+                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+                    chroma_grain_stride +
+                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+            luma_grain_stride, chroma_grain_stride,
+            AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
+            AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
+            chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+      }
+
+      if (overlap) {
+        if (x) {
+          // Copy overlapped column buffer to line buffer
+          copy_area(y_col_buf + (luma_subblock_size_y << 1), 2,
y_line_buf + (x << 1), luma_stride, 2, 2); + + copy_area( + cb_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)), + 2 >> chroma_subsamp_x, + cb_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride, + 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y); + + copy_area( + cr_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)), + 2 >> chroma_subsamp_x, + cr_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride, + 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y); + } + + // Copy grain to the line buffer for overlap with a bottom block + copy_area( + luma_grain_block + + (luma_offset_y + luma_subblock_size_y) * luma_grain_stride + + luma_offset_x + ((x ? 2 : 0)), + luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, + AOMMIN(luma_subblock_size_x, width - (x << 1)) - (x ? 2 : 0), 2); + + copy_area(cb_grain_block + + (chroma_offset_y + chroma_subblock_size_y) * + chroma_grain_stride + + chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0), + chroma_grain_stride, + cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + AOMMIN(chroma_subblock_size_x, + ((width - (x << 1)) >> chroma_subsamp_x)) - + (x ? 2 >> chroma_subsamp_x : 0), + 2 >> chroma_subsamp_y); + + copy_area(cr_grain_block + + (chroma_offset_y + chroma_subblock_size_y) * + chroma_grain_stride + + chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0), + chroma_grain_stride, + cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + AOMMIN(chroma_subblock_size_x, + ((width - (x << 1)) >> chroma_subsamp_x)) - + (x ? 2 >> chroma_subsamp_x : 0), + 2 >> chroma_subsamp_y); + + // Copy grain to the column buffer for overlap with the next block to + // the right + + copy_area(luma_grain_block + luma_offset_y * luma_grain_stride + + luma_offset_x + luma_subblock_size_x, + luma_grain_stride, y_col_buf, 2, 2, + AOMMIN(luma_subblock_size_y + 2, height - (y << 1))); + + copy_area(cb_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x + chroma_subblock_size_x, + chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_x, + AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), + (height - (y << 1)) >> chroma_subsamp_y)); + + copy_area(cr_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x + chroma_subblock_size_x, + chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_x, + AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), + (height - (y << 1)) >> chroma_subsamp_y)); + } + } + } + + dealloc_arrays(params, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block, + &cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf, + &cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf); + return 0; +} diff --git a/libs/libaom/src/aom_dsp/grain_synthesis.h b/libs/libaom/src/aom_dsp/grain_synthesis.h new file mode 100644 index 000000000..9155b3903 --- /dev/null +++ b/libs/libaom/src/aom_dsp/grain_synthesis.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+/*!\file
+ * \brief Describes film grain parameters and film grain synthesis
+ *
+ */
+#ifndef AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
+#define AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom/aom_image.h"
+
+/*!\brief Structure containing film grain synthesis parameters for a frame
+ *
+ * This structure contains input parameters for film grain synthesis.
+ */
+typedef struct {
+  // This structure is compared element-by-element in the function
+  // av1_check_grain_params_equiv: this function must be updated if any changes
+  // are made to this structure.
+  int apply_grain;
+
+  int update_parameters;
+
+  // 8 bit values
+  int scaling_points_y[14][2];
+  int num_y_points;  // value: 0..14
+
+  // 8 bit values
+  int scaling_points_cb[10][2];
+  int num_cb_points;  // value: 0..10
+
+  // 8 bit values
+  int scaling_points_cr[10][2];
+  int num_cr_points;  // value: 0..10
+
+  int scaling_shift;  // values: 8..11
+
+  int ar_coeff_lag;  // values: 0..3
+
+  // 8 bit values
+  int ar_coeffs_y[24];
+  int ar_coeffs_cb[25];
+  int ar_coeffs_cr[25];
+
+  // Shift value: AR coeffs range
+  // 6: [-2, 2)
+  // 7: [-1, 1)
+  // 8: [-0.5, 0.5)
+  // 9: [-0.25, 0.25)
+  int ar_coeff_shift;  // values: 6..9
+
+  int cb_mult;       // 8 bits
+  int cb_luma_mult;  // 8 bits
+  int cb_offset;     // 9 bits
+
+  int cr_mult;       // 8 bits
+  int cr_luma_mult;  // 8 bits
+  int cr_offset;     // 9 bits
+
+  int overlap_flag;
+
+  int clip_to_restricted_range;
+
+  unsigned int bit_depth;  // video bit depth
+
+  int chroma_scaling_from_luma;
+
+  int grain_scale_shift;
+
+  uint16_t random_seed;
+  // This structure is compared element-by-element in the function
+  // av1_check_grain_params_equiv: this function must be updated if any changes
+  // are made to this structure.
+} aom_film_grain_t;
+
+/*!\brief Check if two film grain parameters structs are equivalent
+ *
+ * Check if two film grain parameters are equal, except for the
+ * update_parameters and random_seed elements, which are ignored.
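+ * Scaling points are compared only up to num_y_points/num_cb_points/
+ * num_cr_points entries, and the AR coefficient arrays only up to the
+ * first 2 * ar_coeff_lag * (ar_coeff_lag + 1) entries.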
+ *
+ * \param[in]    pa        The first set of parameters to compare
+ * \param[in]    pb        The second set of parameters to compare
+ * \return       Returns 1 if the params are equivalent, 0 otherwise
+ */
+static INLINE int av1_check_grain_params_equiv(
+    const aom_film_grain_t *const pa, const aom_film_grain_t *const pb) {
+  if (pa->apply_grain != pb->apply_grain) return 0;
+  // Don't compare update_parameters
+
+  if (pa->num_y_points != pb->num_y_points) return 0;
+  if (memcmp(pa->scaling_points_y, pb->scaling_points_y,
+             pa->num_y_points * 2 * sizeof(*pa->scaling_points_y)) != 0)
+    return 0;
+
+  if (pa->num_cb_points != pb->num_cb_points) return 0;
+  if (memcmp(pa->scaling_points_cb, pb->scaling_points_cb,
+             pa->num_cb_points * 2 * sizeof(*pa->scaling_points_cb)) != 0)
+    return 0;
+
+  if (pa->num_cr_points != pb->num_cr_points) return 0;
+  if (memcmp(pa->scaling_points_cr, pb->scaling_points_cr,
+             pa->num_cr_points * 2 * sizeof(*pa->scaling_points_cr)) != 0)
+    return 0;
+
+  if (pa->scaling_shift != pb->scaling_shift) return 0;
+  if (pa->ar_coeff_lag != pb->ar_coeff_lag) return 0;
+
+  const int num_pos = 2 * pa->ar_coeff_lag * (pa->ar_coeff_lag + 1);
+  if (memcmp(pa->ar_coeffs_y, pb->ar_coeffs_y,
+             num_pos * sizeof(*pa->ar_coeffs_y)) != 0)
+    return 0;
+  if (memcmp(pa->ar_coeffs_cb, pb->ar_coeffs_cb,
+             num_pos * sizeof(*pa->ar_coeffs_cb)) != 0)
+    return 0;
+  if (memcmp(pa->ar_coeffs_cr, pb->ar_coeffs_cr,
+             num_pos * sizeof(*pa->ar_coeffs_cr)) != 0)
+    return 0;
+
+  if (pa->ar_coeff_shift != pb->ar_coeff_shift) return 0;
+
+  if (pa->cb_mult != pb->cb_mult) return 0;
+  if (pa->cb_luma_mult != pb->cb_luma_mult) return 0;
+  if (pa->cb_offset != pb->cb_offset) return 0;
+
+  if (pa->cr_mult != pb->cr_mult) return 0;
+  if (pa->cr_luma_mult != pb->cr_luma_mult) return 0;
+  if (pa->cr_offset != pb->cr_offset) return 0;
+
+  if (pa->overlap_flag != pb->overlap_flag) return 0;
+  if (pa->clip_to_restricted_range != pb->clip_to_restricted_range) return 0;
+  if (pa->bit_depth != pb->bit_depth) return 0;
+  if (pa->chroma_scaling_from_luma != pb->chroma_scaling_from_luma) return 0;
+  if (pa->grain_scale_shift != pb->grain_scale_shift) return 0;
+
+  return 1;
+}
+
+/*!\brief Add film grain
+ *
+ * Add film grain to an image
+ *
+ * Returns 0 for success, -1 for failure
+ *
+ * \param[in]    grain_params        Grain parameters
+ * \param[in]    luma                luma plane
+ * \param[in]    cb                  cb plane
+ * \param[in]    cr                  cr plane
+ * \param[in]    height              luma plane height
+ * \param[in]    width               luma plane width
+ * \param[in]    luma_stride         luma plane stride
+ * \param[in]    chroma_stride       chroma plane stride
+ * \param[in]    use_high_bit_depth  Nonzero if the planes hold 16-bit samples
+ * \param[in]    chroma_subsamp_y    Chroma subsampling in y direction (0 or 1)
+ * \param[in]    chroma_subsamp_x    Chroma subsampling in x direction (0 or 1)
+ * \param[in]    mc_identity         Nonzero for identity matrix coefficients
+ */
+int av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma,
+                           uint8_t *cb, uint8_t *cr, int height, int width,
+                           int luma_stride, int chroma_stride,
+                           int use_high_bit_depth, int chroma_subsamp_y,
+                           int chroma_subsamp_x, int mc_identity);
+
+/*!\brief Add film grain
+ *
+ * Add film grain to an image
+ *
+ * Returns 0 for success, -1 for failure
+ *
+ * \param[in]    grain_params     Grain parameters
+ * \param[in]    src              Source image
+ * \param[out]   dst              Resulting image with grain
+ */
+int av1_add_film_grain(const aom_film_grain_t *grain_params,
+                       const aom_image_t *src, aom_image_t *dst);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
diff --git a/libs/libaom/src/aom_dsp/grain_table.c b/libs/libaom/src/aom_dsp/grain_table.c
new file mode 100644
index 000000000..e03f04d5d
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/grain_table.c
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media.
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief This file has the implementation details of the grain table.
+ *
+ * The file format is an ASCII representation for readability and
+ * editability. Array parameters are separated from the non-array
+ * parameters and prefixed with a few characters to make for easy
+ * localization with a parameter set. Each entry is prefixed with "E"
+ * and the other parameters are only specified if "update-parms" is
+ * non-zero.
+ *
+ * filmgrn1
+ * E <start-time> <end-time> <apply-grain> <random-seed> <update-parms>
+ *  p <ar_coeff_lag> <ar_coeff_shift> <grain_scale_shift> ...
+ *  sY <num_y_points> <point_0_x> <point_0_y> ...
+ *  sCb <num_cb_points> <point_0_x> <point_0_y> ...
+ *  sCr <num_cr_points> <point_0_x> <point_0_y> ...
+ *  cY <ar_coeff_y_0> ....
+ *  cCb <ar_coeff_cb_0> ....
+ *  cCr <ar_coeff_cr_0> ....
+ * E <start-time> ...
+ */
+#include <inttypes.h>
+#include <string.h>
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/grain_table.h"
+#include "aom_mem/aom_mem.h"
+
+static const char kFileMagic[8] = "filmgrn1";
+
+static void grain_table_entry_read(FILE *file,
+                                   struct aom_internal_error_info *error_info,
+                                   aom_film_grain_table_entry_t *entry) {
+  aom_film_grain_t *pars = &entry->params;
+  int num_read =
+      fscanf(file, "E %" PRId64 " %" PRId64 " %d %hd %d\n", &entry->start_time,
+             &entry->end_time, &pars->apply_grain, &pars->random_seed,
+             &pars->update_parameters);
+  if (num_read == 0 && feof(file)) return;
+  if (num_read != 5) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to read entry header. Read %d != 5", num_read);
+    return;
+  }
+  if (pars->update_parameters) {
+    num_read = fscanf(file, "p %d %d %d %d %d %d %d %d %d %d %d %d\n",
+                      &pars->ar_coeff_lag, &pars->ar_coeff_shift,
+                      &pars->grain_scale_shift, &pars->scaling_shift,
+                      &pars->chroma_scaling_from_luma, &pars->overlap_flag,
+                      &pars->cb_mult, &pars->cb_luma_mult, &pars->cb_offset,
+                      &pars->cr_mult, &pars->cr_luma_mult, &pars->cr_offset);
+    if (num_read != 12) {
+      aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Unable to read entry params.
Read %d != 12", + num_read); + return; + } + if (!fscanf(file, "\tsY %d ", &pars->num_y_points)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read num y points"); + return; + } + for (int i = 0; i < pars->num_y_points; ++i) { + if (2 != fscanf(file, "%d %d", &pars->scaling_points_y[i][0], + &pars->scaling_points_y[i][1])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read y scaling points"); + return; + } + } + if (!fscanf(file, "\n\tsCb %d", &pars->num_cb_points)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read num cb points"); + return; + } + for (int i = 0; i < pars->num_cb_points; ++i) { + if (2 != fscanf(file, "%d %d", &pars->scaling_points_cb[i][0], + &pars->scaling_points_cb[i][1])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read cb scaling points"); + return; + } + } + if (!fscanf(file, "\n\tsCr %d", &pars->num_cr_points)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read num cr points"); + return; + } + for (int i = 0; i < pars->num_cr_points; ++i) { + if (2 != fscanf(file, "%d %d", &pars->scaling_points_cr[i][0], + &pars->scaling_points_cr[i][1])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read cr scaling points"); + return; + } + } + + fscanf(file, "\n\tcY"); + const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + for (int i = 0; i < n; ++i) { + if (1 != fscanf(file, "%d", &pars->ar_coeffs_y[i])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read Y coeffs"); + return; + } + } + fscanf(file, "\n\tcCb"); + for (int i = 0; i <= n; ++i) { + if (1 != fscanf(file, "%d", &pars->ar_coeffs_cb[i])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read Cb coeffs"); + return; + } + } + fscanf(file, "\n\tcCr"); + for (int i = 0; i <= n; ++i) { + if (1 != fscanf(file, "%d", &pars->ar_coeffs_cr[i])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read Cr coeffs"); + return; + } + } + fscanf(file, "\n"); + } +} + +static void grain_table_entry_write(FILE *file, + aom_film_grain_table_entry_t *entry) { + const aom_film_grain_t *pars = &entry->params; + fprintf(file, "E %" PRId64 " %" PRId64 " %d %d %d\n", entry->start_time, + entry->end_time, pars->apply_grain, pars->random_seed, + pars->update_parameters); + if (pars->update_parameters) { + fprintf(file, "\tp %d %d %d %d %d %d %d %d %d %d %d %d\n", + pars->ar_coeff_lag, pars->ar_coeff_shift, pars->grain_scale_shift, + pars->scaling_shift, pars->chroma_scaling_from_luma, + pars->overlap_flag, pars->cb_mult, pars->cb_luma_mult, + pars->cb_offset, pars->cr_mult, pars->cr_luma_mult, + pars->cr_offset); + fprintf(file, "\tsY %d ", pars->num_y_points); + for (int i = 0; i < pars->num_y_points; ++i) { + fprintf(file, " %d %d", pars->scaling_points_y[i][0], + pars->scaling_points_y[i][1]); + } + fprintf(file, "\n\tsCb %d", pars->num_cb_points); + for (int i = 0; i < pars->num_cb_points; ++i) { + fprintf(file, " %d %d", pars->scaling_points_cb[i][0], + pars->scaling_points_cb[i][1]); + } + fprintf(file, "\n\tsCr %d", pars->num_cr_points); + for (int i = 0; i < pars->num_cr_points; ++i) { + fprintf(file, " %d %d", pars->scaling_points_cr[i][0], + pars->scaling_points_cr[i][1]); + } + fprintf(file, "\n\tcY"); + const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + for (int i = 0; i < n; ++i) { + fprintf(file, " %d", pars->ar_coeffs_y[i]); + } + fprintf(file, "\n\tcCb"); + for (int i = 0; i <= n; ++i) { + fprintf(file, " %d", pars->ar_coeffs_cb[i]); + } + 
fprintf(file, "\n\tcCr");
+    for (int i = 0; i <= n; ++i) {
+      fprintf(file, " %d", pars->ar_coeffs_cr[i]);
+    }
+    fprintf(file, "\n");
+  }
+}
+
+void aom_film_grain_table_append(aom_film_grain_table_t *t, int64_t time_stamp,
+                                 int64_t end_time,
+                                 const aom_film_grain_t *grain) {
+  if (!t->tail || memcmp(grain, &t->tail->params, sizeof(*grain))) {
+    aom_film_grain_table_entry_t *new_tail = aom_malloc(sizeof(*new_tail));
+    memset(new_tail, 0, sizeof(*new_tail));
+    if (t->tail) t->tail->next = new_tail;
+    if (!t->head) t->head = new_tail;
+    t->tail = new_tail;
+
+    new_tail->start_time = time_stamp;
+    new_tail->end_time = end_time;
+    new_tail->params = *grain;
+  } else {
+    t->tail->end_time = AOMMAX(t->tail->end_time, end_time);
+    t->tail->start_time = AOMMIN(t->tail->start_time, time_stamp);
+  }
+}
+
+int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
+                                int64_t end_time, int erase,
+                                aom_film_grain_t *grain) {
+  aom_film_grain_table_entry_t *entry = t->head;
+  aom_film_grain_table_entry_t *prev_entry = 0;
+  uint16_t random_seed = grain ? grain->random_seed : 0;
+  if (grain) memset(grain, 0, sizeof(*grain));
+
+  while (entry) {
+    aom_film_grain_table_entry_t *next = entry->next;
+    if (time_stamp >= entry->start_time && time_stamp < entry->end_time) {
+      if (grain) {
+        *grain = entry->params;
+        if (time_stamp != 0) grain->random_seed = random_seed;
+      }
+      if (!erase) return 1;
+
+      const int64_t entry_end_time = entry->end_time;
+      if (time_stamp <= entry->start_time && end_time >= entry->end_time) {
+        if (t->tail == entry) t->tail = prev_entry;
+        if (prev_entry) {
+          prev_entry->next = entry->next;
+        } else {
+          t->head = entry->next;
+        }
+        aom_free(entry);
+      } else if (time_stamp <= entry->start_time &&
+                 end_time < entry->end_time) {
+        entry->start_time = end_time;
+      } else if (time_stamp > entry->start_time &&
+                 end_time >= entry->end_time) {
+        entry->end_time = time_stamp;
+      } else {
+        aom_film_grain_table_entry_t *new_entry =
+            aom_malloc(sizeof(*new_entry));
+        new_entry->next = entry->next;
+        new_entry->start_time = end_time;
+        new_entry->end_time = entry->end_time;
+        new_entry->params = entry->params;
+        entry->next = new_entry;
+        entry->end_time = time_stamp;
+        if (t->tail == entry) t->tail = new_entry;
+      }
+      // If segments aren't aligned, delete from the beginning of subsequent
+      // segments. Use the saved entry_end_time here: 'entry' may have been
+      // freed or had its end_time rewritten above.
+      if (end_time > entry_end_time) {
+        aom_film_grain_table_lookup(t, entry_end_time, end_time, 1, 0);
+      }
+      return 1;
+    }
+    prev_entry = entry;
+    entry = next;
+  }
+  return 0;
+}
+
+aom_codec_err_t aom_film_grain_table_read(
+    aom_film_grain_table_t *t, const char *filename,
+    struct aom_internal_error_info *error_info) {
+  FILE *file = fopen(filename, "rb");
+  if (!file) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open %s",
+                       filename);
+    return error_info->error_code;
+  }
+  error_info->error_code = AOM_CODEC_OK;
+
+  // Read in one extra character as there should be white space after
+  // the header.
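+  // For illustration, a table written by aom_film_grain_table_write() with a
+  // single entry begins like this (example values only):
+  //   filmgrn1
+  //   E 0 417083 1 7391 1
+  //       p 3 6 0 8 0 1 128 192 256 128 192 256
+  //       sY 2  0 64 255 64
+  //   ...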
+ char magic[9]; + if (!fread(magic, 9, 1, file) || memcmp(magic, kFileMagic, 8)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read (or invalid) file magic"); + fclose(file); + return error_info->error_code; + } + + aom_film_grain_table_entry_t *prev_entry = 0; + while (!feof(file)) { + aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry)); + memset(entry, 0, sizeof(*entry)); + grain_table_entry_read(file, error_info, entry); + entry->next = 0; + + if (prev_entry) prev_entry->next = entry; + if (!t->head) t->head = entry; + t->tail = entry; + prev_entry = entry; + + if (error_info->error_code != AOM_CODEC_OK) break; + } + + fclose(file); + return error_info->error_code; +} + +aom_codec_err_t aom_film_grain_table_write( + const aom_film_grain_table_t *t, const char *filename, + struct aom_internal_error_info *error_info) { + error_info->error_code = AOM_CODEC_OK; + + FILE *file = fopen(filename, "wb"); + if (!file) { + aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open file %s", + filename); + return error_info->error_code; + } + + if (!fwrite(kFileMagic, 8, 1, file)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to write file magic"); + fclose(file); + return error_info->error_code; + } + + fprintf(file, "\n"); + aom_film_grain_table_entry_t *entry = t->head; + while (entry) { + grain_table_entry_write(file, entry); + entry = entry->next; + } + fclose(file); + return error_info->error_code; +} + +void aom_film_grain_table_free(aom_film_grain_table_t *t) { + aom_film_grain_table_entry_t *entry = t->head; + while (entry) { + aom_film_grain_table_entry_t *next = entry->next; + aom_free(entry); + entry = next; + } + memset(t, 0, sizeof(*t)); +} diff --git a/libs/libaom/src/aom_dsp/grain_table.h b/libs/libaom/src/aom_dsp/grain_table.h new file mode 100644 index 000000000..a8ac50730 --- /dev/null +++ b/libs/libaom/src/aom_dsp/grain_table.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief A table mapping from time to corresponding film grain parameters. + * + * In order to apply grain synthesis in the decoder, the film grain parameters + * need to be signalled in the encoder. The film grain parameters are time + * varying, and for two-pass encoding (and denoiser implementation flexibility) + * it is common to denoise the video and do parameter estimation before encoding + * the denoised video. + * + * The film grain table is used to provide this flexibility and is used as a + * parameter that is passed to the encoder. + * + * Further, if regraining is to be done in say a single pass mode, or in two + * pass within the encoder (before frames are added to the lookahead buffer), + * this data structure can be used to keep track of on-the-fly estimated grain + * parameters, that are then extracted from the table before the encoded frame + * is written. 
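+ *
+ * A minimal usage sketch (illustrative only; pts_start, pts_end and
+ * grain_params are placeholder names, and error handling is omitted):
+ *
+ *   aom_film_grain_table_t table = { 0, 0 };
+ *   aom_film_grain_table_append(&table, pts_start, pts_end, &grain_params);
+ *
+ *   aom_film_grain_t out = { 0 };
+ *   if (aom_film_grain_table_lookup(&table, pts_start, pts_end, 0, &out)) {
+ *     // 'out' now holds the parameters covering [pts_start, pts_end).
+ *   }
+ *   aom_film_grain_table_free(&table);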
+ */
+#ifndef AOM_AOM_DSP_GRAIN_TABLE_H_
+#define AOM_AOM_DSP_GRAIN_TABLE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom_dsp/grain_synthesis.h"
+#include "aom/internal/aom_codec_internal.h"
+
+typedef struct aom_film_grain_table_entry_t {
+  aom_film_grain_t params;
+  int64_t start_time;
+  int64_t end_time;
+  struct aom_film_grain_table_entry_t *next;
+} aom_film_grain_table_entry_t;
+
+typedef struct {
+  aom_film_grain_table_entry_t *head;
+  aom_film_grain_table_entry_t *tail;
+} aom_film_grain_table_t;
+
+/*!\brief Add a mapping from [time_stamp, end_time) to the given grain
+ * parameters
+ *
+ * \param[in/out] table      The grain table
+ * \param[in]     time_stamp The start time stamp
+ * \param[in]     end_time   The end time stamp
+ * \param[in]     grain      The grain parameters
+ */
+void aom_film_grain_table_append(aom_film_grain_table_t *table,
+                                 int64_t time_stamp, int64_t end_time,
+                                 const aom_film_grain_t *grain);
+
+/*!\brief Look-up (and optionally erase) the grain parameters for the given
+ * time
+ *
+ * \param[in]  table      The grain table
+ * \param[in]  time_stamp The start time stamp
+ * \param[in]  end_time   The end time stamp
+ * \param[in]  erase      Whether the time segment can be deleted
+ * \param[out] grain      The output grain parameters
+ * \return     Returns 1 if grain params were found for the given time,
+ *             0 otherwise
+ */
+int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
+                                int64_t end_time, int erase,
+                                aom_film_grain_t *grain);
+
+/*!\brief Reads the grain table from a file.
+ *
+ * \param[out] table      The grain table
+ * \param[in]  filename   The file to read from
+ * \param[in]  error_info Error info for tracking errors
+ */
+aom_codec_err_t aom_film_grain_table_read(
+    aom_film_grain_table_t *table, const char *filename,
+    struct aom_internal_error_info *error_info);
+
+/*!\brief Writes the grain table to a file.
+ *
+ * \param[in]  table      The table to write
+ * \param[in]  filename   The file to write to
+ * \param[in]  error_info Error info for tracking errors
+ */
+aom_codec_err_t aom_film_grain_table_write(
+    const aom_film_grain_table_t *t, const char *filename,
+    struct aom_internal_error_info *error_info);
+
+/*!\brief Frees all entries in the grain table and resets it to empty */
+void aom_film_grain_table_free(aom_film_grain_table_t *t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AOM_AOM_DSP_GRAIN_TABLE_H_
diff --git a/libs/libaom/src/aom_dsp/intrapred.c b/libs/libaom/src/aom_dsp/intrapred.c
new file mode 100644
index 000000000..72ccfd835
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/intrapred.c
@@ -0,0 +1,792 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/intrapred_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+
+static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left) {
+  int r;
+  (void)left;
+
+  for (r = 0; r < bh; r++) {
+    memcpy(dst, above, bw);
+    dst += stride;
+  }
+}
+
+static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left) {
+  int r;
+  (void)above;
+
+  for (r = 0; r < bh; r++) {
+    memset(dst, left[r], bw);
+    dst += stride;
+  }
+}
+
+static INLINE int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; }
+
+static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top,
+                                              uint16_t top_left) {
+  const int base = top + left - top_left;
+  const int p_left = abs_diff(base, left);
+  const int p_top = abs_diff(base, top);
+  const int p_top_left = abs_diff(base, top_left);
+
+  // Return nearest to base of left, top and top_left.
+  return (p_left <= p_top && p_left <= p_top_left)
+             ? left
+             : (p_top <= p_top_left) ? top : top_left;
+}
+
+static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+                                   int bh, const uint8_t *above,
+                                   const uint8_t *left) {
+  int r, c;
+  const uint8_t ytop_left = above[-1];
+
+  for (r = 0; r < bh; r++) {
+    for (c = 0; c < bw; c++)
+      dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left);
+    dst += stride;
+  }
+}
+
+// Some basic checks on weights for smooth predictor.
+#define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \
+                                 pred_scale)                          \
+  assert(weights_w[0] < weights_scale);                               \
+  assert(weights_h[0] < weights_scale);                               \
+  assert(weights_scale - weights_w[bw - 1] < weights_scale);          \
+  assert(weights_scale - weights_h[bh - 1] < weights_scale);          \
+  assert(pred_scale < 31)  // ensures no overflow when calculating predictor.
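+// divide_round() below divides by 2^bits with rounding to nearest rather than
+// truncating, e.g. divide_round(384, 8) == 2 (384 / 256 = 1.5 rounds up),
+// whereas 384 >> 8 == 1.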
+ +#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits)) + +static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel + const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights_w = sm_weight_arrays + bw; + const uint8_t *const sm_weights_h = sm_weight_arrays + bh; + // scale = 2 * 2^sm_weight_log2_scale + const int log2_scale = 1 + sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale, + log2_scale + sizeof(*dst)); + int r; + for (r = 0; r < bh; ++r) { + int c; + for (c = 0; c < bw; ++c) { + const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred }; + const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r], + sm_weights_w[c], scale - sm_weights_w[c] }; + uint32_t this_pred = 0; + int i; + assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]); + for (i = 0; i < 4; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel + const uint8_t *const sm_weights = sm_weight_arrays + bh; + // scale = 2^sm_weight_log2_scale + const int log2_scale = sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights, sm_weights, scale, + log2_scale + sizeof(*dst)); + + int r; + for (r = 0; r < bh; r++) { + int c; + for (c = 0; c < bw; ++c) { + const uint8_t pixels[] = { above[c], below_pred }; + const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] }; + uint32_t this_pred = 0; + assert(scale >= sm_weights[r]); + int i; + for (i = 0; i < 2; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights = sm_weight_arrays + bw; + // scale = 2^sm_weight_log2_scale + const int log2_scale = sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights, sm_weights, scale, + log2_scale + sizeof(*dst)); + + int r; + for (r = 0; r < bh; r++) { + int c; + for (c = 0; c < bw; ++c) { + const uint8_t pixels[] = { left[r], right_pred }; + const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] }; + uint32_t this_pred = 0; + assert(scale >= sm_weights[c]); + int i; + for (i = 0; i < 2; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + int r; + (void)above; + (void)left; + + for (r = 0; r < bh; r++) { + memset(dst, 128, bw); + dst += stride; + } +} + +static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + int i, r, expected_dc, sum = 0; + (void)above; + + for (i = 0; i < bh; i++) sum += left[i]; 
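+  // Round-to-nearest average of the left neighboring column.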
+ expected_dc = (sum + (bh >> 1)) / bh; + + for (r = 0; r < bh; r++) { + memset(dst, expected_dc, bw); + dst += stride; + } +} + +static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + int i, r, expected_dc, sum = 0; + (void)left; + + for (i = 0; i < bw; i++) sum += above[i]; + expected_dc = (sum + (bw >> 1)) / bw; + + for (r = 0; r < bh; r++) { + memset(dst, expected_dc, bw); + dst += stride; + } +} + +static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left) { + int i, r, expected_dc, sum = 0; + const int count = bw + bh; + + for (i = 0; i < bw; i++) { + sum += above[i]; + } + for (i = 0; i < bh; i++) { + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; + + for (r = 0; r < bh; r++) { + memset(dst, expected_dc, bw); + dst += stride; + } +} + +static INLINE int divide_using_multiply_shift(int num, int shift1, + int multiplier, int shift2) { + const int interm = num >> shift1; + return interm * multiplier >> shift2; +} + +// The constants (multiplier and shifts) for a given block size are obtained +// as follows: +// - Let sum_w_h = block width + block height. +// - Shift 'sum_w_h' right until we reach an odd number. Let the number of +// shifts for that block size be called 'shift1' (see the parameter in +// dc_predictor_rect() function), and let the odd number be 'd'. [d has only 2 +// possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect +// block]. +// - Find multipliers for (i) dividing by 3, and (ii) dividing by 5, +// using the "Algorithm 1" in: +// http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632 +// by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd +// shift will be 16, regardless of the block size. + +// Note: For low bitdepth, assembly code may be optimized by using smaller +// constants for smaller block sizes, where the range of the 'sum' is +// restricted to fewer bits. 
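+// Worked example (illustrative): for a 4x8 block, bw + bh = 12 = 3 << 2, so
+// shift1 = 2 and d = 3. With all 12 border samples equal to 100, sum = 1200,
+// the rounded numerator is 1206, and (1206 >> 2) * 0x5556 >> 16 =
+// 301 * 21846 >> 16 = 100, which matches (1200 + 6) / 12.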
+ +#define DC_MULTIPLIER_1X2 0x5556 +#define DC_MULTIPLIER_1X4 0x3334 + +#define DC_SHIFT2 16 + +static INLINE void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left, int shift1, + int multiplier) { + int sum = 0; + + for (int i = 0; i < bw; i++) { + sum += above[i]; + } + for (int i = 0; i < bh; i++) { + sum += left[i]; + } + + const int expected_dc = divide_using_multiply_shift( + sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2); + assert(expected_dc < (1 << 8)); + + for (int r = 0; r < bh; r++) { + memset(dst, expected_dc, bw); + dst += stride; + } +} + +#undef DC_SHIFT2 + +void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2); +} + +#undef DC_MULTIPLIER_1X2 +#undef DC_MULTIPLIER_1X4 + +static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)left; + (void)bd; 
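+  // Vertical prediction: replicate the row of above neighbors down the block.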
+ for (r = 0; r < bh; r++) { + memcpy(dst, above, bw * sizeof(uint16_t)); + dst += stride; + } +} + +static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)above; + (void)bd; + for (r = 0; r < bh; r++) { + aom_memset16(dst, left[r], bw); + dst += stride; + } +} + +static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + const uint16_t ytop_left = above[-1]; + (void)bd; + + for (r = 0; r < bh; r++) { + for (c = 0; c < bw; c++) + dst[c] = paeth_predictor_single(left[r], above[c], ytop_left); + dst += stride; + } +} + +static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel + const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights_w = sm_weight_arrays + bw; + const uint8_t *const sm_weights_h = sm_weight_arrays + bh; + // scale = 2 * 2^sm_weight_log2_scale + const int log2_scale = 1 + sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale, + log2_scale + sizeof(*dst)); + int r; + for (r = 0; r < bh; ++r) { + int c; + for (c = 0; c < bw; ++c) { + const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred }; + const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r], + sm_weights_w[c], scale - sm_weights_w[c] }; + uint32_t this_pred = 0; + int i; + assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]); + for (i = 0; i < 4; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel + const uint8_t *const sm_weights = sm_weight_arrays + bh; + // scale = 2^sm_weight_log2_scale + const int log2_scale = sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights, sm_weights, scale, + log2_scale + sizeof(*dst)); + + int r; + for (r = 0; r < bh; r++) { + int c; + for (c = 0; c < bw; ++c) { + const uint16_t pixels[] = { above[c], below_pred }; + const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] }; + uint32_t this_pred = 0; + assert(scale >= sm_weights[r]); + int i; + for (i = 0; i < 2; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights = sm_weight_arrays + bw; + // scale = 2^sm_weight_log2_scale + const int log2_scale = sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights, sm_weights, scale, + log2_scale + sizeof(*dst)); + + int r; + for (r = 0; r < bh; r++) { + int c; + for (c = 0; c < bw; ++c) { + const uint16_t pixels[] = { left[r], right_pred }; + const 
uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] }; + uint32_t this_pred = 0; + assert(scale >= sm_weights[c]); + int i; + for (i = 0; i < 2; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)above; + (void)left; + + for (r = 0; r < bh; r++) { + aom_memset16(dst, 128 << (bd - 8), bw); + dst += stride; + } +} + +static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + (void)above; + (void)bd; + + for (i = 0; i < bh; i++) sum += left[i]; + expected_dc = (sum + (bh >> 1)) / bh; + + for (r = 0; r < bh; r++) { + aom_memset16(dst, expected_dc, bw); + dst += stride; + } +} + +static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + (void)left; + (void)bd; + + for (i = 0; i < bw; i++) sum += above[i]; + expected_dc = (sum + (bw >> 1)) / bw; + + for (r = 0; r < bh; r++) { + aom_memset16(dst, expected_dc, bw); + dst += stride; + } +} + +static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + const int count = bw + bh; + (void)bd; + + for (i = 0; i < bw; i++) { + sum += above[i]; + } + for (i = 0; i < bh; i++) { + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; + + for (r = 0; r < bh; r++) { + aom_memset16(dst, expected_dc, bw); + dst += stride; + } +} + +// Obtained similarly as DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but +// assume 2nd shift of 17 bits instead of 16. +// Note: Strictly speaking, 2nd shift needs to be 17 only when: +// - bit depth == 12, and +// - bw + bh is divisible by 5 (as opposed to divisible by 3). +// All other cases can use half the multipliers with a shift of 16 instead. +// This special optimization can be used when writing assembly code. +#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB +// Note: This constant is odd, but a smaller even constant (0x199a) with the +// appropriate shift should work for neon in 8/10-bit. 
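+// Sanity check (illustrative): 0xAAAB / 2^17 = 43691 / 131072 ~= 1/3 and
+// 0x6667 / 2^17 = 26215 / 131072 ~= 1/5, which is accurate enough for the
+// 12-bit worst-case sums.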
+#define HIGHBD_DC_MULTIPLIER_1X4 0x6667 + +#define HIGHBD_DC_SHIFT2 17 + +static INLINE void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd, + int shift1, uint32_t multiplier) { + int sum = 0; + (void)bd; + + for (int i = 0; i < bw; i++) { + sum += above[i]; + } + for (int i = 0; i < bh; i++) { + sum += left[i]; + } + + const int expected_dc = divide_using_multiply_shift( + sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2); + assert(expected_dc < (1 << bd)); + + for (int r = 0; r < bh; r++) { + aom_memset16(dst, expected_dc, bw); + dst += stride; + } +} + +#undef HIGHBD_DC_SHIFT2 + +void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, 3, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const 
uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5, + HIGHBD_DC_MULTIPLIER_1X2); +} + +#undef HIGHBD_DC_MULTIPLIER_1X2 +#undef HIGHBD_DC_MULTIPLIER_1X4 + +// This serves as a wrapper function, so that all the prediction functions +// can be unified and accessed as a pointer array. Note that the boundary +// above and left are not necessarily used all the time. +#define intra_pred_sized(type, width, height) \ + void aom_##type##_predictor_##width##x##height##_c( \ + uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \ + const uint8_t *left) { \ + type##_predictor(dst, stride, width, height, above, left); \ + } + +#define intra_pred_highbd_sized(type, width, height) \ + void aom_highbd_##type##_predictor_##width##x##height##_c( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \ + } + +/* clang-format off */ +#define intra_pred_rectangular(type) \ + intra_pred_sized(type, 4, 8) \ + intra_pred_sized(type, 8, 4) \ + intra_pred_sized(type, 8, 16) \ + intra_pred_sized(type, 16, 8) \ + intra_pred_sized(type, 16, 32) \ + intra_pred_sized(type, 32, 16) \ + intra_pred_sized(type, 32, 64) \ + intra_pred_sized(type, 64, 32) \ + intra_pred_sized(type, 4, 16) \ + intra_pred_sized(type, 16, 4) \ + intra_pred_sized(type, 8, 32) \ + intra_pred_sized(type, 32, 8) \ + intra_pred_sized(type, 16, 64) \ + intra_pred_sized(type, 64, 16) \ + intra_pred_highbd_sized(type, 4, 8) \ + intra_pred_highbd_sized(type, 8, 4) \ + intra_pred_highbd_sized(type, 8, 16) \ + intra_pred_highbd_sized(type, 16, 8) \ + intra_pred_highbd_sized(type, 16, 32) \ + intra_pred_highbd_sized(type, 32, 16) \ + intra_pred_highbd_sized(type, 32, 64) \ + intra_pred_highbd_sized(type, 64, 32) \ + intra_pred_highbd_sized(type, 4, 16) \ + intra_pred_highbd_sized(type, 16, 4) \ + intra_pred_highbd_sized(type, 8, 32) \ + intra_pred_highbd_sized(type, 32, 8) \ + intra_pred_highbd_sized(type, 16, 64) \ + intra_pred_highbd_sized(type, 64, 16) +#define intra_pred_above_4x4(type) \ + intra_pred_sized(type, 8, 8) \ + intra_pred_sized(type, 16, 16) \ + intra_pred_sized(type, 32, 32) \ + intra_pred_sized(type, 64, 64) \ + intra_pred_highbd_sized(type, 4, 4) \ + intra_pred_highbd_sized(type, 8, 8) \ + intra_pred_highbd_sized(type, 16, 16) \ + intra_pred_highbd_sized(type, 32, 32) \ + intra_pred_highbd_sized(type, 64, 64) \ + intra_pred_rectangular(type) +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 4, 4) \ + intra_pred_above_4x4(type) +#define intra_pred_square(type) \ + intra_pred_sized(type, 4, 4) \ + intra_pred_sized(type, 8, 8) \ + intra_pred_sized(type, 16, 16) \ + intra_pred_sized(type, 32, 32) \ + intra_pred_sized(type, 64, 64) \ + intra_pred_highbd_sized(type, 4, 4) \ + intra_pred_highbd_sized(type, 8, 8) \ + intra_pred_highbd_sized(type, 16, 16) \ + intra_pred_highbd_sized(type, 32, 32) \ + intra_pred_highbd_sized(type, 64, 64) + +intra_pred_allsizes(v) +intra_pred_allsizes(h) +intra_pred_allsizes(smooth) +intra_pred_allsizes(smooth_v) +intra_pred_allsizes(smooth_h) +intra_pred_allsizes(paeth) +intra_pred_allsizes(dc_128) +intra_pred_allsizes(dc_left) +intra_pred_allsizes(dc_top) +intra_pred_square(dc) +/* clang-format on */ +#undef 
intra_pred_allsizes diff --git a/libs/libaom/src/aom_dsp/intrapred_common.h b/libs/libaom/src/aom_dsp/intrapred_common.h new file mode 100644 index 000000000..3ec62a86e --- /dev/null +++ b/libs/libaom/src/aom_dsp/intrapred_common.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_INTRAPRED_COMMON_H_ +#define AOM_AOM_DSP_INTRAPRED_COMMON_H_ + +#include "config/aom_config.h" + +// Weights are quadratic from '1' to '1 / block_size', scaled by +// 2^sm_weight_log2_scale. +static const int sm_weight_log2_scale = 8; + +// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST]) +#define MAX_BLOCK_DIM 64 + +/* clang-format off */ +static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = { + // Unused, because we always offset by bs, which is at least 2. + 0, 0, + // bs = 2 + 255, 128, + // bs = 4 + 255, 149, 85, 64, + // bs = 8 + 255, 197, 146, 105, 73, 50, 37, 32, + // bs = 16 + 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, + // bs = 32 + 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, + 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, + // bs = 64 + 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, + 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69, + 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, + 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4, +}; +/* clang-format on */ + +#endif // AOM_AOM_DSP_INTRAPRED_COMMON_H_ diff --git a/libs/libaom/src/aom_dsp/loopfilter.c b/libs/libaom/src/aom_dsp/loopfilter.c new file mode 100644 index 000000000..903ebcd7c --- /dev/null +++ b/libs/libaom/src/aom_dsp/loopfilter.c @@ -0,0 +1,929 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+static INLINE int8_t signed_char_clamp(int t) {
+  return (int8_t)clamp(t, -128, 127);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE int16_t signed_char_clamp_high(int t, int bd) {
+  switch (bd) {
+    case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
+    case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
+    case 8:
+    default: return (int16_t)clamp(t, -128, 128 - 1);
+  }
+}
+#endif
+
+// should we apply any filter at all: 11111111 yes, 00000000 no
+static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
+                                  uint8_t p0, uint8_t q0, uint8_t q1) {
+  int8_t mask = 0;
+  mask |= (abs(p1 - p0) > limit) * -1;
+  mask |= (abs(q1 - q0) > limit) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+  return ~mask;
+}
+
+static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
+                                 uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
+                                 uint8_t q1, uint8_t q2, uint8_t q3) {
+  int8_t mask = 0;
+  mask |= (abs(p3 - p2) > limit) * -1;
+  mask |= (abs(p2 - p1) > limit) * -1;
+  mask |= (abs(p1 - p0) > limit) * -1;
+  mask |= (abs(q1 - q0) > limit) * -1;
+  mask |= (abs(q2 - q1) > limit) * -1;
+  mask |= (abs(q3 - q2) > limit) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+  return ~mask;
+}
+
+static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit,
+                                         uint8_t p2, uint8_t p1, uint8_t p0,
+                                         uint8_t q0, uint8_t q1, uint8_t q2) {
+  int8_t mask = 0;
+  mask |= (abs(p2 - p1) > limit) * -1;
+  mask |= (abs(p1 - p0) > limit) * -1;
+  mask |= (abs(q1 - q0) > limit) * -1;
+  mask |= (abs(q2 - q1) > limit) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+  return ~mask;
+}
+
+static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1,
+                                       uint8_t p0, uint8_t q0, uint8_t q1,
+                                       uint8_t q2) {
+  int8_t mask = 0;
+  mask |= (abs(p1 - p0) > thresh) * -1;
+  mask |= (abs(q1 - q0) > thresh) * -1;
+  mask |= (abs(p2 - p0) > thresh) * -1;
+  mask |= (abs(q2 - q0) > thresh) * -1;
+  return ~mask;
+}
+
+static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
+                                uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
+                                uint8_t q2, uint8_t q3) {
+  int8_t mask = 0;
+  mask |= (abs(p1 - p0) > thresh) * -1;
+  mask |= (abs(q1 - q0) > thresh) * -1;
+  mask |= (abs(p2 - p0) > thresh) * -1;
+  mask |= (abs(q2 - q0) > thresh) * -1;
+  mask |= (abs(p3 - p0) > thresh) * -1;
+  mask |= (abs(q3 - q0) > thresh) * -1;
+  return ~mask;
+}
+
+// is there high edge variance internal edge: 11111111 yes, 00000000 no
+static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
+                              uint8_t q0, uint8_t q1) {
+  int8_t hev = 0;
+  hev |= (abs(p1 - p0) > thresh) * -1;
+  hev |= (abs(q1 - q0) > thresh) * -1;
+  return hev;
+}
+
+static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
+                           uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
+  int8_t filter1, filter2;
+
+  const int8_t ps1 = (int8_t)(*op1 ^ 0x80);
+  const int8_t ps0 = (int8_t)(*op0 ^ 0x80);
+  const int8_t qs0 = (int8_t)(*oq0 ^ 0x80);
+  const int8_t qs1 = (int8_t)(*oq1 ^ 0x80);
+  const int8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
+
+  // add outer taps if we have high edge variance
+  int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
+
+  // inner taps
+  filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
+
+  // save bottom 3 bits so that we round one side +4 and the other +3
+  // if it equals 4 we'll
set to adjust by -1 to account for the fact + // we'd round 3 the other way + filter1 = signed_char_clamp(filter + 4) >> 3; + filter2 = signed_char_clamp(filter + 3) >> 3; + + *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80); + *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80); + + // outer tap adjustments + filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + + *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80); + *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80); +} + +void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint8_t p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p]; + const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); + filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); + ++s; + } +} + +void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint8_t p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1]; + const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); + filter4(mask, *thresh, s - 2, s - 1, s, s + 1); + s += pitch; + } +} + +void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); +} + +static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat, + uint8_t *op2, uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) { + if (flat && mask) { + const uint8_t p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2; + + // 5-tap filter [1, 2, 2, 2, 1] + *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3); + *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3); + *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3); + *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3); + } else { + filter4(mask, thresh, op1, op0, oq0, oq1); + } +} + +static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, + uint8_t *op3, uint8_t *op2, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1, + uint8_t *oq2, uint8_t *oq3) { + if (flat && mask) { + const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); + *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 
+ p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); + } else { + filter4(mask, thresh, op1, op0, oq0, oq1); + } +} + +void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint8_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p]; + + const int8_t mask = + filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2); + const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2); + filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, + s + 2 * p); + ++s; + } +} + +void aom_lpf_horizontal_6_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1); +} + +void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, s + 3 * p); + ++s; + } +} + +void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + int count = 4; + + for (i = 0; i < count; ++i) { + const uint8_t p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2]; + const int8_t mask = + filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2); + const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2); + filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2); + s += pitch; + } +} + +void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + int count = 4; + + for (i = 0; i < count; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, 
q0, q1, q2, q3); + filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, + s + 3); + s += pitch; + } +} + +void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); +} + +static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint8_t *op6, uint8_t *op5, + uint8_t *op4, uint8_t *op3, uint8_t *op2, + uint8_t *op1, uint8_t *op0, uint8_t *oq0, + uint8_t *oq1, uint8_t *oq2, uint8_t *oq3, + uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) { + if (flat2 && flat && mask) { + const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, + p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, + q5 = *oq5, q6 = *oq6; + + // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] + *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0, + 4); + *op4 = ROUND_POWER_OF_TWO( + p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4); + *op3 = ROUND_POWER_OF_TWO( + p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4); + *op2 = ROUND_POWER_OF_TWO( + p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3, + 4); + *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + + q0 + q1 + q2 + q3 + q4, + 4); + *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + + q0 * 2 + q1 + q2 + q3 + q4 + q5, + 4); + *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + + q1 * 2 + q2 + q3 + q4 + q5 + q6, + 4); + *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + + q2 * 2 + q3 + q4 + q5 + q6 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7, + 4); + } else { + filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + } +} + +static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count) { + int i; + int step = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
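+  // Each iteration filters one pixel column, reading up to 7 pixels on each
+  // side of the edge (the 13-tap path when both flatness masks pass).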
+ for (i = 0; i < step * count; ++i) { + const uint8_t p6 = s[-7 * p], p5 = s[-6 * p], p4 = s[-5 * p], + p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p], + q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6); + + filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, + s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p); + ++s; + } +} + +void aom_lpf_horizontal_14_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); +} + +void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1); + mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1); +} + +static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { + int i; + + for (i = 0; i < count; ++i) { + const uint8_t p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], p2 = s[-3], + p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4], + q5 = s[5], q6 = s[6]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6); + + filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3, + s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6); + s += p; + } +} + +void aom_lpf_vertical_14_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4); +} + +void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4); + mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4); +} + +#if CONFIG_AV1_HIGHBITDEPTH +// Should we apply any filter at all: 11111111 yes, 00000000 no ? +static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit, + uint16_t p1, uint16_t p0, uint16_t q0, + uint16_t q1, int bd) { + int8_t mask = 0; + int16_t limit16 = (uint16_t)limit << (bd - 8); + int16_t blimit16 = (uint16_t)blimit << (bd - 8); + mask |= (abs(p1 - p0) > limit16) * -1; + mask |= (abs(q1 - q0) > limit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + return ~mask; +} + +// Should we apply any filter at all: 11111111 yes, 00000000 no ? 
+static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, + uint16_t p3, uint16_t p2, uint16_t p1, + uint16_t p0, uint16_t q0, uint16_t q1, + uint16_t q2, uint16_t q3, int bd) { + int8_t mask = 0; + int16_t limit16 = (uint16_t)limit << (bd - 8); + int16_t blimit16 = (uint16_t)blimit << (bd - 8); + mask |= (abs(p3 - p2) > limit16) * -1; + mask |= (abs(p2 - p1) > limit16) * -1; + mask |= (abs(p1 - p0) > limit16) * -1; + mask |= (abs(q1 - q0) > limit16) * -1; + mask |= (abs(q2 - q1) > limit16) * -1; + mask |= (abs(q3 - q2) > limit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit, + uint16_t p2, uint16_t p1, + uint16_t p0, uint16_t q0, + uint16_t q1, uint16_t q2, + int bd) { + int8_t mask = 0; + int16_t limit16 = (uint16_t)limit << (bd - 8); + int16_t blimit16 = (uint16_t)blimit << (bd - 8); + mask |= (abs(p2 - p1) > limit16) * -1; + mask |= (abs(p1 - p0) > limit16) * -1; + mask |= (abs(q1 - q0) > limit16) * -1; + mask |= (abs(q2 - q1) > limit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2, + uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, + uint16_t q2, int bd) { + int8_t mask = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + mask |= (abs(p1 - p0) > thresh16) * -1; + mask |= (abs(q1 - q0) > thresh16) * -1; + mask |= (abs(p2 - p0) > thresh16) * -1; + mask |= (abs(q2 - q0) > thresh16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, + uint16_t p1, uint16_t p0, uint16_t q0, + uint16_t q1, uint16_t q2, uint16_t q3, + int bd) { + int8_t mask = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + mask |= (abs(p1 - p0) > thresh16) * -1; + mask |= (abs(q1 - q0) > thresh16) * -1; + mask |= (abs(p2 - p0) > thresh16) * -1; + mask |= (abs(q2 - q0) > thresh16) * -1; + mask |= (abs(p3 - p0) > thresh16) * -1; + mask |= (abs(q3 - q0) > thresh16) * -1; + return ~mask; +} + +// Is there high edge variance internal edge: +// 11111111_11111111 yes, 00000000_00000000 no ? +static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, int bd) { + int16_t hev = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + hev |= (abs(p1 - p0) > thresh16) * -1; + hev |= (abs(q1 - q0) > thresh16) * -1; + return hev; +} + +static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, + uint16_t *op0, uint16_t *oq0, uint16_t *oq1, + int bd) { + int16_t filter1, filter2; + // ^0x80 equivalent to subtracting 0x80 from the values to turn them + // into -128 to +127 instead of 0 to 255. + int shift = bd - 8; + const int16_t ps1 = (int16_t)*op1 - (0x80 << shift); + const int16_t ps0 = (int16_t)*op0 - (0x80 << shift); + const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift); + const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift); + const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd); + + // Add outer taps if we have high edge variance. + int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev; + + // Inner taps. + filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask; + + // Save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set to adjust by -1 to account for the fact + // we'd round 3 the other way. 
+ filter1 = signed_char_clamp_high(filter + 4, bd) >> 3; + filter2 = signed_char_clamp_high(filter + 3, bd) >> 3; + + *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift); + *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift); + + // Outer tap adjustments. + filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + + *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift); + *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift); +} + +void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint16_t p1 = s[-2 * p]; + const uint16_t p0 = s[-p]; + const uint16_t q0 = s[0 * p]; + const uint16_t q1 = s[1 * p]; + const int8_t mask = + highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); + highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_4_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint16_t p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1]; + const int8_t mask = + highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); + highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd); + s += pitch; + } +} + +void aom_highbd_lpf_vertical_4_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat, + uint16_t *op2, uint16_t *op1, uint16_t *op0, + uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, + int bd) { + if (flat && mask) { + const uint16_t p2 = *op2, p1 = *op1, p0 = *op0; + const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2; + + // 5-tap filter [1, 2, 2, 2, 1] + *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3); + *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3); + *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3); + *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3); + } else { + highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); + } +} + +static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, + uint16_t *op3, uint16_t *op2, uint16_t *op1, + uint16_t *op0, uint16_t *oq0, uint16_t *oq1, + uint16_t *oq2, uint16_t *oq3, int bd) { + if (flat && mask) { + const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); 
+ *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); + } else { + highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); + } +} + +void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, + s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint16_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p]; + + const int8_t mask = + highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd); + const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd); + highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_6_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_horizontal_8_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + int count = 4; + + for (i = 0; i < count; ++i) { + const uint16_t p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2]; + const int8_t mask = + highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd); + const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd); + highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2, + bd); + s += pitch; + } +} + +void aom_highbd_lpf_vertical_6_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0, bd); + 
aom_highbd_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + int count = 4; + + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, + s + 2, s + 3, bd); + s += pitch; + } +} + +void aom_highbd_lpf_vertical_8_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint16_t *op6, uint16_t *op5, + uint16_t *op4, uint16_t *op3, uint16_t *op2, + uint16_t *op1, uint16_t *op0, uint16_t *oq0, + uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, + uint16_t *oq4, uint16_t *oq5, uint16_t *oq6, + int bd) { + if (flat2 && flat && mask) { + const uint16_t p6 = *op6; + const uint16_t p5 = *op5; + const uint16_t p4 = *op4; + const uint16_t p3 = *op3; + const uint16_t p2 = *op2; + const uint16_t p1 = *op1; + const uint16_t p0 = *op0; + const uint16_t q0 = *oq0; + const uint16_t q1 = *oq1; + const uint16_t q2 = *oq2; + const uint16_t q3 = *oq3; + const uint16_t q4 = *oq4; + const uint16_t q5 = *oq5; + const uint16_t q6 = *oq6; + + // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] + *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0, + 4); + *op4 = ROUND_POWER_OF_TWO( + p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4); + *op3 = ROUND_POWER_OF_TWO( + p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4); + *op2 = ROUND_POWER_OF_TWO( + p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3, + 4); + *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + + q0 + q1 + q2 + q3 + q4, + 4); + *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + + q0 * 2 + q1 + q2 + q3 + q4 + q5, + 4); + *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + + q1 * 2 + q2 + q3 + q4 + q5 + q6, + 4); + *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + + q2 * 2 + q3 + q4 + q5 + q6 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7, + 4); + } else { + highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + bd); + } +} + +static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count, + int bd) { + int i; + int step = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
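+  // This mirrors the 8-bit mb_lpf_horizontal_edge_w() one-for-one; only the
+  // sample type (uint16_t) and the << (bd - 8) threshold scaling differ.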
+ for (i = 0; i < step * count; ++i) { + const uint16_t p3 = s[-4 * p]; + const uint16_t p2 = s[-3 * p]; + const uint16_t p1 = s[-2 * p]; + const uint16_t p0 = s[-p]; + const uint16_t q0 = s[0 * p]; + const uint16_t q1 = s[1 * p]; + const uint16_t q2 = s[2 * p]; + const uint16_t q3 = s[3 * p]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + + const int8_t flat2 = + highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p], + s[5 * p], s[6 * p], bd); + + highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, + s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd); +} + +void aom_highbd_lpf_horizontal_14_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd); + highbd_mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1, bd); +} + +static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count, + int bd) { + int i; + + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4]; + const uint16_t p2 = s[-3]; + const uint16_t p1 = s[-2]; + const uint16_t p0 = s[-1]; + const uint16_t q0 = s[0]; + const uint16_t q1 = s[1]; + const uint16_t q2 = s[2]; + const uint16_t q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat2 = + highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd); + + highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, + s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, + s + 6, bd); + s += p; + } +} + +void aom_highbd_lpf_vertical_14_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd); +} + +void aom_highbd_lpf_vertical_14_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4, bd); + highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + 4, bd); +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/mips/aom_convolve8_horiz_msa.c b/libs/libaom/src/aom_dsp/mips/aom_convolve8_horiz_msa.c new file mode 100644 index 000000000..c8ab61249 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/aom_convolve8_horiz_msa.c @@ -0,0 +1,693 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/aom_convolve_msa.h"
+
+static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v8i16 filt, out0, out1;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, out0, out1);
+  SRARI_H2_SH(out0, out1, FILTER_BITS);
+  SAT_SH2_SH(out0, out1, 7);
+  out = PCKEV_XORI128_UB(out0, out1);
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16i8 filt0, filt1, filt2, filt3;
+  v16i8 src0, src1, src2, src3;
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v8i16 filt, out0, out1, out2, out3;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  src += (4 * src_stride);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, out0, out1);
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, out2, out3);
+  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+  SAT_SH4_SH(out0, out1, out2, out3, 7);
+  out = PCKEV_XORI128_UB(out0, out1);
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+  dst += (4 * dst_stride);
+  out = PCKEV_XORI128_UB(out2, out3);
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (4 == height) {
+    common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+  } else if (8 == height) {
+    common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+  v8i16 filt, out0, out1, out2, out3;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  HORIZ_8TAP_8WID_4VECS_FILT(src0,
src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out0, out1, out2, + out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); +} + +static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 8, src_stride, src1, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (2 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst); + dst += dst_stride; + } +} + +static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); 
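+    // src0/src2 hold bytes 0..15 and 16..31 of the row; the sldi below
+    // effectively splices them so that src1 covers bytes 8..23, the shifted
+    // window the 8-tap filter needs to produce outputs 8..15.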
+ src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + + src0 = LD_SB(src + 32); + src2 = LD_SB(src + 48); + src3 = LD_SB(src + 56); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst + 32); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 48); + dst += dst_stride; + } +} + +static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, vec0, vec1, res0, res1; + v8u16 vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); + SRARI_H2_UH(vec2, vec3, FILTER_BITS); + PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 vec0, vec1, vec2, vec3, filt0; + 
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16i8 res0, res1, res2, res3; + v8u16 vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, + vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); + ST8x4_UB(src0, src1, dst, dst_stride); +} + +static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask, out0, out1; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + if (16 == height) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, 
vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_SB4(src, src_stride, src0, src1, src2, src3); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); + } +} + +static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + loop_cnt = (height >> 2) - 1; + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + + for (; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + } +} + +static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + 
v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = height >> 1; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + src4 = LD_SB(src); + src6 = LD_SB(src + 16); + src7 = LD_SB(src + 24); + src5 = __msa_sldi_b(src6, src4, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + PCKEV_ST_SB(out2, out3, dst + 16); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + PCKEV_ST_SB(out6, out7, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src4 = LD_SB(src + 32); + src6 = LD_SB(src + 48); + src7 = LD_SB(src + 56); + SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + PCKEV_ST_SB(out2, out3, dst + 16); + PCKEV_ST_SB(out4, out5, dst + 32); + PCKEV_ST_SB(out6, out7, dst + 48); + dst += dst_stride; + } +} + +void aom_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + + if (((const int32_t *)filter_x)[0] == 0) { + switch (w) { + case 4: + common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 8: + common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 16: + common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + 
&filt_hor[3], h); + break; + case 32: + common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 64: + common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + default: + aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 8: + common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 16: + common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 32: + common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 64: + common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + default: + aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/libs/libaom/src/aom_dsp/mips/aom_convolve8_vert_msa.c b/libs/libaom/src/aom_dsp/mips/aom_convolve8_vert_msa.c new file mode 100644 index 000000000..2c3bc084c --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/aom_convolve8_vert_msa.c @@ -0,0 +1,699 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/aom_convolve_msa.h"
+
+static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+  v16i8 src10998, filt0, filt1, filt2, filt3;
+  v16u8 out;
+  v8i16 filt, out10, out32;
+
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+             src4332, src6554);
+  XORI_B3_128_SB(src2110, src4332, src6554);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+    XORI_B2_128_SB(src8776, src10998);
+    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                filt1, filt2, filt3);
+    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                filt1, filt2, filt3);
+    SRARI_H2_SH(out10, out32, FILTER_BITS);
+    SAT_SH2_SH(out10, out32, 7);
+    out = PCKEV_XORI128_UB(out10, out32);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    src2110 = src6554;
+    src4332 = src8776;
+    src6554 = src10998;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+  v16u8 tmp0, tmp1;
+  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                 filt1, filt2, filt3);
+    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                 filt1, filt2, filt3);
+    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                 filt1, filt2, filt3);
+    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                 filt1, filt2, filt3);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
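+    // PCKEV_XORI128_UB packs the two 8x16-bit results down to bytes and
+    // xors with 128, undoing the XORI_B*_128 signed-domain conversion
+    // applied to the source rows on load.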
+ tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); + tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} + +static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt0, filt1, filt2, filt3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r, + tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } +} + +static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height, + int32_t width) { + const 
uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt0, filt1, filt2, filt3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src_tmp += (4 * src_stride); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } + + src += 16; + dst += 16; + } +} + +static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + 32); +} + +static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + 
64); +} + +static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4; + v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; + v16u8 filt0; + v8i16 filt; + v8u16 tmp0, tmp1; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + src8 = LD_SB(src); + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, + src76_r, src2110, src4332, src6554, src8776); + DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); + ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); +} + +static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; + v16i8 out0, out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); + ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); +} + +static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v16i8 out0, out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = 
LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); + src += (8 * src_stride); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, + vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + src0 = src8; + } +} + +static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + dst += dst_stride; + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src5 = LD_UB(src + 16); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + + LD_UB4(src + 16, src_stride, src6, src7, src8, src9); + src += (4 * src_stride); + + DOTP_UB2_UH(vec0, vec1, filt0, 
filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride); + + ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); + ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 16); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride); + + ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); + ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride); + dst += (4 * dst_stride); + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB4(src, 16, src0, src3, src6, src9); + src += src_stride; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_UB2(src, src_stride, src1, src2); + LD_UB2(src + 16, src_stride, src4, src5); + LD_UB2(src + 32, src_stride, src7, src8); + LD_UB2(src + 48, src_stride, src10, src11); + src += (2 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); + + ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); + ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + PCKEV_ST_SB(tmp4, tmp5, dst + 16); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride); + + ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); + ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 32); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride); + + ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); + ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + 
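/* Last 16 of the 64 columns for this row pair: the ILVR_B2_UB / ILVL_B2_UB
+ * interleaves above pair each pixel with the pixel one row below, so every
+ * DOTP_UB2_UH lane is a two-tap sum against the tap pair splatted into
+ * filt0; SRARI_H2_UH below rounds the 16-bit sums by FILTER_BITS and
+ * PCKEV_ST_SB packs them back to 8-bit pixels. */
+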
SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+ PCKEV_ST_SB(tmp4, tmp5, dst + 48);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+ SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+ PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
+ dst += (2 * dst_stride);
+
+ src0 = src2;
+ src3 = src5;
+ src6 = src8;
+ src9 = src11;
+ }
+}
+
+void aom_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ int8_t cnt, filt_ver[8];
+
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ for (cnt = 8; cnt--;) {
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
+ /* A bilinear filter has non-zero taps only at positions 3 and 4, so a
+ * zero first tap pair selects the faster 2-tap kernels; otherwise the
+ * full 8-tap kernels are used. */
+ if (((const int32_t *)filter_y)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 8:
+ common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 16:
+ common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 32:
+ common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 64:
+ common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ default:
+ aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 8:
+ common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 16:
+ common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 32:
+ common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 64:
+ common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ default:
+ aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/libs/libaom/src/aom_dsp/mips/aom_convolve_copy_msa.c b/libs/libaom/src/aom_dsp/mips/aom_convolve_copy_msa.c
new file mode 100644
index 000000000..f7f116f4d
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/mips/aom_convolve_copy_msa.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include "aom_dsp/mips/macros_msa.h"
+
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ out4 = __msa_copy_u_d((v2i64)src4, 0);
+ out5 = __msa_copy_u_d((v2i64)src5, 0);
+ out6 = __msa_copy_u_d((v2i64)src6, 0);
+ out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ for (cnt = height >> 3; cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ out4 = __msa_copy_u_d((v2i64)src4, 0);
+ out5 = __msa_copy_u_d((v2i64)src5, 0);
+ out6 = __msa_copy_u_d((v2i64)src6, 0);
+ out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 4) {
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 2) {
+ for (cnt = (height / 2); cnt--;) {
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+
+ SD(out0, dst);
+ dst += dst_stride;
+ SD(out1, dst);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height, int32_t width) {
+ int32_t cnt, loop_cnt;
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6,
+ src7);
+ src_tmp += (8 * src_stride);
+
+ ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp,
+ dst_stride);
+ dst_tmp += (8 * dst_stride);
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12);
cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); + dst += (8 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void copy_width32_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void copy_width64_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64); +} + +void aom_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int32_t filter_x_stride, + const int16_t *filter_y, int32_t filter_y_stride, + int32_t w, int32_t h) { + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + + switch (w) { + case 4: { + uint32_t cnt, tmp; + /* 1 word storage */ + for (cnt = h; cnt--;) { + tmp = LW(src); + SW(tmp, dst); + src += src_stride; + dst += dst_stride; + } + break; + } + case 8: { + copy_width8_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + copy_width16_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + copy_width32_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + copy_width64_msa(src, src_stride, dst, dst_stride, h); + break; + } + default: { + uint32_t cnt; + for (cnt = h; cnt--;) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/libs/libaom/src/aom_dsp/mips/aom_convolve_msa.h b/libs/libaom/src/aom_dsp/mips/aom_convolve_msa.h new 
file mode 100644 index 000000000..852415c20 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/aom_convolve_msa.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ +#define AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ + +#include "aom_dsp/mips/macros_msa.h" +#include "aom_dsp/aom_filter.h" + +extern const uint8_t mc_filt_mask_arr[16 * 3]; + +#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ + filt3) \ + ({ \ + v8i16 tmp_dpadd_0, tmp_dpadd_1; \ + \ + tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \ + tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ + tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \ + tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \ + \ + tmp_dpadd_0; \ + }) + +#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, mask3, filt0, filt1, filt2, filt3, \ + out0, out1) \ + { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ + DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ + DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ + DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \ + ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \ + } + +#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, mask3, filt0, filt1, filt2, filt3, \ + out0, out1, out2, out3) \ + { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ + res4_m, res5_m, res6_m, res7_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ + res4_m, res5_m, 
res6_m, res7_m); \
+ ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
+ res7_m, out0, out1, out2, out3); \
+ }
+
+#endif // AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
diff --git a/libs/libaom/src/aom_dsp/mips/common_dspr2.c b/libs/libaom/src/aom_dsp/mips/common_dspr2.c
new file mode 100644
index 000000000..00ab75dc3
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/mips/common_dspr2.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+uint8_t aom_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
+uint8_t *aom_ff_cropTbl;
+
+void aom_dsputil_static_init(void) {
+ int i;
+
+ for (i = 0; i < 256; i++) aom_ff_cropTbl_a[i + CROP_WIDTH] = i;
+
+ for (i = 0; i < CROP_WIDTH; i++) {
+ aom_ff_cropTbl_a[i] = 0;
+ aom_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
+ }
+
+ aom_ff_cropTbl = &aom_ff_cropTbl_a[CROP_WIDTH];
+}
+
+#endif
diff --git a/libs/libaom/src/aom_dsp/mips/common_dspr2.h b/libs/libaom/src/aom_dsp/mips/common_dspr2.h
new file mode 100644
index 000000000..c42188d62
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/mips/common_dspr2.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if HAVE_DSPR2
+#define CROP_WIDTH 512
+
+extern uint8_t *aom_ff_cropTbl; // From "aom_dsp/mips/intrapred4_dspr2.c"
+
+static INLINE void prefetch_load(const unsigned char *src) {
+ __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src));
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store(unsigned char *dst) {
+ __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst));
+}
+
+static INLINE void prefetch_load_streamed(const unsigned char *src) {
+ __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src));
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store_streamed(unsigned char *dst) {
+ __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst));
+}
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
diff --git a/libs/libaom/src/aom_dsp/mips/convolve2_dspr2.c b/libs/libaom/src/aom_dsp/mips/convolve2_dspr2.c
new file mode 100644
index 000000000..08bf1ab30
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/mips/convolve2_dspr2.c
@@ -0,0 +1,1031 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media.
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_horiz_4_transposed_dspr2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint8_t *dst_ptr;
+ int32_t Temp1, Temp2;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ /* The two center taps (filter_x0[3] and filter_x0[4]) are read as a single
+ * 32-bit word so dpa.w.ph can apply both in one dot product. */
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ dst_ptr = dst;
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp2](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [src] "r"(src), [dst_stride] "r"(dst_stride));
+
+ /* Next row...
*/ + src += src_stride; + dst += 1; + } +} + +static void convolve_bi_horiz_8_transposed_dspr2( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint8_t *dst_ptr; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4; + uint8_t *odd_dst; + uint32_t dst_pitch_2 = (dst_stride << 1); + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + dst_ptr = dst; + odd_dst = (dst_ptr + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "extp %[p3], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[Temp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[Temp1], %[p3](%[cm]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[Temp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[tp3], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. 
pixel */ + "sb %[tp3], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[p1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p2], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p1], 0(%[odd_dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr), + [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_bi_horiz_16_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "ulw %[qload1], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload1] " + "\n\t" + "ulw %[qload2], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. 
pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. 
pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... 
*/ + src_ptr += src_stride; + dst_ptr += 1; + } +} + +static void convolve_bi_horiz_64_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "ulw %[qload1], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload1] " + "\n\t" + "ulw %[qload2], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. 
pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. 
pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... 
*/
+ src_ptr += src_stride;
+ dst_ptr += 1;
+ }
+}
+
+/* Plain C fallback for widths that the DSPr2 kernels above do not cover. */
+void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter, int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int sum = 0;
+
+ sum += src[x] * filter[3];
+ sum += src[x + 1] * filter[4];
+
+ dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter, int w,
+ int h) {
+ uint32_t pos = 38;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+
+ switch (w) {
+ case 4:
+ convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h);
+ break;
+ case 8:
+ convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h);
+ break;
+ case 16:
+ case 32:
+ convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h, (w / 16));
+ break;
+ case 64:
+ prefetch_load(src + 32);
+ convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h);
+ break;
+ default:
+ convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w,
+ h);
+ break;
+ }
+}
+#endif
diff --git a/libs/libaom/src/aom_dsp/mips/convolve2_horiz_dspr2.c b/libs/libaom/src/aom_dsp/mips/convolve2_horiz_dspr2.c
new file mode 100644
index 000000000..097da73ca
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/mips/convolve2_horiz_dspr2.c
@@ -0,0 +1,681 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2.
pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p1], %[Temp2](%[cm]) \n\t" + "lbux %[p2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst]) \n\t" + "sb %[p1], 1(%[dst]) \n\t" + "sb %[tp2], 2(%[dst]) \n\t" + "sb %[p2], 3(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4; + uint32_t st0, st1; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[st0], 0(%[dst]) \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[st1], 2(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[st0], 4(%[dst]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. 
pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[st1], 1(%[dst]) \n\t" + "sb %[st0], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[p1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 3(%[dst]) \n\t" + "sb %[p2], 5(%[dst]) \n\t" + "sb %[p1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. 
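pixel */

+ /* Editor's note: the recurring "lbux %[stN], %[TempN](%[cm])" is a
+  * table-driven clamp.  Assuming aom_ff_cropTbl is set up as in the
+  * companion common_dspr2.c (cm[i] == 0 for i < 0, i for 0..255, 255 above,
+  * with padding on both sides), each extp + lbux pair is the scalar
+  *
+  *   int v = acc >> 7;  // acc was seeded with the rounding bias 64 (mtlo)
+  *   st = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // lbux through cm[]
+  */

+ /* odd 6.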
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. 
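pixel */

+ /* Editor's note: this 64-wide kernel is the 16-wide kernel above with the
+  * chunk loop fixed at four 16-pixel chunks per row (c = 0..3) and extra
+  * prefetches at src + 64 and dst + 32; the per-pixel arithmetic is
+  * identical.
+  */

+ /* odd 6.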
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + uint32_t pos = 38; + + assert(x_step_q4 == 16); + + prefetch_load((const uint8_t *)filter_x); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 8: + convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 16: + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 1); + break; + case 32: + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + default: + aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } +} +#endif diff --git a/libs/libaom/src/aom_dsp/mips/convolve2_vert_dspr2.c b/libs/libaom/src/aom_dsp/mips/convolve2_vert_dspr2.c new file mode 100644 index 000000000..40abfd89e --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/convolve2_vert_dspr2.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
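*/

+ /* Editor's sketch, not from the original patch: per output pixel this
+  * vertical 2-tap pass computes, assuming FILTER_BITS == 7,
+  *
+  *   static uint8_t convolve_bi_vert_px_c(const uint8_t *src, int stride,
+  *                                        const int16_t *filter_y) {
+  *     int sum =
+  *         (src[0] * filter_y[3] + src[stride] * filter_y[4] + 64) >> 7;
+  *     return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
+  *   }
+  *
+  * The assembly spreads four adjacent columns across the four DSP
+  * accumulators $ac0..$ac3 so their multiply-accumulates overlap.
+  */

+ /* Next row...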
*/ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
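*/

+ /* Editor's note: aom_convolve2_vert_dspr2 below routes widths 4 through 32
+  * to convolve_bi_vert_4_dspr2 (whose inner loop already steps x by 4),
+  * keeps this unrolled variant for w == 64, and falls back to
+  * aom_convolve8_vert_c for any other width.
+  */

+ /* Next row...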
*/ + src += src_stride; + dst += dst_stride; + } +} + +void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + uint32_t pos = 38; + + assert(y_step_q4 == 16); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, + h); + break; + case 64: + prefetch_store(dst + 32); + convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); + break; + default: + aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } +} +#endif diff --git a/libs/libaom/src/aom_dsp/mips/convolve8_dspr2.c b/libs/libaom/src/aom_dsp/mips/convolve8_dspr2.c new file mode 100644 index 000000000..af54b4264 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/convolve8_dspr2.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int x, y; + + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: { + uint32_t tp1; + + /* 1 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], (%[src]) \n\t" + "sw %[tp1], (%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 8: { + uint32_t tp1, tp2; + + /* 2 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 16: { + uint32_t tp1, tp2, tp3, tp4; + + /* 4 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + 
"ulw %[tp4], 12(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 32: { + uint32_t tp1, tp2, tp3, tp4; + uint32_t tp5, tp6, tp7, tp8; + + /* 8 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + "ulw %[tp5], 16(%[src]) \n\t" + "ulw %[tp6], 20(%[src]) \n\t" + "ulw %[tp7], 24(%[src]) \n\t" + "ulw %[tp8], 28(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), + [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 64: { + uint32_t tp1, tp2, tp3, tp4; + uint32_t tp5, tp6, tp7, tp8; + + prefetch_load(src + 64); + prefetch_store(dst + 32); + + /* 16 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_load(src + src_stride + 64); + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + "ulw %[tp5], 16(%[src]) \n\t" + "ulw %[tp6], 20(%[src]) \n\t" + "ulw %[tp7], 24(%[src]) \n\t" + "ulw %[tp8], 28(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + "ulw %[tp1], 32(%[src]) \n\t" + "ulw %[tp2], 36(%[src]) \n\t" + "ulw %[tp3], 40(%[src]) \n\t" + "ulw %[tp4], 44(%[src]) \n\t" + "ulw %[tp5], 48(%[src]) \n\t" + "ulw %[tp6], 52(%[src]) \n\t" + "ulw %[tp7], 56(%[src]) \n\t" + "ulw %[tp8], 60(%[src]) \n\t" + + "sw %[tp1], 32(%[dst]) \n\t" /* store */ + "sw %[tp2], 36(%[dst]) \n\t" /* store */ + "sw %[tp3], 40(%[dst]) \n\t" /* store */ + "sw %[tp4], 44(%[dst]) \n\t" /* store */ + "sw %[tp5], 48(%[dst]) \n\t" /* store */ + "sw %[tp6], 52(%[dst]) \n\t" /* store */ + "sw %[tp7], 56(%[dst]) \n\t" /* store */ + "sw %[tp8], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), + [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + default: + for (y = h; y--;) { + for (x = 0; x < w; ++x) { + dst[x] = src[x]; + } + + src += src_stride; + dst += dst_stride; + } + break; + } +} +#endif diff --git a/libs/libaom/src/aom_dsp/mips/convolve8_horiz_dspr2.c 
b/libs/libaom/src/aom_dsp/mips/convolve8_horiz_dspr2.c new file mode 100644 index 000000000..f9c6879ab --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/convolve8_horiz_dspr2.c @@ -0,0 +1,879 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t n1, n2, n3, n4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "preceu.ph.qbl %[n4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. 
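pixel */

+ /* Editor's sketch, not from the original patch: unlike the 2-tap kernels in
+  * convolve2_*_dspr2.c, this file consumes all eight taps as four packed
+  * halfword pairs (vector1b..vector4b).  One output pixel in scalar C,
+  * assuming FILTER_BITS == 7 and src already rewound by 3 as the caller
+  * does:
+  *
+  *   static uint8_t convolve8_px_c(const uint8_t *src,
+  *                                 const int16_t *filter_x0) {
+  *     int sum = 64;  // rounding bias
+  *     for (int k = 0; k < 8; ++k) sum += src[k] * filter_x0[k];
+  *     sum >>= 7;
+  *     return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
+  *   }
+  */

+ /* odd 2.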
pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[n1], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[n2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst]) \n\t" + "sb %[tn1], 1(%[dst]) \n\t" + "sb %[tp2], 2(%[dst]) \n\t" + "sb %[n2], 3(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), + [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4, n1; + uint32_t tn1, tn2, tn3; + uint32_t st0, st1; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "ulw %[tn1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tn1] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + /* even 4. 
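pixel */

+ /* Editor's note: every row iteration in these kernels prefetches the next
+  * row's source at offsets 0 and +32 and the next destination row before
+  * the arithmetic starts, hiding memory latency behind the accumulator
+  * work.
+  */

+ /* even 4.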
pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[st0], 0(%[dst]) \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + + "balign %[tn3], %[tn1], 3 \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[st1], 2(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "sb %[st0], 4(%[dst]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "preceu.ph.qbl %[n1], %[tn1] \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[tn3] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[st1], 1(%[dst]) \n\t" + "sb %[st0], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[n1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 3(%[dst]) \n\t" + "sb %[p2], 5(%[dst]) \n\t" + "sb %[n1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), + [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... 
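*/

+ /* Editor's note: the even/odd blocks above form a three-stage software
+  * pipeline across the DSP accumulators: while one accumulator's result is
+  * extracted (extp), clamped (lbux) and stored (sb) for pixel n, the
+  * dpa.w.ph multiply-accumulates for the next pixels are already in flight
+  * on the other accumulators.
+  */

+ /* Next row...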
*/ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. 
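pixel */

+ /* Editor's note: p1..p5 act as a sliding register window over the unpacked
+  * source halfwords; each ulw/preceu.ph.qb pair refills a register whose
+  * pixels the filter has already consumed, so sixteen outputs are produced
+  * per chunk while each source word is loaded only once per phase.
+  */

+ /* even 5.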
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. 
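pixel */

+ /* Editor's note: the ODD-pixel half reruns the even-pixel schedule on
+  * words loaded one byte later (offsets 1, 5, 9, 13, 17, 21), i.e. the same
+  * filter window shifted right by one pixel; even results land at
+  * dst[0,2,...,14] and odd results at dst[1,3,...,15].
+  */

+ /* odd 8.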
pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. 
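pixel */

+ /* Editor's note: aom_convolve8_horiz_dspr2 at the end of this file reads
+  * the first two taps as one word and, if ((const int32_t *)filter_x)[0]
+  * == 0, hands the call to the 2-tap path aom_convolve2_horiz_dspr2;
+  * otherwise it rewinds src by 3 and dispatches on w (4/8/16/32/64),
+  * falling back to aom_convolve8_horiz_c for other widths.
+  */

+ /* odd 1.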
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void aom_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + if (((const int32_t *)filter_x)[0] == 0) { + aom_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + prefetch_load((const uint8_t *)filter_x); + src -= 3; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 8: + convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 16: + convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 1); + break; + case 32: + convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + default: + aom_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} +#endif diff --git a/libs/libaom/src/aom_dsp/mips/convolve8_vert_dspr2.c b/libs/libaom/src/aom_dsp/mips/convolve8_vert_dspr2.c new file mode 100644 index 000000000..201e66427 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/convolve8_vert_dspr2.c @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) 
\n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, 
%[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +void aom_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + if (((const int32_t *)filter_y)[0] == 0) { + aom_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h); + break; + case 64: + prefetch_store(dst + 32); + convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); + break; + default: + aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} + +#endif diff --git a/libs/libaom/src/aom_dsp/mips/convolve_common_dspr2.h b/libs/libaom/src/aom_dsp/mips/convolve_common_dspr2.h new file mode 100644 index 000000000..e5d48a884 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/convolve_common_dspr2.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
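Both dispatchers reinterpret the eight int16 taps as four packed int32 words, so ((const int32_t *)filter)[0] == 0 says taps 0 and 1 are both zero, which for these filter banks marks the bilinear kernel and routes to the cheap 2-tap path, while the 0x800000 assertion rejects the identity filter (tap 3 == 128, tap 2 == 0). A sketch of that packing, assuming the little-endian lane order this code relies on (function names are mine):

    #include <stdint.h>

    /* Taps (2k, 2k+1) share packed[k]; the low halfword is the even tap. */
    static int filter_is_2tap(const int16_t *filter) {
      return ((const int32_t *)filter)[0] == 0; /* taps 0 and 1 both zero */
    }

    static int filter_is_copy(const int16_t *filter) {
      /* {0, 0, 0, 128, ...}: packed[1] == 128 << 16 == 0x800000 */
      return ((const int32_t *)filter)[1] == 0x800000;
    }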
+ */ + +#ifndef AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ +#define AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ + +#include + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h); + +void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter, int w, + int h); + +void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h); + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ diff --git a/libs/libaom/src/aom_dsp/mips/intrapred16_dspr2.c b/libs/libaom/src/aom_dsp/mips/intrapred16_dspr2.c new file mode 100644 index 000000000..7c221ae89 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/intrapred16_dspr2.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +void aom_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + + (void)above; + + __asm__ __volatile__( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "lb %[tmp5], 4(%[left]) \n\t" + "lb %[tmp6], 5(%[left]) \n\t" + "lb %[tmp7], 6(%[left]) \n\t" + "lb %[tmp8], 7(%[left]) \n\t" + "lb %[tmp9], 8(%[left]) \n\t" + "lb %[tmp10], 9(%[left]) \n\t" + "lb %[tmp11], 10(%[left]) \n\t" + "lb %[tmp12], 11(%[left]) \n\t" + "lb %[tmp13], 12(%[left]) \n\t" + "lb %[tmp14], 13(%[left]) \n\t" + "lb %[tmp15], 14(%[left]) \n\t" + "lb %[tmp16], 15(%[left]) \n\t" + + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "replv.qb %[tmp5], %[tmp5] \n\t" + "replv.qb %[tmp6], %[tmp6] \n\t" + "replv.qb %[tmp7], %[tmp7] \n\t" + "replv.qb %[tmp8], %[tmp8] \n\t" + "replv.qb %[tmp9], %[tmp9] \n\t" + "replv.qb %[tmp10], %[tmp10] \n\t" + "replv.qb %[tmp11], %[tmp11] \n\t" + "replv.qb %[tmp12], %[tmp12] \n\t" + "replv.qb %[tmp13], %[tmp13] \n\t" + "replv.qb %[tmp14], %[tmp14] \n\t" + "replv.qb %[tmp15], %[tmp15] \n\t" + "replv.qb %[tmp16], %[tmp16] \n\t" + + "sw %[tmp1], (%[dst]) \n\t" + "sw %[tmp1], 4(%[dst]) \n\t" + "sw %[tmp1], 8(%[dst]) \n\t" + "sw %[tmp1], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "sw %[tmp2], 4(%[dst]) \n\t" + "sw %[tmp2], 8(%[dst]) \n\t" + "sw %[tmp2], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + 
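The H predictor being unrolled here is a two-line loop in portable C: each row of the block is left[r] replicated (replv.qb broadcasts one byte into all four byte lanes of a word, and four sw stores then cover 16 pixels). Reference version:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Horizontal (H) prediction: row r of a bs x bs block is left[r]. */
    static void h_predictor_ref(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *left, int bs) {
      for (int r = 0; r < bs; ++r) {
        memset(dst, left[r], bs);
        dst += stride;
      }
    }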
"sw %[tmp3], (%[dst]) \n\t" + "sw %[tmp3], 4(%[dst]) \n\t" + "sw %[tmp3], 8(%[dst]) \n\t" + "sw %[tmp3], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + "sw %[tmp4], 4(%[dst]) \n\t" + "sw %[tmp4], 8(%[dst]) \n\t" + "sw %[tmp4], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp5], (%[dst]) \n\t" + "sw %[tmp5], 4(%[dst]) \n\t" + "sw %[tmp5], 8(%[dst]) \n\t" + "sw %[tmp5], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp6], (%[dst]) \n\t" + "sw %[tmp6], 4(%[dst]) \n\t" + "sw %[tmp6], 8(%[dst]) \n\t" + "sw %[tmp6], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp7], (%[dst]) \n\t" + "sw %[tmp7], 4(%[dst]) \n\t" + "sw %[tmp7], 8(%[dst]) \n\t" + "sw %[tmp7], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp8], (%[dst]) \n\t" + "sw %[tmp8], 4(%[dst]) \n\t" + "sw %[tmp8], 8(%[dst]) \n\t" + "sw %[tmp8], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp9], (%[dst]) \n\t" + "sw %[tmp9], 4(%[dst]) \n\t" + "sw %[tmp9], 8(%[dst]) \n\t" + "sw %[tmp9], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp10], (%[dst]) \n\t" + "sw %[tmp10], 4(%[dst]) \n\t" + "sw %[tmp10], 8(%[dst]) \n\t" + "sw %[tmp10], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp11], (%[dst]) \n\t" + "sw %[tmp11], 4(%[dst]) \n\t" + "sw %[tmp11], 8(%[dst]) \n\t" + "sw %[tmp11], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp12], (%[dst]) \n\t" + "sw %[tmp12], 4(%[dst]) \n\t" + "sw %[tmp12], 8(%[dst]) \n\t" + "sw %[tmp12], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp13], (%[dst]) \n\t" + "sw %[tmp13], 4(%[dst]) \n\t" + "sw %[tmp13], 8(%[dst]) \n\t" + "sw %[tmp13], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp14], (%[dst]) \n\t" + "sw %[tmp14], 4(%[dst]) \n\t" + "sw %[tmp14], 8(%[dst]) \n\t" + "sw %[tmp14], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp15], (%[dst]) \n\t" + "sw %[tmp15], 4(%[dst]) \n\t" + "sw %[tmp15], 8(%[dst]) \n\t" + "sw %[tmp15], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp16], (%[dst]) \n\t" + "sw %[tmp16], 4(%[dst]) \n\t" + "sw %[tmp16], 8(%[dst]) \n\t" + "sw %[tmp16], 12(%[dst]) \n\t" + + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), + [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9), + [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12), + [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15), + [tmp16] "=&r"(tmp16) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); +} + +void aom_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, left2; + + __asm__ __volatile__( + "lw %[above1], (%[above]) \n\t" + "lw %[above2], 4(%[above]) \n\t" + "lw %[left1], (%[left]) \n\t" + "lw %[left2], 4(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "addu.ph %[average], %[above_r1], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "preceu.ph.qbl %[above_l1], %[above2] \n\t" + "preceu.ph.qbr %[above_r1], %[above2] \n\t" 
+ "preceu.ph.qbl %[left_l1], %[left2] \n\t" + "preceu.ph.qbr %[left_r1], %[left2] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "lw %[above1], 8(%[above]) \n\t" + "lw %[above2], 12(%[above]) \n\t" + "lw %[left1], 8(%[left]) \n\t" + "lw %[left2], 12(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "preceu.ph.qbl %[above_l1], %[above2] \n\t" + "preceu.ph.qbr %[above_r1], %[above2] \n\t" + "preceu.ph.qbl %[left_l1], %[left2] \n\t" + "preceu.ph.qbr %[left_r1], %[left2] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "addiu %[average], %[average], 16 \n\t" + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 5 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], 
%[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1), + [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1), + [above_r1] "=&r"(above_r1), [above2] "=&r"(above2), + [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); +} +#endif // #if HAVE_DSPR2 diff --git a/libs/libaom/src/aom_dsp/mips/intrapred4_dspr2.c b/libs/libaom/src/aom_dsp/mips/intrapred4_dspr2.c new file mode 100644 index 000000000..0a21979c7 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/intrapred4_dspr2.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
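All the DSPr2 DC predictors in these files compute the same quantity: the rounded mean of the bs above samples and bs left samples, flood-filled over the block. The packed addu.ph sums keep two 16-bit lane totals, and the srl/addu.ph pair folds them before the final shift. Reference version (for bs == 16 the divide is exactly the (sum + 16) >> 5 seen above):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void dc_predictor_ref(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left,
                                 int bs) {
      int sum = bs; /* rounding bias */
      for (int i = 0; i < bs; ++i) sum += above[i] + left[i];
      const uint8_t dc = (uint8_t)(sum / (2 * bs));
      for (int r = 0; r < bs; ++r) {
        memset(dst, dc, bs);
        dst += stride;
      }
    }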
+ */ + +#include "aom_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +void aom_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4; + (void)above; + + __asm__ __volatile__( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "sw %[tmp1], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); +} + +void aom_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l; + + __asm__ __volatile__( + "lw %[above_c], (%[above]) \n\t" + "lw %[left_c], (%[left]) \n\t" + + "preceu.ph.qbl %[above_l], %[above_c] \n\t" + "preceu.ph.qbr %[above_r], %[above_c] \n\t" + "preceu.ph.qbl %[left_l], %[left_c] \n\t" + "preceu.ph.qbr %[left_r], %[left_c] \n\t" + + "addu.ph %[average], %[above_r], %[above_l] \n\t" + "addu.ph %[average], %[average], %[left_l] \n\t" + "addu.ph %[average], %[average], %[left_r] \n\t" + "addiu %[average], %[average], 4 \n\t" + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 3 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + + : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l), + [above_r] "=&r"(above_r), [left_c] "=&r"(left_c), + [left_l] "=&r"(left_l), [left_r] "=&r"(left_r), + [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); +} +#endif // #if HAVE_DSPR2 diff --git a/libs/libaom/src/aom_dsp/mips/intrapred8_dspr2.c b/libs/libaom/src/aom_dsp/mips/intrapred8_dspr2.c new file mode 100644 index 000000000..d42a77c80 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/intrapred8_dspr2.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +void aom_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + (void)above; + + __asm__ __volatile__( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "lb %[tmp5], 4(%[left]) \n\t" + "lb %[tmp6], 5(%[left]) \n\t" + "lb %[tmp7], 6(%[left]) \n\t" + "lb %[tmp8], 7(%[left]) \n\t" + + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "replv.qb %[tmp5], %[tmp5] \n\t" + "replv.qb %[tmp6], %[tmp6] \n\t" + "replv.qb %[tmp7], %[tmp7] \n\t" + "replv.qb %[tmp8], %[tmp8] \n\t" + + "sw %[tmp1], (%[dst]) \n\t" + "sw %[tmp1], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "sw %[tmp2], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "sw %[tmp3], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + "sw %[tmp4], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp5], (%[dst]) \n\t" + "sw %[tmp5], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp6], (%[dst]) \n\t" + "sw %[tmp6], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp7], (%[dst]) \n\t" + "sw %[tmp7], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp8], (%[dst]) \n\t" + "sw %[tmp8], 4(%[dst]) \n\t" + + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), + [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); +} + +void aom_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, above_l2, above_r2, left2, left_r2, left_l2; + + __asm__ __volatile__( + "lw %[above1], (%[above]) \n\t" + "lw %[above2], 4(%[above]) \n\t" + "lw %[left1], (%[left]) \n\t" + "lw %[left2], 4(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "preceu.ph.qbl %[above_l2], %[above2] \n\t" + "preceu.ph.qbr %[above_r2], %[above2] \n\t" + "preceu.ph.qbl %[left_l2], %[left2] \n\t" + "preceu.ph.qbr %[left_r2], %[left2] \n\t" + + "addu.ph %[average], %[above_r1], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "addu.ph %[average], %[average], %[above_l2] \n\t" + "addu.ph %[average], %[average], %[above_r2] \n\t" + "addu.ph %[average], %[average], %[left_l2] \n\t" + "addu.ph %[average], %[average], %[left_r2] \n\t" + + "addiu %[average], %[average], 8 \n\t" + + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 4 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], 
%[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1), + [above_r1] "=&r"(above_r1), [left1] "=&r"(left1), + [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1), + [above2] "=&r"(above2), [above_l2] "=&r"(above_l2), + [above_r2] "=&r"(above_r2), [left2] "=&r"(left2), + [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2), + [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); +} +#endif // #if HAVE_DSPR2 diff --git a/libs/libaom/src/aom_dsp/mips/intrapred_msa.c b/libs/libaom/src/aom_dsp/mips/intrapred_msa.c new file mode 100644 index 000000000..9f25cc1ca --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/intrapred_msa.c @@ -0,0 +1,550 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/mips/macros_msa.h" + +#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \ + { \ + out0 = __msa_subs_u_h(out0, in0); \ + out1 = __msa_subs_u_h(out1, in1); \ + } + +static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t src_data; + + src_data = LW(src); + + SW4(src_data, src_data, src_data, src_data, dst, dst_stride); +} + +static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint32_t src_data1, src_data2; + + src_data1 = LW(src); + src_data2 = LW(src + 4); + + for (row = 8; row--;) { + SW(src_data1, dst); + SW(src_data2, (dst + 4)); + dst += dst_stride; + } +} + +static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 src0; + + src0 = LD_UB(src); + + for (row = 16; row--;) { + ST_UB(src0, dst); + dst += dst_stride; + } +} + +static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 src1, src2; + + src1 = LD_UB(src); + src2 = LD_UB(src + 16); + + for (row = 32; row--;) { + ST_UB2(src1, src2, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t out0, out1, out2, out3; + + out0 = src[0] * 0x01010101; + out1 = src[1] * 0x01010101; + out2 = src[2] * 0x01010101; + out3 = src[3] * 0x01010101; + + SW4(out0, out1, out2, out3, dst, dst_stride); +} + +static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + out0 = src[0] * 0x0101010101010101ull; + out1 = src[1] * 0x0101010101010101ull; + out2 = src[2] * 0x0101010101010101ull; + out3 = src[3] * 0x0101010101010101ull; + out4 = src[4] * 0x0101010101010101ull; + out5 = src[5] * 0x0101010101010101ull; + out6 = src[6] * 0x0101010101010101ull; + out7 = src[7] * 0x0101010101010101ull; + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); +} + +static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 4; row--;) { + inp0 = src[0]; + inp1 = src[1]; + inp2 = src[2]; + inp3 = src[3]; + src += 4; + + src0 = (v16u8)__msa_fill_b(inp0); + src1 = (v16u8)__msa_fill_b(inp1); + src2 = (v16u8)__msa_fill_b(inp2); + src3 = (v16u8)__msa_fill_b(inp3); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 8; row--;) { + inp0 = src[0]; + inp1 = src[1]; + inp2 = src[2]; + inp3 = src[3]; + src += 4; + + src0 = (v16u8)__msa_fill_b(inp0); + src1 = (v16u8)__msa_fill_b(inp1); + src2 = (v16u8)__msa_fill_b(inp2); + src3 = (v16u8)__msa_fill_b(inp3); + + ST_UB2(src0, src0, dst, 16); + dst += dst_stride; + ST_UB2(src1, src1, dst, 16); + dst += dst_stride; + ST_UB2(src2, src2, dst, 16); + dst += dst_stride; + ST_UB2(src3, src3, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_4x4_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint32_t val0, val1; + v16i8 store, src = { 0 }; + v8u16 sum_h; 
+ v4u32 sum_w; + v2u64 sum_d; + + val0 = LW(src_top); + val1 = LW(src_left); + INSERT_W2_SB(val0, val1, src); + sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_w((v4i32)store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t val0; + v16i8 store, data = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + + val0 = LW(src); + data = (v16i8)__msa_insert_w((v4i32)data, 0, val0); + sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_w((v4i32)store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) { + uint32_t out; + const v16i8 store = __msa_ldi_b(128); + + out = __msa_copy_u_w((v4i32)store, 0); + + SW4(out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_8x8_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint64_t val0, val1; + v16i8 store; + v16u8 src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LD(src_top); + val1 = LD(src_left); + INSERT_D2_UB(val0, val1, src); + sum_h = __msa_hadd_u_h(src, src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_d((v2i64)store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint64_t val0; + v16i8 store; + v16u8 data = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LD(src); + data = (v16u8)__msa_insert_d((v2i64)data, 0, val0); + sum_h = __msa_hadd_u_h(data, data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_d((v2i64)store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) { + uint64_t out; + const v16i8 store = __msa_ldi_b(128); + + out = __msa_copy_u_d((v2i64)store, 0); + + SD4(out, out, out, out, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_16x16_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + v16u8 top, left, out; + v8u16 sum_h, sum_top, sum_left; + v4u32 sum_w; + v2u64 sum_d; + + top = LD_UB(src_top); + left = LD_UB(src_left); + HADD_UB2_UH(top, left, sum_top, sum_left); + sum_h = sum_top + sum_left; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + 
ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + v16u8 data, out; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + data = LD_UB(src); + sum_h = __msa_hadd_u_h(data, data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) { + const v16u8 out = (v16u8)__msa_ldi_b(128); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 top0, top1, left0, left1, out; + v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src_top, 16, top0, top1); + LD_UB2(src_left, 16, left0, left1); + HADD_UB2_UH(top0, top1, sum_top0, sum_top1); + HADD_UB2_UH(left0, left1, sum_left0, sum_left1); + sum_h = sum_top0 + sum_top1; + sum_h += sum_left0 + sum_left1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 data0, data1, out; + v8u16 sum_h, sum_data0, sum_data1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src, 16, data0, data1); + HADD_UB2_UH(data0, data1, sum_data0, sum_data1); + sum_h = sum_data0 + sum_data1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) { + uint32_t row; + const v16u8 out = (v16u8)__msa_ldi_b(128); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +void aom_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_4x4_msa(above, dst, y_stride); +} + +void aom_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_8x8_msa(above, dst, y_stride); +} + +void aom_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { 
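One kernel serves both DC_TOP and DC_LEFT in the wrappers below: intra_predict_dc_tl_* averages a single bs-sample border, and the wrapper passes either above or left. Its scalar core (illustrative):

    #include <stdint.h>

    /* Rounded mean of one bs-sample border; for bs == 16 this matches the
       srari-by-4 in intra_predict_dc_tl_16x16_msa. */
    static uint8_t dc_one_border(const uint8_t *border, int bs) {
      int sum = bs / 2; /* rounding bias */
      for (int i = 0; i < bs; ++i) sum += border[i];
      return (uint8_t)(sum / bs);
    }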
+ (void)left; + + intra_predict_vert_16x16_msa(above, dst, y_stride); +} + +void aom_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_32x32_msa(above, dst, y_stride); +} + +void aom_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_4x4_msa(left, dst, y_stride); +} + +void aom_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_8x8_msa(left, dst, y_stride); +} + +void aom_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_16x16_msa(left, dst, y_stride); +} + +void aom_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_32x32_msa(left, dst, y_stride); +} + +void aom_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_4x4_msa(above, left, dst, y_stride); +} + +void aom_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_8x8_msa(above, left, dst, y_stride); +} + +void aom_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_16x16_msa(above, left, dst, y_stride); +} + +void aom_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_32x32_msa(above, left, dst, y_stride); +} + +void aom_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_4x4_msa(above, dst, y_stride); +} + +void aom_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_8x8_msa(above, dst, y_stride); +} + +void aom_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_16x16_msa(above, dst, y_stride); +} + +void aom_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_32x32_msa(above, dst, y_stride); +} + +void aom_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_4x4_msa(left, dst, y_stride); +} + +void aom_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_8x8_msa(left, dst, y_stride); +} + +void aom_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_16x16_msa(left, dst, y_stride); +} + +void aom_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_32x32_msa(left, dst, y_stride); +} + +void aom_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_4x4_msa(dst, y_stride); +} + +void aom_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const 
uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_8x8_msa(dst, y_stride); +} + +void aom_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_16x16_msa(dst, y_stride); +} + +void aom_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_32x32_msa(dst, y_stride); +} diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_16_msa.c b/libs/libaom/src/aom_dsp/mips/loopfilter_16_msa.c new file mode 100644 index 000000000..38a10e9b2 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_16_msa.c @@ -0,0 +1,1488 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_ports/mem.h" +#include "aom_dsp/mips/loopfilter_msa.h" + +int32_t aom_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); + + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = 
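aom_hz_lpf_t4_and_t8_16w is the usual two-tier deblocker: the 4-tap filter output stands where the flat mask is off, and the stronger smoother (AOM_FILTER8) replaces it where the mask is on; __msa_test_bz_v supplies the all-lanes-off early exit, and __msa_bmnz_v does the per-byte select. The smoother's shape, shown for the p2 output only and assuming the standard 8-pixel-support tap layout (AOM_FILTER8 itself is defined elsewhere):

    #include <stdint.h>

    /* One AOM_FILTER8-style output: a weighted average over p3..q0 with
       round-to-nearest; the other five outputs shift the tap layout. */
    static uint8_t filter8_p2(uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                              uint8_t q0) {
      return (uint8_t)((3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3);
    }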
__msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { + v16u8 flat, flat2, filter8; + v16i8 zero = { 0 }; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; + v8i16 l_out, r_out; + + flat = LD_UB(filter48 + 96); + + LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7); + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + src -= 3 * pitch; + ST_UB4(p2, p1, p0, q0, src, pitch); + src += (4 * pitch); + ST_UB2(q1, q2, src, pitch); + } else { + src -= 7 * pitch; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, + p5_l_in, p4_l_in); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, + p1_l_in, p0_l_in); + q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST_UB(p6, src); + src += pitch; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST_UB(p5, src); + src += pitch; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4); + + q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l 
-= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST_UB(p4, src); + src += pitch; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST_UB(p3, src); + src += pitch; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q2 */ + filter8 = 
LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST_UB(q3, src); + src += pitch; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST_UB(q4, src); + src += pitch; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST_UB(q5, src); + src += pitch; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST_UB(q6, src); + } +} + +static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, + int32_t count) { + DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]); + uint8_t early_exit = 0; + + (void)count; + + early_exit = aom_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, + limit_ptr, thresh_ptr); + + if (0 == early_exit) { + aom_hz_lpf_t16_16w(src, pitch, filter48); + } +} + +static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, int32_t count) { + if (1 == count) { + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + uint64_t dword0, dword1; + v16u8 flat2, mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 p0_filter16, p1_filter16; + v8i16 p2_filter8, p1_filter8, p0_filter8; + v8i16 q0_filter8, q1_filter8, q2_filter8; + v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r; + v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; + v16i8 zero = { 0 }; + v8u16 tmp0, tmp1, tmp2; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = 
(v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch); + } else { + /* convert 8 bit input data into 16 bit */ + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, + q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, + q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); + PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); + + /* load 16 vector elements */ + LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4); + LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7); + + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + p2_d = __msa_copy_u_d((v2i64)p2_out, 0); + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + q2_d = __msa_copy_u_d((v2i64)q2_out, 0); + + SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch); + SD(q1_d, src + pitch); + SD(q2_d, src + 2 * pitch); + } else { + /* LSB(right) 8 pixel operation */ + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5, + zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r, + q7_r); + + tmp0 = p7_r << 3; + tmp0 -= p7_r; + tmp0 += p6_r; + tmp0 += q0_r; + + src -= 7 * pitch; + + /* calculation of p6 and p5 */ + tmp1 = p6_r + p5_r + p4_r + p3_r; + tmp1 += (p2_r + p1_r + p0_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp0 = p5_r - p6_r + q1_r - p7_r; + tmp1 += tmp0; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p4 and p3 */ + tmp0 = p4_r - p5_r + q2_r - p7_r; + tmp2 = p3_r - p4_r + q3_r - p7_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 
0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p2 and p1 */ + tmp0 = p2_r - p3_r + q4_r - p7_r; + tmp2 = p1_r - p2_r + q5_r - p7_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p0 and q0 */ + tmp0 = (p0_r - p1_r) + (q6_r - p7_r); + tmp2 = (q7_r - p0_r) + (q0_r - p7_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q1 and q2 */ + tmp0 = q7_r - q0_r + q1_r - p6_r; + tmp2 = q7_r - q1_r + q2_r - p5_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q3 and q4 */ + tmp0 = (q7_r - q2_r) + (q3_r - p4_r); + tmp2 = (q7_r - q3_r) + (q4_r - p3_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q5 and q6 */ + tmp0 = (q7_r - q4_r) + (q5_r - p2_r); + tmp2 = (q7_r - q5_r) + (q6_r - p1_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + } + } + } else { + mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, + count); + } +} + +void aom_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1); +} + +void 
aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
+                               const uint8_t *b_limit_ptr,
+                               const uint8_t *limit_ptr,
+                               const uint8_t *thresh_ptr) {
+  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
+}
+
+static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
+                                   uint8_t *output, int32_t out_pitch) {
+  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
+  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+  LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
+         p1_org, p0_org);
+  /* 8x8 transpose */
+  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
+                     p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
+  /* 8x8 transpose */
+  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
+             tmp0, tmp1, tmp2, tmp3);
+  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
+  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
+  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
+  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
+  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
+
+  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
+  output += (8 * out_pitch);
+  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
+}
+
+static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
+                                   uint8_t *output, int32_t out_pitch) {
+  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
+                      q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
+  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
+}
+
+static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
+                            int32_t out_pitch) {
+  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
+  v4i32 tmp2, tmp3;
+
+  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+  input += (8 * in_pitch);
+  LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14,
+         row15);
+
+  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p7, p6,
+                      p5, p4, p3, p2, p1, p0);
+
+  /* transpose 16x8 matrix into 8x16 */
+  /* total: 8 intermediate registers and 32 instructions */
+  q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
+  q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
+  q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
+  q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
+  q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
+  q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
+  q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
+  q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);
+
+  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
+  tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
+  tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);
+
+  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
+  tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
+  tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);
+
+  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
+  q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+  q4 =
(v16u8)__msa_ilvod_w(tmp3, tmp2); + + tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0); + tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5); + q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3); + q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4); + tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6); + q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); + output += (8 * out_pitch); + ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); +} + +int32_t aom_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch_org, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v16i8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3; + + /* load vector elements */ + LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org); + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + /* convert 16 bit output data into 8 bit */ + p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r); + p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r); + p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r); + q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r); + q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r); + q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +int32_t aom_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { + v16i8 zero = { 0 }; + v16u8 filter8, flat, flat2; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, 
q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 tmp0_r, tmp1_r; + v8i16 r_out; + + flat = LD_UB(filter48 + 6 * 16); + + LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); + + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + v8i16 vec0, vec1, vec2, vec3, vec4; + + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); + + src_org -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 4, (src_org + 4), pitch); + + return 1; + } else { + src -= 7 * 16; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST8x1_UB(p6, src); + src += 16; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST8x1_UB(p5, src); + src += 16; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST8x1_UB(p4, src); + src += 16; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST8x1_UB(p3, src); + src += 16; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); 
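+    /* note: each tap here follows the same running-sum scheme: tmp0_r is
+     * the sliding-window delta (one sample enters the 15-tap window, one
+     * leaves), tmp1_r accumulates it, and __msa_srari_h applies the rounded
+     * narrowing shift, i.e. out = (sum + 8) >> 4. */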
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST8x1_UB(q3, src); + src += 16; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST8x1_UB(q4, src); + src += 16; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST8x1_UB(q5, src); + src += 16; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST8x1_UB(q6, src); + + return 0; + } +} + +void aom_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); + + early_exit = + aom_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); + + if (0 == early_exit) { + early_exit = aom_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, + &filter48[0]); + + if (0 == early_exit) { + transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); + } + } +} + +int32_t aom_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, 
p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16i8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + + /* load vector elements */ + LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec4, vec5); + + src_org -= 2; + ST4x8_UB(vec2, vec3, src_org, pitch); + src_org += 8 * pitch; + ST4x8_UB(vec4, vec5, src_org, pitch); + + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +int32_t aom_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { + v16u8 flat, flat2, filter8; + v16i8 zero = { 0 }; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; + v8i16 l_out, r_out; + + flat = LD_UB(filter48 + 6 * 16); + + LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); + + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, 
vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src_org -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 4, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec5, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec5, 4, (src_org + 4), pitch); + + return 1; + } else { + src -= 7 * 16; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, + p5_l_in, p4_l_in); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, + p1_l_in, p0_l_in); + q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST_UB(p6, src); + src += 16; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST_UB(p5, src); + src += 16; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST_UB(p4, src); + src += 16; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST_UB(p3, src); + src += 16; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += 
q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)(tmp1_l), 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST_UB(q3, src); + src += 16; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += 
q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST_UB(q4, src); + src += 16; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST_UB(q5, src); + src += 16; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST_UB(q6, src); + + return 0; + } +} + +void aom_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + transpose_16x16((src - 8), pitch, &transposed_input[0], 16); + + early_exit = + aom_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); + + if (0 == early_exit) { + early_exit = aom_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, + &filter48[0]); + + if (0 == early_exit) { + transpose_16x16(transposed_input, 16, (src - 8), pitch); + } + } +} diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_4_msa.c b/libs/libaom/src/aom_dsp/mips/loopfilter_4_msa.c new file mode 100644 index 000000000..dc0a97764 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_4_msa.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/mips/loopfilter_msa.h" + +void aom_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint64_t p1_d, p0_d, q0_d, q1_d; + v16u8 mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); +} + +void aom_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); + thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); + thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); + + b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); + b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); + b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); + + limit0 = (v16u8)__msa_fill_b(*limit0_ptr); + limit1 = (v16u8)__msa_fill_b(*limit1_ptr); + limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + + ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); +} + +void aom_lpf_vertical_4_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 mask, hev, flat, limit, thresh, b_limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v8i16 vec0, vec1, vec2, vec3; + + LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + + src -= 2; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + src += 4 * pitch; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); +} + +void aom_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + v16u8 mask, hev, flat; + v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, 
row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13, + row14, row15); + + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); + thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); + thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); + + b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); + b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); + b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); + + limit0 = (v16u8)__msa_fill_b(*limit0_ptr); + limit1 = (v16u8)__msa_fill_b(*limit1_ptr); + limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); + ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5); + + src -= 2; + + ST4x8_UB(tmp2, tmp3, src, pitch); + src += (8 * pitch); + ST4x8_UB(tmp4, tmp5, src, pitch); +} diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_8_msa.c b/libs/libaom/src/aom_dsp/mips/loopfilter_8_msa.c new file mode 100644 index 000000000..dc203e79c --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_8_msa.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/mips/loopfilter_msa.h" + +void aom_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + v16u8 mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8; + v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; + v16i8 zero = { 0 }; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, + q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); + PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); + + p2_d = __msa_copy_u_d((v2i64)p2_out, 0); + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + q2_d = __msa_copy_u_d((v2i64)q2_out, 0); + + src -= 3 * pitch; + + SD4(p2_d, p1_d, p0_d, q0_d, src, pitch); + src += (4 * pitch); + SD(q1_d, src); + src += pitch; + SD(q2_d, src); + } +} + +void aom_lpf_horizontal_8_dual_msa( + uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh0); + tmp = (v16u8)__msa_fill_b(*thresh1); + thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh); 
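+  /* both 8-pixel edges are filtered in a single 16-lane pass: each scalar
+   * limit is splatted across a vector with __msa_fill_b and the two vectors
+   * are packed by __msa_ilvr_d, so lanes 0..7 carry the edge-0 limits and
+   * lanes 8..15 the edge-1 limits; b_limit and limit are packed the same
+   * way below. */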
+
+  b_limit = (v16u8)__msa_fill_b(*b_limit0);
+  tmp = (v16u8)__msa_fill_b(*b_limit1);
+  b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);
+
+  limit = (v16u8)__msa_fill_b(*limit0);
+  tmp = (v16u8)__msa_fill_b(*limit1);
+  limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                     q1_out);
+
+  if (__msa_test_bz_v(flat)) {
+    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+               q3_r);
+    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+    AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+    /* convert 16 bit output data into 8 bit */
+    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                p0_filt8_r, q0_filt8_r);
+    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                q2_filt8_r);
+
+    /* store pixel values */
+    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+    src -= 3 * pitch;
+
+    ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
+    src += (4 * pitch);
+    ST_UB2(q1_out, q2_out, src, pitch);
+    src += (2 * pitch);
+  }
+}
+
+void aom_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
+                            const uint8_t *b_limit_ptr,
+                            const uint8_t *limit_ptr,
+                            const uint8_t *thresh_ptr) {
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p1_out, p0_out, q0_out, q1_out;
+  v16u8 flat, mask, hev, thresh, b_limit, limit;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+  v16u8 zero = { 0 };
+  v8i16 vec0, vec1, vec2, vec3, vec4;
+
+  /* load vector elements */
+  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1,
+                     q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+  limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  /* flat4 */
+  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                     q1_out);
+
+  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+  if (__msa_test_bz_v(flat)) {
+    /* Store 4 pixels p1 - q1 */
+    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+
+    src -= 2;
+    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+    src += 4 * pitch;
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero,
+               q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+    /* convert 16 bit output data into 8 bit */
+    PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
+                p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                p0_filt8_r, q0_filt8_r);
+    PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
+                q2_filt8_r);
+
+    /* store pixel values */
+    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+    /* Store 6 pixels p2 - q2 */
+    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+    vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
+
+    src -= 3;
+    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+    ST2x4_UB(vec4, 0, src + 4, pitch);
+    src += (4 * pitch);
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+    ST2x4_UB(vec4, 4, src + 4, pitch);
+  }
+}
+
+void aom_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
+                                 const uint8_t *b_limit0, const uint8_t *limit0,
+                                 const uint8_t *thresh0,
+                                 const uint8_t *b_limit1, const uint8_t *limit1,
+                                 const uint8_t *thresh1) {
+  uint8_t *temp_src;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p1_out, p0_out, q0_out, q1_out;
+  v16u8 flat, mask, hev, thresh, b_limit, limit;
+  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  v16u8 zero = { 0 };
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+  temp_src = src - 4;
+
+  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
+  temp_src += (8 * pitch);
+  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
+
+  /* transpose 16x8 matrix into 8x16 */
+  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
+                      row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
+                      q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh0);
+  vec0 = (v8i16)__msa_fill_b(*thresh1);
+  thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);
+
+  b_limit = (v16u8)__msa_fill_b(*b_limit0);
+  vec0 = (v8i16)__msa_fill_b(*b_limit1);
+  b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);
+
+  limit = (v16u8)__msa_fill_b(*limit0);
+  vec0 = (v8i16)__msa_fill_b(*limit1);
+  limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  /* flat4 */
+  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                     q1_out);
+
+  if (__msa_test_bz_v(flat)) {
+    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+    src -= 2;
+    ST4x8_UB(vec2, vec3, src, pitch);
+    src += 8 * pitch;
+    ST4x8_UB(vec4, vec5, src, pitch);
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+               q3_r);
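+    /* filter8 path: the pixels are zero-extended to 16 bits (right halves
+     * here, left halves below) so that AOM_FILTER8 can form its sums without
+     * overflow; PCKEV_B4_SH/PCKEV_B2_SH then narrow the results back to
+     * 8 bits. */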
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + + /* filter8 */ + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 4, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 4, src + 4, pitch); + } +} diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_filters_dspr2.c b/libs/libaom/src/aom_dsp/mips/loopfilter_filters_dspr2.c new file mode 100644 index 000000000..8c41278be --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_filters_dspr2.c @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/mips/common_dspr2.h"
+#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
+#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
+#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
+#include "aom_mem/aom_mem.h"
+
+#if HAVE_DSPR2
+void aom_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
+                                const uint8_t *blimit, const uint8_t *limit,
+                                const uint8_t *thresh) {
+  uint8_t i;
+  uint32_t mask;
+  uint32_t hev;
+  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+  uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+  uint32_t thresh_vec, flimit_vec, limit_vec;
+  uint32_t uflimit, ulimit, uthresh;
+
+  uflimit = *blimit;
+  ulimit = *limit;
+  uthresh = *thresh;
+
+  /* create quad-byte */
+  __asm__ __volatile__(
+      "replv.qb %[thresh_vec], %[uthresh] \n\t"
+      "replv.qb %[flimit_vec], %[uflimit] \n\t"
+      "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+        [limit_vec] "=r"(limit_vec)
+      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+  /* prefetch data for store */
+  prefetch_store(s);
+
+  /* loop filter designed to work using chars so that we can make maximum use
+     of 8 bit simd instructions. */
+  for (i = 0; i < 2; i++) {
+    sm1 = s - (pitch << 2);
+    s0 = sm1 + pitch;
+    s1 = s0 + pitch;
+    s2 = s - pitch;
+    s3 = s;
+    s4 = s + pitch;
+    s5 = s4 + pitch;
+    s6 = s5 + pitch;
+
+    __asm__ __volatile__(
+        "lw %[p1], (%[s1]) \n\t"
+        "lw %[p2], (%[s2]) \n\t"
+        "lw %[p3], (%[s3]) \n\t"
+        "lw %[p4], (%[s4]) \n\t"
+
+        : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
+        : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+       mask will be zero and filtering is not needed */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+      __asm__ __volatile__(
+          "lw %[pm1], (%[sm1]) \n\t"
+          "lw %[p0], (%[s0]) \n\t"
+          "lw %[p5], (%[s5]) \n\t"
+          "lw %[p6], (%[s6]) \n\t"
+
+          : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
+          : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6));
+
+      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
+                            p6, thresh_vec, &hev, &mask);
+
+      /* if mask == 0 then filtering is not needed */
+      if (mask) {
+        /* filtering */
+        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+        __asm__ __volatile__(
+            "sw %[p1], (%[s1]) \n\t"
+            "sw %[p2], (%[s2]) \n\t"
+            "sw %[p3], (%[s3]) \n\t"
+            "sw %[p4], (%[s4]) \n\t"
+
+            :
+            : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
+              [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+      }
+    }
+
+    s = s + 4;
+  }
+}
+
+void aom_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
+                              const uint8_t *blimit, const uint8_t *limit,
+                              const uint8_t *thresh) {
+  uint8_t i;
+  uint32_t mask, hev;
+  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+  uint8_t *s1, *s2, *s3, *s4;
+  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+  uint32_t thresh_vec, flimit_vec, limit_vec;
+  uint32_t uflimit, ulimit, uthresh;
+
+  uflimit = *blimit;
+  ulimit = *limit;
+  uthresh = *thresh;
+
+  /* create quad-byte */
+  __asm__ __volatile__(
+      "replv.qb %[thresh_vec], %[uthresh] \n\t"
+      "replv.qb %[flimit_vec], %[uflimit] \n\t"
+      "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+        [limit_vec] "=r"(limit_vec)
+      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+  /* prefetch data for store */
+  prefetch_store(s + pitch);
+
+  for (i = 0; i < 2; i++) {
+    s1 = s;
+    s2 = s + pitch;
+    s3 = s2 + pitch;
+    s4 = s3 + pitch;
+    s = s4 + pitch;
+
+    /* load quad-byte vectors
+     * memory is 4 byte aligned
+     */
+    p2 = *((uint32_t *)(s1 - 4));
+    p6 = *((uint32_t *)(s1));
+    p1 = *((uint32_t *)(s2 - 4));
+    p5 = *((uint32_t *)(s2));
+    p0 = *((uint32_t *)(s3 - 4));
+    p4 = *((uint32_t *)(s3));
+    pm1 = *((uint32_t *)(s4 - 4));
+    p3 = *((uint32_t *)(s4));
+
+    /* transpose pm1, p0, p1, p2 */
+    __asm__ __volatile__(
+        "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+        "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+        "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+        "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+        "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+        "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+        "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+        "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+        "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+        "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+        "append %[p1], %[sec3], 16 \n\t"
+        "append %[pm1], %[sec4], 16 \n\t"
+
+        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+          [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
+          [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+        :);
+
+    /* transpose p3, p4, p5, p6 */
+    __asm__ __volatile__(
+        "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+        "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+        "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+        "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+        "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+        "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+        "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+        "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+        "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+        "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+        "append %[p5], %[sec3], 16 \n\t"
+        "append %[p3], %[sec4], 16 \n\t"
+
+        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+          [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
+          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+        :);
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+     * mask will be zero and filtering is not needed
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
+                            p6, thresh_vec, &hev, &mask);
+
+      /* if mask == 0 then filtering is not needed */
+      if (mask) {
+        /* filtering */
+        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+        /* unpack processed 4x4 neighborhood
+         * don't use transpose on output data
+         * because memory isn't aligned
+         */
+        __asm__ __volatile__(
+            "sb %[p4], 1(%[s4]) \n\t"
+            "sb %[p3], 0(%[s4]) \n\t"
+            "sb %[p2], -1(%[s4]) \n\t"
+            "sb %[p1], -2(%[s4]) \n\t"
+
+            :
+            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+              [s4] "r"(s4));
+
+        __asm__ __volatile__(
+            "srl %[p4], %[p4], 8 \n\t"
+            "srl %[p3], %[p3], 8 \n\t"
+            "srl %[p2], %[p2], 8 \n\t"
+            "srl %[p1], %[p1], 8 \n\t"
+
+            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+            :);
+
+        __asm__ __volatile__(
+            "sb %[p4], 1(%[s3]) \n\t"
+            "sb %[p3], 0(%[s3]) \n\t"
+            "sb %[p2], -1(%[s3]) \n\t"
+            "sb %[p1], -2(%[s3]) \n\t"
+
+            : [p1] "+r"(p1)
+            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3));
+
+        __asm__ __volatile__(
+            "srl %[p4], %[p4], 8 \n\t"
+            "srl %[p3], %[p3], 8 \n\t"
+            "srl %[p2], %[p2], 8 \n\t"
+            "srl %[p1], %[p1], 8 \n\t"
+
+            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+            :);
+
+        __asm__ __volatile__(
+            "sb %[p4], 1(%[s2]) \n\t"
+            "sb %[p3], 0(%[s2]) \n\t"
+            "sb %[p2], -1(%[s2]) \n\t"
+            "sb %[p1], -2(%[s2]) \n\t"
+
+            :
+            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+
[s2] "r"(s2)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s1]) \n\t" + "sb %[p3], 0(%[s1]) \n\t" + "sb %[p2], -1(%[s1]) \n\t" + "sb %[p1], -2(%[s1]) \n\t" + + : + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s1] "r"(s1)); + } + } + } +} + +void aom_lpf_horizontal_4_dual_dspr2( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + aom_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1); +} + +void aom_lpf_horizontal_8_dual_dspr2( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + aom_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + aom_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh); + aom_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh); +} +#endif // #if HAVE_DSPR2 diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_filters_dspr2.h b/libs/libaom/src/aom_dsp/mips/loopfilter_filters_dspr2.h new file mode 100644 index 000000000..28f0dc35a --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_filters_dspr2.h @@ -0,0 +1,736 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+/* inputs & outputs are quad-byte vectors */
+static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1,
+                                uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) {
+  int32_t aom_filter_l, aom_filter_r;
+  int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+  int32_t subr_r, subr_l;
+  uint32_t t1, t2, HWM, t3;
+  uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+  int32_t vps1, vps0, vqs0, vqs1;
+  int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+  uint32_t N128;
+
+  N128 = 0x80808080;
+  t1 = 0x03000300;
+  t2 = 0x04000400;
+  t3 = 0x01000100;
+  HWM = 0xFF00FF00;
+
+  vps0 = (*ps0) ^ N128;
+  vps1 = (*ps1) ^ N128;
+  vqs0 = (*qs0) ^ N128;
+  vqs1 = (*qs1) ^ N128;
+
+  /* use halfword pairs instead of quad-bytes because of accuracy */
+  vps0_l = vps0 & HWM;
+  vps0_r = vps0 << 8;
+  vps0_r = vps0_r & HWM;
+
+  vps1_l = vps1 & HWM;
+  vps1_r = vps1 << 8;
+  vps1_r = vps1_r & HWM;
+
+  vqs0_l = vqs0 & HWM;
+  vqs0_r = vqs0 << 8;
+  vqs0_r = vqs0_r & HWM;
+
+  vqs1_l = vqs1 & HWM;
+  vqs1_r = vqs1 << 8;
+  vqs1_r = vqs1_r & HWM;
+
+  mask_l = mask & HWM;
+  mask_r = mask << 8;
+  mask_r = mask_r & HWM;
+
+  hev_l = hev & HWM;
+  hev_r = hev << 8;
+  hev_r = hev_r & HWM;
+
+  __asm__ __volatile__(
+      /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
+      "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+      "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+      /* qs0 - ps0 */
+      "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+      "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+      /* aom_filter &= hev; */
+      "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t"
+      "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t"
+
+      /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */
+      "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+      "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+      "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+      "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+      "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+      "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+      "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+      "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+
+      /* aom_filter &= mask; */
+      "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t"
+      "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t"
+
+      : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
+        [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
+        [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
+      : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
+        [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
+        [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
+        [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
+        [HWM] "r"(HWM));
+
+  /* save bottom 3 bits so that we round one side +4 and the other +3 */
+  __asm__ __volatile__(
+      /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */
+      "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t"
+      "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t"
+
+      /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */
+      "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t"
+      "addq_s.ph %[Filter2_r],
%[aom_filter_r], %[t1] \n\t" + "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" + "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" + + "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" + "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" + + /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" + + /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" + + : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), + [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), + [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r)); + + __asm__ __volatile__( + /* (aom_filter += 1) >>= 1 */ + "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" + "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" + + /* aom_filter &= ~hev; */ + "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" + "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" + + /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */ + "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; + vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *ps0 = vps0 ^ N128; + *ps1 = vps1 ^ N128; + *qs0 = vqs0 ^ N128; + *qs1 = vqs1 ^ N128; +} + +static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1, + uint32_t ps0, uint32_t qs0, uint32_t qs1, + uint32_t *p1_f0, uint32_t *p0_f0, + uint32_t *q0_f0, uint32_t *q1_f0) { + int32_t aom_filter_l, aom_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; + + N128 = 0x80808080; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; + HWM = 0xFF00FF00; + + vps0 = (ps0) ^ N128; + vps1 = (ps1) ^ N128; + vqs0 = (qs0) ^ N128; + vqs1 = (qs1) ^ N128; + + /* use halfword pairs instead quad-bytes because of accuracy */ + vps0_l = vps0 & HWM; + vps0_r = vps0 << 8; + vps0_r = vps0_r & HWM; + + vps1_l = vps1 & HWM; + vps1_r = vps1 << 8; + vps1_r = vps1_r & HWM; + + vqs0_l = vqs0 & HWM; + vqs0_r = vqs0 << 8; + vqs0_r = vqs0_r & HWM; + + vqs1_l = vqs1 & HWM; + vqs1_r = vqs1 << 8; + vqs1_r = 
vqs1_r & HWM;
+
+  mask_l = mask & HWM;
+  mask_r = mask << 8;
+  mask_r = mask_r & HWM;
+
+  hev_l = hev & HWM;
+  hev_r = hev << 8;
+  hev_r = hev_r & HWM;
+
+  __asm__ __volatile__(
+      /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
+      "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+      "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+      /* qs0 - ps0 */
+      "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+      "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+      /* aom_filter &= hev; */
+      "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t"
+      "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t"
+
+      /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */
+      "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+      "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+      "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+      "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+      "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+      "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+      "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+      "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+
+      /* aom_filter &= mask; */
+      "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t"
+      "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t"
+
+      : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
+        [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
+        [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
+      : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
+        [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
+        [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
+        [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
+        [HWM] "r"(HWM));
+
+  /* save bottom 3 bits so that we round one side +4 and the other +3 */
+  __asm__ __volatile__(
+      /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */
+      "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t"
+      "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t"
+
+      /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */
+      "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t"
+      "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t"
+      "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+      "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+
+      "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+      "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+
+      "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+      "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+
+      /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */
+      "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+      "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+      /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */
+      "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+      "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+      : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
+        [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
+        [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
+        [vqs0_r] "+r"(vqs0_r)
+      : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
+        [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r));
+
+  __asm__ __volatile__(
+      /* (aom_filter += 1) >>= 1 */
+      "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
+      "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
+
+      /* aom_filter &= ~hev; */
+      "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
+      "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
+
+      /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */
+      "addq_s.ph %[vps1_l],
%[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; + vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *p0_f0 = vps0 ^ N128; + *p1_f0 = vps1 ^ N128; + *q0_f0 = vqs0 ^ N128; + *q1_f0 = vqs1 ^ N128; +} + +static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1, + uint32_t *op0, uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; + + /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ + /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ + /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ + /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ + /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ + /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ + + __asm__ __volatile__( + "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" + + "shll.ph %[tmp], %[p3], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p3] \n\t" + "addu.ph %[res_op1], %[p3], %[p3] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" + "shrl.ph %[res_op1], %[res_op1], 3 \n\t" + "shrl.ph %[res_op2], %[res_op2], 3 \n\t" + "addu.ph %[res_op0], %[p3], %[p0] \n\t" + "addu.ph %[res_oq0], %[q0], %[q3] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq1], %[q3], %[q3] \n\t" + "shll.ph %[tmp], %[q3], 1 \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" + "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p2] 
\n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" + "shrl.ph %[res_op0], %[res_op0], 3 \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" + + : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), + [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), + [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); + + *op2 = res_op2; + *op1 = res_op1; + *op0 = res_op0; + *oq0 = res_oq0; + *oq1 = res_oq1; + *oq2 = res_oq2; +} + +static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, uint32_t *op2_f1, + uint32_t *op1_f1, uint32_t *op0_f1, + uint32_t *oq0_f1, uint32_t *oq1_f1, + uint32_t *oq2_f1) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; + + /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ + /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ + /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ + /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ + /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ + /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ + + __asm__ __volatile__( + "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" + + "shll.ph %[tmp], %[p3], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p3] \n\t" + "addu.ph %[res_op1], %[p3], %[p3] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" + "shrl.ph %[res_op1], %[res_op1], 3 \n\t" + "shrl.ph %[res_op2], %[res_op2], 3 \n\t" + "addu.ph %[res_op0], %[p3], %[p0] \n\t" + "addu.ph %[res_oq0], %[q0], %[q3] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq1], %[q3], %[q3] \n\t" + "shll.ph %[tmp], %[q3], 1 \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" + "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" + "shrl.ph %[res_op0], %[res_op0], 3 \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" + + : [add_p210_q012] 
"=&r"(add_p210_q012), [tmp] "=&r"(tmp), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), + [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), + [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); + + *op2_f1 = res_op2; + *op1_f1 = res_op1; + *op0_f1 = res_op0; + *oq0_f1 = res_oq0; + *oq1_f1 = res_oq1; + *oq2_f1 = res_oq2; +} + +static INLINE void wide_mbfilter_dspr2( + uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3, + uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6, + uint32_t *oq7) { + const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4; + const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; + uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6; + uint32_t tmp; + uint32_t add_p6toq6; + uint32_t u32Eight = 0x00080008; + + __asm__ __volatile__( + /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6 + which is used most of the time */ + "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t" + + : [add_p6toq6] "=&r"(add_p6toq6) + : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), + [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), + [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), + [u32Eight] "r"(u32Eight)); + + __asm__ __volatile__( + /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + + p3 + p2 + p1 + p0 + q0, 4) */ + "shll.ph %[tmp], %[p7], 3 \n\t" + "subu.ph %[res_op6], %[tmp], %[p7] \n\t" + "addu.ph %[res_op6], %[res_op6], %[p6] \n\t" + "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q1] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q2] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q3] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q4] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q5] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q6] \n\t" + "shrl.ph %[res_op6], %[res_op6], 4 \n\t" + + /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + + p2 + p1 + p0 + q0 + q1, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op5], %[tmp], %[p7] \n\t" + "addu.ph %[res_op5], %[res_op5], %[p7] \n\t" + "addu.ph %[res_op5], %[res_op5], %[p5] \n\t" + "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q2] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q3] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q4] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q5] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q6] \n\t" + "shrl.ph 
%[res_op5], %[res_op5], 4 \n\t" + + /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + + p1 + p0 + q0 + q1 + q2, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op4], %[tmp], %[p7] \n\t" + "addu.ph %[res_op4], %[res_op4], %[p4] \n\t" + "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q3] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q4] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q5] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q6] \n\t" + "shrl.ph %[res_op4], %[res_op4], 4 \n\t" + + /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + + p1 + p0 + q0 + q1 + q2 + q3, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op3], %[tmp], %[p3] \n\t" + "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q4] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q5] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q6] \n\t" + "shrl.ph %[res_op3], %[res_op3], 4 \n\t" + + /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + + p0 + q0 + q1 + q2 + q3 + q4, 4) */ + "shll.ph %[tmp], %[p7], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p7] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q5] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q6] \n\t" + "shrl.ph %[res_op2], %[res_op2], 4 \n\t" + + /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + + p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */ + "shll.ph %[tmp], %[p7], 1 \n\t" + "addu.ph %[res_op1], %[tmp], %[p1] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q6] \n\t" + "shrl.ph %[res_op1], %[res_op1], 4 \n\t" + + /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + + q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */ + "addu.ph %[res_op0], %[p7], %[p0] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t" + "shrl.ph %[res_op0], %[res_op0], 4 \n\t" + + : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5), + [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp) + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1), + [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), + [add_p6toq6] "r"(add_p6toq6)); + + *op6 = res_op6; + *op5 = res_op5; + *op4 = res_op4; + *op3 = res_op3; + *op2 = res_op2; + *op1 = res_op1; + *op0 = res_op0; + + __asm__ __volatile__( + /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */ + "addu.ph %[res_oq0], %[q7], %[q0] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t" + + /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */ + "shll.ph %[tmp], %[q7], 1 \n\t" + "addu.ph %[res_oq1], %[tmp], %[q1] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t" + + /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + + q3 + q4 + q5 + q6 + q7 * 3, 4) */ + "shll.ph %[tmp], %[q7], 1 \n\t" + "addu.ph %[res_oq2], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t" + "subu.ph 
%[res_oq2], %[res_oq2], %[p6] \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t" + + /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 + + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq3], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t" + "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t" + + /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 + + q4 * 2 + q5 + q6 + q7 * 5, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq4], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t" + "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t" + "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t" + + /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 + + q5 * 2 + q6 + q7 * 6, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq5], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t" + "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t" + + /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + + q4 + q5 + q6 * 2 + q7 * 7, 4) */ + "shll.ph %[tmp], %[q7], 3 \n\t" + "subu.ph %[res_oq6], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t" + "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t" + "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t" + + : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5), + [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3), + [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1), + [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp) + : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), + [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2), + [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6), + [add_p6toq6] "r"(add_p6toq6)); + + *oq0 = res_oq0; + *oq1 = res_oq1; + *oq2 = res_oq2; + *oq3 = res_oq3; + *oq4 = res_oq4; + *oq5 = res_oq5; + *oq6 = res_oq6; +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_macros_dspr2.h b/libs/libaom/src/aom_dsp/mips/loopfilter_macros_dspr2.h new file mode 100644 index 000000000..62295d69d --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_macros_dspr2.h @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+#define STORE_F0()                                                        \
+  {                                                                       \
+    __asm__ __volatile__(                                                 \
+        "sb %[q1_f0], 1(%[s4]) \n\t"                                      \
+        "sb %[q0_f0], 0(%[s4]) \n\t"                                      \
+        "sb %[p0_f0], -1(%[s4]) \n\t"                                     \
+        "sb %[p1_f0], -2(%[s4]) \n\t"                                     \
+                                                                          \
+        :                                                                 \
+        : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0),     \
+          [p1_f0] "r"(p1_f0), [s4] "r"(s4));                              \
+                                                                          \
+    __asm__ __volatile__(                                                 \
+        "srl %[q1_f0], %[q1_f0], 8 \n\t"                                  \
+        "srl %[q0_f0], %[q0_f0], 8 \n\t"                                  \
+        "srl %[p0_f0], %[p0_f0], 8 \n\t"                                  \
+        "srl %[p1_f0], %[p1_f0], 8 \n\t"                                  \
+                                                                          \
+        : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0),  \
+          [p1_f0] "+r"(p1_f0)                                             \
+        :);                                                               \
+                                                                          \
+    __asm__ __volatile__(                                                 \
+        "sb %[q1_f0], 1(%[s3]) \n\t"                                      \
+        "sb %[q0_f0], 0(%[s3]) \n\t"                                      \
+        "sb %[p0_f0], -1(%[s3]) \n\t"                                     \
+        "sb %[p1_f0], -2(%[s3]) \n\t"                                     \
+                                                                          \
+        : [p1_f0] "+r"(p1_f0)                                             \
+        : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3),           \
+          [p0_f0] "r"(p0_f0));                                            \
+                                                                          \
+    __asm__ __volatile__(                                                 \
+        "srl %[q1_f0], %[q1_f0], 8 \n\t"                                  \
+        "srl %[q0_f0], %[q0_f0], 8 \n\t"                                  \
+        "srl %[p0_f0], %[p0_f0], 8 \n\t"                                  \
+        "srl %[p1_f0], %[p1_f0], 8 \n\t"                                  \
+                                                                          \
+        : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0),  \
+          [p1_f0] "+r"(p1_f0)                                             \
+        :);                                                               \
+                                                                          \
+    __asm__ __volatile__(                                                 \
+        "sb %[q1_f0], 1(%[s2]) \n\t"                                      \
+        "sb %[q0_f0], 0(%[s2]) \n\t"                                      \
+        "sb %[p0_f0], -1(%[s2]) \n\t"                                     \
+        "sb %[p1_f0], -2(%[s2]) \n\t"                                     \
+                                                                          \
+        :                                                                 \
+        : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0),     \
+          [p1_f0] "r"(p1_f0), [s2] "r"(s2));                              \
+                                                                          \
+    __asm__ __volatile__(                                                 \
+        "srl %[q1_f0], %[q1_f0], 8 \n\t"                                  \
+        "srl %[q0_f0], %[q0_f0], 8 \n\t"                                  \
+        "srl %[p0_f0], %[p0_f0], 8 \n\t"                                  \
+        "srl %[p1_f0], %[p1_f0], 8 \n\t"                                  \
+                                                                          \
+        : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0),  \
+          [p1_f0] "+r"(p1_f0)                                             \
+        :);                                                               \
+                                                                          \
+    __asm__ __volatile__(                                                 \
+        "sb %[q1_f0], 1(%[s1]) \n\t"                                      \
+        "sb %[q0_f0], 0(%[s1]) \n\t"                                      \
+        "sb %[p0_f0], -1(%[s1]) \n\t"                                     \
+        "sb %[p1_f0], -2(%[s1]) \n\t"                                     \
+                                                                          \
+        :                                                                 \
+        : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0),     \
+          [p1_f0] "r"(p1_f0), [s1] "r"(s1));                              \
+  }
+
+#define STORE_F1()                                                          \
+  {                                                                         \
+    __asm__ __volatile__(                                                   \
+        "sb %[q2_r], 2(%[s4]) \n\t"                                         \
+        "sb %[q1_r], 1(%[s4]) \n\t"                                         \
+        "sb %[q0_r], 0(%[s4]) \n\t"                                         \
+        "sb %[p0_r], -1(%[s4]) \n\t"                                        \
+        "sb %[p1_r], -2(%[s4]) \n\t"                                        \
+        "sb %[p2_r], -3(%[s4]) \n\t"                                        \
+                                                                            \
+        :                                                                   \
+        : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r),             \
+          [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r),             \
+          [s4] "r"(s4));                                                    \
+                                                                            \
+    __asm__ __volatile__(                                                   \
+        "srl %[q2_r], %[q2_r], 16 \n\t"                                     \
+        "srl %[q1_r], %[q1_r], 16 \n\t"                                     \
+        "srl %[q0_r], %[q0_r], 16 \n\t"                                     \
+        "srl %[p0_r], %[p0_r], 16 \n\t"                                     \
+        "srl %[p1_r], %[p1_r], 16 \n\t"                                     \
+        "srl %[p2_r], %[p2_r], 16 \n\t"                                     \
+                                                                            \
+        : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r),          \
+          [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r)           \
+        :);                                                                 \
+                                                                            \
+    __asm__ __volatile__(                                                   \
+        "sb %[q2_r], 2(%[s3]) \n\t"                                         \
+        "sb %[q1_r], 1(%[s3]) \n\t"                                         \
+        "sb %[q0_r], 0(%[s3]) \n\t"                                         \
+        "sb %[p0_r], -1(%[s3]) \n\t"                                        \
+        "sb %[p1_r], -2(%[s3]) \n\t"                                        \
+        "sb %[p2_r], -3(%[s3]) \n\t"                                        \
+                                                                            \
+        :                                                                   \
+        : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r),             \
+          [p0_r] "r"(p0_r), [p1_r]
"r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \ + \ + __asm__ __volatile__( \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + \ + : \ + : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ + [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + \ + : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \ + [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + \ + : \ + : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ + [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \ + } + +#define STORE_F2() \ + { \ + __asm__ __volatile__( \ + "sb %[q6_r], 6(%[s4]) \n\t" \ + "sb %[q5_r], 5(%[s4]) \n\t" \ + "sb %[q4_r], 4(%[s4]) \n\t" \ + "sb %[q3_r], 3(%[s4]) \n\t" \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + "sb %[p3_r], -4(%[s4]) \n\t" \ + "sb %[p4_r], -5(%[s4]) \n\t" \ + "sb %[p5_r], -6(%[s4]) \n\t" \ + "sb %[p6_r], -7(%[s4]) \n\t" \ + \ + : \ + : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ + [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ + [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ + [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ + [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q6_r], %[q6_r], 16 \n\t" \ + "srl %[q5_r], %[q5_r], 16 \n\t" \ + "srl %[q4_r], %[q4_r], 16 \n\t" \ + "srl %[q3_r], %[q3_r], 16 \n\t" \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + "srl %[p3_r], %[p3_r], 16 \n\t" \ + "srl %[p4_r], %[p4_r], 16 \n\t" \ + "srl %[p5_r], %[p5_r], 16 \n\t" \ + "srl %[p6_r], %[p6_r], 16 \n\t" \ + \ + : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \ + [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \ + [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \ + [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \ + [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q6_r], 6(%[s3]) \n\t" \ + "sb %[q5_r], 5(%[s3]) \n\t" \ + "sb %[q4_r], 4(%[s3]) \n\t" \ + "sb %[q3_r], 3(%[s3]) \n\t" \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + "sb %[p3_r], -4(%[s3]) \n\t" \ + "sb %[p4_r], -5(%[s3]) \n\t" \ + "sb %[p5_r], -6(%[s3]) \n\t" \ + "sb %[p6_r], -7(%[s3]) \n\t" \ + \ + : \ + : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ + [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ + [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ + [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ + [p5_r] 
"r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \ + \ + __asm__ __volatile__( \ + "sb %[q6_l], 6(%[s2]) \n\t" \ + "sb %[q5_l], 5(%[s2]) \n\t" \ + "sb %[q4_l], 4(%[s2]) \n\t" \ + "sb %[q3_l], 3(%[s2]) \n\t" \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + "sb %[p3_l], -4(%[s2]) \n\t" \ + "sb %[p4_l], -5(%[s2]) \n\t" \ + "sb %[p5_l], -6(%[s2]) \n\t" \ + "sb %[p6_l], -7(%[s2]) \n\t" \ + \ + : \ + : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ + [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ + [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ + [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ + [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q6_l], %[q6_l], 16 \n\t" \ + "srl %[q5_l], %[q5_l], 16 \n\t" \ + "srl %[q4_l], %[q4_l], 16 \n\t" \ + "srl %[q3_l], %[q3_l], 16 \n\t" \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + "srl %[p3_l], %[p3_l], 16 \n\t" \ + "srl %[p4_l], %[p4_l], 16 \n\t" \ + "srl %[p5_l], %[p5_l], 16 \n\t" \ + "srl %[p6_l], %[p6_l], 16 \n\t" \ + \ + : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \ + [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \ + [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \ + [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \ + [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q6_l], 6(%[s1]) \n\t" \ + "sb %[q5_l], 5(%[s1]) \n\t" \ + "sb %[q4_l], 4(%[s1]) \n\t" \ + "sb %[q3_l], 3(%[s1]) \n\t" \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + "sb %[p3_l], -4(%[s1]) \n\t" \ + "sb %[p4_l], -5(%[s1]) \n\t" \ + "sb %[p5_l], -6(%[s1]) \n\t" \ + "sb %[p6_l], -7(%[s1]) \n\t" \ + \ + : \ + : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ + [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ + [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ + [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ + [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \ + } + +#define PACK_LEFT_0TO3() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbl %[p3_l], %[p3] \n\t" \ + "preceu.ph.qbl %[p2_l], %[p2] \n\t" \ + "preceu.ph.qbl %[p1_l], %[p1] \n\t" \ + "preceu.ph.qbl %[p0_l], %[p0] \n\t" \ + "preceu.ph.qbl %[q0_l], %[q0] \n\t" \ + "preceu.ph.qbl %[q1_l], %[q1] \n\t" \ + "preceu.ph.qbl %[q2_l], %[q2] \n\t" \ + "preceu.ph.qbl %[q3_l], %[q3] \n\t" \ + \ + : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \ + [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \ + [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \ + : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ + } + +#define PACK_LEFT_4TO7() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbl %[p7_l], %[p7] \n\t" \ + "preceu.ph.qbl %[p6_l], %[p6] \n\t" \ + "preceu.ph.qbl %[p5_l], %[p5] \n\t" \ + "preceu.ph.qbl %[p4_l], %[p4] \n\t" \ + "preceu.ph.qbl %[q4_l], %[q4] \n\t" \ + "preceu.ph.qbl %[q5_l], %[q5] \n\t" \ + "preceu.ph.qbl %[q6_l], %[q6] \n\t" \ + "preceu.ph.qbl %[q7_l], %[q7] \n\t" \ + \ + : [p7_l] "=&r"(p7_l), [p6_l] 
"=&r"(p6_l), [p5_l] "=&r"(p5_l), \ + [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \ + [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \ + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ + [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ + } + +#define PACK_RIGHT_0TO3() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbr %[p3_r], %[p3] \n\t" \ + "preceu.ph.qbr %[p2_r], %[p2] \n\t" \ + "preceu.ph.qbr %[p1_r], %[p1] \n\t" \ + "preceu.ph.qbr %[p0_r], %[p0] \n\t" \ + "preceu.ph.qbr %[q0_r], %[q0] \n\t" \ + "preceu.ph.qbr %[q1_r], %[q1] \n\t" \ + "preceu.ph.qbr %[q2_r], %[q2] \n\t" \ + "preceu.ph.qbr %[q3_r], %[q3] \n\t" \ + \ + : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \ + [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \ + [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \ + : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ + } + +#define PACK_RIGHT_4TO7() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbr %[p7_r], %[p7] \n\t" \ + "preceu.ph.qbr %[p6_r], %[p6] \n\t" \ + "preceu.ph.qbr %[p5_r], %[p5] \n\t" \ + "preceu.ph.qbr %[p4_r], %[p4] \n\t" \ + "preceu.ph.qbr %[q4_r], %[q4] \n\t" \ + "preceu.ph.qbr %[q5_r], %[q5] \n\t" \ + "preceu.ph.qbr %[q6_r], %[q6] \n\t" \ + "preceu.ph.qbr %[q7_r], %[q7] \n\t" \ + \ + : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \ + [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \ + [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r) \ + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ + [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ + } + +#define COMBINE_LEFT_RIGHT_0TO2() \ + { \ + __asm__ __volatile__( \ + "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \ + "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \ + "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \ + "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \ + "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \ + "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \ + \ + : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \ + [q1] "=&r"(q1), [q2] "=&r"(q2) \ + : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \ + [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \ + [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \ + [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \ + } + +#define COMBINE_LEFT_RIGHT_3TO6() \ + { \ + __asm__ __volatile__( \ + "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \ + "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \ + "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \ + "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \ + "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \ + "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \ + "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \ + "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \ + \ + : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \ + [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \ + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \ + [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \ + [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \ + [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \ + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \ + [q6_r] "r"(q6_r)); \ + } + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_masks_dspr2.h b/libs/libaom/src/aom_dsp/mips/loopfilter_masks_dspr2.h new file mode 100644 index 000000000..a0f57f386 
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/mips/loopfilter_masks_dspr2.h
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+/* processing 4 pixels at the same time
+ * compute hev and mask in the same function */
+static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
+                                         uint32_t p1, uint32_t p0, uint32_t p3,
+                                         uint32_t p2, uint32_t q0, uint32_t q1,
+                                         uint32_t q2, uint32_t q3,
+                                         uint32_t thresh, uint32_t *hev,
+                                         uint32_t *mask) {
+  uint32_t c, r, r3, r_k;
+  uint32_t s1, s2, s3;
+  uint32_t ones = 0xFFFFFFFF;
+  uint32_t hev1;
+
+  __asm__ __volatile__(
+      /* mask |= (abs(p3 - p2) > limit) */
+      "subu_s.qb %[c], %[p3], %[p2] \n\t"
+      "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
+      "or %[r_k], %[r_k], %[c] \n\t"
+      "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+      "or %[r], $0, %[c] \n\t"
+
+      /* mask |= (abs(p2 - p1) > limit) */
+      "subu_s.qb %[c], %[p2], %[p1] \n\t"
+      "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
+      "or %[r_k], %[r_k], %[c] \n\t"
+      "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+      "or %[r], %[r], %[c] \n\t"
+
+      /* mask |= (abs(p1 - p0) > limit)
+       * hev |= (abs(p1 - p0) > thresh)
+       */
+      "subu_s.qb %[c], %[p1], %[p0] \n\t"
+      "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+      "or %[r_k], %[r_k], %[c] \n\t"
+      "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+      "or %[r3], $0, %[c] \n\t"
+      "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+      "or %[r], %[r], %[c] \n\t"
+
+      /* mask |= (abs(q1 - q0) > limit)
+       * hev |= (abs(q1 - q0) > thresh)
+       */
+      "subu_s.qb %[c], %[q1], %[q0] \n\t"
+      "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+      "or %[r_k], %[r_k], %[c] \n\t"
+      "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+      "or %[r3], %[r3], %[c] \n\t"
+      "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+      "or %[r], %[r], %[c] \n\t"
+
+      /* mask |= (abs(q2 - q1) > limit) */
+      "subu_s.qb %[c], %[q2], %[q1] \n\t"
+      "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
+      "or %[r_k], %[r_k], %[c] \n\t"
+      "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+      "or %[r], %[r], %[c] \n\t"
+      "sll %[r3], %[r3], 24 \n\t"
+
+      /* mask |= (abs(q3 - q2) > limit) */
+      "subu_s.qb %[c], %[q3], %[q2] \n\t"
+      "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
+      "or %[r_k], %[r_k], %[c] \n\t"
+      "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+      "or %[r], %[r], %[c] \n\t"
+
+      : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
+      : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+        [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
+        [thresh] "r"(thresh));
+
+  __asm__ __volatile__(
+      /* abs(p0 - q0) */
+      "subu_s.qb %[c], %[p0], %[q0] \n\t"
+      "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
+      "wrdsp %[r3] \n\t"
+      "or %[s1], %[r_k], %[c] \n\t"
+
+      /* abs(p1 - q1) */
+      "subu_s.qb %[c], %[p1], %[q1] \n\t"
+      "addu_s.qb %[s3], %[s1], %[s1] \n\t"
+      "pick.qb %[hev1], %[ones], $0 \n\t"
+      "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
+      "or
%[s2], %[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); + + *hev = hev1; + *mask = s2; +} + +static INLINE void filter_hev_mask_flatmask4_dspr2( + uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0, + uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) { + uint32_t c, r, r3, r_k, r_flat; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t hev1; + uint32_t flat1; + + __asm__ __volatile__( + /* mask |= (abs(p3 - p2) > limit) */ + "subu_s.qb %[c], %[p3], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* mask |= (abs(p2 - p1) > limit) */ + "subu_s.qb %[c], %[p2], %[p1] \n\t" + "subu_s.qb %[r_k], %[p1], %[p2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(p1 - p0) > limit) + * hev |= (abs(p1 - p0) > thresh) + * flat |= (abs(p1 - p0) > thresh) + */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], $0, %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], $0, %[c] \n\t" + + /* mask |= (abs(q1 - q0) > limit) + * hev |= (abs(q1 - q0) > thresh) + * flat |= (abs(q1 - q0) > thresh) + */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], %[r3], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p0 - p2) > thresh) */ + "subu_s.qb %[c], %[p0], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q0 - q2) > thresh) */ + "subu_s.qb %[c], %[q0], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p3 - p0) > thresh) */ + "subu_s.qb %[c], %[p3], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q3 - q0) > thresh) */ + "subu_s.qb %[c], %[q3], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + "sll %[r_flat], %[r_flat], 24 \n\t" + /* look at stall here */ + "wrdsp %[r_flat] \n\t" + "pick.qb %[flat1], $0, %[ones] \n\t" + + /* mask |= (abs(q2 - q1) > limit) */ + 
"subu_s.qb %[c], %[q2], %[q1] \n\t" + "subu_s.qb %[r_k], %[q1], %[q2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r3], %[r3], 24 \n\t" + + /* mask |= (abs(q3 - q2) > limit) */ + "subu_s.qb %[c], %[q3], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3), + [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1) + : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), + [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); + + __asm__ __volatile__( + /* abs(p0 - q0) */ + "subu_s.qb %[c], %[p0], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[p0] \n\t" + "wrdsp %[r3] \n\t" + "or %[s1], %[r_k], %[c] \n\t" + + /* abs(p1 - q1) */ + "subu_s.qb %[c], %[p1], %[q1] \n\t" + "addu_s.qb %[s3], %[s1], %[s1] \n\t" + "pick.qb %[hev1], %[ones], $0 \n\t" + "subu_s.qb %[r_k], %[q1], %[p1] \n\t" + "or %[s2], %[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); + + *hev = hev1; + *mask = s2; + *flat = flat1; +} + +static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t q4, uint32_t *flat2) { + uint32_t c, r, r_k, r_flat; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t flat1, flat3; + + __asm__ __volatile__( + /* flat |= (abs(p4 - p0) > thresh) */ + "subu_s.qb %[c], %[p4], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p4] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* flat |= (abs(q4 - q0) > thresh) */ + "subu_s.qb %[c], %[q4], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q4] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + "wrdsp %[r] \n\t" + "pick.qb %[flat3], $0, %[ones] \n\t" + + /* flat |= (abs(p1 - p0) > thresh) */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], $0, %[c] \n\t" + + /* flat |= (abs(q1 - q0) > thresh) */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p0 - p2) > thresh) */ + "subu_s.qb %[c], %[p0], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q0 - q2) > thresh) */ + "subu_s.qb %[c], %[q0], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or 
%[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p3 - p0) > thresh) */ + "subu_s.qb %[c], %[p3], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q3 - q0) > thresh) */ + "subu_s.qb %[c], %[q3], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + "sll %[r_flat], %[r_flat], 24 \n\t" + "wrdsp %[r_flat] \n\t" + "pick.qb %[flat1], $0, %[ones] \n\t" + /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */ + "and %[flat1], %[flat3], %[flat1] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat), + [flat1] "=&r"(flat1), [flat3] "=&r"(flat3) + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4), + [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); + + *flat2 = flat1; +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_mb_dspr2.c b/libs/libaom/src/aom_dsp/mips/loopfilter_mb_dspr2.c new file mode 100644 index 000000000..b67ccfe9d --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_mb_dspr2.c @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <stdlib.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +void aom_lpf_horizontal_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint32_t mask; + uint32_t hev, flat; + uint8_t i; + uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p3, p2, p1, p0, q0, q1, q2, q3; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s); + + for (i = 0; i < 2; i++) { + sp3 = s - (pitch << 2); + sp2 = sp3 + pitch; + sp1 = sp2 + pitch; + sp0 = sp1 + pitch; + sq0 = s; + sq1 = s + pitch; + sq2 = sq1 + pitch; + sq3 = sq2 + pitch; + + __asm__ __volatile__( + "lw %[p3], (%[sp3]) \n\t" + "lw %[p2], (%[sp2]) \n\t" + "lw %[p1], (%[sp1]) \n\t" + "lw %[p0], (%[sp0]) \n\t" + "lw %[q0], (%[sq0]) \n\t" + "lw %[q1], (%[sq1]) \n\t" + "lw %[q2], (%[sq2]) \n\t" + "lw %[q3], (%[sq3]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0) + : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0)); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + if ((flat == 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + __asm__ __volatile__( + "sw %[p1_f0], (%[sp1]) \n\t" + "sw %[p0_f0], (%[sp0]) \n\t" + "sw %[q0_f0], (%[sq0]) \n\t" + "sw %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1)); + } else if ((mask & flat) == 0xFFFFFFFF) { + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + COMBINE_LEFT_RIGHT_0TO2() + + __asm__ __volatile__( + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + + : + : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), + [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), + [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if ((flat != 0) && (mask != 0)) { + /* filtering */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l,
&q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), + [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), + [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + 
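+      /* Note: each 32-bit register holds four adjacent pixels, so the
+       * filtered values are written back one byte lane at a time: lanes 0
+       * and 1 come from the "right" (lower-half) results, lanes 2 and 3
+       * from the "left" results. The srl instructions below move the next
+       * lane into the low byte for the following sb stores (16 bits per
+       * lane for the packed halves, 8 bits per lane for the *_f0 values). */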
__asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } + + s = s + 4; + } +} + +void aom_lpf_vertical_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev, flat; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p3, p2, p1, p0, q3, q2, q1, q0; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + __asm__ __volatile__( + "lw %[p0], -4(%[s1]) \n\t" + "lw %[p1], -4(%[s2]) \n\t" + "lw %[p2], -4(%[s3]) \n\t" + "lw %[p3], -4(%[s4]) \n\t" + "lw %[q3], (%[s1]) \n\t" + "lw %[q2], (%[s2]) \n\t" + "lw %[q1], (%[s3]) \n\t" + "lw %[q0], (%[s4]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + /* transpose p3, p2, p1, p0 + original (when loaded from memory) + register -4 -3 -2 -1 + p0 p0_0 p0_1 p0_2 p0_3 + p1 p1_0 p1_1 p1_2 p1_3 + p2 p2_0 p2_1 p2_2 p2_3 + p3 p3_0 p3_1 p3_2 p3_3 + + after transpose + register + p0 p3_3 p2_3 p1_3 p0_3 + p1 p3_2 p2_2 p1_2 p0_2 + p2 p3_1 p2_1 p1_1 p0_1 + p3 p3_0 p2_0 p1_0 p0_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" + + "precrq.qb.ph %[p1], %[prim1], 
%[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q0, q1, q2, q3 + original (when loaded from memory) + register +1 +2 +3 +4 + q3 q3_0 q3_1 q3_2 q3_3 + q2 q2_0 q2_1 q2_2 q2_3 + q1 q1_0 q1_1 q1_2 q1_3 + q0 q0_0 q0_1 q0_2 q0_3 + + after transpose + register + q3 q0_3 q1_3 q2_3 q3_3 + q2 q0_2 q1_2 q2_2 q3_2 + q1 q0_1 q1_1 q2_1 q3_1 + q0 q0_0 q1_0 q2_0 q3_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" + "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" + "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" + "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" + + "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" + "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" + "append %[q2], %[sec3], 16 \n\t" + "append %[q0], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), + [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + if ((flat == 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + STORE_F0() + } else if ((mask & flat) == 0xFFFFFFFF) { + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + STORE_F1() + } else if ((flat != 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], 
%[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), + [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), + [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], +1(%[s1]) \n\t" + "sb %[q2_l], +2(%[s1]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/libs/libaom/src/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c new file mode 100644 index 000000000..34733e42e --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c @@ -0,0 +1,734 @@ +/* + * 
Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdlib.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +static void mb_lpf_horizontal_edge(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int count) { + uint32_t mask; + uint32_t hev, flat, flat2; + uint8_t i; + uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0; + uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s); + + for (i = 0; i < (2 * count); i++) { + sp7 = s - (pitch << 3); + sp6 = sp7 + pitch; + sp5 = sp6 + pitch; + sp4 = sp5 + pitch; + sp3 = sp4 + pitch; + sp2 = sp3 + pitch; + sp1 = sp2 + pitch; + sp0 = sp1 + pitch; + sq0 = s; + sq1 = s + pitch; + sq2 = sq1 + pitch; + sq3 = sq2 + pitch; + sq4 = sq3 + pitch; + sq5 = sq4 + pitch; + sq6 = sq5 + pitch; + sq7 = sq6 + pitch; + + __asm__ __volatile__( + "lw %[p7], (%[sp7]) \n\t" + "lw %[p6], (%[sp6]) \n\t" + "lw %[p5], (%[sp5]) \n\t" + "lw %[p4], (%[sp4]) \n\t" + "lw %[p3], (%[sp3]) \n\t" + "lw %[p2], (%[sp2]) \n\t" + "lw %[p1], (%[sp1]) \n\t" + "lw %[p0], (%[sp0]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) + : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7)); + + __asm__ __volatile__( + "lw %[q0], (%[sq0]) \n\t" + "lw %[q1], (%[sq1]) \n\t" + "lw %[q2], (%[sq2]) \n\t" + "lw %[q3], (%[sq3]) \n\t" + "lw %[q4], (%[sq4]) \n\t" + "lw %[q5], (%[sq5]) \n\t" + "lw %[q6], (%[sq6]) \n\t" + "lw %[q7], (%[sq7]) \n\t" + + : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), + [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) + : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0]
"r"(sq0), + [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7)); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); + + /* f0 */ + if (((flat2 == 0) && (flat == 0) && (mask != 0)) || + ((flat2 != 0) && (flat == 0) && (mask != 0))) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + __asm__ __volatile__( + "sw %[p1_f0], (%[sp1]) \n\t" + "sw %[p0_f0], (%[sp0]) \n\t" + "sw %[q0_f0], (%[sq0]) \n\t" + "sw %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1)); + } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && + (mask == 0xFFFFFFFF)) { + /* f2 */ + PACK_LEFT_0TO3() + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_0TO3() + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + COMBINE_LEFT_RIGHT_0TO2() + COMBINE_LEFT_RIGHT_3TO6() + + __asm__ __volatile__( + "sw %[p6], (%[sp6]) \n\t" + "sw %[p5], (%[sp5]) \n\t" + "sw %[p4], (%[sp4]) \n\t" + "sw %[p3], (%[sp3]) \n\t" + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + + : + : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6), + [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sw %[q6], (%[sq6]) \n\t" + "sw %[q5], (%[sq5]) \n\t" + "sw %[q4], (%[sq4]) \n\t" + "sw %[q3], (%[sq3]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + + : + : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), + [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6), + [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2), + [sq1] "r"(sq1), [sq0] "r"(sq0)); + } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { + /* f1 */ + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + COMBINE_LEFT_RIGHT_0TO2() + + __asm__ __volatile__( + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + + : + : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), + [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), + [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { + /* f0+f1 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) 
\n\t" + "sb %[q2_r], (%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] 
"+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { + /* f0 + f1 + f2 */ + /* f0 function */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* f1 function */ + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, + &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, + &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); + + /* f2 function */ + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + if (mask & flat & flat2 & 0x000000FF) { + __asm__ __volatile__( + "sb %[p6_r], (%[sp6]) \n\t" + "sb %[p5_r], (%[sp5]) \n\t" + "sb %[p4_r], (%[sp4]) \n\t" + "sb %[p3_r], (%[sp3]) \n\t" + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), + [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + "sb %[q3_r], (%[sq3]) \n\t" + "sb %[q4_r], (%[sq4]) \n\t" + "sb %[q5_r], (%[sq5]) \n\t" + "sb %[q6_r], (%[sq6]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r_f1], (%[sp2]) \n\t" + "sb %[p1_r_f1], (%[sp1]) \n\t" + "sb %[p0_r_f1], (%[sp0]) \n\t" + "sb %[q0_r_f1], (%[sq0]) \n\t" + "sb %[q1_r_f1], (%[sq1]) \n\t" + "sb %[q2_r_f1], (%[sq2]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb 
%[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p6_r], %[p6_r], 16 \n\t" + "srl %[p5_r], %[p5_r], 16 \n\t" + "srl %[p4_r], %[p4_r], 16 \n\t" + "srl %[p3_r], %[p3_r], 16 \n\t" + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[q3_r], %[q3_r], 16 \n\t" + "srl %[q4_r], %[q4_r], 16 \n\t" + "srl %[q5_r], %[q5_r], 16 \n\t" + "srl %[q6_r], %[q6_r], 16 \n\t" + + : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), + [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r), + [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), + [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r) + :); + + __asm__ __volatile__( + "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" + "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" + "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" + "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" + "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" + "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), + [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), + [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p6_r], +1(%[sp6]) \n\t" + "sb %[p5_r], +1(%[sp5]) \n\t" + "sb %[p4_r], +1(%[sp4]) \n\t" + "sb %[p3_r], +1(%[sp3]) \n\t" + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + "sb %[q3_r], +1(%[sq3]) \n\t" + "sb %[q4_r], +1(%[sq4]) \n\t" + "sb %[q5_r], +1(%[sq5]) \n\t" + "sb %[q6_r], +1(%[sq6]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r_f1], +1(%[sp2]) \n\t" + "sb %[p1_r_f1], +1(%[sp1]) \n\t" + "sb %[p0_r_f1], +1(%[sp0]) \n\t" + "sb %[q0_r_f1], +1(%[sq0]) \n\t" + "sb %[q1_r_f1], +1(%[sq1]) \n\t" + "sb %[q2_r_f1], +1(%[sq2]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] 
"r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p6_l], +2(%[sp6]) \n\t" + "sb %[p5_l], +2(%[sp5]) \n\t" + "sb %[p4_l], +2(%[sp4]) \n\t" + "sb %[p3_l], +2(%[sp3]) \n\t" + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + "sb %[q3_l], +2(%[sq3]) \n\t" + "sb %[q4_l], +2(%[sq4]) \n\t" + "sb %[q5_l], +2(%[sq5]) \n\t" + "sb %[q6_l], +2(%[sq6]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l_f1], +2(%[sp2]) \n\t" + "sb %[p1_l_f1], +2(%[sp1]) \n\t" + "sb %[p0_l_f1], +2(%[sp0]) \n\t" + "sb %[q0_l_f1], +2(%[sq0]) \n\t" + "sb %[q1_l_f1], +2(%[sq1]) \n\t" + "sb %[q2_l_f1], +2(%[sq2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p6_l], %[p6_l], 16 \n\t" + "srl %[p5_l], %[p5_l], 16 \n\t" + "srl %[p4_l], %[p4_l], 16 \n\t" + "srl %[p3_l], %[p3_l], 16 \n\t" + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[q3_l], %[q3_l], 16 \n\t" + "srl %[q4_l], %[q4_l], 16 \n\t" + "srl %[q5_l], %[q5_l], 16 \n\t" + "srl %[q6_l], %[q6_l], 16 \n\t" + + : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), + [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), + [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), + [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) + :); + + __asm__ __volatile__( + "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" + "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" + "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" + "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" + "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" + "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), + [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), + 
[q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0xFF000000) { + __asm__ __volatile__( + "sb %[p6_l], +3(%[sp6]) \n\t" + "sb %[p5_l], +3(%[sp5]) \n\t" + "sb %[p4_l], +3(%[sp4]) \n\t" + "sb %[p3_l], +3(%[sp3]) \n\t" + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + "sb %[q3_l], +3(%[sq3]) \n\t" + "sb %[q4_l], +3(%[sq4]) \n\t" + "sb %[q5_l], +3(%[sq5]) \n\t" + "sb %[q6_l], +3(%[sq6]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3), + [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6)); + } else if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l_f1], +3(%[sp2]) \n\t" + "sb %[p1_l_f1], +3(%[sp1]) \n\t" + "sb %[p0_l_f1], +3(%[sp0]) \n\t" + "sb %[q0_l_f1], +3(%[sq0]) \n\t" + "sb %[q1_l_f1], +3(%[sq1]) \n\t" + "sb %[q2_l_f1], +3(%[sq2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } + + s = s + 4; + } +} + +void aom_lpf_horizontal_16_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1); +} + +void aom_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2); +} +#endif // #if HAVE_DSPR2 diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_mb_vert_dspr2.c b/libs/libaom/src/aom_dsp/mips/loopfilter_mb_vert_dspr2.c new file mode 100644 index 000000000..3d3f1ec97 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_mb_vert_dspr2.c @@ -0,0 +1,758 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <stdlib.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +void aom_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev, flat, flat2; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + __asm__ __volatile__( + "lw %[p0], -4(%[s1]) \n\t" + "lw %[p1], -4(%[s2]) \n\t" + "lw %[p2], -4(%[s3]) \n\t" + "lw %[p3], -4(%[s4]) \n\t" + "lw %[p4], -8(%[s1]) \n\t" + "lw %[p5], -8(%[s2]) \n\t" + "lw %[p6], -8(%[s3]) \n\t" + "lw %[p7], -8(%[s4]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + __asm__ __volatile__( + "lw %[q3], (%[s1]) \n\t" + "lw %[q2], (%[s2]) \n\t" + "lw %[q1], (%[s3]) \n\t" + "lw %[q0], (%[s4]) \n\t" + "lw %[q7], +4(%[s1]) \n\t" + "lw %[q6], +4(%[s2]) \n\t" + "lw %[q5], +4(%[s3]) \n\t" + "lw %[q4], +4(%[s4]) \n\t" + + : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), + [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + /* transpose p3, p2, p1, p0 + original (when loaded from memory) + register -4 -3 -2 -1 + p0 p0_0 p0_1 p0_2 p0_3 + p1 p1_0 p1_1 p1_2 p1_3 + p2 p2_0 p2_1 p2_2 p2_3 + p3 p3_0 p3_1 p3_2 p3_3 + + after transpose + register + p0 p3_3 p2_3 p1_3 p0_3 + p1 p3_2 p2_2 p1_2 p0_2 + p2 p3_1 p2_1 p1_1 p0_1 + p3 p3_0 p2_0 p1_0 p0_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3]
"=&r"(prim3), + [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q0, q1, q2, q3 + original (when loaded from memory) + register +1 +2 +3 +4 + q3 q3_0 q3_1 q3_2 q3_3 + q2 q2_0 q2_1 q2_2 q2_3 + q1 q1_0 q1_1 q1_2 q1_3 + q0 q0_0 q0_1 q0_2 q0_3 + + after transpose + register + q3 q0_3 q1_3 q2_3 q3_3 + q2 q0_2 q1_2 q2_2 q3_2 + q1 q0_1 q1_1 q2_1 q3_1 + q0 q0_0 q1_0 q2_0 q3_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" + "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" + "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" + "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" + + "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" + "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" + "append %[q2], %[sec3], 16 \n\t" + "append %[q0], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), + [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p7, p6, p5, p4 + original (when loaded from memory) + register -8 -7 -6 -5 + p4 p4_0 p4_1 p4_2 p4_3 + p5 p5_0 p5_1 p5_2 p5_3 + p6 p6_0 p6_1 p6_2 p6_3 + p7 p7_0 p7_1 p7_2 p7_3 + + after transpose + register + p4 p7_3 p6_3 p5_3 p4_3 + p5 p7_2 p6_2 p5_2 p4_2 + p6 p7_1 p6_1 p5_1 p4_1 + p7 p7_0 p6_0 p5_0 p4_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p4], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t" + "precr.qb.ph %[prim4], %[p6], %[p7] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p4], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p6], %[p7], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p7], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6), + [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q4, q5, q6, q7 + original (when loaded from memory) + register +5 +6 +7 +8 + q7 q7_0 q7_1 q7_2 q7_3 + q6 q6_0 q6_1 q6_2 q6_3 + q5 q5_0 q5_1 q5_2 q5_3 + q4 q4_0 q4_1 q4_2 q4_3 + + after transpose + register + q7 q4_3 q5_3 q6_3 q7_3 + q6 q4_2 q5_2 q6_2 q7_2 + q5 q4_1 q5_1 q6_1 q7_1 + q4 q4_0 q5_0 q6_0 q7_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t" + "precr.qb.ph %[prim2], %[q7], %[q6] \n\t" + "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t" + "precr.qb.ph %[prim4], %[q5], %[q4] \n\t" + + "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q7], %[q6], %[sec3] \n\t" + "precrq.ph.w %[q5], %[q4], %[sec4] \n\t" + "append %[q6], %[sec3], 16 \n\t" + "append %[q4], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5), + [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + flatmask5(p7,
p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); + + /* f0 */ + if (((flat2 == 0) && (flat == 0) && (mask != 0)) || + ((flat2 != 0) && (flat == 0) && (mask != 0))) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + STORE_F0() + } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && + (mask == 0xFFFFFFFF)) { + /* f2 */ + PACK_LEFT_0TO3() + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_0TO3() + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + STORE_F2() + } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { + /* f1 */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + STORE_F1() + } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { + /* f0 + f1 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), 
[p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], +1(%[s1]) \n\t" + "sb %[q2_l], +2(%[s1]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { + /* f0+f1+f2 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + PACK_LEFT_0TO3() + mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, + &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); + + PACK_RIGHT_0TO3() + mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, + &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); + + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + if (mask & flat & flat2 & 0x000000FF) { + __asm__ __volatile__( + "sb %[p6_r], -7(%[s4]) \n\t" + "sb %[p5_r], -6(%[s4]) \n\t" + "sb %[p4_r], -5(%[s4]) \n\t" + "sb %[p3_r], -4(%[s4]) \n\t" + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [s4] "r"(s4)); + + __asm__ __volatile__( + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + "sb %[q3_r], +3(%[s4]) \n\t" + "sb %[q4_r], +4(%[s4]) \n\t" + "sb %[q5_r], +5(%[s4]) \n\t" + "sb %[q6_r], +6(%[s4]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] 
"r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [s4] "r"(s4)); + } else if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r_f1], -3(%[s4]) \n\t" + "sb %[p1_r_f1], -2(%[s4]) \n\t" + "sb %[p0_r_f1], -1(%[s4]) \n\t" + "sb %[q0_r_f1], (%[s4]) \n\t" + "sb %[q1_r_f1], +1(%[s4]) \n\t" + "sb %[q2_r_f1], +2(%[s4]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p6_r], %[p6_r], 16 \n\t" + "srl %[p5_r], %[p5_r], 16 \n\t" + "srl %[p4_r], %[p4_r], 16 \n\t" + "srl %[p3_r], %[p3_r], 16 \n\t" + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[q3_r], %[q3_r], 16 \n\t" + "srl %[q4_r], %[q4_r], 16 \n\t" + "srl %[q5_r], %[q5_r], 16 \n\t" + "srl %[q6_r], %[q6_r], 16 \n\t" + + : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), + [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), + [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), + [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r) + :); + + __asm__ __volatile__( + "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" + "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" + "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" + "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" + "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" + "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), + [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), + [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p6_r], -7(%[s3]) \n\t" + "sb %[p5_r], -6(%[s3]) \n\t" + "sb %[p4_r], -5(%[s3]) \n\t" + "sb %[p3_r], -4(%[s3]) \n\t" + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [s3] "r"(s3)); + + __asm__ __volatile__( + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + "sb %[q3_r], +3(%[s3]) \n\t" + "sb %[q4_r], +4(%[s3]) \n\t" + "sb %[q5_r], +5(%[s3]) \n\t" + "sb %[q6_r], +6(%[s3]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [s3] "r"(s3)); + } else if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r_f1], -3(%[s3]) \n\t" + "sb %[p1_r_f1], -2(%[s3]) \n\t" + "sb %[p0_r_f1], -1(%[s3]) \n\t" + "sb %[q0_r_f1], (%[s3]) \n\t" + "sb %[q1_r_f1], +1(%[s3]) \n\t" + "sb %[q2_r_f1], +2(%[s3]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { 
+ __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p6_l], -7(%[s2]) \n\t" + "sb %[p5_l], -6(%[s2]) \n\t" + "sb %[p4_l], -5(%[s2]) \n\t" + "sb %[p3_l], -4(%[s2]) \n\t" + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [s2] "r"(s2)); + + __asm__ __volatile__( + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + "sb %[q3_l], +3(%[s2]) \n\t" + "sb %[q4_l], +4(%[s2]) \n\t" + "sb %[q5_l], +5(%[s2]) \n\t" + "sb %[q6_l], +6(%[s2]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [s2] "r"(s2)); + } else if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l_f1], -3(%[s2]) \n\t" + "sb %[p1_l_f1], -2(%[s2]) \n\t" + "sb %[p0_l_f1], -1(%[s2]) \n\t" + "sb %[q0_l_f1], (%[s2]) \n\t" + "sb %[q1_l_f1], +1(%[s2]) \n\t" + "sb %[q2_l_f1], +2(%[s2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p6_l], %[p6_l], 16 \n\t" + "srl %[p5_l], %[p5_l], 16 \n\t" + "srl %[p4_l], %[p4_l], 16 \n\t" + "srl %[p3_l], %[p3_l], 16 \n\t" + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[q3_l], %[q3_l], 16 \n\t" + "srl %[q4_l], %[q4_l], 16 \n\t" + "srl %[q5_l], %[q5_l], 16 \n\t" + "srl %[q6_l], %[q6_l], 16 \n\t" + + : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), + [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), + [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), + [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) + :); + + __asm__ __volatile__( + "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" + "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" + "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" + "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" + "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" + "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), + [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), + [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask 
& flat & flat2 & 0xFF000000) { + __asm__ __volatile__( + "sb %[p6_l], -7(%[s1]) \n\t" + "sb %[p5_l], -6(%[s1]) \n\t" + "sb %[p4_l], -5(%[s1]) \n\t" + "sb %[p3_l], -4(%[s1]) \n\t" + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [s1] "r"(s1)); + + __asm__ __volatile__( + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], 1(%[s1]) \n\t" + "sb %[q2_l], 2(%[s1]) \n\t" + "sb %[q3_l], 3(%[s1]) \n\t" + "sb %[q4_l], 4(%[s1]) \n\t" + "sb %[q5_l], 5(%[s1]) \n\t" + "sb %[q6_l], 6(%[s1]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [s1] "r"(s1)); + } else if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l_f1], -3(%[s1]) \n\t" + "sb %[p1_l_f1], -2(%[s1]) \n\t" + "sb %[p0_l_f1], -1(%[s1]) \n\t" + "sb %[q0_l_f1], (%[s1]) \n\t" + "sb %[q1_l_f1], +1(%[s1]) \n\t" + "sb %[q2_l_f1], +2(%[s1]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_msa.h b/libs/libaom/src/aom_dsp/mips/loopfilter_msa.h new file mode 100644 index 000000000..54b0bb4bd --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_msa.h @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ +#define AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ + +#include "aom_dsp/mips/macros_msa.h" + +#define AOM_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, filt_r, cnst3h; \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt = filt & (v16i8)hev_in; \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ + filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + /* combine left and right part */ \ + filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \ + \ + filt = filt & (v16i8)mask_in; \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ + filt = filt & (v16i8)hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ + } + +#define AOM_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + \ + filt = filt & (v16i8)hev_in; \ + \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ + filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ + filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ + filt_l += q0_sub_p0_l; \ + filt_l = __msa_sat_s_h(filt_l, 7); \ + \ + filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ + filt = filt & (v16i8)mask_in; \ + \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ + filt = filt & (v16i8)hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, 
filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ + } + +#define AOM_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + { \ + v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ + v16u8 zero_in = { 0 }; \ + \ + tmp_flat4 = __msa_ori_b(zero_in, 1); \ + p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ + q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ + p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ + q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ + \ + p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ + flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ + p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ + flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ + \ + flat_out = (tmp_flat4 < (v16u8)flat_out); \ + flat_out = __msa_xori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ + } + +#define AOM_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ + q6_in, q7_in, flat_in, flat2_out) \ + { \ + v16u8 tmp_flat5, zero_in = { 0 }; \ + v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ + v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ + \ + tmp_flat5 = __msa_ori_b(zero_in, 1); \ + p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ + q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ + p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ + q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \ + p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \ + q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \ + p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \ + q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \ + \ + p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \ + flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \ + flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \ + p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \ + flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \ + p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ + flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ + \ + flat2_out = (tmp_flat5 < (v16u8)flat2_out); \ + flat2_out = __msa_xori_b(flat2_out, 0xff); \ + flat2_out = flat2_out & flat_in; \ + } + +#define AOM_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ + q1_filt8_out, q2_filt8_out) \ + { \ + v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ + \ + tmp_filt8_2 = p2_in + p1_in + p0_in; \ + tmp_filt8_0 = p3_in << 1; \ + \ + tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \ + tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \ + p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \ + p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = q2_in + q1_in + q0_in; \ + tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \ + tmp_filt8_0 = tmp_filt8_2 + (p0_in); \ + tmp_filt8_0 = tmp_filt8_0 + (p3_in); \ + p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \ + \ + tmp_filt8_0 = q2_in + q3_in; \ + tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \ + tmp_filt8_1 = q3_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \ + q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_0 = tmp_filt8_2 + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + q0_in; \ + q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = tmp_filt8_0 - p2_in; \ + tmp_filt8_0 = q1_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \ + q1_filt8_out = 
(v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + } + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + { \ + v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + /* absolute subtraction of pixel values */ \ + p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \ + p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \ + p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \ + q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \ + q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \ + q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \ + p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \ + p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \ + \ + /* calculation of hev */ \ + flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = thresh_in < (v16u8)flat_out; \ + \ + /* calculation of mask */ \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m >>= 1; \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ + \ + mask_out = b_limit_in < p0_asub_q0_m; \ + mask_out = __msa_max_u_b(flat_out, mask_out); \ + p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ + \ + mask_out = limit_in < (v16u8)mask_out; \ + mask_out = __msa_xori_b(mask_out, 0xff); \ + } +#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ diff --git a/libs/libaom/src/aom_dsp/mips/macros_msa.h b/libs/libaom/src/aom_dsp/mips/macros_msa.h new file mode 100644 index 000000000..9bfc27147 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/macros_msa.h @@ -0,0 +1,2058 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_MIPS_MACROS_MSA_H_ +#define AOM_AOM_DSP_MIPS_MACROS_MSA_H_ + +#include <msa.h> + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) + +#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) + +#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) + +#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) + +#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SW(...)
ST_W(v4i32, __VA_ARGS__) + +#if (__mips_isa_rev >= 6) +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ + \ + __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val_m; \ + \ + __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + \ + val0_m = LW(psrc_m); \ + val1_m = LW(psrc_m + 4); \ + \ + val_m = (uint64_t)(val1_m); \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SD(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint64_t val_m = (val); \ + \ + __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } +#else // !(__mips_isa_rev >= 6) +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ + \ + __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val_m; \ + \ + __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m_combined = 0; \ + \ + val0_m = LW(psrc_m1); \ + val1_m = LW(psrc_m1 + 4); \ + \ + val_m_combined = (uint64_t)(val1_m); \ + val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ + val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ + \ + val_m_combined; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" 
\ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SD(val, pdst) \ + { \ + uint8_t *pdst_m1 = (uint8_t *)(pdst); \ + uint32_t val0_m, val1_m; \ + \ + val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_m, pdst_m1); \ + SW(val1_m, pdst_m1 + 4); \ + } +#endif // (__mips_isa_rev >= 6) + +/* Description : Load 4 words with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1, out2, out3 + Details : Load word in 'out0' from (psrc) + Load word in 'out1' from (psrc + stride) + Load word in 'out2' from (psrc + 2 * stride) + Load word in 'out3' from (psrc + 3 * stride) +*/ +#define LW4(psrc, stride, out0, out1, out2, out3) \ + { \ + out0 = LW((psrc)); \ + out1 = LW((psrc) + stride); \ + out2 = LW((psrc) + 2 * stride); \ + out3 = LW((psrc) + 3 * stride); \ + } + +/* Description : Load double words with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Details : Load double word in 'out0' from (psrc) + Load double word in 'out1' from (psrc + stride) +*/ +#define LD2(psrc, stride, out0, out1) \ + { \ + out0 = LD((psrc)); \ + out1 = LD((psrc) + stride); \ + } +#define LD4(psrc, stride, out0, out1, out2, out3) \ + { \ + LD2((psrc), stride, out0, out1); \ + LD2((psrc) + 2 * stride, stride, out2, out3); \ + } + +/* Description : Store 4 words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store word from 'in0' to (pdst) + Store word from 'in1' to (pdst + stride) + Store word from 'in2' to (pdst + 2 * stride) + Store word from 'in3' to (pdst + 3 * stride) +*/ +#define SW4(in0, in1, in2, in3, pdst, stride) \ + { \ + SW(in0, (pdst)) \ + SW(in1, (pdst) + stride); \ + SW(in2, (pdst) + 2 * stride); \ + SW(in3, (pdst) + 3 * stride); \ + } + +/* Description : Store 4 double words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store double word from 'in0' to (pdst) + Store double word from 'in1' to (pdst + stride) + Store double word from 'in2' to (pdst + 2 * stride) + Store double word from 'in3' to (pdst + 3 * stride) +*/ +#define SD4(in0, in1, in2, in3, pdst, stride) \ + { \ + SD(in0, (pdst)) \ + SD(in1, (pdst) + stride); \ + SD(in2, (pdst) + 2 * stride); \ + SD(in3, (pdst) + 3 * stride); \ + } + +/* Description : Load vectors with 16 byte elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Load 16 byte elements in 'out0' from (psrc) + Load 16 byte elements in 'out1' from (psrc + stride) +*/ +#define LD_B2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ + } +#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) + +#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ + } +#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) + +#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ + { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ + } +#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) +#define LD_SB5(...) 
LD_B5(v16i8, __VA_ARGS__) + +#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ + { \ + LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ + LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ + } +#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) + +#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7) \ + { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + } +#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) + +/* Description : Load vectors with 8 halfword elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Details : Load 8 halfword elements in 'out0' from (psrc) + Load 8 halfword elements in 'out1' from (psrc + stride) +*/ +#define LD_H2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_H(RTYPE, (psrc)); \ + out1 = LD_H(RTYPE, (psrc) + (stride)); \ + } +#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) + +#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_H2(RTYPE, (psrc), stride, out0, out1); \ + LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) + +#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7) \ + { \ + LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + } +#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) + +#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7, out8, out9, out10, out11, out12, out13, out14, out15) \ + { \ + LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ + out7); \ + LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ + out13, out14, out15); \ + } +#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) + +/* Description : Load 4x4 block of signed halfword elements from 1D source + data into 4 vectors (Each vector with 4 signed halfwords) + Arguments : Input - psrc + Outputs - out0, out1, out2, out3 +*/ +#define LD4x4_SH(psrc, out0, out1, out2, out3) \ + { \ + out0 = LD_SH(psrc); \ + out2 = LD_SH(psrc + 8); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ + } + +/* Description : Load 2 vectors of signed word elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - signed word +*/ +#define LD_SW2(psrc, stride, out0, out1) \ + { \ + out0 = LD_SW((psrc)); \ + out1 = LD_SW((psrc) + stride); \ + } + +/* Description : Store vectors of 16 byte elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 16 byte elements from 'in0' to (pdst) + Store 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_B2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) + +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) + +#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ + { \ + ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ + } +#define ST_UB8(...) 
ST_B8(v16u8, __VA_ARGS__) + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ + } +#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) + +#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_H2(RTYPE, in0, in1, (pdst), stride); \ + ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) + +#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ + { \ + ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ + ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ + } +#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) + +/* Description : Store vectors of word elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 4 word elements from 'in0' to (pdst) + Store 4 word elements from 'in1' to (pdst + stride) +*/ +#define ST_SW2(in0, in1, pdst, stride) \ + { \ + ST_SW(in0, (pdst)); \ + ST_SW(in1, (pdst) + stride); \ + } + +/* Description : Store 2x4 byte block to destination memory from input vector + Arguments : Inputs - in, stidx, pdst, stride + Details : Index 'stidx' halfword element from 'in' vector is copied to + the GP register and stored to (pdst) + Index 'stidx+1' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + stride) + Index 'stidx+2' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 2 * stride) + Index 'stidx+3' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 3 * stride) +*/ +#define ST2x4_UB(in, stidx, pdst, stride) \ + { \ + uint16_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ + out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ + out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ + out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ + \ + SH(out0_m, pblk_2x4_m); \ + SH(out1_m, pblk_2x4_m + stride); \ + SH(out2_m, pblk_2x4_m + 2 * stride); \ + SH(out3_m, pblk_2x4_m + 3 * stride); \ + } + +/* Description : Store 4x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 word element from 'in' vector is copied to the GP + register and stored to (pdst) + Index 1 word element from 'in' vector is copied to the GP + register and stored to (pdst + stride) +*/ +#define ST4x2_UB(in, pdst, stride) \ + { \ + uint32_t out0_m, out1_m; \ + uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in, 0); \ + out1_m = __msa_copy_u_w((v4i32)in, 1); \ + \ + SW(out0_m, pblk_4x2_m); \ + SW(out1_m, pblk_4x2_m + stride); \ + } + +/* Description : Store 4x4 byte block to destination memory from input vector + Arguments : Inputs - in0, in1, pdst, stride + Details : 'Idx0' word element from input vector 'in0' is copied to the + GP register and stored to (pdst) + 'Idx1' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + stride) + 'Idx2' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + 2 * stride) + 'Idx3' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST4x4_UB(in0, in1, idx0, 
idx1, idx2, idx3, pdst, stride) \ + { \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ + out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ + out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ + out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ + \ + SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ + } +#define ST4x8_UB(in0, in1, pdst, stride) \ + { \ + uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ + \ + ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ + ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ + } + +/* Description : Store 8x1 byte block to destination memory from input vector + Arguments : Inputs - in, pdst + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) +*/ +#define ST8x1_UB(in, pdst) \ + { \ + uint64_t out0_m; \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + SD(out0_m, pdst); \ + } + +/* Description : Store 8x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in' vector is copied to the + GP register and stored to (pdst + stride) +*/ +#define ST8x2_UB(in, pdst, stride) \ + { \ + uint64_t out0_m, out1_m; \ + uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + out1_m = __msa_copy_u_d((v2i64)in, 1); \ + \ + SD(out0_m, pblk_8x2_m); \ + SD(out1_m, pblk_8x2_m + stride); \ + } + +/* Description : Store 8x4 byte block to destination memory from input + vectors + Arguments : Inputs - in0, in1, pdst, stride + Details : Index 0 double word element from 'in0' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in0' vector is copied to the + GP register and stored to (pdst + stride) + Index 0 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 2 * stride) + Index 1 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST8x4_UB(in0, in1, pdst, stride) \ + { \ + uint64_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in0, 0); \ + out1_m = __msa_copy_u_d((v2i64)in0, 1); \ + out2_m = __msa_copy_u_d((v2i64)in1, 0); \ + out3_m = __msa_copy_u_d((v2i64)in1, 1); \ + \ + SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ + } + +/* Description : average with rounding (in0 + in1 + 1) / 2. + Arguments : Inputs - in0, in1, in2, in3, + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned byte element from 'in0' vector is added with + each unsigned byte element from 'in1' vector. Then the average + with rounding is calculated and written to 'out0' +*/ +#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ + out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ + } +#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) + +#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ + } +#define AVER_UB4_UB(...) 
AVER_UB4(v16u8, __VA_ARGS__) + +/* Description : Immediate number of elements to slide with zero + Arguments : Inputs - in0, in1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'zero_m' vector are slid into 'in0' by + value specified in the 'slide_val' +*/ +#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ + { \ + v16i8 zero_m = { 0 }; \ + out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ + } +#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) + +#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \ + slide_val) \ + { \ + SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ + SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ + } +#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) + +/* Description : Immediate number of elements to slide + Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by + value specified in the 'slide_val' +*/ +#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + { \ + out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ + } +#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) +#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) + +#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \ + out2, slide_val) \ + { \ + SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ + } +#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) +#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) + +/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0' & 'in1' are copied selectively to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ + } +#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) +#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) +#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) + +#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ + out3) \ + { \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ + } +#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) +#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Unsigned byte elements from 'mult0' are multiplied with + unsigned byte elements from 'cnst0' producing a result + twice the size of input i.e. unsigned halfword. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ + out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ + } +#define DOTP_UB2_UH(...) 
DOTP_UB2(v8u16, __VA_ARGS__) + +#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ + } +#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) + +#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product of halfword vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ + } +#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) + +#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) + +/* Description : Dot product of word vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed word elements from 'mult0' are multiplied with + signed word elements from 'cnst0' producing a result + twice the size of input i.e. signed double word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ + } +#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) + +/* Description : Dot product & addition of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. 
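+                 (Per halfword lane i, the adjacent products are
+                 mult0[2*i] * cnst0[2*i] and mult0[2*i+1] * cnst0[2*i+1].)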
+ The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ + } +#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) + +#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product & addition of halfword vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ + } +#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) + +/* Description : Dot product & addition of double word vector elements + Arguments : Inputs - mult0, mult1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed word element from 'mult0' is multiplied with itself + producing an intermediate result twice the size of input + i.e. signed double word + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ + out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ + } +#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) + +/* Description : Minimum values between unsigned elements of + either vector are copied to the output vector + Arguments : Inputs - in0, in1, min_vec + Outputs - in place operation + Return Type - as per RTYPE + Details : Minimum of unsigned halfword element values from 'in0' and + 'min_vec' are written to output vector 'in0' +*/ +#define MIN_UH2(RTYPE, in0, in1, min_vec) \ + { \ + in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ + in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ + } +#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) + +#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \ + { \ + MIN_UH2(RTYPE, in0, in1, min_vec); \ + MIN_UH2(RTYPE, in2, in3, min_vec); \ + } +#define MIN_UH4_UH(...) 
MIN_UH4(v8u16, __VA_ARGS__) + +/* Description : Clips all signed halfword elements of input vector + between 0 & 255 + Arguments : Input - in + Output - out_m + Return Type - signed halfword +*/ +#define CLIP_SH_0_255(in) \ + ({ \ + v8i16 max_m = __msa_ldi_h(255); \ + v8i16 out_m; \ + \ + out_m = __msa_maxi_s_h((v8i16)in, 0); \ + out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ + out_m; \ + }) +#define CLIP_SH2_0_255(in0, in1) \ + { \ + in0 = CLIP_SH_0_255(in0); \ + in1 = CLIP_SH_0_255(in1); \ + } +#define CLIP_SH4_0_255(in0, in1, in2, in3) \ + { \ + CLIP_SH2_0_255(in0, in1); \ + CLIP_SH2_0_255(in2, in3); \ + } + +/* Description : Horizontal addition of 4 signed word elements of input vector + Arguments : Input - in (signed word vector) + Output - sum_m (i32 sum) + Return Type - signed word (GP) + Details : 4 signed word elements of 'in' vector are added together and + the resulting integer sum is returned +*/ +#define HADD_SW_S32(in) \ + ({ \ + v2i64 res0_m, res1_m; \ + int32_t sum_m; \ + \ + res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ + res1_m = __msa_splati_d(res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ + sum_m; \ + }) + +/* Description : Horizontal addition of 8 unsigned halfword elements + Arguments : Inputs - in (unsigned halfword vector) + Outputs - sum_m (u32 sum) + Return Type - unsigned word + Details : 8 unsigned halfword elements of input vector are added + together and the resulting integer sum is returned +*/ +#define HADD_UH_U32(in) \ + ({ \ + v4u32 res_m; \ + v2u64 res0_m, res1_m; \ + uint32_t sum_m; \ + \ + res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + res0_m = __msa_hadd_u_d(res_m, res_m); \ + res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ + sum_m; \ + }) + +/* Description : Horizontal addition of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is added to + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HADD_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ + } +#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) + +#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + HADD_UB2(RTYPE, in0, in1, out0, out1); \ + HADD_UB2(RTYPE, in2, in3, out2, out3); \ + } +#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) + +/* Description : Horizontal subtraction of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is subtracted from + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ + } +#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) + +/* Description : SAD (Sum of Absolute Difference) + Arguments : Inputs - in0, in1, ref0, ref1 + Outputs - sad_m (halfword vector) + Return Type - unsigned halfword + Details : Absolute difference of all the byte elements from 'in0' with + 'ref0' is calculated and preserved in 'diff0'. Then even-odd + pairs are added together to generate 8 halfword results. 
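+                 (Illustratively, sad_m[i] = |in0[2*i] - ref0[2*i]| +
+                 |in0[2*i+1] - ref0[2*i+1]| plus the matching terms from
+                 'in1'/'ref1', for lanes i = 0..7.)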
+*/ +#define SAD_UB2_UH(in0, in1, ref0, ref1) \ + ({ \ + v16u8 diff0_m, diff1_m; \ + v8u16 sad_m = { 0 }; \ + \ + diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \ + diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \ + \ + sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \ + sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \ + \ + sad_m; \ + }) + +/* Description : Horizontal subtraction of signed halfword vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed odd halfword element from 'in0' is subtracted from + even signed halfword element from 'in0' (pairwise) and the + word result is written to 'out0' +*/ +#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ + out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ + } +#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) + +/* Description : Set element n input vector to GPR value + Arguments : Inputs - in0, in1, in2, in3 + Output - out + Return Type - as per RTYPE + Details : Set element 0 in vector 'out' to value specified in 'in0' +*/ +#define INSERT_W2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + } +#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__) + +#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \ + { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ + } +#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) +#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) + +#define INSERT_D2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ + out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ + } +#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) +#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) + +/* Description : Interleave even byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ + } +#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) +#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) + +/* Description : Interleave even halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ + } +#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) +#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) +#define ILVEV_H2_SW(...) 
ILVEV_H2(v4i32, __VA_ARGS__) + +/* Description : Interleave even word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ + out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ + } +#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__) + +/* Description : Interleave even double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ + out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ + } +#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) + +/* Description : Interleave left half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of byte elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ + } +#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) +#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) +#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) +#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) + +#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) +#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) +#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) + +/* Description : Interleave left half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ + } +#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) +#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__) + +/* Description : Interleave left half of word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of word elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ + } +#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) +#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) + +/* Description : Interleave right half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements of 'in0' and 'in1' are interleaved + and written to out0. 
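+                 For example (editorial illustration, lane 0 first in
+                 little-endian lane order):
+                   in0  = { a0, a1, ..., a15 }
+                   in1  = { b0, b1, ..., b15 }
+                   out0 = { b0, a0, b1, a1, b2, a2, b3, a3,
+                            b4, a4, b5, a5, b6, a6, b7, a7 }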
+*/ +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ + } +#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) +#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) +#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) +#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) + +#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) +#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) +#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) +#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) + +#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ + in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \ + out5, out6, out7) \ + { \ + ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3); \ + ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \ + out6, out7); \ + } +#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) + +/* Description : Interleave right half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ + } +#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) +#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) + +#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) + +#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ + } +#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) +#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) + +#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__) + +/* Description : Interleave right half of double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of double word elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ + out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ + } +#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) +#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) +#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) + +#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ + { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ + } +#define ILVR_D3_SB(...) 
ILVR_D3(v16i8, __VA_ARGS__)
+
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+                out2, out3)                                                \
+  {                                                                        \
+    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
+    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
+  }
+#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
+#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of byte elements from 'in0' and 'in1' are
+                 interleaved and written to 'out0'; the left halves are
+                 interleaved and written to 'out1'
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1)           \
+  {                                                     \
+    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+  }
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
+
+#define ILVRL_H2(RTYPE, in0, in1, out0, out1)           \
+  {                                                     \
+    out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
+    out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
+  }
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
+#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
+
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1)           \
+  {                                                     \
+    out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
+    out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
+  }
+#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+
+/* Description : Saturate the unsigned halfword element values to the max
+                 unsigned value of (sat_val + 1) bits
+                 The element data width remains unchanged
+   Arguments   : Inputs - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each unsigned halfword element from 'in0' is saturated to the
+                 value generated with (sat_val + 1) bit range.
+                 The results are written in place
+*/
+#define SAT_UH2(RTYPE, in0, in1, sat_val)            \
+  {                                                  \
+    in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
+    in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
+  }
+#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
+
+#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
+  {                                                 \
+    SAT_UH2(RTYPE, in0, in1, sat_val);              \
+    SAT_UH2(RTYPE, in2, in3, sat_val);              \
+  }
+#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Saturate the signed halfword element values to the max
+                 signed value of (sat_val + 1) bits
+                 The element data width remains unchanged
+   Arguments   : Inputs - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each signed halfword element from 'in0' is saturated to the
+                 value generated with (sat_val + 1) bit range
+                 The results are written in place
+*/
+#define SAT_SH2(RTYPE, in0, in1, sat_val)            \
+  {                                                  \
+    in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
+    in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
+  }
+#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
+
+#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
+  {                                                 \
+    SAT_SH2(RTYPE, in0, in1, sat_val);              \
+    SAT_SH2(RTYPE, in2, in3, sat_val);              \
+  }
+#define SAT_SH4_SH(...)
SAT_SH4(v8i16, __VA_ARGS__) + +/* Description : Indexed halfword element values are replicated to all + elements in output vector + Arguments : Inputs - in, idx0, idx1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : 'idx0' element value from 'in' vector is replicated to all + elements in 'out0' vector + Valid index range for halfword operation is 0-7 +*/ +#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ + out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ + } +#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) + +#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \ + { \ + SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ + SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ + } +#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) +#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) + +/* Description : Pack even byte elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' are copied to the left half of + 'out0' & even byte elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ + } +#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) +#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) + +#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) +#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) +#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) + +/* Description : Pack even halfword elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' are copied to the left half of + 'out0' & even halfword elements of 'in1' are copied to the + right half of 'out0'. +*/ +#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ + } +#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) +#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) + +#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) + +/* Description : Pack even double word elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double elements of 'in0' are copied to the left half of + 'out0' & even double elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ + out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ + } +#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) +#define PCKEV_D2_SH(...) 
PCKEV_D2(v8i16, __VA_ARGS__) + +#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) + +/* Description : Each byte element is logically xor'ed with immediate 128 + Arguments : Inputs - in0, in1 + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned byte element from input vector 'in0' is + logically xor'ed with 128 and the result is stored in-place. +*/ +#define XORI_B2_128(RTYPE, in0, in1) \ + { \ + in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ + in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ + } +#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) +#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) + +#define XORI_B3_128(RTYPE, in0, in1, in2) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ + } +#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) + +#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + XORI_B2_128(RTYPE, in2, in3); \ + } +#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) +#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) + +#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \ + { \ + XORI_B4_128(RTYPE, in0, in1, in2, in3); \ + XORI_B3_128(RTYPE, in4, in5, in6); \ + } +#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) + +/* Description : Average of signed halfword elements -> (a + b) / 2 + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Each signed halfword element from 'in0' is added to each + signed halfword element of 'in1' with full precision resulting + in one extra bit in the result. The result is then divided by + 2 and written to 'out0' +*/ +#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ + out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ + out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ + } +#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__) + +/* Description : Addition of signed halfword elements and signed saturation + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'in0' are added to signed + halfword elements of 'in1'. The result is then signed saturated + between halfword data type range +*/ +#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ + } +#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) + +#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) + +/* Description : Shift left all elements of vector (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is left shifted by 'shift' and + the result is written in-place. 
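+                 Illustrative use (editorial sketch, assuming v8i16 inputs):
+
+                   v8i16 a = { 1, 2, 3, 4, 5, 6, 7, 8 };
+                   v8i16 b = a, c = a, d = a;
+                   SLLI_4V(a, b, c, d, 2);  // every lane is multiplied by 4
+
+                 Pairing with SRA_4V(a, b, c, d, 2) restores the original
+                 lanes provided no lane overflowed.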
+*/ +#define SLLI_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 << shift; \ + in1 = in1 << shift; \ + in2 = in2 << shift; \ + in3 = in3 << shift; \ + } + +/* Description : Arithmetic shift right all elements of vector + (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written in-place. 'shift' is a GP variable. +*/ +#define SRA_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + in2 = in2 >> shift; \ + in3 = in3 >> shift; \ + } + +/* Description : Shift right arithmetic rounded words + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the number of bits in the corresponding element in the vector + 'shift'. The last discarded bit is added to shifted value for + rounding and the result is written in-place. + 'shift' is a vector. +*/ +#define SRAR_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ + in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ + } + +#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRAR_W2(RTYPE, in0, in1, shift) \ + SRAR_W2(RTYPE, in2, in3, shift) \ + } +#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) + +/* Description : Shift right arithmetic rounded (immediate) + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the value in 'shift'. The last discarded bit is added to the + shifted value for rounding and the result is written in-place. + 'shift' is an immediate value. +*/ +#define SRARI_H2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ + in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ + } +#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) +#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) + +#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_H2(RTYPE, in0, in1, shift); \ + SRARI_H2(RTYPE, in2, in3, shift); \ + } +#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) +#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) + +#define SRARI_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ + in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ + } +#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) + +#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_W2(RTYPE, in0, in1, shift); \ + SRARI_W2(RTYPE, in2, in3, shift); \ + } +#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) + +/* Description : Logical shift right all elements of vector (immediate) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written in-place. 'shift' is an immediate value. +*/ +#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \ + { \ + out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ + out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ + out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ + out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ + } +#define SRLI_H4_SH(...) 
SRLI_H4(v8i16, __VA_ARGS__) + +/* Description : Multiplication of pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 'in0' is multiplied with elements from 'in1' + and the result is written to 'out0' +*/ +#define MUL2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ + } +#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + MUL2(in0, in1, in2, in3, out0, out1); \ + MUL2(in4, in5, in6, in7, out2, out3); \ + } + +/* Description : Addition of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in0' is added to 'in1' and result is written + to 'out0'. +*/ +#define ADD2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ + } +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ + } + +/* Description : Subtraction of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in1' is subtracted from 'in0' and result is + written to 'out0'. +*/ +#define SUB2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + } +#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + out2 = in4 - in5; \ + out3 = in6 - in7; \ + } + +/* Description : Sign extend halfword elements from right half of the vector + Arguments : Input - in (halfword vector) + Output - out (sign extended word vector) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved with same vector 'in0' to generate + 4 word elements keeping sign intact +*/ +#define UNPCK_R_SH_SW(in, out) \ + { \ + v8i16 sign_m; \ + \ + sign_m = __msa_clti_s_h((v8i16)in, 0); \ + out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ + } + +/* Description : Zero extend unsigned byte elements to halfword elements + Arguments : Input - in (unsigned byte vector) + Outputs - out0, out1 (unsigned halfword vectors) + Return Type - signed halfword + Details : Zero extended right half of vector is returned in 'out0' + Zero extended left half of vector is returned in 'out1' +*/ +#define UNPCK_UB_SH(in, out0, out1) \ + { \ + v16i8 zero_m = { 0 }; \ + \ + ILVRL_B2_SH(zero_m, in, out0, out1); \ + } + +/* Description : Sign extend halfword elements from input vector and return + the result in pair of vectors + Arguments : Input - in (halfword vector) + Outputs - out0, out1 (sign extended word vectors) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved right with same vector 'in0' to + generate 4 signed word elements in 'out0' + Then interleaved left with same vector 'in0' to + generate 4 signed word elements in 'out1' +*/ +#define UNPCK_SH_SW(in, out0, out1) \ + { \ + v8i16 tmp_m; \ + \ + tmp_m = __msa_clti_s_h((v8i16)in, 0); \ + ILVRL_H2_SW(tmp_m, in, out0, out1); \ + } + +/* Description : Butterfly of 4 input vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Details : Butterfly operation +*/ +#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ + } + +/* Description : Butterfly of 8 input vectors + Arguments : 
Inputs  - in0 ...  in7
+                 Outputs - out0 ... out7
+   Details     : Butterfly operation
+*/
+#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+                    out3, out4, out5, out6, out7)                             \
+  {                                                                           \
+    out0 = in0 + in7;                                                         \
+    out1 = in1 + in6;                                                         \
+    out2 = in2 + in5;                                                         \
+    out3 = in3 + in4;                                                         \
+                                                                              \
+    out4 = in3 - in4;                                                         \
+    out5 = in2 - in5;                                                         \
+    out6 = in1 - in6;                                                         \
+    out7 = in0 - in7;                                                         \
+  }
+
+/* Description : Butterfly of 16 input vectors
+   Arguments   : Inputs  - in0 ...  in15
+                 Outputs - out0 ... out15
+   Details     : Butterfly operation
+*/
+#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,  \
+                     in11, in12, in13, in14, in15, out0, out1, out2, out3,    \
+                     out4, out5, out6, out7, out8, out9, out10, out11, out12, \
+                     out13, out14, out15)                                     \
+  {                                                                           \
+    out0 = in0 + in15;                                                        \
+    out1 = in1 + in14;                                                        \
+    out2 = in2 + in13;                                                        \
+    out3 = in3 + in12;                                                        \
+    out4 = in4 + in11;                                                        \
+    out5 = in5 + in10;                                                        \
+    out6 = in6 + in9;                                                         \
+    out7 = in7 + in8;                                                         \
+                                                                              \
+    out8 = in7 - in8;                                                         \
+    out9 = in6 - in9;                                                         \
+    out10 = in5 - in10;                                                       \
+    out11 = in4 - in11;                                                       \
+    out12 = in3 - in12;                                                       \
+    out13 = in2 - in13;                                                       \
+    out14 = in1 - in14;                                                       \
+    out15 = in0 - in15;                                                       \
+  }
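+
+/* Editorial note (illustrative sketch, not part of the upstream source):
+   each BUTTERFLY_N macro forms mirrored sums followed by mirrored
+   differences, the first stage of the usual DCT/Hadamard decomposition.
+   For example, BUTTERFLY_4 applied to scalar lanes in0..in3 = 1, 2, 3, 4
+   yields out0..out3 = 5, 5, -1, -3. A scalar equivalent for one lane:
+
+     void butterfly4(const int16_t in[4], int16_t out[4]) {
+       out[0] = in[0] + in[3];
+       out[1] = in[1] + in[2];
+       out[2] = in[1] - in[2];
+       out[3] = in[0] - in[3];
+     }
+*/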
+
+/* Description : Transpose input 8x8 byte block
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0,   \
+                        out1, out2, out3, out4, out5, out6, out7)              \
+  {                                                                            \
+    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
+    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                      \
+                                                                               \
+    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \
+               tmp3_m);                                                        \
+    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                               \
+    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                               \
+    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                               \
+    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                               \
+    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                               \
+    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                               \
+  }
+#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
+
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
+                           in8, in9, in10, in11, in12, in13, in14, in15
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - unsigned byte
+*/
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
+                            in10, in11, in12, in13, in14, in15, out0, out1,   \
+                            out2, out3, out4, out5, out6, out7)               \
+  {                                                                           \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
+    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
+                                                                              \
+    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                              \
+    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                            \
+    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                            \
+    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                            \
+                                                                              \
+    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                  \
+    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                  \
+    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                  \
+    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                  \
+    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                    \
+    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                  \
+    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                    \
+    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                  \
+                                                                              \
+    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                  \
+    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
+    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
+                                                                              \
+    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);              \
+    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                  \
+    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
+    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
+                                                                              \
+    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);              \
+    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
+    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
+                                                                              \
+    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
+    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
+    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
+    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
+  }
+
+/* Description : Transpose 4x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                                    \
+    v8i16 s0_m, s1_m;                                                  \
+                                                                       \
+    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                        \
+    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                               \
+    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);              \
+    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);              \
+  }
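+
+/* Editorial note (illustrative sketch, not part of the upstream source):
+   the 4x4 transposes here operate on a matrix held row-wise in the low
+   (right) halves of the input vectors. A scalar equivalent for reference:
+
+     void transpose4x4(const int16_t in[4][4], int16_t out[4][4]) {
+       for (int r = 0; r < 4; ++r)
+         for (int c = 0; c < 4; ++c) out[c][r] = in[r][c];
+     }
+*/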
+
+/* Description : Transpose 4x8 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+                           out2, out3, out4, out5, out6, out7)                 \
+  {                                                                            \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
+    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                      \
+    v8i16 zero_m = { 0 };                                                      \
+                                                                               \
+    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \
+               tmp3_n);                                                        \
+    ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                               \
+    ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                               \
+                                                                               \
+    out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m);                  \
+    out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m);                  \
+    out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m);                  \
+    out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m);                  \
+                                                                               \
+    out4 = zero_m;                                                             \
+    out5 = zero_m;                                                             \
+    out6 = zero_m;                                                             \
+    out7 = zero_m;                                                             \
+  }
+
+/* Description : Transpose 8x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                                    \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
+                                                                       \
+    ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                    \
+    ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                    \
+    ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);            \
+    ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);            \
+  }
+
+/* Description : Transpose 8x8 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
+                       out1, out2, out3, out4, out5, out6, out7)            \
+  {                                                                         \
+    v8i16 s0_m, s1_m;                                                       \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
+    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                   \
+                                                                            \
+    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
+    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                                \
+    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
+    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                                \
+    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
+    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                                \
+    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
+    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                                \
+    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \
+             tmp7_m, out0, out2, out4, out6);                               \
+    out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m);              \
+    out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m);              \
+    out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m);              \
+    out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m);              \
+  }
+#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
+
+/* Description : Transpose 4x4 block with word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed word
+*/
+#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                                    \
+    v4i32 s0_m, s1_m, s2_m, s3_m;                                      \
+                                                                       \
+    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                 \
+    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                 \
+                                                                       \
+    out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);              \
+    out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);              \
+    out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);              \
+    out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);              \
+  }
+
+/* Description : Add block 4x4
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Least significant 4 bytes from each input vector are added to
+                 the destination bytes, clipped between 0-255 and stored.
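+                 Editorial scalar sketch of the same operation ('res' stands
+                 for the 16 residual values carried in in0..in3; not part of
+                 the upstream source):
+
+                   for (int r = 0; r < 4; ++r)
+                     for (int c = 0; c < 4; ++c) {
+                       int v = pdst[r * stride + c] + res[r][c];
+                       pdst[r * stride + c] = v < 0 ? 0 : (v > 255 ? 255 : v);
+                     }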
+*/ +#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ + { \ + uint32_t src0_m, src1_m, src2_m, src3_m; \ + v8i16 inp0_m, inp1_m, res0_m, res1_m; \ + v16i8 dst0_m = { 0 }; \ + v16i8 dst1_m = { 0 }; \ + v16i8 zero_m = { 0 }; \ + \ + ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ + LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ + INSERT_W2_SB(src0_m, src1_m, dst0_m); \ + INSERT_W2_SB(src2_m, src3_m, dst1_m); \ + ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ + ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ + CLIP_SH2_0_255(res0_m, res1_m); \ + PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ + ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ + } + +/* Description : Pack even elements of input vectors & xor with 128 + Arguments : Inputs - in0, in1 + Output - out_m + Return Type - unsigned byte + Details : Signed byte even elements from 'in0' and 'in1' are packed + together in one vector and the resulting vector is xor'ed with + 128 to shift the range from signed to unsigned byte +*/ +#define PCKEV_XORI128_UB(in0, in1) \ + ({ \ + v16u8 out_m; \ + \ + out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ + out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ + out_m; \ + }) + +/* Description : Converts inputs to unsigned bytes, interleave, average & store + as 8x4 unsigned byte block + Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, + pdst, stride +*/ +#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ + pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + tmp0_m = PCKEV_XORI128_UB(in0, in1); \ + tmp1_m = PCKEV_XORI128_UB(in2, in3); \ + ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ + } + +/* Description : Pack even byte elements and store byte vector in destination + memory + Arguments : Inputs - in0, in1, pdst +*/ +#define PCKEV_ST_SB(in0, in1, pdst) \ + { \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ + ST_SB(tmp_m, (pdst)); \ + } + +/* Description : Horizontal 2 tap filter kernel code + Arguments : Inputs - in0, in1, mask, coeff, shift +*/ +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ + ({ \ + v16i8 tmp0_m; \ + v8u16 tmp1_m; \ + \ + tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ + tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ + tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ + \ + tmp1_m; \ + }) +#endif // AOM_AOM_DSP_MIPS_MACROS_MSA_H_ diff --git a/libs/libaom/src/aom_dsp/mips/sad_msa.c b/libs/libaom/src/aom_dsp/mips/sad_msa.c new file mode 100644 index 000000000..58cdd80d9 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/sad_msa.c @@ -0,0 +1,800 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/mips/macros_msa.h" + +#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \ + { \ + out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \ + } +#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__) + +static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + diff = __msa_asub_u_b(src, ref); + sad += __msa_hadd_u_h(diff, diff); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + LD_UB2(ref, ref_stride, ref0, ref1); + ref += (2 * ref_stride); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + LD_UB2(ref, ref_stride, ref0, ref1); + ref += (2 * ref_stride); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t sad = 0; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8u16 sad0 
= { 0 }; + v8u16 sad1 = { 0 }; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad = HADD_UH_U32(sad0); + sad += HADD_UH_U32(sad1); + + return sad; +} + +static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + INSERT_W4_UB(src0, src1, src2, src3, src); + src_ptr += (4 * src_stride); + + LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref0_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad0 += __msa_hadd_u_h(diff, diff); + + LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref1_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref2_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref3_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad3 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt; + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref0_ptr += (4 * ref_stride); + LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7); + ref1_ptr += (4 * ref_stride); + LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11); + ref2_ptr += (4 * ref_stride); + LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15); + ref3_ptr += (4 * ref_stride); + + PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + sad0 += SAD_UB2_UH(src0, src1, 
ref0, ref1); + + PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1); + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt; + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + v16u8 src, ref0, ref1, ref2, ref3, diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref0 = LD_UB(ref0_ptr); + ref0_ptr += ref_stride; + ref1 = LD_UB(ref1_ptr); + ref1_ptr += ref_stride; + ref2 = LD_UB(ref2_ptr); + ref2_ptr += ref_stride; + ref3 = LD_UB(ref3_ptr); + ref3_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref1); + sad1 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref2); + sad2 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref3); + sad3 += __msa_hadd_u_h(diff, diff); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref0 = LD_UB(ref0_ptr); + ref0_ptr += ref_stride; + ref1 = LD_UB(ref1_ptr); + ref1_ptr += ref_stride; + ref2 = LD_UB(ref2_ptr); + ref2_ptr += ref_stride; + ref3 = LD_UB(ref3_ptr); + ref3_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref1); + sad1 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref2); + sad2 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref3); + sad3 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + + LD_UB2(ref0_ptr, 16, ref0, ref1); + ref0_ptr += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref1_ptr, 16, ref0, ref1); + ref1_ptr += ref_stride; + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref2_ptr, 16, ref0, ref1); + ref2_ptr += ref_stride; + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref3_ptr, 16, ref0, ref1); + ref3_ptr += ref_stride; + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t 
ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8u16 sad0_0 = { 0 }; + v8u16 sad0_1 = { 0 }; + v8u16 sad1_0 = { 0 }; + v8u16 sad1_1 = { 0 }; + v8u16 sad2_0 = { 0 }; + v8u16 sad2_1 = { 0 }; + v8u16 sad3_0 = { 0 }; + v8u16 sad3_1 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + + LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3); + ref0_ptr += ref_stride; + sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3); + ref1_ptr += ref_stride; + sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3); + ref2_ptr += ref_stride; + sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3); + ref3_ptr += ref_stride; + sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad_array[0] = HADD_UH_U32(sad0_0); + sad_array[0] += HADD_UH_U32(sad0_1); + sad_array[1] = HADD_UH_U32(sad1_0); + sad_array[1] += HADD_UH_U32(sad1_1); + sad_array[2] = HADD_UH_U32(sad2_0); + sad_array[2] += HADD_UH_U32(sad2_1); + sad_array[3] = HADD_UH_U32(sad3_0); + sad_array[3] += HADD_UH_U32(sad3_1); +} + +static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff, pred, comp; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + comp = __msa_aver_u_b(pred, ref); + diff = __msa_asub_u_b(src, comp); + sad += __msa_hadd_u_h(diff, diff); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 diff0, diff1, pred0, pred1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1); + sad += SAD_UB2_UH(src0, src1, diff0, diff1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3, comp0, comp1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 
3); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += (4 * 16); + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += (4 * 16); + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 comp0, comp1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 16, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6); + LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7); + ref += (4 * ref_stride); + + LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6); + LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7); + sec_pred += (4 * 32); + + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1); + sad += SAD_UB2_UH(src4, src5, comp0, comp1); + AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1); + sad += SAD_UB2_UH(src6, src7, comp0, comp1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 comp0, comp1, comp2, comp3; + v16u8 pred0, pred1, pred2, pred3; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v4u32 sad; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += 
ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + } + + sad = __msa_hadd_u_w(sad0, sad0); + sad += __msa_hadd_u_w(sad1, sad1); + + return HADD_SW_S32(sad); +} + +#define AOM_SAD_4xHEIGHT_MSA(height) \ + uint32_t aom_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_4width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_8xHEIGHT_MSA(height) \ + uint32_t aom_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_8width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_16xHEIGHT_MSA(height) \ + uint32_t aom_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_16width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_32xHEIGHT_MSA(height) \ + uint32_t aom_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_32width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_64xHEIGHT_MSA(height) \ + uint32_t aom_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_4xHEIGHTx4D_MSA(height) \ + void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_SAD_8xHEIGHTx4D_MSA(height) \ + void aom_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_SAD_16xHEIGHTx4D_MSA(height) \ + void aom_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_SAD_32xHEIGHTx4D_MSA(height) \ + void aom_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_SAD_64xHEIGHTx4D_MSA(height) \ + void aom_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_AVGSAD_4xHEIGHT_MSA(height) \ + uint32_t aom_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *second_pred) { 
\ + return avgsad_4width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define AOM_AVGSAD_8xHEIGHT_MSA(height) \ + uint32_t aom_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *second_pred) { \ + return avgsad_8width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define AOM_AVGSAD_16xHEIGHT_MSA(height) \ + uint32_t aom_sad16x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define AOM_AVGSAD_32xHEIGHT_MSA(height) \ + uint32_t aom_sad32x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define AOM_AVGSAD_64xHEIGHT_MSA(height) \ + uint32_t aom_sad64x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +/* clang-format off */ +// 64x64 +AOM_SAD_64xHEIGHT_MSA(64) +AOM_SAD_64xHEIGHTx4D_MSA(64) +AOM_AVGSAD_64xHEIGHT_MSA(64) + +// 64x32 +AOM_SAD_64xHEIGHT_MSA(32) +AOM_SAD_64xHEIGHTx4D_MSA(32) +AOM_AVGSAD_64xHEIGHT_MSA(32) + +// 32x64 +AOM_SAD_32xHEIGHT_MSA(64) +AOM_SAD_32xHEIGHTx4D_MSA(64) +AOM_AVGSAD_32xHEIGHT_MSA(64) + +// 32x32 +AOM_SAD_32xHEIGHT_MSA(32) +AOM_SAD_32xHEIGHTx4D_MSA(32) +AOM_AVGSAD_32xHEIGHT_MSA(32) + +// 32x16 +AOM_SAD_32xHEIGHT_MSA(16) +AOM_SAD_32xHEIGHTx4D_MSA(16) +AOM_AVGSAD_32xHEIGHT_MSA(16) + +// 16x32 +AOM_SAD_16xHEIGHT_MSA(32) +AOM_SAD_16xHEIGHTx4D_MSA(32) +AOM_AVGSAD_16xHEIGHT_MSA(32) + +// 16x16 +AOM_SAD_16xHEIGHT_MSA(16) +AOM_SAD_16xHEIGHTx4D_MSA(16) +AOM_AVGSAD_16xHEIGHT_MSA(16) + +// 16x8 +AOM_SAD_16xHEIGHT_MSA(8) +AOM_SAD_16xHEIGHTx4D_MSA(8) +AOM_AVGSAD_16xHEIGHT_MSA(8) + +// 8x16 +AOM_SAD_8xHEIGHT_MSA(16) +AOM_SAD_8xHEIGHTx4D_MSA(16) +AOM_AVGSAD_8xHEIGHT_MSA(16) + +// 8x8 +AOM_SAD_8xHEIGHT_MSA(8) +AOM_SAD_8xHEIGHTx4D_MSA(8) +AOM_AVGSAD_8xHEIGHT_MSA(8) + +// 8x4 +AOM_SAD_8xHEIGHT_MSA(4) +AOM_SAD_8xHEIGHTx4D_MSA(4) +AOM_AVGSAD_8xHEIGHT_MSA(4) + +// 4x8 +AOM_SAD_4xHEIGHT_MSA(8) +AOM_SAD_4xHEIGHTx4D_MSA(8) +AOM_AVGSAD_4xHEIGHT_MSA(8) + +// 4x4 +AOM_SAD_4xHEIGHT_MSA(4) +AOM_SAD_4xHEIGHTx4D_MSA(4) +AOM_AVGSAD_4xHEIGHT_MSA(4) + /* clang-format on */ diff --git a/libs/libaom/src/aom_dsp/mips/sub_pixel_variance_msa.c b/libs/libaom/src/aom_dsp/mips/sub_pixel_variance_msa.c new file mode 100644 index 000000000..810b6efaa --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/sub_pixel_variance_msa.c @@ -0,0 +1,1792 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" +#include "aom_dsp/mips/macros_msa.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/variance.h" + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + \ + sub += res_l0_m + res_l1_m; \ + } + +#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + sse - (((int64_t)diff * diff) >> shift) + +static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t height, + int32_t *diff) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 pred, src = { 0 }; + v16u8 ref = { 0 }; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t height, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src, ref, pred; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred 
+= 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1, pred0, pred1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1, pred0, pred1; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + 
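+    /* Each CALC_MSE_AVG_B pass accumulates squared byte differences into the
+     * SSE accumulator var and signed differences into avg0/avg1; the caller
+     * reduces these to sse and sum, and variance follows as sse - sum*sum/N. */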
CALC_MSE_AVG_B(src1, ref1, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v8i16 avg2 = { 0 }; + v8i16 avg3 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 32; ht_cnt--;) { + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + vec += __msa_hadd_s_w(avg2, avg2); + vec += __msa_hadd_s_w(avg3, avg3); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_4width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 filt0, 
ref = { 0 }; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); + src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); + CALC_MSE_AVG_B(src0, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 filt0, out, ref0, ref1, ref2, ref3; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 dst0, dst1, dst2, dst3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + 
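+    /* VSHF_B2_UH gathers adjacent source bytes into pairs per mask so that
+     * the DOTP_UB4_UH calls below apply the two-tap bilinear filter filt0;
+     * SRARI_H4_UH then rounds the 16-bit products down by FILTER_BITS. */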
VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, dst0, var, avg); + CALC_MSE_AVG_B(src1, dst1, var, avg); + CALC_MSE_AVG_B(src2, dst2, var, avg); + CALC_MSE_AVG_B(src3, dst3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_4width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4, out; + v16u8 src10_r, src32_r, src21_r, src43_r; + v16u8 ref = { 0 }; + v16u8 src2110, src4332; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + v8u16 tmp0, tmp1; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 ref0, ref1, ref2, ref3; + v8u16 vec0, vec1, vec2, vec3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = 
(v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1, out2, out3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + src0 = src4; + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, 
dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_4width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out, ref = { 0 }; + v16u8 filt_vt, filt_hz, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4; + v8u16 tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt_vt, filt_hz, vec0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec0, 
filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3; + v8u16 tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + LD_UB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + CALC_MSE_AVG_B(src2, ref2, var, avg); + CALC_MSE_AVG_B(src3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t 
height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 out, pred, filt0, ref = { 0 }; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); + out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 out, pred, filt0; + v16u8 ref0, ref1, ref2, ref3; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, 
vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); + + pred = LD_UB(sec_pred); + sec_pred += 16; + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); + pred = LD_UB(sec_pred); + sec_pred += 16; + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + int16_t filtval; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 dst0, dst1, dst2, dst3; + v16u8 tmp0, tmp1, tmp2, tmp3; + v16u8 pred0, pred1, pred2, pred3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + dst += (4 * dst_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1, + tmp2, tmp3); + AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1, + tmp2, tmp3); + + CALC_MSE_AVG_B(tmp0, dst0, var, avg); + CALC_MSE_AVG_B(tmp1, dst1, var, avg); + CALC_MSE_AVG_B(tmp2, dst2, var, avg); + CALC_MSE_AVG_B(tmp3, dst3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_16width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, + sec_pred, filter, height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_h_msa( + const uint8_t *src, int32_t src_stride, const 
uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 src10_r, src32_r, src21_r, src43_r; + v16u8 out, pred, ref = { 0 }; + v16u8 src2110, src4332, filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + v8u16 tmp0, tmp1; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, filt0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) 
{ + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1, out2, out3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + src0 = src4; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, + out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_16width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, + sec_pred, filter, height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t 
*filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 out, pred, ref = { 0 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 pred0, pred1, out0, out1; + v16u8 filt_hz, filt_vt, vec0; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, 
(v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); + AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v16u8 out0, out1, out2, out3; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + LD_UB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, + out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + 
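All of the width-specific helpers above share one contract: the return value is the block's sum of squared differences (SSE), and the signed sum of pixel differences is stored through *diff; the VARIANCE_*Wx*H macros then combine the two as sse - sum*sum/N (the *_avg_* variants first average the source against second_pred). A minimal scalar sketch of that contract, for reference only — scalar_sse_diff is a hypothetical name, not a function in this patch:

#include <stdint.h>

/* Scalar model of the sse/diff contract implemented by the MSA helpers. */
static uint32_t scalar_sse_diff(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t width, int32_t height, int32_t *diff) {
  uint32_t sse = 0;
  int32_t sum = 0;
  int32_t i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int32_t d = src[j] - ref[j]; /* signed pixel difference */
      sum += d;                          /* accumulated into *diff */
      sse += (uint32_t)(d * d);          /* accumulated into the return value */
    }
    src += src_stride;
    ref += ref_stride;
  }
  *diff = sum; /* VARIANCE_WxH(sse, sum, shift) == sse - (sum * sum >> shift) */
  return sse;
}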
+static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); +#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); +#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); + +#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); +#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); + +#define AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t aom_sub_pixel_variance##wd##x##ht##_msa( \ + const uint8_t *src, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sse) { \ + int32_t diff; \ + uint32_t var; \ + const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \ + src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_sse_diff_##wd##width_v_msa( \ + src, src_stride, ref, ref_stride, v_filter, ht, &diff); \ + } \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_sse_diff_##wd##width_h_msa( \ + src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + var = aom_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \ + sse); \ + } \ + } \ 
+ \ + return var; \ + } + +/* clang-format off */ +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64) +/* clang-format on */ + +#define AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t aom_sub_pixel_avg_variance##wd##x##ht##_msa( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, ht, &diff); \ + } \ + } \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +/* clang-format off */ +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8) + +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16) + +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32) + +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32) +/* clang-format on */ + +uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + int32_t xoffset, int32_t yoffset, + const uint8_t *ref_ptr, + int32_t ref_stride, uint32_t *sse, + const uint8_t *sec_pred) { + int32_t diff; + const uint8_t *h_filter = bilinear_filters_2t[xoffset]; + const uint8_t *v_filter = bilinear_filters_2t[yoffset]; + + if (yoffset) { + if (xoffset) { + *sse = sub_pixel_avg_sse_diff_32width_hv_msa( + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, + v_filter, 64, &diff); + } else { + *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr, + ref_stride, sec_pred, + v_filter, 64, &diff); + } + } else { + if (xoffset) { + *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr, + ref_stride, sec_pred, + h_filter, 64, &diff); + } else { + *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride, + sec_pred, &diff); + } + } + + return VARIANCE_32Wx64H(*sse, diff); +} + +#define AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ + uint32_t aom_sub_pixel_avg_variance64x##ht##_msa( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t 
*h_filter = bilinear_filters_2t[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_64width_v_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_64width_h_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, &diff); \ + } \ + } \ + \ + return VARIANCE_64Wx##ht##H(*sse, diff); \ + } + +/* clang-format off */ +AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32) +AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64) +/* clang-format on */ diff --git a/libs/libaom/src/aom_dsp/mips/subtract_msa.c b/libs/libaom/src/aom_dsp/mips/subtract_msa.c new file mode 100644 index 000000000..bfed773ac --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/subtract_msa.c @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/mips/macros_msa.h" + +static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + uint32_t src0, src1, src2, src3; + uint32_t pred0, pred1, pred2, pred3; + v16i8 src = { 0 }; + v16i8 pred = { 0 }; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + LW4(src_ptr, src_stride, src0, src1, src2, src3); + LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3); + INSERT_W4_SB(src0, src1, src2, src3, src); + INSERT_W4_SB(pred0, pred1, pred2, pred3, pred); + ILVRL_B2_UB(src, pred, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride)); +} + +static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + uint32_t loop_cnt; + uint64_t src0, src1, pred0, pred1; + v16i8 src = { 0 }; + v16i8 pred = { 0 }; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 4; loop_cnt--;) { + LD2(src_ptr, src_stride, src0, src1); + src_ptr += (2 * src_stride); + LD2(pred_ptr, pred_stride, pred0, pred1); + pred_ptr += (2 * pred_stride); + + INSERT_D2_SB(src0, src1, src); + INSERT_D2_SB(pred0, pred1, pred); + ILVRL_B2_UB(src, pred, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff_ptr, diff_stride); + diff_ptr += (2 * diff_stride); + } +} + +static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + int8_t count; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (count = 2; 
count--;) { + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6, + pred7); + pred += (8 * pred_stride); + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + } +} + +static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 8; loop_cnt--;) { + LD_SB2(src, 16, src0, src1); + src += src_stride; + LD_SB2(src, 16, src2, src3); + src += src_stride; + LD_SB2(src, 16, src4, src5); + src += src_stride; + LD_SB2(src, 16, src6, src7); + src += src_stride; + + LD_SB2(pred, 16, pred0, pred1); + pred += pred_stride; + LD_SB2(pred, 16, pred2, pred3); + pred += pred_stride; + LD_SB2(pred, 16, pred4, pred5); + pred += pred_stride; + LD_SB2(pred, 16, pred6, pred7); + pred += pred_stride; + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + } +} + +static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, 
src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 32; loop_cnt--;) { + LD_SB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_SB4(src, 16, src4, src5, src6, src7); + src += src_stride; + + LD_SB4(pred, 16, pred0, pred1, pred2, pred3); + pred += pred_stride; + LD_SB4(pred, 16, pred4, pred5, pred6, pred7); + pred += pred_stride; + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 32, 8); + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 48, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 32, 8); + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 48, 8); + diff += diff_stride; + } +} + +void aom_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + if (rows == cols) { + switch (rows) { + case 4: + sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 8: + sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 16: + sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 32: + sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 64: + sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + default: + aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } + } else { + aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + } +} diff --git a/libs/libaom/src/aom_dsp/mips/variance_msa.c b/libs/libaom/src/aom_dsp/mips/variance_msa.c new file mode 100644 index 000000000..065c09ac5 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/variance_msa.c @@ -0,0 +1,633 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/mips/macros_msa.h" + +#define CALC_MSE_B(src, ref, var) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + } + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + \ + sub += res_l0_m + res_l1_m; \ + } + +#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + sse - (((int64_t)diff * diff) >> shift) + +static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + int32_t ht_cnt; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src, ref; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t 
ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + 
v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v8i16 avg2 = { 0 }; + v8i16 avg3 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 32; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + vec += __msa_hadd_s_w(avg2, avg2); + vec += __msa_hadd_s_w(avg3, avg3); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t get_mb_ss_msa(const int16_t *src) { + uint32_t sum, cnt; + v8i16 src0, src1, src2, src3; + v4i32 src0_l, src1_l, src2_l, src3_l; + v4i32 src0_r, src1_r, src2_r, src3_r; + v2i64 sq_src_l = { 0 }; + v2i64 sq_src_r = { 0 }; + + for (cnt = 8; cnt--;) { + LD_SH4(src, 8, src0, src1, src2, src3); + src += 4 * 8; + + UNPCK_SH_SW(src0, src0_l, src0_r); + UNPCK_SH_SW(src1, src1_l, src1_r); + UNPCK_SH_SW(src2, src2_l, src2_r); + UNPCK_SH_SW(src3, src3_l, src3_r); + + DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r); + } + + sq_src_l += __msa_splati_d(sq_src_l, 1); + sq_src_r += __msa_splati_d(sq_src_r, 1); + + sum = __msa_copy_s_d(sq_src_l, 0); + sum += __msa_copy_s_d(sq_src_r, 0); + + return sum; +} + +static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + CALC_MSE_B(src, ref, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src, ref; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); 
+ ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v4i32 var = { 0 }; + + for (ht_cnt = height >> 1; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src2, ref2, var); + CALC_MSE_B(src1, ref1, var); + CALC_MSE_B(src3, ref3, var); + + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src2, ref2, var); + CALC_MSE_B(src1, ref1, var); + CALC_MSE_B(src3, ref3, var); + } + + return HADD_SW_S32(var); +} + +uint32_t aom_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride) { + uint32_t err = 0; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16i8 src = { 0 }; + v16i8 ref = { 0 }; + v16u8 src_vec0, src_vec1; + v8i16 diff0, diff1; + v4i32 err0 = { 0 }; + v4i32 err1 = { 0 }; + + LW4(src_ptr, src_stride, src0, src1, src2, src3); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_SB(src0, src1, src2, src3, src); + INSERT_W4_SB(ref0, ref1, ref2, ref3, ref); + ILVRL_B2_UB(src, ref, src_vec0, src_vec1); + HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1); + DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1); + err = HADD_SW_S32(err0); + err += HADD_SW_S32(err1); + + return err; +} + +#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); +#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); +#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define 
VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); + +#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); +#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); + +#define AOM_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t aom_variance##wd##x##ht##_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, uint32_t *sse) { \ + int32_t diff; \ + \ + *sse = \ + sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +/* clang-format off */ +AOM_VARIANCE_WDXHT_MSA(4, 4) +AOM_VARIANCE_WDXHT_MSA(4, 8) + +AOM_VARIANCE_WDXHT_MSA(8, 4) +AOM_VARIANCE_WDXHT_MSA(8, 8) +AOM_VARIANCE_WDXHT_MSA(8, 16) + +AOM_VARIANCE_WDXHT_MSA(16, 8) +AOM_VARIANCE_WDXHT_MSA(16, 16) +AOM_VARIANCE_WDXHT_MSA(16, 32) + +AOM_VARIANCE_WDXHT_MSA(32, 16) +AOM_VARIANCE_WDXHT_MSA(32, 32) +/* clang-format on */ + +uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_32Wx64H(*sse, diff); +} + +uint32_t aom_variance64x32_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx32H(*sse, diff); +} + +uint32_t aom_variance64x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx64H(*sse, diff); +} + +uint32_t aom_mse8x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse) { + *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8); + + return *sse; +} + +uint32_t aom_mse8x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + +uint32_t aom_mse16x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8); + + return *sse; +} + +uint32_t aom_mse16x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + +void aom_get8x8var_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum); +} + +void aom_get16x16var_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum); +} + +uint32_t aom_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); } diff --git a/libs/libaom/src/aom_dsp/noise_model.c b/libs/libaom/src/aom_dsp/noise_model.c new file mode 100644 index 000000000..c7a0003a8 --- /dev/null +++ 
b/libs/libaom/src/aom_dsp/noise_model.c
@@ -0,0 +1,1654 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/common.h"
+#include "av1/encoder/mathutils.h"
+
+#define kLowPolyNumParams 3
+
+static const int kMaxLag = 4;
+
+// Defines a function that can be used to obtain the mean of a block for the
+// provided data type (uint8_t, or uint16_t)
+#define GET_BLOCK_MEAN(INT_TYPE, suffix)                                    \
+  static double get_block_mean_##suffix(const INT_TYPE *data, int w, int h, \
+                                        int stride, int x_o, int y_o,       \
+                                        int block_size) {                   \
+    const int max_h = AOMMIN(h - y_o, block_size);                          \
+    const int max_w = AOMMIN(w - x_o, block_size);                          \
+    double block_mean = 0;                                                  \
+    for (int y = 0; y < max_h; ++y) {                                       \
+      for (int x = 0; x < max_w; ++x) {                                     \
+        block_mean += data[(y_o + y) * stride + x_o + x];                   \
+      }                                                                     \
+    }                                                                       \
+    return block_mean / (max_w * max_h);                                    \
+  }
+
+GET_BLOCK_MEAN(uint8_t, lowbd);
+GET_BLOCK_MEAN(uint16_t, highbd);
+
+static INLINE double get_block_mean(const uint8_t *data, int w, int h,
+                                    int stride, int x_o, int y_o,
+                                    int block_size, int use_highbd) {
+  if (use_highbd)
+    return get_block_mean_highbd((const uint16_t *)data, w, h, stride, x_o,
+                                 y_o, block_size);
+  return get_block_mean_lowbd(data, w, h, stride, x_o, y_o, block_size);
+}
+
+// Defines a function that can be used to obtain the variance of a block
+// for the provided data type (uint8_t, or uint16_t)
+#define GET_NOISE_VAR(INT_TYPE, suffix)                                  \
+  static double get_noise_var_##suffix(                                 \
+      const INT_TYPE *data, const INT_TYPE *denoised, int stride, int w, \
+      int h, int x_o, int y_o, int block_size_x, int block_size_y) {     \
+    const int max_h = AOMMIN(h - y_o, block_size_y);                     \
+    const int max_w = AOMMIN(w - x_o, block_size_x);                     \
+    double noise_var = 0;                                                \
+    double noise_mean = 0;                                               \
+    for (int y = 0; y < max_h; ++y) {                                    \
+      for (int x = 0; x < max_w; ++x) {                                  \
+        double noise = (double)data[(y_o + y) * stride + x_o + x] -      \
+                       denoised[(y_o + y) * stride + x_o + x];           \
+        noise_mean += noise;                                             \
+        noise_var += noise * noise;                                      \
+      }                                                                  \
+    }                                                                    \
+    noise_mean /= (max_w * max_h);                                       \
+    return noise_var / (max_w * max_h) - noise_mean * noise_mean;        \
+  }
+
+GET_NOISE_VAR(uint8_t, lowbd);
+GET_NOISE_VAR(uint16_t, highbd);
+
+static INLINE double get_noise_var(const uint8_t *data,
+                                   const uint8_t *denoised, int stride, int w,
+                                   int h, int x_o, int y_o, int block_size_x,
+                                   int block_size_y, int use_highbd) {
+  if (use_highbd)
+    return get_noise_var_highbd((const uint16_t *)data,
+                                (const uint16_t *)denoised, stride, w, h, x_o,
+                                y_o, block_size_x, block_size_y);
+  return get_noise_var_lowbd(data, denoised, stride, w, h, x_o, y_o,
+                             block_size_x, block_size_y);
+}
+
+static void equation_system_clear(aom_equation_system_t *eqns) {
+  const int n = eqns->n;
+  memset(eqns->A, 0, sizeof(*eqns->A) * n * n);
+  memset(eqns->x, 0, sizeof(*eqns->x) * n);
+  memset(eqns->b, 0, sizeof(*eqns->b) * n);
+}
+
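+// The aom_equation_system_t helpers here maintain the normal equations of a
+// linear least-squares fit: for observations y ~= sum_i c_i * x_i, every
+// sample contributes A += x * x^T and b += y * x, and the coefficients come
+// from solving A * c = b. A minimal usage sketch for n = 2 (illustrative
+// only; the sample loop is pseudocode, not part of this file):
+//
+//   aom_equation_system_t eqns;
+//   equation_system_init(&eqns, 2);  // fit y ~= c0 * x0 + c1 * x1
+//   for each sample (x0, x1, y) {
+//     eqns.A[0] += x0 * x0;  eqns.A[1] += x0 * x1;
+//     eqns.A[2] += x1 * x0;  eqns.A[3] += x1 * x1;
+//     eqns.b[0] += x0 * y;   eqns.b[1] += x1 * y;
+//   }
+//   equation_system_solve(&eqns);    // coefficients land in eqns.x
+//   equation_system_free(&eqns);
+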
+static void equation_system_copy(aom_equation_system_t *dst,
+                                 const aom_equation_system_t *src) {
+  const int n = dst->n;
+  memcpy(dst->A, src->A, sizeof(*dst->A) * n * n);
+  memcpy(dst->x, src->x, sizeof(*dst->x) * n);
+  memcpy(dst->b, src->b, sizeof(*dst->b) * n);
+}
+
+static int equation_system_init(aom_equation_system_t *eqns, int n) {
+  eqns->A = (double *)aom_malloc(sizeof(*eqns->A) * n * n);
+  eqns->b = (double *)aom_malloc(sizeof(*eqns->b) * n);
+  eqns->x = (double *)aom_malloc(sizeof(*eqns->x) * n);
+  eqns->n = n;
+  if (!eqns->A || !eqns->b || !eqns->x) {
+    fprintf(stderr, "Failed to allocate system of equations of size %d\n", n);
+    aom_free(eqns->A);
+    aom_free(eqns->b);
+    aom_free(eqns->x);
+    memset(eqns, 0, sizeof(*eqns));
+    return 0;
+  }
+  equation_system_clear(eqns);
+  return 1;
+}
+
+static int equation_system_solve(aom_equation_system_t *eqns) {
+  const int n = eqns->n;
+  double *b = (double *)aom_malloc(sizeof(*b) * n);
+  double *A = (double *)aom_malloc(sizeof(*A) * n * n);
+  int ret = 0;
+  if (A == NULL || b == NULL) {
+    fprintf(stderr, "Unable to allocate temp values of size %dx%d\n", n, n);
+    aom_free(b);
+    aom_free(A);
+    return 0;
+  }
+  memcpy(A, eqns->A, sizeof(*eqns->A) * n * n);
+  memcpy(b, eqns->b, sizeof(*eqns->b) * n);
+  ret = linsolve(n, A, eqns->n, b, eqns->x);
+  aom_free(b);
+  aom_free(A);
+
+  if (ret == 0) {
+    return 0;
+  }
+  return 1;
+}
+
+static void equation_system_add(aom_equation_system_t *dest,
+                                aom_equation_system_t *src) {
+  const int n = dest->n;
+  int i, j;
+  for (i = 0; i < n; ++i) {
+    for (j = 0; j < n; ++j) {
+      dest->A[i * n + j] += src->A[i * n + j];
+    }
+    dest->b[i] += src->b[i];
+  }
+}
+
+static void equation_system_free(aom_equation_system_t *eqns) {
+  if (!eqns) return;
+  aom_free(eqns->A);
+  aom_free(eqns->b);
+  aom_free(eqns->x);
+  memset(eqns, 0, sizeof(*eqns));
+}
+
+static void noise_strength_solver_clear(aom_noise_strength_solver_t *solver) {
+  equation_system_clear(&solver->eqns);
+  solver->num_equations = 0;
+  solver->total = 0;
+}
+
+static void noise_strength_solver_add(aom_noise_strength_solver_t *dest,
+                                      aom_noise_strength_solver_t *src) {
+  equation_system_add(&dest->eqns, &src->eqns);
+  dest->num_equations += src->num_equations;
+  dest->total += src->total;
+}
+
+// Return the number of coefficients required for the given parameters
+static int num_coeffs(const aom_noise_model_params_t params) {
+  const int n = 2 * params.lag + 1;
+  switch (params.shape) {
+    case AOM_NOISE_SHAPE_DIAMOND: return params.lag * (params.lag + 1);
+    case AOM_NOISE_SHAPE_SQUARE: return (n * n) / 2;
+  }
+  return 0;
+}
+
+static int noise_state_init(aom_noise_state_t *state, int n, int bit_depth) {
+  const int kNumBins = 20;
+  if (!equation_system_init(&state->eqns, n)) {
+    fprintf(stderr, "Failed to initialize noise state with size %d\n", n);
+    return 0;
+  }
+  state->ar_gain = 1.0;
+  state->num_observations = 0;
+  return aom_noise_strength_solver_init(&state->strength_solver, kNumBins,
+                                        bit_depth);
+}
+
+static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) {
+  const double kTolerance = 1e-6;
+  const int last = eqns->n - 1;
+  // Set all of the AR coefficients to zero, but try to solve for correlation
+  // with the luma channel
+  memset(eqns->x, 0, sizeof(*eqns->x) * eqns->n);
+  if (fabs(eqns->A[last * eqns->n + last]) > kTolerance) {
+    eqns->x[last] = eqns->b[last] / eqns->A[last * eqns->n + last];
+  }
+}
+
+int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) {
+  if (!lut) return 0;
+
lut->num_points = 0; + lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points)); + if (!lut->points) return 0; + lut->num_points = num_points; + memset(lut->points, 0, sizeof(*lut->points) * num_points); + return 1; +} + +void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut) { + if (!lut) return; + aom_free(lut->points); + memset(lut, 0, sizeof(*lut)); +} + +double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut, + double x) { + int i = 0; + // Constant extrapolation for x < x_0. + if (x < lut->points[0][0]) return lut->points[0][1]; + for (i = 0; i < lut->num_points - 1; ++i) { + if (x >= lut->points[i][0] && x <= lut->points[i + 1][0]) { + const double a = + (x - lut->points[i][0]) / (lut->points[i + 1][0] - lut->points[i][0]); + return lut->points[i + 1][1] * a + lut->points[i][1] * (1.0 - a); + } + } + // Constant extrapolation for x > x_{n-1} + return lut->points[lut->num_points - 1][1]; +} + +static double noise_strength_solver_get_bin_index( + const aom_noise_strength_solver_t *solver, double value) { + const double val = + fclamp(value, solver->min_intensity, solver->max_intensity); + const double range = solver->max_intensity - solver->min_intensity; + return (solver->num_bins - 1) * (val - solver->min_intensity) / range; +} + +static double noise_strength_solver_get_value( + const aom_noise_strength_solver_t *solver, double x) { + const double bin = noise_strength_solver_get_bin_index(solver, x); + const int bin_i0 = (int)floor(bin); + const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1); + const double a = bin - bin_i0; + return (1.0 - a) * solver->eqns.x[bin_i0] + a * solver->eqns.x[bin_i1]; +} + +void aom_noise_strength_solver_add_measurement( + aom_noise_strength_solver_t *solver, double block_mean, double noise_std) { + const double bin = noise_strength_solver_get_bin_index(solver, block_mean); + const int bin_i0 = (int)floor(bin); + const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1); + const double a = bin - bin_i0; + const int n = solver->num_bins; + solver->eqns.A[bin_i0 * n + bin_i0] += (1.0 - a) * (1.0 - a); + solver->eqns.A[bin_i1 * n + bin_i0] += a * (1.0 - a); + solver->eqns.A[bin_i1 * n + bin_i1] += a * a; + solver->eqns.A[bin_i0 * n + bin_i1] += a * (1.0 - a); + solver->eqns.b[bin_i0] += (1.0 - a) * noise_std; + solver->eqns.b[bin_i1] += a * noise_std; + solver->total += noise_std; + solver->num_equations++; +} + +int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver) { + // Add regularization proportional to the number of constraints + const int n = solver->num_bins; + const double kAlpha = 2.0 * (double)(solver->num_equations) / n; + int result = 0; + double mean = 0; + + // Do this in a non-destructive manner so it is not confusing to the caller + double *old_A = solver->eqns.A; + double *A = (double *)aom_malloc(sizeof(*A) * n * n); + if (!A) { + fprintf(stderr, "Unable to allocate copy of A\n"); + return 0; + } + memcpy(A, old_A, sizeof(*A) * n * n); + + for (int i = 0; i < n; ++i) { + const int i_lo = AOMMAX(0, i - 1); + const int i_hi = AOMMIN(n - 1, i + 1); + A[i * n + i_lo] -= kAlpha; + A[i * n + i] += 2 * kAlpha; + A[i * n + i_hi] -= kAlpha; + } + + // Small regularization to give average noise strength + mean = solver->total / solver->num_equations; + for (int i = 0; i < n; ++i) { + A[i * n + i] += 1.0 / 8192.; + solver->eqns.b[i] += mean / 8192.; + } + solver->eqns.A = A; + result = equation_system_solve(&solver->eqns); + solver->eqns.A = old_A; + + aom_free(A); + return 
result;
+}
+
+int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
+                                   int num_bins, int bit_depth) {
+  if (!solver) return 0;
+  memset(solver, 0, sizeof(*solver));
+  solver->num_bins = num_bins;
+  solver->min_intensity = 0;
+  solver->max_intensity = (1 << bit_depth) - 1;
+  solver->total = 0;
+  solver->num_equations = 0;
+  return equation_system_init(&solver->eqns, num_bins);
+}
+
+void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver) {
+  if (!solver) return;
+  equation_system_free(&solver->eqns);
+}
+
+double aom_noise_strength_solver_get_center(
+    const aom_noise_strength_solver_t *solver, int i) {
+  const double range = solver->max_intensity - solver->min_intensity;
+  const int n = solver->num_bins;
+  return ((double)i) / (n - 1) * range + solver->min_intensity;
+}
+
+// Computes the residual if a point were to be removed from the lut. This is
+// calculated as the area between the output of the solver and the line segment
+// that would be formed between [x_{i - 1}, x_{i + 1}).
+static void update_piecewise_linear_residual(
+    const aom_noise_strength_solver_t *solver,
+    const aom_noise_strength_lut_t *lut, double *residual, int start, int end) {
+  const double dx = 255. / solver->num_bins;
+  for (int i = AOMMAX(start, 1); i < AOMMIN(end, lut->num_points - 1); ++i) {
+    const int lower = AOMMAX(0, (int)floor(noise_strength_solver_get_bin_index(
+                                    solver, lut->points[i - 1][0])));
+    const int upper = AOMMIN(solver->num_bins - 1,
+                             (int)ceil(noise_strength_solver_get_bin_index(
+                                 solver, lut->points[i + 1][0])));
+    double r = 0;
+    for (int j = lower; j <= upper; ++j) {
+      const double x = aom_noise_strength_solver_get_center(solver, j);
+      if (x < lut->points[i - 1][0]) continue;
+      if (x >= lut->points[i + 1][0]) continue;
+      const double y = solver->eqns.x[j];
+      const double a = (x - lut->points[i - 1][0]) /
+                       (lut->points[i + 1][0] - lut->points[i - 1][0]);
+      const double estimate_y =
+          lut->points[i - 1][1] * (1.0 - a) + lut->points[i + 1][1] * a;
+      r += fabs(y - estimate_y);
+    }
+    residual[i] = r * dx;
+  }
+}
+
+int aom_noise_strength_solver_fit_piecewise(
+    const aom_noise_strength_solver_t *solver, int max_output_points,
+    aom_noise_strength_lut_t *lut) {
+  // The tolerance is normalized to give consistent results between
+  // different bit-depths.
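+  // For example, at 8 bits max_intensity is 255 and the tolerance below works
+  // out to 0.00625; at 10 bits the absolute tolerance is about 4x larger, so
+  // the fit behaves the same relative to the wider intensity range.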
+ const double kTolerance = solver->max_intensity * 0.00625 / 255.0; + if (!aom_noise_strength_lut_init(lut, solver->num_bins)) { + fprintf(stderr, "Failed to init lut\n"); + return 0; + } + for (int i = 0; i < solver->num_bins; ++i) { + lut->points[i][0] = aom_noise_strength_solver_get_center(solver, i); + lut->points[i][1] = solver->eqns.x[i]; + } + if (max_output_points < 0) { + max_output_points = solver->num_bins; + } + + double *residual = aom_malloc(solver->num_bins * sizeof(*residual)); + memset(residual, 0, sizeof(*residual) * solver->num_bins); + + update_piecewise_linear_residual(solver, lut, residual, 0, solver->num_bins); + + // Greedily remove points if there are too many or if it doesn't hurt local + // approximation (never remove the end points) + while (lut->num_points > 2) { + int min_index = 1; + for (int j = 1; j < lut->num_points - 1; ++j) { + if (residual[j] < residual[min_index]) { + min_index = j; + } + } + const double dx = + lut->points[min_index + 1][0] - lut->points[min_index - 1][0]; + const double avg_residual = residual[min_index] / dx; + if (lut->num_points <= max_output_points && avg_residual > kTolerance) { + break; + } + + const int num_remaining = lut->num_points - min_index - 1; + memmove(lut->points + min_index, lut->points + min_index + 1, + sizeof(lut->points[0]) * num_remaining); + lut->num_points--; + + update_piecewise_linear_residual(solver, lut, residual, min_index - 1, + min_index + 1); + } + aom_free(residual); + return 1; +} + +int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder, + int block_size, int bit_depth, int use_highbd) { + const int n = block_size * block_size; + aom_equation_system_t eqns; + double *AtA_inv = 0; + double *A = 0; + int x = 0, y = 0, i = 0, j = 0; + block_finder->A = NULL; + block_finder->AtA_inv = NULL; + + if (!equation_system_init(&eqns, kLowPolyNumParams)) { + fprintf(stderr, "Failed to init equation system for block_size=%d\n", + block_size); + return 0; + } + + AtA_inv = (double *)aom_malloc(kLowPolyNumParams * kLowPolyNumParams * + sizeof(*AtA_inv)); + A = (double *)aom_malloc(kLowPolyNumParams * n * sizeof(*A)); + if (AtA_inv == NULL || A == NULL) { + fprintf(stderr, "Failed to alloc A or AtA_inv for block_size=%d\n", + block_size); + aom_free(AtA_inv); + aom_free(A); + equation_system_free(&eqns); + return 0; + } + + block_finder->A = A; + block_finder->AtA_inv = AtA_inv; + block_finder->block_size = block_size; + block_finder->normalization = (1 << bit_depth) - 1; + block_finder->use_highbd = use_highbd; + + for (y = 0; y < block_size; ++y) { + const double yd = ((double)y - block_size / 2.) / (block_size / 2.); + for (x = 0; x < block_size; ++x) { + const double xd = ((double)x - block_size / 2.) / (block_size / 2.); + const double coords[3] = { yd, xd, 1 }; + const int row = y * block_size + x; + A[kLowPolyNumParams * row + 0] = yd; + A[kLowPolyNumParams * row + 1] = xd; + A[kLowPolyNumParams * row + 2] = 1; + + for (i = 0; i < kLowPolyNumParams; ++i) { + for (j = 0; j < kLowPolyNumParams; ++j) { + eqns.A[kLowPolyNumParams * i + j] += coords[i] * coords[j]; + } + } + } + } + + // Lazy inverse using existing equation solver. 
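+  // Solving eqns.A * x = e_i for each standard basis vector e_i gives the
+  // i-th column of inverse(eqns.A), so the kLowPolyNumParams small solves
+  // below recover all of AtA_inv without a dedicated inversion routine. For
+  // example, the first pass sets eqns.b = {1, 0, 0} and the solution eqns.x
+  // becomes the first column of AtA_inv.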
+  for (i = 0; i < kLowPolyNumParams; ++i) {
+    memset(eqns.b, 0, sizeof(*eqns.b) * kLowPolyNumParams);
+    eqns.b[i] = 1;
+    equation_system_solve(&eqns);
+
+    for (j = 0; j < kLowPolyNumParams; ++j) {
+      AtA_inv[j * kLowPolyNumParams + i] = eqns.x[j];
+    }
+  }
+  equation_system_free(&eqns);
+  return 1;
+}
+
+void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder) {
+  if (!block_finder) return;
+  aom_free(block_finder->A);
+  aom_free(block_finder->AtA_inv);
+  memset(block_finder, 0, sizeof(*block_finder));
+}
+
+void aom_flat_block_finder_extract_block(
+    const aom_flat_block_finder_t *block_finder, const uint8_t *const data,
+    int w, int h, int stride, int offsx, int offsy, double *plane,
+    double *block) {
+  const int block_size = block_finder->block_size;
+  const int n = block_size * block_size;
+  const double *A = block_finder->A;
+  const double *AtA_inv = block_finder->AtA_inv;
+  double plane_coords[kLowPolyNumParams];
+  double AtA_inv_b[kLowPolyNumParams];
+  int xi, yi, i;
+
+  if (block_finder->use_highbd) {
+    const uint16_t *const data16 = (const uint16_t *const)data;
+    for (yi = 0; yi < block_size; ++yi) {
+      const int y = clamp(offsy + yi, 0, h - 1);
+      for (xi = 0; xi < block_size; ++xi) {
+        const int x = clamp(offsx + xi, 0, w - 1);
+        block[yi * block_size + xi] =
+            ((double)data16[y * stride + x]) / block_finder->normalization;
+      }
+    }
+  } else {
+    for (yi = 0; yi < block_size; ++yi) {
+      const int y = clamp(offsy + yi, 0, h - 1);
+      for (xi = 0; xi < block_size; ++xi) {
+        const int x = clamp(offsx + xi, 0, w - 1);
+        block[yi * block_size + xi] =
+            ((double)data[y * stride + x]) / block_finder->normalization;
+      }
+    }
+  }
+  multiply_mat(block, A, AtA_inv_b, 1, n, kLowPolyNumParams);
+  multiply_mat(AtA_inv, AtA_inv_b, plane_coords, kLowPolyNumParams,
+               kLowPolyNumParams, 1);
+  multiply_mat(A, plane_coords, plane, n, kLowPolyNumParams, 1);
+
+  for (i = 0; i < n; ++i) {
+    block[i] -= plane[i];
+  }
+}
+
+typedef struct {
+  int index;
+  float score;
+} index_and_score_t;
+
+static int compare_scores(const void *a, const void *b) {
+  const float diff =
+      ((index_and_score_t *)a)->score - ((index_and_score_t *)b)->score;
+  if (diff < 0)
+    return -1;
+  else if (diff > 0)
+    return 1;
+  return 0;
+}
+
+int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
+                              const uint8_t *const data, int w, int h,
+                              int stride, uint8_t *flat_blocks) {
+  // The gradient-based features used in this code are based on:
+  // A. Kokaram, D. Kelly, H. Denman and A. Crawford, "Measuring noise
+  // correlation for improved video denoising," 2012 19th IEEE International
+  // Conference on Image Processing (ICIP).
+  // The thresholds are more lenient to allow for correct grain modeling
+  // in extreme cases.
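+  //
+  // For the 2x2 gradient covariance matrix G = [Gxx, Gxy; Gxy, Gyy] built
+  // below, the eigenvalues follow from the characteristic polynomial:
+  //   e1, e2 = (trace +/- sqrt(trace^2 - 4 * det)) / 2
+  // with trace = Gxx + Gyy and det = Gxx * Gyy - Gxy * Gxy. A flat block has
+  // little gradient energy in every direction, so both eigenvalues are small
+  // and the trace, the spectral norm (e1) and the ratio e1 / e2 can all be
+  // thresholded.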
+ const int block_size = block_finder->block_size; + const int n = block_size * block_size; + const double kTraceThreshold = 0.15 / (32 * 32); + const double kRatioThreshold = 1.25; + const double kNormThreshold = 0.08 / (32 * 32); + const double kVarThreshold = 0.005 / (double)n; + const int num_blocks_w = (w + block_size - 1) / block_size; + const int num_blocks_h = (h + block_size - 1) / block_size; + int num_flat = 0; + int bx = 0, by = 0; + double *plane = (double *)aom_malloc(n * sizeof(*plane)); + double *block = (double *)aom_malloc(n * sizeof(*block)); + index_and_score_t *scores = (index_and_score_t *)aom_malloc( + num_blocks_w * num_blocks_h * sizeof(*scores)); + if (plane == NULL || block == NULL || scores == NULL) { + fprintf(stderr, "Failed to allocate memory for block of size %d\n", n); + aom_free(plane); + aom_free(block); + aom_free(scores); + return -1; + } + +#ifdef NOISE_MODEL_LOG_SCORE + fprintf(stderr, "score = ["); +#endif + for (by = 0; by < num_blocks_h; ++by) { + for (bx = 0; bx < num_blocks_w; ++bx) { + // Compute gradient covariance matrix. + double Gxx = 0, Gxy = 0, Gyy = 0; + double var = 0; + double mean = 0; + int xi, yi; + aom_flat_block_finder_extract_block(block_finder, data, w, h, stride, + bx * block_size, by * block_size, + plane, block); + + for (yi = 1; yi < block_size - 1; ++yi) { + for (xi = 1; xi < block_size - 1; ++xi) { + const double gx = (block[yi * block_size + xi + 1] - + block[yi * block_size + xi - 1]) / + 2; + const double gy = (block[yi * block_size + xi + block_size] - + block[yi * block_size + xi - block_size]) / + 2; + Gxx += gx * gx; + Gxy += gx * gy; + Gyy += gy * gy; + + mean += block[yi * block_size + xi]; + var += block[yi * block_size + xi] * block[yi * block_size + xi]; + } + } + mean /= (block_size - 2) * (block_size - 2); + + // Normalize gradients by block_size. + Gxx /= ((block_size - 2) * (block_size - 2)); + Gxy /= ((block_size - 2) * (block_size - 2)); + Gyy /= ((block_size - 2) * (block_size - 2)); + var = var / ((block_size - 2) * (block_size - 2)) - mean * mean; + + { + const double trace = Gxx + Gyy; + const double det = Gxx * Gyy - Gxy * Gxy; + const double e1 = (trace + sqrt(trace * trace - 4 * det)) / 2.; + const double e2 = (trace - sqrt(trace * trace - 4 * det)) / 2.; + const double norm = e1; // Spectral norm + const double ratio = (e1 / AOMMAX(e2, 1e-6)); + const int is_flat = (trace < kTraceThreshold) && + (ratio < kRatioThreshold) && + (norm < kNormThreshold) && (var > kVarThreshold); + // The following weights are used to combine the above features to give + // a sigmoid score for flatness. If the input was normalized to [0,100] + // the magnitude of these values would be close to 1 (e.g., weights + // corresponding to variance would be a factor of 10000x smaller). + // The weights are given in the following order: + // [{var}, {ratio}, {trace}, {norm}, offset] + // with one of the most discriminative being simply the variance. + const double weights[5] = { -6682, -0.2056, 13087, -12434, 2.5694 }; + double sum_weights = weights[0] * var + weights[1] * ratio + + weights[2] * trace + weights[3] * norm + + weights[4]; + // clamp the value to [-25.0, 100.0] to prevent overflow + sum_weights = fclamp(sum_weights, -25.0, 100.0); + const float score = (float)(1.0 / (1 + exp(-sum_weights))); + flat_blocks[by * num_blocks_w + bx] = is_flat ? 255 : 0; + scores[by * num_blocks_w + bx].score = var > kVarThreshold ? 
score : 0;
+        scores[by * num_blocks_w + bx].index = by * num_blocks_w + bx;
+#ifdef NOISE_MODEL_LOG_SCORE
+        fprintf(stderr, "%g %g %g %g %g %d ", score, var, ratio, trace, norm,
+                is_flat);
+#endif
+        num_flat += is_flat;
+      }
+    }
+#ifdef NOISE_MODEL_LOG_SCORE
+    fprintf(stderr, "\n");
+#endif
+  }
+#ifdef NOISE_MODEL_LOG_SCORE
+  fprintf(stderr, "];\n");
+#endif
+  // Find the top-scored blocks (most likely to be flat) and set the flat
+  // blocks to be the union of the thresholded results and the top 10th
+  // percentile of the scored results.
+  qsort(scores, num_blocks_w * num_blocks_h, sizeof(*scores), &compare_scores);
+  const int top_nth_percentile = num_blocks_w * num_blocks_h * 90 / 100;
+  const float score_threshold = scores[top_nth_percentile].score;
+  for (int i = 0; i < num_blocks_w * num_blocks_h; ++i) {
+    if (scores[i].score >= score_threshold) {
+      num_flat += flat_blocks[scores[i].index] == 0;
+      flat_blocks[scores[i].index] |= 1;
+    }
+  }
+  aom_free(block);
+  aom_free(plane);
+  aom_free(scores);
+  return num_flat;
+}
+
+int aom_noise_model_init(aom_noise_model_t *model,
+                         const aom_noise_model_params_t params) {
+  const int n = num_coeffs(params);
+  const int lag = params.lag;
+  const int bit_depth = params.bit_depth;
+  int x = 0, y = 0, i = 0, c = 0;
+
+  memset(model, 0, sizeof(*model));
+  if (params.lag < 1) {
+    fprintf(stderr, "Invalid noise param: lag = %d must be >= 1\n", params.lag);
+    return 0;
+  }
+  if (params.lag > kMaxLag) {
+    fprintf(stderr, "Invalid noise param: lag = %d must be <= %d\n", params.lag,
+            kMaxLag);
+    return 0;
+  }
+
+  memcpy(&model->params, &params, sizeof(params));
+  for (c = 0; c < 3; ++c) {
+    if (!noise_state_init(&model->combined_state[c], n + (c > 0), bit_depth)) {
+      fprintf(stderr, "Failed to allocate noise state for channel %d\n", c);
+      aom_noise_model_free(model);
+      return 0;
+    }
+    if (!noise_state_init(&model->latest_state[c], n + (c > 0), bit_depth)) {
+      fprintf(stderr, "Failed to allocate noise state for channel %d\n", c);
+      aom_noise_model_free(model);
+      return 0;
+    }
+  }
+  model->n = n;
+  model->coords = (int(*)[2])aom_malloc(sizeof(*model->coords) * n);
+
+  for (y = -lag; y <= 0; ++y) {
+    const int max_x = y == 0 ? -1 : lag;
+    for (x = -lag; x <= max_x; ++x) {
+      switch (params.shape) {
+        case AOM_NOISE_SHAPE_DIAMOND:
+          if (abs(x) <= y + lag) {
+            model->coords[i][0] = x;
+            model->coords[i][1] = y;
+            ++i;
+          }
+          break;
+        case AOM_NOISE_SHAPE_SQUARE:
+          model->coords[i][0] = x;
+          model->coords[i][1] = y;
+          ++i;
+          break;
+        default:
+          fprintf(stderr, "Invalid shape\n");
+          aom_noise_model_free(model);
+          return 0;
+      }
+    }
+  }
+  assert(i == n);
+  return 1;
+}
+
+void aom_noise_model_free(aom_noise_model_t *model) {
+  int c = 0;
+  if (!model) return;
+
+  aom_free(model->coords);
+  for (c = 0; c < 3; ++c) {
+    equation_system_free(&model->latest_state[c].eqns);
+    equation_system_free(&model->combined_state[c].eqns);
+
+    equation_system_free(&model->latest_state[c].strength_solver.eqns);
+    equation_system_free(&model->combined_state[c].strength_solver.eqns);
+  }
+  memset(model, 0, sizeof(*model));
+}
+
+// Extracts the neighborhood defined by coords around point (x, y) from
+// the difference between the data and denoised images. Also extracts the
+// entry (possibly downsampled) for (x, y) in the alt_data (e.g., luma).
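+// Each call fills one least-squares row for the AR model
+//   residual(x, y) ~= sum_i coeffs[i] * residual(x + coords[i][0],
+//                                                y + coords[i][1])
+// (plus, when fitting chroma, one extra term for the co-located luma
+// residual) and returns residual(x, y) itself as the right-hand-side value
+// for that row.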
+#define EXTRACT_AR_ROW(INT_TYPE, suffix) \ + static double extract_ar_row_##suffix( \ + int(*coords)[2], int num_coords, const INT_TYPE *const data, \ + const INT_TYPE *const denoised, int stride, int sub_log2[2], \ + const INT_TYPE *const alt_data, const INT_TYPE *const alt_denoised, \ + int alt_stride, int x, int y, double *buffer) { \ + for (int i = 0; i < num_coords; ++i) { \ + const int x_i = x + coords[i][0], y_i = y + coords[i][1]; \ + buffer[i] = \ + (double)data[y_i * stride + x_i] - denoised[y_i * stride + x_i]; \ + } \ + const double val = \ + (double)data[y * stride + x] - denoised[y * stride + x]; \ + \ + if (alt_data && alt_denoised) { \ + double avg_data = 0, avg_denoised = 0; \ + int num_samples = 0; \ + for (int dy_i = 0; dy_i < (1 << sub_log2[1]); dy_i++) { \ + const int y_up = (y << sub_log2[1]) + dy_i; \ + for (int dx_i = 0; dx_i < (1 << sub_log2[0]); dx_i++) { \ + const int x_up = (x << sub_log2[0]) + dx_i; \ + avg_data += alt_data[y_up * alt_stride + x_up]; \ + avg_denoised += alt_denoised[y_up * alt_stride + x_up]; \ + num_samples++; \ + } \ + } \ + buffer[num_coords] = (avg_data - avg_denoised) / num_samples; \ + } \ + return val; \ + } + +EXTRACT_AR_ROW(uint8_t, lowbd); +EXTRACT_AR_ROW(uint16_t, highbd); + +static int add_block_observations( + aom_noise_model_t *noise_model, int c, const uint8_t *const data, + const uint8_t *const denoised, int w, int h, int stride, int sub_log2[2], + const uint8_t *const alt_data, const uint8_t *const alt_denoised, + int alt_stride, const uint8_t *const flat_blocks, int block_size, + int num_blocks_w, int num_blocks_h) { + const int lag = noise_model->params.lag; + const int num_coords = noise_model->n; + const double normalization = (1 << noise_model->params.bit_depth) - 1; + double *A = noise_model->latest_state[c].eqns.A; + double *b = noise_model->latest_state[c].eqns.b; + double *buffer = (double *)aom_malloc(sizeof(*buffer) * (num_coords + 1)); + const int n = noise_model->latest_state[c].eqns.n; + + if (!buffer) { + fprintf(stderr, "Unable to allocate buffer of size %d\n", num_coords + 1); + return 0; + } + for (int by = 0; by < num_blocks_h; ++by) { + const int y_o = by * (block_size >> sub_log2[1]); + for (int bx = 0; bx < num_blocks_w; ++bx) { + const int x_o = bx * (block_size >> sub_log2[0]); + if (!flat_blocks[by * num_blocks_w + bx]) { + continue; + } + int y_start = + (by > 0 && flat_blocks[(by - 1) * num_blocks_w + bx]) ? 0 : lag; + int x_start = + (bx > 0 && flat_blocks[by * num_blocks_w + bx - 1]) ? 0 : lag; + int y_end = AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]), + block_size >> sub_log2[1]); + int x_end = AOMMIN( + (w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]) - lag, + (bx + 1 < num_blocks_w && flat_blocks[by * num_blocks_w + bx + 1]) + ? (block_size >> sub_log2[0]) + : ((block_size >> sub_log2[0]) - lag)); + for (int y = y_start; y < y_end; ++y) { + for (int x = x_start; x < x_end; ++x) { + const double val = + noise_model->params.use_highbd + ? 
extract_ar_row_highbd(noise_model->coords, num_coords, + (const uint16_t *const)data, + (const uint16_t *const)denoised, + stride, sub_log2, + (const uint16_t *const)alt_data, + (const uint16_t *const)alt_denoised, + alt_stride, x + x_o, y + y_o, buffer) + : extract_ar_row_lowbd(noise_model->coords, num_coords, data, + denoised, stride, sub_log2, alt_data, + alt_denoised, alt_stride, x + x_o, + y + y_o, buffer); + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + A[i * n + j] += + (buffer[i] * buffer[j]) / (normalization * normalization); + } + b[i] += (buffer[i] * val) / (normalization * normalization); + } + noise_model->latest_state[c].num_observations++; + } + } + } + } + aom_free(buffer); + return 1; +} + +static void add_noise_std_observations( + aom_noise_model_t *noise_model, int c, const double *coeffs, + const uint8_t *const data, const uint8_t *const denoised, int w, int h, + int stride, int sub_log2[2], const uint8_t *const alt_data, int alt_stride, + const uint8_t *const flat_blocks, int block_size, int num_blocks_w, + int num_blocks_h) { + const int num_coords = noise_model->n; + aom_noise_strength_solver_t *noise_strength_solver = + &noise_model->latest_state[c].strength_solver; + + const aom_noise_strength_solver_t *noise_strength_luma = + &noise_model->latest_state[0].strength_solver; + const double luma_gain = noise_model->latest_state[0].ar_gain; + const double noise_gain = noise_model->latest_state[c].ar_gain; + for (int by = 0; by < num_blocks_h; ++by) { + const int y_o = by * (block_size >> sub_log2[1]); + for (int bx = 0; bx < num_blocks_w; ++bx) { + const int x_o = bx * (block_size >> sub_log2[0]); + if (!flat_blocks[by * num_blocks_w + bx]) { + continue; + } + const int num_samples_h = + AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]), + block_size >> sub_log2[1]); + const int num_samples_w = + AOMMIN((w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]), + (block_size >> sub_log2[0])); + // Make sure that we have a reasonable amount of samples to consider the + // block + if (num_samples_w * num_samples_h > block_size) { + const double block_mean = get_block_mean( + alt_data ? alt_data : data, w, h, alt_data ? alt_stride : stride, + x_o << sub_log2[0], y_o << sub_log2[1], block_size, + noise_model->params.use_highbd); + const double noise_var = get_noise_var( + data, denoised, stride, w >> sub_log2[0], h >> sub_log2[1], x_o, + y_o, block_size >> sub_log2[0], block_size >> sub_log2[1], + noise_model->params.use_highbd); + // We want to remove the part of the noise that came from being + // correlated with luma. Note that the noise solver for luma must + // have already been run. + const double luma_strength = + c > 0 ? luma_gain * noise_strength_solver_get_value( + noise_strength_luma, block_mean) + : 0; + const double corr = c > 0 ? coeffs[num_coords] : 0; + // Chroma noise: + // N(0, noise_var) = N(0, uncorr_var) + corr * N(0, luma_strength^2) + // The uncorrelated component: + // uncorr_var = noise_var - (corr * luma_strength)^2 + // But don't allow fully correlated noise (hence the max), since the + // synthesis cannot model it. + const double uncorr_std = sqrt( + AOMMAX(noise_var / 16, noise_var - pow(corr * luma_strength, 2))); + // After we've removed correlation with luma, undo the gain that will + // come from running the IIR filter. 
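+        // The synthesized grain is driving noise passed through the AR (IIR)
+        // filter, which scales its standard deviation by roughly ar_gain; so
+        // dividing the measured strength by noise_gain here means synthesis
+        // ends up reproducing approximately uncorr_std.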
+        const double adjusted_strength = uncorr_std / noise_gain;
+        aom_noise_strength_solver_add_measurement(
+            noise_strength_solver, block_mean, adjusted_strength);
+      }
+    }
+  }
+}
+
+// Return true if the noise estimate appears to be different from the combined
+// (multi-frame) estimate. The difference is measured by checking whether the
+// AR coefficients have diverged (using a threshold on normalized cross
+// correlation), or whether the noise strength has changed.
+static int is_noise_model_different(aom_noise_model_t *const noise_model) {
+  // These thresholds are kind of arbitrary and will likely need further tuning
+  // (or exported as parameters). The threshold on noise strength is a weighted
+  // difference between the noise strength histograms.
+  const double kCoeffThreshold = 0.9;
+  const double kStrengthThreshold =
+      0.005 * (1 << (noise_model->params.bit_depth - 8));
+  for (int c = 0; c < 1; ++c) {
+    const double corr =
+        aom_normalized_cross_correlation(noise_model->latest_state[c].eqns.x,
+                                         noise_model->combined_state[c].eqns.x,
+                                         noise_model->combined_state[c].eqns.n);
+    if (corr < kCoeffThreshold) return 1;
+
+    const double dx =
+        1.0 / noise_model->latest_state[c].strength_solver.num_bins;
+
+    const aom_equation_system_t *latest_eqns =
+        &noise_model->latest_state[c].strength_solver.eqns;
+    const aom_equation_system_t *combined_eqns =
+        &noise_model->combined_state[c].strength_solver.eqns;
+    double diff = 0;
+    double total_weight = 0;
+    for (int j = 0; j < latest_eqns->n; ++j) {
+      double weight = 0;
+      for (int i = 0; i < latest_eqns->n; ++i) {
+        weight += latest_eqns->A[i * latest_eqns->n + j];
+      }
+      weight = sqrt(weight);
+      diff += weight * fabs(latest_eqns->x[j] - combined_eqns->x[j]);
+      total_weight += weight;
+    }
+    if (diff * dx / total_weight > kStrengthThreshold) return 1;
+  }
+  return 0;
+}
+
+static int ar_equation_system_solve(aom_noise_state_t *state, int is_chroma) {
+  const int ret = equation_system_solve(&state->eqns);
+  state->ar_gain = 1.0;
+  if (!ret) return ret;
+
+  // Update the AR gain from the equation system as it will be used to fit
+  // the noise strength as a function of intensity. In the Yule-Walker
+  // equations, the diagonal should be the variance of the correlated noise.
+  // In the case of the least squares estimate, there will be some variability
+  // in the diagonal. So use the mean of the diagonal as the estimate of
+  // overall variance (this works for least squares or Yule-Walker
+  // formulation).
+  double var = 0;
+  const int n = state->eqns.n;
+  for (int i = 0; i < (state->eqns.n - is_chroma); ++i) {
+    var += state->eqns.A[i * n + i] / state->num_observations;
+  }
+  var /= (n - is_chroma);
+
+  // Keep track of E(Y^2) = <b, x> + E(X^2).
+  // In the case that we are using chroma and have an estimate of correlation
+  // with luma, we adjust that estimate slightly to remove the correlated bits
+  // by subtracting out the last column of A scaled by our correlation estimate
+  // from b: E(y^2) = <b - A(:, end) * x(end), x>.
+  double sum_covar = 0;
+  for (int i = 0; i < state->eqns.n - is_chroma; ++i) {
+    double bi = state->eqns.b[i];
+    if (is_chroma) {
+      bi -= state->eqns.A[i * n + (n - 1)] * state->eqns.x[n - 1];
+    }
+    sum_covar += (bi * state->eqns.x[i]) / state->num_observations;
+  }
+  // Now, get an estimate of the variance of the uncorrelated noise signal and
+  // use it to determine the gain of the AR filter.
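+  // The gain is sqrt(var / noise_var), clamped to at least 1. As a
+  // hypothetical example, var = 2.0 with sum_covar = 1.5 leaves
+  // noise_var = 0.5, so ar_gain = sqrt(2.0 / 0.5) = 2.0.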
+ const double noise_var = AOMMAX(var - sum_covar, 1e-6); + state->ar_gain = AOMMAX(1, sqrt(AOMMAX(var / noise_var, 1e-6))); + return ret; +} + +aom_noise_status_t aom_noise_model_update( + aom_noise_model_t *const noise_model, const uint8_t *const data[3], + const uint8_t *const denoised[3], int w, int h, int stride[3], + int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size) { + const int num_blocks_w = (w + block_size - 1) / block_size; + const int num_blocks_h = (h + block_size - 1) / block_size; + int y_model_different = 0; + int num_blocks = 0; + int i = 0, channel = 0; + + if (block_size <= 1) { + fprintf(stderr, "block_size = %d must be > 1\n", block_size); + return AOM_NOISE_STATUS_INVALID_ARGUMENT; + } + + if (block_size < noise_model->params.lag * 2 + 1) { + fprintf(stderr, "block_size = %d must be >= %d\n", block_size, + noise_model->params.lag * 2 + 1); + return AOM_NOISE_STATUS_INVALID_ARGUMENT; + } + + // Clear the latest equation system + for (i = 0; i < 3; ++i) { + equation_system_clear(&noise_model->latest_state[i].eqns); + noise_model->latest_state[i].num_observations = 0; + noise_strength_solver_clear(&noise_model->latest_state[i].strength_solver); + } + + // Check that we have enough flat blocks + for (i = 0; i < num_blocks_h * num_blocks_w; ++i) { + if (flat_blocks[i]) { + num_blocks++; + } + } + + if (num_blocks <= 1) { + fprintf(stderr, "Not enough flat blocks to update noise estimate\n"); + return AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS; + } + + for (channel = 0; channel < 3; ++channel) { + int no_subsampling[2] = { 0, 0 }; + const uint8_t *alt_data = channel > 0 ? data[0] : 0; + const uint8_t *alt_denoised = channel > 0 ? denoised[0] : 0; + int *sub = channel > 0 ? chroma_sub_log2 : no_subsampling; + const int is_chroma = channel != 0; + if (!data[channel] || !denoised[channel]) break; + if (!add_block_observations(noise_model, channel, data[channel], + denoised[channel], w, h, stride[channel], sub, + alt_data, alt_denoised, stride[0], flat_blocks, + block_size, num_blocks_w, num_blocks_h)) { + fprintf(stderr, "Adding block observation failed\n"); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + + if (!ar_equation_system_solve(&noise_model->latest_state[channel], + is_chroma)) { + if (is_chroma) { + set_chroma_coefficient_fallback_soln( + &noise_model->latest_state[channel].eqns); + } else { + fprintf(stderr, "Solving latest noise equation system failed %d!\n", + channel); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + } + + add_noise_std_observations( + noise_model, channel, noise_model->latest_state[channel].eqns.x, + data[channel], denoised[channel], w, h, stride[channel], sub, alt_data, + stride[0], flat_blocks, block_size, num_blocks_w, num_blocks_h); + + if (!aom_noise_strength_solver_solve( + &noise_model->latest_state[channel].strength_solver)) { + fprintf(stderr, "Solving latest noise strength failed!\n"); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + + // Check noise characteristics and return if error. + if (channel == 0 && + noise_model->combined_state[channel].strength_solver.num_equations > + 0 && + is_noise_model_different(noise_model)) { + y_model_different = 1; + } + + // Don't update the combined stats if the y model is different. 
+ if (y_model_different) continue; + + noise_model->combined_state[channel].num_observations += + noise_model->latest_state[channel].num_observations; + equation_system_add(&noise_model->combined_state[channel].eqns, + &noise_model->latest_state[channel].eqns); + if (!ar_equation_system_solve(&noise_model->combined_state[channel], + is_chroma)) { + if (is_chroma) { + set_chroma_coefficient_fallback_soln( + &noise_model->combined_state[channel].eqns); + } else { + fprintf(stderr, "Solving combined noise equation system failed %d!\n", + channel); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + } + + noise_strength_solver_add( + &noise_model->combined_state[channel].strength_solver, + &noise_model->latest_state[channel].strength_solver); + + if (!aom_noise_strength_solver_solve( + &noise_model->combined_state[channel].strength_solver)) { + fprintf(stderr, "Solving combined noise strength failed!\n"); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + } + + return y_model_different ? AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE + : AOM_NOISE_STATUS_OK; +} + +void aom_noise_model_save_latest(aom_noise_model_t *noise_model) { + for (int c = 0; c < 3; c++) { + equation_system_copy(&noise_model->combined_state[c].eqns, + &noise_model->latest_state[c].eqns); + equation_system_copy(&noise_model->combined_state[c].strength_solver.eqns, + &noise_model->latest_state[c].strength_solver.eqns); + noise_model->combined_state[c].strength_solver.num_equations = + noise_model->latest_state[c].strength_solver.num_equations; + noise_model->combined_state[c].num_observations = + noise_model->latest_state[c].num_observations; + noise_model->combined_state[c].ar_gain = + noise_model->latest_state[c].ar_gain; + } +} + +int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model, + aom_film_grain_t *film_grain) { + if (noise_model->params.lag > 3) { + fprintf(stderr, "params.lag = %d > 3\n", noise_model->params.lag); + return 0; + } + uint16_t random_seed = film_grain->random_seed; + memset(film_grain, 0, sizeof(*film_grain)); + film_grain->random_seed = random_seed; + + film_grain->apply_grain = 1; + film_grain->update_parameters = 1; + + film_grain->ar_coeff_lag = noise_model->params.lag; + + // Convert the scaling functions to 8 bit values + aom_noise_strength_lut_t scaling_points[3]; + aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[0].strength_solver, 14, scaling_points + 0); + aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[1].strength_solver, 10, scaling_points + 1); + aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[2].strength_solver, 10, scaling_points + 2); + + // Both the domain and the range of the scaling functions in the film_grain + // are normalized to 8-bit (e.g., they are implicitly scaled during grain + // synthesis). 
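+  // E.g. (hypothetical values): at bit_depth = 10 the divisor below is
+  // 1 << 2 = 4, so a fitted point (840.0, 13.2) maps to (210.0, 3.3) in the
+  // 8-bit domain.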
+ const double strength_divisor = 1 << (noise_model->params.bit_depth - 8); + double max_scaling_value = 1e-4; + for (int c = 0; c < 3; ++c) { + for (int i = 0; i < scaling_points[c].num_points; ++i) { + scaling_points[c].points[i][0] = + AOMMIN(255, scaling_points[c].points[i][0] / strength_divisor); + scaling_points[c].points[i][1] = + AOMMIN(255, scaling_points[c].points[i][1] / strength_divisor); + max_scaling_value = + AOMMAX(scaling_points[c].points[i][1], max_scaling_value); + } + } + + // Scaling_shift values are in the range [8,11] + const int max_scaling_value_log2 = + clamp((int)floor(log2(max_scaling_value) + 1), 2, 5); + film_grain->scaling_shift = 5 + (8 - max_scaling_value_log2); + + const double scale_factor = 1 << (8 - max_scaling_value_log2); + film_grain->num_y_points = scaling_points[0].num_points; + film_grain->num_cb_points = scaling_points[1].num_points; + film_grain->num_cr_points = scaling_points[2].num_points; + + int(*film_grain_scaling[3])[2] = { + film_grain->scaling_points_y, + film_grain->scaling_points_cb, + film_grain->scaling_points_cr, + }; + for (int c = 0; c < 3; c++) { + for (int i = 0; i < scaling_points[c].num_points; ++i) { + film_grain_scaling[c][i][0] = (int)(scaling_points[c].points[i][0] + 0.5); + film_grain_scaling[c][i][1] = clamp( + (int)(scale_factor * scaling_points[c].points[i][1] + 0.5), 0, 255); + } + } + aom_noise_strength_lut_free(scaling_points + 0); + aom_noise_strength_lut_free(scaling_points + 1); + aom_noise_strength_lut_free(scaling_points + 2); + + // Convert the ar_coeffs into 8-bit values + const int n_coeff = noise_model->combined_state[0].eqns.n; + double max_coeff = 1e-4, min_coeff = -1e-4; + double y_corr[2] = { 0, 0 }; + double avg_luma_strength = 0; + for (int c = 0; c < 3; c++) { + aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns; + for (int i = 0; i < n_coeff; ++i) { + max_coeff = AOMMAX(max_coeff, eqns->x[i]); + min_coeff = AOMMIN(min_coeff, eqns->x[i]); + } + // Since the correlation between luma/chroma was computed in an already + // scaled space, we adjust it in the un-scaled space. + aom_noise_strength_solver_t *solver = + &noise_model->combined_state[c].strength_solver; + // Compute a weighted average of the strength for the channel. 
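+    // That is, average_strength = sum_i(w_i * x_i) / sum_i(w_i) with
+    // w_i = sqrt(sum_j A[i][j]), so bins backed by more observations
+    // contribute more to the average.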
+ double average_strength = 0, total_weight = 0; + for (int i = 0; i < solver->eqns.n; ++i) { + double w = 0; + for (int j = 0; j < solver->eqns.n; ++j) { + w += solver->eqns.A[i * solver->eqns.n + j]; + } + w = sqrt(w); + average_strength += solver->eqns.x[i] * w; + total_weight += w; + } + if (total_weight == 0) + average_strength = 1; + else + average_strength /= total_weight; + if (c == 0) { + avg_luma_strength = average_strength; + } else { + y_corr[c - 1] = avg_luma_strength * eqns->x[n_coeff] / average_strength; + max_coeff = AOMMAX(max_coeff, y_corr[c - 1]); + min_coeff = AOMMIN(min_coeff, y_corr[c - 1]); + } + } + // Shift value: AR coeffs range (values 6-9) + // 6: [-2, 2), 7: [-1, 1), 8: [-0.5, 0.5), 9: [-0.25, 0.25) + film_grain->ar_coeff_shift = + clamp(7 - (int)AOMMAX(1 + floor(log2(max_coeff)), ceil(log2(-min_coeff))), + 6, 9); + double scale_ar_coeff = 1 << film_grain->ar_coeff_shift; + int *ar_coeffs[3] = { + film_grain->ar_coeffs_y, + film_grain->ar_coeffs_cb, + film_grain->ar_coeffs_cr, + }; + for (int c = 0; c < 3; ++c) { + aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns; + for (int i = 0; i < n_coeff; ++i) { + ar_coeffs[c][i] = + clamp((int)round(scale_ar_coeff * eqns->x[i]), -128, 127); + } + if (c > 0) { + ar_coeffs[c][n_coeff] = + clamp((int)round(scale_ar_coeff * y_corr[c - 1]), -128, 127); + } + } + + // At the moment, the noise modeling code assumes that the chroma scaling + // functions are a function of luma. + film_grain->cb_mult = 128; // 8 bits + film_grain->cb_luma_mult = 192; // 8 bits + film_grain->cb_offset = 256; // 9 bits + + film_grain->cr_mult = 128; // 8 bits + film_grain->cr_luma_mult = 192; // 8 bits + film_grain->cr_offset = 256; // 9 bits + + film_grain->chroma_scaling_from_luma = 0; + film_grain->grain_scale_shift = 0; + film_grain->overlap_flag = 1; + return 1; +} + +static void pointwise_multiply(const float *a, float *b, int n) { + for (int i = 0; i < n; ++i) { + b[i] *= a[i]; + } +} + +static float *get_half_cos_window(int block_size) { + float *window_function = + (float *)aom_malloc(block_size * block_size * sizeof(*window_function)); + for (int y = 0; y < block_size; ++y) { + const double cos_yd = cos((.5 + y) * PI / block_size - PI / 2); + for (int x = 0; x < block_size; ++x) { + const double cos_xd = cos((.5 + x) * PI / block_size - PI / 2); + window_function[y * block_size + x] = (float)(cos_yd * cos_xd); + } + } + return window_function; +} + +#define DITHER_AND_QUANTIZE(INT_TYPE, suffix) \ + static void dither_and_quantize_##suffix( \ + float *result, int result_stride, INT_TYPE *denoised, int w, int h, \ + int stride, int chroma_sub_w, int chroma_sub_h, int block_size, \ + float block_normalization) { \ + for (int y = 0; y < (h >> chroma_sub_h); ++y) { \ + for (int x = 0; x < (w >> chroma_sub_w); ++x) { \ + const int result_idx = \ + (y + (block_size >> chroma_sub_h)) * result_stride + x + \ + (block_size >> chroma_sub_w); \ + INT_TYPE new_val = (INT_TYPE)AOMMIN( \ + AOMMAX(result[result_idx] * block_normalization + 0.5f, 0), \ + block_normalization); \ + const float err = \ + -(((float)new_val) / block_normalization - result[result_idx]); \ + denoised[y * stride + x] = new_val; \ + if (x + 1 < (w >> chroma_sub_w)) { \ + result[result_idx + 1] += err * 7.0f / 16.0f; \ + } \ + if (y + 1 < (h >> chroma_sub_h)) { \ + if (x > 0) { \ + result[result_idx + result_stride - 1] += err * 3.0f / 16.0f; \ + } \ + result[result_idx + result_stride] += err * 5.0f / 16.0f; \ + if (x + 1 < (w >> chroma_sub_w)) { \ + 
result[result_idx + result_stride + 1] += err * 1.0f / 16.0f; \ + } \ + } \ + } \ + } \ + } + +DITHER_AND_QUANTIZE(uint8_t, lowbd); +DITHER_AND_QUANTIZE(uint16_t, highbd); + +int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3], + int w, int h, int stride[3], int chroma_sub[2], + float *noise_psd[3], int block_size, int bit_depth, + int use_highbd) { + float *plane = NULL, *block = NULL, *window_full = NULL, + *window_chroma = NULL; + double *block_d = NULL, *plane_d = NULL; + struct aom_noise_tx_t *tx_full = NULL; + struct aom_noise_tx_t *tx_chroma = NULL; + const int num_blocks_w = (w + block_size - 1) / block_size; + const int num_blocks_h = (h + block_size - 1) / block_size; + const int result_stride = (num_blocks_w + 2) * block_size; + const int result_height = (num_blocks_h + 2) * block_size; + float *result = NULL; + int init_success = 1; + aom_flat_block_finder_t block_finder_full; + aom_flat_block_finder_t block_finder_chroma; + const float kBlockNormalization = (float)((1 << bit_depth) - 1); + if (chroma_sub[0] != chroma_sub[1]) { + fprintf(stderr, + "aom_wiener_denoise_2d doesn't handle different chroma " + "subsampling"); + return 0; + } + init_success &= aom_flat_block_finder_init(&block_finder_full, block_size, + bit_depth, use_highbd); + result = (float *)aom_malloc((num_blocks_h + 2) * block_size * result_stride * + sizeof(*result)); + plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane)); + block = + (float *)aom_memalign(32, 2 * block_size * block_size * sizeof(*block)); + block_d = (double *)aom_malloc(block_size * block_size * sizeof(*block_d)); + plane_d = (double *)aom_malloc(block_size * block_size * sizeof(*plane_d)); + window_full = get_half_cos_window(block_size); + tx_full = aom_noise_tx_malloc(block_size); + + if (chroma_sub[0] != 0) { + init_success &= aom_flat_block_finder_init(&block_finder_chroma, + block_size >> chroma_sub[0], + bit_depth, use_highbd); + window_chroma = get_half_cos_window(block_size >> chroma_sub[0]); + tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]); + } else { + window_chroma = window_full; + tx_chroma = tx_full; + } + + init_success &= (tx_full != NULL) && (tx_chroma != NULL) && (plane != NULL) && + (plane_d != NULL) && (block != NULL) && (block_d != NULL) && + (window_full != NULL) && (window_chroma != NULL) && + (result != NULL); + for (int c = init_success ? 0 : 3; c < 3; ++c) { + float *window_function = c == 0 ? window_full : window_chroma; + aom_flat_block_finder_t *block_finder = &block_finder_full; + const int chroma_sub_h = c > 0 ? chroma_sub[1] : 0; + const int chroma_sub_w = c > 0 ? chroma_sub[0] : 0; + struct aom_noise_tx_t *tx = + (c > 0 && chroma_sub[0] > 0) ? tx_chroma : tx_full; + if (!data[c] || !denoised[c]) continue; + if (c > 0 && chroma_sub[0] != 0) { + block_finder = &block_finder_chroma; + } + memset(result, 0, sizeof(*result) * result_stride * result_height); + // Do overlapped block processing (half overlapped). The block rows can + // easily be done in parallel + for (int offsy = 0; offsy < (block_size >> chroma_sub_h); + offsy += (block_size >> chroma_sub_h) / 2) { + for (int offsx = 0; offsx < (block_size >> chroma_sub_w); + offsx += (block_size >> chroma_sub_w) / 2) { + // Pad the boundary when processing each block-set. 
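+      // Because the half-cosine window is applied both before the forward
+      // transform and again at composition, each output pixel accumulates
+      // squared-window contributions from the four half-overlapped passes,
+      // which sum to one, so no further renormalization is needed.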
+ for (int by = -1; by < num_blocks_h; ++by) { + for (int bx = -1; bx < num_blocks_w; ++bx) { + const int pixels_per_block = + (block_size >> chroma_sub_w) * (block_size >> chroma_sub_h); + aom_flat_block_finder_extract_block( + block_finder, data[c], w >> chroma_sub_w, h >> chroma_sub_h, + stride[c], bx * (block_size >> chroma_sub_w) + offsx, + by * (block_size >> chroma_sub_h) + offsy, plane_d, block_d); + for (int j = 0; j < pixels_per_block; ++j) { + block[j] = (float)block_d[j]; + plane[j] = (float)plane_d[j]; + } + pointwise_multiply(window_function, block, pixels_per_block); + aom_noise_tx_forward(tx, block); + aom_noise_tx_filter(tx, noise_psd[c]); + aom_noise_tx_inverse(tx, block); + + // Apply window function to the plane approximation (we will apply + // it to the sum of plane + block when composing the results). + pointwise_multiply(window_function, plane, pixels_per_block); + + for (int y = 0; y < (block_size >> chroma_sub_h); ++y) { + const int y_result = + y + (by + 1) * (block_size >> chroma_sub_h) + offsy; + for (int x = 0; x < (block_size >> chroma_sub_w); ++x) { + const int x_result = + x + (bx + 1) * (block_size >> chroma_sub_w) + offsx; + result[y_result * result_stride + x_result] += + (block[y * (block_size >> chroma_sub_w) + x] + + plane[y * (block_size >> chroma_sub_w) + x]) * + window_function[y * (block_size >> chroma_sub_w) + x]; + } + } + } + } + } + } + if (use_highbd) { + dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c], + w, h, stride[c], chroma_sub_w, chroma_sub_h, + block_size, kBlockNormalization); + } else { + dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h, + stride[c], chroma_sub_w, chroma_sub_h, + block_size, kBlockNormalization); + } + } + aom_free(result); + aom_free(plane); + aom_free(block); + aom_free(plane_d); + aom_free(block_d); + aom_free(window_full); + + aom_noise_tx_free(tx_full); + + aom_flat_block_finder_free(&block_finder_full); + if (chroma_sub[0] != 0) { + aom_flat_block_finder_free(&block_finder_chroma); + aom_free(window_chroma); + aom_noise_tx_free(tx_chroma); + } + return init_success; +} + +struct aom_denoise_and_model_t { + int block_size; + int bit_depth; + float noise_level; + + // Size of current denoised buffer and flat_block buffer + int width; + int height; + int y_stride; + int uv_stride; + int num_blocks_w; + int num_blocks_h; + + // Buffers for image and noise_psd allocated on the fly + float *noise_psd[3]; + uint8_t *denoised[3]; + uint8_t *flat_blocks; + + aom_flat_block_finder_t flat_block_finder; + aom_noise_model_t noise_model; +}; + +struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth, + int block_size, + float noise_level) { + struct aom_denoise_and_model_t *ctx = + (struct aom_denoise_and_model_t *)aom_malloc( + sizeof(struct aom_denoise_and_model_t)); + if (!ctx) { + fprintf(stderr, "Unable to allocate denoise_and_model struct\n"); + return NULL; + } + memset(ctx, 0, sizeof(*ctx)); + + ctx->block_size = block_size; + ctx->noise_level = noise_level; + ctx->bit_depth = bit_depth; + + ctx->noise_psd[0] = + aom_malloc(sizeof(*ctx->noise_psd[0]) * block_size * block_size); + ctx->noise_psd[1] = + aom_malloc(sizeof(*ctx->noise_psd[1]) * block_size * block_size); + ctx->noise_psd[2] = + aom_malloc(sizeof(*ctx->noise_psd[2]) * block_size * block_size); + if (!ctx->noise_psd[0] || !ctx->noise_psd[1] || !ctx->noise_psd[2]) { + fprintf(stderr, "Unable to allocate noise PSD buffers\n"); + aom_denoise_and_model_free(ctx); + return NULL; + } + return ctx; +} + 
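+// A minimal usage sketch of the public entry points above (illustrative
+// only; get_next_frame is a hypothetical caller-side helper, not part of
+// libaom). On success the input buffer is overwritten with the denoised
+// frame and `grain` holds the estimated film grain parameters:
+//
+//   struct aom_denoise_and_model_t *ctx =
+//       aom_denoise_and_model_alloc(/*bit_depth=*/8, /*block_size=*/32,
+//                                   /*noise_level=*/2.5f);
+//   aom_film_grain_t grain = { 0 };
+//   YV12_BUFFER_CONFIG *frame;
+//   while (ctx && (frame = get_next_frame()) != NULL) {
+//     if (!aom_denoise_and_model_run(ctx, frame, &grain)) break;
+//     if (grain.apply_grain) {
+//       // Signal `grain` in the bitstream for synthesis at decode time.
+//     }
+//   }
+//   aom_denoise_and_model_free(ctx);
+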
+void aom_denoise_and_model_free(struct aom_denoise_and_model_t *ctx) {
+  aom_free(ctx->flat_blocks);
+  for (int i = 0; i < 3; ++i) {
+    aom_free(ctx->denoised[i]);
+    aom_free(ctx->noise_psd[i]);
+  }
+  aom_noise_model_free(&ctx->noise_model);
+  aom_flat_block_finder_free(&ctx->flat_block_finder);
+  aom_free(ctx);
+}
+
+static int denoise_and_model_realloc_if_necessary(
+    struct aom_denoise_and_model_t *ctx, YV12_BUFFER_CONFIG *sd) {
+  if (ctx->width == sd->y_width && ctx->height == sd->y_height &&
+      ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride)
+    return 1;
+  const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  const int block_size = ctx->block_size;
+
+  ctx->width = sd->y_width;
+  ctx->height = sd->y_height;
+  ctx->y_stride = sd->y_stride;
+  ctx->uv_stride = sd->uv_stride;
+
+  for (int i = 0; i < 3; ++i) {
+    aom_free(ctx->denoised[i]);
+    ctx->denoised[i] = NULL;
+  }
+  aom_free(ctx->flat_blocks);
+  ctx->flat_blocks = NULL;
+
+  ctx->denoised[0] = aom_malloc((sd->y_stride * sd->y_height) << use_highbd);
+  ctx->denoised[1] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
+  ctx->denoised[2] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
+  if (!ctx->denoised[0] || !ctx->denoised[1] || !ctx->denoised[2]) {
+    fprintf(stderr, "Unable to allocate denoise buffers\n");
+    return 0;
+  }
+  ctx->num_blocks_w = (sd->y_width + ctx->block_size - 1) / ctx->block_size;
+  ctx->num_blocks_h = (sd->y_height + ctx->block_size - 1) / ctx->block_size;
+  ctx->flat_blocks = aom_malloc(ctx->num_blocks_w * ctx->num_blocks_h);
+
+  aom_flat_block_finder_free(&ctx->flat_block_finder);
+  if (!aom_flat_block_finder_init(&ctx->flat_block_finder, ctx->block_size,
+                                  ctx->bit_depth, use_highbd)) {
+    fprintf(stderr, "Unable to init flat block finder\n");
+    return 0;
+  }
+
+  const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
+                                            ctx->bit_depth, use_highbd };
+  aom_noise_model_free(&ctx->noise_model);
+  if (!aom_noise_model_init(&ctx->noise_model, params)) {
+    fprintf(stderr, "Unable to init noise model\n");
+    return 0;
+  }
+
+  // Simply use a flat PSD (although we could use the flat blocks to estimate
+  // an actual noise PSD).
+  const float y_noise_level =
+      aom_noise_psd_get_default_value(ctx->block_size, ctx->noise_level);
+  const float uv_noise_level = aom_noise_psd_get_default_value(
+      ctx->block_size >> sd->subsampling_x, ctx->noise_level);
+  for (int i = 0; i < block_size * block_size; ++i) {
+    ctx->noise_psd[0][i] = y_noise_level;
+    ctx->noise_psd[1][i] = ctx->noise_psd[2][i] = uv_noise_level;
+  }
+  return 1;
+}
+
+int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
+                              YV12_BUFFER_CONFIG *sd,
+                              aom_film_grain_t *film_grain) {
+  const int block_size = ctx->block_size;
+  const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  uint8_t *raw_data[3] = {
+    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->y_buffer) : sd->y_buffer,
+    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->u_buffer) : sd->u_buffer,
+    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->v_buffer) : sd->v_buffer,
+  };
+  const uint8_t *const data[3] = { raw_data[0], raw_data[1], raw_data[2] };
+  int strides[3] = { sd->y_stride, sd->uv_stride, sd->uv_stride };
+  int chroma_sub_log2[2] = { sd->subsampling_x, sd->subsampling_y };
+
+  if (!denoise_and_model_realloc_if_necessary(ctx, sd)) {
+    fprintf(stderr, "Unable to realloc buffers\n");
+    return 0;
+  }
+
+  aom_flat_block_finder_run(&ctx->flat_block_finder, data[0], sd->y_width,
+                            sd->y_height, strides[0], ctx->flat_blocks);
+
+  if (!aom_wiener_denoise_2d(data, ctx->denoised, sd->y_width, sd->y_height,
+                             strides, chroma_sub_log2, ctx->noise_psd,
+                             block_size, ctx->bit_depth, use_highbd)) {
+    fprintf(stderr, "Unable to denoise image\n");
+    return 0;
+  }
+
+  const aom_noise_status_t status = aom_noise_model_update(
+      &ctx->noise_model, data, (const uint8_t *const *)ctx->denoised,
+      sd->y_width, sd->y_height, strides, chroma_sub_log2, ctx->flat_blocks,
+      block_size);
+  int have_noise_estimate = 0;
+  if (status == AOM_NOISE_STATUS_OK) {
+    have_noise_estimate = 1;
+  } else if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) {
+    aom_noise_model_save_latest(&ctx->noise_model);
+    have_noise_estimate = 1;
+  } else {
+    // Unable to update noise model; proceed if we have a previous estimate.
+    have_noise_estimate =
+        (ctx->noise_model.combined_state[0].strength_solver.num_equations > 0);
+  }
+
+  film_grain->apply_grain = 0;
+  if (have_noise_estimate) {
+    if (!aom_noise_model_get_grain_parameters(&ctx->noise_model, film_grain)) {
+      fprintf(stderr, "Unable to get grain parameters.\n");
+      return 0;
+    }
+    if (!film_grain->random_seed) {
+      film_grain->random_seed = 7391;
+    }
+    memcpy(raw_data[0], ctx->denoised[0],
+           (strides[0] * sd->y_height) << use_highbd);
+    memcpy(raw_data[1], ctx->denoised[1],
+           (strides[1] * sd->uv_height) << use_highbd);
+    memcpy(raw_data[2], ctx->denoised[2],
+           (strides[2] * sd->uv_height) << use_highbd);
+  }
+  return 1;
+}
diff --git a/libs/libaom/src/aom_dsp/noise_model.h b/libs/libaom/src/aom_dsp/noise_model.h
new file mode 100644
index 000000000..5e7de9bf2
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/noise_model.h
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_NOISE_MODEL_H_
+#define AOM_AOM_DSP_NOISE_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#include <stdint.h>
+#include "aom_dsp/grain_synthesis.h"
+#include "aom_scale/yv12config.h"
+
+/*!\brief Wrapper of data required to represent linear system of eqns and soln.
+ */
+typedef struct {
+  double *A;
+  double *b;
+  double *x;
+  int n;
+} aom_equation_system_t;
+
+/*!\brief Representation of a piecewise linear curve
+ *
+ * Holds n points as (x, y) pairs, that store the curve.
+ */
+typedef struct {
+  double (*points)[2];
+  int num_points;
+} aom_noise_strength_lut_t;
+
+/*!\brief Init the noise strength lut with the given number of points*/
+int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points);
+
+/*!\brief Frees the noise strength lut.
+ */
+void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut);
+
+/*!\brief Evaluate the lut at the point x.
+ *
+ * \param[in] lut The lut data.
+ * \param[in] x   The coordinate to evaluate the lut.
+ */
+double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut,
+                                   double x);
+
+/*!\brief Helper struct to model noise strength as a function of intensity.
+ *
+ * Internally, this structure holds a representation of a linear system
+ * of equations that models noise strength (standard deviation) as a
+ * function of intensity. The mapping is initially stored using a
+ * piecewise representation with evenly spaced bins that cover the entire
+ * domain from [min_intensity, max_intensity]. Each observation (x,y) gives a
+ * constraint of the form:
+ *   y_{i} (1 - a) + y_{i+1} a = y
+ * where y_{i} is the value of bin i and x_{i} <= x <= x_{i+1} and
+ * a = (x - x_{i}) / (x_{i+1} - x_{i}). The equation system holds the
+ * corresponding normal equations.
+ *
+ * As there may be missing data, the solution is regularized to get a
+ * complete set of values for the bins. A reduced representation after
+ * solving can be obtained by getting the corresponding noise_strength_lut_t.
+ */
+typedef struct {
+  aom_equation_system_t eqns;
+  double min_intensity;
+  double max_intensity;
+  int num_bins;
+  int num_equations;
+  double total;
+} aom_noise_strength_solver_t;
+
+/*!\brief Initializes the noise solver with the given number of bins.
+ *
+ * Returns 0 if initialization fails.
+ *
+ * \param[in] solver    The noise solver to be initialized.
+ * \param[in] num_bins  Number of bins to use in the internal representation.
+ * \param[in] bit_depth The bit depth used to derive {min,max}_intensity.
+ */
+int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
+                                   int num_bins, int bit_depth);
+void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver);
+
+/*!\brief Gets the x coordinate of bin i.
+ *
+ * \param[in] i  The bin whose coordinate to query.
+ */
+double aom_noise_strength_solver_get_center(
+    const aom_noise_strength_solver_t *solver, int i);
+
+/*!\brief Add an observation of the block mean intensity to its noise strength.
+ *
+ * \param[in] block_mean The average block intensity.
+ * \param[in] noise_std  The observed noise strength.
+ */
+void aom_noise_strength_solver_add_measurement(
+    aom_noise_strength_solver_t *solver, double block_mean, double noise_std);
+
+/*!\brief Solves the current set of equations for the noise strength. */
+int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver);
+
+/*!\brief Fits a reduced piecewise linear lut to the internal solution.
+ *
+ * \param[in]  max_num_points The maximum number of output points
+ * \param[out] lut            The output piecewise linear lut.
+ */
+int aom_noise_strength_solver_fit_piecewise(
+    const aom_noise_strength_solver_t *solver, int max_num_points,
+    aom_noise_strength_lut_t *lut);
+
+/*!\brief Helper for holding precomputed data for finding flat blocks.
+ *
+ * Internally a block is modeled with a low-order polynomial model. A
+ * planar model would be a bunch of equations like:
+ *   <[y_i x_i 1], [a_1, a_2, a_3]> = b_i
+ * for each point in the block. The system matrix A with row i as [y_i x_i 1]
+ * is maintained as is the inverse, inv(A'*A), so that the plane parameters
+ * can be fit for each block.
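+ *
+ * The least-squares plane parameters for a block with pixel values b are
+ * then [a_1 a_2 a_3]' = inv(A'*A) * A' * b, a single (precomputable)
+ * matrix product per block, since A depends only on the block size.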
+ */ +typedef struct { + double *AtA_inv; + double *A; + int num_params; // The number of parameters used for internal low-order model + int block_size; // The block size the finder was initialized with + double normalization; // Normalization factor (1 / (2^(bit_depth) - 1)) + int use_highbd; // Whether input data should be interpreted as uint16 +} aom_flat_block_finder_t; + +/*!\brief Init the block_finder with the given block size, bit_depth */ +int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder, + int block_size, int bit_depth, int use_highbd); +void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder); + +/*!\brief Helper to extract a block and low order "planar" model. */ +void aom_flat_block_finder_extract_block( + const aom_flat_block_finder_t *block_finder, const uint8_t *const data, + int w, int h, int stride, int offsx, int offsy, double *plane, + double *block); + +/*!\brief Runs the flat block finder on the input data. + * + * Find flat blocks in the input image data. Returns a map of + * flat_blocks, where the value of flat_blocks map will be non-zero + * when a block is determined to be flat. A higher value indicates a bigger + * confidence in the decision. + */ +int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder, + const uint8_t *const data, int w, int h, + int stride, uint8_t *flat_blocks); + +// The noise shape indicates the allowed coefficients in the AR model. +enum { + AOM_NOISE_SHAPE_DIAMOND = 0, + AOM_NOISE_SHAPE_SQUARE = 1 +} UENUM1BYTE(aom_noise_shape); + +// The parameters of the noise model include the shape type, lag, the +// bit depth of the input images provided, and whether the input images +// will be using uint16 (or uint8) representation. +typedef struct { + aom_noise_shape shape; + int lag; + int bit_depth; + int use_highbd; +} aom_noise_model_params_t; + +/*!\brief State of a noise model estimate for a single channel. + * + * This contains a system of equations that can be used to solve + * for the auto-regressive coefficients as well as a noise strength + * solver that can be used to model noise strength as a function of + * intensity. + */ +typedef struct { + aom_equation_system_t eqns; + aom_noise_strength_solver_t strength_solver; + int num_observations; // The number of observations in the eqn system + double ar_gain; // The gain of the current AR filter +} aom_noise_state_t; + +/*!\brief Complete model of noise for a planar video + * + * This includes a noise model for the latest frame and an aggregated + * estimate over all previous frames that had similar parameters. + */ +typedef struct { + aom_noise_model_params_t params; + aom_noise_state_t combined_state[3]; // Combined state per channel + aom_noise_state_t latest_state[3]; // Latest state per channel + int (*coords)[2]; // Offsets (x,y) of the coefficient samples + int n; // Number of parameters (size of coords) + int bit_depth; +} aom_noise_model_t; + +/*!\brief Result of a noise model update. */ +enum { + AOM_NOISE_STATUS_OK = 0, + AOM_NOISE_STATUS_INVALID_ARGUMENT, + AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS, + AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, + AOM_NOISE_STATUS_INTERNAL_ERROR, +} UENUM1BYTE(aom_noise_status_t); + +/*!\brief Initializes a noise model with the given parameters. + * + * Returns 0 on failure. + */ +int aom_noise_model_init(aom_noise_model_t *model, + const aom_noise_model_params_t params); +void aom_noise_model_free(aom_noise_model_t *model); + +/*!\brief Updates the noise model with a new frame observation. 
+ *
+ * Updates the noise model with measurements from the given input frame and a
+ * denoised variant of it. Noise is sampled from flat blocks using the flat
+ * block map.
+ *
+ * Returns a noise_status indicating if the update was successful. If the
+ * update was successful, the combined_state is updated with measurements from
+ * the provided frame. If status is OK or DIFFERENT_NOISE_TYPE, the latest
+ * noise state will be updated with measurements from the provided frame.
+ *
+ * \param[in,out] noise_model    The noise model to be updated
+ * \param[in] data               Raw frame data
+ * \param[in] denoised           Denoised frame data.
+ * \param[in] w                  Frame width
+ * \param[in] h                  Frame height
+ * \param[in] strides            Stride of the planes
+ * \param[in] chroma_sub_log2    Chroma subsampling for planes != 0.
+ * \param[in] flat_blocks        A map to blocks that have been determined flat
+ * \param[in] block_size         The size of blocks.
+ */
+aom_noise_status_t aom_noise_model_update(
+    aom_noise_model_t *const noise_model, const uint8_t *const data[3],
+    const uint8_t *const denoised[3], int w, int h, int strides[3],
+    int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size);
+
+/*!\brief Save the "latest" estimate into the "combined" estimate.
+ *
+ * This is meant to be called when the noise modeling detected a change
+ * in parameters (or for example, if a user wanted to reset estimation at
+ * a shot boundary).
+ */
+void aom_noise_model_save_latest(aom_noise_model_t *noise_model);
+
+/*!\brief Converts the noise_model parameters to the corresponding
+ * grain_parameters.
+ *
+ * The noise structs in this file are suitable for estimation (e.g., using
+ * floats), but the grain parameters in the bitstream are quantized. This
+ * function does the conversion by selecting the correct quantization levels.
+ */
+int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
+                                         aom_film_grain_t *film_grain);
+
+/*!\brief Perform a Wiener filter denoising in 2D using the provided noise psd.
+ *
+ * \param[in] data               Raw frame data
+ * \param[out] denoised          Denoised frame data
+ * \param[in] w                  Frame width
+ * \param[in] h                  Frame height
+ * \param[in] stride             Stride of the planes
+ * \param[in] chroma_sub_log2    Chroma subsampling for planes != 0.
+ * \param[in] noise_psd          The power spectral density of the noise
+ * \param[in] block_size         The size of blocks
+ * \param[in] bit_depth          Bit depth of the image
+ * \param[in] use_highbd         If true, uint8 pointers are interpreted as
+ *                               uint16 and stride is measured in uint16.
+ *                               This must be true when bit_depth >= 10.
+ */
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+                          int w, int h, int stride[3], int chroma_sub_log2[2],
+                          float *noise_psd[3], int block_size, int bit_depth,
+                          int use_highbd);
+
+struct aom_denoise_and_model_t;
+
+/*!\brief Denoise the buffer and model the residual noise.
+ *
+ * This is meant to be called sequentially on input frames. The input buffer
+ * is denoised and the residual noise is modelled. The current noise estimate
+ * is populated in film_grain. Returns true on success. The grain.apply_grain
+ * parameter will be true when the input buffer was successfully denoised and
+ * grain was modelled. Returns false on error.
+ *
+ * \param[in] ctx          Struct allocated with aom_denoise_and_model_alloc
+ *                         that holds some buffers for denoising and the
+ *                         current noise estimate.
+ * \param[in,out] buf      The raw input buffer to be denoised.
+ * \param[out] grain       Output film grain parameters
+ */
+int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
+                              YV12_BUFFER_CONFIG *buf,
+                              aom_film_grain_t *grain);
+
+/*!\brief Allocates a context that can be used for denoising and noise
+ * modeling.
+ *
+ * \param[in] bit_depth   Bit depth of buffers this will be run on.
+ * \param[in] block_size  Block size for noise modeling and flat block
+ *                        estimation
+ * \param[in] noise_level The noise_level (2.5 for moderate noise, and 5 for
+ *                        higher levels of noise)
+ */
+struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth,
+                                                            int block_size,
+                                                            float noise_level);
+
+/*!\brief Frees the denoise context allocated with aom_denoise_and_model_alloc
+ */
+void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // AOM_AOM_DSP_NOISE_MODEL_H_
diff --git a/libs/libaom/src/aom_dsp/noise_util.c b/libs/libaom/src/aom_dsp/noise_util.c
new file mode 100644
index 000000000..7e7e380c6
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/noise_util.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/fft_common.h"
+#include "aom_mem/aom_mem.h"
+#include "config/aom_dsp_rtcd.h"
+
+float aom_noise_psd_get_default_value(int block_size, float factor) {
+  return (factor * factor / 10000) * block_size * block_size / 8;
+}
+
+// Internal representation of noise transform. It keeps track of the
+// transformed data and a temporary working buffer to use during the
+// transform.
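+//
+// The expected call order, as used by aom_wiener_denoise_2d in noise_model.c,
+// is aom_noise_tx_forward(), then aom_noise_tx_filter() against a noise PSD,
+// then aom_noise_tx_inverse().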
+struct aom_noise_tx_t { + float *tx_block; + float *temp; + int block_size; + void (*fft)(const float *, float *, float *); + void (*ifft)(const float *, float *, float *); +}; + +struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size) { + struct aom_noise_tx_t *noise_tx = + (struct aom_noise_tx_t *)aom_malloc(sizeof(struct aom_noise_tx_t)); + if (!noise_tx) return NULL; + memset(noise_tx, 0, sizeof(*noise_tx)); + switch (block_size) { + case 2: + noise_tx->fft = aom_fft2x2_float; + noise_tx->ifft = aom_ifft2x2_float; + break; + case 4: + noise_tx->fft = aom_fft4x4_float; + noise_tx->ifft = aom_ifft4x4_float; + break; + case 8: + noise_tx->fft = aom_fft8x8_float; + noise_tx->ifft = aom_ifft8x8_float; + break; + case 16: + noise_tx->fft = aom_fft16x16_float; + noise_tx->ifft = aom_ifft16x16_float; + break; + case 32: + noise_tx->fft = aom_fft32x32_float; + noise_tx->ifft = aom_ifft32x32_float; + break; + default: + aom_free(noise_tx); + fprintf(stderr, "Unsupported block size %d\n", block_size); + return NULL; + } + noise_tx->block_size = block_size; + noise_tx->tx_block = (float *)aom_memalign( + 32, 2 * sizeof(*noise_tx->tx_block) * block_size * block_size); + noise_tx->temp = (float *)aom_memalign( + 32, 2 * sizeof(*noise_tx->temp) * block_size * block_size); + if (!noise_tx->tx_block || !noise_tx->temp) { + aom_noise_tx_free(noise_tx); + return NULL; + } + // Clear the buffers up front. Some outputs of the forward transform are + // real only (the imaginary component will never be touched) + memset(noise_tx->tx_block, 0, + 2 * sizeof(*noise_tx->tx_block) * block_size * block_size); + memset(noise_tx->temp, 0, + 2 * sizeof(*noise_tx->temp) * block_size * block_size); + return noise_tx; +} + +void aom_noise_tx_forward(struct aom_noise_tx_t *noise_tx, const float *data) { + noise_tx->fft(data, noise_tx->temp, noise_tx->tx_block); +} + +void aom_noise_tx_filter(struct aom_noise_tx_t *noise_tx, const float *psd) { + const int block_size = noise_tx->block_size; + const float kBeta = 1.1f; + const float kEps = 1e-6f; + for (int y = 0; y < block_size; ++y) { + for (int x = 0; x < block_size; ++x) { + int i = y * block_size + x; + float *c = noise_tx->tx_block + 2 * i; + const float c0 = AOMMAX((float)fabs(c[0]), 1e-8f); + const float c1 = AOMMAX((float)fabs(c[1]), 1e-8f); + const float p = c0 * c0 + c1 * c1; + if (p > kBeta * psd[i] && p > 1e-6) { + noise_tx->tx_block[2 * i + 0] *= (p - psd[i]) / AOMMAX(p, kEps); + noise_tx->tx_block[2 * i + 1] *= (p - psd[i]) / AOMMAX(p, kEps); + } else { + noise_tx->tx_block[2 * i + 0] *= (kBeta - 1.0f) / kBeta; + noise_tx->tx_block[2 * i + 1] *= (kBeta - 1.0f) / kBeta; + } + } + } +} + +void aom_noise_tx_inverse(struct aom_noise_tx_t *noise_tx, float *data) { + const int n = noise_tx->block_size * noise_tx->block_size; + noise_tx->ifft(noise_tx->tx_block, noise_tx->temp, data); + for (int i = 0; i < n; ++i) { + data[i] /= n; + } +} + +void aom_noise_tx_add_energy(const struct aom_noise_tx_t *noise_tx, + float *psd) { + const int block_size = noise_tx->block_size; + for (int yb = 0; yb < block_size; ++yb) { + for (int xb = 0; xb <= block_size / 2; ++xb) { + float *c = noise_tx->tx_block + 2 * (yb * block_size + xb); + psd[yb * block_size + xb] += c[0] * c[0] + c[1] * c[1]; + } + } +} + +void aom_noise_tx_free(struct aom_noise_tx_t *noise_tx) { + if (!noise_tx) return; + aom_free(noise_tx->tx_block); + aom_free(noise_tx->temp); + aom_free(noise_tx); +} + +double aom_normalized_cross_correlation(const double *a, const double *b, + int n) { + double c = 0; + 
+  double a_len = 0;
+  double b_len = 0;
+  for (int i = 0; i < n; ++i) {
+    a_len += a[i] * a[i];
+    b_len += b[i] * b[i];
+    c += a[i] * b[i];
+  }
+  return c / (sqrt(a_len) * sqrt(b_len));
+}
+
+int aom_noise_data_validate(const double *data, int w, int h) {
+  const double kVarianceThreshold = 2;
+  const double kMeanThreshold = 2;
+
+  int x = 0, y = 0;
+  int ret_value = 1;
+  double var = 0, mean = 0;
+  double *mean_x, *mean_y, *var_x, *var_y;
+
+  // Check that noise variance is not increasing in x or y
+  // and that the data is zero mean.
+  mean_x = (double *)aom_malloc(sizeof(*mean_x) * w);
+  var_x = (double *)aom_malloc(sizeof(*var_x) * w);
+  mean_y = (double *)aom_malloc(sizeof(*mean_y) * h);
+  var_y = (double *)aom_malloc(sizeof(*var_y) * h);
+
+  memset(mean_x, 0, sizeof(*mean_x) * w);
+  memset(var_x, 0, sizeof(*var_x) * w);
+  memset(mean_y, 0, sizeof(*mean_y) * h);
+  memset(var_y, 0, sizeof(*var_y) * h);
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      const double d = data[y * w + x];
+      var_x[x] += d * d;
+      var_y[y] += d * d;
+      mean_x[x] += d;
+      mean_y[y] += d;
+      var += d * d;
+      mean += d;
+    }
+  }
+  mean /= (w * h);
+  var = var / (w * h) - mean * mean;
+
+  for (y = 0; y < h; ++y) {
+    mean_y[y] /= h;
+    var_y[y] = var_y[y] / h - mean_y[y] * mean_y[y];
+    if (fabs(var_y[y] - var) >= kVarianceThreshold) {
+      fprintf(stderr, "Variance distance too large %f %f\n", var_y[y], var);
+      ret_value = 0;
+      break;
+    }
+    if (fabs(mean_y[y] - mean) >= kMeanThreshold) {
+      fprintf(stderr, "Mean distance too large %f %f\n", mean_y[y], mean);
+      ret_value = 0;
+      break;
+    }
+  }
+
+  for (x = 0; x < w; ++x) {
+    mean_x[x] /= w;
+    var_x[x] = var_x[x] / w - mean_x[x] * mean_x[x];
+    if (fabs(var_x[x] - var) >= kVarianceThreshold) {
+      fprintf(stderr, "Variance distance too large %f %f\n", var_x[x], var);
+      ret_value = 0;
+      break;
+    }
+    if (fabs(mean_x[x] - mean) >= kMeanThreshold) {
+      fprintf(stderr, "Mean distance too large %f %f\n", mean_x[x], mean);
+      ret_value = 0;
+      break;
+    }
+  }
+
+  aom_free(mean_x);
+  aom_free(mean_y);
+  aom_free(var_x);
+  aom_free(var_y);
+
+  return ret_value;
+}
diff --git a/libs/libaom/src/aom_dsp/noise_util.h b/libs/libaom/src/aom_dsp/noise_util.h
new file mode 100644
index 000000000..2284a171a
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/noise_util.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_NOISE_UTIL_H_
+#define AOM_AOM_DSP_NOISE_UTIL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// aom_noise_tx_t is an abstraction of a transform that is used for denoising.
+// It is meant to be lightweight and does hold the transformed data (as
+// the user should not be manipulating the transformed data directly).
+struct aom_noise_tx_t;
+
+// Allocates and returns an aom_noise_tx_t useful for denoising the given
+// block_size. The resulting aom_noise_tx_t should be free'd with
+// aom_noise_tx_free.
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size);
+void aom_noise_tx_free(struct aom_noise_tx_t *aom_noise_tx);
+
+// Transforms the internal data and holds it in the aom_noise_tx's internal
+// buffer. For compatibility with existing SIMD implementations, "data" must
+// be 32-byte aligned.
+void aom_noise_tx_forward(struct aom_noise_tx_t *aom_noise_tx,
+                          const float *data);
+
+// Filters aom_noise_tx's internal data using the provided noise power spectral
+// density. The PSD must be at least block_size * block_size and should be
+// populated with a constant or via estimates taken from
+// aom_noise_tx_add_energy.
+void aom_noise_tx_filter(struct aom_noise_tx_t *aom_noise_tx,
+                         const float *psd);
+
+// Performs an inverse transform using the internal transform data.
+// For compatibility with existing SIMD implementations, "data" must be 32-byte
+// aligned.
+void aom_noise_tx_inverse(struct aom_noise_tx_t *aom_noise_tx, float *data);
+
+// Aggregates the power of the buffered transform data into the psd buffer.
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *aom_noise_tx,
+                             float *psd);
+
+// Returns a default value suitable for denoising a transform of the given
+// block_size. The noise "factor" determines the strength of the noise to
+// be removed. A value of about 2.5 can be used for moderate denoising,
+// while a value of 5.0 can be used for a high level of denoising.
+float aom_noise_psd_get_default_value(int block_size, float factor);
+
+// Computes normalized cross correlation of two vectors a and b of length n.
+double aom_normalized_cross_correlation(const double *a, const double *b,
+                                        int n);
+
+// Validates the correlated noise in the data buffer of size (w, h).
+int aom_noise_data_validate(const double *data, int w, int h);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // AOM_AOM_DSP_NOISE_UTIL_H_
diff --git a/libs/libaom/src/aom_dsp/prob.h b/libs/libaom/src/aom_dsp/prob.h
new file mode 100644
index 000000000..ea5e4cb34
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/prob.h
@@ -0,0 +1,670 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_PROB_H_
+#define AOM_AOM_DSP_PROB_H_
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/entcode.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint16_t aom_cdf_prob;
+
+#define CDF_SIZE(x) ((x) + 1)
+#define CDF_PROB_BITS 15
+#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
+#define CDF_INIT_TOP 32768
+#define CDF_SHIFT (15 - CDF_PROB_BITS)
+/*The value stored in an iCDF is CDF_PROB_TOP minus the actual cumulative
+  probability (an "inverse" CDF).
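+  (For example, with CDF_PROB_BITS = 15, CDF_PROB_TOP is 32768, so a
+  cumulative probability of 12000 is stored as 32768 - 12000 = 20768;
+  applying AOM_ICDF again recovers 12000.)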
+ This function converts from one representation to the other (and is its own + inverse).*/ +#define AOM_ICDF(x) (CDF_PROB_TOP - (x)) + +#if CDF_SHIFT == 0 + +#define AOM_CDF2(a0) AOM_ICDF(a0), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF3(a0, a1) AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF4(a0, a1, a2) \ + AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF5(a0, a1, a2, a3) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF6(a0, a1, a2, a3, a4) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF7(a0, a1, a2, a3, a4, a5) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(a11), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \ + a14) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(a14), \ + AOM_ICDF(CDF_PROB_TOP), 0 + +#else +#define AOM_CDF2(a0) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 2) + \ + ((CDF_INIT_TOP - 2) >> 1)) / \ + ((CDF_INIT_TOP - 2)) + \ + 1) \ + , AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF3(a0, a1) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) + \ + ((CDF_INIT_TOP - 3) >> 1)) / \ + ((CDF_INIT_TOP - 3)) + \ + 1) \ + , 
\ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) + \ + ((CDF_INIT_TOP - 3) >> 1)) / \ + ((CDF_INIT_TOP - 3)) + \ + 2), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF4(a0, a1, a2) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \ + ((CDF_INIT_TOP - 4) >> 1)) / \ + ((CDF_INIT_TOP - 4)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \ + ((CDF_INIT_TOP - 4) >> 1)) / \ + ((CDF_INIT_TOP - 4)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \ + ((CDF_INIT_TOP - 4) >> 1)) / \ + ((CDF_INIT_TOP - 4)) + \ + 3), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF5(a0, a1, a2, a3) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \ + ((CDF_INIT_TOP - 5) >> 1)) / \ + ((CDF_INIT_TOP - 5)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \ + ((CDF_INIT_TOP - 5) >> 1)) / \ + ((CDF_INIT_TOP - 5)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \ + ((CDF_INIT_TOP - 5) >> 1)) / \ + ((CDF_INIT_TOP - 5)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \ + ((CDF_INIT_TOP - 5) >> 1)) / \ + ((CDF_INIT_TOP - 5)) + \ + 4), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF6(a0, a1, a2, a3, a4) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ + ((CDF_INIT_TOP - 6) >> 1)) / \ + ((CDF_INIT_TOP - 6)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ + ((CDF_INIT_TOP - 6) >> 1)) / \ + ((CDF_INIT_TOP - 6)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ + ((CDF_INIT_TOP - 6) >> 1)) / \ + ((CDF_INIT_TOP - 6)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ + ((CDF_INIT_TOP - 6) >> 1)) / \ + ((CDF_INIT_TOP - 6)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ + ((CDF_INIT_TOP - 6) >> 1)) / \ + ((CDF_INIT_TOP - 6)) + \ + 5), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF7(a0, a1, a2, a3, a4, a5) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 6), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / 
\ + ((CDF_INIT_TOP - 8)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 7), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 8), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 9), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + 
((CDF_INIT_TOP - 11)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 10), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 10), \ + AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 11), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 10), \ + AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 11), \ + 
AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 12), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 10), \ + AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 11), \ + AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 12), \ + AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 13), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 10), \ + AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 11), \ + 
AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 12), \ + AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 13), \ + AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 14), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \ + a14) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 10), \ + AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 11), \ + AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 12), \ + AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 13), \ + AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 14), \ + AOM_ICDF((((a14)-15) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 15), \ + AOM_ICDF(CDF_PROB_TOP), 0 + +#endif + +static INLINE uint8_t get_prob(unsigned int num, unsigned int den) { + assert(den != 0); + { + const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den); + // (p > 255) ? 255 : (p < 1) ? 1 : p; + const int clipped_prob = p | ((255 - p) >> 23) | (p == 0); + return (uint8_t)clipped_prob; + } +} + +static INLINE void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) { + int rate; + int i, tmp; + + static const int nsymbs2speed[17] = { 0, 0, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2 }; + assert(nsymbs < 17); + rate = 3 + (cdf[nsymbs] > 15) + (cdf[nsymbs] > 31) + + nsymbs2speed[nsymbs]; // + get_msb(nsymbs); + tmp = AOM_ICDF(0); + + // Single loop (faster) + for (i = 0; i < nsymbs - 1; ++i) { + tmp = (i == val) ? 
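/* cdf[] holds inverted cumulative probabilities (AOM_ICDF). Each bin is nudged toward the inverted one-hot CDF of the coded symbol: AOM_ICDF(0) for bins below val, 0 from val upward. The step is (target - cdf[i]) >> rate, and rate grows with the symbol count kept in cdf[nsymbs], so adaptation slows as more symbols are observed. */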
0 : tmp; + if (tmp < cdf[i]) { + cdf[i] -= ((cdf[i] - tmp) >> rate); + } else { + cdf[i] += ((tmp - cdf[i]) >> rate); + } + } + cdf[nsymbs] += (cdf[nsymbs] < 32); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_PROB_H_ diff --git a/libs/libaom/src/aom_dsp/psnr.c b/libs/libaom/src/aom_dsp/psnr.c new file mode 100644 index 000000000..c66dd52d0 --- /dev/null +++ b/libs/libaom/src/aom_dsp/psnr.c @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <math.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/psnr.h" +#include "aom_scale/yv12config.h" + +double aom_sse_to_psnr(double samples, double peak, double sse) { + if (sse > 0.0) { + const double psnr = 10.0 * log10(samples * peak * peak / sse); + return psnr > MAX_PSNR ? MAX_PSNR : psnr; + } else { + return MAX_PSNR; + } +} + +static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, unsigned int *sse, + int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h, uint64_t *sse, int64_t *sum) { + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + int64_t tsum = 0; + uint64_t tsse = 0; + for (int i = 0; i < h; ++i) { + int32_t lsum = 0; + for (int j = 0; j < w; ++j) { + const int diff = a[j] - b[j]; + lsum += diff; + tsse += (uint32_t)(diff * diff); + } + tsum += lsum; + a += a_stride; + b += b_stride; + } + *sum = tsum; + *sse = tsse; +} + +static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, + &sum_long); + *sse = (unsigned int)sse_long; + *sum = (int)sum_long; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + const int dw = width % 16; + const int dh = height % 16; + int64_t total_sse = 0; + unsigned int sse = 0; + int sum = 0; + int x, y; + + if (dw > 0) { + encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw, + height, &sse, &sum); + total_sse += sse; + } + + if (dh > 0) { + encoder_variance(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, width - dw, dh, + &sse, &sum); + total_sse += sse; + } + + for (y = 0; y < height / 16; ++y) { + const uint8_t *pa = a; + const uint8_t *pb = b; + for (x = 0; x < width / 16; ++x) { + aom_mse16x16(pa, a_stride, pb, b_stride, &sse); + total_sse += sse; + + pa += 16; + pb += 16; + } + + a += 16 * a_stride; + b += 16 * b_stride; + } + + return total_sse; +} + +#if
CONFIG_AV1_HIGHBITDEPTH +static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height, unsigned int input_shift) { + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + int64_t total_sse = 0; + int x, y; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + int64_t diff; + diff = (a[x] >> input_shift) - (b[x] >> input_shift); + total_sse += diff * diff; + } + a += a_stride; + b += b_stride; + } + return total_sse; +} + +static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int64_t total_sse = 0; + int x, y; + const int dw = width % 16; + const int dh = height % 16; + unsigned int sse = 0; + int sum = 0; + if (dw > 0) { + encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height, &sse, &sum); + total_sse += sse; + } + if (dh > 0) { + encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh, &sse, &sum); + total_sse += sse; + } + for (y = 0; y < height / 16; ++y) { + const uint8_t *pa = a; + const uint8_t *pb = b; + for (x = 0; x < width / 16; ++x) { + aom_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); + total_sse += sse; + pa += 16; + pb += 16; + } + a += 16 * a_stride; + b += 16 * b_stride; + } + return total_sse; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height) { + return aom_var_2d_u8(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, + width, height) / + (width * height); +} + +uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height) { + return aom_var_2d_u8(a->u_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, width, height) / + (width * height); +} + +uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height) { + return aom_var_2d_u8(a->v_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, width, height) / + (width * height); +} + +int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height) { + return get_sse(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, + b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, + width, height); +} + +int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->y_crop_width == b->y_crop_width); + assert(a->y_crop_height == b->y_crop_height); + + return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, + a->y_crop_width, a->y_crop_height); +} + +int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height) { + return get_sse(a->u_buffer + vstart * a->uv_stride + hstart, a->uv_stride, + b->u_buffer + vstart * b->uv_stride + hstart, b->uv_stride, + width, height); +} + +int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + + return get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} + +int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height) { + return get_sse(a->v_buffer + vstart * a->uv_stride + hstart, 
a->uv_stride, + b->v_buffer + vstart * b->uv_stride + hstart, b->uv_stride, + width, height); +} + +int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + + return get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} + +#if CONFIG_AV1_HIGHBITDEPTH +uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height) { + return aom_var_2d_u16(a->y_buffer + vstart * a->y_stride + hstart, + a->y_stride, width, height) / + (width * height); +} + +uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height) { + return aom_var_2d_u16(a->u_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, width, height) / + (width * height); +} + +uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height) { + return aom_var_2d_u16(a->v_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, width, height) / + (width * height); +} + +int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height) { + return highbd_get_sse( + a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, + b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, width, height); +} + +int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->y_crop_width == b->y_crop_width); + assert(a->y_crop_height == b->y_crop_height); + assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + + return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, + a->y_crop_width, a->y_crop_height); +} + +int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height) { + return highbd_get_sse(a->u_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, + b->u_buffer + vstart * b->uv_stride + hstart, + b->uv_stride, width, height); +} + +int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + + return highbd_get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} + +int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height) { + return highbd_get_sse(a->v_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, + b->v_buffer + vstart * b->uv_stride + hstart, + b->uv_stride, width, height); +} + +int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + + return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int plane, int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + switch (plane) { + 
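/* plane index convention: 0 = Y, 1 = U, 2 = V */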
case 0: return aom_highbd_get_y_sse(a, b); + case 1: return aom_highbd_get_u_sse(a, b); + case 2: return aom_highbd_get_v_sse(a, b); + default: assert(plane >= 0 && plane <= 2); return 0; + } + } else { + switch (plane) { + case 0: return aom_get_y_sse(a, b); + case 1: return aom_get_u_sse(a, b); + case 2: return aom_get_v_sse(a, b); + default: assert(plane >= 0 && plane <= 2); return 0; + } + } +#else + (void)highbd; + switch (plane) { + case 0: return aom_get_y_sse(a, b); + case 1: return aom_get_u_sse(a, b); + case 2: return aom_get_v_sse(a, b); + default: assert(plane >= 0 && plane <= 2); return 0; + } +#endif +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, + uint32_t bit_depth, uint32_t in_bit_depth) { + const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; + const int heights[3] = { a->y_crop_height, a->uv_crop_height, + a->uv_crop_height }; + const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; + const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; + int i; + uint64_t total_sse = 0; + uint32_t total_samples = 0; + const double peak = (double)((1 << in_bit_depth) - 1); + const unsigned int input_shift = bit_depth - in_bit_depth; + + for (i = 0; i < 3; ++i) { + const int w = widths[i]; + const int h = heights[i]; + const uint32_t samples = w * h; + uint64_t sse; + if (a->flags & YV12_FLAG_HIGHBITDEPTH) { + if (input_shift) { + sse = highbd_get_sse_shift(a->buffers[i], a_strides[i], b->buffers[i], + b_strides[i], w, h, input_shift); + } else { + sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i], + b_strides[i], w, h); + } + } else { + sse = get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, + h); + } + psnr->sse[1 + i] = sse; + psnr->samples[1 + i] = samples; + psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); + + total_sse += sse; + total_samples += samples; + } + + psnr->sse[0] = total_sse; + psnr->samples[0] = total_samples; + psnr->psnr[0] = + aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); +} +#endif + +void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, + PSNR_STATS *psnr) { + static const double peak = 255.0; + const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; + const int heights[3] = { a->y_crop_height, a->uv_crop_height, + a->uv_crop_height }; + const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; + const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; + int i; + uint64_t total_sse = 0; + uint32_t total_samples = 0; + + for (i = 0; i < 3; ++i) { + const int w = widths[i]; + const int h = heights[i]; + const uint32_t samples = w * h; + const uint64_t sse = + get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h); + psnr->sse[1 + i] = sse; + psnr->samples[1 + i] = samples; + psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); + + total_sse += sse; + total_samples += samples; + } + + psnr->sse[0] = total_sse; + psnr->samples[0] = total_samples; + psnr->psnr[0] = + aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); +} diff --git a/libs/libaom/src/aom_dsp/psnr.h b/libs/libaom/src/aom_dsp/psnr.h new file mode 100644 index 000000000..7f40b8b57 --- /dev/null +++ b/libs/libaom/src/aom_dsp/psnr.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_PSNR_H_ +#define AOM_AOM_DSP_PSNR_H_ + +#include "aom_scale/yv12config.h" + +#define MAX_PSNR 100.0 + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + double psnr[4]; // total/y/u/v + uint64_t sse[4]; // total/y/u/v + uint32_t samples[4]; // total/y/u/v +} PSNR_STATS; + +/*!\brief Converts SSE to PSNR + * + * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR). + * + * \param[in] samples Number of samples + * \param[in] peak Max sample value + * \param[in] sse Sum of squared errors + */ +double aom_sse_to_psnr(double samples, double peak, double sse); +uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height); +uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height); +uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height); +int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height); +int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height); +int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height); +int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int plane, int highbd); +#if CONFIG_AV1_HIGHBITDEPTH +uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height); +uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height); +uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, + unsigned int bit_depth, unsigned int in_bit_depth); +#endif +void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, + PSNR_STATS *psnr); + +double aom_psnrhvs(const
YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *phvs_y, + double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd); +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AOM_DSP_PSNR_H_ diff --git a/libs/libaom/src/aom_dsp/psnrhvs.c b/libs/libaom/src/aom_dsp/psnrhvs.c new file mode 100644 index 000000000..69a1d99bf --- /dev/null +++ b/libs/libaom/src/aom_dsp/psnrhvs.c @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + * + * This code was originally written by: Gregory Maxwell, at the Daala + * project. + */ + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/psnr.h" +#include "aom_dsp/ssim.h" +#include "aom_ports/system_state.h" + +static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, + int xstride) { + int i, j; + (void)xstride; + aom_fdct8x8(x, y, ystride); + for (i = 0; i < 8; i++) + for (j = 0; j < 8; j++) + *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; +} + +static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, + int xstride) { + int i, j; + (void)xstride; + aom_highbd_fdct8x8(x, y, ystride); + for (i = 0; i < 8; i++) + for (j = 0; j < 8; j++) + *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; +} + +/* Normalized inverse quantization matrix for 8x8 DCT at the point of + * transparency.
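(Here "transparency" means the quantization error sits right at the threshold of visibility. csf_cb420 and csf_cr420 further below are the corresponding tables for the 4:2:0 chroma planes.)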
This is not the JPEG based matrix from the paper, + this one gives a slightly higher MOS agreement.*/ +static const double csf_y[8][8] = { + { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334, + 0.678296995242, 0.466224900598, 0.3265091542 }, + { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963, + 0.868920337363, 0.61280991668, 0.436405793551 }, + { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257, + 0.670882927016, 0.501731932449, 0.372504254596 }, + { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554, + 0.48309405692, 0.380429446972, 0.295774038565 }, + { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676, + 0.352889268808, 0.283006984131, 0.226951348204 }, + { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692, + 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 }, + { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972, + 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 }, + { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565, + 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 } +}; +static const double csf_cb420[8][8] = { + { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788, + 0.898018824055, 0.74725392039, 0.615105596242 }, + { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972, + 1.17428548929, 0.996404342439, 0.830890433625 }, + { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362, + 0.960060382087, 0.849823426169, 0.731221236837 }, + { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099, + 0.751437590932, 0.685398513368, 0.608694761374 }, + { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187, + 0.605503172737, 0.55002013668, 0.495804539034 }, + { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932, + 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 }, + { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368, + 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 }, + { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374, + 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 } +}; +static const double csf_cr420[8][8] = { + { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469, + 0.867069376285, 0.721500455585, 0.593906509971 }, + { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198, + 1.13381474809, 0.962064122248, 0.802254508198 }, + { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848, + 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 }, + { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195, + 0.725539939514, 0.661776842059, 0.587716619023 }, + { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286, + 0.584635025748, 0.531064164893, 0.478717061273 }, + { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514, + 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 }, + { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059, + 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 }, + { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023, + 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 } +}; + +static double convert_score_db(double _score, double _weight, int16_t pix_max) { + assert(_score * _weight >= 0.0); + + if (_weight * _score 
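/* a (near-)zero weighted score counts as lossless; returning early here also avoids taking log10 of (almost) zero below */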
< pix_max * pix_max * 1e-10) return MAX_PSNR; + return 10 * (log10(pix_max * pix_max) - log10(_weight * _score)); +} + +static double calc_psnrhvs(const unsigned char *src, int _systride, + const unsigned char *dst, int _dystride, double _par, + int _w, int _h, int _step, const double _csf[8][8], + uint32_t _shift, int buf_is_hbd, int16_t pix_max, + int luma) { + double ret; + const uint8_t *_src8 = src; + const uint8_t *_dst8 = dst; + const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src); + const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst); + DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]); + DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]); + DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]); + DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]); + double mask[8][8]; + int pixels; + int x; + int y; + float sum1; + float sum2; + float delt; + (void)_par; + ret = pixels = 0; + sum1 = sum2 = delt = 0.0f; + for (y = 0; y < _h; y++) { + for (x = 0; x < _w; x++) { + if (!buf_is_hbd) { + sum1 += _src8[y * _systride + x]; + sum2 += _dst8[y * _dystride + x]; + } else { + sum1 += _src16[y * _systride + x] >> _shift; + sum2 += _dst16[y * _dystride + x] >> _shift; + } + } + } + if (luma) delt = (sum1 - sum2) / (_w * _h); + /*In the PSNR-HVS-M paper[1] the authors describe the construction of + their masking table as "we have used the quantization table for the + color component Y of JPEG [6] that has been also obtained on the + basis of CSF. Note that the values in quantization table JPEG have + been normalized and then squared." Their CSF matrix (from PSNR-HVS) + was also constructed from the JPEG matrices. I can not find any obvious + scheme of normalizing to produce their table, but if I multiply their + CSF by 0.3885746225901003 and square the result I get their masking table. + I have no idea where this constant comes from, but deviating from it + too greatly hurts MOS agreement. + + [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli, + Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking + of DCT basis functions", CD-ROM Proceedings of the Third + International Workshop on Video Processing and Quality Metrics for Consumer + Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p. + + Suggested in aomedia issue#2363: + 0.3885746225901003 is a reciprocal of the maximum coefficient (2.573509) + of the old JPEG based matrix from the paper. Since you are not using that, + divide by actual maximum coefficient. 
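That is exactly what the loop below does: mask[x][y] = (_csf[x][y] / _csf[1][0])^2, with _csf[1][0] the largest entry in each of the three tables.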
*/ + for (x = 0; x < 8; x++) + for (y = 0; y < 8; y++) + mask[x][y] = (_csf[x][y] / _csf[1][0]) * (_csf[x][y] / _csf[1][0]); + for (y = 0; y < _h - 7; y += _step) { + for (x = 0; x < _w - 7; x += _step) { + int i; + int j; + int n = 0; + double s_gx = 0; + double s_gy = 0; + double g = 0; + double s_gmean = 0; + double s_gvar = 0; + double s_mask = 0; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + if (!buf_is_hbd) { + dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)]; + dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)]; + } else { + dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift; + dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift; + } + dct_d[i * 8 + j] += (int)(delt + 0.5f); + } + } + for (i = 1; i < 7; i++) { + for (j = 1; j < 7; j++) { + s_gx = (dct_s[(i - 1) * 8 + j - 1] * 3 - + dct_s[(i - 1) * 8 + j + 1] * 3 + dct_s[i * 8 + j - 1] * 10 - + dct_s[i * 8 + j + 1] * 10 + dct_s[(i + 1) * 8 + j - 1] * 3 - + dct_s[(i + 1) * 8 + j + 1] * 3) / + (pix_max * 16.f); + s_gy = (dct_s[(i - 1) * 8 + j - 1] * 3 - + dct_s[(i + 1) * 8 + j - 1] * 3 + dct_s[(i - 1) * 8 + j] * 10 - + dct_s[(i + 1) * 8 + j] * 10 + dct_s[(i - 1) * 8 + j + 1] * 3 - + dct_s[(i + 1) * 8 + j + 1] * 3) / + (pix_max * 16.f); + g = sqrt(s_gx * s_gx + s_gy * s_gy); + if (g > 0.1f) n++; + s_gmean += g; + } + } + s_gvar = 1.f / (36 - n + 1) * s_gmean / 36.f; + if (!buf_is_hbd) { + od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); + od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); + } else { + hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); + hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); + } + for (i = 0; i < 8; i++) + for (j = (i == 0); j < 8; j++) + s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j]; + s_mask = sqrt(s_mask * s_gvar) / 8.f; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + double err; + err = fabs((double)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j])); + if (i != 0 || j != 0) + err = err < s_mask / mask[i][j] ? 
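/* dead zone from contrast masking: coefficient differences below s_mask / mask[i][j] are treated as invisible */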
0 : err - s_mask / mask[i][j]; + ret += (err * _csf[i][j]) * (err * _csf[i][j]); + pixels++; + } + } + } + } + if (pixels <= 0) return 0; + ret /= pixels; + ret += 0.04 * delt * delt; + return ret; +} + +double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst, + double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs, + uint32_t bd, uint32_t in_bd) { + double psnrhvs; + const double par = 1.0; + const int step = 7; + uint32_t bd_shift = 0; + aom_clear_system_state(); + assert(bd == 8 || bd == 10 || bd == 12); + assert(bd >= in_bd); + assert(src->flags == dst->flags); + const int buf_is_hbd = src->flags & YV12_FLAG_HIGHBITDEPTH; + + int16_t pix_max = 255; + if (in_bd == 10) + pix_max = 1023; + else if (in_bd == 12) + pix_max = 4095; + + bd_shift = bd - in_bd; + + *y_psnrhvs = + calc_psnrhvs(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, + par, src->y_crop_width, src->y_crop_height, step, csf_y, + bd_shift, buf_is_hbd, pix_max, 1); + *u_psnrhvs = + calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + par, src->uv_crop_width, src->uv_crop_height, step, + csf_cb420, bd_shift, buf_is_hbd, pix_max, 0); + *v_psnrhvs = + calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + par, src->uv_crop_width, src->uv_crop_height, step, + csf_cr420, bd_shift, buf_is_hbd, pix_max, 0); + psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs)); + return convert_score_db(psnrhvs, 1.0, pix_max); +} diff --git a/libs/libaom/src/aom_dsp/quantize.c b/libs/libaom/src/aom_dsp/quantize.c new file mode 100644 index 000000000..edd4d9648 --- /dev/null +++ b/libs/libaom/src/aom_dsp/quantize.c @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/quantize.h" +#include "aom_mem/aom_mem.h" +#include "av1/encoder/av1_quantize.h" + +void aom_quantize_b_adaptive_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + int i, non_zero_count = (int)n_coeffs, eob = -1; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + int prescan_add[2]; + for (i = 0; i < 2; ++i) + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? 
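/* per-coefficient quantization-matrix weight; (1 << AOM_QM_BITS) is the identity weight when no QM is supplied */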
qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int prescan_add_val = prescan_add[rc != 0]; + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif // SKIP_EOB_FACTOR_ADJUST + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32; + + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { + int64_t tmp = + clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); + tmp *= wt; + tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); // quantization + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + + if (tmp32) { + eob = i; +#if SKIP_EOB_FACTOR_ADJUST + if (first == -1) first = i; +#endif // SKIP_EOB_FACTOR_ADJUST + } + } + } +#if SKIP_EOB_FACTOR_ADJUST + if (eob >= 0 && first == eob) { + const int rc = scan[eob]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + eob = -1; + } + } + } +#endif // SKIP_EOB_FACTOR_ADJUST + *eob_ptr = eob + 1; +} + +void aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, + const int log_scale) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + int i, non_zero_count = (int)n_coeffs, eob = -1; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS))) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. 
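+ // (Only the first non_zero_count coefficients are visited; the trailing ones pruned by the pre-scan sit inside the zero-bin and would quantize to zero anyway.)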
+ for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32; + + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { + int64_t tmp = + clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); + tmp *= wt; + tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); // quantization + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + + if (tmp32) eob = i; + } + } + *eob_ptr = eob + 1; +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_quantize_b_adaptive_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + (void)iscan; + int i, non_zero_count = (int)n_coeffs, eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + int prescan_add[2]; + for (i = 0; i < 2; ++i) + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int prescan_add_val = prescan_add[rc != 0]; + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif // SKIP_EOB_FACTOR_ADJUST + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + const int64_t tmpw = tmp1 * wt; + const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const qm_val_t iwt = iqm_ptr != NULL ? 
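/* inverse-QM weight, folded into the effective dequantizer just below */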
iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) { + eob = i; +#if SKIP_EOB_FACTOR_ADJUST + if (first == -1) first = eob; +#endif // SKIP_EOB_FACTOR_ADJUST + } + } + } +#if SKIP_EOB_FACTOR_ADJUST + if (eob >= 0 && first == eob) { + const int rc = scan[eob]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + eob = -1; + } + } + } +#endif // SKIP_EOB_FACTOR_ADJUST + *eob_ptr = eob + 1; +} + +void aom_highbd_quantize_b_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + int i, eob = -1; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + int dequant; + int idx_arr[4096]; + (void)iscan; + int idx = 0; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) || + coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS))) + idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + const int64_t tmpw = tmp1 * wt; + const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) eob = idx_arr[i]; + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +/* These functions should only be called when quantisation matrices + are not used. 
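They forward to the *_helper_c routines above with qm_ptr and iqm_ptr set to NULL, so every per-coefficient weight becomes the identity (1 << AOM_QM_BITS).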
*/ +void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, NULL, NULL, 0); +} + +void aom_quantize_b_32x32_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, NULL, NULL, 1); +} + +void aom_quantize_b_64x64_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, NULL, NULL, 2); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_quantize_b_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, + round_ptr, quant_ptr, quant_shift_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 0); +} + +void aom_highbd_quantize_b_32x32_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, + round_ptr, quant_ptr, quant_shift_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 1); +} + +void aom_highbd_quantize_b_64x64_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, + round_ptr, quant_ptr, quant_shift_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 2); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t 
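/* dequantized output */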
*dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 0); +} + +void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 1); +} + +void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 2); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 0); +} + +void aom_highbd_quantize_b_32x32_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 1); +} + +void aom_highbd_quantize_b_64x64_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 2); +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/quantize.h b/libs/libaom/src/aom_dsp/quantize.h new file mode 100644 index 000000000..395631814 --- /dev/null +++ b/libs/libaom/src/aom_dsp/quantize.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_QUANTIZE_H_ +#define AOM_AOM_DSP_QUANTIZE_H_ + +#include "config/aom_config.h" + +#include "aom_dsp/aom_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void aom_quantize_b_adaptive_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale); + +void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void aom_quantize_b_32x32_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void aom_quantize_b_64x64_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_quantize_b_adaptive_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale); + +void aom_highbd_quantize_b_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void aom_highbd_quantize_b_32x32_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void aom_highbd_quantize_b_64x64_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); +#endif // CONFIG_AV1_HIGHBITDEPTH + +void 
aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, + const int log_scale); + +void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_quantize_b_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale); + +void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); +#endif // CONFIG_AV1_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_QUANTIZE_H_ diff --git a/libs/libaom/src/aom_dsp/recenter.h b/libs/libaom/src/aom_dsp/recenter.h new file mode 100644 index 000000000..b3fd41290 --- /dev/null +++ b/libs/libaom/src/aom_dsp/recenter.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AOM_DSP_RECENTER_H_
+#define AOM_AOM_DSP_RECENTER_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+// Inverse recenters a non-negative literal v around a reference r
+static INLINE uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) {
+  if (v > (r << 1))
+    return v;
+  else if ((v & 1) == 0)
+    return (v >> 1) + r;
+  else
+    return r - ((v + 1) >> 1);
+}
+
+// Inverse recenters a non-negative literal v in [0, n-1] around a
+// reference r also in [0, n-1]
+static INLINE uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r,
+                                                  uint16_t v) {
+  if ((r << 1) <= n) {
+    return inv_recenter_nonneg(r, v);
+  } else {
+    return n - 1 - inv_recenter_nonneg(n - 1 - r, v);
+  }
+}
+
+// Recenters a non-negative literal v around a reference r
+static INLINE uint16_t recenter_nonneg(uint16_t r, uint16_t v) {
+  if (v > (r << 1))
+    return v;
+  else if (v >= r)
+    return ((v - r) << 1);
+  else
+    return ((r - v) << 1) - 1;
+}
+
+// Recenters a non-negative literal v in [0, n-1] around a
+// reference r also in [0, n-1]
+static INLINE uint16_t recenter_finite_nonneg(uint16_t n, uint16_t r,
+                                              uint16_t v) {
+  if ((r << 1) <= n) {
+    return recenter_nonneg(r, v);
+  } else {
+    return recenter_nonneg(n - 1 - r, n - 1 - v);
+  }
+}
+
+#endif  // AOM_AOM_DSP_RECENTER_H_
diff --git a/libs/libaom/src/aom_dsp/sad.c b/libs/libaom/src/aom_dsp/sad.c
new file mode 100644
index 000000000..8ddc683d6
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/sad.c
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+
+/* Sum the difference between every corresponding element of the buffers.
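+   (SAD, sum of absolute differences.) For a width x height block this is
+   SAD = sum over y,x of |a[y * a_stride + x] - b[y * b_stride + x]|; for
+   example, two 4x4 blocks that differ by 1 at exactly three positions
+   give a SAD of 3.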
*/ +static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y, x; + unsigned int sad = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + sad += abs(a[x] - b[x]); + } + + a += a_stride; + b += b_stride; + } + return sad; +} + +#define sadMxh(m) \ + unsigned int aom_sad##m##xh_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, int width, \ + int height) { \ + return sad(a, a_stride, b, b_stride, width, height); \ + } + +#define sadMxN(m, n) \ + unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad(src, src_stride, ref, ref_stride, m, n); \ + } \ + unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + uint8_t comp_pred[m * n]; \ + aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \ + return sad(src, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int aom_dist_wtd_sad##m##x##n##_avg_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint8_t comp_pred[m * n]; \ + aom_dist_wtd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, \ + ref_stride, jcp_param); \ + return sad(src, src_stride, comp_pred, m, m, n); \ + } + +// Calculate sad against 4 reference locations and store each in sad_array +#define sadMxNx4D(m, n) \ + void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = \ + aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \ + } \ + } \ + void aom_sad##m##x##n##x4d_avg_c( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \ + int ref_stride, const uint8_t *second_pred, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = aom_sad##m##x##n##_avg_c(src, src_stride, ref_array[i], \ + ref_stride, second_pred); \ + } \ + } + +// 128x128 +sadMxN(128, 128); +sadMxNx4D(128, 128); + +// 128x64 +sadMxN(128, 64); +sadMxNx4D(128, 64); + +// 64x128 +sadMxN(64, 128); +sadMxNx4D(64, 128); + +// 64x64 +sadMxN(64, 64); +sadMxNx4D(64, 64); + +// 64x32 +sadMxN(64, 32); +sadMxNx4D(64, 32); + +// 32x64 +sadMxN(32, 64); +sadMxNx4D(32, 64); + +// 32x32 +sadMxN(32, 32); +sadMxNx4D(32, 32); + +// 32x16 +sadMxN(32, 16); +sadMxNx4D(32, 16); + +// 16x32 +sadMxN(16, 32); +sadMxNx4D(16, 32); + +// 16x16 +sadMxN(16, 16); +sadMxNx4D(16, 16); + +// 16x8 +sadMxN(16, 8); +sadMxNx4D(16, 8); + +// 8x16 +sadMxN(8, 16); +sadMxNx4D(8, 16); + +// 8x8 +sadMxN(8, 8); +sadMxNx4D(8, 8); + +// 8x4 +sadMxN(8, 4); +sadMxNx4D(8, 4); + +// 4x8 +sadMxN(4, 8); +sadMxNx4D(4, 8); + +// 4x4 +sadMxN(4, 4); +sadMxNx4D(4, 4); + +sadMxh(128); +sadMxh(64); +sadMxh(32); +sadMxh(16); +sadMxh(8); +sadMxh(4); + +sadMxN(4, 16); +sadMxNx4D(4, 16); +sadMxN(16, 4); +sadMxNx4D(16, 4); +sadMxN(8, 32); +sadMxNx4D(8, 32); +sadMxN(32, 8); +sadMxNx4D(32, 8); +sadMxN(16, 64); +sadMxNx4D(16, 64); +sadMxN(64, 16); +sadMxNx4D(64, 16); + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { 
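+    // Same row-by-row accumulation as sad() above, but over 16-bit samples:
+    // a8/b8 were reinterpreted via CONVERT_TO_SHORTPTR, so a, b and the
+    // strides applied below are in uint16_t units.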
+ for (x = 0; x < width; x++) { + sad += abs(a[x] - b[x]); + } + + a += a_stride; + b += b_stride; + } + return sad; +} + +static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + sad += abs(a[x] - b[x]); + } + + a += a_stride; + b += b_stride; + } + return sad; +} + +#define highbd_sadMxN(m, n) \ + unsigned int aom_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad(src, src_stride, ref, ref_stride, m, n); \ + } \ + unsigned int aom_highbd_sad##m##x##n##_avg_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + uint16_t comp_pred[m * n]; \ + uint8_t *const comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); \ + aom_highbd_comp_avg_pred(comp_pred8, second_pred, m, n, ref, ref_stride); \ + return highbd_sadb(src, src_stride, comp_pred8, m, m, n); \ + } \ + unsigned int aom_highbd_dist_wtd_sad##m##x##n##_avg_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t comp_pred[m * n]; \ + uint8_t *const comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); \ + aom_highbd_dist_wtd_comp_avg_pred(comp_pred8, second_pred, m, n, ref, \ + ref_stride, jcp_param); \ + return highbd_sadb(src, src_stride, comp_pred8, m, m, n); \ + } + +#define highbd_sadMxNx4D(m, n) \ + void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \ + ref_array[i], ref_stride); \ + } \ + } + +// 128x128 +highbd_sadMxN(128, 128); +highbd_sadMxNx4D(128, 128); + +// 128x64 +highbd_sadMxN(128, 64); +highbd_sadMxNx4D(128, 64); + +// 64x128 +highbd_sadMxN(64, 128); +highbd_sadMxNx4D(64, 128); + +// 64x64 +highbd_sadMxN(64, 64); +highbd_sadMxNx4D(64, 64); + +// 64x32 +highbd_sadMxN(64, 32); +highbd_sadMxNx4D(64, 32); + +// 32x64 +highbd_sadMxN(32, 64); +highbd_sadMxNx4D(32, 64); + +// 32x32 +highbd_sadMxN(32, 32); +highbd_sadMxNx4D(32, 32); + +// 32x16 +highbd_sadMxN(32, 16); +highbd_sadMxNx4D(32, 16); + +// 16x32 +highbd_sadMxN(16, 32); +highbd_sadMxNx4D(16, 32); + +// 16x16 +highbd_sadMxN(16, 16); +highbd_sadMxNx4D(16, 16); + +// 16x8 +highbd_sadMxN(16, 8); +highbd_sadMxNx4D(16, 8); + +// 8x16 +highbd_sadMxN(8, 16); +highbd_sadMxNx4D(8, 16); + +// 8x8 +highbd_sadMxN(8, 8); +highbd_sadMxNx4D(8, 8); + +// 8x4 +highbd_sadMxN(8, 4); +highbd_sadMxNx4D(8, 4); + +// 4x8 +highbd_sadMxN(4, 8); +highbd_sadMxNx4D(4, 8); + +// 4x4 +highbd_sadMxN(4, 4); +highbd_sadMxNx4D(4, 4); + +highbd_sadMxN(4, 16); +highbd_sadMxNx4D(4, 16); +highbd_sadMxN(16, 4); +highbd_sadMxNx4D(16, 4); +highbd_sadMxN(8, 32); +highbd_sadMxNx4D(8, 32); +highbd_sadMxN(32, 8); +highbd_sadMxNx4D(32, 8); +highbd_sadMxN(16, 64); +highbd_sadMxNx4D(16, 64); +highbd_sadMxN(64, 16); +highbd_sadMxNx4D(64, 16); +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/sad_av1.c b/libs/libaom/src/aom_dsp/sad_av1.c new file mode 100644 index 000000000..467518163 --- /dev/null +++ b/libs/libaom/src/aom_dsp/sad_av1.c @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2017, Alliance for Open 
Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+
+// SAD between src and the 6-bit mask blend of a and b, where
+// AOM_BLEND_A64(m, a, b) = (m * a + (64 - m) * b + 32) >> 6.
+static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride,
+                                      const uint8_t *a, int a_stride,
+                                      const uint8_t *b, int b_stride,
+                                      const uint8_t *m, int m_stride, int width,
+                                      int height) {
+  int y, x;
+  unsigned int sad = 0;
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) {
+      const int16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
+      sad += abs(pred - src[x]);
+    }
+    src += src_stride;
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+  return sad;
+}
+
+#define MASKSADMxN(m, n)                                                       \
+  unsigned int aom_masked_sad##m##x##n##_c(                                    \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
+      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,          \
+      int invert_mask) {                                                       \
+    if (!invert_mask)                                                          \
+      return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \
+                        msk_stride, m, n);                                     \
+    else                                                                       \
+      return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \
+                        msk_stride, m, n);                                     \
+  }                                                                            \
+  void aom_masked_sad##m##x##n##x4d_c(                                         \
+      const uint8_t *src, int src_stride, const uint8_t *ref[],                \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,          \
+      int msk_stride, int invert_mask, unsigned sads[]) {                      \
+    if (!invert_mask)                                                          \
+      for (int i = 0; i < 4; i++) {                                            \
+        sads[i] = masked_sad(src, src_stride, ref[i], ref_stride, second_pred, \
+                             m, msk, msk_stride, m, n);                        \
+      }                                                                        \
+    else                                                                       \
+      for (int i = 0; i < 4; i++) {                                            \
+        sads[i] = masked_sad(src, src_stride, second_pred, m, ref[i],          \
+                             ref_stride, msk, msk_stride, m, n);               \
+      }                                                                        \
+  }
+
+/* clang-format off */
+MASKSADMxN(128, 128)
+MASKSADMxN(128, 64)
+MASKSADMxN(64, 128)
+MASKSADMxN(64, 64)
+MASKSADMxN(64, 32)
+MASKSADMxN(32, 64)
+MASKSADMxN(32, 32)
+MASKSADMxN(32, 16)
+MASKSADMxN(16, 32)
+MASKSADMxN(16, 16)
+MASKSADMxN(16, 8)
+MASKSADMxN(8, 16)
+MASKSADMxN(8, 8)
+MASKSADMxN(8, 4)
+MASKSADMxN(4, 8)
+MASKSADMxN(4, 4)
+MASKSADMxN(4, 16)
+MASKSADMxN(16, 4)
+MASKSADMxN(8, 32)
+MASKSADMxN(32, 8)
+MASKSADMxN(16, 64)
+MASKSADMxN(64, 16)
+/* clang-format on */
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE
+    unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride,
+                                   const uint8_t *a8, int a_stride,
+                                   const uint8_t *b8, int b_stride,
+                                   const uint8_t *m, int m_stride, int width,
+                                   int height) {
+  int y, x;
+  unsigned int sad = 0;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) {
+      const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
+      sad += abs(pred - src[x]);
+    }
+
+    src += src_stride;
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+
+  return sad;
+}
+
+#define HIGHBD_MASKSADMXN(m, n)                                  \
+  unsigned int aom_highbd_masked_sad##m##x##n##_c(               \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8,  \
+      int ref_stride, const uint8_t *second_pred8, const 
uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + if (!invert_mask) \ + return highbd_masked_sad(src8, src_stride, ref8, ref_stride, \ + second_pred8, m, msk, msk_stride, m, n); \ + else \ + return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \ + ref_stride, msk, msk_stride, m, n); \ + } + +HIGHBD_MASKSADMXN(128, 128) +HIGHBD_MASKSADMXN(128, 64) +HIGHBD_MASKSADMXN(64, 128) +HIGHBD_MASKSADMXN(64, 64) +HIGHBD_MASKSADMXN(64, 32) +HIGHBD_MASKSADMXN(32, 64) +HIGHBD_MASKSADMXN(32, 32) +HIGHBD_MASKSADMXN(32, 16) +HIGHBD_MASKSADMXN(16, 32) +HIGHBD_MASKSADMXN(16, 16) +HIGHBD_MASKSADMXN(16, 8) +HIGHBD_MASKSADMXN(8, 16) +HIGHBD_MASKSADMXN(8, 8) +HIGHBD_MASKSADMXN(8, 4) +HIGHBD_MASKSADMXN(4, 8) +HIGHBD_MASKSADMXN(4, 4) +HIGHBD_MASKSADMXN(4, 16) +HIGHBD_MASKSADMXN(16, 4) +HIGHBD_MASKSADMXN(8, 32) +HIGHBD_MASKSADMXN(32, 8) +HIGHBD_MASKSADMXN(16, 64) +HIGHBD_MASKSADMXN(64, 16) +#endif // CONFIG_AV1_HIGHBITDEPTH + +// pre: predictor being evaluated +// wsrc: target weighted prediction (has been *4096 to keep precision) +// mask: 2d weights (scaled by 4096) +static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int width, int height) { + int y, x; + unsigned int sad = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); + + pre += pre_stride; + wsrc += width; + mask += width; + } + + return sad; +} + +#define OBMCSADMxN(m, n) \ + unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *mask) { \ + return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ + } + +/* clang-format off */ +OBMCSADMxN(128, 128) +OBMCSADMxN(128, 64) +OBMCSADMxN(64, 128) +OBMCSADMxN(64, 64) +OBMCSADMxN(64, 32) +OBMCSADMxN(32, 64) +OBMCSADMxN(32, 32) +OBMCSADMxN(32, 16) +OBMCSADMxN(16, 32) +OBMCSADMxN(16, 16) +OBMCSADMxN(16, 8) +OBMCSADMxN(8, 16) +OBMCSADMxN(8, 8) +OBMCSADMxN(8, 4) +OBMCSADMxN(4, 8) +OBMCSADMxN(4, 4) +OBMCSADMxN(4, 16) +OBMCSADMxN(16, 4) +OBMCSADMxN(8, 32) +OBMCSADMxN(32, 8) +OBMCSADMxN(16, 64) +OBMCSADMxN(64, 16) +/* clang-format on */ + +#if CONFIG_AV1_HIGHBITDEPTH + static INLINE + unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); + + pre += pre_stride; + wsrc += width; + mask += width; + } + + return sad; +} + +#define HIGHBD_OBMCSADMXN(m, n) \ + unsigned int aom_highbd_obmc_sad##m##x##n##_c( \ + const uint8_t *ref, int ref_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ + } + +/* clang-format off */ +HIGHBD_OBMCSADMXN(128, 128) +HIGHBD_OBMCSADMXN(128, 64) +HIGHBD_OBMCSADMXN(64, 128) +HIGHBD_OBMCSADMXN(64, 64) +HIGHBD_OBMCSADMXN(64, 32) +HIGHBD_OBMCSADMXN(32, 64) +HIGHBD_OBMCSADMXN(32, 32) +HIGHBD_OBMCSADMXN(32, 16) +HIGHBD_OBMCSADMXN(16, 32) +HIGHBD_OBMCSADMXN(16, 16) +HIGHBD_OBMCSADMXN(16, 8) +HIGHBD_OBMCSADMXN(8, 16) +HIGHBD_OBMCSADMXN(8, 8) +HIGHBD_OBMCSADMXN(8, 4) +HIGHBD_OBMCSADMXN(4, 8) +HIGHBD_OBMCSADMXN(4, 4) +HIGHBD_OBMCSADMXN(4, 16) +HIGHBD_OBMCSADMXN(16, 4) +HIGHBD_OBMCSADMXN(8, 32) +HIGHBD_OBMCSADMXN(32, 8) +HIGHBD_OBMCSADMXN(16, 64) +HIGHBD_OBMCSADMXN(64, 16) +/* clang-format on */ +#endif // 
CONFIG_AV1_HIGHBITDEPTH
diff --git a/libs/libaom/src/aom_dsp/simd/v128_intrinsics.h b/libs/libaom/src/aom_dsp/simd/v128_intrinsics.h
new file mode 100644
index 000000000..218a7a618
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/simd/v128_intrinsics.h
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/simd/v128_intrinsics_c.h"
+#include "aom_dsp/simd/v64_intrinsics.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v128 v128;
+
+SIMD_INLINE uint32_t v128_low_u32(v128 a) { return c_v128_low_u32(a); }
+SIMD_INLINE v64 v128_low_v64(v128 a) { return c_v128_low_v64(a); }
+SIMD_INLINE v64 v128_high_v64(v128 a) { return c_v128_high_v64(a); }
+SIMD_INLINE v128 v128_from_64(uint64_t hi, uint64_t lo) {
+  return c_v128_from_64(hi, lo);
+}
+SIMD_INLINE v128 v128_from_v64(v64 hi, v64 lo) {
+  return c_v128_from_v64(hi, lo);
+}
+SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  return c_v128_from_32(a, b, c, d);
+}
+
+SIMD_INLINE v128 v128_load_unaligned(const void *p) {
+  return c_v128_load_unaligned(p);
+}
+SIMD_INLINE v128 v128_load_aligned(const void *p) {
+  return c_v128_load_aligned(p);
+}
+
+SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
+  c_v128_store_unaligned(p, a);
+}
+SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
+  c_v128_store_aligned(p, a);
+}
+
+SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
+  return c_v128_align(a, b, c);
+}
+
+SIMD_INLINE v128 v128_zero(void) { return c_v128_zero(); }
+SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); }
+SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); }
+SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); }
+SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); }
+
+SIMD_INLINE c_sad128_internal v128_sad_u8_init(void) {
+  return c_v128_sad_u8_init();
+}
+SIMD_INLINE c_sad128_internal v128_sad_u8(c_sad128_internal s, v128 a,
+                                          v128 b) {
+  return c_v128_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v128_sad_u8_sum(c_sad128_internal s) {
+  return c_v128_sad_u8_sum(s);
+}
+SIMD_INLINE c_ssd128_internal v128_ssd_u8_init(void) {
+  return c_v128_ssd_u8_init();
+}
+SIMD_INLINE c_ssd128_internal v128_ssd_u8(c_ssd128_internal s, v128 a,
+                                          v128 b) {
+  return c_v128_ssd_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v128_ssd_u8_sum(c_ssd128_internal s) {
+  return c_v128_ssd_u8_sum(s);
+}
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+  return c_v128_dotp_su8(a, b);
+}
+SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
+  return c_v128_dotp_s16(a, b);
+}
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+  return c_v128_dotp_s32(a, b);
+}
+SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); }
+
+SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); }
+SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return c_v128_xor(a, b); }
+SIMD_INLINE v128 v128_and(v128 a, v128 b) { return 
c_v128_and(a, b); } +SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); } + +SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); } +SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); } +SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return c_v128_sadd_u8(a, b); } +SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return c_v128_sadd_s8(a, b); } +SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); } +SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); } +SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return c_v128_add_64(a, b); } +SIMD_INLINE v128 v128_padd_u8(v128 a) { return c_v128_padd_u8(a); } +SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); } +SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); } +SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); } +SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return c_v128_ssub_s8(a, b); } +SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); } +SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); } +SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); } +SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); } +SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return c_v128_sub_64(a, b); } +SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); } +SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); } + +SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); } +SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { + return c_v128_mullo_s16(a, b); +} +SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { + return c_v128_mulhi_s16(a, b); +} +SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { + return c_v128_mullo_s32(a, b); +} +SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); } +SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); } + +SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return c_v128_movemask_8(a); } +SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { + return c_v128_blend_8(a, b, c); +} + +SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); } +SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); } +SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) { + return c_v128_rdavg_u16(a, b); +} +SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); } +SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); } +SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); } +SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); } +SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); } +SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); } +SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); } +SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { return c_v128_min_s32(a, b); } +SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { return c_v128_max_s32(a, b); } + +SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); } +SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); } +SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { return c_v128_ziplo_16(a, b); } +SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { return c_v128_ziphi_16(a, b); } +SIMD_INLINE v128 
v128_ziplo_32(v128 a, v128 b) { return c_v128_ziplo_32(a, b); } +SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { return c_v128_ziphi_32(a, b); } +SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { return c_v128_ziplo_64(a, b); } +SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { return c_v128_ziphi_64(a, b); } +SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return c_v128_zip_8(a, b); } +SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return c_v128_zip_16(a, b); } +SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return c_v128_zip_32(a, b); } +SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) { + return c_v128_unziplo_8(a, b); +} +SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) { + return c_v128_unziphi_8(a, b); +} +SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) { + return c_v128_unziplo_16(a, b); +} +SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) { + return c_v128_unziphi_16(a, b); +} +SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) { + return c_v128_unziplo_32(a, b); +} +SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) { + return c_v128_unziphi_32(a, b); +} +SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { return c_v128_unpack_u8_s16(a); } +SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { + return c_v128_unpacklo_u8_s16(a); +} +SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { + return c_v128_unpackhi_u8_s16(a); +} +SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { return c_v128_unpack_s8_s16(a); } +SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { + return c_v128_unpacklo_s8_s16(a); +} +SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { + return c_v128_unpackhi_s8_s16(a); +} +SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { + return c_v128_pack_s32_s16(a, b); +} +SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { + return c_v128_pack_s32_u16(a, b); +} +SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { + return c_v128_pack_s16_u8(a, b); +} +SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { + return c_v128_pack_s16_s8(a, b); +} +SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { return c_v128_unpack_u16_s32(a); } +SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { return c_v128_unpack_s16_s32(a); } +SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { + return c_v128_unpacklo_u16_s32(a); +} +SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { + return c_v128_unpacklo_s16_s32(a); +} +SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { + return c_v128_unpackhi_u16_s32(a); +} +SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { + return c_v128_unpackhi_s16_s32(a); +} +SIMD_INLINE v128 v128_shuffle_8(v128 a, v128 pattern) { + return c_v128_shuffle_8(a, pattern); +} + +SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return c_v128_cmpgt_s8(a, b); } +SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return c_v128_cmplt_s8(a, b); } +SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return c_v128_cmpeq_8(a, b); } +SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) { + return c_v128_cmpgt_s16(a, b); +} +SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { + return c_v128_cmplt_s16(a, b); +} +SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); } + +SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) { + return c_v128_cmpgt_s32(a, b); +} +SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) { + return c_v128_cmplt_s32(a, b); +} +SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return c_v128_cmpeq_32(a, b); } + +SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { + return c_v128_shl_8(a, c); +} +SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { + return c_v128_shr_u8(a, c); +} +SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned 
int c) { + return c_v128_shr_s8(a, c); +} +SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { + return c_v128_shl_16(a, c); +} +SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { + return c_v128_shr_u16(a, c); +} +SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { + return c_v128_shr_s16(a, c); +} +SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { + return c_v128_shl_32(a, c); +} +SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { + return c_v128_shr_u32(a, c); +} +SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { + return c_v128_shr_s32(a, c); +} +SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { + return c_v128_shl_64(a, c); +} +SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { + return c_v128_shr_u64(a, c); +} +SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { + return c_v128_shr_s64(a, c); +} + +SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { + return c_v128_shr_n_byte(a, n); +} +SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { + return c_v128_shl_n_byte(a, n); +} +SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int n) { + return c_v128_shl_n_8(a, n); +} +SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int n) { + return c_v128_shl_n_16(a, n); +} +SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) { + return c_v128_shl_n_32(a, n); +} +SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int n) { + return c_v128_shl_n_64(a, n); +} +SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int n) { + return c_v128_shr_n_u8(a, n); +} +SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int n) { + return c_v128_shr_n_u16(a, n); +} +SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) { + return c_v128_shr_n_u32(a, n); +} +SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int n) { + return c_v128_shr_n_u64(a, n); +} +SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) { + return c_v128_shr_n_s8(a, n); +} +SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int n) { + return c_v128_shr_n_s16(a, n); +} +SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) { + return c_v128_shr_n_s32(a, n); +} +SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int n) { + return c_v128_shr_n_s64(a, n); +} + +typedef uint32_t sad128_internal_u16; +SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { + return c_v128_sad_u16_init(); +} +SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, + v128 b) { + return c_v128_sad_u16(s, a, b); +} +SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { + return c_v128_sad_u16_sum(s); +} + +typedef uint64_t ssd128_internal_s16; +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { + return c_v128_ssd_s16_init(); +} +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a, + v128 b) { + return c_v128_ssd_s16(s, a, b); +} +SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { + return c_v128_ssd_s16_sum(s); +} + +#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ diff --git a/libs/libaom/src/aom_dsp/simd/v128_intrinsics_arm.h b/libs/libaom/src/aom_dsp/simd/v128_intrinsics_arm.h new file mode 100644 index 000000000..2d497f4c0 --- /dev/null +++ b/libs/libaom/src/aom_dsp/simd/v128_intrinsics_arm.h @@ -0,0 +1,973 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/simd/v64_intrinsics_arm.h"
+
+typedef int64x2_t v128;
+
+SIMD_INLINE uint32_t v128_low_u32(v128 a) {
+  return v64_low_u32(vget_low_s64(a));
+}
+
+SIMD_INLINE v64 v128_low_v64(v128 a) { return vget_low_s64(a); }
+
+SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); }
+
+SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }
+
+SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
+  return vcombine_s64((int64x1_t)b, (int64x1_t)a);
+}
+
+SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  return vcombine_s64(v64_from_32(c, d), v64_from_32(a, b));
+}
+
+SIMD_INLINE v128 v128_load_aligned(const void *p) {
+  return vreinterpretq_s64_u8(vld1q_u8((const uint8_t *)p));
+}
+
+SIMD_INLINE v128 v128_load_unaligned(const void *p) {
+  return v128_load_aligned(p);
+}
+
+SIMD_INLINE void v128_store_aligned(void *p, v128 r) {
+  vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
+}
+
+SIMD_INLINE void v128_store_unaligned(void *p, v128 r) {
+  vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
+}
+
+SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
+// The following functions require an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+  return c ? vreinterpretq_s64_s8(
+                 vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c))
+           : b;
+#else
+  return c < 8 ? 
v128_from_v64(v64_align(v128_low_v64(a), v128_high_v64(b), c), + v64_align(v128_high_v64(b), v128_low_v64(b), c)) + : v128_from_v64( + v64_align(v128_high_v64(a), v128_low_v64(a), c - 8), + v64_align(v128_low_v64(a), v128_high_v64(b), c - 8)); +#endif +} + +SIMD_INLINE v128 v128_zero(void) { return vreinterpretq_s64_u8(vdupq_n_u8(0)); } + +SIMD_INLINE v128 v128_ones(void) { + return vreinterpretq_s64_u8(vdupq_n_u8(-1)); +} + +SIMD_INLINE v128 v128_dup_8(uint8_t x) { + return vreinterpretq_s64_u8(vdupq_n_u8(x)); +} + +SIMD_INLINE v128 v128_dup_16(uint16_t x) { + return vreinterpretq_s64_u16(vdupq_n_u16(x)); +} + +SIMD_INLINE v128 v128_dup_32(uint32_t x) { + return vreinterpretq_s64_u32(vdupq_n_u32(x)); +} + +SIMD_INLINE v128 v128_dup_64(uint64_t x) { + return vreinterpretq_s64_u64(vdupq_n_u64(x)); +} + +SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { + int16x8_t t1 = vmulq_s16( + vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))), + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(b))))); + int16x8_t t2 = vmulq_s16( + vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))), + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(b))))); +#if defined(__aarch64__) + return vaddlvq_s16(t1) + vaddlvq_s16(t2); +#else + int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2))); + return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t); +#endif +} + +SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { + return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) + + v64_dotp_s16(vget_low_s64(a), vget_low_s64(b)); +} + +SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) { + int64x2_t t = vpaddlq_s32( + vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); + return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t); +} + +SIMD_INLINE uint64_t v128_hadd_u8(v128 x) { +#if defined(__aarch64__) + return vaddlvq_u8(vreinterpretq_u8_s64(x)); +#else + uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x)))); + return vget_lane_s32( + vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0); +#endif +} + +SIMD_INLINE v128 v128_padd_s16(v128 a) { + return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a))); +} + +SIMD_INLINE v128 v128_padd_u8(v128 a) { + return vreinterpretq_s64_u16(vpaddlq_u8(vreinterpretq_u8_s64(a))); +} + +typedef struct { + sad64_internal hi, lo; +} sad128_internal; + +SIMD_INLINE sad128_internal v128_sad_u8_init(void) { + sad128_internal s; + s.hi = s.lo = vdupq_n_u16(0); + return s; +} + +/* Implementation dependent return value. Result must be finalised with + v128_sad_u8_sum(). + The result for more than 32 v128_sad_u8() calls is undefined. */ +SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { + sad128_internal r; + r.hi = v64_sad_u8(s.hi, vget_high_s64(a), vget_high_s64(b)); + r.lo = v64_sad_u8(s.lo, vget_low_s64(a), vget_low_s64(b)); + return r; +} + +SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { +#if defined(__aarch64__) + return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo); +#else + uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo))); + return (uint32_t)(uint64_t)(vget_high_u64(t) + vget_low_u64(t)); +#endif +} + +typedef struct { + ssd64_internal hi, lo; +} ssd128_internal; + +SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) { + ssd128_internal s; + s.hi = s.lo = v64_ssd_u8_init(); + return s; +} + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_u8_sum(). 
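+ * Typical accumulate-then-finalise usage is (a sketch; p and q are
+ * illustrative names for two 16-byte pixel buffers):
+ *   ssd128_internal s = v128_ssd_u8_init();
+ *   s = v128_ssd_u8(s, v128_load_unaligned(p), v128_load_unaligned(q));
+ *   uint32_t ssd = v128_ssd_u8_sum(s);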
*/ +SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { + ssd128_internal r; + r.hi = v64_ssd_u8(s.hi, vget_high_s64(a), vget_high_s64(b)); + r.lo = v64_ssd_u8(s.lo, vget_low_s64(a), vget_low_s64(b)); + return r; +} + +SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) { + return (uint32_t)(v64_ssd_u8_sum(s.hi) + v64_ssd_u8_sum(s.lo)); +} + +SIMD_INLINE v128 v128_or(v128 x, v128 y) { return vorrq_s64(x, y); } + +SIMD_INLINE v128 v128_xor(v128 x, v128 y) { return veorq_s64(x, y); } + +SIMD_INLINE v128 v128_and(v128 x, v128 y) { return vandq_s64(x, y); } + +SIMD_INLINE v128 v128_andn(v128 x, v128 y) { return vbicq_s64(x, y); } + +SIMD_INLINE v128 v128_add_8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_sadd_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_sadd_s8(v128 x, v128 y) { + return vreinterpretq_s64_s8( + vqaddq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_add_16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_sadd_s16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vqaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_add_32(v128 x, v128 y) { + return vreinterpretq_s64_u32( + vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y))); +} + +SIMD_INLINE v128 v128_add_64(v128 x, v128 y) { + return vreinterpretq_s64_u64( + vaddq_u64(vreinterpretq_u64_s64(x), vreinterpretq_u64_s64(y))); +} + +SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_sub_16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_ssub_u16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vqsubq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); +} + +SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_ssub_s8(v128 x, v128 y) { + return vreinterpretq_s64_s8( + vqsubq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) { + return vreinterpretq_s64_s32( + vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_sub_64(v128 x, v128 y) { return vsubq_s64(x, y); } + +SIMD_INLINE v128 v128_abs_s16(v128 x) { + return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x))); +} + +SIMD_INLINE v128 v128_abs_s8(v128 x) { + return vreinterpretq_s64_s8(vabsq_s8(vreinterpretq_s8_s64(x))); +} + +SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { + return vreinterpretq_s64_s32( + vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b))); +} + +SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { + return vreinterpretq_s64_s16( + vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b))); +} + +SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { +#if defined(__aarch64__) + return vreinterpretq_s64_s16(vuzp2q_s16( + 
vreinterpretq_s16_s32(vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)), + vreinterpret_s16_s64(vget_low_s64(b)))), + vreinterpretq_s16_s32( + vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b))))); +#else + return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)), + v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b))); +#endif +} + +SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { + return vreinterpretq_s64_s32( + vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); +} + +SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { +#if defined(__aarch64__) + int32x4_t t1 = vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)), + vreinterpret_s16_s64(vget_low_s64(b))); + int32x4_t t2 = + vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)); + return vreinterpretq_s64_s32(vpaddq_s32(t1, t2)); +#else + return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)), + v64_madd_s16(vget_low_s64(a), vget_low_s64(b))); +#endif +} + +SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { +#if defined(__aarch64__) + int16x8_t t1 = vmulq_s16( + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))), + vmovl_s8(vreinterpret_s8_s64(vget_low_s64(b)))); + int16x8_t t2 = vmulq_s16( + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))), + vmovl_s8(vreinterpret_s8_s64(vget_high_s64(b)))); + return vreinterpretq_s64_s16( + vqaddq_s16(vuzp1q_s16(t1, t2), vuzp2q_s16(t1, t2))); +#else + return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)), + v64_madd_us8(vget_low_s64(a), vget_low_s64(b))); +#endif +} + +SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vrhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_rdavg_u16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); +} + +SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); +} + +SIMD_INLINE v128 v128_min_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vminq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_max_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vmaxq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) { + return vreinterpretq_s64_s8( + vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE uint32_t v128_movemask_8(v128 a) { + a = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(0))); +#if defined(__aarch64__) + uint8x16_t m = + vandq_u8(vreinterpretq_u8_s64(a), + vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))); + return vaddv_u8(vget_low_u8(m)) + (vaddv_u8(vget_high_u8(m)) << 8); +#else + uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8( + vandq_u8(vreinterpretq_u8_s64(a), + vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)))))); + return v64_low_u32( + v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m))); +#endif +} + +SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { + c = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(c), vdupq_n_s8(0))); + return v128_or(v128_and(b, c), v128_andn(a, c)); +} + +SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) { + return vreinterpretq_s64_s8( + vmaxq_s8(vreinterpretq_s8_s64(x), 
vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_min_s16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vminq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_min_s32(v128 x, v128 y) { + return vreinterpretq_s64_s32( + vminq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_max_s32(v128 x, v128 y) { + return vreinterpretq_s64_s32( + vmaxq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u8( + vzip1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); +#else + uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); + return vreinterpretq_s64_u8(r.val[0]); +#endif +} + +SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u8( + vzip2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); +#else + uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); + return vreinterpretq_s64_u8(r.val[1]); +#endif +} + +SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) { + uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpretq_s64_u8(vcombine_u8(r.val[0], r.val[1])); +} + +SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u16( + vzip1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); +#else + int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x)); + return vreinterpretq_s64_s16(r.val[0]); +#endif +} + +SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u16( + vzip2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); +#else + int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x)); + return vreinterpretq_s64_s16(r.val[1]); +#endif +} + +SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) { + uint16x4x2_t r = vzip_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); + return vreinterpretq_s64_u16(vcombine_u16(r.val[0], r.val[1])); +} + +SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u32( + vzip1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); +#else + int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x)); + return vreinterpretq_s64_s32(r.val[0]); +#endif +} + +SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u32( + vzip2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); +#else + int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x)); + return vreinterpretq_s64_s32(r.val[1]); +#endif +} + +SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) { + uint32x2x2_t r = vzip_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)); + return vreinterpretq_s64_u32(vcombine_u32(r.val[0], r.val[1])); +} + +SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { + return v128_from_v64(vget_low_s64((int64x2_t)a), vget_low_s64((int64x2_t)b)); +} + +SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { + return v128_from_v64(vget_high_s64((int64x2_t)a), + vget_high_s64((int64x2_t)b)); +} + +SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u8( + vuzp1q_u8(vreinterpretq_u8_s64(y), 
vreinterpretq_u8_s64(x))); +#else + uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); + return vreinterpretq_s64_u8(r.val[0]); +#endif +} + +SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u8( + vuzp2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); +#else + uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); + return vreinterpretq_s64_u8(r.val[1]); +#endif +} + +SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u16( + vuzp1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); +#else + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)); + return vreinterpretq_s64_u16(r.val[0]); +#endif +} + +SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u16( + vuzp2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); +#else + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)); + return vreinterpretq_s64_u16(r.val[1]); +#endif +} + +SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u32( + vuzp1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); +#else + uint32x4x2_t r = + vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)); + return vreinterpretq_s64_u32(r.val[0]); +#endif +} + +SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u32( + vuzp2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); +#else + uint32x4x2_t r = + vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)); + return vreinterpretq_s64_u32(r.val[1]); +#endif +} + +SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { + return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(a))); +} + +SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { + return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))); +} + +SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { + return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))); +} + +SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { + return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(a))); +} + +SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { + return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a)))); +} + +SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { + return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a)))); +} + +SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { + return v128_from_v64( + vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))), + vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b)))); +} + +SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { + return v128_from_v64( + vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(a))), + vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(b)))); +} + +SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { + return v128_from_v64( + vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))), + vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(b)))); +} + +SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { + return v128_from_v64( + vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(a))), + vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(b)))); +} + +SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { + return vreinterpretq_s64_u32(vmovl_u16(vreinterpret_u16_s64(a))); +} + +SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { + return 
vreinterpretq_s64_s32(vmovl_s16(vreinterpret_s16_s64(a))); +} + +SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { + return vreinterpretq_s64_u32( + vmovl_u16(vreinterpret_u16_s64(vget_low_s64(a)))); +} + +SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { + return vreinterpretq_s64_s32( + vmovl_s16(vreinterpret_s16_s64(vget_low_s64(a)))); +} + +SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { + return vreinterpretq_s64_u32( + vmovl_u16(vreinterpret_u16_s64(vget_high_s64(a)))); +} + +SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { + return vreinterpretq_s64_s32( + vmovl_s16(vreinterpret_s16_s64(vget_high_s64(a)))); +} + +SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { +#if defined(__aarch64__) + return vreinterpretq_s64_u8( + vqtbl1q_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(pattern))); +#else + uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)), + vget_high_u8(vreinterpretq_u8_s64(x)) } }; + return v128_from_64((uint64_t)vreinterpret_s64_u8(vtbl2_u8( + p, vreinterpret_u8_s64(vget_high_s64(pattern)))), + (uint64_t)vreinterpret_s64_u8(vtbl2_u8( + p, vreinterpret_u8_s64(vget_low_s64(pattern))))); +#endif +} + +SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vcgtq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_cmplt_s8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vcltq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_cmpeq_8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vceqq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_cmpgt_s16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vcgtq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_cmplt_s16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vcltq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_cmpgt_s32(v128 x, v128 y) { + return vreinterpretq_s64_u32( + vcgtq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_cmplt_s32(v128 x, v128 y) { + return vreinterpretq_s64_u32( + vcltq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_cmpeq_32(v128 x, v128 y) { + return vreinterpretq_s64_u32( + vceqq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { + return (c > 7) ? v128_zero() + : vreinterpretq_s64_u8( + vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(c))); +} + +SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { + return (c > 7) ? v128_zero() + : vreinterpretq_s64_u8( + vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(-c))); +} + +SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { + return (c > 7) ? v128_ones() + : vreinterpretq_s64_s8( + vshlq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(-c))); +} + +SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { + return (c > 15) ? v128_zero() + : vreinterpretq_s64_u16( + vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(c))); +} + +SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { + return (c > 15) ? v128_zero() + : vreinterpretq_s64_u16( + vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(-c))); +} + +SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { + return (c > 15) ? 
v128_ones() + : vreinterpretq_s64_s16( + vshlq_s16(vreinterpretq_s16_s64(a), vdupq_n_s16(-c))); +} + +SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { + return (c > 31) ? v128_zero() + : vreinterpretq_s64_u32( + vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(c))); +} + +SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { + return (c > 31) ? v128_zero() + : vreinterpretq_s64_u32( + vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(-c))); +} + +SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { + return (c > 31) ? v128_ones() + : vreinterpretq_s64_s32( + vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c))); +} + +SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { + return (c > 63) ? v128_zero() + : vreinterpretq_s64_u64( + vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(c))); +} + +SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { + return (c > 63) ? v128_zero() + : vreinterpretq_s64_u64( + vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(-c))); +} + +SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { + return (c > 63) ? v128_ones() : vshlq_s64(a, vdupq_n_s64(-c)); +} + +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) + +SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { + return n < 8 + ? v128_from_64( + (uint64_t)vorr_u64( + vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), + n * 8), + vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), + (8 - n) * 8)), + (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), + n * 8)) + : (n == 8 ? v128_from_64( + (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0) + : v128_from_64((uint64_t)vshl_n_u64( + vreinterpret_u64_s64(vget_low_s64(a)), + (n - 8) * 8), + 0)); +} + +SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { + return n == 0 + ? a + : (n < 8 + ? v128_from_64( + (uint64_t)vshr_n_u64( + vreinterpret_u64_s64(vget_high_s64(a)), n * 8), + (uint64_t)vorr_u64( + vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), + n * 8), + vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), + (8 - n) * 8))) + : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64( + vget_high_s64(a))) + : v128_from_64(0, (uint64_t)vshr_n_u64( + vreinterpret_u64_s64( + vget_high_s64(a)), + (n - 8) * 8)))); +} + +SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c)) : a; +} + +SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c)) : a; +} + +SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c)) : a; +} + +SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c)) + : a; +} + +SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c)) + : a; +} + +SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c)) + : a; +} + +SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c)) + : a; +} + +SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c)) + : a; +} + +SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) { + return c ? 
vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c)) + : a; +} + +SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c)) + : a; +} + +SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c)) + : a; +} + +SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) { + return c ? vshrq_n_s64(a, c) : a; +} + +#else + +SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { + if (n < 8) + return v128_from_v64(v64_or(v64_shl_n_byte(v128_high_v64(a), n), + v64_shr_n_byte(v128_low_v64(a), 8 - n)), + v64_shl_n_byte(v128_low_v64(a), n)); + else + return v128_from_v64(v64_shl_n_byte(v128_low_v64(a), n - 8), v64_zero()); +} + +SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { + if (n < 8) + return v128_from_v64(v64_shr_n_byte(v128_high_v64(a), n), + v64_or(v64_shr_n_byte(v128_low_v64(a), n), + v64_shl_n_byte(v128_high_v64(a), 8 - n))); + else + return v128_from_v64(v64_zero(), v64_shr_n_byte(v128_high_v64(a), n - 8)); +} + +SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) { + return v128_shl_8(a, c); +} + +SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) { + return v128_shr_u8(a, c); +} + +SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) { + return v128_shr_s8(a, c); +} + +SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) { + return v128_shl_16(a, c); +} + +SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) { + return v128_shr_u16(a, c); +} + +SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) { + return v128_shr_s16(a, c); +} + +SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) { + return v128_shl_32(a, c); +} + +SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) { + return v128_shr_u32(a, c); +} + +SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) { + return v128_shr_s32(a, c); +} + +SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) { + return v128_shl_64(a, c); +} + +SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) { + return v128_shr_u64(a, c); +} + +SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) { + return v128_shr_s64(a, c); +} + +#endif + +typedef uint32x4_t sad128_internal_u16; + +SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { + return vdupq_n_u32(0); +} + +/* Implementation dependent return value. Result must be finalised with + * v128_sad_u16_sum(). */ +SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, + v128 b) { + return vaddq_u32( + s, vpaddlq_u16(vsubq_u16( + vmaxq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)), + vminq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b))))); +} + +SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { + uint64x2_t t = vpaddlq_u32(s); + return (uint32_t)(uint64_t)vget_high_u64(t) + + (uint32_t)(uint64_t)vget_low_u64(t); +} + +typedef v128 ssd128_internal_s16; +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_s16_sum(). 
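+ *
+ * A sketch of the intended init/accumulate/sum usage (the buffers a and b
+ * and the count n are assumptions, not part of this API):
+ *
+ *   ssd128_internal_s16 acc = v128_ssd_s16_init();
+ *   for (unsigned int i = 0; i < n; i++)  // a, b: int16_t[8 * n]
+ *     acc = v128_ssd_s16(acc, v128_load_unaligned(a + 8 * i),
+ *                        v128_load_unaligned(b + 8 * i));
+ *   uint64_t ssd = v128_ssd_s16_sum(acc);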
*/
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+ v128 b) {
+ v128 d = v128_sub_16(a, b);
+ d = v128_madd_s16(d, d);
+ return v128_add_64(
+ s, vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s64(d))));
+}
+
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+ return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
+}
+
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
diff --git a/libs/libaom/src/aom_dsp/simd/v128_intrinsics_c.h b/libs/libaom/src/aom_dsp/simd/v128_intrinsics_c.h
new file mode 100644
index 000000000..466a41e10
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/simd/v128_intrinsics_c.h
@@ -0,0 +1,903 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/simd/v64_intrinsics_c.h"
+
+typedef union {
+ uint8_t u8[16];
+ uint16_t u16[8];
+ uint32_t u32[4];
+ uint64_t u64[2];
+ int8_t s8[16];
+ int16_t s16[8];
+ int32_t s32[4];
+ int64_t s64[2];
+ c_v64 v64[2];
+} c_v128;
+
+SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; }
+
+SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; }
+
+SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; }
+
+SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) {
+ c_v128 t;
+ t.u64[1] = hi;
+ t.u64[0] = lo;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) {
+ c_v128 t;
+ t.v64[1] = hi;
+ t.v64[0] = lo;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c,
+ uint32_t d) {
+ c_v128 t;
+ t.u32[3] = a;
+ t.u32[2] = b;
+ t.u32[1] = c;
+ t.u32[0] = d;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) {
+ c_v128 t;
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&t;
+ int c;
+ for (c = 0; c < 16; c++) q[c] = pp[c];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) {
+ if (SIMD_CHECK && (uintptr_t)p & 15) {
+ fprintf(stderr, "Error: unaligned v128 load at %p\n", p);
+ abort();
+ }
+ return c_v128_load_unaligned(p);
+}
+
+SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&a;
+ int c;
+ for (c = 0; c < 16; c++) pp[c] = q[c];
+}
+
+SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) {
+ if (SIMD_CHECK && (uintptr_t)p & 15) {
+ fprintf(stderr, "Error: unaligned v128 store at %p\n", p);
+ abort();
+ }
+ c_v128_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v128 c_v128_zero(void) {
+ c_v128 t;
+ t.u64[1] = t.u64[0] = 0;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) {
+ c_v128 t;
+ t.v64[1] = t.v64[0] = c_v64_dup_8(x);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) {
+ c_v128 t;
+ t.v64[1] = t.v64[0] = c_v64_dup_16(x);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) {
+ c_v128 t;
+ t.v64[1] = t.v64[0] = c_v64_dup_32(x);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) {
+ c_v128 t;
+
t.u64[1] = t.u64[0] = x; + return t; +} + +SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) { + return c_v64_dotp_su8(a.v64[1], b.v64[1]) + + c_v64_dotp_su8(a.v64[0], b.v64[0]); +} + +SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) { + return c_v64_dotp_s16(a.v64[1], b.v64[1]) + + c_v64_dotp_s16(a.v64[0], b.v64[0]); +} + +SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) { + // 32 bit products, 64 bit sum + return (int64_t)(int32_t)((int64_t)a.s32[3] * b.s32[3]) + + (int64_t)(int32_t)((int64_t)a.s32[2] * b.s32[2]) + + (int64_t)(int32_t)((int64_t)a.s32[1] * b.s32[1]) + + (int64_t)(int32_t)((int64_t)a.s32[0] * b.s32[0]); +} + +SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) { + return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]); +} + +typedef struct { + uint32_t val; + int count; +} c_sad128_internal; + +SIMD_INLINE c_sad128_internal c_v128_sad_u8_init(void) { + c_sad128_internal t; + t.val = t.count = 0; + return t; +} + +/* Implementation dependent return value. Result must be finalised with + * v128_sad_u8_sum(). The result for more than 32 v128_sad_u8() calls is + * undefined. */ +SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a, + c_v128 b) { + int c; + for (c = 0; c < 16; c++) + s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; + s.count++; + if (SIMD_CHECK && s.count > 32) { + fprintf(stderr, + "Error: sad called 32 times returning an undefined result\n"); + abort(); + } + return s; +} + +SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s.val; } + +typedef uint32_t c_ssd128_internal; + +SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init(void) { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_u8_sum(). */ +SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a, + c_v128 b) { + int c; + for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); + return s; +} + +SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; } + +SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]), + c_v64_or(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]), + c_v64_xor(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]), + c_v64_and(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]), + c_v64_andn(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]), + c_v64_add_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]), + c_v64_add_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]), + c_v64_sadd_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]), + c_v64_sadd_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]), + c_v64_sadd_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]), + 
c_v64_add_32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) {
+ // Two's complement overflow (silences sanitizers)
+ return c_v128_from_64(
+ a.v64[1].u64 > ~b.v64[1].u64 ? a.v64[1].u64 - ~b.v64[1].u64 - 1
+ : a.v64[1].u64 + b.v64[1].u64,
+ a.v64[0].u64 > ~b.v64[0].u64 ? a.v64[0].u64 - ~b.v64[0].u64 - 1
+ : a.v64[0].u64 + b.v64[0].u64);
+}
+
+SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
+ c_v128 t;
+ t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
+ t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
+ t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
+ t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) {
+ c_v128 t;
+ t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1];
+ t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3];
+ t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5];
+ t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7];
+ t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9];
+ t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11];
+ t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13];
+ t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
+ c_v64_sub_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]),
+ c_v64_ssub_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]),
+ c_v64_ssub_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]),
+ c_v64_sub_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]),
+ c_v64_ssub_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]),
+ c_v64_ssub_u16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]),
+ c_v64_sub_32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) {
+ // Two's complement underflow (silences sanitizers)
+ return c_v128_from_64(
+ a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1
+ : a.v64[1].u64 - b.v64[1].u64,
+ a.v64[0].u64 < b.v64[0].u64 ?
a.v64[0].u64 + ~b.v64[0].u64 + 1 + : a.v64[0].u64 - b.v64[0].u64); +} + +SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) { + return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) { + return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) { + c_v64 lo_bits = c_v64_mullo_s16(a, b); + c_v64 hi_bits = c_v64_mulhi_s16(a, b); + return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits), + c_v64_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]), + c_v64_mullo_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]), + c_v64_mulhi_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]), + c_v64_mullo_s32(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]), + c_v64_madd_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]), + c_v64_madd_us8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]), + c_v64_avg_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]), + c_v64_rdavg_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]), + c_v64_rdavg_u16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]), + c_v64_avg_u16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]), + c_v64_min_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]), + c_v64_max_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]), + c_v64_min_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) { + return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) | + ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) | + ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) | + ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) | + ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) | + ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) | + ((a.s8[0] < 0) << 0); +} + +SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) { + c_v128 t; + for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? 
b.u8[i] : a.u8[i]; + return t; +} + +SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]), + c_v64_max_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]), + c_v64_min_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]), + c_v64_max_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c]; + return t; +} + +SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c]; + return t; +} + +SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]), + c_v64_ziplo_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]), + c_v64_ziplo_8(a.v64[1], b.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]), + c_v64_ziplo_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]), + c_v64_ziplo_16(a.v64[1], b.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]), + c_v64_ziplo_32(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]), + c_v64_ziplo_32(a.v64[1], b.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) { + return c_v128_from_v64(a.v64[0], b.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) { + return c_v128_from_v64(a.v64[1], b.v64[1]); +} + +SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) { + return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b)); +} + +SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) { + return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b)); +} + +SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) { + return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b)); +} + +SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) { + c_v128 t; + if (mode) { + t.u8[15] = b.u8[15]; + t.u8[14] = b.u8[13]; + t.u8[13] = b.u8[11]; + t.u8[12] = b.u8[9]; + t.u8[11] = b.u8[7]; + t.u8[10] = b.u8[5]; + t.u8[9] = b.u8[3]; + t.u8[8] = b.u8[1]; + t.u8[7] = a.u8[15]; + t.u8[6] = a.u8[13]; + t.u8[5] = a.u8[11]; + t.u8[4] = a.u8[9]; + t.u8[3] = a.u8[7]; + t.u8[2] = a.u8[5]; + t.u8[1] = a.u8[3]; + t.u8[0] = a.u8[1]; + } else { + t.u8[15] = a.u8[14]; + t.u8[14] = a.u8[12]; + t.u8[13] = a.u8[10]; + t.u8[12] = a.u8[8]; + t.u8[11] = a.u8[6]; + t.u8[10] = a.u8[4]; + t.u8[9] = a.u8[2]; + t.u8[8] = a.u8[0]; + t.u8[7] = b.u8[14]; + t.u8[6] = b.u8[12]; + t.u8[5] = b.u8[10]; + t.u8[4] = b.u8[8]; + t.u8[3] = b.u8[6]; + t.u8[2] = b.u8[4]; + t.u8[1] = b.u8[2]; + t.u8[0] = b.u8[0]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1) + : _c_v128_unzip_8(a, b, 0); +} + +SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? 
_c_v128_unzip_8(b, a, 0) + : _c_v128_unzip_8(b, a, 1); +} + +SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) { + c_v128 t; + if (mode) { + t.u16[7] = b.u16[7]; + t.u16[6] = b.u16[5]; + t.u16[5] = b.u16[3]; + t.u16[4] = b.u16[1]; + t.u16[3] = a.u16[7]; + t.u16[2] = a.u16[5]; + t.u16[1] = a.u16[3]; + t.u16[0] = a.u16[1]; + } else { + t.u16[7] = a.u16[6]; + t.u16[6] = a.u16[4]; + t.u16[5] = a.u16[2]; + t.u16[4] = a.u16[0]; + t.u16[3] = b.u16[6]; + t.u16[2] = b.u16[4]; + t.u16[1] = b.u16[2]; + t.u16[0] = b.u16[0]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1) + : _c_v128_unzip_16(a, b, 0); +} + +SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0) + : _c_v128_unzip_16(b, a, 1); +} + +SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) { + c_v128 t; + if (mode) { + t.u32[3] = b.u32[3]; + t.u32[2] = b.u32[1]; + t.u32[1] = a.u32[3]; + t.u32[0] = a.u32[1]; + } else { + t.u32[3] = a.u32[2]; + t.u32[2] = a.u32[0]; + t.u32[1] = b.u32[2]; + t.u32[0] = b.u32[0]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1) + : _c_v128_unzip_32(a, b, 0); +} + +SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0) + : _c_v128_unzip_32(b, a, 1); +} + +SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a)); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]), + c_v64_unpacklo_u8_s16(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]), + c_v64_unpacklo_u8_s16(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a)); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]), + c_v64_unpacklo_s8_s16(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]), + c_v64_unpacklo_s8_s16(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]), + c_v64_pack_s32_s16(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]), + c_v64_pack_s32_u16(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]), + c_v64_pack_s16_u8(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]), + c_v64_pack_s16_s8(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a)); +} + +SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a)); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]), + c_v64_unpacklo_u16_s32(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) { + return 
c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]), + c_v64_unpacklo_s16_s32(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]), + c_v64_unpacklo_u16_s32(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]), + c_v64_unpacklo_s16_s32(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) { + c_v128 t; + int c; + for (c = 0; c < 16; c++) + t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15) + : pattern.u8[c] & 15]; + + return t; +} + +SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]), + c_v64_cmpgt_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]), + c_v64_cmplt_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]), + c_v64_cmpeq_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]), + c_v64_cmpgt_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]), + c_v64_cmplt_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]), + c_v64_cmpeq_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]); + return t; +} + +SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]); + return t; +} + +SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]); + return t; +} + +SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) { + if (n == 0) return a; + if (n < 8) + return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n), + c_v64_shr_n_byte(a.v64[0], 8 - n)), + c_v64_shl_n_byte(a.v64[0], n)); + else + return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero()); +} + +SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) { + if (n == 0) return a; + if (n < 8) + return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n), + c_v64_or(c_v64_shr_n_byte(a.v64[0], n), + c_v64_shl_n_byte(a.v64[1], 8 - n))); + else + return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8)); +} + +SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) { + if (SIMD_CHECK && c > 15) { + fprintf(stderr, "Error: undefined alignment %d\n", c); + abort(); + } + return c ? 
c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c)) + : b; +} + +SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c), + c_v64_shr_u16(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c), + c_v64_shr_s16(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c), + c_v64_shr_u32(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c), + c_v64_shr_s32(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) { + a.v64[1].u64 <<= c; + a.v64[0].u64 <<= c; + return c_v128_from_v64(a.v64[1], a.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) { + a.v64[1].u64 >>= c; + a.v64[0].u64 >>= c; + return c_v128_from_v64(a.v64[1], a.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) { + a.v64[1].s64 >>= c; + a.v64[0].s64 >>= c; + return c_v128_from_v64(a.v64[1], a.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) { + return c_v128_shl_8(a, n); +} + +SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) { + return c_v128_shl_16(a, n); +} + +SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) { + return c_v128_shl_32(a, n); +} + +SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) { + return c_v128_shl_64(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) { + return c_v128_shr_u8(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) { + return c_v128_shr_u16(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) { + return c_v128_shr_u32(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) { + return c_v128_shr_u64(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) { + return c_v128_shr_s8(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) { + return c_v128_shr_s16(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) { + return c_v128_shr_s32(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) { + return c_v128_shr_s64(a, n); +} + +typedef uint32_t c_sad128_internal_u16; + +SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init(void) { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v128_sad_u16_sum(). 
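+ *
+ * Each call accumulates the absolute differences of eight uint16 lanes.
+ * An illustrative row-wise sketch (r0, r1, rows and stride are
+ * assumptions, not part of this API):
+ *
+ *   sad128_internal_u16 acc = v128_sad_u16_init();
+ *   for (int i = 0; i < rows; i++)  // r0, r1: rows of 8 uint16_t each
+ *     acc = v128_sad_u16(acc, v128_load_unaligned(r0 + i * stride),
+ *                        v128_load_unaligned(r1 + i * stride));
+ *   uint32_t sad = v128_sad_u16_sum(acc);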
*/
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s,
+ c_v128 a, c_v128 b) {
+ int c;
+ for (c = 0; c < 8; c++)
+ s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; }
+
+typedef uint64_t c_ssd128_internal_s16;
+
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init(void) { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s,
+ c_v128 a, c_v128 b) {
+ int c;
+ for (c = 0; c < 8; c++)
+ s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
+ (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
+ return s;
+}
+
+SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; }
+
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
diff --git a/libs/libaom/src/aom_dsp/simd/v128_intrinsics_x86.h b/libs/libaom/src/aom_dsp/simd/v128_intrinsics_x86.h
new file mode 100644
index 000000000..c404015ef
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/simd/v128_intrinsics_x86.h
@@ -0,0 +1,657 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
+
+#include <stdint.h>
+#include "aom_dsp/simd/v64_intrinsics_x86.h"
+
+typedef __m128i v128;
+
+SIMD_INLINE uint32_t v128_low_u32(v128 a) {
+ return (uint32_t)_mm_cvtsi128_si32(a);
+}
+
+SIMD_INLINE v64 v128_low_v64(v128 a) {
+ return _mm_unpacklo_epi64(a, v64_zero());
+}
+
+SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); }
+
+SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
+ return _mm_unpacklo_epi64(b, a);
+}
+
+SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
+ return v128_from_v64(v64_from_64(a), v64_from_64(b));
+}
+
+SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+ return _mm_set_epi32(a, b, c, d);
+}
+
+SIMD_INLINE v128 v128_load_aligned(const void *p) {
+ return _mm_load_si128((__m128i *)p);
+}
+
+SIMD_INLINE v128 v128_load_unaligned(const void *p) {
+#if defined(__SSSE3__)
+ return _mm_lddqu_si128((__m128i *)p);
+#else
+ return _mm_loadu_si128((__m128i *)p);
+#endif
+}
+
+SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
+ _mm_store_si128((__m128i *)p, a);
+}
+
+SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
+ _mm_storeu_si128((__m128i *)p, a);
+}
+
+// The following function requires an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+#if defined(__SSSE3__)
+SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
+ return c ? _mm_alignr_epi8(a, b, c) : b;
+}
+#else
+#define v128_align(a, b, c) \
+ ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
+#endif
+#else
+#if defined(__SSSE3__)
+#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
+#else
+#define v128_align(a, b, c) \
+ ((c) ?
_mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b)) +#endif +#endif + +SIMD_INLINE v128 v128_zero() { return _mm_setzero_si128(); } + +SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8(x); } + +SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); } + +SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); } + +SIMD_INLINE v128 v128_dup_64(uint64_t x) { + // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers + return _mm_set_epi32((uint32_t)(x >> 32), (uint32_t)x, (uint32_t)(x >> 32), + (uint32_t)x); +} + +SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); } + +SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); } + +SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); } + +SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); } + +SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); } + +SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); } + +SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); } + +SIMD_INLINE v128 v128_padd_s16(v128 a) { + return _mm_madd_epi16(a, _mm_set1_epi16(1)); +} + +SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); } + +SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); } + +SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); } + +SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); } + +SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); } + +SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); } + +SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); } + +SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); } + +SIMD_INLINE v128 v128_abs_s16(v128 a) { +#if defined(__SSSE3__) + return _mm_abs_epi16(a); +#else + return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a)); +#endif +} + +SIMD_INLINE v128 v128_abs_s8(v128 a) { +#if defined(__SSSE3__) + return _mm_abs_epi8(a); +#else + v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128()); + return _mm_xor_si128(sign, _mm_add_epi8(a, sign)); +#endif +} + +SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { + return _mm_unpacklo_epi8(b, a); +} + +SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { + return _mm_unpackhi_epi8(b, a); +} + +SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { + return _mm_unpacklo_epi16(b, a); +} + +SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { + return _mm_unpackhi_epi16(b, a); +} + +SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { + return _mm_unpacklo_epi32(b, a); +} + +SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { + return _mm_unpackhi_epi32(b, a); +} + +SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { + return _mm_unpacklo_epi64(b, a); +} + +SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { + return _mm_unpackhi_epi64(b, a); +} + +SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); } + +SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); } + +SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); } + +SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) { + return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8)); +} + +SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) { +#if defined(__SSSE3__) +#ifdef __x86_64__ + v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL); +#else + v128 order = 
_mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200); +#endif + return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order), + _mm_shuffle_epi8(a, order)); +#else + return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1)); +#endif +} + +SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) { + return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)); +} + +SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) { +#if defined(__SSSE3__) +#ifdef __x86_64__ + v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL); +#else + v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100); +#endif + return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order), + _mm_shuffle_epi8(a, order)); +#else + return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2)); +#endif +} + +SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) { + return _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1))); +} + +SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) { + return _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0))); +} + +SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { + return _mm_unpackhi_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { + return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); +} + +SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { + return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); +} + +SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { + return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); +} + +SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { + return _mm_packs_epi32(b, a); +} + +SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_packus_epi32(b, a); +#else + return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)), + v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b))); +#endif +} + +SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { + return _mm_packus_epi16(b, a); +} + +SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { + return _mm_packs_epi16(b, a); +} + +SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { + return _mm_unpacklo_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { + return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); +} + +SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { + return _mm_unpacklo_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { + return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); +} + +SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { + return _mm_unpackhi_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { + return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); +} + +SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(x, pattern); +#else + v128 output; + unsigned char *input = (unsigned char *)&x; + unsigned char *index = (unsigned char *)&pattern; + char *selected = (char *)&output; + int counter; + + for (counter = 0; counter < 16; counter++) { + selected[counter] = input[index[counter] & 15]; + } + + return output; +#endif +} + +SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { + v128 t1 = _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b)); + v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b)); + 
v128 t = v128_add_32(t1, t2);
+ t = v128_add_32(t, _mm_srli_si128(t, 8));
+ t = v128_add_32(t, _mm_srli_si128(t, 4));
+ return (int32_t)v128_low_u32(t);
+}
+
+SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
+ v128 r = _mm_madd_epi16(a, b);
+#if defined(__SSE4_1__) && defined(__x86_64__)
+ v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r),
+ _mm_cvtepi32_epi64(_mm_srli_si128(r, 8)));
+ return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8)));
+#else
+ return (int64_t)_mm_cvtsi128_si32(r) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
+#endif
+}
+
+SIMD_INLINE uint64_t v128_hadd_u8(v128 a) {
+ v128 t = _mm_sad_epu8(a, _mm_setzero_si128());
+ return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t));
+}
+
+typedef v128 sad128_internal;
+
+SIMD_INLINE sad128_internal v128_sad_u8_init() { return _mm_setzero_si128(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ v128_sad_u8_sum().
+ The result for more than 32 v128_sad_u8() calls is undefined. */
+SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
+ return _mm_add_epi64(s, _mm_sad_epu8(a, b));
+}
+
+SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
+ return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
+}
+
+typedef int32_t ssd128_internal;
+
+SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_ssd_u8_sum(). */
+SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
+ v128 z = _mm_setzero_si128();
+ v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z));
+ v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z));
+ v128 rl = _mm_madd_epi16(l, l);
+ v128 rh = _mm_madd_epi16(h, h);
+ v128 r = _mm_add_epi32(rl, rh);
+ r = _mm_add_epi32(r, _mm_srli_si128(r, 8));
+ r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
+ return s + _mm_cvtsi128_si32(r);
+}
+
+SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; }
+
+SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); }
+
+SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); }
+
+SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); }
+
+SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); }
+
+SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
+ v64 lo_bits = v64_mullo_s16(a, b);
+ v64 hi_bits = v64_mulhi_s16(a, b);
+ return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits),
+ v64_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
+ return _mm_mullo_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
+ return _mm_mulhi_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_mullo_epi32(a, b);
+#else
+ return _mm_unpacklo_epi32(
+ _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8),
+ _mm_shuffle_epi32(
+ _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8));
+#endif
+}
+
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+ v128 r = v128_mullo_s32(a, b);
+ return (int64_t)_mm_cvtsi128_si32(r) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
+}
+
+SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }
+
+SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b)
{ +#if defined(__SSSE3__) + return _mm_maddubs_epi16(a, b); +#else + return _mm_packs_epi32( + _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), + _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)), + _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()), + _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8))); +#endif +} + +SIMD_INLINE v128 v128_padd_u8(v128 a) { + return v128_madd_us8(a, _mm_set1_epi8(1)); +} + +SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); } + +SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { + return _mm_sub_epi8(_mm_avg_epu8(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1))); +} + +SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) { + return _mm_sub_epi16(_mm_avg_epu16(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1))); +} + +SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); } + +SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); } + +SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); } + +SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_min_epi8(a, b); +#else + v128 mask = _mm_cmplt_epi8(a, b); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); } + +SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { +#if defined(__SSE4_1__) + return _mm_blendv_epi8(a, b, c); +#else + c = _mm_cmplt_epi8(c, v128_zero()); + return v128_or(v128_and(b, c), v128_andn(a, c)); +#endif +} + +SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_max_epi8(a, b); +#else + v128 mask = _mm_cmplt_epi8(b, a); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); } + +SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); } + +SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_min_epi32(a, b); +#else + v128 mask = _mm_cmplt_epi32(a, b); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_max_epi32(a, b); +#else + v128 mask = _mm_cmplt_epi32(b, a); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); } + +SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); } + +SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); } + +SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) { + return _mm_cmpgt_epi16(a, b); +} + +SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { + return _mm_cmplt_epi16(a, b); +} + +SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); } + +SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) { + return _mm_cmpgt_epi32(a, b); +} + +SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) { + return _mm_cmplt_epi32(a, b); +} + +SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); } + +SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)), + _mm_sll_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)), + _mm_srl_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v128 
v128_shr_s8(v128 a, unsigned int c) { + __m128i x = _mm_cvtsi32_si128(c + 8); + return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x), + _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x)); +} + +SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { + return _mm_sll_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { + return _mm_srl_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { + return _mm_sra_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { + return _mm_sll_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { + return _mm_srl_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { + return _mm_sra_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { + return _mm_sll_epi64(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { + return _mm_srl_epi64(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { + // _mm_sra_epi64 is missing in gcc? + return v128_from_64((int64_t)v64_u64(v128_high_v64(a)) >> c, + (int64_t)v64_u64(v128_low_v64(a)) >> c); + // return _mm_sra_epi64(a, _mm_cvtsi32_si128(c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127) +#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127) +#define v128_shl_n_8(a, c) \ + _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c)) +#define v128_shr_n_u8(a, c) \ + _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c)) +#define v128_shr_n_s8(a, c) \ + _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \ + _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8)) +#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c) +#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c) +#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c) +#define v128_shl_n_32(a, c) _mm_slli_epi32(a, c) +#define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c) +#define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c) +#define v128_shl_n_64(a, c) _mm_slli_epi64(a, c) +#define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c) +#define v128_shr_n_s64(a, c) \ + v128_shr_s64(a, c) // _mm_srai_epi64 missing in gcc? + +typedef v128 sad128_internal_u16; + +SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return v128_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v128_sad_u16_sum(). */ +SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, + v128 b) { +#if defined(__SSE4_1__) + v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b)); +#else + v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)), + v128_xor(b, v128_dup_16(32768))); + t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)), + v128_or(v128_and(a, t), v128_andn(b, t))); +#endif + return v128_add_32( + s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t))); +} + +SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { + return v128_low_u32(s) + v128_low_u32(v128_shr_n_byte(s, 4)) + + v128_low_u32(v128_shr_n_byte(s, 8)) + + v128_low_u32(v128_shr_n_byte(s, 12)); +} + +typedef v128 ssd128_internal_s16; + +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_s16_sum(). 
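+ *
+ * For instance, the sum of squared errors of an 8x8 int16 residual could
+ * be accumulated row by row (src, pred and stride are assumptions, not
+ * part of this API):
+ *
+ *   ssd128_internal_s16 acc = v128_ssd_s16_init();
+ *   for (int r = 0; r < 8; r++)  // one v128 holds a row of 8 int16_t
+ *     acc = v128_ssd_s16(acc, v128_load_unaligned(src + r * stride),
+ *                        v128_load_unaligned(pred + r * stride));
+ *   uint64_t sse = v128_ssd_s16_sum(acc);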
*/
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+                                             v128 b) {
+  v128 d = v128_sub_16(a, b);
+  d = v128_madd_s16(d, d);
+  return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()),
+                                    _mm_unpacklo_epi32(d, v128_zero())));
+}
+
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+  return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
+}
+
+#endif  // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
diff --git a/libs/libaom/src/aom_dsp/simd/v256_intrinsics.h b/libs/libaom/src/aom_dsp/simd/v256_intrinsics.h
new file mode 100644
index 000000000..17e36eed6
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/simd/v256_intrinsics.h
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/simd/v256_intrinsics_c.h"
+#include "aom_dsp/simd/v128_intrinsics.h"
+#include "aom_dsp/simd/v64_intrinsics.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v256 v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); }
+SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); }
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return c_v256_low_u64(a); }
+SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); }
+SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); }
+SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
+  return c_v256_from_v128(hi, lo);
+}
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+  return c_v256_from_64(a, b, c, d);
+}
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+  return c_v256_from_v64(a, b, c, d);
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+  return c_v256_load_unaligned(p);
+}
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+  return c_v256_load_aligned(p);
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+  c_v256_store_unaligned(p, a);
+}
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+  c_v256_store_aligned(p, a);
+}
+
+SIMD_INLINE v256 v256_align(v256 a, v256 b, unsigned int c) {
+  return c_v256_align(a, b, c);
+}
+
+SIMD_INLINE v256 v256_zero(void) { return c_v256_zero(); }
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
+SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
+SIMD_INLINE v256 v256_dup_64(uint64_t x) { return c_v256_dup_64(x); }
+
+SIMD_INLINE c_sad256_internal v256_sad_u8_init(void) {
+  return c_v256_sad_u8_init();
+}
+SIMD_INLINE c_sad256_internal v256_sad_u8(c_sad256_internal s, v256 a, v256 b) {
+  return c_v256_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v256_sad_u8_sum(c_sad256_internal s) {
+  return c_v256_sad_u8_sum(s);
+}
+SIMD_INLINE c_ssd256_internal v256_ssd_u8_init(void) {
+  return c_v256_ssd_u8_init();
+}
+SIMD_INLINE c_ssd256_internal v256_ssd_u8(c_ssd256_internal s, v256 a, v256 b) {
+  return c_v256_ssd_u8(s, a, b);
+}
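+/* Example (illustrative only, not part of upstream libaom): the SAD and
+   SSD accumulators above follow an init / accumulate / finalise contract.
+   A hypothetical 32-wide block SAD written against these wrappers, with h
+   assumed small enough for the documented call-count limit:
+
+     static uint32_t sad_32xh(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride, int h) {
+       c_sad256_internal s = v256_sad_u8_init();
+       for (int i = 0; i < h; i++)
+         s = v256_sad_u8(s, v256_load_unaligned(src + i * src_stride),
+                         v256_load_unaligned(ref + i * ref_stride));
+       return v256_sad_u8_sum(s);  // finalise; s alone is not the SAD
+     }
+*/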
+SIMD_INLINE uint32_t v256_ssd_u8_sum(c_ssd256_internal s) { + return c_v256_ssd_u8_sum(s); +} + +SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16_init(void) { + return c_v256_ssd_s16_init(); +} +SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16(c_ssd256_internal_s16 s, v256 a, + v256 b) { + return c_v256_ssd_s16(s, a, b); +} +SIMD_INLINE uint64_t v256_ssd_s16_sum(c_ssd256_internal_s16 s) { + return c_v256_ssd_s16_sum(s); +} + +SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { + return c_v256_dotp_su8(a, b); +} +SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { + return c_v256_dotp_s16(a, b); +} +SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { + return c_v256_dotp_s32(a, b); +} +SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); } + +SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); } +SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); } +SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); } +SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); } + +SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); } +SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); } +SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return c_v256_sadd_s8(a, b); } +SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return c_v256_sadd_u8(a, b); } +SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); } +SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); } +SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return c_v256_add_64(a, b); } +SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return c_v256_sub_64(a, b); } +SIMD_INLINE v256 v256_padd_u8(v256 a) { return c_v256_padd_u8(a); } +SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); } +SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); } +SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); } +SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); } +SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); } +SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); } +SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return c_v256_ssub_u16(a, b); } +SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); } +SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); } +SIMD_INLINE v256 v256_abs_s8(v256 a) { return c_v256_abs_s8(a); } + +SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); } +SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { + return c_v256_mullo_s16(a, b); +} +SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { + return c_v256_mulhi_s16(a, b); +} +SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { + return c_v256_mullo_s32(a, b); +} +SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); } +SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); } + +SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return c_v256_movemask_8(a); } +SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { + return c_v256_blend_8(a, b, c); +} + +SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); } +SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); } +SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { + return c_v256_rdavg_u16(a, b); +} +SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); } +SIMD_INLINE v256 
v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); } +SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); } +SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); } +SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); } +SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); } +SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); } +SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return c_v256_min_s32(a, b); } +SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return c_v256_max_s32(a, b); } + +SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); } +SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); } +SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); } +SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); } +SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return c_v256_ziplo_32(a, b); } +SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); } +SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); } +SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); } +SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { + return c_v256_ziplo_128(a, b); +} +SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { + return c_v256_ziphi_128(a, b); +} +SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); } +SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); } +SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); } +SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { + return c_v256_unziplo_8(a, b); +} +SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { + return c_v256_unziphi_8(a, b); +} +SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { + return c_v256_unziplo_16(a, b); +} +SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { + return c_v256_unziphi_16(a, b); +} +SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { + return c_v256_unziplo_32(a, b); +} +SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { + return c_v256_unziphi_32(a, b); +} +SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { + return c_v256_unziplo_64(a, b); +} +SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { + return c_v256_unziphi_64(a, b); +} +SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); } +SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { + return c_v256_unpacklo_u8_s16(a); +} +SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { + return c_v256_unpackhi_u8_s16(a); +} +SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { return c_v256_unpack_s8_s16(a); } +SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { + return c_v256_unpacklo_s8_s16(a); +} +SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { + return c_v256_unpackhi_s8_s16(a); +} +SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { + return c_v256_pack_s32_s16(a, b); +} +SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { + return c_v256_pack_s32_u16(a, b); +} +SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { + return c_v256_pack_s16_u8(a, b); +} +SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { + return c_v256_pack_s16_s8(a, b); +} +SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { + return c_v256_unpack_u16_s32(a); +} +SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { + return c_v256_unpack_s16_s32(a); +} +SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { + return c_v256_unpacklo_u16_s32(a); +} +SIMD_INLINE v256 
v256_unpacklo_s16_s32(v256 a) { + return c_v256_unpacklo_s16_s32(a); +} +SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { + return c_v256_unpackhi_u16_s32(a); +} +SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { + return c_v256_unpackhi_s16_s32(a); +} +SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { + return c_v256_shuffle_8(a, pattern); +} +SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) { + return c_v256_wideshuffle_8(a, b, pattern); +} +SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { + return c_v256_pshuffle_8(a, pattern); +} + +SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return c_v256_cmpgt_s8(a, b); } +SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); } +SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); } +SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { + return c_v256_cmpgt_s16(a, b); +} +SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { + return c_v256_cmplt_s16(a, b); +} +SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); } +SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { return c_v256_cmpeq_32(a, b); } + +SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { + return c_v256_cmpgt_s32(a, b); +} +SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { + return c_v256_cmplt_s32(a, b); +} +SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { + return c_v256_shl_8(a, c); +} +SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { + return c_v256_shr_u8(a, c); +} +SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) { + return c_v256_shr_s8(a, c); +} +SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { + return c_v256_shl_16(a, c); +} +SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { + return c_v256_shr_u16(a, c); +} +SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { + return c_v256_shr_s16(a, c); +} +SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { + return c_v256_shl_32(a, c); +} +SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { + return c_v256_shr_u32(a, c); +} +SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { + return c_v256_shr_s32(a, c); +} +SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) { + return c_v256_shl_64(a, c); +} +SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) { + return c_v256_shr_u64(a, c); +} +SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) { + return c_v256_shr_s64(a, c); +} + +SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) { + return c_v256_shr_n_byte(a, n); +} +SIMD_INLINE v256 v256_shl_n_byte(v256 a, unsigned int n) { + return c_v256_shl_n_byte(a, n); +} +SIMD_INLINE v256 v256_shl_n_8(v256 a, unsigned int n) { + return c_v256_shl_n_8(a, n); +} +SIMD_INLINE v256 v256_shl_n_16(v256 a, unsigned int n) { + return c_v256_shl_n_16(a, n); +} +SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) { + return c_v256_shl_n_32(a, n); +} +SIMD_INLINE v256 v256_shl_n_64(v256 a, unsigned int n) { + return c_v256_shl_n_64(a, n); +} +SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) { + return c_v256_shr_n_u8(a, n); +} +SIMD_INLINE v256 v256_shr_n_u16(v256 a, unsigned int n) { + return c_v256_shr_n_u16(a, n); +} +SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) { + return c_v256_shr_n_u32(a, n); +} +SIMD_INLINE v256 v256_shr_n_u64(v256 a, unsigned int n) { + return c_v256_shr_n_u64(a, n); +} +SIMD_INLINE v256 v256_shr_n_s8(v256 a, unsigned int n) { + return c_v256_shr_n_s8(a, n); +} +SIMD_INLINE v256 v256_shr_n_s16(v256 a, unsigned int n) { + return c_v256_shr_n_s16(a, n); +} +SIMD_INLINE 
v256 v256_shr_n_s32(v256 a, unsigned int n) { + return c_v256_shr_n_s32(a, n); +} +SIMD_INLINE v256 v256_shr_n_s64(v256 a, unsigned int n) { + return c_v256_shr_n_s64(a, n); +} + +SIMD_INLINE v256 v256_shr_n_word(v256 a, unsigned int n) { + return c_v256_shr_n_word(a, n); +} +SIMD_INLINE v256 v256_shl_n_word(v256 a, unsigned int n) { + return c_v256_shl_n_word(a, n); +} + +typedef uint32_t sad256_internal_u16; +SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { + return c_v256_sad_u16_init(); +} +SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, + v256 b) { + return c_v256_sad_u16(s, a, b); +} +SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { + return c_v256_sad_u16_sum(s); +} + +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ diff --git a/libs/libaom/src/aom_dsp/simd/v256_intrinsics_arm.h b/libs/libaom/src/aom_dsp/simd/v256_intrinsics_arm.h new file mode 100644 index 000000000..bd86ea172 --- /dev/null +++ b/libs/libaom/src/aom_dsp/simd/v256_intrinsics_arm.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ + +#include "aom_dsp/simd/v256_intrinsics_v128.h" + +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ diff --git a/libs/libaom/src/aom_dsp/simd/v256_intrinsics_c.h b/libs/libaom/src/aom_dsp/simd/v256_intrinsics_c.h new file mode 100644 index 000000000..8127ee356 --- /dev/null +++ b/libs/libaom/src/aom_dsp/simd/v256_intrinsics_c.h @@ -0,0 +1,968 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/simd/v128_intrinsics_c.h"
+
+typedef union {
+  uint8_t u8[32];
+  uint16_t u16[16];
+  uint32_t u32[8];
+  uint64_t u64[4];
+  int8_t s8[32];
+  int16_t s16[16];
+  int32_t s32[8];
+  int64_t s64[4];
+  c_v64 v64[4];
+  c_v128 v128[2];
+} c_v256;
+
+SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }
+
+SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }
+
+SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; }
+
+SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }
+
+SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }
+
+SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) {
+  c_v256 t;
+  t.v128[1] = hi;
+  t.v128[0] = lo;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c,
+                                  uint64_t d) {
+  c_v256 t;
+  t.u64[3] = a;
+  t.u64[2] = b;
+  t.u64[1] = c;
+  t.u64[0] = d;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) {
+  c_v256 t;
+  t.u64[3] = a.u64;
+  t.u64[2] = b.u64;
+  t.u64[1] = c.u64;
+  t.u64[0] = d.u64;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) {
+  c_v256 t;
+  uint8_t *pp = (uint8_t *)p;
+  uint8_t *q = (uint8_t *)&t;
+  int c;
+  for (c = 0; c < 32; c++) q[c] = pp[c];
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) {
+  if (SIMD_CHECK && (uintptr_t)p & 31) {
+    fprintf(stderr, "Error: unaligned v256 load at %p\n", p);
+    abort();
+  }
+  return c_v256_load_unaligned(p);
+}
+
+SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) {
+  uint8_t *pp = (uint8_t *)p;
+  uint8_t *q = (uint8_t *)&a;
+  int c;
+  for (c = 0; c < 32; c++) pp[c] = q[c];
+}
+
+SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {
+  if (SIMD_CHECK && (uintptr_t)p & 31) {
+    fprintf(stderr, "Error: unaligned v256 store at %p\n", p);
+    abort();
+  }
+  c_v256_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v256 c_v256_zero() {
+  c_v256 t;
+  t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) {
+  c_v256 t;
+  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x);
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) {
+  c_v256 t;
+  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x);
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
+  c_v256 t;
+  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x);
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) {
+  c_v256 t;
+  t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = x;
+  return t;
+}
+
+SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) {
+  return c_v128_dotp_su8(a.v128[1], b.v128[1]) +
+         c_v128_dotp_su8(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
+  return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
+         c_v128_dotp_s16(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) {
+  return c_v128_dotp_s32(a.v128[1], b.v128[1]) +
+         c_v128_dotp_s32(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
+  return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
+}
+
+typedef struct {
+  uint32_t val;
+  int count;
+} c_sad256_internal;
+
+SIMD_INLINE c_sad256_internal c_v256_sad_u8_init(void) {
+  c_sad256_internal t;
+  t.val = t.count = 0;
+  return t;
+}
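+/* Example (illustrative only, not part of upstream libaom): the aligned
+   load/store helpers above abort under SIMD_CHECK unless the pointer is
+   32-byte aligned.  One portable way for a caller to satisfy that is to
+   over-allocate and round the pointer up (memcpy is from <string.h>):
+
+     uint8_t raw[32 + 31];
+     uint8_t *p = (uint8_t *)(((uintptr_t)raw + 31) & ~(uintptr_t)31);
+     memcpy(p, src, 32);                 // src: any 32 source bytes
+     c_v256 v = c_v256_load_aligned(p);  // safe: p is 32-byte aligned
+*/
+
+/* Implementation dependent return value.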
Result must be finalised with + v256_sad_u8_sum(). + The result for more than 16 v256_sad_u8() calls is undefined. */ +SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a, + c_v256 b) { + int c; + for (c = 0; c < 32; c++) + s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; + s.count++; + if (SIMD_CHECK && s.count > 32) { + fprintf(stderr, + "Error: sad called 32 times returning an undefined result\n"); + abort(); + } + return s; +} + +SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s.val; } + +typedef uint32_t c_ssd256_internal; + +SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_u8_sum(). */ +SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a, + c_v256 b) { + int c; + for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); + return s; +} + +SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; } + +SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]), + c_v128_or(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]), + c_v128_xor(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]), + c_v128_and(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]), + c_v128_andn(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]), + c_v128_add_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]), + c_v128_add_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]), + c_v128_sadd_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]), + c_v128_sadd_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]), + c_v128_sadd_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]), + c_v128_add_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]), + c_v128_add_64(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]), + c_v128_sub_64(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) { + c_v256 t; + for (int i = 0; i < 16; i++) + t.u16[i] = (uint16_t)a.u8[i * 2] + (uint16_t)a.u8[i * 2 + 1]; + return t; +} + +SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) { + c_v256 t; + t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1]; + t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3]; + t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5]; + t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7]; + t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9]; + t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11]; + t.s32[6] = (int32_t)a.s16[12] + 
(int32_t)a.s16[13]; + t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15]; + return t; +} + +SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]), + c_v128_sub_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]), + c_v128_ssub_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]), + c_v128_ssub_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]), + c_v128_sub_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]), + c_v128_ssub_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]), + c_v128_ssub_u16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]), + c_v128_sub_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) { + return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) { + return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) { + c_v128 lo_bits = c_v128_mullo_s16(a, b); + c_v128 hi_bits = c_v128_mulhi_s16(a, b); + return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits), + c_v128_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]), + c_v128_mullo_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]), + c_v128_mulhi_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]), + c_v128_mullo_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]), + c_v128_madd_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]), + c_v128_madd_us8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]), + c_v128_avg_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]), + c_v128_rdavg_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]), + c_v128_rdavg_u16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]), + c_v128_avg_u16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]), + c_v128_min_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) { + return 
c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]), + c_v128_max_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]), + c_v128_min_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) { + return ((a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) | + ((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) | + ((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) | + ((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) | + ((a.s8[23] < 0) << 23) | ((a.s8[22] < 0) << 22) | + ((a.s8[21] < 0) << 21) | ((a.s8[20] < 0) << 20) | + ((a.s8[19] < 0) << 19) | ((a.s8[18] < 0) << 18) | + ((a.s8[17] < 0) << 17) | ((a.s8[16] < 0) << 16) | + ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) | + ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) | + ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) | + ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) | + ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) | + ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) | + ((a.s8[0] < 0) << 0); +} + +SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) { + c_v256 t; + for (int i = 0; i < 32; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i]; + return t; +} + +SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]), + c_v128_max_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]), + c_v128_min_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]), + c_v128_max_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]), + c_v128_min_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]), + c_v128_max_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]), + c_v128_ziplo_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]), + c_v128_ziplo_8(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]), + c_v128_ziplo_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]), + c_v128_ziplo_16(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]), + c_v128_ziplo_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]), + c_v128_ziplo_32(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]), + c_v128_ziplo_64(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]), + c_v128_ziplo_64(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) { + 
return c_v256_from_v128(a.v128[0], b.v128[0]); +} + +SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) { + return c_v256_from_v128(a.v128[1], b.v128[1]); +} + +SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) { + return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b)); +} + +SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) { + return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b)); +} + +SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) { + return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b)); +} + +SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) { + c_v256 t; + int i; + if (mode) { + for (i = 0; i < 16; i++) { + t.u8[i] = a.u8[i * 2 + 1]; + t.u8[i + 16] = b.u8[i * 2 + 1]; + } + } else { + for (i = 0; i < 16; i++) { + t.u8[i] = b.u8[i * 2]; + t.u8[i + 16] = a.u8[i * 2]; + } + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1) + : _c_v256_unzip_8(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0) + : _c_v256_unzip_8(b, a, 1); +} + +SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) { + c_v256 t; + int i; + if (mode) { + for (i = 0; i < 8; i++) { + t.u16[i] = a.u16[i * 2 + 1]; + t.u16[i + 8] = b.u16[i * 2 + 1]; + } + } else { + for (i = 0; i < 8; i++) { + t.u16[i] = b.u16[i * 2]; + t.u16[i + 8] = a.u16[i * 2]; + } + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1) + : _c_v256_unzip_16(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0) + : _c_v256_unzip_16(b, a, 1); +} + +SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) { + c_v256 t; + if (mode) { + t.u32[7] = b.u32[7]; + t.u32[6] = b.u32[5]; + t.u32[5] = b.u32[3]; + t.u32[4] = b.u32[1]; + t.u32[3] = a.u32[7]; + t.u32[2] = a.u32[5]; + t.u32[1] = a.u32[3]; + t.u32[0] = a.u32[1]; + } else { + t.u32[7] = a.u32[6]; + t.u32[6] = a.u32[4]; + t.u32[5] = a.u32[2]; + t.u32[4] = a.u32[0]; + t.u32[3] = b.u32[6]; + t.u32[2] = b.u32[4]; + t.u32[1] = b.u32[2]; + t.u32[0] = b.u32[0]; + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1) + : _c_v256_unzip_32(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0) + : _c_v256_unzip_32(b, a, 1); +} + +SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) { + c_v256 t; + if (mode) { + t.u64[3] = b.u64[3]; + t.u64[2] = b.u64[1]; + t.u64[1] = a.u64[3]; + t.u64[0] = a.u64[1]; + } else { + t.u64[3] = a.u64[2]; + t.u64[2] = a.u64[0]; + t.u64[1] = b.u64[2]; + t.u64[0] = b.u64[0]; + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(a, b, 1) + : _c_v256_unzip_64(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? 
_c_v256_unzip_64(b, a, 0)
+                           : _c_v256_unzip_64(b, a, 1);
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]),
+                          c_v128_unpacklo_u8_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]),
+                          c_v128_unpacklo_u8_s16(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) {
+  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a), c_v128_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[0]),
+                          c_v128_unpacklo_s8_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[1]),
+                          c_v128_unpacklo_s8_s16(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]),
+                          c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]),
+                          c_v128_pack_s32_u16(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
+                          c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]),
+                          c_v128_pack_s16_s8(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a),
+                          c_v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) {
+  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a),
+                          c_v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]),
+                          c_v128_unpacklo_u16_s32(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]),
+                          c_v128_unpacklo_s16_s32(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]),
+                          c_v128_unpacklo_u16_s32(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]),
+                          c_v128_unpacklo_s16_s32(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
+  c_v256 t;
+  int c;
+  for (c = 0; c < 32; c++)
+    t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
+                                     : pattern.u8[c] & 31];
+
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) {
+  c_v256 t;
+  int c;
+  for (c = 0; c < 32; c++)
+    t.u8[c] = (pattern.u8[c] < 32
+                   ? b.u8
+                   : a.u8)[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
+                                             : pattern.u8[c] & 31];
+  return t;
+}
+
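+/* Example (illustrative only, not part of upstream libaom): with
+   CONFIG_BIG_ENDIAN unset, c_v256_shuffle_8() fills each output byte
+   from a.u8[pattern.u8[i] & 31], so a descending index pattern reverses
+   the byte order of a vector:
+
+     static c_v256 reverse_bytes(c_v256 a) {
+       c_v256 p;
+       for (int i = 0; i < 32; i++) p.u8[i] = (uint8_t)(31 - i);
+       return c_v256_shuffle_8(a, p);  // result u8[i] == a.u8[31 - i]
+     }
+*/
+
+// Pairwise / dual-lane shuffle: shuffle two 128-bit lanes.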
+SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) { + return c_v256_from_v128( + c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)), + c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern))); +} + +SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]), + c_v128_cmpgt_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]), + c_v128_cmplt_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]), + c_v128_cmpeq_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]), + c_v128_cmpgt_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]), + c_v128_cmplt_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]), + c_v128_cmpeq_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]), + c_v128_cmpgt_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]), + c_v128_cmplt_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]), + c_v128_cmpeq_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) { + if (n == 0) return a; + if (n < 16) + return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n), + c_v128_shr_n_byte(a.v128[0], 16 - n)), + c_v128_shl_n_byte(a.v128[0], n)); + else if (n > 16) + return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16), + c_v128_zero()); + else + return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero()); +} + +SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) { + if (n == 0) return a; + if (n < 16) + return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n), + c_v128_or(c_v128_shr_n_byte(a.v128[0], n), + c_v128_shl_n_byte(a.v128[1], 16 - n))); + else if (n > 16) + return c_v256_from_v128(c_v128_zero(), + c_v128_shr_n_byte(a.v128[1], n - 16)); + else + return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a)); +} + +SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, unsigned int c) { + if (SIMD_CHECK && c > 31) { + fprintf(stderr, "Error: undefined alignment %d\n", c); + abort(); + } + return c ? 
c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c))
+           : b;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shl_8(a.v128[1], c),
+                          c_v128_shl_8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c),
+                          c_v128_shr_u8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c),
+                          c_v128_shr_s8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shl_16(a.v128[1], c),
+                          c_v128_shl_16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c),
+                          c_v128_shr_u16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c),
+                          c_v128_shr_s16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shl_32(a.v128[1], c),
+                          c_v128_shl_32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c),
+                          c_v128_shr_u32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c),
+                          c_v128_shr_s32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) {
+  c_v256 t;
+  if (SIMD_CHECK && n > 63) {
+    fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
+    abort();
+  }
+  t.s64[3] = a.s64[3] >> n;
+  t.s64[2] = a.s64[2] >> n;
+  t.s64[1] = a.s64[1] >> n;
+  t.s64[0] = a.s64[0] >> n;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) {
+  c_v256 t;
+  if (SIMD_CHECK && n > 63) {
+    fprintf(stderr, "Error: undefined u64 shift right %d\n", n);
+    abort();
+  }
+  t.u64[3] = a.u64[3] >> n;
+  t.u64[2] = a.u64[2] >> n;
+  t.u64[1] = a.u64[1] >> n;
+  t.u64[0] = a.u64[0] >> n;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) {
+  c_v256 t;
+  if (SIMD_CHECK && n > 63) {
+    fprintf(stderr, "Error: undefined u64 shift left %d\n", n);
+    abort();
+  }
+  t.u64[3] = a.u64[3] << n;
+  t.u64[2] = a.u64[2] << n;
+  t.u64[1] = a.u64[1] << n;
+  t.u64[0] = a.u64[0] << n;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) {
+  return c_v256_shl_8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, unsigned int n) {
+  return c_v256_shl_16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) {
+  return c_v256_shl_32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) {
+  return c_v256_shl_64(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) {
+  return c_v256_shr_u8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, unsigned int n) {
+  return c_v256_shr_u16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) {
+  return c_v256_shr_u32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) {
+  return c_v256_shr_u64(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) {
+  return c_v256_shr_s8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, unsigned int n) {
+  return c_v256_shr_s16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) {
+  return c_v256_shr_s32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a,
unsigned int n) { + return c_v256_shr_s64(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) { + return c_v256_shr_n_byte(a, 2 * n); +} +SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned int n) { + return c_v256_shl_n_byte(a, 2 * n); +} + +typedef uint32_t c_sad256_internal_u16; + +SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u16_sum(). */ +SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s, + c_v256 a, c_v256 b) { + int c; + for (c = 0; c < 16; c++) + s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c]; + return s; +} + +SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; } + +typedef uint64_t c_ssd256_internal_s16; + +SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_s16_sum(). */ +SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s, + c_v256 a, c_v256 b) { + int c; + for (c = 0; c < 16; c++) + s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) * + (int32_t)(int16_t)(a.s16[c] - b.s16[c]); + return s; +} + +SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; } + +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ diff --git a/libs/libaom/src/aom_dsp/simd/v256_intrinsics_v128.h b/libs/libaom/src/aom_dsp/simd/v256_intrinsics_v128.h new file mode 100644 index 000000000..0d2266754 --- /dev/null +++ b/libs/libaom/src/aom_dsp/simd/v256_intrinsics_v128.h @@ -0,0 +1,876 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ + +#if HAVE_NEON +#include "aom_dsp/simd/v128_intrinsics_arm.h" +#elif HAVE_SSE2 +#include "aom_dsp/simd/v128_intrinsics_x86.h" +#else +#include "aom_dsp/simd/v128_intrinsics.h" +#endif + +#if HAVE_NEON +typedef int64x2x2_t v256; +#else +typedef struct { + v128 val[2]; +} v256; +#endif + +SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); } + +SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); } + +SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); } + +SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; } + +SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; } + +SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) { + v256 t; + t.val[1] = hi; + t.val[0] = lo; + return t; +} + +SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { + return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d)); +} + +SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { + return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d)); +} + +SIMD_INLINE v256 v256_load_unaligned(const void *p) { + return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16), + v128_load_unaligned(p)); +} + +SIMD_INLINE v256 v256_load_aligned(const void *p) { + return v256_from_v128(v128_load_aligned((uint8_t *)p + 16), + v128_load_aligned(p)); +} + +SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { + v128_store_unaligned(p, a.val[0]); + v128_store_unaligned((uint8_t *)p + 16, a.val[1]); +} + +SIMD_INLINE void v256_store_aligned(void *p, v256 a) { + v128_store_aligned(p, a.val[0]); + v128_store_aligned((uint8_t *)p + 16, a.val[1]); +} + +SIMD_INLINE v256 v256_zero(void) { + return v256_from_v128(v128_zero(), v128_zero()); +} + +SIMD_INLINE v256 v256_dup_8(uint8_t x) { + v128 t = v128_dup_8(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE v256 v256_dup_16(uint16_t x) { + v128 t = v128_dup_16(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE v256 v256_dup_32(uint32_t x) { + v128 t = v128_dup_32(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE v256 v256_dup_64(uint64_t x) { + v128 t = v128_dup_64(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { + return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]); +} + +SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { + return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]); +} + +SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { + return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]); +} + +SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { + return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]); +} + +typedef struct { + sad128_internal val[2]; +} sad256_internal; + +SIMD_INLINE sad256_internal v256_sad_u8_init(void) { + sad256_internal t; + t.val[1] = v128_sad_u8_init(); + t.val[0] = v128_sad_u8_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u8_sum(). + The result for more than 16 v256_sad_u8() calls is undefined. 
*/ +SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { + sad256_internal t; + t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]); + t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]); + return t; +} + +SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { + return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]); +} + +typedef struct { + ssd128_internal val[2]; +} ssd256_internal; + +SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) { + ssd256_internal t; + t.val[1] = v128_ssd_u8_init(); + t.val[0] = v128_ssd_u8_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_u8_sum(). */ +SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { + ssd256_internal t; + t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]); + t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]); + return t; +} + +SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { + return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]); +} + +SIMD_INLINE v256 v256_or(v256 a, v256 b) { + return v256_from_v128(v128_or(a.val[1], b.val[1]), + v128_or(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_xor(v256 a, v256 b) { + return v256_from_v128(v128_xor(a.val[1], b.val[1]), + v128_xor(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_and(v256 a, v256 b) { + return v256_from_v128(v128_and(a.val[1], b.val[1]), + v128_and(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_andn(v256 a, v256 b) { + return v256_from_v128(v128_andn(a.val[1], b.val[1]), + v128_andn(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { + return v256_from_v128(v128_add_8(a.val[1], b.val[1]), + v128_add_8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { + return v256_from_v128(v128_add_16(a.val[1], b.val[1]), + v128_add_16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { + return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]), + v128_sadd_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { + return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]), + v128_sadd_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { + return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]), + v128_sadd_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { + return v256_from_v128(v128_add_32(a.val[1], b.val[1]), + v128_add_32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { + return v256_from_v128(v128_add_64(a.val[1], b.val[1]), + v128_add_64(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_padd_u8(v256 a) { + return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0])); +} + +SIMD_INLINE v256 v256_padd_s16(v256 a) { + return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0])); +} + +SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { + return v256_from_v128(v128_sub_8(a.val[1], b.val[1]), + v128_sub_8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { + return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]), + v128_ssub_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { + return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]), + v128_ssub_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { + return v256_from_v128(v128_sub_16(a.val[1], b.val[1]), + v128_sub_16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { + return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]), 
+ v128_ssub_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { + return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]), + v128_ssub_u16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { + return v256_from_v128(v128_sub_32(a.val[1], b.val[1]), + v128_sub_32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { + return v256_from_v128(v128_sub_64(a.val[1], b.val[1]), + v128_sub_64(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_abs_s16(v256 a) { + return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0])); +} + +SIMD_INLINE v256 v256_abs_s8(v256 a) { + return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0])); +} + +SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { + v128 lo_bits = v128_mullo_s16(a, b); + v128 hi_bits = v128_mulhi_s16(a, b); + return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits), + v128_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { + return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]), + v128_mullo_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { + return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]), + v128_mulhi_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { + return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]), + v128_mullo_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { + return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]), + v128_madd_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { + return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]), + v128_madd_us8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { + return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]), + v128_avg_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { + return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]), + v128_rdavg_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { + return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]), + v128_rdavg_u16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { + return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]), + v128_avg_u16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { + return v256_from_v128(v128_min_u8(a.val[1], b.val[1]), + v128_min_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { + return v256_from_v128(v128_max_u8(a.val[1], b.val[1]), + v128_max_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { + return v256_from_v128(v128_min_s8(a.val[1], b.val[1]), + v128_min_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE uint32_t v256_movemask_8(v256 a) { + return (v128_movemask_8(v256_high_v128(a)) << 16) | + v128_movemask_8(v256_low_v128(a)); +} + +SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { + return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]), + v128_blend_8(a.val[0], b.val[0], c.val[0])); +} + +SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { + return v256_from_v128(v128_max_s8(a.val[1], b.val[1]), + v128_max_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { + return v256_from_v128(v128_min_s16(a.val[1], b.val[1]), + v128_min_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { + return v256_from_v128(v128_max_s16(a.val[1], b.val[1]), + v128_max_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 
v256_min_s32(v256 a, v256 b) { + return v256_from_v128(v128_min_s32(a.val[1], b.val[1]), + v128_min_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { + return v256_from_v128(v128_max_s32(a.val[1], b.val[1]), + v128_max_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]), + v128_ziplo_8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]), + v128_ziplo_8(a.val[1], b.val[1])); +} + +SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]), + v128_ziplo_16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]), + v128_ziplo_16(a.val[1], b.val[1])); +} + +SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]), + v128_ziplo_32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]), + v128_ziplo_32(a.val[1], b.val[1])); +} + +SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]), + v128_ziplo_64(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]), + v128_ziplo_64(a.val[1], b.val[1])); +} + +SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { + return v256_from_v128(a.val[0], b.val[0]); +} + +SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { + return v256_from_v128(a.val[1], b.val[1]); +} + +SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b)); +} + +SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b)); +} + +SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b)); +} + +SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]), + v128_unziplo_8(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]), + v128_unziphi_8(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]), + v128_unziplo_16(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]), + v128_unziphi_16(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]), + v128_unziplo_32(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]), + v128_unziphi_32(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { +#if HAVE_SSE2 + return v256_from_v128( + _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]), + _mm_castsi128_pd(a.val[1]), 0)), + _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]), + _mm_castsi128_pd(b.val[1]), 0))); +#else + return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]), + v128_low_v64(b.val[1]), v128_low_v64(b.val[0])); +#endif +} + +SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { +#if HAVE_SSE2 + return v256_from_v128( + 
_mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]), + _mm_castsi128_pd(a.val[1]), 3)), + _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]), + _mm_castsi128_pd(b.val[1]), 3))); +#else + return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]), + v128_high_v64(b.val[1]), v128_high_v64(b.val[0])); +#endif +} + +SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]), + v128_unpacklo_u8_s16(a.val[0])); +} + +SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]), + v128_unpacklo_u8_s16(a.val[1])); +} + +SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]), + v128_unpacklo_s8_s16(a.val[0])); +} + +SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]), + v128_unpacklo_s8_s16(a.val[1])); +} + +SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { + return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]), + v128_pack_s32_s16(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { + return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]), + v128_pack_s32_u16(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { + return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]), + v128_pack_s16_u8(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { + return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]), + v128_pack_s16_s8(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a)); +} + +SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a)); +} + +SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]), + v128_unpacklo_u16_s32(a.val[0])); +} + +SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]), + v128_unpacklo_s16_s32(a.val[0])); +} + +SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]), + v128_unpacklo_u16_s32(a.val[1])); +} + +SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]), + v128_unpacklo_s16_s32(a.val[1])); +} + +SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { + return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]), + v128_cmpgt_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { + return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]), + v128_cmplt_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { + return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]), + v128_cmpeq_8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { + return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]), + v128_cmpgt_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { + return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]), + v128_cmplt_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { + return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]), + 
v128_cmpeq_16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { + return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]), + v128_cmpgt_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { + return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]), + v128_cmplt_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { + return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]), + v128_cmpeq_32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) { +#if HAVE_NEON +#if defined(__aarch64__) + uint8x16x2_t p = { { vreinterpretq_u8_s64(x.val[0]), + vreinterpretq_u8_s64(x.val[1]) } }; + return v256_from_v128( + vreinterpretq_s64_u8(vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))), + vreinterpretq_s64_u8( + vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[0])))); +#else + uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])), + vget_high_u8(vreinterpretq_u8_s64(x.val[0])), + vget_low_u8(vreinterpretq_u8_s64(x.val[1])), + vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } }; + return v256_from_64( + (uint64_t)vreinterpret_s64_u8( + vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))), + (uint64_t)vreinterpret_s64_u8( + vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))), + (uint64_t)vreinterpret_s64_u8( + vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))), + (uint64_t)vreinterpret_s64_u8( + vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[0]))))); +#endif +#else + v128 c16 = v128_dup_8(16); + v128 maskhi = v128_cmplt_s8(pattern.val[1], c16); + v128 masklo = v128_cmplt_s8(pattern.val[0], c16); + return v256_from_v128( + v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)), + v128_shuffle_8(x.val[0], pattern.val[1]), maskhi), + v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)), + v128_shuffle_8(x.val[0], pattern.val[0]), masklo)); +#endif +} + +SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) { +#if HAVE_NEON +#if defined(__aarch64__) + uint8x16x4_t p = { { + vreinterpretq_u8_s64(y.val[0]), + vreinterpretq_u8_s64(y.val[1]), + vreinterpretq_u8_s64(x.val[0]), + vreinterpretq_u8_s64(x.val[1]), + } }; + return v256_from_v128( + vreinterpretq_s64_u8(vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))), + vreinterpretq_s64_u8( + vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[0])))); +#else + v256 c32 = v256_dup_8(32); + v256 p32 = v256_sub_8(pattern, c32); + uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])), + vget_high_u8(vreinterpretq_u8_s64(x.val[0])), + vget_low_u8(vreinterpretq_u8_s64(x.val[1])), + vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } }; + uint8x8x4_t q = { { vget_low_u8(vreinterpretq_u8_s64(y.val[0])), + vget_high_u8(vreinterpretq_u8_s64(y.val[0])), + vget_low_u8(vreinterpretq_u8_s64(y.val[1])), + vget_high_u8(vreinterpretq_u8_s64(y.val[1])) } }; + v256 r1 = + v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8( + p, vreinterpret_u8_s64(vget_high_s64(p32.val[1])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + p, vreinterpret_u8_s64(vget_low_s64(p32.val[1])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + p, vreinterpret_u8_s64(vget_high_s64(p32.val[0])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + p, vreinterpret_u8_s64(vget_low_s64(p32.val[0]))))); + v256 r2 = + v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8( + q, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + q, 
vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + q, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + q, vreinterpret_u8_s64(vget_low_s64(pattern.val[0]))))); + return v256_blend_8(r1, r2, v256_cmplt_s8(pattern, c32)); +#endif +#else + v128 c16 = v128_dup_8(16); + v128 c32 = v128_dup_8(32); + v128 c48 = v128_dup_8(48); + v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]); + v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]); + v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]); + v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]); + v256 r1 = v256_from_v128( + v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)), + v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)), + maskhi48), + v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)), + v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)), + masklo48)); + v256 r2 = v256_from_v128( + v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)), + v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16), + v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)), + v128_shuffle_8(y.val[0], pattern.val[0]), masklo16)); + return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern)); +#endif +} + +SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { + return v256_from_v128( + v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)), + v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern))); +} + +SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) { + return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) { + return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) { + return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) { + return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define v256_shl_n_byte(a, n) \ + ((n) < 16 ? 
v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n), \ + v128_shr_n_byte(a.val[0], 16 - (n))), \ + v128_shl_n_byte(a.val[0], (n))) \ + : v256_from_v128( \ + (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \ + v128_zero())) + +#define v256_shr_n_byte(a, n) \ + (n == 0 \ + ? a \ + : ((n) < 16 \ + ? v256_from_v128(v128_shr_n_byte(a.val[1], n), \ + v128_or(v128_shr_n_byte(a.val[0], n), \ + v128_shl_n_byte(a.val[1], 16 - (n)))) \ + : v256_from_v128( \ + v128_zero(), \ + (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1]))) + +#define v256_align(a, b, c) \ + ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b) + +#define v256_shl_n_8(a, n) \ + v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n)) +#define v256_shl_n_16(a, n) \ + v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n)) +#define v256_shl_n_32(a, n) \ + v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n)) +#define v256_shl_n_64(a, n) \ + v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n)) +#define v256_shr_n_u8(a, n) \ + v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n)) +#define v256_shr_n_u16(a, n) \ + v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n)) +#define v256_shr_n_u32(a, n) \ + v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n)) +#define v256_shr_n_u64(a, n) \ + v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n)) +#define v256_shr_n_s8(a, n) \ + v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n)) +#define v256_shr_n_s16(a, n) \ + v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n)) +#define v256_shr_n_s32(a, n) \ + v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n)) +#define v256_shr_n_s64(a, n) \ + v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n)) + +#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n)) +#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n)) + +typedef struct { + sad128_internal_u16 val[2]; +} sad256_internal_u16; + +SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { + sad256_internal_u16 t; + t.val[1] = v128_sad_u16_init(); + t.val[0] = v128_sad_u16_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u16_sum(). + The result for more than 16 v256_sad_u16() calls is undefined. */ +SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, + v256 b) { + sad256_internal_u16 t; + t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]); + t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]); + return t; +} + +SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { + return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]); +} + +typedef struct { + ssd128_internal_s16 val[2]; +} ssd256_internal_s16; + +SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) { + ssd256_internal_s16 t; + t.val[1] = v128_ssd_s16_init(); + t.val[0] = v128_ssd_s16_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_s16_sum(). 
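+ *
+ * Usage sketch (illustrative only; src, ref and n are placeholder names
+ * for int16_t buffers and an element count, not part of this API):
+ *   ssd256_internal_s16 acc = v256_ssd_s16_init();
+ *   for (int i = 0; i < n; i += 16)
+ *     acc = v256_ssd_s16(acc, v256_load_unaligned(src + i),
+ *                        v256_load_unaligned(ref + i));
+ *   uint64_t ssd = v256_ssd_s16_sum(acc);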
 */
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+                                             v256 b) {
+  ssd256_internal_s16 t;
+  t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]);
+  t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]);
+  return t;
+}
+
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+  return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]);
+}
+
+#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
diff --git a/libs/libaom/src/aom_dsp/simd/v256_intrinsics_x86.h b/libs/libaom/src/aom_dsp/simd/v256_intrinsics_x86.h
new file mode 100644
index 000000000..5983cb80c
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/simd/v256_intrinsics_x86.h
@@ -0,0 +1,750 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
+
+#if !defined(__AVX2__)
+
+#include "aom_dsp/simd/v256_intrinsics_v128.h"
+
+#else
+
+// The __m256i type seems to cause problems for g++'s mangling prior to
+// version 5, but adding -fabi-version=0 fixes this.
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 && \
+    defined(__AVX2__) && defined(__cplusplus)
+#pragma GCC optimize "-fabi-version=0"
+#endif
+
+#include <immintrin.h>
+
+#include "aom_dsp/simd/v128_intrinsics_x86.h"
+
+typedef __m256i v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) {
+  return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0));
+}
+
+SIMD_INLINE v64 v256_low_v64(v256 a) {
+  return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
+}
+
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }
+
+SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); }
+
+SIMD_INLINE v128 v256_high_v128(v256 a) {
+  return _mm256_extracti128_si256(a, 1);
+}
+
+SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
+  // gcc seems to be missing _mm256_set_m128i()
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1);
+}
+
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
+}
+
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+  return _mm256_set_epi64x(a, b, c, d);
+}
+
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+  return _mm256_load_si256((const __m256i *)p);
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+  return _mm256_loadu_si256((const __m256i *)p);
+}
+
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+  _mm256_store_si256((__m256i *)p, a);
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+  _mm256_storeu_si256((__m256i *)p, a);
+}
+
+SIMD_INLINE v256 v256_zero(void) { return _mm256_setzero_si256(); }
+
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); }
+
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); }
+
+SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); }
+
+SIMD_INLINE v256 v256_dup_64(uint64_t x) { return _mm256_set1_epi64x(x); }
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
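+  // 32 independent 8-bit additions; lanes wrap on overflow, in contrast to
+  // the saturating v256_sadd_u8()/v256_sadd_s8() below.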
return _mm256_add_epi8(a, b); } + +SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); } + +SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return _mm256_adds_epu8(a, b); } + +SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return _mm256_adds_epi8(a, b); } + +SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { + return _mm256_adds_epi16(a, b); +} + +SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); } + +SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return _mm256_add_epi64(a, b); } + +SIMD_INLINE v256 v256_padd_u8(v256 a) { + return _mm256_maddubs_epi16(a, _mm256_set1_epi8(1)); +} + +SIMD_INLINE v256 v256_padd_s16(v256 a) { + return _mm256_madd_epi16(a, _mm256_set1_epi16(1)); +} + +SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); } + +SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); } + +SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); } + +SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); } + +SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { + return _mm256_subs_epi16(a, b); +} + +SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { + return _mm256_subs_epu16(a, b); +} + +SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); } + +SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return _mm256_sub_epi64(a, b); } + +SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); } + +SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); } + +// AVX doesn't have the direct intrinsics to zip/unzip 8, 16, 32 bit +// lanes of lower or upper halves of a 256bit vector because the +// unpack/pack intrinsics operate on the 256 bit input vector as 2 +// independent 128 bit vectors. 
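+//
+// The implementations below therefore first reorder each input's 64-bit
+// quadwords with _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0)),
+// turning q3:q2:q1:q0 into q3:q1:q2:q0. After that shuffle, lane 0 holds
+// q0 and q2 and lane 1 holds q1 and q3, so a per-lane unpacklo/unpackhi of
+// the two permuted inputs interleaves their original low (respectively
+// high) 128-bit halves across the full 256-bit register.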
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { + return _mm256_unpacklo_epi8( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { + return _mm256_unpackhi_epi8( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { + return _mm256_unpacklo_epi16( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { + return _mm256_unpackhi_epi16( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { + return _mm256_unpacklo_epi32( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { + return _mm256_unpackhi_epi32( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { + return _mm256_unpacklo_epi64( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { + return _mm256_unpackhi_epi64( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { + return _mm256_permute2x128_si256(a, b, 0x02); +} + +SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { + return _mm256_permute2x128_si256(a, b, 0x13); +} + +SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b)); +} + +SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b)); +} + +SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b)); +} + +SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_packs_epi16(_mm256_srai_epi16(b, 8), _mm256_srai_epi16(a, 8)), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { + return v256_unziphi_8(_mm256_slli_si256(a, 1), _mm256_slli_si256(b, 1)); +} + +SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_packs_epi32(_mm256_srai_epi32(b, 16), _mm256_srai_epi32(a, 16)), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { + return v256_unziphi_16(_mm256_slli_si256(a, 2), _mm256_slli_si256(b, 2)); +} + +SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b), + _mm256_castsi256_ps(a), + _MM_SHUFFLE(3, 1, 3, 1))), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b), + _mm256_castsi256_ps(a), + _MM_SHUFFLE(2, 0, 2, 0))), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b), + _mm256_castsi256_pd(a), 15)), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) 
{ + return _mm256_permute4x64_epi64( + _mm256_castpd_si256( + _mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 0)), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return _mm256_cvtepu8_epi16(a); } + +SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { + return _mm256_unpacklo_epi8( + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_setzero_si256()); +} + +SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { + return _mm256_unpackhi_epi8( + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_setzero_si256()); +} + +SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { + return _mm256_srai_epi16( + _mm256_unpacklo_epi8( + a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), + 8); +} + +SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { + return _mm256_srai_epi16( + _mm256_unpackhi_epi8( + a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), + 8); +} + +SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { + return _mm256_permute4x64_epi64(_mm256_packs_epi32(b, a), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { + return _mm256_permute4x64_epi64(_mm256_packus_epi32(b, a), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { + return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { + return _mm256_permute4x64_epi64(_mm256_packs_epi16(b, a), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { + return _mm256_cvtepu16_epi32(a); +} + +SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { + return _mm256_cvtepi16_epi32(a); +} + +SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { + return _mm256_unpacklo_epi16( + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_setzero_si256()); +} + +SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { + return _mm256_srai_epi32( + _mm256_unpacklo_epi16( + a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), + 16); +} + +SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { + return _mm256_unpackhi_epi16( + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_setzero_si256()); +} + +SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { + return _mm256_srai_epi32( + _mm256_unpackhi_epi16( + a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), + 16); +} + +SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { + return _mm256_blendv_epi8( + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1)), pattern), + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0)), pattern), + _mm256_cmpgt_epi8(v256_dup_8(16), pattern)); +} + +SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) { + v256 c32 = v256_dup_8(32); + v256 p32 = v256_sub_8(pattern, c32); + v256 r1 = _mm256_blendv_epi8( + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1)), p32), + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0)), p32), + _mm256_cmpgt_epi8(v256_dup_8(48), pattern)); + v256 r2 = _mm256_blendv_epi8( + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3)), pattern), + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2)), pattern), + _mm256_cmpgt_epi8(v256_dup_8(16), pattern)); + return _mm256_blendv_epi8(r1, r2, _mm256_cmpgt_epi8(c32, 
pattern)); +} + +SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { + return _mm256_shuffle_epi8(a, pattern); +} + +SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { + v256 t1 = _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b)); + v256 t2 = _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b)); + t1 = _mm256_add_epi32(t1, t2); + v128 t = _mm_add_epi32(_mm256_extracti128_si256(t1, 0), + _mm256_extracti128_si256(t1, 1)); + t = _mm_add_epi32(t, _mm_srli_si128(t, 8)); + t = _mm_add_epi32(t, _mm_srli_si128(t, 4)); + return (int32_t)v128_low_u32(t); +} + +SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { + v256 r = _mm256_madd_epi16(a, b); +#if defined(__x86_64__) + v128 t; + r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)), + _mm256_cvtepi32_epi64(v256_low_v128(r))); + t = v256_low_v128(_mm256_add_epi64( + r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1)))); + return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8))); +#else + v128 l = v256_low_v128(r); + v128 h = v256_high_v128(r); + return (int64_t)_mm_cvtsi128_si32(l) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) + + (int64_t)_mm_cvtsi128_si32(h) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12)); +#endif +} + +SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { + v256 r = _mm256_mullo_epi32(a, b); +#if defined(__x86_64__) + v128 t; + r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)), + _mm256_cvtepi32_epi64(v256_low_v128(r))); + t = v256_low_v128(_mm256_add_epi64( + r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1)))); + return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8))); +#else + v128 l = v256_low_v128(r); + v128 h = v256_high_v128(r); + return (int64_t)_mm_cvtsi128_si32(l) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) + + (int64_t)_mm_cvtsi128_si32(h) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12)); +#endif +} + +SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { + v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256()); + v128 lo = v256_low_v128(t); + v128 hi = v256_high_v128(t); + lo = v128_add_32(lo, hi); + return v64_low_u32(v128_low_v64(lo)) + v128_low_u32(v128_high_v64(lo)); +} + +typedef v256 sad256_internal; + +SIMD_INLINE sad256_internal v256_sad_u8_init(void) { + return _mm256_setzero_si256(); +} + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u8_sum(). + The result for more than 32 v256_sad_u8() calls is undefined. */ +SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { + return _mm256_add_epi64(s, _mm256_sad_epu8(a, b)); +} + +SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { + v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s)); + return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t))); +} + +typedef v256 ssd256_internal; + +SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) { + return _mm256_setzero_si256(); +} + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_u8_sum(). 
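+ *
+ * The accumulate/finalise pattern matches v256_sad_u8() above, e.g. (with
+ * hypothetical uint8_t buffers src and ref, n a multiple of 32):
+ *   ssd256_internal acc = v256_ssd_u8_init();
+ *   for (int i = 0; i < n; i += 32)
+ *     acc = v256_ssd_u8(acc, v256_load_unaligned(src + i),
+ *                       v256_load_unaligned(ref + i));
+ *   uint32_t ssd = v256_ssd_u8_sum(acc);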
*/ +SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { + v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()), + _mm256_unpacklo_epi8(b, _mm256_setzero_si256())); + v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()), + _mm256_unpackhi_epi8(b, _mm256_setzero_si256())); + v256 rl = _mm256_madd_epi16(l, l); + v256 rh = _mm256_madd_epi16(h, h); + v128 c = _mm_cvtsi32_si128(32); + rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8)); + rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4)); + rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8)); + rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4)); + return _mm256_add_epi64( + s, + _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c)); +} + +SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { + v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s)); + return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t))); +} + +SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); } + +SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); } + +SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); } + +SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); } + +SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) { + v128 lo_bits = v128_mullo_s16(a, b); + v128 hi_bits = v128_mulhi_s16(a, b); + return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits), + v128_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { + return _mm256_mullo_epi16(a, b); +} + +SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { + return _mm256_mulhi_epi16(a, b); +} + +SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { + return _mm256_mullo_epi32(a, b); +} + +SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { + return _mm256_madd_epi16(a, b); +} + +SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { + return _mm256_maddubs_epi16(a, b); +} + +SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); } + +SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { + return _mm256_sub_epi8( + _mm256_avg_epu8(a, b), + _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1))); +} + +SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { + return _mm256_sub_epi16( + _mm256_avg_epu16(a, b), + _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1))); +} + +SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); } + +SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); } + +SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); } + +SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); } + +SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return _mm256_movemask_epi8(a); } + +SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { + return _mm256_blendv_epi8(a, b, c); +} + +SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); } + +SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); } + +SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); } + +SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return _mm256_min_epi32(a, b); } + +SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return _mm256_max_epi32(a, b); } + +SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { + return _mm256_cmpgt_epi8(a, b); +} + +SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { + return _mm256_cmpgt_epi8(b, a); +} + +SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) 
{ + return _mm256_cmpeq_epi8(a, b); +} + +SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { + return _mm256_cmpgt_epi16(a, b); +} + +SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { + return _mm256_cmpgt_epi16(b, a); +} + +SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { + return _mm256_cmpeq_epi16(a, b); +} + +SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { + return _mm256_cmpgt_epi32(a, b); +} + +SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { + return _mm256_cmpgt_epi32(b, a); +} + +SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { + return _mm256_cmpeq_epi32(a, b); +} + +SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { + return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)), + _mm256_sll_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { + return _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> c)), + _mm256_srl_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) { + __m128i x = _mm_cvtsi32_si128(c + 8); + return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x), + _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x)); +} + +SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { + return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { + return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { + return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { + return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { + return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { + return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) { + return _mm256_sll_epi64(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) { + return _mm256_srl_epi64(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) { +#if defined(__AVX512VL__) + return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c)); +#else + return v256_from_v128(v128_shr_s64(v256_high_v128(a), c), + v128_shr_s64(v256_low_v128(a), c)); +#endif +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +// _mm256_slli_si256 works on 128 bit lanes and can't be used +#define v256_shl_n_byte(a, n) \ + ((n) < 16 ? v256_from_v128( \ + v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \ + v128_shl_n_byte(v256_low_v128(a), n)) \ + : _mm256_inserti128_si256( \ + _mm256_setzero_si256(), \ + v128_shl_n_byte(v256_low_v128(a), (n)-16), 1)) + +// _mm256_srli_si256 works on 128 bit lanes and can't be used +#define v256_shr_n_byte(a, n) \ + ((n) < 16 \ + ? _mm256_alignr_epi8( \ + _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \ + : ((n) == 16 \ + ? _mm256_permute2x128_si256(_mm256_setzero_si256(), a, 3) \ + : _mm256_inserti128_si256( \ + _mm256_setzero_si256(), \ + v128_align(v256_high_v128(a), v256_high_v128(a), n), 0))) + +// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used +#define v256_align(a, b, c) \ + ((c) ? 
v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b) + +#define v256_shl_n_8(a, c) \ + _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \ + _mm256_slli_epi16(a, c)) +#define v256_shr_n_u8(a, c) \ + _mm256_and_si256(_mm256_set1_epi8(0xff >> (c)), _mm256_srli_epi16(a, c)) +#define v256_shr_n_s8(a, c) \ + _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \ + _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8)) +#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c) +#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c) +#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c) +#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c) +#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c) +#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c) +#define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c) +#define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c) +#define v256_shr_n_s64(a, c) \ + v256_shr_s64((a), (c)) // _mm256_srai_epi64 broken in gcc? +#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n)) +#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n)) + +typedef v256 sad256_internal_u16; + +SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { return v256_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v256_sad_u16_sum(). */ +SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, + v256 b) { +#if defined(__SSE4_1__) + v256 t = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b)); +#else + v256 t = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)), + v256_xor(b, v256_dup_16(32768))); + t = v256_sub_16(v256_or(v256_and(b, t), v256_andn(a, t)), + v256_or(v256_and(a, t), v256_andn(b, t))); +#endif + return v256_add_32( + s, v256_add_32(v256_unpackhi_u16_s32(t), v256_unpacklo_u16_s32(t))); +} + +SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { + v128 t = v128_add_32(v256_high_v128(s), v256_low_v128(s)); + return v128_low_u32(t) + v128_low_u32(v128_shr_n_byte(t, 4)) + + v128_low_u32(v128_shr_n_byte(t, 8)) + + v128_low_u32(v128_shr_n_byte(t, 12)); +} + +typedef v256 ssd256_internal_s16; + +SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) { return v256_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_s16_sum(). */ +SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a, + v256 b) { + v256 d = v256_sub_16(a, b); + d = v256_madd_s16(d, d); + return v256_add_64(s, v256_add_64(_mm256_unpackhi_epi32(d, v256_zero()), + _mm256_unpacklo_epi32(d, v256_zero()))); +} + +SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { + v128 t = v128_add_64(v256_high_v128(s), v256_low_v128(s)); + return v64_u64(v128_low_v64(t)) + v64_u64(v128_high_v64(t)); +} + +#endif + +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ diff --git a/libs/libaom/src/aom_dsp/simd/v64_intrinsics.h b/libs/libaom/src/aom_dsp/simd/v64_intrinsics.h new file mode 100644 index 000000000..7079949cd --- /dev/null +++ b/libs/libaom/src/aom_dsp/simd/v64_intrinsics.h @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "aom_dsp/simd/v64_intrinsics_c.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v64 v64;
+
+SIMD_INLINE uint32_t v64_low_u32(v64 a) { return c_v64_low_u32(a); }
+SIMD_INLINE uint32_t v64_high_u32(v64 a) { return c_v64_high_u32(a); }
+SIMD_INLINE int32_t v64_low_s32(v64 a) { return c_v64_low_s32(a); }
+SIMD_INLINE int32_t v64_high_s32(v64 a) { return c_v64_high_s32(a); }
+SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
+  return c_v64_from_32(x, y);
+}
+SIMD_INLINE v64 v64_from_64(uint64_t x) { return c_v64_from_64(x); }
+SIMD_INLINE uint64_t v64_u64(v64 x) { return c_v64_u64(x); }
+SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
+  return c_v64_from_16(a, b, c, d);
+}
+
+SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
+  return c_u32_load_unaligned(p);
+}
+SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
+  return c_u32_load_aligned(p);
+}
+SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
+  c_u32_store_unaligned(p, a);
+}
+SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
+  c_u32_store_aligned(p, a);
+}
+
+SIMD_INLINE v64 v64_load_unaligned(const void *p) {
+  return c_v64_load_unaligned(p);
+}
+SIMD_INLINE v64 v64_load_aligned(const void *p) {
+  return c_v64_load_aligned(p);
+}
+
+SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
+  c_v64_store_unaligned(p, a);
+}
+SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
+  c_v64_store_aligned(p, a);
+}
+
+SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
+  return c_v64_align(a, b, c);
+}
+
+SIMD_INLINE v64 v64_zero(void) { return c_v64_zero(); }
+SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); }
+SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); }
+SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); }
+
+SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); }
+SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); }
+SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return c_v64_sadd_u8(a, b); }
+SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return c_v64_sadd_s8(a, b); }
+SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); }
+SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); }
+SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); }
+SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return c_v64_ssub_u8(a, b); }
+SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return c_v64_ssub_s8(a, b); }
+SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return c_v64_sub_16(a, b); }
+SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); }
+SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return c_v64_ssub_u16(a, b); }
+SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); }
+SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); }
+SIMD_INLINE v64 v64_abs_s8(v64 a) { return c_v64_abs_s8(a); }
+
+SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return c_v64_ziplo_8(a, b); }
+SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); }
+SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return c_v64_ziplo_16(a, b); }
+SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { return c_v64_ziphi_16(a, b); }
+SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b)
{ return c_v64_ziplo_32(a, b); } +SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { return c_v64_ziphi_32(a, b); } +SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { return c_v64_unziplo_8(a, b); } +SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { return c_v64_unziphi_8(a, b); } +SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { return c_v64_unziplo_16(a, b); } +SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { return c_v64_unziphi_16(a, b); } +SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { return c_v64_unpacklo_u8_s16(a); } +SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { return c_v64_unpackhi_u8_s16(a); } +SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { return c_v64_unpacklo_s8_s16(a); } +SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return c_v64_unpackhi_s8_s16(a); } +SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { + return c_v64_pack_s32_s16(a, b); +} +SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) { + return c_v64_pack_s32_u16(a, b); +} +SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) { + return c_v64_pack_s16_u8(a, b); +} +SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) { + return c_v64_pack_s16_s8(a, b); +} +SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) { + return c_v64_unpacklo_u16_s32(a); +} +SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) { + return c_v64_unpacklo_s16_s32(a); +} +SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) { + return c_v64_unpackhi_u16_s32(a); +} +SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) { + return c_v64_unpackhi_s16_s32(a); +} +SIMD_INLINE v64 v64_shuffle_8(v64 a, v64 pattern) { + return c_v64_shuffle_8(a, pattern); +} + +SIMD_INLINE c_sad64_internal v64_sad_u8_init(void) { + return c_v64_sad_u8_init(); +} +SIMD_INLINE c_sad64_internal v64_sad_u8(c_sad64_internal s, v64 a, v64 b) { + return c_v64_sad_u8(s, a, b); +} +SIMD_INLINE uint32_t v64_sad_u8_sum(c_sad64_internal s) { + return c_v64_sad_u8_sum(s); +} +SIMD_INLINE c_ssd64_internal v64_ssd_u8_init(void) { + return c_v64_ssd_u8_init(); +} +SIMD_INLINE c_ssd64_internal v64_ssd_u8(c_ssd64_internal s, v64 a, v64 b) { + return c_v64_ssd_u8(s, a, b); +} +SIMD_INLINE uint32_t v64_ssd_u8_sum(c_ssd64_internal s) { + return c_v64_ssd_u8_sum(s); +} +SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); } +SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { return c_v64_dotp_s16(a, b); } +SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { return c_v64_hadd_u8(a); } +SIMD_INLINE int64_t v64_hadd_s16(v64 a) { return c_v64_hadd_s16(a); } + +SIMD_INLINE v64 v64_or(v64 a, v64 b) { return c_v64_or(a, b); } +SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return c_v64_xor(a, b); } +SIMD_INLINE v64 v64_and(v64 a, v64 b) { return c_v64_and(a, b); } +SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return c_v64_andn(a, b); } + +SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return c_v64_mullo_s16(a, b); } +SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return c_v64_mulhi_s16(a, b); } +SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { return c_v64_mullo_s32(a, b); } +SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return c_v64_madd_s16(a, b); } +SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); } + +SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); } +SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); } +SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { return c_v64_rdavg_u16(a, b); } +SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); } +SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); } +SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); } +SIMD_INLINE 
v64 v64_min_s8(v64 a, v64 b) { return c_v64_min_s8(a, b); } +SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { return c_v64_max_s8(a, b); } +SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return c_v64_min_s16(a, b); } +SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return c_v64_max_s16(a, b); } + +SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return c_v64_cmpgt_s8(a, b); } +SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return c_v64_cmplt_s8(a, b); } +SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return c_v64_cmpeq_8(a, b); } +SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return c_v64_cmpgt_s16(a, b); } +SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return c_v64_cmplt_s16(a, b); } +SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return c_v64_cmpeq_16(a, b); } + +SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int n) { return c_v64_shl_8(a, n); } +SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int n) { return c_v64_shr_u8(a, n); } +SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int n) { return c_v64_shr_s8(a, n); } +SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int n) { return c_v64_shl_16(a, n); } +SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int n) { + return c_v64_shr_u16(a, n); +} +SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int n) { + return c_v64_shr_s16(a, n); +} +SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int n) { return c_v64_shl_32(a, n); } +SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int n) { + return c_v64_shr_u32(a, n); +} +SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int n) { + return c_v64_shr_s32(a, n); +} +SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int n) { + return c_v64_shr_n_byte(a, n); +} +SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int n) { + return c_v64_shl_n_byte(a, n); +} +SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { + return c_v64_shl_n_8(a, c); +} +SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { + return c_v64_shr_n_u8(a, c); +} +SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { + return c_v64_shr_n_s8(a, c); +} +SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { + return c_v64_shl_n_16(a, c); +} +SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { + return c_v64_shr_n_u16(a, c); +} +SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { + return c_v64_shr_n_s16(a, c); +} +SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { + return c_v64_shl_n_32(a, c); +} +SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { + return c_v64_shr_n_u32(a, c); +} +SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { + return c_v64_shr_n_s32(a, c); +} + +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ diff --git a/libs/libaom/src/aom_dsp/simd/v64_intrinsics_arm.h b/libs/libaom/src/aom_dsp/simd/v64_intrinsics_arm.h new file mode 100644 index 000000000..a4ecdf4b5 --- /dev/null +++ b/libs/libaom/src/aom_dsp/simd/v64_intrinsics_arm.h @@ -0,0 +1,684 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/simd/v64_intrinsics_arm.h"
+#include "aom_ports/arm.h"
+
+#ifdef AOM_INCOMPATIBLE_GCC
+#error Incompatible gcc
+#endif
+
+typedef int64x1_t v64;
+
+SIMD_INLINE uint32_t v64_low_u32(v64 a) {
+  return vget_lane_u32(vreinterpret_u32_s64(a), 0);
+}
+
+SIMD_INLINE uint32_t v64_high_u32(v64 a) {
+  return vget_lane_u32(vreinterpret_u32_s64(a), 1);
+}
+
+SIMD_INLINE int32_t v64_low_s32(v64 a) {
+  return vget_lane_s32(vreinterpret_s32_s64(a), 0);
+}
+
+SIMD_INLINE int32_t v64_high_s32(v64 a) {
+  return vget_lane_s32(vreinterpret_s32_s64(a), 1);
+}
+
+SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
+  return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 |
+                     d);
+}
+
+SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
+  return vcreate_s64((uint64_t)x << 32 | y);
+}
+
+SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }
+
+SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }
+
+SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
+  return *((uint32_t *)p);
+}
+
+SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
+  return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 0);
+}
+
+SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
+  *((uint32_t *)p) = a;
+}
+
+SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
+#if defined(__clang__)
+  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
+                0);
+#elif defined(__CC_ARM)
+  *((__packed uint32_t *)p) = a;
+#elif defined(__GNUC__)
+  struct Unaligned32Struct {
+    uint32_t value;
+    uint8_t dummy;  // To make the size non-power-of-two.
+  } __attribute__((__packed__));
+  ((struct Unaligned32Struct *)p)->value = a;
+#else
+  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
+                0);
+#endif
+}
+
+SIMD_INLINE v64 v64_load_aligned(const void *p) {
+  return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p));
+}
+
+SIMD_INLINE v64 v64_load_unaligned(const void *p) {
+  return v64_load_aligned(p);
+}
+
+SIMD_INLINE void v64_store_aligned(void *p, v64 r) {
+  vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
+}
+
+SIMD_INLINE void v64_store_unaligned(void *p, v64 r) {
+  vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
+}
+
+// The following function requires an immediate.
+// Some compilers will check this if it's optimising, others won't.
+SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+  return c ? vreinterpret_s64_s8(
+                 vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
+           : b;
+#else
+  return c ?
v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8)) + : b; +#endif +} + +SIMD_INLINE v64 v64_zero(void) { return vreinterpret_s64_u8(vdup_n_u8(0)); } + +SIMD_INLINE v64 v64_dup_8(uint8_t x) { + return vreinterpret_s64_u8(vdup_n_u8(x)); +} + +SIMD_INLINE v64 v64_dup_16(uint16_t x) { + return vreinterpret_s64_u16(vdup_n_u16(x)); +} + +SIMD_INLINE v64 v64_dup_32(uint32_t x) { + return vreinterpret_s64_u32(vdup_n_u32(x)); +} + +SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) { + int16x8_t t = + vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)), + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y)))); +#if defined(__aarch64__) + return vaddlvq_s16(t); +#else + int64x2_t r = vpaddlq_s32(vpaddlq_s16(t)); + return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r)); +#endif +} + +SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) { +#if defined(__aarch64__) + return vaddlvq_s32( + vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +#else + int64x2_t r = + vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); + return (int64_t)(vget_high_s64(r) + vget_low_s64(r)); +#endif +} + +SIMD_INLINE uint64_t v64_hadd_u8(v64 x) { +#if defined(__aarch64__) + return vaddlv_u8(vreinterpret_u8_s64(x)); +#else + return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x)))); +#endif +} + +SIMD_INLINE int64_t v64_hadd_s16(v64 a) { + return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a))); +} + +typedef uint16x8_t sad64_internal; + +SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return vdupq_n_u16(0); } + +// Implementation dependent return value. Result must be finalised with +// v64_sad_u8_sum(). +SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { + return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b)); +} + +SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { +#if defined(__aarch64__) + return vaddlvq_u16(s); +#else + uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s)); + return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r)); +#endif +} + +typedef uint32x4_t ssd64_internal; + +SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return vdupq_n_u32(0); } + +// Implementation dependent return value. Result must be finalised with +// v64_ssd_u8_sum(). 
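+// (vabd_u8 computes the per-byte absolute difference, vmull_u8(t, t)
+// squares it into 16 bits, and vpaddlq_u16 widens adjacent pairs into the
+// 32-bit accumulator, so no per-call intermediate can overflow.)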
+SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { + uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b)); + return vaddq_u32(s, vpaddlq_u16(vmull_u8(t, t))); +} + +SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { +#if defined(__aarch64__) + return vaddvq_u32(s); +#else + uint64x2_t t = vpaddlq_u32(s); + return vget_lane_u32( + vreinterpret_u32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0); +#endif +} + +SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); } + +SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); } + +SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); } + +SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); } + +SIMD_INLINE v64 v64_add_8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_sadd_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vqadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_sadd_s8(v64 x, v64 y) { + return vreinterpret_s64_s8( + vqadd_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_add_16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_add_32(v64 x, v64 y) { + return vreinterpret_s64_u32( + vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y))); +} + +SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vqsub_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); +} + +SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) { + return vreinterpret_s64_s8( + vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) { + return vreinterpret_s64_s32( + vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y))); +} + +SIMD_INLINE v64 v64_abs_s16(v64 x) { + return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x))); +} + +SIMD_INLINE v64 v64_abs_s8(v64 x) { + return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x))); +} + +SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) { +#if defined(__aarch64__) + int16x8_t t = vreinterpretq_s16_s32( + vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); + return vget_low_s64(vreinterpretq_s64_s16(vuzp2q_s16(t, t))); +#else + return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32( + vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16))); +#endif +} + +SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) { + return vreinterpret_s64_s32( + vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y))); +} + +SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) { + int32x4_t t = 
vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)); + return vreinterpret_s64_s32( + vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))), + vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t))))); +} + +SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) { + int16x8_t t = + vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(x))), + vmovl_s8(vreinterpret_s8_s64(y))); + return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(t))); +} + +SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_rdavg_u16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); +} + +SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); +} + +SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) { + return vreinterpret_s64_s8( + vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) { + return vreinterpret_s64_s8( + vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u8( + vzip1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); +#else + uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpret_s64_u8(r.val[0]); +#endif +} + +SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u8( + vzip2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); +#else + uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpret_s64_u8(r.val[1]); +#endif +} + +SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u16( + vzip1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); +#else + int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x)); + return vreinterpret_s64_s16(r.val[0]); +#endif +} + +SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u16( + vzip2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); +#else + int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x)); + return vreinterpret_s64_s16(r.val[1]); +#endif +} + +SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u32( + vzip1_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x))); +#else + int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)); + return vreinterpret_s64_s32(r.val[0]); +#endif +} + +SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u32( + vzip2_u32(vreinterpret_u32_s64(y), 
vreinterpret_u32_s64(x))); +#else + int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)); + return vreinterpret_s64_s32(r.val[1]); +#endif +} + +SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { + return vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a)))); +} + +SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { + return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a)))); +} + +SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { + return vreinterpret_s64_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s64(a)))); +} + +SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { + return vreinterpret_s64_s16(vget_high_s16(vmovl_s8(vreinterpret_s8_s64(a)))); +} + +SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) { + return vreinterpret_s64_s16(vqmovn_s32( + vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))); +} + +SIMD_INLINE v64 v64_pack_s32_u16(v64 x, v64 y) { + return vreinterpret_s64_u16(vqmovun_s32( + vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))); +} + +SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) { + return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32( + vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))))); +} + +SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) { + return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32( + vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))))); +} + +SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u8( + vuzp1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); +#else + uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpret_s64_u8(r.val[0]); +#endif +} + +SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u8( + vuzp2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); +#else + uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpret_s64_u8(r.val[1]); +#endif +} + +SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u16( + vuzp1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); +#else + uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); + return vreinterpret_s64_u16(r.val[0]); +#endif +} + +SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u16( + vuzp2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); +#else + uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); + return vreinterpret_s64_u16(r.val[1]); +#endif +} + +SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) { + return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x)))); +} + +SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) { + return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x)))); +} + +SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) { + return vreinterpret_s64_s32( + vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x)))); +} + +SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) { + return vreinterpret_s64_u32( + vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x)))); +} + +SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) { + return vreinterpret_s64_u8( + vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern))); +} + +SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vclt_s8(vreinterpret_s8_s64(x), 
vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) { + return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(c))); +} + +SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) { + return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-c))); +} + +SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) { + return vreinterpret_s64_s8(vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-c))); +} + +SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) { + return vreinterpret_s64_u16(vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(c))); +} + +SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) { + return vreinterpret_s64_u16( + vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(-(int)c))); +} + +SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) { + return vreinterpret_s64_s16( + vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int)c))); +} + +SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) { + return vreinterpret_s64_u32(vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(c))); +} + +SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) { + return vreinterpret_s64_u32( + vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int)c))); +} + +SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) { + return vreinterpret_s64_s32( + vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c))); +} + +// The following functions require an immediate. +// Some compilers will check this during optimisation, others wont. +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) + +SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) { + return vshl_n_s64(a, c * 8); +} + +SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) { + return c ? (v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a; +} + +SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { + return c ? vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c)) : a; +} + +SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { + return c ? vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c)) : a; +} + +SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { + return c ? vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c)) : a; +} + +SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { + return c ? vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c)) : a; +} + +SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { + return c ? vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c)) : a; +} + +SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { + return c ? vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c)) : a; +} + +SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { + return c ? vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c)) : a; +} + +SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { + return c ? vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c)) : a; +} + +SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { + return c ? 
vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c)) : a; +} + +#else + +SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) { + return v64_from_64(v64_u64(a) << c * 8); +} + +SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) { + return v64_from_64(v64_u64(a) >> c * 8); +} + +SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { return v64_shl_8(a, c); } + +SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { return v64_shr_u8(a, c); } + +SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { return v64_shr_s8(a, c); } + +SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { return v64_shl_16(a, c); } + +SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { + return v64_shr_u16(a, c); +} + +SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { + return v64_shr_s16(a, c); +} + +SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { return v64_shl_32(a, c); } + +SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { + return v64_shr_u32(a, c); +} + +SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { + return v64_shr_s32(a, c); +} + +#endif + +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_ diff --git a/libs/libaom/src/aom_dsp/simd/v64_intrinsics_c.h b/libs/libaom/src/aom_dsp/simd/v64_intrinsics_c.h new file mode 100644 index 000000000..b84f243c4 --- /dev/null +++ b/libs/libaom/src/aom_dsp/simd/v64_intrinsics_c.h @@ -0,0 +1,982 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ +#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ + +/* Note: This implements the intrinsics in plain, unoptimised C. + Intended for reference, porting or debugging. 
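+   When SIMD_CHECK is enabled, these reference helpers additionally print an +   error and abort() on contract violations (unaligned loads and stores, +   out-of-range shift amounts and shuffle indices) instead of silently +   diverging from the optimised back ends.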
*/ + +#include <stdio.h> +#include <stdlib.h> + +#include "config/aom_config.h" + +typedef union { + uint8_t u8[8]; + uint16_t u16[4]; + uint32_t u32[2]; + uint64_t u64; + int8_t s8[8]; + int16_t s16[4]; + int32_t s32[2]; + int64_t s64; +} c_v64; + +SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { + return a.u32[!!CONFIG_BIG_ENDIAN]; +} + +SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) { + return a.u32[!CONFIG_BIG_ENDIAN]; +} + +SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { + return a.s32[!!CONFIG_BIG_ENDIAN]; +} + +SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) { + return a.s32[!CONFIG_BIG_ENDIAN]; +} + +SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) { + c_v64 t; + t.u32[!CONFIG_BIG_ENDIAN] = x; + t.u32[!!CONFIG_BIG_ENDIAN] = y; + return t; +} + +SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) { + c_v64 t; + t.u64 = x; + return t; +} + +SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; } + +SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c, + uint16_t d) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + t.u16[0] = a; + t.u16[1] = b; + t.u16[2] = c; + t.u16[3] = d; + } else { + t.u16[3] = a; + t.u16[2] = b; + t.u16[1] = c; + t.u16[0] = d; + } + return t; +} + +SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) { + uint32_t t; + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&t; + int c; + for (c = 0; c < 4; c++) q[c] = pp[c]; + return t; +} + +SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) { + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&a; + int c; + for (c = 0; c < 4; c++) pp[c] = q[c]; +} + +SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) { + if (SIMD_CHECK && (uintptr_t)p & 3) { + fprintf(stderr, "Error: Unaligned u32 load at %p\n", p); + abort(); + } + return c_u32_load_unaligned(p); +} + +SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) { + if (SIMD_CHECK && (uintptr_t)p & 3) { + fprintf(stderr, "Error: Unaligned u32 store at %p\n", p); + abort(); + } + c_u32_store_unaligned(p, a); +} + +SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) { + c_v64 t; + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&t; + int c; + for (c = 0; c < 8; c++) q[c] = pp[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) { + if (SIMD_CHECK && (uintptr_t)p & 7) { + fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p); + abort(); + } + return c_v64_load_unaligned(p); +} + +SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) { + uint8_t *q = (uint8_t *)p; + uint8_t *r = (uint8_t *)&a; + int c; + for (c = 0; c < 8; c++) q[c] = r[c]; +} + +SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) { + if (SIMD_CHECK && (uintptr_t)p & 7) { + fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p); + abort(); + } + c_v64_store_unaligned(p, a); +} + +SIMD_INLINE c_v64 c_v64_zero(void) { + c_v64 t; + t.u64 = 0; + return t; +} + +SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) { + c_v64 t; + t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] = + t.u8[7] = x; + return t; +} + +SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) { + c_v64 t; + t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x; + return t; +} + +SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) { + c_v64 t; + t.u32[0] = t.u32[1] = x; + return t; +} + +SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] + b.u8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] + b.u16[c]); + return t; +} + +SIMD_INLINE c_v64
c_v64_sadd_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) + t.u8[c] = (int16_t)a.u8[c] + (int16_t)b.u8[c] > 255 + ? 255 + : (int16_t)a.u8[c] + (int16_t)b.u8[c] < 0 + ? 0 + : (int16_t)a.u8[c] + (int16_t)b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) + t.s8[c] = (int16_t)a.s8[c] + (int16_t)b.s8[c] > 127 + ? 127 + : (int16_t)a.s8[c] + (int16_t)b.s8[c] < -128 + ? -128 + : (int16_t)a.s8[c] + (int16_t)b.s8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.s16[c] = (int32_t)a.s16[c] + (int32_t)b.s16[c] > 32767 + ? 32767 + : (int32_t)a.s16[c] + (int32_t)b.s16[c] < -32768 + ? -32768 + : (int32_t)a.s16[c] + (int32_t)b.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) { + c_v64 t; + t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]); + t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]); + return t; +} + +SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] - b.u8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] < b.u8[c] ? 0 : a.u8[c] - b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) { + int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c]; + t.s8[c] = d > 127 ? 127 : (d < -128 ? -128 : d); + } + return t; +} + +SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] - b.u16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.s16[c] = (int32_t)a.s16[c] - (int32_t)b.s16[c] < -32768 + ? -32768 + : (int32_t)a.s16[c] - (int32_t)b.s16[c] > 32767 + ? 32767 + : (int32_t)a.s16[c] - (int32_t)b.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.u16[c] = + (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 0 : a.u16[c] - b.u16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) { + c_v64 t; + t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]); + t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]); + return t; +} + +SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.u16[c] = (uint16_t)((int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) + t.u8[c] = (uint8_t)((int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c]); + return t; +} + +SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u8[7] = a.u8[7]; + t.u8[6] = b.u8[7]; + t.u8[5] = a.u8[6]; + t.u8[4] = b.u8[6]; + t.u8[3] = a.u8[5]; + t.u8[2] = b.u8[5]; + t.u8[1] = a.u8[4]; + t.u8[0] = b.u8[4]; + } else { + t.u8[7] = a.u8[3]; + t.u8[6] = b.u8[3]; + t.u8[5] = a.u8[2]; + t.u8[4] = b.u8[2]; + t.u8[3] = a.u8[1]; + t.u8[2] = b.u8[1]; + t.u8[1] = a.u8[0]; + t.u8[0] = b.u8[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? 
_c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1); +} + +SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u16[3] = a.u16[3]; + t.u16[2] = b.u16[3]; + t.u16[1] = a.u16[2]; + t.u16[0] = b.u16[2]; + } else { + t.u16[3] = a.u16[1]; + t.u16[2] = b.u16[1]; + t.u16[1] = a.u16[0]; + t.u16[0] = b.u16[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1); +} + +SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u32[1] = a.u32[1]; + t.u32[0] = b.u32[1]; + } else { + t.u32[1] = a.u32[0]; + t.u32[0] = b.u32[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1); +} + +SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u8[7] = b.u8[7]; + t.u8[6] = b.u8[5]; + t.u8[5] = b.u8[3]; + t.u8[4] = b.u8[1]; + t.u8[3] = a.u8[7]; + t.u8[2] = a.u8[5]; + t.u8[1] = a.u8[3]; + t.u8[0] = a.u8[1]; + } else { + t.u8[7] = a.u8[6]; + t.u8[6] = a.u8[4]; + t.u8[5] = a.u8[2]; + t.u8[4] = a.u8[0]; + t.u8[3] = b.u8[6]; + t.u8[2] = b.u8[4]; + t.u8[1] = b.u8[2]; + t.u8[0] = b.u8[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1); +} + +SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u16[3] = b.u16[3]; + t.u16[2] = b.u16[1]; + t.u16[1] = a.u16[3]; + t.u16[0] = a.u16[1]; + } else { + t.u16[3] = a.u16[2]; + t.u16[2] = a.u16[0]; + t.u16[1] = b.u16[2]; + t.u16[0] = b.u16[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1) + : _c_v64_unzip_16(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? 
_c_v64_unzip_16(b, a, 0) + : _c_v64_unzip_16(b, a, 1); +} + +SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.u8[3 + endian]; + t.s16[2] = (int16_t)a.u8[2 + endian]; + t.s16[1] = (int16_t)a.u8[1 + endian]; + t.s16[0] = (int16_t)a.u8[0 + endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.u8[7 - endian]; + t.s16[2] = (int16_t)a.u8[6 - endian]; + t.s16[1] = (int16_t)a.u8[5 - endian]; + t.s16[0] = (int16_t)a.u8[4 - endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.s8[3 + endian]; + t.s16[2] = (int16_t)a.s8[2 + endian]; + t.s16[1] = (int16_t)a.s8[1 + endian]; + t.s16[0] = (int16_t)a.s8[0 + endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.s8[7 - endian]; + t.s16[2] = (int16_t)a.s8[6 - endian]; + t.s16[1] = (int16_t)a.s8[5 - endian]; + t.s16[0] = (int16_t)a.s8[4 - endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.s16[3] = a.s32[1] > 32767 ? 32767 : a.s32[1] < -32768 ? -32768 : a.s32[1]; + t.s16[2] = a.s32[0] > 32767 ? 32767 : a.s32[0] < -32768 ? -32768 : a.s32[0]; + t.s16[1] = b.s32[1] > 32767 ? 32767 : b.s32[1] < -32768 ? -32768 : b.s32[1]; + t.s16[0] = b.s32[0] > 32767 ? 32767 : b.s32[0] < -32768 ? -32768 : b.s32[0]; + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.u16[3] = a.s32[1] > 65535 ? 65535 : a.s32[1] < 0 ? 0 : a.s32[1]; + t.u16[2] = a.s32[0] > 65535 ? 65535 : a.s32[0] < 0 ? 0 : a.s32[0]; + t.u16[1] = b.s32[1] > 65535 ? 65535 : b.s32[1] < 0 ? 0 : b.s32[1]; + t.u16[0] = b.s32[0] > 65535 ? 65535 : b.s32[0] < 0 ? 0 : b.s32[0]; + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.u8[7] = a.s16[3] > 255 ? 255 : a.s16[3] < 0 ? 0 : a.s16[3]; + t.u8[6] = a.s16[2] > 255 ? 255 : a.s16[2] < 0 ? 0 : a.s16[2]; + t.u8[5] = a.s16[1] > 255 ? 255 : a.s16[1] < 0 ? 0 : a.s16[1]; + t.u8[4] = a.s16[0] > 255 ? 255 : a.s16[0] < 0 ? 0 : a.s16[0]; + t.u8[3] = b.s16[3] > 255 ? 255 : b.s16[3] < 0 ? 0 : b.s16[3]; + t.u8[2] = b.s16[2] > 255 ? 255 : b.s16[2] < 0 ? 0 : b.s16[2]; + t.u8[1] = b.s16[1] > 255 ? 255 : b.s16[1] < 0 ? 0 : b.s16[1]; + t.u8[0] = b.s16[0] > 255 ? 255 : b.s16[0] < 0 ? 0 : b.s16[0]; + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.u8[7] = (uint8_t)(a.s16[3] > 127 ? 127 : a.s16[3] < -128 ? 128 : a.s16[3]); + t.u8[6] = (uint8_t)(a.s16[2] > 127 ? 127 : a.s16[2] < -128 ? 128 : a.s16[2]); + t.u8[5] = (uint8_t)(a.s16[1] > 127 ? 127 : a.s16[1] < -128 ? 128 : a.s16[1]); + t.u8[4] = (uint8_t)(a.s16[0] > 127 ? 127 : a.s16[0] < -128 ? 128 : a.s16[0]); + t.u8[3] = (uint8_t)(b.s16[3] > 127 ? 127 : b.s16[3] < -128 ? 128 : b.s16[3]); + t.u8[2] = (uint8_t)(b.s16[2] > 127 ? 127 : b.s16[2] < -128 ? 128 : b.s16[2]); + t.u8[1] = (uint8_t)(b.s16[1] > 127 ? 127 : b.s16[1] < -128 ? 128 : b.s16[1]); + t.u8[0] = (uint8_t)(b.s16[0] > 127 ? 127 : b.s16[0] < -128 ? 
128 : b.s16[0]); + return t; +} + +SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) { + if (SIMD_CHECK && (pattern.u8[c] & ~7)) { + fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n", + pattern.u8[c], c); + abort(); + } + t.u8[c] = + a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7]; + } + return t; +} + +SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) { + return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] + + a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] + + a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0]; +} + +SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) { + return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) + + (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]); +} + +SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) { + return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] + + a.u8[0]; +} + +SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) { + return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0]; +} + +typedef struct { + uint32_t val; + int count; +} c_sad64_internal; + +SIMD_INLINE c_sad64_internal c_v64_sad_u8_init(void) { + c_sad64_internal t; + t.val = t.count = 0; + return t; +} + +/* Implementation dependent return value. Result must be finalised with + v64_sad_u8_sum(). The result for more than 32 v64_sad_u8() calls is + undefined. */ +SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a, + c_v64 b) { + int c; + for (c = 0; c < 8; c++) + s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; + s.count++; + if (SIMD_CHECK && s.count > 32) { + fprintf(stderr, + "Error: sad called more than 32 times, result is undefined\n"); + abort(); + } + return s; +} + +SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s.val; } + +typedef uint32_t c_ssd64_internal; + +/* Implementation dependent return value. Result must be finalised with + * v64_ssd_u8_sum().
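+   Unlike the SAD accumulator there is no call-count guard here; as a rough +   worked bound (an observation, not a documented limit), each call adds at +   most 8 * 255^2 = 520200, so the uint32_t accumulator cannot overflow +   before roughly 2^32 / 520200 ~= 8256 worst-case calls.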
*/ +SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init(void) { return 0; } + +SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a, + c_v64 b) { + int c; + for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); + return s; +} + +SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; } + +SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 | b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 ^ b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 & b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 & ~b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16; + return t; +} + +SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) { + c_v64 t; + t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]); + t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]); + return t; +} + +SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) { + c_v64 t; + t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1]; + t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3]; + return t; +} + +SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) { + c_v64 t; + int32_t u; + u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1]; + t.s16[0] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; + u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3]; + t.s16[1] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; + u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5]; + t.s16[2] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; + u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7]; + t.s16[3] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; + return t; +} + +SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c]) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? 
a.s16[c] : b.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 7) { + fprintf(stderr, "Error: Undefined u8 shift left %d\n", n); + abort(); + } + for (c = 0; c < 8; c++) t.s8[c] = (int8_t)(a.u8[c] << n); + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 7) { + fprintf(stderr, "Error: Undefined u8 shift right %d\n", n); + abort(); + } + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 7) { + fprintf(stderr, "Error: Undefined s8 shift right %d\n", n); + abort(); + } + for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 15) { + fprintf(stderr, "Error: Undefined u16 shift left %d\n", n); + abort(); + } + for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] << n); + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 15) { + fprintf(stderr, "Error: Undefined u16 shift right %d\n", n); + abort(); + } + for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 15) { + fprintf(stderr, "Error: undefined s16 shift right %d\n", n); + abort(); + } + for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) { + c_v64 t; + if (SIMD_CHECK && n > 31) { + fprintf(stderr, "Error: undefined u32 shift left %d\n", n); + abort(); + } + t.u32[1] = a.u32[1] << n; + t.u32[0] = a.u32[0] << n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) { + c_v64 t; + if (SIMD_CHECK && n > 31) { + fprintf(stderr, "Error: undefined u32 shift right %d\n", n); + abort(); + } + t.u32[1] = a.u32[1] >> n; + t.u32[0] = a.u32[0] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) { + c_v64 t; + if (SIMD_CHECK && n > 31) { + fprintf(stderr, "Error: undefined s32 shift right %d\n", n); + abort(); + } + t.s32[1] = a.s32[1] >> n; + t.s32[0] = a.s32[0] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) { + c_v64 t; + t.u64 = x.u64 >> i * 8; + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) { + c_v64 t; + t.u64 = x.u64 << i * 8; + return t; +} + +SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) { + if 
(SIMD_CHECK && c > 7) { + fprintf(stderr, "Error: undefined alignment %d\n", c); + abort(); + } + return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b; +} + +SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) { + return c_v64_shl_8(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) { + return c_v64_shr_u8(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) { + return c_v64_shr_s8(a, c); +} + +SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) { + return c_v64_shl_16(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) { + return c_v64_shr_u16(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) { + return c_v64_shr_s16(a, c); +} + +SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) { + return c_v64_shl_32(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) { + return c_v64_shr_u32(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) { + return c_v64_shr_s32(a, c); +} + +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ diff --git a/libs/libaom/src/aom_dsp/simd/v64_intrinsics_x86.h b/libs/libaom/src/aom_dsp/simd/v64_intrinsics_x86.h new file mode 100644 index 000000000..1f273fe96 --- /dev/null +++ b/libs/libaom/src/aom_dsp/simd/v64_intrinsics_x86.h @@ -0,0 +1,491 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ +#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ + +#include <emmintrin.h> +#if defined(__SSSE3__) +#include <tmmintrin.h> +#endif +#if defined(__SSE4_1__) +#include <smmintrin.h> +#endif + +typedef __m128i v64; + +SIMD_INLINE uint32_t v64_low_u32(v64 a) { + return (uint32_t)_mm_cvtsi128_si32(a); +} + +SIMD_INLINE uint32_t v64_high_u32(v64 a) { + return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4)); +} + +SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); } + +SIMD_INLINE int32_t v64_high_s32(v64 a) { + return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4)); +} + +SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { + return _mm_packs_epi32( + _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d), + _mm_setzero_si128()); +} + +SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { + return _mm_set_epi32(0, 0, x, y); +} + +SIMD_INLINE v64 v64_from_64(uint64_t x) { +#ifdef __x86_64__ + return _mm_cvtsi64_si128(x); +#else + return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x); +#endif +} + +SIMD_INLINE uint64_t v64_u64(v64 x) { + return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32); +} + +SIMD_INLINE uint32_t u32_load_aligned(const void *p) { + return *((uint32_t *)p); +} + +SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { + return *((uint32_t *)p); +} + +SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { + *((uint32_t *)p) = a; +} + +SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { + *((uint32_t *)p) = a; +} + +SIMD_INLINE v64 v64_load_aligned(const void *p) { + return _mm_loadl_epi64((__m128i *)p); +} + +SIMD_INLINE v64 v64_load_unaligned(const void *p) { + return _mm_loadl_epi64((__m128i *)p); +} + +SIMD_INLINE void v64_store_aligned(void *p, v64 a) { + _mm_storel_epi64((__m128i *)p, a); +} + +SIMD_INLINE void v64_store_unaligned(void *p, v64 a) { + _mm_storel_epi64((__m128i *)p, a); +} + +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) +#define v64_align(a, b, c) \ + ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b) +#else +#define v64_align(a, b, c) \ + ((c) ?
v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \ + : (b)) +#endif + +SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); } + +SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); } + +SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16(x); } + +SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32(x); } + +SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); } + +SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); } + +SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); } + +SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); } + +SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); } + +SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); } + +SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); } + +SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); } + +SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); } + +SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); } + +SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); } + +SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); } + +SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); } + +SIMD_INLINE v64 v64_abs_s16(v64 a) { +#if defined(__SSSE3__) + return _mm_abs_epi16(a); +#else + return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a)); +#endif +} + +SIMD_INLINE v64 v64_abs_s8(v64 a) { +#if defined(__SSSE3__) + return _mm_abs_epi8(a); +#else + v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128()); + return _mm_xor_si128(sign, _mm_add_epi8(a, sign)); +#endif +} + +SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); } + +SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { + return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8); +} + +SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); } + +SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { + return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8); +} + +SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); } + +SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { + return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8); +} + +SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packs_epi32(t, t); +} + +SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) { +#if defined(__SSE4_1__) + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packus_epi32(t, t); +#else + int32_t ah = v64_high_u32(a); + int32_t al = v64_low_u32(a); + int32_t bh = v64_high_u32(b); + int32_t bl = v64_low_u32(b); + return v64_from_16(ah > 65535 ? 65535 : ah < 0 ? 0 : ah, + al > 65535 ? 65535 : al < 0 ? 0 : al, + bh > 65535 ? 65535 : bh < 0 ? 0 : bh, + bl > 65535 ? 65535 : bl < 0 ? 
0 : bl); +#endif +} + +SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) { + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packus_epi16(t, t); +} + +SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) { + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packs_epi16(t, t); +} + +SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0f0d0b0907050301LL)); +#else + return _mm_packus_epi16( + _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)), + _mm_setzero_si128()); +#endif +} + +SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0e0c0a0806040200LL)); +#else + return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1)); +#endif +} + +SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0f0e0b0a07060302LL)); +#else + return _mm_packs_epi32( + _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)), + _mm_setzero_si128()); +#endif +} + +SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0d0c090805040100LL)); +#else + return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2)); +#endif +} + +SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { + return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8); +} + +SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { + return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); +} + +SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { + return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8); +} + +SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) { + return _mm_unpacklo_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) { + return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16); +} + +SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) { + return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8); +} + +SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) { + return _mm_srli_si128( + _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8); +} + +SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(x, pattern); +#else + v64 output; + unsigned char *input = (unsigned char *)&x; + unsigned char *index = (unsigned char *)&pattern; + char *selected = (char *)&output; + int counter; + + for (counter = 0; counter < 8; counter++) { + selected[counter] = input[index[counter]]; + } + + return output; +#endif +} + +SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { + __m128i t = _mm_madd_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), + _mm_unpacklo_epi8(b, _mm_setzero_si128())); + t = _mm_add_epi32(t, _mm_srli_si128(t, 8)); + t = _mm_add_epi32(t, _mm_srli_si128(t, 4)); + return (int32_t)v64_low_u32(t); +} + +SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { + __m128i r = _mm_madd_epi16(a, b); +#if defined(__SSE4_1__) && defined(__x86_64__) + __m128i x = _mm_cvtepi32_epi64(r); + return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8))); +#else + return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + + (int64_t)_mm_cvtsi128_si32(r); +#endif +} + +SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { + return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128())); +} + +SIMD_INLINE int64_t v64_hadd_s16(v64 a) { + return 
v64_dotp_s16(a, v64_dup_16(1)); +} + +typedef v64 sad64_internal; + +SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return _mm_setzero_si128(); } + +/* Implementation dependent return value. Result must be finalised with + v64_sad_u8_sum(). + The result for more than 32 v64_sad_u8() calls is undefined. */ +SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { + return _mm_add_epi64(s, _mm_sad_epu8(a, b)); +} + +SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); } + +typedef v64 ssd64_internal; + +SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return _mm_setzero_si128(); } + +/* Implementation dependent return value. Result must be finalised with + * v64_ssd_u8_sum(). */ +SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { + v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b)); + v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b)); + v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h)); + return _mm_add_epi64( + s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4)))); +} + +SIMD_INLINE uint32_t v64_ssd_u8_sum(sad64_internal s) { return v64_low_u32(s); } + +SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); } + +SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); } + +SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); } + +SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); } + +SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); } + +SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); } + +SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { +#if defined(__SSE4_1__) + return _mm_mullo_epi32(a, b); +#else + return _mm_unpacklo_epi32( + _mm_mul_epu32(a, b), + _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4))); +#endif +} + +SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); } + +SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_maddubs_epi16(a, b); +#else + __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), + _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)); + return _mm_packs_epi32(t, t); +#endif +} + +SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); } + +SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { + return _mm_sub_epi8(_mm_avg_epu8(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1))); +} + +SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { + return _mm_sub_epi16(_mm_avg_epu16(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1))); +} + +SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); } + +SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); } + +SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); } + +SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { +#if defined(__SSE4_1__) + return _mm_min_epi8(a, b); +#else + v64 mask = _mm_cmplt_epi8(a, b); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { +#if defined(__SSE4_1__) + return _mm_max_epi8(a, b); +#else + v64 mask = _mm_cmplt_epi8(b, a); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); } + +SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); } + +SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); } + 
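+// The comparisons here return per-lane masks (all ones where true), so a +// hedged branchless-select sketch, using only helpers from this header, is: +//   v64 m = v64_cmpgt_s16(a, b);                    // lanes where a > b +//   v64 r = v64_or(v64_and(m, a), v64_andn(b, m));  // per-lane max(a, b) +// (Recall v64_andn(x, y) computes x & ~y.) +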
+SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); } + +SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); } + +SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); } + +SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); } + +SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); } + +SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)), + _mm_sll_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)), + _mm_srl_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) { + return _mm_packs_epi16( + _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a); +} + +SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) { + return _mm_sll_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) { + return _mm_srl_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) { + return _mm_sra_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) { + return _mm_sll_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) { + return _mm_srl_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) { + return _mm_sra_epi32(a, _mm_cvtsi32_si128(c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c) +#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8) +#define v64_shl_n_8(a, c) \ + _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c)) +#define v64_shr_n_u8(a, c) \ + _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c)) +#define v64_shr_n_s8(a, c) \ + _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a) +#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c) +#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c) +#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c) +#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c) +#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c) +#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c) + +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ diff --git a/libs/libaom/src/aom_dsp/sse.c b/libs/libaom/src/aom_dsp/sse.c new file mode 100644 index 000000000..16f6b58bd --- /dev/null +++ b/libs/libaom/src/aom_dsp/sse.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* Sum the difference between every corresponding element of the buffers. 
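+   (More precisely, this is the sum of squared differences: each per-pixel +   difference below is squared before being accumulated.)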
*/ + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" + +int64_t aom_sse_c(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = abs(a[x] - b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t aom_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} +#endif diff --git a/libs/libaom/src/aom_dsp/ssim.c b/libs/libaom/src/aom_dsp/ssim.c new file mode 100644 index 000000000..95b88887b --- /dev/null +++ b/libs/libaom/src/aom_dsp/ssim.c @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <math.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/ssim.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" + +void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 16; i++, s += sp, r += rp) { + for (j = 0; j < 16; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} +#endif + +static const int64_t cc1 = 26634; // 64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // 64^2*(.03*255)^2 +static const int64_t cc1_10 = 428658; // 64^2*(.01*1023)^2 +static const int64_t cc2_10 = 3857925; // 64^2*(.03*1023)^2 +static const int64_t cc1_12 = 6868593; // 64^2*(.01*4095)^2 +static const int64_t cc2_12 = 61817334; // 64^2*(.03*4095)^2 + +static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, + uint32_t sum_sq_r, uint32_t sum_sxr, int count, + uint32_t bd) {
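+  // Worked numbers for the 8-bit constants above (a reading aid): +  // cc1 = 4096 * (.01 * 255)^2 = 4096 * 6.5025 ~= 26634 and +  // cc2 = 4096 * (.03 * 255)^2 = 4096 * 58.5225 ~= 239708, where +  // 4096 = 64^2 = 1 << 12. The >> 12 below then leaves +  // c1 = count^2 * (.01 * max)^2, i.e. the constant for the sum (rather +  // than mean) form of SSIM over a count-pixel window.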
int64_t ssim_n, ssim_d; + int64_t c1, c2; + if (bd == 8) { + // scale the constants by number of pixels + c1 = (cc1 * count * count) >> 12; + c2 = (cc2 * count * count) >> 12; + } else if (bd == 10) { + c1 = (cc1_10 * count * count) >> 12; + c2 = (cc2_10 * count * count) >> 12; + } else if (bd == 12) { + c1 = (cc1_12 * count * count) >> 12; + c2 = (cc2_12 * count * count) >> 12; + } else { + c1 = c2 = 0; + assert(0); + } + + ssim_n = (2 * sum_s * sum_r + c1) * + ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); + + ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * + ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + + (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); + + return ssim_n * 1.0 / ssim_d; +} + +static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + aom_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); +} + +static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t bd, uint32_t shift) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), + sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); +} + +// We are using a 8x8 moving window with starting location of each 8x8 window +// on the 4x4 pixel grid. Such arrangement allows the windows to overlap +// block boundaries to penalize blocking artifacts. +static double aom_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height, uint32_t bd, uint32_t shift) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, + shift); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +double aom_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight) { + double abc[3]; + for (int i = 0; i < 3; ++i) { + const int is_uv = i > 0; + abc[i] = aom_ssim2(source->buffers[i], dest->buffers[i], + source->strides[is_uv], dest->strides[is_uv], + source->crop_widths[is_uv], source->crop_heights[is_uv]); + } + + *weight = 1; + return abc[0] * .8 + .1 * (abc[1] + abc[2]); +} + +// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity +// +// Re working out the math -> +// +// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) / +// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2)) +// +// mean(x) = sum(x) / n +// +// cov(x,y) = 
+//
+// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n)
+//
+// ssim(x,y) =
+//   (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) /
+//   (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) *
+//    ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+
+//     (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2))
+//
+// factoring out n*n
+//
+// ssim(x,y) =
+//   (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) /
+//   (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) *
+//    (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2))
+//
+// Replace c1 with n*n * c1 for the final step that leads to this code:
+// The final step scales by 12 bits so we don't lose precision in the
+// constants.
+
+static double ssimv_similarity(const Ssimv *sv, int64_t n) {
+  // Scale the constants by number of pixels.
+  const int64_t c1 = (cc1 * n * n) >> 12;
+  const int64_t c2 = (cc2 * n * n) >> 12;
+
+  const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) /
+                   (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
+
+  // Since these variables are unsigned sums, convert to double so
+  // math is done in double arithmetic.
+  const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+                   (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+                    n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+  return l * v;
+}
+
+// The first term of the ssim metric is a luminance factor.
+//
+// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1)
+//
+// This luminance factor is super sensitive to the dark side of luminance
+// values and completely insensitive on the white side. Check out two sets,
+// (1,3) and (250,252): the term gives 2*1*3/(1+9) = .60 and
+// 2*250*252/(250^2+252^2) => .99999997.
+//
+// As a result, this tweaked version of the calculation takes the luminance
+// as a percentage off from the peak possible:
+//
+//   255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count
+//
+static double ssimv_similarity2(const Ssimv *sv, int64_t n) {
+  // Scale the constants by number of pixels.
+  const int64_t c1 = (cc1 * n * n) >> 12;
+  const int64_t c2 = (cc2 * n * n) >> 12;
+
+  const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n;
+  const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1);
+
+  // Since these variables are unsigned sums, convert to double so
+  // math is done in double arithmetic.
+  const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+                   (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+                    n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+  return l * v;
+}
+static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2,
+                        int img2_pitch, Ssimv *sv) {
+  aom_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r,
+                     &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr);
+}
+
+double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+                            int img2_pitch, int width, int height, Ssimv *sv2,
+                            Metrics *m, int do_inconsistency) {
+  double dssim_total = 0;
+  double ssim_total = 0;
+  double ssim2_total = 0;
+  double inconsistency_total = 0;
+  int i, j;
+  int c = 0;
+  double norm;
+  double old_ssim_total = 0;
+  aom_clear_system_state();
+  // We can sample points as frequently as we like; start with 1 per 4x4.
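+  // Editor's note (sketch of the sampling): unlike aom_ssim2() above, this
+  // loop visits every 4x4 offset, including windows that would cross the
+  // right/bottom edge; those edge windows keep an all-zero Ssimv. The total
+  // window count is c = (width / 4) * (height / 4); e.g. a 64x64 plane
+  // yields 16 * 16 = 256 samples, which is also what norm below divides by.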
+  for (i = 0; i < height;
+       i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+    for (j = 0; j < width; j += 4, ++c) {
+      Ssimv sv = { 0, 0, 0, 0, 0, 0 };
+      double ssim;
+      double ssim2;
+      double dssim;
+      uint32_t var_new;
+      uint32_t var_old;
+      uint32_t mean_new;
+      uint32_t mean_old;
+      double ssim_new;
+      double ssim_old;
+
+      // Not sure there's a great way to handle the edge pixels
+      // in ssim when using a window. Seems biased against edge pixels
+      // however you handle this. This uses only samples that are
+      // fully in the frame.
+      if (j + 8 <= width && i + 8 <= height) {
+        ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv);
+      }
+
+      ssim = ssimv_similarity(&sv, 64);
+      ssim2 = ssimv_similarity2(&sv, 64);
+
+      sv.ssim = ssim2;
+
+      // dssim is calculated to use as an actual error metric and
+      // is scaled up to the same range as sum square error.
+      // Since we are subsampling every 16th point, maybe this should be *16?
+      dssim = 255 * 255 * (1 - ssim2) / 2;
+
+      // Here I introduce a new error metric: consistency-weighted
+      // SSIM-inconsistency. This metric isolates frames where the
+      // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much
+      // sharper or blurrier than the others. Higher values indicate a
+      // temporally inconsistent SSIM. There are two ideas at work:
+      //
+      // 1) 'SSIM-inconsistency': the total inconsistency value
+      // reflects how much SSIM values are changing between this
+      // source / reference frame pair and the previous pair.
+      //
+      // 2) 'consistency-weighted': weights de-emphasize areas in the
+      // frame where the scene content has changed. Changes in scene
+      // content are detected via changes in local variance and local
+      // mean.
+      //
+      // Thus the overall measure reflects how inconsistent the SSIM
+      // values are, over consistent regions of the frame.
+      //
+      // The metric has three terms:
+      //
+      // term 1 -> uses change in scene Variance to weight error score
+      //   2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2)
+      //   larger changes from one frame to the next mean we care
+      //   less about consistency.
+      //
+      // term 2 -> uses change in local scene luminance to weight error
+      //   2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2)
+      //   larger changes from one frame to the next mean we care
+      //   less about consistency.
+      //
+      // term 3 -> measures inconsistency in ssim scores between frames
+      //   1 - (2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+ssim(Fi-1)^2)).
+      //
+      // This term compares the ssim score for the same location in 2
+      // subsequent frames.
+      var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64;
+      var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64;
+      mean_new = sv.sum_s;
+      mean_old = sv2[c].sum_s;
+      ssim_new = sv.ssim;
+      ssim_old = sv2[c].ssim;
+
+      if (do_inconsistency) {
+        // We do the metric once for every 4x4 block in the image. Since
+        // we are scaling the error to SSE for use in a psnr calculation
+        // 1.0 = 4x4x255x255, the worst error we can possibly have.
+        static const double kScaling = 4. * 4 * 255 * 255;
+
+        // The constants have to be non-zero to avoid potential divide by
+        // zero issues; other than that, they provide a kind of weighting
+        // between the terms. No testing of what the right terms should be
+        // has been done.
+        static const double c1 = 1, c2 = 1, c3 = 1;
+
+        // This measures how much consistent variance is in two consecutive
+        // source frames. 1.0 means they have exactly the same variance.
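+        // Editor's note (numeric check, using the shown formula with c1 = 1):
+        // var_old == var_new == v gives (2*v*v + 1) / (v*v + v*v + 1) = 1.0,
+        // while var_old = 0, var_new = 100 gives 1 / 10001, i.e. nearly 0.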
+        const double variance_term =
+            (2.0 * var_old * var_new + c1) /
+            (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1);
+
+        // This measures how consistent the local means are between two
+        // consecutive frames. 1.0 means they have exactly the same mean.
+        const double mean_term =
+            (2.0 * mean_old * mean_new + c2) /
+            (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2);
+
+        // This measures how consistent the ssims of two
+        // consecutive frames are. 1.0 means they are exactly the same.
+        double ssim_term =
+            pow((2.0 * ssim_old * ssim_new + c3) /
+                    (ssim_old * ssim_old + ssim_new * ssim_new + c3),
+                5);
+
+        double this_inconsistency;
+
+        // Floating point math sometimes makes this > 1 by a tiny bit.
+        // We want the metric to scale between 0 and 1.0 so we can convert
+        // it to an snr scaled value.
+        if (ssim_term > 1) ssim_term = 1;
+
+        // This converts the consistency metric to an inconsistency metric
+        // (so we can scale it like psnr) to something like sum square error.
+        // The reason for the variance and mean terms is the assumption that
+        // if there are big changes in the source we should penalize
+        // inconsistency in ssim scores a bit less, as it will be less
+        // visible to the user.
+        this_inconsistency = (1 - ssim_term) * variance_term * mean_term;
+
+        this_inconsistency *= kScaling;
+        inconsistency_total += this_inconsistency;
+      }
+      sv2[c] = sv;
+      ssim_total += ssim;
+      ssim2_total += ssim2;
+      dssim_total += dssim;
+
+      old_ssim_total += ssim_old;
+    }
+    old_ssim_total += 0;
+  }
+
+  norm = 1. / (width / 4) / (height / 4);
+  ssim_total *= norm;
+  ssim2_total *= norm;
+  m->ssim2 = ssim2_total;
+  m->ssim = ssim_total;
+  if (old_ssim_total == 0) inconsistency_total = 0;
+
+  m->ssimc = inconsistency_total;
+
+  m->dssim = dssim_total;
+  return inconsistency_total;
+}
+
+double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                            const YV12_BUFFER_CONFIG *dest, double *weight,
+                            uint32_t bd, uint32_t in_bd) {
+  assert(bd >= in_bd);
+  const uint32_t shift = bd - in_bd;
+
+  double abc[3];
+  for (int i = 0; i < 3; ++i) {
+    const int is_uv = i > 0;
+    abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i],
+                              source->strides[is_uv], dest->strides[is_uv],
+                              source->crop_widths[is_uv],
+                              source->crop_heights[is_uv], in_bd, shift);
+  }
+
+  *weight = 1;
+  return abc[0] * .8 + .1 * (abc[1] + abc[2]);
+}
diff --git a/libs/libaom/src/aom_dsp/ssim.h b/libs/libaom/src/aom_dsp/ssim.h
new file mode 100644
index 000000000..55038f4c2
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/ssim.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SSIM_H_
+#define AOM_AOM_DSP_SSIM_H_
+
+#define MAX_SSIM_DB 100.0
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+
+// metrics used for calculating ssim, ssim2, dssim, and ssimc
+typedef struct {
+  // source sum (over 8x8 region)
+  uint32_t sum_s;
+
+  // reference sum (over 8x8 region)
+  uint32_t sum_r;
+
+  // source sum squared (over 8x8 region)
+  uint32_t sum_sq_s;
+
+  // reference sum squared (over 8x8 region)
+  uint32_t sum_sq_r;
+
+  // sum of source times reference (over 8x8 region)
+  uint32_t sum_sxr;
+
+  // calculated ssim score between source and reference
+  double ssim;
+} Ssimv;
+
+// metrics collected on a frame basis
+typedef struct {
+  // ssim consistency error metric (see code for explanation)
+  double ssimc;
+
+  // standard ssim
+  double ssim;
+
+  // revised ssim (see code for explanation)
+  double ssim2;
+
+  // ssim restated as an error metric like sse
+  double dssim;
+
+  // dssim converted to decibels
+  double dssimd;
+
+  // ssimc converted to decibels
+  double ssimcd;
+} Metrics;
+
+double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+                            int img2_pitch, int width, int height, Ssimv *sv2,
+                            Metrics *m, int do_inconsistency);
+
+double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                     const YV12_BUFFER_CONFIG *dest, double *weight);
+
+double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *dest, double *ssim_y,
+                         double *ssim_u, double *ssim_v, uint32_t bd,
+                         uint32_t in_bd);
+
+double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                            const YV12_BUFFER_CONFIG *dest, double *weight,
+                            uint32_t bd, uint32_t in_bd);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_SSIM_H_
diff --git a/libs/libaom/src/aom_dsp/subtract.c b/libs/libaom/src/aom_dsp/subtract.c
new file mode 100644
index 000000000..4f4e35597
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/subtract.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+void aom_subtract_block_c(int rows, int cols, int16_t *diff,
+                          ptrdiff_t diff_stride, const uint8_t *src,
+                          ptrdiff_t src_stride, const uint8_t *pred,
+                          ptrdiff_t pred_stride) {
+  int r, c;
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];
+
+    diff += diff_stride;
+    pred += pred_stride;
+    src += src_stride;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
+                                 ptrdiff_t diff_stride, const uint8_t *src8,
+                                 ptrdiff_t src_stride, const uint8_t *pred8,
+                                 ptrdiff_t pred_stride, int bd) {
+  int r, c;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  (void)bd;
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++) {
+      diff[c] = src[c] - pred[c];
+    }
+
+    diff += diff_stride;
+    pred += pred_stride;
+    src += src_stride;
+  }
+}
+#endif
diff --git a/libs/libaom/src/aom_dsp/sum_squares.c b/libs/libaom/src/aom_dsp/sum_squares.c
new file mode 100644
index 000000000..d739a6083
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/sum_squares.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int width,
+                                  int height) {
+  int r, c;
+  uint64_t ss = 0;
+
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++) {
+      const int16_t v = src[c];
+      ss += v * v;
+    }
+    src += src_stride;
+  }
+
+  return ss;
+}
+
+uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t n) {
+  uint64_t ss = 0;
+  do {
+    const int16_t v = *src++;
+    ss += v * v;
+  } while (--n);
+
+  return ss;
+}
+
+uint64_t aom_var_2d_u8_c(uint8_t *src, int src_stride, int width, int height) {
+  int r, c;
+  uint64_t ss = 0, s = 0;
+
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++) {
+      const uint8_t v = src[c];
+      ss += v * v;
+      s += v;
+    }
+    src += src_stride;
+  }
+
+  return (ss - s * s / (width * height));
+}
+
+uint64_t aom_var_2d_u16_c(uint8_t *src, int src_stride, int width, int height) {
+  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+  int r, c;
+  uint64_t ss = 0, s = 0;
+
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++) {
+      const uint16_t v = srcp[c];
+      ss += v * v;
+      s += v;
+    }
+    srcp += src_stride;
+  }
+
+  return (ss - s * s / (width * height));
+}
diff --git a/libs/libaom/src/aom_dsp/txfm_common.h b/libs/libaom/src/aom_dsp/txfm_common.h
new file mode 100644
index 000000000..f13d69092
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/txfm_common.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_TXFM_COMMON_H_ +#define AOM_AOM_DSP_TXFM_COMMON_H_ + +#include "aom_dsp/aom_dsp_common.h" +#include "av1/common/enums.h" + +// Constants and Macros used by all idct/dct functions +#define DCT_CONST_BITS 14 +#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) + +#define UNIT_QUANT_SHIFT 2 +#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT) + +typedef struct txfm_param { + // for both forward and inverse transforms + TX_TYPE tx_type; + TX_SIZE tx_size; + int lossless; + int bd; + // are the pixel buffers octets or shorts? This should collapse to + // bd==8 implies !is_hbd, but that's not certain right now. + int is_hbd; + TxSetType tx_set_type; + // for inverse transforms only + int eob; +} TxfmParam; + +// Constants: +// for (int i = 1; i< 32; ++i) +// printf("static const int cospi_%d_64 = %.0f;\n", i, +// round(16384 * cos(i*PI/64))); +// Note: sin(k*Pi/64) = cos((32-k)*Pi/64) +static const tran_high_t cospi_1_64 = 16364; +static const tran_high_t cospi_2_64 = 16305; +static const tran_high_t cospi_3_64 = 16207; +static const tran_high_t cospi_4_64 = 16069; +static const tran_high_t cospi_5_64 = 15893; +static const tran_high_t cospi_6_64 = 15679; +static const tran_high_t cospi_7_64 = 15426; +static const tran_high_t cospi_8_64 = 15137; +static const tran_high_t cospi_9_64 = 14811; +static const tran_high_t cospi_10_64 = 14449; +static const tran_high_t cospi_11_64 = 14053; +static const tran_high_t cospi_12_64 = 13623; +static const tran_high_t cospi_13_64 = 13160; +static const tran_high_t cospi_14_64 = 12665; +static const tran_high_t cospi_15_64 = 12140; +static const tran_high_t cospi_16_64 = 11585; +static const tran_high_t cospi_17_64 = 11003; +static const tran_high_t cospi_18_64 = 10394; +static const tran_high_t cospi_19_64 = 9760; +static const tran_high_t cospi_20_64 = 9102; +static const tran_high_t cospi_21_64 = 8423; +static const tran_high_t cospi_22_64 = 7723; +static const tran_high_t cospi_23_64 = 7005; +static const tran_high_t cospi_24_64 = 6270; +static const tran_high_t cospi_25_64 = 5520; +static const tran_high_t cospi_26_64 = 4756; +static const tran_high_t cospi_27_64 = 3981; +static const tran_high_t cospi_28_64 = 3196; +static const tran_high_t cospi_29_64 = 2404; +static const tran_high_t cospi_30_64 = 1606; +static const tran_high_t cospi_31_64 = 804; + +// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 +static const tran_high_t sinpi_1_9 = 5283; +static const tran_high_t sinpi_2_9 = 9929; +static const tran_high_t sinpi_3_9 = 13377; +static const tran_high_t sinpi_4_9 = 15212; + +// 16384 * sqrt(2) +static const tran_high_t Sqrt2 = 23170; +static const tran_high_t InvSqrt2 = 11585; + +static INLINE tran_high_t fdct_round_shift(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + return rv; +} + +#endif // AOM_AOM_DSP_TXFM_COMMON_H_ diff --git a/libs/libaom/src/aom_dsp/variance.c b/libs/libaom/src/aom_dsp/variance.c new file mode 100644 index 000000000..695f12a52 --- /dev/null +++ b/libs/libaom/src/aom_dsp/variance.c @@ -0,0 +1,1483 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/variance.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/filter.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/reconinter_enc.h"
+
+uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
+                            int b_stride) {
+  int distortion = 0;
+  int r, c;
+
+  for (r = 0; r < 4; ++r) {
+    for (c = 0; c < 4; ++c) {
+      int diff = a[c] - b[c];
+      distortion += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  return distortion;
+}
+
+uint32_t aom_get_mb_ss_c(const int16_t *a) {
+  unsigned int i, sum = 0;
+
+  for (i = 0; i < 256; ++i) {
+    sum += a[i] * a[i];
+  }
+
+  return sum;
+}
+
+static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
+                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
+  int i, j;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
+                          int b_stride, int w, int h) {
+  uint32_t sse;
+  int sum;
+  variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
+  return sse;
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to
+// implement the first pass of a 2-D separable filter.
+//
+// Produces int16_t output to retain precision for the next pass. Two filter
+// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
+// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
+// It defines the offset required to move from one input to the next.
+void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
+                                             unsigned int src_pixels_per_line,
+                                             unsigned int pixel_step,
+                                             unsigned int output_height,
+                                             unsigned int output_width,
+                                             const uint8_t *filter) {
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      b[j] = ROUND_POWER_OF_TWO(
+          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
+
+      ++a;
+    }
+
+    a += src_pixels_per_line - output_width;
+    b += output_width;
+  }
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to
+// implement the second pass of a 2-D separable filter.
+//
+// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
+// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride). It defines the offset required to move from one input
+// to the next. Output is 8-bit.
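+// Editor's note (worked example, assuming the library's 2-tap kernels where
+// the taps sum to 128 and FILTER_BITS == 7, e.g. bilinear_filters_2t[4] =
+// { 64, 64 }): for a[0] = 100 and a[pixel_step] = 200,
+//   ROUND_POWER_OF_TWO(100 * 64 + 200 * 64, 7) = (19200 + 64) >> 7 = 150,
+// i.e. the half-pel sample rounds to the midpoint of its two inputs.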
+void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
+                                              unsigned int src_pixels_per_line,
+                                              unsigned int pixel_step,
+                                              unsigned int output_height,
+                                              unsigned int output_width,
+                                              const uint8_t *filter) {
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      b[j] = ROUND_POWER_OF_TWO(
+          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
+      ++a;
+    }
+
+    a += src_pixels_per_line - output_width;
+    b += output_width;
+  }
+}
+
+#define VAR(W, H) \
+  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                     const uint8_t *b, int b_stride, \
+                                     uint32_t *sse) { \
+    int sum; \
+    variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+  }
+
+#define SUBPIX_VAR(W, H) \
+  uint32_t aom_sub_pixel_variance##W##x##H##_c( \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+      const uint8_t *b, int b_stride, uint32_t *sse) { \
+    uint16_t fdata3[(H + 1) * W]; \
+    uint8_t temp2[H * W]; \
+  \
+    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+                                            bilinear_filters_2t[xoffset]); \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+                                             bilinear_filters_2t[yoffset]); \
+  \
+    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
+  }
+
+#define SUBPIX_AVG_VAR(W, H) \
+  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+      const uint8_t *b, int b_stride, uint32_t *sse, \
+      const uint8_t *second_pred) { \
+    uint16_t fdata3[(H + 1) * W]; \
+    uint8_t temp2[H * W]; \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+  \
+    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+                                            bilinear_filters_2t[xoffset]); \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+                                             bilinear_filters_2t[yoffset]); \
+  \
+    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
+  \
+    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
+  } \
+  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+      const uint8_t *b, int b_stride, uint32_t *sse, \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+    uint16_t fdata3[(H + 1) * W]; \
+    uint8_t temp2[H * W]; \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+  \
+    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+                                            bilinear_filters_2t[xoffset]); \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+                                             bilinear_filters_2t[yoffset]); \
+  \
+    aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
+  \
+    return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
+  }
+
+/* Identical to the variance call except it takes an additional parameter, sum,
+ * and returns that value using pass-by-reference instead of returning
+ * sse - sum^2 / w*h
+ */
+#define GET_VAR(W, H) \
+  void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
+                               const uint8_t *b, int b_stride, uint32_t *sse, \
+                               int *sum) { \
+    variance(a, a_stride, b, b_stride, W, H, sse, sum); \
+  }
+
+/* Identical to the variance call except it does not calculate the
+ * sse - sum^2 / w*h and returns sse in addition to modifying the passed-in
+ * variable.
+ */ +#define MSE(W, H) \ + uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse; \ + } + +/* All three forms of the variance are available in the same sizes. */ +#define VARIANCES(W, H) \ + VAR(W, H) \ + SUBPIX_VAR(W, H) \ + SUBPIX_AVG_VAR(W, H) + +VARIANCES(128, 128) +VARIANCES(128, 64) +VARIANCES(64, 128) +VARIANCES(64, 64) +VARIANCES(64, 32) +VARIANCES(32, 64) +VARIANCES(32, 32) +VARIANCES(32, 16) +VARIANCES(16, 32) +VARIANCES(16, 16) +VARIANCES(16, 8) +VARIANCES(8, 16) +VARIANCES(8, 8) +VARIANCES(8, 4) +VARIANCES(4, 8) +VARIANCES(4, 4) +VARIANCES(4, 2) +VARIANCES(2, 4) +VARIANCES(2, 2) +VARIANCES(4, 16) +VARIANCES(16, 4) +VARIANCES(8, 32) +VARIANCES(32, 8) +VARIANCES(16, 64) +VARIANCES(64, 16) + +GET_VAR(16, 16) +GET_VAR(8, 8) + +MSE(16, 16) +MSE(16, 8) +MSE(8, 16) +MSE(8, 8) + +void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int i, j; + + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +// Get pred block from up-sampled reference. +void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, int width, int height, + int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? 
dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter = av1_get_filter(subpel_search); + + if (!subpel_x_q3 && !subpel_y_q3) { + for (int i = 0; i < height; i++) { + memcpy(comp_pred, ref, width * sizeof(*comp_pred)); + comp_pred += width; + ref += ref_stride; + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL, + -1, width, height); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel, + 16, width, height); + } else { + DECLARE_ALIGNED(16, uint8_t, + temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1), + ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, + width, intermediate_height); + aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), + MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, + width, height); + } +} + +void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, const uint8_t *pred, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search) { + int i, j; + + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1); + } + comp_pred += width; + pred += width; + } +} + +void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { + int i, j; + const int fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + int tmp = pred[j] * bck_offset + ref[j] * fwd_offset; + tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint8_t)tmp; + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +void aom_dist_wtd_comp_avg_upsampled_pred_c( + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { + int i, j; + const int 
fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + + aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset; + tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint8_t)tmp; + } + comp_pred += width; + pred += width; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + int64_t tsum = 0; + uint64_t tsse = 0; + for (int i = 0; i < h; ++i) { + int32_t lsum = 0; + for (int j = 0; j < w; ++j) { + const int diff = a[j] - b[j]; + lsum += diff; + tsse += (uint32_t)(diff * diff); + } + tsum += lsum; + a += a_stride; + b += b_stride; + } + *sum = tsum; + *sse = tsse; +} + +uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, int w, int h) { + uint64_t sse; + int64_t sum; + highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum); + return sse; +} + +static void highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (uint32_t)sse_long; + *sum = (int)sum_long; +} + +static void highbd_10_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +} + +static void highbd_12_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +} + +#define HIGHBD_VAR(W, H) \ + uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } + +#define HIGHBD_GET_VAR(S) \ + void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse, int *sum) { \ + highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ + } \ + \ + void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse, int *sum) { \ + highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ + } \ + \ + void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse, int *sum) { \ + highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ + } + +#define HIGHBD_MSE(W, H) \ + uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } + +void aom_highbd_var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr8, uint16_t *output_ptr, + unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + output_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + + ++src_ptr; + } + + // Next row... 
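+    // Editor's note: the inner loop's ++src_ptr has already advanced the
+    // pointer by output_width, so adding (src_pixels_per_line - output_width)
+    // lands exactly at the start of the next source row; e.g. with a stride
+    // of 32 and an 8-wide block, this adds 24.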
+ src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +void aom_highbd_var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + output_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + ++src_ptr; + } + + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +#define HIGHBD_SUBPIX_VAR(W, H) \ + uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } + +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t 
*second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \ + W, H, CONVERT_TO_BYTEPTR(temp2), W, \ + jcp_param); \ + \ + return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \ + W, H, CONVERT_TO_BYTEPTR(temp2), W, \ + jcp_param); \ + \ + return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + 
aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \ + W, H, CONVERT_TO_BYTEPTR(temp2), W, \ + jcp_param); \ + \ + return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } + +/* All three forms of the variance are available in the same sizes. */ +#define HIGHBD_VARIANCES(W, H) \ + HIGHBD_VAR(W, H) \ + HIGHBD_SUBPIX_VAR(W, H) \ + HIGHBD_SUBPIX_AVG_VAR(W, H) + +HIGHBD_VARIANCES(128, 128) +HIGHBD_VARIANCES(128, 64) +HIGHBD_VARIANCES(64, 128) +HIGHBD_VARIANCES(64, 64) +HIGHBD_VARIANCES(64, 32) +HIGHBD_VARIANCES(32, 64) +HIGHBD_VARIANCES(32, 32) +HIGHBD_VARIANCES(32, 16) +HIGHBD_VARIANCES(16, 32) +HIGHBD_VARIANCES(16, 16) +HIGHBD_VARIANCES(16, 8) +HIGHBD_VARIANCES(8, 16) +HIGHBD_VARIANCES(8, 8) +HIGHBD_VARIANCES(8, 4) +HIGHBD_VARIANCES(4, 8) +HIGHBD_VARIANCES(4, 4) +HIGHBD_VARIANCES(4, 2) +HIGHBD_VARIANCES(2, 4) +HIGHBD_VARIANCES(2, 2) +HIGHBD_VARIANCES(4, 16) +HIGHBD_VARIANCES(16, 4) +HIGHBD_VARIANCES(8, 32) +HIGHBD_VARIANCES(32, 8) +HIGHBD_VARIANCES(16, 64) +HIGHBD_VARIANCES(64, 16) + +HIGHBD_GET_VAR(8) +HIGHBD_GET_VAR(16) + +HIGHBD_MSE(16, 16) +HIGHBD_MSE(16, 8) +HIGHBD_MSE(8, 16) +HIGHBD_MSE(8, 8) + +void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride) { + int i, j; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, + const struct AV1Common *const cm, int mi_row, + int mi_col, const MV *const mv, + uint8_t *comp_pred8, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref8, int ref_stride, int bd, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? 
dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred8, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter = av1_get_filter(subpel_search); + + if (!subpel_x_q3 && !subpel_y_q3) { + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + for (int i = 0; i < height; i++) { + memcpy(comp_pred, ref, width * sizeof(*comp_pred)); + comp_pred += width; + ref += ref_stride; + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel, + 16, NULL, -1, width, height, bd); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1, + kernel, 16, width, height, bd); + } else { + DECLARE_ALIGNED(16, uint16_t, + temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1), + ref_stride, CONVERT_TO_BYTEPTR(temp), + MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, + intermediate_height, bd); + aom_highbd_convolve8_vert_c( + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), + MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, + bd); + } +} + +void aom_highbd_comp_avg_upsampled_pred_c( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, int subpel_search) { + int i, j; + + const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1); + } + comp_pred += width; + pred += width; + } +} + +void aom_highbd_dist_wtd_comp_avg_pred_c( + uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, + const uint8_t *ref8, int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { + int i, j; + const int fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + int tmp = pred[j] * bck_offset + ref[j] * fwd_offset; + tmp = 
ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint16_t)tmp; + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, + int subpel_search) { + int i, j; + const int fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, + ref_stride, bd, subpel_search); + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset; + tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint16_t)tmp; + } + comp_pred += width; + pred += width; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride, + const uint8_t *mask, int mask_stride, + int invert_mask) { + int i, j; + const uint8_t *src0 = invert_mask ? pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? ref_stride : width; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]); + } + comp_pred += width; + src0 += stride0; + src1 += stride1; + mask += mask_stride; + } +} + +void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, const uint8_t *pred, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask, + int subpel_search) { + if (subpel_x_q3 | subpel_y_q3) { + aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + ref = comp_pred; + ref_stride = width; + } + aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask, + mask_stride, invert_mask); +} + +#define MASK_SUBPIX_VAR(W, H) \ + unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \ + invert_mask); \ + return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \ + } + +MASK_SUBPIX_VAR(4, 4) +MASK_SUBPIX_VAR(4, 8) +MASK_SUBPIX_VAR(8, 4) +MASK_SUBPIX_VAR(8, 8) +MASK_SUBPIX_VAR(8, 16) +MASK_SUBPIX_VAR(16, 8) +MASK_SUBPIX_VAR(16, 16) +MASK_SUBPIX_VAR(16, 32) +MASK_SUBPIX_VAR(32, 16) 
+MASK_SUBPIX_VAR(32, 32) +MASK_SUBPIX_VAR(32, 64) +MASK_SUBPIX_VAR(64, 32) +MASK_SUBPIX_VAR(64, 64) +MASK_SUBPIX_VAR(64, 128) +MASK_SUBPIX_VAR(128, 64) +MASK_SUBPIX_VAR(128, 128) +MASK_SUBPIX_VAR(4, 16) +MASK_SUBPIX_VAR(16, 4) +MASK_SUBPIX_VAR(8, 32) +MASK_SUBPIX_VAR(32, 8) +MASK_SUBPIX_VAR(16, 64) +MASK_SUBPIX_VAR(64, 16) + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i, j; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + if (!invert_mask) + comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]); + else + comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]); + } + comp_pred += width; + pred += width; + ref += ref_stride; + mask += mask_stride; + } +} + +void aom_highbd_comp_mask_upsampled_pred( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int bd, int subpel_search) { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width, + mask, mask_stride, invert_mask); +} + +#define HIGHBD_MASK_SUBPIX_VAR(W, H) \ + unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ + invert_mask); \ + \ + return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref, ref_stride, sse); \ + } \ + \ + unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ + invert_mask); \ + \ + return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref, ref_stride, sse); \ + } \ + \ + unsigned int 
aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ + invert_mask); \ + \ + return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref, ref_stride, sse); \ + } + +HIGHBD_MASK_SUBPIX_VAR(4, 4) +HIGHBD_MASK_SUBPIX_VAR(4, 8) +HIGHBD_MASK_SUBPIX_VAR(8, 4) +HIGHBD_MASK_SUBPIX_VAR(8, 8) +HIGHBD_MASK_SUBPIX_VAR(8, 16) +HIGHBD_MASK_SUBPIX_VAR(16, 8) +HIGHBD_MASK_SUBPIX_VAR(16, 16) +HIGHBD_MASK_SUBPIX_VAR(16, 32) +HIGHBD_MASK_SUBPIX_VAR(32, 16) +HIGHBD_MASK_SUBPIX_VAR(32, 32) +HIGHBD_MASK_SUBPIX_VAR(32, 64) +HIGHBD_MASK_SUBPIX_VAR(64, 32) +HIGHBD_MASK_SUBPIX_VAR(64, 64) +HIGHBD_MASK_SUBPIX_VAR(64, 128) +HIGHBD_MASK_SUBPIX_VAR(128, 64) +HIGHBD_MASK_SUBPIX_VAR(128, 128) +HIGHBD_MASK_SUBPIX_VAR(4, 16) +HIGHBD_MASK_SUBPIX_VAR(16, 4) +HIGHBD_MASK_SUBPIX_VAR(8, 32) +HIGHBD_MASK_SUBPIX_VAR(32, 8) +HIGHBD_MASK_SUBPIX_VAR(16, 64) +HIGHBD_MASK_SUBPIX_VAR(64, 16) +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12); + *sum += diff; + *sse += diff * diff; + } + + pre += pre_stride; + wsrc += w; + mask += w; + } +} + +#define OBMC_VAR(W, H) \ + unsigned int aom_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } + +#define OBMC_SUBPIX_VAR(W, H) \ + unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \ + W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ + } + +OBMC_VAR(4, 4) +OBMC_SUBPIX_VAR(4, 4) + +OBMC_VAR(4, 8) +OBMC_SUBPIX_VAR(4, 8) + +OBMC_VAR(8, 4) +OBMC_SUBPIX_VAR(8, 4) + +OBMC_VAR(8, 8) +OBMC_SUBPIX_VAR(8, 8) + +OBMC_VAR(8, 16) +OBMC_SUBPIX_VAR(8, 16) + +OBMC_VAR(16, 8) +OBMC_SUBPIX_VAR(16, 8) + +OBMC_VAR(16, 16) +OBMC_SUBPIX_VAR(16, 16) + +OBMC_VAR(16, 32) +OBMC_SUBPIX_VAR(16, 32) + +OBMC_VAR(32, 16) +OBMC_SUBPIX_VAR(32, 16) + +OBMC_VAR(32, 32) +OBMC_SUBPIX_VAR(32, 32) + +OBMC_VAR(32, 64) +OBMC_SUBPIX_VAR(32, 64) + +OBMC_VAR(64, 32) +OBMC_SUBPIX_VAR(64, 32) + +OBMC_VAR(64, 64) +OBMC_SUBPIX_VAR(64, 64) + +OBMC_VAR(64, 
128) +OBMC_SUBPIX_VAR(64, 128) + +OBMC_VAR(128, 64) +OBMC_SUBPIX_VAR(128, 64) + +OBMC_VAR(128, 128) +OBMC_SUBPIX_VAR(128, 128) + +OBMC_VAR(4, 16) +OBMC_SUBPIX_VAR(4, 16) +OBMC_VAR(16, 4) +OBMC_SUBPIX_VAR(16, 4) +OBMC_VAR(8, 32) +OBMC_SUBPIX_VAR(8, 32) +OBMC_VAR(32, 8) +OBMC_SUBPIX_VAR(32, 8) +OBMC_VAR(16, 64) +OBMC_SUBPIX_VAR(16, 64) +OBMC_VAR(64, 16) +OBMC_SUBPIX_VAR(64, 16) + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + uint64_t *sse, int64_t *sum) { + int i, j; + uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12); + *sum += diff; + *sse += diff * diff; + } + + pre += pre_stride; + wsrc += w; + mask += w; + } +} + +static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); + *sum = (int)sum64; + *sse = (unsigned int)sse64; +} + +static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); + *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); +} + +static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); + *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); +} + +#define HIGHBD_OBMC_VAR(W, H) \ + unsigned int aom_highbd_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int aom_highbd_10_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + unsigned int aom_highbd_12_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } + +#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \ + unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + wsrc, mask, sse); \ + } \ + \ + unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, wsrc, mask, sse); \ + } \ + \ + unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, wsrc, mask, sse); \ + } + +HIGHBD_OBMC_VAR(4, 4) +HIGHBD_OBMC_SUBPIX_VAR(4, 4) + +HIGHBD_OBMC_VAR(4, 8) +HIGHBD_OBMC_SUBPIX_VAR(4, 8) + +HIGHBD_OBMC_VAR(8, 4) +HIGHBD_OBMC_SUBPIX_VAR(8, 4) + +HIGHBD_OBMC_VAR(8, 8) +HIGHBD_OBMC_SUBPIX_VAR(8, 8) + +HIGHBD_OBMC_VAR(8, 16) +HIGHBD_OBMC_SUBPIX_VAR(8, 16) + +HIGHBD_OBMC_VAR(16, 8) +HIGHBD_OBMC_SUBPIX_VAR(16, 8) + +HIGHBD_OBMC_VAR(16, 16) +HIGHBD_OBMC_SUBPIX_VAR(16, 16) + +HIGHBD_OBMC_VAR(16, 32) +HIGHBD_OBMC_SUBPIX_VAR(16, 32) + +HIGHBD_OBMC_VAR(32, 16) +HIGHBD_OBMC_SUBPIX_VAR(32, 16) + +HIGHBD_OBMC_VAR(32, 32) +HIGHBD_OBMC_SUBPIX_VAR(32, 32) + +HIGHBD_OBMC_VAR(32, 64) +HIGHBD_OBMC_SUBPIX_VAR(32, 64) + +HIGHBD_OBMC_VAR(64, 32) +HIGHBD_OBMC_SUBPIX_VAR(64, 32) + +HIGHBD_OBMC_VAR(64, 64) +HIGHBD_OBMC_SUBPIX_VAR(64, 64) + +HIGHBD_OBMC_VAR(64, 128) +HIGHBD_OBMC_SUBPIX_VAR(64, 128) + +HIGHBD_OBMC_VAR(128, 64) +HIGHBD_OBMC_SUBPIX_VAR(128, 64) + +HIGHBD_OBMC_VAR(128, 128) +HIGHBD_OBMC_SUBPIX_VAR(128, 128) + +HIGHBD_OBMC_VAR(4, 16) +HIGHBD_OBMC_SUBPIX_VAR(4, 16) +HIGHBD_OBMC_VAR(16, 4) +HIGHBD_OBMC_SUBPIX_VAR(16, 4) +HIGHBD_OBMC_VAR(8, 32) +HIGHBD_OBMC_SUBPIX_VAR(8, 32) +HIGHBD_OBMC_VAR(32, 8) +HIGHBD_OBMC_SUBPIX_VAR(32, 8) +HIGHBD_OBMC_VAR(16, 64) +HIGHBD_OBMC_SUBPIX_VAR(16, 64) +HIGHBD_OBMC_VAR(64, 16) +HIGHBD_OBMC_SUBPIX_VAR(64, 16) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/variance.h b/libs/libaom/src/aom_dsp/variance.h new file mode 100644 index 000000000..4550c17b3 --- /dev/null +++ b/libs/libaom/src/aom_dsp/variance.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_VARIANCE_H_ +#define AOM_AOM_DSP_VARIANCE_H_ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define FILTER_BITS 7 +#define FILTER_WEIGHT 128 + +typedef unsigned int (*aom_sad_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride); + +typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *second_pred); + +typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b, + int b_stride, int n); + +typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *const b_array[], + int b_stride, unsigned int *sad_array); + +typedef unsigned int (*aom_variance_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse); + +typedef unsigned int (*aom_subpixvariance_fn_t)(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + const uint8_t *b, int b_stride, + unsigned int *sse); + +typedef unsigned int (*aom_subp_avg_variance_fn_t)( + const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, + int b_stride, unsigned int *sse, const uint8_t *second_pred); + +typedef unsigned int (*aom_dist_wtd_sad_avg_fn_t)( + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param); + +typedef unsigned int (*aom_dist_wtd_subp_avg_variance_fn_t)( + const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, + int b_stride, unsigned int *sse, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param); + +typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + const uint8_t *msk, int msk_stride, + int invert_mask); +typedef unsigned int (*aom_masked_subpixvariance_fn_t)( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, + const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse); + +void aom_highbd_comp_mask_upsampled_pred( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int bd, int subpel_search); + +typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride, + const int32_t *wsrc, + const int32_t *msk); +typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred, + int pred_stride, + const int32_t *wsrc, + const int32_t *msk, + unsigned int *sse); +typedef unsigned int (*aom_obmc_subpixvariance_fn_t)( + const uint8_t *pred, int pred_stride, int xoffset, int yoffset, + const int32_t *wsrc, const int32_t *msk, unsigned int *sse); + +typedef struct aom_variance_vtable { + aom_sad_fn_t sdf; + aom_sad_avg_fn_t sdaf; + 
aom_variance_fn_t vf;
+  aom_subpixvariance_fn_t svf;
+  aom_subp_avg_variance_fn_t svaf;
+  aom_sad_multi_d_fn_t sdx4df;
+  aom_masked_sad_fn_t msdf;
+  aom_masked_subpixvariance_fn_t msvf;
+  aom_obmc_sad_fn_t osdf;
+  aom_obmc_variance_fn_t ovf;
+  aom_obmc_subpixvariance_fn_t osvf;
+  aom_dist_wtd_sad_avg_fn_t jsdaf;
+  aom_dist_wtd_subp_avg_variance_fn_t jsvaf;
+} aom_variance_fn_ptr_t;
+
+void aom_highbd_var_filter_block2d_bil_first_pass(
+    const uint8_t *src_ptr8, uint16_t *output_ptr,
+    unsigned int src_pixels_per_line, int pixel_step,
+    unsigned int output_height, unsigned int output_width,
+    const uint8_t *filter);
+
+void aom_highbd_var_filter_block2d_bil_second_pass(
+    const uint16_t *src_ptr, uint16_t *output_ptr,
+    unsigned int src_pixels_per_line, unsigned int pixel_step,
+    unsigned int output_height, unsigned int output_width,
+    const uint8_t *filter);
+
+uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
+                          int b_stride, int w, int h);
+
+uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, int w, int h);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_VARIANCE_H_
diff --git a/libs/libaom/src/aom_dsp/vmaf.c b/libs/libaom/src/aom_dsp/vmaf.c
new file mode 100644
index 000000000..3a012e768
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/vmaf.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <libvmaf.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/blend.h"
+#include "aom_dsp/vmaf.h"
+#include "aom_ports/system_state.h"
+
+typedef struct FrameData {
+  const YV12_BUFFER_CONFIG *source;
+  const YV12_BUFFER_CONFIG *distorted;
+  int frame_set;
+  int bit_depth;
+} FrameData;
+
+static void vmaf_fatal_error(const char *message) {
+  fprintf(stderr, "Fatal error: %s\n", message);
+  exit(EXIT_FAILURE);
+}
+
+// A callback function used to pass data to VMAF.
+// Returns 0 after reading a frame.
+// Returns 2 when there is no more frame to read.
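[The three-line comment above is the whole contract libvmaf's frame-reading hook has to satisfy: fill the ref/main planes, return 0 while frames remain, and return 2 once the stream is exhausted, with the stride given in bytes. As a minimal sketch of that contract, assuming a hypothetical single-frame GrayFrame holder (illustrative only, not part of this patch):

// Sketch of a conforming libvmaf read-frame callback. "GrayFrame" and
// "read_one_gray_frame" are hypothetical names used only for illustration.
typedef struct GrayFrame {
  const float *pixels;  // width * height luma samples
  int width, height;
  int consumed;         // becomes 1 once the frame has been handed over
} GrayFrame;

static int read_one_gray_frame(float *ref_data, float *main_data,
                               float *temp_data, int stride, void *user_data) {
  GrayFrame *f = (GrayFrame *)user_data;
  (void)temp_data;
  if (f->consumed) return 2;  // no more frames to read
  for (int row = 0; row < f->height; ++row) {
    for (int col = 0; col < f->width; ++col) {
      // The sketch scores a frame against itself; a real caller would fill
      // ref_data and main_data from two different buffers.
      ref_data[col] = main_data[col] = f->pixels[row * f->width + col];
    }
    ref_data += stride / sizeof(*ref_data);  // stride is in bytes
    main_data += stride / sizeof(*main_data);
  }
  f->consumed = 1;
  return 0;  // one frame delivered
}

The callback the patch actually installs, read_frame below, has the same shape for YV12 buffers, plus a scaling path that normalizes high-bit-depth samples to an 8-bit range.]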
+static int read_frame(float *ref_data, float *main_data, float *temp_data,
+                      int stride, void *user_data) {
+  FrameData *frames = (FrameData *)user_data;
+
+  if (!frames->frame_set) {
+    const int width = frames->source->y_width;
+    const int height = frames->source->y_height;
+    assert(width == frames->distorted->y_width);
+    assert(height == frames->distorted->y_height);
+
+    if (frames->bit_depth > 8) {
+      const float scale_factor = 1.0f / (float)(1 << (frames->bit_depth - 8));
+      uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(frames->source->y_buffer);
+      uint16_t *main_ptr = CONVERT_TO_SHORTPTR(frames->distorted->y_buffer);
+
+      for (int row = 0; row < height; ++row) {
+        for (int col = 0; col < width; ++col) {
+          ref_data[col] = scale_factor * (float)ref_ptr[col];
+        }
+        ref_ptr += frames->source->y_stride;
+        ref_data += stride / sizeof(*ref_data);
+      }
+
+      for (int row = 0; row < height; ++row) {
+        for (int col = 0; col < width; ++col) {
+          main_data[col] = scale_factor * (float)main_ptr[col];
+        }
+        main_ptr += frames->distorted->y_stride;
+        main_data += stride / sizeof(*main_data);
+      }
+    } else {
+      uint8_t *ref_ptr = frames->source->y_buffer;
+      uint8_t *main_ptr = frames->distorted->y_buffer;
+
+      for (int row = 0; row < height; ++row) {
+        for (int col = 0; col < width; ++col) {
+          ref_data[col] = (float)ref_ptr[col];
+        }
+        ref_ptr += frames->source->y_stride;
+        ref_data += stride / sizeof(*ref_data);
+      }
+
+      for (int row = 0; row < height; ++row) {
+        for (int col = 0; col < width; ++col) {
+          main_data[col] = (float)main_ptr[col];
+        }
+        main_ptr += frames->distorted->y_stride;
+        main_data += stride / sizeof(*main_data);
+      }
+    }
+    frames->frame_set = 1;
+    return 0;
+  }
+
+  (void)temp_data;
+  return 2;
+}
+
+void aom_calc_vmaf(const char *model_path, const YV12_BUFFER_CONFIG *source,
+                   const YV12_BUFFER_CONFIG *distorted, const int bit_depth,
+                   double *const vmaf) {
+  aom_clear_system_state();
+  const int width = source->y_width;
+  const int height = source->y_height;
+  FrameData frames = { source, distorted, 0, bit_depth };
+  char *fmt = bit_depth == 10 ? "yuv420p10le" : "yuv420p";
+  double vmaf_score;
+  const int ret =
+      compute_vmaf(&vmaf_score, fmt, width, height, read_frame,
+                   /*user_data=*/&frames, (char *)model_path,
+                   /*log_path=*/NULL, /*log_fmt=*/NULL, /*disable_clip=*/1,
+                   /*disable_avx=*/0, /*enable_transform=*/0,
+                   /*phone_model=*/0, /*do_psnr=*/0, /*do_ssim=*/0,
+                   /*do_ms_ssim=*/0, /*pool_method=*/NULL, /*n_thread=*/0,
+                   /*n_subsample=*/1, /*enable_conf_interval=*/0);
+  if (ret) vmaf_fatal_error("Failed to compute VMAF scores.");
+
+  aom_clear_system_state();
+  *vmaf = vmaf_score;
+}
+
+void aom_calc_vmaf_multi_frame(
+    void *user_data, const char *model_path,
+    int (*read_frame)(float *ref_data, float *main_data, float *temp_data,
+                      int stride_byte, void *user_data),
+    int frame_width, int frame_height, int bit_depth, double *vmaf) {
+  aom_clear_system_state();
+
+  char *fmt = bit_depth == 10 ? "yuv420p10le" : "yuv420p";
+  double vmaf_score;
+  const int ret = compute_vmaf(
+      &vmaf_score, fmt, frame_width, frame_height, read_frame,
+      /*user_data=*/user_data, (char *)model_path,
+      /*log_path=*/"vmaf_scores.xml", /*log_fmt=*/NULL, /*disable_clip=*/0,
+      /*disable_avx=*/0, /*enable_transform=*/0,
+      /*phone_model=*/0, /*do_psnr=*/0, /*do_ssim=*/0,
+      /*do_ms_ssim=*/0, /*pool_method=*/NULL, /*n_thread=*/0,
+      /*n_subsample=*/1, /*enable_conf_interval=*/0);
+  FILE *vmaf_log = fopen("vmaf_scores.xml", "r");
+  if (vmaf_log == NULL || ret) {
+    vmaf_fatal_error("Failed to compute VMAF scores.");
+  }
+
+  int frame_index = 0;
+  char buf[512];
+  while (fgets(buf, 511, vmaf_log) != NULL) {
+    if (memcmp(buf, "\t\t<frame ", 9) == 0) {
+      char *p = strstr(buf, "vmaf=");
+      if (p != NULL && p[5] == '"') {
+        char *p2 = strstr(&p[6], "\"");
+        *p2 = '\0';
+        const double score = atof(&p[6]);
+        if (score < 0.0 || score > 100.0) {
+          vmaf_fatal_error("Failed to compute VMAF scores.");
+        }
+        vmaf[frame_index++] = score;
+      }
+    }
+  }
+  fclose(vmaf_log);
+
+  aom_clear_system_state();
+}
diff --git a/libs/libaom/src/aom_dsp/vmaf.h b/libs/libaom/src/aom_dsp/vmaf.h
new file mode 100644
index 000000000..fb8bf4613
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/vmaf.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_VMAF_H_
+#define AOM_AOM_DSP_VMAF_H_
+
+#include "aom_scale/yv12config.h"
+
+void aom_calc_vmaf(const char *model_path, const YV12_BUFFER_CONFIG *source,
+                   const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+                   double *vmaf);
+
+void aom_calc_vmaf_multi_frame(
+    void *user_data, const char *model_path,
+    int (*read_frame)(float *ref_data, float *main_data, float *temp_data,
+                      int stride_byte, void *user_data),
+    int frame_width, int frame_height, int bit_depth, double *vmaf);
+
+#endif  // AOM_AOM_DSP_VMAF_H_
diff --git a/libs/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c b/libs/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c
new file mode 100644
index 000000000..e33dff20c
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/av1_quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin,
+                                      const int16_t *round_ptr, __m256i *round,
+                                      const int16_t *quant_ptr, __m256i *quant,
+                                      const int16_t *dequant_ptr,
+                                      __m256i *dequant,
+                                      const int16_t *shift_ptr,
+                                      __m256i *shift) {
+  *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr));
+  *zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
+  *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
+  *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+  *round = _mm256_permute4x64_epi64(*round, 0x54);
+  *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+  *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+  *dequant =
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+  *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+  *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr));
+  *shift = _mm256_permute4x64_epi64(*shift, 0x54);
+}
+
+static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+  const __m256i coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr));
+  const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+  return _mm256_packs_epi32(coeff1, coeff2);
+}
+
+static INLINE void update_mask1_avx2(__m256i *cmp_mask,
+                                     const int16_t *iscan_ptr, int *is_found,
+                                     __m256i *mask) {
+  __m256i temp_mask = _mm256_setzero_si256();
+  if (_mm256_movemask_epi8(*cmp_mask)) {
+    __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr));
+    temp_mask = _mm256_and_si256(*cmp_mask, iscan);
+    *is_found = 1;
+  }
+  *mask = _mm256_max_epi16(temp_mask, *mask);
+}
+
+static INLINE void update_mask0_avx2(__m256i *qcoeff, __m256i *threshold,
+                                     const int16_t *iscan_ptr, int *is_found,
+                                     __m256i *mask) {
+  __m256i zero = _mm256_setzero_si256();
+  __m256i coeff[2], cmp_mask0, cmp_mask1;
+  coeff[0] = _mm256_unpacklo_epi16(*qcoeff, zero);
+  coeff[1] = _mm256_unpackhi_epi16(*qcoeff, zero);
+  coeff[0] = _mm256_slli_epi32(coeff[0], AOM_QM_BITS);
+  cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]);
+  coeff[1] = _mm256_slli_epi32(coeff[1], AOM_QM_BITS);
+  cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]);
+  cmp_mask0 =
+      _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8);
+  update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask);
+}
+
+static INLINE void calculate_qcoeff_avx2(__m256i *coeff, const __m256i *round,
+                                         const __m256i *quant,
+                                         const __m256i *shift) {
+  __m256i tmp, qcoeff;
+  qcoeff = _mm256_adds_epi16(*coeff, *round);
+  tmp = _mm256_mulhi_epi16(qcoeff, *quant);
+  qcoeff = _mm256_add_epi16(tmp, qcoeff);
+  *coeff = _mm256_mulhi_epi16(qcoeff, *shift);
+}
+
+static INLINE __m256i calculate_dqcoeff_avx2(__m256i qcoeff, __m256i dequant) {
+  return _mm256_mullo_epi16(qcoeff, dequant);
+}
+
+static INLINE void store_coefficients_avx2(__m256i coeff_vals,
+                                           tran_low_t *coeff_ptr) {
+  __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
+  __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
+  __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
+  _mm256_store_si256((__m256i *)(coeff_ptr), coeff_vals_lo);
+  _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
+}
+
+void aom_quantize_b_adaptive_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const
int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m256i zero = _mm256_setzero_si256(); + __m256i zbin, round, quant, dequant, shift; + __m256i coeff, qcoeff; + __m256i cmp_mask, mask0 = zero, mask1 = zero; + __m128i temp_mask0, temp_mask1; + int prescan_add[2]; + int thresh[2]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + __m256i threshold[2]; + threshold[0] = _mm256_set1_epi32(thresh[0]); + threshold[1] = _mm256_set1_epi32(thresh[1]); + threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + + // Setup global values. + load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff = load_coefficients_avx2(coeff_ptr); + qcoeff = _mm256_abs_epi16(coeff); + update_mask0_avx2(&qcoeff, threshold, iscan, &is_found0, &mask0); + __m256i temp0 = _mm256_cmpgt_epi16(qcoeff, zbin); + zbin = _mm256_unpackhi_epi64(zbin, zbin); + cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8); + update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); + threshold[0] = threshold[1]; + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + // Reinsert signs + qcoeff = _mm256_sign_epi16(qcoeff, coeff); + // Mask out zbin threshold coeffs + qcoeff = _mm256_and_si256(qcoeff, temp0); + store_coefficients_avx2(qcoeff, qcoeff_ptr); + coeff = calculate_dqcoeff_avx2(qcoeff, dequant); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + store_coefficients_avx2(coeff, dqcoeff_ptr); + } + + // AC only loop. 
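[Since the intrinsics above compress several steps into each lane, a scalar model of what one coefficient goes through may help; this is an illustrative sketch under the same parameter names, not code from the patch:

#include <stdint.h>
#include <stdlib.h>  // abs()

// One coefficient through the adaptive quantizer, modeling
// calculate_qcoeff_avx2 plus the zero-bin mask a single lane at a time.
static int16_t quantize_one_coeff(int16_t coeff, int16_t zbin, int16_t round,
                                  int16_t quant, int16_t shift) {
  const int abs_coeff = abs(coeff);
  // The SIMD path pre-subtracts 1 from zbin and uses a greater-than compare,
  // which is the same as zeroing everything strictly below the zero-bin.
  if (abs_coeff < zbin) return 0;
  int tmp = abs_coeff + round;        // _mm256_adds_epi16 (saturating there)
  tmp = ((tmp * quant) >> 16) + tmp;  // _mm256_mulhi_epi16 + add
  tmp = (tmp * shift) >> 16;          // second mulhi against the shift table
  return (int16_t)(coeff < 0 ? -tmp : tmp);  // reinsert the sign
}

The loop that follows runs this arithmetic sixteen 16-bit lanes at a time, skipping whole groups whose zero-bin compare mask is all zero.]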
+ while (index < n_coeffs) { + coeff = load_coefficients_avx2(coeff_ptr + index); + qcoeff = _mm256_abs_epi16(coeff); + update_mask0_avx2(&qcoeff, threshold, iscan + index, &is_found0, &mask0); + temp0 = _mm256_cmpgt_epi16(qcoeff, zbin); + cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8); + update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); + index += 16; + continue; + } + calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift); + qcoeff = _mm256_sign_epi16(qcoeff, coeff); + qcoeff = _mm256_and_si256(qcoeff, temp0); + store_coefficients_avx2(qcoeff, qcoeff_ptr + index); + coeff = calculate_dqcoeff_avx2(qcoeff, dequant); + store_coefficients_avx2(coeff, dqcoeff_ptr + index); + index += 16; + } + if (is_found0) { + temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), + _mm256_extracti128_si256(mask0, 1)); + non_zero_count = calculate_non_zero_count(temp_mask0); + } + if (is_found1) { + temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), + _mm256_extracti128_si256(mask1, 1)); + non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); + } + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff0 = qcoeff_ptr[rc]; + if (qcoeff0) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff0 = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff0); + const int abs_coeff = (coeff0 ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/libs/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c b/libs/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c new file mode 100644 index 000000000..584cd671f --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c @@ -0,0 +1,633 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/av1_quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+void aom_quantize_b_adaptive_sse2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  int index = 16;
+  int non_zero_count = 0;
+  int non_zero_count_prescan_add_zero = 0;
+  int is_found0 = 0, is_found1 = 0;
+  int eob = -1;
+  const __m128i zero = _mm_setzero_si128();
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i all_zero;
+  __m128i mask0 = zero, mask1 = zero;
+
+  int prescan_add[2];
+  int thresh[4];
+  const qm_val_t wt = (1 << AOM_QM_BITS);
+  for (int i = 0; i < 2; ++i) {
+    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+    thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
+  }
+  thresh[2] = thresh[3] = thresh[1];
+  __m128i threshold[2];
+  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+  int first = -1;
+#endif
+  // Setup global values.
+  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+                dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_coefficients(coeff_ptr);
+  coeff1 = load_coefficients(coeff_ptr + 8);
+
+  // Poor man's abs().
+  coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+  update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+  update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+  threshold[0] = threshold[1];
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_movemask_epi8(all_zero) == 0) {
+    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    // Reinsert signs
+    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+    // Mask out zbin threshold coeffs
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_coefficients(qcoeff0, qcoeff_ptr);
+    store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+    coeff0 = calculate_dqcoeff(qcoeff0,
dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr); + store_coefficients(coeff1, dqcoeff_ptr + 8); + } + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, + &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr + index); + store_coefficients(coeff1, dqcoeff_ptr + index + 8); + + index += 16; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_quantize_b_32x32_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const 
int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + const int log_scale = 1; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i log_scale_vec = _mm_set1_epi16(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. + zbin = _mm_add_epi16(zbin, log_scale_vec); + round = _mm_add_epi16(round, log_scale_vec); + zbin = _mm_srli_epi16(zbin, log_scale); + round = _mm_srli_epi16(round, log_scale); + zbin = _mm_sub_epi16(zbin, one); + + // Do DC and first 15 AC. 
+ coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr, + &log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + 8, &log_scale); + } + + // AC only loop. 
+ while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, + &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, + dqcoeff_ptr + index, &log_scale); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8, &log_scale); + index += 16; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_quantize_b_64x64_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const 
int16_t *scan, const int16_t *iscan) { + int index = 16; + const int log_scale = 2; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i log_scale_vec = _mm_set1_epi16(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. + zbin = _mm_add_epi16(zbin, log_scale_vec); + round = _mm_add_epi16(round, log_scale_vec); + zbin = _mm_srli_epi16(zbin, log_scale); + round = _mm_srli_epi16(round, log_scale); + zbin = _mm_sub_epi16(zbin, one); + + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + 
qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr, + &log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + 8, &log_scale); + } + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, + &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, + dqcoeff_ptr + index, &log_scale); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8, &log_scale); + index += 16; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + 
qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/libs/libaom/src/aom_dsp/x86/aom_asm_stubs.c b/libs/libaom/src/aom_dsp/x86/aom_asm_stubs.c new file mode 100644 index 000000000..ce8285e43 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_asm_stubs.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve.h" + +#if HAVE_SSE2 +filter8_1dfunction aom_filter_block1d16_v8_sse2; +filter8_1dfunction aom_filter_block1d16_h8_sse2; +filter8_1dfunction aom_filter_block1d8_v8_sse2; +filter8_1dfunction aom_filter_block1d8_h8_sse2; +filter8_1dfunction aom_filter_block1d4_v8_sse2; +filter8_1dfunction aom_filter_block1d4_h8_sse2; +filter8_1dfunction aom_filter_block1d16_v4_sse2; +filter8_1dfunction aom_filter_block1d16_h4_sse2; + +filter8_1dfunction aom_filter_block1d8_h4_sse2; +filter8_1dfunction aom_filter_block1d8_v4_sse2; +filter8_1dfunction aom_filter_block1d4_h4_sse2; +filter8_1dfunction aom_filter_block1d4_v4_sse2; + +filter8_1dfunction aom_filter_block1d16_v2_sse2; +filter8_1dfunction aom_filter_block1d16_h2_sse2; +filter8_1dfunction aom_filter_block1d8_v2_sse2; +filter8_1dfunction aom_filter_block1d8_h2_sse2; +filter8_1dfunction aom_filter_block1d4_v2_sse2; +filter8_1dfunction aom_filter_block1d4_h2_sse2; + +// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); + +#if CONFIG_AV1_HIGHBITDEPTH +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; + +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h4_sse2; + +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction 
aom_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; + +// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void aom_highbd_convolve8_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +#endif +#endif // HAVE_SSE2 diff --git a/libs/libaom/src/aom_dsp/x86/aom_convolve_copy_sse2.asm b/libs/libaom/src/aom_dsp/x86/aom_convolve_copy_sse2.asm new file mode 100644 index 000000000..7283c32b8 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_convolve_copy_sse2.asm @@ -0,0 +1,297 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro convolve_fn 1-2 +%ifidn %1, avg +%define AUX_XMM_REGS 4 +%else +%define AUX_XMM_REGS 0 +%endif +%ifidn %2, highbd +%define pavg pavgw +cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ + dst, dst_stride, \ + fx, fxs, fy, fys, w, h, bd +%else +%define pavg pavgb +cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ + dst, dst_stride, \ + fx, fxs, fy, fys, w, h +%endif + mov r4d, dword wm +%ifidn %2, highbd + shl r4d, 1 + shl srcq, 1 + shl src_strideq, 1 + shl dstq, 1 + shl dst_strideq, 1 +%else + cmp r4d, 4 + je .w4 +%endif + cmp r4d, 8 + je .w8 + cmp r4d, 16 + je .w16 + cmp r4d, 32 + je .w32 + + cmp r4d, 64 + je .w64 +%ifidn %2, highbd + cmp r4d, 128 + je .w128 + +.w256: + mov r4d, dword hm +.loop256: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + movu m0, [srcq+64] + movu m1, [srcq+80] + movu m2, [srcq+96] + movu m3, [srcq+112] +%ifidn %1, avg + pavg m0, [dstq+64] + pavg m1, [dstq+80] + pavg m2, [dstq+96] + pavg m3, [dstq+112] +%endif + mova [dstq+64], m0 + mova [dstq+80], m1 + mova [dstq+96], m2 + mova [dstq+112], m3 + movu m0, [srcq+128] + movu m1, [srcq+128+16] + movu m2, [srcq+128+32] + movu m3, [srcq+128+48] +%ifidn %1, avg + pavg m0, [dstq+128] + pavg m1, [dstq+128+16] + pavg m2, [dstq+128+32] + pavg m3, [dstq+128+48] +%endif + mova [dstq+128 ], m0 + mova [dstq+128+16], m1 + mova [dstq+128+32], m2 + mova [dstq+128+48], m3 + movu m0, [srcq+128+64] + movu m1, [srcq+128+80] + movu m2, [srcq+128+96] + movu m3, [srcq+128+112] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq+128+64] + pavg m1, [dstq+128+80] + pavg m2, [dstq+128+96] + pavg m3, [dstq+128+112] +%endif + mova [dstq+128+64], m0 + mova [dstq+128+80], m1 + mova 
[dstq+128+96], m2 + mova [dstq+128+112], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop256 + RET +%endif + +.w128: + mov r4d, dword hm +.loop128: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + movu m0, [srcq+64] + movu m1, [srcq+80] + movu m2, [srcq+96] + movu m3, [srcq+112] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq+64] + pavg m1, [dstq+80] + pavg m2, [dstq+96] + pavg m3, [dstq+112] +%endif + mova [dstq+64], m0 + mova [dstq+80], m1 + mova [dstq+96], m2 + mova [dstq+112], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop128 + RET + +.w64: + mov r4d, dword hm +.loop64: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop64 + RET + +.w32: + mov r4d, dword hm +.loop32: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+src_strideq] + movu m3, [srcq+src_strideq+16] + lea srcq, [srcq+src_strideq*2] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq +16] + pavg m2, [dstq+dst_strideq] + pavg m3, [dstq+dst_strideq+16] +%endif + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+dst_strideq ], m2 + mova [dstq+dst_strideq+16], m3 + lea dstq, [dstq+dst_strideq*2] + sub r4d, 2 + jnz .loop32 + RET + +.w16: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop16: + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+dst_strideq] + pavg m2, [dstq+dst_strideq*2] + pavg m3, [dstq+r6q] +%endif + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop16 + RET + +.w8: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop8: + movh m0, [srcq] + movh m1, [srcq+src_strideq] + movh m2, [srcq+src_strideq*2] + movh m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + movh m4, [dstq] + movh m5, [dstq+dst_strideq] + movh m6, [dstq+dst_strideq*2] + movh m7, [dstq+r6q] + pavg m0, m4 + pavg m1, m5 + pavg m2, m6 + pavg m3, m7 +%endif + movh [dstq ], m0 + movh [dstq+dst_strideq ], m1 + movh [dstq+dst_strideq*2], m2 + movh [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop8 + RET + +%ifnidn %2, highbd +.w4: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop4: + movd m0, [srcq] + movd m1, [srcq+src_strideq] + movd m2, [srcq+src_strideq*2] + movd m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + movd m4, [dstq] + movd m5, [dstq+dst_strideq] + movd m6, [dstq+dst_strideq*2] + movd m7, [dstq+r6q] + pavg m0, m4 + pavg m1, m5 + pavg m2, m6 + pavg m3, m7 +%endif + movd [dstq ], m0 + movd [dstq+dst_strideq ], m1 + movd [dstq+dst_strideq*2], m2 + movd [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop4 + RET +%endif +%endmacro + +INIT_XMM sse2 +convolve_fn copy +convolve_fn avg +convolve_fn copy, highbd diff --git a/libs/libaom/src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm 
b/libs/libaom/src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm new file mode 100644 index 000000000..b6f040791 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm @@ -0,0 +1,613 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. + +%macro HIGH_GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm6 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + punpcklwd xmm1, xmm7 + + movdqa k0k6, xmm0 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + movdqa k1k7, xmm1 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) + +%endm + +%macro HIGH_APPLY_FILTER_4 1 + punpcklwd xmm0, xmm6 ;two row in one register + punpcklwd xmm1, xmm7 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + + pmaddwd xmm0, k0k6 ;multiply the filter factors + pmaddwd xmm1, k1k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm3, k3k4 + + paddd xmm0, xmm1 ;sum + paddd xmm0, xmm2 + paddd xmm0, xmm3 + + paddd xmm0, krd ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movq [rdi], xmm0 +%endm + +%macro HIGH_GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + punpcklwd xmm0, xmm1 + punpckhwd xmm6, xmm7 + punpckhwd xmm2, xmm5 + punpckhwd xmm3, xmm4 + + movdqa k0k1, xmm0 ;store filter factors on stack + movdqa k6k7, xmm6 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 ;rounding + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) +%endm + +%macro LOAD_VERT_8 1 + movdqu 
xmm0, [rsi + %1] ;0 + movdqu xmm1, [rsi + rax + %1] ;1 + movdqu xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] + movdqu xmm7, [rsi + rdx * 2 + %1] ;7 + movdqu xmm2, [rsi + rax + %1] ;2 + movdqu xmm3, [rsi + rax * 2 + %1] ;3 + movdqu xmm4, [rsi + rdx + %1] ;4 + movdqu xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro HIGH_APPLY_FILTER_8 2 + movdqu temp, xmm4 + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm1 + punpckhwd xmm4, xmm1 + movdqa xmm1, xmm6 + punpcklwd xmm6, xmm7 + punpckhwd xmm1, xmm7 + movdqa xmm7, xmm2 + punpcklwd xmm2, xmm5 + punpckhwd xmm7, xmm5 + + movdqu xmm5, temp + movdqu temp, xmm4 + movdqa xmm4, xmm3 + punpcklwd xmm3, xmm5 + punpckhwd xmm4, xmm5 + movdqu xmm5, temp + + pmaddwd xmm0, k0k1 + pmaddwd xmm5, k0k1 + pmaddwd xmm6, k6k7 + pmaddwd xmm1, k6k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm7, k2k5 + pmaddwd xmm3, k3k4 + pmaddwd xmm4, k3k4 + + paddd xmm0, xmm6 + paddd xmm0, xmm2 + paddd xmm0, xmm3 + paddd xmm5, xmm1 + paddd xmm5, xmm7 + paddd xmm5, xmm4 + + paddd xmm0, krd ;rounding + paddd xmm5, krd + psrad xmm0, 7 ;shift + psrad xmm5, 7 + packssdw xmm0, xmm5 ;pack back to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movdqu xmm1, [rdi + %2] + pavgw xmm0, xmm1 +%endif + movdqu [rdi + %2], xmm0 +%endm + +SECTION .text + +;void aom_filter_block1d4_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d4_v8_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movq xmm0, [rsi] ;load src: row 0 + movq xmm1, [rsi + rax] ;1 + movq xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2] ;7 + movq xmm2, [rsi + rax] ;2 + movq xmm3, [rsi + rax * 2] ;3 + movq xmm4, [rsi + rdx] ;4 + movq xmm5, [rsi + rax * 4] ;5 + + HIGH_APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d8_v8_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] 
;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d16_v8_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 16 + HIGH_APPLY_FILTER_8 0, 16 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d4_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d4_h8_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm4, [rsi + 2] + movdqa xmm1, xmm0 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm4 + + psrldq xmm1, 2 + psrldq xmm6, 4 + psrldq xmm7, 6 + psrldq xmm2, 4 + psrldq xmm3, 6 + psrldq xmm5, 2 + + HIGH_APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d8_h8_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 
2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d16_h8_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 10] ;load src + movdqu xmm1, [rsi + 12] + movdqu xmm2, [rsi + 14] + movdqu xmm3, [rsi + 16] + movdqu xmm4, [rsi + 18] + movdqu xmm5, [rsi + 20] + movdqu xmm6, [rsi + 22] + movdqu xmm7, [rsi + 24] + + HIGH_APPLY_FILTER_8 0, 16 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/libs/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/libs/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm new file mode 100644 index 000000000..a7152be57 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm @@ -0,0 +1,367 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro HIGH_GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklwd xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm5, rdx + movq xmm2, rcx + pshufd xmm5, xmm5, 0b + movdqa xmm1, xmm5 + psllw xmm5, xmm2 + psubw xmm5, xmm1 ;max value (for clamping) + pxor xmm2, xmm2 ;min value (for clamping) + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_4 1 + + punpcklwd xmm0, xmm1 ;two row in one register + pmaddwd xmm0, xmm4 ;multiply the filter factors + + paddd xmm0, xmm3 ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, xmm5 + pmaxsw xmm0, xmm2 + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + + movq [rdi], xmm0 + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%macro HIGH_GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm6, [rdx] ;load filters + + pshuflw xmm7, xmm6, 11111111b ;k3 + pshufhw xmm6, xmm6, 0b ;k4 + psrldq xmm6, 8 + punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm3, rdx + movq xmm5, rcx + pshufd xmm3, xmm3, 0b + movdqa xmm1, xmm3 + psllw xmm3, xmm5 + psubw xmm3, xmm1 ;max value (for clamping) + pxor xmm5, xmm5 ;min value (for clamping) + + movdqa max, xmm3 + movdqa min, xmm5 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_8 1 + movdqa xmm6, xmm0 + punpckhwd xmm6, xmm1 + punpcklwd xmm0, xmm1 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + + paddd xmm6, xmm4 ;rounding + paddd xmm0, xmm4 ;rounding + psrad xmm6, 7 ;shift + psrad xmm0, 7 ;shift + packssdw xmm0, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movdqu xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%macro HIGH_APPLY_FILTER_16 1 + movdqa xmm5, xmm0 + movdqa xmm6, xmm2 + punpckhwd xmm5, xmm1 + punpckhwd xmm6, xmm3 + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + + pmaddwd xmm5, xmm7 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + pmaddwd xmm2, xmm7 + + paddd xmm5, xmm4 ;rounding + paddd xmm6, xmm4 + paddd xmm0, xmm4 + paddd xmm2, xmm4 + + psrad xmm5, 7 ;shift + psrad xmm6, 7 + psrad xmm0, 7 + psrad xmm2, 7 + + packssdw xmm0, xmm5 ;pack back to word + packssdw xmm2, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + pminsw xmm2, max + pmaxsw xmm2, min + +%if %1 + movdqu xmm1, [rdi] + movdqu xmm3, [rdi + 16] + pavgw xmm0, xmm1 + pavgw xmm2, xmm3 +%endif + movdqu [rdi], xmm0 ;store the result + movdqu [rdi + 16], xmm2 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +SECTION .text + +global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end 
prolog + + HIGH_GET_PARAM_4 +.loop: + movq xmm0, [rsi] ;load src + movq xmm1, [rsi + 2*rax] + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + add rsp, 16 * 2 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_v2_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm2, [rsi + 16] + movdqu xmm1, [rsi + 2*rax] ;1 + movdqu xmm3, [rsi + 2*rax + 16] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + add rsp, 16 * 2 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 2 + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + add rsp, 16 * 2 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_h2_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 18] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + add rsp, 16 * 2 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c new file mode 100644 index 000000000..94b5da171 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -0,0 +1,1441 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_ports/mem.h" + +#if defined(__clang__) +#if (__clang_major__ > 0 && __clang_major__ < 3) || \ + (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && defined(__apple_build_version__) && \ + ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ + (__clang_major__ == 5 && __clang_minor__ == 0))) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#else // clang > 3.3, and not 5.0 on macosx. +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // clang <= 3.3 +#elif defined(__GNUC__) +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +#else // gcc > 4.7 +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // gcc <= 4.6 +#else // !(gcc || clang) +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a)); + *((uint32_t *)(output_ptr + stride)) = + _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1)); +} + +static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) { + __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo))); + a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1); + return a; +} + +static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); + _mm_storel_epi64((__m128i *)(output_ptr + stride), + _mm256_extractf128_si256(*a, 1)); +} + +static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) { + __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo))); + a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1); + return a; +} + +static INLINE void xx_store2_mi128(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); + _mm_store_si128((__m128i *)(output_ptr + stride), + _mm256_extractf128_si256(*a, 1)); +} + +static void aom_filter_block1d4_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
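+ // (The taps were halved by the srai above so they fit in signed bytes for + // _mm256_maddubs_epi16; the rounding used below is therefore (+32) >> 6 + // rather than the usual (+64) >> 7 for 7-bit filters.)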
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + firstFilters = + _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); + filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 4 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } +} + +static void aom_filter_block1d4_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg; + __m256i firstFilters, secondFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2; + __m256i srcReg32b1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
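+ // The eight taps are applied in two groups of four: filt1Reg/firstFilters + // covers taps 0-3 and filt2Reg/secondFilters covers taps 4-7, with + // _mm256_hadds_epi16 folding the partial sums together.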
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 32 bits + firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); + // duplicate only the second 32 bits + secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + // filter the source buffer + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. 
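+ // (the loop above wrote two rows per iteration, so one row may remain)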
+ // process only 4 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } +} + +static void aom_filter_block1d8_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt2Reg, filt3Reg; + __m256i secondFilters, thirdFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
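+ // Only the middle four taps (2..5, via secondFilters/thirdFilters) are + // applied in this kernel, which assumes the outer taps of the 8-tap + // filter are zero.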
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + + // multiply the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. 
+ // process only 8 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); + } +} + +static void aom_filter_block1d8_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
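+ // After the pack below, each 16-bit lane of firstFilters..forthFilters + // holds one adjacent pair of byte taps (patterns 0x100, 0x302, 0x504, + // 0x706), matching the operand layout _mm256_maddubs_epi16 expects.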
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. 
+ // process only 8 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); + } +} + +static void aom_filter_block1d16_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt2Reg, filt3Reg; + __m256i secondFilters, thirdFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
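+ // Each 16-wide row is filtered as two overlapping 8-byte halves; the + // second half is loaded from src_ptr + 8 inside the loop.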
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + + // multiply the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = + xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. 
+ // process only 16 bytes + if (i > 0) { + __m256i srcReg1, srcReg12; + __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1; + + srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr)); + srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94); + + // filter the source buffer + srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg); + srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters); + srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32); + srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1); + srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, + _mm256_castsi256_si128(srcRegFilt1_1)); + } +} + +static void aom_filter_block1d16_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
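+ // Same scheme as the 8-wide 8-tap kernel, applied to both halves of each + // 16-wide row.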
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = + xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); + + // filter the source buffer + srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 
= _mm256_adds_epi16( + srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 16 bytes + if (i > 0) { + __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes + // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + + // filter the source buffer + srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = + _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt2_1 = + _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + srcRegFilt2_1 
= + _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1); + } +} + +static void aom_filter_block1d8_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg45_56_lo; + __m256i resReg23_34_lo, resReg45_56_lo; + __m256i resReglo, resReg; + __m256i secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); + resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); + + // add and saturate the results together + resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReglo); + + src_ptr += 
src_stride; + + xx_storeu2_epi64(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4x = srcReg6x; + } +} + +static void aom_filter_block1d8_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr); + srcReg32b3 = + xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg32b5 = + xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + + // have each consecutive loads on the same 256 register + srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); + srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); + srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = 
_mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); + + // shift by 6 bit each 16 bit + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b11 = srcReg32b2; + srcReg32b2 = srcReg32b4; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the last 2 results together + srcRegFilt4 = + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = + _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); + + // shift by 6 bit each 16 bit + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1); + } +} + +static void aom_filter_block1d16_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi; + __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi; + __m256i resReglo, resReghi, resReg; + __m256i secondFilters, thirdFilters; + unsigned 
int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); + resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); + + // add and saturate the results together + resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters); + resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters); + + // add and saturate the results together + resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReghi = _mm256_adds_epi16(resReghi, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + resReghi = _mm256_srai_epi16(resReghi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReghi); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg23_34_hi = srcReg45_56_hi; + srcReg4x = srcReg6x; + } +} + +static void aom_filter_block1d16_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t 
out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr); + srcReg32b3 = + xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg32b5 = + xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + + // have each consecutive loads on the same 256 register + srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); + srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); + srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); + + // save + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and 
add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); + + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); + + // add and saturate the results together + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); + + // shift by 6 bit each 16 bit + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32); + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b1 = srcReg32b3; + srcReg32b11 = srcReg32b2; + srcReg32b3 = srcReg32b5; + srcReg32b2 = srcReg32b4; + srcReg32b5 = srcReg32b7; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; + __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the last 2 results together + srcRegFilt4 = + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + srcRegFilt7 = + _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = + _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); + srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt7 = + _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + 
_mm256_castsi256_si128(thirdFilters)); + srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7)); + + // shift by 6 bit each 16 bit + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); + srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, srcRegFilt1); + } +} + +static void aom_filter_block1d4_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg45_56_lo; + __m256i srcReg2345_3456_lo; + __m256i resReglo, resReg; + __m256i firstFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + firstFilters = + _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + + srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters); + + resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 
6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReglo); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4x = srcReg6x; + } +} + +#if HAVE_AVX2 && HAVE_SSSE3 +filter8_1dfunction aom_filter_block1d4_v8_ssse3; +filter8_1dfunction aom_filter_block1d16_v2_ssse3; +filter8_1dfunction aom_filter_block1d16_h2_ssse3; +filter8_1dfunction aom_filter_block1d8_v2_ssse3; +filter8_1dfunction aom_filter_block1d8_h2_ssse3; +filter8_1dfunction aom_filter_block1d4_v2_ssse3; +filter8_1dfunction aom_filter_block1d4_h2_ssse3; +#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3 +#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3 +#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3 +#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3 +#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3 +#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3 +#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3 +// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); + +#endif // HAVE_AVX2 && HAVE_SSSE3 diff --git a/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c new file mode 100644 index 000000000..cff7f43ee --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c @@ -0,0 +1,569 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/x86/convolve.h" +#include "aom_ports/mem.h" + +void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1, + srcRegFilt32b2_2; + __m128i srcReg32b1, srcReg32b2; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3); + __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5); + __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128()); + __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_1_2, secondFilters); + d2 = _mm_madd_epi16(ss_2_2, thirdFilters); + srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); + + __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); + + // reading the next 16 bytes + // (part of it was read by the earlier load) + srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + + ss_2 = _mm_srli_si128(srcReg32b2, 2); + ss_4 = _mm_srli_si128(srcReg32b2, 4); + ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_1_1, secondFilters); + d2 = _mm_madd_epi16(ss_2_1, thirdFilters); + srcRegFilt32b2_1 = _mm_add_epi32(d1, d2); + + ss_1 = _mm_srli_si128(srcReg32b2, 3); + ss_3 = _mm_srli_si128(srcReg32b2, 5); + ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128()); + ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_1_2, secondFilters); + d2 = _mm_madd_epi16(ss_2_2, thirdFilters); + srcRegFilt32b2_2 = _mm_add_epi32(d1, d2); + + res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2); + res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2); + srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + 
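// (_mm_packus_epi16 also saturates: each signed 16 bit value is clamped + // to the unsigned byte range [0, 255] as it is packed down.) +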
srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_pixels_per_line; + + _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; + __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; + __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + __m128i tmp_0, tmp_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); + srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3); + __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128()); + __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); + __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128()); + __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128()); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); + srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4); + __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); + __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); + __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128()); + __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128()); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + + srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); + srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + + srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); + srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); + resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); + + tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); + resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); + __m128i resReg45_lo_2 = 
_mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); + resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); + __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); + resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); + + // multiply 2 adjacent elements with the filter and add the result + + tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters); + resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1); + + tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters); + resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128()); + __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters); + resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128()); + __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters); + resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi); + resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi); + + // shift by 6 bit each 16 bit + resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); + resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); + resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32); + resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32); + resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); + resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); + resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6); + resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi); + resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi); + + src_ptr += src_stride; + + _mm_store_si128((__m128i *)output_ptr, (resReg23_45)); + _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + resReg23_lo_1 = resReg45_lo_1; + resReg23_lo_2 = resReg45_lo_2; + resReg23_hi_1 = resReg45_hi_1; + resReg23_hi_2 = resReg45_hi_2; + resReg34_lo_1 = resReg56_lo_1; + resReg34_lo_2 = resReg56_lo_2; + resReg34_hi_1 = resReg56_hi_1; + resReg34_hi_2 = resReg56_hi_2; + srcReg4 = srcReg6; + } +} + +void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b1_2; + __m128i 
srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); + ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + __m128i d1 = _mm_madd_epi16(ss_2, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); + __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); + ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_3, secondFilters); + d2 = _mm_madd_epi16(ss_5, thirdFilters); + srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); + + __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg34_lo; + __m128i srcReg45_lo, srcReg56_lo; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_45_lo, resReg34_56_lo; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + __m128i tmp_0, tmp_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); + __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, 
_mm_setzero_si128()); + __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); + __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); + __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); + resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); + + tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); + resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); + __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); + resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); + __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); + resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); + + // shift by 6 bit each 16 bit + resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); + resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); + resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); + resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128()); + resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128()); + + src_ptr += src_stride; + + _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45)); + _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + resReg23_lo_1 = resReg45_lo_1; + resReg23_lo_2 = resReg45_lo_2; + resReg34_lo_1 = resReg56_lo_1; + resReg34_lo_2 = resReg56_lo_2; + srcReg4 = srcReg6; + } +} + +void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1; + __m128i srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + 
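// Layout sketch for the unpacks above, with the 16 bit taps k0..k7 in + // filtersReg: tmp_0 holds k0 k1 k0 k1 k2 k3 k2 k3 and tmp_1 holds + // k4 k5 k4 k5 k6 k7 k6 k7, so the epi64 unpacks below broadcast the + // k2/k3 and k4/k5 tap pairs to every pair of 16 bit lanes. +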
secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); + __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); + + ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); + + __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3); + __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5); + + __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23, srcReg34, srcReg45, srcReg56; + __m128i resReg23_34, resReg45_56; + __m128i resReg23_34_45_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + __m128i tmp_0, tmp_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3); + __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128()); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4); + __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128()); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5); + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + tmp_0 = _mm_madd_epi16(resReg23, secondFilters); + tmp_1 = 
_mm_madd_epi16(resReg34, secondFilters); + resReg23_34 = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128()); + __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128()); + + tmp_0 = _mm_madd_epi16(resReg45, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56, thirdFilters); + resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56); + + // shift by 6 bit each 16 bit + resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32); + resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_34_45_56 = + _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128()); + + src_ptr += src_stride; + + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReg23_34_45_56); + *((uint32_t *)(output_ptr + out_pitch)) = + _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + resReg23 = resReg45; + resReg34 = resReg56; + srcReg4 = srcReg6; + } +} diff --git a/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c new file mode 100644 index 000000000..f64b821ea --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c @@ -0,0 +1,770 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/emmintrin_compat.h" + +// filters only for the 4_h8 convolution +DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { 0, 1, 1, 2, 2, 3, + 3, 4, 2, 3, 3, 4, + 4, 5, 5, 6 }; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { 4, 5, 5, 6, 6, 7, + 7, 8, 6, 7, 7, 8, + 8, 9, 9, 10 }; + +// filters for 8_h8 and 16_h8 +DECLARE_ALIGNED(16, static const uint8_t, + filt1_global[16]) = { 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, 6, 6, 7, 7, 8 }; + +DECLARE_ALIGNED(16, static const uint8_t, + filt2_global[16]) = { 2, 3, 3, 4, 4, 5, 5, 6, + 6, 7, 7, 8, 8, 9, 9, 10 }; + +DECLARE_ALIGNED(16, static const uint8_t, + filt3_global[16]) = { 4, 5, 5, 6, 6, 7, 7, 8, + 8, 9, 9, 10, 10, 11, 11, 12 }; + +DECLARE_ALIGNED(16, static const uint8_t, + filt4_global[16]) = { 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 12, 13, 13, 14 }; + +DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, + 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filtd4[]) = { + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, +}; + +// These are reused by the avx2 intrinsics. +filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3; + +static void aom_filter_block1d4_h4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
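+ // Note: the srai by 1 above halves the 7 bit filter taps so that, once + // packed to int8 below, the _mm_maddubs_epi16 products cannot overflow + // int16; rounding then uses (sum + 32) >> 6, i.e. round(sum / 64), in + // place of the full precision (sum + 64) >> 7. Rough scalar sketch: + // out[x] = clip8((k2*p[x-1] + k3*p[x] + k4*p[x+1] + k5*p[x+2] + 32) >> 6) + // where clip8 saturates to [0, 255].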
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); + filt1Reg = _mm_load_si128((__m128i const *)(filtd4)); + + for (i = output_height; i > 0; i -= 1) { + // load the 2 strides of source + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); + output_ptr += output_pitch; + } +} + +static void aom_filter_block1d4_v4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45, + srcReg6, srcReg56; + __m128i srcReg23_34_lo, srcReg45_56_lo; + __m128i srcReg2345_3456_lo, srcReg2345_3456_hi; + __m128i resReglo, resReghi; + __m128i firstFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. 
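+ // This _v4 kernel applies only the middle taps k2..k5 of the 8 tap + // filter (the outer taps are assumed zero here); the byte shuffle below + // broadcasts those four packed taps to every 32 bit lane so that one + // maddubs/hadds pair produces a full 4 tap sum per pixel.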
+ filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + + // have consecutive loads in the same 128 bit register + srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4); + + srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5); + + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6); + + // merge every two consecutive registers + srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56); + + srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); + srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters); + resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters); + + resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128()); + resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128()); + + // shift by 6 bit each 16 bit + resReglo = _mm_adds_epi16(resReglo, addFilterReg32); + resReghi = _mm_adds_epi16(resReghi, addFilterReg32); + resReglo = _mm_srai_epi16(resReglo, 6); + resReghi = _mm_srai_epi16(resReghi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReglo = _mm_packus_epi16(resReglo, resReglo); + resReghi = _mm_packus_epi16(resReghi, resReghi); + + src_ptr += src_stride; + + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReglo); + *((uint32_t *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4 = srcReg6; + } +} + +void aom_filter_block1d4_h8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i firstFilters, secondFilters, shuffle1, shuffle2; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, srcReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
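+ // Unlike the 4 tap kernels above, this 8 tap path keeps the full 7 bit + // taps (there is no srai by 1) and rounds with (sum + 64) >> 7; + // addFilterReg64 holds 64 in each 16 bit lane. The min/max pairing in + // the loop below looks like the usual trick for accumulating four + // partial sums with saturating adds in an overflow safe order.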
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter into the first lane
+  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+  // duplicate only the third 16 bits in the filter into the first lane
+  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+  // duplicate only the second 16 bits in the filter into the second lane
+  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
+  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+  // duplicate only the fourth 16 bits in the filter into the second lane
+  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
+  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+  // load the local shuffle masks
+  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
+  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    // filter the source buffer
+    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // extract the higher half of the lane
+    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
+    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
+
+    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+
+    // add and saturate all the results together; the two middle partial sums
+    // are added smaller-first (min, then max) so intermediate saturation
+    // cannot distort the final sum
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift each 16-bit value right by 7 bits
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink each 16-bit value to 8 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+    src_ptr += src_pixels_per_line;
+
+    // save only 4 bytes
+    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
+
+    output_ptr += output_pitch;
+  }
+}
+
+static void aom_filter_block1d8_h4_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i addFilterReg32, filt2Reg, filt3Reg;
+  __m128i secondFilters, thirdFilters;
+  __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+  __m128i srcReg32b1;
+  unsigned int i;
+  src_ptr -= 3;
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16-bit (short) values to 8-bit (byte) values, duplicating
+  // the same data into both lanes of the 128-bit register
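+  // (Editor's note on the idiom used below: _mm_shuffle_epi8 with
+  // _mm_set1_epi16(0x0302) broadcasts filter bytes 2 and 3 -- the (k2, k3)
+  // pair -- into every 16-bit lane, which is exactly the interleaved
+  // multiplier layout that _mm_maddubs_epi16 expects.)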
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across the 128-bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across the 128-bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
+  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));
+
+  for (i = output_height; i > 0; i -= 1) {
+    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+    // filter the source buffer
+    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
+    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+    // round and shift each 16-bit value right by 6 bits
+    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink each 16-bit value to 8 bits
+    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+    src_ptr += src_pixels_per_line;
+
+    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);
+
+    output_ptr += output_pitch;
+  }
+}
+
+static void aom_filter_block1d8_v4_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+  __m128i srcReg23, srcReg34, srcReg45, srcReg56;
+  __m128i resReg23, resReg34, resReg45, resReg56;
+  __m128i resReg23_45, resReg34_56;
+  __m128i addFilterReg32, secondFilters, thirdFilters;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16-bit (short) values to 8-bit (byte) values, duplicating
+  // the same data into both lanes of the 128-bit register
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across the 128-bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across the 128-bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+  // multiply the source and destination strides by two
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);
+
+  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+
+  // keep consecutive loads in the same 128-bit register
+  srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+
+  for (i = output_height; i > 1; i -= 2) {
+    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+
+    srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);
+
+    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+
+    srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    resReg23 = _mm_maddubs_epi16(srcReg23, secondFilters);
+    resReg34 = _mm_maddubs_epi16(srcReg34, secondFilters);
+    resReg45 = _mm_maddubs_epi16(srcReg45, thirdFilters);
+    resReg56 = _mm_maddubs_epi16(srcReg56, thirdFilters);
+
+    // add and saturate the results together
+    resReg23_45 = _mm_adds_epi16(resReg23, resReg45);
+    resReg34_56 = _mm_adds_epi16(resReg34, resReg56);
+
+    // round and shift each 16-bit value right by 6 bits
+    resReg23_45 = _mm_adds_epi16(resReg23_45, addFilterReg32);
+    resReg34_56 = _mm_adds_epi16(resReg34_56, addFilterReg32);
+    resReg23_45 = _mm_srai_epi16(resReg23_45, 6);
+    resReg34_56 = _mm_srai_epi16(resReg34_56, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve
+    // result
+    resReg23_45 = _mm_packus_epi16(resReg23_45, _mm_setzero_si128());
+    resReg34_56 = _mm_packus_epi16(resReg34_56, _mm_setzero_si128());
+
+    src_ptr += src_stride;
+
+    _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
+    _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+    output_ptr += dst_stride;
+
+    // save part of the registers for the next strides
+    srcReg23 = srcReg45;
+    srcReg34 = srcReg56;
+    srcReg4 = srcReg6;
+  }
+}
+
+void aom_filter_block1d8_h8_intrin_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
+  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+  __m128i addFilterReg64, filtersReg, minReg;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16-bit (short) values to 8-bit (byte) values, duplicating
+  // the same data into both lanes of the 128-bit register
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across the 128-bit register
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across the 128-bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across the 128-bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+  // across the 128-bit register
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    // filter the source buffer
+    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // filter the source buffer
+    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
+    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
+    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
+
+    // add and saturate all the results together; the two middle partial sums
+    // are added smaller-first (min, then max) so intermediate saturation
+    // cannot distort the final sum
+    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift each 16-bit value right by 7 bits
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink each 16-bit value to 8 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr += src_pixels_per_line;
+
+    // save only 8 bytes
+    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
+
+    output_ptr += output_pitch;
+  }
+}
+
+void aom_filter_block1d8_v8_intrin_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, minReg;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
+  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+  __m128i srcReg8;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16-bit (short) values to 8-bit (byte) values, duplicating
+  // the same data into both lanes of the 128-bit register
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits in the filter
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits in the filter
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits in the filter
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  // load the first 7 rows of 8 bytes
+  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+
+  for (i = 0; i < output_height; i++) {
+    // load the last 8 bytes
+    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+
+    // merge the results together
+    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+
+    // merge the results together
+    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
+
+    // add and saturate the results together
+    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
+    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift each 16-bit value right by 7 bits
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink each 16-bit value to 8 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr += src_pitch;
+
+    // shift down a row
+    srcReg1 = srcReg2;
+    srcReg2 = srcReg3;
+    srcReg3 = srcReg4;
+    srcReg4 = srcReg5;
+    srcReg5 = srcReg6;
+    srcReg6 = srcReg7;
+    srcReg7 = srcReg8;
+
+    // save only the 8 bytes of the convolve result
+    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
+
+    output_ptr += out_pitch;
+  }
+}
+
+static void aom_filter_block1d16_h4_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i addFilterReg32, filt2Reg, filt3Reg;
+  __m128i secondFilters, thirdFilters;
+  __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+  __m128i srcReg32b1, srcReg32b2;
+  unsigned int i;
+  src_ptr -= 3;
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16-bit (short) values to 8-bit (byte) values, duplicating
+  // the same data into both lanes of the 128-bit register
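+  // (Editor's note: this 16-wide horizontal kernel processes each row as two
+  // 8-pixel halves, reloading at src_ptr + 8 so the byte shuffles see the
+  // extra context bytes each half needs.)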
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across the 128-bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across the 128-bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
+  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));
+
+  for (i = output_height; i > 0; i -= 1) {
+    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+    // filter the source buffer
+    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
+    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+    // load the next 16 bytes (part of which was read by the earlier load)
+    srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+
+    // filter the source buffer
+    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg);
+    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+    // round and shift each 16-bit value right by 6 bits
+    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+    srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+    src_ptr += src_pixels_per_line;
+
+    _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);
+
+    output_ptr += output_pitch;
+  }
+}
+
+static void aom_filter_block1d16_v4_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
+  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
+  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+  __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
+  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
+  __m128i resReg23_45, resReg34_56;
+  __m128i addFilterReg32, secondFilters, thirdFilters;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16-bit (short) values to 8-bit (byte) values, duplicating
+  // the same data into both lanes of the 128-bit register
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across the 128-bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across the 128-bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+  // multiply the source and destination strides by two
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
+  srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);
+
+  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+
+  // keep consecutive loads in the same 128-bit register
+  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
+  srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
+
+  for (i = output_height; i > 1; i -= 2) {
+    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+
+    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
+    srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);
+
+    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+
+    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
+    srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters);
+    resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters);
+    resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters);
+    resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters);
+
+    // add and saturate the results together
+    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
+    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters);
+    resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters);
+    resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters);
+    resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters);
+
+    // add and saturate the results together
+    resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
+    resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);
+
+    // round and shift each 16-bit value right by 6 bits
+    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
+    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
+    resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
+    resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
+    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
+    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
+    resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
+    resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve
+    // result
+    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
+    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);
+
+    src_ptr += src_stride;
+
+    _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
+    _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+    output_ptr += dst_stride;
+
+    // save part of the registers for the next strides
+    srcReg23_lo = srcReg45_lo;
+    srcReg34_lo = srcReg56_lo;
+    srcReg23_hi = srcReg45_hi;
+    srcReg34_hi = srcReg56_hi;
+    srcReg4 = srcReg6;
+  }
+}
+
+filter8_1dfunction aom_filter_block1d16_v8_ssse3;
+filter8_1dfunction aom_filter_block1d16_h8_ssse3;
+filter8_1dfunction aom_filter_block1d8_v8_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_ssse3;
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_ssse3;
+
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;
+filter8_1dfunction aom_filter_block1d16_h2_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_ssse3;
+filter8_1dfunction aom_filter_block1d8_h2_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_ssse3;
+filter8_1dfunction aom_filter_block1d4_h2_ssse3;
+
+// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h);
+// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
diff --git a/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_sse2.asm
new file mode 100644
index 000000000..c88fc9ffb
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_sse2.asm
@@ -0,0 +1,615 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
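+; (Editor's sketch of the hazard the note above describes, using hypothetical
+; taps: with 8-bit samples of 255 under centre taps of, say, 35 and 114,
+; summing the two centre products first gives 255*35 + 255*114 = 37995, which
+; the signed-saturating add clamps to 32767; if the negative outer taps would
+; have brought the true sum back into range, that early clamp leaves a wrong
+; result. Folding each centre product into a partial sum that already carries
+; the negative taps keeps the intermediates in range.)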
+
+%macro GET_FILTERS_4 0
+    mov rdx, arg(5)  ;filter ptr
+    mov rcx, 0x0400040
+
+    movdqa xmm7, [rdx]  ;load filters
+    pshuflw xmm0, xmm7, 0b         ;k0
+    pshuflw xmm1, xmm7, 01010101b  ;k1
+    pshuflw xmm2, xmm7, 10101010b  ;k2
+    pshuflw xmm3, xmm7, 11111111b  ;k3
+    psrldq xmm7, 8
+    pshuflw xmm4, xmm7, 0b         ;k4
+    pshuflw xmm5, xmm7, 01010101b  ;k5
+    pshuflw xmm6, xmm7, 10101010b  ;k6
+    pshuflw xmm7, xmm7, 11111111b  ;k7
+
+    punpcklqdq xmm0, xmm1
+    punpcklqdq xmm2, xmm3
+    punpcklqdq xmm5, xmm4
+    punpcklqdq xmm6, xmm7
+
+    movdqa k0k1, xmm0
+    movdqa k2k3, xmm2
+    movdqa k5k4, xmm5
+    movdqa k6k7, xmm6
+
+    movq xmm6, rcx
+    pshufd xmm6, xmm6, 0
+    movdqa krd, xmm6
+
+    pxor xmm7, xmm7
+    movdqa zero, xmm7
+%endm
+
+%macro APPLY_FILTER_4 1
+    punpckldq xmm0, xmm1  ;two rows in one register
+    punpckldq xmm6, xmm7
+    punpckldq xmm2, xmm3
+    punpckldq xmm5, xmm4
+
+    punpcklbw xmm0, zero  ;unpack to word
+    punpcklbw xmm6, zero
+    punpcklbw xmm2, zero
+    punpcklbw xmm5, zero
+
+    pmullw xmm0, k0k1  ;multiply the filter factors
+    pmullw xmm6, k6k7
+    pmullw xmm2, k2k3
+    pmullw xmm5, k5k4
+
+    paddsw xmm0, xmm6  ;sum
+    movdqa xmm1, xmm0
+    psrldq xmm1, 8
+    paddsw xmm0, xmm1
+    paddsw xmm0, xmm2
+    psrldq xmm2, 8
+    paddsw xmm0, xmm5
+    psrldq xmm5, 8
+    paddsw xmm0, xmm2
+    paddsw xmm0, xmm5
+
+    paddsw xmm0, krd     ;rounding
+    psraw xmm0, 7        ;shift
+    packuswb xmm0, xmm0  ;pack to byte
+
+%if %1
+    movd xmm1, [rdi]
+    pavgb xmm0, xmm1
+%endif
+    movd [rdi], xmm0
+%endm
+
+%macro GET_FILTERS 0
+    mov rdx, arg(5)  ;filter ptr
+    mov rsi, arg(0)  ;src_ptr
+    mov rdi, arg(2)  ;output_ptr
+    mov rcx, 0x0400040
+
+    movdqa xmm7, [rdx]  ;load filters
+    pshuflw xmm0, xmm7, 0b         ;k0
+    pshuflw xmm1, xmm7, 01010101b  ;k1
+    pshuflw xmm2, xmm7, 10101010b  ;k2
+    pshuflw xmm3, xmm7, 11111111b  ;k3
+    pshufhw xmm4, xmm7, 0b         ;k4
+    pshufhw xmm5, xmm7, 01010101b  ;k5
+    pshufhw xmm6, xmm7, 10101010b  ;k6
+    pshufhw xmm7, xmm7, 11111111b  ;k7
+
+    punpcklwd xmm0, xmm0
+    punpcklwd xmm1, xmm1
+    punpcklwd xmm2, xmm2
+    punpcklwd xmm3, xmm3
+    punpckhwd xmm4, xmm4
+    punpckhwd xmm5, xmm5
+    punpckhwd xmm6, xmm6
+    punpckhwd xmm7, xmm7
+
+    movdqa k0, xmm0  ;store filter factors on stack
+    movdqa k1, xmm1
+    movdqa k2, xmm2
+    movdqa k3, xmm3
+    movdqa k4, xmm4
+    movdqa k5, xmm5
+    movdqa k6, xmm6
+    movdqa k7, xmm7
+
+    movq xmm6, rcx
+    pshufd xmm6, xmm6, 0
+    movdqa krd, xmm6  ;rounding
+
+    pxor xmm7, xmm7
+    movdqa zero, xmm7
+%endm
+
+%macro LOAD_VERT_8 1
+    movq xmm0, [rsi + %1]            ;0
+    movq xmm1, [rsi + rax + %1]      ;1
+    movq xmm6, [rsi + rdx * 2 + %1]  ;6
+    lea rsi, [rsi + rax]
+    movq xmm7, [rsi + rdx * 2 + %1]  ;7
+    movq xmm2, [rsi + rax + %1]      ;2
+    movq xmm3, [rsi + rax * 2 + %1]  ;3
+    movq xmm4, [rsi + rdx + %1]      ;4
+    movq xmm5, [rsi + rax * 4 + %1]  ;5
+%endm
+
+%macro APPLY_FILTER_8 2
+    punpcklbw xmm0, zero
+    punpcklbw xmm1, zero
+    punpcklbw xmm6, zero
+    punpcklbw xmm7, zero
+    punpcklbw xmm2, zero
+    punpcklbw xmm5, zero
+    punpcklbw xmm3, zero
+    punpcklbw xmm4, zero
+
+    pmullw xmm0, k0
+    pmullw xmm1, k1
+    pmullw xmm6, k6
+    pmullw xmm7, k7
+    pmullw xmm2, k2
+    pmullw xmm5, k5
+    pmullw xmm3, k3
+    pmullw xmm4, k4
+
+    paddsw xmm0, xmm1
+    paddsw xmm0, xmm6
+    paddsw xmm0, xmm7
+    paddsw xmm0, xmm2
+    paddsw xmm0, xmm5
+    paddsw xmm0, xmm3
+    paddsw xmm0, xmm4
+
+    paddsw xmm0, krd     ;rounding
+    psraw xmm0, 7        ;shift
+    packuswb xmm0, xmm0  ;pack back to byte
+%if %1
+    movq xmm1, [rdi + %2]
+    pavgb xmm0, xmm1
+%endif
+    movq [rdi + %2], xmm0
+%endm
+
+SECTION .text
+
+;void aom_filter_block1d4_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int out_pitch,
+;    unsigned int output_height,
+;    short *filter
+;)
+global sym(aom_filter_block1d4_v8_sse2) PRIVATE
+sym(aom_filter_block1d4_v8_sse2):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rsi
+    push rdi
+    push rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub rsp, 16 * 6
+    %define k0k1 [rsp + 16 * 0]
+    %define k2k3 [rsp + 16 * 1]
+    %define k5k4 [rsp + 16 * 2]
+    %define k6k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define zero [rsp + 16 * 5]
+
+    GET_FILTERS_4
+
+    mov rsi, arg(0)  ;src_ptr
+    mov rdi, arg(2)  ;output_ptr
+
+    movsxd rax, DWORD PTR arg(1)  ;pixels_per_line
+    movsxd rbx, DWORD PTR arg(3)  ;out_pitch
+    lea rdx, [rax + rax * 2]
+    movsxd rcx, DWORD PTR arg(4)  ;output_height
+
+.loop:
+    movd xmm0, [rsi]            ;load src: row 0
+    movd xmm1, [rsi + rax]      ;1
+    movd xmm6, [rsi + rdx * 2]  ;6
+    lea rsi, [rsi + rax]
+    movd xmm7, [rsi + rdx * 2]  ;7
+    movd xmm2, [rsi + rax]      ;2
+    movd xmm3, [rsi + rax * 2]  ;3
+    movd xmm4, [rsi + rdx]      ;4
+    movd xmm5, [rsi + rax * 4]  ;5
+
+    APPLY_FILTER_4 0
+
+    lea rdi, [rdi + rbx]
+    dec rcx
+    jnz .loop
+
+    add rsp, 16 * 6
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+;void aom_filter_block1d8_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int out_pitch,
+;    unsigned int output_height,
+;    short *filter
+;)
+global sym(aom_filter_block1d8_v8_sse2) PRIVATE
+sym(aom_filter_block1d8_v8_sse2):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rsi
+    push rdi
+    push rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd rax, DWORD PTR arg(1)  ;pixels_per_line
+    movsxd rbx, DWORD PTR arg(3)  ;out_pitch
+    lea rdx, [rax + rax * 2]
+    movsxd rcx, DWORD PTR arg(4)  ;output_height
+
+.loop:
+    LOAD_VERT_8 0
+    APPLY_FILTER_8 0, 0
+
+    lea rdi, [rdi + rbx]
+    dec rcx
+    jnz .loop
+
+    add rsp, 16 * 10
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+;void aom_filter_block1d16_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int out_pitch,
+;    unsigned int output_height,
+;    short *filter
+;)
+global sym(aom_filter_block1d16_v8_sse2) PRIVATE
+sym(aom_filter_block1d16_v8_sse2):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rsi
+    push rdi
+    push rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd rax, DWORD PTR arg(1)  ;pixels_per_line
+    movsxd rbx, DWORD PTR arg(3)  ;out_pitch
+    lea rdx, [rax + rax * 2]
+    movsxd rcx, DWORD PTR arg(4)  ;output_height
+
+.loop:
+    LOAD_VERT_8 0
+    APPLY_FILTER_8 0, 0
+    sub rsi, rax
+
+    LOAD_VERT_8 8
+    APPLY_FILTER_8 0, 8
+    add rdi, rbx
+
+    dec rcx
+    jnz .loop
+
+    add rsp, 16 * 10
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+;void aom_filter_block1d4_h8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int src_pixels_per_line,
+;    unsigned char *output_ptr,
+;    unsigned int output_pitch,
+;    unsigned int output_height,
+;    short *filter
+;)
+global sym(aom_filter_block1d4_h8_sse2) PRIVATE
+sym(aom_filter_block1d4_h8_sse2):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rsi
+    push rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub rsp, 16 * 6
+    %define k0k1 [rsp + 16 * 0]
+    %define k2k3 [rsp + 16 * 1]
+    %define k5k4 [rsp + 16 * 2]
+    %define k6k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define zero [rsp + 16 * 5]
+
+    GET_FILTERS_4
+
+    mov rsi, arg(0)  ;src_ptr
+    mov rdi, arg(2)  ;output_ptr
+
+    movsxd rax, DWORD PTR arg(1)  ;pixels_per_line
+    movsxd rdx, DWORD PTR arg(3)  ;out_pitch
+    movsxd rcx, DWORD PTR arg(4)  ;output_height
+
+.loop:
+    movdqu xmm0, [rsi - 3]  ;load src
+
+    movdqa xmm1, xmm0
+    movdqa xmm6, xmm0
+    movdqa xmm7, xmm0
+    movdqa xmm2, xmm0
+    movdqa xmm3, xmm0
+    movdqa xmm5, xmm0
+    movdqa xmm4, xmm0
+
+    psrldq xmm1, 1
+    psrldq xmm6, 6
+    psrldq xmm7, 7
+    psrldq xmm2, 2
+    psrldq xmm3, 3
+    psrldq xmm5, 5
+    psrldq xmm4, 4
+
+    APPLY_FILTER_4 0
+
+    lea rsi, [rsi + rax]
+    lea rdi, [rdi + rdx]
+    dec rcx
+    jnz .loop
+
+    add rsp, 16 * 6
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+;void aom_filter_block1d8_h8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int src_pixels_per_line,
+;    unsigned char *output_ptr,
+;    unsigned int output_pitch,
+;    unsigned int output_height,
+;    short *filter
+;)
+global sym(aom_filter_block1d8_h8_sse2) PRIVATE
+sym(aom_filter_block1d8_h8_sse2):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rsi
+    push rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd rax, DWORD PTR arg(1)  ;pixels_per_line
+    movsxd rdx, DWORD PTR arg(3)  ;out_pitch
+    movsxd rcx, DWORD PTR arg(4)  ;output_height
+
+.loop:
+    movdqu xmm0, [rsi - 3]  ;load src
+
+    movdqa xmm1, xmm0
+    movdqa xmm6, xmm0
+    movdqa xmm7, xmm0
+    movdqa xmm2, xmm0
+    movdqa xmm5, xmm0
+    movdqa xmm3, xmm0
+    movdqa xmm4, xmm0
+
+    psrldq xmm1, 1
+    psrldq xmm6, 6
+    psrldq xmm7, 7
+    psrldq xmm2, 2
+    psrldq xmm5, 5
+    psrldq xmm3, 3
+    psrldq xmm4, 4
+
+    APPLY_FILTER_8 0, 0
+
+    lea rsi, [rsi + rax]
+    lea rdi, [rdi + rdx]
+    dec rcx
+    jnz .loop
+
+    add rsp, 16 * 10
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+;void aom_filter_block1d16_h8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int src_pixels_per_line,
+;    unsigned char *output_ptr,
+;    unsigned int output_pitch,
+;    unsigned int output_height,
+;    short *filter
+;)
+global sym(aom_filter_block1d16_h8_sse2) PRIVATE
+sym(aom_filter_block1d16_h8_sse2):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rsi
+    push rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd rax, DWORD PTR arg(1)  ;pixels_per_line
+    movsxd rdx, DWORD PTR arg(3)  ;out_pitch
+    movsxd rcx, DWORD PTR arg(4)  ;output_height
+
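+    ; (Editor's note: the loop below runs the 8-tap filter twice per row --
+    ; bytes 0..7 from [rsi - 3] and bytes 8..15 from [rsi + 5], i.e.
+    ; rsi - 3 + 8 -- writing the two halves at output offsets 0 and 8.)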
+.loop:
+    movdqu xmm0, [rsi - 3]  ;load src
+
+    movdqa xmm1, xmm0
+    movdqa xmm6, xmm0
+    movdqa xmm7, xmm0
+    movdqa xmm2, xmm0
+    movdqa xmm5, xmm0
+    movdqa xmm3, xmm0
+    movdqa xmm4, xmm0
+
+    psrldq xmm1, 1
+    psrldq xmm6, 6
+    psrldq xmm7, 7
+    psrldq xmm2, 2
+    psrldq xmm5, 5
+    psrldq xmm3, 3
+    psrldq xmm4, 4
+
+    APPLY_FILTER_8 0, 0
+
+    movdqu xmm0, [rsi + 5]  ;load src
+
+    movdqa xmm1, xmm0
+    movdqa xmm6, xmm0
+    movdqa xmm7, xmm0
+    movdqa xmm2, xmm0
+    movdqa xmm5, xmm0
+    movdqa xmm3, xmm0
+    movdqa xmm4, xmm0
+
+    psrldq xmm1, 1
+    psrldq xmm6, 6
+    psrldq xmm7, 7
+    psrldq xmm2, 2
+    psrldq xmm5, 5
+    psrldq xmm3, 3
+    psrldq xmm4, 4
+
+    APPLY_FILTER_8 0, 8
+
+    lea rsi, [rsi + rax]
+    lea rdi, [rdi + rdx]
+    dec rcx
+    jnz .loop
+
+    add rsp, 16 * 10
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
diff --git a/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
new file mode 100644
index 000000000..3ca7921b6
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@@ -0,0 +1,870 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_64: times 8 dw 64
+even_byte_mask: times 8 dw 0x00ff
+
+; %define USE_PMULHRSW
+; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
+; when using this instruction.
+;
+; The add order below (based on ffav1) must be followed to prevent
+; out-of-range intermediate sums.
+; x = k0k1 + k4k5
+; y = k2k3 + k6k7
+; z = signed SAT(x + y)
+
+SECTION .text
+%define LOCAL_VARS_SIZE 16*6
+
+%macro SETUP_LOCAL_VARS 0
+    ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64
+
+    ; pmaddubsw has a higher latency on some platforms; this might be eased
+    ; by interleaving the instructions.
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    packsswb m4, m4
+    ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
+    ; some platforms.
+ pshuflw m0, m4, 0b ;k0_k1 + pshuflw m1, m4, 01010101b ;k2_k3 + pshuflw m2, m4, 10101010b ;k4_k5 + pshuflw m3, m4, 11111111b ;k6_k7 + punpcklqdq m0, m0 + punpcklqdq m1, m1 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + mova k0k1, m0 + mova k2k3, m1 + mova k4k5, m2 + mova k6k7, m3 +%if ARCH_X86_64 + %define krd m12 + %define tmp0 [rsp + 16*4] + %define tmp1 [rsp + 16*5] + mova krd, [GLOBAL(pw_64)] +%else + %define krd [rsp + 16*4] +%if CONFIG_PIC=0 + mova m6, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m6, m6 ;all ones + psrlw m6, 15 + psllw m6, 6 ;aka pw_64 +%endif + mova krd, m6 +%endif +%endm + +;------------------------------------------------------------------------------- +%if ARCH_X86_64 + %define LOCAL_VARS_SIZE_H4 0 +%else + %define LOCAL_VARS_SIZE_H4 16*4 +%endif + +%macro SUBPIX_HFILTER4 1 +cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + packsswb m4, m4 +%if ARCH_X86_64 + %define k0k1k4k5 m8 + %define k2k3k6k7 m9 + %define krd m10 + mova krd, [GLOBAL(pw_64)] + pshuflw k0k1k4k5, m4, 0b ;k0_k1 + pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 + pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 + pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 +%else + %define k0k1k4k5 [rsp + 16*0] + %define k2k3k6k7 [rsp + 16*1] + %define krd [rsp + 16*2] + pshuflw m6, m4, 0b ;k0_k1 + pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 + pshuflw m7, m4, 01010101b ;k2_k3 + pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 +%if CONFIG_PIC=0 + mova m1, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m1, m1 ;all ones + psrlw m1, 15 + psllw m1, 6 ;aka pw_64 +%endif + mova k0k1k4k5, m6 + mova k2k3k6k7, m7 + mova krd, m1 +%endif + dec heightd + +.loop: + ;Do two rows at once + movu m4, [srcq - 3] + movu m5, [srcq + sstrideq - 3] + punpckhbw m1, m4, m4 + punpcklbw m4, m4 + punpckhbw m3, m5, m5 + punpcklbw m5, m5 + palignr m0, m1, m4, 1 + pmaddubsw m0, k0k1k4k5 + palignr m1, m4, 5 + pmaddubsw m1, k2k3k6k7 + palignr m2, m3, m5, 1 + pmaddubsw m2, k0k1k4k5 + palignr m3, m5, 5 + pmaddubsw m3, k2k3k6k7 + punpckhqdq m4, m0, m2 + punpcklqdq m0, m2 + punpckhqdq m5, m1, m3 + punpcklqdq m1, m3 + paddsw m0, m4 + paddsw m1, m5 +%ifidn %1, h8_avg + movd m4, [dstq] + movd m5, [dstq + dstrideq] +%endif + paddsw m0, m1 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + movu m5, [srcq + sstrideq] + punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 2 + punpcklbw m4, m3 + paddsw m0, m4 +%endif + packuswb m0, m0 + psrldq m1, m0, 4 + +%ifidn %1, h8_avg + pavgb m0, m4 + pavgb m1, m5 +%endif + movd [dstq], m0 + movd [dstq + dstrideq], m1 + + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m4, [srcq - 3] + punpckhbw m1, m4, m4 + punpcklbw m4, m4 + palignr m0, m1, m4, 1 + palignr m1, m4, 5 + pmaddubsw m0, k0k1k4k5 + pmaddubsw m1, k2k3k6k7 + psrldq m2, m0, 8 + psrldq m3, m1, 8 + paddsw m0, m2 + paddsw m1, m3 + paddsw m0, m1 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + punpcklbw m4, m3 + paddsw m0, m4 +%endif + packuswb m0, m0 +%ifidn %1, h8_avg + movd m4, [dstq] + pavgb m0, m4 +%endif + movd [dstq], m0 +.done: + REP_RET +%endm + 
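+; (Editor's reference sketch, in C, of what each h8 kernel computes per
+; output pixel; a hedged reading of the code above, not part of the build:
+;   int sum = 64;                         // rounding term, krd
+;   for (int k = 0; k < 8; ++k)
+;     sum += src[x + k - 3] * filter[k];  // taps straddle the output pixel
+;   dst[x] = clamp(sum >> 7, 0, 255);
+; the SIMD code reorders the saturating adds so intermediates stay in range
+; while targeting the same result.)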
+;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER8 1 +cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + dec heightd + +.loop: + ;Do two rows at once + movu m0, [srcq - 3] + movu m4, [srcq + sstrideq - 3] + punpckhbw m1, m0, m0 + punpcklbw m0, m0 + palignr m5, m1, m0, 13 + pmaddubsw m5, k6k7 + palignr m2, m1, m0, 5 + palignr m3, m1, m0, 9 + palignr m1, m0, 1 + pmaddubsw m1, k0k1 + punpckhbw m6, m4, m4 + punpcklbw m4, m4 + pmaddubsw m2, k2k3 + pmaddubsw m3, k4k5 + + palignr m7, m6, m4, 13 + palignr m0, m6, m4, 5 + pmaddubsw m7, k6k7 + paddsw m1, m3 + paddsw m2, m5 + paddsw m1, m2 +%ifidn %1, h8_avg + movh m2, [dstq] + movhps m2, [dstq + dstrideq] +%endif + palignr m5, m6, m4, 9 + palignr m6, m4, 1 + pmaddubsw m0, k2k3 + pmaddubsw m6, k0k1 + paddsw m1, krd + pmaddubsw m5, k4k5 + psraw m1, 7 + paddsw m0, m7 + paddsw m6, m5 + paddsw m6, m0 + paddsw m6, krd + psraw m6, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + movu m5, [srcq + sstrideq] + punpcklbw m4, m3 + punpcklbw m5, m3 + paddsw m1, m4 + paddsw m6, m5 +%endif + packuswb m1, m6 +%ifidn %1, h8_avg + pavgb m1, m2 +%endif + movh [dstq], m1 + movhps [dstq + dstrideq], m1 + + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m0, [srcq - 3] + punpckhbw m3, m0, m0 + punpcklbw m0, m0 + palignr m1, m3, m0, 1 + palignr m2, m3, m0, 5 + palignr m4, m3, m0, 13 + palignr m3, m0, 9 + pmaddubsw m1, k0k1 + pmaddubsw m2, k2k3 + pmaddubsw m3, k4k5 + pmaddubsw m4, k6k7 + paddsw m1, m3 + paddsw m4, m2 + paddsw m1, m4 + paddsw m1, krd + psraw m1, 7 +%ifidn %1, h8_add_src + pxor m6, m6 + movu m5, [srcq] + punpcklbw m5, m6 + paddsw m1, m5 +%endif + packuswb m1, m1 +%ifidn %1, h8_avg + movh m0, [dstq] + pavgb m1, m0 +%endif + movh [dstq], m1 +.done: + REP_RET +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER16 1 +cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +.loop: + prefetcht0 [srcq + 2 * sstrideq -3] + + movu m0, [srcq - 3] + movu m4, [srcq - 2] + pmaddubsw m0, k0k1 + pmaddubsw m4, k0k1 + movu m1, [srcq - 1] + movu m5, [srcq + 0] + pmaddubsw m1, k2k3 + pmaddubsw m5, k2k3 + movu m2, [srcq + 1] + movu m6, [srcq + 2] + pmaddubsw m2, k4k5 + pmaddubsw m6, k4k5 + movu m3, [srcq + 3] + movu m7, [srcq + 4] + pmaddubsw m3, k6k7 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m1, m3 + paddsw m0, m1 + paddsw m4, m6 + paddsw m5, m7 + paddsw m4, m5 + paddsw m0, krd + paddsw m4, krd + psraw m0, 7 + psraw m4, 7 +%ifidn %1, h8_add_src +%if ARCH_X86=1 && CONFIG_PIC=1 + pcmpeqb m2, m2 ;all ones + psrlw m2, 8 ;even_byte_mask +%else + mova m2, [GLOBAL(even_byte_mask)] +%endif + movu m5, [srcq] + mova m7, m5 + pand m5, m2 + psrlw m7, 8 + paddsw m0, m5 + paddsw m4, m7 +%endif + packuswb m0, m0 + packuswb m4, m4 + punpcklbw m0, m4 +%ifidn %1, h8_avg + pavgb m0, [dstq] +%endif + lea srcq, [srcq + sstrideq] + mova [dstq], m0 + lea dstq, [dstq + dstrideq] + dec heightd + jnz .loop + REP_RET +%endm + +INIT_XMM ssse3 +SUBPIX_HFILTER16 h8 +SUBPIX_HFILTER8 h8 +SUBPIX_HFILTER4 h8 + +;------------------------------------------------------------------------------- + 
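+; (Editor's note: through x86inc's cglobal name decoration, the three
+; instantiations above should emit aom_filter_block1d16_h8_ssse3,
+; aom_filter_block1d8_h8_ssse3 and aom_filter_block1d4_h8_ssse3 -- the
+; symbols declared as filter8_1dfunction in the C file earlier in this
+; patch.)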
+; TODO(Linfeng): Detect cpu type and choose the code with better performance. +%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1 + +%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + %define NUM_GENERAL_REG_USED 9 +%else + %define NUM_GENERAL_REG_USED 6 +%endif + +%macro SUBPIX_VFILTER 2 +cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +%ifidn %2, 8 + %define movx movh +%else + %define movx movd +%endif + + dec heightd + +%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + +%if ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + mov src1q, srcq + add src1q, sstrideq + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 + +.loop: + ;Do two rows at once + movx m0, [srcq ] ;A + movx m1, [src1q ] ;B + punpcklbw m0, m1 ;A B + movx m2, [srcq + sstrideq * 2 ] ;C + pmaddubsw m0, k0k1 + mova m6, m2 + movx m3, [src1q + sstrideq * 2] ;D + punpcklbw m2, m3 ;C D + pmaddubsw m2, k2k3 + movx m4, [srcq + sstrideq * 4 ] ;E + mova m7, m4 + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m4, k4k5 + punpcklbw m1, m6 ;A B next iter + movx m6, [srcq + sstride6q ] ;G + punpcklbw m5, m6 ;E F next iter + punpcklbw m3, m7 ;C D next iter + pmaddubsw m5, k4k5 + movx m7, [src1q + sstride6q ] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m6, k6k7 + pmaddubsw m3, k2k3 + pmaddubsw m1, k0k1 + paddsw m0, m4 + paddsw m2, m6 + movx m6, [srcq + sstrideq * 8 ] ;H next iter + punpcklbw m7, m6 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + paddsw m1, m5 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [srcq] + punpcklbw m4, m6 + paddsw m0, m4 +%endif + packuswb m0, m0 + + paddsw m3, m7 + paddsw m1, m3 + paddsw m1, krd + psraw m1, 7 +%ifidn %1, v8_add_src + movu m4, [src1q] + punpcklbw m4, m6 + paddsw m1, m4 +%endif + lea srcq, [srcq + sstrideq * 2 ] + lea src1q, [src1q + sstrideq * 2] + packuswb m1, m1 + +%ifidn %1, v8_avg + movx m2, [dstq] + pavgb m0, m2 +%endif + movx [dstq], m0 + add dstq, dst_stride +%ifidn %1, v8_avg + movx m3, [dstq] + pavgb m1, m3 +%endif + movx [dstq], m1 + add dstq, dst_stride + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + movx m6, [srcq + sstride6q ] ;G + punpcklbw m0, m1 ;A B + movx m7, [src1q + sstride6q ] ;H + pmaddubsw m0, k0k1 + movx m2, [srcq + sstrideq * 2 ] ;C + punpcklbw m6, m7 ;G H + movx m3, [src1q + sstrideq * 2] ;D + pmaddubsw m6, k6k7 + movx m4, [srcq + sstrideq * 4 ] ;E + punpcklbw m2, m3 ;C D + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + paddsw m2, m6 + paddsw m0, m4 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [srcq] + punpcklbw m4, m6 + paddsw m0, m4 +%endif + packuswb m0, m0 +%ifidn %1, v8_avg + movx m1, [dstq] + pavgb m0, m1 +%endif + movx [dstq], m0 + +%else + ; ARCH_X86_64 + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + lea srcq, [srcq + sstrideq * 2 ] + movx m2, [srcq] ;C + movx m3, [srcq + sstrideq] ;D + lea srcq, [srcq + sstrideq * 2 ] + movx m4, [srcq] ;E + movx m5, [srcq + sstrideq] ;F + lea srcq, [srcq + sstrideq * 2 ] + movx m6, [srcq] ;G + punpcklbw m0, m1 ;A B + punpcklbw m1, m2 ;A B next iter + punpcklbw m2, m3 ;C D + punpcklbw m3, m4 ;C D 
next iter + punpcklbw m4, m5 ;E F + punpcklbw m5, m6 ;E F next iter + +.loop: + ;Do two rows at once + movx m7, [srcq + sstrideq] ;H + lea srcq, [srcq + sstrideq * 2 ] + movx m14, [srcq] ;H next iter + punpcklbw m6, m7 ;G H + punpcklbw m7, m14 ;G H next iter + pmaddubsw m8, m0, k0k1 + pmaddubsw m9, m1, k0k1 + mova m0, m2 + mova m1, m3 + pmaddubsw m10, m2, k2k3 + pmaddubsw m11, m3, k2k3 + mova m2, m4 + mova m3, m5 + pmaddubsw m4, k4k5 + pmaddubsw m5, k4k5 + paddsw m8, m4 + paddsw m9, m5 + mova m4, m6 + mova m5, m7 + pmaddubsw m6, k6k7 + pmaddubsw m7, k6k7 + paddsw m10, m6 + paddsw m11, m7 + paddsw m8, m10 + paddsw m9, m11 + mova m6, m14 + paddsw m8, krd + paddsw m9, krd + psraw m8, 7 + psraw m9, 7 +%ifidn %2, 4 + packuswb m8, m8 + packuswb m9, m9 +%else + packuswb m8, m9 +%endif + +%ifidn %1, v8_avg + movx m7, [dstq] +%ifidn %2, 4 + movx m10, [dstq + dstrideq] + pavgb m9, m10 +%else + movhpd m7, [dstq + dstrideq] +%endif + pavgb m8, m7 +%endif + movx [dstq], m8 +%ifidn %2, 4 + movx [dstq + dstrideq], m9 +%else + movhpd [dstq + dstrideq], m8 +%endif + + lea dstq, [dstq + dstrideq * 2 ] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movx m7, [srcq + sstrideq] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m0, k0k1 + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + pmaddubsw m6, k6k7 + paddsw m0, m4 + paddsw m2, m6 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + packuswb m0, m0 +%ifidn %1, v8_avg + movx m1, [dstq] + pavgb m0, m1 +%endif + movx [dstq], m0 + +%endif ; ARCH_X86_64 + +.done: + REP_RET + +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_VFILTER16 1 +cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + +%if ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + lea src1q, [srcq + sstrideq] + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 + +.loop: + movh m0, [srcq ] ;A + movh m1, [src1q ] ;B + movh m2, [srcq + sstrideq * 2 ] ;C + movh m3, [src1q + sstrideq * 2] ;D + movh m4, [srcq + sstrideq * 4 ] ;E + movh m5, [src1q + sstrideq * 4] ;F + + punpcklbw m0, m1 ;A B + movh m6, [srcq + sstride6q] ;G + punpcklbw m2, m3 ;C D + movh m7, [src1q + sstride6q] ;H + punpcklbw m4, m5 ;E F + pmaddubsw m0, k0k1 + movh m3, [srcq + 8] ;A + pmaddubsw m2, k2k3 + punpcklbw m6, m7 ;G H + movh m5, [srcq + sstrideq + 8] ;B + pmaddubsw m4, k4k5 + punpcklbw m3, m5 ;A B + movh m7, [srcq + sstrideq * 2 + 8] ;C + pmaddubsw m6, k6k7 + movh m5, [src1q + sstrideq * 2 + 8] ;D + punpcklbw m7, m5 ;C D + paddsw m2, m6 + pmaddubsw m3, k0k1 + movh m1, [srcq + sstrideq * 4 + 8] ;E + paddsw m0, m4 + pmaddubsw m7, k2k3 + movh m6, [src1q + sstrideq * 4 + 8] ;F + punpcklbw m1, m6 ;E F + paddsw m0, m2 + paddsw m0, krd + movh m2, [srcq + sstride6q + 8] ;G + pmaddubsw m1, k4k5 + movh m5, [src1q + sstride6q + 8] ;H + psraw m0, 7 + punpcklbw m2, m5 ;G H + pmaddubsw m2, k6k7 + paddsw m7, m2 + paddsw m3, m1 + paddsw m3, m7 + paddsw m3, krd + psraw m3, 7 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down + mova m5, m4 + punpcklbw m4, m6 + punpckhbw m5, m6 + paddsw m0, m4 + paddsw m3, m5 +%endif + packuswb m0, m3 + + add srcq, sstrideq + add src1q, sstrideq +%ifidn %1, v8_avg + pavgb 
m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dst_stride + dec heightd + jnz .loop + REP_RET + +%else + ; ARCH_X86_64 + dec heightd + + movu m1, [srcq ] ;A + movu m3, [srcq + sstrideq ] ;B + lea srcq, [srcq + sstrideq * 2] + punpcklbw m0, m1, m3 ;A B + punpckhbw m1, m3 ;A B + movu m5, [srcq] ;C + punpcklbw m2, m3, m5 ;A B next iter + punpckhbw m3, m5 ;A B next iter + mova tmp0, m2 ;store to stack + mova tmp1, m3 ;store to stack + movu m7, [srcq + sstrideq] ;D + lea srcq, [srcq + sstrideq * 2] + punpcklbw m4, m5, m7 ;C D + punpckhbw m5, m7 ;C D + movu m9, [srcq] ;E + punpcklbw m6, m7, m9 ;C D next iter + punpckhbw m7, m9 ;C D next iter + movu m11, [srcq + sstrideq] ;F + lea srcq, [srcq + sstrideq * 2] + punpcklbw m8, m9, m11 ;E F + punpckhbw m9, m11 ;E F + movu m2, [srcq] ;G + punpcklbw m10, m11, m2 ;E F next iter + punpckhbw m11, m2 ;E F next iter + +.loop: + ;Do two rows at once + pmaddubsw m13, m0, k0k1 + mova m0, m4 + pmaddubsw m14, m8, k4k5 + pmaddubsw m15, m4, k2k3 + mova m4, m8 + paddsw m13, m14 + movu m3, [srcq + sstrideq] ;H + lea srcq, [srcq + sstrideq * 2] + punpcklbw m14, m2, m3 ;G H + mova m8, m14 + pmaddubsw m14, k6k7 + paddsw m15, m14 + paddsw m13, m15 + paddsw m13, krd + psraw m13, 7 + + pmaddubsw m14, m1, k0k1 + pmaddubsw m1, m9, k4k5 + pmaddubsw m15, m5, k2k3 + paddsw m14, m1 + mova m1, m5 + mova m5, m9 + punpckhbw m2, m3 ;G H + mova m9, m2 + pmaddubsw m2, k6k7 + paddsw m15, m2 + paddsw m14, m15 + paddsw m14, krd + psraw m14, 7 + packuswb m13, m14 +%ifidn %1, v8_avg + pavgb m13, [dstq] +%endif + mova [dstq], m13 + + ; next iter + pmaddubsw m15, tmp0, k0k1 + pmaddubsw m14, m10, k4k5 + pmaddubsw m13, m6, k2k3 + paddsw m15, m14 + mova tmp0, m6 + mova m6, m10 + movu m2, [srcq] ;G next iter + punpcklbw m14, m3, m2 ;G H next iter + mova m10, m14 + pmaddubsw m14, k6k7 + paddsw m13, m14 + paddsw m15, m13 + paddsw m15, krd + psraw m15, 7 + + pmaddubsw m14, tmp1, k0k1 + mova tmp1, m7 + pmaddubsw m13, m7, k2k3 + mova m7, m11 + pmaddubsw m11, k4k5 + paddsw m14, m11 + punpckhbw m3, m2 ;G H next iter + mova m11, m3 + pmaddubsw m3, k6k7 + paddsw m13, m3 + paddsw m14, m13 + paddsw m14, krd + psraw m14, 7 + packuswb m15, m14 +%ifidn %1, v8_avg + pavgb m15, [dstq + dstrideq] +%endif + mova [dstq + dstrideq], m15 + lea dstq, [dstq + dstrideq * 2] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m3, [srcq + sstrideq] ;H + punpcklbw m6, m2, m3 ;G H + punpckhbw m2, m3 ;G H + pmaddubsw m0, k0k1 + pmaddubsw m1, k0k1 + pmaddubsw m4, k2k3 + pmaddubsw m5, k2k3 + pmaddubsw m8, k4k5 + pmaddubsw m9, k4k5 + pmaddubsw m6, k6k7 + pmaddubsw m2, k6k7 + paddsw m0, m8 + paddsw m1, m9 + paddsw m4, m6 + paddsw m5, m2 + paddsw m0, m4 + paddsw m1, m5 + paddsw m0, krd + paddsw m1, krd + psraw m0, 7 + psraw m1, 7 + packuswb m0, m1 +%ifidn %1, v8_avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + +.done: + REP_RET + +%endif ; ARCH_X86_64 + +%endm + +INIT_XMM ssse3 +SUBPIX_VFILTER16 v8 +SUBPIX_VFILTER v8, 8 +SUBPIX_VFILTER v8, 4 diff --git a/libs/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/libs/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm new file mode 100644 index 000000000..d0b4b2839 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm @@ -0,0 +1,295 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklqdq xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + pxor xmm2, xmm2 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + + punpckldq xmm0, xmm1 ;two row in one register + punpcklbw xmm0, xmm2 ;unpack to word + pmullw xmm0, xmm4 ;multiply the filter factors + + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + + paddsw xmm0, xmm3 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + + pshuflw xmm6, xmm7, 11111111b ;k3 + pshufhw xmm7, xmm7, 0b ;k4 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + pxor xmm5, xmm5 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + punpckhbw xmm3, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + pmullw xmm2, xmm6 + pmullw xmm3, xmm7 + + paddsw xmm0, xmm1 + paddsw xmm2, xmm3 + + paddsw xmm0, xmm4 ;rounding + paddsw xmm2, xmm4 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +SECTION .text + +global sym(aom_filter_block1d4_v2_sse2) PRIVATE +sym(aom_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_v2_sse2) PRIVATE +sym(aom_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_v2_sse2) PRIVATE 
+sym(aom_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_h2_sse2) PRIVATE +sym(aom_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_h2_sse2) PRIVATE +sym(aom_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_h2_sse2) PRIVATE +sym(aom_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/libs/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/libs/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm new file mode 100644 index 000000000..59edc49a9 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm @@ -0,0 +1,267 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov ecx, 0x01000100 + + movdqa xmm3, [rdx] ;load filters + psrldq xmm3, 6 + packsswb xmm3, xmm3 + pshuflw xmm3, xmm3, 0b ;k3_k4 + + movd xmm2, ecx ;rounding_shift + pshufd xmm2, xmm2, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm3 + + pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7) + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov ecx, 0x01000100 + + movdqa xmm7, [rdx] ;load filters + psrldq xmm7, 6 + packsswb xmm7, xmm7 + pshuflw xmm7, xmm7, 0b ;k3_k4 + punpcklwd xmm7, xmm7 + + movd xmm6, ecx ;rounding_shift + pshufd xmm6, xmm6, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm7 + + pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) + packuswb xmm0, xmm0 ;pack back to byte + +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, xmm7 + pmaddubsw xmm2, xmm7 + + pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) + pmulhrsw xmm2, xmm6 + packuswb xmm0, xmm2 ;pack back to byte + +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +SECTION .text + +global sym(aom_filter_block1d4_v2_ssse3) PRIVATE +sym(aom_filter_block1d4_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_v2_ssse3) PRIVATE +sym(aom_filter_block1d8_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_v2_ssse3) PRIVATE +sym(aom_filter_block1d16_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_h2_ssse3) PRIVATE +sym(aom_filter_block1d4_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_h2_ssse3) 
PRIVATE
+sym(aom_filter_block1d8_h2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]            ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_8 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(aom_filter_block1d16_h2_ssse3) PRIVATE
+sym(aom_filter_block1d16_h2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]            ;load src
+    movdqu      xmm1, [rsi + 1]
+    movdqa      xmm2, xmm0
+
+    APPLY_FILTER_16 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libaom/src/aom_dsp/x86/avg_intrin_avx2.c b/libs/libaom/src/aom_dsp/x86/avg_intrin_avx2.c
new file mode 100644
index 000000000..3bbffbd80
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/avg_intrin_avx2.c
@@ -0,0 +1,504 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/bitdepth_conversion_avx2.h"
+#include "aom_ports/mem.h"
+
+static void hadamard_col8x2_avx2(__m256i *in, int iter) {
+  __m256i a0 = in[0];
+  __m256i a1 = in[1];
+  __m256i a2 = in[2];
+  __m256i a3 = in[3];
+  __m256i a4 = in[4];
+  __m256i a5 = in[5];
+  __m256i a6 = in[6];
+  __m256i a7 = in[7];
+
+  __m256i b0 = _mm256_add_epi16(a0, a1);
+  __m256i b1 = _mm256_sub_epi16(a0, a1);
+  __m256i b2 = _mm256_add_epi16(a2, a3);
+  __m256i b3 = _mm256_sub_epi16(a2, a3);
+  __m256i b4 = _mm256_add_epi16(a4, a5);
+  __m256i b5 = _mm256_sub_epi16(a4, a5);
+  __m256i b6 = _mm256_add_epi16(a6, a7);
+  __m256i b7 = _mm256_sub_epi16(a6, a7);
+
+  a0 = _mm256_add_epi16(b0, b2);
+  a1 = _mm256_add_epi16(b1, b3);
+  a2 = _mm256_sub_epi16(b0, b2);
+  a3 = _mm256_sub_epi16(b1, b3);
+  a4 = _mm256_add_epi16(b4, b6);
+  a5 = _mm256_add_epi16(b5, b7);
+  a6 = _mm256_sub_epi16(b4, b6);
+  a7 = _mm256_sub_epi16(b5, b7);
+
+  if (iter == 0) {
+    b0 = _mm256_add_epi16(a0, a4);
+    b7 = _mm256_add_epi16(a1, a5);
+    b3 = _mm256_add_epi16(a2, a6);
+    b4 = _mm256_add_epi16(a3, a7);
+    b2 = _mm256_sub_epi16(a0, a4);
+    b6 = _mm256_sub_epi16(a1, a5);
+    b1 = _mm256_sub_epi16(a2, a6);
+    b5 = _mm256_sub_epi16(a3, a7);
+
+    a0 = _mm256_unpacklo_epi16(b0, b1);
+    a1 = _mm256_unpacklo_epi16(b2, b3);
+    a2 = _mm256_unpackhi_epi16(b0, b1);
+    a3 = _mm256_unpackhi_epi16(b2, b3);
+    a4 = _mm256_unpacklo_epi16(b4, b5);
+    a5 = _mm256_unpacklo_epi16(b6, b7);
+    a6 = _mm256_unpackhi_epi16(b4, b5);
+    a7 = _mm256_unpackhi_epi16(b6, b7);
+
+    b0 = _mm256_unpacklo_epi32(a0, a1);
+    b1 = _mm256_unpacklo_epi32(a4, a5);
+    b2 = _mm256_unpackhi_epi32(a0, a1);
+    b3 = _mm256_unpackhi_epi32(a4, a5);
+    b4 = _mm256_unpacklo_epi32(a2, a3);
+    b5 = _mm256_unpacklo_epi32(a6, a7);
+    b6 = _mm256_unpackhi_epi32(a2, a3);
+    b7 = _mm256_unpackhi_epi32(a6, a7);
+
+    in[0] = _mm256_unpacklo_epi64(b0, b1);
+    in[1] = _mm256_unpackhi_epi64(b0, b1);
+    in[2] = _mm256_unpacklo_epi64(b2, b3);
+
in[3] = _mm256_unpackhi_epi64(b2, b3); + in[4] = _mm256_unpacklo_epi64(b4, b5); + in[5] = _mm256_unpackhi_epi64(b4, b5); + in[6] = _mm256_unpacklo_epi64(b6, b7); + in[7] = _mm256_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm256_add_epi16(a0, a4); + in[7] = _mm256_add_epi16(a1, a5); + in[3] = _mm256_add_epi16(a2, a6); + in[4] = _mm256_add_epi16(a3, a7); + in[2] = _mm256_sub_epi16(a0, a4); + in[6] = _mm256_sub_epi16(a1, a5); + in[1] = _mm256_sub_epi16(a2, a6); + in[5] = _mm256_sub_epi16(a3, a7); + } +} + +static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + __m256i src[8]; + src[0] = _mm256_loadu_si256((const __m256i *)src_diff); + src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + + hadamard_col8x2_avx2(src, 0); + hadamard_col8x2_avx2(src, 1); + + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x31)); +} + +static INLINE void hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; + int16_t *coeff16 = (int16_t *)coeff; + int idx; + for (idx = 0; idx < 2; ++idx) { + const int16_t *src_ptr = src_diff + idx * 8 * src_stride; + hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); + } + + for (idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + if (is_final) { + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); + coeff += 16; + } else { + _mm256_storeu_si256((__m256i *)coeff16, 
_mm256_add_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3)); + _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3)); + coeff16 += 16; + } + t_coeff += 16; + } +} + +void aom_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_avx2(src_diff, src_stride, coeff, 1); +} + +void aom_hadamard_lp_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + int16_t *t_coeff = coeff; + for (int idx = 0; idx < 2; ++idx) { + const int16_t *src_ptr = src_diff + idx * 8 * src_stride; + hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); + } + + for (int idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + _mm256_storeu_si256((__m256i *)coeff, _mm256_add_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff + 64), _mm256_add_epi16(b1, b3)); + _mm256_storeu_si256((__m256i *)(coeff + 128), _mm256_sub_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff + 192), _mm256_sub_epi16(b1, b3)); + coeff += 16; + t_coeff += 16; + } +} + +void aom_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. 
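+  // Editor's note: the 32x32 transform below is built from four 16x16
+  // sub-transforms; their outputs are then combined with one more butterfly
+  // stage (the add/sub pairs in the loop) plus an extra >> 2 normalization.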
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_avx2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 2); + b1 = _mm256_srai_epi16(b1, 2); + b2 = _mm256_srai_epi16(b2, 2); + b3 = _mm256_srai_epi16(b3, 2); + + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768); + + coeff += 16; + t_coeff += 16; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_hadamard_col8_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi32(a0, a1); + __m256i b1 = _mm256_sub_epi32(a0, a1); + __m256i b2 = _mm256_add_epi32(a2, a3); + __m256i b3 = _mm256_sub_epi32(a2, a3); + __m256i b4 = _mm256_add_epi32(a4, a5); + __m256i b5 = _mm256_sub_epi32(a4, a5); + __m256i b6 = _mm256_add_epi32(a6, a7); + __m256i b7 = _mm256_sub_epi32(a6, a7); + + a0 = _mm256_add_epi32(b0, b2); + a1 = _mm256_add_epi32(b1, b3); + a2 = _mm256_sub_epi32(b0, b2); + a3 = _mm256_sub_epi32(b1, b3); + a4 = _mm256_add_epi32(b4, b6); + a5 = _mm256_add_epi32(b5, b7); + a6 = _mm256_sub_epi32(b4, b6); + a7 = _mm256_sub_epi32(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi32(a0, a4); + b7 = _mm256_add_epi32(a1, a5); + b3 = _mm256_add_epi32(a2, a6); + b4 = _mm256_add_epi32(a3, a7); + b2 = _mm256_sub_epi32(a0, a4); + b6 = _mm256_sub_epi32(a1, a5); + b1 = _mm256_sub_epi32(a2, a6); + b5 = _mm256_sub_epi32(a3, a7); + + a0 = _mm256_unpacklo_epi32(b0, b1); + a1 = _mm256_unpacklo_epi32(b2, b3); + a2 = _mm256_unpackhi_epi32(b0, b1); + a3 = _mm256_unpackhi_epi32(b2, b3); + a4 = _mm256_unpacklo_epi32(b4, b5); + a5 = _mm256_unpacklo_epi32(b6, b7); + a6 = _mm256_unpackhi_epi32(b4, b5); + a7 = _mm256_unpackhi_epi32(b6, b7); + + b0 = _mm256_unpacklo_epi64(a0, a1); + b1 = _mm256_unpacklo_epi64(a4, a5); + b2 = _mm256_unpackhi_epi64(a0, a1); + b3 = _mm256_unpackhi_epi64(a4, a5); + b4 = _mm256_unpacklo_epi64(a2, a3); + b5 = _mm256_unpacklo_epi64(a6, a7); + b6 = _mm256_unpackhi_epi64(a2, a3); + b7 = _mm256_unpackhi_epi64(a6, a7); + + in[0] = _mm256_permute2x128_si256(b0, b1, 0x20); + in[1] = _mm256_permute2x128_si256(b0, b1, 0x31); + in[2] = _mm256_permute2x128_si256(b2, b3, 0x20); + in[3] = _mm256_permute2x128_si256(b2, b3, 0x31); + in[4] = _mm256_permute2x128_si256(b4, b5, 0x20); + in[5] = _mm256_permute2x128_si256(b4, b5, 0x31); + in[6] = _mm256_permute2x128_si256(b6, b7, 0x20); + in[7] = _mm256_permute2x128_si256(b6, b7, 0x31); + } else { + in[0] = _mm256_add_epi32(a0, a4); + in[7] = 
_mm256_add_epi32(a1, a5); + in[3] = _mm256_add_epi32(a2, a6); + in[4] = _mm256_add_epi32(a3, a7); + in[2] = _mm256_sub_epi32(a0, a4); + in[6] = _mm256_sub_epi32(a1, a5); + in[1] = _mm256_sub_epi32(a2, a6); + in[5] = _mm256_sub_epi32(a3, a7); + } +} + +void aom_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + __m128i src16[8]; + __m256i src32[8]; + + src16[0] = _mm_loadu_si128((const __m128i *)src_diff); + src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + + src32[0] = _mm256_cvtepi16_epi32(src16[0]); + src32[1] = _mm256_cvtepi16_epi32(src16[1]); + src32[2] = _mm256_cvtepi16_epi32(src16[2]); + src32[3] = _mm256_cvtepi16_epi32(src16[3]); + src32[4] = _mm256_cvtepi16_epi32(src16[4]); + src32[5] = _mm256_cvtepi16_epi32(src16[5]); + src32[6] = _mm256_cvtepi16_epi32(src16[6]); + src32[7] = _mm256_cvtepi16_epi32(src16[7]); + + highbd_hadamard_col8_avx2(src32, 0); + highbd_hadamard_col8_avx2(src32, 1); + + _mm256_storeu_si256((__m256i *)coeff, src32[0]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[1]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[2]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[3]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[4]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[5]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[6]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[7]); +} + +void aom_highbd_hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64); + } + + for (idx = 0; idx < 64; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 1); + b1 = _mm256_srai_epi32(b1, 1); + b2 = _mm256_srai_epi32(b2, 1); + b3 = _mm256_srai_epi32(b3, 1); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3); + + coeff += 8; + t_coeff += 8; + } +} + +void aom_highbd_hadamard_32x32_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = 
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + aom_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256); + } + + for (idx = 0; idx < 256; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 2); + b1 = _mm256_srai_epi32(b1, 2); + b2 = _mm256_srai_epi32(b2, 2); + b3 = _mm256_srai_epi32(b3, 2); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3); + + coeff += 8; + t_coeff += 8; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +int aom_satd_avx2(const tran_low_t *coeff, int length) { + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 8, coeff += 8) { + const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i abs = _mm256_abs_epi32(src_line); + accum = _mm256_add_epi32(accum, abs); + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} + +int aom_satd_lp_avx2(const int16_t *coeff, int length) { + const __m256i one = _mm256_set1_epi16(1); + __m256i accum = _mm256_setzero_si256(); + + for (int i = 0; i < length; i += 16) { + const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i abs = _mm256_abs_epi16(src_line); + const __m256i sum = _mm256_madd_epi16(abs, one); + accum = _mm256_add_epi32(accum, sum); + coeff += 16; + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} diff --git a/libs/libaom/src/aom_dsp/x86/avg_intrin_sse2.c b/libs/libaom/src/aom_dsp/x86/avg_intrin_sse2.c new file mode 100644 index 000000000..260ca2ad1 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/avg_intrin_sse2.c @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
+#include "aom_ports/mem.h"
+
+void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+                         int *min, int *max) {
+  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+  u0 = _mm_setzero_si128();
+  // Row 0
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff0 = _mm_max_epi16(diff, negdiff);
+  // Row 1
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+  // Row 2
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 3
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 4
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 5
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 6
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 7
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+  *max = _mm_extract_epi16(maxabsdiff,
0); + + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16)); + *min = _mm_extract_epi16(minabsdiff, 0); +} + +unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); + avg = _mm_extract_epi16(s0, 0); + return (avg + 32) >> 6; +} + +unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); + avg = _mm_extract_epi16(s0, 0); + return (avg + 8) >> 4; +} + +static INLINE void hadamard_col8_sse2(__m128i *in, int iter) { + __m128i a0 = in[0]; + __m128i a1 = in[1]; + __m128i a2 = in[2]; + __m128i a3 = in[3]; + __m128i a4 = in[4]; + __m128i a5 = in[5]; + __m128i a6 = in[6]; + __m128i a7 = in[7]; + + __m128i b0 = _mm_add_epi16(a0, a1); + __m128i b1 = _mm_sub_epi16(a0, a1); + __m128i b2 = _mm_add_epi16(a2, a3); + __m128i b3 = _mm_sub_epi16(a2, a3); + __m128i b4 = _mm_add_epi16(a4, a5); + __m128i b5 = _mm_sub_epi16(a4, a5); + __m128i b6 = _mm_add_epi16(a6, a7); + __m128i b7 = _mm_sub_epi16(a6, a7); + + a0 = _mm_add_epi16(b0, b2); + a1 = _mm_add_epi16(b1, b3); + a2 = _mm_sub_epi16(b0, b2); + a3 = _mm_sub_epi16(b1, b3); + a4 = _mm_add_epi16(b4, b6); + a5 = _mm_add_epi16(b5, b7); + a6 = _mm_sub_epi16(b4, b6); + a7 = _mm_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm_add_epi16(a0, a4); + b7 = _mm_add_epi16(a1, a5); + b3 = _mm_add_epi16(a2, a6); + b4 = _mm_add_epi16(a3, a7); + b2 = _mm_sub_epi16(a0, a4); + b6 = _mm_sub_epi16(a1, a5); + b1 = _mm_sub_epi16(a2, a6); + b5 = _mm_sub_epi16(a3, a7); + + a0 = _mm_unpacklo_epi16(b0, b1); + a1 = _mm_unpacklo_epi16(b2, b3); + a2 = _mm_unpackhi_epi16(b0, b1); + a3 = _mm_unpackhi_epi16(b2, b3); + a4 = _mm_unpacklo_epi16(b4, b5); + a5 = _mm_unpacklo_epi16(b6, b7); + a6 = _mm_unpackhi_epi16(b4, b5); + a7 = _mm_unpackhi_epi16(b6, b7); + + b0 = _mm_unpacklo_epi32(a0, a1); + b1 = _mm_unpacklo_epi32(a4, a5); + b2 = 
_mm_unpackhi_epi32(a0, a1); + b3 = _mm_unpackhi_epi32(a4, a5); + b4 = _mm_unpacklo_epi32(a2, a3); + b5 = _mm_unpacklo_epi32(a6, a7); + b6 = _mm_unpackhi_epi32(a2, a3); + b7 = _mm_unpackhi_epi32(a6, a7); + + in[0] = _mm_unpacklo_epi64(b0, b1); + in[1] = _mm_unpackhi_epi64(b0, b1); + in[2] = _mm_unpacklo_epi64(b2, b3); + in[3] = _mm_unpackhi_epi64(b2, b3); + in[4] = _mm_unpacklo_epi64(b4, b5); + in[5] = _mm_unpackhi_epi64(b4, b5); + in[6] = _mm_unpacklo_epi64(b6, b7); + in[7] = _mm_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm_add_epi16(a0, a4); + in[7] = _mm_add_epi16(a1, a5); + in[3] = _mm_add_epi16(a2, a6); + in[4] = _mm_add_epi16(a3, a7); + in[2] = _mm_sub_epi16(a0, a4); + in[6] = _mm_sub_epi16(a1, a5); + in[1] = _mm_sub_epi16(a2, a6); + in[5] = _mm_sub_epi16(a3, a7); + } +} + +static INLINE void hadamard_8x8_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { + __m128i src[8]; + src[0] = _mm_load_si128((const __m128i *)src_diff); + src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + + hadamard_col8_sse2(src, 0); + hadamard_col8_sse2(src, 1); + + if (is_final) { + store_tran_low(src[0], coeff); + coeff += 8; + store_tran_low(src[1], coeff); + coeff += 8; + store_tran_low(src[2], coeff); + coeff += 8; + store_tran_low(src[3], coeff); + coeff += 8; + store_tran_low(src[4], coeff); + coeff += 8; + store_tran_low(src[5], coeff); + coeff += 8; + store_tran_low(src[6], coeff); + coeff += 8; + store_tran_low(src[7], coeff); + } else { + int16_t *coeff16 = (int16_t *)coeff; + _mm_store_si128((__m128i *)coeff16, src[0]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[1]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[2]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[3]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[4]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[5]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[6]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[7]); + } +} + +void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_8x8_sse2(src_diff, src_stride, coeff, 1); +} + +void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + __m128i src[8]; + src[0] = _mm_load_si128((const __m128i *)src_diff); + src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + + hadamard_col8_sse2(src, 0); + hadamard_col8_sse2(src, 1); + + _mm_store_si128((__m128i *)coeff, src[0]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[1]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[2]); + coeff += 8; + _mm_store_si128((__m128i 
*)coeff, src[3]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[4]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[5]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[6]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[7]); +} + +static INLINE void hadamard_16x16_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; + int16_t *coeff16 = (int16_t *)coeff; + int idx; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64), + 0); + } + + for (idx = 0; idx < 64; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192)); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + b0 = _mm_srai_epi16(b0, 1); + b1 = _mm_srai_epi16(b1, 1); + b2 = _mm_srai_epi16(b2, 1); + b3 = _mm_srai_epi16(b3, 1); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + + if (is_final) { + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 64); + store_tran_low(coeff2, coeff + 128); + store_tran_low(coeff3, coeff + 192); + coeff += 8; + } else { + _mm_store_si128((__m128i *)coeff16, coeff0); + _mm_store_si128((__m128i *)(coeff16 + 64), coeff1); + _mm_store_si128((__m128i *)(coeff16 + 128), coeff2); + _mm_store_si128((__m128i *)(coeff16 + 192), coeff3); + coeff16 += 8; + } + + t_coeff += 8; + } +} + +void aom_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_sse2(src_diff, src_stride, coeff, 1); +} + +void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. 
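+  // Editor's note: this SSE2 path mirrors the AVX2 32x32 version earlier in
+  // the patch, but it works on 8 coefficients per register, so the
+  // combination loop below advances by 8 rather than 16 per iteration.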
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; + int idx; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_sse2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768)); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + b0 = _mm_srai_epi16(b0, 2); + b1 = _mm_srai_epi16(b1, 2); + b2 = _mm_srai_epi16(b2, 2); + b3 = _mm_srai_epi16(b3, 2); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 256); + + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + store_tran_low(coeff2, coeff + 512); + store_tran_low(coeff3, coeff + 768); + + coeff += 8; + t_coeff += 8; + } +} + +int aom_satd_sse2(const tran_low_t *coeff, int length) { + int i; + const __m128i zero = _mm_setzero_si128(); + __m128i accum = zero; + + for (i = 0; i < length; i += 8) { + const __m128i src_line = load_tran_low(coeff); + const __m128i inv = _mm_sub_epi16(zero, src_line); + const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line) + const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero); + const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero); + const __m128i sum = _mm_add_epi32(abs_lo, abs_hi); + accum = _mm_add_epi32(accum, sum); + coeff += 8; + } + + { // cascading summation of accum + __m128i hi = _mm_srli_si128(accum, 8); + accum = _mm_add_epi32(accum, hi); + hi = _mm_srli_epi64(accum, 32); + accum = _mm_add_epi32(accum, hi); + } + + return _mm_cvtsi128_si32(accum); +} + +void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, + const int ref_stride, const int height) { + int idx = 1; + __m128i zero = _mm_setzero_si128(); + __m128i src_line = _mm_loadu_si128((const __m128i *)ref); + __m128i s0 = _mm_unpacklo_epi8(src_line, zero); + __m128i s1 = _mm_unpackhi_epi8(src_line, zero); + __m128i t0, t1; + int height_1 = height - 1; + ref += ref_stride; + do { + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + ref += ref_stride; + + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + ref += ref_stride; + idx += 2; + } while (idx < height_1); + + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + if (height == 128) { + s0 = _mm_srai_epi16(s0, 6); + s1 = _mm_srai_epi16(s1, 6); + } else if (height == 64) { + s0 = _mm_srai_epi16(s0, 5); + s1 = _mm_srai_epi16(s1, 5); + } else if (height == 32) { + s0 = _mm_srai_epi16(s0, 4); + s1 = _mm_srai_epi16(s1, 4); + } else { + assert(height == 16); + s0 = _mm_srai_epi16(s0, 3); + s1 = _mm_srai_epi16(s1, 3); + } + + _mm_storeu_si128((__m128i *)hbuf, s0); + hbuf += 8; + 
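+  // Editor's note: s0 holds the vertically accumulated sums for pixel
+  // columns 0-7 and s1 for columns 8-15; the store below writes the second
+  // half of the 16-wide row projection.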
_mm_storeu_si128((__m128i *)hbuf, s1);
+}
+
+int16_t aom_int_pro_col_sse2(const uint8_t *ref, const int width) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
+  __m128i s0 = _mm_sad_epu8(src_line, zero);
+  __m128i s1;
+  int i;
+
+  for (i = 16; i < width; i += 16) {
+    ref += 16;
+    src_line = _mm_loadu_si128((const __m128i *)ref);
+    s1 = _mm_sad_epu8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, s1);
+  }
+
+  s1 = _mm_srli_si128(s0, 8);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  return _mm_extract_epi16(s0, 0);
+}
diff --git a/libs/libaom/src/aom_dsp/x86/bitdepth_conversion_avx2.h b/libs/libaom/src/aom_dsp/x86/bitdepth_conversion_avx2.h
new file mode 100644
index 000000000..85896e276
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/bitdepth_conversion_avx2.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE __m256i load_tran_low(const tran_low_t *a) {
+  const __m256i a_low = _mm256_loadu_si256((const __m256i *)a);
+  const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8));
+  return _mm256_packs_epi32(a_low, a_high);
+}
+
+static INLINE void store_tran_low(__m256i a, tran_low_t *b) {
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i a_hi = _mm256_mulhi_epi16(a, one);
+  const __m256i a_lo = _mm256_mullo_epi16(a, one);
+  const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi);
+  const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi);
+  _mm256_storeu_si256((__m256i *)b, a_1);
+  _mm256_storeu_si256((__m256i *)(b + 8), a_2);
+}
diff --git a/libs/libaom/src/aom_dsp/x86/bitdepth_conversion_sse2.h b/libs/libaom/src/aom_dsp/x86/bitdepth_conversion_sse2.h
new file mode 100644
index 000000000..42bb2d1d3
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/bitdepth_conversion_sse2.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Load 8 16 bit values. If the source is 32 bits then pack down with
+// saturation.
+static INLINE __m128i load_tran_low(const tran_low_t *a) {
+  const __m128i a_low = _mm_load_si128((const __m128i *)a);
+  return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
+}
+
+// Store 8 16 bit values. If the destination is 32 bits then sign extend the
+// values by multiplying by 1.
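+// Editor's note: multiplying by one is a sign-extension idiom here;
+// _mm_mullo_epi16(a, 1) keeps the low words and _mm_mulhi_epi16(a, 1)
+// yields their sign words (0 or -1), so the unpacklo/unpackhi pair below
+// emits eight sign-extended 32-bit tran_low_t values.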
+static INLINE void store_tran_low(__m128i a, tran_low_t *b) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a_hi = _mm_mulhi_epi16(a, one); + const __m128i a_lo = _mm_mullo_epi16(a, one); + const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi); + const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi); + _mm_store_si128((__m128i *)(b), a_1); + _mm_store_si128((__m128i *)(b + 4), a_2); +} diff --git a/libs/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c b/libs/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c new file mode 100644 index 000000000..e0289abe1 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom/aom_integer.h" + +#include "config/aom_dsp_rtcd.h" + +// To start out, just dispatch to the function using the 2D mask and +// pass mask stride as 0. This can be improved upon if necessary. + +void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, 0, w, h, 0, 0); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_blend_a64_hmask_sse4_1( + uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, + uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int w, int h, int bd) { + aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride, + src1_8, src1_stride, mask, 0, w, h, 0, 0, + bd); +} +#endif diff --git a/libs/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c b/libs/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c new file mode 100644 index 000000000..95383d2fd --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c @@ -0,0 +1,1374 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <smmintrin.h>  // SSE4.1
+#include <immintrin.h>  // AVX2
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend_a64_d16_mask_w16_avx2(
+    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+    const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
+    int shift) {
+  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+  const __m256i s0_0 = yy_loadu_256(src0);
+  const __m256i s1_0 = yy_loadu_256(src1);
+  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
+  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
+  res0_lo =
+      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+  res0_hi =
+      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+  __m256i res = _mm256_packus_epi16(res0, res0);
+  res = _mm256_permute4x64_epi64(res, 0xd8);
+  _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
+}
+
+static INLINE void blend_a64_d16_mask_w32_avx2(
+    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+    const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
+    const __m256i *v_maxval, int shift) {
+  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+  const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
+  const __m256i s0_0 = yy_loadu_256(src0);
+  const __m256i s0_1 = yy_loadu_256(src0 + 16);
+  const __m256i s1_0 = yy_loadu_256(src1);
+  const __m256i s1_1 = yy_loadu_256(src1 + 16);
+  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
+  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
+  __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
+                                      _mm256_unpacklo_epi16(*m1, max_minus_m1));
+  __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
+                                      _mm256_unpackhi_epi16(*m1, max_minus_m1));
+  res0_lo =
+      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+  res0_hi =
+      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+  res1_lo =
+      _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
+  res1_hi =
+      _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
+  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+  const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
+  __m256i res = _mm256_packus_epi16(res0, res1);
+  res = _mm256_permute4x64_epi64(res, 0xd8);
+  _mm256_storeu_si256((__m256i *)(dst), res);
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m256i *round_offset, int shift) {
+  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  for (int i = 0; i < h; ++i) {
+    const __m128i m = xx_loadu_128(mask);
+    const __m256i m0 = _mm256_cvtepu8_epi16(m);
+
+    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
+                                shift);
+    mask +=
mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m = yy_loadu_256(mask + j); + const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m)); + const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1)); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m256i m_i00 = yy_loadu_256(mask); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride); + + const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); + const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); + const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); + + blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j); + const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32); + + const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); + const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11); + const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); + const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b); + const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); + const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t 
mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); + const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); + + blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); + const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); + const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b); + const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); + const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + j); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); + + const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); + const __m256i m0 = _mm256_cvtepu8_epi16(m_ac); + + blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + j); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j); + + const __m256i m_ac = + _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros); + const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac)); + const __m256i m1 = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1)); + + 
blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +void aom_lowbd_blend_a64_d16_mask_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + const int bd = 8; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + const int round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + const __m128i v_round_offset = _mm_set1_epi32(round_offset); + const __m256i y_round_offset = _mm256_set1_epi32(round_offset); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else { + switch (w) { + case 4: + 
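+ // A 4- or 8-pixel row cannot fill a 256-bit vector, so the narrow + // cases reuse the 128-bit SSE4.1 helpers.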
aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } +} + +static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1, + const __m256i *v_m0_b, + const __m256i *v_m1_b, + const int32_t bits) { + const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0)); + const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1)); + const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8); + const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8); + + const __m256i v_p0_w = + _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b), + _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); + const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w); + const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8); + return v_res; +} + +static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1, + const __m256i *v_m0_b, + const __m256i *v_m1_b, + const int32_t bits) { + const __m256i v_s0_b = yy_loadu_256(src0); + const __m256i v_s1_b = yy_loadu_256(src1); + + const __m256i v_p0_w = + _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b), + _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); + const __m256i v_p1_w = + _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b), + _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b)); + + const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); + const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits); + const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w); + return v_res; +} + +static INLINE void blend_a64_mask_sx_sy_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + const __m256i v_ral_b = yy_loadu_256(mask); + const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride); + const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); + const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); + const __m256i v_rvsbl_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); + const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); + + const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2); + const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, + AOM_BLEND_A64_ROUND_BITS); + + xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b)); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_sy_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const 
uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_ral_b = yy_loadu_256(mask + 2 * c); + const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32); + const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c); + const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32); + const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); + const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b); + const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); + const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b); + const __m256i v_rvsbl_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); + const __m256i v_rvsbh_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b); + const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); + const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w); + + const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2); + const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2); + const __m256i v_m0_b = + _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_sy_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i 
v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + break; + } +} + +static INLINE void blend_a64_mask_sx_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m256i v_zmask_b = _mm256_set1_epi16(0xff); + do { + const __m256i v_rl_b = yy_loadu_256(mask); + const __m256i v_al_b = + _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1)); + + const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b); + const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256()); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, + AOM_BLEND_A64_ROUND_BITS); + + xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b)); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle); + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_r0_b = yy_loadu_256(mask + 2 * c); + const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32); + const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b); + const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b); + const __m256i v_al_b = + _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8)); + const __m256i v_ah_b = + _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8)); + + const __m256i v_m0_b = + _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, 
v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_r_b = xx_loadu_128(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + break; + } +} + +static INLINE void blend_a64_mask_sy_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst, v_res_b); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sy_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_ra_b = yy_loadu_256(mask + c); + const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride); + const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sy_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + switch (w) { + case 4: + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += 
src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } +} + +static INLINE void blend_a64_mask_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_m0_b = yy_loadu_256(mask + c); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_m0_b = xx_loadl_64(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 16: + do { + const __m128i v_m0_b = xx_loadu_128(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst, v_res_b); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + default: + blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } +} + +void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subw, int subh) { + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if 
(UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h, subw, subh); + } else { + if (subw & subh) { + blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else if (subw) { + blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else if (subh) { + blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else { + blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h); + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// aom_highbd_blend_a64_d16_mask_avx2() +////////////////////////////////////////////////////////////////////////////// + +static INLINE void highbd_blend_a64_d16_mask_w4_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0, + const __m256i *round_offset, int shift, const __m256i *clip_low, + const __m256i *clip_high, const __m256i *mask_max) { + // Load 4x u16 pixels from each of 4 rows from each source + const __m256i s0 = _mm256_set_epi64x(*(uint64_t *)(src0 + 3 * src0_stride), + *(uint64_t *)(src0 + 2 * src0_stride), + *(uint64_t *)(src0 + 1 * src0_stride), + *(uint64_t *)(src0 + 0 * src0_stride)); + const __m256i s1 = _mm256_set_epi64x(*(uint64_t *)(src1 + 3 * src1_stride), + *(uint64_t *)(src1 + 2 * src1_stride), + *(uint64_t *)(src1 + 1 * src1_stride), + *(uint64_t *)(src1 + 0 * src1_stride)); + // Generate the inverse mask + const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0); + + // Multiply each mask by the respective source + const __m256i mul0_highs = _mm256_mulhi_epu16(*mask0, s0); + const __m256i mul0_lows = _mm256_mullo_epi16(*mask0, s0); + const __m256i mul0h = _mm256_unpackhi_epi16(mul0_lows, mul0_highs); + const __m256i mul0l = _mm256_unpacklo_epi16(mul0_lows, mul0_highs); + // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within + // lanes. Later, packs does the same again which cancels this out with no need + // for a permute.
The intermediate values being reordered makes no difference. + + const __m256i mul1_highs = _mm256_mulhi_epu16(mask1, s1); + const __m256i mul1_lows = _mm256_mullo_epi16(mask1, s1); + const __m256i mul1h = _mm256_unpackhi_epi16(mul1_lows, mul1_highs); + const __m256i mul1l = _mm256_unpacklo_epi16(mul1_lows, mul1_highs); + + const __m256i sumh = _mm256_add_epi32(mul0h, mul1h); + const __m256i suml = _mm256_add_epi32(mul0l, mul1l); + + const __m256i roundh = + _mm256_srai_epi32(_mm256_sub_epi32(sumh, *round_offset), shift); + const __m256i roundl = + _mm256_srai_epi32(_mm256_sub_epi32(suml, *round_offset), shift); + + const __m256i pack = _mm256_packs_epi32(roundl, roundh); + const __m256i clip = + _mm256_min_epi16(_mm256_max_epi16(pack, *clip_low), *clip_high); + + // _mm256_extract_epi64 doesn't exist on 32-bit x86, so do it the old-fashioned way: + const __m128i cliph = _mm256_extracti128_si256(clip, 1); + xx_storel_64(dst + 3 * dst_stride, _mm_srli_si128(cliph, 8)); + xx_storel_64(dst + 2 * dst_stride, cliph); + const __m128i clipl = _mm256_castsi256_si128(clip); + xx_storel_64(dst + 1 * dst_stride, _mm_srli_si128(clipl, 8)); + xx_storel_64(dst + 0 * dst_stride, clipl); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift, const __m256i *clip_low, + const __m256i *clip_high, const __m256i *mask_max) { + do { + // Load 4x u8 pixels from each of 4 rows of the mask, pad each to u16 + const __m128i mask08 = _mm_set_epi32(*(uint32_t *)(mask + 3 * mask_stride), + *(uint32_t *)(mask + 2 * mask_stride), + *(uint32_t *)(mask + 1 * mask_stride), + *(uint32_t *)(mask + 0 * mask_stride)); + const __m256i mask0 = _mm256_cvtepu8_epi16(mask08); + + highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, &mask0, round_offset, shift, + clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 4; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift, const __m256i *clip_low, + const __m256i *clip_high, const __m256i *mask_max) { + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + do { + // Load 8 pixels from each of 8 rows of mask, + // (saturating) add together rows then use madd to add adjacent pixels + // Finally, divide each value by 4 (with rounding) + const __m256i m0246 = + _mm256_set_epi64x(*(uint64_t *)(mask + 6 * mask_stride), + *(uint64_t *)(mask + 4 * mask_stride), + *(uint64_t *)(mask + 2 * mask_stride), + *(uint64_t *)(mask + 0 * mask_stride)); + const __m256i m1357 = + _mm256_set_epi64x(*(uint64_t *)(mask + 7 * mask_stride), + *(uint64_t *)(mask + 5 * mask_stride), + *(uint64_t *)(mask + 3 * mask_stride), + *(uint64_t *)(mask + 1 * mask_stride)); + const __m256i addrows = _mm256_adds_epu8(m0246, m1357); + const __m256i adjacent = _mm256_maddubs_epi16(addrows, one_b); + const __m256i mask0 = + _mm256_srli_epi16(_mm256_add_epi16(adjacent, two_w), 2); + + highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1, +
src1_stride, &mask0, round_offset, shift, + clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 8; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_w8_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a, + const __m256i *mask0b, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + // Load 8x u16 pixels from each of 4 rows from each source + const __m256i s0a = + yy_loadu2_128(src0 + 0 * src0_stride, src0 + 1 * src0_stride); + const __m256i s0b = + yy_loadu2_128(src0 + 2 * src0_stride, src0 + 3 * src0_stride); + const __m256i s1a = + yy_loadu2_128(src1 + 0 * src1_stride, src1 + 1 * src1_stride); + const __m256i s1b = + yy_loadu2_128(src1 + 2 * src1_stride, src1 + 3 * src1_stride); + + // Generate inverse masks + const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a); + const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b); + + // Multiply sources by respective masks + const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a); + const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a); + const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs); + // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within + // lanes. Later, packs does the same again which cancels this out with no need + // for a permute. The intermediate values being reordered makes no difference. + + const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a); + const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a); + const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m256i sumah = _mm256_add_epi32(mul0ah, mul1ah); + const __m256i sumal = _mm256_add_epi32(mul0al, mul1al); + + const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b); + const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b); + const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs); + + const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b); + const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b); + const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m256i sumbh = _mm256_add_epi32(mul0bh, mul1bh); + const __m256i sumbl = _mm256_add_epi32(mul0bl, mul1bl); + + // Divide down each result, with rounding + const __m256i roundah = + _mm256_srai_epi32(_mm256_sub_epi32(sumah, *round_offset), shift); + const __m256i roundal = + _mm256_srai_epi32(_mm256_sub_epi32(sumal, *round_offset), shift); + const __m256i roundbh = + _mm256_srai_epi32(_mm256_sub_epi32(sumbh, *round_offset), shift); + const __m256i roundbl = + _mm256_srai_epi32(_mm256_sub_epi32(sumbl, *round_offset), shift); + + // Pack each i32 down to an i16 with saturation, then clip to valid range + const __m256i packa = _mm256_packs_epi32(roundal, roundah); + const __m256i clipa = + _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high); + const __m256i packb = _mm256_packs_epi32(roundbl, roundbh); + const __m256i clipb = + _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high); + + // Store
8x u16 pixels to each of 4 rows in the destination + yy_storeu2_128(dst + 0 * dst_stride, dst + 1 * dst_stride, clipa); + yy_storeu2_128(dst + 2 * dst_stride, dst + 3 * dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + do { + // Load 8x u8 pixels from each of 4 rows in the mask + const __m128i mask0a8 = + _mm_set_epi64x(*(uint64_t *)mask, *(uint64_t *)(mask + mask_stride)); + const __m128i mask0b8 = + _mm_set_epi64x(*(uint64_t *)(mask + 2 * mask_stride), + *(uint64_t *)(mask + 3 * mask_stride)); + const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8); + const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8); + + highbd_blend_a64_d16_mask_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, + round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 4; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + do { + // Load 16x u8 pixels from each of 8 rows in the mask, + // (saturating) add together rows then use madd to add adjacent pixels + // Finally, divide each value by 4 (with rounding) + const __m256i m02 = + yy_loadu2_128(mask + 0 * mask_stride, mask + 2 * mask_stride); + const __m256i m13 = + yy_loadu2_128(mask + 1 * mask_stride, mask + 3 * mask_stride); + const __m256i m0123 = + _mm256_maddubs_epi16(_mm256_adds_epu8(m02, m13), one_b); + const __m256i mask_0a = + _mm256_srli_epi16(_mm256_add_epi16(m0123, two_w), 2); + const __m256i m46 = + yy_loadu2_128(mask + 4 * mask_stride, mask + 6 * mask_stride); + const __m256i m57 = + yy_loadu2_128(mask + 5 * mask_stride, mask + 7 * mask_stride); + const __m256i m4567 = + _mm256_maddubs_epi16(_mm256_adds_epu8(m46, m57), one_b); + const __m256i mask_0b = + _mm256_srli_epi16(_mm256_add_epi16(m4567, two_w), 2); + + highbd_blend_a64_d16_mask_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a, + &mask_0b, round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 8; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_w16_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a, + const __m256i *mask0b, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + // Load 16x pixels from each of 2 rows from each source + const __m256i s0a = yy_loadu_256(src0); + const __m256i s0b = yy_loadu_256(src0 + src0_stride); + const __m256i s1a = yy_loadu_256(src1); + const __m256i s1b = yy_loadu_256(src1 + src1_stride); + + // Calculate inverse masks + const __m256i mask1a = 
_mm256_sub_epi16(*mask_max, *mask0a); + const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b); + + // Multiply each source by appropriate mask + const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a); + const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a); + const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs); + // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within + // lanes. Later, packs does the same again which cancels this out with no need + // for a permute. The intermediate values being reordered makes no difference. + + const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a); + const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a); + const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m256i mulah = _mm256_add_epi32(mul0ah, mul1ah); + const __m256i mulal = _mm256_add_epi32(mul0al, mul1al); + + const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b); + const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b); + const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs); + + const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b); + const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b); + const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m256i mulbh = _mm256_add_epi32(mul0bh, mul1bh); + const __m256i mulbl = _mm256_add_epi32(mul0bl, mul1bl); + + const __m256i resah = + _mm256_srai_epi32(_mm256_sub_epi32(mulah, *round_offset), shift); + const __m256i resal = + _mm256_srai_epi32(_mm256_sub_epi32(mulal, *round_offset), shift); + const __m256i resbh = + _mm256_srai_epi32(_mm256_sub_epi32(mulbh, *round_offset), shift); + const __m256i resbl = + _mm256_srai_epi32(_mm256_sub_epi32(mulbl, *round_offset), shift); + + // Signed saturating pack from i32 to i16: + const __m256i packa = _mm256_packs_epi32(resal, resah); + const __m256i packb = _mm256_packs_epi32(resbl, resbh); + + // Clip the values to the valid range + const __m256i clipa = + _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high); + const __m256i clipb = + _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high); + + // Store 16 pixels + yy_storeu_256(dst, clipa); + yy_storeu_256(dst + dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, int w, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + for (int i = 0; i < h; i += 2) { + for (int j = 0; j < w; j += 16) { + // Load 16x u8 alpha-mask values from each of two rows and pad to u16 + const __m128i masks_a8 = xx_loadu_128(mask + j); + const __m128i masks_b8 = xx_loadu_128(mask + mask_stride + j); + const __m256i mask0a = _mm256_cvtepu8_epi16(masks_a8); + const __m256i mask0b = _mm256_cvtepu8_epi16(masks_b8); + + highbd_blend_a64_d16_mask_w16_avx2( + dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride, + &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max); + } + dst += dst_stride * 2; + src0 += src0_stride * 2; +
src1 += src1_stride * 2; + mask += mask_stride * 2; + } +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, int w, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; i += 2) { + for (int j = 0; j < w; j += 16) { + // Load 32x u8 alpha-mask values from each of four rows + // (saturating) add pairs of rows, then use madd to add adjacent values + // Finally, divide down each result with rounding + const __m256i m0 = yy_loadu_256(mask + 0 * mask_stride + 2 * j); + const __m256i m1 = yy_loadu_256(mask + 1 * mask_stride + 2 * j); + const __m256i m2 = yy_loadu_256(mask + 2 * mask_stride + 2 * j); + const __m256i m3 = yy_loadu_256(mask + 3 * mask_stride + 2 * j); + + const __m256i m01_8 = _mm256_adds_epu8(m0, m1); + const __m256i m23_8 = _mm256_adds_epu8(m2, m3); + + const __m256i m01 = _mm256_maddubs_epi16(m01_8, one_b); + const __m256i m23 = _mm256_maddubs_epi16(m23_8, one_b); + + const __m256i mask0a = _mm256_srli_epi16(_mm256_add_epi16(m01, two_w), 2); + const __m256i mask0b = _mm256_srli_epi16(_mm256_add_epi16(m23, two_w), 2); + + highbd_blend_a64_d16_mask_w16_avx2( + dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride, + &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max); + } + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 4; + } +} + +void aom_highbd_blend_a64_d16_mask_avx2( + uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params, const int bd) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int32_t round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + const __m256i v_round_offset = _mm256_set1_epi32(round_offset); + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + + const __m256i clip_low = _mm256_set1_epi16(0); + const __m256i clip_high = _mm256_set1_epi16((1 << bd) - 1); + const __m256i mask_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >= 16 + highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + + 
} else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >= 16 + highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + } else { + // Sub-sampling in only one axis doesn't seem to happen very much, so fall + // back to the vanilla C implementation instead of having all the optimised + // code for these. + aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, subw, + subh, conv_params, bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c b/libs/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c new file mode 100644 index 000000000..4a368ef94 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c @@ -0,0 +1,1560 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <smmintrin.h>  // SSE4.1 + +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/blend_sse4.h" +#include "aom_dsp/x86/blend_mask_sse4.h" + +#include "config/aom_dsp_rtcd.h" + +////////////////////////////////////////////////////////////////////////////// +// No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int w, int h) { + (void)w; + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int w, int h) { + (void)w; + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_m0_b = xx_loadl_64(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_m0_b = xx_loadu_128(mask + c); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sx_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = 
_mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_r_b = xx_loadu_128(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_r0_b = xx_loadu_128(mask + 2 * c); + const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b); + const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sy_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); 
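+ // One output row consumes two mask rows (2:1 vertical subsampling).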
+ + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sy_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sy_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_ra_b = xx_loadu_128(mask + c); + const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal and Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sx_sy_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + (void)w; + + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_sy_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + 
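+ // Combined 2x2 mask downsample: add the two mask rows, gather even and odd + // columns with the shuffle, sum those, then divide by 4 with rounding + // ((x + 2) >> 2) before blending.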
const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + (void)w; + + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_sy_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_ral_b = xx_loadu_128(mask + 2 * c); + const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16); + const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c); + const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16); + const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b); + const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b); + const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b); + const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b); + const __m128i v_rvsbl_w = + _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b); + const __m128i v_rvsbh_w = + _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b); + const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w); + const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w); + + const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2); + const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subw, int subh) { + typedef void (*blend_fn)( + uint8_t * dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h); + + // Dimensions are: width_index X subx X suby + static const blend_fn 
blend[3][2][2] = { + { // w % 16 == 0 + { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 }, + { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 }, + { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } }, + { // w == 8 + { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 }, + { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } } + }; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h, subw, subh); + } else { + blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0, + src0_stride, src1, src1_stride, + mask, mask_stride, w, h); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, blend_4_b10); +} + +static void blend_a64_mask_b12_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, + blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_m0_b = xx_loadl_64(mask + c); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, 
const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b10); +} + +static void blend_a64_mask_b12_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, + blend_unit_fn blend) { + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_r_b = xx_loadu_128(mask + 2 * c); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t 
src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b10); +} + +static void blend_a64_mask_b12_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, + blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_ra_b = xx_loadl_64(mask + c); + const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + 
src1_stride, mask, mask_stride, w, h, + blend_8_b10); +} + +static void blend_a64_mask_b12_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal and Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, + blend_unit_fn blend) { + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_ra_b = xx_loadu_128(mask + 2 * c); + const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 
+ c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b10); +} + +static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// +void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, + uint32_t src0_stride, + const uint8_t *src1_8, + uint32_t src1_stride, const uint8_t *mask, + uint32_t mask_stride, int w, int h, + int subw, int subh, int bd) { + typedef void (*blend_fn)( + uint16_t * dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h); + + // Dimensions are: bd_index X width_index X subw X subh + static const blend_fn blend[2][2][2][2] = { + { // bd == 8 or 10 + { // w % 8 == 0 + { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 }, + { blend_a64_mask_b10_sx_w8n_sse4_1, + blend_a64_mask_b10_sx_sy_w8n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 }, + { blend_a64_mask_b10_sx_w4_sse4_1, + blend_a64_mask_b10_sx_sy_w4_sse4_1 } } }, + { // bd == 12 + { // w % 8 == 0 + { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 }, + { blend_a64_mask_b12_sx_w8n_sse4_1, + blend_a64_mask_b12_sx_sy_w8n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 }, + { blend_a64_mask_b12_sx_w4_sse4_1, + blend_a64_mask_b12_sx_sy_w4_sse4_1 } } } + }; + + assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); + assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, mask_stride, w, h, subw, + subh, bd); + } else { + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); + + blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0]( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, w, h); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void blend_a64_d16_mask_w16_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset, + const __m128i *v_maxval, int shift) { + const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0); + const 
__m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1); + const __m128i s0_0 = xx_loadu_128(src0); + const __m128i s0_1 = xx_loadu_128(src0 + 8); + const __m128i s1_0 = xx_loadu_128(src1); + const __m128i s1_1 = xx_loadu_128(src1 + 8); + __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0), + _mm_unpacklo_epi16(*m0, max_minus_m0)); + __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0), + _mm_unpackhi_epi16(*m0, max_minus_m0)); + __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1), + _mm_unpacklo_epi16(*m1, max_minus_m1)); + __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1), + _mm_unpackhi_epi16(*m1, max_minus_m1)); + res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift); + res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift); + res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift); + res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift); + const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi); + const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi); + const __m128i res = _mm_packus_epi16(res0, res1); + + _mm_storeu_si128((__m128i *)(dst), res); +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m = xx_loadu_128(mask + j); + const __m128i m0 = _mm_cvtepu8_epi16(m); + const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8)); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); + const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); + + const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); + const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); + const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); + const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); + const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); + const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t 
src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b); + const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b); + const __m128i m0 = _mm_avg_epu16(m0_ac, zeros); + const __m128i m1 = _mm_avg_epu16(m1_ac, zeros); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + j); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); + + const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); + const __m128i m0 = _mm_cvtepu8_epi16(m_ac); + const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8)); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +void aom_lowbd_blend_a64_d16_mask_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + const int bd = 8; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + const int round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + const __m128i v_round_offset = _mm_set1_epi32(round_offset); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; + } + + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 
+ mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; + } + } else { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; + } + } +} + +////////////////////////////////////////////////////////////////////////////// +// aom_highbd_blend_a64_d16_mask_sse4_1() +////////////////////////////////////////////////////////////////////////////// +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_blend_a64_d16_mask_w4_sse4_1( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a, + const __m128i *mask0b, const __m128i *round_offset, int shift, + const __m128i *clip_low, const __m128i *clip_high, + const __m128i *mask_max) { + // Load 4 pixels from each of 4 rows from each source + const __m128i s0a = + _mm_set_epi64x(*(uint64_t *)src0, *(uint64_t *)(src0 + src0_stride)); + const __m128i s0b = _mm_set_epi64x(*(uint64_t *)(src0 + 2 * src0_stride), + *(uint64_t *)(src0 + 3 * src0_stride)); + const __m128i s1a = + _mm_set_epi64x(*(uint64_t *)(src1), *(uint64_t *)(src1 + src1_stride)); + const __m128i s1b = _mm_set_epi64x(*(uint64_t *)(src1 + 2 * src1_stride), + *(uint64_t *)(src1 + 3 * src1_stride)); + + // Generate the inverse masks + const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a); + const __m128i mask1b = _mm_sub_epi16(*mask_max, *mask0b); + + // Multiply each mask by the respective source + const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a); + const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a); + const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs); + const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a); + const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a); + const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b); + const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b); + const __m128i mul0bh = 
_mm_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs); + const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b); + const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b); + const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah); + const __m128i sumal = _mm_add_epi32(mul0al, mul1al); + const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh); + const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl); + + const __m128i roundah = + _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift); + const __m128i roundbh = + _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift); + const __m128i roundal = + _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift); + const __m128i roundbl = + _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift); + + const __m128i packa = _mm_packs_epi32(roundal, roundah); + const __m128i packb = _mm_packs_epi32(roundbl, roundbh); + + const __m128i clipa = + _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high); + const __m128i clipb = + _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high); + + xx_storel_64(dst, _mm_srli_si128(clipa, 8)); + xx_storel_64(dst + dst_stride, clipa); + xx_storel_64(dst + 2 * dst_stride, _mm_srli_si128(clipb, 8)); + xx_storel_64(dst + 3 * dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + do { + const __m128i mask0a8 = _mm_set_epi32(0, 0, *(uint32_t *)mask, + *(uint32_t *)(mask + mask_stride)); + const __m128i mask0b8 = + _mm_set_epi32(0, 0, *(uint32_t *)(mask + 2 * mask_stride), + *(uint32_t *)(mask + 3 * mask_stride)); + const __m128i mask0a = _mm_cvtepu8_epi16(mask0a8); + const __m128i mask0b = _mm_cvtepu8_epi16(mask0b8); + + highbd_blend_a64_d16_mask_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, + round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 4; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + do { + // Load 8 pixels from each of 8 rows of mask, + // (saturating) add together rows then use madd to add adjacent pixels + // Finally, divide each value by 4 (with rounding) + const __m128i m02 = _mm_set_epi64x(*(uint64_t *)(mask), + *(uint64_t *)(mask + 2 * mask_stride)); + const __m128i m13 = _mm_set_epi64x(*(uint64_t *)(mask + mask_stride), + *(uint64_t *)(mask + 3 * mask_stride)); + const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b); + const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2); + const __m128i m46 = _mm_set_epi64x(*(uint64_t *)(mask + 4 * 
mask_stride), + *(uint64_t *)(mask + 6 * mask_stride)); + const __m128i m57 = _mm_set_epi64x(*(uint64_t *)(mask + 5 * mask_stride), + *(uint64_t *)(mask + 7 * mask_stride)); + const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b); + const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2); + + highbd_blend_a64_d16_mask_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a, + &mask_0b, round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 8; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_w8_sse4_1( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a, + const __m128i *mask0b, const __m128i *round_offset, int shift, + const __m128i *clip_low, const __m128i *clip_high, + const __m128i *max_mask) { + // Load 8x pixels from each of 2 rows from each source + const __m128i s0a = xx_loadu_128(src0); + const __m128i s0b = xx_loadu_128(src0 + src0_stride); + const __m128i s1a = xx_loadu_128(src1); + const __m128i s1b = xx_loadu_128(src1 + src1_stride); + + // Generate inverse masks + const __m128i mask1a = _mm_sub_epi16(*max_mask, *mask0a); + const __m128i mask1b = _mm_sub_epi16(*max_mask, *mask0b); + + // Multiply sources by respective masks + const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a); + const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a); + const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs); + + const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a); + const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a); + const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah); + const __m128i sumal = _mm_add_epi32(mul0al, mul1al); + + const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b); + const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b); + const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs); + + const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b); + const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b); + const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh); + const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl); + + const __m128i roundah = + _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift); + const __m128i roundal = + _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift); + const __m128i roundbh = + _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift); + const __m128i roundbl = + _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift); + + const __m128i packa = _mm_packs_epi32(roundal, roundah); + const __m128i clipa = + _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high); + const __m128i packb = _mm_packs_epi32(roundbl, roundbh); + const __m128i clipb = + _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high); + + xx_storeu_128(dst, clipa); + xx_storeu_128(dst + dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *max_mask) { + do { + const __m128i mask0a = _mm_cvtepu8_epi16(xx_loadl_64(mask)); + const __m128i mask0b = _mm_cvtepu8_epi16(xx_loadl_64(mask + mask_stride)); + highbd_blend_a64_d16_mask_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, + round_offset, shift, clip_low, clip_high, max_mask); + + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 2; + } while (h -= 2); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *max_mask) { + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + do { + const __m128i mask_thisrowa = xx_loadu_128(mask); + const __m128i mask_nextrowa = xx_loadu_128(mask + mask_stride); + const __m128i mask_thisrowb = xx_loadu_128(mask + 2 * mask_stride); + const __m128i mask_nextrowb = xx_loadu_128(mask + 3 * mask_stride); + const __m128i mask_bothrowsa = _mm_adds_epu8(mask_thisrowa, mask_nextrowa); + const __m128i mask_bothrowsb = _mm_adds_epu8(mask_thisrowb, mask_nextrowb); + const __m128i mask_16a = _mm_maddubs_epi16(mask_bothrowsa, one_b); + const __m128i mask_16b = _mm_maddubs_epi16(mask_bothrowsb, one_b); + const __m128i mask_sa = _mm_srli_epi16(_mm_add_epi16(mask_16a, two_w), 2); + const __m128i mask_sb = _mm_srli_epi16(_mm_add_epi16(mask_16b, two_w), 2); + + highbd_blend_a64_d16_mask_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_sa, + &mask_sb, round_offset, shift, clip_low, clip_high, max_mask); + + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 4; + } while (h -= 2); +} + +static INLINE void highbd_blend_a64_d16_mask_w16_sse4_1( + uint16_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *round_offset, int shift, const __m128i *mask0l, + const __m128i *mask0h, const __m128i *clip_low, const __m128i *clip_high, + const __m128i *mask_max) { + // Load 16x u16 pixels for this row from each src + const __m128i s0l = xx_loadu_128(src0); + const __m128i s0h = xx_loadu_128(src0 + 8); + const __m128i s1l = xx_loadu_128(src1); + const __m128i s1h = xx_loadu_128(src1 + 8); + + // Calculate inverse masks + const __m128i mask1h = _mm_sub_epi16(*mask_max, *mask0h); + const __m128i mask1l = _mm_sub_epi16(*mask_max, *mask0l); + + const __m128i mul0_highs = _mm_mulhi_epu16(*mask0h, s0h); + const __m128i mul0_lows = _mm_mullo_epi16(*mask0h, s0h); + const __m128i mul0h = _mm_unpackhi_epi16(mul0_lows, mul0_highs); + const __m128i mul0l = _mm_unpacklo_epi16(mul0_lows, mul0_highs); + + const __m128i mul1_highs = _mm_mulhi_epu16(mask1h, s1h); + const __m128i mul1_lows = _mm_mullo_epi16(mask1h, s1h); + const __m128i mul1h = _mm_unpackhi_epi16(mul1_lows, mul1_highs); + const __m128i mul1l = _mm_unpacklo_epi16(mul1_lows, mul1_highs); + + const __m128i mulhh = _mm_add_epi32(mul0h, mul1h); + const __m128i mulhl = _mm_add_epi32(mul0l, mul1l); + + const __m128i mul2_highs = _mm_mulhi_epu16(*mask0l, s0l); + const __m128i 
mul2_lows = _mm_mullo_epi16(*mask0l, s0l); + const __m128i mul2h = _mm_unpackhi_epi16(mul2_lows, mul2_highs); + const __m128i mul2l = _mm_unpacklo_epi16(mul2_lows, mul2_highs); + + const __m128i mul3_highs = _mm_mulhi_epu16(mask1l, s1l); + const __m128i mul3_lows = _mm_mullo_epi16(mask1l, s1l); + const __m128i mul3h = _mm_unpackhi_epi16(mul3_lows, mul3_highs); + const __m128i mul3l = _mm_unpacklo_epi16(mul3_lows, mul3_highs); + + const __m128i mullh = _mm_add_epi32(mul2h, mul3h); + const __m128i mulll = _mm_add_epi32(mul2l, mul3l); + + const __m128i reshh = + _mm_srai_epi32(_mm_sub_epi32(mulhh, *round_offset), shift); + const __m128i reshl = + _mm_srai_epi32(_mm_sub_epi32(mulhl, *round_offset), shift); + const __m128i reslh = + _mm_srai_epi32(_mm_sub_epi32(mullh, *round_offset), shift); + const __m128i resll = + _mm_srai_epi32(_mm_sub_epi32(mulll, *round_offset), shift); + + // Signed saturating pack from i32 to i16: + const __m128i packh = _mm_packs_epi32(reshl, reshh); + const __m128i packl = _mm_packs_epi32(resll, reslh); + + // Clip the values to the valid range + const __m128i cliph = + _mm_min_epi16(_mm_max_epi16(packh, *clip_low), *clip_high); + const __m128i clipl = + _mm_min_epi16(_mm_max_epi16(packl, *clip_low), *clip_high); + + // Store 16 pixels + xx_storeu_128(dst, clipl); + xx_storeu_128(dst + 8, cliph); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + for (int i = 0; i < h; i++) { + for (int j = 0; j < w; j += 16) { + // Load 16x u8 alpha-mask values and pad to u16 + const __m128i masks_u8 = xx_loadu_128(mask + j); + const __m128i mask0l = _mm_cvtepu8_epi16(masks_u8); + const __m128i mask0h = _mm_cvtepu8_epi16(_mm_srli_si128(masks_u8, 8)); + + highbd_blend_a64_d16_mask_w16_sse4_1( + dst + j, src0 + j, src1 + j, round_offset, shift, &mask0l, &mask0h, + clip_low, clip_high, mask_max); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; i++) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); + const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); + + const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); + const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); + const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); + const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); + const __m128i mask_l = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); + const __m128i mask_h = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); + + highbd_blend_a64_d16_mask_w16_sse4_1( + dst + j, src0 + j, src1 + j, round_offset, shift, &mask_l, &mask_h, 
+ clip_low, clip_high, mask_max); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride * 2; + } +} + +void aom_highbd_blend_a64_d16_mask_sse4_1( + uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params, const int bd) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int32_t round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + const __m128i v_round_offset = _mm_set1_epi32(round_offset); + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + + const __m128i clip_low = _mm_set1_epi16(0); + const __m128i clip_high = _mm_set1_epi16((1 << bd) - 1); + const __m128i mask_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >=16 + highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >=16 + highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + } else { + // Sub-sampling in only one axis doesn't seem to happen very much, so fall + // back to the vanilla C implementation instead of having all the optimised + // code for these. + aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, subw, + subh, conv_params, bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c b/libs/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c new file mode 100644 index 000000000..75fb1c5a9 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <smmintrin.h> // SSE4.1 + +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/blend_sse4.h" + +#include "config/aom_dsp_rtcd.h" + +////////////////////////////////////////////////////////////////////////////// +// Implementation - No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, + uint32_t src0_stride, + const uint8_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + for (c = 0; c < w; c += 16) { + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h); + + // Dimension: width_index + static const 
blend_fn blend[9] = { + blend_a64_vmask_w16n_sse4_1, // w % 16 == 0 + aom_blend_a64_vmask_c, // w == 1 + aom_blend_a64_vmask_c, // w == 2 + NULL, // INVALID + blend_a64_vmask_w4_sse4_1, // w == 4 + NULL, // INVALID + NULL, // INVALID + NULL, // INVALID + blend_a64_vmask_w8_sse4_1, // w == 8 + }; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w, + h); +} + +#if CONFIG_AV1_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// Implementation - No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_vmask_bn_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + (void)w; + blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, blend_4_b10); +} + +static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + (void)w; + blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, blend_4_b12); +} + +static INLINE void blend_a64_vmask_bn_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + for (c = 0; c < w; c += 8) { + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, w, h, blend_8_b10); +} + +static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, w, h, blend_8_b12); +} + 
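+// Note: the b10 and b12 kernels above differ only in the blend_unit_fn they
+// pass down (blend_4_b10/blend_8_b10 vs blend_4_b12/blend_8_b12); 8-bit
+// input shares the b10 kernels, as the dispatch below indexes on bd == 12
+// only. A minimal usage sketch, assuming hypothetical 10-bit buffers held
+// as uint16_t behind CONVERT_TO_BYTEPTR():
+//   aom_highbd_blend_a64_vmask_sse4_1(CONVERT_TO_BYTEPTR(dst16), dst_stride,
+//                                     CONVERT_TO_BYTEPTR(p0), stride0,
+//                                     CONVERT_TO_BYTEPTR(p1), stride1,
+//                                     vmask, w, h, 10);
+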
+////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_highbd_blend_a64_vmask_sse4_1( + uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, + uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int w, int h, int bd) { + typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h); + + // Dimensions are: bd_index X width_index + static const blend_fn blend[2][2] = { + { + // bd == 8 or 10 + blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b10_w4_sse4_1, // w == 4 + }, + { + // bd == 12 + blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b12_w4_sse4_1, // w == 4 + } + }; + + assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); + assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, w, h, bd); + } else { + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); + + blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, w, h); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/x86/blend_mask_sse4.h b/libs/libaom/src/aom_dsp/x86/blend_mask_sse4.h new file mode 100644 index 000000000..c071fdcfc --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/blend_mask_sse4.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ +#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ +#include <smmintrin.h> // SSE4.1 + +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" + +#include "config/aom_dsp_rtcd.h" + +static INLINE void blend_a64_d16_mask_w4_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, + int shift) { + const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); + const __m128i s0 = xx_loadl_64(src0); + const __m128i s1 = xx_loadl_64(src1); + const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); + const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); + const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); + const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset); + const __m128i res_d = _mm_srai_epi32(res_c, shift); + const __m128i res_e = _mm_packs_epi32(res_d, res_d); + const __m128i res = _mm_packus_epi16(res_e, res_e); + + xx_storel_32(dst, res); +} + +static INLINE void blend_a64_d16_mask_w8_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, + int shift) { + const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); + const __m128i s0 = xx_loadu_128(src0); + const __m128i s1 = xx_loadu_128(src1); + __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1), + _mm_unpacklo_epi16(*m, max_minus_m)); + __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1), + _mm_unpackhi_epi16(*m, max_minus_m)); + res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift); + res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift); + const __m128i res_e = _mm_packs_epi32(res_lo, res_hi); + const __m128i res = _mm_packus_epi16(res_e, res_e); + + _mm_storel_epi64((__m128i *)(dst), res); +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m0 = xx_loadl_32(mask); + const __m128i m = _mm_cvtepu8_epi16(m0); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m0 = xx_loadl_64(mask); + const __m128i m = _mm_cvtepu8_epi16(m0); + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const 
uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); + const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); + const __m128i m = _mm_srli_epi16(m_acbd_2, 2); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadu_128(mask); + const __m128i m_i1 = xx_loadu_128(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); + const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); + const __m128i m = _mm_srli_epi16(m_acbd_2, 2); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m = _mm_avg_epu16(m_ac, zeros); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadu_128(mask); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m = _mm_avg_epu16(m_ac, zeros); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} +#endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ diff --git a/libs/libaom/src/aom_dsp/x86/blend_sse4.h b/libs/libaom/src/aom_dsp/x86/blend_sse4.h new file mode 100644 index 000000000..8d9b32510 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/blend_sse4.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
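
// Scalar model of the d16 mask blend kernels in this header (a sketch, not
// part of the patch). AOM_BLEND_A64_MAX_ALPHA is 64, so the weights m and
// 64 - m always sum to 64; round_offset and shift fold the compound (d16)
// rounding into one final shift, and the packs/packus pair clamps to 8 bits.
#include <stdint.h>

static uint8_t d16_mask_blend_pixel(int32_t s0, int32_t s1, int m,
                                    int32_t round_offset, int shift) {
  int32_t v = (s0 * m + s1 * (64 - m) - round_offset) >> shift;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// Mask downsampling used by the subw/subh variants, with the same rounding
// the SIMD code gets from maddubs/adds plus the avg-with-zero trick:
static int mask_from_2x2(int m00, int m01, int m10, int m11) {
  return (m00 + m01 + m10 + m11 + 2) >> 2;  // subw == 1 && subh == 1
}
static int mask_from_2x1(int ma, int mb) {
  return (ma + mb + 1) >> 1;  // exactly one of subw/subh is 1
}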
+ */ + +#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_ +#define AOM_AOM_DSP_X86_BLEND_SSE4_H_ + +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" +static const uint8_t g_blend_a64_mask_shuffle[32] = { + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, +}; + +////////////////////////////////////////////////////////////////////////////// +// Common kernels +////////////////////////////////////////////////////////////////////////////// + +static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_w, const __m128i *v_m1_w) { + const __m128i v_s0_b = xx_loadl_32(src0); + const __m128i v_s1_b = xx_loadl_32(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_w, const __m128i *v_m1_w) { + const __m128i v_s0_b = xx_loadl_64(src0); + const __m128i v_s1_b = xx_loadl_64(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadl_32(src0); + const __m128i v_s1_b = xx_loadl_32(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); + return v_res; +} + +static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadl_64(src0); + const __m128i v_s1_b = xx_loadl_64(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); + return v_res; +} + +static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadu_128(src0); + const __m128i v_s1_b = xx_loadu_128(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b), + _mm_unpackhi_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w); + return v_res; +} + +typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w); + +static INLINE 
__m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadl_64(src0); + const __m128i v_s1_w = xx_loadl_64(src1); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadu_128(src0); + const __m128i v_s1_w = xx_loadu_128(src1); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadl_64(src0); + const __m128i v_s1_w = xx_loadl_64(src1); + + // Interleave + const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); + const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); + + // Multiply-Add + const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w); + + // Scale + const __m128i v_ssum_d = + _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1); + + // Pack + const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d); + + // Round + const __m128i v_res_w = xx_round_epu16(v_pssum_d); + + return v_res_w; +} + +static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadu_128(src0); + const __m128i v_s1_w = xx_loadu_128(src1); + + // Interleave + const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); + const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w); + const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); + const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w); + + // Multiply-Add + const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w); + const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w); + + // Scale + const __m128i v_ssuml_d = + _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1); + const __m128i v_ssumh_d = + _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1); + + // Pack + const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d); + + // Round + const __m128i v_res_w = xx_round_epu16(v_pssum_d); + + return v_res_w; +} + +#endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_ diff --git a/libs/libaom/src/aom_dsp/x86/blk_sse_sum_avx2.c b/libs/libaom/src/aom_dsp/x86/blk_sse_sum_avx2.c new file mode 100644 index 000000000..f7c0eb037 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/blk_sse_sum_avx2.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
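
// Scalar model of blend_4/blend_8 and the b10/b12 variants in this header
// (a sketch, not part of the patch). AOM_BLEND_A64_ROUND_BITS is 6 and the
// mask weights satisfy m0 + m1 = AOM_BLEND_A64_MAX_ALPHA = 64, so every
// kernel computes a 64-weighted average with round-to-nearest:
#include <stdint.h>

static uint16_t blend_a64_pixel(uint16_t s0, uint16_t s1, int m0, int m1) {
  return (uint16_t)((s0 * m0 + s1 * m1 + 32) >> 6);
}

// The b12 kernels reach the same result in two steps: a shift by
// AOM_BLEND_A64_ROUND_BITS - 1 on the 32-bit madd sums followed by an
// averaging round by 1, so the intermediate fits in 16 bits before packing.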
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void accumulate_sse_sum(__m256i regx_sum, __m256i regx2_sum,
+                                      int *x_sum, int64_t *x2_sum) {
+  __m256i sum_buffer, sse_buffer;
+  __m128i out_buffer;
+
+  // Accumulate the various elements of the register into its first element.
+  sum_buffer = _mm256_permute2f128_si256(regx_sum, regx_sum, 1);
+  regx_sum = _mm256_add_epi32(sum_buffer, regx_sum);
+  regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 8));
+  regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 4));
+
+  sse_buffer = _mm256_permute2f128_si256(regx2_sum, regx2_sum, 1);
+  regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum);
+  regx2_sum = _mm256_add_epi64(regx2_sum, _mm256_srli_si256(regx2_sum, 8));
+
+  out_buffer = _mm256_castsi256_si128(regx_sum);
+  *x_sum += _mm_cvtsi128_si32(out_buffer);
+  out_buffer = _mm256_castsi256_si128(regx2_sum);
+#if ARCH_X86_64
+  *x2_sum += _mm_cvtsi128_si64(out_buffer);
+#else
+  {
+    int64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, out_buffer);
+    *x2_sum += tmp;
+  }
+#endif
+}
+
+static INLINE void sse_sum_wd4_avx2(const int16_t *data, int stride, int bh,
+                                    int *x_sum, int64_t *x2_sum) {
+  __m128i row1, row2, row3;
+  __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer,
+      temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer;
+  const int16_t *data_tmp = data;
+  __m256i one = _mm256_set1_epi16(1);
+  regx_sum = _mm256_setzero_si256();
+  regx2_sum = regx_sum;
+  sum_buffer = _mm256_setzero_si256();
+  sse_buffer = sum_buffer;
+
+  for (int j = 0; j < (bh >> 2); ++j) {
+    // Load 4 rows at a time.
+    row1 = _mm_loadl_epi64((__m128i const *)(data_tmp));
+    row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + stride));
+    row1 = _mm_unpacklo_epi64(row1, row2);
+    row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + 2 * stride));
+    row3 = _mm_loadl_epi64((__m128i const *)(data_tmp + 3 * stride));
+    row2 = _mm_unpacklo_epi64(row2, row3);
+    load_pixels =
+        _mm256_insertf128_si256(_mm256_castsi128_si256(row1), row2, 1);
+
+    row_sum_buffer = _mm256_madd_epi16(load_pixels, one);
+    row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels);
+    sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer);
+    sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer);
+    data_tmp += 4 * stride;
+  }
+
+  // To prevent 32-bit variable overflow, unpack the elements to 64-bit.
+  temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256());
+  temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256());
+  sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2);
+  regx_sum = _mm256_add_epi32(sum_buffer, regx_sum);
+  regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum);
+
+  accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum);
+}
+
+static INLINE void sse_sum_wd8_avx2(const int16_t *data, int stride, int bh,
+                                    int *x_sum, int64_t *x2_sum) {
+  __m128i load_128bit, load_next_128bit;
+  __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer,
+      temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer;
+  const int16_t *data_tmp = data;
+  __m256i one = _mm256_set1_epi16(1);
+  regx_sum = _mm256_setzero_si256();
+  regx2_sum = regx_sum;
+  sum_buffer = _mm256_setzero_si256();
+  sse_buffer = sum_buffer;
+
+  for (int j = 0; j < (bh >> 1); ++j) {
+    // Load 2 rows at a time.
+ load_128bit = _mm_loadu_si128((__m128i const *)(data_tmp)); + load_next_128bit = _mm_loadu_si128((__m128i const *)(data_tmp + stride)); + load_pixels = _mm256_insertf128_si256(_mm256_castsi128_si256(load_128bit), + load_next_128bit, 1); + + row_sum_buffer = _mm256_madd_epi16(load_pixels, one); + row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels); + sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer); + sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer); + data_tmp += 2 * stride; + } + + temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256()); + temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256()); + sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2); + regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); + + accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); +} + +static INLINE void sse_sum_wd16_avx2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum, + int loop_count) { + __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, + temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer; + const int16_t *data_tmp = data; + __m256i one = _mm256_set1_epi16(1); + regx_sum = _mm256_setzero_si256(); + regx2_sum = regx_sum; + sum_buffer = _mm256_setzero_si256(); + sse_buffer = sum_buffer; + + for (int i = 0; i < loop_count; ++i) { + data_tmp = data + 16 * i; + for (int j = 0; j < bh; ++j) { + load_pixels = _mm256_lddqu_si256((__m256i const *)(data_tmp)); + + row_sum_buffer = _mm256_madd_epi16(load_pixels, one); + row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels); + sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer); + sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer); + data_tmp += stride; + } + } + + temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256()); + temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256()); + sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2); + regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); + + accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); +} + +void aom_get_blk_sse_sum_avx2(const int16_t *data, int stride, int bw, int bh, + int *x_sum, int64_t *x2_sum) { + *x_sum = 0; + *x2_sum = 0; + + if ((bh & 3) == 0) { + switch (bw) { + // For smaller block widths, compute multiple rows simultaneously. + case 4: sse_sum_wd4_avx2(data, stride, bh, x_sum, x2_sum); break; + case 8: sse_sum_wd8_avx2(data, stride, bh, x_sum, x2_sum); break; + case 16: + case 32: + sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4); + break; + case 64: + // 32-bit variables will overflow for 64 rows at a single time, so + // compute 32 rows at a time. + if (bh <= 32) { + sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4); + } else { + sse_sum_wd16_avx2(data, stride, 32, x_sum, x2_sum, bw >> 4); + sse_sum_wd16_avx2(data + 32 * stride, stride, 32, x_sum, x2_sum, + bw >> 4); + } + break; + + default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); + } + } else { + aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); + } +} diff --git a/libs/libaom/src/aom_dsp/x86/blk_sse_sum_sse2.c b/libs/libaom/src/aom_dsp/x86/blk_sse_sum_sse2.c new file mode 100644 index 000000000..ef0a024ee --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/blk_sse_sum_sse2.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void sse_sum_wd4_sse2(const int16_t *data, int stride, int bh,
+                                    int *x_sum, int64_t *x2_sum) {
+  const int16_t *data_tmp = data;
+  __m128i temp_buffer1, temp_buffer2;
+  __m128i load_pixels_low, load_pixels_hi, sum_buffer, sse_buffer;
+  __m128i one = _mm_set1_epi16(1);
+  __m128i regx_sum = _mm_setzero_si128();
+  __m128i regx2_sum = regx_sum;
+
+  for (int j = 0; j < (bh >> 1); ++j) {
+    // Load 2 rows (8 pixels) at a time.
+    load_pixels_low = _mm_loadl_epi64((__m128i const *)(data_tmp));
+    load_pixels_hi = _mm_loadl_epi64((__m128i const *)(data_tmp + stride));
+    load_pixels_low = _mm_unpacklo_epi64(load_pixels_low, load_pixels_hi);
+    sum_buffer = _mm_madd_epi16(load_pixels_low, one);
+    sse_buffer = _mm_madd_epi16(load_pixels_low, load_pixels_low);
+    regx_sum = _mm_add_epi32(sum_buffer, regx_sum);
+    regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum);
+    data_tmp += 2 * stride;
+  }
+
+  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8));
+  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4));
+  *x_sum = _mm_cvtsi128_si32(regx_sum);
+  temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128());
+  temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128());
+  regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2);
+  regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8));
+#if ARCH_X86_64
+  *x2_sum += _mm_cvtsi128_si64(regx2_sum);
+#else
+  {
+    int64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, regx2_sum);
+    *x2_sum += tmp;
+  }
+#endif
+}
+
+static INLINE void sse_sum_wd8_sse2(const int16_t *data, int stride, int bh,
+                                    int *x_sum, int64_t *x2_sum,
+                                    int loop_cycles) {
+  const int16_t *data_tmp;
+  __m128i temp_buffer1, temp_buffer2;
+  __m128i one = _mm_set1_epi16(1);
+  __m128i regx_sum = _mm_setzero_si128();
+  __m128i regx2_sum = regx_sum;
+  __m128i load_pixels, sum_buffer, sse_buffer;
+
+  for (int i = 0; i < loop_cycles; ++i) {
+    data_tmp = data + (8 * i);
+    for (int j = 0; j < bh; ++j) {
+      // Load 1 row (8 pixels) at a time.
+      load_pixels = _mm_loadu_si128((__m128i const *)(data_tmp));
+      sum_buffer = _mm_madd_epi16(load_pixels, one);
+      sse_buffer = _mm_madd_epi16(load_pixels, load_pixels);
+      regx_sum = _mm_add_epi32(sum_buffer, regx_sum);
+      regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum);
+      data_tmp += stride;
+    }
+  }
+
+  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8));
+  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4));
+  *x_sum += _mm_cvtsi128_si32(regx_sum);
+  temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128());
+  temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128());
+  regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2);
+  regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8));
+#if ARCH_X86_64
+  *x2_sum += _mm_cvtsi128_si64(regx2_sum);
+#else
+  {
+    int64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, regx2_sum);
+    *x2_sum += tmp;
+  }
+#endif
+}
+
+// This function adds SSE2 support for the function 'aom_get_blk_sse_sum_c'.
+void aom_get_blk_sse_sum_sse2(const int16_t *data, int stride, int bw, int bh,
+                              int *x_sum, int64_t *x2_sum) {
+  *x_sum = 0;
+  *x2_sum = 0;
+
+  if ((bh & 3) == 0) {
+    switch (bw) {
+      case 4: sse_sum_wd4_sse2(data, stride, bh, x_sum, x2_sum); break;
+      case 8:
+      case 16:
+        sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
+        break;
+      // For widths 32 and 64, the registers may overflow. So compute
+      // partial widths at a time.
+      case 32:
+        if (bh <= 32) {
+          sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
+          break;
+        } else {
+          sse_sum_wd8_sse2(data, stride, 32, x_sum, x2_sum, bw >> 3);
+          sse_sum_wd8_sse2(data + 32 * stride, stride, 32, x_sum, x2_sum,
+                           bw >> 3);
+          break;
+        }
+
+      case 64:
+        if (bh <= 16) {
+          sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
+          break;
+        } else {
+          for (int i = 0; i < bh; i += 16)
+            sse_sum_wd8_sse2(data + i * stride, stride, 16, x_sum, x2_sum,
+                             bw >> 3);
+          break;
+        }
+
+      default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+    }
+  } else {
+    aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+  }
+}
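
// Plain-C reference for what the SSE2 path above (and the AVX2 path in
// blk_sse_sum_avx2.c) computes; this mirrors the aom_get_blk_sse_sum_c
// fallback and is included here only as a readable sketch.
#include <stdint.h>

static void blk_sse_sum_ref(const int16_t *data, int stride, int bw, int bh,
                            int *x_sum, int64_t *x2_sum) {
  *x_sum = 0;
  *x2_sum = 0;
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) {
      const int v = data[r * stride + c];
      *x_sum += v;
      *x2_sum += (int64_t)v * v;
    }
  }
}

// The width/height splitting in the SIMD versions exists because squared
// 16-bit samples approach 2^30, so only a limited number of per-pixel
// products can accumulate in a 32-bit lane before widening to 64 bits.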
diff --git a/libs/libaom/src/aom_dsp/x86/common_avx2.h b/libs/libaom/src/aom_dsp/x86/common_avx2.h
new file mode 100644
index 000000000..96fe4ebb6
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/common_avx2.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_
+#define AOM_AOM_DSP_X86_COMMON_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+// Note: in and out could have the same value
+static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
+  __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+  __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
+  __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
+  __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+  __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+  __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
+  __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
+  __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+  __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
+  __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
+  __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
+  __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
+  __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
+  __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
+  __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
+  __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
+
+  // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
+  // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
+  // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
+  // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
+  // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
+  // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
+  // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
+  // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
+
+  // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
+  // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
+  // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
+  // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
+  // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
+  // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
+  // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
+  // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
+
+  __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
+  __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
+  __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
+  __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
+  __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
+  __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
+  __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
+  __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
+
+  __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
+  __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
+  __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
+  __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
+  __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
+  __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
+  __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
+  __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
+
+  // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
+  // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
+  // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
+  // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
+  // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
+  // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
+  // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
+  // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
+
+  // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
+  // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
+  // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
+  // 86 96 a6 b6 87 97 a7 b7 8e 9e ae be 8f 9f af bf
+  // c0 d0 e0 f0 c1 d1 e1
f1 c8 d8 e8 f8 c9 d9 e9 f9 + // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb + // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd + // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff + + tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); + tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); + tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5); + tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5); + tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6); + tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6); + tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); + tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); + + tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c); + tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c); + tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d); + tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d); + tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e); + tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e); + tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f); + tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f); + + // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 + // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 + // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a + // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b + // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c + // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d + // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e + // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f + + // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8 + // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9 + // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa + // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb + // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc + // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd + // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe + // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff + + out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000 + out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001 + out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20); + out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31); + out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20); + out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31); + out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20); + out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31); + + out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20); + out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31); + out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20); + out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31); + out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20); + out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31); + out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); + out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); +} +#endif // AOM_AOM_DSP_X86_COMMON_AVX2_H_ diff --git a/libs/libaom/src/aom_dsp/x86/convolve.h b/libs/libaom/src/aom_dsp/x86/convolve.h new file mode 100644 index 000000000..b4ff6975c --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/convolve.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
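+ */

// Net effect of mm256_transpose_16x16 above, as a scalar sketch (not part of
// the patch): treating in[] as a 16x16 matrix of 16-bit values, the three
// unpack stages (16-, 32- and 64-bit) plus the final cross-lane permutes
// produce the transpose. Unlike the SIMD version, this sketch does not
// support in == out.
#include <stdint.h>

static void transpose_16x16_ref(const int16_t in[16][16],
                                int16_t out[16][16]) {
  for (int r = 0; r < 16; ++r)
    for (int c = 0; c < 16; ++c) out[c][r] = in[r][c];
}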
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                uint8_t *output_ptr, ptrdiff_t out_pitch,
+                                uint32_t output_height, const int16_t *filter);
+
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+  void aom_convolve8_##name##_##opt( \
+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+      const int16_t *filter_y, int y_step_q4, int w, int h) { \
+    (void)filter_x; \
+    (void)x_step_q4; \
+    (void)filter_y; \
+    (void)y_step_q4; \
+    assert((-128 <= filter[3]) && (filter[3] <= 127)); \
+    assert(step_q4 == 16); \
+    if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
+        (filter[2] | filter[5])) { \
+      while (w >= 16) { \
+        aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
+                                                 dst_stride, h, filter); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \
+                                                dst_stride, h, filter); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
+                                                dst_stride, h, filter); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } else if (filter[0] | filter[1] | filter[2]) { \
+      while (w >= 16) { \
+        aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
+                                                 dst_stride, h, filter); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
+                                                dst_stride, h, filter); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
+                                                dst_stride, h, filter); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } else { \
+      while (w >= 16) { \
+        aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \
+                                                 dst_stride, h, filter); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
+                                                dst_stride, h, filter); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
+                                                dst_stride, h, filter); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } \
+    if (w) { \
+      aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \
+                               x_step_q4, filter_y, y_step_q4, w, h); \
+    } \
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
+                                       const ptrdiff_t src_pitch,
+                                       uint16_t *output_ptr,
+                                       ptrdiff_t out_pitch,
+                                       unsigned int output_height,
+                                       const int16_t *filter, int bd);
+
+#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+  void aom_highbd_convolve8_##name##_##opt( \
+      const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \
+      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+    if (step_q4 == 16 && filter[3] != 128) { \
+      if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
+          (filter[2] | filter[5])) { \
+        while (w >= 16) { \
+          aom_highbd_filter_block1d16_##dir##4_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter, bd);
\ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_highbd_filter_block1d8_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_highbd_filter_block1d4_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else if (filter[0] | filter[1] | filter[2]) { \ + while (w >= 16) { \ + aom_highbd_filter_block1d16_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_highbd_filter_block1d8_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_highbd_filter_block1d4_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + aom_highbd_filter_block1d16_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_highbd_filter_block1d8_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_highbd_filter_block1d4_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + aom_highbd_convolve8_##name##_c( \ + CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst), \ + dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ + } \ + } +#endif // CONFIG_AV1_HIGHBITDEPTH + +#endif // AOM_AOM_DSP_X86_CONVOLVE_H_ diff --git a/libs/libaom/src/aom_dsp/x86/convolve_avx2.h b/libs/libaom/src/aom_dsp/x86/convolve_avx2.h new file mode 100644 index 000000000..d516de5f2 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/convolve_avx2.h @@ -0,0 +1,463 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
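
// The FUN_CONV_1D/HIGH_FUN_CONV_1D wrappers above pick a kernel by testing
// which filter taps are zero. The same classification written out in plain C
// (illustrative helper, not part of the patch):
#include <stdint.h>

static int effective_taps(const int16_t *filter) {
  if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) &&
      (filter[2] | filter[5]))
    return 4;  // outer taps are zero: use the 4-tap kernels
  if (filter[0] | filter[1] | filter[2]) return 8;  // full 8-tap kernels
  return 2;  // only filter[3]/filter[4] remain: the bilinear 2-tap kernels
}

// Note that the 2-tap branches start from src rather than src_start, since
// no border pixels beyond the central pair are needed.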
+ */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ + +// filters for 16 +DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, + 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, + 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, + 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = { + 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255, + 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255 +}; + +DECLARE_ALIGNED(32, static const uint8_t, + filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, + 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, + 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + +DECLARE_ALIGNED(32, static const uint8_t, + filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, + 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 }; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +#define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \ + for (i = 0; i < (im_h - 2); i += 2) { \ + __m256i data = _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ + data = _mm256_inserti128_si256( \ + data, \ + _mm_loadu_si128( \ + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ + 1); \ + \ + __m256i res = convolve_lowbd_x(data, coeffs_h, filt); \ + res = \ + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ + } \ + \ + __m256i data_1 = _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ + \ + __m256i res = convolve_lowbd_x(data_1, coeffs_h, filt); \ + \ + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + +#define CONVOLVE_SR_VERTICAL_FILTER_8TAP \ + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ + __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ + __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ + \ + __m256i s[8]; \ + s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ + s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ + s[2] = 
_mm256_unpacklo_epi16(src_4, src_5); \ + \ + s[4] = _mm256_unpackhi_epi16(src_0, src_1); \ + s[5] = _mm256_unpackhi_epi16(src_2, src_3); \ + s[6] = _mm256_unpackhi_epi16(src_4, src_5); \ + \ + for (i = 0; i < h; i += 2) { \ + const int16_t *data = &im_block[i * im_stride]; \ + \ + const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ + const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ + \ + s[3] = _mm256_unpacklo_epi16(s6, s7); \ + s[7] = _mm256_unpackhi_epi16(s6, s7); \ + \ + __m256i res_a = convolve(s, coeffs_v); \ + __m256i res_b = convolve(s + 4, coeffs_v); \ + \ + res_a = \ + _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ + res_b = \ + _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ + \ + const __m256i res_a_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ + const __m256i res_b_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ + \ + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ + \ + const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ + \ + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ + if (w - j > 4) { \ + _mm_storel_epi64(p_0, res_0); \ + _mm_storel_epi64(p_1, res_1); \ + } else if (w == 4) { \ + xx_storel_32(p_0, res_0); \ + xx_storel_32(p_1, res_1); \ + } else { \ + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ + } \ + \ + s[0] = s[1]; \ + s[1] = s[2]; \ + s[2] = s[3]; \ + \ + s[4] = s[5]; \ + s[5] = s[6]; \ + s[6] = s[7]; \ + } + +#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP \ + for (i = 0; i < im_h; i += 2) { \ + __m256i data = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); \ + if (i + 1 < im_h) \ + data = _mm256_inserti128_si256( \ + data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \ + src_h += (src_stride << 1); \ + __m256i res = convolve_lowbd_x(data, coeffs_x, filt); \ + \ + res = \ + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ + } + +#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP \ + __m256i s[8]; \ + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ + __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ + __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ + \ + s[0] = _mm256_unpacklo_epi16(s0, s1); \ + s[1] = _mm256_unpacklo_epi16(s2, s3); \ + s[2] = _mm256_unpacklo_epi16(s4, s5); \ + \ + s[4] = _mm256_unpackhi_epi16(s0, s1); \ + s[5] = _mm256_unpackhi_epi16(s2, s3); \ + s[6] = _mm256_unpackhi_epi16(s4, s5); \ + \ + for (i = 0; i < h; i += 2) { \ + const int16_t *data = &im_block[i * im_stride]; \ + \ + const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ + const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ + \ + s[3] = _mm256_unpacklo_epi16(s6, s7); \ + s[7] = _mm256_unpackhi_epi16(s6, s7); \ + \ + const __m256i res_a = 
convolve(s, coeffs_y); \ + const __m256i res_a_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ + \ + if (w - j > 4) { \ + const __m256i res_b = convolve(s + 4, coeffs_y); \ + const __m256i res_b_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \ + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ + \ + if (do_average) { \ + const __m256i data_ref_0 = load_line2_avx2( \ + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); \ + const __m256i comp_avg_res = \ + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); \ + \ + const __m256i round_result = convolve_rounding( \ + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ + \ + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \ + const __m128i res_0 = _mm256_castsi256_si128(res_8); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ + \ + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \ + _mm_storel_epi64( \ + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \ + } else { \ + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ + \ + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ + res_1); \ + } \ + } else { \ + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \ + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ + \ + if (do_average) { \ + const __m256i data_ref_0 = load_line2_avx2( \ + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); \ + \ + const __m256i comp_avg_res = \ + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); \ + \ + const __m256i round_result = convolve_rounding( \ + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ + \ + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \ + const __m128i res_0 = _mm256_castsi256_si128(res_8); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ + \ + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \ + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \ + _mm_cvtsi128_si32(res_1); \ + \ + } else { \ + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ + \ + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ + res_1); \ + } \ + } \ + \ + s[0] = s[1]; \ + s[1] = s[2]; \ + s[2] = s[3]; \ + \ + s[4] = s[5]; \ + s[5] = s[6]; \ + s[6] = s[7]; \ + } +static INLINE void prepare_coeffs_lowbd( + const InterpFilterParams *const filter_params, const int subpel_q4, + __m256i *const coeffs /* [4] */) { + const int16_t *const filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); + const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); + + // right shift all filter co-efficients by 1 to reduce the bits required. + // This extra right shift will be taken care of at the end while rounding + // the result. 
+ // Since all filter co-efficients are even, this change will not affect the + // end result + assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), + _mm_set1_epi16((short)0xffff))); + + const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); +} + +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m256i *const coeffs /* [4] */) { + const int16_t *filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + + const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); + const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); +} + +static INLINE __m256i convolve_lowbd(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); + const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); + const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); + + // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), + _mm256_add_epi16(res_23, res_67)); + + return res; +} + +static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); + + // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + const __m256i res = _mm256_add_epi16(res_45, res_23); + + return res; +} + +static INLINE __m256i convolve(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); + const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); + const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); + const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); + + const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), + _mm256_add_epi32(res_2, res_3)); + + return res; +} + +static INLINE __m256i convolve_4tap(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); + const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); + + const __m256i res = _mm256_add_epi32(res_1, res_2); + return res; +} + +static INLINE __m256i convolve_lowbd_x(const __m256i data, + const __m256i *const coeffs, + const __m256i *const filt) { + __m256i s[4]; + + s[0] = _mm256_shuffle_epi8(data, filt[0]); + s[1] = _mm256_shuffle_epi8(data, filt[1]); + s[2] = _mm256_shuffle_epi8(data, filt[2]); + s[3] = _mm256_shuffle_epi8(data, filt[3]); + + return convolve_lowbd(s, coeffs); +} + +static INLINE __m256i convolve_lowbd_x_4tap(const __m256i data, + const __m256i *const coeffs, + const __m256i *const filt) { + __m256i s[2]; + + s[0] = _mm256_shuffle_epi8(data, 
filt[0]); + s[1] = _mm256_shuffle_epi8(data, filt[1]); + + return convolve_lowbd_4tap(s, coeffs); +} + +static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst, + const __m256i *const res, + const int do_average) { + __m256i d; + if (do_average) { + d = _mm256_load_si256((__m256i *)dst); + d = _mm256_add_epi32(d, *res); + d = _mm256_srai_epi32(d, 1); + } else { + d = *res; + } + _mm256_store_si256((__m256i *)dst, d); +} + +static INLINE __m256i comp_avg(const __m256i *const data_ref_0, + const __m256i *const res_unsigned, + const __m256i *const wt, + const int use_dist_wtd_comp_avg) { + __m256i res; + if (use_dist_wtd_comp_avg) { + const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); + const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); + + const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt); + const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); + + const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + + res = _mm256_packs_epi32(res_lo, res_hi); + } else { + const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); + res = _mm256_srai_epi16(wt_res, 1); + } + return res; +} + +static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned, + const __m256i *const offset_const, + const __m256i *const round_const, + const int round_shift) { + const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); + const __m256i res_round = _mm256_srai_epi16( + _mm256_add_epi16(res_signed, *round_const), round_shift); + return res_round; +} + +static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0, + const __m256i *const res_unsigned, + const __m256i *const wt0, + const __m256i *const wt1, + const int use_dist_wtd_comp_avg) { + __m256i res; + if (use_dist_wtd_comp_avg) { + const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); + const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); + const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); + res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); + } else { + const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); + res = _mm256_srai_epi32(wt_res, 1); + } + return res; +} + +static INLINE __m256i highbd_convolve_rounding( + const __m256i *const res_unsigned, const __m256i *const offset_const, + const __m256i *const round_const, const int round_shift) { + const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); + const __m256i res_round = _mm256_srai_epi32( + _mm256_add_epi32(res_signed, *round_const), round_shift); + + return res_round; +} + +#endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/libs/libaom/src/aom_dsp/x86/convolve_common_intrin.h b/libs/libaom/src/aom_dsp/x86/convolve_common_intrin.h new file mode 100644 index 000000000..707bd2d78 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/convolve_common_intrin.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
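
// Scalar model of comp_avg()/highbd_comp_avg() above (a sketch, not part of
// the patch). With distance-weighted compound prediction the two references
// carry weights that sum to 1 << DIST_PRECISION_BITS; otherwise the two
// predictions are averaged with a simple halving shift.
#include <stdint.h>

static int32_t comp_avg_ref(int32_t ref, int32_t cur, int wt0, int wt1,
                            int dist_precision_bits, int use_dist_wtd) {
  if (use_dist_wtd) return (ref * wt0 + cur * wt1) >> dist_precision_bits;
  return (ref + cur) >> 1;
}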
+ */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res, + const int do_average) { + __m128i d; + if (do_average) { + d = _mm_load_si128((__m128i *)dst); + d = _mm_add_epi32(d, *res); + d = _mm_srai_epi32(d, 1); + } else { + d = *res; + } + _mm_store_si128((__m128i *)dst, d); +} + +#endif // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ diff --git a/libs/libaom/src/aom_dsp/x86/convolve_sse2.h b/libs/libaom/src/aom_dsp/x86/convolve_sse2.h new file mode 100644 index 000000000..385c7c7e1 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/convolve_sse2.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m128i *const coeffs /* [4] */) { + const int16_t *filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeff = _mm_loadu_si128((__m128i *)filter); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm_shuffle_epi32(coeff, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm_shuffle_epi32(coeff, 0xff); +} + +static INLINE __m128i convolve(const __m128i *const s, + const __m128i *const coeffs) { + const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]); + const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]); + const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]); + const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]); + + const __m128i res = + _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3)); + + return res; +} + +static INLINE __m128i convolve_lo_x(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_lo_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_hi_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpackhi_epi8(s[4], 
_mm_setzero_si128()); + ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i comp_avg(const __m128i *const data_ref_0, + const __m128i *const res_unsigned, + const __m128i *const wt, + const int use_dist_wtd_avg) { + __m128i res; + if (use_dist_wtd_avg) { + const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned); + const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned); + + const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt); + const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt); + + const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + + res = _mm_packs_epi32(res_lo, res_hi); + } else { + const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned); + res = _mm_srai_epi16(wt_res, 1); + } + return res; +} + +static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned, + const __m128i *const offset_const, + const __m128i *const round_const, + const int round_shift) { + const __m128i res_signed = _mm_sub_epi16(*res_unsigned, *offset_const); + const __m128i res_round = + _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift); + return res_round; +} + +static INLINE __m128i highbd_convolve_rounding_sse2( + const __m128i *const res_unsigned, const __m128i *const offset_const, + const __m128i *const round_const, const int round_shift) { + const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const); + const __m128i res_round = + _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift); + + return res_round; +} + +#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ diff --git a/libs/libaom/src/aom_dsp/x86/convolve_sse4_1.h b/libs/libaom/src/aom_dsp/x86/convolve_sse4_1.h new file mode 100644 index 000000000..b1a3bb466 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/convolve_sse4_1.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
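The prepare_coeffs/convolve pair above evaluates an 8-tap filter as four pmaddwd operations on interleaved sample pairs; per output pixel this is equivalent to the scalar sketch below (illustrative only, with the rounding shift left to the caller as in the intrinsics version).

#include <stdint.h>

/* Scalar model of convolve(): each _mm_madd_epi16 contributes one
 * f[2k]*s[2k] + f[2k+1]*s[2k+1] term, exactly the pairs grouped here. */
static int32_t convolve_8tap_scalar(const int16_t s[8], const int16_t f[8]) {
  int32_t sum = 0;
  for (int k = 0; k < 8; k += 2)
    sum += (int32_t)f[k] * s[k] + (int32_t)f[k + 1] * s[k + 1];
  return sum;
}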
+ */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void mult_add_store(CONV_BUF_TYPE *const dst, + const __m128i *const res, + const __m128i *const wt0, + const __m128i *const wt1, + const int do_average) { + __m128i d; + if (do_average) { + d = _mm_load_si128((__m128i *)dst); + d = _mm_add_epi32(_mm_mullo_epi32(d, *wt0), _mm_mullo_epi32(*res, *wt1)); + d = _mm_srai_epi32(d, DIST_PRECISION_BITS); + } else { + d = *res; + } + _mm_store_si128((__m128i *)dst, d); +} + +static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0, + const __m128i *const res_unsigned, + const __m128i *const wt0, + const __m128i *const wt1, + const int use_dist_wtd_avg) { + __m128i res; + if (use_dist_wtd_avg) { + const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0); + const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1); + + const __m128i wt_res = _mm_add_epi32(wt0_res, wt1_res); + res = _mm_srai_epi32(wt_res, DIST_PRECISION_BITS); + } else { + const __m128i wt_res = _mm_add_epi32(*data_ref_0, *res_unsigned); + res = _mm_srai_epi32(wt_res, 1); + } + return res; +} + +#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ diff --git a/libs/libaom/src/aom_dsp/x86/fft_avx2.c b/libs/libaom/src/aom_dsp/x86/fft_avx2.c new file mode 100644 index 000000000..4cccc5f00 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/fft_avx2.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
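add_store (convolve_common_intrin.h) and mult_add_store above are the two compound-buffer write paths: a plain average versus a distance-weighted blend at DIST_PRECISION_BITS precision. Per 32-bit lane they amount to the sketch below; conv_buf_t stands in for CONV_BUF_TYPE and the weight convention is assumed to match comp_avg.

#include <stdint.h>

typedef int32_t conv_buf_t; /* stand-in for CONV_BUF_TYPE, assumed 32-bit */

/* Scalar model of add_store(): plain average with the buffer entry. */
static void add_store_scalar(conv_buf_t *dst, int32_t res, int do_average) {
  *dst = do_average ? (*dst + res) >> 1 : res;
}

/* Scalar model of mult_add_store(): distance-weighted blend; the weights
 * are assumed to sum to 1 << DIST_PRECISION_BITS (4 in AV1). */
static void mult_add_store_scalar(conv_buf_t *dst, int32_t res, int32_t w0,
                                  int32_t w1, int do_average) {
  *dst = do_average ? (*dst * w0 + res * w1) >> 4 : res;
}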
+ */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/fft_common.h" + +extern void aom_transpose_float_sse2(const float *A, float *B, int n); +extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output, + int n); + +// Generate the 1d forward transforms for float using _mm256 +GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); + +void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); +} + +void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); +} + +void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); +} + +// Generate the 1d inverse transforms for float using _mm256 +GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); + +void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2, + aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8); +} + +void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, + aom_fft1d_16_avx2, aom_ifft1d_16_avx2, + aom_transpose_float_sse2, 8); +} + +void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, + aom_fft1d_32_avx2, aom_ifft1d_32_avx2, + aom_transpose_float_sse2, 8); +} diff --git a/libs/libaom/src/aom_dsp/x86/fft_sse2.c b/libs/libaom/src/aom_dsp/x86/fft_sse2.c new file mode 100644 index 000000000..6f20a3cc0 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/fft_sse2.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
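aom_fft_2d_gen composes each 2D transform from the generated 1D kernels by row-column decomposition; for real input the packed result is then expanded by the unpack routine (see aom_fft_unpack_2d_output_sse2 in fft_sse2.c below), which relies on the conjugate symmetry X[r][c] = conj(X[(n-r) mod n][(n-c) mod n]) of a real-input FFT. The shape of the composition, as a sketch with a simplified 1D kernel signature (hypothetical; the real helper also threads through the SIMD vector width):

/* Row-column sketch of an n x n 2D FFT built from a 1D kernel. */
static void fft_2d_sketch(const float *input, float *temp, float *output,
                          int n, void (*fft1d)(const float *, float *),
                          void (*transpose)(const float *, float *, int)) {
  for (int r = 0; r < n; ++r)   /* 1D FFT of every row */
    fft1d(input + r * n, temp + r * n);
  transpose(temp, output, n);   /* columns become rows */
  for (int r = 0; r < n; ++r)   /* 1D FFT of the former columns */
    fft1d(output + r * n, temp + r * n);
  transpose(temp, output, n);   /* restore the original orientation */
}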
+ */ + +#include <xmmintrin.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/fft_common.h" + +static INLINE void transpose4x4(const float *A, float *B, const int lda, + const int ldb) { + __m128 row1 = _mm_load_ps(&A[0 * lda]); + __m128 row2 = _mm_load_ps(&A[1 * lda]); + __m128 row3 = _mm_load_ps(&A[2 * lda]); + __m128 row4 = _mm_load_ps(&A[3 * lda]); + _MM_TRANSPOSE4_PS(row1, row2, row3, row4); + _mm_store_ps(&B[0 * ldb], row1); + _mm_store_ps(&B[1 * ldb], row2); + _mm_store_ps(&B[2 * ldb], row3); + _mm_store_ps(&B[3 * ldb], row4); +} + +void aom_transpose_float_sse2(const float *A, float *B, int n) { + for (int y = 0; y < n; y += 4) { + for (int x = 0; x < n; x += 4) { + transpose4x4(A + y * n + x, B + x * n + y, n, n); + } + } +} + +void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) { + const int n2 = n / 2; + output[0] = packed[0]; + output[1] = 0; + output[2 * (n2 * n)] = packed[n2 * n]; + output[2 * (n2 * n) + 1] = 0; + + output[2 * n2] = packed[n2]; + output[2 * n2 + 1] = 0; + output[2 * (n2 * n + n2)] = packed[n2 * n + n2]; + output[2 * (n2 * n + n2) + 1] = 0; + + for (int c = 1; c < n2; ++c) { + output[2 * (0 * n + c)] = packed[c]; + output[2 * (0 * n + c) + 1] = packed[c + n2]; + output[2 * (n2 * n + c) + 0] = packed[n2 * n + c]; + output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2]; + } + for (int r = 1; r < n2; ++r) { + output[2 * (r * n + 0)] = packed[r * n]; + output[2 * (r * n + 0) + 1] = packed[(r + n2) * n]; + output[2 * (r * n + n2) + 0] = packed[r * n + n2]; + output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2]; + + for (int c = 1; c < AOMMIN(n2, 4); ++c) { + output[2 * (r * n + c)] = + packed[r * n + c] - packed[(r + n2) * n + c + n2]; + output[2 * (r * n + c) + 1] = + packed[(r + n2) * n + c] + packed[r * n + c + n2]; + } + + for (int c = 4; c < n2; c += 4) { + __m128 real1 = _mm_load_ps(packed + r * n + c); + __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2); + __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c); + __m128 imag2 = _mm_load_ps(packed + r * n + c + n2); + real1 = _mm_sub_ps(real1, real2); + imag1 = _mm_add_ps(imag1, imag2); + _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1)); + _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1)); + } + + int r2 = r + n2; + int r3 = n - r2; + output[2 * (r2 * n + 0)] = packed[r3 * n]; + output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n]; + output[2 * (r2 * n + n2)] = packed[r3 * n + n2]; + output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2]; + for (int c = 1; c < AOMMIN(4, n2); ++c) { + output[2 * (r2 * n + c)] = + packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2]; + output[2 * (r2 * n + c) + 1] = + -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2]; + } + for (int c = 4; c < n2; c += 4) { + __m128 real1 = _mm_load_ps(packed + r3 * n + c); + __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2); + __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c); + __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2); + real1 = _mm_add_ps(real1, real2); + imag1 = _mm_sub_ps(imag2, imag1); + _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1)); + _mm_store_ps(output + 2 * (r2 * n + c + 2), + _mm_unpackhi_ps(real1, imag1)); + } + } +} + +// Generate definitions for 1d transforms using float and __mm128 +GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps); +GEN_FFT_8(static INLINE void, sse2, float, __m128, 
_mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); + +void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +// Generate definitions for 1d inverse transforms using float and mm128 +GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps); +GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); + +void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2, + aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4); +} + +void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2, + aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4); +} + +void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, + aom_fft1d_16_sse2, aom_ifft1d_16_sse2, + aom_transpose_float_sse2, 4); +} + +void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, + aom_fft1d_32_sse2, aom_ifft1d_32_sse2, + aom_transpose_float_sse2, 4); +} diff --git a/libs/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h b/libs/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h new file mode 100644 index 000000000..89fe1899b --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h @@ -0,0 +1,544 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/fwd_txfm_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_ports/mem.h" + +// TODO(jingning) The high bit-depth functions need rework for performance. +// After we properly fix the high bit-depth function implementations, this +// file's dependency should be substantially simplified. +#if DCT_HIGH_BIT_DEPTH +#define ADD_EPI16 _mm_adds_epi16 +#define SUB_EPI16 _mm_subs_epi16 + +#else +#define ADD_EPI16 _mm_add_epi16 +#define SUB_EPI16 _mm_sub_epi16 +#endif + +static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0, + __m128i *in1) { + // Constants + // These are the coefficients used for the multiplies. + // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), + // where cospi_N_64 = cos(N pi /64) + const __m128i k__cospi_A = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_B = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_C = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64); + const __m128i k__cospi_D = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64); + const __m128i k__cospi_E = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_F = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_G = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64); + const __m128i k__cospi_H = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64); + + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + // This second rounding constant saves doing some extra adds at the end + const __m128i k__DCT_CONST_ROUNDING2 = + _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1)); + const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2; + const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); + const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + + // Load inputs. + *in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + *in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + *in1 = _mm_unpacklo_epi64( + *in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); + *in0 = _mm_unpacklo_epi64( + *in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); + // in0 = [i0 i1 i2 i3 iC iD iE iF] + // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + // multiply by 16 to give some extra precision + *in0 = _mm_slli_epi16(*in0, 4); + *in1 = _mm_slli_epi16(*in1, 4); + // if (i == 0 && input[0]) input[0] += 1; + // add 1 to the upper left pixel if it is non-zero, which helps reduce + // the round-trip error + { + // The mask will only contain whether the first value is zero, all + // other comparison will fail as something shifted by 4 (above << 4) + // can never be equal to one. 
To increment in the non-zero case, we + // add the mask and one for the first element: + // - if zero, mask = -1, v = v - 1 + 1 = v + // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 + __m128i mask = _mm_cmpeq_epi16(*in0, k__nonzero_bias_a); + *in0 = _mm_add_epi16(*in0, mask); + *in0 = _mm_add_epi16(*in0, k__nonzero_bias_b); + } + // There are 4 total stages, alternating between an add/subtract stage + // followed by an multiply-and-add stage. + { + // Stage 1: Add/subtract + + // in0 = [i0 i1 i2 i3 iC iD iE iF] + // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + const __m128i r0 = _mm_unpacklo_epi16(*in0, *in1); + const __m128i r1 = _mm_unpackhi_epi16(*in0, *in1); + // r0 = [i0 i4 i1 i5 i2 i6 i3 i7] + // r1 = [iC i8 iD i9 iE iA iF iB] + const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4); + const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4); + // r2 = [i0 i4 i1 i5 i3 i7 i2 i6] + // r3 = [iC i8 iD i9 iF iB iE iA] + + const __m128i t0 = _mm_add_epi16(r2, r3); + const __m128i t1 = _mm_sub_epi16(r2, r3); + // t0 = [a0 a4 a1 a5 a3 a7 a2 a6] + // t1 = [aC a8 aD a9 aF aB aE aA] + + // Stage 2: multiply by constants (which gets us into 32 bits). + // The constants needed here are: + // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16] + // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16] + // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08] + // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24] + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D); + // Then add and right-shift to get back to 16-bit range + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // w0 = [b0 b1 b7 b6] + // w1 = [b8 b9 bF bE] + // w2 = [b4 b5 b3 b2] + // w3 = [bC bD bB bA] + const __m128i x0 = _mm_packs_epi32(w0, w1); + const __m128i x1 = _mm_packs_epi32(w2, w3); + + // x0 = [b0 b1 b7 b6 b8 b9 bF bE] + // x1 = [b4 b5 b3 b2 bC bD bB bA] + *in0 = _mm_shuffle_epi32(x0, 0xD8); + *in1 = _mm_shuffle_epi32(x1, 0x8D); + // in0 = [b0 b1 b8 b9 b7 b6 bF bE] + // in1 = [b3 b2 bB bA b4 b5 bC bD] + } + { + // vertical DCTs finished. Now we do the horizontal DCTs. + // Stage 3: Add/subtract + + const __m128i t0 = ADD_EPI16(*in0, *in1); + const __m128i t1 = SUB_EPI16(*in0, *in1); + + // Stage 4: multiply by constants (which gets us into 32 bits). + { + // The constants needed here are: + // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16] + // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16] + // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24] + // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08] + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E); + const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F); + const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H); + // Then add and right-shift to get back to 16-bit range + // but this combines the final right-shift as well to save operations + // This unusual rounding operations is to maintain bit-accurate + // compatibility with the c version of this function which has two + // rounding steps in a row. 
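The combined constant works because of an exact identity: with r = DCT_CONST_ROUNDING = 1 << (DCT_CONST_BITS - 1), the two-step (((x + r) >> b) + 1) >> 2 equals (x + 3r) >> (b + 2), since the +1 of the second rounding is worth 1 << b = 2r at the wider precision. A small check of that equivalence (assumes arithmetic right shift, i.e. floor division, matching the intrinsics):

#include <assert.h>
#include <stdint.h>

static void check_rounding_fold(int32_t x, int b) {
  const int32_t r = 1 << (b - 1); /* DCT_CONST_ROUNDING for DCT_CONST_BITS b */
  const int32_t two_step = (((x + r) >> b) + 1) >> 2;
  const int32_t one_step = (x + 3 * r) >> (b + 2); /* ROUNDING2 path */
  assert(two_step == one_step);
}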
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2); + // w0 = [o0 o4 o8 oC] + // w1 = [o2 o6 oA oE] + // w2 = [o1 o5 o9 oD] + // w3 = [o3 o7 oB oF] + // remember the o's are numbered according to the correct output location + const __m128i x0 = _mm_packs_epi32(w0, w1); + const __m128i x1 = _mm_packs_epi32(w2, w3); + { + // x0 = [o0 o4 o8 oC o2 o6 oA oE] + // x1 = [o1 o5 o9 oD o3 o7 oB oF] + const __m128i y0 = _mm_unpacklo_epi16(x0, x1); + const __m128i y1 = _mm_unpackhi_epi16(x0, x1); + // y0 = [o0 o1 o4 o5 o8 o9 oC oD] + // y1 = [o2 o3 o6 o7 oA oB oE oF] + *in0 = _mm_unpacklo_epi32(y0, y1); + // in0 = [o0 o1 o2 o3 o4 o5 o6 o7] + *in1 = _mm_unpackhi_epi32(y0, y1); + // in1 = [o8 o9 oA oB oC oD oE oF] + } + } + } +} + +void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { + // This 2D transform implements 4 vertical 1D transforms followed + // by 4 horizontal 1D transforms. The multiplies and adds are as given + // by Chen, Smith and Fralick ('77). The commands for moving the data + // around have been minimized by hand. + // For the purposes of the comments, the 16 inputs are referred to at i0 + // through iF (in raster order), intermediate variables are a0, b0, c0 + // through f, and correspond to the in-place computations mapped to input + // locations. The outputs, o0 through oF are labeled according to the + // output locations. + __m128i in0, in1; + FDCT4x4_2D_HELPER(input, stride, &in0, &in1); + + // Post-condition (v + 1) >> 2 is now incorporated into previous + // add and right-shift commands. Only 2 store instructions needed + // because we are using the fact that 1/3 are stored just after 0/2. + storeu_output(&in0, output + 0 * 4); + storeu_output(&in1, output + 2 * 4); +} + +void FDCT4x4_2D_LP(const int16_t *input, int16_t *output, int stride) { + __m128i in0, in1; + FDCT4x4_2D_HELPER(input, stride, &in0, &in1); + _mm_storeu_si128((__m128i *)(output + 0 * 4), in0); + _mm_storeu_si128((__m128i *)(output + 2 * 4), in1); +} + +void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { + int pass; + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. 
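Each pair constant below drives one pmaddwd so that a lane pair (a, b) becomes the plane rotation a * c0 + b * c1, followed by dct_const_round_shift. In scalar form (DCT_CONST_BITS is 14 in this codebase's fixed-point convention):

#include <stdint.h>

/* One pmaddwd butterfly per lane pair: pair_set_epi16(c0, c1) against
 * interleaved (a, b), then round-shift back into 16-bit range. */
static int16_t butterfly_scalar(int16_t a, int16_t b, int16_t c0, int16_t c1) {
  const int32_t rounding = 1 << (14 - 1); /* DCT_CONST_ROUNDING */
  return (int16_t)(((int32_t)a * c0 + (int32_t)b * c1 + rounding) >> 14);
}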
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); +#if DCT_HIGH_BIT_DEPTH + int overflow; +#endif + // Load input + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + // Pre-condition input (shift by two) + in0 = _mm_slli_epi16(in0, 2); + in1 = _mm_slli_epi16(in1, 2); + in2 = _mm_slli_epi16(in2, 2); + in3 = _mm_slli_epi16(in3, 2); + in4 = _mm_slli_epi16(in4, 2); + in5 = _mm_slli_epi16(in5, 2); + in6 = _mm_slli_epi16(in6, 2); + in7 = _mm_slli_epi16(in7, 2); + + // We do two passes, first the columns, then the rows. The results of the + // first pass are transposed so that the same column code can be reused. The + // results of the second pass are also transposed so that the rows (processed + // as columns) are put back in row positions. + for (pass = 0; pass < 2; pass++) { + // To store results of each pass before the transpose. 
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7; + // Add/subtract + const __m128i q0 = ADD_EPI16(in0, in7); + const __m128i q1 = ADD_EPI16(in1, in6); + const __m128i q2 = ADD_EPI16(in2, in5); + const __m128i q3 = ADD_EPI16(in3, in4); + const __m128i q4 = SUB_EPI16(in3, in4); + const __m128i q5 = SUB_EPI16(in2, in5); + const __m128i q6 = SUB_EPI16(in1, in6); + const __m128i q7 = SUB_EPI16(in0, in7); +#if DCT_HIGH_BIT_DEPTH + if (pass == 1) { + overflow = + check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } + } +#endif // DCT_HIGH_BIT_DEPTH + // Work on first four results + { + // Add/subtract + const __m128i r0 = ADD_EPI16(q0, q3); + const __m128i r1 = ADD_EPI16(q1, q2); + const __m128i r2 = SUB_EPI16(q1, q2); + const __m128i r3 = SUB_EPI16(q0, q3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us into 32bits + { + const __m128i t0 = _mm_unpacklo_epi16(r0, r1); + const __m128i t1 = _mm_unpackhi_epi16(r0, r1); + const __m128i t2 = _mm_unpacklo_epi16(r2, r3); + const __m128i t3 = _mm_unpackhi_epi16(r2, r3); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res0 = _mm_packs_epi32(w0, w1); + res4 = _mm_packs_epi32(w2, w3); + res2 = _mm_packs_epi32(w4, w5); + res6 = _mm_packs_epi32(w6, w7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + // Work on next four results + { + // Interleave to do the multiply by constants which gets us into 32bits + const __m128i d0 = _mm_unpacklo_epi16(q6, q5); + const __m128i d1 = _mm_unpackhi_epi16(q6, q5); + const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); + const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); + const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); + const __m128i e3 = 
_mm_madd_epi16(d1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); + const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); + const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); + const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); + const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); + const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); + const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); + const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); + // Combine + const __m128i r0 = _mm_packs_epi32(s0, s1); + const __m128i r1 = _mm_packs_epi32(s2, s3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&r0, &r1); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + { + // Add/subtract + const __m128i x0 = ADD_EPI16(q4, r0); + const __m128i x1 = SUB_EPI16(q4, r0); + const __m128i x2 = SUB_EPI16(q7, r1); + const __m128i x3 = ADD_EPI16(q7, r1); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us into 32bits + { + const __m128i t0 = _mm_unpacklo_epi16(x0, x3); + const __m128i t1 = _mm_unpackhi_epi16(x0, x3); + const __m128i t2 = _mm_unpacklo_epi16(x1, x2); + const __m128i t3 = _mm_unpackhi_epi16(x1, x2); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res1 = _mm_packs_epi32(w0, w1); + res7 = _mm_packs_epi32(w2, w3); + res5 = _mm_packs_epi32(w4, w5); + res3 = _mm_packs_epi32(w6, w7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + } + // Transpose the 8x8. 
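The three unpack stages that follow implement a full 8x8 transpose of 16-bit lanes: each stage doubles the interleaving granularity (16, then 32, then 64 bits), the standard log2(8) = 3 level SIMD transpose. Its net effect, in scalar form:

#include <stdint.h>

/* Scalar equivalent of the unpack-based transpose below. */
static void transpose_8x8_scalar(const int16_t in[8][8], int16_t out[8][8]) {
  for (int r = 0; r < 8; ++r)
    for (int c = 0; c < 8; ++c) out[c][r] = in[r][c];
}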
+ { + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); + const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); + const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); + const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); + const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); + const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); + const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); + const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 54 54 55 55 56 56 57 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 21 36 + // 44 54 64 74 45 55 61 76 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } + // Post-condition output and store it + { + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const __m128i sign_in0 = _mm_srai_epi16(in0, 15); + const __m128i sign_in1 = _mm_srai_epi16(in1, 15); + const __m128i sign_in2 = _mm_srai_epi16(in2, 15); + const __m128i sign_in3 = _mm_srai_epi16(in3, 15); + const __m128i sign_in4 = _mm_srai_epi16(in4, 15); + const __m128i sign_in5 = _mm_srai_epi16(in5, 15); + const __m128i sign_in6 = _mm_srai_epi16(in6, 15); + const __m128i sign_in7 = _mm_srai_epi16(in7, 15); + in0 = _mm_sub_epi16(in0, sign_in0); + in1 = _mm_sub_epi16(in1, sign_in1); + in2 = _mm_sub_epi16(in2, sign_in2); + in3 = _mm_sub_epi16(in3, sign_in3); + in4 = _mm_sub_epi16(in4, sign_in4); + in5 = _mm_sub_epi16(in5, sign_in5); + in6 = _mm_sub_epi16(in6, sign_in6); + in7 = _mm_sub_epi16(in7, sign_in7); + in0 = _mm_srai_epi16(in0, 1); + in1 = _mm_srai_epi16(in1, 1); + in2 = _mm_srai_epi16(in2, 1); + in3 = _mm_srai_epi16(in3, 1); + in4 = _mm_srai_epi16(in4, 1); + in5 = _mm_srai_epi16(in5, 1); + in6 = _mm_srai_epi16(in6, 1); + in7 = _mm_srai_epi16(in7, 1); + // store results + store_output(&in0, (output + 0 * 8)); + store_output(&in1, (output + 1 * 8)); + store_output(&in2, (output + 2 * 8)); + store_output(&in3, (output + 3 * 8)); + store_output(&in4, (output + 4 * 
8)); + store_output(&in5, (output + 5 * 8)); + store_output(&in6, (output + 6 * 8)); + store_output(&in7, (output + 7 * 8)); + } +} + +#undef ADD_EPI16 +#undef SUB_EPI16 diff --git a/libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c b/libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c new file mode 100644 index 000000000..0e4fb8046 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/fwd_txfm_sse2.h" + +#define DCT_HIGH_BIT_DEPTH 0 +#define FDCT4x4_2D_HELPER fdct4x4_helper +#define FDCT4x4_2D aom_fdct4x4_sse2 +#define FDCT4x4_2D_LP aom_fdct4x4_lp_sse2 +#define FDCT8x8_2D aom_fdct8x8_sse2 +#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" +#undef FDCT4x4_2D_HELPER +#undef FDCT4x4_2D +#undef FDCT4x4_2D_LP +#undef FDCT8x8_2D + +#if CONFIG_AV1_HIGHBITDEPTH + +#undef DCT_HIGH_BIT_DEPTH +#define DCT_HIGH_BIT_DEPTH 1 +#define FDCT8x8_2D aom_highbd_fdct8x8_sse2 +#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT +#undef FDCT8x8_2D + +#endif diff --git a/libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h b/libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h new file mode 100644 index 000000000..ab3cd9155 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
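The post-condition in FDCT8x8_2D above divides by two with (n - (n >> 15)) >> 1 rather than a bare shift because an arithmetic shift floors, while the C reference it must match truncates toward zero; n >> 15 is -1 exactly for negative 16-bit n, so subtracting it biases odd negative values up by one first. A quick check:

#include <assert.h>
#include <stdint.h>

static void check_div2_trunc(int16_t n) {
  const int16_t d = (int16_t)((n - (n >> 15)) >> 1); /* assumes arithmetic >> */
  assert(d == n / 2); /* C division truncates toward zero */
}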
+ */ + +#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ +#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { + __m128i buf0, buf1; + buf0 = _mm_mul_epu32(a, b); + a = _mm_srli_epi64(a, 32); + b = _mm_srli_epi64(b, 32); + buf1 = _mm_mul_epu32(a, b); + return _mm_add_epi64(buf0, buf1); +} + +static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { + __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); + return _mm_unpacklo_epi64(buf0, buf1); +} + +static INLINE int check_epi16_overflow_x2(const __m128i *preg0, + const __m128i *preg1) { + const __m128i max_overflow = _mm_set1_epi16(0x7fff); + const __m128i min_overflow = _mm_set1_epi16(0x8000); + __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), + _mm_cmpeq_epi16(*preg0, min_overflow)); + __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), + _mm_cmpeq_epi16(*preg1, min_overflow)); + cmp0 = _mm_or_si128(cmp0, cmp1); + return _mm_movemask_epi8(cmp0); +} + +static INLINE int check_epi16_overflow_x4(const __m128i *preg0, + const __m128i *preg1, + const __m128i *preg2, + const __m128i *preg3) { + const __m128i max_overflow = _mm_set1_epi16(0x7fff); + const __m128i min_overflow = _mm_set1_epi16(0x8000); + __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), + _mm_cmpeq_epi16(*preg0, min_overflow)); + __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), + _mm_cmpeq_epi16(*preg1, min_overflow)); + __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow), + _mm_cmpeq_epi16(*preg2, min_overflow)); + __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow), + _mm_cmpeq_epi16(*preg3, min_overflow)); + cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)); + return _mm_movemask_epi8(cmp0); +} + +static INLINE int check_epi16_overflow_x8( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x12( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x16( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) { + res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + if (!res1) res1 = 
check_epi16_overflow_x4(preg12, preg13, preg14, preg15); + } + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x32( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, + const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, + const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, + const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, + const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, + const __m128i *preg30, const __m128i *preg31) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) { + res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + if (!res1) { + res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); + if (!res0) { + res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19); + if (!res1) { + res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23); + if (!res0) { + res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27); + if (!res1) + res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31); + } + } + } + } + } + return res0 + res1; +} + +static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_store_si128((__m128i *)(dst_ptr), out0); + _mm_store_si128((__m128i *)(dst_ptr + 4), out1); +} + +static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_storeu_si128((__m128i *)(dst_ptr), out0); + _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ diff --git a/libs/libaom/src/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/libs/libaom/src/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm new file mode 100644 index 000000000..c1fb259a1 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -0,0 +1,379 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
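The check_epi16_overflow_* helpers above work because the high-bitdepth build uses saturating adds (ADD_EPI16 is _mm_adds_epi16 there): any lane that clipped sits exactly at INT16_MAX or INT16_MIN, so comparing against those two sentinels flags potential overflow and the caller falls back to the exact C path (aom_highbd_fdct8x8_c). Per lane:

#include <stdint.h>

/* Scalar model of the per-lane test inside check_epi16_overflow_x2(). */
static int lane_maybe_overflowed(int16_t v) {
  return v == INT16_MAX || v == INT16_MIN; /* the 0x7fff / 0x8000 sentinels */
}

Note that a legal result can land on a sentinel value, so the test is conservative; a hit only routes the block to the slower exact implementation.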
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 +pd_8192: times 4 dd 8192 + +%macro TRANSFORM_COEFFS 2 +pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 +pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1 +%endmacro + +TRANSFORM_COEFFS 11585, 11585 +TRANSFORM_COEFFS 15137, 6270 +TRANSFORM_COEFFS 16069, 3196 +TRANSFORM_COEFFS 9102, 13623 + +%macro STORE_OUTPUT 2 ; index, result + ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + ; _mm_store_si128((__m128i *)(dst_ptr), out0); + ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1); + pxor m11, m11 + pcmpgtw m11, m%2 + movdqa m12, m%2 + punpcklwd m%2, m11 + punpckhwd m12, m11 + mova [outputq + 4*%1 + 0], m%2 + mova [outputq + 4*%1 + 16], m12 +%endmacro + +SECTION .text + +%if ARCH_X86_64 +INIT_XMM ssse3 +cglobal fdct8x8, 3, 5, 13, input, output, stride + + mova m8, [GLOBAL(pd_8192)] + mova m12, [GLOBAL(pw_11585x2)] + + lea r3, [2 * strideq] + lea r4, [4 * strideq] + mova m0, [inputq] + mova m1, [inputq + r3] + lea inputq, [inputq + r4] + mova m2, [inputq] + mova m3, [inputq + r3] + lea inputq, [inputq + r4] + mova m4, [inputq] + mova m5, [inputq + r3] + lea inputq, [inputq + r4] + mova m6, [inputq] + mova m7, [inputq + r3] + + ; left shift by 2 to increase forward transformation precision + psllw m0, 2 + psllw m1, 2 + psllw m2, 2 + psllw m3, 2 + psllw m4, 2 + psllw m5, 2 + psllw m6, 2 + psllw m7, 2 + + ; column transform + ; stage 1 + paddw m10, m0, m7 + psubw m0, m7 + + paddw m9, m1, m6 + psubw m1, m6 + + paddw m7, m2, m5 + psubw m2, m5 + + paddw m6, m3, m4 + psubw m3, m4 + + ; stage 2 + paddw m5, m9, m7 + psubw m9, m7 + + paddw m4, m10, m6 + psubw m10, m6 + + paddw m7, m1, m2 + psubw m1, m2 + + ; stage 3 + paddw m6, m4, m5 + psubw m4, m5 + + pmulhrsw m1, m12 + pmulhrsw m7, m12 + + ; sin(pi / 8), cos(pi / 8) + punpcklwd m2, m10, m9 + punpckhwd m10, m9 + pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] + pmaddwd m2, [GLOBAL(pw_6270_m15137)] + pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] + pmaddwd m10, [GLOBAL(pw_6270_m15137)] + paddd m5, m8 + paddd m2, m8 + paddd m9, m8 + paddd m10, m8 + psrad m5, 14 + psrad m2, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m5, m9 + packssdw m2, m10 + + pmulhrsw m6, m12 + pmulhrsw m4, m12 + + paddw m9, m3, m1 + psubw m3, m1 + + paddw m10, m0, m7 + psubw m0, m7 + + ; stage 4 + ; sin(pi / 16), cos(pi / 16) + punpcklwd m1, m10, m9 + punpckhwd m10, m9 + pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] + pmaddwd m1, [GLOBAL(pw_3196_m16069)] + pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] + pmaddwd m10, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m1, m8 + paddd m9, m8 + paddd m10, m8 + psrad m7, 14 + psrad m1, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m7, m9 + packssdw m1, m10 + + ; sin(3 * pi / 16), cos(3 * pi / 16) + punpcklwd m11, m0, m3 + punpckhwd m0, m3 + pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] + pmaddwd m11, [GLOBAL(pw_13623_m9102)] + pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] + pmaddwd m0, [GLOBAL(pw_13623_m9102)] + paddd m9, m8 + paddd m11, m8 + paddd m3, m8 + paddd m0, m8 + psrad m9, 14 + psrad m11, 14 + psrad m3, 14 + psrad m0, 14 + packssdw m9, m3 + packssdw m11, m0 + + ; transpose + ; stage 1 + punpcklwd m0, m6, m7 + punpcklwd m3, m5, m11 + punpckhwd m6, m7 + punpckhwd m5, m11 + punpcklwd m7, m4, m9 + punpcklwd m10, m2, m1 + punpckhwd m4, m9 + punpckhwd m2, m1 + + ; stage 2 + punpckldq m9, m0, m3 + punpckldq m1, m6, m5 + punpckhdq m0, m3 + punpckhdq 
m6, m5 + punpckldq m3, m7, m10 + punpckldq m5, m4, m2 + punpckhdq m7, m10 + punpckhdq m4, m2 + + ; stage 3 + punpcklqdq m10, m9, m3 + punpckhqdq m9, m3 + punpcklqdq m2, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m3, m1, m5 + punpckhqdq m1, m5 + punpcklqdq m7, m6, m4 + punpckhqdq m6, m4 + + ; row transform + ; stage 1 + paddw m5, m10, m6 + psubw m10, m6 + + paddw m4, m9, m7 + psubw m9, m7 + + paddw m6, m2, m1 + psubw m2, m1 + + paddw m7, m0, m3 + psubw m0, m3 + + ;stage 2 + paddw m1, m5, m7 + psubw m5, m7 + + paddw m3, m4, m6 + psubw m4, m6 + + paddw m7, m9, m2 + psubw m9, m2 + + ; stage 3 + punpcklwd m6, m1, m3 + punpckhwd m1, m3 + pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] + pmaddwd m6, [GLOBAL(pw_11585_m11585)] + pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] + pmaddwd m1, [GLOBAL(pw_11585_m11585)] + paddd m2, m8 + paddd m6, m8 + paddd m3, m8 + paddd m1, m8 + psrad m2, 14 + psrad m6, 14 + psrad m3, 14 + psrad m1, 14 + packssdw m2, m3 + packssdw m6, m1 + + pmulhrsw m7, m12 + pmulhrsw m9, m12 + + punpcklwd m3, m5, m4 + punpckhwd m5, m4 + pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] + pmaddwd m3, [GLOBAL(pw_6270_m15137)] + pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] + pmaddwd m5, [GLOBAL(pw_6270_m15137)] + paddd m1, m8 + paddd m3, m8 + paddd m4, m8 + paddd m5, m8 + psrad m1, 14 + psrad m3, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m1, m4 + packssdw m3, m5 + + paddw m4, m0, m9 + psubw m0, m9 + + paddw m5, m10, m7 + psubw m10, m7 + + ; stage 4 + punpcklwd m9, m5, m4 + punpckhwd m5, m4 + pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] + pmaddwd m9, [GLOBAL(pw_3196_m16069)] + pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] + pmaddwd m5, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m9, m8 + paddd m4, m8 + paddd m5, m8 + psrad m7, 14 + psrad m9, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m7, m4 + packssdw m9, m5 + + punpcklwd m4, m10, m0 + punpckhwd m10, m0 + pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] + pmaddwd m4, [GLOBAL(pw_13623_m9102)] + pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] + pmaddwd m10, [GLOBAL(pw_13623_m9102)] + paddd m5, m8 + paddd m4, m8 + paddd m0, m8 + paddd m10, m8 + psrad m5, 14 + psrad m4, 14 + psrad m0, 14 + psrad m10, 14 + packssdw m5, m0 + packssdw m4, m10 + + ; transpose + ; stage 1 + punpcklwd m0, m2, m7 + punpcklwd m10, m1, m4 + punpckhwd m2, m7 + punpckhwd m1, m4 + punpcklwd m7, m6, m5 + punpcklwd m4, m3, m9 + punpckhwd m6, m5 + punpckhwd m3, m9 + + ; stage 2 + punpckldq m5, m0, m10 + punpckldq m9, m2, m1 + punpckhdq m0, m10 + punpckhdq m2, m1 + punpckldq m10, m7, m4 + punpckldq m1, m6, m3 + punpckhdq m7, m4 + punpckhdq m6, m3 + + ; stage 3 + punpcklqdq m4, m5, m10 + punpckhqdq m5, m10 + punpcklqdq m3, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m10, m9, m1 + punpckhqdq m9, m1 + punpcklqdq m7, m2, m6 + punpckhqdq m2, m6 + + psraw m1, m4, 15 + psraw m6, m5, 15 + psraw m8, m3, 15 + psraw m11, m0, 15 + + psubw m4, m1 + psubw m5, m6 + psubw m3, m8 + psubw m0, m11 + + psraw m4, 1 + psraw m5, 1 + psraw m3, 1 + psraw m0, 1 + + psraw m1, m10, 15 + psraw m6, m9, 15 + psraw m8, m7, 15 + psraw m11, m2, 15 + + psubw m10, m1 + psubw m9, m6 + psubw m7, m8 + psubw m2, m11 + + psraw m10, 1 + psraw m9, 1 + psraw m7, 1 + psraw m2, 1 + + STORE_OUTPUT 0, 4 + STORE_OUTPUT 8, 5 + STORE_OUTPUT 16, 3 + STORE_OUTPUT 24, 0 + STORE_OUTPUT 32, 10 + STORE_OUTPUT 40, 9 + STORE_OUTPUT 48, 7 + STORE_OUTPUT 56, 2 + + RET +%endif diff --git a/libs/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c b/libs/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c new file mode 100644 index 000000000..c500b0a26 --- /dev/null +++ 
b/libs/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" + +#include "av1/encoder/av1_quantize.h" + +static INLINE void highbd_load_b_values_avx2( + const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, + __m256i *round, const int16_t *quant_ptr, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, + __m256i *shift) { + *zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = _mm256_sub_epi32(*zbin, _mm256_set1_epi32(1)); + *round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr)); + *quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr)); + *dequant = + _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr)); + *shift = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)shift_ptr)); +} + +static INLINE void highbd_update_mask1_avx2(__m256i *cmp_mask, + const int16_t *iscan_ptr, + int *is_found, __m256i *mask) { + __m256i temp_mask = _mm256_setzero_si256(); + if (_mm256_movemask_epi8(*cmp_mask)) { + __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr)); + temp_mask = _mm256_and_si256(*cmp_mask, iscan); + *is_found = 1; + } + *mask = _mm256_max_epi16(temp_mask, *mask); +} + +static INLINE void highbd_update_mask0_avx2(__m256i *qcoeff0, __m256i *qcoeff1, + __m256i *threshold, + const int16_t *iscan_ptr, + int *is_found, __m256i *mask) { + __m256i coeff[2], cmp_mask0, cmp_mask1; + coeff[0] = _mm256_slli_epi32(*qcoeff0, AOM_QM_BITS); + cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm256_slli_epi32(*qcoeff1, AOM_QM_BITS); + cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]); + cmp_mask0 = + _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask); +} + +static INLINE void highbd_mul_shift_avx2(const __m256i *x, const __m256i *y, + __m256i *p, const int shift) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + + prod_lo = _mm256_srli_epi64(prod_lo, shift); + prod_hi = _mm256_srli_epi64(prod_hi, shift); + + prod_hi = _mm256_slli_epi64(prod_hi, 32); + *p = _mm256_blend_epi32(prod_lo, prod_hi, 0xaa); +} + +static INLINE void highbd_calculate_qcoeff_avx2(__m256i *coeff, + const __m256i *round, + const __m256i *quant, + const __m256i *shift, + const int *log_scale) { + __m256i tmp, qcoeff; + qcoeff = _mm256_add_epi32(*coeff, *round); + highbd_mul_shift_avx2(&qcoeff, quant, &tmp, 16); + qcoeff = _mm256_add_epi32(tmp, qcoeff); + highbd_mul_shift_avx2(&qcoeff, shift, coeff, 16 - *log_scale); +} + +static INLINE __m256i highbd_calculate_dqcoeff_avx2(__m256i qcoeff, + __m256i dequant) { + return _mm256_mullo_epi32(qcoeff, dequant); +} + +static 
INLINE __m256i highbd_calculate_dqcoeff_log_scale_avx2( + __m256i qcoeff, __m256i dequant, const int log_scale) { + __m256i abs_coeff = _mm256_abs_epi32(qcoeff); + highbd_mul_shift_avx2(&abs_coeff, &dequant, &abs_coeff, log_scale); + return _mm256_sign_epi32(abs_coeff, qcoeff); +} + +static INLINE void highbd_store_coefficients_avx2(__m256i coeff0, + __m256i coeff1, + tran_low_t *coeff_ptr) { + _mm256_store_si256((__m256i *)(coeff_ptr), coeff0); + _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff1); +} + +void aom_highbd_quantize_b_adaptive_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m256i zero = _mm256_setzero_si256(); + __m256i zbin, round, quant, dequant, shift; + __m256i coeff0, qcoeff0, coeff1, qcoeff1; + __m256i cmp_mask, mask0 = zero, mask1 = zero; + __m128i temp_mask0, temp_mask1; + int prescan_add[2]; + int thresh[2]; + const int log_scale = 0; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + __m256i threshold[2]; + threshold[0] = _mm256_set1_epi32(thresh[0]); + threshold[1] = _mm256_set1_epi32(thresh[1]); + threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + + // Setup global values. + highbd_load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, + &quant, dequant_ptr, &dequant, quant_shift_ptr, + &shift); + + // Do DC and first 15 AC. 
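highbd_mul_shift_avx2 above performs a full 32x32 -> 64 bit multiply and right shift per lane; the even/odd lane split exists because _mm256_mul_epi32 only multiplies the even 32-bit element of each 64-bit half. For the non-negative operands the quantizer feeds it (absolute values plus rounding), it reduces to:

#include <stdint.h>

/* Scalar model of highbd_mul_shift_avx2() for non-negative inputs. */
static int32_t mul_shift_scalar(int32_t x, int32_t y, int shift) {
  return (int32_t)(((int64_t)x * (int64_t)y) >> shift);
}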
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, + &mask0); + __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm256_unpackhi_epi64(zbin, zbin); + __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); + threshold[0] = threshold[1]; + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + // Reinsert signs + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + // Mask out zbin threshold coeffs + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr); + coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr); + } + + // AC only loop. 
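The movemask test above implements a whole-group skip: when no magnitude in a group of 16 clears the zero-bin, every output in the group is zero and the multiply work is bypassed. A scalar sketch of the same dead-zone check (illustrative, not part of the patch):

    #include <stdint.h>

    /* Returns 1 when every coefficient falls inside the zero-bin dead zone.
     * The vector code compares abs(coeff) > zbin - 1, i.e. abs(coeff) >= zbin,
     * having pre-decremented zbin during setup. */
    static int group_in_dead_zone(const int32_t *coeff, int n,
                                  int32_t zbin_dc, int32_t zbin_ac) {
      for (int i = 0; i < n; ++i) {
        const int32_t zbin = (i == 0) ? zbin_dc : zbin_ac;
        const int32_t a = coeff[i] < 0 ? -coeff[i] : coeff[i];
        if (a > zbin - 1) return 0;  /* would survive quantization */
      }
      return 1;
    }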
+ while (index < n_coeffs) { + coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); + index += 16; + continue; + } + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index); + coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant); + coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index); + index += 16; + } + if (is_found0) { + temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), + _mm256_extracti128_si256(mask0, 1)); + non_zero_count = calculate_non_zero_count(temp_mask0); + } + if (is_found1) { + temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), + _mm256_extracti128_si256(mask1, 1)); + non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); + } + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_highbd_quantize_b_32x32_adaptive_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; 
+ int is_found0 = 0, is_found1 = 0; + int eob = -1; + const int log_scale = 1; + const __m256i zero = _mm256_setzero_si256(); + __m256i zbin, round, quant, dequant, shift; + __m256i coeff0, qcoeff0, coeff1, qcoeff1; + __m256i cmp_mask, mask0 = zero, mask1 = zero; + __m128i temp_mask0, temp_mask1; + const __m256i one = _mm256_set1_epi32(1); + const __m256i log_scale_vec = _mm256_set1_epi32(log_scale); + int prescan_add[2]; + int thresh[2]; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + __m256i threshold[2]; + threshold[0] = _mm256_set1_epi32(thresh[0]); + threshold[1] = _mm256_set1_epi32(thresh[1]); + threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + + // Setup global values. + zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr)); + round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr)); + quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr)); + dequant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr)); + shift = + _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_shift_ptr)); + + // Shift with rounding. + zbin = _mm256_add_epi32(zbin, log_scale_vec); + round = _mm256_add_epi32(round, log_scale_vec); + zbin = _mm256_srli_epi32(zbin, log_scale); + round = _mm256_srli_epi32(round, log_scale); + zbin = _mm256_sub_epi32(zbin, one); + + // Do DC and first 15 AC. + coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, + &mask0); + __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm256_permute2x128_si256(zbin, zbin, 0x11); + __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); + threshold[0] = threshold[1]; + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); + round = _mm256_permute2x128_si256(round, round, 0x11); + quant = _mm256_permute2x128_si256(quant, quant, 0x11); + shift = _mm256_permute2x128_si256(shift, shift, 0x11); + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11); + } else { + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + round = _mm256_permute2x128_si256(round, round, 0x11); + quant = _mm256_permute2x128_si256(quant, quant, 0x11); + shift = _mm256_permute2x128_si256(shift, shift, 0x11); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + // Reinsert signs + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + // Mask out zbin threshold coeffs + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr); + coeff0 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, 
log_scale); + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11); + coeff1 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr); + } + + // AC only loop. + while (index < n_coeffs) { + coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); + index += 16; + continue; + } + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index); + coeff0 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale); + coeff1 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index); + index += 16; + } + if (is_found0) { + temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), + _mm256_extracti128_si256(mask0, 1)); + non_zero_count = calculate_non_zero_count(temp_mask0); + } + if (is_found1) { + temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), + _mm256_extracti128_si256(mask1, 1)); + non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); + } + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/libs/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c b/libs/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c new file mode 100644 
index 000000000..8f31f3596 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c @@ -0,0 +1,732 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" +#include "av1/encoder/av1_quantize.h" + +static INLINE __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi64(a, sign); +} + +static INLINE void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y, + __m128i *p, const int shift) { + __m128i sign = _mm_srai_epi32(*y, 31); + __m128i sign_lo = _mm_unpacklo_epi32(sign, sign); + __m128i sign_hi = _mm_unpackhi_epi32(sign, sign); + __m128i abs_y = invert_sign_32_sse2(*y, sign); + __m128i prod_lo = _mm_mul_epu32(*x, abs_y); + __m128i prod_hi = _mm_srli_epi64(*x, 32); + const __m128i mult_hi = _mm_srli_epi64(abs_y, 32); + prod_hi = _mm_mul_epu32(prod_hi, mult_hi); + prod_lo = highbd_invert_sign_64bit_sse2(prod_lo, sign_lo); + prod_hi = highbd_invert_sign_64bit_sse2(prod_hi, sign_hi); + + prod_lo = _mm_srli_epi64(prod_lo, shift); + const __m128i mask = _mm_set_epi32(0, -1, 0, -1); + prod_lo = _mm_and_si128(prod_lo, mask); + prod_hi = _mm_srli_epi64(prod_hi, shift); + + prod_hi = _mm_slli_epi64(prod_hi, 32); + *p = _mm_or_si128(prod_lo, prod_hi); +} + +static INLINE void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round, + const __m128i *quant, + const __m128i *shift, + const int *log_scale) { + __m128i tmp, qcoeff; + qcoeff = _mm_add_epi32(*coeff, *round); + highbd_mul_shift_sse2(&qcoeff, quant, &tmp, 16); + qcoeff = _mm_add_epi32(tmp, qcoeff); + highbd_mul_shift_sse2(&qcoeff, shift, coeff, 16 - *log_scale); +} + +static INLINE void highbd_update_mask1(__m128i *cmp_mask0, + const int16_t *iscan_ptr, int *is_found, + __m128i *mask) { + __m128i temp_mask = _mm_setzero_si128(); + if (_mm_movemask_epi8(*cmp_mask0)) { + __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr)); + __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0); + temp_mask = mask0; + *is_found = 1; + } + *mask = _mm_max_epi16(temp_mask, *mask); +} + +static INLINE void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, + __m128i *threshold, + const int16_t *iscan_ptr, int *is_found, + __m128i *mask) { + __m128i coeff[2], cmp_mask0, cmp_mask1; + + coeff[0] = _mm_slli_epi32(*qcoeff0, AOM_QM_BITS); + cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm_slli_epi32(*qcoeff1, AOM_QM_BITS); + cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]); + + cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1); + + highbd_update_mask1(&cmp_mask0, iscan_ptr, is_found, mask); +} + +static INLINE __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant, + const int log_scale) { + __m128i coeff_sign = _mm_srai_epi32(qcoeff, 31); + __m128i abs_coeff = invert_sign_32_sse2(qcoeff, coeff_sign); + highbd_mul_shift_sse2(&abs_coeff, &dequant, &abs_coeff, log_scale); + return invert_sign_32_sse2(abs_coeff, coeff_sign); +} + +void
aom_highbd_quantize_b_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 8; + const int log_scale = 0; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, cmp_mask; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + __m128i zbin_sign = _mm_srai_epi16(zbin, 15); + __m128i round_sign = _mm_srai_epi16(round, 15); + __m128i quant_sign = _mm_srai_epi16(quant, 15); + __m128i dequant_sign = _mm_srai_epi16(dequant, 15); + __m128i shift_sign = _mm_srai_epi16(shift, 15); + + zbin = _mm_unpacklo_epi16(zbin, zbin_sign); + round = _mm_unpacklo_epi16(round, round_sign); + quant = _mm_unpacklo_epi16(quant, quant_sign); + dequant = _mm_unpacklo_epi16(dequant, dequant_sign); + shift = _mm_unpacklo_epi16(shift, shift_sign); + zbin = _mm_sub_epi32(zbin, one); + + // Do DC and first 7 AC.
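The unpacklo/srai pairs above are the SSE2 substitute for _mm_cvtepi16_epi32, which needs SSE4.1: interleaving each 16-bit value with its own arithmetic sign mask widens it to 32 bits. A minimal sketch of the idiom (function name ours):

    #include <emmintrin.h>

    /* Widen the low four int16 lanes of v to int32 using SSE2 only. */
    static __m128i widen_low4_epi16(__m128i v) {
      const __m128i sign = _mm_srai_epi16(v, 15);  /* 0x0000 or 0xffff per lane */
      return _mm_unpacklo_epi16(v, sign);
    }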
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); + } + + // AC only loop. 
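invert_sign_32_sse2 (from quantize_x86.h) relies on the two's-complement identity used throughout this loop: with s = a >> 31, (a ^ s) - s yields abs(a), and applying the same transform again with the saved mask restores the original sign. Scalar form of the identity (sketch):

    #include <stdint.h>

    /* Conditional negate: sign must be 0 or -1.
     * abs:     invert_sign32(a, a >> 31)
     * restore: invert_sign32(abs_value, original >> 31) */
    static int32_t invert_sign32(int32_t a, int32_t sign) {
      return (a ^ sign) - sign;
    }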
+ while (index < n_coeffs) { + coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + index += 8; + continue; + } + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); + + index += 8; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_highbd_quantize_b_32x32_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 8; + const int log_scale = 1; + int 
non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + const __m128i log_scale_vec = _mm_set1_epi32(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, cmp_mask; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + __m128i zbin_sign = _mm_srai_epi16(zbin, 15); + __m128i round_sign = _mm_srai_epi16(round, 15); + __m128i quant_sign = _mm_srai_epi16(quant, 15); + __m128i dequant_sign = _mm_srai_epi16(dequant, 15); + __m128i shift_sign = _mm_srai_epi16(shift, 15); + + zbin = _mm_unpacklo_epi16(zbin, zbin_sign); + round = _mm_unpacklo_epi16(round, round_sign); + quant = _mm_unpacklo_epi16(quant, quant_sign); + dequant = _mm_unpacklo_epi16(dequant, dequant_sign); + shift = _mm_unpacklo_epi16(shift, shift_sign); + + // Shift with rounding. + zbin = _mm_add_epi32(zbin, log_scale_vec); + round = _mm_add_epi32(round, log_scale_vec); + zbin = _mm_srli_epi32(zbin, log_scale); + round = _mm_srli_epi32(round, log_scale); + zbin = _mm_sub_epi32(zbin, one); + + // Do DC and first 7 AC.
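A small point worth noting in the shift-with-rounding block above: for log_scale in {1, 2}, the rounding constant (1 << log_scale) >> 1 equals log_scale itself, which is why the code can reuse the broadcast log_scale_vec as the rounding bias before the right shift. Scalar equivalent (sketch):

    #include <stdint.h>

    /* ROUND_POWER_OF_TWO(v, n) == (v + ((1 << n) >> 1)) >> n; the bias is 1
     * for n == 1 and 2 for n == 2, i.e. equal to n in both cases. */
    static int32_t round_shift(int32_t v, int n) {
      return (v + ((1 << n) >> 1)) >> n;
    }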
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); + } + + // AC only loop. 
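The threshold fed to highbd_update_mask0 combines the zero-bin with a dequant-scaled margin; coefficients below it are candidates for the end-of-block trim performed after the loop. A scalar sketch of the test (function name ours; EOB_FACTOR and AOM_QM_BITS as used in this file):

    #include <stdint.h>

    /* Sketch of the prescan test: abs(coeff) << AOM_QM_BITS is compared
     * against zbin * wt + ROUND_POWER_OF_TWO(dequant * EOB_FACTOR, 7) - 1,
     * matching the thresh[] setup and the shift in highbd_update_mask0. */
    static int passes_prescan(int32_t abs_coeff, int32_t zbin, int32_t dequant,
                              int eob_factor, int qm_bits) {
      const int32_t prescan_add = (dequant * eob_factor + 64) >> 7;
      const int64_t thresh = (int64_t)zbin * (1 << qm_bits) + prescan_add - 1;
      return ((int64_t)abs_coeff << qm_bits) > thresh;
    }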
+ while (index < n_coeffs) { + coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + index += 8; + continue; + } + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); + + index += 8; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_highbd_quantize_b_64x64_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 8; + const int log_scale = 2; + int 
non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + const __m128i log_scale_vec = _mm_set1_epi32(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, cmp_mask; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + __m128i zbin_sign = _mm_srai_epi16(zbin, 15); + __m128i round_sign = _mm_srai_epi16(round, 15); + __m128i quant_sign = _mm_srai_epi16(quant, 15); + __m128i dequant_sign = _mm_srai_epi16(dequant, 15); + __m128i shift_sign = _mm_srai_epi16(shift, 15); + + zbin = _mm_unpacklo_epi16(zbin, zbin_sign); + round = _mm_unpacklo_epi16(round, round_sign); + quant = _mm_unpacklo_epi16(quant, quant_sign); + dequant = _mm_unpacklo_epi16(dequant, dequant_sign); + shift = _mm_unpacklo_epi16(shift, shift_sign); + + // Shift with rounding. + zbin = _mm_add_epi32(zbin, log_scale_vec); + round = _mm_add_epi32(round, log_scale_vec); + zbin = _mm_srli_epi32(zbin, log_scale); + round = _mm_srli_epi32(round, log_scale); + zbin = _mm_sub_epi32(zbin, one); + + // Do DC and first 7 AC.
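This 64x64 kernel differs from the 32x32 one above only in log_scale (2 instead of 1). Per coefficient, highbd_calculate_dqcoeff then reduces to the scalar sketch below: the scaled product is formed on the magnitude and the sign re-applied, mirroring the invert_sign_32_sse2 usage (illustrative name):

    #include <stdint.h>

    /* Scaled dequantization on the magnitude, sign restored afterwards. */
    static int32_t dequant_coeff_scalar(int32_t qcoeff, int32_t dequant,
                                        int log_scale) {
      const int32_t a = qcoeff < 0 ? -qcoeff : qcoeff;
      const int64_t dq = ((int64_t)a * dequant) >> log_scale;
      return qcoeff < 0 ? (int32_t)-dq : (int32_t)dq;
    }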
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); + } + + // AC only loop. 
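How the masks turn into counts: highbd_update_mask1 keeps, per lane, the largest iscan (scan position) of any passing coefficient, and calculate_non_zero_count (declared in quantize_x86.h, not shown in this patch) reduces that to "highest passing scan position + 1". A hedged scalar model of that reduction, with passes[] standing in for the per-coefficient compare results:

    #include <stdint.h>

    /* Scalar model of the mask reduction; illustrative only. */
    static int non_zero_count_scalar(const int16_t *iscan, const int *passes,
                                     int n) {
      int max_pos = -1;
      for (int i = 0; i < n; ++i)
        if (passes[i] && iscan[i] > max_pos) max_pos = iscan[i];
      return max_pos + 1;
    }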
+ while (index < n_coeffs) { + coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + index += 8; + continue; + } + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); + + index += 8; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/libs/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c b/libs/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c new file mode 100644 index 000000000..b43a7d7b5 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c @@ -0,0 +1,1323 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <assert.h> +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" + +// ----------------------------------------------------------------------------- +// Copy and average + +static const uint8_t ip_shuffle_f2f3[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; +static const uint8_t ip_shuffle_f4f5[32] = { 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13, + 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13 }; + +void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int width, int h, int bd) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + assert(width % 4 == 0); + if (width > 32) { // width = 64 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + _mm256_storeu_si256((__m256i *)(dst + 32), p2); + _mm256_storeu_si256((__m256i *)(dst + 48), p3); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 16) { // width = 32 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 8) { // width = 16 + __m256i p0, p1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + p1 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + + _mm256_storeu_si256((__m256i *)dst, p0); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (width > 4) { // width = 8 + __m128i p0, p1; + do { + p0 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + p1 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + + _mm_storeu_si128((__m128i *)dst, p0); + dst += dst_stride; + _mm_storeu_si128((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // width = 4 + __m128i p0, p1; + do { + p0 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + p1 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + + _mm_storel_epi64((__m128i *)dst, p0); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } +} + +void
av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + __m256i s[8], coeffs_y[4]; + + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m256i src6; + __m256i s01 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + 0x20); + __m256i s12 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + 0x20); + __m256i s23 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + 0x20); + __m256i s34 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + 0x20); + __m256i s45 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + 0x20); + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + __m256i s56 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + src6, 0x20); + + s[0] = _mm256_unpacklo_epi16(s01, s12); + s[1] = _mm256_unpacklo_epi16(s23, s34); + s[2] = _mm256_unpacklo_epi16(s45, s56); + + s[4] = _mm256_unpackhi_epi16(s01, s12); + s[5] = _mm256_unpackhi_epi16(s23, s34); + s[6] = _mm256_unpackhi_epi16(s45, s56); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + const __m256i s67 = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + + const __m256i s78 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi16(s67, s78); + s[7] = _mm256_unpackhi_epi16(s67, s78); + + const __m256i res_a = convolve(s, coeffs_y); + + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m256i res_b = convolve(s + 4, coeffs_y); + __m256i 
res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_bits), round_shift_bits); + + __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); + res_16bit = _mm256_max_epi16(res_16bit, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_16bit)); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_16bit, 1)); + } else if (w == 4) { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } else { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + xx_storel_32((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + (void)subpel_y_qn; + (void)filter_params_y; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m256i s[4], coeffs_x[4]; + + const __m256i round_const_x = + _mm256_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits), + round_shift_bits); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits), + round_shift_bits); + + __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); + __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); + + __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + if (w - j > 4) { + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } else if (w == 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } else { + xx_storel_32((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } + } + } +} + +#define CONV8_ROUNDING_BITS (7) + +// ----------------------------------------------------------------------------- +// Horizontal and vertical filtering + +static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; + +static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13, + 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13 }; + +static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15, + 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15 }; + +static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; + +// ----------------------------------------------------------------------------- +// Horizontal Filtering + +static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); + const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); + const __m256i c = 
_mm256_permutevar8x32_epi32(*s, idx); + + p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6 + p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7 + p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4 + p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5 +} + +// Note: +// Shared by 8x2 and 16x1 block +static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, + __m256i *x /*x[8]*/) { + __m256i pp[8]; + pack_pixels(s0, pp); + pack_pixels(s1, &pp[4]); + x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20); + x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20); + x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20); + x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20); + x[4] = x[2]; + x[5] = x[3]; + x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31); + x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); +} + +static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) { + __m256i pp[8]; + __m256i s0; + s0 = _mm256_loadu_si256((const __m256i *)src); + pack_pixels(&s0, pp); + x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); + x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); + x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); + x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); +} + +static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, + __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); + pack_16_pixels(&s0, &s1, x); +} + +static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_pixels(&s0, &s1, x); +} + +// Note: +// Shared by horizontal and vertical filtering +static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p0 = _mm256_set1_epi32(0x03020100); + const __m256i p1 = _mm256_set1_epi32(0x07060504); + const __m256i p2 = _mm256_set1_epi32(0x0b0a0908); + const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c); + f[0] = _mm256_shuffle_epi8(hh, p0); + f[1] = _mm256_shuffle_epi8(hh, p1); + f[2] = _mm256_shuffle_epi8(hh, p2); + f[3] = _mm256_shuffle_epi8(hh, p3); +} + +static INLINE void pack_filters_4tap(const int16_t *filter, + __m256i *f /*f[4]*/) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i coeff = _mm256_broadcastsi128_si256(h); + + // coeffs 2 3 2 3 2 3 2 3 + f[0] = _mm256_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + f[1] = _mm256_shuffle_epi32(coeff, 0xaa); +} + +static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, + const __m256i *fil /*fil[4]*/, + __m256i *y) { + __m256i a, a0, a1; + + a0 = _mm256_madd_epi16(fil[0], sig[0]); + a1 = _mm256_madd_epi16(fil[3], sig[3]); + a = _mm256_add_epi32(a0, a1); + + a0 = _mm256_madd_epi16(fil[1], sig[1]); + a1 = _mm256_madd_epi16(fil[2], sig[2]); + + { + const __m256i min = _mm256_min_epi32(a0, a1); + a = _mm256_add_epi32(a, min); + } + { + const __m256i max = _mm256_max_epi32(a0, a1); + a = _mm256_add_epi32(a, max); + } + { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + a = _mm256_add_epi32(a, rounding); + *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); + } +} + +static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask, + uint16_t *dst) { + const __m128i a0 = _mm256_castsi256_si128(*y); + const __m128i 
a1 = _mm256_extractf128_si256(*y, 1); + __m128i res = _mm_packus_epi32(a0, a1); + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + _mm_storeu_si128((__m128i *)dst, res); +} + +static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static void aom_highbd_filter_block1d8_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void aom_highbd_filter_block1d16_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void aom_highbd_filter_block1d4_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i ff[2], s[2]; + uint32_t i; + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + static const uint8_t shuffle_mask[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; + + __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask); + __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3); + __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5); + + pack_filters_4tap(filter, ff); + src_ptr -= 3; + for (i = 0; i <= (height - 2); i += 2) { + __m256i row0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2])); + __m256i row1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i + 1) * src_pitch + 2])); + + s[0] = _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1); + s[1] = _mm256_alignr_epi8(s[0], s[0], 4); + + s[0] = _mm256_shuffle_epi8(s[0], mask); + s[1] = _mm256_shuffle_epi8(s[1], mask); + + __m256i res = convolve_4tap(s, ff); + res = + _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); + + res = _mm256_packs_epi32(res, res); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst_ptr[(i + 1) * dst_pitch], + _mm256_extracti128_si256(res, 1)); + } + if (height % 2 != 0) { + i = height - 1; + const __m256i row0_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2])); + const __m256i row0_1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 6])); + + const __m256i r0 = + _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1); + + s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3); + s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5); + + __m256i res = convolve_4tap(s, ff); + res = + _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); + + res = _mm256_packs_epi32(res, res); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + } +} + +static void aom_highbd_filter_block1d8_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i ff[2], s[2]; + uint32_t i = 0; + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + static const uint8_t shuffle_mask[32] = { 0, 1, 8, 9, 2, 3, 10, 11, + 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, + 4, 5, 12, 13, 6, 7, 14, 15 }; + + __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask); + __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3); + __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5); + + pack_filters_4tap(filter, ff); + src_ptr -= 3; + + /* Horizontal filter */ + + for (i = 0; i <= (height - 2); i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_pitch + 2]); + + const __m256i r0 = + _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = r0; + s[1] = _mm256_alignr_epi8(r1, r0, 4); + + __m256i res_even = convolve_4tap(s, ff); + res_even = _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding), + CONV8_ROUNDING_BITS); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + + __m256i res_odd = convolve_4tap(s, ff); + res_odd = _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding), + CONV8_ROUNDING_BITS); + + __m256i res = _mm256_packs_epi32(res_even, res_odd); + res = _mm256_shuffle_epi8(res, mask); + + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], + _mm256_extracti128_si256(res, 1)); + } + + if (height % 2 != 0) { + i = height - 1; + const __m256i row0_0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]); + const __m256i row0_1 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 6]); + + const __m256i r0 = + _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1); + + s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3); + s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5); + + __m256i res = convolve_4tap(s, ff); + res = + _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); + + res = _mm256_packs_epi32(res, res); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + 4], + _mm256_extracti128_si256(res, 1)); + } +} + +static void aom_highbd_filter_block1d16_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_h4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + aom_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8, + dst_pitch, height, filter, bd); +} + +// ----------------------------------------------------------------------------- +// 2-tap horizontal filtering + +static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p = _mm256_set1_epi32(0x09080706); + f[0] = _mm256_shuffle_epi8(hh, p); +} + +// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() +// the difference is s0/s1 specifies first and second rows or, +// first 16 samples and 8-sample shifted 16 samples 
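+// In either case the packed result is two registers of interleaved
+// (x[i], x[i+1]) 16-bit pairs, which line up with the duplicated
+// (filter[3], filter[4]) pair produced by pack_2t_filter() so that a
+// single _mm256_madd_epi16 evaluates the whole 2-tap filter.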
+static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, + __m256i *sig) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i x0 = _mm256_shuffle_epi8(*s0, sf2); + __m256i x1 = _mm256_shuffle_epi8(*s1, sf2); + __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx); + __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + r1 = _mm256_shuffle_epi8(r1, sf2); + sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20); + sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); +} + +static INLINE void pack_8x2_2t_pixels(const uint16_t *src, + const ptrdiff_t pitch, __m256i *sig) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_16x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_8x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x0 = _mm256_shuffle_epi8(r0, sf2); + r0 = _mm256_permutevar8x32_epi32(r0, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20); +} + +// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() +static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + __m256i x1 = _mm256_madd_epi16(sig[1], *f); + x0 = _mm256_add_epi32(x0, rounding); + x1 = _mm256_add_epi32(x1, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + x0 = _mm256_add_epi32(x0, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); +} + +static void aom_highbd_filter_block1d8_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, signal); + filter_8x1_2t_pixels(signal, &ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void aom_highbd_filter_block1d16_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i 
ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// Vertical Filtering + +static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)); + __m256i s1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch))); + __m256i s2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 2 * pitch))); + __m256i s3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 3 * pitch))); + __m256i s4 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 4 * pitch))); + __m256i s5 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 5 * pitch))); + __m256i s6 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 6 * pitch))); + + s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1); + s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1); + s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1); + s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1); + s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1); + + sig[0] = _mm256_unpacklo_epi16(s0, s1); + sig[4] = _mm256_unpackhi_epi16(s0, s1); + sig[1] = _mm256_unpacklo_epi16(s2, s3); + sig[5] = _mm256_unpackhi_epi16(s2, s3); + sig[2] = _mm256_unpacklo_epi16(s4, s5); + sig[6] = _mm256_unpackhi_epi16(s4, s5); + sig[8] = s6; +} + +static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + __m256i s0 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 7 * pitch))); + // base + 8th row + __m256i s1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 8 * pitch))); + __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1); + __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + sig[3] = _mm256_unpacklo_epi16(s2, s3); + sig[7] = _mm256_unpackhi_epi16(s2, s3); + sig[8] = s1; +} + +static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_8x1_pixels(sig, f, y0); + filter_8x1_pixels(&sig[4], f, y1); +} + +static INLINE void update_pixels(__m256i *sig) { + int i; + for (i = 0; i < 3; ++i) { + sig[i] = sig[i + 1]; + sig[i + 4] = sig[i + 5]; + } +} + +static void aom_highbd_filter_block1d8_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + filter_8x9_pixels(signal, ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i u0, u1, u2, u3; + // load 0-6 rows + const __m256i s0 = _mm256_loadu_si256((const 
__m256i *)src); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch)); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch)); + const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch)); + const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch)); + const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch)); + + u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low + u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high + + u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low + u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high + + sig[0] = _mm256_unpacklo_epi16(u0, u2); + sig[4] = _mm256_unpackhi_epi16(u0, u2); + + sig[8] = _mm256_unpacklo_epi16(u1, u3); + sig[12] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s2, s3, 0x20); + u1 = _mm256_permute2x128_si256(s2, s3, 0x31); + + u2 = _mm256_permute2x128_si256(s3, s4, 0x20); + u3 = _mm256_permute2x128_si256(s3, s4, 0x31); + + sig[1] = _mm256_unpacklo_epi16(u0, u2); + sig[5] = _mm256_unpackhi_epi16(u0, u2); + + sig[9] = _mm256_unpacklo_epi16(u1, u3); + sig[13] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s4, s5, 0x20); + u1 = _mm256_permute2x128_si256(s4, s5, 0x31); + + u2 = _mm256_permute2x128_si256(s5, s6, 0x20); + u3 = _mm256_permute2x128_si256(s5, s6, 0x31); + + sig[2] = _mm256_unpacklo_epi16(u0, u2); + sig[6] = _mm256_unpackhi_epi16(u0, u2); + + sig[10] = _mm256_unpacklo_epi16(u1, u3); + sig[14] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s6; +} + +static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch)); + // base + 8th row + const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch)); + + __m256i u0, u1, u2, u3; + u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20); + u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31); + + u2 = _mm256_permute2x128_si256(s7, s8, 0x20); + u3 = _mm256_permute2x128_si256(s7, s8, 0x31); + + sig[3] = _mm256_unpacklo_epi16(u0, u2); + sig[7] = _mm256_unpackhi_epi16(u0, u2); + + sig[11] = _mm256_unpacklo_epi16(u1, u3); + sig[15] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s8; +} + +static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + __m256i res[4]; + int i; + for (i = 0; i < 4; ++i) { + filter_8x1_pixels(&sig[i << 2], f, &res[i]); + } + + { + const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); + const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); + *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); + *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); + } +} + +static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i p = _mm256_min_epi16(*y0, *mask); + _mm256_storeu_si256((__m256i *)dst, p); + p = _mm256_min_epi16(*y1, *mask); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static void update_16x9_pixels(__m256i *sig) { + update_pixels(&sig[0]); + update_pixels(&sig[8]); +} + +static void aom_highbd_filter_block1d16_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[17], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + 
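// pack_filters() splats the eight taps into four registers of repeated + // (tap[2i], tap[2i+1]) pairs, so each _mm256_madd_epi16 in the filter + // step folds two taps per 32-bit lane. +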
pack_filters(filter, ff); + + pack_16x9_init(src_ptr, src_pitch, signal); + + do { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_16x9_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void aom_highbd_filter_block1d4_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi32(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + uint32_t i; + __m256i s[2], ff[2]; + + pack_filters_4tap(filter, ff); + + const uint16_t *data = src_ptr; + /* Vertical filter */ + { + __m128i s2 = _mm_loadl_epi64((__m128i *)(data + 2 * src_pitch)); + __m128i s3 = _mm_loadl_epi64((__m128i *)(data + 3 * src_pitch)); + + __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1); + + __m128i s4 = _mm_loadl_epi64((__m128i *)(data + 4 * src_pitch)); + + __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1); + + s[0] = _mm256_unpacklo_epi16(s23, s34); + + for (i = 0; i < height; i += 2) { + data = &src_ptr[i * src_pitch]; + + __m128i s5 = _mm_loadl_epi64((__m128i *)(data + 5 * src_pitch)); + __m128i s6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_pitch)); + + __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1); + __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1); + + s[1] = _mm256_unpacklo_epi16(s45, s56); + + const __m256i res_a = convolve_4tap(s, ff); + + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); + + __m256i res_16bit = _mm256_min_epi32(res_a_round, clip_pixel); + res_16bit = _mm256_max_epi32(res_16bit, zero); + res_16bit = _mm256_packs_epi32(res_16bit, res_16bit); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res_16bit)); + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], + _mm256_extracti128_si256(res_16bit, 1)); + + s[0] = s[1]; + s4 = s6; + } + } +} + +static void aom_highbd_filter_block1d8_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + __m256i s[4], ff[2]; + uint32_t i; + pack_filters_4tap(filter, ff); + + const uint16_t *data = src_ptr; + /* Vertical filter */ + { + __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_pitch)); + __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_pitch)); + + __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1); + + __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_pitch)); + + __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1); + + s[0] = _mm256_unpacklo_epi16(s23, s34); + s[2] = _mm256_unpackhi_epi16(s23, s34); + + for (i = 0; i < height; i += 2) { + data = &src_ptr[i * src_pitch]; + + __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_pitch)); + __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_pitch)); + + __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1); + __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1); + + s[1] = _mm256_unpacklo_epi16(s45, s56); + s[3] = _mm256_unpackhi_epi16(s45, s56); + + const __m256i res_a = convolve_4tap(s, ff); + + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); + + const __m256i res_b = convolve_4tap(s + 2, ff); + __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_bits), round_shift_bits); + + __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); + res_16bit = _mm256_max_epi16(res_16bit, zero); + + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res_16bit)); + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], + _mm256_extracti128_si256(res_16bit, 1)); + + s[0] = s[1]; + s[2] = s[3]; + s4 = s6; + } + } +} + +static void aom_highbd_filter_block1d16_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_v4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + + aom_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8, + dst_pitch, height, filter, bd); +} + +// ----------------------------------------------------------------------------- +// 2-tap vertical filtering + +static void pack_16x2_init(const uint16_t *src, __m256i *sig) { + sig[2] = _mm256_loadu_si256((const __m256i *)src); +} + +static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // load the next row + const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch)); + sig[0] = _mm256_unpacklo_epi16(sig[2], u); + sig[1] = _mm256_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_16_2t_pixels(sig, f, y0, y1); +} + +static void aom_highbd_filter_block1d16_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[3], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + __m256i ff; + + pack_2t_filter(filter, &ff); + pack_16x2_init(src_ptr, signal); + + do { + pack_16x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16x2_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + 
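+// The 128-bit helpers below handle the same 2-tap vertical filtering for
+// 8-wide columns: sig[2] caches the most recently loaded row, so each
+// iteration loads one new row and interleaves it with the cached one.
+// As a rough scalar sketch of what aom_highbd_filter_block1d8_v2_avx2
+// computes (illustration only; f3/f4 are the two middle taps selected by
+// pack_8x1_2t_filter):
+//   for (r = 0; r < height; ++r)
+//     for (c = 0; c < 8; ++c) {
+//       int32_t v = filter[3] * src[r * src_pitch + c] +
+//                   filter[4] * src[(r + 1) * src_pitch + c];
+//       v = (v + (1 << (CONV8_ROUNDING_BITS - 1))) >> CONV8_ROUNDING_BITS;
+//       dst[r * dst_pitch + c] =
+//           (uint16_t)AOMMIN(AOMMAX(v, 0), (1 << bd) - 1);
+//     }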
+static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m128i p = _mm_set1_epi32(0x09080706); + f[0] = _mm_shuffle_epi8(h, p); +} + +static void pack_8x2_init(const uint16_t *src, __m128i *sig) { + sig[2] = _mm_loadu_si128((const __m128i *)src); +} + +static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, + __m128i *sig) { + // load the next row + const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch)); + sig[0] = _mm_unpacklo_epi16(sig[2], u); + sig[1] = _mm_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, + __m128i *y0, __m128i *y1) { + const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m128i x0 = _mm_madd_epi16(sig[0], *f); + __m128i x1 = _mm_madd_epi16(sig[1], *f); + x0 = _mm_add_epi32(x0, rounding); + x1 = _mm_add_epi32(x1, rounding); + *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, + const __m128i *mask, uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + res = _mm_min_epi16(res, *mask); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void aom_highbd_filter_block1d8_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m128i signal[3], res0, res1; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + __m128i ff; + + pack_8x1_2t_filter(filter, &ff); + pack_8x2_init(src_ptr, signal); + + do { + pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); + filter_8_2t_pixels(signal, &ff, &res0, &res1); + store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2 +#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2 +#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2 +#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2 + +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); + +#undef HIGHBD_FUNC diff --git a/libs/libaom/src/aom_dsp/x86/highbd_convolve_sse2.c b/libs/libaom/src/aom_dsp/x86/highbd_convolve_sse2.c new file mode 100644 index 000000000..a2bb28322 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_convolve_sse2.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <emmintrin.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/x86/convolve.h" + +// ----------------------------------------------------------------------------- + +void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg34_lo; + __m128i srcReg45_lo, srcReg56_lo; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_45_lo, resReg34_56_lo; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg64, secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + addFilterReg64 = _mm_set1_epi32(64); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = dst_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4); + + for (i = height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5); + + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters); + resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters); + resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters); + resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters); + + resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo); + + // shift by 7 bit each 32 bit + resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64); + resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64); + resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7); + resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7); + + // shrink to 16 bit each 32 bits, the first lane contains the first + // convolve result and the second lane contains the second convolve + // result + resReg23_45 = _mm_packs_epi32(resReg23_45_lo, _mm_setzero_si128()); + resReg34_56 = _mm_packs_epi32(resReg34_56_lo, _mm_setzero_si128()); + + resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128()); + resReg23_45 = _mm_min_epi16(resReg23_45, max); + resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128()); + resReg34_56 = 
_mm_min_epi16(resReg34_56, max); + + src_ptr += src_stride; + + _mm_storel_epi64((__m128i *)dst_ptr, (resReg23_45)); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), (resReg34_56)); + + dst_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_lo = srcReg45_lo; + srcReg34_lo = srcReg56_lo; + srcReg4 = srcReg6; + } +} + +void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i addFilterReg64; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1; + __m128i srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg64 = _mm_set1_epi32(64); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2)); + + __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6); + __m128i ss_23 = _mm_unpacklo_epi32(srcReg32b1, ss_3_1); + __m128i ss_45 = _mm_unpacklo_epi32(ss_4_1, ss_5_1); + + ss_23 = _mm_madd_epi16(ss_23, secondFilters); + ss_45 = _mm_madd_epi16(ss_45, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(ss_23, ss_45); + + // shift by 7 bit each 32 bit + srcRegFilt32b1_1 = _mm_add_epi32(srcRegFilt32b1_1, addFilterReg64); + srcRegFilt32b1_1 = _mm_srai_epi32(srcRegFilt32b1_1, 7); + + srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128()); + srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max); + + src_ptr += src_pitch; + + _mm_storel_epi64((__m128i *)dst_ptr, srcRegFilt32b1_1); + + dst_ptr += dst_pitch; + } +} + +void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; + __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; + __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg64, secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + addFilterReg64 = _mm_set1_epi32(64); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = 
dst_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3); + srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4); + srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4); + + for (i = height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + + srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5); + srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + + srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6); + srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters); + resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters); + resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters); + resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters); + + resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_hi = _mm_madd_epi16(srcReg23_hi, secondFilters); + resReg34_hi = _mm_madd_epi16(srcReg34_hi, secondFilters); + resReg45_hi = _mm_madd_epi16(srcReg45_hi, thirdFilters); + resReg56_hi = _mm_madd_epi16(srcReg56_hi, thirdFilters); + + resReg23_45_hi = _mm_add_epi32(resReg23_hi, resReg45_hi); + resReg34_56_hi = _mm_add_epi32(resReg34_hi, resReg56_hi); + + // shift by 7 bit each 32 bit + resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64); + resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64); + resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64); + resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64); + resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7); + resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7); + resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7); + resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7); + + // shrink to 16 bit each 32 bits, the first lane contains the first + // convolve result and the second lane contains the second convolve + // result + resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi); + resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi); + + resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128()); + resReg23_45 = _mm_min_epi16(resReg23_45, max); + resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128()); + resReg34_56 = _mm_min_epi16(resReg34_56, max); + + src_ptr += src_stride; + + _mm_store_si128((__m128i *)dst_ptr, (resReg23_45)); + _mm_store_si128((__m128i *)(dst_ptr + dst_pitch), (resReg34_56)); + + dst_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_lo = srcReg45_lo; + srcReg23_hi = srcReg45_hi; + srcReg34_lo = srcReg56_lo; + srcReg34_hi = srcReg56_hi; + srcReg4 = srcReg6; + } +} + +void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i addFilterReg64; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b1_2; + __m128i srcReg32b1, srcReg32b2; + unsigned int i; + src_ptr -= 3; + addFilterReg64 = _mm_set1_epi32(64); + 
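// 64 == 1 << (FILTER_BITS - 1): rounding bias for the 7-bit arithmetic + // shift applied after the madd accumulation below. +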
filtersReg = _mm_loadu_si128((const __m128i *)filter); + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2)); + srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 6)); + + __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4); + __m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2); + + __m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6); + __m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2); + __m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6); + __m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2); + __m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2); + + d1 = _mm_madd_epi16(ss_3, secondFilters); + d2 = _mm_madd_epi16(ss_5, thirdFilters); + srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); + + __m128i res_lo_1 = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + __m128i res_hi_1 = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + + // shift by 7 bit each 32 bit + res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64); + res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64); + res_lo_1 = _mm_srai_epi32(res_lo_1, 7); + res_hi_1 = _mm_srai_epi32(res_hi_1, 7); + + srcRegFilt32b1_1 = _mm_packs_epi32(res_lo_1, res_hi_1); + + srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max); + + src_ptr += src_pitch; + + _mm_store_si128((__m128i *)dst_ptr, srcRegFilt32b1_1); + + dst_ptr += dst_pitch; + } +} + +void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), + dst_pitch, height, filter, bd); +} + +void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), + dst_pitch, height, filter, bd); +} diff --git a/libs/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c b/libs/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c new file mode 100644 index 000000000..a79350f5a --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <tmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve_sse2.h" + +void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + __m128i s[16], coeffs_y[4]; + + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); + + s[0] = _mm_unpacklo_epi16(s0, s1); + s[1] = _mm_unpacklo_epi16(s2, s3); + s[2] = _mm_unpacklo_epi16(s4, s5); + + s[4] = _mm_unpackhi_epi16(s0, s1); + s[5] = _mm_unpackhi_epi16(s2, s3); + s[6] = _mm_unpackhi_epi16(s4, s5); + + s[0 + 8] = _mm_unpacklo_epi16(s1, s2); + s[1 + 8] = _mm_unpacklo_epi16(s3, s4); + s[2 + 8] = _mm_unpacklo_epi16(s5, s6); + + s[4 + 8] = _mm_unpackhi_epi16(s1, s2); + s[5 + 8] = _mm_unpackhi_epi16(s3, s4); + s[6 + 8] = _mm_unpackhi_epi16(s5, s6); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride)); + __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride)); + + s[3] = _mm_unpacklo_epi16(s6, s7); + s[7] = _mm_unpackhi_epi16(s6, s7); + + s[3 + 8] = _mm_unpacklo_epi16(s7, s8); + s[7 + 8] = _mm_unpackhi_epi16(s7, s8); + + const __m128i res_a0 = convolve(s, coeffs_y); + __m128i res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a0, round_const_bits), round_shift_bits); + + const __m128i res_a1 = convolve(s + 8, coeffs_y); + __m128i res_a_round1 = _mm_sra_epi32( + _mm_add_epi32(res_a1, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m128i res_b0 = convolve(s + 4, coeffs_y); + __m128i res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b0, round_const_bits), round_shift_bits); + + const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); + __m128i res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b1, round_const_bits), round_shift_bits); + + __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); + res_16bit0 = _mm_min_epi16(res_16bit0, 
clip_pixel); + res_16bit0 = _mm_max_epi16(res_16bit0, zero); + + __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); + res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); + res_16bit1 = _mm_max_epi16(res_16bit1, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16bit1); + } else if (w == 4) { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_a_round1); + } else { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + *((uint32_t *)(&dst[i * dst_stride + j])) = + _mm_cvtsi128_si32(res_a_round0); + + *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) = + _mm_cvtsi128_si32(res_a_round1); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + + s[0 + 8] = s[1 + 8]; + s[1 + 8] = s[2 + 8]; + s[2 + 8] = s[3 + 8]; + + s[4 + 8] = s[5 + 8]; + s[5 + 8] = s[6 + 8]; + s[6 + 8] = s[7 + 8]; + + s6 = s8; + } + } + } +} + +void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + (void)subpel_y_qn; + (void)filter_params_y; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m128i s[4], coeffs_x[4]; + + const __m128i round_const_x = + _mm_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const int bits = FILTER_BITS - conv_params->round_0; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + + __m128i res_even = convolve(s, coeffs_x); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + + __m128i res_odd = convolve(s, coeffs_x); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); + + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits), + round_shift_bits); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits), + round_shift_bits); + + __m128i res_even1 = _mm_packs_epi32(res_even, res_even); + __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); + __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); + + res = _mm_min_epi16(res, clip_pixel); + res = _mm_max_epi16(res, zero); + + if (w - j > 4) { + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } else if (w == 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res); + } else { + *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res); + } + } + } + } +} diff --git a/libs/libaom/src/aom_dsp/x86/highbd_intrapred_asm_sse2.asm b/libs/libaom/src/aom_dsp/x86/highbd_intrapred_asm_sse2.asm new file mode 100644 index 000000000..91b3d126c --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_intrapred_asm_sse2.asm @@ -0,0 +1,259 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 4 dd 16 +pw_32: times 4 dd 32 + +SECTION .text +INIT_XMM sse2 +cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + movq m0, [aboveq] + movq m2, [leftq] + paddw m0, m2 + pshuflw m1, m0, 0xe + paddw m0, m1 + pshuflw m1, m0, 0x1 + paddw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, one + mov oned, 0x00010001 + lea stride3q, [strideq*3] + movd m3, oned + pshufd m3, m3, 0x0 + paddw m0, m2 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + paddw m0, [GLOBAL(pw_8)] + psrlw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m3, [aboveq+16] + mova m2, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_16)] + psrad m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + paddw m0, m2 + paddw m3, m4 + mova m2, [leftq] + mova m4, [leftq+16] + mova m5, [leftq+32] + mova m6, [leftq+48] + paddw m2, m4 + paddw m5, m6 + paddw m0, m3 + paddw m2, m5 + pxor m1, m1 + paddw m0, m2 + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_32)] + psrad m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16 ], m0 + mova [dstq +32 ], m0 + mova [dstq +48 ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16 ], m0 + mova [dstq+strideq*2+32 ], m0 + mova [dstq+strideq*2+48 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4+16 ], m0 + mova [dstq+strideq*4+32 ], m0 + mova [dstq+strideq*4+48 ], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m0 + mova [dstq+stride3q*2 +32], m0 + mova [dstq+stride3q*2 +48], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal 
highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m1 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + mova m2, [aboveq+32] + mova m3, [aboveq+48] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq +32], m2 + mova [dstq +48], m3 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*2 +32], m2 + mova [dstq+strideq*2 +48], m3 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+strideq*4 +32], m2 + mova [dstq+strideq*4 +48], m3 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m1 + mova [dstq+stride3q*2 +32], m2 + mova [dstq+stride3q*2 +48], m3 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET diff --git a/libs/libaom/src/aom_dsp/x86/highbd_intrapred_sse2.c b/libs/libaom/src/aom_dsp/x86/highbd_intrapred_sse2.c new file mode 100644 index 000000000..5a55736c4 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_intrapred_sse2.c @@ -0,0 +1,984 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +// ----------------------------------------------------------------------------- +// H_PRED + +void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); + dst += stride << 2; + left += 4; + aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); +} + +void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); +} + +void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7)); +} + +void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd); + dst += stride << 3; + left += 8; + aom_highbd_h_predictor_8x8_sse2(dst, 
stride, above, left, bd); +} + +static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)*dst, val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_16_unpacklo(&dst, stride, &row0); + h_store_16_unpacklo(&dst, stride, &row1); + h_store_16_unpacklo(&dst, stride, &row2); + h_store_16_unpacklo(&dst, stride, &row3); + h_store_16_unpackhi(&dst, stride, &row4); + h_store_16_unpackhi(&dst, stride, &row5); + h_store_16_unpackhi(&dst, stride, &row6); + h_store_16_unpackhi(&dst, stride, &row7); +} + +void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)bd; + h_predictor_16x8(dst, stride, left); +} + +void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + h_predictor_16x8(dst, stride, left); + dst += stride << 3; + } +} + +void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + h_predictor_16x8(dst, stride, left); + dst += stride << 3; + } +} + +static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = 
_mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_32_unpacklo(&dst, stride, &row0); + h_store_32_unpacklo(&dst, stride, &row1); + h_store_32_unpacklo(&dst, stride, &row2); + h_store_32_unpacklo(&dst, stride, &row3); + h_store_32_unpackhi(&dst, stride, &row4); + h_store_32_unpackhi(&dst, stride, &row5); + h_store_32_unpackhi(&dst, stride, &row6); + h_store_32_unpackhi(&dst, stride, &row7); +} + +void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + h_predictor_32x8(dst, stride, left); + dst += stride << 3; + } +} + +void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + h_predictor_32x8(dst, stride, left); + dst += stride << 3; + } +} + +// ----------------------------------------------------------------------------- +// DC_TOP, DC_LEFT, DC_128 + +// 4x4 + +static INLINE __m128i dc_sum_4(const uint16_t *ref) { + const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x4(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 4x8 + +static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +// Shared with DC 8xh +static INLINE __m128i dc_sum_8(const uint16_t *ref) { + const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref); + const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + 
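/* Illustrative note, not in the upstream file: DC_LEFT for a 4x8 block averages only the eight left neighbors, so dc = (sum + 4) >> 3. For example, eight 10-bit pixels of value 700 sum to 5600, and (5600 + 4) >> 3 = 700. */ +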
const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sum = dc_sum_8(left); + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_4x8(dst, stride, &dc); +} + +void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x8(dst, stride, &dc); +} + +void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x8(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 8xh + +static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + } +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride, + int height, const uint16_t *above) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + dc_store_8xh(dst, stride, height, &dc); +} + +void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 4, above); +} + +void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 8, above); +} + +void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 16, above); +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 4, &dc); +} + +void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 8, &dc); +} + +// Shared with DC 16xh +static INLINE __m128i dc_sum_16(const uint16_t *ref) { + const __m128i sum_lo = dc_sum_8(ref); + const __m128i sum_hi = dc_sum_8(ref + 8); + return _mm_add_epi16(sum_lo, sum_hi); +} + +void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) 
{ + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 16, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride, + int height, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + dc_store_8xh(dst, stride, height, &dc_dup); +} + +void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 4, bd); +} + +void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 8, bd); +} + +void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 16, bd); +} + +// ----------------------------------------------------------------------------- +// 16xh + +static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + } +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 8, &dc); +} + +void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 16, &dc); +} + +// Shared with 32xh +static INLINE __m128i dc_sum_32(const uint16_t *ref) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sum_a = dc_sum_16(ref); + const __m128i sum_b = dc_sum_16(ref + 16); + // 12 bit bd will outrange, so expand to 32 bit before adding final total + return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), + _mm_unpacklo_epi16(sum_b, zero)); +} + +void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 32, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = 
dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 8, &dc); +} + +void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 32, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 8, &dc_dup); +} + +void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 16, &dc_dup); +} + +void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 32, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 32xh + +static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + _mm_store_si128((__m128i *)(dst + 16), dc_dup); + _mm_store_si128((__m128i *)(dst + 24), dc_dup); + } +} + +void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_32xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_32xh(dst, stride, 32, &dc); +} + +void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + 
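/* Illustrative note, not in the upstream file: dc_sum_32() returns a 32-bit sum because 32 samples at 12-bit depth can reach 32 * 4095 = 131040, which overflows a 16-bit lane; (sum + 16) >> 5 is then the usual round-half-up average. */ +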
(void)bd; + dc_store_32xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32xh(dst, stride, 16, &dc_dup); +} + +void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32xh(dst, stride, 32, &dc); +} + +void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32xh(dst, stride, 32, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// V_PRED + +void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above); + int i; + for (i = 0; i < 2; ++i) { + _mm_storel_epi64((__m128i *)dst, above_u16); + _mm_storel_epi64((__m128i *)(dst + stride), above_u16); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16); + dst += stride << 2; + } +} + +void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_load_si128((const __m128i *)above); + _mm_store_si128((__m128i *)dst, above_u16); + _mm_store_si128((__m128i *)(dst + stride), above_u16); + _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); + _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); +} + +void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_load_si128((const __m128i *)above); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, above_u16); + _mm_store_si128((__m128i *)(dst + stride), above_u16); + _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); + _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); + dst += stride << 2; + } +} + +void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + int i; + for (i = 0; i < 2; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + } +} + +void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const 
uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + int i; + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + } +} + +void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24)); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + } +} + +// ----------------------------------------------------------------------------- +// DC_PRED + +void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const __m128i sum_above = dc_sum_4(above); + const __m128i sum_left = dc_sum_8(left); + const __m128i sum = _mm_add_epi16(sum_above, sum_left); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 >>= 16; + sum32 += 6; + sum32 /= 12; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_storel_epi64((__m128i *)dst, row); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const __m128i sum_left = dc_sum_4(left); + const __m128i sum_above = dc_sum_8(above); + const __m128i sum = _mm_add_epi16(sum_above, sum_left); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 >>= 16; + sum32 += 6; + sum32 /= 12; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); +} + +void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + 
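/* Illustrative note, not in the upstream file: for rectangular blocks the DC divisor w + h is not a power of two (8 + 16 = 24 here), so the average is computed in scalar math as dc = (sum + 12) / 24; a combined neighbor sum of 4000 gives (4000 + 12) / 24 = 167. */ +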
const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_8(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 12; + sum32 /= 24; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_16(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 12; + sum32 /= 24; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 2; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_32(left); + __m128i sum_above = dc_sum_16(above); + const __m128i zero = _mm_setzero_si128(); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 24; + sum32 /= 48; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_32(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 24; + sum32 /= 48; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + 
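/* Illustrative note, not in the upstream file: a 32-wide row of uint16_t spans 64 bytes, so each row is written with four aligned 16-byte stores at element offsets 0, 8, 16 and 24, and the loop unrolls four rows per iteration. */ +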
_mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + } +} diff --git a/libs/libaom/src/aom_dsp/x86/highbd_loopfilter_avx2.c b/libs/libaom/src/aom_dsp/x86/highbd_loopfilter_avx2.c new file mode 100644 index 000000000..c954da94e --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_loopfilter_avx2.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/common_avx2.h" +#include "aom_dsp/x86/lpf_common_sse2.h" +#include "aom/aom_integer.h" + +void aom_highbd_lpf_horizontal_14_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0, + blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_14_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_horizontal_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_horizontal_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} diff --git a/libs/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c 
b/libs/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c new file mode 100644 index 000000000..ea7dc6a9e --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -0,0 +1,1698 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/lpf_common_sse2.h" + +static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max, + __m128i *pixel) { + *pixel = _mm_min_epi16(*pixel, *max); + *pixel = _mm_max_epi16(*pixel, *min); +} + +static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) { + return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); +} + +static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, + const uint8_t *t, int bd, __m128i *blt, + __m128i *lt, __m128i *thr, __m128i *t80_out) { + const int shift = bd - 8; + const __m128i zero = _mm_setzero_si128(); + + __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero); + *blt = _mm_slli_epi16(x, shift); + + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero); + *lt = _mm_slli_epi16(x, shift); + + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); + *thr = _mm_slli_epi16(x, shift); + + *t80_out = _mm_set1_epi16(1 << (bd - 1)); +} + +static INLINE void get_limit_dual( + const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, + const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, + int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out, + __m128i *t80_out) { + const int shift = bd - 8; + const __m128i zero = _mm_setzero_si128(); + + __m128i x0 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero); + __m128i x1 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero); + x0 = _mm_unpacklo_epi64(x0, x1); + *blt_out = _mm_slli_epi16(x0, shift); + + x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero); + x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero); + x0 = _mm_unpacklo_epi64(x0, x1); + *lt_out = _mm_slli_epi16(x0, shift); + + x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero); + x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero); + x0 = _mm_unpacklo_epi64(x0, x1); + *thr_out = _mm_slli_epi16(x0, shift); + + *t80_out = _mm_set1_epi16(1 << (bd - 1)); +} + +static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, + __m128i *p, __m128i *q) { + int i; + for (i = 0; i < size; i++) { + p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch)); + q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch)); + } +} + +static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q, + const __m128i *l, const __m128i *bl, + __m128i *mask) { + __m128i abs_p0q0 = abs_diff16(p[0], q[0]); + __m128i abs_p1q1 = abs_diff16(p[1], q[1]); + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff 
= _mm_set1_epi16((short)0xFFFF); + + __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); + max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); + max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); + + int i; + for (i = 1; i < 4; ++i) { + max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1])); + max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1])); + } + max = _mm_subs_epu16(max, *l); + *mask = _mm_cmpeq_epi16(max, zero); // return ~mask +} + +static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x, + __m128i *p1p0, __m128i *q1q0, + __m128i *abs_p1p0, __m128i *l, + __m128i *bl, __m128i *t, + __m128i *hev, __m128i *mask) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff = _mm_set1_epi16((short)0xFFFF); + __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0; + __m128i max, max01, h; + + *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]); + *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]); + + abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0); + abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + + abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // divide by 2 + + max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); + max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); + // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); + + *abs_p1p0 = abs_diff16(pq[0], pq[1]); + abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8); + max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0); + // mask |= (abs(*p1 - *p0) > limit) * -1; + // mask |= (abs(*q1 - *q0) > limit) * -1; + h = _mm_subs_epu16(max01, *t); + + *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); + // replicate for the further "merged variables" usage + *hev = _mm_unpacklo_epi64(*hev, *hev); + + max = _mm_max_epi16(max, max01); + int i; + for (i = 2; i < x; ++i) { + max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1])); + } + max = _mm_max_epi16(max, _mm_srli_si128(max, 8)); + + max = _mm_subs_epu16(max, *l); + *mask = _mm_cmpeq_epi16(max, zero); // ~mask +} + +static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq, + int start, int end, __m128i *flat) { + int i; + __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]), + abs_diff16(pq[start + 1], pq[0])); + + for (i = start + 2; i < end; ++i) { + max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0])); + } + max = _mm_max_epi16(max, _mm_srli_si128(max, 8)); + + __m128i ft; + ft = _mm_subs_epu16(max, *th); + + const __m128i zero = _mm_setzero_si128(); + *flat = _mm_cmpeq_epi16(ft, zero); +} + +static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p, + const __m128i *q, int start, int end, + __m128i *flat) { + int i; + __m128i max = + _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0])); + + for (i = start + 1; i < end; ++i) { + max = _mm_max_epi16(max, abs_diff16(p[i], p[0])); + max = _mm_max_epi16(max, abs_diff16(q[i], q[0])); + } + + __m128i ft; + ft = _mm_subs_epu16(max, *th); + + const __m128i zero = _mm_setzero_si128(); + *flat = _mm_cmpeq_epi16(ft, zero); +} + +static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat, + __m128i *flat2, int bd) { + // check the distance 1,2,3 against 0 + __m128i th = _mm_set1_epi16(1); + th = _mm_slli_epi16(th, bd - 8); + flat_mask_internal(&th, pq, 1, 4, flat); + flat_mask_internal(&th, pq, 4, 7, flat2); +} + +static INLINE void 
highbd_flat_mask4_dual_sse2(const __m128i *p, + const __m128i *q, __m128i *flat, + __m128i *flat2, int bd) { + // check the distance 1,2,3 against 0 + __m128i th = _mm_set1_epi16(1); + th = _mm_slli_epi16(th, bd - 8); + flat_mask_internal_dual(&th, p, q, 1, 4, flat); + flat_mask_internal_dual(&th, p, q, 4, 7, flat2); +} + +static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, + __m128i *ps1ps0, __m128i *t80, + int bd) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80); + const __m128i pmin = _mm_subs_epi16(zero, *t80); + + const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4); + __m128i ps1ps0_work, qs1qs0_work, work; + __m128i filt, filter2filter1, filter2filt, filter1filt; + + ps1ps0_work = _mm_subs_epi16(*p1p0, *t80); + qs1qs0_work = _mm_subs_epi16(*q1q0, *t80); + + work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work); + pixel_clamp(&pmin, &pmax, &work); + filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev); + + filt = _mm_subs_epi16(filt, work); + filt = _mm_subs_epi16(filt, work); + filt = _mm_subs_epi16(filt, work); + // (aom_filter + 3 * (qs0 - ps0)) & mask + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm_and_si128(filt, *mask); + filt = _mm_unpacklo_epi64(filt, filt); + + filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */ + pixel_clamp(&pmin, &pmax, &filter2filter1); + filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */ + + filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1); + + // filt >> 1 + filt = _mm_adds_epi16(filt, one); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(*hev, filt); + + filter2filt = _mm_unpackhi_epi64(filter2filter1, filt); + filter1filt = _mm_unpacklo_epi64(filter2filter1, filt); + + qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt); + ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt); + + pixel_clamp(&pmin, &pmax, &qs1qs0_work); + pixel_clamp(&pmin, &pmax, &ps1ps0_work); + + *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80); + *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80); +} + +static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps, + __m128i *qs, const __m128i *mask, + const __m128i *th, int bd, + __m128i *t80) { + __m128i ps0 = _mm_subs_epi16(p[0], *t80); + __m128i ps1 = _mm_subs_epi16(p[1], *t80); + __m128i qs0 = _mm_subs_epi16(q[0], *t80); + __m128i qs1 = _mm_subs_epi16(q[1], *t80); + const __m128i one = _mm_set1_epi16(1); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80); + + const __m128i zero = _mm_setzero_si128(); + const __m128i pmin = _mm_subs_epi16(zero, *t80); + __m128i filter = _mm_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filter); + + // hev_filter + __m128i hev; + const __m128i abs_p1p0 = abs_diff16(p[1], p[0]); + const __m128i abs_q1q0 = abs_diff16(q[1], q[0]); + __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0); + h = _mm_subs_epu16(h, *th); + const __m128i ffff = _mm_cmpeq_epi16(h, h); + hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); + + filter = _mm_and_si128(filter, hev); + + const __m128i x = _mm_subs_epi16(qs0, ps0); + filter = _mm_adds_epi16(filter, x); + filter = _mm_adds_epi16(filter, x); + filter = _mm_adds_epi16(filter, x); + pixel_clamp(&pmin, &pmax, &filter); + filter = _mm_and_si128(filter, *mask); + const __m128i t3 = _mm_set1_epi16(3); + const __m128i t4 = _mm_set1_epi16(4); + __m128i filter1 = 
_mm_adds_epi16(filter, t4); + __m128i filter2 = _mm_adds_epi16(filter, t3); + pixel_clamp(&pmin, &pmax, &filter1); + pixel_clamp(&pmin, &pmax, &filter2); + filter1 = _mm_srai_epi16(filter1, 3); + filter2 = _mm_srai_epi16(filter2, 3); + qs0 = _mm_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &qs0); + ps0 = _mm_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &ps0); + qs[0] = _mm_adds_epi16(qs0, *t80); + ps[0] = _mm_adds_epi16(ps0, *t80); + filter = _mm_adds_epi16(filter1, one); + filter = _mm_srai_epi16(filter, 1); + filter = _mm_andnot_si128(hev, filter); + qs1 = _mm_subs_epi16(qs1, filter); + pixel_clamp(&pmin, &pmax, &qs1); + ps1 = _mm_adds_epi16(ps1, filter); + pixel_clamp(&pmin, &pmax, &ps1); + qs[1] = _mm_adds_epi16(qs1, *t80); + ps[1] = _mm_adds_epi16(ps1, *t80); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( + __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt, + const unsigned char *lt, const unsigned char *thr, int bd) { + int i; + const __m128i zero = _mm_setzero_si128(); + __m128i blimit, limit, thresh; + __m128i t80; + get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80); + + for (i = 0; i < 7; i++) { + pq[i] = _mm_unpacklo_epi64(p[i], q[i]); + } + __m128i mask, hevhev; + __m128i p1p0, q1q0, abs_p1p0; + + highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hevhev, &mask); + + __m128i ps0ps1, qs0qs1; + // filter4 + highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd); + + __m128i flat, flat2; + highbd_flat_mask4_sse2(pq, &flat, &flat2, bd); + + flat = _mm_and_si128(flat, mask); + flat2 = _mm_and_si128(flat2, flat); + + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + flat2 = _mm_unpacklo_epi64(flat2, flat2); + + // flat and wide flat calculations + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i flat_p[3], flat_q[3], flat_pq[3]; + __m128i flat2_p[6], flat2_q[6]; + __m128i flat2_pq[6]; + __m128i sum_p6, sum_p3; + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + + __m128i work0, work0_0, work0_1, sum_p_0; + __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3])); + __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); + + __m128i sum_lq = _mm_srli_si128(sum_lp, 8); + __m128i sum_q = _mm_srli_si128(sum_p, 8); + + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0])); + flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])); + + sum_p6 = _mm_add_epi16(pq[6], pq[6]); + sum_p3 = _mm_add_epi16(pq[3], pq[3]); + + sum_q = _mm_sub_epi16(sum_p_0, pq[5]); + sum_p = _mm_sub_epi16(sum_p_0, q[5]); + + work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]); + work0_1 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0]))); + + sum_lq = _mm_sub_epi16(sum_lp, pq[2]); + sum_lp = _mm_sub_epi16(sum_lp, q[2]); + + work0 = _mm_add_epi16(sum_p3, pq[1]); + flat_p[1] = _mm_add_epi16(sum_lp, work0); + flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + + flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); + flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); + + sum_lp = 
_mm_sub_epi16(sum_lp, q[1]); + sum_lq = _mm_sub_epi16(sum_lq, pq[1]); + + sum_p3 = _mm_add_epi16(sum_p3, pq[3]); + work0 = _mm_add_epi16(sum_p3, pq[2]); + + flat_p[2] = _mm_add_epi16(sum_lp, work0); + flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); + + int flat2_mask = + (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); + if (flat2_mask) { + flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0])); + flat2_q[0] = _mm_add_epi16( + sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0])); + + flat2_p[1] = _mm_add_epi16(sum_p, work0_1); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); + + flat2_pq[0] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, pq[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, pq[3]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, pq[2]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, pq[1]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[5] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + } // flat2 + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // highbd_filter8 + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); + + for (i = 0; i < 3; i++) { + pq[i] = _mm_andnot_si128(flat, pq[i]); + flat_pq[i] = _mm_and_si128(flat, flat_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat_pq[i]); + } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + if (flat2_mask) { + for (i = 0; i < 6; i++) { + pq[i] = _mm_andnot_si128(flat2, pq[i]); + flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values + } + } + } else { + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); + } +} + +void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + __m128i p[7], q[7], pq[7]; + int i; + + for (i = 0; i < 7; i++) { + p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch)); + q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch)); 
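+ /* Illustrative note, not in the upstream file: p[i] holds the four pixels (i + 1) rows above the filtered edge and q[i] the four pixels i rows below it, each in the low 64 bits, so highbd_lpf_internal_14_sse2() can pack pq[i] = p[i] | q[i] into a single register. */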
+ } + + highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd); + + for (i = 0; i < 6; i++) { + _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]); + _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8)); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( + __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0, + const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1, + const uint8_t *thr1, int bd) { + __m128i blimit, limit, thresh, t80; + const __m128i zero = _mm_setzero_si128(); + + get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh, + &t80); + __m128i mask; + highbd_filter_mask_dual(p, q, &limit, &blimit, &mask); + __m128i flat, flat2; + highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd); + + flat = _mm_and_si128(flat, mask); + flat2 = _mm_and_si128(flat2, flat); + __m128i ps[2], qs[2]; + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80); + // flat and wide flat calculations + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3])); + __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3])); + __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1])); + sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp); + __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1])); + sum_q = _mm_add_epi16(sum_q, sum_lq); + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + flat_p[0] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3); + flat_q[0] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3); + __m128i sum_p6 = _mm_add_epi16(p[6], p[6]); + __m128i sum_q6 = _mm_add_epi16(q[6], q[6]); + __m128i sum_p3 = _mm_add_epi16(p[3], p[3]); + __m128i sum_q3 = _mm_add_epi16(q[3], q[3]); + + sum_q = _mm_sub_epi16(sum_p_0, p[5]); + __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]); + + sum_lq = _mm_sub_epi16(sum_lp, p[2]); + sum_lp = _mm_sub_epi16(sum_lp, q[2]); + flat_p[1] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3); + flat_q[1] = + _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3); + + sum_lp = _mm_sub_epi16(sum_lp, q[1]); + sum_lq = _mm_sub_epi16(sum_lq, p[1]); + sum_p3 = _mm_add_epi16(sum_p3, p[3]); + sum_q3 = _mm_add_epi16(sum_q3, q[3]); + flat_p[2] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3); + flat_q[2] = + _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3); + + int flat2_mask = + (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); + if (flat2_mask) { + flat2_p[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]), + _mm_add_epi16(p[1], q[0]))), + 4); + flat2_q[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]), + _mm_add_epi16(p[0], q[1]))), + 4); + + flat2_p[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))), + 4); + flat2_q[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, 
p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, p[4]); + flat2_p[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))), + 4); + flat2_q[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, p[3]); + flat2_p[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))), + 4); + flat2_q[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, p[2]); + flat2_p[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))), + 4); + flat2_q[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, p[1]); + flat2_p[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))), + 4); + flat2_q[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))), + 4); + } + // highbd_filter8 + int i; + for (i = 0; i < 2; i++) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } + p[2] = _mm_andnot_si128(flat, p[2]); + // p2 remains unchanged if !(flat && mask) + flat_p[2] = _mm_and_si128(flat, flat_p[2]); + // when (flat && mask) + p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values + q[2] = _mm_andnot_si128(flat, q[2]); + flat_q[2] = _mm_and_si128(flat, flat_q[2]); + q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values + + for (i = 0; i < 2; i++) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } + // highbd_filter16 + if (flat2_mask) { + for (i = 0; i < 6; i++) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); + } + } + } else { + p[0] = ps[0]; + q[0] = qs[0]; + p[1] = ps[1]; + q[1] = qs[1]; + } +} + +void aom_highbd_lpf_horizontal_14_dual_sse2( + uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p[7], q[7]; + int i; + load_highbd_pixel(s, 7, pitch, p, q); + + 
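/* Illustrative note, not in the upstream file: the dual path filters two adjacent 4-pixel edges at once; get_limit_dual() packs the first edge's thresholds into the low 64 bits and the second edge's into the high 64 bits, so each of the eight lanes sees its own blimit/limit/thresh. */ +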
highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1, + _limit1, _thresh1, bd); + + for (i = 0; i < 6; i++) { + _mm_storeu_si128((__m128i *)(s - (i + 1) * pitch), p[i]); + _mm_storeu_si128((__m128i *)(s + i * pitch), q[i]); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2( + __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, + __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit, + const uint8_t *_limit, const uint8_t *_thresh, int bd) { + __m128i blimit, limit, thresh; + __m128i mask, hev, flat; + __m128i pq[3]; + __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0; + __m128i flat_p1p0, flat_q0q1; + + pq[0] = _mm_unpacklo_epi64(*p0, *q0); + pq[1] = _mm_unpacklo_epi64(*p1, *q1); + pq[2] = _mm_unpacklo_epi64(*p2, *q2); + + const __m128i zero = _mm_setzero_si128(); + const __m128i four = _mm_set1_epi16(4); + __m128i t80; + const __m128i one = _mm_set1_epi16(0x1); + + get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); + + highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hev, &mask); + + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); + + // flat_mask + flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0); + flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_c; + __m128i pq0x2_pq1, pq1_pq2; + + // op1 + pq0x2_pq1 = + _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1 + pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2 + workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), + pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0); + workp_b = + _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_b = _mm_unpacklo_epi64(workp_a, workp_b); + flat_p1p0 = _mm_srli_epi16(workp_b, 3); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]), + pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_srli_si128(pq1_pq2, 8); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + // workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]), + pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(*q2, *q2); + workp_b = + _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + workp_a = _mm_unpacklo_epi64(workp_a, workp_b); + flat_q0q1 = _mm_srli_epi16(workp_a, 3); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( + __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, + __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0, + const unsigned char *_thresh0, const unsigned 
char *_blimit1, + const unsigned char *_limit1, const unsigned char *_thresh1, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i blimit0, limit0, thresh0; + __m128i t80; + __m128i mask, flat, work; + __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1; + __m128i op1, op0, oq0, oq1; + const __m128i four = _mm_set1_epi16(4); + const __m128i one = _mm_set1_epi16(0x1); + const __m128i ffff = _mm_cmpeq_epi16(one, one); + + get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit0, &limit0, &thresh0, &t80); + + abs_p2p1 = abs_diff16(*p2, *p1); + abs_p1p0 = abs_diff16(*p1, *p0); + abs_q1q0 = abs_diff16(*q1, *q0); + abs_q2q1 = abs_diff16(*q2, *q1); + + abs_p0q0 = abs_diff16(*p0, *q0); + abs_p1q1 = abs_diff16(*p1, *q1); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); + + mask = _mm_max_epi16(abs_q2q1, mask); + work = _mm_max_epi16(abs_p1p0, abs_q1q0); + mask = _mm_max_epi16(work, mask); + mask = _mm_max_epi16(mask, abs_p2p1); + mask = _mm_subs_epu16(mask, limit0); + mask = _mm_cmpeq_epi16(mask, zero); + + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + + // flat_mask + flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0)); + flat = _mm_max_epi16(flat, work); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); // flat & mask + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_shft0, workp_shft1; + + // op1 + workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0), + _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2 + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), + *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0); + workp_shft0 = _mm_add_epi16( + workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4 + op1 = _mm_srli_epi16(workp_shft0, 3); + + // op0 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1 + workp_a = + _mm_add_epi16(workp_a, + workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4 + op0 = _mm_srli_epi16(workp_a, 3); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2), + *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4 + workp_b = _mm_add_epi16(*q1, *q2); + workp_shft0 = _mm_add_epi16( + workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4 + oq0 = _mm_srli_epi16(workp_shft0, 3); + + // oq1 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1), + *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4 + workp_b = _mm_add_epi16(*q2, *q2); + workp_shft1 = _mm_add_epi16( + workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4 + oq1 = _mm_srli_epi16(workp_shft1, 3); + + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, 
op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); + } else { + *q0 = qs[0]; + *q1 = qs[1]; + *p0 = ps[0]; + *p1 = ps[1]; + } +} + +void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out; + + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + + highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out, + _blimit, _limit, _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8)); +} + +void aom_highbd_lpf_horizontal_6_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p2, p1, p0, q0, q1, q2; + + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + + highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0, + _limit0, _thresh0, _blimit1, _limit1, + _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i blimit, limit, thresh; + __m128i mask, hev, flat; + __m128i pq[4]; + __m128i p1p0, q1q0, ps1ps0, qs1qs0; + __m128i work_a, opq2, flat_p1p0, flat_q0q1; + + pq[0] = _mm_unpacklo_epi64(*p0, *q0); + pq[1] = _mm_unpacklo_epi64(*p1, *q1); + pq[2] = _mm_unpacklo_epi64(*p2, *q2); + pq[3] = _mm_unpacklo_epi64(*p3, *q3); + + __m128i abs_p1p0; + + const __m128i four = _mm_set1_epi16(4); + __m128i t80; + const __m128i one = _mm_set1_epi16(0x1); + + get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); + + highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hev, &mask); + + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); + + // flat_mask4 + flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0])); + flat = _mm_max_epi16(abs_p1p0, flat); + flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { 
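
The movemask test just above is an any-lane guard: _mm_cmpeq_epi16(flat, zero)
is all ones only in lanes where flat is zero, so the result equals 0xffff
exactly when no column passed the flatness test and the wider filter can be
skipped. In scalar terms (illustrative sketch; flat_lane is a hypothetical
per-column view of the flat register):

    int all_zero = 1;
    for (int i = 0; i < 8; ++i) all_zero &= (flat_lane[i] == 0);
    if (!all_zero) { /* at least one column is flat: run the wide filter */ }
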
+ __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1; + // Added before shift for rounding part of ROUND_POWER_OF_TWO + + // o*p2 + workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); + workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); + workp_c = _mm_add_epi16(workp_a, workp_c); + + // o*p1 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); + workp_shft0 = _mm_add_epi16(workp_a, workp_b); + + // o*p0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0); + workp_shft0 = _mm_add_epi16(workp_a, workp_b); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); + workp_a = _mm_add_epi16(workp_a, workp_b); + opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + + work_a = _mm_andnot_si128(flat, pq[2]); + *p2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_a, *p2); + *q2 = _mm_srli_si128(*p2, 8); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0, + const unsigned char *_limit0, const unsigned char *_thresh0, + const unsigned char *_blimit1, const unsigned char *_limit1, + const unsigned char *_thresh1, int bd) { + __m128i blimit0, limit0, thresh0; + __m128i t80; + __m128i mask, flat; + __m128i work_a, op2, oq2, op1, op0, oq0, oq1; + __m128i abs_p1q1, abs_p0q0, work0, work1, work2; + + const __m128i zero = _mm_setzero_si128(); + const __m128i four = _mm_set1_epi16(4); + const __m128i one = _mm_set1_epi16(0x1); + const __m128i ffff = _mm_cmpeq_epi16(one, one); + + get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit0, &limit0, &thresh0, &t80); + + abs_p0q0 = abs_diff16(*p0, *q0); + abs_p1q1 = abs_diff16(*p1, *q1); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2 > blimit) * -1; + + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); + + work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1)); + work1 = + _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0)); // tbu 4 flat + work0 = _mm_max_epi16(work0, work1); + work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3)); + work2 = _mm_max_epi16(work2, work0); + mask = _mm_max_epi16(work2, mask); + + mask = _mm_subs_epu16(mask, 
limit0); + mask = _mm_cmpeq_epi16(mask, zero); + + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + + flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0)); + flat = _mm_max_epi16(work1, flat); + work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0)); + flat = _mm_max_epi16(work0, flat); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); // flat & mask + + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b; + // Added before shift for rounding part of ROUND_POWER_OF_TWO + + // o*p2 + workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); + op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // o*p1 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); + op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // o*p0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0); + op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0); + oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1); + oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); + oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); + + work_a = _mm_andnot_si128(flat, *q2); + *q2 = _mm_and_si128(flat, oq2); + *q2 = _mm_or_si128(work_a, *q2); + + work_a = _mm_andnot_si128(flat, *p2); + *p2 = _mm_and_si128(flat, op2); + *p2 = _mm_or_si128(work_a, *p2); + } else { + *q0 = qs[0]; + *q1 = qs[1]; + *p0 = ps[0]; + *p1 = ps[1]; + } +} + +void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i q1q0, p1p0; + + p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); + + highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, + &p1p0, _blimit, _limit, _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), 
_mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); +} + +void aom_highbd_lpf_horizontal_8_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); + + highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, + _blimit0, _limit0, _thresh0, _blimit1, + _limit1, _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out, + __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i blimit, limit, thresh; + __m128i mask, hev; + __m128i p1p0, q1q0; + __m128i pq[2]; + + __m128i abs_p1p0; + + __m128i t80; + get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); + + pq[0] = _mm_unpacklo_epi64(*p0, *q0); + pq[1] = _mm_unpacklo_epi64(*p1, *q1); + + highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hev, &mask); + + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps, + __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i blimit0, limit0, thresh0; + __m128i mask, flat; + __m128i p[2], q[2]; + + const __m128i zero = _mm_setzero_si128(); + __m128i abs_p0q0 = abs_diff16(*q0, *p0); + __m128i abs_p1q1 = abs_diff16(*q1, *p1); + + __m128i abs_p1p0 = abs_diff16(*p1, *p0); + __m128i abs_q1q0 = abs_diff16(*q1, *q0); + + const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); + const __m128i one = _mm_set1_epi16(1); + + __m128i t80; + + get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit0, &limit0, &thresh0, &t80); + + // filter_mask and hev_mask + flat = _mm_max_epi16(abs_p1p0, abs_q1q0); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); + mask = _mm_max_epi16(flat, mask); + + mask = _mm_subs_epu16(mask, limit0); + mask = _mm_cmpeq_epi16(mask, zero); + + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + 
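
The mask assembled above encodes the AV1 loop-filter gate, evaluated for all
columns at once with saturating vector arithmetic. A scalar model of the
per-column condition (illustrative sketch; blimit and limit stand for the
per-lane thresholds expanded from the _blimit0/_limit0 and _blimit1/_limit1
pairs, and abs() is from <stdlib.h>):

    static int filter_mask(int p1, int p0, int q0, int q1, int blimit,
                           int limit) {
      return abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit &&
             abs(p1 - p0) <= limit && abs(q1 - q0) <= limit;
    }
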
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); +} + +void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p1p0, q1q0; + __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + + highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit, + _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); +} + +void aom_highbd_lpf_horizontal_4_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + __m128i ps[2], qs[2]; + + highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0, + _thresh0, _blimit1, _limit1, _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]); + _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]); + _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]); + _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]); +} + +void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + __m128i x0, x1, x2, x3, d0, d1, d2, d3; + __m128i p1p0, q1q0; + __m128i p1, q1; + + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); + + highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3); + + highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit, + thresh, bd); + + p1 = _mm_srli_si128(p1p0, 8); + q1 = _mm_srli_si128(q1q0, 8); + + // transpose from 8x4 to 4x8 + highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); +} + +void aom_highbd_lpf_vertical_4_dual_sse2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i ps[2], qs[2]; + + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p)); + + highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1, + &d2, &d3); + + highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0, + 
thresh0, blimit1, limit1, thresh1, bd); + + highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2, + &d3, &d4, &d5, &d6, &d7); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); + _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4); + _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5); + _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6); + _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7); +} + +void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x3, x2, x1, x0, p0, q0; + __m128i p1p0, q1q0; + + x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p)); + x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p)); + x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p)); + + highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit, + limit, thresh, bd); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); + + highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); +} + +void aom_highbd_lpf_vertical_6_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i p0, q0, p1, q1, p2, q2; + + x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p)); + x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p)); + x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p)); + x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p)); + + highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1, + &p0, &q0, &q1, &q2, &d6, &d7); + + highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0, + _limit0, _thresh0, _blimit1, _limit1, + _thresh1, bd); + + highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); + _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4); + _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5); + _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6); + _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7); +} + +void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i p2, p1, p0, p3, q0; + __m128i q1q0, p1p0; + + p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p)); + p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p)); + p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p)); + p0 = 
_mm_loadu_si128((__m128i *)((s - 4) + 3 * p)); + + highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + // Loop filtering + highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, + &p1p0, blimit, limit, thresh, bd); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); + + highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, + &d1, &d2, &d3); + + _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0); + _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1); + _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2); + _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3); +} + +void aom_highbd_lpf_vertical_8_dual_sse2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + + x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p)); + x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p)); + x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p)); + x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p)); + x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p)); + x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p)); + x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p)); + + highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1, + &d2, &d3, &d4, &d5, &d6, &d7); + + highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, + blimit0, limit0, thresh0, blimit1, limit1, + thresh1, bd); + + highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1, + &x2, &x3, &x4, &x5, &x6, &x7); + + _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0); + _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1); + _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2); + _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3); + _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4); + _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5); + _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6); + _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7); +} + +void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + __m128i q[7], p[7], pq[7]; + __m128i p6, p5, p4, p3; + __m128i p6_2, p5_2, p4_2, p3_2; + __m128i d0, d1, d2, d3; + __m128i d0_2, d1_2, d2_2, d3_2, d7_2; + + p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch)); + p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch)); + p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch)); + p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch)); + + highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4], + &p[3], &p[2], &p[1], &p[0]); + + p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch)); + p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + + highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2], + &q[3], &q[4], &q[5], &q[6], &d7_2); + + highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd); + + highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2], + &pq[1], &pq[0], &d0, &d1, &d2, &d3); + + q[0] = _mm_srli_si128(pq[0], 8); + q[1] = _mm_srli_si128(pq[1], 8); + q[2] = _mm_srli_si128(pq[2], 8); + q[3] = _mm_srli_si128(pq[3], 8); + q[4] = _mm_srli_si128(pq[4], 8); + q[5] = _mm_srli_si128(pq[5], 8); + + 
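
As in the other vertical variants, the edge is filtered by transposing the
block so each pixel column becomes a register row, reusing the horizontal
14-tap path, and transposing the results back before the stores below. The
underlying data movement, scalar-style (illustrative sketch only):

    static void transpose_8x8_u16(const uint16_t in[8][8], uint16_t out[8][8]) {
      for (int r = 0; r < 8; ++r)
        for (int c = 0; c < 8; ++c) out[c][r] = in[r][c];
    }
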
highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], + &d7_2, &d0_2, &d1_2, &d2_2, &d3_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2); +} + +void aom_highbd_lpf_vertical_14_dual_sse2( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + __m128i q[7], p[7]; + __m128i p6, p5, p4, p3, p2, p1, p0, q0; + __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2; + __m128i d0, d7; + __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out; + + p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch)); + p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch)); + p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch)); + p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch)); + p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch)); + p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch)); + p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch)); + q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch)); + + highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6], + &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]); + + p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch)); + p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch)); + p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch)); + p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch)); + q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch)); + + highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2, + &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5], + &q[6], &d7); + + highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); + + highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0], + &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out, + &d6_out, &d7_out); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out); + _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out); + _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out); + _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out); + _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out); + + highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7, + &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out, + &d6_out, &d7_out); + + _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out); + _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out); + _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out); + _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out); + _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out); +} diff --git 
a/libs/libaom/src/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/libs/libaom/src/aom_dsp/x86/highbd_quantize_intrin_avx2.c
new file mode 100644
index 000000000..b9689202a
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i sign = _mm_srai_epi16(*p, 15);
+  const __m128i dc = _mm_unpacklo_epi16(*p, sign);
+  const __m128i ac = _mm_unpackhi_epi16(*p, sign);
+  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static INLINE void update_qp(__m256i *qp) {
+  int i;
+  for (i = 0; i < 5; ++i) {
+    qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
+  }
+}
+
+static INLINE void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr,
+                           const int16_t *quant_ptr, const int16_t *dequant_ptr,
+                           const int16_t *quant_shift_ptr, __m256i *qp) {
+  const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
+  const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+  const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
+  init_one_qp(&zbin, &qp[0]);
+  init_one_qp(&round, &qp[1]);
+  init_one_qp(&quant, &qp[2]);
+  init_one_qp(&dequant, &qp[3]);
+  init_one_qp(&quant_shift, &qp[4]);
+}
+
+// Note:
+// *x is multiplied by *y as eight parallel signed 32-bit multiplications;
+// each 64-bit product is shifted right by 16 and the low 32 bits of each
+// result are saved in *p.
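
In scalar terms, the helper defined next computes, per 32-bit lane
(illustrative sketch; the logical 64-bit shift used below behaves like an
arithmetic one here because the quantizer only feeds non-negative products
through this path):

    for (int i = 0; i < 8; ++i)
      p[i] = (int32_t)(((int64_t)x[i] * (int64_t)y[i]) >> 16);
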
+static INLINE void mm256_mul_shift_epi32(const __m256i *x, const __m256i *y, + __m256i *p) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + + prod_lo = _mm256_srli_epi64(prod_lo, 16); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16); + + prod_hi = _mm256_slli_epi64(prod_hi, 32); + *p = _mm256_or_si256(prod_lo, prod_hi); +} + +static INLINE void quantize(const __m256i *qp, __m256i *c, + const int16_t *iscan_ptr, tran_low_t *qcoeff, + tran_low_t *dqcoeff, __m256i *eob) { + const __m256i abs = _mm256_abs_epi32(*c); + const __m256i flag1 = _mm256_cmpgt_epi32(abs, qp[0]); + __m256i flag2 = _mm256_cmpeq_epi32(abs, qp[0]); + flag2 = _mm256_or_si256(flag1, flag2); + const int32_t nzflag = _mm256_movemask_epi8(flag2); + + if (LIKELY(nzflag)) { + __m256i q = _mm256_add_epi32(abs, qp[1]); + __m256i tmp; + mm256_mul_shift_epi32(&q, &qp[2], &tmp); + q = _mm256_add_epi32(tmp, q); + + mm256_mul_shift_epi32(&q, &qp[4], &q); + __m256i dq = _mm256_mullo_epi32(q, qp[3]); + + q = _mm256_sign_epi32(q, *c); + dq = _mm256_sign_epi32(dq, *c); + q = _mm256_and_si256(q, flag2); + dq = _mm256_and_si256(dq, flag2); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr); + const __m128i zr = _mm_setzero_si128(); + const __m128i lo = _mm_unpacklo_epi16(isc, zr); + const __m128i hi = _mm_unpackhi_epi16(isc, zr); + const __m256i iscan = + _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); + + const __m256i zero = _mm256_setzero_si256(); + const __m256i zc = _mm256_cmpeq_epi32(dq, zero); + const __m256i nz = _mm256_cmpeq_epi32(zc, zero); + __m256i cur_eob = _mm256_sub_epi32(iscan, nz); + cur_eob = _mm256_and_si256(cur_eob, nz); + *eob = _mm256_max_epi32(cur_eob, *eob); + } else { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + } +} + +void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)scan; + const unsigned int step = 8; + + __m256i qp[5], coeff; + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp); + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + + __m256i eob = _mm256_setzero_si256(); + quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + const __m128i final_eob = 
_mm_max_epi16(_mm256_castsi256_si128(eob),
+                      _mm256_extractf128_si256(eob, 1));
+    *eob_ptr = _mm_extract_epi16(final_eob, 0);
+  }
+}
diff --git a/libs/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/libs/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c
new file mode 100644
index 000000000..1764a4952
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
+                                const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
+  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+  __m128i zbins[2];
+  __m128i nzbins[2];
+
+  zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
+                           (int)zbin_ptr[0]);
+  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
+
+  nzbins[0] = _mm_setzero_si128();
+  nzbins[1] = _mm_setzero_si128();
+  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+  (void)scan;
+
+  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+  // Pre-scan pass
+  for (i = ((int)count / 4) - 1; i >= 0; i--) {
+    __m128i coeffs, cmp1, cmp2;
+    int test;
+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+    cmp1 = _mm_and_si128(cmp1, cmp2);
+    test = _mm_movemask_epi8(cmp1);
+    if (test == 0xffff)
+      non_zero_regs--;
+    else
+      break;
+  }
+
+  // Quantization pass:
+  for (i = 0; i < non_zero_regs; i++) {
+    __m128i coeffs, coeffs_sign, tmp1, tmp2;
+    int test;
+    int abs_coeff[4];
+    int coeff_sign[4];
+
+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+    coeffs_sign = _mm_srai_epi32(coeffs, 31);
+    coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+    tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+    tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+    tmp1 = _mm_or_si128(tmp1, tmp2);
+    test = _mm_movemask_epi8(tmp1);
+    _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+    _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+
+    for (j = 0; j < 4; j++) {
+      if (test & (1 << (4 * j))) {
+        int k = 4 * i + j;
+        const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
+        const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
+        const uint32_t abs_qcoeff =
+            (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
+        qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
+        dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+        if (abs_qcoeff) eob_i = iscan[k] > eob_i ?
iscan[k] : eob_i; + } + } + } + *eob_ptr = eob_i + 1; +} + +void aom_highbd_quantize_b_32x32_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zbins[2]; + __m128i nzbins[2]; + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); + (void)scan; + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); + zbins[1] = _mm_set1_epi32(zbin1_tmp); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
iscan[idx_arr[i]] : eob; + } + *eob_ptr = eob + 1; +} + +void aom_highbd_quantize_b_64x64_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zbins[2]; + __m128i nzbins[2]; + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 2); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 2); + (void)scan; + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); + zbins[1] = _mm_set1_epi32(zbin1_tmp); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; + } + *eob_ptr = eob + 1; +} diff --git a/libs/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm b/libs/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm new file mode 100644 index 000000000..e0d22522d --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm @@ -0,0 +1,296 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_4x2x4 5-6 0 + movh m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + movhps m0, [srcq +%4*2] + movhps m4, [ref1q+%5*2] + movhps m5, [ref2q+%5*2] + movhps m6, [ref3q+%5*2] + movhps m7, [ref4q+%5*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + movu m2, [ref1q+%3*2] + movhps m0, [srcq +%4*2] + movhps m2, [ref1q+%5*2] + mova m3, m0 + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m4, m2 + + movu m2, [ref2q+%3*2] + mova m3, m0 + movhps m2, [ref2q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m5, m2 + + movu m2, [ref3q+%3*2] + mova m3, m0 + movhps m2, [ref3q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m6, m2 + + movu m2, [ref4q+%3*2] + mova m3, m0 + movhps m2, [ref4q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif +%endmacro + +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_8x2x4 5-6 0 + ; 1st 8 px + mova m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + mova m3, m0 + movu m2, [ref1q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endif + + ; 2nd 8 px + mova m0, [srcq +(%4)*2] + mova m3, m0 + movu m2, [ref1q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endmacro + +; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_16x2x4 5-6 0 + HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8) + HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 
+ 8), %6 +%endmacro + +; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_32x2x4 5-6 0 + HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16) + HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6 +%endmacro + +; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_64x2x4 5-6 0 + HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32) + HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6 +%endmacro + +; void aom_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; uint32_t res[4]); +; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 +%macro HIGH_SADNXN4D 2 +%if UNIX64 +cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif + +; set m1 + push srcq + mov srcd, 0x00010001 + movd m1, srcd + pshufd m1, m1, 0x0 + pop srcq + + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + +; convert byte pointers to short pointers + shl srcq, 1 + shl ref2q, 1 + shl ref3q, 1 + shl ref4q, 1 + shl ref1q, 1 + + HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 +%rep (%2-4)/2 + HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 +%endrep + HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + ; N.B. HIGH_PROCESS outputs dwords (32 bits) + ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM + movhlps m0, m4 + movhlps m1, m5 + movhlps m2, m6 + movhlps m3, m7 + paddd m4, m0 + paddd m5, m1 + paddd m6, m2 + paddd m7, m3 + punpckldq m4, m5 + punpckldq m6, m7 + movhlps m0, m4 + movhlps m1, m6 + paddd m4, m0 + paddd m6, m1 + punpcklqdq m4, m6 + movifnidn r4, r4mp + movu [r4], m4 + RET +%endmacro + + +INIT_XMM sse2 +HIGH_SADNXN4D 64, 64 +HIGH_SADNXN4D 64, 32 +HIGH_SADNXN4D 32, 64 +HIGH_SADNXN4D 32, 32 +HIGH_SADNXN4D 32, 16 +HIGH_SADNXN4D 16, 32 +HIGH_SADNXN4D 16, 16 +HIGH_SADNXN4D 16, 8 +HIGH_SADNXN4D 8, 16 +HIGH_SADNXN4D 8, 8 +HIGH_SADNXN4D 8, 4 +HIGH_SADNXN4D 4, 8 +HIGH_SADNXN4D 4, 4 +HIGH_SADNXN4D 4, 16 +HIGH_SADNXN4D 16, 4 +HIGH_SADNXN4D 8, 32 +HIGH_SADNXN4D 32, 8 +HIGH_SADNXN4D 16, 64 +HIGH_SADNXN4D 64, 16 diff --git a/libs/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm b/libs/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm new file mode 100644 index 000000000..09e64d510 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm @@ -0,0 +1,442 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro HIGH_SAD_FN 4 +%if %4 == 0 +%if %3 == 5 +cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%else ; avg +%if %3 == 5 +cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; avg/sad + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +; convert src, ref & second_pred to short ptrs (from byte ptrs) + shl srcq, 1 + shl refq, 1 +%if %4 == 1 + shl second_predq, 1 +%endif +%endmacro + +; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD64XN 1-2 0 + HIGH_SAD_FN 64, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 + pxor m6, m6 + +.loop: + ; first half of each row + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+32] + psubusw m5, m3 + psubusw m3, [srcq+32] + por m3, m5 + mova m5, [srcq+48] + psubusw m5, m4 + psubusw m4, [srcq+48] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + paddd m0, m1 + paddd m0, m3 + ; second half of each row + movu m1, [refq+64] + movu m2, [refq+80] + movu m3, [refq+96] + movu m4, [refq+112] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq+64] + psubusw m5, m1 + psubusw m1, [srcq+64] + por m1, m5 + mova m5, [srcq+80] + psubusw m5, m2 + psubusw m2, [srcq+80] + por m2, m5 + mova m5, [srcq+96] + psubusw m5, m3 + psubusw m3, [srcq+96] + por m3, m5 + mova m5, [srcq+112] + psubusw m5, m4 + psubusw m4, [srcq+112] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 +HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 +HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 +HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 +HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2 +HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2 + +; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD32XN 1-2 0 + HIGH_SAD_FN 32, %1, 5, %2 + mov n_rowsd, %1 
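
These SAD loops compute |src - ref| for unsigned words without an
absolute-difference instruction: psubusw saturates to zero in one direction,
so OR-ing the two one-sided differences recovers the magnitude. A scalar
model (illustrative sketch; absdiff_u16 is a hypothetical helper):

    static uint16_t absdiff_u16(uint16_t a, uint16_t b) {
      const uint16_t d0 = (uint16_t)((a > b) ? (a - b) : 0); /* psubusw m5, m1 */
      const uint16_t d1 = (uint16_t)((b > a) ? (b - a) : 0); /* psubusw m1, m5 */
      return (uint16_t)(d0 | d1);                            /* por m1, m5 */
    }
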
+ pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+32] + psubusw m5, m3 + psubusw m3, [srcq+32] + por m3, m5 + mova m5, [srcq+48] + psubusw m5, m4 + psubusw m4, [srcq+48] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD32XN 64 ; highbd_sad32x64_sse2 +HIGH_SAD32XN 32 ; highbd_sad32x32_sse2 +HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 +HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 +HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 +HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 +HIGH_SAD32XN 8 ; highbd_sad_32x8_sse2 +HIGH_SAD32XN 8, 1 ; highbd_sad_32x8_avg_sse2 + +; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD16XN 1-2 0 + HIGH_SAD_FN 16, %1, 5, %2 + mov n_rowsd, %1/2 + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_strideq*2+16] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+16] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*2+16] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+src_strideq*2] + psubusw m5, m3 + psubusw m3, [srcq+src_strideq*2] + por m3, m5 + mova m5, [srcq+src_strideq*2+16] + psubusw m5, m4 + psubusw m4, [srcq+src_strideq*2+16] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD16XN 32 ; highbd_sad16x32_sse2 +HIGH_SAD16XN 16 ; highbd_sad16x16_sse2 +HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 +HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 +HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 +HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 +HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2 +HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2 +HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2 +HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2 + +; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD8XN 1-2 0 + HIGH_SAD_FN 8, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq*2] + movu m3, [refq+ref_strideq*4] + movu m4, [refq+ref_stride3q*2] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + 
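
In the _avg variants, pavgw folds the second predictor into the reference
before the difference is formed; each word lane becomes a rounded average.
Per sample (illustrative sketch):

    ref[i] = (uint16_t)((ref[i] + second_pred[i] + 1) >> 1); /* pavgw */
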
lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+src_strideq*2] + psubusw m5, m2 + psubusw m2, [srcq+src_strideq*2] + por m2, m5 + mova m5, [srcq+src_strideq*4] + psubusw m5, m3 + psubusw m3, [srcq+src_strideq*4] + por m3, m5 + mova m5, [srcq+src_stride3q*2] + psubusw m5, m4 + psubusw m4, [srcq+src_stride3q*2] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*8] + paddd m0, m1 + lea srcq, [srcq+src_strideq*8] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD8XN 16 ; highbd_sad8x16_sse2 +HIGH_SAD8XN 8 ; highbd_sad8x8_sse2 +HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 +HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 +HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 +HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 +HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2 +HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2 + +; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD4XN 1-2 0 + HIGH_SAD_FN 4, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + pxor m6, m6 + +.loop: + movq m1, [refq] + movq m2, [refq+ref_strideq*2] + movq m3, [refq+ref_strideq*4] + movq m4, [refq+ref_stride3q*2] + punpcklwd m1, m3 + punpcklwd m2, m4 +%if %2 == 1 + movq m3, [second_predq+8*0] + movq m5, [second_predq+8*2] + punpcklwd m3, m5 + movq m4, [second_predq+8*1] + movq m5, [second_predq+8*3] + punpcklwd m4, m5 + lea second_predq, [second_predq+8*4] + pavgw m1, m3 + pavgw m2, m4 +%endif + movq m5, [srcq] + movq m3, [srcq+src_strideq*4] + punpcklwd m5, m3 + movdqa m3, m1 + psubusw m1, m5 + psubusw m5, m3 + por m1, m5 + movq m5, [srcq+src_strideq*2] + movq m4, [srcq+src_stride3q*2] + punpcklwd m5, m4 + movdqa m4, m2 + psubusw m2, m5 + psubusw m5, m4 + por m2, m5 + paddw m1, m2 + movdqa m2, m1 + punpcklwd m1, m6 + punpckhwd m2, m6 + lea refq, [refq+ref_strideq*8] + paddd m0, m1 + lea srcq, [srcq+src_strideq*8] + paddd m0, m2 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD4XN 16 ; highbd_sad4x16_sse2 +HIGH_SAD4XN 8 ; highbd_sad4x8_sse2 +HIGH_SAD4XN 4 ; highbd_sad4x4_sse2 +HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2 +HIGH_SAD4XN 8, 1 ; highbd_sad4x8_avg_sse2 +HIGH_SAD4XN 4, 1 ; highbd_sad4x4_avg_sse2 diff --git a/libs/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/libs/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm new file mode 100644 index 000000000..5c78933df --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -0,0 +1,1024 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 +bilin_filter_m_sse2: times 8 dw 16 + times 8 dw 0 + times 8 dw 14 + times 8 dw 2 + times 8 dw 12 + times 8 dw 4 + times 8 dw 10 + times 8 dw 6 + times 16 dw 8 + times 8 dw 6 + times 8 dw 10 + times 8 dw 4 + times 8 dw 12 + times 8 dw 2 + times 8 dw 14 + +SECTION .text + +; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *dst, ptrdiff_t dst_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse + psubw %3, %4 + psubw %1, %2 + mova %4, %3 ; make copies to manipulate to calc sum + mova %2, %1 ; use originals for calc sse + pmaddwd %3, %3 + paddw %4, %2 + pmaddwd %1, %1 + movhlps %2, %4 + paddd %6, %3 + paddw %4, %2 + pxor %2, %2 + pcmpgtw %2, %4 ; mask for 0 > %4 (sum) + punpcklwd %4, %2 ; sign-extend word to dword + paddd %6, %1 + paddd %5, %4 + +%endmacro + +%macro STORE_AND_RET 0 +%if mmsize == 16 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputing to a dword. + movhlps m3, m7 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + pshufd m4, m6, 0x1 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + movd [r1], m7 ; store sse + movd eax, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro INC_SRC_BY_SRC_STRIDE 0 +%if ARCH_X86=1 && CONFIG_PIC=1 + add srcq, src_stridemp + add srcq, src_stridemp +%else + lea srcq, [srcq + src_strideq*2] +%endif +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 + + +%if ARCH_X86_64 + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, height, sse + %endif + %define block_height heightd + %define bilin_filter sseq +%else + %if CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, height, sse + %define block_height heightd + %endif + + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm + + ; Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, height, sse + %define block_height heightd + 
%endif + + %define bilin_filter bilin_filter_m + %endif +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + +%if %1 < 16 + sar block_height, 1 +%endif +%if %2 == 1 ; avg + shl sec_str, 1 +%endif + + ; FIXME(rbultje) replace by jumptable? + test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m2, [srcq + 16] + mova m1, [dstq] + mova m3, [dstq + 16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m2, [secq+16] +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq + src_strideq*2] + mova m1, [dstq] + mova m3, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m2, [secq] +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_zero_y_zero_loop + STORE_AND_RET + +.x_zero_y_nonzero: + cmp y_offsetd, 8 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m4, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+16] + mova m2, [dstq] + mova m3, [dstq+16] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*4] + mova m2, [dstq] + mova m3, [dstq+dst_strideq*2] + pavgw m0, m1 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_zero_y_half_loop + STORE_AND_RET + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] + mova m9, [bilin_filter+y_offsetq+16] + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq + 16] + movu m4, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+16] + mova m2, [dstq] + mova m3, [dstq+16] + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. 
It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m1, filter_y_a + pmullw m5, filter_y_b + paddw m1, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m1, m5 + paddw m0, m4 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*4] + mova m4, m1 + mova m2, [dstq] + mova m3, [dstq+dst_strideq*2] + pmullw m1, filter_y_a + pmullw m5, filter_y_b + paddw m1, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m1, m5 + paddw m0, m4 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonzero: + cmp x_offsetd, 8 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq + 16] + movu m4, [srcq + 2] + movu m5, [srcq + 18] + mova m2, [dstq] + mova m3, [dstq + 16] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq + src_strideq*2] + movu m4, [srcq + 2] + movu m5, [srcq + src_strideq*2 + 2] + mova m2, [dstq] + mova m3, [dstq + dst_strideq*2] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_half_y_zero_loop + STORE_AND_RET + +.x_half_y_nonzero: + cmp y_offsetd, 8 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 + pavgw m1, m3 +.x_half_y_half_loop: + movu m2, [srcq] + movu m3, [srcq + 16] + movu m4, [srcq + 2] + movu m5, [srcq + 18] + pavgw m2, m4 + pavgw m3, m5 + pavgw m0, m2 + pavgw m1, m3 + mova m4, [dstq] + mova m5, [dstq + 16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + mova m0, m2 + mova m1, m3 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 +.x_half_y_half_loop: + movu m2, [srcq] + movu m3, [srcq + src_strideq*2] + movu m4, [srcq + 2] + movu m5, [srcq + src_strideq*2 + 2] + pavgw m2, m4 + pavgw m3, m5 + pavgw m0, m2 + pavgw m2, m3 + mova m4, [dstq] + mova m5, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m2, [secq] +%endif + SUM_SSE m0, m4, m2, m5, m6, m7 + mova m0, m3 + + lea 
srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_half_y_half_loop + STORE_AND_RET + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] + mova m9, [bilin_filter+y_offsetq+16] + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86_32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 + pavgw m1, m3 +.x_half_y_other_loop: + movu m2, [srcq] + movu m3, [srcq+16] + movu m4, [srcq+2] + movu m5, [srcq+18] + pavgw m2, m4 + pavgw m3, m5 + mova m4, m2 + mova m5, m3 + pmullw m1, filter_y_a + pmullw m3, filter_y_b + paddw m1, filter_rnd + paddw m1, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + psrlw m1, 4 + paddw m0, m2 + mova m2, [dstq] + psrlw m0, 4 + mova m3, [dstq+16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + mova m0, m4 + mova m1, m5 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 +.x_half_y_other_loop: + movu m2, [srcq] + movu m3, [srcq+src_strideq*2] + movu m4, [srcq+2] + movu m5, [srcq+src_strideq*2+2] + pavgw m2, m4 + pavgw m3, m5 + mova m4, m2 + mova m5, m3 + pmullw m4, filter_y_a + pmullw m3, filter_y_b + paddw m4, filter_rnd + paddw m4, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + psrlw m4, 4 + paddw m0, m2 + mova m2, [dstq] + psrlw m0, 4 + mova m3, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m4, [secq] +%endif + SUM_SSE m0, m2, m4, m3, m6, m7 + mova m0, m5 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0. We can reuse y_offset reg. 
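; [Editor's note] bilin_filter_m_sse2 stores eight (16 - frac, frac) filter
; pairs, one per subpel step; each entry is two 8-word rows (32 bytes), which
; is why the offset is scaled by filter_idx_shift = 5. Every filtering loop
; below evaluates the same scalar expression (a sketch, with frac in [0, 16)
; matching the pmullw/paddw/psrlw-by-4 sequence):
;
;   static inline uint16_t bilin_c(uint16_t a, uint16_t b, int frac) {
;     return (uint16_t)(((16 - frac) * a + frac * b + 8) >> 4);
;   }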
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + mova m4, [dstq] + mova m5, [dstq+16] + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m1, m3 + paddw m0, m2 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m2, [srcq+2] + movu m3, [srcq+src_strideq*2+2] + mova m4, [dstq] + mova m5, [dstq+dst_strideq*2] + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m1, m3 + paddw m0, m2 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + + lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 8 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. 
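; [Editor's note] The half-pel paths use pavgw instead of the filter because
; frac == 8 collapses the bilinear expression to a rounding average; in the
; notation of the sketch above: (8*a + 8*b + 8) >> 4 == (a + b + 1) >> 1.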
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + lea srcq, [srcq+src_strideq*2] +.x_other_y_half_loop: + movu m2, [srcq] + movu m3, [srcq+16] + movu m4, [srcq+2] + movu m5, [srcq+18] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + mova m4, [dstq] + mova m5, [dstq+16] + psrlw m2, 4 + psrlw m3, 4 + pavgw m0, m2 + pavgw m1, m3 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + mova m0, m2 + mova m1, m3 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m0, m2 + psrlw m0, 4 + lea srcq, [srcq+src_strideq*2] +.x_other_y_half_loop: + movu m2, [srcq] + movu m3, [srcq+src_strideq*2] + movu m4, [srcq+2] + movu m5, [srcq+src_strideq*2+2] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + mova m4, [dstq] + mova m5, [dstq+dst_strideq*2] + psrlw m2, 4 + psrlw m3, 4 + pavgw m0, m2 + pavgw m2, m3 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m2, [secq] +%endif + SUM_SSE m0, m4, m2, m5, m6, m7 + mova m0, m3 + + lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonhalf: +; loading filter - this is same as in 8-bit depth +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [bilin_filter+y_offsetq] + mova m11, [bilin_filter+y_offsetq+16] + mova m12, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. 
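; [Editor's note] With both offsets fractional, the loop below is a two-pass
; bilinear filter: each row is first filtered horizontally, then two
; consecutive filtered rows are blended vertically with the y filter. One
; output pixel, as a scalar sketch reusing bilin_c from the note above:
;
;   uint16_t h0 = bilin_c(row0[x], row0[x + 1], xfrac);
;   uint16_t h1 = bilin_c(row1[x], row1[x + 1], xfrac);
;   uint16_t out = bilin_c(h0, h1, yfrac);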
+%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif +; end of load filter + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m2, [srcq+2] + movu m1, [srcq+16] + movu m3, [srcq+18] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movu m2, [srcq] + movu m4, [srcq+2] + movu m3, [srcq+16] + movu m5, [srcq+18] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + psrlw m2, 4 + psrlw m3, 4 + mova m4, m2 + mova m5, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + pmullw m1, filter_y_a + pmullw m3, filter_y_b + paddw m0, m2 + paddw m1, filter_rnd + mova m2, [dstq] + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + mova m3, [dstq+16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + mova m0, m4 + mova m1, m5 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq + dst_strideq * 2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m0, m2 + psrlw m0, 4 + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movu m2, [srcq] + movu m4, [srcq+2] + INC_SRC_BY_SRC_STRIDE + movu m3, [srcq] + movu m5, [srcq+2] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + psrlw m2, 4 + psrlw m3, 4 + mova m4, m2 + mova m5, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + pmullw m4, filter_y_a + pmullw m3, filter_y_b + paddw m0, m2 + paddw m4, filter_rnd + mova m2, [dstq] + paddw m4, m3 + psrlw m0, 4 + psrlw m4, 4 + mova m3, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m4, [secq] +%endif + SUM_SSE m0, m2, m4, m3, m6, m7 + mova m0, m5 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq + dst_strideq * 4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET +%endmacro + +INIT_XMM sse2 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM sse2 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/libs/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c b/libs/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c new file mode 100644 index 000000000..b72d1cf8b --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stddef.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
+                                    const uint16_t *src, ptrdiff_t src_stride,
+                                    const uint16_t *pred,
+                                    ptrdiff_t pred_stride);
+
+static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+  __m128i x0, x1, x2, x3;
+  int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
+
+  u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
+  u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
+  u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+  u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+
+  v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride));
+  v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride));
+  v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride));
+  v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride));
+
+  x0 = _mm_sub_epi16(u0, v0);
+  x1 = _mm_sub_epi16(u1, v1);
+  x2 = _mm_sub_epi16(u2, v2);
+  x3 = _mm_sub_epi16(u3, v3);
+
+  _mm_storel_epi64((__m128i *)store_diff, x0);
+  store_diff = (int64_t *)(diff + 1 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x1);
+  store_diff = (int64_t *)(diff + 2 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x2);
+  store_diff = (int64_t *)(diff + 3 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x3);
+}
+
+static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
+
+  u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
+  u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
+  u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+  u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+  u4 = _mm_loadl_epi64((__m128i const *)(src + 4 * src_stride));
+  u5 = _mm_loadl_epi64((__m128i const *)(src + 5 * src_stride));
+  u6 = _mm_loadl_epi64((__m128i const *)(src + 6 * src_stride));
+  u7 = _mm_loadl_epi64((__m128i const *)(src + 7 * src_stride));
+
+  v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride));
+  v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride));
+  v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride));
+  v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride));
+  v4 = _mm_loadl_epi64((__m128i const *)(pred + 4 * pred_stride));
+  v5 = _mm_loadl_epi64((__m128i const *)(pred + 5 * pred_stride));
+  v6 = _mm_loadl_epi64((__m128i const *)(pred + 6 * pred_stride));
+  v7 = _mm_loadl_epi64((__m128i const *)(pred + 7 * pred_stride));
+
+  x0 = _mm_sub_epi16(u0, v0);
+  x1 = _mm_sub_epi16(u1, v1);
+  x2 = _mm_sub_epi16(u2, v2);
+  x3 = _mm_sub_epi16(u3, v3);
+  x4 = _mm_sub_epi16(u4, v4);
+  x5 = _mm_sub_epi16(u5, v5);
+  x6 = _mm_sub_epi16(u6, v6);
+  x7 = _mm_sub_epi16(u7,
v7); + + _mm_storel_epi64((__m128i *)store_diff, x0); + store_diff = (int64_t *)(diff + 1 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x1); + store_diff = (int64_t *)(diff + 2 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x2); + store_diff = (int64_t *)(diff + 3 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x3); + store_diff = (int64_t *)(diff + 4 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x4); + store_diff = (int64_t *)(diff + 5 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x5); + store_diff = (int64_t *)(diff + 6 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x6); + store_diff = (int64_t *)(diff + 7 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x7); +} + +static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + __m128i x0, x1, x2, x3; + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + + _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); + _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); + _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); + _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); +} + +static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); + u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); + u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); + u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); + v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); + v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride)); + v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + x4 = _mm_sub_epi16(u4, v4); + x5 = _mm_sub_epi16(u5, v5); + x6 = _mm_sub_epi16(u6, v6); + x7 = _mm_sub_epi16(u7, v7); + + _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); + 
_mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); + _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); + _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); + _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4); + _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5); + _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6); + _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7); +} + +#define STACK_V(h, fun) \ + do { \ + fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ + fun(diff + diff_stride * h, diff_stride, src + src_stride * h, src_stride, \ + pred + pred_stride * h, pred_stride); \ + } while (0) + +#define STACK_H(w, fun) \ + do { \ + fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ + fun(diff + w, diff_stride, src + w, src_stride, pred + w, pred_stride); \ + } while (0) + +#define SUBTRACT_FUN(size) \ + static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride, \ + const uint16_t *src, ptrdiff_t src_stride, \ + const uint16_t *pred, ptrdiff_t pred_stride) + +SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); } +SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); } +SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); } +SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); } +SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); } +SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); } +SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); } +SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); } +SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); } +SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); } +SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); } +SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); } +SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); } +SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); } +SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); } +SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); } +SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); } +SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); } + +static SubtractWxHFuncType getSubtractFunc(int rows, int cols) { + if (rows == 4) { + if (cols == 4) return subtract_4x4; + if (cols == 8) return subtract_8x4; + if (cols == 16) return subtract_16x4; + } + if (rows == 8) { + if (cols == 4) return subtract_4x8; + if (cols == 8) return subtract_8x8; + if (cols == 16) return subtract_16x8; + if (cols == 32) return subtract_32x8; + } + if (rows == 16) { + if (cols == 4) return subtract_4x16; + if (cols == 8) return subtract_8x16; + if (cols == 16) return subtract_16x16; + if (cols == 32) return subtract_32x16; + if (cols == 64) return subtract_64x16; + } + if (rows == 32) { + if (cols == 8) return subtract_8x32; + if (cols == 16) return subtract_16x32; + if (cols == 32) return subtract_32x32; + if (cols == 64) return subtract_64x32; + } + if (rows == 64) { + if (cols == 16) return subtract_16x64; + if (cols == 32) return subtract_32x64; + if (cols == 64) return subtract_64x64; + if (cols == 128) return subtract_128x64; + } + if (rows == 128) { + if (cols == 64) return subtract_64x128; + if (cols == 128) return subtract_128x128; + } + assert(0); + return NULL; +} + +void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride, int bd) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + SubtractWxHFuncType func; + (void)bd; + + func = getSubtractFunc(rows, cols); + func(diff, diff_stride, src, src_stride, 
pred, pred_stride);
+}
diff --git a/libs/libaom/src/aom_dsp/x86/highbd_variance_avx2.c b/libs/libaom/src/aom_dsp/x86/highbd_variance_avx2.c
new file mode 100644
index 000000000..9b1b4c9de
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/highbd_variance_avx2.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>  // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+
+typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+                                   const uint16_t *ref, int ref_stride,
+                                   uint32_t *sse, int *sum);
+
+void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
+                                const uint16_t *ref, int ref_stride,
+                                uint32_t *sse, int *sum) {
+  __m256i v_sum_d = _mm256_setzero_si256();
+  __m256i v_sse_d = _mm256_setzero_si256();
+  for (int i = 0; i < 8; i += 2) {
+    const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src);
+    const __m128i v_p_a1 =
+        _mm_loadu_si128((const __m128i *)(src + src_stride));
+    const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref);
+    const __m128i v_p_b1 =
+        _mm_loadu_si128((const __m128i *)(ref + ref_stride));
+    __m256i v_p_a = _mm256_castsi128_si256(v_p_a0);
+    __m256i v_p_b = _mm256_castsi128_si256(v_p_b0);
+    v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1);
+    v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1);
+    const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
+    const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
+    v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
+    v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+    src += src_stride * 2;
+    ref += ref_stride * 2;
+  }
+  __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d));
+  __m256i v_sum01 =
+      _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1));
+  __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01);
+  __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
+  __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
+  __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+  const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+  const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+  __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+  v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+  *sum = _mm_extract_epi32(v_d, 0);
+  *sse = _mm_extract_epi32(v_d, 1);
+}
+
+void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
+                                  const uint16_t *ref, int ref_stride,
+                                  uint32_t *sse, int *sum) {
+  __m256i v_sum_d = _mm256_setzero_si256();
+  __m256i v_sse_d = _mm256_setzero_si256();
+  const __m256i one = _mm256_set1_epi16(1);
+  for (int i = 0; i < 16; ++i) {
+    const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src);
+    const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref);
+    const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
+    const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
+    v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
+    v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+    src += src_stride;
+    ref += ref_stride;
+  }
+  __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one);
+  __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
+  __m256i v_d_h =
_mm256_unpackhi_epi32(v_sum0, v_sse_d); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + *sum = _mm_extract_epi32(v_d, 0); + *sse = _mm_extract_epi32(v_d, 1); +} + +static void highbd_10_variance_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int32_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); +} + +#define VAR_FN(w, h, block_size, shift) \ + uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_avx2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +VAR_FN(128, 128, 16, 14); +VAR_FN(128, 64, 16, 13); +VAR_FN(64, 128, 16, 13); +VAR_FN(64, 64, 16, 12); +VAR_FN(64, 32, 16, 11); +VAR_FN(32, 64, 16, 11); +VAR_FN(32, 32, 16, 10); +VAR_FN(32, 16, 16, 9); +VAR_FN(16, 32, 16, 9); +VAR_FN(16, 16, 16, 8); +VAR_FN(16, 8, 8, 7); +VAR_FN(8, 16, 8, 7); +VAR_FN(8, 8, 8, 6); +VAR_FN(16, 4, 16, 6); +VAR_FN(8, 32, 8, 8); +VAR_FN(32, 8, 8, 8); +VAR_FN(16, 64, 16, 10); +VAR_FN(64, 16, 16, 10); + +#undef VAR_FN diff --git a/libs/libaom/src/aom_dsp/x86/highbd_variance_impl_sse2.asm b/libs/libaom/src/aom_dsp/x86/highbd_variance_impl_sse2.asm new file mode 100644 index 000000000..0d954e178 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_variance_impl_sse2.asm @@ -0,0 +1,318 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + + +%include "aom_ports/x86_abi_support.asm" + +SECTION .text + +;unsigned int aom_highbd_calc16x16var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +global sym(aom_highbd_calc16x16var_sse2) PRIVATE +sym(aom_highbd_calc16x16var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + add rax, rax ; source stride in bytes + add rdx, rdx ; recon stride in bytes + + ; Prefetch data + prefetcht0 [rsi] + prefetcht0 [rsi+16] + prefetcht0 [rsi+rax] + prefetcht0 [rsi+rax+16] + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rax] + prefetcht0 [rbx+rax+16] + + prefetcht0 [rdi] + prefetcht0 [rdi+16] + prefetcht0 [rdi+rdx] + prefetcht0 [rdi+rdx+16] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rdx] + prefetcht0 [rbx+rdx+16] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 16 + +.var16loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rax] + prefetcht0 [rbx+rax+16] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rdx] + prefetcht0 [rbx+rdx+16] + + pxor xmm5, xmm5 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+16] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+16] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + movdqu xmm1, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm3 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax+16] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx+16] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + paddd xmm6, xmm3 + + movdqa xmm1, xmm5 + movdqa xmm2, xmm5 + pcmpgtw xmm1, xmm0 + pcmpeqw xmm2, xmm0 + por xmm1, xmm2 + pcmpeqw xmm1, xmm0 + movdqa xmm2, xmm5 + punpcklwd xmm5, xmm1 + punpckhwd xmm2, xmm1 + paddd xmm7, xmm5 + paddd xmm7, xmm2 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + sub rcx, 2 + jnz .var16loop + + movdqa xmm4, xmm6 + punpckldq xmm6, xmm0 + + punpckhdq xmm4, xmm0 + movdqa xmm5, xmm7 + + paddd xmm6, xmm4 + punpckldq xmm7, xmm0 + + punpckhdq xmm5, xmm0 + paddd xmm7, xmm5 + + movdqa xmm4, xmm6 + movdqa xmm5, xmm7 + + psrldq xmm4, 8 + psrldq xmm5, 8 + + paddd xmm6, xmm4 + paddd xmm7, xmm5 + + mov rdi, arg(4) ; [SSE] + mov rax, arg(5) ; [Sum] + + movd DWORD PTR [rdi], xmm6 + movd DWORD PTR [rax], xmm7 + + + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int aom_highbd_calc8x8var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +global sym(aom_highbd_calc8x8var_sse2) PRIVATE +sym(aom_highbd_calc8x8var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + add rax, rax ; source stride in 
bytes + add rdx, rdx ; recon stride in bytes + + ; Prefetch data + prefetcht0 [rsi] + prefetcht0 [rsi+rax] + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + + prefetcht0 [rdi] + prefetcht0 [rdi+rdx] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 8 + +.var8loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + lea rbx, [rsi+rax*4] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + lea rbx, [rbx+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + lea rbx, [rdi+rdx*4] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + lea rbx, [rbx+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + + pxor xmm5, xmm5 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm1 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + + psubw xmm3, xmm2 + movdqu xmm1, XMMWORD PTR [rsi] + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + movdqu xmm2, XMMWORD PTR [rdi] + paddd xmm6, xmm3 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + paddd xmm6, xmm3 + + movdqa xmm1, xmm5 + movdqa xmm2, xmm5 + pcmpgtw xmm1, xmm0 + pcmpeqw xmm2, xmm0 + por xmm1, xmm2 + pcmpeqw xmm1, xmm0 + movdqa xmm2, xmm5 + punpcklwd xmm5, xmm1 + punpckhwd xmm2, xmm1 + paddd xmm7, xmm5 + paddd xmm7, xmm2 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + sub rcx, 4 + jnz .var8loop + + movdqa xmm4, xmm6 + punpckldq xmm6, xmm0 + + punpckhdq xmm4, xmm0 + movdqa xmm5, xmm7 + + paddd xmm6, xmm4 + punpckldq xmm7, xmm0 + + punpckhdq xmm5, xmm0 + paddd xmm7, xmm5 + + movdqa xmm4, xmm6 + movdqa xmm5, xmm7 + + psrldq xmm4, 8 + psrldq xmm5, 8 + + paddd xmm6, xmm4 + paddd xmm7, xmm5 + + mov rdi, arg(4) ; [SSE] + mov rax, arg(5) ; [Sum] + + movd DWORD PTR [rdi], xmm6 + movd DWORD PTR [rax], xmm7 + + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/libs/libaom/src/aom_dsp/x86/highbd_variance_sse2.c b/libs/libaom/src/aom_dsp/x86/highbd_variance_sse2.c new file mode 100644 index 000000000..b7d15f93e --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_variance_sse2.c @@ -0,0 +1,842 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/filter.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/reconinter_enc.h"
+
+typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+                                       const uint16_t *ref, int ref_stride,
+                                       uint32_t *sse, int *sum);
+
+uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    uint32_t *sse, int *sum);
+
+uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+                                      const uint16_t *ref, int ref_stride,
+                                      uint32_t *sse, int *sum);
+
+static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
+                                   const uint16_t *ref, int ref_stride, int w,
+                                   int h, uint32_t *sse, int *sum,
+                                   high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+             ref_stride, &sse0, &sum0);
+      *sse += sse0;
+      *sum += sum0;
+    }
+  }
+}
+
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride, int w,
+                                    int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int32_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+             ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride, int w,
+                                    int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int32_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+             ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+}
+
+#define HIGH_GET_VAR(S) \
+  void aom_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                         const uint8_t *ref8, int ref_stride, \
+                                         uint32_t *sse, int *sum) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+    aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+                                       sum); \
+  } \
+ \
+  void aom_highbd_10_get##S##x##S##var_sse2( \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+      int ref_stride, uint32_t *sse, int *sum) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+    aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+                                       sum); \
+    *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+    *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+  } \
+ \
+  void aom_highbd_12_get##S##x##S##var_sse2( \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+      int ref_stride, uint32_t *sse, int *sum) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t
*ref = CONVERT_TO_SHORTPTR(ref8); \ + aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 4); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + } + +HIGH_GET_VAR(16); +HIGH_GET_VAR(8); + +#undef HIGH_GET_VAR + +#define VAR_FN(w, h, block_size, shift) \ + uint32_t aom_highbd_8_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_8_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } \ + \ + uint32_t aom_highbd_10_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_12_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? 
(uint32_t)var : 0; \
+  }
+
+VAR_FN(128, 128, 16, 14);
+VAR_FN(128, 64, 16, 13);
+VAR_FN(64, 128, 16, 13);
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+VAR_FN(8, 32, 8, 8);
+VAR_FN(32, 8, 8, 8);
+VAR_FN(16, 64, 16, 10);
+VAR_FN(64, 16, 16, 10);
+
+#undef VAR_FN
+
+unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                        const uint8_t *ref8, int ref_stride,
+                                        unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+                         aom_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+                          aom_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+                          aom_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                      const uint8_t *ref8, int ref_stride,
+                                      unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+                         aom_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+                          aom_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+                          aom_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+// The 2 unused parameters are placeholders for the PIC-enabled build.
+// These definitions are for functions defined in +// highbd_subpel_variance_impl_sse2.asm +#define DECL(w, opt) \ + int aom_highbd_sub_pixel_variance##w##xh_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint16_t *dst, ptrdiff_t dst_stride, int height, \ + unsigned int *sse, void *unused0, void *unused); +#define DECLS(opt) \ + DECL(8, opt); \ + DECL(16, opt) + +DECLS(sse2); + +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ + uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = 0; \ + unsigned int sse = 0; \ + unsigned int sse2; \ + int row_rep = (w > 64) ? 2 : 1; \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src += wd_64 * 64; \ + dst += wd_64 * 64; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse2, \ + NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + } \ + *sse_ptr = sse; \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + int64_t var; \ + uint32_t sse; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = 0; \ + int row_rep = (w > 64) ? 2 : 1; \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src += wd_64 * 64; \ + dst += wd_64 * 64; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + NULL); \ + se += se2; \ + long_sse += sse; \ + if (w > wf) { \ + uint32_t sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 4); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + int start_row; \ + uint32_t sse; \ + int se = 0; \ + int64_t var; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int row_rep = (w > 64) ? 2 : 1; \ + for (start_row = 0; start_row < h; start_row += 16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + uint16_t *src_tmp = src + (start_row * src_stride); \ + uint16_t *dst_tmp = dst + (start_row * dst_stride); \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src_tmp += wd_64 * 64; \ + dst_tmp += wd_64 * 64; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp, src_stride, x_offset, y_offset, dst_tmp, dst_stride, \ + height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp + wf, src_stride, x_offset, y_offset, dst_tmp + wf, \ + dst_stride, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp + 2 * wf, src_stride, x_offset, y_offset, \ + dst_tmp + 2 * wf, dst_stride, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp + 3 * wf, src_stride, x_offset, y_offset, \ + dst_tmp + 3 * wf, dst_stride, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t)); \ + FN(128, 64, 16, 7, 6, opt, (int64_t)); \ + FN(64, 128, 16, 6, 7, opt, (int64_t)); \ + FN(64, 64, 16, 6, 6, opt, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int64_t)); \ + FN(8, 16, 8, 3, 4, opt, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt, (int64_t)); \ + FN(16, 4, 16, 4, 2, opt, (int64_t)); \ + FN(8, 32, 8, 3, 5, opt, (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t)) + +FNS(sse2); + +#undef FNS +#undef FN + +// The 2 unused parameters are place holders for PIC enabled build. 
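+// As with the non-avg versions above, each kernel declared below handles a
+// single wf-wide column strip; the FN() wrappers tile it across 1, 2, or 4
+// strips, accumulate the per-strip se/sse, and form
+// variance = sse - sum^2 / (w * h) (for 64x64, wlog2 + hlog2 = 12, i.e. a
+// division by 4096 pixels).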
+#define DECL(w, opt) \ + int aom_highbd_sub_pixel_avg_variance##w##xh_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \ + ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + void *unused); +#define DECLS(opt) \ + DECL(16, opt) \ + DECL(8, opt) + +DECLS(sse2); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ + uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \ + sec + wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + int64_t var; \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \ + sec + wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + int start_row; \ + int64_t var; \ + uint32_t sse; \ + int se = 0; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + for (start_row = 0; start_row < h; start_row += 16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, x_offset, y_offset, \ + dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \ + w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + wf + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + wf + (start_row * dst_stride), dst_stride, \ + sec + wf + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 2 * wf + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 2 * wf + (start_row * dst_stride), dst_stride, \ + sec + 2 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 3 * wf + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 3 * wf + (start_row * dst_stride), dst_stride, \ + sec + 3 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int64_t)); \ + FN(8, 16, 8, 3, 4, opt, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt, (int64_t)); \ + FN(16, 4, 16, 4, 2, opt, (int64_t)); \ + FN(8, 32, 8, 3, 5, opt, (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t)); + +FNS(sse2); + +#undef FNS +#undef FN + +void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, + const struct AV1Common *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred8, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref8, int ref_stride, int bd, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? 
&cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred8, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter = av1_get_filter(subpel_search); + int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS; + if (!subpel_x_q3 && !subpel_y_q3) { + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + if (width >= 8) { + int i; + assert(!(width & 7)); + /*Read 8 pixels one row at a time.*/ + for (i = 0; i < height; i++) { + int j; + for (j = 0; j < width; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + _mm_storeu_si128((__m128i *)comp_pred, s0); + comp_pred += 8; + ref += 8; + } + ref += ref_stride - width; + } + } else { + int i; + assert(!(width & 3)); + /*Read 4 pixels two rows at a time.*/ + for (i = 0; i < height; i += 2) { + __m128i s0 = _mm_loadl_epi64((const __m128i *)ref); + __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride)); + __m128i t0 = _mm_unpacklo_epi64(s0, s1); + _mm_storeu_si128((__m128i *)comp_pred, t0); + comp_pred += 8; + ref += 2 * ref_stride; + } + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16, + NULL, -1, width, height, bd); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1, + kernel, 16, width, height, bd); + } else { + DECLARE_ALIGNED(16, uint16_t, + temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1); + uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS) + ? 
temp + (filter_taps >> 1) * MAX_SB_SIZE + : temp; + uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_highbd_convolve8_horiz( + ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz), + MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd); + aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE, + comp_pred8, width, NULL, -1, kernel_y, 16, width, + height, bd); + } +} + +void aom_highbd_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, int subpel_search) { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); + /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/ + assert(!(width * height & 7)); + int n = width * height >> 3; + for (int i = 0; i < n; i++) { + __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16); + __m128i p0 = _mm_loadu_si128((const __m128i *)pred); + _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0)); + comp_pred16 += 8; + pred += 8; + } +} + +static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1, + const __m128i *w0, + const __m128i *w1, + const __m128i *r, + void *const result) { + assert(DIST_PRECISION_BITS <= 4); + __m128i mult0 = _mm_mullo_epi16(*p0, *w0); + __m128i mult1 = _mm_mullo_epi16(*p1, *w1); + __m128i sum = _mm_adds_epu16(mult0, mult1); + __m128i round = _mm_adds_epu16(sum, *r); + __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS); + + xx_storeu_128(result, shift); +} + +void aom_highbd_dist_wtd_comp_avg_pred_sse2( + uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, + const uint8_t *ref8, int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { + int i; + const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset; + const uint16_t wt1 = (uint16_t)jcp_param->bck_offset; + const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0); + const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + + if (width >= 8) { + // Read 8 pixels one row at a time + assert(!(width & 7)); + for (i = 0; i < height; ++i) { + int j; + for (j = 0; j < width; j += 8) { + __m128i p0 = xx_loadu_128(ref); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + + comp_pred += 8; + pred += 8; + ref += 8; + } + ref += ref_stride - width; + } + } else { + // Read 4 pixels two rows at a time + assert(!(width & 3)); + for (i = 0; i < height; i += 2) { + __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride); + __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride); + __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, 
&r, comp_pred); + + comp_pred += 8; + pred += 8; + ref += 2 * ref_stride; + } + } +} + +void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, + int subpel_search) { + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + int n; + int i; + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + assert(!(width * height & 7)); + n = width * height >> 3; + + const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset; + const uint16_t wt1 = (uint16_t)jcp_param->bck_offset; + const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0); + const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + + uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); + for (i = 0; i < n; i++) { + __m128i p0 = xx_loadu_128(comp_pred16); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16); + + comp_pred16 += 8; + pred += 8; + } +} diff --git a/libs/libaom/src/aom_dsp/x86/highbd_variance_sse4.c b/libs/libaom/src/aom_dsp/x86/highbd_variance_sse4.c new file mode 100644 index 000000000..df5449a9d --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_variance_sse4.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <smmintrin.h> /* SSE4.1 */ + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/variance.h" +#include "aom_dsp/aom_filter.h" + +static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + uint64_t *sse, int64_t *sum) { + __m128i u0, u1, u2, u3; + __m128i s0, s1, s2, s3; + __m128i t0, t1, x0, y0; + __m128i a0, a1, a2, a3; + __m128i b0, b1, b2, b3; + __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1); + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + + a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride)); + a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride)); + a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride)); + a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride)); + + b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride)); + b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride)); + b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride)); + b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride)); + + u0 = _mm_unpacklo_epi16(a0, a1); + u1 = _mm_unpacklo_epi16(a2, a3); + u2 = _mm_unpacklo_epi16(b0, b1); + u3 = _mm_unpacklo_epi16(b2, b3); + + s0 = _mm_sub_epi16(u0, u2); + s1 = _mm_sub_epi16(u1, u3); + + t0 = _mm_madd_epi16(s0, k_one_epi16); + t1 = _mm_madd_epi16(s1, k_one_epi16); + + s2 = _mm_hadd_epi32(t0, t1); + s3 = _mm_hadd_epi32(s2, s2); + y0 = _mm_hadd_epi32(s3, s3); + + t0 = _mm_madd_epi16(s0, s0); + t1 = _mm_madd_epi16(s1, s1); + + s2 = _mm_hadd_epi32(t0, t1); + s3 = _mm_hadd_epi32(s2, s2); + x0 = _mm_hadd_epi32(s3, s3); + + *sse = (uint64_t)_mm_extract_epi32(x0, 0); + *sum = (int64_t)_mm_extract_epi32(y0, 0); +} + +uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int64_t sum, diff; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)local_sse; + + diff = (int64_t)*sse - ((sum * sum) >> 4); + return (diff >= 0) ? (uint32_t)diff : 0; +} + +uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int64_t sum, diff; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4); + sum = ROUND_POWER_OF_TWO(sum, 2); + + diff = (int64_t)*sse - ((sum * sum) >> 4); + return (diff >= 0) ? (uint32_t)diff : 0; +} + +uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int64_t sum, diff; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8); + sum = ROUND_POWER_OF_TWO(sum, 4); + + diff = (int64_t)*sse - ((sum * sum) >> 4); + return diff >= 0 ?
(uint32_t)diff : 0; +} + +// Sub-pixel +uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride, + sse); +} + +uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, + dst_stride, sse); +} + +uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, + dst_stride, sse); +} + +// Sub-pixel average + +uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, + const uint8_t *second_pred) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, + sse); +} + +uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, + const uint8_t *second_pred) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, + dst_stride, sse); +} + +uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, + const uint8_t *second_pred) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + 
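// Two-pass bilinear interpolation: the first pass filters src horizontally
+ // into fdata3 with the xoffset filter, and the second pass filters fdata3
+ // vertically into temp2 with the yoffset filter; temp3 then holds the
+ // average of temp2 and second_pred, on which the variance is computed.
+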
aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, + dst_stride, sse); +} diff --git a/libs/libaom/src/aom_dsp/x86/intrapred_asm_sse2.asm b/libs/libaom/src/aom_dsp/x86/intrapred_asm_sse2.asm new file mode 100644 index 000000000..0eb632326 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/intrapred_asm_sse2.asm @@ -0,0 +1,608 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pb_1: times 16 db 1 +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 +dc_128: times 16 db 128 +pw2_4: times 8 dw 2 +pw2_8: times 8 dw 4 +pw2_16: times 8 dw 8 +pw2_32: times 8 dw 16 + +SECTION .text + +INIT_XMM sse2 +cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + movd m2, [leftq] + movd m0, [aboveq] + pxor m1, m1 + punpckldq m0, m2 + psadbw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [leftq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [aboveq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + movq m2, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + paddw m0, [GLOBAL(pw_8)] + psraw m0, 4 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, 
m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movd m0, [GLOBAL(dc_128)] + movd [dstq ], m0 + movd [dstq+strideq ], m0 + movd [dstq+strideq*2], m0 + movd [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq m0, [GLOBAL(dc_128)] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_16)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + + +INIT_XMM sse2 +cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_128_predictor_16x16, 4, 5, 3, 
dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + mova m0, [GLOBAL(dc_128)] +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + RESTORE_GOT + RET + + +INIT_XMM sse2 +cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + psadbw m3, m1 + psadbw m4, m1 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_32)] + psraw m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_32)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [leftq] + mova m2, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_32)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + mova m0, [GLOBAL(dc_128)] +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above + movd m0, [aboveq] + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + DEFINE_ARGS dst, 
stride, stride3 + lea stride3q, [strideq*3] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left + movifnidn leftq, leftmp + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 + pshufd m1, m0, 0x1 + movd [dstq ], m0 + movd [dstq+strideq], m1 + pshufd m2, m0, 0x2 + lea dstq, [dstq+strideq*2] + pshufd m3, m0, 0x3 + movd [dstq ], m2 + movd [dstq+strideq], m3 + RET + +INIT_XMM sse2 +cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -2 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] + movq m0, [leftq ] + punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8 +.loop: + pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1 + pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2 + movq [dstq ], m1 + movq [dstq+strideq], m2 + pshuflw m1, m0, 0xaa + pshuflw m2, m0, 0xff + movq [dstq+strideq*2], m1 + movq [dstq+stride3q ], m2 + pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8 + inc lineq + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -4 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] +.loop: + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 ; l1 to l4 each repeated 4 times + pshufd m1, m0, 0x0 ; l1 repeated 16 times + pshufd m2, m0, 0x55 ; l2 repeated 16 times + mova [dstq ], m1 + mova [dstq+strideq ], m2 + pshufd m1, m0, 0xaa + pshufd m2, m0, 0xff + mova [dstq+strideq*2], m1 + mova [dstq+stride3q ], m2 + inc lineq + lea leftq, [leftq+4 ] + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -8 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] +.loop: + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 ; l1 to l4 each repeated 4 times + pshufd m1, m0, 0x0 ; l1 repeated 16 times + pshufd m2, m0, 0x55 ; l2 repeated 16 times + mova [dstq ], m1 + mova [dstq+16 ], m1 + mova [dstq+strideq ], m2 + mova [dstq+strideq+16 ], m2 + pshufd m1, m0, 0xaa + pshufd m2, m0, 0xff + mova [dstq+strideq*2 ], m1 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m2 + mova [dstq+stride3q+16 ], m2 + inc lineq + lea leftq, [leftq+4 ] + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET diff --git a/libs/libaom/src/aom_dsp/x86/intrapred_avx2.c 
b/libs/libaom/src/aom_dsp/x86/intrapred_avx2.c new file mode 100644 index 000000000..546ee74bb --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/intrapred_avx2.c @@ -0,0 +1,4895 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/x86/intrapred_x86.h" +#include "aom_dsp/x86/lpf_common_sse2.h" + +static INLINE __m256i dc_sum_64(const uint8_t *ref) { + const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i zero = _mm256_setzero_si256(); + __m256i y0 = _mm256_sad_epu8(x0, zero); + __m256i y1 = _mm256_sad_epu8(x1, zero); + y0 = _mm256_add_epi64(y0, y1); + __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1); + y0 = _mm256_add_epi64(u0, y0); + u0 = _mm256_unpackhi_epi64(y0, y0); + return _mm256_add_epi16(y0, u0); +} + +static INLINE __m256i dc_sum_32(const uint8_t *ref) { + const __m256i x = _mm256_loadu_si256((const __m256i *)ref); + const __m256i zero = _mm256_setzero_si256(); + __m256i y = _mm256_sad_epu8(x, zero); + __m256i u = _mm256_permute2x128_si256(y, y, 1); + y = _mm256_add_epi64(u, y); + u = _mm256_unpackhi_epi64(y, y); + return _mm256_add_epi16(y, u); +} + +static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r); + dst += stride; + } +} + +static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1, + int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r0); + _mm256_storeu_si256((__m256i *)(dst + 32), *r1); + dst += stride; + } +} + +static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r); + _mm256_storeu_si256((__m256i *)(dst + 32), *r); + dst += stride; + } +} + +static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, + { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, +}; + +static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = { + { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }, + { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 }, + { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 }, + { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 } +}; + +static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = { + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, + 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }, + { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,
0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }, + { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, + 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 }, + { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, + 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, + 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 } +}; + +static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, + 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, + 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, + 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff } +}; + +static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) { + __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + + r0 = _mm_unpacklo_epi16(x[0], x[1]); + r1 = _mm_unpacklo_epi16(x[2], x[3]); + r2 = _mm_unpacklo_epi16(x[4], x[5]); + r3 = _mm_unpacklo_epi16(x[6], x[7]); + + r4 = _mm_unpacklo_epi16(x[8], x[9]); + r5 = _mm_unpacklo_epi16(x[10], x[11]); + r6 = _mm_unpacklo_epi16(x[12], x[13]); + r7 = _mm_unpacklo_epi16(x[14], x[15]); + + r8 = _mm_unpacklo_epi32(r0, r1); + r9 = _mm_unpackhi_epi32(r0, r1); + r10 = _mm_unpacklo_epi32(r2, r3); + r11 = _mm_unpackhi_epi32(r2, r3); + + r12 = _mm_unpacklo_epi32(r4, r5); + r13 = _mm_unpackhi_epi32(r4, r5); + r14 = _mm_unpacklo_epi32(r6, r7); + r15 = _mm_unpackhi_epi32(r6, r7); + + r0 = _mm_unpacklo_epi64(r8, r9); + r1 = _mm_unpackhi_epi64(r8, r9); + r2 = _mm_unpacklo_epi64(r10, r11); + r3 = _mm_unpackhi_epi64(r10, r11); + + r4 = _mm_unpacklo_epi64(r12, r13); + r5 = _mm_unpackhi_epi64(r12, r13); + r6 = _mm_unpacklo_epi64(r14, r15); + r7 = _mm_unpackhi_epi64(r14, r15); + + d[0] = _mm_unpacklo_epi64(r0, r2); 
+ d[1] = _mm_unpacklo_epi64(r4, r6); + d[2] = _mm_unpacklo_epi64(r1, r3); + d[3] = _mm_unpacklo_epi64(r5, r7); + + d[4] = _mm_unpackhi_epi64(r0, r2); + d[5] = _mm_unpackhi_epi64(r4, r6); + d[6] = _mm_unpackhi_epi64(r1, r3); + d[7] = _mm_unpackhi_epi64(r5, r7); +} + +static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, ww0, ww1; + + w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13 + w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33 + w2 = _mm256_unpackhi_epi16(x[0], x[1]); // 40 50 41 51 42 52 43 53 + w3 = _mm256_unpackhi_epi16(x[2], x[3]); // 60 70 61 71 62 72 63 73 + + ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 + + d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 + d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 + + ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 + + d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 + d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 +} + +static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, ww0, ww1; + + w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13 + w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33 + w2 = _mm256_unpacklo_epi16(x[4], x[5]); // 40 50 41 51 42 52 43 53 + w3 = _mm256_unpacklo_epi16(x[6], x[7]); // 60 70 61 71 62 72 63 73 + + ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 + + d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 + d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 + + ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 + + d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 + d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 + + w0 = _mm256_unpackhi_epi16(x[0], x[1]); // 04 14 05 15 06 16 07 17 + w1 = _mm256_unpackhi_epi16(x[2], x[3]); // 24 34 25 35 26 36 27 37 + w2 = _mm256_unpackhi_epi16(x[4], x[5]); // 44 54 45 55 46 56 47 57 + w3 = _mm256_unpackhi_epi16(x[6], x[7]); // 64 74 65 75 66 76 67 77 + + ww0 = _mm256_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 + ww1 = _mm256_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75 + + d[4] = _mm256_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74 + d[5] = _mm256_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75 + + ww0 = _mm256_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 + ww1 = _mm256_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77 + + d[6] = _mm256_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76 + d[7] = _mm256_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77 +} + +static INLINE void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, ww0, ww1; + __m256i dd[16]; + w0 = _mm256_unpacklo_epi16(x[0], x[1]); + w1 = _mm256_unpacklo_epi16(x[2], x[3]); + w2 = _mm256_unpacklo_epi16(x[4], x[5]); + w3 = _mm256_unpacklo_epi16(x[6], x[7]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); // + ww1 = _mm256_unpacklo_epi32(w2, w3); // + + dd[0] = _mm256_unpacklo_epi64(ww0, ww1); + dd[1] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); // + ww1 = _mm256_unpackhi_epi32(w2, w3); // + + dd[2] = 
_mm256_unpacklo_epi64(ww0, ww1); + dd[3] = _mm256_unpackhi_epi64(ww0, ww1); + + w0 = _mm256_unpackhi_epi16(x[0], x[1]); + w1 = _mm256_unpackhi_epi16(x[2], x[3]); + w2 = _mm256_unpackhi_epi16(x[4], x[5]); + w3 = _mm256_unpackhi_epi16(x[6], x[7]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); // + ww1 = _mm256_unpacklo_epi32(w2, w3); // + + dd[4] = _mm256_unpacklo_epi64(ww0, ww1); + dd[5] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); // + ww1 = _mm256_unpackhi_epi32(w2, w3); // + + dd[6] = _mm256_unpacklo_epi64(ww0, ww1); + dd[7] = _mm256_unpackhi_epi64(ww0, ww1); + + w0 = _mm256_unpacklo_epi16(x[8], x[9]); + w1 = _mm256_unpacklo_epi16(x[10], x[11]); + w2 = _mm256_unpacklo_epi16(x[12], x[13]); + w3 = _mm256_unpacklo_epi16(x[14], x[15]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); + ww1 = _mm256_unpacklo_epi32(w2, w3); + + dd[8] = _mm256_unpacklo_epi64(ww0, ww1); + dd[9] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); + ww1 = _mm256_unpackhi_epi32(w2, w3); + + dd[10] = _mm256_unpacklo_epi64(ww0, ww1); + dd[11] = _mm256_unpackhi_epi64(ww0, ww1); + + w0 = _mm256_unpackhi_epi16(x[8], x[9]); + w1 = _mm256_unpackhi_epi16(x[10], x[11]); + w2 = _mm256_unpackhi_epi16(x[12], x[13]); + w3 = _mm256_unpackhi_epi16(x[14], x[15]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); + ww1 = _mm256_unpacklo_epi32(w2, w3); + + dd[12] = _mm256_unpacklo_epi64(ww0, ww1); + dd[13] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); + ww1 = _mm256_unpackhi_epi32(w2, w3); + + dd[14] = _mm256_unpacklo_epi64(ww0, ww1); + dd[15] = _mm256_unpackhi_epi64(ww0, ww1); + + for (int i = 0; i < 8; i++) { + d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1); + d[i + 8] = _mm256_insertf128_si256(dd[i + 8], + _mm256_extracti128_si256(dd[i], 1), 0); + } +} + +void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_32(above); + __m256i sum_left = dc_sum_32(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum_left = _mm256_add_epi16(sum_left, thirtytwo); + sum_left = _mm256_srai_epi16(sum_left, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum_left, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(left); + (void)above; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t 
stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 32, dst, stride); +} + +// There are 32 rows together. This function does lines +// 0,1,2,3 and 16,17,18,19; the next call does +// 4,5,6,7 and 20,21,22,23, so four calls +// finish all 32 rows. +static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst, + ptrdiff_t stride) { + __m256i t[4]; + __m256i m = _mm256_setzero_si256(); + const __m256i inc = _mm256_set1_epi8(4); + int i; + + for (i = 0; i < 4; i++) { + t[i] = _mm256_shuffle_epi8(*row, m); + __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0); + __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11); + _mm256_storeu_si256((__m256i *)dst, r0); + _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1); + dst += stride; + m = _mm256_add_epi8(m, inc); + } +} + +void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m256i left_col = _mm256_loadu_si256((__m256i const *)left); + + __m256i u = _mm256_unpacklo_epi8(left_col, left_col); + + __m256i v = _mm256_unpacklo_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + v = _mm256_unpackhi_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + u = _mm256_unpackhi_epi8(left_col, left_col); + + v = _mm256_unpacklo_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + v = _mm256_unpackhi_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); +} + +// ----------------------------------------------------------------------------- +// Rectangle +void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i top_sum = dc_sum_32_sse2(above); + __m128i left_sum = dc_sum_16_sse2(left); + left_sum = _mm_add_epi16(top_sum, left_sum); + uint16_t sum = _mm_cvtsi128_si32(left_sum); + sum += 24; + sum /= 48; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_32(above); + __m256i sum_left = dc_sum_64(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 48; + sum /= 96; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = dc_sum_64(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 64; + sum /= 128; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = dc_sum_32(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 48; + sum /= 96; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t
*above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left)); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 40; + sum /= 80; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_64xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i sum = dc_sum_16_sse2(left); + (void)above; + + const __m128i eight = _mm_set1_epi16(8); + sum = _mm_add_epi16(sum, eight); + sum = _mm_srai_epi16(sum, 4); + const __m128i zero = _mm_setzero_si128(); + const __m128i r = _mm_shuffle_epi8(sum, zero); + const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(left); + (void)above; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 64, dst, stride); +} + +void 
aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(left); + (void)above; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(left); + (void)above; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i sum = dc_sum_16_sse2(left); + (void)above; + + const __m128i eight = _mm_set1_epi16(8); + sum = _mm_add_epi16(sum, eight); + sum = _mm_srai_epi16(sum, 4); + const __m128i zero = _mm_setzero_si128(); + const __m128i r = _mm_shuffle_epi8(sum, zero); + const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); + row_store_64xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_64xh(&row, 16, dst, stride); +} + +void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 16, dst, stride); +} + +void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 64, dst, stride); +} + +void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 64, dst, stride); +} + +void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = 
_mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 32, dst, stride); +} + +void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// PAETH_PRED + +// Return 16 16-bit pixels in one row (__m256i) +static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top, + const __m256i *topleft) { + const __m256i base = + _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft); + + __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left)); + __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top)); + __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft)); + + __m256i mask1 = _mm256_cmpgt_epi16(pl, pt); + mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl)); + __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl); + + pl = _mm256_andnot_si256(mask1, *left); + + ptl = _mm256_and_si256(mask2, *topleft); + pt = _mm256_andnot_si256(mask2, *top); + pt = _mm256_or_si256(pt, ptl); + pt = _mm256_and_si256(mask1, pt); + + return _mm256_or_si256(pt, pl); +} + +// Return 16 8-bit pixels in one row (__m128i) +static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top, + const __m256i *topleft) { + const __m256i p0 = paeth_pred(left, top, topleft); + const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i p = _mm256_packus_epi16(p0, p1); + return _mm256_castsi256_si128(p); +} + +static INLINE __m256i get_top_vector(const uint8_t *above) { + const __m128i x = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t0 = _mm_unpacklo_epi8(x, zero); + const __m128i t1 = _mm_unpackhi_epi8(x, zero); + return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1); +} + +void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i x = _mm_loadl_epi64((const __m128i *)left); + const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16((short)0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 8; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +static INLINE __m256i get_left_vector(const uint8_t *left) { + const __m128i x = _mm_load_si128((const __m128i *)left); + return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); +} + +void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i l = get_left_vector(left); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16((short)0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + 
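// Editor's aside (illustrative, not part of this patch): the masked selects
// in paeth_pred()/paeth_16x1_pred() above implement the scalar Paeth rule,
// including its tie-break order left, top, top-left:
//
//   static uint8_t paeth_scalar(uint8_t l, uint8_t t, uint8_t tl) {
//     const int base = l + t - tl;
//     const int pl = abs(base - l), pt = abs(base - t), ptl = abs(base - tl);
//     if (pl <= pt && pl <= ptl) return l;  // !mask1 -> keep left
//     return (pt <= ptl) ? t : tl;          // mask2 picks top vs top-left
//   }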
_mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m256i l = get_left_vector(left); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16((short)0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + + l = get_left_vector(left + 16); + rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + for (int j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16((short)0x8000); + for (int i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +// Return 32 8-bit pixels in one row (__m256i) +static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0, + const __m256i *top1, + const __m256i *topleft) { + __m256i p0 = paeth_pred(left, top0, topleft); + __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i x0 = _mm256_packus_epi16(p0, p1); + + p0 = paeth_pred(left, top1, topleft); + p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i x1 = _mm256_packus_epi16(p0, p1); + + return _mm256_permute2x128_si256(x0, x1, 0x20); +} + +void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i l = get_left_vector(left); + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16((short)0x8000); + const __m256i one = _mm256_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl); + + _mm256_storeu_si256((__m256i *)dst, r); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m256i l = get_left_vector(left); + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16((short)0x8000); + const __m256i one = _mm256_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + 
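// Editor's aside (not part of this patch): rep is a shuffle-control trick.
// It starts as 0x8000 in every 16-bit lane, i.e. byte indices {0x00, 0x80}:
// _mm256_shuffle_epi8 copies byte 0 of l into the low byte and writes 0 for
// the high byte (index bit 7 set), yielding left[0] zero-extended to 16 bits
// in every lane. get_left_vector() duplicated the 16 left pixels into both
// 128-bit halves, so the per-lane shuffle picks the same pixel in each half.
// Adding 1 per row advances the byte index, stepping through left[1],
// left[2], ... without reloading the left column.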
dst += stride; + rep = _mm256_add_epi16(rep, one); + } + + l = get_left_vector(left + 16); + rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 2; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + 
const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i; + const __m256i l = get_left_vector(left); + __m256i rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +#define PERM4x64(c0, c1, c2, c3) c0 + (c1 << 2) + (c2 << 4) + (c3 << 6) +#define PERM2x128(c0, c1) c0 + (c1 << 4) + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((N + 4) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i diff, c3f; + __m128i a_mbase_x, max_base_x128, base_inc128, mask128; + __m128i a0_128, a1_128; + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm_set1_epi16(above[max_base_x]); + max_base_x128 = _mm_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + + a0_128 = _mm_loadu_si128((__m128i *)(above + base)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1)); + + if (upsample_above) { + a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]); + a1_128 = _mm_srli_si128(a0_128, 8); + + base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8, + base + 10, base + 12, base + 14); + shift = _mm256_srli_epi16( + _mm256_and_si256( + _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), + _mm256_set1_epi16(0x3f)), + 1); + } else { + base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4, + base + 5, base + 6, base + 7); + shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + } + a0 = _mm256_castsi128_si256(a0_128); + a1 = _mm256_castsi128_si256(a1_128); + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + res1 = _mm256_castsi256_si128(res); + + mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128); + dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128); + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int 
max_base_x = ((N + 4) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i diff; + __m128i a_mbase_x, max_base_x128, base_inc128, mask128; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm_set1_epi16(above[max_base_x]); + max_base_x128 = _mm_set1_epi32(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + + a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); + a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + if (upsample_above) { + a0 = _mm256_permutevar8x32_epi32( + a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1)); + base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6); + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), + _mm256_set1_epi32(0x3f)), + 1); + } else { + base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + } + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + res1 = _mm256_castsi256_si128(res); + res1 = _mm_packus_epi32(res1, res1); + + mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128); + mask128 = _mm_packs_epi32(mask128, mask128); // goto 16 bit + dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128); + x += dx; + } +} + +static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m128i dstvec[16]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((8 + N) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a0_1, a1_1, a32, a16; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi32(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, res1, shift; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values + } + return; + } + + a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); + 
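// Editor's aside (not part of this patch): when upsample_above is set, the
// caller has upsampled the top edge 2x by interpolation, so consecutive
// outputs sit 2 samples apart. The permutevar8x32 in the upsample branch
// below separates even samples into a0 and odd samples into a1, and
// base_inc is built with a stride of 2, so the same blend formula applies.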
a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + if (upsample_above) { + a0 = _mm256_permutevar8x32_epi32( + a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1)); + + a0_1 = + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8))); + a0_1 = _mm256_permutevar8x32_epi32( + a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1)); + + a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1); + a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1); + base_inc256 = + _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8, + base + 10, base + 12, base + 14); + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), + _mm256_set1_epi32(0x3f)), + 1); + } else { + base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + } + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + res1 = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256); + mask256 = _mm256_packs_epi32( + mask256, _mm256_castsi128_si256( + _mm256_extracti128_si256(mask256, 1))); // goto 16 bit + res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + dst[r] = _mm256_castsi256_si128(res1); + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((8 + N) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, res1, shift; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values + } + return; + } + + a0_x128 = _mm_loadu_si128((__m128i *)(above + base)); + if (upsample_above) { + __m128i mask, atmp0, atmp1, atmp2, atmp3; + a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8)); + atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]); + atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]); + atmp2 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16)); + atmp3 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16)); + mask = + _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15)); + a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); + mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16), + _mm_set1_epi8(15)); + a1_x128 = 
_mm_blendv_epi8(atmp2, atmp3, mask); + + base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6, + base + 8, base + 10, base + 12, base + 14, + 0, 0, 0, 0, 0, 0, 0, 0); + shift = _mm256_srli_epi16( + _mm256_and_si256( + _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f), + 1); + } else { + a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1)); + base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7, 0, + 0, 0, 0, 0, 0, 0, 0); + shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + } + a0 = _mm256_castsi128_si256(a0_x128); + a1 = _mm256_castsi128_si256(a1_x128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256); + dst[r] = _mm256_castsi256_si128(res1); + x += dx; + } +} + +static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m128i dstvec[32]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((16 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res[2], res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 16 values + } + return; + } + __m256i shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + + a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); + a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + + int mdif = max_base_x - base; + if (mdif > 8) { + a0_1 = + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8))); + a1_1 = + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9))); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); 
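// Editor's aside (not part of this patch): this is why each block size has
// both a 16-bit and a 32-bit ("32bit_") variant, selected on bd < 12. For
// 8- and 10-bit pixels the whole expression fits in a 16-bit lane:
// above[x] * 32 + 16 <= 1023 * 32 + 16 = 32752, and the blended result
// stays below 65536, so the cheaper epi16 path with logical shifts is safe.
// At bd == 12, above[x] * 32 can reach 4095 * 32 = 131040, which already
// overflows 16 bits, hence the epi32 arithmetic in functions like this one.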
// a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + } else { + res[1] = a_mbase_x; + } + res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), + 1); // 16 16bit values + + base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7, + base + 8, base + 9, base + 10, base + 11, + base + 12, base + 13, base + 14, base + 15); + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((16 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 16 values + } + return; + } + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + a0 = _mm256_loadu_si256((__m256i *)(above + base)); + a1 = _mm256_loadu_si256((__m256i *)(above + base + 1)); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16bit values + + base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7, + base + 8, base + 9, base + 10, base + 11, + base + 12, base + 13, base + 14, base + 15); + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256); + x += dx; + } +} + +static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m256i dstvec[64]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final 
pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res[2], res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 32 values + dstvec[i + N] = a_mbase_x; + } + return; + } + + __m256i shift = + _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1); + + for (int j = 0; j < 32; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + res1 = a_mbase_x; + } else { + a0 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + j))); + a1 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + 1 + j))); + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + if (mdif > 8) { + a0_1 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + 8 + j))); + a1_1 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + 9 + j))); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + } else { + res[1] = a_mbase_x; + } + res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), + 1); // 16 16bit values + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + } + if (!j) { + dstvec[r] = res1; + } else { + dstvec[r + N] = res1; + } + } + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = 
a_mbase_x; // save 32 values + dstvec[i + N] = a_mbase_x; + } + return; + } + + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + for (int j = 0; j < 32; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + res = a_mbase_x; + } else { + a0 = _mm256_loadu_si256((__m256i *)(above + base + j)); + a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j)); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res = _mm256_blendv_epi8(a_mbase_x, res, mask256); + } + if (!j) { + dstvec[r] = res; + } else { + dstvec[r + N] = res; + } + } + x += dx; + } +} + +static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m256i dstvec[128]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); + _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]); + } +} + +static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, + int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m256i b, res[2], res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values + _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x); + dst += stride; + } + return; + } + + __m256i shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + + __m128i a0_128, a0_1_128, a1_128, a1_1_128; + for (int j = 0; j < 64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x); + } else { + a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); + a0 = _mm256_cvtepu16_epi32(a0_128); + a1 = _mm256_cvtepu16_epi32(a1_128); + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = 
_mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + if (mdif > 8) { + a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j)); + a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j)); + a0_1 = _mm256_cvtepu16_epi32(a0_1_128); + a1_1 = _mm256_cvtepu16_epi32(a1_1_128); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + } else { + res[1] = a_mbase_x; + } + res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), + 1); // 16 16bit values + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + _mm256_storeu_si256((__m256i *)(dst + j), res1); + } + } + x += dx; + } +} + +static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m256i b, res; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values + _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x); + dst += stride; + } + return; + } + + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + for (int j = 0; j < 64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x); + } else { + a0 = _mm256_loadu_si256((__m256i *)(above + base + j)); + a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j)); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 
7, base + j + 8,
+          base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+          base + j + 13, base + j + 14, base + j + 15);
+
+        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+        _mm256_storeu_si256((__m256i *)(dst + j), res);  // 16 16bit values
+      }
+    }
+    x += dx;
+  }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
+                                      int bh, const uint16_t *above,
+                                      const uint16_t *left, int upsample_above,
+                                      int dx, int dy, int bd) {
+  (void)left;
+  (void)dy;
+
+  switch (bw) {
+    case 4:
+      highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
+                                       dx, bd);
+      break;
+    case 8:
+      highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
+                                       dx, bd);
+      break;
+    case 16:
+      highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
+                                        dx, bd);
+      break;
+    case 32:
+      highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
+                                        dx, bd);
+      break;
+    case 64:
+      if (bd < 12) {
+        highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
+                                          upsample_above, dx);
+      } else {
+        highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
+                                                upsample_above, dx);
+      }
+      break;
+    default: break;
+  }
+}
+
+static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
+                                      uint16_t *dst, ptrdiff_t pitchDst) {
+  __m256i r[16];
+  __m256i d[16];
+  for (int j = 0; j < 16; j++) {
+    r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
+  }
+  highbd_transpose16x16_avx2(r, d);
+  for (int j = 0; j < 16; j++) {
+    _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
+  }
+}
+
+static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
+                             uint16_t *dst, ptrdiff_t pitchDst, int width,
+                             int height) {
+  for (int j = 0; j < height; j += 16)
+    for (int i = 0; i < width; i += 16)
+      highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
+                                dst + j * pitchDst + i, pitchDst);
+}
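// Editor's aside -- a scalar model of the zone-1 kernels dispatched by
// av1_highbd_dr_prediction_z1_avx2 above (illustrative sketch; assumes
// upsample_above == 0, and dr_z1_scalar is a hypothetical name). x advances
// by dx 1/64-pel units per row; its integer part picks a reference pixel,
// the top 5 fraction bits blend it with its right neighbor, and positions
// past the end of the edge repeat the last pixel:

#include <stddef.h>
#include <stdint.h>

static void dr_z1_scalar(uint16_t *dst, ptrdiff_t stride, int w, int h,
                         const uint16_t *above, int dx) {
  const int max_base_x = w + h - 1;  // last readable index in above[]
  int x = dx;
  for (int r = 0; r < h; ++r, x += dx) {
    const int base = x >> 6;            // integer reference position
    const int shift = (x & 0x3f) >> 1;  // 5-bit fraction, 0..31
    for (int c = 0; c < w; ++c) {
      const int b = base + c;
      dst[r * stride + c] =
          (b < max_base_x) ? (uint16_t)((above[b] * 32 + 16 +
                                         (above[b + 1] - above[b]) * shift) >>
                                        5)
                           : above[max_base_x];
    }
  }
}

+
+static void highbd_dr_prediction_32bit_z2_Nx4_avx2(
+    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+    const uint16_t *left, int upsample_above, int upsample_left, int dx,
+    int dy) {
+  const int min_base_x = -(1 << upsample_above);
+  const int min_base_y = -(1 << upsample_left);
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+
+  assert(dx > 0);
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0_x, a1_x, a32, a16;
+  __m256i diff;
+  __m128i c3f, min_base_y128;
+
+  a16 = _mm256_set1_epi32(16);
+  c3f = _mm_set1_epi32(0x3f);
+  min_base_y128 = _mm_set1_epi32(min_base_y);
+
+  for (int r = 0; r < N; r++) {
+    __m256i b, res, shift;
+    __m128i resx, resy, resxy;
+    __m128i a0_x128, a1_x128;
+    int y = r + 1;
+    int base_x = (-y * dx) >> frac_bits_x;
+    int base_shift = 0;
+    if (base_x < (min_base_x - 1)) {
+      base_shift = (min_base_x - base_x) >> upsample_above;
+    }
+    int base_min_diff =
+        (min_base_x - base_x + upsample_above) >> upsample_above;
+    if (base_min_diff > 4) {
+      base_min_diff = 4;
+    } else {
+      if (base_min_diff < 0) base_min_diff = 0;
+    }
+
+    if (base_shift > 3) {
+      a0_x = _mm256_setzero_si256();
+      a1_x = _mm256_setzero_si256();
+      shift = _mm256_setzero_si256();
+    } else {
+      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+      if (upsample_above) {
+        a0_x128 =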
_mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx4[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm256_castsi128_si256(_mm_srli_epi32( + _mm_and_si128( + _mm_slli_epi32( + _mm_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx), + upsample_above), + c3f), + 1)); + } else { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 2); + + shift = _mm256_castsi128_si256(_mm_srli_epi32( + _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx), + c3f), + 1)); + } + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + DECLARE_ALIGNED(32, int, base_y_c[4]); + r6 = _mm_set1_epi32(r << 6); + dy128 = _mm_set1_epi32(dy); + c1234 = _mm_setr_epi32(1, 2, 3, 4); + y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128)); + base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]]); + a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1]); + + if (upsample_left) { + shifty = _mm_srli_epi32( + _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx = _mm256_castsi256_si128(res); + resx = _mm_packus_epi32(resx, resx); + + resy = _mm256_extracti128_si256(res, 1); + resy = _mm_packus_epi32(resy, resy); + + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_z2_Nx4_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a32, a16; + __m256i diff; + __m128i c3f, min_base_y128; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + __m128i a0_x128, a1_x128; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x) >> upsample_above; + } + int base_min_diff = + (min_base_x - 
base_x + upsample_above) >> upsample_above; + if (base_min_diff > 4) { + base_min_diff = 4; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 3) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + a0_x128 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx4[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, + (3 << 6) - y * dx, 0, 0, 0, 0), + upsample_above), + c3f), + 1)); + } else { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 2); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, + (3 << 6) - y * dx, 0, 0, 0, 0), + c3f), + 1)); + } + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + DECLARE_ALIGNED(32, int16_t, base_y_c[8]); + r6 = _mm_set1_epi16(r << 6); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0, + 0, 0); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm256_castsi256_si128(res); + resy = _mm256_extracti128_si256(res, 1); + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_32bit_z2_Nx8_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256; + __m256i diff; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi32(16); + c3f = 
_mm256_set1_epi32(0x3f); + min_base_y256 = _mm256_set1_epi32(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + resx = _mm_setzero_si128(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + __m128i mask, atmp0, atmp1, atmp2, atmp3; + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift)); + atmp0 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp1 = _mm_shuffle_epi8(a1_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp2 = _mm_shuffle_epi8( + a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + atmp3 = _mm_shuffle_epi8( + a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift], + _mm_set1_epi8(15)); + a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); + mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16), + _mm_set1_epi8(15)); + a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32( + _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + upsample_above), + c3f), + 1); + } else { + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, + (3 << 6) - y * dx, (4 << 6) - y * dx, + (5 << 6) - y * dx, (6 << 6) - y * dx, + (7 << 6) - y * dx), + c3f), + 1); + } + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx = _mm256_castsi256_si128(_mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } + // y calc + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int, base_y_c[8]); + __m256i r6, c256, dy256, y_c256, base_y_c256, mask256; + r6 = _mm256_set1_epi32(r << 6); + dy256 = _mm256_set1_epi32(dy); + c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]])); + a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] 
+ 1], + left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1])); + + if (upsample_left) { + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f), + 1); + } else { + shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1); + } + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy = _mm256_castsi256_si128(_mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } else { + resy = resx; + } + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_z2_Nx8_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m128i c3f, min_base_y128; + __m256i a0_x, a1_x, diff, a32, a16; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + __m128i mask, atmp0, atmp1, atmp2, atmp3; + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift)); + atmp0 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp1 = _mm_shuffle_epi8(a1_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp2 = _mm_shuffle_epi8( + a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + atmp3 = _mm_shuffle_epi8( + a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift], + _mm_set1_epi8(15)); + a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); + mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16), + _mm_set1_epi8(15)); + a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16( + _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + upsample_above), + c3f), + 1)); + } else { + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift)); + a0_x128 = + 
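/* realign the shifted load so lane i >= base_shift again holds above[base_x + i] (see HighbdLoadMaskx); the lower lanes are superseded by the left-column result below */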
_mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + c3f), + 1)); + } + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + } + + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[8]); + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + r6 = _mm_set1_epi16(r << 6); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1], + left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1]); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm256_castsi256_si128(res); + resy = _mm256_extracti128_si256(res, 1); + + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_32bit_z2_HxW_avx2( + int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1; + __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8; + __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128; + DECLARE_ALIGNED(32, int, base_y_c[16]); + + a16 = _mm256_set1_epi32(16); + c1 = _mm256_srli_epi32(a16, 4); + c8 = _mm256_srli_epi32(a16, 1); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi32(0x3f); + dy256 = _mm256_set1_epi32(dy); + c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + c1234 = _mm256_add_epi32(c0123, c1); + + for (int r = 0; r < H; r++) { 
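+    // Row r is produced in 16-pixel chunks: resx interpolates along the
+    // above row, resy (only when base_x < min_base_x) along the left
+    // column, and HighbdBaseMask[base_min_diff] blends the two at the
+    // zone-2 boundary. Per lane this is, roughly, in scalar terms:
+    //   v = ref[base] * 32 + 16 + (ref[base + 1] - ref[base]) * shift;
+    //   dst = (uint16_t)(v >> 5);  // shift = (frac & 0x3f) >> 1
+    // The epi32 arithmetic is needed because this variant serves bd == 12,
+    // where 16-bit intermediates would overflow.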
+ __m256i b, res, shift, ydx; + __m256i resx[2], resy[2]; + __m256i resxy, j256, r6; + for (int j = 0; j < W; j += 16) { + j256 = _mm256_set1_epi32(j); + int y = r + 1; + ydx = _mm256_set1_epi32(y * dx); + + int base_x = ((j << 6) - y * dx) >> frac_bits_x; + int base_shift = 0; + if ((base_x) < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1); + } + int base_min_diff = (min_base_x - base_x); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + resx[0] = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + + r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx[0] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + } + int base_shift8 = 0; + if ((base_x + 8) < (min_base_x - 1)) { + base_shift8 = (min_base_x - (base_x + 8) - 1); + } + if (base_shift8 > 7) { + resx[1] = _mm256_setzero_si256(); + } else { + a0_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8)); + a1_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9)); + a0_1_x128 = _mm_shuffle_epi8(a0_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift8]); + a1_1_x128 = _mm_shuffle_epi8(a1_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift8]); + + a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128); + a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128); + + r6 = _mm256_slli_epi32( + _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi32(a1_1_x, a0_1_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + resx[1] = _mm256_add_epi32(a32, b); + resx[1] = _mm256_srli_epi32(resx[1], 5); + resx[1] = _mm256_packus_epi32( + resx[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1))); + } + resx[0] = + _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]), + 1); // 16 16bit values + + // y calc + resy[0] = _mm256_setzero_si256(); + if ((base_x < min_base_x)) { + __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256; + r6 = _mm256_set1_epi32(r << 6); + c256 = _mm256_add_epi32(j256, c1234); + y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + c256 = _mm256_add_epi32(c256, c8); + y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = 
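/* clamp lanes whose index fell below min_base_y to 0 */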
_mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]])); + a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], + left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1])); + + shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy[0] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]], + left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]], + left[base_y_c[14]], left[base_y_c[15]])); + a1_y = _mm256_cvtepu16_epi32( + _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1], + left[base_y_c[10] + 1], left[base_y_c[11] + 1], + left[base_y_c[12] + 1], left[base_y_c[13] + 1], + left[base_y_c[14] + 1], left[base_y_c[15] + 1])); + shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1); + + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy[1] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + resy[0] = + _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]), + 1); // 16 16bit values + } + + resxy = _mm256_blendv_epi8(resx[0], resy[0], + *(__m256i *)HighbdBaseMask[base_min_diff]); + _mm256_storeu_si256((__m256i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +static void highbd_dr_prediction_z2_HxW_avx2( + int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a32, a16, c3f, c1; + __m256i diff, min_base_y256, dy256, c1234, c0123; + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + + a16 = _mm256_set1_epi16(16); + c1 = _mm256_srli_epi16(a16, 4); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi16(0x3f); + dy256 = _mm256_set1_epi16(dy); + c0123 = + _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + c1234 = _mm256_add_epi16(c0123, c1); + + for (int r = 0; r < H; r++) { + __m256i b, res, shift; + __m256i resx, resy, ydx; + __m256i resxy, j256, r6; + __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128; + int y = r + 1; + ydx = _mm256_set1_epi16((short)(y * dx)); + + for (int j = 0; j 
< W; j += 16) { + j256 = _mm256_set1_epi16(j); + int base_x = ((j << 6) - y * dx) >> frac_bits_x; + int base_shift = 0; + if ((base_x) < (min_base_x - 1)) { + base_shift = (min_base_x - (base_x)-1); + } + int base_min_diff = (min_base_x - base_x); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift < 8) { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + } else { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + } + + int base_shift1 = 0; + if (base_shift > 8) { + base_shift1 = base_shift - 8; + } + if (base_shift1 < 8) { + a0_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8)); + a1_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9)); + a0_1_x128 = _mm_shuffle_epi8(a0_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift1]); + a1_1_x128 = _mm_shuffle_epi8(a1_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift1]); + + a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1); + } + r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6); + shift = _mm256_srli_epi16( + _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + resx = _mm256_srli_epi16(res, 5); // 16 16-bit values + + // y calc + resy = _mm256_setzero_si256(); + __m256i a0_y, a1_y, shifty; + if ((base_x < min_base_x)) { + __m256i c256, y_c256, base_y_c256, mask256, mul16; + r6 = _mm256_set1_epi16(r << 6); + c256 = _mm256_add_epi16(j256, c1234); + mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256), + _mm256_srli_epi16(min_base_y256, 1)); + y_c256 = _mm256_sub_epi16(r6, mul16); + base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + base_y_c256 = _mm256_add_epi16(base_y_c256, c1); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a1_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + + shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shifty); + res = 
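/* a[x] * 32 + 16 + (a[x+1] - a[x]) * shift */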
_mm256_add_epi16(a32, b); + resy = _mm256_srli_epi16(res, 5); + } + + resxy = _mm256_blendv_epi8(resx, resy, + *(__m256i *)HighbdBaseMask[base_min_diff]); + _mm256_storeu_si256((__m256i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int dx, int dy, + int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + switch (bw) { + case 4: + if (bd < 12) { + highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + case 8: + if (bd < 12) { + highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + default: + if (bd < 12) { + highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + } +} + +// Directional prediction, zone 3 functions +static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[4], d[4]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left, + upsample_left, dy); + } + highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], + &dstvec[3], &d[0], &d[1], &d[2], &d[3]); + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); + return; +} + +static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[8], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], + &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[4], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left, + upsample_left, dy); + } + + highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + for (int i = 0; i < 8; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); + } +} + +static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t 
*left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[8], d[4]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + + highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], + &d[0], &d[1], &d[2], &d[3]); + _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]); + _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]); + _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]); + _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]); +} + +static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[8], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + highbd_transpose8x16_16x8_avx2(dstvec, d); + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 8; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), + _mm256_extracti128_si256(d[i - 8], 1)); + } +} + +static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[16], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 16; i += 8) { + highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i], + &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i], + &dstvec[6 + i], &dstvec[7 + i], &d[0 + i], + &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i], + &d[5 + i], &d[6 + i], &d[7 + i]); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]); + } +} + +static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[4], d[4], d1; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left, + upsample_left, dy); + } + highbd_transpose4x16_avx2(dstvec, d); + for (int i = 0; i < 4; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + d1 = _mm256_bsrli_epi128(d[i], 8); + _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride), + _mm256_castsi256_si128(d1)); + _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride), + _mm256_extracti128_si256(d[i], 1)); + _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride), + _mm256_extracti128_si256(d1, 1)); + } +} + +static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[16], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + highbd_transpose16x4_8x8_sse2(dstvec, d); + + _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]); + _mm_storeu_si128((__m128i 
*)(dst + 0 * stride + 8), d[1]); + _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]); + _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]); + _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]); + _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]); + _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]); + _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]); +} + +static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[16], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + + for (int i = 0; i < 16; i += 8) { + highbd_transpose8x16_16x8_avx2(dstvec + i, d + i); + } + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride), + _mm256_extracti128_si256(d[i], 1)); + } + for (int i = 8; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 8; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride), + _mm256_extracti128_si256(d[i], 1)); + } +} + +static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[32], d[32]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left, + upsample_left, dy); + } + + for (int i = 0; i < 32; i += 8) { + highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i], + &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i], + &dstvec[6 + i], &dstvec[7 + i], &d[0 + i], + &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i], + &d[5 + i], &d[6 + i], &d[7 + i]); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]); + } +} + +static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[16], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + + highbd_transpose16x16_avx2(dstvec, d); + + for (int i = 0; i < 16; i++) { + _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]); + } +} + +static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[64], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left, + upsample_left, dy); + } + highbd_transpose16x16_avx2(dstvec, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]); + } + highbd_transpose16x16_avx2(dstvec + 16, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]); + } + 
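+  // dstvec[0..31] hold the left 16 columns of the 32 z1 rows and were
+  // transposed and stored above; assuming the 32xN internal layout, the
+  // right 16 columns sit in dstvec[32..63] and become rows 16..31 of the
+  // destination: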
highbd_transpose16x16_avx2(dstvec + 32, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]); + } + highbd_transpose16x16_avx2(dstvec + 48, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]); + } +} + +static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]); + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left, + dy); + } + highbd_transpose(dstT, 64, dst, stride, 64, 64); +} + +static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[32], d[32]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 32; i += 8) { + highbd_transpose8x16_16x8_avx2(dstvec + i, d + i); + } + // store + for (int j = 0; j < 32; j += 16) { + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + j) * stride), + _mm256_castsi256_si128(d[(i + j)])); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8), + _mm256_castsi256_si128(d[(i + j) + 8])); + } + for (int i = 8; i < 16; i++) { + _mm256_storeu_si256( + (__m256i *)(dst + (i + j) * stride), + _mm256_inserti128_si256( + d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0)); + } + } +} + +static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[32], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 32; i += 16) { + highbd_transpose16x16_avx2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]); + } + } +} + +static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + uint16_t dstT[64 * 32]; + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left, + dy); + } + highbd_transpose(dstT, 64, dst, stride, 32, 64); +} + +static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]); + highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd); + highbd_transpose(dstT, 32, dst, stride, 64, 32); + return; +} + +static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]); + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left, + dy); + } + highbd_transpose(dstT, 64, dst, stride, 16, 64); +} + +static void 
highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[64], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 64; i += 16) { + highbd_transpose16x16_avx2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]); + } + } +} + +void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_left, + int dx, int dy, int bd) { + (void)above; + (void)dx; + + assert(dx == 1); + assert(dy > 0); + if (bw == bh) { + switch (bw) { + case 4: + highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 8: + highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 16: + highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 32: + highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 64: + highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + } + } else { + if (bw < bh) { + if (bw + bw == bh) { + switch (bw) { + case 4: + highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 32: + highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } else { + switch (bw) { + case 4: + highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } + } else { + if (bh + bh == bw) { + switch (bh) { + case 4: + highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 32: + highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } else { + switch (bh) { + case 4: + highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } + } + } + return; +} + +// Low bit depth functions +static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 
0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, +}; + +static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, + { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, + { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }, + { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, + { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }, + { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, +}; + +static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = { + { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, + { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 }, + { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 }, + { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 }, + { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 }, + { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 }, + { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 }, + { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 } +}; +/* clang-format off */ +static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = { + { -1, 0, 0, 0, 0, 0, 0, 0}, + { -1, -1, 0, 0, 0, 0, 0, 0}, + { -1, -1, -1, 0, 0, 0, 0, 0}, + { -1, -1, -1, -1, 0, 0, 0, 0}, + { -1, -1, -1, -1, -1, 0, 0, 0}, + { -1, -1, -1, -1, -1, -1, 0, 0}, + { -1, -1, -1, -1, -1, -1, -1, 0}, + { -1, -1, -1, -1, -1, -1, -1, -1}, +}; +/* clang-format on */ +static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2( + int H, int W, __m128i *dst, const uint8_t *above, int upsample_above, + int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((W + H) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i diff, c3f; + __m128i a_mbase_x; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm_set1_epi8(above[max_base_x]); + c3f = _mm256_set1_epi16(0x3f); + + int 
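/* fixed-point position along the above row (frac_bits fractional bits), advanced by dx per output row */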
x = dx; + for (int r = 0; r < W; r++) { + __m256i b, res, shift; + __m128i res1, a0_128, a1_128; + + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base) >> upsample_above; + if (base_max_diff <= 0) { + for (int i = r; i < W; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + if (base_max_diff > H) base_max_diff = H; + a0_128 = _mm_loadu_si128((__m128i *)(above + base)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1)); + + if (upsample_above) { + a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]); + a1_128 = _mm_srli_si128(a0_128, 8); + + shift = _mm256_srli_epi16( + _mm256_and_si256( + _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f), + 1); + } else { + shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + } + a0 = _mm256_cvtepu8_epi16(a0_128); + a1 = _mm256_cvtepu8_epi16(a1_128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + res = _mm256_packus_epi16( + res, _mm256_castsi128_si256( + _mm256_extracti128_si256(res, 1))); // goto 8 bit + res1 = _mm256_castsi256_si128(res); // 16 8bit values + + dst[r] = + _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]); + x += dx; + } +} + +static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[16]; + + dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]); + } +} + +static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[32]; + + dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[64]; + + dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2( + int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i a_mbase_x, diff, c3f; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi8(above[max_base_x]); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, res16[2]; + __m128i a0_128, a1_128; + + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base); + if (base_max_diff <= 0) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 32 values + } + return; + } + if (base_max_diff > 32) base_max_diff = 32; + __m256i shift = + 
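/* (x & 0x3f) >> 1: 5-bit interpolation weight */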
_mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + for (int j = 0, jj = 0; j < 32; j += 16, jj++) { + int mdiff = base_max_diff - j; + if (mdiff <= 0) { + res16[jj] = a_mbase_x; + } else { + a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1)); + a0 = _mm256_cvtepu8_epi16(a0_128); + a1 = _mm256_cvtepu8_epi16(a1_128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + res16[jj] = _mm256_packus_epi16( + res, _mm256_castsi128_si256( + _mm256_extracti128_si256(res, 1))); // 16 8bit values + } + } + res16[1] = + _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]), + 1); // 32 8bit values + + dstvec[r] = _mm256_blendv_epi8( + a_mbase_x, res16[1], + *(__m256i *)BaseMask[base_max_diff]); // 32 8bit values + x += dx; + } +} + +static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m256i dstvec[64]; + dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); + } +} + +static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i a_mbase_x, diff, c3f; + __m128i max_base_x128, base_inc128, mask128; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi8(above[max_base_x]); + max_base_x128 = _mm_set1_epi8(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m256i b, res; + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values + _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); + dst += stride; + } + return; + } + + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + __m128i a0_128, a1_128, res128; + for (int j = 0; j < 64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm_storeu_si128((__m128i *)(dst + j), + _mm256_castsi256_si128(a_mbase_x)); + } else { + a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); + a0 = _mm256_cvtepu8_epi16(a0_128); + a1 = _mm256_cvtepu8_epi16(a1_128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + res = _mm256_packus_epi16( + res, _mm256_castsi128_si256( + _mm256_extracti128_si256(res, 1))); // 16 8bit values + + base_inc128 = + _mm_setr_epi8((uint8_t)(base + j), (uint8_t)(base + j + 1), + (uint8_t)(base + j + 2), (uint8_t)(base + j + 3), + (uint8_t)(base + j + 4), 
(uint8_t)(base + j + 5), + (uint8_t)(base + j + 6), (uint8_t)(base + j + 7), + (uint8_t)(base + j + 8), (uint8_t)(base + j + 9), + (uint8_t)(base + j + 10), (uint8_t)(base + j + 11), + (uint8_t)(base + j + 12), (uint8_t)(base + j + 13), + (uint8_t)(base + j + 14), (uint8_t)(base + j + 15)); + + mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128), + _mm_setzero_si128()); + res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x), + _mm256_castsi256_si128(res), mask128); + _mm_storeu_si128((__m128i *)(dst + j), res128); + } + } + x += dx; + } +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int dx, int dy) { + (void)left; + (void)dy; + switch (bw) { + case 4: + dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 8: + dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 16: + dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 32: + dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 64: + dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + default: break; + } + return; +} + +static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, + int dx, int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m128i a0_x, a1_x, a32, a16, diff; + __m128i c3f, min_base_y128, c1234, dy128; + + a16 = _mm_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0); + dy128 = _mm_set1_epi16(dy); + + for (int r = 0; r < N; r++) { + __m128i b, res, shift, r6, ydx; + __m128i resx, resy, resxy; + __m128i a0_x128, a1_x128; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 4) { + base_min_diff = 4; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 3) { + a0_x = _mm_setzero_si128(); + a1_x = _mm_setzero_si128(); + shift = _mm_setzero_si128(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + ydx = _mm_set1_epi16(y * dx); + r6 = _mm_slli_epi16(c1234, 6); + + if (upsample_above) { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), + 1); + } else { + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 1); + + shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); + } + a0_x = _mm_cvtepu8_epi16(a0_x128); + a1_x = _mm_cvtepu8_epi16(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if 
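/* part of this row references the left column */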
(base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[8]); + __m128i y_c128, base_y_c128, mask128, c1234_; + c1234_ = _mm_srli_si128(c1234, 2); + r6 = _mm_set1_epi16(r << 6); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4)); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm_unpacklo_epi64(a0_x, a0_y); + a1_x = _mm_unpacklo_epi64(a1_x, a1_y); + shift = _mm_unpacklo_epi64(shift, shifty); + } + + diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm_mullo_epi16(diff, shift); + res = _mm_add_epi16(a32, b); + res = _mm_srli_epi16(res, 5); + + resx = _mm_packus_epi16(res, res); + resy = _mm_srli_si128(resx, 4); + + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + *(uint32_t *)(dst) = _mm_cvtsi128_si32(resxy); + dst += stride; + } +} + +static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, + int dx, int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i diff, a32, a16; + __m256i a0_x, a1_x; + __m128i a0_x128, a1_x128, min_base_y128, c3f; + __m128i c1234, dy128; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy, r6, ydx; + + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + ydx = _mm_set1_epi16(y * dx); + r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6); + if (upsample_above) { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), + 
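/* the final >> 1 maps the 6-bit fraction to a 5-bit weight */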
1)); + } else { + a1_x128 = _mm_srli_si128(a0_x128, 1); + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]); + + shift = _mm256_castsi128_si256( + _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1)); + } + a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128)); + a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128)); + } + + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + __m128i y_c128, base_y_c128, mask128; + r6 = _mm_set1_epi16(r << 6); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + base_y_c128 = _mm_add_epi16( + base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4)); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm_packus_epi16(_mm256_castsi256_si128(res), + _mm256_castsi256_si128(res)); + resy = _mm256_extracti128_si256(res, 1); + resy = _mm_packus_epi16(resy, resy); + + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst, + ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int upsample_above, + int upsample_left, int dx, int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123; + __m256i diff, min_base_y256, c3f, shifty, dy256, c1; + __m128i a0_x128, a1_x128; + + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + a16 = _mm256_set1_epi16(16); + c1 = _mm256_srli_epi16(a16, 4); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi16(0x3f); + dy256 = _mm256_set1_epi16(dy); + c0123 = + _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + c1234 = _mm256_add_epi16(c0123, c1); + + for (int r = 0; r < H; r++) { + __m256i b, res, shift, j256, r6, ydx; + __m128i resx, resy; + __m128i resxy; + int y = r + 1; + ydx = _mm256_set1_epi16((uint16_t)(y * dx)); + + int base_x = (-y * dx) >> frac_bits_x; + for (int j = 0; j < W; j 
+= 16) { + j256 = _mm256_set1_epi16(j); + int base_shift = 0; + if ((base_x + j) < (min_base_x - 1)) { + base_shift = (min_base_x - (base_x + j) - 1); + } + int base_min_diff = (min_base_x - base_x - j); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift < 16) { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j)); + a1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j)); + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]); + + a0_x = _mm256_cvtepu8_epi16(a0_x128); + a1_x = _mm256_cvtepu8_epi16(a1_x128); + + r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6); + shift = _mm256_srli_epi16( + _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16-bit values + resx = _mm256_castsi256_si128(_mm256_packus_epi16( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } else { + resx = _mm_setzero_si128(); + } + + // y calc + if (base_x < min_base_x) { + __m256i c256, y_c256, base_y_c256, mask256, mul16; + r6 = _mm256_set1_epi16(r << 6); + c256 = _mm256_add_epi16(j256, c1234); + mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256), + _mm256_srli_epi16(min_base_y256, 1)); + y_c256 = _mm256_sub_epi16(r6, mul16); + + base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256); + + base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256); + int16_t min_y = (int16_t)_mm_extract_epi16( + _mm256_extracti128_si256(base_y_c256, 1), 7); + int16_t max_y = + (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0); + int16_t offset_diff = max_y - min_y; + + if (offset_diff < 16) { + __m256i min_y256 = _mm256_set1_epi16(min_y); + + __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256); + __m128i base_y_offset128 = + _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0), + _mm256_extracti128_si256(base_y_offset, 1)); + + __m128i a0_y128 = _mm_maskload_epi32( + (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]); + __m128i a1_y128 = + _mm_maskload_epi32((int *)(left + min_y + 1), + *(__m128i *)LoadMaskz2[offset_diff / 4]); + a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128); + a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128); + a0_y = _mm256_cvtepu8_epi16(a0_y128); + a1_y = _mm256_cvtepu8_epi16(a1_y128); + } else { + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + base_y_c256 = _mm256_add_epi16(base_y_c256, c1); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a1_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + 
left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + } + shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shifty); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16-bit values + resy = _mm256_castsi256_si128(_mm256_packus_epi16( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } else { + resy = _mm_setzero_si128(); + } + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, + int dy) { + assert(dx > 0); + assert(dy > 0); + switch (bw) { + case 4: + dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above, + upsample_left, dx, dy); + break; + case 8: + dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above, + upsample_left, dx, dy); + break; + default: + dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + break; + } + return; +} + +// z3 functions +static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) { + __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3; + w0 = _mm_unpacklo_epi8(x[0], x[1]); + w1 = _mm_unpacklo_epi8(x[2], x[3]); + w2 = _mm_unpackhi_epi8(x[0], x[1]); + w3 = _mm_unpackhi_epi8(x[2], x[3]); + + ww0 = _mm_unpacklo_epi16(w0, w1); + ww1 = _mm_unpacklo_epi16(w2, w3); + ww2 = _mm_unpackhi_epi16(w0, w1); + ww3 = _mm_unpackhi_epi16(w2, w3); + + w0 = _mm_unpacklo_epi32(ww0, ww1); + w2 = _mm_unpacklo_epi32(ww2, ww3); + w1 = _mm_unpackhi_epi32(ww0, ww1); + w3 = _mm_unpackhi_epi32(ww2, ww3); + + d[0] = _mm_unpacklo_epi64(w0, w2); + d[1] = _mm_unpackhi_epi64(w0, w2); + d[2] = _mm_unpacklo_epi64(w1, w3); + d[3] = _mm_unpackhi_epi64(w1, w3); + + d[4] = _mm_srli_si128(d[0], 8); + d[5] = _mm_srli_si128(d[1], 8); + d[6] = _mm_srli_si128(d[2], 8); + d[7] = _mm_srli_si128(d[3], 8); + + d[8] = _mm_srli_si128(d[0], 4); + d[9] = _mm_srli_si128(d[1], 4); + d[10] = _mm_srli_si128(d[2], 4); + d[11] = _mm_srli_si128(d[3], 4); + + d[12] = _mm_srli_si128(d[0], 12); + d[13] = _mm_srli_si128(d[1], 12); + d[14] = _mm_srli_si128(d[2], 12); + d[15] = _mm_srli_si128(d[3], 12); +} + +static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m256i w10, w11, w12, w13, w14, w15; + + w0 = _mm256_unpacklo_epi8(x[0], x[1]); + w1 = _mm256_unpacklo_epi8(x[2], x[3]); + w2 = _mm256_unpacklo_epi8(x[4], x[5]); + w3 = _mm256_unpacklo_epi8(x[6], x[7]); + + w8 = _mm256_unpacklo_epi8(x[8], x[9]); + w9 = _mm256_unpacklo_epi8(x[10], x[11]); + w10 = _mm256_unpacklo_epi8(x[12], x[13]); + w11 = _mm256_unpacklo_epi8(x[14], x[15]); + + w4 = _mm256_unpacklo_epi16(w0, w1); + w5 = _mm256_unpacklo_epi16(w2, w3); + w12 = _mm256_unpacklo_epi16(w8, w9); + w13 = _mm256_unpacklo_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[0] = _mm256_unpacklo_epi64(w6, w14); + d[1] 
= _mm256_unpackhi_epi64(w6, w14); + d[2] = _mm256_unpacklo_epi64(w7, w15); + d[3] = _mm256_unpackhi_epi64(w7, w15); + + w4 = _mm256_unpackhi_epi16(w0, w1); + w5 = _mm256_unpackhi_epi16(w2, w3); + w12 = _mm256_unpackhi_epi16(w8, w9); + w13 = _mm256_unpackhi_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[4] = _mm256_unpacklo_epi64(w6, w14); + d[5] = _mm256_unpackhi_epi64(w6, w14); + d[6] = _mm256_unpacklo_epi64(w7, w15); + d[7] = _mm256_unpackhi_epi64(w7, w15); + + // upper half + w0 = _mm256_unpackhi_epi8(x[0], x[1]); + w1 = _mm256_unpackhi_epi8(x[2], x[3]); + w2 = _mm256_unpackhi_epi8(x[4], x[5]); + w3 = _mm256_unpackhi_epi8(x[6], x[7]); + + w8 = _mm256_unpackhi_epi8(x[8], x[9]); + w9 = _mm256_unpackhi_epi8(x[10], x[11]); + w10 = _mm256_unpackhi_epi8(x[12], x[13]); + w11 = _mm256_unpackhi_epi8(x[14], x[15]); + + w4 = _mm256_unpacklo_epi16(w0, w1); + w5 = _mm256_unpacklo_epi16(w2, w3); + w12 = _mm256_unpacklo_epi16(w8, w9); + w13 = _mm256_unpacklo_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[8] = _mm256_unpacklo_epi64(w6, w14); + d[9] = _mm256_unpackhi_epi64(w6, w14); + d[10] = _mm256_unpacklo_epi64(w7, w15); + d[11] = _mm256_unpackhi_epi64(w7, w15); + + w4 = _mm256_unpackhi_epi16(w0, w1); + w5 = _mm256_unpackhi_epi16(w2, w3); + w12 = _mm256_unpackhi_epi16(w8, w9); + w13 = _mm256_unpackhi_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[12] = _mm256_unpacklo_epi64(w6, w14); + d[13] = _mm256_unpackhi_epi64(w6, w14); + d[14] = _mm256_unpacklo_epi64(w7, w15); + d[15] = _mm256_unpackhi_epi64(w7, w15); +} + +static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(x[0], x[1]); + w1 = _mm_unpacklo_epi8(x[2], x[3]); + w2 = _mm_unpacklo_epi8(x[4], x[5]); + w3 = _mm_unpacklo_epi8(x[6], x[7]); + + w8 = _mm_unpacklo_epi8(x[8], x[9]); + w9 = _mm_unpacklo_epi8(x[10], x[11]); + w10 = _mm_unpacklo_epi8(x[12], x[13]); + w11 = _mm_unpacklo_epi8(x[14], x[15]); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[0] = _mm_unpacklo_epi64(w6, w14); + d[1] = _mm_unpackhi_epi64(w6, w14); + d[2] = _mm_unpacklo_epi64(w7, w15); + d[3] = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[4] = _mm_unpacklo_epi64(w6, w14); + d[5] = _mm_unpackhi_epi64(w6, w14); + d[6] = _mm_unpacklo_epi64(w7, w15); + d[7] = _mm_unpackhi_epi64(w7, w15); + + // upper half + w0 = _mm_unpackhi_epi8(x[0], x[1]); + w1 = _mm_unpackhi_epi8(x[2], x[3]); 
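+  // Editorial note (sketch, not upstream text): these transpose helpers all
+  // use the standard staged interleave -- unpack bytes, then 16-bit words,
+  // then 32-bit dwords, then 64-bit qwords -- so a 16x16 byte transpose
+  // takes log2(16) = 4 stages. The scalar effect, assuming in[]/out[] hold
+  // 16 rows of 16 bytes, is roughly:
+  //   for (int rr = 0; rr < 16; ++rr)
+  //     for (int cc = 0; cc < 16; ++cc)
+  //       out[cc][rr] = in[rr][cc];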
+ w2 = _mm_unpackhi_epi8(x[4], x[5]); + w3 = _mm_unpackhi_epi8(x[6], x[7]); + + w8 = _mm_unpackhi_epi8(x[8], x[9]); + w9 = _mm_unpackhi_epi8(x[10], x[11]); + w10 = _mm_unpackhi_epi8(x[12], x[13]); + w11 = _mm_unpackhi_epi8(x[14], x[15]); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[8] = _mm_unpacklo_epi64(w6, w14); + d[9] = _mm_unpackhi_epi64(w6, w14); + d[10] = _mm_unpacklo_epi64(w7, w15); + d[11] = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[12] = _mm_unpacklo_epi64(w6, w14); + d[13] = _mm_unpackhi_epi64(w6, w14); + d[14] = _mm_unpacklo_epi64(w7, w15); + d[15] = _mm_unpackhi_epi64(w7, w15); +} + +static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc, + uint8_t *dst, ptrdiff_t pitchDst) { + __m128i r[16]; + __m128i d[16]; + for (int j = 0; j < 16; j++) { + r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc)); + } + transpose16x16_sse2(r, d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]); + } +} + +static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst, + ptrdiff_t pitchDst, int width, int height) { + for (int j = 0; j < height; j += 16) + for (int i = 0; i < width; i += 16) + transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc, + dst + j * pitchDst + i, pitchDst); +} + +static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[4]; + + dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy); + transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &d[0], &d[1], &d[2], &d[3]); + + *(uint32_t *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]); + *(uint32_t *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]); + *(uint32_t *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]); + *(uint32_t *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]); + return; +} + +static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[8]; + + dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy); + transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], + &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2], + &d[3]); + + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8)); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8)); + _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8)); + _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]); + _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8)); +} + +static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[8]; + + 
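+  // Editorial note (sketch): zone 3 (180 < angle < 270) reuses the zone-1
+  // kernel: predict along the left edge into per-column vectors, then
+  // transpose into the destination, roughly
+  //   z1_predict(tmp, left, dy);              // tmp is a bh x bw scratch
+  //   dst[r * stride + c] = tmp[c * bh + r];  // for each (r, c)
+  // where tmp is a hypothetical scratch block; here the transpose helpers
+  // perform the same swap register-to-register.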
dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy); + transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0], + &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + for (int i = 0; i < 8; i++) { + *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); + } +} + +static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[4]; + + dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy); + transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0], + &d[1], &d[2], &d[3]); + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); +} + +static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[8]; + + dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy); + transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3, + dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d, + d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7); + for (int i = 0; i < 8; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); + _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride), + _mm_srli_si128(d[i], 8)); + } +} + +static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[16]; + + dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy); + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy); + transpose4x16_sse2(dstvec, d); + for (int i = 0; i < 16; i++) { + *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); + } +} + +static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[8]; + + dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy); + for (int i = 4; i < 8; i++) { + d[i] = _mm_setzero_si128(); + } + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + + for (int i = 0; i < 4; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i dstvec[16], d[16]; + + dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy); + for (int i = 8; i < 16; i++) { + dstvec[i] = _mm256_setzero_si256(); + } + transpose16x32_avx2(dstvec, d); + + for (int 
i = 0; i < 16; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 0; i < 16; i++) { + _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), + _mm256_extracti128_si256(d[i], 1)); + } +} + +static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[32], d[16]; + + dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy); + + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + transpose16x8_8x16_sse2( + &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16], + &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16], + &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16], + &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16], + &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8], + &d[6 + 8], &d[7 + 8]); + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]); + } +} + +static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy); + transpose16x16_sse2(dstvec, d); + + for (int i = 0; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i dstvec[32], d[32]; + + dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy); + transpose16x32_avx2(dstvec, d); + transpose16x32_avx2(dstvec + 16, d + 16); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride), + _mm256_castsi256_si128(d[j])); + _mm_storeu_si128((__m128i *)(dst + j * stride + 16), + _mm256_castsi256_si128(d[j + 16])); + } + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), + _mm256_extracti128_si256(d[j], 1)); + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), + _mm256_extracti128_si256(d[j + 16], 1)); + } +} + +static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]); + dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 64, 64); +} + +static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i dstvec[16], d[16]; + + dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy); + transpose16x32_avx2(dstvec, d); + // store + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride), + _mm256_castsi256_si128(d[j])); + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), + _mm256_extracti128_si256(d[j], 1)); + } +} + +static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[32], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy); + for (int i = 0; i < 32; i += 16) { + 
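+    // Editorial note (sketch): blocks wider than 16 are transposed one
+    // 16x16 tile at a time; the tile built from predicted columns
+    // i .. i + 15 lands at byte offset i of each output row, roughly
+    //   dst[j * stride + i + k] = tile[k][j] for 0 <= j, k < 16.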
transpose16x16_sse2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); + } + } +} + +static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[64 * 32]; + dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 32, 64); +} + +static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[32 * 64]; + dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy); + transpose(dstT, 32, dst, stride, 64, 32); + return; +} + +static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[64 * 16]; + dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 16, 64); +} + +static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[64], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy); + for (int i = 0; i < 64; i += 16) { + transpose16x16_sse2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); + } + } +} + +void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_left, int dx, int dy) { + (void)above; + (void)dx; + assert(dx == 1); + assert(dy > 0); + + if (bw == bh) { + switch (bw) { + case 4: + dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 64: + dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + if (bw < bh) { + if (bw + bw == bh) { + switch (bw) { + case 4: + dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + switch (bw) { + case 4: + dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } + } else { + if (bh + bh == bw) { + switch (bh) { + case 4: + dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + switch (bh) { + case 4: + dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy); + break; + } + } + } + } +} diff --git 
a/libs/libaom/src/aom_dsp/x86/intrapred_sse2.c b/libs/libaom/src/aom_dsp/x86/intrapred_sse2.c new file mode 100644 index 000000000..5afef68c3 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/intrapred_sse2.c @@ -0,0 +1,1411 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h>  // SSE2 +#include "aom_dsp/x86/intrapred_x86.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; i += 2) { + *(uint32_t *)dst = dc; + dst += stride; + *(uint32_t *)dst = dc; + dst += stride; + } +} + +static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_storel_epi64((__m128i *)dst, *row); + dst += stride; + } +} + +static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + dst += stride; + } +} + +static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + _mm_store_si128((__m128i *)(dst + 16), *row); + dst += stride; + } +} + +static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + _mm_store_si128((__m128i *)(dst + 16), *row); + _mm_store_si128((__m128i *)(dst + 32), *row); + _mm_store_si128((__m128i *)(dst + 48), *row); + dst += stride; + } +} + +static INLINE __m128i dc_sum_4(const uint8_t *ref) { + __m128i x = _mm_loadl_epi64((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_unpacklo_epi8(x, zero); + return _mm_sad_epu8(x, zero); +} + +static INLINE __m128i dc_sum_8(const uint8_t *ref) { + __m128i x = _mm_loadl_epi64((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + return _mm_sad_epu8(x, zero); +} + +static INLINE __m128i dc_sum_64(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32)); + __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x2 = _mm_sad_epu8(x2, zero); + x3 = _mm_sad_epu8(x3, zero); + x0 = _mm_add_epi16(x0, x1); + x2 = _mm_add_epi16(x2, x3); + x0 = _mm_add_epi16(x0, x2); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +#define DC_MULTIPLIER_1X2 0x5556 +#define DC_MULTIPLIER_1X4 0x3334 + +#define DC_SHIFT2 16 + +static INLINE int divide_using_multiply_shift(int num, int shift1, + int multiplier) { + const int interm = num >> shift1; + return interm * multiplier >> DC_SHIFT2; +} + +// ----------------------------------------------------------------------------- +// DC_PRED + +void 
aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_4(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 6; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + const uint32_t pred = _mm_cvtsi128_si32(row); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_16_sse2(left); + __m128i sum_above = dc_sum_4(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 10; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + const uint32_t pred = _mm_cvtsi128_si32(row); + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_4(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 6; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_16_sse2(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 12; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_32_sse2(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 20; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_4(left); + __m128i sum_above = dc_sum_16_sse2(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 10; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_16_sse2(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 12; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_32_sse2(left); + __m128i 
sum_above = dc_sum_16_sse2(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 24; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_64(left); + __m128i sum_above = dc_sum_16_sse2(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 40; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sum_left = dc_sum_8(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 20; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 8, dst, stride); +} + +void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sum_left = dc_sum_16_sse2(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 24; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sum_left = dc_sum_64(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 48; + sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_64(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 64; + sum /= 128; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_32_sse2(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 48; + sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_16_sse2(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 40; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 16, 
dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_4(above); + const __m128i two = _mm_set1_epi16((int16_t)2); + sum_above = _mm_add_epi16(sum_above, two); + sum_above = _mm_srai_epi16(sum_above, 2); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + sum_above = _mm_packus_epi16(sum_above, sum_above); + + const uint32_t pred = _mm_cvtsi128_si32(sum_above); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_4(above); + const __m128i two = _mm_set1_epi16((int16_t)2); + sum_above = _mm_add_epi16(sum_above, two); + sum_above = _mm_srai_epi16(sum_above, 2); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + sum_above = _mm_packus_epi16(sum_above, sum_above); + + const uint32_t pred = _mm_cvtsi128_si32(sum_above); + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16_sse2(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16_sse2(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + 
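+  // Editorial note (worked example, not upstream text): the rectangular DC
+  // predictors above avoid a true division by (w + h). For 4x8,
+  // w + h = 12 and DC_MULTIPLIER_1X2 = 0x5556 ~= (1/3) * 2^16, so with
+  // all-255 edges:
+  //   sum    = 12 * 255 + 6 = 3066;         // +6 gives round-to-nearest
+  //   interm = 3066 >> 2    = 766;          // shift1 = 2 divides by 4
+  //   dc     = (766 * 0x5556) >> 16 = 255;  // ~= 766 / 3, i.e. sum / 12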
dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16_sse2(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16_sse2(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 8, dst, stride); +} + +void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + 
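+  // Editorial note (sketch): with a power-of-two sample count no multiplier
+  // is needed; DC_TOP over 64 above samples is plain round-to-nearest,
+  //   dc = (sum_above + 32) >> 6,
+  // which the add/srai pair here computes in vector form.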
sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + sum_left = _mm_packus_epi16(sum_left, sum_left); + + const uint32_t pred = _mm_cvtsi128_si32(sum_left); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16_sse2(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + sum_left = _mm_packus_epi16(sum_left, sum_left); + + const uint32_t pred = _mm_cvtsi128_si32(sum_left); + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_4(left); + const __m128i two = _mm_set1_epi16((uint16_t)2); + sum_left = _mm_add_epi16(sum_left, two); + sum_left = _mm_srai_epi16(sum_left, 2); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16_sse2(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32_sse2(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_4(left); + const __m128i two = _mm_set1_epi16((uint16_t)2); + sum_left = 
_mm_add_epi16(sum_left, two); + sum_left = _mm_srai_epi16(sum_left, 2); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32_sse2(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 8, dst, stride); +} + +void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16_sse2(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + 
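+  // Editorial note (sketch): DC_LEFT mirrors DC_TOP with the left column,
+  // dc = (sum_left + 32) >> 6 for 64 samples; the unpack/shufflelo/
+  // unpacklo_epi64 sequence below then broadcasts the low byte of dc to
+  // all 16 lanes of `row`.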
__m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32_sse2(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16_sse2(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const uint32_t pred = 0x80808080; + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const uint32_t pred = 0x80808080; + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + 
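+  // Editorial note (sketch): DC_128 ignores both edges and fills the block
+  // with the mid-range value 1 << (bitdepth - 1) = 128 for 8-bit content;
+  // the 4-wide variants above pack the same constant as 0x80808080.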
(void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 8, dst, stride); +} + +void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// V_PRED + +void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint32_t pred = *(uint32_t *)above; + (void)left; + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint32_t pred = *(uint32_t *)above; + (void)left; + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t 
stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 64, dst, stride); +} + +static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int height) { + const __m128i row0 = _mm_load_si128((__m128i const *)above); + const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, row0); + _mm_store_si128((__m128i *)(dst + 16), row1); + dst += stride; + } +} + +void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 8); +} + +void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 16); +} + +void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 64); +} + +static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int height) { + const __m128i row0 = _mm_load_si128((__m128i const *)above); + const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32)); + const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48)); + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, row0); + _mm_store_si128((__m128i *)(dst + 16), row1); + _mm_store_si128((__m128i *)(dst + 32), row2); + _mm_store_si128((__m128i *)(dst + 48), row3); + dst += stride; + } +} + +void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 64); +} + +void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 32); +} + +void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 16); +} + +// ----------------------------------------------------------------------------- +// H_PRED + +void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = _mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + left_col = _mm_unpackhi_epi64(left_col, left_col); + row0 = _mm_shufflelo_epi16(left_col, 0); + row1 = _mm_shufflelo_epi16(left_col, 0x55); + row2 = _mm_shufflelo_epi16(left_col, 0xaa); + row3 = _mm_shufflelo_epi16(left_col, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); +} + +void 
aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_load_si128((__m128i const *)left); + __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); + __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); + + __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + + left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); + row0 = _mm_shufflelo_epi16(left_col_low, 0); + row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + + left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); +} + +void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = _mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int count) { + (void)above; + for (int i = 0; i < count; ++i) { + const __m128i left_col = _mm_load_si128((__m128i const *)left); + __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); + __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); + + __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + __m128i row3 = 
_mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); + row0 = _mm_shufflelo_epi16(left_col_low, 0); + row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + left += 16; + } +} + +void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + h_predictor_8x16xc(dst, stride, above, left, 1); +} + +void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + h_predictor_8x16xc(dst, stride, above, left, 2); +} + +static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < h; ++i) { + _mm_store_si128((__m128i *)dst, row[i]); + dst += stride; + } +} + +static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) { + const __m128i u0 = _mm_shufflelo_epi16(*x, 0); + const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55); + const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa); + const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff); + + row[0] = _mm_unpacklo_epi64(u0, u0); + row[1] = _mm_unpacklo_epi64(u1, u1); + row[2] = _mm_unpacklo_epi64(u2, u2); + row[3] = _mm_unpacklo_epi64(u3, u3); +} + +static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) { + const __m128i u0 = _mm_shufflehi_epi16(*x, 0); + const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55); + const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa); + const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff); + + row[0] = _mm_unpackhi_epi64(u0, u0); + row[1] = _mm_unpackhi_epi64(u1, u1); + row[2] = _mm_unpackhi_epi64(u2, u2); + row[3] = _mm_unpackhi_epi64(u3, u3); +} + +// Process 16x8, first 4 rows +// Use first 8 bytes of left register: xxxxxxxx33221100 +static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_low_4pixels(left, row); + h_pred_store_16xh(row, 4, dst, stride); +} + +// Process 16x8, second 4 rows +// Use second 8 bytes of left register: 77665544xxxxxxxx +static 
INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_high_4pixels(left, row); + h_pred_store_16xh(row, 4, dst, stride); +} + +void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); + const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); +} + +void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); + const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p, dst, stride); +} + +static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int count) { + int i = 0; + do { + const __m128i left_col = _mm_load_si128((const __m128i *)left); + const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p_lo, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p_lo, dst, stride); + dst += stride << 2; + + const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p_hi, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p_hi, dst, stride); + dst += stride << 2; + + left += 16; + i++; + } while (i < count); +} + +void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_16xh(dst, stride, left, 2); +} + +void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_16xh(dst, stride, left, 4); +} + +static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < h; ++i) { + _mm_store_si128((__m128i *)dst, row[i]); + _mm_store_si128((__m128i *)(dst + 16), row[i]); + dst += stride; + } +} + +// Process 32x8, first 4 rows +// Use first 8 bytes of left register: xxxxxxxx33221100 +static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_low_4pixels(left, row); + h_pred_store_32xh(row, 4, dst, stride); +} + +// Process 32x8, second 4 rows +// Use second 8 bytes of left register: 77665544xxxxxxxx +static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_high_4pixels(left, row); + h_pred_store_32xh(row, 4, dst, stride); +} + +void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + + left_col = _mm_load_si128((const __m128i *)left); + + left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); +} + +void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + + left_col = _mm_load_si128((const __m128i *)left); + + left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); + dst += 
stride << 2; + + left_col_8p = _mm_unpackhi_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); +} + +static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int height) { + int i = height >> 2; + do { + __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]); + left4 = _mm_unpacklo_epi8(left4, left4); + left4 = _mm_unpacklo_epi8(left4, left4); + const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); + const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r0); + _mm_store_si128((__m128i *)(dst + stride), r1); + _mm_store_si128((__m128i *)(dst + stride + 16), r1); + const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); + const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); + _mm_store_si128((__m128i *)(dst + stride * 2), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); + _mm_store_si128((__m128i *)(dst + stride * 3), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); + left += 4; + dst += stride * 4; + } while (--i); +} + +void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_32xh(dst, stride, left, 64); +} + +static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int height) { + int i = height >> 2; + do { + __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]); + left4 = _mm_unpacklo_epi8(left4, left4); + left4 = _mm_unpacklo_epi8(left4, left4); + const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); + const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r0); + _mm_store_si128((__m128i *)(dst + 32), r0); + _mm_store_si128((__m128i *)(dst + 48), r0); + _mm_store_si128((__m128i *)(dst + stride), r1); + _mm_store_si128((__m128i *)(dst + stride + 16), r1); + _mm_store_si128((__m128i *)(dst + stride + 32), r1); + _mm_store_si128((__m128i *)(dst + stride + 48), r1); + const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); + const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); + _mm_store_si128((__m128i *)(dst + stride * 2), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2); + _mm_store_si128((__m128i *)(dst + stride * 3), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3); + left += 4; + dst += stride * 4; + } while (--i); +} + +void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 64); +} + +void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 32); +} + +void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 16); +} diff --git a/libs/libaom/src/aom_dsp/x86/intrapred_ssse3.c b/libs/libaom/src/aom_dsp/x86/intrapred_ssse3.c new file mode 100644 index 000000000..5a34ea0c8 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/intrapred_ssse3.c @@ -0,0 +1,1695 @@ +/* + * Copyright (c) 2017, Alliance for Open 
Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/intrapred_common.h" + +// ----------------------------------------------------------------------------- +// PAETH_PRED + +// Return 8 16-bit pixels in one row +static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top, + const __m128i *topleft) { + const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft); + + __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left)); + __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top)); + __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft)); + + __m128i mask1 = _mm_cmpgt_epi16(pl, pt); + mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl)); + __m128i mask2 = _mm_cmpgt_epi16(pt, ptl); + + pl = _mm_andnot_si128(mask1, *left); + + ptl = _mm_and_si128(mask2, *topleft); + pt = _mm_andnot_si128(mask2, *top); + pt = _mm_or_si128(pt, ptl); + pt = _mm_and_si128(mask1, pt); + + return _mm_or_si128(pl, pt); +} + +void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, 
&tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + + for (int j = 0; j < 2; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +// Return 16 8-bit pixels in one row +static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0, + const __m128i *top1, + const __m128i *topleft) { + const __m128i p0 = paeth_8x1_pred(left, top0, topleft); + const __m128i p1 = paeth_8x1_pred(left, top1, topleft); + return _mm_packus_epi16(p0, p1); +} + +void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const 
uint8_t *above, const uint8_t *left) { + __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + + l = _mm_load_si128((const __m128i *)(left + 16)); + rep = _mm_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i t = _mm_load_si128((const __m128i *)above); + const 
__m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + + for (int j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + const __m128i l = _mm_loadl_epi64((const __m128i *)left); + __m128i l16; + + for (int i = 0; i < 8; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l = _mm_load_si128((const __m128i *)left); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l = _mm_load_si128((const __m128i *)left); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = 
paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + + rep = _mm_set1_epi16((short)0x8000); + l = _mm_load_si128((const __m128i *)(left + 16)); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); + const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + const __m128i cl = _mm_unpacklo_epi8(c, zero); + const __m128i ch = _mm_unpackhi_epi8(c, zero); + const __m128i dl = _mm_unpacklo_epi8(d, zero); + const __m128i dh = _mm_unpackhi_epi8(d, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 2; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); + const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const 
__m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); + const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + const __m128i cl = _mm_unpacklo_epi8(c, zero); + const __m128i ch = _mm_unpackhi_epi8(c, zero); + const __m128i dl = _mm_unpacklo_epi8(d, zero); + const __m128i dh = _mm_unpackhi_epi8(d, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); + const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); + const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + const __m128i cl = _mm_unpacklo_epi8(c, zero); + const __m128i ch = _mm_unpackhi_epi8(c, zero); + const __m128i dl = _mm_unpacklo_epi8(d, zero); + const __m128i dh = _mm_unpackhi_epi8(d, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i; + const __m128i l = _mm_load_si128((const __m128i *)left); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); + const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +// ----------------------------------------------------------------------------- +// SMOOTH_PRED + +// pixels[0]: above and below_pred interleave vector +// pixels[1]: left vector +// pixels[2]: right_pred vector +static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); + if (height == 4) + pixels[1] = 
_mm_cvtsi32_si128(((const uint32_t *)left)[0]); + else if (height == 8) + pixels[1] = _mm_loadl_epi64(((const __m128i *)left)); + else + pixels[1] = _mm_loadu_si128(((const __m128i *)left)); + + pixels[2] = _mm_set1_epi16((uint16_t)above[3]); + + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + const __m128i zero = _mm_setzero_si128(); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], second half for height = 16 only +// weight_h[3]: same as [1], second half for height = 16 only +// weight_w[0]: weights_w and scale - weights_w interleave vector +static INLINE void load_weight_w4(const uint8_t *weight_array, int height, + __m128i *weight_h, __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]); + weight_h[0] = _mm_unpacklo_epi8(t, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + + if (height == 8) { + const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + } else if (height == 16) { + const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + } +} + +static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh, + const __m128i *ww, int h, uint8_t *dst, + ptrdiff_t stride, int second_half) { + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set1_epi32(0xc080400); + __m128i rep = second_half ? 
_mm_set1_epi16((short)0x8008) + : _mm_set1_epi16((short)0x8000); + __m128i d = _mm_set1_epi16(0x100); + + for (int i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s = _mm_madd_epi16(pixel[0], wh_sc); + + __m128i b = _mm_shuffle_epi8(pixel[1], rep); + b = _mm_unpacklo_epi16(b, pixel[2]); + __m128i sum = _mm_madd_epi16(b, ww[0]); + + sum = _mm_add_epi32(s, sum); + sum = _mm_add_epi32(sum, round); + sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale); + + sum = _mm_shuffle_epi8(sum, gat); + *(uint32_t *)dst = _mm_cvtsi128_si32(sum); + dst += stride; + + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 4, pixels); + + __m128i wh[4], ww[2]; + load_weight_w4(sm_weight_arrays, 4, wh, ww); + + smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0); +} + +void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 8, pixels); + + __m128i wh[4], ww[2]; + load_weight_w4(sm_weight_arrays, 8, wh, ww); + + smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0); +} + +void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 16, pixels); + + __m128i wh[4], ww[2]; + load_weight_w4(sm_weight_arrays, 16, wh, ww); + + smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1); +} + +// pixels[0]: above and below_pred interleave vector, first half +// pixels[1]: above and below_pred interleave vector, second half +// pixels[2]: left vector +// pixels[3]: right_pred vector +// pixels[4]: above and below_pred interleave vector, first half +// pixels[5]: above and below_pred interleave vector, second half +// pixels[6]: left vector + 16 +// pixels[7]: right_pred vector +static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + const __m128i zero = _mm_setzero_si128(); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + __m128i d = _mm_loadl_epi64((const __m128i *)above); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); + pixels[1] = _mm_unpackhi_epi16(d, bp); + + pixels[3] = _mm_set1_epi16((uint16_t)above[7]); + + if (height == 4) { + pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + } else if (height == 8) { + pixels[2] = _mm_loadl_epi64((const __m128i *)left); + } else if (height == 16) { + pixels[2] = _mm_load_si128((const __m128i *)left); + } else { + pixels[2] = _mm_load_si128((const __m128i *)left); + pixels[4] = pixels[0]; + pixels[5] = pixels[1]; + pixels[6] = _mm_load_si128((const __m128i *)(left + 16)); + pixels[7] = pixels[3]; + } +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], offset 8 +// weight_h[3]: same as [1], offset 8 +// weight_h[4]: same as [0], offset 16 +// weight_h[5]: same as [1], offset 16 +// weight_h[6]: same as [0], offset 24 +// weight_h[7]: same as [1], offset 24 +// weight_w[0]: weights_w and scale - weights_w interleave vector, first half +// weight_w[1]: weights_w and scale - weights_w interleave vector, second half 
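+//
+// Illustrative use, modeled on the 8x32 caller further below (the weight
+// table is laid out so that the weights for a block dimension d start at
+// sm_weight_arrays + d, hence the offsets 4, 8, 16 and 32 read here):
+//   __m128i wh[8], ww[2];
+//   load_weight_w8(sm_weight_arrays, 32, wh, ww);  // fills wh[0..7], ww[0..1]
+//   smooth_pred_8xh(pixels, &wh[0], ww, 8, dst, stride, 0);  // rows 0-7
+//   dst += stride << 3;
+//   smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);  // rows 8-15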
+static INLINE void load_weight_w8(const uint8_t *weight_array, int height, + __m128i *weight_h, __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + const int we_offset = height < 8 ? 4 : 8; + __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]); + weight_h[0] = _mm_unpacklo_epi8(we, zero); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + + if (height == 4) { + we = _mm_srli_si128(we, 4); + __m128i tmp1 = _mm_unpacklo_epi8(we, zero); + __m128i tmp2 = _mm_sub_epi16(d, tmp1); + weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2); + weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2); + } else { + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); + } + + if (height == 16) { + we = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weight_h[0] = _mm_unpacklo_epi8(we, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(we, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + } else if (height == 32) { + const __m128i weight_lo = + _mm_loadu_si128((const __m128i *)&weight_array[32]); + weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + const __m128i weight_hi = + _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); + weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero); + weight_h[5] = _mm_sub_epi16(d, weight_h[4]); + weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero); + weight_h[7] = _mm_sub_epi16(d, weight_h[6]); + } +} + +static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh, + const __m128i *ww, int h, uint8_t *dst, + ptrdiff_t stride, int second_half) { + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + + __m128i rep = second_half ? 
_mm_set1_epi16((short)0x8008) + : _mm_set1_epi16((short)0x8000); + __m128i d = _mm_set1_epi16(0x100); + + int i; + for (i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); + __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); + + __m128i b = _mm_shuffle_epi8(pixels[2], rep); + b = _mm_unpacklo_epi16(b, pixels[3]); + __m128i sum0 = _mm_madd_epi16(b, ww[0]); + __m128i sum1 = _mm_madd_epi16(b, ww[1]); + + s0 = _mm_add_epi32(s0, sum0); + s0 = _mm_add_epi32(s0, round); + s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale); + + s1 = _mm_add_epi32(s1, sum1); + s1 = _mm_add_epi32(s1, round); + s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale); + + sum0 = _mm_packus_epi16(s0, s1); + sum0 = _mm_shuffle_epi8(sum0, gat); + _mm_storel_epi64((__m128i *)dst, sum0); + dst += stride; + + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 4, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 4, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0); +} + +void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 8, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 8, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); +} + +void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 16, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 16, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1); +} + +void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[8]; + load_pixel_w8(above, left, 32, pixels); + + __m128i wh[8], ww[2]; + load_weight_w8(sm_weight_arrays, 32, wh, ww); + + smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1); + dst += stride << 3; + smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1); +} + +static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left, uint32_t bw, + uint32_t bh) { + const uint8_t *const sm_weights_w = sm_weight_arrays + bw; + const uint8_t *const sm_weights_h = sm_weight_arrays + bh; + const __m128i zero = _mm_setzero_si128(); + const __m128i scale_value = + _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]); + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const __m128i top_right = + _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale)); + + for (uint32_t y = 0; y < bh; ++y) { + const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]); + const __m128i left_y = 
_mm_cvtsi32_si128((uint32_t)left[y]); + const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y); + __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left); + const __m128i wl_y = + _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0); + pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round); + pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0); + + for (uint32_t x = 0; x < bw; x += 8) { + const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x)); + const __m128i weights_x = + _mm_loadl_epi64((const __m128i *)(sm_weights_w + x)); + const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x); + const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero); + const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero); + + __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y); + __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y); + + const __m128i scale_m_weights_x = + _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero)); + const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right); + const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero); + const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero); + + pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl); + pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl); + + pred_lo = _mm_add_epi32(pred_lo, swxtr_lo); + pred_hi = _mm_add_epi32(pred_hi, swxtr_hi); + + pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale)); + pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale)); + + __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); + pred = _mm_shuffle_epi8(pred, gat); + _mm_storel_epi64((__m128i *)(dst + x), pred); + } + dst += stride; + } +} + +void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 4); +} + +void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 8); +} + +void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 16); +} + +void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 32); +} + +void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 8); +} + +void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 16); +} + +void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 32); +} + +void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 64); +} + +void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 64); +} + +void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 32); +} + +void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t 
stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 16); +} + +void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 64); +} + +// ----------------------------------------------------------------------------- +// SMOOTH_V_PRED + +// pixels[0]: above and below_pred interleave vector +static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + const __m128i zero = _mm_setzero_si128(); + __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); +} + +// weights[0]: weights_h vector +// weights[1]: scale - weights_h vector +static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height, + __m128i *weights) { + const __m128i zero = _mm_setzero_si128(); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + + if (height == 4) { + const __m128i weight = + _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]); + weights[0] = _mm_unpacklo_epi8(weight, zero); + weights[1] = _mm_sub_epi16(d, weights[0]); + } else if (height == 8) { + const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]); + weights[0] = _mm_unpacklo_epi8(weight, zero); + weights[1] = _mm_sub_epi16(d, weights[0]); + } else { + const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weights[0] = _mm_unpacklo_epi8(weight, zero); + weights[1] = _mm_sub_epi16(d, weights[0]); + weights[2] = _mm_unpackhi_epi8(weight, zero); + weights[3] = _mm_sub_epi16(d, weights[2]); + } +} + +static INLINE void smooth_v_pred_4xh(const __m128i *pixel, + const __m128i *weight, int h, uint8_t *dst, + ptrdiff_t stride) { + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set1_epi32(0xc080400); + __m128i d = _mm_set1_epi16(0x100); + + for (int i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i sum = _mm_madd_epi16(pixel[0], wh_sc); + sum = _mm_add_epi32(sum, pred_round); + sum = _mm_srai_epi32(sum, sm_weight_log2_scale); + sum = _mm_shuffle_epi8(sum, gat); + *(uint32_t *)dst = _mm_cvtsi128_si32(sum); + dst += stride; + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels; + load_pixel_v_w4(above, left, 4, &pixels); + + __m128i weights[2]; + load_weight_v_w4(sm_weight_arrays, 4, weights); + + smooth_v_pred_4xh(&pixels, weights, 4, dst, stride); +} + +void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels; + load_pixel_v_w4(above, left, 8, &pixels); + + __m128i weights[2]; + load_weight_v_w4(sm_weight_arrays, 8, weights); + + smooth_v_pred_4xh(&pixels, weights, 8, dst, stride); +} + +void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels; + load_pixel_v_w4(above, left, 16, &pixels); + + __m128i weights[4]; + load_weight_v_w4(sm_weight_arrays, 16, weights); + + smooth_v_pred_4xh(&pixels, weights, 8, 
dst, stride); + dst += stride << 3; + smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride); +} + +// pixels[0]: above and below_pred interleave vector, first half +// pixels[1]: above and below_pred interleave vector, second half +static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + const __m128i zero = _mm_setzero_si128(); + __m128i d = _mm_loadl_epi64((const __m128i *)above); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); + pixels[1] = _mm_unpackhi_epi16(d, bp); +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], offset 8 +// weight_h[3]: same as [1], offset 8 +// weight_h[4]: same as [0], offset 16 +// weight_h[5]: same as [1], offset 16 +// weight_h[6]: same as [0], offset 24 +// weight_h[7]: same as [1], offset 24 +static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height, + __m128i *weight_h) { + const __m128i zero = _mm_setzero_si128(); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + + if (height < 16) { + const int offset = height < 8 ? 4 : 8; + const __m128i weight = + _mm_loadu_si128((const __m128i *)&weight_array[offset]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + } else if (height == 16) { + const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + } else { + const __m128i weight_lo = + _mm_loadu_si128((const __m128i *)&weight_array[32]); + weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + const __m128i weight_hi = + _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); + weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero); + weight_h[5] = _mm_sub_epi16(d, weight_h[4]); + weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero); + weight_h[7] = _mm_sub_epi16(d, weight_h[6]); + } +} + +static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh, + int h, uint8_t *dst, ptrdiff_t stride) { + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + __m128i d = _mm_set1_epi16(0x100); + + for (int i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); + __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); + + s0 = _mm_add_epi32(s0, pred_round); + s0 = _mm_srai_epi32(s0, sm_weight_log2_scale); + + s1 = _mm_add_epi32(s1, pred_round); + s1 = _mm_srai_epi32(s1, sm_weight_log2_scale); + + __m128i sum01 = _mm_packus_epi16(s0, s1); + sum01 = _mm_shuffle_epi8(sum01, gat); + _mm_storel_epi64((__m128i *)dst, sum01); + dst += stride; + + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_v_w8(above, left, 4, pixels); + + __m128i wh[2]; + load_weight_v_w8(sm_weight_arrays, 4, wh); + + 
smooth_v_pred_8xh(pixels, wh, 4, dst, stride); +} + +void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_v_w8(above, left, 8, pixels); + + __m128i wh[2]; + load_weight_v_w8(sm_weight_arrays, 8, wh); + + smooth_v_pred_8xh(pixels, wh, 8, dst, stride); +} + +void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_v_w8(above, left, 16, pixels); + + __m128i wh[4]; + load_weight_v_w8(sm_weight_arrays, 16, wh); + + smooth_v_pred_8xh(pixels, wh, 8, dst, stride); + dst += stride << 3; + smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride); +} + +void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_v_w8(above, left, 32, pixels); + + __m128i wh[8]; + load_weight_v_w8(sm_weight_arrays, 32, wh); + + smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride); + dst += stride << 3; + smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride); + dst += stride << 3; + smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride); + dst += stride << 3; + smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride); +} + +static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left, uint32_t bw, + uint32_t bh) { + const uint8_t *const sm_weights_h = sm_weight_arrays + bh; + const __m128i zero = _mm_setzero_si128(); + const __m128i scale_value = + _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const __m128i bottom_left = + _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i round = + _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1))); + + for (uint32_t y = 0; y < bh; ++y) { + const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]); + const __m128i scale_m_weights_y = + _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16); + const __m128i wl_y = + _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0); + + for (uint32_t x = 0; x < bw; x += 8) { + const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x)); + // 8 -> 16 + const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero); + const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y); + const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y); + // top_x * weights_y + scale_m_weights_y * bottom_left + __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y); + __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y); + + pred_lo = _mm_add_epi32(pred_lo, round); + pred_hi = _mm_add_epi32(pred_hi, round); + pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale); + pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale); + + __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); + pred = _mm_shuffle_epi8(pred, gat); + _mm_storel_epi64((__m128i *)(dst + x), pred); + } + dst += stride; + } +} + +void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 4); +} + +void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 8); +} + +void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t 
*above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 16); +} + +void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 32); +} + +void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 32, 8); +} + +void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 32, 16); +} + +void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 32, 32); +} + +void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 32, 64); +} + +void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 64, 64); +} + +void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 64, 32); +} + +void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 64, 16); +} + +void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 64); +} + +// ----------------------------------------------------------------------------- +// SMOOTH_H_PRED + +// pixels[0]: left vector +// pixels[1]: right_pred vector +static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + if (height == 4) + pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + else if (height == 8) + pixels[0] = _mm_loadl_epi64(((const __m128i *)left)); + else + pixels[0] = _mm_loadu_si128(((const __m128i *)left)); + pixels[1] = _mm_set1_epi16((uint16_t)above[3]); +} + +// weights[0]: weights_w and scale - weights_w interleave vector +static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height, + __m128i *weights) { + (void)height; + const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]); + const __m128i zero = _mm_setzero_si128(); + + const __m128i weights_0 = _mm_unpacklo_epi8(t, zero); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i weights_1 = _mm_sub_epi16(d, weights_0); + weights[0] = _mm_unpacklo_epi16(weights_0, weights_1); +} + +static INLINE void smooth_h_pred_4xh(const __m128i *pixel, + const __m128i *weight, int h, uint8_t *dst, + ptrdiff_t stride) { + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + const __m128i one = _mm_set1_epi16(1); + const __m128i gat = _mm_set1_epi32(0xc080400); + __m128i rep = _mm_set1_epi16((short)0x8000); + + for (int i = 0; i < h; ++i) { + __m128i b = _mm_shuffle_epi8(pixel[0], rep); + b = _mm_unpacklo_epi16(b, pixel[1]); + __m128i sum = _mm_madd_epi16(b, weight[0]); + + sum = _mm_add_epi32(sum, pred_round); + sum = _mm_srai_epi32(sum, sm_weight_log2_scale); + + sum = _mm_shuffle_epi8(sum, gat); + *(uint32_t *)dst = 
_mm_cvtsi128_si32(sum); + dst += stride; + + rep = _mm_add_epi16(rep, one); + } +} + +void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w4(above, left, 4, pixels); + + __m128i weights; + load_weight_h_w4(sm_weight_arrays, 4, &weights); + + smooth_h_pred_4xh(pixels, &weights, 4, dst, stride); +} + +void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w4(above, left, 8, pixels); + + __m128i weights; + load_weight_h_w4(sm_weight_arrays, 8, &weights); + + smooth_h_pred_4xh(pixels, &weights, 8, dst, stride); +} + +void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w4(above, left, 16, pixels); + + __m128i weights; + load_weight_h_w4(sm_weight_arrays, 8, &weights); + + smooth_h_pred_4xh(pixels, &weights, 8, dst, stride); + dst += stride << 3; + + pixels[0] = _mm_srli_si128(pixels[0], 8); + smooth_h_pred_4xh(pixels, &weights, 8, dst, stride); +} + +// pixels[0]: left vector +// pixels[1]: right_pred vector +// pixels[2]: left vector + 16 +// pixels[3]: right_pred vector +static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + pixels[1] = _mm_set1_epi16((uint16_t)above[7]); + + if (height == 4) { + pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + } else if (height == 8) { + pixels[0] = _mm_loadl_epi64((const __m128i *)left); + } else if (height == 16) { + pixels[0] = _mm_load_si128((const __m128i *)left); + } else { + pixels[0] = _mm_load_si128((const __m128i *)left); + pixels[2] = _mm_load_si128((const __m128i *)(left + 16)); + pixels[3] = pixels[1]; + } +} + +// weight_w[0]: weights_w and scale - weights_w interleave vector, first half +// weight_w[1]: weights_w and scale - weights_w interleave vector, second half +static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height, + __m128i *weight_w) { + (void)height; + const __m128i zero = _mm_setzero_si128(); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]); + const __m128i tmp1 = _mm_unpacklo_epi8(we, zero); + const __m128i tmp2 = _mm_sub_epi16(d, tmp1); + weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2); + weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2); +} + +static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww, + int h, uint8_t *dst, ptrdiff_t stride, + int second_half) { + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + const __m128i one = _mm_set1_epi16(1); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + __m128i rep = second_half ? 
_mm_set1_epi16((short)0x8008) + : _mm_set1_epi16((short)0x8000); + + for (int i = 0; i < h; ++i) { + __m128i b = _mm_shuffle_epi8(pixels[0], rep); + b = _mm_unpacklo_epi16(b, pixels[1]); + __m128i sum0 = _mm_madd_epi16(b, ww[0]); + __m128i sum1 = _mm_madd_epi16(b, ww[1]); + + sum0 = _mm_add_epi32(sum0, pred_round); + sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale); + + sum1 = _mm_add_epi32(sum1, pred_round); + sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale); + + sum0 = _mm_packus_epi16(sum0, sum1); + sum0 = _mm_shuffle_epi8(sum0, gat); + _mm_storel_epi64((__m128i *)dst, sum0); + dst += stride; + + rep = _mm_add_epi16(rep, one); + } +} + +void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w8(above, left, 4, pixels); + + __m128i ww[2]; + load_weight_h_w8(sm_weight_arrays, 4, ww); + + smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0); +} + +void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w8(above, left, 8, pixels); + + __m128i ww[2]; + load_weight_h_w8(sm_weight_arrays, 8, ww); + + smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0); +} + +void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w8(above, left, 16, pixels); + + __m128i ww[2]; + load_weight_h_w8(sm_weight_arrays, 16, ww); + + smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1); +} + +void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[4]; + load_pixel_h_w8(above, left, 32, pixels); + + __m128i ww[2]; + load_weight_h_w8(sm_weight_arrays, 32, ww); + + smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1); + dst += stride << 3; + smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1); +} + +static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left, uint32_t bw, + uint32_t bh) { + const uint8_t *const sm_weights_w = sm_weight_arrays + bw; + const __m128i zero = _mm_setzero_si128(); + const __m128i scale_value = + _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + + for (uint32_t y = 0; y < bh; ++y) { + const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]); + const __m128i tr_ly = + _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0); + + for (uint32_t x = 0; x < bw; x += 8) { + const __m128i weights_x = + _mm_loadl_epi64((const __m128i *)(sm_weights_w + x)); + const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero); + const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw); + const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw); + const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw); + __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly); + __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly); + + pred_lo = _mm_add_epi32(pred_lo, pred_round); + pred_hi = _mm_add_epi32(pred_hi, 
pred_round); + + pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale); + pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale); + + __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); + pred = _mm_shuffle_epi8(pred, gat); + _mm_storel_epi64((__m128i *)(dst + x), pred); + } + dst += stride; + } +} + +void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 4); +} + +void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 8); +} + +void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 16); +} + +void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 32); +} + +void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 64); +} + +void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 32, 8); +} + +void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 32, 16); +} + +void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 32, 32); +} + +void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 32, 64); +} + +void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 64, 64); +} + +void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 64, 32); +} + +void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 64, 16); +} diff --git a/libs/libaom/src/aom_dsp/x86/intrapred_x86.h b/libs/libaom/src/aom_dsp/x86/intrapred_x86.h new file mode 100644 index 000000000..b13f575a7 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/intrapred_x86.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_INTRAPRED_X86_H_ +#define AOM_AOM_DSP_X86_INTRAPRED_X86_H_ + +#include <emmintrin.h> // SSE2 +#include "aom/aom_integer.h" +#include "config/aom_config.h" + +static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) { + __m128i x = _mm_load_si128((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_sad_epu8(x, zero); + const __m128i high = _mm_unpackhi_epi64(x, x); + return _mm_add_epi16(x, high); +} + +static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x0 = _mm_add_epi16(x0, x1); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +#endif // AOM_AOM_DSP_X86_INTRAPRED_X86_H_ diff --git a/libs/libaom/src/aom_dsp/x86/inv_wht_sse2.asm b/libs/libaom/src/aom_dsp/x86/inv_wht_sse2.asm new file mode 100644 index 000000000..0bc841a7a --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/inv_wht_sse2.asm @@ -0,0 +1,107 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro REORDER_INPUTS 0 + ; a c d b to a b c d + SWAP 1, 3, 2 +%endmacro + +%macro TRANSFORM_COLS 0 + ; input: + ; m0 a + ; m1 b + ; m2 c + ; m3 d + paddw m0, m2 + psubw m3, m1 + + ; wide subtract + punpcklwd m4, m0 + punpcklwd m5, m3 + psrad m4, 16 + psrad m5, 16 + psubd m4, m5 + psrad m4, 1 + packssdw m4, m4 ; e + + psubw m5, m4, m1 ; b + psubw m4, m2 ; c + psubw m0, m5 + paddw m3, m4 + ; m0 a + SWAP 1, 5 ; m1 b + SWAP 2, 4 ; m2 c + ; m3 d +%endmacro + +%macro TRANSPOSE_4X4 0 + punpcklwd m0, m2 + punpcklwd m1, m3 + mova m2, m0 + punpcklwd m0, m1 + punpckhwd m2, m1 + pshufd m1, m0, 0x0e + pshufd m3, m2, 0x0e +%endmacro + +; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3 +%macro TRANSPOSE_4X4_WIDE 0 + mova m3, m0 + punpcklwd m0, m1 + punpckhwd m3, m1 + mova m2, m0 + punpcklwd m0, m3 + punpckhwd m2, m3 + pshufd m1, m0, 0x0e + pshufd m3, m2, 0x0e +%endmacro + +%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero + movd m%3, [outputq] + movd m%4, [outputq + strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + paddw m%1, m%3 + paddw m%2, m%4 + packuswb m%1, m%5 + packuswb m%2, m%5 + movd [outputq], m%1 + movd [outputq + strideq], m%2 +%endmacro + +INIT_XMM sse2 +cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride + mova m0, [inputq + 0] + packssdw m0, [inputq + 16] + mova m1, [inputq + 32] + packssdw m1, [inputq + 48] + psraw m0, 2 + psraw m1, 2 + + TRANSPOSE_4X4_WIDE + REORDER_INPUTS + TRANSFORM_COLS + TRANSPOSE_4X4 + REORDER_INPUTS + TRANSFORM_COLS + + pxor m4, m4 + ADD_STORE_4P_2X 0, 1, 5, 6, 4 + lea outputq, [outputq + 2 * strideq] + ADD_STORE_4P_2X 2, 3, 5, 6, 4 + + RET diff --git a/libs/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c b/libs/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c new file mode 100644 index 000000000..2e3e2be10 --- /dev/null +++
b/libs/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +unsigned int aom_sad4xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i; + assert(width == 4); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; i += 4) { + __m128i x0 = xx_loadl_32(a + 0 * a_stride); + __m128i x1 = xx_loadl_32(a + 1 * a_stride); + __m128i x2 = xx_loadl_32(a + 2 * a_stride); + __m128i x3 = xx_loadl_32(a + 3 * a_stride); + __m128i x_lo = _mm_unpacklo_epi32(x0, x1); + __m128i x_hi = _mm_unpacklo_epi32(x2, x3); + + __m128i x = _mm_unpacklo_epi64(x_lo, x_hi); + + x0 = xx_loadl_32(b + 0 * b_stride); + x1 = xx_loadl_32(b + 1 * b_stride); + x2 = xx_loadl_32(b + 2 * b_stride); + x3 = xx_loadl_32(b + 3 * b_stride); + x_lo = _mm_unpacklo_epi32(x0, x1); + x_hi = _mm_unpacklo_epi32(x2, x3); + + __m128i y = _mm_unpacklo_epi64(x_lo, x_hi); + + __m128i sad4x4 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad4x4); + + a += 4 * a_stride; + b += 4 * b_stride; + } + + // At this point, we have two 32-bit partial SADs in bits [0:31] and [64:95].
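+ // Sum them by adding the upper 64-bit lane to the lower one: + // _mm_srli_si128(sad, 8) shifts the high lane down so a scalar add of the + // two _mm_cvtsi128_si32 results yields the final SAD.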
+ const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad8xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i; + assert(width == 8); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; i += 2) { + __m128i x0 = xx_loadl_64(a + 0 * a_stride); + __m128i x1 = xx_loadl_64(a + 1 * a_stride); + + __m128i x = _mm_unpacklo_epi64(x0, x1); + + x0 = xx_loadl_64(b + 0 * b_stride); + x1 = xx_loadl_64(b + 1 * b_stride); + + __m128i y = _mm_unpacklo_epi64(x0, x1); + + __m128i sad8x2 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad8x2); + + a += 2 * a_stride; + b += 2 * b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad16xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i; + assert(width == 16); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + __m128i x = xx_loadu_128(a); + __m128i y = xx_loadu_128(b); + + __m128i sad16x1 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad16x1); + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad32xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i, j; + assert(width == 32); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 2; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad32_half = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad32_half); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad64xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i, j; + assert(width == 64); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 4; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad64_quarter = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad64_quarter); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i, j; + assert(width == 128); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 8; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad64_quarter = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad64_quarter); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +#define dist_wtd_sadMxN_sse2(m, n) \ + unsigned int aom_dist_wtd_sad##m##x##n##_avg_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint8_t comp_pred[m * n]; \ + aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, 
ref, ref_stride, \ + jcp_param); \ + return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n); \ + } + +#define dist_wtd_sadMxN_avx2(m, n) \ + unsigned int aom_dist_wtd_sad##m##x##n##_avg_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint8_t comp_pred[m * n]; \ + aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ + jcp_param); \ + return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n); \ + } + +/* clang-format off */ +dist_wtd_sadMxN_sse2(128, 128) +dist_wtd_sadMxN_sse2(128, 64) +dist_wtd_sadMxN_sse2(64, 128) +dist_wtd_sadMxN_sse2(64, 64) +dist_wtd_sadMxN_sse2(64, 32) +dist_wtd_sadMxN_sse2(32, 64) +dist_wtd_sadMxN_sse2(32, 32) +dist_wtd_sadMxN_sse2(32, 16) +dist_wtd_sadMxN_sse2(16, 32) +dist_wtd_sadMxN_sse2(16, 16) +dist_wtd_sadMxN_sse2(16, 8) +dist_wtd_sadMxN_sse2(8, 16) +dist_wtd_sadMxN_sse2(8, 8) +dist_wtd_sadMxN_sse2(8, 4) +dist_wtd_sadMxN_sse2(4, 8) +dist_wtd_sadMxN_sse2(4, 4) +dist_wtd_sadMxN_sse2(4, 16) +dist_wtd_sadMxN_sse2(16, 4) +dist_wtd_sadMxN_sse2(8, 32) +dist_wtd_sadMxN_sse2(32, 8) +dist_wtd_sadMxN_sse2(16, 64) +dist_wtd_sadMxN_sse2(64, 16) + /* clang-format on */ diff --git a/libs/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c b/libs/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c new file mode 100644 index 000000000..c8b02f556 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1, + const __m128i *w, const __m128i *r, + void *const result) { + __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1); + __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w); + __m128i round_lo = _mm_add_epi16(mult_lo, *r); + __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS); + + __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1); + __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w); + __m128i round_hi = _mm_add_epi16(mult_hi, *r); + __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS); + + xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi)); +} + +void aom_dist_wtd_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { + int i; + const uint8_t w0 = (uint8_t)jcp_param->fwd_offset; + const uint8_t w1 = (uint8_t)jcp_param->bck_offset; + const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, + w1, w0, w1, w0); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + + if (width >= 16) { + // Read 16 pixels one row at a time + assert(!(width & 15)); + for (i = 0; i < height; ++i) { + int j; + for (j = 0; j < width; j += 16) { + __m128i p0 = xx_loadu_128(ref); + __m128i p1 = xx_loadu_128(pred); + + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + ref += 16; + } + ref += ref_stride - width; + } + } else if (width >= 8) { + // Read 8 pixels two rows at a time + assert(!(width & 7)); + assert(!(width & 1)); + for (i = 0; i < height; i += 2) { + __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride); + __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride); + __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); + __m128i p1 = xx_loadu_128(pred); + + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + ref += 2 * ref_stride; + } + } else { + // Read 4 pixels four rows at a time + assert(!(width & 3)); + assert(!(height & 3)); + for (i = 0; i < height; i += 4) { + const uint8_t *row0 = ref + 0 * ref_stride; + const uint8_t *row1 = ref + 1 * ref_stride; + const uint8_t *row2 = ref + 2 * ref_stride; + const uint8_t *row3 = ref + 3 * ref_stride; + + __m128i p0 = + _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1], + row1[2], row1[3], row2[0], row2[1], row2[2], row2[3], + row3[0], row3[1], row3[2], row3[3]); + __m128i p1 = xx_loadu_128(pred); + + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + ref += 4 * ref_stride; + } + } +} + +void aom_dist_wtd_comp_avg_upsampled_pred_ssse3( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int
subpel_y_q3, const uint8_t *ref, + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { + int n; + int i; + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); + /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ + assert(!(width * height & 15)); + n = width * height >> 4; + + const uint8_t w0 = (uint8_t)jcp_param->fwd_offset; + const uint8_t w1 = (uint8_t)jcp_param->bck_offset; + const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, + w1, w0, w1, w0); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + + for (i = 0; i < n; i++) { + __m128i p0 = xx_loadu_128(comp_pred); + __m128i p1 = xx_loadu_128(pred); + + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + } +} + +#define DIST_WTD_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_ssse3( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_ssse3( \ + a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_ssse3( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_dist_wtd_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \ + jcp_param); \ + \ + return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ + } + +DIST_WTD_SUBPIX_AVG_VAR(128, 128) +DIST_WTD_SUBPIX_AVG_VAR(128, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 128) +DIST_WTD_SUBPIX_AVG_VAR(64, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 64) +DIST_WTD_SUBPIX_AVG_VAR(32, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 32) +DIST_WTD_SUBPIX_AVG_VAR(16, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 8) +DIST_WTD_SUBPIX_AVG_VAR(8, 16) +DIST_WTD_SUBPIX_AVG_VAR(8, 8) +DIST_WTD_SUBPIX_AVG_VAR(8, 4) +DIST_WTD_SUBPIX_AVG_VAR(4, 8) +DIST_WTD_SUBPIX_AVG_VAR(4, 4) +DIST_WTD_SUBPIX_AVG_VAR(4, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 4) +DIST_WTD_SUBPIX_AVG_VAR(8, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 8) +DIST_WTD_SUBPIX_AVG_VAR(16, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 16) diff --git a/libs/libaom/src/aom_dsp/x86/loopfilter_sse2.c b/libs/libaom/src/aom_dsp/x86/loopfilter_sse2.c new file mode 100644 index 000000000..d534683fc --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/loopfilter_sse2.c @@ -0,0 +1,2100 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_ports/mem.h" +#include "aom_ports/emmintrin_compat.h" +#include "aom_dsp/x86/lpf_common_sse2.h" + +static INLINE __m128i abs_diff(__m128i a, __m128i b) { + return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); +} + +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them to 4x8 independently while flipping the second matrix horizontally. +// Used for 14 taps pq pairs creation +static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *q0p0, + __m128i *q1p1, __m128i *q2p2, + __m128i *q3p3, __m128i *q4p4, + __m128i *q5p5, __m128i *q6p6, + __m128i *q7p7) { + __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3; + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpackhi_epi8( + *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115 + w3 = _mm_unpackhi_epi8( + *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315 + + ww0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ww2 = _mm_unpacklo_epi16( + w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311 + ww3 = _mm_unpackhi_epi16( + w2, + w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315 + + *q7p7 = _mm_unpacklo_epi32( + ww0, + _mm_srli_si128( + ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx + *q6p6 = _mm_unpackhi_epi32( + _mm_slli_si128(ww0, 4), + ww3); // 01 11 21 31 014 114 214 314 xx xx xx xx xx xx xx xx + *q5p5 = _mm_unpackhi_epi32( + ww0, + _mm_slli_si128( + ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx xx xx xx xx xx + *q4p4 = _mm_unpacklo_epi32( + _mm_srli_si128(ww0, 12), + ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx + *q3p3 = _mm_unpacklo_epi32( + ww1, + _mm_srli_si128( + ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx + *q2p2 = _mm_unpackhi_epi32( + _mm_slli_si128(ww1, 4), + ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx + *q1p1 = _mm_unpackhi_epi32( + ww1, + _mm_slli_si128( + ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx + *q0p0 = _mm_unpacklo_epi32( + _mm_srli_si128(ww1, 12), + ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx +} + +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them independently while flipping the second matrix horizontally. Used for 14 +// taps filter pq pairs inverse +static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *pq0, __m128i *pq1, + __m128i *pq2, __m128i *pq3) { + __m128i w10, w11, w12, w13; + __m128i w0, w1, w2, w3, w4, w5; + __m128i d0, d1, d2, d3; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpacklo_epi8( + *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + w3 = _mm_unpacklo_epi8( + *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50
60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + d0 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + d2 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + w10 = _mm_unpacklo_epi8( + *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13 + w11 = _mm_unpacklo_epi8( + *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33 + w12 = _mm_unpacklo_epi8( + *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53 + w13 = _mm_unpacklo_epi8( + *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73 + + w4 = _mm_unpackhi_epi16( + w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpackhi_epi16( + w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + d1 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + d3 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + *pq0 = _mm_unpacklo_epi64(d0, d1); // pq + *pq1 = _mm_unpackhi_epi64(d0, d1); // pq + *pq2 = _mm_unpacklo_epi64(d2, d3); // pq + *pq3 = _mm_unpackhi_epi64(d2, d3); // pq +} + +static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, __m128i *ps1ps0) { + __m128i filter, filter2filter1, work; + __m128i ps1ps0_work, qs1qs0_work; + __m128i hev1; + const __m128i t3t4 = + _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4); + const __m128i t80 = _mm_set1_epi8((char)0x80); + const __m128i ff = _mm_cmpeq_epi8(t80, t80); + + ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ + qs1qs0_work = _mm_xor_si128(*q1q0, t80); + + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ + work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); + filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev); + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ + filter = _mm_and_si128(filter, *mask); /* & mask */ + filter = _mm_unpacklo_epi32(filter, filter); + + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ + filter2filter1 = + _mm_unpacklo_epi8(filter2filter1, filter2filter1); // goto 16 bit + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1); + + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ + filter = _mm_unpacklo_epi8(filter, filter); // goto 16 bit + filter = _mm_srai_epi16(filter, 9); /* round */ + filter = _mm_packs_epi16(filter, filter); + filter = _mm_andnot_si128(*hev, filter); + filter = _mm_unpacklo_epi32(filter, filter); + + filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter); + hev1 = _mm_srli_si128(filter2filter1, 8); + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ + qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ + ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); + + *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ + *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ +} + +static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, + __m128i *ps1ps0) { + const 
__m128i t3t4 = + _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); + const __m128i t80 = _mm_set1_epi8((char)0x80); + __m128i filter, filter2filter1, work; + __m128i ps1ps0_work, qs1qs0_work; + __m128i hev1; + const __m128i ff = _mm_cmpeq_epi8(t80, t80); + + ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ + qs1qs0_work = _mm_xor_si128(*q1q0, t80); + + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ + work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); + filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev); + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ + filter = _mm_and_si128(filter, *mask); /* & mask */ + filter = _mm_unpacklo_epi64(filter, filter); + + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ + filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); + filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ + filter = _mm_srai_epi16(filter, 11); /* >> 3 */ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter); + + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ + filter = _mm_unpacklo_epi8(filter, filter); + filter = _mm_srai_epi16(filter, 9); /* round */ + filter = _mm_packs_epi16(filter, filter); + filter = _mm_andnot_si128(*hev, filter); + + hev1 = _mm_unpackhi_epi64(filter2filter1, filter); + filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); + + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ + qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ + ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); + *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ + *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ +} + +static AOM_FORCE_INLINE void lpf_internal_4_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, + __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { + __m128i q1p1, q0p0, p1p0, q1q0; + __m128i abs_p0q0, abs_p1q1; + __m128i mask, flat, hev; + const __m128i zero = _mm_setzero_si128(); + + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); + q1q0 = _mm_srli_si128(p1p0, 8); + + /* (abs(q1 - q0), abs(p1 - p0) */ + flat = abs_diff(q1p1, q0p0); + /* abs(p1 - q1), abs(p0 - q0) */ + __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); + + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + hev = _mm_unpacklo_epi8(flat, zero); + + hev = _mm_cmpgt_epi16(hev, *thresh); + hev = _mm_packs_epi16(hev, hev); + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ + abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */ + abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ + + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); + mask = _mm_unpacklo_epi32(mask, flat); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, 
zero); + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4)); + + filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); +} + +static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, + __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { + __m128i q1p1, q0p0, p1p0, q1q0; + __m128i abs_p0q0, abs_p1q1; + __m128i mask, hev; + const __m128i zero = _mm_setzero_si128(); + + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); + + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + + /* (abs(q1 - q0), abs(p1 - p0) */ + __m128i flat = abs_diff(q1p1, q0p0); + /* abs(p1 - q1), abs(p0 - q0) */ + const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); + + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + hev = _mm_unpacklo_epi8(flat, zero); + + hev = _mm_cmpgt_epi16(hev, *thresh); + hev = _mm_packs_epi16(hev, hev); + + /* const int8_t mask = filter_mask2(*limit, *blimit, */ + /* p1, p0, q0, q1); */ + abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ + abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); + mask = _mm_unpacklo_epi64(mask, flat); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); + + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); +} + +void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + __m128i thresh = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + + __m128i qs1qs0, ps1ps0; + __m128i p1, p0, q0, q1; + + p1 = xx_loadl_32(s - 2 * p); + p0 = xx_loadl_32(s - 1 * p); + q0 = xx_loadl_32(s - 0 * p); + q1 = xx_loadl_32(s + 1 * p); + + lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0); + + xx_storel_32(s - 1 * p, ps1ps0); + xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4)); + xx_storel_32(s + 0 * p, qs1qs0); + xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4)); +} + +void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh) { + __m128i p1p0, q1q0; + __m128i p1, p0, q0, q1; + + const __m128i zero = _mm_setzero_si128(); + __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + __m128i thresh = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + + __m128i x0, x1, x2, x3; + __m128i d0, d1, d2, d3; + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); + + transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1); + + lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0); + + // Transpose 8x4 to 4x8 + p1 = _mm_srli_si128(p1p0, 4); + q1 = _mm_srli_si128(q1q0, 4); + + transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, 
&d0, &d1, &d2, &d3); + + xx_storel_32(s + 0 * p - 2, d0); + xx_storel_32(s + 1 * p - 2, d1); + xx_storel_32(s + 2 * p - 2, d2); + xx_storel_32(s + 3 * p - 2, d3); +} + +static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) { + xx_storel_32(s - (num + 1) * p, x); + xx_storel_32(s + num * p, _mm_srli_si128(x, 4)); +} + +static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2( + __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, + __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + __m128i mask, hev, flat, flat2; + __m128i qs0ps0, qs1ps1; + __m128i p1p0, q1q0, qs1qs0, ps1ps0; + __m128i abs_p1p0; + + p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1); + q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0; + __m128i fe, ff, work; + abs_p1p0 = abs_diff(*q1p1, *q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8((char)0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0); + qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0); + // loopfilter done + + __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + __m128i work; + flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p6, sum_q6; + __m128i sum_p3, sum_q3, res_p, res_q; + + p6_16 = _mm_unpacklo_epi8(*q6p6, zero); + p5_16 = _mm_unpacklo_epi8(*q5p5, zero); + p4_16 = _mm_unpacklo_epi8(*q4p4, zero); + p3_16 = 
_mm_unpacklo_epi8(*q3p3, zero); + p2_16 = _mm_unpacklo_epi8(*q2p2, zero); + p1_16 = _mm_unpacklo_epi8(*q1p1, zero); + p0_16 = _mm_unpacklo_epi8(*q0p0, zero); + q0_16 = _mm_unpackhi_epi8(*q0p0, zero); + q1_16 = _mm_unpackhi_epi8(*q1p1, zero); + q2_16 = _mm_unpackhi_epi8(*q2p2, zero); + q3_16 = _mm_unpackhi_epi8(*q3p3, zero); + q4_16 = _mm_unpackhi_epi8(*q4p4, zero); + q5_16 = _mm_unpackhi_epi8(*q5p5, zero); + q6_16 = _mm_unpackhi_epi8(*q6p6, zero); + pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, + _mm_add_epi16(_mm_add_epi16(p6_16, p0_16), + _mm_add_epi16(p1_16, q0_16))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, + _mm_add_epi16(_mm_add_epi16(q6_16, q0_16), + _mm_add_epi16(p0_16, q1_16))), + 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(p6_16, p6_16); + sum_q6 = _mm_add_epi16(q6_16, q6_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))), + 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + // work with flat2 + flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); + work = abs_diff(*q6p6, *q0p0); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat = _mm_unpacklo_epi64(flat, flat); + 
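// SSE2 has no byte-blend instruction, so the flat outputs are selected per + // byte with the andnot/and/or idiom: + // out = (~flat & narrow_filter_result) | (flat & flat_filter_result). +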
*q2p2 = _mm_andnot_si128(flat, *q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))), + 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))), + 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))), + 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))), + 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat2 = _mm_unpacklo_epi64(flat2, flat2); + + *q5p5 = _mm_andnot_si128(flat2, *q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5); + + *q4p4 = _mm_andnot_si128(flat2, *q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4); + + *q3p3 = _mm_andnot_si128(flat2, *q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3); + + *q2p2 = _mm_andnot_si128(flat2, *q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2); + + *q1p1 = _mm_andnot_si128(flat2, *q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1); + + *q0p0 = _mm_andnot_si128(flat2, *q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0); + } + } else { + *q0p0 
= qs0ps0; + *q1p1 = qs1ps1; + } +} + +static AOM_FORCE_INLINE void lpf_internal_14_sse2( + __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, + __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + __m128i mask, hev, flat, flat2; + __m128i flat2_pq[6], flat_pq[3]; + __m128i qs0ps0, qs1ps1; + __m128i p1p0, q1q0, qs1qs0, ps1ps0; + __m128i abs_p1p0; + + p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1); + q1q0 = _mm_srli_si128(p1p0, 8); + + __m128i fe, ff, work; + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0; + abs_p1p0 = abs_diff(*q1p1, *q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + fe = _mm_set1_epi8((char)0xfe); + ff = _mm_cmpeq_epi8(fe, fe); + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0); + qs1ps1 = _mm_srli_si128(qs0ps0, 8); + // loopfilter done + + flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pq_16[7]; + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i sum_p6; + __m128i sum_p3; + + pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero); + pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero); + pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero); + pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero); + pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero); + pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero); + pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero); + q0_16 = _mm_srli_si128(pq_16[0], 8); + q1_16 = _mm_srli_si128(pq_16[1], 8); + q2_16 = _mm_srli_si128(pq_16[2], 8); + q3_16 = _mm_srli_si128(pq_16[3], 8); + q4_16 = _mm_srli_si128(pq_16[4], 8); + q5_16 = _mm_srli_si128(pq_16[5], 8); + + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; + + __m128i work0, work0_0, work0_1, sum_p_0; + __m128i sum_p = 
_mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3])); + __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); + + __m128i sum_lq = _mm_srli_si128(sum_lp, 8); + __m128i sum_q = _mm_srli_si128(sum_p, 8); + + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0])); + flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16)); + + sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]); + sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]); + + sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]); + sum_p = _mm_sub_epi16(sum_p_0, q5_16); + + work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]); + work0_1 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0]))); + + sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]); + sum_lp = _mm_sub_epi16(sum_lp, q2_16); + + work0 = _mm_add_epi16(sum_p3, pq_16[1]); + flat_p[1] = _mm_add_epi16(sum_lp, work0); + flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + + flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); + flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); + flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]); + flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]); + + sum_lp = _mm_sub_epi16(sum_lp, q1_16); + sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]); + + sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]); + work0 = _mm_add_epi16(sum_p3, pq_16[2]); + + flat_p[2] = _mm_add_epi16(sum_lp, work0); + flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); + flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]); + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); + + work = abs_diff(*q6p6, *q0p0); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + flat2 = _mm_unpacklo_epi32(flat2, flat2); + + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_pq[0] = _mm_and_si128(flat, flat_pq[0]); + *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_pq[1] = _mm_and_si128(flat, flat_pq[1]); + *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]); + + *q2p2 = _mm_andnot_si128(flat, *q2p2); + flat_pq[2] = _mm_and_si128(flat, flat_pq[2]); + *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { + flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16)); + flat2_q[0] = _mm_add_epi16( + sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0])); + + flat2_p[1] = _mm_add_epi16(sum_p, work0_1); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); + + flat2_pq[0] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]); + flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]); + + sum_p = _mm_sub_epi16(sum_p, q4_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + work0 = _mm_add_epi16( + sum_p6, 
_mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q3_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[3]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q2_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[2]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q1_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[1]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[5] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + *q0p0 = _mm_andnot_si128(flat2, *q0p0); + flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]); + *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]); + + *q1p1 = _mm_andnot_si128(flat2, *q1p1); + flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]); + *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]); + + *q2p2 = _mm_andnot_si128(flat2, *q2p2); + flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]); + *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]); + + *q3p3 = _mm_andnot_si128(flat2, *q3p3); + flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]); + *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]); + + *q4p4 = _mm_andnot_si128(flat2, *q4p4); + flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]); + *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]); + + *q5p5 = _mm_andnot_si128(flat2, *q5p5); + flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]); + *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]); + } + } else { + *q0p0 = qs0ps0; + *q1p1 = qs1ps1; + } +} + +void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + q4p4 = _mm_unpacklo_epi32(xx_loadl_32(s - 5 * p), xx_loadl_32(s + 4 * p)); + q3p3 = _mm_unpacklo_epi32(xx_loadl_32(s - 4 * p), xx_loadl_32(s + 3 * p)); + q2p2 = _mm_unpacklo_epi32(xx_loadl_32(s - 3 * p), xx_loadl_32(s + 2 * p)); + q1p1 = _mm_unpacklo_epi32(xx_loadl_32(s - 2 * p), xx_loadl_32(s + 1 * p)); + + q0p0 = _mm_unpacklo_epi32(xx_loadl_32(s - 1 * p), xx_loadl_32(s - 0 * p)); + + q5p5 = _mm_unpacklo_epi32(xx_loadl_32(s - 6 * p), xx_loadl_32(s + 5 * p)); + + 
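An aside on the wide-filter arithmetic in the lpf_internal_14 routines above: the flat2 outputs are produced from running sums (pixelFilter_p/pixelFilter_q together with sum_p6/sum_q6) that are updated with one subtract and one add per output tap instead of re-summing all taps every time. A minimal scalar sketch of the same sliding-window idea, assuming a plain unweighted average (the real filter uses the weighted 16-bit sums and the >> 4 rounding shown above; sliding_average is a hypothetical helper, not library code):

#include <stdint.h>

/* out[k] = rounded average of the ntaps samples px[k] .. px[k + ntaps - 1].
   After the initial full sum, every further output costs one add and one
   subtract, mirroring the incremental updates of pixelFilter_p above. */
static void sliding_average(const uint8_t *px, int n, int ntaps,
                            uint8_t *out) {
  int sum = 0;
  for (int i = 0; i < ntaps; ++i) sum += px[i];  // full sum computed once
  out[0] = (uint8_t)((sum + ntaps / 2) / ntaps);
  for (int k = 1; k + ntaps <= n; ++k) {
    sum += px[k + ntaps - 1] - px[k - 1];  // slide the window by one sample
    out[k] = (uint8_t)((sum + ntaps / 2) / ntaps);
  }
}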
q6p6 = _mm_unpacklo_epi32(xx_loadl_32(s - 7 * p), xx_loadl_32(s + 6 * p)); + + lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, + &limit, &thresh); + + store_buffer_horz_8(q0p0, p, 0, s); + store_buffer_horz_8(q1p1, p, 1, s); + store_buffer_horz_8(q2p2, p, 2, s); + store_buffer_horz_8(q3p3, p, 3, s); + store_buffer_horz_8(q4p4, p, 4, s); + store_buffer_horz_8(q5p5, p, 5, s); +} + +static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2( + __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, + __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1; + __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16; + __m128i ps1ps0, qs1qs0; + + q2p2 = _mm_unpacklo_epi64(*p2, *q2); + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); + + *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8((char)0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + + { + // filter_mask and hev_mask + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = abs_diff(*p1p0, *q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + + // SSE2 has no unsigned byte comparison, so instead of testing each + // input against the limit directly we take the element-wise maximum of + // all the abs(x - y) terms (and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2); + // a saturating subtract of the limit followed by a compare against zero + // then yields the mask. + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = abs_diff(q2p2, q1p1); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); + + // flat_mask + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + } + + // 5-tap filter + // needed only if flat != 0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_shft0, workp_shft1; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 =
_mm_unpacklo_epi8(*q2, zero); + + // op1 + workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16), + _mm_add_epi16(p1_16, p1_16)); // p0 *2 + p1 * 2 + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), + p2_16); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), + 3); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_b); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_shft1 = _mm_srli_epi16(workp_a, 3); + + flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16), + p1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_add_epi16(q1_16, q2_16); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16), + p0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(q2_16, q2_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), + 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0); + *q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0); + *p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0 = _mm_or_si128(ps1ps0, *p1p0); + } +} + +static AOM_FORCE_INLINE void lpf_internal_6_sse2( + __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, + __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1; + __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16; + __m128i ps1ps0, qs1qs0; + + q2p2 = _mm_unpacklo_epi32(*p2, *q2); + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + *p1p0 = _mm_unpacklo_epi32(*p0, *p1); + *q1q0 = _mm_unpacklo_epi32(*q0, *q1); + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8((char)0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + { + // filter_mask and hev_mask + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + + abs_p0q0 = abs_diff(*p1p0, *q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + // considering sse doesn't have unsigned elements comparison the idea is + // to find at least one case when X > limit, it means the corresponding + // mask bit is set. 
+ // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = abs_diff(q2p2, q1p1); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); + + // flat_mask + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + } + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_c; + __m128i pq0x2_pq1, pq1_pq2; + pq2_16 = _mm_unpacklo_epi8(q2p2, zero); + pq1_16 = _mm_unpacklo_epi8(q1p1, zero); + pq0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_srli_si128(pq0_16, 8); + q2_16 = _mm_srli_si128(pq2_16, 8); + + // op1 + pq0x2_pq1 = + _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 *2 + p1 + pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2 + workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), + pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16); + workp_b = + _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_b = _mm_unpacklo_epi64(workp_a, workp_b); + workp_b = _mm_srli_epi16(workp_b, 3); + + flat_p1p0 = _mm_packus_epi16(workp_b, workp_b); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16), + pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_srli_si128(pq1_pq2, 8); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + // workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16), + pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(q2_16, q2_16); + workp_b = + _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + workp_a = _mm_unpacklo_epi64(workp_a, workp_b); + workp_a = _mm_srli_epi16(workp_a, 3); + + flat_q0q1 = _mm_packus_epi16(workp_a, workp_a); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0); + *q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0); + *p1p0 = _mm_and_si128(flat, flat_p1p0); + 
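The hev/mask computation just above (and in every filter variant in this file) leans on one identity the comments state tersely: SSE2 has no unsigned byte greater-than compare, so a > b is evaluated as saturating_sub(a, b) != 0, and "is any element over the limit" becomes "is the element-wise maximum over the limit". A self-contained sketch of the comparison half (gt_epu8 is a hypothetical helper name, not part of the library):

#include <emmintrin.h>  // SSE2

/* Returns 0xFF in every byte lane where a > b, lanes treated as unsigned.
   _mm_subs_epu8(a, b) saturates to zero exactly when a <= b, so comparing
   the difference against zero and inverting gives the unsigned compare. */
static __m128i gt_epu8(__m128i a, __m128i b) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);  // all-ones constant
  const __m128i diff = _mm_subs_epu8(a, b);       // 0 iff a <= b, per lane
  return _mm_xor_si128(_mm_cmpeq_epi8(diff, zero), ff);
}

The complementary test used to gate the 5-tap branch works the same way: _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero)) equals 0xffff exactly when all 16 bytes of flat are zero, so the expensive path is skipped whenever no lane needs it.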
*p1p0 = _mm_or_si128(ps1ps0, *p1p0); + } +} + +void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i p2, p1, p0, q0, q1, q2; + __m128i p1p0, q1q0; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + p2 = xx_loadl_32(s - 3 * p); + p1 = xx_loadl_32(s - 2 * p); + p0 = xx_loadl_32(s - 1 * p); + q0 = xx_loadl_32(s - 0 * p); + q1 = xx_loadl_32(s + 1 * p); + q2 = xx_loadl_32(s + 2 * p); + + lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + xx_storel_32(s - 1 * p, p1p0); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); + xx_storel_32(s + 0 * p, q1q0); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); +} + +void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i p2, p1, p0, q0, q1, q2; + __m128i p1p0, q1q0; + + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + + lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); +} + +static AOM_FORCE_INLINE void lpf_internal_8_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + __m128i *blimit, __m128i *limit, __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, + flat_p1p0, flat_q0q1; + __m128i q2p2, q1p1, q0p0; + __m128i q1q0, p1p0, ps1ps0, qs1qs0; + __m128i work_pq, opq2, pq2; + + q3p3 = _mm_unpacklo_epi32(*p3, *q3); + q2p2 = _mm_unpacklo_epi32(*p2, *q2); + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // p1p0 q1q0 + q1q0 = _mm_srli_si128(p1p0, 8); + + // filter_mask and hev_mask + + // considering sse doesn't have unsigned elements comparison the idea is to + // find at least one case when X > limit, it means the corresponding mask + // bit is set. 
+ // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8((char)0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + + // flat_mask4 + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + p3_16 = _mm_unpacklo_epi8(*p3, zero); + q3_16 = _mm_unpacklo_epi8(*q3, zero); + + // op2 + workp_a = + _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); + workp_shft2 = _mm_add_epi16(workp_a, workp_b); + + // op1 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); + workp_c = _mm_add_epi16(workp_a, workp_b); + // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); + workp_d = _mm_add_epi16(workp_a, workp_b); + // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + workp_c = _mm_unpacklo_epi64(workp_d, workp_c); + workp_c = _mm_srli_epi16(workp_c, 3); + flat_p1p0 = _mm_packus_epi16(workp_c, workp_c); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); + // workp_shft0 = 
_mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + workp_c = _mm_add_epi16(workp_a, workp_b); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16); + workp_d = _mm_add_epi16(workp_a, workp_b); + // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + workp_c = _mm_unpacklo_epi64(workp_c, workp_d); + workp_c = _mm_srli_epi16(workp_c, 3); + flat_q0q1 = _mm_packus_epi16(workp_c, workp_c); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1); + workp_c = _mm_srli_epi16(workp_c, 3); + + opq2 = _mm_packus_epi16(workp_c, workp_c); + + work_pq = _mm_andnot_si128(flat, q2p2); + pq2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_pq, pq2); + *q2 = _mm_srli_si128(*p2, 4); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } +} + +static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + __m128i *blimit, __m128i *limit, __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, + flat_p1p0, flat_q0q1; + __m128i q2p2, q1p1, q0p0; + __m128i q1q0, p1p0, ps1ps0, qs1qs0; + __m128i work_pq, opq2, pq2; + + q3p3 = _mm_unpacklo_epi64(*p3, *q3); + q2p2 = _mm_unpacklo_epi64(*p2, *q2); + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); + + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + + { + // filter_mask and hev_mask + + // considering sse doesn't have unsigned elements comparison the idea is to + // find at least one case when X > limit, it means the corresponding mask + // bit is set. 
+ // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8((char)0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + + // flat_mask4 + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + } + + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + + __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + p3_16 = _mm_unpacklo_epi8(*p3, zero); + q3_16 = _mm_unpacklo_epi8(*q3, zero); + + // op2 + workp_a = + _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); + workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op1 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), 
q1_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + opq2 = _mm_packus_epi16(workp_shft2, workp_shft1); + + work_pq = _mm_andnot_si128(flat, q2p2); + pq2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_pq, pq2); + *q2 = _mm_srli_si128(*p2, 8); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } +} + +void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + __m128i q1q0, p1p0; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + p3 = xx_loadl_32(s - 4 * p); + p2 = xx_loadl_32(s - 3 * p); + p1 = xx_loadl_32(s - 2 * p); + p0 = xx_loadl_32(s - 1 * p); + q0 = xx_loadl_32(s - 0 * p); + q1 = xx_loadl_32(s + 1 * p); + q2 = xx_loadl_32(s + 2 * p); + q3 = xx_loadl_32(s + 3 * p); + + lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, + &blimit, &limit, &thresh); + + xx_storel_32(s - 1 * p, p1p0); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); + xx_storel_32(s + 0 * p, q1q0); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); + xx_storel_32(s - 3 * p, p2); + xx_storel_32(s + 2 * p, q2); +} + +void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + __m128i thresh = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + + q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)), + _mm_loadl_epi64((__m128i *)(s + 4 * p))); + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s + 2 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); + + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s - 0 * p))); + + q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)), + _mm_loadl_epi64((__m128i *)(s + 5 * p))); + + q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)), + _mm_loadl_epi64((__m128i *)(s + 6 * p))); + + lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, + &blimit, &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8)); + 
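The dual wrappers in this neighborhood all depend on the "merged variables" layout the comments keep referring to: each __m128i carries a p-side row in its low eight bytes and the matching q-side row in its high eight bytes, so a single filter pass updates both sides of the edge at once, and the stores peel the two halves back apart with _mm_srli_si128. A minimal standalone sketch of that packing and unpacking (a hypothetical demo, not library code):

#include <emmintrin.h>
#include <stdint.h>

int main(void) {
  uint8_t prow[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  uint8_t qrow[8] = { 9, 10, 11, 12, 13, 14, 15, 16 };
  // Pack: p-row into the low half, q-row into the high half.
  __m128i q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)prow),
                                    _mm_loadl_epi64((const __m128i *)qrow));
  // ... a filter pass would operate on both halves here ...
  // Unpack: store the low half, then shift the high half down and store it.
  uint8_t out[16];
  _mm_storel_epi64((__m128i *)out, q0p0);                           // p side
  _mm_storel_epi64((__m128i *)(out + 8), _mm_srli_si128(q0p0, 8));  // q side
  return 0;
}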
_mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8)); + _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); + _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8)); + _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); + _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8)); + _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); + _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8)); + _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); + _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8)); +} + +void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i q1q0, p1p0; + + p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + + lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, + &blimit, &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); +} + +void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + __m128i p1, p0, q0, q1; + __m128i qs1qs0, ps1ps0; + + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + + const __m128i zero = _mm_setzero_si128(); + const __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + + __m128i l = _mm_unpacklo_epi64(blimit, limit); + + __m128i thresh0 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero); + + __m128i thresh1 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero); + + __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); + + lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); + + _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8)); +} + +void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, 
const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i p0, q0, q1, p1; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i qs1qs0, ps1ps0; + + const __m128i zero = _mm_setzero_si128(); + const __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + + __m128i l = _mm_unpacklo_epi64(blimit, limit); + + __m128i thresh0 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero); + + __m128i thresh1 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero); + + __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); + + x0 = _mm_loadl_epi64((__m128i *)((s - 2))); + x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p)); + + transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0, + &q1); + + lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); + + p1 = _mm_srli_si128(ps1ps0, 8); + q1 = _mm_srli_si128(qs1qs0, 8); + + transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4, + &d5, &d6, &d7); + + xx_storel_32((s - 2 + 0 * p), d0); + xx_storel_32((s - 2 + 1 * p), d1); + xx_storel_32((s - 2 + 2 * p), d2); + xx_storel_32((s - 2 + 3 * p), d3); + xx_storel_32((s - 2 + 4 * p), d4); + xx_storel_32((s - 2 + 5 * p), d5); + xx_storel_32((s - 2 + 6 * p), d6); + xx_storel_32((s - 2 + 7 * p), d7); +} + +void aom_lpf_vertical_6_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x2, x1, x0, x3; + __m128i p0, q0; + __m128i p1p0, q1q0; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p)); + x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p)); + + transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, + &d7); + + lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + p0 = _mm_srli_si128(p1p0, 4); + q0 = _mm_srli_si128(q1q0, 4); + + transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); + + xx_storel_32(s + 0 * p - 2, d0); + xx_storel_32(s + 1 * p - 2, d1); + xx_storel_32(s + 2 * p - 2, d2); + xx_storel_32(s + 3 * p - 2, d3); +} + +void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = 
_mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i p0, q0; + __m128i p1p0, q1q0; + __m128i d0d1, d2d3, d4d5, d6d7; + + x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p)); + + transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5, + &d6d7); + + d1 = _mm_srli_si128(d0d1, 8); + d3 = _mm_srli_si128(d2d3, 8); + d5 = _mm_srli_si128(d4d5, 8); + d7 = _mm_srli_si128(d6d7, 8); + + lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, + &blimit, &limit, &thresh); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); + + transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + xx_storel_32((s - 2 + 0 * p), d0); + xx_storel_32((s - 2 + 1 * p), d1); + xx_storel_32((s - 2 + 2 * p), d2); + xx_storel_32((s - 2 + 3 * p), d3); + xx_storel_32((s - 2 + 4 * p), d4); + xx_storel_32((s - 2 + 5 * p), d5); + xx_storel_32((s - 2 + 6 * p), d6); + xx_storel_32((s - 2 + 7 * p), d7); +} + +void aom_lpf_vertical_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + + __m128i p0, q0; + __m128i x2, x1, x0, x3; + __m128i q1q0, p1p0; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p)); + x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p)); + + transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, + &d7); + // Loop filtering + lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, + &blimit, &limit, &thresh); + + p0 = _mm_srli_si128(p1p0, 4); + q0 = _mm_srli_si128(q1q0, 4); + + transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1, + &d2, &d3); + + _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3); +} + +void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d1, d3, d5, d7; + __m128i q1q0, p1p0; + __m128i p1, q1; + __m128i d0d1, d2d3, d4d5, d6d7; + + x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p)); + x2 = 
_mm_loadl_epi64((__m128i *)(s - 4 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p)); + + transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5, + &d6d7); + + d1 = _mm_srli_si128(d0d1, 8); + d3 = _mm_srli_si128(d2d3, 8); + d5 = _mm_srli_si128(d4d5, 8); + d7 = _mm_srli_si128(d6d7, 8); + + lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, + &q1q0, &p1p0, &blimit, &limit, &thresh); + + p1 = _mm_srli_si128(p1p0, 8); + q1 = _mm_srli_si128(q1q0, 8); + + transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1, + &d2d3, &d4d5, &d6d7); + + _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1); + _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3); + _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5); + _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7); + _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8)); +} + +void aom_lpf_vertical_14_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i x6, x5, x4, x3; + __m128i pq0, pq1, pq2, pq3; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p)); + + transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4, + &q5p5, &q6p6, &q7p7); + + lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, + &limit, &thresh); + + transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, + &q0p0, &pq0, &pq1, &pq2, &pq3); + _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3); +} + +void aom_lpf_vertical_14_dual_sse2( + unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i x7, x6, x5, x4, x3, x2, x1, x0; + __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15; + __m128i q0, q1, q2, q3, q7; + __m128i p0p1, p2p3, p4p5, p6p7; + + __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + __m128i thresh = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + + x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); + x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * 
p)); + x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p)); + x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p)); + x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p)); + + transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3, + &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15); + + q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8)); + q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8)); + q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8)); + q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8)); + q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8)); + q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8)); + q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8)); + q7 = _mm_srli_si128(d14d15, 8); + + lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, + &blimit, &limit, &thresh); + + x0 = _mm_srli_si128(q0p0, 8); + x1 = _mm_srli_si128(q1p1, 8); + x2 = _mm_srli_si128(q2p2, 8); + x3 = _mm_srli_si128(q3p3, 8); + x4 = _mm_srli_si128(q4p4, 8); + x5 = _mm_srli_si128(q5p5, 8); + x6 = _mm_srli_si128(q6p6, 8); + + transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, + &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1, + &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7); + _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0); + _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1); + _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2); + _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3); +} diff --git a/libs/libaom/src/aom_dsp/x86/lpf_common_sse2.h b/libs/libaom/src/aom_dsp/x86/lpf_common_sse2.h new file mode 100644 index 000000000..6ed2cbfdf --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/lpf_common_sse2.h @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ +#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_config.h" + +static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5) { + __m128i w0, w1, w2, w3, w4, w5, ww0; + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + + w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 + w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 + w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + *d0 = _mm_unpacklo_epi64(ww0, w2); // 00 10 20 30 40 50 41 51 + *d1 = _mm_unpackhi_epi64(ww0, + _mm_srli_si128(w2, 4)); // 01 11 21 31 41 51 xx xx + + ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + *d2 = _mm_unpacklo_epi64(ww0, + _mm_srli_si128(w2, 8)); // 02 12 22 32 42 52 xx xx + + w3 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 xx xx xx xx + w4 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 xx xx xx xx + w5 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 xx xx xx xx + + *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4)); // 03 13 23 33 43 53 + + ww0 = _mm_unpacklo_epi32(w3, w4); // 04 14 24 34 05 15 25 35 + *d4 = _mm_unpacklo_epi64(ww0, w5); // 04 14 24 34 44 54 45 55 + *d5 = _mm_unpackhi_epi64(ww0, + _mm_slli_si128(w5, 4)); // 05 15 25 35 45 55 xx xx +} + +static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + __m128i zero = _mm_setzero_si128(); + __m128i w0, w1, ww0, ww1; + + w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 + w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + + *d0 = _mm_unpacklo_epi64(ww0, zero); // 00 10 20 30 xx xx xx xx + *d1 = _mm_unpackhi_epi64(ww0, zero); // 01 11 21 31 xx xx xx xx + *d2 = _mm_unpacklo_epi64(ww1, zero); // 02 12 22 32 xx xx xx xx + *d3 = _mm_unpackhi_epi64(ww1, zero); // 03 13 23 33 xx xx xx xx +} + +static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d4, __m128i *d5, + __m128i *d6, __m128i *d7) { + __m128i w0, w1, ww2, ww3; + __m128i zero = _mm_setzero_si128(); + + w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17 + w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37 + + ww2 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 + ww3 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 + + *d4 = _mm_unpacklo_epi64(ww2, zero); // 04 14 24 34 xx xx xx xx + *d5 = _mm_unpackhi_epi64(ww2, zero); // 05 15 25 35 xx xx xx xx + *d6 = _mm_unpacklo_epi64(ww3, zero); // 06 16 26 36 xx xx xx xx + *d7 = _mm_unpackhi_epi64(ww3, zero); // 07 17 27 37 xx xx xx xx +} + +// here in and out pointers (x and d) should be different!
we don't store their +// values inside +static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5, + __m128i *d6, __m128i *d7) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // output + // 00 10 20 30 xx xx xx xx + // 01 11 21 31 xx xx xx xx + // 02 12 22 32 xx xx xx xx + // 03 13 23 33 xx xx xx xx + // 04 14 24 34 xx xx xx xx + // 05 15 25 35 xx xx xx xx + // 06 16 26 36 xx xx xx xx + // 07 17 27 37 xx xx xx xx + highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3); + highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7); +} + +static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + __m128i w0, w1, w2, w3, ww0, ww1; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + + w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 + w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 + w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53 + w3 = _mm_unpacklo_epi16(*x6, *x7); // 60 70 61 71 62 72 63 73 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 + + *d0 = _mm_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 + *d1 = _mm_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 + + ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 + + *d2 = _mm_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 + *d3 = _mm_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 +} + +static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *d4, __m128i *d5, + __m128i *d6, __m128i *d7) { + __m128i w0, w1, w2, w3, ww0, ww1; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17 + w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37 + w2 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 46 56 47 57 + w3 = _mm_unpackhi_epi16(*x6, *x7); // 64 74 65 75 66 76 67 77 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 + ww1 = _mm_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75 + + *d4 = _mm_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74 + *d5 = _mm_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75 + + ww0 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 + ww1 = _mm_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77 + + *d6 = _mm_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76 + *d7 = _mm_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77 +} + +// here in and out pointers (x and d) should be different! 
we don't store their +// values inside +static INLINE void highbd_transpose8x8_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, + __m128i *d7) { + highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3); + highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7); +} + +// here in and out pointers (x and d arrays) should be different! we don't store +// their values inside +static INLINE void highbd_transpose8x16_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, + __m128i *d7) { + highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4, + d5, d6, d7); + highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1, + x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1, + d4 + 1, d5 + 1, d6 + 1, d7 + 1); +} + +// Low bit depth functions +static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + *d0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + + *d1 = _mm_srli_si128(*d0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(*d0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(*d0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, + __m128i *d5, __m128i *d6, + __m128i *d7) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1, ww0, ww1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + ww0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 
16 26 36 07 17 27 37 + + *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + *d1 = _mm_srli_si128(ww0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(ww0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(ww0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + *d5 = _mm_srli_si128(ww1, + 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + *d6 = _mm_srli_si128(ww1, + 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + *d7 = _mm_srli_si128(ww1, + 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0, + __m128i *d1, __m128i *d2, + __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + // output + // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx + // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx + // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx + + __m128i w0, w1, w2, w3, w4, w5; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d1 = _mm_srli_si128(*d0, 8); + *d2 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + *d3 = _mm_srli_si128(*d2, 8); +} + +static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0d1, + __m128i *d2d3, __m128i *d4d5, + __m128i *d6d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0d1 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d2d3 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + w6 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + w7 = 
_mm_unpackhi_epi16( + w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + + *d4d5 = _mm_unpacklo_epi32( + w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + *d6d7 = _mm_unpackhi_epi32( + w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 +} + +static INLINE void transpose16x8_8x16_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, + __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, + __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, *x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpacklo_epi8(*x8, *x9); + w9 = _mm_unpacklo_epi8(*x10, *x11); + w10 = _mm_unpacklo_epi8(*x12, *x13); + w11 = _mm_unpacklo_epi8(*x14, *x15); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0 = _mm_unpacklo_epi64(w6, w14); + *d1 = _mm_unpackhi_epi64(w6, w14); + *d2 = _mm_unpacklo_epi64(w7, w15); + *d3 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d4 = _mm_unpacklo_epi64(w6, w14); + *d5 = _mm_unpackhi_epi64(w6, w14); + *d6 = _mm_unpacklo_epi64(w7, w15); + *d7 = _mm_unpackhi_epi64(w7, w15); +} + +static INLINE void transpose8x16_16x8_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, + __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, + __m128i *d12d13, __m128i *d14d15) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, *x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpackhi_epi8(*x0, *x1); + w9 = _mm_unpackhi_epi8(*x2, *x3); + w10 = _mm_unpackhi_epi8(*x4, *x5); + w11 = _mm_unpackhi_epi8(*x6, *x7); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0d1 = _mm_unpacklo_epi64(w6, w14); + *d2d3 = _mm_unpackhi_epi64(w6, w14); + *d4d5 = _mm_unpacklo_epi64(w7, w15); + *d6d7 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d8d9 = _mm_unpacklo_epi64(w6, w14); + *d10d11 = _mm_unpackhi_epi64(w6, w14); 
+  *d12d13 = _mm_unpacklo_epi64(w7, w15);
+  *d14d15 = _mm_unpackhi_epi64(w7, w15);
+}
+
+#endif  // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
diff --git a/libs/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c b/libs/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c
new file mode 100644
index 000000000..8ef7ee0d7
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+
+#define MASK_SAD16XH_ONE_REF(idx)                             \
+  a = _mm_loadu_si128((const __m128i *)&ref##idx[x]);         \
+  data_l = _mm_unpacklo_epi8(a, b);                           \
+  mask_l = _mm_unpacklo_epi8(m, m_inv);                       \
+  pred_l = _mm_maddubs_epi16(data_l, mask_l);                 \
+  pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \
+                                                              \
+  data_r = _mm_unpackhi_epi8(a, b);                           \
+  mask_r = _mm_unpackhi_epi8(m, m_inv);                       \
+  pred_r = _mm_maddubs_epi16(data_r, mask_r);                 \
+  pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \
+                                                              \
+  pred = _mm_packus_epi16(pred_l, pred_r);                    \
+  res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
+
+static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *a_ptr[], int a_stride,
+                                       const uint8_t *b_ptr, int b_stride,
+                                       const uint8_t *m_ptr, int m_stride,
+                                       int width, int height, int inv_mask,
+                                       unsigned sad_array[]) {
+  int x, y;
+  __m128i a;
+  __m128i data_l, data_r, mask_l, mask_r, pred_l, pred_r, pred;
+  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+  __m128i res0 = _mm_setzero_si128();
+  __m128i res1 = _mm_setzero_si128();
+  __m128i res2 = _mm_setzero_si128();
+  __m128i res3 = _mm_setzero_si128();
+  const uint8_t *ref0 = a_ptr[0];
+  const uint8_t *ref1 = a_ptr[1];
+  const uint8_t *ref2 = a_ptr[2];
+  const uint8_t *ref3 = a_ptr[3];
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x += 16) {
+      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+      const __m128i m_copy = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
+      __m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
+      __m128i m = inv_mask ? m_inv : m_copy;
+      m_inv = inv_mask ?
m_copy : m_inv; + + MASK_SAD16XH_ONE_REF(0) + MASK_SAD16XH_ONE_REF(1) + MASK_SAD16XH_ONE_REF(2) + MASK_SAD16XH_ONE_REF(3) + } + + src_ptr += src_stride; + ref0 += a_stride; + ref1 += a_stride; + ref2 += a_stride; + ref3 += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1), + _mm_unpackhi_epi32(res0, res1)); + res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3), + _mm_unpackhi_epi32(res2, res3)); + + res0 = _mm_unpacklo_epi64(res0, res2); + _mm_storeu_si128((__m128i *)sad_array, res0); +} + +#define MASK_SAD8XH_ONE_REF(idx) \ + const __m128i a##idx##0 = _mm_loadl_epi64((__m128i *)ref##idx); \ + const __m128i a##idx##1 = _mm_loadl_epi64((__m128i *)(ref##idx + a_stride)); \ + data_l = _mm_unpacklo_epi8(a##idx##0, b0); \ + mask_l = _mm_unpacklo_epi8(m, m_inv); \ + pred_l = _mm_maddubs_epi16(data_l, mask_l); \ + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \ + \ + data_r = _mm_unpacklo_epi8(a##idx##1, b1); \ + mask_r = _mm_unpackhi_epi8(m, m_inv); \ + pred_r = _mm_maddubs_epi16(data_r, mask_r); \ + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \ + \ + pred = _mm_packus_epi16(pred_l, pred_r); \ + res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); + +void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height, + int inv_mask, unsigned sad_array[]) { + const uint8_t *ref0 = ref_array[0]; + const uint8_t *ref1 = ref_array[1]; + const uint8_t *ref2 = ref_array[2]; + const uint8_t *ref3 = ref_array[3]; + __m128i data_l, data_r, pred_l, pred_r, mask_l, mask_r, pred; + __m128i res0 = _mm_setzero_si128(); + __m128i res1 = _mm_setzero_si128(); + __m128i res2 = _mm_setzero_si128(); + __m128i res3 = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (int y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride))); + const __m128i b0 = _mm_loadl_epi64((__m128i *)b_ptr); + const __m128i b1 = _mm_loadl_epi64((__m128i *)(b_ptr + b_stride)); + const __m128i m0 = _mm_loadl_epi64((__m128i *)m_ptr); + const __m128i m1 = _mm_loadl_epi64((__m128i *)(m_ptr + m_stride)); + __m128i m_copy = _mm_unpacklo_epi64(m0, m1); + __m128i m_inv = _mm_sub_epi8(mask_max, m_copy); + __m128i m = inv_mask ? m_inv : m_copy; + m_inv = inv_mask ? 
m_copy : m_inv; + + MASK_SAD8XH_ONE_REF(0) + MASK_SAD8XH_ONE_REF(1) + MASK_SAD8XH_ONE_REF(2) + MASK_SAD8XH_ONE_REF(3) + + ref0 += 2 * a_stride; + ref1 += 2 * a_stride; + ref2 += 2 * a_stride; + ref3 += 2 * a_stride; + src_ptr += 2 * src_stride; + b_ptr += 2 * b_stride; + m_ptr += 2 * m_stride; + } + res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1), + _mm_unpackhi_epi32(res0, res1)); + res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3), + _mm_unpackhi_epi32(res2, res3)); + res0 = _mm_unpacklo_epi64(res0, res2); + _mm_storeu_si128((__m128i *)sad_array, res0); +} + +#define MASK_SAD4XH_ONE_REF(idx) \ + a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)ref##idx), \ + _mm_cvtsi32_si128(*(uint32_t *)&ref##idx[a_stride])); \ + data = _mm_unpacklo_epi8(a, b); \ + mask = _mm_unpacklo_epi8(m, m_inv); \ + pred = _mm_maddubs_epi16(data, mask); \ + pred = xx_roundn_epu16(pred, AOM_BLEND_A64_ROUND_BITS); \ + \ + pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \ + res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); + +void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height, + int inv_mask, unsigned sad_array[]) { + const uint8_t *ref0 = ref_array[0]; + const uint8_t *ref1 = ref_array[1]; + const uint8_t *ref2 = ref_array[2]; + const uint8_t *ref3 = ref_array[3]; + __m128i data, pred, mask; + __m128i res0 = _mm_setzero_si128(); + __m128i res1 = _mm_setzero_si128(); + __m128i res2 = _mm_setzero_si128(); + __m128i res3 = _mm_setzero_si128(); + __m128i a; + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (int y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(uint32_t *)src_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride])); + const __m128i b = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride])); + const __m128i m_copy = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride])); + + __m128i m_inv = _mm_sub_epi8(mask_max, m_copy); + __m128i m = inv_mask ? m_inv : m_copy; + m_inv = inv_mask ? 
m_copy : m_inv; + + MASK_SAD4XH_ONE_REF(0) + MASK_SAD4XH_ONE_REF(1) + MASK_SAD4XH_ONE_REF(2) + MASK_SAD4XH_ONE_REF(3) + + ref0 += 2 * a_stride; + ref1 += 2 * a_stride; + ref2 += 2 * a_stride; + ref3 += 2 * a_stride; + src_ptr += 2 * src_stride; + b_ptr += 2 * b_stride; + m_ptr += 2 * m_stride; + } + res0 = _mm_unpacklo_epi32(res0, res1); + res2 = _mm_unpacklo_epi32(res2, res3); + res0 = _mm_unpacklo_epi64(res0, res2); + _mm_storeu_si128((__m128i *)sad_array, res0); +} + +#define MASKSADMXN_SSSE3(m, n) \ + void aom_masked_sad##m##x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[]) { \ + masked_sadx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, m, msk, \ + msk_stride, m, n, inv_mask, sad_array); \ + } + +#define MASKSAD8XN_SSSE3(n) \ + void aom_masked_sad8x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[]) { \ + aom_masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ + 8, msk, msk_stride, n, inv_mask, sad_array); \ + } + +#define MASKSAD4XN_SSSE3(n) \ + void aom_masked_sad4x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[]) { \ + aom_masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ + 4, msk, msk_stride, n, inv_mask, sad_array); \ + } + +MASKSADMXN_SSSE3(128, 128) +MASKSADMXN_SSSE3(128, 64) +MASKSADMXN_SSSE3(64, 128) +MASKSADMXN_SSSE3(64, 64) +MASKSADMXN_SSSE3(64, 32) +MASKSADMXN_SSSE3(32, 64) +MASKSADMXN_SSSE3(32, 32) +MASKSADMXN_SSSE3(32, 16) +MASKSADMXN_SSSE3(16, 32) +MASKSADMXN_SSSE3(16, 16) +MASKSADMXN_SSSE3(16, 8) +MASKSAD8XN_SSSE3(16) +MASKSAD8XN_SSSE3(8) +MASKSAD8XN_SSSE3(4) +MASKSAD4XN_SSSE3(8) +MASKSAD4XN_SSSE3(4) +MASKSAD4XN_SSSE3(16) +MASKSADMXN_SSSE3(16, 4) +MASKSAD8XN_SSSE3(32) +MASKSADMXN_SSSE3(32, 8) +MASKSADMXN_SSSE3(16, 64) +MASKSADMXN_SSSE3(64, 16) diff --git a/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c b/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c new file mode 100644 index 000000000..60f0ab339 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+
+static INLINE unsigned int masked_sad32xh_avx2(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int width, int height) {
+  int x, y;
+  __m256i res = _mm256_setzero_si256();
+  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m256i round_scale =
+      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x += 32) {
+      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
+      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
+      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
+      const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
+      const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
+
+      // Calculate 16 predicted pixels.
+      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+      // is 64 * 255, so we have plenty of space to add rounding constants.
+      const __m256i data_l = _mm256_unpacklo_epi8(a, b);
+      const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
+      __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
+      pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
+
+      const __m256i data_r = _mm256_unpackhi_epi8(a, b);
+      const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
+      __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
+      pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
+
+      const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
+      res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
+    }
+
+    src_ptr += src_stride;
+    a_ptr += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+  // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
+  res = _mm256_shuffle_epi32(res, 0xd8);
+  res = _mm256_permute4x64_epi64(res, 0xd8);
+  res = _mm256_hadd_epi32(res, res);
+  res = _mm256_hadd_epi32(res, res);
+  int32_t sad = _mm256_extract_epi32(res, 0);
+  return sad;
+}
+
+static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) {
+  __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo));
+  __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi));
+  __m256i a = _mm256_castsi128_si256(a0);
+  return _mm256_inserti128_si256(a, a1, 1);
+}
+
+static INLINE unsigned int masked_sad16xh_avx2(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int height) {
+  int y;
+  __m256i res = _mm256_setzero_si256();
+  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m256i round_scale =
+      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  for (y = 0; y < height; y += 2) {
+    const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
+    const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
+    const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+    const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr);
+    const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
+
+    // Calculate 16 predicted pixels.
+    // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+    // is 64 * 255, so we have plenty of space to add rounding constants.
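+    // A sketch of the arithmetic, assuming AOM_BLEND_A64_ROUND_BITS == 6:
+    // each predicted pixel is the A64 blend
+    //   pred = (m * a + (64 - m) * b + 32) >> 6.
+    // _mm256_maddubs_epi16 forms m * a + (64 - m) * b from the interleaved
+    // bytes, and _mm256_mulhrs_epi16 with round_scale = 1 << (15 - 6)
+    // computes (x * 2^9 + 2^14) >> 15 == (x + 32) >> 6, i.e. the rounded
+    // shift without a separate add.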
+ const __m256i data_l = _mm256_unpacklo_epi8(a, b); + const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); + __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); + pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); + + const __m256i data_r = _mm256_unpackhi_epi8(a, b); + const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); + __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); + pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); + + const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); + res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); + + src_ptr += src_stride << 1; + a_ptr += a_stride << 1; + b_ptr += b_stride << 1; + m_ptr += m_stride << 1; + } + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + res = _mm256_shuffle_epi32(res, 0xd8); + res = _mm256_permute4x64_epi64(res, 0xd8); + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int32_t sad = _mm256_extract_epi32(res, 0); + return sad; +} + +static INLINE unsigned int aom_masked_sad_avx2( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, + int invert_mask, int m, int n) { + unsigned int sad; + if (!invert_mask) { + switch (m) { + case 4: + sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 8: + sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 16: + sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred, + m, msk, msk_stride, n); + break; + default: + sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred, + m, msk, msk_stride, m, n); + break; + } + } else { + switch (m) { + case 4: + sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 8: + sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 16: + sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + default: + sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, m, n); + break; + } + } + return sad; +} + +#define MASKSADMXN_AVX2(m, n) \ + unsigned int aom_masked_sad##m##x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \ + msk, msk_stride, invert_mask, m, n); \ + } + +MASKSADMXN_AVX2(4, 4) +MASKSADMXN_AVX2(4, 8) +MASKSADMXN_AVX2(8, 4) +MASKSADMXN_AVX2(8, 8) +MASKSADMXN_AVX2(8, 16) +MASKSADMXN_AVX2(16, 8) +MASKSADMXN_AVX2(16, 16) +MASKSADMXN_AVX2(16, 32) +MASKSADMXN_AVX2(32, 16) +MASKSADMXN_AVX2(32, 32) +MASKSADMXN_AVX2(32, 64) +MASKSADMXN_AVX2(64, 32) +MASKSADMXN_AVX2(64, 64) +MASKSADMXN_AVX2(64, 128) +MASKSADMXN_AVX2(128, 64) +MASKSADMXN_AVX2(128, 128) +MASKSADMXN_AVX2(4, 16) +MASKSADMXN_AVX2(16, 4) +MASKSADMXN_AVX2(8, 32) +MASKSADMXN_AVX2(32, 8) +MASKSADMXN_AVX2(16, 64) +MASKSADMXN_AVX2(64, 16) + +static INLINE unsigned int highbd_masked_sad8xh_avx2( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const 
uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i one = _mm256_set1_epi16(1); + + for (y = 0; y < height; y += 2) { + const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); + const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr); + const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); + // Zero-extend mask to 16 bits + const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(m_ptr)), + _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)))); + const __m256i m_inv = _mm256_sub_epi16(mask_max, m); + + const __m256i data_l = _mm256_unpacklo_epi16(a, b); + const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); + __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); + pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m256i data_r = _mm256_unpackhi_epi16(a, b); + const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); + __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); + pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); + res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); + + src_ptr += src_stride << 1; + a_ptr += a_stride << 1; + b_ptr += b_stride << 1; + m_ptr += m_stride << 1; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. 
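+  // Each _mm256_hadd_epi32 sums adjacent 32-bit pairs within each 128-bit
+  // lane, so after two passes lane 0 holds the total of the low lane and
+  // lane 4 the total of the high lane; the two extracts below add them.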
+ res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); + return sad; +} + +static INLINE unsigned int highbd_masked_sad16xh_avx2( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int x, y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i one = _mm256_set1_epi16(1); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); + const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); + const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); + // Zero-extend mask to 16 bits + const __m256i m = + _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x])); + const __m256i m_inv = _mm256_sub_epi16(mask_max, m); + + const __m256i data_l = _mm256_unpacklo_epi16(a, b); + const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); + __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); + pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m256i data_r = _mm256_unpackhi_epi16(a, b); + const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); + __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); + pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); + res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. 
+ res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); + return sad; +} + +static INLINE unsigned int aom_highbd_masked_sad_avx2( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, + int invert_mask, int m, int n) { + unsigned int sad; + if (!invert_mask) { + switch (m) { + case 4: + sad = + aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 8: + sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + default: + sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, m, n); + break; + } + } else { + switch (m) { + case 4: + sad = + aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 8: + sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + default: + sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, m, n); + break; + } + } + return sad; +} + +#define HIGHBD_MASKSADMXN_AVX2(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \ + second_pred8, msk, msk_stride, \ + invert_mask, m, n); \ + } + +HIGHBD_MASKSADMXN_AVX2(4, 4); +HIGHBD_MASKSADMXN_AVX2(4, 8); +HIGHBD_MASKSADMXN_AVX2(8, 4); +HIGHBD_MASKSADMXN_AVX2(8, 8); +HIGHBD_MASKSADMXN_AVX2(8, 16); +HIGHBD_MASKSADMXN_AVX2(16, 8); +HIGHBD_MASKSADMXN_AVX2(16, 16); +HIGHBD_MASKSADMXN_AVX2(16, 32); +HIGHBD_MASKSADMXN_AVX2(32, 16); +HIGHBD_MASKSADMXN_AVX2(32, 32); +HIGHBD_MASKSADMXN_AVX2(32, 64); +HIGHBD_MASKSADMXN_AVX2(64, 32); +HIGHBD_MASKSADMXN_AVX2(64, 64); +HIGHBD_MASKSADMXN_AVX2(64, 128); +HIGHBD_MASKSADMXN_AVX2(128, 64); +HIGHBD_MASKSADMXN_AVX2(128, 128); +HIGHBD_MASKSADMXN_AVX2(4, 16); +HIGHBD_MASKSADMXN_AVX2(16, 4); +HIGHBD_MASKSADMXN_AVX2(8, 32); +HIGHBD_MASKSADMXN_AVX2(32, 8); +HIGHBD_MASKSADMXN_AVX2(16, 64); +HIGHBD_MASKSADMXN_AVX2(64, 16); diff --git a/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c b/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c new file mode 100644 index 000000000..716827796 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <stdio.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+
+// For width a multiple of 16
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *a_ptr, int a_stride,
+                                            const uint8_t *b_ptr, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int width, int height);
+
+#define MASKSADMXN_SSSE3(m, n)                                                \
+  unsigned int aom_masked_sad##m##x##n##_ssse3(                               \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                 \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,         \
+      int msk_stride, int invert_mask) {                                      \
+    if (!invert_mask)                                                         \
+      return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred, \
+                              m, msk, msk_stride, m, n);                      \
+    else                                                                      \
+      return masked_sad_ssse3(src, src_stride, second_pred, m, ref,           \
+                              ref_stride, msk, msk_stride, m, n);             \
+  }
+
+#define MASKSAD8XN_SSSE3(n)                                                 \
+  unsigned int aom_masked_sad8x##n##_ssse3(                                 \
+      const uint8_t *src, int src_stride, const uint8_t *ref,               \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,       \
+      int msk_stride, int invert_mask) {                                    \
+    if (!invert_mask)                                                       \
+      return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,      \
+                                     second_pred, 8, msk, msk_stride, n);   \
+    else                                                                    \
+      return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref,  \
+                                     ref_stride, msk, msk_stride, n);       \
+  }
+
+#define MASKSAD4XN_SSSE3(n)                                                 \
+  unsigned int aom_masked_sad4x##n##_ssse3(                                 \
+      const uint8_t *src, int src_stride, const uint8_t *ref,               \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,       \
+      int msk_stride, int invert_mask) {                                    \
+    if (!invert_mask)                                                       \
+      return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,      \
+                                     second_pred, 4, msk, msk_stride, n);   \
+    else                                                                    \
+      return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref,  \
+                                     ref_stride, msk, msk_stride, n);       \
+  }
+
+MASKSADMXN_SSSE3(128, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(64, 128)
+MASKSADMXN_SSSE3(64, 64)
+MASKSADMXN_SSSE3(64, 32)
+MASKSADMXN_SSSE3(32, 64)
+MASKSADMXN_SSSE3(32, 32)
+MASKSADMXN_SSSE3(32, 16)
+MASKSADMXN_SSSE3(16, 32)
+MASKSADMXN_SSSE3(16, 16)
+MASKSADMXN_SSSE3(16, 8)
+MASKSAD8XN_SSSE3(16)
+MASKSAD8XN_SSSE3(8)
+MASKSAD8XN_SSSE3(4)
+MASKSAD4XN_SSSE3(8)
+MASKSAD4XN_SSSE3(4)
+MASKSAD4XN_SSSE3(16)
+MASKSADMXN_SSSE3(16, 4)
+MASKSAD8XN_SSSE3(32)
+MASKSADMXN_SSSE3(32, 8)
+MASKSADMXN_SSSE3(16, 64)
+MASKSADMXN_SSSE3(64, 16)
+
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *a_ptr, int a_stride,
+                                            const uint8_t *b_ptr, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int width, int height) {
+  int x, y;
+  __m128i res = _mm_setzero_si128();
+  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x += 16) {
+      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
+      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+      const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
+      const __m128i m_inv = _mm_sub_epi8(mask_max, m);
+
+      // Calculate 16 predicted pixels.
+      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+      // is 64 * 255, so we have plenty of space to add rounding constants.
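+      // Here xx_roundn_epu16(x, AOM_BLEND_A64_ROUND_BITS) is the rounded
+      // shift (x + 32) >> 6 (assuming AOM_BLEND_A64_ROUND_BITS == 6), so
+      // each byte of 'pred' is (m * a + (64 - m) * b + 32) >> 6, and
+      // _mm_sad_epu8 then accumulates |pred - src| over the 16 bytes.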
+ const __m128i data_l = _mm_unpacklo_epi8(a, b); + const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi8(a, b); + const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); + __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packus_epi16(pred_l, pred_r); + res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + int32_t sad = + _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); + return sad; +} + +unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { + int y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr); + const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]); + const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr); + const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]); + const __m128i m = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), + _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); + const __m128i m_inv = _mm_sub_epi8(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi8(a0, b0); + const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpacklo_epi8(a1, b1); + const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); + __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packus_epi16(pred_l, pred_r); + res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); + + src_ptr += src_stride * 2; + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; + } + int32_t sad = + _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); + return sad; +} + +unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { + int y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (y = 0; y < height; y += 2) { + // Load two rows at a time, this seems to be a bit faster + // than four rows at a time in this case. 
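+    // The two 4-byte rows are packed into the low 8 bytes of each vector
+    // (the upper 8 bytes stay zero), so the single _mm_sad_epu8 below
+    // accumulates the SAD of both rows at once.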
+ const __m128i src = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(uint32_t *)src_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride])); + const __m128i a = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride])); + const __m128i b = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride])); + const __m128i m = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride])); + const __m128i m_inv = _mm_sub_epi8(mask_max, m); + + const __m128i data = _mm_unpacklo_epi8(a, b); + const __m128i mask = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_16bit = _mm_maddubs_epi16(data, mask); + pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128()); + res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); + + src_ptr += src_stride * 2; + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; + } + // At this point, the SAD is stored in lane 0 of 'res' + int32_t sad = _mm_cvtsi128_si32(res); + return sad; +} + +// For width a multiple of 8 +static INLINE unsigned int highbd_masked_sad_ssse3( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height); + +#define HIGHBD_MASKSADMXN_SSSE3(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + if (!invert_mask) \ + return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride, \ + second_pred8, m, msk, msk_stride, m, n); \ + else \ + return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \ + ref_stride, msk, msk_stride, m, n); \ + } + +#define HIGHBD_MASKSAD4XN_SSSE3(n) \ + unsigned int aom_highbd_masked_sad4x##n##_ssse3( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + if (!invert_mask) \ + return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, \ + ref_stride, second_pred8, 4, msk, \ + msk_stride, n); \ + else \ + return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \ + ref8, ref_stride, msk, msk_stride, \ + n); \ + } + +HIGHBD_MASKSADMXN_SSSE3(128, 128) +HIGHBD_MASKSADMXN_SSSE3(128, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 128) +HIGHBD_MASKSADMXN_SSSE3(64, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 64) +HIGHBD_MASKSADMXN_SSSE3(32, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 16) +HIGHBD_MASKSADMXN_SSSE3(16, 32) +HIGHBD_MASKSADMXN_SSSE3(16, 16) +HIGHBD_MASKSADMXN_SSSE3(16, 8) +HIGHBD_MASKSADMXN_SSSE3(8, 16) +HIGHBD_MASKSADMXN_SSSE3(8, 8) +HIGHBD_MASKSADMXN_SSSE3(8, 4) +HIGHBD_MASKSAD4XN_SSSE3(8) +HIGHBD_MASKSAD4XN_SSSE3(4) +HIGHBD_MASKSAD4XN_SSSE3(16) +HIGHBD_MASKSADMXN_SSSE3(16, 4) +HIGHBD_MASKSADMXN_SSSE3(8, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 8) +HIGHBD_MASKSADMXN_SSSE3(16, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 16) + +static INLINE unsigned int highbd_masked_sad_ssse3( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + 
const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int x, y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i one = _mm_set1_epi16(1); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + // Zero-extend mask to 16 bits + const __m128i m = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128()); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m128i pred = _mm_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); + res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. 
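+  // Two _mm_hadd_epi32 passes fold the four partial sums into lane 0,
+  // which _mm_cvtsi128_si32 then reads out.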
+ res = _mm_hadd_epi32(res, res); + res = _mm_hadd_epi32(res, res); + int sad = _mm_cvtsi128_si32(res); + return sad; +} + +unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, + const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i one = _mm_set1_epi16(1); + + for (y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr), + _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride])); + const __m128i b = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr), + _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride])); + // Zero-extend mask to 16 bits + const __m128i m = _mm_unpacklo_epi8( + _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const uint32_t *)m_ptr), + _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])), + _mm_setzero_si128()); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packs_epi32(pred_l, pred_r); + const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); + res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); + + src_ptr += src_stride * 2; + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; + } + res = _mm_hadd_epi32(res, res); + res = _mm_hadd_epi32(res, res); + int sad = _mm_cvtsi128_si32(res); + return sad; +} diff --git a/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.h b/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.h new file mode 100644 index 000000000..cffbd9672 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
+#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
+
+unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *a_ptr, int a_stride,
+                                     const uint8_t *b_ptr, int b_stride,
+                                     const uint8_t *m_ptr, int m_stride,
+                                     int height);
+
+unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *a_ptr, int a_stride,
+                                     const uint8_t *b_ptr, int b_stride,
+                                     const uint8_t *m_ptr, int m_stride,
+                                     int height);
+
+unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8,
+                                            int src_stride, const uint8_t *a8,
+                                            int a_stride, const uint8_t *b8,
+                                            int b_stride, const uint8_t *m_ptr,
+                                            int m_stride, int height);
+
+#endif  // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
diff --git a/libs/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c b/libs/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c
new file mode 100644
index 000000000..fa93f0df4
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -0,0 +1,1067 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
+
+// For width a multiple of 16
+static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
+                            int yoffset, uint8_t *dst, int w, int h);
+
+static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
+                               int yoffset, uint8_t *dst, int h);
+
+static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
+                               int yoffset, uint8_t *dst, int h);
+
+// For width a multiple of 16
+static void masked_variance(const uint8_t *src_ptr, int src_stride,
+                            const uint8_t *a_ptr, int a_stride,
+                            const uint8_t *b_ptr, int b_stride,
+                            const uint8_t *m_ptr, int m_stride, int width,
+                            int height, unsigned int *sse, int *sum_);
+
+static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *a_ptr, const uint8_t *b_ptr,
+                               const uint8_t *m_ptr, int m_stride, int height,
+                               unsigned int *sse, int *sum_);
+
+static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *a_ptr, const uint8_t *b_ptr,
+                               const uint8_t *m_ptr, int m_stride, int height,
+                               unsigned int *sse, int *sum_);
+
+#define MASK_SUBPIX_VAR_SSSE3(W, H)                                   \
+  unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3(        \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,   \
+      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+      const uint8_t *msk, int msk_stride, int invert_mask,            \
+      unsigned int *sse) {                                            \
+    int sum;                                                          \
+    uint8_t temp[(H + 1) * W];                                        \
+                                                                      \
+    bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);   \
+                                                                      \
+    if (!invert_mask)                                                 \
+      masked_variance(ref, ref_stride, temp, W, second_pred, W, msk,  \
+                      msk_stride, W, H, sse,
&sum); \ + else \ + masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } + +#define MASK_SUBPIX_VAR8XH_SSSE3(H) \ + unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + int sum; \ + uint8_t temp[(H + 1) * 8]; \ + \ + bilinear_filter8xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + masked_variance8xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \ + H, sse, &sum); \ + else \ + masked_variance8xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \ + H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * H)); \ + } + +#define MASK_SUBPIX_VAR4XH_SSSE3(H) \ + unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + int sum; \ + uint8_t temp[(H + 1) * 4]; \ + \ + bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + masked_variance4xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \ + H, sse, &sum); \ + else \ + masked_variance4xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \ + H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ + } + +MASK_SUBPIX_VAR_SSSE3(128, 128) +MASK_SUBPIX_VAR_SSSE3(128, 64) +MASK_SUBPIX_VAR_SSSE3(64, 128) +MASK_SUBPIX_VAR_SSSE3(64, 64) +MASK_SUBPIX_VAR_SSSE3(64, 32) +MASK_SUBPIX_VAR_SSSE3(32, 64) +MASK_SUBPIX_VAR_SSSE3(32, 32) +MASK_SUBPIX_VAR_SSSE3(32, 16) +MASK_SUBPIX_VAR_SSSE3(16, 32) +MASK_SUBPIX_VAR_SSSE3(16, 16) +MASK_SUBPIX_VAR_SSSE3(16, 8) +MASK_SUBPIX_VAR8XH_SSSE3(16) +MASK_SUBPIX_VAR8XH_SSSE3(8) +MASK_SUBPIX_VAR8XH_SSSE3(4) +MASK_SUBPIX_VAR4XH_SSSE3(8) +MASK_SUBPIX_VAR4XH_SSSE3(4) +MASK_SUBPIX_VAR4XH_SSSE3(16) +MASK_SUBPIX_VAR_SSSE3(16, 4) +MASK_SUBPIX_VAR8XH_SSSE3(32) +MASK_SUBPIX_VAR_SSSE3(32, 8) +MASK_SUBPIX_VAR_SSSE3(64, 16) +MASK_SUBPIX_VAR_SSSE3(16, 64) + +static INLINE __m128i filter_block(const __m128i a, const __m128i b, + const __m128i filter) { + __m128i v0 = _mm_unpacklo_epi8(a, b); + v0 = _mm_maddubs_epi16(v0, filter); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + __m128i v1 = _mm_unpackhi_epi8(a, b); + v1 = _mm_maddubs_epi16(v1, filter); + v1 = xx_roundn_epu16(v1, FILTER_BITS); + + return _mm_packus_epi16(v0, v1); +} + +static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int w, int h) { + int i, j; + // Horizontal filter + if (xoffset == 0) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 16) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + _mm_storeu_si128((__m128i *)&b[j], x); + } + src += src_stride; + b += w; + } + } else if (xoffset == 4) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 16) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]); + __m128i z = _mm_alignr_epi8(y, x, 1); + _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu8(x, z)); + } + src += src_stride; + b += w; + } + } else { + uint8_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i 
hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 16) { + const __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]); + const __m128i z = _mm_alignr_epi8(y, x, 1); + const __m128i res = filter_block(x, z, hfilter_vec); + _mm_storeu_si128((__m128i *)&b[j], res); + } + + src += src_stride; + b += w; + } + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu8(x, y)); + } + dst += w; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + const __m128i res = filter_block(x, y, vfilter_vec); + _mm_storeu_si128((__m128i *)&dst[j], res); + } + + dst += w; + } + } +} + +static INLINE __m128i filter_block_2rows(const __m128i *a0, const __m128i *b0, + const __m128i *a1, const __m128i *b1, + const __m128i *filter) { + __m128i v0 = _mm_unpacklo_epi8(*a0, *b0); + v0 = _mm_maddubs_epi16(v0, *filter); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + __m128i v1 = _mm_unpacklo_epi8(*a1, *b1); + v1 = _mm_maddubs_epi16(v1, *filter); + v1 = xx_roundn_epu16(v1, FILTER_BITS); + + return _mm_packus_epi16(v0, v1); +} + +static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int h) { + int i; + // Horizontal filter + if (xoffset == 0) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)src); + _mm_storel_epi64((__m128i *)b, x); + src += src_stride; + b += 8; + } + } else if (xoffset == 4) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadu_si128((__m128i *)src); + __m128i z = _mm_srli_si128(x, 1); + _mm_storel_epi64((__m128i *)b, _mm_avg_epu8(x, z)); + src += src_stride; + b += 8; + } + } else { + uint8_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); + for (i = 0; i < h; i += 2) { + const __m128i x0 = _mm_loadu_si128((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 1); + const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); + const __m128i z1 = _mm_srli_si128(x1, 1); + const __m128i res = filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec); + _mm_storeu_si128((__m128i *)b, res); + + src += src_stride * 2; + b += 16; + } + // Handle i = h separately + const __m128i x0 = _mm_loadu_si128((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 1); + + __m128i v0 = _mm_unpacklo_epi8(x0, z0); + v0 = _mm_maddubs_epi16(v0, hfilter_vec); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + _mm_storel_epi64((__m128i *)b, _mm_packus_epi16(v0, v0)); + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)dst); + __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]); + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(x, y)); + dst += 8; + } + } else { + const uint8_t *vfilter = 
bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); + for (i = 0; i < h; i += 2) { + const __m128i x = _mm_loadl_epi64((__m128i *)dst); + const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]); + const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]); + const __m128i res = filter_block_2rows(&x, &y, &y, &z, &vfilter_vec); + _mm_storeu_si128((__m128i *)dst, res); + + dst += 16; + } + } +} + +static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int h) { + int i; + // Horizontal filter + if (xoffset == 0) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = xx_loadl_32((__m128i *)src); + xx_storel_32((__m128i *)b, x); + src += src_stride; + b += 4; + } + } else if (xoffset == 4) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)src); + __m128i z = _mm_srli_si128(x, 1); + xx_storel_32((__m128i *)b, _mm_avg_epu8(x, z)); + src += src_stride; + b += 4; + } + } else { + uint8_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); + for (i = 0; i < h; i += 4) { + const __m128i x0 = _mm_loadl_epi64((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 1); + const __m128i x1 = _mm_loadl_epi64((__m128i *)&src[src_stride]); + const __m128i z1 = _mm_srli_si128(x1, 1); + const __m128i x2 = _mm_loadl_epi64((__m128i *)&src[src_stride * 2]); + const __m128i z2 = _mm_srli_si128(x2, 1); + const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]); + const __m128i z3 = _mm_srli_si128(x3, 1); + + const __m128i a0 = _mm_unpacklo_epi32(x0, x1); + const __m128i b0 = _mm_unpacklo_epi32(z0, z1); + const __m128i a1 = _mm_unpacklo_epi32(x2, x3); + const __m128i b1 = _mm_unpacklo_epi32(z2, z3); + const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &hfilter_vec); + _mm_storeu_si128((__m128i *)b, res); + + src += src_stride * 4; + b += 16; + } + // Handle i = h separately + const __m128i x = _mm_loadl_epi64((__m128i *)src); + const __m128i z = _mm_srli_si128(x, 1); + + __m128i v0 = _mm_unpacklo_epi8(x, z); + v0 = _mm_maddubs_epi16(v0, hfilter_vec); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + xx_storel_32((__m128i *)b, _mm_packus_epi16(v0, v0)); + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + __m128i x = xx_loadl_32((__m128i *)dst); + __m128i y = xx_loadl_32((__m128i *)&dst[4]); + xx_storel_32((__m128i *)dst, _mm_avg_epu8(x, y)); + dst += 4; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); + for (i = 0; i < h; i += 4) { + const __m128i a = xx_loadl_32((__m128i *)dst); + const __m128i b = xx_loadl_32((__m128i *)&dst[4]); + const __m128i c = xx_loadl_32((__m128i *)&dst[8]); + const __m128i d = xx_loadl_32((__m128i *)&dst[12]); + const __m128i e = xx_loadl_32((__m128i *)&dst[16]); + + const __m128i a0 = _mm_unpacklo_epi32(a, b); + const __m128i b0 = _mm_unpacklo_epi32(b, c); + const __m128i a1 = _mm_unpacklo_epi32(c, d); + const __m128i b1 = _mm_unpacklo_epi32(d, e); + const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &vfilter_vec); + _mm_storeu_si128((__m128i *)dst, res); + + dst += 16; + } + } +} + +static INLINE void accumulate_block(const __m128i *src, const __m128i *a, + const __m128i *b, const __m128i *m, + __m128i 
*sum, __m128i *sum_sq) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i m_inv = _mm_sub_epi8(mask_max, *m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. + const __m128i data_l = _mm_unpacklo_epi8(*a, *b); + const __m128i mask_l = _mm_unpacklo_epi8(*m, m_inv); + __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi8(*a, *b); + const __m128i mask_r = _mm_unpackhi_epi8(*m, m_inv); + __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); + + const __m128i src_l = _mm_unpacklo_epi8(*src, zero); + const __m128i src_r = _mm_unpackhi_epi8(*src, zero); + const __m128i diff_l = _mm_sub_epi16(pred_l, src_l); + const __m128i diff_r = _mm_sub_epi16(pred_r, src_r); + + // Update partial sums and partial sums of squares + *sum = + _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one)); + *sum_sq = + _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l), + _mm_madd_epi16(diff_r, diff_r))); +} + +static void masked_variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int width, + int height, unsigned int *sse, int *sum_) { + int x, y; + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]); + accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, sum); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} + +static void masked_variance8xh(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, const uint8_t *b_ptr, + const uint8_t *m_ptr, int m_stride, int height, + unsigned int *sse, int *sum_) { + int y; + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + + for (y = 0; y < height; y += 2) { + __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); + const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); + const __m128i m = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), + _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); + accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); + + src_ptr += src_stride * 2; + a_ptr += 16; + b_ptr += 16; + m_ptr += m_stride * 2; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, sum); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} + +static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, const uint8_t 
*b_ptr, + const uint8_t *m_ptr, int m_stride, int height, + unsigned int *sse, int *sum_) { + int y; + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + + for (y = 0; y < height; y += 4) { + // Load four rows at a time + __m128i src = + _mm_setr_epi32(*(uint32_t *)src_ptr, *(uint32_t *)&src_ptr[src_stride], + *(uint32_t *)&src_ptr[src_stride * 2], + *(uint32_t *)&src_ptr[src_stride * 3]); + const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); + const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); + const __m128i m = _mm_setr_epi32( + *(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride], + *(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]); + accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); + + src_ptr += src_stride * 4; + a_ptr += 16; + b_ptr += 16; + m_ptr += m_stride * 4; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, sum); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} + +#if CONFIG_AV1_HIGHBITDEPTH +// For width a multiple of 8 +static void highbd_bilinear_filter(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int w, int h); + +static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int h); + +// For width a multiple of 8 +static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, int a_stride, + const uint16_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height, uint64_t *sse, + int *sum_); + +static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, + const uint16_t *b_ptr, + const uint8_t *m_ptr, int m_stride, + int height, int *sse, int *sum_); + +#define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H) \ + unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + uint64_t sse64; \ + int sum; \ + uint16_t temp[(H + 1) * W]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + else \ + highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + *sse = (uint32_t)sse64; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + uint64_t sse64; \ + int sum; \ + int64_t var; \ + uint16_t temp[(H + 1) * W]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance(ref, ref_stride, 
temp, W, second_pred, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + else \ + highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4); \ + sum = ROUND_POWER_OF_TWO(sum, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + uint64_t sse64; \ + int sum; \ + int64_t var; \ + uint16_t temp[(H + 1) * W]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + else \ + highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8); \ + sum = ROUND_POWER_OF_TWO(sum, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H) \ + unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + int sse_; \ + int sum; \ + uint16_t temp[(H + 1) * 4]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ + msk_stride, H, &sse_, &sum); \ + else \ + highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ + msk_stride, H, &sse_, &sum); \ + *sse = (uint32_t)sse_; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ + } \ + unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + int sse_; \ + int sum; \ + int64_t var; \ + uint16_t temp[(H + 1) * 4]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ + msk_stride, H, &sse_, &sum); \ + else \ + highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ + msk_stride, H, &sse_, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4); \ + sum = ROUND_POWER_OF_TWO(sum, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + int sse_; \ + int sum; \ + int64_t var; \ + uint16_t temp[(H + 1) * 4]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ + msk_stride, H, &sse_, &sum); \ + else \ + highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ + msk_stride, H, &sse_, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8); \ + sum = ROUND_POWER_OF_TWO(sum, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4) +HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8) +HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4) +HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16) + +static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b, + const __m128i filter) { + __m128i v0 = _mm_unpacklo_epi16(a, b); + v0 = _mm_madd_epi16(v0, filter); + v0 = xx_roundn_epu32(v0, FILTER_BITS); + + __m128i v1 = _mm_unpackhi_epi16(a, b); + v1 = _mm_madd_epi16(v1, filter); + v1 = xx_roundn_epu32(v1, FILTER_BITS); + + return _mm_packs_epi32(v0, v1); +} + +static void highbd_bilinear_filter(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int w, int h) { + int i, j; + // Horizontal filter + if (xoffset == 0) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 8) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + _mm_storeu_si128((__m128i *)&b[j], x); + } + src += src_stride; + b += w; + } + } else if (xoffset == 4) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 8) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]); + __m128i z = _mm_alignr_epi8(y, x, 2); + _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z)); + } + src += src_stride; + b += w; + } + } else { + uint16_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16)); + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]); + const __m128i z = _mm_alignr_epi8(y, x, 2); + const __m128i res = highbd_filter_block(x, z, hfilter_vec); + _mm_storeu_si128((__m128i *)&b[j], res); + 
} + + src += src_stride; + b += w; + } + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y)); + } + dst += w; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16)); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + const __m128i res = highbd_filter_block(x, y, vfilter_vec); + _mm_storeu_si128((__m128i *)&dst[j], res); + } + + dst += w; + } + } +} + +static INLINE __m128i highbd_filter_block_2rows(const __m128i *a0, + const __m128i *b0, + const __m128i *a1, + const __m128i *b1, + const __m128i *filter) { + __m128i v0 = _mm_unpacklo_epi16(*a0, *b0); + v0 = _mm_madd_epi16(v0, *filter); + v0 = xx_roundn_epu32(v0, FILTER_BITS); + + __m128i v1 = _mm_unpacklo_epi16(*a1, *b1); + v1 = _mm_madd_epi16(v1, *filter); + v1 = xx_roundn_epu32(v1, FILTER_BITS); + + return _mm_packs_epi32(v0, v1); +} + +static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int h) { + int i; + // Horizontal filter + if (xoffset == 0) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)src); + _mm_storel_epi64((__m128i *)b, x); + src += src_stride; + b += 4; + } + } else if (xoffset == 4) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadu_si128((__m128i *)src); + __m128i z = _mm_srli_si128(x, 2); + _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z)); + src += src_stride; + b += 4; + } + } else { + uint16_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16)); + for (i = 0; i < h; i += 2) { + const __m128i x0 = _mm_loadu_si128((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 2); + const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); + const __m128i z1 = _mm_srli_si128(x1, 2); + const __m128i res = + highbd_filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec); + _mm_storeu_si128((__m128i *)b, res); + + src += src_stride * 2; + b += 8; + } + // Process i = h separately + __m128i x = _mm_loadu_si128((__m128i *)src); + __m128i z = _mm_srli_si128(x, 2); + + __m128i v0 = _mm_unpacklo_epi16(x, z); + v0 = _mm_madd_epi16(v0, hfilter_vec); + v0 = xx_roundn_epu32(v0, FILTER_BITS); + + _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0)); + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)dst); + __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]); + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y)); + dst += 4; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16)); + for (i = 0; i < h; i += 2) { + const __m128i x = _mm_loadl_epi64((__m128i *)dst); + const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]); + const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]); + const __m128i res = + highbd_filter_block_2rows(&x, &y, &y, &z, 
&vfilter_vec); + _mm_storeu_si128((__m128i *)dst, res); + + dst += 8; + } + } +} + +static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, int a_stride, + const uint16_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height, uint64_t *sse, + int *sum_) { + int x, y; + // Note on bit widths: + // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26, + // so this can be kept as four 32-bit values. + // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38, + // so this must be stored as two 64-bit values. + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + const __m128i m = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + // Calculate 8 predicted pixels. + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i src_l = _mm_unpacklo_epi16(src, zero); + const __m128i src_r = _mm_unpackhi_epi16(src, zero); + __m128i diff_l = _mm_sub_epi32(pred_l, src_l); + __m128i diff_r = _mm_sub_epi32(pred_r, src_r); + + // Update partial sums and partial sums of squares + sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r)); + // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit + // field, but the range of values is only [-(2^12 - 1), 2^12 - 1]. + // So we can re-pack into 16-bit fields and use _mm_madd_epi16 + // to calculate the squares and partially sum them. + const __m128i tmp = _mm_packs_epi32(diff_l, diff_r); + const __m128i prod = _mm_madd_epi16(tmp, tmp); + // Then we want to sign-extend to 64 bits and accumulate + const __m128i sign = _mm_srai_epi32(prod, 31); + const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign); + const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign); + sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, zero); + sum = _mm_hadd_epi32(sum, zero); + *sum_ = _mm_cvtsi128_si32(sum); + sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8)); + _mm_storel_epi64((__m128i *)sse, sum_sq); +} + +static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, + const uint16_t *b_ptr, + const uint8_t *m_ptr, int m_stride, + int height, int *sse, int *sum_) { + int y; + // Note: For this function, h <= 8 (or maybe 16 if we add 4:1 partitions). 
+ // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18 + // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30. + // So we can safely pack sum_sq into 32-bit fields, which is slightly more + // convenient. + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + + for (y = 0; y < height; y += 2) { + __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); + const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); + const __m128i m = _mm_unpacklo_epi8( + _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const uint32_t *)m_ptr), + _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])), + zero); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i src_l = _mm_unpacklo_epi16(src, zero); + const __m128i src_r = _mm_unpackhi_epi16(src, zero); + __m128i diff_l = _mm_sub_epi32(pred_l, src_l); + __m128i diff_r = _mm_sub_epi32(pred_r, src_r); + + // Update partial sums and partial sums of squares + sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r)); + const __m128i tmp = _mm_packs_epi32(diff_l, diff_r); + const __m128i prod = _mm_madd_epi16(tmp, tmp); + sum_sq = _mm_add_epi32(sum_sq, prod); + + src_ptr += src_stride * 2; + a_ptr += 8; + b_ptr += 8; + m_ptr += m_stride * 2; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, zero); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + const uint8_t *src0 = invert_mask ? pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? 
ref_stride : width;
+  assert(height % 2 == 0);
+  int i = 0;
+  if (width == 8) {
+    comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
+                           mask, mask_stride);
+  } else if (width == 16) {
+    do {
+      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
+      comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1,
+                              mask + mask_stride, comp_pred + width);
+      comp_pred += (width << 1);
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      i += 2;
+    } while (i < height);
+  } else {  // width == 32
+    assert(width == 32);
+    do {
+      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
+      comp_mask_pred_16_ssse3(src0 + 16, src1 + 16, mask + 16, comp_pred + 16);
+      comp_pred += (width);
+      src0 += (stride0);
+      src1 += (stride1);
+      mask += (mask_stride);
+      i += 1;
+    } while (i < height);
+  }
+}
diff --git a/libs/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.h b/libs/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.h
new file mode 100644
index 000000000..4faa098ac
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
+#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
+
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+
+static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0,
+                                           const uint8_t *src1,
+                                           const uint8_t *mask, uint8_t *dst) {
+  const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i round_offset =
+      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+  const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0));
+  const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1));
+  const __m128i aA = _mm_load_si128((const __m128i *)(mask));
+
+  const __m128i maA = _mm_sub_epi8(alpha_max, aA);
+
+  const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1);
+  const __m128i aaAL = _mm_unpacklo_epi8(aA, maA);
+  const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1);
+  const __m128i aaAH = _mm_unpackhi_epi8(aA, maA);
+
+  const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL);
+  const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH);
+
+  const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset);
+  const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset);
+  _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH));
+}
+
+static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height,
+                                          const uint8_t *src0, int stride0,
+                                          const uint8_t *src1, int stride1,
+                                          const uint8_t *mask,
+                                          int mask_stride) {
+  int i = 0;
+  const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i round_offset =
+      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  do {
+    // odd line A
+    const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0));
+    const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1));
+    const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask));
+    // even line B
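+    // (Both mask rows are packed into one register below: line A stays in
+    // the low eight bytes of 'a' while _mm_loadh_pi fills the high eight
+    // bytes with line B, so one unpacklo/unpackhi pair yields the blend
+    // factors for both lines.)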
+    const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0));
+    const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1));
+    const __m128i a = _mm_castps_si128(_mm_loadh_pi(
+        _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride)));
+
+    const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1);
+    const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1);
+
+    const __m128i ma = _mm_sub_epi8(alpha_max, a);
+    const __m128i aaA = _mm_unpacklo_epi8(a, ma);
+    const __m128i aaB = _mm_unpackhi_epi8(a, ma);
+
+    const __m128i blendA = _mm_maddubs_epi16(ssA, aaA);
+    const __m128i blendB = _mm_maddubs_epi16(ssB, aaB);
+    const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset);
+    const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset);
+    const __m128i round = _mm_packus_epi16(roundA, roundB);
+    // comp_pred's stride == width == 8
+    _mm_store_si128((__m128i *)(comp_pred), round);
+    comp_pred += (8 << 1);
+    src0 += (stride0 << 1);
+    src1 += (stride1 << 1);
+    mask += (mask_stride << 1);
+    i += 2;
+  } while (i < height);
+}
+
+#endif  // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
diff --git a/libs/libaom/src/aom_dsp/x86/mem_sse2.h b/libs/libaom/src/aom_dsp/x86/mem_sse2.h
new file mode 100644
index 000000000..6c821673e
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/mem_sse2.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_
+#define AOM_AOM_DSP_X86_MEM_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
+  return _mm_castps_si128(
+      _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
+}
+
+static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
+                                                  const int byte_stride) {
+  return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
+                        *(const int32_t *)((int8_t *)src + 1 * byte_stride),
+                        *(const int32_t *)((int8_t *)src + 2 * byte_stride),
+                        *(const int32_t *)((int8_t *)src + 3 * byte_stride));
+}
+
+static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
+                                                  const int byte_stride) {
+  __m128i dst;
+  dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
+  dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
+  return dst;
+}
+
+#endif  // AOM_AOM_DSP_X86_MEM_SSE2_H_
diff --git a/libs/libaom/src/aom_dsp/x86/obmc_intrinsic_sse4.h b/libs/libaom/src/aom_dsp/x86/obmc_intrinsic_sse4.h
new file mode 100644
index 000000000..5181e444c
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/obmc_intrinsic_sse4.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
+#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
+
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+
+static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
+                                    const int32_t *wsrc, const int32_t *mask,
+                                    unsigned int *const sse, int *const sum,
+                                    const int h) {
+  const int pre_step = pre_stride - 4;
+  int n = 0;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_d = _mm_setzero_si128();
+
+  assert(IS_POWER_OF_TWO(h));
+
+  do {
+    const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n));
+    const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
+    const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
+
+    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+    n += 4;
+
+    if (n % 4 == 0) pre += pre_step;
+  } while (n < 4 * h);
+
+  *sum = xx_hsum_epi32_si32(v_sum_d);
+  *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+#endif  // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
diff --git a/libs/libaom/src/aom_dsp/x86/obmc_intrinsic_ssse3.h b/libs/libaom/src/aom_dsp/x86/obmc_intrinsic_ssse3.h
new file mode 100644
index 000000000..48486c6c4
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/obmc_intrinsic_ssse3.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+
+static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) {
+  v_d = _mm_hadd_epi32(v_d, v_d);
+  v_d = _mm_hadd_epi32(v_d, v_d);
+  return _mm_cvtsi128_si32(v_d);
+}
+
+static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) {
+  v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
+#if ARCH_X86_64
+  return _mm_cvtsi128_si64(v_q);
+#else
+  {
+    int64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, v_q);
+    return tmp;
+  }
+#endif
+}
+
+static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) {
+  const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
+  const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
+  const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
+  return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
+}
+
+// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
+  const __m128i v_tmp_d =
+      _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
+  return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+#endif  // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
diff --git a/libs/libaom/src/aom_dsp/x86/obmc_sad_avx2.c b/libs/libaom/src/aom_dsp/x86/obmc_sad_avx2.c
new file mode 100644
index 000000000..2aa2a0555
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/obmc_sad_avx2.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
+                                            const int pre_stride,
+                                            const int32_t *wsrc,
+                                            const int32_t *mask,
+                                            const int height) {
+  int n = 0;
+  __m256i v_sad_d = _mm256_setzero_si256();
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+
+  do {
+    const __m128i v_p_b_0 = xx_loadl_32(pre);
+    const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
+    const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
+    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+    const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
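+    // (pmaddwd forms a0 * b0 + a1 * b1 from the two signed 16-bit halves
+    // of each 32-bit lane; with the high halves known to be zero, that
+    // reduces to the single 32-bit product pmulld would return.)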
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); + + const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); + const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); + const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); + + n += 8; + pre += pre_stride << 1; + } while (n < 8 * (height >> 1)); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +static INLINE unsigned int obmc_sad_w8n_avx2( + const uint8_t *pre, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const int pre_step = pre_stride - width; + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p0_b = xx_loadl_64(pre + n); + const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d); + const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d); + + n += 8; + + if ((n & (width - 1)) == 0) pre += pre_step; + } while (n < width * height); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +#define OBMCSADWXH(w, h) \ + unsigned int aom_obmc_sad##w##x##h##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *msk) { \ + if (w == 4) { \ + return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h); \ + } else { \ + return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \ + } \ + } + +OBMCSADWXH(128, 128) +OBMCSADWXH(128, 64) +OBMCSADWXH(64, 128) +OBMCSADWXH(64, 64) +OBMCSADWXH(64, 32) +OBMCSADWXH(32, 64) +OBMCSADWXH(32, 32) +OBMCSADWXH(32, 16) +OBMCSADWXH(16, 32) +OBMCSADWXH(16, 16) +OBMCSADWXH(16, 8) +OBMCSADWXH(8, 16) +OBMCSADWXH(8, 8) +OBMCSADWXH(8, 4) +OBMCSADWXH(4, 8) +OBMCSADWXH(4, 4) +OBMCSADWXH(4, 16) +OBMCSADWXH(16, 4) +OBMCSADWXH(8, 32) +OBMCSADWXH(32, 8) +OBMCSADWXH(16, 64) +OBMCSADWXH(64, 16) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// + +static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + do { + const __m128i v_p_w_0 = 
xx_loadl_64(pre); + const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride); + const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1); + const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); + + const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); + const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); + + // Rounded absolute difference + + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); + const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); + + n += 8; + + pre += pre_stride << 1; + } while (n < 8 * (height >> 1)); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +static INLINE unsigned int hbd_obmc_sad_w8n_avx2( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - width; + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n)); + const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
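+    // (The 15-bit bound still holds in this high bit-depth path: the
+    // samples behind CONVERT_TO_SHORTPTR are at most 12 bits, so the high
+    // 16-bit half of every 32-bit lane remains zero.)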
+    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+
+    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
+
+    // Rounded absolute difference
+    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
+    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
+
+    n += 8;
+
+    if (n % width == 0) pre += pre_step;
+  } while (n < width * height);
+
+  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+  return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+#define HBD_OBMCSADWXH(w, h)                                           \
+  unsigned int aom_highbd_obmc_sad##w##x##h##_avx2(                    \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
+      const int32_t *mask) {                                           \
+    if (w == 4) {                                                      \
+      return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h);     \
+    } else {                                                           \
+      return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \
+    }                                                                  \
+  }
+
+HBD_OBMCSADWXH(128, 128)
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+HBD_OBMCSADWXH(64, 64)
+HBD_OBMCSADWXH(64, 32)
+HBD_OBMCSADWXH(32, 64)
+HBD_OBMCSADWXH(32, 32)
+HBD_OBMCSADWXH(32, 16)
+HBD_OBMCSADWXH(16, 32)
+HBD_OBMCSADWXH(16, 16)
+HBD_OBMCSADWXH(16, 8)
+HBD_OBMCSADWXH(8, 16)
+HBD_OBMCSADWXH(8, 8)
+HBD_OBMCSADWXH(8, 4)
+HBD_OBMCSADWXH(4, 8)
+HBD_OBMCSADWXH(4, 4)
+HBD_OBMCSADWXH(4, 16)
+HBD_OBMCSADWXH(16, 4)
+HBD_OBMCSADWXH(8, 32)
+HBD_OBMCSADWXH(32, 8)
+HBD_OBMCSADWXH(16, 64)
+HBD_OBMCSADWXH(64, 16)
diff --git a/libs/libaom/src/aom_dsp/x86/obmc_sad_sse4.c b/libs/libaom/src/aom_dsp/x86/obmc_sad_sse4.c
new file mode 100644
index 000000000..0338a8c77
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/obmc_sad_sse4.c
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
+                                                 const int pre_stride,
+                                                 const int32_t *wsrc,
+                                                 const int32_t *mask,
+                                                 const int height) {
+  const int pre_step = pre_stride - 4;
+  int n = 0;
+  __m128i v_sad_d = _mm_setzero_si128();
+
+  do {
+    const __m128i v_p_b = xx_loadl_32(pre + n);
+    const __m128i v_m_d = xx_load_128(mask + n);
+    const __m128i v_w_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
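+    // xx_roundn_epu32(v, 12) below is the same add-bias-then-shift sequence
+    // written out explicitly in the AVX2 version above: it computes
+    // (v + (1 << 11)) >> 12, an unsigned ROUND_POWER_OF_TWO of the
+    // absolute difference.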
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +static AOM_FORCE_INLINE unsigned int obmc_sad_w8n( + const uint8_t *pre, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const int pre_step = pre_stride - width; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p1_b = xx_loadl_32(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_b = xx_loadl_32(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b); + const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); + const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); + + // Rounded absolute difference + const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12); + const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d); + v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d); + + n += 8; + + if (n % width == 0) pre += pre_step; + } while (n < width * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +#define OBMCSADWXH(w, h) \ + unsigned int aom_obmc_sad##w##x##h##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *msk) { \ + if (w == 4) { \ + return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \ + } else { \ + return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \ + } \ + } + +OBMCSADWXH(128, 128) +OBMCSADWXH(128, 64) +OBMCSADWXH(64, 128) +OBMCSADWXH(64, 64) +OBMCSADWXH(64, 32) +OBMCSADWXH(32, 64) +OBMCSADWXH(32, 32) +OBMCSADWXH(32, 16) +OBMCSADWXH(16, 32) +OBMCSADWXH(16, 16) +OBMCSADWXH(16, 8) +OBMCSADWXH(8, 16) +OBMCSADWXH(8, 8) +OBMCSADWXH(8, 4) +OBMCSADWXH(4, 8) +OBMCSADWXH(4, 4) +OBMCSADWXH(4, 16) +OBMCSADWXH(16, 4) +OBMCSADWXH(8, 32) +OBMCSADWXH(32, 8) +OBMCSADWXH(16, 64) +OBMCSADWXH(64, 16) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// + +static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + do { + const __m128i v_p_w = xx_loadl_64(pre + n); + const __m128i v_m_d = xx_load_128(mask + n); + const __m128i v_w_d = xx_load_128(wsrc + n); + + const __m128i v_p_d = 
_mm_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - width; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p1_w = xx_loadl_64(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_w = xx_loadl_64(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); + const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); + const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); + + // Rounded absolute difference + const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12); + const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d); + v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d); + + n += 8; + + if (n % width == 0) pre += pre_step; + } while (n < width * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +#define HBD_OBMCSADWXH(w, h) \ + unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + if (w == 4) { \ + return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \ + } else { \ + return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \ + } \ + } + +HBD_OBMCSADWXH(128, 128) +HBD_OBMCSADWXH(128, 64) +HBD_OBMCSADWXH(64, 128) +HBD_OBMCSADWXH(64, 64) +HBD_OBMCSADWXH(64, 32) +HBD_OBMCSADWXH(32, 64) +HBD_OBMCSADWXH(32, 32) +HBD_OBMCSADWXH(32, 16) +HBD_OBMCSADWXH(16, 32) +HBD_OBMCSADWXH(16, 16) +HBD_OBMCSADWXH(16, 8) +HBD_OBMCSADWXH(8, 16) +HBD_OBMCSADWXH(8, 8) +HBD_OBMCSADWXH(8, 4) +HBD_OBMCSADWXH(4, 8) +HBD_OBMCSADWXH(4, 4) +HBD_OBMCSADWXH(4, 16) +HBD_OBMCSADWXH(16, 4) +HBD_OBMCSADWXH(8, 32) +HBD_OBMCSADWXH(32, 8) +HBD_OBMCSADWXH(16, 64) +HBD_OBMCSADWXH(64, 16) diff --git a/libs/libaom/src/aom_dsp/x86/obmc_variance_avx2.c b/libs/libaom/src/aom_dsp/x86/obmc_variance_avx2.c new file mode 100644 index 000000000..bfec0e8a8 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/obmc_variance_avx2.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2018, 
Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+                                     const int32_t *wsrc, const int32_t *mask,
+                                     unsigned int *const sse, int *const sum,
+                                     const int w, const int h) {
+  int n = 0, width, height = h;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_d = _mm_setzero_si128();
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+  __m128i v_d;
+  const uint8_t *pre_temp;
+  assert(w >= 8);
+  assert(IS_POWER_OF_TWO(w));
+  assert(IS_POWER_OF_TWO(h));
+  do {
+    width = w;
+    pre_temp = pre;
+    do {
+      const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp);
+      const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n));
+      const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
+      const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
+
+      // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+      // boundaries. We use pmaddwd, as it has lower latency on Haswell
+      // than pmulld but produces the same result with these inputs.
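+      // (pmaddwd multiplies the 16-bit lane pairs and adds adjacent
+      // products; because the upper 16 bits of every 32-bit lane are zero
+      // here, each pair reduces to pre * mask with no cross terms.)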
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d); + const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d); + + const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31); + const __m256i v_tmp_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d); + const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12); + const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d); + const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1); + + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + pre_temp += 8; + n += 8; + width -= 8; + } while (width > 0); + pre += pre_stride; + height -= 1; + } while (height > 0); + v_d = _mm_hadd_epi32(v_sum_d, v_sse_d); + v_d = _mm_hadd_epi32(v_d, v_d); + *sum = _mm_cvtsi128_si32(v_d); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(v_d, 4)); +} + +static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { + int n = 0, width, height = h; + __m256i v_d; + __m128i res0; + const uint8_t *pre_temp; + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + + assert(w >= 16); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + do { + width = w; + pre_temp = pre; + do { + const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp); + const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n)); + const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n)); + const __m256i v_m1_d = + _mm256_loadu_si256((__m256i const *)(mask + n + 8)); + const __m256i v_w1_d = + _mm256_loadu_si256((__m256i const *)(wsrc + n + 8)); + + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b); + const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8)); + + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d); + + const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31); + const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31); + + const __m256i v_tmp0_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d); + const __m256i v_tmp1_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d); + + const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12); + const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12); + + const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d); + const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d); + const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d); + + pre_temp += 16; + n += 16; + width -= 16; + } while (width > 0); + pre += pre_stride; + height -= 1; + } while (height > 0); + + v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d); + v_d = _mm256_hadd_epi32(v_d, v_d); + res0 = _mm256_castsi256_si128(v_d); + res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1)); + *sum = _mm_cvtsi128_si32(res0); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(res0, 4)); +} + 
+#define OBMCVARWXH(W, H)                                                \
+  unsigned int aom_obmc_variance##W##x##H##_avx2(                       \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,          \
+      const int32_t *mask, unsigned int *sse) {                         \
+    int sum;                                                            \
+    if (W == 4) {                                                       \
+      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);      \
+    } else if (W == 8) {                                                \
+      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H);  \
+    } else {                                                            \
+      obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+    }                                                                   \
+                                                                        \
+    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));       \
+  }
+
+OBMCVARWXH(128, 128)
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
+OBMCVARWXH(64, 64)
+OBMCVARWXH(64, 32)
+OBMCVARWXH(32, 64)
+OBMCVARWXH(32, 32)
+OBMCVARWXH(32, 16)
+OBMCVARWXH(16, 32)
+OBMCVARWXH(16, 16)
+OBMCVARWXH(16, 8)
+OBMCVARWXH(8, 16)
+OBMCVARWXH(8, 8)
+OBMCVARWXH(8, 4)
+OBMCVARWXH(4, 8)
+OBMCVARWXH(4, 4)
+OBMCVARWXH(4, 16)
+OBMCVARWXH(16, 4)
+OBMCVARWXH(8, 32)
+OBMCVARWXH(32, 8)
+OBMCVARWXH(16, 64)
+OBMCVARWXH(64, 16)
diff --git a/libs/libaom/src/aom_dsp/x86/obmc_variance_sse4.c b/libs/libaom/src/aom_dsp/x86/obmc_variance_sse4.c
new file mode 100644
index 000000000..aa73c392d
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/obmc_variance_sse4.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter);
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter);
+
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+                                     const int32_t *wsrc, const int32_t *mask,
+                                     unsigned int *const sse, int *const sum,
+                                     const int w, const int h) {
+  const int pre_step = pre_stride - w;
+  int n = 0;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_d = _mm_setzero_si128();
+
+  assert(w >= 8);
+  assert(IS_POWER_OF_TWO(w));
+  assert(IS_POWER_OF_TWO(h));
+
+  do {
+    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+    const __m128i v_m1_d = xx_load_128(mask + n + 4);
+    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+    const __m128i v_p0_b = xx_loadl_32(pre + n);
+    const __m128i v_m0_d = xx_load_128(mask + n);
+    const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed
at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + + const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12); + const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12); + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 8; + + if (n % w == 0) pre += pre_step; + } while (n < w * h); + + *sum = xx_hsum_epi32_si32(v_sum_d); + *sse = xx_hsum_epi32_si32(v_sse_d); +} + +#define OBMCVARWXH(W, H) \ + unsigned int aom_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + if (W == 4) { \ + obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ + } else { \ + obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ + } \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } + +OBMCVARWXH(128, 128) +OBMCVARWXH(128, 64) +OBMCVARWXH(64, 128) +OBMCVARWXH(64, 64) +OBMCVARWXH(64, 32) +OBMCVARWXH(32, 64) +OBMCVARWXH(32, 32) +OBMCVARWXH(32, 16) +OBMCVARWXH(16, 32) +OBMCVARWXH(16, 16) +OBMCVARWXH(16, 8) +OBMCVARWXH(8, 16) +OBMCVARWXH(8, 8) +OBMCVARWXH(8, 4) +OBMCVARWXH(4, 8) +OBMCVARWXH(4, 4) +OBMCVARWXH(4, 16) +OBMCVARWXH(16, 4) +OBMCVARWXH(8, 32) +OBMCVARWXH(32, 8) +OBMCVARWXH(16, 64) +OBMCVARWXH(64, 16) + +#include "config/aom_dsp_rtcd.h" + +#define OBMC_SUBPIX_VAR(W, H) \ + uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + aom_var_filter_block2d_bil_first_pass_ssse3( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_ssse3( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse); \ + } + +OBMC_SUBPIX_VAR(128, 128) +OBMC_SUBPIX_VAR(128, 64) +OBMC_SUBPIX_VAR(64, 128) +OBMC_SUBPIX_VAR(64, 64) +OBMC_SUBPIX_VAR(64, 32) +OBMC_SUBPIX_VAR(32, 64) +OBMC_SUBPIX_VAR(32, 32) +OBMC_SUBPIX_VAR(32, 16) +OBMC_SUBPIX_VAR(16, 32) +OBMC_SUBPIX_VAR(16, 16) +OBMC_SUBPIX_VAR(16, 8) +OBMC_SUBPIX_VAR(8, 16) +OBMC_SUBPIX_VAR(8, 8) +OBMC_SUBPIX_VAR(8, 4) +OBMC_SUBPIX_VAR(4, 8) +OBMC_SUBPIX_VAR(4, 4) +OBMC_SUBPIX_VAR(4, 16) +OBMC_SUBPIX_VAR(16, 4) +OBMC_SUBPIX_VAR(8, 32) +OBMC_SUBPIX_VAR(32, 8) +OBMC_SUBPIX_VAR(16, 64) +OBMC_SUBPIX_VAR(64, 16) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void hbd_obmc_variance_w4( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sum_d = 
_mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p_w = xx_loadl_64(pre + n); + const __m128i v_m_d = xx_load_128(mask + n); + const __m128i v_w_d = xx_load_128(wsrc + n); + + const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); + const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * h); + + *sum = xx_hsum_epi32_si32(v_sum_d); + *sse = xx_hsum_epi32_si32(v_sse_d); +} + +static INLINE void hbd_obmc_variance_w8n( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w, + const int h) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - w; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(w >= 8); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p1_w = xx_loadl_64(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_w = xx_loadl_64(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); + const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + + const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12); + const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12); + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 8; + + if (n % w == 0) pre += pre_step; + } while (n < w * h); + + *sum += xx_hsum_epi32_si64(v_sum_d); + *sse += xx_hsum_epi32_si64(v_sse_d); +} + +static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64 = 0; + uint64_t sse64 = 0; + if (w == 4) { + hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); + } else { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } + *sum = (int)sum64; + *sse = (unsigned int)sse64; +} + +static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64 = 0; + uint64_t sse64 = 0; + if (w == 4) { + hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); + } else if (w < 128 || h < 128) { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } else { + assert(w == 128 && h == 128); + + do { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, + 64); + pre8 += 64 * pre_stride; + wsrc += 64 * w; + mask += 64 * w; + h -= 64; + } while (h > 0); + } + *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); +} + +static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64 = 0; + uint64_t sse64 = 0; + int max_pel_allowed_per_ovf = 512; + if (w == 4) { + hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); + } else if (w * h <= max_pel_allowed_per_ovf) { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } else { + int h_per_ovf = max_pel_allowed_per_ovf / w; + + assert(max_pel_allowed_per_ovf % w == 0); + do { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, + h_per_ovf); + pre8 += h_per_ovf * pre_stride; + wsrc += h_per_ovf * w; + mask += h_per_ovf * w; + h -= h_per_ovf; + } while (h > 0); + } + *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); +} + +#define HBD_OBMCVARWXH(W, H) \ + unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = 
(int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +HBD_OBMCVARWXH(128, 128) +HBD_OBMCVARWXH(128, 64) +HBD_OBMCVARWXH(64, 128) +HBD_OBMCVARWXH(64, 64) +HBD_OBMCVARWXH(64, 32) +HBD_OBMCVARWXH(32, 64) +HBD_OBMCVARWXH(32, 32) +HBD_OBMCVARWXH(32, 16) +HBD_OBMCVARWXH(16, 32) +HBD_OBMCVARWXH(16, 16) +HBD_OBMCVARWXH(16, 8) +HBD_OBMCVARWXH(8, 16) +HBD_OBMCVARWXH(8, 8) +HBD_OBMCVARWXH(8, 4) +HBD_OBMCVARWXH(4, 8) +HBD_OBMCVARWXH(4, 4) +HBD_OBMCVARWXH(4, 16) +HBD_OBMCVARWXH(16, 4) +HBD_OBMCVARWXH(8, 32) +HBD_OBMCVARWXH(32, 8) +HBD_OBMCVARWXH(16, 64) +HBD_OBMCVARWXH(64, 16) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm b/libs/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm new file mode 100644 index 000000000..d6e15c4be --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm @@ -0,0 +1,464 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro QUANTIZE_FN 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, \ + eob, scan, iscan + + vzeroupper + +%ifnidn %1, b_32x32 + + ; Special case for ncoeff == 16, as it is frequent and we can save on + ; not setting up a loop. + cmp ncoeffmp, 16 + jne .generic + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Special case of ncoeff == 16 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +.single: + + movifnidn coeffq, coeffmp + movifnidn zbinq, zbinmp + mova m0, [zbinq] ; m0 = zbin + + ; Get DC and first 15 AC coeffs - in this special case, that is all. + ; coeff stored as 32bit numbers but we process them as 16 bit numbers + mova m9, [coeffq] + packssdw m9, [coeffq+16] ; m9 = c[i] + mova m10, [coeffq+32] + packssdw m10, [coeffq+48] ; m10 = c[i] + + mov r0, eobmp ; Output pointer + mov r1, qcoeffmp ; Output pointer + mov r2, dqcoeffmp ; Output pointer + + pxor m5, m5 ; m5 = dedicated zero + + pcmpeqw m4, m4 ; All word lanes -1 + paddw m0, m4 ; m0 = zbin - 1 + + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + punpckhqdq m0, m0 + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + + ; Check if all coeffs are less than zbin. If yes, we just write zeros + ; to the outputs and we are done. 
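+  ; (por below merges the two zbin masks; ptest sets ZF only when the
+  ; merged mask is all zero, so jnz skips the zero-fill path whenever at
+  ; least one coefficient reached the zbin threshold.)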
+ por m14, m7, m12 + ptest m14, m14 + jnz .single_nonzero + + mova [r1 ], ymm5 + mova [r1+32], ymm5 + mova [r2 ], ymm5 + mova [r2+32], ymm5 + mov [r0], word 0 + + vzeroupper + RET + +.single_nonzero: + + ; Actual quantization of size 16 block - setup pointers, rounders, etc. + movifnidn r3, roundmp + movifnidn r4, quantmp + mov r6, dequantmp + mov r5, shiftmp + mova m1, [r3] ; m1 = round + mova m2, [r4] ; m2 = quant + mova m3, [r6] ; m3 = dequant + mova m4, [r5] ; m4 = shift + + mov r3, iscanmp + + DEFINE_ARGS eob, qcoeff, dqcoeff, iscan + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m8, m6 ; m8 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m8, m4 ; m8 = m8*qsh>>16 + punpckhqdq m4, m4 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m8, m7 + pand m13, m12 + + ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pcmpgtw m6, m5, m8 + punpckhwd m6, m8, m6 + pmovsxwd m11, m8 + mova [qcoeffq ], m11 + mova [qcoeffq+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [qcoeffq+32], m11 + mova [qcoeffq+48], m6 + + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q + + ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff + pcmpgtw m6, m5, m8 + punpckhwd m6, m8, m6 + pmovsxwd m11, m8 + mova [dqcoeffq ], m11 + mova [dqcoeffq+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [dqcoeffq+32], m11 + mova [dqcoeffq+48], m6 + + mova m6, [iscanq] ; m6 = scan[i] + mova m11, [iscanq+16] ; m11 = scan[i] + + pcmpeqw m8, m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m13, m5 ; m13 = c[i] == 0 + psubw m6, m6, m7 ; m6 = scan[i] + 1 + psubw m11, m11, m12 ; m11 = scan[i] + 1 + pandn m8, m8, m6 ; m8 = max(eob) + pandn m13, m13, m11 ; m13 = max(eob) + pmaxsw m8, m8, m13 + + ; Horizontally accumulate/max eobs and write into [eob] memory pointer + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + movq rax, m8 + mov [eobq], ax + + vzeroupper + RET + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Generic case of ncoeff != 16 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +.generic: + +%endif ; %ifnidn %1, b_32x32 + +DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \ + qcoeff, dqcoeff, dequant, eob, scan, iscan + + ; Actual quantization loop - setup pointers, rounders, etc. 
+ movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + movifnidn dequantq, dequantmp + mova m0, [zbinq] ; m0 = zbin + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant + mova m3, [dequantq] ; m3 = dequant + pcmpeqw m4, m4 ; All lanes -1 +%ifidn %1, b_32x32 + psubw m0, m4 + psubw m1, m4 + psrlw m0, 1 ; m0 = (m0 + 1) / 2 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + paddw m0, m4 ; m0 = m0 + 1 + + mov r2, shiftmp + mov r3, qcoeffmp + mova m4, [r2] ; m4 = shift + mov r4, dqcoeffmp + mov r5, iscanmp + pxor m5, m5 ; m5 = dedicated zero + + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob + + + lea coeffq, [ coeffq+ncoeffq*4] + lea qcoeffq, [ qcoeffq+ncoeffq*4] + lea dqcoeffq, [dqcoeffq+ncoeffq*4] + + lea iscanq, [ iscanq+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs + ; coeff stored as 32bit numbers & require 16bit numbers + mova m9, [coeffq+ncoeffq*4+ 0] + packssdw m9, [coeffq+ncoeffq*4+16] + mova m10, [coeffq+ncoeffq*4+32] + packssdw m10, [coeffq+ncoeffq*4+48] + + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + punpckhqdq m0, m0 + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + + ; Check if all coeffs are less than zbin. If yes, skip forward quickly. + por m14, m7, m12 + ptest m14, m14 + jnz .first_nonzero + + mova [qcoeffq+ncoeffq*4 ], ymm5 + mova [qcoeffq+ncoeffq*4+32], ymm5 + mova [dqcoeffq+ncoeffq*4 ], ymm5 + mova [dqcoeffq+ncoeffq*4+32], ymm5 + add ncoeffq, mmsize + + punpckhqdq m1, m1 + punpckhqdq m2, m2 + punpckhqdq m3, m3 + punpckhqdq m4, m4 + pxor m8, m8 + + jmp .ac_only_loop + +.first_nonzero: + + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m8, m6 ; m8 += m6 + paddw m13, m11 ; m13 += m11 + %ifidn %1, b_32x32 + pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh + %endif + pmulhw m8, m4 ; m8 = m8*qsh>>16 + %ifidn %1, b_32x32 + psllw m8, 1 + psrlw m5, 15 + por m8, m5 + %endif + punpckhqdq m4, m4 + %ifidn %1, b_32x32 + pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh + %endif + pmulhw m13, m4 ; m13 = m13*qsh>>16 + %ifidn %1, b_32x32 + psllw m13, 1 + psrlw m5, 15 + por m13, m5 + pxor m5, m5 ; reset m5 to zero register + %endif + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m8, m7 + pand m13, m12 + + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pcmpgtw m6, m5, m8 + punpckhwd m6, m8, m6 + pmovsxwd m11, m8 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 + +%ifidn %1, b_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 +%endif + + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff + pcmpgtw m6, m5, m8 + punpckhwd m6, m8, m6 + pmovsxwd m11, m8 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 + + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [iscanq+ncoeffq*2] ; m6 = 
scan[i]
+  mova       m11, [iscanq+ncoeffq*2+16]      ; m11 = scan[i]
+  psubw      m6, m7                          ; m6 = scan[i] + 1
+  psubw      m11, m12                        ; m11 = scan[i] + 1
+  pandn      m8, m6                          ; m8 = max(eob)
+  pandn      m13, m11                        ; m13 = max(eob)
+  pmaxsw     m8, m13
+  add        ncoeffq, mmsize
+
+.ac_only_loop:
+
+  ; pack coeff from 32bit to 16bit array
+  mova       m9, [coeffq+ncoeffq*4+ 0]
+  packssdw   m9, [coeffq+ncoeffq*4+16]
+  mova       m10, [coeffq+ncoeffq*4+32]
+  packssdw   m10, [coeffq+ncoeffq*4+48]
+
+  pabsw      m6, m9                          ; m6 = abs(m9)
+  pabsw      m11, m10                        ; m11 = abs(m10)
+  pcmpgtw    m7, m6, m0                      ; m7 = c[i] >= zbin
+  pcmpgtw    m12, m11, m0                    ; m12 = c[i] >= zbin
+
+  ; Check if all coeffs are less than zbin. If yes, skip this iteration
+  ; and just write out zeros, since that is what the result would be.
+  por        m14, m7, m12
+  ptest      m14, m14
+  jnz .rest_nonzero
+
+  mova       [qcoeffq+ncoeffq*4+ 0], ymm5
+  mova       [qcoeffq+ncoeffq*4+32], ymm5
+  mova       [dqcoeffq+ncoeffq*4+ 0], ymm5
+  mova       [dqcoeffq+ncoeffq*4+32], ymm5
+
+  add        ncoeffq, mmsize
+  jnz .ac_only_loop
+
+  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov        r2, eobmp
+  pshufd     m7, m8, 0xe
+  pmaxsw     m8, m7
+  pshuflw    m7, m8, 0xe
+  pmaxsw     m8, m7
+  pshuflw    m7, m8, 0x1
+  pmaxsw     m8, m7
+  movq       rax, m8
+  mov        [r2], ax
+  vzeroupper
+  RET
+
+.rest_nonzero:
+  paddsw     m6, m1                          ; m6 += round
+  paddsw     m11, m1                         ; m11 += round
+  pmulhw     m14, m6, m2                     ; m14 = m6*q>>16
+  pmulhw     m13, m11, m2                    ; m13 = m11*q>>16
+  paddw      m14, m6                         ; m14 += m6
+  paddw      m13, m11                        ; m13 += m11
+  %ifidn %1, b_32x32
+  pmullw     m5, m14, m4                     ; store the lower 16 bits of m14*qsh
+  %endif
+  pmulhw     m14, m4                         ; m14 = m14*qsh>>16
+  %ifidn %1, b_32x32
+  psllw      m14, 1
+  psrlw      m5, 15
+  por        m14, m5
+  pmullw     m5, m13, m4                     ; store the lower 16 bits of m13*qsh
+  %endif
+  pmulhw     m13, m4                         ; m13 = m13*qsh>>16
+  %ifidn %1, b_32x32
+  psllw      m13, 1
+  psrlw      m5, 15
+  por        m13, m5
+  pxor       m5, m5                          ; reset m5 to zero register
+  %endif
+  psignw     m14, m9                         ; m14 = reinsert sign
+  psignw     m13, m10                        ; m13 = reinsert sign
+  pand       m14, m7
+  pand       m13, m12
+
+  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  pcmpgtw    m6, m5, m14
+  punpckhwd  m6, m14, m6
+  pmovsxwd   m11, m14
+  mova       [qcoeffq+ncoeffq*4+ 0], m11
+  mova       [qcoeffq+ncoeffq*4+16], m6
+  pcmpgtw    m6, m5, m13
+  punpckhwd  m6, m13, m6
+  pmovsxwd   m11, m13
+  mova       [qcoeffq+ncoeffq*4+32], m11
+  mova       [qcoeffq+ncoeffq*4+48], m6
+
+%ifidn %1, b_32x32
+  pabsw      m14, m14
+  pabsw      m13, m13
+%endif
+  pmullw     m14, m3                         ; dqc[i] = qc[i] * q
+  pmullw     m13, m3                         ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw      m14, 1
+  psrlw      m13, 1
+  psignw     m14, m9
+  psignw     m13, m10
+%endif
+
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+  pcmpgtw    m6, m5, m14
+  punpckhwd  m6, m14, m6
+  pmovsxwd   m11, m14
+  mova       [dqcoeffq+ncoeffq*4+ 0], m11
+  mova       [dqcoeffq+ncoeffq*4+16], m6
+  pcmpgtw    m6, m5, m13
+  punpckhwd  m6, m13, m6
+  pmovsxwd   m11, m13
+  mova       [dqcoeffq+ncoeffq*4+32], m11
+  mova       [dqcoeffq+ncoeffq*4+48], m6
+
+  pcmpeqw    m14, m5                         ; m14 = c[i] == 0
+  pcmpeqw    m13, m5                         ; m13 = c[i] == 0
+  mova       m6, [iscanq+ncoeffq*2+ 0]       ; m6 = scan[i]
+  mova       m11, [iscanq+ncoeffq*2+16]      ; m11 = scan[i]
+  psubw      m6, m7                          ; m6 = scan[i] + 1
+  psubw      m11, m12                        ; m11 = scan[i] + 1
+  pandn      m14, m6                         ; m14 = max(eob)
+  pandn      m13, m11                        ; m13 = max(eob)
+  pmaxsw     m8, m14
+  pmaxsw     m8, m13
+  add        ncoeffq, mmsize
+  jnz .ac_only_loop
+
+  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov        r2, eobmp
+  pshufd     m7, m8, 0xe
+  pmaxsw     m8, m7
+  pshuflw    m7, m8, 0xe
+  pmaxsw     m8, m7
+  pshuflw    m7, m8, 0x1
+  pmaxsw     m8, m7
+  movq       rax, m8
+  mov        [r2], ax
+  vzeroupper
+  RET
+%endmacro
+
+INIT_XMM avx
+QUANTIZE_FN b, 9
+QUANTIZE_FN b_32x32, 9
diff --git a/libs/libaom/src/aom_dsp/x86/quantize_sse2.c b/libs/libaom/src/aom_dsp/x86/quantize_sse2.c
new file mode 100644
index 000000000..ebef1fbac
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/quantize_sse2.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         const int16_t *zbin_ptr, const int16_t *round_ptr,
+                         const int16_t *quant_ptr,
+                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                         uint16_t *eob_ptr, const int16_t *scan_ptr,
+                         const int16_t *iscan_ptr) {
+  const __m128i zero = _mm_setzero_si128();
+  int index = 16;
+
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i eob, eob0;
+
+  (void)scan_ptr;
+
+  // Setup global values.
+  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+                dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_coefficients(coeff_ptr);
+  coeff1 = load_coefficients(coeff_ptr + 8);
+
+  // Poor man's abs().
+  coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+  calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+  round = _mm_unpackhi_epi64(round, round);
+  quant = _mm_unpackhi_epi64(quant, quant);
+  shift = _mm_unpackhi_epi64(shift, shift);
+
+  calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+  // Reinsert signs
+  qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+  // Mask out zbin threshold coeffs
+  qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+  qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+  store_coefficients(qcoeff0, qcoeff_ptr);
+  store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+  coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+  dequant = _mm_unpackhi_epi64(dequant, dequant);
+  coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+  store_coefficients(coeff0, dqcoeff_ptr);
+  store_coefficients(coeff1, dqcoeff_ptr + 8);
+
+  eob =
+      scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+
+  // AC only loop.
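+  // (By this point zbin, round, quant, shift and dequant have all been
+  // moved to their AC lanes via _mm_unpackhi_epi64, so each remaining
+  // group of 16 coefficients is processed with the AC constants only.)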
+  while (index < n_coeffs) {
+    coeff0 = load_coefficients(coeff_ptr + index);
+    coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+    coeff0_sign = _mm_srai_epi16(coeff0, 15);
+    coeff1_sign = _mm_srai_epi16(coeff1, 15);
+    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_coefficients(qcoeff0, qcoeff_ptr + index);
+    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+    coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+    coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+    store_coefficients(coeff0, dqcoeff_ptr + index);
+    store_coefficients(coeff1, dqcoeff_ptr + index + 8);
+
+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
+                        index, zero);
+    eob = _mm_max_epi16(eob, eob0);
+
+    index += 16;
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
diff --git a/libs/libaom/src/aom_dsp/x86/quantize_ssse3.c b/libs/libaom/src/aom_dsp/x86/quantize_ssse3.c
new file mode 100644
index 000000000..25980a055
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/quantize_ssse3.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round,
+                                          const __m128i quant,
+                                          const __m128i *shift) {
+  __m128i tmp, qcoeff, tmp1;
+  qcoeff = _mm_adds_epi16(*coeff, round);
+  tmp = _mm_mulhi_epi16(qcoeff, quant);
+  qcoeff = _mm_add_epi16(tmp, qcoeff);
+  tmp = _mm_mullo_epi16(qcoeff, *shift);
+  tmp = _mm_srli_epi16(tmp, 14);
+  tmp1 = _mm_mulhi_epi16(qcoeff, *shift);
+  tmp1 = _mm_slli_epi16(tmp1, 2);
+  *coeff = _mm_or_si128(tmp, tmp1);
+}
+
+static INLINE void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff,
+                                                     const __m128i dequant,
+                                                     const __m128i zero,
+                                                     tran_low_t *dqcoeff) {
+  // Un-sign to bias rounding like C.
+  const __m128i coeff = _mm_abs_epi16(qcoeff);
+
+  const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff);
+  const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff);
+
+  const __m128i low = _mm_mullo_epi16(coeff, dequant);
+  const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+  __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+  __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+  // "Divide" by 4.
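+  // (The 64x64 quantizer works at log_scale == 2: the caller pre-scales
+  // zbin and round by 1/4, and the dequantized product is scaled back here
+  // with a plain >> 2 before the sign is restored via _mm_sign_epi32.)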
+ dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 2); + dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 2); + + dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0); + dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +} + +void aom_quantize_b_64x64_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i two = _mm_set1_epi16(2); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, all_zero; + __m128i eob = zero, eob0; + + (void)scan; + (void)n_coeffs; + + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. + zbin = _mm_add_epi16(zbin, two); + round = _mm_add_epi16(round, two); + zbin = _mm_srli_epi16(zbin, 2); + round = _mm_srli_epi16(round, 2); + zbin = _mm_sub_epi16(zbin, one); + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + } + + // AC only loop. 
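+  // (Only the top-left 32x32 corner of a 64x64 transform block can hold
+  // nonzero coefficients, i.e. exactly 1024 values; that is why n_coeffs
+  // is ignored and the loop bound below is hard-coded.)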
+ for (index = 16; index < 1024; index += 16) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + continue; + } + calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift); + calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, + dqcoeff_ptr + 8 + index); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/libs/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm b/libs/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm new file mode 100644 index 000000000..fa616a6f1 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm @@ -0,0 +1,302 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 + +SECTION .text + +%macro QUANTIZE_FN 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, \ + eob, scan, iscan + + ; actual quantize loop - setup pointers, rounders, etc. 
+ movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + movifnidn dequantq, dequantmp + mova m0, [zbinq] ; m0 = zbin + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant +%ifidn %1, b_32x32 + pcmpeqw m5, m5 + psrlw m5, 15 + paddw m0, m5 + paddw m1, m5 + psrlw m0, 1 ; m0 = (m0 + 1) / 2 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + mova m3, [dequantq] ; m3 = dequant + mov r2, shiftmp + psubw m0, [GLOBAL(pw_1)] + mova m4, [r2] ; m4 = shift + mov r3, qcoeffmp + mov r4, dqcoeffmp + mov r5, iscanmp + pxor m5, m5 ; m5 = dedicated zero + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob + lea coeffq, [ coeffq+ncoeffq*4] + lea qcoeffq, [ qcoeffq+ncoeffq*4] + lea dqcoeffq, [dqcoeffq+ncoeffq*4] + lea iscanq, [ iscanq+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs + ; coeff stored as 32bit numbers & require 16bit numbers + mova m9, [ coeffq+ncoeffq*4+ 0] + packssdw m9, [ coeffq+ncoeffq*4+16] + mova m10, [ coeffq+ncoeffq*4+32] + packssdw m10, [ coeffq+ncoeffq*4+48] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + punpckhqdq m0, m0 + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m8, m6 ; m8 += m6 + paddw m13, m11 ; m13 += m11 + %ifidn %1, b_32x32 + pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh + %endif + pmulhw m8, m4 ; m8 = m8*qsh>>16 + %ifidn %1, b_32x32 + psllw m8, 1 + psrlw m5, 15 + por m8, m5 + %endif + punpckhqdq m4, m4 + %ifidn %1, b_32x32 + pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh + %endif + pmulhw m13, m4 ; m13 = m13*qsh>>16 + %ifidn %1, b_32x32 + psllw m13, 1 + psrlw m5, 15 + por m13, m5 + pxor m5, m5 ; reset m5 to zero register + %endif + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m8, m7 + pand m13, m12 + + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + mova m11, m8 + mova m6, m8 + pcmpgtw m5, m8 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register + +%ifidn %1, b_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 +%endif + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff + mova m11, m8 + mova m6, m8 + pcmpgtw m5, m8 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + jz 
.accumulate_eob + +.ac_only_loop: + ; pack coeff from 32bit to 16bit array + mova m9, [ coeffq+ncoeffq*4+ 0] + packssdw m9, [ coeffq+ncoeffq*4+16] + mova m10, [ coeffq+ncoeffq*4+32] + packssdw m10, [ coeffq+ncoeffq*4+48] + + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin +%ifidn %1, b_32x32 + pmovmskb r6d, m7 + pmovmskb r2d, m12 + or r6, r2 + jz .skip_iter +%endif + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m14, m6 ; m14 += m6 + paddw m13, m11 ; m13 += m11 + %ifidn %1, b_32x32 + pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh + %endif + pmulhw m14, m4 ; m14 = m14*qsh>>16 + %ifidn %1, b_32x32 + psllw m14, 1 + psrlw m5, 15 + por m14, m5 + pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh + %endif + pmulhw m13, m4 ; m13 = m13*qsh>>16 + %ifidn %1, b_32x32 + psllw m13, 1 + psrlw m5, 15 + por m13, m5 + pxor m5, m5 ; reset m5 to zero register + %endif + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m14, m7 + pand m13, m12 + + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pxor m11, m11 + mova m11, m14 + mova m6, m14 + pcmpgtw m5, m14 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register + +%ifidn %1, b_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; dqc[i] = qc[i] * q + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif + + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff + mova m11, m14 + mova m6, m14 + pcmpgtw m5, m14 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 + + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jl .ac_only_loop + +%ifidn %1, b_32x32 + jmp .accumulate_eob +.skip_iter: + mova [qcoeffq+ncoeffq*4+ 0], m5 + mova [qcoeffq+ncoeffq*4+16], m5 + mova [qcoeffq+ncoeffq*4+32], m5 + mova [qcoeffq+ncoeffq*4+48], m5 + mova [dqcoeffq+ncoeffq*4+ 0], m5 + mova [dqcoeffq+ncoeffq*4+16], m5 + mova [dqcoeffq+ncoeffq*4+32], m5 + mova [dqcoeffq+ncoeffq*4+48], m5 + add ncoeffq, mmsize + jl .ac_only_loop +%endif + +.accumulate_eob: + ; horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + pextrw r6, m8, 0 + mov [r2], r6 + RET +%endmacro + +INIT_XMM ssse3 +QUANTIZE_FN b, 9 +QUANTIZE_FN b_32x32, 9 diff --git a/libs/libaom/src/aom_dsp/x86/quantize_x86.h b/libs/libaom/src/aom_dsp/x86/quantize_x86.h new file mode 100644 index 000000000..5b040a278 --- 
/dev/null
+++ b/libs/libaom/src/aom_dsp/x86/quantize_x86.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom/aom_integer.h"
+
+static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
+                                 const int16_t *round_ptr, __m128i *round,
+                                 const int16_t *quant_ptr, __m128i *quant,
+                                 const int16_t *dequant_ptr, __m128i *dequant,
+                                 const int16_t *shift_ptr, __m128i *shift) {
+  *zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+  *round = _mm_load_si128((const __m128i *)round_ptr);
+  *quant = _mm_load_si128((const __m128i *)quant_ptr);
+  *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
+  *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  *shift = _mm_load_si128((const __m128i *)shift_ptr);
+}
+
+// With SSSE3 and later, abs() and sign() are preferred.
+static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
+  a = _mm_xor_si128(a, sign);
+  return _mm_sub_epi16(a, sign);
+}
+
+static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) {
+  a = _mm_xor_si128(a, sign);
+  return _mm_sub_epi32(a, sign);
+}
+
+static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
+                                    const __m128i quant, const __m128i shift) {
+  __m128i tmp, qcoeff;
+  qcoeff = _mm_adds_epi16(*coeff, round);
+  tmp = _mm_mulhi_epi16(qcoeff, quant);
+  qcoeff = _mm_add_epi16(tmp, qcoeff);
+  *coeff = _mm_mulhi_epi16(qcoeff, shift);
+}
+
+static INLINE void calculate_qcoeff_log_scale(__m128i *coeff,
+                                              const __m128i round,
+                                              const __m128i quant,
+                                              const __m128i *shift,
+                                              const int *log_scale) {
+  __m128i tmp, tmp1, qcoeff;
+  qcoeff = _mm_adds_epi16(*coeff, round);
+  tmp = _mm_mulhi_epi16(qcoeff, quant);
+  qcoeff = _mm_add_epi16(tmp, qcoeff);
+  tmp = _mm_mullo_epi16(qcoeff, *shift);
+  tmp = _mm_srli_epi16(tmp, (16 - *log_scale));
+  tmp1 = _mm_mulhi_epi16(qcoeff, *shift);
+  tmp1 = _mm_slli_epi16(tmp1, *log_scale);
+  *coeff = _mm_or_si128(tmp, tmp1);
+}
+
+static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {
+  return _mm_mullo_epi16(qcoeff, dequant);
+}
+
+static INLINE void calculate_dqcoeff_and_store_log_scale(__m128i qcoeff,
+                                                         __m128i dequant,
+                                                         const __m128i zero,
+                                                         tran_low_t *dqcoeff,
+                                                         const int *log_scale) {
+  // calculate abs
+  __m128i coeff_sign = _mm_srai_epi16(qcoeff, 15);
+  __m128i coeff = invert_sign_sse2(qcoeff, coeff_sign);
+
+  const __m128i sign_0 = _mm_unpacklo_epi16(coeff_sign, zero);
+  const __m128i sign_1 = _mm_unpackhi_epi16(coeff_sign, zero);
+
+  const __m128i low = _mm_mullo_epi16(coeff, dequant);
+  const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+  __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+  __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+  dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, *log_scale);
+  dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, *log_scale);
+
+  dqcoeff32_0 = invert_sign_32_sse2(dqcoeff32_0, sign_0);
+  dqcoeff32_1 = invert_sign_32_sse2(dqcoeff32_1, sign_1);
+
+  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+  _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+}
+
+// Scan 16 values
for eob reference in scan_ptr. Use masks (-1) from comparing +// to zbin to add 1 to the index in 'scan'. +static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, + const __m128i zbin_mask0, + const __m128i zbin_mask1, + const int16_t *scan_ptr, const int index, + const __m128i zero) { + const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); + __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index)); + __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8)); + __m128i eob0, eob1; + // Add one to convert from indices to counts + scan0 = _mm_sub_epi16(scan0, zbin_mask0); + scan1 = _mm_sub_epi16(scan1, zbin_mask1); + eob0 = _mm_andnot_si128(zero_coeff0, scan0); + eob1 = _mm_andnot_si128(zero_coeff1, scan1); + return _mm_max_epi16(eob0, eob1); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} + +static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { + assert(sizeof(tran_low_t) == 4); + const __m128i coeff1 = _mm_load_si128((__m128i *)(coeff_ptr)); + const __m128i coeff2 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + return _mm_packs_epi32(coeff1, coeff2); +} + +static INLINE void store_coefficients(__m128i coeff_vals, + tran_low_t *coeff_ptr) { + assert(sizeof(tran_low_t) == 4); + + __m128i one = _mm_set1_epi16(1); + __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); + __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); + __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); + __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); +} + +static INLINE void update_mask1(__m128i *cmp_mask0, __m128i *cmp_mask1, + const int16_t *iscan_ptr, int *is_found, + __m128i *mask) { + __m128i all_zero; + __m128i temp_mask = _mm_setzero_si128(); + all_zero = _mm_or_si128(*cmp_mask0, *cmp_mask1); + if (_mm_movemask_epi8(all_zero)) { + __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr)); + __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0); + __m128i iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8)); + __m128i mask1 = _mm_and_si128(*cmp_mask1, iscan1); + temp_mask = _mm_max_epi16(mask0, mask1); + *is_found = 1; + } + *mask = _mm_max_epi16(temp_mask, *mask); +} + +static INLINE void update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, + __m128i *threshold, const int16_t *iscan_ptr, + int *is_found, __m128i *mask) { + __m128i zero = _mm_setzero_si128(); + __m128i coeff[4], cmp_mask0, cmp_mask1, cmp_mask2, cmp_mask3; + + coeff[0] = _mm_unpacklo_epi16(*qcoeff0, zero); + coeff[1] = _mm_unpackhi_epi16(*qcoeff0, zero); + coeff[2] = _mm_unpacklo_epi16(*qcoeff1, zero); + coeff[3] = _mm_unpackhi_epi16(*qcoeff1, zero); + + coeff[0] = _mm_slli_epi32(coeff[0], AOM_QM_BITS); + cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm_slli_epi32(coeff[1], AOM_QM_BITS); + cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]); + coeff[2] = _mm_slli_epi32(coeff[2], AOM_QM_BITS); + cmp_mask2 = _mm_cmpgt_epi32(coeff[2], threshold[1]); + coeff[3] = _mm_slli_epi32(coeff[3], AOM_QM_BITS); + 
cmp_mask3 = _mm_cmpgt_epi32(coeff[3], threshold[1]);
+
+  cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+  cmp_mask1 = _mm_packs_epi32(cmp_mask2, cmp_mask3);
+
+  update_mask1(&cmp_mask0, &cmp_mask1, iscan_ptr, is_found, mask);
+}
+
+static INLINE int calculate_non_zero_count(__m128i mask) {
+  __m128i mask0, mask1;
+  int non_zero_count = 0;
+  mask0 = _mm_unpackhi_epi64(mask, mask);
+  mask1 = _mm_max_epi16(mask0, mask);
+  mask0 = _mm_shuffle_epi32(mask1, 1);
+  mask0 = _mm_max_epi16(mask0, mask1);
+  mask1 = _mm_srli_epi32(mask0, 16);
+  mask0 = _mm_max_epi16(mask0, mask1);
+  non_zero_count = _mm_extract_epi16(mask0, 0) + 1;
+
+  return non_zero_count;
+}
diff --git a/libs/libaom/src/aom_dsp/x86/sad4d_avx2.c b/libs/libaom/src/aom_dsp/x86/sad4d_avx2.c
new file mode 100644
index 000000000..077125258
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/sad4d_avx2.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>  // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+void aom_sadMxNx4d_avx2(int M, int N, const uint8_t *src, int src_stride,
+                        const uint8_t *const ref[4], int ref_stride,
+                        uint32_t res[4]) {
+  __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+  int i, j;
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+  sum_ref0 = _mm256_setzero_si256();
+  sum_ref2 = _mm256_setzero_si256();
+  sum_ref1 = _mm256_setzero_si256();
+  sum_ref3 = _mm256_setzero_si256();
+
+  for (i = 0; i < N; i++) {
+    for (j = 0; j < M; j += 32) {
+      // load src and all refs
+      src_reg = _mm256_loadu_si256((const __m256i *)(src + j));
+      ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j));
+      ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j));
+      ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j));
+      ref3_reg = _mm256_loadu_si256((const __m256i *)(ref3 + j));
+
+      // sum of the absolute differences between every ref-i to src
+      ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+      ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+      ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+      ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+      // sum every ref-i
+      sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+      sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+      sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+      sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+    }
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+  {
+    __m128i sum;
+    __m256i sum_mlow, sum_mhigh;
+    // in sum_ref-i the result is saved in the first 4 bytes
+    // the other 4 bytes are zeroed.
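+    // Viewed as 32-bit lanes that makes sum_ref0 = [a0 0 a1 0 | a2 0 a3 0],
+    // with a0..a3 the four 64-bit psadbw accumulators for ref-0.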
+    // sum_ref1 and sum_ref3 are shifted left by 4 bytes
+    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
+    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
+
+    // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
+    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
+    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
+
+    // merge every 64 bit from each sum_ref-i
+    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
+    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
+
+    // add the low 64 bit to the high 64 bit
+    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
+
+    // add the low 128 bit to the high 128 bit
+    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
+                        _mm256_extractf128_si256(sum_mlow, 1));
+
+    _mm_storeu_si128((__m128i *)(res), sum);
+  }
+}
+
+#define sadMxN_avx2(m, n)                                                      \
+  void aom_sad##m##x##n##x4d_avx2(const uint8_t *src, int src_stride,          \
+                                  const uint8_t *const ref[4], int ref_stride, \
+                                  uint32_t res[4]) {                           \
+    aom_sadMxNx4d_avx2(m, n, src, src_stride, ref, ref_stride, res);           \
+  }
+
+sadMxN_avx2(32, 8);
+sadMxN_avx2(32, 16);
+sadMxN_avx2(32, 32);
+sadMxN_avx2(32, 64);
+
+sadMxN_avx2(64, 16);
+sadMxN_avx2(64, 32);
+sadMxN_avx2(64, 64);
+sadMxN_avx2(64, 128);
+
+sadMxN_avx2(128, 64);
+sadMxN_avx2(128, 128);
diff --git a/libs/libaom/src/aom_dsp/x86/sad4d_sse2.asm b/libs/libaom/src/aom_dsp/x86/sad4d_sse2.asm
new file mode 100644
index 000000000..a9043742d
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/sad4d_sse2.asm
@@ -0,0 +1,428 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro AVG_4x2x4 2
+  movh                  m2, [second_predq]
+  movlhps               m2, m2
+  pavgb                 %1, m2
+  pavgb                 %2, m2
+  lea         second_predq, [second_predq+8]
+%endmacro
+; 'mflag' affects how the code works in an important way.
+;
+; When 'mflag' is false, 'src_strideq' resides in a register and
+; [srcq + src_strideq + offset] is a legal addressing form, so we can
+; simply use that form to access src memory and do not have to update
+; 'srcq' at each line. We only update 'srcq' every two lines, using a
+; compact LEA instruction like [srcq+src_strideq*2].
+;
+; When 'mflag' is true, 'src_strideq' resides in memory. We cannot use
+; the above form to access memory, so we have to update 'srcq' at each
+; line break. As we process two parts (first, second) together in each
+; macro function, the second part may also sit on the next line, which
+; means we may also need to add one 'src_strideq' to 'srcq' before
+; processing the second part.
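+;
+; As an illustrative sketch (not taken from the macros below, which pass
+; the offsets in as parameters):
+;   mflag == 0:  movd  m0, [srcq]
+;                movd  m1, [srcq+src_strideq]      ; stride is in a register
+;                lea   srcq, [srcq+src_strideq*2]  ; advance two lines at once
+;   mflag == 1:  movd  m0, [srcq]
+;                add   srcq, src_strideq           ; stride is a memory operand,
+;                movd  m1, [srcq]                  ; so advance line by line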
+
+%macro HANDLE_FIRST_OFFSET 2
+  %define first_offset %2
+  %if mflag == 0 && %1 == 1
+    %define first_offset (src_strideq + %2)
+  %endif
+%endmacro
+
+; first_extraline, second_extraline, in_line_offset
+%macro HANDLE_SECOND_OFFSET 3
+  %define second_offset %3
+  %if mflag && %1 == 0 && %2 == 1
+    add srcq, src_strideq
+  %endif
+  %if mflag == 0 && %2 == 1
+    %define second_offset (src_strideq + %3)
+  %endif
+%endmacro
+
+; Notes for line_ending:
+; 0 -- not a line ending
+; 1 -- line ending of an odd line [line numbers start from one]
+; 2 -- line ending of an even line
+; This is specifically designed to handle the case when src_strideq is
+; a memory position; in that case we cannot do the complex address
+; calculation using LEA, and fall back to a simple ADD instruction at
+; each line ending.
+%macro ADVANCE_END_OF_LINE 1
+  %if mflag
+    add srcq, src_strideq
+  %endif
+  %if mflag == 0 && %1 == 2
+    lea                 srcq, [srcq +src_strideq*2]
+  %endif
+
+  %if %1 == 2
+    lea                ref1q, [ref1q+ref_strideq*2]
+    lea                ref2q, [ref2q+ref_strideq*2]
+    lea                ref3q, [ref3q+ref_strideq*2]
+    lea                ref4q, [ref4q+ref_strideq*2]
+  %endif
+%endmacro
+
+; Please note that the second_offset of src is an in_line_offset,
+; so it is less than src_stride.
+; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, do_avg,
+;               {first, second}_extraline, line_ending
+%macro PROCESS_4x2x4 9
+  HANDLE_FIRST_OFFSET   %7, %2
+  movd                  m0, [srcq + first_offset]
+  HANDLE_SECOND_OFFSET  %7, %8, %4
+%if %1 == 1
+  movd                  m6, [ref1q+%3]
+  movd                  m4, [ref2q+%3]
+  movd                  m7, [ref3q+%3]
+  movd                  m5, [ref4q+%3]
+
+  movd                  m1, [srcq + second_offset]
+  movd                  m2, [ref1q+%5]
+  punpckldq             m0, m1
+  punpckldq             m6, m2
+  movd                  m1, [ref2q+%5]
+  movd                  m2, [ref3q+%5]
+  movd                  m3, [ref4q+%5]
+  punpckldq             m4, m1
+  punpckldq             m7, m2
+  punpckldq             m5, m3
+  movlhps               m0, m0
+  movlhps               m6, m4
+  movlhps               m7, m5
+%if %6 == 1
+  AVG_4x2x4             m6, m7
+%endif
+  psadbw                m6, m0
+  psadbw                m7, m0
+%else
+  movd                  m1, [ref1q+%3]
+  movd                  m5, [ref1q+%5]
+  movd                  m2, [ref2q+%3]
+  movd                  m4, [ref2q+%5]
+  punpckldq             m1, m5
+  punpckldq             m2, m4
+  movd                  m3, [ref3q+%3]
+  movd                  m5, [ref3q+%5]
+  punpckldq             m3, m5
+  movd                  m4, [ref4q+%3]
+  movd                  m5, [ref4q+%5]
+  punpckldq             m4, m5
+  movd                  m5, [srcq + second_offset]
+  punpckldq             m0, m5
+  movlhps               m0, m0
+  movlhps               m1, m2
+  movlhps               m3, m4
+%if %6 == 1
+  AVG_4x2x4             m1, m3
+%endif
+  psadbw                m1, m0
+  psadbw                m3, m0
+  paddd                 m6, m1
+  paddd                 m7, m3
+%endif
+%if %9 > 0
+  ADVANCE_END_OF_LINE %9
+%endif
+%endmacro
+
+; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, do_avg,
+;               {first,second}_extraline, line_ending
+%macro PROCESS_8x2x4 9
+  HANDLE_FIRST_OFFSET   %7, %2
+  movh                  m0, [srcq + first_offset]
+  HANDLE_SECOND_OFFSET  %7, %8, %4
+%if %1 == 1
+  movh                  m4, [ref1q+%3]
+  movh                  m5, [ref2q+%3]
+  movh                  m6, [ref3q+%3]
+  movh                  m7, [ref4q+%3]
+  movhps                m0, [srcq + second_offset]
+  movhps                m4, [ref1q+%5]
+  movhps                m5, [ref2q+%5]
+  movhps                m6, [ref3q+%5]
+  movhps                m7, [ref4q+%5]
+%if %6 == 1
+  movu                  m3, [second_predq]
+  pavgb                 m4, m3
+  pavgb                 m5, m3
+  pavgb                 m6, m3
+  pavgb                 m7, m3
+  lea         second_predq, [second_predq+mmsize]
+%endif
+  psadbw                m4, m0
+  psadbw                m5, m0
+  psadbw                m6, m0
+  psadbw                m7, m0
+%else
+  movh                  m1, [ref1q+%3]
+  movh                  m2, [ref2q+%3]
+  movhps                m0, [srcq + second_offset]
+  movhps                m1, [ref1q+%5]
+  movhps                m2, [ref2q+%5]
+%if %6 == 1
+  movu                  m3, [second_predq]
+  pavgb                 m1, m3
+  pavgb                 m2, m3
+%endif
+  psadbw                m1, m0
+  psadbw                m2, m0
+  paddd                 m4, m1
+  paddd                 m5, m2
+
+  movh                  m1, [ref3q+%3]
+  movhps                m1, [ref3q+%5]
+  movh                  m2, [ref4q+%3]
+  movhps                m2, [ref4q+%5]
+%if %6 == 1
+  pavgb                 m1, m3
+  pavgb                 m2, m3
+  lea         second_predq, [second_predq+mmsize]
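+  ; second_predq now points at the comp-pred samples for the next line
+  ; pair (mmsize bytes == 8 pixels x 2 rows, shared by all four refs)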
+%endif + psadbw m1, m0 + psadbw m2, m0 + paddd m6, m1 + paddd m7, m2 +%endif +%if %9 > 0 + ADVANCE_END_OF_LINE %9 +%endif +%endmacro + +; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending +%macro PROCESS_16x2x4 9 + ; 1st 16 px + HANDLE_FIRST_OFFSET %7, %2 + mova m0, [srcq + first_offset] + HANDLE_SECOND_OFFSET %7, %8, %4 +%if %1 == 1 + movu m4, [ref1q+%3] + movu m5, [ref2q+%3] + movu m6, [ref3q+%3] + movu m7, [ref4q+%3] +%if %6 == 1 + movu m3, [second_predq] + pavgb m4, m3 + pavgb m5, m3 + pavgb m6, m3 + pavgb m7, m3 + lea second_predq, [second_predq+mmsize] +%endif + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else ; %1 == 1 + movu m1, [ref1q+%3] + movu m2, [ref2q+%3] +%if %6 == 1 + movu m3, [second_predq] + pavgb m1, m3 + pavgb m2, m3 +%endif + psadbw m1, m0 + psadbw m2, m0 + paddd m4, m1 + paddd m5, m2 + + movu m1, [ref3q+%3] + movu m2, [ref4q+%3] +%if %6 == 1 + pavgb m1, m3 + pavgb m2, m3 + lea second_predq, [second_predq+mmsize] +%endif + psadbw m1, m0 + psadbw m2, m0 + paddd m6, m1 + paddd m7, m2 +%endif ; %1 == 1 + + ; 2nd 16 px + mova m0, [srcq + second_offset] + movu m1, [ref1q+%5] + movu m2, [ref2q+%5] + +%if %6 == 1 + movu m3, [second_predq] + pavgb m1, m3 + pavgb m2, m3 +%endif + psadbw m1, m0 + psadbw m2, m0 + paddd m4, m1 + paddd m5, m2 + + movu m1, [ref3q+%5] + movu m2, [ref4q+%5] + +%if %9 > 0 + ADVANCE_END_OF_LINE %9 +%endif + +%if %6 == 1 + pavgb m1, m3 + pavgb m2, m3 + lea second_predq, [second_predq+mmsize] +%endif + psadbw m1, m0 + psadbw m2, m0 + paddd m6, m1 + paddd m7, m2 +%endmacro + +; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending +%macro PROCESS_32x2x4 9 + PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16, %6, %7, %7, %8 - %7 + PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6, %8, %8, %9 +%endmacro + +; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending +%macro PROCESS_64x2x4 9 + PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32, %6, %7, %7, %8 - %7 + PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6, %8, %8, %9 +%endmacro + +; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending +%macro PROCESS_128x2x4 9 + PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64, %6, %7, %7, %8 - %7 + PROCESS_64x2x4 0, %4, %5, %4 + 64, %5 + 64, %6, %8, %8, %9 +%endmacro + +; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; uint32_t res[4]); +; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 +%macro SADNXN4D 2-3 0 +%if %3 == 0 +%if UNIX64 +cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif +%else ; avg + +%if UNIX64 +cglobal sad%1x%2x4d_avg, 6, 10, 8, src, src_stride, ref1, ref_stride, \ + second_pred, res, ref2, ref3, ref4 +%else +cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_stride, \ + second_pred, ref2, ref3 + %define src_strideq r1mp + %define src_strided r1mp +%endif +%endif + + %define mflag ((1 - UNIX64) & %3) + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + + PROCESS_%1x2x4 1, 0, 0, 0, ref_strideq, %3, 0, 1, 2 +%rep (%2-4)/2 + PROCESS_%1x2x4 0, 0, 0, 0, ref_strideq, %3, 0, 1, 2 +%endrep + PROCESS_%1x2x4 0, 0, 0, 
0, ref_strideq, %3, 0, 1, 2 + +%if %3 == 0 + %define resultq r4 + %define resultmp r4mp +%else + %define resultq r5 + %define resultmp r5mp +%endif + +%if %1 > 4 + pslldq m5, 4 + pslldq m7, 4 + por m4, m5 + por m6, m7 + mova m5, m4 + mova m7, m6 + punpcklqdq m4, m6 + punpckhqdq m5, m7 + paddd m4, m5 + movifnidn resultq, resultmp + movu [resultq], m4 + RET +%else + pshufd m6, m6, 0x08 + pshufd m7, m7, 0x08 + movifnidn resultq, resultmp + movq [resultq+0], m6 + movq [resultq+8], m7 + RET +%endif +%endmacro + +INIT_XMM sse2 +SADNXN4D 128, 128 +SADNXN4D 128, 64 +SADNXN4D 64, 128 +SADNXN4D 64, 64 +SADNXN4D 64, 32 +SADNXN4D 32, 64 +SADNXN4D 32, 32 +SADNXN4D 32, 16 +SADNXN4D 16, 32 +SADNXN4D 16, 16 +SADNXN4D 16, 8 +SADNXN4D 8, 16 +SADNXN4D 8, 8 +SADNXN4D 8, 4 +SADNXN4D 4, 8 +SADNXN4D 4, 4 +SADNXN4D 4, 16 +SADNXN4D 16, 4 +SADNXN4D 8, 32 +SADNXN4D 32, 8 +SADNXN4D 16, 64 +SADNXN4D 64, 16 +SADNXN4D 128, 128, 1 +SADNXN4D 128, 64, 1 +SADNXN4D 64, 128, 1 +SADNXN4D 64, 64, 1 +SADNXN4D 64, 32, 1 +SADNXN4D 32, 64, 1 +SADNXN4D 32, 32, 1 +SADNXN4D 32, 16, 1 +SADNXN4D 16, 32, 1 +SADNXN4D 16, 16, 1 +SADNXN4D 16, 8, 1 +SADNXN4D 8, 16, 1 +SADNXN4D 8, 8, 1 +SADNXN4D 8, 4, 1 +SADNXN4D 4, 8, 1 +SADNXN4D 4, 4, 1 +SADNXN4D 4, 16, 1 +SADNXN4D 16, 4, 1 +SADNXN4D 8, 32, 1 +SADNXN4D 32, 8, 1 +SADNXN4D 16, 64, 1 +SADNXN4D 64, 16, 1 diff --git a/libs/libaom/src/aom_dsp/x86/sad_avx2.c b/libs/libaom/src/aom_dsp/x86/sad_avx2.c new file mode 100644 index 000000000..a50dba64a --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/sad_avx2.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+
+#define FSAD64_H(h)                                                           \
+  unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride) { \
+    int i, res;                                                               \
+    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
+    __m256i sum_sad = _mm256_setzero_si256();                                 \
+    __m256i sum_sad_h;                                                        \
+    __m128i sum_sad128;                                                       \
+    for (i = 0; i < h; i++) {                                                 \
+      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
+      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
+      sad1_reg = _mm256_sad_epu8(                                             \
+          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
+      sad2_reg = _mm256_sad_epu8(                                             \
+          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
+      sum_sad =                                                               \
+          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
+      ref_ptr += ref_stride;                                                  \
+      src_ptr += src_stride;                                                  \
+    }                                                                         \
+    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
+    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
+    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
+    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
+    res = _mm_cvtsi128_si32(sum_sad128);                                      \
+    _mm256_zeroupper();                                                       \
+    return res;                                                               \
+  }
+
+#define FSAD32_H(h)                                                           \
+  unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride) { \
+    int i, res;                                                               \
+    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
+    __m256i sum_sad = _mm256_setzero_si256();                                 \
+    __m256i sum_sad_h;                                                        \
+    __m128i sum_sad128;                                                       \
+    int ref2_stride = ref_stride << 1;                                        \
+    int src2_stride = src_stride << 1;                                        \
+    int max = h >> 1;                                                         \
+    for (i = 0; i < max; i++) {                                               \
+      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
+      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+      sad1_reg = _mm256_sad_epu8(                                             \
+          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
+      sad2_reg = _mm256_sad_epu8(                                             \
+          ref2_reg,                                                           \
+          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
+      sum_sad =                                                               \
+          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
+      ref_ptr += ref2_stride;                                                 \
+      src_ptr += src2_stride;                                                 \
+    }                                                                         \
+    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
+    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
+    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
+    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
+    res = _mm_cvtsi128_si32(sum_sad128);                                      \
+    _mm256_zeroupper();                                                       \
+    return res;                                                               \
+  }
+
+#define FSAD64  \
+  FSAD64_H(64); \
+  FSAD64_H(32);
+
+#define FSAD32  \
+  FSAD32_H(64); \
+  FSAD32_H(32); \
+  FSAD32_H(16);
+
+/* clang-format off */
+FSAD64
+FSAD32
+/* clang-format on */
+
+#undef FSAD64
+#undef FSAD32
+#undef FSAD64_H
+#undef FSAD32_H
+
+#define FSADAVG64_H(h)                                                        \
+  unsigned int aom_sad64x##h##_avg_avx2(                                      \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride, const uint8_t *second_pred) {                           \
+    int i, res;                                                               \
+    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
+    __m256i sum_sad = _mm256_setzero_si256();                                 \
+    __m256i sum_sad_h;                                                        \
+    __m128i sum_sad128;                                                       \
+    for (i = 0; i < h; i++) {                                                 \
+      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
+      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
+      ref1_reg = _mm256_avg_epu8(                                             \
+          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
+      ref2_reg = _mm256_avg_epu8(                                             \
+          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+      sad1_reg = _mm256_sad_epu8(                                             \
+          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
+      sad2_reg = _mm256_sad_epu8(                                             \
+          ref2_reg,
_mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref_stride; \ + src_ptr += src_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSADAVG32_H(h) \ + unsigned int aom_sad32x##h##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0; i < max; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + ref1_reg = _mm256_avg_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref2_stride; \ + src_ptr += src2_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSADAVG64 \ + FSADAVG64_H(64); \ + FSADAVG64_H(32); + +#define FSADAVG32 \ + FSADAVG32_H(64); \ + FSADAVG32_H(32); \ + FSADAVG32_H(16); + +/* clang-format off */ +FSADAVG64 +FSADAVG32 +/* clang-format on */ + +#undef FSADAVG64 +#undef FSADAVG32 +#undef FSADAVG64_H +#undef FSADAVG32_H diff --git a/libs/libaom/src/aom_dsp/x86/sad_highbd_avx2.c b/libs/libaom/src/aom_dsp/x86/sad_highbd_avx2.c new file mode 100644 index 000000000..2cff2e6a9 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/sad_highbd_avx2.c @@ -0,0 +1,699 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_ports/mem.h"
+
+// SAD
+static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) {
+  // input 8 32-bit summation
+  __m128i lo128, hi128;
+  __m256i u = _mm256_srli_si256(*v, 8);
+  u = _mm256_add_epi32(u, *v);
+
+  // 4 32-bit summation
+  hi128 = _mm256_extracti128_si256(u, 1);
+  lo128 = _mm256_castsi256_si128(u);
+  lo128 = _mm_add_epi32(hi128, lo128);
+
+  // 2 32-bit summation
+  hi128 = _mm_srli_si128(lo128, 4);
+  lo128 = _mm_add_epi32(lo128, hi128);
+
+  return (unsigned int)_mm_cvtsi128_si32(lo128);
+}
+
+static INLINE void highbd_sad16x4_core_avx2(__m256i *s, __m256i *r,
+                                            __m256i *sad_acc) {
+  const __m256i zero = _mm256_setzero_si256();
+  int i;
+  for (i = 0; i < 4; i++) {
+    s[i] = _mm256_sub_epi16(s[i], r[i]);
+    s[i] = _mm256_abs_epi16(s[i]);
+  }
+
+  s[0] = _mm256_add_epi16(s[0], s[1]);
+  s[0] = _mm256_add_epi16(s[0], s[2]);
+  s[0] = _mm256_add_epi16(s[0], s[3]);
+
+  r[0] = _mm256_unpacklo_epi16(s[0], zero);
+  r[1] = _mm256_unpackhi_epi16(s[0], zero);
+
+  r[0] = _mm256_add_epi32(r[0], r[1]);
+  *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
+}
+
+// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD.
+static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride,
+                           const uint16_t *ref_ptr, int ref_stride,
+                           const uint16_t *sec_ptr, __m256i *sad_acc) {
+  __m256i s[4], r[4];
+  s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+  s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+  s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
+  s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
+
+  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+  r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
+  r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
+
+  if (sec_ptr) {
+    r[0] =
+        _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+    r[1] = _mm256_avg_epu16(
+        r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+    r[2] = _mm256_avg_epu16(
+        r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+    r[3] = _mm256_avg_epu16(
+        r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+  }
+  highbd_sad16x4_core_avx2(s, r, sad_acc);
+}
+
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad16xN_avx2(int N,
+                                                             const uint8_t *src,
+                                                             int src_stride,
+                                                             const uint8_t *ref,
+                                                             int ref_stride) {
+  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
+  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+  int i;
+  __m256i sad = _mm256_setzero_si256();
+  for (i = 0; i < N; i += 4) {
+    sad16x4(src_ptr, src_stride, ref_ptr, ref_stride, NULL, &sad);
+    src_ptr += src_stride << 2;
+    ref_ptr += ref_stride << 2;
+  }
+  return (unsigned int)get_sad_from_mm256_epi32(&sad);
+}
+
+static void sad32x4(const uint16_t *src_ptr, int src_stride,
+                    const uint16_t *ref_ptr, int ref_stride,
+                    const uint16_t *sec_ptr, __m256i *sad_acc) {
+  __m256i s[4], r[4];
+  int row_sections = 0;
+
+  while (row_sections < 2) {
+    s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+    s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+    s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+    s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
+
+    r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+    r[1] =
_mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); + + if (sec_ptr) { + r[0] = + _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + sec_ptr += 32 << 1; + } + highbd_sad16x4_core_avx2(s, r, sad_acc); + + row_sections += 1; + src_ptr += src_stride << 1; + ref_ptr += ref_stride << 1; + } +} + +static AOM_FORCE_INLINE unsigned int aom_highbd_sad32xN_avx2(int N, + const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + const int left_shift = 2; + int i; + + for (i = 0; i < N; i += 4) { + sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + } + return get_sad_from_mm256_epi32(&sad); +} + +static void sad64x2(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s[4], r[4]; + int i; + for (i = 0; i < 2; i++) { + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); + + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); + if (sec_ptr) { + r[0] = + _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + sec_ptr += 64; + } + highbd_sad16x4_core_avx2(s, r, sad_acc); + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +static AOM_FORCE_INLINE unsigned int aom_highbd_sad64xN_avx2(int N, + const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + const int left_shift = 1; + int i; + for (i = 0; i < N; i += 2) { + sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + } + return get_sad_from_mm256_epi32(&sad); +} + +static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s[4], r[4]; + int i; + for (i = 0; i < 2; i++) { + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i 
*)(ref_ptr + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); + if (sec_ptr) { + r[0] = + _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + sec_ptr += 64; + } + highbd_sad16x4_core_avx2(s, r, sad_acc); + src_ptr += 64; + ref_ptr += 64; + } +} + +static AOM_FORCE_INLINE unsigned int aom_highbd_sad128xN_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + int row = 0; + while (row < N) { + sad128x1(srcp, refp, NULL, &sad); + srcp += src_stride; + refp += ref_stride; + row++; + } + return get_sad_from_mm256_epi32(&sad); +} + +#define highbd_sadMxN_avx2(m, n) \ + unsigned int aom_highbd_sad##m##x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return aom_highbd_sad##m##xN_avx2(n, src, src_stride, ref, ref_stride); \ + } + +highbd_sadMxN_avx2(16, 4); +highbd_sadMxN_avx2(16, 8); +highbd_sadMxN_avx2(16, 16); +highbd_sadMxN_avx2(16, 32); +highbd_sadMxN_avx2(16, 64); + +highbd_sadMxN_avx2(32, 8); +highbd_sadMxN_avx2(32, 16); +highbd_sadMxN_avx2(32, 32); +highbd_sadMxN_avx2(32, 64); + +highbd_sadMxN_avx2(64, 16); +highbd_sadMxN_avx2(64, 32); +highbd_sadMxN_avx2(64, 64); +highbd_sadMxN_avx2(64, 128); + +highbd_sadMxN_avx2(128, 64); +highbd_sadMxN_avx2(128, 128); + +unsigned int aom_highbd_sad16x4_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); + + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); + + // Next 4 rows + srcp += src_stride << 2; + refp += ref_stride << 2; + secp += 64; + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 3; + uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 16 << left_shift; + sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 4; + uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + 
second_pred += 16 << left_shift; + sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad16x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 16 << left_shift; + sum += aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad32x8_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 2; + int row_section = 0; + + while (row_section < 2) { + sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 32 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 2; + int row_section = 0; + + while (row_section < 4) { + sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 32 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 4; + uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 32 << left_shift; + sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 32 << left_shift; + sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad64x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 1; + int row_section = 0; + + while (row_section < 8) { + sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 64 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int 
aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 1; + int row_section = 0; + + while (row_section < 16) { + sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 64 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 64 << left_shift; + sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 6; + uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 64 << left_shift; + sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + int row = 0; + while (row < 64) { + sad128x1(srcp, refp, secp, &sad); + srcp += src_stride; + refp += ref_stride; + secp += 16 << 3; + row += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + unsigned int sum; + const int left_shift = 6; + + sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 128 << left_shift; + sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +// SAD 4D +// Combine 4 __m256i input vectors v to uint32_t result[4] +static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v, + uint32_t *res) { + __m256i u0, u1, u2, u3; + const __m256i mask = yy_set1_64_from_32i(UINT32_MAX); + __m128i sad; + + // 8 32-bit summation + u0 = _mm256_srli_si256(v[0], 4); + u1 = _mm256_srli_si256(v[1], 4); + u2 = _mm256_srli_si256(v[2], 4); + u3 = _mm256_srli_si256(v[3], 4); + + u0 = _mm256_add_epi32(u0, v[0]); + u1 = _mm256_add_epi32(u1, v[1]); + u2 = _mm256_add_epi32(u2, v[2]); + u3 = _mm256_add_epi32(u3, v[3]); + + u0 = _mm256_and_si256(u0, mask); + u1 = _mm256_and_si256(u1, mask); + u2 = _mm256_and_si256(u2, mask); + u3 = _mm256_and_si256(u3, mask); + // 4 32-bit summation, evenly positioned + + u1 = _mm256_slli_si256(u1, 4); + u3 = _mm256_slli_si256(u3, 4); + + u0 = _mm256_or_si256(u0, u1); + u2 = _mm256_or_si256(u2, u3); + // 8 32-bit summation, interleaved + + 
u1 = _mm256_unpacklo_epi64(u0, u2); + u3 = _mm256_unpackhi_epi64(u0, u2); + + u0 = _mm256_add_epi32(u1, u3); + sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1), + _mm256_castsi256_si128(u0)); + _mm_storeu_si128((__m128i *)res, sad); +} + +static void convert_pointers(const uint8_t *const ref8[], + const uint16_t *ref[]) { + ref[0] = CONVERT_TO_SHORTPTR(ref8[0]); + ref[1] = CONVERT_TO_SHORTPTR(ref8[1]); + ref[2] = CONVERT_TO_SHORTPTR(ref8[2]); + ref[3] = CONVERT_TO_SHORTPTR(ref8[3]); +} + +static void init_sad(__m256i *s) { + s[0] = _mm256_setzero_si256(); + s[1] = _mm256_setzero_si256(); + s[2] = _mm256_setzero_si256(); + s[3] = _mm256_setzero_si256(); +} + +static AOM_FORCE_INLINE void aom_highbd_sad16xNx4d_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + const int shift_for_4_rows = 2; + int i, j; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + for (j = 0; j < N; j += 4) { + sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + srcp += src_stride << shift_for_4_rows; + refp[i] += ref_stride << shift_for_4_rows; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +static AOM_FORCE_INLINE void aom_highbd_sad32xNx4d_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + const int shift_for_4_rows = 2; + int i, r; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + for (r = 0; r < N; r += 4) { + sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + srcp += src_stride << shift_for_4_rows; + refp[i] += ref_stride << shift_for_4_rows; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +static AOM_FORCE_INLINE void aom_highbd_sad64xNx4d_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + const int shift_for_rows = 1; + int i, r; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + for (r = 0; r < N; r += 2) { + sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]); + srcp += src_stride << shift_for_rows; + refp[i] += ref_stride << shift_for_rows; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +static AOM_FORCE_INLINE void aom_highbd_sad128xNx4d_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + int i, r; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + for (r = 0; r < N; r++) { + sad128x1(srcp, refp[i], NULL, &sad_vec[i]); + srcp += src_stride; + refp[i] += ref_stride; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +#define highbd_sadMxNx4d_avx2(m, n) \ + void aom_highbd_sad##m##x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \ + int ref_stride, 
uint32_t *sad_array) {                                                    \
+    aom_highbd_sad##m##xNx4d_avx2(n, src, src_stride, ref_array, ref_stride, \
+                                  sad_array);                                \
+  }
+
+highbd_sadMxNx4d_avx2(16, 4);
+highbd_sadMxNx4d_avx2(16, 8);
+highbd_sadMxNx4d_avx2(16, 16);
+highbd_sadMxNx4d_avx2(16, 32);
+highbd_sadMxNx4d_avx2(16, 64);
+
+highbd_sadMxNx4d_avx2(32, 8);
+highbd_sadMxNx4d_avx2(32, 16);
+highbd_sadMxNx4d_avx2(32, 32);
+highbd_sadMxNx4d_avx2(32, 64);
+
+highbd_sadMxNx4d_avx2(64, 16);
+highbd_sadMxNx4d_avx2(64, 32);
+highbd_sadMxNx4d_avx2(64, 64);
+highbd_sadMxNx4d_avx2(64, 128);
+
+highbd_sadMxNx4d_avx2(128, 64);
+highbd_sadMxNx4d_avx2(128, 128);
diff --git a/libs/libaom/src/aom_dsp/x86/sad_impl_avx2.c b/libs/libaom/src/aom_dsp/x86/sad_impl_avx2.c
new file mode 100644
index 000000000..f77a585b4
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/sad_impl_avx2.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr, int ref_stride) {
+  __m256i s1, s2, r1, r2;
+  __m256i sum = _mm256_setzero_si256();
+  __m128i sum_i128;
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    r1 = _mm256_loadu_si256((__m256i const *)ref_ptr);
+    r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+    s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr));
+    s2 = _mm256_sad_epu8(
+        r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+    sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2));
+    ref_ptr += ref_stride << 1;
+    src_ptr += src_stride << 1;
+  }
+
+  sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8));
+  sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1),
+                           _mm256_castsi256_si128(sum));
+  return _mm_cvtsi128_si32(sum_i128);
+}
+
+static unsigned int sad64x32(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr, int ref_stride) {
+  unsigned int half_width = 32;
+  uint32_t sum = sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
+  src_ptr += half_width;
+  ref_ptr += half_width;
+  sum += sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
+  return sum;
+}
+
+static unsigned int sad64x64(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr, int ref_stride) {
+  uint32_t sum = sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
+  src_ptr += src_stride << 5;
+  ref_ptr += ref_stride << 5;
+  sum += sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
+  return sum;
+}
+
+unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride) {
+  unsigned int half_width = 64;
+  uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+  src_ptr += half_width;
+  ref_ptr += half_width;
+  sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+  return sum;
+}
+
+unsigned int aom_sad64x128_avx2(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride) {
+  uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+  src_ptr += src_stride << 6;
+  ref_ptr += ref_stride << 6;
+  sum +=
sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + uint32_t sum = aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int h, const uint8_t *second_pred, + const int second_pred_stride) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + for (i = 0; i < h; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); + ref1_reg = _mm256_avg_epu8( + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); + ref2_reg = _mm256_avg_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref_stride; + src_ptr += src_stride; + second_pred += second_pred_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + + return res; +} + +unsigned int aom_sad64x128_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 64); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + second_pred += 64 << 6; + sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 64); + return sum; +} + +unsigned int aom_sad128x64_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + unsigned int half_width = 64; + uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 128); + src_ptr += half_width; + ref_ptr += half_width; + second_pred += half_width; + sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 128); + return sum; +} + +unsigned int aom_sad128x128_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + uint32_t sum = aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, + ref_stride, second_pred); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + second_pred += 128 << 6; + sum += aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, + second_pred); + return sum; +} diff --git a/libs/libaom/src/aom_dsp/x86/sad_sse2.asm b/libs/libaom/src/aom_dsp/x86/sad_sse2.asm new file mode 100644 index 000000000..3251b7655 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/sad_sse2.asm @@ -0,0 +1,353 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro SAD_FN 4 +%if %4 == 0 +%if %3 == 5 +cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%else ; avg +%if %3 == 5 +cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; avg/sad + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +%endmacro + +; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD128XN 1-2 0 + SAD_FN 128, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + + movu m1, [refq+64] + movu m2, [refq+80] + movu m3, [refq+96] + movu m4, [refq+112] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*4] + pavgb m2, [second_predq+mmsize*5] + pavgb m3, [second_predq+mmsize*6] + pavgb m4, [second_predq+mmsize*7] + lea second_predq, [second_predq+mmsize*8] +%endif + psadbw m1, [srcq+64] + psadbw m2, [srcq+80] + psadbw m3, [srcq+96] + psadbw m4, [srcq+112] + + add refq, ref_strideq + add srcq, src_strideq + + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + + sub n_rowsd, 1 + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD128XN 128 ; sad128x128_sse2 +SAD128XN 128, 1 ; sad128x128_avg_sse2 +SAD128XN 64 ; sad128x64_sse2 +SAD128XN 64, 1 ; sad128x64_avg_sse2 + + +; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD64XN 1-2 0 + SAD_FN 64, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + paddd m1, m2 + paddd m3, m4 + add refq, ref_strideq + paddd m0, m1 + add srcq, src_strideq + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD64XN 128 ; sad64x128_sse2 +SAD64XN 128, 1 ; sad64x128_avg_sse2 +SAD64XN 64 ; sad64x64_sse2 +SAD64XN 32 ; sad64x32_sse2 +SAD64XN 64, 1 ; sad64x64_avg_sse2 +SAD64XN 32, 1 ; 
sad64x32_avg_sse2 +SAD64XN 16 ; sad64x16_sse2 +SAD64XN 16, 1 ; sad64x16_avg_sse2 + +; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD32XN 1-2 0 + SAD_FN 32, %1, 5, %2 + mov n_rowsd, %1/2 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq] + movu m4, [refq+ref_strideq+16] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+src_strideq] + psadbw m4, [srcq+src_strideq+16] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD32XN 64 ; sad32x64_sse2 +SAD32XN 32 ; sad32x32_sse2 +SAD32XN 16 ; sad32x16_sse2 +SAD32XN 64, 1 ; sad32x64_avg_sse2 +SAD32XN 32, 1 ; sad32x32_avg_sse2 +SAD32XN 16, 1 ; sad32x16_avg_sse2 +SAD32XN 8 ; sad_32x8_sse2 +SAD32XN 8, 1 ; sad_32x8_avg_sse2 + +; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD16XN 1-2 0 + SAD_FN 16, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+src_strideq] + psadbw m3, [srcq+src_strideq*2] + psadbw m4, [srcq+src_stride3q] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD16XN 32 ; sad16x32_sse2 +SAD16XN 16 ; sad16x16_sse2 +SAD16XN 8 ; sad16x8_sse2 +SAD16XN 32, 1 ; sad16x32_avg_sse2 +SAD16XN 16, 1 ; sad16x16_avg_sse2 +SAD16XN 8, 1 ; sad16x8_avg_sse2 +SAD16XN 4 ; sad_16x4_sse2 +SAD16XN 4, 1 ; sad_16x4_avg_sse2 +SAD16XN 64 ; sad_16x64_sse2 +SAD16XN 64, 1 ; sad_16x64_avg_sse2 + +; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD8XN 1-2 0 + SAD_FN 8, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movh m1, [refq] + movhps m1, [refq+ref_strideq] + movh m2, [refq+ref_strideq*2] + movhps m2, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + lea second_predq, [second_predq+mmsize*2] +%endif + movh m3, [srcq] + movhps m3, [srcq+src_strideq] + movh m4, [srcq+src_strideq*2] + movhps m4, [srcq+src_stride3q] + psadbw m1, m3 + psadbw m2, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m2 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD8XN 16 ; sad8x16_sse2 +SAD8XN 8 ; sad8x8_sse2 +SAD8XN 4 ; sad8x4_sse2 +SAD8XN 16, 1 ; sad8x16_avg_sse2 +SAD8XN 8, 1 ; sad8x8_avg_sse2 +SAD8XN 4, 1 ; sad8x4_avg_sse2 +SAD8XN 32 ; sad_8x32_sse2 +SAD8XN 32, 1 ; sad_8x32_avg_sse2 + +; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD4XN 1-2 0 + SAD_FN 4, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + 
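+; Each iteration gathers four 4-byte rows into a single xmm register for
+; ref (m1) and src (m2), so one psadbw covers a whole 4x4 block.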
+.loop:
+  movd                  m1, [refq]
+  movd                  m2, [refq+ref_strideq]
+  movd                  m3, [refq+ref_strideq*2]
+  movd                  m4, [refq+ref_stride3q]
+  punpckldq             m1, m2
+  punpckldq             m3, m4
+  movlhps               m1, m3
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]
+  lea         second_predq, [second_predq+mmsize*1]
+%endif
+  movd                  m2, [srcq]
+  movd                  m5, [srcq+src_strideq]
+  movd                  m4, [srcq+src_strideq*2]
+  movd                  m3, [srcq+src_stride3q]
+  punpckldq             m2, m5
+  punpckldq             m4, m3
+  movlhps               m2, m4
+  psadbw                m1, m2
+  lea                 refq, [refq+ref_strideq*4]
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*4]
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0
+  paddd                 m0, m1
+  movd                 eax, m0
+  RET
+%endmacro
+
+INIT_XMM sse2
+SAD4XN  8    ; sad4x8_sse2
+SAD4XN  4    ; sad4x4_sse2
+SAD4XN  8, 1 ; sad4x8_avg_sse2
+SAD4XN  4, 1 ; sad4x4_avg_sse2
+SAD4XN 16    ; sad4x16_sse2
+SAD4XN 16, 1 ; sad4x16_avg_sse2
diff --git a/libs/libaom/src/aom_dsp/x86/sse_avx2.c b/libs/libaom/src/aom_dsp/x86/sse_avx2.c
new file mode 100644
index 000000000..e6ee2fcab
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/sse_avx2.c
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
+                                const uint8_t *b) {
+  const __m256i v_a0 = yy_loadu_256(a);
+  const __m256i v_b0 = yy_loadu_256(b);
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero);
+  const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero);
+  const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero);
+  const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero);
+  const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
+  const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
+  int64_t sum;
+  __m256i zero = _mm256_setzero_si256();
+  const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero);
+  const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero);
+  const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+  const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+                                         _mm256_extracti128_si256(sum_4x64, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) {
+  const __m256i sum0_4x64 =
+      _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32));
+  const __m256i sum1_4x64 =
+      _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1));
+  const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+  *sum = _mm256_add_epi64(*sum, sum_4x64);
+}
+
+static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) {
+  int64_t sum;
+  const __m128i sum_2x64 =
_mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + + xx_storel_64(&sum, sum_1x64); + return sum; +} +#endif + +static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = xx_loadl_32(a); + const __m128i v_a1 = xx_loadl_32(a + a_stride); + const __m128i v_a2 = xx_loadl_32(a + a_stride * 2); + const __m128i v_a3 = xx_loadl_32(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_32(b); + const __m128i v_b1 = xx_loadl_32(b + b_stride); + const __m128i v_b2 = xx_loadl_32(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_32(b + b_stride * 3); + const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1), + _mm_unpacklo_epi32(v_a2, v_a3)); + const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1), + _mm_unpacklo_epi32(v_b2, v_b3)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} +static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); + const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} +int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + __m256i sum = _mm256_setzero_si256(); + __m256i zero = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + sse_w4x4_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + sse_w8x2_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + const __m128i v_a0 = xx_loadu_128(a); + const __m128i v_a1 = xx_loadu_128(a + a_stride); + const __m128i v_b0 = xx_loadu_128(b); + const __m128i v_b1 = xx_loadu_128(b + b_stride); + const __m256i v_a = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01); + const __m256i v_b = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01); + const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero); + const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero); + const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero); + const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero); + const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl); + const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu); + const __m256i temp = + _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub), + _mm256_madd_epi16(v_bsub, v_bsub)); + sum = _mm256_add_epi32(sum, temp); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + sse_w32_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { 
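+        // With 8-bit inputs a 16-bit difference is at most 255, so one
+        // _mm256_madd_epi16 adds at most 2 * 255^2 (about 2^17) per 32-bit
+        // lane; even a 128x128 block stays well below 2^31, and a single
+        // 32-bit accumulator covers the whole block. A 64-wide row is
+        // simply processed as two 32-wide halves.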
+ sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 128: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + sse_w32_avx2(&sum, a + 64, b + 64); + sse_w32_avx2(&sum, a + 96, b + 96); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: + if ((width & 0x07) == 0) { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + const uint8_t *a2 = a + i + (a_stride << 1); + const uint8_t *b2 = b + i + (b_stride << 1); + sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } + sse = summary_all_avx2(&sum); + break; + } + + return sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, + const uint16_t *b) { + const __m256i v_a_w = yy_loadu_256(a); + const __m256i v_b_w = yy_loadu_256(b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_a2 = xx_loadl_64(a + a_stride * 2); + const __m128i v_a3 = xx_loadl_64(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_b2 = xx_loadl_64(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_64(b + b_stride * 3); + const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1), + _mm_unpacklo_epi64(v_a2, v_a3)); + const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1), + _mm_unpacklo_epi64(v_b2, v_b3)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m256i v_a_w = yy_loadu2_128(a + a_stride, a); + const __m256i v_b_w = yy_loadu2_128(b + b_stride, b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} +int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + highbd_sse_w16_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + 
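+      // The wider cases below differ from the 8-bit path: with inputs of up
+      // to 12 bits one madd result approaches 2 * 4095^2 (about 2^25) per
+      // 32-bit lane, so partial sums are kept in a local sum32 and widened
+      // into the 64-bit accumulator (summary_32_avx2) every 64/32/16 rows,
+      // just before the unsigned 32-bit lanes could wrap.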
break; + case 32: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16, b + 16); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 64; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + case 64: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 32; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + case 128: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4); + highbd_sse_w16_avx2(&sum32, a + 16 * 5, b + 16 * 5); + highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6); + highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 16; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + default: + if (width & 0x7) { + do { + int i = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + const uint16_t *a2 = a + i + (a_stride << 1); + const uint16_t *b2 = b + i + (b_stride << 1); + highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride); + summary_32_avx2(&sum32, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } else { + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + int i = 0; + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + l += 2; + } while (l < 8 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 8; + } while (y < height); + } + sse = summary_4x64_avx2(sum); + break; + } + return sse; +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/x86/sse_sse4.c b/libs/libaom/src/aom_dsp/x86/sse_sse4.c new file mode 100644 index 000000000..5f95eb9ae --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/sse_sse4.c @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
+  int64_t sum;
+  const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
+  const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
+  const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) {
+  const __m128i sum0 = _mm_cvtepu32_epi64(*sum32);
+  const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8));
+  *sum64 = _mm_add_epi64(sum0, *sum64);
+  *sum64 = _mm_add_epi64(sum1, *sum64);
+}
+#endif
+
+static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
+                                  const uint8_t *b) {
+  const __m128i v_a0 = xx_loadu_128(a);
+  const __m128i v_b0 = xx_loadu_128(b);
+  const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
+  const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
+  const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
+  const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
+  const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
+  const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, __m128i *sum) {
+  const __m128i v_a0 = xx_loadl_32(a);
+  const __m128i v_a1 = xx_loadl_32(a + a_stride);
+  const __m128i v_b0 = xx_loadl_32(b);
+  const __m128i v_b1 = xx_loadl_32(b + b_stride);
+  const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
+  const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
+  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b,
+                               __m128i *sum) {
+  const __m128i v_a0 = xx_loadl_64(a);
+  const __m128i v_b0 = xx_loadl_64(b);
+  const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
+  const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
+  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
+                       int b_stride, int width, int height) {
+  int y = 0;
+  int64_t sse = 0;
+  __m128i sum = _mm_setzero_si128();
+  switch (width) {
+    case 4:
+      do {
+        sse4x2_sse4_1(a, a_stride, b, b_stride, &sum);
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 8:
+      do {
+        sse8_sse4_1(a, b, &sum);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 16:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 32:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        sse_w16_sse4_1(&sum, a + 16, b + 16);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 64:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+        sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+        sse_w16_sse4_1(&sum, a + 16 *
3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 128: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4); + sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5); + sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6); + sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: + if (width & 0x07) { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + } + sse = summary_all_sse4(&sum); + break; + } + + return sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, + const uint16_t *b) { + const __m128i v_a_w = xx_loadu_128(a); + const __m128i v_b_w = xx_loadu_128(b); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 64; + } while (y < height); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 32: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 32; + } while (y < height); + xx_storel_64(&sse, _mm_add_epi64(sum, 
_mm_srli_si128(sum, 8))); + break; + case 64: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 16; + } while (y < height); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 128: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + default: + if (width & 0x7) { + do { + __m128i sum32 = _mm_setzero_si128(); + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + summary_32_sse4(&sum32, &sum); + } while (y < height); + } else { + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + } + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + } + return sse; +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm b/libs/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm new file mode 100644 index 000000000..6d9b5a12f --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm @@ -0,0 +1,222 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+    paddusw         xmm15, xmm3  ; sum_s
+    paddusw         xmm14, xmm4  ; sum_r
+    movdqa          xmm1, xmm3
+    pmaddwd         xmm1, xmm1
+    paddd           xmm13, xmm1  ; sum_sq_s
+    movdqa          xmm2, xmm4
+    pmaddwd         xmm2, xmm2
+    paddd           xmm12, xmm2  ; sum_sq_r
+    pmaddwd         xmm3, xmm4
+    paddd           xmm11, xmm3  ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+    movdqa          xmm2,%1
+    punpckldq       %1,xmm0
+    punpckhdq       xmm2,xmm0
+    paddq           %1,xmm2
+    movdqa          xmm2,%1
+    punpcklqdq      %1,xmm0
+    punpckhqdq      xmm2,xmm0
+    paddq           %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with words
+%macro SUM_ACROSS_W 1
+    movdqa          xmm1, %1
+    punpcklwd       %1,xmm0
+    punpckhwd       xmm1,xmm0
+    paddd           %1, xmm1
+    SUM_ACROSS_Q    %1
+%endmacro
+
+SECTION .text
+
+;void aom_ssim_parms_16x16_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    uint32_t *sum_s,
+;    uint32_t *sum_r,
+;    uint32_t *sum_sq_s,
+;    uint32_t *sum_sq_r,
+;    uint32_t *sum_sxr);
+;
+; TODO: Use param passing through a structure; we probably don't need the
+; pxors (the calling app will initialize to 0). We could easily fit
+; everything in sse2 without too much hassle, and can probably do better
+; estimates with psadbw or pavgb. At this point this is just meant to be a
+; first pass for calculating all the params needed for 16x16 ssim so we can
+; play with dssim as distortion in mode selection code.
+global sym(aom_ssim_parms_16x16_sse2) PRIVATE
+sym(aom_ssim_parms_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rsi,        arg(0) ;s
+    mov         rcx,        arg(1) ;sp
+    mov         rdi,        arg(2) ;r
+    mov         rax,        arg(3) ;rp
+
+    pxor        xmm0, xmm0
+    pxor        xmm15,xmm15  ;sum_s
+    pxor        xmm14,xmm14  ;sum_r
+    pxor        xmm13,xmm13  ;sum_sq_s
+    pxor        xmm12,xmm12  ;sum_sq_r
+    pxor        xmm11,xmm11  ;sum_sxr
+
+    mov         rdx, 16      ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movdqu      xmm5, [rsi]
+    movdqu      xmm6, [rdi]
+    movdqa      xmm3, xmm5
+    movdqa      xmm4, xmm6
+    punpckhbw   xmm3, xmm0 ; high_s
+    punpckhbw   xmm4, xmm0 ; high_r
+
+    TABULATE_SSIM
+
+    movdqa      xmm3, xmm5
+    movdqa      xmm4, xmm6
+    punpcklbw   xmm3, xmm0 ; low_s
+    punpcklbw   xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add         rsi, rcx   ; next s row
+    add         rdi, rax   ; next r row
+
+    dec         rdx        ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W xmm15
+    SUM_ACROSS_W xmm14
+    SUM_ACROSS_Q xmm13
+    SUM_ACROSS_Q xmm12
+    SUM_ACROSS_Q xmm11
+
+    mov         rdi,arg(4)
+    movd        [rdi], xmm15;
+    mov         rdi,arg(5)
+    movd        [rdi], xmm14;
+    mov         rdi,arg(6)
+    movd        [rdi], xmm13;
+    mov         rdi,arg(7)
+    movd        [rdi], xmm12;
+    mov         rdi,arg(8)
+    movd        [rdi], xmm11;
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void aom_ssim_parms_8x8_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    uint32_t *sum_s,
+;    uint32_t *sum_r,
+;    uint32_t *sum_sq_s,
+;    uint32_t *sum_sq_r,
+;    uint32_t *sum_sxr);
+;
+; TODO: Use param passing through a structure; we probably don't need the
+; pxors (the calling app will initialize to 0). We could easily fit
+; everything in sse2 without too much hassle, and can probably do better
+; estimates with psadbw or pavgb. At this point this is just meant to be a
+; first pass for calculating all the params needed for 8x8 ssim so we can
+; play with dssim as distortion in mode selection code.
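+; As a scalar reference, the routine below accumulates, over every pixel
+; pair (s, r) of the 8x8 block:
+;   sum_s += s; sum_r += r; sum_sq_s += s*s; sum_sq_r += r*r; sum_sxr += s*r;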
+global sym(aom_ssim_parms_8x8_sse2) PRIVATE
+sym(aom_ssim_parms_8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rsi,        arg(0) ;s
+    mov         rcx,        arg(1) ;sp
+    mov         rdi,        arg(2) ;r
+    mov         rax,        arg(3) ;rp
+
+    pxor        xmm0, xmm0
+    pxor        xmm15,xmm15  ;sum_s
+    pxor        xmm14,xmm14  ;sum_r
+    pxor        xmm13,xmm13  ;sum_sq_s
+    pxor        xmm12,xmm12  ;sum_sq_r
+    pxor        xmm11,xmm11  ;sum_sxr
+
+    mov         rdx, 8       ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movq        xmm3, [rsi]
+    movq        xmm4, [rdi]
+    punpcklbw   xmm3, xmm0 ; low_s
+    punpcklbw   xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add         rsi, rcx   ; next s row
+    add         rdi, rax   ; next r row
+
+    dec         rdx        ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W xmm15
+    SUM_ACROSS_W xmm14
+    SUM_ACROSS_Q xmm13
+    SUM_ACROSS_Q xmm12
+    SUM_ACROSS_Q xmm11
+
+    mov         rdi,arg(4)
+    movd        [rdi], xmm15;
+    mov         rdi,arg(5)
+    movd        [rdi], xmm14;
+    mov         rdi,arg(6)
+    movd        [rdi], xmm13;
+    mov         rdi,arg(7)
+    movd        [rdi], xmm12;
+    mov         rdi,arg(8)
+    movd        [rdi], xmm11;
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libaom/src/aom_dsp/x86/subpel_variance_sse2.asm b/libs/libaom/src/aom_dsp/x86/subpel_variance_sse2.asm
new file mode 100644
index 000000000..cbf28901b
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/subpel_variance_sse2.asm
@@ -0,0 +1,1470 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times  8 dw  8
+bilin_filter_m_sse2: times  8 dw 16
+                     times  8 dw  0
+                     times  8 dw 14
+                     times  8 dw  2
+                     times  8 dw 12
+                     times  8 dw  4
+                     times  8 dw 10
+                     times  8 dw  6
+                     times 16 dw  8
+                     times  8 dw  6
+                     times  8 dw 10
+                     times  8 dw  4
+                     times  8 dw 12
+                     times  8 dw  2
+                     times  8 dw 14
+
+bilin_filter_m_ssse3: times  8 db 16,  0
+                      times  8 db 14,  2
+                      times  8 db 12,  4
+                      times  8 db 10,  6
+                      times 16 db  8
+                      times  8 db  6, 10
+                      times  8 db  4, 12
+                      times  8 db  2, 14
+
+SECTION .text
+
+; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+;                               int x_offset, int y_offset,
+;                               const uint8_t *dst, ptrdiff_t dst_stride,
+;                               int height, unsigned int *sse);
+;
+; This function returns the sum of errors (SE) and stores the sum of squared
+; errors (SSE) in the given pointer.
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+  psubw                %3, %4
+  psubw                %1, %2
+  paddw                %5, %3
+  pmaddwd              %3, %3
+  paddw                %5, %1
+  pmaddwd              %1, %1
+  paddd                %6, %3
+  paddd                %6, %1
+%endmacro
+
+%macro STORE_AND_RET 1
+%if %1 > 4
+  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+  ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
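+  ; pcmpgtw against the zero register m5 yields 0xffff exactly in the words
+  ; where the sum in m6 is negative, i.e. the upper half of each
+  ; sign-extended dword; the punpcklwd/punpckhwd below interleave m6 with
+  ; that mask to widen word -> dword before the horizontal adds.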
+ pcmpgtw m5, m6 ; mask for 0 > x + movhlps m3, m7 + punpcklwd m4, m6, m5 + punpckhwd m6, m5 ; sign-extend m6 word->dword + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + pshufd m4, m6, 0x1 + movd [r1], m7 ; store sse + paddd m6, m4 + movd raxd, m6 ; store sum as return value +%else ; 4xh + pshuflw m4, m6, 0xe + pshuflw m3, m7, 0xe + paddw m6, m4 + paddd m7, m3 + pcmpgtw m5, m6 ; mask for 0 > x + mov r1, ssem ; r1 = unsigned int *sse + punpcklwd m6, m5 ; sign-extend m6 word->dword + movd [r1], m7 ; store sse + pshuflw m4, m6, 0xe + paddd m6, m4 + movd raxd, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro INC_SRC_BY_SRC_STRIDE 0 +%if ARCH_X86=1 && CONFIG_PIC=1 + add srcq, src_stridemp +%else + add srcq, src_strideq +%endif +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%if cpuflag(ssse3) +%define bilin_filter_m bilin_filter_m_ssse3 +%define filter_idx_shift 4 +%else +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 +%endif +; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses +; 11, not 13, if the registers are ordered correctly. May make a minor speed +; difference on Win64 + +%if ARCH_X86_64 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %endif + %define block_height heightd + %define bilin_filter sseq +%else + %if CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %define block_height heightd + %endif + + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm + + ;Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, sec, sec_stride, \ + height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %define block_height heightd + %endif + %define bilin_filter bilin_filter_m + %endif +%endif + +%if %1 == 4 + %define movx movd +%else + %define movx movh +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we + ; could perhaps use it for something more productive then + pxor m5, m5 ; dedicated zero register +%if %1 < 16 + sar block_height, 1 +%if %2 == 1 ; avg + shl sec_str, 1 +%endif +%endif + + ; FIXME(rbultje) replace by jumptable? 
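+  ; Dispatch: x_offset and y_offset are subpel positions in 1/8-pel units.
+  ; Each one is either 0 (no filtering), 4 (an exact half, done with pavgb)
+  ; or anything else (a true bilinear filter), giving the 3x3 tree of
+  ; specialized loops below.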
+ test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + mova m1, [dstq] +%if %2 == 1 ; avg + pavgb m0, [secq] + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + +%if %2 == 0 ; !avg + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 +%endif +%else ; !avg + movx m2, [srcq+src_strideq] +%endif + + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + +%if %2 == 1 ; avg +%if %1 > 4 + pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_zero_loop + STORE_AND_RET %1 + +.x_zero_y_nonzero: + cmp y_offsetd, 4 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq*2] +%else ; 4xh + movx m1, [srcq+src_strideq*2] + punpckldq m2, m1 +%endif + movx m1, [dstq] +%if %1 > 4 + movlhps m0, m2 +%else ; 4xh + punpckldq m0, m2 +%endif + movx m3, [dstq+dst_strideq] + pavgb m0, m2 + punpcklbw m1, m5 +%if %1 > 4 + pavgb m0, [secq] + punpcklbw m3, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m4, [secq] + pavgb m0, m4 + punpcklbw m3, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq*2] + movx m1, [dstq] + pavgb m0, m2 + movx m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_half_loop + STORE_AND_RET %1 + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] 
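+; (filter_y_a holds the 8 taps for the nearer source row, 16-2*off, and
+; filter_y_b the taps for the farther row, 2*off; each output pixel is
+; (a*row0 + b*row1 + 8) >> 4.)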
+%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq*2] + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movx m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m4, filter_y_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonzero: + cmp x_offsetd, 4 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m4, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] + movhps m4, [srcq+src_strideq+1] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 + movx m2, [srcq+src_strideq+1] + punpckldq m4, m2 +%endif + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + pavgb m0, m4 + punpcklbw m3, m5 +%if %1 > 4 + pavgb m0, [secq] + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m1, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m2, 
[srcq+src_strideq] + movx m1, [dstq] + pavgb m0, m4 + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_zero_loop + STORE_AND_RET %1 + +.x_half_y_nonzero: + cmp y_offsetd, 4 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + pavgb m4, m3 + punpckhbw m3, m1, m5 + pavgb m0, m4 +%if %2 == 1 ; avg + punpcklbw m1, m5 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movx m2, [srcq] + movx m3, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq] + movhps m3, [srcq+src_strideq+1] +%else + movx m1, [srcq+src_strideq] + punpckldq m2, m1 + movx m1, [srcq+src_strideq+1] + punpckldq m3, m1 +%endif + pavgb m2, m3 +%if %1 > 4 + movlhps m0, m2 + movhlps m4, m2 +%else ; 4xh + punpckldq m0, m2 + pshuflw m4, m2, 0xe +%endif + movx m1, [dstq] + pavgb m0, m2 + movx m3, [dstq+dst_strideq] +%if %1 > 4 + pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq] + movx m1, [srcq+src_strideq+1] + pavgb m2, m3 + pavgb m4, m1 + pavgb m0, m2 + pavgb m2, m4 + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_half_loop + STORE_AND_RET %1 + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ;x86_32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. 
We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_other_loop: + movu m4, [srcq] + movu m2, [srcq+1] + mova m1, [dstq] + pavgb m4, m2 +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + punpcklbw m0, m5 + paddw m2, m3 + punpcklbw m3, m4, m5 + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 +%endif + punpckhbw m3, m1, m5 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +%if notcpuflag(ssse3) + punpcklbw m0, m5 +%endif +.x_half_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] + pavgb m2, m1 + pavgb m4, m3 + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movx m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + paddw m0, m1 + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m2, m1 + movx m1, [dstq] +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +;y_offset == 0. We can reuse y_offset reg. 
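+; On 32-bit PIC builds the address of bilin_filter_m was stashed in the
+; reused argument slot g_bilin_filterm during the prologue; adding it to the
+; pre-shifted offset register turns that register into a direct pointer to
+; the selected filter taps (filter_x_a, with filter_x_b 16 bytes beyond).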
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + pmullw m0, filter_x_a + pmullw m4, filter_x_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + punpcklbw m0, m1 + movx m1, [dstq] + punpcklbw m2, m4 + pmaddubsw m0, filter_x_a + pmaddubsw m2, filter_x_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 4 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. 
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] +%if cpuflag(ssse3) + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + pavgb m0, m4 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%else + punpckhbw m2, m4, m5 + punpckhbw m1, m3, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + paddw m4, m3 + paddw m2, m1 + mova m1, [dstq] + psraw m4, 4 + psraw m2, 4 + punpckhbw m3, m1, m5 + ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we + ; have a 1-register shortage to be able to store the backup of the bilin + ; filtered second line as words as cache for the next line. Packing into + ; a byte costs 1 pack and 2 unpacks, but saves a register. 
+ packuswb m4, m2 + punpcklbw m1, m5 + pavgb m0, m4 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + pavgb m0, [secq] +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + add srcq, src_strideq + psraw m0, 4 +.x_other_y_half_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + paddw m2, filter_rnd + paddw m4, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + movx m1, [dstq] + paddw m4, m3 + movx m3, [dstq+dst_strideq] +%endif + psraw m2, 4 + psraw m4, 4 + pavgw m0, m2 + pavgw m2, m4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline - also consider going to bytes here +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonhalf: +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m11, [bilin_filter+y_offsetq+16] +%endif + mova m12, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. 
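+; This is also why INC_SRC_BY_SRC_STRIDE exists: once src_strideq doubles as
+; the temporary here, the stride itself has to be reloaded from its stack
+; slot (src_stridemp) every time the source pointer is advanced.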
+%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + + INC_SRC_BY_SRC_STRIDE + + packuswb m0, m2 +.x_other_y_other_loop: +%if cpuflag(ssse3) + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + punpckhbw m3, m1, m5 + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + punpcklbw m1, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 + psraw m0, 4 +%else + movu m3, [srcq] + movu m4, [srcq+1] + punpckhbw m1, m3, m5 + punpckhbw m2, m4, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m3, filter_x_a + pmullw m4, filter_x_b + paddw m3, filter_rnd + pmullw m1, filter_x_a + pmullw m2, filter_x_b + paddw m1, filter_rnd + paddw m3, m4 + paddw m1, m2 + psraw m3, 4 + psraw m1, 4 + packuswb m4, m3, m1 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + pmullw m2, filter_y_a + pmullw m1, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m2, m1 + mova m1, [dstq] + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 + punpckhbw m3, m1, m5 + psraw m0, 4 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + psraw m0, 4 +%if cpuflag(ssse3) + packuswb m0, m0 +%endif + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + + INC_SRC_BY_SRC_STRIDE + movx m4, [srcq] + movx m3, [srcq+1] + +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m3, [dstq+dst_strideq] + movx m1, [dstq] + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m2, m2 + packuswb m4, m4 + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd + psraw m0, 4 + psraw m2, 4 + punpcklbw m1, m5 +%else + punpcklbw 
m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + paddw m4, m3 + psraw m2, 4 + psraw m4, 4 + pmullw m0, filter_y_a + pmullw m3, m2, filter_y_b + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m0, m3 + movx m3, [dstq+dst_strideq] + paddw m2, m1 + movx m1, [dstq] + psraw m0, 4 + psraw m2, 4 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd +%undef movx + STORE_AND_RET %1 +%endmacro + +; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical +; between the ssse3 and non-ssse3 version. It may make sense to merge their +; code in the sense that the ssse3 version would jump to the appropriate +; location in the sse/2 version, rather than duplicating that code in the +; binary. + +INIT_XMM sse2 +SUBPEL_VARIANCE 4 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM sse2 +SUBPEL_VARIANCE 4, 1 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4, 1 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/libs/libaom/src/aom_dsp/x86/subtract_avx2.c b/libs/libaom/src/aom_dsp/x86/subtract_avx2.c new file mode 100644 index 000000000..40831600a --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/subtract_avx2.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
+                                   const uint8_t *pred_ptr) {
+  __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr));
+  __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr));
+  __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
+  __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
+  __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
+  __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
+  const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+  const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
+  _mm256_store_si256((__m256i *)(diff_ptr), d_0);
+  _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1);
+}
+
+static INLINE void subtract_block_16xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t j = 0; j < rows; ++j) {
+    __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr));
+    __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr));
+    __m256i s_0 = _mm256_cvtepu8_epi16(s);
+    __m256i p_0 = _mm256_cvtepu8_epi16(p);
+    const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+    _mm256_store_si256((__m256i *)(diff_ptr), d_0);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+static INLINE void subtract_block_32xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t j = 0; j < rows; ++j) {
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+static INLINE void subtract_block_64xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t j = 0; j < rows; ++j) {
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+static INLINE void subtract_block_128xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t j = 0; j < rows; ++j) {
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+    subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64);
+    subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+                             ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                             ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                             ptrdiff_t pred_stride) {
+  switch (cols) {
+    case 16:
+      subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                               src_stride, pred_ptr, pred_stride);
+      break;
+    case 32:
+      subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                               src_stride, pred_ptr, pred_stride);
+      break;
+    case 64:
+      subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                               src_stride, pred_ptr, pred_stride);
+      break;
+    case 128:
+      subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                                src_stride, pred_ptr, pred_stride);
+      break;
+    default:
+      aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+                              src_stride, pred_ptr,
pred_stride); + break; + } +} diff --git a/libs/libaom/src/aom_dsp/x86/subtract_sse2.asm b/libs/libaom/src/aom_dsp/x86/subtract_sse2.asm new file mode 100644 index 000000000..1a75a234f --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/subtract_sse2.asm @@ -0,0 +1,146 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; void aom_subtract_block(int rows, int cols, +; int16_t *diff, ptrdiff_t diff_stride, +; const uint8_t *src, ptrdiff_t src_stride, +; const uint8_t *pred, ptrdiff_t pred_stride) + +INIT_XMM sse2 +cglobal subtract_block, 7, 7, 8, \ + rows, cols, diff, diff_stride, src, src_stride, \ + pred, pred_stride +%define pred_str colsq + pxor m7, m7 ; dedicated zero register + cmp colsd, 4 + je .case_4 + cmp colsd, 8 + je .case_8 + cmp colsd, 16 + je .case_16 + cmp colsd, 32 + je .case_32 + cmp colsd, 64 + je .case_64 + +%macro loop16 6 + mova m0, [srcq+%1] + mova m4, [srcq+%2] + mova m1, [predq+%3] + mova m5, [predq+%4] + punpckhbw m2, m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m0, m7 + punpcklbw m1, m7 + psubw m2, m3 + psubw m0, m1 + punpckhbw m1, m4, m7 + punpckhbw m3, m5, m7 + punpcklbw m4, m7 + punpcklbw m5, m7 + psubw m1, m3 + psubw m4, m5 + mova [diffq+mmsize*0+%5], m0 + mova [diffq+mmsize*1+%5], m2 + mova [diffq+mmsize*0+%6], m4 + mova [diffq+mmsize*1+%6], m1 +%endmacro + + mov pred_str, pred_stridemp +.loop_128: + loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize + loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize + loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize, 8*mmsize, 10*mmsize + loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + sub rowsd, 1 + jnz .loop_128 + RET + +.case_64: + mov pred_str, pred_stridemp +.loop_64: + loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize + loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_64 + RET + +.case_32: + mov pred_str, pred_stridemp +.loop_32: + loop16 0, mmsize, 0, mmsize, 0, 2*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_32 + RET + +.case_16: + mov pred_str, pred_stridemp +.loop_16: + loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2 + lea diffq, [diffq+diff_strideq*4] + lea predq, [predq+pred_str*2] + lea srcq, [srcq+src_strideq*2] + sub rowsd, 2 + jg .loop_16 + RET + +%macro loop_h 0 + movh m0, [srcq] + movh m2, [srcq+src_strideq] + movh m1, [predq] + movh m3, [predq+pred_str] + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + psubw m0, m1 + psubw m2, m3 + mova [diffq], m0 + mova [diffq+diff_strideq*2], m2 +%endmacro + +.case_8: + mov pred_str, pred_stridemp +.loop_8: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_8 + RET + +INIT_MMX +.case_4: + mov 
pred_str, pred_stridemp
+.loop_4:
+  loop_h
+  lea diffq, [diffq+diff_strideq*4]
+  lea srcq, [srcq+src_strideq*2]
+  lea predq, [predq+pred_str*2]
+  sub rowsd, 2
+  jg .loop_4
+  RET
diff --git a/libs/libaom/src/aom_dsp/x86/sum_squares_avx2.c b/libs/libaom/src/aom_dsp/x86/sum_squares_avx2.c
new file mode 100644
index 000000000..97d78b684
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/sum_squares_avx2.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+
+static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
+                                                int width, int height) {
+  uint64_t result;
+  __m256i v_acc_q = _mm256_setzero_si256();
+  const __m256i v_zext_mask_q = yy_set1_64_from_32i(0xffffffff);
+  for (int col = 0; col < height; col += 4) {
+    __m256i v_acc_d = _mm256_setzero_si256();
+    for (int row = 0; row < width; row += 16) {
+      const int16_t *tempsrc = src + row;
+      const __m256i v_val_0_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
+      const __m256i v_val_1_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
+      const __m256i v_val_2_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
+      const __m256i v_val_3_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));
+
+      const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
+      const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
+      const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
+      const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
+
+      const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
+      const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
+      const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d);
+
+      v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d);
+    }
+    v_acc_q =
+        _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q));
+    v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32));
+    src += 4 * stride;
+  }
+  __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q);
+  __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1);
+  __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value);
+
+  result_64_2_int = _mm_add_epi64(
+      result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int));
+
+  xx_storel_64(&result, result_64_2_int);
+
+  return result;
+}
+
+uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width,
+                                     int height) {
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
+  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+  } else if (LIKELY(width == 8 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
+  } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
+    return
aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height); + } else { + return aom_sum_squares_2d_i16_c(src, stride, width, height); + } +} + +// Accumulate sum of 16-bit elements in the vector +static AOM_INLINE int32_t mm256_accumulate_epi16(__m256i vec_a) { + __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1); + __m128i vtmp2 = _mm256_castsi256_si128(vec_a); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 8); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 4); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 2); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + return _mm_extract_epi16(vtmp1, 0); +} + +// Accumulate sum of 32-bit elements in the vector +static AOM_INLINE int32_t mm256_accumulate_epi32(__m256i vec_a) { + __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1); + __m128i vtmp2 = _mm256_castsi256_si128(vec_a); + vtmp1 = _mm_add_epi32(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 8); + vtmp1 = _mm_add_epi32(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 4); + vtmp1 = _mm_add_epi32(vtmp1, vtmp2); + return _mm_cvtsi128_si32(vtmp1); +} + +uint64_t aom_var_2d_u8_avx2(uint8_t *src, int src_stride, int width, + int height) { + uint8_t *srcp; + uint64_t s = 0, ss = 0; + __m256i vzero = _mm256_setzero_si256(); + __m256i v_acc_sum = vzero; + __m256i v_acc_sqs = vzero; + int i, j; + + // Process 32 elements in a row + for (i = 0; i < width - 31; i += 32) { + srcp = src + i; + // Process 8 columns at a time + for (j = 0; j < height - 7; j += 8) { + __m256i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm256_loadu_si256((__m256i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc[k], vzero); + __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc[k], vzero); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0); + __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1); + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi16(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp); + __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc, vzero); + __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc, vzero); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0); + __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1); + + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi16(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = src; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint8_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} + +uint64_t aom_var_2d_u16_avx2(uint8_t *src, int src_stride, int width, + int height) { + uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp; + uint64_t s = 0, ss = 0; + __m256i vzero = _mm256_setzero_si256(); + __m256i v_acc_sum = vzero; + __m256i 
v_acc_sqs = vzero; + int i, j; + + // Process 16 elements in a row + for (i = 0; i < width - 15; i += 16) { + srcp = srcp1 + i; + // Process 8 columns at a time + for (j = 0; j < height - 8; j += 8) { + __m256i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm256_loadu_si256((__m256i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc[k], vzero); + __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc[k], vzero); + v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc[k], vsrc[k]); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi32(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp); + __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc, vzero); + __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc, vzero); + v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc, vsrc); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi32(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = srcp1; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint16_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} diff --git a/libs/libaom/src/aom_dsp/x86/sum_squares_sse2.c b/libs/libaom/src/aom_dsp/x86/sum_squares_sse2.c new file mode 100644 index 000000000..85b301a88 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/sum_squares_sse2.c @@ -0,0 +1,366 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE __m128i xx_loadh_64(__m128i a, const void *b) {
+  const __m128d ad = _mm_castsi128_pd(a);
+  return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b));
+}
+
+static INLINE uint64_t xx_cvtsi128_si64(__m128i a) {
+#if ARCH_X86_64
+  return (uint64_t)_mm_cvtsi128_si64(a);
+#else
+  {
+    uint64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, a);
+    return tmp;
+  }
+#endif
+}
+
+static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src,
+                                               int stride) {
+  const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
+  const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
+  const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
+  const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
+  const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
+  const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
+
+  return _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+}
+
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) {
+  const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride);
+  __m128i v_sum_d =
+      _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
+  v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8));
+  return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
+}
+
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+                                         int height) {
+  int r = 0;
+  __m128i v_acc_q = _mm_setzero_si128();
+  do {
+    const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride);
+    v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d);
+    src += stride << 2;
+    r += 4;
+  } while (r < height);
+  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
+  __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
+                                   _mm_and_si128(v_acc_q, v_zext_mask_q));
+  v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
+  return xx_cvtsi128_si64(v_acc_64);
+}
+
+#ifdef __GNUC__
+// This prevents GCC/Clang from inlining this function into
+// aom_sum_squares_2d_i16_sse2, which in turn saves some stack
+// maintenance instructions in the common case of 4x4.
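+// (Inlining the nxn body would otherwise set up its larger stack frame and
+// register spills on every call, including the early-exit 4x4 path.)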
+__attribute__((noinline)) +#endif +uint64_t +aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, + int height) { + int r = 0; + + const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff); + __m128i v_acc_q = _mm_setzero_si128(); + + do { + __m128i v_acc_d = _mm_setzero_si128(); + int c = 0; + do { + const int16_t *b = src + c; + + const __m128i v_val_0_w = xx_load_128(b + 0 * stride); + const __m128i v_val_1_w = xx_load_128(b + 1 * stride); + const __m128i v_val_2_w = xx_load_128(b + 2 * stride); + const __m128i v_val_3_w = xx_load_128(b + 3 * stride); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); + c += 8; + } while (c < width); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); + + src += 4 * stride; + r += 4; + } while (r < height); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + return xx_cvtsi128_si64(v_acc_q); +} + +uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width, + int height) { + // 4 elements per row only requires half an XMM register, so this + // must be a special case, but also note that over 75% of all calls + // are with size == 4, so it is also the common case. + if (LIKELY(width == 4 && height == 4)) { + return aom_sum_squares_2d_i16_4x4_sse2(src, stride); + } else if (LIKELY(width == 4 && (height & 3) == 0)) { + return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height); + } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) { + // Generic case + return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height); + } else { + return aom_sum_squares_2d_i16_c(src, stride, width, height); + } +} + +////////////////////////////////////////////////////////////////////////////// +// 1D version +////////////////////////////////////////////////////////////////////////////// + +static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { + const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff); + __m128i v_acc0_q = _mm_setzero_si128(); + __m128i v_acc1_q = _mm_setzero_si128(); + + const int16_t *const end = src + n; + + assert(n % 64 == 0); + + while (src < end) { + const __m128i v_val_0_w = xx_load_128(src); + const __m128i v_val_1_w = xx_load_128(src + 8); + const __m128i v_val_2_w = xx_load_128(src + 16); + const __m128i v_val_3_w = xx_load_128(src + 24); + const __m128i v_val_4_w = xx_load_128(src + 32); + const __m128i v_val_5_w = xx_load_128(src + 40); + const __m128i v_val_6_w = xx_load_128(src + 48); + const __m128i v_val_7_w = xx_load_128(src + 56); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); + const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); + const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); + const __m128i 
v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); + const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); + + const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d); + + v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q)); + v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32)); + + src += 64; + } + + v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q); + v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); + return xx_cvtsi128_si64(v_acc0_q); +} + +uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) { + if (n % 64 == 0) { + return aom_sum_squares_i16_64n_sse2(src, n); + } else if (n > 64) { + int k = n & ~(64 - 1); + return aom_sum_squares_i16_64n_sse2(src, k) + + aom_sum_squares_i16_c(src + k, n - k); + } else { + return aom_sum_squares_i16_c(src, n); + } +} + +// Accumulate sum of 16-bit elements in the vector +static AOM_INLINE int32_t mm_accumulate_epi16(__m128i vec_a) { + __m128i vtmp = _mm_srli_si128(vec_a, 8); + vec_a = _mm_add_epi16(vec_a, vtmp); + vtmp = _mm_srli_si128(vec_a, 4); + vec_a = _mm_add_epi16(vec_a, vtmp); + vtmp = _mm_srli_si128(vec_a, 2); + vec_a = _mm_add_epi16(vec_a, vtmp); + return _mm_extract_epi16(vec_a, 0); +} + +// Accumulate sum of 32-bit elements in the vector +static AOM_INLINE int32_t mm_accumulate_epi32(__m128i vec_a) { + __m128i vtmp = _mm_srli_si128(vec_a, 8); + vec_a = _mm_add_epi32(vec_a, vtmp); + vtmp = _mm_srli_si128(vec_a, 4); + vec_a = _mm_add_epi32(vec_a, vtmp); + return _mm_cvtsi128_si32(vec_a); +} + +uint64_t aom_var_2d_u8_sse2(uint8_t *src, int src_stride, int width, + int height) { + uint8_t *srcp; + uint64_t s = 0, ss = 0; + __m128i vzero = _mm_setzero_si128(); + __m128i v_acc_sum = vzero; + __m128i v_acc_sqs = vzero; + int i, j; + + // Process 16 elements in a row + for (i = 0; i < width - 15; i += 16) { + srcp = src + i; + // Process 8 columns at a time + for (j = 0; j < height - 7; j += 8) { + __m128i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm_loadu_si128((__m128i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m128i vsrc0 = _mm_unpacklo_epi8(vsrc[k], vzero); + __m128i vsrc1 = _mm_unpackhi_epi8(vsrc[k], vzero); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1); + + __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0); + __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1); + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi16(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m128i vsrc = _mm_loadu_si128((__m128i *)srcp); + __m128i vsrc0 = _mm_unpacklo_epi8(vsrc, vzero); + __m128i vsrc1 = _mm_unpackhi_epi8(vsrc, vzero); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1); + + __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0); + __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1); + + srcp += src_stride; + } 
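+    // Flush the 16-bit lanes one last time: the remainder loop above adds at
+    // most 7 rows (2 bytes per lane per row, <= 14 * 255), so no lane can
+    // have overflowed its int16_t range.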
+ + // Update total sum and clear the vectors + s += mm_accumulate_epi16(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = src; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint8_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} + +uint64_t aom_var_2d_u16_sse2(uint8_t *src, int src_stride, int width, + int height) { + uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp; + uint64_t s = 0, ss = 0; + __m128i vzero = _mm_setzero_si128(); + __m128i v_acc_sum = vzero; + __m128i v_acc_sqs = vzero; + int i, j; + + // Process 8 elements in a row + for (i = 0; i < width - 8; i += 8) { + srcp = srcp1 + i; + // Process 8 columns at a time + for (j = 0; j < height - 8; j += 8) { + __m128i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm_loadu_si128((__m128i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m128i vsrc0 = _mm_unpacklo_epi16(vsrc[k], vzero); + __m128i vsrc1 = _mm_unpackhi_epi16(vsrc[k], vzero); + v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum); + + __m128i vsqs0 = _mm_madd_epi16(vsrc[k], vsrc[k]); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi32(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m128i vsrc = _mm_loadu_si128((__m128i *)srcp); + __m128i vsrc0 = _mm_unpacklo_epi16(vsrc, vzero); + __m128i vsrc1 = _mm_unpackhi_epi16(vsrc, vzero); + v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum); + + __m128i vsqs0 = _mm_madd_epi16(vsrc, vsrc); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi32(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = srcp1; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint16_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} diff --git a/libs/libaom/src/aom_dsp/x86/sum_squares_sse2.h b/libs/libaom/src/aom_dsp/x86/sum_squares_sse2.h new file mode 100644 index 000000000..491e31cc5 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/sum_squares_sse2.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+
+uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride,
+                                         int width, int height);
+
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+                                         int height);
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride);
+
+#endif  // AOM_DSP_X86_SUM_SQUARES_SSE2_H_
diff --git a/libs/libaom/src/aom_dsp/x86/synonyms.h b/libs/libaom/src/aom_dsp/x86/synonyms.h
new file mode 100644
index 000000000..2e99bee3e
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/synonyms.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_
+#define AOM_AOM_DSP_X86_SYNONYMS_H_
+
+#include <immintrin.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+static INLINE __m128i xx_loadl_32(const void *a) {
+  int val;
+  memcpy(&val, a, sizeof(val));
+  return _mm_cvtsi32_si128(val);
+}
+
+static INLINE __m128i xx_loadl_64(const void *a) {
+  return _mm_loadl_epi64((const __m128i *)a);
+}
+
+static INLINE __m128i xx_load_128(const void *a) {
+  return _mm_load_si128((const __m128i *)a);
+}
+
+static INLINE __m128i xx_loadu_128(const void *a) {
+  return _mm_loadu_si128((const __m128i *)a);
+}
+
+static INLINE void xx_storel_32(void *const a, const __m128i v) {
+  const int val = _mm_cvtsi128_si32(v);
+  memcpy(a, &val, sizeof(val));
+}
+
+static INLINE void xx_storel_64(void *const a, const __m128i v) {
+  _mm_storel_epi64((__m128i *)a, v);
+}
+
+static INLINE void xx_store_128(void *const a, const __m128i v) {
+  _mm_store_si128((__m128i *)a, v);
+}
+
+static INLINE void xx_storeu_128(void *const a, const __m128i v) {
+  _mm_storeu_si128((__m128i *)a, v);
+}
+
+// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm_set_epi64x()
+// acting on 32-bit integers.
+static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+  return _mm_set_epi32(0, e1, 0, e0);
+#else
+  return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
+#endif
+}
+
+// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm_set1_epi64x()
+// acting on a 32-bit integer.
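+// For example, xx_set1_64_from_32i(0xffffffff) yields 0x00000000ffffffff in
+// both 64-bit lanes, the zero-extension mask used by the sum-of-squares
+// kernels above.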
+static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+  return _mm_set_epi32(0, a, 0, a);
+#else
+  return _mm_set1_epi64x((uint32_t)a);
+#endif
+}
+
+static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
+  return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
+  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
+  return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+  return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+  return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
+  const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15);
+  const __m128i v_tmp_d =
+      _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d);
+  return _mm_srai_epi16(v_tmp_d, bits);
+}
+
+#endif  // AOM_AOM_DSP_X86_SYNONYMS_H_
diff --git a/libs/libaom/src/aom_dsp/x86/synonyms_avx2.h b/libs/libaom/src/aom_dsp/x86/synonyms_avx2.h
new file mode 100644
index 000000000..4d6ee6ad6
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/synonyms_avx2.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
+#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+static INLINE __m256i yy_load_256(const void *a) {
+  return _mm256_load_si256((const __m256i *)a);
+}
+
+static INLINE __m256i yy_loadu_256(const void *a) {
+  return _mm256_loadu_si256((const __m256i *)a);
+}
+
+static INLINE void yy_store_256(void *const a, const __m256i v) {
+  _mm256_store_si256((__m256i *)a, v);
+}
+
+static INLINE void yy_storeu_256(void *const a, const __m256i v) {
+  _mm256_storeu_si256((__m256i *)a, v);
+}
+
+// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm256_set1_epi64x()
+// acting on a 32-bit integer.
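+// The fallback below is only taken for 32-bit MSVC builds older than
+// VS 2015 (_MSC_VER < 1900); all other configurations use the intrinsic.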
+static INLINE __m256i yy_set1_64_from_32i(int32_t a) {
+#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+  return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a);
+#else
+  return _mm256_set1_epi64x((uint32_t)a);
+#endif
+}
+
+// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
+// therefore define an equivalent function using a different intrinsic.
+// ([ hi ], [ lo ]) -> [ hi ][ lo ]
+static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
+  __m128i mhi = _mm_loadu_si128((__m128i *)(hi));
+  __m128i mlo = _mm_loadu_si128((__m128i *)(lo));
+  return yy_set_m128i(mhi, mlo);
+}
+
+static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
+  _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));
+  _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a));
+}
+
+static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
+  const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
+  return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
+}
+#endif  // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
diff --git a/libs/libaom/src/aom_dsp/x86/transpose_sse2.h b/libs/libaom/src/aom_dsp/x86/transpose_sse2.h
new file mode 100644
index 000000000..7ac692c78
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/transpose_sse2.h
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
+#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+
+static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
+  // Unpack 8 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
+  // to:
+  // a0: 00 10 01 11 02 12 03 13
+  // a1: 20 30 21 31 22 32 23 33
+  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+  // Unpack 16 bit elements resulting in:
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  return _mm_unpacklo_epi16(a0, a1);
+}
+
+static INLINE void transpose_8bit_8x8(const __m128i *const in,
+                                      __m128i *const out) {
+  // Unpack 8 bit elements.
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); + + // Unpack 16 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi16(a0, a1); + const __m128i b1 = _mm_unpackhi_epi16(a0, a1); + const __m128i b2 = _mm_unpacklo_epi16(a2, a3); + const __m128i b3 = _mm_unpackhi_epi16(a2, a3); + + // Unpack 32 bit elements resulting in: + // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const __m128i c0 = _mm_unpacklo_epi32(b0, b2); + const __m128i c1 = _mm_unpackhi_epi32(b0, b2); + const __m128i c2 = _mm_unpacklo_epi32(b1, b3); + const __m128i c3 = _mm_unpackhi_epi32(b1, b3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(c0, c0); + out[1] = _mm_unpackhi_epi64(c0, c0); + out[2] = _mm_unpacklo_epi64(c1, c1); + out[3] = _mm_unpackhi_epi64(c1, c1); + out[4] = _mm_unpacklo_epi64(c2, c2); + out[5] = _mm_unpackhi_epi64(c2, c2); + out[6] = _mm_unpacklo_epi64(c3, c3); + out[7] = _mm_unpackhi_epi64(c3, c3); +} + +static INLINE void transpose_16bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi32(a0, a1); + out[1] = _mm_srli_si128(out[0], 8); + out[2] = _mm_unpackhi_epi32(a0, a1); + out[3] = _mm_srli_si128(out[2], 8); +} + +static INLINE void transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // in[4]: 40 41 42 43 XX XX XX XX + // in[5]: 50 51 52 53 XX XX XX XX + // in[6]: 60 61 62 63 XX XX XX XX + // in[7]: 70 71 72 73 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 02 12 22 32 03 13 23 33 + // b3: 42 52 62 72 43 53 63 73 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpackhi_epi32(a0, a1); + const __m128i b3 = _mm_unpackhi_epi32(a2, a3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b2, b3); + out[3] = _mm_unpackhi_epi64(b2, b3); +} + +static INLINE void transpose_16bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b2: 04 14 24 34 05 15 25 35 + // b4: 02 12 22 32 03 13 23 33 + // b6: 06 16 26 36 07 17 27 37 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 XX XX XX XX + // out[1]: 01 11 21 31 XX XX XX XX + // out[2]: 02 12 22 32 XX XX XX XX + // out[3]: 03 13 23 33 XX XX XX XX + // out[4]: 04 14 24 34 XX XX XX XX + // out[5]: 05 15 25 35 XX XX XX XX + // out[6]: 06 16 26 36 XX XX XX XX + // out[7]: 07 17 27 37 XX XX XX XX + const __m128i zeros = _mm_setzero_si128(); + out[0] = _mm_unpacklo_epi64(b0, zeros); + out[1] = _mm_unpackhi_epi64(b0, zeros); + out[2] = _mm_unpacklo_epi64(b4, zeros); + out[3] = _mm_unpackhi_epi64(b4, zeros); + out[4] = _mm_unpacklo_epi64(b2, zeros); + out[5] = _mm_unpackhi_epi64(b2, zeros); + out[6] = _mm_unpacklo_epi64(b6, zeros); + out[7] = _mm_unpackhi_epi64(b6, zeros); +} + +static INLINE void transpose_16bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + // a6: 44 54 45 55 46 56 47 57 + // a7: 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 04 14 24 34 05 15 25 35 + // b3: 44 54 64 74 45 55 65 75 + // b4: 02 12 22 32 03 13 23 33 + // b5: 42 52 62 72 43 53 63 73 + // b6: 06 16 26 36 07 17 27 37 + // b7: 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b3 = _mm_unpacklo_epi32(a6, a7); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b5 = _mm_unpackhi_epi32(a2, a3); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + const __m128i b7 = _mm_unpackhi_epi32(a6, a7); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b4, b5); + out[3] = _mm_unpackhi_epi64(b4, b5); + out[4] = _mm_unpacklo_epi64(b2, b3); + out[5] = _mm_unpackhi_epi64(b2, b3); + out[6] = _mm_unpacklo_epi64(b6, b7); + out[7] = _mm_unpackhi_epi64(b6, b7); +} + +// Transpose in-place +static INLINE void transpose_16bit_16x16(__m128i *const left, + __m128i *const right) { + __m128i tbuf[8]; + transpose_16bit_8x8(left, left); + transpose_16bit_8x8(right, tbuf); + transpose_16bit_8x8(left + 8, right); + transpose_16bit_8x8(right + 8, right + 8); + + left[8] = tbuf[0]; + left[9] = tbuf[1]; + left[10] = tbuf[2]; + left[11] = tbuf[3]; + left[12] = tbuf[4]; + left[13] = tbuf[5]; + left[14] = tbuf[6]; + left[15] = tbuf[7]; +} + +static INLINE void transpose_32bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); +} + +static INLINE void transpose_32bit_4x4x2(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // in[4]: 04 05 06 07 + // in[5]: 14 15 16 17 + // in[6]: 24 25 26 27 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]); + const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +static INLINE void transpose_32bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 04 05 06 07 + // in[2]: 10 11 12 13 + // in[3]: 14 15 16 17 + // in[4]: 20 21 22 23 + // in[5]: 24 25 26 27 + // in[6]: 30 31 32 33 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]); + const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]); + const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]); + const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]); + const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]); + const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +#endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/libs/libaom/src/aom_dsp/x86/txfm_common_avx2.h b/libs/libaom/src/aom_dsp/x86/txfm_common_avx2.h new file mode 100644 index 000000000..ea57c9f35 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/txfm_common_avx2.h @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+
+#include <immintrin.h>
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) {
+  return _mm256_set1_epi32(
+      (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
+static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1,
+                                   __m256i *in0, __m256i *in1, const __m256i _r,
+                                   const int32_t cos_bit) {
+  __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
+  __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
+  __m256i u0 = _mm256_madd_epi16(t0, w0);
+  __m256i u1 = _mm256_madd_epi16(t1, w0);
+  __m256i v0 = _mm256_madd_epi16(t0, w1);
+  __m256i v1 = _mm256_madd_epi16(t1, w1);
+
+  __m256i a0 = _mm256_add_epi32(u0, _r);
+  __m256i a1 = _mm256_add_epi32(u1, _r);
+  __m256i b0 = _mm256_add_epi32(v0, _r);
+  __m256i b1 = _mm256_add_epi32(v1, _r);
+
+  __m256i c0 = _mm256_srai_epi32(a0, cos_bit);
+  __m256i c1 = _mm256_srai_epi32(a1, cos_bit);
+  __m256i d0 = _mm256_srai_epi32(b0, cos_bit);
+  __m256i d1 = _mm256_srai_epi32(b1, cos_bit);
+
+  *in0 = _mm256_packs_epi32(c0, c1);
+  *in1 = _mm256_packs_epi32(d0, d1);
+}
+
+static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) {
+  const __m256i _in0 = *in0;
+  const __m256i _in1 = *in1;
+  *in0 = _mm256_adds_epi16(_in0, _in1);
+  *in1 = _mm256_subs_epi16(_in0, _in1);
+}
+
+static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) {
+  const __m256i _in0 = *in0;
+  const __m256i _in1 = *in1;
+  *in0 = _mm256_add_epi32(_in0, _in1);
+  *in1 = _mm256_sub_epi32(_in0, _in1);
+}
+
+static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1,
+                                             __m256i in0, __m256i in1) {
+  const __m256i _in0 = in0;
+  const __m256i _in1 = in1;
+  *out0 = _mm256_adds_epi16(_in0, _in1);
+  *out1 = _mm256_subs_epi16(_in0, _in1);
+}
+
+static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1,
+                                           __m256i in0, __m256i in1) {
+  const __m256i _in0 = in0;
+  const __m256i _in1 = in1;
+  *out0 = _mm256_add_epi32(_in0, _in1);
+  *out1 = _mm256_sub_epi32(_in0, _in1);
+}
+
+static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) {
+  return _mm256_load_si256((const __m256i *)a);
+}
+
+static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
+                                                   int stride, __m256i *out,
+                                                   int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = load_16bit_to_16bit_avx2(in + i * stride);
+  }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in,
+                                                        int stride,
+                                                        __m256i *out,
+                                                        int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride);
+  }
+}
+
+static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
+  const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
+  const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
+  return _mm256_permute4x64_epi64(b, 0xD8);
+}
+
+static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
+                                                       int stride, __m256i *out,
+                                                       int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);
+  }
+}
+
+static INLINE void transpose2_8x8_avx2(const __m256i *const in,
+                                       __m256i *const out) {
+  __m256i t[16], u[16];
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 1)   ==>  (0, 1)
+  //   (2, 3)   ==>  (2, 3)
+  //   (4, 5)   ==>  (4, 5)
+  //   (6, 7)   ==>  (6, 7)
+  for (int i = 0; i < 4; i++) {
+    t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
+    t[2 * i +
1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (int i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (int i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1); + +#define LOADR(idx) \ + t[8 + idx] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + idx] = _mm256_inserti128_si256( \ + t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +static INLINE void transpose_16bit_16x8_avx2(const __m256i *const in, + __m256i *const out) { + const __m256i a0 = _mm256_unpacklo_epi16(in[0], in[1]); + const __m256i a1 = _mm256_unpacklo_epi16(in[2], in[3]); + const __m256i a2 = _mm256_unpacklo_epi16(in[4], in[5]); + const __m256i a3 = _mm256_unpacklo_epi16(in[6], in[7]); + const __m256i a4 = _mm256_unpackhi_epi16(in[0], in[1]); + const __m256i a5 = _mm256_unpackhi_epi16(in[2], in[3]); + const __m256i a6 = _mm256_unpackhi_epi16(in[4], in[5]); + const __m256i a7 = _mm256_unpackhi_epi16(in[6], in[7]); + + const __m256i b0 = _mm256_unpacklo_epi32(a0, a1); + const __m256i b1 = _mm256_unpacklo_epi32(a2, a3); + const __m256i b2 = _mm256_unpacklo_epi32(a4, a5); + const __m256i b3 = _mm256_unpacklo_epi32(a6, a7); + const __m256i b4 = _mm256_unpackhi_epi32(a0, a1); + const __m256i b5 = _mm256_unpackhi_epi32(a2, a3); + const __m256i b6 = _mm256_unpackhi_epi32(a4, a5); + const __m256i b7 = _mm256_unpackhi_epi32(a6, a7); + + out[0] = _mm256_unpacklo_epi64(b0, b1); + out[1] = _mm256_unpackhi_epi64(b0, b1); + out[2] = _mm256_unpacklo_epi64(b4, b5); + out[3] = _mm256_unpackhi_epi64(b4, b5); + out[4] = _mm256_unpacklo_epi64(b2, b3); + out[5] = _mm256_unpackhi_epi64(b2, b3); + out[6] = _mm256_unpacklo_epi64(b6, b7); + out[7] = _mm256_unpackhi_epi64(b6, b7); +} + +static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + +static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { + if (bit < 0) { + bit = -bit; + __m256i round = _mm256_set1_epi16(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[i] = _mm256_adds_epi16(in[i], round); + in[i] = _mm256_srai_epi16(in[i], bit); + } + } else if (bit > 0) { + for 
(int i = 0; i < size; ++i) { + in[i] = _mm256_slli_epi16(in[i], bit); + } + } +} + +static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) { + __m256i tmp, round; + round = _mm256_set1_epi32(1 << (bit - 1)); + tmp = _mm256_add_epi32(vec, round); + return _mm256_srai_epi32(tmp, bit); +} + +static INLINE void av1_round_shift_array_32_avx2(__m256i *input, + __m256i *output, + const int size, + const int bit) { + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + output[i] = av1_round_shift_32_avx2(input[i], bit); + } + } else { + int i; + for (i = 0; i < size; i++) { + output[i] = _mm256_slli_epi32(input[i], -bit); + } + } +} + +static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input, + __m256i *output, + const int size, + const int bit, + const int val) { + const __m256i sqrt2 = _mm256_set1_epi32(val); + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + const __m256i r0 = av1_round_shift_32_avx2(input[i], bit); + const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits); + } + } else { + int i; + for (i = 0; i < size; i++) { + const __m256i r0 = _mm256_slli_epi32(input[i], -bit); + const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits); + } + } +} + +static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) { + const __m256i scale_rounding = + pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1)); + const __m256i b = _mm256_madd_epi16(a, scale_rounding); + return _mm256_srai_epi32(b, NewSqrt2Bits); +} + +static INLINE void store_rect_16bit_to_32bit_w8_avx2(const __m256i a, + int32_t *const b) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_lo = _mm256_unpacklo_epi16(a, one); + const __m256i a_hi = _mm256_unpackhi_epi16(a, one); + const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2); + const __m256i temp = _mm256_permute2f128_si256(b_lo, b_hi, 0x31); + _mm_store_si128((__m128i *)b, _mm256_castsi256_si128(b_lo)); + _mm_store_si128((__m128i *)(b + 4), _mm256_castsi256_si128(b_hi)); + _mm256_store_si256((__m256i *)(b + 64), temp); +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w8_avx2( + const __m256i *const in, int32_t *const out, const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_w8_avx2(in[i], out + i * stride); + } +} + +static INLINE void pack_reg(const __m128i *in1, const __m128i *in2, + __m256i *out) { + out[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[0]), in2[0], 0x1); + out[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[1]), in2[1], 0x1); + out[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[2]), in2[2], 0x1); + out[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[3]), in2[3], 0x1); + out[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[4]), in2[4], 0x1); + out[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[5]), in2[5], 0x1); + out[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[6]), in2[6], 0x1); + out[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[7]), in2[7], 0x1); +} + +static INLINE void extract_reg(const __m256i *in, __m128i *out1) { + out1[0] = _mm256_castsi256_si128(in[0]); + out1[1] = _mm256_castsi256_si128(in[1]); + out1[2] = _mm256_castsi256_si128(in[2]); + out1[3] = _mm256_castsi256_si128(in[3]); + out1[4] = _mm256_castsi256_si128(in[4]); + out1[5] = _mm256_castsi256_si128(in[5]); + out1[6] = 
_mm256_castsi256_si128(in[6]); + out1[7] = _mm256_castsi256_si128(in[7]); + + out1[8] = _mm256_extracti128_si256(in[0], 0x01); + out1[9] = _mm256_extracti128_si256(in[1], 0x01); + out1[10] = _mm256_extracti128_si256(in[2], 0x01); + out1[11] = _mm256_extracti128_si256(in[3], 0x01); + out1[12] = _mm256_extracti128_si256(in[4], 0x01); + out1[13] = _mm256_extracti128_si256(in[5], 0x01); + out1[14] = _mm256_extracti128_si256(in[6], 0x01); + out1[15] = _mm256_extracti128_si256(in[7], 0x01); +} + +#ifdef __cplusplus +} +#endif + +#endif  // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ diff --git a/libs/libaom/src/aom_dsp/x86/txfm_common_sse2.h b/libs/libaom/src/aom_dsp/x86/txfm_common_sse2.h new file mode 100644 index 000000000..9c99eb93b --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/txfm_common_sse2.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ +#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ + +#include <emmintrin.h> +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +#define pair_set_epi16(a, b) \ + _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))) + +// Reverse the 8 16 bit words in __m128i +static INLINE __m128i mm_reverse_epi16(const __m128i x) { + const __m128i a = _mm_shufflelo_epi16(x, 0x1b); + const __m128i b = _mm_shufflehi_epi16(a, 0x1b); + return _mm_shuffle_epi32(b, 0x4e); +} + +#define octa_set_epi16(a, b, c, d, e, f, g, h) \ + _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ + (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) + +#endif  // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/libs/libaom/src/aom_dsp/x86/variance_avx2.c b/libs/libaom/src/aom_dsp/x86/variance_avx2.c new file mode 100644 index 000000000..c4919ba9b --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/variance_avx2.c @@ -0,0 +1,526 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/masked_variance_intrin_ssse3.h" + +static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) { + return _mm_add_epi16(_mm256_castsi256_si128(val), + _mm256_extractf128_si256(val, 1)); +} + +static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) { + return _mm_add_epi32(_mm256_castsi256_si128(val), + _mm256_extractf128_si256(val, 1)); +} + +static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i adj_sub = _mm256_set1_epi16((short)0xff01); // (1,-1) + + // unpack into pairs of source and reference values + const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref); + const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref); + + // subtract adjacent elements using src*1 + ref*-1 + const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); + const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); + + // add to the running totals + *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1)); + *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1)); +} + +static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum, + unsigned int *const sse) { + // extract the low lane and add it to the high lane + const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse); + + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + return _mm_extract_epi32(res, 1); +} + +// handle pixels (<= 512) +static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + // extract the low lane and add it to the high lane + const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); + const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64); + return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse); +} + +// handle 1024 pixels (32x32, 16x64, 64x16) +static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + // extract the low lane and add it to the high lane + const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); + const __m128i vsum_64 = + _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128), + _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8))); + return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse); +} + +static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { + const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum)); + const __m256i sum_hi = + _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1)); + return _mm256_add_epi32(sum_lo, sum_hi); +} + +// handle 2048 pixels (32x64, 64x32) +static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + vsum = sum_to_32bit_avx2(vsum); + const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); + return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); +} + +static INLINE void variance16_kernel_avx2( + const uint8_t *const src, const int src_stride, const uint8_t *const ref, + const int 
ref_stride, __m256i *const sse, __m256i *const sum) { + const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride)); + const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride)); + const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); + const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance32_kernel_avx2(const uint8_t *const src, + const uint8_t *const ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i s = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i r = _mm256_loadu_si256((__m256i const *)(ref)); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i += 2) { + variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } +} + +static INLINE void variance32_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src, ref, vsse, vsum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance64_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); + variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance128_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); + variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); + variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum); + variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum); + src += src_stride; + ref += ref_stride; + } +} + +#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel) \ + unsigned int aom_variance##bw##x##bh##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m256i vsse = _mm256_setzero_si256(); \ + __m256i vsum; \ + variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \ + const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512); +AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512); +AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512); +AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512); +AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024); + +AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512); +AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512); +AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024); +AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048); + +AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024); +AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048); + +#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh) \ + 
unsigned int aom_variance##bw##x##bh##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m256i vsse = _mm256_setzero_si256(); \ + __m256i vsum = _mm256_setzero_si256(); \ + for (int i = 0; i < (bh / uh); i++) { \ + __m256i vsum16; \ + variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse, \ + &vsum16); \ + vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); \ + src += uh * src_stride; \ + ref += uh * ref_stride; \ + } \ + const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); \ + const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); \ + return *sse - (unsigned int)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_LOOP_AVX2(64, 64, 12, 32); // 64x32 * ( 64/32) +AOM_VAR_LOOP_AVX2(64, 128, 13, 32); // 64x32 * (128/32) +AOM_VAR_LOOP_AVX2(128, 64, 13, 16); // 128x16 * ( 64/16) +AOM_VAR_LOOP_AVX2(128, 128, 14, 16); // 128x16 * (128/16) + +unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse); +unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse); + +unsigned int aom_sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sseptr); + +#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2) \ + unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_variance##wf##xh_avx2( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ + &sse2); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ + } + +AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7); +AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6); +AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7); +AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6); +AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5); +AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6); +AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5); +AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4); +AOM_SUB_PIXEL_VAR_AVX2(16, 64, 16, 4, 6); +AOM_SUB_PIXEL_VAR_AVX2(16, 32, 16, 4, 5); +AOM_SUB_PIXEL_VAR_AVX2(16, 16, 16, 4, 4); +AOM_SUB_PIXEL_VAR_AVX2(16, 8, 16, 4, 3); +AOM_SUB_PIXEL_VAR_AVX2(16, 4, 16, 4, 2); + +#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ + const uint8_t *sec) { \ + /*Avoid overflow in helper by 
capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + const uint8_t *sec_ptr = sec; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ + sec_ptr, w, hf, &sse2); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + sec_ptr += hf * w; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + sec += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ + } + +AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7); +AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4); + +static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) { + const __m256i d = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); + return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); +} + +static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) { + const __m256i d = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); + return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); +} + +static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, + const __m256i a, + uint8_t *comp_pred) { + const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS; + const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits)); + + const __m256i ma = _mm256_sub_epi8(alpha_max, a); + + const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1); + const __m256i aaAL = _mm256_unpacklo_epi8(a, ma); + const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1); + const __m256i aaAH = _mm256_unpackhi_epi8(a, ma); + + const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL); + const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH); + const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset); + const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset); + + const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH); + _mm256_storeu_si256((__m256i *)(comp_pred), roundA); +} + +void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride, + const uint8_t *mask, int mask_stride, + int invert_mask) { + int i = 0; + const uint8_t *src0 = invert_mask ? pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? 
ref_stride : width; + if (width == 8) { + comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1, + mask, mask_stride); + } else if (width == 16) { + do { + const __m256i sA0 = mm256_loadu2(src0 + stride0, src0); + const __m256i sA1 = mm256_loadu2(src1 + stride1, src1); + const __m256i aA = mm256_loadu2(mask + mask_stride, mask); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + const __m256i sB0 = mm256_loadu2(src0 + stride0, src0); + const __m256i sB1 = mm256_loadu2(src1 + stride1, src1); + const __m256i aB = mm256_loadu2(mask + mask_stride, mask); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + // comp_pred's stride == width == 16 + comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred); + comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32); + comp_pred += (16 << 2); + i += 4; + } while (i < height); + } else { // for width == 32 + do { + const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0)); + const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1)); + const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask)); + + const __m256i sB0 = _mm256_lddqu_si256((const __m256i *)(src0 + stride0)); + const __m256i sB1 = _mm256_lddqu_si256((const __m256i *)(src1 + stride1)); + const __m256i aB = + _mm256_lddqu_si256((const __m256i *)(mask + mask_stride)); + + comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred); + comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32); + comp_pred += (32 << 1); + + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + i += 2; + } while (i < height); + } +} + +static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0, + const __m256i s1, + const __m256i a) { + const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i a_inv = _mm256_sub_epi16(alpha_max, a); + + const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1); + const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv); + const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo); + const __m256i pred_l = _mm256_srai_epi32( + _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS); + + const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1); + const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv); + const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi); + const __m256i pred_h = _mm256_srai_epi32( + _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS); + + const __m256i comp = _mm256_packs_epi32(pred_l, pred_h); + + return comp; +} + +void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i = 0; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + const uint16_t *src0 = invert_mask ? pred : ref; + const uint16_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? 
ref_stride : width; + const __m256i zero = _mm256_setzero_si256(); + + if (width == 8) { + do { + const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0); + const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1); + + const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask); + const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8)); + + __m256i m = _mm256_castsi128_si256(m_l); + m = _mm256_insertf128_si256(m, m_h, 1); + const __m256i m_16 = _mm256_unpacklo_epi8(m, zero); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); + + _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp)); + + _mm_storeu_si128((__m128i *)(comp_pred + width), + _mm256_extractf128_si256(comp, 1)); + + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + comp_pred += (width << 1); + i += 2; + } while (i < height); + } else if (width == 16) { + do { + const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0)); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1)); + const __m256i m_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); + + _mm256_storeu_si256((__m256i *)comp_pred, comp); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else if (width == 32) { + do { + const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16)); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16)); + + const __m256i m01_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); + const __m256i m23_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16))); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16); + const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16); + + _mm256_storeu_si256((__m256i *)comp_pred, comp); + _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } +} diff --git a/libs/libaom/src/aom_dsp/x86/variance_impl_avx2.c b/libs/libaom/src/aom_dsp/x86/variance_impl_avx2.c new file mode 100644 index 000000000..f779270ae --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/variance_impl_avx2.c @@ -0,0 +1,814 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h>  // AVX2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" + +/* clang-format off */ +DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, +}; +/* clang-format on */ + +#define FILTER_SRC(filter) \ + /* filter the source */ \ + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ + \ + /* add 8 to source */ \ + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ + \ + /* divide source by 16 */ \ + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + +#define MERGE_WITH_SRC(src_reg, reg) \ + exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ + exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); + +#define LOAD_SRC_DST \ + /* load source and destination */ \ + src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ + dst_reg = _mm256_loadu_si256((__m256i const *)(dst)); + +#define AVG_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ + /* average between current and next stride source */ \ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + +#define MERGE_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ + MERGE_WITH_SRC(src_reg, src_next_reg) + +#define CALC_SUM_SSE_INSIDE_LOOP \ + /* expand each byte to 2 bytes */ \ + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ + /* source - dest */ \ + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ + /* calculate sum */ \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ + /* calculate sse */ \ + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + +// final calculation to sum and sse +#define CALC_SUM_AND_SSE \ + res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ + sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ + sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ + \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ + \ + sse_reg = _mm256_add_epi32(sse_reg, 
sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); + +// Functions related to sub pixel variance width 16 +#define LOAD_SRC_DST_INSERT(src_stride, dst_stride) \ + /* load source and destination of 2 rows and insert*/ \ + src_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \ + _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \ + dst_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \ + _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1); + +#define AVG_NEXT_SRC_INSERT(src_reg, size_stride) \ + src_next_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \ + _mm_loadu_si128((__m128i *)(src + (size_stride << 1))), 1); \ + /* average between current and next stride source */ \ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + +#define MERGE_NEXT_SRC_INSERT(src_reg, size_stride) \ + src_next_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \ + _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1); \ + MERGE_WITH_SRC(src_reg, src_next_reg) + +#define LOAD_SRC_NEXT_BYTE_INSERT \ + /* load source and another source from next row */ \ + src_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \ + _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \ + /* load source and next row source from 1 byte onwards */ \ + src_next_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))), \ + _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1); + +#define LOAD_DST_INSERT \ + dst_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \ + _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1); + +#define LOAD_SRC_MERGE_128BIT(filter) \ + __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \ + __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \ + __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); \ + __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); \ + __m128i filter_128bit = _mm256_castsi256_si128(filter); \ + __m128i pw8_128bit = _mm256_castsi256_si128(pw8); + +#define FILTER_SRC_128BIT(filter) \ + /* filter the source */ \ + src_lo = _mm_maddubs_epi16(src_lo, filter); \ + src_hi = _mm_maddubs_epi16(src_hi, filter); \ + \ + /* add 8 to source */ \ + src_lo = _mm_add_epi16(src_lo, pw8_128bit); \ + src_hi = _mm_add_epi16(src_hi, pw8_128bit); \ + \ + /* divide source by 16 */ \ + src_lo = _mm_srai_epi16(src_lo, 4); \ + src_hi = _mm_srai_epi16(src_hi, 4); + +unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse) { + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if 
(y_offset == 0) { + for (i = 0; i < height; i++) { + LOAD_SRC_DST + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 0 and y_offset = 4 + } else if (y_offset == 4) { + __m256i src_next_reg; + for (i = 0; i < height; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, src_stride) + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; + + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, src_stride) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + } + // x_offset = 4 and y_offset = 0 + } else if (x_offset == 4) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 4 and y_offset = 4 + } else if (y_offset == 4) { + __m256i src_next_reg, src_avg; + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height; i++) { + src_avg = src_reg; + src += src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // average between previous average to current average + src_avg = _mm256_avg_epu8(src_avg, src_reg); + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_avg, zero_reg) + // save current source average + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + // x_offset = 4 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg, src_avg; + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height; i++) { + // save current source average + src_avg = src_reg; + src += src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + MERGE_WITH_SRC(src_avg, src_reg) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = bilin interpolation and y_offset = 4 + } else if (y_offset == 4) { + __m256i filter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; 
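+ // horizontally filter the newly loaded row, then average it with the previous filtered row (the y_offset == 4 vertical step)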
+ LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src_pack = src_reg; + dst += dst_stride; + } + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + MERGE_NEXT_SRC(src_reg, 1) + + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(xfilter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // merge previous pack to current pack source + MERGE_WITH_SRC(src_pack, src_reg) + // filter the source + FILTER_SRC(yfilter) + src_pack = src_reg; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + } + } + CALC_SUM_AND_SSE + _mm256_zeroupper(); + return sum; +} + +unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse) { + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + for (i = 0; i < height; i += 2) { + LOAD_SRC_DST_INSERT(src_stride, dst_stride) + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + // x_offset = 0 and y_offset = 4 + } else if (y_offset == 4) { + __m256i src_next_reg; + for (i = 0; i < height; i += 2) { + LOAD_SRC_DST_INSERT(src_stride, dst_stride) + AVG_NEXT_SRC_INSERT(src_reg, src_stride) + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i += 2) { + LOAD_SRC_DST_INSERT(src_stride, dst_stride) + MERGE_NEXT_SRC_INSERT(src_reg, src_stride) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + } + // x_offset = 4 and y_offset = 0 + } else if (x_offset == 4) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + LOAD_DST_INSERT + /* average between current and next stride source */ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride 
<< 1); + } + // x_offset = 4 and y_offset = 4 + } else if (y_offset == 4) { + __m256i src_next_reg, src_avg, src_temp; + // load and insert source and next row source + LOAD_SRC_NEXT_BYTE_INSERT + src_avg = _mm256_avg_epu8(src_reg, src_next_reg); + src += src_stride << 1; + for (i = 0; i < height - 2; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); + src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); + src_temp = _mm256_avg_epu8(src_avg, src_temp); + LOAD_DST_INSERT + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_temp, zero_reg) + // save current source average + src_avg = src_next_reg; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride << 1; + src += src_stride << 1; + } + // last 2 rows processing happens here + __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); + __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); + src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); + src_next_reg = _mm256_permute2x128_si256( + src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); + LOAD_DST_INSERT + src_avg = _mm256_avg_epu8(src_avg, src_next_reg); + MERGE_WITH_SRC(src_avg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + } else { + // x_offset = 4 and y_offset = bilin interpolation + __m256i filter, pw8, src_next_reg, src_avg, src_temp; + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load and insert source and next row source + LOAD_SRC_NEXT_BYTE_INSERT + src_avg = _mm256_avg_epu8(src_reg, src_next_reg); + src += src_stride << 1; + for (i = 0; i < height - 2; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); + src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); + LOAD_DST_INSERT + MERGE_WITH_SRC(src_avg, src_temp) + // save current source average + src_avg = src_next_reg; + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride << 1; + src += src_stride << 1; + } + // last 2 rows processing happens here + __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); + __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); + src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); + src_next_reg = _mm256_permute2x128_si256( + src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); + LOAD_DST_INSERT + MERGE_WITH_SRC(src_avg, src_next_reg) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i += 2) { + LOAD_SRC_DST_INSERT(src_stride, dst_stride) + MERGE_NEXT_SRC_INSERT(src_reg, 1) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + // x_offset = bilin interpolation and y_offset = 4 + } else if (y_offset == 4) { + __m256i filter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + // load and insert source and next row source + LOAD_SRC_NEXT_BYTE_INSERT + MERGE_WITH_SRC(src_reg, src_next_reg) + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src += src_stride << 1; + for (i = 0; i < height - 2; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + LOAD_DST_INSERT + 
MERGE_WITH_SRC(src_reg, src_next_reg) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_next_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src_pack = src_reg; + src += src_stride << 1; + dst += dst_stride << 1; + } + // last 2 rows processing happens here + LOAD_SRC_MERGE_128BIT(filter) + LOAD_DST_INSERT + FILTER_SRC_128BIT(filter_128bit) + src_reg_0 = _mm_packus_epi16(src_lo, src_hi); + src_next_reg = _mm256_permute2x128_si256( + src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_next_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + } else { + // x_offset = bilin interpolation and y_offset = bilin interpolation + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load and insert source and next row source + LOAD_SRC_NEXT_BYTE_INSERT + MERGE_WITH_SRC(src_reg, src_next_reg) + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src += src_stride << 1; + for (i = 0; i < height - 2; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + LOAD_DST_INSERT + MERGE_WITH_SRC(src_reg, src_next_reg) + FILTER_SRC(xfilter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); + // average between previous pack to the current + MERGE_WITH_SRC(src_pack, src_next_reg) + // filter the source + FILTER_SRC(yfilter) + src_pack = src_reg; + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride << 1; + dst += dst_stride << 1; + } + // last 2 rows processing happens here + LOAD_SRC_MERGE_128BIT(xfilter) + LOAD_DST_INSERT + FILTER_SRC_128BIT(filter_128bit) + src_reg_0 = _mm_packus_epi16(src_lo, src_hi); + src_next_reg = _mm256_permute2x128_si256( + src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); + MERGE_WITH_SRC(src_pack, src_next_reg) + FILTER_SRC(yfilter) + CALC_SUM_SSE_INSIDE_LOOP + } + } + CALC_SUM_AND_SSE + _mm256_zeroupper(); + return sum; +} + +unsigned int aom_sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sse) { + __m256i sec_reg; + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + for (i = 0; i < height; i++) { + LOAD_SRC_DST + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec += sec_stride; + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + } else if (y_offset == 8) { + __m256i src_next_reg; + for (i = 0; i < height; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, src_stride) + 
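// blend the vertically averaged rows with the second predictor +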
sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec += sec_stride; + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; + + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, src_stride) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec += sec_stride; + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + } + // x_offset = 8 and y_offset = 0 + } else if (x_offset == 8) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec += sec_stride; + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 8 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg, src_avg; + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height; i++) { + // save current source average + src_avg = src_reg; + src += src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // average between previous average to current average + src_avg = _mm256_avg_epu8(src_avg, src_reg); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_avg = _mm256_avg_epu8(src_avg, sec_reg); + sec += sec_stride; + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_avg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + // x_offset = 8 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg, src_avg; + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height; i++) { + // save current source average + src_avg = src_reg; + src += src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + MERGE_WITH_SRC(src_avg, src_reg) + FILTER_SRC(filter) + src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_avg = _mm256_avg_epu8(src_avg, sec_reg); + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_avg, zero_reg) + sec += sec_stride; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + 
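// widen the blended bytes to 16 bits before accumulating sum and SSE +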
MERGE_WITH_SRC(src_reg, zero_reg) + sec += sec_stride; + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = bilin interpolation and y_offset = 8 + } else if (y_offset == 8) { + __m256i filter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_pack = _mm256_avg_epu8(src_pack, sec_reg); + sec += sec_stride; + MERGE_WITH_SRC(src_pack, zero_reg) + src_pack = src_reg; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + MERGE_NEXT_SRC(src_reg, 1) + + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(xfilter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // merge previous pack to current pack source + MERGE_WITH_SRC(src_pack, src_reg) + // filter the source + FILTER_SRC(yfilter) + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_pack = _mm256_avg_epu8(src_pack, sec_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + src_pack = src_reg; + sec += sec_stride; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + } + } + CALC_SUM_AND_SSE + _mm256_zeroupper(); + return sum; +} diff --git a/libs/libaom/src/aom_dsp/x86/variance_impl_ssse3.c b/libs/libaom/src/aom_dsp/x86/variance_impl_ssse3.c new file mode 100644 index 000000000..66b0d7d84 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/variance_impl_ssse3.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
+  // in computation using _mm_maddubs_epi16.
+  // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
+  const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
+  const __m128i r = _mm_set1_epi16(round);
+  const uint8_t f0 = filter[0] >> 1;
+  const uint8_t f1 = filter[1] >> 1;
+  const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
+                                        f0, f1, f0, f1, f0, f1);
+  unsigned int i, j;
+  (void)pixel_step;
+
+  if (output_width >= 8) {
+    for (i = 0; i < output_height; ++i) {
+      for (j = 0; j < output_width; j += 8) {
+        // load source
+        __m128i source_low = xx_loadl_64(a);
+        __m128i source_hi = xx_loadl_64(a + 1);
+
+        // unpack to:
+        // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+        //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+        __m128i source = _mm_unpacklo_epi8(source_low, source_hi);
+
+        // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
+        __m128i res = _mm_maddubs_epi16(source, filters);
+
+        // round
+        res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+        xx_storeu_128(b, res);
+
+        a += 8;
+        b += 8;
+      }
+
+      a += src_pixels_per_line - output_width;
+    }
+  } else {
+    const __m128i shuffle_mask =
+        _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+    for (i = 0; i < output_height; ++i) {
+      // load the source; only the first 5 values are meaningful:
+      // { a[0], a[1], a[2], a[3], a[4], xxxx }
+      __m128i source = xx_loadl_64(a);
+
+      // shuffle; only the first 8 values are useful:
+      // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+      //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+      __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
+      res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+      xx_storel_64(b, res);
+
+      a += src_pixels_per_line;
+      b += output_width;
+    }
+  }
+}
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  const int16_t round = (1 << FILTER_BITS) >> 1;
+  const __m128i r = _mm_set1_epi32(round);
+  const __m128i filters =
+      _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
+                     filter[1], filter[0], filter[1]);
+  const __m128i shuffle_mask =
+      _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+  const __m128i mask =
+      _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; j += 4) {
+      // load source as:
+      // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
+      __m128i source1 = xx_loadl_64(a);
+      __m128i source2 = xx_loadl_64(a + pixel_step);
+      __m128i source = _mm_unpacklo_epi64(source1, source2);
+
+      // shuffle source to:
+      // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
+      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+      // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
+      __m128i res = _mm_madd_epi16(source_shuffle, filters);
+
+      // round
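+      // i.e. res = (res + 64) >> 7 when FILTER_BITS == 7, which removes the
+      // remaining filter scaling with round-to-nearest behaviour.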
+      res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
+
+      // shuffle to collect the low byte of each 32-bit result
+      res = _mm_shuffle_epi8(res, mask);
+
+      xx_storel_32(b, res);
+
+      a += 4;
+      b += 4;
+    }
+
+    a += src_pixels_per_line - output_width;
+  }
+}
diff --git a/libs/libaom/src/aom_dsp/x86/variance_sse2.c b/libs/libaom/src/aom_dsp/x86/variance_sse2.c
new file mode 100644
index 000000000..4e2b5a1aa
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/variance_sse2.c
@@ -0,0 +1,757 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/filter.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/reconinter_enc.h"
+
+unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
+  __m128i vsum = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 32; ++i) {
+    const __m128i v = xx_loadu_128(src);
+    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+    src += 8;
+  }
+
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+  return _mm_cvtsi128_si32(vsum);
+}
+
+static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
+  const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride));
+  const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride));
+  return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
+}
+
+static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) {
+  const __m128i p0 = _mm_loadl_epi64((const __m128i *)p);
+  return _mm_unpacklo_epi8(p0, _mm_setzero_si128());
+}
+
+// Accumulate four 32-bit values in val into one 32-bit value
+static INLINE unsigned int add32x4_sse2(__m128i val) {
+  val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
+  val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
+  return _mm_cvtsi128_si32(val);
+}
+
+// Accumulate eight 16-bit values in sum into four 32-bit values
+static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
+  const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
+  const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
+  return _mm_add_epi32(sum_lo, sum_hi);
+}
+
+static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref,
+                                        __m128i *const sse,
+                                        __m128i *const sum) {
+  const __m128i diff = _mm_sub_epi16(src, ref);
+  *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
+  *sum = _mm_add_epi16(*sum, diff);
+}
+
+// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
+// Slightly faster than variance_final_256_pel_sse2()
+// The diff sum of 128 pixels can still fit in a 16-bit integer
+static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  *sse = add32x4_sse2(vsse);
+
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); +} + +// Can handle 256 pixels' diff sum (such as 16x16) +static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + *sum += (int16_t)_mm_extract_epi16(vsum, 1); +} + +// Can handle 512 pixels' diff sum (such as 16x32 or 32x16) +static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_unpacklo_epi16(vsum, vsum); + vsum = _mm_srai_epi32(vsum, 16); + *sum = add32x4_sse2(vsum); +} + +// Can handle 1024 pixels' diff sum (such as 32x32) +static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = sum_to_32bit_sse2(vsum); + *sum = add32x4_sse2(vsum); +} + +static INLINE void variance4_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 256); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; i += 2) { + const __m128i s = load4x2_sse2(src, src_stride); + const __m128i r = load4x2_sse2(ref, ref_stride); + + variance_kernel_sse2(s, r, sse, sum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } +} + +static INLINE void variance8_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 128); // May overflow for larger height. + *sum = _mm_setzero_si128(); + *sse = _mm_setzero_si128(); + for (int i = 0; i < h; i++) { + const __m128i s = load8_8to16_sse2(src); + const __m128i r = load8_8to16_sse2(ref); + + variance_kernel_sse2(s, r, sse, sum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance16_kernel_sse2(const uint8_t *const src, + const uint8_t *const ref, + __m128i *const sse, + __m128i *const sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i s = _mm_loadu_si128((const __m128i *)src); + const __m128i r = _mm_loadu_si128((const __m128i *)ref); + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + + variance_kernel_sse2(src0, ref0, sse, sum); + variance_kernel_sse2(src1, ref1, sse, sum); +} + +static INLINE void variance16_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 64); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src, ref, sse, sum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance32_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 32); // May overflow for larger height. + // Don't initialize sse here since it's an accumulation. 
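+  // (The AOM_VAR_LOOP_SSE2 callers further below pass the same vsse
+  // accumulator in repeatedly, one strip of rows per call, so only the
+  // per-strip 16-bit row sum is reset here.)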
+ *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src + 0, ref + 0, sse, sum); + variance16_kernel_sse2(src + 16, ref + 16, sse, sum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance64_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 16); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src + 0, ref + 0, sse, sum); + variance16_kernel_sse2(src + 16, ref + 16, sse, sum); + variance16_kernel_sse2(src + 32, ref + 32, sse, sum); + variance16_kernel_sse2(src + 48, ref + 48, sse, sum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance128_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 8); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < 4; ++j) { + const int offset0 = j << 5; + const int offset1 = offset0 + 16; + variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum); + variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum); + } + src += src_stride; + ref += ref_stride; + } +} + +void aom_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + __m128i vsse, vsum; + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, sum); +} + +#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \ + unsigned int aom_variance##bw##x##bh##_sse2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m128i vsse = _mm_setzero_si128(); \ + __m128i vsum; \ + int sum = 0; \ + variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \ + variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum); \ + assert(sum <= 255 * bw * bh); \ + assert(sum >= -255 * bw * bh); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128); +AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128); +AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128); + +AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128); +AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128); +AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128); +AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256); + +AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128); +AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128); +AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256); +AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512); +AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024); + +AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256); +AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512); +AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024); + +#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh) \ + unsigned int aom_variance##bw##x##bh##_sse2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m128i vsse = _mm_setzero_si128(); \ + __m128i vsum = _mm_setzero_si128(); \ + for (int i = 0; i < (bh / uh); ++i) { \ + __m128i vsum16; \ + variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse, \ + &vsum16); \ + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); \ + src += (src_stride * uh); \ + ref += (ref_stride * uh); \ + } \ + *sse = add32x4_sse2(vsse); \ + int sum = add32x4_sse2(vsum); \ + assert(sum <= 255 * bw * bh); \ + assert(sum >= -255 * bw * bh); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> 
bits); \ + } + +AOM_VAR_LOOP_SSE2(32, 64, 11, 32); // 32x32 * ( 64/32 ) + +AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024); +AOM_VAR_LOOP_SSE2(64, 32, 11, 16); // 64x16 * ( 32/16 ) +AOM_VAR_LOOP_SSE2(64, 64, 12, 16); // 64x16 * ( 64/16 ) +AOM_VAR_LOOP_SSE2(64, 128, 13, 16); // 64x16 * ( 128/16 ) + +AOM_VAR_LOOP_SSE2(128, 64, 13, 8); // 128x8 * ( 64/8 ) +AOM_VAR_LOOP_SSE2(128, 128, 14, 8); // 128x8 * ( 128/8 ) + +unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +// The 2 unused parameters are place holders for PIC enabled build. +// These definitions are for functions defined in subpel_variance.asm +#define DECL(w, opt) \ + int aom_sub_pixel_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ + void *unused0, void *unused) +#define DECLS(opt) \ + DECL(4, opt); \ + DECL(8, opt); \ + DECL(16, opt) + +DECLS(sse2); +DECLS(ssse3); +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ + &sse2, NULL, NULL); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } + +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, 
opt, (int32_t), (int32_t)); \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ + FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) + +FNS(sse2); +FNS(ssse3); + +#undef FNS +#undef FN + +// The 2 unused parameters are place holders for PIC enabled build. +#define DECL(w, opt) \ + int aom_sub_pixel_avg_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ + ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + void *unused) +#define DECLS(opt) \ + DECL(4, opt); \ + DECL(8, opt); \ + DECL(16, opt) + +DECLS(sse2); +DECLS(ssse3); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ + const uint8_t *sec) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + const uint8_t *sec_ptr = sec; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ + sec_ptr, w, hf, &sse2, NULL, NULL); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + sec_ptr += hf * w; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + sec += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } + +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ + FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) + +FNS(sse2); +FNS(ssse3); + +#undef FNS +#undef FN + +void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref, int ref_stride, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + 
const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+                                        &inter_pred_params);
+      return;
+    }
+  }
+
+  const InterpFilterParams *filter = av1_get_filter(subpel_search);
+  // TODO(yunqing): The 2-tap case uses 4-tap functions since there is no
+  // SIMD for 2-tap yet.
+  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
+
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    if (width >= 16) {
+      int i;
+      assert(!(width & 15));
+      /*Read 16 pixels one row at a time.*/
+      for (i = 0; i < height; i++) {
+        int j;
+        for (j = 0; j < width; j += 16) {
+          xx_storeu_128(comp_pred, xx_loadu_128(ref));
+          comp_pred += 16;
+          ref += 16;
+        }
+        ref += ref_stride - width;
+      }
+    } else if (width >= 8) {
+      int i;
+      assert(!(width & 7));
+      assert(!(height & 1));
+      /*Read 8 pixels two rows at a time.*/
+      for (i = 0; i < height; i += 2) {
+        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
+        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
+        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
+        comp_pred += 16;
+        ref += 2 * ref_stride;
+      }
+    } else {
+      int i;
+      assert(!(width & 3));
+      assert(!(height & 3));
+      /*Read 4 pixels four rows at a time.*/
+      for (i = 0; i < height; i += 4) {
+        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
+        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
+        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
+        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
+        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
+                                               _mm_unpacklo_epi32(row2, row3));
+        xx_storeu_128(comp_pred, reg);
+        comp_pred += 16;
+        ref += 4 * ref_stride;
+      }
+    }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
+                        width, height);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
+                       width, height);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t,
+                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
+    uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
+                                    ?
temp + (filter_taps >> 1) * MAX_SB_SIZE + : temp; + uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); + int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE, + kernel_x, 16, NULL, -1, width, intermediate_height); + aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1, + kernel_y, 16, width, height); + } +} + +void aom_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search) { + int n; + int i; + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); + /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ + assert(!(width * height & 15)); + n = width * height >> 4; + for (i = 0; i < n; i++) { + __m128i s0 = xx_loadu_128(comp_pred); + __m128i p0 = xx_loadu_128(pred); + xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0)); + comp_pred += 16; + pred += 16; + } +} + +void aom_comp_mask_upsampled_pred_sse2( + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int subpel_search) { + if (subpel_x_q3 | subpel_y_q3) { + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + ref = comp_pred; + ref_stride = width; + } + aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask, + mask_stride, invert_mask); +} + +static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0, + const __m128i s1, + const __m128i a) { + const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i a_inv = _mm_sub_epi16(alpha_max, a); + + const __m128i s_lo = _mm_unpacklo_epi16(s0, s1); + const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv); + const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo); + const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i s_hi = _mm_unpackhi_epi16(s0, s1); + const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv); + const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi); + const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i comp = _mm_packs_epi32(pred_l, pred_h); + + return comp; +} + +void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i = 0; + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + const uint16_t *src0 = invert_mask ? pred : ref; + const uint16_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? 
ref_stride : width;
+  const __m128i zero = _mm_setzero_si128();
+
+  if (width == 8) {
+    do {
+      const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+      const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+      const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask);
+      const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero);
+
+      const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16);
+
+      _mm_storeu_si128((__m128i *)comp_pred, comp);
+
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      comp_pred += width;
+      i += 1;
+    } while (i < height);
+  } else if (width == 16) {
+    do {
+      const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+      const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8));
+      const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+      const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8));
+
+      const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask);
+      const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+      const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+      const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+      const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+      _mm_storeu_si128((__m128i *)comp_pred, comp);
+      _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1);
+
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      comp_pred += width;
+      i += 1;
+    } while (i < height);
+  } else if (width == 32) {
+    do {
+      for (int j = 0; j < 2; j++) {
+        const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16));
+        const __m128i s2 =
+            _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16));
+        const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16));
+        const __m128i s3 =
+            _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16));
+
+        const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16));
+        const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+        const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+        const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+        const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+        _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
+        _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
+      }
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      comp_pred += width;
+      i += 1;
+    } while (i < height);
+  }
+}
diff --git a/libs/libaom/src/aom_mem/aom_mem.c b/libs/libaom/src/aom_mem/aom_mem.c
new file mode 100644
index 000000000..e977b01d7
--- /dev/null
+++ b/libs/libaom/src/aom_mem/aom_mem.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_mem.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "include/aom_mem_intrnl.h"
+#include "aom/aom_integer.h"
+
+#if defined(AOM_MAX_ALLOCABLE_MEMORY)
+// Returns 0 in case of overflow of nmemb * size.
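+// For example, with the default 8 GB cap on 64-bit targets,
+// check_size_argument_overflow(1, 1 << 30) returns 1, while any nmemb * size
+// product that exceeds AOM_MAX_ALLOCABLE_MEMORY or that truncates when cast
+// to size_t returns 0.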
+static int check_size_argument_overflow(uint64_t nmemb, uint64_t size) {
+  const uint64_t total_size = nmemb * size;
+  if (nmemb == 0) return 1;
+  if (size > AOM_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
+  if (total_size != (size_t)total_size) return 0;
+  return 1;
+}
+#endif
+
+static size_t GetAlignedMallocSize(size_t size, size_t align) {
+  return size + align - 1 + ADDRESS_STORAGE_SIZE;
+}
+
+static size_t *GetMallocAddressLocation(void *const mem) {
+  return ((size_t *)mem) - 1;
+}
+
+static void SetActualMallocAddress(void *const mem,
+                                   const void *const malloc_addr) {
+  size_t *const malloc_addr_location = GetMallocAddressLocation(mem);
+  *malloc_addr_location = (size_t)malloc_addr;
+}
+
+static void *GetActualMallocAddress(void *const mem) {
+  const size_t *const malloc_addr_location = GetMallocAddressLocation(mem);
+  return (void *)(*malloc_addr_location);
+}
+
+void *aom_memalign(size_t align, size_t size) {
+  void *x = NULL;
+  const size_t aligned_size = GetAlignedMallocSize(size, align);
+#if defined(AOM_MAX_ALLOCABLE_MEMORY)
+  if (!check_size_argument_overflow(1, aligned_size)) return NULL;
+#endif
+  void *const addr = malloc(aligned_size);
+  if (addr) {
+    x = aom_align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, align);
+    SetActualMallocAddress(x, addr);
+  }
+  return x;
+}
+
+void *aom_malloc(size_t size) { return aom_memalign(DEFAULT_ALIGNMENT, size); }
+
+void *aom_calloc(size_t num, size_t size) {
+  const size_t total_size = num * size;
+  void *const x = aom_malloc(total_size);
+  if (x) memset(x, 0, total_size);
+  return x;
+}
+
+void aom_free(void *memblk) {
+  if (memblk) {
+    void *addr = GetActualMallocAddress(memblk);
+    free(addr);
+  }
+}
+
+void *aom_memset16(void *dest, int val, size_t length) {
+  size_t i;
+  uint16_t *dest16 = (uint16_t *)dest;
+  for (i = 0; i < length; i++) *dest16++ = val;
+  return dest;
+}
diff --git a/libs/libaom/src/aom_mem/aom_mem.cmake b/libs/libaom/src/aom_mem/aom_mem.cmake
new file mode 100644
index 000000000..346588d2d
--- /dev/null
+++ b/libs/libaom/src/aom_mem/aom_mem.cmake
@@ -0,0 +1,29 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_MEM_AOM_MEM_CMAKE_)
+  return()
+endif() # AOM_AOM_MEM_AOM_MEM_CMAKE_
+set(AOM_AOM_MEM_AOM_MEM_CMAKE_ 1)
+
+list(APPEND AOM_MEM_SOURCES "${AOM_ROOT}/aom_mem/aom_mem.c"
+            "${AOM_ROOT}/aom_mem/aom_mem.h"
+            "${AOM_ROOT}/aom_mem/include/aom_mem_intrnl.h")
+
+# Creates the aom_mem build target and makes libaom depend on it. The libaom
+# target must exist before this function is called.
+function(setup_aom_mem_targets)
+  add_library(aom_mem OBJECT ${AOM_MEM_SOURCES})
+  set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_mem PARENT_SCOPE)
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_mem>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_mem>)
+  endif()
+endfunction()
diff --git a/libs/libaom/src/aom_mem/aom_mem.h b/libs/libaom/src/aom_mem/aom_mem.h
new file mode 100644
index 000000000..bc5d8bca3
--- /dev/null
+++ b/libs/libaom/src/aom_mem/aom_mem.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_MEM_AOM_MEM_H_
+#define AOM_AOM_MEM_AOM_MEM_H_
+
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+
+#if defined(__uClinux__)
+#include <lddk.h>
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#ifndef AOM_MAX_ALLOCABLE_MEMORY
+#if SIZE_MAX > (1ULL << 32)
+#define AOM_MAX_ALLOCABLE_MEMORY 8589934592  // 8 GB
+#else
+// For 32-bit targets keep this below INT_MAX to avoid valgrind warnings.
+#define AOM_MAX_ALLOCABLE_MEMORY ((1ULL << 31) - (1 << 16))
+#endif
+#endif
+
+void *aom_memalign(size_t align, size_t size);
+void *aom_malloc(size_t size);
+void *aom_calloc(size_t num, size_t size);
+void aom_free(void *memblk);
+void *aom_memset16(void *dest, int val, size_t length);
+
+/*returns an addr aligned to the byte boundary specified by align*/
+#define aom_align_addr(addr, align) \
+  (void *)(((uintptr_t)(addr) + ((align)-1)) & ~(uintptr_t)((align)-1))
+
+#include <string.h>
+
+#ifdef AOM_MEM_PLTFRM
+#include AOM_MEM_PLTFRM
+#endif
+
+#if CONFIG_DEBUG
+#define AOM_CHECK_MEM_ERROR(error_info, lval, expr)                         \
+  do {                                                                      \
+    lval = (expr);                                                          \
+    if (!lval)                                                              \
+      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,                   \
+                         "Failed to allocate " #lval " at %s:%d", __FILE__, \
+                         __LINE__);                                         \
+  } while (0)
+#else
+#define AOM_CHECK_MEM_ERROR(error_info, lval, expr)       \
+  do {                                                    \
+    lval = (expr);                                        \
+    if (!lval)                                            \
+      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
+                         "Failed to allocate " #lval);    \
+  } while (0)
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif  // AOM_AOM_MEM_AOM_MEM_H_
diff --git a/libs/libaom/src/aom_mem/include/aom_mem_intrnl.h b/libs/libaom/src/aom_mem/include/aom_mem_intrnl.h
new file mode 100644
index 000000000..2c9819de9
--- /dev/null
+++ b/libs/libaom/src/aom_mem/include/aom_mem_intrnl.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
+#define AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
+
+#include "config/aom_config.h"
+
+#define ADDRESS_STORAGE_SIZE sizeof(size_t)
+
+#ifndef DEFAULT_ALIGNMENT
+#if defined(VXWORKS)
+/*default addr alignment to use in calls to aom_* functions other than
+  aom_memalign*/
+#define DEFAULT_ALIGNMENT 32
+#else
+#define DEFAULT_ALIGNMENT (2 * sizeof(void *)) /* NOLINT */
+#endif
+#endif
+
+#endif  // AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
diff --git a/libs/libaom/src/aom_ports/aom_once.h b/libs/libaom/src/aom_ports/aom_once.h
new file mode 100644
index 000000000..d1a031bf1
--- /dev/null
+++ b/libs/libaom/src/aom_ports/aom_once.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_AOM_ONCE_H_
+#define AOM_AOM_PORTS_AOM_ONCE_H_
+
+#include "config/aom_config.h"
+
+/* Implement a function wrapper to guarantee initialization
+ * thread-safety for library singletons.
+ *
+ * NOTE: This function uses static locks, and can only be
+ * used with one common argument per compilation unit. So
+ *
+ * file1.c:
+ *   aom_once(foo);
+ *   ...
+ *   aom_once(foo);
+ *
+ * file2.c:
+ *   aom_once(bar);
+ *
+ * will ensure foo() and bar() are each called only once, but in
+ *
+ * file1.c:
+ *   aom_once(foo);
+ *   aom_once(bar);
+ *
+ * bar() will never be called because the lock is used up
+ * by the call to foo().
+ */
+
+#if CONFIG_MULTITHREAD && defined(_WIN32)
+#include <windows.h>
+/* Declare a per-compilation-unit state variable to track the progress
+ * of calling func() only once. This must be at global scope because
+ * local initializers are not thread-safe in MSVC prior to Visual
+ * Studio 2015.
+ */
+static INIT_ONCE aom_init_once = INIT_ONCE_STATIC_INIT;
+
+static void aom_once(void (*func)(void)) {
+  BOOL pending;
+  InitOnceBeginInitialize(&aom_init_once, 0, &pending, NULL);
+  if (!pending) {
+    // Initialization has already completed.
+    return;
+  }
+  func();
+  InitOnceComplete(&aom_init_once, 0, NULL);
+}
+
+#elif CONFIG_MULTITHREAD && defined(__OS2__)
+#define INCL_DOS
+#include <os2.h>
+static void aom_once(void (*func)(void)) {
+  static int done;
+
+  /* If the initialization is complete, return early. */
+  if (done) return;
+
+  /* Causes all other threads in the process to block themselves
+   * and give up their time slice.
+   */
+  DosEnterCritSec();
+
+  if (!done) {
+    func();
+    done = 1;
+  }
+
+  /* Restores normal thread dispatching for the current process. */
+  DosExitCritSec();
+}
+
+#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
+#include <pthread.h>
+static void aom_once(void (*func)(void)) {
+  static pthread_once_t lock = PTHREAD_ONCE_INIT;
+  pthread_once(&lock, func);
+}
+
+#else
+/* Default version that performs no synchronization. */
+
+static void aom_once(void (*func)(void)) {
+  static int done;
+
+  if (!done) {
+    func();
+    done = 1;
+  }
+}
+#endif
+
+#endif  // AOM_AOM_PORTS_AOM_ONCE_H_
diff --git a/libs/libaom/src/aom_ports/aom_ports.cmake b/libs/libaom/src/aom_ports/aom_ports.cmake
new file mode 100644
index 000000000..d57989654
--- /dev/null
+++ b/libs/libaom/src/aom_ports/aom_ports.cmake
@@ -0,0 +1,92 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_PORTS_AOM_PORTS_CMAKE_)
+  return()
+endif() # AOM_AOM_PORTS_AOM_PORTS_CMAKE_
+set(AOM_AOM_PORTS_AOM_PORTS_CMAKE_ 1)
+
+list(APPEND AOM_PORTS_INCLUDES
+            "${AOM_ROOT}/aom_ports/aom_once.h"
+            "${AOM_ROOT}/aom_ports/aom_timer.h"
+            "${AOM_ROOT}/aom_ports/bitops.h"
+            "${AOM_ROOT}/aom_ports/emmintrin_compat.h"
+            "${AOM_ROOT}/aom_ports/mem.h"
+            "${AOM_ROOT}/aom_ports/mem_ops.h"
+            "${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
+            "${AOM_ROOT}/aom_ports/msvc.h"
+            "${AOM_ROOT}/aom_ports/sanitizer.h"
+            "${AOM_ROOT}/aom_ports/system_state.h")
+
+list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/emms.asm")
+
+list(APPEND AOM_PORTS_INCLUDES_X86 "${AOM_ROOT}/aom_ports/x86_abi_support.asm")
+
+list(APPEND AOM_PORTS_SOURCES_ARM "${AOM_ROOT}/aom_ports/arm.h"
+            "${AOM_ROOT}/aom_ports/arm_cpudetect.c")
+
+list(APPEND AOM_PORTS_SOURCES_PPC "${AOM_ROOT}/aom_ports/ppc.h"
+            "${AOM_ROOT}/aom_ports/ppc_cpudetect.c")
+
+# For arm and x86 targets:
+#
+# * Creates the aom_ports build target, adds the includes in aom_ports to the
+#   target, and makes libaom depend on it.
+#
+# Otherwise:
+#
+# * Adds the includes in aom_ports to the libaom target.
+#
+# For all target platforms:
+#
+# * The libaom target must exist before this function is called.
function(setup_aom_ports_targets)
+  if("${AOM_TARGET_CPU}" MATCHES "^x86")
+    add_asm_library("aom_ports" "AOM_PORTS_ASM_X86")
+    set(aom_ports_has_symbols 1)
+  elseif("${AOM_TARGET_CPU}" MATCHES "arm")
+    add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_ARM})
+    set(aom_ports_has_symbols 1)
+  elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
+    add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_PPC})
+    set(aom_ports_has_symbols 1)
+  endif()
+
+  if("${AOM_TARGET_CPU}" MATCHES "arm|ppc")
+    target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_ports>)
+    if(BUILD_SHARED_LIBS)
+      target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_ports>)
+    endif()
+  endif()
+
+  if(aom_ports_has_symbols)
+    target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES})
+
+    if("${AOM_TARGET_CPU}" STREQUAL "x86"
+       OR "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+      target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES_X86})
+    endif()
+
+    set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
+  else()
+    target_sources(aom PRIVATE ${AOM_PORTS_INCLUDES})
+    if(BUILD_SHARED_LIBS)
+      target_sources(aom_static PRIVATE ${AOM_PORTS_INCLUDES})
+    endif()
+
+    if("${AOM_TARGET_CPU}" STREQUAL "x86"
+       OR "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+      target_sources(aom PRIVATE ${AOM_PORTS_INCLUDES_X86})
+      if(BUILD_SHARED_LIBS)
+        target_sources(aom_static PRIVATE ${AOM_PORTS_INCLUDES_X86})
+      endif()
+    endif()
+  endif()
+endfunction()
diff --git a/libs/libaom/src/aom_ports/aom_timer.h b/libs/libaom/src/aom_ports/aom_timer.h
new file mode 100644
index 000000000..9b17b8983
--- /dev/null
+++ b/libs/libaom/src/aom_ports/aom_timer.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_AOM_TIMER_H_
+#define AOM_AOM_PORTS_AOM_TIMER_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#if CONFIG_OS_SUPPORT
+
+#if defined(_WIN32)
+/*
+ * Win32 specific includes
+ */
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#else
+/*
+ * POSIX specific includes
+ */
+#include <sys/time.h>
+
+/* timersub is not provided by msys at this time. */
+#ifndef timersub
+#define timersub(a, b, result)                       \
+  do {                                               \
+    (result)->tv_sec = (a)->tv_sec - (b)->tv_sec;    \
+    (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
+    if ((result)->tv_usec < 0) {                     \
+      --(result)->tv_sec;                            \
+      (result)->tv_usec += 1000000;                  \
+    }                                                \
+  } while (0)
+#endif
+#endif
+
+struct aom_usec_timer {
+#if defined(_WIN32)
+  LARGE_INTEGER begin, end;
+#else
+  struct timeval begin, end;
+#endif
+};
+
+static INLINE void aom_usec_timer_start(struct aom_usec_timer *t) {
+#if defined(_WIN32)
+  QueryPerformanceCounter(&t->begin);
+#else
+  gettimeofday(&t->begin, NULL);
+#endif
+}
+
+static INLINE void aom_usec_timer_mark(struct aom_usec_timer *t) {
+#if defined(_WIN32)
+  QueryPerformanceCounter(&t->end);
+#else
+  gettimeofday(&t->end, NULL);
+#endif
+}
+
+static INLINE int64_t aom_usec_timer_elapsed(struct aom_usec_timer *t) {
+#if defined(_WIN32)
+  LARGE_INTEGER freq, diff;
+
+  diff.QuadPart = t->end.QuadPart - t->begin.QuadPart;
+
+  QueryPerformanceFrequency(&freq);
+  return diff.QuadPart * 1000000 / freq.QuadPart;
+#else
+  struct timeval diff;
+
+  timersub(&t->end, &t->begin, &diff);
+  return ((int64_t)diff.tv_sec) * 1000000 + diff.tv_usec;
+#endif
+}
+
+#else /* CONFIG_OS_SUPPORT = 0 */
+
+/* Empty timer functions if CONFIG_OS_SUPPORT = 0 */
+#ifndef timersub
+#define timersub(a, b, result)
+#endif
+
+struct aom_usec_timer {
+  void *dummy;
+};
+
+static INLINE void aom_usec_timer_start(struct aom_usec_timer *t) { (void)t; }
+
+static INLINE void aom_usec_timer_mark(struct aom_usec_timer *t) { (void)t; }
+
+static INLINE int aom_usec_timer_elapsed(struct aom_usec_timer *t) {
+  (void)t;
+  return 0;
+}
+
+#endif /* CONFIG_OS_SUPPORT */
+
+#endif  // AOM_AOM_PORTS_AOM_TIMER_H_
diff --git a/libs/libaom/src/aom_ports/arm.h b/libs/libaom/src/aom_ports/arm.h
new file mode 100644
index 000000000..cb1fb9bec
--- /dev/null
+++ b/libs/libaom/src/aom_ports/arm.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_ARM_H_
+#define AOM_AOM_PORTS_ARM_H_
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*ARMv5TE "Enhanced DSP" instructions.*/
+#define HAS_EDSP 0x01
+/*ARMv6 "Parallel" or "Media" instructions.*/
+#define HAS_MEDIA 0x02
+/*ARMv7 optional NEON instructions.*/
+#define HAS_NEON 0x04
+
+int aom_arm_cpu_caps(void);
+
+// Earlier gcc compilers have issues with some neon intrinsics
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 4 && \
+    __GNUC_MINOR__ <= 6
+#define AOM_INCOMPATIBLE_GCC
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_PORTS_ARM_H_
diff --git a/libs/libaom/src/aom_ports/arm_cpudetect.c b/libs/libaom/src/aom_ports/arm_cpudetect.c
new file mode 100644
index 000000000..5a75bb348
--- /dev/null
+++ b/libs/libaom/src/aom_ports/arm_cpudetect.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "aom_ports/arm.h"
+#include "config/aom_config.h"
+
+#ifdef WINAPI_FAMILY
+#include <winapifamily.h>
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define getenv(x) NULL
+#endif
+#endif
+
+static int arm_cpu_env_flags(int *flags) {
+  char *env;
+  env = getenv("AOM_SIMD_CAPS");
+  if (env && *env) {
+    *flags = (int)strtol(env, NULL, 0);
+    return 0;
+  }
+  *flags = 0;
+  return -1;
+}
+
+static int arm_cpu_env_mask(void) {
+  char *env;
+  env = getenv("AOM_SIMD_CAPS_MASK");
+  return env && *env ? (int)strtol(env, NULL, 0) : ~0;
+}
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+int aom_arm_cpu_caps(void) {
+  /* This function should actually be a no-op. There is no way to adjust any of
+   * these because the RTCD tables do not exist: the functions are called
+   * statically */
+  int flags;
+  int mask;
+  if (!arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  mask = arm_cpu_env_mask();
+#if HAVE_NEON
+  flags |= HAS_NEON;
+#endif /* HAVE_NEON */
+  return flags & mask;
+}
+
+#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+#define WIN32_LEAN_AND_MEAN
+#define WIN32_EXTRA_LEAN
+#include <windows.h>
+
+int aom_arm_cpu_caps(void) {
+  int flags;
+  int mask;
+  if (!arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  mask = arm_cpu_env_mask();
+/* MSVC has no inline __asm support for ARM, but it does let you __emit
+ * instructions via their assembled hex code.
+ * All of these instructions should be essentially nops.
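+ * If the CPU does not support an emitted instruction, executing it raises
+ * EXCEPTION_ILLEGAL_INSTRUCTION, which the __except filter below swallows,
+ * leaving the corresponding capability flag unset.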
+ */
+#if HAVE_NEON
+  if (mask & HAS_NEON) {
+    __try {
+      /*VORR q0,q0,q0*/
+      __emit(0xF2200150);
+      flags |= HAS_NEON;
+    } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
+      /*Ignore exception.*/
+    }
+  }
+#endif /* HAVE_NEON */
+  return flags & mask;
+}
+
+#elif defined(__ANDROID__) /* end _MSC_VER */
+#include <cpu-features.h>
+
+int aom_arm_cpu_caps(void) {
+  int flags;
+  int mask;
+  uint64_t features;
+  if (!arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  mask = arm_cpu_env_mask();
+  features = android_getCpuFeatures();
+
+#if HAVE_NEON
+  if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON;
+#endif /* HAVE_NEON */
+  return flags & mask;
+}
+
+#elif defined(__linux__) /* end __ANDROID__ */
+
+#include <stdio.h>
+
+int aom_arm_cpu_caps(void) {
+  FILE *fin;
+  int flags;
+  int mask;
+  if (!arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  mask = arm_cpu_env_mask();
+  /* Reading /proc/self/auxv would be easier, but that doesn't work reliably
+   * on Android.
+   * This also means that detection will fail in Scratchbox.
+   */
+  fin = fopen("/proc/cpuinfo", "r");
+  if (fin != NULL) {
+    /* 512 should be enough for anybody (it's even enough for all the flags
+     * that x86 has accumulated... so far).
+     */
+    char buf[512];
+    while (fgets(buf, 511, fin) != NULL) {
+#if HAVE_NEON
+      if (memcmp(buf, "Features", 8) == 0) {
+        char *p;
+        p = strstr(buf, " neon");
+        if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
+          flags |= HAS_NEON;
+        }
+      }
+#endif /* HAVE_NEON */
+    }
+    fclose(fin);
+  }
+  return flags & mask;
+}
+#else /* end __linux__ */
+#error \
+    "--enable-runtime-cpu-detect selected, but no CPU detection method " \
+"available for your platform. Reconfigure with --disable-runtime-cpu-detect."
+#endif
diff --git a/libs/libaom/src/aom_ports/bitops.h b/libs/libaom/src/aom_ports/bitops.h
new file mode 100644
index 000000000..44df17307
--- /dev/null
+++ b/libs/libaom/src/aom_ports/bitops.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_BITOPS_H_
+#define AOM_AOM_PORTS_BITOPS_H_
+
+#include <assert.h>
+
+#include "aom_ports/msvc.h"
+#include "config/aom_config.h"
+
+#ifdef _MSC_VER
+#if defined(_M_X64) || defined(_M_IX86)
+#include <intrin.h>
+#define USE_MSC_INTRINSICS
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// get_msb:
+// Returns (int)floor(log2(n)). n must be > 0.
+// These versions of get_msb() are only valid when n != 0 because all
+// of the optimized versions are undefined when n == 0:
+// https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
+
+// Use GNU builtins where available.
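+// For example, get_msb(1) == 0, get_msb(32) == 5 and get_msb(35) == 5.
+// In the GNU version below, 31 ^ __builtin_clz(n) equals 31 - clz(n),
+// since clz(n) always lies in [0, 31] for a nonzero 32-bit argument.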
+#if defined(__GNUC__) && \ + ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4) +static INLINE int get_msb(unsigned int n) { + assert(n != 0); + return 31 ^ __builtin_clz(n); +} +#elif defined(USE_MSC_INTRINSICS) +#pragma intrinsic(_BitScanReverse) + +static INLINE int get_msb(unsigned int n) { + unsigned long first_set_bit; + assert(n != 0); + _BitScanReverse(&first_set_bit, n); + return first_set_bit; +} +#undef USE_MSC_INTRINSICS +#else +static INLINE int get_msb(unsigned int n) { + int log = 0; + unsigned int value = n; + int i; + + assert(n != 0); + + for (i = 4; i >= 0; --i) { + const int shift = (1 << i); + const unsigned int x = value >> shift; + if (x != 0) { + value = x; + log += shift; + } + } + return log; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_PORTS_BITOPS_H_ diff --git a/libs/libaom/src/aom_ports/emmintrin_compat.h b/libs/libaom/src/aom_ports/emmintrin_compat.h new file mode 100644 index 000000000..85d218a3d --- /dev/null +++ b/libs/libaom/src/aom_ports/emmintrin_compat.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_ +#define AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_ + +#if defined(__GNUC__) && __GNUC__ < 4 +/* From emmintrin.h (gcc 4.5.3) */ +/* Casts between various SP, DP, INT vector types. Note that these do no + conversion of values, they just change the type. */ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castpd_ps(__m128d __A) { + return (__m128)__A; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castpd_si128(__m128d __A) { + return (__m128i)__A; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castps_pd(__m128 __A) { + return (__m128d)__A; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castps_si128(__m128 __A) { + return (__m128i)__A; +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castsi128_ps(__m128i __A) { + return (__m128)__A; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castsi128_pd(__m128i __A) { + return (__m128d)__A; +} +#endif + +#endif // AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_ diff --git a/libs/libaom/src/aom_ports/emms.asm b/libs/libaom/src/aom_ports/emms.asm new file mode 100644 index 000000000..90776bacb --- /dev/null +++ b/libs/libaom/src/aom_ports/emms.asm @@ -0,0 +1,41 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. 
If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +section .text +global sym(aom_reset_mmx_state) PRIVATE +sym(aom_reset_mmx_state): + emms + ret + + +%if LIBAOM_YASM_WIN64 +global sym(aom_winx64_fldcw) PRIVATE +sym(aom_winx64_fldcw): + sub rsp, 8 + mov [rsp], rcx ; win x64 specific + fldcw [rsp] + add rsp, 8 + ret + + +global sym(aom_winx64_fstcw) PRIVATE +sym(aom_winx64_fstcw): + sub rsp, 8 + fstcw [rsp] + mov rax, [rsp] + add rsp, 8 + ret +%endif diff --git a/libs/libaom/src/aom_ports/mem.h b/libs/libaom/src/aom_ports/mem.h new file mode 100644 index 000000000..9e3d42403 --- /dev/null +++ b/libs/libaom/src/aom_ports/mem.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_MEM_H_ +#define AOM_AOM_PORTS_MEM_H_ + +#include "aom/aom_integer.h" +#include "config/aom_config.h" + +#if (defined(__GNUC__) && __GNUC__) || defined(__SUNPRO_C) +#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n))) +#elif defined(_MSC_VER) +#define DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val +#else +#warning No alignment directives known for this compiler. +#define DECLARE_ALIGNED(n, typ, val) typ val +#endif + +/* Indicates that the usage of the specified variable has been audited to assure + * that it's safe to use uninitialized. Silences 'may be used uninitialized' + * warnings on gcc. + */ +#if defined(__GNUC__) && __GNUC__ +#define UNINITIALIZED_IS_SAFE(x) x = x +#else +#define UNINITIALIZED_IS_SAFE(x) x +#endif + +#if HAVE_NEON && defined(_MSC_VER) +#define __builtin_prefetch(x) +#endif + +/* Shift down with rounding for use when n >= 0, value >= 0 */ +#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n)) + +/* Shift down with rounding for signed integers, for use when n >= 0 */ +#define ROUND_POWER_OF_TWO_SIGNED(value, n) \ + (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \ + : ROUND_POWER_OF_TWO((value), (n))) + +/* Shift down with rounding for use when n >= 0, value >= 0 for (64 bit) */ +#define ROUND_POWER_OF_TWO_64(value, n) \ + (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n)) +/* Shift down with rounding for signed integers, for use when n >= 0 (64 bit) */ +#define ROUND_POWER_OF_TWO_SIGNED_64(value, n) \ + (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \ + : ROUND_POWER_OF_TWO_64((value), (n))) + +/* shift right or left depending on sign of n */ +#define RIGHT_SIGNED_SHIFT(value, n) \ + ((n) < 0 ? 
((value) << (-(n))) : ((value) >> (n))) + +#define ALIGN_POWER_OF_TWO(value, n) \ + (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) + +#define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y)) + +#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1)) +#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1)) + +/*!\brief force enum to be unsigned 1 byte*/ +#define UENUM1BYTE(enumvar) \ + ; \ + typedef uint8_t enumvar + +/*!\brief force enum to be signed 1 byte*/ +#define SENUM1BYTE(enumvar) \ + ; \ + typedef int8_t enumvar + +/*!\brief force enum to be unsigned 2 byte*/ +#define UENUM2BYTE(enumvar) \ + ; \ + typedef uint16_t enumvar + +/*!\brief force enum to be signed 2 byte*/ +#define SENUM2BYTE(enumvar) \ + ; \ + typedef int16_t enumvar + +/*!\brief force enum to be unsigned 4 byte*/ +#define UENUM4BYTE(enumvar) \ + ; \ + typedef uint32_t enumvar + +/*!\brief force enum to be unsigned 4 byte*/ +#define SENUM4BYTE(enumvar) \ + ; \ + typedef int32_t enumvar + +#endif // AOM_AOM_PORTS_MEM_H_ diff --git a/libs/libaom/src/aom_ports/mem_ops.h b/libs/libaom/src/aom_ports/mem_ops.h new file mode 100644 index 000000000..2b5bc0f0f --- /dev/null +++ b/libs/libaom/src/aom_ports/mem_ops.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_MEM_OPS_H_ +#define AOM_AOM_PORTS_MEM_OPS_H_ + +/* \file + * \brief Provides portable memory access primitives + * + * This function provides portable primitives for getting and setting of + * signed and unsigned integers in 16, 24, and 32 bit sizes. The operations + * can be performed on unaligned data regardless of hardware support for + * unaligned accesses. + * + * The type used to pass the integral values may be changed by defining + * MEM_VALUE_T with the appropriate type. The type given must be an integral + * numeric type. + * + * The actual functions instantiated have the MEM_VALUE_T type name pasted + * on to the symbol name. This allows the developer to instantiate these + * operations for multiple types within the same translation unit. This is + * of somewhat questionable utility, but the capability exists nonetheless. + * Users not making use of this functionality should call the functions + * without the type name appended, and the preprocessor will take care of + * it. + * + * NOTE: This code is not supported on platforms where char > 1 octet ATM. 
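+ *
+ * Example (with the default MEM_VALUE_T of int): given unsigned char b[4],
+ * mem_put_be32(b, 0x01020304) stores the bytes 01 02 03 04 in that order,
+ * and mem_get_be32(b) returns 0x01020304 on any host, while mem_get_le32(b)
+ * on the same buffer returns 0x04030201.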
+ */ + +#ifndef MAU_T +/* Minimum Access Unit for this target */ +#define MAU_T unsigned char +#endif + +#ifndef MEM_VALUE_T +#define MEM_VALUE_T int +#endif + +#undef MEM_VALUE_T_SZ_BITS +#define MEM_VALUE_T_SZ_BITS (sizeof(MEM_VALUE_T) << 3) + +#undef mem_ops_wrap_symbol +#define mem_ops_wrap_symbol(fn) mem_ops_wrap_symbol2(fn, MEM_VALUE_T) +#undef mem_ops_wrap_symbol2 +#define mem_ops_wrap_symbol2(fn, typ) mem_ops_wrap_symbol3(fn, typ) +#undef mem_ops_wrap_symbol3 +#define mem_ops_wrap_symbol3(fn, typ) fn##_as_##typ + +/* + * Include aligned access routines + */ +#define INCLUDED_BY_MEM_OPS_H +#include "mem_ops_aligned.h" +#undef INCLUDED_BY_MEM_OPS_H + +#undef mem_get_be16 +#define mem_get_be16 mem_ops_wrap_symbol(mem_get_be16) +static unsigned MEM_VALUE_T mem_get_be16(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[0] << 8; + val |= mem[1]; + return val; +} + +#undef mem_get_be24 +#define mem_get_be24 mem_ops_wrap_symbol(mem_get_be24) +static unsigned MEM_VALUE_T mem_get_be24(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[0] << 16; + val |= mem[1] << 8; + val |= mem[2]; + return val; +} + +#undef mem_get_be32 +#define mem_get_be32 mem_ops_wrap_symbol(mem_get_be32) +static unsigned MEM_VALUE_T mem_get_be32(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = ((unsigned MEM_VALUE_T)mem[0]) << 24; + val |= mem[1] << 16; + val |= mem[2] << 8; + val |= mem[3]; + return val; +} + +#undef mem_get_le16 +#define mem_get_le16 mem_ops_wrap_symbol(mem_get_le16) +static unsigned MEM_VALUE_T mem_get_le16(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[1] << 8; + val |= mem[0]; + return val; +} + +#undef mem_get_le24 +#define mem_get_le24 mem_ops_wrap_symbol(mem_get_le24) +static unsigned MEM_VALUE_T mem_get_le24(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[2] << 16; + val |= mem[1] << 8; + val |= mem[0]; + return val; +} + +#undef mem_get_le32 +#define mem_get_le32 mem_ops_wrap_symbol(mem_get_le32) +static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = ((unsigned MEM_VALUE_T)mem[3]) << 24; + val |= mem[2] << 16; + val |= mem[1] << 8; + val |= mem[0]; + return val; +} + +#define mem_get_s_generic(end, sz) \ + static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) { \ + const MAU_T *mem = (const MAU_T *)vmem; \ + signed MEM_VALUE_T val = mem_get_##end##sz(mem); \ + return (val << (MEM_VALUE_T_SZ_BITS - sz)) >> (MEM_VALUE_T_SZ_BITS - sz); \ + } + +/* clang-format off */ +#undef mem_get_sbe16 +#define mem_get_sbe16 mem_ops_wrap_symbol(mem_get_sbe16) +mem_get_s_generic(be, 16) + +#undef mem_get_sbe24 +#define mem_get_sbe24 mem_ops_wrap_symbol(mem_get_sbe24) +mem_get_s_generic(be, 24) + +#undef mem_get_sbe32 +#define mem_get_sbe32 mem_ops_wrap_symbol(mem_get_sbe32) +mem_get_s_generic(be, 32) + +#undef mem_get_sle16 +#define mem_get_sle16 mem_ops_wrap_symbol(mem_get_sle16) +mem_get_s_generic(le, 16) + +#undef mem_get_sle24 +#define mem_get_sle24 mem_ops_wrap_symbol(mem_get_sle24) +mem_get_s_generic(le, 24) + +#undef mem_get_sle32 +#define mem_get_sle32 mem_ops_wrap_symbol(mem_get_sle32) +mem_get_s_generic(le, 32) + +#undef mem_put_be16 +#define mem_put_be16 mem_ops_wrap_symbol(mem_put_be16) +static AOM_INLINE void mem_put_be16(void 
*vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 8) & 0xff); + mem[1] = (MAU_T)((val >> 0) & 0xff); +} + +#undef mem_put_be24 +#define mem_put_be24 mem_ops_wrap_symbol(mem_put_be24) +static AOM_INLINE void mem_put_be24(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 16) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); + mem[2] = (MAU_T)((val >> 0) & 0xff); +} + +#undef mem_put_be32 +#define mem_put_be32 mem_ops_wrap_symbol(mem_put_be32) +static AOM_INLINE void mem_put_be32(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 24) & 0xff); + mem[1] = (MAU_T)((val >> 16) & 0xff); + mem[2] = (MAU_T)((val >> 8) & 0xff); + mem[3] = (MAU_T)((val >> 0) & 0xff); +} + +#undef mem_put_le16 +#define mem_put_le16 mem_ops_wrap_symbol(mem_put_le16) +static AOM_INLINE void mem_put_le16(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 0) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); +} + +#undef mem_put_le24 +#define mem_put_le24 mem_ops_wrap_symbol(mem_put_le24) +static AOM_INLINE void mem_put_le24(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 0) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); + mem[2] = (MAU_T)((val >> 16) & 0xff); +} + +#undef mem_put_le32 +#define mem_put_le32 mem_ops_wrap_symbol(mem_put_le32) +static AOM_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 0) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); + mem[2] = (MAU_T)((val >> 16) & 0xff); + mem[3] = (MAU_T)((val >> 24) & 0xff); +} +/* clang-format on */ +#endif // AOM_AOM_PORTS_MEM_OPS_H_ diff --git a/libs/libaom/src/aom_ports/mem_ops_aligned.h b/libs/libaom/src/aom_ports/mem_ops_aligned.h new file mode 100644 index 000000000..37c367531 --- /dev/null +++ b/libs/libaom/src/aom_ports/mem_ops_aligned.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_ +#define AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_ + +#include "aom/aom_integer.h" + +/* \file + * \brief Provides portable memory access primitives for operating on aligned + * data + * + * This file is split from mem_ops.h for easier maintenance. See mem_ops.h + * for a more detailed description of these primitives. + */ +#ifndef INCLUDED_BY_MEM_OPS_H +#error Include mem_ops.h, not mem_ops_aligned.h directly. +#endif + +/* Architectures that provide instructions for doing this byte swapping + * could redefine these macros. 
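+ *
+ * For example, swap_endian_16(val, 0x1234) leaves val == 0x3412 and
+ * swap_endian_32(val, 0x01020304) leaves val == 0x04030201; a port could
+ * implement the same contract with a single byte-swap instruction.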
+ */ +#define swap_endian_16(val, raw) \ + do { \ + val = (uint16_t)(((raw >> 8) & 0x00ff) | ((raw << 8) & 0xff00)); \ + } while (0) +#define swap_endian_32(val, raw) \ + do { \ + val = ((raw >> 24) & 0x000000ff) | ((raw >> 8) & 0x0000ff00) | \ + ((raw << 8) & 0x00ff0000) | ((raw << 24) & 0xff000000); \ + } while (0) +#define swap_endian_16_se(val, raw) \ + do { \ + swap_endian_16(val, raw); \ + val = ((val << 16) >> 16); \ + } while (0) +#define swap_endian_32_se(val, raw) swap_endian_32(val, raw) + +#define mem_get_ne_aligned_generic(end, sz) \ + static AOM_INLINE unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \ + const void *vmem) { \ + const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \ + return *mem; \ + } + +#define mem_get_sne_aligned_generic(end, sz) \ + static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \ + const void *vmem) { \ + const int##sz##_t *mem = (const int##sz##_t *)vmem; \ + return *mem; \ + } + +#define mem_get_se_aligned_generic(end, sz) \ + static AOM_INLINE unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \ + const void *vmem) { \ + const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \ + unsigned MEM_VALUE_T val, raw = *mem; \ + swap_endian_##sz(val, raw); \ + return val; \ + } + +#define mem_get_sse_aligned_generic(end, sz) \ + static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \ + const void *vmem) { \ + const int##sz##_t *mem = (const int##sz##_t *)vmem; \ + unsigned MEM_VALUE_T val, raw = *mem; \ + swap_endian_##sz##_se(val, raw); \ + return val; \ + } + +#define mem_put_ne_aligned_generic(end, sz) \ + static AOM_INLINE void mem_put_##end##sz##_aligned(void *vmem, \ + MEM_VALUE_T val) { \ + uint##sz##_t *mem = (uint##sz##_t *)vmem; \ + *mem = (uint##sz##_t)val; \ + } + +#define mem_put_se_aligned_generic(end, sz) \ + static AOM_INLINE void mem_put_##end##sz##_aligned(void *vmem, \ + MEM_VALUE_T val) { \ + uint##sz##_t *mem = (uint##sz##_t *)vmem, raw; \ + swap_endian_##sz(raw, val); \ + *mem = (uint##sz##_t)raw; \ + } + +#include "config/aom_config.h" + +#if CONFIG_BIG_ENDIAN +#define mem_get_be_aligned_generic(sz) mem_get_ne_aligned_generic(be, sz) +#define mem_get_sbe_aligned_generic(sz) mem_get_sne_aligned_generic(be, sz) +#define mem_get_le_aligned_generic(sz) mem_get_se_aligned_generic(le, sz) +#define mem_get_sle_aligned_generic(sz) mem_get_sse_aligned_generic(le, sz) +#define mem_put_be_aligned_generic(sz) mem_put_ne_aligned_generic(be, sz) +#define mem_put_le_aligned_generic(sz) mem_put_se_aligned_generic(le, sz) +#else +#define mem_get_be_aligned_generic(sz) mem_get_se_aligned_generic(be, sz) +#define mem_get_sbe_aligned_generic(sz) mem_get_sse_aligned_generic(be, sz) +#define mem_get_le_aligned_generic(sz) mem_get_ne_aligned_generic(le, sz) +#define mem_get_sle_aligned_generic(sz) mem_get_sne_aligned_generic(le, sz) +#define mem_put_be_aligned_generic(sz) mem_put_se_aligned_generic(be, sz) +#define mem_put_le_aligned_generic(sz) mem_put_ne_aligned_generic(le, sz) +#endif + +/* clang-format off */ +#undef mem_get_be16_aligned +#define mem_get_be16_aligned mem_ops_wrap_symbol(mem_get_be16_aligned) +mem_get_be_aligned_generic(16) + +#undef mem_get_be32_aligned +#define mem_get_be32_aligned mem_ops_wrap_symbol(mem_get_be32_aligned) +mem_get_be_aligned_generic(32) + +#undef mem_get_le16_aligned +#define mem_get_le16_aligned mem_ops_wrap_symbol(mem_get_le16_aligned) +mem_get_le_aligned_generic(16) + +#undef mem_get_le32_aligned +#define mem_get_le32_aligned mem_ops_wrap_symbol(mem_get_le32_aligned) 
+mem_get_le_aligned_generic(32)
+
+#undef mem_get_sbe16_aligned
+#define mem_get_sbe16_aligned mem_ops_wrap_symbol(mem_get_sbe16_aligned)
+mem_get_sbe_aligned_generic(16)
+
+#undef mem_get_sbe32_aligned
+#define mem_get_sbe32_aligned mem_ops_wrap_symbol(mem_get_sbe32_aligned)
+mem_get_sbe_aligned_generic(32)
+
+#undef mem_get_sle16_aligned
+#define mem_get_sle16_aligned mem_ops_wrap_symbol(mem_get_sle16_aligned)
+mem_get_sle_aligned_generic(16)
+
+#undef mem_get_sle32_aligned
+#define mem_get_sle32_aligned mem_ops_wrap_symbol(mem_get_sle32_aligned)
+mem_get_sle_aligned_generic(32)
+
+#undef mem_put_be16_aligned
+#define mem_put_be16_aligned mem_ops_wrap_symbol(mem_put_be16_aligned)
+mem_put_be_aligned_generic(16)
+
+#undef mem_put_be32_aligned
+#define mem_put_be32_aligned mem_ops_wrap_symbol(mem_put_be32_aligned)
+mem_put_be_aligned_generic(32)
+
+#undef mem_put_le16_aligned
+#define mem_put_le16_aligned mem_ops_wrap_symbol(mem_put_le16_aligned)
+mem_put_le_aligned_generic(16)
+
+#undef mem_put_le32_aligned
+#define mem_put_le32_aligned mem_ops_wrap_symbol(mem_put_le32_aligned)
+mem_put_le_aligned_generic(32)
+
+#undef mem_get_ne_aligned_generic
+#undef mem_get_se_aligned_generic
+#undef mem_get_sne_aligned_generic
+#undef mem_get_sse_aligned_generic
+#undef mem_put_ne_aligned_generic
+#undef mem_put_se_aligned_generic
+#undef swap_endian_16
+#undef swap_endian_32
+#undef swap_endian_16_se
+#undef swap_endian_32_se
+/* clang-format on */
+
+#endif  // AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_
diff --git a/libs/libaom/src/aom_ports/msvc.h b/libs/libaom/src/aom_ports/msvc.h
new file mode 100644
index 000000000..e78e605f2
--- /dev/null
+++ b/libs/libaom/src/aom_ports/msvc.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_MSVC_H_
+#define AOM_AOM_PORTS_MSVC_H_
+#ifdef _MSC_VER
+
+#include "config/aom_config.h"
+
+#if _MSC_VER < 1900  // VS2015 provides snprintf
+#define snprintf _snprintf
+#endif  // _MSC_VER < 1900
+
+#if _MSC_VER < 1800  // VS2013 provides round
+#include <math.h>
+static INLINE double round(double x) {
+  if (x < 0)
+    return ceil(x - 0.5);
+  else
+    return floor(x + 0.5);
+}
+
+static INLINE float roundf(float x) {
+  if (x < 0)
+    return (float)ceil(x - 0.5f);
+  else
+    return (float)floor(x + 0.5f);
+}
+
+static INLINE long lroundf(float x) {
+  if (x < 0)
+    return (long)(x - 0.5f);
+  else
+    return (long)(x + 0.5f);
+}
+#endif  // _MSC_VER < 1800
+
+#if HAVE_AVX
+#include <immintrin.h>
+// Note:
+// The _mm256_insert_epi16 intrinsic is available from vs2017.
+// We define this macro for vs2015 and earlier.
The +// intrinsics used here are in vs2015 document: +// https://msdn.microsoft.com/en-us/library/hh977022.aspx +// Input parameters: +// a: __m256i, +// d: int16_t, +// indx: imm8 (0 - 15) +#if _MSC_VER <= 1900 +#define _mm256_insert_epi16(a, d, indx) \ + _mm256_insertf128_si256( \ + a, \ + _mm_insert_epi16(_mm256_extractf128_si256(a, indx >> 3), d, indx % 8), \ + indx >> 3) + +static INLINE int _mm256_extract_epi32(__m256i a, const int i) { + return a.m256i_i32[i & 7]; +} +static INLINE __m256i _mm256_insert_epi32(__m256i a, int b, const int i) { + __m256i c = a; + c.m256i_i32[i & 7] = b; + return c; +} +#endif // _MSC_VER <= 1900 +#endif // HAVE_AVX +#endif // _MSC_VER +#endif // AOM_AOM_PORTS_MSVC_H_ diff --git a/libs/libaom/src/aom_ports/ppc.h b/libs/libaom/src/aom_ports/ppc.h new file mode 100644 index 000000000..3159bda68 --- /dev/null +++ b/libs/libaom/src/aom_ports/ppc.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_PPC_H_ +#define AOM_AOM_PORTS_PPC_H_ +#include + +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define HAS_VSX 0x01 + +int ppc_simd_caps(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_PORTS_PPC_H_ diff --git a/libs/libaom/src/aom_ports/ppc_cpudetect.c b/libs/libaom/src/aom_ports/ppc_cpudetect.c new file mode 100644 index 000000000..ce4d5ae23 --- /dev/null +++ b/libs/libaom/src/aom_ports/ppc_cpudetect.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include +#include + +#include "config/aom_config.h" + +#include "aom_ports/ppc.h" + +#if CONFIG_RUNTIME_CPU_DETECT +static int cpu_env_flags(int *flags) { + char *env; + env = getenv("AOM_SIMD_CAPS"); + if (env && *env) { + *flags = (int)strtol(env, NULL, 0); + return 0; + } + *flags = 0; + return -1; +} + +static int cpu_env_mask(void) { + char *env; + env = getenv("AOM_SIMD_CAPS_MASK"); + return env && *env ? (int)strtol(env, NULL, 0) : ~0; +} + +int ppc_simd_caps(void) { + int flags; + int mask; + int fd; + ssize_t count; + unsigned int i; + uint64_t buf[64]; + + // If AOM_SIMD_CAPS_MASK is set then allow only those capabilities. 
+  if (!cpu_env_flags(&flags)) {
+    return flags;
+  }
+
+  mask = cpu_env_mask();
+
+  fd = open("/proc/self/auxv", O_RDONLY);
+  if (fd < 0) {
+    return 0;
+  }
+
+  while ((count = read(fd, buf, sizeof(buf))) > 0) {
+    for (i = 0; i < (count / sizeof(*buf)); i += 2) {
+      if (buf[i] == AT_HWCAP) {
+#if HAVE_VSX
+        if (buf[i + 1] & PPC_FEATURE_HAS_VSX) {
+          flags |= HAS_VSX;
+        }
+#endif  // HAVE_VSX
+        goto out_close;
+      } else if (buf[i] == AT_NULL) {
+        goto out_close;
+      }
+    }
+  }
+out_close:
+  close(fd);
+  return flags & mask;
+}
+#else
+// If there is no RTCD the function pointers are not used and cannot be
+// changed.
+int ppc_simd_caps(void) { return 0; }
+#endif  // CONFIG_RUNTIME_CPU_DETECT
diff --git a/libs/libaom/src/aom_ports/sanitizer.h b/libs/libaom/src/aom_ports/sanitizer.h
new file mode 100644
index 000000000..1dd8eb4cf
--- /dev/null
+++ b/libs/libaom/src/aom_ports/sanitizer.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_SANITIZER_H_
+#define AOM_AOM_PORTS_SANITIZER_H_
+
+// AddressSanitizer support.
+
+// Define AOM_ADDRESS_SANITIZER if AddressSanitizer is used.
+// Clang.
+#if defined(__has_feature)
+#if __has_feature(address_sanitizer)
+#define AOM_ADDRESS_SANITIZER 1
+#endif
+#endif  // defined(__has_feature)
+// GCC.
+#if defined(__SANITIZE_ADDRESS__)
+#define AOM_ADDRESS_SANITIZER 1
+#endif  // defined(__SANITIZE_ADDRESS__)
+
+// Define the macros for AddressSanitizer manual memory poisoning. See
+// https://github.com/google/sanitizers/wiki/AddressSanitizerManualPoisoning.
+#if defined(AOM_ADDRESS_SANITIZER)
+#include <sanitizer/asan_interface.h>
+#else
+#define ASAN_POISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
+#endif
+
+#endif  // AOM_AOM_PORTS_SANITIZER_H_
diff --git a/libs/libaom/src/aom_ports/system_state.h b/libs/libaom/src/aom_ports/system_state.h
new file mode 100644
index 000000000..6640839d8
--- /dev/null
+++ b/libs/libaom/src/aom_ports/system_state.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_SYSTEM_STATE_H_
+#define AOM_AOM_PORTS_SYSTEM_STATE_H_
+
+#include "config/aom_config.h"
+
+#if ARCH_X86 || ARCH_X86_64
+void aom_reset_mmx_state(void);
+#define aom_clear_system_state() aom_reset_mmx_state()
+#else
+#define aom_clear_system_state()
+#endif  // ARCH_X86 || ARCH_X86_64
+#endif  // AOM_AOM_PORTS_SYSTEM_STATE_H_
diff --git a/libs/libaom/src/aom_ports/x86.h b/libs/libaom/src/aom_ports/x86.h
new file mode 100644
index 000000000..8c1844871
--- /dev/null
+++ b/libs/libaom/src/aom_ports/x86.h
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_X86_H_
+#define AOM_AOM_PORTS_X86_H_
+#include <stdlib.h>
+
+#if defined(_MSC_VER)
+#include <intrin.h> /* For __cpuidex, __rdtsc */
+#endif
+
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+  AOM_CPU_UNKNOWN = -1,
+  AOM_CPU_AMD,
+  AOM_CPU_AMD_OLD,
+  AOM_CPU_CENTAUR,
+  AOM_CPU_CYRIX,
+  AOM_CPU_INTEL,
+  AOM_CPU_NEXGEN,
+  AOM_CPU_NSC,
+  AOM_CPU_RISE,
+  AOM_CPU_SIS,
+  AOM_CPU_TRANSMETA,
+  AOM_CPU_TRANSMETA_OLD,
+  AOM_CPU_UMC,
+  AOM_CPU_VIA,
+
+  AOM_CPU_LAST
+} aom_cpu_t;
+
+#if defined(__GNUC__) && __GNUC__ || defined(__ANDROID__)
+#if ARCH_X86_64
+#define cpuid(func, func2, ax, bx, cx, dx)                      \
+  __asm__ __volatile__("cpuid           \n\t"                   \
+                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
+                       : "a"(func), "c"(func2));
+#else
+#define cpuid(func, func2, ax, bx, cx, dx)     \
+  __asm__ __volatile__(                        \
+      "mov %%ebx, %%edi   \n\t"                \
+      "cpuid              \n\t"                \
+      "xchg %%edi, %%ebx  \n\t"                \
+      : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
+      : "a"(func), "c"(func2));
+#endif
+#elif defined(__SUNPRO_C) || \
+    defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/
+#if ARCH_X86_64
+#define cpuid(func, func2, ax, bx, cx, dx)     \
+  asm volatile(                                \
+      "xchg %rsi, %rbx \n\t"                   \
+      "cpuid \n\t"                             \
+      "movl %ebx, %edi \n\t"                   \
+      "xchg %rsi, %rbx \n\t"                   \
+      : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
+      : "a"(func), "c"(func2));
+#else
+#define cpuid(func, func2, ax, bx, cx, dx)     \
+  asm volatile(                                \
+      "pushl %ebx       \n\t"                  \
+      "cpuid            \n\t"                  \
+      "movl %ebx, %edi  \n\t"                  \
+      "popl %ebx        \n\t"                  \
+      : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
+      : "a"(func), "c"(func2));
+#endif
+#else /* end __SUNPRO__ */
+#if ARCH_X86_64
+#if defined(_MSC_VER) && _MSC_VER > 1500
+#define cpuid(func, func2, a, b, c, d) \
+  do {                                 \
+    int regs[4];                       \
+    __cpuidex(regs, func, func2);      \
+    a = regs[0];                       \
+    b = regs[1];                       \
+    c = regs[2];                       \
+    d = regs[3];                       \
+  } while (0)
+#else
+#define cpuid(func, func2, a, b, c, d) \
+  do {                                 \
+    int regs[4];                       \
+    __cpuid(regs, func);               \
+    a = regs[0];                       \
+    b = regs[1];                       \
+    c = regs[2];                       \
+    d = regs[3];                       \
+  } while (0)
+#endif
+#else
+/* clang-format off */
+#define cpuid(func, func2, a, b, c, d) \
+  __asm mov eax, func                  \
+  __asm mov ecx, func2                 \
+  __asm cpuid                          \
+  __asm mov a, eax                     \
+  __asm mov b, ebx                     \
+  __asm mov c, ecx                     \
+  __asm mov d, edx
+#endif
+/* clang-format on */
+#endif /* end others */
+
+// NaCl has no support for xgetbv or the raw opcode.
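+// xgetbv(0) reads XCR0: bit 1 means the OS saves SSE (XMM) state and bit 2
+// means it saves AVX (YMM) state, which is why x86_simd_caps() below requires
+// (xgetbv() & 0x6) == 0x6 before trusting the AVX bits reported by CPUID.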
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static INLINE uint64_t xgetbv(void) {
+  const uint32_t ecx = 0;
+  uint32_t eax, edx;
+  // Use the raw opcode for xgetbv for compatibility with older toolchains.
+  __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
+                   : "=a"(eax), "=d"(edx)
+                   : "c"(ecx));
+  return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && defined(_MSC_FULL_VER) && \
+    _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static INLINE uint64_t xgetbv(void) {
+  uint32_t eax_, edx_;
+  __asm {
+    xor ecx, ecx  // ecx = 0
+    // Use the raw opcode for xgetbv for compatibility with older toolchains.
+    __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+    mov eax_, eax
+    mov edx_, edx
+  }
+  return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER >= 1700
+#include <winapifamily.h>
+#if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP)
+#define getenv(x) NULL
+#endif
+#endif
+
+#define HAS_MMX 0x01
+#define HAS_SSE 0x02
+#define HAS_SSE2 0x04
+#define HAS_SSE3 0x08
+#define HAS_SSSE3 0x10
+#define HAS_SSE4_1 0x20
+#define HAS_AVX 0x40
+#define HAS_AVX2 0x80
+#define HAS_SSE4_2 0x100
+#ifndef BIT
+#define BIT(n) (1 << n)
+#endif
+
+static INLINE int x86_simd_caps(void) {
+  unsigned int flags = 0;
+  unsigned int mask = ~0;
+  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
+  char *env;
+  (void)reg_ebx;
+
+  /* See if the CPU capabilities are being overridden by the environment */
+  env = getenv("AOM_SIMD_CAPS");
+
+  if (env && *env) return (int)strtol(env, NULL, 0);
+
+  env = getenv("AOM_SIMD_CAPS_MASK");
+
+  if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
+
+  /* Ensure that the CPUID instruction supports extended features */
+  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
+
+  if (max_cpuid_val < 1) return 0;
+
+  /* Get the standard feature flags */
+  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+
+  if (reg_edx & BIT(23)) flags |= HAS_MMX;
+
+  if (reg_edx & BIT(25)) flags |= HAS_SSE; /* aka xmm */
+
+  if (reg_edx & BIT(26)) flags |= HAS_SSE2; /* aka wmt */
+
+  if (reg_ecx & BIT(0)) flags |= HAS_SSE3;
+
+  if (reg_ecx & BIT(9)) flags |= HAS_SSSE3;
+
+  if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
+
+  if (reg_ecx & BIT(20)) flags |= HAS_SSE4_2;
+
+  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+  if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
+    if ((xgetbv() & 0x6) == 0x6) {
+      flags |= HAS_AVX;
+
+      if (max_cpuid_val >= 7) {
+        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
+        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+
+        if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+      }
+    }
+  }
+
+  return flags & mask;
+}
+
+// Fine-Grain Measurement Functions
+//
+// If you are timing a small region of code, access the timestamp counter
+// (TSC) via:
+//
+// unsigned int start = x86_tsc_start();
+// ...
+// unsigned int end = x86_tsc_end();
+// unsigned int diff = end - start;
+//
+// The start/end functions introduce a few more instructions than using
+// x86_readtsc directly, but prevent the CPU's out-of-order execution from
+// affecting the measurement (by having earlier/later instructions be evaluated
+// in the time interval). See the white paper, "How to Benchmark Code
+// Execution Times on Intel® IA-32 and IA-64 Instruction Set Architectures" by
+// Gabriele Paoloni for more information.
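+//
+// A minimal measurement sketch built on these helpers (function_under_test
+// is a placeholder for the code being timed):
+//
+//   unsigned int best = ~0u;
+//   for (int i = 0; i < 1000; ++i) {
+//     const unsigned int start = x86_tsc_start();
+//     function_under_test();
+//     const unsigned int cycles = x86_tsc_end() - start;
+//     if (cycles < best) best = cycles;
+//   }
+//
+// Keeping the minimum over many iterations filters out interrupts and other
+// scheduling noise, and the unsigned subtraction is wraparound-safe.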
+// +// If you are timing a large function (CPU time > a couple of seconds), use +// x86_readtsc64 to read the timestamp counter in a 64-bit integer. The +// out-of-order leakage that can occur is minimal compared to total runtime. +static INLINE unsigned int x86_readtsc(void) { +#if defined(__GNUC__) && __GNUC__ + unsigned int tsc; + __asm__ __volatile__("rdtsc\n\t" : "=a"(tsc) :); + return tsc; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + unsigned int tsc; + asm volatile("rdtsc\n\t" : "=a"(tsc) :); + return tsc; +#else +#if ARCH_X86_64 + return (unsigned int)__rdtsc(); +#else + __asm rdtsc; +#endif +#endif +} +// 64-bit CPU cycle counter +static INLINE uint64_t x86_readtsc64(void) { +#if defined(__GNUC__) && __GNUC__ + uint32_t hi, lo; + __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); + return ((uint64_t)hi << 32) | lo; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + uint_t hi, lo; + asm volatile("rdtsc\n\t" : "=a"(lo), "=d"(hi)); + return ((uint64_t)hi << 32) | lo; +#else +#if ARCH_X86_64 + return (uint64_t)__rdtsc(); +#else + __asm rdtsc; +#endif +#endif +} + +// 32-bit CPU cycle counter with a partial fence against out-of-order execution. +static INLINE unsigned int x86_readtscp(void) { +#if defined(__GNUC__) && __GNUC__ + unsigned int tscp; + __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + unsigned int tscp; + asm volatile("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(_MSC_VER) + unsigned int ui; + return (unsigned int)__rdtscp(&ui); +#else +#if ARCH_X86_64 + return (unsigned int)__rdtscp(); +#else + __asm rdtscp; +#endif +#endif +} + +static INLINE unsigned int x86_tsc_start(void) { + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + return x86_readtsc(); +} + +static INLINE unsigned int x86_tsc_end(void) { + uint32_t v = x86_readtscp(); + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + return v; +} + +#if defined(__GNUC__) && __GNUC__ +#define x86_pause_hint() __asm__ __volatile__("pause \n\t") +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +#define x86_pause_hint() asm volatile("pause \n\t") +#else +#if ARCH_X86_64 +#define x86_pause_hint() _mm_pause(); +#else +#define x86_pause_hint() __asm pause +#endif +#endif + +#if defined(__GNUC__) && __GNUC__ +static void x87_set_control_word(unsigned short mode) { + __asm__ __volatile__("fldcw %0" : : "m"(*&mode)); +} +static unsigned short x87_get_control_word(void) { + unsigned short mode; + __asm__ __volatile__("fstcw %0\n\t" : "=m"(*&mode) :); + return mode; +} +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +static void x87_set_control_word(unsigned short mode) { + asm volatile("fldcw %0" : : "m"(*&mode)); +} +static unsigned short x87_get_control_word(void) { + unsigned short mode; + asm volatile("fstcw %0\n\t" : "=m"(*&mode) :); + return mode; +} +#elif ARCH_X86_64 +/* No fldcw intrinsics on Windows x64, punt to external asm */ +extern void aom_winx64_fldcw(unsigned short mode); +extern unsigned short aom_winx64_fstcw(void); +#define x87_set_control_word aom_winx64_fldcw +#define x87_get_control_word aom_winx64_fstcw +#else +static void x87_set_control_word(unsigned short mode) { + __asm { fldcw mode } +} +static unsigned short x87_get_control_word(void) { + unsigned short mode; + __asm { fstcw mode } + return mode; +} +#endif + +static INLINE unsigned int x87_set_double_precision(void) { + unsigned int mode = 
x87_get_control_word(); + x87_set_control_word((mode & ~0x300) | 0x200); + return mode; +} + +extern void aom_reset_mmx_state(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_PORTS_X86_H_ diff --git a/libs/libaom/src/aom_ports/x86_abi_support.asm b/libs/libaom/src/aom_ports/x86_abi_support.asm new file mode 100644 index 000000000..64489908f --- /dev/null +++ b/libs/libaom/src/aom_ports/x86_abi_support.asm @@ -0,0 +1,402 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "config/aom_config.asm" + +; 32/64 bit compatibility macros +; +; In general, we make the source use 64 bit syntax, then twiddle with it using +; the preprocessor to get the 32 bit syntax on 32 bit platforms. +; +%ifidn __OUTPUT_FORMAT__,elf32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,macho32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,win32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,aout +%define ABI_IS_32BIT 1 +%else +%define ABI_IS_32BIT 0 +%endif + +%if ABI_IS_32BIT +%define rax eax +%define rbx ebx +%define rcx ecx +%define rdx edx +%define rsi esi +%define rdi edi +%define rsp esp +%define rbp ebp +%define movsxd mov +%macro movq 2 + %ifidn %1,eax + movd %1,%2 + %elifidn %2,eax + movd %1,%2 + %elifidn %1,ebx + movd %1,%2 + %elifidn %2,ebx + movd %1,%2 + %elifidn %1,ecx + movd %1,%2 + %elifidn %2,ecx + movd %1,%2 + %elifidn %1,edx + movd %1,%2 + %elifidn %2,edx + movd %1,%2 + %elifidn %1,esi + movd %1,%2 + %elifidn %2,esi + movd %1,%2 + %elifidn %1,edi + movd %1,%2 + %elifidn %2,edi + movd %1,%2 + %elifidn %1,esp + movd %1,%2 + %elifidn %2,esp + movd %1,%2 + %elifidn %1,ebp + movd %1,%2 + %elifidn %2,ebp + movd %1,%2 + %else + movq %1,%2 + %endif +%endmacro +%endif + + +; LIBAOM_YASM_WIN64 +; Set LIBAOM_YASM_WIN64 if output is Windows 64bit so the code will work if x64 +; or win64 is defined on the Yasm command line. +%ifidn __OUTPUT_FORMAT__,win64 +%define LIBAOM_YASM_WIN64 1 +%elifidn __OUTPUT_FORMAT__,x64 +%define LIBAOM_YASM_WIN64 1 +%else +%define LIBAOM_YASM_WIN64 0 +%endif + +; sym() +; Return the proper symbol name for the target ABI. +; +; Certain ABIs, notably MS COFF and Darwin MACH-O, require that symbols +; with C linkage be prefixed with an underscore. +; +%ifidn __OUTPUT_FORMAT__,elf32 +%define sym(x) x +%elifidn __OUTPUT_FORMAT__,elf64 +%define sym(x) x +%elifidn __OUTPUT_FORMAT__,elfx32 +%define sym(x) x +%elif LIBAOM_YASM_WIN64 +%define sym(x) x +%else +%define sym(x) _ %+ x +%endif + +; PRIVATE +; Macro for the attribute to hide a global symbol for the target ABI. +; This is only active if CHROMIUM is defined. +; +; Chromium doesn't like exported global symbols due to symbol clashing with +; plugins among other things. 
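+;
+; For example, with CHROMIUM defined an ELF build expands
+;   global sym(aom_reset_mmx_state) PRIVATE
+; to "global aom_reset_mmx_state:hidden", giving the symbol hidden
+; visibility so it stays out of the shared library's dynamic symbol table.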
+; +; Requires Chromium's patched copy of yasm: +; http://src.chromium.org/viewvc/chrome?view=rev&revision=73761 +; http://www.tortall.net/projects/yasm/ticket/236 +; +%ifdef CHROMIUM + %ifdef __NASM_VER__ + %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14 + ; nasm < 2.14 does not support :private_extern directive + %fatal Must use nasm 2.14 or newer + %endif + %endif + + %ifidn __OUTPUT_FORMAT__,elf32 + %define PRIVATE :hidden + %elifidn __OUTPUT_FORMAT__,elf64 + %define PRIVATE :hidden + %elifidn __OUTPUT_FORMAT__,elfx32 + %define PRIVATE :hidden + %elif LIBAOM_YASM_WIN64 + %define PRIVATE + %else + %define PRIVATE :private_extern + %endif +%else + %define PRIVATE +%endif + +; arg() +; Return the address specification of the given argument +; +%if ABI_IS_32BIT + %define arg(x) [ebp+8+4*x] +%else + ; 64 bit ABI passes arguments in registers. This is a workaround to get up + ; and running quickly. Relies on SHADOW_ARGS_TO_STACK + %if LIBAOM_YASM_WIN64 + %define arg(x) [rbp+16+8*x] + %else + %define arg(x) [rbp-8-8*x] + %endif +%endif + +; REG_SZ_BYTES, REG_SZ_BITS +; Size of a register +%if ABI_IS_32BIT +%define REG_SZ_BYTES 4 +%define REG_SZ_BITS 32 +%else +%define REG_SZ_BYTES 8 +%define REG_SZ_BITS 64 +%endif + + +; ALIGN_STACK +; This macro aligns the stack to the given alignment (in bytes). The stack +; is left such that the previous value of the stack pointer is the first +; argument on the stack (ie, the inverse of this macro is 'pop rsp.') +; This macro uses one temporary register, which is not preserved, and thus +; must be specified as an argument. +%macro ALIGN_STACK 2 + mov %2, rsp + and rsp, -%1 + lea rsp, [rsp - (%1 - REG_SZ_BYTES)] + push %2 +%endmacro + + +; +; The Microsoft assembler tries to impose a certain amount of type safety in +; its register usage. YASM doesn't recognize these directives, so we just +; %define them away to maintain as much compatibility as possible with the +; original inline assembler we're porting from. 
+; +%idefine PTR +%idefine XMMWORD +%idefine MMWORD + +; PIC macros +; +%if ABI_IS_32BIT + %if CONFIG_PIC=1 + %ifidn __OUTPUT_FORMAT__,elf32 + %define WRT_PLT wrt ..plt + %macro GET_GOT 1 + extern _GLOBAL_OFFSET_TABLE_ + push %1 + call %%get_got + %%sub_offset: + jmp %%exitGG + %%get_got: + mov %1, [esp] + add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc + ret + %%exitGG: + %undef GLOBAL + %define GLOBAL(x) x + %1 wrt ..gotoff + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %elifidn __OUTPUT_FORMAT__,macho32 + %macro GET_GOT 1 + push %1 + call %%get_got + %%get_got: + pop %1 + %undef GLOBAL + %define GLOBAL(x) x + %1 - %%get_got + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %endif + %endif + + %ifdef CHROMIUM + %ifidn __OUTPUT_FORMAT__,macho32 + %define HIDDEN_DATA(x) x:private_extern + %else + %define HIDDEN_DATA(x) x + %endif + %else + %define HIDDEN_DATA(x) x + %endif +%else + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) rel x + %ifidn __OUTPUT_FORMAT__,elf64 + %define WRT_PLT wrt ..plt + %define HIDDEN_DATA(x) x:data hidden + %elifidn __OUTPUT_FORMAT__,elfx32 + %define WRT_PLT wrt ..plt + %define HIDDEN_DATA(x) x:data hidden + %elifidn __OUTPUT_FORMAT__,macho64 + %ifdef CHROMIUM + %define HIDDEN_DATA(x) x:private_extern + %else + %define HIDDEN_DATA(x) x + %endif + %else + %define HIDDEN_DATA(x) x + %endif +%endif +%ifnmacro GET_GOT + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) x +%endif +%ifndef RESTORE_GOT +%define RESTORE_GOT +%endif +%ifndef WRT_PLT +%define WRT_PLT +%endif + +%if ABI_IS_32BIT + %macro SHADOW_ARGS_TO_STACK 1 + %endm + %define UNSHADOW_ARGS +%else +%if LIBAOM_YASM_WIN64 + %macro SHADOW_ARGS_TO_STACK 1 ; argc + %if %1 > 0 + mov arg(0),rcx + %endif + %if %1 > 1 + mov arg(1),rdx + %endif + %if %1 > 2 + mov arg(2),r8 + %endif + %if %1 > 3 + mov arg(3),r9 + %endif + %endm +%else + %macro SHADOW_ARGS_TO_STACK 1 ; argc + %if %1 > 0 + push rdi + %endif + %if %1 > 1 + push rsi + %endif + %if %1 > 2 + push rdx + %endif + %if %1 > 3 + push rcx + %endif + %if %1 > 4 + push r8 + %endif + %if %1 > 5 + push r9 + %endif + %if %1 > 6 + %assign i %1-6 + %assign off 16 + %rep i + mov rax,[rbp+off] + push rax + %assign off off+8 + %endrep + %endif + %endm +%endif + %define UNSHADOW_ARGS mov rsp, rbp +%endif + +; Win64 ABI requires that XMM6:XMM15 are callee saved +; SAVE_XMM n, [u] +; store registers 6-n on the stack +; if u is specified, use unaligned movs. +; Win64 ABI requires 16 byte stack alignment, but then pushes an 8 byte return +; value. Typically we follow this up with 'push rbp' - re-aligning the stack - +; but in some cases this is not done and unaligned movs must be used. +%if LIBAOM_YASM_WIN64 +%macro SAVE_XMM 1-2 a + %if %1 < 6 + %error Only xmm registers 6-15 must be preserved + %else + %assign last_xmm %1 + %define movxmm movdq %+ %2 + %assign xmm_stack_space ((last_xmm - 5) * 16) + sub rsp, xmm_stack_space + %assign i 6 + %rep (last_xmm - 5) + movxmm [rsp + ((i - 6) * 16)], xmm %+ i + %assign i i+1 + %endrep + %endif +%endmacro +%macro RESTORE_XMM 0 + %ifndef last_xmm + %error RESTORE_XMM must be paired with SAVE_XMM n + %else + %assign i last_xmm + %rep (last_xmm - 5) + movxmm xmm %+ i, [rsp +((i - 6) * 16)] + %assign i i-1 + %endrep + add rsp, xmm_stack_space + ; there are a couple functions which return from multiple places. 
+    ; otherwise, we could uncomment these:
+    ; %undef last_xmm
+    ; %undef xmm_stack_space
+    ; %undef movxmm
+  %endif
+%endmacro
+%else
+%macro SAVE_XMM 1-2
+%endmacro
+%macro RESTORE_XMM 0
+%endmacro
+%endif
+
+; Name of the rodata section
+;
+; .rodata seems to be an elf-ism, as it doesn't work on OSX.
+;
+%ifidn __OUTPUT_FORMAT__,macho64
+%define SECTION_RODATA section .text
+%elifidn __OUTPUT_FORMAT__,macho32
+%macro SECTION_RODATA 0
+section .text
+%endmacro
+%elifidn __OUTPUT_FORMAT__,aout
+%define SECTION_RODATA section .data
+%else
+%define SECTION_RODATA section .rodata
+%endif
+
+
+; Tell GNU ld that we don't require an executable stack.
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%elifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%elifidn __OUTPUT_FORMAT__,elfx32
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
diff --git a/libs/libaom/src/aom_scale/aom_scale.cmake b/libs/libaom/src/aom_scale/aom_scale.cmake
new file mode 100644
index 000000000..e83299320
--- /dev/null
+++ b/libs/libaom/src/aom_scale/aom_scale.cmake
@@ -0,0 +1,45 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_SCALE_AOM_SCALE_CMAKE_)
+  return()
+endif() # AOM_AOM_SCALE_AOM_SCALE_CMAKE_
+set(AOM_AOM_SCALE_AOM_SCALE_CMAKE_ 1)
+
+list(APPEND AOM_SCALE_SOURCES "${AOM_ROOT}/aom_scale/aom_scale.h"
+            "${AOM_ROOT}/aom_scale/generic/aom_scale.c"
+            "${AOM_ROOT}/aom_scale/generic/gen_scalers.c"
+            "${AOM_ROOT}/aom_scale/generic/yv12config.c"
+            "${AOM_ROOT}/aom_scale/generic/yv12extend.c"
+            "${AOM_ROOT}/aom_scale/yv12config.h")
+
+list(APPEND AOM_SCALE_INTRIN_DSPR2
+            "${AOM_ROOT}/aom_scale/mips/dspr2/yv12extend_dspr2.c")
+
+# Creates the aom_scale build target and makes libaom depend on it. The libaom
+# target must exist before this function is called.
+function(setup_aom_scale_targets)
+  add_library(aom_scale OBJECT ${AOM_SCALE_SOURCES})
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>)
+
+  if(HAVE_DSPR2)
+    add_intrinsics_object_library("" "dspr2" "aom_scale"
+                                  "AOM_SCALE_INTRIN_DSPR2")
+  endif()
+
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_scale>)
+  endif()
+
+  # Pass the new lib targets up to the parent scope instance of
+  # $AOM_LIB_TARGETS.
+  set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_scale PARENT_SCOPE)
+endfunction()
diff --git a/libs/libaom/src/aom_scale/aom_scale.h b/libs/libaom/src/aom_scale/aom_scale.h
new file mode 100644
index 000000000..11812a145
--- /dev/null
+++ b/libs/libaom/src/aom_scale/aom_scale.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_SCALE_AOM_SCALE_H_ +#define AOM_AOM_SCALE_AOM_SCALE_H_ + +#include "aom_scale/yv12config.h" + +extern void aom_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + unsigned char *temp_area, unsigned char temp_height, + unsigned int hscale, unsigned int hratio, + unsigned int vscale, unsigned int vratio, + unsigned int interlaced, const int num_planes); + +#endif // AOM_AOM_SCALE_AOM_SCALE_H_ diff --git a/libs/libaom/src/aom_scale/aom_scale_rtcd.c b/libs/libaom/src/aom_scale/aom_scale_rtcd.c new file mode 100644 index 000000000..a04e053b0 --- /dev/null +++ b/libs/libaom/src/aom_scale/aom_scale_rtcd.c @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "config/aom_config.h" + +#define RTCD_C +#include "config/aom_scale_rtcd.h" + +#include "aom_ports/aom_once.h" + +void aom_scale_rtcd() { aom_once(setup_rtcd_internal); } diff --git a/libs/libaom/src/aom_scale/aom_scale_rtcd.pl b/libs/libaom/src/aom_scale/aom_scale_rtcd.pl new file mode 100644 index 000000000..eef6f16a7 --- /dev/null +++ b/libs/libaom/src/aom_scale/aom_scale_rtcd.pl @@ -0,0 +1,55 @@ +## +## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +sub aom_scale_forward_decls() { +print <> 4); + source += source_step; + dest += dest_step; + } +} + +/**************************************************************************** + * + * ROUTINE : scale1d_2t1_ps + * + * INPUTS : const unsigned char *source : Pointer to data to be scaled. + * int source_step : Number of pixels to step on + * in source. + * unsigned int source_scale : Scale for source (UNUSED). + * unsigned int source_length : Length of source (UNUSED). + * unsigned char *dest : Pointer to output data array. + * int dest_step : Number of pixels to step on + * in destination. + * unsigned int dest_scale : Scale for destination + * (UNUSED). + * unsigned int dest_length : Length of destination. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs 2-to-1 point subsampled scaling. + * + * SPECIAL NOTES : None. 
+ * + ****************************************************************************/ +static void scale1d_2t1_ps(const unsigned char *source, int source_step, + unsigned int source_scale, + unsigned int source_length, unsigned char *dest, + int dest_step, unsigned int dest_scale, + unsigned int dest_length) { + const unsigned char *const dest_end = dest + dest_length * dest_step; + (void)source_length; + (void)source_scale; + (void)dest_scale; + + source_step *= 2; // Every other row. + + while (dest < dest_end) { + *dest = *source; + source += source_step; + dest += dest_step; + } +} +/**************************************************************************** + * + * ROUTINE : scale1d_c + * + * INPUTS : const unsigned char *source : Pointer to data to be scaled. + * int source_step : Number of pixels to step on + * in source. + * unsigned int source_scale : Scale for source. + * unsigned int source_length : Length of source (UNUSED). + * unsigned char *dest : Pointer to output data array. + * int dest_step : Number of pixels to step on + * in destination. + * unsigned int dest_scale : Scale for destination. + * unsigned int dest_length : Length of destination. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs linear interpolation in one dimension. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +static void scale1d_c(const unsigned char *source, int source_step, + unsigned int source_scale, unsigned int source_length, + unsigned char *dest, int dest_step, + unsigned int dest_scale, unsigned int dest_length) { + const unsigned char *const dest_end = dest + dest_length * dest_step; + const unsigned int round_value = dest_scale / 2; + unsigned int left_modifier = dest_scale; + unsigned int right_modifier = 0; + unsigned char left_pixel = source[0]; + unsigned char right_pixel = source[source_step]; + + (void)source_length; + + /* These asserts are needed if there are boundary issues... */ + /* assert ( dest_scale > source_scale );*/ + /* assert ( (source_length - 1) * dest_scale >= (dest_length - 1) * + * source_scale);*/ + + while (dest < dest_end) { + *dest = (unsigned char)((left_modifier * left_pixel + + right_modifier * right_pixel + round_value) / + dest_scale); + + right_modifier += source_scale; + + while (right_modifier > dest_scale) { + right_modifier -= dest_scale; + source += source_step; + left_pixel = source[0]; + right_pixel = source[source_step]; + } + + left_modifier = dest_scale - right_modifier; + } +} + +/**************************************************************************** + * + * ROUTINE : Scale2D + * + * INPUTS : const unsigned char *source : Pointer to data to be + * scaled. + * int source_pitch : Stride of source image. + * unsigned int source_width : Width of input image. + * unsigned int source_height : Height of input image. + * unsigned char *dest : Pointer to output data + * array. + * int dest_pitch : Stride of destination + * image. + * unsigned int dest_width : Width of destination image. + * unsigned int dest_height : Height of destination + * image. + * unsigned char *temp_area : Pointer to temp work area. + * unsigned char temp_area_height : Height of temp work area. + * unsigned int hscale : Horizontal scale factor + * numerator. + * unsigned int hratio : Horizontal scale factor + * denominator. + * unsigned int vscale : Vertical scale factor + * numerator. + * unsigned int vratio : Vertical scale factor + * denominator. 
+ * unsigned int interlaced : Interlace flag. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs 2-tap linear interpolation in two dimensions. + * + * SPECIAL NOTES : Expansion is performed one band at a time to help with + * caching. + * + ****************************************************************************/ +static void Scale2D( + /*const*/ + unsigned char *source, int source_pitch, unsigned int source_width, + unsigned int source_height, unsigned char *dest, int dest_pitch, + unsigned int dest_width, unsigned int dest_height, unsigned char *temp_area, + unsigned char temp_area_height, unsigned int hscale, unsigned int hratio, + unsigned int vscale, unsigned int vratio, unsigned int interlaced) { + unsigned int i, j, k; + unsigned int bands; + unsigned int dest_band_height; + unsigned int source_band_height; + + typedef void (*Scale1D)(const unsigned char *source, int source_step, + unsigned int source_scale, unsigned int source_length, + unsigned char *dest, int dest_step, + unsigned int dest_scale, unsigned int dest_length); + + Scale1D Scale1Dv = scale1d_c; + Scale1D Scale1Dh = scale1d_c; + + void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, + unsigned int) = NULL; + void (*vert_band_scale)(unsigned char *, int, unsigned char *, int, + unsigned int) = NULL; + + int ratio_scalable = 1; + int interpolation = 0; + + unsigned char *source_base; + unsigned char *line_src; + + source_base = (unsigned char *)source; + + if (source_pitch < 0) { + int offset; + + offset = (source_height - 1); + offset *= source_pitch; + + source_base += offset; + } + + /* find out the ratio for each direction */ + switch (hratio * 10 / hscale) { + case 8: + /* 4-5 Scale in Width direction */ + horiz_line_scale = aom_horizontal_line_5_4_scale; + break; + case 6: + /* 3-5 Scale in Width direction */ + horiz_line_scale = aom_horizontal_line_5_3_scale; + break; + case 5: + /* 1-2 Scale in Width direction */ + horiz_line_scale = aom_horizontal_line_2_1_scale; + break; + default: + /* The ratio is not acceptable now */ + /* throw("The ratio is not acceptable for now!"); */ + ratio_scalable = 0; + break; + } + + switch (vratio * 10 / vscale) { + case 8: + /* 4-5 Scale in vertical direction */ + vert_band_scale = aom_vertical_band_5_4_scale; + source_band_height = 5; + dest_band_height = 4; + break; + case 6: + /* 3-5 Scale in vertical direction */ + vert_band_scale = aom_vertical_band_5_3_scale; + source_band_height = 5; + dest_band_height = 3; + break; + case 5: + /* 1-2 Scale in vertical direction */ + + if (interlaced) { + /* if the content is interlaced, point sampling is used */ + vert_band_scale = aom_vertical_band_2_1_scale; + } else { + interpolation = 1; + /* if the content is progressive, interplo */ + vert_band_scale = aom_vertical_band_2_1_scale_i; + } + + source_band_height = 2; + dest_band_height = 1; + break; + default: + /* The ratio is not acceptable now */ + /* throw("The ratio is not acceptable for now!"); */ + ratio_scalable = 0; + break; + } + + if (ratio_scalable) { + if (source_height == dest_height) { + /* for each band of the image */ + for (k = 0; k < dest_height; ++k) { + horiz_line_scale(source, source_width, dest, dest_width); + source += source_pitch; + dest += dest_pitch; + } + + return; + } + + if (interpolation) { + if (source < source_base) source = source_base; + + horiz_line_scale(source, source_width, temp_area, dest_width); + } + + for (k = 0; k < (dest_height + dest_band_height - 1) / dest_band_height; + ++k) { + /* 
+      for (i = 0; i < source_band_height; ++i) {
+        /* Trap case where we could read off the base of the source buffer */
+
+        line_src = source + i * source_pitch;
+
+        if (line_src < source_base) line_src = source_base;
+
+        horiz_line_scale(line_src, source_width,
+                         temp_area + (i + 1) * dest_pitch, dest_width);
+      }
+
+      /* Vertical scaling is in place */
+      vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch,
+                      dest_width);
+
+      if (interpolation)
+        memcpy(temp_area, temp_area + source_band_height * dest_pitch,
+               dest_width);
+
+      /* Next band... */
+      source += (unsigned long)source_band_height * source_pitch;
+      dest += (unsigned long)dest_band_height * dest_pitch;
+    }
+
+    return;
+  }
+
+  if (hscale == 2 && hratio == 1) Scale1Dh = scale1d_2t1_ps;
+
+  if (vscale == 2 && vratio == 1) {
+    if (interlaced)
+      Scale1Dv = scale1d_2t1_ps;
+    else
+      Scale1Dv = scale1d_2t1_i;
+  }
+
+  if (source_height == dest_height) {
+    /* for each band of the image */
+    for (k = 0; k < dest_height; ++k) {
+      Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio,
+               dest_width);
+      source += source_pitch;
+      dest += dest_pitch;
+    }
+
+    return;
+  }
+
+  if (dest_height > source_height) {
+    dest_band_height = temp_area_height - 1;
+    source_band_height = dest_band_height * source_height / dest_height;
+  } else {
+    source_band_height = temp_area_height - 1;
+    dest_band_height = source_band_height * vratio / vscale;
+  }
+
+  /* first row needs to be done so that we can stay one row ahead for vertical
+   * zoom */
+  Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio,
+           dest_width);
+
+  /* for each band of the image */
+  bands = (dest_height + dest_band_height - 1) / dest_band_height;
+
+  for (k = 0; k < bands; ++k) {
+    /* scale one band horizontally */
+    for (i = 1; i < source_band_height + 1; ++i) {
+      if (k * source_band_height + i < source_height) {
+        Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1,
+                 temp_area + i * dest_pitch, 1, hratio, dest_width);
+      } else { /* Duplicate the last row */
+        /* out of source rows, so copy the previous temp_area row over this
+         * one */
+        memcpy(temp_area + i * dest_pitch, temp_area + (i - 1) * dest_pitch,
+               dest_pitch);
+      }
+    }
+
+    /* scale one band vertically */
+    for (j = 0; j < dest_width; ++j) {
+      Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1,
+               &dest[j], dest_pitch, vratio, dest_band_height);
+    }
+
+    /* copy temp_area row 0 over from last row in the past */
+    memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
+
+    /* move to the next band */
+    source += source_band_height * source_pitch;
+    dest += dest_band_height * dest_pitch;
+  }
+}
+
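+/* Editor's note: a worked example of the ratio dispatch in Scale2D above.
+ * Both switches key on (ratio * 10 / scale), so a 5-to-4 downscale passes
+ * hscale = 5, hratio = 4, and 4 * 10 / 5 == 8 selects
+ * aom_horizontal_line_5_4_scale; likewise 3 * 10 / 5 == 6 picks the 5-to-3
+ * path and 1 * 10 / 2 == 5 picks the 2-to-1 path.
+ */
+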
+/****************************************************************************
+ *
+ *  ROUTINE       : aom_scale_frame
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src        : Pointer to frame to be
+ *                                                   scaled.
+ *                  YV12_BUFFER_CONFIG *dst        : Pointer to buffer to hold
+ *                                                   scaled frame.
+ *                  unsigned char *temp_area       : Pointer to temp work area.
+ *                  unsigned char temp_area_height : Height of temp work area.
+ *                  unsigned int hscale            : Horizontal scale factor
+ *                                                   numerator.
+ *                  unsigned int hratio            : Horizontal scale factor
+ *                                                   denominator.
+ *                  unsigned int vscale            : Vertical scale factor
+ *                                                   numerator.
+ *                  unsigned int vratio            : Vertical scale factor
+ *                                                   denominator.
+ *                  unsigned int interlaced        : Interlace flag.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-tap linear interpolation in two dimensions.
+ *
+ *  SPECIAL NOTES : Expansion is performed one band at a time to help with
+ *                  caching.
+ *
+ ****************************************************************************/
+void aom_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                     unsigned char *temp_area, unsigned char temp_height,
+                     unsigned int hscale, unsigned int hratio,
+                     unsigned int vscale, unsigned int vratio,
+                     unsigned int interlaced, const int num_planes) {
+  const int dw = (hscale - 1 + src->y_width * hratio) / hscale;
+  const int dh = (vscale - 1 + src->y_height * vratio) / vscale;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int is_uv = plane > 0;
+    const int plane_dw = dw >> is_uv;
+    const int plane_dh = dh >> is_uv;
+
+    Scale2D((unsigned char *)src->buffers[plane], src->strides[is_uv],
+            src->widths[is_uv], src->heights[is_uv],
+            (unsigned char *)dst->buffers[plane], dst->strides[is_uv], plane_dw,
+            plane_dh, temp_area, temp_height, hscale, hratio, vscale, vratio,
+            interlaced);
+
+    if (plane_dw < dst->widths[is_uv])
+      for (int i = 0; i < plane_dh; ++i)
+        memset(dst->buffers[plane] + i * dst->strides[is_uv] + plane_dw - 1,
+               dst->buffers[plane][i * dst->strides[is_uv] + plane_dw - 2],
+               dst->widths[is_uv] - plane_dw + 1);
+
+    if (plane_dh < dst->heights[is_uv])
+      for (int i = plane_dh - 1; i < dst->heights[is_uv]; ++i)
+        memcpy(dst->buffers[plane] + i * dst->strides[is_uv],
+               dst->buffers[plane] + (plane_dh - 2) * dst->strides[is_uv],
+               dst->widths[is_uv] + 1);
+  }
+}
diff --git a/libs/libaom/src/aom_scale/generic/gen_scalers.c b/libs/libaom/src/aom_scale/generic/gen_scalers.c
new file mode 100644
index 000000000..549e2aa69
--- /dev/null
+++ b/libs/libaom/src/aom_scale/generic/gen_scalers.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_scale/aom_scale.h"
+#include "aom_mem/aom_mem.h"
+/****************************************************************************
+ *  Imports
+ ****************************************************************************/
+
+/****************************************************************************
+ *
+ *  ROUTINE       : aom_horizontal_line_5_4_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Stride of source.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Stride of destination
+ *                                                (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination, scaling down from 5 pixels to 4.
+ *
+ *  SPECIAL NOTES : None.
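+ *
+ *  EXAMPLE       : (Editor's sketch, read from the code below.) Each group
+ *                  of five source pixels a,b,c,d,e yields four outputs via
+ *                  8-bit fixed-point weights that sum to 256, with +128 for
+ *                  round-to-nearest:
+ *                    dest[0] = a
+ *                    dest[1] = (192*b +  64*c + 128) >> 8
+ *                    dest[2] = (128*c + 128*d + 128) >> 8
+ *                    dest[3] = ( 64*d + 192*e + 128) >> 8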
+ *
+ ****************************************************************************/
+void aom_horizontal_line_5_4_scale_c(const unsigned char *source,
+                                     unsigned int source_width,
+                                     unsigned char *dest,
+                                     unsigned int dest_width) {
+  const unsigned char *const source_end = source + source_width;
+  (void)dest_width;
+
+  while (source < source_end) {
+    const unsigned int a = source[0];
+    const unsigned int b = source[1];
+    const unsigned int c = source[2];
+    const unsigned int d = source[3];
+    const unsigned int e = source[4];
+
+    dest[0] = (unsigned char)a;
+    dest[1] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
+    dest[2] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
+    dest[3] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
+
+    source += 5;
+    dest += 4;
+  }
+}
+
+void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch,
+                                   unsigned char *dest, int dest_pitch,
+                                   unsigned int dest_width) {
+  const unsigned char *const dest_end = dest + dest_width;
+  while (dest < dest_end) {
+    const unsigned int a = source[0 * src_pitch];
+    const unsigned int b = source[1 * src_pitch];
+    const unsigned int c = source[2 * src_pitch];
+    const unsigned int d = source[3 * src_pitch];
+    const unsigned int e = source[4 * src_pitch];
+
+    dest[0 * dest_pitch] = (unsigned char)a;
+    dest[1 * dest_pitch] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
+    dest[2 * dest_pitch] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
+    dest[3 * dest_pitch] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
+
+    ++source;
+    ++dest;
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : aom_horizontal_line_5_3_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Stride of source.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Stride of destination
+ *                                                (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination, scaling down from 5 pixels to 3.
+ *
+ *  SPECIAL NOTES : None.
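+ *
+ *  EXAMPLE       : (Editor's sketch, read from the code below.) Each group
+ *                  of five source pixels a,b,c,d,e yields three outputs;
+ *                  85/256 and 171/256 approximate 1/3 and 2/3:
+ *                    dest[0] = a
+ *                    dest[1] = ( 85*b + 171*c + 128) >> 8
+ *                    dest[2] = (171*d +  85*e + 128) >> 8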
+ *
+ *
+ ****************************************************************************/
+void aom_horizontal_line_5_3_scale_c(const unsigned char *source,
+                                     unsigned int source_width,
+                                     unsigned char *dest,
+                                     unsigned int dest_width) {
+  const unsigned char *const source_end = source + source_width;
+  (void)dest_width;
+  while (source < source_end) {
+    const unsigned int a = source[0];
+    const unsigned int b = source[1];
+    const unsigned int c = source[2];
+    const unsigned int d = source[3];
+    const unsigned int e = source[4];
+
+    dest[0] = (unsigned char)a;
+    dest[1] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
+    dest[2] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
+
+    source += 5;
+    dest += 3;
+  }
+}
+
+void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch,
+                                   unsigned char *dest, int dest_pitch,
+                                   unsigned int dest_width) {
+  const unsigned char *const dest_end = dest + dest_width;
+  while (dest < dest_end) {
+    const unsigned int a = source[0 * src_pitch];
+    const unsigned int b = source[1 * src_pitch];
+    const unsigned int c = source[2 * src_pitch];
+    const unsigned int d = source[3 * src_pitch];
+    const unsigned int e = source[4 * src_pitch];
+
+    dest[0 * dest_pitch] = (unsigned char)a;
+    dest[1 * dest_pitch] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
+    dest[2 * dest_pitch] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
+
+    ++source;
+    ++dest;
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : aom_horizontal_line_2_1_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Stride of source.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Stride of destination
+ *                                                (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination, scaling down from 2 pixels to 1.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void aom_horizontal_line_2_1_scale_c(const unsigned char *source,
+                                     unsigned int source_width,
+                                     unsigned char *dest,
+                                     unsigned int dest_width) {
+  const unsigned char *const source_end = source + source_width;
+  (void)dest_width;
+  while (source < source_end) {
+    dest[0] = source[0];
+    source += 2;
+    ++dest;
+  }
+}
+
+void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch,
+                                   unsigned char *dest, int dest_pitch,
+                                   unsigned int dest_width) {
+  (void)dest_pitch;
+  (void)src_pitch;
+  memcpy(dest, source, dest_width);
+}
+
+void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch,
+                                     unsigned char *dest, int dest_pitch,
+                                     unsigned int dest_width) {
+  const unsigned char *const dest_end = dest + dest_width;
+  (void)dest_pitch;
+  while (dest < dest_end) {
+    const unsigned int a = source[-src_pitch] * 3;
+    const unsigned int b = source[0] * 10;
+    const unsigned int c = source[src_pitch] * 3;
+    dest[0] = (unsigned char)((8 + a + b + c) >> 4);
+    ++source;
+    ++dest;
+  }
+}
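+
+/* Editor's note: in aom_vertical_band_2_1_scale_i_c above, each output row is
+ * a (3, 10, 3) / 16 blend of the previous, current, and next source rows:
+ * dest = (8 + 3*above + 10*cur + 3*below) >> 4, with the 8 providing
+ * round-to-nearest.
+ */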
diff --git a/libs/libaom/src/aom_scale/generic/yv12config.c b/libs/libaom/src/aom_scale/generic/yv12config.c
new file mode 100644
index 000000000..1f80d7ba7
--- /dev/null
+++ b/libs/libaom/src/aom_scale/generic/yv12config.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/internal/aom_image_internal.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+
+/****************************************************************************
+ *  Exports
+ ****************************************************************************/
+
+/****************************************************************************
+ *
+ ****************************************************************************/
+
+// TODO(jkoleszar): Maybe replace this with struct aom_image
+int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
+  if (ybf) {
+    if (ybf->buffer_alloc_sz > 0) {
+      aom_free(ybf->buffer_alloc);
+    }
+    if (ybf->y_buffer_8bit) aom_free(ybf->y_buffer_8bit);
+    aom_remove_metadata_from_frame_buffer(ybf);
+    /* buffer_alloc isn't accessed by most functions. Rather y_buffer,
+       u_buffer and v_buffer point to buffer_alloc and are used. Clear out
+       all of this so that a freed pointer isn't inadvertently used */
+    memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
+    return 0;
+  }
+
+  return AOM_CODEC_MEM_ERROR;
+}
+
+static int realloc_frame_buffer_aligned(
+    YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y,
+    int use_highbitdepth, int border, int byte_alignment,
+    aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb,
+    void *cb_priv, const int y_stride, const uint64_t yplane_size,
+    const uint64_t uvplane_size, const int aligned_width,
+    const int aligned_height, const int uv_width, const int uv_height,
+    const int uv_stride, const int uv_border_w, const int uv_border_h) {
+  if (ybf) {
+    const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
+    const uint64_t frame_size =
+        (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);
+
+    uint8_t *buf = NULL;
+
+#if defined AOM_MAX_ALLOCABLE_MEMORY
+    // The size of ybf->buffer_alloc.
+    uint64_t alloc_size = frame_size;
+    // The size of ybf->y_buffer_8bit.
+    if (use_highbitdepth) alloc_size += yplane_size;
+    // The decoder may allocate REF_FRAMES frame buffers in the frame buffer
+    // pool. Bound the total amount of allocated memory as if these REF_FRAMES
+    // frame buffers were allocated in a single allocation.
+    if (alloc_size > AOM_MAX_ALLOCABLE_MEMORY / REF_FRAMES)
+      return AOM_CODEC_MEM_ERROR;
+#endif
+
+    if (cb != NULL) {
+      const int align_addr_extra_size = 31;
+      const uint64_t external_frame_size = frame_size + align_addr_extra_size;
+
+      assert(fb != NULL);
+
+      if (external_frame_size != (size_t)external_frame_size)
+        return AOM_CODEC_MEM_ERROR;
+
+      // Allocation to hold larger frame, or first allocation.
+      if (cb(cb_priv, (size_t)external_frame_size, fb) < 0)
+        return AOM_CODEC_MEM_ERROR;
+
+      if (fb->data == NULL || fb->size < external_frame_size)
+        return AOM_CODEC_MEM_ERROR;
+
+      ybf->buffer_alloc = (uint8_t *)aom_align_addr(fb->data, 32);
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+      // This memset is needed for fixing the issue of using uninitialized
+      // value in msan test. It will cause a perf loss, so only do this for
+      // msan test.
+ memset(ybf->buffer_alloc, 0, (size_t)frame_size); +#endif +#endif + } else if (frame_size > ybf->buffer_alloc_sz) { + // Allocation to hold larger frame, or first allocation. + aom_free(ybf->buffer_alloc); + ybf->buffer_alloc = NULL; + ybf->buffer_alloc_sz = 0; + + if (frame_size != (size_t)frame_size) return AOM_CODEC_MEM_ERROR; + + ybf->buffer_alloc = (uint8_t *)aom_memalign(32, (size_t)frame_size); + if (!ybf->buffer_alloc) return AOM_CODEC_MEM_ERROR; + + ybf->buffer_alloc_sz = (size_t)frame_size; + + // This memset is needed for fixing valgrind error from C loop filter + // due to access uninitialized memory in frame border. It could be + // removed if border is totally removed. + memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz); + } + + ybf->y_crop_width = width; + ybf->y_crop_height = height; + ybf->y_width = aligned_width; + ybf->y_height = aligned_height; + ybf->y_stride = y_stride; + + ybf->uv_crop_width = (width + ss_x) >> ss_x; + ybf->uv_crop_height = (height + ss_y) >> ss_y; + ybf->uv_width = uv_width; + ybf->uv_height = uv_height; + ybf->uv_stride = uv_stride; + + ybf->border = border; + ybf->frame_size = (size_t)frame_size; + ybf->subsampling_x = ss_x; + ybf->subsampling_y = ss_y; + + buf = ybf->buffer_alloc; + if (use_highbitdepth) { + // Store uint16 addresses when using 16bit framebuffers + buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc); + ybf->flags = YV12_FLAG_HIGHBITDEPTH; + } else { + ybf->flags = 0; + } + + ybf->y_buffer = (uint8_t *)aom_align_addr( + buf + (border * y_stride) + border, aom_byte_align); + ybf->u_buffer = (uint8_t *)aom_align_addr( + buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w, + aom_byte_align); + ybf->v_buffer = + (uint8_t *)aom_align_addr(buf + yplane_size + uvplane_size + + (uv_border_h * uv_stride) + uv_border_w, + aom_byte_align); + + ybf->use_external_reference_buffers = 0; + + if (use_highbitdepth) { + if (ybf->y_buffer_8bit) aom_free(ybf->y_buffer_8bit); + ybf->y_buffer_8bit = (uint8_t *)aom_memalign(32, (size_t)yplane_size); + if (!ybf->y_buffer_8bit) return AOM_CODEC_MEM_ERROR; + } else { + if (ybf->y_buffer_8bit) { + aom_free(ybf->y_buffer_8bit); + ybf->y_buffer_8bit = NULL; + ybf->buf_8bit_valid = 0; + } + } + + ybf->corrupted = 0; /* assume not corrupted by errors */ + return 0; + } + return AOM_CODEC_MEM_ERROR; +} + +static int calc_stride_and_planesize(const int ss_x, const int ss_y, + const int aligned_width, + const int aligned_height, const int border, + const int byte_alignment, int *y_stride, + int *uv_stride, uint64_t *yplane_size, + uint64_t *uvplane_size, + const int uv_height) { + /* Only support allocating buffers that have a border that's a multiple + * of 32. The border restriction is required to get 16-byte alignment of + * the start of the chroma rows without introducing an arbitrary gap + * between planes, which would break the semantics of things like + * aom_img_set_rect(). 
*/ + if (border & 0x1f) return AOM_CODEC_MEM_ERROR; + *y_stride = ((aligned_width + 2 * border) + 31) & ~31; + *yplane_size = + (aligned_height + 2 * border) * (uint64_t)(*y_stride) + byte_alignment; + + *uv_stride = *y_stride >> ss_x; + *uvplane_size = (uv_height + 2 * (border >> ss_y)) * (uint64_t)(*uv_stride) + + byte_alignment; + return 0; +} + +int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, + int ss_x, int ss_y, int use_highbitdepth, + int border, int byte_alignment, + aom_codec_frame_buffer_t *fb, + aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) + return AOM_CODEC_MEM_ERROR; +#endif + + if (ybf) { + int y_stride = 0; + int uv_stride = 0; + uint64_t yplane_size = 0; + uint64_t uvplane_size = 0; + const int aligned_width = (width + 7) & ~7; + const int aligned_height = (height + 7) & ~7; + const int uv_width = aligned_width >> ss_x; + const int uv_height = aligned_height >> ss_y; + const int uv_border_w = border >> ss_x; + const int uv_border_h = border >> ss_y; + + int error = calc_stride_and_planesize( + ss_x, ss_y, aligned_width, aligned_height, border, byte_alignment, + &y_stride, &uv_stride, &yplane_size, &uvplane_size, uv_height); + if (error) return error; + return realloc_frame_buffer_aligned( + ybf, width, height, ss_x, ss_y, use_highbitdepth, border, + byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size, + aligned_width, aligned_height, uv_width, uv_height, uv_stride, + uv_border_w, uv_border_h); + } + return AOM_CODEC_MEM_ERROR; +} + +int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, + int ss_x, int ss_y, int use_highbitdepth, int border, + int byte_alignment) { + if (ybf) { + aom_free_frame_buffer(ybf); + return aom_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, + use_highbitdepth, border, byte_alignment, + NULL, NULL, NULL); + } + return AOM_CODEC_MEM_ERROR; +} + +void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf) { + if (ybf && ybf->metadata) { + aom_img_metadata_array_free(ybf->metadata); + ybf->metadata = NULL; + } +} + +int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf, + const aom_metadata_array_t *arr) { + if (!ybf || !arr || !arr->metadata_array) return -1; + aom_remove_metadata_from_frame_buffer(ybf); + ybf->metadata = aom_img_metadata_array_alloc(arr->sz); + if (!ybf->metadata) return -1; + for (size_t i = 0; i < ybf->metadata->sz; i++) { + ybf->metadata->metadata_array[i] = aom_img_metadata_alloc( + arr->metadata_array[i]->type, arr->metadata_array[i]->payload, + arr->metadata_array[i]->sz, arr->metadata_array[i]->insert_flag); + if (ybf->metadata->metadata_array[i] == NULL) { + aom_img_metadata_array_free(ybf->metadata); + ybf->metadata = NULL; + return -1; + } + } + ybf->metadata->sz = arr->sz; + return 0; +} diff --git a/libs/libaom/src/aom_scale/generic/yv12extend.c b/libs/libaom/src/aom_scale/generic/yv12extend.c new file mode 100644 index 000000000..834a59dbf --- /dev/null +++ b/libs/libaom/src/aom_scale/generic/yv12extend.c @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+static void extend_plane(uint8_t *const src, int src_stride, int width,
+                         int height, int extend_top, int extend_left,
+                         int extend_bottom, int extend_right) {
+  int i;
+  const int linesize = extend_left + extend_right + width;
+
+  /* copy the left and right most columns out */
+  uint8_t *src_ptr1 = src;
+  uint8_t *src_ptr2 = src + width - 1;
+  uint8_t *dst_ptr1 = src - extend_left;
+  uint8_t *dst_ptr2 = src + width;
+
+  for (i = 0; i < height; ++i) {
+    memset(dst_ptr1, src_ptr1[0], extend_left);
+    memset(dst_ptr2, src_ptr2[0], extend_right);
+    src_ptr1 += src_stride;
+    src_ptr2 += src_stride;
+    dst_ptr1 += src_stride;
+    dst_ptr2 += src_stride;
+  }
+
+  /* Now copy the top and bottom lines into each line of the respective
+   * borders
+   */
+  src_ptr1 = src - extend_left;
+  src_ptr2 = src + src_stride * (height - 1) - extend_left;
+  dst_ptr1 = src + src_stride * -extend_top - extend_left;
+  dst_ptr2 = src + src_stride * height - extend_left;
+
+  for (i = 0; i < extend_top; ++i) {
+    memcpy(dst_ptr1, src_ptr1, linesize);
+    dst_ptr1 += src_stride;
+  }
+
+  for (i = 0; i < extend_bottom; ++i) {
+    memcpy(dst_ptr2, src_ptr2, linesize);
+    dst_ptr2 += src_stride;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void extend_plane_high(uint8_t *const src8, int src_stride, int width,
+                              int height, int extend_top, int extend_left,
+                              int extend_bottom, int extend_right) {
+  int i;
+  const int linesize = extend_left + extend_right + width;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+
+  /* copy the left and right most columns out */
+  uint16_t *src_ptr1 = src;
+  uint16_t *src_ptr2 = src + width - 1;
+  uint16_t *dst_ptr1 = src - extend_left;
+  uint16_t *dst_ptr2 = src + width;
+
+  for (i = 0; i < height; ++i) {
+    aom_memset16(dst_ptr1, src_ptr1[0], extend_left);
+    aom_memset16(dst_ptr2, src_ptr2[0], extend_right);
+    src_ptr1 += src_stride;
+    src_ptr2 += src_stride;
+    dst_ptr1 += src_stride;
+    dst_ptr2 += src_stride;
+  }
+
+  /* Now copy the top and bottom lines into each line of the respective
+   * borders
+   */
+  src_ptr1 = src - extend_left;
+  src_ptr2 = src + src_stride * (height - 1) - extend_left;
+  dst_ptr1 = src + src_stride * -extend_top - extend_left;
+  dst_ptr2 = src + src_stride * height - extend_left;
+
+  for (i = 0; i < extend_top; ++i) {
+    memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
+    dst_ptr1 += src_stride;
+  }
+
+  for (i = 0; i < extend_bottom; ++i) {
+    memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
+    dst_ptr2 += src_stride;
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
+                                     const int num_planes) {
+  assert(ybf->border % 2 == 0);
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    for (int plane = 0; plane < num_planes; ++plane) {
+      const int is_uv = plane > 0;
+      const int plane_border = ybf->border >> is_uv;
+      extend_plane_high(
+          ybf->buffers[plane], ybf->strides[is_uv], ybf->crop_widths[is_uv],
ybf->crop_heights[is_uv], plane_border, plane_border, + plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv], + plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv]); + } + return; + } +#endif + + for (int plane = 0; plane < num_planes; ++plane) { + const int is_uv = plane > 0; + const int plane_border = ybf->border >> is_uv; + extend_plane(ybf->buffers[plane], ybf->strides[is_uv], + ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], + plane_border, plane_border, + plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv], + plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv]); + } +} + +static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size, + const int num_planes) { + const int ss_x = ybf->uv_width < ybf->y_width; + const int ss_y = ybf->uv_height < ybf->y_height; + + assert(ybf->y_height - ybf->y_crop_height < 16); + assert(ybf->y_width - ybf->y_crop_width < 16); + assert(ybf->y_height - ybf->y_crop_height >= 0); + assert(ybf->y_width - ybf->y_crop_width >= 0); + +#if CONFIG_AV1_HIGHBITDEPTH + if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (int plane = 0; plane < num_planes; ++plane) { + const int is_uv = plane > 0; + const int top = ext_size >> (is_uv ? ss_y : 0); + const int left = ext_size >> (is_uv ? ss_x : 0); + const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv]; + const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv]; + extend_plane_high(ybf->buffers[plane], ybf->strides[is_uv], + ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top, + left, bottom, right); + } + return; + } +#endif + + for (int plane = 0; plane < num_planes; ++plane) { + const int is_uv = plane > 0; + const int top = ext_size >> (is_uv ? ss_y : 0); + const int left = ext_size >> (is_uv ? ss_x : 0); + const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv]; + const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv]; + extend_plane(ybf->buffers[plane], ybf->strides[is_uv], + ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top, left, + bottom, right); + } +} + +void aom_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, const int num_planes) { + extend_frame(ybf, ybf->border, num_planes); +} + +void aom_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf, + const int num_planes) { + const int inner_bw = (ybf->border > AOMINNERBORDERINPIXELS) + ? 
AOMINNERBORDERINPIXELS + : ybf->border; + extend_frame(ybf, inner_bw, num_planes); +} + +void aom_extend_frame_borders_y_c(YV12_BUFFER_CONFIG *ybf) { + int ext_size = ybf->border; + assert(ybf->y_height - ybf->y_crop_height < 16); + assert(ybf->y_width - ybf->y_crop_width < 16); + assert(ybf->y_height - ybf->y_crop_height >= 0); + assert(ybf->y_width - ybf->y_crop_width >= 0); +#if CONFIG_AV1_HIGHBITDEPTH + if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { + extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, + ybf->y_crop_height, ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); + return; + } +#endif + extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, + ybf->y_crop_height, ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + memcpy(dst, src, num * sizeof(uint16_t)); +} +#endif + +// Copies the source image into the destination image and updates the +// destination's UMV borders. +// Note: The frames are assumed to be identical in size. +void aom_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc, const int num_planes) { +#if 0 + /* These assertions are valid in the codec, but the libaom-tester uses + * this code slightly differently. + */ + assert(src_bc->y_width == dst_bc->y_width); + assert(src_bc->y_height == dst_bc->y_height); +#endif + +#if CONFIG_AV1_HIGHBITDEPTH + assert((src_bc->flags & YV12_FLAG_HIGHBITDEPTH) == + (dst_bc->flags & YV12_FLAG_HIGHBITDEPTH)); + + if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { + for (int plane = 0; plane < num_planes; ++plane) { + const uint8_t *plane_src = src_bc->buffers[plane]; + uint8_t *plane_dst = dst_bc->buffers[plane]; + const int is_uv = plane > 0; + + for (int row = 0; row < src_bc->heights[is_uv]; ++row) { + memcpy_short_addr(plane_dst, plane_src, src_bc->widths[is_uv]); + plane_src += src_bc->strides[is_uv]; + plane_dst += dst_bc->strides[is_uv]; + } + } + aom_yv12_extend_frame_borders_c(dst_bc, num_planes); + return; + } +#endif + for (int plane = 0; plane < num_planes; ++plane) { + const uint8_t *plane_src = src_bc->buffers[plane]; + uint8_t *plane_dst = dst_bc->buffers[plane]; + const int is_uv = plane > 0; + + for (int row = 0; row < src_bc->heights[is_uv]; ++row) { + memcpy(plane_dst, plane_src, src_bc->widths[is_uv]); + plane_src += src_bc->strides[is_uv]; + plane_dst += dst_bc->strides[is_uv]; + } + } + aom_yv12_extend_frame_borders_c(dst_bc, num_planes); +} + +void aom_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { + int row; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; + +#if CONFIG_AV1_HIGHBITDEPTH + if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t)); + src16 += src_ybc->y_stride; + dst16 += dst_ybc->y_stride; + } + return; + } +#endif + + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } +} + +void aom_yv12_copy_u_c(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc) { + int 
row; + const uint8_t *src = src_bc->u_buffer; + uint8_t *dst = dst_bc->u_buffer; +#if CONFIG_AV1_HIGHBITDEPTH + if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (row = 0; row < src_bc->uv_height; ++row) { + memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t)); + src16 += src_bc->uv_stride; + dst16 += dst_bc->uv_stride; + } + return; + } +#endif + for (row = 0; row < src_bc->uv_height; ++row) { + memcpy(dst, src, src_bc->uv_width); + src += src_bc->uv_stride; + dst += dst_bc->uv_stride; + } +} + +void aom_yv12_copy_v_c(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc) { + int row; + const uint8_t *src = src_bc->v_buffer; + uint8_t *dst = dst_bc->v_buffer; +#if CONFIG_AV1_HIGHBITDEPTH + if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (row = 0; row < src_bc->uv_height; ++row) { + memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t)); + src16 += src_bc->uv_stride; + dst16 += dst_bc->uv_stride; + } + return; + } +#endif + for (row = 0; row < src_bc->uv_height; ++row) { + memcpy(dst, src, src_bc->uv_width); + src += src_bc->uv_stride; + dst += dst_bc->uv_stride; + } +} + +void aom_yv12_partial_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, int hstart1, + int hend1, int vstart1, int vend1, + YV12_BUFFER_CONFIG *dst_ybc, int hstart2, + int vstart2) { + int row; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; +#if CONFIG_AV1_HIGHBITDEPTH + if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = + CONVERT_TO_SHORTPTR(src + vstart1 * src_ybc->y_stride + hstart1); + uint16_t *dst16 = + CONVERT_TO_SHORTPTR(dst + vstart2 * dst_ybc->y_stride + hstart2); + + for (row = vstart1; row < vend1; ++row) { + memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t)); + src16 += src_ybc->y_stride; + dst16 += dst_ybc->y_stride; + } + return; + } +#endif + src = (src + vstart1 * src_ybc->y_stride + hstart1); + dst = (dst + vstart2 * dst_ybc->y_stride + hstart2); + + for (row = vstart1; row < vend1; ++row) { + memcpy(dst, src, (hend1 - hstart1)); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } +} + +void aom_yv12_partial_coloc_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc, int hstart, + int hend, int vstart, int vend) { + aom_yv12_partial_copy_y_c(src_ybc, hstart, hend, vstart, vend, dst_ybc, + hstart, vstart); +} + +void aom_yv12_partial_copy_u_c(const YV12_BUFFER_CONFIG *src_bc, int hstart1, + int hend1, int vstart1, int vend1, + YV12_BUFFER_CONFIG *dst_bc, int hstart2, + int vstart2) { + int row; + const uint8_t *src = src_bc->u_buffer; + uint8_t *dst = dst_bc->u_buffer; +#if CONFIG_AV1_HIGHBITDEPTH + if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = + CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1); + uint16_t *dst16 = + CONVERT_TO_SHORTPTR(dst + vstart2 * dst_bc->uv_stride + hstart2); + for (row = vstart1; row < vend1; ++row) { + memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t)); + src16 += src_bc->uv_stride; + dst16 += dst_bc->uv_stride; + } + return; + } +#endif + src = (src + vstart1 * src_bc->uv_stride + hstart1); + dst = (dst + vstart2 * dst_bc->uv_stride + hstart2); + + for (row = vstart1; row < vend1; ++row) { + memcpy(dst, src, (hend1 - hstart1)); + src += src_bc->uv_stride; + dst += dst_bc->uv_stride; + } +} + +void 
aom_yv12_partial_coloc_copy_u_c(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc, int hstart, + int hend, int vstart, int vend) { + aom_yv12_partial_copy_u_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart, + vstart); +} + +void aom_yv12_partial_copy_v_c(const YV12_BUFFER_CONFIG *src_bc, int hstart1, + int hend1, int vstart1, int vend1, + YV12_BUFFER_CONFIG *dst_bc, int hstart2, + int vstart2) { + int row; + const uint8_t *src = src_bc->v_buffer; + uint8_t *dst = dst_bc->v_buffer; +#if CONFIG_AV1_HIGHBITDEPTH + if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = + CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1); + uint16_t *dst16 = + CONVERT_TO_SHORTPTR(dst + vstart2 * dst_bc->uv_stride + hstart2); + for (row = vstart1; row < vend1; ++row) { + memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t)); + src16 += src_bc->uv_stride; + dst16 += dst_bc->uv_stride; + } + return; + } +#endif + src = (src + vstart1 * src_bc->uv_stride + hstart1); + dst = (dst + vstart2 * dst_bc->uv_stride + hstart2); + + for (row = vstart1; row < vend1; ++row) { + memcpy(dst, src, (hend1 - hstart1)); + src += src_bc->uv_stride; + dst += dst_bc->uv_stride; + } +} + +void aom_yv12_partial_coloc_copy_v_c(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc, int hstart, + int hend, int vstart, int vend) { + aom_yv12_partial_copy_v_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart, + vstart); +} + +int aom_yv12_realloc_with_new_border_c(YV12_BUFFER_CONFIG *ybf, int new_border, + int byte_alignment, int num_planes) { + if (ybf) { + if (new_border == ybf->border) return 0; + YV12_BUFFER_CONFIG new_buf; + memset(&new_buf, 0, sizeof(new_buf)); + const int error = aom_alloc_frame_buffer( + &new_buf, ybf->y_crop_width, ybf->y_crop_height, ybf->subsampling_x, + ybf->subsampling_y, ybf->flags & YV12_FLAG_HIGHBITDEPTH, new_border, + byte_alignment); + if (error) return error; + // Copy image buffer + aom_yv12_copy_frame(ybf, &new_buf, num_planes); + + // Extend up to new border + aom_extend_frame_borders(&new_buf, num_planes); + + // Now free the old buffer and replace with the new + aom_free_frame_buffer(ybf); + memcpy(ybf, &new_buf, sizeof(new_buf)); + return 0; + } + return -2; +} diff --git a/libs/libaom/src/aom_scale/mips/dspr2/yv12extend_dspr2.c b/libs/libaom/src/aom_scale/mips/dspr2/yv12extend_dspr2.c new file mode 100644 index 000000000..869e594d7 --- /dev/null +++ b/libs/libaom/src/aom_scale/mips/dspr2/yv12extend_dspr2.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_scale/aom_scale.h"
+
+#if HAVE_DSPR2
+static void extend_plane(uint8_t *const src, int src_stride, int width,
+                         int height, int extend_top, int extend_left,
+                         int extend_bottom, int extend_right) {
+  int i, j;
+  uint8_t *left_src, *right_src;
+  uint8_t *left_dst_start, *right_dst_start;
+  uint8_t *left_dst, *right_dst;
+  uint8_t *top_src, *bot_src;
+  uint8_t *top_dst, *bot_dst;
+  uint32_t left_pix;
+  uint32_t right_pix;
+  uint32_t linesize;
+
+  /* copy the left and right most columns out */
+  left_src = src;
+  right_src = src + width - 1;
+  left_dst_start = src - extend_left;
+  right_dst_start = src + width;
+
+  for (i = height; i--;) {
+    left_dst = left_dst_start;
+    right_dst = right_dst_start;
+
+    __asm__ __volatile__(
+        "lb        %[left_pix],     0(%[left_src])      \n\t"
+        "lb        %[right_pix],    0(%[right_src])     \n\t"
+        "replv.qb  %[left_pix],     %[left_pix]         \n\t"
+        "replv.qb  %[right_pix],    %[right_pix]        \n\t"
+
+        : [left_pix] "=&r"(left_pix), [right_pix] "=&r"(right_pix)
+        : [left_src] "r"(left_src), [right_src] "r"(right_src));
+
+    for (j = extend_left / 4; j--;) {
+      __asm__ __volatile__(
+          "sw     %[left_pix],    0(%[left_dst])     \n\t"
+          "sw     %[right_pix],   0(%[right_dst])    \n\t"
+
+          :
+          : [left_dst] "r"(left_dst), [left_pix] "r"(left_pix),
+            [right_dst] "r"(right_dst), [right_pix] "r"(right_pix));
+
+      left_dst += 4;
+      right_dst += 4;
+    }
+
+    for (j = extend_left % 4; j--;) {
+      __asm__ __volatile__(
+          "sb     %[left_pix],    0(%[left_dst])     \n\t"
+          "sb     %[right_pix],   0(%[right_dst])    \n\t"
+
+          :
+          : [left_dst] "r"(left_dst), [left_pix] "r"(left_pix),
+            [right_dst] "r"(right_dst), [right_pix] "r"(right_pix));
+
+      left_dst += 1;
+      right_dst += 1;
+    }
+
+    left_src += src_stride;
+    right_src += src_stride;
+    left_dst_start += src_stride;
+    right_dst_start += src_stride;
+  }
+
+  /* Now copy the top and bottom lines into each line of the respective
+   * borders
+   */
+  top_src = src - extend_left;
+  bot_src = src + src_stride * (height - 1) - extend_left;
+  top_dst = src + src_stride * (-extend_top) - extend_left;
+  bot_dst = src + src_stride * height - extend_left;
+  linesize = extend_left + extend_right + width;
+
+  for (i = 0; i < extend_top; i++) {
+    memcpy(top_dst, top_src, linesize);
+    top_dst += src_stride;
+  }
+
+  for (i = 0; i < extend_bottom; i++) {
+    memcpy(bot_dst, bot_src, linesize);
+    bot_dst += src_stride;
+  }
+}
+
+static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
+  const int c_w = ybf->uv_crop_width;
+  const int c_h = ybf->uv_crop_height;
+  const int ss_x = ybf->uv_width < ybf->y_width;
+  const int ss_y = ybf->uv_height < ybf->y_height;
+  const int c_et = ext_size >> ss_y;
+  const int c_el = ext_size >> ss_x;
+  const int c_eb = c_et + ybf->uv_height - ybf->uv_crop_height;
+  const int c_er = c_el + ybf->uv_width - ybf->uv_crop_width;
+
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+  extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
+               ybf->y_crop_height, ext_size, ext_size,
+               ext_size + ybf->y_height - ybf->y_crop_height,
+               ext_size + ybf->y_width - ybf->y_crop_width);
+
+  extend_plane(ybf->u_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er);
+
+  extend_plane(ybf->v_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er);
+}
+
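+/* Editor's note: in the inline assembly above, replv.qb broadcasts the edge
+ * byte into all four bytes of a word, so each sw stores four border pixels at
+ * once; the remaining (extend_left % 4) pixels are stored one at a time with
+ * sb.
+ */
+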
+void aom_extend_frame_borders_dspr2(YV12_BUFFER_CONFIG *ybf,
+                                    const int num_planes) {
+  (void)num_planes;
+  extend_frame(ybf, ybf->border);
+}
+
+void aom_extend_frame_inner_borders_dspr2(YV12_BUFFER_CONFIG *ybf,
+                                          const int num_planes) {
+  const int inner_bw = (ybf->border > AOMINNERBORDERINPIXELS)
+                           ? AOMINNERBORDERINPIXELS
+                           : ybf->border;
+  (void)num_planes;
+  extend_frame(ybf, inner_bw);
+}
+#endif
diff --git a/libs/libaom/src/aom_scale/yv12config.h b/libs/libaom/src/aom_scale/yv12config.h
new file mode 100644
index 000000000..3642bb7f3
--- /dev/null
+++ b/libs/libaom/src/aom_scale/yv12config.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_SCALE_YV12CONFIG_H_
+#define AOM_AOM_SCALE_YV12CONFIG_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_frame_buffer.h"
+#include "aom/aom_integer.h"
+#include "aom/internal/aom_image_internal.h"
+
+#define AOMINNERBORDERINPIXELS 160
+#define AOM_INTERP_EXTEND 4
+#define AOM_BORDER_IN_PIXELS 288
+#define AOM_ENC_NO_SCALE_BORDER 160
+#define AOM_DEC_BORDER_IN_PIXELS 64
+
+typedef struct yv12_buffer_config {
+  union {
+    struct {
+      int y_width;
+      int uv_width;
+    };
+    int widths[2];
+  };
+  union {
+    struct {
+      int y_height;
+      int uv_height;
+    };
+    int heights[2];
+  };
+  union {
+    struct {
+      int y_crop_width;
+      int uv_crop_width;
+    };
+    int crop_widths[2];
+  };
+  union {
+    struct {
+      int y_crop_height;
+      int uv_crop_height;
+    };
+    int crop_heights[2];
+  };
+  union {
+    struct {
+      int y_stride;
+      int uv_stride;
+    };
+    int strides[2];
+  };
+  union {
+    struct {
+      uint8_t *y_buffer;
+      uint8_t *u_buffer;
+      uint8_t *v_buffer;
+    };
+    uint8_t *buffers[3];
+  };
+
+  // Indicates whether y_buffer, u_buffer, and v_buffer point to the internally
+  // allocated memory or external buffers.
+  int use_external_reference_buffers;
+  // This is needed to store y_buffer, u_buffer, and v_buffer when set reference
+  // uses an external reference, and to restore those buffer pointers after the
+  // external reference frame is no longer used.
+  uint8_t *store_buf_adr[3];
+
+  // If the frame is stored in a 16-bit buffer, this stores an 8-bit version
+  // for use in global motion detection. It is allocated on-demand.
+  uint8_t *y_buffer_8bit;
+  int buf_8bit_valid;
+
+  uint8_t *buffer_alloc;
+  size_t buffer_alloc_sz;
+  int border;
+  size_t frame_size;
+  int subsampling_x;
+  int subsampling_y;
+  unsigned int bit_depth;
+  aom_color_primaries_t color_primaries;
+  aom_transfer_characteristics_t transfer_characteristics;
+  aom_matrix_coefficients_t matrix_coefficients;
+  uint8_t monochrome;
+  aom_chroma_sample_position_t chroma_sample_position;
+  aom_color_range_t color_range;
+  int render_width;
+  int render_height;
+
+  int corrupted;
+  int flags;
+  aom_metadata_array_t *metadata;
+} YV12_BUFFER_CONFIG;
+
+#define YV12_FLAG_HIGHBITDEPTH 8
+
+int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+                           int ss_x, int ss_y, int use_highbitdepth, int border,
+                           int byte_alignment);
+
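+// A minimal allocation sketch (editor's addition; the sizes are illustrative,
+// everything else is declared in this header):
+//
+//   YV12_BUFFER_CONFIG buf = { 0 };
+//   // 640x480, 4:2:0 (ss_x = ss_y = 1), 8-bit, default border, legacy
+//   // alignment. Returns 0 on success.
+//   if (aom_alloc_frame_buffer(&buf, 640, 480, 1, 1, 0,
+//                              AOM_BORDER_IN_PIXELS, 0) == 0) {
+//     // ... use buf.y_buffer / buf.u_buffer / buf.v_buffer ...
+//     aom_free_frame_buffer(&buf);
+//   }
+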
+// Updates the yv12 buffer config with the frame buffer. |byte_alignment| must
+// be a power of 2, from 32 to 1024. 0 sets legacy alignment. If cb is not
+// NULL, then libaom is using the frame buffer callbacks to handle memory.
+// If cb is not NULL, libaom will call cb with minimum size in bytes needed
+// to decode the current frame. If cb is NULL, libaom will allocate memory
+// internally to decode the current frame. Returns 0 on success. Returns < 0
+// on failure.
+int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+                             int ss_x, int ss_y, int use_highbitdepth,
+                             int border, int byte_alignment,
+                             aom_codec_frame_buffer_t *fb,
+                             aom_get_frame_buffer_cb_fn_t cb, void *cb_priv);
+
+int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+
+/*!\brief Removes metadata from YUV_BUFFER_CONFIG struct.
+ *
+ * Frees metadata in frame buffer.
+ * Frame buffer metadata pointer will be set to NULL.
+ *
+ * \param[in]    ybf       Frame buffer struct pointer
+ */
+void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+
+/*!\brief Copy metadata to YUV_BUFFER_CONFIG struct.
+ *
+ * Copies metadata in frame buffer.
+ * Frame buffer will clear any previous metadata and will reallocate the
+ * metadata array to the new metadata size. Then, it will copy the new metadata
+ * array into it.
+ * Returns 0 on success or -1 on failure.
+ *
+ * \param[in]    ybf       Frame buffer struct pointer
+ * \param[in]    arr       Metadata array struct pointer
+ */
+int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                                      const aom_metadata_array_t *arr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AOM_AOM_SCALE_YV12CONFIG_H_
diff --git a/libs/libaom/src/aom_util/aom_thread.c b/libs/libaom/src/aom_util/aom_thread.c
new file mode 100644
index 000000000..a749a2240
--- /dev/null
+++ b/libs/libaom/src/aom_util/aom_thread.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+//
+// Multi-threaded worker
+//
+// Original source:
+//  https://chromium.googlesource.com/webm/libwebp
+
+// Enable GNU extensions in glibc so that we can call pthread_setname_np().
+// This must be before any #include statements.
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <assert.h>
+#include <string.h>  // for memset()
+
+#include "aom_mem/aom_mem.h"
+#include "aom_util/aom_thread.h"
+
+#if CONFIG_MULTITHREAD
+
+struct AVxWorkerImpl {
+  pthread_mutex_t mutex_;
+  pthread_cond_t condition_;
+  pthread_t thread_;
+};
+
+//------------------------------------------------------------------------------
+
+static void execute(AVxWorker *const worker);  // Forward declaration.
+
+static THREADFN thread_loop(void *ptr) {
+  AVxWorker *const worker = (AVxWorker *)ptr;
+#ifdef __APPLE__
+  if (worker->thread_name != NULL) {
+    // Apple's version of pthread_setname_np takes one argument and operates on
+    // the current thread only. The maximum size of the thread_name buffer was
+    // noted in the Chromium source code and was confirmed by experiments. If
+    // thread_name is too long, pthread_setname_np returns -1 with errno
+    // ENAMETOOLONG (63).
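+    // Editor's note: the copy below truncates the caller's name into a fixed
+    // 64-byte local buffer and always NUL-terminates it, so the call cannot
+    // fail with ENAMETOOLONG.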
+ char thread_name[64]; + strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1); + thread_name[sizeof(thread_name) - 1] = '\0'; + pthread_setname_np(thread_name); + } +#elif defined(__GLIBC__) || defined(__BIONIC__) + if (worker->thread_name != NULL) { + // Linux and Android require names (with nul) fit in 16 chars, otherwise + // pthread_setname_np() returns ERANGE (34). + char thread_name[16]; + strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1); + thread_name[sizeof(thread_name) - 1] = '\0'; + pthread_setname_np(pthread_self(), thread_name); + } +#endif + int done = 0; + while (!done) { + pthread_mutex_lock(&worker->impl_->mutex_); + while (worker->status_ == OK) { // wait in idling mode + pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); + } + if (worker->status_ == WORK) { + execute(worker); + worker->status_ = OK; + } else if (worker->status_ == NOT_OK) { // finish the worker + done = 1; + } + // signal to the main thread that we're done (for sync()) + pthread_cond_signal(&worker->impl_->condition_); + pthread_mutex_unlock(&worker->impl_->mutex_); + } + return THREAD_RETURN(NULL); // Thread is finished +} + +// main thread state control +static void change_state(AVxWorker *const worker, AVxWorkerStatus new_status) { + // No-op when attempting to change state on a thread that didn't come up. + // Checking status_ without acquiring the lock first would result in a data + // race. + if (worker->impl_ == NULL) return; + + pthread_mutex_lock(&worker->impl_->mutex_); + if (worker->status_ >= OK) { + // wait for the worker to finish + while (worker->status_ != OK) { + pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); + } + // assign new status and release the working thread if needed + if (new_status != OK) { + worker->status_ = new_status; + pthread_cond_signal(&worker->impl_->condition_); + } + } + pthread_mutex_unlock(&worker->impl_->mutex_); +} + +#endif // CONFIG_MULTITHREAD + +//------------------------------------------------------------------------------ + +static void init(AVxWorker *const worker) { + memset(worker, 0, sizeof(*worker)); + worker->status_ = NOT_OK; +} + +static int sync(AVxWorker *const worker) { +#if CONFIG_MULTITHREAD + change_state(worker, OK); +#endif + assert(worker->status_ <= OK); + return !worker->had_error; +} + +static int reset(AVxWorker *const worker) { + int ok = 1; + worker->had_error = 0; + if (worker->status_ < OK) { +#if CONFIG_MULTITHREAD + worker->impl_ = (AVxWorkerImpl *)aom_calloc(1, sizeof(*worker->impl_)); + if (worker->impl_ == NULL) { + return 0; + } + if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) { + goto Error; + } + if (pthread_cond_init(&worker->impl_->condition_, NULL)) { + pthread_mutex_destroy(&worker->impl_->mutex_); + goto Error; + } + pthread_mutex_lock(&worker->impl_->mutex_); + ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker); + if (ok) worker->status_ = OK; + pthread_mutex_unlock(&worker->impl_->mutex_); + if (!ok) { + pthread_mutex_destroy(&worker->impl_->mutex_); + pthread_cond_destroy(&worker->impl_->condition_); + Error: + aom_free(worker->impl_); + worker->impl_ = NULL; + return 0; + } +#else + worker->status_ = OK; +#endif + } else if (worker->status_ > OK) { + ok = sync(worker); + } + assert(!ok || (worker->status_ == OK)); + return ok; +} + +static void execute(AVxWorker *const worker) { + if (worker->hook != NULL) { + worker->had_error |= !worker->hook(worker->data1, worker->data2); + } +} + +static void 
launch(AVxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+  change_state(worker, WORK);
+#else
+  execute(worker);
+#endif
+}
+
+static void end(AVxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+  if (worker->impl_ != NULL) {
+    change_state(worker, NOT_OK);
+    pthread_join(worker->impl_->thread_, NULL);
+    pthread_mutex_destroy(&worker->impl_->mutex_);
+    pthread_cond_destroy(&worker->impl_->condition_);
+    aom_free(worker->impl_);
+    worker->impl_ = NULL;
+  }
+#else
+  worker->status_ = NOT_OK;
+  assert(worker->impl_ == NULL);
+#endif
+  assert(worker->status_ == NOT_OK);
+}
+
+//------------------------------------------------------------------------------
+
+static AVxWorkerInterface g_worker_interface = { init,   reset,   sync,
+                                                 launch, execute, end };
+
+int aom_set_worker_interface(const AVxWorkerInterface *const winterface) {
+  if (winterface == NULL || winterface->init == NULL ||
+      winterface->reset == NULL || winterface->sync == NULL ||
+      winterface->launch == NULL || winterface->execute == NULL ||
+      winterface->end == NULL) {
+    return 0;
+  }
+  g_worker_interface = *winterface;
+  return 1;
+}
+
+const AVxWorkerInterface *aom_get_worker_interface(void) {
+  return &g_worker_interface;
+}
+
+//------------------------------------------------------------------------------
diff --git a/libs/libaom/src/aom_util/aom_thread.h b/libs/libaom/src/aom_util/aom_thread.h
new file mode 100644
index 000000000..8d0431258
--- /dev/null
+++ b/libs/libaom/src/aom_util/aom_thread.h
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+//
+// Multi-threaded worker
+//
+// Original source:
+//  https://chromium.googlesource.com/webm/libwebp
+
+#ifndef AOM_AOM_UTIL_AOM_THREAD_H_
+#define AOM_AOM_UTIL_AOM_THREAD_H_
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_NUM_THREADS 64
+
+#if CONFIG_MULTITHREAD
+
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+#include <errno.h>    // NOLINT
+#include <process.h>  // NOLINT
+#include <windows.h>  // NOLINT
+typedef HANDLE pthread_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
+
+#if _WIN32_WINNT < 0x0600
+#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer.
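+// Editor's note: the Vista floor exists because this emulation relies on
+// CONDITION_VARIABLE and InitializeCriticalSectionEx(), both first available
+// in Windows Vista / Server 2008.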
+#endif
+typedef CONDITION_VARIABLE pthread_cond_t;
+
+#ifndef WINAPI_FAMILY_PARTITION
+#define WINAPI_PARTITION_DESKTOP 1
+#define WINAPI_FAMILY_PARTITION(x) x
+#endif
+
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define USE_CREATE_THREAD
+#endif
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+// _beginthreadex requires __stdcall
+#define THREADFN unsigned int __stdcall
+#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
+
+static INLINE int pthread_create(pthread_t *const thread, const void *attr,
+                                 unsigned int(__stdcall *start)(void *),
+                                 void *arg) {
+  (void)attr;
+#ifdef USE_CREATE_THREAD
+  *thread = CreateThread(NULL,          /* lpThreadAttributes */
+                         0,             /* dwStackSize */
+                         start, arg, 0, /* dwCreationFlags */
+                         NULL);         /* lpThreadId */
+#else
+  *thread = (pthread_t)_beginthreadex(NULL,          /* void *security */
+                                      0,             /* unsigned stack_size */
+                                      start, arg, 0, /* unsigned initflag */
+                                      NULL);         /* unsigned *thrdaddr */
+#endif
+  if (*thread == NULL) return 1;
+  SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
+  return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
+  (void)value_ptr;
+  return (WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/) !=
+              WAIT_OBJECT_0 ||
+          CloseHandle(thread) == 0);
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+                                     void *mutexattr) {
+  (void)mutexattr;
+  InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
+  return 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+  return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+  EnterCriticalSection(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+  LeaveCriticalSection(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+  DeleteCriticalSection(mutex);
+  return 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+  (void)condition;
+  return 0;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+                                    void *cond_attr) {
+  (void)cond_attr;
+  InitializeConditionVariable(condition);
+  return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+  WakeConditionVariable(condition);
+  return 0;
+}
+
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+  WakeAllConditionVariable(condition);
+  return 0;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+                                    pthread_mutex_t *const mutex) {
+  int ok;
+  ok = SleepConditionVariableCS(condition, mutex, INFINITE);
+  return !ok;
+}
+#elif defined(__OS2__)
+#define INCL_DOS
+#include <os2.h>  // NOLINT
+
+#include <errno.h>        // NOLINT
+#include <stdlib.h>       // NOLINT
+#include <sys/builtin.h>  // NOLINT
+
+#define pthread_t TID
+#define pthread_mutex_t HMTX
+
+typedef struct {
+  HEV event_sem_;
+  HEV ack_sem_;
+  volatile unsigned wait_count_;
+} pthread_cond_t;
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+#define THREADFN void *
+#define THREAD_RETURN(val) (val)
+
+typedef struct {
+  void *(*start_)(void *);
+  void *arg_;
+} thread_arg;
+
+static void thread_start(void *arg) {
+  thread_arg targ = *(thread_arg *)arg;
+  free(arg);
+
+  targ.start_(targ.arg_);
+}
+
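+// Editor's note: OS/2's _beginthread() expects a void-returning entry point,
+// so the thread_start() trampoline above unpacks (and frees) the heap-
+// allocated thread_arg before invoking the pthread-style start routine.
+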
*(*start)(void *), void *arg) {
+  int tid;
+  thread_arg *targ = (thread_arg *)malloc(sizeof(*targ));
+  if (targ == NULL) return 1;
+
+  (void)attr;
+
+  targ->start_ = start;
+  targ->arg_ = arg;
+  tid = (pthread_t)_beginthread(thread_start, NULL, 1024 * 1024, targ);
+  if (tid == -1) {
+    free(targ);
+    return 1;
+  }
+
+  *thread = tid;
+  return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
+  (void)value_ptr;
+  return DosWaitThread(&thread, DCWW_WAIT) != 0;
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+                                     void *mutexattr) {
+  (void)mutexattr;
+  return DosCreateMutexSem(NULL, mutex, 0, FALSE) != 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+  return DosRequestMutexSem(*mutex, SEM_IMMEDIATE_RETURN) == 0 ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+  return DosRequestMutexSem(*mutex, SEM_INDEFINITE_WAIT) != 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+  return DosReleaseMutexSem(*mutex) != 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+  return DosCloseMutexSem(*mutex) != 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+  int ok = 1;
+  ok &= DosCloseEventSem(condition->event_sem_) == 0;
+  ok &= DosCloseEventSem(condition->ack_sem_) == 0;
+  return !ok;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+                                    void *cond_attr) {
+  int ok = 1;
+  (void)cond_attr;
+
+  ok &=
+      DosCreateEventSem(NULL, &condition->event_sem_, DCE_POSTONE, FALSE) == 0;
+  ok &= DosCreateEventSem(NULL, &condition->ack_sem_, DCE_POSTONE, FALSE) == 0;
+  if (!ok) {
+    pthread_cond_destroy(condition);
+    return 1;
+  }
+  condition->wait_count_ = 0;
+  return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+  int ok = 1;
+
+  if (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) {
+    ok &= DosPostEventSem(condition->event_sem_) == 0;
+    ok &= DosWaitEventSem(condition->ack_sem_, SEM_INDEFINITE_WAIT) == 0;
+  }
+
+  return !ok;
+}
+
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+  int ok = 1;
+
+  while (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0))
+    ok &= pthread_cond_signal(condition) == 0;
+
+  return !ok;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+                                    pthread_mutex_t *const mutex) {
+  int ok = 1;
+
+  __atomic_increment(&condition->wait_count_);
+
+  ok &= pthread_mutex_unlock(mutex) == 0;
+
+  ok &= DosWaitEventSem(condition->event_sem_, SEM_INDEFINITE_WAIT) == 0;
+
+  __atomic_decrement(&condition->wait_count_);
+
+  ok &= DosPostEventSem(condition->ack_sem_) == 0;
+
+  pthread_mutex_lock(mutex);
+
+  return !ok;
+}
+#else                 // _WIN32
+#include <pthread.h>  // NOLINT
+#define THREADFN void *
+#define THREAD_RETURN(val) val
+#endif
+
+#endif  // CONFIG_MULTITHREAD
+
+// State of the worker thread object
+typedef enum {
+  NOT_OK = 0,  // object is unusable
+  OK,          // ready to work
+  WORK         // busy finishing the current task
+} AVxWorkerStatus;
+
+// Function to be called by the worker thread. Takes two opaque pointers as
+// arguments (data1 and data2). Should return true on success and return false
+// in case of error.
+typedef int (*AVxWorkerHook)(void *, void *);
+
+// Platform-dependent implementation details for the worker.
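
The emulation layers above reduce every platform to the same pthread-style primitives; the AVxWorker declarations that follow wrap them in a launch/sync job model. For orientation, a minimal caller would drive a worker roughly as sketched below (an illustrative sketch only; double_it and worker_example are hypothetical names):

    static int double_it(void *data1, void *data2) {  // hypothetical hook
      *(int *)data2 = 2 * *(int *)data1;
      return 1;  // non-zero means success; 0 would set worker.had_error
    }

    static void worker_example(void) {
      const AVxWorkerInterface *iface = aom_get_worker_interface();
      AVxWorker worker;
      int in = 21, out = 0;
      iface->init(&worker);
      worker.hook = double_it;
      worker.data1 = &in;
      worker.data2 = &out;
      if (iface->reset(&worker)) {  // spawns the thread; returns false on error
        iface->launch(&worker);     // double_it() now runs on the worker thread
        if (!iface->sync(&worker)) {
          // the hook returned 0; worker.had_error is set
        }
      }
      iface->end(&worker);  // joins the thread and frees worker.impl_
    }
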
+typedef struct AVxWorkerImpl AVxWorkerImpl; + +// Synchronization object used to launch job in the worker thread +typedef struct { + AVxWorkerImpl *impl_; + AVxWorkerStatus status_; + // Thread name for the debugger. If not NULL, must point to a string that + // outlives the worker thread. For portability, use a name <= 15 characters + // long (not including the terminating NUL character). + const char *thread_name; + AVxWorkerHook hook; // hook to call + void *data1; // first argument passed to 'hook' + void *data2; // second argument passed to 'hook' + int had_error; // true if a call to 'hook' returned false +} AVxWorker; + +// The interface for all thread-worker related functions. All these functions +// must be implemented. +typedef struct { + // Must be called first, before any other method. + void (*init)(AVxWorker *const worker); + // Must be called to initialize the object and spawn the thread. Re-entrant. + // Will potentially launch the thread. Returns false in case of error. + int (*reset)(AVxWorker *const worker); + // Makes sure the previous work is finished. Returns true if worker->had_error + // was not set and no error condition was triggered by the working thread. + int (*sync)(AVxWorker *const worker); + // Triggers the thread to call hook() with data1 and data2 arguments. These + // hook/data1/data2 values can be changed at any time before calling this + // function, but not be changed afterward until the next call to Sync(). + void (*launch)(AVxWorker *const worker); + // This function is similar to launch() except that it calls the + // hook directly instead of using a thread. Convenient to bypass the thread + // mechanism while still using the AVxWorker structs. sync() must + // still be called afterward (for error reporting). + void (*execute)(AVxWorker *const worker); + // Kill the thread and terminate the object. To use the object again, one + // must call reset() again. + void (*end)(AVxWorker *const worker); +} AVxWorkerInterface; + +// Install a new set of threading functions, overriding the defaults. This +// should be done before any workers are started, i.e., before any encoding or +// decoding takes place. The contents of the interface struct are copied, it +// is safe to free the corresponding memory after this call. This function is +// not thread-safe. Return false in case of invalid pointer or methods. +int aom_set_worker_interface(const AVxWorkerInterface *const winterface); + +// Retrieve the currently set thread worker interface. +const AVxWorkerInterface *aom_get_worker_interface(void); + +//------------------------------------------------------------------------------ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_UTIL_AOM_THREAD_H_ diff --git a/libs/libaom/src/aom_util/aom_util.cmake b/libs/libaom/src/aom_util/aom_util.cmake new file mode 100644 index 000000000..1a1bfe1e6 --- /dev/null +++ b/libs/libaom/src/aom_util/aom_util.cmake @@ -0,0 +1,31 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. 
+#
+if(AOM_AOM_UTIL_AOM_UTIL_CMAKE_)
+  return()
+endif() # AOM_AOM_UTIL_AOM_UTIL_CMAKE_
+set(AOM_AOM_UTIL_AOM_UTIL_CMAKE_ 1)
+
+list(APPEND AOM_UTIL_SOURCES "${AOM_ROOT}/aom_util/aom_thread.c"
+            "${AOM_ROOT}/aom_util/aom_thread.h"
+            "${AOM_ROOT}/aom_util/endian_inl.h"
+            "${AOM_ROOT}/aom_util/debug_util.c"
+            "${AOM_ROOT}/aom_util/debug_util.h")
+
+# Creates the aom_util build target and makes libaom depend on it. The libaom
+# target must exist before this function is called.
+function(setup_aom_util_targets)
+  add_library(aom_util OBJECT ${AOM_UTIL_SOURCES})
+  set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_util PARENT_SCOPE)
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_util>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_util>)
+  endif()
+endfunction()
diff --git a/libs/libaom/src/aom_util/debug_util.c b/libs/libaom/src/aom_util/debug_util.c
new file mode 100644
index 000000000..5762e693b
--- /dev/null
+++ b/libs/libaom/src/aom_util/debug_util.c
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include "aom_util/debug_util.h"
+
+static int frame_idx_w = 0;
+
+static int frame_idx_r = 0;
+
+void aom_bitstream_queue_set_frame_write(int frame_idx) {
+  frame_idx_w = frame_idx;
+}
+
+int aom_bitstream_queue_get_frame_writee(void) { return frame_idx_w; }
+
+void aom_bitstream_queue_set_frame_read(int frame_idx) {
+  frame_idx_r = frame_idx;
+}
+
+int aom_bitstream_queue_get_frame_read(void) { return frame_idx_r; }
+
+#if CONFIG_BITSTREAM_DEBUG
+#define QUEUE_MAX_SIZE 2000000
+static int result_queue[QUEUE_MAX_SIZE];
+static int nsymbs_queue[QUEUE_MAX_SIZE];
+static aom_cdf_prob cdf_queue[QUEUE_MAX_SIZE][16];
+
+static int queue_r = 0;
+static int queue_w = 0;
+static int queue_prev_w = -1;
+static int skip_r = 0;
+static int skip_w = 0;
+
+void bitstream_queue_set_skip_write(int skip) { skip_w = skip; }
+
+void bitstream_queue_set_skip_read(int skip) { skip_r = skip; }
+
+void bitstream_queue_record_write(void) { queue_prev_w = queue_w; }
+
+void bitstream_queue_reset_write(void) { queue_w = queue_prev_w; }
+
+int bitstream_queue_get_write(void) { return queue_w; }
+
+int bitstream_queue_get_read(void) { return queue_r; }
+
+void bitstream_queue_pop(int *result, aom_cdf_prob *cdf, int *nsymbs) {
+  if (!skip_r) {
+    if (queue_w == queue_r) {
+      printf("buffer underflow queue_w %d queue_r %d\n", queue_w, queue_r);
+      assert(0);
+    }
+    *result = result_queue[queue_r];
+    *nsymbs = nsymbs_queue[queue_r];
+    memcpy(cdf, cdf_queue[queue_r], *nsymbs * sizeof(*cdf));
+    queue_r = (queue_r + 1) % QUEUE_MAX_SIZE;
+  }
+}
+
+void bitstream_queue_push(int result, const aom_cdf_prob *cdf, int nsymbs) {
+  if (!skip_w) {
+    result_queue[queue_w] = result;
+    nsymbs_queue[queue_w] = nsymbs;
+    memcpy(cdf_queue[queue_w], cdf, nsymbs * sizeof(*cdf));
+    queue_w = (queue_w + 1) % QUEUE_MAX_SIZE;
+    if (queue_w == queue_r) {
+      printf("buffer overflow queue_w %d queue_r %d\n", queue_w, queue_r);
+      assert(0);
+    }
+  }
+}
+#endif  // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_MISMATCH_DEBUG
+static int frame_buf_idx_r = 0;
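
With CONFIG_BITSTREAM_DEBUG enabled, the queue above turns encoder/decoder divergence into an immediate assertion failure: the encoder pushes every coded symbol together with the CDF it used, and the in-process decoder pops each pair back and compares. A minimal sketch of that pairing (hypothetical locals symb, decoded, cdf, and nsymbs stand in for the bitwriter/bitreader state):

    // Encoder side, just before the symbol goes to the arithmetic coder:
    bitstream_queue_push(symb, cdf, nsymbs);

    // Decoder side, just after the arithmetic decoder returns a symbol:
    int ref_symb, ref_nsymbs;
    aom_cdf_prob ref_cdf[16];
    bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
    assert(ref_nsymbs == nsymbs);
    assert(memcmp(ref_cdf, cdf, nsymbs * sizeof(*cdf)) == 0);
    assert(ref_symb == decoded);  // first mismatch pinpoints the faulty symbol
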
+static int frame_buf_idx_w = 0; +static int max_frame_buf_num = 5; +#define MAX_FRAME_STRIDE 1280 +#define MAX_FRAME_HEIGHT 720 +static uint16_t + frame_pre[5][3][MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction only +static uint16_t + frame_tx[5][3][MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction + txfm +static int frame_stride = MAX_FRAME_STRIDE; +static int frame_height = MAX_FRAME_HEIGHT; +static int frame_size = MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT; +void mismatch_move_frame_idx_w() { + frame_buf_idx_w = (frame_buf_idx_w + 1) % max_frame_buf_num; + if (frame_buf_idx_w == frame_buf_idx_r) { + printf("frame_buf overflow\n"); + assert(0); + } +} + +void mismatch_reset_frame(int num_planes) { + for (int plane = 0; plane < num_planes; ++plane) { + memset(frame_pre[frame_buf_idx_w][plane], 0, + sizeof(frame_pre[frame_buf_idx_w][plane][0]) * frame_size); + memset(frame_tx[frame_buf_idx_w][plane], 0, + sizeof(frame_tx[frame_buf_idx_w][plane][0]) * frame_size); + } +} + +void mismatch_move_frame_idx_r() { + if (frame_buf_idx_w == frame_buf_idx_r) { + printf("frame_buf underflow\n"); + assert(0); + } + frame_buf_idx_r = (frame_buf_idx_r + 1) % max_frame_buf_num; +} + +void mismatch_record_block_pre(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd) { + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + for (int r = 0; r < blk_h; ++r) { + for (int c = 0; c < blk_w; ++c) { + frame_pre[frame_buf_idx_w][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] = + src16 ? src16[r * src_stride + c] : src[r * src_stride + c]; + } + } +#if 0 + int ref_frame_idx = 3; + int ref_frame_offset = 4; + int ref_plane = 1; + int ref_pixel_c = 162; + int ref_pixel_r = 16; + if (frame_idx_w == ref_frame_idx && plane == ref_plane && + frame_offset == ref_frame_offset && ref_pixel_c >= pixel_c && + ref_pixel_c < pixel_c + blk_w && ref_pixel_r >= pixel_r && + ref_pixel_r < pixel_r + blk_h) { + printf( + "\nrecord_block_pre frame_idx %d frame_offset %d plane %d pixel_c %d pixel_r %d blk_w " + "%d blk_h %d\n", + frame_idx_w, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h); + } +#endif +} +void mismatch_record_block_tx(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd) { + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + for (int r = 0; r < blk_h; ++r) { + for (int c = 0; c < blk_w; ++c) { + frame_tx[frame_buf_idx_w][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] = + src16 ? 
src16[r * src_stride + c] : src[r * src_stride + c]; + } + } +#if 0 + int ref_frame_idx = 3; + int ref_frame_offset = 4; + int ref_plane = 1; + int ref_pixel_c = 162; + int ref_pixel_r = 16; + if (frame_idx_w == ref_frame_idx && plane == ref_plane && frame_offset == ref_frame_offset && + ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w && + ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) { + printf( + "\nrecord_block_tx frame_idx %d frame_offset %d plane %d pixel_c %d pixel_r %d blk_w " + "%d blk_h %d\n", + frame_idx_w, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h); + } +#endif +} +void mismatch_check_block_pre(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd) { + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int mismatch = 0; + for (int r = 0; r < blk_h; ++r) { + for (int c = 0; c < blk_w; ++c) { + if (frame_pre[frame_buf_idx_r][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] != + (uint16_t)(src16 ? src16[r * src_stride + c] + : src[r * src_stride + c])) { + mismatch = 1; + } + } + } + if (mismatch) { + printf( + "\ncheck_block_pre failed frame_idx %d frame_offset %d plane %d " + "pixel_c %d pixel_r " + "%d blk_w %d blk_h %d\n", + frame_idx_r, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h); + printf("enc\n"); + for (int rr = 0; rr < blk_h; ++rr) { + for (int cc = 0; cc < blk_w; ++cc) { + printf("%d ", frame_pre[frame_buf_idx_r][plane] + [(rr + pixel_r) * frame_stride + cc + pixel_c]); + } + printf("\n"); + } + + printf("dec\n"); + for (int rr = 0; rr < blk_h; ++rr) { + for (int cc = 0; cc < blk_w; ++cc) { + printf("%d ", + src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]); + } + printf("\n"); + } + assert(0); + } +} +void mismatch_check_block_tx(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd) { + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int mismatch = 0; + for (int r = 0; r < blk_h; ++r) { + for (int c = 0; c < blk_w; ++c) { + if (frame_tx[frame_buf_idx_r][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] != + (uint16_t)(src16 ? src16[r * src_stride + c] + : src[r * src_stride + c])) { + mismatch = 1; + } + } + } + if (mismatch) { + printf( + "\ncheck_block_tx failed frame_idx %d frame_offset %d plane %d pixel_c " + "%d pixel_r " + "%d blk_w %d blk_h %d\n", + frame_idx_r, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h); + printf("enc\n"); + for (int rr = 0; rr < blk_h; ++rr) { + for (int cc = 0; cc < blk_w; ++cc) { + printf("%d ", frame_tx[frame_buf_idx_r][plane] + [(rr + pixel_r) * frame_stride + cc + pixel_c]); + } + printf("\n"); + } + + printf("dec\n"); + for (int rr = 0; rr < blk_h; ++rr) { + for (int cc = 0; cc < blk_w; ++cc) { + printf("%d ", + src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]); + } + printf("\n"); + } + assert(0); + } +} +#endif // CONFIG_MISMATCH_DEBUG diff --git a/libs/libaom/src/aom_util/debug_util.h b/libs/libaom/src/aom_util/debug_util.h new file mode 100644 index 000000000..23cad2a5b --- /dev/null +++ b/libs/libaom/src/aom_util/debug_util.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_UTIL_DEBUG_UTIL_H_ +#define AOM_AOM_UTIL_DEBUG_UTIL_H_ + +#include "config/aom_config.h" + +#include "aom_dsp/prob.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void aom_bitstream_queue_set_frame_write(int frame_idx); +int aom_bitstream_queue_get_frame_writee(void); +void aom_bitstream_queue_set_frame_read(int frame_idx); +int aom_bitstream_queue_get_frame_read(void); + +#if CONFIG_BITSTREAM_DEBUG +/* This is a debug tool used to detect bitstream error. On encoder side, it + * pushes each bit and probability into a queue before the bit is written into + * the Arithmetic coder. On decoder side, whenever a bit is read out from the + * Arithmetic coder, it pops out the reference bit and probability from the + * queue as well. If the two results do not match, this debug tool will report + * an error. This tool can be used to pin down the bitstream error precisely. + * By combining gdb's backtrace method, we can detect which module causes the + * bitstream error. */ +int bitstream_queue_get_write(void); +int bitstream_queue_get_read(void); +void bitstream_queue_record_write(void); +void bitstream_queue_reset_write(void); +void bitstream_queue_pop(int *result, aom_cdf_prob *cdf, int *nsymbs); +void bitstream_queue_push(int result, const aom_cdf_prob *cdf, int nsymbs); +void bitstream_queue_set_skip_write(int skip); +void bitstream_queue_set_skip_read(int skip); +#endif // CONFIG_BITSTREAM_DEBUG + +#if CONFIG_MISMATCH_DEBUG +void mismatch_move_frame_idx_w(); +void mismatch_move_frame_idx_r(); +void mismatch_reset_frame(int num_planes); +void mismatch_record_block_pre(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd); +void mismatch_record_block_tx(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd); +void mismatch_check_block_pre(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd); +void mismatch_check_block_tx(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd); +#endif // CONFIG_MISMATCH_DEBUG + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_UTIL_DEBUG_UTIL_H_ diff --git a/libs/libaom/src/aom_util/endian_inl.h b/libs/libaom/src/aom_util/endian_inl.h new file mode 100644 index 000000000..f536ec5b8 --- /dev/null +++ b/libs/libaom/src/aom_util/endian_inl.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+//
+// Endian related functions.
+
+#ifndef AOM_AOM_UTIL_ENDIAN_INL_H_
+#define AOM_AOM_UTIL_ENDIAN_INL_H_
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#if defined(__GNUC__)
+#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+#define LOCAL_GCC_PREREQ(maj, min) (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+#define LOCAL_GCC_VERSION 0
+#define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+// handle clang compatibility
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) &&                   \
+    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+#if defined(WORDS_BIGENDIAN)
+#define HToLE32 BSwap32
+#define HToLE16 BSwap16
+#define HToBE64(x) (x)
+#define HToBE32(x) (x)
+#else
+#define HToLE32(x) (x)
+#define HToLE16(x) (x)
+#define HToBE64(X) BSwap64(X)
+#define HToBE32(X) BSwap32(X)
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
+#define HAVE_BUILTIN_BSWAP16
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
+#define HAVE_BUILTIN_BSWAP32
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
+#define HAVE_BUILTIN_BSWAP64
+#endif
+
+#if HAVE_MIPS32 && defined(__mips__) && !defined(__mips64) && \
+    defined(__mips_isa_rev) && (__mips_isa_rev >= 2) && (__mips_isa_rev < 6)
+#define AOM_USE_MIPS32_R2
+#endif
+
+static INLINE uint16_t BSwap16(uint16_t x) {
+#if defined(HAVE_BUILTIN_BSWAP16)
+  return __builtin_bswap16(x);
+#elif defined(_MSC_VER)
+  return _byteswap_ushort(x);
+#else
+  // gcc will recognize a 'rorw $8, ...'
here: + return (x >> 8) | ((x & 0xff) << 8); +#endif // HAVE_BUILTIN_BSWAP16 +} + +static INLINE uint32_t BSwap32(uint32_t x) { +#if defined(AOM_USE_MIPS32_R2) + uint32_t ret; + __asm__ volatile( + "wsbh %[ret], %[x] \n\t" + "rotr %[ret], %[ret], 16 \n\t" + : [ret] "=r"(ret) + : [x] "r"(x)); + return ret; +#elif defined(HAVE_BUILTIN_BSWAP32) + return __builtin_bswap32(x); +#elif defined(__i386__) || defined(__x86_64__) + uint32_t swapped_bytes; + __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x)); + return swapped_bytes; +#elif defined(_MSC_VER) + return (uint32_t)_byteswap_ulong(x); +#else + return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24); +#endif // HAVE_BUILTIN_BSWAP32 +} + +static INLINE uint64_t BSwap64(uint64_t x) { +#if defined(HAVE_BUILTIN_BSWAP64) + return __builtin_bswap64(x); +#elif defined(__x86_64__) + uint64_t swapped_bytes; + __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x)); + return swapped_bytes; +#elif defined(_MSC_VER) + return (uint64_t)_byteswap_uint64(x); +#else // generic code for swapping 64-bit values (suggested by bdb@) + x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32); + x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16); + x = ((x & 0xff00ff00ff00ff00ull) >> 8) | ((x & 0x00ff00ff00ff00ffull) << 8); + return x; +#endif // HAVE_BUILTIN_BSWAP64 +} + +#endif // AOM_AOM_UTIL_ENDIAN_INL_H_ diff --git a/libs/libaom/src/apps/aomdec.c b/libs/libaom/src/apps/aomdec.c new file mode 100644 index 000000000..2591d41a6 --- /dev/null +++ b/libs/libaom/src/apps/aomdec.c @@ -0,0 +1,1024 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#if CONFIG_OS_SUPPORT
+#if HAVE_UNISTD_H
+#include <unistd.h>  // NOLINT
+#elif !defined(STDOUT_FILENO)
+#define STDOUT_FILENO 1
+#endif
+#endif
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem_ops.h"
+#include "common/args.h"
+#include "common/ivfdec.h"
+#include "common/md5_utils.h"
+#include "common/obudec.h"
+#include "common/tools_common.h"
+
+#if CONFIG_WEBM_IO
+#include "common/webmdec.h"
+#endif
+
+#include "common/rawenc.h"
+#include "common/y4menc.h"
+
+#if CONFIG_LIBYUV
+#include "third_party/libyuv/include/libyuv/scale.h"
+#endif
+
+static const char *exec_name;
+
+struct AvxDecInputContext {
+  struct AvxInputContext *aom_input_ctx;
+  struct ObuDecInputContext *obu_ctx;
+  struct WebmInputContext *webm_ctx;
+};
+
+static const arg_def_t help =
+    ARG_DEF(NULL, "help", 0, "Show usage options and exit");
+static const arg_def_t looparg =
+    ARG_DEF(NULL, "loops", 1, "Number of times to decode the file");
+static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use");
+static const arg_def_t use_yv12 =
+    ARG_DEF(NULL, "yv12", 0, "Output raw YV12 frames");
+static const arg_def_t use_i420 =
+    ARG_DEF(NULL, "i420", 0, "Output raw I420 frames");
+static const arg_def_t flipuvarg =
+    ARG_DEF(NULL, "flipuv", 0, "Flip the chroma planes in the output");
+static const arg_def_t rawvideo =
+    ARG_DEF(NULL, "rawvideo", 0, "Output raw YUV frames");
+static const arg_def_t noblitarg =
+    ARG_DEF(NULL, "noblit", 0, "Don't process the decoded frames");
+static const arg_def_t progressarg =
+    ARG_DEF(NULL, "progress", 0, "Show progress after each frame decodes");
+static const arg_def_t limitarg =
+    ARG_DEF(NULL, "limit", 1, "Stop decoding after n frames");
+static const arg_def_t skiparg =
+    ARG_DEF(NULL, "skip", 1, "Skip the first n input frames");
+static const arg_def_t summaryarg =
+    ARG_DEF(NULL, "summary", 0, "Show timing summary");
+static const arg_def_t outputfile =
+    ARG_DEF("o", "output", 1, "Output file name pattern (see below)");
+static const arg_def_t threadsarg =
+    ARG_DEF("t", "threads", 1, "Max threads to use");
+static const arg_def_t verbosearg =
+    ARG_DEF("v", "verbose", 0, "Show version string");
+static const arg_def_t scalearg =
+    ARG_DEF("S", "scale", 0, "Scale output frames uniformly");
+static const arg_def_t continuearg =
+    ARG_DEF("k", "keep-going", 0, "(debug) Continue decoding after error");
+static const arg_def_t fb_arg =
+    ARG_DEF(NULL, "frame-buffers", 1, "Number of frame buffers to use");
+static const arg_def_t md5arg =
+    ARG_DEF(NULL, "md5", 0, "Compute the MD5 sum of the decoded frame");
+static const arg_def_t framestatsarg =
+    ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)");
+static const arg_def_t outbitdeptharg =
+    ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames");
+static const arg_def_t isannexb =
+    ARG_DEF(NULL, "annexb", 0, "Bitstream is in Annex-B format");
+static const arg_def_t oppointarg = ARG_DEF(
+    NULL, "oppoint", 1, "Select an operating point of a scalable bitstream");
+static const arg_def_t outallarg = ARG_DEF(
+    NULL, "all-layers", 0, "Output all decoded frames of a scalable bitstream");
+static const arg_def_t skipfilmgrain =
+    ARG_DEF(NULL, "skip-film-grain", 0, "Skip film grain application");
+
+static const arg_def_t *all_args[] = {
+  &help,       &codecarg,   &use_yv12,    &use_i420,      &flipuvarg,
+  &rawvideo,   &noblitarg,
&progressarg, &limitarg,      &skiparg,
+  &summaryarg, &outputfile, &threadsarg,  &verbosearg,    &scalearg,
+  &fb_arg,     &md5arg,     &framestatsarg, &continuearg, &outbitdeptharg,
+  &isannexb,   &oppointarg, &outallarg,   &skipfilmgrain, NULL
+};
+
+#if CONFIG_LIBYUV
+static INLINE int libyuv_scale(aom_image_t *src, aom_image_t *dst,
+                               FilterModeEnum mode) {
+  if (src->fmt == AOM_IMG_FMT_I42016) {
+    assert(dst->fmt == AOM_IMG_FMT_I42016);
+    return I420Scale_16(
+        (uint16_t *)src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y] / 2,
+        (uint16_t *)src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U] / 2,
+        (uint16_t *)src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V] / 2,
+        src->d_w, src->d_h, (uint16_t *)dst->planes[AOM_PLANE_Y],
+        dst->stride[AOM_PLANE_Y] / 2, (uint16_t *)dst->planes[AOM_PLANE_U],
+        dst->stride[AOM_PLANE_U] / 2, (uint16_t *)dst->planes[AOM_PLANE_V],
+        dst->stride[AOM_PLANE_V] / 2, dst->d_w, dst->d_h, mode);
+  }
+  assert(src->fmt == AOM_IMG_FMT_I420);
+  assert(dst->fmt == AOM_IMG_FMT_I420);
+  return I420Scale(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y],
+                   src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U],
+                   src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V], src->d_w,
+                   src->d_h, dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y],
+                   dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U],
+                   dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V], dst->d_w,
+                   dst->d_h, mode);
+}
+#endif
+
+static void show_help(FILE *fout, int shorthelp) {
+  fprintf(fout, "Usage: %s <options> filename\n\n", exec_name);
+
+  if (shorthelp) {
+    fprintf(fout, "Use --help to see the full list of options.\n");
+    return;
+  }
+
+  fprintf(fout, "Options:\n");
+  arg_show_usage(fout, all_args);
+  fprintf(fout,
+          "\nOutput File Patterns:\n\n"
+          "  The -o argument specifies the name of the file(s) to "
+          "write to. If the\n  argument does not include any escape "
+          "characters, the output will be\n  written to a single file. "
+          "Otherwise, the filename will be calculated by\n  expanding "
+          "the following escape characters:\n");
+  fprintf(fout,
+          "\n\t%%w   - Frame width"
+          "\n\t%%h   - Frame height"
+          "\n\t%%<n> - Frame number, zero padded to <n> places (1..9)"
+          "\n\n  Pattern arguments are only supported in conjunction "
+          "with the --yv12 and\n  --i420 options.
If the -o option is " + "not specified, the output will be\n directed to stdout.\n"); + fprintf(fout, "\nIncluded decoders:\n\n"); + + for (int i = 0; i < get_aom_decoder_count(); ++i) { + const AvxInterface *const decoder = get_aom_decoder_by_index(i); + fprintf(fout, " %-6s - %s\n", decoder->name, + aom_codec_iface_name(decoder->codec_interface())); + } +} + +void usage_exit(void) { + show_help(stderr, 1); + exit(EXIT_FAILURE); +} + +static int raw_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size) { + char raw_hdr[RAW_FRAME_HDR_SZ]; + size_t frame_size = 0; + + if (fread(raw_hdr, RAW_FRAME_HDR_SZ, 1, infile) != 1) { + if (!feof(infile)) warn("Failed to read RAW frame size\n"); + } else { + const size_t kCorruptFrameThreshold = 256 * 1024 * 1024; + const size_t kFrameTooSmallThreshold = 256 * 1024; + frame_size = mem_get_le32(raw_hdr); + + if (frame_size > kCorruptFrameThreshold) { + warn("Read invalid frame size (%u)\n", (unsigned int)frame_size); + frame_size = 0; + } + + if (frame_size < kFrameTooSmallThreshold) { + warn("Warning: Read invalid frame size (%u) - not a raw file?\n", + (unsigned int)frame_size); + } + + if (frame_size > *buffer_size) { + uint8_t *new_buf = realloc(*buffer, 2 * frame_size); + if (new_buf) { + *buffer = new_buf; + *buffer_size = 2 * frame_size; + } else { + warn("Failed to allocate compressed data buffer\n"); + frame_size = 0; + } + } + } + + if (!feof(infile)) { + if (fread(*buffer, 1, frame_size, infile) != frame_size) { + warn("Failed to read full frame\n"); + return 1; + } + *bytes_read = frame_size; + } + + return 0; +} + +static int read_frame(struct AvxDecInputContext *input, uint8_t **buf, + size_t *bytes_in_buffer, size_t *buffer_size) { + switch (input->aom_input_ctx->file_type) { +#if CONFIG_WEBM_IO + case FILE_TYPE_WEBM: + return webm_read_frame(input->webm_ctx, buf, bytes_in_buffer, + buffer_size); +#endif + case FILE_TYPE_RAW: + return raw_read_frame(input->aom_input_ctx->file, buf, bytes_in_buffer, + buffer_size); + case FILE_TYPE_IVF: + return ivf_read_frame(input->aom_input_ctx->file, buf, bytes_in_buffer, + buffer_size, NULL); + case FILE_TYPE_OBU: + return obudec_read_temporal_unit(input->obu_ctx, buf, bytes_in_buffer, + buffer_size); + default: return 1; + } +} + +static int file_is_raw(struct AvxInputContext *input) { + uint8_t buf[32]; + int is_raw = 0; + aom_codec_stream_info_t si; + memset(&si, 0, sizeof(si)); + + if (fread(buf, 1, 32, input->file) == 32) { + int i; + + if (mem_get_le32(buf) < 256 * 1024 * 1024) { + for (i = 0; i < get_aom_decoder_count(); ++i) { + const AvxInterface *const decoder = get_aom_decoder_by_index(i); + if (!aom_codec_peek_stream_info(decoder->codec_interface(), buf + 4, + 32 - 4, &si)) { + is_raw = 1; + input->fourcc = decoder->fourcc; + input->width = si.w; + input->height = si.h; + input->framerate.numerator = 30; + input->framerate.denominator = 1; + break; + } + } + } + } + + rewind(input->file); + return is_raw; +} + +static void show_progress(int frame_in, int frame_out, uint64_t dx_time) { + fprintf(stderr, + "%d decoded frames/%d showed frames in %" PRId64 " us (%.2f fps)\r", + frame_in, frame_out, dx_time, + (double)frame_out * 1000000.0 / (double)dx_time); +} + +struct ExternalFrameBuffer { + uint8_t *data; + size_t size; + int in_use; +}; + +struct ExternalFrameBufferList { + int num_external_frame_buffers; + struct ExternalFrameBuffer *ext_fb; +}; + +// Callback used by libaom to request an external frame buffer. 
|cb_priv| +// Application private data passed into the set function. |min_size| is the +// minimum size in bytes needed to decode the next frame. |fb| pointer to the +// frame buffer. +static int get_av1_frame_buffer(void *cb_priv, size_t min_size, + aom_codec_frame_buffer_t *fb) { + int i; + struct ExternalFrameBufferList *const ext_fb_list = + (struct ExternalFrameBufferList *)cb_priv; + if (ext_fb_list == NULL) return -1; + + // Find a free frame buffer. + for (i = 0; i < ext_fb_list->num_external_frame_buffers; ++i) { + if (!ext_fb_list->ext_fb[i].in_use) break; + } + + if (i == ext_fb_list->num_external_frame_buffers) return -1; + + if (ext_fb_list->ext_fb[i].size < min_size) { + free(ext_fb_list->ext_fb[i].data); + ext_fb_list->ext_fb[i].data = (uint8_t *)calloc(min_size, sizeof(uint8_t)); + if (!ext_fb_list->ext_fb[i].data) return -1; + + ext_fb_list->ext_fb[i].size = min_size; + } + + fb->data = ext_fb_list->ext_fb[i].data; + fb->size = ext_fb_list->ext_fb[i].size; + ext_fb_list->ext_fb[i].in_use = 1; + + // Set the frame buffer's private data to point at the external frame buffer. + fb->priv = &ext_fb_list->ext_fb[i]; + return 0; +} + +// Callback used by libaom when there are no references to the frame buffer. +// |cb_priv| user private data passed into the set function. |fb| pointer +// to the frame buffer. +static int release_av1_frame_buffer(void *cb_priv, + aom_codec_frame_buffer_t *fb) { + struct ExternalFrameBuffer *const ext_fb = + (struct ExternalFrameBuffer *)fb->priv; + (void)cb_priv; + ext_fb->in_use = 0; + return 0; +} + +static void generate_filename(const char *pattern, char *out, size_t q_len, + unsigned int d_w, unsigned int d_h, + unsigned int frame_in) { + const char *p = pattern; + char *q = out; + + do { + char *next_pat = strchr(p, '%'); + + if (p == next_pat) { + size_t pat_len; + + /* parse the pattern */ + q[q_len - 1] = '\0'; + switch (p[1]) { + case 'w': snprintf(q, q_len - 1, "%d", d_w); break; + case 'h': snprintf(q, q_len - 1, "%d", d_h); break; + case '1': snprintf(q, q_len - 1, "%d", frame_in); break; + case '2': snprintf(q, q_len - 1, "%02d", frame_in); break; + case '3': snprintf(q, q_len - 1, "%03d", frame_in); break; + case '4': snprintf(q, q_len - 1, "%04d", frame_in); break; + case '5': snprintf(q, q_len - 1, "%05d", frame_in); break; + case '6': snprintf(q, q_len - 1, "%06d", frame_in); break; + case '7': snprintf(q, q_len - 1, "%07d", frame_in); break; + case '8': snprintf(q, q_len - 1, "%08d", frame_in); break; + case '9': snprintf(q, q_len - 1, "%09d", frame_in); break; + default: die("Unrecognized pattern %%%c\n", p[1]); break; + } + + pat_len = strlen(q); + if (pat_len >= q_len - 1) die("Output filename too long.\n"); + q += pat_len; + p += 2; + q_len -= pat_len; + } else { + size_t copy_len; + + /* copy the next segment */ + if (!next_pat) + copy_len = strlen(p); + else + copy_len = next_pat - p; + + if (copy_len >= q_len - 1) die("Output filename too long.\n"); + + memcpy(q, p, copy_len); + q[copy_len] = '\0'; + q += copy_len; + p += copy_len; + q_len -= copy_len; + } + } while (*p); +} + +static int is_single_file(const char *outfile_pattern) { + const char *p = outfile_pattern; + + do { + p = strchr(p, '%'); + if (p && p[1] >= '1' && p[1] <= '9') + return 0; // pattern contains sequence number, so it's not unique + if (p) p++; + } while (p); + + return 1; +} + +static void print_md5(unsigned char digest[16], const char *filename) { + int i; + + for (i = 0; i < 16; ++i) printf("%02x", digest[i]); + printf(" %s\n", filename); +} + 
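
get_av1_frame_buffer() and release_av1_frame_buffer() above implement the external frame-buffer callback protocol from aom/aom_frame_buffer.h. A condensed sketch of how they get wired up, mirroring what main_loop() below does when --frame-buffers=N is given (the count of 10 is an arbitrary example, and the list must outlive all decode calls):

    static struct ExternalFrameBufferList fb_list = { 0, NULL };

    static int use_external_buffers(aom_codec_ctx_t *decoder) {
      fb_list.num_external_frame_buffers = 10;
      fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc(
          10, sizeof(*fb_list.ext_fb));
      if (!fb_list.ext_fb) return -1;
      // From here on, the decoder requests frame memory via
      // get_av1_frame_buffer() and returns it via release_av1_frame_buffer().
      return aom_codec_set_frame_buffer_functions(
                 decoder, get_av1_frame_buffer, release_av1_frame_buffer,
                 &fb_list) != AOM_CODEC_OK;
    }
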
+static FILE *open_outfile(const char *name) { + if (strcmp("-", name) == 0) { + set_binary_mode(stdout); + return stdout; + } else { + FILE *file = fopen(name, "wb"); + if (!file) fatal("Failed to open output file '%s'", name); + return file; + } +} + +static int main_loop(int argc, const char **argv_) { + aom_codec_ctx_t decoder; + char *fn = NULL; + int i; + int ret = EXIT_FAILURE; + uint8_t *buf = NULL; + size_t bytes_in_buffer = 0, buffer_size = 0; + FILE *infile; + int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0; + int do_md5 = 0, progress = 0; + int stop_after = 0, summary = 0, quiet = 1; + int arg_skip = 0; + int keep_going = 0; + const AvxInterface *interface = NULL; + const AvxInterface *fourcc_interface = NULL; + uint64_t dx_time = 0; + struct arg arg; + char **argv, **argi, **argj; + + int single_file; + int use_y4m = 1; + int opt_yv12 = 0; + int opt_i420 = 0; + int opt_raw = 0; + aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING }; + unsigned int fixed_output_bit_depth = 0; + unsigned int is_annexb = 0; + int frames_corrupted = 0; + int dec_flags = 0; + int do_scale = 0; + int operating_point = 0; + int output_all_layers = 0; + int skip_film_grain = 0; + aom_image_t *scaled_img = NULL; + aom_image_t *img_shifted = NULL; + int frame_avail, got_data, flush_decoder = 0; + int num_external_frame_buffers = 0; + struct ExternalFrameBufferList ext_fb_list = { 0, NULL }; + + const char *outfile_pattern = NULL; + char outfile_name[PATH_MAX] = { 0 }; + FILE *outfile = NULL; + + FILE *framestats_file = NULL; + + MD5Context md5_ctx; + unsigned char md5_digest[16]; + + struct AvxDecInputContext input = { NULL, NULL, NULL }; + struct AvxInputContext aom_input_ctx; + memset(&aom_input_ctx, 0, sizeof(aom_input_ctx)); +#if CONFIG_WEBM_IO + struct WebmInputContext webm_ctx; + memset(&webm_ctx, 0, sizeof(webm_ctx)); + input.webm_ctx = &webm_ctx; +#endif + struct ObuDecInputContext obu_ctx = { NULL, NULL, 0, 0, 0 }; + int is_ivf = 0; + + obu_ctx.avx_ctx = &aom_input_ctx; + input.obu_ctx = &obu_ctx; + input.aom_input_ctx = &aom_input_ctx; + + /* Parse command line */ + exec_name = argv_[0]; + argv = argv_dup(argc - 1, argv_ + 1); + + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + memset(&arg, 0, sizeof(arg)); + arg.argv_step = 1; + + if (arg_match(&arg, &help, argi)) { + show_help(stdout, 0); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &codecarg, argi)) { + interface = get_aom_decoder_by_name(arg.val); + if (!interface) + die("Error: Unrecognized argument (%s) to --codec\n", arg.val); + } else if (arg_match(&arg, &looparg, argi)) { + // no-op + } else if (arg_match(&arg, &outputfile, argi)) { + outfile_pattern = arg.val; + } else if (arg_match(&arg, &use_yv12, argi)) { + use_y4m = 0; + flipuv = 1; + opt_yv12 = 1; + opt_i420 = 0; + opt_raw = 0; + } else if (arg_match(&arg, &use_i420, argi)) { + use_y4m = 0; + flipuv = 0; + opt_yv12 = 0; + opt_i420 = 1; + opt_raw = 0; + } else if (arg_match(&arg, &rawvideo, argi)) { + use_y4m = 0; + opt_yv12 = 0; + opt_i420 = 0; + opt_raw = 1; + } else if (arg_match(&arg, &flipuvarg, argi)) { + flipuv = 1; + } else if (arg_match(&arg, &noblitarg, argi)) { + noblit = 1; + } else if (arg_match(&arg, &progressarg, argi)) { + progress = 1; + } else if (arg_match(&arg, &limitarg, argi)) { + stop_after = arg_parse_uint(&arg); + } else if (arg_match(&arg, &skiparg, argi)) { + arg_skip = arg_parse_uint(&arg); + } else if (arg_match(&arg, &md5arg, argi)) { + do_md5 = 1; + } else if (arg_match(&arg, &framestatsarg, argi)) { 
+ framestats_file = fopen(arg.val, "w"); + if (!framestats_file) { + die("Error: Could not open --framestats file (%s) for writing.\n", + arg.val); + } + } else if (arg_match(&arg, &summaryarg, argi)) { + summary = 1; + } else if (arg_match(&arg, &threadsarg, argi)) { + cfg.threads = arg_parse_uint(&arg); +#if !CONFIG_MULTITHREAD + if (cfg.threads > 1) { + die("Error: --threads=%d is not supported when CONFIG_MULTITHREAD = " + "0.\n", + cfg.threads); + } +#endif + } else if (arg_match(&arg, &verbosearg, argi)) { + quiet = 0; + } else if (arg_match(&arg, &scalearg, argi)) { + do_scale = 1; + } else if (arg_match(&arg, &fb_arg, argi)) { + num_external_frame_buffers = arg_parse_uint(&arg); + } else if (arg_match(&arg, &continuearg, argi)) { + keep_going = 1; + } else if (arg_match(&arg, &outbitdeptharg, argi)) { + fixed_output_bit_depth = arg_parse_uint(&arg); + } else if (arg_match(&arg, &isannexb, argi)) { + is_annexb = 1; + input.obu_ctx->is_annexb = 1; + } else if (arg_match(&arg, &oppointarg, argi)) { + operating_point = arg_parse_int(&arg); + } else if (arg_match(&arg, &outallarg, argi)) { + output_all_layers = 1; + } else if (arg_match(&arg, &skipfilmgrain, argi)) { + skip_film_grain = 1; + } else { + argj++; + } + } + + /* Check for unrecognized options */ + for (argi = argv; *argi; argi++) + if (argi[0][0] == '-' && strlen(argi[0]) > 1) + die("Error: Unrecognized option %s\n", *argi); + + /* Handle non-option arguments */ + fn = argv[0]; + + if (!fn) { + free(argv); + fprintf(stderr, "No input file specified!\n"); + usage_exit(); + } + /* Open file */ + infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin); + + if (!infile) { + fatal("Failed to open input file '%s'", strcmp(fn, "-") ? fn : "stdin"); + } +#if CONFIG_OS_SUPPORT + /* Make sure we don't dump to the terminal, unless forced to with -o - */ + if (!outfile_pattern && isatty(STDOUT_FILENO) && !do_md5 && !noblit) { + fprintf(stderr, + "Not dumping raw video to your terminal. Use '-o -' to " + "override.\n"); + return EXIT_FAILURE; + } +#endif + input.aom_input_ctx->filename = fn; + input.aom_input_ctx->file = infile; + if (file_is_ivf(input.aom_input_ctx)) { + input.aom_input_ctx->file_type = FILE_TYPE_IVF; + is_ivf = 1; + } +#if CONFIG_WEBM_IO + else if (file_is_webm(input.webm_ctx, input.aom_input_ctx)) + input.aom_input_ctx->file_type = FILE_TYPE_WEBM; +#endif + else if (file_is_obu(&obu_ctx)) + input.aom_input_ctx->file_type = FILE_TYPE_OBU; + else if (file_is_raw(input.aom_input_ctx)) + input.aom_input_ctx->file_type = FILE_TYPE_RAW; + else { + fprintf(stderr, "Unrecognized input file type.\n"); +#if !CONFIG_WEBM_IO + fprintf(stderr, "aomdec was built without WebM container support.\n"); +#endif + free(argv); + return EXIT_FAILURE; + } + + outfile_pattern = outfile_pattern ? 
outfile_pattern : "-"; + single_file = is_single_file(outfile_pattern); + + if (!noblit && single_file) { + generate_filename(outfile_pattern, outfile_name, PATH_MAX, + aom_input_ctx.width, aom_input_ctx.height, 0); + if (do_md5) + MD5Init(&md5_ctx); + else + outfile = open_outfile(outfile_name); + } + + if (use_y4m && !noblit) { + if (!single_file) { + fprintf(stderr, + "YUV4MPEG2 not supported with output patterns," + " try --i420 or --yv12 or --rawvideo.\n"); + return EXIT_FAILURE; + } + +#if CONFIG_WEBM_IO + if (aom_input_ctx.file_type == FILE_TYPE_WEBM) { + if (webm_guess_framerate(input.webm_ctx, input.aom_input_ctx)) { + fprintf(stderr, + "Failed to guess framerate -- error parsing " + "webm file?\n"); + return EXIT_FAILURE; + } + } +#endif + } + + fourcc_interface = get_aom_decoder_by_fourcc(aom_input_ctx.fourcc); + + if (is_ivf && !fourcc_interface) + fatal("Unsupported fourcc: %x\n", aom_input_ctx.fourcc); + + if (interface && fourcc_interface && interface != fourcc_interface) + warn("Header indicates codec: %s\n", fourcc_interface->name); + else + interface = fourcc_interface; + + if (!interface) interface = get_aom_decoder_by_index(0); + + dec_flags = 0; + if (aom_codec_dec_init(&decoder, interface->codec_interface(), &cfg, + dec_flags)) { + fprintf(stderr, "Failed to initialize decoder: %s\n", + aom_codec_error(&decoder)); + goto fail2; + } + + if (!quiet) fprintf(stderr, "%s\n", decoder.name); + + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_IS_ANNEXB, is_annexb)) { + fprintf(stderr, "Failed to set is_annexb: %s\n", aom_codec_error(&decoder)); + goto fail; + } + + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_OPERATING_POINT, + operating_point)) { + fprintf(stderr, "Failed to set operating_point: %s\n", + aom_codec_error(&decoder)); + goto fail; + } + + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_OUTPUT_ALL_LAYERS, + output_all_layers)) { + fprintf(stderr, "Failed to set output_all_layers: %s\n", + aom_codec_error(&decoder)); + goto fail; + } + + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_SKIP_FILM_GRAIN, + skip_film_grain)) { + fprintf(stderr, "Failed to set skip_film_grain: %s\n", + aom_codec_error(&decoder)); + goto fail; + } + + if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip); + while (arg_skip) { + if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break; + arg_skip--; + } + + if (num_external_frame_buffers > 0) { + ext_fb_list.num_external_frame_buffers = num_external_frame_buffers; + ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc( + num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb)); + if (aom_codec_set_frame_buffer_functions(&decoder, get_av1_frame_buffer, + release_av1_frame_buffer, + &ext_fb_list)) { + fprintf(stderr, "Failed to configure external frame buffers: %s\n", + aom_codec_error(&decoder)); + goto fail; + } + } + + frame_avail = 1; + got_data = 0; + + if (framestats_file) fprintf(framestats_file, "bytes,qp\r\n"); + + /* Decode file */ + while (frame_avail || got_data) { + aom_codec_iter_t iter = NULL; + aom_image_t *img; + struct aom_usec_timer timer; + int corrupted = 0; + + frame_avail = 0; + if (!stop_after || frame_in < stop_after) { + if (!read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) { + frame_avail = 1; + frame_in++; + + aom_usec_timer_start(&timer); + + if (aom_codec_decode(&decoder, buf, bytes_in_buffer, NULL)) { + const char *detail = aom_codec_error_detail(&decoder); + warn("Failed to decode frame %d: %s", frame_in, + aom_codec_error(&decoder)); + + if 
(detail) warn("Additional information: %s", detail); + if (!keep_going) goto fail; + } + + if (framestats_file) { + int qp; + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AOMD_GET_LAST_QUANTIZER, + &qp)) { + warn("Failed AOMD_GET_LAST_QUANTIZER: %s", + aom_codec_error(&decoder)); + if (!keep_going) goto fail; + } + fprintf(framestats_file, "%d,%d\r\n", (int)bytes_in_buffer, qp); + } + + aom_usec_timer_mark(&timer); + dx_time += aom_usec_timer_elapsed(&timer); + } else { + flush_decoder = 1; + } + } else { + flush_decoder = 1; + } + + aom_usec_timer_start(&timer); + + if (flush_decoder) { + // Flush the decoder. + if (aom_codec_decode(&decoder, NULL, 0, NULL)) { + warn("Failed to flush decoder: %s", aom_codec_error(&decoder)); + } + } + + aom_usec_timer_mark(&timer); + dx_time += aom_usec_timer_elapsed(&timer); + + got_data = 0; + while ((img = aom_codec_get_frame(&decoder, &iter))) { + ++frame_out; + got_data = 1; + + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AOMD_GET_FRAME_CORRUPTED, + &corrupted)) { + warn("Failed AOM_GET_FRAME_CORRUPTED: %s", aom_codec_error(&decoder)); + if (!keep_going) goto fail; + } + frames_corrupted += corrupted; + + if (progress) show_progress(frame_in, frame_out, dx_time); + + if (!noblit) { + const int PLANES_YUV[] = { AOM_PLANE_Y, AOM_PLANE_U, AOM_PLANE_V }; + const int PLANES_YVU[] = { AOM_PLANE_Y, AOM_PLANE_V, AOM_PLANE_U }; + const int *planes = flipuv ? PLANES_YVU : PLANES_YUV; + + if (do_scale) { + if (frame_out == 1) { + // If the output frames are to be scaled to a fixed display size + // then use the width and height specified in the container. If + // either of these is set to 0, use the display size set in the + // first frame header. If that is unavailable, use the raw decoded + // size of the first decoded frame. + int render_width = aom_input_ctx.width; + int render_height = aom_input_ctx.height; + if (!render_width || !render_height) { + int render_size[2]; + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_GET_DISPLAY_SIZE, + render_size)) { + // As last resort use size of first frame as display size. + render_width = img->d_w; + render_height = img->d_h; + } else { + render_width = render_size[0]; + render_height = render_size[1]; + } + } + scaled_img = + aom_img_alloc(NULL, img->fmt, render_width, render_height, 16); + scaled_img->bit_depth = img->bit_depth; + scaled_img->monochrome = img->monochrome; + scaled_img->csp = img->csp; + } + + if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) { +#if CONFIG_LIBYUV + libyuv_scale(img, scaled_img, kFilterBox); + img = scaled_img; +#else + fprintf( + stderr, + "Failed to scale output frame: %s.\n" + "libyuv is required for scaling but is currently disabled.\n" + "Be sure to specify -DCONFIG_LIBYUV=1 when running cmake.\n", + aom_codec_error(&decoder)); + goto fail; +#endif + } + } + // Default to codec bit depth if output bit depth not set + unsigned int output_bit_depth; + if (!fixed_output_bit_depth && single_file) { + output_bit_depth = img->bit_depth; + } else { + output_bit_depth = fixed_output_bit_depth; + } + // Shift up or down if necessary + if (output_bit_depth != 0) + aom_shift_img(output_bit_depth, &img, &img_shifted); + + aom_input_ctx.width = img->d_w; + aom_input_ctx.height = img->d_h; + + int num_planes = (opt_raw && img->monochrome) ? 
1 : 3; + if (single_file) { + if (use_y4m) { + char y4m_buf[Y4M_BUFFER_SIZE] = { 0 }; + size_t len = 0; + if (frame_out == 1) { + // Y4M file header + len = y4m_write_file_header( + y4m_buf, sizeof(y4m_buf), aom_input_ctx.width, + aom_input_ctx.height, &aom_input_ctx.framerate, + img->monochrome, img->csp, img->fmt, img->bit_depth); + if (img->csp == AOM_CSP_COLOCATED) { + fprintf(stderr, + "Warning: Y4M lacks a colorspace for colocated " + "chroma. Using a placeholder.\n"); + } + if (do_md5) { + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); + } else { + fputs(y4m_buf, outfile); + } + } + + // Y4M frame header + len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf)); + if (do_md5) { + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); + y4m_update_image_md5(img, planes, &md5_ctx); + } else { + fputs(y4m_buf, outfile); + y4m_write_image_file(img, planes, outfile); + } + } else { + if (frame_out == 1) { + // Check if --yv12 or --i420 options are consistent with the + // bit-stream decoded + if (opt_i420) { + if (img->fmt != AOM_IMG_FMT_I420 && + img->fmt != AOM_IMG_FMT_I42016) { + fprintf(stderr, + "Cannot produce i420 output for bit-stream.\n"); + goto fail; + } + } + if (opt_yv12) { + if ((img->fmt != AOM_IMG_FMT_I420 && + img->fmt != AOM_IMG_FMT_YV12) || + img->bit_depth != 8) { + fprintf(stderr, + "Cannot produce yv12 output for bit-stream.\n"); + goto fail; + } + } + } + if (do_md5) { + raw_update_image_md5(img, planes, num_planes, &md5_ctx); + } else { + raw_write_image_file(img, planes, num_planes, outfile); + } + } + } else { + generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w, + img->d_h, frame_in); + if (do_md5) { + MD5Init(&md5_ctx); + if (use_y4m) { + y4m_update_image_md5(img, planes, &md5_ctx); + } else { + raw_update_image_md5(img, planes, num_planes, &md5_ctx); + } + MD5Final(md5_digest, &md5_ctx); + print_md5(md5_digest, outfile_name); + } else { + outfile = open_outfile(outfile_name); + if (use_y4m) { + y4m_write_image_file(img, planes, outfile); + } else { + raw_write_image_file(img, planes, num_planes, outfile); + } + fclose(outfile); + } + } + } + } + } + + if (summary || progress) { + show_progress(frame_in, frame_out, dx_time); + fprintf(stderr, "\n"); + } + + if (frames_corrupted) { + fprintf(stderr, "WARNING: %d frames corrupted.\n", frames_corrupted); + } else { + ret = EXIT_SUCCESS; + } + +fail: + + if (aom_codec_destroy(&decoder)) { + fprintf(stderr, "Failed to destroy decoder: %s\n", + aom_codec_error(&decoder)); + } + +fail2: + + if (!noblit && single_file) { + if (do_md5) { + MD5Final(md5_digest, &md5_ctx); + print_md5(md5_digest, outfile_name); + } else { + fclose(outfile); + } + } + +#if CONFIG_WEBM_IO + if (input.aom_input_ctx->file_type == FILE_TYPE_WEBM) + webm_free(input.webm_ctx); +#endif + if (input.aom_input_ctx->file_type == FILE_TYPE_OBU) + obudec_free(input.obu_ctx); + + if (input.aom_input_ctx->file_type != FILE_TYPE_WEBM) free(buf); + + if (scaled_img) aom_img_free(scaled_img); + if (img_shifted) aom_img_free(img_shifted); + + for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) { + free(ext_fb_list.ext_fb[i].data); + } + free(ext_fb_list.ext_fb); + + fclose(infile); + if (framestats_file) fclose(framestats_file); + + free(argv); + + return ret; +} + +int main(int argc, const char **argv_) { + unsigned int loops = 1, i; + char **argv, **argi, **argj; + struct arg arg; + int error = 0; + + argv = argv_dup(argc - 1, argv_ + 1); + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + 
memset(&arg, 0, sizeof(arg));
+    arg.argv_step = 1;
+
+    if (arg_match(&arg, &looparg, argi)) {
+      loops = arg_parse_uint(&arg);
+      break;
+    }
+  }
+  free(argv);
+  for (i = 0; !error && i < loops; i++) error = main_loop(argc, argv_);
+  return error;
+}
diff --git a/libs/libaom/src/apps/aomenc.c b/libs/libaom/src/apps/aomenc.c
new file mode 100644
index 000000000..bb57726b4
--- /dev/null
+++ b/libs/libaom/src/apps/aomenc.c
@@ -0,0 +1,2752 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "apps/aomenc.h"
+
+#include "config/aom_config.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if CONFIG_AV1_DECODER
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#endif
+
+#include "aom/aom_encoder.h"
+#include "aom/aom_integer.h"
+#include "aom/aomcx.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem_ops.h"
+#include "common/args.h"
+#include "common/ivfenc.h"
+#include "common/tools_common.h"
+#include "common/warnings.h"
+
+#if CONFIG_WEBM_IO
+#include "common/webmenc.h"
+#endif
+
+#include "common/y4minput.h"
+#include "examples/encoder_util.h"
+#include "stats/aomstats.h"
+#include "stats/rate_hist.h"
+
+#if CONFIG_LIBYUV
+#include "third_party/libyuv/include/libyuv/scale.h"
+#endif
+
+/* Swallow warnings about unused results of fread/fwrite */
+static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
+  return fread(ptr, size, nmemb, stream);
+}
+#define fread wrap_fread
+
+static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb,
+                          FILE *stream) {
+  return fwrite(ptr, size, nmemb, stream);
+}
+#define fwrite wrap_fwrite
+
+static const char *exec_name;
+
+static void warn_or_exit_on_errorv(aom_codec_ctx_t *ctx, int fatal,
+                                   const char *s, va_list ap) {
+  if (ctx->err) {
+    const char *detail = aom_codec_error_detail(ctx);
+
+    vfprintf(stderr, s, ap);
+    fprintf(stderr, ": %s\n", aom_codec_error(ctx));
+
+    if (detail) fprintf(stderr, "    %s\n", detail);
+
+    if (fatal) exit(EXIT_FAILURE);
+  }
+}
+
+static void ctx_exit_on_error(aom_codec_ctx_t *ctx, const char *s, ...) {
+  va_list ap;
+
+  va_start(ap, s);
+  warn_or_exit_on_errorv(ctx, 1, s, ap);
+  va_end(ap);
+}
+
+static void warn_or_exit_on_error(aom_codec_ctx_t *ctx, int fatal,
+                                  const char *s, ...)
{ + va_list ap; + + va_start(ap, s); + warn_or_exit_on_errorv(ctx, fatal, s, ap); + va_end(ap); +} + +static int read_frame(struct AvxInputContext *input_ctx, aom_image_t *img) { + FILE *f = input_ctx->file; + y4m_input *y4m = &input_ctx->y4m; + int shortread = 0; + + if (input_ctx->file_type == FILE_TYPE_Y4M) { + if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0; + } else { + shortread = read_yuv_frame(input_ctx, img); + } + + return !shortread; +} + +static int file_is_y4m(const char detect[4]) { + if (memcmp(detect, "YUV4", 4) == 0) { + return 1; + } + return 0; +} + +static int fourcc_is_ivf(const char detect[4]) { + if (memcmp(detect, "DKIF", 4) == 0) { + return 1; + } + return 0; +} + +static const arg_def_t help = + ARG_DEF(NULL, "help", 0, "Show usage options and exit"); +static const arg_def_t debugmode = + ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)"); +static const arg_def_t outputfile = + ARG_DEF("o", "output", 1, "Output filename"); +static const arg_def_t use_yv12 = + ARG_DEF(NULL, "yv12", 0, "Input file is YV12 "); +static const arg_def_t use_i420 = + ARG_DEF(NULL, "i420", 0, "Input file is I420 (default)"); +static const arg_def_t use_i422 = + ARG_DEF(NULL, "i422", 0, "Input file is I422"); +static const arg_def_t use_i444 = + ARG_DEF(NULL, "i444", 0, "Input file is I444"); +static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"); +static const arg_def_t passes = + ARG_DEF("p", "passes", 1, "Number of passes (1/2)"); +static const arg_def_t pass_arg = + ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)"); +static const arg_def_t fpf_name = + ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"); +static const arg_def_t limit = + ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames"); +static const arg_def_t skip = + ARG_DEF(NULL, "skip", 1, "Skip the first n input frames"); +static const arg_def_t good_dl = + ARG_DEF(NULL, "good", 0, "Use Good Quality Deadline"); +static const arg_def_t rt_dl = + ARG_DEF(NULL, "rt", 0, "Use Realtime Quality Deadline"); +static const arg_def_t quietarg = + ARG_DEF("q", "quiet", 0, "Do not print encode progress"); +static const arg_def_t verbosearg = + ARG_DEF("v", "verbose", 0, "Show encoder parameters"); +static const arg_def_t psnrarg = + ARG_DEF(NULL, "psnr", 0, "Show PSNR in status line"); +static const arg_def_t use_cfg = ARG_DEF("c", "cfg", 1, "Config file to use"); + +static const struct arg_enum_list test_decode_enum[] = { + { "off", TEST_DECODE_OFF }, + { "fatal", TEST_DECODE_FATAL }, + { "warn", TEST_DECODE_WARN }, + { NULL, 0 } +}; +static const arg_def_t recontest = ARG_DEF_ENUM( + NULL, "test-decode", 1, "Test encode/decode mismatch", test_decode_enum); +static const arg_def_t framerate = + ARG_DEF(NULL, "fps", 1, "Stream frame rate (rate/scale)"); +static const arg_def_t use_webm = + ARG_DEF(NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)"); +static const arg_def_t use_ivf = ARG_DEF(NULL, "ivf", 0, "Output IVF"); +static const arg_def_t use_obu = ARG_DEF(NULL, "obu", 0, "Output OBU"); +static const arg_def_t q_hist_n = + ARG_DEF(NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)"); +static const arg_def_t rate_hist_n = + ARG_DEF(NULL, "rate-hist", 1, "Show rate histogram (n-buckets)"); +static const arg_def_t disable_warnings = + ARG_DEF(NULL, "disable-warnings", 0, + "Disable warnings about potentially incorrect encode settings."); +static const arg_def_t disable_warning_prompt = + ARG_DEF("y", "disable-warning-prompt", 0, + "Display warnings, but 
do not prompt user to continue."); +static const struct arg_enum_list bitdepth_enum[] = { + { "8", AOM_BITS_8 }, { "10", AOM_BITS_10 }, { "12", AOM_BITS_12 }, { NULL, 0 } +}; + +static const arg_def_t bitdeptharg = ARG_DEF_ENUM( + "b", "bit-depth", 1, + "Bit depth for codec (8 for version <=1, 10 or 12 for version 2)", + bitdepth_enum); +static const arg_def_t inbitdeptharg = + ARG_DEF(NULL, "input-bit-depth", 1, "Bit depth of input"); + +static const arg_def_t input_chroma_subsampling_x = ARG_DEF( + NULL, "input-chroma-subsampling-x", 1, "chroma subsampling x value."); +static const arg_def_t input_chroma_subsampling_y = ARG_DEF( + NULL, "input-chroma-subsampling-y", 1, "chroma subsampling y value."); + +static const arg_def_t *main_args[] = { &help, + &use_cfg, + &debugmode, + &outputfile, + &codecarg, + &passes, + &pass_arg, + &fpf_name, + &limit, + &skip, + &good_dl, + &rt_dl, + &quietarg, + &verbosearg, + &psnrarg, + &use_webm, + &use_ivf, + &use_obu, + &q_hist_n, + &rate_hist_n, + &disable_warnings, + &disable_warning_prompt, + &recontest, + NULL }; + +static const arg_def_t usage = + ARG_DEF("u", "usage", 1, "Usage profile number to use"); +static const arg_def_t threads = + ARG_DEF("t", "threads", 1, "Max number of threads to use"); +static const arg_def_t profile = + ARG_DEF(NULL, "profile", 1, "Bitstream profile number to use"); +static const arg_def_t width = ARG_DEF("w", "width", 1, "Frame width"); +static const arg_def_t height = ARG_DEF("h", "height", 1, "Frame height"); +static const arg_def_t forced_max_frame_width = ARG_DEF( + NULL, "forced_max_frame_width", 1, "Maximum frame width value to force"); +static const arg_def_t forced_max_frame_height = ARG_DEF( + NULL, "forced_max_frame_height", 1, "Maximum frame height value to force"); +#if CONFIG_WEBM_IO +static const struct arg_enum_list stereo_mode_enum[] = { + { "mono", STEREO_FORMAT_MONO }, + { "left-right", STEREO_FORMAT_LEFT_RIGHT }, + { "bottom-top", STEREO_FORMAT_BOTTOM_TOP }, + { "top-bottom", STEREO_FORMAT_TOP_BOTTOM }, + { "right-left", STEREO_FORMAT_RIGHT_LEFT }, + { NULL, 0 } +}; +static const arg_def_t stereo_mode = ARG_DEF_ENUM( + NULL, "stereo-mode", 1, "Stereo 3D video format", stereo_mode_enum); +#endif +static const arg_def_t timebase = ARG_DEF( + NULL, "timebase", 1, "Output timestamp precision (fractional seconds)"); +static const arg_def_t global_error_resilient = + ARG_DEF(NULL, "global-error-resilient", 1, + "Enable global error resiliency features"); +static const arg_def_t lag_in_frames = + ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag"); +static const arg_def_t large_scale_tile = ARG_DEF( + NULL, "large-scale-tile", 1, + "Large scale tile coding (0: off (default), 1: on (ivf output only))"); +static const arg_def_t monochrome = + ARG_DEF(NULL, "monochrome", 0, "Monochrome video (no chroma planes)"); +static const arg_def_t full_still_picture_hdr = ARG_DEF( + NULL, "full-still-picture-hdr", 0, "Use full header for still picture"); + +static const arg_def_t *global_args[] = { &use_yv12, + &use_i420, + &use_i422, + &use_i444, + &usage, + &threads, + &profile, + &width, + &height, + &forced_max_frame_width, + &forced_max_frame_height, +#if CONFIG_WEBM_IO + &stereo_mode, +#endif + &timebase, + &framerate, + &global_error_resilient, + &bitdeptharg, + &lag_in_frames, + &large_scale_tile, + &monochrome, + &full_still_picture_hdr, + NULL }; + +static const arg_def_t dropframe_thresh = + ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"); +static const arg_def_t 
resize_mode = + ARG_DEF(NULL, "resize-mode", 1, "Frame resize mode"); +static const arg_def_t resize_denominator = + ARG_DEF(NULL, "resize-denominator", 1, "Frame resize denominator"); +static const arg_def_t resize_kf_denominator = ARG_DEF( + NULL, "resize-kf-denominator", 1, "Frame resize keyframe denominator"); +static const arg_def_t superres_mode = + ARG_DEF(NULL, "superres-mode", 1, "Frame super-resolution mode"); +static const arg_def_t superres_denominator = ARG_DEF( + NULL, "superres-denominator", 1, "Frame super-resolution denominator"); +static const arg_def_t superres_kf_denominator = + ARG_DEF(NULL, "superres-kf-denominator", 1, + "Frame super-resolution keyframe denominator"); +static const arg_def_t superres_qthresh = ARG_DEF( + NULL, "superres-qthresh", 1, "Frame super-resolution qindex threshold"); +static const arg_def_t superres_kf_qthresh = + ARG_DEF(NULL, "superres-kf-qthresh", 1, + "Frame super-resolution keyframe qindex threshold"); +static const struct arg_enum_list end_usage_enum[] = { { "vbr", AOM_VBR }, + { "cbr", AOM_CBR }, + { "cq", AOM_CQ }, + { "q", AOM_Q }, + { NULL, 0 } }; +static const arg_def_t end_usage = + ARG_DEF_ENUM(NULL, "end-usage", 1, "Rate control mode", end_usage_enum); +static const arg_def_t target_bitrate = + ARG_DEF(NULL, "target-bitrate", 1, "Bitrate (kbps)"); +static const arg_def_t min_quantizer = + ARG_DEF(NULL, "min-q", 1, "Minimum (best) quantizer"); +static const arg_def_t max_quantizer = + ARG_DEF(NULL, "max-q", 1, "Maximum (worst) quantizer"); +static const arg_def_t undershoot_pct = + ARG_DEF(NULL, "undershoot-pct", 1, "Datarate undershoot (min) target (%)"); +static const arg_def_t overshoot_pct = + ARG_DEF(NULL, "overshoot-pct", 1, "Datarate overshoot (max) target (%)"); +static const arg_def_t buf_sz = + ARG_DEF(NULL, "buf-sz", 1, "Client buffer size (ms)"); +static const arg_def_t buf_initial_sz = + ARG_DEF(NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)"); +static const arg_def_t buf_optimal_sz = + ARG_DEF(NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)"); +static const arg_def_t *rc_args[] = { &dropframe_thresh, + &resize_mode, + &resize_denominator, + &resize_kf_denominator, + &superres_mode, + &superres_denominator, + &superres_kf_denominator, + &superres_qthresh, + &superres_kf_qthresh, + &end_usage, + &target_bitrate, + &min_quantizer, + &max_quantizer, + &undershoot_pct, + &overshoot_pct, + &buf_sz, + &buf_initial_sz, + &buf_optimal_sz, + NULL }; + +static const arg_def_t bias_pct = + ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)"); +static const arg_def_t minsection_pct = + ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)"); +static const arg_def_t maxsection_pct = + ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"); +static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct, + &maxsection_pct, NULL }; +static const arg_def_t fwd_kf_enabled = + ARG_DEF(NULL, "enable-fwd-kf", 1, "Enable forward reference keyframes"); +static const arg_def_t kf_min_dist = + ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"); +static const arg_def_t kf_max_dist = + ARG_DEF(NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)"); +static const arg_def_t kf_disabled = + ARG_DEF(NULL, "disable-kf", 0, "Disable keyframe placement"); +static const arg_def_t *kf_args[] = { &fwd_kf_enabled, &kf_min_dist, + &kf_max_dist, &kf_disabled, NULL }; +static const arg_def_t sframe_dist = + ARG_DEF(NULL, "sframe-dist", 1, "S-Frame interval 
(frames)"); +static const arg_def_t sframe_mode = + ARG_DEF(NULL, "sframe-mode", 1, "S-Frame insertion mode (1..2)"); +static const arg_def_t save_as_annexb = + ARG_DEF(NULL, "annexb", 1, "Save as Annex-B"); +static const arg_def_t noise_sens = + ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)"); +static const arg_def_t sharpness = + ARG_DEF(NULL, "sharpness", 1, "Loop filter sharpness (0..7)"); +static const arg_def_t static_thresh = + ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold"); +static const arg_def_t auto_altref = + ARG_DEF(NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames"); +static const arg_def_t arnr_maxframes = + ARG_DEF(NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)"); +static const arg_def_t arnr_strength = + ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)"); +static const struct arg_enum_list tuning_enum[] = { + { "psnr", AOM_TUNE_PSNR }, + { "ssim", AOM_TUNE_SSIM }, + { "vmaf_with_preprocessing", AOM_TUNE_VMAF_WITH_PREPROCESSING }, + { "vmaf_without_preprocessing", AOM_TUNE_VMAF_WITHOUT_PREPROCESSING }, + { "vmaf", AOM_TUNE_VMAF_MAX_GAIN }, + { NULL, 0 } +}; +static const arg_def_t tune_metric = + ARG_DEF_ENUM(NULL, "tune", 1, "Distortion metric tuned with", tuning_enum); +static const arg_def_t cq_level = + ARG_DEF(NULL, "cq-level", 1, "Constant/Constrained Quality level"); +static const arg_def_t max_intra_rate_pct = + ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"); + +#if CONFIG_AV1_ENCODER +static const arg_def_t cpu_used_av1 = + ARG_DEF(NULL, "cpu-used", 1, + "Speed setting (0..6 in good mode, 6..8 in realtime mode)"); +static const arg_def_t rowmtarg = + ARG_DEF(NULL, "row-mt", 1, + "Enable row based multi-threading (0: off, 1: on (default))"); +static const arg_def_t tile_cols = + ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2"); +static const arg_def_t tile_rows = + ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2"); +static const arg_def_t enable_tpl_model = + ARG_DEF(NULL, "enable-tpl-model", 1, + "RDO based on frame temporal dependency " + "(0: off, 1: backward source based). 
" + "This is required for deltaq mode."); +static const arg_def_t enable_keyframe_filtering = + ARG_DEF(NULL, "enable-keyframe-filtering", 1, + "Apply temporal filtering on key frame " + "(0: false, 1: true (default)"); +static const arg_def_t tile_width = + ARG_DEF(NULL, "tile-width", 1, "Tile widths (comma separated)"); +static const arg_def_t tile_height = + ARG_DEF(NULL, "tile-height", 1, "Tile heights (command separated)"); +static const arg_def_t lossless = + ARG_DEF(NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)"); +static const arg_def_t enable_cdef = + ARG_DEF(NULL, "enable-cdef", 1, + "Enable the constrained directional enhancement filter (0: false, " + "1: true (default))"); +static const arg_def_t enable_restoration = ARG_DEF( + NULL, "enable-restoration", 1, + "Enable the loop restoration filter (0: false (default in Realtime mode), " + "1: true (default in Non-realtime mode))"); +static const arg_def_t enable_rect_partitions = + ARG_DEF(NULL, "enable-rect-partitions", 1, + "Enable rectangular partitions " + "(0: false, 1: true (default))"); +static const arg_def_t enable_ab_partitions = + ARG_DEF(NULL, "enable-ab-partitions", 1, + "Enable ab partitions (0: false, 1: true (default))"); +static const arg_def_t enable_1to4_partitions = + ARG_DEF(NULL, "enable-1to4-partitions", 1, + "Enable 1:4 and 4:1 partitions " + "(0: false, 1: true (default))"); +static const arg_def_t min_partition_size = + ARG_DEF(NULL, "min-partition-size", 4, + "Set min partition size " + "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128). " + "On frame with 4k+ resolutions or higher speed settings, the min " + "partition size will have a minimum of 8."); +static const arg_def_t max_partition_size = + ARG_DEF(NULL, "max-partition-size", 128, + "Set max partition size " + "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128)"); +static const arg_def_t enable_dual_filter = + ARG_DEF(NULL, "enable-dual-filter", 1, + "Enable dual filter " + "(0: false, 1: true (default))"); +static const arg_def_t enable_chroma_deltaq = + ARG_DEF(NULL, "enable-chroma-deltaq", 1, + "Enable chroma delta quant " + "(0: false (default), 1: true)"); +static const arg_def_t enable_intra_edge_filter = + ARG_DEF(NULL, "enable-intra-edge-filter", 1, + "Enable intra edge filtering " + "(0: false, 1: true (default))"); +static const arg_def_t enable_order_hint = + ARG_DEF(NULL, "enable-order-hint", 1, + "Enable order hint " + "(0: false, 1: true (default))"); +static const arg_def_t enable_tx64 = + ARG_DEF(NULL, "enable-tx64", 1, + "Enable 64-pt transform (0: false, 1: true (default))"); +static const arg_def_t enable_flip_idtx = + ARG_DEF(NULL, "enable-flip-idtx", 1, + "Enable extended transform type (0: false, 1: true (default)) " + "including FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, " + "ADST_FLIPADST, FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, " + "H_ADST, V_FLIPADST, H_FLIPADST"); +static const arg_def_t enable_dist_wtd_comp = + ARG_DEF(NULL, "enable-dist-wtd-comp", 1, + "Enable distance-weighted compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_masked_comp = + ARG_DEF(NULL, "enable-masked-comp", 1, + "Enable masked (wedge/diff-wtd) compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_onesided_comp = + ARG_DEF(NULL, "enable-onesided-comp", 1, + "Enable one sided compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_interintra_comp = + ARG_DEF(NULL, "enable-interintra-comp", 1, + "Enable interintra compound " + "(0: 
false, 1: true (default))"); +static const arg_def_t enable_smooth_interintra = + ARG_DEF(NULL, "enable-smooth-interintra", 1, + "Enable smooth interintra mode " + "(0: false, 1: true (default))"); +static const arg_def_t enable_diff_wtd_comp = + ARG_DEF(NULL, "enable-diff-wtd-comp", 1, + "Enable difference-weighted compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_interinter_wedge = + ARG_DEF(NULL, "enable-interinter-wedge", 1, + "Enable interinter wedge compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_interintra_wedge = + ARG_DEF(NULL, "enable-interintra-wedge", 1, + "Enable interintra wedge compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_global_motion = + ARG_DEF(NULL, "enable-global-motion", 1, + "Enable global motion " + "(0: false, 1: true (default))"); +static const arg_def_t enable_warped_motion = + ARG_DEF(NULL, "enable-warped-motion", 1, + "Enable local warped motion " + "(0: false, 1: true (default))"); +static const arg_def_t enable_filter_intra = + ARG_DEF(NULL, "enable-filter-intra", 1, + "Enable filter intra prediction mode " + "(0: false, 1: true (default))"); +static const arg_def_t enable_smooth_intra = + ARG_DEF(NULL, "enable-smooth-intra", 1, + "Enable smooth intra prediction modes " + "(0: false, 1: true (default))"); +static const arg_def_t enable_paeth_intra = + ARG_DEF(NULL, "enable-paeth-intra", 1, + "Enable Paeth intra prediction mode (0: false, 1: true (default))"); +static const arg_def_t enable_cfl_intra = + ARG_DEF(NULL, "enable-cfl-intra", 1, + "Enable chroma from luma intra prediction mode " + "(0: false, 1: true (default))"); +static const arg_def_t force_video_mode = + ARG_DEF(NULL, "force-video-mode", 1, + "Force video mode (0: false, 1: true (default))"); +static const arg_def_t enable_obmc = ARG_DEF( + NULL, "enable-obmc", 1, "Enable OBMC (0: false, 1: true (default))"); +static const arg_def_t enable_overlay = + ARG_DEF(NULL, "enable-overlay", 1, + "Enable coding overlay frames (0: false, 1: true (default))"); +static const arg_def_t enable_palette = + ARG_DEF(NULL, "enable-palette", 1, + "Enable palette prediction mode (0: false, 1: true (default))"); +static const arg_def_t enable_intrabc = + ARG_DEF(NULL, "enable-intrabc", 1, + "Enable intra block copy prediction mode " + "(0: false, 1: true (default))"); +static const arg_def_t enable_angle_delta = + ARG_DEF(NULL, "enable-angle-delta", 1, + "Enable intra angle delta (0: false, 1: true (default))"); +static const arg_def_t disable_trellis_quant = + ARG_DEF(NULL, "disable-trellis-quant", 1, + "Disable trellis optimization of quantized coefficients (0: false " + "1: true 2: true for rd search 3: true for estimate yrd serch " + "(default))"); +static const arg_def_t enable_qm = + ARG_DEF(NULL, "enable-qm", 1, + "Enable quantisation matrices (0: false (default), 1: true)"); +static const arg_def_t qm_min = ARG_DEF( + NULL, "qm-min", 1, "Min quant matrix flatness (0..15), default is 8"); +static const arg_def_t qm_max = ARG_DEF( + NULL, "qm-max", 1, "Max quant matrix flatness (0..15), default is 15"); +static const arg_def_t reduced_tx_type_set = ARG_DEF( + NULL, "reduced-tx-type-set", 1, "Use reduced set of transform types"); +static const arg_def_t use_intra_dct_only = + ARG_DEF(NULL, "use-intra-dct-only", 1, "Use DCT only for INTRA modes"); +static const arg_def_t use_inter_dct_only = + ARG_DEF(NULL, "use-inter-dct-only", 1, "Use DCT only for INTER modes"); +static const arg_def_t use_intra_default_tx_only = + 
ARG_DEF(NULL, "use-intra-default-tx-only", 1, + "Use Default-transform only for INTRA modes"); +static const arg_def_t quant_b_adapt = + ARG_DEF(NULL, "quant-b-adapt", 1, "Use adaptive quantize_b"); +static const arg_def_t coeff_cost_upd_freq = + ARG_DEF(NULL, "coeff-cost-upd-freq", 1, + "Update freq for coeff costs" + "0: SB, 1: SB Row per Tile, 2: Tile"); +static const arg_def_t mode_cost_upd_freq = + ARG_DEF(NULL, "mode-cost-upd-freq", 1, + "Update freq for mode costs" + "0: SB, 1: SB Row per Tile, 2: Tile"); +static const arg_def_t mv_cost_upd_freq = + ARG_DEF(NULL, "mv-cost-upd-freq", 1, + "Update freq for mv costs" + "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"); +static const arg_def_t num_tg = ARG_DEF( + NULL, "num-tile-groups", 1, "Maximum number of tile groups, default is 1"); +static const arg_def_t mtu_size = + ARG_DEF(NULL, "mtu-size", 1, + "MTU size for a tile group, default is 0 (no MTU targeting), " + "overrides maximum number of tile groups"); +static const struct arg_enum_list timing_info_enum[] = { + { "unspecified", AOM_TIMING_UNSPECIFIED }, + { "constant", AOM_TIMING_EQUAL }, + { "model", AOM_TIMING_DEC_MODEL }, + { NULL, 0 } +}; +static const arg_def_t timing_info = + ARG_DEF_ENUM(NULL, "timing-info", 1, + "Signal timing info in the bitstream (model unly works for no " + "hidden frames, no super-res yet):", + timing_info_enum); +#if CONFIG_TUNE_VMAF +static const arg_def_t vmaf_model_path = + ARG_DEF(NULL, "vmaf-model-path", 1, "Path to the VMAF model file"); +#endif +static const arg_def_t film_grain_test = + ARG_DEF(NULL, "film-grain-test", 1, + "Film grain test vectors (0: none (default), 1: test-1 2: test-2, " + "... 16: test-16)"); +static const arg_def_t film_grain_table = + ARG_DEF(NULL, "film-grain-table", 1, + "Path to file containing film grain parameters"); +#if CONFIG_DENOISE +static const arg_def_t denoise_noise_level = + ARG_DEF(NULL, "denoise-noise-level", 1, + "Amount of noise (from 0 = don't denoise, to 50)"); +static const arg_def_t denoise_block_size = + ARG_DEF(NULL, "denoise-block-size", 1, "Denoise block size (default = 32)"); +#endif +static const arg_def_t enable_ref_frame_mvs = + ARG_DEF(NULL, "enable-ref-frame-mvs", 1, + "Enable temporal mv prediction (default is 1)"); +static const arg_def_t frame_parallel_decoding = + ARG_DEF(NULL, "frame-parallel", 1, + "Enable frame parallel decodability features " + "(0: false (default), 1: true)"); +static const arg_def_t error_resilient_mode = + ARG_DEF(NULL, "error-resilient", 1, + "Enable error resilient features " + "(0: false (default), 1: true)"); +static const arg_def_t aq_mode = ARG_DEF( + NULL, "aq-mode", 1, + "Adaptive quantization mode (0: off (default), 1: variance 2: complexity, " + "3: cyclic refresh)"); +static const arg_def_t deltaq_mode = + ARG_DEF(NULL, "deltaq-mode", 1, + "Delta qindex mode (0: off, 1: deltaq objective (default), " + "2: deltaq perceptual). 
" + "Currently this requires enable-tpl-model as a prerequisite."); +static const arg_def_t deltalf_mode = ARG_DEF( + NULL, "delta-lf-mode", 1, "Enable delta-lf-mode (0: off (default), 1: on)"); +static const arg_def_t frame_periodic_boost = + ARG_DEF(NULL, "frame-boost", 1, + "Enable frame periodic boost (0: off (default), 1: on)"); +static const arg_def_t gf_cbr_boost_pct = ARG_DEF( + NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)"); +static const arg_def_t max_inter_rate_pct = + ARG_DEF(NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)"); +static const arg_def_t min_gf_interval = ARG_DEF( + NULL, "min-gf-interval", 1, + "min gf/arf frame interval (default 0, indicating in-built behavior)"); +static const arg_def_t max_gf_interval = ARG_DEF( + NULL, "max-gf-interval", 1, + "max gf/arf frame interval (default 0, indicating in-built behavior)"); +static const arg_def_t gf_min_pyr_height = + ARG_DEF(NULL, "gf-min-pyr-height", 1, + "Min height for GF group pyramid structure (0 (default) to 5)"); +static const arg_def_t gf_max_pyr_height = + ARG_DEF(NULL, "gf-max-pyr-height", 1, + "maximum height for GF group pyramid structure (0 to 5 (default))"); +static const arg_def_t max_reference_frames = ARG_DEF( + NULL, "max-reference-frames", 1, + "maximum number of reference frames allowed per frame (3 to 7 (default))"); +static const arg_def_t reduced_reference_set = + ARG_DEF(NULL, "reduced-reference-set", 1, + "Use reduced set of single and compound references (0: off " + "(default), 1: on)"); +static const arg_def_t target_seq_level_idx = + ARG_DEF(NULL, "target-seq-level-idx", 1, + "Target sequence level index. " + "Possible values are in the form of \"ABxy\"(pad leading zeros if " + "less than 4 digits). " + "AB: Operating point(OP) index; " + "xy: Target level index for the OP. " + "E.g. \"0\" means target level index 0 for the 0th OP; " + "\"1021\" means target level index 21 for the 10th OP."); +static const arg_def_t set_min_cr = + ARG_DEF(NULL, "min-cr", 1, + "Set minimum compression ratio. Take integer values. Default is 0. 
" + "If non-zero, encoder will try to keep the compression ratio of " + "each frame to be higher than the given value divided by 100."); + +static const struct arg_enum_list color_primaries_enum[] = { + { "bt709", AOM_CICP_CP_BT_709 }, + { "unspecified", AOM_CICP_CP_UNSPECIFIED }, + { "bt601", AOM_CICP_CP_BT_601 }, + { "bt470m", AOM_CICP_CP_BT_470_M }, + { "bt470bg", AOM_CICP_CP_BT_470_B_G }, + { "smpte240", AOM_CICP_CP_SMPTE_240 }, + { "film", AOM_CICP_CP_GENERIC_FILM }, + { "bt2020", AOM_CICP_CP_BT_2020 }, + { "xyz", AOM_CICP_CP_XYZ }, + { "smpte431", AOM_CICP_CP_SMPTE_431 }, + { "smpte432", AOM_CICP_CP_SMPTE_432 }, + { "ebu3213", AOM_CICP_CP_EBU_3213 }, + { NULL, 0 } +}; + +static const arg_def_t input_color_primaries = ARG_DEF_ENUM( + NULL, "color-primaries", 1, + "Color primaries (CICP) of input content:", color_primaries_enum); + +static const struct arg_enum_list transfer_characteristics_enum[] = { + { "unspecified", AOM_CICP_CP_UNSPECIFIED }, + { "bt709", AOM_CICP_TC_BT_709 }, + { "bt470m", AOM_CICP_TC_BT_470_M }, + { "bt470bg", AOM_CICP_TC_BT_470_B_G }, + { "bt601", AOM_CICP_TC_BT_601 }, + { "smpte240", AOM_CICP_TC_SMPTE_240 }, + { "lin", AOM_CICP_TC_LINEAR }, + { "log100", AOM_CICP_TC_LOG_100 }, + { "log100sq10", AOM_CICP_TC_LOG_100_SQRT10 }, + { "iec61966", AOM_CICP_TC_IEC_61966 }, + { "bt1361", AOM_CICP_TC_BT_1361 }, + { "srgb", AOM_CICP_TC_SRGB }, + { "bt2020-10bit", AOM_CICP_TC_BT_2020_10_BIT }, + { "bt2020-12bit", AOM_CICP_TC_BT_2020_12_BIT }, + { "smpte2084", AOM_CICP_TC_SMPTE_2084 }, + { "hlg", AOM_CICP_TC_HLG }, + { "smpte428", AOM_CICP_TC_SMPTE_428 }, + { NULL, 0 } +}; + +static const arg_def_t input_transfer_characteristics = + ARG_DEF_ENUM(NULL, "transfer-characteristics", 1, + "Transfer characteristics (CICP) of input content:", + transfer_characteristics_enum); + +static const struct arg_enum_list matrix_coefficients_enum[] = { + { "identity", AOM_CICP_MC_IDENTITY }, + { "bt709", AOM_CICP_MC_BT_709 }, + { "unspecified", AOM_CICP_MC_UNSPECIFIED }, + { "fcc73", AOM_CICP_MC_FCC }, + { "bt470bg", AOM_CICP_MC_BT_470_B_G }, + { "bt601", AOM_CICP_MC_BT_601 }, + { "smpte240", AOM_CICP_CP_SMPTE_240 }, + { "ycgco", AOM_CICP_MC_SMPTE_YCGCO }, + { "bt2020ncl", AOM_CICP_MC_BT_2020_NCL }, + { "bt2020cl", AOM_CICP_MC_BT_2020_CL }, + { "smpte2085", AOM_CICP_MC_SMPTE_2085 }, + { "chromncl", AOM_CICP_MC_CHROMAT_NCL }, + { "chromcl", AOM_CICP_MC_CHROMAT_CL }, + { "ictcp", AOM_CICP_MC_ICTCP }, + { NULL, 0 } +}; + +static const arg_def_t input_matrix_coefficients = ARG_DEF_ENUM( + NULL, "matrix-coefficients", 1, + "Matrix coefficients (CICP) of input content:", matrix_coefficients_enum); + +static const struct arg_enum_list chroma_sample_position_enum[] = { + { "unknown", AOM_CSP_UNKNOWN }, + { "vertical", AOM_CSP_VERTICAL }, + { "colocated", AOM_CSP_COLOCATED }, + { NULL, 0 } +}; + +static const arg_def_t input_chroma_sample_position = + ARG_DEF_ENUM(NULL, "chroma-sample-position", 1, + "The chroma sample position when chroma 4:2:0 is signaled:", + chroma_sample_position_enum); + +static const struct arg_enum_list tune_content_enum[] = { + { "default", AOM_CONTENT_DEFAULT }, + { "screen", AOM_CONTENT_SCREEN }, + { NULL, 0 } +}; + +static const arg_def_t tune_content = ARG_DEF_ENUM( + NULL, "tune-content", 1, "Tune content type", tune_content_enum); + +static const arg_def_t cdf_update_mode = + ARG_DEF(NULL, "cdf-update-mode", 1, + "CDF update mode for entropy coding " + "(0: no CDF update; 1: update CDF on all frames(default); " + "2: selectively update CDF on some frames"); + +static 
const struct arg_enum_list superblock_size_enum[] = { + { "dynamic", AOM_SUPERBLOCK_SIZE_DYNAMIC }, + { "64", AOM_SUPERBLOCK_SIZE_64X64 }, + { "128", AOM_SUPERBLOCK_SIZE_128X128 }, + { NULL, 0 } +}; +static const arg_def_t superblock_size = ARG_DEF_ENUM( + NULL, "sb-size", 1, "Superblock size to use", superblock_size_enum); + +static const arg_def_t set_tier_mask = + ARG_DEF(NULL, "set-tier-mask", 1, + "Set bit mask to specify which tier each of the 32 possible " + "operating points conforms to. " + "Bit value 0 (default): Main Tier; 1: High Tier."); + +static const arg_def_t use_fixed_qp_offsets = + ARG_DEF(NULL, "use-fixed-qp-offsets", 1, + "Enable fixed QP offsets for frames at different levels of the " + "pyramid. Selected automatically from --cq-level if " + "--fixed-qp-offsets is not provided. If this option is not " + "specified (default), offsets are adaptively chosen by the " + "encoder."); + +static const arg_def_t fixed_qp_offsets = + ARG_DEF(NULL, "fixed-qp-offsets", 1, + "Set fixed QP offsets for frames at different levels of the " + "pyramid. Comma-separated list of 5 offsets for keyframe, ALTREF, " + "and 3 levels of internal alt-refs. If this option is not " + "specified (default), offsets are adaptively chosen by the " + "encoder."); + +static const arg_def_t *av1_args[] = { &cpu_used_av1, + &auto_altref, + &sharpness, + &static_thresh, + &rowmtarg, + &tile_cols, + &tile_rows, + &enable_tpl_model, + &enable_keyframe_filtering, + &arnr_maxframes, + &arnr_strength, + &tune_metric, + &cq_level, + &max_intra_rate_pct, + &max_inter_rate_pct, + &gf_cbr_boost_pct, + &lossless, + &enable_cdef, + &enable_restoration, + &enable_rect_partitions, + &enable_ab_partitions, + &enable_1to4_partitions, + &min_partition_size, + &max_partition_size, + &enable_dual_filter, + &enable_chroma_deltaq, + &enable_intra_edge_filter, + &enable_order_hint, + &enable_tx64, + &enable_flip_idtx, + &enable_dist_wtd_comp, + &enable_masked_comp, + &enable_onesided_comp, + &enable_interintra_comp, + &enable_smooth_interintra, + &enable_diff_wtd_comp, + &enable_interinter_wedge, + &enable_interintra_wedge, + &enable_global_motion, + &enable_warped_motion, + &enable_filter_intra, + &enable_smooth_intra, + &enable_paeth_intra, + &enable_cfl_intra, + &force_video_mode, + &enable_obmc, + &enable_overlay, + &enable_palette, + &enable_intrabc, + &enable_angle_delta, + &disable_trellis_quant, + &enable_qm, + &qm_min, + &qm_max, + &reduced_tx_type_set, + &use_intra_dct_only, + &use_inter_dct_only, + &use_intra_default_tx_only, + &quant_b_adapt, + &coeff_cost_upd_freq, + &mode_cost_upd_freq, + &mv_cost_upd_freq, + &frame_parallel_decoding, + &error_resilient_mode, + &aq_mode, + &deltaq_mode, + &deltalf_mode, + &frame_periodic_boost, + &noise_sens, + &tune_content, + &cdf_update_mode, + &input_color_primaries, + &input_transfer_characteristics, + &input_matrix_coefficients, + &input_chroma_sample_position, + &min_gf_interval, + &max_gf_interval, + &gf_min_pyr_height, + &gf_max_pyr_height, + &superblock_size, + &num_tg, + &mtu_size, + &timing_info, + &film_grain_test, + &film_grain_table, +#if CONFIG_DENOISE + &denoise_noise_level, + &denoise_block_size, +#endif // CONFIG_DENOISE + &max_reference_frames, + &reduced_reference_set, + &enable_ref_frame_mvs, + &target_seq_level_idx, + &set_tier_mask, + &set_min_cr, + &bitdeptharg, + &inbitdeptharg, + &input_chroma_subsampling_x, + &input_chroma_subsampling_y, + &sframe_dist, + &sframe_mode, + &save_as_annexb, +#if CONFIG_TUNE_VMAF + &vmaf_model_path, +#endif + NULL };
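+/* Note: av1_arg_ctrl_map below must stay index-aligned with av1_args above: + * parse_stream_params() applies the option matched at av1_args[i] via the + * codec control ID at av1_arg_ctrl_map[i] (e.g. av1_args[0], --cpu-used, + * pairs with AOME_SET_CPUUSED). + */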
+static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED, + AOME_SET_ENABLEAUTOALTREF, + AOME_SET_SHARPNESS, + AOME_SET_STATIC_THRESHOLD, + AV1E_SET_ROW_MT, + AV1E_SET_TILE_COLUMNS, + AV1E_SET_TILE_ROWS, + AV1E_SET_ENABLE_TPL_MODEL, + AV1E_SET_ENABLE_KEYFRAME_FILTERING, + AOME_SET_ARNR_MAXFRAMES, + AOME_SET_ARNR_STRENGTH, + AOME_SET_TUNING, + AOME_SET_CQ_LEVEL, + AOME_SET_MAX_INTRA_BITRATE_PCT, + AV1E_SET_MAX_INTER_BITRATE_PCT, + AV1E_SET_GF_CBR_BOOST_PCT, + AV1E_SET_LOSSLESS, + AV1E_SET_ENABLE_CDEF, + AV1E_SET_ENABLE_RESTORATION, + AV1E_SET_ENABLE_RECT_PARTITIONS, + AV1E_SET_ENABLE_AB_PARTITIONS, + AV1E_SET_ENABLE_1TO4_PARTITIONS, + AV1E_SET_MIN_PARTITION_SIZE, + AV1E_SET_MAX_PARTITION_SIZE, + AV1E_SET_ENABLE_DUAL_FILTER, + AV1E_SET_ENABLE_CHROMA_DELTAQ, + AV1E_SET_ENABLE_INTRA_EDGE_FILTER, + AV1E_SET_ENABLE_ORDER_HINT, + AV1E_SET_ENABLE_TX64, + AV1E_SET_ENABLE_FLIP_IDTX, + AV1E_SET_ENABLE_DIST_WTD_COMP, + AV1E_SET_ENABLE_MASKED_COMP, + AV1E_SET_ENABLE_ONESIDED_COMP, + AV1E_SET_ENABLE_INTERINTRA_COMP, + AV1E_SET_ENABLE_SMOOTH_INTERINTRA, + AV1E_SET_ENABLE_DIFF_WTD_COMP, + AV1E_SET_ENABLE_INTERINTER_WEDGE, + AV1E_SET_ENABLE_INTERINTRA_WEDGE, + AV1E_SET_ENABLE_GLOBAL_MOTION, + AV1E_SET_ENABLE_WARPED_MOTION, + AV1E_SET_ENABLE_FILTER_INTRA, + AV1E_SET_ENABLE_SMOOTH_INTRA, + AV1E_SET_ENABLE_PAETH_INTRA, + AV1E_SET_ENABLE_CFL_INTRA, + AV1E_SET_FORCE_VIDEO_MODE, + AV1E_SET_ENABLE_OBMC, + AV1E_SET_ENABLE_OVERLAY, + AV1E_SET_ENABLE_PALETTE, + AV1E_SET_ENABLE_INTRABC, + AV1E_SET_ENABLE_ANGLE_DELTA, + AV1E_SET_DISABLE_TRELLIS_QUANT, + AV1E_SET_ENABLE_QM, + AV1E_SET_QM_MIN, + AV1E_SET_QM_MAX, + AV1E_SET_REDUCED_TX_TYPE_SET, + AV1E_SET_INTRA_DCT_ONLY, + AV1E_SET_INTER_DCT_ONLY, + AV1E_SET_INTRA_DEFAULT_TX_ONLY, + AV1E_SET_QUANT_B_ADAPT, + AV1E_SET_COEFF_COST_UPD_FREQ, + AV1E_SET_MODE_COST_UPD_FREQ, + AV1E_SET_MV_COST_UPD_FREQ, + AV1E_SET_FRAME_PARALLEL_DECODING, + AV1E_SET_ERROR_RESILIENT_MODE, + AV1E_SET_AQ_MODE, + AV1E_SET_DELTAQ_MODE, + AV1E_SET_DELTALF_MODE, + AV1E_SET_FRAME_PERIODIC_BOOST, + AV1E_SET_NOISE_SENSITIVITY, + AV1E_SET_TUNE_CONTENT, + AV1E_SET_CDF_UPDATE_MODE, + AV1E_SET_COLOR_PRIMARIES, + AV1E_SET_TRANSFER_CHARACTERISTICS, + AV1E_SET_MATRIX_COEFFICIENTS, + AV1E_SET_CHROMA_SAMPLE_POSITION, + AV1E_SET_MIN_GF_INTERVAL, + AV1E_SET_MAX_GF_INTERVAL, + AV1E_SET_GF_MIN_PYRAMID_HEIGHT, + AV1E_SET_GF_MAX_PYRAMID_HEIGHT, + AV1E_SET_SUPERBLOCK_SIZE, + AV1E_SET_NUM_TG, + AV1E_SET_MTU, + AV1E_SET_TIMING_INFO_TYPE, + AV1E_SET_FILM_GRAIN_TEST_VECTOR, + AV1E_SET_FILM_GRAIN_TABLE, +#if CONFIG_DENOISE + AV1E_SET_DENOISE_NOISE_LEVEL, + AV1E_SET_DENOISE_BLOCK_SIZE, +#endif // CONFIG_DENOISE + AV1E_SET_MAX_REFERENCE_FRAMES, + AV1E_SET_REDUCED_REFERENCE_SET, + AV1E_SET_ENABLE_REF_FRAME_MVS, + AV1E_SET_TARGET_SEQ_LEVEL_IDX, + AV1E_SET_TIER_MASK, + AV1E_SET_MIN_CR, +#if CONFIG_TUNE_VMAF + AV1E_SET_VMAF_MODEL_PATH, +#endif + 0 }; +#endif // CONFIG_AV1_ENCODER + +static const arg_def_t *no_args[] = { NULL }; + +static void show_help(FILE *fout, int shorthelp) { + fprintf(fout, "Usage: %s <options> -o dst_filename src_filename \n", + exec_name); + + if (shorthelp) { + fprintf(fout, "Use --help to see the full list of options.\n"); + return; + } + + fprintf(fout, "\nOptions:\n"); + arg_show_usage(fout, main_args); + fprintf(fout, "\nEncoder Global Options:\n"); + arg_show_usage(fout, global_args); + fprintf(fout, "\nRate Control Options:\n"); + arg_show_usage(fout, rc_args); + fprintf(fout, "\nTwopass Rate Control Options:\n"); + arg_show_usage(fout, rc_twopass_args); + fprintf(fout, "\nKeyframe Placement Options:\n"); +
arg_show_usage(fout, kf_args); +#if CONFIG_AV1_ENCODER + fprintf(fout, "\nAV1 Specific Options:\n"); + arg_show_usage(fout, av1_args); +#endif + fprintf(fout, + "\nStream timebase (--timebase):\n" + " The desired precision of timestamps in the output, expressed\n" + " in fractional seconds. Default is 1/1000.\n"); + fprintf(fout, "\nIncluded encoders:\n\n"); + + const int num_encoder = get_aom_encoder_count(); + for (int i = 0; i < num_encoder; ++i) { + const AvxInterface *const encoder = get_aom_encoder_by_index(i); + const char *defstr = (i == (num_encoder - 1)) ? "(default)" : ""; + fprintf(fout, " %-6s - %s %s\n", encoder->name, + aom_codec_iface_name(encoder->codec_interface()), defstr); + } + fprintf(fout, "\n "); + fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n"); +} + +void usage_exit(void) { + show_help(stderr, 1); + exit(EXIT_FAILURE); +} + +#if CONFIG_AV1_ENCODER +#define ARG_CTRL_CNT_MAX NELEMENTS(av1_arg_ctrl_map) +#endif + +#if !CONFIG_WEBM_IO +typedef int stereo_format_t; +struct WebmOutputContext { + int debug; +}; +#endif + +/* Per-stream configuration */ +struct stream_config { + struct aom_codec_enc_cfg cfg; + const char *out_fn; + const char *stats_fn; + stereo_format_t stereo_fmt; + int arg_ctrls[ARG_CTRL_CNT_MAX][2]; + int arg_ctrl_cnt; + int write_webm; + const char *film_grain_filename; + int write_ivf; + // whether to use 16bit internal buffers + int use_16bit_internal; +#if CONFIG_TUNE_VMAF + const char *vmaf_model_path; +#endif +}; + +struct stream_state { + int index; + struct stream_state *next; + struct stream_config config; + FILE *file; + struct rate_hist *rate_hist; + struct WebmOutputContext webm_ctx; + uint64_t psnr_sse_total; + uint64_t psnr_samples_total; + double psnr_totals[4]; + int psnr_count; + int counts[64]; + aom_codec_ctx_t encoder; + unsigned int frames_out; + uint64_t cx_time; + size_t nbytes; + stats_io_t stats; + struct aom_image *img; + aom_codec_ctx_t decoder; + int mismatch_seen; + unsigned int chroma_subsampling_x; + unsigned int chroma_subsampling_y; +}; + +static void validate_positive_rational(const char *msg, + struct aom_rational *rat) { + if (rat->den < 0) { + rat->num *= -1; + rat->den *= -1; + } + + if (rat->num < 0) die("Error: %s must be positive\n", msg); + + if (!rat->den) die("Error: %s has zero denominator\n", msg); +} + +static void init_config(cfg_options_t *config) { + memset(config, 0, sizeof(cfg_options_t)); + config->super_block_size = 0; // Dynamic + config->max_partition_size = 128; + config->min_partition_size = 4; + config->disable_trellis_quant = 3; +} + +/* Parses global config arguments into the AvxEncoderConfig. Note that + * argv is modified and overwrites all parsed arguments. 
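+ * Options consumed here are filtered out of argv, so per-stream parsing + * later only sees the arguments left unmatched. For a hypothetical + * invocation such as "aomenc --passes=2 -w 640 -h 360 -o out.ivf in.y4m", + * --passes is consumed here while -w/-h/-o remain for parse_stream_params().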
+ */ +static void parse_global_config(struct AvxEncoderConfig *global, char ***argv) { + char **argi, **argj; + struct arg arg; + const int num_encoder = get_aom_encoder_count(); + char **argv_local = (char **)*argv; + if (num_encoder < 1) die("Error: no valid encoder available\n"); + + /* Initialize default parameters */ + memset(global, 0, sizeof(*global)); + global->codec = get_aom_encoder_by_index(num_encoder - 1); + global->passes = 0; + global->color_type = I420; + global->csp = AOM_CSP_UNKNOWN; + + int cfg_included = 0; + init_config(&global->encoder_config); + + for (argi = argj = argv_local; (*argj = *argi); argi += arg.argv_step) { + arg.argv_step = 1; + + if (arg_match(&arg, &use_cfg, argi)) { + if (cfg_included) continue; + parse_cfg(arg.val, &global->encoder_config); + cfg_included = 1; + continue; + } + if (arg_match(&arg, &help, argi)) { + show_help(stdout, 0); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &codecarg, argi)) { + global->codec = get_aom_encoder_by_name(arg.val); + if (!global->codec) + die("Error: Unrecognized argument (%s) to --codec\n", arg.val); + } else if (arg_match(&arg, &passes, argi)) { + global->passes = arg_parse_uint(&arg); + + if (global->passes < 1 || global->passes > 2) + die("Error: Invalid number of passes (%d)\n", global->passes); + } else if (arg_match(&arg, &pass_arg, argi)) { + global->pass = arg_parse_uint(&arg); + + if (global->pass < 1 || global->pass > 2) + die("Error: Invalid pass selected (%d)\n", global->pass); + } else if (arg_match(&arg, &input_chroma_sample_position, argi)) { + global->csp = arg_parse_enum(&arg); + /* Flag is used by later code as well, preserve it. */ + argj++; + } else if (arg_match(&arg, &usage, argi)) + global->usage = arg_parse_uint(&arg); + else if (arg_match(&arg, &good_dl, argi)) + global->usage = AOM_USAGE_GOOD_QUALITY; // Good quality usage + else if (arg_match(&arg, &rt_dl, argi)) + global->usage = AOM_USAGE_REALTIME; // Real-time usage + else if (arg_match(&arg, &use_yv12, argi)) + global->color_type = YV12; + else if (arg_match(&arg, &use_i420, argi)) + global->color_type = I420; + else if (arg_match(&arg, &use_i422, argi)) + global->color_type = I422; + else if (arg_match(&arg, &use_i444, argi)) + global->color_type = I444; + else if (arg_match(&arg, &quietarg, argi)) + global->quiet = 1; + else if (arg_match(&arg, &verbosearg, argi)) + global->verbose = 1; + else if (arg_match(&arg, &limit, argi)) + global->limit = arg_parse_uint(&arg); + else if (arg_match(&arg, &skip, argi)) + global->skip_frames = arg_parse_uint(&arg); + else if (arg_match(&arg, &psnrarg, argi)) + global->show_psnr = 1; + else if (arg_match(&arg, &recontest, argi)) + global->test_decode = arg_parse_enum_or_int(&arg); + else if (arg_match(&arg, &framerate, argi)) { + global->framerate = arg_parse_rational(&arg); + validate_positive_rational(arg.name, &global->framerate); + global->have_framerate = 1; + } else if (arg_match(&arg, &debugmode, argi)) + global->debug = 1; + else if (arg_match(&arg, &q_hist_n, argi)) + global->show_q_hist_buckets = arg_parse_uint(&arg); + else if (arg_match(&arg, &rate_hist_n, argi)) + global->show_rate_hist_buckets = arg_parse_uint(&arg); + else if (arg_match(&arg, &disable_warnings, argi)) + global->disable_warnings = 1; + else if (arg_match(&arg, &disable_warning_prompt, argi)) + global->disable_warning_prompt = 1; + else + argj++; + } + + if (global->pass) { + /* DWIM: Assume the user meant passes=2 if pass=2 is specified */ + if (global->pass > global->passes) { + warn("Assuming --pass=%d 
implies --passes=%d\n", global->pass, + global->pass); + global->passes = global->pass; + } + } + /* Validate global config */ + if (global->passes == 0) { +#if CONFIG_AV1_ENCODER + // Make default AV1 passes = 2 until there is a better quality 1-pass + // encoder + if (global->codec != NULL && global->codec->name != NULL) + global->passes = (strcmp(global->codec->name, "av1") == 0 && + global->usage != AOM_USAGE_REALTIME) + ? 2 + : 1; +#else + global->passes = 1; +#endif + } + + if (global->usage == AOM_USAGE_REALTIME && global->passes > 1) { + warn("Enforcing one-pass encoding in realtime mode\n"); + global->passes = 1; + } +} + +static void open_input_file(struct AvxInputContext *input, + aom_chroma_sample_position_t csp) { + /* Parse certain options from the input file, if possible */ + input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") + : set_binary_mode(stdin); + + if (!input->file) fatal("Failed to open input file"); + + if (!fseeko(input->file, 0, SEEK_END)) { + /* Input file is seekable. Figure out how long it is, so we can get + * progress info. + */ + input->length = ftello(input->file); + rewind(input->file); + } + + /* Default to 1:1 pixel aspect ratio. */ + input->pixel_aspect_ratio.numerator = 1; + input->pixel_aspect_ratio.denominator = 1; + + /* For RAW input sources, these bytes will applied on the first frame + * in read_frame(). + */ + input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file); + input->detect.position = 0; + + if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) { + if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, csp, + input->only_i420) >= 0) { + input->file_type = FILE_TYPE_Y4M; + input->width = input->y4m.pic_w; + input->height = input->y4m.pic_h; + input->pixel_aspect_ratio.numerator = input->y4m.par_n; + input->pixel_aspect_ratio.denominator = input->y4m.par_d; + input->framerate.numerator = input->y4m.fps_n; + input->framerate.denominator = input->y4m.fps_d; + input->fmt = input->y4m.aom_fmt; + input->bit_depth = input->y4m.bit_depth; + } else + fatal("Unsupported Y4M stream."); + } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) { + fatal("IVF is not supported as input."); + } else { + input->file_type = FILE_TYPE_RAW; + } +} + +static void close_input_file(struct AvxInputContext *input) { + fclose(input->file); + if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m); +} + +static struct stream_state *new_stream(struct AvxEncoderConfig *global, + struct stream_state *prev) { + struct stream_state *stream; + + stream = calloc(1, sizeof(*stream)); + if (stream == NULL) { + fatal("Failed to allocate new stream."); + } + + if (prev) { + memcpy(stream, prev, sizeof(*stream)); + stream->index++; + prev->next = stream; + } else { + aom_codec_err_t res; + + /* Populate encoder configuration */ + res = aom_codec_enc_config_default(global->codec->codec_interface(), + &stream->config.cfg, global->usage); + if (res) fatal("Failed to get config: %s\n", aom_codec_err_to_string(res)); + + /* Change the default timebase to a high enough value so that the + * encoder will always create strictly increasing timestamps. + */ + stream->config.cfg.g_timebase.den = 1000; + + /* Never use the library's default resolution, require it be parsed + * from the file or set on the command line. 
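+ * (set_stream_dimensions() fills these from the input file's header when + * one is available, e.g. Y4M; RAW input needs explicit --width/--height, + * and validate_stream_config() fails hard if either is still zero.)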
+ */ + stream->config.cfg.g_w = 0; + stream->config.cfg.g_h = 0; + + /* Initialize remaining stream parameters */ + stream->config.write_webm = 1; + stream->config.write_ivf = 0; + +#if CONFIG_WEBM_IO + stream->config.stereo_fmt = STEREO_FORMAT_MONO; + stream->webm_ctx.last_pts_ns = -1; + stream->webm_ctx.writer = NULL; + stream->webm_ctx.segment = NULL; +#endif + + /* Allows removal of the application version from the EBML tags */ + stream->webm_ctx.debug = global->debug; + memcpy(&stream->config.cfg.encoder_cfg, &global->encoder_config, + sizeof(stream->config.cfg.encoder_cfg)); + } + + /* Output files must be specified for each stream */ + stream->config.out_fn = NULL; + + stream->next = NULL; + return stream; +} + +static void set_config_arg_ctrls(struct stream_config *config, int key, + const struct arg *arg) { + int j; + if (key == AV1E_SET_FILM_GRAIN_TABLE) { + config->film_grain_filename = arg->val; + return; + } + + // For target level, the settings should accumulate rather than overwrite, + // so we simply append it. + if (key == AV1E_SET_TARGET_SEQ_LEVEL_IDX) { + j = config->arg_ctrl_cnt; + assert(j < (int)ARG_CTRL_CNT_MAX); + config->arg_ctrls[j][0] = key; + config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg); + ++config->arg_ctrl_cnt; + return; + } + + /* Point either to the next free element or the first instance of this + * control. + */ + for (j = 0; j < config->arg_ctrl_cnt; j++) + if (config->arg_ctrls[j][0] == key) break; + + /* Update/insert */ + assert(j < (int)ARG_CTRL_CNT_MAX); + config->arg_ctrls[j][0] = key; + config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg); + + if (key == AOME_SET_ENABLEAUTOALTREF && config->arg_ctrls[j][1] > 1) { + warn("auto-alt-ref > 1 is deprecated... setting auto-alt-ref=1\n"); + config->arg_ctrls[j][1] = 1; + } + if (j == config->arg_ctrl_cnt) config->arg_ctrl_cnt++; +} + +static int parse_stream_params(struct AvxEncoderConfig *global, + struct stream_state *stream, char **argv) { + char **argi, **argj; + struct arg arg; + static const arg_def_t **ctrl_args = no_args; + static const int *ctrl_args_map = NULL; + struct stream_config *config = &stream->config; + int eos_mark_found = 0; + int webm_forced = 0; + + // Handle codec specific options + if (0) { +#if CONFIG_AV1_ENCODER + } else if (strcmp(global->codec->name, "av1") == 0) { + // TODO(jingning): Reuse AV1 specific encoder configuration parameters. + // Consider to expand this set for AV1 encoder control. + ctrl_args = av1_args; + ctrl_args_map = av1_arg_ctrl_map; +#endif + } + + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + arg.argv_step = 1; + + /* Once we've found an end-of-stream marker (--) we want to continue + * shifting arguments but not consuming them. 
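+ * They are left in place so the remainder can be handed to the next + * stream's parse_stream_params() call.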
+ */ + if (eos_mark_found) { + argj++; + continue; + } else if (!strcmp(*argj, "--")) { + eos_mark_found = 1; + continue; + } + + if (arg_match(&arg, &outputfile, argi)) { + config->out_fn = arg.val; + if (!webm_forced) { + const size_t out_fn_len = strlen(config->out_fn); + if (out_fn_len >= 4 && + !strcmp(config->out_fn + out_fn_len - 4, ".ivf")) { + config->write_webm = 0; + config->write_ivf = 1; + } else if (out_fn_len >= 4 && + !strcmp(config->out_fn + out_fn_len - 4, ".obu")) { + config->write_webm = 0; + config->write_ivf = 0; + } + } + } else if (arg_match(&arg, &fpf_name, argi)) { + config->stats_fn = arg.val; + } else if (arg_match(&arg, &use_webm, argi)) { +#if CONFIG_WEBM_IO + config->write_webm = 1; + webm_forced = 1; +#else + die("Error: --webm specified but webm is disabled."); +#endif + } else if (arg_match(&arg, &use_ivf, argi)) { + config->write_webm = 0; + config->write_ivf = 1; + } else if (arg_match(&arg, &use_obu, argi)) { + config->write_webm = 0; + config->write_ivf = 0; + } else if (arg_match(&arg, &threads, argi)) { + config->cfg.g_threads = arg_parse_uint(&arg); + } else if (arg_match(&arg, &profile, argi)) { + config->cfg.g_profile = arg_parse_uint(&arg); + } else if (arg_match(&arg, &width, argi)) { + config->cfg.g_w = arg_parse_uint(&arg); + } else if (arg_match(&arg, &height, argi)) { + config->cfg.g_h = arg_parse_uint(&arg); + } else if (arg_match(&arg, &forced_max_frame_width, argi)) { + config->cfg.g_forced_max_frame_width = arg_parse_uint(&arg); + } else if (arg_match(&arg, &forced_max_frame_height, argi)) { + config->cfg.g_forced_max_frame_height = arg_parse_uint(&arg); + } else if (arg_match(&arg, &bitdeptharg, argi)) { + config->cfg.g_bit_depth = arg_parse_enum_or_int(&arg); + } else if (arg_match(&arg, &inbitdeptharg, argi)) { + config->cfg.g_input_bit_depth = arg_parse_uint(&arg); + } else if (arg_match(&arg, &input_chroma_subsampling_x, argi)) { + stream->chroma_subsampling_x = arg_parse_uint(&arg); + } else if (arg_match(&arg, &input_chroma_subsampling_y, argi)) { + stream->chroma_subsampling_y = arg_parse_uint(&arg); +#if CONFIG_WEBM_IO + } else if (arg_match(&arg, &stereo_mode, argi)) { + config->stereo_fmt = arg_parse_enum_or_int(&arg); +#endif + } else if (arg_match(&arg, &timebase, argi)) { + config->cfg.g_timebase = arg_parse_rational(&arg); + validate_positive_rational(arg.name, &config->cfg.g_timebase); + } else if (arg_match(&arg, &global_error_resilient, argi)) { + config->cfg.g_error_resilient = arg_parse_uint(&arg); + } else if (arg_match(&arg, &lag_in_frames, argi)) { + config->cfg.g_lag_in_frames = arg_parse_uint(&arg); + if (global->usage == AOM_USAGE_REALTIME && + config->cfg.rc_end_usage == AOM_CBR && + config->cfg.g_lag_in_frames != 0) { + warn("non-zero %s option ignored in realtime CBR mode.\n", arg.name); + config->cfg.g_lag_in_frames = 0; + } + } else if (arg_match(&arg, &large_scale_tile, argi)) { + config->cfg.large_scale_tile = arg_parse_uint(&arg); + if (config->cfg.large_scale_tile) global->codec = get_aom_lst_encoder(); + } else if (arg_match(&arg, &monochrome, argi)) { + config->cfg.monochrome = 1; + } else if (arg_match(&arg, &full_still_picture_hdr, argi)) { + config->cfg.full_still_picture_hdr = 1; + } else if (arg_match(&arg, &dropframe_thresh, argi)) { + config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg); + } else if (arg_match(&arg, &resize_mode, argi)) { + config->cfg.rc_resize_mode = arg_parse_uint(&arg); + } else if (arg_match(&arg, &resize_denominator, argi)) { + config->cfg.rc_resize_denominator = 
arg_parse_uint(&arg); + } else if (arg_match(&arg, &resize_kf_denominator, argi)) { + config->cfg.rc_resize_kf_denominator = arg_parse_uint(&arg); + } else if (arg_match(&arg, &superres_mode, argi)) { + config->cfg.rc_superres_mode = arg_parse_uint(&arg); + } else if (arg_match(&arg, &superres_denominator, argi)) { + config->cfg.rc_superres_denominator = arg_parse_uint(&arg); + } else if (arg_match(&arg, &superres_kf_denominator, argi)) { + config->cfg.rc_superres_kf_denominator = arg_parse_uint(&arg); + } else if (arg_match(&arg, &superres_qthresh, argi)) { + config->cfg.rc_superres_qthresh = arg_parse_uint(&arg); + } else if (arg_match(&arg, &superres_kf_qthresh, argi)) { + config->cfg.rc_superres_kf_qthresh = arg_parse_uint(&arg); + } else if (arg_match(&arg, &end_usage, argi)) { + config->cfg.rc_end_usage = arg_parse_enum_or_int(&arg); + } else if (arg_match(&arg, &target_bitrate, argi)) { + config->cfg.rc_target_bitrate = arg_parse_uint(&arg); + } else if (arg_match(&arg, &min_quantizer, argi)) { + config->cfg.rc_min_quantizer = arg_parse_uint(&arg); + } else if (arg_match(&arg, &max_quantizer, argi)) { + config->cfg.rc_max_quantizer = arg_parse_uint(&arg); + } else if (arg_match(&arg, &undershoot_pct, argi)) { + config->cfg.rc_undershoot_pct = arg_parse_uint(&arg); + } else if (arg_match(&arg, &overshoot_pct, argi)) { + config->cfg.rc_overshoot_pct = arg_parse_uint(&arg); + } else if (arg_match(&arg, &buf_sz, argi)) { + config->cfg.rc_buf_sz = arg_parse_uint(&arg); + } else if (arg_match(&arg, &buf_initial_sz, argi)) { + config->cfg.rc_buf_initial_sz = arg_parse_uint(&arg); + } else if (arg_match(&arg, &buf_optimal_sz, argi)) { + config->cfg.rc_buf_optimal_sz = arg_parse_uint(&arg); + } else if (arg_match(&arg, &bias_pct, argi)) { + config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg); + if (global->passes < 2) + warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &minsection_pct, argi)) { + config->cfg.rc_2pass_vbr_minsection_pct = arg_parse_uint(&arg); + + if (global->passes < 2) + warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &maxsection_pct, argi)) { + config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg); + + if (global->passes < 2) + warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &fwd_kf_enabled, argi)) { + config->cfg.fwd_kf_enabled = arg_parse_uint(&arg); + } else if (arg_match(&arg, &kf_min_dist, argi)) { + config->cfg.kf_min_dist = arg_parse_uint(&arg); + } else if (arg_match(&arg, &kf_max_dist, argi)) { + config->cfg.kf_max_dist = arg_parse_uint(&arg); + } else if (arg_match(&arg, &kf_disabled, argi)) { + config->cfg.kf_mode = AOM_KF_DISABLED; + } else if (arg_match(&arg, &sframe_dist, argi)) { + config->cfg.sframe_dist = arg_parse_uint(&arg); + } else if (arg_match(&arg, &sframe_mode, argi)) { + config->cfg.sframe_mode = arg_parse_uint(&arg); + } else if (arg_match(&arg, &save_as_annexb, argi)) { + config->cfg.save_as_annexb = arg_parse_uint(&arg); + } else if (arg_match(&arg, &tile_width, argi)) { + config->cfg.tile_width_count = + arg_parse_list(&arg, config->cfg.tile_widths, MAX_TILE_WIDTHS); + } else if (arg_match(&arg, &tile_height, argi)) { + config->cfg.tile_height_count = + arg_parse_list(&arg, config->cfg.tile_heights, MAX_TILE_HEIGHTS); +#if CONFIG_TUNE_VMAF + } else if (arg_match(&arg, &vmaf_model_path, argi)) { + config->vmaf_model_path = arg.val; +#endif + } else if (arg_match(&arg, &use_fixed_qp_offsets, argi)) { + 
config->cfg.use_fixed_qp_offsets = arg_parse_uint(&arg); + } else if (arg_match(&arg, &fixed_qp_offsets, argi)) { + const int fixed_qp_offset_count = arg_parse_list( + &arg, config->cfg.fixed_qp_offsets, FIXED_QP_OFFSET_COUNT); + if (fixed_qp_offset_count < FIXED_QP_OFFSET_COUNT) { + die("Option --fixed-qp-offsets requires %d comma-separated values, but " + "only %d values were provided.\n", + FIXED_QP_OFFSET_COUNT, fixed_qp_offset_count); + } + config->cfg.use_fixed_qp_offsets = 1; + } else if (global->usage == AOM_USAGE_REALTIME && + arg_match(&arg, &enable_restoration, argi)) { + if (arg_parse_uint(&arg) == 1) { + warn("non-zero %s option ignored in realtime mode.\n", arg.name); + } + } else { + int i, match = 0; + for (i = 0; ctrl_args[i]; i++) { + if (arg_match(&arg, ctrl_args[i], argi)) { + match = 1; + if (ctrl_args_map) { + set_config_arg_ctrls(config, ctrl_args_map[i], &arg); + } + } + } + if (!match) argj++; + } + } + config->use_16bit_internal = + config->cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING; + return eos_mark_found; +} + +#define FOREACH_STREAM(iterator, list) \ + for (struct stream_state *iterator = list; iterator; \ + iterator = iterator->next) + +static void validate_stream_config(const struct stream_state *stream, + const struct AvxEncoderConfig *global) { + const struct stream_state *streami; + (void)global; + + if (!stream->config.cfg.g_w || !stream->config.cfg.g_h) + fatal( + "Stream %d: Specify stream dimensions with --width (-w) " + "and --height (-h)", + stream->index); + + /* Even if the bit depth set on the command line is lower, it is + * upgraded to at least match the input bit depth. + */ + assert(stream->config.cfg.g_input_bit_depth <= + (unsigned int)stream->config.cfg.g_bit_depth); + + for (streami = stream; streami; streami = streami->next) { + /* All streams require output files */ + if (!streami->config.out_fn) + fatal("Stream %d: Output file is required (specify with -o)", + streami->index); + + /* Check for two streams outputting to the same file */ + if (streami != stream) { + const char *a = stream->config.out_fn; + const char *b = streami->config.out_fn; + if (!strcmp(a, b) && strcmp(a, "/dev/null") && strcmp(a, ":nul")) + fatal("Stream %d: duplicate output file (from stream %d)", + streami->index, stream->index); + } + + /* Check for two streams sharing a stats file.
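+ * Each stream's first pass writes to its own stats file (see + * setup_pass()), so two streams sharing one file would clobber each + * other's data.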
*/ + if (streami != stream) { + const char *a = stream->config.stats_fn; + const char *b = streami->config.stats_fn; + if (a && b && !strcmp(a, b)) + fatal("Stream %d: duplicate stats file (from stream %d)", + streami->index, stream->index); + } + } +} + +static void set_stream_dimensions(struct stream_state *stream, unsigned int w, + unsigned int h) { + if (!stream->config.cfg.g_w) { + if (!stream->config.cfg.g_h) + stream->config.cfg.g_w = w; + else + stream->config.cfg.g_w = w * stream->config.cfg.g_h / h; + } + if (!stream->config.cfg.g_h) { + stream->config.cfg.g_h = h * stream->config.cfg.g_w / w; + } +} + +static const char *file_type_to_string(enum VideoFileType t) { + switch (t) { + case FILE_TYPE_RAW: return "RAW"; + case FILE_TYPE_Y4M: return "Y4M"; + default: return "Other"; + } +} + +static const char *image_format_to_string(aom_img_fmt_t f) { + switch (f) { + case AOM_IMG_FMT_I420: return "I420"; + case AOM_IMG_FMT_I422: return "I422"; + case AOM_IMG_FMT_I444: return "I444"; + case AOM_IMG_FMT_YV12: return "YV12"; + case AOM_IMG_FMT_YV1216: return "YV1216"; + case AOM_IMG_FMT_I42016: return "I42016"; + case AOM_IMG_FMT_I42216: return "I42216"; + case AOM_IMG_FMT_I44416: return "I44416"; + default: return "Other"; + } +} + +static void show_stream_config(struct stream_state *stream, + struct AvxEncoderConfig *global, + struct AvxInputContext *input) { +#define SHOW(field) \ + fprintf(stderr, " %-28s = %d\n", #field, stream->config.cfg.field) + + if (stream->index == 0) { + fprintf(stderr, "Codec: %s\n", + aom_codec_iface_name(global->codec->codec_interface())); + fprintf(stderr, "Source file: %s File Type: %s Format: %s\n", + input->filename, file_type_to_string(input->file_type), + image_format_to_string(input->fmt)); + } + if (stream->next || stream->index) + fprintf(stderr, "\nStream Index: %d\n", stream->index); + fprintf(stderr, "Destination file: %s\n", stream->config.out_fn); + fprintf(stderr, "Coding path: %s\n", + stream->config.use_16bit_internal ? 
"HBD" : "LBD"); + fprintf(stderr, "Encoder parameters:\n"); + + SHOW(g_usage); + SHOW(g_threads); + SHOW(g_profile); + SHOW(g_w); + SHOW(g_h); + SHOW(g_bit_depth); + SHOW(g_input_bit_depth); + SHOW(g_timebase.num); + SHOW(g_timebase.den); + SHOW(g_error_resilient); + SHOW(g_pass); + SHOW(g_lag_in_frames); + SHOW(large_scale_tile); + SHOW(rc_dropframe_thresh); + SHOW(rc_resize_mode); + SHOW(rc_resize_denominator); + SHOW(rc_resize_kf_denominator); + SHOW(rc_superres_mode); + SHOW(rc_superres_denominator); + SHOW(rc_superres_kf_denominator); + SHOW(rc_superres_qthresh); + SHOW(rc_superres_kf_qthresh); + SHOW(rc_end_usage); + SHOW(rc_target_bitrate); + SHOW(rc_min_quantizer); + SHOW(rc_max_quantizer); + SHOW(rc_undershoot_pct); + SHOW(rc_overshoot_pct); + SHOW(rc_buf_sz); + SHOW(rc_buf_initial_sz); + SHOW(rc_buf_optimal_sz); + SHOW(rc_2pass_vbr_bias_pct); + SHOW(rc_2pass_vbr_minsection_pct); + SHOW(rc_2pass_vbr_maxsection_pct); + SHOW(fwd_kf_enabled); + SHOW(kf_mode); + SHOW(kf_min_dist); + SHOW(kf_max_dist); + +#define SHOW_PARAMS(field) \ + fprintf(stderr, " %-28s = %d\n", #field, \ + stream->config.cfg.encoder_cfg.field) + SHOW_PARAMS(super_block_size); + SHOW_PARAMS(max_partition_size); + SHOW_PARAMS(min_partition_size); + SHOW_PARAMS(disable_ab_partition_type); + SHOW_PARAMS(disable_rect_partition_type); + SHOW_PARAMS(disable_1to4_partition_type); + SHOW_PARAMS(disable_flip_idtx); + SHOW_PARAMS(disable_cdef); + SHOW_PARAMS(disable_lr); + SHOW_PARAMS(disable_obmc); + SHOW_PARAMS(disable_warp_motion); + SHOW_PARAMS(disable_global_motion); + SHOW_PARAMS(disable_dist_wtd_comp); + SHOW_PARAMS(disable_diff_wtd_comp); + SHOW_PARAMS(disable_inter_intra_comp); + SHOW_PARAMS(disable_masked_comp); + SHOW_PARAMS(disable_one_sided_comp); + SHOW_PARAMS(disable_palette); + SHOW_PARAMS(disable_intrabc); + SHOW_PARAMS(disable_cfl); + SHOW_PARAMS(disable_smooth_intra); + SHOW_PARAMS(disable_filter_intra); + SHOW_PARAMS(disable_dual_filter); + SHOW_PARAMS(disable_intra_angle_delta); + SHOW_PARAMS(disable_intra_edge_filter); + SHOW_PARAMS(disable_tx_64x64); + SHOW_PARAMS(disable_smooth_inter_intra); + SHOW_PARAMS(disable_inter_inter_wedge); + SHOW_PARAMS(disable_inter_intra_wedge); + SHOW_PARAMS(disable_paeth_intra); + SHOW_PARAMS(disable_trellis_quant); + SHOW_PARAMS(disable_ref_frame_mv); + SHOW_PARAMS(reduced_reference_set); + SHOW_PARAMS(reduced_tx_type_set); +} + +static void open_output_file(struct stream_state *stream, + struct AvxEncoderConfig *global, + const struct AvxRational *pixel_aspect_ratio) { + const char *fn = stream->config.out_fn; + const struct aom_codec_enc_cfg *const cfg = &stream->config.cfg; + + if (cfg->g_pass == AOM_RC_FIRST_PASS) return; + + stream->file = strcmp(fn, "-") ? 
fopen(fn, "wb") : set_binary_mode(stdout); + + if (!stream->file) fatal("Failed to open output file"); + + if (stream->config.write_webm && fseek(stream->file, 0, SEEK_CUR)) + fatal("WebM output to pipes not supported."); + +#if CONFIG_WEBM_IO + if (stream->config.write_webm) { + stream->webm_ctx.stream = stream->file; + if (write_webm_file_header(&stream->webm_ctx, &stream->encoder, cfg, + stream->config.stereo_fmt, global->codec->fourcc, + pixel_aspect_ratio) != 0) { + fatal("WebM writer initialization failed."); + } + } +#else + (void)pixel_aspect_ratio; +#endif + + if (!stream->config.write_webm && stream->config.write_ivf) { + ivf_write_file_header(stream->file, cfg, global->codec->fourcc, 0); + } +} + +static void close_output_file(struct stream_state *stream, + unsigned int fourcc) { + const struct aom_codec_enc_cfg *const cfg = &stream->config.cfg; + + if (cfg->g_pass == AOM_RC_FIRST_PASS) return; + +#if CONFIG_WEBM_IO + if (stream->config.write_webm) { + if (write_webm_file_footer(&stream->webm_ctx) != 0) { + fatal("WebM writer finalization failed."); + } + } +#endif + + if (!stream->config.write_webm && stream->config.write_ivf) { + if (!fseek(stream->file, 0, SEEK_SET)) + ivf_write_file_header(stream->file, &stream->config.cfg, fourcc, + stream->frames_out); + } + + fclose(stream->file); +} + +static void setup_pass(struct stream_state *stream, + struct AvxEncoderConfig *global, int pass) { + if (stream->config.stats_fn) { + if (!stats_open_file(&stream->stats, stream->config.stats_fn, pass)) + fatal("Failed to open statistics store"); + } else { + if (!stats_open_mem(&stream->stats, pass)) + fatal("Failed to open statistics store"); + } + + stream->config.cfg.g_pass = global->passes == 2 + ? pass ? AOM_RC_LAST_PASS : AOM_RC_FIRST_PASS + : AOM_RC_ONE_PASS; + if (pass) { + stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats); + } + + stream->cx_time = 0; + stream->nbytes = 0; + stream->frames_out = 0; +} + +static void initialize_encoder(struct stream_state *stream, + struct AvxEncoderConfig *global) { + int i; + int flags = 0; + + flags |= global->show_psnr ? AOM_CODEC_USE_PSNR : 0; + flags |= stream->config.use_16bit_internal ? 
AOM_CODEC_USE_HIGHBITDEPTH : 0; + + /* Construct Encoder Context */ + aom_codec_enc_init(&stream->encoder, global->codec->codec_interface(), + &stream->config.cfg, flags); + ctx_exit_on_error(&stream->encoder, "Failed to initialize encoder"); + + for (i = 0; i < stream->config.arg_ctrl_cnt; i++) { + int ctrl = stream->config.arg_ctrls[i][0]; + int value = stream->config.arg_ctrls[i][1]; + if (aom_codec_control(&stream->encoder, ctrl, value)) + fprintf(stderr, "Error: Tried to set control %d = %d\n", ctrl, value); + + ctx_exit_on_error(&stream->encoder, "Failed to control codec"); + } + +#if CONFIG_TUNE_VMAF + if (stream->config.vmaf_model_path) { + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_VMAF_MODEL_PATH, + stream->config.vmaf_model_path); + } +#endif + + if (stream->config.film_grain_filename) { + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_FILM_GRAIN_TABLE, + stream->config.film_grain_filename); + } + +#if CONFIG_AV1_DECODER + if (global->test_decode != TEST_DECODE_OFF) { + const AvxInterface *decoder = get_aom_decoder_by_name(global->codec->name); + aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING }; + aom_codec_dec_init(&stream->decoder, decoder->codec_interface(), &cfg, 0); + + if (strcmp(global->codec->name, "av1") == 0) { + AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_TILE_MODE, + stream->config.cfg.large_scale_tile); + ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_mode"); + + AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1D_SET_IS_ANNEXB, + stream->config.cfg.save_as_annexb); + ctx_exit_on_error(&stream->decoder, "Failed to set is_annexb"); + + AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_DECODE_TILE_ROW, + -1); + ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_row"); + + AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_DECODE_TILE_COL, + -1); + ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_col"); + } + } +#endif +} + +static void encode_frame(struct stream_state *stream, + struct AvxEncoderConfig *global, struct aom_image *img, + unsigned int frames_in) { + aom_codec_pts_t frame_start, next_frame_start; + struct aom_codec_enc_cfg *cfg = &stream->config.cfg; + struct aom_usec_timer timer; + + frame_start = + (cfg->g_timebase.den * (int64_t)(frames_in - 1) * global->framerate.den) / + cfg->g_timebase.num / global->framerate.num; + next_frame_start = + (cfg->g_timebase.den * (int64_t)(frames_in)*global->framerate.den) / + cfg->g_timebase.num / global->framerate.num; + + /* Scale if necessary */ + if (img) { + if ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) && + (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) { + if (img->fmt != AOM_IMG_FMT_I42016) { + fprintf(stderr, "%s can only scale 4:2:0 inputs\n", exec_name); + exit(EXIT_FAILURE); + } +#if CONFIG_LIBYUV + if (!stream->img) { + stream->img = + aom_img_alloc(NULL, AOM_IMG_FMT_I42016, cfg->g_w, cfg->g_h, 16); + } + I420Scale_16( + (uint16_t *)img->planes[AOM_PLANE_Y], img->stride[AOM_PLANE_Y] / 2, + (uint16_t *)img->planes[AOM_PLANE_U], img->stride[AOM_PLANE_U] / 2, + (uint16_t *)img->planes[AOM_PLANE_V], img->stride[AOM_PLANE_V] / 2, + img->d_w, img->d_h, (uint16_t *)stream->img->planes[AOM_PLANE_Y], + stream->img->stride[AOM_PLANE_Y] / 2, + (uint16_t *)stream->img->planes[AOM_PLANE_U], + stream->img->stride[AOM_PLANE_U] / 2, + (uint16_t *)stream->img->planes[AOM_PLANE_V], + stream->img->stride[AOM_PLANE_V] / 2, stream->img->d_w, + stream->img->d_h, kFilterBox); + img = stream->img; +#else + stream->encoder.err 
= 1;
+      ctx_exit_on_error(&stream->encoder,
+                        "Stream %d: Failed to encode frame.\n"
+                        "libyuv is required for scaling but is currently "
+                        "disabled.\n"
+                        "Be sure to specify -DCONFIG_LIBYUV=1 when running "
+                        "cmake.\n",
+                        stream->index);
+#endif
+    }
+  }
+  if (img && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
+    if (img->fmt != AOM_IMG_FMT_I420 && img->fmt != AOM_IMG_FMT_YV12) {
+      fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name);
+      exit(EXIT_FAILURE);
+    }
+#if CONFIG_LIBYUV
+    if (!stream->img)
+      stream->img =
+          aom_img_alloc(NULL, AOM_IMG_FMT_I420, cfg->g_w, cfg->g_h, 16);
+    I420Scale(
+        img->planes[AOM_PLANE_Y], img->stride[AOM_PLANE_Y],
+        img->planes[AOM_PLANE_U], img->stride[AOM_PLANE_U],
+        img->planes[AOM_PLANE_V], img->stride[AOM_PLANE_V], img->d_w, img->d_h,
+        stream->img->planes[AOM_PLANE_Y], stream->img->stride[AOM_PLANE_Y],
+        stream->img->planes[AOM_PLANE_U], stream->img->stride[AOM_PLANE_U],
+        stream->img->planes[AOM_PLANE_V], stream->img->stride[AOM_PLANE_V],
+        stream->img->d_w, stream->img->d_h, kFilterBox);
+    img = stream->img;
+#else
+    stream->encoder.err = 1;
+    ctx_exit_on_error(&stream->encoder,
+                      "Stream %d: Failed to encode frame.\n"
+                      "libyuv is required for scaling but is currently "
+                      "disabled.\n"
+                      "Be sure to specify -DCONFIG_LIBYUV=1 when running "
+                      "cmake.\n",
+                      stream->index);
+#endif
+  }
+
+  aom_usec_timer_start(&timer);
+  aom_codec_encode(&stream->encoder, img, frame_start,
+                   (uint32_t)(next_frame_start - frame_start), 0);
+  aom_usec_timer_mark(&timer);
+  stream->cx_time += aom_usec_timer_elapsed(&timer);
+  ctx_exit_on_error(&stream->encoder, "Stream %d: Failed to encode frame",
+                    stream->index);
+}
+
+static void update_quantizer_histogram(struct stream_state *stream) {
+  if (stream->config.cfg.g_pass != AOM_RC_FIRST_PASS) {
+    int q;
+
+    AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AOME_GET_LAST_QUANTIZER_64,
+                                  &q);
+    ctx_exit_on_error(&stream->encoder, "Failed to read quantizer");
+    stream->counts[q]++;
+  }
+}
+
+static void get_cx_data(struct stream_state *stream,
+                        struct AvxEncoderConfig *global, int *got_data) {
+  const aom_codec_cx_pkt_t *pkt;
+  const struct aom_codec_enc_cfg *cfg = &stream->config.cfg;
+  aom_codec_iter_t iter = NULL;
+
+  *got_data = 0;
+  while ((pkt = aom_codec_get_cx_data(&stream->encoder, &iter))) {
+    static size_t fsize = 0;
+    static FileOffset ivf_header_pos = 0;
+
+    switch (pkt->kind) {
+      case AOM_CODEC_CX_FRAME_PKT:
+        ++stream->frames_out;
+        if (!global->quiet)
+          fprintf(stderr, " %6luF", (unsigned long)pkt->data.frame.sz);
+
+        update_rate_histogram(stream->rate_hist, cfg, pkt);
+#if CONFIG_WEBM_IO
+        if (stream->config.write_webm) {
+          if (write_webm_block(&stream->webm_ctx, cfg, pkt) != 0) {
+            fatal("WebM writer failed.");
+          }
+        }
+#endif
+        if (!stream->config.write_webm) {
+          if (stream->config.write_ivf) {
+            if (pkt->data.frame.partition_id <= 0) {
+              ivf_header_pos = ftello(stream->file);
+              fsize = pkt->data.frame.sz;
+
+              ivf_write_frame_header(stream->file, pkt->data.frame.pts, fsize);
+            } else {
+              fsize += pkt->data.frame.sz;
+
+              const FileOffset currpos = ftello(stream->file);
+              fseeko(stream->file, ivf_header_pos, SEEK_SET);
+              ivf_write_frame_size(stream->file, fsize);
+              fseeko(stream->file, currpos, SEEK_SET);
+            }
+          }
+
+          (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+                       stream->file);
+        }
+        stream->nbytes += pkt->data.raw.sz;
+
+        *got_data = 1;
+#if CONFIG_AV1_DECODER
+        if (global->test_decode != TEST_DECODE_OFF && !stream->mismatch_seen) {
+          aom_codec_decode(&stream->decoder, pkt->data.frame.buf,
+                           pkt->data.frame.sz, NULL);
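+          // A decode failure here sets stream->decoder.err; it is checked just
+          // below, and the frame index is recorded in mismatch_seen so that
+          // test decoding is skipped for the rest of this stream.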
+ if (stream->decoder.err) { + warn_or_exit_on_error(&stream->decoder, + global->test_decode == TEST_DECODE_FATAL, + "Failed to decode frame %d in stream %d", + stream->frames_out + 1, stream->index); + stream->mismatch_seen = stream->frames_out + 1; + } + } +#endif + break; + case AOM_CODEC_STATS_PKT: + stream->frames_out++; + stats_write(&stream->stats, pkt->data.twopass_stats.buf, + pkt->data.twopass_stats.sz); + stream->nbytes += pkt->data.raw.sz; + break; + case AOM_CODEC_PSNR_PKT: + + if (global->show_psnr) { + int i; + + stream->psnr_sse_total += pkt->data.psnr.sse[0]; + stream->psnr_samples_total += pkt->data.psnr.samples[0]; + for (i = 0; i < 4; i++) { + if (!global->quiet) + fprintf(stderr, "%.3f ", pkt->data.psnr.psnr[i]); + stream->psnr_totals[i] += pkt->data.psnr.psnr[i]; + } + stream->psnr_count++; + } + + break; + default: break; + } + } +} + +static void show_psnr(struct stream_state *stream, double peak, int64_t bps) { + int i; + double ovpsnr; + + if (!stream->psnr_count) return; + + fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index); + ovpsnr = sse_to_psnr((double)stream->psnr_samples_total, peak, + (double)stream->psnr_sse_total); + fprintf(stderr, " %.3f", ovpsnr); + + for (i = 0; i < 4; i++) { + fprintf(stderr, " %.3f", stream->psnr_totals[i] / stream->psnr_count); + } + if (bps > 0) { + fprintf(stderr, " %7" PRId64 " bps", bps); + } + fprintf(stderr, " %7" PRId64 " ms", stream->cx_time / 1000); + fprintf(stderr, "\n"); +} + +static float usec_to_fps(uint64_t usec, unsigned int frames) { + return (float)(usec > 0 ? frames * 1000000.0 / (float)usec : 0); +} + +static void test_decode(struct stream_state *stream, + enum TestDecodeFatality fatal) { + aom_image_t enc_img, dec_img; + + if (stream->mismatch_seen) return; + + /* Get the internal reference frame */ + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1_GET_NEW_FRAME_IMAGE, + &enc_img); + AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_GET_NEW_FRAME_IMAGE, + &dec_img); + + if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) != + (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) { + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t enc_hbd_img; + aom_img_alloc(&enc_hbd_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, + enc_img.d_w, enc_img.d_h, 16); + aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img); + enc_img = enc_hbd_img; + } + if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t dec_hbd_img; + aom_img_alloc(&dec_hbd_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, + dec_img.d_w, dec_img.d_h, 16); + aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img); + dec_img = dec_hbd_img; + } + } + + ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame"); + ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame"); + + if (!aom_compare_img(&enc_img, &dec_img)) { + int y[4], u[4], v[4]; + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_find_mismatch_high(&enc_img, &dec_img, y, u, v); + } else { + aom_find_mismatch(&enc_img, &dec_img, y, u, v); + } + stream->decoder.err = 1; + warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL, + "Stream %d: Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, %d] {%d/%d}," + " V[%d, %d] {%d/%d}", + stream->index, stream->frames_out, y[0], y[1], y[2], + y[3], u[0], u[1], u[2], u[3], v[0], v[1], v[2], v[3]); + stream->mismatch_seen = stream->frames_out; + } + + aom_img_free(&enc_img); + aom_img_free(&dec_img); +} + +static void print_time(const char *label, int64_t etl) { + int64_t hours; + int64_t 
mins;
+  int64_t secs;
+
+  if (etl >= 0) {
+    hours = etl / 3600;
+    etl -= hours * 3600;
+    mins = etl / 60;
+    etl -= mins * 60;
+    secs = etl;
+
+    fprintf(stderr, "[%3s %2" PRId64 ":%02" PRId64 ":%02" PRId64 "] ", label,
+            hours, mins, secs);
+  } else {
+    fprintf(stderr, "[%3s unknown] ", label);
+  }
+}
+
+int main(int argc, const char **argv_) {
+  int pass;
+  aom_image_t raw;
+  aom_image_t raw_shift;
+  int allocated_raw_shift = 0;
+  int use_16bit_internal = 0;
+  int input_shift = 0;
+  int frame_avail, got_data;
+
+  struct AvxInputContext input;
+  struct AvxEncoderConfig global;
+  struct stream_state *streams = NULL;
+  char **argv, **argi;
+  uint64_t cx_time = 0;
+  int stream_cnt = 0;
+  int res = 0;
+  int profile_updated = 0;
+
+  memset(&input, 0, sizeof(input));
+  exec_name = argv_[0];
+
+  /* Setup default input stream settings */
+  input.framerate.numerator = 30;
+  input.framerate.denominator = 1;
+  input.only_i420 = 1;
+  input.bit_depth = 0;
+
+  /* First parse the global configuration values, because we want to apply
+   * other parameters on top of the default configuration provided by the
+   * codec.
+   */
+  argv = argv_dup(argc - 1, argv_ + 1);
+  parse_global_config(&global, &argv);
+
+  if (argc < 2) usage_exit();
+
+  switch (global.color_type) {
+    case I420: input.fmt = AOM_IMG_FMT_I420; break;
+    case I422: input.fmt = AOM_IMG_FMT_I422; break;
+    case I444: input.fmt = AOM_IMG_FMT_I444; break;
+    case YV12: input.fmt = AOM_IMG_FMT_YV12; break;
+  }
+
+  {
+    /* Now parse each stream's parameters. Using a local scope here
+     * due to the use of 'stream' as loop variable in FOREACH_STREAM
+     * loops
+     */
+    struct stream_state *stream = NULL;
+
+    do {
+      stream = new_stream(&global, stream);
+      stream_cnt++;
+      if (!streams) streams = stream;
+    } while (parse_stream_params(&global, stream, argv));
+  }
+
+  /* Check for unrecognized options */
+  for (argi = argv; *argi; argi++)
+    if (argi[0][0] == '-' && argi[0][1])
+      die("Error: Unrecognized option %s\n", *argi);
+
+  FOREACH_STREAM(stream, streams) {
+    check_encoder_config(global.disable_warning_prompt, &global,
+                         &stream->config.cfg);
+
+    // If large_scale_tile = 1, only IVF output is supported.
+    if (stream->config.cfg.large_scale_tile && !stream->config.write_ivf)
+      die("Only the IVF output format is supported when large-scale-tile=1\n");
+  }
+
+  /* Handle non-option arguments */
+  input.filename = argv[0];
+
+  if (!input.filename) {
+    fprintf(stderr, "No input file specified!\n");
+    usage_exit();
+  }
+
+  /* Decide if chroma subsamplings other than 4:2:0 are supported */
+  if (global.codec->fourcc == AV1_FOURCC) input.only_i420 = 0;
+
+  for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
+    int frames_in = 0, seen_frames = 0;
+    int64_t estimated_time_left = -1;
+    int64_t average_rate = -1;
+    int64_t lagged_count = 0;
+
+    open_input_file(&input, global.csp);
+
+    /* If the input file doesn't specify its w/h (raw files), try to get
+     * the data from the first stream's configuration.
+     */
+    if (!input.width || !input.height) {
+      FOREACH_STREAM(stream, streams) {
+        if (stream->config.cfg.g_w && stream->config.cfg.g_h) {
+          input.width = stream->config.cfg.g_w;
+          input.height = stream->config.cfg.g_h;
+          break;
+        }
+      };
+    }
+
+    /* Update stream configurations from the input file's parameters */
+    if (!input.width || !input.height)
+      fatal(
+          "Specify stream dimensions with --width (-w) "
+          "and --height (-h)");
+
+    /* If the input file does not specify bit-depth but the input-bit-depth
+     * parameter exists, assume that to be the input bit-depth. However, if the
+     * input-bit-depth parameter does not exist, assume the input bit-depth
+     * to be the same as the codec bit-depth.
+     */
+    if (!input.bit_depth) {
+      FOREACH_STREAM(stream, streams) {
+        if (stream->config.cfg.g_input_bit_depth)
+          input.bit_depth = stream->config.cfg.g_input_bit_depth;
+        else
+          input.bit_depth = stream->config.cfg.g_input_bit_depth =
+              (int)stream->config.cfg.g_bit_depth;
+      }
+      if (input.bit_depth > 8) input.fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+    } else {
+      FOREACH_STREAM(stream, streams) {
+        stream->config.cfg.g_input_bit_depth = input.bit_depth;
+      }
+    }
+
+    FOREACH_STREAM(stream, streams) {
+      if (input.fmt != AOM_IMG_FMT_I420 && input.fmt != AOM_IMG_FMT_I42016) {
+        /* Automatically upgrade if input is non-4:2:0 but a 4:2:0 profile
+           was selected. */
+        switch (stream->config.cfg.g_profile) {
+          case 0:
+            if (input.bit_depth < 12 && (input.fmt == AOM_IMG_FMT_I444 ||
+                                         input.fmt == AOM_IMG_FMT_I44416)) {
+              if (!stream->config.cfg.monochrome) {
+                stream->config.cfg.g_profile = 1;
+                profile_updated = 1;
+              }
+            } else if (input.bit_depth == 12 || input.fmt == AOM_IMG_FMT_I422 ||
+                       input.fmt == AOM_IMG_FMT_I42216) {
+              stream->config.cfg.g_profile = 2;
+              profile_updated = 1;
+            }
+            break;
+          case 1:
+            if (input.bit_depth == 12 || input.fmt == AOM_IMG_FMT_I422 ||
+                input.fmt == AOM_IMG_FMT_I42216) {
+              stream->config.cfg.g_profile = 2;
+              profile_updated = 1;
+            } else if (input.bit_depth < 12 &&
+                       (input.fmt == AOM_IMG_FMT_I420 ||
+                        input.fmt == AOM_IMG_FMT_I42016)) {
+              stream->config.cfg.g_profile = 0;
+              profile_updated = 1;
+            }
+            break;
+          case 2:
+            if (input.bit_depth < 12 && (input.fmt == AOM_IMG_FMT_I444 ||
+                                         input.fmt == AOM_IMG_FMT_I44416)) {
+              stream->config.cfg.g_profile = 1;
+              profile_updated = 1;
+            } else if (input.bit_depth < 12 &&
+                       (input.fmt == AOM_IMG_FMT_I420 ||
+                        input.fmt == AOM_IMG_FMT_I42016)) {
+              stream->config.cfg.g_profile = 0;
+              profile_updated = 1;
+            } else if (input.bit_depth == 12 &&
+                       input.file_type == FILE_TYPE_Y4M) {
+              // Note that here the input file values for chroma subsampling
+              // are used instead of those from the command line.
+              AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                            AV1E_SET_CHROMA_SUBSAMPLING_X,
+                                            input.y4m.dst_c_dec_h >> 1);
+              AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                            AV1E_SET_CHROMA_SUBSAMPLING_Y,
+                                            input.y4m.dst_c_dec_v >> 1);
+            } else if (input.bit_depth == 12 &&
+                       input.file_type == FILE_TYPE_RAW) {
+              AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                            AV1E_SET_CHROMA_SUBSAMPLING_X,
+                                            stream->chroma_subsampling_x);
+              AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                            AV1E_SET_CHROMA_SUBSAMPLING_Y,
+                                            stream->chroma_subsampling_y);
+            }
+            break;
+          default: break;
+        }
+      }
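+      // In short (a summary of the switch above, for illustration): 4:4:4
+      // input below 12 bit selects profile 1, 4:2:2 input or any 12-bit input
+      // selects profile 2, and 4:2:0 input below 12 bit falls back to
+      // profile 0.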
+      /* Automatically set the codec bit depth to match the input bit depth.
+       * Upgrade the profile if required.
+       */
+      if (stream->config.cfg.g_input_bit_depth >
+          (unsigned int)stream->config.cfg.g_bit_depth) {
+        stream->config.cfg.g_bit_depth = stream->config.cfg.g_input_bit_depth;
+        if (!global.quiet) {
+          fprintf(stderr,
+                  "Warning: automatically updating bit depth to %d to "
+                  "match input format.\n",
+                  stream->config.cfg.g_input_bit_depth);
+        }
+      }
+      if (stream->config.cfg.g_bit_depth > 10) {
+        switch (stream->config.cfg.g_profile) {
+          case 0:
+          case 1:
+            stream->config.cfg.g_profile = 2;
+            profile_updated = 1;
+            break;
+          default: break;
+        }
+      }
+      if (stream->config.cfg.g_bit_depth > 8) {
+        stream->config.use_16bit_internal = 1;
+      }
+      if (profile_updated && !global.quiet) {
+        fprintf(stderr,
+                "Warning: automatically updating to profile %d to "
+                "match input format.\n",
+                stream->config.cfg.g_profile);
+      }
+      /* Set limit */
+      stream->config.cfg.g_limit = global.limit;
+    }
+
+    FOREACH_STREAM(stream, streams) {
+      set_stream_dimensions(stream, input.width, input.height);
+    }
+    FOREACH_STREAM(stream, streams) { validate_stream_config(stream, &global); }
+
+    /* Ensure that --passes and --pass are consistent. If --pass is set and
+     * --passes=2, ensure --fpf was set.
+     */
+    if (global.pass && global.passes == 2) {
+      FOREACH_STREAM(stream, streams) {
+        if (!stream->config.stats_fn)
+          die("Stream %d: Must specify --fpf when --pass=%d"
+              " and --passes=2\n",
+              stream->index, global.pass);
+      }
+    }
+
+#if !CONFIG_WEBM_IO
+    FOREACH_STREAM(stream, streams) {
+      if (stream->config.write_webm) {
+        stream->config.write_webm = 0;
+        stream->config.write_ivf = 0;
+        warn("aomenc compiled w/o WebM support. Writing OBU stream.");
+      }
+    }
+#endif
+
+    /* Use the frame rate from the file only if none was specified
+     * on the command-line.
+     */
+    if (!global.have_framerate) {
+      global.framerate.num = input.framerate.numerator;
+      global.framerate.den = input.framerate.denominator;
+    }
+    FOREACH_STREAM(stream, streams) {
+      stream->config.cfg.g_timebase.den = global.framerate.num;
+      stream->config.cfg.g_timebase.num = global.framerate.den;
+    }
+    /* Show configuration */
+    if (global.verbose && pass == 0) {
+      FOREACH_STREAM(stream, streams) {
+        show_stream_config(stream, &global, &input);
+      }
+    }
+
+    if (pass == (global.pass ? global.pass - 1 : 0)) {
+      if (input.file_type == FILE_TYPE_Y4M)
+        /* The Y4M reader does its own allocation.
+           Just initialize this here to avoid problems if we never read any
+           frames. */
+        memset(&raw, 0, sizeof(raw));
+      else
+        aom_img_alloc(&raw, input.fmt, input.width, input.height, 32);
+
+      FOREACH_STREAM(stream, streams) {
+        stream->rate_hist =
+            init_rate_histogram(&stream->config.cfg, &global.framerate);
+      }
+    }
+
+    FOREACH_STREAM(stream, streams) { setup_pass(stream, &global, pass); }
+    FOREACH_STREAM(stream, streams) { initialize_encoder(stream, &global); }
+    FOREACH_STREAM(stream, streams) {
+      open_output_file(stream, &global, &input.pixel_aspect_ratio);
+    }
+
+    if (strcmp(global.codec->name, "av1") == 0) {
+      // Check to see if at least one stream uses 16 bit internal.
+      // Currently assume that the bit_depths for all streams using
+      // highbitdepth are the same.
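+      // For example (values for illustration only): g_bit_depth = 10 with
+      // g_input_bit_depth = 8 gives input_shift = 2 below, so each 8-bit
+      // input sample is up-shifted by two bits before encoding.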
+ FOREACH_STREAM(stream, streams) { + if (stream->config.use_16bit_internal) { + use_16bit_internal = 1; + } + input_shift = (int)stream->config.cfg.g_bit_depth - + stream->config.cfg.g_input_bit_depth; + }; + } + + frame_avail = 1; + got_data = 0; + + while (frame_avail || got_data) { + struct aom_usec_timer timer; + + if (!global.limit || frames_in < global.limit) { + frame_avail = read_frame(&input, &raw); + + if (frame_avail) frames_in++; + seen_frames = + frames_in > global.skip_frames ? frames_in - global.skip_frames : 0; + + if (!global.quiet) { + float fps = usec_to_fps(cx_time, seen_frames); + fprintf(stderr, "\rPass %d/%d ", pass + 1, global.passes); + + if (stream_cnt == 1) + fprintf(stderr, "frame %4d/%-4d %7" PRId64 "B ", frames_in, + streams->frames_out, (int64_t)streams->nbytes); + else + fprintf(stderr, "frame %4d ", frames_in); + + fprintf(stderr, "%7" PRId64 " %s %.2f %s ", + cx_time > 9999999 ? cx_time / 1000 : cx_time, + cx_time > 9999999 ? "ms" : "us", fps >= 1.0 ? fps : fps * 60, + fps >= 1.0 ? "fps" : "fpm"); + print_time("ETA", estimated_time_left); + } + + } else { + frame_avail = 0; + } + + if (frames_in > global.skip_frames) { + aom_image_t *frame_to_encode; + if (input_shift || (use_16bit_internal && input.bit_depth == 8)) { + assert(use_16bit_internal); + // Input bit depth and stream bit depth do not match, so up + // shift frame to stream bit depth + if (!allocated_raw_shift) { + aom_img_alloc(&raw_shift, raw.fmt | AOM_IMG_FMT_HIGHBITDEPTH, + input.width, input.height, 32); + allocated_raw_shift = 1; + } + aom_img_upshift(&raw_shift, &raw, input_shift); + frame_to_encode = &raw_shift; + } else { + frame_to_encode = &raw; + } + aom_usec_timer_start(&timer); + if (use_16bit_internal) { + assert(frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH); + FOREACH_STREAM(stream, streams) { + if (stream->config.use_16bit_internal) + encode_frame(stream, &global, + frame_avail ? frame_to_encode : NULL, frames_in); + else + assert(0); + }; + } else { + assert((frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH) == 0); + FOREACH_STREAM(stream, streams) { + encode_frame(stream, &global, frame_avail ? frame_to_encode : NULL, + frames_in); + } + } + aom_usec_timer_mark(&timer); + cx_time += aom_usec_timer_elapsed(&timer); + + FOREACH_STREAM(stream, streams) { update_quantizer_histogram(stream); } + + got_data = 0; + FOREACH_STREAM(stream, streams) { + get_cx_data(stream, &global, &got_data); + } + + if (!got_data && input.length && streams != NULL && + !streams->frames_out) { + lagged_count = global.limit ? seen_frames : ftello(input.file); + } else if (input.length) { + int64_t remaining; + int64_t rate; + + if (global.limit) { + const int64_t frame_in_lagged = (seen_frames - lagged_count) * 1000; + + rate = cx_time ? frame_in_lagged * (int64_t)1000000 / cx_time : 0; + remaining = 1000 * (global.limit - global.skip_frames - + seen_frames + lagged_count); + } else { + const int64_t input_pos = ftello(input.file); + const int64_t input_pos_lagged = input_pos - lagged_count; + const int64_t input_limit = input.length; + + rate = cx_time ? input_pos_lagged * (int64_t)1000000 / cx_time : 0; + remaining = input_limit - input_pos + lagged_count; + } + + average_rate = + (average_rate <= 0) ? rate : (average_rate * 7 + rate) / 8; + estimated_time_left = average_rate ? 
remaining / average_rate : -1; + } + + if (got_data && global.test_decode != TEST_DECODE_OFF) { + FOREACH_STREAM(stream, streams) { + test_decode(stream, global.test_decode); + } + } + } + + fflush(stdout); + if (!global.quiet) fprintf(stderr, "\033[K"); + } + + if (stream_cnt > 1) fprintf(stderr, "\n"); + + if (!global.quiet) { + FOREACH_STREAM(stream, streams) { + const int64_t bpf = + seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0; + const int64_t bps = bpf * global.framerate.num / global.framerate.den; + fprintf(stderr, + "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 + "b/f %7" PRId64 + "b/s" + " %7" PRId64 " %s (%.2f fps)\033[K\n", + pass + 1, global.passes, frames_in, stream->frames_out, + (int64_t)stream->nbytes, bpf, bps, + stream->cx_time > 9999999 ? stream->cx_time / 1000 + : stream->cx_time, + stream->cx_time > 9999999 ? "ms" : "us", + usec_to_fps(stream->cx_time, seen_frames)); + } + } + + if (global.show_psnr) { + if (global.codec->fourcc == AV1_FOURCC) { + FOREACH_STREAM(stream, streams) { + int64_t bps = 0; + if (stream->psnr_count && seen_frames && global.framerate.den) { + bps = (int64_t)stream->nbytes * 8 * (int64_t)global.framerate.num / + global.framerate.den / seen_frames; + } + show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1, + bps); + } + } else { + FOREACH_STREAM(stream, streams) { show_psnr(stream, 255.0, 0); } + } + } + + FOREACH_STREAM(stream, streams) { aom_codec_destroy(&stream->encoder); } + + if (global.test_decode != TEST_DECODE_OFF) { + FOREACH_STREAM(stream, streams) { aom_codec_destroy(&stream->decoder); } + } + + close_input_file(&input); + + if (global.test_decode == TEST_DECODE_FATAL) { + FOREACH_STREAM(stream, streams) { res |= stream->mismatch_seen; } + } + FOREACH_STREAM(stream, streams) { + close_output_file(stream, global.codec->fourcc); + } + + FOREACH_STREAM(stream, streams) { + stats_close(&stream->stats, global.passes - 1); + } + + if (global.pass) break; + } + + if (global.show_q_hist_buckets) { + FOREACH_STREAM(stream, streams) { + show_q_histogram(stream->counts, global.show_q_hist_buckets); + } + } + + if (global.show_rate_hist_buckets) { + FOREACH_STREAM(stream, streams) { + show_rate_histogram(stream->rate_hist, &stream->config.cfg, + global.show_rate_hist_buckets); + } + } + FOREACH_STREAM(stream, streams) { destroy_rate_histogram(stream->rate_hist); } + +#if CONFIG_INTERNAL_STATS + /* TODO(jkoleszar): This doesn't belong in this executable. Do it for now, + * to match some existing utilities. + */ + if (!(global.pass == 1 && global.passes == 2)) { + FOREACH_STREAM(stream, streams) { + FILE *f = fopen("opsnr.stt", "a"); + if (stream->mismatch_seen) { + fprintf(f, "First mismatch occurred in frame %d\n", + stream->mismatch_seen); + } else { + fprintf(f, "No mismatch detected in recon buffers\n"); + } + fclose(f); + } + } +#endif + + if (allocated_raw_shift) aom_img_free(&raw_shift); + aom_img_free(&raw); + free(argv); + free(streams); + return res ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/libs/libaom/src/apps/aomenc.h b/libs/libaom/src/apps/aomenc.h new file mode 100644 index 000000000..a38258b87 --- /dev/null +++ b/libs/libaom/src/apps/aomenc.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_APPS_AOMENC_H_ +#define AOM_APPS_AOMENC_H_ + +#include "aom/aom_codec.h" +#include "aom/aom_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum TestDecodeFatality { + TEST_DECODE_OFF, + TEST_DECODE_FATAL, + TEST_DECODE_WARN, +}; + +typedef enum { + I420, // 4:2:0 8+ bit-depth + I422, // 4:2:2 8+ bit-depth + I444, // 4:4:4 8+ bit-depth + YV12, // 4:2:0 with uv flipped, only 8-bit depth +} ColorInputType; + +struct AvxInterface; + +/* Configuration elements common to all streams. */ +struct AvxEncoderConfig { + const struct AvxInterface *codec; + int passes; + int pass; + unsigned int usage; + ColorInputType color_type; + int quiet; + int verbose; + int limit; + int skip_frames; + int show_psnr; + enum TestDecodeFatality test_decode; + int have_framerate; + struct aom_rational framerate; + int debug; + int show_q_hist_buckets; + int show_rate_hist_buckets; + int disable_warnings; + int disable_warning_prompt; + int experimental_bitstream; + aom_chroma_sample_position_t csp; + cfg_options_t encoder_config; +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_APPS_AOMENC_H_ diff --git a/libs/libaom/src/av1/av1.cmake b/libs/libaom/src/av1/av1.cmake new file mode 100644 index 000000000..2ab349630 --- /dev/null +++ b/libs/libaom/src/av1/av1.cmake @@ -0,0 +1,580 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. 
+# +if(AOM_AV1_AV1_CMAKE_) + return() +endif() # AOM_AV1_AV1_CMAKE_ +set(AOM_AV1_AV1_CMAKE_ 1) + +list(APPEND AOM_AV1_COMMON_SOURCES + "${AOM_ROOT}/av1/av1_iface_common.h" + "${AOM_ROOT}/av1/common/alloccommon.c" + "${AOM_ROOT}/av1/common/alloccommon.h" + "${AOM_ROOT}/av1/common/av1_common_int.h" + "${AOM_ROOT}/av1/common/av1_inv_txfm1d.c" + "${AOM_ROOT}/av1/common/av1_inv_txfm1d.h" + "${AOM_ROOT}/av1/common/av1_inv_txfm1d_cfg.h" + "${AOM_ROOT}/av1/common/av1_inv_txfm2d.c" + "${AOM_ROOT}/av1/common/av1_loopfilter.c" + "${AOM_ROOT}/av1/common/av1_loopfilter.h" + "${AOM_ROOT}/av1/common/av1_txfm.c" + "${AOM_ROOT}/av1/common/av1_txfm.h" + "${AOM_ROOT}/av1/common/blockd.c" + "${AOM_ROOT}/av1/common/blockd.h" + "${AOM_ROOT}/av1/common/cdef.c" + "${AOM_ROOT}/av1/common/cdef.h" + "${AOM_ROOT}/av1/common/cdef_block.c" + "${AOM_ROOT}/av1/common/cdef_block.h" + "${AOM_ROOT}/av1/common/cfl.c" + "${AOM_ROOT}/av1/common/cfl.h" + "${AOM_ROOT}/av1/common/common.h" + "${AOM_ROOT}/av1/common/common_data.h" + "${AOM_ROOT}/av1/common/convolve.c" + "${AOM_ROOT}/av1/common/convolve.h" + "${AOM_ROOT}/av1/common/debugmodes.c" + "${AOM_ROOT}/av1/common/entropy.c" + "${AOM_ROOT}/av1/common/entropy.h" + "${AOM_ROOT}/av1/common/entropymode.c" + "${AOM_ROOT}/av1/common/entropymode.h" + "${AOM_ROOT}/av1/common/entropymv.c" + "${AOM_ROOT}/av1/common/entropymv.h" + "${AOM_ROOT}/av1/common/enums.h" + "${AOM_ROOT}/av1/common/filter.h" + "${AOM_ROOT}/av1/common/frame_buffers.c" + "${AOM_ROOT}/av1/common/frame_buffers.h" + "${AOM_ROOT}/av1/common/idct.c" + "${AOM_ROOT}/av1/common/idct.h" + "${AOM_ROOT}/av1/common/mv.h" + "${AOM_ROOT}/av1/common/mvref_common.c" + "${AOM_ROOT}/av1/common/mvref_common.h" + "${AOM_ROOT}/av1/common/obu_util.c" + "${AOM_ROOT}/av1/common/obu_util.h" + "${AOM_ROOT}/av1/common/odintrin.c" + "${AOM_ROOT}/av1/common/odintrin.h" + "${AOM_ROOT}/av1/common/pred_common.c" + "${AOM_ROOT}/av1/common/pred_common.h" + "${AOM_ROOT}/av1/common/quant_common.c" + "${AOM_ROOT}/av1/common/quant_common.h" + "${AOM_ROOT}/av1/common/reconinter.c" + "${AOM_ROOT}/av1/common/reconinter.h" + "${AOM_ROOT}/av1/common/reconintra.c" + "${AOM_ROOT}/av1/common/reconintra.h" + "${AOM_ROOT}/av1/common/resize.c" + "${AOM_ROOT}/av1/common/resize.h" + "${AOM_ROOT}/av1/common/restoration.c" + "${AOM_ROOT}/av1/common/restoration.h" + "${AOM_ROOT}/av1/common/scale.c" + "${AOM_ROOT}/av1/common/scale.h" + "${AOM_ROOT}/av1/common/scan.c" + "${AOM_ROOT}/av1/common/scan.h" + "${AOM_ROOT}/av1/common/seg_common.c" + "${AOM_ROOT}/av1/common/seg_common.h" + "${AOM_ROOT}/av1/common/thread_common.c" + "${AOM_ROOT}/av1/common/thread_common.h" + "${AOM_ROOT}/av1/common/tile_common.c" + "${AOM_ROOT}/av1/common/tile_common.h" + "${AOM_ROOT}/av1/common/timing.c" + "${AOM_ROOT}/av1/common/timing.h" + "${AOM_ROOT}/av1/common/token_cdfs.h" + "${AOM_ROOT}/av1/common/txb_common.c" + "${AOM_ROOT}/av1/common/txb_common.h" + "${AOM_ROOT}/av1/common/warped_motion.c" + "${AOM_ROOT}/av1/common/warped_motion.h") + +if(CONFIG_LPF_MASK) + list(APPEND AOM_AV1_COMMON_SOURCES "${AOM_ROOT}/av1/common/loopfiltermask.c") +endif() + +list(APPEND AOM_AV1_DECODER_SOURCES + "${AOM_ROOT}/av1/av1_dx_iface.c" + "${AOM_ROOT}/av1/decoder/decodeframe.c" + "${AOM_ROOT}/av1/decoder/decodeframe.h" + "${AOM_ROOT}/av1/decoder/decodemv.c" + "${AOM_ROOT}/av1/decoder/decodemv.h" + "${AOM_ROOT}/av1/decoder/decoder.c" + "${AOM_ROOT}/av1/decoder/decoder.h" + "${AOM_ROOT}/av1/decoder/decodetxb.c" + "${AOM_ROOT}/av1/decoder/decodetxb.h" + "${AOM_ROOT}/av1/decoder/detokenize.c" + 
"${AOM_ROOT}/av1/decoder/detokenize.h" + "${AOM_ROOT}/av1/decoder/dthread.h" + "${AOM_ROOT}/av1/decoder/obu.h" + "${AOM_ROOT}/av1/decoder/obu.c") + +list(APPEND AOM_AV1_ENCODER_SOURCES + "${AOM_ROOT}/av1/av1_cx_iface.c" + "${AOM_ROOT}/av1/encoder/aq_complexity.c" + "${AOM_ROOT}/av1/encoder/aq_complexity.h" + "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c" + "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h" + "${AOM_ROOT}/av1/encoder/aq_variance.c" + "${AOM_ROOT}/av1/encoder/aq_variance.h" + "${AOM_ROOT}/av1/encoder/enc_enums.h" + "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.c" + "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.h" + "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d_cfg.h" + "${AOM_ROOT}/av1/encoder/av1_fwd_txfm2d.c" + "${AOM_ROOT}/av1/encoder/av1_multi_thread.c" + "${AOM_ROOT}/av1/encoder/av1_multi_thread.h" + "${AOM_ROOT}/av1/encoder/av1_quantize.c" + "${AOM_ROOT}/av1/encoder/av1_quantize.h" + "${AOM_ROOT}/av1/encoder/bitstream.c" + "${AOM_ROOT}/av1/encoder/bitstream.h" + "${AOM_ROOT}/av1/encoder/block.h" + "${AOM_ROOT}/av1/encoder/cnn.c" + "${AOM_ROOT}/av1/encoder/cnn.h" + "${AOM_ROOT}/av1/encoder/compound_type.c" + "${AOM_ROOT}/av1/encoder/compound_type.h" + "${AOM_ROOT}/av1/encoder/context_tree.c" + "${AOM_ROOT}/av1/encoder/context_tree.h" + "${AOM_ROOT}/av1/encoder/corner_detect.c" + "${AOM_ROOT}/av1/encoder/corner_detect.h" + "${AOM_ROOT}/av1/encoder/corner_match.c" + "${AOM_ROOT}/av1/encoder/corner_match.h" + "${AOM_ROOT}/av1/encoder/cost.c" + "${AOM_ROOT}/av1/encoder/cost.h" + "${AOM_ROOT}/av1/encoder/encodeframe.c" + "${AOM_ROOT}/av1/encoder/encodeframe.h" + "${AOM_ROOT}/av1/encoder/encodemb.c" + "${AOM_ROOT}/av1/encoder/encodemb.h" + "${AOM_ROOT}/av1/encoder/encodemv.c" + "${AOM_ROOT}/av1/encoder/encodemv.h" + "${AOM_ROOT}/av1/encoder/encode_strategy.c" + "${AOM_ROOT}/av1/encoder/encode_strategy.h" + "${AOM_ROOT}/av1/encoder/encoder.c" + "${AOM_ROOT}/av1/encoder/encoder.h" + "${AOM_ROOT}/av1/encoder/encodetxb.c" + "${AOM_ROOT}/av1/encoder/encodetxb.h" + "${AOM_ROOT}/av1/encoder/ethread.c" + "${AOM_ROOT}/av1/encoder/ethread.h" + "${AOM_ROOT}/av1/encoder/extend.c" + "${AOM_ROOT}/av1/encoder/extend.h" + "${AOM_ROOT}/av1/encoder/firstpass.c" + "${AOM_ROOT}/av1/encoder/firstpass.h" + "${AOM_ROOT}/av1/encoder/global_motion.c" + "${AOM_ROOT}/av1/encoder/global_motion.h" + "${AOM_ROOT}/av1/encoder/gop_structure.c" + "${AOM_ROOT}/av1/encoder/gop_structure.h" + "${AOM_ROOT}/av1/encoder/grain_test_vectors.h" + "${AOM_ROOT}/av1/encoder/hash.c" + "${AOM_ROOT}/av1/encoder/hash.h" + "${AOM_ROOT}/av1/encoder/hash_motion.c" + "${AOM_ROOT}/av1/encoder/hash_motion.h" + "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c" + "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h" + "${AOM_ROOT}/av1/encoder/interp_search.c" + "${AOM_ROOT}/av1/encoder/interp_search.h" + "${AOM_ROOT}/av1/encoder/level.c" + "${AOM_ROOT}/av1/encoder/level.h" + "${AOM_ROOT}/av1/encoder/lookahead.c" + "${AOM_ROOT}/av1/encoder/lookahead.h" + "${AOM_ROOT}/av1/encoder/mcomp.c" + "${AOM_ROOT}/av1/encoder/mcomp.h" + "${AOM_ROOT}/av1/encoder/ml.c" + "${AOM_ROOT}/av1/encoder/ml.h" + "${AOM_ROOT}/av1/encoder/model_rd.h" + "${AOM_ROOT}/av1/encoder/motion_search_facade.c" + "${AOM_ROOT}/av1/encoder/motion_search_facade.h" + "${AOM_ROOT}/av1/encoder/mv_prec.c" + "${AOM_ROOT}/av1/encoder/mv_prec.h" + "${AOM_ROOT}/av1/encoder/palette.c" + "${AOM_ROOT}/av1/encoder/palette.h" + "${AOM_ROOT}/av1/encoder/partition_strategy.h" + "${AOM_ROOT}/av1/encoder/partition_strategy.c" + "${AOM_ROOT}/av1/encoder/pass2_strategy.h" + "${AOM_ROOT}/av1/encoder/pass2_strategy.c" + 
"${AOM_ROOT}/av1/encoder/pickcdef.c" + "${AOM_ROOT}/av1/encoder/picklpf.c" + "${AOM_ROOT}/av1/encoder/picklpf.h" + "${AOM_ROOT}/av1/encoder/pickrst.c" + "${AOM_ROOT}/av1/encoder/pickrst.h" + "${AOM_ROOT}/av1/encoder/ransac.c" + "${AOM_ROOT}/av1/encoder/ransac.h" + "${AOM_ROOT}/av1/encoder/ratectrl.c" + "${AOM_ROOT}/av1/encoder/ratectrl.h" + "${AOM_ROOT}/av1/encoder/rd.c" + "${AOM_ROOT}/av1/encoder/rd.h" + "${AOM_ROOT}/av1/encoder/rdopt.c" + "${AOM_ROOT}/av1/encoder/nonrd_pickmode.c" + "${AOM_ROOT}/av1/encoder/rdopt.h" + "${AOM_ROOT}/av1/encoder/rdopt_data_defs.h" + "${AOM_ROOT}/av1/encoder/rdopt_utils.h" + "${AOM_ROOT}/av1/encoder/reconinter_enc.c" + "${AOM_ROOT}/av1/encoder/reconinter_enc.h" + "${AOM_ROOT}/av1/encoder/segmentation.c" + "${AOM_ROOT}/av1/encoder/segmentation.h" + "${AOM_ROOT}/av1/encoder/speed_features.c" + "${AOM_ROOT}/av1/encoder/speed_features.h" + "${AOM_ROOT}/av1/encoder/svc_layercontext.c" + "${AOM_ROOT}/av1/encoder/svc_layercontext.h" + "${AOM_ROOT}/av1/encoder/temporal_filter.c" + "${AOM_ROOT}/av1/encoder/temporal_filter.h" + "${AOM_ROOT}/av1/encoder/tokenize.c" + "${AOM_ROOT}/av1/encoder/tokenize.h" + "${AOM_ROOT}/av1/encoder/tpl_model.c" + "${AOM_ROOT}/av1/encoder/tpl_model.h" + "${AOM_ROOT}/av1/encoder/tx_search.c" + "${AOM_ROOT}/av1/encoder/tx_search.h" + "${AOM_ROOT}/av1/encoder/intra_mode_search.c" + "${AOM_ROOT}/av1/encoder/intra_mode_search.h" + "${AOM_ROOT}/av1/encoder/wedge_utils.c" + "${AOM_ROOT}/av1/encoder/var_based_part.c" + "${AOM_ROOT}/av1/encoder/var_based_part.h" + "${AOM_ROOT}/third_party/fastfeat/fast.c" + "${AOM_ROOT}/third_party/fastfeat/fast.h" + "${AOM_ROOT}/third_party/fastfeat/fast_9.c" + "${AOM_ROOT}/third_party/fastfeat/nonmax.c" + "${AOM_ROOT}/third_party/vector/vector.c" + "${AOM_ROOT}/third_party/vector/vector.h" + "${AOM_ROOT}/av1/encoder/dwt.c" + "${AOM_ROOT}/av1/encoder/dwt.h") + +if(CONFIG_TUNE_VMAF) + list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/tune_vmaf.c" + "${AOM_ROOT}/av1/encoder/tune_vmaf.h") +endif() + +list(APPEND AOM_AV1_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/av1/common/cdef_block_sse2.c" + "${AOM_ROOT}/av1/common/x86/cfl_sse2.c" + "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c" + "${AOM_ROOT}/av1/common/x86/convolve_sse2.c" + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse2.c" + "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c" + "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c" + "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h" + "${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse2.c") +endif() + +list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/av1/common/cdef_block_ssse3.c" + "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.c" + "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.h" + "${AOM_ROOT}/av1/common/x86/cfl_ssse3.c" + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c" + "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_ssse3.c" + "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c" + "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c") +endif() + +list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1 + "${AOM_ROOT}/av1/common/cdef_block_sse4.c" + "${AOM_ROOT}/av1/common/x86/av1_convolve_horiz_rs_sse4.c" + "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c" + "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.c" + 
"${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.h" + "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c" + "${AOM_ROOT}/av1/common/x86/intra_edge_sse4.c" + "${AOM_ROOT}/av1/common/x86/reconinter_sse4.c" + "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c" + "${AOM_ROOT}/av1/common/x86/warp_plane_sse4.c") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSE4_1 + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c") +endif() + +list(APPEND AOM_AV1_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/av1/common/cdef_block_avx2.c" + "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.c" + "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.h" + "${AOM_ROOT}/av1/common/x86/cfl_avx2.c" + "${AOM_ROOT}/av1/common/x86/convolve_2d_avx2.c" + "${AOM_ROOT}/av1/common/x86/convolve_avx2.c" + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c" + "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c" + "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_avx2.c" + "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_avx2.c" + "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c" + "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c" + "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c" + "${AOM_ROOT}/av1/common/x86/warp_plane_avx2.c" + "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c") +endif() + +list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm" + "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm") + +list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.h" + "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list( + REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c") +endif() + +list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c") + +list(APPEND AOM_AV1_ENCODER_ASM_SSSE3_X86_64 + "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm") + +list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm1d_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/rdopt_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/temporal_filter_constants.h" + "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/corner_match_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h" + 
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/rdopt_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/temporal_filter_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list( + REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c") +endif() + +list(APPEND AOM_AV1_ENCODER_INTRIN_NEON + "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_MSA + "${AOM_ROOT}/av1/encoder/mips/msa/error_msa.c" + "${AOM_ROOT}/av1/encoder/mips/msa/fdct4x4_msa.c" + "${AOM_ROOT}/av1/encoder/mips/msa/temporal_filter_msa.c") + +list(APPEND AOM_AV1_COMMON_INTRIN_NEON + "${AOM_ROOT}/av1/common/arm/av1_txfm_neon.c" + "${AOM_ROOT}/av1/common/arm/cfl_neon.c" + "${AOM_ROOT}/av1/common/arm/convolve_neon.c" + "${AOM_ROOT}/av1/common/arm/convolve_neon.h" + "${AOM_ROOT}/av1/common/arm/jnt_convolve_neon.c" + "${AOM_ROOT}/av1/common/arm/mem_neon.h" + "${AOM_ROOT}/av1/common/arm/transpose_neon.h" + "${AOM_ROOT}/av1/common/arm/blend_a64_hmask_neon.c" + "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c" + "${AOM_ROOT}/av1/common/arm/reconinter_neon.c" + "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c" + "${AOM_ROOT}/av1/common/arm/selfguided_neon.c" + "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c" + "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h" + "${AOM_ROOT}/av1/common/arm/warp_plane_neon.c" + "${AOM_ROOT}/av1/common/cdef_block_neon.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2 + "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c") + +list(APPEND AOM_AV1_COMMON_INTRIN_VSX "${AOM_ROOT}/av1/common/ppc/cfl_ppc.c") + +if(CONFIG_ACCOUNTING) + list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/accounting.c" + "${AOM_ROOT}/av1/decoder/accounting.h") +endif() + +if(CONFIG_INSPECTION) + list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/inspection.c" + "${AOM_ROOT}/av1/decoder/inspection.h") +endif() + +if(CONFIG_INTERNAL_STATS) + list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/blockiness.c") +endif() + +if(CONFIG_REALTIME_ONLY) + list(REMOVE_ITEM AOM_AV1_ENCODER_SOURCES + "${AOM_ROOT}/av1/encoder/cnn.c" + "${AOM_ROOT}/av1/encoder/cnn.h" + "${AOM_ROOT}/av1/encoder/firstpass.c" + "${AOM_ROOT}/av1/encoder/firstpass.h" + "${AOM_ROOT}/av1/encoder/gop_structure.c" + "${AOM_ROOT}/av1/encoder/gop_structure.h" + "${AOM_ROOT}/av1/encoder/misc_model_weights.h" + "${AOM_ROOT}/av1/encoder/partition_cnn_weights.h" + "${AOM_ROOT}/av1/encoder/partition_model_weights.h" + "${AOM_ROOT}/av1/encoder/pass2_strategy.c" + "${AOM_ROOT}/av1/encoder/temporal_filter.c" + "${AOM_ROOT}/av1/encoder/temporal_filter.h" + "${AOM_ROOT}/av1/encoder/temporal_filter_constants.h" + "${AOM_ROOT}/av1/encoder/tpl_model.c" + "${AOM_ROOT}/av1/encoder/tpl_model.h" + "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse4.c") +endif() + +# Setup AV1 common/decoder/encoder targets. The libaom target must exist before +# this function is called. 
+function(setup_av1_targets)
+  add_library(aom_av1_common OBJECT ${AOM_AV1_COMMON_SOURCES})
+  list(APPEND AOM_LIB_TARGETS aom_av1_common)
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_common>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_av1_common>)
+  endif()
+
+  if(CONFIG_AV1_DECODER)
+    add_library(aom_av1_decoder OBJECT ${AOM_AV1_DECODER_SOURCES})
+    set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_decoder)
+    target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_decoder>)
+    if(BUILD_SHARED_LIBS)
+      target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_av1_decoder>)
+    endif()
+  endif()
+
+  if(CONFIG_AV1_ENCODER)
+    add_library(aom_av1_encoder OBJECT ${AOM_AV1_ENCODER_SOURCES})
+    set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_encoder)
+    target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_encoder>)
+    if(BUILD_SHARED_LIBS)
+      target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_av1_encoder>)
+    endif()
+  endif()
+
+  if(HAVE_SSE2)
+    require_compiler_flag_nomsvc("-msse2" NO)
+    add_intrinsics_object_library("-msse2" "sse2" "aom_av1_common"
+                                  "AOM_AV1_COMMON_INTRIN_SSE2")
+    if(CONFIG_AV1_DECODER)
+      if(AOM_AV1_DECODER_ASM_SSE2)
+        add_asm_library("aom_av1_decoder_sse2" "AOM_AV1_DECODER_ASM_SSE2")
+      endif()
+
+      if(AOM_AV1_DECODER_INTRIN_SSE2)
+        add_intrinsics_object_library("-msse2" "sse2" "aom_av1_decoder"
+                                      "AOM_AV1_DECODER_INTRIN_SSE2")
+      endif()
+    endif()
+
+    if(CONFIG_AV1_ENCODER)
+      add_asm_library("aom_av1_encoder_sse2" "AOM_AV1_ENCODER_ASM_SSE2")
+      add_intrinsics_object_library("-msse2" "sse2" "aom_av1_encoder"
+                                    "AOM_AV1_ENCODER_INTRIN_SSE2")
+    endif()
+  endif()
+
+  if(HAVE_SSE3)
+    require_compiler_flag_nomsvc("-msse3" NO)
+    if(CONFIG_AV1_ENCODER)
+      add_intrinsics_object_library("-msse3" "sse3" "aom_av1_encoder"
+                                    "AOM_AV1_ENCODER_INTRIN_SSE3")
+    endif()
+  endif()
+
+  if(HAVE_SSSE3)
+    require_compiler_flag_nomsvc("-mssse3" NO)
+    add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_common"
+                                  "AOM_AV1_COMMON_INTRIN_SSSE3")
+
+    if(CONFIG_AV1_DECODER)
+      if(AOM_AV1_DECODER_INTRIN_SSSE3)
+        add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_decoder"
+                                      "AOM_AV1_DECODER_INTRIN_SSSE3")
+      endif()
+    endif()
+  endif()
+
+  if(HAVE_SSE4_1)
+    require_compiler_flag_nomsvc("-msse4.1" NO)
+    add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_common"
+                                  "AOM_AV1_COMMON_INTRIN_SSE4_1")
+
+    if(CONFIG_AV1_ENCODER)
+      if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+        add_asm_library("aom_av1_encoder_ssse3"
+                        "AOM_AV1_ENCODER_ASM_SSSE3_X86_64")
+      endif()
+
+      if(AOM_AV1_ENCODER_INTRIN_SSE4_1)
+        add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_encoder"
+                                      "AOM_AV1_ENCODER_INTRIN_SSE4_1")
+      endif()
+    endif()
+  endif()
+
+  if(HAVE_SSE4_2)
+    require_compiler_flag_nomsvc("-msse4.2" NO)
+    if(CONFIG_AV1_ENCODER)
+      if(AOM_AV1_ENCODER_INTRIN_SSE4_2)
+        add_intrinsics_object_library("-msse4.2" "sse42" "aom_av1_encoder"
+                                      "AOM_AV1_ENCODER_INTRIN_SSE4_2")
+      endif()
+    endif()
+  endif()
+
+  if(HAVE_AVX2)
+    require_compiler_flag_nomsvc("-mavx2" NO)
+    add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_common"
+                                  "AOM_AV1_COMMON_INTRIN_AVX2")
+
+    if(CONFIG_AV1_ENCODER)
+      add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_encoder"
+                                    "AOM_AV1_ENCODER_INTRIN_AVX2")
+    endif()
+  endif()
+
+  if(HAVE_NEON)
+    if(AOM_AV1_COMMON_INTRIN_NEON)
+      add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+                                    "aom_av1_common"
+                                    "AOM_AV1_COMMON_INTRIN_NEON")
+    endif()
+
+    if(AOM_AV1_ENCODER_INTRIN_NEON)
+      add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+                                    "aom_av1_encoder"
+                                    "AOM_AV1_ENCODER_INTRIN_NEON")
+    endif()
+  endif()
+
+  if(HAVE_VSX)
+    if(AOM_AV1_COMMON_INTRIN_VSX)
+      add_intrinsics_object_library("-mvsx -maltivec" "vsx" "aom_av1_common"
"AOM_AV1_COMMON_INTRIN_VSX") + endif() + endif() + + if(HAVE_MSA) + add_intrinsics_object_library("" "msa" "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_MSA") + endif() + + # Pass the new lib targets up to the parent scope instance of + # $AOM_LIB_TARGETS. + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) +endfunction() diff --git a/libs/libaom/src/av1/av1_cx_iface.c b/libs/libaom/src/av1/av1_cx_iface.c new file mode 100644 index 000000000..676eaa0ad --- /dev/null +++ b/libs/libaom/src/av1/av1_cx_iface.c @@ -0,0 +1,2936 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include + +#include "config/aom_config.h" +#include "config/aom_version.h" + +#include "aom_ports/aom_once.h" +#include "aom_ports/mem_ops.h" +#include "aom_ports/system_state.h" + +#include "aom/aom_encoder.h" +#include "aom/internal/aom_codec_internal.h" + +#include "av1/av1_iface_common.h" +#include "av1/encoder/bitstream.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/firstpass.h" + +#define MAG_SIZE (4) + +struct av1_extracfg { + int cpu_used; + unsigned int enable_auto_alt_ref; + unsigned int enable_auto_bwd_ref; + unsigned int noise_sensitivity; + unsigned int sharpness; + unsigned int static_thresh; + unsigned int row_mt; + unsigned int tile_columns; // log2 number of tile columns + unsigned int tile_rows; // log2 number of tile rows + unsigned int enable_tpl_model; + unsigned int enable_keyframe_filtering; + unsigned int arnr_max_frames; + unsigned int arnr_strength; + unsigned int min_gf_interval; + unsigned int max_gf_interval; + unsigned int gf_min_pyr_height; + unsigned int gf_max_pyr_height; + aom_tune_metric tuning; + const char *vmaf_model_path; + unsigned int cq_level; // constrained quality level + unsigned int rc_max_intra_bitrate_pct; + unsigned int rc_max_inter_bitrate_pct; + unsigned int gf_cbr_boost_pct; + unsigned int lossless; + unsigned int enable_cdef; + unsigned int enable_restoration; + unsigned int force_video_mode; + unsigned int enable_obmc; + unsigned int disable_trellis_quant; + unsigned int enable_qm; + unsigned int qm_y; + unsigned int qm_u; + unsigned int qm_v; + unsigned int qm_min; + unsigned int qm_max; + unsigned int num_tg; + unsigned int mtu_size; + + aom_timing_info_type_t timing_info_type; + unsigned int frame_parallel_decoding_mode; + int enable_dual_filter; + unsigned int enable_chroma_deltaq; + AQ_MODE aq_mode; + DELTAQ_MODE deltaq_mode; + int deltalf_mode; + unsigned int frame_periodic_boost; + aom_bit_depth_t bit_depth; + aom_tune_content content; + aom_color_primaries_t color_primaries; + aom_transfer_characteristics_t transfer_characteristics; + aom_matrix_coefficients_t matrix_coefficients; + aom_chroma_sample_position_t chroma_sample_position; + int color_range; + int render_width; + int render_height; + aom_superblock_size_t superblock_size; + unsigned int single_tile_decoding; + int error_resilient_mode; + int s_frame_mode; + + int film_grain_test_vector; + const char *film_grain_table_filename; + unsigned int motion_vector_unit_test; + unsigned int 
cdf_update_mode; + int enable_rect_partitions; // enable rectangular partitions for sequence + int enable_ab_partitions; // enable AB partitions for sequence + int enable_1to4_partitions; // enable 1:4 and 4:1 partitions for sequence + int min_partition_size; // min partition size [4,8,16,32,64,128] + int max_partition_size; // max partition size [4,8,16,32,64,128] + int enable_intra_edge_filter; // enable intra-edge filter for sequence + int enable_order_hint; // enable order hint for sequence + int enable_tx64; // enable 64-pt transform usage for sequence + int enable_flip_idtx; // enable flip and identity transform types + int enable_dist_wtd_comp; // enable dist wtd compound for sequence + int max_reference_frames; // maximum number of references per frame + int enable_reduced_reference_set; // enable reduced set of references + int enable_ref_frame_mvs; // sequence level + int allow_ref_frame_mvs; // frame level + int enable_masked_comp; // enable masked compound for sequence + int enable_onesided_comp; // enable one sided compound for sequence + int enable_interintra_comp; // enable interintra compound for sequence + int enable_smooth_interintra; // enable smooth interintra mode usage + int enable_diff_wtd_comp; // enable diff-wtd compound usage + int enable_interinter_wedge; // enable interinter-wedge compound usage + int enable_interintra_wedge; // enable interintra-wedge compound usage + int enable_global_motion; // enable global motion usage for sequence + int enable_warped_motion; // sequence level + int allow_warped_motion; // frame level + int enable_filter_intra; // enable filter intra for sequence + int enable_smooth_intra; // enable smooth intra modes for sequence + int enable_paeth_intra; // enable Paeth intra mode for sequence + int enable_cfl_intra; // enable CFL uv intra mode for sequence + int enable_superres; + int enable_overlay; // enable overlay for filtered arf frames + int enable_palette; + int enable_intrabc; + int enable_angle_delta; +#if CONFIG_DENOISE + float noise_level; + int noise_block_size; +#endif + + unsigned int chroma_subsampling_x; + unsigned int chroma_subsampling_y; + int reduced_tx_type_set; + int use_intra_dct_only; + int use_inter_dct_only; + int use_intra_default_tx_only; + int quant_b_adapt; + AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS]; + // Bit mask to specify which tier each of the 32 possible operating points + // conforms to. + unsigned int tier_mask; + // min_cr / 100 is the target minimum compression ratio for each frame. 
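+  // For example (hypothetical value, for illustration only): min_cr = 200
+  // requests a minimum compression ratio of 2, i.e. each coded frame should
+  // be at most half its raw size.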
+ unsigned int min_cr; + COST_UPDATE_TYPE coeff_cost_upd_freq; + COST_UPDATE_TYPE mode_cost_upd_freq; + COST_UPDATE_TYPE mv_cost_upd_freq; + unsigned int ext_tile_debug; + unsigned int sb_multipass_unit_test; +}; + +static struct av1_extracfg default_extra_cfg = { + 0, // cpu_used + 1, // enable_auto_alt_ref + 0, // enable_auto_bwd_ref + 0, // noise_sensitivity + 0, // sharpness + 0, // static_thresh + 1, // row_mt + 0, // tile_columns + 0, // tile_rows + 1, // enable_tpl_model + 1, // enable_keyframe_filtering + 7, // arnr_max_frames + 5, // arnr_strength + 0, // min_gf_interval; 0 -> default decision + 0, // max_gf_interval; 0 -> default decision + 0, // gf_min_pyr_height + 5, // gf_max_pyr_height + AOM_TUNE_PSNR, // tuning + "/usr/local/share/model/vmaf_v0.6.1.pkl", // VMAF model path + 10, // cq_level + 0, // rc_max_intra_bitrate_pct + 0, // rc_max_inter_bitrate_pct + 0, // gf_cbr_boost_pct + 0, // lossless + 1, // enable_cdef + 1, // enable_restoration + 0, // force_video_mode + 1, // enable_obmc + 3, // disable_trellis_quant + 0, // enable_qm + DEFAULT_QM_Y, // qm_y + DEFAULT_QM_U, // qm_u + DEFAULT_QM_V, // qm_v + DEFAULT_QM_FIRST, // qm_min + DEFAULT_QM_LAST, // qm_max + 1, // max number of tile groups + 0, // mtu_size + AOM_TIMING_UNSPECIFIED, // No picture timing signaling in bitstream + 0, // frame_parallel_decoding_mode + 1, // enable dual filter + 0, // enable delta quant in chroma planes + NO_AQ, // aq_mode + DELTA_Q_OBJECTIVE, // deltaq_mode + 0, // delta lf mode + 0, // frame_periodic_delta_q + AOM_BITS_8, // Bit depth + AOM_CONTENT_DEFAULT, // content + AOM_CICP_CP_UNSPECIFIED, // CICP color space + AOM_CICP_TC_UNSPECIFIED, // CICP transfer characteristics + AOM_CICP_MC_UNSPECIFIED, // CICP matrix coefficients + AOM_CSP_UNKNOWN, // chroma sample position + 0, // color range + 0, // render width + 0, // render height + AOM_SUPERBLOCK_SIZE_DYNAMIC, // superblock_size + 1, // this depends on large_scale_tile. + 0, // error_resilient_mode off by default. + 0, // s_frame_mode off by default. 
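+ // (Valid film_grain_test_vector values are 0..16; non-zero values select + // a built-in test grain table -- see the range check in validate_config() + // below.)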
+ 0, // film_grain_test_vector + 0, // film_grain_table_filename + 0, // motion_vector_unit_test + 1, // CDF update mode + 1, // enable rectangular partitions + 1, // enable ab shape partitions + 1, // enable 1:4 and 4:1 partitions + 4, // min_partition_size + 128, // max_partition_size + 1, // enable intra edge filter + 1, // frame order hint + 1, // enable 64-pt transform usage + 1, // enable flip and identity transform + 1, // dist-wtd compound + 7, // max_reference_frames + 0, // enable_reduced_reference_set + 1, // enable_ref_frame_mvs sequence level + 1, // allow ref_frame_mvs frame level + 1, // enable masked compound at sequence level + 1, // enable one sided compound at sequence level + 1, // enable interintra compound at sequence level + 1, // enable smooth interintra mode + 1, // enable difference-weighted compound + 1, // enable interinter wedge compound + 1, // enable interintra wedge compound + 1, // enable_global_motion usage + 1, // enable_warped_motion at sequence level + 1, // allow_warped_motion at frame level + 1, // enable filter intra at sequence level + 1, // enable smooth intra modes usage for sequence + 1, // enable Paeth intra mode usage for sequence + 1, // enable CFL uv intra mode usage for sequence + 1, // superres + 1, // enable overlay + 1, // enable palette + !CONFIG_SHARP_SETTINGS, // enable intrabc + 1, // enable angle delta +#if CONFIG_DENOISE + 0, // noise_level + 32, // noise_block_size +#endif + 0, // chroma_subsampling_x + 0, // chroma_subsampling_y + 0, // reduced_tx_type_set + 0, // use_intra_dct_only + 0, // use_inter_dct_only + 0, // use_intra_default_tx_only + 0, // quant_b_adapt + { + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + }, // target_seq_level_idx + 0, // tier_mask + 0, // min_cr + COST_UPD_SB, // coeff_cost_upd_freq + COST_UPD_SB, // mode_cost_upd_freq + COST_UPD_SB, // mv_cost_upd_freq + 0, // ext_tile_debug + 0, // sb_multipass_unit_test +}; + +struct aom_codec_alg_priv { + aom_codec_priv_t base; + aom_codec_enc_cfg_t cfg; + struct av1_extracfg extra_cfg; + aom_rational64_t timestamp_ratio; + aom_codec_pts_t pts_offset; + unsigned char pts_offset_initialized; + AV1EncoderConfig oxcf; + AV1_COMP *cpi; + unsigned char *cx_data; + size_t cx_data_sz; + unsigned char *pending_cx_data; + size_t pending_cx_data_sz; + int pending_frame_count; + size_t pending_frame_sizes[8]; + aom_image_t preview_img; + aom_enc_frame_flags_t next_frame_flags; + aom_codec_pkt_list_decl(256) pkt_list; + unsigned int fixed_kf_cntr; + // BufferPool that holds all reference frames. 
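+ // (Allocated in create_context_and_bufferpool() below.)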
+ BufferPool *buffer_pool; + + // lookahead instance variables + BufferPool *buffer_pool_lap; + AV1_COMP *cpi_lap; + FIRSTPASS_STATS *frame_stats_buffer; + // Number of stats buffers required for look ahead + int num_lap_buffers; + STATS_BUFFER_CTX stats_buf_context; +}; + +static INLINE int gcd(int64_t a, int b) { + int remainder; // remainder + while (b > 0) { + remainder = (int)(a % b); + a = b; + b = remainder; + } + + return (int)a; +} + +static INLINE void reduce_ratio(aom_rational64_t *ratio) { + const int denom = gcd(ratio->num, ratio->den); + ratio->num /= denom; + ratio->den /= denom; +} + +static aom_codec_err_t update_error_state( + aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) { + const aom_codec_err_t res = error->error_code; + + if (res != AOM_CODEC_OK) + ctx->base.err_detail = error->has_detail ? error->detail : NULL; + + return res; +} + +#undef ERROR +#define ERROR(str) \ + do { \ + ctx->base.err_detail = str; \ + return AOM_CODEC_INVALID_PARAM; \ + } while (0) + +#define RANGE_CHECK(p, memb, lo, hi) \ + do { \ + if (!((p)->memb >= (lo) && (p)->memb <= (hi))) \ + ERROR(#memb " out of range [" #lo ".." #hi "]"); \ + } while (0) + +#define RANGE_CHECK_HI(p, memb, hi) \ + do { \ + if (!((p)->memb <= (hi))) ERROR(#memb " out of range [.." #hi "]"); \ + } while (0) + +#define RANGE_CHECK_BOOL(p, memb) \ + do { \ + if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean"); \ + } while (0) + +static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, + const aom_codec_enc_cfg_t *cfg, + const struct av1_extracfg *extra_cfg) { + RANGE_CHECK(cfg, g_w, 1, 65535); // 16 bits available + RANGE_CHECK(cfg, g_h, 1, 65535); // 16 bits available + RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); + RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den); + RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1); + + RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); + RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); + RANGE_CHECK_BOOL(extra_cfg, lossless); + RANGE_CHECK_HI(extra_cfg, aq_mode, AQ_MODE_COUNT - 1); + RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTA_Q_MODE_COUNT - 1); + RANGE_CHECK_HI(extra_cfg, deltalf_mode, 1); + RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1); + RANGE_CHECK_HI(cfg, g_usage, 1); + RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS); + RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q); + RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); + RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100); + RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); + RANGE_CHECK(cfg, kf_mode, AOM_KF_DISABLED, AOM_KF_AUTO); + RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100); + RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_LAST_PASS); + if (cfg->g_pass == AOM_RC_ONE_PASS) { + RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_TOTAL_BUFFERS); + } else { + RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); + } + RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1); + RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1); + if (extra_cfg->max_gf_interval > 0) { + RANGE_CHECK(extra_cfg, max_gf_interval, + AOMMAX(2, extra_cfg->min_gf_interval), (MAX_LAG_BUFFERS - 1)); + } + RANGE_CHECK_HI(extra_cfg, gf_min_pyr_height, 5); + RANGE_CHECK_HI(extra_cfg, gf_max_pyr_height, 5); + if (extra_cfg->gf_min_pyr_height > extra_cfg->gf_max_pyr_height) { + ERROR( + "gf_min_pyr_height must be less than or equal to " + "gf_max_pyramid_height"); + } + + RANGE_CHECK_HI(cfg, rc_resize_mode, RESIZE_MODES - 1); + RANGE_CHECK(cfg, rc_resize_denominator, SCALE_NUMERATOR, + 
SCALE_NUMERATOR << 1); + RANGE_CHECK(cfg, rc_resize_kf_denominator, SCALE_NUMERATOR, + SCALE_NUMERATOR << 1); + RANGE_CHECK_HI(cfg, rc_superres_mode, SUPERRES_MODES - 1); + RANGE_CHECK(cfg, rc_superres_denominator, SCALE_NUMERATOR, + SCALE_NUMERATOR << 1); + RANGE_CHECK(cfg, rc_superres_kf_denominator, SCALE_NUMERATOR, + SCALE_NUMERATOR << 1); + RANGE_CHECK(cfg, rc_superres_qthresh, 1, 63); + RANGE_CHECK(cfg, rc_superres_kf_qthresh, 1, 63); + RANGE_CHECK_HI(extra_cfg, cdf_update_mode, 2); + + // AV1 does not support a lower bound on the keyframe interval in + // automatic keyframe placement mode. + if (cfg->kf_mode != AOM_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist && + cfg->kf_min_dist > 0) + ERROR( + "kf_min_dist not supported in auto mode, use 0 " + "or kf_max_dist instead."); + + RANGE_CHECK_HI(extra_cfg, motion_vector_unit_test, 2); + RANGE_CHECK_HI(extra_cfg, sb_multipass_unit_test, 1); + RANGE_CHECK_HI(extra_cfg, ext_tile_debug, 1); + RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 1); + RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2); + RANGE_CHECK(extra_cfg, cpu_used, 0, 8); + RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); + RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64, + AOM_SUPERBLOCK_SIZE_DYNAMIC); + RANGE_CHECK_HI(cfg, large_scale_tile, 1); + RANGE_CHECK_HI(extra_cfg, single_tile_decoding, 1); + + RANGE_CHECK_HI(extra_cfg, row_mt, 1); + + RANGE_CHECK_HI(extra_cfg, tile_columns, 6); + RANGE_CHECK_HI(extra_cfg, tile_rows, 6); + + RANGE_CHECK_HI(cfg, monochrome, 1); + + if (cfg->large_scale_tile && extra_cfg->aq_mode) + ERROR( + "Adaptive quantization is not supported in large scale tile " + "coding."); + + RANGE_CHECK_HI(extra_cfg, sharpness, 7); + RANGE_CHECK_HI(extra_cfg, arnr_max_frames, 15); + RANGE_CHECK_HI(extra_cfg, arnr_strength, 6); + RANGE_CHECK_HI(extra_cfg, cq_level, 63); + RANGE_CHECK(cfg, g_bit_depth, AOM_BITS_8, AOM_BITS_12); + RANGE_CHECK(cfg, g_input_bit_depth, 8, 12); + RANGE_CHECK(extra_cfg, content, AOM_CONTENT_DEFAULT, AOM_CONTENT_INVALID - 1); + + if (cfg->g_pass == AOM_RC_LAST_PASS) { + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); + const FIRSTPASS_STATS *stats; + + if (cfg->rc_twopass_stats_in.buf == NULL) + ERROR("rc_twopass_stats_in.buf not set."); + + if (cfg->rc_twopass_stats_in.sz % packet_sz) + ERROR("rc_twopass_stats_in.sz indicates truncated packet."); + + if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz) + ERROR("rc_twopass_stats_in requires at least two packets."); + + stats = + (const FIRSTPASS_STATS *)cfg->rc_twopass_stats_in.buf + n_packets - 1; + + if ((int)(stats->count + 0.5) != n_packets - 1) + ERROR("rc_twopass_stats_in missing EOS stats packet"); + } + + if (cfg->g_profile <= (unsigned int)PROFILE_1 && + cfg->g_bit_depth > AOM_BITS_10) { + ERROR("Codec bit-depth 12 not supported in profile < 2"); + } + if (cfg->g_profile <= (unsigned int)PROFILE_1 && + cfg->g_input_bit_depth > 10) { + ERROR("Source bit-depth 12 not supported in profile < 2"); + } + + if (cfg->rc_end_usage == AOM_Q) { + RANGE_CHECK_HI(cfg, use_fixed_qp_offsets, 1); + for (int i = 0; i < FIXED_QP_OFFSET_COUNT; ++i) { + RANGE_CHECK_HI(cfg, fixed_qp_offsets[i], 63); + } + } else { + if (cfg->use_fixed_qp_offsets > 0) { + ERROR("--use_fixed_qp_offsets can only be used with --end-usage=q"); + } + for (int i = 0; i < FIXED_QP_OFFSET_COUNT; ++i) { + if (cfg->fixed_qp_offsets[i] >= 0) { + ERROR("--fixed_qp_offsets can only be used with --end-usage=q"); + } + } + } + + 
RANGE_CHECK(extra_cfg, color_primaries, AOM_CICP_CP_BT_709, + AOM_CICP_CP_EBU_3213); // Need to check range more precisely to + // check for reserved values? + RANGE_CHECK(extra_cfg, transfer_characteristics, AOM_CICP_TC_BT_709, + AOM_CICP_TC_HLG); + RANGE_CHECK(extra_cfg, matrix_coefficients, AOM_CICP_MC_IDENTITY, + AOM_CICP_MC_ICTCP); + RANGE_CHECK(extra_cfg, color_range, 0, 1); + +#if !CONFIG_TUNE_VMAF + if (extra_cfg->tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING || + extra_cfg->tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || + extra_cfg->tuning == AOM_TUNE_VMAF_MAX_GAIN) { + ERROR( + "This error may be related to the wrong configuration options: try to " + "set -DCONFIG_TUNE_VMAF=1 at the time CMake is run."); + } +#endif + +#if CONFIG_TUNE_VMAF + RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_VMAF_MAX_GAIN); +#else + RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_SSIM); +#endif + + RANGE_CHECK(extra_cfg, timing_info_type, AOM_TIMING_UNSPECIFIED, + AOM_TIMING_DEC_MODEL); + + RANGE_CHECK(extra_cfg, film_grain_test_vector, 0, 16); + + if (extra_cfg->lossless) { + if (extra_cfg->aq_mode != 0) + ERROR("Only --aq_mode=0 can be used with --lossless=1."); + if (extra_cfg->enable_chroma_deltaq) + ERROR("Only --enable_chroma_deltaq=0 can be used with --lossless=1."); + } + + if (cfg->rc_resize_mode != RESIZE_NONE && + extra_cfg->aq_mode == CYCLIC_REFRESH_AQ) { + ERROR("--aq_mode=3 is only supported for --resize-mode=0."); + } + + RANGE_CHECK(extra_cfg, max_reference_frames, 3, 7); + RANGE_CHECK(extra_cfg, enable_reduced_reference_set, 0, 1); + RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1); + RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1); + + RANGE_CHECK_HI(extra_cfg, disable_trellis_quant, 3); + RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 2); + RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 2); + RANGE_CHECK(extra_cfg, mv_cost_upd_freq, 0, 3); + + RANGE_CHECK(extra_cfg, min_partition_size, 4, 128); + RANGE_CHECK(extra_cfg, max_partition_size, 4, 128); + RANGE_CHECK_HI(extra_cfg, min_partition_size, extra_cfg->max_partition_size); + + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + const int level_idx = extra_cfg->target_seq_level_idx[i]; + if (!is_valid_seq_level_idx(level_idx) && level_idx != SEQ_LEVELS) { + ERROR("Target sequence level index is invalid"); + } + } + + return AOM_CODEC_OK; +} + +static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx, + const aom_image_t *img) { + switch (img->fmt) { + case AOM_IMG_FMT_YV12: + case AOM_IMG_FMT_I420: + case AOM_IMG_FMT_YV1216: + case AOM_IMG_FMT_I42016: break; + case AOM_IMG_FMT_I444: + case AOM_IMG_FMT_I44416: + if (ctx->cfg.g_profile == (unsigned int)PROFILE_0 && + !ctx->cfg.monochrome) { + ERROR("Invalid image format. I444 images not supported in profile."); + } + break; + case AOM_IMG_FMT_I422: + case AOM_IMG_FMT_I42216: + if (ctx->cfg.g_profile != (unsigned int)PROFILE_2) { + ERROR("Invalid image format. I422 images not supported in profile."); + } + break; + default: + ERROR( + "Invalid image format. 
Only YV12, I420, I422, I444 images are " + "supported."); + break; + } + + if (img->d_w != ctx->cfg.g_w || img->d_h != ctx->cfg.g_h) + ERROR("Image size must match encoder init configuration size"); + + if (img->fmt != AOM_IMG_FMT_I420 && !ctx->extra_cfg.enable_tx64) { + ERROR("TX64 can only be disabled on I420 images."); + } + + return AOM_CODEC_OK; +} + +static int get_image_bps(const aom_image_t *img) { + switch (img->fmt) { + case AOM_IMG_FMT_YV12: + case AOM_IMG_FMT_I420: return 12; + case AOM_IMG_FMT_I422: return 16; + case AOM_IMG_FMT_I444: return 24; + case AOM_IMG_FMT_YV1216: + case AOM_IMG_FMT_I42016: return 24; + case AOM_IMG_FMT_I42216: return 32; + case AOM_IMG_FMT_I44416: return 48; + default: assert(0 && "Invalid image format"); break; + } + return 0; +} + +// Set appropriate options to disable frame super-resolution. +static void disable_superres(AV1EncoderConfig *const oxcf) { + oxcf->superres_mode = SUPERRES_NONE; + oxcf->superres_scale_denominator = SCALE_NUMERATOR; + oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR; + oxcf->superres_qthresh = 255; + oxcf->superres_kf_qthresh = 255; +} + +static void update_default_encoder_config(const cfg_options_t *cfg, + struct av1_extracfg *extra_cfg) { + extra_cfg->enable_cdef = (cfg->disable_cdef == 0); + extra_cfg->enable_restoration = (cfg->disable_lr == 0); + extra_cfg->superblock_size = (cfg->super_block_size == 64) + ? AOM_SUPERBLOCK_SIZE_64X64 + : (cfg->super_block_size == 128) + ? AOM_SUPERBLOCK_SIZE_128X128 + : AOM_SUPERBLOCK_SIZE_DYNAMIC; + extra_cfg->enable_warped_motion = (cfg->disable_warp_motion == 0); + extra_cfg->enable_dist_wtd_comp = (cfg->disable_dist_wtd_comp == 0); + extra_cfg->enable_diff_wtd_comp = (cfg->disable_diff_wtd_comp == 0); + extra_cfg->enable_dual_filter = (cfg->disable_dual_filter == 0); + extra_cfg->enable_angle_delta = (cfg->disable_intra_angle_delta == 0); + extra_cfg->enable_rect_partitions = (cfg->disable_rect_partition_type == 0); + extra_cfg->enable_ab_partitions = (cfg->disable_ab_partition_type == 0); + extra_cfg->enable_1to4_partitions = (cfg->disable_1to4_partition_type == 0); + extra_cfg->max_partition_size = cfg->max_partition_size; + extra_cfg->min_partition_size = cfg->min_partition_size; + extra_cfg->enable_intra_edge_filter = (cfg->disable_intra_edge_filter == 0); + extra_cfg->enable_tx64 = (cfg->disable_tx_64x64 == 0); + extra_cfg->enable_flip_idtx = (cfg->disable_flip_idtx == 0); + extra_cfg->enable_masked_comp = (cfg->disable_masked_comp == 0); + extra_cfg->enable_interintra_comp = (cfg->disable_inter_intra_comp == 0); + extra_cfg->enable_smooth_interintra = (cfg->disable_smooth_inter_intra == 0); + extra_cfg->enable_interinter_wedge = (cfg->disable_inter_inter_wedge == 0); + extra_cfg->enable_interintra_wedge = (cfg->disable_inter_intra_wedge == 0); + extra_cfg->enable_global_motion = (cfg->disable_global_motion == 0); + extra_cfg->enable_filter_intra = (cfg->disable_filter_intra == 0); + extra_cfg->enable_smooth_intra = (cfg->disable_smooth_intra == 0); + extra_cfg->enable_paeth_intra = (cfg->disable_paeth_intra == 0); + extra_cfg->enable_cfl_intra = (cfg->disable_cfl == 0); + extra_cfg->enable_obmc = (cfg->disable_obmc == 0); + extra_cfg->enable_palette = (cfg->disable_palette == 0); + extra_cfg->enable_intrabc = (cfg->disable_intrabc == 0); + extra_cfg->disable_trellis_quant = cfg->disable_trellis_quant; + extra_cfg->allow_ref_frame_mvs = (cfg->disable_ref_frame_mv == 0); + extra_cfg->enable_ref_frame_mvs = (cfg->disable_ref_frame_mv == 0); + 
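// Each "disable_*" switch from the config file is inverted into the + // corresponding "enable_*"/"allow_*" field of the extra config. +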
extra_cfg->enable_onesided_comp = (cfg->disable_one_sided_comp == 0); + extra_cfg->enable_reduced_reference_set = cfg->reduced_reference_set; + extra_cfg->reduced_tx_type_set = cfg->reduced_tx_type_set; +} + +static double convert_qp_offset(int cq_level, int q_offset, int bit_depth) { + const double base_q_val = av1_convert_qindex_to_q(cq_level, bit_depth); + const int new_q_index_offset = av1_quantizer_to_qindex(q_offset); + const int new_q_index = AOMMAX(cq_level - new_q_index_offset, 0); + const double new_q_val = av1_convert_qindex_to_q(new_q_index, bit_depth); + return (base_q_val - new_q_val); +} + +static double get_modeled_qp_offset(int cq_level, int level, int bit_depth) { + // 80% for keyframe was derived empirically. + // 40% similar to rc_pick_q_and_bounds_one_pass_vbr() for Q mode ARF. + // Rest derived similar to rc_pick_q_and_bounds_two_pass() + static const int percents[FIXED_QP_OFFSET_COUNT] = { 76, 60, 30, 15, 8 }; + const double q_val = av1_convert_qindex_to_q(cq_level, bit_depth); + return q_val * percents[level] / 100; +} + +static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf, + const aom_codec_enc_cfg_t *cfg, + struct av1_extracfg *extra_cfg) { + if (cfg->encoder_cfg.init_by_cfg_file) { + update_default_encoder_config(&cfg->encoder_cfg, extra_cfg); + } + + const int is_vbr = cfg->rc_end_usage == AOM_VBR; + oxcf->profile = cfg->g_profile; + oxcf->fwd_kf_enabled = cfg->fwd_kf_enabled; + oxcf->max_threads = (int)cfg->g_threads; + oxcf->mode = (cfg->g_usage == AOM_USAGE_REALTIME) ? REALTIME : GOOD; + oxcf->width = cfg->g_w; + oxcf->height = cfg->g_h; + oxcf->forced_max_frame_width = cfg->g_forced_max_frame_width; + oxcf->forced_max_frame_height = cfg->g_forced_max_frame_height; + oxcf->bit_depth = cfg->g_bit_depth; + oxcf->input_bit_depth = cfg->g_input_bit_depth; + // guess a frame rate if out of whack, use 30 + oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num; + if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL || + extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) { + oxcf->timing_info_present = 1; + oxcf->timing_info.num_units_in_display_tick = cfg->g_timebase.num; + oxcf->timing_info.time_scale = cfg->g_timebase.den; + oxcf->timing_info.num_ticks_per_picture = 1; + } else { + oxcf->timing_info_present = 0; + } + if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL) { + oxcf->timing_info.equal_picture_interval = 1; + oxcf->decoder_model_info_present_flag = 0; + oxcf->display_model_info_present_flag = 1; + } else if (extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) { + // if( extra_cfg->arnr_strength > 0 ) + // { + // printf("Only --arnr-strength=0 can currently be used with + // --timing-info=model."); return AOM_CODEC_INVALID_PARAM; + // } + // if( extra_cfg->enable_superres) + // { + // printf("Only --superres-mode=0 can currently be used with + // --timing-info=model."); return AOM_CODEC_INVALID_PARAM; + // } + oxcf->buffer_model.num_units_in_decoding_tick = cfg->g_timebase.num; + oxcf->timing_info.equal_picture_interval = 0; + oxcf->decoder_model_info_present_flag = 1; + oxcf->buffer_removal_time_present = 1; + oxcf->display_model_info_present_flag = 1; + } + if (oxcf->init_framerate > 180) { + oxcf->init_framerate = 30; + oxcf->timing_info_present = 0; + } + oxcf->encoder_cfg = &cfg->encoder_cfg; + + switch (cfg->g_pass) { + case AOM_RC_ONE_PASS: oxcf->pass = 0; break; + case AOM_RC_FIRST_PASS: oxcf->pass = 1; break; + case AOM_RC_LAST_PASS: oxcf->pass = 2; break; + } + + oxcf->lag_in_frames = 
clamp(cfg->g_lag_in_frames, 0, MAX_LAG_BUFFERS); + oxcf->rc_mode = cfg->rc_end_usage; + + // Convert target bandwidth from Kbit/s to Bit/s + oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate; + oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct; + oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct; + oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct; + + oxcf->best_allowed_q = + extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_min_quantizer); + oxcf->worst_allowed_q = + extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_max_quantizer); + oxcf->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level); + oxcf->fixed_q = -1; + + oxcf->enable_cdef = extra_cfg->enable_cdef; + oxcf->enable_restoration = + (cfg->g_usage == AOM_USAGE_REALTIME) ? 0 : extra_cfg->enable_restoration; + oxcf->force_video_mode = extra_cfg->force_video_mode; + oxcf->enable_obmc = extra_cfg->enable_obmc; + oxcf->enable_overlay = extra_cfg->enable_overlay; + oxcf->enable_palette = extra_cfg->enable_palette; + oxcf->enable_intrabc = extra_cfg->enable_intrabc; + oxcf->enable_angle_delta = extra_cfg->enable_angle_delta; + oxcf->disable_trellis_quant = extra_cfg->disable_trellis_quant; + oxcf->allow_ref_frame_mvs = extra_cfg->enable_ref_frame_mvs; + oxcf->using_qm = extra_cfg->enable_qm; + oxcf->qm_y = extra_cfg->qm_y; + oxcf->qm_u = extra_cfg->qm_u; + oxcf->qm_v = extra_cfg->qm_v; + oxcf->qm_minlevel = extra_cfg->qm_min; + oxcf->qm_maxlevel = extra_cfg->qm_max; + oxcf->reduced_tx_type_set = extra_cfg->reduced_tx_type_set; + oxcf->use_intra_dct_only = extra_cfg->use_intra_dct_only; + oxcf->use_inter_dct_only = extra_cfg->use_inter_dct_only; + oxcf->use_intra_default_tx_only = extra_cfg->use_intra_default_tx_only; + oxcf->quant_b_adapt = extra_cfg->quant_b_adapt; + oxcf->coeff_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq; + oxcf->mode_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq; + oxcf->mv_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->mv_cost_upd_freq; + oxcf->num_tile_groups = extra_cfg->num_tg; + // In large-scale tile encoding mode, num_tile_groups is always 1. + if (cfg->large_scale_tile) oxcf->num_tile_groups = 1; + oxcf->mtu = extra_cfg->mtu_size; + + // FIXME(debargha): Should this be: + // oxcf->allow_ref_frame_mvs = extra_cfg->allow_ref_frame_mvs & + // extra_cfg->enable_order_hint ? + // Disallow using temporal MVs while large_scale_tile = 1. 
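+ // (Note: oxcf->enable_ref_frame_mvs is separately masked by + // enable_order_hint further below.)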
+ oxcf->allow_ref_frame_mvs = + extra_cfg->allow_ref_frame_mvs && !cfg->large_scale_tile; + oxcf->under_shoot_pct = cfg->rc_undershoot_pct; + oxcf->over_shoot_pct = cfg->rc_overshoot_pct; + + oxcf->resize_mode = (RESIZE_MODE)cfg->rc_resize_mode; + oxcf->resize_scale_denominator = (uint8_t)cfg->rc_resize_denominator; + oxcf->resize_kf_scale_denominator = (uint8_t)cfg->rc_resize_kf_denominator; + if (oxcf->resize_mode == RESIZE_FIXED && + oxcf->resize_scale_denominator == SCALE_NUMERATOR && + oxcf->resize_kf_scale_denominator == SCALE_NUMERATOR) + oxcf->resize_mode = RESIZE_NONE; + + if (extra_cfg->lossless || cfg->large_scale_tile) { + disable_superres(oxcf); + } else { + oxcf->superres_mode = (SUPERRES_MODE)cfg->rc_superres_mode; + oxcf->superres_scale_denominator = (uint8_t)cfg->rc_superres_denominator; + oxcf->superres_kf_scale_denominator = + (uint8_t)cfg->rc_superres_kf_denominator; + oxcf->superres_qthresh = av1_quantizer_to_qindex(cfg->rc_superres_qthresh); + oxcf->superres_kf_qthresh = + av1_quantizer_to_qindex(cfg->rc_superres_kf_qthresh); + if (oxcf->superres_mode == SUPERRES_FIXED && + oxcf->superres_scale_denominator == SCALE_NUMERATOR && + oxcf->superres_kf_scale_denominator == SCALE_NUMERATOR) { + disable_superres(oxcf); + } + if (oxcf->superres_mode == SUPERRES_QTHRESH && + oxcf->superres_qthresh == 255 && oxcf->superres_kf_qthresh == 255) { + disable_superres(oxcf); + } + } + + oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz; + oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz; + oxcf->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz; + + oxcf->drop_frames_water_mark = cfg->rc_dropframe_thresh; + + oxcf->two_pass_vbrbias = cfg->rc_2pass_vbr_bias_pct; + oxcf->two_pass_vbrmin_section = cfg->rc_2pass_vbr_minsection_pct; + oxcf->two_pass_vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct; + + oxcf->auto_key = + cfg->kf_mode == AOM_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist; + + oxcf->key_freq = cfg->kf_max_dist; + oxcf->sframe_dist = cfg->sframe_dist; + oxcf->sframe_mode = cfg->sframe_mode; + oxcf->sframe_enabled = cfg->sframe_dist != 0; + oxcf->speed = extra_cfg->cpu_used; + oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref; + oxcf->enable_auto_brf = extra_cfg->enable_auto_bwd_ref; + oxcf->noise_sensitivity = extra_cfg->noise_sensitivity; + oxcf->sharpness = extra_cfg->sharpness; + + oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in; + + oxcf->color_primaries = extra_cfg->color_primaries; + oxcf->transfer_characteristics = extra_cfg->transfer_characteristics; + oxcf->matrix_coefficients = extra_cfg->matrix_coefficients; + oxcf->chroma_sample_position = extra_cfg->chroma_sample_position; + + oxcf->color_range = extra_cfg->color_range; + oxcf->render_width = extra_cfg->render_width; + oxcf->render_height = extra_cfg->render_height; + oxcf->arnr_max_frames = extra_cfg->arnr_max_frames; + oxcf->arnr_strength = extra_cfg->arnr_strength; + oxcf->min_gf_interval = extra_cfg->min_gf_interval; + oxcf->max_gf_interval = extra_cfg->max_gf_interval; + oxcf->gf_min_pyr_height = extra_cfg->gf_min_pyr_height; + oxcf->gf_max_pyr_height = extra_cfg->gf_max_pyr_height; + + oxcf->tuning = extra_cfg->tuning; + oxcf->vmaf_model_path = extra_cfg->vmaf_model_path; + oxcf->content = extra_cfg->content; + oxcf->cdf_update_mode = (uint8_t)extra_cfg->cdf_update_mode; + oxcf->superblock_size = extra_cfg->superblock_size; + if (cfg->large_scale_tile) { + oxcf->film_grain_test_vector = 0; + oxcf->film_grain_table_filename = NULL; + } else { + 
oxcf->film_grain_test_vector = extra_cfg->film_grain_test_vector; + oxcf->film_grain_table_filename = extra_cfg->film_grain_table_filename; + } +#if CONFIG_DENOISE + oxcf->noise_level = extra_cfg->noise_level; + oxcf->noise_block_size = extra_cfg->noise_block_size; +#endif + oxcf->large_scale_tile = cfg->large_scale_tile; + oxcf->single_tile_decoding = + (oxcf->large_scale_tile) ? extra_cfg->single_tile_decoding : 0; + if (oxcf->large_scale_tile) { + // The superblock_size can only be AOM_SUPERBLOCK_SIZE_64X64 or + // AOM_SUPERBLOCK_SIZE_128X128 while oxcf->large_scale_tile = 1. If + // superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC, hard set it to + // AOM_SUPERBLOCK_SIZE_64X64(default value in large_scale_tile). + if (extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_64X64 && + extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_128X128) + oxcf->superblock_size = AOM_SUPERBLOCK_SIZE_64X64; + } + + oxcf->row_mt = extra_cfg->row_mt; + + oxcf->tile_columns = extra_cfg->tile_columns; + oxcf->tile_rows = extra_cfg->tile_rows; + + oxcf->monochrome = cfg->monochrome; + oxcf->full_still_picture_hdr = cfg->full_still_picture_hdr; + oxcf->enable_dual_filter = extra_cfg->enable_dual_filter; + oxcf->enable_rect_partitions = extra_cfg->enable_rect_partitions; + oxcf->enable_ab_partitions = extra_cfg->enable_ab_partitions; + oxcf->enable_1to4_partitions = extra_cfg->enable_1to4_partitions; + oxcf->min_partition_size = extra_cfg->min_partition_size; + oxcf->max_partition_size = extra_cfg->max_partition_size; + oxcf->enable_intra_edge_filter = extra_cfg->enable_intra_edge_filter; + oxcf->enable_tx64 = extra_cfg->enable_tx64; + oxcf->enable_flip_idtx = extra_cfg->enable_flip_idtx; + oxcf->enable_order_hint = extra_cfg->enable_order_hint; + oxcf->enable_dist_wtd_comp = + extra_cfg->enable_dist_wtd_comp & extra_cfg->enable_order_hint; + oxcf->max_reference_frames = extra_cfg->max_reference_frames; + oxcf->enable_reduced_reference_set = extra_cfg->enable_reduced_reference_set; + oxcf->enable_masked_comp = extra_cfg->enable_masked_comp; + oxcf->enable_onesided_comp = extra_cfg->enable_onesided_comp; + oxcf->enable_diff_wtd_comp = + extra_cfg->enable_masked_comp & extra_cfg->enable_diff_wtd_comp; + oxcf->enable_interinter_wedge = + extra_cfg->enable_masked_comp & extra_cfg->enable_interinter_wedge; + oxcf->enable_interintra_comp = extra_cfg->enable_interintra_comp; + oxcf->enable_smooth_interintra = + extra_cfg->enable_interintra_comp && extra_cfg->enable_smooth_interintra; + oxcf->enable_interintra_wedge = + extra_cfg->enable_interintra_comp & extra_cfg->enable_interintra_wedge; + oxcf->enable_ref_frame_mvs = + extra_cfg->enable_ref_frame_mvs & extra_cfg->enable_order_hint; + + oxcf->enable_global_motion = extra_cfg->enable_global_motion; + oxcf->enable_warped_motion = extra_cfg->enable_warped_motion; + oxcf->allow_warped_motion = + (cfg->g_usage == AOM_USAGE_REALTIME) + ? 
0 + : (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion); + oxcf->enable_filter_intra = extra_cfg->enable_filter_intra; + oxcf->enable_smooth_intra = extra_cfg->enable_smooth_intra; + oxcf->enable_paeth_intra = extra_cfg->enable_paeth_intra; + oxcf->enable_cfl_intra = extra_cfg->enable_cfl_intra; + + oxcf->enable_superres = + (oxcf->superres_mode != SUPERRES_NONE) && extra_cfg->enable_superres; + if (!oxcf->enable_superres) { + disable_superres(oxcf); + } + + oxcf->tile_width_count = AOMMIN(cfg->tile_width_count, MAX_TILE_COLS); + oxcf->tile_height_count = AOMMIN(cfg->tile_height_count, MAX_TILE_ROWS); + for (int i = 0; i < oxcf->tile_width_count; i++) { + oxcf->tile_widths[i] = AOMMAX(cfg->tile_widths[i], 1); + } + for (int i = 0; i < oxcf->tile_height_count; i++) { + oxcf->tile_heights[i] = AOMMAX(cfg->tile_heights[i], 1); + } + oxcf->error_resilient_mode = + cfg->g_error_resilient | extra_cfg->error_resilient_mode; + oxcf->s_frame_mode = extra_cfg->s_frame_mode; + oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode; + if (cfg->g_pass == AOM_RC_LAST_PASS) { + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); + oxcf->limit = n_packets - 1; + } else { + oxcf->limit = cfg->g_limit; + } + + if (oxcf->limit == 1) { + // still picture mode, display model and timing is meaningless + oxcf->display_model_info_present_flag = 0; + oxcf->timing_info_present = 0; + } + + oxcf->enable_tpl_model = extra_cfg->enable_tpl_model; + oxcf->enable_keyframe_filtering = extra_cfg->enable_keyframe_filtering; + + oxcf->enable_chroma_deltaq = extra_cfg->enable_chroma_deltaq; + oxcf->aq_mode = extra_cfg->aq_mode; + oxcf->deltaq_mode = extra_cfg->deltaq_mode; + + oxcf->deltalf_mode = + (oxcf->deltaq_mode != NO_DELTA_Q) && extra_cfg->deltalf_mode; + + oxcf->save_as_annexb = cfg->save_as_annexb; + + oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost; + oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test; + oxcf->sb_multipass_unit_test = extra_cfg->sb_multipass_unit_test; + oxcf->ext_tile_debug = extra_cfg->ext_tile_debug; + + oxcf->chroma_subsampling_x = extra_cfg->chroma_subsampling_x; + oxcf->chroma_subsampling_y = extra_cfg->chroma_subsampling_y; + oxcf->border_in_pixels = (oxcf->resize_mode || oxcf->superres_mode) + ? 
AOM_BORDER_IN_PIXELS + : AOM_ENC_NO_SCALE_BORDER; + memcpy(oxcf->target_seq_level_idx, extra_cfg->target_seq_level_idx, + sizeof(oxcf->target_seq_level_idx)); + oxcf->tier_mask = extra_cfg->tier_mask; + + oxcf->use_fixed_qp_offsets = + cfg->use_fixed_qp_offsets && (oxcf->rc_mode == AOM_Q); + for (int i = 0; i < FIXED_QP_OFFSET_COUNT; ++i) { + if (oxcf->use_fixed_qp_offsets) { + if (cfg->fixed_qp_offsets[i] >= 0) { // user-provided qp offset + oxcf->fixed_qp_offsets[i] = convert_qp_offset( + oxcf->cq_level, cfg->fixed_qp_offsets[i], oxcf->bit_depth); + } else { // auto-selected qp offset + oxcf->fixed_qp_offsets[i] = + get_modeled_qp_offset(oxcf->cq_level, i, oxcf->bit_depth); + } + } else { + oxcf->fixed_qp_offsets[i] = -1.0; + } + } + + oxcf->min_cr = extra_cfg->min_cr; + return AOM_CODEC_OK; +} + +static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx, + const aom_codec_enc_cfg_t *cfg) { + aom_codec_err_t res; + int force_key = 0; + + if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) { + if (cfg->g_lag_in_frames > 1 || cfg->g_pass != AOM_RC_ONE_PASS) + ERROR("Cannot change width or height after initialization"); + if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) || + (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) || + (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height)) + force_key = 1; + } + + // Prevent increasing lag_in_frames. This check is stricter than it needs + // to be -- the limit is not increasing past the first lag_in_frames + // value, but we don't track the initial config, only the last successful + // config. + if (cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames) + ERROR("Cannot increase lag_in_frames"); + // Prevent changing lag_in_frames if Lookahead Processing is enabled + if (cfg->g_lag_in_frames != ctx->cfg.g_lag_in_frames && + ctx->num_lap_buffers > 0) + ERROR("Cannot change lag_in_frames if LAP is enabled"); + + res = validate_config(ctx, cfg, &ctx->extra_cfg); + + if (res == AOM_CODEC_OK) { + ctx->cfg = *cfg; + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + // On profile change, request a key frame + force_key |= ctx->cpi->common.seq_params.profile != ctx->oxcf.profile; + av1_change_config(ctx->cpi, &ctx->oxcf); + } + + if (force_key) ctx->next_frame_flags |= AOM_EFLAG_FORCE_KF; + + return res; +} + +static aom_fixed_buf_t *encoder_get_global_headers(aom_codec_alg_priv_t *ctx) { + return av1_get_global_headers(ctx->cpi); +} + +static aom_codec_err_t ctrl_get_quantizer(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + *arg = av1_get_quantizer(ctx->cpi); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_quantizer64(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + *arg = av1_qindex_to_quantizer(av1_get_quantizer(ctx->cpi)); + return AOM_CODEC_OK; +} + +static aom_codec_err_t update_extra_cfg(aom_codec_alg_priv_t *ctx, + struct av1_extracfg *extra_cfg) { + const aom_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg); + if (res == AOM_CODEC_OK) { + ctx->extra_cfg = *extra_cfg; + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + av1_change_config(ctx->cpi, &ctx->oxcf); + } + return res; +} + +static aom_codec_err_t ctrl_set_cpuused(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.cpu_used = 
CAST(AOME_SET_CPUUSED, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_auto_alt_ref(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_auto_alt_ref = CAST(AOME_SET_ENABLEAUTOALTREF, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_auto_bwd_ref(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_auto_bwd_ref = CAST(AOME_SET_ENABLEAUTOBWDREF, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_noise_sensitivity(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.noise_sensitivity = CAST(AV1E_SET_NOISE_SENSITIVITY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_sharpness(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.sharpness = CAST(AOME_SET_SHARPNESS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_static_thresh(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.static_thresh = CAST(AOME_SET_STATIC_THRESHOLD, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.row_mt = CAST(AV1E_SET_ROW_MT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_tile_columns(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tile_columns = CAST(AV1E_SET_TILE_COLUMNS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_tile_rows(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tile_rows = CAST(AV1E_SET_TILE_ROWS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_tpl_model(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_tpl_model = CAST(AV1E_SET_ENABLE_TPL_MODEL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_keyframe_filtering( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_keyframe_filtering = + CAST(AV1E_SET_ENABLE_KEYFRAME_FILTERING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_arnr_max_frames(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.arnr_max_frames = CAST(AOME_SET_ARNR_MAXFRAMES, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_arnr_strength(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.arnr_strength = CAST(AOME_SET_ARNR_STRENGTH, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_tuning(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tuning = CAST(AOME_SET_TUNING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_cq_level(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = 
ctx->extra_cfg; + extra_cfg.cq_level = CAST(AOME_SET_CQ_LEVEL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_rc_max_intra_bitrate_pct( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.rc_max_intra_bitrate_pct = + CAST(AOME_SET_MAX_INTRA_BITRATE_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_rc_max_inter_bitrate_pct( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.rc_max_inter_bitrate_pct = + CAST(AOME_SET_MAX_INTER_BITRATE_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.gf_cbr_boost_pct = CAST(AV1E_SET_GF_CBR_BOOST_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_lossless(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.lossless = CAST(AV1E_SET_LOSSLESS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_cdef(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_cdef = CAST(AV1E_SET_ENABLE_CDEF, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_restoration(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_restoration = CAST(AV1E_SET_ENABLE_RESTORATION, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_force_video_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.force_video_mode = CAST(AV1E_SET_FORCE_VIDEO_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_obmc(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_obmc = CAST(AV1E_SET_ENABLE_OBMC, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_disable_trellis_quant(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.disable_trellis_quant = CAST(AV1E_SET_DISABLE_TRELLIS_QUANT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_qm(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_qm = CAST(AV1E_SET_ENABLE_QM, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_qm_y(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.qm_y = CAST(AV1E_SET_QM_Y, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_qm_u(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.qm_u = CAST(AV1E_SET_QM_U, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_qm_v(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.qm_v = CAST(AV1E_SET_QM_V, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_qm_min(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = 
ctx->extra_cfg; + extra_cfg.qm_min = CAST(AV1E_SET_QM_MIN, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_qm_max(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.qm_max = CAST(AV1E_SET_QM_MAX, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_num_tg(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.num_tg = CAST(AV1E_SET_NUM_TG, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_mtu(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.mtu_size = CAST(AV1E_SET_MTU, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_timing_info_type(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.timing_info_type = CAST(AV1E_SET_TIMING_INFO_TYPE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_dual_filter(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_dual_filter = CAST(AV1E_SET_ENABLE_DUAL_FILTER, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_chroma_deltaq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_chroma_deltaq = CAST(AV1E_SET_ENABLE_CHROMA_DELTAQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_rect_partitions( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_rect_partitions = + CAST(AV1E_SET_ENABLE_RECT_PARTITIONS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_ab_partitions(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_ab_partitions = CAST(AV1E_SET_ENABLE_AB_PARTITIONS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_1to4_partitions( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_1to4_partitions = + CAST(AV1E_SET_ENABLE_1TO4_PARTITIONS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_min_partition_size(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.min_partition_size = CAST(AV1E_SET_MIN_PARTITION_SIZE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_max_partition_size(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.max_partition_size = CAST(AV1E_SET_MAX_PARTITION_SIZE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_intra_edge_filter( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_intra_edge_filter = + CAST(AV1E_SET_ENABLE_INTRA_EDGE_FILTER, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_order_hint(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_order_hint = CAST(AV1E_SET_ENABLE_ORDER_HINT, args); + return update_extra_cfg(ctx, 
&extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_tx64(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_tx64 = CAST(AV1E_SET_ENABLE_TX64, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_flip_idtx(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_flip_idtx = CAST(AV1E_SET_ENABLE_FLIP_IDTX, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_dist_wtd_comp(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_dist_wtd_comp = CAST(AV1E_SET_ENABLE_DIST_WTD_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_max_reference_frames(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.max_reference_frames = CAST(AV1E_SET_MAX_REFERENCE_FRAMES, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_reduced_reference_set( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_reduced_reference_set = + CAST(AV1E_SET_REDUCED_REFERENCE_SET, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_ref_frame_mvs(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_ref_frame_mvs = CAST(AV1E_SET_ENABLE_REF_FRAME_MVS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_allow_ref_frame_mvs(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.allow_ref_frame_mvs = CAST(AV1E_SET_ALLOW_REF_FRAME_MVS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_masked_comp(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_masked_comp = CAST(AV1E_SET_ENABLE_MASKED_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_onesided_comp(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_onesided_comp = CAST(AV1E_SET_ENABLE_ONESIDED_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_interintra_comp( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_interintra_comp = + CAST(AV1E_SET_ENABLE_INTERINTRA_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_smooth_interintra( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_smooth_interintra = + CAST(AV1E_SET_ENABLE_SMOOTH_INTERINTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_diff_wtd_comp(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_diff_wtd_comp = CAST(AV1E_SET_ENABLE_DIFF_WTD_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_interinter_wedge( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_interinter_wedge = + 
CAST(AV1E_SET_ENABLE_INTERINTER_WEDGE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_interintra_wedge( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_interintra_wedge = + CAST(AV1E_SET_ENABLE_INTERINTRA_WEDGE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_global_motion(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_global_motion = CAST(AV1E_SET_ENABLE_GLOBAL_MOTION, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_warped_motion(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_warped_motion = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_allow_warped_motion(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.allow_warped_motion = CAST(AV1E_SET_ALLOW_WARPED_MOTION, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_filter_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_filter_intra = CAST(AV1E_SET_ENABLE_FILTER_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_smooth_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_smooth_intra = CAST(AV1E_SET_ENABLE_SMOOTH_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_paeth_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_paeth_intra = CAST(AV1E_SET_ENABLE_PAETH_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_cfl_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_cfl_intra = CAST(AV1E_SET_ENABLE_CFL_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_superres(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_superres = CAST(AV1E_SET_ENABLE_SUPERRES, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_overlay(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_overlay = CAST(AV1E_SET_ENABLE_OVERLAY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_palette(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_palette = CAST(AV1E_SET_ENABLE_PALETTE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_intrabc(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_intrabc = CAST(AV1E_SET_ENABLE_INTRABC, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_angle_delta(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_angle_delta = CAST(AV1E_SET_ENABLE_ANGLE_DELTA, args); + return 
update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_error_resilient_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.error_resilient_mode = CAST(AV1E_SET_ERROR_RESILIENT_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_s_frame_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.s_frame_mode = CAST(AV1E_SET_S_FRAME_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_frame_parallel_decoding_mode( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.frame_parallel_decoding_mode = + CAST(AV1E_SET_FRAME_PARALLEL_DECODING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_single_tile_decoding(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.single_tile_decoding = CAST(AV1E_SET_SINGLE_TILE_DECODING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_aq_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.aq_mode = CAST(AV1E_SET_AQ_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_reduced_tx_type_set(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.reduced_tx_type_set = CAST(AV1E_SET_REDUCED_TX_TYPE_SET, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_intra_dct_only(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.use_intra_dct_only = CAST(AV1E_SET_INTRA_DCT_ONLY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_inter_dct_only(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.use_inter_dct_only = CAST(AV1E_SET_INTER_DCT_ONLY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_intra_default_tx_only(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.use_intra_default_tx_only = + CAST(AV1E_SET_INTRA_DEFAULT_TX_ONLY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_quant_b_adapt(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.quant_b_adapt = CAST(AV1E_SET_QUANT_B_ADAPT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_coeff_cost_upd_freq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.coeff_cost_upd_freq = CAST(AV1E_SET_COEFF_COST_UPD_FREQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_mode_cost_upd_freq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.mode_cost_upd_freq = CAST(AV1E_SET_MODE_COST_UPD_FREQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_mv_cost_upd_freq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.mv_cost_upd_freq = CAST(AV1E_SET_MV_COST_UPD_FREQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t 
+static aom_codec_err_t ctrl_set_vmaf_model_path(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.vmaf_model_path = CAST(AV1E_SET_VMAF_MODEL_PATH, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_film_grain_test_vector(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.film_grain_test_vector =
+      CAST(AV1E_SET_FILM_GRAIN_TEST_VECTOR, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_film_grain_table(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.film_grain_table_filename = CAST(AV1E_SET_FILM_GRAIN_TABLE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_denoise_noise_level(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+#if !CONFIG_DENOISE
+  (void)ctx;
+  (void)args;
+  return AOM_CODEC_INCAPABLE;
+#else
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.noise_level =
+      ((float)CAST(AV1E_SET_DENOISE_NOISE_LEVEL, args)) / 10.0f;
+  return update_extra_cfg(ctx, &extra_cfg);
+#endif
+}
+
+static aom_codec_err_t ctrl_set_denoise_block_size(aom_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+#if !CONFIG_DENOISE
+  (void)ctx;
+  (void)args;
+  return AOM_CODEC_INCAPABLE;
+#else
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.noise_block_size = CAST(AV1E_SET_DENOISE_BLOCK_SIZE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+#endif
+}
+
+static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.deltaq_mode = CAST(AV1E_SET_DELTAQ_MODE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_deltalf_mode(aom_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.deltalf_mode = CAST(AV1E_SET_DELTALF_MODE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_min_gf_interval(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.min_gf_interval = CAST(AV1E_SET_MIN_GF_INTERVAL, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_max_gf_interval(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.max_gf_interval = CAST(AV1E_SET_MAX_GF_INTERVAL, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_gf_min_pyr_height(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.gf_min_pyr_height = CAST(AV1E_SET_GF_MIN_PYRAMID_HEIGHT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_gf_max_pyr_height(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.gf_max_pyr_height = CAST(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_frame_periodic_boost(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.frame_periodic_boost = CAST(AV1E_SET_FRAME_PERIODIC_BOOST, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
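Note the scaling in ctrl_set_denoise_noise_level above: the integer control value is divided by 10 before being stored as a float. An illustrative call (editorial sketch, not part of the patch):

    /* With CONFIG_DENOISE built in, a control value of 25 yields
     * extra_cfg.noise_level == 2.5f (25 / 10.0f) inside the handler. */
    aom_codec_control(&codec, AV1E_SET_DENOISE_NOISE_LEVEL, 25);
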
+static aom_codec_err_t ctrl_enable_motion_vector_unit_test(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.motion_vector_unit_test =
+      CAST(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_enable_ext_tile_debug(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.ext_tile_debug = CAST(AV1E_ENABLE_EXT_TILE_DEBUG, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_target_seq_level_idx(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  const int val = CAST(AV1E_SET_TARGET_SEQ_LEVEL_IDX, args);
+  const int level = val % 100;
+  const int operating_point_idx = val / 100;
+  if (operating_point_idx >= 0 &&
+      operating_point_idx < MAX_NUM_OPERATING_POINTS) {
+    extra_cfg.target_seq_level_idx[operating_point_idx] = (AV1_LEVEL)level;
+  }
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_tier_mask(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.tier_mask = CAST(AV1E_SET_TIER_MASK, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_min_cr(aom_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.min_cr = CAST(AV1E_SET_MIN_CR, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_enable_sb_multipass_unit_test(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.sb_multipass_unit_test =
+      CAST(AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
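ctrl_set_target_seq_level_idx decodes its argument as operating_point_idx * 100 + level, as the val % 100 / val / 100 split above shows. An illustrative call (editorial sketch, not part of the patch; the 2 + (idx >> 2) "." (idx & 3) level numbering is the usual AV1 convention):

    /* 213 -> operating point 2, seq level index 13 (nominally AV1 level 5.1). */
    aom_codec_control(&codec, AV1E_SET_TARGET_SEQ_LEVEL_IDX, 213);
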
+#if !CONFIG_REALTIME_ONLY
+static aom_codec_err_t create_stats_buffer(
+    FIRSTPASS_STATS **frame_stats_buffer, STATS_BUFFER_CTX *stats_buf_context,
+    int num_lap_buffers) {
+  aom_codec_err_t res = AOM_CODEC_OK;
+
+  int size = get_stats_buf_size(num_lap_buffers, MAX_LAG_BUFFERS);
+  *frame_stats_buffer =
+      (FIRSTPASS_STATS *)aom_calloc(size, sizeof(FIRSTPASS_STATS));
+  if (*frame_stats_buffer == NULL) return AOM_CODEC_MEM_ERROR;
+
+  stats_buf_context->stats_in_start = *frame_stats_buffer;
+  stats_buf_context->stats_in_end = stats_buf_context->stats_in_start;
+  stats_buf_context->stats_in_buf_end =
+      stats_buf_context->stats_in_start + size;
+
+  stats_buf_context->total_left_stats = aom_calloc(1, sizeof(FIRSTPASS_STATS));
+  if (stats_buf_context->total_left_stats == NULL) return AOM_CODEC_MEM_ERROR;
+  av1_twopass_zero_stats(stats_buf_context->total_left_stats);
+  stats_buf_context->total_stats = aom_calloc(1, sizeof(FIRSTPASS_STATS));
+  if (stats_buf_context->total_stats == NULL) return AOM_CODEC_MEM_ERROR;
+  av1_twopass_zero_stats(stats_buf_context->total_stats);
+  return res;
+}
+#endif
+
+static aom_codec_err_t create_context_and_bufferpool(
+    AV1_COMP **p_cpi, BufferPool **p_buffer_pool, AV1EncoderConfig *oxcf,
+    struct aom_codec_pkt_list *pkt_list_head, FIRSTPASS_STATS *frame_stats_buf,
+    COMPRESSOR_STAGE stage, int num_lap_buffers, int lap_lag_in_frames,
+    STATS_BUFFER_CTX *stats_buf_context) {
+  aom_codec_err_t res = AOM_CODEC_OK;
+
+  *p_buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
+  if (*p_buffer_pool == NULL) return AOM_CODEC_MEM_ERROR;
+
+#if CONFIG_MULTITHREAD
+  if (pthread_mutex_init(&((*p_buffer_pool)->pool_mutex), NULL)) {
+    return AOM_CODEC_MEM_ERROR;
+  }
+#endif
+  *p_cpi = av1_create_compressor(oxcf, *p_buffer_pool, frame_stats_buf, stage,
+                                 num_lap_buffers, lap_lag_in_frames,
+                                 stats_buf_context);
+  if (*p_cpi == NULL)
+    res = AOM_CODEC_MEM_ERROR;
+  else
+    (*p_cpi)->output_pkt_list = pkt_list_head;
+
+  return res;
+}
+
+static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) {
+  aom_codec_err_t res = AOM_CODEC_OK;
+
+  if (ctx->priv == NULL) {
+    aom_codec_alg_priv_t *const priv = aom_calloc(1, sizeof(*priv));
+    if (priv == NULL) return AOM_CODEC_MEM_ERROR;
+
+    ctx->priv = (aom_codec_priv_t *)priv;
+    ctx->priv->init_flags = ctx->init_flags;
+
+    if (ctx->config.enc) {
+      // Update the reference to the config structure to an internal copy.
+      priv->cfg = *ctx->config.enc;
+      ctx->config.enc = &priv->cfg;
+    }
+
+    priv->extra_cfg = default_extra_cfg;
+    aom_once(av1_initialize_enc);
+
+    res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
+
+    if (res == AOM_CODEC_OK) {
+      int *num_lap_buffers = &priv->num_lap_buffers;
+      int lap_lag_in_frames = 0;
+      *num_lap_buffers = 0;
+      priv->timestamp_ratio.den = priv->cfg.g_timebase.den;
+      priv->timestamp_ratio.num =
+          (int64_t)priv->cfg.g_timebase.num * TICKS_PER_SEC;
+      reduce_ratio(&priv->timestamp_ratio);
+
+      set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
+      if (priv->oxcf.rc_mode == AOM_Q && priv->oxcf.pass == 0 &&
+          priv->oxcf.mode == GOOD) {
+        // Enable look ahead
+        *num_lap_buffers = priv->cfg.g_lag_in_frames;
+        *num_lap_buffers =
+            clamp(*num_lap_buffers, 1,
+                  AOMMIN(MAX_LAP_BUFFERS,
+                         priv->oxcf.key_freq + SCENE_CUT_KEY_TEST_INTERVAL));
+        if ((int)priv->cfg.g_lag_in_frames - (*num_lap_buffers) >=
+            LAP_LAG_IN_FRAMES) {
+          lap_lag_in_frames = LAP_LAG_IN_FRAMES;
+        }
+      }
+      priv->oxcf.use_highbitdepth =
+          (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
+
+#if !CONFIG_REALTIME_ONLY
+      res = create_stats_buffer(&priv->frame_stats_buffer,
+                                &priv->stats_buf_context, *num_lap_buffers);
+      if (res != AOM_CODEC_OK) return AOM_CODEC_MEM_ERROR;
+#endif
+
+      res = create_context_and_bufferpool(
+          &priv->cpi, &priv->buffer_pool, &priv->oxcf, &priv->pkt_list.head,
+          priv->frame_stats_buffer, ENCODE_STAGE, *num_lap_buffers, -1,
+          &priv->stats_buf_context);
+
+      // Create another compressor if look ahead is enabled
+      if (res == AOM_CODEC_OK && *num_lap_buffers) {
+        res = create_context_and_bufferpool(
+            &priv->cpi_lap, &priv->buffer_pool_lap, &priv->oxcf, NULL,
+            priv->frame_stats_buffer, LAP_STAGE, *num_lap_buffers,
+            clamp(lap_lag_in_frames, 0, MAX_LAG_BUFFERS),
+            &priv->stats_buf_context);
+      }
+    }
+  }
+
+  return res;
+}
+
+static void destroy_context_and_bufferpool(AV1_COMP *cpi,
+                                           BufferPool *buffer_pool) {
+  av1_remove_compressor(cpi);
+#if CONFIG_MULTITHREAD
+  if (buffer_pool) pthread_mutex_destroy(&buffer_pool->pool_mutex);
+#endif
+  aom_free(buffer_pool);
+}
+
+static void destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context,
+                                 FIRSTPASS_STATS *frame_stats_buffer) {
+  aom_free(stats_buf_context->total_left_stats);
+  aom_free(stats_buf_context->total_stats);
+  aom_free(frame_stats_buffer);
+}
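Two details in encoder_init above are worth spelling out. First, the timestamp ratio: with g_timebase = 1/30 and TICKS_PER_SEC as defined elsewhere in this tree (10000000, assuming the usual definition), reduce_ratio() turns 10000000/30 into 1000000/3, so one timebase unit maps to 1000000/3 internal ticks. Second, create_context_and_bufferpool is invoked up to twice -- once for the main ENCODE_STAGE compressor and once for the LAP ("look-ahead processing") compressor -- and both share the first-pass stats buffers. A sketch of that pairing (editorial; variable names here are hypothetical shorthand for the priv-> fields above):

    create_context_and_bufferpool(&cpi, &pool, &oxcf, pkts, stats,
                                  ENCODE_STAGE, num_lap, -1, &stats_ctx);
    if (num_lap)  /* look-ahead enabled: second compressor, no packet list */
      create_context_and_bufferpool(&cpi_lap, &pool_lap, &oxcf, NULL, stats,
                                    LAP_STAGE, num_lap, lag, &stats_ctx);
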
+static aom_codec_err_t encoder_destroy(aom_codec_alg_priv_t *ctx) {
+  free(ctx->cx_data);
+  destroy_context_and_bufferpool(ctx->cpi, ctx->buffer_pool);
+  if (ctx->cpi_lap) {
+    // As both cpi and cpi_lap share the same lookahead_ctx, it is already
+    // freed when destroy is called on cpi. Set lookahead_ctx to NULL here so
+    // that it is not freed a second time.
+    ctx->cpi_lap->lookahead = NULL;
+    destroy_context_and_bufferpool(ctx->cpi_lap, ctx->buffer_pool_lap);
+  }
+  destroy_stats_buffer(&ctx->stats_buf_context, ctx->frame_stats_buffer);
+  aom_free(ctx);
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_frame_flags_t get_frame_pkt_flags(const AV1_COMP *cpi,
+                                                   unsigned int lib_flags) {
+  aom_codec_frame_flags_t flags = lib_flags << 16;
+
+  if (lib_flags & FRAMEFLAGS_KEY) flags |= AOM_FRAME_IS_KEY;
+  if (lib_flags & FRAMEFLAGS_INTRAONLY) flags |= AOM_FRAME_IS_INTRAONLY;
+  if (lib_flags & FRAMEFLAGS_SWITCH) flags |= AOM_FRAME_IS_SWITCH;
+  if (lib_flags & FRAMEFLAGS_ERROR_RESILIENT)
+    flags |= AOM_FRAME_IS_ERROR_RESILIENT;
+  if (cpi->droppable) flags |= AOM_FRAME_IS_DROPPABLE;
+
+  return flags;
+}
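get_frame_pkt_flags maps the encoder's internal FRAMEFLAGS_* bits into the public AOM_FRAME_IS_* flags, keeping the raw lib_flags in the upper 16 bits. On the consuming side the public bits are all an application needs (editorial sketch, not part of the patch; pkt is an AOM_CODEC_CX_FRAME_PKT obtained from aom_codec_get_cx_data()):

    if (pkt->data.frame.flags & AOM_FRAME_IS_KEY) {
      /* safe random-access point; e.g. start a new segment here */
    }
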
+// TODO(Mufaddal): Check feasibility of abstracting functions related to LAP
+// into a separate function.
+static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
+                                      const aom_image_t *img,
+                                      aom_codec_pts_t pts,
+                                      unsigned long duration,
+                                      aom_enc_frame_flags_t enc_flags) {
+  const size_t kMinCompressedSize = 8192;
+  volatile aom_codec_err_t res = AOM_CODEC_OK;
+  AV1_COMP *const cpi = ctx->cpi;
+  const aom_rational64_t *const timestamp_ratio = &ctx->timestamp_ratio;
+  volatile aom_codec_pts_t ptsvol = pts;
+  // LAP context
+  AV1_COMP *cpi_lap = ctx->cpi_lap;
+
+  if (cpi == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  if (cpi->lap_enabled && cpi_lap == NULL && cpi->oxcf.pass == 0)
+    return AOM_CODEC_INVALID_PARAM;
+
+  if (img != NULL) {
+    res = validate_img(ctx, img);
+    // TODO(jzern) the checks related to cpi's validity should be treated as a
+    // failure condition, encoder setup is done fully in init() currently.
+    if (res == AOM_CODEC_OK) {
+      size_t data_sz = ALIGN_POWER_OF_TWO(ctx->cfg.g_w, 5) *
+                       ALIGN_POWER_OF_TWO(ctx->cfg.g_h, 5) *
+                       get_image_bps(img);
+      if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize;
+      if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
+        ctx->cx_data_sz = data_sz;
+        free(ctx->cx_data);
+        ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
+        if (ctx->cx_data == NULL) {
+          return AOM_CODEC_MEM_ERROR;
+        }
+      }
+    }
+  }
+  if (ctx->oxcf.mode != GOOD && ctx->oxcf.mode != REALTIME) {
+    ctx->oxcf.mode = GOOD;
+    av1_change_config(ctx->cpi, &ctx->oxcf);
+  }
+
+  if (!ctx->pts_offset_initialized) {
+    ctx->pts_offset = ptsvol;
+    ctx->pts_offset_initialized = 1;
+  }
+  ptsvol -= ctx->pts_offset;
+
+  aom_codec_pkt_list_init(&ctx->pkt_list);
+
+  volatile aom_enc_frame_flags_t flags = enc_flags;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(cpi->common.error.jmp)) {
+    cpi->common.error.setjmp = 0;
+    res = update_error_state(ctx, &cpi->common.error);
+    aom_clear_system_state();
+    return res;
+  }
+  cpi->common.error.setjmp = 1;
+  if (cpi_lap != NULL) {
+    if (setjmp(cpi_lap->common.error.jmp)) {
+      cpi_lap->common.error.setjmp = 0;
+      res = update_error_state(ctx, &cpi_lap->common.error);
+      aom_clear_system_state();
+      return res;
+    }
+    cpi_lap->common.error.setjmp = 1;
+  }
+
+  // Note(yunqing): While applying encoding flags, always start by enabling
+  // all, then modify according to the flags. The previous frame's flags are
+  // overwritten.
+  av1_apply_encoding_flags(cpi, flags);
+  if (cpi_lap != NULL) {
+    av1_apply_encoding_flags(cpi_lap, flags);
+  }
+
+  // Handle fixed keyframe intervals
+  if (is_stat_generation_stage(cpi)) {
+    if (ctx->cfg.kf_mode == AOM_KF_AUTO &&
+        ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
+      if (cpi->common.spatial_layer_id == 0 &&
+          ++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
+        flags |= AOM_EFLAG_FORCE_KF;
+        ctx->fixed_kf_cntr = 1;
+      }
+    }
+  }
+
+  if (res == AOM_CODEC_OK) {
+    int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, ptsvol);
+    int64_t dst_end_time_stamp =
+        timebase_units_to_ticks(timestamp_ratio, ptsvol + duration);
+
+    // Set up internal flags
+    if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
+
+    if (img != NULL) {
+      YV12_BUFFER_CONFIG sd;
+      int use_highbitdepth, subsampling_x, subsampling_y;
+      res = image2yuvconfig(img, &sd);
+      use_highbitdepth = (sd.flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+      subsampling_x = sd.subsampling_x;
+      subsampling_y = sd.subsampling_y;
+
+      if (!cpi->lookahead) {
+        int lag_in_frames = cpi_lap != NULL ? cpi_lap->oxcf.lag_in_frames
+                                            : cpi->oxcf.lag_in_frames;
+
+        cpi->lookahead = av1_lookahead_init(
+            cpi->oxcf.width, cpi->oxcf.height, subsampling_x, subsampling_y,
+            use_highbitdepth, lag_in_frames, cpi->oxcf.border_in_pixels,
+            cpi->common.features.byte_alignment, ctx->num_lap_buffers);
+      }
+      if (!cpi->lookahead)
+        aom_internal_error(&cpi->common.error, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate lag buffers");
+
+      av1_check_initial_width(cpi, use_highbitdepth, subsampling_x,
+                              subsampling_y);
+      if (cpi_lap != NULL) {
+        cpi_lap->lookahead = cpi->lookahead;
+        av1_check_initial_width(cpi_lap, use_highbitdepth, subsampling_x,
+                                subsampling_y);
+      }
+
+      // Store the original flags in to the frame buffer. Will extract the
+      // key frame flag when we actually encode this frame.
+      if (av1_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
+                                dst_time_stamp, dst_end_time_stamp)) {
+        res = update_error_state(ctx, &cpi->common.error);
+      }
+      ctx->next_frame_flags = 0;
+    }
+
+    unsigned char *cx_data = ctx->cx_data;
+    size_t cx_data_sz = ctx->cx_data_sz;
+
+    assert(!(cx_data == NULL && cx_data_sz != 0));
+
+    /* Any pending invisible frames? */
+    if (ctx->pending_cx_data) {
+      memmove(cx_data, ctx->pending_cx_data, ctx->pending_cx_data_sz);
+      ctx->pending_cx_data = cx_data;
+      cx_data += ctx->pending_cx_data_sz;
+      cx_data_sz -= ctx->pending_cx_data_sz;
+
+      /* TODO: this is a minimal check, the underlying codec doesn't respect
+       * the buffer size anyway.
+       */
+      if (cx_data_sz < ctx->cx_data_sz / 2) {
+        aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+                           "Compressed data buffer too small");
+      }
+    }
+
+    size_t frame_size = 0;
+    unsigned int lib_flags = 0;
+    int is_frame_visible = 0;
+    int index_size = 0;
+    int has_fwd_keyframe = 0;
+
+    // Call for LAP stage
+    if (cpi_lap != NULL) {
+      int status;
+      aom_rational64_t timestamp_ratio_la = *timestamp_ratio;
+      int64_t dst_time_stamp_la = dst_time_stamp;
+      int64_t dst_end_time_stamp_la = dst_end_time_stamp;
+      status = av1_get_compressed_data(
+          cpi_lap, &lib_flags, &frame_size, NULL, &dst_time_stamp_la,
+          &dst_end_time_stamp_la, !img, &timestamp_ratio_la);
+      if (status != -1) {
+        if (status != AOM_CODEC_OK) {
+          aom_internal_error(&cpi_lap->common.error, AOM_CODEC_ERROR, NULL);
+        }
+        cpi_lap->seq_params_locked = 1;
+      }
+      lib_flags = 0;
+      frame_size = 0;
+    }
+
+    // invisible frames get packed with the next visible frame
+    while (cx_data_sz - index_size >= ctx->cx_data_sz / 2 &&
+           !is_frame_visible) {
+      const int status = av1_get_compressed_data(
+          cpi, &lib_flags, &frame_size, cx_data, &dst_time_stamp,
+          &dst_end_time_stamp, !img, timestamp_ratio);
+      if (status == -1) break;
+      if (status != AOM_CODEC_OK) {
+        aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+      }
+
+      cpi->seq_params_locked = 1;
+      if (frame_size) {
+        if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data;
+
+        const int write_temporal_delimiter =
+            !cpi->common.spatial_layer_id && !ctx->pending_frame_count;
+
+        if (write_temporal_delimiter) {
+          uint32_t obu_header_size = 1;
+          const uint32_t obu_payload_size = 0;
+          const size_t length_field_size =
+              aom_uleb_size_in_bytes(obu_payload_size);
+
+          if (ctx->pending_cx_data) {
+            const size_t move_offset = length_field_size + 1;
+            memmove(ctx->pending_cx_data + move_offset, ctx->pending_cx_data,
+                    frame_size);
+          }
+          const uint32_t obu_header_offset = 0;
+          obu_header_size = av1_write_obu_header(
+              &cpi->level_params, OBU_TEMPORAL_DELIMITER, 0,
+              (uint8_t *)(ctx->pending_cx_data + obu_header_offset));
+
+          // OBUs are preceded/succeeded by an unsigned leb128 coded integer.
+          if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size,
+                                      ctx->pending_cx_data) != AOM_CODEC_OK) {
+            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+          }
+
+          frame_size +=
+              obu_header_size + obu_payload_size + length_field_size;
+        }
+
+        if (ctx->oxcf.save_as_annexb) {
+          size_t curr_frame_size = frame_size;
+          if (av1_convert_sect5obus_to_annexb(cx_data, &curr_frame_size) !=
+              AOM_CODEC_OK) {
+            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+          }
+          frame_size = curr_frame_size;
+
+          // B_PRIME (add frame size)
+          const size_t length_field_size = aom_uleb_size_in_bytes(frame_size);
+          if (ctx->pending_cx_data) {
+            const size_t move_offset = length_field_size;
+            memmove(cx_data + move_offset, cx_data, frame_size);
+          }
+          if (av1_write_uleb_obu_size(0, (uint32_t)frame_size, cx_data) !=
+              AOM_CODEC_OK) {
+            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+          }
+          frame_size += length_field_size;
+        }
+
+        ctx->pending_frame_sizes[ctx->pending_frame_count++] = frame_size;
+        ctx->pending_cx_data_sz += frame_size;
+
+        cx_data += frame_size;
+        cx_data_sz -= frame_size;
+
+        index_size = MAG_SIZE * (ctx->pending_frame_count - 1) + 2;
+
+        is_frame_visible = cpi->common.show_frame;
+
+        has_fwd_keyframe |=
+            (!is_frame_visible &&
+             cpi->common.current_frame.frame_type == KEY_FRAME);
+      }
+    }
+    if (is_frame_visible) {
+      // Add the frame packet to the list of returned packets.
+      aom_codec_cx_pkt_t pkt;
+
+      if (ctx->oxcf.save_as_annexb) {
+        // B_PRIME (add TU size)
+        size_t tu_size = ctx->pending_cx_data_sz;
+        const size_t length_field_size = aom_uleb_size_in_bytes(tu_size);
+        if (ctx->pending_cx_data) {
+          const size_t move_offset = length_field_size;
+          memmove(ctx->pending_cx_data + move_offset, ctx->pending_cx_data,
+                  tu_size);
+        }
+        if (av1_write_uleb_obu_size(0, (uint32_t)tu_size,
+                                    ctx->pending_cx_data) != AOM_CODEC_OK) {
+          aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+        }
+        ctx->pending_cx_data_sz += length_field_size;
+      }
+
+      pkt.kind = AOM_CODEC_CX_FRAME_PKT;
+
+      pkt.data.frame.buf = ctx->pending_cx_data;
+      pkt.data.frame.sz = ctx->pending_cx_data_sz;
+      pkt.data.frame.partition_id = -1;
+      pkt.data.frame.vis_frame_size = frame_size;
+
+      pkt.data.frame.pts =
+          ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) +
+          ctx->pts_offset;
+      pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
+      if (has_fwd_keyframe) {
+        // If one of the invisible frames in the packet is a keyframe, set
+        // the delayed random access point flag.
+        pkt.data.frame.flags |= AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT;
+      }
+      pkt.data.frame.duration = (uint32_t)ticks_to_timebase_units(
+          timestamp_ratio, dst_end_time_stamp - dst_time_stamp);
+
+      aom_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+
+      ctx->pending_cx_data = NULL;
+      ctx->pending_cx_data_sz = 0;
+      ctx->pending_frame_count = 0;
+    }
+  }
+
+  cpi->common.error.setjmp = 0;
+  return res;
+}
+
+static const aom_codec_cx_pkt_t *encoder_get_cxdata(aom_codec_alg_priv_t *ctx,
+                                                    aom_codec_iter_t *iter) {
+  return aom_codec_pkt_list_get(&ctx->pkt_list.head, iter);
+}
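encoder_encode queues each finished temporal unit on ctx->pkt_list, and encoder_get_cxdata drains it through the public iterator API. The usual application loop looks like this (editorial sketch, not part of the patch; assumes `codec` was opened with aom_codec_enc_init() and `out` is an open FILE *):

    aom_codec_iter_t iter = NULL;
    const aom_codec_cx_pkt_t *pkt;
    while ((pkt = aom_codec_get_cx_data(&codec, &iter)) != NULL) {
      if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
        /* one TU: any invisible frames plus the visible frame they precede */
        fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, out);
      }
    }
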
+static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
+
+  if (frame != NULL) {
+    YV12_BUFFER_CONFIG sd;
+
+    image2yuvconfig(&frame->img, &sd);
+    av1_set_reference_enc(ctx->cpi, frame->idx, &sd);
+    return AOM_CODEC_OK;
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+
+static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
+
+  if (frame != NULL) {
+    YV12_BUFFER_CONFIG sd;
+
+    image2yuvconfig(&frame->img, &sd);
+    av1_copy_reference_enc(ctx->cpi, frame->idx, &sd);
+    return AOM_CODEC_OK;
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+
+static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
+
+  if (frame != NULL) {
+    YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->cpi->common, frame->idx);
+    if (fb == NULL) return AOM_CODEC_ERROR;
+
+    yuvconfig2image(&frame->img, fb, NULL);
+    return AOM_CODEC_OK;
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+
+static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  aom_image_t *const new_img = va_arg(args, aom_image_t *);
+
+  if (new_img != NULL) {
+    YV12_BUFFER_CONFIG new_frame;
+
+    if (av1_get_last_show_frame(ctx->cpi, &new_frame) == 0) {
+      yuvconfig2image(new_img, &new_frame, NULL);
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+
+static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  aom_image_t *const new_img = va_arg(args, aom_image_t *);
+
+  if (new_img != NULL) {
+    YV12_BUFFER_CONFIG new_frame;
+
+    if (av1_get_last_show_frame(ctx->cpi, &new_frame) == 0) {
+      YV12_BUFFER_CONFIG sd;
+      image2yuvconfig(new_img, &sd);
+      return av1_copy_new_frame_enc(&ctx->cpi->common, &new_frame, &sd);
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+
+static aom_image_t *encoder_get_preview(aom_codec_alg_priv_t *ctx) {
+  YV12_BUFFER_CONFIG sd;
+
+  if (av1_get_preview_raw_frame(ctx->cpi, &sd) == 0) {
+    yuvconfig2image(&ctx->preview_img, &sd, NULL);
+    return &ctx->preview_img;
+  } else {
+    return NULL;
+  }
+}
+
+static aom_codec_err_t ctrl_use_reference(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  const int reference_flag = va_arg(args, int);
+
+  av1_use_as_reference(&ctx->cpi->ext_flags.ref_frame_flags, reference_flag);
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_roi_map(aom_codec_alg_priv_t *ctx,
+                                        va_list args) {
+  (void)ctx;
+  (void)args;
+
+  // TODO(yaowu): Need to re-implement and test for AV1.
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_set_active_map(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  aom_active_map_t *const map = va_arg(args, aom_active_map_t *);
+
+  if (map) {
+    if (!av1_set_active_map(ctx->cpi, map->active_map, (int)map->rows,
+                            (int)map->cols))
+      return AOM_CODEC_OK;
+    else
+      return AOM_CODEC_INVALID_PARAM;
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+
+static aom_codec_err_t ctrl_get_active_map(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  aom_active_map_t *const map = va_arg(args, aom_active_map_t *);
+
+  if (map) {
+    if (!av1_get_active_map(ctx->cpi, map->active_map, (int)map->rows,
+                            (int)map->cols))
+      return AOM_CODEC_OK;
+    else
+      return AOM_CODEC_INVALID_PARAM;
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+
+static aom_codec_err_t ctrl_set_scale_mode(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  aom_scaling_mode_t *const mode = va_arg(args, aom_scaling_mode_t *);
+
+  if (mode) {
+    const int res = av1_set_internal_size(
+        &ctx->cpi->oxcf, &ctx->cpi->resize_pending_params,
+        (AOM_SCALING)mode->h_scaling_mode, (AOM_SCALING)mode->v_scaling_mode);
+    return (res == 0) ? AOM_CODEC_OK : AOM_CODEC_INVALID_PARAM;
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+static aom_codec_err_t ctrl_set_spatial_layer_id(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  const int spatial_layer_id = va_arg(args, int);
+  if (spatial_layer_id >= MAX_NUM_SPATIAL_LAYERS)
+    return AOM_CODEC_INVALID_PARAM;
+  ctx->cpi->common.spatial_layer_id = spatial_layer_id;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_number_spatial_layers(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  const int number_spatial_layers = va_arg(args, int);
+  if (number_spatial_layers > MAX_NUM_SPATIAL_LAYERS)
+    return AOM_CODEC_INVALID_PARAM;
+  ctx->cpi->common.number_spatial_layers = number_spatial_layers;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_layer_id(aom_codec_alg_priv_t *ctx,
+                                         va_list args) {
+  aom_svc_layer_id_t *const data = va_arg(args, aom_svc_layer_id_t *);
+  ctx->cpi->common.spatial_layer_id = data->spatial_layer_id;
+  ctx->cpi->common.temporal_layer_id = data->temporal_layer_id;
+  ctx->cpi->svc.spatial_layer_id = data->spatial_layer_id;
+  ctx->cpi->svc.temporal_layer_id = data->temporal_layer_id;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  AV1_COMP *const cpi = ctx->cpi;
+  aom_svc_params_t *const params = va_arg(args, aom_svc_params_t *);
+  cpi->common.number_spatial_layers = params->number_spatial_layers;
+  cpi->common.number_temporal_layers = params->number_temporal_layers;
+  cpi->svc.number_spatial_layers = params->number_spatial_layers;
+  cpi->svc.number_temporal_layers = params->number_temporal_layers;
+  if (cpi->common.number_spatial_layers > 1 ||
+      cpi->common.number_temporal_layers > 1) {
+    unsigned int sl, tl;
+    cpi->use_svc = 1;
+    for (sl = 0; sl < cpi->common.number_spatial_layers; ++sl) {
+      for (tl = 0; tl < cpi->common.number_temporal_layers; ++tl) {
+        const int layer =
+            LAYER_IDS_TO_IDX(sl, tl, cpi->common.number_temporal_layers);
+        LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+        lc->max_q = params->max_quantizers[layer];
+        lc->min_q = params->min_quantizers[layer];
+        lc->scaling_factor_num = params->scaling_factor_num[sl];
+        lc->scaling_factor_den = params->scaling_factor_den[sl];
+        lc->layer_target_bitrate = 1000 * params->layer_target_bitrate[layer];
+        lc->framerate_factor = params->framerate_factor[tl];
+      }
+    }
+    if (cpi->common.current_frame.frame_number == 0)
+      av1_init_layer_context(cpi);
+    else
+      av1_update_layer_context_change_config(cpi, cpi->oxcf.target_bandwidth);
+  }
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_svc_ref_frame_config(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  AV1_COMP *const cpi = ctx->cpi;
+  aom_svc_ref_frame_config_t *const data =
+      va_arg(args, aom_svc_ref_frame_config_t *);
+  cpi->svc.external_ref_frame_config = 1;
+  for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    cpi->svc.reference[i] = data->reference[i];
+    cpi->svc.ref_idx[i] = data->ref_idx[i];
+  }
+  for (unsigned int i = 0; i < REF_FRAMES; ++i)
+    cpi->svc.refresh[i] = data->refresh[i];
+  return AOM_CODEC_OK;
+}
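ctrl_set_svc_params copies per-layer rate parameters indexed by LAYER_IDS_TO_IDX, with bitrates given in kbps (the handler multiplies by 1000). A minimal one-spatial, two-temporal-layer sketch from the caller's side (editorial, not part of the patch; values illustrative):

    aom_svc_params_t svc = { 0 };
    svc.number_spatial_layers = 1;
    svc.number_temporal_layers = 2;
    svc.scaling_factor_num[0] = 1;
    svc.scaling_factor_den[0] = 1;
    for (int tl = 0; tl < 2; ++tl) {
      svc.layer_target_bitrate[tl] = (tl == 0) ? 400 : 600;  /* kbps */
      svc.min_quantizers[tl] = 0;
      svc.max_quantizers[tl] = 63;
      svc.framerate_factor[tl] = (tl == 0) ? 2 : 1;
    }
    aom_codec_control(&codec, AV1E_SET_SVC_PARAMS, &svc);
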
+static aom_codec_err_t ctrl_set_tune_content(aom_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.content = CAST(AV1E_SET_TUNE_CONTENT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_cdf_update_mode(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.cdf_update_mode = CAST(AV1E_SET_CDF_UPDATE_MODE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_color_primaries(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.color_primaries = CAST(AV1E_SET_COLOR_PRIMARIES, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_transfer_characteristics(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.transfer_characteristics =
+      CAST(AV1E_SET_TRANSFER_CHARACTERISTICS, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_matrix_coefficients(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.matrix_coefficients = CAST(AV1E_SET_MATRIX_COEFFICIENTS, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_chroma_sample_position(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.chroma_sample_position =
+      CAST(AV1E_SET_CHROMA_SAMPLE_POSITION, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_color_range(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.color_range = CAST(AV1E_SET_COLOR_RANGE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_render_size(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  int *const render_size = va_arg(args, int *);
+  extra_cfg.render_width = render_size[0];
+  extra_cfg.render_height = render_size[1];
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_superblock_size(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.superblock_size = CAST(AV1E_SET_SUPERBLOCK_SIZE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_chroma_subsampling_x(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.chroma_subsampling_x = CAST(AV1E_SET_CHROMA_SUBSAMPLING_X, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_chroma_subsampling_y(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.chroma_subsampling_y = CAST(AV1E_SET_CHROMA_SUBSAMPLING_Y, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_get_seq_level_idx(aom_codec_alg_priv_t *ctx,
+                                              va_list args) {
+  int *const arg = va_arg(args, int *);
+  const AV1_COMP *const cpi = ctx->cpi;
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  return av1_get_seq_level_idx(&cpi->common.seq_params, &cpi->level_params,
+                               arg);
+}
+static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
+  { AV1_COPY_REFERENCE, ctrl_copy_reference },
+  { AOME_USE_REFERENCE, ctrl_use_reference },
+
+  // Setters
+  { AV1_SET_REFERENCE, ctrl_set_reference },
+  { AOME_SET_ROI_MAP, ctrl_set_roi_map },
+  { AOME_SET_ACTIVEMAP, ctrl_set_active_map },
+  { AOME_SET_SCALEMODE, ctrl_set_scale_mode },
+  { AOME_SET_SPATIAL_LAYER_ID, ctrl_set_spatial_layer_id },
+  { AOME_SET_CPUUSED, ctrl_set_cpuused },
+  { AOME_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref },
+  { AOME_SET_ENABLEAUTOBWDREF, ctrl_set_enable_auto_bwd_ref },
+  { AOME_SET_SHARPNESS, ctrl_set_sharpness },
+  { AOME_SET_STATIC_THRESHOLD, ctrl_set_static_thresh },
+  { AV1E_SET_ROW_MT, ctrl_set_row_mt },
+  { AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns },
+  { AV1E_SET_TILE_ROWS, ctrl_set_tile_rows },
+  { AV1E_SET_ENABLE_TPL_MODEL, ctrl_set_enable_tpl_model },
+  { AV1E_SET_ENABLE_KEYFRAME_FILTERING, ctrl_set_enable_keyframe_filtering },
+  { AOME_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames },
+  { AOME_SET_ARNR_STRENGTH, ctrl_set_arnr_strength },
+  { AOME_SET_TUNING, ctrl_set_tuning },
+  { AOME_SET_CQ_LEVEL, ctrl_set_cq_level },
+  { AOME_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct },
+  { AOME_SET_NUMBER_SPATIAL_LAYERS, ctrl_set_number_spatial_layers },
+  { AV1E_SET_MAX_INTER_BITRATE_PCT, ctrl_set_rc_max_inter_bitrate_pct },
+  { AV1E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct },
+  { AV1E_SET_LOSSLESS, ctrl_set_lossless },
+  { AV1E_SET_ENABLE_CDEF, ctrl_set_enable_cdef },
+  { AV1E_SET_ENABLE_RESTORATION, ctrl_set_enable_restoration },
+  { AV1E_SET_FORCE_VIDEO_MODE, ctrl_set_force_video_mode },
+  { AV1E_SET_ENABLE_OBMC, ctrl_set_enable_obmc },
+  { AV1E_SET_DISABLE_TRELLIS_QUANT, ctrl_set_disable_trellis_quant },
+  { AV1E_SET_ENABLE_QM, ctrl_set_enable_qm },
+  { AV1E_SET_QM_Y, ctrl_set_qm_y },
+  { AV1E_SET_QM_U, ctrl_set_qm_u },
+  { AV1E_SET_QM_V, ctrl_set_qm_v },
+  { AV1E_SET_QM_MIN, ctrl_set_qm_min },
+  { AV1E_SET_QM_MAX, ctrl_set_qm_max },
+  { AV1E_SET_NUM_TG, ctrl_set_num_tg },
+  { AV1E_SET_MTU, ctrl_set_mtu },
+  { AV1E_SET_TIMING_INFO_TYPE, ctrl_set_timing_info_type },
+  { AV1E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode },
+  { AV1E_SET_ERROR_RESILIENT_MODE, ctrl_set_error_resilient_mode },
+  { AV1E_SET_S_FRAME_MODE, ctrl_set_s_frame_mode },
+  { AV1E_SET_ENABLE_RECT_PARTITIONS, ctrl_set_enable_rect_partitions },
+  { AV1E_SET_ENABLE_AB_PARTITIONS, ctrl_set_enable_ab_partitions },
+  { AV1E_SET_ENABLE_1TO4_PARTITIONS, ctrl_set_enable_1to4_partitions },
+  { AV1E_SET_MIN_PARTITION_SIZE, ctrl_set_min_partition_size },
+  { AV1E_SET_MAX_PARTITION_SIZE, ctrl_set_max_partition_size },
+  { AV1E_SET_ENABLE_DUAL_FILTER, ctrl_set_enable_dual_filter },
+  { AV1E_SET_ENABLE_CHROMA_DELTAQ, ctrl_set_enable_chroma_deltaq },
+  { AV1E_SET_ENABLE_INTRA_EDGE_FILTER, ctrl_set_enable_intra_edge_filter },
+  { AV1E_SET_ENABLE_ORDER_HINT, ctrl_set_enable_order_hint },
+  { AV1E_SET_ENABLE_TX64, ctrl_set_enable_tx64 },
+  { AV1E_SET_ENABLE_FLIP_IDTX, ctrl_set_enable_flip_idtx },
+  { AV1E_SET_ENABLE_DIST_WTD_COMP, ctrl_set_enable_dist_wtd_comp },
+  { AV1E_SET_MAX_REFERENCE_FRAMES, ctrl_set_max_reference_frames },
+  { AV1E_SET_REDUCED_REFERENCE_SET, ctrl_set_enable_reduced_reference_set },
+  { AV1E_SET_ENABLE_REF_FRAME_MVS, ctrl_set_enable_ref_frame_mvs },
+  { AV1E_SET_ALLOW_REF_FRAME_MVS, ctrl_set_allow_ref_frame_mvs },
+  { AV1E_SET_ENABLE_MASKED_COMP, ctrl_set_enable_masked_comp },
+  { AV1E_SET_ENABLE_ONESIDED_COMP, ctrl_set_enable_onesided_comp },
+  { AV1E_SET_ENABLE_INTERINTRA_COMP, ctrl_set_enable_interintra_comp },
+  { AV1E_SET_ENABLE_SMOOTH_INTERINTRA, ctrl_set_enable_smooth_interintra },
+  { AV1E_SET_ENABLE_DIFF_WTD_COMP, ctrl_set_enable_diff_wtd_comp },
+  { AV1E_SET_ENABLE_INTERINTER_WEDGE, ctrl_set_enable_interinter_wedge },
+  { AV1E_SET_ENABLE_INTERINTRA_WEDGE, ctrl_set_enable_interintra_wedge },
+  { AV1E_SET_ENABLE_GLOBAL_MOTION, ctrl_set_enable_global_motion },
+  { AV1E_SET_ENABLE_WARPED_MOTION, ctrl_set_enable_warped_motion },
+  { AV1E_SET_ALLOW_WARPED_MOTION, ctrl_set_allow_warped_motion },
+  { AV1E_SET_ENABLE_FILTER_INTRA, ctrl_set_enable_filter_intra },
+  { AV1E_SET_ENABLE_SMOOTH_INTRA, ctrl_set_enable_smooth_intra },
+  { AV1E_SET_ENABLE_PAETH_INTRA, ctrl_set_enable_paeth_intra },
+  { AV1E_SET_ENABLE_CFL_INTRA, ctrl_set_enable_cfl_intra },
+  { AV1E_SET_ENABLE_SUPERRES, ctrl_set_enable_superres },
+  { AV1E_SET_ENABLE_OVERLAY, ctrl_set_enable_overlay },
+  { AV1E_SET_ENABLE_PALETTE, ctrl_set_enable_palette },
+  { AV1E_SET_ENABLE_INTRABC, ctrl_set_enable_intrabc },
+  { AV1E_SET_ENABLE_ANGLE_DELTA, ctrl_set_enable_angle_delta },
+  { AV1E_SET_AQ_MODE, ctrl_set_aq_mode },
+  { AV1E_SET_REDUCED_TX_TYPE_SET, ctrl_set_reduced_tx_type_set },
+  { AV1E_SET_INTRA_DCT_ONLY, ctrl_set_intra_dct_only },
+  { AV1E_SET_INTER_DCT_ONLY, ctrl_set_inter_dct_only },
+  { AV1E_SET_INTRA_DEFAULT_TX_ONLY, ctrl_set_intra_default_tx_only },
+  { AV1E_SET_QUANT_B_ADAPT, ctrl_set_quant_b_adapt },
+  { AV1E_SET_COEFF_COST_UPD_FREQ, ctrl_set_coeff_cost_upd_freq },
+  { AV1E_SET_MODE_COST_UPD_FREQ, ctrl_set_mode_cost_upd_freq },
+  { AV1E_SET_MV_COST_UPD_FREQ, ctrl_set_mv_cost_upd_freq },
+  { AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode },
+  { AV1E_SET_DELTALF_MODE, ctrl_set_deltalf_mode },
+  { AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost },
+  { AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content },
+  { AV1E_SET_CDF_UPDATE_MODE, ctrl_set_cdf_update_mode },
+  { AV1E_SET_COLOR_PRIMARIES, ctrl_set_color_primaries },
+  { AV1E_SET_TRANSFER_CHARACTERISTICS, ctrl_set_transfer_characteristics },
+  { AV1E_SET_MATRIX_COEFFICIENTS, ctrl_set_matrix_coefficients },
+  { AV1E_SET_CHROMA_SAMPLE_POSITION, ctrl_set_chroma_sample_position },
+  { AV1E_SET_COLOR_RANGE, ctrl_set_color_range },
+  { AV1E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity },
+  { AV1E_SET_MIN_GF_INTERVAL, ctrl_set_min_gf_interval },
+  { AV1E_SET_MAX_GF_INTERVAL, ctrl_set_max_gf_interval },
+  { AV1E_SET_GF_MIN_PYRAMID_HEIGHT, ctrl_set_gf_min_pyr_height },
+  { AV1E_SET_GF_MAX_PYRAMID_HEIGHT, ctrl_set_gf_max_pyr_height },
+  { AV1E_SET_RENDER_SIZE, ctrl_set_render_size },
+  { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size },
+  { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding },
+  { AV1E_SET_VMAF_MODEL_PATH, ctrl_set_vmaf_model_path },
+  { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector },
+  { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table },
+  { AV1E_SET_DENOISE_NOISE_LEVEL, ctrl_set_denoise_noise_level },
+  { AV1E_SET_DENOISE_BLOCK_SIZE, ctrl_set_denoise_block_size },
+  { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
+  { AV1E_ENABLE_EXT_TILE_DEBUG, ctrl_enable_ext_tile_debug },
+  { AV1E_SET_TARGET_SEQ_LEVEL_IDX, ctrl_set_target_seq_level_idx },
+  { AV1E_SET_TIER_MASK, ctrl_set_tier_mask },
+  { AV1E_SET_MIN_CR, ctrl_set_min_cr },
+  { AV1E_SET_SVC_LAYER_ID, ctrl_set_layer_id },
+  { AV1E_SET_SVC_PARAMS, ctrl_set_svc_params },
+  { AV1E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config },
+  { AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, ctrl_enable_sb_multipass_unit_test },
+
+  // Getters
+  { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer },
+  { AOME_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 },
+  { AV1_GET_REFERENCE, ctrl_get_reference },
+  { AV1E_GET_ACTIVEMAP, ctrl_get_active_map },
+  { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
+  { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image },
+  { AV1E_SET_CHROMA_SUBSAMPLING_X, ctrl_set_chroma_subsampling_x },
+  { AV1E_SET_CHROMA_SUBSAMPLING_Y, ctrl_set_chroma_subsampling_y },
+  { AV1E_GET_SEQ_LEVEL_IDX, ctrl_get_seq_level_idx },
+  { -1, NULL },
+};
+
+static const aom_codec_enc_cfg_t encoder_usage_cfg[] = {
+  {
+      // NOLINT
+      AOM_USAGE_GOOD_QUALITY,  // g_usage - non-realtime usage
+      0,                       // g_threads
+      0,                       // g_profile
+
+      320,         // g_width
+      240,         // g_height
+      0,           // g_limit
+      0,           // g_forced_max_frame_width
+      0,           // g_forced_max_frame_height
+      AOM_BITS_8,  // g_bit_depth
+      8,           // g_input_bit_depth
+
+      { 1, 30 },  // g_timebase
+
+      0,  // g_error_resilient
+
+      AOM_RC_ONE_PASS,  // g_pass
+
+      19,  // g_lag_in_frames
+
+      0,                // rc_dropframe_thresh
+      RESIZE_NONE,      // rc_resize_mode
+      SCALE_NUMERATOR,  // rc_resize_denominator
+      SCALE_NUMERATOR,  // rc_resize_kf_denominator
+
+      SUPERRES_NONE,    // rc_superres_mode
+      SCALE_NUMERATOR,  // rc_superres_denominator
+      SCALE_NUMERATOR,  // rc_superres_kf_denominator
+      63,               // rc_superres_qthresh
+      32,               // rc_superres_kf_qthresh
+
+      AOM_VBR,      // rc_end_usage
+      { NULL, 0 },  // rc_twopass_stats_in
+      { NULL, 0 },  // rc_firstpass_mb_stats_in
+      256,          // rc_target_bandwidth
+      0,            // rc_min_quantizer
+      63,           // rc_max_quantizer
+      25,           // rc_undershoot_pct
+      25,           // rc_overshoot_pct
+
+      6000,  // rc_max_buffer_size
+      4000,  // rc_buffer_initial_size
+      5000,  // rc_buffer_optimal_size
+
+      50,    // rc_two_pass_vbrbias
+      0,     // rc_two_pass_vbrmin_section
+      2000,  // rc_two_pass_vbrmax_section
+
+      // keyframing settings (kf)
+      0,            // fwd_kf_enabled
+      AOM_KF_AUTO,  // g_kfmode
+      0,            // kf_min_dist
+      9999,         // kf_max_dist
+      0,            // sframe_dist
+      1,            // sframe_mode
+      0,            // large_scale_tile
+      0,            // monochrome
+      0,            // full_still_picture_hdr
+      0,            // save_as_annexb
+      0,            // tile_width_count
+      0,            // tile_height_count
+      { 0 },        // tile_widths
+      { 0 },        // tile_heights
+      0,            // use_fixed_qp_offsets
+      { -1, -1, -1, -1, -1 },  // fixed_qp_offsets
+      { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  // cfg
+  },
+  {
+      // NOLINT
+      AOM_USAGE_REALTIME,  // g_usage - real-time usage
+      0,                   // g_threads
+      0,                   // g_profile
+
+      320,         // g_width
+      240,         // g_height
+      0,           // g_limit
+      0,           // g_forced_max_frame_width
+      0,           // g_forced_max_frame_height
+      AOM_BITS_8,  // g_bit_depth
+      8,           // g_input_bit_depth
+
+      { 1, 30 },  // g_timebase
+
+      0,  // g_error_resilient
+
+      AOM_RC_ONE_PASS,  // g_pass
+
+      1,  // g_lag_in_frames
+
+      0,                // rc_dropframe_thresh
+      RESIZE_NONE,      // rc_resize_mode
+      SCALE_NUMERATOR,  // rc_resize_denominator
+      SCALE_NUMERATOR,  // rc_resize_kf_denominator
+
+      0,                // rc_superres_mode
+      SCALE_NUMERATOR,  // rc_superres_denominator
+      SCALE_NUMERATOR,  // rc_superres_kf_denominator
+      63,               // rc_superres_qthresh
+      32,               // rc_superres_kf_qthresh
+
+      AOM_CBR,      // rc_end_usage
+      { NULL, 0 },  // rc_twopass_stats_in
+      { NULL, 0 },  // rc_firstpass_mb_stats_in
+      256,          // rc_target_bandwidth
+      0,            // rc_min_quantizer
+      63,           // rc_max_quantizer
+      25,           // rc_undershoot_pct
+      25,           // rc_overshoot_pct
+
+      6000,  // rc_max_buffer_size
+      4000,  // rc_buffer_initial_size
+      5000,  // rc_buffer_optimal_size
+
+      50,    // rc_two_pass_vbrbias
+      0,     // rc_two_pass_vbrmin_section
+      2000,  // rc_two_pass_vbrmax_section
+
+      // keyframing settings (kf)
+      0,            // fwd_kf_enabled
+      AOM_KF_AUTO,  // g_kfmode
+      0,            // kf_min_dist
+      9999,         // kf_max_dist
+      0,            // sframe_dist
+      1,            // sframe_mode
+      0,            // large_scale_tile
+      0,            // monochrome
+      0,            // full_still_picture_hdr
+      0,            // save_as_annexb
+      0,            // tile_width_count
+      0,            // tile_height_count
+      { 0 },        // tile_widths
+      { 0 },        // tile_heights
+      0,            // use_fixed_qp_offsets
+      { -1, -1, -1, -1, -1 },  // fixed_qp_offsets
+      { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  // cfg
+  },
+};
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
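The two templates above correspond to AOM_USAGE_GOOD_QUALITY (VBR, 19-frame lag) and AOM_USAGE_REALTIME (CBR, 1-frame lag). Applications select one when requesting default configuration (editorial sketch, not part of the patch):

    aom_codec_enc_cfg_t cfg;
    /* Copies the second (realtime) template above into cfg. */
    aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, AOM_USAGE_REALTIME);
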
+CODEC_INTERFACE(aom_codec_av1_cx) = {
+  "AOMedia Project AV1 Encoder" VERSION_STRING,
+  AOM_CODEC_INTERNAL_ABI_VERSION,
+  AOM_CODEC_CAP_HIGHBITDEPTH | AOM_CODEC_CAP_ENCODER |
+      AOM_CODEC_CAP_PSNR,  // aom_codec_caps_t
+  encoder_init,            // aom_codec_init_fn_t
+  encoder_destroy,         // aom_codec_destroy_fn_t
+  encoder_ctrl_maps,       // aom_codec_ctrl_fn_map_t
+  {
+      // NOLINT
+      NULL,  // aom_codec_peek_si_fn_t
+      NULL,  // aom_codec_get_si_fn_t
+      NULL,  // aom_codec_decode_fn_t
+      NULL,  // aom_codec_get_frame_fn_t
+      NULL   // aom_codec_set_fb_fn_t
+  },
+  {
+      // NOLINT
+      2,                           // 2 cfg
+      encoder_usage_cfg,           // aom_codec_enc_cfg_t
+      encoder_encode,              // aom_codec_encode_fn_t
+      encoder_get_cxdata,          // aom_codec_get_cx_data_fn_t
+      encoder_set_config,          // aom_codec_enc_config_set_fn_t
+      encoder_get_global_headers,  // aom_codec_get_global_headers_fn_t
+      encoder_get_preview          // aom_codec_get_preview_frame_fn_t
+  }
+};
diff --git a/libs/libaom/src/av1/av1_dx_iface.c b/libs/libaom/src/av1/av1_dx_iface.c
new file mode 100644
index 000000000..d821a52f6
--- /dev/null
+++ b/libs/libaom/src/av1/av1_dx_iface.c
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
+#include "aom/internal/aom_codec_internal.h"
+#include "aom/internal/aom_image_internal.h"
+#include "aom/aomdx.h"
+#include "aom/aom_decoder.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_util/aom_thread.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/frame_buffers.h"
+#include "av1/common/enums.h"
+#include "av1/common/obu_util.h"
+
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/obu.h"
+
+#include "av1/av1_iface_common.h"
+
+struct aom_codec_alg_priv {
+  aom_codec_priv_t base;
+  aom_codec_dec_cfg_t cfg;
+  aom_codec_stream_info_t si;
+  aom_image_t img;
+  int img_avail;
+  int flushed;
+  int invert_tile_order;
+  RefCntBuffer *last_show_frame;  // Last output frame buffer
+  int byte_alignment;
+  int skip_loop_filter;
+  int skip_film_grain;
+  int decode_tile_row;
+  int decode_tile_col;
+  unsigned int tile_mode;
+  unsigned int ext_tile_debug;
+  unsigned int row_mt;
+  EXTERNAL_REFERENCES ext_refs;
+  unsigned int is_annexb;
+  int operating_point;
+  int output_all_layers;
+
+  AVxWorker *frame_worker;
+
+  aom_image_t image_with_grain;
+  aom_codec_frame_buffer_t grain_image_frame_buffers[MAX_NUM_SPATIAL_LAYERS];
+  size_t num_grain_image_frame_buffers;
+  int need_resync;  // wait for key/intra-only frame
+  // BufferPool that holds all reference frames. Shared by all the
+  // FrameWorkers.
+  BufferPool *buffer_pool;
+
+  // External frame buffer info to save for AV1 common.
+  void *ext_priv;  // Private data associated with the external frame buffers.
+  aom_get_frame_buffer_cb_fn_t get_ext_fb_cb;
+  aom_release_frame_buffer_cb_fn_t release_ext_fb_cb;
+
+#if CONFIG_INSPECTION
+  aom_inspect_cb inspect_cb;
+  void *inspect_ctx;
+#endif
+};
+
+static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx) {
+  // This function only allocates space for the aom_codec_alg_priv_t
+  // structure. More memory may be required at the time the stream
+  // information becomes known.
+  if (!ctx->priv) {
+    aom_codec_alg_priv_t *const priv =
+        (aom_codec_alg_priv_t *)aom_calloc(1, sizeof(*priv));
+    if (priv == NULL) return AOM_CODEC_MEM_ERROR;
+
+    ctx->priv = (aom_codec_priv_t *)priv;
+    ctx->priv->init_flags = ctx->init_flags;
+    priv->flushed = 0;
+
+    // TODO(tdaede): this should not be exposed to the API
+    priv->cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
+    if (ctx->config.dec) {
+      priv->cfg = *ctx->config.dec;
+      ctx->config.dec = &priv->cfg;
+    }
+    priv->num_grain_image_frame_buffers = 0;
+    // Turn row_mt on by default.
+    priv->row_mt = 1;
+
+    // Turn on normal tile coding mode by default.
+    // 0 is for normal tile coding mode, and 1 is for large scale tile coding
+    // mode (refer to the lightfield example).
+    priv->tile_mode = 0;
+    priv->decode_tile_row = -1;
+    priv->decode_tile_col = -1;
+  }
+
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
+  if (ctx->frame_worker != NULL) {
+    AVxWorker *const worker = ctx->frame_worker;
+    FrameWorkerData *const frame_worker_data =
+        (FrameWorkerData *)worker->data1;
+    aom_get_worker_interface()->end(worker);
+    aom_free(frame_worker_data->pbi->common.tpl_mvs);
+    frame_worker_data->pbi->common.tpl_mvs = NULL;
+    av1_remove_common(&frame_worker_data->pbi->common);
+    av1_free_restoration_buffers(&frame_worker_data->pbi->common);
+    av1_decoder_remove(frame_worker_data->pbi);
+    aom_free(frame_worker_data);
+#if CONFIG_MULTITHREAD
+    pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+  }
+
+  if (ctx->buffer_pool) {
+    for (size_t i = 0; i < ctx->num_grain_image_frame_buffers; i++) {
+      ctx->buffer_pool->release_fb_cb(ctx->buffer_pool->cb_priv,
+                                      &ctx->grain_image_frame_buffers[i]);
+    }
+    av1_free_ref_frame_buffers(ctx->buffer_pool);
+    av1_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
+  }
+
+  aom_free(ctx->frame_worker);
+  aom_free(ctx->buffer_pool);
+  aom_img_free(&ctx->img);
+  aom_free(ctx);
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t parse_timing_info(struct aom_read_bit_buffer *rb) {
+  const uint32_t num_units_in_display_tick =
+      aom_rb_read_unsigned_literal(rb, 32);
+  const uint32_t time_scale = aom_rb_read_unsigned_literal(rb, 32);
+  if (num_units_in_display_tick == 0 || time_scale == 0)
+    return AOM_CODEC_UNSUP_BITSTREAM;
+  const uint8_t equal_picture_interval = aom_rb_read_bit(rb);
+  if (equal_picture_interval) {
+    const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb);
+    if (num_ticks_per_picture_minus_1 == UINT32_MAX) {
+      // num_ticks_per_picture_minus_1 cannot be (1 << 32) - 1.
+      return AOM_CODEC_UNSUP_BITSTREAM;
+    }
+  }
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t parse_decoder_model_info(
+    struct aom_read_bit_buffer *rb, int *buffer_delay_length_minus_1) {
+  *buffer_delay_length_minus_1 = aom_rb_read_literal(rb, 5);
+  const uint32_t num_units_in_decoding_tick =
+      aom_rb_read_unsigned_literal(rb, 32);
+  const uint8_t buffer_removal_time_length_minus_1 =
+      aom_rb_read_literal(rb, 5);
+  const uint8_t frame_presentation_time_length_minus_1 =
+      aom_rb_read_literal(rb, 5);
+  (void)num_units_in_decoding_tick;
+  (void)buffer_removal_time_length_minus_1;
+  (void)frame_presentation_time_length_minus_1;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t parse_op_parameters_info(
+    struct aom_read_bit_buffer *rb, int buffer_delay_length_minus_1) {
+  const int n = buffer_delay_length_minus_1 + 1;
+  const uint32_t decoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n);
+  const uint32_t encoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n);
+  const uint8_t low_delay_mode_flag = aom_rb_read_bit(rb);
+  (void)decoder_buffer_delay;
+  (void)encoder_buffer_delay;
+  (void)low_delay_mode_flag;
+  return AOM_CODEC_OK;
+}
+
+// Parses the operating points (including operating_point_idc, seq_level_idx,
+// and seq_tier) and then sets si->number_spatial_layers and
+// si->number_temporal_layers based on operating_point_idc[0].
+static aom_codec_err_t parse_operating_points(struct aom_read_bit_buffer *rb,
+                                              int is_reduced_header,
+                                              aom_codec_stream_info_t *si) {
+  int operating_point_idc0 = 0;
+  if (is_reduced_header) {
+    aom_rb_read_literal(rb, LEVEL_BITS);  // level
+  } else {
+    uint8_t decoder_model_info_present_flag = 0;
+    int buffer_delay_length_minus_1 = 0;
+    aom_codec_err_t status;
+    const uint8_t timing_info_present_flag = aom_rb_read_bit(rb);
+    if (timing_info_present_flag) {
+      if ((status = parse_timing_info(rb)) != AOM_CODEC_OK) return status;
+      decoder_model_info_present_flag = aom_rb_read_bit(rb);
+      if (decoder_model_info_present_flag) {
+        if ((status = parse_decoder_model_info(
+                 rb, &buffer_delay_length_minus_1)) != AOM_CODEC_OK)
+          return status;
+      }
+    }
+    const uint8_t initial_display_delay_present_flag = aom_rb_read_bit(rb);
+    const uint8_t operating_points_cnt_minus_1 =
+        aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS);
+    for (int i = 0; i < operating_points_cnt_minus_1 + 1; i++) {
+      int operating_point_idc;
+      operating_point_idc = aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
+      if (i == 0) operating_point_idc0 = operating_point_idc;
+      int seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS);  // level
+      if (seq_level_idx > 7) aom_rb_read_bit(rb);               // tier
+      if (decoder_model_info_present_flag) {
+        const uint8_t decoder_model_present_for_this_op = aom_rb_read_bit(rb);
+        if (decoder_model_present_for_this_op) {
+          if ((status = parse_op_parameters_info(
+                   rb, buffer_delay_length_minus_1)) != AOM_CODEC_OK)
+            return status;
+        }
+      }
+      if (initial_display_delay_present_flag) {
+        const uint8_t initial_display_delay_present_for_this_op =
+            aom_rb_read_bit(rb);
+        if (initial_display_delay_present_for_this_op)
+          aom_rb_read_literal(rb, 4);  // initial_display_delay_minus_1
+      }
+    }
+  }
+
+  if (aom_get_num_layers_from_operating_point_idc(
+          operating_point_idc0, &si->number_spatial_layers,
+          &si->number_temporal_layers) != AOM_CODEC_OK) {
+    return AOM_CODEC_ERROR;
+  }
+
+  return AOM_CODEC_OK;
+}
+static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
+                                                size_t data_sz,
+                                                aom_codec_stream_info_t *si,
+                                                int *is_intra_only) {
+  int intra_only_flag = 0;
+  int got_sequence_header = 0;
+  int found_keyframe = 0;
+
+  if (data + data_sz <= data || data_sz < 1) return AOM_CODEC_INVALID_PARAM;
+
+  si->w = 0;
+  si->h = 0;
+  si->is_kf = 0;  // is_kf indicates whether the current packet contains a RAP
+
+  ObuHeader obu_header;
+  memset(&obu_header, 0, sizeof(obu_header));
+  size_t payload_size = 0;
+  size_t bytes_read = 0;
+  uint8_t reduced_still_picture_hdr = 0;
+  aom_codec_err_t status = aom_read_obu_header_and_size(
+      data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+  if (status != AOM_CODEC_OK) return status;
+
+  // If the first OBU is a temporal delimiter, skip over it and look at the
+  // next OBU in the bitstream
+  if (obu_header.type == OBU_TEMPORAL_DELIMITER) {
+    // Skip any associated payload (there shouldn't be one, but just in case)
+    if (data_sz < bytes_read + payload_size) return AOM_CODEC_CORRUPT_FRAME;
+    data += bytes_read + payload_size;
+    data_sz -= bytes_read + payload_size;
+
+    status = aom_read_obu_header_and_size(
+        data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+    if (status != AOM_CODEC_OK) return status;
+  }
+  while (1) {
+    data += bytes_read;
+    data_sz -= bytes_read;
+    if (data_sz < payload_size) return AOM_CODEC_CORRUPT_FRAME;
+    // Check that the selected OBU is a sequence header
+    if (obu_header.type == OBU_SEQUENCE_HEADER) {
+      // Sanity check on sequence header size
+      if (data_sz < 2) return AOM_CODEC_CORRUPT_FRAME;
+      // Read a few values from the sequence header payload
+      struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
+
+      av1_read_profile(&rb);  // profile
+      const uint8_t still_picture = aom_rb_read_bit(&rb);
+      reduced_still_picture_hdr = aom_rb_read_bit(&rb);
+
+      if (!still_picture && reduced_still_picture_hdr) {
+        return AOM_CODEC_UNSUP_BITSTREAM;
+      }
+
+      if (parse_operating_points(&rb, reduced_still_picture_hdr, si) !=
+          AOM_CODEC_OK) {
+        return AOM_CODEC_ERROR;
+      }
+
+      int num_bits_width = aom_rb_read_literal(&rb, 4) + 1;
+      int num_bits_height = aom_rb_read_literal(&rb, 4) + 1;
+      int max_frame_width = aom_rb_read_literal(&rb, num_bits_width) + 1;
+      int max_frame_height = aom_rb_read_literal(&rb, num_bits_height) + 1;
+      si->w = max_frame_width;
+      si->h = max_frame_height;
+      got_sequence_header = 1;
+    } else if (obu_header.type == OBU_FRAME_HEADER ||
+               obu_header.type == OBU_FRAME) {
+      if (got_sequence_header && reduced_still_picture_hdr) {
+        found_keyframe = 1;
+        break;
+      } else {
+        // make sure we have enough bits to get the frame type out
+        if (data_sz < 1) return AOM_CODEC_CORRUPT_FRAME;
+        struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL,
+                                          NULL };
+        const int show_existing_frame = aom_rb_read_bit(&rb);
+        if (!show_existing_frame) {
+          const FRAME_TYPE frame_type =
+              (FRAME_TYPE)aom_rb_read_literal(&rb, 2);
+          if (frame_type == KEY_FRAME) {
+            found_keyframe = 1;
+            break;  // Stop here as no further OBUs will change the outcome.
+          } else if (frame_type == INTRA_ONLY_FRAME) {
+            intra_only_flag = 1;
+          }
+        }
+      }
+    }
+    // skip past any unread OBU header data
+    data += payload_size;
+    data_sz -= payload_size;
+    if (data_sz == 0) break;  // exit if we're out of OBUs
+    status = aom_read_obu_header_and_size(
+        data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+    if (status != AOM_CODEC_OK) return status;
+  }
+  if (got_sequence_header && found_keyframe) si->is_kf = 1;
+  if (is_intra_only != NULL) *is_intra_only = intra_only_flag;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t decoder_peek_si(const uint8_t *data, size_t data_sz,
+                                       aom_codec_stream_info_t *si) {
+  return decoder_peek_si_internal(data, data_sz, si, NULL);
+}
+
+static aom_codec_err_t decoder_get_si(aom_codec_alg_priv_t *ctx,
+                                      aom_codec_stream_info_t *si) {
+  memcpy(si, &ctx->si, sizeof(*si));
+
+  return AOM_CODEC_OK;
+}
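decoder_peek_si is exposed to applications through aom_codec_peek_stream_info(), which lets a caller learn the maximum frame size and random-access status of a buffer before initializing a decoder (editorial sketch, not part of the patch; `data`/`data_sz` name a compressed buffer the caller already holds):

    aom_codec_stream_info_t si;
    memset(&si, 0, sizeof(si));
    if (aom_codec_peek_stream_info(aom_codec_av1_dx(), data, data_sz, &si) ==
        AOM_CODEC_OK) {
      /* si.w / si.h hold max_frame_width/height from the sequence header;
       * si.is_kf signals that the buffer starts at a random access point. */
    }
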
+ frame_worker_data->pbi->need_resync = 1; + } + return !result; +} + +static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + + ctx->last_show_frame = NULL; + ctx->need_resync = 1; + ctx->flushed = 0; + + ctx->buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool)); + if (ctx->buffer_pool == NULL) return AOM_CODEC_MEM_ERROR; + +#if CONFIG_MULTITHREAD + if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) { + set_error_detail(ctx, "Failed to allocate buffer pool mutex"); + return AOM_CODEC_MEM_ERROR; + } +#endif + + ctx->frame_worker = (AVxWorker *)aom_malloc(sizeof(*ctx->frame_worker)); + if (ctx->frame_worker == NULL) { + set_error_detail(ctx, "Failed to allocate frame_worker"); + return AOM_CODEC_MEM_ERROR; + } + + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *frame_worker_data = NULL; + winterface->init(worker); + worker->thread_name = "aom frameworker"; + worker->data1 = aom_memalign(32, sizeof(FrameWorkerData)); + if (worker->data1 == NULL) { + set_error_detail(ctx, "Failed to allocate frame_worker_data"); + return AOM_CODEC_MEM_ERROR; + } + frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi = av1_decoder_create(ctx->buffer_pool); + if (frame_worker_data->pbi == NULL) { + set_error_detail(ctx, "Failed to allocate frame_worker_data"); + return AOM_CODEC_MEM_ERROR; + } + frame_worker_data->frame_context_ready = 0; + frame_worker_data->received_frame = 0; + frame_worker_data->pbi->allow_lowbitdepth = ctx->cfg.allow_lowbitdepth; + + // If decoding in serial mode, FrameWorker thread could create tile worker + // thread or loopfilter thread. + frame_worker_data->pbi->max_threads = ctx->cfg.threads; + frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order; + frame_worker_data->pbi->common.tiles.large_scale = ctx->tile_mode; + frame_worker_data->pbi->is_annexb = ctx->is_annexb; + frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row; + frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col; + frame_worker_data->pbi->operating_point = ctx->operating_point; + frame_worker_data->pbi->output_all_layers = ctx->output_all_layers; + frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug; + frame_worker_data->pbi->row_mt = ctx->row_mt; + + worker->hook = frame_worker_hook; + + init_buffer_callbacks(ctx); + + return AOM_CODEC_OK; +} + +static INLINE void check_resync(aom_codec_alg_priv_t *const ctx, + const AV1Decoder *const pbi) { + // Clear resync flag if worker got a key frame or intra only frame. + if (ctx->need_resync == 1 && pbi->need_resync == 0 && + frame_is_intra_only(&pbi->common)) + ctx->need_resync = 0; +} + +static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx, + const uint8_t **data, size_t data_sz, + void *user_priv) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + + // Determine the stream parameters. Note that we rely on peek_si to + // validate that we have a buffer that does not wrap around the top + // of the heap. 
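+  // ctx->si is only filled in once. Until a keyframe or intra-only frame
+  // has been seen there is nothing the decoder could reconstruct, so the
+  // buffer is rejected unless peeking finds one of the two.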
+ if (!ctx->si.h) { + int is_intra_only = 0; + ctx->si.is_annexb = ctx->is_annexb; + const aom_codec_err_t res = + decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only); + if (res != AOM_CODEC_OK) return res; + + if (!ctx->si.is_kf && !is_intra_only) return AOM_CODEC_ERROR; + } + + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->data = *data; + frame_worker_data->data_size = data_sz; + frame_worker_data->user_priv = user_priv; + frame_worker_data->received_frame = 1; + + frame_worker_data->pbi->common.tiles.large_scale = ctx->tile_mode; + frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row; + frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col; + frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug; + frame_worker_data->pbi->row_mt = ctx->row_mt; + frame_worker_data->pbi->ext_refs = ctx->ext_refs; + + frame_worker_data->pbi->is_annexb = ctx->is_annexb; + + worker->had_error = 0; + winterface->execute(worker); + + // Update data pointer after decode. + *data = frame_worker_data->data_end; + + if (worker->had_error) + return update_error_state(ctx, &frame_worker_data->pbi->common.error); + + check_resync(ctx, frame_worker_data->pbi); + + return AOM_CODEC_OK; +} + +#if CONFIG_INSPECTION +// This function enables the inspector to inspect non visible frames. +static aom_codec_err_t decoder_inspect(aom_codec_alg_priv_t *ctx, + const uint8_t *data, size_t data_sz, + void *user_priv) { + aom_codec_err_t res = AOM_CODEC_OK; + + const uint8_t *const data_end = data + data_sz; + Av1DecodeReturn *data2 = (Av1DecodeReturn *)user_priv; + + if (ctx->frame_worker == NULL) { + res = init_decoder(ctx); + if (res != AOM_CODEC_OK) return res; + } + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)ctx->frame_worker->data1; + AV1Decoder *const pbi = frame_worker_data->pbi; + AV1_COMMON *const cm = &pbi->common; + frame_worker_data->pbi->inspect_cb = ctx->inspect_cb; + frame_worker_data->pbi->inspect_ctx = ctx->inspect_ctx; + res = av1_receive_compressed_data(frame_worker_data->pbi, data_sz, &data); + check_resync(ctx, frame_worker_data->pbi); + + if (ctx->frame_worker->had_error) + return update_error_state(ctx, &frame_worker_data->pbi->common.error); + + // Allow extra zero bytes after the frame end + while (data < data_end) { + const uint8_t marker = data[0]; + if (marker) break; + ++data; + } + + data2->idx = -1; + for (int i = 0; i < REF_FRAMES; ++i) + if (cm->ref_frame_map[i] == cm->cur_frame) data2->idx = i; + data2->buf = data; + data2->show_existing = cm->show_existing_frame; + return res; +} +#endif + +static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx, + const uint8_t *data, size_t data_sz, + void *user_priv) { + aom_codec_err_t res = AOM_CODEC_OK; + +#if CONFIG_INSPECTION + if (user_priv != 0) { + return decoder_inspect(ctx, data, data_sz, user_priv); + } +#endif + // Release any pending output frames from the previous decoder_decode call. + // We need to do this even if the decoder is being flushed or the input + // arguments are invalid. 
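+  // Frames handed out by the previous call are ref-counted in the buffer
+  // pool; dropping those references (plus any film-grain copies) here keeps
+  // the pool from leaking when the caller never collected them with
+  // decoder_get_frame().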
+ if (ctx->frame_worker) { + BufferPool *const pool = ctx->buffer_pool; + lock_buffer_pool(pool); + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + struct AV1Decoder *pbi = frame_worker_data->pbi; + for (size_t j = 0; j < pbi->num_output_frames; j++) { + decrease_ref_count(pbi->output_frames[j], pool); + } + pbi->num_output_frames = 0; + unlock_buffer_pool(pool); + for (size_t j = 0; j < ctx->num_grain_image_frame_buffers; j++) { + pool->release_fb_cb(pool->cb_priv, &ctx->grain_image_frame_buffers[j]); + ctx->grain_image_frame_buffers[j].data = NULL; + ctx->grain_image_frame_buffers[j].size = 0; + ctx->grain_image_frame_buffers[j].priv = NULL; + } + ctx->num_grain_image_frame_buffers = 0; + } + + /* Sanity checks */ + /* NULL data ptr allowed if data_sz is 0 too */ + if (data == NULL && data_sz == 0) { + ctx->flushed = 1; + return AOM_CODEC_OK; + } + if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM; + + // Reset flushed when receiving a valid frame. + ctx->flushed = 0; + + // Initialize the decoder worker on the first frame. + if (ctx->frame_worker == NULL) { + res = init_decoder(ctx); + if (res != AOM_CODEC_OK) return res; + } + + const uint8_t *data_start = data; + const uint8_t *data_end = data + data_sz; + + if (ctx->is_annexb) { + // read the size of this temporal unit + size_t length_of_size; + uint64_t temporal_unit_size; + if (aom_uleb_decode(data_start, data_sz, &temporal_unit_size, + &length_of_size) != 0) { + return AOM_CODEC_CORRUPT_FRAME; + } + data_start += length_of_size; + if (temporal_unit_size > (size_t)(data_end - data_start)) + return AOM_CODEC_CORRUPT_FRAME; + data_end = data_start + temporal_unit_size; + } + + // Decode in serial mode. + while (data_start < data_end) { + uint64_t frame_size; + if (ctx->is_annexb) { + // read the size of this frame unit + size_t length_of_size; + if (aom_uleb_decode(data_start, (size_t)(data_end - data_start), + &frame_size, &length_of_size) != 0) { + return AOM_CODEC_CORRUPT_FRAME; + } + data_start += length_of_size; + if (frame_size > (size_t)(data_end - data_start)) + return AOM_CODEC_CORRUPT_FRAME; + } else { + frame_size = (uint64_t)(data_end - data_start); + } + + res = decode_one(ctx, &data_start, (size_t)frame_size, user_priv); + if (res != AOM_CODEC_OK) return res; + + // Allow extra zero bytes after the frame end + while (data_start < data_end) { + const uint8_t marker = data_start[0]; + if (marker) break; + ++data_start; + } + } + + return res; +} + +typedef struct { + BufferPool *pool; + aom_codec_frame_buffer_t *fb; +} AllocCbParam; + +static void *AllocWithGetFrameBufferCb(void *priv, size_t size) { + AllocCbParam *param = (AllocCbParam *)priv; + if (param->pool->get_fb_cb(param->pool->cb_priv, size, param->fb) < 0) + return NULL; + if (param->fb->data == NULL || param->fb->size < size) return NULL; + return param->fb->data; +} + +// If grain_params->apply_grain is false, returns img. Otherwise, adds film +// grain to img, saves the result in grain_img, and returns grain_img. 
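+// The grain image is allocated through the pool's frame-buffer callback and
+// recorded in ctx->grain_image_frame_buffers so the next decoder_decode call
+// can release it. A sketch of the only intended call site (see
+// decoder_get_frame below):
+//   aom_image_t *res =
+//       add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params);
+//   if (!res) { /* allocation or grain synthesis failed */ }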
+static aom_image_t *add_grain_if_needed(aom_codec_alg_priv_t *ctx,
+                                        aom_image_t *img,
+                                        aom_image_t *grain_img,
+                                        aom_film_grain_t *grain_params) {
+  if (!grain_params->apply_grain) return img;
+
+  const int w_even = ALIGN_POWER_OF_TWO(img->d_w, 1);
+  const int h_even = ALIGN_POWER_OF_TWO(img->d_h, 1);
+
+  BufferPool *const pool = ctx->buffer_pool;
+  aom_codec_frame_buffer_t *fb =
+      &ctx->grain_image_frame_buffers[ctx->num_grain_image_frame_buffers];
+  AllocCbParam param;
+  param.pool = pool;
+  param.fb = fb;
+  if (!aom_img_alloc_with_cb(grain_img, img->fmt, w_even, h_even, 16,
+                             AllocWithGetFrameBufferCb, &param)) {
+    return NULL;
+  }
+
+  grain_img->user_priv = img->user_priv;
+  grain_img->fb_priv = fb->priv;
+  if (av1_add_film_grain(grain_params, img, grain_img)) {
+    pool->release_fb_cb(pool->cb_priv, fb);
+    return NULL;
+  }
+
+  ctx->num_grain_image_frame_buffers++;
+  return grain_img;
+}
+
+// Copies and clears the metadata from AV1Decoder.
+static void move_decoder_metadata_to_img(AV1Decoder *pbi, aom_image_t *img) {
+  if (pbi->metadata && img) {
+    assert(!img->metadata);
+    img->metadata = pbi->metadata;
+    pbi->metadata = NULL;
+  }
+}
+
+static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
+                                      aom_codec_iter_t *iter) {
+  aom_image_t *img = NULL;
+
+  if (!iter) {
+    return NULL;
+  }
+
+  // To avoid having to allocate any extra storage, treat 'iter' as
+  // simply a pointer to an integer index
+  uintptr_t *index = (uintptr_t *)iter;
+
+  if (ctx->frame_worker != NULL) {
+    const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+    AVxWorker *const worker = ctx->frame_worker;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    AV1Decoder *const pbi = frame_worker_data->pbi;
+    AV1_COMMON *const cm = &pbi->common;
+    CommonTileParams *const tiles = &cm->tiles;
+    // Wait for the frame from worker thread.
+    if (winterface->sync(worker)) {
+      // Check if worker has received any frames.
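+      // (received_frame was set by decode_one before the worker ran;
+      // clearing it here acknowledges that the output has been picked up.)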
+      if (frame_worker_data->received_frame == 1) {
+        frame_worker_data->received_frame = 0;
+        check_resync(ctx, frame_worker_data->pbi);
+      }
+      YV12_BUFFER_CONFIG *sd;
+      aom_film_grain_t *grain_params;
+      if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd,
+                            &grain_params) == 0) {
+        RefCntBuffer *const output_frame_buf = pbi->output_frames[*index];
+        ctx->last_show_frame = output_frame_buf;
+        if (ctx->need_resync) return NULL;
+        aom_img_remove_metadata(&ctx->img);
+        yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv);
+        move_decoder_metadata_to_img(pbi, &ctx->img);
+
+        if (!pbi->ext_tile_debug && tiles->large_scale) {
+          *index += 1;  // Advance the iterator to point to the next image
+          aom_img_remove_metadata(&ctx->img);
+          yuvconfig2image(&ctx->img, &pbi->tile_list_outbuf, NULL);
+          move_decoder_metadata_to_img(pbi, &ctx->img);
+          img = &ctx->img;
+          return img;
+        }
+
+        const int num_planes = av1_num_planes(cm);
+        if (pbi->ext_tile_debug && tiles->single_tile_decoding &&
+            pbi->dec_tile_row >= 0) {
+          int tile_width, tile_height;
+          av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+          const int tile_row = AOMMIN(pbi->dec_tile_row, tiles->rows - 1);
+          const int mi_row = tile_row * tile_height;
+          const int ssy = ctx->img.y_chroma_shift;
+          int plane;
+          ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0];
+          if (num_planes > 1) {
+            for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+              ctx->img.planes[plane] +=
+                  mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane];
+            }
+          }
+          ctx->img.d_h =
+              AOMMIN(tile_height, cm->mi_params.mi_rows - mi_row) * MI_SIZE;
+        }
+
+        if (pbi->ext_tile_debug && tiles->single_tile_decoding &&
+            pbi->dec_tile_col >= 0) {
+          int tile_width, tile_height;
+          av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+          const int tile_col = AOMMIN(pbi->dec_tile_col, tiles->cols - 1);
+          const int mi_col = tile_col * tile_width;
+          const int ssx = ctx->img.x_chroma_shift;
+          const int is_hbd = (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
+          int plane;
+          ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd);
+          if (num_planes > 1) {
+            for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+              ctx->img.planes[plane] +=
+                  mi_col * (MI_SIZE >> ssx) * (1 + is_hbd);
+            }
+          }
+          ctx->img.d_w =
+              AOMMIN(tile_width, cm->mi_params.mi_cols - mi_col) * MI_SIZE;
+        }
+
+        ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv;
+        img = &ctx->img;
+        img->temporal_id = cm->temporal_layer_id;
+        img->spatial_id = cm->spatial_layer_id;
+        if (pbi->skip_film_grain) grain_params->apply_grain = 0;
+        aom_image_t *res =
+            add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params);
+        if (!res) {
+          aom_internal_error(&pbi->common.error, AOM_CODEC_CORRUPT_FRAME,
+                             "Grain synthesis failed\n");
+        }
+        *index += 1;  // Advance the iterator to point to the next image
+        return res;
+      }
+    } else {
+      // Decoding failed. Release the worker thread.
+      frame_worker_data->received_frame = 0;
+      ctx->need_resync = 1;
+      if (ctx->flushed != 1) return NULL;
+    }
+  }
+  return NULL;
+}
+
+static aom_codec_err_t decoder_set_fb_fn(
+    aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+  if (cb_get == NULL || cb_release == NULL) {
+    return AOM_CODEC_INVALID_PARAM;
+  } else if (ctx->frame_worker == NULL) {
+    // If the decoder has already been initialized, do not accept changes to
+    // the frame buffer functions.
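+    // (The worker has not been created yet at this point, so installing
+    // the external callbacks is safe.)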
+ ctx->get_ext_fb_cb = cb_get; + ctx->release_ext_fb_cb = cb_release; + ctx->ext_priv = cb_priv; + return AOM_CODEC_OK; + } + + return AOM_CODEC_ERROR; +} + +static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx, + va_list args) { + av1_ref_frame_t *const data = va_arg(args, av1_ref_frame_t *); + + if (data) { + av1_ref_frame_t *const frame = data; + YV12_BUFFER_CONFIG sd; + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + image2yuvconfig(&frame->img, &sd); + return av1_set_reference_dec(&frame_worker_data->pbi->common, frame->idx, + frame->use_external_ref, &sd); + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx, + va_list args) { + const av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *); + if (frame) { + YV12_BUFFER_CONFIG sd; + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + image2yuvconfig(&frame->img, &sd); + return av1_copy_reference_dec(frame_worker_data->pbi, frame->idx, &sd); + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx, + va_list args) { + av1_ref_frame_t *data = va_arg(args, av1_ref_frame_t *); + if (data) { + YV12_BUFFER_CONFIG *fb; + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx); + if (fb == NULL) return AOM_CODEC_ERROR; + yuvconfig2image(&data->img, fb, NULL); + return AOM_CODEC_OK; + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_image_t *new_img = va_arg(args, aom_image_t *); + if (new_img) { + YV12_BUFFER_CONFIG new_frame; + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + + if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) { + yuvconfig2image(new_img, &new_frame, NULL); + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_image_t *img = va_arg(args, aom_image_t *); + if (img) { + YV12_BUFFER_CONFIG new_frame; + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + + if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) { + YV12_BUFFER_CONFIG sd; + image2yuvconfig(img, &sd); + return av1_copy_new_frame_dec(&frame_worker_data->pbi->common, &new_frame, + &sd); + } else { + return AOM_CODEC_ERROR; + } + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_get_last_ref_updates(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const update_info = va_arg(args, int *); + + if (update_info) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + *update_info = + frame_worker_data->pbi->common.current_frame.refresh_frame_flags; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_last_quantizer(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const 
arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + *arg = ((FrameWorkerData *)ctx->frame_worker->data1) + ->pbi->common.quant_params.base_qindex; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_frame_corrupted(aom_codec_alg_priv_t *ctx, + va_list args) { + int *corrupted = va_arg(args, int *); + + if (corrupted) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + AV1Decoder *const pbi = frame_worker_data->pbi; + if (pbi->seen_frame_header && pbi->num_output_frames == 0) + return AOM_CODEC_ERROR; + if (ctx->last_show_frame != NULL) + *corrupted = ctx->last_show_frame->buf.corrupted; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_frame_size(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const frame_size = va_arg(args, int *); + + if (frame_size) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const cm = &frame_worker_data->pbi->common; + frame_size[0] = cm->width; + frame_size[1] = cm->height; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_frame_header_info(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_tile_data *const frame_header_info = va_arg(args, aom_tile_data *); + + if (frame_header_info) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1Decoder *pbi = frame_worker_data->pbi; + frame_header_info->coded_tile_data_size = pbi->obu_size_hdr.size; + frame_header_info->coded_tile_data = pbi->obu_size_hdr.data; + frame_header_info->extra_size = pbi->frame_header_size; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_tile_data(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_tile_data *const tile_data = va_arg(args, aom_tile_data *); + + if (tile_data) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1Decoder *pbi = frame_worker_data->pbi; + tile_data->coded_tile_data_size = + pbi->tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].size; + tile_data->coded_tile_data = + pbi->tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].data; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_set_ext_ref_ptr(aom_codec_alg_priv_t *ctx, + va_list args) { + av1_ext_ref_frame_t *const data = va_arg(args, av1_ext_ref_frame_t *); + + if (data) { + av1_ext_ref_frame_t *const ext_frames = data; + ctx->ext_refs.num = ext_frames->num; + for (int i = 0; i < ctx->ext_refs.num; i++) { + image2yuvconfig(ext_frames->img++, &ctx->ext_refs.refs[i]); + } + return AOM_CODEC_OK; + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_get_render_size(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const render_size = va_arg(args, int *); + + if (render_size) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const 
cm = &frame_worker_data->pbi->common; + render_size[0] = cm->render_width; + render_size[1] = cm->render_height; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx, + va_list args) { + unsigned int *const bit_depth = va_arg(args, unsigned int *); + AVxWorker *const worker = ctx->frame_worker; + + if (bit_depth) { + if (worker) { + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const cm = &frame_worker_data->pbi->common; + *bit_depth = cm->seq_params.bit_depth; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_img_fmt_t get_img_format(int subsampling_x, int subsampling_y, + int use_highbitdepth) { + aom_img_fmt_t fmt = 0; + + if (subsampling_x == 0 && subsampling_y == 0) + fmt = AOM_IMG_FMT_I444; + else if (subsampling_x == 1 && subsampling_y == 0) + fmt = AOM_IMG_FMT_I422; + else if (subsampling_x == 1 && subsampling_y == 1) + fmt = AOM_IMG_FMT_I420; + + if (use_highbitdepth) fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + return fmt; +} + +static aom_codec_err_t ctrl_get_img_format(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_img_fmt_t *const img_fmt = va_arg(args, aom_img_fmt_t *); + AVxWorker *const worker = ctx->frame_worker; + + if (img_fmt) { + if (worker) { + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const cm = &frame_worker_data->pbi->common; + + *img_fmt = get_img_format(cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y, + cm->seq_params.use_highbitdepth); + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_tile_size(aom_codec_alg_priv_t *ctx, + va_list args) { + unsigned int *const tile_size = va_arg(args, unsigned int *); + AVxWorker *const worker = ctx->frame_worker; + + if (tile_size) { + if (worker) { + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const cm = &frame_worker_data->pbi->common; + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + *tile_size = ((tile_width * MI_SIZE) << 16) + tile_height * MI_SIZE; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_tile_count(aom_codec_alg_priv_t *ctx, + va_list args) { + unsigned int *const tile_count = va_arg(args, unsigned int *); + + if (tile_count) { + AVxWorker *const worker = ctx->frame_worker; + if (worker) { + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + *tile_count = frame_worker_data->pbi->tile_count_minus_1 + 1; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_set_invert_tile_order(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->invert_tile_order = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_byte_alignment(aom_codec_alg_priv_t *ctx, + va_list args) { + const int legacy_byte_alignment = 0; + const int min_byte_alignment = 32; + const int max_byte_alignment = 1024; + const int byte_alignment = va_arg(args, int); + + if (byte_alignment != legacy_byte_alignment && + (byte_alignment < min_byte_alignment || + byte_alignment > max_byte_alignment || + (byte_alignment & (byte_alignment - 1)) != 
0)) + return AOM_CODEC_INVALID_PARAM; + + ctx->byte_alignment = byte_alignment; + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi->common.features.byte_alignment = byte_alignment; + } + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_skip_loop_filter(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->skip_loop_filter = va_arg(args, int); + + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi->skip_loop_filter = ctx->skip_loop_filter; + } + + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_skip_film_grain(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->skip_film_grain = va_arg(args, int); + + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi->skip_film_grain = ctx->skip_film_grain; + } + + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_accounting(aom_codec_alg_priv_t *ctx, + va_list args) { +#if !CONFIG_ACCOUNTING + (void)ctx; + (void)args; + return AOM_CODEC_INCAPABLE; +#else + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + AV1Decoder *pbi = frame_worker_data->pbi; + Accounting **acct = va_arg(args, Accounting **); + *acct = &pbi->accounting; + return AOM_CODEC_OK; + } + return AOM_CODEC_ERROR; +#endif +} +static aom_codec_err_t ctrl_set_decode_tile_row(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->decode_tile_row = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_decode_tile_col(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->decode_tile_col = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_tile_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->tile_mode = va_arg(args, unsigned int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_is_annexb(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->is_annexb = va_arg(args, unsigned int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_operating_point(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->operating_point = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_output_all_layers(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->output_all_layers = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_inspection_callback(aom_codec_alg_priv_t *ctx, + va_list args) { +#if !CONFIG_INSPECTION + (void)ctx; + (void)args; + return AOM_CODEC_INCAPABLE; +#else + aom_inspect_init *init = va_arg(args, aom_inspect_init *); + ctx->inspect_cb = init->inspect_cb; + ctx->inspect_ctx = init->inspect_ctx; + return AOM_CODEC_OK; +#endif +} + +static aom_codec_err_t ctrl_ext_tile_debug(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->ext_tile_debug = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->row_mt = va_arg(args, unsigned int); + return AOM_CODEC_OK; +} + +static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { + { AV1_COPY_REFERENCE, ctrl_copy_reference }, + + // Setters + { AV1_SET_REFERENCE, ctrl_set_reference }, + { AV1_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order }, + { 
AV1_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment }, + { AV1_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter }, + { AV1_SET_DECODE_TILE_ROW, ctrl_set_decode_tile_row }, + { AV1_SET_DECODE_TILE_COL, ctrl_set_decode_tile_col }, + { AV1_SET_TILE_MODE, ctrl_set_tile_mode }, + { AV1D_SET_IS_ANNEXB, ctrl_set_is_annexb }, + { AV1D_SET_OPERATING_POINT, ctrl_set_operating_point }, + { AV1D_SET_OUTPUT_ALL_LAYERS, ctrl_set_output_all_layers }, + { AV1_SET_INSPECTION_CALLBACK, ctrl_set_inspection_callback }, + { AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug }, + { AV1D_SET_ROW_MT, ctrl_set_row_mt }, + { AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr }, + { AV1D_SET_SKIP_FILM_GRAIN, ctrl_set_skip_film_grain }, + + // Getters + { AOMD_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted }, + { AOMD_GET_LAST_QUANTIZER, ctrl_get_last_quantizer }, + { AOMD_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates }, + { AV1D_GET_BIT_DEPTH, ctrl_get_bit_depth }, + { AV1D_GET_IMG_FORMAT, ctrl_get_img_format }, + { AV1D_GET_TILE_SIZE, ctrl_get_tile_size }, + { AV1D_GET_TILE_COUNT, ctrl_get_tile_count }, + { AV1D_GET_DISPLAY_SIZE, ctrl_get_render_size }, + { AV1D_GET_FRAME_SIZE, ctrl_get_frame_size }, + { AV1_GET_ACCOUNTING, ctrl_get_accounting }, + { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image }, + { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image }, + { AV1_GET_REFERENCE, ctrl_get_reference }, + { AV1D_GET_FRAME_HEADER_INFO, ctrl_get_frame_header_info }, + { AV1D_GET_TILE_DATA, ctrl_get_tile_data }, + + { -1, NULL }, +}; + +#ifndef VERSION_STRING +#define VERSION_STRING +#endif +CODEC_INTERFACE(aom_codec_av1_dx) = { + "AOMedia Project AV1 Decoder" VERSION_STRING, + AOM_CODEC_INTERNAL_ABI_VERSION, + AOM_CODEC_CAP_DECODER | + AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // aom_codec_caps_t + decoder_init, // aom_codec_init_fn_t + decoder_destroy, // aom_codec_destroy_fn_t + decoder_ctrl_maps, // aom_codec_ctrl_fn_map_t + { + // NOLINT + decoder_peek_si, // aom_codec_peek_si_fn_t + decoder_get_si, // aom_codec_get_si_fn_t + decoder_decode, // aom_codec_decode_fn_t + decoder_get_frame, // aom_codec_get_frame_fn_t + decoder_set_fb_fn, // aom_codec_set_fb_fn_t + }, + { + // NOLINT + 0, + NULL, // aom_codec_enc_cfg_t + NULL, // aom_codec_encode_fn_t + NULL, // aom_codec_get_cx_data_fn_t + NULL, // aom_codec_enc_config_set_fn_t + NULL, // aom_codec_get_global_headers_fn_t + NULL // aom_codec_get_preview_frame_fn_t + } +}; diff --git a/libs/libaom/src/av1/av1_iface_common.h b/libs/libaom/src/av1/av1_iface_common.h new file mode 100644 index 000000000..9b5ffcba4 --- /dev/null +++ b/libs/libaom/src/av1/av1_iface_common.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#ifndef AOM_AV1_AV1_IFACE_COMMON_H_
+#define AOM_AV1_AV1_IFACE_COMMON_H_
+
+#include <assert.h>
+
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12,
+                            void *user_priv) {
+  /* aom_img_wrap() doesn't allow specifying independent strides for
+   * the Y, U, and V planes, nor other alignment adjustments that
+   * might be representable by a YV12_BUFFER_CONFIG, so we just
+   * initialize all the fields.
+   */
+  int bps;
+  if (!yv12->subsampling_y) {
+    if (!yv12->subsampling_x) {
+      img->fmt = AOM_IMG_FMT_I444;
+      bps = 24;
+    } else {
+      img->fmt = AOM_IMG_FMT_I422;
+      bps = 16;
+    }
+  } else {
+    img->fmt = AOM_IMG_FMT_I420;
+    bps = 12;
+  }
+  img->cp = yv12->color_primaries;
+  img->tc = yv12->transfer_characteristics;
+  img->mc = yv12->matrix_coefficients;
+  img->monochrome = yv12->monochrome;
+  img->csp = yv12->chroma_sample_position;
+  img->range = yv12->color_range;
+  img->bit_depth = 8;
+  img->w = yv12->y_width;
+  img->h = yv12->y_height;
+  img->d_w = yv12->y_crop_width;
+  img->d_h = yv12->y_crop_height;
+  img->r_w = yv12->render_width;
+  img->r_h = yv12->render_height;
+  img->x_chroma_shift = yv12->subsampling_x;
+  img->y_chroma_shift = yv12->subsampling_y;
+  img->planes[AOM_PLANE_Y] = yv12->y_buffer;
+  img->planes[AOM_PLANE_U] = yv12->u_buffer;
+  img->planes[AOM_PLANE_V] = yv12->v_buffer;
+  img->stride[AOM_PLANE_Y] = yv12->y_stride;
+  img->stride[AOM_PLANE_U] = yv12->uv_stride;
+  img->stride[AOM_PLANE_V] = yv12->uv_stride;
+  if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
+    bps *= 2;
+    // aom_image_t uses byte strides and a pointer to the first byte
+    // of the image.
+    img->fmt = (aom_img_fmt_t)(img->fmt | AOM_IMG_FMT_HIGHBITDEPTH);
+    img->bit_depth = yv12->bit_depth;
+    img->planes[AOM_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer);
+    img->planes[AOM_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer);
+    img->planes[AOM_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer);
+    img->stride[AOM_PLANE_Y] = 2 * yv12->y_stride;
+    img->stride[AOM_PLANE_U] = 2 * yv12->uv_stride;
+    img->stride[AOM_PLANE_V] = 2 * yv12->uv_stride;
+  }
+  img->bps = bps;
+  img->user_priv = user_priv;
+  img->img_data = yv12->buffer_alloc;
+  img->img_data_owner = 0;
+  img->self_allocd = 0;
+  img->sz = yv12->frame_size;
+  assert(!yv12->metadata);
+  img->metadata = NULL;
+}
+
+static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
+                                       YV12_BUFFER_CONFIG *yv12) {
+  yv12->y_buffer = img->planes[AOM_PLANE_Y];
+  yv12->u_buffer = img->planes[AOM_PLANE_U];
+  yv12->v_buffer = img->planes[AOM_PLANE_V];
+
+  yv12->y_crop_width = img->d_w;
+  yv12->y_crop_height = img->d_h;
+  yv12->render_width = img->r_w;
+  yv12->render_height = img->r_h;
+  yv12->y_width = img->w;
+  yv12->y_height = img->h;
+
+  yv12->uv_width =
+      img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2 : yv12->y_width;
+  yv12->uv_height =
+      img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2 : yv12->y_height;
+  yv12->uv_crop_width = yv12->uv_width;
+  yv12->uv_crop_height = yv12->uv_height;
+
+  yv12->y_stride = img->stride[AOM_PLANE_Y];
+  yv12->uv_stride = img->stride[AOM_PLANE_U];
+  yv12->color_primaries = img->cp;
+  yv12->transfer_characteristics = img->tc;
+  yv12->matrix_coefficients = img->mc;
+  yv12->monochrome = img->monochrome;
+  yv12->chroma_sample_position = img->csp;
+  yv12->color_range = img->range;
+
+  if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+    // In aom_image_t
+    //     planes point to uint8 address of start of data
+    //     stride counts uint8s to reach next row
+    // In YV12_BUFFER_CONFIG
+    //     y_buffer, u_buffer, v_buffer point to uint16 address of data
+    //     stride and border counts in uint16s
+    // This means that all the address calculations in the main body of code
+    // should work correctly.
+    // However, before we do any pixel operations we need to cast the address
+    // to a uint16 pointer and double its value.
+    yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer);
+    yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer);
+    yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
+    yv12->y_stride >>= 1;
+    yv12->uv_stride >>= 1;
+    yv12->flags = YV12_FLAG_HIGHBITDEPTH;
+  } else {
+    yv12->flags = 0;
+  }
+
+  // Note(yunqing): if img is allocated the same as the frame buffer, y_stride
+  // is 32-byte aligned. Also, handle the cases while allocating img without a
+  // border or stride_align is less than 32.
+  int border = (yv12->y_stride - (int)((img->w + 31) & ~31)) / 2;
+  yv12->border = (border < 0) ? 0 : border;
+  yv12->subsampling_x = img->x_chroma_shift;
+  yv12->subsampling_y = img->y_chroma_shift;
+  yv12->metadata = img->metadata;
+  return AOM_CODEC_OK;
+}
+
+#endif  // AOM_AV1_AV1_IFACE_COMMON_H_
diff --git a/libs/libaom/src/av1/common/alloccommon.c b/libs/libaom/src/av1/common/alloccommon.c
new file mode 100644
index 000000000..badee3df9
--- /dev/null
+++ b/libs/libaom/src/av1/common/alloccommon.c
@@ -0,0 +1,309 @@
+/*
+ *
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+
+int av1_get_MBs(int width, int height) {
+  const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
+  const int aligned_height = ALIGN_POWER_OF_TWO(height, 3);
+  const int mi_cols = aligned_width >> MI_SIZE_LOG2;
+  const int mi_rows = aligned_height >> MI_SIZE_LOG2;
+
+  const int mb_cols = (mi_cols + 2) >> 2;
+  const int mb_rows = (mi_rows + 2) >> 2;
+  return mb_rows * mb_cols;
+}
+
+void av1_free_ref_frame_buffers(BufferPool *pool) {
+  int i;
+
+  for (i = 0; i < FRAME_BUFFERS; ++i) {
+    if (pool->frame_bufs[i].ref_count > 0 &&
+        pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
+      pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
+      pool->frame_bufs[i].raw_frame_buffer.data = NULL;
+      pool->frame_bufs[i].raw_frame_buffer.size = 0;
+      pool->frame_bufs[i].raw_frame_buffer.priv = NULL;
+      pool->frame_bufs[i].ref_count = 0;
+    }
+    aom_free(pool->frame_bufs[i].mvs);
+    pool->frame_bufs[i].mvs = NULL;
+    aom_free(pool->frame_bufs[i].seg_map);
+    pool->frame_bufs[i].seg_map = NULL;
+    aom_free_frame_buffer(&pool->frame_bufs[i].buf);
+  }
+}
+
+// Assumes cm->rst_info[p].restoration_unit_size is already initialized
+void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
+  const int num_planes = av1_num_planes(cm);
+  for (int p = 0; p < num_planes; ++p)
+    av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0);
+
+  if (cm->rst_tmpbuf == NULL) {
+    CHECK_MEM_ERROR(cm, cm->rst_tmpbuf,
+                    (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
+  }
+
+  if (cm->rlbs == NULL) {
+    CHECK_MEM_ERROR(cm, cm->rlbs, aom_malloc(sizeof(RestorationLineBuffers)));
+  }
+
+  // For striped loop restoration, we divide each row of tiles into "stripes",
+  // of height 64 luma pixels but with an offset by RESTORATION_UNIT_OFFSET
+  // luma pixels to match the output from CDEF. We will need to store 2 *
+  // RESTORATION_CTX_VERT lines of data for each stripe, and also need to be
+  // able to quickly answer the question "Where is the n'th stripe for tile
+  // row m?" To make that efficient, we generate the rst_last_stripe array.
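+  // For example, with RESTORATION_UNIT_OFFSET == 8 and MI_SIZE == 4, a tile
+  // spanning 64 mi rows covers 256 luma rows, so ext_h = 264 and the tile
+  // contributes (264 + 63) / 64 = 5 stripes.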
+ int num_stripes = 0; + for (int i = 0; i < cm->tiles.rows; ++i) { + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, i); + const int mi_h = tile_info.mi_row_end - tile_info.mi_row_start; + const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2); + const int tile_stripes = (ext_h + 63) / 64; + num_stripes += tile_stripes; + } + + // Now we need to allocate enough space to store the line buffers for the + // stripes + const int frame_w = cm->superres_upscaled_width; + const int use_highbd = cm->seq_params.use_highbitdepth; + + for (int p = 0; p < num_planes; ++p) { + const int is_uv = p > 0; + const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ; + const int stride = ALIGN_POWER_OF_TWO(plane_w, 5); + const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT + << use_highbd; + RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries; + + if (buf_size != boundaries->stripe_boundary_size || + boundaries->stripe_boundary_above == NULL || + boundaries->stripe_boundary_below == NULL) { + aom_free(boundaries->stripe_boundary_above); + aom_free(boundaries->stripe_boundary_below); + + CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_above, + (uint8_t *)aom_memalign(32, buf_size)); + CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_below, + (uint8_t *)aom_memalign(32, buf_size)); + + boundaries->stripe_boundary_size = buf_size; + } + boundaries->stripe_boundary_stride = stride; + } +} + +void av1_free_restoration_buffers(AV1_COMMON *cm) { + int p; + for (p = 0; p < MAX_MB_PLANE; ++p) + av1_free_restoration_struct(&cm->rst_info[p]); + aom_free(cm->rst_tmpbuf); + cm->rst_tmpbuf = NULL; + aom_free(cm->rlbs); + cm->rlbs = NULL; + for (p = 0; p < MAX_MB_PLANE; ++p) { + RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries; + aom_free(boundaries->stripe_boundary_above); + aom_free(boundaries->stripe_boundary_below); + boundaries->stripe_boundary_above = NULL; + boundaries->stripe_boundary_below = NULL; + } + + aom_free_frame_buffer(&cm->rst_frame); +} + +void av1_free_above_context_buffers(CommonContexts *above_contexts) { + int i; + const int num_planes = above_contexts->num_planes; + + for (int tile_row = 0; tile_row < above_contexts->num_tile_rows; tile_row++) { + for (i = 0; i < num_planes; i++) { + aom_free(above_contexts->entropy[i][tile_row]); + above_contexts->entropy[i][tile_row] = NULL; + } + aom_free(above_contexts->partition[tile_row]); + above_contexts->partition[tile_row] = NULL; + + aom_free(above_contexts->txfm[tile_row]); + above_contexts->txfm[tile_row] = NULL; + } + for (i = 0; i < num_planes; i++) { + aom_free(above_contexts->entropy[i]); + above_contexts->entropy[i] = NULL; + } + aom_free(above_contexts->partition); + above_contexts->partition = NULL; + + aom_free(above_contexts->txfm); + above_contexts->txfm = NULL; + + above_contexts->num_tile_rows = 0; + above_contexts->num_mi_cols = 0; + above_contexts->num_planes = 0; +} + +void av1_free_context_buffers(AV1_COMMON *cm) { + cm->mi_params.free_mi(&cm->mi_params); + + av1_free_above_context_buffers(&cm->above_contexts); + +#if CONFIG_LPF_MASK + av1_free_loop_filter_mask(cm); +#endif +} + +int av1_alloc_above_context_buffers(CommonContexts *above_contexts, + int num_tile_rows, int num_mi_cols, + int num_planes) { + const int aligned_mi_cols = + ALIGN_POWER_OF_TWO(num_mi_cols, MAX_MIB_SIZE_LOG2); + + // Allocate above context buffers + above_contexts->num_tile_rows = num_tile_rows; + 
above_contexts->num_mi_cols = aligned_mi_cols; + above_contexts->num_planes = num_planes; + for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) { + above_contexts->entropy[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc( + num_tile_rows, sizeof(above_contexts->entropy[0])); + if (!above_contexts->entropy[plane_idx]) return 1; + } + + above_contexts->partition = (PARTITION_CONTEXT **)aom_calloc( + num_tile_rows, sizeof(above_contexts->partition)); + if (!above_contexts->partition) return 1; + + above_contexts->txfm = + (TXFM_CONTEXT **)aom_calloc(num_tile_rows, sizeof(above_contexts->txfm)); + if (!above_contexts->txfm) return 1; + + for (int tile_row = 0; tile_row < num_tile_rows; tile_row++) { + for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) { + above_contexts->entropy[plane_idx][tile_row] = + (ENTROPY_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->entropy[0][tile_row])); + if (!above_contexts->entropy[plane_idx][tile_row]) return 1; + } + + above_contexts->partition[tile_row] = (PARTITION_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->partition[tile_row])); + if (!above_contexts->partition[tile_row]) return 1; + + above_contexts->txfm[tile_row] = (TXFM_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->txfm[tile_row])); + if (!above_contexts->txfm[tile_row]) return 1; + } + + return 0; +} + +// Allocate the dynamically allocated arrays in 'mi_params' assuming +// 'mi_params->set_mb_mi()' was already called earlier to initialize the rest of +// the struct members. +static int alloc_mi(CommonModeInfoParams *mi_params) { + const int aligned_mi_rows = calc_mi_size(mi_params->mi_rows); + const int mi_grid_size = mi_params->mi_stride * aligned_mi_rows; + const int alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + const int alloc_mi_size = + mi_params->mi_alloc_stride * (aligned_mi_rows / alloc_size_1d); + + if (mi_params->mi_alloc_size < alloc_mi_size || + mi_params->mi_grid_size < mi_grid_size) { + mi_params->free_mi(mi_params); + + mi_params->mi_alloc = + aom_calloc(alloc_mi_size, sizeof(*mi_params->mi_alloc)); + if (!mi_params->mi_alloc) return 1; + mi_params->mi_alloc_size = alloc_mi_size; + + mi_params->mi_grid_base = (MB_MODE_INFO **)aom_calloc( + mi_grid_size, sizeof(*mi_params->mi_grid_base)); + if (!mi_params->mi_grid_base) return 1; + mi_params->mi_grid_size = mi_grid_size; + + mi_params->tx_type_map = + aom_calloc(mi_grid_size, sizeof(*mi_params->tx_type_map)); + if (!mi_params->tx_type_map) return 1; + } + + return 0; +} + +int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) { + CommonModeInfoParams *const mi_params = &cm->mi_params; + mi_params->set_mb_mi(mi_params, width, height); + if (alloc_mi(mi_params)) goto fail; + return 0; + +fail: + // clear the mi_* values to force a realloc on resync + mi_params->set_mb_mi(mi_params, 0, 0); + av1_free_context_buffers(cm); + return 1; +} + +void av1_remove_common(AV1_COMMON *cm) { + av1_free_context_buffers(cm); + + aom_free(cm->fc); + cm->fc = NULL; + aom_free(cm->default_frame_context); + cm->default_frame_context = NULL; +} + +void av1_init_mi_buffers(CommonModeInfoParams *mi_params) { + mi_params->setup_mi(mi_params); +} + +#if CONFIG_LPF_MASK +int av1_alloc_loop_filter_mask(AV1_COMMON *cm) { + aom_free(cm->lf.lfm); + cm->lf.lfm = NULL; + + // Each lfm holds bit masks for all the 4x4 blocks in a max + // 64x64 (128x128 for ext_partitions) region. 
The stride + // and rows are rounded up / truncated to a multiple of 16 + // (32 for ext_partition). + cm->lf.lfm_stride = + (cm->mi_params.mi_cols + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2; + cm->lf.lfm_num = + ((cm->mi_params.mi_rows + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2) * + cm->lf.lfm_stride; + cm->lf.lfm = + (LoopFilterMask *)aom_calloc(cm->lf.lfm_num, sizeof(*cm->lf.lfm)); + if (!cm->lf.lfm) return 1; + + unsigned int i; + for (i = 0; i < cm->lf.lfm_num; ++i) av1_zero(cm->lf.lfm[i]); + + return 0; +} + +void av1_free_loop_filter_mask(AV1_COMMON *cm) { + if (cm->lf.lfm == NULL) return; + + aom_free(cm->lf.lfm); + cm->lf.lfm = NULL; + cm->lf.lfm_num = 0; + cm->lf.lfm_stride = 0; +} +#endif diff --git a/libs/libaom/src/av1/common/alloccommon.h b/libs/libaom/src/av1/common/alloccommon.h new file mode 100644 index 000000000..fe8e0c530 --- /dev/null +++ b/libs/libaom/src/av1/common/alloccommon.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ALLOCCOMMON_H_ +#define AOM_AV1_COMMON_ALLOCCOMMON_H_ + +#define INVALID_IDX -1 // Invalid buffer index. + +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Common; +struct BufferPool; +struct CommonContexts; +struct CommonModeInfoParams; + +void av1_remove_common(struct AV1Common *cm); + +int av1_alloc_above_context_buffers(struct CommonContexts *above_contexts, + int num_tile_rows, int num_mi_cols, + int num_planes); +void av1_free_above_context_buffers(struct CommonContexts *above_contexts); +int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height); +void av1_init_mi_buffers(struct CommonModeInfoParams *mi_params); +void av1_free_context_buffers(struct AV1Common *cm); + +void av1_free_ref_frame_buffers(struct BufferPool *pool); +void av1_alloc_restoration_buffers(struct AV1Common *cm); +void av1_free_restoration_buffers(struct AV1Common *cm); + +int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height); +void av1_free_state_buffers(struct AV1Common *cm); + +int av1_get_MBs(int width, int height); + +#if CONFIG_LPF_MASK +int av1_alloc_loop_filter_mask(struct AV1Common *cm); +void av1_free_loop_filter_mask(struct AV1Common *cm); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ALLOCCOMMON_H_ diff --git a/libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.c b/libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.c new file mode 100644 index 000000000..2f3567aea --- /dev/null +++ b/libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.c @@ -0,0 +1,4271 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/arm/av1_inv_txfm_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+// 1D itx types
+typedef enum ATTRIBUTE_PACKED {
+  IDCT_1D,
+  IADST_1D,
+  IFLIPADST_1D = IADST_1D,
+  IIDENTITY_1D,
+  ITX_TYPES_1D,
+} ITX_TYPE_1D;
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
+  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
+  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
+  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
+  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
+};
+
+// 1D functions
+static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
+  { av1_idct4, av1_iadst4, av1_iidentity4_c },
+  { av1_idct8, av1_iadst8, av1_iidentity8_c },
+  { av1_idct16, av1_iadst16, av1_iidentity16_c },
+  { av1_idct32, NULL, NULL },
+  { av1_idct64, NULL, NULL },
+};
+
+static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
+                                                  uint8_t *output, int stride,
+                                                  int flipud,
+                                                  const int height) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  int16x8_t temp_output;
+  for (int i = 0; i < height; ++i, j += step) {
+    temp_output = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output)));
+    temp_output = vaddq_s16(temp_output, in[j]);
+    vst1_u8(output, vqmovun_s16(temp_output));
+    output += stride;
+  }
+}
+
+static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred,
+                                                    int16x8_t res0,
+                                                    int16x8_t res1) {
+  int16x8_t temp_output[2];
+  uint8x16_t temp_output_8q;
+  temp_output[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred)));
+  temp_output[0] = vaddq_s16(temp_output[0], res0);
+  temp_output[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred)));
+  temp_output[1] = vaddq_s16(temp_output[1], res1);
+  temp_output_8q =
+      vcombine_u8(vqmovun_s16(temp_output[0]), vqmovun_s16(temp_output[1]));
+  return temp_output_8q;
+}
+
+static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in,
+                                                   uint8_t *output, int stride,
+                                                   int flipud, int height) {
+  uint8x16_t temp_output_8q;
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ?
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + temp_output_8q = vld1q_u8(output + i * stride); + temp_output_8q = + lowbd_get_recon_16x16_neon(temp_output_8q, in[j], in[j + height]); + vst1q_u8((output + i * stride), temp_output_8q); + } +} + +static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size, + int value) { + for (int i = 0; i < size; i++) { + a[i] = vdupq_n_s16((int16_t)value); + } +} + +static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1, + int16_t coef2, int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0_l, s0_h, s1_l, s1_h; + int16x4_t v0[2], v1[2]; + + s0_l = vmull_n_s16(vget_low_s16(in0), coef1); + s0_h = vmull_n_s16(vget_high_s16(in0), coef1); + s1_l = vmull_n_s16(vget_low_s16(in0), coef2); + s1_h = vmull_n_s16(vget_high_s16(in0), coef2); + + v0[0] = vrshrn_n_s32(s0_l, INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0_h, 
INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1_l, INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1_h, INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) { + int32x4_t t0[2], t1[2]; + int16x4_t v0[2], v1[2]; + + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0); + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + + v0[0] = vrshrn_n_s32(t0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(t0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(t1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(t1[1], INV_COS_BIT); + + x[0] = vcombine_s16(v0[0], v0[1]); + x[1] = vcombine_s16(v1[0], v1[1]); +} + +static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1, + const int16_t c2, const int16_t c3) { + int16x4_t val = vdup_n_s16((int16_t)0); + val = vset_lane_s16(c0, val, 0); + val = vset_lane_s16(c1, val, 1); + val = vset_lane_s16(c2, val, 2); + val = vset_lane_s16(c3, val, 3); + return val; +} + +static INLINE void iadst8_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[20], (int16_t)cospi[44]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[36], (int16_t)cospi[28], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[8]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + + // Stage 1 + x[0] = in[7]; + x[1] = in[0]; + x[2] = in[5]; + x[3] = in[2]; + x[4] = in[3]; + x[5] = in[4]; + x[6] = in[1]; + x[7] = in[6]; + + // Stage 2 + btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1); + btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3); + btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5); + btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7); + + // Stage 3 + x[0] = vqaddq_s16(s0, s4); + x[1] = vqaddq_s16(s1, s5); + x[2] = vqaddq_s16(s2, s6); + x[3] = vqaddq_s16(s3, s7); + x[4] = vqsubq_s16(s0, s4); + x[5] = vqsubq_s16(s1, s5); + x[6] = vqsubq_s16(s2, s6); + x[7] = vqsubq_s16(s3, s7); + + // Stage 4 + s0 = x[0]; + s1 = x[1]; + s2 = x[2]; + s3 = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], 
c2, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6); + + // Stage 5 + x[0] = vqaddq_s16(s0, s2); + x[1] = vqaddq_s16(s1, s3); + x[2] = vqsubq_s16(s0, s2); + x[3] = vqsubq_s16(s1, s3); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + + // stage 6 + btf_16_half_neon(x + 2, c2); + btf_16_half_neon(x + 6, c2); + + // Stage 7 + out[0] = x[0]; + out[1] = vqnegq_s16(x[4]); + out[2] = x[6]; + out[3] = vqnegq_s16(x[2]); + out[4] = x[3]; + out[5] = vqnegq_s16(x[7]); + out[6] = x[5]; + out[7] = vqnegq_s16(x[1]); +} + +static INLINE void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[8]; + int16x8_t s0, s1, s4, s5; + + // Stage 1 + x[1] = in[0]; + + // Stage 2 + + btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1); + + // Stage 3 + x[0] = s0; + x[1] = s1; + x[4] = s0; + x[5] = s1; + + // Stage 4 + s0 = x[0]; + s1 = x[1]; + btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5); + + // Stage 5 + x[0] = s0; + x[1] = s1; + x[2] = s0; + x[3] = s1; + x[4] = s4; + x[5] = s5; + x[6] = s4; + x[7] = s5; + + // stage 6 + btf_16_half_neon(x + 2, c2); + btf_16_half_neon(x + 6, c2); + + // Stage 7 + out[0] = x[0]; + out[1] = vqnegq_s16(x[4]); + out[2] = x[6]; + out[3] = vqnegq_s16(x[2]); + out[4] = x[3]; + out[5] = vqnegq_s16(x[7]); + out[6] = x[5]; + out[7] = vqnegq_s16(x[1]); +} + +static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, + int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[8], step2[8]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + // stage 2 + btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]); + btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]); + + // stage 3 + btf_16_lane_0_1_neon(in[0], in[4], c1, &step2[0], &step2[1]); + btf_16_lane_2_3_neon(in[2], in[6], c1, &step2[3], &step2[2]); + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + + // stage 4 + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + btf_16_lane_0_1_neon(step2[6], step2[5], c1, &step1[6], &step1[5]); + + // stage 5 + out[0] = vqaddq_s16(step1[0], step2[7]); + out[1] = vqaddq_s16(step1[1], step1[6]); + out[2] = vqaddq_s16(step1[2], step1[5]); + out[3] = vqaddq_s16(step1[3], step2[4]); + out[4] = vqsubq_s16(step1[3], step2[4]); + out[5] = vqsubq_s16(step1[2], step1[5]); + out[6] = vqsubq_s16(step1[1], step1[6]); + out[7] = vqsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]); + + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], 
INV_COS_BIT)); + + // stage 4 + // stage 5 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; +} + +void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) { + assert(!(size % 4)); + if (!bit) return; + const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit)); + for (int i = 0; i < size; i++) { + arr[i] = vrshlq_s16(arr[i], dup_bits_n_16x8); + } +} + +static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) { + int16x8_t temp[8]; + for (int i = 0; i < size; ++i) { + temp[i] = input[size - 1 - i]; + } + for (int i = 0; i < size; ++i) { + input[i] = temp[i]; + } +} + +static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input, + int16x8_t *const a, + int out_size) { + for (int i = 0; i < 8; ++i) { + a[i] = vcombine_s16(vmovn_s32(vld1q_s32(input)), + vmovn_s32(vld1q_s32(input + 4))); + input += out_size; + } +} + +static int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + +static INLINE void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output, + int txw_idx, int8_t size, int bit) { + const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit)); + int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]); + int16x4_t low_i16, high_i16; + int32x4_t low_i32, high_i32; + for (int i = 0; i < size; i++) { + int32x4_t temp_out_low = vmull_s16(vget_low_s16(input[i]), scale); + int32x4_t temp_out_high = vmull_s16(vget_high_s16(input[i]), scale); + low_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_low, 12), dup_bits_n_32x4); + high_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_high, 12), dup_bits_n_32x4); + low_i16 = vqmovn_s32(low_i32); + high_i16 = vqmovn_s32(high_i32); + output[i] = vcombine_s16(low_i16, high_i16); + } +} + +static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output, + int size) { + int32x4_t out_low, out_high; + int16x4_t low, high; + + for (int z = 0; z < size; ++z) { + out_low = vmull_n_s16(vget_low_s16(input[z]), (int16_t)NewInvSqrt2); + out_high = vmull_n_s16(vget_high_s16(input[z]), (int16_t)NewInvSqrt2); + + low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits); + high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits); + + output[z] = vcombine_s16(low, high); + } +} + +static INLINE void idct16_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 4 + + t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]); + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + // stage 6 + // stage 7 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; +} + +static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, + int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[16], step2[16]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + 
(int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + // stage 2 + + btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]); + btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]); + btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]); + btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]); + + step2[0] = in[0]; + step2[1] = in[8]; + step2[2] = in[4]; + step2[3] = in[12]; + step2[4] = in[2]; + step2[5] = in[10]; + step2[6] = in[6]; + step2[7] = in[14]; + + // stage 3 + + btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]); + btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + step1[8] = vqaddq_s16(step2[8], step2[9]); + step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + + // stage 4 + + btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c4, &step2[10], &step2[13]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + 
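+ // Stage 7 is the closing butterfly of the 16-point IDCT: out[k] takes the
+ // saturating sum step2[k] + step2[15 - k] and out[15 - k] the matching
+ // saturating difference, keeping this lowbd path within int16 range.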
out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], step2[9]); + out[10] = vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = vqsubq_s16(step2[2], step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +} + +static INLINE void idct16_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[16], step2[16]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c1 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[2] = in[4]; + step2[4] = in[2]; + step2[6] = in[6]; + + btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]); + btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]); + btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]); + + // stage 3 + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]); + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[8] = vqaddq_s16(step2[8], step2[9]); + step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + + // stage 4 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); + btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c1, &step2[10], &step2[13]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]); + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + + // stage 6 + btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], 
step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], step2[9]); + out[10] = vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = vqsubq_s16(step2[2], step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +} + +static INLINE void iadst16_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62], + (int16_t)cospi[10], (int16_t)cospi[54]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46], + (int16_t)cospi[26], (int16_t)cospi[38]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[34], (int16_t)cospi[30], + (int16_t)cospi[42], (int16_t)cospi[22]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[50], (int16_t)cospi[14], + (int16_t)cospi[58], (int16_t)cospi[6]); + const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[16]; + int16x8_t t[14]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t s8, s9, s10, s11, s12, s13, s14, s15; + + // Stage 1 + x[0] = in[15]; + x[1] = in[0]; + x[2] = in[13]; + x[3] = in[2]; + x[4] = in[11]; + x[5] = in[4]; + x[6] = in[9]; + x[7] = in[6]; + x[8] = in[7]; + x[9] = in[8]; + x[10] = in[5]; + x[11] = in[10]; + x[12] = in[3]; + x[13] = in[12]; + x[14] = in[1]; + x[15] = in[14]; + + // Stage 2 + btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1); + btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3); + btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5); + btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7); + btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11); + btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13); + btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15); + + // Stage 3 + x[0] = vqaddq_s16(s0, s8); + x[1] = vqaddq_s16(s1, s9); + x[2] = vqaddq_s16(s2, s10); + x[3] = vqaddq_s16(s3, s11); + x[4] = vqaddq_s16(s4, s12); + x[5] = vqaddq_s16(s5, s13); + x[6] = vqaddq_s16(s6, s14); + x[7] = vqaddq_s16(s7, s15); + x[8] = vqsubq_s16(s0, s8); + x[9] = vqsubq_s16(s1, s9); + x[10] = vqsubq_s16(s2, s10); + x[11] = vqsubq_s16(s3, s11); + x[12] = vqsubq_s16(s4, s12); + x[13] = vqsubq_s16(s5, s13); + x[14] = vqsubq_s16(s6, s14); + x[15] = vqsubq_s16(s7, s15); + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11); + btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12); + btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14); + + // Stage 5 + x[0] = vqaddq_s16(t[0], t[4]); + x[1] = 
vqaddq_s16(t[1], t[5]); + x[2] = vqaddq_s16(t[2], t[6]); + x[3] = vqaddq_s16(t[3], t[7]); + x[4] = vqsubq_s16(t[0], t[4]); + x[5] = vqsubq_s16(t[1], t[5]); + x[6] = vqsubq_s16(t[2], t[6]); + x[7] = vqsubq_s16(t[3], t[7]); + x[8] = vqaddq_s16(s8, s12); + x[9] = vqaddq_s16(s9, s13); + x[10] = vqaddq_s16(s10, s14); + x[11] = vqaddq_s16(s11, s15); + x[12] = vqsubq_s16(s8, s12); + x[13] = vqsubq_s16(s9, s13); + x[14] = vqsubq_s16(s10, s14); + x[15] = vqsubq_s16(s11, s15); + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], c5, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c5, &s7, &s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + btf_16_lane_2_3_neon(x[12], x[13], c5, &s12, &s13); + btf_16_lane_3_2_neon(x[15], x[14], c5, &s15, &s14); + + // Stage 7 + x[0] = vqaddq_s16(t[0], t[2]); + x[1] = vqaddq_s16(t[1], t[3]); + x[2] = vqsubq_s16(t[0], t[2]); + x[3] = vqsubq_s16(t[1], t[3]); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + x[8] = vqaddq_s16(t[8], t[10]); + x[9] = vqaddq_s16(t[9], t[11]); + x[10] = vqsubq_s16(t[8], t[10]); + x[11] = vqsubq_s16(t[9], t[11]); + x[12] = vqaddq_s16(s12, s14); + x[13] = vqaddq_s16(s13, s15); + x[14] = vqsubq_s16(s12, s14); + x[15] = vqsubq_s16(s13, s15); + + // Stage 8 + btf_16_half_neon(x + 2, c5); + btf_16_half_neon(x + 6, c5); + btf_16_half_neon(x + 10, c5); + btf_16_half_neon(x + 14, c5); + + // Stage 9 + out[0] = x[0]; + out[1] = vqnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vqnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vqnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vqnegq_s16(x[2]); + out[8] = x[3]; + out[9] = vqnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vqnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vqnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vqnegq_s16(x[1]); +} + +static INLINE void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[16]; + int16x8_t t[10]; + int16x8_t s0, s1, s4, s5; + int16x8_t s8, s9, s12, s13; + + // Stage 1 + x[1] = in[0]; + + // Stage 2 + btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1); + + // Stage 3 + x[0] = s0; + x[1] = s1; + x[8] = s0; + x[9] = s1; + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9); + + // Stage 5 + x[0] = t[0]; + x[1] = t[1]; + x[4] = t[0]; + x[5] = t[1]; + x[8] = s8; + x[9] = s9; + x[12] = s8; + x[13] = s9; + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5); + t[8] = x[8]; + t[9] = x[9]; + btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13); + + // Stage 7 + x[0] = t[0]; + x[1] = t[1]; + x[2] = t[0]; + x[3] = t[1]; + x[4] = s4; + x[5] = s5; + x[6] = s4; + x[7] = s5; + x[8] = t[8]; + x[9] = t[9]; + x[10] = t[8]; + x[11] = t[9]; + x[12] = s12; + x[13] = s13; + x[14] = s12; + x[15] = s13; + + // Stage 8 + btf_16_half_neon(x + 2, c1); + btf_16_half_neon(x + 6, c1); + btf_16_half_neon(x + 10, c1); + btf_16_half_neon(x + 14, c1); + + // Stage 9 + out[0] = x[0]; + out[1] = vqnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vqnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vqnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vqnegq_s16(x[2]); + 
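+ // Stage 9 applies the same output permutation as the full iadst16_neon:
+ // even-numbered outputs copy a lane of x, odd-numbered outputs negate one
+ // with saturation via vqnegq_s16.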
out[8] = x[3]; + out[9] = vqnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vqnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vqnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vqnegq_s16(x[1]); +} + +static INLINE void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[16]; + int16x8_t t[14]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t s8, s9, s10, s11, s12, s13, s14, s15; + + // Stage 1 + x[1] = in[0]; + x[3] = in[2]; + x[5] = in[4]; + x[7] = in[6]; + x[8] = in[7]; + x[10] = in[5]; + x[12] = in[3]; + x[14] = in[1]; + + // Stage 2 + btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1); + btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3); + btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5); + btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7); + + btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9); + btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11); + btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13); + btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15); + + // Stage 3 + x[0] = vqaddq_s16(s0, s8); + x[1] = vqaddq_s16(s1, s9); + x[2] = vqaddq_s16(s2, s10); + x[3] = vqaddq_s16(s3, s11); + x[4] = vqaddq_s16(s4, s12); + x[5] = vqaddq_s16(s5, s13); + x[6] = vqaddq_s16(s6, s14); + x[7] = vqaddq_s16(s7, s15); + x[8] = vqsubq_s16(s0, s8); + x[9] = vqsubq_s16(s1, s9); + x[10] = vqsubq_s16(s2, s10); + x[11] = vqsubq_s16(s3, s11); + x[12] = vqsubq_s16(s4, s12); + x[13] = vqsubq_s16(s5, s13); + x[14] = vqsubq_s16(s6, s14); + x[15] = vqsubq_s16(s7, s15); + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c0, &s10, &s11); + btf_16_lane_1_0_neon(x[13], x[12], c0, &s13, &s12); + btf_16_lane_3_2_neon(x[15], x[14], c0, &s15, &s14); + + // Stage 5 + x[0] = vqaddq_s16(t[0], t[4]); + x[1] = vqaddq_s16(t[1], t[5]); + x[2] = vqaddq_s16(t[2], t[6]); + x[3] = vqaddq_s16(t[3], t[7]); + x[4] = vqsubq_s16(t[0], t[4]); + x[5] = vqsubq_s16(t[1], t[5]); + x[6] = vqsubq_s16(t[2], t[6]); + x[7] = vqsubq_s16(t[3], t[7]); + x[8] = vqaddq_s16(s8, s12); + x[9] = vqaddq_s16(s9, s13); + x[10] = vqaddq_s16(s10, s14); + x[11] = vqaddq_s16(s11, s15); + x[12] = vqsubq_s16(s8, s12); + x[13] = vqsubq_s16(s9, s13); + x[14] = vqsubq_s16(s10, s14); + x[15] = vqsubq_s16(s11, s15); + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c1, &s7, &s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13); + btf_16_lane_3_2_neon(x[15], x[14], c1, &s15, &s14); + + // Stage 7 + x[0] = vqaddq_s16(t[0], t[2]); + x[1] = vqaddq_s16(t[1], t[3]); + x[2] = vqsubq_s16(t[0], t[2]); + x[3] = vqsubq_s16(t[1], t[3]); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + x[8] = vqaddq_s16(t[8], t[10]); + x[9] = vqaddq_s16(t[9], t[11]); + x[10] = vqsubq_s16(t[8], t[10]); + x[11] = vqsubq_s16(t[9], t[11]); + x[12] = vqaddq_s16(s12, s14); + x[13] = vqaddq_s16(s13, s15); + x[14] = vqsubq_s16(s12, s14); + 
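+ // Stages 3-9 match the full iadst16_neon above: after stage 3 every lane
+ // of x is populated, so only stage 2 can exploit the zero-filled inputs
+ // (via the single-input btf_16_neon).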
x[15] = vqsubq_s16(s13, s15); + + // Stage 8 + btf_16_half_neon(x + 2, c1); + btf_16_half_neon(x + 6, c1); + btf_16_half_neon(x + 10, c1); + btf_16_half_neon(x + 14, c1); + + // Stage 9 + out[0] = x[0]; + out[1] = vqnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vqnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vqnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vqnegq_s16(x[2]); + out[8] = x[3]; + out[9] = vqnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vqnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vqnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vqnegq_s16(x[1]); +} + +static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, + int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62], + (int16_t)cospi[34], (int16_t)cospi[30]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46], + (int16_t)cospi[50], (int16_t)cospi[14]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[10], (int16_t)cospi[54], + (int16_t)cospi[42], (int16_t)cospi[22]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[26], (int16_t)cospi[38], + (int16_t)cospi[58], (int16_t)cospi[6]); + const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c6 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c7 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c8 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c9 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 2 + + btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]); + btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]); + btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]); + btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]); + btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]); + btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]); + + step2[0] = in[0]; + step2[1] = in[16]; + step2[2] = in[8]; + step2[3] = in[24]; + step2[4] = in[4]; + step2[5] = in[20]; + step2[6] = in[12]; + step2[7] = in[28]; + step2[8] = in[2]; + step2[9] = in[18]; + step2[10] = in[10]; + step2[11] = in[26]; + step2[12] = in[6]; + step2[13] = in[22]; + step2[14] = in[14]; + step2[15] = in[30]; + + // stage 3 + + btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]); + btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]); + btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]); + btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + step1[4] = step2[4]; + step1[5] = step2[5]; + step1[6] = step2[6]; + step1[7] = step2[7]; + + step1[16] = vqaddq_s16(step2[16], step2[17]); + step1[17] = vqsubq_s16(step2[16], step2[17]); + step1[18] = vqsubq_s16(step2[19], step2[18]); + step1[19] = 
vqaddq_s16(step2[19], step2[18]); + step1[20] = vqaddq_s16(step2[20], step2[21]); + step1[21] = vqsubq_s16(step2[20], step2[21]); + step1[22] = vqsubq_s16(step2[23], step2[22]); + step1[23] = vqaddq_s16(step2[23], step2[22]); + step1[24] = vqaddq_s16(step2[24], step2[25]); + step1[25] = vqsubq_s16(step2[24], step2[25]); + step1[26] = vqsubq_s16(step2[27], step2[26]); + step1[27] = vqaddq_s16(step2[27], step2[26]); + step1[28] = vqaddq_s16(step2[28], step2[29]); + step1[29] = vqsubq_s16(step2[28], step2[29]); + step1[30] = vqsubq_s16(step2[31], step2[30]); + step1[31] = vqaddq_s16(step2[31], step2[30]); + + // stage 4 + + btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]); + btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]); + btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(step1[18], step1[29], c8, &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(step1[22], step1[25], c8, &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[8] = vqaddq_s16(step1[8], step1[9]); + step2[9] = vqsubq_s16(step1[8], step1[9]); + step2[10] = vqsubq_s16(step1[11], step1[10]); + step2[11] = vqaddq_s16(step1[11], step1[10]); + step2[12] = vqaddq_s16(step1[12], step1[13]); + step2[13] = vqsubq_s16(step1[12], step1[13]); + step2[14] = vqsubq_s16(step1[15], step1[14]); + step2[15] = vqaddq_s16(step1[15], step1[14]); + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]); + btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]); + btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(step2[10], step2[13], c9, &step1[10], &step1[13]); + + step1[4] = vqaddq_s16(step2[4], step2[5]); + step1[5] = vqsubq_s16(step2[4], step2[5]); + step1[6] = vqsubq_s16(step2[7], step2[6]); + step1[7] = vqaddq_s16(step2[7], step2[6]); + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(step1[20], step1[27], c9, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c9, &step2[21], &step2[26]); + + step2[0] = vqaddq_s16(step1[0], step1[3]); + step2[1] = vqaddq_s16(step1[1], 
step1[2]); + step2[2] = vqsubq_s16(step1[1], step1[2]); + step2[3] = vqsubq_s16(step1[0], step1[3]); + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], 
step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] = vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} + +static INLINE void idct32_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]); + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; + out[16] = step1; + out[17] = step1; + out[18] = step1; + out[19] = step1; + out[20] = step1; + out[21] = step1; + out[22] = step1; + out[23] = step1; + out[24] = step1; + out[25] = step1; + out[26] = step1; + out[27] = step1; + out[28] = step1; + out[29] = step1; + out[30] = step1; + out[31] = step1; +} + +static INLINE void idct32_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + int32x4_t t32[16]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c2 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c3 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[4] = in[4]; + step2[8] = in[2]; + step2[12] = in[6]; + + btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]); + btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]); + btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]); + btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]); + + // stage 3 + step1[0] = step2[0]; + 
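+ // Only in[0]..in[7] can be nonzero here, so half of the stage-2 terms
+ // vanish and the stage-3 butterflies collapse to the plain copies below
+ // (e.g. step1[17] = step2[16] instead of step2[16] - step2[17]).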
step1[4] = step2[4]; + + btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]); + btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]); + + step1[16] = step2[16]; + step1[17] = step2[16]; + step1[18] = step2[19]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[21] = step2[20]; + step1[22] = step2[23]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[24]; + step1[26] = step2[27]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[29] = step2[28]; + step1[30] = step2[31]; + step1[31] = step2[31]; + + // stage 4 + + btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); + btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[8] = step1[8]; + step2[9] = step1[8]; + step2[10] = step1[11]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[13] = step1[12]; + step2[14] = step1[15]; + step2[15] = step1[15]; + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]); + step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]); + + step1[4] = step2[4]; + step1[5] = step2[4]; + step1[6] = step2[7]; + step1[7] = step2[7]; + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]); + + step2[0] = step1[0]; + step2[1] = step1[0]; + step2[2] = step1[0]; + step2[3] = step1[0]; + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], 
step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] 
= vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} + +static INLINE void idct32_low16_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + int32x4_t t32[16]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c2 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c3 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]); + btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]); + btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]); + btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]); + btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]); + btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]); + btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]); + btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]); + + step2[0] = in[0]; + step2[2] = in[8]; + step2[4] = in[4]; + step2[6] = in[12]; + step2[8] = in[2]; + step2[10] = in[10]; + step2[12] = in[6]; + step2[14] = in[14]; + + // stage 3 + + btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]); + btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]); + btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]); + btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]); + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = step2[4]; + step1[6] = step2[6]; + step1[16] = vqaddq_s16(step2[16], step2[17]); + step1[17] = vqsubq_s16(step2[16], step2[17]); + step1[18] = vqsubq_s16(step2[19], step2[18]); + step1[19] = vqaddq_s16(step2[19], step2[18]); + step1[20] = vqaddq_s16(step2[20], step2[21]); + step1[21] = vqsubq_s16(step2[20], step2[21]); + step1[22] = vqsubq_s16(step2[23], step2[22]); + step1[23] = vqaddq_s16(step2[23], step2[22]); + step1[24] = vqaddq_s16(step2[24], step2[25]); + step1[25] = vqsubq_s16(step2[24], step2[25]); + step1[26] = vqsubq_s16(step2[27], step2[26]); + step1[27] = vqaddq_s16(step2[27], step2[26]); + step1[28] = vqaddq_s16(step2[28], step2[29]); + step1[29] = vqsubq_s16(step2[28], step2[29]); + step1[30] = vqsubq_s16(step2[31], step2[30]); + step1[31] = vqaddq_s16(step2[31], step2[30]); + + // stage 4 + + btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); + btf_16_neon(step1[6], 
-cospi[40], cospi[24], &step2[5], &step2[6]); + btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[8] = vqaddq_s16(step1[8], step1[9]); + step2[9] = vqsubq_s16(step1[8], step1[9]); + step2[10] = vqsubq_s16(step1[11], step1[10]); + step2[11] = vqaddq_s16(step1[11], step1[10]); + step2[12] = vqaddq_s16(step1[12], step1[13]); + step2[13] = vqsubq_s16(step1[12], step1[13]); + step2[14] = vqsubq_s16(step1[15], step1[14]); + step2[15] = vqaddq_s16(step1[15], step1[14]); + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]); + + step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]); + btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]); + + step1[4] = vqaddq_s16(step2[4], step2[5]); + step1[5] = vqsubq_s16(step2[4], step2[5]); + step1[6] = vqsubq_s16(step2[7], step2[6]); + step1[7] = vqaddq_s16(step2[7], step2[6]); + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]); + + step2[0] = vqaddq_s16(step1[0], step1[3]); + step2[1] = vqaddq_s16(step1[0], step1[2]); + step2[2] = vqsubq_s16(step1[0], step1[2]); + step2[3] = vqsubq_s16(step1[0], step1[3]); + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = 
step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] = vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = 
vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} +static INLINE void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + btf_16_lane_0_1_neon(step2[27], step2[20], c3, &step1[27], &step1[20]); + btf_16_lane_0_1_neon(step2[26], step2[21], c3, &step1[26], &step1[21]); + btf_16_lane_0_1_neon(step2[25], step2[22], c3, &step1[25], &step1[22]); + btf_16_lane_0_1_neon(step2[24], step2[23], c3, &step1[24], &step1[23]); + + step1[0] = vqaddq_s16(step2[0], step2[15]); + step1[1] = vqaddq_s16(step2[1], step2[14]); + step1[2] = vqaddq_s16(step2[2], step2[13]); + step1[3] = vqaddq_s16(step2[3], step2[12]); + step1[4] = vqaddq_s16(step2[4], step2[11]); + step1[5] = vqaddq_s16(step2[5], step2[10]); + step1[6] = vqaddq_s16(step2[6], step2[9]); + step1[7] = vqaddq_s16(step2[7], step2[8]); + step1[8] = vqsubq_s16(step2[7], step2[8]); + step1[9] = vqsubq_s16(step2[6], step2[9]); + step1[10] = vqsubq_s16(step2[5], step2[10]); + step1[11] = vqsubq_s16(step2[4], step2[11]); + step1[12] = vqsubq_s16(step2[3], step2[12]); + step1[13] = vqsubq_s16(step2[2], step2[13]); + step1[14] = vqsubq_s16(step2[1], step2[14]); + step1[15] = vqsubq_s16(step2[0], step2[15]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[47]); + step1[33] = vqaddq_s16(step2[33], step2[46]); + step1[34] = vqaddq_s16(step2[34], step2[45]); + step1[35] = vqaddq_s16(step2[35], step2[44]); + step1[36] = vqaddq_s16(step2[36], step2[43]); + step1[37] = vqaddq_s16(step2[37], step2[42]); + step1[38] = vqaddq_s16(step2[38], step2[41]); + step1[39] = vqaddq_s16(step2[39], step2[40]); + step1[40] = vqsubq_s16(step2[39], step2[40]); + step1[41] = vqsubq_s16(step2[38], step2[41]); + step1[42] = vqsubq_s16(step2[37], step2[42]); + step1[43] = vqsubq_s16(step2[36], step2[43]); + step1[44] = vqsubq_s16(step2[35], step2[44]); + step1[45] = vqsubq_s16(step2[34], step2[45]); + step1[46] = vqsubq_s16(step2[33], step2[46]); + step1[47] = vqsubq_s16(step2[32], step2[47]); + step1[48] = vqsubq_s16(step2[63], step2[48]); + step1[49] = vqsubq_s16(step2[62], step2[49]); + step1[50] = vqsubq_s16(step2[61], step2[50]); + step1[51] = vqsubq_s16(step2[60], step2[51]); + step1[52] = vqsubq_s16(step2[59], step2[52]); + step1[53] = vqsubq_s16(step2[58], step2[53]); + step1[54] = vqsubq_s16(step2[57], step2[54]); + step1[55] = vqsubq_s16(step2[56], step2[55]); + step1[56] = vqaddq_s16(step2[56], step2[55]); + step1[57] = vqaddq_s16(step2[57], step2[54]); + step1[58] = vqaddq_s16(step2[58], step2[53]); + step1[59] = vqaddq_s16(step2[59], step2[52]); + step1[60] = vqaddq_s16(step2[60], 
step2[51]); + step1[61] = vqaddq_s16(step2[61], step2[50]); + step1[62] = vqaddq_s16(step2[62], step2[49]); + step1[63] = vqaddq_s16(step2[63], step2[48]); +} + +static INLINE void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + btf_16_lane_0_1_neon(step1[55], step1[40], c3, &step2[55], &step2[40]); + btf_16_lane_0_1_neon(step1[54], step1[41], c3, &step2[54], &step2[41]); + btf_16_lane_0_1_neon(step1[53], step1[42], c3, &step2[53], &step2[42]); + btf_16_lane_0_1_neon(step1[52], step1[43], c3, &step2[52], &step2[43]); + btf_16_lane_0_1_neon(step1[51], step1[44], c3, &step2[51], &step2[44]); + btf_16_lane_0_1_neon(step1[50], step1[45], c3, &step2[50], &step2[45]); + btf_16_lane_0_1_neon(step1[49], step1[46], c3, &step2[49], &step2[46]); + btf_16_lane_0_1_neon(step1[48], step1[47], c3, &step2[48], &step2[47]); + + step2[0] = vqaddq_s16(step1[0], step1[31]); + step2[1] = vqaddq_s16(step1[1], step1[30]); + step2[2] = vqaddq_s16(step1[2], step1[29]); + step2[3] = vqaddq_s16(step1[3], step1[28]); + step2[4] = vqaddq_s16(step1[4], step1[27]); + step2[5] = vqaddq_s16(step1[5], step1[26]); + step2[6] = vqaddq_s16(step1[6], step1[25]); + step2[7] = vqaddq_s16(step1[7], step1[24]); + step2[8] = vqaddq_s16(step1[8], step1[23]); + step2[9] = vqaddq_s16(step1[9], step1[22]); + step2[10] = vqaddq_s16(step1[10], step1[21]); + step2[11] = vqaddq_s16(step1[11], step1[20]); + step2[12] = vqaddq_s16(step1[12], step1[19]); + step2[13] = vqaddq_s16(step1[13], step1[18]); + step2[14] = vqaddq_s16(step1[14], step1[17]); + step2[15] = vqaddq_s16(step1[15], step1[16]); + step2[16] = vqsubq_s16(step1[15], step1[16]); + step2[17] = vqsubq_s16(step1[14], step1[17]); + step2[18] = vqsubq_s16(step1[13], step1[18]); + step2[19] = vqsubq_s16(step1[12], step1[19]); + step2[20] = vqsubq_s16(step1[11], step1[20]); + step2[21] = vqsubq_s16(step1[10], step1[21]); + step2[22] = vqsubq_s16(step1[9], step1[22]); + step2[23] = vqsubq_s16(step1[8], step1[23]); + step2[24] = vqsubq_s16(step1[7], step1[24]); + step2[25] = vqsubq_s16(step1[6], step1[25]); + step2[26] = vqsubq_s16(step1[5], step1[26]); + step2[27] = vqsubq_s16(step1[4], step1[27]); + step2[28] = vqsubq_s16(step1[3], step1[28]); + step2[29] = vqsubq_s16(step1[2], step1[29]); + step2[30] = vqsubq_s16(step1[1], step1[30]); + step2[31] = vqsubq_s16(step1[0], step1[31]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[37] = step1[37]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[58] = step1[58]; + step2[59] = step1[59]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; +} + +static INLINE void idct64_low32_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], 
(int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]), + (int16_t)(-cospi[36]), (int16_t)(-cospi[28])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c6 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c7 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[2] = in[16]; + step2[4] = in[8]; + step2[6] = in[24]; + step2[8] = in[4]; + step2[10] = in[20]; + step2[12] = in[12]; + step2[14] = in[28]; + step2[16] = in[2]; + step2[18] = in[18]; + step2[20] = in[10]; + step2[22] = in[26]; + step2[24] = in[6]; + step2[26] = in[22]; + step2[28] = in[14]; + step2[30] = in[30]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[31], -cospi[33], cospi[31], &step2[33], &step2[62]); + btf_16_neon(in[17], cospi[47], cospi[17], &step2[34], &step2[61]); + btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]); + btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]); + btf_16_neon(in[23], -cospi[41], cospi[23], &step2[37], &step2[58]); + btf_16_neon(in[25], cospi[39], cospi[25], &step2[38], &step2[57]); + btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[27], -cospi[37], cospi[27], &step2[41], &step2[54]); + btf_16_neon(in[21], cospi[43], cospi[21], &step2[42], &step2[53]); + btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]); + btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]); + btf_16_neon(in[19], -cospi[45], cospi[19], &step2[45], &step2[50]); + btf_16_neon(in[29], cospi[35], cospi[29], &step2[46], &step2[49]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = step2[4]; + step1[6] = step2[6]; + step1[8] = step2[8]; + step1[10] = step2[10]; + step1[12] = step2[12]; + step1[14] = step2[14]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[30], -cospi[34], cospi[30], &step1[17], &step1[30]); + btf_16_neon(step2[18], cospi[46], cospi[18], &step1[18], &step1[29]); + btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]); + btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]); + btf_16_neon(step2[26], -cospi[42], cospi[22], &step1[21], &step1[26]); + btf_16_neon(step2[22], cospi[38], cospi[26], &step1[22], &step1[25]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + + step1[32] = vqaddq_s16(step2[32], step2[33]); + step1[33] = vqsubq_s16(step2[32], step2[33]); + step1[34] = vqsubq_s16(step2[35], step2[34]); + step1[35] = vqaddq_s16(step2[35], step2[34]); + step1[36] = vqaddq_s16(step2[36], step2[37]); + step1[37] = vqsubq_s16(step2[36], step2[37]); + step1[38] = vqsubq_s16(step2[39], step2[38]); + step1[39] = vqaddq_s16(step2[39], step2[38]); + step1[40] = vqaddq_s16(step2[40], step2[41]); + step1[41] = vqsubq_s16(step2[40], step2[41]); + step1[42] = vqsubq_s16(step2[43], step2[42]); + step1[43] = vqaddq_s16(step2[43], step2[42]); + step1[44] = vqaddq_s16(step2[44], step2[45]); + step1[45] = vqsubq_s16(step2[44], step2[45]); + step1[46] = 
vqsubq_s16(step2[47], step2[46]); + step1[47] = vqaddq_s16(step2[47], step2[46]); + step1[48] = vqaddq_s16(step2[48], step2[49]); + step1[49] = vqsubq_s16(step2[48], step2[49]); + step1[50] = vqsubq_s16(step2[51], step2[50]); + step1[51] = vqaddq_s16(step2[51], step2[50]); + step1[52] = vqaddq_s16(step2[52], step2[53]); + step1[53] = vqsubq_s16(step2[52], step2[53]); + step1[54] = vqsubq_s16(step2[55], step2[54]); + step1[55] = vqaddq_s16(step2[55], step2[54]); + step1[56] = vqaddq_s16(step2[56], step2[57]); + step1[57] = vqsubq_s16(step2[56], step2[57]); + step1[58] = vqsubq_s16(step2[59], step2[58]); + step1[59] = vqaddq_s16(step2[59], step2[58]); + step1[60] = vqaddq_s16(step2[60], step2[61]); + step1[61] = vqsubq_s16(step2[60], step2[61]); + step1[62] = vqsubq_s16(step2[63], step2[62]); + step1[63] = vqaddq_s16(step2[63], step2[62]); + + // stage 4 + + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[4] = step1[4]; + step2[6] = step1[6]; + + btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(step1[14], -cospi[36], cospi[28], &step2[9], &step2[14]); + btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]); + btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]); + btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]); + btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]); + btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]); + btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]); + + step2[16] = vqaddq_s16(step1[16], step1[17]); + step2[17] = vqsubq_s16(step1[16], step1[17]); + step2[18] = vqsubq_s16(step1[19], step1[18]); + step2[19] = vqaddq_s16(step1[19], step1[18]); + step2[20] = vqaddq_s16(step1[20], step1[21]); + step2[21] = vqsubq_s16(step1[20], step1[21]); + step2[22] = vqsubq_s16(step1[23], step1[22]); + step2[23] = vqaddq_s16(step1[23], step1[22]); + step2[24] = vqaddq_s16(step1[24], step1[25]); + step2[25] = vqsubq_s16(step1[24], step1[25]); + step2[26] = vqsubq_s16(step1[27], step1[26]); + step2[27] = vqaddq_s16(step1[27], step1[26]); + step2[28] = vqaddq_s16(step1[28], step1[29]); + step2[29] = vqsubq_s16(step1[28], step1[29]); + step2[30] = vqsubq_s16(step1[31], step1[30]); + step2[31] = vqaddq_s16(step1[31], step1[30]); + step2[32] = step1[32]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[43] = step1[43]; + step2[44] = step1[44]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[51] = step1[51]; + step2[52] = step1[52]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[59] = step1[59]; + step2[60] = step1[60]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + step1[2] = step2[2]; + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]); + btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); + btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]); + btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]); + btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]); + + step1[8] = vqaddq_s16(step2[8], step2[9]); 
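+  /* Note: each stage ping-pongs between the step1[] and step2[] scratch
+   * arrays.  vqaddq_s16/vqsubq_s16 are saturating adds/subs, so 16-bit
+   * intermediates clamp rather than wrap on overflow.  The
+   * btf_16_lane_x_y_neon helpers apply a butterfly rotation, roughly
+   *   out0 = (in0 * c[x] + in1 * c[y] + rnd) >> INV_COS_BIT
+   *   out1 = (in0 * c[y] - in1 * c[x] + rnd) >> INV_COS_BIT
+   * where c is an int16x4_t of preloaded cospi constants,
+   * cospi[i] ~= cos(i * PI / 128) * (1 << cos_bit). */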
+ step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + step1[16] = step2[16]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[35]); + step1[33] = vqaddq_s16(step2[33], step2[34]); + step1[34] = vqsubq_s16(step2[33], step2[34]); + step1[35] = vqsubq_s16(step2[32], step2[35]); + step1[36] = vqsubq_s16(step2[39], step2[36]); + step1[37] = vqsubq_s16(step2[38], step2[37]); + step1[38] = vqaddq_s16(step2[38], step2[37]); + step1[39] = vqaddq_s16(step2[39], step2[36]); + step1[40] = vqaddq_s16(step2[40], step2[43]); + step1[41] = vqaddq_s16(step2[41], step2[42]); + step1[42] = vqsubq_s16(step2[41], step2[42]); + step1[43] = vqsubq_s16(step2[40], step2[43]); + step1[44] = vqsubq_s16(step2[47], step2[44]); + step1[45] = vqsubq_s16(step2[46], step2[45]); + step1[46] = vqaddq_s16(step2[46], step2[45]); + step1[47] = vqaddq_s16(step2[47], step2[44]); + step1[48] = vqaddq_s16(step2[48], step2[51]); + step1[49] = vqaddq_s16(step2[49], step2[50]); + step1[50] = vqsubq_s16(step2[49], step2[50]); + step1[51] = vqsubq_s16(step2[48], step2[51]); + step1[52] = vqsubq_s16(step2[55], step2[52]); + step1[53] = vqsubq_s16(step2[54], step2[53]); + step1[54] = vqaddq_s16(step2[54], step2[53]); + step1[55] = vqaddq_s16(step2[55], step2[52]); + step1[56] = vqaddq_s16(step2[56], step2[59]); + step1[57] = vqaddq_s16(step2[57], step2[58]); + step1[58] = vqsubq_s16(step2[57], step2[58]); + step1[59] = vqsubq_s16(step2[56], step2[59]); + step1[60] = vqsubq_s16(step2[63], step2[60]); + step1[61] = vqsubq_s16(step2[62], step2[61]); + step1[62] = vqaddq_s16(step2[62], step2[61]); + step1[63] = vqaddq_s16(step2[63], step2[60]); + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]); + btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[19]); + step2[17] = vqaddq_s16(step1[17], step1[18]); + step2[18] = vqsubq_s16(step1[17], step1[18]); + step2[19] = vqsubq_s16(step1[16], step1[19]); + step2[20] = vqsubq_s16(step1[23], step1[20]); + step2[21] = vqsubq_s16(step1[22], step1[21]); + step2[22] = 
vqaddq_s16(step1[22], step1[21]); + step2[23] = vqaddq_s16(step1[23], step1[20]); + step2[24] = vqaddq_s16(step1[24], step1[27]); + step2[25] = vqaddq_s16(step1[25], step1[26]); + step2[26] = vqsubq_s16(step1[25], step1[26]); + step2[27] = vqsubq_s16(step1[24], step1[27]); + step2[28] = vqsubq_s16(step1[31], step1[28]); + step2[29] = vqsubq_s16(step1[30], step1[29]); + step2[30] = vqaddq_s16(step1[30], step1[29]); + step2[31] = vqaddq_s16(step1[31], step1[28]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]); + + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + step1[42] = vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] 
= vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + 
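/* Stage 11 mirrors the two halves: out[i] = step2[i] + step2[63 - i] and out[63 - i] = step2[i] - step2[63 - i], both saturating. */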
out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} + +static INLINE void idct64_low1_neon(int16x8_t *input, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + + t32[0] = vmull_n_s16(vget_low_s16(input[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(input[0]), cospi[32]); + + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + // stage 7 + // stage 8 + // stage 9 + // stage 10 + // stage 11 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; + out[16] = step1; + out[17] = step1; + out[18] = step1; + out[19] = step1; + out[20] = step1; + out[21] = step1; + out[22] = step1; + out[23] = step1; + out[24] = step1; + out[25] = step1; + out[26] = step1; + out[27] = step1; + out[28] = step1; + out[29] = step1; + out[30] = step1; + out[31] = step1; + out[32] = step1; + out[33] = step1; + out[34] = step1; + out[35] = step1; + out[36] = step1; + out[37] = step1; + out[38] = step1; + out[39] = step1; + out[40] = step1; + out[41] = step1; + out[42] = step1; + out[43] = step1; + out[44] = step1; + out[45] = step1; + out[46] = step1; + out[47] = step1; + out[48] = step1; + out[49] = step1; + out[50] = step1; + out[51] = step1; + out[52] = step1; + out[53] = step1; + out[54] = step1; + out[55] = step1; + out[56] = 
step1; + out[57] = step1; + out[58] = step1; + out[59] = step1; + out[60] = step1; + out[61] = step1; + out[62] = step1; + out[63] = step1; +} + +static INLINE void idct64_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[36]), (int16_t)(-cospi[28]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c6 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[8] = in[4]; + step2[16] = in[2]; + step2[24] = in[6]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[8] = step2[8]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + + step1[32] = step2[32]; + step1[33] = step2[32]; + step1[38] = step2[39]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[40]; + step1[46] = step2[47]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[48]; + step1[54] = step2[55]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[56]; + step1[62] = step2[63]; + step1[63] = step2[63]; + + // stage 4 + + step2[0] = step1[0]; + + btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_3_2_neon(step1[46], step1[49], c4, &step2[46], &step2[49]); + + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[22] = step1[23]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[30] = step1[31]; + step2[31] = step1[31]; + step2[32] = step1[32]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + + btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); + btf_16_lane_3_2_neon(step2[22], step2[25], c5, &step1[22], &step1[25]); + + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + step1[16] = step2[16]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[31] = step2[31]; + step1[32] = step2[32]; + step1[33] = step2[33]; + step1[34] = step2[33]; + step1[35] = step2[32]; + step1[36] = step2[39]; + step1[37] = 
step2[38]; + step1[38] = step2[38]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[41]; + step1[42] = step2[41]; + step1[43] = step2[40]; + step1[44] = step2[47]; + step1[45] = step2[46]; + step1[46] = step2[46]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[49]; + step1[50] = step2[49]; + step1[51] = step2[48]; + step1[52] = step2[55]; + step1[53] = step2[54]; + step1[54] = step2[54]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[57]; + step1[58] = step2[57]; + step1[59] = step2[56]; + step1[60] = step2[63]; + step1[61] = step2[62]; + step1[62] = step2[62]; + step1[63] = step2[63]; + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c5, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c5, &step2[37], &step2[58]); + btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c5, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], step1[50], c5, &step2[45], &step2[50]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[17]; + step2[19] = step1[16]; + step2[20] = step1[23]; + step2[21] = step1[22]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[26] = step1[25]; + step2[27] = step1[24]; + step2[28] = step1[31]; + step2[29] = step1[30]; + step2[30] = step1[30]; + step2[31] = step1[31]; + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c6, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], step2[26], c6, &step1[21], &step1[26]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[10] = step2[9]; + step1[11] = step2[8]; + step1[12] = step2[15]; + step1[13] = step2[14]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + step1[42] 
= vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] = vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c6, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c6, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c6, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c6, &step2[43], &step2[52]); + + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[3]; + step2[5] = step1[2]; + step2[6] = step1[1]; + step2[7] = step1[0]; + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], 
step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} + +static INLINE void idct64_low16_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]), + (int16_t)(-cospi[36]), 
(int16_t)(-cospi[28])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c6 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c7 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[4] = in[8]; + step2[8] = in[4]; + step2[12] = in[12]; + step2[16] = in[2]; + step2[20] = in[10]; + step2[24] = in[6]; + step2[28] = in[14]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]); + btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]); + btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]); + btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[4] = step2[4]; + step1[8] = step2[8]; + step1[12] = step2[12]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]); + + step1[32] = step2[32]; + step1[33] = step2[32]; + step1[34] = step2[35]; + step1[35] = step2[35]; + step1[36] = step2[36]; + step1[37] = step2[36]; + step1[38] = step2[39]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[40]; + step1[42] = step2[43]; + step1[43] = step2[43]; + step1[44] = step2[44]; + step1[45] = step2[44]; + step1[46] = step2[47]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[48]; + step1[50] = step2[51]; + step1[51] = step2[51]; + step1[52] = step2[52]; + step1[53] = step2[52]; + step1[54] = step2[55]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[56]; + step1[58] = step2[59]; + step1[59] = step2[59]; + step1[60] = step2[60]; + step1[61] = step2[60]; + step1[62] = step2[63]; + step1[63] = step2[63]; + + // stage 4 + + step2[0] = step1[0]; + step2[4] = step1[4]; + + btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]); + btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]); + btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]); + btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]); + btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]); + + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + 
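/* In this eob-limited path one operand of each stage-3/4 add-sub pair derives from coefficients known to be zero, so the pair collapses to plain copies (e.g. step2[30] = step1[31]). */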
step2[31] = step1[31]; + step2[32] = step1[32]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[43] = step1[43]; + step2[44] = step1[44]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[51] = step1[51]; + step2[52] = step1[52]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[59] = step1[59]; + step2[60] = step1[60]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); + btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]); + btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]); + btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]); + + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[16] = step2[16]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[35]); + step1[33] = vqaddq_s16(step2[33], step2[34]); + step1[34] = vqsubq_s16(step2[33], step2[34]); + step1[35] = vqsubq_s16(step2[32], step2[35]); + step1[36] = vqsubq_s16(step2[39], step2[36]); + step1[37] = vqsubq_s16(step2[38], step2[37]); + step1[38] = vqaddq_s16(step2[38], step2[37]); + step1[39] = vqaddq_s16(step2[39], step2[36]); + step1[40] = vqaddq_s16(step2[40], step2[43]); + step1[41] = vqaddq_s16(step2[41], step2[42]); + step1[42] = vqsubq_s16(step2[41], step2[42]); + step1[43] = vqsubq_s16(step2[40], step2[43]); + step1[44] = vqsubq_s16(step2[47], step2[44]); + step1[45] = vqsubq_s16(step2[46], step2[45]); + step1[46] = vqaddq_s16(step2[46], step2[45]); + step1[47] = vqaddq_s16(step2[47], step2[44]); + step1[48] = vqaddq_s16(step2[48], step2[51]); + step1[49] = vqaddq_s16(step2[49], step2[50]); + step1[50] = vqsubq_s16(step2[49], step2[50]); + step1[51] = vqsubq_s16(step2[48], step2[51]); + step1[52] = vqsubq_s16(step2[55], step2[52]); + step1[53] = vqsubq_s16(step2[54], step2[53]); + step1[54] = vqaddq_s16(step2[54], step2[53]); + step1[55] = vqaddq_s16(step2[55], step2[52]); + step1[56] = vqaddq_s16(step2[56], step2[59]); + step1[57] = vqaddq_s16(step2[57], step2[58]); + step1[58] = vqsubq_s16(step2[57], step2[58]); + step1[59] = vqsubq_s16(step2[56], step2[59]); + step1[60] = vqsubq_s16(step2[63], step2[60]); + step1[61] = vqsubq_s16(step2[62], step2[61]); + step1[62] = vqaddq_s16(step2[62], step2[61]); + step1[63] = vqaddq_s16(step2[63], step2[60]); + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]); + btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], 
step1[50], c6, &step2[45], &step2[50]); + + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[19]); + step2[17] = vqaddq_s16(step1[17], step1[18]); + step2[18] = vqsubq_s16(step1[17], step1[18]); + step2[19] = vqsubq_s16(step1[16], step1[19]); + step2[20] = vqsubq_s16(step1[23], step1[20]); + step2[21] = vqsubq_s16(step1[22], step1[21]); + step2[22] = vqaddq_s16(step1[22], step1[21]); + step2[23] = vqaddq_s16(step1[23], step1[20]); + step2[24] = vqaddq_s16(step1[24], step1[27]); + step2[25] = vqaddq_s16(step1[25], step1[26]); + step2[26] = vqsubq_s16(step1[25], step1[26]); + step2[27] = vqsubq_s16(step1[24], step1[27]); + step2[28] = vqsubq_s16(step1[31], step1[28]); + step2[29] = vqsubq_s16(step1[30], step1[29]); + step2[30] = vqaddq_s16(step1[30], step1[29]); + step2[31] = vqaddq_s16(step1[31], step1[28]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + step1[42] = vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], 
step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] = vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + 
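/* From stage 9 onward the low8/low16/low32 paths run identical code, factored into idct64_stage9_neon/idct64_stage10_neon plus this shared stage-11 mirror; each int16x8_t processes 8 coefficients in parallel. */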
out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} + +// Functions for blocks with eob at DC and within +// topleft 8x8, 16x16, 32x32 corner +static const transform_neon + lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct8_low1_neon, idct8_neon, NULL, NULL }, + { iadst8_low1_neon, iadst8_neon, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { + { idct16_low1_neon, idct16_low8_neon, idct16_neon, NULL }, + { iadst16_low1_neon, iadst16_low8_neon, iadst16_neon, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { { idct64_low1_neon, idct64_low8_neon, idct64_low16_neon, + idct64_low32_neon }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + (void)tx_type; + int16x8_t a[32 * 4]; + int16x8_t b[32 * 4]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t 
*shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int32_t *input_1; + int temp_b = 0; + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + input_1 = input; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col); + transpose_s16_8x8q(&a[k], &a[k]); + input_1 += 8; + } + input += (txfm_size_col * 8); + if (abs(rect_type) == 1) { + int y = i * txfm_size_col; + round_shift_for_rect(&a[y], &a[y], txfm_size_col); + } + identity_txfm_round_neon(&a[i * txfm_size_col], &a[i * txfm_size_col], + txw_idx, txfm_size_col, -shift[0]); + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + for (int j = 0; j < buf_size_w_div8; ++j) { + identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row], + txh_idx, txfm_size_row, -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon( + &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[16 * 2]; + int16x8_t b[16 * 2]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int32_t *input_1; + int temp_b = 0; + const transform_neon row_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + + assert(row_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + input_1 = input; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col); + transpose_s16_8x8q(&a[k], &a[k]); + input_1 += 8; + } + input += (txfm_size_col * 8); + if (abs(rect_type) == 1) { + int y = i * txfm_size_col; + round_shift_for_rect(&a[y], &a[y], txfm_size_col); + } + row_txfm(&a[i * txfm_size_col], 
&a[i * txfm_size_col], cos_bit_row, 0); + av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, + -shift[0]); + if (lr_flip == 1) { + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + flip_buf_ud_neon(&a[k], 8); + transpose_s16_8x8q( + &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]); + } + temp_b += 8; + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + } + for (int j = 0; j < buf_size_w_div8; ++j) { + identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row], + txh_idx, txfm_size_row, -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon( + &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[16 * 2]; + int16x8_t b[16 * 2]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const int32_t *input_1; + int temp_b = 0; + const transform_neon col_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + input_1 = input; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col); + transpose_s16_8x8q(&a[k], &a[k]); + input_1 += 8; + } + input += (txfm_size_col * 8); + if (abs(rect_type) == 1) { + int y = i * txfm_size_col; + round_shift_for_rect(&a[y], &a[y], txfm_size_col); + } + identity_txfm_round_neon(&a[i * txfm_size_col], &a[i * txfm_size_col], + txw_idx, txfm_size_col, -shift[0]); + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + for (int j = 0; j < buf_size_w_div8; ++j) { + col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0); + av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, + -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2], + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, + 
uint8_t *output, int stride, + TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_4X4; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 }; + int r, bd = 8; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + row_txfm(input, buf_ptr, cos_bit_row, stage_range); + + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, bd + 8); + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_4X8; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, + 16, 16, 16, 16 }; + int r, bd = 8; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + for (int j = 0; j < txfm_size_col; j++) + temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); + + row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if 
(lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, bd + 8); + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_8X4; + DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, + 16, 16, 16, 16 }; + int r, bd = 8; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + for (int j = 0; j < txfm_size_col; j++) + temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); + + row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, bd + 8); + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_4X16; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int 
txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16 }; + int r, bd = 8; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + row_txfm(input, buf_ptr, cos_bit_row, stage_range); + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, bd + 8); + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_16X4; + DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16 }; + int r, bd = 8; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + row_txfm(input, buf_ptr, cos_bit_row, stage_range); + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, bd + 8); + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < 
txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[64 * 8]; + int16x8_t b[64 * 8]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const int32_t *input_1; + int temp_b = 0; + + const transform_neon row_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_neon col_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + input_1 = input; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + load_buffer_32bit_to_16bit_neon(input_1, &a[k], input_stride); + transpose_s16_8x8q(&a[k], &a[k]); + input_1 += 8; + } + input += (input_stride * 8); + if (abs(rect_type) == 1) { + int y = i * txfm_size_col; + round_shift_for_rect(&a[y], &a[y], input_stride); + } + row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0); + av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, + -shift[0]); + if (lr_flip == 1) { + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + flip_buf_ud_neon(&a[k], 8); + transpose_s16_8x8q( + &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]); + } + temp_b += 8; + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + } + for (int j = 0; j < buf_size_w_div8; ++j) { + col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0); + av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, + -shift[1]); + } + + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2], + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_universe_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_type) { + case IDTX: + lowbd_inv_txfm2d_add_idtx_neon(input, 
output, stride, tx_type, tx_size, + eob); + break; + + case H_DCT: + case H_ADST: + case H_FLIPADST: + lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + + case V_DCT: + case V_ADST: + case V_FLIPADST: + lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + + default: + lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + switch (tx_size) { + case TX_4X4: + lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, eob); + break; + + case TX_4X8: + lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, eob); + break; + + case TX_8X4: + lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, eob); + break; + + case TX_4X16: + lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, eob); + break; + + case TX_16X4: + lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, eob); + break; + + default: + lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type, + tx_size, eob); + break; + } +} +void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + const TX_TYPE tx_type = txfm_param->tx_type; + if (!txfm_param->lossless) { + av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + } else { + av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + } +} diff --git a/libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.h b/libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.h new file mode 100644 index 000000000..9ec658291 --- /dev/null +++ b/libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ +#define AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "av1/common/enums.h" +#include "av1/common/av1_inv_txfm1d.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/av1_txfm.h" + +typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output, + const int8_t cos_bit, + const int8_t *stage_ptr); +typedef void (*transform_neon)(int16x8_t *input, int16x8_t *output, + int8_t cos_bit, int bit); + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x16_default[16]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x32_default[32]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = { + 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x32_default[32]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x16_default[16]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, + 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = { + 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f, +}; + +DECLARE_ALIGNED(16, static const int16_t *, + av1_eob_to_eobxy_default[TX_SIZES_ALL]) = { + NULL, + av1_eob_to_eobxy_8x8_default, + av1_eob_to_eobxy_16x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x16_default, + av1_eob_to_eobxy_16x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x32_default, + av1_eob_to_eobxy_32x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, +}; + +static const int lowbd_txfm_all_1d_zeros_idx[32] = { + 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +// Transform block width 
in log2 for eob (sizes of 64 map to 32) +static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = { + 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5, +}; + +static int eob_fill[32] = { + 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, +}; + +static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + if (eob == 1) { + *eobx = 0; + *eoby = 0; + return; + } + + const int tx_w_log2 = tx_size_wide_log2_eob[tx_size]; + const int eob_row = (eob - 1) >> tx_w_log2; + // Each table entry packs eoby into the high byte and eobx into the low byte. + const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row]; + *eobx = eobxy & 0xFF; + *eoby = eobxy >> 8; +} + +static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_row = tx_size_high[tx_size]; + const int eoby_max = AOMMIN(32, txfm_size_row) - 1; + *eobx = eob / (eoby_max + 1); + *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob]; +} + +static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_col = tx_size_wide[tx_size]; + const int eobx_max = AOMMIN(32, txfm_size_col) - 1; + *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob]; + const int temp_eoby = eob / (eobx_max + 1); + assert(temp_eoby < 32); + *eoby = eob_fill[temp_eoby]; +} + +#endif // AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ diff --git a/libs/libaom/src/av1/common/arm/av1_txfm_neon.c b/libs/libaom/src/av1/common/arm/av1_txfm_neon.c new file mode 100644 index 000000000..7e3a05ab7 --- /dev/null +++ b/libs/libaom/src/av1/common/arm/av1_txfm_neon.c @@ -0,0 +1,30 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <arm_neon.h> +#include <assert.h> + +#include "config/av1_rtcd.h" + +#include "aom_ports/mem.h" +#include "av1/common/arm/mem_neon.h" + +void av1_round_shift_array_neon(int32_t *arr, int size, int bit) { + assert(!(size % 4)); + if (!bit) return; + const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit)); + for (int i = 0; i < size; i += 4) { + int32x4_t tmp_q_s32 = vld1q_s32(arr); + tmp_q_s32 = vrshlq_s32(tmp_q_s32, dup_bits_n_32x4); + vst1q_s32(arr, tmp_q_s32); + arr += 4; + } +}
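+// Reference (a scalar sketch, not part of libaom): for bit > 0, vrshlq_s32
+// with a negated shift count performs a rounding shift right, so the loop
+// above is equivalent to
+//   for (int i = 0; i < size; ++i)
+//     arr[i] = (arr[i] + (1 << (bit - 1))) >> bit;
+// e.g. bit == 4 maps 23 to (23 + 8) >> 4 == 1. A negative bit instead
+// shifts left, where no rounding is needed.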
diff --git a/libs/libaom/src/av1/common/arm/blend_a64_hmask_neon.c b/libs/libaom/src/av1/common/arm/blend_a64_hmask_neon.c new file mode 100644 index 000000000..7134f183e --- /dev/null +++ b/libs/libaom/src/av1/common/arm/blend_a64_hmask_neon.c @@ -0,0 +1,134 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/mem_neon.h" +#include "aom_dsp/aom_dsp_common.h" +#include "config/aom_dsp_rtcd.h"
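+// A sketch of the arithmetic shared by this kernel and the one in
+// blend_a64_vmask_neon.c (assuming AOM_BLEND_A64_ROUND_BITS == 6 and a
+// maximum alpha of 64, per aom_dsp/blend.h): each output pixel is
+//   dst = (m * src0 + (64 - m) * src1 + 32) >> 6,
+// where vmull_u8/vmlal_u8 form the two products and vrshrn_n_u16 applies
+// the rounded shift. e.g. m == 40, src0 == 200, src1 == 100 gives
+// (8000 + 2400 + 32) >> 6 == 163.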
+ +void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 2); + assert(w >= 2); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + uint8x8_t tmp0, tmp1; + uint8x16_t res_q; + uint16x8_t res, res_low, res_high; + uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0); + uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0); + const uint8x8_t vdup_64 = vdup_n_u8((uint8_t)64); + + if (w >= 16) { + const uint8x16_t vdup_64_q = vdupq_n_u8((uint8_t)64); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __builtin_prefetch(src0); + __builtin_prefetch(src1); + const uint8x16_t tmp0_q = vld1q_u8(src0); + const uint8x16_t tmp1_q = vld1q_u8(src1); + const uint8x16_t m_q = vld1q_u8(mask); + const uint8x16_t max_minus_m_q = vsubq_u8(vdup_64_q, m_q); + res_low = vmull_u8(vget_low_u8(m_q), vget_low_u8(tmp0_q)); + res_low = + vmlal_u8(res_low, vget_low_u8(max_minus_m_q), vget_low_u8(tmp1_q)); + res_high = vmull_u8(vget_high_u8(m_q), vget_high_u8(tmp0_q)); + res_high = vmlal_u8(res_high, vget_high_u8(max_minus_m_q), + vget_high_u8(tmp1_q)); + res_q = vcombine_u8(vrshrn_n_u16(res_low, AOM_BLEND_A64_ROUND_BITS), + vrshrn_n_u16(res_high, AOM_BLEND_A64_ROUND_BITS)); + vst1q_u8(dst, res_q); + src0 += 16; + src1 += 16; + dst += 16; + mask += 16; + } + src0 += src0_stride - w; + src1 += src1_stride - w; + dst += dst_stride - w; + mask -= w; + } + } else if (w == 8) { + const uint8x8_t m = vld1_u8(mask); + const uint8x8_t max_minus_m = vsub_u8(vdup_64, m); + for (int i = 0; i < h; ++i) { + __builtin_prefetch(src0); + __builtin_prefetch(src1); + tmp0 = vld1_u8(src0); + tmp1 = vld1_u8(src1); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, max_minus_m, tmp1); + vst1_u8(dst, vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)); + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else if (w == 4) { + const uint8x8_t m = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)mask)); + const uint8x8_t max_minus_m = vsub_u8(vdup_64, m); + for (int i = 0; i < h; i += 2) { + __builtin_prefetch(src0 + 0 * src0_stride); + __builtin_prefetch(src0 + 1 * src0_stride); + __builtin_prefetch(src1 + 0 * src1_stride); + __builtin_prefetch(src1 + 1 * src1_stride); + load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32); + tmp0 = vreinterpret_u8_u32(tmp0_32); + load_unaligned_u8_4x2(src1, src1_stride, &tmp1_32); + tmp1 = vreinterpret_u8_u32(tmp1_32); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, max_minus_m, tmp1); + vst1_lane_u32( + (uint32_t *)(dst + (0 * dst_stride)), + vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0); + vst1_lane_u32( + (uint32_t *)(dst + (1 * dst_stride)), + vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1); + src0 += (2 * src0_stride); + src1 += (2 * src1_stride); + dst += (2 * dst_stride); + } + } else if (w == 2) { + const uint8x8_t m = vreinterpret_u8_u16(vld1_dup_u16((uint16_t *)mask)); + const uint8x8_t max_minus_m = vsub_u8(vdup_64, m); + for (int i = 0; i < h; i += 2) { + __builtin_prefetch(src0 + 0 * src0_stride); + __builtin_prefetch(src0 + 1 * src0_stride); + __builtin_prefetch(src1 + 0 * src1_stride); + __builtin_prefetch(src1 + 1 * src1_stride); + load_unaligned_u8_2x2(src0, src0_stride, &tmp0_16); + tmp0 = vreinterpret_u8_u16(tmp0_16); + load_unaligned_u8_2x2(src1, src1_stride, &tmp1_16); + tmp1 = vreinterpret_u8_u16(tmp1_16); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, max_minus_m, tmp1); + vst1_lane_u16( + (uint16_t *)(dst + (0 * dst_stride)), + vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0); + vst1_lane_u16( + (uint16_t *)(dst + (1 * dst_stride)), + vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1); + src0 += (2 * src0_stride); + src1 += (2 * src1_stride); + dst += (2 * dst_stride); + } + } +} diff --git a/libs/libaom/src/av1/common/arm/blend_a64_vmask_neon.c b/libs/libaom/src/av1/common/arm/blend_a64_vmask_neon.c new file mode 100644 index 000000000..194e94c8c --- /dev/null +++ b/libs/libaom/src/av1/common/arm/blend_a64_vmask_neon.c @@ -0,0 +1,141 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/mem_neon.h" +#include "aom_dsp/aom_dsp_common.h" +#include "config/aom_dsp_rtcd.h" + +void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + uint8x8_t tmp0, tmp1; + uint8x16_t tmp0_q, tmp1_q, res_q; + uint16x8_t res, res_low, res_high; + uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0); + uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0); + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 2); + assert(w >= 2); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (w >= 16) { + for (int i = 0; i < h; ++i) { + const uint8x8_t m = vdup_n_u8((uint8_t)mask[i]); + const uint8x8_t max_minus_m = vdup_n_u8(64 - (uint8_t)mask[i]); + for (int j = 0; j < w; j += 16) { + __builtin_prefetch(src0); + __builtin_prefetch(src1); + tmp0_q = vld1q_u8(src0); + tmp1_q = vld1q_u8(src1); + res_low = vmull_u8(m, vget_low_u8(tmp0_q)); + res_low = vmlal_u8(res_low, max_minus_m, vget_low_u8(tmp1_q)); + res_high = vmull_u8(m, vget_high_u8(tmp0_q)); + res_high = vmlal_u8(res_high, max_minus_m, vget_high_u8(tmp1_q)); + res_q = vcombine_u8(vrshrn_n_u16(res_low, AOM_BLEND_A64_ROUND_BITS), + vrshrn_n_u16(res_high, AOM_BLEND_A64_ROUND_BITS)); + vst1q_u8(dst, res_q); + src0 += 16; + src1 += 16; + dst += 16; + } + src0 += src0_stride - w; + src1 += src1_stride - w; + dst += dst_stride - w; + } + } else if (w == 8) { + for (int i = 0; i < h; ++i) { + __builtin_prefetch(src0); + __builtin_prefetch(src1); + const uint8x8_t m = vdup_n_u8((uint8_t)mask[i]); + const uint8x8_t max_minus_m = vdup_n_u8(64 - (uint8_t)mask[i]); + tmp0 = vld1_u8(src0); + tmp1 = vld1_u8(src1); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, 
max_minus_m, tmp1); + vst1_u8(dst, vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)); + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else if (w == 4) { + for (int i = 0; i < h; i += 2) { + __builtin_prefetch(src0 + 0 * src0_stride); + __builtin_prefetch(src0 + 1 * src0_stride); + __builtin_prefetch(src1 + 0 * src1_stride); + __builtin_prefetch(src1 + 1 * src1_stride); + const uint16x4_t m1 = vdup_n_u16((uint16_t)mask[i]); + const uint16x4_t m2 = vdup_n_u16((uint16_t)mask[i + 1]); + const uint8x8_t m = vmovn_u16(vcombine_u16(m1, m2)); + const uint16x4_t max_minus_m1 = vdup_n_u16(64 - (uint16_t)mask[i]); + const uint16x4_t max_minus_m2 = vdup_n_u16(64 - (uint16_t)mask[i + 1]); + const uint8x8_t max_minus_m = + vmovn_u16(vcombine_u16(max_minus_m1, max_minus_m2)); + load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32); + tmp0 = vreinterpret_u8_u32(tmp0_32); + load_unaligned_u8_4x2(src1, src1_stride, &tmp1_32); + tmp1 = vreinterpret_u8_u32(tmp1_32); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, max_minus_m, tmp1); + vst1_lane_u32( + (uint32_t *)(dst + (0 * dst_stride)), + vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0); + vst1_lane_u32( + (uint32_t *)(dst + (1 * dst_stride)), + vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1); + src0 += (2 * src0_stride); + src1 += (2 * src1_stride); + dst += (2 * dst_stride); + } + } else if (w == 2) { + for (int i = 0; i < h; i += 2) { + __builtin_prefetch(src0 + 0 * src0_stride); + __builtin_prefetch(src0 + 1 * src0_stride); + __builtin_prefetch(src1 + 0 * src1_stride); + __builtin_prefetch(src1 + 1 * src1_stride); + const uint8x8_t m1 = vdup_n_u8(mask[i]); + const uint8x8_t m2 = vdup_n_u8(mask[i + 1]); + const uint16x4x2_t m_trn = + vtrn_u16(vreinterpret_u16_u8(m1), vreinterpret_u16_u8(m2)); + const uint8x8_t m = vreinterpret_u8_u16(m_trn.val[0]); + const uint8x8_t max_minus_m1 = vdup_n_u8(64 - mask[i]); + const uint8x8_t max_minus_m2 = vdup_n_u8(64 - mask[i + 1]); + const uint16x4x2_t max_minus_m_trn = vtrn_u16( + vreinterpret_u16_u8(max_minus_m1), vreinterpret_u16_u8(max_minus_m2)); + const uint8x8_t max_minus_m = vreinterpret_u8_u16(max_minus_m_trn.val[0]); + load_unaligned_u8_2x2(src0, src0_stride, &tmp0_16); + tmp0 = vreinterpret_u8_u16(tmp0_16); + load_unaligned_u8_2x2(src1, src1_stride, &tmp1_16); + tmp1 = vreinterpret_u8_u16(tmp1_16); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, max_minus_m, tmp1); + vst1_lane_u16( + (uint16_t *)(dst + (0 * dst_stride)), + vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0); + vst1_lane_u16( + (uint16_t *)(dst + (1 * dst_stride)), + vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1); + src0 += (2 * src0_stride); + src1 += (2 * src1_stride); + dst += (2 * dst_stride); + } + } +} diff --git a/libs/libaom/src/av1/common/arm/cfl_neon.c b/libs/libaom/src/av1/common/arm/cfl_neon.c new file mode 100644 index 000000000..371be5f0e --- /dev/null +++ b/libs/libaom/src/av1/common/arm/cfl_neon.c @@ -0,0 +1,588 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <arm_neon.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/cfl.h" + +static INLINE void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset, + int16x8_t sub) { + vst1q_s16(dst + offset, + vsubq_s16(vreinterpretq_s16_u16(vld1q_u16(src + offset)), sub)); +} + +static INLINE uint16x8_t vldaddq_u16(const uint16_t *buf, size_t offset) { + return vaddq_u16(vld1q_u16(buf), vld1q_u16(buf + offset)); +} + +// Load half of a vector and duplicate it in the other half. +static INLINE uint8x8_t vldh_dup_u8(const uint8_t *ptr) { + return vreinterpret_u8_u32(vld1_dup_u32((const uint32_t *)ptr)); +} + +// Store half of a vector. +static INLINE void vsth_u16(uint16_t *ptr, uint16x4_t val) { + *((uint32_t *)ptr) = vreinterpret_u32_u16(val)[0]; +} + +// Store half of a vector. +static INLINE void vsth_u8(uint8_t *ptr, uint8x8_t val) { + *((uint32_t *)ptr) = vreinterpret_u32_u8(val)[0]; +} + +static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input)); + const uint16x4_t sum = vpadal_u8(top, vldh_dup_u8(input + input_stride)); + vsth_u16(pred_buf_q3, vshl_n_u16(sum, 1)); + } else if (width == 8) { + const uint16x4_t top = vpaddl_u8(vld1_u8(input)); + const uint16x4_t sum = vpadal_u8(top, vld1_u8(input + input_stride)); + vst1_u16(pred_buf_q3, vshl_n_u16(sum, 1)); + } else if (width == 16) { + const uint16x8_t top = vpaddlq_u8(vld1q_u8(input)); + const uint16x8_t sum = vpadalq_u8(top, vld1q_u8(input + input_stride)); + vst1q_u16(pred_buf_q3, vshlq_n_u16(sum, 1)); + } else { + const uint8x8x4_t top = vld4_u8(input); + const uint8x8x4_t bot = vld4_u8(input + input_stride); + // equivalent to a vpaddlq_u8 (because vld4q interleaves) + const uint16x8_t top_0 = vaddl_u8(top.val[0], top.val[1]); + // equivalent to a vpaddlq_u8 (because vld4q interleaves) + const uint16x8_t bot_0 = vaddl_u8(bot.val[0], bot.val[1]); + // equivalent to a vpaddlq_u8 (because vld4q interleaves) + const uint16x8_t top_1 = vaddl_u8(top.val[2], top.val[3]); + // equivalent to a vpaddlq_u8 (because vld4q interleaves) + const uint16x8_t bot_1 = vaddl_u8(bot.val[2], bot.val[3]); + uint16x8x2_t sum; + sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1); + sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1); + vst2q_u16(pred_buf_q3, sum); + } + input += luma_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_422_lbd_neon(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input)); + vsth_u16(pred_buf_q3, vshl_n_u16(top, 2)); + } else if (width == 8) { + const uint16x4_t top = vpaddl_u8(vld1_u8(input)); + vst1_u16(pred_buf_q3, vshl_n_u16(top, 2)); + } else if (width == 16) { + const uint16x8_t top = vpaddlq_u8(vld1q_u8(input)); + vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 2)); + } else { + const uint8x8x4_t top = vld4_u8(input); + uint16x8x2_t sum; + // vaddl_u8 is equivalent to a vpaddlq_u8 (because vld4q interleaves) + sum.val[0] = vshlq_n_u16(vaddl_u8(top.val[0], 
top.val[1]), 2); + sum.val[1] = vshlq_n_u16(vaddl_u8(top.val[2], top.val[3]), 2); + vst2q_u16(pred_buf_q3, sum); + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x8_t top = vshll_n_u8(vldh_dup_u8(input), 3); + vst1_u16(pred_buf_q3, vget_low_u16(top)); + } else if (width == 8) { + const uint16x8_t top = vshll_n_u8(vld1_u8(input), 3); + vst1q_u16(pred_buf_q3, top); + } else { + const uint8x16_t top = vld1q_u8(input); + vst1q_u16(pred_buf_q3, vshll_n_u8(vget_low_u8(top), 3)); + vst1q_u16(pred_buf_q3 + 8, vshll_n_u8(vget_high_u8(top), 3)); + if (width == 32) { + const uint8x16_t next_top = vld1q_u8(input + 16); + vst1q_u16(pred_buf_q3 + 16, vshll_n_u8(vget_low_u8(next_top), 3)); + vst1q_u16(pred_buf_q3 + 24, vshll_n_u8(vget_high_u8(next_top), 3)); + } + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +#if CONFIG_AV1_HIGHBITDEPTH +#ifndef __aarch64__ +uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { + return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)), + vpadd_u16(vget_low_u16(b), vget_high_u16(b))); +} +#endif + +static void cfl_luma_subsampling_420_hbd_neon(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + const uint16x4_t top = vld1_u16(input); + const uint16x4_t bot = vld1_u16(input + input_stride); + const uint16x4_t sum = vadd_u16(top, bot); + const uint16x4_t hsum = vpadd_u16(sum, sum); + vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 1)); + } else if (width < 32) { + const uint16x8_t top = vld1q_u16(input); + const uint16x8_t bot = vld1q_u16(input + input_stride); + const uint16x8_t sum = vaddq_u16(top, bot); + if (width == 8) { + const uint16x4_t hsum = vget_low_u16(vpaddq_u16(sum, sum)); + vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 1)); + } else { + const uint16x8_t top_1 = vld1q_u16(input + 8); + const uint16x8_t bot_1 = vld1q_u16(input + 8 + input_stride); + const uint16x8_t sum_1 = vaddq_u16(top_1, bot_1); + const uint16x8_t hsum = vpaddq_u16(sum, sum_1); + vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 1)); + } + } else { + const uint16x8x4_t top = vld4q_u16(input); + const uint16x8x4_t bot = vld4q_u16(input + input_stride); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t top_0 = vaddq_u16(top.val[0], top.val[1]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t bot_0 = vaddq_u16(bot.val[0], bot.val[1]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t top_1 = vaddq_u16(top.val[2], top.val[3]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t bot_1 = vaddq_u16(bot.val[2], bot.val[3]); + uint16x8x2_t sum; + sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1); + sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1); + vst2q_u16(pred_buf_q3, sum); + } + input += luma_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_422_hbd_neon(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x4_t top = 
vld1_u16(input); + const uint16x4_t hsum = vpadd_u16(top, top); + vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 2)); + } else if (width == 8) { + const uint16x4x2_t top = vld2_u16(input); + // equivalent to a vpadd_u16 (because vld2 interleaves) + const uint16x4_t hsum = vadd_u16(top.val[0], top.val[1]); + vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 2)); + } else if (width == 16) { + const uint16x8x2_t top = vld2q_u16(input); + // equivalent to a vpaddq_u16 (because vld2q interleaves) + const uint16x8_t hsum = vaddq_u16(top.val[0], top.val[1]); + vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 2)); + } else { + const uint16x8x4_t top = vld4q_u16(input); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t hsum_0 = vaddq_u16(top.val[0], top.val[1]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t hsum_1 = vaddq_u16(top.val[2], top.val[3]); + uint16x8x2_t result = { { vshlq_n_u16(hsum_0, 2), + vshlq_n_u16(hsum_1, 2) } }; + vst2q_u16(pred_buf_q3, result); + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x4_t top = vld1_u16(input); + vst1_u16(pred_buf_q3, vshl_n_u16(top, 3)); + } else if (width == 8) { + const uint16x8_t top = vld1q_u16(input); + vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 3)); + } else if (width == 16) { + uint16x8x2_t top = vld2q_u16(input); + top.val[0] = vshlq_n_u16(top.val[0], 3); + top.val[1] = vshlq_n_u16(top.val[1], 3); + vst2q_u16(pred_buf_q3, top); + } else { + uint16x8x4_t top = vld4q_u16(input); + top.val[0] = vshlq_n_u16(top.val[0], 3); + top.val[1] = vshlq_n_u16(top.val[1], 3); + top.val[2] = vshlq_n_u16(top.val[2], 3); + top.val[3] = vshlq_n_u16(top.val[3], 3); + vst4q_u16(pred_buf_q3, top); + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +CFL_GET_SUBSAMPLE_FUNCTION(neon) + +static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst, + int width, int height, + int round_offset, + const int num_pel_log2) { + const uint16_t *const end = src + height * CFL_BUF_LINE; + + // Round offset is not needed, because NEON will handle the rounding. + (void)round_offset; + + // To optimize the use of the CPU pipeline, we process 4 rows per iteration. + const int step = 4 * CFL_BUF_LINE; + + // At this stage, the prediction buffer contains scaled reconstructed luma + // pixels, which are positive integers and only require 15 bits. By using + // an unsigned integer for the sum, we can do one addition operation inside + // 16 bits (8 lanes) before having to convert to 32 bits (4 lanes). + const uint16_t *sum_buf = src; + uint32x4_t sum_32x4 = { 0, 0, 0, 0 }; + do { + // For all widths, we load, add and combine the data so it fits in 4 lanes.
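+    // A plain-C reference for this summation (a sketch, not the actual
+    // implementation):
+    //   uint32_t sum = 0;
+    //   for (int r = 0; r < height; ++r)
+    //     for (int c = 0; c < width; ++c) sum += src[r * CFL_BUF_LINE + c];
+    // The branches below compute the same total 4, 8, 16 or 32 pixels at a
+    // time, widening into the four 32-bit lanes of sum_32x4.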
+ if (width == 4) { + const uint16x4_t a0 = + vadd_u16(vld1_u16(sum_buf), vld1_u16(sum_buf + CFL_BUF_LINE)); + const uint16x4_t a1 = vadd_u16(vld1_u16(sum_buf + 2 * CFL_BUF_LINE), + vld1_u16(sum_buf + 3 * CFL_BUF_LINE)); + sum_32x4 = vaddq_u32(sum_32x4, vaddl_u16(a0, a1)); + } else if (width == 8) { + const uint16x8_t a0 = vldaddq_u16(sum_buf, CFL_BUF_LINE); + const uint16x8_t a1 = + vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, CFL_BUF_LINE); + sum_32x4 = vpadalq_u16(sum_32x4, a0); + sum_32x4 = vpadalq_u16(sum_32x4, a1); + } else { + const uint16x8_t row0 = vldaddq_u16(sum_buf, 8); + const uint16x8_t row1 = vldaddq_u16(sum_buf + CFL_BUF_LINE, 8); + const uint16x8_t row2 = vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, 8); + const uint16x8_t row3 = vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE, 8); + sum_32x4 = vpadalq_u16(sum_32x4, row0); + sum_32x4 = vpadalq_u16(sum_32x4, row1); + sum_32x4 = vpadalq_u16(sum_32x4, row2); + sum_32x4 = vpadalq_u16(sum_32x4, row3); + + if (width == 32) { + const uint16x8_t row0_1 = vldaddq_u16(sum_buf + 16, 8); + const uint16x8_t row1_1 = vldaddq_u16(sum_buf + CFL_BUF_LINE + 16, 8); + const uint16x8_t row2_1 = + vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE + 16, 8); + const uint16x8_t row3_1 = + vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE + 16, 8); + + sum_32x4 = vpadalq_u16(sum_32x4, row0_1); + sum_32x4 = vpadalq_u16(sum_32x4, row1_1); + sum_32x4 = vpadalq_u16(sum_32x4, row2_1); + sum_32x4 = vpadalq_u16(sum_32x4, row3_1); + } + } + sum_buf += step; + } while (sum_buf < end); + + // Permute and add in such a way that each lane contains the block sum. + // [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A] +#ifdef __aarch64__ + sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4); + sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4); +#else + uint32x4_t flip = + vcombine_u32(vget_high_u32(sum_32x4), vget_low_u32(sum_32x4)); + sum_32x4 = vaddq_u32(sum_32x4, flip); + sum_32x4 = vaddq_u32(sum_32x4, vrev64q_u32(sum_32x4)); +#endif + + // Computing the average could be done using scalars, but getting off the NEON + // engine introduces latency, so we use vqrshrn. + int16x4_t avg_16x4; + // Constant propagation makes for some ugly code. 
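+  // The switch exists because vqrshrn_n_u32 needs an immediate shift count.
+  // Each case computes (sum + (1 << (num_pel_log2 - 1))) >> num_pel_log2,
+  // and num_pel_log2 == log2(width * height), so every lane ends up holding
+  // the rounded block average. e.g. a 4x4 block (num_pel_log2 == 4) with
+  // sum == 1000 gives (1000 + 8) >> 4 == 63.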
+ switch (num_pel_log2) { + case 4: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 4)); break; + case 5: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 5)); break; + case 6: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 6)); break; + case 7: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 7)); break; + case 8: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 8)); break; + case 9: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 9)); break; + case 10: + avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 10)); + break; + default: assert(0); + } + + if (width == 4) { + do { + vst1_s16(dst, vsub_s16(vreinterpret_s16_u16(vld1_u16(src)), avg_16x4)); + src += CFL_BUF_LINE; + dst += CFL_BUF_LINE; + } while (src < end); + } else { + const int16x8_t avg_16x8 = vcombine_s16(avg_16x4, avg_16x4); + do { + vldsubstq_s16(dst, src, 0, avg_16x8); + vldsubstq_s16(dst, src, CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 3 * CFL_BUF_LINE, avg_16x8); + + if (width > 8) { + vldsubstq_s16(dst, src, 8, avg_16x8); + vldsubstq_s16(dst, src, 8 + CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 8 + 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 8 + 3 * CFL_BUF_LINE, avg_16x8); + } + if (width == 32) { + vldsubstq_s16(dst, src, 16, avg_16x8); + vldsubstq_s16(dst, src, 16 + CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 16 + 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 16 + 3 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 24, avg_16x8); + vldsubstq_s16(dst, src, 24 + CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 24 + 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 24 + 3 * CFL_BUF_LINE, avg_16x8); + } + src += step; + dst += step; + } while (src < end); + } +} + +CFL_SUB_AVG_FN(neon) + +// Saturating negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +// Notes: +// * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in +// practice, as scaled_luma is the multiplication of two absolute values. +// * In the Intel equivalent, elements in a are zeroed out when the +// corresponding elements in b are zero. Because vsign is used twice in a +// row, with b in the first call becoming a in the second call, there's no +// impact from not zeroing out. +static int16x4_t vsign_s16(int16x4_t a, int16x4_t b) { + const int16x4_t mask = vshr_n_s16(b, 15); + return veor_s16(vadd_s16(a, mask), mask); +} + +// Saturating negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +// Notes: +// * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in +// practice, as scaled_luma is the multiplication of two absolute values. +// * In the Intel equivalent, elements in a are zeroed out when the +// corresponding elements in b are zero. Because vsignq is used twice in a +// row, with b in the first call becoming a in the second call, there's no +// impact from not zeroing out. 
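+// A scalar sketch of the lane operation used by vsign_s16/vsignq_s16:
+//   int16_t mask = b >> 15;            // arithmetic shift: 0 or -1
+//   int16_t out = (a + mask) ^ mask;   // b >= 0 keeps a; b < 0 yields -a
+// e.g. a == 7, b == -3: mask == -1 and (7 - 1) ^ 0xFFFF == -7.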
+static int16x8_t vsignq_s16(int16x8_t a, int16x8_t b) { + const int16x8_t mask = vshrq_n_s16(b, 15); + return veorq_s16(vaddq_s16(a, mask), mask); +} + +static INLINE int16x4_t predict_w4(const int16_t *pred_buf_q3, + int16x4_t alpha_sign, int abs_alpha_q12, + int16x4_t dc) { + const int16x4_t ac_q3 = vld1_s16(pred_buf_q3); + const int16x4_t ac_sign = veor_s16(alpha_sign, ac_q3); + int16x4_t scaled_luma = vqrdmulh_n_s16(vabs_s16(ac_q3), abs_alpha_q12); + return vadd_s16(vsign_s16(scaled_luma, ac_sign), dc); +} + +static INLINE int16x8_t predict_w8(const int16_t *pred_buf_q3, + int16x8_t alpha_sign, int abs_alpha_q12, + int16x8_t dc) { + const int16x8_t ac_q3 = vld1q_s16(pred_buf_q3); + const int16x8_t ac_sign = veorq_s16(alpha_sign, ac_q3); + int16x8_t scaled_luma = vqrdmulhq_n_s16(vabsq_s16(ac_q3), abs_alpha_q12); + return vaddq_s16(vsignq_s16(scaled_luma, ac_sign), dc); +} + +static INLINE int16x8x2_t predict_w16(const int16_t *pred_buf_q3, + int16x8_t alpha_sign, int abs_alpha_q12, + int16x8_t dc) { + // vld2q_s16 interleaves, which is not useful for prediction. vst1q_s16_x2 + // does not interleave, but is not currently available in the compiler used + // by the AOM build system. + const int16x8x2_t ac_q3 = vld2q_s16(pred_buf_q3); + const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]); + const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]); + const int16x8_t scaled_luma_0 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12); + const int16x8_t scaled_luma_1 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12); + int16x8x2_t result; + result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc); + result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc); + return result; +} + +static INLINE int16x8x4_t predict_w32(const int16_t *pred_buf_q3, + int16x8_t alpha_sign, int abs_alpha_q12, + int16x8_t dc) { + // vld4q_s16 interleaves, which is not useful for prediction. vst1q_s16_x4 + // does not interleave, but is not currently available in the compiler used + // by the AOM build system.
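+  // The de-interleaved lane order is harmless here: every step below is
+  // lane-wise, and the callers store the result with vst4_u8/vst4q_u16,
+  // which interleave the four sub-vectors back into memory order.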
+ const int16x8x4_t ac_q3 = vld4q_s16(pred_buf_q3); + const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]); + const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]); + const int16x8_t ac_sign_2 = veorq_s16(alpha_sign, ac_q3.val[2]); + const int16x8_t ac_sign_3 = veorq_s16(alpha_sign, ac_q3.val[3]); + const int16x8_t scaled_luma_0 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12); + const int16x8_t scaled_luma_1 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12); + const int16x8_t scaled_luma_2 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[2]), abs_alpha_q12); + const int16x8_t scaled_luma_3 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[3]), abs_alpha_q12); + int16x8x4_t result; + result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc); + result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc); + result.val[2] = vaddq_s16(vsignq_s16(scaled_luma_2, ac_sign_2), dc); + result.val[3] = vaddq_s16(vsignq_s16(scaled_luma_3, ac_sign_3), dc); + return result; +} + +static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3, + uint8_t *dst, int dst_stride, + int alpha_q3, int width, int height) { + const int16_t abs_alpha_q12 = abs(alpha_q3) << 9; + const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE; + if (width == 4) { + const int16x4_t alpha_sign = vdup_n_s16(alpha_q3); + const int16x4_t dc = vdup_n_s16(*dst); + do { + const int16x4_t pred = + predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vsth_u8(dst, vqmovun_s16(vcombine_s16(pred, pred))); + dst += dst_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); + } else { + const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3); + const int16x8_t dc = vdupq_n_s16(*dst); + do { + if (width == 8) { + vst1_u8(dst, vqmovun_s16(predict_w8(pred_buf_q3, alpha_sign, + abs_alpha_q12, dc))); + } else if (width == 16) { + const int16x8x2_t pred = + predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + const uint8x8x2_t predun = { { vqmovun_s16(pred.val[0]), + vqmovun_s16(pred.val[1]) } }; + vst2_u8(dst, predun); + } else { + const int16x8x4_t pred = + predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + const uint8x8x4_t predun = { + { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]), + vqmovun_s16(pred.val[2]), vqmovun_s16(pred.val[3]) } + }; + vst4_u8(dst, predun); + } + dst += dst_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); + } +} + +CFL_PREDICT_FN(neon, lbd) + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) { + return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0))); +} + +static INLINE uint16x8_t clampq_s16(int16x8_t a, int16x8_t max) { + return vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a, max), vdupq_n_s16(0))); +} + +static INLINE uint16x8x2_t clamp2q_s16(int16x8x2_t a, int16x8_t max) { + uint16x8x2_t result; + result.val[0] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0))); + result.val[1] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0))); + return result; +} + +static INLINE uint16x8x4_t clamp4q_s16(int16x8x4_t a, int16x8_t max) { + uint16x8x4_t result; + result.val[0] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0))); + result.val[1] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0))); + result.val[2] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[2], max), vdupq_n_s16(0))); + result.val[3] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[3], max), 
vdupq_n_s16(0))); + return result; +} + +static INLINE void cfl_predict_hbd_neon(const int16_t *pred_buf_q3, + uint16_t *dst, int dst_stride, + int alpha_q3, int bd, int width, + int height) { + const int max = (1 << bd) - 1; + const int16_t abs_alpha_q12 = abs(alpha_q3) << 9; + const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE; + if (width == 4) { + const int16x4_t alpha_sign = vdup_n_s16(alpha_q3); + const int16x4_t dc = vdup_n_s16(*dst); + const int16x4_t max_16x4 = vdup_n_s16(max); + do { + const int16x4_t scaled_luma = + predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vst1_u16(dst, clamp_s16(scaled_luma, max_16x4)); + dst += dst_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); + } else { + const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3); + const int16x8_t dc = vdupq_n_s16(*dst); + const int16x8_t max_16x8 = vdupq_n_s16(max); + do { + if (width == 8) { + const int16x8_t pred = + predict_w8(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vst1q_u16(dst, clampq_s16(pred, max_16x8)); + } else if (width == 16) { + const int16x8x2_t pred = + predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vst2q_u16(dst, clamp2q_s16(pred, max_16x8)); + } else { + const int16x8x4_t pred = + predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vst4q_u16(dst, clamp4q_s16(pred, max_16x8)); + } + dst += dst_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); + } +} + +CFL_PREDICT_FN(neon, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/av1/common/arm/convolve_neon.c b/libs/libaom/src/av1/common/arm/convolve_neon.c new file mode 100644 index 000000000..51c96961c --- /dev/null +++ b/libs/libaom/src/av1/common/arm/convolve_neon.c @@ -0,0 +1,1593 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <arm_neon.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/arm/convolve_neon.h" +#include "av1/common/arm/mem_neon.h" +#include "av1/common/arm/transpose_neon.h" + +static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16_t *filter) { + int16x4_t sum; + + sum = vmul_n_s16(s0, filter[0]); + sum = vmla_n_s16(sum, s1, filter[1]); + sum = vmla_n_s16(sum, s2, filter[2]); + sum = vmla_n_s16(sum, s5, filter[5]); + sum = vmla_n_s16(sum, s6, filter[6]); + sum = vmla_n_s16(sum, s7, filter[7]); + /* filter[3] can take a max value of 128.
So the max value of the result : + * 128*255 + sum > 16 bits + */ + sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3])); + sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4])); + + return sum; +} + +static INLINE uint8x8_t convolve8_horiz_8x8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16_t *filter, + const int16x8_t shift_round_0, const int16x8_t shift_by_bits) { + int16x8_t sum; + + sum = vmulq_n_s16(s0, filter[0]); + sum = vmlaq_n_s16(sum, s1, filter[1]); + sum = vmlaq_n_s16(sum, s2, filter[2]); + sum = vmlaq_n_s16(sum, s5, filter[5]); + sum = vmlaq_n_s16(sum, s6, filter[6]); + sum = vmlaq_n_s16(sum, s7, filter[7]); + /* filter[3] can take a max value of 128. So the max value of the result : + * 128*255 + sum > 16 bits + */ + sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3])); + sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4])); + + sum = vqrshlq_s16(sum, shift_round_0); + sum = vqrshlq_s16(sum, shift_by_bits); + + return vqmovun_s16(sum); +} + +#if !defined(__aarch64__) +static INLINE uint8x8_t convolve8_horiz_4x1( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16_t *filter, + const int16x4_t shift_round_0, const int16x4_t shift_by_bits) { + int16x4_t sum; + + sum = vmul_n_s16(s0, filter[0]); + sum = vmla_n_s16(sum, s1, filter[1]); + sum = vmla_n_s16(sum, s2, filter[2]); + sum = vmla_n_s16(sum, s5, filter[5]); + sum = vmla_n_s16(sum, s6, filter[6]); + sum = vmla_n_s16(sum, s7, filter[7]); + /* filter[3] can take a max value of 128. So the max value of the result : + * 128*255 + sum > 16 bits + */ + sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3])); + sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4])); + + sum = vqrshl_s16(sum, shift_round_0); + sum = vqrshl_s16(sum, shift_by_bits); + + return vqmovun_s16(vcombine_s16(sum, sum)); +} +#endif // !defined(__aarch64__) + +static INLINE uint8x8_t convolve8_vert_8x4( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16_t *filter) { + int16x8_t sum; + + sum = vmulq_n_s16(s0, filter[0]); + sum = vmlaq_n_s16(sum, s1, filter[1]); + sum = vmlaq_n_s16(sum, s2, filter[2]); + sum = vmlaq_n_s16(sum, s5, filter[5]); + sum = vmlaq_n_s16(sum, s6, filter[6]); + sum = vmlaq_n_s16(sum, s7, filter[7]); + /* filter[3] can take a max value of 128.
So the max value of the result : + * 128*255 + sum > 16 bits + */ + sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3])); + sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4])); + + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +static INLINE uint16x4_t convolve8_vert_4x4_s32( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter, + const int32x4_t round_shift_vec, const int32x4_t offset_const, + const int32x4_t sub_const_vec) { + int32x4_t sum0; + uint16x4_t res; + const int32x4_t zero = vdupq_n_s32(0); + + sum0 = vmull_n_s16(s0, y_filter[0]); + sum0 = vmlal_n_s16(sum0, s1, y_filter[1]); + sum0 = vmlal_n_s16(sum0, s2, y_filter[2]); + sum0 = vmlal_n_s16(sum0, s3, y_filter[3]); + sum0 = vmlal_n_s16(sum0, s4, y_filter[4]); + sum0 = vmlal_n_s16(sum0, s5, y_filter[5]); + sum0 = vmlal_n_s16(sum0, s6, y_filter[6]); + sum0 = vmlal_n_s16(sum0, s7, y_filter[7]); + + sum0 = vaddq_s32(sum0, offset_const); + sum0 = vqrshlq_s32(sum0, round_shift_vec); + sum0 = vsubq_s32(sum0, sub_const_vec); + sum0 = vmaxq_s32(sum0, zero); + + res = vmovn_u32(vreinterpretq_u32_s32(sum0)); + + return res; +} + +static INLINE uint8x8_t convolve8_vert_8x4_s32( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter, + const int32x4_t round_shift_vec, const int32x4_t offset_const, + const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) { + int32x4_t sum0, sum1; + uint16x8_t res; + const int32x4_t zero = vdupq_n_s32(0); + + sum0 = vmull_n_s16(vget_low_s16(s0), y_filter[0]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s1), y_filter[1]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s2), y_filter[2]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), y_filter[3]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s4), y_filter[4]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s5), y_filter[5]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s6), y_filter[6]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s7), y_filter[7]); + + sum1 = vmull_n_s16(vget_high_s16(s0), y_filter[0]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s1), y_filter[1]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s2), y_filter[2]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), y_filter[3]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s4), y_filter[4]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s5), y_filter[5]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s6), y_filter[6]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s7), y_filter[7]); + + sum0 = vaddq_s32(sum0, offset_const); + sum1 = vaddq_s32(sum1, offset_const); + sum0 = vqrshlq_s32(sum0, round_shift_vec); + sum1 = vqrshlq_s32(sum1, round_shift_vec); + sum0 = vsubq_s32(sum0, sub_const_vec); + sum1 = vsubq_s32(sum1, sub_const_vec); + sum0 = vmaxq_s32(sum0, zero); + sum1 = vmaxq_s32(sum1, zero); + res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum0)), + vqmovn_u32(vreinterpretq_u32_s32(sum1))); + + res = vqrshlq_u16(res, vec_round_bits); + + return vqmovn_u16(res); +} + +void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const uint8_t horiz_offset = filter_params_x->taps / 2 - 1; + const int8_t bits = FILTER_BITS - conv_params->round_0; + + (void)subpel_y_qn; + 
(void)conv_params; + (void)filter_params_y; + + uint8x8_t t0; +#if defined(__aarch64__) + uint8x8_t t1, t2, t3; +#endif + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0); + const int16x8_t shift_by_bits = vdupq_n_s16(-bits); + + src -= horiz_offset; +#if defined(__aarch64__) + if (h == 4) { + uint8x8_t d01, d23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + int16x8_t d01_temp, d23_temp; + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + src += 7; + + do { + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter); + + d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter); + + d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter); + + d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter); + + d01_temp = vqrshlq_s16(vcombine_s16(d0, d1), shift_round_0); + d23_temp = vqrshlq_s16(vcombine_s16(d2, d3), shift_round_0); + + d01_temp = vqrshlq_s16(d01_temp, shift_by_bits); + d23_temp = vqrshlq_s16(d23_temp, shift_by_bits); + + d01 = vqmovun_s16(d01_temp); + d23 = vqmovun_s16(d23_temp); + + transpose_u8_4x4(&d01, &d23); + + if (w != 2) { + vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), // 00 01 02 03 + vreinterpret_u32_u8(d01), 0); + vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), // 10 11 12 13 + vreinterpret_u32_u8(d23), 0); + vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), // 20 21 22 23 + vreinterpret_u32_u8(d01), 1); + vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), // 30 31 32 33 + vreinterpret_u32_u8(d23), 1); + } else { + vst1_lane_u16((uint16_t *)(dst + 0 * dst_stride), // 00 01 + vreinterpret_u16_u8(d01), 0); + vst1_lane_u16((uint16_t *)(dst + 1 * dst_stride), // 10 11 + vreinterpret_u16_u8(d23), 0); + vst1_lane_u16((uint16_t *)(dst + 2 * dst_stride), // 20 21 + vreinterpret_u16_u8(d01), 2); + vst1_lane_u16((uint16_t *)(dst + 3 * dst_stride), // 30 31 + vreinterpret_u16_u8(d23), 2); + } + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w > 0); + } else { +#endif + int width; + const uint8_t *s; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + +#if 
defined(__aarch64__) + int16x8_t s8, s9, s10; + uint8x8_t t4, t5, t6, t7; +#endif + + if (w <= 4) { +#if defined(__aarch64__) + do { + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + src += 8 * src_stride; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); + + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + shift_round_0, shift_by_bits); + t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, + shift_round_0, shift_by_bits); + t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, + shift_round_0, shift_by_bits); + t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, + shift_round_0, shift_by_bits); + + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + if ((w == 4) && (h > 4)) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), + 0); // 10 11 12 13 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), + 0); // 20 21 22 23 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), + 0); // 30 31 32 33 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), + 1); // 40 41 42 43 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), + 1); // 50 51 52 53 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), + 1); // 60 61 62 63 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), + 1); // 70 71 72 73 + dst += dst_stride; + } else if ((w == 4) && (h == 2)) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), + 0); // 10 11 12 13 + dst += dst_stride; + } else if ((w == 2) && (h > 4)) { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0); // 10 11 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 0); // 20 21 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 0); // 30 31 + dst += dst_stride; + 
vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 2); // 40 41 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 2); // 50 51 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 2); // 60 61 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 2); // 70 71 + dst += dst_stride; + } else if ((w == 2) && (h == 2)) { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0); // 10 11 + dst += dst_stride; + } + h -= 8; + } while (h > 0); +#else + int16x8_t tt0; + int16x4_t x0, x1, x2, x3, x4, x5, x6, x7; + const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0); + const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits); + do { + t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + x0 = vget_low_s16(tt0); // a0 a1 a2 a3 + x4 = vget_high_s16(tt0); // a4 a5 a6 a7 + + t0 = vld1_u8(src + 8); // a8 a9 a10 a11 a12 a13 a14 a15 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + x7 = vget_low_s16(tt0); // a8 a9 a10 a11 + + x1 = vext_s16(x0, x4, 1); // a1 a2 a3 a4 + x2 = vext_s16(x0, x4, 2); // a2 a3 a4 a5 + x3 = vext_s16(x0, x4, 3); // a3 a4 a5 a6 + x5 = vext_s16(x4, x7, 1); // a5 a6 a7 a8 + x6 = vext_s16(x4, x7, 2); // a6 a7 a8 a9 + x7 = vext_s16(x4, x7, 3); // a7 a8 a9 a10 + + src += src_stride; + + t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter, + shift_round_0_low, shift_by_bits_low); + + if (w == 4) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + dst += dst_stride; + } else if (w == 2) { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01 + dst += dst_stride; + } + h -= 1; + } while (h > 0); +#endif + } else { + uint8_t *d; + int16x8_t s11; +#if defined(__aarch64__) + int16x8_t s12, s13, s14; + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + width = w; + s = src + 7; + d = dst; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = 
vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + shift_round_0, shift_by_bits); + + t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, + shift_round_0, shift_by_bits); + + t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, + shift_round_0, shift_by_bits); + + t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, + shift_round_0, shift_by_bits); + + t4 = convolve8_horiz_8x8(s4, s5, s6, s7, s8, s9, s10, s11, x_filter, + shift_round_0, shift_by_bits); + + t5 = convolve8_horiz_8x8(s5, s6, s7, s8, s9, s10, s11, s12, x_filter, + shift_round_0, shift_by_bits); + + t6 = convolve8_horiz_8x8(s6, s7, s8, s9, s10, s11, s12, s13, x_filter, + shift_round_0, shift_by_bits); + + t7 = convolve8_horiz_8x8(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, shift_round_0, shift_by_bits); + + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + if (h != 2) { + store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); + } else { + store_row2_u8_8x8(d, dst_stride, t0, t1); + } + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); +#else + do { + t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + width = w; + s = src + 8; + d = dst; + __builtin_prefetch(dst); + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s11 = s0; + s0 = s7; + + s1 = vextq_s16(s11, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(s11, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(s11, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(s11, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + s5 = vextq_s16(s11, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(s11, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(s11, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter, + shift_round_0, shift_by_bits); + vst1_u8(d, t0); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += src_stride; + dst += dst_stride; + h -= 1; + } while (h > 0); +#endif + } +#if defined(__aarch64__) + } +#endif +} + +void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int vert_offset = filter_params_y->taps / 2 - 1; + + src -= vert_offset * src_stride; + + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + if (w <= 4) { + uint8x8_t d01; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; +#if defined(__aarch64__) + uint8x8_t d23; + int16x4_t s8, s9, s10, d1, d2, d3; +#endif + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src 
+= src_stride; + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + + do { + s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; +#if defined(__aarch64__) + s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); + + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + if ((w == 4) && (h != 2)) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), + 0); // 00 01 02 03 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), + 1); // 10 11 12 13 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), + 0); // 20 21 22 23 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), + 1); // 30 31 32 33 + dst += dst_stride; + } else if ((w == 4) && (h == 2)) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), + 0); // 00 01 02 03 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), + 1); // 10 11 12 13 + dst += dst_stride; + } else if ((w == 2) && (h != 2)) { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0); // 00 01 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2); // 10 11 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 0); // 20 21 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 2); // 30 31 + dst += dst_stride; + } else if ((w == 2) && (h == 2)) { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0); // 00 01 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2); // 10 11 + dst += dst_stride; + } + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + h -= 4; +#else + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + + d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + + d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS); + + if (w == 4) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); + dst += dst_stride; + } else if (w == 2) { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0); + dst += dst_stride; + } + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + h -= 1; +#endif + } while (h > 0); + } else { + int height; + const uint8_t *s; + uint8_t *d; + uint8x8_t t0; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; 
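+ // Wide blocks are processed in 8-pixel-wide column strips; within a strip + // the inner loop produces four output rows per iteration on AArch64 and one + // row on 32-bit Arm.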
+#if defined(__aarch64__) + uint8x8_t t1, t2, t3; + int16x8_t s8, s9, s10; +#endif + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + s = src; + s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + d = dst; + height = h; + + do { + s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; +#if defined(__aarch64__) + s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + t1 = convolve8_vert_8x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + t2 = convolve8_vert_8x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + t3 = convolve8_vert_8x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); + if (h != 2) { + vst1_u8(d, t0); + d += dst_stride; + vst1_u8(d, t1); + d += dst_stride; + vst1_u8(d, t2); + d += dst_stride; + vst1_u8(d, t3); + d += dst_stride; + } else { + vst1_u8(d, t0); + d += dst_stride; + vst1_u8(d, t1); + d += dst_stride; + } + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; +#else + __builtin_prefetch(d); + __builtin_prefetch(s); + + t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + + vst1_u8(d, t0); + d += dst_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; +#endif + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } +} + +// Horizontal filtering for convolve_2d_sr for width multiple of 8 +// Processes one row at a time +static INLINE void horiz_filter_w8_single_row( + const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, + const int dst_stride, int width, int height, const int16_t *x_filter, + const int16x8_t horiz_const, const int16x8_t shift_round_0) { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + do { + uint8x8_t t0 = vld1_u8(src_ptr); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + + int width_tmp = width; + const uint8_t *s = src_ptr + 8; + int16_t *dst_tmp = dst_ptr; + + __builtin_prefetch(dst_ptr); + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t sum = s0; + s0 = s7; + + s1 = vextq_s16(sum, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(sum, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(sum, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(sum, s7, 4); // a4 a5 a6 a7 a8 a9 
a10 a11 + s5 = vextq_s16(sum, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(sum, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(sum, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + int16x8_t res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, + x_filter, horiz_const, shift_round_0); + + vst1q_s16(dst_tmp, res0); + + s += 8; + dst_tmp += 8; + width_tmp -= 8; + } while (width_tmp > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + height--; + } while (height > 0); +} + +// Horizontal filtering for convolve_2d_sr for width <= 4 +// Processes one row at a time +static INLINE void horiz_filter_w4_single_row( + const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, + const int dst_stride, int width, int height, const int16_t *x_filter, + const int16x4_t horiz_const, const int16x4_t shift_round_0) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + do { + const uint8_t *s = src_ptr; + + __builtin_prefetch(s); + + uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s0 = vget_low_s16(tt0); + s4 = vget_high_s16(tt0); + + __builtin_prefetch(dst_ptr); + s += 8; + + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8 + s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9 + s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10 + + int16x4_t d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + horiz_const, shift_round_0); + + if (width == 4) { + vst1_s16(dst_ptr, d0); + dst_ptr += dst_stride; + } else if (width == 2) { + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0); + dst_ptr += dst_stride; + } + + src_ptr += src_stride; + height--; + } while (height > 0); +} + +void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + int im_dst_stride; + int width, height; +#if defined(__aarch64__) + uint8x8_t t0; + uint8x8_t t1, t2, t3, t4, t5, t6, t7; + const uint8_t *s; +#endif + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]); + + const int bd = 8; + const int im_h = h + filter_params_y->taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = filter_params_y->taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + int16_t *dst_ptr; + + dst_ptr = im_block; + im_dst_stride = im_stride; + height = im_h; + width = w; + + const int16_t round_bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + int16_t x_filter_tmp[8]; + int16x8_t filter_x_coef = vld1q_s16(x_filter); + + // filter coeffs are even, so downshifting by 1 to reduce intermediate + // precision requirements. 
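+ // Halving the taps makes the kernel sum 64 instead of 128, which keeps the + // biased horizontal sums within int16 range for 8-bit input.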
+ filter_x_coef = vshrq_n_s16(filter_x_coef, 1); + vst1q_s16(&x_filter_tmp[0], filter_x_coef); + + assert(conv_params->round_0 > 0); + + if (w <= 4) { + const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2))); + const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1)); + +#if defined(__aarch64__) + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + do { + assert(height >= 4); + s = src_ptr; + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + __builtin_prefetch(dst_ptr + 0 * im_dst_stride); + __builtin_prefetch(dst_ptr + 1 * im_dst_stride); + __builtin_prefetch(dst_ptr + 2 * im_dst_stride); + __builtin_prefetch(dst_ptr + 3 * im_dst_stride); + s += 7; + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + horiz_const, shift_round_0); + d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + horiz_const, shift_round_0); + d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + horiz_const, shift_round_0); + + transpose_s16_4x4d(&d0, &d1, &d2, &d3); + if (w == 4) { + vst1_s16((dst_ptr + 0 * im_dst_stride), d0); + vst1_s16((dst_ptr + 1 * im_dst_stride), d1); + vst1_s16((dst_ptr + 2 * im_dst_stride), d2); + vst1_s16((dst_ptr + 3 * im_dst_stride), d3); + } else if (w == 2) { + vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride), + vreinterpret_u32_s16(d0), 0); + vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride), + vreinterpret_u32_s16(d1), 0); + vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride), + vreinterpret_u32_s16(d2), 0); + vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride), + vreinterpret_u32_s16(d3), 0); + } + src_ptr += 4 * src_stride; + dst_ptr += 4 * im_dst_stride; + height -= 4; + } while (height >= 4); + + if (height) { + assert(height < 4); + horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w, + height, x_filter_tmp, horiz_const, + shift_round_0); + } +#else + horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w, + height, x_filter_tmp, horiz_const, + shift_round_0); +#endif + + } else { + const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2))); + const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1)); + +#if defined(__aarch64__) + int16_t *d_tmp; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14; + int16x8_t res0, res1, res2, res3, res4, res5, res6, res7; + do { + assert(height >= 8); + __builtin_prefetch(src_ptr + 0 * 
src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + __builtin_prefetch(src_ptr + 4 * src_stride); + __builtin_prefetch(src_ptr + 5 * src_stride); + __builtin_prefetch(src_ptr + 6 * src_stride); + __builtin_prefetch(src_ptr + 7 * src_stride); + + load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + width = w; + s = src_ptr + 7; + d_tmp = dst_ptr; + + __builtin_prefetch(dst_ptr + 0 * im_dst_stride); + __builtin_prefetch(dst_ptr + 1 * im_dst_stride); + __builtin_prefetch(dst_ptr + 2 * im_dst_stride); + __builtin_prefetch(dst_ptr + 3 * im_dst_stride); + __builtin_prefetch(dst_ptr + 4 * im_dst_stride); + __builtin_prefetch(dst_ptr + 5 * im_dst_stride); + __builtin_prefetch(dst_ptr + 6 * im_dst_stride); + __builtin_prefetch(dst_ptr + 7 * im_dst_stride); + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + horiz_const, shift_round_0); + res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + horiz_const, shift_round_0); + res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + horiz_const, shift_round_0); + res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp, + horiz_const, shift_round_0); + res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter_tmp, horiz_const, shift_round_0); + res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter_tmp, horiz_const, shift_round_0); + res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter_tmp, horiz_const, shift_round_0); + + transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6, + &res7); + + store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5, + res6, res7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * im_dst_stride; + height -= 8; + } while (height >= 8); + + if (height >= 4) { + assert(height < 8); + int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, + reg10, reg11, reg12, reg13, reg14; + int16x4_t d0, d1, d2, d3, d4, d5, d6, d7; + int16x8_t out0, out1, out2, out3; + + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + + load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); 
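+ // After the 8x4 transpose, each 4-lane half extracted below holds one input + // column spanning the four rows of this strip.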
+ + reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + reg2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + + s = src_ptr + 7; + d_tmp = dst_ptr; + width = w; + + do { + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, + x_filter_tmp); + + d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, + x_filter_tmp); + + d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, + x_filter_tmp); + + d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10, + x_filter_tmp); + + d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11, + x_filter_tmp); + + d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12, + x_filter_tmp); + + d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13, + x_filter_tmp); + + d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14, + x_filter_tmp); + + transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1, + &out2, &out3); + + out0 = vaddq_s16(out0, horiz_const); + out0 = vqrshlq_s16(out0, shift_round_0); + + out1 = vaddq_s16(out1, horiz_const); + out1 = vqrshlq_s16(out1, shift_round_0); + + out2 = vaddq_s16(out2, horiz_const); + out2 = vqrshlq_s16(out2, shift_round_0); + + out3 = vaddq_s16(out3, horiz_const); + out3 = vqrshlq_s16(out3, shift_round_0); + + store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3); + + reg0 = reg8; + reg1 = reg9; + reg2 = reg10; + reg3 = reg11; + reg4 = reg12; + reg5 = reg13; + reg6 = reg14; + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * im_dst_stride; + height -= 4; + } + + if (height) { + assert(height < 4); + horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w, + height, x_filter_tmp, horiz_const, + shift_round_0); + } +#else + + horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w, + height, x_filter_tmp, horiz_const, + shift_round_0); +#endif + } + + // vertical + { + uint8_t *dst_u8_ptr, *d_u8; + int16_t *v_src_ptr, *v_s; + + const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1)); + const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); + const int32x4_t sub_const_vec = 
vdupq_n_s32(sub_const); + + src_stride = im_stride; + v_src_ptr = im_block; + dst_u8_ptr = dst; + + height = h; + width = w; + + if (width <= 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + uint16x4_t d0; + uint16x8_t dd0; + uint8x8_t d01; + +#if defined(__aarch64__) + int16x4_t s8, s9, s10; + uint16x4_t d1, d2, d3; + uint16x8_t dd1; + uint8x8_t d23; +#endif + + d_u8 = dst_u8_ptr; + v_s = v_src_ptr; + + __builtin_prefetch(v_s + 0 * im_stride); + __builtin_prefetch(v_s + 1 * im_stride); + __builtin_prefetch(v_s + 2 * im_stride); + __builtin_prefetch(v_s + 3 * im_stride); + __builtin_prefetch(v_s + 4 * im_stride); + __builtin_prefetch(v_s + 5 * im_stride); + __builtin_prefetch(v_s + 6 * im_stride); + __builtin_prefetch(v_s + 7 * im_stride); + + load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + v_s += (7 * im_stride); + + do { +#if defined(__aarch64__) + load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10); + v_s += (im_stride << 2); + + __builtin_prefetch(d_u8 + 0 * dst_stride); + __builtin_prefetch(d_u8 + 1 * dst_stride); + __builtin_prefetch(d_u8 + 2 * dst_stride); + __builtin_prefetch(d_u8 + 3 * dst_stride); + + d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_shift_vec, offset_const, + sub_const_vec); + d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + round_shift_vec, offset_const, + sub_const_vec); + d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + round_shift_vec, offset_const, + sub_const_vec); + d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + round_shift_vec, offset_const, + sub_const_vec); + + dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits); + dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits); + + d01 = vqmovn_u16(dd0); + d23 = vqmovn_u16(dd1); + + if ((w == 4) && (h != 2)) { + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01), + 0); // 00 01 02 03 + d_u8 += dst_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01), + 1); // 10 11 12 13 + d_u8 += dst_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23), + 0); // 20 21 22 23 + d_u8 += dst_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23), + 1); // 30 31 32 33 + d_u8 += dst_stride; + } else if ((w == 2) && (h != 2)) { + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01), + 0); // 00 01 + d_u8 += dst_stride; + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01), + 2); // 10 11 + d_u8 += dst_stride; + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23), + 0); // 20 21 + d_u8 += dst_stride; + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23), + 2); // 30 31 + d_u8 += dst_stride; + } else if ((w == 4) && (h == 2)) { + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01), + 0); // 00 01 02 03 + d_u8 += dst_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01), + 1); // 10 11 12 13 + d_u8 += dst_stride; + } else if ((w == 2) && (h == 2)) { + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01), + 0); // 00 01 + d_u8 += dst_stride; + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01), + 2); // 10 11 + d_u8 += dst_stride; + } + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; +#else + s7 = vld1_s16(v_s); + v_s += im_stride; + + __builtin_prefetch(d_u8 + 0 * dst_stride); + + d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_shift_vec, offset_const, + sub_const_vec); + + dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits); + d01 = 
vqmovn_u16(dd0); + + if (w == 4) { + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01), + 0); // 00 01 02 03 + d_u8 += dst_stride; + + } else if (w == 2) { + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01), + 0); // 00 01 + d_u8 += dst_stride; + } + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; +#endif + } while (height > 0); + } else { + // if width is a multiple of 8 & height is a multiple of 4 + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + uint8x8_t res0; +#if defined(__aarch64__) + int16x8_t s8, s9, s10; + uint8x8_t res1, res2, res3; +#endif + + do { + __builtin_prefetch(v_src_ptr + 0 * im_stride); + __builtin_prefetch(v_src_ptr + 1 * im_stride); + __builtin_prefetch(v_src_ptr + 2 * im_stride); + __builtin_prefetch(v_src_ptr + 3 * im_stride); + __builtin_prefetch(v_src_ptr + 4 * im_stride); + __builtin_prefetch(v_src_ptr + 5 * im_stride); + __builtin_prefetch(v_src_ptr + 6 * im_stride); + __builtin_prefetch(v_src_ptr + 7 * im_stride); + + v_s = v_src_ptr; + load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + v_s += (7 * im_stride); + + d_u8 = dst_u8_ptr; + height = h; + + do { +#if defined(__aarch64__) + load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10); + v_s += (im_stride << 2); + + __builtin_prefetch(d_u8 + 4 * dst_stride); + __builtin_prefetch(d_u8 + 5 * dst_stride); + __builtin_prefetch(d_u8 + 6 * dst_stride); + __builtin_prefetch(d_u8 + 7 * dst_stride); + + res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, round_shift_vec, offset_const, + sub_const_vec, vec_round_bits); + res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, round_shift_vec, offset_const, + sub_const_vec, vec_round_bits); + res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, round_shift_vec, offset_const, + sub_const_vec, vec_round_bits); + res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, round_shift_vec, offset_const, + sub_const_vec, vec_round_bits); + + if (h != 2) { + vst1_u8(d_u8, res0); + d_u8 += dst_stride; + vst1_u8(d_u8, res1); + d_u8 += dst_stride; + vst1_u8(d_u8, res2); + d_u8 += dst_stride; + vst1_u8(d_u8, res3); + d_u8 += dst_stride; + } else { + vst1_u8(d_u8, res0); + d_u8 += dst_stride; + vst1_u8(d_u8, res1); + d_u8 += dst_stride; + } + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; +#else + s7 = vld1q_s16(v_s); + v_s += im_stride; + + __builtin_prefetch(d_u8 + 0 * dst_stride); + + res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, round_shift_vec, offset_const, + sub_const_vec, vec_round_bits); + + vst1_u8(d_u8, res0); + d_u8 += dst_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; +#endif + } while (height > 0); + v_src_ptr += 8; + dst_u8_ptr += 8; + w -= 8; + } while (w > 0); + } + } +} +void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + (void)conv_params; + + const uint8_t *src1; + uint8_t *dst1; + int y; + + if (!(w & 0x0F)) { + for (y = 0; y < h; ++y) { + src1 = src; + dst1 = dst; + for (int x = 0; x < (w >> 4); ++x) { + vst1q_u8(dst1, vld1q_u8(src1)); + src1 += 16; + dst1 += 16; + 
} + src += src_stride; + dst += dst_stride; + } + } else if (!(w & 0x07)) { + for (y = 0; y < h; ++y) { + vst1_u8(dst, vld1_u8(src)); + src += src_stride; + dst += dst_stride; + } + } else if (!(w & 0x03)) { + for (y = 0; y < h; ++y) { + vst1_lane_u32((uint32_t *)(dst), vreinterpret_u32_u8(vld1_u8(src)), 0); + src += src_stride; + dst += dst_stride; + } + } else if (!(w & 0x01)) { + for (y = 0; y < h; ++y) { + vst1_lane_u16((uint16_t *)(dst), vreinterpret_u16_u8(vld1_u8(src)), 0); + src += src_stride; + dst += dst_stride; + } + } +} diff --git a/libs/libaom/src/av1/common/arm/convolve_neon.h b/libs/libaom/src/av1/common/arm/convolve_neon.h new file mode 100644 index 000000000..dbcfab631 --- /dev/null +++ b/libs/libaom/src/av1/common/arm/convolve_neon.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ +#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ + +#include <arm_neon.h> + +#define HORIZ_EXTRA_ROWS ((SUBPEL_TAPS + 7) & ~0x07) + +static INLINE uint8x8_t wiener_convolve8_vert_4x8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, int16_t *filter_y, const int bd, + const int round1_bits) { + int16x8_t ss0, ss1, ss2; + int32x4_t sum0, sum1; + uint16x4_t tmp0, tmp1; + uint16x8_t tmp; + uint8x8_t res; + + const int32_t round_const = (1 << (bd + round1_bits - 1)); + const int32x4_t round_bits = vdupq_n_s32(-round1_bits); + const int32x4_t zero = vdupq_n_s32(0); + const int32x4_t round_vec = vdupq_n_s32(round_const); + + ss0 = vaddq_s16(s0, s6); + ss1 = vaddq_s16(s1, s5); + ss2 = vaddq_s16(s2, s4); + + sum0 = vmull_n_s16(vget_low_s16(ss0), filter_y[0]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(ss1), filter_y[1]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(ss2), filter_y[2]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), filter_y[3]); + + sum1 = vmull_n_s16(vget_high_s16(ss0), filter_y[0]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(ss1), filter_y[1]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(ss2), filter_y[2]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), filter_y[3]); + + sum0 = vsubq_s32(sum0, round_vec); + sum1 = vsubq_s32(sum1, round_vec); + + /* right shift & rounding */ + sum0 = vrshlq_s32(sum0, round_bits); + sum1 = vrshlq_s32(sum1, round_bits); + + sum0 = vmaxq_s32(sum0, zero); + sum1 = vmaxq_s32(sum1, zero); + + /* from int32x4_t to uint8x8_t */ + tmp0 = vqmovn_u32(vreinterpretq_u32_s32(sum0)); + tmp1 = vqmovn_u32(vreinterpretq_u32_s32(sum1)); + tmp = vcombine_u16(tmp0, tmp1); + res = vqmovn_u16(tmp); + + return res; +} + +static INLINE uint16x8_t wiener_convolve8_horiz_8x8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, int16_t *filter_x, const int bd, + const int round0_bits) { + int16x8_t sum; + uint16x8_t res; + int32x4_t sum_0, sum_1; + int32x4_t s3_0, s3_1; + const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1)); + const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1; + + /* for the purpose of right shift by { conv_params->round_0 } */ + const int32x4_t round_bits = vdupq_n_s32(-round0_bits); + + const int32x4_t
round_vec_0 = vdupq_n_s32(round_const_0); + const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1); + + sum = vmulq_n_s16(s0, filter_x[0]); + sum = vmlaq_n_s16(sum, s1, filter_x[1]); + sum = vmlaq_n_s16(sum, s2, filter_x[2]); + + /* sum from 16x8 to 2 32x4 registers */ + sum_0 = vmovl_s16(vget_low_s16(sum)); + sum_1 = vmovl_s16(vget_high_s16(sum)); + + /* s[3]*128 -- and filter coeff max can be 128 + * then max value possible = 128*128*255 exceeding 16 bits + */ + + s3_0 = vmull_n_s16(vget_low_s16(s3), filter_x[3]); + s3_1 = vmull_n_s16(vget_high_s16(s3), filter_x[3]); + sum_0 = vaddq_s32(sum_0, s3_0); + sum_1 = vaddq_s32(sum_1, s3_1); + + /* Add the constant value */ + sum_0 = vaddq_s32(sum_0, round_vec_0); + sum_1 = vaddq_s32(sum_1, round_vec_0); + + /* right shift & rounding & saturating */ + sum_0 = vqrshlq_s32(sum_0, round_bits); + sum_1 = vqrshlq_s32(sum_1, round_bits); + + /* Clipping to max value */ + sum_0 = vminq_s32(sum_0, round_vec_1); + sum_1 = vminq_s32(sum_1, round_vec_1); + + res = vcombine_u16(vqmovun_s32(sum_0), vqmovun_s32(sum_1)); + return res; +} + +static INLINE uint16x4_t wiener_convolve8_horiz_4x8( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, int16_t *filter_x, const int bd, + const int round0_bits) { + uint16x4_t res; + int32x4_t sum_0, s3_0; + int16x4_t sum, temp0, temp1, temp2; + + const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1)); + const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1; + const int32x4_t round_bits = vdupq_n_s32(-round0_bits); + const int32x4_t zero = vdupq_n_s32(0); + const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0); + const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1); + + temp0 = vadd_s16(s0, s6); + temp1 = vadd_s16(s1, s5); + temp2 = vadd_s16(s2, s4); + + sum = vmul_n_s16(temp0, filter_x[0]); + sum = vmla_n_s16(sum, temp1, filter_x[1]); + sum = vmla_n_s16(sum, temp2, filter_x[2]); + sum_0 = vmovl_s16(sum); + + /* s[3]*128 -- and filter coeff max can be 128. + * then max value possible = 128*128*255. Therefore, 32 bits are required to + * hold the result.
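+ * (Hence the s3 product below is formed with the widening vmull_n_s16.)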
+ */ + s3_0 = vmull_n_s16(s3, filter_x[3]); + sum_0 = vaddq_s32(sum_0, s3_0); + + sum_0 = vaddq_s32(sum_0, round_vec_0); + sum_0 = vrshlq_s32(sum_0, round_bits); + + sum_0 = vmaxq_s32(sum_0, zero); + sum_0 = vminq_s32(sum_0, round_vec_1); + res = vqmovun_s32(sum_0); + return res; +} + +static INLINE int16x8_t +convolve8_8x8_s16(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16_t *filter, + const int16x8_t horiz_const, const int16x8_t shift_round_0) { + int16x8_t sum; + int16x8_t res; + + sum = horiz_const; + sum = vmlaq_n_s16(sum, s0, filter[0]); + sum = vmlaq_n_s16(sum, s1, filter[1]); + sum = vmlaq_n_s16(sum, s2, filter[2]); + sum = vmlaq_n_s16(sum, s3, filter[3]); + sum = vmlaq_n_s16(sum, s4, filter[4]); + sum = vmlaq_n_s16(sum, s5, filter[5]); + sum = vmlaq_n_s16(sum, s6, filter[6]); + sum = vmlaq_n_s16(sum, s7, filter[7]); + + res = vqrshlq_s16(sum, shift_round_0); + + return res; +} + +static INLINE int16x4_t +convolve8_4x4_s16(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16_t *filter, + const int16x4_t horiz_const, const int16x4_t shift_round_0) { + int16x4_t sum; + sum = horiz_const; + sum = vmla_n_s16(sum, s0, filter[0]); + sum = vmla_n_s16(sum, s1, filter[1]); + sum = vmla_n_s16(sum, s2, filter[2]); + sum = vmla_n_s16(sum, s3, filter[3]); + sum = vmla_n_s16(sum, s4, filter[4]); + sum = vmla_n_s16(sum, s5, filter[5]); + sum = vmla_n_s16(sum, s6, filter[6]); + sum = vmla_n_s16(sum, s7, filter[7]); + + sum = vqrshl_s16(sum, shift_round_0); + + return sum; +} + +static INLINE uint16x4_t convolve8_4x4_s32( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter, + const int32x4_t round_shift_vec, const int32x4_t offset_const) { + int32x4_t sum0; + uint16x4_t res; + const int32x4_t zero = vdupq_n_s32(0); + + sum0 = vmull_n_s16(s0, y_filter[0]); + sum0 = vmlal_n_s16(sum0, s1, y_filter[1]); + sum0 = vmlal_n_s16(sum0, s2, y_filter[2]); + sum0 = vmlal_n_s16(sum0, s3, y_filter[3]); + sum0 = vmlal_n_s16(sum0, s4, y_filter[4]); + sum0 = vmlal_n_s16(sum0, s5, y_filter[5]); + sum0 = vmlal_n_s16(sum0, s6, y_filter[6]); + sum0 = vmlal_n_s16(sum0, s7, y_filter[7]); + + sum0 = vaddq_s32(sum0, offset_const); + sum0 = vqrshlq_s32(sum0, round_shift_vec); + sum0 = vmaxq_s32(sum0, zero); + res = vmovn_u32(vreinterpretq_u32_s32(sum0)); + + return res; +} + +#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ diff --git a/libs/libaom/src/av1/common/arm/jnt_convolve_neon.c b/libs/libaom/src/av1/common/arm/jnt_convolve_neon.c new file mode 100644 index 000000000..92112fb85 --- /dev/null +++ b/libs/libaom/src/av1/common/arm/jnt_convolve_neon.c @@ -0,0 +1,1739 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+#if !defined(__aarch64__)
+static INLINE void compute_avg_4x1(
+    uint16x4_t res0, uint16x4_t d0, const uint16_t fwd_offset,
+    const uint16_t bck_offset, const int16x4_t sub_const_vec,
+    const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) {
+  int16x4_t tmp0;
+  uint16x4_t tmp_u0;
+  uint32x4_t sum0;
+  int32x4_t dst0;
+  int16x8_t tmp4;
+
+  if (use_dist_wtd_comp_avg) {
+    const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
+
+    sum0 = vmull_n_u16(res0, fwd_offset);
+    sum0 = vmlal_n_u16(sum0, d0, bck_offset);
+
+    sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+
+    dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), vmovl_s16(sub_const_vec));
+
+    dst0 = vqrshlq_s32(dst0, round_bits_vec);
+
+    tmp0 = vqmovn_s32(dst0);
+    tmp4 = vcombine_s16(tmp0, tmp0);
+
+    *t0 = vqmovun_s16(tmp4);
+  } else {
+    const int16x4_t round_bits_vec = vdup_n_s16(-round_bits);
+    tmp_u0 = vhadd_u16(res0, d0);
+
+    tmp0 = vsub_s16(vreinterpret_s16_u16(tmp_u0), sub_const_vec);
+
+    tmp0 = vqrshl_s16(tmp0, round_bits_vec);
+
+    tmp4 = vcombine_s16(tmp0, tmp0);
+
+    *t0 = vqmovun_s16(tmp4);
+  }
+}
+
+static INLINE void compute_avg_8x1(
+    uint16x8_t res0, uint16x8_t d0, const uint16_t fwd_offset,
+    const uint16_t bck_offset, const int16x4_t sub_const,
+    const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) {
+  int16x4_t tmp0, tmp2;
+  int16x8_t f0;
+  uint32x4_t sum0, sum2;
+  int32x4_t dst0, dst2;
+
+  uint16x8_t tmp_u0;
+
+  if (use_dist_wtd_comp_avg) {
+    const int32x4_t sub_const_vec = vmovl_s16(sub_const);
+    const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits);
+
+    sum0 = vmull_n_u16(vget_low_u16(res0), fwd_offset);
+    sum0 = vmlal_n_u16(sum0, vget_low_u16(d0), bck_offset);
+    sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+
+    sum2 = vmull_n_u16(vget_high_u16(res0), fwd_offset);
+    sum2 = vmlal_n_u16(sum2, vget_high_u16(d0), bck_offset);
+    sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS);
+
+    dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), sub_const_vec);
+    dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), sub_const_vec);
+
+    dst0 = vqrshlq_s32(dst0, round_bits_vec);
+    dst2 = vqrshlq_s32(dst2, round_bits_vec);
+
+    tmp0 = vqmovn_s32(dst0);
+    tmp2 = vqmovn_s32(dst2);
+
+    f0 = vcombine_s16(tmp0, tmp2);
+
+    *t0 = vqmovun_s16(f0);
+
+  } else {
+    const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const);
+    const int16x8_t round_bits_vec = vdupq_n_s16(-round_bits);
+
+    tmp_u0 = vhaddq_u16(res0, d0);
+
+    f0 = vsubq_s16(vreinterpretq_s16_u16(tmp_u0), sub_const_vec);
+
+    f0 = vqrshlq_s16(f0, round_bits_vec);
+
+    *t0 = vqmovun_s16(f0);
+  }
+}
+#endif  // !defined(__aarch64__)
+
+static INLINE void compute_avg_4x4(
+    uint16x4_t res0, uint16x4_t res1, uint16x4_t res2, uint16x4_t res3,
+    uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
+    const uint16_t fwd_offset, const uint16_t bck_offset,
+    const int16x4_t sub_const_vec, const int16_t round_bits,
+    const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
+  int16x4_t tmp0, tmp1, tmp2, tmp3;
+  uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
+  uint32x4_t sum0, sum1, sum2, sum3;
+
+  int32x4_t dst0, dst1, dst2, dst3;
+  int16x8_t tmp4, tmp5;
+  const int16x8_t zero = vdupq_n_s16(0);
+
+  if (use_dist_wtd_comp_avg) {
+    const 
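+    // Distance-weighted compound average: blend the two intermediate
+    // predictions as (res * fwd_offset + d * bck_offset) >>
+    // DIST_PRECISION_BITS, then subtract the compound round offset and
+    // shift the result down into the output range.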
int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits)); + const int32x4_t const_vec = vmovl_s16(sub_const_vec); + + sum0 = vmull_n_u16(res0, fwd_offset); + sum0 = vmlal_n_u16(sum0, d0, bck_offset); + sum1 = vmull_n_u16(res1, fwd_offset); + sum1 = vmlal_n_u16(sum1, d1, bck_offset); + sum2 = vmull_n_u16(res2, fwd_offset); + sum2 = vmlal_n_u16(sum2, d2, bck_offset); + sum3 = vmull_n_u16(res3, fwd_offset); + sum3 = vmlal_n_u16(sum3, d3, bck_offset); + + sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS); + sum1 = vshrq_n_u32(sum1, DIST_PRECISION_BITS); + sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS); + sum3 = vshrq_n_u32(sum3, DIST_PRECISION_BITS); + + dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), const_vec); + dst1 = vsubq_s32(vreinterpretq_s32_u32(sum1), const_vec); + dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), const_vec); + dst3 = vsubq_s32(vreinterpretq_s32_u32(sum3), const_vec); + + dst0 = vqrshlq_s32(dst0, round_bits_vec); + dst1 = vqrshlq_s32(dst1, round_bits_vec); + dst2 = vqrshlq_s32(dst2, round_bits_vec); + dst3 = vqrshlq_s32(dst3, round_bits_vec); + + tmp0 = vqmovn_s32(dst0); + tmp1 = vqmovn_s32(dst1); + tmp2 = vqmovn_s32(dst2); + tmp3 = vqmovn_s32(dst3); + tmp4 = vcombine_s16(tmp0, tmp1); + tmp5 = vcombine_s16(tmp2, tmp3); + tmp4 = vmaxq_s16(tmp4, zero); + tmp5 = vmaxq_s16(tmp5, zero); + + *t0 = vqmovn_u16(vreinterpretq_u16_s16(tmp4)); + *t1 = vqmovn_u16(vreinterpretq_u16_s16(tmp5)); + } else { + const int16x4_t round_bits_vec = vdup_n_s16(-round_bits); + tmp_u0 = vhadd_u16(res0, d0); + tmp_u1 = vhadd_u16(res1, d1); + tmp_u2 = vhadd_u16(res2, d2); + tmp_u3 = vhadd_u16(res3, d3); + + tmp0 = vsub_s16(vreinterpret_s16_u16(tmp_u0), sub_const_vec); + tmp1 = vsub_s16(vreinterpret_s16_u16(tmp_u1), sub_const_vec); + tmp2 = vsub_s16(vreinterpret_s16_u16(tmp_u2), sub_const_vec); + tmp3 = vsub_s16(vreinterpret_s16_u16(tmp_u3), sub_const_vec); + + tmp0 = vqrshl_s16(tmp0, round_bits_vec); + tmp1 = vqrshl_s16(tmp1, round_bits_vec); + tmp2 = vqrshl_s16(tmp2, round_bits_vec); + tmp3 = vqrshl_s16(tmp3, round_bits_vec); + + tmp4 = vcombine_s16(tmp0, tmp1); + tmp5 = vcombine_s16(tmp2, tmp3); + tmp4 = vmaxq_s16(tmp4, zero); + tmp5 = vmaxq_s16(tmp5, zero); + + *t0 = vqmovn_u16(vreinterpretq_u16_s16(tmp4)); + *t1 = vqmovn_u16(vreinterpretq_u16_s16(tmp5)); + } +} + +static INLINE void compute_avg_8x4( + uint16x8_t res0, uint16x8_t res1, uint16x8_t res2, uint16x8_t res3, + uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3, + const uint16_t fwd_offset, const uint16_t bck_offset, + const int16x4_t sub_const, const int16_t round_bits, + const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1, + uint8x8_t *t2, uint8x8_t *t3) { + int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int16x8_t f0, f1, f2, f3; + uint32x4_t sum0, sum1, sum2, sum3; + uint32x4_t sum4, sum5, sum6, sum7; + int32x4_t dst0, dst1, dst2, dst3; + int32x4_t dst4, dst5, dst6, dst7; + uint16x8_t tmp_u0, tmp_u1, tmp_u2, tmp_u3; + const int16x8_t zero = vdupq_n_s16(0); + + if (use_dist_wtd_comp_avg) { + const int32x4_t sub_const_vec = vmovl_s16(sub_const); + const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits); + + sum0 = vmull_n_u16(vget_low_u16(res0), fwd_offset); + sum0 = vmlal_n_u16(sum0, vget_low_u16(d0), bck_offset); + sum1 = vmull_n_u16(vget_low_u16(res1), fwd_offset); + sum1 = vmlal_n_u16(sum1, vget_low_u16(d1), bck_offset); + sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS); + sum1 = vshrq_n_u32(sum1, DIST_PRECISION_BITS); + + sum2 = vmull_n_u16(vget_high_u16(res0), fwd_offset); + sum2 = 
vmlal_n_u16(sum2, vget_high_u16(d0), bck_offset); + sum3 = vmull_n_u16(vget_high_u16(res1), fwd_offset); + sum3 = vmlal_n_u16(sum3, vget_high_u16(d1), bck_offset); + sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS); + sum3 = vshrq_n_u32(sum3, DIST_PRECISION_BITS); + + sum4 = vmull_n_u16(vget_low_u16(res2), fwd_offset); + sum4 = vmlal_n_u16(sum4, vget_low_u16(d2), bck_offset); + sum5 = vmull_n_u16(vget_low_u16(res3), fwd_offset); + sum5 = vmlal_n_u16(sum5, vget_low_u16(d3), bck_offset); + sum4 = vshrq_n_u32(sum4, DIST_PRECISION_BITS); + sum5 = vshrq_n_u32(sum5, DIST_PRECISION_BITS); + + sum6 = vmull_n_u16(vget_high_u16(res2), fwd_offset); + sum6 = vmlal_n_u16(sum6, vget_high_u16(d2), bck_offset); + sum7 = vmull_n_u16(vget_high_u16(res3), fwd_offset); + sum7 = vmlal_n_u16(sum7, vget_high_u16(d3), bck_offset); + sum6 = vshrq_n_u32(sum6, DIST_PRECISION_BITS); + sum7 = vshrq_n_u32(sum7, DIST_PRECISION_BITS); + + dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), sub_const_vec); + dst1 = vsubq_s32(vreinterpretq_s32_u32(sum1), sub_const_vec); + dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), sub_const_vec); + dst3 = vsubq_s32(vreinterpretq_s32_u32(sum3), sub_const_vec); + dst4 = vsubq_s32(vreinterpretq_s32_u32(sum4), sub_const_vec); + dst5 = vsubq_s32(vreinterpretq_s32_u32(sum5), sub_const_vec); + dst6 = vsubq_s32(vreinterpretq_s32_u32(sum6), sub_const_vec); + dst7 = vsubq_s32(vreinterpretq_s32_u32(sum7), sub_const_vec); + + dst0 = vqrshlq_s32(dst0, round_bits_vec); + dst1 = vqrshlq_s32(dst1, round_bits_vec); + dst2 = vqrshlq_s32(dst2, round_bits_vec); + dst3 = vqrshlq_s32(dst3, round_bits_vec); + dst4 = vqrshlq_s32(dst4, round_bits_vec); + dst5 = vqrshlq_s32(dst5, round_bits_vec); + dst6 = vqrshlq_s32(dst6, round_bits_vec); + dst7 = vqrshlq_s32(dst7, round_bits_vec); + + tmp0 = vqmovn_s32(dst0); + tmp1 = vqmovn_s32(dst1); + tmp2 = vqmovn_s32(dst2); + tmp3 = vqmovn_s32(dst3); + tmp4 = vqmovn_s32(dst4); + tmp5 = vqmovn_s32(dst5); + tmp6 = vqmovn_s32(dst6); + tmp7 = vqmovn_s32(dst7); + + f0 = vcombine_s16(tmp0, tmp2); + f1 = vcombine_s16(tmp1, tmp3); + f2 = vcombine_s16(tmp4, tmp6); + f3 = vcombine_s16(tmp5, tmp7); + + f0 = vmaxq_s16(f0, zero); + f1 = vmaxq_s16(f1, zero); + f2 = vmaxq_s16(f2, zero); + f3 = vmaxq_s16(f3, zero); + + *t0 = vqmovn_u16(vreinterpretq_u16_s16(f0)); + *t1 = vqmovn_u16(vreinterpretq_u16_s16(f1)); + *t2 = vqmovn_u16(vreinterpretq_u16_s16(f2)); + *t3 = vqmovn_u16(vreinterpretq_u16_s16(f3)); + + } else { + const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const); + const int16x8_t round_bits_vec = vdupq_n_s16(-round_bits); + + tmp_u0 = vhaddq_u16(res0, d0); + tmp_u1 = vhaddq_u16(res1, d1); + tmp_u2 = vhaddq_u16(res2, d2); + tmp_u3 = vhaddq_u16(res3, d3); + + f0 = vsubq_s16(vreinterpretq_s16_u16(tmp_u0), sub_const_vec); + f1 = vsubq_s16(vreinterpretq_s16_u16(tmp_u1), sub_const_vec); + f2 = vsubq_s16(vreinterpretq_s16_u16(tmp_u2), sub_const_vec); + f3 = vsubq_s16(vreinterpretq_s16_u16(tmp_u3), sub_const_vec); + + f0 = vqrshlq_s16(f0, round_bits_vec); + f1 = vqrshlq_s16(f1, round_bits_vec); + f2 = vqrshlq_s16(f2, round_bits_vec); + f3 = vqrshlq_s16(f3, round_bits_vec); + + f0 = vmaxq_s16(f0, zero); + f1 = vmaxq_s16(f1, zero); + f2 = vmaxq_s16(f2, zero); + f3 = vmaxq_s16(f3, zero); + + *t0 = vqmovn_u16(vreinterpretq_u16_s16(f0)); + *t1 = vqmovn_u16(vreinterpretq_u16_s16(f1)); + *t2 = vqmovn_u16(vreinterpretq_u16_s16(f2)); + *t3 = vqmovn_u16(vreinterpretq_u16_s16(f3)); + } +} + +static INLINE void dist_wtd_convolve_2d_horiz_neon( + const uint8_t *src, int src_stride, int16_t 
*im_block, const int im_stride, + int16_t *x_filter_tmp, const int im_h, int w, const int round_0) { + const int bd = 8; + const uint8_t *s; + int16_t *dst_ptr; + int dst_stride; + int width, height; + + dst_ptr = im_block; + dst_stride = im_stride; + height = im_h; + width = w; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; + int16x8_t tt0; + uint8x8_t t0; + + const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2))); + const int16x4_t shift_round_0 = vdup_n_s16(-(round_0)); + +#if defined(__aarch64__) + int16x4_t s8, s9, s10, d1, d2, d3; + int16x8_t tt1, tt2, tt3; + uint8x8_t t1, t2, t3; +#endif + do { + s = src; + __builtin_prefetch(s + 0 * src_stride); +#if defined(__aarch64__) + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s0 = vget_low_s16(tt0); + s1 = vget_low_s16(tt1); + s2 = vget_low_s16(tt2); + s3 = vget_low_s16(tt3); + s4 = vget_high_s16(tt0); + s5 = vget_high_s16(tt1); + s6 = vget_high_s16(tt2); + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + s += 7; + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s7 = vget_low_s16(tt0); + s8 = vget_low_s16(tt1); + s9 = vget_low_s16(tt2); + s10 = vget_low_s16(tt3); + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + horiz_const, shift_round_0); + d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + horiz_const, shift_round_0); + d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + horiz_const, shift_round_0); + + transpose_s16_4x4d(&d0, &d1, &d2, &d3); + + vst1_s16((dst_ptr + 0 * dst_stride), d0); + vst1_s16((dst_ptr + 1 * dst_stride), d1); + vst1_s16((dst_ptr + 2 * dst_stride), d2); + vst1_s16((dst_ptr + 3 * dst_stride), d3); + + src += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; +#else + t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + s0 = vget_low_s16(tt0); // a0 a1 a2 a3 + s4 = vget_high_s16(tt0); // a4 a5 a6 a7 + __builtin_prefetch(dst_ptr); + s += 8; + t0 = vld1_u8(s); // a8 a9 a10 a11 + + // a8 a9 a10 a11 + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8 + s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9 + s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10 + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + + vst1_s16(dst_ptr, d0); + + src += src_stride; + dst_ptr += dst_stride; + height -= 1; +#endif + } while (height > 0); + } else { + int16_t *d_tmp; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t res0; + uint8x8_t t0; + + 
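+    // Wide blocks: produce eight output columns per pass. horiz_const
+    // pre-loads the accumulator with the first-pass rounding offset; the
+    // exponent is FILTER_BITS - 2 rather than FILTER_BITS - 1 because the
+    // caller halved the filter taps.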
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2))); + const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0)); + do { +#if defined(__aarch64__) + uint8x8_t t1, t2, t3, t4, t5, t6, t7; + int16x8_t s8, s9, s10, s11, s12, s13, s14; + int16x8_t res1, res2, res3, res4, res5, res6, res7; + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + width = w; + s = src + 7; + d_tmp = dst_ptr; + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + __builtin_prefetch(dst_ptr + 4 * dst_stride); + __builtin_prefetch(dst_ptr + 5 * dst_stride); + __builtin_prefetch(dst_ptr + 6 * dst_stride); + __builtin_prefetch(dst_ptr + 7 * dst_stride); + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + horiz_const, shift_round_0); + res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + horiz_const, shift_round_0); + res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + horiz_const, shift_round_0); + res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp, + horiz_const, shift_round_0); + res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter_tmp, horiz_const, shift_round_0); + res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter_tmp, horiz_const, shift_round_0); + res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter_tmp, horiz_const, shift_round_0); + + transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6, + &res7); + + store_s16_8x8(d_tmp, dst_stride, res0, res1, res2, res3, res4, res5, + res6, res7); + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst_ptr += 8 * dst_stride; + height -= 8; +#else + int16x8_t temp_0; + t0 = vld1_u8(src); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + + width = w; + s = src + 8; + d_tmp = dst_ptr; + __builtin_prefetch(dst_ptr); + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = 
vreinterpretq_s16_u16(vmovl_u8(t0)); + temp_0 = s0; + s0 = s7; + + s1 = vextq_s16(temp_0, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(temp_0, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(temp_0, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(temp_0, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + s5 = vextq_s16(temp_0, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7, + x_filter_tmp, horiz_const, shift_round_0); + vst1q_s16(d_tmp, res0); + + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src += src_stride; + dst_ptr += dst_stride; + height -= 1; +#endif + } while (height > 0); + } +} + +static INLINE void dist_wtd_convolve_2d_vert_neon( + int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride, + ConvolveParams *conv_params, const int16_t *y_filter, int h, int w) { + uint8_t *dst_u8_ptr, *d_u8; + CONV_BUF_TYPE *dst_ptr, *dst; + int16_t *src_ptr, *s; + uint16_t *d; + + const int bd = 8; + int height; + int dst_stride = conv_params->dst_stride; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int16_t sub_const = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + + const int16_t round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = bd + 2 * FILTER_BITS - conv_params->round_0; + const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1)); + const int32x4_t offset_const = vdupq_n_s32(1 << offset); + const int16x4_t sub_const_vec = vdup_n_s16(sub_const); + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + uint16x4_t res4, d0; + uint8x8_t t0; + +#if defined(__aarch64__) + int16x4_t s8, s9, s10; + uint16x4_t res5, res6, res7, d1, d2, d3; + uint8x8_t t1; +#endif + + dst = conv_params->dst; + src_ptr = im_block; + dst_u8_ptr = dst8; + dst_ptr = dst; + height = h; + + do { + d = dst_ptr; + d_u8 = dst_u8_ptr; + s = src_ptr; + height = h; + + __builtin_prefetch(s + 0 * im_stride); + __builtin_prefetch(s + 1 * im_stride); + __builtin_prefetch(s + 2 * im_stride); + __builtin_prefetch(s + 3 * im_stride); + __builtin_prefetch(s + 4 * im_stride); + __builtin_prefetch(s + 5 * im_stride); + __builtin_prefetch(s + 6 * im_stride); + __builtin_prefetch(s + 7 * im_stride); + + load_s16_4x8(s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + s += (7 * im_stride); + + do { +#if defined(__aarch64__) + load_s16_4x4(s, im_stride, &s7, &s8, &s9, &s10); + s += (im_stride << 2); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + __builtin_prefetch(d_u8 + 4 * dst8_stride); + __builtin_prefetch(d_u8 + 5 * dst8_stride); + __builtin_prefetch(d_u8 + 6 * dst8_stride); + __builtin_prefetch(d_u8 + 7 * dst8_stride); + + d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_shift_vec, offset_const); + d1 = convolve8_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + round_shift_vec, offset_const); + d2 = convolve8_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + round_shift_vec, offset_const); + d3 = 
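+        // d0..d3 are four rows of second-pass (vertical) output kept in
+        // the unsigned 16-bit compound domain: offset_const keeps the sums
+        // non-negative and round_shift_vec applies round_1.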
convolve8_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + round_shift_vec, offset_const); + + if (do_average) { + load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7); + d += (dst_stride << 2); + + compute_avg_4x4(res4, res5, res6, res7, d0, d1, d2, d3, fwd_offset, + bck_offset, sub_const_vec, round_bits, + use_dist_wtd_comp_avg, &t0, &t1); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1); + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0); + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1); + d_u8 += dst8_stride; + + } else { + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + d += (dst_stride << 2); + } + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; +#else + s7 = vld1_s16(s); + s += (im_stride); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d_u8 + 0 * dst8_stride); + + d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_shift_vec, offset_const); + + if (do_average) { + res4 = vld1_u16(d); + d += (dst_stride); + + compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec, + round_bits, use_dist_wtd_comp_avg, &t0); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); + d_u8 += dst8_stride; + + } else { + vst1_u16(d, d0); + d += (dst_stride); + } + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height--; +#endif + } while (height > 0); + src_ptr += 4; + dst_ptr += 4; + dst_u8_ptr += 4; + w -= 4; + } while (w > 0); +} + +void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + assert(!(w % 4)); + assert(!(h % 4)); + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]); + + const int im_h = h + filter_params_y->taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = filter_params_y->taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const int round_0 = conv_params->round_0 - 1; + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + int16_t x_filter_tmp[8]; + int16x8_t filter_x_coef = vld1q_s16(x_filter); + + // filter coeffs are even, so downshifting by 1 to reduce intermediate + // precision requirements. 
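+  // The 8-tap kernels sum to 128 (1 << FILTER_BITS) and every tap is even,
+  // so halving them loses no precision while freeing one bit of headroom in
+  // the 16-bit intermediates; round_0 above is reduced by one to match.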
+ filter_x_coef = vshrq_n_s16(filter_x_coef, 1); + vst1q_s16(&x_filter_tmp[0], filter_x_coef); + + dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride, + x_filter_tmp, im_h, w, round_0); + + dist_wtd_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride, + conv_params, y_filter, h, w); +} + +void av1_dist_wtd_convolve_2d_copy_neon( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { + uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2, + tmp_shift3; + uint16x8_t res_q0, res_q1, res_q2, res_q3, tmp_q0, tmp_q1, tmp_q2, tmp_q3; + uint16x4_t tmp4, tmp5, tmp6, tmp7, res4, res5, res6, res7; + const uint8_t *src1, *src2; + uint8_t *dst8_1; + CONV_BUF_TYPE *dst = conv_params->dst, *dst_1, *dst_2; + const int dst_stride = conv_params->dst_stride; + int x, y; + const int16_t bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int16x4_t sub_const_vec = vdup_n_s16((int16_t)round_offset); + const uint16x8_t dup_round_offset16x8 = vdupq_n_u16((uint16_t)round_offset); + const int16x4_t dup_bits16x4 = vdup_n_s16(bits); + const int16x8_t dup_bits16x8 = vdupq_n_s16(bits); + + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + + if (!(w & 0x07)) { + for (y = 0; y < (h >> 2); ++y) { + src1 = src; + dst8_1 = dst8; + dst_1 = dst; + for (x = 0; x < (w >> 3); ++x) { + src2 = src1; + load_u8_8x4(src2, src_stride, &res0_8, &res1_8, &res2_8, &res3_8); + + res_q0 = vaddq_u16(vshlq_u16(vmovl_u8(res0_8), dup_bits16x8), + dup_round_offset16x8); + res_q1 = vaddq_u16(vshlq_u16(vmovl_u8(res1_8), dup_bits16x8), + dup_round_offset16x8); + res_q2 = vaddq_u16(vshlq_u16(vmovl_u8(res2_8), dup_bits16x8), + dup_round_offset16x8); + res_q3 = vaddq_u16(vshlq_u16(vmovl_u8(res3_8), dup_bits16x8), + dup_round_offset16x8); + + if (conv_params->do_average) { + dst_2 = dst_1; + load_u16_8x4(dst_2, dst_stride, &tmp_q0, &tmp_q1, &tmp_q2, &tmp_q3); + + compute_avg_8x4(tmp_q0, tmp_q1, tmp_q2, tmp_q3, res_q0, res_q1, + res_q2, res_q3, conv_params->fwd_offset, + conv_params->bck_offset, sub_const_vec, bits, + conv_params->use_dist_wtd_comp_avg, &tmp_shift0, + &tmp_shift1, &tmp_shift2, &tmp_shift3); + + vst1_u8(dst8_1 + (0 * dst8_stride), tmp_shift0); + vst1_u8(dst8_1 + (1 * dst8_stride), tmp_shift1); + vst1_u8(dst8_1 + (2 * dst8_stride), tmp_shift2); + vst1_u8(dst8_1 + (3 * dst8_stride), tmp_shift3); + + } else { + vst1q_u16(dst_1 + (0 * dst_stride), res_q0); + vst1q_u16(dst_1 + (1 * dst_stride), res_q1); + vst1q_u16(dst_1 + (2 * dst_stride), res_q2); + vst1q_u16(dst_1 + (3 * dst_stride), res_q3); + } + src1 = src1 + 8; + dst_1 = dst_1 + 8; + dst8_1 = dst8_1 + 8; + } + src += src_stride * 4; + dst8 += dst8_stride * 4; + dst += dst_stride * 4; + } + } else if (!(w & 0x03)) { + for (y = 0; y < (h >> 2); ++y) { + src1 = src; + dst8_1 = dst8; + dst_1 = dst; + + load_u8_8x4(src1, src_stride, &res0_8, &res1_8, &res2_8, &res3_8); + + res4 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res0_8)), dup_bits16x4), + vreinterpret_u16_s16(sub_const_vec)); + res5 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res1_8)), dup_bits16x4), + 
vreinterpret_u16_s16(sub_const_vec)); + res6 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res2_8)), dup_bits16x4), + vreinterpret_u16_s16(sub_const_vec)); + res7 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res3_8)), dup_bits16x4), + vreinterpret_u16_s16(sub_const_vec)); + if (conv_params->do_average) { + load_u16_4x4(dst_1, dst_stride, &tmp4, &tmp5, &tmp6, &tmp7); + + compute_avg_4x4(tmp4, tmp5, tmp6, tmp7, res4, res5, res6, res7, + conv_params->fwd_offset, conv_params->bck_offset, + sub_const_vec, bits, conv_params->use_dist_wtd_comp_avg, + &tmp_shift0, &tmp_shift1); + + vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift0), 0); + dst8_1 += dst8_stride; + vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift0), 1); + dst8_1 += dst8_stride; + vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift1), 0); + dst8_1 += dst8_stride; + vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift1), 1); + + } else { + vst1_u16(dst_1, res4); + dst_1 += dst_stride; + vst1_u16(dst_1, res5); + dst_1 += dst_stride; + vst1_u16(dst_1, res6); + dst_1 += dst_stride; + vst1_u16(dst_1, res7); + } + src += src_stride * 4; + dst += dst_stride * 4; + dst8 += dst8_stride * 4; + } + } +} + +void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + assert(!(w % 4)); + assert(!(h % 4)); + + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_1; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + (void)filter_params_y; + (void)subpel_y_qn; + + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + const uint8_t *src_ptr = src - horiz_offset; + + int16_t x_filter_tmp[8]; + int16x8_t filter_x_coef = vld1q_s16(x_filter); + + // filter coeffs are even, so downshifting by 1 to reduce intermediate + // precision requirements. 
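+  // Same trick as in the 2-D path: the taps are even, so the >> 1 is exact,
+  // and shift_round_0 below shifts by one bit less to compensate.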
+ filter_x_coef = vshrq_n_s16(filter_x_coef, 1); + vst1q_s16(&x_filter_tmp[0], filter_x_coef); + + const uint8_t *s; + uint8_t *d_u8; + uint8_t *dst_u8_ptr; + CONV_BUF_TYPE *d, *dst_ptr; + int width, height; + uint8x8_t t0; +#if defined(__aarch64__) + uint8x8_t t1, t2, t3, t4, t5, t6, t7; +#endif + s = src_ptr; + dst_ptr = dst; + dst_u8_ptr = dst8; + width = w; + height = h; + + if ((w == 4) || (h == 4)) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; + int16x8_t tt0; + uint16x4_t res4; +#if defined(__aarch64__) + int16x4_t s8, s9, s10, d1, d2, d3; + int16x8_t tt1, tt2, tt3; + uint16x4_t res5, res6, res7; + uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0); + int16x8_t u0, u1; +#else + int16x4_t temp_0; +#endif + const int16x4_t zero = vdup_n_s16(0); + const int16x4_t round_offset_vec = vdup_n_s16(round_offset); + const int16x4_t shift_round_0 = vdup_n_s16(-conv_params->round_0 + 1); + const int16x4_t horiz_const = vdup_n_s16(bits); + do { + s = src_ptr; + d = dst_ptr; + d_u8 = dst_u8_ptr; + width = w; + __builtin_prefetch(s + 0 * src_stride); +#if defined(__aarch64__) + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s0 = vget_low_s16(tt0); + s1 = vget_low_s16(tt1); + s2 = vget_low_s16(tt2); + s3 = vget_low_s16(tt3); + s4 = vget_high_s16(tt0); + s5 = vget_high_s16(tt1); + s6 = vget_high_s16(tt2); + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + s += 7; + do { + load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1); + t0 = vreinterpret_u8_u32(tu0); + t1 = vreinterpret_u8_u32(tu1); + + transpose_u8_4x4(&t0, &t1); + u0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + u1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + + s7 = vget_low_s16(u0); + s8 = vget_low_s16(u1); + s9 = vget_high_s16(u0); + s10 = vget_high_s16(u1); + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + zero, shift_round_0); + d0 = vrshl_s16(d0, horiz_const); + d0 = vadd_s16(d0, round_offset_vec); + d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + zero, shift_round_0); + d1 = vrshl_s16(d1, horiz_const); + d1 = vadd_s16(d1, round_offset_vec); + d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + zero, shift_round_0); + d2 = vrshl_s16(d2, horiz_const); + d2 = vadd_s16(d2, round_offset_vec); + d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + zero, shift_round_0); + d3 = vrshl_s16(d3, horiz_const); + d3 = vadd_s16(d3, round_offset_vec); + + transpose_s16_4x4d(&d0, &d1, &d2, &d3); + + if (conv_params->do_average) { + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + __builtin_prefetch(d_u8 + 0 * dst8_stride); + __builtin_prefetch(d_u8 + 1 * dst8_stride); + __builtin_prefetch(d_u8 + 2 * dst8_stride); + __builtin_prefetch(d_u8 + 3 * dst8_stride); + + load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7); + + compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0), + vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), + vreinterpret_u16_s16(d3), fwd_offset, bck_offset, + 
round_offset_vec, round_bits, use_dist_wtd_comp_avg, + &t0, &t1); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + vst1_lane_u32((uint32_t *)(d_u8 + dst8_stride), + vreinterpret_u32_u8(t0), + 1); // 10 11 12 13 + vst1_lane_u32((uint32_t *)(d_u8 + 2 * dst8_stride), + vreinterpret_u32_u8(t1), + 0); // 20 21 22 23 + vst1_lane_u32((uint32_t *)(d_u8 + 3 * dst8_stride), + vreinterpret_u32_u8(t1), + 1); // 30 31 32 33 + } else { + store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0), + vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), + vreinterpret_u16_s16(d3)); + } + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + + s += 4; + width -= 4; + d += 4; + d_u8 += 4; + } while (width > 0); + src_ptr += (src_stride << 2); + dst_ptr += (dst_stride << 2); + dst_u8_ptr += (dst8_stride << 2); + height -= 4; +#else + t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + s0 = vget_low_s16(tt0); // a0 a1 a2 a3 + s4 = vget_high_s16(tt0); // a4 a5 a6 a7 + __builtin_prefetch(d); + + s += 8; + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 + + // a8 a9 a10 a11 + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + temp_0 = s7; + s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8 + s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9 + s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10 + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + zero, shift_round_0); + d0 = vrshl_s16(d0, horiz_const); + d0 = vadd_s16(d0, round_offset_vec); + s0 = s4; + s4 = temp_0; + if (conv_params->do_average) { + __builtin_prefetch(d); + __builtin_prefetch(d_u8); + + res4 = vld1_u16(d); + + compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset, + bck_offset, round_offset_vec, round_bits, + use_dist_wtd_comp_avg, &t0); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + } else { + vst1_u16(d, vreinterpret_u16_s16(d0)); + } + + s += 4; + width -= 4; + d += 4; + d_u8 += 4; + } while (width > 0); + src_ptr += (src_stride); + dst_ptr += (dst_stride); + dst_u8_ptr += (dst8_stride); + height--; +#endif + } while (height > 0); + } else { + CONV_BUF_TYPE *d_tmp; + uint8_t *d_u8_tmp; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t res0; + uint16x8_t res8; + const int16x8_t round_offset128 = vdupq_n_s16(round_offset); + const int16x4_t round_offset64 = vdup_n_s16(round_offset); + const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1); + const int16x8_t horiz_const = vdupq_n_s16(bits); + const int16x8_t zero = vdupq_n_s16(0); + + d = dst_ptr = dst; + d_u8 = dst_u8_ptr = dst8; + do { +#if defined(__aarch64__) + int16x8_t s11, s12, s13, s14; + int16x8_t s8, s9, s10; + int16x8_t res1, res2, res3, res4, res5, res6, res7; + uint16x8_t res9, res10, res11; + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + __builtin_prefetch(src_ptr + 4 * src_stride); + __builtin_prefetch(src_ptr + 5 * src_stride); + __builtin_prefetch(src_ptr + 6 * src_stride); + __builtin_prefetch(src_ptr + 7 * src_stride); + load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = 
vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + width = w; + s = src_ptr + 7; + d = dst_ptr; + d_u8_tmp = dst_u8_ptr; + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + __builtin_prefetch(dst_ptr + 4 * dst_stride); + __builtin_prefetch(dst_ptr + 5 * dst_stride); + __builtin_prefetch(dst_ptr + 6 * dst_stride); + __builtin_prefetch(dst_ptr + 7 * dst_stride); + + do { + d_u8 = d_u8_tmp; + d_tmp = d; + + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + zero, shift_round_0); + + res0 = vrshlq_s16(res0, horiz_const); + res0 = vaddq_s16(res0, round_offset128); + + res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + zero, shift_round_0); + res1 = vrshlq_s16(res1, horiz_const); + res1 = vaddq_s16(res1, round_offset128); + res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + zero, shift_round_0); + res2 = vrshlq_s16(res2, horiz_const); + res2 = vaddq_s16(res2, round_offset128); + res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + zero, shift_round_0); + res3 = vrshlq_s16(res3, horiz_const); + res3 = vaddq_s16(res3, round_offset128); + res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp, + zero, shift_round_0); + res4 = vrshlq_s16(res4, horiz_const); + res4 = vaddq_s16(res4, round_offset128); + res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter_tmp, zero, shift_round_0); + res5 = vrshlq_s16(res5, horiz_const); + res5 = vaddq_s16(res5, round_offset128); + res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter_tmp, zero, shift_round_0); + res6 = vrshlq_s16(res6, horiz_const); + res6 = vaddq_s16(res6, round_offset128); + res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter_tmp, zero, shift_round_0); + res7 = vrshlq_s16(res7, horiz_const); + res7 = vaddq_s16(res7, round_offset128); + + transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6, + &res7); + + if (conv_params->do_average) { + load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); + d_tmp += (dst_stride << 2); + + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), + vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); + + store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); + d_u8 += (dst8_stride << 2); + + load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); + d_tmp += (dst_stride << 2); + + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), + vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7), fwd_offset, 
bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); + + store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); + d_u8 += (dst8_stride << 2); + } else { + store_u16_8x8( + d_tmp, dst_stride, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7)); + d_tmp += (dst_stride << 3); + } + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + d_u8_tmp += 8; + } while (width > 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * dst_stride; + dst_u8_ptr += 8 * dst8_stride; + height -= 8; +#else + int16x8_t temp_0; + __builtin_prefetch(src_ptr); + t0 = vld1_u8(src_ptr); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + + width = w; + s = src_ptr + 8; + d = dst_ptr; + d_u8_tmp = dst_u8_ptr; + + __builtin_prefetch(dst_ptr); + + do { + d_u8 = d_u8_tmp; + d_tmp = d; + + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + temp_0 = s0; + s0 = s7; + + s1 = vextq_s16(temp_0, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(temp_0, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(temp_0, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(temp_0, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + s5 = vextq_s16(temp_0, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7, + x_filter_tmp, zero, shift_round_0); + + res0 = vrshlq_s16(res0, horiz_const); + res0 = vaddq_s16(res0, round_offset128); + + if (conv_params->do_average) { + res8 = vld1q_u16(d_tmp); + d_tmp += (dst_stride); + + compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset, + bck_offset, round_offset64, round_bits, + use_dist_wtd_comp_avg, &t0); + + vst1_u8(d_u8, t0); + d_u8 += (dst8_stride); + } else { + vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0)); + d_tmp += (dst_stride); + } + + s += 8; + d += 8; + width -= 8; + d_u8_tmp += 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + dst_u8_ptr += dst8_stride; + height--; +#endif + } while (height > 0); + } +} + +void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + assert(!(w % 4)); + assert(!(h % 4)); + + CONV_BUF_TYPE *dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const int vert_offset = filter_params_y->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int shift_value = (conv_params->round_1 - 1 - bits); + + (void)filter_params_x; + (void)subpel_x_qn; + + // vertical filter + const int16_t 
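+  // Select the 8-tap vertical kernel for the requested subpel offset.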
*y_filter = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+  const uint8_t *src_ptr = src - (vert_offset * src_stride);
+
+  int16_t y_filter_tmp[8];
+  int16x8_t filter_y_coef = vld1q_s16(y_filter);
+
+  // filter coeffs are even, so downshifting by 1 to reduce intermediate
+  // precision requirements.
+  filter_y_coef = vshrq_n_s16(filter_y_coef, 1);
+  vst1q_s16(&y_filter_tmp[0], filter_y_coef);
+
+  const uint8_t *s;
+  uint8_t *d_u8;
+  uint8_t *dst_u8_ptr;
+  CONV_BUF_TYPE *d, *dst_ptr;
+  int width, height;
+
+  s = src_ptr;
+  dst_ptr = dst;
+  dst_u8_ptr = dst8;
+  width = w;
+  height = h;
+
+  // used to get rid of multiplication = (vertical filter output sum) *
+  // (1 << bits).
+  assert((conv_params->round_1 - 2) >= bits);
+
+  if ((w == 4) || (h == 4)) {
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+    uint16x4_t res4;
+    uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
+               tu3 = vdup_n_u32(0);
+    int16x8_t u0, u1, u2, u3;
+    uint8x8_t t0;
+
+#if defined(__aarch64__)
+    int16x4_t s8, s9, s10, d1, d2, d3;
+    uint16x4_t res5, res6, res7;
+    uint8x8_t t1;
+#endif
+    const int16x4_t round_offset64 = vdup_n_s16(round_offset);
+    const int16x4_t shift_vec = vdup_n_s16(-shift_value);
+    const int16x4_t zero = vdup_n_s16(0);
+
+    do {
+      s = src_ptr;
+      d = dst_ptr;
+      d_u8 = dst_u8_ptr;
+      height = h;
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+
+      load_unaligned_u8_4x8(s, src_stride, &tu0, &tu1, &tu2, &tu3);
+
+      u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+      u1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+      u2 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu2)));
+      u3 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu3)));
+
+      s0 = vget_low_s16(u0);
+      s1 = vget_high_s16(u0);
+      s2 = vget_low_s16(u1);
+      s3 = vget_high_s16(u1);
+      s4 = vget_low_s16(u2);
+      s5 = vget_high_s16(u2);
+      s6 = vget_low_s16(u3);
+
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d + 1 * dst_stride);
+      __builtin_prefetch(d + 2 * dst_stride);
+      __builtin_prefetch(d + 3 * dst_stride);
+
+      s += (7 * src_stride);
+      do {
+#if defined(__aarch64__)
+        load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
+
+        u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+        u1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+
+        s7 = vget_low_s16(u0);
+        s8 = vget_high_s16(u0);
+        s9 = vget_low_s16(u1);
+        s10 = vget_high_s16(u1);
+
+        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+                               zero, shift_vec);
+        d0 = vadd_s16(d0, round_offset64);
+        d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp,
+                               zero, shift_vec);
+        d1 = vadd_s16(d1, round_offset64);
+        d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp,
+                               zero, shift_vec);
+        d2 = vadd_s16(d2, round_offset64);
+        d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp,
+                               zero, shift_vec);
+        d3 = vadd_s16(d3, round_offset64);
+
+        if (conv_params->do_average) {
+          __builtin_prefetch(d + 0 * dst_stride);
+          __builtin_prefetch(d + 1 * dst_stride);
+          __builtin_prefetch(d + 2 * dst_stride);
+          __builtin_prefetch(d + 3 * dst_stride);
+
+          __builtin_prefetch(d_u8 + 0 * dst8_stride);
+          __builtin_prefetch(d_u8 + 1 * dst8_stride);
+          __builtin_prefetch(d_u8 + 2 * dst8_stride);
+          __builtin_prefetch(d_u8 + 3 * dst8_stride);
+
+          load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7);
+          d += (dst_stride << 2);
+
+          compute_avg_4x4(res4, res5, 
res6, res7, vreinterpret_u16_s16(d0), + vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), + vreinterpret_u16_s16(d3), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1); + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0); + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1); + d_u8 += dst8_stride; + } else { + store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0), + vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), + vreinterpret_u16_s16(d3)); + d += (dst_stride << 2); + } + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + + s += (src_stride << 2); + height -= 4; +#else + load_unaligned_u8_4x1(s, src_stride, &tu0); + u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0))); + s7 = vget_low_s16(u0); + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp, + zero, shift_vec); + + d0 = vadd_s16(d0, round_offset64); + + if (conv_params->do_average) { + __builtin_prefetch(d); + + res4 = vld1_u16(d); + d += (dst_stride); + + compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset, + bck_offset, round_offset64, round_bits, + use_dist_wtd_comp_avg, &t0); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); + d_u8 += dst8_stride; + } else { + vst1_u16(d, vreinterpret_u16_s16(d0)); + d += (dst_stride); + } + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + + s += (src_stride); + height--; +#endif + } while (height > 0); + src_ptr += 4; + dst_ptr += 4; + dst_u8_ptr += 4; + width -= 4; + } while (width > 0); + } else { + CONV_BUF_TYPE *d_tmp; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t res0; + uint16x8_t res8; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + const int16x8_t round_offset128 = vdupq_n_s16(round_offset); + const int16x8_t shift_vec = vdupq_n_s16(-shift_value); + const int16x4_t round_offset64 = vdup_n_s16(round_offset); + const int16x8_t zero = vdupq_n_s16(0); +#if defined(__aarch64__) + int16x8_t s8, s9, s10, s11, s12, s13, s14; + int16x8_t res1, res2, res3, res4, res5, res6, res7; + uint16x8_t res10, res11, res9; +#endif + dst_ptr = dst; + dst_u8_ptr = dst8; + do { + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + __builtin_prefetch(src_ptr + 4 * src_stride); + __builtin_prefetch(src_ptr + 5 * src_stride); + __builtin_prefetch(src_ptr + 6 * src_stride); + __builtin_prefetch(src_ptr + 7 * src_stride); + load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + height = h; + s = src_ptr + (7 * src_stride); + d_tmp = dst_ptr; + d_u8 = dst_u8_ptr; + + do { +#if defined(__aarch64__) + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = 
vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + + res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp, + zero, shift_vec); + res0 = vaddq_s16(res0, round_offset128); + res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp, + zero, shift_vec); + res1 = vaddq_s16(res1, round_offset128); + res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp, + zero, shift_vec); + res2 = vaddq_s16(res2, round_offset128); + res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp, + zero, shift_vec); + res3 = vaddq_s16(res3, round_offset128); + res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, y_filter_tmp, + zero, shift_vec); + res4 = vaddq_s16(res4, round_offset128); + res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, + y_filter_tmp, zero, shift_vec); + res5 = vaddq_s16(res5, round_offset128); + res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, + y_filter_tmp, zero, shift_vec); + res6 = vaddq_s16(res6, round_offset128); + res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, + y_filter_tmp, zero, shift_vec); + res7 = vaddq_s16(res7, round_offset128); + + if (conv_params->do_average) { + __builtin_prefetch(d_tmp + 0 * dst8_stride); + __builtin_prefetch(d_tmp + 1 * dst8_stride); + __builtin_prefetch(d_tmp + 2 * dst8_stride); + __builtin_prefetch(d_tmp + 3 * dst8_stride); + + load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); + d_tmp += (dst_stride << 2); + + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), + vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); + + store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); + d_u8 += (dst8_stride << 2); + + load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); + d_tmp += (dst_stride << 2); + + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), + vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); + + store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); + d_u8 += (dst8_stride << 2); + } else { + store_u16_8x8( + d_tmp, dst_stride, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7)); + d_tmp += (dst_stride << 3); + } + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += (8 * src_stride); + height -= 8; +#else + s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + __builtin_prefetch(dst_ptr); + + res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp, + zero, shift_vec); + res0 = vaddq_s16(res0, round_offset128); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + + if (conv_params->do_average) { + __builtin_prefetch(d_tmp); + + res8 = vld1q_u16(d_tmp); + d_tmp += (dst_stride); + + compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset, + bck_offset, 
round_offset64, round_bits,
+                        use_dist_wtd_comp_avg, &t0);
+
+        vst1_u8(d_u8, t0);
+        d_u8 += (dst8_stride);
+      } else {
+        vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0));
+        d_tmp += dst_stride;
+      }
+
+      s += (src_stride);
+      height--;
+#endif
+    } while (height > 0);
+    src_ptr += 8;
+    dst_ptr += 8;
+    dst_u8_ptr += 8;
+    width -= 8;
+    } while (width > 0);
+  }
+}
diff --git a/libs/libaom/src/av1/common/arm/mem_neon.h b/libs/libaom/src/av1/common/arm/mem_neon.h
new file mode 100644
index 000000000..171055fe1
--- /dev/null
+++ b/libs/libaom/src/av1/common/arm/mem_neon.h
@@ -0,0 +1,539 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_
+#define AOM_AV1_COMMON_ARM_MEM_NEON_H_
+
+#include <arm_neon.h>
+#include <string.h>
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
+                                     const uint8x8_t s1) {
+  vst1_u8(s, s0);
+  s += p;
+  vst1_u8(s, s1);
+  s += p;
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+   to enforce that. */
+#define load_u8_4x1(s, s0, lane)                                           \
+  do {                                                                     \
+    *(s0) = vreinterpret_u8_u32(                                           \
+        vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
+  } while (0)
+
+static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3,
+                               uint8x8_t *const s4, uint8x8_t *const s5,
+                               uint8x8_t *const s6, uint8x8_t *const s7) {
+  *s0 = vld1_u8(s);
+  s += p;
+  *s1 = vld1_u8(s);
+  s += p;
+  *s2 = vld1_u8(s);
+  s += p;
+  *s3 = vld1_u8(s);
+  s += p;
+  *s4 = vld1_u8(s);
+  s += p;
+  *s5 = vld1_u8(s);
+  s += p;
+  *s6 = vld1_u8(s);
+  s += p;
+  *s7 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x16(const uint8_t *s, ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2, uint8x16_t *const s3) {
+  *s0 = vld1q_u8(s);
+  s += p;
+  *s1 = vld1q_u8(s);
+  s += p;
+  *s2 = vld1q_u8(s);
+  s += p;
+  *s3 = vld1q_u8(s);
+}
+
+static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3) {
+  *s0 = vld1_u8(s);
+  s += p;
+  *s1 = vld1_u8(s);
+  s += p;
+  *s2 = vld1_u8(s);
+  s += p;
+  *s3 = vld1_u8(s);
+}
+
+static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
+                                uint16x4_t *const s0, uint16x4_t *const s1,
+                                uint16x4_t *const s2, uint16x4_t *const s3) {
+  *s0 = vld1_u16(s);
+  s += p;
+  *s1 = vld1_u16(s);
+  s += p;
+  *s2 = vld1_u16(s);
+  s += p;
+  *s3 = vld1_u16(s);
+  s += p;
+}
+
+static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *const s0, uint16x8_t *const s1,
+                                uint16x8_t *const s2, uint16x8_t *const s3) {
+  *s0 = vld1q_u16(s);
+  s += p;
+  *s1 = vld1q_u16(s);
+  s += p;
+  *s2 = vld1q_u16(s);
+  s += p;
+  *s3 = vld1q_u16(s);
+  s += p;
+}
+
+static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
+                                int16x4_t *const s0, int16x4_t *const s1,
+                                int16x4_t *const s2, int16x4_t *const s3,
+                                int16x4_t *const s4, int16x4_t *const s5,
+                                int16x4_t *const s6, int16x4_t *const s7) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+  s += p;
+  *s3 = vld1_s16(s);
+  s += p;
+  *s4 =
vld1_s16(s); + s += p; + *s5 = vld1_s16(s); + s += p; + *s6 = vld1_s16(s); + s += p; + *s7 = vld1_s16(s); +} + +static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2, int16x4_t *const s3) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define store_u8_4x1(s, s0, lane) \ + do { \ + vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \ + } while (0) + +static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, + const uint8x8_t s1, const uint8x8_t s2, + const uint8x8_t s3, const uint8x8_t s4, + const uint8x8_t s5, const uint8x8_t s6, + const uint8x8_t s7) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); + s += p; + vst1_u8(s, s4); + s += p; + vst1_u8(s, s5); + s += p; + vst1_u8(s, s6); + s += p; + vst1_u8(s, s7); +} + +static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, + const uint8x8_t s1, const uint8x8_t s2, + const uint8x8_t s3) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); +} + +static INLINE void store_u8_8x16(uint8_t *s, ptrdiff_t p, const uint8x16_t s0, + const uint8x16_t s1, const uint8x16_t s2, + const uint8x16_t s3) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); +} + +static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2, const uint16x8_t s3, + const uint16x8_t s4, const uint16x8_t s5, + const uint16x8_t s6, const uint16x8_t s7) { + vst1q_u16(s, s0); + s += dst_stride; + vst1q_u16(s, s1); + s += dst_stride; + vst1q_u16(s, s2); + s += dst_stride; + vst1q_u16(s, s3); + s += dst_stride; + vst1q_u16(s, s4); + s += dst_stride; + vst1q_u16(s, s5); + s += dst_stride; + vst1q_u16(s, s6); + s += dst_stride; + vst1q_u16(s, s7); +} + +static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride, + const uint16x4_t s0, const uint16x4_t s1, + const uint16x4_t s2, const uint16x4_t s3) { + vst1_u16(s, s0); + s += dst_stride; + vst1_u16(s, s1); + s += dst_stride; + vst1_u16(s, s2); + s += dst_stride; + vst1_u16(s, s3); +} + +static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2, const uint16x8_t s3) { + vst1q_u16(s, s0); + s += dst_stride; + vst1q_u16(s, s1); + s += dst_stride; + vst1q_u16(s, s2); + s += dst_stride; + vst1q_u16(s, s3); +} + +static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride, + const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7) { + vst1q_s16(s, s0); + s += dst_stride; + vst1q_s16(s, s1); + s += dst_stride; + vst1q_s16(s, s2); + s += dst_stride; + vst1q_s16(s, s3); + s += dst_stride; + vst1q_s16(s, s4); + s += dst_stride; + vst1q_s16(s, s5); + s += dst_stride; + vst1q_s16(s, s6); + s += dst_stride; + vst1q_s16(s, s7); +} + +static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride, + const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3) { + vst1_s16(s, s0); + s += dst_stride; + vst1_s16(s, s1); + s += dst_stride; + vst1_s16(s, s2); + s += dst_stride; + vst1_s16(s, s3); +} + +static INLINE void 
store_s16_8x4(int16_t *s, ptrdiff_t dst_stride, + const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3) { + vst1q_s16(s, s0); + s += dst_stride; + vst1q_s16(s, s1); + s += dst_stride; + vst1q_s16(s, s2); + s += dst_stride; + vst1q_s16(s, s3); +} + +static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, + int16x8_t *const s4, int16x8_t *const s5, + int16x8_t *const s6, int16x8_t *const s7) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); + s += p; + *s7 = vld1q_s16(s); +} + +static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); +} + +// Load 4 sets of 4 bytes when alignment is not guaranteed. +static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { + uint32_t a; + uint32x4_t a_u32 = vdupq_n_u32(0); + if (stride == 4) return vld1q_u8(buf); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 0); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 1); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 2); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 3); + return vreinterpretq_u8_u32(a_u32); +} + +static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride, + uint32x2_t *tu0, uint32x2_t *tu1, + uint32x2_t *tu2, uint32x2_t *tu3) { + uint32_t a; + + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 0); + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 1); + memcpy(&a, buf, 4); + buf += stride; + *tu1 = vset_lane_u32(a, *tu1, 0); + memcpy(&a, buf, 4); + buf += stride; + *tu1 = vset_lane_u32(a, *tu1, 1); + memcpy(&a, buf, 4); + buf += stride; + *tu2 = vset_lane_u32(a, *tu2, 0); + memcpy(&a, buf, 4); + buf += stride; + *tu2 = vset_lane_u32(a, *tu2, 1); + memcpy(&a, buf, 4); + buf += stride; + *tu3 = vset_lane_u32(a, *tu3, 0); + memcpy(&a, buf, 4); + *tu3 = vset_lane_u32(a, *tu3, 1); +} + +static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride, + uint32x2_t *tu0, uint32x2_t *tu1) { + uint32_t a; + + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 0); + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 1); + memcpy(&a, buf, 4); + buf += stride; + *tu1 = vset_lane_u32(a, *tu1, 0); + memcpy(&a, buf, 4); + *tu1 = vset_lane_u32(a, *tu1, 1); +} + +static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride, + uint32x2_t *tu0) { + uint32_t a; + + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 0); +} + +static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride, + uint32x2_t *tu0) { + uint32_t a; + + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 0); + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 1); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. 
*/ +#define store_unaligned_u8_4x1(dst, src, lane) \ + do { \ + uint32_t a; \ + a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \ + memcpy(dst, &a, 4); \ + } while (0) + +static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride, + uint16x4_t *tu0) { + uint16_t a; + + memcpy(&a, buf, 2); + buf += stride; + *tu0 = vset_lane_u16(a, *tu0, 0); + memcpy(&a, buf, 2); + buf += stride; + *tu0 = vset_lane_u16(a, *tu0, 1); +} + +static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4, uint8x16_t *const s5, + uint8x16_t *const s6, uint8x16_t *const s7) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); + s += p; + *s5 = vld1q_u8(s); + s += p; + *s6 = vld1q_u8(s); + s += p; + *s7 = vld1q_u8(s); +} + +static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); +} + +static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride, + uint64x2_t *tu0, uint64x2_t *tu1) { + uint64_t a; + + memcpy(&a, buf, 8); + buf += stride; + *tu0 = vsetq_lane_u64(a, *tu0, 0); + memcpy(&a, buf, 8); + buf += stride; + *tu0 = vsetq_lane_u64(a, *tu0, 1); + memcpy(&a, buf, 8); + buf += stride; + *tu1 = vsetq_lane_u64(a, *tu1, 0); + memcpy(&a, buf, 8); + *tu1 = vsetq_lane_u64(a, *tu1, 1); +} + +static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1, + int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) { + *s1 = vld1q_s32(s); + s += p; + *s2 = vld1q_s32(s); + s += p; + *s3 = vld1q_s32(s); + s += p; + *s4 = vld1q_s32(s); +} + +static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1, + int32x4_t s2, int32x4_t s3, int32x4_t s4) { + vst1q_s32(s, s1); + s += p; + vst1q_s32(s, s2); + s += p; + vst1q_s32(s, s3); + s += p; + vst1q_s32(s, s4); +} + +static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1, + uint32x4_t *s2, uint32x4_t *s3, + uint32x4_t *s4) { + *s1 = vld1q_u32(s); + s += p; + *s2 = vld1q_u32(s); + s += p; + *s3 = vld1q_u32(s); + s += p; + *s4 = vld1q_u32(s); +} + +static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1, + uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) { + vst1q_u32(s, s1); + s += p; + vst1q_u32(s, s2); + s += p; + vst1q_u32(s, s3); + s += p; + vst1q_u32(s, s4); +} + +static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { + const int32x4_t v0 = vld1q_s32(buf); + const int32x4_t v1 = vld1q_s32(buf + 4); + const int16x4_t s0 = vmovn_s32(v0); + const int16x4_t s1 = vmovn_s32(v1); + return vcombine_s16(s0, s1); +} + +static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { + const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); + const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); + vst1q_s32(buf, v0); + vst1q_s32(buf + 4, v1); +} + +#endif // AOM_AV1_COMMON_ARM_MEM_NEON_H_ diff --git a/libs/libaom/src/av1/common/arm/reconinter_neon.c b/libs/libaom/src/av1/common/arm/reconinter_neon.c new file mode 100644 index 000000000..44e064195 --- /dev/null +++ b/libs/libaom/src/av1/common/arm/reconinter_neon.c @@ -0,0 +1,86 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/blockd.h"
+#include "config/av1_rtcd.h"
+
+void av1_build_compound_diffwtd_mask_d16_neon(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
+  assert(h >= 4);
+  assert(w >= 4);
+  assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38));
+  const int round =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+  uint16x8_t diff_q, tmp0, tmp1;
+  uint8x8_t diff_d, diff_select;
+  const CONV_BUF_TYPE *src0_1, *src1_1;
+  const int16x8_t dup_round = vdupq_n_s16((int16_t)(-round));
+  const uint8x8_t dup_38 = vdup_n_u8(38);
+  const uint8x8_t dup_64 = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
+  if (mask_type == DIFFWTD_38) {
+    diff_select = vdup_n_u8(255);
+  } else {
+    diff_select = vdup_n_u8(0);
+  }
+  // Per lane: m = AOMMIN(38 + (rounded(|src0 - src1| >> round) >>
+  // DIFF_FACTOR_LOG2), 64); DIFFWTD_38 stores m, DIFFWTD_38_INV stores 64 - m.
+  if (w >= 8) {
+    for (int i = 0; i < h; ++i) {
+      src0_1 = src0;
+      src1_1 = src1;
+      for (int j = 0; j < w; j += 8) {
+        __builtin_prefetch(src0_1);
+        __builtin_prefetch(src1_1);
+        diff_q = vabdq_u16(vld1q_u16(src0_1), vld1q_u16(src1_1));
+        diff_q = vrshlq_u16(diff_q, dup_round);
+        diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
+        diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
+        diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
+        vst1_u8(mask, diff_d);
+        src0_1 += 8;
+        src1_1 += 8;
+        mask += 8;
+      }
+      src0 += src0_stride;
+      src1 += src1_stride;
+    }
+  } else if (w == 4) {
+    for (int i = 0; i < h; i += 2) {
+      src0_1 = src0;
+      src1_1 = src1;
+      __builtin_prefetch(src0_1 + 0 * src0_stride);
+      __builtin_prefetch(src0_1 + 1 * src0_stride);
+      __builtin_prefetch(src1_1 + 0 * src1_stride);
+      __builtin_prefetch(src1_1 + 1 * src1_stride);
+      tmp0 = vcombine_u16(vld1_u16(src0_1 + (0 * src0_stride)),
+                          vld1_u16(src0_1 + (1 * src0_stride)));
+      tmp1 = vcombine_u16(vld1_u16(src1_1 + (0 * src1_stride)),
+                          vld1_u16(src1_1 + (1 * src1_stride)));
+      diff_q = vabdq_u16(tmp0, tmp1);
+      diff_q = vrshlq_u16(diff_q, dup_round);
+      diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
+      diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
+      diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
+      vst1_u8(mask, diff_d);
+      src0 += src0_stride * 2;
+      src1 += src1_stride * 2;
+      mask += w * 2;
+    }
+  }
+}
diff --git a/libs/libaom/src/av1/common/arm/selfguided_neon.c b/libs/libaom/src/av1/common/arm/selfguided_neon.c
new file mode 100644
index 000000000..fc404a64a
--- /dev/null
+++ b/libs/libaom/src/av1/common/arm/selfguided_neon.c
@@ -0,0 +1,1590 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/common.h"
+#include "av1/common/resize.h"
+#include "av1/common/restoration.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+// Constants used for right shift in final_filter calculation.
+#define NB_EVEN 5
+#define NB_ODD 4
+
+static INLINE void calc_ab_fast_internal_common(
+    uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
+    uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, int32x4_t sr4, int32x4_t sr5,
+    int32x4_t sr6, int32x4_t sr7, uint32x4_t const_n_val, uint32x4_t s_vec,
+    uint32x4_t const_val, uint32x4_t one_by_n_minus_1_vec,
+    uint16x4_t sgrproj_sgr, int32_t *src1, uint16_t *dst_A16, int32_t *src2,
+    const int buf_stride) {
+  uint32x4_t q0, q1, q2, q3;
+  uint32x4_t p0, p1, p2, p3;
+  uint16x4_t d0, d1, d2, d3;
+
+  s0 = vmulq_u32(s0, const_n_val);
+  s1 = vmulq_u32(s1, const_n_val);
+  s2 = vmulq_u32(s2, const_n_val);
+  s3 = vmulq_u32(s3, const_n_val);
+
+  q0 = vmulq_u32(s4, s4);
+  q1 = vmulq_u32(s5, s5);
+  q2 = vmulq_u32(s6, s6);
+  q3 = vmulq_u32(s7, s7);
+
+  p0 = vcleq_u32(q0, s0);
+  p1 = vcleq_u32(q1, s1);
+  p2 = vcleq_u32(q2, s2);
+  p3 = vcleq_u32(q3, s3);
+
+  q0 = vsubq_u32(s0, q0);
+  q1 = vsubq_u32(s1, q1);
+  q2 = vsubq_u32(s2, q2);
+  q3 = vsubq_u32(s3, q3);
+
+  p0 = vandq_u32(p0, q0);
+  p1 = vandq_u32(p1, q1);
+  p2 = vandq_u32(p2, q2);
+  p3 = vandq_u32(p3, q3);
+
+  p0 = vmulq_u32(p0, s_vec);
+  p1 = vmulq_u32(p1, s_vec);
+  p2 = vmulq_u32(p2, s_vec);
+  p3 = vmulq_u32(p3, s_vec);
+
+  p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS);
+  p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS);
+  p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS);
+  p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS);
+
+  p0 = vminq_u32(p0, const_val);
+  p1 = vminq_u32(p1, const_val);
+  p2 = vminq_u32(p2, const_val);
+  p3 = vminq_u32(p3, const_val);
+
+  {
+    store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3);
+
+    for (int x = 0; x < 4; x++) {
+      for (int y = 0; y < 4; y++) {
+        dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]];
+      }
+    }
+    load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3);
+  }
+  p0 = vsubl_u16(sgrproj_sgr, d0);
+  p1 = vsubl_u16(sgrproj_sgr, d1);
+  p2 = vsubl_u16(sgrproj_sgr, d2);
+  p3 = vsubl_u16(sgrproj_sgr, d3);
+
+  s4 = vmulq_u32(vreinterpretq_u32_s32(sr4), one_by_n_minus_1_vec);
+  s5 = vmulq_u32(vreinterpretq_u32_s32(sr5), one_by_n_minus_1_vec);
+  s6 = vmulq_u32(vreinterpretq_u32_s32(sr6), one_by_n_minus_1_vec);
+  s7 = vmulq_u32(vreinterpretq_u32_s32(sr7), one_by_n_minus_1_vec);
+
+  s4 = vmulq_u32(s4, p0);
+  s5 = vmulq_u32(s5, p1);
+  s6 = vmulq_u32(s6, p2);
+  s7 = vmulq_u32(s7, p3);
+
+  p0 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS);
+  p1 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS);
+  p2 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS);
+  p3 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS);
+
+  store_s32_4x4(src2, buf_stride, vreinterpretq_s32_u32(p0),
+                vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2),
+                vreinterpretq_s32_u32(p3));
+}
+static INLINE void calc_ab_internal_common(
+    uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
+    uint32x4_t s5, uint32x4_t s6, uint32x4_t s7,
uint16x8_t s16_0, + uint16x8_t s16_1, uint16x8_t s16_2, uint16x8_t s16_3, uint16x8_t s16_4, + uint16x8_t s16_5, uint16x8_t s16_6, uint16x8_t s16_7, + uint32x4_t const_n_val, uint32x4_t s_vec, uint32x4_t const_val, + uint16x4_t one_by_n_minus_1_vec, uint16x8_t sgrproj_sgr, int32_t *src1, + uint16_t *dst_A16, int32_t *dst2, const int buf_stride) { + uint16x4_t d0, d1, d2, d3, d4, d5, d6, d7; + uint32x4_t q0, q1, q2, q3, q4, q5, q6, q7; + uint32x4_t p0, p1, p2, p3, p4, p5, p6, p7; + + s0 = vmulq_u32(s0, const_n_val); + s1 = vmulq_u32(s1, const_n_val); + s2 = vmulq_u32(s2, const_n_val); + s3 = vmulq_u32(s3, const_n_val); + s4 = vmulq_u32(s4, const_n_val); + s5 = vmulq_u32(s5, const_n_val); + s6 = vmulq_u32(s6, const_n_val); + s7 = vmulq_u32(s7, const_n_val); + + d0 = vget_low_u16(s16_4); + d1 = vget_low_u16(s16_5); + d2 = vget_low_u16(s16_6); + d3 = vget_low_u16(s16_7); + d4 = vget_high_u16(s16_4); + d5 = vget_high_u16(s16_5); + d6 = vget_high_u16(s16_6); + d7 = vget_high_u16(s16_7); + + q0 = vmull_u16(d0, d0); + q1 = vmull_u16(d1, d1); + q2 = vmull_u16(d2, d2); + q3 = vmull_u16(d3, d3); + q4 = vmull_u16(d4, d4); + q5 = vmull_u16(d5, d5); + q6 = vmull_u16(d6, d6); + q7 = vmull_u16(d7, d7); + + p0 = vcleq_u32(q0, s0); + p1 = vcleq_u32(q1, s1); + p2 = vcleq_u32(q2, s2); + p3 = vcleq_u32(q3, s3); + p4 = vcleq_u32(q4, s4); + p5 = vcleq_u32(q5, s5); + p6 = vcleq_u32(q6, s6); + p7 = vcleq_u32(q7, s7); + + q0 = vsubq_u32(s0, q0); + q1 = vsubq_u32(s1, q1); + q2 = vsubq_u32(s2, q2); + q3 = vsubq_u32(s3, q3); + q4 = vsubq_u32(s4, q4); + q5 = vsubq_u32(s5, q5); + q6 = vsubq_u32(s6, q6); + q7 = vsubq_u32(s7, q7); + + p0 = vandq_u32(p0, q0); + p1 = vandq_u32(p1, q1); + p2 = vandq_u32(p2, q2); + p3 = vandq_u32(p3, q3); + p4 = vandq_u32(p4, q4); + p5 = vandq_u32(p5, q5); + p6 = vandq_u32(p6, q6); + p7 = vandq_u32(p7, q7); + + p0 = vmulq_u32(p0, s_vec); + p1 = vmulq_u32(p1, s_vec); + p2 = vmulq_u32(p2, s_vec); + p3 = vmulq_u32(p3, s_vec); + p4 = vmulq_u32(p4, s_vec); + p5 = vmulq_u32(p5, s_vec); + p6 = vmulq_u32(p6, s_vec); + p7 = vmulq_u32(p7, s_vec); + + p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS); + p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS); + p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS); + p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS); + p4 = vrshrq_n_u32(p4, SGRPROJ_MTABLE_BITS); + p5 = vrshrq_n_u32(p5, SGRPROJ_MTABLE_BITS); + p6 = vrshrq_n_u32(p6, SGRPROJ_MTABLE_BITS); + p7 = vrshrq_n_u32(p7, SGRPROJ_MTABLE_BITS); + + p0 = vminq_u32(p0, const_val); + p1 = vminq_u32(p1, const_val); + p2 = vminq_u32(p2, const_val); + p3 = vminq_u32(p3, const_val); + p4 = vminq_u32(p4, const_val); + p5 = vminq_u32(p5, const_val); + p6 = vminq_u32(p6, const_val); + p7 = vminq_u32(p7, const_val); + + { + store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3); + store_u32_4x4((uint32_t *)src1 + 4, buf_stride, p4, p5, p6, p7); + + for (int x = 0; x < 4; x++) { + for (int y = 0; y < 8; y++) { + dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]]; + } + } + load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7); + } + + s16_4 = vsubq_u16(sgrproj_sgr, s16_4); + s16_5 = vsubq_u16(sgrproj_sgr, s16_5); + s16_6 = vsubq_u16(sgrproj_sgr, s16_6); + s16_7 = vsubq_u16(sgrproj_sgr, s16_7); + + s0 = vmull_u16(vget_low_u16(s16_0), one_by_n_minus_1_vec); + s1 = vmull_u16(vget_low_u16(s16_1), one_by_n_minus_1_vec); + s2 = vmull_u16(vget_low_u16(s16_2), one_by_n_minus_1_vec); + s3 = vmull_u16(vget_low_u16(s16_3), one_by_n_minus_1_vec); + s4 = vmull_u16(vget_high_u16(s16_0), one_by_n_minus_1_vec); + s5 = 
vmull_u16(vget_high_u16(s16_1), one_by_n_minus_1_vec); + s6 = vmull_u16(vget_high_u16(s16_2), one_by_n_minus_1_vec); + s7 = vmull_u16(vget_high_u16(s16_3), one_by_n_minus_1_vec); + + s0 = vmulq_u32(s0, vmovl_u16(vget_low_u16(s16_4))); + s1 = vmulq_u32(s1, vmovl_u16(vget_low_u16(s16_5))); + s2 = vmulq_u32(s2, vmovl_u16(vget_low_u16(s16_6))); + s3 = vmulq_u32(s3, vmovl_u16(vget_low_u16(s16_7))); + s4 = vmulq_u32(s4, vmovl_u16(vget_high_u16(s16_4))); + s5 = vmulq_u32(s5, vmovl_u16(vget_high_u16(s16_5))); + s6 = vmulq_u32(s6, vmovl_u16(vget_high_u16(s16_6))); + s7 = vmulq_u32(s7, vmovl_u16(vget_high_u16(s16_7))); + + p0 = vrshrq_n_u32(s0, SGRPROJ_RECIP_BITS); + p1 = vrshrq_n_u32(s1, SGRPROJ_RECIP_BITS); + p2 = vrshrq_n_u32(s2, SGRPROJ_RECIP_BITS); + p3 = vrshrq_n_u32(s3, SGRPROJ_RECIP_BITS); + p4 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS); + p5 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS); + p6 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS); + p7 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS); + + store_s32_4x4(dst2, buf_stride, vreinterpretq_s32_u32(p0), + vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2), + vreinterpretq_s32_u32(p3)); + store_s32_4x4(dst2 + 4, buf_stride, vreinterpretq_s32_u32(p4), + vreinterpretq_s32_u32(p5), vreinterpretq_s32_u32(p6), + vreinterpretq_s32_u32(p7)); +} + +static INLINE void boxsum2_square_sum_calc( + int16x4_t t1, int16x4_t t2, int16x4_t t3, int16x4_t t4, int16x4_t t5, + int16x4_t t6, int16x4_t t7, int16x4_t t8, int16x4_t t9, int16x4_t t10, + int16x4_t t11, int32x4_t *r0, int32x4_t *r1, int32x4_t *r2, int32x4_t *r3) { + int32x4_t d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11; + int32x4_t r12, r34, r67, r89, r1011; + int32x4_t r345, r6789, r789; + + d1 = vmull_s16(t1, t1); + d2 = vmull_s16(t2, t2); + d3 = vmull_s16(t3, t3); + d4 = vmull_s16(t4, t4); + d5 = vmull_s16(t5, t5); + d6 = vmull_s16(t6, t6); + d7 = vmull_s16(t7, t7); + d8 = vmull_s16(t8, t8); + d9 = vmull_s16(t9, t9); + d10 = vmull_s16(t10, t10); + d11 = vmull_s16(t11, t11); + + r12 = vaddq_s32(d1, d2); + r34 = vaddq_s32(d3, d4); + r67 = vaddq_s32(d6, d7); + r89 = vaddq_s32(d8, d9); + r1011 = vaddq_s32(d10, d11); + r345 = vaddq_s32(r34, d5); + r6789 = vaddq_s32(r67, r89); + r789 = vsubq_s32(r6789, d6); + *r0 = vaddq_s32(r12, r345); + *r1 = vaddq_s32(r67, r345); + *r2 = vaddq_s32(d5, r6789); + *r3 = vaddq_s32(r789, r1011); +} + +static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16, + int32_t *dst32, int32_t *dst2, const int dst_stride, + const int width, const int height) { + assert(width > 2 * SGRPROJ_BORDER_HORZ); + assert(height > 2 * SGRPROJ_BORDER_VERT); + + int16_t *dst1_16_ptr, *src_ptr; + int32_t *dst2_ptr; + int h, w, count = 0; + const int dst_stride_2 = (dst_stride << 1); + const int dst_stride_8 = (dst_stride << 3); + + dst1_16_ptr = dst16; + dst2_ptr = dst2; + src_ptr = src; + w = width; + { + int16x8_t t1, t2, t3, t4, t5, t6, t7; + int16x8_t t8, t9, t10, t11, t12; + + int16x8_t q12345, q56789, q34567, q7891011; + int16x8_t q12, q34, q67, q89, q1011; + int16x8_t q345, q6789, q789; + + int32x4_t r12345, r56789, r34567, r7891011; + + do { + h = height; + dst1_16_ptr = dst16 + (count << 3); + dst2_ptr = dst2 + (count << 3); + src_ptr = src + (count << 3); + + dst1_16_ptr += dst_stride_2; + dst2_ptr += dst_stride_2; + do { + load_s16_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4); + src_ptr += 4 * src_stride; + load_s16_8x4(src_ptr, src_stride, &t5, &t6, &t7, &t8); + src_ptr += 4 * src_stride; + load_s16_8x4(src_ptr, src_stride, &t9, &t10, &t11, &t12); + + q12 = vaddq_s16(t1, t2); + q34 = 
vaddq_s16(t3, t4); + q67 = vaddq_s16(t6, t7); + q89 = vaddq_s16(t8, t9); + q1011 = vaddq_s16(t10, t11); + q345 = vaddq_s16(q34, t5); + q6789 = vaddq_s16(q67, q89); + q789 = vaddq_s16(q89, t7); + q12345 = vaddq_s16(q12, q345); + q34567 = vaddq_s16(q67, q345); + q56789 = vaddq_s16(t5, q6789); + q7891011 = vaddq_s16(q789, q1011); + + store_s16_8x4(dst1_16_ptr, dst_stride_2, q12345, q34567, q56789, + q7891011); + dst1_16_ptr += dst_stride_8; + + boxsum2_square_sum_calc( + vget_low_s16(t1), vget_low_s16(t2), vget_low_s16(t3), + vget_low_s16(t4), vget_low_s16(t5), vget_low_s16(t6), + vget_low_s16(t7), vget_low_s16(t8), vget_low_s16(t9), + vget_low_s16(t10), vget_low_s16(t11), &r12345, &r34567, &r56789, + &r7891011); + + store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r34567, r56789, r7891011); + + boxsum2_square_sum_calc( + vget_high_s16(t1), vget_high_s16(t2), vget_high_s16(t3), + vget_high_s16(t4), vget_high_s16(t5), vget_high_s16(t6), + vget_high_s16(t7), vget_high_s16(t8), vget_high_s16(t9), + vget_high_s16(t10), vget_high_s16(t11), &r12345, &r34567, &r56789, + &r7891011); + + store_s32_4x4(dst2_ptr + 4, dst_stride_2, r12345, r34567, r56789, + r7891011); + dst2_ptr += (dst_stride_8); + h -= 8; + } while (h > 0); + w -= 8; + count++; + } while (w > 0); + + // memset needed for row pixels as 2nd stage of boxsum filter uses + // first 2 rows of dst16, dst2 buffer which is not filled in first stage. + for (int x = 0; x < 2; x++) { + memset(dst16 + x * dst_stride, 0, (width + 4) * sizeof(*dst16)); + memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2)); + } + + // memset needed for extra columns as 2nd stage of boxsum filter uses + // last 2 columns of dst16, dst2 buffer which is not filled in first stage. + for (int x = 2; x < height + 2; x++) { + int dst_offset = x * dst_stride + width + 2; + memset(dst16 + dst_offset, 0, 3 * sizeof(*dst16)); + memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2)); + } + } + + { + int16x4_t s1, s2, s3, s4, s5, s6, s7, s8; + int32x4_t d1, d2, d3, d4, d5, d6, d7, d8; + int32x4_t q12345, q34567, q23456, q45678; + int32x4_t q23, q45, q67; + int32x4_t q2345, q4567; + + int32x4_t r12345, r34567, r23456, r45678; + int32x4_t r23, r45, r67; + int32x4_t r2345, r4567; + + int32_t *src2_ptr, *dst1_32_ptr; + int16_t *src1_ptr; + count = 0; + h = height; + do { + dst1_32_ptr = dst32 + count * dst_stride_8 + (dst_stride_2); + dst2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2); + src1_ptr = dst16 + count * dst_stride_8 + (dst_stride_2); + src2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2); + w = width; + + dst1_32_ptr += 2; + dst2_ptr += 2; + load_s16_4x4(src1_ptr, dst_stride_2, &s1, &s2, &s3, &s4); + transpose_s16_4x4d(&s1, &s2, &s3, &s4); + load_s32_4x4(src2_ptr, dst_stride_2, &d1, &d2, &d3, &d4); + transpose_s32_4x4(&d1, &d2, &d3, &d4); + do { + src1_ptr += 4; + src2_ptr += 4; + load_s16_4x4(src1_ptr, dst_stride_2, &s5, &s6, &s7, &s8); + transpose_s16_4x4d(&s5, &s6, &s7, &s8); + load_s32_4x4(src2_ptr, dst_stride_2, &d5, &d6, &d7, &d8); + transpose_s32_4x4(&d5, &d6, &d7, &d8); + q23 = vaddl_s16(s2, s3); + q45 = vaddl_s16(s4, s5); + q67 = vaddl_s16(s6, s7); + q2345 = vaddq_s32(q23, q45); + q4567 = vaddq_s32(q45, q67); + q12345 = vaddq_s32(vmovl_s16(s1), q2345); + q23456 = vaddq_s32(q2345, vmovl_s16(s6)); + q34567 = vaddq_s32(q4567, vmovl_s16(s3)); + q45678 = vaddq_s32(q4567, vmovl_s16(s8)); + + transpose_s32_4x4(&q12345, &q23456, &q34567, &q45678); + store_s32_4x4(dst1_32_ptr, dst_stride_2, q12345, q23456, q34567, + q45678); + dst1_32_ptr += 4; + s1 = s5; + s2 = s6; 
+ s3 = s7; + s4 = s8; + + r23 = vaddq_s32(d2, d3); + r45 = vaddq_s32(d4, d5); + r67 = vaddq_s32(d6, d7); + r2345 = vaddq_s32(r23, r45); + r4567 = vaddq_s32(r45, r67); + r12345 = vaddq_s32(d1, r2345); + r23456 = vaddq_s32(r2345, d6); + r34567 = vaddq_s32(r4567, d3); + r45678 = vaddq_s32(r4567, d8); + + transpose_s32_4x4(&r12345, &r23456, &r34567, &r45678); + store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r23456, r34567, r45678); + dst2_ptr += 4; + d1 = d5; + d2 = d6; + d3 = d7; + d4 = d8; + w -= 4; + } while (w > 0); + h -= 8; + count++; + } while (h > 0); + } +} + +static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16, + uint16_t *B16, int32_t *B, + const int buf_stride, const int width, + const int height, const int r, + const int s, const int ht_inc) { + int32_t *src1, *dst2, count = 0; + uint16_t *dst_A16, *src2; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); + const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7; + + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + dst_A16 = A16 + (count << 2) * buf_stride; + src1 = A + (count << 2) * buf_stride; + src2 = B16 + (count << 2) * buf_stride; + dst2 = B + (count << 2) * buf_stride; + w = width; + do { + load_u32_4x4((uint32_t *)src1, buf_stride, &s0, &s1, &s2, &s3); + load_u32_4x4((uint32_t *)src1 + 4, buf_stride, &s4, &s5, &s6, &s7); + load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3); + + s16_4 = s16_0; + s16_5 = s16_1; + s16_6 = s16_2; + s16_7 = s16_3; + + calc_ab_internal_common( + s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4, + s16_5, s16_6, s16_7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride); + + w -= 8; + dst2 += 8; + src1 += 8; + src2 += 8; + dst_A16 += 8; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, + uint16_t *B16, int32_t *B, + const int buf_stride, const int width, + const int height, const int bit_depth, + const int r, const int s, + const int ht_inc) { + int32_t *src1, *dst2, count = 0; + uint16_t *dst_A16, *src2; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const int16x8_t bd_min_2_vec = vdupq_n_s16(-(bit_depth - 8)); + const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); + const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; + uint16x8_t s16_0, s16_1, s16_2, s16_3; + uint16x8_t s16_4, s16_5, s16_6, s16_7; + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + src1 = A + (count << 2) * buf_stride; + src2 = B16 + (count << 2) * buf_stride; + dst2 = B + (count << 2) * buf_stride; + dst_A16 = A16 + (count << 2) * buf_stride; + w = width; + do { + load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); + load_s32_4x4(src1 + 4, buf_stride, &sr4, &sr5, &sr6, &sr7); + load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3); + + s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), 
bd_min_1_vec); + s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec); + s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec); + s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec); + s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_1_vec); + s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_1_vec); + s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_1_vec); + s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_1_vec); + + s16_4 = vrshlq_u16(s16_0, bd_min_2_vec); + s16_5 = vrshlq_u16(s16_1, bd_min_2_vec); + s16_6 = vrshlq_u16(s16_2, bd_min_2_vec); + s16_7 = vrshlq_u16(s16_3, bd_min_2_vec); + + calc_ab_internal_common( + s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4, + s16_5, s16_6, s16_7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride); + + w -= 8; + dst2 += 8; + src1 += 8; + src2 += 8; + dst_A16 += 8; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, + int32_t *B, const int buf_stride, + const int width, const int height, + const int r, const int s, + const int ht_inc) { + int32_t *src1, *src2, count = 0; + uint16_t *dst_A16; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); + const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + src1 = A + (count << 2) * buf_stride; + src2 = B + (count << 2) * buf_stride; + dst_A16 = A16 + (count << 2) * buf_stride; + w = width; + do { + load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); + load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7); + + s0 = vreinterpretq_u32_s32(sr0); + s1 = vreinterpretq_u32_s32(sr1); + s2 = vreinterpretq_u32_s32(sr2); + s3 = vreinterpretq_u32_s32(sr3); + s4 = vreinterpretq_u32_s32(sr4); + s5 = vreinterpretq_u32_s32(sr5); + s6 = vreinterpretq_u32_s32(sr6); + s7 = vreinterpretq_u32_s32(sr7); + + calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5, + sr6, sr7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, + dst_A16, src2, buf_stride); + + w -= 4; + src1 += 4; + src2 += 4; + dst_A16 += 4; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, + int32_t *B, const int buf_stride, + const int width, const int height, + const int bit_depth, const int r, + const int s, const int ht_inc) { + int32_t *src1, *src2, count = 0; + uint16_t *dst_A16; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const int32x4_t bd_min_2_vec = vdupq_n_s32(-(bit_depth - 8)); + const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); + const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + src1 = A + (count << 2) * buf_stride; + src2 = B + (count << 2) * 
buf_stride; + dst_A16 = A16 + (count << 2) * buf_stride; + w = width; + do { + load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); + load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7); + + s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec); + s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec); + s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec); + s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec); + s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_2_vec); + s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_2_vec); + s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_2_vec); + s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_2_vec); + + calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5, + sr6, sr7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, + dst_A16, src2, buf_stride); + + w -= 4; + src1 += 4; + src2 += 4; + dst_A16 += 4; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1, + int32_t *dst2, const int dst_stride, const int width, + const int height) { + assert(width > 2 * SGRPROJ_BORDER_HORZ); + assert(height > 2 * SGRPROJ_BORDER_VERT); + + int16_t *src_ptr; + int32_t *dst2_ptr; + uint16_t *dst1_ptr; + int h, w, count = 0; + + w = width; + { + int16x8_t s1, s2, s3, s4, s5, s6, s7, s8; + int16x8_t q23, q34, q56, q234, q345, q456, q567; + int32x4_t r23, r56, r345, r456, r567, r78, r678; + int32x4_t r4_low, r4_high, r34_low, r34_high, r234_low, r234_high; + int32x4_t r2, r3, r5, r6, r7, r8; + int16x8_t q678, q78; + + do { + dst1_ptr = dst1 + (count << 3); + dst2_ptr = dst2 + (count << 3); + src_ptr = src + (count << 3); + h = height; + + load_s16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4); + src_ptr += 4 * src_stride; + + q23 = vaddq_s16(s2, s3); + q234 = vaddq_s16(q23, s4); + q34 = vaddq_s16(s3, s4); + dst1_ptr += (dst_stride << 1); + + r2 = vmull_s16(vget_low_s16(s2), vget_low_s16(s2)); + r3 = vmull_s16(vget_low_s16(s3), vget_low_s16(s3)); + r4_low = vmull_s16(vget_low_s16(s4), vget_low_s16(s4)); + r23 = vaddq_s32(r2, r3); + r234_low = vaddq_s32(r23, r4_low); + r34_low = vaddq_s32(r3, r4_low); + + r2 = vmull_s16(vget_high_s16(s2), vget_high_s16(s2)); + r3 = vmull_s16(vget_high_s16(s3), vget_high_s16(s3)); + r4_high = vmull_s16(vget_high_s16(s4), vget_high_s16(s4)); + r23 = vaddq_s32(r2, r3); + r234_high = vaddq_s32(r23, r4_high); + r34_high = vaddq_s32(r3, r4_high); + + dst2_ptr += (dst_stride << 1); + + do { + load_s16_8x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); + src_ptr += 4 * src_stride; + + q345 = vaddq_s16(s5, q34); + q56 = vaddq_s16(s5, s6); + q456 = vaddq_s16(s4, q56); + q567 = vaddq_s16(s7, q56); + q78 = vaddq_s16(s7, s8); + q678 = vaddq_s16(s6, q78); + + store_s16_8x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567); + dst1_ptr += (dst_stride << 2); + + s4 = s8; + q34 = q78; + q234 = q678; + + r5 = vmull_s16(vget_low_s16(s5), vget_low_s16(s5)); + r6 = vmull_s16(vget_low_s16(s6), vget_low_s16(s6)); + r7 = vmull_s16(vget_low_s16(s7), vget_low_s16(s7)); + r8 = vmull_s16(vget_low_s16(s8), vget_low_s16(s8)); + + r345 = vaddq_s32(r5, r34_low); + r56 = vaddq_s32(r5, r6); + r456 = vaddq_s32(r4_low, r56); + r567 = vaddq_s32(r7, r56); + r78 = vaddq_s32(r7, r8); + r678 = vaddq_s32(r6, r78); + store_s32_4x4(dst2_ptr, dst_stride, r234_low, r345, r456, r567); + + r4_low = r8; + r34_low = r78; + r234_low = r678; + + r5 = vmull_s16(vget_high_s16(s5), 
vget_high_s16(s5)); + r6 = vmull_s16(vget_high_s16(s6), vget_high_s16(s6)); + r7 = vmull_s16(vget_high_s16(s7), vget_high_s16(s7)); + r8 = vmull_s16(vget_high_s16(s8), vget_high_s16(s8)); + + r345 = vaddq_s32(r5, r34_high); + r56 = vaddq_s32(r5, r6); + r456 = vaddq_s32(r4_high, r56); + r567 = vaddq_s32(r7, r56); + r78 = vaddq_s32(r7, r8); + r678 = vaddq_s32(r6, r78); + store_s32_4x4((dst2_ptr + 4), dst_stride, r234_high, r345, r456, r567); + dst2_ptr += (dst_stride << 2); + + r4_high = r8; + r34_high = r78; + r234_high = r678; + + h -= 4; + } while (h > 0); + w -= 8; + count++; + } while (w > 0); + + // memset needed for row pixels as 2nd stage of boxsum filter uses + // first 2 rows of dst1, dst2 buffer which is not filled in first stage. + for (int x = 0; x < 2; x++) { + memset(dst1 + x * dst_stride, 0, (width + 4) * sizeof(*dst1)); + memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2)); + } + + // memset needed for extra columns as 2nd stage of boxsum filter uses + // last 2 columns of dst1, dst2 buffer which is not filled in first stage. + for (int x = 2; x < height + 2; x++) { + int dst_offset = x * dst_stride + width + 2; + memset(dst1 + dst_offset, 0, 3 * sizeof(*dst1)); + memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2)); + } + } + + { + int16x4_t d1, d2, d3, d4, d5, d6, d7, d8; + int16x4_t q23, q34, q56, q234, q345, q456, q567; + int32x4_t r23, r56, r234, r345, r456, r567, r34, r78, r678; + int32x4_t r1, r2, r3, r4, r5, r6, r7, r8; + int16x4_t q678, q78; + + int32_t *src2_ptr; + uint16_t *src1_ptr; + count = 0; + h = height; + w = width; + do { + dst1_ptr = dst1 + (count << 2) * dst_stride; + dst2_ptr = dst2 + (count << 2) * dst_stride; + src1_ptr = dst1 + (count << 2) * dst_stride; + src2_ptr = dst2 + (count << 2) * dst_stride; + w = width; + + load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d1, &d2, &d3, &d4); + transpose_s16_4x4d(&d1, &d2, &d3, &d4); + load_s32_4x4(src2_ptr, dst_stride, &r1, &r2, &r3, &r4); + transpose_s32_4x4(&r1, &r2, &r3, &r4); + src1_ptr += 4; + src2_ptr += 4; + + q23 = vadd_s16(d2, d3); + q234 = vadd_s16(q23, d4); + q34 = vadd_s16(d3, d4); + dst1_ptr += 2; + r23 = vaddq_s32(r2, r3); + r234 = vaddq_s32(r23, r4); + r34 = vaddq_s32(r3, r4); + dst2_ptr += 2; + + do { + load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d5, &d6, &d7, &d8); + transpose_s16_4x4d(&d5, &d6, &d7, &d8); + load_s32_4x4(src2_ptr, dst_stride, &r5, &r6, &r7, &r8); + transpose_s32_4x4(&r5, &r6, &r7, &r8); + src1_ptr += 4; + src2_ptr += 4; + + q345 = vadd_s16(d5, q34); + q56 = vadd_s16(d5, d6); + q456 = vadd_s16(d4, q56); + q567 = vadd_s16(d7, q56); + q78 = vadd_s16(d7, d8); + q678 = vadd_s16(d6, q78); + transpose_s16_4x4d(&q234, &q345, &q456, &q567); + store_s16_4x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567); + dst1_ptr += 4; + + d4 = d8; + q34 = q78; + q234 = q678; + + r345 = vaddq_s32(r5, r34); + r56 = vaddq_s32(r5, r6); + r456 = vaddq_s32(r4, r56); + r567 = vaddq_s32(r7, r56); + r78 = vaddq_s32(r7, r8); + r678 = vaddq_s32(r6, r78); + transpose_s32_4x4(&r234, &r345, &r456, &r567); + store_s32_4x4(dst2_ptr, dst_stride, r234, r345, r456, r567); + dst2_ptr += 4; + + r4 = r8; + r34 = r78; + r234 = r678; + w -= 4; + } while (w > 0); + h -= 4; + count++; + } while (h > 0); + } +} + +static INLINE int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) { + int32x4_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl; + int32x4_t fours, threes, res; + + xtl = vld1q_s32(buf - buf_stride - 1); + xt = vld1q_s32(buf - buf_stride); + xtr = vld1q_s32(buf - buf_stride + 1); + xl = vld1q_s32(buf - 
1); + x = vld1q_s32(buf); + xr = vld1q_s32(buf + 1); + xbl = vld1q_s32(buf + buf_stride - 1); + xb = vld1q_s32(buf + buf_stride); + xbr = vld1q_s32(buf + buf_stride + 1); + + fours = vaddq_s32(xl, vaddq_s32(xt, vaddq_s32(xr, vaddq_s32(xb, x)))); + threes = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl))); + res = vsubq_s32(vshlq_n_s32(vaddq_s32(fours, threes), 2), threes); + return res; +} + +static INLINE void cross_sum_inp_u16(uint16_t *buf, int buf_stride, + int32x4_t *a0, int32x4_t *a1) { + uint16x8_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl; + uint16x8_t r0, r1; + + xtl = vld1q_u16(buf - buf_stride - 1); + xt = vld1q_u16(buf - buf_stride); + xtr = vld1q_u16(buf - buf_stride + 1); + xl = vld1q_u16(buf - 1); + x = vld1q_u16(buf); + xr = vld1q_u16(buf + 1); + xbl = vld1q_u16(buf + buf_stride - 1); + xb = vld1q_u16(buf + buf_stride); + xbr = vld1q_u16(buf + buf_stride + 1); + + xb = vaddq_u16(xb, x); + xt = vaddq_u16(xt, xr); + xl = vaddq_u16(xl, xb); + xl = vaddq_u16(xl, xt); + + r0 = vshlq_n_u16(xl, 2); + + xbl = vaddq_u16(xbl, xbr); + xtl = vaddq_u16(xtl, xtr); + xtl = vaddq_u16(xtl, xbl); + + r1 = vshlq_n_u16(xtl, 2); + r1 = vsubq_u16(r1, xtl); + + *a0 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_low_u16(r0)), vmovl_u16(vget_low_u16(r1)))); + *a1 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_high_u16(r0)), vmovl_u16(vget_high_u16(r1)))); +} + +static INLINE int32x4_t cross_sum_fast_even_row(int32_t *buf, int buf_stride) { + int32x4_t xtr, xt, xtl, xbr, xb, xbl; + int32x4_t fives, sixes, fives_plus_sixes; + + xtl = vld1q_s32(buf - buf_stride - 1); + xt = vld1q_s32(buf - buf_stride); + xtr = vld1q_s32(buf - buf_stride + 1); + xbl = vld1q_s32(buf + buf_stride - 1); + xb = vld1q_s32(buf + buf_stride); + xbr = vld1q_s32(buf + buf_stride + 1); + + fives = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl))); + sixes = vaddq_s32(xt, xb); + fives_plus_sixes = vaddq_s32(fives, sixes); + + return vaddq_s32( + vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes); +} + +static INLINE void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride, + int32x4_t *a0, int32x4_t *a1) { + uint16x8_t xtr, xt, xtl, xbr, xb, xbl, xb0; + + xtl = vld1q_u16(buf - buf_stride - 1); + xt = vld1q_u16(buf - buf_stride); + xtr = vld1q_u16(buf - buf_stride + 1); + xbl = vld1q_u16(buf + buf_stride - 1); + xb = vld1q_u16(buf + buf_stride); + xbr = vld1q_u16(buf + buf_stride + 1); + + xbr = vaddq_u16(xbr, xbl); + xtr = vaddq_u16(xtr, xtl); + xbr = vaddq_u16(xbr, xtr); + xtl = vshlq_n_u16(xbr, 2); + xbr = vaddq_u16(xtl, xbr); + + xb = vaddq_u16(xb, xt); + xb0 = vshlq_n_u16(xb, 1); + xb = vshlq_n_u16(xb, 2); + xb = vaddq_u16(xb, xb0); + + *a0 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_low_u16(xbr)), vmovl_u16(vget_low_u16(xb)))); + *a1 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_high_u16(xbr)), vmovl_u16(vget_high_u16(xb)))); +} + +static INLINE int32x4_t cross_sum_fast_odd_row(int32_t *buf) { + int32x4_t xl, x, xr; + int32x4_t fives, sixes, fives_plus_sixes; + + xl = vld1q_s32(buf - 1); + x = vld1q_s32(buf); + xr = vld1q_s32(buf + 1); + fives = vaddq_s32(xl, xr); + sixes = x; + fives_plus_sixes = vaddq_s32(fives, sixes); + + return vaddq_s32( + vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes); +} + +static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0, + int32x4_t *a1) { + uint16x8_t xl, x, xr; + uint16x8_t x0; + + xl = vld1q_u16(buf - 1); + x = vld1q_u16(buf); + xr = vld1q_u16(buf + 1); + xl = vaddq_u16(xl, xr); + x0 = 
vshlq_n_u16(xl, 2); + xl = vaddq_u16(xl, x0); + + x0 = vshlq_n_u16(x, 1); + x = vshlq_n_u16(x, 2); + x = vaddq_u16(x, x0); + + *a0 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_low_u16(xl)), vmovl_u16(vget_low_u16(x)))); + *a1 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_high_u16(xl)), vmovl_u16(vget_high_u16(x)))); +} + +static void final_filter_fast_internal(uint16_t *A, int32_t *B, + const int buf_stride, int16_t *src, + const int src_stride, int32_t *dst, + const int dst_stride, const int width, + const int height) { + int16x8_t s0; + int32_t *B_tmp, *dst_ptr; + uint16_t *A_tmp; + int16_t *src_ptr; + int32x4_t a_res0, a_res1, b_res0, b_res1; + int w, h, count = 0; + assert(SGRPROJ_SGR_BITS == 8); + assert(SGRPROJ_RST_BITS == 4); + + A_tmp = A; + B_tmp = B; + src_ptr = src; + dst_ptr = dst; + h = height; + do { + A_tmp = (A + count * buf_stride); + B_tmp = (B + count * buf_stride); + src_ptr = (src + count * src_stride); + dst_ptr = (dst + count * dst_stride); + w = width; + if (!(count & 1)) { + do { + s0 = vld1q_s16(src_ptr); + cross_sum_fast_even_row_inp16(A_tmp, buf_stride, &a_res0, &a_res1); + a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); + a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); + + b_res0 = cross_sum_fast_even_row(B_tmp, buf_stride); + b_res1 = cross_sum_fast_even_row(B_tmp + 4, buf_stride); + a_res0 = vaddq_s32(a_res0, b_res0); + a_res1 = vaddq_s32(a_res1, b_res1); + + a_res0 = + vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + a_res1 = + vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + + vst1q_s32(dst_ptr, a_res0); + vst1q_s32(dst_ptr + 4, a_res1); + + A_tmp += 8; + B_tmp += 8; + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + } else { + do { + s0 = vld1q_s16(src_ptr); + cross_sum_fast_odd_row_inp16(A_tmp, &a_res0, &a_res1); + a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); + a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); + + b_res0 = cross_sum_fast_odd_row(B_tmp); + b_res1 = cross_sum_fast_odd_row(B_tmp + 4); + a_res0 = vaddq_s32(a_res0, b_res0); + a_res1 = vaddq_s32(a_res1, b_res1); + + a_res0 = + vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS); + a_res1 = + vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS); + + vst1q_s32(dst_ptr, a_res0); + vst1q_s32(dst_ptr + 4, a_res1); + + A_tmp += 8; + B_tmp += 8; + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + } + count++; + h -= 1; + } while (h > 0); +} + +void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride, + int16_t *src, const int src_stride, int32_t *dst, + const int dst_stride, const int width, + const int height) { + int16x8_t s0; + int32_t *B_tmp, *dst_ptr; + uint16_t *A_tmp; + int16_t *src_ptr; + int32x4_t a_res0, a_res1, b_res0, b_res1; + int w, h, count = 0; + + assert(SGRPROJ_SGR_BITS == 8); + assert(SGRPROJ_RST_BITS == 4); + h = height; + + do { + A_tmp = (A + count * buf_stride); + B_tmp = (B + count * buf_stride); + src_ptr = (src + count * src_stride); + dst_ptr = (dst + count * dst_stride); + w = width; + do { + s0 = vld1q_s16(src_ptr); + cross_sum_inp_u16(A_tmp, buf_stride, &a_res0, &a_res1); + a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); + a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); + + b_res0 = cross_sum_inp_s32(B_tmp, buf_stride); + b_res1 = cross_sum_inp_s32(B_tmp + 4, buf_stride); + a_res0 = vaddq_s32(a_res0, b_res0); + a_res1 = vaddq_s32(a_res1, b_res1); + + a_res0 = + vrshrq_n_s32(a_res0, 
SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + a_res1 = + vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + vst1q_s32(dst_ptr, a_res0); + vst1q_s32(dst_ptr + 4, a_res1); + + A_tmp += 8; + B_tmp += 8; + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + count++; + h -= 1; + } while (h > 0); +} + +static INLINE void restoration_fast_internal(uint16_t *dgd16, int width, + int height, int dgd_stride, + int32_t *dst, int dst_stride, + int bit_depth, int sgr_params_idx, + int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + const int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + uint16_t A16_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *square_sum_buf = A_; + int32_t *sum_buf = B_; + uint16_t *tmp16_buf = A16_; + + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); + + assert(radius_idx == 0); + assert(r == 2); + + // input(dgd16) is 16bit. + // sum of pixels 1st stage output will be in 16bit(tmp16_buf). End output is + // kept in 32bit [sum_buf]. sum of squares output is kept in 32bit + // buffer(square_sum_buf). + boxsum2((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT - + SGRPROJ_BORDER_HORZ), + dgd_stride, (int16_t *)tmp16_buf, sum_buf, square_sum_buf, buf_stride, + width_ext, height_ext); + + square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + tmp16_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + + // Calculation of a, b. a output is in 16bit tmp_buf which is in range of + // [1, 256] for all bit depths. b output is kept in 32bit buffer. 
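+  //
+  // A rough scalar sketch of what the calc_ab_fast_* kernels vectorize
+  // (variable names below are illustrative, not part of this file): for each
+  // pixel, with n = (2 * r + 1) * (2 * r + 1) window samples,
+  //
+  //   p = (n * sum_sq >= sum * sum) ? n * sum_sq - sum * sum : 0;
+  //   a = AOMMIN(ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS), 255);
+  //   a = av1_x_by_xplus1[a];  // in [1, 256] afterwards
+  //   b = ROUND_POWER_OF_TWO((SGRPROJ_SGR - a) * sum * av1_one_by_x[n - 1],
+  //                          SGRPROJ_RECIP_BITS);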
+ +#if CONFIG_AV1_HIGHBITDEPTH + if (bit_depth > 8) { + calc_ab_fast_internal_hbd( + (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, + bit_depth, r, params->s[radius_idx], 2); + } else { + calc_ab_fast_internal_lbd( + (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r, + params->s[radius_idx], 2); + } +#else + (void)bit_depth; + calc_ab_fast_internal_lbd((square_sum_buf - buf_stride - 1), + (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, + width + 2, height + 2, r, params->s[radius_idx], 2); +#endif + final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16, + dgd_stride, dst, dst_stride, width, height); +} + +static INLINE void restoration_internal(uint16_t *dgd16, int width, int height, + int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, + int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + uint16_t A16_[RESTORATION_PROC_UNIT_PELS]; + uint16_t B16_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *square_sum_buf = A_; + uint16_t *sum_buf = B16_; + uint16_t *A16 = A16_; + int32_t *B = B_; + + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); + + assert(radius_idx == 1); + assert(r == 1); + + // input(dgd16) is 16bit. + // sum of pixels output will be in 16bit(sum_buf). + // sum of squares output is kept in 32bit buffer(square_sum_buf). + boxsum1((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT - + SGRPROJ_BORDER_HORZ), + dgd_stride, sum_buf, square_sum_buf, buf_stride, width_ext, + height_ext); + + square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + A16 += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + +#if CONFIG_AV1_HIGHBITDEPTH + // Calculation of a, b. a output is in 16bit tmp_buf which is in range of + // [1, 256] for all bit depths. b output is kept in 32bit buffer. 
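+  // The arithmetic is the same scalar recurrence sketched in
+  // restoration_fast_internal() above; this r == 1 path only differs in
+  // using n = 9 and keeping the first-stage pixel sums in a 16-bit buffer.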
+ if (bit_depth > 8) { + calc_ab_internal_hbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, + height + 2, bit_depth, r, params->s[radius_idx], 1); + } else { + calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, + height + 2, r, params->s[radius_idx], 1); + } +#else + (void)bit_depth; + calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, height + 2, + r, params->s[radius_idx], 1); +#endif + final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst, + dst_stride, width, height); +} + +static INLINE void src_convert_u8_to_u16(const uint8_t *src, + const int src_stride, uint16_t *dst, + const int dst_stride, const int width, + const int height) { + const uint8_t *src_ptr; + uint16_t *dst_ptr; + int h, w, count = 0; + + uint8x8_t t1, t2, t3, t4; + uint16x8_t s1, s2, s3, s4; + h = height; + do { + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + w = width; + if (w >= 7) { + do { + load_u8_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4); + s1 = vmovl_u8(t1); + s2 = vmovl_u8(t2); + s3 = vmovl_u8(t3); + s4 = vmovl_u8(t4); + store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4); + + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 7); + } + + for (int y = 0; y < w; y++) { + dst_ptr[y] = src_ptr[y]; + dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride]; + dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride]; + dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride]; + } + count++; + h -= 4; + } while (h > 3); + + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + for (int x = 0; x < h; x++) { + for (int y = 0; y < width; y++) { + dst_ptr[y + x * dst_stride] = src_ptr[y + x * src_stride]; + } + } + + // memset uninitialized rows of src buffer as they are needed for the + // boxsum filter calculation. + for (int x = height; x < height + 5; x++) + memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst)); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride, + uint16_t *dst, const int dst_stride, + int width, int height) { + const uint16_t *src_ptr; + uint16_t *dst_ptr; + int h, w, count = 0; + uint16x8_t s1, s2, s3, s4; + + h = height; + do { + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + w = width; + do { + load_u16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4); + store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 7); + + for (int y = 0; y < w; y++) { + dst_ptr[y] = src_ptr[y]; + dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride]; + dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride]; + dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride]; + } + count++; + h -= 4; + } while (h > 3); + + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + + for (int x = 0; x < h; x++) { + memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride), + sizeof(uint16_t) * width); + } + // memset uninitialized rows of src buffer as they are needed for the + // boxsum filter calculation. 
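+  // A plausible reading of the five-row clear below: boxsum accumulates over
+  // a (2 * r + 1)-row window and the 8-lane vector loads can read a full
+  // vector past the last valid row, so rows [height, height + 5) are zeroed
+  // to keep those trailing reads deterministic.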
+ for (int x = height; x < height + 5; x++) + memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst)); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, + int stride, int32_t *flt0, int32_t *flt1, + int flt_stride, int sgr_params_idx, + int bit_depth, int highbd) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + assert(!(params->r[0] == 0 && params->r[1] == 0)); + + uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS]; + const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ; + uint16_t *dgd16 = + dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + const int dgd_stride = stride; + +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); + src_convert_hbd_copy( + dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } else { + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } +#else + (void)highbd; + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); +#endif + + if (params->r[0] > 0) + restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, + flt_stride, bit_depth, sgr_params_idx, 0); + if (params->r[1] > 0) + restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride, + bit_depth, sgr_params_idx, 1); + return 0; +} + +void av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS]; + const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ; + uint16_t *dgd16 = + dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + const int dgd_stride = stride; + const sgr_params_type *const params = &av1_sgr_params[eps]; + int xq[2]; + + assert(!(params->r[0] == 0 && params->r[1] == 0)); + +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); + src_convert_hbd_copy( + dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } else { + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } +#else + (void)highbd; + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); +#endif + if (params->r[0] > 0) + 
restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width,
+                              bit_depth, eps, 0);
+  if (params->r[1] > 0)
+    restoration_internal(dgd16, width, height, dgd16_stride, flt1, width,
+                         bit_depth, eps, 1);
+
+  av1_decode_xq(xqd, xq, params);
+
+  {
+    int16_t *src_ptr;
+    uint8_t *dst_ptr;
+    uint16_t *dst16_ptr;
+    int16x4_t d0, d4;
+    int16x8_t r0, s0;
+    uint16x8_t r4;
+    int32x4_t u0, u4, v0, v4, f00, f10;
+    uint8x8_t t0;
+    int count = 0, w = width, h = height, rc = 0;
+
+    const int32x4_t xq0_vec = vdupq_n_s32(xq[0]);
+    const int32x4_t xq1_vec = vdupq_n_s32(xq[1]);
+    const int16x8_t zero = vdupq_n_s16(0);
+    const uint16x8_t max = vdupq_n_u16((1 << bit_depth) - 1);
+    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
+    dst_ptr = dst8;
+    src_ptr = (int16_t *)dgd16;
+    do {
+      w = width;
+      count = 0;
+      dst_ptr = dst8 + rc * dst_stride;
+      dst16_ptr = dst16 + rc * dst_stride;
+      do {
+        s0 = vld1q_s16(src_ptr + count);
+
+        u0 = vshll_n_s16(vget_low_s16(s0), SGRPROJ_RST_BITS);
+        u4 = vshll_n_s16(vget_high_s16(s0), SGRPROJ_RST_BITS);
+
+        v0 = vshlq_n_s32(u0, SGRPROJ_PRJ_BITS);
+        v4 = vshlq_n_s32(u4, SGRPROJ_PRJ_BITS);
+
+        if (params->r[0] > 0) {
+          f00 = vld1q_s32(flt0 + count);
+          f10 = vld1q_s32(flt0 + count + 4);
+
+          f00 = vsubq_s32(f00, u0);
+          f10 = vsubq_s32(f10, u4);
+
+          v0 = vmlaq_s32(v0, xq0_vec, f00);
+          v4 = vmlaq_s32(v4, xq0_vec, f10);
+        }
+
+        if (params->r[1] > 0) {
+          f00 = vld1q_s32(flt1 + count);
+          f10 = vld1q_s32(flt1 + count + 4);
+
+          f00 = vsubq_s32(f00, u0);
+          f10 = vsubq_s32(f10, u4);
+
+          v0 = vmlaq_s32(v0, xq1_vec, f00);
+          v4 = vmlaq_s32(v4, xq1_vec, f10);
+        }
+
+        d0 = vqrshrn_n_s32(v0, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+        d4 = vqrshrn_n_s32(v4, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+        r0 = vcombine_s16(d0, d4);
+
+        r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+        if (highbd) {
+          r4 = vminq_u16(r4, max);
+          vst1q_u16(dst16_ptr, r4);
+        } else {
+          t0 = vqmovn_u16(r4);
+          vst1_u8(dst_ptr, t0);
+        }
+#else
+        (void)max;
+        t0 = vqmovn_u16(r4);
+        vst1_u8(dst_ptr, t0);
+#endif
+        w -= 8;
+        count += 8;
+        dst_ptr += 8;
+        dst16_ptr += 8;
+      } while (w > 0);
+
+      src_ptr += dgd16_stride;
+      flt1 += width;
+      flt0 += width;
+      rc++;
+      h--;
+    } while (h > 0);
+  }
+}
diff --git a/libs/libaom/src/av1/common/arm/transpose_neon.h b/libs/libaom/src/av1/common/arm/transpose_neon.h
new file mode 100644
index 000000000..91d89b43f
--- /dev/null
+++ b/libs/libaom/src/av1/common/arm/transpose_neon.h
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#define AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
+                                    uint8x8_t *a6, uint8x8_t *a7) {
+  // Swap 8 bit elements.
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56 + // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57 + // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76 + // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77 + + const uint8x16x2_t b0 = + vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5)); + const uint8x16x2_t b1 = + vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7)); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74 + // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76 + // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75 + // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77 + + const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), + vreinterpretq_u16_u8(b1.val[0])); + const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), + vreinterpretq_u16_u8(b1.val[1])); + + // Unzip 32 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]), + vreinterpretq_u32_u16(c1.val[0])); + const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), + vreinterpretq_u32_u16(c1.val[1])); + + *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0])); + *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0])); + *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); + *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); + *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); + *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); + *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1])); + *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); +} + +static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3) { + // Swap 8 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + + const uint8x8x2_t b0 = vtrn_u8(*a0, *a1); + const uint8x8x2_t b1 = vtrn_u8(*a2, *a3); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + + const uint16x4x2_t c0 = + vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0])); + const uint16x4x2_t c1 = + vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1])); + + *a0 = vreinterpret_u8_u16(c0.val[0]); + *a1 = vreinterpret_u8_u16(c1.val[0]); + *a2 = vreinterpret_u8_u16(c0.val[1]); + *a3 = vreinterpret_u8_u16(c1.val[1]); +} + +static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 10 11 12 13 + // a1: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const uint16x4x2_t b0 = + vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1)); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 01 20 21 02 03 22 23 + // c0.val[1]: 10 11 30 31 12 13 32 33 + + const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]), + vreinterpret_u32_u16(b0.val[1])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 02 12 22 32 + // d0.val[1]: 01 11 21 31 03 13 23 33 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1])); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; +} + +static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, const uint8x8_t a4, + const uint8x8_t a5, const uint8x8_t a6, + const uint8x8_t a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 XX XX XX XX + // a1: 10 11 12 13 XX XX XX XX + // a2: 20 21 22 23 XX XX XX XX + // a3; 30 31 32 33 XX XX XX XX + // a4: 40 41 42 43 XX XX XX XX + // a5: 50 51 52 53 XX XX XX XX + // a6: 60 61 62 63 XX XX XX XX + // a7: 70 71 72 73 XX XX XX XX + // to: + // b0.val[0]: 00 01 02 03 40 41 42 43 + // b1.val[0]: 10 11 12 13 50 51 52 53 + // b2.val[0]: 20 21 22 23 60 61 62 63 + // b3.val[0]: 30 31 32 33 70 71 72 73 + + const uint32x2x2_t b0 = + vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4)); + const uint32x2x2_t b1 = + vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5)); + const uint32x2x2_t b2 = + vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6)); + const uint32x2x2_t b3 = + vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7)); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 01 20 21 40 41 60 61 + // c0.val[1]: 02 03 22 23 42 43 62 63 + // c1.val[0]: 10 11 30 31 50 51 70 71 + // c1.val[1]: 12 13 32 33 52 53 72 73 + + const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]), + vreinterpret_u16_u32(b2.val[0])); + const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]), + vreinterpret_u16_u32(b3.val[0])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 01 11 21 31 41 51 61 71 + // d1.val[0]: 02 12 22 32 42 52 62 72 + // d1.val[1]: 03 13 23 33 43 53 63 73 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0])); + const uint8x8x2_t d1 = + vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1])); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; + *a2 = d1.val[0]; + *a3 = d1.val[1]; +} + +static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1, + uint16x4_t *a2, uint16x4_t *a3, + uint16x4_t *a4, uint16x4_t *a5, + uint16x4_t *a6, uint16x4_t *a7, + uint16x8_t *o0, uint16x8_t *o1, + uint16x8_t *o2, uint16x8_t *o3) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + uint16x4x2_t b0 = vtrn_u16(*a0, *a1); + uint16x4x2_t b1 = vtrn_u16(*a2, *a3); + uint16x4x2_t b2 = vtrn_u16(*a4, *a5); + uint16x4x2_t b3 = vtrn_u16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]), + vreinterpret_u32_u16(b1.val[0])); + uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]), + vreinterpret_u32_u16(b1.val[1])); + uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]), + vreinterpret_u32_u16(b3.val[0])); + uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]), + vreinterpret_u32_u16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // o0: 00 10 20 30 40 50 60 70 + // o1: 01 11 21 31 41 51 61 71 + // o2: 02 12 22 32 42 52 62 72 + // o3: 03 13 23 33 43 53 63 73 + + *o0 = vcombine_u16(vreinterpret_u16_u32(c0.val[0]), + vreinterpret_u16_u32(c2.val[0])); + *o1 = vcombine_u16(vreinterpret_u16_u32(c1.val[0]), + vreinterpret_u16_u32(c3.val[0])); + *o2 = vcombine_u16(vreinterpret_u16_u32(c0.val[1]), + vreinterpret_u16_u32(c2.val[1])); + *o3 = vcombine_u16(vreinterpret_u16_u32(c1.val[1]), + vreinterpret_u16_u32(c3.val[1])); +} + +static INLINE void transpose_s16_4x8(int16x4_t *a0, int16x4_t *a1, + int16x4_t *a2, int16x4_t *a3, + int16x4_t *a4, int16x4_t *a5, + int16x4_t *a6, int16x4_t *a7, + int16x8_t *o0, int16x8_t *o1, + int16x8_t *o2, int16x8_t *o3) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + int16x4x2_t b0 = vtrn_s16(*a0, *a1); + int16x4x2_t b1 = vtrn_s16(*a2, *a3); + int16x4x2_t b2 = vtrn_s16(*a4, *a5); + int16x4x2_t b3 = vtrn_s16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), + vreinterpret_s32_s16(b1.val[0])); + int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), + vreinterpret_s32_s16(b1.val[1])); + int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]), + vreinterpret_s32_s16(b3.val[0])); + int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]), + vreinterpret_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // o0: 00 10 20 30 40 50 60 70 + // o1: 01 11 21 31 41 51 61 71 + // o2: 02 12 22 32 42 52 62 72 + // o3: 03 13 23 33 43 53 63 73 + + *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]), + vreinterpret_s16_s32(c2.val[0])); + *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]), + vreinterpret_s16_s32(c3.val[0])); + *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]), + vreinterpret_s16_s32(c2.val[1])); + *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]), + vreinterpret_s16_s32(c3.val[1])); +} + +static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, + uint16x8_t *a2, uint16x8_t *a3, + uint16x8_t *a4, uint16x8_t *a5, + uint16x8_t *a6, uint16x8_t *a7) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1); + const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3); + const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5); + const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), + vreinterpretq_u32_u16(b1.val[0])); + const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), + vreinterpretq_u32_u16(b1.val[1])); + const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]), + vreinterpretq_u32_u16(b3.val[0])); + const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]), + vreinterpretq_u32_u16(b3.val[1])); + + *a0 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[0])), + vget_low_u16(vreinterpretq_u16_u32(c2.val[0]))); + *a4 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[0])), + vget_high_u16(vreinterpretq_u16_u32(c2.val[0]))); + + *a2 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[1])), + vget_low_u16(vreinterpretq_u16_u32(c2.val[1]))); + *a6 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[1])), + vget_high_u16(vreinterpretq_u16_u32(c2.val[1]))); + + *a1 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[0])), + vget_low_u16(vreinterpretq_u16_u32(c3.val[0]))); + *a5 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[0])), + vget_high_u16(vreinterpretq_u16_u32(c3.val[0]))); + + *a3 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[1])), + vget_low_u16(vreinterpretq_u16_u32(c3.val[1]))); + *a7 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[1])), + vget_high_u16(vreinterpretq_u16_u32(c3.val[1]))); +} + +static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, int16x8_t *a3, + int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(*a0, *a1); + const int16x8x2_t b1 = vtrnq_s16(*a2, *a3); + const int16x8x2_t b2 = vtrnq_s16(*a4, *a5); + const int16x8x2_t b3 = vtrnq_s16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + *a0 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[0])), + vget_low_s16(vreinterpretq_s16_s32(c2.val[0]))); + *a4 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[0])), + vget_high_s16(vreinterpretq_s16_s32(c2.val[0]))); + + *a2 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[1])), + vget_low_s16(vreinterpretq_s16_s32(c2.val[1]))); + *a6 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[1])), + vget_high_s16(vreinterpretq_s16_s32(c2.val[1]))); + + *a1 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[0])), + vget_low_s16(vreinterpretq_s16_s32(c3.val[0]))); + *a5 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[0])), + vget_high_s16(vreinterpretq_s16_s32(c3.val[0]))); + + *a3 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[1])), + vget_low_s16(vreinterpretq_s16_s32(c3.val[1]))); + *a7 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[1])), + vget_high_s16(vreinterpretq_s16_s32(c3.val[1]))); +} + +static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { + int16x8x2_t b0; + b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), + vreinterpret_s16_s32(vget_low_s32(a1))); + b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)), + vreinterpret_s16_s32(vget_high_s32(a1))); + return b0; +} + +static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(*a0, *(a0 + 1)); + const int16x8x2_t b1 = vtrnq_s16(*(a0 + 2), *(a0 + 3)); + const int16x8x2_t b2 = vtrnq_s16(*(a0 + 4), *(a0 + 5)); + const int16x8x2_t b3 = vtrnq_s16(*(a0 + 6), *(a0 + 7)); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + + *out = d0.val[0]; + *(out + 1) = d1.val[0]; + *(out + 2) = d2.val[0]; + *(out + 3) = d3.val[0]; + *(out + 4) = d0.val[1]; + *(out + 5) = d1.val[1]; + *(out + 6) = d2.val[1]; + *(out + 7) = d3.val[1]; +} + +static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1, + int16x4_t *a2, int16x4_t *a3) { + // Swap 16 bit elements. 
Goes from:
+  // a0: 00 01 02 03
+  // a1: 10 11 12 13
+  // a2: 20 21 22 23
+  // a3: 30 31 32 33
+  // to:
+  // b0.val[0]: 00 10 02 12
+  // b0.val[1]: 01 11 03 13
+  // b1.val[0]: 20 30 22 32
+  // b1.val[1]: 21 31 23 33
+
+  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
+  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30
+  // c0.val[1]: 02 12 22 32
+  // c1.val[0]: 01 11 21 31
+  // c1.val[1]: 03 13 23 33
+
+  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+                                  vreinterpret_s32_s16(b1.val[0]));
+  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+                                  vreinterpret_s32_s16(b1.val[1]));
+
+  *a0 = vreinterpret_s16_s32(c0.val[0]);
+  *a1 = vreinterpret_s16_s32(c1.val[0]);
+  *a2 = vreinterpret_s16_s32(c0.val[1]);
+  *a3 = vreinterpret_s16_s32(c1.val[1]);
+}
+
+static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+  int32x4x2_t b0;
+  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
+  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+  return b0;
+}
+
+static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+                                     int32x4_t *a2, int32x4_t *a3) {
+  // Swap 32 bit elements. Goes from:
+  // a0: 00 01 02 03
+  // a1: 10 11 12 13
+  // a2: 20 21 22 23
+  // a3: 30 31 32 33
+  // to:
+  // b0.val[0]: 00 10 02 12
+  // b0.val[1]: 01 11 03 13
+  // b1.val[0]: 20 30 22 32
+  // b1.val[1]: 21 31 23 33
+
+  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
+  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
+
+  // Swap 64 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30
+  // c0.val[1]: 02 12 22 32
+  // c1.val[0]: 01 11 21 31
+  // c1.val[1]: 03 13 23 33
+
+  const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
+  const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
+
+  *a0 = c0.val[0];
+  *a1 = c1.val[0];
+  *a2 = c0.val[1];
+  *a3 = c1.val[1];
+}
+
+#endif  // AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
diff --git a/libs/libaom/src/av1/common/arm/warp_plane_neon.c b/libs/libaom/src/av1/common/arm/warp_plane_neon.c
new file mode 100644
index 000000000..c10a34fcd
--- /dev/null
+++ b/libs/libaom/src/av1/common/arm/warp_plane_neon.c
@@ -0,0 +1,714 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <math.h>
+#include <memory.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
+   * Each coefficient is stored in 8 bits instead of 16 bits
+   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
+
+     This is done in order to avoid overflow: Since the tap with the largest
+     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
+     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
+     convolve functions.
+
+     Instead, we use the summation order
+     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
+ The rearrangement of coefficients in this table is so that we can get the + coefficients into the correct order more quickly. +*/ +/* clang-format off */ +DECLARE_ALIGNED(8, static const int8_t, + filter_8bit_neon[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { +#if WARPEDPIXEL_PREC_BITS == 6 + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0}, + { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0}, + { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0}, + { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0}, + { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0}, + { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0}, + { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0}, + { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0}, + { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0}, + { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0}, + { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0}, + { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0}, + { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0}, + { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0}, + { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0}, + { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0}, + { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0}, + { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0}, + { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1}, + {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1}, + {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1}, + {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2}, + {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2}, + {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2}, + {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2}, + {-2, -21, 74, 8, 8, 84, -21, 
-2}, {-2, -22, 77, 8, 8, 82, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2}, + {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2}, + {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2}, + {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2}, + {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2}, + {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2}, + {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1}, + {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0}, + { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0}, + { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1}, + { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1}, + { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2}, + { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3}, + { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3}, + { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4}, + { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4}, + { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4}, + { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4}, + { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4}, + { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4}, + { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3}, + { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2}, + { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2}, + { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1}, + { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0}, + // dummy (replicate row index 191) + { 0, 0, 2, -1, 0, 0, 127, 0}, + +#else + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0}, + { 
2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0}, + { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1}, + // dummy (replicate row index 95) + { 0, 0, 4, -3, 0, -1, 127, 1}, +#endif // WARPEDPIXEL_PREC_BITS == 6 +}; +/* clang-format on */ + +static INLINE void convolve(int32x2x2_t x0, int32x2x2_t x1, uint8x8_t src_0, + uint8x8_t src_1, int16x4_t *res) { + int16x8_t coeff_0, coeff_1; + int16x8_t pix_0, pix_1; + + coeff_0 = vcombine_s16(vreinterpret_s16_s32(x0.val[0]), + 
vreinterpret_s16_s32(x1.val[0])); + coeff_1 = vcombine_s16(vreinterpret_s16_s32(x0.val[1]), + vreinterpret_s16_s32(x1.val[1])); + + pix_0 = vreinterpretq_s16_u16(vmovl_u8(src_0)); + pix_0 = vmulq_s16(coeff_0, pix_0); + + pix_1 = vreinterpretq_s16_u16(vmovl_u8(src_1)); + pix_0 = vmlaq_s16(pix_0, coeff_1, pix_1); + + *res = vpadd_s16(vget_low_s16(pix_0), vget_high_s16(pix_0)); +} + +static INLINE void horizontal_filter_neon(uint8x16_t src_1, uint8x16_t src_2, + uint8x16_t src_3, uint8x16_t src_4, + int16x8_t *tmp_dst, int sx, int alpha, + int k, const int offset_bits_horiz, + const int reduce_bits_horiz) { + const uint8x16_t mask = { 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0 }; + const int32x4_t add_const = vdupq_n_s32((int32_t)(1 << offset_bits_horiz)); + const int16x8_t shift = vdupq_n_s16(-(int16_t)reduce_bits_horiz); + + int16x8_t f0, f1, f2, f3, f4, f5, f6, f7; + int32x2x2_t b0, b1; + uint8x8_t src_1_low, src_2_low, src_3_low, src_4_low, src_5_low, src_6_low; + int32x4_t tmp_res_low, tmp_res_high; + uint16x8_t res; + int16x4_t res_0246_even, res_0246_odd, res_1357_even, res_1357_odd; + + uint8x16_t tmp_0 = vandq_u8(src_1, mask); + uint8x16_t tmp_1 = vandq_u8(src_2, mask); + uint8x16_t tmp_2 = vandq_u8(src_3, mask); + uint8x16_t tmp_3 = vandq_u8(src_4, mask); + + tmp_2 = vextq_u8(tmp_0, tmp_0, 1); + tmp_3 = vextq_u8(tmp_1, tmp_1, 1); + + src_1 = vaddq_u8(tmp_0, tmp_2); + src_2 = vaddq_u8(tmp_1, tmp_3); + + src_1_low = vget_low_u8(src_1); + src_2_low = vget_low_u8(src_2); + src_3_low = vget_low_u8(vextq_u8(src_1, src_1, 4)); + src_4_low = vget_low_u8(vextq_u8(src_2, src_2, 4)); + src_5_low = vget_low_u8(vextq_u8(src_1, src_1, 2)); + src_6_low = vget_low_u8(vextq_u8(src_1, src_1, 6)); + + // Loading the 8 filter taps + f0 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS])); + f1 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS])); + f2 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS])); + f3 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS])); + f4 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS])); + f5 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS])); + f6 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS])); + f7 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS])); + + b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f0)), + vreinterpret_s32_s16(vget_low_s16(f2))); + b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f4)), + vreinterpret_s32_s16(vget_low_s16(f6))); + convolve(b0, b1, src_1_low, src_3_low, &res_0246_even); + + b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f1)), + vreinterpret_s32_s16(vget_low_s16(f3))); + b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f5)), + vreinterpret_s32_s16(vget_low_s16(f7))); + convolve(b0, b1, src_2_low, src_4_low, &res_0246_odd); + + b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f0)), + vreinterpret_s32_s16(vget_high_s16(f2))); + b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f4)), + vreinterpret_s32_s16(vget_high_s16(f6))); + convolve(b0, b1, src_2_low, src_4_low, &res_1357_even); + + b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f1)), + vreinterpret_s32_s16(vget_high_s16(f3))); + b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f5)), + vreinterpret_s32_s16(vget_high_s16(f7))); + convolve(b0, b1, src_5_low, src_6_low, &res_1357_odd); + + tmp_res_low = 
vaddl_s16(res_0246_even, res_1357_even); + tmp_res_high = vaddl_s16(res_0246_odd, res_1357_odd); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + tmp_res_high = vaddq_s32(tmp_res_high, add_const); + + res = vcombine_u16(vqmovun_s32(tmp_res_low), vqmovun_s32(tmp_res_high)); + res = vqrshlq_u16(res, shift); + + tmp_dst[k + 7] = vreinterpretq_s16_u16(res); +} + +static INLINE void vertical_filter_neon(const int16x8_t *src, + int32x4_t *res_low, int32x4_t *res_high, + int sy, int gamma) { + int16x4_t src_0, src_1, fltr_0, fltr_1; + int32x4_t res_0, res_1; + int32x2_t res_0_im, res_1_im; + int32x4_t res_even, res_odd, im_res_0, im_res_1; + + int16x8_t f0, f1, f2, f3, f4, f5, f6, f7; + int16x8x2_t b0, b1, b2, b3; + int32x4x2_t c0, c1, c2, c3; + int32x4x2_t d0, d1, d2, d3; + + b0 = vtrnq_s16(src[0], src[1]); + b1 = vtrnq_s16(src[2], src[3]); + b2 = vtrnq_s16(src[4], src[5]); + b3 = vtrnq_s16(src[6], src[7]); + + c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b0.val[1])); + c1 = vtrnq_s32(vreinterpretq_s32_s16(b1.val[0]), + vreinterpretq_s32_s16(b1.val[1])); + c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b2.val[1])); + c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]), + vreinterpretq_s32_s16(b3.val[1])); + + f0 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + f1 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + f2 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + f3 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + f4 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + f5 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + f6 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + f7 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2)); + d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6)); + d2 = vtrnq_s32(vreinterpretq_s32_s16(f1), vreinterpretq_s32_s16(f3)); + d3 = vtrnq_s32(vreinterpretq_s32_s16(f5), vreinterpretq_s32_s16(f7)); + + // row:0,1 even_col:0,2 + src_0 = vget_low_s16(vreinterpretq_s16_s32(c0.val[0])); + fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[0])); + res_0 = vmull_s16(src_0, fltr_0); + + // row:0,1,2,3 even_col:0,2 + src_0 = vget_low_s16(vreinterpretq_s16_s32(c1.val[0])); + fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[1])); + res_0 = vmlal_s16(res_0, src_0, fltr_0); + res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); + + // row:0,1 even_col:4,6 + src_1 = vget_low_s16(vreinterpretq_s16_s32(c0.val[1])); + fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[0])); + res_1 = vmull_s16(src_1, fltr_1); + + // row:0,1,2,3 even_col:4,6 + src_1 = vget_low_s16(vreinterpretq_s16_s32(c1.val[1])); + fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[1])); + res_1 = vmlal_s16(res_1, src_1, fltr_1); + res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1)); + + // row:0,1,2,3 even_col:0,2,4,6 + im_res_0 = vcombine_s32(res_0_im, res_1_im); + + // row:4,5 even_col:0,2 + src_0 = vget_low_s16(vreinterpretq_s16_s32(c2.val[0])); + fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[0])); + res_0 = vmull_s16(src_0, fltr_0); + + // row:4,5,6,7 even_col:0,2 + src_0 = 
vget_low_s16(vreinterpretq_s16_s32(c3.val[0])); + fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[1])); + res_0 = vmlal_s16(res_0, src_0, fltr_0); + res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); + + // row:4,5 even_col:4,6 + src_1 = vget_low_s16(vreinterpretq_s16_s32(c2.val[1])); + fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[0])); + res_1 = vmull_s16(src_1, fltr_1); + + // row:4,5,6,7 even_col:4,6 + src_1 = vget_low_s16(vreinterpretq_s16_s32(c3.val[1])); + fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[1])); + res_1 = vmlal_s16(res_1, src_1, fltr_1); + res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1)); + + // row:4,5,6,7 even_col:0,2,4,6 + im_res_1 = vcombine_s32(res_0_im, res_1_im); + + // row:0-7 even_col:0,2,4,6 + res_even = vaddq_s32(im_res_0, im_res_1); + + // row:0,1 odd_col:1,3 + src_0 = vget_high_s16(vreinterpretq_s16_s32(c0.val[0])); + fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[0])); + res_0 = vmull_s16(src_0, fltr_0); + + // row:0,1,2,3 odd_col:1,3 + src_0 = vget_high_s16(vreinterpretq_s16_s32(c1.val[0])); + fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[1])); + res_0 = vmlal_s16(res_0, src_0, fltr_0); + res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); + + // row:0,1 odd_col:5,7 + src_1 = vget_high_s16(vreinterpretq_s16_s32(c0.val[1])); + fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[0])); + res_1 = vmull_s16(src_1, fltr_1); + + // row:0,1,2,3 odd_col:5,7 + src_1 = vget_high_s16(vreinterpretq_s16_s32(c1.val[1])); + fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[1])); + res_1 = vmlal_s16(res_1, src_1, fltr_1); + res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1)); + + // row:0,1,2,3 odd_col:1,3,5,7 + im_res_0 = vcombine_s32(res_0_im, res_1_im); + + // row:4,5 odd_col:1,3 + src_0 = vget_high_s16(vreinterpretq_s16_s32(c2.val[0])); + fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[0])); + res_0 = vmull_s16(src_0, fltr_0); + + // row:4,5,6,7 odd_col:1,3 + src_0 = vget_high_s16(vreinterpretq_s16_s32(c3.val[0])); + fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[1])); + res_0 = vmlal_s16(res_0, src_0, fltr_0); + res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); + + // row:4,5 odd_col:5,7 + src_1 = vget_high_s16(vreinterpretq_s16_s32(c2.val[1])); + fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[0])); + res_1 = vmull_s16(src_1, fltr_1); + + // row:4,5,6,7 odd_col:5,7 + src_1 = vget_high_s16(vreinterpretq_s16_s32(c3.val[1])); + fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[1])); + res_1 = vmlal_s16(res_1, src_1, fltr_1); + res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1)); + + // row:4,5,6,7 odd_col:1,3,5,7 + im_res_1 = vcombine_s32(res_0_im, res_1_im); + + // row:0-7 odd_col:1,3,5,7 + res_odd = vaddq_s32(im_res_0, im_res_1); + + // reordering as 0 1 2 3 | 4 5 6 7 + c0 = vtrnq_s32(res_even, res_odd); + + // Final store + *res_low = vcombine_s32(vget_low_s32(c0.val[0]), vget_low_s32(c0.val[1])); + *res_high = vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[1])); +} + +void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + int16x8_t tmp[15]; + const int bd = 8; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const int32x4_t fwd = 
vdupq_n_s32((int32_t)w0); + const int32x4_t bwd = vdupq_n_s32((int32_t)w1); + const int16x8_t sub_constant = vdupq_n_s16((1 << (bd - 1)) + (1 << bd)); + + int limit = 0; + uint8x16_t vec_dup, mask_val; + int32x4_t res_lo, res_hi; + int16x8_t result_final; + uint8x16_t src_1, src_2, src_3, src_4; + uint8x16_t indx_vec = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + }; + uint8x16_t cmp_vec; + + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int32x4_t shift_vert = vdupq_n_s32(-(int32_t)reduce_bits_vert); + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + int32x4_t add_const_vert = vdupq_n_s32((int32_t)(1 << offset_bits_vert)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int16x4_t round_bits_vec = vdup_n_s16(-(int16_t)round_bits); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int16x4_t res_sub_const = + vdup_n_s16(-((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)))); + int k; + + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + for (int i = 0; i < p_height; i += 8) { + for (int j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + // horizontal + if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int16_t dup_val = + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)); + + tmp[k + 7] = vdupq_n_s16(dup_val); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz)); + tmp[k + 7] = vdupq_n_s16(dup_val); + } + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + const uint8_t *src = ref + iy * stride + ix4 - 7; + src_1 = vld1q_u8(src); + + if (out_of_boundary_left >= 0) { + 
limit = out_of_boundary_left + 1; + cmp_vec = vdupq_n_u8(out_of_boundary_left); + vec_dup = vdupq_n_u8(*(src + limit)); + mask_val = vcleq_u8(indx_vec, cmp_vec); + src_1 = vbslq_u8(mask_val, vec_dup, src_1); + } + if (out_of_boundary_right >= 0) { + limit = 15 - (out_of_boundary_right + 1); + cmp_vec = vdupq_n_u8(15 - out_of_boundary_right); + vec_dup = vdupq_n_u8(*(src + limit)); + mask_val = vcgeq_u8(indx_vec, cmp_vec); + src_1 = vbslq_u8(mask_val, vec_dup, src_1); + } + src_2 = vextq_u8(src_1, src_1, 1); + src_3 = vextq_u8(src_2, src_2, 1); + src_4 = vextq_u8(src_3, src_3, 1); + + horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k, + offset_bits_horiz, reduce_bits_horiz); + } + } else { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + const uint8_t *src = ref + iy * stride + ix4 - 7; + src_1 = vld1q_u8(src); + src_2 = vextq_u8(src_1, src_1, 1); + src_3 = vextq_u8(src_2, src_2, 1); + src_4 = vextq_u8(src_3, src_3, 1); + + horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k, + offset_bits_horiz, reduce_bits_horiz); + } + } + + // vertical + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + const int16x8_t *v_src = tmp + (k + 4); + + vertical_filter_neon(v_src, &res_lo, &res_hi, sy, gamma); + + res_lo = vaddq_s32(res_lo, add_const_vert); + res_hi = vaddq_s32(res_hi, add_const_vert); + + if (conv_params->is_compound) { + uint16_t *const p = + (uint16_t *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j]; + + res_lo = vrshlq_s32(res_lo, shift_vert); + if (conv_params->do_average) { + uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j]; + uint16x4_t tmp16_lo = vld1_u16(p); + int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo)); + int16x4_t tmp16_low; + if (conv_params->use_dist_wtd_comp_avg) { + res_lo = vmulq_s32(res_lo, bwd); + tmp32_lo = vmulq_s32(tmp32_lo, fwd); + tmp32_lo = vaddq_s32(tmp32_lo, res_lo); + tmp16_low = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS); + } else { + tmp32_lo = vaddq_s32(tmp32_lo, res_lo); + tmp16_low = vshrn_n_s32(tmp32_lo, 1); + } + int16x4_t res_low = vadd_s16(tmp16_low, res_sub_const); + res_low = vqrshl_s16(res_low, round_bits_vec); + int16x8_t final_res_low = vcombine_s16(res_low, res_low); + uint8x8_t res_8_low = vqmovun_s16(final_res_low); + + vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res_8_low), 0); + } else { + uint16x4_t res_u16_low = vqmovun_s32(res_lo); + vst1_u16(p, res_u16_low); + } + if (p_width > 4) { + uint16_t *const p4 = + (uint16_t *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + + res_hi = vrshlq_s32(res_hi, shift_vert); + if (conv_params->do_average) { + uint8_t *const dst8_4 = &pred[(i + k + 4) * p_stride + j + 4]; + + uint16x4_t tmp16_hi = vld1_u16(p4); + int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi)); + int16x4_t tmp16_high; + if (conv_params->use_dist_wtd_comp_avg) { + res_hi = vmulq_s32(res_hi, bwd); + tmp32_hi = vmulq_s32(tmp32_hi, fwd); + tmp32_hi = vaddq_s32(tmp32_hi, res_hi); + tmp16_high = vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS); + } else { + tmp32_hi = vaddq_s32(tmp32_hi, res_hi); + tmp16_high = vshrn_n_s32(tmp32_hi, 1); + } + int16x4_t res_high = vadd_s16(tmp16_high, res_sub_const); + res_high = vqrshl_s16(res_high, round_bits_vec); + int16x8_t final_res_high = vcombine_s16(res_high, res_high); + uint8x8_t res_8_high = vqmovun_s16(final_res_high); 
+
+              vst1_lane_u32((uint32_t *)dst8_4, vreinterpret_u32_u8(res_8_high),
+                            0);
+            } else {
+              uint16x4_t res_u16_high = vqmovun_s32(res_hi);
+              vst1_u16(p4, res_u16_high);
+            }
+          }
+        } else {
+          res_lo = vrshlq_s32(res_lo, shift_vert);
+          res_hi = vrshlq_s32(res_hi, shift_vert);
+
+          result_final = vcombine_s16(vmovn_s32(res_lo), vmovn_s32(res_hi));
+          result_final = vsubq_s16(result_final, sub_constant);
+
+          uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
+          uint8x8_t val = vqmovun_s16(result_final);
+
+          if (p_width == 4) {
+            vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0);
+          } else {
+            vst1_u8(p, val);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/libs/libaom/src/av1/common/arm/wiener_convolve_neon.c b/libs/libaom/src/av1/common/arm/wiener_convolve_neon.c
new file mode 100644
index 000000000..a9bb5bcf0
--- /dev/null
+++ b/libs/libaom/src/av1/common/arm/wiener_convolve_neon.c
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+/* Wiener filter 2D
+   Apply the horizontal filter and store the result in a temporary buffer.
+   When applying the vertical filter, overwrite the original pixel values.
+ */
+void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h,
+                                      const ConvolveParams *conv_params) {
+  uint16_t *d_tmp;
+  uint8_t *d;
+  const uint8_t *src_ptr, *s_tmp;
+  uint16_t *dst_ptr;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  int width, height;
+  const int bd = 8;
+  const int intermediate_height = h + SUBPEL_TAPS - 1;
+  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+  int16_t filter_x_tmp[7], filter_y_tmp[7];
+
+  DECLARE_ALIGNED(16, uint16_t,
+                  temp[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(!(w % 8));
+
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
+
+  assert(filter_x[7] == 0);
+  assert(filter_y[7] == 0);
+
+  /* Assumption: the output of the horizontal filtering fits in 15 bits, i.e.
+ ((bd) + 1 + FILTER_BITS - conv_params->round_0) <= 15 + 16 - conv_params->round_0 <= 15 -- (conv_params->round_0) >= 1 + */ + assert((conv_params->round_0) >= 1); + + memcpy(&filter_x_tmp[0], filter_x, sizeof(*filter_x) * FILTER_BITS); + memcpy(&filter_y_tmp[0], filter_y, sizeof(*filter_y) * FILTER_BITS); + + filter_x_tmp[3] += (1 << FILTER_BITS); + filter_y_tmp[3] += (1 << FILTER_BITS); + + s_tmp = src - center_tap * src_stride - center_tap; + dst_ptr = temp; + src_ptr = s_tmp; + height = intermediate_height; + + /* if height is a multiple of 8 */ + if (!(h & 7)) { + int16x8_t res0, res1, res2, res3; + uint16x8_t res4; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; +#if defined(__aarch64__) + uint16x8_t res5, res6, res7, res8, res9, res10, res11; + uint8x8_t t8, t9, t10, t11, t12, t13, t14; + + do { + const uint8_t *s; + + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + __builtin_prefetch(src_ptr + 4 * src_stride); + __builtin_prefetch(src_ptr + 5 * src_stride); + __builtin_prefetch(src_ptr + 6 * src_stride); + __builtin_prefetch(src_ptr + 7 * src_stride); + + load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + s = src_ptr + 7; + d_tmp = dst_ptr; + width = w; + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + __builtin_prefetch(dst_ptr + 4 * dst_stride); + __builtin_prefetch(dst_ptr + 5 * dst_stride); + __builtin_prefetch(dst_ptr + 6 * dst_stride); + __builtin_prefetch(dst_ptr + 7 * dst_stride); + + do { + load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14); + transpose_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t0, t6)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t1, t7)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t2, t6)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t3, t5)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t4)); + res5 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t2, t8)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t3, t7)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t4, t6)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t5)); + res6 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t3, t9)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t4, t8)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t5, t7)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t6)); + res7 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t4, t10)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t5, t9)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t6, t8)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t7)); + res8 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t5, t11)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t6, t10)); + 
res2 = vreinterpretq_s16_u16(vaddl_u8(t7, t9)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t8)); + res9 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t6, t12)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t7, t11)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t8, t10)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t9)); + res10 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t7, t13)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t8, t12)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t9, t11)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t10)); + res11 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + transpose_u16_8x8(&res4, &res5, &res6, &res7, &res8, &res9, &res10, + &res11); + store_u16_8x8(d_tmp, MAX_SB_SIZE, res4, res5, res6, res7, res8, res9, + res10, res11); + + t0 = t8; + t1 = t9; + t2 = t10; + t3 = t11; + t4 = t12; + t5 = t13; + t6 = t14; + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * MAX_SB_SIZE; + height -= 8; + } while (height > 0); +#else + uint8x8_t temp_0; + + do { + const uint8_t *s; + + __builtin_prefetch(src_ptr); + + t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + s = src_ptr + 8; + d_tmp = dst_ptr; + width = w; + + __builtin_prefetch(dst_ptr); + + do { + t7 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + temp_0 = t0; + t0 = t7; + + t1 = vext_u8(temp_0, t7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + t2 = vext_u8(temp_0, t7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + t3 = vext_u8(temp_0, t7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + t4 = vext_u8(temp_0, t7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + t5 = vext_u8(temp_0, t7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + t6 = vext_u8(temp_0, t7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + t7 = vext_u8(temp_0, t7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + res0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + vst1q_u16(d_tmp, res4); + + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += MAX_SB_SIZE; + height--; + } while (height > 0); +#endif + } else { + /*if height is a multiple of 4*/ + const uint8_t *s; + int16x8_t tt0, tt1, tt2, tt3; + uint16x8_t d0; + uint8x8_t t0, t1, t2, t3; + +#if defined(__aarch64__) + uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7; + uint16x8_t d1, d2, d3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int16x4_t s11, s12, s13, s14; + do { + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + + load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3); /*8x4*/ + transpose_u8_8x4(&t0, &t1, &t2, + &t3); /*first 8 pixels of 4 rows transposed-- 4x8*/ + + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + s0 = vget_low_s16(tt0); /*pa0 pb0 pc0 pd0 -- pixel_a0*/ + s1 = vget_low_s16(tt1); /*pa1 pb1 pc1 pd1 */ + s2 = vget_low_s16(tt2); /*pa2 pb2 pc2 pd2 */ + s3 = vget_low_s16(tt3); /*pa3 pb3 pc3 pd3 */ + s4 = 
vget_high_s16(tt0); /*pa4 pb4 pc4 pd4 */ + s5 = vget_high_s16(tt1); /*pa5 pb5 pc5 pd5 */ + s6 = vget_high_s16(tt2); /*pa6 pb6 pc6 pd6 */ + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + + s = src_ptr + 7; + d_tmp = dst_ptr; + width = w; + + do { + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); /*8x4*/ + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + s7 = vget_low_s16(tt0); /*pa7 pb7 pc7 pd7 */ /*4x8*/ + s8 = vget_low_s16(tt1); /*pa8 pb8 pc8 pd8 */ + s9 = vget_low_s16(tt2); /*pa9 pb9 pc9 pd9 */ + s10 = vget_low_s16(tt3); /*pa10 pb10 pc10 pd10 */ + s11 = vget_high_s16(tt0); /*pa11 pb11 pc11 pd11 */ + s12 = vget_high_s16(tt1); /*pa12 pb12 pc12 pd12 */ + s13 = vget_high_s16(tt2); /*pa13 pb13 pc13 pd13 */ + s14 = vget_high_s16(tt3); /*pa14 pb14 pc14 pd14 */ + + res0 = wiener_convolve8_horiz_4x8( + s0, s1, s2, s3, s4, s5, s6, filter_x_tmp, bd, conv_params->round_0); + res1 = wiener_convolve8_horiz_4x8( + s1, s2, s3, s4, s5, s6, s7, filter_x_tmp, bd, conv_params->round_0); + res2 = wiener_convolve8_horiz_4x8( + s2, s3, s4, s5, s6, s7, s8, filter_x_tmp, bd, conv_params->round_0); + res3 = wiener_convolve8_horiz_4x8( + s3, s4, s5, s6, s7, s8, s9, filter_x_tmp, bd, conv_params->round_0); + res4 = + wiener_convolve8_horiz_4x8(s4, s5, s6, s7, s8, s9, s10, + filter_x_tmp, bd, conv_params->round_0); + res5 = + wiener_convolve8_horiz_4x8(s5, s6, s7, s8, s9, s10, s11, + filter_x_tmp, bd, conv_params->round_0); + res6 = + wiener_convolve8_horiz_4x8(s6, s7, s8, s9, s10, s11, s12, + filter_x_tmp, bd, conv_params->round_0); + res7 = + wiener_convolve8_horiz_4x8(s7, s8, s9, s10, s11, s12, s13, + filter_x_tmp, bd, conv_params->round_0); + + transpose_u16_4x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6, + &res7, &d0, &d1, &d2, &d3); + + store_u16_8x4(d_tmp, MAX_SB_SIZE, d0, d1, d2, d3); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * MAX_SB_SIZE; + height -= 4; + } while (height > 0); +#else + uint8x8_t temp_0, t4, t5, t6, t7; + + do { + __builtin_prefetch(src_ptr); + + t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + + __builtin_prefetch(dst_ptr); + + s = src_ptr + 8; + d_tmp = dst_ptr; + width = w; + + do { + t7 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + temp_0 = t0; + t0 = t7; + + t1 = vext_u8(temp_0, t7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + t2 = vext_u8(temp_0, t7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + t3 = vext_u8(temp_0, t7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + t4 = vext_u8(temp_0, t7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + t5 = vext_u8(temp_0, t7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + t6 = vext_u8(temp_0, t7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + t7 = vext_u8(temp_0, t7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + tt0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6)); + tt1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5)); + tt2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + d0 = wiener_convolve8_horiz_8x8(tt0, tt1, tt2, tt3, filter_x_tmp, bd, + conv_params->round_0); + + vst1q_u16(d_tmp, d0); + + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + + src_ptr += src_stride; + dst_ptr += MAX_SB_SIZE; + height -= 1; 
+ } while (height > 0); +#endif + } + + { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + uint8x8_t t0; +#if defined(__aarch64__) + int16x8_t s8, s9, s10; + uint8x8_t t1, t2, t3; +#endif + int16_t *src_tmp_ptr, *s; + uint8_t *dst_tmp_ptr; + height = h; + width = w; + src_tmp_ptr = (int16_t *)temp; + dst_tmp_ptr = dst; + src_stride = MAX_SB_SIZE; + + do { + s = src_tmp_ptr; + s0 = vld1q_s16(s); + s += src_stride; + s1 = vld1q_s16(s); + s += src_stride; + s2 = vld1q_s16(s); + s += src_stride; + s3 = vld1q_s16(s); + s += src_stride; + s4 = vld1q_s16(s); + s += src_stride; + s5 = vld1q_s16(s); + s += src_stride; + s6 = vld1q_s16(s); + s += src_stride; + d = dst_tmp_ptr; + height = h; + +#if defined(__aarch64__) + do { + __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride); + __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride); + __builtin_prefetch(dst_tmp_ptr + 2 * dst_stride); + __builtin_prefetch(dst_tmp_ptr + 3 * dst_stride); + + s7 = vld1q_s16(s); + s += src_stride; + s8 = vld1q_s16(s); + s += src_stride; + s9 = vld1q_s16(s); + s += src_stride; + s10 = vld1q_s16(s); + s += src_stride; + + t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp, + bd, conv_params->round_1); + t1 = wiener_convolve8_vert_4x8(s1, s2, s3, s4, s5, s6, s7, filter_y_tmp, + bd, conv_params->round_1); + t2 = wiener_convolve8_vert_4x8(s2, s3, s4, s5, s6, s7, s8, filter_y_tmp, + bd, conv_params->round_1); + t3 = wiener_convolve8_vert_4x8(s3, s4, s5, s6, s7, s8, s9, filter_y_tmp, + bd, conv_params->round_1); + + vst1_u8(d, t0); + d += dst_stride; + vst1_u8(d, t1); + d += dst_stride; + vst1_u8(d, t2); + d += dst_stride; + vst1_u8(d, t3); + d += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 3); + + if (height != 0) { + __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride); + __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride); + + do { + s7 = vld1q_s16(s); + s += src_stride; + + t0 = + wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, + filter_y_tmp, bd, conv_params->round_1); + vst1_u8(d, t0); + d += dst_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; + } while (height > 0); + } + + src_tmp_ptr += 8; + dst_tmp_ptr += 8; + + w -= 8; + } while (w > 0); +#else + do { + __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride); + + s7 = vld1q_s16(s); + s += src_stride; + + t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp, + bd, conv_params->round_1); + + vst1_u8(d, t0); + d += dst_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; + } while (height > 0); + + src_tmp_ptr += 8; + dst_tmp_ptr += 8; + + w -= 8; + } while (w > 0); +#endif + } +} diff --git a/libs/libaom/src/av1/common/av1_common_int.h b/libs/libaom/src/av1/common/av1_common_int.h new file mode 100644 index 000000000..0403405e9 --- /dev/null +++ b/libs/libaom/src/av1/common/av1_common_int.h @@ -0,0 +1,1557 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_AV1_COMMON_INT_H_ +#define AOM_AV1_COMMON_AV1_COMMON_INT_H_ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/internal/aom_codec_internal.h" +#include "aom_util/aom_thread.h" +#include "av1/common/alloccommon.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/entropymv.h" +#include "av1/common/enums.h" +#include "av1/common/frame_buffers.h" +#include "av1/common/mv.h" +#include "av1/common/quant_common.h" +#include "av1/common/restoration.h" +#include "av1/common/tile_common.h" +#include "av1/common/timing.h" +#include "av1/common/odintrin.h" +#include "av1/encoder/hash_motion.h" +#include "aom_dsp/grain_synthesis.h" +#include "aom_dsp/grain_table.h" +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__clang__) && defined(__has_warning) +#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough") +#define AOM_FALLTHROUGH_INTENDED [[clang::fallthrough]] // NOLINT +#endif +#elif defined(__GNUC__) && __GNUC__ >= 7 +#define AOM_FALLTHROUGH_INTENDED __attribute__((fallthrough)) // NOLINT +#endif + +#ifndef AOM_FALLTHROUGH_INTENDED +#define AOM_FALLTHROUGH_INTENDED \ + do { \ + } while (0) +#endif + +#define CDEF_MAX_STRENGTHS 16 + +/* Constant values while waiting for the sequence header */ +#define FRAME_ID_LENGTH 15 +#define DELTA_FRAME_ID_LENGTH 14 + +#define FRAME_CONTEXTS (FRAME_BUFFERS + 1) +// Extra frame context which is always kept at default values +#define FRAME_CONTEXT_DEFAULTS (FRAME_CONTEXTS - 1) +#define PRIMARY_REF_BITS 3 +#define PRIMARY_REF_NONE 7 + +#define NUM_PING_PONG_BUFFERS 2 + +#define MAX_NUM_TEMPORAL_LAYERS 8 +#define MAX_NUM_SPATIAL_LAYERS 4 +/* clang-format off */ +// clang-format seems to think this is a pointer dereference and not a +// multiplication. +#define MAX_NUM_OPERATING_POINTS \ + (MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS) +/* clang-format on */ + +// TODO(jingning): Turning this on to set up transform coefficient +// processing timer. +#define TXCOEFF_TIMER 0 +#define TXCOEFF_COST_TIMER 0 + +enum { + SINGLE_REFERENCE = 0, + COMPOUND_REFERENCE = 1, + REFERENCE_MODE_SELECT = 2, + REFERENCE_MODES = 3, +} UENUM1BYTE(REFERENCE_MODE); + +enum { + /** + * Frame context updates are disabled + */ + REFRESH_FRAME_CONTEXT_DISABLED, + /** + * Update frame context to values resulting from backward probability + * updates based on entropy/counts in the decoded frame + */ + REFRESH_FRAME_CONTEXT_BACKWARD, +} UENUM1BYTE(REFRESH_FRAME_CONTEXT_MODE); + +#define MFMV_STACK_SIZE 3 +typedef struct { + int_mv mfmv0; + uint8_t ref_frame_offset; +} TPL_MV_REF; + +typedef struct { + int_mv mv; + MV_REFERENCE_FRAME ref_frame; +} MV_REF; + +typedef struct RefCntBuffer { + // For a RefCntBuffer, the following are reference-holding variables: + // - cm->ref_frame_map[] + // - cm->cur_frame + // - cm->scaled_ref_buf[] (encoder only) + // - pbi->output_frame_index[] (decoder only) + // With that definition, 'ref_count' is the number of reference-holding + // variables that are currently referencing this buffer. + // For example: + // - suppose this buffer is at index 'k' in the buffer pool, and + // - Total 'n' of the variables / array elements above have value 'k' (that + // is, they are pointing to buffer at index 'k'). + // Then, pool->frame_bufs[k].ref_count = n. 
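+  // A worked example (illustrative only): if cm->cur_frame and two entries
+  // of cm->ref_frame_map[] all point to frame_bufs[k], then
+  // pool->frame_bufs[k].ref_count == 3, and get_free_fb() further below will
+  // not hand buffer 'k' out again until that count drops back to 0.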
+  int ref_count;
+
+  unsigned int order_hint;
+  unsigned int ref_order_hints[INTER_REFS_PER_FRAME];
+
+  // These variables are used only in the encoder. They compare absolute
+  // display order hints to compute relative distances, overcoming the
+  // limitation of get_relative_dist(), which returns an incorrect distance
+  // when a very old frame is used as a reference.
+  unsigned int display_order_hint;
+  unsigned int ref_display_order_hint[INTER_REFS_PER_FRAME];
+
+  MV_REF *mvs;
+  uint8_t *seg_map;
+  struct segmentation seg;
+  int mi_rows;
+  int mi_cols;
+  // Width and height give the size of the buffer (before any upscaling, unlike
+  // the sizes that can be derived from the buf structure)
+  int width;
+  int height;
+  WarpedMotionParams global_motion[REF_FRAMES];
+  int showable_frame;  // frame can be used as a show-existing frame in future
+  uint8_t film_grain_params_present;
+  aom_film_grain_t film_grain_params;
+  aom_codec_frame_buffer_t raw_frame_buffer;
+  YV12_BUFFER_CONFIG buf;
+  FRAME_TYPE frame_type;
+
+  // This is only used in the encoder but needs to be indexed per ref frame
+  // so it's extremely convenient to keep it here.
+  int interp_filter_selected[SWITCHABLE];
+
+  // Inter-frame reference frame delta for loop filter
+  int8_t ref_deltas[REF_FRAMES];
+
+  // 0 = ZERO_MV, MV
+  int8_t mode_deltas[MAX_MODE_LF_DELTAS];
+
+  FRAME_CONTEXT frame_context;
+} RefCntBuffer;
+
+typedef struct BufferPool {
+// Protect BufferPool from being accessed by several FrameWorkers at
+// the same time during frame parallel decode.
+// TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
+// TODO(wtc): Remove this. See
+// https://chromium-review.googlesource.com/c/webm/libvpx/+/560630.
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t pool_mutex;
+#endif
+
+  // Private data associated with the frame buffer callbacks.
+  void *cb_priv;
+
+  aom_get_frame_buffer_cb_fn_t get_fb_cb;
+  aom_release_frame_buffer_cb_fn_t release_fb_cb;
+
+  RefCntBuffer frame_bufs[FRAME_BUFFERS];
+
+  // Frame buffers allocated internally by the codec.
+  InternalFrameBufferList int_frame_buffers;
+} BufferPool;
+
+typedef struct {
+  int cdef_damping;
+  int nb_cdef_strengths;
+  int cdef_strengths[CDEF_MAX_STRENGTHS];
+  int cdef_uv_strengths[CDEF_MAX_STRENGTHS];
+  int cdef_bits;
+} CdefInfo;
+
+typedef struct {
+  int delta_q_present_flag;
+  // Resolution of delta quant
+  int delta_q_res;
+  int delta_lf_present_flag;
+  // Resolution of delta lf level
+  int delta_lf_res;
+  // This flag selects the number of loop filter level deltas:
+  // 0: use a single delta for y_vertical, y_horizontal, u, and v
+  // 1: use separate deltas for each filter level
+  int delta_lf_multi;
+} DeltaQInfo;
+
+typedef struct {
+  int enable_order_hint;        // 0 - disable order hint, and related tools
+  int order_hint_bits_minus_1;  // dist_wtd_comp, ref_frame_mvs,
+                                // frame_sign_bias
+                                // if 0, enable_dist_wtd_comp and
+                                // enable_ref_frame_mvs must be set to 0.
+  int enable_dist_wtd_comp;     // 0 - disable dist-wtd compound modes
+                                // 1 - enable it
+  int enable_ref_frame_mvs;     // 0 - disable ref frame mvs
+                                // 1 - enable it
+} OrderHintInfo;
+
+// Sequence header structure.
+// Note: All syntax elements of sequence_header_obu that need to be
+// bit-identical across multiple sequence headers must be part of this struct,
+// so that consistency can be checked by the are_seq_headers_consistent()
+// function. The one exception is the last member 'op_params', which is
+// ignored by are_seq_headers_consistent().
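+// (Illustrative sketch only, assuming the member layout described above: the
+// consistency check can then be a single memcmp() over everything that
+// precedes 'op_params', e.g.
+//   !memcmp(a, b, offsetof(SequenceHeader, op_params))
+// which is why keeping 'op_params' as the last member matters.)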
+typedef struct SequenceHeader {
+  int num_bits_width;
+  int num_bits_height;
+  int max_frame_width;
+  int max_frame_height;
+  uint8_t frame_id_numbers_present_flag;
+  int frame_id_length;
+  int delta_frame_id_length;
+  BLOCK_SIZE sb_size;  // Size of the superblock used for this frame
+  int mib_size;        // Size of the superblock in units of MI blocks
+  int mib_size_log2;   // Log 2 of above.
+
+  OrderHintInfo order_hint_info;
+
+  uint8_t force_screen_content_tools;  // 0 - force off
+                                       // 1 - force on
+                                       // 2 - adaptive
+  uint8_t still_picture;               // Video is a single frame still picture
+  uint8_t reduced_still_picture_hdr;   // Use reduced header for still picture
+  uint8_t force_integer_mv;            // 0 - Don't force. MV can use subpel
+                                       // 1 - force to integer
+                                       // 2 - adaptive
+  uint8_t enable_filter_intra;         // enables/disables filterintra
+  uint8_t enable_intra_edge_filter;    // enables/disables edge upsampling
+  uint8_t enable_interintra_compound;  // enables/disables interintra_compound
+  uint8_t enable_masked_compound;      // enables/disables masked compound
+  uint8_t enable_dual_filter;          // 0 - disable dual interpolation filter
+                                       // 1 - enable vert/horz filter selection
+  uint8_t enable_warped_motion;        // 0 - disable warp for the sequence
+                                       // 1 - enable warp for the sequence
+  uint8_t enable_superres;             // 0 - Disable superres for the sequence
+                                       //     and no frame level superres flag
+                                       // 1 - Enable superres for the sequence
+                                       //     enable per-frame superres flag
+  uint8_t enable_cdef;                 // To turn on/off CDEF
+  uint8_t enable_restoration;          // To turn on/off loop restoration
+  BITSTREAM_PROFILE profile;
+
+  // Color config.
+  aom_bit_depth_t bit_depth;  // AOM_BITS_8 in profile 0 or 1,
+                              // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
+  uint8_t use_highbitdepth;   // If true, we need to use 16bit frame buffers.
+  uint8_t monochrome;         // Monochrome video
+  aom_color_primaries_t color_primaries;
+  aom_transfer_characteristics_t transfer_characteristics;
+  aom_matrix_coefficients_t matrix_coefficients;
+  int color_range;
+  int subsampling_x;  // Chroma subsampling for x
+  int subsampling_y;  // Chroma subsampling for y
+  aom_chroma_sample_position_t chroma_sample_position;
+  uint8_t separate_uv_delta_q;
+  uint8_t film_grain_params_present;
+
+  // Operating point info.
+  int operating_points_cnt_minus_1;
+  int operating_point_idc[MAX_NUM_OPERATING_POINTS];
+  int timing_info_present;
+  aom_timing_info_t timing_info;
+  uint8_t decoder_model_info_present_flag;
+  aom_dec_model_info_t decoder_model_info;
+  uint8_t display_model_info_present_flag;
+  AV1_LEVEL seq_level_idx[MAX_NUM_OPERATING_POINTS];
+  uint8_t tier[MAX_NUM_OPERATING_POINTS];  // seq_tier in spec. One bit: 0 or 1.
+
+  // IMPORTANT: the op_params member must be at the end of the struct so that
+  // are_seq_headers_consistent() can be implemented with a memcmp() call.
+  // TODO(urvang): We probably don't need the +1 here.
+  aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
+} SequenceHeader;
+
+typedef struct {
+  int skip_mode_allowed;
+  int skip_mode_flag;
+  int ref_frame_idx_0;
+  int ref_frame_idx_1;
+} SkipModeInfo;
+
+typedef struct {
+  FRAME_TYPE frame_type;
+  REFERENCE_MODE reference_mode;
+
+  unsigned int order_hint;
+  unsigned int display_order_hint;
+  unsigned int frame_number;
+  SkipModeInfo skip_mode_info;
+  int refresh_frame_flags;  // Which ref frames are overwritten by this frame
+  int frame_refs_short_signaling;
+} CurrentFrame;
+
+// Struct containing some frame level features.
+typedef struct {
+  bool disable_cdf_update;
+  bool allow_high_precision_mv;
+  bool cur_frame_force_integer_mv;  // 0: the default in AOM; 1: only integer
+  bool allow_screen_content_tools;
+  bool allow_intrabc;
+  bool allow_warped_motion;
+  // Whether to use previous frames' motion vectors for prediction.
+  bool allow_ref_frame_mvs;
+  bool coded_lossless;  // frame is fully lossless at the coded resolution.
+  bool all_lossless;    // frame is fully lossless at the upscaled resolution.
+  bool reduced_tx_set_used;
+  bool error_resilient_mode;
+  bool switchable_motion_mode;
+  TX_MODE tx_mode;
+  InterpFilter interp_filter;
+  int primary_ref_frame;
+  int byte_alignment;
+  // Flag signaling how frame contexts should be updated at the end of
+  // a frame decode
+  REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
+} FeatureFlags;
+
+// Struct containing params related to tiles.
+typedef struct CommonTileParams {
+  int cols;           // number of tile columns that frame is divided into
+  int rows;           // number of tile rows that frame is divided into
+  int max_width_sb;   // maximum tile width in superblock units.
+  int max_height_sb;  // maximum tile height in superblock units.
+  // Min width of non-rightmost tile in MI units. Only valid if cols > 1.
+  int min_inner_width;
+
+  // If true, tiles are uniformly spaced with power-of-two number of rows and
+  // columns.
+  // If false, tiles have explicitly configured widths and heights.
+  int uniform_spacing;
+
+  // Following members are only valid when uniform_spacing == 1
+  int log2_cols;  // log2 of 'cols'.
+  int log2_rows;  // log2 of 'rows'.
+  int width;      // tile width in MI units
+  int height;     // tile height in MI units
+  // End of members that are only valid when uniform_spacing == 1
+
+  // Min num of tile columns possible based on 'max_width_sb' and frame width.
+  int min_log2_cols;
+  // Min num of tile rows possible based on 'max_height_sb' and frame height.
+  int min_log2_rows;
+  // Max num of tile columns possible based on frame width.
+  int max_log2_cols;
+  // Max num of tile rows possible based on frame height.
+  int max_log2_rows;
+  // log2 of min number of tiles (same as min_log2_cols + min_log2_rows).
+  int min_log2;
+  // col_start_sb[i] is the start position of tile column i in superblock
+  // units. Valid for 0 <= i <= cols.
+  int col_start_sb[MAX_TILE_COLS + 1];
+  // row_start_sb[i] is the start position of tile row i in superblock units.
+  // Valid for 0 <= i <= rows.
+  int row_start_sb[MAX_TILE_ROWS + 1];
+  // If true, we are using large scale tile mode.
+  unsigned int large_scale;
+  // Only relevant when large_scale == 1.
+  // If true, the independent decoding of a single tile or a section of a
+  // frame is allowed.
+  unsigned int single_tile_decoding;
+} CommonTileParams;
+
+// Struct containing params related to MB_MODE_INFO arrays and related info.
+typedef struct CommonModeInfoParams CommonModeInfoParams;
+struct CommonModeInfoParams {
+  // Number of rows/cols in the frame in 16 pixel units.
+  // This is computed from frame width and height aligned to a multiple of 8.
+  int mb_rows;
+  int mb_cols;
+  // Total MBs = mb_rows * mb_cols.
+  int MBs;
+
+  // Number of rows/cols in the frame in 4 pixel (MB_MODE_INFO) units.
+  // This is computed from frame width and height aligned to a multiple of 8.
+  int mi_rows;
+  int mi_cols;
+
+  // An array of MB_MODE_INFO structs for every 'mi_alloc_bsize' sized block
+  // in the frame.
+  // Note: This array should be treated like scratch memory, and should NOT
+  // be accessed directly, in most cases. Please use the 'mi_grid_base' array
+  // instead.
+  MB_MODE_INFO *mi_alloc;
+  // Number of allocated elements in 'mi_alloc'.
+  int mi_alloc_size;
+  // Stride for 'mi_alloc' array.
+  int mi_alloc_stride;
+  // The minimum block size that each element in 'mi_alloc' can correspond to.
+  // For decoder, this is always BLOCK_4X4.
+  // For encoder, this is currently set to BLOCK_4X4 for resolution < 4k,
+  // and BLOCK_8X8 for resolution >= 4k.
+  BLOCK_SIZE mi_alloc_bsize;
+
+  // Grid of pointers to 4x4 MB_MODE_INFO structs allocated in 'mi_alloc'.
+  // It's possible that:
+  // - Multiple pointers in the grid point to the same element in 'mi_alloc'
+  //   (for example, for all 4x4 blocks that belong to the same partition
+  //   block).
+  // - Some pointers can be NULL (for example, for blocks outside the visible
+  //   area).
+  MB_MODE_INFO **mi_grid_base;
+  // Number of allocated elements in 'mi_grid_base' (and 'tx_type_map' also).
+  int mi_grid_size;
+  // Stride for 'mi_grid_base' (and 'tx_type_map' also).
+  int mi_stride;
+
+  // An array of tx types for each 4x4 block in the frame.
+  // Number of allocated elements is same as 'mi_grid_size', and stride is
+  // same as 'mi_stride'. So, indexing into 'tx_type_map' is same as that of
+  // 'mi_grid_base'.
+  TX_TYPE *tx_type_map;
+
+  // Function pointers to allow separate logic for encoder and decoder.
+  void (*free_mi)(struct CommonModeInfoParams *mi_params);
+  void (*setup_mi)(struct CommonModeInfoParams *mi_params);
+  void (*set_mb_mi)(struct CommonModeInfoParams *mi_params, int width,
+                    int height);
+};
+
+// Parameters related to quantization at the frame level.
+typedef struct CommonQuantParams CommonQuantParams;
+struct CommonQuantParams {
+  // Base qindex of the frame in the range 0 to 255.
+  int base_qindex;
+
+  // Delta of qindex (from base_qindex) for Y plane DC coefficient.
+  // Note: y_ac_delta_q is implicitly 0.
+  int y_dc_delta_q;
+
+  // Deltas of qindex (from base_qindex) for U and V plane DC coefficients.
+  int u_dc_delta_q;
+  int v_dc_delta_q;
+
+  // Deltas of qindex (from base_qindex) for U and V plane AC coefficients.
+  // The V plane values are the same as the U plane values if
+  // cm->seq_params.separate_uv_delta_q == 0.
+  int u_ac_delta_q;
+  int v_ac_delta_q;
+
+  // Note: The qindex per superblock may have a delta from the qindex obtained
+  // at frame level from parameters above, based on 'cm->delta_q_info'.
+
+  // The dequantizers below are true dequantizers used only in the
+  // dequantization process. They have the same coefficient
+  // shift/scale as TX.
+  int16_t y_dequant_QTX[MAX_SEGMENTS][2];
+  int16_t u_dequant_QTX[MAX_SEGMENTS][2];
+  int16_t v_dequant_QTX[MAX_SEGMENTS][2];
+
+  // Global quant matrix tables
+  const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+  const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+
+  // Local quant matrix tables for each frame
+  const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+
+  // Flag indicating whether quantization matrices are being used:
+  // - If true, qmatrix_level_y, qmatrix_level_u and qmatrix_level_v indicate
+  //   the level indices to be used to access appropriate global quant matrix
+  //   tables.
+  // - If false, we implicitly use level index 'NUM_QM_LEVELS - 1'.
+  bool using_qmatrix;
+  int qmatrix_level_y;
+  int qmatrix_level_u;
+  int qmatrix_level_v;
+};
+
+// Context used for transmitting various symbols in the bitstream.
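+// (For example, given the per-member notes below, above_contexts.partition[1][31]
+// would hold the partition context for mi column 31 of tile row 1.)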
+typedef struct CommonContexts CommonContexts;
+struct CommonContexts {
+  // Context used by 'FRAME_CONTEXT.partition_cdf' to transmit partition type.
+  // partition[i][j] is the context for ith tile row, jth mi_col.
+  PARTITION_CONTEXT **partition;
+
+  // Context used to derive context for multiple symbols:
+  // - 'TXB_CTX.txb_skip_ctx' used by 'FRAME_CONTEXT.txb_skip_cdf' to transmit
+  //   the skip_txfm flag.
+  // - 'TXB_CTX.dc_sign_ctx' used by 'FRAME_CONTEXT.dc_sign_cdf' to transmit
+  //   sign.
+  // entropy[i][j][k] is the context for ith plane, jth tile row, kth mi_col.
+  ENTROPY_CONTEXT **entropy[MAX_MB_PLANE];
+
+  // Context used to derive context for 'FRAME_CONTEXT.txfm_partition_cdf' to
+  // transmit 'is_split' flag to indicate if this transform block should be
+  // split into smaller sub-blocks.
+  // txfm[i][j] is the context for ith tile row, jth mi_col.
+  TXFM_CONTEXT **txfm;
+
+  // Dimensions that were used to allocate the arrays above.
+  // If these dimensions change, the arrays may have to be re-allocated.
+  int num_planes;     // Corresponds to av1_num_planes(cm)
+  int num_tile_rows;  // Corresponds to cm->tiles.rows
+  int num_mi_cols;    // Corresponds to cm->mi_params.mi_cols
+};
+
+typedef struct AV1Common {
+  // Information about the current frame that is being coded.
+  CurrentFrame current_frame;
+  // Code and details about current error status.
+  struct aom_internal_error_info error;
+
+  // AV1 allows two types of frame scaling operations:
+  // (1) Frame super-resolution: allows coding a frame at a lower resolution,
+  // which is then normatively upscaled and restored after decoding -- inside
+  // the coding loop.
+  // (2) Frame resize: allows coding a frame at a lower/higher resolution,
+  // which is then non-normatively scaled at rendering time -- outside the
+  // coding loop.
+  // Hence, the need for 3 types of dimensions.
+
+  // Coded frame dimensions.
+  int width;
+  int height;
+
+  // Rendered frame dimensions, after applying both super-resolution and
+  // resize to the coded frame.
+  // Different from coded dimensions if super-resolution and/or resize are
+  // being used for this frame.
+  int render_width;
+  int render_height;
+
+  // Frame dimensions after applying super-resolution to the coded frame (if
+  // present), but before applying resize.
+  // Larger than the coded dimensions if super-resolution is being used for
+  // this frame.
+  // Different from rendered dimensions if resize is being used for this
+  // frame.
+  int superres_upscaled_width;
+  int superres_upscaled_height;
+
+  // The denominator of the superres scale used by this frame.
+  // Note: The numerator is fixed to be SCALE_NUMERATOR.
+  uint8_t superres_scale_denominator;
+
+  // If true, buffer removal times are present.
+  bool buffer_removal_time_present;
+  // buffer_removal_times[op_num] specifies the frame removal time in units of
+  // DecCT clock ticks counted from the removal time of the last random access
+  // point for operating point op_num.
+  // TODO(urvang): We probably don't need the +1 here.
+  uint32_t buffer_removal_times[MAX_NUM_OPERATING_POINTS + 1];
+  // Presentation time of the frame in clock ticks DispCT counted from the
+  // removal time of the last random access point for the operating point that
+  // is being decoded.
+  uint32_t frame_presentation_time;
+
+  // Buffer where previous frame is stored.
+  RefCntBuffer *prev_frame;
+
+  // Buffer into which the current frame will be stored and other related info.
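+  // (Per the RefCntBuffer notes above, cm->cur_frame itself holds a
+  // reference, so it contributes to the buffer's ref_count; see
+  // assign_cur_frame_new_fb() later in this header.)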
+ // TODO(hkuang): Combine this with cur_buf in macroblockd. + RefCntBuffer *cur_frame; + + // For encoder, we have a two-level mapping from reference frame type to the + // corresponding buffer in the buffer pool: + // * 'remapped_ref_idx[i - 1]' maps reference type 'i' (range: LAST_FRAME ... + // EXTREF_FRAME) to a remapped index 'j' (in range: 0 ... REF_FRAMES - 1) + // * Later, 'cm->ref_frame_map[j]' maps the remapped index 'j' to a pointer to + // the reference counted buffer structure RefCntBuffer, taken from the buffer + // pool cm->buffer_pool->frame_bufs. + // + // LAST_FRAME, ..., EXTREF_FRAME + // | | + // v v + // remapped_ref_idx[LAST_FRAME - 1], ..., remapped_ref_idx[EXTREF_FRAME - 1] + // | | + // v v + // ref_frame_map[], ..., ref_frame_map[] + // + // Note: INTRA_FRAME always refers to the current frame, so there's no need to + // have a remapped index for the same. + int remapped_ref_idx[REF_FRAMES]; + + // Scale of the current frame with respect to itself. + // This is currently used for intra block copy, which behaves like an inter + // prediction mode, where the reference frame is the current frame itself. + struct scale_factors sf_identity; + + // Scale factors of the reference frame with respect to the current frame. + // This is required for generating inter prediction and will be non-identity + // for a reference frame, if it has different dimensions than the coded + // dimensions of the current frame. + struct scale_factors ref_scale_factors[REF_FRAMES]; + + // For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to + // the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'. + // For encoder, ref_frame_map[j] (where j = remapped_ref_idx[i]) maps + // remapped reference index 'j' (that is, original reference type 'i') to + // a pointer to the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'. + RefCntBuffer *ref_frame_map[REF_FRAMES]; + + // If true, this frame is actually shown after decoding. + // If false, this frame is coded in the bitstream, but not shown. It is only + // used as a reference for other frames coded later. + int show_frame; + + // If true, this frame can be used as a show-existing frame for other frames + // coded later. + // When 'show_frame' is true, this is always true for all non-keyframes. + // When 'show_frame' is false, this value is transmitted in the bitstream. + int showable_frame; + + // If true, show an existing frame coded before, instead of actually coding a + // frame. The existing frame comes from one of the existing reference buffers, + // as signaled in the bitstream. + int show_existing_frame; + + // Whether some features are allowed or not. + FeatureFlags features; + + // Params related to MB_MODE_INFO arrays and related info. + CommonModeInfoParams mi_params; + +#if CONFIG_ENTROPY_STATS + int coef_cdf_category; +#endif + // Quantization params. + CommonQuantParams quant_params; + + // Segmentation info for current frame. + struct segmentation seg; + + // Segmentation map for previous frame. + uint8_t *last_frame_seg_map; + + // Deblocking filter parameters. + loop_filter_info_n lf_info; + struct loopfilter lf; + + // Loop Restoration filter parameters. + RestorationInfo rst_info[MAX_MB_PLANE]; // Loop Restoration filter info. + int32_t *rst_tmpbuf; // Scratch buffer for self-guided restoration filter. + RestorationLineBuffers *rlbs; // Line buffers required by loop restoration. + YV12_BUFFER_CONFIG rst_frame; // Stores the output of loop restoration. 
+
+  // CDEF (Constrained Directional Enhancement Filter) parameters.
+  CdefInfo cdef_info;
+
+  // Parameters for film grain synthesis.
+  aom_film_grain_t film_grain_params;
+
+  // Parameters for delta quantization and delta loop filter level.
+  DeltaQInfo delta_q_info;
+
+  // Global motion parameters for each reference frame.
+  WarpedMotionParams global_motion[REF_FRAMES];
+
+  // Elements of the sequence header that are applicable to all the frames in
+  // the video.
+  SequenceHeader seq_params;
+
+  // Current CDFs of all the symbols for the current frame.
+  FRAME_CONTEXT *fc;
+  // Default CDFs used when features.primary_ref_frame = PRIMARY_REF_NONE
+  // (e.g. for a keyframe). These default CDFs are defined by the bitstream
+  // and copied from default CDF tables for each symbol.
+  FRAME_CONTEXT *default_frame_context;
+
+  // Parameters related to tiling.
+  CommonTileParams tiles;
+
+  // External BufferPool passed from outside.
+  BufferPool *buffer_pool;
+
+  // Above context buffers and their sizes.
+  // Note: above contexts are allocated in this struct, as their size is
+  // dependent on frame width, while left contexts are declared and allocated
+  // in the MACROBLOCKD struct, as they have a fixed size.
+  CommonContexts above_contexts;
+
+  // When cm->seq_params.frame_id_numbers_present_flag == 1, current and
+  // reference frame IDs are signaled in the bitstream.
+  int current_frame_id;
+  int ref_frame_id[REF_FRAMES];
+
+  // Motion vectors provided by motion field estimation.
+  // tpl_mvs[row * stride + col] stores MV for block at [mi_row, mi_col] where:
+  // mi_row = 2 * row,
+  // mi_col = 2 * col, and
+  // stride = cm->mi_params.mi_stride / 2
+  TPL_MV_REF *tpl_mvs;
+  // Allocated size of 'tpl_mvs' array. Refer to 'ensure_mv_buffer()' function.
+  int tpl_mvs_mem_size;
+  // ref_frame_sign_bias[k] is 1 if relative distance between reference 'k'
+  // and current frame is positive; and 0 otherwise.
+  int ref_frame_sign_bias[REF_FRAMES];
+  // ref_frame_side[k] is 1 if relative distance between reference 'k' and
+  // current frame is positive, -1 if relative distance is 0; and 0 otherwise.
+  // TODO(jingning): This can be combined with sign_bias later.
+  int8_t ref_frame_side[REF_FRAMES];
+
+  // Number of temporal layers: may be > 1 for SVC (scalable video coding).
+  unsigned int number_temporal_layers;
+  // Temporal layer ID of this frame
+  // (in the range 0 ... (number_temporal_layers - 1)).
+  int temporal_layer_id;
+
+  // Number of spatial layers: may be > 1 for SVC (scalable video coding).
+  unsigned int number_spatial_layers;
+  // Spatial layer ID of this frame
+  // (in the range 0 ... (number_spatial_layers - 1)).
+  int spatial_layer_id;
+
+#if TXCOEFF_TIMER
+  int64_t cum_txcoeff_timer;
+  int64_t txcoeff_timer;
+  int txb_count;
+#endif  // TXCOEFF_TIMER
+
+#if TXCOEFF_COST_TIMER
+  int64_t cum_txcoeff_cost_timer;
+  int64_t txcoeff_cost_timer;
+  int64_t txcoeff_cost_count;
+#endif  // TXCOEFF_COST_TIMER
+
+#if CONFIG_LPF_MASK
+  int is_decoding;
+#endif  // CONFIG_LPF_MASK
+} AV1_COMMON;
+
+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
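+// A usage sketch (illustrative only, not enforced by the API): updates of
+// frame_bufs[].ref_count are bracketed by the pair below, e.g.
+//   lock_buffer_pool(pool);
+//   ++pool->frame_bufs[idx].ref_count;
+//   unlock_buffer_pool(pool);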
+static void lock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&pool->pool_mutex);
+#else
+  (void)pool;
+#endif
+}
+
+static void unlock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&pool->pool_mutex);
+#else
+  (void)pool;
+#endif
+}
+
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) {
+  if (index < 0 || index >= REF_FRAMES) return NULL;
+  if (cm->ref_frame_map[index] == NULL) return NULL;
+  return &cm->ref_frame_map[index]->buf;
+}
+
+static INLINE int get_free_fb(AV1_COMMON *cm) {
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+  int i;
+
+  lock_buffer_pool(cm->buffer_pool);
+  for (i = 0; i < FRAME_BUFFERS; ++i)
+    if (frame_bufs[i].ref_count == 0) break;
+
+  if (i != FRAME_BUFFERS) {
+    if (frame_bufs[i].buf.use_external_reference_buffers) {
+      // If this frame buffer's y_buffer, u_buffer, and v_buffer point to
+      // external reference buffers, restore the buffer pointers to point to
+      // the internally allocated memory.
+      YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf;
+      ybf->y_buffer = ybf->store_buf_adr[0];
+      ybf->u_buffer = ybf->store_buf_adr[1];
+      ybf->v_buffer = ybf->store_buf_adr[2];
+      ybf->use_external_reference_buffers = 0;
+    }
+
+    frame_bufs[i].ref_count = 1;
+  } else {
+    // We should never run out of free buffers. If this assertion fails, there
+    // is a reference leak.
+    assert(0 && "Ran out of free frame buffers. Likely a reference leak.");
+    // Reset i to be INVALID_IDX to indicate no free buffer found.
+    i = INVALID_IDX;
+  }
+
+  unlock_buffer_pool(cm->buffer_pool);
+  return i;
+}
+
+static INLINE RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) {
+  // Release the previously-used frame buffer.
+  if (cm->cur_frame != NULL) {
+    --cm->cur_frame->ref_count;
+    cm->cur_frame = NULL;
+  }
+
+  // Assign a new frame buffer.
+  const int new_fb_idx = get_free_fb(cm);
+  if (new_fb_idx == INVALID_IDX) return NULL;
+
+  cm->cur_frame = &cm->buffer_pool->frame_bufs[new_fb_idx];
+  cm->cur_frame->buf.buf_8bit_valid = 0;
+  av1_zero(cm->cur_frame->interp_filter_selected);
+  return cm->cur_frame;
+}
+
+// Modify 'lhs_ptr' to reference the buffer at 'rhs_ptr', and update the ref
+// counts accordingly.
+static INLINE void assign_frame_buffer_p(RefCntBuffer **lhs_ptr,
+                                         RefCntBuffer *rhs_ptr) {
+  RefCntBuffer *const old_ptr = *lhs_ptr;
+  if (old_ptr != NULL) {
+    assert(old_ptr->ref_count > 0);
+    // One less reference to the buffer at 'old_ptr', so decrease ref count.
+    --old_ptr->ref_count;
+  }
+
+  *lhs_ptr = rhs_ptr;
+  // One more reference to the buffer at 'rhs_ptr', so increase ref count.
+  ++rhs_ptr->ref_count;
+}
+
+static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) {
+  return cm->current_frame.frame_type == KEY_FRAME ||
+         cm->current_frame.frame_type == INTRA_ONLY_FRAME;
+}
+
+static INLINE int frame_is_sframe(const AV1_COMMON *cm) {
+  return cm->current_frame.frame_type == S_FRAME;
+}
+
+// These functions take a reference frame label between LAST_FRAME and
+// EXTREF_FRAME inclusive. Note that this is different from the indexing
+// previously used by the frame_refs[] array.
+static INLINE int get_ref_frame_map_idx(const AV1_COMMON *const cm,
+                                        const MV_REFERENCE_FRAME ref_frame) {
+  return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME)
+             ?
cm->remapped_ref_idx[ref_frame - LAST_FRAME] + : INVALID_IDX; +} + +static INLINE RefCntBuffer *get_ref_frame_buf( + const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; +} + +// Both const and non-const versions of this function are provided so that it +// can be used with a const AV1_COMMON if needed. +static INLINE const struct scale_factors *get_ref_scale_factors_const( + const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL; +} + +static INLINE struct scale_factors *get_ref_scale_factors( + AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL; +} + +static INLINE RefCntBuffer *get_primary_ref_frame_buf( + const AV1_COMMON *const cm) { + const int primary_ref_frame = cm->features.primary_ref_frame; + if (primary_ref_frame == PRIMARY_REF_NONE) return NULL; + const int map_idx = get_ref_frame_map_idx(cm, primary_ref_frame + 1); + return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; +} + +// Returns 1 if this frame might allow mvs from some reference frame. +static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) { + return !cm->features.error_resilient_mode && + cm->seq_params.order_hint_info.enable_ref_frame_mvs && + cm->seq_params.order_hint_info.enable_order_hint && + !frame_is_intra_only(cm); +} + +// Returns 1 if this frame might use warped_motion +static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) { + return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) && + cm->seq_params.enable_warped_motion; +} + +static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { + const int buf_rows = buf->mi_rows; + const int buf_cols = buf->mi_cols; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + if (buf->mvs == NULL || buf_rows != mi_params->mi_rows || + buf_cols != mi_params->mi_cols) { + aom_free(buf->mvs); + buf->mi_rows = mi_params->mi_rows; + buf->mi_cols = mi_params->mi_cols; + CHECK_MEM_ERROR(cm, buf->mvs, + (MV_REF *)aom_calloc(((mi_params->mi_rows + 1) >> 1) * + ((mi_params->mi_cols + 1) >> 1), + sizeof(*buf->mvs))); + aom_free(buf->seg_map); + CHECK_MEM_ERROR( + cm, buf->seg_map, + (uint8_t *)aom_calloc(mi_params->mi_rows * mi_params->mi_cols, + sizeof(*buf->seg_map))); + } + + const int mem_size = + ((mi_params->mi_rows + MAX_MIB_SIZE) >> 1) * (mi_params->mi_stride >> 1); + int realloc = cm->tpl_mvs == NULL; + if (cm->tpl_mvs) realloc |= cm->tpl_mvs_mem_size < mem_size; + + if (realloc) { + aom_free(cm->tpl_mvs); + CHECK_MEM_ERROR(cm, cm->tpl_mvs, + (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs))); + cm->tpl_mvs_mem_size = mem_size; + } +} + +void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params); + +static INLINE int av1_num_planes(const AV1_COMMON *cm) { + return cm->seq_params.monochrome ? 
1 : MAX_MB_PLANE; +} + +static INLINE void av1_init_above_context(CommonContexts *above_contexts, + int num_planes, int tile_row, + MACROBLOCKD *xd) { + for (int i = 0; i < num_planes; ++i) { + xd->above_entropy_context[i] = above_contexts->entropy[i][tile_row]; + } + xd->above_partition_context = above_contexts->partition[tile_row]; + xd->above_txfm_context = above_contexts->txfm[tile_row]; +} + +static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd, + tran_low_t *dqcoeff) { + const int num_planes = av1_num_planes(cm); + const CommonQuantParams *const quant_params = &cm->quant_params; + + for (int i = 0; i < num_planes; ++i) { + xd->plane[i].dqcoeff = dqcoeff; + + if (xd->plane[i].plane_type == PLANE_TYPE_Y) { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->y_dequant_QTX, + sizeof(quant_params->y_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->y_iqmatrix, + sizeof(quant_params->y_iqmatrix)); + + } else { + if (i == AOM_PLANE_U) { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->u_dequant_QTX, + sizeof(quant_params->u_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->u_iqmatrix, + sizeof(quant_params->u_iqmatrix)); + } else { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->v_dequant_QTX, + sizeof(quant_params->v_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->v_iqmatrix, + sizeof(quant_params->v_iqmatrix)); + } + } + } + xd->mi_stride = cm->mi_params.mi_stride; + xd->error_info = &cm->error; + cfl_init(&xd->cfl, &cm->seq_params); +} + +static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col, + const int num_planes) { + int i; + int row_offset = mi_row; + int col_offset = mi_col; + for (i = 0; i < num_planes; ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + // Offset the buffer pointer + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) + row_offset = mi_row - 1; + if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) + col_offset = mi_col - 1; + int above_idx = col_offset; + int left_idx = row_offset & MAX_MIB_MASK; + pd->above_entropy_context = + &xd->above_entropy_context[i][above_idx >> pd->subsampling_x]; + pd->left_entropy_context = + &xd->left_entropy_context[i][left_idx >> pd->subsampling_y]; + } +} + +static INLINE int calc_mi_size(int len) { + // len is in mi units. Align to a multiple of SBs. + return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2); +} + +static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, + const int num_planes) { + int i; + for (i = 0; i < num_planes; i++) { + xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x; + xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y; + + xd->plane[i].width = AOMMAX(xd->plane[i].width, 4); + xd->plane[i].height = AOMMAX(xd->plane[i].height, 4); + } +} + +static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, int mi_col, int bw, + int mi_rows, int mi_cols) { + xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); + xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_rows - bh - mi_row) * MI_SIZE); + xd->mb_to_left_edge = -GET_MV_SUBPEL((mi_col * MI_SIZE)); + xd->mb_to_right_edge = GET_MV_SUBPEL((mi_cols - bw - mi_col) * MI_SIZE); + + xd->mi_row = mi_row; + xd->mi_col = mi_col; + + // Are edges available for intra prediction? 
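+  // (Above pixels exist only if this block is not in the first mi row of its
+  // tile; left pixels only if it is not in the first mi column. The chroma
+  // variants below additionally account for chroma subsampling.)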
+  xd->up_available = (mi_row > tile->mi_row_start);
+
+  const int ss_x = xd->plane[1].subsampling_x;
+  const int ss_y = xd->plane[1].subsampling_y;
+
+  xd->left_available = (mi_col > tile->mi_col_start);
+  xd->chroma_up_available = xd->up_available;
+  xd->chroma_left_available = xd->left_available;
+  if (ss_x && bw < mi_size_wide[BLOCK_8X8])
+    xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start;
+  if (ss_y && bh < mi_size_high[BLOCK_8X8])
+    xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start;
+  if (xd->up_available) {
+    xd->above_mbmi = xd->mi[-xd->mi_stride];
+  } else {
+    xd->above_mbmi = NULL;
+  }
+
+  if (xd->left_available) {
+    xd->left_mbmi = xd->mi[-1];
+  } else {
+    xd->left_mbmi = NULL;
+  }
+
+  const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) &&
+                         ((mi_col & 0x01) || !(bw & 0x01) || !ss_x);
+  xd->is_chroma_ref = chroma_ref;
+  if (chroma_ref) {
+    // To help calculate the "above" and "left" chroma blocks, note that the
+    // current block may cover multiple luma blocks (e.g., if partitioned into
+    // 4x4 luma blocks).
+    // First, find the top-left-most luma block covered by this chroma block.
+    MB_MODE_INFO **base_mi =
+        &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)];
+
+    // Then, we consider the luma region covered by the left or above 4x4
+    // chroma prediction. We want to point to the chroma reference block in
+    // that region, which is the bottom-right-most mi unit.
+    // This leads to the following offsets:
+    MB_MODE_INFO *chroma_above_mi =
+        xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL;
+    xd->chroma_above_mbmi = chroma_above_mi;
+
+    MB_MODE_INFO *chroma_left_mi =
+        xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL;
+    xd->chroma_left_mbmi = chroma_left_mi;
+  }
+
+  xd->height = bh;
+  xd->width = bw;
+  xd->is_sec_rect = 0;
+  if (xd->width < xd->height) {
+    // Only mark is_sec_rect as 1 for the last block.
+    // For PARTITION_VERT_4, it would be (0, 0, 0, 1);
+    // For other partitions, it would be (0, 1).
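+    // Editor's note (worked example, assuming an 8x8 luma block split by
+    // PARTITION_VERT into two 4x8 halves, i.e. width 1 and height 2 in mi
+    // units): for an even mi_col c, the left half gives (c + 1) & 1 == 1 and
+    // is_sec_rect stays 0; the right half gives (c + 2) & 1 == 0 and
+    // is_sec_rect becomes 1 -- the (0, 1) pattern described above.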
+    if (!((mi_col + xd->width) & (xd->height - 1))) xd->is_sec_rect = 1;
+  }
+
+  if (xd->width > xd->height)
+    if (mi_row & (xd->width - 1)) xd->is_sec_rect = 1;
+}
+
+static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx,
+                                           const MB_MODE_INFO *above_mi,
+                                           const MB_MODE_INFO *left_mi) {
+  const PREDICTION_MODE above = av1_above_block_mode(above_mi);
+  const PREDICTION_MODE left = av1_left_block_mode(left_mi);
+  const int above_ctx = intra_mode_context[above];
+  const int left_ctx = intra_mode_context[left];
+  return tile_ctx->kf_y_cdf[above_ctx][left_ctx];
+}
+
+static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row,
+                                            int mi_col, BLOCK_SIZE subsize,
+                                            BLOCK_SIZE bsize) {
+  PARTITION_CONTEXT *const above_ctx = xd->above_partition_context + mi_col;
+  PARTITION_CONTEXT *const left_ctx =
+      xd->left_partition_context + (mi_row & MAX_MIB_MASK);
+
+  const int bw = mi_size_wide[bsize];
+  const int bh = mi_size_high[bsize];
+  memset(above_ctx, partition_context_lookup[subsize].above, bw);
+  memset(left_ctx, partition_context_lookup[subsize].left, bh);
+}
+
+static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                      int subsampling_x, int subsampling_y) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int bw = mi_size_wide[bsize];
+  const int bh = mi_size_high[bsize];
+  int ref_pos = ((mi_row & 0x01) || !(bh & 0x01) || !subsampling_y) &&
+                ((mi_col & 0x01) || !(bw & 0x01) || !subsampling_x);
+  return ref_pos;
+}
+
+static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf,
+                                            size_t element) {
+  assert(cdf != NULL);
+  return (element > 0 ? cdf[element - 1] : CDF_PROB_TOP) - cdf[element];
+}
+
+static INLINE void partition_gather_horz_alike(aom_cdf_prob *out,
+                                               const aom_cdf_prob *const in,
+                                               BLOCK_SIZE bsize) {
+  out[0] = CDF_PROB_TOP;
+  out[0] -= cdf_element_prob(in, PARTITION_HORZ);
+  out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
+  out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
+  out[0] -= cdf_element_prob(in, PARTITION_HORZ_B);
+  out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
+  if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4);
+  out[0] = AOM_ICDF(out[0]);
+  out[1] = AOM_ICDF(CDF_PROB_TOP);
+}
+
+static INLINE void partition_gather_vert_alike(aom_cdf_prob *out,
+                                               const aom_cdf_prob *const in,
+                                               BLOCK_SIZE bsize) {
+  out[0] = CDF_PROB_TOP;
+  out[0] -= cdf_element_prob(in, PARTITION_VERT);
+  out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
+  out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
+  out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
+  out[0] -= cdf_element_prob(in, PARTITION_VERT_B);
+  if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4);
+  out[0] = AOM_ICDF(out[0]);
+  out[1] = AOM_ICDF(CDF_PROB_TOP);
+}
+
+static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row,
+                                                int mi_col, BLOCK_SIZE subsize,
+                                                BLOCK_SIZE bsize,
+                                                PARTITION_TYPE partition) {
+  if (bsize >= BLOCK_8X8) {
+    const int hbs = mi_size_wide[bsize] / 2;
+    BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+    switch (partition) {
+      case PARTITION_SPLIT:
+        if (bsize != BLOCK_8X8) break;
+        AOM_FALLTHROUGH_INTENDED;
+      case PARTITION_NONE:
+      case PARTITION_HORZ:
+      case PARTITION_VERT:
+      case PARTITION_HORZ_4:
+      case PARTITION_VERT_4:
+        update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+        break;
+      case PARTITION_HORZ_A:
+        update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+        update_partition_context(xd, mi_row + hbs, mi_col, subsize,
subsize); + break; + case PARTITION_HORZ_B: + update_partition_context(xd, mi_row, mi_col, subsize, subsize); + update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize); + break; + case PARTITION_VERT_A: + update_partition_context(xd, mi_row, mi_col, bsize2, subsize); + update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize); + break; + case PARTITION_VERT_B: + update_partition_context(xd, mi_row, mi_col, subsize, subsize); + update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize); + break; + default: assert(0 && "Invalid partition type"); + } + } +} + +static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + const PARTITION_CONTEXT *above_ctx = xd->above_partition_context + mi_col; + const PARTITION_CONTEXT *left_ctx = + xd->left_partition_context + (mi_row & MAX_MIB_MASK); + // Minimum partition point is 8x8. Offset the bsl accordingly. + const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8]; + int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1; + + assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]); + assert(bsl >= 0); + + return (left * 2 + above) + bsl * PARTITION_PLOFFSET; +} + +// Return the number of elements in the partition CDF when +// partitioning the (square) block with luma block size of bsize. +static INLINE int partition_cdf_length(BLOCK_SIZE bsize) { + if (bsize <= BLOCK_8X8) + return PARTITION_TYPES; + else if (bsize == BLOCK_128X128) + return EXT_PARTITION_TYPES - 2; + else + return EXT_PARTITION_TYPES; +} + +static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { + assert(bsize < BLOCK_SIZES_ALL); + int max_blocks_wide = block_size_wide[bsize]; + + if (xd->mb_to_right_edge < 0) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x); + } + + // Scale the width in the transform block unit. + return max_blocks_wide >> MI_SIZE_LOG2; +} + +static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { + int max_blocks_high = block_size_high[bsize]; + + if (xd->mb_to_bottom_edge < 0) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y); + } + + // Scale the height in the transform block unit. 
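+  // Editor's note (illustrative): mb_to_bottom_edge is stored in 1/8-pel
+  // units (via GET_MV_SUBPEL), so >> 3 converts it back to pixels and the
+  // extra subsampling shift maps luma pixels to this plane's pixels; e.g. a
+  // block extending 4 luma pixels past the bottom edge contributes
+  // -32 >> 3 == -4 pixels for plane 0.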
+  return max_blocks_high >> MI_SIZE_LOG2;
+}
+
+static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
+                                          const MACROBLOCKD *xd,
+                                          int mi_col_start, int mi_col_end,
+                                          const int tile_row) {
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  const int num_planes = av1_num_planes(cm);
+  const int width = mi_col_end - mi_col_start;
+  const int aligned_width =
+      ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2);
+  const int offset_y = mi_col_start;
+  const int width_y = aligned_width;
+  const int offset_uv = offset_y >> seq_params->subsampling_x;
+  const int width_uv = width_y >> seq_params->subsampling_x;
+  CommonContexts *const above_contexts = &cm->above_contexts;
+
+  av1_zero_array(above_contexts->entropy[0][tile_row] + offset_y, width_y);
+  if (num_planes > 1) {
+    if (above_contexts->entropy[1][tile_row] &&
+        above_contexts->entropy[2][tile_row]) {
+      av1_zero_array(above_contexts->entropy[1][tile_row] + offset_uv,
+                     width_uv);
+      av1_zero_array(above_contexts->entropy[2][tile_row] + offset_uv,
+                     width_uv);
+    } else {
+      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+                         "Invalid value of planes");
+    }
+  }
+
+  av1_zero_array(above_contexts->partition[tile_row] + mi_col_start,
+                 aligned_width);
+
+  memset(above_contexts->txfm[tile_row] + mi_col_start,
+         tx_size_wide[TX_SIZES_LARGEST], aligned_width * sizeof(TXFM_CONTEXT));
+}
+
+static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) {
+  av1_zero(xd->left_entropy_context);
+  av1_zero(xd->left_partition_context);
+
+  memset(xd->left_txfm_context_buffer, tx_size_high[TX_SIZES_LARGEST],
+         sizeof(xd->left_txfm_context_buffer));
+}
+
+// Disable array-bounds checks, as the TX_SIZE enum contains values larger than
+// TX_SIZES_ALL (TX_INVALID), which makes extending the array as a workaround
+// infeasible. The assert is enough for static analysis, and tools such as
+// asan or valgrind would catch out-of-bounds access at runtime.
+#if defined(__GNUC__) && __GNUC__ >= 4
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+
+#if defined(__GNUC__) && __GNUC__ >= 4
+#pragma GCC diagnostic warning "-Warray-bounds"
+#endif
+
+static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
+  int i;
+  for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
+}
+
+static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip,
+                                 const MACROBLOCKD *xd) {
+  uint8_t bw = tx_size_wide[tx_size];
+  uint8_t bh = tx_size_high[tx_size];
+
+  if (skip) {
+    bw = n4_w * MI_SIZE;
+    bh = n4_h * MI_SIZE;
+  }
+
+  set_txfm_ctx(xd->above_txfm_context, bw, n4_w);
+  set_txfm_ctx(xd->left_txfm_context, bh, n4_h);
+}
+
+static INLINE int get_mi_grid_idx(const CommonModeInfoParams *const mi_params,
+                                  int mi_row, int mi_col) {
+  return mi_row * mi_params->mi_stride + mi_col;
+}
+
+static INLINE int get_alloc_mi_idx(const CommonModeInfoParams *const mi_params,
+                                   int mi_row, int mi_col) {
+  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+  const int mi_alloc_row = mi_row / mi_alloc_size_1d;
+  const int mi_alloc_col = mi_col / mi_alloc_size_1d;
+
+  return mi_alloc_row * mi_params->mi_alloc_stride + mi_alloc_col;
+}
+
+// For this partition block, set pointers in mi_params->mi_grid_base and xd->mi.
+static INLINE void set_mi_offsets(const CommonModeInfoParams *const mi_params,
+                                  MACROBLOCKD *const xd, int mi_row,
+                                  int mi_col) {
+  // 'mi_grid_base' should point to appropriate memory in 'mi'.
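+  // Editor's note (hypothetical example): with mi_alloc_bsize == BLOCK_16X16,
+  // i.e. 4 mi units per side, the mi position (row 6, col 10) falls in alloc
+  // cell (6 / 4, 10 / 4) == (1, 2), while the grid index below is computed at
+  // full mi resolution.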
+  const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
+  const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col);
+  mi_params->mi_grid_base[mi_grid_idx] = &mi_params->mi_alloc[mi_alloc_idx];
+  // 'xd->mi' should point to an offset in 'mi_grid_base'.
+  xd->mi = mi_params->mi_grid_base + mi_grid_idx;
+  // 'xd->tx_type_map' should point to an offset in 'mi_params->tx_type_map'.
+  xd->tx_type_map = mi_params->tx_type_map + mi_grid_idx;
+  xd->tx_type_map_stride = mi_params->mi_stride;
+}
+
+static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
+                                         TXFM_CONTEXT *left_ctx,
+                                         TX_SIZE tx_size, TX_SIZE txb_size) {
+  BLOCK_SIZE bsize = txsize_to_bsize[txb_size];
+  int bh = mi_size_high[bsize];
+  int bw = mi_size_wide[bsize];
+  uint8_t txw = tx_size_wide[tx_size];
+  uint8_t txh = tx_size_high[tx_size];
+  int i;
+  for (i = 0; i < bh; ++i) left_ctx[i] = txh;
+  for (i = 0; i < bw; ++i) above_ctx[i] = txw;
+}
+
+static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) {
+  switch (tx_dim) {
+    case 128:
+    case 64: return TX_64X64;
+    case 32: return TX_32X32;
+    case 16: return TX_16X16;
+    case 8: return TX_8X8;
+    default: return TX_4X4;
+  }
+}
+
+static INLINE TX_SIZE get_tx_size(int width, int height) {
+  if (width == height) {
+    return get_sqr_tx_size(width);
+  }
+  if (width < height) {
+    if (width + width == height) {
+      switch (width) {
+        case 4: return TX_4X8;
+        case 8: return TX_8X16;
+        case 16: return TX_16X32;
+        case 32: return TX_32X64;
+      }
+    } else {
+      switch (width) {
+        case 4: return TX_4X16;
+        case 8: return TX_8X32;
+        case 16: return TX_16X64;
+      }
+    }
+  } else {
+    if (height + height == width) {
+      switch (height) {
+        case 4: return TX_8X4;
+        case 8: return TX_16X8;
+        case 16: return TX_32X16;
+        case 32: return TX_64X32;
+      }
+    } else {
+      switch (height) {
+        case 4: return TX_16X4;
+        case 8: return TX_32X8;
+        case 16: return TX_64X16;
+      }
+    }
+  }
+  assert(0);
+  return TX_4X4;
+}
+
+static INLINE int txfm_partition_context(const TXFM_CONTEXT *const above_ctx,
+                                         const TXFM_CONTEXT *const left_ctx,
+                                         BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  const uint8_t txw = tx_size_wide[tx_size];
+  const uint8_t txh = tx_size_high[tx_size];
+  const int above = *above_ctx < txw;
+  const int left = *left_ctx < txh;
+  int category = TXFM_PARTITION_CONTEXTS;
+
+  // Dummy return; the context is not used by callers for TX_4X4.
+  if (tx_size <= TX_4X4) return 0;
+
+  TX_SIZE max_tx_size =
+      get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize]));
+
+  if (max_tx_size >= TX_8X8) {
+    category =
+        (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) +
+        (TX_SIZES - 1 - max_tx_size) * 2;
+  }
+  assert(category != TXFM_PARTITION_CONTEXTS);
+  return category * 3 + above + left;
+}
+
+// Compute the next partition in the direction of the sb_type stored in the mi
+// array, starting with bsize.
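+// Editor's note (illustrative): e.g. if bsize is BLOCK_64X64 and the mi entry
+// holds a BLOCK_32X64 sub-block, the partition must be PARTITION_VERT or
+// PARTITION_VERT_B; the body below disambiguates them by checking whether the
+// right half was split further.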
+static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
+                                           int mi_row, int mi_col,
+                                           BLOCK_SIZE bsize) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols)
+    return PARTITION_INVALID;
+
+  const int offset = mi_row * mi_params->mi_stride + mi_col;
+  MB_MODE_INFO **mi = mi_params->mi_grid_base + offset;
+  const BLOCK_SIZE subsize = mi[0]->sb_type;
+
+  if (subsize == bsize) return PARTITION_NONE;
+
+  const int bhigh = mi_size_high[bsize];
+  const int bwide = mi_size_wide[bsize];
+  const int sshigh = mi_size_high[subsize];
+  const int sswide = mi_size_wide[subsize];
+
+  if (bsize > BLOCK_8X8 && mi_row + bhigh / 2 < mi_params->mi_rows &&
+      mi_col + bwide / 2 < mi_params->mi_cols) {
+    // In this case, the block might be using an extended partition
+    // type.
+    const MB_MODE_INFO *const mbmi_right = mi[bwide / 2];
+    const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * mi_params->mi_stride];
+
+    if (sswide == bwide) {
+      // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or
+      // PARTITION_HORZ_B. To distinguish the latter two, check if the lower
+      // half was split.
+      if (sshigh * 4 == bhigh) return PARTITION_HORZ_4;
+      assert(sshigh * 2 == bhigh);
+
+      if (mbmi_below->sb_type == subsize)
+        return PARTITION_HORZ;
+      else
+        return PARTITION_HORZ_B;
+    } else if (sshigh == bhigh) {
+      // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or
+      // PARTITION_VERT_B. To distinguish the latter two, check if the right
+      // half was split.
+      if (sswide * 4 == bwide) return PARTITION_VERT_4;
+      assert(sswide * 2 == bwide);
+
+      if (mbmi_right->sb_type == subsize)
+        return PARTITION_VERT;
+      else
+        return PARTITION_VERT_B;
+    } else {
+      // Smaller width and smaller height. Might be PARTITION_SPLIT or could be
+      // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both
+      // dimensions, we immediately know this is a split (which will recurse to
+      // get to subsize). Otherwise look down and to the right. With
+      // PARTITION_VERT_A, the right block will have height bhigh; with
+      // PARTITION_HORZ_A, the lower block will have width bwide. Otherwise
+      // it's PARTITION_SPLIT.
+      if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT;
+
+      if (mi_size_wide[mbmi_below->sb_type] == bwide) return PARTITION_HORZ_A;
+      if (mi_size_high[mbmi_right->sb_type] == bhigh) return PARTITION_VERT_A;
+
+      return PARTITION_SPLIT;
+    }
+  }
+  const int vert_split = sswide < bwide;
+  const int horz_split = sshigh < bhigh;
+  const int split_idx = (vert_split << 1) | horz_split;
+  assert(split_idx != 0);
+
+  static const PARTITION_TYPE base_partitions[4] = {
+    PARTITION_INVALID, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT
+  };
+
+  return base_partitions[split_idx];
+}
+
+static INLINE void set_sb_size(SequenceHeader *const seq_params,
+                               BLOCK_SIZE sb_size) {
+  seq_params->sb_size = sb_size;
+  seq_params->mib_size = mi_size_wide[seq_params->sb_size];
+  seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size];
+}
+
+// Returns true if the frame is fully lossless at the coded resolution.
+// Note: If super-resolution is used, such a frame will still NOT be lossless
+// at the upscaled resolution.
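+// Editor's note: xd->lossless[i] is assumed to be precomputed elsewhere from
+// segment i's effective qindex and delta-Q settings; the helper below only
+// aggregates those per-segment flags.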
+static INLINE int is_coded_lossless(const AV1_COMMON *cm,
+                                    const MACROBLOCKD *xd) {
+  int coded_lossless = 1;
+  if (cm->seg.enabled) {
+    for (int i = 0; i < MAX_SEGMENTS; ++i) {
+      if (!xd->lossless[i]) {
+        coded_lossless = 0;
+        break;
+      }
+    }
+  } else {
+    coded_lossless = xd->lossless[0];
+  }
+  return coded_lossless;
+}
+
+static INLINE int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) {
+  return seq_level_idx == SEQ_LEVEL_MAX ||
+         (seq_level_idx < SEQ_LEVELS &&
+          // The following levels are currently undefined.
+          seq_level_idx != SEQ_LEVEL_2_2 && seq_level_idx != SEQ_LEVEL_2_3 &&
+          seq_level_idx != SEQ_LEVEL_3_2 && seq_level_idx != SEQ_LEVEL_3_3 &&
+          seq_level_idx != SEQ_LEVEL_4_2 && seq_level_idx != SEQ_LEVEL_4_3 &&
+          seq_level_idx != SEQ_LEVEL_7_0 && seq_level_idx != SEQ_LEVEL_7_1 &&
+          seq_level_idx != SEQ_LEVEL_7_2 && seq_level_idx != SEQ_LEVEL_7_3);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_AV1_COMMON_INT_H_
diff --git a/libs/libaom/src/av1/common/av1_inv_txfm1d.c b/libs/libaom/src/av1/common/av1_inv_txfm1d.c
new file mode 100644
index 000000000..8d69efcd2
--- /dev/null
+++ b/libs/libaom/src/av1/common/av1_inv_txfm1d.c
@@ -0,0 +1,1841 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_txfm.h"
+
+void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range) {
+  assert(output != input);
+  const int32_t size = 4;
+  const int32_t *cospi = cospi_arr(cos_bit);
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[4];
+
+  // stage 0;
+
+  // stage 1;
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = input[2];
+  bf1[2] = input[1];
+  bf1[3] = input[3];
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+}
+
+void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range) {
+  assert(output != input);
+  const int32_t size = 8;
+  const int32_t *cospi = cospi_arr(cos_bit);
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[8];
+
+  // stage 0;
+
+  // stage 1;
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = input[4];
+  bf1[2] = input[2];
+  bf1[3] = input[6];
+  bf1[4] = input[1];
+  bf1[5] = input[5];
+  bf1[6] = input[3];
+  bf1[7] = input[7];
+  av1_range_check_buf(stage, input, bf1, size,
stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[7] = bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); +} + +void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 16; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0]; + bf1[1] = input[8]; + bf1[2] = input[4]; + bf1[3] = input[12]; + bf1[4] = input[2]; + bf1[5] = input[10]; + bf1[6] = input[6]; + bf1[7] = input[14]; + bf1[8] = input[1]; + bf1[9] = input[9]; + bf1[10] = input[5]; + bf1[11] = input[13]; + bf1[12] = input[3]; + bf1[13] = input[11]; + bf1[14] = input[7]; + bf1[15] = input[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); + bf1[13] = 
half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); + bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); + bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); + bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); + bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); + bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]); + bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); + bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); + bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] 
= clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); + bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); +} + +void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 32; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[32]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0]; + bf1[1] = input[16]; + bf1[2] = input[8]; + bf1[3] = input[24]; + bf1[4] = input[4]; + bf1[5] = input[20]; + bf1[6] = input[12]; + bf1[7] = input[28]; + bf1[8] = input[2]; + bf1[9] = input[18]; + bf1[10] = input[10]; + bf1[11] = input[26]; + bf1[12] = input[6]; + bf1[13] = input[22]; + bf1[14] = input[14]; + bf1[15] = input[30]; + bf1[16] = input[1]; + bf1[17] = input[17]; + bf1[18] = input[9]; + bf1[19] = input[25]; + bf1[20] = input[5]; + bf1[21] = input[21]; + bf1[22] = input[13]; + bf1[23] = input[29]; + bf1[24] = input[3]; + bf1[25] = input[19]; + bf1[26] = input[11]; + bf1[27] = input[27]; + bf1[28] = input[7]; + bf1[29] = input[23]; + bf1[30] = input[15]; + bf1[31] = input[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit); 
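+  // Editor's note: half_btf(w0, in0, w1, in1, bit) computes
+  // round_shift(w0 * in0 + w1 * in1, bit); the mirrored index pairs in this
+  // stage (16/31, 17/30, ...) thus form 2-point butterfly rotations.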
+ bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit); + bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); + bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); + bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]); + bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]); + bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]); + bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]); + bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]); + bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]); + bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]); + bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]); + bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); + bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); + bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); + bf1[13] = clamp_value(bf0[12] - bf0[13], 
stage_range[stage]); + bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); + bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit); + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]); + bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]); + bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]); + bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]); + bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]); + bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]); + bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]); + bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); + bf1[9] = 
clamp_value(bf0[9] + bf0[10], stage_range[stage]); + bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); + bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); + bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit); + bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]); + bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]); + bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]); + bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]); + bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]); + bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]); + bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[12], 
stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); + bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]); + bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]); + bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]); + bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]); + bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]); + bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]); + bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[1] 
- bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]); +} + +void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + int bit = cos_bit; + const int32_t *sinpi = sinpi_arr(bit); + int32_t s0, s1, s2, s3, s4, s5, s6, s7; + + int32_t x0 = input[0]; + int32_t x1 = input[1]; + int32_t x2 = input[2]; + int32_t x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + assert(sinpi[1] + sinpi[2] == sinpi[4]); + + // stage 1 + s0 = range_check_value(sinpi[1] * x0, stage_range[1] + bit); + s1 = range_check_value(sinpi[2] * x0, stage_range[1] + bit); + s2 = range_check_value(sinpi[3] * x1, stage_range[1] + bit); + s3 = range_check_value(sinpi[4] * x2, stage_range[1] + bit); + s4 = range_check_value(sinpi[1] * x2, stage_range[1] + bit); + s5 = range_check_value(sinpi[2] * x3, stage_range[1] + bit); + s6 = range_check_value(sinpi[4] * x3, stage_range[1] + bit); + + // stage 2 + // NOTICE: (x0 - x2) here may use one extra bit compared to the + // opt_range_row/col specified in av1_gen_inv_stage_range() + s7 = range_check_value((x0 - x2) + x3, stage_range[2]); + + // stage 3 + s0 = range_check_value(s0 + s3, stage_range[3] + bit); + s1 = range_check_value(s1 - s4, stage_range[3] + bit); + s3 = range_check_value(s2, stage_range[3] + bit); + s2 = range_check_value(sinpi[3] * s7, stage_range[3] + bit); + + // stage 4 + s0 = range_check_value(s0 + s5, stage_range[4] + bit); + s1 = range_check_value(s1 - s6, stage_range[4] + bit); + + // stage 5 + x0 = range_check_value(s0 + s3, stage_range[5] + bit); + x1 = range_check_value(s1 + s3, stage_range[5] + bit); + x2 = range_check_value(s2, stage_range[5] + bit); + x3 = range_check_value(s0 + s1, stage_range[5] + bit); + + // stage 6 + x3 = range_check_value(x3 - s3, stage_range[6] + bit); + + output[0] = round_shift(x0, bit); + output[1] = round_shift(x1, bit); + output[2] = round_shift(x2, bit); + output[3] = round_shift(x3, bit); +} + +void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 8; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[8]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[7]; + bf1[1] = input[0]; + bf1[2] = input[5]; + bf1[3] = input[2]; + bf1[4] = input[3]; + bf1[5] = input[4]; + bf1[6] = input[1]; + bf1[7] = input[6]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[7], 
stage_range[stage]); + bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]); + bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]); + bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = -bf0[4]; + bf1[2] = bf0[6]; + bf1[3] = -bf0[2]; + bf1[4] = bf0[3]; + bf1[5] = -bf0[7]; + bf1[6] = bf0[5]; + bf1[7] = -bf0[1]; +} + +void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 16; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[15]; + bf1[1] = input[0]; + bf1[2] = input[13]; + bf1[3] = input[2]; + bf1[4] = input[11]; + bf1[5] = input[4]; + bf1[6] = input[9]; + bf1[7] = input[6]; + bf1[8] = input[7]; + bf1[9] = input[8]; + bf1[10] = input[5]; + bf1[11] = input[10]; + bf1[12] = input[3]; + bf1[13] = input[12]; + bf1[14] = input[1]; + bf1[15] = input[14]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit); + bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[30], bf0[8], 
-cospi[34], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit); + bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[8], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[9], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[10], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[11], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[12], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[13], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[14], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[15], stage_range[stage]); + bf1[8] = clamp_value(bf0[0] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[1] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[2] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[3] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[4] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit); + bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]); + bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[12], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[13], stage_range[stage]); + bf1[10] = clamp_value(bf0[10] + bf0[14], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[15], stage_range[stage]); + bf1[12] = clamp_value(bf0[8] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]); + 
av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]); + bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]); + bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[10], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[11], stage_range[stage]); + bf1[10] = clamp_value(bf0[8] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[9] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[14], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]); + bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit); + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = -bf0[8]; + bf1[2] = bf0[12]; + bf1[3] = -bf0[4]; + bf1[4] = bf0[6]; + bf1[5] = -bf0[14]; + bf1[6] = bf0[10]; + bf1[7] = -bf0[2]; + bf1[8] = bf0[3]; + bf1[9] = -bf0[11]; + bf1[10] = bf0[15]; + bf1[11] = -bf0[7]; + bf1[12] = bf0[5]; + bf1[13] = -bf0[13]; + bf1[14] = bf0[9]; + bf1[15] = -bf0[1]; +} + +void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + (void)stage_range; + for (int i = 0; i < 4; ++i) { + output[i] = round_shift((int64_t)NewSqrt2 * input[i], NewSqrt2Bits); + } + assert(stage_range[0] + NewSqrt2Bits <= 32); +} + +void 
av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + (void)stage_range; + for (int i = 0; i < 8; ++i) output[i] = (int32_t)((int64_t)input[i] * 2); +} + +void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + (void)stage_range; + for (int i = 0; i < 16; ++i) + output[i] = round_shift((int64_t)NewSqrt2 * 2 * input[i], NewSqrt2Bits); + assert(stage_range[0] + NewSqrt2Bits <= 32); +} + +void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + (void)stage_range; + for (int i = 0; i < 32; ++i) output[i] = (int32_t)((int64_t)input[i] * 4); +} + +void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 64; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[64]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0]; + bf1[1] = input[32]; + bf1[2] = input[16]; + bf1[3] = input[48]; + bf1[4] = input[8]; + bf1[5] = input[40]; + bf1[6] = input[24]; + bf1[7] = input[56]; + bf1[8] = input[4]; + bf1[9] = input[36]; + bf1[10] = input[20]; + bf1[11] = input[52]; + bf1[12] = input[12]; + bf1[13] = input[44]; + bf1[14] = input[28]; + bf1[15] = input[60]; + bf1[16] = input[2]; + bf1[17] = input[34]; + bf1[18] = input[18]; + bf1[19] = input[50]; + bf1[20] = input[10]; + bf1[21] = input[42]; + bf1[22] = input[26]; + bf1[23] = input[58]; + bf1[24] = input[6]; + bf1[25] = input[38]; + bf1[26] = input[22]; + bf1[27] = input[54]; + bf1[28] = input[14]; + bf1[29] = input[46]; + bf1[30] = input[30]; + bf1[31] = input[62]; + bf1[32] = input[1]; + bf1[33] = input[33]; + bf1[34] = input[17]; + bf1[35] = input[49]; + bf1[36] = input[9]; + bf1[37] = input[41]; + bf1[38] = input[25]; + bf1[39] = input[57]; + bf1[40] = input[5]; + bf1[41] = input[37]; + bf1[42] = input[21]; + bf1[43] = input[53]; + bf1[44] = input[13]; + bf1[45] = input[45]; + bf1[46] = input[29]; + bf1[47] = input[61]; + bf1[48] = input[3]; + bf1[49] = input[35]; + bf1[50] = input[19]; + bf1[51] = input[51]; + bf1[52] = input[11]; + bf1[53] = input[43]; + bf1[54] = input[27]; + bf1[55] = input[59]; + bf1[56] = input[7]; + bf1[57] = input[39]; + bf1[58] = input[23]; + bf1[59] = input[55]; + bf1[60] = input[15]; + bf1[61] = input[47]; + bf1[62] = input[31]; + bf1[63] = input[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = bf0[21]; + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = bf0[26]; + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit); + bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit); + bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit); + bf1[35] = 
half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit); + bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit); + bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit); + bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit); + bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit); + bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit); + bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit); + bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit); + bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit); + bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit); + bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit); + bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit); + bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit); + bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit); + bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit); + bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit); + bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit); + bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit); + bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit); + bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit); + bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit); + bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit); + bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit); + bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit); + bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit); + bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit); + bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit); + bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], 
cos_bit); + bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit); + bf1[32] = clamp_value(bf0[32] + bf0[33], stage_range[stage]); + bf1[33] = clamp_value(bf0[32] - bf0[33], stage_range[stage]); + bf1[34] = clamp_value(-bf0[34] + bf0[35], stage_range[stage]); + bf1[35] = clamp_value(bf0[34] + bf0[35], stage_range[stage]); + bf1[36] = clamp_value(bf0[36] + bf0[37], stage_range[stage]); + bf1[37] = clamp_value(bf0[36] - bf0[37], stage_range[stage]); + bf1[38] = clamp_value(-bf0[38] + bf0[39], stage_range[stage]); + bf1[39] = clamp_value(bf0[38] + bf0[39], stage_range[stage]); + bf1[40] = clamp_value(bf0[40] + bf0[41], stage_range[stage]); + bf1[41] = clamp_value(bf0[40] - bf0[41], stage_range[stage]); + bf1[42] = clamp_value(-bf0[42] + bf0[43], stage_range[stage]); + bf1[43] = clamp_value(bf0[42] + bf0[43], stage_range[stage]); + bf1[44] = clamp_value(bf0[44] + bf0[45], stage_range[stage]); + bf1[45] = clamp_value(bf0[44] - bf0[45], stage_range[stage]); + bf1[46] = clamp_value(-bf0[46] + bf0[47], stage_range[stage]); + bf1[47] = clamp_value(bf0[46] + bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[48] + bf0[49], stage_range[stage]); + bf1[49] = clamp_value(bf0[48] - bf0[49], stage_range[stage]); + bf1[50] = clamp_value(-bf0[50] + bf0[51], stage_range[stage]); + bf1[51] = clamp_value(bf0[50] + bf0[51], stage_range[stage]); + bf1[52] = clamp_value(bf0[52] + bf0[53], stage_range[stage]); + bf1[53] = clamp_value(bf0[52] - bf0[53], stage_range[stage]); + bf1[54] = clamp_value(-bf0[54] + bf0[55], stage_range[stage]); + bf1[55] = clamp_value(bf0[54] + bf0[55], stage_range[stage]); + bf1[56] = clamp_value(bf0[56] + bf0[57], stage_range[stage]); + bf1[57] = clamp_value(bf0[56] - bf0[57], stage_range[stage]); + bf1[58] = clamp_value(-bf0[58] + bf0[59], stage_range[stage]); + bf1[59] = clamp_value(bf0[58] + bf0[59], stage_range[stage]); + bf1[60] = clamp_value(bf0[60] + bf0[61], stage_range[stage]); + bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]); + bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]); + bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); + bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); + bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]); + bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]); + bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]); + bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]); + bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]); 
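+ // A note on the two primitives used throughout these stages: cospi[j] is
+ // the fixed-point constant round(2^cos_bit * cos(j * PI / 128)), and
+ // half_btf(w0, a, w1, b, bit) returns round_shift(w0 * a + w1 * b, bit)
+ // (both live in av1/common/av1_txfm.h). Each half_btf pair is therefore one
+ // planar rotation, while the clamp_value() pairs are the saturating
+ // add/sub halves of a butterfly, clamped to the per-stage bit range.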
+ bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]); + bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]); + bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]); + bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]); + bf1[32] = bf0[32]; + bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit); + bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit); + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit); + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit); + bf1[43] = bf0[43]; + bf1[44] = bf0[44]; + bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit); + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit); + bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit); + bf1[51] = bf0[51]; + bf1[52] = bf0[52]; + bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit); + bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit); + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit); + bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit); + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit); + bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit); + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); + bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); + bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); + bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); + bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], 
cos_bit); + bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit); + bf1[31] = bf0[31]; + bf1[32] = clamp_value(bf0[32] + bf0[35], stage_range[stage]); + bf1[33] = clamp_value(bf0[33] + bf0[34], stage_range[stage]); + bf1[34] = clamp_value(bf0[33] - bf0[34], stage_range[stage]); + bf1[35] = clamp_value(bf0[32] - bf0[35], stage_range[stage]); + bf1[36] = clamp_value(-bf0[36] + bf0[39], stage_range[stage]); + bf1[37] = clamp_value(-bf0[37] + bf0[38], stage_range[stage]); + bf1[38] = clamp_value(bf0[37] + bf0[38], stage_range[stage]); + bf1[39] = clamp_value(bf0[36] + bf0[39], stage_range[stage]); + bf1[40] = clamp_value(bf0[40] + bf0[43], stage_range[stage]); + bf1[41] = clamp_value(bf0[41] + bf0[42], stage_range[stage]); + bf1[42] = clamp_value(bf0[41] - bf0[42], stage_range[stage]); + bf1[43] = clamp_value(bf0[40] - bf0[43], stage_range[stage]); + bf1[44] = clamp_value(-bf0[44] + bf0[47], stage_range[stage]); + bf1[45] = clamp_value(-bf0[45] + bf0[46], stage_range[stage]); + bf1[46] = clamp_value(bf0[45] + bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[44] + bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[48] + bf0[51], stage_range[stage]); + bf1[49] = clamp_value(bf0[49] + bf0[50], stage_range[stage]); + bf1[50] = clamp_value(bf0[49] - bf0[50], stage_range[stage]); + bf1[51] = clamp_value(bf0[48] - bf0[51], stage_range[stage]); + bf1[52] = clamp_value(-bf0[52] + bf0[55], stage_range[stage]); + bf1[53] = clamp_value(-bf0[53] + bf0[54], stage_range[stage]); + bf1[54] = clamp_value(bf0[53] + bf0[54], stage_range[stage]); + bf1[55] = clamp_value(bf0[52] + bf0[55], stage_range[stage]); + bf1[56] = clamp_value(bf0[56] + bf0[59], stage_range[stage]); + bf1[57] = clamp_value(bf0[57] + bf0[58], stage_range[stage]); + bf1[58] = clamp_value(bf0[57] - bf0[58], stage_range[stage]); + bf1[59] = clamp_value(bf0[56] - bf0[59], stage_range[stage]); + bf1[60] = clamp_value(-bf0[60] + bf0[63], stage_range[stage]); + bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]); + bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]); + bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]); + bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]); + 
bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]); + bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]); + bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]); + bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]); + bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]); + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit); + bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit); + bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit); + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = bf0[41]; + bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit); + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit); + bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit); + bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit); + bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit); + bf1[54] = bf0[54]; + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit); + bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit); + bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit); + bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit); + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]); + bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); + bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); + bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] 
= bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit); + bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = clamp_value(bf0[32] + bf0[39], stage_range[stage]); + bf1[33] = clamp_value(bf0[33] + bf0[38], stage_range[stage]); + bf1[34] = clamp_value(bf0[34] + bf0[37], stage_range[stage]); + bf1[35] = clamp_value(bf0[35] + bf0[36], stage_range[stage]); + bf1[36] = clamp_value(bf0[35] - bf0[36], stage_range[stage]); + bf1[37] = clamp_value(bf0[34] - bf0[37], stage_range[stage]); + bf1[38] = clamp_value(bf0[33] - bf0[38], stage_range[stage]); + bf1[39] = clamp_value(bf0[32] - bf0[39], stage_range[stage]); + bf1[40] = clamp_value(-bf0[40] + bf0[47], stage_range[stage]); + bf1[41] = clamp_value(-bf0[41] + bf0[46], stage_range[stage]); + bf1[42] = clamp_value(-bf0[42] + bf0[45], stage_range[stage]); + bf1[43] = clamp_value(-bf0[43] + bf0[44], stage_range[stage]); + bf1[44] = clamp_value(bf0[43] + bf0[44], stage_range[stage]); + bf1[45] = clamp_value(bf0[42] + bf0[45], stage_range[stage]); + bf1[46] = clamp_value(bf0[41] + bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[40] + bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[48] + bf0[55], stage_range[stage]); + bf1[49] = clamp_value(bf0[49] + bf0[54], stage_range[stage]); + bf1[50] = clamp_value(bf0[50] + bf0[53], stage_range[stage]); + bf1[51] = clamp_value(bf0[51] + bf0[52], stage_range[stage]); + bf1[52] = clamp_value(bf0[51] - bf0[52], stage_range[stage]); + bf1[53] = clamp_value(bf0[50] - bf0[53], stage_range[stage]); + bf1[54] = clamp_value(bf0[49] - bf0[54], stage_range[stage]); + bf1[55] = clamp_value(bf0[48] - bf0[55], stage_range[stage]); + bf1[56] = clamp_value(-bf0[56] + bf0[63], stage_range[stage]); + bf1[57] = clamp_value(-bf0[57] + bf0[62], stage_range[stage]); + bf1[58] = clamp_value(-bf0[58] + bf0[61], stage_range[stage]); + bf1[59] = clamp_value(-bf0[59] + bf0[60], stage_range[stage]); + bf1[60] = clamp_value(bf0[59] + bf0[60], stage_range[stage]); + bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]); + bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]); + bf1[18] = clamp_value(bf0[18] 
+ bf0[21], stage_range[stage]); + bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]); + bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]); + bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]); + bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]); + bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]); + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit); + bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit); + bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit); + bf1[44] = bf0[44]; + bf1[45] = bf0[45]; + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = bf0[50]; + bf1[51] = bf0[51]; + bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit); + bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit); + bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit); + bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit); + bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit); + bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit); + bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit); + bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit); + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); + bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = 
bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = clamp_value(bf0[32] + bf0[47], stage_range[stage]); + bf1[33] = clamp_value(bf0[33] + bf0[46], stage_range[stage]); + bf1[34] = clamp_value(bf0[34] + bf0[45], stage_range[stage]); + bf1[35] = clamp_value(bf0[35] + bf0[44], stage_range[stage]); + bf1[36] = clamp_value(bf0[36] + bf0[43], stage_range[stage]); + bf1[37] = clamp_value(bf0[37] + bf0[42], stage_range[stage]); + bf1[38] = clamp_value(bf0[38] + bf0[41], stage_range[stage]); + bf1[39] = clamp_value(bf0[39] + bf0[40], stage_range[stage]); + bf1[40] = clamp_value(bf0[39] - bf0[40], stage_range[stage]); + bf1[41] = clamp_value(bf0[38] - bf0[41], stage_range[stage]); + bf1[42] = clamp_value(bf0[37] - bf0[42], stage_range[stage]); + bf1[43] = clamp_value(bf0[36] - bf0[43], stage_range[stage]); + bf1[44] = clamp_value(bf0[35] - bf0[44], stage_range[stage]); + bf1[45] = clamp_value(bf0[34] - bf0[45], stage_range[stage]); + bf1[46] = clamp_value(bf0[33] - bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[32] - bf0[47], stage_range[stage]); + bf1[48] = clamp_value(-bf0[48] + bf0[63], stage_range[stage]); + bf1[49] = clamp_value(-bf0[49] + bf0[62], stage_range[stage]); + bf1[50] = clamp_value(-bf0[50] + bf0[61], stage_range[stage]); + bf1[51] = clamp_value(-bf0[51] + bf0[60], stage_range[stage]); + bf1[52] = clamp_value(-bf0[52] + bf0[59], stage_range[stage]); + bf1[53] = clamp_value(-bf0[53] + bf0[58], stage_range[stage]); + bf1[54] = clamp_value(-bf0[54] + bf0[57], stage_range[stage]); + bf1[55] = clamp_value(-bf0[55] + bf0[56], stage_range[stage]); + bf1[56] = clamp_value(bf0[55] + bf0[56], stage_range[stage]); + bf1[57] = clamp_value(bf0[54] + bf0[57], stage_range[stage]); + bf1[58] = clamp_value(bf0[53] + bf0[58], stage_range[stage]); + bf1[59] = clamp_value(bf0[52] + bf0[59], stage_range[stage]); + bf1[60] = clamp_value(bf0[51] + bf0[60], stage_range[stage]); + bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]); + bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 10 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]); + bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]); + bf1[11] = 
clamp_value(bf0[11] + bf0[20], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]); + bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]); + bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]); + bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]); + bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]); + bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]); + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = bf0[37]; + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); + bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); + bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); + bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); + bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); + bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); + bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); + bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); + bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = bf0[58]; + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 11 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[63], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[62], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[61], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[60], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[59], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[58], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[57], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[56], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[55], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[54], stage_range[stage]); + 
bf1[10] = clamp_value(bf0[10] + bf0[53], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[52], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[51], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[50], stage_range[stage]); + bf1[14] = clamp_value(bf0[14] + bf0[49], stage_range[stage]); + bf1[15] = clamp_value(bf0[15] + bf0[48], stage_range[stage]); + bf1[16] = clamp_value(bf0[16] + bf0[47], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[46], stage_range[stage]); + bf1[18] = clamp_value(bf0[18] + bf0[45], stage_range[stage]); + bf1[19] = clamp_value(bf0[19] + bf0[44], stage_range[stage]); + bf1[20] = clamp_value(bf0[20] + bf0[43], stage_range[stage]); + bf1[21] = clamp_value(bf0[21] + bf0[42], stage_range[stage]); + bf1[22] = clamp_value(bf0[22] + bf0[41], stage_range[stage]); + bf1[23] = clamp_value(bf0[23] + bf0[40], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[39], stage_range[stage]); + bf1[25] = clamp_value(bf0[25] + bf0[38], stage_range[stage]); + bf1[26] = clamp_value(bf0[26] + bf0[37], stage_range[stage]); + bf1[27] = clamp_value(bf0[27] + bf0[36], stage_range[stage]); + bf1[28] = clamp_value(bf0[28] + bf0[35], stage_range[stage]); + bf1[29] = clamp_value(bf0[29] + bf0[34], stage_range[stage]); + bf1[30] = clamp_value(bf0[30] + bf0[33], stage_range[stage]); + bf1[31] = clamp_value(bf0[31] + bf0[32], stage_range[stage]); + bf1[32] = clamp_value(bf0[31] - bf0[32], stage_range[stage]); + bf1[33] = clamp_value(bf0[30] - bf0[33], stage_range[stage]); + bf1[34] = clamp_value(bf0[29] - bf0[34], stage_range[stage]); + bf1[35] = clamp_value(bf0[28] - bf0[35], stage_range[stage]); + bf1[36] = clamp_value(bf0[27] - bf0[36], stage_range[stage]); + bf1[37] = clamp_value(bf0[26] - bf0[37], stage_range[stage]); + bf1[38] = clamp_value(bf0[25] - bf0[38], stage_range[stage]); + bf1[39] = clamp_value(bf0[24] - bf0[39], stage_range[stage]); + bf1[40] = clamp_value(bf0[23] - bf0[40], stage_range[stage]); + bf1[41] = clamp_value(bf0[22] - bf0[41], stage_range[stage]); + bf1[42] = clamp_value(bf0[21] - bf0[42], stage_range[stage]); + bf1[43] = clamp_value(bf0[20] - bf0[43], stage_range[stage]); + bf1[44] = clamp_value(bf0[19] - bf0[44], stage_range[stage]); + bf1[45] = clamp_value(bf0[18] - bf0[45], stage_range[stage]); + bf1[46] = clamp_value(bf0[17] - bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[16] - bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[15] - bf0[48], stage_range[stage]); + bf1[49] = clamp_value(bf0[14] - bf0[49], stage_range[stage]); + bf1[50] = clamp_value(bf0[13] - bf0[50], stage_range[stage]); + bf1[51] = clamp_value(bf0[12] - bf0[51], stage_range[stage]); + bf1[52] = clamp_value(bf0[11] - bf0[52], stage_range[stage]); + bf1[53] = clamp_value(bf0[10] - bf0[53], stage_range[stage]); + bf1[54] = clamp_value(bf0[9] - bf0[54], stage_range[stage]); + bf1[55] = clamp_value(bf0[8] - bf0[55], stage_range[stage]); + bf1[56] = clamp_value(bf0[7] - bf0[56], stage_range[stage]); + bf1[57] = clamp_value(bf0[6] - bf0[57], stage_range[stage]); + bf1[58] = clamp_value(bf0[5] - bf0[58], stage_range[stage]); + bf1[59] = clamp_value(bf0[4] - bf0[59], stage_range[stage]); + bf1[60] = clamp_value(bf0[3] - bf0[60], stage_range[stage]); + bf1[61] = clamp_value(bf0[2] - bf0[61], stage_range[stage]); + bf1[62] = clamp_value(bf0[1] - bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[0] - bf0[63], stage_range[stage]); +} diff --git a/libs/libaom/src/av1/common/av1_inv_txfm1d.h b/libs/libaom/src/av1/common/av1_inv_txfm1d.h new 
file mode 100644 index 000000000..e1d5d98d1 --- /dev/null +++ b/libs/libaom/src/av1/common/av1_inv_txfm1d.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ +#define AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ + +#include "av1/common/av1_txfm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE int32_t clamp_value(int32_t value, int8_t bit) { + if (bit <= 0) return value; // Do nothing for invalid clamp bit. + const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); + return (int32_t)clamp64(value, min_value, max_value); +} + +static INLINE void clamp_buf(int32_t *buf, int32_t size, int8_t bit) { + for (int i = 0; i < size; ++i) buf[i] = clamp_value(buf[i], bit); +} + +void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ diff --git a/libs/libaom/src/av1/common/av1_inv_txfm1d_cfg.h b/libs/libaom/src/av1/common/av1_inv_txfm1d_cfg.h new file mode 100644 index 000000000..47fedbd2a --- /dev/null +++ b/libs/libaom/src/av1/common/av1_inv_txfm1d_cfg.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ +#define AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ +#include "av1/common/av1_inv_txfm1d.h" + +// sum of fwd_shift_## +static const int8_t inv_start_range[TX_SIZES_ALL] = { + 5, // 4x4 transform + 6, // 8x8 transform + 7, // 16x16 transform + 7, // 32x32 transform + 7, // 64x64 transform + 5, // 4x8 transform + 5, // 8x4 transform + 6, // 8x16 transform + 6, // 16x8 transform + 6, // 16x32 transform + 6, // 32x16 transform + 6, // 32x64 transform + 6, // 64x32 transform + 6, // 4x16 transform + 6, // 16x4 transform + 7, // 8x32 transform + 7, // 32x8 transform + 7, // 16x64 transform + 7, // 64x16 transform +}; + +extern const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL]; + +// Values in both av1_inv_cos_bit_col and av1_inv_cos_bit_row are always 12 +// for each valid row and col combination +#define INV_COS_BIT 12 +extern const int8_t av1_inv_cos_bit_col[5 /*row*/][5 /*col*/]; +extern const int8_t av1_inv_cos_bit_row[5 /*row*/][5 /*col*/]; + +#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ diff --git a/libs/libaom/src/av1/common/av1_inv_txfm2d.c b/libs/libaom/src/av1/common/av1_inv_txfm2d.c new file mode 100644 index 000000000..559d12129 --- /dev/null +++ b/libs/libaom/src/av1/common/av1_inv_txfm2d.c @@ -0,0 +1,504 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/av1_inv_txfm1d.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" + +void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. 
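+      Each 4-sample pass below costs 7 additions and one shift (the lifting
+      step e1 = (a1 - d1) >> 1); run over rows and then columns, that is the
+      3.5 adds and 0.5 shifts per pixel quoted above, and the lifting
+      structure is what keeps the integer transform exactly reversible.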
*/ + int i; + tran_low_t output[16]; + tran_low_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + for (i = 0; i < 4; i++) { + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + + op[0] = a1; + op[1] = b1; + op[2] = c1; + op[3] = d1; + ip += 4; + op += 4; + } + + ip = output; + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + c1 = ip[4 * 1]; + d1 = ip[4 * 2]; + b1 = ip[4 * 3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + + range_check_value(a1, bd + 1); + range_check_value(b1, bd + 1); + range_check_value(c1, bd + 1); + range_check_value(d1, bd + 1); + + dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); + dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd); + dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd); + dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd); + + ip++; + dest++; + } +} + +void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, + int dest_stride, int bd) { + int i; + tran_low_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = in; + tran_low_t *op = tmp; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + (void)bd; + + a1 = ip[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + op[0] = a1; + op[1] = op[2] = op[3] = e1; + + ip = tmp; + for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[dest_stride * 0] = + highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); + dest[dest_stride * 1] = + highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); + dest[dest_stride * 2] = + highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); + dest[dest_stride * 3] = + highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); + ip++; + dest++; + } +} + +static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT4: return av1_idct4; + case TXFM_TYPE_DCT8: return av1_idct8; + case TXFM_TYPE_DCT16: return av1_idct16; + case TXFM_TYPE_DCT32: return av1_idct32; + case TXFM_TYPE_DCT64: return av1_idct64; + case TXFM_TYPE_ADST4: return av1_iadst4; + case TXFM_TYPE_ADST8: return av1_iadst8; + case TXFM_TYPE_ADST16: return av1_iadst16; + case TXFM_TYPE_IDENTITY4: return av1_iidentity4_c; + case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c; + case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c; + case TXFM_TYPE_IDENTITY32: return av1_iidentity32_c; + default: assert(0); return NULL; + } +} + +static const int8_t inv_shift_4x4[2] = { 0, -4 }; +static const int8_t inv_shift_8x8[2] = { -1, -4 }; +static const int8_t inv_shift_16x16[2] = { -2, -4 }; +static const int8_t inv_shift_32x32[2] = { -2, -4 }; +static const int8_t inv_shift_64x64[2] = { -2, -4 }; +static const int8_t inv_shift_4x8[2] = { 0, -4 }; +static const int8_t inv_shift_8x4[2] = { 0, -4 }; +static const int8_t inv_shift_8x16[2] = { -1, -4 }; +static const int8_t inv_shift_16x8[2] = { -1, -4 }; +static const int8_t inv_shift_16x32[2] = { -1, -4 }; +static const int8_t inv_shift_32x16[2] = { -1, -4 }; +static const int8_t inv_shift_32x64[2] = { -1, -4 }; +static const int8_t inv_shift_64x32[2] = { -1, -4 }; +static const int8_t inv_shift_4x16[2] = { -1, -4 }; +static const int8_t inv_shift_16x4[2] = { -1, -4 }; +static const int8_t inv_shift_8x32[2] = { -2, -4 }; +static 
const int8_t inv_shift_32x8[2] = { -2, -4 }; +static const int8_t inv_shift_16x64[2] = { -2, -4 }; +static const int8_t inv_shift_64x16[2] = { -2, -4 }; + +const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL] = { + inv_shift_4x4, inv_shift_8x8, inv_shift_16x16, inv_shift_32x32, + inv_shift_64x64, inv_shift_4x8, inv_shift_8x4, inv_shift_8x16, + inv_shift_16x8, inv_shift_16x32, inv_shift_32x16, inv_shift_32x64, + inv_shift_64x32, inv_shift_4x16, inv_shift_16x4, inv_shift_8x32, + inv_shift_32x8, inv_shift_16x64, inv_shift_64x16, +}; + +/* clang-format off */ +const int8_t av1_inv_cos_bit_col[MAX_TXWH_IDX] // txw_idx + [MAX_TXWH_IDX] = { // txh_idx + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0, 0 }, + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0 }, + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }, + { 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }, + { 0, 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT } + }; + +const int8_t av1_inv_cos_bit_row[MAX_TXWH_IDX] // txw_idx + [MAX_TXWH_IDX] = { // txh_idx + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0, 0 }, + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0 }, + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }, + { 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }, + { 0, 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT } + }; +/* clang-format on */ + +static const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 }; + +void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg) { + assert(cfg != NULL); + cfg->tx_size = tx_size; + av1_zero(cfg->stage_range_col); + av1_zero(cfg->stage_range_row); + set_flip_cfg(tx_type, cfg); + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + cfg->shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + cfg->cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + cfg->cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; + if (cfg->txfm_type_col == TXFM_TYPE_ADST4) { + memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range)); + } + cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row]; + if (cfg->txfm_type_row == TXFM_TYPE_ADST4) { + memcpy(cfg->stage_range_row, iadst4_range, sizeof(iadst4_range)); + } + cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col]; + cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row]; +} + +void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, + const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size, + int bd) { + const int fwd_shift = inv_start_range[tx_size]; + const int8_t *shift = cfg->shift; + int8_t opt_range_row, opt_range_col; + if (bd == 8) { + opt_range_row = 16; + opt_range_col = 16; + } else if (bd == 10) { + opt_range_row = 18; + opt_range_col = 16; + } else { + assert(bd == 12); + opt_range_row = 20; + opt_range_col = 18; + } + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { + int real_range_row = cfg->stage_range_row[i] + fwd_shift + bd + 1; + (void)real_range_row; + if (cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1) { + // the adst4 may use 1 extra bit on top of opt_range_row at stage 1 + // so opt_range_row >= real_range_row will not hold + stage_range_row[i] = opt_range_row; + } else { + assert(opt_range_row >= real_range_row); + stage_range_row[i] = 
opt_range_row; + } + } + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) { + int real_range_col = + cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1; + (void)real_range_col; + if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) { + // the adst4 may use 1 extra bit on top of opt_range_col at stage 1 + // so opt_range_col >= real_range_col will not hold + stage_range_col[i] = opt_range_col; + } else { + assert(opt_range_col >= real_range_col); + stage_range_col[i] = opt_range_col; + } + } +} + +static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output, + int stride, TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf, TX_SIZE tx_size, + int bd) { + // Note when assigning txfm_size_col, we use the txfm_size from the + // row configuration and vice versa. This is intentionally done to + // accurately perform rectangular transforms. When the transform is + // rectangular, the number of columns will be the same as the + // txfm_size stored in the row cfg struct. It will make no difference + // for square transforms. + const int txfm_size_col = tx_size_wide[cfg->tx_size]; + const int txfm_size_row = tx_size_high[cfg->tx_size]; + // Take the shift from the larger dimension in the rectangular case. + const int8_t *shift = cfg->shift; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); + assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); + av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, tx_size, bd); + + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row); + + // txfm_buf's length is txfm_size_row * txfm_size_col + 2 * + // AOMMAX(txfm_size_row, txfm_size_col) + // it is used for intermediate data buffering + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_in = txfm_buf; + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + int c, r; + + // Rows + for (r = 0; r < txfm_size_row; ++r) { + if (abs(rect_type) == 1) { + for (c = 0; c < txfm_size_col; ++c) { + temp_in[c] = round_shift((int64_t)input[c] * NewInvSqrt2, NewSqrt2Bits); + } + clamp_buf(temp_in, txfm_size_col, bd + 8); + txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row); + } else { + for (c = 0; c < txfm_size_col; ++c) { + temp_in[c] = input[c]; + } + clamp_buf(temp_in, txfm_size_col, bd + 8); + txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row); + } + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + // Columns + for (c = 0; c < txfm_size_col; ++c) { + if (cfg->lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, AOMMAX(bd + 6, 16)); + txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + if (cfg->ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r 
* stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +static INLINE void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output, + int stride, int32_t *txfm_buf, + TX_TYPE tx_type, TX_SIZE tx_size, + int bd) { + TXFM_2D_FLIP_CFG cfg; + av1_get_inv_txfm_cfg(tx_type, tx_size, &cfg); + // Forward shift sum uses larger square size, to be consistent with what + // av1_gen_inv_stage_range() does for inverse shifts. + inv_txfm2d_add_c(input, output, stride, &cfg, txfm_buf, tx_size, bd); +} + +void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X8, bd); +} + +void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X4, bd); +} + +void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X16, bd); +} + +void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[16 * 8 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X8, bd); +} + +void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[16 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X32, bd); +} + +void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[32 * 16 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X16, bd); +} + +void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 4 + 4]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X4, bd); +} + +void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 8 + 8 + 8]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X8, bd); +} + +void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[16 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X16, bd); +} + +void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X32, bd); +} + +void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // TODO(urvang): Can the same array be reused, instead of using a new array? + // Remap 32x32 input into a modified 64x64 by: + // - Copying over these values in top-left 32x32 locations. 
+ // - Setting the rest of the locations to 0. + int32_t mod_input[64 * 64]; + for (int row = 0; row < 32; ++row) { + memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); + memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + memset(mod_input + 32 * 64, 0, 32 * 64 * sizeof(*mod_input)); + DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X64, + bd); +} + +void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 32x32 input into a modified 64x32 by: + // - Copying over these values in top-left 32x32 locations. + // - Setting the rest of the locations to 0. + int32_t mod_input[64 * 32]; + for (int row = 0; row < 32; ++row) { + memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); + memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X32, + bd); +} + +void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 32x32 input into a modified 32x64 input by: + // - Copying over these values in top-left 32x32 locations. + // - Setting the rest of the locations to 0. + int32_t mod_input[32 * 64]; + memcpy(mod_input, input, 32 * 32 * sizeof(*mod_input)); + memset(mod_input + 32 * 32, 0, 32 * 32 * sizeof(*mod_input)); + DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_32X64, + bd); +} + +void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 16x32 input into a modified 16x64 input by: + // - Copying over these values in top-left 16x32 locations. + // - Setting the rest of the locations to 0. + int32_t mod_input[16 * 64]; + memcpy(mod_input, input, 16 * 32 * sizeof(*mod_input)); + memset(mod_input + 16 * 32, 0, 16 * 32 * sizeof(*mod_input)); + DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_16X64, + bd); +} + +void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 32x16 input into a modified 64x16 by: + // - Copying over these values in top-left 32x16 locations. + // - Setting the rest of the locations to 0. 
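As the comments above describe, each 64-point wrapper feeds the facade a zero-extended copy of the coded coefficients, since at most the top-left 32x32 block of a 64-wide or 64-tall transform carries nonzero values. A minimal standalone sketch of that remap step, assuming the same int32_t coefficient layout (the helper name is illustrative, not part of the patch):

#include <stdint.h>
#include <string.h>

// Copy `rows` coded rows of 32 coefficients into the left half of a
// 64-wide buffer and zero the right half of each row; fully-zero rows
// below the coded region (as in the 64x64 case) are cleared by a
// separate memset.
static void remap_32_wide_to_64_wide(const int32_t *input,
                                     int32_t *mod_input, int rows) {
  for (int r = 0; r < rows; ++r) {
    memcpy(mod_input + r * 64, input + r * 32, 32 * sizeof(*mod_input));
    memset(mod_input + r * 64 + 32, 0, 32 * sizeof(*mod_input));
  }
}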
+ int32_t mod_input[64 * 16]; + for (int row = 0; row < 16; ++row) { + memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); + memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X16, + bd); +} + +void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X16, bd); +} + +void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X4, bd); +} + +void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X32, bd); +} + +void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X8, bd); +} diff --git a/libs/libaom/src/av1/common/av1_loopfilter.c b/libs/libaom/src/av1/common/av1_loopfilter.c new file mode 100644 index 000000000..c756760de --- /dev/null +++ b/libs/libaom/src/av1/common/av1_loopfilter.c @@ -0,0 +1,790 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <math.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/reconinter.h" +#include "av1/common/seg_common.h" + +static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = { + { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H }, + { SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U }, + { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V } +}; + +static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { { 0, 1 }, + { 2, 2 }, + { 3, 3 } }; + +static const int mode_lf_lut[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES + 1, 1, 0, 1, // INTER_MODES (GLOBALMV == 0) + 1, 1, 1, 1, 1, 1, 0, 1 // INTER_COMPOUND_MODES (GLOBAL_GLOBALMV == 0) +}; + +static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { + int lvl; + + // For each possible value for the loop filter fill out limits + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) { + // Set loop filter parameters that control sharpness.
+ int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4)); + + if (sharpness_lvl > 0) { + if (block_inside_limit > (9 - sharpness_lvl)) + block_inside_limit = (9 - sharpness_lvl); + } + + if (block_inside_limit < 1) block_inside_limit = 1; + + memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH); + memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit), + SIMD_WIDTH); + } +} + +uint8_t av1_get_filter_level(const AV1_COMMON *cm, + const loop_filter_info_n *lfi_n, const int dir_idx, + int plane, const MB_MODE_INFO *mbmi) { + const int segment_id = mbmi->segment_id; + if (cm->delta_q_info.delta_lf_present_flag) { + int8_t delta_lf; + if (cm->delta_q_info.delta_lf_multi) { + const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx]; + delta_lf = mbmi->delta_lf[delta_lf_idx]; + } else { + delta_lf = mbmi->delta_lf_from_base; + } + int base_level; + if (plane == 0) + base_level = cm->lf.filter_level[dir_idx]; + else if (plane == 1) + base_level = cm->lf.filter_level_u; + else + base_level = cm->lf.filter_level_v; + int lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER); + assert(plane >= 0 && plane <= 2); + const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx]; + if (segfeature_active(&cm->seg, segment_id, seg_lf_feature_id)) { + const int data = get_segdata(&cm->seg, segment_id, seg_lf_feature_id); + lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER); + } + + if (cm->lf.mode_ref_delta_enabled) { + const int scale = 1 << (lvl_seg >> 5); + lvl_seg += cm->lf.ref_deltas[mbmi->ref_frame[0]] * scale; + if (mbmi->ref_frame[0] > INTRA_FRAME) + lvl_seg += cm->lf.mode_deltas[mode_lf_lut[mbmi->mode]] * scale; + lvl_seg = clamp(lvl_seg, 0, MAX_LOOP_FILTER); + } + return lvl_seg; + } else { + return lfi_n->lvl[plane][segment_id][dir_idx][mbmi->ref_frame[0]] + [mode_lf_lut[mbmi->mode]]; + } +} + +void av1_loop_filter_init(AV1_COMMON *cm) { + assert(MB_MODE_COUNT == NELEMENTS(mode_lf_lut)); + loop_filter_info_n *lfi = &cm->lf_info; + struct loopfilter *lf = &cm->lf; + int lvl; + + lf->combine_vert_horz_lf = 1; + + // init limits for given sharpness + update_sharpness(lfi, lf->sharpness_level); + + // init hev threshold const vectors + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) + memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH); +} + +// Update the loop filter for the current frame. +// This should be called before loop_filter_rows(); +// av1_loop_filter_frame() calls this function directly.
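For reference, the thresholds filled in by update_sharpness() (used both above and in av1_loop_filter_frame_init() below) reduce to three scalar formulas per filter level. A hedged sketch of the same derivation for a single level (the function name is illustrative, not part of the patch):

// Inner-edge limit for one level/sharpness pair, mirroring
// update_sharpness() above.
static int example_inside_limit(int lvl, int sharpness) {
  int limit = lvl >> ((sharpness > 0) + (sharpness > 4));
  if (sharpness > 0 && limit > 9 - sharpness) limit = 9 - sharpness;
  return limit < 1 ? 1 : limit;
}
// The remaining two thresholds follow directly:
//   mblim   = 2 * (lvl + 2) + inside_limit  (outer-edge limit)
//   hev_thr = lvl >> 4                      (high-edge-variance threshold)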
+void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start, + int plane_end) { + int filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE]; + int plane; + int seg_id; + // n_shift is the multiplier for lf_deltas: + // the multiplier is 1 when filter_lvl is between 0 and 31, + // and 2 when filter_lvl is between 32 and 63 + loop_filter_info_n *const lfi = &cm->lf_info; + struct loopfilter *const lf = &cm->lf; + const struct segmentation *const seg = &cm->seg; + + // update sharpness limits + update_sharpness(lfi, lf->sharpness_level); + + filt_lvl[0] = cm->lf.filter_level[0]; + filt_lvl[1] = cm->lf.filter_level_u; + filt_lvl[2] = cm->lf.filter_level_v; + + filt_lvl_r[0] = cm->lf.filter_level[1]; + filt_lvl_r[1] = cm->lf.filter_level_u; + filt_lvl_r[2] = cm->lf.filter_level_v; + + assert(plane_start >= AOM_PLANE_Y); + assert(plane_end <= MAX_MB_PLANE); + + for (plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0]) + break; + else if (plane == 1 && !filt_lvl[1]) + continue; + else if (plane == 2 && !filt_lvl[2]) + continue; + + for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { + for (int dir = 0; dir < 2; ++dir) { + int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane]; + const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir]; + if (segfeature_active(seg, seg_id, seg_lf_feature_id)) { + const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id); + lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER); + } + + if (!lf->mode_ref_delta_enabled) { + // we could get rid of this if we assume that deltas are set to + // zero when not in use; encoder always uses deltas + memset(lfi->lvl[plane][seg_id][dir], lvl_seg, + sizeof(lfi->lvl[plane][seg_id][dir])); + } else { + int ref, mode; + const int scale = 1 << (lvl_seg >> 5); + const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale; + lfi->lvl[plane][seg_id][dir][INTRA_FRAME][0] = + clamp(intra_lvl, 0, MAX_LOOP_FILTER); + + for (ref = LAST_FRAME; ref < REF_FRAMES; ++ref) { + for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { + const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale + + lf->mode_deltas[mode] * scale; + lfi->lvl[plane][seg_id][dir][ref][mode] = + clamp(inter_lvl, 0, MAX_LOOP_FILTER); + } + } + } + } + } + } +} + +static TX_SIZE get_transform_size(const MACROBLOCKD *const xd, + const MB_MODE_INFO *const mbmi, + const EDGE_DIR edge_dir, const int mi_row, + const int mi_col, const int plane, + const struct macroblockd_plane *plane_ptr) { + assert(mbmi != NULL); + if (xd && xd->lossless[mbmi->segment_id]) return TX_4X4; + + TX_SIZE tx_size = + (plane == AOM_PLANE_Y) + ? mbmi->tx_size + : av1_get_max_uv_txsize(mbmi->sb_type, plane_ptr->subsampling_x, + plane_ptr->subsampling_y); + assert(tx_size < TX_SIZES_ALL); + if ((plane == AOM_PLANE_Y) && is_inter_block(mbmi) && !mbmi->skip) { + const BLOCK_SIZE sb_type = mbmi->sb_type; + const int blk_row = mi_row & (mi_size_high[sb_type] - 1); + const int blk_col = mi_col & (mi_size_wide[sb_type] - 1); + const TX_SIZE mb_tx_size = + mbmi->inter_tx_size[av1_get_txb_size_index(sb_type, blk_row, blk_col)]; + assert(mb_tx_size < TX_SIZES_ALL); + tx_size = mb_tx_size; + } + + // For chroma planes and non-square transforms, the transform size must be + // converted into the transform size in a particular direction: + // for a vertical edge the filter direction is horizontal; for a + // horizontal edge the filter direction is vertical. + tx_size = (VERT_EDGE == edge_dir) ?
txsize_horz_map[tx_size] + : txsize_vert_map[tx_size]; + return tx_size; +} + +typedef struct AV1_DEBLOCKING_PARAMETERS { + // length of the filter applied to the outer edge + uint32_t filter_length; + // deblocking limits + const uint8_t *lim; + const uint8_t *mblim; + const uint8_t *hev_thr; +} AV1_DEBLOCKING_PARAMETERS; + +// Return TX_SIZE from get_transform_size(), so it is plane and direction +// aware +static TX_SIZE set_lpf_parameters( + AV1_DEBLOCKING_PARAMETERS *const params, const ptrdiff_t mode_step, + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const EDGE_DIR edge_dir, const uint32_t x, const uint32_t y, + const int plane, const struct macroblockd_plane *const plane_ptr) { + // reset to initial values + params->filter_length = 0; + + // no deblocking is required + const uint32_t width = plane_ptr->dst.width; + const uint32_t height = plane_ptr->dst.height; + if ((width <= x) || (height <= y)) { + // just return the smallest transform unit size + return TX_4X4; + } + + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + // for a sub8x8 block, the chroma prediction mode is obtained from the + // bottom/right mi structure of the co-located 8x8 luma block. So for the + // chroma plane, mi_row and mi_col should map to the bottom/right mi + // structure, i.e., both mi_row and mi_col should be odd numbers. + const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2); + const int mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2); + MB_MODE_INFO **mi = + cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col; + const MB_MODE_INFO *mbmi = mi[0]; + // If the current mbmi is not correctly set up, return an invalid value to + // stop filtering. One example is that if this tile is not coded, then its + // mbmi is not set up. + if (mbmi == NULL) return TX_INVALID; + + const TX_SIZE ts = + get_transform_size(xd, mi[0], edge_dir, mi_row, mi_col, plane, plane_ptr); + + { + const uint32_t coord = (VERT_EDGE == edge_dir) ? (x) : (y); + const uint32_t transform_masks = + edge_dir == VERT_EDGE ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1; + const int32_t tu_edge = (coord & transform_masks) ? (0) : (1); + + if (!tu_edge) return ts; + + // prepare outer edge parameters. deblock the edge if it's an edge of a TU + { + const uint32_t curr_level = + av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); + const int curr_skipped = mbmi->skip && is_inter_block(mbmi); + uint32_t level = curr_level; + if (coord) { + { + const MB_MODE_INFO *const mi_prev = *(mi - mode_step); + if (mi_prev == NULL) return TX_INVALID; + const int pv_row = + (VERT_EDGE == edge_dir) ? (mi_row) : (mi_row - (1 << scale_vert)); + const int pv_col = + (VERT_EDGE == edge_dir) ? (mi_col - (1 << scale_horz)) : (mi_col); + const TX_SIZE pv_ts = get_transform_size( + xd, mi_prev, edge_dir, pv_row, pv_col, plane, plane_ptr); + + const uint32_t pv_lvl = + av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); + + const int pv_skip = mi_prev->skip && is_inter_block(mi_prev); + const BLOCK_SIZE bsize = + get_plane_block_size(mbmi->sb_type, plane_ptr->subsampling_x, + plane_ptr->subsampling_y); + assert(bsize < BLOCK_SIZES_ALL); + const int prediction_masks = edge_dir == VERT_EDGE + ? block_size_wide[bsize] - 1 + : block_size_high[bsize] - 1; + const int32_t pu_edge = !(coord & prediction_masks); + // if both the current and the previous blocks are skipped, + // deblock the edge only if it is also a PU edge.
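// For orientation, the branch below reduces to this tap-length table (a
// restatement of the code that follows, not a change to it):
//   min(ts, pv_ts) == TX_4X4:          filter_length = 4
//   min(ts, pv_ts) == TX_8X8, luma:    filter_length = 8
//   min(ts, pv_ts) == TX_8X8, chroma:  filter_length = 6
//   anything larger, luma:             filter_length = 14
//   anything larger, chroma:           filter_length = 6 (wide filtering
//                                      is luma-only)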
+ if ((curr_level || pv_lvl) && + (!pv_skip || !curr_skipped || pu_edge)) { + const TX_SIZE min_ts = AOMMIN(ts, pv_ts); + if (TX_4X4 >= min_ts) { + params->filter_length = 4; + } else if (TX_8X8 == min_ts) { + if (plane != 0) + params->filter_length = 6; + else + params->filter_length = 8; + } else { + params->filter_length = 14; + // No wide filtering for chroma plane + if (plane != 0) { + params->filter_length = 6; + } + } + + // update the level if the current block is skipped, + // but the previous one is not + level = (curr_level) ? (curr_level) : (pv_lvl); + } + } + } + // prepare common parameters + if (params->filter_length) { + const loop_filter_thresh *const limits = cm->lf_info.lfthr + level; + params->lim = limits->lim; + params->mblim = limits->mblim; + params->hev_thr = limits->hev_thr; + } + } + } + + return ts; +} + +void av1_filter_block_plane_vert(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int y_range = (MAX_MIB_SIZE >> scale_vert); + const int x_range = (MAX_MIB_SIZE >> scale_horz); + for (int y = 0; y < y_range; y++) { + uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; + for (int x = 0; x < x_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will filter the vertical edge aligned with a 8x8 block. + // If 4x4 transform is used, it will then filter the internal edge + // aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(&params, 0, sizeof(params)); + + tx_size = + set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd, + VERT_EDGE, curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + +#if CONFIG_AV1_HIGHBITDEPTH + const int use_highbitdepth = cm->seq_params.use_highbitdepth; + const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; + switch (params.filter_length) { + // apply 4-tap filtering + case 4: + if (use_highbitdepth) + aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, params.hev_thr, + bit_depth); + else + aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + assert(plane != 0); + if (use_highbitdepth) + aom_highbd_lpf_vertical_6(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, params.hev_thr, + bit_depth); + else + aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 8-tap filtering + case 8: + if (use_highbitdepth) + aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, params.hev_thr, + bit_depth); + else + aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 14-tap filtering + case 14: + if (use_highbitdepth) + aom_highbd_lpf_vertical_14(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, params.hev_thr, + bit_depth); + else + aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break;
+ // no filtering + default: break; + } +#else + switch (params.filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + assert(plane != 0); + aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // no filtering + default: break; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + // advance the destination pointer + advance_units = tx_size_wide_unit[tx_size]; + x += advance_units; + p += advance_units * MI_SIZE; + } + } +} + +void av1_filter_block_plane_horz(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int y_range = (MAX_MIB_SIZE >> scale_vert); + const int x_range = (MAX_MIB_SIZE >> scale_horz); + for (int x = 0; x < x_range; x++) { + uint8_t *p = dst_ptr + x * MI_SIZE; + for (int y = 0; y < y_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will first filter the vertical edge aligned with a 8x8 + // block. If 4x4 transform is used, it will then filter the internal + // edge aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(&params, 0, sizeof(params)); + + tx_size = set_lpf_parameters( + &params, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE, + curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + +#if CONFIG_AV1_HIGHBITDEPTH + const int use_highbitdepth = cm->seq_params.use_highbitdepth; + const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; + switch (params.filter_length) { + // apply 4-tap filtering + case 4: + if (use_highbitdepth) + aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, + params.hev_thr, bit_depth); + else + aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 6-tap filtering + case 6: + assert(plane != 0); + if (use_highbitdepth) + aom_highbd_lpf_horizontal_6(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, + params.hev_thr, bit_depth); + else + aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 8-tap filtering + case 8: + if (use_highbitdepth) + aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, + params.hev_thr, bit_depth); + else + aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 14-tap filtering + case 14: + if (use_highbitdepth) + aom_highbd_lpf_horizontal_14(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, + params.hev_thr, bit_depth); + else + aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim, +
params.hev_thr); + break; + // no filtering + default: break; + } +#else + switch (params.filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 6-tap filtering + case 6: + assert(plane != 0); + aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // no filtering + default: break; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + + // advance the destination pointer + advance_units = tx_size_high_unit[tx_size]; + y += advance_units; + p += advance_units * dst_stride * MI_SIZE; + } + } +} + +void av1_filter_block_plane_vert_test(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, + const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int y_range = cm->mi_params.mi_rows >> scale_vert; + const int x_range = cm->mi_params.mi_cols >> scale_horz; + for (int y = 0; y < y_range; y++) { + uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; + for (int x = 0; x < x_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will filter the vertical edge aligned with a 8x8 block. + // If 4x4 transform is used, it will then filter the internal edge + // aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(&params, 0, sizeof(params)); + + tx_size = + set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd, + VERT_EDGE, curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + + // advance the destination pointer + advance_units = tx_size_wide_unit[tx_size]; + x += advance_units; + p += advance_units * MI_SIZE; + } + } +} + +void av1_filter_block_plane_horz_test(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, + const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int y_range = cm->mi_params.mi_rows >> scale_vert; + const int x_range = cm->mi_params.mi_cols >> scale_horz; + for (int x = 0; x < x_range; x++) { + uint8_t *p = dst_ptr + x * MI_SIZE; + for (int y = 0; y < y_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will first filter the vertical edge aligned with a 8x8 + // block.
If 4x4 transform is used, it will then filter the internal + // edge aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(&params, 0, sizeof(params)); + + tx_size = set_lpf_parameters( + &params, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE, + curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + + // advance the destination pointer + advance_units = tx_size_high_unit[tx_size]; + y += advance_units; + p += advance_units * dst_stride * MI_SIZE; + } + } +} + +static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, + MACROBLOCKD *xd, int start, int stop, +#if CONFIG_LPF_MASK + int is_decoding, +#endif + int plane_start, int plane_end) { + struct macroblockd_plane *pd = xd->plane; + const int col_start = 0; + const int col_end = cm->mi_params.mi_cols; + int mi_row, mi_col; + int plane; + +#if CONFIG_LPF_MASK + if (is_decoding) { + cm->is_decoding = is_decoding; + for (plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) + break; + else if (plane == 1 && !(cm->lf.filter_level_u)) + continue; + else if (plane == 2 && !(cm->lf.filter_level_v)) + continue; + + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0, + plane, plane + 1); + + av1_build_bitmask_vert_info(cm, &pd[plane], plane); + av1_build_bitmask_horz_info(cm, &pd[plane], plane); + + // apply loop filtering which only goes through buffer once + for (mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) { + for (mi_col = col_start; mi_col < col_end; mi_col += MI_SIZE_64X64) { + av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row, mi_col, + plane, plane + 1); + av1_filter_block_plane_bitmask_vert(cm, &pd[plane], plane, mi_row, + mi_col); + if (mi_col - MI_SIZE_64X64 >= 0) { + av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row, + mi_col - MI_SIZE_64X64, plane, plane + 1); + av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row, + mi_col - MI_SIZE_64X64); + } + } + av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row, + mi_col - MI_SIZE_64X64, plane, plane + 1); + av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row, + mi_col - MI_SIZE_64X64); + } + } + return; + } +#endif + + for (plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) + break; + else if (plane == 1 && !(cm->lf.filter_level_u)) + continue; + else if (plane == 2 && !(cm->lf.filter_level_v)) + continue; + + if (cm->lf.combine_vert_horz_lf) { + // filter all vertical and horizontal edges in every 128x128 super block + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { + for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { + // filter vertical edges + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, + mi_col, plane, plane + 1); + av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row, + mi_col); + // filter horizontal edges + if (mi_col - MAX_MIB_SIZE >= 0) { + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, + mi_row, mi_col - MAX_MIB_SIZE, plane, + plane + 1); + av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, + mi_col - MAX_MIB_SIZE); + } + } + // filter horizontal edges +
av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, + mi_col - MAX_MIB_SIZE, plane, plane + 1); + av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, + mi_col - MAX_MIB_SIZE); + } + } else { + // filter all vertical edges in every 128x128 super block + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { + for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, + mi_col, plane, plane + 1); + av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row, + mi_col); + } + } + + // filter all horizontal edges in every 128x128 super block + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { + for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, + mi_col, plane, plane + 1); + av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, + mi_col); + } + } + } + } +} + +void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, + MACROBLOCKD *xd, +#if CONFIG_LPF_MASK + int is_decoding, +#endif + int plane_start, int plane_end, int partial_frame) { + int start_mi_row, end_mi_row, mi_rows_to_filter; + + start_mi_row = 0; + mi_rows_to_filter = cm->mi_params.mi_rows; + if (partial_frame && cm->mi_params.mi_rows > 8) { + start_mi_row = cm->mi_params.mi_rows >> 1; + start_mi_row &= 0xfffffff8; + mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + av1_loop_filter_frame_init(cm, plane_start, plane_end); + loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, +#if CONFIG_LPF_MASK + is_decoding, +#endif + plane_start, plane_end); +} diff --git a/libs/libaom/src/av1/common/av1_loopfilter.h b/libs/libaom/src/av1/common/av1_loopfilter.h new file mode 100644 index 000000000..ce26d1647 --- /dev/null +++ b/libs/libaom/src/av1/common/av1_loopfilter.h @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_AV1_LOOPFILTER_H_ +#define AOM_AV1_COMMON_AV1_LOOPFILTER_H_ + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "av1/common/blockd.h" +#include "av1/common/seg_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_LOOP_FILTER 63 +#define MAX_SHARPNESS 7 + +#define SIMD_WIDTH 16 + +enum lf_path { + LF_PATH_420, + LF_PATH_444, + LF_PATH_SLOW, +}; + +enum { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } UENUM1BYTE(EDGE_DIR); +typedef struct { + uint64_t bits[4]; +} FilterMask; + +#if CONFIG_LPF_MASK +// This structure holds bit masks for all 4x4 blocks in a 64x64 region. +// Each 1 bit represents a position in which we want to apply the loop filter. +// For Y plane, 4x4 in 64x64 requires 16x16 = 256 bit, therefore we use 4 +// uint64_t; For U, V plane, for 420 format, plane size is 32x32, thus we use +// a uint64_t to represent bitmask. +// Left_ entries refer to whether we apply a filter on the border to the +// left of the block. 
Above_ entries refer to whether or not to apply a +// filter on the above border. +// Since each transform is accompanied by a potentially different type of +// loop filter there is a different entry in the array for each transform size. +typedef struct { + FilterMask left_y[TX_SIZES]; + FilterMask above_y[TX_SIZES]; + FilterMask left_u[TX_SIZES]; + FilterMask above_u[TX_SIZES]; + FilterMask left_v[TX_SIZES]; + FilterMask above_v[TX_SIZES]; + + // Y plane vertical edge and horizontal edge filter level + uint8_t lfl_y_hor[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64]; + + // U plane filter level + uint8_t lfl_u_ver[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_u_hor[MI_SIZE_64X64][MI_SIZE_64X64]; + + // V plane filter level + uint8_t lfl_v_ver[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_v_hor[MI_SIZE_64X64][MI_SIZE_64X64]; + + // other info + FilterMask skip; + FilterMask is_vert_border; + FilterMask is_horz_border; + // Y or UV planes, 5 tx sizes: 4x4, 8x8, 16x16, 32x32, 64x64 + FilterMask tx_size_ver[2][5]; + FilterMask tx_size_hor[2][5]; +} LoopFilterMask; +#endif // CONFIG_LPF_MASK + +struct loopfilter { + int filter_level[2]; + int filter_level_u; + int filter_level_v; + + int sharpness_level; + + uint8_t mode_ref_delta_enabled; + uint8_t mode_ref_delta_update; + + // 0 = Intra, Last, Last2+Last3, + // GF, BRF, ARF2, ARF + int8_t ref_deltas[REF_FRAMES]; + + // 0 = ZERO_MV, MV + int8_t mode_deltas[MAX_MODE_LF_DELTAS]; + + int combine_vert_horz_lf; + +#if CONFIG_LPF_MASK + LoopFilterMask *lfm; + size_t lfm_num; + int lfm_stride; +#endif // CONFIG_LPF_MASK +}; + +// Need to align this structure so when it is declared and +// passed it can be loaded into vector registers. +typedef struct { + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]); +} loop_filter_thresh; + +typedef struct { + loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1]; + uint8_t lvl[MAX_MB_PLANE][MAX_SEGMENTS][2][REF_FRAMES][MAX_MODE_LF_DELTAS]; +} loop_filter_info_n; + +/* assorted loopfilter functions which get used elsewhere */ +struct AV1Common; +struct macroblockd; +struct AV1LfSyncData; + +void av1_loop_filter_init(struct AV1Common *cm); + +void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start, + int plane_end); + +#if CONFIG_LPF_MASK +void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, + struct macroblockd *xd, int is_decoding, + int plane_start, int plane_end, int partial_frame); +#else +void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, + struct macroblockd *xd, int plane_start, + int plane_end, int partial_frame); +#endif + +void av1_filter_block_plane_vert(const struct AV1Common *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col); + +void av1_filter_block_plane_horz(const struct AV1Common *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col); + +typedef struct LoopFilterWorkerData { + YV12_BUFFER_CONFIG *frame_buffer; + struct AV1Common *cm; + struct macroblockd_plane planes[MAX_MB_PLANE]; + // TODO(Ranjit): When the filter functions are modified to use xd->lossless + // add lossless as a member here. 
+ MACROBLOCKD *xd; +} LFWorkerData; + +uint8_t av1_get_filter_level(const struct AV1Common *cm, + const loop_filter_info_n *lfi_n, const int dir_idx, + int plane, const MB_MODE_INFO *mbmi); +#if CONFIG_LPF_MASK +void av1_filter_block_plane_ver(struct AV1Common *const cm, + struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col); + +void av1_filter_block_plane_hor(struct AV1Common *const cm, + struct macroblockd_plane *const plane, int pl, + int mi_row, int mi_col); + +int get_index_shift(int mi_col, int mi_row, int *index); + +void av1_build_bitmask_vert_info( + struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr, + int plane); + +void av1_build_bitmask_horz_info( + struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr, + int plane); + +void av1_filter_block_plane_bitmask_vert( + struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col); + +void av1_filter_block_plane_bitmask_horz( + struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col); + +void av1_store_bitmask_univariant_tx(struct AV1Common *cm, int mi_row, + int mi_col, BLOCK_SIZE bsize, + MB_MODE_INFO *mbmi); + +void av1_store_bitmask_other_info(struct AV1Common *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, + int is_horz_coding_block_border, + int is_vert_coding_block_border); + +void av1_store_bitmask_vartx(struct AV1Common *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, TX_SIZE tx_size, + MB_MODE_INFO *mbmi); +#endif // CONFIG_LPF_MASK + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_AV1_LOOPFILTER_H_ diff --git a/libs/libaom/src/av1/common/av1_rtcd.c b/libs/libaom/src/av1/common/av1_rtcd.c new file mode 100644 index 000000000..a77a4d254 --- /dev/null +++ b/libs/libaom/src/av1/common/av1_rtcd.c @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "config/aom_config.h" + +#define RTCD_C +#include "config/av1_rtcd.h" + +#include "aom_ports/aom_once.h" + +void av1_rtcd() { + // TODO(JBB): Remove this aom_once, by ensuring that both the encoder and + // decoder setup functions are protected by aom_once(); + aom_once(setup_rtcd_internal); +} diff --git a/libs/libaom/src/av1/common/av1_rtcd_defs.pl b/libs/libaom/src/av1/common/av1_rtcd_defs.pl new file mode 100644 index 000000000..296c6c572 --- /dev/null +++ b/libs/libaom/src/av1/common/av1_rtcd_defs.pl @@ -0,0 +1,496 @@ +## +## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+## +sub av1_common_forward_decls() { +print <<EOF [...] +void av1_round_shift_array_c(int32_t *arr, int size, int bit) { + int i; + if (bit == 0) { + return; + } else { + if (bit > 0) { + for (i = 0; i < size; i++) { + arr[i] = round_shift(arr[i], bit); + } + } else { + for (i = 0; i < size; i++) { + arr[i] = (int32_t)clamp64(((int64_t)1 << (-bit)) * arr[i], INT32_MIN, + INT32_MAX); + } + } + } +} + +const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D] = { + { TXFM_TYPE_DCT4, TXFM_TYPE_ADST4, TXFM_TYPE_ADST4, TXFM_TYPE_IDENTITY4 }, + { TXFM_TYPE_DCT8, TXFM_TYPE_ADST8, TXFM_TYPE_ADST8, TXFM_TYPE_IDENTITY8 }, + { TXFM_TYPE_DCT16, TXFM_TYPE_ADST16, TXFM_TYPE_ADST16, TXFM_TYPE_IDENTITY16 }, + { TXFM_TYPE_DCT32, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, + TXFM_TYPE_IDENTITY32 }, + { TXFM_TYPE_DCT64, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID } +}; + +const int8_t av1_txfm_stage_num_list[TXFM_TYPES] = { + 4, // TXFM_TYPE_DCT4 + 6, // TXFM_TYPE_DCT8 + 8, // TXFM_TYPE_DCT16 + 10, // TXFM_TYPE_DCT32 + 12, // TXFM_TYPE_DCT64 + 7, // TXFM_TYPE_ADST4 + 8, // TXFM_TYPE_ADST8 + 10, // TXFM_TYPE_ADST16 + 1, // TXFM_TYPE_IDENTITY4 + 1, // TXFM_TYPE_IDENTITY8 + 1, // TXFM_TYPE_IDENTITY16 + 1, // TXFM_TYPE_IDENTITY32 +}; + +void av1_range_check_buf(int32_t stage, const int32_t *input, + const int32_t *buf, int32_t size, int8_t bit) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); + + int in_range = 1; + + for (int i = 0; i < size; ++i) { + if (buf[i] < min_value || buf[i] > max_value) { + in_range = 0; + } + } + + if (!in_range) { + fprintf(stderr, "Error: coeffs contain out-of-range values\n"); + fprintf(stderr, "size: %d\n", size); + fprintf(stderr, "stage: %d\n", stage); + fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value, + max_value); + + fprintf(stderr, "coeffs: "); + + fprintf(stderr, "["); + for (int j = 0; j < size; j++) { + if (j > 0) fprintf(stderr, ", "); + fprintf(stderr, "%d", input[j]); + } + fprintf(stderr, "]\n"); + + fprintf(stderr, " buf: "); + + fprintf(stderr, "["); + for (int j = 0; j < size; j++) { + if (j > 0) fprintf(stderr, ", "); + fprintf(stderr, "%d", buf[j]); + } + fprintf(stderr, "]\n\n"); + } + + assert(in_range); +#else + (void)stage; + (void)input; + (void)buf; + (void)size; + (void)bit; +#endif +} diff --git a/libs/libaom/src/av1/common/av1_txfm.h b/libs/libaom/src/av1/common/av1_txfm.h new file mode 100644 index 000000000..20049b680 --- /dev/null +++ b/libs/libaom/src/av1/common/av1_txfm.h @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AV1_COMMON_AV1_TXFM_H_ +#define AOM_AV1_COMMON_AV1_TXFM_H_ + +#include <assert.h> +#include <math.h> +#include <stdio.h> + +#include "config/aom_config.h" + +#include "av1/common/enums.h" +#include "av1/common/blockd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if !defined(DO_RANGE_CHECK_CLAMP) +#define DO_RANGE_CHECK_CLAMP 0 +#endif + +extern const int32_t av1_cospi_arr_data[7][64]; +extern const int32_t av1_sinpi_arr_data[7][5]; + +#define MAX_TXFM_STAGE_NUM 12 + +static const int cos_bit_min = 10; +static const int cos_bit_max = 16; + +#define NewSqrt2Bits ((int32_t)12) +// 2^12 * sqrt(2) +static const int32_t NewSqrt2 = 5793; +// 2^12 / sqrt(2) +static const int32_t NewInvSqrt2 = 2896; + +static INLINE const int32_t *cospi_arr(int n) { + return av1_cospi_arr_data[n - cos_bit_min]; +} + +static INLINE const int32_t *sinpi_arr(int n) { + return av1_sinpi_arr_data[n - cos_bit_min]; +} + +static INLINE int32_t range_check_value(int32_t value, int8_t bit) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); + if (value < min_value || value > max_value) { + fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit); +#if !CONFIG_AV1_ENCODER + assert(0); +#endif + } +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING +#if DO_RANGE_CHECK_CLAMP + bit = AOMMIN(bit, 31); + return clamp(value, -(1 << (bit - 1)), (1 << (bit - 1)) - 1); +#endif // DO_RANGE_CHECK_CLAMP + (void)bit; + return value; +} + +static INLINE int32_t round_shift(int64_t value, int bit) { + assert(bit >= 1); + return (int32_t)((value + (1ll << (bit - 1))) >> bit); +} + +static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1, + int bit) { + int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1); + int64_t intermediate = result_64 + (1LL << (bit - 1)); + // NOTE(david.barker): The value 'result_64' may not necessarily fit + // into 32 bits. However, the result of this function is nominally + // ROUND_POWER_OF_TWO_64(result_64, bit) + // and that is required to fit into stage_range[stage] many bits + // (checked by range_check_buf()). + // + // Here we've unpacked that rounding operation, and it can be shown + // that the value of 'intermediate' here *does* fit into 32 bits + // for any conformant bitstream. + // The upshot is that, if you do all this calculation using + // wrapping 32-bit arithmetic instead of (non-wrapping) 64-bit arithmetic, + // then you'll still get the correct result. + // To provide a check on this logic, we assert that 'intermediate' + // would fit into an int32 if range checking is enabled.
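// Worked example (illustrative, not part of the patch): with 12-bit
// weights, e.g. w0 == w1 == 2896 ~= 2^12 / sqrt(2),
//   half_btf(2896, 100, 2896, 50, 12)
//     == (2896 * 100 + 2896 * 50 + (1 << 11)) >> 12
//     == (434400 + 2048) >> 12 == 106,
// comfortably inside int32 range, as the assertion below checks in
// debug builds.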
+#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(intermediate >= INT32_MIN && intermediate <= INT32_MAX); +#endif + return (int32_t)(intermediate >> bit); +} + +static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, + int bd) { + return clip_pixel_highbd(dest + (int)trans, bd); +} + +typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); + +typedef void (*FwdTxfm2dFunc)(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd); + +enum { + TXFM_TYPE_DCT4, + TXFM_TYPE_DCT8, + TXFM_TYPE_DCT16, + TXFM_TYPE_DCT32, + TXFM_TYPE_DCT64, + TXFM_TYPE_ADST4, + TXFM_TYPE_ADST8, + TXFM_TYPE_ADST16, + TXFM_TYPE_IDENTITY4, + TXFM_TYPE_IDENTITY8, + TXFM_TYPE_IDENTITY16, + TXFM_TYPE_IDENTITY32, + TXFM_TYPES, + TXFM_TYPE_INVALID, +} UENUM1BYTE(TXFM_TYPE); + +typedef struct TXFM_2D_FLIP_CFG { + TX_SIZE tx_size; + int ud_flip; // flip upside down + int lr_flip; // flip left to right + const int8_t *shift; + int8_t cos_bit_col; + int8_t cos_bit_row; + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + TXFM_TYPE txfm_type_col; + TXFM_TYPE txfm_type_row; + int stage_num_col; + int stage_num_row; +} TXFM_2D_FLIP_CFG; + +static INLINE void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + *ud_flip = 0; + *lr_flip = 0; + break; + case IDTX: + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + *ud_flip = 0; + *lr_flip = 0; + break; + case FLIPADST_DCT: + case FLIPADST_ADST: + case V_FLIPADST: + *ud_flip = 1; + *lr_flip = 0; + break; + case DCT_FLIPADST: + case ADST_FLIPADST: + case H_FLIPADST: + *ud_flip = 0; + *lr_flip = 1; + break; + case FLIPADST_FLIPADST: + *ud_flip = 1; + *lr_flip = 1; + break; + default: + *ud_flip = 0; + *lr_flip = 0; + assert(0); + } +} + +static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) { + get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip); +} + +// Utility function that returns the log of the ratio of the col and row +// sizes. 
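// Hedged usage note (the values follow from the definition just below):
//   get_rect_tx_log_ratio(32, 8)  == 2    // log2(32 / 8)
//   get_rect_tx_log_ratio(16, 32) == -1   // log2(16 / 32)
// inv_txfm2d_add_c() above applies the NewInvSqrt2 rescale to its row
// input only when the absolute value of this ratio is 1.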
+static INLINE int get_rect_tx_log_ratio(int col, int row) { + if (col == row) return 0; + if (col > row) { + if (col == row * 2) return 1; + if (col == row * 4) return 2; + assert(0 && "Unsupported transform size"); + } else { + if (row == col * 2) return -1; + if (row == col * 4) return -2; + assert(0 && "Unsupported transform size"); + } + return 0; // Invalid +} + +void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, + const TXFM_2D_FLIP_CFG *cfg, int bd); + +void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, + const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size, + int bd); + +void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg); +void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg); +extern const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D]; +extern const int8_t av1_txfm_stage_num_list[TXFM_TYPES]; +static INLINE int get_txw_idx(TX_SIZE tx_size) { + return tx_size_wide_log2[tx_size] - tx_size_wide_log2[0]; +} +static INLINE int get_txh_idx(TX_SIZE tx_size) { + return tx_size_high_log2[tx_size] - tx_size_high_log2[0]; +} + +void av1_range_check_buf(int32_t stage, const int32_t *input, + const int32_t *buf, int32_t size, int8_t bit); +#define MAX_TXWH_IDX 5 +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // AOM_AV1_COMMON_AV1_TXFM_H_ diff --git a/libs/libaom/src/av1/common/blockd.c b/libs/libaom/src/av1/common/blockd.c new file mode 100644 index 000000000..00725ea2d --- /dev/null +++ b/libs/libaom/src/av1/common/blockd.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <math.h> + +#include "aom_ports/system_state.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" + +PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) { + if (!left_mi) return DC_PRED; + assert(!is_inter_block(left_mi) || is_intrabc_block(left_mi)); + return left_mi->mode; +} + +PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) { + if (!above_mi) return DC_PRED; + assert(!is_inter_block(above_mi) || is_intrabc_block(above_mi)); + return above_mi->mode; +} + +void av1_set_entropy_contexts(const MACROBLOCKD *xd, + struct macroblockd_plane *pd, int plane, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int has_eob, int aoff, int loff) { + ENTROPY_CONTEXT *const a = pd->above_entropy_context + aoff; + ENTROPY_CONTEXT *const l = pd->left_entropy_context + loff; + const int txs_wide = tx_size_wide_unit[tx_size]; + const int txs_high = tx_size_high_unit[tx_size]; + + // above + if (has_eob && xd->mb_to_right_edge < 0) { + const int blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int above_contexts = AOMMIN(txs_wide, blocks_wide - aoff); + memset(a, has_eob, sizeof(*a) * above_contexts); + memset(a + above_contexts, 0, sizeof(*a) * (txs_wide - above_contexts)); + } else { + memset(a, has_eob, sizeof(*a) * txs_wide); + } + + // left + if (has_eob && xd->mb_to_bottom_edge < 0) { + const int blocks_high = max_block_high(xd, plane_bsize, plane); + const int left_contexts = AOMMIN(txs_high, blocks_high - loff); + memset(l, has_eob, sizeof(*l) * left_contexts); + memset(l + left_contexts, 0, sizeof(*l) * (txs_high - left_contexts)); + } else { + memset(l, has_eob, sizeof(*l) * txs_high); + } +} +void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize, + const int num_planes) { + assert(bsize < BLOCK_SIZES_ALL); + const int nplanes = 1 + (num_planes - 1) * xd->is_chroma_ref; + for (int i = 0; i < nplanes; i++) { + struct macroblockd_plane *const pd = &xd->plane[i]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int txs_wide = mi_size_wide[plane_bsize]; + const int txs_high = mi_size_high[plane_bsize]; + memset(pd->above_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide); + memset(pd->left_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high); + } +} + +void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes) { + xd->delta_lf_from_base = 0; + const int frame_lf_count = + num_planes > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) xd->delta_lf[lf_id] = 0; +} + +void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes) { + for (int p = 0; p < num_planes; ++p) { + set_default_wiener(xd->wiener_info + p); + set_default_sgrproj(xd->sgrproj_info + p); + } +} + +void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, + const int num_planes) { + int i; + + for (i = 0; i < num_planes; i++) { + xd->plane[i].plane_type = get_plane_type(i); + xd->plane[i].subsampling_x = i ? ss_x : 0; + xd->plane[i].subsampling_y = i ? ss_y : 0; + } + for (i = num_planes; i < MAX_MB_PLANE; i++) { + xd->plane[i].subsampling_x = 1; + xd->plane[i].subsampling_y = 1; + } +} diff --git a/libs/libaom/src/av1/common/blockd.h b/libs/libaom/src/av1/common/blockd.h new file mode 100644 index 000000000..47597bc83 --- /dev/null +++ b/libs/libaom/src/av1/common/blockd.h @@ -0,0 +1,1296 @@ +/* + * Copyright (c) 2016, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_BLOCKD_H_ +#define AOM_AV1_COMMON_BLOCKD_H_ + +#include "config/aom_config.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" +#include "aom_scale/yv12config.h" + +#include "av1/common/common_data.h" +#include "av1/common/quant_common.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/mv.h" +#include "av1/common/scale.h" +#include "av1/common/seg_common.h" +#include "av1/common/tile_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define USE_B_QUANT_NO_TRELLIS 1 + +#define MAX_MB_PLANE 3 + +#define MAX_DIFFWTD_MASK_BITS 1 + +#define INTERINTRA_WEDGE_SIGN 0 + +// DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS +enum { + DIFFWTD_38 = 0, + DIFFWTD_38_INV, + DIFFWTD_MASK_TYPES, +} UENUM1BYTE(DIFFWTD_MASK_TYPE); + +enum { + KEY_FRAME = 0, + INTER_FRAME = 1, + INTRA_ONLY_FRAME = 2, // replaces intra-only + S_FRAME = 3, + FRAME_TYPES, +} UENUM1BYTE(FRAME_TYPE); + +static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) { + return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; +} + +static INLINE int is_inter_mode(PREDICTION_MODE mode) { + return mode >= INTER_MODE_START && mode < INTER_MODE_END; +} + +typedef struct { + uint8_t *plane[MAX_MB_PLANE]; + int stride[MAX_MB_PLANE]; +} BUFFER_SET; + +static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) { + return mode >= SINGLE_INTER_MODE_START && mode < SINGLE_INTER_MODE_END; +} +static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) { + return mode >= COMP_INTER_MODE_START && mode < COMP_INTER_MODE_END; +} + +static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) { + static const PREDICTION_MODE lut[] = { + DC_PRED, // DC_PRED + V_PRED, // V_PRED + H_PRED, // H_PRED + D45_PRED, // D45_PRED + D135_PRED, // D135_PRED + D113_PRED, // D113_PRED + D157_PRED, // D157_PRED + D203_PRED, // D203_PRED + D67_PRED, // D67_PRED + SMOOTH_PRED, // SMOOTH_PRED + SMOOTH_V_PRED, // SMOOTH_V_PRED + SMOOTH_H_PRED, // SMOOTH_H_PRED + PAETH_PRED, // PAETH_PRED + NEARESTMV, // NEARESTMV + NEARMV, // NEARMV + GLOBALMV, // GLOBALMV + NEWMV, // NEWMV + NEARESTMV, // NEAREST_NEARESTMV + NEARMV, // NEAR_NEARMV + NEARESTMV, // NEAREST_NEWMV + NEWMV, // NEW_NEARESTMV + NEARMV, // NEAR_NEWMV + NEWMV, // NEW_NEARMV + GLOBALMV, // GLOBAL_GLOBALMV + NEWMV, // NEW_NEWMV + }; + assert(NELEMENTS(lut) == MB_MODE_COUNT); + assert(is_inter_compound_mode(mode) || is_inter_singleref_mode(mode)); + return lut[mode]; +} + +static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) { + static const PREDICTION_MODE lut[] = { + MB_MODE_COUNT, // DC_PRED + MB_MODE_COUNT, // V_PRED + MB_MODE_COUNT, // H_PRED + MB_MODE_COUNT, // D45_PRED + MB_MODE_COUNT, // D135_PRED + MB_MODE_COUNT, // D113_PRED + MB_MODE_COUNT, // D157_PRED + MB_MODE_COUNT, // D203_PRED + MB_MODE_COUNT, // D67_PRED + MB_MODE_COUNT, // SMOOTH_PRED + MB_MODE_COUNT, // SMOOTH_V_PRED + MB_MODE_COUNT, // SMOOTH_H_PRED + MB_MODE_COUNT, // PAETH_PRED + MB_MODE_COUNT, // NEARESTMV + 
MB_MODE_COUNT, // NEARMV + MB_MODE_COUNT, // GLOBALMV + MB_MODE_COUNT, // NEWMV + NEARESTMV, // NEAREST_NEARESTMV + NEARMV, // NEAR_NEARMV + NEWMV, // NEAREST_NEWMV + NEARESTMV, // NEW_NEARESTMV + NEWMV, // NEAR_NEWMV + NEARMV, // NEW_NEARMV + GLOBALMV, // GLOBAL_GLOBALMV + NEWMV, // NEW_NEWMV + }; + assert(NELEMENTS(lut) == MB_MODE_COUNT); + assert(is_inter_compound_mode(mode)); + return lut[mode]; +} + +static INLINE int have_nearmv_in_inter_mode(PREDICTION_MODE mode) { + return (mode == NEARMV || mode == NEAR_NEARMV || mode == NEAR_NEWMV || + mode == NEW_NEARMV); +} + +static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) { + return (mode == NEWMV || mode == NEW_NEWMV || mode == NEAREST_NEWMV || + mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV); +} + +static INLINE int is_masked_compound_type(COMPOUND_TYPE type) { + return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD); +} + +/* For keyframes, intra block modes are predicted by the (already decoded) + modes for the Y blocks to the left and above us; for interframes, there + is a single probability table. */ + +typedef struct { + // Value of base colors for Y, U, and V + uint16_t palette_colors[3 * PALETTE_MAX_SIZE]; + // Number of base colors for Y (0) and UV (1) + uint8_t palette_size[2]; +} PALETTE_MODE_INFO; + +typedef struct { + FILTER_INTRA_MODE filter_intra_mode; + uint8_t use_filter_intra; +} FILTER_INTRA_MODE_INFO; + +static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = { + DC_PRED, V_PRED, H_PRED, D157_PRED, DC_PRED +}; + +#if CONFIG_RD_DEBUG +#define TXB_COEFF_COST_MAP_SIZE (MAX_MIB_SIZE) +#endif + +typedef struct RD_STATS { + int rate; + int64_t dist; + // Please be careful of using rdcost, it's not guaranteed to be set all the + // time. + // TODO(angiebird): Create a set of functions to manipulate the RD_STATS. In + // these functions, make sure rdcost is always up-to-date according to + // rate/dist. + int64_t rdcost; + int64_t sse; + int skip; // sse should equal to dist when skip == 1 + int zero_rate; +#if CONFIG_RD_DEBUG + int txb_coeff_cost[MAX_MB_PLANE]; + // TODO(jingning): Temporary solution to silence stack over-size warning + // in handle_inter_mode. This should be fixed after rate-distortion + // optimization refactoring. + int16_t txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE] + [TXB_COEFF_COST_MAP_SIZE]; +#endif // CONFIG_RD_DEBUG +} RD_STATS; + +// This struct is used to group function args that are commonly +// sent together in functions related to interinter compound modes +typedef struct { + uint8_t *seg_mask; + int8_t wedge_index; + int8_t wedge_sign; + DIFFWTD_MASK_TYPE mask_type; + COMPOUND_TYPE type; +} INTERINTER_COMPOUND_DATA; + +#define INTER_TX_SIZE_BUF_LEN 16 +#define TXK_TYPE_BUF_LEN 64 +// This structure now relates to 4x4 block regions. 
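The two LUTs above decompose every compound inter mode into its per-reference component modes. A hedged usage sketch (the helper name is illustrative, not part of the patch):

// e.g. NEAR_NEWMV splits into NEARMV for ref_frame[0] and NEWMV for
// ref_frame[1].
static INLINE void split_compound_mode(PREDICTION_MODE mode,
                                       PREDICTION_MODE *mode0,
                                       PREDICTION_MODE *mode1) {
  *mode0 = compound_ref0_mode(mode);
  *mode1 = compound_ref1_mode(mode);
}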
+typedef struct MB_MODE_INFO { + // interinter members + INTERINTER_COMPOUND_DATA interinter_comp; + WarpedMotionParams wm_params; + int_mv mv[2]; + int current_qindex; + // Only for INTER blocks + int_interpfilters interp_filters; + // TODO(debargha): Consolidate these flags +#if CONFIG_RD_DEBUG + RD_STATS rd_stats; + int mi_row; + int mi_col; +#endif +#if CONFIG_INSPECTION + int16_t tx_skip[TXK_TYPE_BUF_LEN]; +#endif + PALETTE_MODE_INFO palette_mode_info; + // Common for both INTER and INTRA blocks + BLOCK_SIZE sb_type; + PREDICTION_MODE mode; + // Only for INTRA blocks + UV_PREDICTION_MODE uv_mode; + // interintra members + INTERINTRA_MODE interintra_mode; + MOTION_MODE motion_mode; + PARTITION_TYPE partition; + MV_REFERENCE_FRAME ref_frame[2]; + FILTER_INTRA_MODE_INFO filter_intra_mode_info; + int8_t skip; + uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN]; + TX_SIZE tx_size; + int8_t delta_lf_from_base; + int8_t delta_lf[FRAME_LF_COUNT]; + int8_t interintra_wedge_index; + // The actual prediction angle is the base angle + (angle_delta * step). + int8_t angle_delta[PLANE_TYPES]; + /* deringing gain *per-superblock* */ + // Joint sign of alpha Cb and alpha Cr + int8_t cfl_alpha_signs; + // Index of the alpha Cb and alpha Cr combination + uint8_t cfl_alpha_idx; + uint8_t num_proj_ref; + uint8_t overlappable_neighbors[2]; + // If comp_group_idx=0, indicate if dist_wtd_comp(0) or avg_comp(1) is used. + uint8_t compound_idx; + uint8_t use_wedge_interintra : 1; + uint8_t segment_id : 3; + uint8_t seg_id_predicted : 1; // valid only when temporal_update is enabled + uint8_t skip_mode : 1; + uint8_t use_intrabc : 1; + uint8_t ref_mv_idx : 2; + // Indicate if masked compound is used(1) or not(0). + uint8_t comp_group_idx : 1; + int8_t cdef_strength : 4; +} MB_MODE_INFO; + +static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) { + return mbmi->use_intrabc; +} + +static INLINE PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) { + assert(mode < UV_INTRA_MODES); + static const PREDICTION_MODE uv2y[] = { + DC_PRED, // UV_DC_PRED + V_PRED, // UV_V_PRED + H_PRED, // UV_H_PRED + D45_PRED, // UV_D45_PRED + D135_PRED, // UV_D135_PRED + D113_PRED, // UV_D113_PRED + D157_PRED, // UV_D157_PRED + D203_PRED, // UV_D203_PRED + D67_PRED, // UV_D67_PRED + SMOOTH_PRED, // UV_SMOOTH_PRED + SMOOTH_V_PRED, // UV_SMOOTH_V_PRED + SMOOTH_H_PRED, // UV_SMOOTH_H_PRED + PAETH_PRED, // UV_PAETH_PRED + DC_PRED, // UV_CFL_PRED + INTRA_INVALID, // UV_INTRA_MODES + INTRA_INVALID, // UV_MODE_INVALID + }; + return uv2y[mode]; +} + +static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) { + return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME; +} + +static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) { + return mbmi->ref_frame[1] > INTRA_FRAME; +} + +static INLINE int has_uni_comp_refs(const MB_MODE_INFO *mbmi) { + return has_second_ref(mbmi) && (!((mbmi->ref_frame[0] >= BWDREF_FRAME) ^ + (mbmi->ref_frame[1] >= BWDREF_FRAME))); +} + +static INLINE MV_REFERENCE_FRAME comp_ref0(int ref_idx) { + static const MV_REFERENCE_FRAME lut[] = { + LAST_FRAME, // LAST_LAST2_FRAMES, + LAST_FRAME, // LAST_LAST3_FRAMES, + LAST_FRAME, // LAST_GOLDEN_FRAMES, + BWDREF_FRAME, // BWDREF_ALTREF_FRAMES, + LAST2_FRAME, // LAST2_LAST3_FRAMES + LAST2_FRAME, // LAST2_GOLDEN_FRAMES, + LAST3_FRAME, // LAST3_GOLDEN_FRAMES, + BWDREF_FRAME, // BWDREF_ALTREF2_FRAMES, + ALTREF2_FRAME, // ALTREF2_ALTREF_FRAMES, + }; + assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS); + return lut[ref_idx]; +} + +static INLINE MV_REFERENCE_FRAME 
comp_ref1(int ref_idx) { + static const MV_REFERENCE_FRAME lut[] = { + LAST2_FRAME, // LAST_LAST2_FRAMES, + LAST3_FRAME, // LAST_LAST3_FRAMES, + GOLDEN_FRAME, // LAST_GOLDEN_FRAMES, + ALTREF_FRAME, // BWDREF_ALTREF_FRAMES, + LAST3_FRAME, // LAST2_LAST3_FRAMES + GOLDEN_FRAME, // LAST2_GOLDEN_FRAMES, + GOLDEN_FRAME, // LAST3_GOLDEN_FRAMES, + ALTREF2_FRAME, // BWDREF_ALTREF2_FRAMES, + ALTREF_FRAME, // ALTREF2_ALTREF_FRAMES, + }; + assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS); + return lut[ref_idx]; +} + +PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi); + +PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi); + +static INLINE int is_global_mv_block(const MB_MODE_INFO *const mbmi, + TransformationType type) { + const PREDICTION_MODE mode = mbmi->mode; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int block_size_allowed = + AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; + return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION && + block_size_allowed; +} + +#if CONFIG_MISMATCH_DEBUG +static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, + int mi_row, int tx_blk_col, int tx_blk_row, + int subsampling_x, int subsampling_y) { + *pixel_c = ((mi_col >> subsampling_x) << MI_SIZE_LOG2) + + (tx_blk_col << MI_SIZE_LOG2); + *pixel_r = ((mi_row >> subsampling_y) << MI_SIZE_LOG2) + + (tx_blk_row << MI_SIZE_LOG2); +} +#endif + +enum { MV_PRECISION_Q3, MV_PRECISION_Q4 } UENUM1BYTE(mv_precision); + +struct buf_2d { + uint8_t *buf; + uint8_t *buf0; + int width; + int height; + int stride; +}; + +typedef struct eob_info { + uint16_t eob; + uint16_t max_scan_line; +} eob_info; + +typedef struct { + DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_MB_PLANE][MAX_SB_SQUARE]); + eob_info eob_data[MAX_MB_PLANE] + [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; + DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]); +} CB_BUFFER; + +typedef struct macroblockd_plane { + tran_low_t *dqcoeff; + tran_low_t *dqcoeff_block; + eob_info *eob_data; + PLANE_TYPE plane_type; + int subsampling_x; + int subsampling_y; + struct buf_2d dst; + struct buf_2d pre[2]; + ENTROPY_CONTEXT *above_entropy_context; + ENTROPY_CONTEXT *left_entropy_context; + + // The dequantizers below are true dequantizers used only in the + // dequantization process. They have the same coefficient + // shift/scale as TX. 
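+  // A sketch of typical use (assumed; it ignores the TX-dependent shift and
+  // any quantization matrix): for each segment, index 0 holds the DC dequant
+  // step and index 1 the AC step, so roughly
+  //   dqcoeff = qcoeff * seg_dequant_QTX[segment_id][coeff_idx != 0];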
+  int16_t seg_dequant_QTX[MAX_SEGMENTS][2];
+  uint8_t *color_index_map;
+
+  // block size in pixels
+  uint8_t width, height;
+
+  qm_val_t *seg_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  qm_val_t *seg_qmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+} MACROBLOCKD_PLANE;
+
+#define BLOCK_OFFSET(i) ((i) << 4)
+
+typedef struct {
+  DECLARE_ALIGNED(16, InterpKernel, vfilter);
+  DECLARE_ALIGNED(16, InterpKernel, hfilter);
+} WienerInfo;
+
+typedef struct {
+  int ep;
+  int xqd[2];
+} SgrprojInfo;
+
+#if CONFIG_DEBUG
+#define CFL_SUB8X8_VAL_MI_SIZE (4)
+#define CFL_SUB8X8_VAL_MI_SQUARE \
+  (CFL_SUB8X8_VAL_MI_SIZE * CFL_SUB8X8_VAL_MI_SIZE)
+#endif  // CONFIG_DEBUG
+#define CFL_MAX_BLOCK_SIZE (BLOCK_32X32)
+#define CFL_BUF_LINE (32)
+#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
+#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
+#define CFL_BUF_SQUARE (CFL_BUF_LINE * CFL_BUF_LINE)
+typedef struct cfl_ctx {
+  // Q3 reconstructed luma pixels (only Q2 is required, but Q3 is used to
+  // avoid shifts)
+  uint16_t recon_buf_q3[CFL_BUF_SQUARE];
+  // Q3 AC contributions (reconstructed luma pixels - tx block avg)
+  int16_t ac_buf_q3[CFL_BUF_SQUARE];
+
+  // Cache the DC_PRED when performing RDO, so it does not have to be
+  // recomputed for every scaling parameter
+  int dc_pred_is_cached[CFL_PRED_PLANES];
+  // The DC_PRED cache is disabled when decoding
+  int use_dc_pred_cache;
+  // Only cache the first row of the DC_PRED
+  int16_t dc_pred_cache[CFL_PRED_PLANES][CFL_BUF_LINE];
+
+  // Height and width currently used in the CfL prediction buffer.
+  int buf_height, buf_width;
+
+  int are_parameters_computed;
+
+  // Chroma subsampling
+  int subsampling_x, subsampling_y;
+
+  // Whether the reconstructed luma pixels need to be stored
+  int store_y;
+
+#if CONFIG_DEBUG
+  int rate;
+#endif  // CONFIG_DEBUG
+} CFL_CTX;
+
+typedef struct dist_wtd_comp_params {
+  int use_dist_wtd_comp_avg;
+  int fwd_offset;
+  int bck_offset;
+} DIST_WTD_COMP_PARAMS;
+
+struct scale_factors;
+
+// Most/all of the members here are mere pointers; the actual arrays are
+// allocated elsewhere. This is mostly for coding convenience.
+typedef struct macroblockd {
+  // Row and column position of current macroblock in mi units.
+  int mi_row;
+  int mi_col;
+  // Same as cm->mi_params.mi_stride, copied here for convenience.
+  int mi_stride;
+
+  // True if current block transmits chroma information.
+  // More detail:
+  // The smallest supported block size for both luma and chroma planes is
+  // 4x4. Hence, in case of a subsampled chroma plane (YUV 4:2:0 or
+  // YUV 4:2:2), multiple luma blocks smaller than 8x8 may be combined into
+  // one chroma block.
+  // For example, for YUV 4:2:0, say an 8x8 area is split into four 4x4 luma
+  // blocks. Then a single chroma block of size 4x4 will cover the area of
+  // these four luma blocks. This is implemented in the bitstream as follows:
+  // - There are four MB_MODE_INFO structs for the four luma blocks.
+  // - The first 3 MB_MODE_INFO structs have is_chroma_ref = false, and so do
+  // not transmit any information for the chroma planes.
+  // - The last block has is_chroma_ref = true and transmits chroma
+  // information for the 4x4 chroma block that covers the whole 8x8 area
+  // covered by the four luma blocks.
+  // Similar logic applies for chroma blocks that cover 2 or 3 luma blocks.
+  bool is_chroma_ref;
+
+  struct macroblockd_plane plane[MAX_MB_PLANE];
+
+  TileInfo tile;
+
+  // Appropriate offset inside cm->mi_params.mi_grid_base based on current
+  // mi_row and mi_col.
+  MB_MODE_INFO **mi;
+
+  // True if 4x4 block above the current block is available.
+  bool up_available;
+  // True if 4x4 block to the left of the current block is available.
+  bool left_available;
+  // True if the above chroma reference block is available.
+  bool chroma_up_available;
+  // True if the left chroma reference block is available.
+  bool chroma_left_available;
+
+  // MB_MODE_INFO for 4x4 block to the left of the current block, if
+  // left_available == true; otherwise NULL.
+  MB_MODE_INFO *left_mbmi;
+  // MB_MODE_INFO for 4x4 block above the current block, if
+  // up_available == true; otherwise NULL.
+  MB_MODE_INFO *above_mbmi;
+  // Left chroma reference block if is_chroma_ref == true for the current
+  // block and chroma_left_available == true; otherwise NULL.
+  // See also: the special case logic when the current chroma block covers
+  // more than one luma block, in set_mi_row_col().
+  MB_MODE_INFO *chroma_left_mbmi;
+  // Above chroma reference block if is_chroma_ref == true for the current
+  // block and chroma_up_available == true; otherwise NULL.
+  // See also: the special case logic when the current chroma block covers
+  // more than one luma block, in set_mi_row_col().
+  MB_MODE_INFO *chroma_above_mbmi;
+
+  // Appropriate offset based on current 'mi_row' and 'mi_col', inside
+  // 'tx_type_map' in one of 'CommonModeInfoParams', 'PICK_MODE_CONTEXT' or
+  // 'MACROBLOCK' structs.
+  uint8_t *tx_type_map;
+  // Stride for 'tx_type_map'. Note that this may or may not be the same as
+  // 'mi_stride', depending on which actual array 'tx_type_map' points to.
+  int tx_type_map_stride;
+
+  // Distance of this macroblock from frame edges in 1/8th pixel units.
+  int mb_to_left_edge;
+  int mb_to_right_edge;
+  int mb_to_top_edge;
+  int mb_to_bottom_edge;
+
+  // Scale factors for reference frames of the current block.
+  // These are pointers into 'cm->ref_scale_factors'.
+  const struct scale_factors *block_ref_scale_factors[2];
+
+  const YV12_BUFFER_CONFIG *cur_buf;
+
+  // Entropy contexts for the above blocks.
+  // above_entropy_context[i][j] corresponds to above entropy context for ith
+  // plane and jth mi column of this *frame*, wrt current 'mi_row'.
+  // These are pointers into 'cm->above_contexts.entropy'.
+  ENTROPY_CONTEXT *above_entropy_context[MAX_MB_PLANE];
+  // Entropy contexts for the left blocks.
+  // left_entropy_context[i][j] corresponds to left entropy context for ith
+  // plane and jth mi row of this *superblock*, wrt current 'mi_col'.
+  // Note: These contain actual data, NOT pointers.
+  ENTROPY_CONTEXT left_entropy_context[MAX_MB_PLANE][MAX_MIB_SIZE];
+
+  // Partition contexts for the above blocks.
+  // above_partition_context[i] corresponds to above partition context for ith
+  // mi column of this *frame*, wrt current 'mi_row'.
+  // These are pointers into 'cm->above_contexts.partition'.
+  PARTITION_CONTEXT *above_partition_context;
+  // Partition contexts for the left blocks.
+  // left_partition_context[i] corresponds to left partition context for ith
+  // mi row of this *superblock*, wrt current 'mi_col'.
+  // Note: These contain actual data, NOT pointers.
+  PARTITION_CONTEXT left_partition_context[MAX_MIB_SIZE];
+
+  // Transform contexts for the above blocks.
+  // TODO(urvang): Indexed two different ways from cm->above_contexts.txfm in
+  // code currently. Need to make it consistent / document why.
+  TXFM_CONTEXT *above_txfm_context;
+  // Transform contexts for the left blocks.
+  TXFM_CONTEXT *left_txfm_context;
+  // TODO(urvang): 'left_txfm_context' points to 'left_txfm_context_buffer'.
+  // Can we remove this indirection?
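+  // Backing storage for 'left_txfm_context' above (MAX_MIB_SIZE entries,
+  // one per mi unit along the superblock's left edge).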
+  TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE];
+
+  // Default values for the two restoration filters for each plane.
+  // These values are used as reference values when writing the bitstream.
+  // That is, we transmit the delta between the actual values in
+  // cm->rst_info[plane].unit_info[unit_idx] and these reference values.
+  WienerInfo wiener_info[MAX_MB_PLANE];
+  SgrprojInfo sgrproj_info[MAX_MB_PLANE];
+
+  // Block dimensions in MB_MODE_INFO units.
+  uint8_t width;
+  uint8_t height;
+
+  uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+  CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+  uint16_t weight[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+  uint8_t is_sec_rect;
+
+  // Counts of each reference frame in the above and left neighboring blocks.
+  // NOTE: These take into account both single and compound references.
+  uint8_t neighbors_ref_counts[REF_FRAMES];
+
+  FRAME_CONTEXT *tile_ctx;
+  // Bit depth: copied from cm->seq_params.bit_depth for convenience.
+  int bd;
+
+  int qindex[MAX_SEGMENTS];
+  int lossless[MAX_SEGMENTS];
+  // TODO(urvang): Move to decoder.
+  int corrupted;
+  // Same as cm->features.cur_frame_force_integer_mv.
+  int cur_frame_force_integer_mv;
+  // Pointer to cm->error.
+  struct aom_internal_error_info *error_info;
+  // Same as cm->global_motion.
+  const WarpedMotionParams *global_motion;
+  int delta_qindex;
+  int current_qindex;
+  // The actual frame-level loop filter level is not available at the
+  // beginning of a tile on the encoder side (it only becomes available
+  // during actual filtering), so we record delta_lf (relative to the frame
+  // level) and code the delta between the previous superblock's delta lf and
+  // the current delta lf. This is equivalent to the delta between the
+  // previous superblock's actual lf and the current lf.
+  int8_t delta_lf_from_base;
+  // We have four frame filter levels for the different planes and
+  // directions, so to support the per-superblock update we need a few more
+  // params, as below:
+  // 0: delta loop filter level for y plane vertical
+  // 1: delta loop filter level for y plane horizontal
+  // 2: delta loop filter level for u plane
+  // 3: delta loop filter level for v plane
+  // To make this consistent with how each filter level is referenced in the
+  // segment features, we need to subtract 1, since
+  // SEG_LVL_ALT_LF_Y_V = 1;
+  // SEG_LVL_ALT_LF_Y_H = 2;
+  // SEG_LVL_ALT_LF_U = 3;
+  // SEG_LVL_ALT_LF_V = 4;
+  int8_t delta_lf[FRAME_LF_COUNT];
+  // cdef_transmitted[i] is true if CDEF strength for ith CDEF unit in the
+  // current superblock has already been read from (decoder) / written to
+  // (encoder) the bitstream; and false otherwise.
+  // More detail:
+  // (1) CDEF strength is transmitted only once per CDEF unit, in the 1st
+  // non-skip coding block. So, we need this array to keep track of whether
+  // CDEF strengths for the given CDEF units have been transmitted yet or
+  // not.
+  // (2) Superblock size can be either 128x128 or 64x64, but CDEF unit size
+  // is fixed to be 64x64. So, there may be 4 CDEF units within a superblock
+  // (if the superblock size is 128x128). Hence the array size is 4.
+  // (3) In the current implementation, CDEF strength for this CDEF unit is
+  // stored in the MB_MODE_INFO of the 1st block in this CDEF unit (inside
+  // cm->mi_params.mi_grid_base).
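+  // For example (illustrative; see the read/write sites for the exact
+  // formula): in a 128x128 superblock the four 64x64 CDEF units are indexed
+  // in raster order, i.e. 0 = top-left, 1 = top-right, 2 = bottom-left,
+  // 3 = bottom-right; with a 64x64 superblock only index 0 is used.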
+  bool cdef_transmitted[4];
+
+  DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
+  uint8_t *mc_buf[2];
+  CFL_CTX cfl;
+
+  DIST_WTD_COMP_PARAMS jcp_param;
+
+  uint16_t cb_offset[MAX_MB_PLANE];
+  uint16_t txb_offset[MAX_MB_PLANE];
+  uint16_t color_index_map_offset[2];
+
+  CONV_BUF_TYPE *tmp_conv_dst;
+  uint8_t *tmp_obmc_bufs[2];
+} MACROBLOCKD;
+
+static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) {
+  return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
+}
+
+static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) {
+  return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+             ? CONVERT_TO_BYTEPTR(buf16)
+             : buf16;
+}
+
+static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
+  switch (bsize) {
+    case BLOCK_4X4: return 0;
+    case BLOCK_8X8: return 1;
+    case BLOCK_16X16: return 2;
+    case BLOCK_32X32: return 3;
+    case BLOCK_64X64: return 4;
+    case BLOCK_128X128: return 5;
+    default: return SQR_BLOCK_SIZES;
+  }
+}
+
+// For a square block size 'bsize', returns the size of the sub-blocks used by
+// the given partition type. If the partition produces sub-blocks of different
+// sizes, then the function returns the largest sub-block size.
+// Implements the Partition_Subsize lookup table in the spec (Section 9.3.
+// Conversion tables).
+// Note: the input block size should be square; otherwise it is considered
+// invalid.
+static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize,
+                                               PARTITION_TYPE partition) {
+  if (partition == PARTITION_INVALID) {
+    return BLOCK_INVALID;
+  } else {
+    const int sqr_bsize_idx = get_sqr_bsize_idx(bsize);
+    return sqr_bsize_idx >= SQR_BLOCK_SIZES
+               ? BLOCK_INVALID
+               : subsize_lookup[partition][sqr_bsize_idx];
+  }
+}
+
+static INLINE TX_TYPE intra_mode_to_tx_type(const MB_MODE_INFO *mbmi,
+                                            PLANE_TYPE plane_type) {
+  static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = {
+    DCT_DCT,    // DC_PRED
+    ADST_DCT,   // V_PRED
+    DCT_ADST,   // H_PRED
+    DCT_DCT,    // D45_PRED
+    ADST_ADST,  // D135_PRED
+    ADST_DCT,   // D113_PRED
+    DCT_ADST,   // D157_PRED
+    DCT_ADST,   // D203_PRED
+    ADST_DCT,   // D67_PRED
+    ADST_ADST,  // SMOOTH_PRED
+    ADST_DCT,   // SMOOTH_V_PRED
+    DCT_ADST,   // SMOOTH_H_PRED
+    ADST_ADST,  // PAETH_PRED
+  };
+  const PREDICTION_MODE mode =
+      (plane_type == PLANE_TYPE_Y) ?
mbmi->mode : get_uv_mode(mbmi->uv_mode); + assert(mode < INTRA_MODES); + return _intra_mode_to_tx_type[mode]; +} + +static INLINE int is_rect_tx(TX_SIZE tx_size) { return tx_size >= TX_SIZES; } + +static INLINE int block_signals_txsize(BLOCK_SIZE bsize) { + return bsize > BLOCK_4X4; +} + +// Number of transform types in each set type +static const int av1_num_ext_tx_set[EXT_TX_SET_TYPES] = { + 1, 2, 5, 7, 12, 16, +}; + +static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = { + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, +}; + +static const uint16_t av1_reduced_intra_tx_used_flag[INTRA_MODES] = { + 0x080F, // DC_PRED: 0000 1000 0000 1111 + 0x040F, // V_PRED: 0000 0100 0000 1111 + 0x080F, // H_PRED: 0000 1000 0000 1111 + 0x020F, // D45_PRED: 0000 0010 0000 1111 + 0x080F, // D135_PRED: 0000 1000 0000 1111 + 0x040F, // D113_PRED: 0000 0100 0000 1111 + 0x080F, // D157_PRED: 0000 1000 0000 1111 + 0x080F, // D203_PRED: 0000 1000 0000 1111 + 0x040F, // D67_PRED: 0000 0100 0000 1111 + 0x080F, // SMOOTH_PRED: 0000 1000 0000 1111 + 0x040F, // SMOOTH_V_PRED: 0000 0100 0000 1111 + 0x080F, // SMOOTH_H_PRED: 0000 1000 0000 1111 + 0x0C0E, // PAETH_PRED: 0000 1100 0000 1110 +}; + +static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = { + 0x0001, // 0000 0000 0000 0001 + 0x0201, // 0000 0010 0000 0001 + 0x020F, // 0000 0010 0000 1111 + 0x0E0F, // 0000 1110 0000 1111 + 0x0FFF, // 0000 1111 1111 1111 + 0xFFFF, // 1111 1111 1111 1111 +}; + +static const TxSetType av1_ext_tx_set_lookup[2][2] = { + { EXT_TX_SET_DTT4_IDTX_1DDCT, EXT_TX_SET_DTT4_IDTX }, + { EXT_TX_SET_ALL16, EXT_TX_SET_DTT9_IDTX_1DDCT }, +}; + +static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter, + int use_reduced_set) { + const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size]; + if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY; + if (tx_size_sqr_up == TX_32X32) + return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY; + if (use_reduced_set) + return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX; + const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size]; + return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16]; +} + +// Maps tx set types to the indices. +static const int ext_tx_set_index[2][EXT_TX_SET_TYPES] = { + { // Intra + 0, -1, 2, 1, -1, -1 }, + { // Inter + 0, 3, -1, -1, 2, 1 }, +}; + +static INLINE int get_ext_tx_set(TX_SIZE tx_size, int is_inter, + int use_reduced_set) { + const TxSetType set_type = + av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set); + return ext_tx_set_index[is_inter][set_type]; +} + +static INLINE int get_ext_tx_types(TX_SIZE tx_size, int is_inter, + int use_reduced_set) { + const int set_type = + av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set); + return av1_num_ext_tx_set[set_type]; +} + +#define TXSIZEMAX(t1, t2) (tx_size_2d[(t1)] >= tx_size_2d[(t2)] ? (t1) : (t2)) +#define TXSIZEMIN(t1, t2) (tx_size_2d[(t1)] <= tx_size_2d[(t2)] ? 
(t1) : (t2)) + +static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) { + const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; + const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bsize]; + if (bsize == BLOCK_4X4) + return AOMMIN(max_txsize_lookup[bsize], largest_tx_size); + if (txsize_sqr_map[max_rect_tx_size] <= largest_tx_size) + return max_rect_tx_size; + else + return largest_tx_size; +} + +static const uint8_t mode_to_angle_map[] = { + 0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0, +}; + +// Converts block_index for given transform size to index of the block in raster +// order. +static INLINE int av1_block_index_to_raster_order(TX_SIZE tx_size, + int block_idx) { + // For transform size 4x8, the possible block_idx values are 0 & 2, because + // block_idx values are incremented in steps of size 'tx_width_unit x + // tx_height_unit'. But, for this transform size, block_idx = 2 corresponds to + // block number 1 in raster order, inside an 8x8 MI block. + // For any other transform size, the two indices are equivalent. + return (tx_size == TX_4X8 && block_idx == 2) ? 1 : block_idx; +} + +// Inverse of above function. +// Note: only implemented for transform sizes 4x4, 4x8 and 8x4 right now. +static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size, + int raster_order) { + assert(tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4); + // We ensure that block indices are 0 & 2 if tx size is 4x8 or 8x4. + return (tx_size == TX_4X4) ? raster_order : (raster_order > 0) ? 2 : 0; +} + +static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type, + const MACROBLOCKD *xd, + TX_SIZE tx_size, + int is_screen_content_type) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + + if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y || + xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32 || + is_screen_content_type) + return DCT_DCT; + + return intra_mode_to_tx_type(mbmi, plane_type); +} + +// Implements the get_plane_residual_size() function in the spec (Section +// 5.11.38. Get plane residual size function). 
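+// For example: with 4:2:0 subsampling (subsampling_x == subsampling_y == 1),
+// BLOCK_8X8 maps to BLOCK_4X4 for a chroma plane, while subsampling (0, 0)
+// returns 'bsize' unchanged.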
+static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, + int subsampling_x, + int subsampling_y) { + assert(bsize < BLOCK_SIZES_ALL); + assert(subsampling_x >= 0 && subsampling_x < 2); + assert(subsampling_y >= 0 && subsampling_y < 2); + return ss_size_lookup[bsize][subsampling_x][subsampling_y]; +} + +/* + * Logic to generate the lookup tables: + * + * TX_SIZE txs = max_txsize_rect_lookup[bsize]; + * for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level) + * txs = sub_tx_size_map[txs]; + * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + * const int bw_uint_log2 = mi_size_wide_log2[bsize]; + * const int stride_log2 = bw_uint_log2 - tx_w_log2; + */ +static INLINE int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row, + int blk_col) { + static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 1, 1, 2, 2, 3, + }; + static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 0, 2, 1, 3, 2, + }; + static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 2, 0, 1, 0, 1, 0, 1, + }; + const int index = + ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) + + (blk_col >> tw_w_log2_table[bsize]); + assert(index < INTER_TX_SIZE_BUF_LEN); + return index; +} + +#if CONFIG_INSPECTION +/* + * Here is the logic to generate the lookup tables: + * + * TX_SIZE txs = max_txsize_rect_lookup[bsize]; + * for (int level = 0; level < MAX_VARTX_DEPTH; ++level) + * txs = sub_tx_size_map[txs]; + * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + * const int bw_uint_log2 = mi_size_wide_log2[bsize]; + * const int stride_log2 = bw_uint_log2 - tx_w_log2; + */ +static INLINE int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row, + int blk_col) { + static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, + }; + static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, + }; + static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 3, 3, 0, 2, 0, 2, 0, 2, + }; + const int index = + ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) + + (blk_col >> tw_w_log2_table[bsize]); + assert(index < TXK_TYPE_BUF_LEN); + return index; +} +#endif // CONFIG_INSPECTION + +static INLINE void update_txk_array(MACROBLOCKD *const xd, int blk_row, + int blk_col, TX_SIZE tx_size, + TX_TYPE tx_type) { + const int stride = xd->tx_type_map_stride; + xd->tx_type_map[blk_row * stride + blk_col] = tx_type; + + const int txw = tx_size_wide_unit[tx_size]; + const int txh = tx_size_high_unit[tx_size]; + // The 16x16 unit is due to the constraint from tx_64x64 which sets the + // maximum tx size for chroma as 32x32. Coupled with 4x1 transform block + // size, the constraint takes effect in 32x16 / 16x32 size too. To solve + // the intricacy, cover all the 16x16 units inside a 64 level transform. 
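+  // Concretely (a sketch): TX_64X64 spans 16 mi units per side while the
+  // stamping unit below is TX_16X16 (4 mi units), so the loop writes tx_type
+  // at every 16x16 unit origin inside the 64-level transform, and later
+  // lookups at scaled chroma positions always hit an initialized entry.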
+ if (txw == tx_size_wide_unit[TX_64X64] || + txh == tx_size_high_unit[TX_64X64]) { + const int tx_unit = tx_size_wide_unit[TX_16X16]; + for (int idy = 0; idy < txh; idy += tx_unit) { + for (int idx = 0; idx < txw; idx += tx_unit) { + xd->tx_type_map[(blk_row + idy) * stride + blk_col + idx] = tx_type; + } + } + } +} + +static INLINE TX_TYPE av1_get_tx_type(const MACROBLOCKD *xd, + PLANE_TYPE plane_type, int blk_row, + int blk_col, TX_SIZE tx_size, + int reduced_tx_set) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) { + return DCT_DCT; + } + + TX_TYPE tx_type; + if (plane_type == PLANE_TYPE_Y) { + tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; + } else { + if (is_inter_block(mbmi)) { + // scale back to y plane's coordinate + const struct macroblockd_plane *const pd = &xd->plane[plane_type]; + blk_row <<= pd->subsampling_y; + blk_col <<= pd->subsampling_x; + tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; + } else { + // In intra mode, uv planes don't share the same prediction mode as y + // plane, so the tx_type should not be shared + tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV); + } + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set); + if (!av1_ext_tx_used[tx_set_type][tx_type]) tx_type = DCT_DCT; + } + assert(tx_type < TX_TYPES); + assert(av1_ext_tx_used[av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), + reduced_tx_set)][tx_type]); + return tx_type; +} + +void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, + const int num_planes); + +/* + * Logic to generate the lookup table: + * + * TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + * int depth = 0; + * while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) { + * depth++; + * tx_size = sub_tx_size_map[tx_size]; + * } + */ +static INLINE int bsize_to_max_depth(BLOCK_SIZE bsize) { + static const uint8_t bsize_to_max_depth_table[BLOCK_SIZES_ALL] = { + 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + }; + return bsize_to_max_depth_table[bsize]; +} + +/* + * Logic to generate the lookup table: + * + * TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + * assert(tx_size != TX_4X4); + * int depth = 0; + * while (tx_size != TX_4X4) { + * depth++; + * tx_size = sub_tx_size_map[tx_size]; + * } + * assert(depth < 10); + */ +static INLINE int bsize_to_tx_size_cat(BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + static const uint8_t bsize_to_tx_size_depth_table[BLOCK_SIZES_ALL] = { + 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 2, 2, 3, 3, 4, 4, + }; + const int depth = bsize_to_tx_size_depth_table[bsize]; + assert(depth <= MAX_TX_CATS); + return depth - 1; +} + +static INLINE TX_SIZE depth_to_tx_size(int depth, BLOCK_SIZE bsize) { + TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; + TX_SIZE tx_size = max_tx_size; + for (int d = 0; d < depth; ++d) tx_size = sub_tx_size_map[tx_size]; + return tx_size; +} + +static INLINE TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) { + switch (tx_size) { + case TX_64X64: + case TX_64X32: + case TX_32X64: return TX_32X32; + case TX_64X16: return TX_32X16; + case TX_16X64: return TX_16X32; + default: return tx_size; + } +} + +static INLINE TX_SIZE av1_get_max_uv_txsize(BLOCK_SIZE bsize, int subsampling_x, + int subsampling_y) { + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, subsampling_x, subsampling_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const TX_SIZE uv_tx 
= max_txsize_rect_lookup[plane_bsize];
+  return av1_get_adjusted_tx_size(uv_tx);
+}
+
+static INLINE TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  if (xd->lossless[mbmi->segment_id]) return TX_4X4;
+  if (plane == 0) return mbmi->tx_size;
+  const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
+  return av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+                               pd->subsampling_y);
+}
+
+void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                               const int num_planes);
+
+void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes);
+
+void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes);
+
+typedef void (*foreach_transformed_block_visitor)(int plane, int block,
+                                                  int blk_row, int blk_col,
+                                                  BLOCK_SIZE plane_bsize,
+                                                  TX_SIZE tx_size, void *arg);
+
+void av1_set_entropy_contexts(const MACROBLOCKD *xd,
+                              struct macroblockd_plane *pd, int plane,
+                              BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                              int has_eob, int aoff, int loff);
+
+#define MAX_INTERINTRA_SB_SQUARE (32 * 32)
+static INLINE int is_interintra_mode(const MB_MODE_INFO *mbmi) {
+  return (mbmi->ref_frame[0] > INTRA_FRAME &&
+          mbmi->ref_frame[1] == INTRA_FRAME);
+}
+
+static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
+  return (bsize >= BLOCK_8X8) && (bsize <= BLOCK_32X32);
+}
+
+static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
+  return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END);
+}
+
+static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
+  return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME);
+}
+
+static INLINE int is_interintra_allowed(const MB_MODE_INFO *mbmi) {
+  return is_interintra_allowed_bsize(mbmi->sb_type) &&
+         is_interintra_allowed_mode(mbmi->mode) &&
+         is_interintra_allowed_ref(mbmi->ref_frame);
+}
+
+static INLINE int is_interintra_allowed_bsize_group(int group) {
+  int i;
+  for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+    if (size_group_lookup[i] == group &&
+        is_interintra_allowed_bsize((BLOCK_SIZE)i)) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+static INLINE int is_interintra_pred(const MB_MODE_INFO *mbmi) {
+  return mbmi->ref_frame[0] > INTRA_FRAME &&
+         mbmi->ref_frame[1] == INTRA_FRAME && is_interintra_allowed(mbmi);
+}
+
+static INLINE int get_vartx_max_txsize(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                       int plane) {
+  if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;
+  const TX_SIZE max_txsize = max_txsize_rect_lookup[bsize];
+  if (plane == 0) return max_txsize;            // luma
+  return av1_get_adjusted_tx_size(max_txsize);  // chroma
+}
+
+static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
+}
+
+static INLINE int is_motion_variation_allowed_compound(
+    const MB_MODE_INFO *mbmi) {
+  return !has_second_ref(mbmi);
+}
+
+// input: log2 of length, 0(4), 1(8), ...
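+// e.g. a 4-pixel edge considers 0 overlappable neighbors, an 8-pixel edge 1,
+// growing by one per doubling of the edge length and capped at 4 for 64- and
+// 128-pixel edges.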
+static const int max_neighbor_obmc[6] = { 0, 1, 2, 3, 4, 4 }; + +static INLINE int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) { + return !(mbmi->overlappable_neighbors[0] == 0 && + mbmi->overlappable_neighbors[1] == 0); +} + +static INLINE MOTION_MODE +motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, int allow_warped_motion) { + if (xd->cur_frame_force_integer_mv == 0) { + const TransformationType gm_type = gm_params[mbmi->ref_frame[0]].wmtype; + if (is_global_mv_block(mbmi, gm_type)) return SIMPLE_TRANSLATION; + } + if (is_motion_variation_allowed_bsize(mbmi->sb_type) && + is_inter_mode(mbmi->mode) && mbmi->ref_frame[1] != INTRA_FRAME && + is_motion_variation_allowed_compound(mbmi)) { + if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION; + assert(!has_second_ref(mbmi)); + if (mbmi->num_proj_ref >= 1 && + (allow_warped_motion && + !av1_is_scaled(xd->block_ref_scale_factors[0]))) { + if (xd->cur_frame_force_integer_mv) { + return OBMC_CAUSAL; + } + return WARPED_CAUSAL; + } + return OBMC_CAUSAL; + } else { + return SIMPLE_TRANSLATION; + } +} + +static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) { + return (is_inter_block(mbmi)); +} + +static INLINE int av1_allow_palette(int allow_screen_content_tools, + BLOCK_SIZE sb_type) { + assert(sb_type < BLOCK_SIZES_ALL); + return allow_screen_content_tools && block_size_wide[sb_type] <= 64 && + block_size_high[sb_type] <= 64 && sb_type >= BLOCK_8X8; +} + +// Returns sub-sampled dimensions of the given block. +// The output values for 'rows_within_bounds' and 'cols_within_bounds' will +// differ from 'height' and 'width' when part of the block is outside the +// right +// and/or bottom image boundary. +static INLINE void av1_get_block_dimensions(BLOCK_SIZE bsize, int plane, + const MACROBLOCKD *xd, int *width, + int *height, + int *rows_within_bounds, + int *cols_within_bounds) { + const int block_height = block_size_high[bsize]; + const int block_width = block_size_wide[bsize]; + const int block_rows = (xd->mb_to_bottom_edge >= 0) + ? block_height + : (xd->mb_to_bottom_edge >> 3) + block_height; + const int block_cols = (xd->mb_to_right_edge >= 0) + ? block_width + : (xd->mb_to_right_edge >> 3) + block_width; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_x == 0)); + assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_y == 0)); + assert(block_width >= block_cols); + assert(block_height >= block_rows); + const int plane_block_width = block_width >> pd->subsampling_x; + const int plane_block_height = block_height >> pd->subsampling_y; + // Special handling for chroma sub8x8. 
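+  // (A sketch of the intent: in 4:2:0, a sub-8x8 luma block maps to a chroma
+  // block only 2 pixels wide and/or high; the "+ 2 * is_chroma_sub8_*" terms
+  // below pad the reported chroma dimensions up to the 4-pixel minimum.)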
+ const int is_chroma_sub8_x = plane > 0 && plane_block_width < 4; + const int is_chroma_sub8_y = plane > 0 && plane_block_height < 4; + if (width) *width = plane_block_width + 2 * is_chroma_sub8_x; + if (height) *height = plane_block_height + 2 * is_chroma_sub8_y; + if (rows_within_bounds) { + *rows_within_bounds = + (block_rows >> pd->subsampling_y) + 2 * is_chroma_sub8_y; + } + if (cols_within_bounds) { + *cols_within_bounds = + (block_cols >> pd->subsampling_x) + 2 * is_chroma_sub8_x; + } +} + +/* clang-format off */ +typedef aom_cdf_prob (*MapCdf)[PALETTE_COLOR_INDEX_CONTEXTS] + [CDF_SIZE(PALETTE_COLORS)]; +typedef const int (*ColorCost)[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; +/* clang-format on */ + +typedef struct { + int rows; + int cols; + int n_colors; + int plane_width; + int plane_height; + uint8_t *color_map; + MapCdf map_cdf; + ColorCost color_cost; +} Av1ColorMapParam; + +static INLINE int is_nontrans_global_motion(const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + int ref; + + // First check if all modes are GLOBALMV + if (mbmi->mode != GLOBALMV && mbmi->mode != GLOBAL_GLOBALMV) return 0; + + if (AOMMIN(mi_size_wide[mbmi->sb_type], mi_size_high[mbmi->sb_type]) < 2) + return 0; + + // Now check if all global motion is non translational + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + if (xd->global_motion[mbmi->ref_frame[ref]].wmtype == TRANSLATION) return 0; + } + return 1; +} + +static INLINE PLANE_TYPE get_plane_type(int plane) { + return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV; +} + +static INLINE int av1_get_max_eob(TX_SIZE tx_size) { + if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) { + return 1024; + } + if (tx_size == TX_16X64 || tx_size == TX_64X16) { + return 512; + } + return tx_size_2d[tx_size]; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_BLOCKD_H_ diff --git a/libs/libaom/src/av1/common/cdef.c b/libs/libaom/src/av1/common/cdef.c new file mode 100644 index 000000000..ef7b866b5 --- /dev/null +++ b/libs/libaom/src/av1/common/cdef.c @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/cdef.h" +#include "av1/common/cdef_block.h" +#include "av1/common/reconinter.h" + +static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col, + int mi_stride) { + MB_MODE_INFO **mbmi = grid + mi_row * mi_stride + mi_col; + for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r, mbmi += mi_stride) { + for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c) { + if (!mbmi[c]->skip) return 0; + } + } + + return 1; +} + +int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col, cdef_list *dlist, + BLOCK_SIZE bs) { + MB_MODE_INFO **grid = mi_params->mi_grid_base; + int maxc = mi_params->mi_cols - mi_col; + int maxr = mi_params->mi_rows - mi_row; + + if (bs == BLOCK_128X128 || bs == BLOCK_128X64) + maxc = AOMMIN(maxc, MI_SIZE_128X128); + else + maxc = AOMMIN(maxc, MI_SIZE_64X64); + if (bs == BLOCK_128X128 || bs == BLOCK_64X128) + maxr = AOMMIN(maxr, MI_SIZE_128X128); + else + maxr = AOMMIN(maxr, MI_SIZE_64X64); + + const int r_step = 2; // mi_size_high[BLOCK_8X8] + const int c_step = 2; // mi_size_wide[BLOCK_8X8] + const int r_shift = 1; + const int c_shift = 1; + int count = 0; + for (int r = 0; r < maxr; r += r_step) { + for (int c = 0; c < maxc; c += c_step) { + if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, + mi_params->mi_stride)) { + dlist[count].by = r >> r_shift; + dlist[count].bx = c >> c_shift; + count++; + } + } + } + return count; +} + +void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, int v, + int h) { + for (int i = 0; i < v; i++) { + for (int j = 0; j < h; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, int v, + int h) { + for (int i = 0; i < v; i++) { + for (int j = 0; j < h; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +static void copy_sb8_16(AV1_COMMON *cm, uint16_t *dst, int dstride, + const uint8_t *src, int src_voffset, int src_hoffset, + int sstride, int vsize, int hsize) { + if (cm->seq_params.use_highbitdepth) { + const uint16_t *base = + &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset]; + cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); + } else { + const uint8_t *base = &src[src_voffset * sstride + src_hoffset]; + cdef_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); + } +} + +static INLINE void fill_rect(uint16_t *dst, int dstride, int v, int h, + uint16_t x) { + for (int i = 0; i < v; i++) { + for (int j = 0; j < h; j++) { + dst[i * dstride + j] = x; + } + } +} + +static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src, + int sstride, int v, int h) { + for (int i = 0; i < v; i++) { + for (int j = 0; j < h; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, + MACROBLOCKD *xd) { + const CdefInfo *const cdef_info = &cm->cdef_info; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int num_planes = av1_num_planes(cm); + DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]); + uint16_t *linebuf[3]; + uint16_t *colbuf[3]; + cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64]; + unsigned char *row_cdef, *prev_row_cdef, *curr_row_cdef; + int cdef_count; + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 
+  int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+  int mi_wide_l2[3];
+  int mi_high_l2[3];
+  int xdec[3];
+  int ydec[3];
+  int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
+  const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
+                       num_planes);
+  row_cdef = aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2);
+  memset(row_cdef, 1, sizeof(*row_cdef) * (nhfb + 2) * 2);
+  prev_row_cdef = row_cdef + 1;
+  curr_row_cdef = prev_row_cdef + nhfb + 2;
+  for (int pli = 0; pli < num_planes; pli++) {
+    xdec[pli] = xd->plane[pli].subsampling_x;
+    ydec[pli] = xd->plane[pli].subsampling_y;
+    mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
+    mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
+  }
+  const int stride = (mi_params->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
+  for (int pli = 0; pli < num_planes; pli++) {
+    linebuf[pli] = aom_malloc(sizeof(*linebuf) * CDEF_VBORDER * stride);
+    colbuf[pli] =
+        aom_malloc(sizeof(*colbuf) *
+                   ((CDEF_BLOCKSIZE << mi_high_l2[pli]) + 2 * CDEF_VBORDER) *
+                   CDEF_HBORDER);
+  }
+  for (int fbr = 0; fbr < nvfb; fbr++) {
+    for (int pli = 0; pli < num_planes; pli++) {
+      const int block_height =
+          (MI_SIZE_64X64 << mi_high_l2[pli]) + 2 * CDEF_VBORDER;
+      fill_rect(colbuf[pli], CDEF_HBORDER, block_height, CDEF_HBORDER,
+                CDEF_VERY_LARGE);
+    }
+    int cdef_left = 1;
+    for (int fbc = 0; fbc < nhfb; fbc++) {
+      int level, sec_strength;
+      int uv_level, uv_sec_strength;
+      int nhb, nvb;
+      int cstart = 0;
+      curr_row_cdef[fbc] = 0;
+      if (mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+                                  MI_SIZE_64X64 * fbc] == NULL ||
+          mi_params
+                  ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+                                 MI_SIZE_64X64 * fbc]
+                  ->cdef_strength == -1) {
+        cdef_left = 0;
+        continue;
+      }
+      if (!cdef_left) cstart = -CDEF_HBORDER;
+      nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+      nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
+      int frame_top, frame_left, frame_bottom, frame_right;
+
+      int mi_row = MI_SIZE_64X64 * fbr;
+      int mi_col = MI_SIZE_64X64 * fbc;
+      // For the current filter block, its top-left corner mi structure
+      // (mi_tl) is first accessed to check whether the top and left
+      // boundaries are frame boundaries. Then bottom-left and top-right mi
+      // structures are accessed to check whether the bottom and right
+      // boundaries (respectively) are frame boundaries.
+      //
+      // Note that we can't just check the bottom-right mi structure - e.g.
+      // if we're at the right-hand edge of the frame but not the bottom,
+      // then the bottom-right mi is NULL but the bottom-left is not.
+      frame_top = (mi_row == 0) ? 1 : 0;
+      frame_left = (mi_col == 0) ? 1 : 0;
+
+      if (fbr != nvfb - 1)
+        frame_bottom = (mi_row + MI_SIZE_64X64 == mi_params->mi_rows) ? 1 : 0;
+      else
+        frame_bottom = 1;
+
+      if (fbc != nhfb - 1)
+        frame_right = (mi_col + MI_SIZE_64X64 == mi_params->mi_cols) ?
1 : 0; + else + frame_right = 1; + + const int mbmi_cdef_strength = + mi_params + ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc] + ->cdef_strength; + level = + cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; + sec_strength = + cdef_info->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; + sec_strength += sec_strength == 3; + uv_level = + cdef_info->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; + uv_sec_strength = + cdef_info->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; + uv_sec_strength += uv_sec_strength == 3; + if ((level == 0 && sec_strength == 0 && uv_level == 0 && + uv_sec_strength == 0) || + (cdef_count = av1_cdef_compute_sb_list(mi_params, fbr * MI_SIZE_64X64, + fbc * MI_SIZE_64X64, dlist, + BLOCK_64X64)) == 0) { + cdef_left = 0; + continue; + } + + curr_row_cdef[fbc] = 1; + for (int pli = 0; pli < num_planes; pli++) { + int coffset; + int rend, cend; + int damping = cdef_info->cdef_damping; + int hsize = nhb << mi_wide_l2[pli]; + int vsize = nvb << mi_high_l2[pli]; + + if (pli) { + level = uv_level; + sec_strength = uv_sec_strength; + } + + if (fbc == nhfb - 1) + cend = hsize; + else + cend = hsize + CDEF_HBORDER; + + if (fbr == nvfb - 1) + rend = vsize; + else + rend = vsize + CDEF_VBORDER; + + coffset = fbc * MI_SIZE_64X64 << mi_wide_l2[pli]; + if (fbc == nhfb - 1) { + /* On the last superblock column, fill in the right border with + CDEF_VERY_LARGE to avoid filtering with the outside. */ + fill_rect(&src[cend + CDEF_HBORDER], CDEF_BSTRIDE, + rend + CDEF_VBORDER, hsize + CDEF_HBORDER - cend, + CDEF_VERY_LARGE); + } + if (fbr == nvfb - 1) { + /* On the last superblock row, fill in the bottom border with + CDEF_VERY_LARGE to avoid filtering with the outside. 
*/ + fill_rect(&src[(rend + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE, + CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE); + } + /* Copy in the pixels we need from the current superblock for + deringing.*/ + copy_sb8_16(cm, + &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart], + CDEF_BSTRIDE, xd->plane[pli].dst.buf, + (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr, coffset + cstart, + xd->plane[pli].dst.stride, rend, cend - cstart); + if (!prev_row_cdef[fbc]) { + copy_sb8_16(cm, &src[CDEF_HBORDER], CDEF_BSTRIDE, + xd->plane[pli].dst.buf, + (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER, + coffset, xd->plane[pli].dst.stride, CDEF_VBORDER, hsize); + } else if (fbr > 0) { + copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &linebuf[pli][coffset], + stride, CDEF_VBORDER, hsize); + } else { + fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize, + CDEF_VERY_LARGE); + } + if (!prev_row_cdef[fbc - 1]) { + copy_sb8_16(cm, src, CDEF_BSTRIDE, xd->plane[pli].dst.buf, + (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER, + coffset - CDEF_HBORDER, xd->plane[pli].dst.stride, + CDEF_VBORDER, CDEF_HBORDER); + } else if (fbr > 0 && fbc > 0) { + copy_rect(src, CDEF_BSTRIDE, &linebuf[pli][coffset - CDEF_HBORDER], + stride, CDEF_VBORDER, CDEF_HBORDER); + } else { + fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (!prev_row_cdef[fbc + 1]) { + copy_sb8_16(cm, &src[CDEF_HBORDER + (nhb << mi_wide_l2[pli])], + CDEF_BSTRIDE, xd->plane[pli].dst.buf, + (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER, + coffset + hsize, xd->plane[pli].dst.stride, CDEF_VBORDER, + CDEF_HBORDER); + } else if (fbr > 0 && fbc < nhfb - 1) { + copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, + &linebuf[pli][coffset + hsize], stride, CDEF_VBORDER, + CDEF_HBORDER); + } else { + fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, + CDEF_HBORDER, CDEF_VERY_LARGE); + } + if (cdef_left) { + /* If we deringed the superblock on the left then we need to copy in + saved pixels. */ + copy_rect(src, CDEF_BSTRIDE, colbuf[pli], CDEF_HBORDER, + rend + CDEF_VBORDER, CDEF_HBORDER); + } + /* Saving pixels in case we need to dering the superblock on the + right. 
*/ + copy_rect(colbuf[pli], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE, + rend + CDEF_VBORDER, CDEF_HBORDER); + copy_sb8_16( + cm, &linebuf[pli][coffset], stride, xd->plane[pli].dst.buf, + (MI_SIZE_64X64 << mi_high_l2[pli]) * (fbr + 1) - CDEF_VBORDER, + coffset, xd->plane[pli].dst.stride, CDEF_VBORDER, hsize); + + if (frame_top) { + fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (frame_left) { + fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (frame_bottom) { + fill_rect(&src[(vsize + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE, + CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE); + } + if (frame_right) { + fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, + vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); + } + + if (cm->seq_params.use_highbitdepth) { + av1_cdef_filter_fb( + NULL, + &CONVERT_TO_SHORTPTR( + xd->plane[pli] + .dst.buf)[xd->plane[pli].dst.stride * + (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) + + (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])], + xd->plane[pli].dst.stride, + &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli], + ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level, + sec_strength, damping, coeff_shift); + } else { + av1_cdef_filter_fb( + &xd->plane[pli] + .dst.buf[xd->plane[pli].dst.stride * + (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) + + (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])], + NULL, xd->plane[pli].dst.stride, + &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli], + ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level, + sec_strength, damping, coeff_shift); + } + } + cdef_left = 1; + } + { + unsigned char *tmp = prev_row_cdef; + prev_row_cdef = curr_row_cdef; + curr_row_cdef = tmp; + } + } + aom_free(row_cdef); + for (int pli = 0; pli < num_planes; pli++) { + aom_free(linebuf[pli]); + aom_free(colbuf[pli]); + } +} diff --git a/libs/libaom/src/av1/common/cdef.h b/libs/libaom/src/av1/common/cdef.h new file mode 100644 index 000000000..c36fd135a --- /dev/null +++ b/libs/libaom/src/av1/common/cdef.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_CDEF_H_ +#define AOM_AV1_COMMON_CDEF_H_ + +#define CDEF_STRENGTH_BITS 6 + +#define CDEF_PRI_STRENGTHS 16 +#define CDEF_SEC_STRENGTHS 4 + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/cdef_block.h" + +static INLINE int sign(int i) { return i < 0 ? 
-1 : 1; }
+
+// Clamps the tap difference 'diff' toward zero: differences below
+// 'threshold' pass (nearly) unchanged, while larger differences are
+// progressively attenuated by the 'damping' shift.
+static INLINE int constrain(int diff, int threshold, int damping) {
+  if (!threshold) return 0;
+
+  const int shift = AOMMAX(0, damping - get_msb(threshold));
+  return sign(diff) *
+         AOMMIN(abs(diff), AOMMAX(0, threshold - (abs(diff) >> shift)));
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params,
+                             int mi_row, int mi_col, cdef_list *dlist,
+                             BLOCK_SIZE bsize);
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+                    MACROBLOCKD *xd);
+
+void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
+                     AV1_COMMON *cm, MACROBLOCKD *xd, int pick_method,
+                     int rdmult);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif  // AOM_AV1_COMMON_CDEF_H_
diff --git a/libs/libaom/src/av1/common/cdef_block.c b/libs/libaom/src/av1/common/cdef_block.c
new file mode 100644
index 000000000..7120705d3
--- /dev/null
+++ b/libs/libaom/src/av1/common/cdef_block.c
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cdef.h"
+
+/* Generated from gen_filter_tables.c. */
+DECLARE_ALIGNED(16, const int, cdef_directions[8][2]) = {
+  { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 },
+  { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 },
+  { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2 },
+  { 0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2 },
+  { 1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2 },
+  { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1 },
+  { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 },
+  { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }
+};
+
+/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
+   The search minimizes the weighted variance along all the lines in a
+   particular direction, i.e. the squared error between the input and a
+   "predicted" block where each pixel is replaced by the average along a line
+   in a particular direction. Since each direction has the same sum(x^2)
+   term, that term is never computed. See Section 2, step 2, of:
+   http://jmvalin.ca/notes/intra_paint.pdf */
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var,
+                    int coeff_shift) {
+  int i;
+  int32_t cost[8] = { 0 };
+  int partial[8][15] = { { 0 } };
+  int32_t best_cost = 0;
+  int best_dir = 0;
+  /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
+     The output is then 840 times larger, but we don't care for finding
+     the max. */
+  static const int div_table[] = { 0, 840, 420, 280, 210, 168, 140, 120, 105 };
+  for (i = 0; i < 8; i++) {
+    int j;
+    for (j = 0; j < 8; j++) {
+      int x;
+      /* We subtract 128 here to reduce the maximum range of the squared
+         partial sums.
*/ + x = (img[i * stride + j] >> coeff_shift) - 128; + partial[0][i + j] += x; + partial[1][i + j / 2] += x; + partial[2][i] += x; + partial[3][3 + i - j / 2] += x; + partial[4][7 + i - j] += x; + partial[5][3 - i / 2 + j] += x; + partial[6][j] += x; + partial[7][i / 2 + j] += x; + } + } + for (i = 0; i < 8; i++) { + cost[2] += partial[2][i] * partial[2][i]; + cost[6] += partial[6][i] * partial[6][i]; + } + cost[2] *= div_table[8]; + cost[6] *= div_table[8]; + for (i = 0; i < 7; i++) { + cost[0] += (partial[0][i] * partial[0][i] + + partial[0][14 - i] * partial[0][14 - i]) * + div_table[i + 1]; + cost[4] += (partial[4][i] * partial[4][i] + + partial[4][14 - i] * partial[4][14 - i]) * + div_table[i + 1]; + } + cost[0] += partial[0][7] * partial[0][7] * div_table[8]; + cost[4] += partial[4][7] * partial[4][7] * div_table[8]; + for (i = 1; i < 8; i += 2) { + int j; + for (j = 0; j < 4 + 1; j++) { + cost[i] += partial[i][3 + j] * partial[i][3 + j]; + } + cost[i] *= div_table[8]; + for (j = 0; j < 4 - 1; j++) { + cost[i] += (partial[i][j] * partial[i][j] + + partial[i][10 - j] * partial[i][10 - j]) * + div_table[2 * j + 2]; + } + } + for (i = 0; i < 8; i++) { + if (cost[i] > best_cost) { + best_cost = cost[i]; + best_dir = i; + } + } + /* Difference between the optimal variance and the variance along the + orthogonal direction. Again, the sum(x^2) terms cancel out. */ + *var = best_cost - cost[(best_dir + 4) & 7]; + /* We'd normally divide by 840, but dividing by 1024 is close enough + for what we're going to do with this. */ + *var >>= 10; + return best_dir; +} + +const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } }; +const int cdef_sec_taps[2] = { 2, 1 }; + +/* Smooth in the direction detected. */ +void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, + const uint16_t *in, int pri_strength, int sec_strength, + int dir, int pri_damping, int sec_damping, int bsize, + int coeff_shift) { + int i, j, k; + const int s = CDEF_BSTRIDE; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + for (i = 0; i < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_4X8); i++) { + for (j = 0; j < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_8X4); j++) { + int16_t sum = 0; + int16_t y; + int16_t x = in[i * s + j]; + int max = x; + int min = x; + for (k = 0; k < 2; k++) { + int16_t p0 = in[i * s + j + cdef_directions[dir][k]]; + int16_t p1 = in[i * s + j - cdef_directions[dir][k]]; + sum += pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping); + sum += pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping); + if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max); + if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max); + min = AOMMIN(p0, min); + min = AOMMIN(p1, min); + int16_t s0 = in[i * s + j + cdef_directions[(dir + 2) & 7][k]]; + int16_t s1 = in[i * s + j - cdef_directions[(dir + 2) & 7][k]]; + int16_t s2 = in[i * s + j + cdef_directions[(dir + 6) & 7][k]]; + int16_t s3 = in[i * s + j - cdef_directions[(dir + 6) & 7][k]]; + if (s0 != CDEF_VERY_LARGE) max = AOMMAX(s0, max); + if (s1 != CDEF_VERY_LARGE) max = AOMMAX(s1, max); + if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max); + if (s3 != CDEF_VERY_LARGE) max = AOMMAX(s3, max); + min = AOMMIN(s0, min); + min = AOMMIN(s1, min); + min = AOMMIN(s2, min); + min = AOMMIN(s3, min); + sum += sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping); + sum += sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping); + sum += sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping); + sum += 
sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping); + } + y = clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), min, max); + if (dst8) + dst8[i * dstride + j] = (uint8_t)y; + else + dst16[i * dstride + j] = (uint16_t)y; + } + } +} + +/* Compute the primary filter strength for an 8x8 block based on the + directional variance difference. A high variance difference means + that we have a highly directional pattern (e.g. a high contrast + edge), so we can apply more deringing. A low variance means that we + either have a low contrast edge, or a non-directional texture, so + we want to be careful not to blur. */ +static INLINE int adjust_strength(int strength, int32_t var) { + const int i = var >> 6 ? AOMMIN(get_msb(var >> 6), 12) : 0; + /* We use the variance of 8x8 blocks to adjust the strength. */ + return var ? (strength * (4 + i) + 8) >> 4 : 0; +} + +void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, + uint16_t *in, int xdec, int ydec, + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, + cdef_list *dlist, int cdef_count, int level, + int sec_strength, int damping, int coeff_shift) { + int bi; + int bx; + int by; + const int pri_strength = level << coeff_shift; + sec_strength <<= coeff_shift; + damping += coeff_shift - (pli != AOM_PLANE_Y); + const int bw_log2 = 3 - xdec; + const int bh_log2 = 3 - ydec; + if (dirinit && pri_strength == 0 && sec_strength == 0) { + // If we're here, both primary and secondary strengths are 0, and + // we still haven't written anything to dst16[] yet, so we just copy + // the input to dst16[]. This is necessary only for av1_cdef_search() + // and only av1_cdef_search() sets dirinit. + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + // TODO(stemidts/jmvalin): SIMD optimisations + for (int iy = 0; iy < 1 << bh_log2; iy++) { + memcpy(&dst16[(bi << (bw_log2 + bh_log2)) + (iy << bw_log2)], + &in[((by << bh_log2) + iy) * CDEF_BSTRIDE + (bx << bw_log2)], + ((size_t)1 << bw_log2) * sizeof(*dst16)); + } + } + return; + } + + if (pli == 0) { + if (!dirinit || !*dirinit) { + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx], + CDEF_BSTRIDE, &var[by][bx], coeff_shift); + } + if (dirinit) *dirinit = 1; + } + } + if (pli == 1 && xdec != ydec) { + for (bi = 0; bi < cdef_count; bi++) { + static const int conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 }; + static const int conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 }; + by = dlist[bi].by; + bx = dlist[bi].bx; + dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]]; + } + } + + const int bsize = + ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8); + const int t = pri_strength; + const int s = sec_strength; + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + if (dst8) { + cdef_filter_block( + &dst8[(by << bh_log2) * dstride + (bx << bw_log2)], NULL, dstride, + &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], + (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0, + damping, damping, bsize, coeff_shift); + } else { + cdef_filter_block( + NULL, + &dst16[dirinit ? bi << (bw_log2 + bh_log2) + : (by << bh_log2) * dstride + (bx << bw_log2)], + dirinit ? 1 << bw_log2 : dstride, + &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], + (pli ? t : adjust_strength(t, var[by][bx])), s, t ?
dir[by][bx] : 0, + damping, damping, bsize, coeff_shift); + } + } +} diff --git a/libs/libaom/src/av1/common/cdef_block.h b/libs/libaom/src/av1/common/cdef_block.h new file mode 100644 index 000000000..6b0ae0a9d --- /dev/null +++ b/libs/libaom/src/av1/common/cdef_block.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_CDEF_BLOCK_H_ +#define AOM_AV1_COMMON_CDEF_BLOCK_H_ + +#include "av1/common/odintrin.h" + +#define CDEF_BLOCKSIZE 64 +#define CDEF_BLOCKSIZE_LOG2 6 +#define CDEF_NBLOCKS ((1 << MAX_SB_SIZE_LOG2) / 8) +#define CDEF_SB_SHIFT (MAX_SB_SIZE_LOG2 - CDEF_BLOCKSIZE_LOG2) + +/* We need to buffer three vertical lines. */ +#define CDEF_VBORDER (3) +/* We only need to buffer three horizontal pixels too, but let's align to + 16 bytes (8 x 16 bits) to make vectorization easier. */ +#define CDEF_HBORDER (8) +#define CDEF_BSTRIDE \ + ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3) + +#define CDEF_VERY_LARGE (30000) +#define CDEF_INBUF_SIZE \ + (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER)) + +extern const int cdef_pri_taps[2][2]; +extern const int cdef_sec_taps[2]; +DECLARE_ALIGNED(16, extern const int, cdef_directions[8][2]); + +typedef struct { + uint8_t by; + uint8_t bx; +} cdef_list; + +typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16, + int dstride, const uint16_t *in, + int pri_strength, int sec_strength, + int dir, int pri_damping, + int sec_damping, int bsize, + int coeff_shift); +void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src, + cdef_list *dlist, int cdef_count, int bsize); + +void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, + uint16_t *in, int xdec, int ydec, + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, + cdef_list *dlist, int cdef_count, int level, + int sec_strength, int damping, int coeff_shift); +#endif // AOM_AV1_COMMON_CDEF_BLOCK_H_ diff --git a/libs/libaom/src/av1/common/cdef_block_avx2.c b/libs/libaom/src/av1/common/cdef_block_avx2.c new file mode 100644 index 000000000..e2b85b3e2 --- /dev/null +++ b/libs/libaom/src/av1/common/cdef_block_avx2.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_avx2 +#include "av1/common/cdef_block_simd.h" diff --git a/libs/libaom/src/av1/common/cdef_block_neon.c b/libs/libaom/src/av1/common/cdef_block_neon.c new file mode 100644 index 000000000..2d6bc65e3 --- /dev/null +++ b/libs/libaom/src/av1/common/cdef_block_neon.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_neon +#include "av1/common/cdef_block_simd.h" diff --git a/libs/libaom/src/av1/common/cdef_block_simd.h b/libs/libaom/src/av1/common/cdef_block_simd.h new file mode 100644 index 000000000..5a52bc1e4 --- /dev/null +++ b/libs/libaom/src/av1/common/cdef_block_simd.h @@ -0,0 +1,915 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ +#define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ + +#include "config/av1_rtcd.h" + +#include "av1/common/cdef_block.h" + +/* partial A is a 16-bit vector of the form: + [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form: + [0 y1 y2 y3 y4 y5 y6 y7]. + This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... + (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1 + and const2. */ +static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1, + v128 const2) { + v128 tmp; + /* Reverse partial B. */ + partialb = v128_shuffle_8( + partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c)); + /* Interleave the x and y values of identical indices and pair x8 with 0. */ + tmp = partiala; + partiala = v128_ziplo_16(partialb, partiala); + partialb = v128_ziphi_16(partialb, tmp); + /* Square and add the corresponding x and y values. */ + partiala = v128_madd_s16(partiala, partiala); + partialb = v128_madd_s16(partialb, partialb); + /* Multiply by constant. */ + partiala = v128_mullo_s32(partiala, const1); + partialb = v128_mullo_s32(partialb, const2); + /* Sum all results. */ + partiala = v128_add_32(partiala, partialb); + return partiala; +} + +static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) { + v128 t0, t1, t2, t3; + t0 = v128_ziplo_32(x1, x0); + t1 = v128_ziplo_32(x3, x2); + t2 = v128_ziphi_32(x1, x0); + t3 = v128_ziphi_32(x3, x2); + x0 = v128_ziplo_64(t1, t0); + x1 = v128_ziphi_64(t1, t0); + x2 = v128_ziplo_64(t3, t2); + x3 = v128_ziphi_64(t3, t2); + return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3)); +} + +/* Computes cost for directions 4, 5, 6 and 7. We can call this function again + to compute the remaining directions.
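+   (The caller transposes the pixels first when the other four directions
+   are needed, and the v128_from_32() constants used below play the role
+   of div_table[] in the C version.)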
*/ +static INLINE v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) { + v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b; + v128 partial6; + v128 tmp; + /* Partial sums for lines 0 and 1. */ + partial4a = v128_shl_n_byte(lines[0], 14); + partial4b = v128_shr_n_byte(lines[0], 2); + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4)); + tmp = v128_add_16(lines[0], lines[1]); + partial5a = v128_shl_n_byte(tmp, 10); + partial5b = v128_shr_n_byte(tmp, 6); + partial7a = v128_shl_n_byte(tmp, 4); + partial7b = v128_shr_n_byte(tmp, 12); + partial6 = tmp; + + /* Partial sums for lines 2 and 3. */ + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6)); + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8)); + tmp = v128_add_16(lines[2], lines[3]); + partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8)); + partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8)); + partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6)); + partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10)); + partial6 = v128_add_16(partial6, tmp); + + /* Partial sums for lines 4 and 5. */ + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10)); + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12)); + tmp = v128_add_16(lines[4], lines[5]); + partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6)); + partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10)); + partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8)); + partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8)); + partial6 = v128_add_16(partial6, tmp); + + /* Partial sums for lines 6 and 7. */ + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14)); + partial4a = v128_add_16(partial4a, lines[7]); + tmp = v128_add_16(lines[6], lines[7]); + partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4)); + partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12)); + partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10)); + partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6)); + partial6 = v128_add_16(partial6, tmp); + + /* Compute costs in terms of partial sums. */ + partial4a = + fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840), + v128_from_32(105, 120, 140, 168)); + partial7a = + fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0), + v128_from_32(105, 105, 105, 140)); + partial5a = + fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0), + v128_from_32(105, 105, 105, 140)); + partial6 = v128_madd_s16(partial6, partial6); + partial6 = v128_mullo_s32(partial6, v128_dup_32(105)); + + partial4a = hsum4(partial4a, partial5a, partial6, partial7a); + v128_store_unaligned(tmp_cost1, partial4a); + return partial4a; +} + +/* transpose and reverse the order of the lines -- equivalent to a 90-degree + counter-clockwise rotation of the pixels. 
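+   Rotating the block this way maps the remaining "mostly horizontal"
+   directions onto the ones compute_directions() already measures, so the
+   same routine is reused after the transpose.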
*/ +static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) { + const v128 tr0_0 = v128_ziplo_16(in[1], in[0]); + const v128 tr0_1 = v128_ziplo_16(in[3], in[2]); + const v128 tr0_2 = v128_ziphi_16(in[1], in[0]); + const v128 tr0_3 = v128_ziphi_16(in[3], in[2]); + const v128 tr0_4 = v128_ziplo_16(in[5], in[4]); + const v128 tr0_5 = v128_ziplo_16(in[7], in[6]); + const v128 tr0_6 = v128_ziphi_16(in[5], in[4]); + const v128 tr0_7 = v128_ziphi_16(in[7], in[6]); + + const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0); + const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4); + const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0); + const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4); + const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2); + const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6); + const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2); + const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6); + + res[7] = v128_ziplo_64(tr1_1, tr1_0); + res[6] = v128_ziphi_64(tr1_1, tr1_0); + res[5] = v128_ziplo_64(tr1_3, tr1_2); + res[4] = v128_ziphi_64(tr1_3, tr1_2); + res[3] = v128_ziplo_64(tr1_5, tr1_4); + res[2] = v128_ziphi_64(tr1_5, tr1_4); + res[1] = v128_ziplo_64(tr1_7, tr1_6); + res[0] = v128_ziphi_64(tr1_7, tr1_6); +} + +int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, + int coeff_shift) { + int i; + int32_t cost[8]; + int32_t best_cost = 0; + int best_dir = 0; + v128 lines[8]; + for (i = 0; i < 8; i++) { + lines[i] = v128_load_unaligned(&img[i * stride]); + lines[i] = + v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128)); + } + + /* Compute "mostly vertical" directions. */ + v128 dir47 = compute_directions(lines, cost + 4); + + array_reverse_transpose_8x8(lines, lines); + + /* Compute "mostly horizontal" directions. */ + v128 dir03 = compute_directions(lines, cost); + + v128 max = v128_max_s32(dir03, dir47); + max = v128_max_s32(max, v128_align(max, max, 8)); + max = v128_max_s32(max, v128_align(max, max, 4)); + best_cost = v128_low_u32(max); + v128 t = + v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03)); + best_dir = v128_movemask_8(v128_pack_s16_s8(t, t)); + best_dir = get_msb(best_dir ^ (best_dir - 1)); // Count trailing zeros + + /* Difference between the optimal variance and the variance along the + orthogonal direction. Again, the sum(x^2) terms cancel out. */ + *var = best_cost - cost[(best_dir + 4) & 7]; + /* We'd normally divide by 840, but dividing by 1024 is close enough + for what we're going to do with this. 
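+   (The result only feeds adjust_strength(), which looks at the magnitude
+   via get_msb(), so the roughly 18% underestimate from dividing by 1024
+   instead of 840 is harmless.)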
*/ + *var >>= 10; + return best_dir; +} + +// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp))) +SIMD_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold, + unsigned int adjdamp) { + v256 diff = v256_sub_16(a, b); + const v256 sign = v256_shr_n_s16(diff, 15); + diff = v256_abs_s16(diff); + const v256 s = + v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp)); + return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign); +} + +// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp))) +SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength, + unsigned int adjdamp) { + const v256 diff16 = v256_sub_16(a, b); + v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16)); + const v128 sign = v128_cmplt_s8(diff, v128_zero()); + diff = v128_abs_s8(diff); + return v128_xor( + v128_add_8(sign, + v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength), + v128_shr_u8(diff, adjdamp)))), + sign); +} + +void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride, + const uint16_t *in, int pri_strength, + int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift) { + v128 p0, p1, p2, p3; + v256 sum, row, tap, res; + v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE); + int po1 = cdef_directions[dir][0]; + int po2 = cdef_directions[dir][1]; + int s1o1 = cdef_directions[(dir + 2) & 7][0]; + int s1o2 = cdef_directions[(dir + 2) & 7][1]; + int s2o1 = cdef_directions[(dir + 6) & 7][0]; + int s2o2 = cdef_directions[(dir + 6) & 7][1]; + + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (pri_strength) + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + if (sec_strength) + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + + sum = v256_zero(); + row = v256_from_v64(v64_load_aligned(&in[0 * CDEF_BSTRIDE]), + v64_load_aligned(&in[1 * CDEF_BSTRIDE]), + v64_load_aligned(&in[2 * CDEF_BSTRIDE]), + v64_load_aligned(&in[3 * CDEF_BSTRIDE])); + max = min = row; + + if (pri_strength) { + // Primary near taps + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p0 = constrain(tap, row, pri_strength, pri_damping); + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p1 = constrain(tap, row, pri_strength, pri_damping); + + // sum += pri_taps[0] * (p0 + p1) + sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]), + v256_from_v128(v128_ziphi_8(p0, p1), + v128_ziplo_8(p0, p1)))); + + // Primary far taps + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p0 = constrain(tap, row, pri_strength, pri_damping); + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po2]), + 
v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p1 = constrain(tap, row, pri_strength, pri_damping); + + // sum += pri_taps[1] * (p0 + p1) + sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]), + v256_from_v128(v128_ziphi_8(p0, p1), + v128_ziplo_8(p0, p1)))); + } + + if (sec_strength) { + // Secondary near taps + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p0 = constrain(tap, row, sec_strength, sec_damping); + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p1 = constrain(tap, row, sec_strength, sec_damping); + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p2 = constrain(tap, row, sec_strength, sec_damping); + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p3 = constrain(tap, row, sec_strength, sec_damping); + + // sum += sec_taps[0] * (p0 + p1 + p2 + p3) + p0 = v128_add_8(p0, p1); + p2 = v128_add_8(p2, p3); + sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]), + v256_from_v128(v128_ziphi_8(p0, p2), + v128_ziplo_8(p0, p2)))); + + // Secondary far taps + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p0 = constrain(tap, row, sec_strength, sec_damping); + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p1 = constrain(tap, row, sec_strength, sec_damping); + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p2 = constrain(tap, row, sec_strength, sec_damping); + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - 
s2o2]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p3 = constrain(tap, row, sec_strength, sec_damping); + + // sum += sec_taps[1] * (p0 + p1 + p2 + p3) + p0 = v128_add_8(p0, p1); + p2 = v128_add_8(p2, p3); + + sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]), + v256_from_v128(v128_ziphi_8(p0, p2), + v128_ziplo_8(p0, p2)))); + } + + // res = row + ((sum - (sum < 0) + 8) >> 4) + sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); + res = v256_add_16(sum, v256_dup_16(8)); + res = v256_shr_n_s16(res, 4); + res = v256_add_16(row, res); + res = v256_min_s16(v256_max_s16(res, min), max); + res = v256_pack_s16_u8(res, res); + + p0 = v256_low_v128(res); + u32_store_aligned(&dst[0 * dstride], v64_high_u32(v128_high_v64(p0))); + u32_store_aligned(&dst[1 * dstride], v64_low_u32(v128_high_v64(p0))); + u32_store_aligned(&dst[2 * dstride], v64_high_u32(v128_low_v64(p0))); + u32_store_aligned(&dst[3 * dstride], v64_low_u32(v128_low_v64(p0))); +} + +void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride, + const uint16_t *in, int pri_strength, + int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift) { + int i; + v128 p0, p1, p2, p3; + v256 sum, row, res, tap; + v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE); + int po1 = cdef_directions[dir][0]; + int po2 = cdef_directions[dir][1]; + int s1o1 = cdef_directions[(dir + 2) & 7][0]; + int s1o2 = cdef_directions[(dir + 2) & 7][1]; + int s2o1 = cdef_directions[(dir + 6) & 7][0]; + int s2o2 = cdef_directions[(dir + 6) & 7][1]; + + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (pri_strength) + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + if (sec_strength) + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + for (i = 0; i < 8; i += 2) { + sum = v256_zero(); + row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]), + v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); + + max = min = row; + // Primary near taps + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p0 = constrain(tap, row, pri_strength, pri_damping); + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p1 = constrain(tap, row, pri_strength, pri_damping); + + // sum += pri_taps[0] * (p0 + p1) + sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]), + v256_from_v128(v128_ziphi_8(p0, p1), + v128_ziplo_8(p0, p1)))); + + // Primary far taps + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p0 = constrain(tap, row, pri_strength, pri_damping); + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); 
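+    // Illustrative numbers: with pri_strength = 16 and an input
+    // pri_damping of 6, the adjustment above yields 6 - get_msb(16) = 2,
+    // so a tap differing from the centre sample by 20 contributes
+    // sign * min(20, 16 - (20 >> 2)) = 11 before the tap weight is applied.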
+ p1 = constrain(tap, row, pri_strength, pri_damping); + + // sum += pri_taps[1] * (p0 + p1) + sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]), + v256_from_v128(v128_ziphi_8(p0, p1), + v128_ziplo_8(p0, p1)))); + + // Secondary near taps + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p0 = constrain(tap, row, sec_strength, sec_damping); + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p1 = constrain(tap, row, sec_strength, sec_damping); + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p2 = constrain(tap, row, sec_strength, sec_damping); + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p3 = constrain(tap, row, sec_strength, sec_damping); + + // sum += sec_taps[0] * (p0 + p1 + p2 + p3) + p0 = v128_add_8(p0, p1); + p2 = v128_add_8(p2, p3); + sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]), + v256_from_v128(v128_ziphi_8(p0, p2), + v128_ziplo_8(p0, p2)))); + + // Secondary far taps + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p0 = constrain(tap, row, sec_strength, sec_damping); + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p1 = constrain(tap, row, sec_strength, sec_damping); + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p2 = constrain(tap, row, sec_strength, sec_damping); + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p3 = constrain(tap, row, sec_strength, sec_damping); + + // sum += sec_taps[1] * (p0 + p1 + p2 + p3) + p0 = v128_add_8(p0, p1); + p2 = v128_add_8(p2, p3); + sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]), + v256_from_v128(v128_ziphi_8(p0, p2), + v128_ziplo_8(p0, p2)))); + + // res = row + ((sum - (sum < 0) + 8) >> 4) + sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); + res = v256_add_16(sum, v256_dup_16(8)); + res = v256_shr_n_s16(res, 4); + res = v256_add_16(row, res); + res = v256_min_s16(v256_max_s16(res, min), max); + res = v256_pack_s16_u8(res, res); + + p0 = v256_low_v128(res); + v64_store_aligned(&dst[i * dstride], v128_high_v64(p0)); + v64_store_aligned(&dst[(i + 1) * dstride], v128_low_v64(p0)); + } +} + +void 
SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride, + const uint16_t *in, int pri_strength, + int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift) { + int i; + v256 p0, p1, p2, p3, sum, row, res; + v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE); + int po1 = cdef_directions[dir][0]; + int po2 = cdef_directions[dir][1]; + int s1o1 = cdef_directions[(dir + 2) & 7][0]; + int s1o2 = cdef_directions[(dir + 2) & 7][1]; + int s2o1 = cdef_directions[(dir + 6) & 7][0]; + int s2o2 = cdef_directions[(dir + 6) & 7][1]; + + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (pri_strength) + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + if (sec_strength) + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + for (i = 0; i < 4; i += 4) { + sum = v256_zero(); + row = v256_from_v64(v64_load_aligned(&in[i * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE])); + min = max = row; + + // Primary near taps + p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1])); + p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1])); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + min = v256_min_s16(v256_min_s16(min, p0), p1); + p0 = constrain16(p0, row, pri_strength, pri_damping); + p1 = constrain16(p1, row, pri_strength, pri_damping); + + // sum += pri_taps[0] * (p0 + p1) + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); + + // Primary far taps + p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2])); + p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2])); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + min = v256_min_s16(v256_min_s16(min, p0), p1); + p0 = constrain16(p0, row, pri_strength, pri_damping); + p1 = constrain16(p1, row, pri_strength, pri_damping); + + // sum += pri_taps[1] * (p0 + p1) + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); + + // Secondary near taps + p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1])); + p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1])); + p2 = v256_from_v64(v64_load_unaligned(&in[i * 
CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1])); + p3 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1])); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))), + v256_andn(p3, v256_cmpeq_16(p3, large))); + min = v256_min_s16( + v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3); + p0 = constrain16(p0, row, sec_strength, sec_damping); + p1 = constrain16(p1, row, sec_strength, sec_damping); + p2 = constrain16(p2, row, sec_strength, sec_damping); + p3 = constrain16(p3, row, sec_strength, sec_damping); + + // sum += sec_taps[0] * (p0 + p1 + p2 + p3) + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); + + // Secondary far taps + p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2])); + p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2])); + p2 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2])); + p3 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2])); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))), + v256_andn(p3, v256_cmpeq_16(p3, large))); + min = v256_min_s16( + v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3); + p0 = constrain16(p0, row, sec_strength, sec_damping); + p1 = constrain16(p1, row, sec_strength, sec_damping); + p2 = constrain16(p2, row, sec_strength, sec_damping); + p3 = constrain16(p3, row, sec_strength, sec_damping); + + // sum += sec_taps[1] * (p0 + p1 + p2 + p3) + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); + + // res = row + ((sum - (sum < 0) + 8) >> 4) + sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); + res = v256_add_16(sum, v256_dup_16(8)); + res = v256_shr_n_s16(res, 4); + res = v256_add_16(row, res); + res = v256_min_s16(v256_max_s16(res, min), max); + + v64_store_aligned(&dst[i * dstride], v128_high_v64(v256_high_v128(res))); + v64_store_aligned(&dst[(i + 1) * dstride], + v128_low_v64(v256_high_v128(res))); + v64_store_aligned(&dst[(i + 2) * dstride], + v128_high_v64(v256_low_v128(res))); + v64_store_aligned(&dst[(i + 3) * dstride], + v128_low_v64(v256_low_v128(res))); + } +} + +void 
SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride, + const uint16_t *in, int pri_strength, + int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift) { + int i; + v256 sum, p0, p1, p2, p3, row, res; + v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE); + int po1 = cdef_directions[dir][0]; + int po2 = cdef_directions[dir][1]; + int s1o1 = cdef_directions[(dir + 2) & 7][0]; + int s1o2 = cdef_directions[(dir + 2) & 7][1]; + int s2o1 = cdef_directions[(dir + 6) & 7][0]; + int s2o2 = cdef_directions[(dir + 6) & 7][1]; + + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (pri_strength) + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + if (sec_strength) + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + + for (i = 0; i < 8; i += 2) { + sum = v256_zero(); + row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]), + v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); + + min = max = row; + // Primary near taps + p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1])); + p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1])); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + min = v256_min_s16(v256_min_s16(min, p0), p1); + p0 = constrain16(p0, row, pri_strength, pri_damping); + p1 = constrain16(p1, row, pri_strength, pri_damping); + + // sum += pri_taps[0] * (p0 + p1) + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); + + // Primary far taps + p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2])); + p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2])); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + min = v256_min_s16(v256_min_s16(min, p0), p1); + p0 = constrain16(p0, row, pri_strength, pri_damping); + p1 = constrain16(p1, row, pri_strength, pri_damping); + + // sum += pri_taps[1] * (p0 + p1) + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); + + // Secondary near taps + p0 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1])); + p1 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1])); + p2 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1])); + p3 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1])); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))), + v256_andn(p3, v256_cmpeq_16(p3, large))); + min = v256_min_s16( + v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3); + p0 = constrain16(p0, row, sec_strength, sec_damping); + p1 = constrain16(p1, row, sec_strength, sec_damping); + p2 = constrain16(p2, row, sec_strength, sec_damping); + p3 = constrain16(p3, row, 
sec_strength, sec_damping); + + // sum += sec_taps[0] * (p0 + p1 + p2 + p3) + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); + + // Secondary far taps + p0 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2])); + p1 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2])); + p2 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2])); + p3 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2])); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))), + v256_andn(p3, v256_cmpeq_16(p3, large))); + min = v256_min_s16( + v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3); + p0 = constrain16(p0, row, sec_strength, sec_damping); + p1 = constrain16(p1, row, sec_strength, sec_damping); + p2 = constrain16(p2, row, sec_strength, sec_damping); + p3 = constrain16(p3, row, sec_strength, sec_damping); + + // sum += sec_taps[1] * (p0 + p1 + p2 + p3) + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); + + // res = row + ((sum - (sum < 0) + 8) >> 4) + sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); + res = v256_add_16(sum, v256_dup_16(8)); + res = v256_shr_n_s16(res, 4); + res = v256_add_16(row, res); + res = v256_min_s16(v256_max_s16(res, min), max); + v128_store_unaligned(&dst[i * dstride], v256_high_v128(res)); + v128_store_unaligned(&dst[(i + 1) * dstride], v256_low_v128(res)); + } +} + +void SIMD_FUNC(cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, + const uint16_t *in, int pri_strength, + int sec_strength, int dir, int pri_damping, + int sec_damping, int bsize, int coeff_shift) { + if (dst8) { + if (bsize == BLOCK_8X8) { + SIMD_FUNC(cdef_filter_block_8x8_8) + (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + } else if (bsize == BLOCK_4X8) { + SIMD_FUNC(cdef_filter_block_4x4_8) + (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + SIMD_FUNC(cdef_filter_block_4x4_8) + (dst8 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift); + } else if (bsize == BLOCK_8X4) { + SIMD_FUNC(cdef_filter_block_4x4_8) + (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + SIMD_FUNC(cdef_filter_block_4x4_8) + (dst8 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + } else { + SIMD_FUNC(cdef_filter_block_4x4_8) + (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + } + } else { + if (bsize == BLOCK_8X8) { + SIMD_FUNC(cdef_filter_block_8x8_16) + (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + } else if (bsize == BLOCK_4X8) { + SIMD_FUNC(cdef_filter_block_4x4_16) + (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + SIMD_FUNC(cdef_filter_block_4x4_16) + (dst16 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, 
pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift); + } else if (bsize == BLOCK_8X4) { + SIMD_FUNC(cdef_filter_block_4x4_16) + (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + SIMD_FUNC(cdef_filter_block_4x4_16) + (dst16 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + } else { + assert(bsize == BLOCK_4X4); + SIMD_FUNC(cdef_filter_block_4x4_16) + (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + } + } +} + +void SIMD_FUNC(cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, + int v, int h) { + int i, j; + for (i = 0; i < v; i++) { + for (j = 0; j < (h & ~0x7); j += 8) { + v64 row = v64_load_unaligned(&src[i * sstride + j]); + v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row)); + } + for (; j < h; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, + int v, int h) { + int i, j; + for (i = 0; i < v; i++) { + for (j = 0; j < (h & ~0x7); j += 8) { + v128 row = v128_load_unaligned(&src[i * sstride + j]); + v128_store_unaligned(&dst[i * dstride + j], row); + } + for (; j < h; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +#endif // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ diff --git a/libs/libaom/src/av1/common/cdef_block_sse2.c b/libs/libaom/src/av1/common/cdef_block_sse2.c new file mode 100644 index 000000000..73f115d17 --- /dev/null +++ b/libs/libaom/src/av1/common/cdef_block_sse2.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_sse2 +#include "av1/common/cdef_block_simd.h" diff --git a/libs/libaom/src/av1/common/cdef_block_sse4.c b/libs/libaom/src/av1/common/cdef_block_sse4.c new file mode 100644 index 000000000..349329af6 --- /dev/null +++ b/libs/libaom/src/av1/common/cdef_block_sse4.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_sse4_1 +#include "av1/common/cdef_block_simd.h" diff --git a/libs/libaom/src/av1/common/cdef_block_ssse3.c b/libs/libaom/src/av1/common/cdef_block_ssse3.c new file mode 100644 index 000000000..3a93b150f --- /dev/null +++ b/libs/libaom/src/av1/common/cdef_block_ssse3.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_ssse3 +#include "av1/common/cdef_block_simd.h" diff --git a/libs/libaom/src/av1/common/cfl.c b/libs/libaom/src/av1/common/cfl.c new file mode 100644 index 000000000..98199cb95 --- /dev/null +++ b/libs/libaom/src/av1/common/cfl.c @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/av1_common_int.h" +#include "av1/common/cfl.h" +#include "av1/common/common_data.h" + +#include "config/av1_rtcd.h" + +void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params) { + assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE); + assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE); + + memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3)); + memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3)); + cfl->subsampling_x = seq_params->subsampling_x; + cfl->subsampling_y = seq_params->subsampling_y; + cfl->are_parameters_computed = 0; + cfl->store_y = 0; + // The DC_PRED cache is disabled by default and is only enabled in + // cfl_rd_pick_alpha + cfl->use_dc_pred_cache = 0; + cfl->dc_pred_is_cached[CFL_PRED_U] = 0; + cfl->dc_pred_is_cached[CFL_PRED_V] = 0; +} + +void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input, + CFL_PRED_TYPE pred_plane, int width) { + assert(pred_plane < CFL_PRED_PLANES); + assert(width <= CFL_BUF_LINE); + + if (is_cur_buf_hbd(xd)) { + uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input); + memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1); + return; + } + + memcpy(xd->cfl.dc_pred_cache[pred_plane], input, width); +} + +static void cfl_load_dc_pred_lbd(const int16_t *dc_pred_cache, uint8_t *dst, + int dst_stride, int width, int height) { + for (int j = 0; j < height; j++) { + memcpy(dst, dc_pred_cache, width); + dst += dst_stride; + } +} + +static void cfl_load_dc_pred_hbd(const int16_t *dc_pred_cache, uint16_t *dst, + int dst_stride, int width, int height) { + const size_t num_bytes = width << 1; + for (int j = 0; j < height; j++) { + memcpy(dst, dc_pred_cache, num_bytes); + dst += dst_stride; + } +} +void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, + TX_SIZE tx_size, CFL_PRED_TYPE pred_plane) { + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + assert(pred_plane < CFL_PRED_PLANES); + assert(width <= CFL_BUF_LINE); + assert(height <= CFL_BUF_LINE); + if (is_cur_buf_hbd(xd)) { + uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); + cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride, + width, height); + 
return; + } + cfl_load_dc_pred_lbd(xd->cfl.dc_pred_cache[pred_plane], dst, dst_stride, + width, height); +} + +// Due to frame boundary issues, it is possible that the total area covered by +// chroma exceeds that of luma. When this happens, we fill the missing pixels by +// repeating the last columns and/or rows. +static INLINE void cfl_pad(CFL_CTX *cfl, int width, int height) { + const int diff_width = width - cfl->buf_width; + const int diff_height = height - cfl->buf_height; + + if (diff_width > 0) { + const int min_height = height - diff_height; + uint16_t *recon_buf_q3 = cfl->recon_buf_q3 + (width - diff_width); + for (int j = 0; j < min_height; j++) { + const uint16_t last_pixel = recon_buf_q3[-1]; + assert(recon_buf_q3 + diff_width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE); + for (int i = 0; i < diff_width; i++) { + recon_buf_q3[i] = last_pixel; + } + recon_buf_q3 += CFL_BUF_LINE; + } + cfl->buf_width = width; + } + if (diff_height > 0) { + uint16_t *recon_buf_q3 = + cfl->recon_buf_q3 + ((height - diff_height) * CFL_BUF_LINE); + for (int j = 0; j < diff_height; j++) { + const uint16_t *last_row_q3 = recon_buf_q3 - CFL_BUF_LINE; + assert(recon_buf_q3 + width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE); + for (int i = 0; i < width; i++) { + recon_buf_q3[i] = last_row_q3[i]; + } + recon_buf_q3 += CFL_BUF_LINE; + } + cfl->buf_height = height; + } +} + +static void subtract_average_c(const uint16_t *src, int16_t *dst, int width, + int height, int round_offset, int num_pel_log2) { + int sum = round_offset; + const uint16_t *recon = src; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + sum += recon[i]; + } + recon += CFL_BUF_LINE; + } + const int avg = sum >> num_pel_log2; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + dst[i] = src[i] - avg; + } + src += CFL_BUF_LINE; + dst += CFL_BUF_LINE; + } +} + +CFL_SUB_AVG_FN(c) + +static INLINE int cfl_idx_to_alpha(uint8_t alpha_idx, int8_t joint_sign, + CFL_PRED_TYPE pred_type) { + const int alpha_sign = (pred_type == CFL_PRED_U) ? CFL_SIGN_U(joint_sign) + : CFL_SIGN_V(joint_sign); + if (alpha_sign == CFL_SIGN_ZERO) return 0; + const int abs_alpha_q3 = + (pred_type == CFL_PRED_U) ? CFL_IDX_U(alpha_idx) : CFL_IDX_V(alpha_idx); + return (alpha_sign == CFL_SIGN_POS) ? abs_alpha_q3 + 1 : -abs_alpha_q3 - 1; +} + +static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3, int width, + int height) { + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + dst[i] = clip_pixel(get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i]); + } + dst += dst_stride; + ac_buf_q3 += CFL_BUF_LINE; + } +} + +CFL_PREDICT_FN(c, lbd) + +#if CONFIG_AV1_HIGHBITDEPTH +void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride, + int alpha_q3, int bit_depth, int width, int height) { + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + dst[i] = clip_pixel_highbd( + get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i], bit_depth); + } + dst += dst_stride; + ac_buf_q3 += CFL_BUF_LINE; + } +} + +CFL_PREDICT_FN(c, hbd) +#endif + +static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) { + CFL_CTX *const cfl = &xd->cfl; + // Do not call cfl_compute_parameters multiple times on the same values.
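+  // cfl_store() resets are_parameters_computed whenever new luma pixels
+  // arrive, so the assert below catches any redundant recomputation.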
+ assert(cfl->are_parameters_computed == 0); + + cfl_pad(cfl, tx_size_wide[tx_size], tx_size_high[tx_size]); + cfl_get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3); + cfl->are_parameters_computed = 1; +} + +void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, + TX_SIZE tx_size, int plane) { + CFL_CTX *const cfl = &xd->cfl; + MB_MODE_INFO *mbmi = xd->mi[0]; + assert(is_cfl_allowed(xd)); + + if (!cfl->are_parameters_computed) cfl_compute_parameters(xd, tx_size); + + const int alpha_q3 = + cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1); + assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <= + CFL_BUF_SQUARE); +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); + cfl_get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, + alpha_q3, xd->bd); + return; + } +#endif + cfl_get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3); +} + +static void cfl_luma_subsampling_420_lbd_c(const uint8_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + for (int j = 0; j < height; j += 2) { + for (int i = 0; i < width; i += 2) { + const int bot = i + input_stride; + output_q3[i >> 1] = + (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1; + } + input += input_stride << 1; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_422_lbd_c(const uint8_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i += 2) { + output_q3[i >> 1] = (input[i] + input[i + 1]) << 2; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_444_lbd_c(const uint8_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + output_q3[i] = input[i] << 3; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void cfl_luma_subsampling_420_hbd_c(const uint16_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + for (int j = 0; j < height; j += 2) { + for (int i = 0; i < width; i += 2) { + const int bot = i + input_stride; + output_q3[i >> 1] = + (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1; + } + input += input_stride << 1; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_422_hbd_c(const uint16_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i += 2) { + output_q3[i >> 1] = (input[i] + input[i + 1]) << 2; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_444_hbd_c(const uint16_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + output_q3[i] = input[i] << 3; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} +#endif + +CFL_GET_SUBSAMPLE_FUNCTION(c) + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size, + int sub_x, 
int sub_y) {
+  if (sub_x == 1) {
+    if (sub_y == 1) {
+      return cfl_get_luma_subsampling_420_hbd(tx_size);
+    }
+    return cfl_get_luma_subsampling_422_hbd(tx_size);
+  }
+  return cfl_get_luma_subsampling_444_hbd(tx_size);
+}
+#endif
+
+static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size,
+                                                       int sub_x, int sub_y) {
+  if (sub_x == 1) {
+    if (sub_y == 1) {
+      return cfl_get_luma_subsampling_420_lbd(tx_size);
+    }
+    return cfl_get_luma_subsampling_422_lbd(tx_size);
+  }
+  return cfl_get_luma_subsampling_444_lbd(tx_size);
+}
+
+static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride,
+                      int row, int col, TX_SIZE tx_size, int use_hbd) {
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int tx_off_log2 = MI_SIZE_LOG2;
+  const int sub_x = cfl->subsampling_x;
+  const int sub_y = cfl->subsampling_y;
+  const int store_row = row << (tx_off_log2 - sub_y);
+  const int store_col = col << (tx_off_log2 - sub_x);
+  const int store_height = height >> sub_y;
+  const int store_width = width >> sub_x;
+
+  // Invalidate current parameters
+  cfl->are_parameters_computed = 0;
+
+  // Store the dimensions of the pixel-buffer surface that was written to;
+  // this way we can manage chroma overrun (e.g. when the chroma surface goes
+  // beyond the frame boundary).
+  if (col == 0 && row == 0) {
+    cfl->buf_width = store_width;
+    cfl->buf_height = store_height;
+  } else {
+    cfl->buf_width = OD_MAXI(store_col + store_width, cfl->buf_width);
+    cfl->buf_height = OD_MAXI(store_row + store_height, cfl->buf_height);
+  }
+
+  // Check that we will remain inside the pixel buffer.
+  assert(store_row + store_height <= CFL_BUF_LINE);
+  assert(store_col + store_width <= CFL_BUF_LINE);
+
+  // Store the input into the CfL pixel buffer
+  uint16_t *recon_buf_q3 =
+      cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col);
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_hbd) {
+    cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input),
+                                               input_stride, recon_buf_q3);
+  } else {
+    cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride,
+                                               recon_buf_q3);
+  }
+#else
+  (void)use_hbd;
+  cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, recon_buf_q3);
+#endif
+}
+
+// Adjust the row and column of blocks smaller than 8X8, as chroma-referenced
+// and non-chroma-referenced blocks are stored together in the CfL buffer.
+static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int mi_row,
+                                        int mi_col, int *row_out,
+                                        int *col_out) {
+  // Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s.
+  if ((mi_row & 0x01) && cfl->subsampling_y) {
+    assert(*row_out == 0);
+    (*row_out)++;
+  }
+
+  // Increment col index for right: 4x8, 4x16 or both right 4x4s.
+  if ((mi_col & 0x01) && cfl->subsampling_x) {
+    assert(*col_out == 0);
+    (*col_out)++;
+  }
+}
+
+void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
+                  BLOCK_SIZE bsize) {
+  CFL_CTX *const cfl = &xd->cfl;
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+  uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2];
+
+  if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
+    // Only dimensions of size 4 can have an odd offset.
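+    // For example, with 4:2:0 subsampling a 4x4 luma block subsamples to a
+    // 2x2 area of the buffer, so the four 4x4 blocks covering an 8x8 luma
+    // region tile a 4x4 area; sub8x8_adjust_offset() bumps (row, col) so that
+    // the bottom/right blocks are stored at offset 2 (1 << (MI_SIZE_LOG2 - 1))
+    // rather than overwriting offset 0.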
assert(!((col & 1) && tx_size_wide[tx_size] != 4));
+    assert(!((row & 1) && tx_size_high[tx_size] != 4));
+    sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
+  }
+  cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd));
+}
+
+static INLINE int max_intra_block_width(const MACROBLOCKD *xd,
+                                        BLOCK_SIZE plane_bsize, int plane,
+                                        TX_SIZE tx_size) {
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane)
+                              << MI_SIZE_LOG2;
+  return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]);
+}
+
+static INLINE int max_intra_block_height(const MACROBLOCKD *xd,
+                                         BLOCK_SIZE plane_bsize, int plane,
+                                         TX_SIZE tx_size) {
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane)
+                              << MI_SIZE_LOG2;
+  return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]);
+}
+
+void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  CFL_CTX *const cfl = &xd->cfl;
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+  int row = 0;
+  int col = 0;
+
+  if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
+    sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
+  }
+  const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size);
+  const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
+  tx_size = get_tx_size(width, height);
+  cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size,
+            is_cur_buf_hbd(xd));
+}
diff --git a/libs/libaom/src/av1/common/cfl.h b/libs/libaom/src/av1/common/cfl.h
new file mode 100644
index 000000000..a1d6dc2ea
--- /dev/null
+++ b/libs/libaom/src/av1/common/cfl.h
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_CFL_H_
+#define AOM_AV1_COMMON_CFL_H_
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+
+// Can we use CfL for the current block?
+static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  assert(bsize < BLOCK_SIZES_ALL);
+  if (xd->lossless[mbmi->segment_id]) {
+    // In lossless, CfL is available when the partition size is equal to the
+    // transform size.
+    const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
+    const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
+    const int plane_bsize = get_plane_block_size(bsize, ssx, ssy);
+    return (CFL_ALLOWED_TYPE)(plane_bsize == BLOCK_4X4);
+  }
+  // Spec: CfL is available to luma partitions less than or equal to 32x32
+  return (CFL_ALLOWED_TYPE)(block_size_wide[bsize] <= 32 &&
+                            block_size_high[bsize] <= 32);
+}
+
+// Do we need to save the luma pixels from the current block,
+// for a possible future CfL prediction?
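+// Roughly: never for monochrome; always for non-chroma-reference blocks,
+// since a later chroma-reference block may still choose CfL; otherwise only
+// when this intra block actually signals UV_CFL_PRED.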
+static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm,
+                                                  const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+
+  if (cm->seq_params.monochrome) return CFL_DISALLOWED;
+
+  if (!xd->is_chroma_ref) {
+    // For non-chroma-reference blocks, we should always store the luma pixels,
+    // in case the corresponding chroma-reference block uses CfL.
+    // Note that this can only happen for block sizes which are <8 on
+    // their shortest side, as otherwise they would be chroma reference
+    // blocks.
+    return CFL_ALLOWED;
+  }
+
+  // If this block has chroma information, we know whether we're
+  // actually going to perform a CfL prediction
+  return (CFL_ALLOWED_TYPE)(!is_inter_block(mbmi) &&
+                            mbmi->uv_mode == UV_CFL_PRED);
+}
+
+static INLINE int get_scaled_luma_q0(int alpha_q3, int16_t pred_buf_q3) {
+  int scaled_luma_q6 = alpha_q3 * pred_buf_q3;
+  return ROUND_POWER_OF_TWO_SIGNED(scaled_luma_q6, 6);
+}
+
+static INLINE CFL_PRED_TYPE get_cfl_pred_type(PLANE_TYPE plane) {
+  assert(plane > 0);
+  return (CFL_PRED_TYPE)(plane - 1);
+}
+
+void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+                       TX_SIZE tx_size, int plane);
+
+void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
+                  BLOCK_SIZE bsize);
+
+void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
+                       CFL_PRED_TYPE pred_plane, int width);
+
+void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+                      TX_SIZE tx_size, CFL_PRED_TYPE pred_plane);
+
+// Allows the CFL_SUBSAMPLE function to switch types depending on the bitdepth.
+#define CFL_lbd_TYPE uint8_t *cfl_type
+#define CFL_hbd_TYPE uint16_t *cfl_type
+
+// Declare a size-specific wrapper for the size-generic function. The compiler
+// will inline the size-generic function here; the advantage is that the size
+// is a compile-time constant, allowing for loop unrolling and other
+// constant-propagation goodness.
+#define CFL_SUBSAMPLE(arch, sub, bd, width, height)                       \
+  void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch(          \
+      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
+    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
+                                               output_q3, width, height); \
+  }
+
+// Declare size-specific wrappers for all valid CfL sizes.
+#define CFL_SUBSAMPLE_FUNCTIONS(arch, sub, bd)                            \
+  CFL_SUBSAMPLE(arch, sub, bd, 4, 4)                                      \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 8)                                      \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 16)                                    \
+  CFL_SUBSAMPLE(arch, sub, bd, 32, 32)                                    \
+  CFL_SUBSAMPLE(arch, sub, bd, 4, 8)                                      \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 4)                                      \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 16)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 8)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 32)                                    \
+  CFL_SUBSAMPLE(arch, sub, bd, 32, 16)                                    \
+  CFL_SUBSAMPLE(arch, sub, bd, 4, 16)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 4)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 32)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 32, 8)                                     \
+  cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_##arch( \
+      TX_SIZE tx_size) {                                                  \
+    CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd)                           \
+    return subfn_##sub[tx_size];                                          \
+  }
+
+// Declare an architecture-specific array of function pointers for
+// size-specific wrappers.
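+// For instance, CFL_SUBSAMPLE_FUNCTIONS(c, 420, lbd) above defines the
+// wrappers cfl_subsample_lbd_420_4x4_c() .. cfl_subsample_lbd_420_32x8_c()
+// plus cfl_get_luma_subsampling_420_lbd_c(), which indexes the array declared
+// below by tx_size.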
+#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd)                           \
+  static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {          \
+    cfl_subsample_##bd##_##sub##_4x4_##arch,   /* 4x4 */                      \
+    cfl_subsample_##bd##_##sub##_8x8_##arch,   /* 8x8 */                      \
+    cfl_subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */                    \
+    cfl_subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */                    \
+    NULL,                                      /* 64x64 (invalid CFL size) */ \
+    cfl_subsample_##bd##_##sub##_4x8_##arch,   /* 4x8 */                      \
+    cfl_subsample_##bd##_##sub##_8x4_##arch,   /* 8x4 */                      \
+    cfl_subsample_##bd##_##sub##_8x16_##arch,  /* 8x16 */                     \
+    cfl_subsample_##bd##_##sub##_16x8_##arch,  /* 16x8 */                     \
+    cfl_subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */                    \
+    cfl_subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */                    \
+    NULL,                                      /* 32x64 (invalid CFL size) */ \
+    NULL,                                      /* 64x32 (invalid CFL size) */ \
+    cfl_subsample_##bd##_##sub##_4x16_##arch,  /* 4x16 */                     \
+    cfl_subsample_##bd##_##sub##_16x4_##arch,  /* 16x4 */                     \
+    cfl_subsample_##bd##_##sub##_8x32_##arch,  /* 8x32 */                     \
+    cfl_subsample_##bd##_##sub##_32x8_##arch,  /* 32x8 */                     \
+    NULL,                                      /* 16x64 (invalid CFL size) */ \
+    NULL,                                      /* 64x16 (invalid CFL size) */ \
+  };
+
+// The RTCD script does not support passing in an array, so we wrap it in this
+// function.
+#if CONFIG_AV1_HIGHBITDEPTH
+#define CFL_GET_SUBSAMPLE_FUNCTION(arch)  \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd)
+#else
+#define CFL_GET_SUBSAMPLE_FUNCTION(arch)  \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd)
+#endif
+
+// Declare a size-specific wrapper for the size-generic function. The compiler
+// will inline the size-generic function here; the advantage is that the size
+// is a compile-time constant, allowing for loop unrolling and other
+// constant-propagation goodness.
+#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2)       \
+  void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \
+                                                        int16_t *dst) {      \
+    subtract_average_##arch(src, dst, width, height, round_offset,           \
+                            num_pel_log2);                                   \
+  }
+
+// Declare size-specific wrappers for all valid CfL sizes.
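+// In each CFL_SUB_AVG_X() instantiation below, round_offset is half the pixel
+// count, i.e. 1 << (num_pel_log2 - 1), so the average rounds to nearest:
+// e.g. for 4x4 (16 pixels), avg = (sum + 8) >> 4.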
+#define CFL_SUB_AVG_FN(arch)                                              \
+  CFL_SUB_AVG_X(arch, 4, 4, 8, 4)                                         \
+  CFL_SUB_AVG_X(arch, 4, 8, 16, 5)                                        \
+  CFL_SUB_AVG_X(arch, 4, 16, 32, 6)                                       \
+  CFL_SUB_AVG_X(arch, 8, 4, 16, 5)                                        \
+  CFL_SUB_AVG_X(arch, 8, 8, 32, 6)                                        \
+  CFL_SUB_AVG_X(arch, 8, 16, 64, 7)                                       \
+  CFL_SUB_AVG_X(arch, 8, 32, 128, 8)                                      \
+  CFL_SUB_AVG_X(arch, 16, 4, 32, 6)                                       \
+  CFL_SUB_AVG_X(arch, 16, 8, 64, 7)                                       \
+  CFL_SUB_AVG_X(arch, 16, 16, 128, 8)                                     \
+  CFL_SUB_AVG_X(arch, 16, 32, 256, 9)                                     \
+  CFL_SUB_AVG_X(arch, 32, 8, 128, 8)                                      \
+  CFL_SUB_AVG_X(arch, 32, 16, 256, 9)                                     \
+  CFL_SUB_AVG_X(arch, 32, 32, 512, 10)                                    \
+  cfl_subtract_average_fn cfl_get_subtract_average_fn_##arch(             \
+      TX_SIZE tx_size) {                                                  \
+    static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {        \
+      cfl_subtract_average_4x4_##arch,   /* 4x4 */                        \
+      cfl_subtract_average_8x8_##arch,   /* 8x8 */                        \
+      cfl_subtract_average_16x16_##arch, /* 16x16 */                      \
+      cfl_subtract_average_32x32_##arch, /* 32x32 */                      \
+      NULL,                              /* 64x64 (invalid CFL size) */   \
+      cfl_subtract_average_4x8_##arch,   /* 4x8 */                        \
+      cfl_subtract_average_8x4_##arch,   /* 8x4 */                        \
+      cfl_subtract_average_8x16_##arch,  /* 8x16 */                       \
+      cfl_subtract_average_16x8_##arch,  /* 16x8 */                       \
+      cfl_subtract_average_16x32_##arch, /* 16x32 */                      \
+      cfl_subtract_average_32x16_##arch, /* 32x16 */                      \
+      NULL,                              /* 32x64 (invalid CFL size) */   \
+      NULL,                              /* 64x32 (invalid CFL size) */   \
+      cfl_subtract_average_4x16_##arch,  /* 4x16 */                       \
+      cfl_subtract_average_16x4_##arch,  /* 16x4 */                       \
+      cfl_subtract_average_8x32_##arch,  /* 8x32 */                       \
+      cfl_subtract_average_32x8_##arch,  /* 32x8 */                       \
+      NULL,                              /* 16x64 (invalid CFL size) */   \
+      NULL,                              /* 64x16 (invalid CFL size) */   \
+    };                                                                    \
+    /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
+    /* index the function pointer array out of bounds. */                 \
+    return sub_avg[tx_size % TX_SIZES_ALL];                               \
+  }
+
+// For VSX SIMD optimization, the C versions of width == 4 subtract are
+// faster than the VSX. As such, the VSX code calls the C versions.
+void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
+
+#define CFL_PREDICT_lbd(arch, width, height)                              \
+  void cfl_predict_lbd_##width##x##height##_##arch(                       \
+      const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride,           \
+      int alpha_q3) {                                                     \
+    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
+                           height);                                       \
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#define CFL_PREDICT_hbd(arch, width, height)                                  \
+  void cfl_predict_hbd_##width##x##height##_##arch(                           \
+      const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride,              \
+      int alpha_q3, int bd) {                                                 \
+    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \
+                           height);                                           \
+  }
+#endif
+
+// This wrapper exists because clang-format does not like calling macros with
+// lowercase letters.
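+// e.g. CFL_PREDICT_X(c, 4, 4, lbd) pastes to CFL_PREDICT_lbd(c, 4, 4), which
+// in turn defines cfl_predict_lbd_4x4_c().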
+#define CFL_PREDICT_X(arch, width, height, bd) \
+  CFL_PREDICT_##bd(arch, width, height)
+
+#define CFL_PREDICT_FN(arch, bd)                                            \
+  CFL_PREDICT_X(arch, 4, 4, bd)                                             \
+  CFL_PREDICT_X(arch, 4, 8, bd)                                             \
+  CFL_PREDICT_X(arch, 4, 16, bd)                                            \
+  CFL_PREDICT_X(arch, 8, 4, bd)                                             \
+  CFL_PREDICT_X(arch, 8, 8, bd)                                             \
+  CFL_PREDICT_X(arch, 8, 16, bd)                                            \
+  CFL_PREDICT_X(arch, 8, 32, bd)                                            \
+  CFL_PREDICT_X(arch, 16, 4, bd)                                            \
+  CFL_PREDICT_X(arch, 16, 8, bd)                                            \
+  CFL_PREDICT_X(arch, 16, 16, bd)                                           \
+  CFL_PREDICT_X(arch, 16, 32, bd)                                           \
+  CFL_PREDICT_X(arch, 32, 8, bd)                                            \
+  CFL_PREDICT_X(arch, 32, 16, bd)                                           \
+  CFL_PREDICT_X(arch, 32, 32, bd)                                           \
+  cfl_predict_##bd##_fn cfl_get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \
+    static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = {               \
+      cfl_predict_##bd##_4x4_##arch,   /* 4x4 */                            \
+      cfl_predict_##bd##_8x8_##arch,   /* 8x8 */                            \
+      cfl_predict_##bd##_16x16_##arch, /* 16x16 */                          \
+      cfl_predict_##bd##_32x32_##arch, /* 32x32 */                          \
+      NULL,                            /* 64x64 (invalid CFL size) */       \
+      cfl_predict_##bd##_4x8_##arch,   /* 4x8 */                            \
+      cfl_predict_##bd##_8x4_##arch,   /* 8x4 */                            \
+      cfl_predict_##bd##_8x16_##arch,  /* 8x16 */                           \
+      cfl_predict_##bd##_16x8_##arch,  /* 16x8 */                           \
+      cfl_predict_##bd##_16x32_##arch, /* 16x32 */                          \
+      cfl_predict_##bd##_32x16_##arch, /* 32x16 */                          \
+      NULL,                            /* 32x64 (invalid CFL size) */       \
+      NULL,                            /* 64x32 (invalid CFL size) */       \
+      cfl_predict_##bd##_4x16_##arch,  /* 4x16 */                           \
+      cfl_predict_##bd##_16x4_##arch,  /* 16x4 */                           \
+      cfl_predict_##bd##_8x32_##arch,  /* 8x32 */                           \
+      cfl_predict_##bd##_32x8_##arch,  /* 32x8 */                           \
+      NULL,                            /* 16x64 (invalid CFL size) */       \
+      NULL,                            /* 64x16 (invalid CFL size) */       \
+    };                                                                      \
+    /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */   \
+    /* index the function pointer array out of bounds. */                   \
+    return pred[tx_size % TX_SIZES_ALL];                                    \
+  }
+
+#endif  // AOM_AV1_COMMON_CFL_H_
diff --git a/libs/libaom/src/av1/common/common.h b/libs/libaom/src/av1/common/common.h
new file mode 100644
index 000000000..bed6083db
--- /dev/null
+++ b/libs/libaom/src/av1/common/common.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_COMMON_H_
+#define AOM_AV1_COMMON_COMMON_H_
+
+/* Interface header for common constant data structures and lookup tables */
+
+#include <assert.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/bitops.h"
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define PI 3.141592653589793238462643383279502884
+
+// Only need this for fixed-size arrays, for structs just assign.
+#define av1_copy(dest, src)              \
+  {                                      \
+    assert(sizeof(dest) == sizeof(src)); \
+    memcpy(dest, src, sizeof(src));      \
+  }
+
+// Use this for variably-sized arrays.
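+// (The sizeof check in av1_copy catches accidentally passing a pointer where
+// an array was intended; av1_copy_array below can only check that the element
+// types match, since the length is supplied by the caller.)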
+#define av1_copy_array(dest, src, n) \ + { \ + assert(sizeof(*(dest)) == sizeof(*(src))); \ + memcpy(dest, src, n * sizeof(*(src))); \ + } + +#define av1_zero(dest) memset(&(dest), 0, sizeof(dest)) +#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest))) + +static INLINE int get_unsigned_bits(unsigned int num_values) { + return num_values > 0 ? get_msb(num_values) + 1 : 0; +} + +#define CHECK_MEM_ERROR(cm, lval, expr) \ + AOM_CHECK_MEM_ERROR(&cm->error, lval, expr) + +#define AOM_FRAME_MARKER 0x2 + +#define AV1_MIN_TILE_SIZE_BYTES 1 + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_COMMON_H_ diff --git a/libs/libaom/src/av1/common/common_data.h b/libs/libaom/src/av1/common/common_data.h new file mode 100644 index 000000000..402845caf --- /dev/null +++ b/libs/libaom/src/av1/common/common_data.h @@ -0,0 +1,446 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_COMMON_DATA_H_ +#define AOM_AV1_COMMON_COMMON_DATA_H_ + +#include "av1/common/enums.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Log 2 conversion lookup tables in units of mode info (4x4). +// The Mi_Width_Log2 table in the spec (Section 9.3. Conversion tables). +static const uint8_t mi_size_wide_log2[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4 +}; +// The Mi_Height_Log2 table in the spec (Section 9.3. Conversion tables). +static const uint8_t mi_size_high_log2[BLOCK_SIZES_ALL] = { + 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2 +}; + +// Width/height lookup tables in units of mode info (4x4). +// The Num_4x4_Blocks_Wide table in the spec (Section 9.3. Conversion tables). +static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = { + 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 1, 4, 2, 8, 4, 16 +}; + +// The Num_4x4_Blocks_High table in the spec (Section 9.3. Conversion tables). +static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = { + 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 4, 1, 8, 2, 16, 4 +}; + +// Width/height lookup tables in units of samples. +// The Block_Width table in the spec (Section 9.3. Conversion tables). +static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = { + 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, + 64, 64, 64, 128, 128, 4, 16, 8, 32, 16, 64 +}; + +// The Block_Height table in the spec (Section 9.3. Conversion tables). +static const uint8_t block_size_high[BLOCK_SIZES_ALL] = { + 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, + 32, 64, 128, 64, 128, 16, 4, 32, 8, 64, 16 +}; + +// Maps a block size to a context. +// The Size_Group table in the spec (Section 9.3. Conversion tables). 
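+// i.e. the smaller of the two log2 dimensions, capped at 3; e.g. BLOCK_8X16
+// gives AOMMIN(3, AOMMIN(1, 2)) == 1, matching the table entry below.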
+// AOMMIN(3, AOMMIN(mi_size_wide_log2(bsize), mi_size_high_log2(bsize))) +static const uint8_t size_group_lookup[BLOCK_SIZES_ALL] = { + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2 +}; + +static const uint8_t num_pels_log2_lookup[BLOCK_SIZES_ALL] = { + 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10 +}; + +// A compressed version of the Partition_Subsize table in the spec (9.3. +// Conversion tables), for square block sizes only. +/* clang-format off */ +static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = { + { // PARTITION_NONE + BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, + BLOCK_32X32, BLOCK_64X64, BLOCK_128X128 + }, { // PARTITION_HORZ + BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8, + BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 + }, { // PARTITION_VERT + BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16, + BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 + }, { // PARTITION_SPLIT + BLOCK_INVALID, BLOCK_4X4, BLOCK_8X8, + BLOCK_16X16, BLOCK_32X32, BLOCK_64X64 + }, { // PARTITION_HORZ_A + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 + }, { // PARTITION_HORZ_B + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 + }, { // PARTITION_VERT_A + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 + }, { // PARTITION_VERT_B + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 + }, { // PARTITION_HORZ_4 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4, + BLOCK_32X8, BLOCK_64X16, BLOCK_INVALID + }, { // PARTITION_VERT_4 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16, + BLOCK_8X32, BLOCK_16X64, BLOCK_INVALID + } +}; + +static const TX_SIZE max_txsize_lookup[BLOCK_SIZES_ALL] = { + // 4X4 + TX_4X4, + // 4X8, 8X4, 8X8 + TX_4X4, TX_4X4, TX_8X8, + // 8X16, 16X8, 16X16 + TX_8X8, TX_8X8, TX_16X16, + // 16X32, 32X16, 32X32 + TX_16X16, TX_16X16, TX_32X32, + // 32X64, 64X32, + TX_32X32, TX_32X32, + // 64X64 + TX_64X64, + // 64x128, 128x64, 128x128 + TX_64X64, TX_64X64, TX_64X64, + // 4x16, 16x4, 8x32 + TX_4X4, TX_4X4, TX_8X8, + // 32x8, 16x64 64x16 + TX_8X8, TX_16X16, TX_16X16 +}; + +static const TX_SIZE max_txsize_rect_lookup[BLOCK_SIZES_ALL] = { + // 4X4 + TX_4X4, + // 4X8, 8X4, 8X8 + TX_4X8, TX_8X4, TX_8X8, + // 8X16, 16X8, 16X16 + TX_8X16, TX_16X8, TX_16X16, + // 16X32, 32X16, 32X32 + TX_16X32, TX_32X16, TX_32X32, + // 32X64, 64X32, + TX_32X64, TX_64X32, + // 64X64 + TX_64X64, + // 64x128, 128x64, 128x128 + TX_64X64, TX_64X64, TX_64X64, + // 4x16, 16x4, + TX_4X16, TX_16X4, + // 8x32, 32x8 + TX_8X32, TX_32X8, + // 16x64, 64x16 + TX_16X64, TX_64X16 +}; + +static const TX_TYPE_1D vtx_tab[TX_TYPES] = { + DCT_1D, ADST_1D, DCT_1D, ADST_1D, + FLIPADST_1D, DCT_1D, FLIPADST_1D, ADST_1D, FLIPADST_1D, IDTX_1D, + DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, IDTX_1D, +}; + +static const TX_TYPE_1D htx_tab[TX_TYPES] = { + DCT_1D, DCT_1D, ADST_1D, ADST_1D, + DCT_1D, FLIPADST_1D, FLIPADST_1D, FLIPADST_1D, ADST_1D, IDTX_1D, + IDTX_1D, DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, +}; + +#define TXSIZE_CAT_INVALID (-1) + +/* clang-format on */ + +static const TX_SIZE sub_tx_size_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_4X4, // TX_8X8 + TX_8X8, // TX_16X16 + TX_16X16, // TX_32X32 + TX_32X32, // TX_64X64 + TX_4X4, // TX_4X8 + TX_4X4, // TX_8X4 + TX_8X8, // TX_8X16 + TX_8X8, // TX_16X8 + TX_16X16, // TX_16X32 + TX_16X16, // TX_32X16 + TX_32X32, // TX_32X64 + TX_32X32, // TX_64X32 + TX_4X8, // TX_4X16 + TX_8X4, // TX_16X4 + TX_8X16, // TX_8X32 + TX_16X8, // TX_32X8 
+ TX_16X32, // TX_16X64 + TX_32X16, // TX_64X16 +}; + +static const TX_SIZE txsize_horz_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 + TX_64X64, // TX_64X64 + TX_4X4, // TX_4X8 + TX_8X8, // TX_8X4 + TX_8X8, // TX_8X16 + TX_16X16, // TX_16X8 + TX_16X16, // TX_16X32 + TX_32X32, // TX_32X16 + TX_32X32, // TX_32X64 + TX_64X64, // TX_64X32 + TX_4X4, // TX_4X16 + TX_16X16, // TX_16X4 + TX_8X8, // TX_8X32 + TX_32X32, // TX_32X8 + TX_16X16, // TX_16X64 + TX_64X64, // TX_64X16 +}; + +static const TX_SIZE txsize_vert_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 + TX_64X64, // TX_64X64 + TX_8X8, // TX_4X8 + TX_4X4, // TX_8X4 + TX_16X16, // TX_8X16 + TX_8X8, // TX_16X8 + TX_32X32, // TX_16X32 + TX_16X16, // TX_32X16 + TX_64X64, // TX_32X64 + TX_32X32, // TX_64X32 + TX_16X16, // TX_4X16 + TX_4X4, // TX_16X4 + TX_32X32, // TX_8X32 + TX_8X8, // TX_32X8 + TX_64X64, // TX_16X64 + TX_16X16, // TX_64X16 +}; + +#define TX_SIZE_W_MIN 4 + +// Transform block width in pixels +static const int tx_size_wide[TX_SIZES_ALL] = { + 4, 8, 16, 32, 64, 4, 8, 8, 16, 16, 32, 32, 64, 4, 16, 8, 32, 16, 64, +}; + +#define TX_SIZE_H_MIN 4 + +// Transform block height in pixels +static const int tx_size_high[TX_SIZES_ALL] = { + 4, 8, 16, 32, 64, 8, 4, 16, 8, 32, 16, 64, 32, 16, 4, 32, 8, 64, 16, +}; + +// Transform block width in unit +static const int tx_size_wide_unit[TX_SIZES_ALL] = { + 1, 2, 4, 8, 16, 1, 2, 2, 4, 4, 8, 8, 16, 1, 4, 2, 8, 4, 16, +}; + +// Transform block height in unit +static const int tx_size_high_unit[TX_SIZES_ALL] = { + 1, 2, 4, 8, 16, 2, 1, 4, 2, 8, 4, 16, 8, 4, 1, 8, 2, 16, 4, +}; + +// Transform block width in log2 +static const int tx_size_wide_log2[TX_SIZES_ALL] = { + 2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6, +}; + +// Transform block height in log2 +static const int tx_size_high_log2[TX_SIZES_ALL] = { + 2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4, +}; + +static const int tx_size_2d[TX_SIZES_ALL + 1] = { + 16, 64, 256, 1024, 4096, 32, 32, 128, 128, 512, + 512, 2048, 2048, 64, 64, 256, 256, 1024, 1024, +}; + +static const BLOCK_SIZE txsize_to_bsize[TX_SIZES_ALL] = { + BLOCK_4X4, // TX_4X4 + BLOCK_8X8, // TX_8X8 + BLOCK_16X16, // TX_16X16 + BLOCK_32X32, // TX_32X32 + BLOCK_64X64, // TX_64X64 + BLOCK_4X8, // TX_4X8 + BLOCK_8X4, // TX_8X4 + BLOCK_8X16, // TX_8X16 + BLOCK_16X8, // TX_16X8 + BLOCK_16X32, // TX_16X32 + BLOCK_32X16, // TX_32X16 + BLOCK_32X64, // TX_32X64 + BLOCK_64X32, // TX_64X32 + BLOCK_4X16, // TX_4X16 + BLOCK_16X4, // TX_16X4 + BLOCK_8X32, // TX_8X32 + BLOCK_32X8, // TX_32X8 + BLOCK_16X64, // TX_16X64 + BLOCK_64X16, // TX_64X16 +}; + +static const TX_SIZE txsize_sqr_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 + TX_64X64, // TX_64X64 + TX_4X4, // TX_4X8 + TX_4X4, // TX_8X4 + TX_8X8, // TX_8X16 + TX_8X8, // TX_16X8 + TX_16X16, // TX_16X32 + TX_16X16, // TX_32X16 + TX_32X32, // TX_32X64 + TX_32X32, // TX_64X32 + TX_4X4, // TX_4X16 + TX_4X4, // TX_16X4 + TX_8X8, // TX_8X32 + TX_8X8, // TX_32X8 + TX_16X16, // TX_16X64 + TX_16X16, // TX_64X16 +}; + +static const TX_SIZE txsize_sqr_up_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 + TX_64X64, // TX_64X64 + TX_8X8, // TX_4X8 + TX_8X8, // TX_8X4 + TX_16X16, // TX_8X16 + TX_16X16, // TX_16X8 + TX_32X32, // TX_16X32 + TX_32X32, // TX_32X16 + TX_64X64, // TX_32X64 + 
TX_64X64, // TX_64X32 + TX_16X16, // TX_4X16 + TX_16X16, // TX_16X4 + TX_32X32, // TX_8X32 + TX_32X32, // TX_32X8 + TX_64X64, // TX_16X64 + TX_64X64, // TX_64X16 +}; + +static const int8_t txsize_log2_minus4[TX_SIZES_ALL] = { + 0, // TX_4X4 + 2, // TX_8X8 + 4, // TX_16X16 + 6, // TX_32X32 + 6, // TX_64X64 + 1, // TX_4X8 + 1, // TX_8X4 + 3, // TX_8X16 + 3, // TX_16X8 + 5, // TX_16X32 + 5, // TX_32X16 + 6, // TX_32X64 + 6, // TX_64X32 + 2, // TX_4X16 + 2, // TX_16X4 + 4, // TX_8X32 + 4, // TX_32X8 + 5, // TX_16X64 + 5, // TX_64X16 +}; + +/* clang-format off */ +static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { + TX_4X4, // ONLY_4X4 + TX_64X64, // TX_MODE_LARGEST + TX_64X64, // TX_MODE_SELECT +}; + +// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual +// size function). +static const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES_ALL][2][2] = { + // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 + // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 + { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } }, + { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_4X4 } }, + { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X4 } }, + { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } }, + { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, + { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } }, + { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } }, + { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } }, + { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } }, + { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } }, + { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } }, + { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } }, + { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } }, + { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } }, + { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } }, + { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } }, + { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, + { { BLOCK_16X4, BLOCK_INVALID }, { BLOCK_8X4, BLOCK_8X4 } }, + { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } }, + { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } }, + { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } }, + { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } } +}; +/* clang-format on */ + +// Generates 5 bit field in which each bit set to 1 represents +// a blocksize partition 11111 means we split 128x128, 64x64, 32x32, 16x16 +// and 8x8. 
10000 means we just split the 128x128 into 64x64.
+/* clang-format off */
+static const struct {
+  PARTITION_CONTEXT above;
+  PARTITION_CONTEXT left;
+} partition_context_lookup[BLOCK_SIZES_ALL] = {
+  { 31, 31 },  // 4X4   - {0b11111, 0b11111}
+  { 31, 30 },  // 4X8   - {0b11111, 0b11110}
+  { 30, 31 },  // 8X4   - {0b11110, 0b11111}
+  { 30, 30 },  // 8X8   - {0b11110, 0b11110}
+  { 30, 28 },  // 8X16  - {0b11110, 0b11100}
+  { 28, 30 },  // 16X8  - {0b11100, 0b11110}
+  { 28, 28 },  // 16X16 - {0b11100, 0b11100}
+  { 28, 24 },  // 16X32 - {0b11100, 0b11000}
+  { 24, 28 },  // 32X16 - {0b11000, 0b11100}
+  { 24, 24 },  // 32X32 - {0b11000, 0b11000}
+  { 24, 16 },  // 32X64 - {0b11000, 0b10000}
+  { 16, 24 },  // 64X32 - {0b10000, 0b11000}
+  { 16, 16 },  // 64X64 - {0b10000, 0b10000}
+  { 16, 0 },   // 64X128- {0b10000, 0b00000}
+  { 0, 16 },   // 128X64- {0b00000, 0b10000}
+  { 0, 0 },    // 128X128-{0b00000, 0b00000}
+  { 31, 28 },  // 4X16  - {0b11111, 0b11100}
+  { 28, 31 },  // 16X4  - {0b11100, 0b11111}
+  { 30, 24 },  // 8X32  - {0b11110, 0b11000}
+  { 24, 30 },  // 32X8  - {0b11000, 0b11110}
+  { 28, 16 },  // 16X64 - {0b11100, 0b10000}
+  { 16, 28 },  // 64X16 - {0b10000, 0b11100}
+};
+/* clang-format on */
+
+static const int intra_mode_context[INTRA_MODES] = {
+  0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0,
+};
+
+// Note: this is also used in unit tests. So whenever one changes the table,
+// the unit tests need to be changed accordingly.
+static const int quant_dist_weight[4][2] = {
+  { 2, 3 }, { 2, 5 }, { 2, 7 }, { 1, MAX_FRAME_DISTANCE }
+};
+static const int quant_dist_lookup_table[2][4][2] = {
+  { { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 } },
+  { { 7, 9 }, { 5, 11 }, { 4, 12 }, { 3, 13 } },
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_COMMON_DATA_H_
diff --git a/libs/libaom/src/av1/common/convolve.c b/libs/libaom/src/av1/common/convolve.c
new file mode 100644
index 000000000..e177e3cad
--- /dev/null
+++ b/libs/libaom/src/av1/common/convolve.c
@@ -0,0 +1,1274 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                             int dst_stride, int w, int h,
+                             const int16_t *x_filters, int x0_qn,
+                             int x_step_qn) {
+  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+  for (int y = 0; y < h; ++y) {
+    int x_qn = x0_qn;
+    for (int x = 0; x < w; ++x) {
+      const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
+      const int x_filter_idx =
+          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+      assert(x_filter_idx <= RS_SUBPEL_MASK);
+      const int16_t *const x_filter =
+          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
+      int sum = 0;
+      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
+        sum += src_x[k] * x_filter[k];
+      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      x_qn += x_step_qn;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    const int16_t *x_filters, int x0_qn,
+                                    int x_step_qn, int bd) {
+  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+  for (int y = 0; y < h; ++y) {
+    int x_qn = x0_qn;
+    for (int x = 0; x < w; ++x) {
+      const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
+      const int x_filter_idx =
+          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+      assert(x_filter_idx <= RS_SUBPEL_MASK);
+      const int16_t *const x_filter =
+          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
+      int sum = 0;
+      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
+        sum += src_x[k] * x_filter[k];
+      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+      x_qn += x_step_qn;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
+                               int dst_stride, int w, int h, int dir,
+                               double norm) {
+  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  DECLARE_ALIGNED(256, static const int16_t, sobel_a[3]) = { 1, 0, -1 };
+  DECLARE_ALIGNED(256, static const int16_t, sobel_b[3]) = { 1, 2, 1 };
+  const int taps = 3;
+  int im_h = h + taps - 1;
+  int im_stride = w;
+  const int fo_vert = 1;
+  const int fo_horiz = 1;
+
+  // horizontal filter
+  const uint8_t *src_horiz = src - fo_vert * src_stride;
+  const int16_t *x_filter = dir ? sobel_a : sobel_b;
+  for (int y = 0; y < im_h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int16_t sum = 0;
+      for (int k = 0; k < taps; ++k) {
+        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+      }
+      im_block[y * im_stride + x] = sum;
+    }
+  }
+
+  // vertical filter
+  int16_t *src_vert = im_block + fo_vert * im_stride;
+  const int16_t *y_filter = dir ?
sobel_b : sobel_a; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int16_t sum = 0; + for (int k = 0; k < taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + dst[y * dst_stride + x] = sum * norm; + } + } +} + +void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bd = 8; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + + // horizontal filter + const uint8_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + assert(0 <= sum && sum < (1 << (offset_bits + 2))); + int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - + ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); + } + } +} + +void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int fo_vert = filter_params_y->taps / 2 - 1; + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + dst[y * dst_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS)); + } + } +} + +void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int 
subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + (void)filter_params_y; + (void)subpel_y_qn; + (void)conv_params; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = ROUND_POWER_OF_TWO(res, conv_params->round_0); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); + } + } +} + +void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + (void)conv_params; + + for (int y = 0; y < h; ++y) { + memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); + } +} + +void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bd = 8; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + // horizontal filter + const uint8_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + assert(0 <= sum && sum < (1 << (offset_bits + 2))); + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + 
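+        // Plain compound average: (dst16 + res) >> 1. This is the
+        // equal-weight case of the dist-wtd branch above, whose fwd/bck
+        // offset pairs sum to 16 (see quant_dist_lookup_table), i.e.
+        // 1 << DIST_PRECISION_BITS.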
tmp += res; + tmp = tmp >> 1; + } + tmp -= (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + dst[y * dst_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + (void)filter_params_x; + (void)subpel_x_qn; + + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + res *= (1 << bits); + res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst[y * dst_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_1; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + (void)filter_params_y; + (void)subpel_y_qn; + + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0); + res += round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= 
round_offset; + dst[y * dst_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + CONV_BUF_TYPE res = src[y * src_stride + x] << bits; + res += round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params) { + int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); + int im_stride = w; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bd = 8; + + // horizontal filter + const uint8_t *src_horiz = src - fo_vert * src_stride; + for (int y = 0; y < im_h; ++y) { + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; + const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(x_filter_idx < SUBPEL_SHIFTS); + const int16_t *x_filter = + av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx); + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_x[k - fo_horiz]; + } + assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + src_horiz += src_stride; + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int x = 0; x < w; ++x) { + int y_qn = subpel_y_qn; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; + const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) 
>> SCALE_EXTRA_BITS;
+      assert(y_filter_idx < SUBPEL_SHIFTS);
+      const int16_t *y_filter =
+          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
+      int32_t sum = 1 << offset_bits;
+      for (int k = 0; k < filter_params_y->taps; ++k) {
+        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
+      }
+      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+      if (conv_params->is_compound) {
+        if (conv_params->do_average) {
+          int32_t tmp = dst16[y * dst16_stride + x];
+          if (conv_params->use_dist_wtd_comp_avg) {
+            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+            tmp = tmp >> DIST_PRECISION_BITS;
+          } else {
+            tmp += res;
+            tmp = tmp >> 1;
+          }
+          /* Subtract round offset and convolve round */
+          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
+                       (1 << (offset_bits - conv_params->round_1 - 1)));
+          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+        } else {
+          dst16[y * dst16_stride + x] = res;
+        }
+      } else {
+        /* Subtract round offset and convolve round */
+        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+                             (1 << (offset_bits - conv_params->round_1 - 1)));
+        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+      }
+    }
+    src_vert++;
+  }
+}
+
+static void convolve_2d_scale_wrapper(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
+    ConvolveParams *conv_params) {
+  if (conv_params->is_compound) {
+    assert(conv_params->dst != NULL);
+  }
+  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
+                        y_step_qn, conv_params);
+}
+
+void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            const InterpFilterParams *interp_filters[2],
+                            const int subpel_x_qn, int x_step_q4,
+                            const int subpel_y_qn, int y_step_q4, int scaled,
+                            ConvolveParams *conv_params,
+                            const struct scale_factors *sf) {
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)dst;
+  (void)dst_stride;
+
+  const InterpFilterParams *filter_params_x = interp_filters[0];
+  const InterpFilterParams *filter_params_y = interp_filters[1];
+
+  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
+  // Do we have SIMD support for the 4-tap case?
+  // 2-tap filter indicates that it is for IntraBC.
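+  // (With both subpel components zero, the 2-tap case falls through to the
+  // sf->convolve[0][0][is_compound] copy dispatch below.)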
+ if (filter_params_x->taps == 2 || filter_params_y->taps == 2) { + assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); + assert(!scaled); + if (subpel_x_qn && subpel_y_qn) { + av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } else if (subpel_x_qn) { + av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } else if (subpel_y_qn) { + av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } + } + + if (scaled) { + convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + x_step_q4, subpel_y_qn, y_step_q4, conv_params); + } else { + sf->convolve[subpel_x_qn != 0][subpel_y_qn != 0][conv_params->is_compound]( + src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_convolve_2d_copy_sr_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + (void)conv_params; + (void)bd; + + for (int y = 0; y < h; ++y) { + memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); + } +} + +void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + (void)filter_params_y; + (void)subpel_y_qn; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = ROUND_POWER_OF_TWO(res, conv_params->round_0); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); + } + } +} + +void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + const int fo_vert = filter_params_y->taps / 2 - 1; + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + 
for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd); + } + } +} + +void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); + + // horizontal filter + const uint16_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); + im_block[y * im_stride + x] = + ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + assert(0 <= sum && sum < (1 << (offset_bits + 2))); + int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - + ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); + } + } +} + +void av1_highbd_dist_wtd_convolve_2d_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + int x, y, k; + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + assert(round_bits >= 0); + + // horizontal filter + const uint16_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (y = 0; y < im_h; ++y) { + for (x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + assert(0 <= sum && 
sum < (1 << (bd + FILTER_BITS + 1))); + (void)bd; + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + assert(0 <= sum && sum < (1 << (offset_bits + 2))); + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_x_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + assert(round_bits >= 0); + (void)filter_params_y; + (void)subpel_y_qn; + assert(bits >= 0); + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0); + res += round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_y_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + const int 
offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + assert(round_bits >= 0); + (void)filter_params_x; + (void)subpel_x_qn; + assert(bits >= 0); + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + res *= (1 << bits); + res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_2d_copy_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + assert(bits >= 0); + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + CONV_BUF_TYPE res = src[y * src_stride + x] << bits; + res += round_offset; + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params, int bd) { + int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + int im_stride = w; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); + // horizontal filter + const uint16_t *src_horiz = src - fo_vert * src_stride; + 
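  // Scaled path: for each output column, x_qn advances by x_step_qn in
+  // units of 1/(1 << SCALE_SUBPEL_BITS) pel; the high bits of x_qn select
+  // the source sample and the low bits select the subpel kernel phase. +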
for (int y = 0; y < im_h; ++y) { + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; + const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(x_filter_idx < SUBPEL_SHIFTS); + const int16_t *x_filter = + av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx); + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_x[k - fo_horiz]; + } + assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + src_horiz += src_stride; + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int x = 0; x < w; ++x) { + int y_qn = subpel_y_qn; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; + const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(y_filter_idx < SUBPEL_SHIFTS); + const int16_t *y_filter = + av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx); + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; + } + assert(0 <= sum && sum < (1 << (offset_bits + 2))); + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->is_compound) { + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + /* Subtract round offset and convolve round */ + tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } else { + /* Subtract round offset and convolve round */ + int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } + } + src_vert++; + } +} + +void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, + uint8_t *dst8, int dst_stride, int w, int h, + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, + int scaled, ConvolveParams *conv_params, + const struct scale_factors *sf, int bd) { + (void)x_step_q4; + (void)y_step_q4; + (void)dst_stride; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + + const int need_filter_params_x = (subpel_x_qn != 0) | scaled; + const int need_filter_params_y = (subpel_y_qn != 0) | scaled; + const InterpFilterParams *filter_params_x = + need_filter_params_x ? interp_filters[0] : NULL; + const InterpFilterParams *filter_params_y = + need_filter_params_y ? 
        interp_filters[1] : NULL;
+
+  if (scaled) {
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+    if (conv_params->is_compound) {
+      assert(conv_params->dst != NULL);
+    }
+    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
+                                 filter_params_x, filter_params_y, subpel_x_qn,
+                                 x_step_q4, subpel_y_qn, y_step_q4, conv_params,
+                                 bd);
+  } else {
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+    sf->highbd_convolve[subpel_x_qn != 0][subpel_y_qn !=
+                                          0][conv_params->is_compound](
+        src, src_stride, dst, dst_stride, w, h, filter_params_x,
+        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+// Note: Fixed-size intermediate buffers place limits on the parameters of
+// some functions. 2d filtering proceeds in 2 steps:
+//   (1) Interpolate horizontally into an intermediate buffer, temp.
+//   (2) Interpolate temp vertically to derive the sub-pixel result.
+// Deriving the maximum number of rows in the temp buffer (263):
+// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+// --Largest block size is 128x128 pixels.
+// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
+//   original frame (in 1/16th pixel units).
+// --Must round up because the block may be located at a sub-pixel position.
+// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+// --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
+#define WIENER_MAX_EXT_SIZE 263
+
+static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+  return sum;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE int highbd_horz_scalar_product(const uint16_t *a,
+                                             const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+  return sum;
+}
+#endif
+
+static INLINE int highbd_vert_scalar_product(const uint16_t *a,
+                                             ptrdiff_t a_stride,
+                                             const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
+  return sum;
+}
+
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+  // NOTE: This assumes that the filter table is 256-byte aligned.
+  // TODO(agrange) Modify to make independent of table alignment.
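+  // The 256-byte alignment holds because the table stores SUBPEL_SHIFTS (16)
+  // phases of SUBPEL_TAPS (8) int16_t coefficients = 256 bytes, so clearing
+  // the low 8 address bits recovers the base of the table holding 'filter'.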
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); +} + +static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, + int round0_bits) { + const int bd = 8; + src -= SUBPEL_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (int x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + + (1 << (bd + FILTER_BITS - 1)); + const int sum = horz_scalar_product(src_x, x_filter) + rounding; + dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, + WIENER_CLAMP_LIMIT(round0_bits, bd) - 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, + int round1_bits) { + const int bd = 8; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (int x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (int y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + const int rounding = + ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - + (1 << (bd + round1_bits - 1)); + const int sum = + highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits)); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const ConvolveParams *conv_params) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1; + memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4, + x_step_q4, w, intermediate_height, + conv_params->round_0); + convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, conv_params->round_1); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_convolve_add_src_horiz_hip( + const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int round0_bits, int bd) { + const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd); + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + src -= SUBPEL_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_q4 = 
x0_q4; + for (int x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + + (1 << (bd + FILTER_BITS - 1)); + const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding; + dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, + extraprec_clamp_limit - 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_add_src_vert_hip( + const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int round1_bits, int bd) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (int x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (int y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + const int rounding = + ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - + (1 << (bd + round1_bits - 1)); + const int sum = + highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +void av1_highbd_wiener_convolve_add_src_c( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, + const ConvolveParams *conv_params, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); + + highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, filters_x, + x0_q4, x_step_q4, w, intermediate_height, + conv_params->round_0, bd); + highbd_convolve_add_src_vert_hip( + temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride, + filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd); +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/av1/common/convolve.h b/libs/libaom/src/av1/common/convolve.h new file mode 100644 index 000000000..04df86c42 --- /dev/null +++ b/libs/libaom/src/av1/common/convolve.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_CONVOLVE_H_ +#define AOM_AV1_COMMON_CONVOLVE_H_ +#include "av1/common/filter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint16_t CONV_BUF_TYPE; +typedef struct ConvolveParams { + int do_average; + CONV_BUF_TYPE *dst; + int dst_stride; + int round_0; + int round_1; + int plane; + int is_compound; + int compound_index; // 0: the first single in compound mode, 1: the second. + int use_dist_wtd_comp_avg; + int fwd_offset; + int bck_offset; +} ConvolveParams; + +#define ROUND0_BITS 3 +#define COMPOUND_ROUND1_BITS 7 +#define WIENER_ROUND0_BITS 3 + +#define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - r0)) + +typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params); + +typedef void (*aom_highbd_convolve_fn_t)( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd); + +struct AV1Common; +struct scale_factors; + +void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, int scaled, + ConvolveParams *conv_params, + const struct scale_factors *sf); + +static INLINE ConvolveParams get_conv_params_no_round(int cmp_index, int plane, + CONV_BUF_TYPE *dst, + int dst_stride, + int is_compound, int bd) { + ConvolveParams conv_params; + conv_params.compound_index = cmp_index; + assert(IMPLIES(cmp_index, is_compound)); + + conv_params.is_compound = is_compound; + conv_params.round_0 = ROUND0_BITS; + conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS + : 2 * FILTER_BITS - conv_params.round_0; + const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2; + assert(IMPLIES(bd < 12, intbufrange <= 16)); + if (intbufrange > 16) { + conv_params.round_0 += intbufrange - 16; + if (!is_compound) conv_params.round_1 -= intbufrange - 16; + } + // TODO(yunqing): The following dst should only be valid while + // is_compound = 1; + conv_params.dst = dst; + conv_params.dst_stride = dst_stride; + conv_params.plane = plane; + + // By default, set do average to 1 if this is the second single prediction + // in a compound mode. 
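+  // (cmp_index == 1 is the second prediction of a compound pair; it is
+  // averaged with the first one already accumulated in the CONV_BUF_TYPE
+  // dst buffer.)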
+  conv_params.do_average = cmp_index;
+  return conv_params;
+}
+
+static INLINE ConvolveParams get_conv_params(int do_average, int plane,
+                                             int bd) {
+  return get_conv_params_no_round(do_average, plane, NULL, 0, 0, bd);
+}
+
+static INLINE ConvolveParams get_conv_params_wiener(int bd) {
+  ConvolveParams conv_params;
+  (void)bd;
+  conv_params.do_average = 0;
+  conv_params.is_compound = 0;
+  conv_params.round_0 = WIENER_ROUND0_BITS;
+  conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0;
+  const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
+  assert(IMPLIES(bd < 12, intbufrange <= 16));
+  if (intbufrange > 16) {
+    conv_params.round_0 += intbufrange - 16;
+    conv_params.round_1 -= intbufrange - 16;
+  }
+  conv_params.dst = NULL;
+  conv_params.dst_stride = 0;
+  conv_params.plane = 0;
+  return conv_params;
+}
+
+void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
+                                   uint8_t *dst, int dst_stride, int w, int h,
+                                   const InterpFilterParams *interp_filters[2],
+                                   const int subpel_x_qn, int x_step_q4,
+                                   const int subpel_y_qn, int y_step_q4,
+                                   int scaled, ConvolveParams *conv_params,
+                                   const struct scale_factors *sf, int bd);
+
+// TODO(sarahparker) This will need to be integerized and optimized
+void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
+                               int dst_stride, int w, int h, int dir,
+                               double norm);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_CONVOLVE_H_
diff --git a/libs/libaom/src/av1/common/debugmodes.c b/libs/libaom/src/av1/common/debugmodes.c
new file mode 100644
index 000000000..ff02ddde0
--- /dev/null
+++ b/libs/libaom/src/av1/common/debugmodes.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) {
+  fprintf(f, "%s", str);
+  fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_frame.frame_number,
+          cm->show_frame, cm->quant_params.base_qindex);
+}
+/* This function dereferences a pointer to the mbmi structure
+ * and uses the passed-in member offset to print out the value of an integer
+ * for each mbmi member value in the mi structure.
+ */
+static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor,
+                          size_t member_offset) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  MB_MODE_INFO **mi = mi_params->mi_grid_base;
+  int rows = mi_params->mi_rows;
+  int cols = mi_params->mi_cols;
+  char prefix = descriptor[0];
+
+  log_frame_info(cm, descriptor, file);
+  for (int mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(file, "%c ", prefix);
+    for (int mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset)));
+      mi++;
+    }
+    fprintf(file, "\n");
+    mi += mi_params->mi_stride - cols;
+  }
+  fprintf(file, "\n");
+}
+
+void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
+  CommonModeInfoParams *mi_params = &cm->mi_params;
+  FILE *mvs = fopen(file, "a");
+  MB_MODE_INFO **mi = mi_params->mi_grid_base;
+  const int rows = mi_params->mi_rows;
+  const int cols = mi_params->mi_cols;
+
+  print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
+  print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
+  print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
+  print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
+  print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
+
+  // output skip information.
+  log_frame_info(cm, "Skips:", mvs);
+  for (int mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(mvs, "S ");
+    for (int mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi[0]->skip);
+      mi++;
+    }
+    fprintf(mvs, "\n");
+    mi += mi_params->mi_stride - cols;
+  }
+  fprintf(mvs, "\n");
+
+  // output motion vectors.
+  log_frame_info(cm, "Vectors ", mvs);
+  mi = mi_params->mi_grid_base;
+  for (int mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(mvs, "V ");
+    for (int mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row, mi[0]->mv[0].as_mv.col);
+      mi++;
+    }
+    fprintf(mvs, "\n");
+    mi += mi_params->mi_stride - cols;
+  }
+  fprintf(mvs, "\n");
+
+  fclose(mvs);
+}
+
+void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
+                                         const char *filename) {
+  FILE *hdrFile = fopen(filename, "w");
+  fwrite(data, size, sizeof(uint8_t), hdrFile);
+
+  // Reset order hints (7 bits + a previous bit) to 0, so that all camera
+  // frame headers are identical in large-scale coding.
+  uint8_t zero = 0;
+  fseek(hdrFile, 1, SEEK_SET);
+  // Reset second byte.
+  fwrite(&zero, 1, sizeof(uint8_t), hdrFile);
+  fclose(hdrFile);
+}
+
+void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename) {
+  FILE *fcFile = fopen(filename, "w");
+  const uint16_t *fcp = (uint16_t *)fc;
+  const unsigned int n_contexts = sizeof(FRAME_CONTEXT) / sizeof(uint16_t);
+  unsigned int i;
+
+  for (i = 0; i < n_contexts; ++i) fprintf(fcFile, "%d ", *fcp++);
+  fclose(fcFile);
+}
diff --git a/libs/libaom/src/av1/common/entropy.c b/libs/libaom/src/av1/common/entropy.c
new file mode 100644
index 000000000..1f7a0efe0
--- /dev/null
+++ b/libs/libaom/src/av1/common/entropy.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_mem/aom_mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/scan.h" +#include "av1/common/token_cdfs.h" +#include "av1/common/txb_common.h" + +static int get_q_ctx(int q) { + if (q <= 20) return 0; + if (q <= 60) return 1; + if (q <= 120) return 2; + return 3; +} + +void av1_default_coef_probs(AV1_COMMON *cm) { + const int index = get_q_ctx(cm->quant_params.base_qindex); +#if CONFIG_ENTROPY_STATS + cm->coef_cdf_category = index; +#endif + + av1_copy(cm->fc->txb_skip_cdf, av1_default_txb_skip_cdfs[index]); + av1_copy(cm->fc->eob_extra_cdf, av1_default_eob_extra_cdfs[index]); + av1_copy(cm->fc->dc_sign_cdf, av1_default_dc_sign_cdfs[index]); + av1_copy(cm->fc->coeff_br_cdf, av1_default_coeff_lps_multi_cdfs[index]); + av1_copy(cm->fc->coeff_base_cdf, av1_default_coeff_base_multi_cdfs[index]); + av1_copy(cm->fc->coeff_base_eob_cdf, + av1_default_coeff_base_eob_multi_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf16, av1_default_eob_multi16_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf32, av1_default_eob_multi32_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf64, av1_default_eob_multi64_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf128, av1_default_eob_multi128_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf256, av1_default_eob_multi256_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf512, av1_default_eob_multi512_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[index]); +} + +static AOM_INLINE void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, + int num_cdfs, int cdf_stride, + int nsymbs) { + for (int i = 0; i < num_cdfs; i++) { + cdf_ptr[i * cdf_stride + nsymbs] = 0; + } +} + +#define RESET_CDF_COUNTER(cname, nsymbs) \ + RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs)) + +#define RESET_CDF_COUNTER_STRIDE(cname, nsymbs, cdf_stride) \ + do { \ + aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname; \ + int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob); \ + int num_cdfs = array_size / cdf_stride; \ + reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \ + } while (0) + +static AOM_INLINE void reset_nmv_counter(nmv_context *nmv) { + RESET_CDF_COUNTER(nmv->joints_cdf, 4); + for (int i = 0; i < 2; i++) { + RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES); + RESET_CDF_COUNTER(nmv->comps[i].class0_fp_cdf, MV_FP_SIZE); + RESET_CDF_COUNTER(nmv->comps[i].fp_cdf, MV_FP_SIZE); + RESET_CDF_COUNTER(nmv->comps[i].sign_cdf, 2); + RESET_CDF_COUNTER(nmv->comps[i].class0_hp_cdf, 2); + RESET_CDF_COUNTER(nmv->comps[i].hp_cdf, 2); + RESET_CDF_COUNTER(nmv->comps[i].class0_cdf, CLASS0_SIZE); + RESET_CDF_COUNTER(nmv->comps[i].bits_cdf, 2); + } +} + +void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) { + RESET_CDF_COUNTER(fc->txb_skip_cdf, 2); + RESET_CDF_COUNTER(fc->eob_extra_cdf, 2); + RESET_CDF_COUNTER(fc->dc_sign_cdf, 2); + RESET_CDF_COUNTER(fc->eob_flag_cdf16, 5); + RESET_CDF_COUNTER(fc->eob_flag_cdf32, 6); + RESET_CDF_COUNTER(fc->eob_flag_cdf64, 7); + RESET_CDF_COUNTER(fc->eob_flag_cdf128, 8); + RESET_CDF_COUNTER(fc->eob_flag_cdf256, 9); + RESET_CDF_COUNTER(fc->eob_flag_cdf512, 10); + RESET_CDF_COUNTER(fc->eob_flag_cdf1024, 11); + RESET_CDF_COUNTER(fc->coeff_base_eob_cdf, 3); + 
RESET_CDF_COUNTER(fc->coeff_base_cdf, 4); + RESET_CDF_COUNTER(fc->coeff_br_cdf, BR_CDF_SIZE); + RESET_CDF_COUNTER(fc->newmv_cdf, 2); + RESET_CDF_COUNTER(fc->zeromv_cdf, 2); + RESET_CDF_COUNTER(fc->refmv_cdf, 2); + RESET_CDF_COUNTER(fc->drl_cdf, 2); + RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES); + RESET_CDF_COUNTER(fc->compound_type_cdf, MASKED_COMPOUND_TYPES); + RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16); + RESET_CDF_COUNTER(fc->interintra_cdf, 2); + RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2); + RESET_CDF_COUNTER(fc->interintra_mode_cdf, INTERINTRA_MODES); + RESET_CDF_COUNTER(fc->motion_mode_cdf, MOTION_MODES); + RESET_CDF_COUNTER(fc->obmc_cdf, 2); + RESET_CDF_COUNTER(fc->palette_y_size_cdf, PALETTE_SIZES); + RESET_CDF_COUNTER(fc->palette_uv_size_cdf, PALETTE_SIZES); + for (int j = 0; j < PALETTE_SIZES; j++) { + int nsymbs = j + PALETTE_MIN_SIZE; + RESET_CDF_COUNTER_STRIDE(fc->palette_y_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + RESET_CDF_COUNTER_STRIDE(fc->palette_uv_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + } + RESET_CDF_COUNTER(fc->palette_y_mode_cdf, 2); + RESET_CDF_COUNTER(fc->palette_uv_mode_cdf, 2); + RESET_CDF_COUNTER(fc->comp_inter_cdf, 2); + RESET_CDF_COUNTER(fc->single_ref_cdf, 2); + RESET_CDF_COUNTER(fc->comp_ref_type_cdf, 2); + RESET_CDF_COUNTER(fc->uni_comp_ref_cdf, 2); + RESET_CDF_COUNTER(fc->comp_ref_cdf, 2); + RESET_CDF_COUNTER(fc->comp_bwdref_cdf, 2); + RESET_CDF_COUNTER(fc->txfm_partition_cdf, 2); + RESET_CDF_COUNTER(fc->compound_index_cdf, 2); + RESET_CDF_COUNTER(fc->comp_group_idx_cdf, 2); + RESET_CDF_COUNTER(fc->skip_mode_cdfs, 2); + RESET_CDF_COUNTER(fc->skip_cdfs, 2); + RESET_CDF_COUNTER(fc->intra_inter_cdf, 2); + reset_nmv_counter(&fc->nmvc); + reset_nmv_counter(&fc->ndvc); + RESET_CDF_COUNTER(fc->intrabc_cdf, 2); + RESET_CDF_COUNTER(fc->seg.tree_cdf, MAX_SEGMENTS); + RESET_CDF_COUNTER(fc->seg.pred_cdf, 2); + RESET_CDF_COUNTER(fc->seg.spatial_pred_seg_cdf, MAX_SEGMENTS); + RESET_CDF_COUNTER(fc->filter_intra_cdfs, 2); + RESET_CDF_COUNTER(fc->filter_intra_mode_cdf, FILTER_INTRA_MODES); + RESET_CDF_COUNTER(fc->switchable_restore_cdf, RESTORE_SWITCHABLE_TYPES); + RESET_CDF_COUNTER(fc->wiener_restore_cdf, 2); + RESET_CDF_COUNTER(fc->sgrproj_restore_cdf, 2); + RESET_CDF_COUNTER(fc->y_mode_cdf, INTRA_MODES); + RESET_CDF_COUNTER_STRIDE(fc->uv_mode_cdf[0], UV_INTRA_MODES - 1, + CDF_SIZE(UV_INTRA_MODES)); + RESET_CDF_COUNTER(fc->uv_mode_cdf[1], UV_INTRA_MODES); + for (int i = 0; i < PARTITION_CONTEXTS; i++) { + if (i < 4) { + RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 4, CDF_SIZE(10)); + } else if (i < 16) { + RESET_CDF_COUNTER(fc->partition_cdf[i], 10); + } else { + RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 8, CDF_SIZE(10)); + } + } + RESET_CDF_COUNTER(fc->switchable_interp_cdf, SWITCHABLE_FILTERS); + RESET_CDF_COUNTER(fc->kf_y_cdf, INTRA_MODES); + RESET_CDF_COUNTER(fc->angle_delta_cdf, 2 * MAX_ANGLE_DELTA + 1); + RESET_CDF_COUNTER_STRIDE(fc->tx_size_cdf[0], MAX_TX_DEPTH, + CDF_SIZE(MAX_TX_DEPTH + 1)); + RESET_CDF_COUNTER(fc->tx_size_cdf[1], MAX_TX_DEPTH + 1); + RESET_CDF_COUNTER(fc->tx_size_cdf[2], MAX_TX_DEPTH + 1); + RESET_CDF_COUNTER(fc->tx_size_cdf[3], MAX_TX_DEPTH + 1); + RESET_CDF_COUNTER(fc->delta_q_cdf, DELTA_Q_PROBS + 1); + RESET_CDF_COUNTER(fc->delta_lf_cdf, DELTA_LF_PROBS + 1); + for (int i = 0; i < FRAME_LF_COUNT; i++) { + RESET_CDF_COUNTER(fc->delta_lf_multi_cdf[i], DELTA_LF_PROBS + 1); + } + RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[1], 7, CDF_SIZE(TX_TYPES)); + 
RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[2], 5, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[1], 16, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[2], 12, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[3], 2, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER(fc->cfl_sign_cdf, CFL_JOINT_SIGNS); + RESET_CDF_COUNTER(fc->cfl_alpha_cdf, CFL_ALPHABET_SIZE); +} diff --git a/libs/libaom/src/av1/common/entropy.h b/libs/libaom/src/av1/common/entropy.h new file mode 100644 index 000000000..ee78f56a3 --- /dev/null +++ b/libs/libaom/src/av1/common/entropy.h @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ENTROPY_H_ +#define AOM_AV1_COMMON_ENTROPY_H_ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/prob.h" + +#include "av1/common/common.h" +#include "av1/common/common_data.h" +#include "av1/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define TOKEN_CDF_Q_CTXS 4 + +#define TXB_SKIP_CONTEXTS 13 + +#define EOB_COEF_CONTEXTS 9 + +#define SIG_COEF_CONTEXTS_2D 26 +#define SIG_COEF_CONTEXTS_1D 16 +#define SIG_COEF_CONTEXTS_EOB 4 +#define SIG_COEF_CONTEXTS (SIG_COEF_CONTEXTS_2D + SIG_COEF_CONTEXTS_1D) + +#define COEFF_BASE_CONTEXTS (SIG_COEF_CONTEXTS) +#define DC_SIGN_CONTEXTS 3 + +#define BR_TMP_OFFSET 12 +#define BR_REF_CAT 4 +#define LEVEL_CONTEXTS 21 + +#define NUM_BASE_LEVELS 2 + +#define BR_CDF_SIZE (4) +#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1)) + +#define COEFF_CONTEXT_BITS 3 +#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1) +#define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1) + +#define BASE_CONTEXT_POSITION_NUM 12 + +enum { + TX_CLASS_2D = 0, + TX_CLASS_HORIZ = 1, + TX_CLASS_VERT = 2, + TX_CLASSES = 3, +} UENUM1BYTE(TX_CLASS); + +#define DCT_MAX_VALUE 16384 +#define DCT_MAX_VALUE_HIGH10 65536 +#define DCT_MAX_VALUE_HIGH12 262144 + +/* Coefficients are predicted via a 3-dimensional probability table indexed on + * REF_TYPES, COEF_BANDS and COEF_CONTEXTS. 
*/ +#define REF_TYPES 2 // intra=0, inter=1 + +struct AV1Common; +struct frame_contexts; +void av1_reset_cdf_symbol_counters(struct frame_contexts *fc); +void av1_default_coef_probs(struct AV1Common *cm); + +struct frame_contexts; + +typedef char ENTROPY_CONTEXT; + +static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a, + ENTROPY_CONTEXT b) { + return (a != 0) + (b != 0); +} + +static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l) { + ENTROPY_CONTEXT above_ec = 0, left_ec = 0; + + switch (tx_size) { + case TX_4X4: + above_ec = a[0] != 0; + left_ec = l[0] != 0; + break; + case TX_4X8: + above_ec = a[0] != 0; + left_ec = !!*(const uint16_t *)l; + break; + case TX_8X4: + above_ec = !!*(const uint16_t *)a; + left_ec = l[0] != 0; + break; + case TX_8X16: + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint32_t *)l; + break; + case TX_16X8: + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint16_t *)l; + break; + case TX_16X32: + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint64_t *)l; + break; + case TX_32X16: + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint32_t *)l; + break; + case TX_8X8: + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint16_t *)l; + break; + case TX_16X16: + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint32_t *)l; + break; + case TX_32X32: + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint64_t *)l; + break; + case TX_64X64: + above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); + left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); + break; + case TX_32X64: + above_ec = !!*(const uint64_t *)a; + left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); + break; + case TX_64X32: + above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); + left_ec = !!*(const uint64_t *)l; + break; + case TX_4X16: + above_ec = a[0] != 0; + left_ec = !!*(const uint32_t *)l; + break; + case TX_16X4: + above_ec = !!*(const uint32_t *)a; + left_ec = l[0] != 0; + break; + case TX_8X32: + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint64_t *)l; + break; + case TX_32X8: + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint16_t *)l; + break; + case TX_16X64: + above_ec = !!*(const uint32_t *)a; + left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); + break; + case TX_64X16: + above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); + left_ec = !!*(const uint32_t *)l; + break; + default: assert(0 && "Invalid transform size."); break; + } + return combine_entropy_contexts(above_ec, left_ec); +} + +static INLINE TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) { + return (TX_SIZE)((txsize_sqr_map[txsize] + txsize_sqr_up_map[txsize] + 1) >> + 1); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ENTROPY_H_ diff --git a/libs/libaom/src/av1/common/entropymode.c b/libs/libaom/src/av1/common/entropymode.c new file mode 100644 index 000000000..5f061be35 --- /dev/null +++ b/libs/libaom/src/av1/common/entropymode.c @@ -0,0 +1,1103 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_mem/aom_mem.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/reconinter.h" +#include "av1/common/scan.h" +#include "av1/common/seg_common.h" +#include "av1/common/txb_common.h" + +static const aom_cdf_prob + default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][CDF_SIZE( + INTRA_MODES)] = { + { { AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244, + 24189, 28165, 29093, 30466) }, + { AOM_CDF13(12016, 18066, 19516, 20303, 20719, 21444, 21888, 23032, + 24434, 28658, 30172, 31409) }, + { AOM_CDF13(10052, 10771, 22296, 22788, 23055, 23239, 24133, 25620, + 26160, 29336, 29929, 31567) }, + { AOM_CDF13(14091, 15406, 16442, 18808, 19136, 19546, 19998, 22096, + 24746, 29585, 30958, 32462) }, + { AOM_CDF13(12122, 13265, 15603, 16501, 18609, 20033, 22391, 25583, + 26437, 30261, 31073, 32475) } }, + { { AOM_CDF13(10023, 19585, 20848, 21440, 21832, 22760, 23089, 24023, + 25381, 29014, 30482, 31436) }, + { AOM_CDF13(5983, 24099, 24560, 24886, 25066, 25795, 25913, 26423, + 27610, 29905, 31276, 31794) }, + { AOM_CDF13(7444, 12781, 20177, 20728, 21077, 21607, 22170, 23405, + 24469, 27915, 29090, 30492) }, + { AOM_CDF13(8537, 14689, 15432, 17087, 17408, 18172, 18408, 19825, + 24649, 29153, 31096, 32210) }, + { AOM_CDF13(7543, 14231, 15496, 16195, 17905, 20717, 21984, 24516, + 26001, 29675, 30981, 31994) } }, + { { AOM_CDF13(12613, 13591, 21383, 22004, 22312, 22577, 23401, 25055, + 25729, 29538, 30305, 32077) }, + { AOM_CDF13(9687, 13470, 18506, 19230, 19604, 20147, 20695, 22062, + 23219, 27743, 29211, 30907) }, + { AOM_CDF13(6183, 6505, 26024, 26252, 26366, 26434, 27082, 28354, 28555, + 30467, 30794, 32086) }, + { AOM_CDF13(10718, 11734, 14954, 17224, 17565, 17924, 18561, 21523, + 23878, 28975, 30287, 32252) }, + { AOM_CDF13(9194, 9858, 16501, 17263, 18424, 19171, 21563, 25961, 26561, + 30072, 30737, 32463) } }, + { { AOM_CDF13(12602, 14399, 15488, 18381, 18778, 19315, 19724, 21419, + 25060, 29696, 30917, 32409) }, + { AOM_CDF13(8203, 13821, 14524, 17105, 17439, 18131, 18404, 19468, + 25225, 29485, 31158, 32342) }, + { AOM_CDF13(8451, 9731, 15004, 17643, 18012, 18425, 19070, 21538, 24605, + 29118, 30078, 32018) }, + { AOM_CDF13(7714, 9048, 9516, 16667, 16817, 16994, 17153, 18767, 26743, + 30389, 31536, 32528) }, + { AOM_CDF13(8843, 10280, 11496, 15317, 16652, 17943, 19108, 22718, + 25769, 29953, 30983, 32485) } }, + { { AOM_CDF13(12578, 13671, 15979, 16834, 19075, 20913, 22989, 25449, + 26219, 30214, 31150, 32477) }, + { AOM_CDF13(9563, 13626, 15080, 15892, 17756, 20863, 22207, 24236, + 25380, 29653, 31143, 32277) }, + { AOM_CDF13(8356, 8901, 17616, 18256, 19350, 20106, 22598, 25947, 26466, + 29900, 30523, 32261) }, + { AOM_CDF13(10835, 11815, 13124, 16042, 17018, 18039, 18947, 22753, + 24615, 29489, 30883, 32482) }, + { AOM_CDF13(7618, 8288, 9859, 10509, 15386, 18657, 22903, 28776, 29180, + 31355, 31802, 32593) } } + }; + +static const aom_cdf_prob default_angle_delta_cdf[DIRECTIONAL_MODES][CDF_SIZE( + 2 * MAX_ANGLE_DELTA + 1)] = { + { AOM_CDF7(2180, 5032, 7567, 22776, 26989, 30217) }, + { AOM_CDF7(2301, 5608, 8801, 23487, 26974, 30330) }, + { AOM_CDF7(3780, 11018, 13699, 19354, 23083, 31286) }, + { AOM_CDF7(4581, 11226, 15147, 17138, 21834, 28397) }, + { AOM_CDF7(1737, 10927, 14509, 19588, 22745, 28823) }, + { AOM_CDF7(2664, 10176, 12485, 17650, 21600, 30495) }, + { 
AOM_CDF7(2240, 11096, 15453, 20341, 22561, 28917) }, + { AOM_CDF7(3605, 10428, 12459, 17676, 21244, 30655) } +}; + +static const aom_cdf_prob default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( + INTRA_MODES)] = { { AOM_CDF13(22801, 23489, 24293, 24756, 25601, 26123, + 26606, 27418, 27945, 29228, 29685, 30349) }, + { AOM_CDF13(18673, 19845, 22631, 23318, 23950, 24649, + 25527, 27364, 28152, 29701, 29984, 30852) }, + { AOM_CDF13(19770, 20979, 23396, 23939, 24241, 24654, + 25136, 27073, 27830, 29360, 29730, 30659) }, + { AOM_CDF13(20155, 21301, 22838, 23178, 23261, 23533, + 23703, 24804, 25352, 26575, 27016, 28049) } }; + +static const aom_cdf_prob + default_uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES][CDF_SIZE( + UV_INTRA_MODES)] = { + { { AOM_CDF13(22631, 24152, 25378, 25661, 25986, 26520, 27055, 27923, + 28244, 30059, 30941, 31961) }, + { AOM_CDF13(9513, 26881, 26973, 27046, 27118, 27664, 27739, 27824, + 28359, 29505, 29800, 31796) }, + { AOM_CDF13(9845, 9915, 28663, 28704, 28757, 28780, 29198, 29822, 29854, + 30764, 31777, 32029) }, + { AOM_CDF13(13639, 13897, 14171, 25331, 25606, 25727, 25953, 27148, + 28577, 30612, 31355, 32493) }, + { AOM_CDF13(9764, 9835, 9930, 9954, 25386, 27053, 27958, 28148, 28243, + 31101, 31744, 32363) }, + { AOM_CDF13(11825, 13589, 13677, 13720, 15048, 29213, 29301, 29458, + 29711, 31161, 31441, 32550) }, + { AOM_CDF13(14175, 14399, 16608, 16821, 17718, 17775, 28551, 30200, + 30245, 31837, 32342, 32667) }, + { AOM_CDF13(12885, 13038, 14978, 15590, 15673, 15748, 16176, 29128, + 29267, 30643, 31961, 32461) }, + { AOM_CDF13(12026, 13661, 13874, 15305, 15490, 15726, 15995, 16273, + 28443, 30388, 30767, 32416) }, + { AOM_CDF13(19052, 19840, 20579, 20916, 21150, 21467, 21885, 22719, + 23174, 28861, 30379, 32175) }, + { AOM_CDF13(18627, 19649, 20974, 21219, 21492, 21816, 22199, 23119, + 23527, 27053, 31397, 32148) }, + { AOM_CDF13(17026, 19004, 19997, 20339, 20586, 21103, 21349, 21907, + 22482, 25896, 26541, 31819) }, + { AOM_CDF13(12124, 13759, 14959, 14992, 15007, 15051, 15078, 15166, + 15255, 15753, 16039, 16606) } }, + { { AOM_CDF14(10407, 11208, 12900, 13181, 13823, 14175, 14899, 15656, + 15986, 20086, 20995, 22455, 24212) }, + { AOM_CDF14(4532, 19780, 20057, 20215, 20428, 21071, 21199, 21451, + 22099, 24228, 24693, 27032, 29472) }, + { AOM_CDF14(5273, 5379, 20177, 20270, 20385, 20439, 20949, 21695, 21774, + 23138, 24256, 24703, 26679) }, + { AOM_CDF14(6740, 7167, 7662, 14152, 14536, 14785, 15034, 16741, 18371, + 21520, 22206, 23389, 24182) }, + { AOM_CDF14(4987, 5368, 5928, 6068, 19114, 20315, 21857, 22253, 22411, + 24911, 25380, 26027, 26376) }, + { AOM_CDF14(5370, 6889, 7247, 7393, 9498, 21114, 21402, 21753, 21981, + 24780, 25386, 26517, 27176) }, + { AOM_CDF14(4816, 4961, 7204, 7326, 8765, 8930, 20169, 20682, 20803, + 23188, 23763, 24455, 24940) }, + { AOM_CDF14(6608, 6740, 8529, 9049, 9257, 9356, 9735, 18827, 19059, + 22336, 23204, 23964, 24793) }, + { AOM_CDF14(5998, 7419, 7781, 8933, 9255, 9549, 9753, 10417, 18898, + 22494, 23139, 24764, 25989) }, + { AOM_CDF14(10660, 11298, 12550, 12957, 13322, 13624, 14040, 15004, + 15534, 20714, 21789, 23443, 24861) }, + { AOM_CDF14(10522, 11530, 12552, 12963, 13378, 13779, 14245, 15235, + 15902, 20102, 22696, 23774, 25838) }, + { AOM_CDF14(10099, 10691, 12639, 13049, 13386, 13665, 14125, 15163, + 15636, 19676, 20474, 23519, 25208) }, + { AOM_CDF14(3144, 5087, 7382, 7504, 7593, 7690, 7801, 8064, 8232, 9248, + 9875, 10521, 29048) } } + }; + +static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE( + 
EXT_PARTITION_TYPES)] = { + { AOM_CDF4(19132, 25510, 30392) }, + { AOM_CDF4(13928, 19855, 28540) }, + { AOM_CDF4(12522, 23679, 28629) }, + { AOM_CDF4(9896, 18783, 25853) }, + { AOM_CDF10(15597, 20929, 24571, 26706, 27664, 28821, 29601, 30571, 31902) }, + { AOM_CDF10(7925, 11043, 16785, 22470, 23971, 25043, 26651, 28701, 29834) }, + { AOM_CDF10(5414, 13269, 15111, 20488, 22360, 24500, 25537, 26336, 32117) }, + { AOM_CDF10(2662, 6362, 8614, 20860, 23053, 24778, 26436, 27829, 31171) }, + { AOM_CDF10(18462, 20920, 23124, 27647, 28227, 29049, 29519, 30178, 31544) }, + { AOM_CDF10(7689, 9060, 12056, 24992, 25660, 26182, 26951, 28041, 29052) }, + { AOM_CDF10(6015, 9009, 10062, 24544, 25409, 26545, 27071, 27526, 32047) }, + { AOM_CDF10(1394, 2208, 2796, 28614, 29061, 29466, 29840, 30185, 31899) }, + { AOM_CDF10(20137, 21547, 23078, 29566, 29837, 30261, 30524, 30892, 31724) }, + { AOM_CDF10(6732, 7490, 9497, 27944, 28250, 28515, 28969, 29630, 30104) }, + { AOM_CDF10(5945, 7663, 8348, 28683, 29117, 29749, 30064, 30298, 32238) }, + { AOM_CDF10(870, 1212, 1487, 31198, 31394, 31574, 31743, 31881, 32332) }, + { AOM_CDF8(27899, 28219, 28529, 32484, 32539, 32619, 32639) }, + { AOM_CDF8(6607, 6990, 8268, 32060, 32219, 32338, 32371) }, + { AOM_CDF8(5429, 6676, 7122, 32027, 32227, 32531, 32582) }, + { AOM_CDF8(711, 966, 1172, 32448, 32538, 32617, 32664) }, +}; + +static const aom_cdf_prob default_intra_ext_tx_cdf + [EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)] = { + { + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + }, + { + { + { AOM_CDF7(1535, 8035, 9461, 12751, 23467, 27825) }, + { AOM_CDF7(564, 3335, 9709, 10870, 18143, 28094) }, + { AOM_CDF7(672, 3247, 3676, 11982, 19415, 23127) }, + { AOM_CDF7(5279, 13885, 15487, 18044, 23527, 30252) }, + { AOM_CDF7(4423, 6074, 7985, 10416, 25693, 29298) }, + { AOM_CDF7(1486, 4241, 9460, 10662, 16456, 27694) }, + { AOM_CDF7(439, 2838, 3522, 6737, 18058, 23754) }, + { AOM_CDF7(1190, 4233, 4855, 11670, 20281, 24377) }, + { AOM_CDF7(1045, 4312, 8647, 10159, 18644, 29335) }, + { AOM_CDF7(202, 3734, 4747, 7298, 17127, 24016) }, + { AOM_CDF7(447, 4312, 6819, 8884, 16010, 23858) }, + { AOM_CDF7(277, 4369, 5255, 8905, 16465, 22271) }, + { AOM_CDF7(3409, 5436, 10599, 15599, 19687, 24040) }, + }, + { + { AOM_CDF7(1870, 13742, 14530, 16498, 23770, 27698) }, + { AOM_CDF7(326, 8796, 14632, 15079, 19272, 27486) }, + { AOM_CDF7(484, 7576, 7712, 14443, 19159, 22591) }, + { AOM_CDF7(1126, 15340, 15895, 17023, 20896, 30279) }, + { AOM_CDF7(655, 4854, 5249, 5913, 22099, 27138) }, + { AOM_CDF7(1299, 6458, 8885, 9290, 14851, 25497) }, + { AOM_CDF7(311, 5295, 5552, 6885, 16107, 22672) }, + { AOM_CDF7(883, 8059, 8270, 11258, 17289, 21549) }, + { AOM_CDF7(741, 7580, 9318, 10345, 16688, 29046) }, + { AOM_CDF7(110, 7406, 7915, 9195, 16041, 23329) }, + { AOM_CDF7(363, 7974, 9357, 10673, 15629, 24474) }, + { AOM_CDF7(153, 7647, 8112, 9936, 15307, 19996) }, + { AOM_CDF7(3511, 6332, 11165, 15335, 19323, 23594) }, + }, + { + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) 
}, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + }, + { + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + }, + }, + { + { + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + }, + { + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + }, + { + { AOM_CDF5(1127, 12814, 22772, 27483) }, + { AOM_CDF5(145, 6761, 11980, 26667) }, + { AOM_CDF5(362, 5887, 11678, 16725) }, + { AOM_CDF5(385, 15213, 18587, 30693) }, + { AOM_CDF5(25, 2914, 23134, 27903) }, + { AOM_CDF5(60, 4470, 11749, 23991) }, + { AOM_CDF5(37, 3332, 14511, 21448) }, + { AOM_CDF5(157, 6320, 13036, 17439) }, + { AOM_CDF5(119, 6719, 12906, 29396) }, + { AOM_CDF5(47, 5537, 12576, 21499) }, + { AOM_CDF5(269, 6076, 11258, 23115) }, + { AOM_CDF5(83, 5615, 12001, 17228) }, + { AOM_CDF5(1968, 5556, 12023, 18547) }, + }, + { + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 
19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + }, + }, + }; + +static const aom_cdf_prob + default_inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES][CDF_SIZE( + TX_TYPES)] = { + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + { + { AOM_CDF16(4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, 21504, + 22848, 23934, 25474, 27727, 28915, 30631) }, + { AOM_CDF16(1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, 17674, + 20408, 22517, 25010, 27116, 28856, 30749) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + }, + { + { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, + 24576, 27307, 30037) }, + { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, + 24576, 27307, 30037) }, + { AOM_CDF12(770, 2421, 5225, 12907, 15819, 18927, 21561, 24089, 26595, + 28526, 30529) }, + { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, + 24576, 27307, 30037) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(4167) }, + { AOM_CDF2(1998) }, + { AOM_CDF2(748) }, + }, + }; + +static const aom_cdf_prob default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)] = { + AOM_CDF8(1418, 2123, 13340, 18405, 26972, 28343, 32294) +}; + +static const aom_cdf_prob + default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)] = { + { AOM_CDF16(7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, 32700, + 32704, 32708, 32712, 32716, 32720, 32724) }, + { AOM_CDF16(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573, 32620, + 32647, 32668, 32672, 32676, 32680, 32684) }, + { AOM_CDF16(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649, 32673, + 32677, 32681, 32685, 32689, 32693, 32697) }, + { AOM_CDF16(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704, 32708, + 32712, 32716, 32720, 32724, 32728, 32732) }, + { AOM_CDF16(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321, 32394, + 32464, 32516, 32560, 32576, 32593, 32622) }, + { AOM_CDF16(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, 32144, + 32413, 32520, 32594, 32622, 32656, 32660) } + }; + +static const aom_cdf_prob + default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE( + SWITCHABLE_FILTERS)] = { + { AOM_CDF3(31935, 32720) }, { AOM_CDF3(5568, 32719) }, + { AOM_CDF3(422, 2938) }, { AOM_CDF3(28244, 32608) }, + { AOM_CDF3(31206, 31953) }, { AOM_CDF3(4862, 32121) }, + { AOM_CDF3(770, 1152) }, { AOM_CDF3(20889, 25637) }, + { AOM_CDF3(31910, 32724) }, { AOM_CDF3(4120, 32712) }, + { AOM_CDF3(305, 2247) }, { AOM_CDF3(27403, 32636) }, + { AOM_CDF3(31022, 32009) }, { AOM_CDF3(2963, 32093) }, + { AOM_CDF3(601, 943) }, { AOM_CDF3(14969, 21398) } + }; + +static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) }, + { AOM_CDF2(8386) }, { AOM_CDF2(12222) }, { AOM_CDF2(4676) } }; + +static const aom_cdf_prob default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(2175) }, { AOM_CDF2(1054) } }; + +static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) }, + { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } }; + +static const aom_cdf_prob default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(13104) }, { AOM_CDF2(24560) }, { AOM_CDF2(18945) } +}; + +static const aom_cdf_prob + 
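/* The all-zero rows in default_intra_ext_tx_cdf and default_inter_ext_tx_cdf
   above are placeholders that are never read: when a transform set contains
   a single transform type (presumably set 0, EXT_TX_SET_DCTONLY), no symbol
   is coded and no CDF is consulted, so zero-initializing simply keeps the
   array shape uniform across sets. */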
default_inter_compound_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE( + INTER_COMPOUND_MODES)] = { + { AOM_CDF8(7760, 13823, 15808, 17641, 19156, 20666, 26891) }, + { AOM_CDF8(10730, 19452, 21145, 22749, 24039, 25131, 28724) }, + { AOM_CDF8(10664, 20221, 21588, 22906, 24295, 25387, 28436) }, + { AOM_CDF8(13298, 16984, 20471, 24182, 25067, 25736, 26422) }, + { AOM_CDF8(18904, 23325, 25242, 27432, 27898, 28258, 30758) }, + { AOM_CDF8(10725, 17454, 20124, 22820, 24195, 25168, 26046) }, + { AOM_CDF8(17125, 24273, 25814, 27492, 28214, 28704, 30592) }, + { AOM_CDF8(13046, 23214, 24505, 25942, 27435, 28442, 29330) } + }; + +static const aom_cdf_prob default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( + 2)] = { { AOM_CDF2(16384) }, + { AOM_CDF2(26887) }, + { AOM_CDF2(27597) }, + { AOM_CDF2(30237) } }; + +static const aom_cdf_prob + default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( + INTERINTRA_MODES)] = { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(1875, 11082, 27332) }, + { AOM_CDF4(2473, 9996, 26388) }, + { AOM_CDF4(4238, 11537, 25926) } }; + +static const aom_cdf_prob + default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(20036) }, { AOM_CDF2(24957) }, { AOM_CDF2(26704) }, + { AOM_CDF2(27530) }, { AOM_CDF2(29564) }, { AOM_CDF2(29444) }, + { AOM_CDF2(26872) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } + }; + +static const aom_cdf_prob default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + MASKED_COMPOUND_TYPES)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) }, + { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) }, + { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } +}; + +static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + 16)] = { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, + 20359, 22362, 24127, 25702, 27752, 29450, 31171) }, + { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323, + 17367, 18452, 19422, 22839, 26127, 29629) }, + { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939, + 21332, 24520, 27470, 29456, 30529, 31656) }, + { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, + 19163, 20961, 22884, 24471, 26719, 28714, 30877) }, + { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369, + 16730, 18114, 19313, 22521, 26012, 29550) }, + { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, + 17270, 20533, 23434, 25972, 27944, 29570, 31416) }, + { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, + 20638, 22038, 23963, 25311, 26988, 28766, 31012) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 
16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703, + 24284, 24985, 25684, 27259, 28883, 30911) }, + { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935, + 25057, 27251, 29173, 30089, 30960, 31933) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) } }; + +static const aom_cdf_prob default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + MOTION_MODES)] = { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, { AOM_CDF3(7651, 24760) }, + { AOM_CDF3(4738, 24765) }, { AOM_CDF3(5391, 25528) }, + { AOM_CDF3(19419, 26810) }, { AOM_CDF3(5123, 23606) }, + { AOM_CDF3(11606, 24308) }, { AOM_CDF3(26260, 29116) }, + { AOM_CDF3(20360, 28062) }, { AOM_CDF3(21679, 26830) }, + { AOM_CDF3(29516, 30701) }, { AOM_CDF3(28898, 30397) }, + { AOM_CDF3(30878, 31335) }, { AOM_CDF3(32507, 32558) }, + { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(28799, 31390) }, { AOM_CDF3(26431, 30774) }, + { AOM_CDF3(28973, 31594) }, { AOM_CDF3(29742, 31203) } }; + +static const aom_cdf_prob default_obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(10437) }, { AOM_CDF2(9371) }, { AOM_CDF2(9301) }, + { AOM_CDF2(17432) }, { AOM_CDF2(14423) }, { AOM_CDF2(15142) }, + { AOM_CDF2(25817) }, { AOM_CDF2(22823) }, { AOM_CDF2(22083) }, + { AOM_CDF2(30128) }, { AOM_CDF2(31014) }, { AOM_CDF2(31560) }, + { AOM_CDF2(32638) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(23664) }, { AOM_CDF2(20901) }, { AOM_CDF2(24008) }, + { AOM_CDF2(26879) } +}; + +static const aom_cdf_prob default_intra_inter_cdf[INTRA_INTER_CONTEXTS] + [CDF_SIZE(2)] = { + { AOM_CDF2(806) }, + { AOM_CDF2(16662) }, + { AOM_CDF2(20186) }, + { AOM_CDF2(26538) } + }; + +static const aom_cdf_prob default_comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(26828) }, + { AOM_CDF2(24035) }, + { AOM_CDF2(12031) }, + { AOM_CDF2(10640) }, + { AOM_CDF2(2901) } }; + +static const aom_cdf_prob default_comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS] + [CDF_SIZE(2)] = { + { AOM_CDF2(1198) }, + { AOM_CDF2(2070) }, + { AOM_CDF2(9166) }, + { AOM_CDF2(7499) }, + { AOM_CDF2(22475) } + }; + +static const aom_cdf_prob + default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - + 1][CDF_SIZE(2)] = { + { { AOM_CDF2(5284) }, { AOM_CDF2(3865) }, { AOM_CDF2(3128) } }, + { { AOM_CDF2(23152) }, { AOM_CDF2(14173) }, { AOM_CDF2(15270) } }, + { { 
AOM_CDF2(31774) }, { AOM_CDF2(25120) }, { AOM_CDF2(26710) } } + }; + +static const aom_cdf_prob default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1] + [CDF_SIZE(2)] = { + { { AOM_CDF2(4897) }, + { AOM_CDF2(1555) }, + { AOM_CDF2(4236) }, + { AOM_CDF2(8650) }, + { AOM_CDF2(904) }, + { AOM_CDF2(1444) } }, + { { AOM_CDF2(16973) }, + { AOM_CDF2(16751) }, + { AOM_CDF2(19647) }, + { AOM_CDF2(24773) }, + { AOM_CDF2(11014) }, + { AOM_CDF2(15087) } }, + { { AOM_CDF2(29744) }, + { AOM_CDF2(30279) }, + { AOM_CDF2(31194) }, + { AOM_CDF2(31895) }, + { AOM_CDF2(26875) }, + { AOM_CDF2(30304) } } + }; + +static const aom_cdf_prob + default_comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)] = { + { { AOM_CDF2(4946) }, { AOM_CDF2(9468) }, { AOM_CDF2(1503) } }, + { { AOM_CDF2(19891) }, { AOM_CDF2(22441) }, { AOM_CDF2(15160) } }, + { { AOM_CDF2(30731) }, { AOM_CDF2(31059) }, { AOM_CDF2(27544) } } + }; + +static const aom_cdf_prob + default_comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)] = { + { { AOM_CDF2(2235) }, { AOM_CDF2(1423) } }, + { { AOM_CDF2(17182) }, { AOM_CDF2(15175) } }, + { { AOM_CDF2(30606) }, { AOM_CDF2(30489) } } + }; + +static const aom_cdf_prob + default_palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = { + { AOM_CDF7(7952, 13000, 18149, 21478, 25527, 29241) }, + { AOM_CDF7(7139, 11421, 16195, 19544, 23666, 28073) }, + { AOM_CDF7(7788, 12741, 17325, 20500, 24315, 28530) }, + { AOM_CDF7(8271, 14064, 18246, 21564, 25071, 28533) }, + { AOM_CDF7(12725, 19180, 21863, 24839, 27535, 30120) }, + { AOM_CDF7(9711, 14888, 16923, 21052, 25661, 27875) }, + { AOM_CDF7(14940, 20797, 21678, 24186, 27033, 28999) } + }; + +static const aom_cdf_prob + default_palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = { + { AOM_CDF7(8713, 19979, 27128, 29609, 31331, 32272) }, + { AOM_CDF7(5839, 15573, 23581, 26947, 29848, 31700) }, + { AOM_CDF7(4426, 11260, 17999, 21483, 25863, 29430) }, + { AOM_CDF7(3228, 9464, 14993, 18089, 22523, 27420) }, + { AOM_CDF7(3768, 8886, 13091, 17852, 22495, 27207) }, + { AOM_CDF7(2464, 8451, 12861, 21632, 25525, 28555) }, + { AOM_CDF7(1269, 5435, 10433, 18963, 21700, 25865) } + }; + +static const aom_cdf_prob default_palette_y_mode_cdf + [PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][CDF_SIZE(2)] = { + { { AOM_CDF2(31676) }, { AOM_CDF2(3419) }, { AOM_CDF2(1261) } }, + { { AOM_CDF2(31912) }, { AOM_CDF2(2859) }, { AOM_CDF2(980) } }, + { { AOM_CDF2(31823) }, { AOM_CDF2(3400) }, { AOM_CDF2(781) } }, + { { AOM_CDF2(32030) }, { AOM_CDF2(3561) }, { AOM_CDF2(904) } }, + { { AOM_CDF2(32309) }, { AOM_CDF2(7337) }, { AOM_CDF2(1462) } }, + { { AOM_CDF2(32265) }, { AOM_CDF2(4015) }, { AOM_CDF2(1521) } }, + { { AOM_CDF2(32450) }, { AOM_CDF2(7946) }, { AOM_CDF2(129) } } + }; + +static const aom_cdf_prob + default_palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(32461) }, { AOM_CDF2(21488) } + }; + +static const aom_cdf_prob default_palette_y_color_index_cdf + [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = { + { + { AOM_CDF2(28710) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(10553) }, + { AOM_CDF2(27036) }, + { AOM_CDF2(31603) }, + }, + { + { AOM_CDF3(27877, 30490) }, + { AOM_CDF3(11532, 25697) }, + { AOM_CDF3(6544, 30234) }, + { AOM_CDF3(23018, 28072) }, + { AOM_CDF3(31915, 32385) }, + }, + { + { AOM_CDF4(25572, 28046, 30045) }, + { AOM_CDF4(9478, 21590, 27256) }, + { AOM_CDF4(7248, 26837, 29824) }, + { AOM_CDF4(19167, 24486, 28349) }, + { AOM_CDF4(31400, 31825, 32250) }, + }, + { + { AOM_CDF5(24779, 26955, 
28576, 30282) }, + { AOM_CDF5(8669, 20364, 24073, 28093) }, + { AOM_CDF5(4255, 27565, 29377, 31067) }, + { AOM_CDF5(19864, 23674, 26716, 29530) }, + { AOM_CDF5(31646, 31893, 32147, 32426) }, + }, + { + { AOM_CDF6(23132, 25407, 26970, 28435, 30073) }, + { AOM_CDF6(7443, 17242, 20717, 24762, 27982) }, + { AOM_CDF6(6300, 24862, 26944, 28784, 30671) }, + { AOM_CDF6(18916, 22895, 25267, 27435, 29652) }, + { AOM_CDF6(31270, 31550, 31808, 32059, 32353) }, + }, + { + { AOM_CDF7(23105, 25199, 26464, 27684, 28931, 30318) }, + { AOM_CDF7(6950, 15447, 18952, 22681, 25567, 28563) }, + { AOM_CDF7(7560, 23474, 25490, 27203, 28921, 30708) }, + { AOM_CDF7(18544, 22373, 24457, 26195, 28119, 30045) }, + { AOM_CDF7(31198, 31451, 31670, 31882, 32123, 32391) }, + }, + { + { AOM_CDF8(21689, 23883, 25163, 26352, 27506, 28827, 30195) }, + { AOM_CDF8(6892, 15385, 17840, 21606, 24287, 26753, 29204) }, + { AOM_CDF8(5651, 23182, 25042, 26518, 27982, 29392, 30900) }, + { AOM_CDF8(19349, 22578, 24418, 25994, 27524, 29031, 30448) }, + { AOM_CDF8(31028, 31270, 31504, 31705, 31927, 32153, 32392) }, + }, + }; + +static const aom_cdf_prob default_palette_uv_color_index_cdf + [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = { + { + { AOM_CDF2(29089) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(8713) }, + { AOM_CDF2(29257) }, + { AOM_CDF2(31610) }, + }, + { + { AOM_CDF3(25257, 29145) }, + { AOM_CDF3(12287, 27293) }, + { AOM_CDF3(7033, 27960) }, + { AOM_CDF3(20145, 25405) }, + { AOM_CDF3(30608, 31639) }, + }, + { + { AOM_CDF4(24210, 27175, 29903) }, + { AOM_CDF4(9888, 22386, 27214) }, + { AOM_CDF4(5901, 26053, 29293) }, + { AOM_CDF4(18318, 22152, 28333) }, + { AOM_CDF4(30459, 31136, 31926) }, + }, + { + { AOM_CDF5(22980, 25479, 27781, 29986) }, + { AOM_CDF5(8413, 21408, 24859, 28874) }, + { AOM_CDF5(2257, 29449, 30594, 31598) }, + { AOM_CDF5(19189, 21202, 25915, 28620) }, + { AOM_CDF5(31844, 32044, 32281, 32518) }, + }, + { + { AOM_CDF6(22217, 24567, 26637, 28683, 30548) }, + { AOM_CDF6(7307, 16406, 19636, 24632, 28424) }, + { AOM_CDF6(4441, 25064, 26879, 28942, 30919) }, + { AOM_CDF6(17210, 20528, 23319, 26750, 29582) }, + { AOM_CDF6(30674, 30953, 31396, 31735, 32207) }, + }, + { + { AOM_CDF7(21239, 23168, 25044, 26962, 28705, 30506) }, + { AOM_CDF7(6545, 15012, 18004, 21817, 25503, 28701) }, + { AOM_CDF7(3448, 26295, 27437, 28704, 30126, 31442) }, + { AOM_CDF7(15889, 18323, 21704, 24698, 26976, 29690) }, + { AOM_CDF7(30988, 31204, 31479, 31734, 31983, 32325) }, + }, + { + { AOM_CDF8(21442, 23288, 24758, 26246, 27649, 28980, 30563) }, + { AOM_CDF8(5863, 14933, 17552, 20668, 23683, 26411, 29273) }, + { AOM_CDF8(3415, 25810, 26877, 27990, 29223, 30394, 31618) }, + { AOM_CDF8(17965, 20084, 22232, 23974, 26274, 28402, 30390) }, + { AOM_CDF8(31190, 31329, 31516, 31679, 31825, 32026, 32322) }, + }, + }; + +static const aom_cdf_prob + default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(28581) }, { AOM_CDF2(23846) }, { AOM_CDF2(20847) }, + { AOM_CDF2(24315) }, { AOM_CDF2(18196) }, { AOM_CDF2(12133) }, + { AOM_CDF2(18791) }, { AOM_CDF2(10887) }, { AOM_CDF2(11005) }, + { AOM_CDF2(27179) }, { AOM_CDF2(20004) }, { AOM_CDF2(11281) }, + { AOM_CDF2(26549) }, { AOM_CDF2(19308) }, { AOM_CDF2(14224) }, + { AOM_CDF2(28015) }, { AOM_CDF2(21546) }, { AOM_CDF2(14400) }, + { AOM_CDF2(28165) }, { AOM_CDF2(22401) }, { AOM_CDF2(16088) } + }; + +static const aom_cdf_prob default_skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(31671) }, { AOM_CDF2(16515) }, { AOM_CDF2(4576) } +}; + +static const 
aom_cdf_prob default_skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(32621) }, { AOM_CDF2(20708) }, { AOM_CDF2(8127) } }; + +static const aom_cdf_prob + default_compound_idx_cdfs[COMP_INDEX_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(18244) }, { AOM_CDF2(12865) }, { AOM_CDF2(7053) }, + { AOM_CDF2(13259) }, { AOM_CDF2(9334) }, { AOM_CDF2(4644) } + }; + +static const aom_cdf_prob + default_comp_group_idx_cdfs[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(26607) }, { AOM_CDF2(22891) }, { AOM_CDF2(18840) }, + { AOM_CDF2(24594) }, { AOM_CDF2(19934) }, { AOM_CDF2(22674) } + }; + +static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)] = { AOM_CDF2( + 30531) }; + +static const aom_cdf_prob default_filter_intra_mode_cdf[CDF_SIZE( + FILTER_INTRA_MODES)] = { AOM_CDF5(8949, 12776, 17211, 29558) }; + +static const aom_cdf_prob default_filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE( + 2)] = { { AOM_CDF2(4621) }, { AOM_CDF2(6743) }, { AOM_CDF2(5893) }, + { AOM_CDF2(7866) }, { AOM_CDF2(12551) }, { AOM_CDF2(9394) }, + { AOM_CDF2(12408) }, { AOM_CDF2(14301) }, { AOM_CDF2(12756) }, + { AOM_CDF2(22343) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(12770) }, { AOM_CDF2(10368) }, + { AOM_CDF2(20229) }, { AOM_CDF2(18101) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }; + +static const aom_cdf_prob default_switchable_restore_cdf[CDF_SIZE( + RESTORE_SWITCHABLE_TYPES)] = { AOM_CDF3(9413, 22581) }; + +static const aom_cdf_prob default_wiener_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2( + 11570) }; + +static const aom_cdf_prob default_sgrproj_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2( + 16855) }; + +static const aom_cdf_prob default_delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)] = { + AOM_CDF4(28160, 32120, 32677) +}; + +static const aom_cdf_prob default_delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE( + DELTA_LF_PROBS + 1)] = { { AOM_CDF4(28160, 32120, 32677) }, + { AOM_CDF4(28160, 32120, 32677) }, + { AOM_CDF4(28160, 32120, 32677) }, + { AOM_CDF4(28160, 32120, 32677) } }; +static const aom_cdf_prob default_delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)] = { + AOM_CDF4(28160, 32120, 32677) +}; + +// FIXME(someone) need real defaults here +static const aom_cdf_prob default_seg_tree_cdf[CDF_SIZE(MAX_SEGMENTS)] = { + AOM_CDF8(4096, 8192, 12288, 16384, 20480, 24576, 28672) +}; + +static const aom_cdf_prob + default_segment_pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)] = { + { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) } + }; + +static const aom_cdf_prob + default_spatial_pred_seg_tree_cdf[SPATIAL_PREDICTION_PROBS][CDF_SIZE( + MAX_SEGMENTS)] = { + { + AOM_CDF8(5622, 7893, 16093, 18233, 27809, 28373, 32533), + }, + { + AOM_CDF8(14274, 18230, 22557, 24935, 29980, 30851, 32344), + }, + { + AOM_CDF8(27527, 28487, 28723, 28890, 32397, 32647, 32679), + }, + }; + +static const aom_cdf_prob default_tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS] + [CDF_SIZE(MAX_TX_DEPTH + 1)] = { + { { AOM_CDF2(19968) }, + { AOM_CDF2(19968) }, + { AOM_CDF2(24320) } }, + { { AOM_CDF3(12272, 30172) }, + { AOM_CDF3(12272, 30172) }, + { AOM_CDF3(18677, 30848) } }, + { { AOM_CDF3(12986, 15180) }, + { AOM_CDF3(12986, 15180) }, + { AOM_CDF3(24302, 25602) } }, + { { AOM_CDF3(5782, 11475) }, + { AOM_CDF3(5782, 11475) }, + { AOM_CDF3(16803, 22759) } }, + }; + +#define MAX_COLOR_CONTEXT_HASH 8 +// Negative values are invalid +static const int palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH + + 1] = { -1, -1, 0, -1, -1, + 4, 3, 2, 
1 }; + +#define NUM_PALETTE_NEIGHBORS 3 // left, top-left and top. +int av1_get_palette_color_index_context(const uint8_t *color_map, int stride, + int r, int c, int palette_size, + uint8_t *color_order, int *color_idx) { + assert(palette_size <= PALETTE_MAX_SIZE); + assert(r > 0 || c > 0); + + // Get color indices of neighbors. + int color_neighbors[NUM_PALETTE_NEIGHBORS]; + color_neighbors[0] = (c - 1 >= 0) ? color_map[r * stride + c - 1] : -1; + color_neighbors[1] = + (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * stride + c - 1] : -1; + color_neighbors[2] = (r - 1 >= 0) ? color_map[(r - 1) * stride + c] : -1; + + // The +10 below should not be needed. But we get a warning "array subscript + // is above array bounds [-Werror=array-bounds]" without it, possibly due to + // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124 + int scores[PALETTE_MAX_SIZE + 10] = { 0 }; + int i; + static const int weights[NUM_PALETTE_NEIGHBORS] = { 2, 1, 2 }; + for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { + if (color_neighbors[i] >= 0) { + scores[color_neighbors[i]] += weights[i]; + } + } + + int inverse_color_order[PALETTE_MAX_SIZE]; + for (i = 0; i < PALETTE_MAX_SIZE; ++i) { + color_order[i] = i; + inverse_color_order[i] = i; + } + + // Get the top NUM_PALETTE_NEIGHBORS scores (sorted from large to small). + for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { + int max = scores[i]; + int max_idx = i; + for (int j = i + 1; j < palette_size; ++j) { + if (scores[j] > max) { + max = scores[j]; + max_idx = j; + } + } + if (max_idx != i) { + // Move the score at index 'max_idx' to index 'i', and shift the scores + // from 'i' to 'max_idx - 1' by 1. + const int max_score = scores[max_idx]; + const uint8_t max_color_order = color_order[max_idx]; + for (int k = max_idx; k > i; --k) { + scores[k] = scores[k - 1]; + color_order[k] = color_order[k - 1]; + inverse_color_order[color_order[k]] = k; + } + scores[i] = max_score; + color_order[i] = max_color_order; + inverse_color_order[color_order[i]] = i; + } + } + + if (color_idx != NULL) + *color_idx = inverse_color_order[color_map[r * stride + c]]; + + // Get hash value of context. + int color_index_ctx_hash = 0; + static const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 }; + for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { + color_index_ctx_hash += scores[i] * hash_multipliers[i]; + } + assert(color_index_ctx_hash > 0); + assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH); + + // Lookup context from hash. 
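/* For example, with neighbors left = 0, top-left = 0, top = 1 and
   palette_size = 2: the weighted scores are scores[0] = 2 + 1 = 3 and
   scores[1] = 2, so the sorted top three scores are (3, 2, 0) and the hash
   is 3 * 1 + 2 * 2 + 0 * 2 = 7, which the lookup table below maps to
   context 2. */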
+ const int color_index_ctx = + palette_color_index_context_lookup[color_index_ctx_hash]; + assert(color_index_ctx >= 0); + assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS); + return color_index_ctx; +} +#undef NUM_PALETTE_NEIGHBORS +#undef MAX_COLOR_CONTEXT_HASH + +static void init_mode_probs(FRAME_CONTEXT *fc) { + av1_copy(fc->palette_y_size_cdf, default_palette_y_size_cdf); + av1_copy(fc->palette_uv_size_cdf, default_palette_uv_size_cdf); + av1_copy(fc->palette_y_color_index_cdf, default_palette_y_color_index_cdf); + av1_copy(fc->palette_uv_color_index_cdf, default_palette_uv_color_index_cdf); + av1_copy(fc->kf_y_cdf, default_kf_y_mode_cdf); + av1_copy(fc->angle_delta_cdf, default_angle_delta_cdf); + av1_copy(fc->comp_inter_cdf, default_comp_inter_cdf); + av1_copy(fc->comp_ref_type_cdf, default_comp_ref_type_cdf); + av1_copy(fc->uni_comp_ref_cdf, default_uni_comp_ref_cdf); + av1_copy(fc->palette_y_mode_cdf, default_palette_y_mode_cdf); + av1_copy(fc->palette_uv_mode_cdf, default_palette_uv_mode_cdf); + av1_copy(fc->comp_ref_cdf, default_comp_ref_cdf); + av1_copy(fc->comp_bwdref_cdf, default_comp_bwdref_cdf); + av1_copy(fc->single_ref_cdf, default_single_ref_cdf); + av1_copy(fc->txfm_partition_cdf, default_txfm_partition_cdf); + av1_copy(fc->compound_index_cdf, default_compound_idx_cdfs); + av1_copy(fc->comp_group_idx_cdf, default_comp_group_idx_cdfs); + av1_copy(fc->newmv_cdf, default_newmv_cdf); + av1_copy(fc->zeromv_cdf, default_zeromv_cdf); + av1_copy(fc->refmv_cdf, default_refmv_cdf); + av1_copy(fc->drl_cdf, default_drl_cdf); + av1_copy(fc->motion_mode_cdf, default_motion_mode_cdf); + av1_copy(fc->obmc_cdf, default_obmc_cdf); + av1_copy(fc->inter_compound_mode_cdf, default_inter_compound_mode_cdf); + av1_copy(fc->compound_type_cdf, default_compound_type_cdf); + av1_copy(fc->wedge_idx_cdf, default_wedge_idx_cdf); + av1_copy(fc->interintra_cdf, default_interintra_cdf); + av1_copy(fc->wedge_interintra_cdf, default_wedge_interintra_cdf); + av1_copy(fc->interintra_mode_cdf, default_interintra_mode_cdf); + av1_copy(fc->seg.pred_cdf, default_segment_pred_cdf); + av1_copy(fc->seg.tree_cdf, default_seg_tree_cdf); + av1_copy(fc->filter_intra_cdfs, default_filter_intra_cdfs); + av1_copy(fc->filter_intra_mode_cdf, default_filter_intra_mode_cdf); + av1_copy(fc->switchable_restore_cdf, default_switchable_restore_cdf); + av1_copy(fc->wiener_restore_cdf, default_wiener_restore_cdf); + av1_copy(fc->sgrproj_restore_cdf, default_sgrproj_restore_cdf); + av1_copy(fc->y_mode_cdf, default_if_y_mode_cdf); + av1_copy(fc->uv_mode_cdf, default_uv_mode_cdf); + av1_copy(fc->switchable_interp_cdf, default_switchable_interp_cdf); + av1_copy(fc->partition_cdf, default_partition_cdf); + av1_copy(fc->intra_ext_tx_cdf, default_intra_ext_tx_cdf); + av1_copy(fc->inter_ext_tx_cdf, default_inter_ext_tx_cdf); + av1_copy(fc->skip_mode_cdfs, default_skip_mode_cdfs); + av1_copy(fc->skip_cdfs, default_skip_cdfs); + av1_copy(fc->intra_inter_cdf, default_intra_inter_cdf); + for (int i = 0; i < SPATIAL_PREDICTION_PROBS; i++) + av1_copy(fc->seg.spatial_pred_seg_cdf[i], + default_spatial_pred_seg_tree_cdf[i]); + av1_copy(fc->tx_size_cdf, default_tx_size_cdf); + av1_copy(fc->delta_q_cdf, default_delta_q_cdf); + av1_copy(fc->delta_lf_cdf, default_delta_lf_cdf); + av1_copy(fc->delta_lf_multi_cdf, default_delta_lf_multi_cdf); + av1_copy(fc->cfl_sign_cdf, default_cfl_sign_cdf); + av1_copy(fc->cfl_alpha_cdf, default_cfl_alpha_cdf); + av1_copy(fc->intrabc_cdf, default_intrabc_cdf); +} + +void av1_set_default_ref_deltas(int8_t 
*ref_deltas) { + assert(ref_deltas != NULL); + + ref_deltas[INTRA_FRAME] = 1; + ref_deltas[LAST_FRAME] = 0; + ref_deltas[LAST2_FRAME] = ref_deltas[LAST_FRAME]; + ref_deltas[LAST3_FRAME] = ref_deltas[LAST_FRAME]; + ref_deltas[BWDREF_FRAME] = ref_deltas[LAST_FRAME]; + ref_deltas[GOLDEN_FRAME] = -1; + ref_deltas[ALTREF2_FRAME] = -1; + ref_deltas[ALTREF_FRAME] = -1; +} + +void av1_set_default_mode_deltas(int8_t *mode_deltas) { + assert(mode_deltas != NULL); + + mode_deltas[0] = 0; + mode_deltas[1] = 0; +} + +static void set_default_lf_deltas(struct loopfilter *lf) { + lf->mode_ref_delta_enabled = 1; + lf->mode_ref_delta_update = 1; + + av1_set_default_ref_deltas(lf->ref_deltas); + av1_set_default_mode_deltas(lf->mode_deltas); +} + +void av1_setup_frame_contexts(AV1_COMMON *cm) { + // Store the frame context into a special slot (not associated with any + // reference buffer), so that we can set up cm->pre_fc correctly later + // This function must ONLY be called when cm->fc has been initialized with + // default probs, either by av1_setup_past_independence or after manually + // initializing them + *cm->default_frame_context = *cm->fc; + // TODO(jack.haughton@argondesign.com): don't think this should be necessary, + // but could do with fuller testing + if (cm->tiles.large_scale) { + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + RefCntBuffer *const buf = get_ref_frame_buf(cm, i); + if (buf != NULL) buf->frame_context = *cm->fc; + } + for (int i = 0; i < FRAME_BUFFERS; ++i) + cm->buffer_pool->frame_bufs[i].frame_context = *cm->fc; + } +} + +void av1_setup_past_independence(AV1_COMMON *cm) { + // Reset the segment feature data to the default stats: + // Features disabled, 0, with delta coding (Default state). + av1_clearall_segfeatures(&cm->seg); + + if (cm->cur_frame->seg_map) + memset(cm->cur_frame->seg_map, 0, + (cm->mi_params.mi_rows * cm->mi_params.mi_cols)); + + // reset mode ref deltas + av1_set_default_ref_deltas(cm->cur_frame->ref_deltas); + av1_set_default_mode_deltas(cm->cur_frame->mode_deltas); + set_default_lf_deltas(&cm->lf); + + av1_default_coef_probs(cm); + init_mode_probs(cm->fc); + av1_init_mv_probs(cm); + cm->fc->initialized = 1; + av1_setup_frame_contexts(cm); +} diff --git a/libs/libaom/src/av1/common/entropymode.h b/libs/libaom/src/av1/common/entropymode.h new file mode 100644 index 000000000..bbbf55dc8 --- /dev/null +++ b/libs/libaom/src/av1/common/entropymode.h @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ENTROPYMODE_H_ +#define AOM_AV1_COMMON_ENTROPYMODE_H_ + +#include "av1/common/entropy.h" +#include "av1/common/entropymv.h" +#include "av1/common/filter.h" +#include "av1/common/seg_common.h" +#include "aom_dsp/aom_filter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLOCK_SIZE_GROUPS 4 + +#define TX_SIZE_CONTEXTS 3 + +#define INTER_OFFSET(mode) ((mode)-NEARESTMV) +#define INTER_COMPOUND_OFFSET(mode) (uint8_t)((mode)-NEAREST_NEARESTMV) + +// Number of possible contexts for a color index. 
+// As can be seen from av1_get_palette_color_index_context(), the possible +// contexts are (2,0,0), (2,2,1), (3,2,0), (4,1,0), (5,0,0). These are mapped to +// a value from 0 to 4 using 'palette_color_index_context_lookup' table. +#define PALETTE_COLOR_INDEX_CONTEXTS 5 + +// Palette Y mode context for a block is determined by number of neighboring +// blocks (top and/or left) using a palette for Y plane. So, possible Y mode' +// context values are: +// 0 if neither left nor top block uses palette for Y plane, +// 1 if exactly one of left or top block uses palette for Y plane, and +// 2 if both left and top blocks use palette for Y plane. +#define PALETTE_Y_MODE_CONTEXTS 3 + +// Palette UV mode context for a block is determined by whether this block uses +// palette for the Y plane. So, possible values are: +// 0 if this block doesn't use palette for Y plane. +// 1 if this block uses palette for Y plane (i.e. Y palette size > 0). +#define PALETTE_UV_MODE_CONTEXTS 2 + +// Map the number of pixels in a block size to a context +// 64(BLOCK_8X8, BLOCK_4x16, BLOCK_16X4) -> 0 +// 128(BLOCK_8X16, BLOCK_16x8) -> 1 +// ... +// 4096(BLOCK_64X64) -> 6 +#define PALATTE_BSIZE_CTXS 7 + +#define KF_MODE_CONTEXTS 5 + +struct AV1Common; + +typedef struct { + const int16_t *scan; + const int16_t *iscan; +} SCAN_ORDER; + +typedef struct frame_contexts { + aom_cdf_prob txb_skip_cdf[TX_SIZES][TXB_SKIP_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob eob_extra_cdf[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS] + [CDF_SIZE(2)]; + aom_cdf_prob dc_sign_cdf[PLANE_TYPES][DC_SIGN_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob eob_flag_cdf16[PLANE_TYPES][2][CDF_SIZE(5)]; + aom_cdf_prob eob_flag_cdf32[PLANE_TYPES][2][CDF_SIZE(6)]; + aom_cdf_prob eob_flag_cdf64[PLANE_TYPES][2][CDF_SIZE(7)]; + aom_cdf_prob eob_flag_cdf128[PLANE_TYPES][2][CDF_SIZE(8)]; + aom_cdf_prob eob_flag_cdf256[PLANE_TYPES][2][CDF_SIZE(9)]; + aom_cdf_prob eob_flag_cdf512[PLANE_TYPES][2][CDF_SIZE(10)]; + aom_cdf_prob eob_flag_cdf1024[PLANE_TYPES][2][CDF_SIZE(11)]; + aom_cdf_prob coeff_base_eob_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB] + [CDF_SIZE(3)]; + aom_cdf_prob coeff_base_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] + [CDF_SIZE(4)]; + aom_cdf_prob coeff_br_cdf[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS] + [CDF_SIZE(BR_CDF_SIZE)]; + + aom_cdf_prob newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)]; + + aom_cdf_prob inter_compound_mode_cdf[INTER_MODE_CONTEXTS] + [CDF_SIZE(INTER_COMPOUND_MODES)]; + aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL] + [CDF_SIZE(MASKED_COMPOUND_TYPES)]; + aom_cdf_prob wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)]; + aom_cdf_prob interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)]; + aom_cdf_prob wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]; + aom_cdf_prob interintra_mode_cdf[BLOCK_SIZE_GROUPS] + [CDF_SIZE(INTERINTRA_MODES)]; + aom_cdf_prob motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)]; + aom_cdf_prob obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]; + aom_cdf_prob palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]; + aom_cdf_prob palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]; + aom_cdf_prob palette_y_color_index_cdf[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [CDF_SIZE(PALETTE_COLORS)]; + aom_cdf_prob palette_uv_color_index_cdf[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [CDF_SIZE(PALETTE_COLORS)]; + aom_cdf_prob 
palette_y_mode_cdf[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS] + [CDF_SIZE(2)]; + aom_cdf_prob palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)]; + aom_cdf_prob comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1] + [CDF_SIZE(2)]; + aom_cdf_prob comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)]; + aom_cdf_prob comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)]; + aom_cdf_prob txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob compound_index_cdf[COMP_INDEX_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob comp_group_idx_cdf[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob skip_mode_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)]; + nmv_context nmvc; + nmv_context ndvc; + aom_cdf_prob intrabc_cdf[CDF_SIZE(2)]; + struct segmentation_probs seg; + aom_cdf_prob filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(2)]; + aom_cdf_prob filter_intra_mode_cdf[CDF_SIZE(FILTER_INTRA_MODES)]; + aom_cdf_prob switchable_restore_cdf[CDF_SIZE(RESTORE_SWITCHABLE_TYPES)]; + aom_cdf_prob wiener_restore_cdf[CDF_SIZE(2)]; + aom_cdf_prob sgrproj_restore_cdf[CDF_SIZE(2)]; + aom_cdf_prob y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)]; + aom_cdf_prob uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES] + [CDF_SIZE(UV_INTRA_MODES)]; + aom_cdf_prob partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_PARTITION_TYPES)]; + aom_cdf_prob switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS] + [CDF_SIZE(SWITCHABLE_FILTERS)]; + /* kf_y_cdf is discarded after use, so does not require persistent storage. + However, we keep it with the other CDFs in this struct since it needs to + be copied to each tile to support parallelism just like the others. 
+ */ + aom_cdf_prob kf_y_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS] + [CDF_SIZE(INTRA_MODES)]; + + aom_cdf_prob angle_delta_cdf[DIRECTIONAL_MODES] + [CDF_SIZE(2 * MAX_ANGLE_DELTA + 1)]; + + aom_cdf_prob tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS] + [CDF_SIZE(MAX_TX_DEPTH + 1)]; + aom_cdf_prob delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)]; + aom_cdf_prob delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE(DELTA_LF_PROBS + 1)]; + aom_cdf_prob delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)]; + aom_cdf_prob intra_ext_tx_cdf[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] + [CDF_SIZE(TX_TYPES)]; + aom_cdf_prob inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES] + [CDF_SIZE(TX_TYPES)]; + aom_cdf_prob cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)]; + aom_cdf_prob cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)]; + int initialized; +} FRAME_CONTEXT; + +static const int av1_ext_tx_ind[EXT_TX_SET_TYPES][TX_TYPES] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 5, 6, 4, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0 }, + { 3, 4, 5, 8, 6, 7, 9, 10, 11, 0, 1, 2, 0, 0, 0, 0 }, + { 7, 8, 9, 12, 10, 11, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6 }, +}; + +static const int av1_ext_tx_inv[EXT_TX_SET_TYPES][TX_TYPES] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 0, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 0, 10, 11, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 10, 11, 0, 1, 2, 4, 5, 3, 6, 7, 8, 0, 0, 0, 0 }, + { 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 4, 5, 3, 6, 7, 8 }, +}; + +void av1_set_default_ref_deltas(int8_t *ref_deltas); +void av1_set_default_mode_deltas(int8_t *mode_deltas); +void av1_setup_frame_contexts(struct AV1Common *cm); +void av1_setup_past_independence(struct AV1Common *cm); + +// Returns (int)ceil(log2(n)). +// NOTE: This implementation only works for n <= 2^30. +static INLINE int av1_ceil_log2(int n) { + if (n < 2) return 0; + int i = 1, p = 2; + while (p < n) { + i++; + p = p << 1; + } + return i; +} + +// Returns the context for palette color index at row 'r' and column 'c', +// along with the 'color_order' of neighbors and the 'color_idx'. +// The 'color_map' is a 2D array with the given 'stride'. +int av1_get_palette_color_index_context(const uint8_t *color_map, int stride, + int r, int c, int palette_size, + uint8_t *color_order, int *color_idx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ENTROPYMODE_H_ diff --git a/libs/libaom/src/av1/common/entropymv.c b/libs/libaom/src/av1/common/entropymv.c new file mode 100644 index 000000000..e1e42f2f1 --- /dev/null +++ b/libs/libaom/src/av1/common/entropymv.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
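The av1_ceil_log2() helper defined a few lines up is small enough to check by hand; a minimal standalone sketch (the mirror function name is hypothetical, only the logic is taken from the header above) exercises its edge cases:

#include <assert.h>

// Same logic as av1_ceil_log2() above: smallest i with 2^i >= n for n >= 2,
// and 0 for n < 2.
static int ceil_log2(int n) {
  if (n < 2) return 0;
  int i = 1, p = 2;
  while (p < n) {
    i++;
    p <<= 1;
  }
  return i;
}

int main(void) {
  assert(ceil_log2(1) == 0);
  assert(ceil_log2(2) == 1);
  assert(ceil_log2(3) == 2);  // ceil(log2(3)) = 2
  assert(ceil_log2(8) == 3);
  assert(ceil_log2(9) == 4);
  return 0;
}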
+ */ + +#include "av1/common/av1_common_int.h" +#include "av1/common/entropymv.h" + +static const nmv_context default_nmv_context = { + { AOM_CDF4(4096, 11264, 19328) }, // joints_cdf + { { + // Vertical component + { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757, + 32762, 32767) }, // class_cdf // fp + { { AOM_CDF4(16384, 24576, 26624) }, + { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf + { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf + { AOM_CDF2(128 * 128) }, // sign_cdf + { AOM_CDF2(160 * 128) }, // class0_hp_cdf + { AOM_CDF2(128 * 128) }, // hp_cdf + { AOM_CDF2(216 * 128) }, // class0_cdf + { { AOM_CDF2(128 * 136) }, + { AOM_CDF2(128 * 140) }, + { AOM_CDF2(128 * 148) }, + { AOM_CDF2(128 * 160) }, + { AOM_CDF2(128 * 176) }, + { AOM_CDF2(128 * 192) }, + { AOM_CDF2(128 * 224) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 240) } }, // bits_cdf + }, + { + // Horizontal component + { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757, + 32762, 32767) }, // class_cdf // fp + { { AOM_CDF4(16384, 24576, 26624) }, + { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf + { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf + { AOM_CDF2(128 * 128) }, // sign_cdf + { AOM_CDF2(160 * 128) }, // class0_hp_cdf + { AOM_CDF2(128 * 128) }, // hp_cdf + { AOM_CDF2(216 * 128) }, // class0_cdf + { { AOM_CDF2(128 * 136) }, + { AOM_CDF2(128 * 140) }, + { AOM_CDF2(128 * 148) }, + { AOM_CDF2(128 * 160) }, + { AOM_CDF2(128 * 176) }, + { AOM_CDF2(128 * 192) }, + { AOM_CDF2(128 * 224) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 240) } }, // bits_cdf + } }, +}; + +void av1_init_mv_probs(AV1_COMMON *cm) { + // NB: this sets CDFs too + cm->fc->nmvc = default_nmv_context; + cm->fc->ndvc = default_nmv_context; +} diff --git a/libs/libaom/src/av1/common/entropymv.h b/libs/libaom/src/av1/common/entropymv.h new file mode 100644 index 000000000..cddc80768 --- /dev/null +++ b/libs/libaom/src/av1/common/entropymv.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
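Several of the binary CDFs in default_nmv_context above are written as 8-bit probabilities scaled by 128 into the 15-bit CDF space (256 * 128 == 32768). A short sketch (standalone, values copied from the table above) makes the intent readable:

#include <stdio.h>

int main(void) {
  // AOM_CDF2(p8 * 128): p8 / 256 is the mass placed on the first symbol,
  // so 128 is an even split, 160 and 216 are increasingly skewed.
  const int p8[] = { 128, 160, 216 };  // sign_cdf, class0_hp_cdf, class0_cdf
  for (int i = 0; i < 3; ++i)
    printf("AOM_CDF2(%d * 128) -> P(first symbol) = %.1f%%\n", p8[i],
           p8[i] / 256.0 * 100.0);
  return 0;
}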
+ */ + +#ifndef AOM_AV1_COMMON_ENTROPYMV_H_ +#define AOM_AV1_COMMON_ENTROPYMV_H_ + +#include "config/aom_config.h" + +#include "aom_dsp/prob.h" + +#include "av1/common/mv.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Common; + +void av1_init_mv_probs(struct AV1Common *cm); + +#define MV_UPDATE_PROB 252 + +/* Symbols for coding which components are zero jointly */ +#define MV_JOINTS 4 +enum { + MV_JOINT_ZERO = 0, /* Zero vector */ + MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */ + MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */ + MV_JOINT_HNZVNZ = 3, /* Both components nonzero */ +} UENUM1BYTE(MV_JOINT_TYPE); + +static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) { + return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ; +} + +static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) { + return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ; +} + +/* Symbols for coding magnitude class of nonzero components */ +#define MV_CLASSES 11 +enum { + MV_CLASS_0 = 0, /* (0, 2] integer pel */ + MV_CLASS_1 = 1, /* (2, 4] integer pel */ + MV_CLASS_2 = 2, /* (4, 8] integer pel */ + MV_CLASS_3 = 3, /* (8, 16] integer pel */ + MV_CLASS_4 = 4, /* (16, 32] integer pel */ + MV_CLASS_5 = 5, /* (32, 64] integer pel */ + MV_CLASS_6 = 6, /* (64, 128] integer pel */ + MV_CLASS_7 = 7, /* (128, 256] integer pel */ + MV_CLASS_8 = 8, /* (256, 512] integer pel */ + MV_CLASS_9 = 9, /* (512, 1024] integer pel */ + MV_CLASS_10 = 10, /* (1024,2048] integer pel */ +} UENUM1BYTE(MV_CLASS_TYPE); + +#define CLASS0_BITS 1 /* bits at integer precision for class 0 */ +#define CLASS0_SIZE (1 << CLASS0_BITS) +#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2) +#define MV_BITS_CONTEXTS 6 +#define MV_FP_SIZE 4 + +#define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2) +#define MV_MAX ((1 << MV_MAX_BITS) - 1) +#define MV_VALS ((MV_MAX << 1) + 1) + +#define MV_IN_USE_BITS 14 +#define MV_UPP (1 << MV_IN_USE_BITS) +#define MV_LOW (-(1 << MV_IN_USE_BITS)) + +typedef struct { + aom_cdf_prob classes_cdf[CDF_SIZE(MV_CLASSES)]; + aom_cdf_prob class0_fp_cdf[CLASS0_SIZE][CDF_SIZE(MV_FP_SIZE)]; + aom_cdf_prob fp_cdf[CDF_SIZE(MV_FP_SIZE)]; + aom_cdf_prob sign_cdf[CDF_SIZE(2)]; + aom_cdf_prob class0_hp_cdf[CDF_SIZE(2)]; + aom_cdf_prob hp_cdf[CDF_SIZE(2)]; + aom_cdf_prob class0_cdf[CDF_SIZE(CLASS0_SIZE)]; + aom_cdf_prob bits_cdf[MV_OFFSET_BITS][CDF_SIZE(2)]; +} nmv_component; + +typedef struct { + aom_cdf_prob joints_cdf[CDF_SIZE(MV_JOINTS)]; + nmv_component comps[2]; +} nmv_context; + +enum { + MV_SUBPEL_NONE = -1, + MV_SUBPEL_LOW_PRECISION = 0, + MV_SUBPEL_HIGH_PRECISION, +} SENUM1BYTE(MvSubpelPrecision); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ENTROPYMV_H_ diff --git a/libs/libaom/src/av1/common/enums.h b/libs/libaom/src/av1/common/enums.h new file mode 100644 index 000000000..0c09a1bc7 --- /dev/null +++ b/libs/libaom/src/av1/common/enums.h @@ -0,0 +1,678 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
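The MV_CLASS_* ranges documented above follow a simple doubling rule: class 0 covers (0, 2] integer pel and class c > 0 covers (2^c, 2^(c+1)]. A small sketch (the helper name is hypothetical, only the bucket boundaries come from the header) reproduces the mapping:

#include <assert.h>

// Maps an integer-pel MV magnitude to the MV_CLASS_* buckets above:
// class 0 covers (0, 2], class c > 0 covers (2^c, 2^(c+1)].
static int mv_class_of(int z) {
  int c = 0;
  while (c + 1 < 11 && z > (2 << c)) ++c;
  return c;
}

int main(void) {
  assert(mv_class_of(2) == 0);     // (0, 2]
  assert(mv_class_of(3) == 1);     // (2, 4]
  assert(mv_class_of(100) == 6);   // (64, 128]
  assert(mv_class_of(2048) == 10); // (1024, 2048]
  return 0;
}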
+ */ + +#ifndef AOM_AV1_COMMON_ENUMS_H_ +#define AOM_AV1_COMMON_ENUMS_H_ + +#include "config/aom_config.h" + +#include "aom/aom_codec.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#undef MAX_SB_SIZE + +// Max superblock size +#define MAX_SB_SIZE_LOG2 7 +#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2) +#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE) + +// Min superblock size +#define MIN_SB_SIZE_LOG2 6 + +// Pixels per Mode Info (MI) unit +#define MI_SIZE_LOG2 2 +#define MI_SIZE (1 << MI_SIZE_LOG2) + +// MI-units per max superblock (MI Block - MIB) +#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2) +#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2) + +// MI-units per min superblock +#define MIN_MIB_SIZE_LOG2 (MIN_SB_SIZE_LOG2 - MI_SIZE_LOG2) + +// Mask to extract MI offset within max MIB +#define MAX_MIB_MASK (MAX_MIB_SIZE - 1) + +// Maximum number of tile rows and tile columns +#define MAX_TILE_ROWS 64 +#define MAX_TILE_COLS 64 + +#define MAX_VARTX_DEPTH 2 + +#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2) +#define MI_SIZE_128X128 (128 >> MI_SIZE_LOG2) + +#define MAX_PALETTE_SQUARE (64 * 64) +// Maximum number of colors in a palette. +#define PALETTE_MAX_SIZE 8 +// Minimum number of colors in a palette. +#define PALETTE_MIN_SIZE 2 + +#define FRAME_OFFSET_BITS 5 +#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1) + +// 4 frame filter levels: y plane vertical, y plane horizontal, +// u plane, and v plane +#define FRAME_LF_COUNT 4 +#define DEFAULT_DELTA_LF_MULTI 0 +#define MAX_MODE_LF_DELTAS 2 + +#define DIST_PRECISION_BITS 4 +#define DIST_PRECISION (1 << DIST_PRECISION_BITS) // 16 + +#define PROFILE_BITS 3 +// The following three profiles are currently defined. +// Profile 0. 8-bit and 10-bit 4:2:0 and 4:0:0 only. +// Profile 1. 8-bit and 10-bit 4:4:4 +// Profile 2. 8-bit and 10-bit 4:2:2 +// 12-bit 4:0:0, 4:2:2 and 4:4:4 +// Since we have three bits for the profiles, it can be extended later. +enum { + PROFILE_0, + PROFILE_1, + PROFILE_2, + MAX_PROFILES, +} SENUM1BYTE(BITSTREAM_PROFILE); + +#define OP_POINTS_CNT_MINUS_1_BITS 5 +#define OP_POINTS_IDC_BITS 12 + +// Note: Some enums use the attribute 'packed' to use smallest possible integer +// type, so that we can save memory when they are used in structs/arrays. + +typedef enum ATTRIBUTE_PACKED { + BLOCK_4X4, + BLOCK_4X8, + BLOCK_8X4, + BLOCK_8X8, + BLOCK_8X16, + BLOCK_16X8, + BLOCK_16X16, + BLOCK_16X32, + BLOCK_32X16, + BLOCK_32X32, + BLOCK_32X64, + BLOCK_64X32, + BLOCK_64X64, + BLOCK_64X128, + BLOCK_128X64, + BLOCK_128X128, + BLOCK_4X16, + BLOCK_16X4, + BLOCK_8X32, + BLOCK_32X8, + BLOCK_16X64, + BLOCK_64X16, + BLOCK_SIZES_ALL, + BLOCK_SIZES = BLOCK_4X16, + BLOCK_INVALID = 255, + BLOCK_LARGEST = (BLOCK_SIZES - 1) +} BLOCK_SIZE; + +// 4X4, 8X8, 16X16, 32X32, 64X64, 128X128 +#define SQR_BLOCK_SIZES 6 + +// Partition types. 
R: Recursive +// +// NONE HORZ VERT SPLIT +// +-------+ +-------+ +---+---+ +---+---+ +// | | | | | | | | R | R | +// | | +-------+ | | | +---+---+ +// | | | | | | | | R | R | +// +-------+ +-------+ +---+---+ +---+---+ +// +// HORZ_A HORZ_B VERT_A VERT_B +// +---+---+ +-------+ +---+---+ +---+---+ +// | | | | | | | | | | | +// +---+---+ +---+---+ +---+ | | +---+ +// | | | | | | | | | | | +// +-------+ +---+---+ +---+---+ +---+---+ +// +// HORZ_4 VERT_4 +// +-----+ +-+-+-+ +// +-----+ | | | | +// +-----+ | | | | +// +-----+ +-+-+-+ +enum { + PARTITION_NONE, + PARTITION_HORZ, + PARTITION_VERT, + PARTITION_SPLIT, + PARTITION_HORZ_A, // HORZ split and the top partition is split again + PARTITION_HORZ_B, // HORZ split and the bottom partition is split again + PARTITION_VERT_A, // VERT split and the left partition is split again + PARTITION_VERT_B, // VERT split and the right partition is split again + PARTITION_HORZ_4, // 4:1 horizontal partition + PARTITION_VERT_4, // 4:1 vertical partition + EXT_PARTITION_TYPES, + PARTITION_TYPES = PARTITION_SPLIT + 1, + PARTITION_INVALID = 255 +} UENUM1BYTE(PARTITION_TYPE); + +typedef char PARTITION_CONTEXT; +#define PARTITION_PLOFFSET 4 // number of probability models per block size +#define PARTITION_BLOCK_SIZES 5 +#define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET) + +// block transform size +enum { + TX_4X4, // 4x4 transform + TX_8X8, // 8x8 transform + TX_16X16, // 16x16 transform + TX_32X32, // 32x32 transform + TX_64X64, // 64x64 transform + TX_4X8, // 4x8 transform + TX_8X4, // 8x4 transform + TX_8X16, // 8x16 transform + TX_16X8, // 16x8 transform + TX_16X32, // 16x32 transform + TX_32X16, // 32x16 transform + TX_32X64, // 32x64 transform + TX_64X32, // 64x32 transform + TX_4X16, // 4x16 transform + TX_16X4, // 16x4 transform + TX_8X32, // 8x32 transform + TX_32X8, // 32x8 transform + TX_16X64, // 16x64 transform + TX_64X16, // 64x16 transform + TX_SIZES_ALL, // Includes rectangular transforms + TX_SIZES = TX_4X8, // Does NOT include rectangular transforms + TX_SIZES_LARGEST = TX_64X64, + TX_INVALID = 255 // Invalid transform size +} UENUM1BYTE(TX_SIZE); + +#define TX_SIZE_LUMA_MIN (TX_4X4) +/* We don't need to code a transform size unless the allowed size is at least + one more than the minimum. */ +#define TX_SIZE_CTX_MIN (TX_SIZE_LUMA_MIN + 1) + +// Maximum tx_size categories +#define MAX_TX_CATS (TX_SIZES - TX_SIZE_CTX_MIN) +#define MAX_TX_DEPTH 2 + +#define MAX_TX_SIZE_LOG2 (6) +#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2) +#define MIN_TX_SIZE_LOG2 2 +#define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2) +#define MAX_TX_SQUARE (MAX_TX_SIZE * MAX_TX_SIZE) + +// Pad 4 extra columns to remove horizontal availability check. +#define TX_PAD_HOR_LOG2 2 +#define TX_PAD_HOR 4 +// Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability +// check. +#define TX_PAD_TOP 0 +#define TX_PAD_BOTTOM 4 +#define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM) +// Pad 16 extra bytes to avoid reading overflow in SIMD optimization. 
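/* Checking the arithmetic above: TX_SIZES == TX_4X8 == 5 (the five square
   sizes), TX_SIZE_CTX_MIN == TX_4X4 + 1 == 1, so MAX_TX_CATS == 5 - 1 == 4,
   which matches the four rows of default_tx_size_cdf defined in
   entropymode.c earlier in this patch. */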
+#define TX_PAD_END 16 +#define TX_PAD_2D ((32 + TX_PAD_HOR) * (32 + TX_PAD_VER) + TX_PAD_END) + +// Number of maxium size transform blocks in the maximum size superblock +#define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2) +#define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2) + +// frame transform mode +enum { + ONLY_4X4, // use only 4x4 transform + TX_MODE_LARGEST, // transform size is the largest possible for pu size + TX_MODE_SELECT, // transform specified for each block + TX_MODES, +} UENUM1BYTE(TX_MODE); + +// 1D tx types +enum { + DCT_1D, + ADST_1D, + FLIPADST_1D, + IDTX_1D, + TX_TYPES_1D, +} UENUM1BYTE(TX_TYPE_1D); + +enum { + DCT_DCT, // DCT in both horizontal and vertical + ADST_DCT, // ADST in vertical, DCT in horizontal + DCT_ADST, // DCT in vertical, ADST in horizontal + ADST_ADST, // ADST in both directions + FLIPADST_DCT, // FLIPADST in vertical, DCT in horizontal + DCT_FLIPADST, // DCT in vertical, FLIPADST in horizontal + FLIPADST_FLIPADST, // FLIPADST in both directions + ADST_FLIPADST, // ADST in vertical, FLIPADST in horizontal + FLIPADST_ADST, // FLIPADST in vertical, ADST in horizontal + IDTX, // Identity in both directions + V_DCT, // DCT in vertical, identity in horizontal + H_DCT, // Identity in vertical, DCT in horizontal + V_ADST, // ADST in vertical, identity in horizontal + H_ADST, // Identity in vertical, ADST in horizontal + V_FLIPADST, // FLIPADST in vertical, identity in horizontal + H_FLIPADST, // Identity in vertical, FLIPADST in horizontal + TX_TYPES, + DCT_ADST_TX_MASK = 0x000F, // Either DCT or ADST in each direction +} UENUM1BYTE(TX_TYPE); + +enum { + REG_REG, + REG_SMOOTH, + REG_SHARP, + SMOOTH_REG, + SMOOTH_SMOOTH, + SMOOTH_SHARP, + SHARP_REG, + SHARP_SMOOTH, + SHARP_SHARP, +} UENUM1BYTE(DUAL_FILTER_TYPE); + +enum { + // DCT only + EXT_TX_SET_DCTONLY, + // DCT + Identity only + EXT_TX_SET_DCT_IDTX, + // Discrete Trig transforms w/o flip (4) + Identity (1) + EXT_TX_SET_DTT4_IDTX, + // Discrete Trig transforms w/o flip (4) + Identity (1) + 1D Hor/vert DCT (2) + EXT_TX_SET_DTT4_IDTX_1DDCT, + // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver DCT (2) + EXT_TX_SET_DTT9_IDTX_1DDCT, + // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6) + EXT_TX_SET_ALL16, + EXT_TX_SET_TYPES +} UENUM1BYTE(TxSetType); + +#define EXT_TX_SIZES 4 // number of sizes that use extended transforms +#define EXT_TX_SETS_INTER 4 // Sets of transform selections for INTER +#define EXT_TX_SETS_INTRA 3 // Sets of transform selections for INTRA + +enum { + AOM_LAST_FLAG = 1 << 0, + AOM_LAST2_FLAG = 1 << 1, + AOM_LAST3_FLAG = 1 << 2, + AOM_GOLD_FLAG = 1 << 3, + AOM_BWD_FLAG = 1 << 4, + AOM_ALT2_FLAG = 1 << 5, + AOM_ALT_FLAG = 1 << 6, + AOM_REFFRAME_ALL = (1 << 7) - 1 +} UENUM1BYTE(AOM_REFFRAME); + +enum { + UNIDIR_COMP_REFERENCE, + BIDIR_COMP_REFERENCE, + COMP_REFERENCE_TYPES, +} UENUM1BYTE(COMP_REFERENCE_TYPE); + +enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } UENUM1BYTE(PLANE_TYPE); + +#define CFL_ALPHABET_SIZE_LOG2 4 +#define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2) +#define CFL_MAGS_SIZE ((2 << CFL_ALPHABET_SIZE_LOG2) + 1) +#define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2) +#define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1)) + +enum { CFL_PRED_U, CFL_PRED_V, CFL_PRED_PLANES } UENUM1BYTE(CFL_PRED_TYPE); + +enum { + CFL_SIGN_ZERO, + CFL_SIGN_NEG, + CFL_SIGN_POS, + CFL_SIGNS +} UENUM1BYTE(CFL_SIGN_TYPE); + +enum { + CFL_DISALLOWED, + CFL_ALLOWED, + CFL_ALLOWED_TYPES +} 
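/* CFL_IDX_U and CFL_IDX_V above unpack a joint U/V alpha index: with
   CFL_ALPHABET_SIZE_LOG2 == 4, an index of 0x35 splits into U == 3
   (high nibble) and V == 5 (low nibble). */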
UENUM1BYTE(CFL_ALLOWED_TYPE); + +// CFL_SIGN_ZERO,CFL_SIGN_ZERO is invalid +#define CFL_JOINT_SIGNS (CFL_SIGNS * CFL_SIGNS - 1) +// CFL_SIGN_U is equivalent to (js + 1) / 3 for js in 0 to 8 +#define CFL_SIGN_U(js) (((js + 1) * 11) >> 5) +// CFL_SIGN_V is equivalent to (js + 1) % 3 for js in 0 to 8 +#define CFL_SIGN_V(js) ((js + 1) - CFL_SIGNS * CFL_SIGN_U(js)) + +// There is no context when the alpha for a given plane is zero. +// So there are 2 fewer contexts than joint signs. +#define CFL_ALPHA_CONTEXTS (CFL_JOINT_SIGNS + 1 - CFL_SIGNS) +#define CFL_CONTEXT_U(js) (js + 1 - CFL_SIGNS) +// Also, the contexts are symmetric under swapping the planes. +#define CFL_CONTEXT_V(js) \ + (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS) + +enum { + PALETTE_MAP, + COLOR_MAP_TYPES, +} UENUM1BYTE(COLOR_MAP_TYPE); + +enum { + TWO_COLORS, + THREE_COLORS, + FOUR_COLORS, + FIVE_COLORS, + SIX_COLORS, + SEVEN_COLORS, + EIGHT_COLORS, + PALETTE_SIZES +} UENUM1BYTE(PALETTE_SIZE); + +enum { + PALETTE_COLOR_ONE, + PALETTE_COLOR_TWO, + PALETTE_COLOR_THREE, + PALETTE_COLOR_FOUR, + PALETTE_COLOR_FIVE, + PALETTE_COLOR_SIX, + PALETTE_COLOR_SEVEN, + PALETTE_COLOR_EIGHT, + PALETTE_COLORS +} UENUM1BYTE(PALETTE_COLOR); + +// Note: All directional predictors must be between V_PRED and D67_PRED (both +// inclusive). +enum { + DC_PRED, // Average of above and left pixels + V_PRED, // Vertical + H_PRED, // Horizontal + D45_PRED, // Directional 45 degree + D135_PRED, // Directional 135 degree + D113_PRED, // Directional 113 degree + D157_PRED, // Directional 157 degree + D203_PRED, // Directional 203 degree + D67_PRED, // Directional 67 degree + SMOOTH_PRED, // Combination of horizontal and vertical interpolation + SMOOTH_V_PRED, // Vertical interpolation + SMOOTH_H_PRED, // Horizontal interpolation + PAETH_PRED, // Predict from the direction of smallest gradient + NEARESTMV, + NEARMV, + GLOBALMV, + NEWMV, + // Compound ref compound modes + NEAREST_NEARESTMV, + NEAR_NEARMV, + NEAREST_NEWMV, + NEW_NEARESTMV, + NEAR_NEWMV, + NEW_NEARMV, + GLOBAL_GLOBALMV, + NEW_NEWMV, + MB_MODE_COUNT, + INTRA_MODE_START = DC_PRED, + INTRA_MODE_END = NEARESTMV, + DIR_MODE_START = V_PRED, + DIR_MODE_END = D67_PRED + 1, + INTRA_MODE_NUM = INTRA_MODE_END - INTRA_MODE_START, + SINGLE_INTER_MODE_START = NEARESTMV, + SINGLE_INTER_MODE_END = NEAREST_NEARESTMV, + SINGLE_INTER_MODE_NUM = SINGLE_INTER_MODE_END - SINGLE_INTER_MODE_START, + COMP_INTER_MODE_START = NEAREST_NEARESTMV, + COMP_INTER_MODE_END = MB_MODE_COUNT, + COMP_INTER_MODE_NUM = COMP_INTER_MODE_END - COMP_INTER_MODE_START, + INTER_MODE_START = NEARESTMV, + INTER_MODE_END = MB_MODE_COUNT, + INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode. + INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks +} UENUM1BYTE(PREDICTION_MODE); + +// TODO(ltrudeau) Do we really want to pack this? +// TODO(ltrudeau) Do we match with PREDICTION_MODE? 
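/* Sanity check for the shift tricks in CFL_SIGN_U/CFL_SIGN_V above: for js
   in 0..8, ((js + 1) * 11) >> 5 really equals (js + 1) / 3 -- e.g. js == 7
   gives (8 * 11) >> 5 == 88 / 32 == 2 == 8 / 3 -- so both macros stay free
   of integer division. */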
+enum { + UV_DC_PRED, // Average of above and left pixels + UV_V_PRED, // Vertical + UV_H_PRED, // Horizontal + UV_D45_PRED, // Directional 45 degree + UV_D135_PRED, // Directional 135 degree + UV_D113_PRED, // Directional 113 degree + UV_D157_PRED, // Directional 157 degree + UV_D203_PRED, // Directional 203 degree + UV_D67_PRED, // Directional 67 degree + UV_SMOOTH_PRED, // Combination of horizontal and vertical interpolation + UV_SMOOTH_V_PRED, // Vertical interpolation + UV_SMOOTH_H_PRED, // Horizontal interpolation + UV_PAETH_PRED, // Predict from the direction of smallest gradient + UV_CFL_PRED, // Chroma-from-Luma + UV_INTRA_MODES, + UV_MODE_INVALID, // For uv_mode in inter blocks +} UENUM1BYTE(UV_PREDICTION_MODE); + +enum { + SIMPLE_TRANSLATION, + OBMC_CAUSAL, // 2-sided OBMC + WARPED_CAUSAL, // 2-sided WARPED + MOTION_MODES +} UENUM1BYTE(MOTION_MODE); + +enum { + II_DC_PRED, + II_V_PRED, + II_H_PRED, + II_SMOOTH_PRED, + INTERINTRA_MODES +} UENUM1BYTE(INTERINTRA_MODE); + +enum { + COMPOUND_AVERAGE, + COMPOUND_DISTWTD, + COMPOUND_WEDGE, + COMPOUND_DIFFWTD, + COMPOUND_TYPES, + MASKED_COMPOUND_TYPES = 2, +} UENUM1BYTE(COMPOUND_TYPE); + +enum { + FILTER_DC_PRED, + FILTER_V_PRED, + FILTER_H_PRED, + FILTER_D157_PRED, + FILTER_PAETH_PRED, + FILTER_INTRA_MODES, +} UENUM1BYTE(FILTER_INTRA_MODE); + +enum { + SEQ_LEVEL_2_0, + SEQ_LEVEL_2_1, + SEQ_LEVEL_2_2, + SEQ_LEVEL_2_3, + SEQ_LEVEL_3_0, + SEQ_LEVEL_3_1, + SEQ_LEVEL_3_2, + SEQ_LEVEL_3_3, + SEQ_LEVEL_4_0, + SEQ_LEVEL_4_1, + SEQ_LEVEL_4_2, + SEQ_LEVEL_4_3, + SEQ_LEVEL_5_0, + SEQ_LEVEL_5_1, + SEQ_LEVEL_5_2, + SEQ_LEVEL_5_3, + SEQ_LEVEL_6_0, + SEQ_LEVEL_6_1, + SEQ_LEVEL_6_2, + SEQ_LEVEL_6_3, + SEQ_LEVEL_7_0, + SEQ_LEVEL_7_1, + SEQ_LEVEL_7_2, + SEQ_LEVEL_7_3, + SEQ_LEVELS, + SEQ_LEVEL_MAX = 31 +} UENUM1BYTE(AV1_LEVEL); + +#define LEVEL_BITS 5 + +#define DIRECTIONAL_MODES 8 +#define MAX_ANGLE_DELTA 3 +#define ANGLE_STEP 3 + +#define INTER_MODES (1 + NEWMV - NEARESTMV) + +#define INTER_COMPOUND_MODES (1 + NEW_NEWMV - NEAREST_NEARESTMV) + +#define SKIP_CONTEXTS 3 +#define SKIP_MODE_CONTEXTS 3 + +#define COMP_INDEX_CONTEXTS 6 +#define COMP_GROUP_IDX_CONTEXTS 6 + +#define NMV_CONTEXTS 3 + +#define NEWMV_MODE_CONTEXTS 6 +#define GLOBALMV_MODE_CONTEXTS 2 +#define REFMV_MODE_CONTEXTS 6 +#define DRL_MODE_CONTEXTS 3 + +#define GLOBALMV_OFFSET 3 +#define REFMV_OFFSET 4 + +#define NEWMV_CTX_MASK ((1 << GLOBALMV_OFFSET) - 1) +#define GLOBALMV_CTX_MASK ((1 << (REFMV_OFFSET - GLOBALMV_OFFSET)) - 1) +#define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1) + +#define COMP_NEWMV_CTXS 5 +#define INTER_MODE_CONTEXTS 8 + +#define DELTA_Q_SMALL 3 +#define DELTA_Q_PROBS (DELTA_Q_SMALL) +#define DEFAULT_DELTA_Q_RES_PERCEPTUAL 4 +#define DEFAULT_DELTA_Q_RES_OBJECTIVE 4 + +#define DELTA_LF_SMALL 3 +#define DELTA_LF_PROBS (DELTA_LF_SMALL) +#define DEFAULT_DELTA_LF_RES 2 + +/* Segment Feature Masks */ +#define MAX_MV_REF_CANDIDATES 2 + +#define MAX_REF_MV_STACK_SIZE 8 +#define USABLE_REF_MV_STACK_SIZE 4 +#define REF_CAT_LEVEL 640 + +#define INTRA_INTER_CONTEXTS 4 +#define COMP_INTER_CONTEXTS 5 +#define REF_CONTEXTS 3 + +#define COMP_REF_TYPE_CONTEXTS 5 +#define UNI_COMP_REF_CONTEXTS 3 + +#define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 3) +typedef uint8_t TXFM_CONTEXT; + +// An enum for single reference types (and some derived values). +enum { + NONE_FRAME = -1, + INTRA_FRAME, + LAST_FRAME, + LAST2_FRAME, + LAST3_FRAME, + GOLDEN_FRAME, + BWDREF_FRAME, + ALTREF2_FRAME, + ALTREF_FRAME, + REF_FRAMES, + + // Extra/scratch reference frame. 
It may be:
+  // - used to update the ALTREF2_FRAME ref (see lshift_bwd_ref_frames()), or
+  // - updated from ALTREF2_FRAME ref (see rshift_bwd_ref_frames()).
+  EXTREF_FRAME = REF_FRAMES,
+
+  // Number of inter (non-intra) reference types.
+  INTER_REFS_PER_FRAME = ALTREF_FRAME - LAST_FRAME + 1,
+
+  // Number of forward (aka past) reference types.
+  FWD_REFS = GOLDEN_FRAME - LAST_FRAME + 1,
+
+  // Number of backward (aka future) reference types.
+  BWD_REFS = ALTREF_FRAME - BWDREF_FRAME + 1,
+
+  SINGLE_REFS = FWD_REFS + BWD_REFS,
+};
+
+#define REF_FRAMES_LOG2 3
+
+// REF_FRAMES for the cm->ref_frame_map array, 1 scratch frame for the new
+// frame in cm->cur_frame, INTER_REFS_PER_FRAME for scaled references on the
+// encoder in the cpi->scaled_ref_buf array.
+#define FRAME_BUFFERS (REF_FRAMES + 1 + INTER_REFS_PER_FRAME)
+
+#define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
+#define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
+
+enum {
+  LAST_LAST2_FRAMES,      // { LAST_FRAME, LAST2_FRAME }
+  LAST_LAST3_FRAMES,      // { LAST_FRAME, LAST3_FRAME }
+  LAST_GOLDEN_FRAMES,     // { LAST_FRAME, GOLDEN_FRAME }
+  BWDREF_ALTREF_FRAMES,   // { BWDREF_FRAME, ALTREF_FRAME }
+  LAST2_LAST3_FRAMES,     // { LAST2_FRAME, LAST3_FRAME }
+  LAST2_GOLDEN_FRAMES,    // { LAST2_FRAME, GOLDEN_FRAME }
+  LAST3_GOLDEN_FRAMES,    // { LAST3_FRAME, GOLDEN_FRAME }
+  BWDREF_ALTREF2_FRAMES,  // { BWDREF_FRAME, ALTREF2_FRAME }
+  ALTREF2_ALTREF_FRAMES,  // { ALTREF2_FRAME, ALTREF_FRAME }
+  TOTAL_UNIDIR_COMP_REFS,
+  // NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs
+  // that are explicitly signaled.
+  UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1,
+} UENUM1BYTE(UNIDIR_COMP_REF);
+
+#define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS)
+
+#define COMP_REFS (FWD_REFS * BWD_REFS + UNIDIR_COMP_REFS)
+
+// NOTE: A limited number of unidirectional reference pairs can be signaled for
+// compound prediction. The use of skip mode, on the other hand, makes it
+// possible to have a reference pair not listed for explicit signaling.
+#define MODE_CTX_REF_FRAMES (REF_FRAMES + TOTAL_COMP_REFS)
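+
+// Editorial note (not part of the patch): plugging the single-reference enum
+// above into these macros gives FWD_REFS = GOLDEN_FRAME - LAST_FRAME + 1 = 4,
+// BWD_REFS = ALTREF_FRAME - BWDREF_FRAME + 1 = 3, SINGLE_REFS = 7,
+// TOTAL_COMP_REFS = 4 * 3 + 9 = 21, and MODE_CTX_REF_FRAMES = 8 + 21 = 29.
+// For example, FWD_RF_OFFSET(GOLDEN_FRAME) == 3 and
+// BWD_RF_OFFSET(ALTREF_FRAME) == 2.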
+
+// Note: This includes both single and compound references, so it can take
+// values from NONE_FRAME to (MODE_CTX_REF_FRAMES - 1). Hence, it is not
+// defined as an enum.
+typedef int8_t MV_REFERENCE_FRAME;
+
+enum {
+  RESTORE_NONE,
+  RESTORE_WIENER,
+  RESTORE_SGRPROJ,
+  RESTORE_SWITCHABLE,
+  RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE,
+  RESTORE_TYPES = 4,
+} UENUM1BYTE(RestorationType);
+
+// Picture prediction structures (0-13 are predefined) in scalability metadata.
+enum {
+  SCALABILITY_L1T2 = 0,
+  SCALABILITY_L1T3 = 1,
+  SCALABILITY_L2T1 = 2,
+  SCALABILITY_L2T2 = 3,
+  SCALABILITY_L2T3 = 4,
+  SCALABILITY_S2T1 = 5,
+  SCALABILITY_S2T2 = 6,
+  SCALABILITY_S2T3 = 7,
+  SCALABILITY_L2T1h = 8,
+  SCALABILITY_L2T2h = 9,
+  SCALABILITY_L2T3h = 10,
+  SCALABILITY_S2T1h = 11,
+  SCALABILITY_S2T2h = 12,
+  SCALABILITY_S2T3h = 13,
+  SCALABILITY_SS = 14
+} UENUM1BYTE(SCALABILITY_STRUCTURES);
+
+#define SUPERRES_SCALE_BITS 3
+#define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1)
+
+// In large_scale_tile coding, external references are used.
+#define MAX_EXTERNAL_REFERENCES 128
+#define MAX_TILES 512
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_ENUMS_H_
diff --git a/libs/libaom/src/av1/common/filter.h b/libs/libaom/src/av1/common/filter.h
new file mode 100644
index 000000000..91791d3dc
--- /dev/null
+++ b/libs/libaom/src/av1/common/filter.h
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_FILTER_H_
+#define AOM_AV1_COMMON_FILTER_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_FILTER_TAP 8
+
+typedef enum ATTRIBUTE_PACKED {
+  EIGHTTAP_REGULAR,
+  EIGHTTAP_SMOOTH,
+  MULTITAP_SHARP,
+  BILINEAR,
+  INTERP_FILTERS_ALL,
+  SWITCHABLE_FILTERS = BILINEAR,
+  SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */
+  EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS,
+  INTERP_INVALID = 0xff,
+} InterpFilter;
+
+enum {
+  USE_2_TAPS_ORIG = 0,  // This is used in temporal filtering.
+  USE_2_TAPS,
+  USE_4_TAPS,
+  USE_8_TAPS,
+} UENUM1BYTE(SUBPEL_SEARCH_TYPE);
+
+enum {
+  INTERP_EVAL_LUMA_EVAL_CHROMA = 0,
+  INTERP_SKIP_LUMA_EVAL_CHROMA,
+  INTERP_EVAL_INVALID,
+  INTERP_SKIP_LUMA_SKIP_CHROMA,
+} UENUM1BYTE(INTERP_EVAL_PLANE);
+
+enum {
+  INTERP_HORZ_NEQ_VERT_NEQ = 0,
+  INTERP_HORZ_EQ_VERT_NEQ,
+  INTERP_HORZ_NEQ_VERT_EQ,
+  INTERP_HORZ_EQ_VERT_EQ,
+  INTERP_PRED_TYPE_ALL,
+} UENUM1BYTE(INTERP_PRED_TYPE);
+// Pack two InterpFilter values into a uint32_t: since there are at most 10
+// filters, we can use 16 bits for each and have more than enough space. This
+// reduces argument passing and unifies the operation of setting a (pair of)
+// filters.
+typedef struct InterpFilters {
+  uint16_t y_filter;
+  uint16_t x_filter;
+} InterpFilters;
+
+typedef union int_interpfilters {
+  uint32_t as_int;
+  InterpFilters as_filters;
+} int_interpfilters;
+
+static INLINE InterpFilter av1_extract_interp_filter(int_interpfilters filters,
+                                                     int dir) {
+  return (InterpFilter)((dir) ? filters.as_filters.x_filter
+                              : filters.as_filters.y_filter);
+}
+
+static INLINE int_interpfilters
+av1_broadcast_interp_filter(InterpFilter filter) {
+  int_interpfilters filters;
+  filters.as_filters.x_filter = filter;
+  filters.as_filters.y_filter = filter;
+  return filters;
+}
+
+static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) {
+  return filter == SWITCHABLE ?
EIGHTTAP_REGULAR : filter; +} + +/* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */ +#define LOG_SWITCHABLE_FILTERS 2 + +#define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4) +#define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1) +#define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2) +#define ALLOW_ALL_INTERP_FILT_MASK (0x01ff) + +typedef struct InterpFilterParams { + const int16_t *filter_ptr; + uint16_t taps; + uint16_t subpel_shifts; + InterpFilter interp_filter; +} InterpFilterParams; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_bilinear_filters[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, + { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 }, + { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 }, + { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 }, + { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 }, + { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 }, + { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 }, + { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 } +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_8[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 }, + { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 }, + { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 }, + { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 }, + { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 }, + { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 }, + { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 }, + { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 } +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 }, + { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 }, + { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 }, + { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 }, + { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 }, + { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 }, + { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 }, + { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 } +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 }, + { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, + { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, + { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 }, + { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 }, + { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, + { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, + { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 } +}; + +static const InterpFilterParams + av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = { + { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS, + EIGHTTAP_REGULAR }, + { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS, + SUBPEL_SHIFTS, EIGHTTAP_SMOOTH }, + { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS, + MULTITAP_SHARP }, + { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, 
SUBPEL_SHIFTS, + BILINEAR } + }; + +// A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel +// MV for luma. If sub-sampling exists, chroma may possibly use half-pel MV. +DECLARE_ALIGNED(256, static const int16_t, + av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = { + 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static const InterpFilterParams av1_intrabc_filter_params = { + av1_intrabc_bilinear_filter, 2, 0, BILINEAR +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 }, + { 0, 0, -8, 122, 18, -4, 0, 0 }, { 0, 0, -10, 116, 28, -6, 0, 0 }, + { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 }, + { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 }, + { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 }, + { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 }, + { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 }, + { 0, 0, -4, 18, 122, -8, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 } +}; +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 30, 62, 34, 2, 0, 0 }, + { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, + { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, + { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 }, + { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 }, + { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, + { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, + { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 } +}; + +static const uint16_t + av1_interp_dual_filt_mask[INTERP_PRED_TYPE_ALL - 2][SWITCHABLE_FILTERS] = { + { (1 << REG_REG) | (1 << SMOOTH_REG) | (1 << SHARP_REG), + (1 << REG_SMOOTH) | (1 << SMOOTH_SMOOTH) | (1 << SHARP_SMOOTH), + (1 << REG_SHARP) | (1 << SMOOTH_SHARP) | (1 << SHARP_SHARP) }, + { (1 << REG_REG) | (1 << REG_SMOOTH) | (1 << REG_SHARP), + (1 << SMOOTH_REG) | (1 << SMOOTH_SMOOTH) | (1 << SMOOTH_SHARP), + (1 << SHARP_REG) | (1 << SHARP_SMOOTH) | (1 << SHARP_SHARP) } + }; + +// For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR +static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = { + { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS, + EIGHTTAP_REGULAR }, + { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS, + EIGHTTAP_SMOOTH }, + { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS, + EIGHTTAP_REGULAR }, + { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS, + BILINEAR }, +}; + +static INLINE const InterpFilterParams * +av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter, + const int w) { + if (w <= 4) return &av1_interp_4tap[interp_filter]; + return &av1_interp_filter_params_list[interp_filter]; +} + +static INLINE const int16_t *av1_get_interp_filter_kernel( + const InterpFilter interp_filter, int subpel_search) { + assert(subpel_search >= USE_2_TAPS); + return (subpel_search == USE_2_TAPS) + ? av1_interp_4tap[BILINEAR].filter_ptr + : ((subpel_search == USE_4_TAPS) + ? 
av1_interp_4tap[interp_filter].filter_ptr
+                : av1_interp_filter_params_list[interp_filter].filter_ptr);
+}
+
+static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
+    const InterpFilterParams *const filter_params, const int subpel) {
+  return filter_params->filter_ptr + filter_params->taps * subpel;
+}
+
+static INLINE const InterpFilterParams *av1_get_filter(int subpel_search) {
+  assert(subpel_search >= USE_2_TAPS);
+
+  switch (subpel_search) {
+    case USE_2_TAPS: return &av1_interp_4tap[BILINEAR];
+    case USE_4_TAPS: return &av1_interp_4tap[EIGHTTAP_REGULAR];
+    case USE_8_TAPS: return &av1_interp_filter_params_list[EIGHTTAP_REGULAR];
+    default: assert(0); return NULL;
+  }
+}
+
+static INLINE void reset_interp_filter_allowed_mask(
+    uint16_t *allow_interp_mask, DUAL_FILTER_TYPE filt_type) {
+  uint16_t tmp = (~(1 << filt_type)) & 0xffff;
+  *allow_interp_mask &= (tmp & ALLOW_ALL_INTERP_FILT_MASK);
+}
+
+static INLINE void set_interp_filter_allowed_mask(uint16_t *allow_interp_mask,
+                                                  DUAL_FILTER_TYPE filt_type) {
+  *allow_interp_mask |= (1 << filt_type);
+}
+
+static INLINE uint8_t get_interp_filter_allowed_mask(
+    uint16_t allow_interp_mask, DUAL_FILTER_TYPE filt_type) {
+  return (allow_interp_mask >> filt_type) & 1;
+}
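+
+// Editorial note (not part of this header): a minimal sketch of how the
+// allowed-mask helpers above compose. ALLOW_ALL_INTERP_FILT_MASK (0x01ff)
+// has one bit per DUAL_FILTER_TYPE pair:
+//
+//   uint16_t mask = ALLOW_ALL_INTERP_FILT_MASK;            // all 9 pairs
+//   reset_interp_filter_allowed_mask(&mask, SHARP_SHARP);  // drop one pair
+//   assert(get_interp_filter_allowed_mask(mask, SHARP_SHARP) == 0);
+//   set_interp_filter_allowed_mask(&mask, SHARP_SHARP);    // re-allow it
+//   assert(get_interp_filter_allowed_mask(mask, REG_REG) == 1);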
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_FILTER_H_
diff --git a/libs/libaom/src/av1/common/frame_buffers.c b/libs/libaom/src/av1/common/frame_buffers.c
new file mode 100644
index 000000000..f10ccd594
--- /dev/null
+++ b/libs/libaom/src/av1/common/frame_buffers.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/frame_buffers.h"
+#include "aom_mem/aom_mem.h"
+
+int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list) {
+  assert(list != NULL);
+  av1_free_internal_frame_buffers(list);
+
+  list->num_internal_frame_buffers =
+      AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+  list->int_fb = (InternalFrameBuffer *)aom_calloc(
+      list->num_internal_frame_buffers, sizeof(*list->int_fb));
+  if (list->int_fb == NULL) {
+    list->num_internal_frame_buffers = 0;
+    return 1;
+  }
+  return 0;
+}
+
+void av1_free_internal_frame_buffers(InternalFrameBufferList *list) {
+  int i;
+
+  assert(list != NULL);
+
+  for (i = 0; i < list->num_internal_frame_buffers; ++i) {
+    aom_free(list->int_fb[i].data);
+    list->int_fb[i].data = NULL;
+  }
+  aom_free(list->int_fb);
+  list->int_fb = NULL;
+  list->num_internal_frame_buffers = 0;
+}
+
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) {
+  int i;
+
+  assert(list != NULL);
+
+  for (i = 0; i < list->num_internal_frame_buffers; ++i) {
+    if (list->int_fb[i].data && !list->int_fb[i].in_use)
+      memset(list->int_fb[i].data, 0, list->int_fb[i].size);
+  }
+}
+
+int av1_get_frame_buffer(void *cb_priv, size_t min_size,
+                         aom_codec_frame_buffer_t *fb) {
+  int i;
+  InternalFrameBufferList *const int_fb_list =
+      (InternalFrameBufferList *)cb_priv;
+  if (int_fb_list == NULL) return -1;
+
+  // Find a free frame buffer.
+  for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) {
+    if (!int_fb_list->int_fb[i].in_use) break;
+  }
+
+  if (i == int_fb_list->num_internal_frame_buffers) return -1;
+
+  if (int_fb_list->int_fb[i].size < min_size) {
+    aom_free(int_fb_list->int_fb[i].data);
+    // The data must be zeroed to fix a valgrind error from the C loop filter
+    // due to accessing uninitialized memory in the frame border. This could
+    // be skipped if the border were totally removed.
+    int_fb_list->int_fb[i].data = (uint8_t *)aom_calloc(1, min_size);
+    if (!int_fb_list->int_fb[i].data) {
+      int_fb_list->int_fb[i].size = 0;
+      return -1;
+    }
+    int_fb_list->int_fb[i].size = min_size;
+  }
+
+  fb->data = int_fb_list->int_fb[i].data;
+  fb->size = int_fb_list->int_fb[i].size;
+  int_fb_list->int_fb[i].in_use = 1;
+
+  // Set the frame buffer's private data to point at the internal frame buffer.
+  fb->priv = &int_fb_list->int_fb[i];
+  return 0;
+}
+
+int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) {
+  InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv;
+  (void)cb_priv;
+  if (int_fb) int_fb->in_use = 0;
+  return 0;
+}
diff --git a/libs/libaom/src/av1/common/frame_buffers.h b/libs/libaom/src/av1/common/frame_buffers.h
new file mode 100644
index 000000000..16188e51c
--- /dev/null
+++ b/libs/libaom/src/av1/common/frame_buffers.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_FRAME_BUFFERS_H_
+#define AOM_AV1_COMMON_FRAME_BUFFERS_H_
+
+#include "aom/aom_frame_buffer.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct InternalFrameBuffer {
+  uint8_t *data;
+  size_t size;
+  int in_use;
+} InternalFrameBuffer;
+
+typedef struct InternalFrameBufferList {
+  int num_internal_frame_buffers;
+  InternalFrameBuffer *int_fb;
+} InternalFrameBufferList;
+
+// Initializes |list|. Returns 0 on success.
+int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Frees any data allocated to the frame buffers.
+void av1_free_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Zeros all unused internal frame buffers. In particular, this zeros the
+// frame borders. Call this function after a sequence header change to
+// re-initialize the frame borders for the different width, height, or bit
+// depth.
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Callback used by libaom to request an external frame buffer. |cb_priv| is
+// the callback's private data, which points to an InternalFrameBufferList.
+// |min_size| is the minimum size in bytes needed to decode the next frame.
+// |fb| is a pointer to the frame buffer.
+int av1_get_frame_buffer(void *cb_priv, size_t min_size,
+                         aom_codec_frame_buffer_t *fb);
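+
+// Editorial note (not part of this header): a minimal usage sketch. These two
+// callbacks (av1_release_frame_buffer() is declared just below) are meant to
+// be registered with a decoder through the public
+// aom_codec_set_frame_buffer_functions() API, before the first frame is
+// decoded. |ctx| is assumed to be an initialized aom_codec_ctx_t:
+//
+//   InternalFrameBufferList list;
+//   memset(&list, 0, sizeof(list));
+//   if (av1_alloc_internal_frame_buffers(&list)) { /* allocation failed */ }
+//   aom_codec_set_frame_buffer_functions(&ctx, av1_get_frame_buffer,
+//                                        av1_release_frame_buffer, &list);
+//   // ... decode frames ...
+//   av1_free_internal_frame_buffers(&list);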
+
+// Callback used by libaom when there are no more references to the frame
+// buffer. |cb_priv| is not used. |fb| is a pointer to the frame buffer.
+int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_FRAME_BUFFERS_H_
diff --git a/libs/libaom/src/av1/common/idct.c b/libs/libaom/src/av1/common/idct.c
new file mode 100644
index 000000000..bff438f3c
--- /dev/null
+++ b/libs/libaom/src/av1/common/idct.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+
+int av1_get_tx_scale(const TX_SIZE tx_size) {
+  const int pels = tx_size_2d[tx_size];
+  // Largest possible pels is 4096 (64x64).
+  // E.g. 16x16 (256 pels) gives scale 0, 32x32 (1024 pels) gives 1,
+  // 64x64 (4096 pels) gives 2.
+  return (pels > 256) + (pels > 1024);
+}
+
+// NOTE: The implementation of all inverses needs to be aware of the fact
+// that input and output could be the same buffer.
+
+// idct
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  if (eob > 1)
+    av1_highbd_iwht4x4_16_add(input, dest, stride, bd);
+  else
+    av1_highbd_iwht4x4_1_add(input, dest, stride, bd);
+}
+
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  int eob = txfm_param->eob;
+  int bd = txfm_param->bd;
+  int lossless = txfm_param->lossless;
+  const int32_t *src = cast_to_int32(input);
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  if (lossless) {
+    assert(tx_type == DCT_DCT);
+    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
+    return;
+  }
+
+  av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                           bd);
+}
+
+void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  const int32_t *src = cast_to_int32(input);
+  av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                           txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  const int32_t *src = cast_to_int32(input);
+  av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                           txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  const int32_t *src = cast_to_int32(input);
+  av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  const int32_t *src = cast_to_int32(input);
+  av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_32x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_64x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_16x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_64x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + + av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); +} + +void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + + av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} + +void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void 
av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + + av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} + +void av1_highbd_inv_txfm_add_64x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + assert(tx_type == DCT_DCT); + av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} + +static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, + TX_TYPE tx_type, int eob, int reduced_tx_set, + TxfmParam *txfm_param) { + (void)plane; + txfm_param->tx_type = tx_type; + txfm_param->tx_size = tx_size; + txfm_param->eob = eob; + txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id]; + txfm_param->bd = xd->bd; + txfm_param->is_hbd = is_cur_buf_hbd(xd); + txfm_param->tx_set_type = av1_get_ext_tx_set_type( + txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); +} + +void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_32X32: + av1_highbd_inv_txfm_add_32x32_c(input, dest, stride, txfm_param); + break; + case TX_16X16: + av1_highbd_inv_txfm_add_16x16_c(input, dest, stride, txfm_param); + break; + case TX_8X8: + av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param); + break; + case TX_4X8: + av1_highbd_inv_txfm_add_4x8_c(input, dest, stride, txfm_param); + break; + case TX_8X4: + av1_highbd_inv_txfm_add_8x4_c(input, dest, stride, txfm_param); + break; + case TX_8X16: + av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param); + break; + case TX_16X8: + av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param); + break; + case TX_16X32: + av1_highbd_inv_txfm_add_16x32_c(input, dest, stride, txfm_param); + break; + case TX_32X16: + av1_highbd_inv_txfm_add_32x16_c(input, dest, stride, txfm_param); + break; + case TX_64X64: + av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param); + break; + case TX_32X64: + av1_highbd_inv_txfm_add_32x64_c(input, dest, stride, txfm_param); + break; + case TX_64X32: + av1_highbd_inv_txfm_add_64x32_c(input, dest, stride, txfm_param); + break; + case TX_16X64: + av1_highbd_inv_txfm_add_16x64_c(input, dest, stride, txfm_param); + break; + case TX_64X16: + av1_highbd_inv_txfm_add_64x16_c(input, dest, stride, txfm_param); + break; + case TX_4X4: + // this is like av1_short_idct4x4 but has a special case around eob<=1 + // which is significant (not just an optimization) for the lossless + // case. 
+      av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param);
+      break;
+    case TX_16X4:
+      av1_highbd_inv_txfm_add_16x4_c(input, dest, stride, txfm_param);
+      break;
+    case TX_4X16:
+      av1_highbd_inv_txfm_add_4x16_c(input, dest, stride, txfm_param);
+      break;
+    case TX_8X32:
+      av1_highbd_inv_txfm_add_8x32_c(input, dest, stride, txfm_param);
+      break;
+    case TX_32X8:
+      av1_highbd_inv_txfm_add_32x8_c(input, dest, stride, txfm_param);
+      break;
+    default: assert(0 && "Invalid transform size"); break;
+  }
+}
+
+void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+                        const TxfmParam *txfm_param) {
+  const TX_SIZE tx_size = txfm_param->tx_size;
+  DECLARE_ALIGNED(32, uint16_t, tmp[MAX_TX_SQUARE]);
+  int tmp_stride = MAX_TX_SIZE;
+  int w = tx_size_wide[tx_size];
+  int h = tx_size_high[tx_size];
+  // Widen the 8-bit destination into a 16-bit scratch block so the shared
+  // high-bitdepth inverse transform path can be reused for the 8-bit case.
+  for (int r = 0; r < h; ++r) {
+    for (int c = 0; c < w; ++c) {
+      tmp[r * tmp_stride + c] = dst[r * stride + c];
+    }
+  }
+
+  av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
+                          txfm_param);
+
+  // Narrow the result back to 8 bits. With 8-bit parameters the highbd path
+  // clamps its output to [0, 255], so the cast is lossless.
+  for (int r = 0; r < h; ++r) {
+    for (int c = 0; c < w; ++c) {
+      dst[r * stride + c] = (uint8_t)tmp[r * tmp_stride + c];
+    }
+  }
+}
+
+void av1_inverse_transform_block(const MACROBLOCKD *xd,
+                                 const tran_low_t *dqcoeff, int plane,
+                                 TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
+                                 int stride, int eob, int reduced_tx_set) {
+  if (!eob) return;
+
+  assert(eob <= av1_get_max_eob(tx_size));
+
+  TxfmParam txfm_param;
+  init_txfm_param(xd, plane, tx_size, tx_type, eob, reduced_tx_set,
+                  &txfm_param);
+  assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]);
+
+  if (txfm_param.is_hbd) {
+    av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+  } else {
+    av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+  }
+}
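+
+// Editorial note (not part of the patch): the TX_SIZE switch in
+// av1_highbd_inv_txfm_add_c() above is a straightforward dispatch. A
+// hypothetical table-driven equivalent, assuming the
+// av1_highbd_inv_txfm_add_*_c handlers defined in this file, could look like:
+//
+//   typedef void (*InvTxfmAddFn)(const tran_low_t *input, uint8_t *dest,
+//                                int stride, const TxfmParam *txfm_param);
+//   static const InvTxfmAddFn inv_txfm_add_fns[TX_SIZES_ALL] = {
+//     [TX_4X4] = av1_highbd_inv_txfm_add_4x4_c,
+//     [TX_8X8] = av1_highbd_inv_txfm_add_8x8_c,
+//     [TX_16X16] = av1_highbd_inv_txfm_add_16x16_c,
+//     // ... one entry per TX_SIZE, as in the switch above ...
+//   };
+//   // inv_txfm_add_fns[txfm_param->tx_size](input, dest, stride, txfm_param);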
diff --git a/libs/libaom/src/av1/common/idct.h b/libs/libaom/src/av1/common/idct.h
new file mode 100644
index 000000000..004d25d49
--- /dev/null
+++ b/libs/libaom/src/av1/common/idct.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_IDCT_H_
+#define AOM_AV1_COMMON_IDCT_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "aom_dsp/txfm_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*transform_1d)(const tran_low_t *, tran_low_t *);
+
+typedef struct {
+  transform_1d cols, rows;  // vertical and horizontal
+} transform_2d;
+
+#define MAX_TX_SCALE 1
+int av1_get_tx_scale(const TX_SIZE tx_size);
+
+void av1_inverse_transform_block(const MACROBLOCKD *xd,
+                                 const tran_low_t *dqcoeff, int plane,
+                                 TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
+                                 int stride, int eob, int reduced_tx_set);
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+
+static INLINE const int32_t *cast_to_int32(const tran_low_t *input) {
+  assert(sizeof(int32_t) == sizeof(tran_low_t));
+  return (const int32_t *)input;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_IDCT_H_
diff --git a/libs/libaom/src/av1/common/loopfiltermask.c b/libs/libaom/src/av1/common/loopfiltermask.c
new file mode 100644
index 000000000..157310f2d
--- /dev/null
+++ b/libs/libaom/src/av1/common/loopfiltermask.c
@@ -0,0 +1,1458 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+
+// 256 bit masks (64x64 / 4x4) for left transform size for Y plane.
+// We use 4 uint64_t to represent the 256 bit.
+// Each 1 represents a position where we should apply a loop filter
+// across the left border of a 4x4 block boundary.
+//
+// In the case of TX_8X8, reading the bytes low-order first, we end up
+// with a mask that looks like this (-- and | are used for better view):
+//
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// -----------------
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+//
+// A loopfilter should be applied to every other 4x4 horizontally.
+
+// 256 bit masks (64x64 / 4x4) for above transform size for Y plane.
+// We use 4 uint64_t to represent the 256 bit.
+// Each 1 represents a position where we should apply a loop filter
+// across the top border of a 4x4 block boundary.
+// +// In the case of TX_8x8-> ( in low order byte first we end up with +// a mask that looks like this +// +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// ----------------- +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// +// A loopfilter should be applied to every other 4x4 horizontally. +#if CONFIG_LPF_MASK +static const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18 +}; + +static const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = { + -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13 +}; + +static const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = { + -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8 +}; + +static const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, + 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1 +}; +static const int mask_id_table_vert_border[BLOCK_SIZES_ALL] = { + 0, 47, 49, 19, 51, 53, 33, 55, 57, 42, 59, + 60, 46, -1, -1, -1, 61, 62, 63, 64, 65, 66 +}; + +static const FilterMask left_mask_univariant_reordered[67] = { + // TX_4X4 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X4, TX_4X4 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_4X4 + { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_4X4 + { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_4X4 + { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, + 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, + 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_4X4 + { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_4X4 + { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 
0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, + 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4 + { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_4X4 + // TX_8X8 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_8X8 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X8 + { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_8X8 + { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_8X8 + { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_8X8 + { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_8X8 + { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_8X8 + { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL, + 0x0055005500550055ULL } }, // block size 32X64, TX_8X8 + { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_8X8 + { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL, + 0x5555555555555555ULL } }, // block size 64X64, TX_8X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X8 + { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_8X8 + { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL, + 0x0005000500050005ULL } }, // block size 16X64, TX_8X8 + { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_8X8 + // TX_16X16 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_16X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X16 + { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_16X16 + { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_16X16 + { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL, + 0x0011001100110011ULL } }, // block size 32X64, TX_16X16 + { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_16X16 + { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL, + 0x1111111111111111ULL } }, // block size 64X64, TX_16X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X16 + { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_16X16 + // TX_32X32 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 
32X32, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL, + 0x0101010101010101ULL } }, // block size 32X64, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL, + 0x0101010101010101ULL } }, // block size 64X64, TX_32X32 + // TX_64X64 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 64X64, TX_64X64 + // 2:1, 1:2 transform sizes. + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X8 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X8 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_8X4 + { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_8X4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X16 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_16X8 + { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_16X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X32 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X32 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_32X16 + { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_32X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 32X64, TX_32X64 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_64X32 + // 4:1, 1:4 transform sizes. 
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X16 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_16X4 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X32 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_32X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X64 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_64X16 +}; + +static const FilterMask above_mask_univariant_reordered[67] = { + // TX_4X4 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X4, TX_4X4 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_4X4 + { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_4X4 + { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_4X4 + { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, + 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, + 0xffffffffffffffffULL } }, // block size 64X64, TX_4x4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_4X4 + { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_4X4 + { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, + 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4 + { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_4X4 + // TX_8X8 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_8X8 + { { 0x0000000300000003ULL, 
0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X8 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_8X8 + { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_8X8 + { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_8X8 + { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL, + 0x000000ff000000ffULL } }, // block size 32X64, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, + 0x0000ffff0000ffffULL } }, // block size 64X64, TX_8X8 + { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X8 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL, + 0x0000000f0000000fULL } }, // block size 16X64, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_8X8 + // TX_16X16 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_16X16 + { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X16 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_16X16 + { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_16X16 + { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL, + 0x00000000000000ffULL } }, // block size 32X64, TX_16X16 + { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_16X16 + { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL, + 0x000000000000ffffULL } }, // block size 64X64, TX_16X16 + { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL, + 0x000000000000000fULL } }, // block size 16X64, TX_16X16 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_16X16 + // TX_32X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_32X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL, + 0x0000000000000000ULL } }, // block size 32X64, TX_32X32 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_32X32 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL, + 0x0000000000000000ULL } }, // block size 64X64, TX_32X32 + // TX_64X64 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 
0x0000000000000000ULL } }, // block size 64X64, TX_64X64 + // 2:1, 1:2 transform sizes. + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X8 + { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X8 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_8X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_8X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X16 + { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X16 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_16X8 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_16X8 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X32 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL, + 0x0000000000000000ULL } }, // block size 16X64, TX_16X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_32X16 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_32X16 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X64, TX_32X64 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_64X32 + // 4:1, 1:4 transform sizes. 
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X16
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_16X4
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X32
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_32X8
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X64, TX_16X64
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_64X16
+};
+
+static LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm,
+                                            int mi_row, int mi_col) {
+  assert(cm->lf.lfm != NULL);
+  const int row = mi_row >> MIN_MIB_SIZE_LOG2;  // 64x64
+  const int col = mi_col >> MIN_MIB_SIZE_LOG2;
+  return &cm->lf.lfm[row * cm->lf.lfm_stride + col];
+}
+
+typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit,
+                        const uint8_t *limit, const uint8_t *thresh);
+
+typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0,
+                            const uint8_t *limit0, const uint8_t *thresh0,
+                            const uint8_t *blimit1, const uint8_t *limit1,
+                            const uint8_t *thresh1);
+
+typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh,
+                           int bd);
+
+typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1, int bd);
+// A 64x64 block contains 256 4x4 tx blocks, so 256 bits are needed to
+// represent them. Each group of 4 rows is represented by one uint64_t mask;
+// hence four uint64_t values (bitmask[4]) cover the whole 64x64 block.
+//
+// Given a location (mi_col, mi_row), this function returns the index
+// (0, 1, 2, or 3) selecting which bitmask[] to use, and the shift value.
+//
+// mi_row is the row offset in mi units (4 pixels each), so
+// (mi_row / 4) selects the uint64_t. Within that uint64_t, mi_row % 4 is
+// the row offset, and each row holds 16 = 1 << stride_log2 4x4 units.
+// Therefore, shift = (row << stride_log2) + mi_col.
+int get_index_shift(int mi_col, int mi_row, int *index) {
+  // *index = mi_row >> 2;
+  // rows = mi_row % 4;
+  // stride_log2 = 4;
+  // shift = (rows << stride_log2) + mi_col;
+  *index = mi_row >> 2;
+  return ((mi_row & 3) << 4) | mi_col;
+}
+
+static void filter_selectively_vert_row2(
+    int subsampling_factor, uint8_t *s, int pitch, int plane,
+    uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
+    uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
+    const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) {
+  uint64_t mask;
+  const int step = 1 << subsampling_factor;
+
+  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
+              mask_8x8_1 | mask_4x4_1;
+       mask; mask >>= step) {
+    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
+
+    if (mask & 1) {
+      if ((mask_16x16_0 | mask_16x16_1) & 1) {
+        // the chroma plane filters fewer pixels (introduced in the
+        // deblock_13tap experiment)
+        LpfFunc lpf_vertical = plane ?
aom_lpf_vertical_6 : aom_lpf_vertical_14; + + if ((mask_16x16_0 & mask_16x16_1) & 1) { + if (plane) { + aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else { + aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } else if (mask_16x16_0 & 1) { + lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); + } else { + lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } + + if ((mask_8x8_0 | mask_8x8_1) & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8; + + if ((mask_8x8_0 & mask_8x8_1) & 1) { + if (plane) { + aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else { + aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } else if (mask_8x8_0 & 1) { + lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); + } else { + lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } + + if ((mask_4x4_0 | mask_4x4_1) & 1) { + if ((mask_4x4_0 & mask_4x4_1) & 1) { + aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else if (mask_4x4_0 & 1) { + aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); + } else { + aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } + } + + s += 4; + lfl += step; + lfl2 += step; + mask_16x16_0 >>= step; + mask_8x8_0 >>= step; + mask_4x4_0 >>= step; + mask_16x16_1 >>= step; + mask_8x8_1 >>= step; + mask_4x4_1 >>= step; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_filter_selectively_vert_row2( + int subsampling_factor, uint16_t *s, int pitch, int plane, + uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0, + uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1, + const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) { + uint64_t mask; + const int step = 1 << subsampling_factor; + + for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 | + mask_8x8_1 | mask_4x4_1; + mask; mask >>= step) { + const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; + const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2; + + if (mask & 1) { + if ((mask_16x16_0 | mask_16x16_1) & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + HbdLpfFunc highbd_lpf_vertical = + plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14; + + if ((mask_16x16_0 & mask_16x16_1) & 1) { + if (plane) { + aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } else { + aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } + } else if (mask_16x16_0 & 1) { + highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, + bd); + } else { + highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, bd); + } + } + + if ((mask_8x8_0 | mask_8x8_1) & 1) { + HbdLpfFunc highbd_lpf_vertical = + plane ? 
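filter_selectively_vert_row2() and its high-bit-depth twin below walk the OR of all six masks one 4x4 column at a time; per column they choose a dual kernel when both rows need the same filter strength and a single-row kernel otherwise. A toy model of that dispatch, with printf standing in for the aom_lpf_* kernels:

#include <stdio.h>
#include <stdint.h>

/* Toy version of the row-pair walk: bit i set in row0/row1 means the
 * vertical edge in 4x4 column i of that row needs filtering. */
static void demo_vert_row2(uint64_t row0, uint64_t row1, int step) {
  int col = 0;
  for (uint64_t mask = row0 | row1; mask; mask >>= step, ++col) {
    if (mask & 1) {
      if ((row0 & row1) & 1)
        printf("col %d: dual kernel, both rows at once\n", col);
      else if (row0 & 1)
        printf("col %d: single kernel, first row\n", col);
      else
        printf("col %d: single kernel, second row (s + 4 * pitch)\n", col);
    }
    row0 >>= step;
    row1 >>= step;
  }
}

int main(void) {
  demo_vert_row2(0xB /* 1011 */, 0x9 /* 1001 */, 1); /* step 1: luma plane */
  return 0;
}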
aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8; + + if ((mask_8x8_0 & mask_8x8_1) & 1) { + if (plane) { + aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } else { + aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } + } else if (mask_8x8_0 & 1) { + highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, + bd); + } else { + highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, bd); + } + } + + if ((mask_4x4_0 | mask_4x4_1) & 1) { + if ((mask_4x4_0 & mask_4x4_1) & 1) { + aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, bd); + } else if (mask_4x4_0 & 1) { + aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, bd); + } else { + aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } + } + } + + s += 4; + lfl += step; + lfl2 += step; + mask_16x16_0 >>= step; + mask_8x8_0 >>= step; + mask_4x4_0 >>= step; + mask_16x16_1 >>= step; + mask_8x8_1 >>= step; + mask_4x4_1 >>= step; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static void filter_selectively_horiz(uint8_t *s, int pitch, int plane, + int subsampling, uint64_t mask_16x16, + uint64_t mask_8x8, uint64_t mask_4x4, + const loop_filter_info_n *lfi_n, + const uint8_t *lfl) { + uint64_t mask; + int count; + const int step = 1 << subsampling; + const unsigned int two_block_mask = subsampling ? 5 : 3; + int offset = 0; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) { + const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + // Next block's thresholds, when it is within current 64x64 block. + // If it is out of bound, its mask is zero, and it points to current edge's + // filter parameters, instead of next edge's. + int next_edge = step; + if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0; + const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge); + + count = 1; + if (mask & 1) { + if (mask_16x16 & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_horizontal = + plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14; + + if ((mask_16x16 & two_block_mask) == two_block_mask) { + if (plane) { + aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } else { + aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } + count = 2; + } else { + lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } else if (mask_8x8 & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_horizontal = + plane ? 
aom_lpf_horizontal_6 : aom_lpf_horizontal_8; + + if ((mask_8x8 & two_block_mask) == two_block_mask) { + if (plane) { + aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } else { + aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } + count = 2; + } else { + lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } else if (mask_4x4 & 1) { + if ((mask_4x4 & two_block_mask) == two_block_mask) { + aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + count = 2; + } else { + aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } + } + + s += 4 * count; + lfl += step * count; + mask_16x16 >>= step * count; + mask_8x8 >>= step * count; + mask_4x4 >>= step * count; + offset += step * count; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_filter_selectively_horiz( + uint16_t *s, int pitch, int plane, int subsampling, uint64_t mask_16x16, + uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n, + uint8_t *lfl, int bd) { + uint64_t mask; + int count; + const int step = 1 << subsampling; + const unsigned int two_block_mask = subsampling ? 5 : 3; + int offset = 0; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) { + const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + // Next block's thresholds, when it is within current 64x64 block. + // If it is out of bound, its mask is zero, and it points to current edge's + // filter parameters, instead of next edge's. + int next_edge = step; + if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0; + const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge); + + count = 1; + if (mask & 1) { + if (mask_16x16 & 1) { + HbdLpfFunc highbd_lpf_horizontal = + plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14; + + if ((mask_16x16 & two_block_mask) == two_block_mask) { + if (plane) { + aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } else { + aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } + count = 2; + } else { + highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + bd); + } + } else if (mask_8x8 & 1) { + HbdLpfFunc highbd_lpf_horizontal = + plane ? 
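In the horizontal passes, two_block_mask (binary 11, or 101 when the plane is subsampled) tests whether the next edge along the row needs the same-strength filter; if so, a _dual kernel handles both and the walk advances two blocks (count = 2). A standalone sketch of that pairing logic:

#include <stdio.h>
#include <stdint.h>

int main(void) {
  for (int subsampling = 0; subsampling <= 1; subsampling++) {
    const int step = 1 << subsampling;
    const unsigned two_block_mask = subsampling ? 5 : 3; /* 0b101 : 0b011 */
    uint64_t mask = subsampling ? 0x15 : 0x7; /* three edges in a row */
    int col = 0;
    printf("subsampling=%d (step %d, pair mask 0x%x):\n", subsampling, step,
           two_block_mask);
    while (mask) {
      int count = 1;
      if ((mask & two_block_mask) == two_block_mask) {
        printf("  col %d..%d: dual kernel\n", col, col + 1);
        count = 2;
      } else if (mask & 1) {
        printf("  col %d: single kernel\n", col);
      }
      mask >>= (unsigned)(step * count);
      col += count;
    }
  }
  return 0;
}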
aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8; + + if ((mask_8x8 & two_block_mask) == two_block_mask) { + if (plane) { + aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } else { + aom_highbd_lpf_horizontal_8_dual_c(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } + count = 2; + } else { + highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + bd); + } + } else if (mask_4x4 & 1) { + if ((mask_4x4 & two_block_mask) == two_block_mask) { + aom_highbd_lpf_horizontal_4_dual_c(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + count = 2; + } else { + aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + } + } + } + + s += 4 * count; + lfl += step * count; + mask_16x16 >>= step * count; + mask_8x8 >>= step * count; + mask_4x4 >>= step * count; + offset += step * count; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void av1_build_bitmask_vert_info( + AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr, + int plane) { + const int subsampling_x = plane_ptr->subsampling_x; + const int subsampling_y = plane_ptr->subsampling_y; + const int is_uv = plane > 0; + TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16; + uint8_t level, prev_level = 1; + uint64_t skip, prev_skip = 0; + uint64_t is_coding_block_border; + + for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r++) { + const int mi_row = r << subsampling_y; + const int row = mi_row % MI_SIZE_64X64; + const int row_uv = row | subsampling_y; + int index = 0; + const int shift = get_index_shift(0, row, &index); + + for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; + c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) { + const int mi_col = c << subsampling_x; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + + for (int col_in_unit = 0; + col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) { + const int x = (c + col_in_unit) << MI_SIZE_LOG2; + if (x >= plane_ptr->dst.width) break; + const int col = col_in_unit << subsampling_x; + const int col_uv = col | subsampling_x; + const uint64_t mask = ((uint64_t)1 << (shift | col)); + skip = lfm->skip.bits[index] & mask; + is_coding_block_border = lfm->is_vert_border.bits[index] & mask; + switch (plane) { + case 0: level = lfm->lfl_y_ver[row_uv][col_uv]; break; + case 1: level = lfm->lfl_u_ver[row_uv][col_uv]; break; + case 2: level = lfm->lfl_v_ver[row_uv][col_uv]; break; + default: assert(plane >= 0 && plane <= 2); return; + } + for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) { + if (is_uv && ts == TX_64X64) continue; + if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) { + tx_size = ts; + break; + } + } + if ((c + col_in_unit > 0) && (level || prev_level) && + (!prev_skip || !skip || is_coding_block_border)) { + const TX_SIZE min_tx_size = + AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size)); + const int shift_1 = get_index_shift(col_uv, row_uv, &index); + const uint64_t mask_1 = ((uint64_t)1 << shift_1); + switch (plane) { + case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break; + case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break; + case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break; + default: assert(plane >= 0 && plane <= 2); return; + } + if (level == 0 && prev_level != 0) { + switch (plane) { + case 0: lfm->lfl_y_ver[row_uv][col_uv] = prev_level; break; + case 1: 
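av1_build_bitmask_vert_info() advances across each row in steps of the current transform's width in 4x4 units and records a left-edge bit wherever two transforms meet and at least one side is filtered. The stepping pattern in isolation (the transform widths below are assumed for illustration, not the library's tables):

#include <stdio.h>

/* 4x4 units per transform width for a toy TX ordering:
 * index 0..4 -> 4, 8, 16, 32, 64 pixel wide transforms (assumption). */
static const int tx_wide_units[] = { 1, 2, 4, 8, 16 };

int main(void) {
  /* a 64-pixel row tiled as 16 | 8 | 8 | 32 pixels */
  const int tiles[] = { 2, 1, 1, 3 };
  int col = 0;
  for (int t = 0; t < 4; t++) {
    const int w = tx_wide_units[tiles[t]];
    if (col > 0)
      printf("vertical edge at 4x4 column %2d (left of a %d-unit tx)\n",
             col, w);
    col += w;  /* mirrors col_in_unit += tx_size_wide_unit[tx_size] */
  }
  return 0;
}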
lfm->lfl_u_ver[row_uv][col_uv] = prev_level; break; + case 2: lfm->lfl_v_ver[row_uv][col_uv] = prev_level; break; + default: assert(plane >= 0 && plane <= 2); return; + } + } + } + + // update prev info + prev_level = level; + prev_skip = skip; + prev_tx_size = tx_size; + // advance + col_in_unit += tx_size_wide_unit[tx_size]; + } + } + } +} + +void av1_build_bitmask_horz_info( + AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr, + int plane) { + const int subsampling_x = plane_ptr->subsampling_x; + const int subsampling_y = plane_ptr->subsampling_y; + const int is_uv = plane > 0; + TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16; + uint8_t level, prev_level = 1; + uint64_t skip, prev_skip = 0; + uint64_t is_coding_block_border; + + for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c++) { + const int mi_col = c << subsampling_x; + const int col = mi_col % MI_SIZE_64X64; + const int col_uv = col | subsampling_x; + + for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; + r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) { + const int mi_row = r << subsampling_y; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + + for (int r_in_unit = 0; + r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) { + const int y = (r + r_in_unit) << MI_SIZE_LOG2; + if (y >= plane_ptr->dst.height) break; + const int row = r_in_unit << subsampling_y; + const int row_uv = row | subsampling_y; + int index = 0; + const int shift = get_index_shift(col, row, &index); + const uint64_t mask = ((uint64_t)1 << shift); + skip = lfm->skip.bits[index] & mask; + is_coding_block_border = lfm->is_horz_border.bits[index] & mask; + switch (plane) { + case 0: level = lfm->lfl_y_hor[row_uv][col_uv]; break; + case 1: level = lfm->lfl_u_hor[row_uv][col_uv]; break; + case 2: level = lfm->lfl_v_hor[row_uv][col_uv]; break; + default: assert(plane >= 0 && plane <= 2); return; + } + for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) { + if (is_uv && ts == TX_64X64) continue; + if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) { + tx_size = ts; + break; + } + } + if ((r + r_in_unit > 0) && (level || prev_level) && + (!prev_skip || !skip || is_coding_block_border)) { + const TX_SIZE min_tx_size = + AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size)); + const int shift_1 = get_index_shift(col_uv, row_uv, &index); + const uint64_t mask_1 = ((uint64_t)1 << shift_1); + + switch (plane) { + case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break; + case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break; + case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break; + default: assert(plane >= 0 && plane <= 2); return; + } + if (level == 0 && prev_level != 0) { + switch (plane) { + case 0: lfm->lfl_y_hor[row_uv][col_uv] = prev_level; break; + case 1: lfm->lfl_u_hor[row_uv][col_uv] = prev_level; break; + case 2: lfm->lfl_v_hor[row_uv][col_uv] = prev_level; break; + default: assert(plane >= 0 && plane <= 2); return; + } + } + } + + // update prev info + prev_level = level; + prev_skip = skip; + prev_tx_size = tx_size; + // advance + r_in_unit += tx_size_high_unit[tx_size]; + } + } + } +} + +void av1_filter_block_plane_bitmask_vert( + AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl, + int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + uint8_t *const buf0 = dst->buf; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int row_step = 1 << ssy; + 
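Both builders index the per-unit filter levels with row | subsampling and col | subsampling. With a subsampled plane this snaps the even member of each two-unit pair onto the odd slot, so luma and chroma read their levels from consistent positions. The snap in isolation:

#include <stdio.h>

int main(void) {
  for (int ss = 0; ss <= 1; ss++) {
    printf("subsampling=%d:", ss);
    for (int mi = 0; mi < 6; mi++)
      printf("  %d->%d", mi, mi | ss);
    printf("\n");
  }
  /* With ss=1 both members of each pair {0,1},{2,3},... map to the odd
   * index, matching how lfl_u/lfl_v levels are indexed per chroma unit. */
  return 0;
}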
const int two_row_step = 2 << ssy; + const int row_stride = dst->stride << MI_SIZE_LOG2; + const int two_row_stride = row_stride << 1; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + uint8_t *lfl2; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + assert(lfm); + + // 1. vertical filtering. filter two rows at a time + for (int r = 0; + ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64; + r += two_row_step) { + const int row = r | ssy; + const int row_next = row + row_step; + const int col = ssx; + int index = 0; + const int shift = get_index_shift(col, row, &index); + int index_next = 0; + const int shift_next = get_index_shift(col, row_next, &index_next); + const int has_next_row = row_next < cm->mi_params.mi_rows; + switch (pl) { + case 0: + mask_16x16 = lfm->left_y[TX_16X16].bits[index]; + mask_8x8 = lfm->left_y[TX_8X8].bits[index]; + mask_4x4 = lfm->left_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_ver[row][col]; + lfl2 = &lfm->lfl_y_ver[row_next][col]; + break; + case 1: + mask_16x16 = lfm->left_u[TX_16X16].bits[index]; + mask_8x8 = lfm->left_u[TX_8X8].bits[index]; + mask_4x4 = lfm->left_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_ver[row][col]; + lfl2 = &lfm->lfl_u_ver[row_next][col]; + break; + case 2: + mask_16x16 = lfm->left_v[TX_16X16].bits[index]; + mask_8x8 = lfm->left_v[TX_8X8].bits[index]; + mask_4x4 = lfm->left_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_ver[row][col]; + lfl2 = &lfm->lfl_v_ver[row_next][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff; + uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff; + uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff; + uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff; + uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff; + uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; + if (!has_next_row) { + mask_16x16_1 = 0; + mask_8x8_1 = 0; + mask_4x4_1 = 0; + } + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_vert_row2( + ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, + mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, + &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); + else + filter_selectively_vert_row2( + ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2); +#else + filter_selectively_vert_row2( + ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2); +#endif + dst->buf += two_row_stride; + } + // reset buf pointer for horizontal filtering + dst->buf = buf0; +} + +void av1_filter_block_plane_bitmask_horz( + AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl, + int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + uint8_t *const buf0 = dst->buf; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int row_step = 1 << ssy; + const int row_stride = dst->stride << MI_SIZE_LOG2; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + assert(lfm); + for (int r = 0; + ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64; + r += row_step) { + if (mi_row + r == 0) { + 
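The vertical pass slices each 64-bit word with mask_cutoff = 0xffff: a word holds four rows of 16 edge bits, so (mask >> shift) & 0xffff isolates one row, and the _0/_1 pairs hand two adjacent rows to the dual-row filter. For example:

#include <stdio.h>
#include <stdint.h>

int main(void) {
  /* one bits[index] word holding rows 0..3; row r occupies bits 16r..16r+15 */
  const uint64_t word = 0x00ff00000f0f0003ULL;
  for (int row = 0; row < 4; row++) {
    const int shift = row << 4;  /* what get_index_shift() returns */
    const unsigned row_bits = (unsigned)((word >> shift) & 0xffff);
    printf("row %d: 0x%04x\n", row, row_bits);
  }
  return 0;
}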
dst->buf += row_stride; + continue; + } + const int row = r | ssy; + const int col = ssx; + int index = 0; + const int shift = get_index_shift(col, row, &index); + switch (pl) { + case 0: + mask_16x16 = lfm->above_y[TX_16X16].bits[index]; + mask_8x8 = lfm->above_y[TX_8X8].bits[index]; + mask_4x4 = lfm->above_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_hor[row][col]; + break; + case 1: + mask_16x16 = lfm->above_u[TX_16X16].bits[index]; + mask_8x8 = lfm->above_u[TX_8X8].bits[index]; + mask_4x4 = lfm->above_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_hor[row][col]; + break; + case 2: + mask_16x16 = lfm->above_v[TX_16X16].bits[index]; + mask_8x8 = lfm->above_v[TX_8X8].bits[index]; + mask_4x4 = lfm->above_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_hor[row][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + mask_16x16 = (mask_16x16 >> shift) & mask_cutoff; + mask_8x8 = (mask_8x8 >> shift) & mask_cutoff; + mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_horiz( + CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth); + else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); +#else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); +#endif + dst->buf += row_stride; + } + // reset buf pointer for next block + dst->buf = buf0; +} + +void av1_filter_block_plane_ver(AV1_COMMON *const cm, + struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + int r, c; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int single_step = 1 << ssy; + const int r_step = 2 << ssy; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + uint8_t *lfl2; + + // filter two rows at a time + for (r = 0; r < cm->seq_params.mib_size && + ((mi_row + r) << MI_SIZE_LOG2 < cm->height); + r += r_step) { + for (c = 0; c < cm->seq_params.mib_size && + ((mi_col + c) << MI_SIZE_LOG2 < cm->width); + c += MI_SIZE_64X64) { + dst->buf += ((c << MI_SIZE_LOG2) >> ssx); + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c); + assert(lfm); + const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64; + const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64; + int index = 0; + const int shift = get_index_shift(col, row, &index); + // current and next row should belong to the same mask_idx and index + // next row's shift + const int row_next = row + single_step; + int index_next = 0; + const int shift_next = get_index_shift(col, row_next, &index_next); + switch (pl) { + case 0: + mask_16x16 = lfm->left_y[TX_16X16].bits[index]; + mask_8x8 = lfm->left_y[TX_8X8].bits[index]; + mask_4x4 = lfm->left_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_ver[row][col]; + lfl2 = &lfm->lfl_y_ver[row_next][col]; + break; + case 1: + mask_16x16 = lfm->left_u[TX_16X16].bits[index]; + mask_8x8 = lfm->left_u[TX_8X8].bits[index]; + mask_4x4 = lfm->left_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_ver[row][col]; + lfl2 = &lfm->lfl_u_ver[row_next][col]; + break; + case 2: + mask_16x16 = lfm->left_v[TX_16X16].bits[index]; + mask_8x8 = lfm->left_v[TX_8X8].bits[index]; + mask_4x4 = lfm->left_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_ver[row][col]; + lfl2 = 
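On the high-bit-depth paths, dst->buf is not really a byte buffer: libaom stores uint16_t samples and passes the buffer around as a disguised uint8_t *, which CONVERT_TO_SHORTPTR undoes at the call site. A simplified restatement of that convention (the real macros live in aom_dsp; the DEMO_ names are stand-ins, and the disguised pointer is never dereferenced in that form):

#include <stdint.h>
#include <stdio.h>

#define DEMO_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
#define DEMO_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))

int main(void) {
  static uint16_t pixels[4] = { 100, 200, 300, 400 };
  uint8_t *disguised = DEMO_TO_BYTEPTR(pixels);       /* as held in buf_2d */
  uint16_t *recovered = DEMO_TO_SHORTPTR(disguised);  /* at the filter call */
  printf("round trip ok: %d\n", recovered[2] == pixels[2]);
  return 0;
}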
&lfm->lfl_v_ver[row_next][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff; + uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff; + uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff; + uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff; + uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff; + uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_vert_row2( + ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, + mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, + &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); + else + filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl, + mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, + &cm->lf_info, lfl, lfl2); +#else + filter_selectively_vert_row2( + ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2); +#endif + dst->buf -= ((c << MI_SIZE_LOG2) >> ssx); + } + dst->buf += 2 * MI_SIZE * dst->stride; + } +} + +void av1_filter_block_plane_hor(AV1_COMMON *const cm, + struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + int r, c; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int r_step = 1 << ssy; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + + for (r = 0; r < cm->seq_params.mib_size && + ((mi_row + r) << MI_SIZE_LOG2 < cm->height); + r += r_step) { + for (c = 0; c < cm->seq_params.mib_size && + ((mi_col + c) << MI_SIZE_LOG2 < cm->width); + c += MI_SIZE_64X64) { + if (mi_row + r == 0) continue; + + dst->buf += ((c << MI_SIZE_LOG2) >> ssx); + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c); + assert(lfm); + const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64; + const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64; + int index = 0; + const int shift = get_index_shift(col, row, &index); + switch (pl) { + case 0: + mask_16x16 = lfm->above_y[TX_16X16].bits[index]; + mask_8x8 = lfm->above_y[TX_8X8].bits[index]; + mask_4x4 = lfm->above_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_hor[row][col]; + break; + case 1: + mask_16x16 = lfm->above_u[TX_16X16].bits[index]; + mask_8x8 = lfm->above_u[TX_8X8].bits[index]; + mask_4x4 = lfm->above_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_hor[row][col]; + break; + case 2: + mask_16x16 = lfm->above_v[TX_16X16].bits[index]; + mask_8x8 = lfm->above_v[TX_8X8].bits[index]; + mask_4x4 = lfm->above_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_hor[row][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + mask_16x16 = (mask_16x16 >> shift) & mask_cutoff; + mask_8x8 = (mask_8x8 >> shift) & mask_cutoff; + mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl, + (int)cm->seq_params.bit_depth); + else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); +#else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); +#endif 
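av1_filter_block_plane_ver() and _hor() walk a superblock in 64x64-mi chunks; each chunk's pixels are reached by bumping the destination pointer by (c << MI_SIZE_LOG2) >> ssx and restoring it afterwards. The offset arithmetic on its own (MI_SIZE_LOG2 = 2, i.e. 4-pixel mi units):

#include <stdio.h>

#define MI_SIZE_LOG2 2 /* one mi unit = 4 luma pixels */

int main(void) {
  const int MI_SIZE_64X64 = 16; /* 64 / 4 */
  for (int ssx = 0; ssx <= 1; ssx++) {
    printf("ssx=%d:", ssx);
    for (int c = 0; c < 48; c += MI_SIZE_64X64)
      printf("  c=%2d -> +%3d px", c, (c << MI_SIZE_LOG2) >> ssx);
    printf("\n");
  }
  return 0;
}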
+ dst->buf -= ((c << MI_SIZE_LOG2) >> ssx); + } + dst->buf += MI_SIZE * dst->stride; + } +} + +void av1_store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, TX_SIZE tx_size, + MB_MODE_INFO *mbmi) { + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size]; + const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size]; + const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const int is_square_transform_size = tx_size <= TX_64X64; + int mask_id = 0; + int offset = 0; + const int half_ratio_tx_size_max32 = + (tx_size > TX_64X64) & (tx_size <= TX_32X16); + if (is_square_transform_size) { + switch (tx_size) { + case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break; + case TX_8X8: + mask_id = mask_id_table_tx_8x8[bsize]; + offset = 19; + break; + case TX_16X16: + mask_id = mask_id_table_tx_16x16[bsize]; + offset = 33; + break; + case TX_32X32: + mask_id = mask_id_table_tx_32x32[bsize]; + offset = 42; + break; + case TX_64X64: mask_id = 46; break; + default: assert(!is_square_transform_size); return; + } + mask_id += offset; + } else if (half_ratio_tx_size_max32) { + int tx_size_equal_block_size = bsize == txsize_to_bsize[tx_size]; + mask_id = 47 + 2 * (tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1); + } else if (tx_size == TX_32X64) { + mask_id = 59; + } else if (tx_size == TX_64X32) { + mask_id = 60; + } else { // quarter ratio tx size + mask_id = 61 + (tx_size - TX_4X16); + } + int index = 0; + const int row = mi_row % MI_SIZE_64X64; + const int col = mi_col % MI_SIZE_64X64; + const int shift = get_index_shift(col, row, &index); + const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col; + for (int i = 0; i + index < 4; ++i) { + // y vertical. + lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // y horizontal. + lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + // u/v vertical. + lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // u/v horizontal. + lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + } +} + +void av1_store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) { + // Use a lookup table that provides one bitmask for a given block size and + // a univariant transform size. 
+ int index; + int shift; + int row; + int col; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + const TX_SIZE tx_size_y_vert = txsize_vert_map[mbmi->tx_size]; + const TX_SIZE tx_size_y_horz = txsize_horz_map[mbmi->tx_size]; + const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const int is_square_transform_size = mbmi->tx_size <= TX_64X64; + int mask_id = 0; + int offset = 0; + const int half_ratio_tx_size_max32 = + (mbmi->tx_size > TX_64X64) & (mbmi->tx_size <= TX_32X16); + if (is_square_transform_size) { + switch (mbmi->tx_size) { + case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break; + case TX_8X8: + mask_id = mask_id_table_tx_8x8[bsize]; + offset = 19; + break; + case TX_16X16: + mask_id = mask_id_table_tx_16x16[bsize]; + offset = 33; + break; + case TX_32X32: + mask_id = mask_id_table_tx_32x32[bsize]; + offset = 42; + break; + case TX_64X64: mask_id = 46; break; + default: assert(!is_square_transform_size); return; + } + mask_id += offset; + } else if (half_ratio_tx_size_max32) { + int tx_size_equal_block_size = bsize == txsize_to_bsize[mbmi->tx_size]; + mask_id = + 47 + 2 * (mbmi->tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1); + } else if (mbmi->tx_size == TX_32X64) { + mask_id = 59; + } else if (mbmi->tx_size == TX_64X32) { + mask_id = 60; + } else { // quarter ratio tx size + mask_id = 61 + (mbmi->tx_size - TX_4X16); + } + row = mi_row % MI_SIZE_64X64; + col = mi_col % MI_SIZE_64X64; + shift = get_index_shift(col, row, &index); + const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col; + for (int i = 0; i + index < 4; ++i) { + // y vertical. + lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // y horizontal. + lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + // u/v vertical. + lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // u/v horizontal. + lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + } +} + +void av1_store_bitmask_other_info(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, + int is_horz_coding_block_border, + int is_vert_coding_block_border) { + int index; + int shift; + int row; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + const int row_start = mi_row % MI_SIZE_64X64; + const int col_start = mi_col % MI_SIZE_64X64; + shift = get_index_shift(col_start, row_start, &index); + if (is_horz_coding_block_border) { + const int block_shift = shift + mi_size_wide[bsize]; + assert(block_shift <= 64); + const uint64_t right_edge_shift = + (block_shift == 64) ? 0xffffffffffffffff : ((uint64_t)1 << block_shift); + const uint64_t left_edge_shift = (block_shift == 64) + ? (((uint64_t)1 << shift) - 1) + : ((uint64_t)1 << shift); + assert(right_edge_shift > left_edge_shift); + const uint64_t top_edge_mask = right_edge_shift - left_edge_shift; + lfm->is_horz_border.bits[index] |= top_edge_mask; + } + if (is_vert_coding_block_border) { + const int is_vert_border = mask_id_table_vert_border[bsize]; + const int vert_shift = block_size_high[bsize] <= 8 ? 
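av1_store_bitmask_other_info() marks a horizontal coding-block border by synthesizing a run of mi_size_wide[bsize] consecutive bits starting at shift. The subtraction trick, including the block_shift == 64 special case that sidesteps an undefined 1 << 64, can be checked standalone:

#include <stdio.h>
#include <stdint.h>

static uint64_t bit_run(int shift, int width) {
  const int end = shift + width;
  const uint64_t hi =
      (end == 64) ? 0xffffffffffffffffULL : ((uint64_t)1 << end);
  const uint64_t lo =
      (end == 64) ? (((uint64_t)1 << shift) - 1) : ((uint64_t)1 << shift);
  return hi - lo; /* width consecutive 1-bits starting at `shift` */
}

int main(void) {
  printf("0x%016llx\n", (unsigned long long)bit_run(4, 8));   /* 0xff0 */
  printf("0x%016llx\n", (unsigned long long)bit_run(48, 16)); /* top 16 bits */
  return 0;
}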
shift : col_start; + for (int i = 0; i + index < 4; ++i) { + lfm->is_vert_border.bits[i + index] |= + (left_mask_univariant_reordered[is_vert_border].bits[i] + << vert_shift); + } + } + const int is_skip = mbmi->skip && is_inter_block(mbmi); + if (is_skip) { + const int is_skip_mask = mask_id_table_tx_4x4[bsize]; + for (int i = 0; i + index < 4; ++i) { + lfm->skip.bits[i + index] |= + (above_mask_univariant_reordered[is_skip_mask].bits[i] << shift); + } + } + const uint8_t level_vert_y = + av1_get_filter_level(cm, &cm->lf_info, 0, 0, mbmi); + const uint8_t level_horz_y = + av1_get_filter_level(cm, &cm->lf_info, 1, 0, mbmi); + const uint8_t level_u = av1_get_filter_level(cm, &cm->lf_info, 0, 1, mbmi); + const uint8_t level_v = av1_get_filter_level(cm, &cm->lf_info, 0, 2, mbmi); + for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) { + index = 0; + row = r % MI_SIZE_64X64; + memset(&lfm->lfl_y_ver[row][col_start], level_vert_y, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_y_hor[row][col_start], level_horz_y, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_u_ver[row][col_start], level_u, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_u_hor[row][col_start], level_u, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_v_ver[row][col_start], level_v, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_v_hor[row][col_start], level_v, + sizeof(uint8_t) * mi_size_wide[bsize]); + } +} +#endif // CONFIG_LPF_MASK diff --git a/libs/libaom/src/av1/common/mv.h b/libs/libaom/src/av1/common/mv.h new file mode 100644 index 000000000..be539e820 --- /dev/null +++ b/libs/libaom/src/av1/common/mv.h @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_MV_H_ +#define AOM_AV1_COMMON_MV_H_ + +#include "av1/common/common.h" +#include "av1/common/common_data.h" +#include "aom_dsp/aom_filter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define INVALID_MV 0x80008000 +#define GET_MV_RAWPEL(x) (((x) + 3 + ((x) >= 0)) >> 3) +#define GET_MV_SUBPEL(x) ((x)*8) + +#define MARK_MV_INVALID(mv) \ + do { \ + ((int_mv *)(mv))->as_int = INVALID_MV; \ + } while (0); +#define CHECK_MV_EQUAL(x, y) (((x).row == (y).row) && ((x).col == (y).col)) + +// The motion vector in units of full pixel +typedef struct fullpel_mv { + int16_t row; + int16_t col; +} FULLPEL_MV; + +// The motion vector in units of 1/8-pel +typedef struct mv { + int16_t row; + int16_t col; +} MV; + +static const MV kZeroMv = { 0, 0 }; +static const FULLPEL_MV kZeroFullMv = { 0, 0 }; + +typedef union int_mv { + uint32_t as_int; + MV as_mv; + FULLPEL_MV as_fullmv; +} int_mv; /* facilitates faster equality tests and copies */ + +typedef struct mv32 { + int32_t row; + int32_t col; +} MV32; + +// The mv limit for fullpel mvs +typedef struct { + int col_min; + int col_max; + int row_min; + int row_max; +} FullMvLimits; + +// The mv limit for subpel mvs +typedef struct { + int col_min; + int col_max; + int row_min; + int row_max; +} SubpelMvLimits; + +static AOM_INLINE FULLPEL_MV get_fullmv_from_mv(const MV *subpel_mv) { + const FULLPEL_MV full_mv = { (int16_t)GET_MV_RAWPEL(subpel_mv->row), + (int16_t)GET_MV_RAWPEL(subpel_mv->col) }; + return full_mv; +} + +static AOM_INLINE MV get_mv_from_fullmv(const FULLPEL_MV *full_mv) { + const MV subpel_mv = { (int16_t)GET_MV_SUBPEL(full_mv->row), + (int16_t)GET_MV_SUBPEL(full_mv->col) }; + return subpel_mv; +} + +static AOM_INLINE void convert_fullmv_to_mv(int_mv *mv) { + mv->as_mv = get_mv_from_fullmv(&mv->as_fullmv); +} + +// Bits of precision used for the model +#define WARPEDMODEL_PREC_BITS 16 +#define WARPEDMODEL_ROW3HOMO_PREC_BITS 16 + +#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS) +#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3)) +#define WARPEDMODEL_ROW3HOMO_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 2)) + +// Bits of subpel precision for warped interpolation +#define WARPEDPIXEL_PREC_BITS 6 +#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS) + +#define WARP_PARAM_REDUCE_BITS 6 + +#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS) + +/* clang-format off */ +enum { + IDENTITY = 0, // identity transformation, 0-parameter + TRANSLATION = 1, // translational motion 2-parameter + ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter + AFFINE = 3, // affine, 6-parameter + TRANS_TYPES, +} UENUM1BYTE(TransformationType); +/* clang-format on */ + +// Number of types used for global motion (must be >= 3 and <= TRANS_TYPES) +// The following can be useful: +// GLOBAL_TRANS_TYPES 3 - up to rotation-zoom +// GLOBAL_TRANS_TYPES 4 - up to affine +// GLOBAL_TRANS_TYPES 6 - up to hor/ver trapezoids +// GLOBAL_TRANS_TYPES 7 - up to full homography +#define GLOBAL_TRANS_TYPES 4 + +typedef struct { + int global_warp_allowed; + int local_warp_allowed; +} WarpTypesAllowed; + +// number of parameters used by each transformation in TransformationTypes +static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 }; + +// The order of values in the wmmat matrix below is best described +// by the homography: +// [x' (m2 m3 m0 [x +// z . 
y' = m4 m5 m1 * y +// 1] m6 m7 1) 1] +typedef struct { + int32_t wmmat[8]; + int16_t alpha, beta, gamma, delta; + TransformationType wmtype; + int8_t invalid; +} WarpedMotionParams; + +/* clang-format off */ +static const WarpedMotionParams default_warp_params = { + { 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, + 0 }, + 0, 0, 0, 0, + IDENTITY, + 0, +}; +/* clang-format on */ + +// The following constants describe the various precisions +// of different parameters in the global motion experiment. +// +// Given the general homography: +// [x' (a b c [x +// z . y' = d e f * y +// 1] g h i) 1] +// +// Constants using the name ALPHA here are related to parameters +// a, b, d, e. Constants using the name TRANS are related +// to parameters c and f. +// +// Anything ending in PREC_BITS is the number of bits of precision +// to maintain when converting from double to integer. +// +// The ABS parameters are used to create an upper and lower bound +// for each parameter. In other words, after a parameter is integerized +// it is clamped between -(1 << ABS_XXX_BITS) and (1 << ABS_XXX_BITS). +// +// XXX_PREC_DIFF and XXX_DECODE_FACTOR +// are computed once here to prevent repetitive +// computation on the decoder side. These are +// to allow the global motion parameters to be encoded in a lower +// precision than the warped model precision. This means that they +// need to be changed to warped precision when they are decoded. +// +// XX_MIN, XX_MAX are also computed to avoid repeated computation + +#define SUBEXPFIN_K 3 +#define GM_TRANS_PREC_BITS 6 +#define GM_ABS_TRANS_BITS 12 +#define GM_ABS_TRANS_ONLY_BITS (GM_ABS_TRANS_BITS - GM_TRANS_PREC_BITS + 3) +#define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS) +#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3) +#define GM_TRANS_DECODE_FACTOR (1 << GM_TRANS_PREC_DIFF) +#define GM_TRANS_ONLY_DECODE_FACTOR (1 << GM_TRANS_ONLY_PREC_DIFF) + +#define GM_ALPHA_PREC_BITS 15 +#define GM_ABS_ALPHA_BITS 12 +#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS) +#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF) + +#define GM_ROW3HOMO_PREC_BITS 16 +#define GM_ABS_ROW3HOMO_BITS 11 +#define GM_ROW3HOMO_PREC_DIFF \ + (WARPEDMODEL_ROW3HOMO_PREC_BITS - GM_ROW3HOMO_PREC_BITS) +#define GM_ROW3HOMO_DECODE_FACTOR (1 << GM_ROW3HOMO_PREC_DIFF) + +#define GM_TRANS_MAX (1 << GM_ABS_TRANS_BITS) +#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS) +#define GM_ROW3HOMO_MAX (1 << GM_ABS_ROW3HOMO_BITS) + +#define GM_TRANS_MIN -GM_TRANS_MAX +#define GM_ALPHA_MIN -GM_ALPHA_MAX +#define GM_ROW3HOMO_MIN -GM_ROW3HOMO_MAX + +static INLINE int block_center_x(int mi_col, BLOCK_SIZE bs) { + const int bw = block_size_wide[bs]; + return mi_col * MI_SIZE + bw / 2 - 1; +} + +static INLINE int block_center_y(int mi_row, BLOCK_SIZE bs) { + const int bh = block_size_high[bs]; + return mi_row * MI_SIZE + bh / 2 - 1; +} + +static INLINE int convert_to_trans_prec(int allow_hp, int coor) { + if (allow_hp) + return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 3); + else + return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2; +} +static INLINE void integer_mv_precision(MV *mv) { + int mod = (mv->row % 8); + if (mod != 0) { + mv->row -= mod; + if (abs(mod) > 4) { + if (mod > 0) { + mv->row += 8; + } else { + mv->row -= 8; + } + } + } + + mod = (mv->col % 8); + if (mod != 0) { + mv->col -= mod; + if (abs(mod) > 4) { + if (mod > 0) { + mv->col += 8; + } else { + mv->col -= 8; + } + } + } +} +// Convert a 
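integer_mv_precision() above rounds a 1/8-pel component to the nearest full pel, with exact halves rounding toward zero. A self-contained mirror of that rule (demo name, not the library function):

#include <stdio.h>

static int round_to_fullpel(int v) {
  int mod = v % 8;
  if (mod != 0) {
    v -= mod;
    if (mod > 4) v += 8;   /* abs(mod) > 4: round away */
    if (mod < -4) v -= 8;
  }
  return v;
}

int main(void) {
  const int samples[] = { 11, 12, 13, -11, -12, -13 };
  for (int i = 0; i < 6; i++)
    printf("%3d/8 pel -> %3d/8 pel (%d full pels)\n", samples[i],
           round_to_fullpel(samples[i]), round_to_fullpel(samples[i]) / 8);
  return 0;
}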
global motion vector into a motion vector at the centre of the +// given block. +// +// The resulting motion vector will have three fractional bits of precision. If +// allow_hp is zero, the bottom bit will always be zero. If CONFIG_AMVR and +// is_integer is true, the bottom three bits will be zero (so the motion vector +// represents an integer) +static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm, + int allow_hp, BLOCK_SIZE bsize, + int mi_col, int mi_row, + int is_integer) { + int_mv res; + + if (gm->wmtype == IDENTITY) { + res.as_int = 0; + return res; + } + + const int32_t *mat = gm->wmmat; + int x, y, tx, ty; + + if (gm->wmtype == TRANSLATION) { + // All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16) + // bits of fractional precision. The offset for a translation is stored in + // entries 0 and 1. For translations, all but the top three (two if + // cm->features.allow_high_precision_mv is false) fractional bits are always + // zero. + // + // After the right shifts, there are 3 fractional bits of precision. If + // allow_hp is false, the bottom bit is always zero (so we don't need a + // call to convert_to_trans_prec here) + res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF; + res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF; + assert(IMPLIES(1 & (res.as_mv.row | res.as_mv.col), allow_hp)); + if (is_integer) { + integer_mv_precision(&res.as_mv); + } + return res; + } + + x = block_center_x(mi_col, bsize); + y = block_center_y(mi_row, bsize); + + if (gm->wmtype == ROTZOOM) { + assert(gm->wmmat[5] == gm->wmmat[2]); + assert(gm->wmmat[4] == -gm->wmmat[3]); + } + + const int xc = + (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0]; + const int yc = + mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1]; + tx = convert_to_trans_prec(allow_hp, xc); + ty = convert_to_trans_prec(allow_hp, yc); + + res.as_mv.row = ty; + res.as_mv.col = tx; + + if (is_integer) { + integer_mv_precision(&res.as_mv); + } + return res; +} + +static INLINE TransformationType get_wmtype(const WarpedMotionParams *gm) { + if (gm->wmmat[5] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[4] && + gm->wmmat[2] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[3]) { + return ((!gm->wmmat[1] && !gm->wmmat[0]) ? IDENTITY : TRANSLATION); + } + if (gm->wmmat[2] == gm->wmmat[5] && gm->wmmat[3] == -gm->wmmat[4]) + return ROTZOOM; + else + return AFFINE; +} + +typedef struct candidate_mv { + int_mv this_mv; + int_mv comp_mv; +} CANDIDATE_MV; + +static INLINE int is_zero_mv(const MV *mv) { + return *((const uint32_t *)mv) == 0; +} + +static INLINE int is_equal_mv(const MV *a, const MV *b) { + return *((const uint32_t *)a) == *((const uint32_t *)b); +} + +static INLINE void clamp_mv(MV *mv, const SubpelMvLimits *mv_limits) { + mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max); + mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max); +} + +static INLINE void clamp_fullmv(FULLPEL_MV *mv, const FullMvLimits *mv_limits) { + mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max); + mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_MV_H_ diff --git a/libs/libaom/src/av1/common/mvref_common.c b/libs/libaom/src/av1/common/mvref_common.c new file mode 100644 index 000000000..db3098cc0 --- /dev/null +++ b/libs/libaom/src/av1/common/mvref_common.c @@ -0,0 +1,1511 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/mvref_common.h" +#include "av1/common/warped_motion.h" + +// Although we assign 32 bit integers, all the values are strictly under 14 +// bits. +static int div_mult[32] = { 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, + 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092, + 1024, 963, 910, 862, 819, 780, 744, 712, + 682, 655, 630, 606, 585, 564, 546, 528 }; + +// TODO(jingning): Consider the use of lookup table for (num / den) +// altogether. +static AOM_INLINE void get_mv_projection(MV *output, MV ref, int num, int den) { + den = AOMMIN(den, MAX_FRAME_DISTANCE); + num = num > 0 ? AOMMIN(num, MAX_FRAME_DISTANCE) + : AOMMAX(num, -MAX_FRAME_DISTANCE); + const int mv_row = + ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14); + const int mv_col = + ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14); + const int clamp_max = MV_UPP - 1; + const int clamp_min = MV_LOW + 1; + output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max); + output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max); +} + +void av1_copy_frame_mvs(const AV1_COMMON *const cm, + const MB_MODE_INFO *const mi, int mi_row, int mi_col, + int x_mis, int y_mis) { + const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1); + MV_REF *frame_mvs = + cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1); + x_mis = ROUND_POWER_OF_TWO(x_mis, 1); + y_mis = ROUND_POWER_OF_TWO(y_mis, 1); + int w, h; + + for (h = 0; h < y_mis; h++) { + MV_REF *mv = frame_mvs; + for (w = 0; w < x_mis; w++) { + mv->ref_frame = NONE_FRAME; + mv->mv.as_int = 0; + + for (int idx = 0; idx < 2; ++idx) { + MV_REFERENCE_FRAME ref_frame = mi->ref_frame[idx]; + if (ref_frame > INTRA_FRAME) { + int8_t ref_idx = cm->ref_frame_side[ref_frame]; + if (ref_idx) continue; + if ((abs(mi->mv[idx].as_mv.row) > REFMVS_LIMIT) || + (abs(mi->mv[idx].as_mv.col) > REFMVS_LIMIT)) + continue; + mv->ref_frame = ref_frame; + mv->mv.as_int = mi->mv[idx].as_int; + } + } + mv++; + } + frame_mvs += frame_mvs_stride; + } +} + +static AOM_INLINE void add_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2], + uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count, + CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, + int_mv *gm_mv_candidates, const WarpedMotionParams *gm_params, + uint16_t weight) { + if (!is_inter_block(candidate)) return; + assert(weight % 2 == 0); + int index, ref; + + if (rf[1] == NONE_FRAME) { + // single reference frame + for (ref = 0; ref < 2; ++ref) { + if (candidate->ref_frame[ref] == rf[0]) { + const int is_gm_block = + is_global_mv_block(candidate, gm_params[rf[0]].wmtype); + const int_mv this_refmv = + is_gm_block ? gm_mv_candidates[0] : get_block_mv(candidate, ref); + for (index = 0; index < *refmv_count; ++index) { + if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) { + ref_mv_weight[index] += weight; + break; + } + } + + // Add a new item to the list. 
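div_mult[den] is round(16384 / den), so multiplying by it and shifting right by 14 approximates ref * num / den entirely in integer arithmetic; ROUND_POWER_OF_TWO_SIGNED supplies sign-aware rounding. A standalone check (the rounding helper below restates the macro's behavior, it is not the library definition):

#include <stdio.h>

static int round_power_of_two_signed(long long v, int n) {
  const long long add = 1LL << (n - 1);
  return (int)(v >= 0 ? (v + add) >> n : -((-v + add) >> n));
}

int main(void) {
  static const int div_mult7 = 2340; /* round(16384 / 7) */
  const int ref = 37, num = 3, den = 7;
  const int scaled =
      round_power_of_two_signed((long long)ref * num * div_mult7, 14);
  printf("den=%d fixed point: %d, exact: %.3f\n", den, scaled,
         ref * (double)num / den);
  return 0;
}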
+ if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[index].this_mv = this_refmv; + ref_mv_weight[index] = weight; + ++(*refmv_count); + } + if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count; + ++*ref_match_count; + } + } + } else { + // compound reference frame + if (candidate->ref_frame[0] == rf[0] && candidate->ref_frame[1] == rf[1]) { + int_mv this_refmv[2]; + + for (ref = 0; ref < 2; ++ref) { + if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype)) + this_refmv[ref] = gm_mv_candidates[ref]; + else + this_refmv[ref] = get_block_mv(candidate, ref); + } + + for (index = 0; index < *refmv_count; ++index) { + if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) && + (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) { + ref_mv_weight[index] += weight; + break; + } + } + + // Add a new item to the list. + if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[index].this_mv = this_refmv[0]; + ref_mv_stack[index].comp_mv = this_refmv[1]; + ref_mv_weight[index] = weight; + ++(*refmv_count); + } + if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count; + ++*ref_match_count; + } + } +} + +static AOM_INLINE void scan_row_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_col, + const MV_REFERENCE_FRAME rf[2], int row_offset, CANDIDATE_MV *ref_mv_stack, + uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count, + uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_row_offset, + int *processed_rows) { + int end_mi = AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col); + end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]); + const int width_8x8 = mi_size_wide[BLOCK_8X8]; + const int width_16x16 = mi_size_wide[BLOCK_16X16]; + int col_offset = 0; + // TODO(jingning): Revisit this part after cb4x4 is stable. + if (abs(row_offset) > 1) { + col_offset = 1; + if ((mi_col & 0x01) && xd->width < width_8x8) --col_offset; + } + const int use_step_16 = (xd->width >= 16); + MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride; + + for (int i = 0; i < end_mi;) { + const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i]; + const int candidate_bsize = candidate->sb_type; + const int n4_w = mi_size_wide[candidate_bsize]; + int len = AOMMIN(xd->width, n4_w); + if (use_step_16) + len = AOMMAX(width_16x16, len); + else if (abs(row_offset) > 1) + len = AOMMAX(len, width_8x8); + + uint16_t weight = 2; + if (xd->width >= width_8x8 && xd->width <= n4_w) { + uint16_t inc = AOMMIN(-max_row_offset + row_offset + 1, + mi_size_high[candidate_bsize]); + // Obtain range used in weight calculation. + weight = AOMMAX(weight, inc); + // Update processed rows. 
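add_ref_mv_candidate() implements a small weighted de-duplicating stack: an incoming MV either bumps the weight of a matching entry or is appended while room remains. Reduced to its essentials:

#include <stdio.h>
#include <stdint.h>

#define DEMO_STACK_SIZE 8

typedef struct { uint32_t mv; uint16_t weight; } DemoEntry;

static void push_candidate(DemoEntry *stack, int *count, uint32_t mv,
                           uint16_t weight) {
  for (int i = 0; i < *count; i++) {
    if (stack[i].mv == mv) {  /* duplicate MV: accumulate its weight */
      stack[i].weight += weight;
      return;
    }
  }
  if (*count < DEMO_STACK_SIZE) {  /* new candidate: append */
    stack[*count].mv = mv;
    stack[*count].weight = weight;
    ++*count;
  }
}

int main(void) {
  DemoEntry stack[DEMO_STACK_SIZE];
  int count = 0;
  push_candidate(stack, &count, 0x00010002, 4);
  push_candidate(stack, &count, 0x00010002, 2); /* same MV: weight 4 + 2 */
  push_candidate(stack, &count, 0xfffe0003, 2);
  for (int i = 0; i < count; i++)
    printf("entry %d: mv=0x%08x weight=%u\n", i, stack[i].mv,
           stack[i].weight);
  return 0;
}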
+ *processed_rows = inc - row_offset - 1; + } + + add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, len * weight); + + i += len; + } +} + +static AOM_INLINE void scan_col_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, + const MV_REFERENCE_FRAME rf[2], int col_offset, CANDIDATE_MV *ref_mv_stack, + uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count, + uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_col_offset, + int *processed_cols) { + int end_mi = AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row); + end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]); + const int n8_h_8 = mi_size_high[BLOCK_8X8]; + const int n8_h_16 = mi_size_high[BLOCK_16X16]; + int i; + int row_offset = 0; + if (abs(col_offset) > 1) { + row_offset = 1; + if ((mi_row & 0x01) && xd->height < n8_h_8) --row_offset; + } + const int use_step_16 = (xd->height >= 16); + + for (i = 0; i < end_mi;) { + const MB_MODE_INFO *const candidate = + xd->mi[(row_offset + i) * xd->mi_stride + col_offset]; + const int candidate_bsize = candidate->sb_type; + const int n4_h = mi_size_high[candidate_bsize]; + int len = AOMMIN(xd->height, n4_h); + if (use_step_16) + len = AOMMAX(n8_h_16, len); + else if (abs(col_offset) > 1) + len = AOMMAX(len, n8_h_8); + + int weight = 2; + if (xd->height >= n8_h_8 && xd->height <= n4_h) { + int inc = AOMMIN(-max_col_offset + col_offset + 1, + mi_size_wide[candidate_bsize]); + // Obtain range used in weight calculation. + weight = AOMMAX(weight, inc); + // Update processed cols. + *processed_cols = inc - col_offset - 1; + } + + add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, len * weight); + + i += len; + } +} + +static AOM_INLINE void scan_blk_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, const int mi_row, + const int mi_col, const MV_REFERENCE_FRAME rf[2], int row_offset, + int col_offset, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, + uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates, + uint8_t *refmv_count) { + const TileInfo *const tile = &xd->tile; + POSITION mi_pos; + + mi_pos.row = row_offset; + mi_pos.col = col_offset; + + if (is_inside(tile, mi_col, mi_row, &mi_pos)) { + const MB_MODE_INFO *const candidate = + xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col]; + const int len = mi_size_wide[BLOCK_8X8]; + + add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, 2 * len); + } // Analyze a single 8x8 block motion information. 
+}
+
+static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                         int mi_row, int mi_col, int bs) {
+  const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+  const int mask_row = mi_row & (sb_mi_size - 1);
+  const int mask_col = mi_col & (sb_mi_size - 1);
+
+  if (bs > mi_size_wide[BLOCK_64X64]) return 0;
+
+  // In a split partition, all blocks apart from the bottom-right one have a
+  // top right.
+  int has_tr = !((mask_row & bs) && (mask_col & bs));
+
+  // bs > 0 and bs is a power of 2
+  assert(bs > 0 && !(bs & (bs - 1)));
+
+  // For each 4x4 group of blocks, when the bottom right is decoded the blocks
+  // to its right have not yet been decoded, so the bottom right does not have
+  // a top right.
+  while (bs < sb_mi_size) {
+    if (mask_col & bs) {
+      if ((mask_col & (2 * bs)) && (mask_row & (2 * bs))) {
+        has_tr = 0;
+        break;
+      }
+    } else {
+      break;
+    }
+    bs <<= 1;
+  }
+
+  // The left-hand rectangle of two vertical rectangles always has a top right
+  // (as the block above will have been decoded).
+  if (xd->width < xd->height)
+    if (!xd->is_sec_rect) has_tr = 1;
+
+  // The bottom rectangle of two horizontal rectangles never has a top right
+  // (as the block to its right won't have been decoded).
+  if (xd->width > xd->height)
+    if (xd->is_sec_rect) has_tr = 0;
+
+  // The bottom-left square of a Vertical A (in the old format) does not have
+  // a top right, as it is decoded before the right-hand rectangle of the
+  // partition.
+  if (xd->mi[0]->partition == PARTITION_VERT_A) {
+    if (xd->width == xd->height)
+      if (mask_row & bs) has_tr = 0;
+  }
+
+  return has_tr;
+}
+
+static int check_sb_border(const int mi_row, const int mi_col,
+                           const int row_offset, const int col_offset) {
+  const int sb_mi_size = mi_size_wide[BLOCK_64X64];
+  const int row = mi_row & (sb_mi_size - 1);
+  const int col = mi_col & (sb_mi_size - 1);
+
+  if (row + row_offset < 0 || row + row_offset >= sb_mi_size ||
+      col + col_offset < 0 || col + col_offset >= sb_mi_size)
+    return 0;
+
+  return 1;
+}
+
+static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                          int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame,
+                          int blk_row, int blk_col, int_mv *gm_mv_candidates,
+                          uint8_t *const refmv_count,
+                          CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE],
+                          uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE],
+                          int16_t *mode_context) {
+  POSITION mi_pos;
+  mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
+  mi_pos.col = (mi_col & 0x01) ?
blk_col : blk_col + 1; + + if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0; + + const TPL_MV_REF *prev_frame_mvs = + cm->tpl_mvs + + ((mi_row + mi_pos.row) >> 1) * (cm->mi_params.mi_stride >> 1) + + ((mi_col + mi_pos.col) >> 1); + if (prev_frame_mvs->mfmv0.as_int == INVALID_MV) return 0; + + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_frame); + + const uint16_t weight_unit = 1; // mi_size_wide[BLOCK_8X8]; + const int cur_frame_index = cm->cur_frame->order_hint; + const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]); + const int frame0_index = buf_0->order_hint; + const int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info, + cur_frame_index, frame0_index); + int idx; + const int allow_high_precision_mv = cm->features.allow_high_precision_mv; + const int force_integer_mv = cm->features.cur_frame_force_integer_mv; + + int_mv this_refmv; + get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, + cur_offset_0, prev_frame_mvs->ref_frame_offset); + lower_mv_precision(&this_refmv.as_mv, allow_high_precision_mv, + force_integer_mv); + + if (rf[1] == NONE_FRAME) { + if (blk_row == 0 && blk_col == 0) { + if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || + abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16) + mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + } + + for (idx = 0; idx < *refmv_count; ++idx) + if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break; + + if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit; + + if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; + ref_mv_weight[idx] = 2 * weight_unit; + ++(*refmv_count); + } + } else { + // Process compound inter mode + const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]); + const int frame1_index = buf_1->order_hint; + const int cur_offset_1 = get_relative_dist(&cm->seq_params.order_hint_info, + cur_frame_index, frame1_index); + int_mv comp_refmv; + get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, + cur_offset_1, prev_frame_mvs->ref_frame_offset); + lower_mv_precision(&comp_refmv.as_mv, allow_high_precision_mv, + force_integer_mv); + + if (blk_row == 0 && blk_col == 0) { + if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || + abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 || + abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 || + abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16) + mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + } + + for (idx = 0; idx < *refmv_count; ++idx) { + if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int && + comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int) + break; + } + + if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit; + + if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; + ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int; + ref_mv_weight[idx] = 2 * weight_unit; + ++(*refmv_count); + } + } + + return 1; +} + +static AOM_INLINE void process_compound_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, + const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2], + int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) { + for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { + MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx]; + + for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) { + if (can_rf == 
rf[cmp_idx] && ref_id_count[cmp_idx] < 2) { + ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx]; + ++ref_id_count[cmp_idx]; + } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) { + int_mv this_mv = candidate->mv[rf_idx]; + if (cm->ref_frame_sign_bias[can_rf] != + cm->ref_frame_sign_bias[rf[cmp_idx]]) { + this_mv.as_mv.row = -this_mv.as_mv.row; + this_mv.as_mv.col = -this_mv.as_mv.col; + } + ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv; + ++ref_diff_count[cmp_idx]; + } + } + } +} + +static AOM_INLINE void process_single_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, + MV_REFERENCE_FRAME ref_frame, uint8_t *const refmv_count, + CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE]) { + for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { + if (candidate->ref_frame[rf_idx] > INTRA_FRAME) { + int_mv this_mv = candidate->mv[rf_idx]; + if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] != + cm->ref_frame_sign_bias[ref_frame]) { + this_mv.as_mv.row = -this_mv.as_mv.row; + this_mv.as_mv.col = -this_mv.as_mv.col; + } + int stack_idx; + for (stack_idx = 0; stack_idx < *refmv_count; ++stack_idx) { + const int_mv stack_mv = ref_mv_stack[stack_idx].this_mv; + if (this_mv.as_int == stack_mv.as_int) break; + } + + if (stack_idx == *refmv_count) { + ref_mv_stack[stack_idx].this_mv = this_mv; + + // TODO(jingning): Set an arbitrary small number here. The weight + // doesn't matter as long as it is properly initialized. + ref_mv_weight[stack_idx] = 2; + ++(*refmv_count); + } + } + } +} + +static AOM_INLINE void setup_ref_mv_list( + const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame, + uint8_t *const refmv_count, + CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE], + int_mv mv_ref_list[MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates, + int mi_row, int mi_col, int16_t *mode_context) { + const int bs = AOMMAX(xd->width, xd->height); + const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs); + MV_REFERENCE_FRAME rf[2]; + + const TileInfo *const tile = &xd->tile; + int max_row_offset = 0, max_col_offset = 0; + const int row_adj = (xd->height < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01); + const int col_adj = (xd->width < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01); + int processed_rows = 0; + int processed_cols = 0; + + av1_set_ref_frame(rf, ref_frame); + mode_context[ref_frame] = 0; + *refmv_count = 0; + + // Find valid maximum row/col offset. + if (xd->up_available) { + max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj; + + if (xd->height < mi_size_high[BLOCK_8X8]) + max_row_offset = -(2 << 1) + row_adj; + + max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset); + } + + if (xd->left_available) { + max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj; + + if (xd->width < mi_size_wide[BLOCK_8X8]) + max_col_offset = -(2 << 1) + col_adj; + + max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset); + } + + uint8_t col_match_count = 0; + uint8_t row_match_count = 0; + uint8_t newmv_count = 0; + + // Scan the first above row mode info. row_offset = -1; + if (abs(max_row_offset) >= 1) + scan_row_mbmi(cm, xd, mi_col, rf, -1, ref_mv_stack, ref_mv_weight, + refmv_count, &row_match_count, &newmv_count, gm_mv_candidates, + max_row_offset, &processed_rows); + // Scan the first left column mode info. 
col_offset = -1; + if (abs(max_col_offset) >= 1) + scan_col_mbmi(cm, xd, mi_row, rf, -1, ref_mv_stack, ref_mv_weight, + refmv_count, &col_match_count, &newmv_count, gm_mv_candidates, + max_col_offset, &processed_cols); + // Check top-right boundary + if (has_tr) + scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->width, ref_mv_stack, + ref_mv_weight, &row_match_count, &newmv_count, + gm_mv_candidates, refmv_count); + + const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0); + const uint8_t nearest_refmv_count = *refmv_count; + + // TODO(yunqing): for comp_search, do it for all 3 cases. + for (int idx = 0; idx < nearest_refmv_count; ++idx) + ref_mv_weight[idx] += REF_CAT_LEVEL; + + if (cm->features.allow_ref_frame_mvs) { + int is_available = 0; + const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->height); + const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->width); + const int blk_row_end = AOMMIN(xd->height, mi_size_high[BLOCK_64X64]); + const int blk_col_end = AOMMIN(xd->width, mi_size_wide[BLOCK_64X64]); + + const int tpl_sample_pos[3][2] = { + { voffset, -2 }, + { voffset, hoffset }, + { voffset - 2, hoffset }, + }; + const int allow_extension = (xd->height >= mi_size_high[BLOCK_8X8]) && + (xd->height < mi_size_high[BLOCK_64X64]) && + (xd->width >= mi_size_wide[BLOCK_8X8]) && + (xd->width < mi_size_wide[BLOCK_64X64]); + + const int step_h = (xd->height >= mi_size_high[BLOCK_64X64]) + ? mi_size_high[BLOCK_16X16] + : mi_size_high[BLOCK_8X8]; + const int step_w = (xd->width >= mi_size_wide[BLOCK_64X64]) + ? mi_size_wide[BLOCK_16X16] + : mi_size_wide[BLOCK_8X8]; + + for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) { + for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) { + int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, + blk_col, gm_mv_candidates, refmv_count, + ref_mv_stack, ref_mv_weight, mode_context); + if (blk_row == 0 && blk_col == 0) is_available = ret; + } + } + + if (is_available == 0) mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + + for (int i = 0; i < 3 && allow_extension; ++i) { + const int blk_row = tpl_sample_pos[i][0]; + const int blk_col = tpl_sample_pos[i][1]; + + if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue; + add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col, + gm_mv_candidates, refmv_count, ref_mv_stack, ref_mv_weight, + mode_context); + } + } + + uint8_t dummy_newmv_count = 0; + + // Scan the second outer area. 
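+  // Note: the top-left neighbor (row -1, col -1) is checked first; the loop
+  // that follows then walks the outer rows/columns at offsets nominally -3
+  // and -5 (idx = 2 .. MVREF_ROW_COLS, adjusted for sub-8x8 blocks), skipping
+  // any ranges already credited through processed_rows/processed_cols.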
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack, ref_mv_weight, + &row_match_count, &dummy_newmv_count, gm_mv_candidates, + refmv_count); + + for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) { + const int row_offset = -(idx << 1) + 1 + row_adj; + const int col_offset = -(idx << 1) + 1 + col_adj; + + if (abs(row_offset) <= abs(max_row_offset) && + abs(row_offset) > processed_rows) + scan_row_mbmi(cm, xd, mi_col, rf, row_offset, ref_mv_stack, ref_mv_weight, + refmv_count, &row_match_count, &dummy_newmv_count, + gm_mv_candidates, max_row_offset, &processed_rows); + + if (abs(col_offset) <= abs(max_col_offset) && + abs(col_offset) > processed_cols) + scan_col_mbmi(cm, xd, mi_row, rf, col_offset, ref_mv_stack, ref_mv_weight, + refmv_count, &col_match_count, &dummy_newmv_count, + gm_mv_candidates, max_col_offset, &processed_cols); + } + + const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0); + + switch (nearest_match) { + case 0: + if (ref_match_count >= 1) mode_context[ref_frame] |= 1; + if (ref_match_count == 1) + mode_context[ref_frame] |= (1 << REFMV_OFFSET); + else if (ref_match_count >= 2) + mode_context[ref_frame] |= (2 << REFMV_OFFSET); + break; + case 1: + mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3; + if (ref_match_count == 1) + mode_context[ref_frame] |= (3 << REFMV_OFFSET); + else if (ref_match_count >= 2) + mode_context[ref_frame] |= (4 << REFMV_OFFSET); + break; + case 2: + default: + if (newmv_count >= 1) + mode_context[ref_frame] |= 4; + else + mode_context[ref_frame] |= 5; + + mode_context[ref_frame] |= (5 << REFMV_OFFSET); + break; + } + + // Rank the likelihood and assign nearest and near mvs. + int len = nearest_refmv_count; + while (len > 0) { + int nr_len = 0; + for (int idx = 1; idx < len; ++idx) { + if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) { + const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1]; + const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1]; + ref_mv_stack[idx - 1] = ref_mv_stack[idx]; + ref_mv_stack[idx] = tmp_mv; + ref_mv_weight[idx - 1] = ref_mv_weight[idx]; + ref_mv_weight[idx] = tmp_ref_mv_weight; + nr_len = idx; + } + } + len = nr_len; + } + + len = *refmv_count; + while (len > nearest_refmv_count) { + int nr_len = nearest_refmv_count; + for (int idx = nearest_refmv_count + 1; idx < len; ++idx) { + if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) { + const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1]; + const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1]; + ref_mv_stack[idx - 1] = ref_mv_stack[idx]; + ref_mv_stack[idx] = tmp_mv; + ref_mv_weight[idx - 1] = ref_mv_weight[idx]; + ref_mv_weight[idx] = tmp_ref_mv_weight; + nr_len = idx; + } + } + len = nr_len; + } + + int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->width); + mi_width = AOMMIN(mi_width, cm->mi_params.mi_cols - mi_col); + int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->height); + mi_height = AOMMIN(mi_height, cm->mi_params.mi_rows - mi_row); + const int mi_size = AOMMIN(mi_width, mi_height); + if (rf[1] > NONE_FRAME) { + // TODO(jingning, yunqing): Refactor and consolidate the compound and + // single reference frame modes. Reduce unnecessary redundancy. 
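+    // When fewer than MAX_MV_REF_CANDIDATES (2) candidates were found, the
+    // stack below is padded in preference order: MVs whose references match
+    // rf[] exactly (ref_id), then sign-adjusted MVs from other inter
+    // references (ref_diff), and finally the global-motion candidates.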
+ if (*refmv_count < MAX_MV_REF_CANDIDATES) { + int_mv ref_id[2][2], ref_diff[2][2]; + int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 }; + + for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) { + const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; + process_compound_ref_mv_candidate( + candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count); + idx += mi_size_wide[candidate->sb_type]; + } + + for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) { + const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; + process_compound_ref_mv_candidate( + candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count); + idx += mi_size_high[candidate->sb_type]; + } + + // Build up the compound mv predictor + int_mv comp_list[MAX_MV_REF_CANDIDATES][2]; + + for (int idx = 0; idx < 2; ++idx) { + int comp_idx = 0; + for (int list_idx = 0; + list_idx < ref_id_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES; + ++list_idx, ++comp_idx) + comp_list[comp_idx][idx] = ref_id[idx][list_idx]; + for (int list_idx = 0; + list_idx < ref_diff_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES; + ++list_idx, ++comp_idx) + comp_list[comp_idx][idx] = ref_diff[idx][list_idx]; + for (; comp_idx < MAX_MV_REF_CANDIDATES; ++comp_idx) + comp_list[comp_idx][idx] = gm_mv_candidates[idx]; + } + + if (*refmv_count) { + assert(*refmv_count == 1); + if (comp_list[0][0].as_int == ref_mv_stack[0].this_mv.as_int && + comp_list[0][1].as_int == ref_mv_stack[0].comp_mv.as_int) { + ref_mv_stack[*refmv_count].this_mv = comp_list[1][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[1][1]; + } else { + ref_mv_stack[*refmv_count].this_mv = comp_list[0][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[0][1]; + } + ref_mv_weight[*refmv_count] = 2; + ++*refmv_count; + } else { + for (int idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) { + ref_mv_stack[*refmv_count].this_mv = comp_list[idx][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[idx][1]; + ref_mv_weight[*refmv_count] = 2; + ++*refmv_count; + } + } + } + + assert(*refmv_count >= 2); + + for (int idx = 0; idx < *refmv_count; ++idx) { + clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); + clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv, xd->width << MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); + } + } else { + // Handle single reference frame extension + for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size && + *refmv_count < MAX_MV_REF_CANDIDATES;) { + const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; + process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, + ref_mv_stack, ref_mv_weight); + idx += mi_size_wide[candidate->sb_type]; + } + + for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size && + *refmv_count < MAX_MV_REF_CANDIDATES;) { + const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; + process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, + ref_mv_stack, ref_mv_weight); + idx += mi_size_high[candidate->sb_type]; + } + + for (int idx = 0; idx < *refmv_count; ++idx) { + clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); + } + + if (mv_ref_list != NULL) { + for (int idx = *refmv_count; idx < MAX_MV_REF_CANDIDATES; ++idx) + mv_ref_list[idx].as_int = gm_mv_candidates[0].as_int; + + for (int idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *refmv_count); + ++idx) { + mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int; + 
} + } + } +} + +void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, + MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + uint8_t ref_mv_count[MODE_CTX_REF_FRAMES], + CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE], + int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], + int_mv *global_mvs, int16_t *mode_context) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int_mv gm_mv[2]; + + if (ref_frame == INTRA_FRAME) { + gm_mv[0].as_int = gm_mv[1].as_int = 0; + if (global_mvs != NULL) { + global_mvs[ref_frame].as_int = INVALID_MV; + } + } else { + const BLOCK_SIZE bsize = mi->sb_type; + const int allow_high_precision_mv = cm->features.allow_high_precision_mv; + const int force_integer_mv = cm->features.cur_frame_force_integer_mv; + if (ref_frame < REF_FRAMES) { + gm_mv[0] = gm_get_motion_vector(&cm->global_motion[ref_frame], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + gm_mv[1].as_int = 0; + if (global_mvs != NULL) global_mvs[ref_frame] = gm_mv[0]; + } else { + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_frame); + gm_mv[0] = gm_get_motion_vector(&cm->global_motion[rf[0]], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + gm_mv[1] = gm_get_motion_vector(&cm->global_motion[rf[1]], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + } + } + + setup_ref_mv_list(cm, xd, ref_frame, &ref_mv_count[ref_frame], + ref_mv_stack[ref_frame], ref_mv_weight[ref_frame], + mv_ref_list ? mv_ref_list[ref_frame] : NULL, gm_mv, mi_row, + mi_col, mode_context); +} + +void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, + int_mv *near_mv, int is_integer) { + int i; + // Make sure all the candidates are properly clamped etc + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { + lower_mv_precision(&mvlist[i].as_mv, allow_hp, is_integer); + } + *nearest_mv = mvlist[0]; + *near_mv = mvlist[1]; +} + +void av1_setup_frame_buf_refs(AV1_COMMON *cm) { + cm->cur_frame->order_hint = cm->current_frame.order_hint; + cm->cur_frame->display_order_hint = cm->current_frame.display_order_hint; + + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (buf != NULL) { + cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME] = buf->order_hint; + cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME] = + buf->display_order_hint; + } + } +} + +void av1_setup_frame_sign_bias(AV1_COMMON *cm) { + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (cm->seq_params.order_hint_info.enable_order_hint && buf != NULL) { + const int ref_order_hint = buf->order_hint; + cm->ref_frame_sign_bias[ref_frame] = + (get_relative_dist(&cm->seq_params.order_hint_info, ref_order_hint, + (int)cm->current_frame.order_hint) <= 0) + ? 0 + : 1; + } else { + cm->ref_frame_sign_bias[ref_frame] = 0; + } + } +} + +#define MAX_OFFSET_WIDTH 64 +#define MAX_OFFSET_HEIGHT 0 + +static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row, + int blk_col, MV mv, int sign_bias) { + const int base_blk_row = (blk_row >> 3) << 3; + const int base_blk_col = (blk_col >> 3) << 3; + + const int row_offset = (mv.row >= 0) ? (mv.row >> (4 + MI_SIZE_LOG2)) + : -((-mv.row) >> (4 + MI_SIZE_LOG2)); + + const int col_offset = (mv.col >= 0) ? 
(mv.col >> (4 + MI_SIZE_LOG2))
+                         : -((-mv.col) >> (4 + MI_SIZE_LOG2));
+
+  const int row =
+      (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset;
+  const int col =
+      (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
+
+  if (row < 0 || row >= (cm->mi_params.mi_rows >> 1) || col < 0 ||
+      col >= (cm->mi_params.mi_cols >> 1))
+    return 0;
+
+  if (row < base_blk_row - (MAX_OFFSET_HEIGHT >> 3) ||
+      row >= base_blk_row + 8 + (MAX_OFFSET_HEIGHT >> 3) ||
+      col < base_blk_col - (MAX_OFFSET_WIDTH >> 3) ||
+      col >= base_blk_col + 8 + (MAX_OFFSET_WIDTH >> 3))
+    return 0;
+
+  *mi_r = row;
+  *mi_c = col;
+
+  return 1;
+}
+
+// Note: motion_field_projection finds the motion vectors of the current
+// frame's reference frame and projects them onto the current frame. To make
+// this clear, call the current frame's reference frame the start frame, call
+// the start frame's own reference frames simply reference frames, and call
+// ref_offset the frame distance between the start frame and its reference
+// frames.
+static int motion_field_projection(AV1_COMMON *cm,
+                                   MV_REFERENCE_FRAME start_frame, int dir) {
+  TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
+  int ref_offset[REF_FRAMES] = { 0 };
+
+  const RefCntBuffer *const start_frame_buf =
+      get_ref_frame_buf(cm, start_frame);
+  if (start_frame_buf == NULL) return 0;
+
+  if (start_frame_buf->frame_type == KEY_FRAME ||
+      start_frame_buf->frame_type == INTRA_ONLY_FRAME)
+    return 0;
+
+  if (start_frame_buf->mi_rows != cm->mi_params.mi_rows ||
+      start_frame_buf->mi_cols != cm->mi_params.mi_cols)
+    return 0;
+
+  const int start_frame_order_hint = start_frame_buf->order_hint;
+  const unsigned int *const ref_order_hints =
+      &start_frame_buf->ref_order_hints[0];
+  const int cur_order_hint = cm->cur_frame->order_hint;
+  int start_to_current_frame_offset = get_relative_dist(
+      &cm->seq_params.order_hint_info, start_frame_order_hint, cur_order_hint);
+
+  for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) {
+    ref_offset[rf] = get_relative_dist(&cm->seq_params.order_hint_info,
+                                       start_frame_order_hint,
+                                       ref_order_hints[rf - LAST_FRAME]);
+  }
+
+  if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset;
+
+  MV_REF *mv_ref_base = start_frame_buf->mvs;
+  const int mvs_rows = (cm->mi_params.mi_rows + 1) >> 1;
+  const int mvs_cols = (cm->mi_params.mi_cols + 1) >> 1;
+
+  for (int blk_row = 0; blk_row < mvs_rows; ++blk_row) {
+    for (int blk_col = 0; blk_col < mvs_cols; ++blk_col) {
+      MV_REF *mv_ref = &mv_ref_base[blk_row * mvs_cols + blk_col];
+      MV fwd_mv = mv_ref->mv.as_mv;
+
+      if (mv_ref->ref_frame > INTRA_FRAME) {
+        int_mv this_mv;
+        int mi_r, mi_c;
+        const int ref_frame_offset = ref_offset[mv_ref->ref_frame];
+
+        int pos_valid =
+            abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
+            ref_frame_offset > 0 &&
+            abs(start_to_current_frame_offset) <= MAX_FRAME_DISTANCE;
+
+        if (pos_valid) {
+          get_mv_projection(&this_mv.as_mv, fwd_mv,
+                            start_to_current_frame_offset, ref_frame_offset);
+          pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
+                                         this_mv.as_mv, dir >> 1);
+        }
+
+        if (pos_valid) {
+          const int mi_offset = mi_r * (cm->mi_params.mi_stride >> 1) + mi_c;
+
+          tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row;
+          tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col;
+          tpl_mvs_base[mi_offset].ref_frame_offset = ref_frame_offset;
+        }
+      }
+    }
+  }
+
+  return 1;
+}
+
+void av1_setup_motion_field(AV1_COMMON *cm) {
+  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
+
+  memset(cm->ref_frame_side, 0,
sizeof(cm->ref_frame_side)); + if (!order_hint_info->enable_order_hint) return; + + TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; + int size = ((cm->mi_params.mi_rows + MAX_MIB_SIZE) >> 1) * + (cm->mi_params.mi_stride >> 1); + for (int idx = 0; idx < size; ++idx) { + tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV; + tpl_mvs_base[idx].ref_frame_offset = 0; + } + + const int cur_order_hint = cm->cur_frame->order_hint; + + const RefCntBuffer *ref_buf[INTER_REFS_PER_FRAME]; + int ref_order_hint[INTER_REFS_PER_FRAME]; + + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + const int ref_idx = ref_frame - LAST_FRAME; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + int order_hint = 0; + + if (buf != NULL) order_hint = buf->order_hint; + + ref_buf[ref_idx] = buf; + ref_order_hint[ref_idx] = order_hint; + + if (get_relative_dist(order_hint_info, order_hint, cur_order_hint) > 0) + cm->ref_frame_side[ref_frame] = 1; + else if (order_hint == cur_order_hint) + cm->ref_frame_side[ref_frame] = -1; + } + + int ref_stamp = MFMV_STACK_SIZE - 1; + + if (ref_buf[LAST_FRAME - LAST_FRAME] != NULL) { + const int alt_of_lst_order_hint = + ref_buf[LAST_FRAME - LAST_FRAME] + ->ref_order_hints[ALTREF_FRAME - LAST_FRAME]; + + const int is_lst_overlay = + (alt_of_lst_order_hint == ref_order_hint[GOLDEN_FRAME - LAST_FRAME]); + if (!is_lst_overlay) motion_field_projection(cm, LAST_FRAME, 2); + --ref_stamp; + } + + if (get_relative_dist(order_hint_info, + ref_order_hint[BWDREF_FRAME - LAST_FRAME], + cur_order_hint) > 0) { + if (motion_field_projection(cm, BWDREF_FRAME, 0)) --ref_stamp; + } + + if (get_relative_dist(order_hint_info, + ref_order_hint[ALTREF2_FRAME - LAST_FRAME], + cur_order_hint) > 0) { + if (motion_field_projection(cm, ALTREF2_FRAME, 0)) --ref_stamp; + } + + if (get_relative_dist(order_hint_info, + ref_order_hint[ALTREF_FRAME - LAST_FRAME], + cur_order_hint) > 0 && + ref_stamp >= 0) + if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp; + + if (ref_stamp >= 0) motion_field_projection(cm, LAST2_FRAME, 2); +} + +static INLINE void record_samples(const MB_MODE_INFO *mbmi, int *pts, + int *pts_inref, int row_offset, int sign_r, + int col_offset, int sign_c) { + int bw = block_size_wide[mbmi->sb_type]; + int bh = block_size_high[mbmi->sb_type]; + int x = col_offset * MI_SIZE + sign_c * AOMMAX(bw, MI_SIZE) / 2 - 1; + int y = row_offset * MI_SIZE + sign_r * AOMMAX(bh, MI_SIZE) / 2 - 1; + + pts[0] = GET_MV_SUBPEL(x); + pts[1] = GET_MV_SUBPEL(y); + pts_inref[0] = GET_MV_SUBPEL(x) + mbmi->mv[0].as_mv.col; + pts_inref[1] = GET_MV_SUBPEL(y) + mbmi->mv[0].as_mv.row; +} + +// Select samples according to the motion vector difference. +uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len, + BLOCK_SIZE bsize) { + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int thresh = clamp(AOMMAX(bw, bh), 16, 112); + int pts_mvd[SAMPLES_ARRAY_SIZE] = { 0 }; + int i, j, k, l = len; + uint8_t ret = 0; + assert(len <= LEAST_SQUARES_SAMPLES_MAX); + + // Obtain the motion vector difference. + for (i = 0; i < len; ++i) { + pts_mvd[i] = abs(pts_inref[2 * i] - pts[2 * i] - mv->col) + + abs(pts_inref[2 * i + 1] - pts[2 * i + 1] - mv->row); + + if (pts_mvd[i] > thresh) + pts_mvd[i] = -1; + else + ret++; + } + + // Keep at least 1 sample. 
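+  // (If every sample exceeded the threshold, fall back to keeping just the
+  // first sample; the sample arrays are left untouched in that case.)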
+  if (!ret) return 1;
+
+  i = 0;
+  j = l - 1;
+  for (k = 0; k < l - ret; k++) {
+    while (pts_mvd[i] != -1) i++;
+    while (pts_mvd[j] == -1) j--;
+    assert(i != j);
+    if (i > j) break;
+
+    // Replace the discarded samples.
+    pts_mvd[i] = pts_mvd[j];
+    pts[2 * i] = pts[2 * j];
+    pts[2 * i + 1] = pts[2 * j + 1];
+    pts_inref[2 * i] = pts_inref[2 * j];
+    pts_inref[2 * i + 1] = pts_inref[2 * j + 1];
+    i++;
+    j--;
+  }
+
+  return ret;
+}
+
+// Note: samples returned are at 1/8-pel precision.
+// Samples are the neighbor blocks' center-point coordinates relative to the
+// top-left pixel of the current block.
+uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
+                        int *pts_inref) {
+  const MB_MODE_INFO *const mbmi0 = xd->mi[0];
+  const int ref_frame = mbmi0->ref_frame[0];
+  const int up_available = xd->up_available;
+  const int left_available = xd->left_available;
+  int i, mi_step;
+  uint8_t np = 0;
+  int do_tl = 1;
+  int do_tr = 1;
+  const int mi_stride = xd->mi_stride;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  // scan the nearest above rows
+  if (up_available) {
+    const int mi_row_offset = -1;
+    const MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * mi_stride];
+    uint8_t superblock_width = mi_size_wide[mbmi->sb_type];
+
+    if (xd->width <= superblock_width) {
+      // Handle "current block width <= above block width" case.
+      const int col_offset = -mi_col % superblock_width;
+
+      if (col_offset < 0) do_tl = 0;
+      if (col_offset + superblock_width > xd->width) do_tr = 0;
+
+      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+        record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
+        pts += 2;
+        pts_inref += 2;
+        np++;
+        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+      }
+    } else {
+      // Handle "current block width > above block width" case.
+      for (i = 0; i < AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
+           i += mi_step) {
+        mbmi = xd->mi[i + mi_row_offset * mi_stride];
+        superblock_width = mi_size_wide[mbmi->sb_type];
+        mi_step = AOMMIN(xd->width, superblock_width);
+
+        if (mbmi->ref_frame[0] == ref_frame &&
+            mbmi->ref_frame[1] == NONE_FRAME) {
+          record_samples(mbmi, pts, pts_inref, 0, -1, i, 1);
+          pts += 2;
+          pts_inref += 2;
+          np++;
+          if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+        }
+      }
+    }
+  }
+  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+  // scan the nearest left columns
+  if (left_available) {
+    const int mi_col_offset = -1;
+    const MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
+    uint8_t superblock_height = mi_size_high[mbmi->sb_type];
+
+    if (xd->height <= superblock_height) {
+      // Handle "current block height <= left block height" case.
+      const int row_offset = -mi_row % superblock_height;
+
+      if (row_offset < 0) do_tl = 0;
+
+      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+        record_samples(mbmi, pts, pts_inref, row_offset, 1, 0, -1);
+        pts += 2;
+        pts_inref += 2;
+        np++;
+        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+      }
+    } else {
+      // Handle "current block height > left block height" case.
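+      // Several left neighbors may span this block; visit each one in turn,
+      // stepping by that neighbor's height (this mirrors the above-row scan).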
+ for (i = 0; i < AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row); + i += mi_step) { + mbmi = xd->mi[mi_col_offset + i * mi_stride]; + superblock_height = mi_size_high[mbmi->sb_type]; + mi_step = AOMMIN(xd->height, superblock_height); + + if (mbmi->ref_frame[0] == ref_frame && + mbmi->ref_frame[1] == NONE_FRAME) { + record_samples(mbmi, pts, pts_inref, i, 1, 0, -1); + pts += 2; + pts_inref += 2; + np++; + if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; + } + } + } + } + assert(np <= LEAST_SQUARES_SAMPLES_MAX); + + // Top-left block + if (do_tl && left_available && up_available) { + const int mi_row_offset = -1; + const int mi_col_offset = -1; + MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * mi_stride]; + + if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { + record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1); + pts += 2; + pts_inref += 2; + np++; + if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; + } + } + assert(np <= LEAST_SQUARES_SAMPLES_MAX); + + // Top-right block + if (do_tr && + has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->width, xd->height))) { + const POSITION trb_pos = { -1, xd->width }; + const TileInfo *const tile = &xd->tile; + if (is_inside(tile, mi_col, mi_row, &trb_pos)) { + const int mi_row_offset = -1; + const int mi_col_offset = xd->width; + const MB_MODE_INFO *mbmi = + xd->mi[mi_col_offset + mi_row_offset * mi_stride]; + + if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { + record_samples(mbmi, pts, pts_inref, 0, -1, xd->width, 1); + np++; + if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; + } + } + } + assert(np <= LEAST_SQUARES_SAMPLES_MAX); + + return np; +} + +void av1_setup_skip_mode_allowed(AV1_COMMON *cm) { + const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info; + SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; + + skip_mode_info->skip_mode_allowed = 0; + skip_mode_info->ref_frame_idx_0 = INVALID_IDX; + skip_mode_info->ref_frame_idx_1 = INVALID_IDX; + + if (!order_hint_info->enable_order_hint || frame_is_intra_only(cm) || + cm->current_frame.reference_mode == SINGLE_REFERENCE) + return; + + const int cur_order_hint = cm->current_frame.order_hint; + int ref_order_hints[2] = { -1, INT_MAX }; + int ref_idx[2] = { INVALID_IDX, INVALID_IDX }; + + // Identify the nearest forward and backward references. 
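+  // A reference whose order hint precedes the current frame's is a forward
+  // reference; one that follows it is a backward reference. For example, with
+  // cur_order_hint == 10 and references at order hints 8, 9 and 12, the pair
+  // selected below is (9, 12): the nearest reference on each side.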
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i); + if (buf == NULL) continue; + + const int ref_order_hint = buf->order_hint; + if (get_relative_dist(order_hint_info, ref_order_hint, cur_order_hint) < + 0) { + // Forward reference + if (ref_order_hints[0] == -1 || + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[0]) > 0) { + ref_order_hints[0] = ref_order_hint; + ref_idx[0] = i; + } + } else if (get_relative_dist(order_hint_info, ref_order_hint, + cur_order_hint) > 0) { + // Backward reference + if (ref_order_hints[1] == INT_MAX || + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[1]) < 0) { + ref_order_hints[1] = ref_order_hint; + ref_idx[1] = i; + } + } + } + + if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) { + // == Bi-directional prediction == + skip_mode_info->skip_mode_allowed = 1; + skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); + skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); + } else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) { + // == Forward prediction only == + // Identify the second nearest forward reference. + ref_order_hints[1] = -1; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i); + if (buf == NULL) continue; + + const int ref_order_hint = buf->order_hint; + if ((ref_order_hints[0] != -1 && + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[0]) < 0) && + (ref_order_hints[1] == -1 || + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[1]) > 0)) { + // Second closest forward reference + ref_order_hints[1] = ref_order_hint; + ref_idx[1] = i; + } + } + if (ref_order_hints[1] != -1) { + skip_mode_info->skip_mode_allowed = 1; + skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); + skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); + } + } +} + +typedef struct { + int map_idx; // frame map index + RefCntBuffer *buf; // frame buffer + int sort_idx; // index based on the offset to be used for sorting +} REF_FRAME_INFO; + +// Compares the sort_idx fields. If they are equal, then compares the map_idx +// fields to break the tie. This ensures a stable sort. 
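+// (qsort() itself makes no ordering guarantee for equal keys, so without the
+// map_idx tie-break the final reference assignment could vary across libc
+// implementations.)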
+static int compare_ref_frame_info(const void *arg_a, const void *arg_b) { + const REF_FRAME_INFO *info_a = (REF_FRAME_INFO *)arg_a; + const REF_FRAME_INFO *info_b = (REF_FRAME_INFO *)arg_b; + + const int sort_idx_diff = info_a->sort_idx - info_b->sort_idx; + if (sort_idx_diff != 0) return sort_idx_diff; + return info_a->map_idx - info_b->map_idx; +} + +static AOM_INLINE void set_ref_frame_info(int *remapped_ref_idx, int frame_idx, + REF_FRAME_INFO *ref_info) { + assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME); + + remapped_ref_idx[frame_idx] = ref_info->map_idx; +} + +void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, + int lst_map_idx, int gld_map_idx) { + int lst_frame_sort_idx = -1; + int gld_frame_sort_idx = -1; + + assert(cm->seq_params.order_hint_info.enable_order_hint); + assert(cm->seq_params.order_hint_info.order_hint_bits_minus_1 >= 0); + const int cur_order_hint = (int)cm->current_frame.order_hint; + const int cur_frame_sort_idx = + 1 << cm->seq_params.order_hint_info.order_hint_bits_minus_1; + + REF_FRAME_INFO ref_frame_info[REF_FRAMES]; + int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 }; + + for (int i = 0; i < REF_FRAMES; ++i) { + const int map_idx = i; + + ref_frame_info[i].map_idx = map_idx; + ref_frame_info[i].sort_idx = -1; + + RefCntBuffer *const buf = cm->ref_frame_map[map_idx]; + ref_frame_info[i].buf = buf; + + if (buf == NULL) continue; + // If this assertion fails, there is a reference leak. + assert(buf->ref_count > 0); + + const int offset = (int)buf->order_hint; + ref_frame_info[i].sort_idx = + (offset == -1) ? -1 + : cur_frame_sort_idx + + get_relative_dist(&cm->seq_params.order_hint_info, + offset, cur_order_hint); + assert(ref_frame_info[i].sort_idx >= -1); + + if (map_idx == lst_map_idx) lst_frame_sort_idx = ref_frame_info[i].sort_idx; + if (map_idx == gld_map_idx) gld_frame_sort_idx = ref_frame_info[i].sort_idx; + } + + // Confirm both LAST_FRAME and GOLDEN_FRAME are valid forward reference + // frames. + if (lst_frame_sort_idx == -1 || lst_frame_sort_idx >= cur_frame_sort_idx) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests a look-ahead frame as LAST"); + } + if (gld_frame_sort_idx == -1 || gld_frame_sort_idx >= cur_frame_sort_idx) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests a look-ahead frame as GOLDEN"); + } + + // Sort ref frames based on their frame_offset values. + qsort(ref_frame_info, REF_FRAMES, sizeof(REF_FRAME_INFO), + compare_ref_frame_info); + + // Identify forward and backward reference frames. 
+ // Forward reference: offset < order_hint + // Backward reference: offset >= order_hint + int fwd_start_idx = 0, fwd_end_idx = REF_FRAMES - 1; + + for (int i = 0; i < REF_FRAMES; i++) { + if (ref_frame_info[i].sort_idx == -1) { + fwd_start_idx++; + continue; + } + + if (ref_frame_info[i].sort_idx >= cur_frame_sort_idx) { + fwd_end_idx = i - 1; + break; + } + } + + int bwd_start_idx = fwd_end_idx + 1; + int bwd_end_idx = REF_FRAMES - 1; + + // === Backward Reference Frames === + + // == ALTREF_FRAME == + if (bwd_start_idx <= bwd_end_idx) { + set_ref_frame_info(remapped_ref_idx, ALTREF_FRAME - LAST_FRAME, + &ref_frame_info[bwd_end_idx]); + ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1; + bwd_end_idx--; + } + + // == BWDREF_FRAME == + if (bwd_start_idx <= bwd_end_idx) { + set_ref_frame_info(remapped_ref_idx, BWDREF_FRAME - LAST_FRAME, + &ref_frame_info[bwd_start_idx]); + ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1; + bwd_start_idx++; + } + + // == ALTREF2_FRAME == + if (bwd_start_idx <= bwd_end_idx) { + set_ref_frame_info(remapped_ref_idx, ALTREF2_FRAME - LAST_FRAME, + &ref_frame_info[bwd_start_idx]); + ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1; + } + + // === Forward Reference Frames === + + for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) { + // == LAST_FRAME == + if (ref_frame_info[i].map_idx == lst_map_idx) { + set_ref_frame_info(remapped_ref_idx, LAST_FRAME - LAST_FRAME, + &ref_frame_info[i]); + ref_flag_list[LAST_FRAME - LAST_FRAME] = 1; + } + + // == GOLDEN_FRAME == + if (ref_frame_info[i].map_idx == gld_map_idx) { + set_ref_frame_info(remapped_ref_idx, GOLDEN_FRAME - LAST_FRAME, + &ref_frame_info[i]); + ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1; + } + } + + assert(ref_flag_list[LAST_FRAME - LAST_FRAME] == 1 && + ref_flag_list[GOLDEN_FRAME - LAST_FRAME] == 1); + + // == LAST2_FRAME == + // == LAST3_FRAME == + // == BWDREF_FRAME == + // == ALTREF2_FRAME == + // == ALTREF_FRAME == + + // Set up the reference frames in the anti-chronological order. + static const MV_REFERENCE_FRAME ref_frame_list[INTER_REFS_PER_FRAME - 2] = { + LAST2_FRAME, LAST3_FRAME, BWDREF_FRAME, ALTREF2_FRAME, ALTREF_FRAME + }; + + int ref_idx; + for (ref_idx = 0; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) { + const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx]; + + if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue; + + while (fwd_start_idx <= fwd_end_idx && + (ref_frame_info[fwd_end_idx].map_idx == lst_map_idx || + ref_frame_info[fwd_end_idx].map_idx == gld_map_idx)) { + fwd_end_idx--; + } + if (fwd_start_idx > fwd_end_idx) break; + + set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME, + &ref_frame_info[fwd_end_idx]); + ref_flag_list[ref_frame - LAST_FRAME] = 1; + + fwd_end_idx--; + } + + // Assign all the remaining frame(s), if any, to the earliest reference + // frame. + for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) { + const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx]; + if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue; + set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME, + &ref_frame_info[fwd_start_idx]); + ref_flag_list[ref_frame - LAST_FRAME] = 1; + } + + for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { + assert(ref_flag_list[i] == 1); + } +} diff --git a/libs/libaom/src/av1/common/mvref_common.h b/libs/libaom/src/av1/common/mvref_common.h new file mode 100644 index 000000000..05a0dbc04 --- /dev/null +++ b/libs/libaom/src/av1/common/mvref_common.h @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_MVREF_COMMON_H_ +#define AOM_AV1_COMMON_MVREF_COMMON_H_ + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MVREF_ROW_COLS 3 + +// Set the upper limit of the motion vector component magnitude. +// This would make a motion vector fit in 26 bits. Plus 3 bits for the +// reference frame index. A tuple of motion vector can hence be stored within +// 32 bit range for efficient load/store operations. +#define REFMVS_LIMIT ((1 << 12) - 1) + +typedef struct position { + int row; + int col; +} POSITION; + +// clamp_mv_ref +#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units + +static INLINE int get_relative_dist(const OrderHintInfo *oh, int a, int b) { + if (!oh->enable_order_hint) return 0; + + const int bits = oh->order_hint_bits_minus_1 + 1; + + assert(bits >= 1); + assert(a >= 0 && a < (1 << bits)); + assert(b >= 0 && b < (1 << bits)); + + int diff = a - b; + const int m = 1 << (bits - 1); + diff = (diff & (m - 1)) - (diff & m); + return diff; +} + +static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) { + const SubpelMvLimits mv_limits = { + xd->mb_to_left_edge - GET_MV_SUBPEL(bw) - MV_BORDER, + xd->mb_to_right_edge + GET_MV_SUBPEL(bw) + MV_BORDER, + xd->mb_to_top_edge - GET_MV_SUBPEL(bh) - MV_BORDER, + xd->mb_to_bottom_edge + GET_MV_SUBPEL(bh) + MV_BORDER + }; + clamp_mv(mv, &mv_limits); +} + +static INLINE int_mv get_block_mv(const MB_MODE_INFO *candidate, int which_mv) { + return candidate->mv[which_mv]; +} + +// Checks that the given mi_row, mi_col and search point +// are inside the borders of the tile. +static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row, + const POSITION *mi_pos) { + return !(mi_row + mi_pos->row < tile->mi_row_start || + mi_col + mi_pos->col < tile->mi_col_start || + mi_row + mi_pos->row >= tile->mi_row_end || + mi_col + mi_pos->col >= tile->mi_col_end); +} + +static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row, + int row_offset) { + return clamp(row_offset, tile->mi_row_start - mi_row, + tile->mi_row_end - mi_row - 1); +} + +static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col, + int col_offset) { + return clamp(col_offset, tile->mi_col_start - mi_col, + tile->mi_col_end - mi_col - 1); +} + +static INLINE void lower_mv_precision(MV *mv, int allow_hp, int is_integer) { + if (is_integer) { + integer_mv_precision(mv); + } else { + if (!allow_hp) { + if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1); + if (mv->col & 1) mv->col += (mv->col > 0 ? 
-1 : 1); + } + } +} + +static INLINE int8_t get_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) { + // Single ref pred + if (rf[1] <= INTRA_FRAME) return -1; + + // Bi-directional comp ref pred + if ((rf[0] < BWDREF_FRAME) && (rf[1] >= BWDREF_FRAME)) return -1; + + for (int8_t ref_idx = 0; ref_idx < TOTAL_UNIDIR_COMP_REFS; ++ref_idx) { + if (rf[0] == comp_ref0(ref_idx) && rf[1] == comp_ref1(ref_idx)) + return ref_idx; + } + return -1; +} + +static INLINE int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) { + if (rf[1] > INTRA_FRAME) { + const int8_t uni_comp_ref_idx = get_uni_comp_ref_idx(rf); + if (uni_comp_ref_idx >= 0) { + assert((REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx) < + MODE_CTX_REF_FRAMES); + return REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx; + } else { + return REF_FRAMES + FWD_RF_OFFSET(rf[0]) + + BWD_RF_OFFSET(rf[1]) * FWD_REFS; + } + } + + return rf[0]; +} + +// clang-format off +static MV_REFERENCE_FRAME ref_frame_map[TOTAL_COMP_REFS][2] = { + { LAST_FRAME, BWDREF_FRAME }, { LAST2_FRAME, BWDREF_FRAME }, + { LAST3_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, BWDREF_FRAME }, + + { LAST_FRAME, ALTREF2_FRAME }, { LAST2_FRAME, ALTREF2_FRAME }, + { LAST3_FRAME, ALTREF2_FRAME }, { GOLDEN_FRAME, ALTREF2_FRAME }, + + { LAST_FRAME, ALTREF_FRAME }, { LAST2_FRAME, ALTREF_FRAME }, + { LAST3_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME }, + + { LAST_FRAME, LAST2_FRAME }, { LAST_FRAME, LAST3_FRAME }, + { LAST_FRAME, GOLDEN_FRAME }, { BWDREF_FRAME, ALTREF_FRAME }, + + // NOTE: Following reference frame pairs are not supported to be explicitly + // signalled, but they are possibly chosen by the use of skip_mode, + // which may use the most recent one-sided reference frame pair. + { LAST2_FRAME, LAST3_FRAME }, { LAST2_FRAME, GOLDEN_FRAME }, + { LAST3_FRAME, GOLDEN_FRAME }, {BWDREF_FRAME, ALTREF2_FRAME}, + { ALTREF2_FRAME, ALTREF_FRAME } +}; +// clang-format on + +static INLINE void av1_set_ref_frame(MV_REFERENCE_FRAME *rf, + MV_REFERENCE_FRAME ref_frame_type) { + if (ref_frame_type >= REF_FRAMES) { + rf[0] = ref_frame_map[ref_frame_type - REF_FRAMES][0]; + rf[1] = ref_frame_map[ref_frame_type - REF_FRAMES][1]; + } else { + assert(ref_frame_type > NONE_FRAME); + rf[0] = ref_frame_type; + rf[1] = NONE_FRAME; + } +} + +static uint16_t compound_mode_ctx_map[3][COMP_NEWMV_CTXS] = { + { 0, 1, 1, 1, 1 }, + { 1, 2, 3, 4, 4 }, + { 4, 4, 5, 6, 7 }, +}; + +static INLINE int16_t av1_mode_context_analyzer( + const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf) { + const int8_t ref_frame = av1_ref_frame_type(rf); + + if (rf[1] <= INTRA_FRAME) return mode_context[ref_frame]; + + const int16_t newmv_ctx = mode_context[ref_frame] & NEWMV_CTX_MASK; + const int16_t refmv_ctx = + (mode_context[ref_frame] >> REFMV_OFFSET) & REFMV_CTX_MASK; + + const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN( + newmv_ctx, COMP_NEWMV_CTXS - 1)]; + return comp_ctx; +} + +static INLINE uint8_t av1_drl_ctx(const uint16_t *ref_mv_weight, int ref_idx) { + if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL && + ref_mv_weight[ref_idx + 1] >= REF_CAT_LEVEL) + return 0; + + if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL && + ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL) + return 1; + + if (ref_mv_weight[ref_idx] < REF_CAT_LEVEL && + ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL) + return 2; + + return 0; +} + +void av1_setup_frame_buf_refs(AV1_COMMON *cm); +void av1_setup_frame_sign_bias(AV1_COMMON *cm); +void av1_setup_skip_mode_allowed(AV1_COMMON *cm); +void 
av1_setup_motion_field(AV1_COMMON *cm);
+void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
+                        int lst_map_idx, int gld_map_idx);
+
+static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
+  av1_zero(xd->neighbors_ref_counts);
+
+  uint8_t *const ref_counts = xd->neighbors_ref_counts;
+
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Above neighbor
+  if (above_in_image && is_inter_block(above_mbmi)) {
+    ref_counts[above_mbmi->ref_frame[0]]++;
+    if (has_second_ref(above_mbmi)) {
+      ref_counts[above_mbmi->ref_frame[1]]++;
+    }
+  }
+
+  // Left neighbor
+  if (left_in_image && is_inter_block(left_mbmi)) {
+    ref_counts[left_mbmi->ref_frame[0]]++;
+    if (has_second_ref(left_mbmi)) {
+      ref_counts[left_mbmi->ref_frame[1]]++;
+    }
+  }
+}
+
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+                        const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+                        int x_mis, int y_mis);
+
+// The global_mvs output parameter points to an array of REF_FRAMES elements.
+// The caller may pass a null global_mvs if it does not need the global_mvs
+// output.
+void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                      MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
+                      CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+                      uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE],
+                      int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
+                      int_mv *global_mvs, int16_t *mode_context);
+
+// Checks a list of motion vectors by SAD score, using a number of rows of
+// pixels above and a number of columns of pixels to the left, to select the
+// one with the best score to use as the reference motion vector.
+void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
+                           int_mv *near_mv, int is_integer);
+
+uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len,
+                          BLOCK_SIZE bsize);
+uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
+                        int *pts_inref);
+
+#define INTRABC_DELAY_PIXELS 256  // Delay of 256 pixels
+#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
+
+static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile,
+                                   int mib_size, int mi_row) {
+  if (mi_row - mib_size < tile->mi_row_start) {
+    ref_dv->as_fullmv.row = 0;
+    ref_dv->as_fullmv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;
+  } else {
+    ref_dv->as_fullmv.row = -MI_SIZE * mib_size;
+    ref_dv->as_fullmv.col = 0;
+  }
+  convert_fullmv_to_mv(ref_dv);
+}
+
+static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
+                                  const MACROBLOCKD *xd, int mi_row, int mi_col,
+                                  BLOCK_SIZE bsize, int mib_size_log2) {
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int SCALE_PX_TO_MV = 8;
+  // Disallow subpixel for now
+  // SUBPEL_MASK is not the correct scale
+  if (((dv.row & (SCALE_PX_TO_MV - 1)) || (dv.col & (SCALE_PX_TO_MV - 1))))
+    return 0;
+
+  const TileInfo *const tile = &xd->tile;
+  // Is the source top-left inside the current tile?
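+  // (All edge positions below are in 1/8-pel units: mi_row * MI_SIZE converts
+  // MI units to pixels, and SCALE_PX_TO_MV converts pixels to 1/8-pel.)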
+ const int src_top_edge = mi_row * MI_SIZE * SCALE_PX_TO_MV + dv.row; + const int tile_top_edge = tile->mi_row_start * MI_SIZE * SCALE_PX_TO_MV; + if (src_top_edge < tile_top_edge) return 0; + const int src_left_edge = mi_col * MI_SIZE * SCALE_PX_TO_MV + dv.col; + const int tile_left_edge = tile->mi_col_start * MI_SIZE * SCALE_PX_TO_MV; + if (src_left_edge < tile_left_edge) return 0; + // Is the bottom right inside the current tile? + const int src_bottom_edge = (mi_row * MI_SIZE + bh) * SCALE_PX_TO_MV + dv.row; + const int tile_bottom_edge = tile->mi_row_end * MI_SIZE * SCALE_PX_TO_MV; + if (src_bottom_edge > tile_bottom_edge) return 0; + const int src_right_edge = (mi_col * MI_SIZE + bw) * SCALE_PX_TO_MV + dv.col; + const int tile_right_edge = tile->mi_col_end * MI_SIZE * SCALE_PX_TO_MV; + if (src_right_edge > tile_right_edge) return 0; + + // Special case for sub 8x8 chroma cases, to prevent referring to chroma + // pixels outside current tile. + if (xd->is_chroma_ref && av1_num_planes(cm) > 1) { + const struct macroblockd_plane *const pd = &xd->plane[1]; + if (bw < 8 && pd->subsampling_x) + if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0; + if (bh < 8 && pd->subsampling_y) + if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0; + } + + // Is the bottom right within an already coded SB? Also consider additional + // constraints to facilitate HW decoder. + const int max_mib_size = 1 << mib_size_log2; + const int active_sb_row = mi_row >> mib_size_log2; + const int active_sb64_col = (mi_col * MI_SIZE) >> 6; + const int sb_size = max_mib_size * MI_SIZE; + const int src_sb_row = ((src_bottom_edge >> 3) - 1) / sb_size; + const int src_sb64_col = ((src_right_edge >> 3) - 1) >> 6; + const int total_sb64_per_row = + ((tile->mi_col_end - tile->mi_col_start - 1) >> 4) + 1; + const int active_sb64 = active_sb_row * total_sb64_per_row + active_sb64_col; + const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col; + if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0; + + // Wavefront constraint: use only top left area of frame for reference. + const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64); + const int wf_offset = gradient * (active_sb_row - src_sb_row); + if (src_sb_row > active_sb_row || + src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset) + return 0; + + return 1; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_MVREF_COMMON_H_ diff --git a/libs/libaom/src/av1/common/obmc.h b/libs/libaom/src/av1/common/obmc.h new file mode 100644 index 000000000..cc97b6bb1 --- /dev/null +++ b/libs/libaom/src/av1/common/obmc.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_OBMC_H_ +#define AOM_AV1_COMMON_OBMC_H_ + +typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_row, + int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *nb_mi, + void *fun_ctxt, const int num_planes); + +static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm, + MACROBLOCKD *xd, int nb_max, + overlappable_nb_visitor_t fun, + void *fun_ctxt) { + if (!xd->up_available) return; + + const int num_planes = av1_num_planes(cm); + int nb_count = 0; + const int mi_col = xd->mi_col; + // prev_row_mi points into the mi array, starting at the beginning of the + // previous row. + MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride; + const int end_col = AOMMIN(mi_col + xd->width, cm->mi_params.mi_cols); + uint8_t mi_step; + for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max; + above_mi_col += mi_step) { + MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col; + mi_step = + AOMMIN(mi_size_wide[above_mi[0]->sb_type], mi_size_wide[BLOCK_64X64]); + // If we're considering a block with width 4, it should be treated as + // half of a pair of blocks with chroma information in the second. Move + // above_mi_col back to the start of the pair if needed, set above_mbmi + // to point at the block with chroma information, and set mi_step to 2 to + // step over the entire pair at the end of the iteration. + if (mi_step == 1) { + above_mi_col &= ~1; + above_mi = prev_row_mi + above_mi_col + 1; + mi_step = 2; + } + if (is_neighbor_overlappable(*above_mi)) { + ++nb_count; + fun(xd, 0, above_mi_col - mi_col, AOMMIN(xd->width, mi_step), 0, + *above_mi, fun_ctxt, num_planes); + } + } +} + +static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm, + MACROBLOCKD *xd, int nb_max, + overlappable_nb_visitor_t fun, + void *fun_ctxt) { + if (!xd->left_available) return; + + const int num_planes = av1_num_planes(cm); + int nb_count = 0; + // prev_col_mi points into the mi array, starting at the top of the + // previous column + const int mi_row = xd->mi_row; + MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride; + const int end_row = AOMMIN(mi_row + xd->height, cm->mi_params.mi_rows); + uint8_t mi_step; + for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max; + left_mi_row += mi_step) { + MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride; + mi_step = + AOMMIN(mi_size_high[left_mi[0]->sb_type], mi_size_high[BLOCK_64X64]); + if (mi_step == 1) { + left_mi_row &= ~1; + left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride; + mi_step = 2; + } + if (is_neighbor_overlappable(*left_mi)) { + ++nb_count; + fun(xd, left_mi_row - mi_row, 0, AOMMIN(xd->height, mi_step), 1, *left_mi, + fun_ctxt, num_planes); + } + } +} + +#endif // AOM_AV1_COMMON_OBMC_H_ diff --git a/libs/libaom/src/av1/common/obu_util.c b/libs/libaom/src/av1/common/obu_util.c new file mode 100644 index 000000000..7d2694b89 --- /dev/null +++ b/libs/libaom/src/av1/common/obu_util.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "av1/common/obu_util.h"
+
+#include "aom_dsp/bitreader_buffer.h"
+
+// Returns 1 when OBU type is valid, and 0 otherwise.
+static int valid_obu_type(int obu_type) {
+  int valid_type = 0;
+  switch (obu_type) {
+    case OBU_SEQUENCE_HEADER:
+    case OBU_TEMPORAL_DELIMITER:
+    case OBU_FRAME_HEADER:
+    case OBU_TILE_GROUP:
+    case OBU_METADATA:
+    case OBU_FRAME:
+    case OBU_REDUNDANT_FRAME_HEADER:
+    case OBU_TILE_LIST:
+    case OBU_PADDING: valid_type = 1; break;
+    default: break;
+  }
+  return valid_type;
+}
+
+static aom_codec_err_t read_obu_size(const uint8_t *data,
+                                     size_t bytes_available,
+                                     size_t *const obu_size,
+                                     size_t *const length_field_size) {
+  uint64_t u_obu_size = 0;
+  if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) !=
+      0) {
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME;
+  *obu_size = (size_t)u_obu_size;
+  return AOM_CODEC_OK;
+}
+
+// Parses OBU header and stores values in 'header'.
+static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
+                                       int is_annexb, ObuHeader *header) {
+  if (!rb || !header) return AOM_CODEC_INVALID_PARAM;
+
+  const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer;
+  if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME;
+
+  header->size = 1;
+
+  if (aom_rb_read_bit(rb) != 0) {
+    // Forbidden bit. Must not be set.
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);
+
+  if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME;
+
+  header->has_extension = aom_rb_read_bit(rb);
+  header->has_size_field = aom_rb_read_bit(rb);
+
+  if (!header->has_size_field && !is_annexb) {
+    // Section 5 OBU streams must have the obu_size field set.
+    return AOM_CODEC_UNSUP_BITSTREAM;
+  }
+
+  if (aom_rb_read_bit(rb) != 0) {
+    // obu_reserved_1bit must be set to 0.
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  if (header->has_extension) {
+    if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;
+
+    header->size += 1;
+    header->temporal_layer_id = aom_rb_read_literal(rb, 3);
+    header->spatial_layer_id = aom_rb_read_literal(rb, 2);
+    if (aom_rb_read_literal(rb, 3) != 0) {
+      // extension_header_reserved_3bits must be set to 0.
+      return AOM_CODEC_CORRUPT_FRAME;
+    }
+  }
+
+  return AOM_CODEC_OK;
+}
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+                                    size_t *consumed, ObuHeader *header,
+                                    int is_annexb) {
+  if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM;
+
+  // TODO(tomfinegan): Set the error handler here and throughout this file, and
+  // confirm parsing work done via aom_read_bit_buffer is successful.
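+  // For reference, the wire layout consumed by read_obu_header() above is
+  // (restating the parsing code, not adding to it):
+  //   obu_forbidden_bit   f(1)  -- must be 0
+  //   obu_type            f(4)
+  //   obu_extension_flag  f(1)
+  //   obu_has_size_field  f(1)
+  //   obu_reserved_1bit   f(1)  -- must be 0
+  // followed, when obu_extension_flag is set, by one extension byte:
+  //   temporal_id f(3), spatial_id f(2), reserved f(3) -- reserved must be 0.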
+ struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL, + NULL }; + aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header); + if (parse_result == AOM_CODEC_OK) *consumed = header->size; + return parse_result; +} + +aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, + size_t bytes_available, + int is_annexb, + ObuHeader *obu_header, + size_t *const payload_size, + size_t *const bytes_read) { + size_t length_field_size_obu = 0; + size_t length_field_size_payload = 0; + size_t obu_size = 0; + aom_codec_err_t status; + + if (is_annexb) { + // Size field comes before the OBU header, and includes the OBU header + status = + read_obu_size(data, bytes_available, &obu_size, &length_field_size_obu); + + if (status != AOM_CODEC_OK) return status; + } + + struct aom_read_bit_buffer rb = { data + length_field_size_obu, + data + bytes_available, 0, NULL, NULL }; + + status = read_obu_header(&rb, is_annexb, obu_header); + if (status != AOM_CODEC_OK) return status; + + if (!obu_header->has_size_field) { + assert(is_annexb); + // Derive the payload size from the data we've already read + if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME; + + *payload_size = obu_size - obu_header->size; + } else { + // Size field comes after the OBU header, and is just the payload size + status = read_obu_size( + data + length_field_size_obu + obu_header->size, + bytes_available - length_field_size_obu - obu_header->size, + payload_size, &length_field_size_payload); + if (status != AOM_CODEC_OK) return status; + } + + *bytes_read = + length_field_size_obu + obu_header->size + length_field_size_payload; + return AOM_CODEC_OK; +} diff --git a/libs/libaom/src/av1/common/obu_util.h b/libs/libaom/src/av1/common/obu_util.h new file mode 100644 index 000000000..7c56904c8 --- /dev/null +++ b/libs/libaom/src/av1/common/obu_util.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_OBU_UTIL_H_ +#define AOM_AV1_COMMON_OBU_UTIL_H_ + +#include "aom/aom_codec.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + size_t size; // Size (1 or 2 bytes) of the OBU header (including the + // optional OBU extension header) in the bitstream. + OBU_TYPE type; + int has_size_field; + int has_extension; + // The following fields come from the OBU extension header and therefore are + // only used if has_extension is true. 
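+  // (read_obu_header() fills these from 3-bit and 2-bit fields, so the
+  // valid ranges are 0..7 and 0..3 respectively.)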
+ int temporal_layer_id; + int spatial_layer_id; +} ObuHeader; + +aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length, + size_t *consumed, ObuHeader *header, + int is_annexb); + +aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, + size_t bytes_available, + int is_annexb, + ObuHeader *obu_header, + size_t *const payload_size, + size_t *const bytes_read); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_OBU_UTIL_H_ diff --git a/libs/libaom/src/av1/common/odintrin.c b/libs/libaom/src/av1/common/odintrin.c new file mode 100644 index 000000000..7584b2e52 --- /dev/null +++ b/libs/libaom/src/av1/common/odintrin.c @@ -0,0 +1,541 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* clang-format off */ + +#include "av1/common/odintrin.h" + +/*Constants for use with OD_DIVU_SMALL(). + See \cite{Rob05} for details on computing these constants. + @INPROCEEDINGS{Rob05, + author="Arch D. Robison", + title="{N}-bit Unsigned Division via {N}-bit Multiply-Add", + booktitle="Proc. of the 17th IEEE Symposium on Computer Arithmetic + (ARITH'05)", + pages="131--139", + address="Cape Cod, MA", + month=Jun, + year=2005 + }*/ +uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2] = { + { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xAAAAAAAB, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xCCCCCCCD, 0 }, { 0xAAAAAAAB, 0 }, + { 0x92492492, 0x92492492 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xE38E38E4, 0 }, { 0xCCCCCCCD, 0 }, + { 0xBA2E8BA3, 0 }, { 0xAAAAAAAB, 0 }, + { 0x9D89D89E, 0 }, { 0x92492492, 0x92492492 }, + { 0x88888889, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xF0F0F0F1, 0 }, { 0xE38E38E4, 0 }, + { 0xD79435E5, 0xD79435E5 }, { 0xCCCCCCCD, 0 }, + { 0xC30C30C3, 0xC30C30C3 }, { 0xBA2E8BA3, 0 }, + { 0xB21642C9, 0 }, { 0xAAAAAAAB, 0 }, + { 0xA3D70A3E, 0 }, { 0x9D89D89E, 0 }, + { 0x97B425ED, 0x97B425ED }, { 0x92492492, 0x92492492 }, + { 0x8D3DCB09, 0 }, { 0x88888889, 0 }, + { 0x84210842, 0x84210842 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xF83E0F84, 0 }, { 0xF0F0F0F1, 0 }, + { 0xEA0EA0EA, 0xEA0EA0EA }, { 0xE38E38E4, 0 }, + { 0xDD67C8A6, 0xDD67C8A6 }, { 0xD79435E5, 0xD79435E5 }, + { 0xD20D20D2, 0xD20D20D2 }, { 0xCCCCCCCD, 0 }, + { 0xC7CE0C7D, 0 }, { 0xC30C30C3, 0xC30C30C3 }, + { 0xBE82FA0C, 0 }, { 0xBA2E8BA3, 0 }, + { 0xB60B60B6, 0xB60B60B6 }, { 0xB21642C9, 0 }, + { 0xAE4C415D, 0 }, { 0xAAAAAAAB, 0 }, + { 0xA72F053A, 0 }, { 0xA3D70A3E, 0 }, + { 0xA0A0A0A1, 0 }, { 0x9D89D89E, 0 }, + { 0x9A90E7D9, 0x9A90E7D9 }, { 0x97B425ED, 0x97B425ED }, + { 0x94F2094F, 0x94F2094F }, { 0x92492492, 0x92492492 }, + { 0x8FB823EE, 0x8FB823EE }, { 0x8D3DCB09, 0 }, + { 0x8AD8F2FC, 0 }, { 0x88888889, 0 }, + { 0x864B8A7E, 0 }, { 0x84210842, 0x84210842 }, + { 0x82082082, 0x82082082 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFC0FC0FD, 0 }, { 0xF83E0F84, 0 }, + { 0xF4898D60, 0 }, { 0xF0F0F0F1, 0 }, + { 0xED7303B6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, + { 0xE6C2B449, 0 }, { 0xE38E38E4, 0 }, + { 0xE070381C, 0xE070381C }, { 0xDD67C8A6, 0xDD67C8A6 }, + { 0xDA740DA8, 0 }, { 0xD79435E5, 0xD79435E5 }, + { 0xD4C77B04, 0 }, 
{ 0xD20D20D2, 0xD20D20D2 }, + { 0xCF6474A9, 0 }, { 0xCCCCCCCD, 0 }, + { 0xCA4587E7, 0 }, { 0xC7CE0C7D, 0 }, + { 0xC565C87C, 0 }, { 0xC30C30C3, 0xC30C30C3 }, + { 0xC0C0C0C1, 0 }, { 0xBE82FA0C, 0 }, + { 0xBC52640C, 0 }, { 0xBA2E8BA3, 0 }, + { 0xB81702E1, 0 }, { 0xB60B60B6, 0xB60B60B6 }, + { 0xB40B40B4, 0xB40B40B4 }, { 0xB21642C9, 0 }, + { 0xB02C0B03, 0 }, { 0xAE4C415D, 0 }, + { 0xAC769184, 0xAC769184 }, { 0xAAAAAAAB, 0 }, + { 0xA8E83F57, 0xA8E83F57 }, { 0xA72F053A, 0 }, + { 0xA57EB503, 0 }, { 0xA3D70A3E, 0 }, + { 0xA237C32B, 0xA237C32B }, { 0xA0A0A0A1, 0 }, + { 0x9F1165E7, 0x9F1165E7 }, { 0x9D89D89E, 0 }, + { 0x9C09C09C, 0x9C09C09C }, { 0x9A90E7D9, 0x9A90E7D9 }, + { 0x991F1A51, 0x991F1A51 }, { 0x97B425ED, 0x97B425ED }, + { 0x964FDA6C, 0x964FDA6C }, { 0x94F2094F, 0x94F2094F }, + { 0x939A85C4, 0x939A85C4 }, { 0x92492492, 0x92492492 }, + { 0x90FDBC09, 0x90FDBC09 }, { 0x8FB823EE, 0x8FB823EE }, + { 0x8E78356D, 0x8E78356D }, { 0x8D3DCB09, 0 }, + { 0x8C08C08C, 0x8C08C08C }, { 0x8AD8F2FC, 0 }, + { 0x89AE408A, 0 }, { 0x88888889, 0 }, + { 0x8767AB5F, 0x8767AB5F }, { 0x864B8A7E, 0 }, + { 0x85340853, 0x85340853 }, { 0x84210842, 0x84210842 }, + { 0x83126E98, 0 }, { 0x82082082, 0x82082082 }, + { 0x81020408, 0x81020408 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFE03F810, 0 }, { 0xFC0FC0FD, 0 }, + { 0xFA232CF3, 0 }, { 0xF83E0F84, 0 }, + { 0xF6603D99, 0 }, { 0xF4898D60, 0 }, + { 0xF2B9D649, 0 }, { 0xF0F0F0F1, 0 }, + { 0xEF2EB720, 0 }, { 0xED7303B6, 0 }, + { 0xEBBDB2A6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, + { 0xE865AC7C, 0 }, { 0xE6C2B449, 0 }, + { 0xE525982B, 0 }, { 0xE38E38E4, 0 }, + { 0xE1FC780F, 0 }, { 0xE070381C, 0xE070381C }, + { 0xDEE95C4D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 }, + { 0xDBEB61EF, 0 }, { 0xDA740DA8, 0 }, + { 0xD901B204, 0 }, { 0xD79435E5, 0xD79435E5 }, + { 0xD62B80D7, 0 }, { 0xD4C77B04, 0 }, + { 0xD3680D37, 0 }, { 0xD20D20D2, 0xD20D20D2 }, + { 0xD0B69FCC, 0 }, { 0xCF6474A9, 0 }, + { 0xCE168A77, 0xCE168A77 }, { 0xCCCCCCCD, 0 }, + { 0xCB8727C1, 0 }, { 0xCA4587E7, 0 }, + { 0xC907DA4F, 0 }, { 0xC7CE0C7D, 0 }, + { 0xC6980C6A, 0 }, { 0xC565C87C, 0 }, + { 0xC4372F86, 0 }, { 0xC30C30C3, 0xC30C30C3 }, + { 0xC1E4BBD6, 0 }, { 0xC0C0C0C1, 0 }, + { 0xBFA02FE8, 0xBFA02FE8 }, { 0xBE82FA0C, 0 }, + { 0xBD691047, 0xBD691047 }, { 0xBC52640C, 0 }, + { 0xBB3EE722, 0 }, { 0xBA2E8BA3, 0 }, + { 0xB92143FA, 0xB92143FA }, { 0xB81702E1, 0 }, + { 0xB70FBB5A, 0xB70FBB5A }, { 0xB60B60B6, 0xB60B60B6 }, + { 0xB509E68B, 0 }, { 0xB40B40B4, 0xB40B40B4 }, + { 0xB30F6353, 0 }, { 0xB21642C9, 0 }, + { 0xB11FD3B8, 0xB11FD3B8 }, { 0xB02C0B03, 0 }, + { 0xAF3ADDC7, 0 }, { 0xAE4C415D, 0 }, + { 0xAD602B58, 0xAD602B58 }, { 0xAC769184, 0xAC769184 }, + { 0xAB8F69E3, 0 }, { 0xAAAAAAAB, 0 }, + { 0xA9C84A48, 0 }, { 0xA8E83F57, 0xA8E83F57 }, + { 0xA80A80A8, 0xA80A80A8 }, { 0xA72F053A, 0 }, + { 0xA655C439, 0xA655C439 }, { 0xA57EB503, 0 }, + { 0xA4A9CF1E, 0 }, { 0xA3D70A3E, 0 }, + { 0xA3065E40, 0 }, { 0xA237C32B, 0xA237C32B }, + { 0xA16B312F, 0 }, { 0xA0A0A0A1, 0 }, + { 0x9FD809FE, 0 }, { 0x9F1165E7, 0x9F1165E7 }, + { 0x9E4CAD24, 0 }, { 0x9D89D89E, 0 }, + { 0x9CC8E161, 0 }, { 0x9C09C09C, 0x9C09C09C }, + { 0x9B4C6F9F, 0 }, { 0x9A90E7D9, 0x9A90E7D9 }, + { 0x99D722DB, 0 }, { 0x991F1A51, 0x991F1A51 }, + { 0x9868C80A, 0 }, { 0x97B425ED, 0x97B425ED }, + { 0x97012E02, 0x97012E02 }, { 0x964FDA6C, 0x964FDA6C }, + { 0x95A02568, 0x95A02568 }, { 0x94F2094F, 0x94F2094F }, + { 0x94458094, 0x94458094 }, { 0x939A85C4, 0x939A85C4 }, + { 0x92F11384, 0x92F11384 }, { 0x92492492, 0x92492492 }, + { 0x91A2B3C5, 0 }, { 0x90FDBC09, 0x90FDBC09 }, + { 0x905A3863, 0x905A3863 }, { 
0x8FB823EE, 0x8FB823EE }, + { 0x8F1779DA, 0 }, { 0x8E78356D, 0x8E78356D }, + { 0x8DDA5202, 0x8DDA5202 }, { 0x8D3DCB09, 0 }, + { 0x8CA29C04, 0x8CA29C04 }, { 0x8C08C08C, 0x8C08C08C }, + { 0x8B70344A, 0x8B70344A }, { 0x8AD8F2FC, 0 }, + { 0x8A42F870, 0x8A42F870 }, { 0x89AE408A, 0 }, + { 0x891AC73B, 0 }, { 0x88888889, 0 }, + { 0x87F78088, 0 }, { 0x8767AB5F, 0x8767AB5F }, + { 0x86D90545, 0 }, { 0x864B8A7E, 0 }, + { 0x85BF3761, 0x85BF3761 }, { 0x85340853, 0x85340853 }, + { 0x84A9F9C8, 0x84A9F9C8 }, { 0x84210842, 0x84210842 }, + { 0x83993052, 0x83993052 }, { 0x83126E98, 0 }, + { 0x828CBFBF, 0 }, { 0x82082082, 0x82082082 }, + { 0x81848DA9, 0 }, { 0x81020408, 0x81020408 }, + { 0x80808081, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFF00FF01, 0 }, { 0xFE03F810, 0 }, + { 0xFD08E551, 0 }, { 0xFC0FC0FD, 0 }, + { 0xFB188566, 0 }, { 0xFA232CF3, 0 }, + { 0xF92FB222, 0 }, { 0xF83E0F84, 0 }, + { 0xF74E3FC3, 0 }, { 0xF6603D99, 0 }, + { 0xF57403D6, 0 }, { 0xF4898D60, 0 }, + { 0xF3A0D52D, 0 }, { 0xF2B9D649, 0 }, + { 0xF1D48BCF, 0 }, { 0xF0F0F0F1, 0 }, + { 0xF00F00F0, 0xF00F00F0 }, { 0xEF2EB720, 0 }, + { 0xEE500EE5, 0xEE500EE5 }, { 0xED7303B6, 0 }, + { 0xEC979119, 0 }, { 0xEBBDB2A6, 0 }, + { 0xEAE56404, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, + { 0xE9396520, 0 }, { 0xE865AC7C, 0 }, + { 0xE79372E3, 0 }, { 0xE6C2B449, 0 }, + { 0xE5F36CB0, 0xE5F36CB0 }, { 0xE525982B, 0 }, + { 0xE45932D8, 0 }, { 0xE38E38E4, 0 }, + { 0xE2C4A689, 0 }, { 0xE1FC780F, 0 }, + { 0xE135A9CA, 0 }, { 0xE070381C, 0xE070381C }, + { 0xDFAC1F75, 0 }, { 0xDEE95C4D, 0 }, + { 0xDE27EB2D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 }, + { 0xDCA8F159, 0 }, { 0xDBEB61EF, 0 }, + { 0xDB2F171E, 0 }, { 0xDA740DA8, 0 }, + { 0xD9BA4257, 0 }, { 0xD901B204, 0 }, + { 0xD84A598F, 0 }, { 0xD79435E5, 0xD79435E5 }, + { 0xD6DF43FD, 0 }, { 0xD62B80D7, 0 }, + { 0xD578E97D, 0 }, { 0xD4C77B04, 0 }, + { 0xD417328A, 0 }, { 0xD3680D37, 0 }, + { 0xD2BA083C, 0 }, { 0xD20D20D2, 0xD20D20D2 }, + { 0xD161543E, 0xD161543E }, { 0xD0B69FCC, 0 }, + { 0xD00D00D0, 0xD00D00D0 }, { 0xCF6474A9, 0 }, + { 0xCEBCF8BC, 0 }, { 0xCE168A77, 0xCE168A77 }, + { 0xCD712753, 0 }, { 0xCCCCCCCD, 0 }, + { 0xCC29786D, 0 }, { 0xCB8727C1, 0 }, + { 0xCAE5D85F, 0xCAE5D85F }, { 0xCA4587E7, 0 }, + { 0xC9A633FD, 0 }, { 0xC907DA4F, 0 }, + { 0xC86A7890, 0xC86A7890 }, { 0xC7CE0C7D, 0 }, + { 0xC73293D8, 0 }, { 0xC6980C6A, 0 }, + { 0xC5FE7403, 0xC5FE7403 }, { 0xC565C87C, 0 }, + { 0xC4CE07B0, 0xC4CE07B0 }, { 0xC4372F86, 0 }, + { 0xC3A13DE6, 0xC3A13DE6 }, { 0xC30C30C3, 0xC30C30C3 }, + { 0xC2780614, 0 }, { 0xC1E4BBD6, 0 }, + { 0xC152500C, 0xC152500C }, { 0xC0C0C0C1, 0 }, + { 0xC0300C03, 0xC0300C03 }, { 0xBFA02FE8, 0xBFA02FE8 }, + { 0xBF112A8B, 0 }, { 0xBE82FA0C, 0 }, + { 0xBDF59C92, 0 }, { 0xBD691047, 0xBD691047 }, + { 0xBCDD535E, 0 }, { 0xBC52640C, 0 }, + { 0xBBC8408D, 0 }, { 0xBB3EE722, 0 }, + { 0xBAB65610, 0xBAB65610 }, { 0xBA2E8BA3, 0 }, + { 0xB9A7862A, 0xB9A7862A }, { 0xB92143FA, 0xB92143FA }, + { 0xB89BC36D, 0 }, { 0xB81702E1, 0 }, + { 0xB79300B8, 0 }, { 0xB70FBB5A, 0xB70FBB5A }, + { 0xB68D3134, 0xB68D3134 }, { 0xB60B60B6, 0xB60B60B6 }, + { 0xB58A4855, 0xB58A4855 }, { 0xB509E68B, 0 }, + { 0xB48A39D4, 0xB48A39D4 }, { 0xB40B40B4, 0xB40B40B4 }, + { 0xB38CF9B0, 0xB38CF9B0 }, { 0xB30F6353, 0 }, + { 0xB2927C2A, 0 }, { 0xB21642C9, 0 }, + { 0xB19AB5C5, 0 }, { 0xB11FD3B8, 0xB11FD3B8 }, + { 0xB0A59B42, 0 }, { 0xB02C0B03, 0 }, + { 0xAFB321A1, 0xAFB321A1 }, { 0xAF3ADDC7, 0 }, + { 0xAEC33E20, 0 }, { 0xAE4C415D, 0 }, + { 0xADD5E632, 0xADD5E632 }, { 0xAD602B58, 0xAD602B58 }, + { 0xACEB0F89, 0xACEB0F89 }, { 0xAC769184, 0xAC769184 }, + { 0xAC02B00B, 0 }, 
{ 0xAB8F69E3, 0 }, + { 0xAB1CBDD4, 0 }, { 0xAAAAAAAB, 0 }, + { 0xAA392F36, 0 }, { 0xA9C84A48, 0 }, + { 0xA957FAB5, 0xA957FAB5 }, { 0xA8E83F57, 0xA8E83F57 }, + { 0xA8791709, 0 }, { 0xA80A80A8, 0xA80A80A8 }, + { 0xA79C7B17, 0 }, { 0xA72F053A, 0 }, + { 0xA6C21DF7, 0 }, { 0xA655C439, 0xA655C439 }, + { 0xA5E9F6ED, 0xA5E9F6ED }, { 0xA57EB503, 0 }, + { 0xA513FD6C, 0 }, { 0xA4A9CF1E, 0 }, + { 0xA4402910, 0xA4402910 }, { 0xA3D70A3E, 0 }, + { 0xA36E71A3, 0 }, { 0xA3065E40, 0 }, + { 0xA29ECF16, 0xA29ECF16 }, { 0xA237C32B, 0xA237C32B }, + { 0xA1D13986, 0 }, { 0xA16B312F, 0 }, + { 0xA105A933, 0 }, { 0xA0A0A0A1, 0 }, + { 0xA03C1689, 0 }, { 0x9FD809FE, 0 }, + { 0x9F747A15, 0x9F747A15 }, { 0x9F1165E7, 0x9F1165E7 }, + { 0x9EAECC8D, 0x9EAECC8D }, { 0x9E4CAD24, 0 }, + { 0x9DEB06C9, 0x9DEB06C9 }, { 0x9D89D89E, 0 }, + { 0x9D2921C4, 0 }, { 0x9CC8E161, 0 }, + { 0x9C69169B, 0x9C69169B }, { 0x9C09C09C, 0x9C09C09C }, + { 0x9BAADE8E, 0x9BAADE8E }, { 0x9B4C6F9F, 0 }, + { 0x9AEE72FD, 0 }, { 0x9A90E7D9, 0x9A90E7D9 }, + { 0x9A33CD67, 0x9A33CD67 }, { 0x99D722DB, 0 }, + { 0x997AE76B, 0x997AE76B }, { 0x991F1A51, 0x991F1A51 }, + { 0x98C3BAC7, 0x98C3BAC7 }, { 0x9868C80A, 0 }, + { 0x980E4156, 0x980E4156 }, { 0x97B425ED, 0x97B425ED }, + { 0x975A7510, 0 }, { 0x97012E02, 0x97012E02 }, + { 0x96A8500A, 0 }, { 0x964FDA6C, 0x964FDA6C }, + { 0x95F7CC73, 0 }, { 0x95A02568, 0x95A02568 }, + { 0x9548E498, 0 }, { 0x94F2094F, 0x94F2094F }, + { 0x949B92DE, 0 }, { 0x94458094, 0x94458094 }, + { 0x93EFD1C5, 0x93EFD1C5 }, { 0x939A85C4, 0x939A85C4 }, + { 0x93459BE7, 0 }, { 0x92F11384, 0x92F11384 }, + { 0x929CEBF5, 0 }, { 0x92492492, 0x92492492 }, + { 0x91F5BCB9, 0 }, { 0x91A2B3C5, 0 }, + { 0x91500915, 0x91500915 }, { 0x90FDBC09, 0x90FDBC09 }, + { 0x90ABCC02, 0x90ABCC02 }, { 0x905A3863, 0x905A3863 }, + { 0x90090090, 0x90090090 }, { 0x8FB823EE, 0x8FB823EE }, + { 0x8F67A1E4, 0 }, { 0x8F1779DA, 0 }, + { 0x8EC7AB3A, 0 }, { 0x8E78356D, 0x8E78356D }, + { 0x8E2917E1, 0 }, { 0x8DDA5202, 0x8DDA5202 }, + { 0x8D8BE340, 0 }, { 0x8D3DCB09, 0 }, + { 0x8CF008CF, 0x8CF008CF }, { 0x8CA29C04, 0x8CA29C04 }, + { 0x8C55841D, 0 }, { 0x8C08C08C, 0x8C08C08C }, + { 0x8BBC50C9, 0 }, { 0x8B70344A, 0x8B70344A }, + { 0x8B246A88, 0 }, { 0x8AD8F2FC, 0 }, + { 0x8A8DCD20, 0 }, { 0x8A42F870, 0x8A42F870 }, + { 0x89F8746A, 0 }, { 0x89AE408A, 0 }, + { 0x89645C4F, 0x89645C4F }, { 0x891AC73B, 0 }, + { 0x88D180CD, 0x88D180CD }, { 0x88888889, 0 }, + { 0x883FDDF0, 0x883FDDF0 }, { 0x87F78088, 0 }, + { 0x87AF6FD6, 0 }, { 0x8767AB5F, 0x8767AB5F }, + { 0x872032AC, 0x872032AC }, { 0x86D90545, 0 }, + { 0x869222B2, 0 }, { 0x864B8A7E, 0 }, + { 0x86053C34, 0x86053C34 }, { 0x85BF3761, 0x85BF3761 }, + { 0x85797B91, 0x85797B91 }, { 0x85340853, 0x85340853 }, + { 0x84EEDD36, 0 }, { 0x84A9F9C8, 0x84A9F9C8 }, + { 0x84655D9C, 0 }, { 0x84210842, 0x84210842 }, + { 0x83DCF94E, 0 }, { 0x83993052, 0x83993052 }, + { 0x8355ACE4, 0 }, { 0x83126E98, 0 }, + { 0x82CF7504, 0 }, { 0x828CBFBF, 0 }, + { 0x824A4E61, 0 }, { 0x82082082, 0x82082082 }, + { 0x81C635BC, 0x81C635BC }, { 0x81848DA9, 0 }, + { 0x814327E4, 0 }, { 0x81020408, 0x81020408 }, + { 0x80C121B3, 0 }, { 0x80808081, 0 }, + { 0x80402010, 0x80402010 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFF803FE1, 0 }, { 0xFF00FF01, 0 }, + { 0xFE823CA6, 0 }, { 0xFE03F810, 0 }, + { 0xFD863087, 0 }, { 0xFD08E551, 0 }, + { 0xFC8C15B5, 0 }, { 0xFC0FC0FD, 0 }, + { 0xFB93E673, 0 }, { 0xFB188566, 0 }, + { 0xFA9D9D20, 0 }, { 0xFA232CF3, 0 }, + { 0xF9A9342D, 0 }, { 0xF92FB222, 0 }, + { 0xF8B6A622, 0xF8B6A622 }, { 0xF83E0F84, 0 }, + { 0xF7C5ED9D, 0 }, { 0xF74E3FC3, 0 }, + { 0xF6D7054E, 0 
}, { 0xF6603D99, 0 }, + { 0xF5E9E7FD, 0 }, { 0xF57403D6, 0 }, + { 0xF4FE9083, 0 }, { 0xF4898D60, 0 }, + { 0xF414F9CE, 0 }, { 0xF3A0D52D, 0 }, + { 0xF32D1EE0, 0 }, { 0xF2B9D649, 0 }, + { 0xF246FACC, 0 }, { 0xF1D48BCF, 0 }, + { 0xF16288B9, 0 }, { 0xF0F0F0F1, 0 }, + { 0xF07FC3E0, 0xF07FC3E0 }, { 0xF00F00F0, 0xF00F00F0 }, + { 0xEF9EA78C, 0 }, { 0xEF2EB720, 0 }, + { 0xEEBF2F19, 0 }, { 0xEE500EE5, 0xEE500EE5 }, + { 0xEDE155F4, 0 }, { 0xED7303B6, 0 }, + { 0xED05179C, 0xED05179C }, { 0xEC979119, 0 }, + { 0xEC2A6FA0, 0xEC2A6FA0 }, { 0xEBBDB2A6, 0 }, + { 0xEB5159A0, 0 }, { 0xEAE56404, 0 }, + { 0xEA79D14A, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, + { 0xE9A3D25E, 0xE9A3D25E }, { 0xE9396520, 0 }, + { 0xE8CF58AB, 0 }, { 0xE865AC7C, 0 }, + { 0xE7FC600F, 0 }, { 0xE79372E3, 0 }, + { 0xE72AE476, 0 }, { 0xE6C2B449, 0 }, + { 0xE65AE1DC, 0 }, { 0xE5F36CB0, 0xE5F36CB0 }, + { 0xE58C544A, 0 }, { 0xE525982B, 0 }, + { 0xE4BF37D9, 0 }, { 0xE45932D8, 0 }, + { 0xE3F388AF, 0 }, { 0xE38E38E4, 0 }, + { 0xE32942FF, 0 }, { 0xE2C4A689, 0 }, + { 0xE260630B, 0 }, { 0xE1FC780F, 0 }, + { 0xE198E520, 0 }, { 0xE135A9CA, 0 }, + { 0xE0D2C59A, 0 }, { 0xE070381C, 0xE070381C }, + { 0xE00E00E0, 0xE00E00E0 }, { 0xDFAC1F75, 0 }, + { 0xDF4A9369, 0 }, { 0xDEE95C4D, 0 }, + { 0xDE8879B3, 0 }, { 0xDE27EB2D, 0 }, + { 0xDDC7B04D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 }, + { 0xDD0833CE, 0 }, { 0xDCA8F159, 0 }, + { 0xDC4A00DD, 0 }, { 0xDBEB61EF, 0 }, + { 0xDB8D1428, 0 }, { 0xDB2F171E, 0 }, + { 0xDAD16A6B, 0 }, { 0xDA740DA8, 0 }, + { 0xDA17006D, 0xDA17006D }, { 0xD9BA4257, 0 }, + { 0xD95DD300, 0 }, { 0xD901B204, 0 }, + { 0xD8A5DEFF, 0 }, { 0xD84A598F, 0 }, + { 0xD7EF2152, 0 }, { 0xD79435E5, 0xD79435E5 }, + { 0xD73996E9, 0 }, { 0xD6DF43FD, 0 }, + { 0xD6853CC1, 0 }, { 0xD62B80D7, 0 }, + { 0xD5D20FDF, 0 }, { 0xD578E97D, 0 }, + { 0xD5200D52, 0xD5200D52 }, { 0xD4C77B04, 0 }, + { 0xD46F3235, 0 }, { 0xD417328A, 0 }, + { 0xD3BF7BA9, 0 }, { 0xD3680D37, 0 }, + { 0xD310E6DB, 0 }, { 0xD2BA083C, 0 }, + { 0xD2637101, 0 }, { 0xD20D20D2, 0xD20D20D2 }, + { 0xD1B71759, 0 }, { 0xD161543E, 0xD161543E }, + { 0xD10BD72C, 0 }, { 0xD0B69FCC, 0 }, + { 0xD061ADCA, 0 }, { 0xD00D00D0, 0xD00D00D0 }, + { 0xCFB8988C, 0 }, { 0xCF6474A9, 0 }, + { 0xCF1094D4, 0 }, { 0xCEBCF8BC, 0 }, + { 0xCE69A00D, 0 }, { 0xCE168A77, 0xCE168A77 }, + { 0xCDC3B7A9, 0xCDC3B7A9 }, { 0xCD712753, 0 }, + { 0xCD1ED924, 0 }, { 0xCCCCCCCD, 0 }, + { 0xCC7B0200, 0 }, { 0xCC29786D, 0 }, + { 0xCBD82FC7, 0 }, { 0xCB8727C1, 0 }, + { 0xCB36600D, 0 }, { 0xCAE5D85F, 0xCAE5D85F }, + { 0xCA95906C, 0 }, { 0xCA4587E7, 0 }, + { 0xC9F5BE86, 0 }, { 0xC9A633FD, 0 }, + { 0xC956E803, 0xC956E803 }, { 0xC907DA4F, 0 }, + { 0xC8B90A96, 0 }, { 0xC86A7890, 0xC86A7890 }, + { 0xC81C23F5, 0xC81C23F5 }, { 0xC7CE0C7D, 0 }, + { 0xC78031E0, 0xC78031E0 }, { 0xC73293D8, 0 }, + { 0xC6E5321D, 0 }, { 0xC6980C6A, 0 }, + { 0xC64B2278, 0xC64B2278 }, { 0xC5FE7403, 0xC5FE7403 }, + { 0xC5B200C6, 0 }, { 0xC565C87C, 0 }, + { 0xC519CAE0, 0xC519CAE0 }, { 0xC4CE07B0, 0xC4CE07B0 }, + { 0xC4827EA8, 0xC4827EA8 }, { 0xC4372F86, 0 }, + { 0xC3EC1A06, 0 }, { 0xC3A13DE6, 0xC3A13DE6 }, + { 0xC3569AE6, 0 }, { 0xC30C30C3, 0xC30C30C3 }, + { 0xC2C1FF3E, 0 }, { 0xC2780614, 0 }, + { 0xC22E4507, 0 }, { 0xC1E4BBD6, 0 }, + { 0xC19B6A42, 0 }, { 0xC152500C, 0xC152500C }, + { 0xC1096CF6, 0 }, { 0xC0C0C0C1, 0 }, + { 0xC0784B2F, 0 }, { 0xC0300C03, 0xC0300C03 }, + { 0xBFE80300, 0 }, { 0xBFA02FE8, 0xBFA02FE8 }, + { 0xBF589280, 0 }, { 0xBF112A8B, 0 }, + { 0xBEC9F7CE, 0 }, { 0xBE82FA0C, 0 }, + { 0xBE3C310C, 0 }, { 0xBDF59C92, 0 }, + { 0xBDAF3C64, 0 }, { 0xBD691047, 0xBD691047 }, + { 0xBD231803, 0 
}, { 0xBCDD535E, 0 }, + { 0xBC97C21E, 0xBC97C21E }, { 0xBC52640C, 0 }, + { 0xBC0D38EE, 0xBC0D38EE }, { 0xBBC8408D, 0 }, + { 0xBB837AB1, 0 }, { 0xBB3EE722, 0 }, + { 0xBAFA85A9, 0xBAFA85A9 }, { 0xBAB65610, 0xBAB65610 }, + { 0xBA725820, 0xBA725820 }, { 0xBA2E8BA3, 0 }, + { 0xB9EAF063, 0 }, { 0xB9A7862A, 0xB9A7862A }, + { 0xB9644CC4, 0 }, { 0xB92143FA, 0xB92143FA }, + { 0xB8DE6B9A, 0 }, { 0xB89BC36D, 0 }, + { 0xB8594B41, 0 }, { 0xB81702E1, 0 }, + { 0xB7D4EA19, 0xB7D4EA19 }, { 0xB79300B8, 0 }, + { 0xB7514689, 0 }, { 0xB70FBB5A, 0xB70FBB5A }, + { 0xB6CE5EF9, 0xB6CE5EF9 }, { 0xB68D3134, 0xB68D3134 }, + { 0xB64C31D9, 0 }, { 0xB60B60B6, 0xB60B60B6 }, + { 0xB5CABD9B, 0 }, { 0xB58A4855, 0xB58A4855 }, + { 0xB54A00B5, 0xB54A00B5 }, { 0xB509E68B, 0 }, + { 0xB4C9F9A5, 0 }, { 0xB48A39D4, 0xB48A39D4 }, + { 0xB44AA6E9, 0xB44AA6E9 }, { 0xB40B40B4, 0xB40B40B4 }, + { 0xB3CC0706, 0 }, { 0xB38CF9B0, 0xB38CF9B0 }, + { 0xB34E1884, 0 }, { 0xB30F6353, 0 }, + { 0xB2D0D9EF, 0 }, { 0xB2927C2A, 0 }, + { 0xB25449D7, 0 }, { 0xB21642C9, 0 }, + { 0xB1D866D1, 0xB1D866D1 }, { 0xB19AB5C5, 0 }, + { 0xB15D2F76, 0 }, { 0xB11FD3B8, 0xB11FD3B8 }, + { 0xB0E2A260, 0xB0E2A260 }, { 0xB0A59B42, 0 }, + { 0xB068BE31, 0 }, { 0xB02C0B03, 0 }, + { 0xAFEF818C, 0 }, { 0xAFB321A1, 0xAFB321A1 }, + { 0xAF76EB19, 0 }, { 0xAF3ADDC7, 0 }, + { 0xAEFEF982, 0 }, { 0xAEC33E20, 0 }, + { 0xAE87AB76, 0xAE87AB76 }, { 0xAE4C415D, 0 }, + { 0xAE10FFA9, 0 }, { 0xADD5E632, 0xADD5E632 }, + { 0xAD9AF4D0, 0 }, { 0xAD602B58, 0xAD602B58 }, + { 0xAD2589A4, 0 }, { 0xACEB0F89, 0xACEB0F89 }, + { 0xACB0BCE1, 0xACB0BCE1 }, { 0xAC769184, 0xAC769184 }, + { 0xAC3C8D4A, 0 }, { 0xAC02B00B, 0 }, + { 0xABC8F9A0, 0xABC8F9A0 }, { 0xAB8F69E3, 0 }, + { 0xAB5600AC, 0 }, { 0xAB1CBDD4, 0 }, + { 0xAAE3A136, 0 }, { 0xAAAAAAAB, 0 }, + { 0xAA71DA0D, 0 }, { 0xAA392F36, 0 }, + { 0xAA00AA01, 0 }, { 0xA9C84A48, 0 }, + { 0xA9900FE6, 0 }, { 0xA957FAB5, 0xA957FAB5 }, + { 0xA9200A92, 0xA9200A92 }, { 0xA8E83F57, 0xA8E83F57 }, + { 0xA8B098E0, 0xA8B098E0 }, { 0xA8791709, 0 }, + { 0xA841B9AD, 0 }, { 0xA80A80A8, 0xA80A80A8 }, + { 0xA7D36BD8, 0 }, { 0xA79C7B17, 0 }, + { 0xA765AE44, 0 }, { 0xA72F053A, 0 }, + { 0xA6F87FD6, 0xA6F87FD6 }, { 0xA6C21DF7, 0 }, + { 0xA68BDF79, 0 }, { 0xA655C439, 0xA655C439 }, + { 0xA61FCC16, 0xA61FCC16 }, { 0xA5E9F6ED, 0xA5E9F6ED }, + { 0xA5B4449D, 0 }, { 0xA57EB503, 0 }, + { 0xA54947FE, 0 }, { 0xA513FD6C, 0 }, + { 0xA4DED52C, 0xA4DED52C }, { 0xA4A9CF1E, 0 }, + { 0xA474EB1F, 0xA474EB1F }, { 0xA4402910, 0xA4402910 }, + { 0xA40B88D0, 0 }, { 0xA3D70A3E, 0 }, + { 0xA3A2AD39, 0xA3A2AD39 }, { 0xA36E71A3, 0 }, + { 0xA33A575A, 0xA33A575A }, { 0xA3065E40, 0 }, + { 0xA2D28634, 0 }, { 0xA29ECF16, 0xA29ECF16 }, + { 0xA26B38C9, 0 }, { 0xA237C32B, 0xA237C32B }, + { 0xA2046E1F, 0xA2046E1F }, { 0xA1D13986, 0 }, + { 0xA19E2540, 0 }, { 0xA16B312F, 0 }, + { 0xA1385D35, 0 }, { 0xA105A933, 0 }, + { 0xA0D3150C, 0 }, { 0xA0A0A0A1, 0 }, + { 0xA06E4BD4, 0xA06E4BD4 }, { 0xA03C1689, 0 }, + { 0xA00A00A0, 0xA00A00A0 }, { 0x9FD809FE, 0 }, + { 0x9FA63284, 0 }, { 0x9F747A15, 0x9F747A15 }, + { 0x9F42E095, 0x9F42E095 }, { 0x9F1165E7, 0x9F1165E7 }, + { 0x9EE009EE, 0x9EE009EE }, { 0x9EAECC8D, 0x9EAECC8D }, + { 0x9E7DADA9, 0 }, { 0x9E4CAD24, 0 }, + { 0x9E1BCAE3, 0 }, { 0x9DEB06C9, 0x9DEB06C9 }, + { 0x9DBA60BB, 0x9DBA60BB }, { 0x9D89D89E, 0 }, + { 0x9D596E54, 0x9D596E54 }, { 0x9D2921C4, 0 }, + { 0x9CF8F2D1, 0x9CF8F2D1 }, { 0x9CC8E161, 0 }, + { 0x9C98ED58, 0 }, { 0x9C69169B, 0x9C69169B }, + { 0x9C395D10, 0x9C395D10 }, { 0x9C09C09C, 0x9C09C09C }, + { 0x9BDA4124, 0x9BDA4124 }, { 0x9BAADE8E, 0x9BAADE8E }, + { 
0x9B7B98C0, 0 }, { 0x9B4C6F9F, 0 }, + { 0x9B1D6311, 0x9B1D6311 }, { 0x9AEE72FD, 0 }, + { 0x9ABF9F48, 0x9ABF9F48 }, { 0x9A90E7D9, 0x9A90E7D9 }, + { 0x9A624C97, 0 }, { 0x9A33CD67, 0x9A33CD67 }, + { 0x9A056A31, 0 }, { 0x99D722DB, 0 }, + { 0x99A8F74C, 0 }, { 0x997AE76B, 0x997AE76B }, + { 0x994CF320, 0x994CF320 }, { 0x991F1A51, 0x991F1A51 }, + { 0x98F15CE7, 0 }, { 0x98C3BAC7, 0x98C3BAC7 }, + { 0x989633DB, 0x989633DB }, { 0x9868C80A, 0 }, + { 0x983B773B, 0 }, { 0x980E4156, 0x980E4156 }, + { 0x97E12644, 0x97E12644 }, { 0x97B425ED, 0x97B425ED }, + { 0x97874039, 0 }, { 0x975A7510, 0 }, + { 0x972DC45B, 0 }, { 0x97012E02, 0x97012E02 }, + { 0x96D4B1EF, 0 }, { 0x96A8500A, 0 }, + { 0x967C083B, 0 }, { 0x964FDA6C, 0x964FDA6C }, + { 0x9623C686, 0x9623C686 }, { 0x95F7CC73, 0 }, + { 0x95CBEC1B, 0 }, { 0x95A02568, 0x95A02568 }, + { 0x95747844, 0 }, { 0x9548E498, 0 }, + { 0x951D6A4E, 0 }, { 0x94F2094F, 0x94F2094F }, + { 0x94C6C187, 0 }, { 0x949B92DE, 0 }, + { 0x94707D3F, 0 }, { 0x94458094, 0x94458094 }, + { 0x941A9CC8, 0x941A9CC8 }, { 0x93EFD1C5, 0x93EFD1C5 }, + { 0x93C51F76, 0 }, { 0x939A85C4, 0x939A85C4 }, + { 0x9370049C, 0 }, { 0x93459BE7, 0 }, + { 0x931B4B91, 0 }, { 0x92F11384, 0x92F11384 }, + { 0x92C6F3AC, 0x92C6F3AC }, { 0x929CEBF5, 0 }, + { 0x9272FC48, 0x9272FC48 }, { 0x92492492, 0x92492492 }, + { 0x921F64BF, 0 }, { 0x91F5BCB9, 0 }, + { 0x91CC2C6C, 0x91CC2C6C }, { 0x91A2B3C5, 0 }, + { 0x917952AF, 0 }, { 0x91500915, 0x91500915 }, + { 0x9126D6E5, 0 }, { 0x90FDBC09, 0x90FDBC09 }, + { 0x90D4B86F, 0 }, { 0x90ABCC02, 0x90ABCC02 }, + { 0x9082F6B0, 0 }, { 0x905A3863, 0x905A3863 }, + { 0x9031910A, 0 }, { 0x90090090, 0x90090090 }, + { 0x8FE086E3, 0 }, { 0x8FB823EE, 0x8FB823EE }, + { 0x8F8FD7A0, 0 }, { 0x8F67A1E4, 0 }, + { 0x8F3F82A8, 0x8F3F82A8 }, { 0x8F1779DA, 0 }, + { 0x8EEF8766, 0 }, { 0x8EC7AB3A, 0 }, + { 0x8E9FE542, 0x8E9FE542 }, { 0x8E78356D, 0x8E78356D }, + { 0x8E509BA8, 0x8E509BA8 }, { 0x8E2917E1, 0 }, + { 0x8E01AA05, 0 }, { 0x8DDA5202, 0x8DDA5202 }, + { 0x8DB30FC6, 0x8DB30FC6 }, { 0x8D8BE340, 0 }, + { 0x8D64CC5C, 0 }, { 0x8D3DCB09, 0 }, + { 0x8D16DF35, 0x8D16DF35 }, { 0x8CF008CF, 0x8CF008CF }, + { 0x8CC947C5, 0 }, { 0x8CA29C04, 0x8CA29C04 }, + { 0x8C7C057D, 0 }, { 0x8C55841D, 0 }, + { 0x8C2F17D2, 0x8C2F17D2 }, { 0x8C08C08C, 0x8C08C08C }, + { 0x8BE27E39, 0x8BE27E39 }, { 0x8BBC50C9, 0 }, + { 0x8B963829, 0x8B963829 }, { 0x8B70344A, 0x8B70344A }, + { 0x8B4A451A, 0 }, { 0x8B246A88, 0 }, + { 0x8AFEA483, 0x8AFEA483 }, { 0x8AD8F2FC, 0 }, + { 0x8AB355E0, 0x8AB355E0 }, { 0x8A8DCD20, 0 }, + { 0x8A6858AB, 0 }, { 0x8A42F870, 0x8A42F870 }, + { 0x8A1DAC60, 0x8A1DAC60 }, { 0x89F8746A, 0 }, + { 0x89D3507D, 0 }, { 0x89AE408A, 0 }, + { 0x89894480, 0 }, { 0x89645C4F, 0x89645C4F }, + { 0x893F87E8, 0x893F87E8 }, { 0x891AC73B, 0 }, + { 0x88F61A37, 0x88F61A37 }, { 0x88D180CD, 0x88D180CD }, + { 0x88ACFAEE, 0 }, { 0x88888889, 0 }, + { 0x8864298F, 0 }, { 0x883FDDF0, 0x883FDDF0 }, + { 0x881BA59E, 0 }, { 0x87F78088, 0 }, + { 0x87D36EA0, 0 }, { 0x87AF6FD6, 0 }, + { 0x878B841B, 0 }, { 0x8767AB5F, 0x8767AB5F }, + { 0x8743E595, 0 }, { 0x872032AC, 0x872032AC }, + { 0x86FC9296, 0x86FC9296 }, { 0x86D90545, 0 }, + { 0x86B58AA8, 0 }, { 0x869222B2, 0 }, + { 0x866ECD53, 0x866ECD53 }, { 0x864B8A7E, 0 }, + { 0x86285A23, 0x86285A23 }, { 0x86053C34, 0x86053C34 }, + { 0x85E230A3, 0x85E230A3 }, { 0x85BF3761, 0x85BF3761 }, + { 0x859C5060, 0x859C5060 }, { 0x85797B91, 0x85797B91 }, + { 0x8556B8E7, 0x8556B8E7 }, { 0x85340853, 0x85340853 }, + { 0x851169C7, 0x851169C7 }, { 0x84EEDD36, 0 }, + { 0x84CC6290, 0 }, { 0x84A9F9C8, 0x84A9F9C8 }, + { 0x8487A2D1, 0 
}, { 0x84655D9C, 0 },
+  { 0x84432A1B, 0x84432A1B }, { 0x84210842, 0x84210842 },
+  { 0x83FEF802, 0x83FEF802 }, { 0x83DCF94E, 0 },
+  { 0x83BB0C18, 0 }, { 0x83993052, 0x83993052 },
+  { 0x837765F0, 0x837765F0 }, { 0x8355ACE4, 0 },
+  { 0x83340520, 0x83340520 }, { 0x83126E98, 0 },
+  { 0x82F0E93D, 0x82F0E93D }, { 0x82CF7504, 0 },
+  { 0x82AE11DE, 0 }, { 0x828CBFBF, 0 },
+  { 0x826B7E99, 0x826B7E99 }, { 0x824A4E61, 0 },
+  { 0x82292F08, 0 }, { 0x82082082, 0x82082082 },
+  { 0x81E722C2, 0x81E722C2 }, { 0x81C635BC, 0x81C635BC },
+  { 0x81A55963, 0 }, { 0x81848DA9, 0 },
+  { 0x8163D283, 0 }, { 0x814327E4, 0 },
+  { 0x81228DBF, 0 }, { 0x81020408, 0x81020408 },
+  { 0x80E18AB3, 0 }, { 0x80C121B3, 0 },
+  { 0x80A0C8FB, 0x80A0C8FB }, { 0x80808081, 0 },
+  { 0x80604836, 0x80604836 }, { 0x80402010, 0x80402010 },
+  { 0x80200802, 0x80200802 }, { 0xFFFFFFFF, 0xFFFFFFFF }
+};
diff --git a/libs/libaom/src/av1/common/odintrin.h b/libs/libaom/src/av1/common/odintrin.h
new file mode 100644
index 000000000..e1db0f44d
--- /dev/null
+++ b/libs/libaom/src/av1/common/odintrin.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifndef AOM_AV1_COMMON_ODINTRIN_H_
+#define AOM_AV1_COMMON_ODINTRIN_H_
+
+#include <math.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/bitops.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int od_coeff;
+
+#define OD_DIVU_DMAX (1024)
+
+extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
+
+/*Translate unsigned division by small divisors into multiplications.*/
+#define OD_DIVU_SMALL(_x, _d)                                     \
+  ((uint32_t)((OD_DIVU_SMALL_CONSTS[(_d)-1][0] * (uint64_t)(_x) + \
+               OD_DIVU_SMALL_CONSTS[(_d)-1][1]) >>                \
+              32) >>                                              \
+   (OD_ILOG_NZ(_d) - 1))
+
+#define OD_DIVU(_x, _d) \
+  (((_d) < OD_DIVU_DMAX) ? (OD_DIVU_SMALL((_x), (_d))) : ((_x) / (_d)))
+
+#define OD_MINI AOMMIN
+#define OD_MAXI AOMMAX
+#define OD_CLAMPI(min, val, max) (OD_MAXI(min, OD_MINI(val, max)))
+
+/*Integer logarithm (base 2) of a nonzero unsigned 32-bit integer.
+  OD_ILOG_NZ(x) = (int)floor(log2(x)) + 1.*/
+#define OD_ILOG_NZ(x) (1 + get_msb(x))
+
+/*Enable special features for gcc and compatible compilers.*/
+#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
+#define OD_GNUC_PREREQ(maj, min, pat)                                \
+  ((__GNUC__ << 16) + (__GNUC_MINOR__ << 8) + __GNUC_PATCHLEVEL__ >= \
+   ((maj) << 16) + ((min) << 8) + pat)  // NOLINT
+#else
+#define OD_GNUC_PREREQ(maj, min, pat) (0)
+#endif
+
+#if OD_GNUC_PREREQ(3, 4, 0)
+#define OD_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
+#else
+#define OD_WARN_UNUSED_RESULT
+#endif
+
+#if OD_GNUC_PREREQ(3, 4, 0)
+#define OD_ARG_NONNULL(x) __attribute__((__nonnull__(x)))
+#else
+#define OD_ARG_NONNULL(x)
+#endif
+
+/** Copy n elements of memory from src to dst. The 0* term provides
+    compile-time type checking */
+#if !defined(OVERRIDE_OD_COPY)
+#define OD_COPY(dst, src, n) \
+  (memcpy((dst), (src), sizeof(*(dst)) * (n) + 0 * ((dst) - (src))))
+#endif
+
+/** Copy n elements of memory from src to dst, allowing overlapping regions.
+    The 0* term provides compile-time type checking */
+#if !defined(OVERRIDE_OD_MOVE)
+# define OD_MOVE(dst, src, n) \
+  (memmove((dst), (src), sizeof(*(dst))*(n) + 0*((dst) - (src)) ))
+#endif
+
+/*All of these macros should expect floats as arguments.*/
+# define OD_SIGNMASK(a) (-((a) < 0))
+# define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b))
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_ODINTRIN_H_
diff --git a/libs/libaom/src/av1/common/ppc/cfl_ppc.c b/libs/libaom/src/av1/common/ppc/cfl_ppc.c
new file mode 100644
index 000000000..6f88768f2
--- /dev/null
+++ b/libs/libaom/src/av1/common/ppc/cfl_ppc.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <altivec.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#define OFF_0 0
+#define OFF_1 16
+#define OFF_2 32
+#define OFF_3 48
+#define CFL_BUF_LINE_BYTES 64
+#define CFL_LINE_1 64
+#define CFL_LINE_2 128
+#define CFL_LINE_3 192
+
+typedef vector signed char int8x16_t;          // NOLINT(runtime/int)
+typedef vector unsigned char uint8x16_t;       // NOLINT(runtime/int)
+typedef vector signed short int16x8_t;         // NOLINT(runtime/int)
+typedef vector unsigned short uint16x8_t;      // NOLINT(runtime/int)
+typedef vector signed int int32x4_t;           // NOLINT(runtime/int)
+typedef vector unsigned int uint32x4_t;        // NOLINT(runtime/int)
+typedef vector unsigned long long uint64x2_t;  // NOLINT(runtime/int)
+
+static INLINE void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst,
+                                        int width, int height, int round_offset,
+                                        int num_pel_log2) {
+  //  int16_t *dst = dst_ptr;
+  const int16_t *dst_end = dst + height * CFL_BUF_LINE;
+  const int16_t *sum_buf = (const int16_t *)src_ptr;
+  const int16_t *end = sum_buf + height * CFL_BUF_LINE;
+  const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2);
+  const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+                               0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
+  const uint8x16_t mask_32 = { 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
+                               0x1C, 0x1D, 0x1E, 0x1F, 0x08, 0x09, 0x0A, 0x0B };
+
+  int32x4_t sum_32x4_0 = { 0, 0, 0, round_offset };
+  int32x4_t sum_32x4_1 = { 0, 0, 0, 0 };
+  do {
+    sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_0, sum_buf), sum_32x4_0);
+    sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_0 + CFL_LINE_1, sum_buf), sum_32x4_1);
+    if (width >= 16) {
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_1, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
+          vec_sum4s(vec_vsx_ld(OFF_1 + CFL_LINE_1, sum_buf), sum_32x4_1);
+    }
+    if (width == 32) {
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_2, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
+          vec_sum4s(vec_vsx_ld(OFF_2 + CFL_LINE_1, sum_buf), sum_32x4_1);
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_3, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
vec_sum4s(vec_vsx_ld(OFF_3 + CFL_LINE_1, sum_buf), sum_32x4_1); + } + } while ((sum_buf += (CFL_BUF_LINE * 2)) < end); + int32x4_t sum_32x4 = vec_add(sum_32x4_0, sum_32x4_1); + + const int32x4_t perm_64 = vec_perm(sum_32x4, sum_32x4, mask_64); + sum_32x4 = vec_add(sum_32x4, perm_64); + const int32x4_t perm_32 = vec_perm(sum_32x4, sum_32x4, mask_32); + sum_32x4 = vec_add(sum_32x4, perm_32); + const int32x4_t avg = vec_sr(sum_32x4, div_shift); + const int16x8_t vec_avg = vec_pack(avg, avg); + do { + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, dst), vec_avg), OFF_0, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, dst), vec_avg), + OFF_0 + CFL_BUF_LINE_BYTES, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, dst), vec_avg), + OFF_0 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, dst), vec_avg), + OFF_0 + CFL_LINE_3, dst); + if (width >= 16) { + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, dst), vec_avg), OFF_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, dst), vec_avg), + OFF_1 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, dst), vec_avg), + OFF_1 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, dst), vec_avg), + OFF_1 + CFL_LINE_3, dst); + } + if (width == 32) { + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, dst), vec_avg), OFF_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, dst), vec_avg), + OFF_2 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, dst), vec_avg), + OFF_2 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, dst), vec_avg), + OFF_2 + CFL_LINE_3, dst); + + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, dst), vec_avg), OFF_3, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, dst), vec_avg), + OFF_3 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, dst), vec_avg), + OFF_3 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, dst), vec_avg), + OFF_3 + CFL_LINE_3, dst); + } + } while ((dst += CFL_BUF_LINE * 4) < dst_end); +} + +// Declare wrappers for VSX sizes +CFL_SUB_AVG_X(vsx, 8, 4, 16, 5) +CFL_SUB_AVG_X(vsx, 8, 8, 32, 6) +CFL_SUB_AVG_X(vsx, 8, 16, 64, 7) +CFL_SUB_AVG_X(vsx, 8, 32, 128, 8) +CFL_SUB_AVG_X(vsx, 16, 4, 32, 6) +CFL_SUB_AVG_X(vsx, 16, 8, 64, 7) +CFL_SUB_AVG_X(vsx, 16, 16, 128, 8) +CFL_SUB_AVG_X(vsx, 16, 32, 256, 9) +CFL_SUB_AVG_X(vsx, 32, 8, 128, 8) +CFL_SUB_AVG_X(vsx, 32, 16, 256, 9) +CFL_SUB_AVG_X(vsx, 32, 32, 512, 10) + +// Based on observation, for small blocks VSX does not outperform C (no 64bit +// load and store intrinsics). So we call the C code for block widths 4. 
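+// A hedged usage sketch (the call-site shape is assumed, not taken from this
+// file): callers fetch the per-transform-size implementation once, e.g.
+//   cfl_subtract_average_fn fn = cfl_get_subtract_average_fn_vsx(TX_8X8);
+// and then invoke fn on the CfL buffers; entries left NULL in the table
+// below correspond to block sizes CfL never uses.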
+cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) { + static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { + cfl_subtract_average_4x4_c, /* 4x4 */ + cfl_subtract_average_8x8_vsx, /* 8x8 */ + cfl_subtract_average_16x16_vsx, /* 16x16 */ + cfl_subtract_average_32x32_vsx, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_subtract_average_4x8_c, /* 4x8 */ + cfl_subtract_average_8x4_vsx, /* 8x4 */ + cfl_subtract_average_8x16_vsx, /* 8x16 */ + cfl_subtract_average_16x8_vsx, /* 16x8 */ + cfl_subtract_average_16x32_vsx, /* 16x32 */ + cfl_subtract_average_32x16_vsx, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_subtract_average_4x16_c, /* 4x16 */ + cfl_subtract_average_16x4_vsx, /* 16x4 */ + cfl_subtract_average_8x32_vsx, /* 8x32 */ + cfl_subtract_average_32x8_vsx, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to + // index the function pointer array out of bounds. + return sub_avg[tx_size % TX_SIZES_ALL]; +} diff --git a/libs/libaom/src/av1/common/pred_common.c b/libs/libaom/src/av1/common/pred_common.c new file mode 100644 index 000000000..5952441d1 --- /dev/null +++ b/libs/libaom/src/av1/common/pred_common.c @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/common.h" +#include "av1/common/pred_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/seg_common.h" + +// Returns a context number for the given MB prediction signal +static InterpFilter get_ref_filter_type(const MB_MODE_INFO *ref_mbmi, + const MACROBLOCKD *xd, int dir, + MV_REFERENCE_FRAME ref_frame) { + (void)xd; + + return ((ref_mbmi->ref_frame[0] == ref_frame || + ref_mbmi->ref_frame[1] == ref_frame) + ? av1_extract_interp_filter(ref_mbmi->interp_filters, dir & 0x01) + : SWITCHABLE_FILTERS); +} + +int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int ctx_offset = + (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET; + assert(dir == 0 || dir == 1); + const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0]; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. 
+ int filter_type_ctx = ctx_offset + (dir & 0x01) * INTER_FILTER_DIR_OFFSET; + int left_type = SWITCHABLE_FILTERS; + int above_type = SWITCHABLE_FILTERS; + + if (xd->left_available) + left_type = get_ref_filter_type(xd->mi[-1], xd, dir, ref_frame); + + if (xd->up_available) + above_type = + get_ref_filter_type(xd->mi[-xd->mi_stride], xd, dir, ref_frame); + + if (left_type == above_type) { + filter_type_ctx += left_type; + } else if (left_type == SWITCHABLE_FILTERS) { + assert(above_type != SWITCHABLE_FILTERS); + filter_type_ctx += above_type; + } else if (above_type == SWITCHABLE_FILTERS) { + assert(left_type != SWITCHABLE_FILTERS); + filter_type_ctx += left_type; + } else { + filter_type_ctx += SWITCHABLE_FILTERS; + } + + return filter_type_ctx; +} + +static void palette_add_to_cache(uint16_t *cache, int *n, uint16_t val) { + // Do not add an already existing value + if (*n > 0 && val == cache[*n - 1]) return; + + cache[(*n)++] = val; +} + +int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane, + uint16_t *cache) { + const int row = -xd->mb_to_top_edge >> 3; + // Do not refer to above SB row when on SB boundary. + const MB_MODE_INFO *const above_mi = + (row % (1 << MIN_SB_SIZE_LOG2)) ? xd->above_mbmi : NULL; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + int above_n = 0, left_n = 0; + if (above_mi) above_n = above_mi->palette_mode_info.palette_size[plane != 0]; + if (left_mi) left_n = left_mi->palette_mode_info.palette_size[plane != 0]; + if (above_n == 0 && left_n == 0) return 0; + int above_idx = plane * PALETTE_MAX_SIZE; + int left_idx = plane * PALETTE_MAX_SIZE; + int n = 0; + const uint16_t *above_colors = + above_mi ? above_mi->palette_mode_info.palette_colors : NULL; + const uint16_t *left_colors = + left_mi ? left_mi->palette_mode_info.palette_colors : NULL; + // Merge the sorted lists of base colors from above and left to get + // combined sorted color cache. + while (above_n > 0 && left_n > 0) { + uint16_t v_above = above_colors[above_idx]; + uint16_t v_left = left_colors[left_idx]; + if (v_left < v_above) { + palette_add_to_cache(cache, &n, v_left); + ++left_idx, --left_n; + } else { + palette_add_to_cache(cache, &n, v_above); + ++above_idx, --above_n; + if (v_left == v_above) ++left_idx, --left_n; + } + } + while (above_n-- > 0) { + uint16_t val = above_colors[above_idx++]; + palette_add_to_cache(cache, &n, val); + } + while (left_n-- > 0) { + uint16_t val = left_colors[left_idx++]; + palette_add_to_cache(cache, &n, val); + } + assert(n <= 2 * PALETTE_MAX_SIZE); + return n; +} + +// The mode info data structure has a one element border above and to the +// left of the entries corresponding to real macroblocks. +// The prediction flags in these dummy entries are initialized to 0. +// 0 - inter/inter, inter/--, --/inter, --/-- +// 1 - intra/inter, inter/intra +// 2 - intra/--, --/intra +// 3 - intra/intra +int av1_get_intra_inter_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int has_above = xd->up_available; + const int has_left = xd->left_available; + + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + return left_intra && above_intra ? 3 : left_intra || above_intra; + } else if (has_above || has_left) { // one edge available + return 2 * !is_inter_block(has_above ? 
above_mbmi : left_mbmi); + } else { + return 0; + } +} + +#define CHECK_BACKWARD_REFS(ref_frame) \ + (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME)) +#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame) + +int av1_get_reference_mode_context(const MACROBLOCKD *xd) { + int ctx; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int has_above = xd->up_available; + const int has_left = xd->left_available; + + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + if (has_above && has_left) { // both edges available + if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) + // neither edge uses comp pred (0/1) + ctx = IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ^ + IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]); + else if (!has_second_ref(above_mbmi)) + // one of two edges uses comp pred (2/3) + ctx = 2 + (IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) || + !is_inter_block(above_mbmi)); + else if (!has_second_ref(left_mbmi)) + // one of two edges uses comp pred (2/3) + ctx = 2 + (IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]) || + !is_inter_block(left_mbmi)); + else // both edges use comp pred (4) + ctx = 4; + } else if (has_above || has_left) { // one edge available + const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi; + + if (!has_second_ref(edge_mbmi)) + // edge does not use comp pred (0/1) + ctx = IS_BACKWARD_REF_FRAME(edge_mbmi->ref_frame[0]); + else + // edge uses comp pred (3) + ctx = 3; + } else { // no edges available (1) + ctx = 1; + } + assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS); + return ctx; +} + +int av1_get_comp_reference_type_context(const MACROBLOCKD *xd) { + int pred_context; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int above_in_image = xd->up_available; + const int left_in_image = xd->left_available; + + if (above_in_image && left_in_image) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + + if (above_intra && left_intra) { // intra/intra + pred_context = 2; + } else if (above_intra || left_intra) { // intra/inter + const MB_MODE_INFO *inter_mbmi = above_intra ? left_mbmi : above_mbmi; + + if (!has_second_ref(inter_mbmi)) // single pred + pred_context = 2; + else // comp pred + pred_context = 1 + 2 * has_uni_comp_refs(inter_mbmi); + } else { // inter/inter + const int a_sg = !has_second_ref(above_mbmi); + const int l_sg = !has_second_ref(left_mbmi); + const MV_REFERENCE_FRAME frfa = above_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME frfl = left_mbmi->ref_frame[0]; + + if (a_sg && l_sg) { // single/single + pred_context = 1 + 2 * (!(IS_BACKWARD_REF_FRAME(frfa) ^ + IS_BACKWARD_REF_FRAME(frfl))); + } else if (l_sg || a_sg) { // single/comp + const int uni_rfc = + a_sg ? 
has_uni_comp_refs(left_mbmi) : has_uni_comp_refs(above_mbmi); + + if (!uni_rfc) // comp bidir + pred_context = 1; + else // comp unidir + pred_context = 3 + (!(IS_BACKWARD_REF_FRAME(frfa) ^ + IS_BACKWARD_REF_FRAME(frfl))); + } else { // comp/comp + const int a_uni_rfc = has_uni_comp_refs(above_mbmi); + const int l_uni_rfc = has_uni_comp_refs(left_mbmi); + + if (!a_uni_rfc && !l_uni_rfc) // bidir/bidir + pred_context = 0; + else if (!a_uni_rfc || !l_uni_rfc) // unidir/bidir + pred_context = 2; + else // unidir/unidir + pred_context = + 3 + (!((frfa == BWDREF_FRAME) ^ (frfl == BWDREF_FRAME))); + } + } + } else if (above_in_image || left_in_image) { // one edge available + const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + + if (!is_inter_block(edge_mbmi)) { // intra + pred_context = 2; + } else { // inter + if (!has_second_ref(edge_mbmi)) // single pred + pred_context = 2; + else // comp pred + pred_context = 4 * has_uni_comp_refs(edge_mbmi); + } + } else { // no edges available + pred_context = 2; + } + + assert(pred_context >= 0 && pred_context < COMP_REF_TYPE_CONTEXTS); + return pred_context; +} + +// Returns a context number for the given MB prediction signal +// +// Signal the uni-directional compound reference frame pair as either +// (BWDREF, ALTREF), or (LAST, LAST2) / (LAST, LAST3) / (LAST, GOLDEN), +// conditioning on the pair is known as uni-directional. +// +// 3 contexts: Voting is used to compare the count of forward references with +// that of backward references from the spatial neighbors. +int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of forward references (L, L2, L3, or G) + const int frf_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] + + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + // Count of backward references (B or A) + const int brf_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] + + ref_counts[ALTREF_FRAME]; + + const int pred_context = + (frf_count == brf_count) ? 1 : ((frf_count < brf_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); + return pred_context; +} + +// Returns a context number for the given MB prediction signal +// +// Signal the uni-directional compound reference frame pair as +// either (LAST, LAST2), or (LAST, LAST3) / (LAST, GOLDEN), +// conditioning on the pair is known as one of the above three. +// +// 3 contexts: Voting is used to compare the count of LAST2_FRAME with the +// total count of LAST3/GOLDEN from the spatial neighbors. +int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST2 + const int last2_count = ref_counts[LAST2_FRAME]; + // Count of LAST3 or GOLDEN + const int last3_or_gld_count = + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + + const int pred_context = (last2_count == last3_or_gld_count) + ? 1 + : ((last2_count < last3_or_gld_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); + return pred_context; +} + +// Returns a context number for the given MB prediction signal +// +// Signal the uni-directional compound reference frame pair as +// either (LAST, LAST3) or (LAST, GOLDEN), +// conditioning on the pair is known as one of the above two. +// +// 3 contexts: Voting is used to compare the count of LAST3_FRAME with the +// total count of GOLDEN_FRAME from the spatial neighbors. 
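+// (As in the other voting contexts in this file, the mapping is: 0 when the
+// first option is the rarer one among the neighbors, 1 on a tie, and 2 when
+// it is the more common one.)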
+int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST3 + const int last3_count = ref_counts[LAST3_FRAME]; + // Count of GOLDEN + const int gld_count = ref_counts[GOLDEN_FRAME]; + + const int pred_context = + (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); + return pred_context; +} + +// == Common context functions for both comp and single ref == +// +// Obtain contexts to signal a reference frame to be either LAST/LAST2 or +// LAST3/GOLDEN. +static int get_pred_context_ll2_or_l3gld(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST + LAST2 + const int last_last2_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME]; + // Count of LAST3 + GOLDEN + const int last3_gld_count = + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + + const int pred_context = (last_last2_count == last3_gld_count) + ? 1 + : ((last_last2_count < last3_gld_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// Obtain contexts to signal a reference frame to be either LAST or LAST2. +static int get_pred_context_last_or_last2(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST + const int last_count = ref_counts[LAST_FRAME]; + // Count of LAST2 + const int last2_count = ref_counts[LAST2_FRAME]; + + const int pred_context = + (last_count == last2_count) ? 1 : ((last_count < last2_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// Obtain contexts to signal a reference frame to be either LAST3 or GOLDEN. +static int get_pred_context_last3_or_gld(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST3 + const int last3_count = ref_counts[LAST3_FRAME]; + // Count of GOLDEN + const int gld_count = ref_counts[GOLDEN_FRAME]; + + const int pred_context = + (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// Obtain contexts to signal a reference frame be either BWDREF/ALTREF2, or +// ALTREF. +static int get_pred_context_brfarf2_or_arf(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Counts of BWDREF, ALTREF2, or ALTREF frames (B, A2, or A) + const int brfarf2_count = + ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME]; + const int arf_count = ref_counts[ALTREF_FRAME]; + + const int pred_context = + (brfarf2_count == arf_count) ? 1 : ((brfarf2_count < arf_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// Obtain contexts to signal a reference frame be either BWDREF or ALTREF2. +static int get_pred_context_brf_or_arf2(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of BWDREF frames (B) + const int brf_count = ref_counts[BWDREF_FRAME]; + // Count of ALTREF2 frames (A2) + const int arf2_count = ref_counts[ALTREF2_FRAME]; + + const int pred_context = + (brf_count == arf2_count) ? 1 : ((brf_count < arf2_count) ? 
0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// == Context functions for comp ref == +// +// Returns a context number for the given MB prediction signal +// Signal the first reference frame for a compound mode be either +// GOLDEN/LAST3, or LAST/LAST2. +int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd) { + return get_pred_context_ll2_or_l3gld(xd); +} + +// Returns a context number for the given MB prediction signal +// Signal the first reference frame for a compound mode be LAST, +// conditioning on that it is known either LAST/LAST2. +int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd) { + return get_pred_context_last_or_last2(xd); +} + +// Returns a context number for the given MB prediction signal +// Signal the first reference frame for a compound mode be GOLDEN, +// conditioning on that it is known either GOLDEN or LAST3. +int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd) { + return get_pred_context_last3_or_gld(xd); +} + +// Signal the 2nd reference frame for a compound mode be either +// ALTREF, or ALTREF2/BWDREF. +int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd) { + return get_pred_context_brfarf2_or_arf(xd); +} + +// Signal the 2nd reference frame for a compound mode be either +// ALTREF2 or BWDREF. +int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd) { + return get_pred_context_brf_or_arf2(xd); +} + +// == Context functions for single ref == +// +// For the bit to signal whether the single reference is a forward reference +// frame or a backward reference frame. +int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of forward reference frames + const int fwd_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] + + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + // Count of backward reference frames + const int bwd_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] + + ref_counts[ALTREF_FRAME]; + + const int pred_context = + (fwd_count == bwd_count) ? 1 : ((fwd_count < bwd_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// For the bit to signal whether the single reference is ALTREF_FRAME or +// non-ALTREF backward reference frame, knowing that it shall be either of +// these 2 choices. +int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { + return get_pred_context_brfarf2_or_arf(xd); +} + +// For the bit to signal whether the single reference is LAST3/GOLDEN or +// LAST2/LAST, knowing that it shall be either of these 2 choices. +int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) { + return get_pred_context_ll2_or_l3gld(xd); +} + +// For the bit to signal whether the single reference is LAST2_FRAME or +// LAST_FRAME, knowing that it shall be either of these 2 choices. +int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) { + return get_pred_context_last_or_last2(xd); +} + +// For the bit to signal whether the single reference is GOLDEN_FRAME or +// LAST3_FRAME, knowing that it shall be either of these 2 choices. +int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) { + return get_pred_context_last3_or_gld(xd); +} + +// For the bit to signal whether the single reference is ALTREF2_FRAME or +// BWDREF_FRAME, knowing that it shall be either of these 2 choices. 
+int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd) {
+  return get_pred_context_brf_or_arf2(xd);
+}
diff --git a/libs/libaom/src/av1/common/pred_common.h b/libs/libaom/src/av1/common/pred_common.h
new file mode 100644
index 000000000..d1dab97e7
--- /dev/null
+++ b/libs/libaom/src/av1/common/pred_common.h
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_PRED_COMMON_H_
+#define AOM_AV1_COMMON_PRED_COMMON_H_
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE int get_segment_id(const CommonModeInfoParams *const mi_params,
+                                 const uint8_t *segment_ids, BLOCK_SIZE bsize,
+                                 int mi_row, int mi_col) {
+  const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
+  const int bw = mi_size_wide[bsize];
+  const int bh = mi_size_high[bsize];
+  const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw);
+  const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh);
+  int segment_id = MAX_SEGMENTS;
+
+  for (int y = 0; y < ymis; ++y) {
+    for (int x = 0; x < xmis; ++x) {
+      segment_id = AOMMIN(segment_id,
+                          segment_ids[mi_offset + y * mi_params->mi_cols + x]);
+    }
+  }
+
+  assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+  return segment_id;
+}
+
+static INLINE int av1_get_spatial_seg_pred(const AV1_COMMON *const cm,
+                                           const MACROBLOCKD *const xd,
+                                           int *cdf_index) {
+  int prev_ul = -1;  // top left segment_id
+  int prev_l = -1;   // left segment_id
+  int prev_u = -1;   // top segment_id
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const uint8_t *seg_map = cm->cur_frame->seg_map;
+  if ((xd->up_available) && (xd->left_available)) {
+    prev_ul =
+        get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 1, mi_col - 1);
+  }
+  if (xd->up_available) {
+    prev_u =
+        get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 1, mi_col - 0);
+  }
+  if (xd->left_available) {
+    prev_l =
+        get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 0, mi_col - 1);
+  }
+  // This property follows from the fact that get_segment_id() returns a
+  // nonnegative value. This allows us to test for all edge cases with a
+  // simple prev_ul < 0 check.
+  assert(IMPLIES(prev_ul >= 0, prev_u >= 0 && prev_l >= 0));
+
+  // Pick CDF index based on number of matching/out-of-bounds segment IDs.
+  if (prev_ul < 0) /* Edge cases */
+    *cdf_index = 0;
+  else if ((prev_ul == prev_u) && (prev_ul == prev_l))
+    *cdf_index = 2;
+  else if ((prev_ul == prev_u) || (prev_ul == prev_l) || (prev_u == prev_l))
+    *cdf_index = 1;
+  else
+    *cdf_index = 0;
+
+  // If two or more of the neighboring segment IDs are identical, return that
+  // value as the predictor; otherwise return prev_l.
+  if (prev_u == -1)  // edge case
+    return prev_l == -1 ? 0 : prev_l;
+  if (prev_l == -1)  // edge case
+    return prev_u;
+  return (prev_ul == prev_u) ? prev_u : prev_l;
+}
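+
+// Hedged worked example of av1_get_spatial_seg_pred() (segment IDs invented
+// for illustration): if the top-left, top, and left neighbors carry segment
+// IDs 3, 3, and 1, exactly one pair matches, so *cdf_index is 1 and the
+// prediction is 3 (prev_ul == prev_u); with IDs 3, 2, and 1 no pair matches,
+// so *cdf_index is 0 and prev_l (= 1) is returned.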
+
+static INLINE int av1_get_pred_context_seg_id(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+  const int above_sip = (above_mi != NULL) ? above_mi->seg_id_predicted : 0;
+  const int left_sip = (left_mi != NULL) ? left_mi->seg_id_predicted : 0;
+
+  return above_sip + left_sip;
+}
+
+static INLINE int get_comp_index_context(const AV1_COMMON *cm,
+                                         const MACROBLOCKD *xd) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
+  const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
+  int bck_frame_index = 0, fwd_frame_index = 0;
+  int cur_frame_index = cm->cur_frame->order_hint;
+
+  if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint;
+  if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint;
+
+  int fwd = abs(get_relative_dist(&cm->seq_params.order_hint_info,
+                                  fwd_frame_index, cur_frame_index));
+  int bck = abs(get_relative_dist(&cm->seq_params.order_hint_info,
+                                  cur_frame_index, bck_frame_index));
+
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+
+  int above_ctx = 0, left_ctx = 0;
+  const int offset = (fwd == bck);
+
+  if (above_mi != NULL) {
+    if (has_second_ref(above_mi))
+      above_ctx = above_mi->compound_idx;
+    else if (above_mi->ref_frame[0] == ALTREF_FRAME)
+      above_ctx = 1;
+  }
+
+  if (left_mi != NULL) {
+    if (has_second_ref(left_mi))
+      left_ctx = left_mi->compound_idx;
+    else if (left_mi->ref_frame[0] == ALTREF_FRAME)
+      left_ctx = 1;
+  }
+
+  return above_ctx + left_ctx + 3 * offset;
+}
+
+static INLINE int get_comp_group_idx_context(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+  int above_ctx = 0, left_ctx = 0;
+
+  if (above_mi) {
+    if (has_second_ref(above_mi))
+      above_ctx = above_mi->comp_group_idx;
+    else if (above_mi->ref_frame[0] == ALTREF_FRAME)
+      above_ctx = 3;
+  }
+  if (left_mi) {
+    if (has_second_ref(left_mi))
+      left_ctx = left_mi->comp_group_idx;
+    else if (left_mi->ref_frame[0] == ALTREF_FRAME)
+      left_ctx = 3;
+  }
+
+  return AOMMIN(5, above_ctx + left_ctx);
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_seg_id(
+    struct segmentation_probs *segp, const MACROBLOCKD *xd) {
+  return segp->pred_cdf[av1_get_pred_context_seg_id(xd)];
+}
+
+static INLINE int av1_get_skip_mode_context(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+  const int above_skip_mode = above_mi ? above_mi->skip_mode : 0;
+  const int left_skip_mode = left_mi ? left_mi->skip_mode : 0;
+  return above_skip_mode + left_skip_mode;
+}
+
+static INLINE int av1_get_skip_context(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+  const int above_skip = above_mi ? above_mi->skip : 0;
+  const int left_skip = left_mi ? left_mi->skip : 0;
+  return above_skip + left_skip;
+}
+
+int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir);
+
+// Get a list of palette base colors that are used in the above and left
+// blocks, referred to as "color cache". The return value is the number of
+// colors in the cache (<= 2 * PALETTE_MAX_SIZE). The color values are stored
+// in "cache" in ascending order.
+int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane, + uint16_t *cache); + +static INLINE int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + return num_pels_log2_lookup[bsize] - num_pels_log2_lookup[BLOCK_8X8]; +} + +static INLINE int av1_get_palette_mode_ctx(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + int ctx = 0; + if (above_mi) ctx += (above_mi->palette_mode_info.palette_size[0] > 0); + if (left_mi) ctx += (left_mi->palette_mode_info.palette_size[0] > 0); + return ctx; +} + +int av1_get_intra_inter_context(const MACROBLOCKD *xd); + +int av1_get_reference_mode_context(const MACROBLOCKD *xd); + +static INLINE aom_cdf_prob *av1_get_reference_mode_cdf(const MACROBLOCKD *xd) { + return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(xd)]; +} + +static INLINE aom_cdf_prob *av1_get_skip_cdf(const MACROBLOCKD *xd) { + return xd->tile_ctx->skip_cdfs[av1_get_skip_context(xd)]; +} + +int av1_get_comp_reference_type_context(const MACROBLOCKD *xd); + +// == Uni-directional contexts == + +int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd); + +int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd); + +int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd); + +static INLINE aom_cdf_prob *av1_get_comp_reference_type_cdf( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_comp_reference_type_context(xd); + return xd->tile_ctx->comp_ref_type_cdf[pred_context]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_uni_comp_ref_p(xd); + return xd->tile_ctx->uni_comp_ref_cdf[pred_context][0]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p1( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_uni_comp_ref_p1(xd); + return xd->tile_ctx->uni_comp_ref_cdf[pred_context][1]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p2( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_uni_comp_ref_p2(xd); + return xd->tile_ctx->uni_comp_ref_cdf[pred_context][2]; +} + +// == Bi-directional contexts == + +int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd); + +int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd); + +int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd); + +int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd); + +int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd); + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p(const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_ref_p(xd); + return xd->tile_ctx->comp_ref_cdf[pred_context][0]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p1( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_ref_p1(xd); + return xd->tile_ctx->comp_ref_cdf[pred_context][1]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p2( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_ref_p2(xd); + return xd->tile_ctx->comp_ref_cdf[pred_context][2]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_bwdref_p(xd); + return xd->tile_ctx->comp_bwdref_cdf[pred_context][0]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p1( + const MACROBLOCKD *xd) { + const int pred_context = 
av1_get_pred_context_comp_bwdref_p1(xd);
+  return xd->tile_ctx->comp_bwdref_cdf[pred_context][1];
+}
+
+// == Single contexts ==
+
+int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd);
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p1(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->single_ref_cdf[av1_get_pred_context_single_ref_p1(xd)][0];
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p2(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->single_ref_cdf[av1_get_pred_context_single_ref_p2(xd)][1];
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p3(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->single_ref_cdf[av1_get_pred_context_single_ref_p3(xd)][2];
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p4(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->single_ref_cdf[av1_get_pred_context_single_ref_p4(xd)][3];
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p5(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->single_ref_cdf[av1_get_pred_context_single_ref_p5(xd)][4];
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p6(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->single_ref_cdf[av1_get_pred_context_single_ref_p6(xd)][5];
+}
+
+// Returns a context number for the given MB prediction signal.
+// The mode info data structure has a one-element border above and to the
+// left of the entries corresponding to real blocks.
+// The prediction flags in these dummy entries are initialized to 0.
+static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const TX_SIZE max_tx_size = max_txsize_rect_lookup[mbmi->sb_type];
+  const int max_tx_wide = tx_size_wide[max_tx_size];
+  const int max_tx_high = tx_size_high[max_tx_size];
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  int above = xd->above_txfm_context[0] >= max_tx_wide;
+  int left = xd->left_txfm_context[0] >= max_tx_high;
+
+  if (has_above)
+    if (is_inter_block(above_mbmi))
+      above = block_size_wide[above_mbmi->sb_type] >= max_tx_wide;
+
+  if (has_left)
+    if (is_inter_block(left_mbmi))
+      left = block_size_high[left_mbmi->sb_type] >= max_tx_high;
+
+  if (has_above && has_left)
+    return (above + left);
+  else if (has_above)
+    return above;
+  else if (has_left)
+    return left;
+  else
+    return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_PRED_COMMON_H_
diff --git a/libs/libaom/src/av1/common/quant_common.c b/libs/libaom/src/av1/common/quant_common.c
new file mode 100644
index 000000000..e96d71a3b
--- /dev/null
+++ b/libs/libaom/src/av1/common/quant_common.c
@@ -0,0 +1,12875 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/common.h" +#include "av1/common/entropy.h" +#include "av1/common/quant_common.h" +#include "av1/common/seg_common.h" + +static const int16_t dc_qlookup_QTX[QINDEX_RANGE] = { + 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, + 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, + 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, + 43, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 53, + 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, + 66, 66, 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76, + 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, 87, 88, + 90, 92, 93, 95, 96, 98, 99, 101, 102, 104, 105, 107, 108, 110, + 111, 113, 114, 116, 117, 118, 120, 121, 123, 125, 127, 129, 131, 134, + 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 161, 164, + 166, 169, 172, 174, 177, 180, 182, 185, 187, 190, 192, 195, 199, 202, + 205, 208, 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247, + 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, 296, 300, + 304, 309, 313, 317, 322, 326, 330, 335, 340, 344, 349, 354, 359, 364, + 369, 374, 379, 384, 389, 395, 400, 406, 411, 417, 423, 429, 435, 441, + 447, 454, 461, 467, 475, 482, 489, 497, 505, 513, 522, 530, 539, 549, + 559, 569, 579, 590, 602, 614, 626, 640, 654, 668, 684, 700, 717, 736, + 755, 775, 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139, + 1184, 1232, 1282, 1336, +}; + +static const int16_t dc_qlookup_10_QTX[QINDEX_RANGE] = { + 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37, + 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82, + 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132, + 136, 140, 143, 147, 151, 155, 159, 163, 166, 170, 174, 178, 182, + 185, 189, 193, 197, 200, 204, 208, 212, 215, 219, 223, 226, 230, + 233, 237, 241, 244, 248, 251, 255, 259, 262, 266, 269, 273, 276, + 280, 283, 287, 290, 293, 297, 300, 304, 307, 310, 314, 317, 321, + 324, 327, 331, 334, 337, 343, 350, 356, 362, 369, 375, 381, 387, + 394, 400, 406, 412, 418, 424, 430, 436, 442, 448, 454, 460, 466, + 472, 478, 484, 490, 499, 507, 516, 525, 533, 542, 550, 559, 567, + 576, 584, 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, + 698, 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831, + 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, 1001, + 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202, + 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, 1398, 1416, 1436, + 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717, + 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088, + 2123, 2159, 2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675, + 2737, 2802, 2871, 2944, 3020, 3102, 3188, 3280, 3375, 3478, 3586, 3702, 3823, + 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347, +}; + +static const int16_t dc_qlookup_12_QTX[QINDEX_RANGE] = { + 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91, + 103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237, + 251, 266, 281, 296, 312, 327, 343, 358, 374, 390, 405, + 421, 437, 453, 469, 484, 500, 516, 532, 548, 564, 580, + 596, 611, 627, 643, 659, 674, 690, 706, 721, 737, 752, + 768, 783, 798, 814, 829, 844, 859, 874, 889, 904, 919, + 934, 949, 964, 978, 993, 1008, 1022, 1037, 1051, 
1065, 1080, + 1094, 1108, 1122, 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234, + 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, 1368, 1393, 1419, + 1444, 1469, 1494, 1519, 1544, 1569, 1594, 1618, 1643, 1668, 1692, + 1717, 1741, 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, 1957, + 1992, 2027, 2061, 2096, 2130, 2165, 2199, 2233, 2267, 2300, 2334, + 2367, 2400, 2434, 2467, 2499, 2532, 2575, 2618, 2661, 2704, 2746, + 2788, 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, 3177, 3226, + 3275, 3324, 3373, 3421, 3469, 3517, 3565, 3621, 3677, 3733, 3788, + 3843, 3897, 3951, 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420, + 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, 5013, 5083, 5153, + 5222, 5291, 5367, 5442, 5517, 5591, 5665, 5745, 5825, 5905, 5984, + 6063, 6149, 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, 6966, + 7064, 7163, 7269, 7376, 7483, 7599, 7715, 7832, 7958, 8085, 8214, + 8352, 8492, 8635, 8788, 8945, 9104, 9275, 9450, 9639, 9832, 10031, + 10245, 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, 12750, 13118, + 13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949, + 19718, 20521, 21387, +}; + +static const int16_t ac_qlookup_QTX[QINDEX_RANGE] = { + 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, + 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, + 98, 99, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118, + 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, + 146, 148, 150, 152, 155, 158, 161, 164, 167, 170, 173, 176, 179, + 182, 185, 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223, + 227, 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280, + 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347, 353, + 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432, 440, 448, + 456, 465, 474, 483, 492, 501, 510, 520, 530, 540, 550, 560, 571, + 582, 593, 604, 615, 627, 639, 651, 663, 676, 689, 702, 715, 729, + 743, 757, 771, 786, 801, 816, 832, 848, 864, 881, 898, 915, 933, + 951, 969, 988, 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196, + 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537, + 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, +}; + +static const int16_t ac_qlookup_10_QTX[QINDEX_RANGE] = { + 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40, + 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83, 88, 92, + 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149, + 154, 158, 163, 168, 172, 177, 181, 186, 190, 195, 199, 204, 208, + 213, 217, 222, 226, 231, 235, 240, 244, 249, 253, 258, 262, 267, + 271, 275, 280, 284, 289, 293, 297, 302, 306, 311, 315, 319, 324, + 328, 332, 337, 341, 345, 349, 354, 358, 362, 367, 371, 375, 379, + 384, 388, 392, 396, 401, 409, 417, 425, 433, 441, 449, 458, 466, + 474, 482, 490, 498, 506, 514, 523, 531, 539, 547, 555, 563, 571, + 579, 588, 596, 604, 616, 628, 640, 652, 664, 676, 688, 700, 713, + 725, 737, 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889, + 905, 922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118, + 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, 1411, + 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791, + 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283, + 
2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915, + 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591, 3659, 3731, + 3803, 3876, 3952, 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784, + 4876, 4972, 5068, 5168, 5268, 5372, 5476, 5584, 5692, 5804, 5916, 6032, 6148, + 6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312, +}; + +static const int16_t ac_qlookup_12_QTX[QINDEX_RANGE] = { + 4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99, + 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263, + 280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456, + 475, 493, 511, 530, 548, 567, 586, 604, 623, 642, 660, + 679, 698, 716, 735, 753, 772, 791, 809, 828, 846, 865, + 884, 902, 920, 939, 957, 976, 994, 1012, 1030, 1049, 1067, + 1085, 1103, 1121, 1139, 1157, 1175, 1193, 1211, 1229, 1246, 1264, + 1282, 1299, 1317, 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457, + 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, 1627, 1660, 1693, + 1725, 1758, 1791, 1824, 1856, 1889, 1922, 1954, 1987, 2020, 2052, + 2085, 2118, 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, 2411, + 2459, 2508, 2556, 2605, 2653, 2701, 2750, 2798, 2847, 2895, 2943, + 2992, 3040, 3088, 3137, 3185, 3234, 3298, 3362, 3426, 3491, 3555, + 3619, 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, 4230, 4310, + 4390, 4470, 4550, 4631, 4711, 4791, 4871, 4967, 5064, 5160, 5256, + 5352, 5448, 5544, 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410, + 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, 7579, 7723, 7867, + 8011, 8155, 8315, 8475, 8635, 8795, 8956, 9132, 9308, 9484, 9660, + 9836, 10028, 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885, + 12109, 12333, 12573, 12813, 13053, 13309, 13565, 13821, 14093, 14365, 14637, + 14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726, 18062, + 18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334, + 22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599, + 28143, 28687, 29247, +}; + +// Coefficient scaling and quantization with AV1 TX are tailored to +// the AV1 TX transforms. Regardless of the bit-depth of the input, +// the transform stages scale the coefficient values up by a factor of +// 8 (3 bits) over the scale of the pixel values. Thus, for 8-bit +// input, the coefficients have effectively 11 bits of scale depth +// (8+3), 10-bit input pixels result in 13-bit coefficient depth +// (10+3) and 12-bit pixels yield 15-bit (12+3) coefficient depth. +// All quantizers are built using this invariant of x8, 3-bit scaling, +// thus the Q3 suffix. + +// A partial exception to this rule is large transforms; to avoid +// overflow, TX blocks with > 256 pels (>16x16) are scaled only +// 4-times unity (2 bits) over the pixel depth, and TX blocks with +// over 1024 pixels (>32x32) are scaled up only 2x unity (1 bit). +// This descaling is found via av1_tx_get_scale(). Thus, 16x32, 32x16 +// and 32x32 transforms actually return Q2 coefficients, and 32x64, +// 64x32 and 64x64 transforms return Q1 coefficients. However, the +// quantizers are de-scaled down on-the-fly by the same amount +// (av1_tx_get_scale()) during quantization, and as such the +// dequantized/decoded coefficients, even for large TX blocks, are always +// effectively Q3. Meanwhile, quantized/coded coefficients are Q0 +// because Qn quantizers are applied to Qn tx coefficients. 
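+
+// A minimal sketch of the descaling rule described above. Assumption: this
+// stand-alone helper is illustrative only and is not part of libaom (the
+// comment above names av1_tx_get_scale(), which lives elsewhere in the
+// codebase). Given the pixel count of a TX block, it returns the number of
+// bits by which coefficients fall short of the Q3 invariant:
+//   pels <= 256  -> 0 (Q3: 16x16 and smaller)
+//   pels <= 1024 -> 1 (Q2: e.g. 16x32, 32x16, 32x32)
+//   pels >  1024 -> 2 (Q1: e.g. 32x64, 64x32, 64x64)
+static int illustrative_tx_scale(int pels) {
+  return (pels > 256) + (pels > 1024);
+}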
+
+// Note that encoder decision making (which uses the quantizer to
+// generate several bespoke lambdas for RDO and other heuristics)
+// expects quantizers to be larger for higher-bitdepth input. In
+// addition, the minimum allowable quantizer is 4; smaller values will
+// underflow to 0 in the actual quantization routines.
+
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  const int q_clamped = clamp(qindex + delta, 0, MAXQ);
+  switch (bit_depth) {
+    case AOM_BITS_8: return dc_qlookup_QTX[q_clamped];
+    case AOM_BITS_10: return dc_qlookup_10_QTX[q_clamped];
+    case AOM_BITS_12: return dc_qlookup_12_QTX[q_clamped];
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1;
+  }
+}
+
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  const int q_clamped = clamp(qindex + delta, 0, MAXQ);
+  switch (bit_depth) {
+    case AOM_BITS_8: return ac_qlookup_QTX[q_clamped];
+    case AOM_BITS_10: return ac_qlookup_10_QTX[q_clamped];
+    case AOM_BITS_12: return ac_qlookup_12_QTX[q_clamped];
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1;
+  }
+}
+
+int av1_get_qindex(const struct segmentation *seg, int segment_id,
+                   int base_qindex) {
+  if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
+    const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+    const int seg_qindex = base_qindex + data;
+    return clamp(seg_qindex, 0, MAXQ);
+  } else {
+    return base_qindex;
+  }
+}
+
+bool av1_use_qmatrix(const CommonQuantParams *quant_params,
+                     const struct macroblockd *xd, int segment_id) {
+  // True if explicit Q matrix levels are in use and this is not a lossless
+  // segment.
+  return quant_params->using_qmatrix && !xd->lossless[segment_id];
+}
+
+const qm_val_t *av1_iqmatrix(const CommonQuantParams *quant_params, int qmlevel,
+                             int plane, TX_SIZE tx_size) {
+  assert(quant_params->giqmatrix[qmlevel][plane][tx_size] != NULL ||
+         qmlevel == NUM_QM_LEVELS - 1);
+  return quant_params->giqmatrix[qmlevel][plane][tx_size];
+}
+const qm_val_t *av1_qmatrix(const CommonQuantParams *quant_params, int qmlevel,
+                            int plane, TX_SIZE tx_size) {
+  assert(quant_params->gqmatrix[qmlevel][plane][tx_size] != NULL ||
+         qmlevel == NUM_QM_LEVELS - 1);
+  return quant_params->gqmatrix[qmlevel][plane][tx_size];
+}
+
+// Returns true if the tx_type corresponds to a non-identity transform in both
+// horizontal and vertical directions.
+static INLINE bool is_2d_transform(TX_TYPE tx_type) { return (tx_type < IDTX); }
+
+const qm_val_t *av1_get_iqmatrix(const CommonQuantParams *quant_params,
+                                 const MACROBLOCKD *xd, int plane,
+                                 TX_SIZE tx_size, TX_TYPE tx_type) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int seg_id = mbmi->segment_id;
+  const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
+  // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
+  return is_2d_transform(tx_type)
+             ? pd->seg_iqmatrix[seg_id][qm_tx_size]
+             : quant_params->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+}
+
+const qm_val_t *av1_get_qmatrix(const CommonQuantParams *quant_params,
+                                const MACROBLOCKD *xd, int plane,
+                                TX_SIZE tx_size, TX_TYPE tx_type) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int seg_id = mbmi->segment_id;
+  const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
+  // Use a flat matrix (i.e.
no weighting) for 1D and Identity transforms + return is_2d_transform(tx_type) + ? pd->seg_qmatrix[seg_id][qm_tx_size] + : quant_params->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; +} + +#define QM_TOTAL_SIZE 3344 +// We only use wt_matrix_ref[q] and iwt_matrix_ref[q] +// for q = 0, ..., NUM_QM_LEVELS - 2. +static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; +static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; + +void av1_qm_init(CommonQuantParams *quant_params, int num_planes) { + for (int q = 0; q < NUM_QM_LEVELS; ++q) { + for (int c = 0; c < num_planes; ++c) { + int current = 0; + for (int t = 0; t < TX_SIZES_ALL; ++t) { + const int size = tx_size_2d[t]; + const int qm_tx_size = av1_get_adjusted_tx_size(t); + if (q == NUM_QM_LEVELS - 1) { + quant_params->gqmatrix[q][c][t] = NULL; + quant_params->giqmatrix[q][c][t] = NULL; + } else if (t != qm_tx_size) { // Reuse matrices for 'qm_tx_size' + assert(t > qm_tx_size); + quant_params->gqmatrix[q][c][t] = + quant_params->gqmatrix[q][c][qm_tx_size]; + quant_params->giqmatrix[q][c][t] = + quant_params->giqmatrix[q][c][qm_tx_size]; + } else { + assert(current + size <= QM_TOTAL_SIZE); + quant_params->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current]; + quant_params->giqmatrix[q][c][t] = + &iwt_matrix_ref[q][c >= 1][current]; + current += size; + } + } + } + } +} + +/* Provide 15 sets of quantization matrices for chroma and luma + and each TX size. Matrices for different TX sizes are in fact + sub-sampled from the 32x32 and 16x16 sizes, but explicitly + defined here for convenience. Intra and inter matrix sets are the + same but changing DEFAULT_QM_INTER_OFFSET from zero allows + for different matrices for inter and intra blocks in the same + frame. + Matrices for different QM levels have been rescaled in the + frequency domain according to different nominal viewing + distances. Matrices for QM level 15 are omitted because they are + not used. 
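+
+   As a hedged usage sketch (which qmlevel to use is the caller's concern;
+   av1_use_qmatrix() and av1_qmatrix() are the helpers defined earlier in
+   this file), the tables below are typically reached as:
+
+     if (av1_use_qmatrix(quant_params, xd, segment_id)) {
+       const qm_val_t *wt =
+           av1_qmatrix(quant_params, qmlevel, plane, tx_size);
+       // wt points at the per-coefficient weights stored below for this
+       // plane and (adjusted) transform size.
+     }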
+ */ +static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = { + { + { /* Luma */ + /* Size 4x4 */ + 32, 43, 73, 97, 43, 67, 94, 110, 73, 94, 137, 150, 97, 110, 150, 200, + /* Size 8x8 */ + 32, 32, 38, 51, 68, 84, 95, 109, 32, 35, 40, 49, 63, 76, 89, 102, 38, + 40, 54, 65, 78, 91, 98, 106, 51, 49, 65, 82, 97, 111, 113, 121, 68, 63, + 78, 97, 117, 134, 138, 142, 84, 76, 91, 111, 134, 152, 159, 168, 95, 89, + 98, 113, 138, 159, 183, 199, 109, 102, 106, 121, 142, 168, 199, 220, + /* Size 16x16 */ + 32, 31, 31, 34, 36, 44, 48, 59, 65, 80, 83, 91, 97, 104, 111, 119, 31, + 32, 32, 33, 34, 41, 44, 54, 59, 72, 75, 83, 90, 97, 104, 112, 31, 32, + 33, 35, 36, 42, 45, 54, 59, 71, 74, 81, 86, 93, 100, 107, 34, 33, 35, + 39, 42, 47, 51, 58, 63, 74, 76, 81, 84, 90, 97, 105, 36, 34, 36, 42, 48, + 54, 57, 64, 68, 79, 81, 88, 91, 96, 102, 105, 44, 41, 42, 47, 54, 63, + 67, 75, 79, 90, 92, 95, 100, 102, 109, 112, 48, 44, 45, 51, 57, 67, 71, + 80, 85, 96, 99, 107, 108, 111, 117, 120, 59, 54, 54, 58, 64, 75, 80, 92, + 98, 110, 113, 115, 116, 122, 125, 130, 65, 59, 59, 63, 68, 79, 85, 98, + 105, 118, 121, 127, 130, 134, 135, 140, 80, 72, 71, 74, 79, 90, 96, 110, + 118, 134, 137, 140, 143, 144, 146, 152, 83, 75, 74, 76, 81, 92, 99, 113, + 121, 137, 140, 151, 152, 155, 158, 165, 91, 83, 81, 81, 88, 95, 107, + 115, 127, 140, 151, 159, 166, 169, 173, 179, 97, 90, 86, 84, 91, 100, + 108, 116, 130, 143, 152, 166, 174, 182, 189, 193, 104, 97, 93, 90, 96, + 102, 111, 122, 134, 144, 155, 169, 182, 191, 200, 210, 111, 104, 100, + 97, 102, 109, 117, 125, 135, 146, 158, 173, 189, 200, 210, 220, 119, + 112, 107, 105, 105, 112, 120, 130, 140, 152, 165, 179, 193, 210, 220, + 231, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 32, 34, 35, 36, 39, 44, 46, 48, 54, 59, 62, 65, 71, + 80, 81, 83, 88, 91, 94, 97, 101, 104, 107, 111, 115, 119, 123, 31, 32, + 32, 32, 32, 32, 34, 34, 35, 38, 42, 44, 46, 51, 56, 59, 62, 68, 76, 77, + 78, 84, 86, 89, 92, 95, 99, 102, 105, 109, 113, 116, 31, 32, 32, 32, 32, + 32, 33, 34, 34, 37, 41, 42, 44, 49, 54, 56, 59, 65, 72, 73, 75, 80, 83, + 86, 90, 93, 97, 101, 104, 108, 112, 116, 31, 32, 32, 32, 33, 33, 34, 35, + 35, 38, 41, 43, 45, 49, 54, 56, 59, 64, 72, 73, 74, 79, 82, 85, 88, 91, + 94, 97, 101, 104, 107, 111, 31, 32, 32, 33, 33, 34, 35, 36, 36, 39, 42, + 44, 45, 50, 54, 56, 59, 64, 71, 72, 74, 78, 81, 84, 86, 89, 93, 96, 100, + 104, 107, 111, 32, 32, 32, 33, 34, 35, 37, 37, 38, 40, 42, 44, 46, 49, + 53, 55, 58, 63, 69, 70, 72, 76, 79, 82, 85, 89, 93, 96, 99, 102, 106, + 109, 34, 34, 33, 34, 35, 37, 39, 41, 42, 45, 47, 49, 51, 54, 58, 60, 63, + 68, 74, 75, 76, 80, 81, 82, 84, 87, 90, 93, 97, 101, 105, 110, 35, 34, + 34, 35, 36, 37, 41, 43, 45, 47, 50, 52, 53, 57, 61, 63, 65, 70, 76, 77, + 79, 82, 84, 86, 89, 91, 92, 93, 96, 100, 103, 107, 36, 35, 34, 35, 36, + 38, 42, 45, 48, 50, 54, 55, 57, 60, 64, 66, 68, 73, 79, 80, 81, 85, 88, + 90, 91, 93, 96, 99, 102, 103, 105, 107, 39, 38, 37, 38, 39, 40, 45, 47, + 50, 54, 58, 59, 61, 65, 69, 71, 73, 78, 84, 85, 86, 91, 92, 92, 95, 98, + 100, 101, 103, 106, 110, 114, 44, 42, 41, 41, 42, 42, 47, 50, 54, 58, + 63, 65, 67, 71, 75, 77, 79, 84, 90, 91, 92, 95, 95, 97, 100, 101, 102, + 105, 109, 111, 112, 114, 46, 44, 42, 43, 44, 44, 49, 52, 55, 59, 65, 67, + 69, 74, 78, 80, 82, 87, 93, 94, 95, 98, 100, 103, 102, 105, 108, 110, + 111, 113, 117, 121, 48, 46, 44, 45, 45, 46, 51, 53, 57, 61, 67, 69, 71, + 76, 80, 83, 85, 90, 96, 97, 99, 103, 107, 105, 108, 111, 111, 113, 117, + 119, 120, 122, 54, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 74, 76, 
82, + 87, 89, 92, 97, 104, 105, 106, 111, 110, 111, 114, 113, 116, 120, 120, + 121, 125, 130, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, + 92, 95, 98, 103, 110, 111, 113, 115, 115, 119, 116, 120, 122, 122, 125, + 129, 130, 130, 62, 59, 56, 56, 56, 55, 60, 63, 66, 71, 77, 80, 83, 89, + 95, 98, 101, 107, 114, 115, 117, 119, 123, 121, 125, 126, 125, 129, 131, + 131, 135, 140, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, + 98, 101, 105, 111, 118, 119, 121, 126, 127, 128, 130, 130, 134, 133, + 135, 140, 140, 140, 71, 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90, + 97, 103, 107, 111, 117, 125, 126, 128, 134, 132, 136, 133, 138, 137, + 140, 143, 142, 145, 150, 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90, 93, + 96, 104, 110, 114, 118, 125, 134, 135, 137, 139, 140, 139, 143, 142, + 144, 146, 146, 151, 152, 151, 81, 77, 73, 73, 72, 70, 75, 77, 80, 85, + 91, 94, 97, 105, 111, 115, 119, 126, 135, 137, 138, 144, 147, 146, 148, + 149, 151, 150, 156, 155, 157, 163, 83, 78, 75, 74, 74, 72, 76, 79, 81, + 86, 92, 95, 99, 106, 113, 117, 121, 128, 137, 138, 140, 147, 151, 156, + 152, 157, 155, 161, 158, 162, 165, 164, 88, 84, 80, 79, 78, 76, 80, 82, + 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147, 152, 154, + 158, 163, 159, 165, 163, 168, 168, 169, 176, 91, 86, 83, 82, 81, 79, 81, + 84, 88, 92, 95, 100, 107, 110, 115, 123, 127, 132, 140, 147, 151, 154, + 159, 161, 166, 171, 169, 173, 173, 176, 179, 177, 94, 89, 86, 85, 84, + 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136, 139, 146, + 156, 158, 161, 166, 168, 174, 179, 178, 180, 183, 183, 190, 97, 92, 90, + 88, 86, 85, 84, 89, 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143, + 148, 152, 163, 166, 168, 174, 176, 182, 187, 189, 188, 193, 191, 101, + 95, 93, 91, 89, 89, 87, 91, 93, 98, 101, 105, 111, 113, 120, 126, 130, + 138, 142, 149, 157, 159, 171, 174, 176, 183, 184, 191, 195, 199, 197, + 204, 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111, 116, 122, + 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191, 193, 200, + 204, 210, 206, 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, + 113, 120, 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, + 193, 200, 202, 210, 214, 222, 111, 105, 104, 101, 100, 99, 97, 96, 102, + 103, 109, 111, 117, 120, 125, 131, 135, 143, 146, 156, 158, 168, 173, + 180, 189, 195, 200, 202, 210, 212, 220, 224, 115, 109, 108, 104, 104, + 102, 101, 100, 103, 106, 111, 113, 119, 121, 129, 131, 140, 142, 151, + 155, 162, 168, 176, 183, 188, 199, 204, 210, 212, 220, 222, 230, 119, + 113, 112, 107, 107, 106, 105, 103, 105, 110, 112, 117, 120, 125, 130, + 135, 140, 145, 152, 157, 165, 169, 179, 183, 193, 197, 210, 214, 220, + 222, 231, 232, 123, 116, 116, 111, 111, 109, 110, 107, 107, 114, 114, + 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176, 177, 190, 191, + 204, 206, 222, 224, 230, 232, 242, + /* Size 4x8 */ + 32, 42, 75, 91, 33, 42, 69, 86, 37, 58, 84, 91, 49, 71, 103, 110, 65, + 84, 125, 128, 80, 97, 142, 152, 91, 100, 145, 178, 104, 112, 146, 190, + /* Size 8x4 */ + 32, 33, 37, 49, 65, 80, 91, 104, 42, 42, 58, 71, 84, 97, 100, 112, 75, + 69, 84, 103, 125, 142, 145, 146, 91, 86, 91, 110, 128, 152, 178, 190, + /* Size 8x16 */ + 32, 32, 36, 53, 65, 87, 93, 99, 31, 33, 34, 49, 59, 78, 86, 93, 32, 34, + 36, 50, 59, 77, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88, 36, 38, 48, 60, + 68, 84, 86, 90, 44, 43, 53, 71, 79, 95, 94, 97, 48, 46, 56, 76, 85, 102, + 105, 105, 58, 54, 63, 87, 98, 116, 112, 115, 65, 58, 68, 92, 105, 124, + 122, 124, 79, 70, 79, 104, 118, 
141, 135, 135, 82, 72, 81, 106, 121, + 144, 149, 146, 91, 80, 88, 106, 130, 148, 162, 159, 97, 86, 94, 107, + 128, 157, 167, 171, 103, 93, 98, 114, 131, 150, 174, 186, 110, 100, 101, + 117, 138, 161, 183, 193, 118, 107, 105, 118, 136, 157, 182, 203, + /* Size 16x8 */ + 32, 31, 32, 34, 36, 44, 48, 58, 65, 79, 82, 91, 97, 103, 110, 118, 32, + 33, 34, 37, 38, 43, 46, 54, 58, 70, 72, 80, 86, 93, 100, 107, 36, 34, + 36, 42, 48, 53, 56, 63, 68, 79, 81, 88, 94, 98, 101, 105, 53, 49, 50, + 54, 60, 71, 76, 87, 92, 104, 106, 106, 107, 114, 117, 118, 65, 59, 59, + 63, 68, 79, 85, 98, 105, 118, 121, 130, 128, 131, 138, 136, 87, 78, 77, + 79, 84, 95, 102, 116, 124, 141, 144, 148, 157, 150, 161, 157, 93, 86, + 82, 80, 86, 94, 105, 112, 122, 135, 149, 162, 167, 174, 183, 182, 99, + 93, 89, 88, 90, 97, 105, 115, 124, 135, 146, 159, 171, 186, 193, 203, + /* Size 16x32 */ + 32, 31, 32, 34, 36, 44, 53, 59, 65, 79, 87, 90, 93, 96, 99, 102, 31, 32, + 32, 34, 35, 42, 51, 56, 62, 75, 82, 85, 88, 91, 94, 97, 31, 32, 33, 33, + 34, 41, 49, 54, 59, 72, 78, 82, 86, 90, 93, 97, 31, 32, 33, 34, 35, 41, + 49, 54, 59, 71, 78, 81, 84, 87, 90, 93, 32, 32, 34, 35, 36, 42, 50, 54, + 59, 71, 77, 80, 82, 86, 89, 93, 32, 33, 35, 37, 38, 42, 49, 53, 58, 69, + 75, 78, 82, 86, 89, 92, 34, 34, 37, 39, 42, 48, 54, 58, 63, 73, 79, 78, + 80, 83, 88, 92, 35, 34, 37, 41, 45, 50, 57, 61, 65, 76, 82, 83, 84, 84, + 87, 90, 36, 34, 38, 43, 48, 54, 60, 64, 68, 78, 84, 87, 86, 89, 90, 90, + 39, 37, 40, 45, 50, 58, 65, 69, 73, 84, 89, 89, 91, 91, 93, 96, 44, 41, + 43, 48, 53, 63, 71, 75, 79, 90, 95, 93, 94, 95, 97, 97, 46, 43, 44, 49, + 55, 65, 73, 78, 82, 93, 98, 100, 98, 100, 99, 103, 48, 45, 46, 51, 56, + 67, 76, 80, 85, 96, 102, 102, 105, 102, 105, 104, 53, 49, 50, 54, 60, + 71, 82, 87, 92, 103, 109, 107, 107, 110, 107, 111, 58, 54, 54, 58, 63, + 75, 87, 92, 98, 110, 116, 115, 112, 111, 115, 112, 61, 57, 56, 60, 66, + 77, 89, 95, 101, 114, 120, 118, 119, 118, 116, 120, 65, 60, 58, 63, 68, + 79, 92, 98, 105, 118, 124, 123, 122, 123, 124, 121, 71, 65, 63, 68, 73, + 84, 97, 103, 111, 125, 132, 132, 130, 128, 127, 130, 79, 72, 70, 74, 79, + 90, 104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81, 74, 71, 75, + 80, 91, 105, 112, 119, 135, 142, 140, 140, 138, 139, 142, 82, 75, 72, + 76, 81, 92, 106, 113, 121, 136, 144, 151, 149, 149, 146, 143, 88, 80, + 77, 80, 85, 97, 108, 115, 126, 142, 149, 153, 153, 152, 152, 154, 91, + 83, 80, 81, 88, 100, 106, 114, 130, 142, 148, 155, 162, 160, 159, 155, + 94, 85, 83, 82, 91, 100, 105, 118, 131, 137, 153, 160, 165, 167, 166, + 168, 97, 88, 86, 85, 94, 100, 107, 123, 128, 140, 157, 161, 167, 173, + 171, 169, 100, 91, 89, 87, 97, 100, 111, 121, 127, 145, 152, 164, 173, + 178, 182, 181, 103, 94, 93, 90, 98, 101, 114, 120, 131, 144, 150, 170, + 174, 180, 186, 183, 107, 97, 96, 93, 100, 104, 117, 119, 136, 142, 155, + 168, 177, 187, 191, 198, 110, 101, 100, 97, 101, 108, 117, 123, 138, + 141, 161, 165, 183, 188, 193, 200, 114, 104, 104, 100, 103, 112, 117, + 127, 137, 146, 159, 167, 185, 190, 201, 206, 118, 108, 107, 103, 105, + 115, 118, 131, 136, 151, 157, 172, 182, 197, 203, 208, 122, 111, 111, + 107, 107, 119, 119, 136, 136, 156, 156, 178, 179, 203, 204, 217, + /* Size 32x16 */ + 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, + 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 31, 32, + 32, 32, 32, 33, 34, 34, 34, 37, 41, 43, 45, 49, 54, 57, 60, 65, 72, 74, + 75, 80, 83, 85, 88, 91, 94, 97, 101, 104, 108, 111, 32, 32, 33, 33, 34, + 35, 37, 37, 38, 40, 43, 44, 46, 50, 
54, 56, 58, 63, 70, 71, 72, 77, 80, + 83, 86, 89, 93, 96, 100, 104, 107, 111, 34, 34, 33, 34, 35, 37, 39, 41, + 43, 45, 48, 49, 51, 54, 58, 60, 63, 68, 74, 75, 76, 80, 81, 82, 85, 87, + 90, 93, 97, 100, 103, 107, 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53, + 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100, + 101, 103, 105, 107, 44, 42, 41, 41, 42, 42, 48, 50, 54, 58, 63, 65, 67, + 71, 75, 77, 79, 84, 90, 91, 92, 97, 100, 100, 100, 100, 101, 104, 108, + 112, 115, 119, 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, + 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, + 117, 118, 119, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, + 92, 95, 98, 103, 110, 112, 113, 115, 114, 118, 123, 121, 120, 119, 123, + 127, 131, 136, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, + 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136, + 138, 137, 136, 136, 79, 75, 72, 71, 71, 69, 73, 76, 78, 84, 90, 93, 96, + 103, 110, 114, 118, 125, 133, 135, 136, 142, 142, 137, 140, 145, 144, + 142, 141, 146, 151, 156, 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98, + 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152, + 150, 155, 161, 159, 157, 156, 90, 85, 82, 81, 80, 78, 78, 83, 87, 89, + 93, 100, 102, 107, 115, 118, 123, 132, 136, 140, 151, 153, 155, 160, + 161, 164, 170, 168, 165, 167, 172, 178, 93, 88, 86, 84, 82, 82, 80, 84, + 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162, + 165, 167, 173, 174, 177, 183, 185, 182, 179, 96, 91, 90, 87, 86, 86, 83, + 84, 89, 91, 95, 100, 102, 110, 111, 118, 123, 128, 135, 138, 149, 152, + 160, 167, 173, 178, 180, 187, 188, 190, 197, 203, 99, 94, 93, 90, 89, + 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, + 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, 102, 97, 97, 93, + 93, 92, 92, 90, 90, 96, 97, 103, 104, 111, 112, 120, 121, 130, 131, 142, + 143, 154, 155, 168, 169, 181, 183, 198, 200, 206, 208, 217, + /* Size 4x16 */ + 31, 44, 79, 96, 32, 41, 72, 90, 32, 42, 71, 86, 34, 48, 73, 83, 34, 54, + 78, 89, 41, 63, 90, 95, 45, 67, 96, 102, 54, 75, 110, 111, 60, 79, 118, + 123, 72, 90, 133, 135, 75, 92, 136, 149, 83, 100, 142, 160, 88, 100, + 140, 173, 94, 101, 144, 180, 101, 108, 141, 188, 108, 115, 151, 197, + /* Size 16x4 */ + 31, 32, 32, 34, 34, 41, 45, 54, 60, 72, 75, 83, 88, 94, 101, 108, 44, + 41, 42, 48, 54, 63, 67, 75, 79, 90, 92, 100, 100, 101, 108, 115, 79, 72, + 71, 73, 78, 90, 96, 110, 118, 133, 136, 142, 140, 144, 141, 151, 96, 90, + 86, 83, 89, 95, 102, 111, 123, 135, 149, 160, 173, 180, 188, 197, + /* Size 8x32 */ + 32, 32, 36, 53, 65, 87, 93, 99, 31, 32, 35, 51, 62, 82, 88, 94, 31, 33, + 34, 49, 59, 78, 86, 93, 31, 33, 35, 49, 59, 78, 84, 90, 32, 34, 36, 50, + 59, 77, 82, 89, 32, 35, 38, 49, 58, 75, 82, 89, 34, 37, 42, 54, 63, 79, + 80, 88, 35, 37, 45, 57, 65, 82, 84, 87, 36, 38, 48, 60, 68, 84, 86, 90, + 39, 40, 50, 65, 73, 89, 91, 93, 44, 43, 53, 71, 79, 95, 94, 97, 46, 44, + 55, 73, 82, 98, 98, 99, 48, 46, 56, 76, 85, 102, 105, 105, 53, 50, 60, + 82, 92, 109, 107, 107, 58, 54, 63, 87, 98, 116, 112, 115, 61, 56, 66, + 89, 101, 120, 119, 116, 65, 58, 68, 92, 105, 124, 122, 124, 71, 63, 73, + 97, 111, 132, 130, 127, 79, 70, 79, 104, 118, 141, 135, 135, 81, 71, 80, + 105, 119, 142, 140, 139, 82, 72, 81, 106, 121, 144, 149, 146, 88, 77, + 85, 108, 126, 149, 153, 152, 91, 80, 88, 106, 130, 148, 162, 159, 94, + 83, 91, 105, 131, 153, 165, 166, 97, 86, 94, 107, 128, 157, 167, 171, + 100, 89, 97, 111, 127, 152, 173, 182, 
103, 93, 98, 114, 131, 150, 174, + 186, 107, 96, 100, 117, 136, 155, 177, 191, 110, 100, 101, 117, 138, + 161, 183, 193, 114, 104, 103, 117, 137, 159, 185, 201, 118, 107, 105, + 118, 136, 157, 182, 203, 122, 111, 107, 119, 136, 156, 179, 204, + /* Size 32x8 */ + 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, + 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 32, 32, + 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, + 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, 36, 35, 34, 35, 36, + 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, + 91, 94, 97, 98, 100, 101, 103, 105, 107, 53, 51, 49, 49, 50, 49, 54, 57, + 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, + 107, 111, 114, 117, 117, 117, 118, 119, 65, 62, 59, 59, 59, 58, 63, 65, + 68, 73, 79, 82, 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, + 128, 127, 131, 136, 138, 137, 136, 136, 87, 82, 78, 78, 77, 75, 79, 82, + 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, + 153, 157, 152, 150, 155, 161, 159, 157, 156, 93, 88, 86, 84, 82, 82, 80, + 84, 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, + 162, 165, 167, 173, 174, 177, 183, 185, 182, 179, 99, 94, 93, 90, 89, + 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, + 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204 }, + { /* Chroma */ + /* Size 4x4 */ + 35, 46, 57, 66, 46, 60, 69, 71, 57, 69, 90, 90, 66, 71, 90, 109, + /* Size 8x8 */ + 31, 38, 47, 50, 57, 63, 67, 71, 38, 47, 46, 47, 52, 57, 62, 67, 47, 46, + 54, 57, 61, 66, 67, 68, 50, 47, 57, 66, 72, 77, 75, 75, 57, 52, 61, 72, + 82, 88, 86, 84, 63, 57, 66, 77, 88, 96, 95, 95, 67, 62, 67, 75, 86, 95, + 104, 107, 71, 67, 68, 75, 84, 95, 107, 113, + /* Size 16x16 */ + 32, 30, 33, 41, 49, 49, 50, 54, 57, 63, 65, 68, 70, 72, 74, 76, 30, 32, + 35, 42, 46, 45, 46, 49, 52, 57, 58, 62, 64, 67, 70, 72, 33, 35, 39, 45, + 47, 45, 46, 49, 51, 56, 57, 60, 62, 64, 66, 69, 41, 42, 45, 48, 50, 49, + 50, 52, 53, 57, 58, 59, 60, 61, 64, 67, 49, 46, 47, 50, 53, 53, 54, 55, + 56, 60, 61, 64, 64, 65, 66, 66, 49, 45, 45, 49, 53, 58, 60, 62, 63, 67, + 68, 67, 69, 68, 70, 70, 50, 46, 46, 50, 54, 60, 61, 65, 67, 71, 71, 74, + 73, 73, 74, 74, 54, 49, 49, 52, 55, 62, 65, 71, 73, 78, 79, 78, 77, 78, + 78, 78, 57, 52, 51, 53, 56, 63, 67, 73, 76, 82, 83, 84, 84, 84, 82, 83, + 63, 57, 56, 57, 60, 67, 71, 78, 82, 89, 90, 90, 89, 88, 87, 88, 65, 58, + 57, 58, 61, 68, 71, 79, 83, 90, 91, 94, 93, 93, 92, 93, 68, 62, 60, 59, + 64, 67, 74, 78, 84, 90, 94, 98, 99, 98, 98, 98, 70, 64, 62, 60, 64, 69, + 73, 77, 84, 89, 93, 99, 102, 103, 104, 104, 72, 67, 64, 61, 65, 68, 73, + 78, 84, 88, 93, 98, 103, 106, 108, 109, 74, 70, 66, 64, 66, 70, 74, 78, + 82, 87, 92, 98, 104, 108, 111, 112, 76, 72, 69, 67, 66, 70, 74, 78, 83, + 88, 93, 98, 104, 109, 112, 116, + /* Size 32x32 */ + 32, 31, 30, 32, 33, 36, 41, 45, 49, 48, 49, 50, 50, 52, 54, 56, 57, 60, + 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 31, 31, 31, 33, + 34, 38, 42, 45, 47, 47, 47, 47, 48, 50, 52, 53, 54, 57, 60, 61, 61, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 30, 31, 32, 33, 35, 40, 42, 44, + 46, 45, 45, 45, 46, 47, 49, 51, 52, 54, 57, 58, 58, 61, 62, 63, 64, 66, + 67, 68, 70, 71, 72, 74, 32, 33, 33, 35, 37, 41, 43, 45, 47, 46, 45, 46, + 46, 47, 49, 50, 51, 54, 57, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, + 69, 70, 33, 34, 35, 37, 39, 43, 45, 46, 47, 46, 45, 46, 46, 47, 49, 50, + 51, 53, 56, 57, 57, 59, 60, 61, 62, 63, 
64, 65, 66, 68, 69, 70, 36, 38, + 40, 41, 43, 47, 47, 47, 48, 46, 45, 46, 46, 47, 48, 49, 50, 52, 54, 55, + 55, 57, 58, 59, 61, 62, 64, 65, 66, 67, 68, 69, 41, 42, 42, 43, 45, 47, + 48, 49, 50, 49, 49, 49, 50, 50, 52, 52, 53, 55, 57, 58, 58, 60, 59, 59, + 60, 61, 61, 63, 64, 66, 67, 69, 45, 45, 44, 45, 46, 47, 49, 50, 51, 51, + 51, 51, 52, 52, 53, 54, 55, 57, 59, 59, 60, 61, 61, 62, 63, 63, 63, 63, + 63, 64, 65, 66, 49, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 54, 54, 54, + 55, 56, 56, 58, 60, 61, 61, 63, 64, 64, 64, 64, 65, 66, 66, 66, 66, 66, + 48, 47, 45, 46, 46, 46, 49, 51, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, + 63, 64, 64, 66, 66, 65, 66, 67, 67, 67, 67, 68, 69, 70, 49, 47, 45, 45, + 45, 45, 49, 51, 53, 55, 58, 59, 60, 61, 62, 63, 63, 65, 67, 67, 68, 69, + 67, 68, 69, 68, 68, 69, 70, 70, 70, 70, 50, 47, 45, 46, 46, 46, 49, 51, + 54, 56, 59, 60, 60, 62, 64, 64, 65, 67, 69, 69, 70, 70, 71, 71, 70, 70, + 71, 71, 71, 71, 72, 74, 50, 48, 46, 46, 46, 46, 50, 52, 54, 56, 60, 60, + 61, 63, 65, 66, 67, 68, 71, 71, 71, 73, 74, 72, 73, 74, 73, 73, 74, 74, + 74, 74, 52, 50, 47, 47, 47, 47, 50, 52, 54, 57, 61, 62, 63, 66, 68, 69, + 70, 72, 75, 75, 75, 77, 75, 75, 76, 75, 75, 76, 75, 75, 76, 77, 54, 52, + 49, 49, 49, 48, 52, 53, 55, 58, 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, + 79, 79, 78, 79, 77, 78, 78, 77, 78, 79, 78, 78, 56, 53, 51, 50, 50, 49, + 52, 54, 56, 59, 63, 64, 66, 69, 72, 73, 75, 77, 80, 80, 81, 81, 82, 80, + 81, 81, 79, 81, 80, 79, 81, 82, 57, 54, 52, 51, 51, 50, 53, 55, 56, 60, + 63, 65, 67, 70, 73, 75, 76, 79, 82, 82, 83, 85, 84, 83, 84, 83, 84, 82, + 82, 84, 83, 82, 60, 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, + 75, 77, 79, 82, 85, 85, 86, 88, 86, 87, 85, 86, 85, 85, 86, 84, 85, 86, + 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67, 69, 71, 75, 78, 80, 82, 85, + 89, 89, 90, 90, 90, 89, 89, 88, 88, 88, 87, 88, 88, 87, 64, 61, 58, 57, + 57, 55, 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90, 91, 92, + 93, 92, 92, 91, 91, 90, 91, 90, 90, 92, 65, 61, 58, 58, 57, 55, 58, 60, + 61, 64, 68, 70, 71, 75, 79, 81, 83, 86, 90, 91, 91, 94, 94, 96, 93, 94, + 93, 94, 92, 93, 93, 92, 67, 63, 61, 60, 59, 57, 60, 61, 63, 66, 69, 70, + 73, 77, 79, 81, 85, 88, 90, 92, 94, 96, 96, 97, 98, 95, 97, 95, 96, 95, + 95, 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74, 75, 78, 82, + 84, 86, 90, 93, 94, 96, 98, 98, 99, 100, 98, 99, 98, 98, 98, 97, 69, 65, + 63, 62, 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87, 89, 92, + 96, 97, 98, 100, 100, 101, 102, 101, 101, 101, 100, 102, 70, 66, 64, 63, + 62, 61, 60, 63, 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98, + 99, 100, 102, 102, 103, 104, 104, 103, 104, 102, 71, 67, 66, 64, 63, 62, + 61, 63, 64, 67, 68, 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, + 101, 102, 104, 104, 105, 106, 107, 105, 107, 72, 68, 67, 65, 64, 64, 61, + 63, 65, 67, 68, 71, 73, 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102, + 103, 104, 106, 106, 108, 108, 109, 107, 73, 69, 68, 66, 65, 65, 63, 63, + 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104, + 105, 106, 109, 108, 110, 111, 112, 74, 70, 70, 67, 66, 66, 64, 63, 66, + 67, 70, 71, 74, 75, 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106, + 108, 108, 111, 111, 112, 113, 75, 71, 71, 68, 68, 67, 66, 64, 66, 68, + 70, 71, 74, 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108, + 110, 111, 113, 113, 115, 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70, 72, + 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104, 105, 109, 111, + 112, 113, 116, 115, 78, 74, 74, 70, 70, 69, 69, 66, 
66, 70, 70, 74, 74, + 77, 78, 82, 82, 86, 87, 92, 92, 96, 97, 102, 102, 107, 107, 112, 113, + 115, 115, 118, + /* Size 4x8 */ + 31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64, 48, 61, 75, 73, 54, 65, + 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105, + /* Size 8x4 */ + 31, 40, 46, 48, 54, 61, 64, 68, 47, 45, 56, 61, 65, 69, 68, 71, 60, 54, + 64, 75, 85, 92, 90, 87, 66, 61, 64, 73, 82, 92, 102, 105, + /* Size 8x16 */ + 32, 37, 48, 52, 57, 66, 68, 71, 30, 40, 46, 48, 52, 60, 63, 66, 33, 43, + 47, 47, 51, 59, 60, 63, 42, 47, 50, 50, 53, 60, 59, 62, 49, 48, 53, 54, + 57, 62, 62, 62, 49, 46, 53, 61, 64, 69, 66, 66, 50, 46, 54, 64, 67, 73, + 72, 70, 54, 49, 55, 68, 73, 80, 76, 75, 57, 50, 56, 70, 76, 84, 80, 79, + 63, 55, 60, 75, 82, 92, 87, 84, 64, 56, 61, 75, 83, 93, 93, 89, 68, 59, + 64, 74, 86, 94, 98, 94, 70, 62, 66, 73, 83, 96, 99, 98, 72, 64, 66, 75, + 83, 92, 101, 104, 74, 67, 66, 74, 84, 94, 103, 106, 76, 69, 67, 73, 82, + 91, 101, 109, + /* Size 16x8 */ + 32, 30, 33, 42, 49, 49, 50, 54, 57, 63, 64, 68, 70, 72, 74, 76, 37, 40, + 43, 47, 48, 46, 46, 49, 50, 55, 56, 59, 62, 64, 67, 69, 48, 46, 47, 50, + 53, 53, 54, 55, 56, 60, 61, 64, 66, 66, 66, 67, 52, 48, 47, 50, 54, 61, + 64, 68, 70, 75, 75, 74, 73, 75, 74, 73, 57, 52, 51, 53, 57, 64, 67, 73, + 76, 82, 83, 86, 83, 83, 84, 82, 66, 60, 59, 60, 62, 69, 73, 80, 84, 92, + 93, 94, 96, 92, 94, 91, 68, 63, 60, 59, 62, 66, 72, 76, 80, 87, 93, 98, + 99, 101, 103, 101, 71, 66, 63, 62, 62, 66, 70, 75, 79, 84, 89, 94, 98, + 104, 106, 109, + /* Size 16x32 */ + 32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67, 68, 69, 71, 72, 31, 31, + 38, 42, 47, 47, 50, 52, 54, 60, 63, 64, 65, 66, 67, 68, 30, 32, 40, 42, + 46, 45, 48, 50, 52, 57, 60, 62, 63, 65, 66, 68, 32, 34, 41, 44, 46, 45, + 48, 49, 51, 57, 59, 61, 62, 63, 64, 65, 33, 36, 43, 45, 47, 46, 47, 49, + 51, 56, 59, 60, 60, 62, 63, 65, 37, 40, 47, 47, 47, 45, 47, 48, 50, 54, + 57, 58, 60, 61, 62, 63, 42, 43, 47, 48, 50, 49, 50, 52, 53, 57, 60, 58, + 59, 60, 62, 63, 45, 44, 47, 49, 51, 51, 52, 54, 55, 59, 61, 61, 61, 60, + 61, 61, 49, 46, 48, 50, 53, 53, 54, 55, 57, 60, 62, 63, 62, 63, 62, 62, + 48, 46, 47, 50, 53, 56, 57, 59, 60, 64, 66, 65, 65, 64, 64, 65, 49, 45, + 46, 49, 53, 58, 61, 62, 64, 67, 69, 67, 66, 66, 66, 65, 49, 46, 46, 49, + 53, 59, 62, 64, 65, 69, 71, 70, 68, 68, 67, 68, 50, 46, 46, 50, 54, 59, + 64, 65, 67, 71, 73, 72, 72, 70, 70, 69, 52, 48, 47, 50, 54, 61, 66, 68, + 71, 75, 77, 74, 73, 73, 71, 72, 54, 50, 49, 52, 55, 62, 68, 71, 73, 78, + 80, 78, 76, 74, 75, 73, 55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80, + 79, 78, 76, 77, 57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82, 80, 80, + 79, 77, 60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86, 84, 82, 81, 81, + 63, 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88, 87, 85, 84, 81, 64, 58, + 55, 58, 61, 68, 75, 78, 82, 89, 92, 90, 89, 87, 86, 86, 64, 59, 56, 58, + 61, 68, 75, 79, 83, 90, 93, 95, 93, 91, 89, 87, 67, 61, 58, 60, 63, 69, + 76, 79, 85, 92, 95, 96, 94, 92, 91, 91, 68, 62, 59, 60, 64, 71, 74, 78, + 86, 91, 94, 96, 98, 96, 94, 91, 69, 62, 60, 60, 65, 70, 72, 79, 85, 88, + 95, 98, 99, 98, 97, 96, 70, 63, 62, 60, 66, 69, 73, 81, 83, 89, 96, 97, + 99, 101, 98, 97, 71, 64, 63, 61, 67, 68, 74, 79, 82, 90, 93, 98, 102, + 102, 102, 101, 72, 65, 64, 62, 66, 68, 75, 78, 83, 89, 92, 100, 101, + 103, 104, 102, 73, 66, 65, 63, 66, 69, 75, 76, 84, 87, 93, 98, 102, 105, + 106, 107, 74, 67, 67, 64, 66, 70, 74, 77, 84, 86, 94, 96, 103, 105, 106, + 107, 75, 68, 68, 65, 66, 71, 74, 78, 83, 87, 93, 96, 103, 105, 109, 109, + 76, 69, 69, 66, 67, 
72, 73, 80, 82, 88, 91, 97, 101, 107, 109, 110, 77, + 70, 70, 67, 67, 73, 73, 81, 81, 90, 90, 99, 99, 108, 108, 113, + /* Size 32x16 */ + 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, + 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 31, 31, 32, 34, + 36, 40, 43, 44, 46, 46, 45, 46, 46, 48, 50, 51, 52, 54, 57, 58, 59, 61, + 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 37, 38, 40, 41, 43, 47, 47, 47, + 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63, + 64, 65, 67, 68, 69, 70, 42, 42, 42, 44, 45, 47, 48, 49, 50, 50, 49, 49, + 50, 50, 52, 52, 53, 55, 58, 58, 58, 60, 60, 60, 60, 61, 62, 63, 64, 65, + 66, 67, 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56, + 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67, 49, 47, + 45, 45, 46, 45, 49, 51, 53, 56, 58, 59, 59, 61, 62, 63, 64, 65, 67, 68, + 68, 69, 71, 70, 69, 68, 68, 69, 70, 71, 72, 73, 52, 50, 48, 48, 47, 47, + 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, + 73, 74, 75, 75, 74, 74, 73, 73, 54, 52, 50, 49, 49, 48, 52, 54, 55, 59, + 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 81, 79, 78, 76, + 77, 78, 80, 81, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, + 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, + 63, 60, 57, 57, 56, 54, 57, 59, 60, 64, 67, 69, 71, 75, 78, 80, 82, 85, + 89, 89, 90, 92, 91, 88, 89, 90, 89, 87, 86, 87, 88, 90, 66, 63, 60, 59, + 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95, + 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 67, 64, 62, 61, 60, 58, 58, 61, + 63, 65, 67, 70, 72, 74, 78, 80, 82, 86, 88, 90, 95, 96, 96, 98, 97, 98, + 100, 98, 96, 96, 97, 99, 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68, + 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103, + 103, 101, 99, 69, 66, 65, 63, 62, 61, 60, 60, 63, 64, 66, 68, 70, 73, + 74, 78, 80, 82, 85, 87, 91, 92, 96, 98, 101, 102, 103, 105, 105, 105, + 107, 108, 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75, + 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109, + 108, 72, 68, 68, 65, 65, 63, 63, 61, 62, 65, 65, 68, 69, 72, 73, 77, 77, + 81, 81, 86, 87, 91, 91, 96, 97, 101, 102, 107, 107, 109, 110, 113, + /* Size 4x16 */ + 31, 49, 63, 69, 32, 45, 57, 65, 36, 46, 56, 62, 43, 49, 57, 60, 46, 53, + 60, 63, 45, 58, 67, 66, 46, 59, 71, 70, 50, 62, 78, 74, 52, 64, 82, 80, + 57, 67, 89, 85, 59, 68, 90, 91, 62, 71, 91, 96, 63, 69, 89, 101, 65, 68, + 89, 103, 67, 70, 86, 105, 69, 72, 88, 107, + /* Size 16x4 */ + 31, 32, 36, 43, 46, 45, 46, 50, 52, 57, 59, 62, 63, 65, 67, 69, 49, 45, + 46, 49, 53, 58, 59, 62, 64, 67, 68, 71, 69, 68, 70, 72, 63, 57, 56, 57, + 60, 67, 71, 78, 82, 89, 90, 91, 89, 89, 86, 88, 69, 65, 62, 60, 63, 66, + 70, 74, 80, 85, 91, 96, 101, 103, 105, 107, + /* Size 8x32 */ + 32, 37, 48, 52, 57, 66, 68, 71, 31, 38, 47, 50, 54, 63, 65, 67, 30, 40, + 46, 48, 52, 60, 63, 66, 32, 41, 46, 48, 51, 59, 62, 64, 33, 43, 47, 47, + 51, 59, 60, 63, 37, 47, 47, 47, 50, 57, 60, 62, 42, 47, 50, 50, 53, 60, + 59, 62, 45, 47, 51, 52, 55, 61, 61, 61, 49, 48, 53, 54, 57, 62, 62, 62, + 48, 47, 53, 57, 60, 66, 65, 64, 49, 46, 53, 61, 64, 69, 66, 66, 49, 46, + 53, 62, 65, 71, 68, 67, 50, 46, 54, 64, 67, 73, 72, 70, 52, 47, 54, 66, + 71, 77, 73, 71, 54, 49, 55, 68, 73, 80, 76, 75, 55, 49, 56, 69, 75, 82, + 79, 76, 57, 50, 56, 70, 76, 84, 80, 79, 60, 52, 58, 72, 79, 88, 84, 81, + 63, 55, 60, 75, 82, 92, 87, 84, 64, 55, 61, 75, 82, 92, 89, 86, 64, 56, + 61, 75, 83, 93, 93, 89, 
67, 58, 63, 76, 85, 95, 94, 91, 68, 59, 64, 74, + 86, 94, 98, 94, 69, 60, 65, 72, 85, 95, 99, 97, 70, 62, 66, 73, 83, 96, + 99, 98, 71, 63, 67, 74, 82, 93, 102, 102, 72, 64, 66, 75, 83, 92, 101, + 104, 73, 65, 66, 75, 84, 93, 102, 106, 74, 67, 66, 74, 84, 94, 103, 106, + 75, 68, 66, 74, 83, 93, 103, 109, 76, 69, 67, 73, 82, 91, 101, 109, 77, + 70, 67, 73, 81, 90, 99, 108, + /* Size 32x8 */ + 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, + 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 37, 38, 40, 41, + 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, + 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 48, 47, 46, 46, 47, 47, 50, 51, + 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, + 66, 66, 66, 66, 67, 67, 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62, + 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74, + 73, 73, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75, + 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, 66, 63, + 60, 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, + 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 68, 65, 63, 62, 60, 60, + 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, + 99, 102, 101, 102, 103, 103, 101, 99, 71, 67, 66, 64, 63, 62, 62, 61, + 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, + 104, 106, 106, 109, 109, 108 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 41, 69, 92, 41, 63, 88, 103, 69, 88, 127, 140, 92, 103, 140, 184, + /* Size 8x8 */ + 32, 32, 37, 47, 62, 78, 90, 102, 32, 35, 39, 46, 58, 72, 84, 96, 37, 39, + 51, 60, 71, 84, 93, 100, 47, 46, 60, 73, 87, 100, 106, 113, 62, 58, 71, + 87, 105, 121, 129, 132, 78, 72, 84, 100, 121, 140, 148, 155, 90, 84, 93, + 106, 129, 148, 169, 183, 102, 96, 100, 113, 132, 155, 183, 201, + /* Size 16x16 */ + 32, 31, 31, 32, 36, 39, 47, 54, 61, 71, 80, 86, 92, 98, 104, 111, 31, + 32, 32, 33, 34, 37, 44, 50, 56, 65, 73, 79, 85, 91, 98, 105, 31, 32, 33, + 34, 36, 39, 45, 50, 56, 64, 71, 77, 82, 88, 94, 100, 32, 33, 34, 36, 40, + 42, 47, 51, 57, 65, 71, 76, 80, 85, 91, 98, 36, 34, 36, 40, 48, 50, 56, + 60, 65, 73, 79, 84, 86, 90, 95, 98, 39, 37, 39, 42, 50, 54, 60, 65, 70, + 78, 84, 89, 95, 96, 102, 105, 47, 44, 45, 47, 56, 60, 69, 75, 81, 89, + 95, 100, 102, 104, 109, 112, 54, 50, 50, 51, 60, 65, 75, 82, 89, 97, + 104, 109, 110, 114, 117, 121, 61, 56, 56, 57, 65, 70, 81, 89, 97, 106, + 113, 119, 122, 126, 125, 130, 71, 65, 64, 65, 73, 78, 89, 97, 106, 117, + 125, 131, 134, 134, 136, 141, 80, 73, 71, 71, 79, 84, 95, 104, 113, 125, + 134, 140, 142, 145, 146, 152, 86, 79, 77, 76, 84, 89, 100, 109, 119, + 131, 140, 147, 154, 157, 160, 165, 92, 85, 82, 80, 86, 95, 102, 110, + 122, 134, 142, 154, 162, 168, 174, 178, 98, 91, 88, 85, 90, 96, 104, + 114, 126, 134, 145, 157, 168, 176, 184, 193, 104, 98, 94, 91, 95, 102, + 109, 117, 125, 136, 146, 160, 174, 184, 193, 201, 111, 105, 100, 98, 98, + 105, 112, 121, 130, 141, 152, 165, 178, 193, 201, 210, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 61, 65, + 71, 76, 80, 83, 86, 89, 92, 95, 98, 101, 104, 108, 111, 114, 31, 32, 32, + 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 58, 62, 68, 72, 76, + 78, 82, 85, 88, 90, 93, 96, 99, 102, 105, 109, 31, 32, 32, 32, 32, 32, + 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 56, 60, 65, 70, 73, 76, 79, 82, + 85, 88, 91, 95, 98, 101, 105, 109, 31, 32, 32, 32, 32, 33, 33, 34, 35, + 36, 38, 41, 44, 45, 49, 54, 56, 59, 
65, 69, 72, 75, 78, 81, 84, 86, 89, + 92, 95, 98, 101, 104, 31, 32, 32, 32, 33, 34, 34, 35, 36, 38, 39, 42, + 45, 46, 50, 54, 56, 59, 64, 68, 71, 74, 77, 79, 82, 85, 88, 91, 94, 97, + 100, 104, 32, 32, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 45, 46, 49, + 53, 55, 58, 63, 66, 69, 72, 74, 78, 81, 84, 87, 90, 93, 96, 99, 102, 32, + 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 44, 47, 48, 51, 55, 57, 60, 65, + 68, 71, 73, 76, 78, 80, 82, 85, 88, 91, 95, 98, 102, 34, 34, 33, 34, 35, + 37, 38, 39, 42, 44, 45, 47, 50, 51, 54, 58, 60, 63, 68, 71, 74, 76, 79, + 82, 85, 86, 87, 88, 90, 93, 96, 99, 36, 35, 34, 35, 36, 38, 40, 42, 48, + 50, 50, 54, 56, 57, 60, 64, 65, 68, 73, 76, 79, 81, 84, 86, 86, 88, 90, + 93, 95, 97, 98, 100, 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 52, 56, 58, + 60, 63, 67, 68, 71, 76, 79, 82, 84, 87, 87, 90, 93, 94, 95, 96, 100, + 103, 106, 39, 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, 58, 60, 62, 65, + 69, 70, 73, 78, 81, 84, 86, 89, 92, 95, 95, 96, 99, 102, 104, 105, 106, + 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58, 63, 66, 68, 71, 75, 77, 79, + 84, 88, 90, 92, 95, 97, 97, 99, 102, 103, 103, 106, 109, 113, 47, 45, + 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, + 95, 97, 100, 100, 102, 105, 104, 106, 109, 111, 112, 113, 49, 47, 46, + 45, 46, 46, 48, 51, 57, 60, 62, 68, 71, 73, 77, 81, 83, 87, 92, 95, 98, + 100, 103, 105, 107, 106, 109, 112, 112, 113, 117, 120, 54, 51, 50, 49, + 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104, + 106, 109, 112, 110, 113, 114, 114, 117, 121, 121, 121, 59, 56, 54, 54, + 54, 53, 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 94, 98, 103, 107, 110, + 113, 116, 114, 117, 118, 117, 121, 122, 122, 125, 129, 61, 58, 56, 56, + 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 101, 106, 110, 113, + 116, 119, 120, 122, 121, 126, 124, 125, 130, 130, 130, 65, 62, 60, 59, + 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92, 98, 101, 105, 111, 115, 118, + 121, 124, 128, 125, 129, 128, 131, 133, 132, 135, 139, 71, 68, 65, 65, + 64, 63, 65, 68, 73, 76, 78, 84, 89, 92, 97, 103, 106, 111, 117, 122, + 125, 128, 131, 131, 134, 132, 134, 136, 136, 140, 141, 140, 76, 72, 70, + 69, 68, 66, 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122, + 127, 130, 133, 136, 136, 138, 139, 141, 140, 145, 143, 146, 151, 80, 76, + 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98, 104, 110, 113, 118, 125, + 130, 134, 137, 140, 146, 142, 146, 145, 149, 146, 150, 152, 151, 83, 78, + 76, 75, 74, 72, 73, 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, + 128, 133, 137, 140, 144, 147, 152, 148, 154, 151, 156, 155, 156, 162, + 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100, 103, 109, 116, 119, + 124, 131, 136, 140, 144, 147, 150, 154, 159, 157, 160, 160, 162, 165, + 162, 89, 85, 82, 81, 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114, + 120, 128, 131, 136, 146, 147, 150, 155, 156, 161, 166, 165, 167, 169, + 169, 175, 92, 88, 85, 84, 82, 81, 80, 85, 86, 90, 95, 97, 102, 107, 110, + 117, 122, 125, 134, 138, 142, 152, 154, 156, 162, 163, 168, 173, 174, + 174, 178, 176, 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95, 99, 105, 106, + 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163, 169, 170, 176, + 180, 183, 181, 187, 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102, + 104, 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, + 176, 178, 184, 188, 193, 188, 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, + 99, 103, 106, 112, 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, + 173, 176, 178, 184, 186, 192, 196, 203, 104, 99, 98, 95, 94, 93, 91, 90, + 
95, 96, 102, 103, 109, 112, 117, 122, 125, 133, 136, 145, 146, 156, 160, + 167, 174, 180, 184, 186, 193, 194, 201, 204, 108, 102, 101, 98, 97, 96, + 95, 93, 97, 100, 104, 106, 111, 113, 121, 122, 130, 132, 140, 143, 150, + 155, 162, 169, 174, 183, 188, 192, 194, 201, 202, 210, 111, 105, 105, + 101, 100, 99, 98, 96, 98, 103, 105, 109, 112, 117, 121, 125, 130, 135, + 141, 146, 152, 156, 165, 169, 178, 181, 193, 196, 201, 202, 210, 211, + 114, 109, 109, 104, 104, 102, 102, 99, 100, 106, 106, 113, 113, 120, + 121, 129, 130, 139, 140, 151, 151, 162, 162, 175, 176, 187, 188, 203, + 204, 210, 211, 219, + /* Size 4x8 */ + 32, 42, 69, 88, 33, 42, 64, 83, 36, 56, 77, 88, 46, 67, 93, 105, 60, 79, + 112, 122, 75, 92, 130, 144, 86, 95, 136, 167, 98, 105, 136, 177, + /* Size 8x4 */ + 32, 33, 36, 46, 60, 75, 86, 98, 42, 42, 56, 67, 79, 92, 95, 105, 69, 64, + 77, 93, 112, 130, 136, 136, 88, 83, 88, 105, 122, 144, 167, 177, + /* Size 8x16 */ + 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 44, 60, 72, 84, 90, 32, 34, + 36, 45, 59, 71, 80, 87, 32, 35, 40, 47, 60, 71, 78, 85, 36, 37, 48, 56, + 68, 78, 83, 87, 39, 40, 50, 60, 73, 84, 91, 94, 47, 45, 56, 69, 84, 95, + 101, 101, 53, 50, 60, 75, 92, 103, 108, 110, 61, 56, 65, 81, 100, 113, + 116, 118, 71, 64, 73, 89, 111, 125, 129, 129, 79, 70, 79, 95, 118, 133, + 142, 138, 86, 76, 84, 100, 124, 140, 153, 150, 92, 82, 89, 101, 121, + 148, 157, 161, 98, 88, 93, 108, 124, 141, 163, 174, 104, 94, 95, 110, + 129, 151, 171, 181, 110, 100, 98, 111, 127, 147, 169, 188, + /* Size 16x8 */ + 32, 31, 32, 32, 36, 39, 47, 53, 61, 71, 79, 86, 92, 98, 104, 110, 32, + 32, 34, 35, 37, 40, 45, 50, 56, 64, 70, 76, 82, 88, 94, 100, 36, 35, 36, + 40, 48, 50, 56, 60, 65, 73, 79, 84, 89, 93, 95, 98, 47, 44, 45, 47, 56, + 60, 69, 75, 81, 89, 95, 100, 101, 108, 110, 111, 65, 60, 59, 60, 68, 73, + 84, 92, 100, 111, 118, 124, 121, 124, 129, 127, 79, 72, 71, 71, 78, 84, + 95, 103, 113, 125, 133, 140, 148, 141, 151, 147, 90, 84, 80, 78, 83, 91, + 101, 108, 116, 129, 142, 153, 157, 163, 171, 169, 96, 90, 87, 85, 87, + 94, 101, 110, 118, 129, 138, 150, 161, 174, 181, 188, + /* Size 16x32 */ + 32, 31, 32, 32, 36, 44, 47, 53, 65, 73, 79, 87, 90, 93, 96, 99, 31, 32, + 32, 33, 35, 42, 45, 51, 62, 69, 75, 83, 86, 88, 91, 94, 31, 32, 32, 33, + 35, 41, 44, 49, 60, 67, 72, 80, 84, 87, 90, 94, 31, 32, 33, 33, 35, 41, + 44, 49, 59, 66, 71, 79, 82, 84, 87, 90, 32, 32, 34, 34, 36, 42, 45, 50, + 59, 65, 71, 78, 80, 83, 87, 90, 32, 33, 35, 36, 38, 42, 45, 49, 58, 64, + 69, 76, 80, 83, 86, 88, 32, 33, 35, 36, 40, 44, 47, 51, 60, 66, 71, 76, + 78, 81, 85, 89, 34, 34, 36, 38, 42, 48, 50, 54, 63, 69, 73, 80, 82, 81, + 84, 86, 36, 34, 37, 40, 48, 54, 56, 60, 68, 74, 78, 84, 83, 86, 87, 87, + 38, 36, 39, 41, 49, 56, 58, 63, 71, 77, 81, 86, 88, 88, 90, 93, 39, 37, + 40, 42, 50, 58, 60, 65, 73, 79, 84, 90, 91, 92, 94, 93, 44, 41, 42, 45, + 53, 63, 66, 71, 79, 85, 90, 96, 94, 96, 96, 99, 47, 44, 45, 47, 56, 66, + 69, 75, 84, 90, 95, 99, 101, 98, 101, 99, 49, 46, 47, 48, 57, 67, 71, + 77, 86, 93, 97, 103, 103, 105, 102, 106, 53, 49, 50, 51, 60, 71, 75, 82, + 92, 99, 103, 111, 108, 107, 110, 107, 58, 54, 54, 55, 63, 75, 79, 87, + 98, 105, 110, 114, 114, 113, 111, 115, 61, 56, 56, 57, 65, 77, 81, 89, + 100, 107, 113, 118, 116, 117, 118, 116, 65, 60, 59, 60, 68, 79, 84, 92, + 105, 112, 118, 126, 124, 122, 121, 124, 71, 65, 64, 65, 73, 84, 89, 97, + 111, 119, 125, 130, 129, 129, 129, 125, 76, 69, 68, 69, 76, 88, 92, 101, + 115, 123, 130, 134, 134, 131, 132, 135, 79, 72, 70, 71, 79, 90, 95, 104, + 118, 127, 133, 143, 142, 
141, 138, 136, 82, 75, 73, 74, 81, 92, 97, 106, + 121, 130, 136, 146, 145, 144, 144, 145, 86, 78, 76, 77, 84, 95, 100, + 109, 124, 133, 140, 147, 153, 151, 150, 146, 89, 81, 79, 78, 87, 95, 99, + 112, 124, 130, 145, 152, 156, 157, 156, 158, 92, 84, 82, 80, 89, 95, + 101, 116, 121, 132, 148, 151, 157, 163, 161, 159, 95, 86, 85, 83, 92, + 95, 105, 114, 120, 136, 143, 155, 163, 167, 171, 170, 98, 89, 88, 85, + 93, 95, 108, 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92, 91, + 88, 94, 98, 110, 112, 128, 133, 146, 158, 166, 175, 179, 185, 104, 95, + 94, 91, 95, 101, 110, 115, 129, 132, 151, 154, 171, 175, 181, 186, 107, + 98, 97, 94, 96, 105, 110, 119, 128, 136, 149, 156, 173, 177, 188, 192, + 110, 101, 100, 97, 98, 108, 111, 123, 127, 141, 147, 161, 169, 183, 188, + 193, 114, 104, 104, 100, 100, 111, 111, 126, 127, 145, 145, 166, 166, + 189, 190, 201, + /* Size 32x16 */ + 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, + 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 31, 32, 32, + 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 56, 60, 65, 69, 72, + 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104, 32, 32, 32, 33, 34, 35, + 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 104, 32, 33, 33, 33, 34, 36, 36, 38, 40, + 41, 42, 45, 47, 48, 51, 55, 57, 60, 65, 69, 71, 74, 77, 78, 80, 83, 85, + 88, 91, 94, 97, 100, 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, + 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, + 100, 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, 77, + 79, 84, 88, 90, 92, 95, 95, 95, 95, 95, 98, 101, 105, 108, 111, 47, 45, + 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, + 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111, 53, 51, 49, 49, + 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104, + 106, 109, 112, 116, 114, 113, 112, 115, 119, 123, 126, 65, 62, 60, 59, + 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118, + 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127, 73, 69, 67, 66, + 65, 64, 66, 69, 74, 77, 79, 85, 90, 93, 99, 105, 107, 112, 119, 123, + 127, 130, 133, 130, 132, 136, 136, 133, 132, 136, 141, 145, 79, 75, 72, + 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97, 103, 110, 113, 118, 125, + 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145, 87, 83, + 80, 79, 78, 76, 76, 80, 84, 86, 90, 96, 99, 103, 111, 114, 118, 126, + 130, 134, 143, 146, 147, 152, 151, 155, 160, 158, 154, 156, 161, 166, + 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116, + 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, + 166, 93, 88, 87, 84, 83, 83, 81, 81, 86, 88, 92, 96, 98, 105, 107, 113, + 117, 122, 129, 131, 141, 144, 151, 157, 163, 167, 169, 175, 175, 177, + 183, 189, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102, 110, + 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181, + 188, 188, 190, 99, 94, 94, 90, 90, 88, 89, 86, 87, 93, 93, 99, 99, 106, + 107, 115, 116, 124, 125, 135, 136, 145, 146, 158, 159, 170, 171, 185, + 186, 192, 193, 201, + /* Size 4x16 */ + 31, 44, 73, 93, 32, 41, 67, 87, 32, 42, 65, 83, 33, 44, 66, 81, 34, 54, + 74, 86, 37, 58, 79, 92, 44, 66, 90, 98, 49, 71, 99, 107, 56, 77, 107, + 117, 65, 84, 119, 129, 72, 90, 127, 141, 78, 95, 133, 151, 84, 95, 132, + 163, 89, 95, 136, 169, 95, 101, 132, 175, 101, 108, 141, 183, + /* Size 16x4 */ + 31, 32, 32, 33, 34, 37, 44, 49, 56, 65, 72, 78, 
84, 89, 95, 101, 44, 41, + 42, 44, 54, 58, 66, 71, 77, 84, 90, 95, 95, 95, 101, 108, 73, 67, 65, + 66, 74, 79, 90, 99, 107, 119, 127, 133, 132, 136, 132, 141, 93, 87, 83, + 81, 86, 92, 98, 107, 117, 129, 141, 151, 163, 169, 175, 183, + /* Size 8x32 */ + 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 45, 62, 75, 86, 91, 31, 32, + 35, 44, 60, 72, 84, 90, 31, 33, 35, 44, 59, 71, 82, 87, 32, 34, 36, 45, + 59, 71, 80, 87, 32, 35, 38, 45, 58, 69, 80, 86, 32, 35, 40, 47, 60, 71, + 78, 85, 34, 36, 42, 50, 63, 73, 82, 84, 36, 37, 48, 56, 68, 78, 83, 87, + 38, 39, 49, 58, 71, 81, 88, 90, 39, 40, 50, 60, 73, 84, 91, 94, 44, 42, + 53, 66, 79, 90, 94, 96, 47, 45, 56, 69, 84, 95, 101, 101, 49, 47, 57, + 71, 86, 97, 103, 102, 53, 50, 60, 75, 92, 103, 108, 110, 58, 54, 63, 79, + 98, 110, 114, 111, 61, 56, 65, 81, 100, 113, 116, 118, 65, 59, 68, 84, + 105, 118, 124, 121, 71, 64, 73, 89, 111, 125, 129, 129, 76, 68, 76, 92, + 115, 130, 134, 132, 79, 70, 79, 95, 118, 133, 142, 138, 82, 73, 81, 97, + 121, 136, 145, 144, 86, 76, 84, 100, 124, 140, 153, 150, 89, 79, 87, 99, + 124, 145, 156, 156, 92, 82, 89, 101, 121, 148, 157, 161, 95, 85, 92, + 105, 120, 143, 163, 171, 98, 88, 93, 108, 124, 141, 163, 174, 101, 91, + 94, 110, 128, 146, 166, 179, 104, 94, 95, 110, 129, 151, 171, 181, 107, + 97, 96, 110, 128, 149, 173, 188, 110, 100, 98, 111, 127, 147, 169, 188, + 114, 104, 100, 111, 127, 145, 166, 190, + /* Size 32x8 */ + 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, + 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 32, 32, 32, + 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, + 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, 36, 35, 35, 35, 36, 38, + 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, + 89, 92, 93, 94, 95, 96, 98, 100, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, + 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108, + 110, 110, 110, 111, 111, 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, + 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, + 128, 129, 128, 127, 127, 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90, + 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141, + 146, 151, 149, 147, 145, 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, + 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163, + 163, 166, 171, 173, 169, 166, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, + 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, + 171, 174, 179, 181, 188, 188, 190 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 45, 56, 64, 45, 58, 66, 69, 56, 66, 86, 87, 64, 69, 87, 105, + /* Size 8x8 */ + 31, 38, 47, 48, 54, 61, 66, 69, 38, 47, 47, 46, 50, 55, 61, 65, 47, 47, + 53, 55, 58, 63, 65, 66, 48, 46, 55, 62, 67, 72, 73, 73, 54, 50, 58, 67, + 76, 83, 84, 82, 61, 55, 63, 72, 83, 91, 92, 92, 66, 61, 65, 73, 84, 92, + 101, 103, 69, 65, 66, 73, 82, 92, 103, 109, + /* Size 16x16 */ + 32, 30, 33, 38, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 30, 31, + 35, 41, 46, 46, 46, 48, 51, 55, 58, 60, 63, 65, 68, 70, 33, 35, 39, 44, + 47, 46, 46, 47, 50, 53, 56, 58, 60, 62, 65, 67, 38, 41, 44, 47, 49, 48, + 47, 48, 50, 53, 55, 58, 58, 60, 62, 65, 49, 46, 47, 49, 53, 53, 54, 54, + 56, 58, 60, 62, 62, 63, 64, 64, 48, 46, 46, 48, 53, 54, 56, 57, 59, 61, + 63, 65, 67, 66, 68, 68, 50, 46, 46, 47, 54, 56, 61, 63, 65, 68, 70, 72, + 71, 71, 72, 72, 52, 48, 47, 48, 54, 57, 63, 66, 69, 72, 75, 76, 75, 76, + 76, 76, 55, 51, 50, 50, 56, 59, 65, 69, 73, 77, 79, 81, 81, 81, 
80, 80, + 60, 55, 53, 53, 58, 61, 68, 72, 77, 82, 85, 87, 87, 85, 84, 85, 63, 58, + 56, 55, 60, 63, 70, 75, 79, 85, 89, 91, 91, 90, 89, 90, 66, 60, 58, 58, + 62, 65, 72, 76, 81, 87, 91, 94, 96, 95, 95, 95, 68, 63, 60, 58, 62, 67, + 71, 75, 81, 87, 91, 96, 99, 100, 100, 100, 70, 65, 62, 60, 63, 66, 71, + 76, 81, 85, 90, 95, 100, 103, 104, 105, 72, 68, 65, 62, 64, 68, 72, 76, + 80, 84, 89, 95, 100, 104, 107, 108, 74, 70, 67, 65, 64, 68, 72, 76, 80, + 85, 90, 95, 100, 105, 108, 111, + /* Size 32x32 */ + 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 55, 57, + 60, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 31, 32, + 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 53, 54, 57, 59, 60, 61, + 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 30, 31, 31, 32, 35, 39, 41, 42, + 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 55, 57, 58, 59, 60, 62, 63, 64, + 65, 67, 68, 69, 70, 71, 31, 32, 32, 33, 36, 40, 41, 43, 46, 46, 45, 45, + 46, 46, 47, 49, 50, 51, 54, 56, 57, 58, 59, 61, 62, 63, 63, 64, 65, 66, + 67, 68, 33, 34, 35, 36, 39, 43, 44, 45, 47, 46, 46, 45, 46, 47, 47, 49, + 50, 51, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 36, 38, + 39, 40, 43, 47, 47, 47, 48, 47, 46, 45, 46, 46, 47, 48, 49, 50, 52, 53, + 54, 55, 56, 58, 59, 61, 62, 63, 64, 65, 66, 66, 38, 40, 41, 41, 44, 47, + 47, 48, 49, 48, 48, 47, 47, 47, 48, 49, 50, 51, 53, 54, 55, 56, 58, 58, + 58, 59, 60, 61, 62, 64, 65, 66, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50, + 49, 49, 50, 50, 50, 52, 52, 53, 55, 56, 57, 58, 59, 60, 61, 61, 61, 61, + 62, 63, 63, 64, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54, + 54, 55, 56, 56, 58, 59, 60, 61, 62, 63, 62, 62, 63, 64, 64, 64, 64, 64, + 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 58, 58, + 60, 61, 62, 63, 64, 64, 64, 65, 65, 65, 65, 66, 67, 68, 48, 47, 46, 45, + 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 63, 63, 64, + 65, 66, 67, 66, 66, 67, 68, 68, 68, 68, 49, 47, 45, 45, 45, 45, 47, 49, + 53, 55, 55, 58, 59, 60, 61, 62, 63, 63, 65, 66, 67, 68, 69, 69, 68, 68, + 69, 69, 69, 69, 70, 71, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, + 61, 61, 63, 64, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 71, 71, 72, 72, + 72, 71, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62, 64, 66, + 66, 67, 69, 70, 71, 72, 73, 73, 74, 73, 73, 74, 73, 73, 74, 75, 52, 50, + 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, + 75, 75, 76, 77, 75, 76, 76, 75, 76, 77, 76, 75, 54, 52, 50, 49, 49, 48, + 49, 52, 55, 57, 58, 62, 64, 66, 68, 71, 72, 73, 75, 77, 78, 79, 80, 78, + 79, 78, 77, 78, 78, 77, 78, 79, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, + 59, 63, 65, 66, 69, 72, 73, 74, 77, 78, 79, 80, 81, 81, 81, 80, 81, 80, + 80, 81, 80, 79, 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63, 66, 67, + 70, 73, 74, 76, 79, 80, 82, 83, 84, 85, 83, 84, 83, 83, 83, 82, 82, 83, + 60, 57, 55, 54, 53, 52, 53, 55, 58, 60, 61, 65, 68, 69, 72, 75, 77, 79, + 82, 84, 85, 86, 87, 86, 87, 85, 85, 85, 84, 86, 85, 84, 62, 59, 57, 56, + 55, 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86, 87, 88, + 90, 89, 89, 88, 88, 87, 88, 87, 87, 88, 63, 60, 58, 57, 56, 54, 55, 57, + 60, 62, 63, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 91, 91, + 90, 91, 89, 90, 90, 89, 65, 61, 59, 58, 57, 55, 56, 58, 61, 63, 64, 68, + 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 93, 94, 95, 92, 94, 92, 93, 92, + 91, 93, 66, 63, 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80, + 81, 84, 87, 90, 91, 93, 94, 95, 96, 97, 95, 95, 95, 95, 95, 93, 67, 64, + 62, 61, 59, 
58, 58, 60, 63, 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89, + 93, 94, 95, 97, 97, 98, 99, 97, 97, 97, 96, 98, 68, 65, 63, 62, 60, 59, + 58, 61, 62, 64, 67, 68, 71, 74, 75, 79, 81, 83, 87, 89, 91, 95, 96, 97, + 99, 98, 100, 100, 100, 99, 100, 98, 69, 66, 64, 63, 61, 61, 59, 61, 62, + 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88, 91, 92, 97, 98, 98, 101, + 100, 102, 102, 103, 101, 102, 70, 67, 65, 63, 62, 62, 60, 61, 63, 65, + 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99, 100, 100, 103, + 102, 104, 104, 105, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69, + 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102, 105, 104, + 106, 106, 108, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73, + 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107, 106, + 108, 108, 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77, + 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109, 108, + 110, 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76, 78, 80, + 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108, 111, 110, 75, + 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 84, + 88, 89, 93, 93, 98, 98, 102, 103, 108, 108, 110, 110, 113, + /* Size 4x8 */ + 31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72, 52, 64, + 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102, + /* Size 8x4 */ + 31, 40, 46, 47, 52, 59, 63, 66, 47, 45, 55, 60, 64, 68, 66, 69, 57, 52, + 61, 70, 79, 87, 88, 85, 65, 61, 63, 72, 81, 90, 99, 102, + /* Size 8x16 */ + 32, 35, 48, 50, 57, 63, 68, 70, 30, 38, 46, 46, 52, 58, 63, 65, 33, 41, + 47, 46, 51, 56, 60, 63, 39, 46, 48, 47, 51, 55, 58, 61, 49, 48, 53, 54, + 57, 60, 61, 61, 48, 46, 53, 56, 60, 64, 65, 65, 50, 46, 54, 61, 66, 70, + 71, 69, 52, 47, 54, 63, 71, 75, 75, 74, 55, 49, 56, 65, 74, 79, 79, 78, + 60, 53, 58, 68, 79, 85, 85, 82, 63, 55, 60, 70, 82, 89, 91, 87, 66, 58, + 62, 72, 84, 91, 95, 91, 68, 60, 64, 71, 81, 94, 97, 96, 70, 62, 65, 73, + 81, 89, 98, 101, 72, 65, 65, 72, 82, 92, 100, 103, 74, 67, 65, 71, 79, + 89, 98, 105, + /* Size 16x8 */ + 32, 30, 33, 39, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 35, 38, + 41, 46, 48, 46, 46, 47, 49, 53, 55, 58, 60, 62, 65, 67, 48, 46, 47, 48, + 53, 53, 54, 54, 56, 58, 60, 62, 64, 65, 65, 65, 50, 46, 46, 47, 54, 56, + 61, 63, 65, 68, 70, 72, 71, 73, 72, 71, 57, 52, 51, 51, 57, 60, 66, 71, + 74, 79, 82, 84, 81, 81, 82, 79, 63, 58, 56, 55, 60, 64, 70, 75, 79, 85, + 89, 91, 94, 89, 92, 89, 68, 63, 60, 58, 61, 65, 71, 75, 79, 85, 91, 95, + 97, 98, 100, 98, 70, 65, 63, 61, 61, 65, 69, 74, 78, 82, 87, 91, 96, + 101, 103, 105, + /* Size 16x32 */ + 32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68, 69, 70, 71, 31, 31, + 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64, 65, 66, 67, 30, 32, 38, 40, + 46, 45, 46, 48, 52, 55, 58, 61, 63, 64, 65, 67, 31, 33, 38, 41, 46, 45, + 46, 48, 52, 55, 57, 60, 61, 62, 63, 64, 33, 36, 41, 44, 47, 46, 46, 47, + 51, 54, 56, 59, 60, 61, 63, 64, 37, 40, 45, 47, 47, 45, 46, 47, 50, 52, + 54, 57, 59, 61, 62, 62, 39, 41, 46, 47, 48, 47, 47, 48, 51, 54, 55, 57, + 58, 59, 61, 62, 42, 43, 46, 48, 50, 49, 50, 50, 53, 56, 57, 60, 60, 59, + 60, 60, 49, 46, 48, 49, 53, 53, 54, 54, 57, 59, 60, 63, 61, 62, 61, 61, + 48, 46, 47, 48, 53, 55, 55, 56, 58, 61, 62, 64, 64, 63, 63, 64, 48, 46, + 46, 48, 53, 56, 56, 57, 60, 62, 64, 66, 65, 65, 65, 64, 49, 45, 45, 47, + 53, 58, 59, 61, 64, 66, 67, 69, 67, 67, 66, 67, 50, 46, 46, 48, 54, 59, + 61, 63, 66, 68, 70, 71, 71, 68, 69, 67, 51, 47, 47, 48, 54, 60, 61, 64, + 68, 70, 71, 73, 72, 72, 
70, 71, 52, 48, 47, 48, 54, 61, 63, 66, 71, 73, + 75, 77, 75, 73, 74, 71, 54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79, + 78, 76, 74, 75, 55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79, 78, + 78, 75, 57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83, 81, 79, 79, + 60, 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85, 84, 82, 80, 62, 56, + 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87, 85, 84, 84, 63, 57, 55, 56, + 60, 67, 70, 75, 82, 86, 89, 92, 91, 89, 87, 84, 64, 59, 56, 57, 61, 68, + 71, 75, 83, 87, 90, 93, 92, 90, 89, 89, 66, 60, 58, 58, 62, 69, 72, 76, + 84, 88, 91, 94, 95, 93, 91, 89, 67, 61, 59, 58, 63, 68, 71, 78, 83, 86, + 93, 96, 96, 96, 94, 94, 68, 62, 60, 59, 64, 67, 71, 79, 81, 86, 94, 95, + 97, 98, 96, 94, 69, 63, 61, 60, 65, 66, 72, 77, 80, 88, 91, 96, 99, 99, + 100, 98, 70, 64, 62, 60, 65, 66, 73, 76, 81, 87, 89, 97, 98, 100, 101, + 99, 71, 65, 64, 61, 65, 67, 73, 74, 82, 85, 90, 95, 99, 102, 103, 104, + 72, 65, 65, 62, 65, 68, 72, 75, 82, 83, 92, 93, 100, 102, 103, 104, 73, + 66, 66, 63, 65, 69, 72, 76, 81, 85, 90, 93, 100, 102, 105, 106, 74, 67, + 67, 64, 65, 70, 71, 77, 79, 86, 89, 94, 98, 103, 105, 106, 75, 68, 68, + 65, 65, 71, 71, 78, 78, 87, 87, 96, 96, 105, 105, 109, + /* Size 32x16 */ + 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, + 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 32, 33, + 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 54, 56, 57, 59, + 60, 61, 62, 63, 64, 65, 65, 66, 67, 68, 35, 37, 38, 38, 41, 45, 46, 46, + 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61, + 62, 64, 65, 66, 67, 68, 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47, + 48, 48, 48, 50, 50, 51, 53, 55, 56, 57, 58, 58, 59, 60, 60, 61, 62, 63, + 64, 65, 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, + 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65, 49, 47, + 45, 45, 46, 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, + 67, 68, 69, 68, 67, 66, 66, 67, 68, 69, 70, 71, 50, 48, 46, 46, 46, 46, + 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, + 71, 72, 73, 73, 72, 72, 71, 71, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, + 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, 75, 75, 76, 78, 79, 77, 76, 74, + 75, 76, 77, 78, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, + 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, + 61, 57, 55, 55, 54, 52, 54, 56, 59, 61, 62, 66, 68, 70, 73, 76, 77, 79, + 82, 84, 86, 87, 88, 86, 86, 88, 87, 85, 83, 85, 86, 87, 63, 60, 58, 57, + 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, + 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 67, 63, 61, 60, 59, 57, 57, 60, + 63, 64, 66, 69, 71, 73, 77, 79, 81, 85, 87, 88, 92, 93, 94, 96, 95, 96, + 97, 95, 93, 93, 94, 96, 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67, + 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100, + 100, 98, 96, 69, 65, 64, 62, 61, 61, 59, 59, 62, 63, 65, 67, 68, 72, 73, + 76, 78, 81, 84, 85, 89, 90, 93, 96, 98, 99, 100, 102, 102, 102, 103, + 105, 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78, + 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105, 71, + 67, 67, 64, 64, 62, 62, 60, 61, 64, 64, 67, 67, 71, 71, 75, 75, 79, 80, + 84, 84, 89, 89, 94, 94, 98, 99, 104, 104, 106, 106, 109, + /* Size 4x16 */ + 31, 49, 61, 69, 32, 45, 55, 64, 36, 46, 54, 61, 41, 47, 54, 59, 46, 53, + 59, 62, 46, 56, 62, 65, 46, 59, 68, 68, 48, 61, 73, 73, 51, 63, 77, 78, + 54, 65, 82, 84, 57, 67, 86, 
89, 60, 69, 88, 93, 62, 67, 86, 98, 64, 66, + 87, 100, 65, 68, 83, 102, 67, 70, 86, 103, + /* Size 16x4 */ + 31, 32, 36, 41, 46, 46, 46, 48, 51, 54, 57, 60, 62, 64, 65, 67, 49, 45, + 46, 47, 53, 56, 59, 61, 63, 65, 67, 69, 67, 66, 68, 70, 61, 55, 54, 54, + 59, 62, 68, 73, 77, 82, 86, 88, 86, 87, 83, 86, 69, 64, 61, 59, 62, 65, + 68, 73, 78, 84, 89, 93, 98, 100, 102, 103, + /* Size 8x32 */ + 32, 35, 48, 50, 57, 63, 68, 70, 31, 37, 47, 48, 54, 60, 64, 66, 30, 38, + 46, 46, 52, 58, 63, 65, 31, 38, 46, 46, 52, 57, 61, 63, 33, 41, 47, 46, + 51, 56, 60, 63, 37, 45, 47, 46, 50, 54, 59, 62, 39, 46, 48, 47, 51, 55, + 58, 61, 42, 46, 50, 50, 53, 57, 60, 60, 49, 48, 53, 54, 57, 60, 61, 61, + 48, 47, 53, 55, 58, 62, 64, 63, 48, 46, 53, 56, 60, 64, 65, 65, 49, 45, + 53, 59, 64, 67, 67, 66, 50, 46, 54, 61, 66, 70, 71, 69, 51, 47, 54, 61, + 68, 71, 72, 70, 52, 47, 54, 63, 71, 75, 75, 74, 54, 49, 55, 65, 73, 78, + 78, 74, 55, 49, 56, 65, 74, 79, 79, 78, 57, 50, 56, 66, 76, 82, 83, 79, + 60, 53, 58, 68, 79, 85, 85, 82, 62, 54, 60, 69, 81, 87, 87, 84, 63, 55, + 60, 70, 82, 89, 91, 87, 64, 56, 61, 71, 83, 90, 92, 89, 66, 58, 62, 72, + 84, 91, 95, 91, 67, 59, 63, 71, 83, 93, 96, 94, 68, 60, 64, 71, 81, 94, + 97, 96, 69, 61, 65, 72, 80, 91, 99, 100, 70, 62, 65, 73, 81, 89, 98, + 101, 71, 64, 65, 73, 82, 90, 99, 103, 72, 65, 65, 72, 82, 92, 100, 103, + 73, 66, 65, 72, 81, 90, 100, 105, 74, 67, 65, 71, 79, 89, 98, 105, 75, + 68, 65, 71, 78, 87, 96, 105, + /* Size 32x8 */ + 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, + 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 35, 37, 38, 38, + 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, + 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 48, 47, 46, 46, 47, 47, 48, 50, + 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, + 65, 65, 65, 65, 65, 65, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, + 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72, + 71, 71, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73, + 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, 63, 60, + 58, 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, + 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 68, 64, 63, 61, 60, 59, + 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, + 97, 99, 98, 99, 100, 100, 98, 96, 70, 66, 65, 63, 63, 62, 61, 60, 61, + 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100, + 101, 103, 103, 105, 105, 105 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 38, 63, 86, 38, 56, 78, 97, 63, 78, 113, 130, 86, 97, 130, 169, + /* Size 8x8 */ + 32, 32, 35, 46, 57, 76, 85, 96, 32, 34, 37, 45, 54, 70, 79, 90, 35, 37, + 48, 56, 64, 79, 87, 93, 46, 45, 56, 70, 80, 96, 100, 105, 57, 54, 64, + 80, 93, 111, 121, 122, 76, 70, 79, 96, 111, 134, 138, 144, 85, 79, 87, + 100, 121, 138, 156, 168, 96, 90, 93, 105, 122, 144, 168, 184, + /* Size 16x16 */ + 32, 31, 31, 32, 34, 39, 44, 49, 58, 65, 71, 81, 87, 93, 98, 104, 31, 32, + 32, 32, 34, 38, 41, 46, 54, 60, 66, 75, 81, 86, 92, 98, 31, 32, 33, 34, + 36, 39, 42, 46, 53, 59, 64, 73, 78, 83, 88, 94, 32, 32, 34, 35, 37, 40, + 42, 46, 52, 58, 63, 71, 75, 80, 86, 92, 34, 34, 36, 37, 42, 47, 50, 53, + 59, 65, 70, 77, 82, 85, 89, 92, 39, 38, 39, 40, 47, 54, 58, 62, 68, 73, + 78, 85, 90, 90, 96, 98, 44, 41, 42, 42, 50, 58, 63, 68, 74, 79, 84, 91, + 96, 98, 102, 104, 49, 46, 46, 46, 53, 62, 68, 73, 81, 87, 92, 99, 103, + 107, 109, 112, 58, 54, 53, 52, 59, 68, 74, 81, 90, 97, 102, 110, 
114, + 118, 117, 121, 65, 60, 59, 58, 65, 73, 79, 87, 97, 105, 111, 120, 125, + 125, 126, 130, 71, 66, 64, 63, 70, 78, 84, 92, 102, 111, 117, 127, 133, + 134, 136, 141, 81, 75, 73, 71, 77, 85, 91, 99, 110, 120, 127, 137, 143, + 145, 148, 152, 87, 81, 78, 75, 82, 90, 96, 103, 114, 125, 133, 143, 150, + 156, 160, 163, 93, 86, 83, 80, 85, 90, 98, 107, 118, 125, 134, 145, 156, + 163, 169, 177, 98, 92, 88, 86, 89, 96, 102, 109, 117, 126, 136, 148, + 160, 169, 176, 184, 104, 98, 94, 92, 92, 98, 104, 112, 121, 130, 141, + 152, 163, 177, 184, 191, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 32, 32, 34, 34, 36, 39, 41, 44, 48, 49, 54, 58, 59, + 65, 69, 71, 80, 81, 83, 87, 90, 93, 95, 98, 101, 104, 107, 31, 32, 32, + 32, 32, 32, 32, 34, 34, 35, 38, 39, 42, 46, 47, 51, 55, 57, 62, 66, 68, + 76, 77, 78, 83, 85, 88, 90, 93, 96, 99, 101, 31, 32, 32, 32, 32, 32, 32, + 33, 34, 34, 38, 39, 41, 45, 46, 50, 54, 55, 60, 64, 66, 73, 75, 76, 81, + 83, 86, 89, 92, 95, 98, 101, 31, 32, 32, 32, 32, 32, 32, 33, 34, 34, 37, + 38, 41, 44, 45, 49, 53, 54, 59, 63, 65, 72, 74, 75, 79, 81, 84, 86, 89, + 91, 94, 97, 31, 32, 32, 32, 33, 33, 34, 35, 36, 36, 39, 40, 42, 45, 46, + 50, 53, 54, 59, 63, 64, 71, 73, 74, 78, 80, 83, 85, 88, 91, 94, 97, 32, + 32, 32, 32, 33, 34, 34, 36, 36, 37, 40, 40, 42, 45, 46, 49, 53, 54, 58, + 62, 63, 70, 72, 73, 77, 79, 82, 85, 87, 90, 92, 95, 32, 32, 32, 32, 34, + 34, 35, 37, 37, 38, 40, 41, 42, 45, 46, 49, 52, 54, 58, 61, 63, 69, 71, + 72, 75, 78, 80, 83, 86, 89, 92, 95, 34, 34, 33, 33, 35, 36, 37, 39, 41, + 42, 45, 46, 47, 50, 51, 54, 57, 59, 63, 66, 68, 74, 75, 76, 80, 81, 82, + 83, 85, 87, 90, 93, 34, 34, 34, 34, 36, 36, 37, 41, 42, 45, 47, 48, 50, + 53, 53, 56, 59, 61, 65, 68, 70, 76, 77, 78, 82, 83, 85, 88, 89, 90, 92, + 93, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 50, 51, 54, 56, 57, 60, 63, + 64, 68, 71, 73, 79, 80, 81, 85, 87, 89, 89, 90, 93, 96, 99, 39, 38, 38, + 37, 39, 40, 40, 45, 47, 50, 54, 55, 58, 61, 62, 65, 68, 69, 73, 76, 78, + 84, 85, 86, 90, 89, 90, 93, 96, 97, 98, 99, 41, 39, 39, 38, 40, 40, 41, + 46, 48, 51, 55, 56, 59, 62, 63, 67, 70, 71, 75, 78, 80, 86, 87, 88, 91, + 93, 96, 97, 97, 99, 102, 105, 44, 42, 41, 41, 42, 42, 42, 47, 50, 54, + 58, 59, 63, 66, 68, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 98, 98, 99, + 102, 104, 104, 105, 48, 46, 45, 44, 45, 45, 45, 50, 53, 56, 61, 62, 66, + 70, 71, 76, 79, 80, 85, 88, 90, 96, 97, 98, 101, 100, 102, 105, 105, + 105, 109, 112, 49, 47, 46, 45, 46, 46, 46, 51, 53, 57, 62, 63, 68, 71, + 73, 77, 81, 82, 87, 90, 92, 98, 99, 100, 103, 106, 107, 106, 109, 112, + 112, 112, 54, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 76, 77, + 82, 86, 87, 92, 96, 97, 104, 105, 106, 110, 110, 109, 113, 114, 113, + 116, 120, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, + 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 114, 118, 116, 117, 121, + 121, 120, 59, 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82, + 87, 91, 93, 99, 102, 104, 111, 112, 113, 117, 121, 120, 122, 124, 122, + 125, 129, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 87, + 92, 97, 99, 105, 109, 111, 118, 120, 121, 125, 124, 125, 127, 126, 130, + 130, 129, 69, 66, 64, 63, 63, 62, 61, 66, 68, 71, 76, 78, 83, 88, 90, + 96, 100, 102, 109, 113, 115, 123, 125, 126, 129, 130, 131, 130, 134, + 133, 135, 139, 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90, + 92, 97, 102, 104, 111, 115, 117, 125, 127, 128, 133, 136, 134, 139, 136, + 139, 141, 140, 80, 76, 73, 72, 71, 70, 69, 74, 76, 79, 84, 86, 90, 96, + 98, 104, 109, 111, 118, 123, 125, 134, 136, 137, 
142, 138, 143, 140, + 144, 144, 144, 149, 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91, + 97, 99, 105, 110, 112, 120, 125, 127, 136, 137, 139, 143, 148, 145, 148, + 148, 150, 152, 149, 83, 78, 76, 75, 74, 73, 72, 76, 78, 81, 86, 88, 92, + 98, 100, 106, 111, 113, 121, 126, 128, 137, 139, 140, 145, 149, 153, + 153, 154, 155, 155, 161, 87, 83, 81, 79, 78, 77, 75, 80, 82, 85, 90, 91, + 96, 101, 103, 110, 114, 117, 125, 129, 133, 142, 143, 145, 150, 151, + 156, 159, 160, 160, 163, 161, 90, 85, 83, 81, 80, 79, 78, 81, 83, 87, + 89, 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151, + 156, 157, 162, 166, 168, 166, 172, 93, 88, 86, 84, 83, 82, 80, 82, 85, + 89, 90, 96, 98, 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, + 156, 157, 163, 164, 169, 172, 177, 172, 95, 90, 89, 86, 85, 85, 83, 83, + 88, 89, 93, 97, 99, 105, 106, 113, 116, 122, 127, 130, 139, 140, 148, + 153, 159, 162, 164, 169, 170, 176, 179, 185, 98, 93, 92, 89, 88, 87, 86, + 85, 89, 90, 96, 97, 102, 105, 109, 114, 117, 124, 126, 134, 136, 144, + 148, 154, 160, 166, 169, 170, 176, 177, 184, 186, 101, 96, 95, 91, 91, + 90, 89, 87, 90, 93, 97, 99, 104, 105, 112, 113, 121, 122, 130, 133, 139, + 144, 150, 155, 160, 168, 172, 176, 177, 184, 185, 191, 104, 99, 98, 94, + 94, 92, 92, 90, 92, 96, 98, 102, 104, 109, 112, 116, 121, 125, 130, 135, + 141, 144, 152, 155, 163, 166, 177, 179, 184, 185, 191, 192, 107, 101, + 101, 97, 97, 95, 95, 93, 93, 99, 99, 105, 105, 112, 112, 120, 120, 129, + 129, 139, 140, 149, 149, 161, 161, 172, 172, 185, 186, 191, 192, 199, + /* Size 4x8 */ + 32, 38, 62, 86, 32, 40, 58, 80, 34, 51, 68, 85, 44, 61, 85, 101, 54, 69, + 98, 117, 72, 84, 118, 136, 82, 89, 129, 157, 92, 98, 127, 165, + /* Size 8x4 */ + 32, 32, 34, 44, 54, 72, 82, 92, 38, 40, 51, 61, 69, 84, 89, 98, 62, 58, + 68, 85, 98, 118, 129, 127, 86, 80, 85, 101, 117, 136, 157, 165, + /* Size 8x16 */ + 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 41, 54, 73, 81, 88, 32, 33, + 36, 42, 53, 71, 78, 84, 32, 34, 38, 42, 52, 69, 76, 82, 34, 36, 44, 50, + 59, 75, 81, 84, 39, 39, 50, 58, 68, 84, 88, 90, 44, 42, 53, 63, 74, 90, + 97, 97, 49, 46, 57, 67, 81, 97, 104, 105, 57, 53, 63, 74, 90, 108, 111, + 113, 65, 59, 68, 79, 97, 118, 123, 122, 71, 64, 73, 84, 102, 125, 135, + 131, 81, 72, 80, 91, 110, 135, 145, 141, 87, 77, 85, 96, 114, 140, 148, + 151, 92, 83, 88, 102, 117, 133, 153, 163, 98, 88, 89, 103, 121, 141, + 160, 169, 103, 94, 92, 103, 119, 137, 158, 175, + /* Size 16x8 */ + 32, 31, 32, 32, 34, 39, 44, 49, 57, 65, 71, 81, 87, 92, 98, 103, 32, 32, + 33, 34, 36, 39, 42, 46, 53, 59, 64, 72, 77, 83, 88, 94, 36, 35, 36, 38, + 44, 50, 53, 57, 63, 68, 73, 80, 85, 88, 89, 92, 44, 41, 42, 42, 50, 58, + 63, 67, 74, 79, 84, 91, 96, 102, 103, 103, 58, 54, 53, 52, 59, 68, 74, + 81, 90, 97, 102, 110, 114, 117, 121, 119, 79, 73, 71, 69, 75, 84, 90, + 97, 108, 118, 125, 135, 140, 133, 141, 137, 88, 81, 78, 76, 81, 88, 97, + 104, 111, 123, 135, 145, 148, 153, 160, 158, 93, 88, 84, 82, 84, 90, 97, + 105, 113, 122, 131, 141, 151, 163, 169, 175, + /* Size 16x32 */ + 32, 31, 32, 32, 36, 39, 44, 53, 58, 65, 79, 81, 88, 90, 93, 96, 31, 32, + 32, 32, 35, 38, 42, 51, 55, 62, 75, 77, 83, 86, 88, 91, 31, 32, 32, 32, + 35, 38, 41, 50, 54, 60, 73, 75, 81, 84, 88, 91, 31, 32, 32, 33, 34, 37, + 41, 49, 53, 59, 72, 74, 79, 82, 84, 87, 32, 32, 33, 34, 36, 39, 42, 50, + 53, 59, 71, 72, 78, 81, 84, 87, 32, 32, 34, 34, 37, 40, 42, 49, 53, 58, + 70, 71, 77, 80, 83, 85, 32, 33, 34, 35, 38, 40, 42, 49, 52, 58, 69, 70, + 76, 78, 82, 86, 34, 34, 35, 37, 42, 45, 48, 54, 57, 
63, 73, 75, 79, 79, + 81, 83, 34, 34, 36, 37, 44, 47, 50, 56, 59, 65, 75, 77, 81, 83, 84, 84, + 36, 34, 37, 38, 48, 51, 54, 60, 63, 68, 78, 80, 85, 85, 86, 89, 39, 37, + 39, 40, 50, 54, 58, 65, 68, 73, 84, 85, 88, 89, 90, 89, 40, 38, 40, 41, + 51, 55, 59, 67, 70, 75, 85, 87, 91, 92, 92, 95, 44, 41, 42, 43, 53, 58, + 63, 71, 74, 79, 90, 91, 97, 94, 97, 95, 47, 44, 45, 46, 56, 61, 66, 75, + 79, 85, 95, 97, 99, 101, 98, 102, 49, 46, 46, 47, 57, 62, 67, 77, 81, + 86, 97, 99, 104, 102, 105, 102, 53, 49, 50, 50, 60, 65, 71, 82, 86, 92, + 103, 105, 109, 108, 106, 110, 57, 53, 53, 53, 63, 68, 74, 86, 90, 97, + 108, 110, 111, 112, 113, 110, 59, 54, 54, 54, 64, 69, 75, 87, 91, 98, + 111, 112, 119, 117, 115, 118, 65, 60, 59, 58, 68, 73, 79, 92, 97, 105, + 118, 119, 123, 123, 122, 119, 69, 63, 62, 62, 71, 76, 83, 96, 100, 109, + 122, 124, 127, 125, 125, 128, 71, 65, 64, 63, 73, 78, 84, 97, 102, 111, + 125, 127, 135, 134, 131, 129, 79, 72, 71, 70, 79, 84, 90, 104, 109, 118, + 133, 135, 137, 136, 136, 137, 81, 74, 72, 71, 80, 85, 91, 105, 110, 120, + 135, 137, 145, 143, 141, 138, 82, 75, 73, 72, 81, 86, 92, 106, 111, 121, + 136, 139, 147, 148, 147, 149, 87, 79, 77, 76, 85, 90, 96, 110, 114, 125, + 140, 143, 148, 154, 151, 149, 90, 82, 80, 78, 87, 89, 99, 108, 113, 129, + 135, 146, 153, 157, 160, 159, 92, 84, 83, 81, 88, 90, 102, 106, 117, + 128, 133, 150, 153, 158, 163, 160, 95, 87, 85, 83, 88, 92, 103, 105, + 120, 125, 137, 148, 155, 164, 168, 173, 98, 89, 88, 85, 89, 95, 103, + 108, 121, 124, 141, 144, 160, 164, 169, 174, 100, 92, 91, 88, 90, 98, + 103, 111, 120, 127, 139, 146, 161, 165, 175, 179, 103, 94, 94, 90, 92, + 101, 103, 114, 119, 131, 137, 150, 158, 170, 175, 180, 106, 97, 97, 93, + 93, 104, 104, 118, 118, 135, 135, 154, 155, 175, 176, 187, + /* Size 32x16 */ + 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, + 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 31, 32, 32, + 32, 32, 32, 33, 34, 34, 34, 37, 38, 41, 44, 46, 49, 53, 54, 60, 63, 65, + 72, 74, 75, 79, 82, 84, 87, 89, 92, 94, 97, 32, 32, 32, 32, 33, 34, 34, + 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, 71, 72, 73, 77, + 80, 83, 85, 88, 91, 94, 97, 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 40, + 41, 43, 46, 47, 50, 53, 54, 58, 62, 63, 70, 71, 72, 76, 78, 81, 83, 85, + 88, 90, 93, 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57, + 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93, 39, + 38, 38, 37, 39, 40, 40, 45, 47, 51, 54, 55, 58, 61, 62, 65, 68, 69, 73, + 76, 78, 84, 85, 86, 90, 89, 90, 92, 95, 98, 101, 104, 44, 42, 41, 41, + 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, + 91, 92, 96, 99, 102, 103, 103, 103, 103, 104, 53, 51, 50, 49, 50, 49, + 49, 54, 56, 60, 65, 67, 71, 75, 77, 82, 86, 87, 92, 96, 97, 104, 105, + 106, 110, 108, 106, 105, 108, 111, 114, 118, 58, 55, 54, 53, 53, 53, 52, + 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, + 114, 113, 117, 120, 121, 120, 119, 118, 65, 62, 60, 59, 59, 58, 58, 63, + 65, 68, 73, 75, 79, 85, 86, 92, 97, 98, 105, 109, 111, 118, 120, 121, + 125, 129, 128, 125, 124, 127, 131, 135, 79, 75, 73, 72, 71, 70, 69, 73, + 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, + 140, 135, 133, 137, 141, 139, 137, 135, 81, 77, 75, 74, 72, 71, 70, 75, + 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 119, 124, 127, 135, 137, 139, + 143, 146, 150, 148, 144, 146, 150, 154, 88, 83, 81, 79, 78, 77, 76, 79, + 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 
145, + 147, 148, 153, 153, 155, 160, 161, 158, 155, 90, 86, 84, 82, 81, 80, 78, + 79, 83, 85, 89, 92, 94, 101, 102, 108, 112, 117, 123, 125, 134, 136, + 143, 148, 154, 157, 158, 164, 164, 165, 170, 175, 93, 88, 88, 84, 84, + 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131, + 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176, 96, 91, 91, 87, + 87, 85, 86, 83, 84, 89, 89, 95, 95, 102, 102, 110, 110, 118, 119, 128, + 129, 137, 138, 149, 149, 159, 160, 173, 174, 179, 180, 187, + /* Size 4x16 */ + 31, 39, 65, 90, 32, 38, 60, 84, 32, 39, 59, 81, 33, 40, 58, 78, 34, 47, + 65, 83, 37, 54, 73, 89, 41, 58, 79, 94, 46, 62, 86, 102, 53, 68, 97, + 112, 60, 73, 105, 123, 65, 78, 111, 134, 74, 85, 120, 143, 79, 90, 125, + 154, 84, 90, 128, 158, 89, 95, 124, 164, 94, 101, 131, 170, + /* Size 16x4 */ + 31, 32, 32, 33, 34, 37, 41, 46, 53, 60, 65, 74, 79, 84, 89, 94, 39, 38, + 39, 40, 47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 95, 101, 65, 60, 59, 58, + 65, 73, 79, 86, 97, 105, 111, 120, 125, 128, 124, 131, 90, 84, 81, 78, + 83, 89, 94, 102, 112, 123, 134, 143, 154, 158, 164, 170, + /* Size 8x32 */ + 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 42, 55, 75, 83, 88, 31, 32, + 35, 41, 54, 73, 81, 88, 31, 32, 34, 41, 53, 72, 79, 84, 32, 33, 36, 42, + 53, 71, 78, 84, 32, 34, 37, 42, 53, 70, 77, 83, 32, 34, 38, 42, 52, 69, + 76, 82, 34, 35, 42, 48, 57, 73, 79, 81, 34, 36, 44, 50, 59, 75, 81, 84, + 36, 37, 48, 54, 63, 78, 85, 86, 39, 39, 50, 58, 68, 84, 88, 90, 40, 40, + 51, 59, 70, 85, 91, 92, 44, 42, 53, 63, 74, 90, 97, 97, 47, 45, 56, 66, + 79, 95, 99, 98, 49, 46, 57, 67, 81, 97, 104, 105, 53, 50, 60, 71, 86, + 103, 109, 106, 57, 53, 63, 74, 90, 108, 111, 113, 59, 54, 64, 75, 91, + 111, 119, 115, 65, 59, 68, 79, 97, 118, 123, 122, 69, 62, 71, 83, 100, + 122, 127, 125, 71, 64, 73, 84, 102, 125, 135, 131, 79, 71, 79, 90, 109, + 133, 137, 136, 81, 72, 80, 91, 110, 135, 145, 141, 82, 73, 81, 92, 111, + 136, 147, 147, 87, 77, 85, 96, 114, 140, 148, 151, 90, 80, 87, 99, 113, + 135, 153, 160, 92, 83, 88, 102, 117, 133, 153, 163, 95, 85, 88, 103, + 120, 137, 155, 168, 98, 88, 89, 103, 121, 141, 160, 169, 100, 91, 90, + 103, 120, 139, 161, 175, 103, 94, 92, 103, 119, 137, 158, 175, 106, 97, + 93, 104, 118, 135, 155, 176, + /* Size 32x8 */ + 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, + 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 32, 32, 32, + 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, + 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97, 36, 35, 35, 34, 36, 37, 38, + 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, + 87, 88, 88, 89, 90, 92, 93, 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58, + 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103, + 103, 103, 103, 104, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, + 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120, + 121, 120, 119, 118, 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90, + 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137, + 141, 139, 137, 135, 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97, + 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, + 155, 160, 161, 158, 155, 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92, + 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, + 168, 169, 175, 175, 176 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 45, 53, 63, 45, 55, 62, 67, 53, 62, 80, 84, 63, 67, 84, 101, + /* Size 8x8 */ + 31, 36, 47, 48, 52, 60, 64, 67, 
36, 43, 47, 46, 49, 55, 59, 63, 47, 47, + 53, 54, 55, 60, 63, 64, 48, 46, 54, 61, 65, 70, 71, 71, 52, 49, 55, 65, + 71, 78, 81, 79, 60, 55, 60, 70, 78, 89, 89, 89, 64, 59, 63, 71, 81, 89, + 97, 99, 67, 63, 64, 71, 79, 89, 99, 104, + /* Size 16x16 */ + 32, 30, 33, 36, 44, 48, 49, 51, 54, 57, 60, 64, 67, 68, 70, 72, 30, 31, + 35, 39, 44, 46, 46, 47, 50, 53, 55, 59, 61, 64, 66, 68, 33, 35, 39, 43, + 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 36, 39, 43, 47, 47, 46, + 45, 46, 48, 50, 52, 55, 57, 58, 61, 63, 44, 44, 46, 47, 50, 51, 51, 51, + 53, 54, 56, 59, 61, 61, 63, 62, 48, 46, 46, 46, 51, 54, 55, 56, 58, 60, + 61, 64, 65, 64, 66, 66, 49, 46, 45, 45, 51, 55, 58, 60, 62, 63, 65, 68, + 69, 69, 69, 69, 51, 47, 47, 46, 51, 56, 60, 62, 65, 67, 69, 72, 73, 74, + 73, 73, 54, 50, 49, 48, 53, 58, 62, 65, 70, 73, 75, 78, 79, 79, 77, 77, + 57, 53, 51, 50, 54, 60, 63, 67, 73, 76, 79, 82, 84, 83, 82, 82, 60, 55, + 53, 52, 56, 61, 65, 69, 75, 79, 82, 86, 88, 87, 86, 87, 64, 59, 57, 55, + 59, 64, 68, 72, 78, 82, 86, 90, 93, 92, 91, 92, 67, 61, 59, 57, 61, 65, + 69, 73, 79, 84, 88, 93, 95, 96, 96, 96, 68, 64, 61, 58, 61, 64, 69, 74, + 79, 83, 87, 92, 96, 99, 100, 101, 70, 66, 63, 61, 63, 66, 69, 73, 77, + 82, 86, 91, 96, 100, 103, 104, 72, 68, 65, 63, 62, 66, 69, 73, 77, 82, + 87, 92, 96, 101, 104, 106, + /* Size 32x32 */ + 32, 31, 30, 30, 33, 35, 36, 41, 44, 49, 48, 48, 49, 50, 51, 52, 54, 55, + 57, 59, 60, 63, 64, 65, 67, 68, 68, 69, 70, 71, 72, 73, 31, 31, 31, 31, + 34, 36, 38, 42, 44, 47, 47, 47, 47, 48, 48, 50, 51, 52, 54, 56, 57, 60, + 61, 61, 63, 64, 65, 66, 67, 67, 68, 69, 30, 31, 31, 31, 35, 37, 39, 42, + 44, 47, 46, 46, 46, 47, 47, 48, 50, 51, 53, 54, 55, 58, 59, 60, 61, 63, + 64, 65, 66, 67, 68, 69, 30, 31, 31, 32, 35, 37, 40, 42, 44, 46, 45, 45, + 45, 46, 46, 47, 49, 50, 52, 53, 54, 57, 58, 58, 60, 61, 62, 63, 63, 64, + 65, 66, 33, 34, 35, 35, 39, 41, 43, 45, 46, 47, 46, 46, 45, 46, 47, 47, + 49, 49, 51, 53, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 35, 36, + 37, 37, 41, 43, 45, 46, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, + 53, 55, 56, 56, 58, 59, 60, 61, 62, 63, 64, 64, 36, 38, 39, 40, 43, 45, + 47, 47, 47, 48, 46, 46, 45, 46, 46, 47, 48, 48, 50, 51, 52, 54, 55, 55, + 57, 58, 58, 59, 61, 62, 63, 64, 41, 42, 42, 42, 45, 46, 47, 48, 49, 50, + 49, 49, 49, 50, 50, 50, 51, 52, 53, 54, 55, 57, 58, 58, 60, 60, 59, 59, + 60, 61, 61, 62, 44, 44, 44, 44, 46, 46, 47, 49, 50, 51, 51, 51, 51, 51, + 51, 52, 53, 53, 54, 56, 56, 59, 59, 59, 61, 61, 61, 62, 63, 62, 62, 62, + 49, 47, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, + 56, 58, 58, 60, 61, 61, 63, 63, 64, 63, 63, 64, 65, 66, 48, 47, 46, 45, + 46, 46, 46, 49, 51, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 61, 63, + 64, 64, 65, 65, 64, 65, 66, 66, 66, 66, 48, 47, 46, 45, 46, 46, 46, 49, + 51, 53, 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 63, 65, 65, 65, 66, 67, + 68, 67, 67, 67, 68, 69, 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56, + 58, 59, 60, 61, 62, 62, 63, 65, 65, 67, 68, 68, 69, 70, 69, 69, 69, 70, + 69, 69, 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61, 62, 63, + 64, 65, 66, 68, 68, 70, 71, 71, 72, 71, 71, 72, 71, 71, 71, 72, 51, 48, + 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62, 64, 65, 66, 67, 69, + 69, 71, 72, 72, 73, 74, 74, 72, 73, 74, 73, 73, 52, 50, 48, 47, 47, 47, + 47, 50, 52, 54, 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, + 77, 76, 75, 76, 76, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, + 58, 59, 62, 64, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 78, + 
77, 78, 77, 77, 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60, 62, 65, + 66, 68, 70, 71, 73, 75, 76, 78, 79, 79, 80, 81, 80, 80, 81, 79, 79, 81, + 57, 54, 53, 52, 51, 50, 50, 53, 54, 56, 60, 61, 63, 66, 67, 70, 73, 73, + 76, 78, 79, 82, 82, 83, 84, 83, 83, 83, 82, 83, 82, 81, 59, 56, 54, 53, + 53, 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80, 81, 84, + 85, 85, 86, 86, 86, 84, 85, 84, 84, 85, 60, 57, 55, 54, 53, 53, 52, 55, + 56, 58, 61, 63, 65, 68, 69, 72, 75, 76, 79, 81, 82, 85, 86, 86, 88, 88, + 87, 88, 86, 87, 87, 85, 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 63, 65, + 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 92, 89, 91, 89, 90, 89, + 88, 89, 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, + 78, 79, 82, 85, 86, 89, 90, 91, 93, 94, 92, 92, 91, 91, 92, 90, 65, 61, + 60, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85, + 86, 90, 91, 91, 93, 94, 95, 94, 94, 94, 93, 94, 67, 63, 61, 60, 59, 58, + 57, 60, 61, 63, 65, 66, 69, 72, 73, 77, 79, 80, 84, 86, 88, 92, 93, 93, + 95, 95, 96, 97, 96, 95, 96, 94, 68, 64, 63, 61, 60, 59, 58, 60, 61, 63, + 65, 67, 70, 71, 74, 76, 78, 81, 83, 86, 88, 89, 94, 94, 95, 97, 97, 98, + 99, 99, 97, 99, 68, 65, 64, 62, 61, 60, 58, 59, 61, 64, 64, 68, 69, 71, + 74, 75, 79, 80, 83, 86, 87, 91, 92, 95, 96, 97, 99, 99, 100, 100, 101, + 99, 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67, 69, 72, 72, 76, 78, + 80, 83, 84, 88, 89, 92, 94, 97, 98, 99, 101, 100, 102, 102, 104, 70, 67, + 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, + 86, 90, 91, 94, 96, 99, 100, 100, 103, 102, 104, 104, 71, 67, 67, 64, + 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74, 74, 78, 79, 83, 84, 87, 89, + 91, 94, 95, 99, 100, 102, 102, 104, 104, 106, 72, 68, 68, 65, 65, 64, + 63, 61, 62, 65, 66, 68, 69, 71, 73, 75, 77, 79, 82, 84, 87, 88, 92, 93, + 96, 97, 101, 102, 104, 104, 106, 106, 73, 69, 69, 66, 66, 64, 64, 62, + 62, 66, 66, 69, 69, 72, 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99, + 99, 104, 104, 106, 106, 108, + /* Size 4x8 */ + 31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71, 50, 59, + 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99, + /* Size 8x4 */ + 31, 38, 46, 46, 50, 57, 61, 65, 47, 46, 53, 56, 59, 64, 65, 67, 54, 50, + 57, 66, 74, 82, 85, 82, 64, 60, 62, 71, 79, 88, 97, 99, + /* Size 8x16 */ + 32, 34, 48, 49, 54, 63, 67, 69, 31, 36, 46, 46, 50, 58, 62, 65, 33, 40, + 47, 46, 49, 56, 59, 62, 37, 44, 47, 45, 48, 54, 57, 60, 44, 46, 51, 51, + 53, 59, 60, 61, 48, 46, 53, 56, 58, 64, 64, 64, 49, 45, 53, 58, 62, 67, + 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 54, 49, 55, 62, 70, 77, 77, 76, + 57, 51, 56, 64, 73, 82, 83, 81, 60, 53, 58, 65, 75, 85, 89, 85, 64, 57, + 61, 68, 78, 89, 93, 89, 66, 59, 63, 69, 79, 91, 94, 93, 68, 61, 63, 71, + 79, 87, 96, 98, 70, 63, 63, 70, 80, 89, 97, 100, 72, 65, 63, 69, 77, 86, + 95, 102, + /* Size 16x8 */ + 32, 31, 33, 37, 44, 48, 49, 51, 54, 57, 60, 64, 66, 68, 70, 72, 34, 36, + 40, 44, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 48, 46, 47, 47, + 51, 53, 53, 54, 55, 56, 58, 61, 63, 63, 63, 63, 49, 46, 46, 45, 51, 56, + 58, 60, 62, 64, 65, 68, 69, 71, 70, 69, 54, 50, 49, 48, 53, 58, 62, 65, + 70, 73, 75, 78, 79, 79, 80, 77, 63, 58, 56, 54, 59, 64, 67, 71, 77, 82, + 85, 89, 91, 87, 89, 86, 67, 62, 59, 57, 60, 64, 70, 73, 77, 83, 89, 93, + 94, 96, 97, 95, 69, 65, 62, 60, 61, 64, 68, 72, 76, 81, 85, 89, 93, 98, + 100, 102, + /* Size 16x32 */ + 32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68, 69, 69, 31, 31, + 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 
64, 65, 66, 31, 32, 36, 39, + 46, 46, 46, 48, 50, 53, 58, 59, 62, 63, 65, 66, 30, 32, 36, 40, 46, 45, + 45, 48, 49, 52, 57, 58, 60, 61, 62, 63, 33, 36, 40, 43, 47, 46, 46, 47, + 49, 51, 56, 57, 59, 60, 62, 63, 35, 38, 42, 45, 47, 46, 45, 47, 48, 50, + 55, 56, 58, 60, 61, 61, 37, 40, 44, 47, 47, 46, 45, 47, 48, 50, 54, 55, + 57, 58, 60, 61, 42, 43, 45, 47, 50, 50, 49, 50, 51, 53, 57, 58, 59, 58, + 59, 59, 44, 44, 46, 47, 51, 51, 51, 52, 53, 54, 59, 59, 60, 61, 61, 60, + 49, 46, 47, 48, 53, 53, 53, 54, 55, 57, 60, 61, 63, 62, 62, 63, 48, 46, + 46, 47, 53, 54, 56, 57, 58, 60, 64, 64, 64, 64, 64, 63, 48, 45, 46, 46, + 53, 55, 56, 58, 59, 61, 65, 65, 66, 66, 65, 66, 49, 45, 45, 46, 53, 56, + 58, 61, 62, 64, 67, 68, 70, 67, 68, 66, 50, 46, 46, 46, 54, 56, 59, 63, + 65, 66, 70, 71, 70, 71, 68, 70, 51, 47, 47, 47, 54, 57, 60, 64, 65, 68, + 71, 72, 73, 71, 72, 70, 52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75, + 76, 75, 73, 73, 54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77, + 76, 74, 54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79, 77, 78, + 57, 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82, 81, 78, 59, 54, + 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83, 82, 82, 60, 54, 53, 52, + 58, 62, 65, 72, 75, 79, 85, 86, 89, 87, 85, 82, 63, 57, 56, 55, 60, 64, + 67, 75, 77, 82, 89, 90, 90, 88, 87, 86, 64, 58, 57, 55, 61, 64, 68, 75, + 78, 82, 89, 90, 93, 91, 89, 87, 64, 59, 57, 56, 61, 65, 68, 75, 78, 83, + 90, 91, 94, 93, 92, 91, 66, 60, 59, 57, 63, 66, 69, 77, 79, 84, 91, 93, + 94, 95, 93, 91, 67, 61, 60, 58, 63, 65, 70, 75, 78, 85, 88, 93, 96, 97, + 97, 95, 68, 62, 61, 59, 63, 64, 71, 74, 79, 84, 87, 94, 96, 97, 98, 96, + 69, 63, 62, 60, 63, 65, 71, 72, 80, 82, 88, 93, 96, 99, 100, 101, 70, + 64, 63, 60, 63, 66, 70, 73, 80, 81, 89, 90, 97, 99, 100, 101, 71, 65, + 64, 61, 63, 67, 70, 74, 78, 82, 88, 90, 97, 99, 102, 103, 72, 65, 65, + 62, 63, 68, 69, 75, 77, 83, 86, 92, 95, 100, 102, 103, 73, 66, 66, 63, + 63, 69, 69, 76, 76, 84, 84, 93, 93, 101, 101, 105, + /* Size 32x16 */ + 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, + 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 31, 31, 32, 32, + 36, 38, 40, 43, 44, 46, 46, 45, 45, 46, 47, 48, 49, 50, 52, 54, 54, 57, + 58, 59, 60, 61, 62, 63, 64, 65, 65, 66, 34, 35, 36, 36, 40, 42, 44, 45, + 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, 57, 57, 59, 60, + 61, 62, 63, 64, 65, 66, 37, 38, 39, 40, 43, 45, 47, 47, 47, 48, 47, 46, + 46, 46, 47, 47, 48, 49, 50, 52, 52, 55, 55, 56, 57, 58, 59, 60, 60, 61, + 62, 63, 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54, + 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63, 48, 47, + 46, 45, 46, 46, 46, 50, 51, 53, 54, 55, 56, 56, 57, 57, 58, 59, 60, 61, + 62, 64, 64, 65, 66, 65, 64, 65, 66, 67, 68, 69, 49, 47, 46, 45, 46, 45, + 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, + 69, 70, 71, 71, 70, 70, 69, 69, 52, 50, 48, 48, 47, 47, 47, 50, 52, 54, + 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 75, 74, 72, + 73, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, + 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, + 57, 54, 53, 52, 51, 50, 50, 53, 54, 57, 60, 61, 64, 66, 68, 71, 73, 74, + 76, 78, 79, 82, 82, 83, 84, 85, 84, 82, 81, 82, 83, 84, 63, 60, 58, 57, + 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, + 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 64, 61, 59, 58, 57, 56, 55, 58, + 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 90, 
90, 91, 93, 93, + 94, 93, 90, 90, 92, 93, 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66, + 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97, + 95, 93, 68, 64, 63, 61, 60, 60, 58, 58, 61, 62, 64, 66, 67, 71, 71, 75, + 77, 79, 82, 83, 87, 88, 91, 93, 95, 97, 97, 99, 99, 99, 100, 101, 69, + 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81, + 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101, 69, 66, 66, 63, + 63, 61, 61, 59, 60, 63, 63, 66, 66, 70, 70, 73, 74, 78, 78, 82, 82, 86, + 87, 91, 91, 95, 96, 101, 101, 103, 103, 105, + /* Size 4x16 */ + 31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58, 44, 51, + 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71, 49, 58, 73, 77, + 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91, 60, 66, 84, 95, 62, 64, + 84, 97, 64, 66, 81, 99, 65, 68, 83, 100, + /* Size 16x4 */ + 31, 32, 36, 40, 44, 46, 45, 47, 49, 52, 54, 58, 60, 62, 64, 65, 48, 46, + 46, 46, 51, 54, 56, 57, 58, 60, 62, 64, 66, 64, 66, 68, 57, 53, 51, 50, + 54, 60, 64, 68, 73, 76, 79, 82, 84, 84, 81, 83, 68, 63, 60, 58, 61, 64, + 67, 71, 77, 82, 87, 91, 95, 97, 99, 100, + /* Size 8x32 */ + 32, 34, 48, 49, 54, 63, 67, 69, 31, 35, 47, 47, 51, 60, 63, 65, 31, 36, + 46, 46, 50, 58, 62, 65, 30, 36, 46, 45, 49, 57, 60, 62, 33, 40, 47, 46, + 49, 56, 59, 62, 35, 42, 47, 45, 48, 55, 58, 61, 37, 44, 47, 45, 48, 54, + 57, 60, 42, 45, 50, 49, 51, 57, 59, 59, 44, 46, 51, 51, 53, 59, 60, 61, + 49, 47, 53, 53, 55, 60, 63, 62, 48, 46, 53, 56, 58, 64, 64, 64, 48, 46, + 53, 56, 59, 65, 66, 65, 49, 45, 53, 58, 62, 67, 70, 68, 50, 46, 54, 59, + 65, 70, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 52, 47, 54, 61, 68, 75, + 76, 73, 54, 49, 55, 62, 70, 77, 77, 76, 54, 49, 55, 62, 70, 78, 81, 77, + 57, 51, 56, 64, 73, 82, 83, 81, 59, 52, 58, 65, 74, 84, 85, 82, 60, 53, + 58, 65, 75, 85, 89, 85, 63, 56, 60, 67, 77, 89, 90, 87, 64, 57, 61, 68, + 78, 89, 93, 89, 64, 57, 61, 68, 78, 90, 94, 92, 66, 59, 63, 69, 79, 91, + 94, 93, 67, 60, 63, 70, 78, 88, 96, 97, 68, 61, 63, 71, 79, 87, 96, 98, + 69, 62, 63, 71, 80, 88, 96, 100, 70, 63, 63, 70, 80, 89, 97, 100, 71, + 64, 63, 70, 78, 88, 97, 102, 72, 65, 63, 69, 77, 86, 95, 102, 73, 66, + 63, 69, 76, 84, 93, 101, + /* Size 32x8 */ + 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, + 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 34, 35, 36, 36, + 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, + 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 48, 47, 46, 46, 47, 47, 47, 50, + 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, + 63, 63, 63, 63, 63, 63, 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56, + 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70, + 69, 69, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68, + 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, 63, 60, + 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, + 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 67, 63, 62, 60, 59, 58, + 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, + 94, 96, 96, 96, 97, 97, 95, 93, 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, + 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100, + 100, 102, 102, 101 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 37, 58, 81, 37, 54, 72, 91, 58, 72, 102, 121, 81, 91, 121, 156, + /* Size 8x8 */ + 32, 32, 35, 42, 53, 68, 78, 90, 32, 33, 36, 42, 51, 64, 74, 84, 35, 36, + 46, 52, 60, 72, 80, 87, 42, 42, 52, 63, 73, 84, 
92, 98, 53, 51, 60, 73, + 86, 100, 109, 114, 68, 64, 72, 84, 100, 117, 128, 133, 78, 74, 80, 92, + 109, 128, 140, 155, 90, 84, 87, 98, 114, 133, 155, 168, + /* Size 16x16 */ + 32, 31, 31, 32, 34, 36, 41, 47, 54, 59, 65, 74, 82, 87, 92, 97, 31, 32, + 32, 32, 34, 35, 39, 45, 50, 55, 61, 69, 76, 81, 87, 92, 31, 32, 33, 33, + 35, 36, 40, 44, 49, 54, 59, 67, 73, 78, 83, 88, 32, 32, 33, 35, 37, 38, + 41, 45, 49, 53, 58, 65, 71, 75, 80, 86, 34, 34, 35, 37, 39, 42, 46, 50, + 54, 58, 63, 70, 76, 80, 84, 85, 36, 35, 36, 38, 42, 48, 52, 56, 60, 64, + 68, 75, 80, 85, 90, 91, 41, 39, 40, 41, 46, 52, 57, 62, 67, 71, 75, 83, + 88, 92, 95, 97, 47, 45, 44, 45, 50, 56, 62, 69, 75, 79, 84, 91, 97, 100, + 102, 104, 54, 50, 49, 49, 54, 60, 67, 75, 82, 87, 92, 100, 106, 110, + 109, 112, 59, 55, 54, 53, 58, 64, 71, 79, 87, 92, 98, 106, 112, 117, + 117, 121, 65, 61, 59, 58, 63, 68, 75, 84, 92, 98, 105, 114, 120, 125, + 126, 130, 74, 69, 67, 65, 70, 75, 83, 91, 100, 106, 114, 123, 131, 135, + 137, 140, 82, 76, 73, 71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144, + 148, 150, 87, 81, 78, 75, 80, 85, 92, 100, 110, 117, 125, 135, 144, 150, + 155, 162, 92, 87, 83, 80, 84, 90, 95, 102, 109, 117, 126, 137, 148, 155, + 162, 168, 97, 92, 88, 86, 85, 91, 97, 104, 112, 121, 130, 140, 150, 162, + 168, 174, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 54, 56, + 59, 64, 65, 71, 74, 80, 82, 83, 87, 90, 92, 95, 97, 100, 31, 32, 32, 32, + 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 51, 53, 56, 61, 62, 68, + 71, 76, 78, 78, 83, 85, 88, 90, 92, 95, 31, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 35, 38, 39, 42, 45, 45, 50, 52, 55, 60, 61, 67, 69, 74, 76, 77, + 81, 84, 87, 89, 92, 95, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, + 38, 41, 44, 44, 49, 51, 54, 58, 59, 65, 68, 72, 74, 75, 79, 81, 84, 86, + 88, 90, 31, 32, 32, 32, 33, 33, 33, 34, 35, 36, 36, 39, 40, 42, 44, 45, + 49, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 80, 83, 85, 88, 90, 31, 32, + 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 45, 45, 50, 51, 54, 58, + 59, 64, 67, 71, 73, 74, 78, 80, 82, 84, 86, 89, 32, 32, 32, 32, 33, 34, + 35, 36, 37, 38, 38, 40, 41, 42, 45, 46, 49, 51, 53, 57, 58, 63, 65, 69, + 71, 72, 75, 78, 80, 83, 86, 89, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39, + 40, 42, 43, 44, 47, 47, 51, 53, 55, 59, 60, 65, 67, 71, 73, 73, 77, 78, + 80, 82, 84, 86, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47, + 50, 51, 54, 56, 58, 62, 63, 68, 70, 74, 76, 76, 80, 82, 84, 85, 85, 86, + 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 59, 60, + 62, 66, 67, 72, 74, 78, 79, 80, 83, 84, 85, 87, 90, 92, 36, 35, 35, 34, + 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 60, 61, 64, 67, 68, 73, + 75, 79, 80, 81, 85, 87, 90, 91, 91, 92, 39, 38, 38, 37, 39, 39, 40, 42, + 45, 49, 50, 54, 55, 58, 60, 61, 65, 66, 69, 72, 73, 78, 80, 84, 86, 86, + 90, 91, 91, 92, 95, 97, 41, 40, 39, 38, 40, 40, 41, 43, 46, 50, 52, 55, + 57, 60, 62, 63, 67, 69, 71, 75, 75, 80, 83, 86, 88, 89, 92, 93, 95, 97, + 97, 98, 44, 42, 42, 41, 42, 42, 42, 44, 47, 52, 54, 58, 60, 63, 66, 67, + 71, 73, 75, 79, 79, 84, 86, 90, 92, 92, 96, 98, 98, 98, 101, 104, 47, + 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, 75, 77, 79, + 83, 84, 89, 91, 95, 97, 97, 100, 99, 102, 105, 104, 104, 48, 46, 45, 44, + 45, 45, 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, 76, 78, 80, 84, 85, 90, + 93, 96, 98, 99, 102, 106, 106, 105, 108, 111, 54, 51, 50, 49, 49, 50, + 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, + 104, 106, 106, 110, 108, 
109, 112, 112, 111, 56, 53, 52, 51, 51, 51, 51, + 53, 56, 60, 61, 66, 69, 73, 77, 78, 84, 86, 89, 93, 94, 100, 102, 106, + 108, 109, 112, 113, 115, 114, 116, 119, 59, 56, 55, 54, 54, 54, 53, 55, + 58, 62, 64, 69, 71, 75, 79, 80, 87, 89, 92, 97, 98, 103, 106, 110, 112, + 113, 117, 118, 117, 121, 121, 119, 64, 61, 60, 58, 58, 58, 57, 59, 62, + 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102, 103, 109, 112, 116, 118, + 119, 122, 121, 125, 123, 125, 128, 65, 62, 61, 59, 59, 59, 58, 60, 63, + 67, 68, 73, 75, 79, 84, 85, 92, 94, 98, 103, 105, 111, 114, 118, 120, + 121, 125, 129, 126, 129, 130, 129, 71, 68, 67, 65, 64, 64, 63, 65, 68, + 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111, 117, 120, 125, 127, + 128, 133, 130, 134, 133, 133, 137, 74, 71, 69, 68, 67, 67, 65, 67, 70, + 74, 75, 80, 83, 86, 91, 93, 100, 102, 106, 112, 114, 120, 123, 128, 131, + 131, 135, 137, 137, 138, 140, 137, 80, 76, 74, 72, 71, 71, 69, 71, 74, + 78, 79, 84, 86, 90, 95, 96, 104, 106, 110, 116, 118, 125, 128, 134, 136, + 137, 142, 141, 142, 143, 143, 147, 82, 78, 76, 74, 73, 73, 71, 73, 76, + 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, + 139, 144, 147, 148, 147, 150, 148, 83, 78, 77, 75, 74, 74, 72, 73, 76, + 80, 81, 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139, + 140, 145, 150, 152, 155, 152, 157, 87, 83, 81, 79, 78, 78, 75, 77, 80, + 83, 85, 90, 92, 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142, + 144, 145, 150, 151, 155, 158, 162, 158, 90, 85, 84, 81, 80, 80, 78, 78, + 82, 84, 87, 91, 93, 98, 99, 106, 108, 113, 118, 121, 129, 130, 137, 141, + 147, 150, 151, 156, 156, 161, 164, 169, 92, 88, 87, 84, 83, 82, 80, 80, + 84, 85, 90, 91, 95, 98, 102, 106, 109, 115, 117, 125, 126, 134, 137, + 142, 148, 152, 155, 156, 162, 162, 168, 170, 95, 90, 89, 86, 85, 84, 83, + 82, 85, 87, 91, 92, 97, 98, 105, 105, 112, 114, 121, 123, 129, 133, 138, + 143, 147, 155, 158, 161, 162, 168, 168, 174, 97, 92, 92, 88, 88, 86, 86, + 84, 85, 90, 91, 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133, + 140, 143, 150, 152, 162, 164, 168, 168, 174, 175, 100, 95, 95, 90, 90, + 89, 89, 86, 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129, + 137, 137, 147, 148, 157, 158, 169, 170, 174, 175, 181, + /* Size 4x8 */ + 32, 35, 59, 83, 32, 36, 57, 78, 34, 47, 65, 82, 41, 53, 78, 97, 51, 61, + 92, 111, 65, 73, 108, 129, 75, 81, 117, 148, 86, 92, 119, 154, + /* Size 8x4 */ + 32, 32, 34, 41, 51, 65, 75, 86, 35, 36, 47, 53, 61, 73, 81, 92, 59, 57, + 65, 78, 92, 108, 117, 119, 83, 78, 82, 97, 111, 129, 148, 154, + /* Size 8x16 */ + 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 34, 41, 50, 61, 76, 85, 31, 33, + 35, 42, 49, 59, 73, 81, 32, 34, 37, 42, 49, 58, 71, 79, 34, 35, 41, 48, + 54, 63, 76, 81, 36, 36, 46, 54, 60, 68, 80, 87, 41, 40, 49, 60, 67, 76, + 88, 93, 47, 44, 53, 66, 75, 84, 97, 101, 53, 50, 57, 71, 82, 92, 106, + 108, 58, 54, 61, 75, 87, 98, 112, 116, 65, 59, 66, 79, 92, 105, 120, + 124, 74, 67, 73, 86, 100, 113, 131, 134, 82, 73, 79, 92, 105, 120, 139, + 142, 87, 78, 83, 96, 110, 125, 144, 153, 92, 83, 84, 97, 114, 132, 150, + 157, 97, 88, 86, 97, 111, 128, 147, 163, + /* Size 16x8 */ + 32, 31, 31, 32, 34, 36, 41, 47, 53, 58, 65, 74, 82, 87, 92, 97, 31, 32, + 33, 34, 35, 36, 40, 44, 50, 54, 59, 67, 73, 78, 83, 88, 35, 34, 35, 37, + 41, 46, 49, 53, 57, 61, 66, 73, 79, 83, 84, 86, 44, 41, 42, 42, 48, 54, + 60, 66, 71, 75, 79, 86, 92, 96, 97, 97, 53, 50, 49, 49, 54, 60, 67, 75, + 82, 87, 92, 100, 105, 110, 114, 111, 65, 61, 59, 58, 63, 68, 76, 84, 92, + 98, 105, 113, 120, 125, 132, 128, 82, 
76, 73, 71, 76, 80, 88, 97, 106, + 112, 120, 131, 139, 144, 150, 147, 90, 85, 81, 79, 81, 87, 93, 101, 108, + 116, 124, 134, 142, 153, 157, 163, + /* Size 16x32 */ + 32, 31, 31, 32, 35, 36, 44, 47, 53, 62, 65, 79, 82, 88, 90, 93, 31, 32, + 32, 32, 35, 35, 42, 45, 51, 59, 62, 75, 78, 83, 86, 88, 31, 32, 32, 32, + 34, 35, 41, 45, 50, 58, 61, 74, 76, 82, 85, 88, 31, 32, 32, 33, 34, 34, + 41, 44, 49, 57, 59, 72, 74, 79, 82, 84, 31, 32, 33, 34, 35, 36, 42, 44, + 49, 57, 59, 71, 73, 79, 81, 84, 32, 32, 33, 34, 36, 36, 42, 45, 50, 57, + 59, 71, 73, 78, 80, 82, 32, 33, 34, 35, 37, 38, 42, 45, 49, 56, 58, 69, + 71, 76, 79, 83, 32, 33, 34, 36, 39, 40, 44, 47, 51, 58, 60, 71, 73, 76, + 78, 80, 34, 34, 35, 37, 41, 42, 48, 50, 54, 61, 63, 73, 76, 81, 81, 80, + 35, 34, 36, 38, 45, 47, 52, 55, 59, 65, 67, 77, 79, 82, 83, 86, 36, 34, + 36, 38, 46, 48, 54, 56, 60, 66, 68, 78, 80, 85, 87, 86, 39, 37, 39, 40, + 48, 50, 58, 60, 65, 71, 73, 84, 86, 89, 88, 91, 41, 39, 40, 41, 49, 51, + 60, 62, 67, 74, 76, 86, 88, 91, 93, 91, 44, 41, 42, 43, 51, 53, 63, 66, + 71, 78, 79, 90, 92, 97, 94, 97, 47, 44, 44, 45, 53, 56, 66, 69, 75, 82, + 84, 95, 97, 98, 101, 98, 48, 45, 45, 46, 54, 56, 67, 70, 76, 83, 85, 96, + 98, 104, 101, 105, 53, 49, 50, 50, 57, 60, 71, 75, 82, 90, 92, 103, 106, + 107, 108, 105, 55, 51, 51, 51, 59, 61, 72, 77, 84, 92, 94, 106, 108, + 111, 110, 112, 58, 54, 54, 54, 61, 63, 75, 79, 87, 95, 98, 110, 112, + 117, 116, 113, 63, 58, 58, 57, 65, 67, 78, 83, 91, 100, 103, 116, 118, + 119, 119, 121, 65, 60, 59, 58, 66, 68, 79, 84, 92, 102, 105, 118, 120, + 127, 124, 122, 71, 65, 64, 63, 71, 73, 84, 89, 97, 108, 111, 125, 127, + 129, 129, 130, 74, 68, 67, 66, 73, 75, 86, 91, 100, 110, 113, 128, 131, + 135, 134, 130, 79, 72, 71, 70, 77, 79, 90, 95, 104, 115, 118, 133, 136, + 140, 139, 140, 82, 75, 73, 72, 79, 81, 92, 97, 105, 117, 120, 136, 139, + 145, 142, 140, 82, 75, 74, 72, 79, 81, 92, 97, 106, 117, 121, 136, 139, + 148, 150, 149, 87, 79, 78, 76, 83, 85, 96, 100, 110, 120, 125, 141, 144, + 148, 153, 150, 89, 82, 81, 78, 83, 87, 97, 99, 113, 118, 128, 139, 145, + 153, 157, 161, 92, 84, 83, 80, 84, 89, 97, 101, 114, 116, 132, 135, 150, + 153, 157, 162, 94, 86, 85, 82, 85, 92, 97, 104, 112, 119, 130, 136, 151, + 154, 163, 166, 97, 88, 88, 85, 86, 94, 97, 107, 111, 123, 128, 140, 147, + 159, 163, 167, 99, 91, 91, 87, 87, 97, 97, 110, 110, 126, 126, 144, 144, + 163, 163, 173, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, + 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32, + 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 49, 51, 54, 58, 60, 65, + 68, 72, 75, 75, 79, 82, 84, 86, 88, 91, 31, 32, 32, 32, 33, 33, 34, 34, + 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74, + 78, 81, 83, 85, 88, 91, 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, + 41, 43, 45, 46, 50, 51, 54, 57, 58, 63, 66, 70, 72, 72, 76, 78, 80, 82, + 85, 87, 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, + 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87, 36, 35, + 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 60, 61, 63, 67, + 68, 73, 75, 79, 81, 81, 85, 87, 89, 92, 94, 97, 44, 42, 41, 41, 42, 42, + 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, + 92, 92, 96, 97, 97, 97, 97, 97, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, + 56, 60, 62, 66, 69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97, 100, 99, + 101, 104, 107, 110, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, + 71, 75, 76, 82, 84, 87, 
91, 92, 97, 100, 104, 105, 106, 110, 113, 114, + 112, 111, 110, 62, 59, 58, 57, 57, 57, 56, 58, 61, 65, 66, 71, 74, 78, + 82, 83, 90, 92, 95, 100, 102, 108, 110, 115, 117, 117, 120, 118, 116, + 119, 123, 126, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, + 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, + 130, 128, 126, 79, 75, 74, 72, 71, 71, 69, 71, 73, 77, 78, 84, 86, 90, + 95, 96, 103, 106, 110, 116, 118, 125, 128, 133, 136, 136, 141, 139, 135, + 136, 140, 144, 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, + 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, + 151, 147, 144, 88, 83, 82, 79, 79, 78, 76, 76, 81, 82, 85, 89, 91, 97, + 98, 104, 107, 111, 117, 119, 127, 129, 135, 140, 145, 148, 148, 153, + 153, 154, 159, 163, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, + 94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, + 157, 157, 163, 163, 163, 93, 88, 88, 84, 84, 82, 83, 80, 80, 86, 86, 91, + 91, 97, 98, 105, 105, 112, 113, 121, 122, 130, 130, 140, 140, 149, 150, + 161, 162, 166, 167, 173, + /* Size 4x16 */ + 31, 36, 62, 88, 32, 35, 58, 82, 32, 36, 57, 79, 33, 38, 56, 76, 34, 42, + 61, 81, 34, 48, 66, 85, 39, 51, 74, 91, 44, 56, 82, 98, 49, 60, 90, 107, + 54, 63, 95, 117, 60, 68, 102, 127, 68, 75, 110, 135, 75, 81, 117, 145, + 79, 85, 120, 148, 84, 89, 116, 153, 88, 94, 123, 159, + /* Size 16x4 */ + 31, 32, 32, 33, 34, 34, 39, 44, 49, 54, 60, 68, 75, 79, 84, 88, 36, 35, + 36, 38, 42, 48, 51, 56, 60, 63, 68, 75, 81, 85, 89, 94, 62, 58, 57, 56, + 61, 66, 74, 82, 90, 95, 102, 110, 117, 120, 116, 123, 88, 82, 79, 76, + 81, 85, 91, 98, 107, 117, 127, 135, 145, 148, 153, 159, + /* Size 8x32 */ + 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 35, 42, 51, 62, 78, 86, 31, 32, + 34, 41, 50, 61, 76, 85, 31, 32, 34, 41, 49, 59, 74, 82, 31, 33, 35, 42, + 49, 59, 73, 81, 32, 33, 36, 42, 50, 59, 73, 80, 32, 34, 37, 42, 49, 58, + 71, 79, 32, 34, 39, 44, 51, 60, 73, 78, 34, 35, 41, 48, 54, 63, 76, 81, + 35, 36, 45, 52, 59, 67, 79, 83, 36, 36, 46, 54, 60, 68, 80, 87, 39, 39, + 48, 58, 65, 73, 86, 88, 41, 40, 49, 60, 67, 76, 88, 93, 44, 42, 51, 63, + 71, 79, 92, 94, 47, 44, 53, 66, 75, 84, 97, 101, 48, 45, 54, 67, 76, 85, + 98, 101, 53, 50, 57, 71, 82, 92, 106, 108, 55, 51, 59, 72, 84, 94, 108, + 110, 58, 54, 61, 75, 87, 98, 112, 116, 63, 58, 65, 78, 91, 103, 118, + 119, 65, 59, 66, 79, 92, 105, 120, 124, 71, 64, 71, 84, 97, 111, 127, + 129, 74, 67, 73, 86, 100, 113, 131, 134, 79, 71, 77, 90, 104, 118, 136, + 139, 82, 73, 79, 92, 105, 120, 139, 142, 82, 74, 79, 92, 106, 121, 139, + 150, 87, 78, 83, 96, 110, 125, 144, 153, 89, 81, 83, 97, 113, 128, 145, + 157, 92, 83, 84, 97, 114, 132, 150, 157, 94, 85, 85, 97, 112, 130, 151, + 163, 97, 88, 86, 97, 111, 128, 147, 163, 99, 91, 87, 97, 110, 126, 144, + 163, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, + 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32, + 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, + 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, 35, 35, 34, 34, 35, 36, 37, 39, + 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, + 83, 83, 84, 85, 86, 87, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, + 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97, + 97, 97, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, + 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, 112, 111, + 110, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 
73, 76, 79, 84, 85, 92, + 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126, + 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, + 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, + 144, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101, + 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, + 163, 163 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 45, 51, 61, 45, 54, 59, 65, 51, 59, 75, 81, 61, 65, 81, 97, + /* Size 8x8 */ + 31, 34, 46, 47, 50, 57, 61, 65, 34, 39, 47, 45, 48, 53, 57, 61, 46, 47, + 52, 52, 54, 58, 61, 62, 47, 45, 52, 58, 62, 65, 68, 68, 50, 48, 54, 62, + 68, 73, 77, 76, 57, 53, 58, 65, 73, 82, 86, 86, 61, 57, 61, 68, 77, 86, + 91, 95, 65, 61, 62, 68, 76, 86, 95, 100, + /* Size 16x16 */ + 32, 31, 33, 36, 41, 49, 49, 50, 52, 54, 57, 61, 64, 67, 68, 70, 31, 31, + 34, 39, 42, 47, 46, 47, 49, 51, 53, 57, 60, 62, 64, 66, 33, 34, 37, 42, + 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 36, 39, 42, 47, 47, 48, + 46, 46, 47, 48, 50, 53, 55, 57, 59, 61, 41, 42, 44, 47, 48, 50, 49, 50, + 50, 52, 53, 56, 58, 60, 61, 60, 49, 47, 47, 48, 50, 53, 53, 54, 54, 55, + 56, 59, 61, 63, 64, 64, 49, 46, 46, 46, 49, 53, 55, 57, 59, 60, 61, 64, + 66, 67, 67, 67, 50, 47, 46, 46, 50, 54, 57, 61, 63, 64, 66, 69, 70, 72, + 71, 71, 52, 49, 47, 47, 50, 54, 59, 63, 66, 68, 70, 73, 75, 77, 75, 75, + 54, 51, 49, 48, 52, 55, 60, 64, 68, 71, 73, 76, 79, 80, 79, 79, 57, 53, + 51, 50, 53, 56, 61, 66, 70, 73, 76, 80, 82, 84, 83, 84, 61, 57, 55, 53, + 56, 59, 64, 69, 73, 76, 80, 84, 87, 89, 88, 88, 64, 60, 57, 55, 58, 61, + 66, 70, 75, 79, 82, 87, 91, 93, 93, 93, 67, 62, 59, 57, 60, 63, 67, 72, + 77, 80, 84, 89, 93, 95, 96, 97, 68, 64, 61, 59, 61, 64, 67, 71, 75, 79, + 83, 88, 93, 96, 99, 100, 70, 66, 63, 61, 60, 64, 67, 71, 75, 79, 84, 88, + 93, 97, 100, 102, + /* Size 32x32 */ + 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 52, 53, + 54, 56, 57, 60, 61, 63, 64, 65, 67, 67, 68, 69, 70, 71, 31, 31, 31, 31, + 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 50, 50, 52, 54, 54, 57, + 58, 60, 61, 61, 63, 64, 65, 65, 66, 67, 31, 31, 31, 31, 34, 35, 39, 40, + 42, 46, 47, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 56, 57, 59, 60, 60, + 62, 63, 64, 65, 66, 67, 30, 31, 31, 32, 34, 35, 40, 41, 42, 45, 46, 45, + 45, 45, 46, 46, 47, 48, 49, 51, 52, 54, 55, 57, 58, 58, 60, 61, 62, 62, + 63, 64, 33, 34, 34, 34, 37, 38, 42, 43, 44, 46, 47, 46, 46, 45, 46, 46, + 47, 48, 49, 51, 51, 53, 55, 56, 57, 57, 59, 60, 61, 62, 63, 64, 33, 34, + 35, 35, 38, 39, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, + 51, 53, 54, 56, 57, 57, 59, 60, 60, 61, 62, 62, 36, 38, 39, 40, 42, 43, + 47, 47, 47, 47, 48, 46, 46, 45, 46, 46, 47, 47, 48, 49, 50, 52, 53, 54, + 55, 55, 57, 58, 59, 60, 61, 62, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48, + 49, 48, 47, 47, 47, 47, 48, 49, 49, 51, 51, 53, 54, 55, 56, 56, 58, 58, + 58, 59, 60, 60, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49, + 50, 50, 50, 51, 52, 53, 53, 55, 56, 57, 58, 58, 60, 61, 61, 61, 60, 60, + 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 53, 54, + 55, 55, 56, 58, 58, 60, 60, 61, 62, 61, 61, 62, 63, 64, 49, 47, 47, 46, + 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 56, 58, + 59, 60, 61, 61, 63, 63, 64, 64, 64, 64, 48, 47, 46, 45, 46, 46, 46, 48, + 49, 52, 53, 54, 55, 55, 56, 56, 57, 58, 58, 59, 60, 61, 62, 63, 64, 64, + 66, 65, 65, 65, 66, 67, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55, + 55, 57, 57, 58, 59, 59, 60, 61, 61, 63, 64, 
65, 66, 66, 67, 67, 67, 68, + 67, 67, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58, 59, 60, + 61, 62, 62, 63, 63, 65, 66, 67, 68, 68, 69, 70, 69, 68, 69, 70, 50, 48, + 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, 63, 64, 64, 66, + 66, 68, 69, 70, 70, 71, 72, 70, 71, 72, 71, 70, 50, 48, 47, 46, 46, 46, + 46, 47, 50, 53, 54, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 68, 69, 71, + 71, 71, 73, 74, 73, 72, 73, 74, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, + 54, 57, 59, 61, 63, 63, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 75, + 75, 76, 75, 74, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58, 59, 62, + 64, 64, 67, 68, 69, 71, 71, 73, 74, 76, 77, 77, 78, 78, 78, 76, 77, 78, + 54, 52, 51, 49, 49, 49, 48, 49, 52, 55, 55, 58, 60, 62, 64, 65, 68, 69, + 71, 73, 73, 75, 76, 78, 79, 79, 80, 80, 79, 80, 79, 78, 56, 54, 53, 51, + 51, 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75, 76, 78, + 79, 81, 82, 82, 83, 81, 83, 81, 81, 82, 57, 54, 53, 52, 51, 51, 50, 51, + 53, 56, 56, 60, 61, 63, 66, 67, 70, 71, 73, 76, 76, 79, 80, 82, 82, 83, + 84, 85, 83, 84, 84, 82, 60, 57, 56, 54, 53, 53, 52, 53, 55, 58, 58, 61, + 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 83, 85, 86, 86, 88, 86, 87, 86, + 85, 86, 61, 58, 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69, + 73, 74, 76, 79, 80, 83, 84, 86, 87, 88, 89, 89, 88, 88, 88, 86, 63, 60, + 59, 57, 56, 56, 54, 55, 57, 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81, + 82, 85, 86, 89, 90, 90, 92, 91, 91, 90, 89, 91, 64, 61, 60, 58, 57, 57, + 55, 56, 58, 60, 61, 64, 66, 68, 70, 71, 75, 77, 79, 82, 82, 86, 87, 90, + 91, 91, 93, 93, 93, 92, 93, 91, 65, 61, 60, 58, 57, 57, 55, 56, 58, 61, + 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 88, 90, 91, 91, 93, 94, + 95, 95, 93, 95, 67, 63, 62, 60, 59, 59, 57, 58, 60, 62, 63, 66, 67, 69, + 72, 73, 77, 78, 80, 83, 84, 88, 89, 92, 93, 93, 95, 95, 96, 96, 97, 95, + 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65, 67, 70, 70, 74, 75, 78, + 80, 81, 85, 86, 89, 91, 93, 94, 95, 97, 97, 98, 98, 100, 68, 65, 64, 62, + 61, 60, 59, 58, 61, 61, 64, 65, 67, 69, 71, 73, 75, 78, 79, 83, 83, 87, + 88, 91, 93, 95, 96, 97, 99, 98, 100, 100, 69, 65, 65, 62, 62, 61, 60, + 59, 61, 62, 64, 65, 68, 68, 72, 72, 76, 76, 80, 81, 84, 86, 88, 90, 92, + 95, 96, 98, 98, 100, 100, 101, 70, 66, 66, 63, 63, 62, 61, 60, 60, 63, + 64, 66, 67, 69, 71, 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, + 100, 100, 102, 101, 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67, + 70, 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100, 101, + 101, 104, + /* Size 4x8 */ + 31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70, 49, 55, + 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95, + /* Size 8x4 */ + 31, 36, 46, 45, 49, 54, 59, 63, 47, 47, 52, 53, 55, 58, 61, 65, 53, 50, + 55, 63, 71, 77, 81, 80, 63, 59, 61, 70, 77, 86, 94, 95, + /* Size 8x16 */ + 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64, 33, 37, + 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59, 42, 44, 49, 49, + 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 57, 59, 61, + 66, 67, 50, 46, 52, 59, 63, 66, 71, 71, 52, 47, 53, 61, 66, 71, 75, 74, + 54, 49, 54, 62, 68, 73, 79, 79, 57, 51, 55, 64, 70, 76, 83, 83, 61, 55, + 58, 66, 73, 80, 87, 87, 64, 57, 60, 68, 75, 83, 91, 91, 66, 59, 61, 69, + 77, 84, 93, 95, 68, 61, 61, 68, 77, 86, 94, 97, 70, 63, 61, 67, 75, 83, + 92, 98, + /* Size 16x8 */ + 32, 31, 33, 37, 42, 49, 48, 50, 52, 54, 57, 61, 64, 66, 68, 70, 33, 34, + 37, 43, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 45, 45, 
46, 47, + 49, 52, 51, 52, 53, 54, 55, 58, 60, 61, 61, 61, 49, 46, 45, 45, 49, 53, + 57, 59, 61, 62, 64, 66, 68, 69, 68, 67, 52, 49, 47, 47, 50, 54, 59, 63, + 66, 68, 70, 73, 75, 77, 77, 75, 57, 53, 51, 50, 53, 57, 61, 66, 71, 73, + 76, 80, 83, 84, 86, 83, 64, 60, 57, 55, 58, 61, 66, 71, 75, 79, 83, 87, + 91, 93, 94, 92, 68, 64, 61, 59, 60, 63, 67, 71, 74, 79, 83, 87, 91, 95, + 97, 98, + /* Size 16x32 */ + 32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31, 31, + 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32, 34, 39, + 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35, 40, 44, 46, + 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42, 46, 47, 45, 46, + 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46, 47, 46, 46, 47, 50, + 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47, 45, 46, 47, 49, 50, 54, + 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47, 47, 48, 50, 51, 55, 56, 57, + 58, 59, 42, 43, 44, 47, 49, 50, 49, 50, 50, 53, 53, 57, 58, 60, 60, 59, + 47, 46, 46, 48, 51, 52, 53, 53, 53, 55, 56, 60, 61, 61, 61, 62, 49, 46, + 47, 48, 52, 53, 53, 54, 54, 56, 57, 60, 61, 63, 63, 62, 48, 46, 46, 47, + 51, 53, 56, 56, 57, 59, 60, 64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53, + 57, 57, 59, 61, 61, 65, 66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59, + 61, 63, 64, 67, 68, 70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65, + 66, 70, 71, 70, 71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71, + 71, 73, 71, 72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75, + 74, 72, 53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76, + 54, 50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51, + 51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51, 50, + 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52, 57, 58, + 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58, 59, 66, 69, + 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60, 67, 70, 75, 80, + 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68, 71, 75, 81, 83, 90, + 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71, 75, 81, 83, 90, 91, 94, + 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77, 82, 84, 92, 93, 94, 95, 93, + 67, 61, 60, 58, 61, 63, 69, 70, 78, 80, 85, 90, 93, 96, 97, 97, 68, 62, + 61, 59, 61, 64, 68, 71, 77, 79, 86, 88, 94, 96, 97, 98, 69, 63, 62, 59, + 61, 65, 68, 72, 76, 80, 85, 88, 94, 95, 99, 99, 70, 63, 63, 60, 61, 66, + 67, 73, 75, 81, 83, 89, 92, 97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74, + 74, 82, 82, 90, 90, 98, 98, 102, + /* Size 32x16 */ + 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, + 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 31, 31, 32, 32, + 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 48, 49, 50, 51, 52, 54, + 56, 57, 58, 59, 60, 61, 62, 63, 63, 64, 33, 34, 34, 35, 37, 38, 43, 43, + 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, + 46, 46, 46, 46, 47, 48, 49, 50, 50, 52, 53, 55, 56, 56, 57, 58, 59, 59, + 60, 61, 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, + 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61, 48, 47, + 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, + 56, 58, 59, 60, 61, 61, 63, 63, 64, 65, 66, 67, 49, 47, 46, 45, 45, 46, + 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, + 68, 68, 69, 69, 68, 68, 67, 67, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, + 54, 56, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 69, 70, 
71, 71, 72, 70, + 71, 72, 73, 74, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, + 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, + 56, 53, 52, 51, 50, 50, 49, 50, 53, 55, 56, 59, 61, 63, 65, 66, 70, 71, + 72, 74, 75, 77, 79, 80, 81, 81, 82, 80, 79, 80, 81, 82, 57, 54, 53, 52, + 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79, + 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 63, 60, 59, 57, 56, 56, 54, 55, + 57, 60, 60, 64, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, 90, 90, + 92, 90, 88, 88, 89, 90, 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, + 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94, + 92, 90, 67, 63, 62, 60, 60, 59, 57, 57, 60, 61, 63, 65, 66, 70, 70, 73, + 75, 77, 80, 81, 85, 86, 89, 91, 93, 94, 94, 96, 96, 95, 97, 98, 68, 64, + 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, + 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98, 98, 68, 65, 65, 62, 62, 60, + 61, 59, 59, 62, 62, 65, 65, 68, 68, 72, 72, 76, 76, 80, 80, 84, 84, 89, + 89, 93, 93, 97, 98, 99, 99, 102, + /* Size 4x16 */ + 31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57, 43, 50, + 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70, 48, 54, 70, 75, + 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89, 58, 61, 81, 93, 60, 63, + 82, 94, 62, 64, 79, 96, 63, 66, 81, 97, + /* Size 16x4 */ + 31, 32, 35, 40, 43, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 63, 48, 46, + 47, 47, 50, 53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 66, 56, 52, 50, 49, + 53, 56, 61, 65, 70, 72, 75, 79, 81, 82, 79, 81, 67, 62, 60, 57, 60, 63, + 66, 70, 75, 80, 85, 89, 93, 94, 96, 97, + /* Size 8x32 */ + 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64, 31, 34, + 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61, 33, 37, 46, 45, + 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60, 37, 43, 47, 45, 47, 50, + 55, 59, 39, 43, 48, 47, 48, 51, 56, 58, 42, 44, 49, 49, 50, 53, 58, 60, + 47, 46, 51, 53, 53, 56, 61, 61, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, + 51, 56, 57, 60, 64, 64, 48, 46, 51, 57, 59, 61, 66, 67, 49, 45, 51, 58, + 61, 64, 68, 67, 50, 46, 52, 59, 63, 66, 71, 71, 50, 46, 52, 59, 64, 67, + 71, 71, 52, 47, 53, 61, 66, 71, 75, 74, 53, 48, 53, 61, 67, 72, 77, 75, + 54, 49, 54, 62, 68, 73, 79, 79, 56, 51, 55, 63, 70, 76, 82, 80, 57, 51, + 55, 64, 70, 76, 83, 83, 60, 54, 57, 65, 72, 79, 86, 85, 61, 55, 58, 66, + 73, 80, 87, 87, 63, 56, 59, 67, 75, 82, 90, 89, 64, 57, 60, 68, 75, 83, + 91, 91, 64, 58, 60, 68, 75, 83, 91, 94, 66, 59, 61, 69, 77, 84, 93, 95, + 67, 60, 61, 69, 78, 85, 93, 97, 68, 61, 61, 68, 77, 86, 94, 97, 69, 62, + 61, 68, 76, 85, 94, 99, 70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67, + 74, 82, 90, 98, + /* Size 32x8 */ + 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, + 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 33, 34, 34, 35, + 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 45, 45, 45, 44, 46, 46, 47, 48, + 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, + 61, 61, 61, 61, 61, 61, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, + 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68, + 67, 67, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64, + 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, 57, 54, + 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, + 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 64, 61, 60, 58, 57, 57, 
+ 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, + 91, 91, 93, 93, 94, 94, 92, 90, 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, + 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97, + 97, 99, 98, 98 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 34, 53, 75, 34, 49, 64, 81, 53, 64, 91, 112, 75, 81, 112, 140, + /* Size 8x8 */ + 32, 32, 34, 39, 50, 62, 76, 84, 32, 33, 35, 40, 48, 59, 71, 79, 34, 35, + 39, 46, 53, 63, 74, 81, 39, 40, 46, 56, 65, 75, 86, 92, 50, 48, 53, 65, + 78, 90, 101, 106, 62, 59, 63, 75, 90, 105, 118, 123, 76, 71, 74, 86, + 101, 118, 134, 142, 84, 79, 81, 92, 106, 123, 142, 153, + /* Size 16x16 */ + 32, 31, 31, 32, 33, 36, 39, 44, 48, 54, 59, 66, 74, 81, 86, 91, 31, 32, + 32, 32, 33, 35, 38, 42, 46, 51, 56, 63, 70, 77, 81, 86, 31, 32, 32, 33, + 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 77, 82, 32, 32, 33, 34, 36, 37, + 40, 42, 45, 49, 53, 59, 66, 71, 75, 80, 33, 33, 34, 36, 38, 42, 44, 46, + 50, 53, 57, 63, 69, 74, 78, 80, 36, 35, 35, 37, 42, 48, 50, 54, 57, 60, + 64, 69, 75, 80, 84, 85, 39, 38, 38, 40, 44, 50, 54, 58, 61, 65, 69, 74, + 80, 85, 89, 91, 44, 42, 41, 42, 46, 54, 58, 63, 67, 71, 75, 80, 86, 91, + 95, 97, 48, 46, 45, 45, 50, 57, 61, 67, 71, 76, 80, 86, 93, 98, 101, + 104, 54, 51, 49, 49, 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 112, + 59, 56, 54, 53, 57, 64, 69, 75, 80, 87, 92, 99, 106, 112, 116, 120, 66, + 63, 60, 59, 63, 69, 74, 80, 86, 93, 99, 107, 115, 121, 125, 129, 74, 70, + 67, 66, 69, 75, 80, 86, 93, 100, 106, 115, 123, 130, 135, 138, 81, 77, + 73, 71, 74, 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148, 86, 81, + 77, 75, 78, 84, 89, 95, 101, 109, 116, 125, 135, 142, 147, 153, 91, 86, + 82, 80, 80, 85, 91, 97, 104, 112, 120, 129, 138, 148, 153, 159, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 32, 32, 33, 34, 36, 36, 39, 41, 44, 46, 48, 52, + 54, 58, 59, 65, 66, 71, 74, 80, 81, 83, 86, 89, 91, 93, 31, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 35, 35, 38, 39, 42, 44, 46, 50, 51, 56, 56, 62, + 63, 68, 71, 76, 77, 78, 82, 84, 86, 88, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 35, 35, 38, 39, 42, 44, 46, 49, 51, 55, 56, 61, 63, 67, 70, 75, + 77, 78, 81, 84, 86, 88, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, + 37, 38, 41, 42, 44, 48, 49, 53, 54, 59, 60, 65, 68, 72, 74, 75, 78, 80, + 82, 84, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, + 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 77, 80, 82, 84, 31, 32, + 32, 32, 33, 33, 33, 34, 35, 35, 36, 36, 39, 40, 42, 44, 45, 48, 50, 53, + 54, 59, 60, 64, 67, 71, 73, 74, 77, 79, 81, 83, 32, 32, 32, 32, 33, 33, + 34, 35, 36, 36, 37, 38, 40, 40, 42, 44, 45, 48, 49, 53, 53, 58, 59, 63, + 66, 70, 71, 72, 75, 78, 80, 83, 32, 32, 32, 32, 33, 34, 35, 35, 36, 37, + 38, 38, 40, 41, 42, 44, 46, 48, 49, 53, 53, 58, 59, 63, 65, 69, 71, 72, + 74, 77, 79, 80, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, + 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 74, 75, 78, 79, 80, 81, + 34, 34, 34, 33, 34, 35, 36, 37, 39, 39, 42, 43, 45, 46, 47, 49, 51, 53, + 54, 58, 58, 63, 64, 68, 70, 74, 75, 76, 79, 81, 84, 86, 36, 35, 35, 34, + 35, 36, 37, 38, 42, 42, 48, 48, 50, 51, 54, 55, 57, 59, 60, 63, 64, 68, + 69, 73, 75, 79, 80, 81, 84, 85, 85, 86, 36, 35, 35, 34, 35, 36, 38, 38, + 42, 43, 48, 49, 51, 52, 54, 55, 57, 59, 60, 64, 64, 68, 69, 73, 75, 79, + 80, 81, 84, 86, 88, 91, 39, 38, 38, 37, 38, 39, 40, 40, 44, 45, 50, 51, + 54, 55, 58, 59, 61, 64, 65, 68, 69, 73, 74, 78, 80, 84, 85, 86, 89, 91, + 91, 91, 41, 39, 39, 38, 39, 40, 40, 41, 45, 46, 51, 52, 55, 56, 59, 
61, + 63, 65, 67, 70, 70, 75, 76, 80, 82, 86, 87, 88, 91, 92, 94, 96, 44, 42, + 42, 41, 41, 42, 42, 42, 46, 47, 54, 54, 58, 59, 63, 65, 67, 70, 71, 75, + 75, 79, 80, 84, 86, 90, 91, 92, 95, 97, 97, 97, 46, 44, 44, 42, 43, 44, + 44, 44, 48, 49, 55, 55, 59, 61, 65, 67, 69, 72, 74, 77, 78, 82, 83, 87, + 89, 93, 94, 95, 98, 98, 100, 103, 48, 46, 46, 44, 45, 45, 45, 46, 50, + 51, 57, 57, 61, 63, 67, 69, 71, 74, 76, 80, 80, 85, 86, 90, 93, 96, 98, + 99, 101, 104, 104, 103, 52, 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, + 64, 65, 70, 72, 74, 78, 80, 84, 85, 90, 91, 95, 97, 101, 103, 104, 106, + 106, 107, 110, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, + 71, 74, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, + 112, 110, 58, 56, 55, 53, 53, 53, 53, 53, 57, 58, 63, 64, 68, 70, 75, + 77, 80, 84, 86, 91, 91, 97, 98, 103, 105, 110, 111, 112, 115, 114, 115, + 118, 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70, 75, 78, 80, + 85, 87, 91, 92, 98, 99, 103, 106, 110, 112, 113, 116, 119, 120, 119, 65, + 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, + 97, 98, 105, 106, 111, 114, 118, 120, 121, 124, 123, 123, 126, 66, 63, + 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, + 99, 106, 107, 112, 115, 119, 121, 122, 125, 128, 129, 126, 71, 68, 67, + 65, 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95, 97, 103, + 103, 111, 112, 117, 120, 125, 127, 128, 131, 132, 132, 135, 74, 71, 70, + 68, 67, 67, 66, 65, 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, + 106, 114, 115, 120, 123, 128, 130, 131, 135, 135, 138, 136, 80, 76, 75, + 72, 72, 71, 70, 69, 73, 74, 79, 79, 84, 86, 90, 93, 96, 101, 104, 110, + 110, 118, 119, 125, 128, 134, 136, 137, 140, 142, 140, 144, 81, 77, 77, + 74, 73, 73, 71, 71, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111, + 112, 120, 121, 127, 130, 136, 137, 139, 142, 145, 148, 144, 83, 78, 78, + 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88, 92, 95, 99, 104, 106, 112, + 113, 121, 122, 128, 131, 137, 139, 140, 144, 148, 150, 155, 86, 82, 81, + 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91, 95, 98, 101, 106, 109, 115, + 116, 124, 125, 131, 135, 140, 142, 144, 147, 149, 153, 155, 89, 84, 84, + 80, 80, 79, 78, 77, 79, 81, 85, 86, 91, 92, 97, 98, 104, 106, 112, 114, + 119, 123, 128, 132, 135, 142, 145, 148, 149, 153, 154, 159, 91, 86, 86, + 82, 82, 81, 80, 79, 80, 84, 85, 88, 91, 94, 97, 100, 104, 107, 112, 115, + 120, 123, 129, 132, 138, 140, 148, 150, 153, 154, 159, 159, 93, 88, 88, + 84, 84, 83, 83, 80, 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118, + 119, 126, 126, 135, 136, 144, 144, 155, 155, 159, 159, 164, + /* Size 4x8 */ + 32, 35, 51, 77, 32, 36, 50, 72, 34, 42, 54, 75, 38, 51, 67, 87, 48, 59, + 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144, + /* Size 8x4 */ + 32, 32, 34, 38, 48, 60, 72, 81, 35, 36, 42, 51, 59, 68, 79, 86, 51, 50, + 54, 67, 80, 92, 104, 112, 77, 72, 75, 87, 103, 119, 135, 144, + /* Size 8x16 */ + 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 61, 74, 82, 31, 32, + 34, 38, 47, 59, 71, 79, 32, 33, 36, 40, 48, 58, 69, 77, 33, 34, 38, 44, + 52, 62, 72, 78, 36, 35, 42, 51, 58, 68, 78, 84, 39, 38, 44, 54, 63, 73, + 84, 89, 44, 41, 46, 59, 69, 79, 90, 96, 48, 45, 50, 62, 74, 85, 96, 103, + 53, 49, 53, 66, 79, 92, 103, 111, 58, 54, 57, 70, 84, 98, 110, 118, 66, + 60, 63, 75, 90, 106, 119, 126, 74, 67, 69, 81, 97, 113, 128, 134, 81, + 73, 75, 86, 102, 120, 135, 143, 86, 78, 78, 90, 106, 124, 140, 147, 91, + 82, 80, 90, 103, 119, 137, 151, + /* Size 16x8 */ + 32, 31, 31, 
32, 33, 36, 39, 44, 48, 53, 58, 66, 74, 81, 86, 91, 31, 32, + 32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 78, 82, 33, 33, 34, 36, + 38, 42, 44, 46, 50, 53, 57, 63, 69, 75, 78, 80, 40, 39, 38, 40, 44, 51, + 54, 59, 62, 66, 70, 75, 81, 86, 90, 90, 51, 49, 47, 48, 52, 58, 63, 69, + 74, 79, 84, 90, 97, 102, 106, 103, 65, 61, 59, 58, 62, 68, 73, 79, 85, + 92, 98, 106, 113, 120, 124, 119, 79, 74, 71, 69, 72, 78, 84, 90, 96, + 103, 110, 119, 128, 135, 140, 137, 87, 82, 79, 77, 78, 84, 89, 96, 103, + 111, 118, 126, 134, 143, 147, 151, + /* Size 16x32 */ + 32, 31, 31, 32, 33, 36, 40, 44, 51, 53, 65, 66, 79, 81, 87, 90, 31, 32, + 32, 32, 33, 35, 39, 42, 49, 51, 62, 63, 75, 77, 83, 85, 31, 32, 32, 32, + 33, 35, 39, 42, 49, 51, 61, 62, 74, 76, 82, 85, 31, 32, 32, 33, 33, 34, + 38, 41, 47, 49, 59, 60, 72, 74, 79, 81, 31, 32, 32, 33, 34, 35, 38, 41, + 47, 49, 59, 60, 71, 73, 79, 81, 32, 32, 33, 34, 35, 36, 39, 42, 48, 50, + 59, 60, 71, 72, 78, 80, 32, 32, 33, 35, 36, 37, 40, 42, 48, 49, 58, 59, + 69, 71, 77, 80, 32, 33, 33, 35, 36, 38, 41, 42, 48, 49, 58, 59, 69, 70, + 75, 77, 33, 33, 34, 36, 38, 41, 44, 46, 52, 53, 62, 63, 72, 74, 78, 78, + 34, 34, 34, 37, 39, 42, 45, 48, 53, 54, 63, 64, 73, 75, 80, 83, 36, 34, + 35, 38, 42, 48, 51, 54, 58, 60, 68, 69, 78, 80, 84, 83, 36, 35, 35, 38, + 42, 48, 51, 54, 59, 60, 68, 69, 79, 80, 85, 87, 39, 37, 38, 40, 44, 50, + 54, 58, 63, 65, 73, 74, 84, 85, 89, 88, 40, 38, 39, 41, 45, 51, 56, 59, + 65, 67, 75, 76, 85, 87, 90, 93, 44, 41, 41, 43, 46, 53, 59, 63, 69, 71, + 79, 80, 90, 91, 96, 93, 46, 43, 43, 44, 48, 55, 60, 65, 72, 73, 82, 83, + 93, 94, 97, 100, 48, 45, 45, 46, 50, 56, 62, 67, 74, 76, 85, 86, 96, 98, + 103, 100, 52, 48, 48, 49, 52, 59, 65, 70, 78, 80, 90, 91, 101, 103, 105, + 107, 53, 49, 49, 50, 53, 60, 66, 71, 79, 82, 92, 93, 103, 105, 111, 107, + 58, 53, 53, 53, 57, 63, 69, 74, 83, 86, 97, 98, 109, 111, 113, 115, 58, + 54, 54, 54, 57, 63, 70, 75, 84, 87, 98, 99, 110, 112, 118, 115, 65, 60, + 59, 58, 62, 68, 74, 79, 89, 92, 105, 106, 118, 119, 122, 123, 66, 61, + 60, 59, 63, 69, 75, 80, 90, 93, 106, 107, 119, 121, 126, 123, 71, 65, + 65, 63, 67, 73, 79, 84, 94, 97, 111, 112, 125, 127, 131, 132, 74, 68, + 67, 66, 69, 75, 81, 86, 97, 100, 113, 115, 128, 130, 134, 132, 79, 72, + 72, 70, 73, 79, 85, 90, 101, 104, 118, 119, 133, 135, 141, 140, 81, 74, + 73, 71, 75, 80, 86, 91, 102, 105, 120, 121, 135, 137, 143, 140, 82, 75, + 74, 72, 75, 81, 87, 92, 103, 106, 121, 122, 136, 139, 147, 151, 86, 78, + 78, 75, 78, 84, 90, 95, 106, 109, 124, 125, 140, 142, 147, 151, 88, 81, + 80, 77, 80, 86, 90, 98, 105, 112, 122, 127, 140, 144, 152, 155, 91, 83, + 82, 79, 80, 88, 90, 100, 103, 114, 119, 130, 137, 148, 151, 155, 93, 85, + 85, 81, 81, 90, 90, 102, 103, 117, 117, 134, 134, 151, 152, 160, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, + 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 35, 37, 38, 41, 43, 45, 48, 49, 53, 54, 60, + 61, 65, 68, 72, 74, 75, 78, 81, 83, 85, 31, 32, 32, 32, 32, 33, 33, 33, + 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, + 73, 74, 78, 80, 82, 85, 32, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, + 40, 41, 43, 44, 46, 49, 50, 53, 54, 58, 59, 63, 66, 70, 71, 72, 75, 77, + 79, 81, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, + 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81, 36, 35, + 35, 34, 35, 36, 37, 38, 41, 42, 48, 48, 50, 51, 53, 55, 56, 59, 60, 63, + 63, 68, 69, 73, 75, 79, 80, 
81, 84, 86, 88, 90, 40, 39, 39, 38, 38, 39, + 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, + 81, 85, 86, 87, 90, 90, 90, 90, 44, 42, 42, 41, 41, 42, 42, 42, 46, 48, + 54, 54, 58, 59, 63, 65, 67, 70, 71, 74, 75, 79, 80, 84, 86, 90, 91, 92, + 95, 98, 100, 102, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, + 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, + 103, 103, 53, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71, + 73, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, 114, + 117, 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, + 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, + 66, 63, 62, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, + 93, 98, 99, 106, 107, 112, 115, 119, 121, 122, 125, 127, 130, 134, 79, + 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, + 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134, + 81, 77, 76, 74, 73, 72, 71, 70, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, + 105, 111, 112, 119, 121, 127, 130, 135, 137, 139, 142, 144, 148, 151, + 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, + 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, + 152, 90, 85, 85, 81, 81, 80, 80, 77, 78, 83, 83, 87, 88, 93, 93, 100, + 100, 107, 107, 115, 115, 123, 123, 132, 132, 140, 140, 151, 151, 155, + 155, 160, + /* Size 4x16 */ + 31, 36, 53, 81, 32, 35, 51, 76, 32, 35, 49, 73, 32, 37, 49, 71, 33, 41, + 53, 74, 34, 48, 60, 80, 37, 50, 65, 85, 41, 53, 71, 91, 45, 56, 76, 98, + 49, 60, 82, 105, 54, 63, 87, 112, 61, 69, 93, 121, 68, 75, 100, 130, 74, + 80, 105, 137, 78, 84, 109, 142, 83, 88, 114, 148, + /* Size 16x4 */ + 31, 32, 32, 32, 33, 34, 37, 41, 45, 49, 54, 61, 68, 74, 78, 83, 36, 35, + 35, 37, 41, 48, 50, 53, 56, 60, 63, 69, 75, 80, 84, 88, 53, 51, 49, 49, + 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 114, 81, 76, 73, 71, 74, + 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148, + /* Size 8x32 */ + 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 62, 75, 83, 31, 32, + 33, 39, 49, 61, 74, 82, 31, 32, 33, 38, 47, 59, 72, 79, 31, 32, 34, 38, + 47, 59, 71, 79, 32, 33, 35, 39, 48, 59, 71, 78, 32, 33, 36, 40, 48, 58, + 69, 77, 32, 33, 36, 41, 48, 58, 69, 75, 33, 34, 38, 44, 52, 62, 72, 78, + 34, 34, 39, 45, 53, 63, 73, 80, 36, 35, 42, 51, 58, 68, 78, 84, 36, 35, + 42, 51, 59, 68, 79, 85, 39, 38, 44, 54, 63, 73, 84, 89, 40, 39, 45, 56, + 65, 75, 85, 90, 44, 41, 46, 59, 69, 79, 90, 96, 46, 43, 48, 60, 72, 82, + 93, 97, 48, 45, 50, 62, 74, 85, 96, 103, 52, 48, 52, 65, 78, 90, 101, + 105, 53, 49, 53, 66, 79, 92, 103, 111, 58, 53, 57, 69, 83, 97, 109, 113, + 58, 54, 57, 70, 84, 98, 110, 118, 65, 59, 62, 74, 89, 105, 118, 122, 66, + 60, 63, 75, 90, 106, 119, 126, 71, 65, 67, 79, 94, 111, 125, 131, 74, + 67, 69, 81, 97, 113, 128, 134, 79, 72, 73, 85, 101, 118, 133, 141, 81, + 73, 75, 86, 102, 120, 135, 143, 82, 74, 75, 87, 103, 121, 136, 147, 86, + 78, 78, 90, 106, 124, 140, 147, 88, 80, 80, 90, 105, 122, 140, 152, 91, + 82, 80, 90, 103, 119, 137, 151, 93, 85, 81, 90, 103, 117, 134, 152, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, + 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32, + 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, + 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, 33, 33, 33, 33, 34, 35, 36, 36, + 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, + 75, 75, 78, 80, 
80, 81, 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51, + 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90, + 90, 90, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72, + 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103, + 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, + 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, 79, + 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, + 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134, + 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, + 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, + 152 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 46, 49, 58, 46, 53, 55, 62, 49, 55, 70, 78, 58, 62, 78, 91, + /* Size 8x8 */ + 31, 34, 42, 47, 49, 54, 60, 64, 34, 39, 45, 46, 47, 51, 56, 59, 42, 45, + 48, 49, 50, 53, 57, 60, 47, 46, 49, 55, 58, 61, 65, 66, 49, 47, 50, 58, + 65, 69, 73, 74, 54, 51, 53, 61, 69, 76, 82, 83, 60, 56, 57, 65, 73, 82, + 89, 92, 64, 59, 60, 66, 74, 83, 92, 96, + /* Size 16x16 */ + 32, 31, 31, 35, 40, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 31, 31, + 32, 37, 41, 47, 47, 46, 48, 49, 51, 54, 57, 60, 62, 64, 31, 32, 34, 39, + 43, 46, 46, 45, 46, 47, 49, 52, 55, 57, 59, 61, 35, 37, 39, 44, 46, 47, + 46, 45, 46, 47, 48, 51, 53, 56, 57, 59, 40, 41, 43, 46, 48, 50, 49, 48, + 49, 49, 51, 53, 55, 57, 59, 59, 49, 47, 46, 47, 50, 53, 53, 53, 54, 54, + 55, 57, 59, 61, 62, 62, 48, 47, 46, 46, 49, 53, 54, 55, 56, 57, 58, 60, + 62, 64, 65, 65, 49, 46, 45, 45, 48, 53, 55, 58, 60, 61, 62, 64, 66, 68, + 69, 69, 50, 48, 46, 46, 49, 54, 56, 60, 61, 63, 65, 67, 69, 71, 72, 72, + 52, 49, 47, 47, 49, 54, 57, 61, 63, 66, 68, 71, 73, 75, 76, 77, 54, 51, + 49, 48, 51, 55, 58, 62, 65, 68, 71, 74, 76, 78, 80, 81, 57, 54, 52, 51, + 53, 57, 60, 64, 67, 71, 74, 77, 80, 83, 84, 85, 61, 57, 55, 53, 55, 59, + 62, 66, 69, 73, 76, 80, 84, 87, 89, 89, 64, 60, 57, 56, 57, 61, 64, 68, + 71, 75, 78, 83, 87, 90, 92, 94, 66, 62, 59, 57, 59, 62, 65, 69, 72, 76, + 80, 84, 89, 92, 94, 96, 68, 64, 61, 59, 59, 62, 65, 69, 72, 77, 81, 85, + 89, 94, 96, 98, + /* Size 32x32 */ + 32, 31, 31, 30, 31, 33, 35, 36, 40, 41, 49, 49, 48, 48, 49, 50, 50, 52, + 52, 54, 54, 57, 57, 60, 61, 63, 64, 65, 66, 67, 68, 69, 31, 31, 31, 31, + 32, 34, 37, 38, 41, 42, 47, 47, 47, 47, 47, 47, 48, 49, 50, 52, 52, 54, + 55, 57, 58, 60, 61, 61, 63, 64, 64, 65, 31, 31, 31, 31, 32, 35, 37, 39, + 41, 42, 47, 47, 47, 46, 46, 47, 48, 49, 49, 51, 51, 54, 54, 56, 57, 59, + 60, 61, 62, 63, 64, 65, 30, 31, 31, 32, 33, 35, 38, 40, 42, 42, 46, 46, + 45, 45, 45, 45, 46, 47, 47, 49, 49, 52, 52, 54, 55, 57, 58, 58, 60, 61, + 61, 62, 31, 32, 32, 33, 34, 37, 39, 41, 43, 43, 46, 46, 46, 45, 45, 46, + 46, 47, 47, 49, 49, 51, 52, 54, 55, 57, 57, 58, 59, 60, 61, 62, 33, 34, + 35, 35, 37, 39, 41, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 47, 47, 49, + 49, 51, 51, 53, 54, 56, 57, 57, 58, 59, 60, 61, 35, 37, 37, 38, 39, 41, + 44, 46, 46, 46, 47, 47, 46, 46, 45, 46, 46, 47, 47, 48, 48, 50, 51, 52, + 53, 55, 56, 56, 57, 58, 59, 61, 36, 38, 39, 40, 41, 43, 46, 47, 47, 47, + 48, 47, 46, 46, 45, 46, 46, 46, 47, 48, 48, 50, 50, 52, 53, 54, 55, 55, + 56, 57, 58, 58, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 49, 49, 49, + 48, 49, 49, 49, 49, 51, 51, 52, 53, 54, 55, 57, 57, 58, 59, 59, 59, 59, + 41, 42, 42, 42, 43, 45, 46, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50, + 50, 52, 52, 53, 53, 55, 56, 57, 58, 58, 59, 60, 61, 62, 49, 47, 47, 46, + 46, 47, 47, 48, 50, 50, 53, 
53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, + 57, 58, 59, 60, 61, 61, 62, 62, 62, 62, 49, 47, 47, 46, 46, 47, 47, 47, + 49, 50, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 56, 57, 57, 59, 59, 61, + 61, 62, 63, 63, 64, 65, 48, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, + 54, 54, 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 62, 63, 64, 64, 65, 66, + 65, 65, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, + 57, 58, 58, 59, 60, 61, 61, 63, 63, 65, 65, 65, 66, 66, 67, 68, 49, 47, + 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56, 58, 59, 60, 61, 61, 62, + 62, 63, 64, 65, 66, 67, 68, 68, 69, 70, 69, 68, 50, 47, 47, 45, 46, 46, + 46, 46, 49, 49, 54, 54, 56, 57, 59, 60, 60, 62, 62, 63, 64, 65, 65, 67, + 68, 69, 69, 70, 70, 70, 71, 71, 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, + 54, 54, 56, 57, 60, 60, 61, 63, 63, 65, 65, 67, 67, 68, 69, 71, 71, 71, + 72, 73, 72, 71, 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, + 61, 62, 63, 65, 65, 67, 67, 69, 70, 71, 72, 73, 74, 74, 75, 74, 74, 75, + 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65, + 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 77, 75, 54, 52, 51, 49, + 49, 49, 48, 48, 51, 52, 55, 55, 58, 59, 62, 63, 65, 67, 68, 70, 70, 73, + 73, 75, 76, 78, 78, 78, 79, 78, 78, 79, 54, 52, 51, 49, 49, 49, 48, 48, + 51, 52, 55, 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 73, 74, 75, 76, 78, + 78, 79, 80, 81, 81, 79, 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 56, 57, + 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 83, + 82, 83, 57, 55, 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, + 67, 70, 71, 73, 74, 77, 77, 79, 80, 82, 83, 83, 84, 85, 85, 83, 60, 57, + 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61, 63, 65, 67, 68, 71, 72, 75, + 75, 79, 79, 82, 83, 85, 86, 86, 87, 87, 86, 87, 61, 58, 57, 55, 55, 54, + 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80, 83, + 84, 86, 87, 88, 89, 89, 89, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, + 60, 61, 63, 65, 67, 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 89, 90, + 91, 92, 90, 91, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, + 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 89, 90, 91, 92, 93, 94, 91, + 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64, 65, 68, 70, 71, 74, + 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, 93, 94, 94, 96, 66, 63, 62, 60, + 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76, 79, 80, 84, + 84, 87, 89, 91, 92, 93, 94, 94, 96, 96, 67, 64, 63, 61, 60, 59, 58, 57, + 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87, 89, 92, + 93, 94, 94, 96, 96, 97, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62, 64, + 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94, 96, 96, + 98, 97, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68, 68, 71, + 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97, 99, + /* Size 4x8 */ + 31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65, 47, 54, + 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93, + /* Size 8x4 */ + 31, 36, 43, 45, 47, 52, 57, 61, 47, 47, 50, 53, 54, 56, 60, 63, 50, 47, + 50, 58, 66, 70, 75, 77, 61, 57, 58, 65, 74, 82, 90, 93, + /* Size 8x16 */ + 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63, 31, 35, + 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58, 41, 43, 48, 49, + 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62, 48, 46, 49, 54, 57, 60, + 64, 65, 49, 45, 48, 56, 61, 64, 67, 69, 50, 46, 49, 57, 63, 67, 71, 73, + 52, 48, 50, 58, 65, 71, 75, 77, 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, + 53, 61, 69, 77, 
82, 85, 61, 55, 56, 63, 72, 80, 86, 88, 64, 58, 58, 65, + 73, 82, 89, 92, 66, 59, 59, 66, 75, 84, 91, 94, 68, 61, 59, 65, 72, 81, + 89, 95, + /* Size 16x8 */ + 32, 31, 31, 35, 41, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 32, 33, + 35, 39, 43, 47, 46, 45, 46, 48, 50, 52, 55, 58, 59, 61, 40, 41, 43, 46, + 48, 50, 49, 48, 49, 50, 51, 53, 56, 58, 59, 59, 49, 47, 46, 46, 49, 53, + 54, 56, 57, 58, 59, 61, 63, 65, 66, 65, 51, 49, 47, 47, 49, 54, 57, 61, + 63, 65, 67, 69, 72, 73, 75, 72, 57, 54, 51, 50, 52, 57, 60, 64, 67, 71, + 73, 77, 80, 82, 84, 81, 63, 59, 57, 55, 57, 60, 64, 67, 71, 75, 78, 82, + 86, 89, 91, 89, 67, 63, 60, 58, 59, 62, 65, 69, 73, 77, 81, 85, 88, 92, + 94, 95, + /* Size 16x32 */ + 32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31, 31, + 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31, 33, 38, + 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33, 40, 42, 46, + 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41, 43, 46, 46, 45, + 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44, 47, 46, 46, 47, 47, + 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47, 46, 45, 47, 47, 50, 51, + 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46, 45, 46, 47, 50, 50, 54, 55, + 57, 58, 41, 42, 43, 47, 48, 49, 49, 48, 49, 50, 52, 53, 57, 57, 59, 58, + 42, 43, 43, 47, 48, 50, 49, 49, 50, 50, 53, 54, 57, 58, 60, 61, 49, 46, + 47, 48, 50, 53, 53, 53, 54, 54, 57, 57, 60, 61, 62, 61, 49, 46, 47, 48, + 50, 53, 53, 54, 54, 55, 57, 57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53, + 54, 56, 57, 57, 60, 60, 64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56, + 58, 58, 61, 61, 65, 65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61, + 64, 64, 67, 68, 69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66, + 69, 69, 70, 70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71, + 73, 71, 51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74, + 52, 48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50, + 49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50, 49, + 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50, 52, 56, + 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53, 57, 61, 64, + 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58, 62, 65, 71, 72, + 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63, 66, 72, 73, 80, 81, + 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67, 73, 75, 82, 82, 89, 90, + 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73, 75, 82, 83, 89, 90, 92, 90, + 64, 59, 58, 56, 58, 61, 65, 68, 74, 75, 83, 83, 90, 91, 94, 95, 66, 60, + 59, 57, 59, 62, 66, 69, 75, 76, 84, 85, 91, 92, 94, 95, 67, 61, 60, 58, + 59, 63, 66, 70, 74, 77, 82, 85, 91, 93, 96, 96, 68, 62, 61, 58, 59, 64, + 65, 71, 72, 78, 81, 86, 89, 94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71, + 71, 79, 79, 87, 87, 95, 95, 98, + /* Size 32x16 */ + 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, + 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 31, 31, 31, 32, + 33, 36, 38, 40, 42, 43, 46, 46, 46, 45, 45, 46, 46, 47, 48, 50, 50, 52, + 52, 54, 56, 57, 58, 59, 60, 61, 62, 62, 32, 33, 33, 33, 35, 37, 39, 41, + 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57, + 58, 58, 59, 60, 61, 62, 37, 38, 38, 40, 41, 43, 45, 47, 47, 47, 48, 48, + 47, 46, 46, 46, 46, 47, 47, 48, 49, 50, 51, 52, 53, 55, 55, 56, 57, 58, + 58, 59, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49, + 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59, 48, 47, + 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 53, 53, 53, 53, 
54, 54, 54, 55, + 55, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65, 49, 47, 47, 45, 46, 46, + 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, + 63, 64, 65, 65, 66, 66, 65, 65, 49, 47, 47, 45, 45, 46, 45, 45, 48, 49, + 53, 54, 56, 56, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 66, 67, 68, 68, + 69, 70, 71, 71, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, + 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, + 52, 50, 49, 48, 48, 47, 47, 47, 50, 50, 54, 55, 57, 58, 61, 62, 64, 66, + 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 78, 79, 57, 54, 54, 52, + 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76, + 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 58, 55, 54, 52, 52, 52, 51, 50, + 53, 54, 57, 57, 60, 61, 64, 66, 67, 70, 71, 73, 74, 77, 77, 79, 81, 82, + 83, 83, 85, 85, 86, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, + 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91, 91, + 89, 87, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, + 71, 74, 75, 78, 78, 82, 83, 86, 87, 90, 90, 91, 92, 93, 94, 95, 67, 63, + 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, + 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95, 67, 64, 64, 61, 61, 60, + 60, 58, 58, 61, 61, 64, 64, 67, 67, 70, 71, 74, 74, 78, 78, 82, 82, 86, + 86, 90, 90, 95, 95, 96, 96, 98, + /* Size 4x16 */ + 31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56, 42, 49, + 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68, 46, 54, 64, 71, + 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83, 56, 59, 73, 87, 58, 61, + 75, 90, 60, 62, 76, 92, 62, 64, 78, 94, + /* Size 16x4 */ + 31, 31, 33, 38, 42, 46, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 48, 47, + 46, 47, 49, 53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 64, 52, 49, 48, 47, + 50, 54, 57, 61, 64, 66, 68, 71, 73, 75, 76, 78, 64, 60, 57, 56, 57, 61, + 64, 68, 71, 75, 78, 83, 87, 90, 92, 94, + /* Size 8x32 */ + 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63, 31, 33, + 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60, 31, 35, 43, 46, + 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59, 35, 39, 46, 46, 47, 50, + 55, 58, 37, 41, 47, 46, 46, 50, 54, 57, 41, 43, 48, 49, 49, 52, 57, 59, + 42, 43, 48, 49, 50, 53, 57, 60, 49, 47, 50, 53, 54, 57, 60, 62, 49, 47, + 50, 53, 54, 57, 61, 63, 48, 46, 49, 54, 57, 60, 64, 65, 48, 46, 49, 55, + 58, 61, 65, 66, 49, 45, 48, 56, 61, 64, 67, 69, 49, 46, 49, 57, 62, 65, + 69, 70, 50, 46, 49, 57, 63, 67, 71, 73, 51, 47, 49, 58, 64, 69, 73, 74, + 52, 48, 50, 58, 65, 71, 75, 77, 54, 49, 51, 59, 67, 73, 77, 78, 54, 50, + 51, 59, 67, 73, 78, 81, 57, 52, 52, 60, 69, 76, 82, 83, 57, 52, 53, 61, + 69, 77, 82, 85, 60, 54, 55, 62, 71, 79, 85, 87, 61, 55, 56, 63, 72, 80, + 86, 88, 63, 57, 57, 64, 73, 82, 89, 92, 64, 58, 58, 65, 73, 82, 89, 92, + 64, 58, 58, 65, 74, 83, 90, 94, 66, 59, 59, 66, 75, 84, 91, 94, 67, 60, + 59, 66, 74, 82, 91, 96, 68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65, + 71, 79, 87, 95, + /* Size 32x8 */ + 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, + 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 32, 33, 33, 33, + 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, + 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, 40, 41, 41, 42, 43, 44, 46, 47, + 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, + 58, 58, 59, 59, 59, 59, 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, + 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66, 
+ 65, 65, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, + 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, 57, 54, + 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, + 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 63, 60, 59, 57, 57, 56, + 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, + 86, 89, 89, 90, 91, 91, 89, 87, 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, + 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94, + 94, 96, 95, 95 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 34, 49, 72, 34, 48, 60, 79, 49, 60, 82, 104, 72, 79, 104, 134, + /* Size 8x8 */ + 32, 32, 34, 38, 46, 56, 68, 78, 32, 33, 35, 39, 45, 54, 64, 74, 34, 35, + 39, 45, 51, 58, 68, 76, 38, 39, 45, 54, 61, 69, 78, 86, 46, 45, 51, 61, + 71, 80, 90, 99, 56, 54, 58, 69, 80, 92, 103, 113, 68, 64, 68, 78, 90, + 103, 117, 128, 78, 74, 76, 86, 99, 113, 128, 140, + /* Size 16x16 */ + 32, 31, 31, 31, 32, 34, 36, 39, 44, 48, 54, 59, 65, 71, 80, 83, 31, 32, + 32, 32, 32, 34, 35, 38, 42, 46, 51, 56, 62, 68, 76, 78, 31, 32, 32, 32, + 32, 33, 34, 37, 41, 44, 49, 54, 59, 65, 72, 75, 31, 32, 32, 33, 34, 35, + 36, 39, 42, 45, 50, 54, 59, 64, 71, 74, 32, 32, 32, 34, 35, 37, 38, 40, + 42, 46, 49, 53, 58, 63, 69, 72, 34, 34, 33, 35, 37, 39, 42, 45, 47, 51, + 54, 58, 63, 68, 74, 76, 36, 35, 34, 36, 38, 42, 48, 50, 54, 57, 60, 64, + 68, 73, 79, 81, 39, 38, 37, 39, 40, 45, 50, 54, 58, 61, 65, 69, 73, 78, + 84, 86, 44, 42, 41, 42, 42, 47, 54, 58, 63, 67, 71, 75, 79, 84, 90, 92, + 48, 46, 44, 45, 46, 51, 57, 61, 67, 71, 76, 80, 85, 90, 96, 99, 54, 51, + 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 59, 56, 54, + 54, 53, 58, 64, 69, 75, 80, 87, 92, 98, 103, 110, 113, 65, 62, 59, 59, + 58, 63, 68, 73, 79, 85, 92, 98, 105, 111, 118, 121, 71, 68, 65, 64, 63, + 68, 73, 78, 84, 90, 97, 103, 111, 117, 125, 128, 80, 76, 72, 71, 69, 74, + 79, 84, 90, 96, 104, 110, 118, 125, 134, 137, 83, 78, 75, 74, 72, 76, + 81, 86, 92, 99, 106, 113, 121, 128, 137, 140, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, + 48, 54, 54, 59, 59, 65, 65, 71, 71, 80, 80, 83, 83, 87, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51, 56, + 56, 62, 62, 68, 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51, 56, 56, 62, 62, 68, + 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 37, 37, 41, 41, 44, 44, 49, 49, 54, 54, 59, 59, 65, 65, 72, 72, 75, + 75, 79, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, + 41, 44, 44, 49, 49, 54, 54, 59, 59, 65, 65, 72, 72, 75, 75, 79, 31, 32, + 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 42, 45, 45, 50, + 50, 54, 54, 59, 59, 64, 64, 71, 71, 74, 74, 77, 31, 32, 32, 32, 32, 33, + 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 42, 45, 45, 50, 50, 54, 54, 59, + 59, 64, 64, 71, 71, 74, 74, 77, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, + 37, 38, 38, 40, 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, + 69, 72, 72, 75, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, + 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, 69, 72, 72, 75, + 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51, + 51, 54, 54, 58, 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 34, 34, 34, 33, + 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51, 51, 54, 54, 58, + 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 36, 35, 35, 34, 34, 36, 36, 38, + 38, 42, 42, 48, 
48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68, 68, 73, + 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, + 48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68, 68, 73, 73, 79, 79, 81, + 81, 84, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58, + 58, 61, 61, 65, 65, 69, 69, 73, 73, 78, 78, 84, 84, 86, 86, 90, 39, 38, + 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58, 58, 61, 61, 65, + 65, 69, 69, 73, 73, 78, 78, 84, 84, 86, 86, 90, 44, 42, 42, 41, 41, 42, + 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, + 79, 84, 84, 90, 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, + 47, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, + 90, 92, 92, 96, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, + 61, 67, 67, 71, 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, + 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71, + 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, 54, 51, 51, 49, + 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, + 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 54, 51, 51, 49, 49, 50, 50, + 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, + 97, 97, 104, 104, 106, 106, 109, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, + 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 98, 98, 103, 103, + 110, 110, 113, 113, 116, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, + 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 98, 98, 103, 103, 110, 110, + 113, 113, 116, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, + 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, + 121, 124, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, + 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, + 124, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, + 90, 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128, 132, + 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90, + 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128, 132, 80, + 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, + 104, 104, 110, 110, 118, 118, 125, 125, 134, 134, 137, 137, 141, 80, 76, + 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104, + 104, 110, 110, 118, 118, 125, 125, 134, 134, 137, 137, 141, 83, 78, 78, + 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, + 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 83, 78, 78, + 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, + 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 87, 83, 83, + 79, 79, 77, 77, 75, 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109, + 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149, + /* Size 4x8 */ + 32, 35, 51, 75, 32, 36, 50, 71, 34, 42, 54, 73, 37, 50, 65, 84, 45, 56, + 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136, + /* Size 8x4 */ + 32, 32, 34, 37, 45, 54, 65, 75, 35, 36, 42, 50, 56, 63, 73, 81, 51, 50, + 54, 65, 76, 87, 97, 106, 75, 71, 73, 84, 96, 110, 125, 136, + /* Size 8x16 */ + 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, + 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 33, 35, 38, + 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38, 48, 54, 60, + 68, 78, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, 79, 90, + 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, + 54, 63, 
75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, + 73, 84, 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, + 92, 106, 121, 136, + /* Size 16x8 */ + 32, 31, 31, 32, 32, 34, 36, 39, 44, 48, 53, 58, 65, 71, 79, 82, 31, 32, + 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 32, 32, 33, 34, + 35, 37, 38, 40, 43, 46, 50, 54, 58, 63, 70, 72, 36, 35, 34, 36, 38, 42, + 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 44, 42, 41, 42, 42, 48, 54, 58, + 63, 67, 71, 75, 79, 84, 90, 92, 53, 51, 49, 50, 49, 54, 60, 65, 71, 76, + 82, 87, 92, 97, 104, 106, 65, 62, 59, 59, 58, 63, 68, 73, 79, 85, 92, + 98, 105, 111, 118, 121, 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103, + 110, 118, 125, 133, 136, + /* Size 16x32 */ + 32, 31, 31, 32, 32, 36, 36, 44, 44, 53, 53, 65, 65, 79, 79, 87, 31, 32, + 32, 32, 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32, 32, + 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32, 33, 33, 34, + 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 31, 32, 32, 33, 33, 34, 34, 41, + 41, 49, 49, 59, 59, 72, 72, 78, 32, 32, 32, 34, 34, 36, 36, 42, 42, 50, + 50, 59, 59, 71, 71, 77, 32, 32, 32, 34, 34, 36, 36, 42, 42, 50, 50, 59, + 59, 71, 71, 77, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69, + 69, 75, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69, 69, 75, + 34, 34, 34, 37, 37, 42, 42, 48, 48, 54, 54, 63, 63, 73, 73, 79, 34, 34, + 34, 37, 37, 42, 42, 48, 48, 54, 54, 63, 63, 73, 73, 79, 36, 34, 34, 38, + 38, 48, 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 36, 34, 34, 38, 38, 48, + 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 39, 37, 37, 40, 40, 50, 50, 58, + 58, 65, 65, 73, 73, 84, 84, 89, 39, 37, 37, 40, 40, 50, 50, 58, 58, 65, + 65, 73, 73, 84, 84, 89, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, + 79, 90, 90, 95, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, 79, 90, + 90, 95, 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, + 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, 53, 49, + 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 53, 49, 49, + 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 58, 54, 54, 54, + 54, 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116, 58, 54, 54, 54, 54, + 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116, 65, 60, 60, 58, 58, 68, + 68, 79, 79, 92, 92, 105, 105, 118, 118, 124, 65, 60, 60, 58, 58, 68, 68, + 79, 79, 92, 92, 105, 105, 118, 118, 124, 71, 65, 65, 63, 63, 73, 73, 84, + 84, 97, 97, 111, 111, 125, 125, 132, 71, 65, 65, 63, 63, 73, 73, 84, 84, + 97, 97, 111, 111, 125, 125, 132, 79, 72, 72, 70, 70, 79, 79, 90, 90, + 104, 104, 118, 118, 133, 133, 141, 79, 72, 72, 70, 70, 79, 79, 90, 90, + 104, 104, 118, 118, 133, 133, 141, 82, 75, 75, 72, 72, 81, 81, 92, 92, + 106, 106, 121, 121, 136, 136, 144, 82, 75, 75, 72, 72, 81, 81, 92, 92, + 106, 106, 121, 121, 136, 136, 144, 87, 79, 79, 76, 76, 84, 84, 96, 96, + 109, 109, 124, 124, 141, 141, 149, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, + 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, + 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, + 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, + 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, + 72, 76, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, + 43, 46, 46, 50, 50, 54, 54, 
58, 58, 63, 63, 70, 70, 72, 72, 76, 36, 35, + 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, + 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, + 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, + 68, 73, 73, 79, 79, 81, 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, + 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, + 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, + 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, + 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, + 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 53, 51, 51, + 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, + 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65, 62, 62, 59, 59, 59, + 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, + 105, 111, 111, 118, 118, 121, 121, 124, 65, 62, 62, 59, 59, 59, 59, 58, + 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, + 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71, 69, 69, + 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, + 125, 125, 133, 133, 136, 136, 141, 79, 75, 75, 72, 72, 71, 71, 69, 69, + 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, + 125, 125, 133, 133, 136, 136, 141, 87, 82, 82, 78, 78, 77, 77, 75, 75, + 79, 79, 84, 84, 89, 89, 95, 95, 102, 102, 109, 109, 116, 116, 124, 124, + 132, 132, 141, 141, 144, 144, 149, + /* Size 4x16 */ + 31, 36, 53, 79, 32, 35, 51, 75, 32, 34, 49, 72, 32, 36, 50, 71, 33, 38, + 49, 69, 34, 42, 54, 73, 34, 48, 60, 78, 37, 50, 65, 84, 41, 53, 71, 90, + 45, 56, 76, 96, 49, 60, 82, 103, 54, 63, 87, 110, 60, 68, 92, 118, 65, + 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136, + /* Size 16x4 */ + 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 36, 35, + 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 53, 51, 49, 50, + 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 79, 75, 72, 71, 69, + 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136, + /* Size 8x32 */ + 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, + 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72, 31, 32, 33, 34, + 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 32, 34, 36, 42, 50, + 59, 71, 32, 33, 35, 38, 42, 49, 58, 69, 32, 33, 35, 38, 42, 49, 58, 69, + 34, 34, 37, 42, 48, 54, 63, 73, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, + 38, 48, 54, 60, 68, 78, 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, + 58, 65, 73, 84, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, + 79, 90, 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96, + 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, 53, 49, + 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110, 58, 54, 54, + 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, 65, 60, 58, 68, + 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125, 71, 65, 63, 73, 84, + 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, 79, 72, 70, 79, 90, + 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136, 82, 75, 72, 81, 92, + 106, 121, 136, 87, 79, 76, 84, 96, 109, 124, 141, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, + 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, + 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35, + 35, 
37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, + 63, 70, 70, 72, 72, 76, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, + 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, + 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, + 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, 53, 51, + 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, + 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65, 62, 62, 59, 59, + 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, + 105, 105, 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71, + 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, + 118, 125, 125, 133, 133, 136, 136, 141 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 46, 47, 57, 46, 53, 54, 60, 47, 54, 66, 75, 57, 60, 75, 89, + /* Size 8x8 */ + 31, 34, 42, 47, 48, 52, 57, 61, 34, 39, 45, 46, 46, 49, 53, 57, 42, 45, + 48, 49, 50, 52, 55, 58, 47, 46, 49, 54, 56, 58, 61, 64, 48, 46, 50, 56, + 61, 65, 68, 71, 52, 49, 52, 58, 65, 71, 75, 79, 57, 53, 55, 61, 68, 75, + 82, 86, 61, 57, 58, 64, 71, 79, 86, 91, + /* Size 16x16 */ + 32, 31, 30, 33, 36, 41, 49, 48, 49, 50, 52, 54, 57, 60, 63, 65, 31, 31, + 31, 34, 38, 42, 47, 47, 47, 48, 50, 52, 54, 57, 60, 61, 30, 31, 32, 35, + 40, 42, 46, 45, 45, 46, 47, 49, 52, 54, 57, 58, 33, 34, 35, 39, 43, 45, + 47, 46, 45, 46, 47, 49, 51, 53, 56, 57, 36, 38, 40, 43, 47, 47, 48, 46, + 45, 46, 47, 48, 50, 52, 54, 55, 41, 42, 42, 45, 47, 48, 50, 49, 49, 50, + 50, 52, 53, 55, 57, 58, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 55, + 56, 58, 60, 61, 48, 47, 45, 46, 46, 49, 53, 54, 55, 56, 57, 58, 60, 61, + 63, 64, 49, 47, 45, 45, 45, 49, 53, 55, 58, 60, 61, 62, 63, 65, 67, 68, + 50, 48, 46, 46, 46, 50, 54, 56, 60, 61, 63, 65, 67, 68, 71, 71, 52, 50, + 47, 47, 47, 50, 54, 57, 61, 63, 66, 68, 70, 72, 75, 75, 54, 52, 49, 49, + 48, 52, 55, 58, 62, 65, 68, 71, 73, 75, 78, 79, 57, 54, 52, 51, 50, 53, + 56, 60, 63, 67, 70, 73, 76, 79, 82, 83, 60, 57, 54, 53, 52, 55, 58, 61, + 65, 68, 72, 75, 79, 82, 85, 86, 63, 60, 57, 56, 54, 57, 60, 63, 67, 71, + 75, 78, 82, 85, 89, 90, 65, 61, 58, 57, 55, 58, 61, 64, 68, 71, 75, 79, + 83, 86, 90, 91, + /* Size 32x32 */ + 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 49, 50, + 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 65, 65, 67, 31, 31, 31, 31, + 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50, 52, + 52, 54, 54, 57, 57, 60, 60, 61, 61, 63, 31, 31, 31, 31, 31, 34, 34, 38, + 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 57, + 57, 60, 60, 61, 61, 63, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46, + 46, 45, 45, 45, 45, 46, 46, 47, 47, 49, 49, 52, 52, 54, 54, 57, 57, 58, + 58, 60, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, + 45, 46, 46, 47, 47, 49, 49, 52, 52, 54, 54, 57, 57, 58, 58, 60, 33, 34, + 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 45, 46, 46, 47, + 47, 49, 49, 51, 51, 53, 53, 56, 56, 57, 57, 59, 33, 34, 34, 35, 35, 39, + 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 51, + 51, 53, 53, 56, 56, 57, 57, 59, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, + 47, 48, 48, 46, 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, + 54, 55, 55, 57, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, + 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 55, 55, 57, + 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, + 50, 50, 50, 52, 52, 53, 53, 55, 55, 57, 
57, 58, 58, 60, 41, 42, 42, 42, + 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 52, + 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 49, 47, 47, 46, 46, 47, 47, 48, + 48, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, + 58, 60, 60, 61, 61, 62, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, + 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, + 61, 62, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55, + 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 61, 63, 63, 64, 64, 66, 48, 47, + 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55, 55, 56, 56, 57, + 57, 58, 58, 60, 60, 61, 61, 63, 63, 64, 64, 66, 49, 47, 47, 45, 45, 45, + 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63, + 63, 65, 65, 67, 67, 68, 68, 69, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, + 49, 53, 53, 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63, 63, 65, 65, 67, + 67, 68, 68, 69, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, + 56, 60, 60, 61, 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, + 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61, + 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, 52, 50, 50, 47, + 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68, + 68, 70, 70, 72, 72, 75, 75, 75, 75, 76, 52, 50, 50, 47, 47, 47, 47, 47, + 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68, 68, 70, 70, 72, + 72, 75, 75, 75, 75, 76, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, + 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75, 78, 78, 79, + 79, 80, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, + 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75, 78, 78, 79, 79, 80, 57, 54, + 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, + 70, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51, + 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, + 76, 79, 79, 82, 82, 83, 83, 84, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, + 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85, + 85, 86, 86, 88, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, + 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85, 85, 86, 86, 88, + 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, + 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60, 60, 57, + 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78, + 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 65, 61, 61, 58, 58, 57, 57, 55, + 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, + 86, 90, 90, 91, 91, 93, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61, + 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91, + 91, 93, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, + 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95, + /* Size 4x8 */ + 31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64, 46, 54, + 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90, + /* Size 8x4 */ + 31, 36, 43, 46, 46, 50, 54, 59, 47, 47, 50, 53, 54, 55, 58, 61, 50, 47, + 50, 57, 64, 68, 72, 75, 60, 56, 57, 64, 71, 78, 85, 90, + /* Size 8x16 */ + 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 30, 32, + 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47, + 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54, + 57, 60, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67, + 50, 46, 46, 54, 59, 64, 67, 
71, 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, + 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, + 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, + 83, 90, + /* Size 16x8 */ + 32, 31, 30, 33, 37, 42, 49, 48, 49, 50, 52, 54, 57, 60, 63, 64, 31, 31, + 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 37, 38, 40, 43, + 47, 47, 48, 47, 46, 46, 47, 49, 50, 52, 55, 56, 48, 47, 46, 47, 47, 50, + 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 49, 47, 45, 46, 45, 49, 53, 56, + 58, 59, 61, 62, 64, 65, 67, 68, 52, 50, 48, 47, 47, 50, 54, 57, 61, 64, + 66, 68, 70, 72, 75, 75, 57, 54, 52, 51, 50, 53, 57, 60, 64, 67, 71, 73, + 76, 79, 82, 83, 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85, + 89, 90, + /* Size 16x32 */ + 32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31, 31, + 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31, 31, 38, + 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32, 40, 40, 46, + 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40, 40, 46, 46, 45, + 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43, 47, 47, 46, 46, 47, + 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47, 47, 46, 46, 47, 47, 51, + 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47, 45, 45, 47, 47, 50, 50, 54, + 54, 57, 37, 40, 40, 47, 47, 47, 47, 45, 45, 47, 47, 50, 50, 54, 54, 57, + 42, 43, 43, 47, 47, 50, 50, 49, 49, 50, 50, 53, 53, 57, 57, 60, 42, 43, + 43, 47, 47, 50, 50, 49, 49, 50, 50, 53, 53, 57, 57, 60, 49, 46, 46, 48, + 48, 53, 53, 53, 53, 54, 54, 57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53, + 53, 53, 53, 54, 54, 57, 57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56, + 56, 57, 57, 60, 60, 64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, + 57, 60, 60, 64, 64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, + 64, 67, 67, 69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, + 67, 69, 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, + 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48, + 48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48, 47, + 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49, 49, 55, + 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49, 55, 55, 62, + 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56, 56, 64, 64, 70, + 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56, 64, 64, 70, 70, 76, + 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65, 65, 72, 72, 79, 79, 85, + 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65, 72, 72, 79, 79, 85, 85, 88, + 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75, 82, 82, 89, 89, 92, 63, 57, + 57, 55, 55, 60, 60, 67, 67, 75, 75, 82, 82, 89, 89, 92, 64, 59, 59, 56, + 56, 61, 61, 68, 68, 75, 75, 83, 83, 90, 90, 93, 64, 59, 59, 56, 56, 61, + 61, 68, 68, 75, 75, 83, 83, 90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69, + 69, 77, 77, 84, 84, 92, 92, 95, + /* Size 32x16 */ + 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, + 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32, + 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, + 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 31, 31, 31, 32, 32, 36, 36, 40, + 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, + 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, + 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, + 56, 57, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, + 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, 48, 
47, + 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, + 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63, 48, 47, 47, 46, 46, 47, + 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, + 56, 58, 58, 60, 60, 61, 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, + 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, + 67, 68, 68, 69, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, + 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, + 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, + 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 52, 50, 50, 48, + 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, + 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51, 51, 50, + 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, + 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, + 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, + 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, + 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60, + 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, + 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 66, 63, 63, 60, 60, 59, + 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 73, 73, 77, 77, 80, 80, 84, + 84, 88, 88, 92, 92, 93, 93, 95, + /* Size 4x16 */ + 31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56, 40, 47, + 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64, 45, 53, 61, 67, + 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 56, 70, 82, 54, 58, + 72, 85, 57, 60, 75, 89, 59, 61, 75, 90, + /* Size 16x4 */ + 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 48, 47, + 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 52, 50, 48, 47, + 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, 63, 60, 57, 56, 54, 57, + 60, 64, 67, 71, 75, 78, 82, 85, 89, 90, + /* Size 8x32 */ + 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 31, 31, + 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57, 30, 32, 40, 46, + 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 33, 36, 43, 47, 46, 47, + 51, 56, 37, 40, 47, 47, 45, 47, 50, 54, 37, 40, 47, 47, 45, 47, 50, 54, + 42, 43, 47, 50, 49, 50, 53, 57, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, + 48, 53, 53, 54, 57, 60, 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, + 56, 57, 60, 64, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, + 64, 67, 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71, + 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75, 52, 48, + 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78, 54, 50, 49, 55, + 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, 57, 52, 50, 56, 64, 70, + 76, 82, 60, 54, 52, 58, 65, 72, 79, 85, 60, 54, 52, 58, 65, 72, 79, 85, + 63, 57, 55, 60, 67, 75, 82, 89, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, + 56, 61, 68, 75, 83, 90, 64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63, + 69, 77, 84, 92, + /* Size 32x8 */ + 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, + 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32, + 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, + 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47, + 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, + 52, 55, 55, 56, 56, 57, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, + 53, 53, 
53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, + 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, + 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, 52, 50, + 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, + 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51, + 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, + 76, 79, 79, 82, 82, 83, 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, + 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, + 89, 90, 90, 92 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 33, 45, 62, 33, 39, 51, 64, 45, 51, 71, 87, 62, 64, 87, 108, + /* Size 8x8 */ + 31, 32, 32, 35, 42, 51, 59, 69, 32, 32, 33, 35, 41, 49, 56, 65, 32, 33, + 35, 38, 43, 49, 56, 64, 35, 35, 38, 48, 54, 59, 66, 73, 42, 41, 43, 54, + 63, 71, 77, 85, 51, 49, 49, 59, 71, 81, 89, 97, 59, 56, 56, 66, 77, 89, + 98, 108, 69, 65, 64, 73, 85, 97, 108, 119, + /* Size 16x16 */ + 32, 31, 31, 31, 32, 34, 35, 38, 41, 45, 48, 54, 59, 65, 71, 80, 31, 32, + 32, 32, 32, 34, 35, 37, 40, 43, 46, 51, 56, 62, 68, 76, 31, 32, 32, 32, + 32, 33, 34, 36, 38, 41, 44, 49, 54, 59, 65, 72, 31, 32, 32, 33, 34, 35, + 36, 38, 40, 42, 45, 50, 54, 59, 64, 71, 32, 32, 32, 34, 35, 37, 38, 39, + 41, 43, 46, 49, 53, 58, 63, 69, 34, 34, 33, 35, 37, 39, 42, 44, 46, 48, + 51, 54, 58, 63, 68, 74, 35, 35, 34, 36, 38, 42, 46, 48, 50, 53, 55, 59, + 62, 67, 72, 78, 38, 37, 36, 38, 39, 44, 48, 51, 54, 57, 59, 63, 67, 71, + 76, 82, 41, 40, 38, 40, 41, 46, 50, 54, 57, 60, 63, 67, 71, 75, 80, 86, + 45, 43, 41, 42, 43, 48, 53, 57, 60, 65, 68, 72, 76, 81, 85, 91, 48, 46, + 44, 45, 46, 51, 55, 59, 63, 68, 71, 76, 80, 85, 90, 96, 54, 51, 49, 50, + 49, 54, 59, 63, 67, 72, 76, 82, 87, 92, 97, 104, 59, 56, 54, 54, 53, 58, + 62, 67, 71, 76, 80, 87, 92, 98, 103, 110, 65, 62, 59, 59, 58, 63, 67, + 71, 75, 81, 85, 92, 98, 105, 111, 118, 71, 68, 65, 64, 63, 68, 72, 76, + 80, 85, 90, 97, 103, 111, 117, 125, 80, 76, 72, 71, 69, 74, 78, 82, 86, + 91, 96, 104, 110, 118, 125, 134, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, + 45, 48, 48, 53, 54, 57, 59, 62, 65, 67, 71, 72, 80, 80, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46, 51, + 52, 55, 56, 59, 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46, 51, 51, 55, 56, 59, + 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 34, 34, 36, 38, 39, 41, 42, 45, 45, 49, 50, 53, 54, 57, 60, 62, 66, 66, + 73, 73, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 36, 37, + 38, 41, 41, 44, 44, 49, 49, 52, 54, 56, 59, 61, 65, 65, 72, 72, 31, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 37, 38, 39, 41, 42, 45, + 45, 49, 49, 52, 54, 56, 59, 61, 64, 65, 72, 72, 31, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 45, 45, 49, 50, 52, + 54, 56, 59, 60, 64, 65, 71, 71, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 35, 35, 36, 37, 38, 39, 40, 42, 43, 45, 45, 49, 49, 52, 54, 56, 59, 60, + 64, 64, 70, 70, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 37, 38, 38, + 39, 40, 41, 42, 43, 46, 46, 49, 49, 52, 53, 55, 58, 59, 63, 63, 69, 69, + 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 41, 41, 43, + 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34, 34, 33, + 33, 34, 35, 35, 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54, + 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 34, 
34, 34, 33, 33, 34, 35, 35, + 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54, 54, 57, 58, 60, + 63, 64, 68, 68, 74, 74, 35, 35, 35, 34, 34, 35, 36, 36, 38, 38, 42, 42, + 46, 47, 48, 49, 50, 52, 53, 55, 55, 58, 59, 61, 62, 64, 67, 68, 72, 72, + 78, 78, 36, 35, 35, 34, 34, 35, 36, 37, 38, 38, 42, 42, 47, 48, 50, 50, + 52, 54, 54, 57, 57, 59, 60, 62, 64, 66, 68, 69, 73, 73, 79, 79, 38, 37, + 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 50, 51, 52, 54, 56, 57, 59, + 59, 62, 63, 65, 67, 69, 71, 72, 76, 76, 82, 82, 39, 38, 38, 38, 37, 38, + 39, 39, 40, 41, 45, 45, 49, 50, 52, 54, 55, 58, 58, 61, 61, 64, 65, 67, + 69, 71, 73, 74, 78, 78, 84, 84, 41, 40, 40, 39, 38, 39, 40, 40, 41, 41, + 46, 46, 50, 52, 54, 55, 57, 60, 60, 63, 63, 67, 67, 70, 71, 73, 75, 77, + 80, 81, 86, 86, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, + 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, + 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48, 48, 53, 54, 57, 58, 60, 64, + 65, 68, 68, 72, 72, 75, 76, 78, 81, 82, 85, 86, 91, 91, 48, 46, 46, 45, + 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75, + 76, 79, 80, 83, 85, 87, 90, 91, 96, 96, 48, 46, 46, 45, 44, 45, 45, 45, + 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75, 76, 79, 80, 83, + 85, 87, 90, 91, 96, 96, 53, 51, 51, 49, 49, 49, 49, 49, 49, 49, 54, 54, + 58, 59, 62, 64, 67, 71, 72, 75, 75, 81, 81, 85, 86, 89, 91, 93, 97, 97, + 103, 103, 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, + 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, + 57, 55, 55, 53, 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74, + 75, 79, 79, 85, 85, 89, 90, 93, 96, 98, 102, 102, 108, 108, 59, 56, 56, + 54, 54, 54, 54, 54, 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80, + 86, 87, 90, 92, 95, 98, 99, 103, 104, 110, 110, 62, 59, 59, 57, 56, 56, + 56, 56, 55, 56, 60, 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, + 95, 98, 101, 103, 107, 108, 114, 114, 65, 62, 62, 60, 59, 59, 59, 59, + 58, 58, 63, 63, 67, 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, + 105, 106, 111, 111, 118, 118, 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, + 64, 64, 68, 69, 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, + 108, 113, 113, 120, 120, 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68, + 72, 73, 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113, + 117, 118, 125, 125, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72, + 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111, 113, 118, + 119, 126, 126, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, + 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, + 134, 134, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82, + 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, + 134, 134, + /* Size 4x8 */ + 32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58, 35, 43, 54, 68, 41, 48, + 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111, + /* Size 8x4 */ + 32, 32, 33, 35, 41, 49, 57, 66, 34, 34, 37, 43, 48, 54, 60, 68, 43, 42, + 44, 54, 64, 71, 78, 86, 62, 59, 58, 68, 79, 91, 101, 111, + /* Size 8x16 */ + 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 59, 69, 31, 32, + 33, 34, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65, 32, 33, 35, 38, + 42, 49, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59, + 65, 73, 38, 36, 40, 49, 56, 63, 69, 77, 41, 39, 41, 51, 60, 67, 74, 81, + 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, + 50, 60, 71, 82, 90, 99, 58, 
54, 54, 63, 75, 87, 95, 105, 65, 60, 58, 68, + 79, 92, 102, 112, 71, 65, 63, 73, 84, 97, 108, 119, 79, 72, 70, 79, 90, + 104, 115, 127, + /* Size 16x8 */ + 32, 31, 31, 32, 32, 34, 35, 38, 41, 44, 48, 53, 58, 65, 71, 79, 31, 32, + 32, 32, 33, 34, 34, 36, 39, 42, 45, 49, 54, 60, 65, 72, 32, 32, 33, 34, + 35, 37, 38, 40, 41, 43, 46, 50, 54, 58, 63, 70, 36, 35, 34, 36, 38, 42, + 47, 49, 51, 54, 56, 60, 63, 68, 73, 79, 44, 42, 41, 42, 42, 48, 52, 56, + 60, 64, 67, 71, 75, 79, 84, 90, 53, 51, 49, 50, 49, 54, 59, 63, 67, 72, + 76, 82, 87, 92, 97, 104, 62, 59, 57, 57, 56, 61, 65, 69, 74, 79, 83, 90, + 95, 102, 108, 115, 73, 69, 66, 65, 64, 69, 73, 77, 81, 86, 91, 99, 105, + 112, 119, 127, + /* Size 16x32 */ + 32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53, 53, 62, 65, 73, 79, 31, 32, + 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 60, 62, 70, 75, 31, 32, 32, 32, + 32, 34, 35, 37, 42, 43, 51, 51, 59, 62, 69, 75, 31, 32, 32, 32, 32, 33, + 35, 36, 41, 42, 50, 50, 58, 60, 67, 73, 31, 32, 32, 32, 33, 33, 34, 36, + 41, 41, 49, 49, 57, 59, 66, 72, 31, 32, 32, 33, 33, 34, 35, 37, 41, 42, + 49, 49, 57, 59, 66, 71, 32, 32, 32, 33, 34, 35, 36, 38, 42, 43, 50, 50, + 57, 59, 65, 71, 32, 32, 32, 34, 34, 35, 37, 38, 42, 43, 49, 49, 56, 59, + 65, 70, 32, 32, 33, 34, 35, 37, 38, 39, 42, 43, 49, 49, 56, 58, 64, 69, + 32, 33, 33, 34, 35, 37, 39, 40, 43, 44, 50, 50, 56, 58, 64, 69, 34, 34, + 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, 34, 34, 34, 36, + 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, 35, 34, 34, 37, 38, 42, + 47, 48, 52, 53, 59, 59, 65, 67, 73, 77, 36, 35, 34, 37, 38, 43, 48, 49, + 54, 54, 60, 60, 66, 68, 74, 78, 38, 36, 36, 38, 40, 44, 49, 51, 56, 57, + 63, 63, 69, 71, 77, 81, 39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65, 65, + 71, 73, 79, 84, 41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67, 67, 74, 76, + 81, 86, 44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71, 71, 78, 79, 85, 90, + 44, 42, 42, 43, 43, 48, 54, 56, 64, 64, 72, 72, 79, 81, 86, 91, 48, 45, + 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 48, 45, 45, 46, + 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 53, 49, 49, 49, 49, 54, + 59, 62, 71, 71, 81, 81, 89, 91, 98, 103, 53, 50, 49, 50, 50, 54, 60, 63, + 71, 72, 82, 82, 90, 92, 99, 103, 57, 53, 52, 52, 52, 57, 62, 65, 74, 75, + 85, 85, 94, 96, 103, 108, 58, 54, 54, 54, 54, 58, 63, 67, 75, 76, 87, + 87, 95, 98, 105, 110, 61, 57, 57, 56, 56, 60, 66, 69, 77, 78, 89, 89, + 98, 101, 108, 114, 65, 60, 60, 59, 58, 63, 68, 71, 79, 80, 92, 92, 102, + 105, 112, 118, 67, 62, 61, 60, 60, 64, 69, 72, 81, 82, 94, 94, 103, 106, + 114, 120, 71, 66, 65, 64, 63, 68, 73, 76, 84, 85, 97, 97, 108, 111, 119, + 125, 72, 66, 66, 64, 64, 68, 73, 76, 85, 86, 98, 98, 108, 111, 119, 125, + 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133, + 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, + 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 38, 39, 41, 42, 45, 45, 49, + 50, 53, 54, 57, 60, 62, 66, 66, 73, 73, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57, + 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 36, 36, + 37, 37, 38, 40, 41, 42, 43, 46, 46, 49, 50, 52, 54, 56, 59, 60, 64, 64, + 71, 71, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, + 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34, + 34, 33, 33, 34, 
35, 35, 37, 37, 39, 39, 42, 43, 44, 45, 46, 48, 48, 51, + 51, 54, 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 36, 35, 35, 35, 34, 35, + 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, + 63, 66, 68, 69, 73, 73, 79, 79, 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, + 44, 44, 48, 49, 51, 52, 54, 56, 56, 59, 59, 62, 63, 65, 67, 69, 71, 72, + 76, 76, 82, 82, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, + 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, + 44, 43, 43, 42, 41, 42, 43, 43, 43, 44, 48, 48, 53, 54, 57, 58, 60, 64, + 64, 67, 67, 71, 72, 75, 76, 78, 80, 82, 85, 86, 91, 91, 53, 51, 51, 50, + 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, + 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 53, 51, 51, 50, 49, 49, 50, + 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, + 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57, 57, 57, 56, 56, 56, + 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102, + 103, 108, 108, 115, 115, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, + 67, 68, 71, 73, 76, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, 106, 111, + 111, 118, 118, 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 74, + 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119, + 127, 127, 79, 75, 75, 73, 72, 71, 71, 70, 69, 69, 73, 73, 77, 78, 81, + 84, 86, 90, 91, 96, 96, 103, 103, 108, 110, 114, 118, 120, 125, 125, + 133, 133, + /* Size 4x16 */ + 31, 34, 44, 65, 32, 34, 43, 62, 32, 33, 41, 59, 32, 35, 43, 59, 32, 37, + 43, 58, 34, 39, 48, 63, 34, 42, 53, 67, 36, 44, 57, 71, 39, 46, 60, 76, + 42, 48, 64, 81, 45, 51, 67, 85, 50, 54, 72, 92, 54, 58, 76, 98, 60, 63, + 80, 105, 66, 68, 85, 111, 73, 74, 91, 118, + /* Size 16x4 */ + 31, 32, 32, 32, 32, 34, 34, 36, 39, 42, 45, 50, 54, 60, 66, 73, 34, 34, + 33, 35, 37, 39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74, 44, 43, 41, 43, + 43, 48, 53, 57, 60, 64, 67, 72, 76, 80, 85, 91, 65, 62, 59, 59, 58, 63, + 67, 71, 76, 81, 85, 92, 98, 105, 111, 118, + /* Size 8x32 */ + 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 60, 70, 31, 32, + 32, 35, 42, 51, 59, 69, 31, 32, 32, 35, 41, 50, 58, 67, 31, 32, 33, 34, + 41, 49, 57, 66, 31, 32, 33, 35, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, + 57, 65, 32, 32, 34, 37, 42, 49, 56, 65, 32, 33, 35, 38, 42, 49, 56, 64, + 32, 33, 35, 39, 43, 50, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 34, 34, + 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59, 65, 73, 36, 34, 38, 48, + 54, 60, 66, 74, 38, 36, 40, 49, 56, 63, 69, 77, 39, 37, 40, 50, 58, 65, + 71, 79, 41, 39, 41, 51, 60, 67, 74, 81, 44, 41, 43, 53, 63, 71, 78, 85, + 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, 48, 45, + 46, 56, 67, 76, 83, 91, 53, 49, 49, 59, 71, 81, 89, 98, 53, 49, 50, 60, + 71, 82, 90, 99, 57, 52, 52, 62, 74, 85, 94, 103, 58, 54, 54, 63, 75, 87, + 95, 105, 61, 57, 56, 66, 77, 89, 98, 108, 65, 60, 58, 68, 79, 92, 102, + 112, 67, 61, 60, 69, 81, 94, 103, 114, 71, 65, 63, 73, 84, 97, 108, 119, + 72, 66, 64, 73, 85, 98, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127, 79, + 72, 70, 79, 90, 104, 115, 127, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, + 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, + 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 33, 33, 34, 34, + 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, + 58, 60, 63, 64, 70, 70, 36, 35, 35, 35, 34, 
35, 36, 37, 38, 39, 42, 42, + 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73, + 79, 79, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58, + 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, 53, 51, + 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, + 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57, + 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, + 94, 95, 98, 102, 103, 108, 108, 115, 115, 73, 70, 69, 67, 66, 66, 65, + 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, + 105, 108, 112, 114, 119, 119, 127, 127 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 42, 47, 53, 42, 48, 50, 54, 47, 50, 61, 67, 53, 54, 67, 78, + /* Size 8x8 */ + 31, 32, 38, 48, 47, 50, 53, 57, 32, 35, 42, 47, 45, 47, 50, 54, 38, 42, + 47, 48, 45, 47, 49, 52, 48, 47, 48, 53, 53, 54, 56, 58, 47, 45, 45, 53, + 58, 61, 63, 65, 50, 47, 47, 54, 61, 66, 69, 72, 53, 50, 49, 56, 63, 69, + 73, 77, 57, 54, 52, 58, 65, 72, 77, 82, + /* Size 16x16 */ + 32, 31, 30, 33, 36, 41, 47, 49, 49, 49, 50, 52, 54, 57, 60, 63, 31, 31, + 31, 34, 38, 42, 46, 47, 47, 47, 48, 50, 52, 54, 57, 60, 30, 31, 32, 35, + 40, 42, 45, 46, 45, 45, 46, 47, 49, 52, 54, 57, 33, 34, 35, 39, 43, 45, + 47, 46, 46, 45, 46, 47, 49, 51, 53, 56, 36, 38, 40, 43, 47, 47, 47, 47, + 46, 45, 46, 47, 48, 50, 52, 54, 41, 42, 42, 45, 47, 48, 50, 50, 49, 49, + 50, 50, 52, 53, 55, 57, 47, 46, 45, 47, 47, 50, 52, 52, 52, 52, 53, 53, + 55, 56, 58, 60, 49, 47, 46, 46, 47, 50, 52, 53, 54, 55, 55, 56, 57, 58, + 60, 62, 49, 47, 45, 46, 46, 49, 52, 54, 55, 57, 58, 59, 60, 61, 63, 65, + 49, 47, 45, 45, 45, 49, 52, 55, 57, 59, 60, 61, 63, 64, 66, 68, 50, 48, + 46, 46, 46, 50, 53, 55, 58, 60, 61, 63, 65, 67, 68, 71, 52, 50, 47, 47, + 47, 50, 53, 56, 59, 61, 63, 66, 68, 70, 72, 75, 54, 52, 49, 49, 48, 52, + 55, 57, 60, 63, 65, 68, 71, 73, 75, 78, 57, 54, 52, 51, 50, 53, 56, 58, + 61, 64, 67, 70, 73, 76, 79, 82, 60, 57, 54, 53, 52, 55, 58, 60, 63, 66, + 68, 72, 75, 79, 82, 85, 63, 60, 57, 56, 54, 57, 60, 62, 65, 68, 71, 75, + 78, 82, 85, 89, + /* Size 32x32 */ + 32, 31, 31, 30, 30, 32, 33, 34, 36, 37, 41, 41, 47, 49, 49, 48, 49, 49, + 49, 50, 50, 52, 52, 54, 54, 56, 57, 58, 60, 60, 63, 63, 31, 31, 31, 31, + 31, 32, 34, 35, 38, 38, 42, 42, 46, 48, 47, 47, 47, 47, 47, 48, 48, 50, + 50, 51, 52, 53, 54, 55, 57, 57, 60, 60, 31, 31, 31, 31, 31, 33, 34, 35, + 38, 39, 42, 42, 46, 47, 47, 47, 47, 47, 47, 48, 48, 49, 50, 51, 52, 53, + 54, 55, 57, 57, 60, 60, 30, 31, 31, 31, 31, 33, 35, 36, 39, 40, 42, 42, + 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, 48, 50, 50, 51, 52, 53, 55, 55, + 58, 58, 30, 31, 31, 31, 32, 33, 35, 36, 40, 40, 42, 42, 45, 46, 46, 45, + 45, 45, 45, 46, 46, 47, 47, 49, 49, 51, 52, 52, 54, 54, 57, 57, 32, 32, + 33, 33, 33, 35, 37, 38, 41, 42, 43, 43, 46, 47, 46, 46, 45, 45, 45, 46, + 46, 47, 47, 49, 49, 50, 51, 52, 54, 54, 57, 57, 33, 34, 34, 35, 35, 37, + 39, 40, 43, 43, 45, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 49, + 49, 50, 51, 52, 53, 54, 56, 56, 34, 35, 35, 36, 36, 38, 40, 41, 44, 44, + 45, 45, 47, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 48, 49, 50, 51, 51, + 53, 53, 55, 55, 36, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 47, 48, + 47, 46, 46, 45, 45, 46, 46, 46, 47, 48, 48, 49, 50, 50, 52, 52, 54, 54, + 37, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 45, + 46, 46, 46, 47, 47, 48, 48, 49, 50, 51, 52, 52, 55, 55, 41, 42, 42, 42, + 42, 43, 45, 45, 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, + 50, 51, 52, 52, 
53, 54, 55, 55, 57, 57, 41, 42, 42, 42, 42, 43, 45, 45, + 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 51, 52, 52, + 53, 54, 55, 55, 57, 57, 47, 46, 46, 46, 45, 46, 47, 47, 47, 48, 50, 50, + 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 54, 55, 55, 56, 56, 58, 58, + 60, 60, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50, 50, 52, 53, 53, 53, + 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 58, 60, 60, 49, 47, + 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53, 54, 54, 55, 55, 55, + 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 62, 62, 48, 47, 47, 46, 45, 46, + 46, 46, 46, 47, 49, 49, 52, 53, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, + 58, 59, 60, 60, 61, 62, 63, 63, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, + 49, 49, 52, 53, 54, 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, + 63, 63, 65, 65, 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, + 55, 55, 57, 58, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64, 65, 65, 67, 67, + 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56, 57, 59, + 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 50, 48, 48, 47, + 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63, + 63, 65, 65, 66, 67, 67, 68, 69, 71, 71, 50, 48, 48, 47, 46, 46, 46, 46, + 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63, 63, 65, 65, 66, + 67, 67, 68, 69, 71, 71, 52, 50, 49, 48, 47, 47, 47, 47, 46, 47, 50, 50, + 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66, 67, 68, 69, 70, 71, 72, 72, + 74, 74, 52, 50, 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, + 59, 61, 61, 63, 63, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 54, 51, + 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54, 55, 57, 58, 60, 62, 62, 65, + 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 77, 77, 54, 52, 52, 50, 49, 49, + 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68, 70, + 71, 72, 73, 74, 75, 76, 78, 78, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, + 52, 52, 55, 56, 58, 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 75, 75, + 77, 77, 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 56, + 58, 60, 61, 63, 64, 67, 67, 70, 70, 72, 73, 75, 76, 77, 79, 79, 82, 82, + 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56, 57, 59, 60, 62, 64, + 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 80, 80, 83, 83, 60, 57, 57, 55, + 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66, 68, 68, 72, + 72, 74, 75, 77, 79, 80, 82, 82, 85, 85, 60, 57, 57, 55, 54, 54, 54, 53, + 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75, 76, 77, + 79, 80, 82, 82, 85, 85, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, + 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, + 89, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63, + 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89, + /* Size 4x8 */ + 31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57, 45, 49, + 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79, + /* Size 8x4 */ + 31, 33, 40, 47, 45, 48, 51, 55, 42, 44, 47, 50, 49, 50, 52, 55, 47, 45, + 46, 54, 59, 61, 63, 66, 54, 51, 50, 57, 64, 70, 75, 79, + /* Size 8x16 */ + 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57, 30, 32, + 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54, 37, 40, 47, 47, + 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53, + 55, 58, 48, 46, 47, 53, 55, 56, 58, 61, 48, 45, 46, 53, 57, 59, 61, 63, + 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, + 47, 54, 61, 66, 70, 73, 54, 50, 49, 55, 62, 68, 72, 76, 57, 52, 50, 56, + 64, 
70, 75, 79, 60, 54, 52, 58, 65, 72, 77, 82, 63, 57, 55, 60, 67, 75, + 80, 86, + /* Size 16x8 */ + 32, 31, 30, 33, 37, 42, 47, 48, 48, 49, 50, 52, 54, 57, 60, 63, 31, 31, + 32, 36, 40, 43, 46, 46, 45, 45, 46, 48, 50, 52, 54, 57, 37, 38, 40, 43, + 47, 47, 48, 47, 46, 46, 46, 47, 49, 50, 52, 55, 48, 47, 46, 47, 47, 50, + 52, 53, 53, 53, 54, 54, 55, 56, 58, 60, 49, 47, 45, 46, 45, 49, 53, 55, + 57, 58, 59, 61, 62, 64, 65, 67, 52, 50, 48, 47, 47, 50, 53, 56, 59, 62, + 64, 66, 68, 70, 72, 75, 56, 53, 51, 50, 49, 53, 55, 58, 61, 64, 66, 70, + 72, 75, 77, 80, 61, 57, 55, 54, 52, 56, 58, 61, 63, 66, 69, 73, 76, 79, + 82, 86, + /* Size 16x32 */ + 32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31, 31, + 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31, 31, 36, + 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32, 37, 39, 42, + 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37, 40, 42, 46, 46, + 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41, 44, 46, 46, 45, 45, + 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45, 47, 46, 46, 46, 47, 47, + 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47, 47, 45, 46, 47, 47, 50, 51, + 53, 55, 37, 40, 40, 45, 47, 47, 47, 47, 45, 46, 47, 47, 49, 50, 52, 54, + 37, 40, 40, 45, 47, 47, 48, 47, 46, 46, 47, 47, 49, 50, 53, 55, 42, 43, + 43, 46, 47, 48, 50, 50, 49, 49, 50, 50, 53, 53, 56, 57, 42, 43, 43, 46, + 47, 48, 50, 50, 49, 49, 50, 50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50, + 52, 52, 53, 53, 53, 53, 55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53, + 53, 54, 54, 54, 56, 57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55, + 56, 56, 58, 58, 61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57, + 59, 60, 62, 64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61, + 63, 65, 49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67, + 49, 46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47, + 46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46, 46, + 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47, 47, 50, + 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47, 50, 54, 56, + 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51, 55, 57, 62, 62, + 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55, 57, 62, 63, 68, 68, + 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58, 63, 63, 69, 69, 74, 75, + 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64, 64, 70, 70, 75, 76, 79, 82, + 58, 53, 53, 51, 51, 54, 57, 59, 64, 65, 71, 71, 76, 77, 80, 83, 60, 55, + 54, 53, 52, 55, 58, 60, 65, 66, 72, 72, 77, 79, 82, 85, 60, 55, 55, 53, + 53, 55, 59, 60, 65, 66, 73, 73, 78, 79, 83, 85, 63, 58, 57, 56, 55, 58, + 60, 62, 67, 68, 75, 75, 80, 82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62, + 67, 68, 75, 75, 80, 82, 86, 89, + /* Size 32x16 */ + 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, + 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32, + 32, 33, 35, 37, 40, 40, 43, 43, 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, + 48, 50, 50, 51, 52, 53, 55, 55, 58, 58, 31, 31, 31, 32, 32, 34, 36, 37, + 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51, + 52, 53, 54, 55, 57, 57, 35, 36, 36, 37, 37, 39, 40, 42, 45, 45, 46, 46, + 47, 47, 47, 46, 46, 45, 46, 46, 46, 47, 47, 48, 49, 50, 51, 51, 53, 53, + 56, 56, 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, + 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55, 42, 42, + 42, 42, 42, 44, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 49, 49, 50, + 50, 50, 50, 51, 52, 52, 53, 54, 55, 55, 58, 
58, 48, 47, 47, 46, 46, 46, + 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, + 55, 56, 56, 57, 58, 59, 60, 60, 48, 47, 47, 46, 46, 46, 46, 47, 47, 47, + 50, 50, 52, 53, 53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59, + 60, 60, 62, 62, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, + 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, + 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 53, 54, 55, 56, 57, 59, + 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 52, 50, 50, 48, + 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, + 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 52, 50, 50, 48, 48, 48, 47, 47, + 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, + 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53, + 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78, + 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 57, 58, 60, + 61, 64, 64, 67, 67, 70, 71, 72, 73, 75, 76, 77, 79, 79, 82, 82, 61, 58, + 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, + 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86, 63, 60, 60, 58, 57, 57, + 56, 55, 54, 55, 57, 57, 60, 60, 62, 64, 65, 67, 68, 71, 71, 74, 75, 77, + 78, 80, 82, 83, 85, 85, 89, 89, + /* Size 4x16 */ + 31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51, 40, 47, + 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58, 46, 49, 57, 61, + 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71, 50, 52, 63, 73, 52, 53, + 64, 76, 55, 55, 66, 79, 58, 58, 68, 82, + /* Size 16x4 */ + 31, 31, 32, 35, 40, 43, 46, 46, 46, 46, 47, 48, 50, 52, 55, 58, 42, 42, + 42, 45, 47, 48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 58, 49, 47, 45, 46, + 46, 49, 53, 55, 57, 59, 60, 61, 63, 64, 66, 68, 57, 54, 52, 51, 50, 53, + 56, 58, 61, 64, 67, 71, 73, 76, 79, 82, + /* Size 8x32 */ + 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58, 31, 31, + 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56, 30, 32, 40, 46, + 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54, 33, 36, 43, 47, 46, 47, + 50, 54, 34, 37, 44, 47, 45, 47, 50, 53, 37, 40, 47, 47, 45, 47, 49, 52, + 37, 40, 47, 48, 46, 47, 49, 53, 42, 43, 47, 50, 49, 50, 53, 56, 42, 43, + 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53, 55, 58, 49, 46, 48, 53, + 53, 54, 56, 59, 48, 46, 47, 53, 55, 56, 58, 61, 48, 46, 47, 53, 56, 57, + 59, 62, 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 61, 63, 66, + 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, 50, 46, + 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 69, 72, 52, 48, 47, 54, + 61, 66, 70, 73, 53, 49, 48, 55, 62, 68, 71, 75, 54, 50, 49, 55, 62, 68, + 72, 76, 55, 51, 49, 56, 63, 69, 74, 78, 57, 52, 50, 56, 64, 70, 75, 79, + 58, 53, 51, 57, 64, 71, 76, 80, 60, 54, 52, 58, 65, 72, 77, 82, 60, 55, + 53, 59, 65, 73, 78, 83, 63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60, + 67, 75, 80, 86, + /* Size 32x8 */ + 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, + 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32, + 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, + 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, 37, 38, 38, 39, 40, 41, 43, 44, + 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, + 50, 51, 52, 53, 55, 55, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50, + 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59, + 60, 60, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 
53, 55, 56, + 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, 52, 50, + 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, + 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, + 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, + 72, 74, 75, 76, 77, 78, 80, 80, 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, + 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80, + 82, 83, 86, 86 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 33, 42, 55, 33, 38, 46, 57, 42, 46, 63, 75, 55, 57, 75, 92, + /* Size 8x8 */ + 31, 32, 32, 34, 38, 46, 52, 63, 32, 32, 32, 34, 37, 44, 49, 59, 32, 32, + 35, 37, 40, 45, 49, 58, 34, 34, 37, 42, 47, 52, 56, 65, 38, 37, 40, 47, + 54, 60, 65, 73, 46, 44, 45, 52, 60, 69, 75, 84, 52, 49, 49, 56, 65, 75, + 82, 92, 63, 59, 58, 65, 73, 84, 92, 105, + /* Size 16x16 */ + 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 54, 58, 61, 65, 31, 32, + 32, 32, 32, 32, 34, 35, 38, 40, 42, 46, 51, 55, 58, 62, 31, 32, 32, 32, + 32, 32, 33, 34, 37, 38, 41, 44, 49, 53, 56, 59, 31, 32, 32, 33, 33, 33, + 35, 36, 38, 40, 42, 45, 49, 53, 56, 59, 32, 32, 32, 33, 34, 34, 36, 37, + 39, 40, 42, 45, 49, 53, 55, 59, 32, 32, 32, 33, 34, 35, 37, 38, 40, 41, + 42, 46, 49, 52, 55, 58, 34, 34, 33, 35, 36, 37, 39, 42, 44, 46, 47, 51, + 54, 57, 60, 63, 36, 35, 34, 36, 37, 38, 42, 48, 50, 52, 54, 57, 60, 63, + 65, 68, 38, 38, 37, 38, 39, 40, 44, 50, 52, 54, 57, 60, 64, 67, 69, 72, + 41, 40, 38, 40, 40, 41, 46, 52, 54, 57, 60, 63, 67, 70, 73, 75, 44, 42, + 41, 42, 42, 42, 47, 54, 57, 60, 63, 67, 71, 74, 77, 79, 48, 46, 44, 45, + 45, 46, 51, 57, 60, 63, 67, 71, 76, 79, 82, 85, 54, 51, 49, 49, 49, 49, + 54, 60, 64, 67, 71, 76, 82, 86, 89, 92, 58, 55, 53, 53, 53, 52, 57, 63, + 67, 70, 74, 79, 86, 90, 93, 97, 61, 58, 56, 56, 55, 55, 60, 65, 69, 73, + 77, 82, 89, 93, 97, 101, 65, 62, 59, 59, 59, 58, 63, 68, 72, 75, 79, 85, + 92, 97, 101, 105, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, + 41, 44, 44, 47, 48, 50, 54, 54, 58, 59, 61, 65, 65, 70, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42, 46, + 47, 49, 52, 52, 56, 57, 59, 63, 63, 67, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42, 45, 46, 48, 51, 51, + 55, 56, 58, 62, 62, 67, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 35, 35, 37, 38, 39, 42, 42, 45, 45, 47, 50, 50, 54, 55, 57, 61, + 61, 65, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 37, 37, 38, 41, 41, 44, 44, 46, 49, 49, 53, 54, 56, 59, 59, 64, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 37, 38, 41, + 41, 44, 44, 46, 49, 49, 53, 54, 56, 59, 59, 64, 31, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, + 49, 49, 53, 54, 56, 59, 59, 63, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, + 34, 35, 35, 36, 36, 36, 38, 39, 40, 42, 42, 45, 45, 47, 50, 50, 53, 54, + 56, 59, 59, 63, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36, + 37, 37, 39, 39, 40, 42, 42, 45, 45, 47, 49, 49, 53, 54, 55, 59, 59, 63, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, + 41, 42, 42, 45, 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 32, 32, 32, 32, + 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 42, 42, 45, + 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 33, 33, 33, 33, 33, 33, 34, 35, + 35, 36, 36, 38, 39, 40, 42, 42, 43, 44, 45, 46, 46, 49, 50, 51, 53, 53, + 56, 57, 59, 62, 62, 66, 34, 34, 34, 
34, 33, 33, 35, 35, 36, 37, 37, 39, + 39, 41, 42, 42, 44, 45, 46, 47, 47, 50, 51, 52, 54, 54, 57, 58, 60, 63, + 63, 67, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42, 45, 45, + 46, 47, 48, 50, 50, 52, 53, 54, 56, 56, 59, 60, 62, 65, 65, 69, 36, 35, + 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 50, 50, 52, 54, + 54, 56, 57, 58, 60, 60, 63, 64, 65, 68, 68, 72, 36, 35, 35, 35, 34, 34, + 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 50, 50, 52, 54, 54, 56, 57, 58, + 60, 60, 63, 64, 65, 68, 68, 72, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, + 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, + 69, 72, 72, 76, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44, 45, 47, + 50, 50, 53, 54, 55, 58, 58, 60, 61, 62, 65, 65, 68, 69, 70, 73, 73, 77, + 41, 40, 40, 39, 38, 38, 40, 40, 40, 41, 41, 45, 46, 48, 52, 52, 54, 55, + 57, 60, 60, 62, 63, 65, 67, 67, 70, 71, 73, 75, 75, 79, 44, 42, 42, 42, + 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66, + 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 42, 41, 41, 42, 42, + 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, + 74, 75, 77, 79, 79, 83, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 49, + 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 70, 72, 75, 75, 78, 79, 81, 84, + 84, 88, 48, 47, 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57, + 60, 61, 63, 67, 67, 70, 71, 73, 76, 76, 79, 80, 82, 85, 85, 89, 50, 49, + 48, 47, 46, 46, 47, 47, 47, 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68, + 68, 72, 73, 75, 78, 78, 82, 83, 85, 88, 88, 92, 54, 52, 51, 50, 49, 49, + 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, + 82, 82, 86, 87, 89, 92, 92, 96, 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, + 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, + 89, 92, 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, + 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, + 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57, 58, 60, 64, 64, 68, 69, + 71, 75, 75, 79, 80, 83, 87, 87, 91, 92, 94, 98, 98, 102, 61, 59, 58, 57, + 56, 56, 56, 56, 55, 55, 55, 59, 60, 62, 65, 65, 69, 70, 73, 77, 77, 81, + 82, 85, 89, 89, 93, 94, 97, 101, 101, 105, 65, 63, 62, 61, 59, 59, 59, + 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, + 92, 97, 98, 101, 105, 105, 109, 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, + 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, + 101, 105, 105, 109, 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67, + 69, 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105, 109, + 109, 114, + /* Size 4x8 */ + 32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59, 38, 40, + 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97, + /* Size 8x4 */ + 32, 32, 32, 34, 38, 44, 50, 61, 32, 33, 35, 37, 40, 45, 50, 58, 42, 41, + 42, 50, 58, 66, 71, 79, 56, 53, 52, 59, 68, 78, 86, 97, + /* Size 8x16 */ + 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 51, 62, 31, 32, + 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59, 32, 32, 34, 36, + 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 34, 34, 37, 41, 44, 48, + 54, 63, 36, 34, 38, 46, 50, 54, 60, 68, 38, 37, 40, 47, 52, 57, 64, 72, + 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, 48, 45, + 46, 54, 60, 67, 76, 85, 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, + 67, 74, 86, 97, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, + 92, 105, + /* Size 16x8 */ + 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 
53, 57, 61, 65, 31, 32, + 32, 32, 32, 33, 34, 34, 37, 39, 41, 45, 49, 53, 56, 60, 32, 32, 33, 34, + 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 35, 35, 34, 35, 36, 37, + 41, 46, 47, 49, 51, 54, 57, 60, 63, 66, 39, 38, 37, 38, 39, 40, 44, 50, + 52, 54, 57, 60, 64, 67, 69, 72, 44, 42, 41, 42, 42, 42, 48, 54, 57, 60, + 63, 67, 71, 74, 77, 79, 53, 51, 49, 49, 49, 49, 54, 60, 64, 67, 71, 76, + 82, 86, 89, 92, 65, 62, 59, 59, 58, 58, 63, 68, 72, 76, 79, 85, 92, 97, + 100, 105, + /* Size 16x32 */ + 32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58, 65, 65, 31, 32, + 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56, 63, 63, 31, 32, 32, 32, + 32, 32, 35, 35, 38, 42, 42, 49, 51, 55, 62, 62, 31, 32, 32, 32, 32, 32, + 34, 35, 37, 41, 41, 48, 50, 54, 61, 61, 31, 32, 32, 32, 33, 33, 34, 34, + 37, 41, 41, 47, 49, 53, 59, 59, 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, + 41, 47, 49, 53, 59, 59, 31, 32, 32, 33, 34, 34, 35, 36, 38, 42, 42, 48, + 49, 53, 59, 59, 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 48, 50, 53, + 59, 59, 32, 32, 32, 33, 34, 34, 36, 37, 39, 42, 42, 48, 49, 53, 58, 58, + 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, 32, 32, + 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, 33, 33, 33, 35, + 36, 36, 40, 41, 43, 46, 46, 52, 53, 56, 62, 62, 34, 34, 34, 35, 37, 37, + 41, 42, 44, 48, 48, 53, 54, 57, 63, 63, 34, 34, 34, 35, 37, 37, 43, 44, + 46, 50, 50, 55, 56, 59, 65, 65, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, + 54, 58, 60, 63, 68, 68, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, + 60, 63, 68, 68, 38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67, + 72, 72, 39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68, 73, 73, + 41, 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70, 76, 76, 44, 41, + 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 44, 41, 41, 42, + 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 47, 44, 44, 44, 45, 45, + 53, 56, 59, 66, 66, 73, 75, 78, 84, 84, 48, 45, 45, 45, 46, 46, 54, 56, + 60, 67, 67, 74, 76, 79, 85, 85, 50, 47, 46, 47, 47, 47, 55, 58, 61, 68, + 68, 76, 78, 82, 88, 88, 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, + 82, 86, 92, 92, 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, + 92, 92, 57, 54, 53, 53, 53, 53, 60, 63, 67, 74, 74, 83, 86, 90, 97, 97, + 58, 55, 54, 54, 54, 54, 61, 63, 68, 75, 75, 84, 87, 91, 98, 98, 61, 57, + 56, 56, 56, 56, 63, 65, 69, 77, 77, 86, 89, 93, 100, 100, 65, 61, 60, + 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, 65, 61, 60, 59, + 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, 70, 65, 64, 63, 62, + 62, 70, 72, 76, 83, 83, 93, 96, 101, 109, 109, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, + 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39, 41, 41, 44, + 45, 47, 50, 50, 54, 55, 57, 61, 61, 65, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49, + 53, 54, 56, 60, 60, 64, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, + 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, 50, 50, 53, 54, 56, 59, + 59, 63, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, + 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 32, 32, + 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, + 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34, + 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, + 57, 57, 60, 61, 63, 66, 66, 70, 
36, 35, 35, 35, 34, 34, 36, 36, 37, 38, + 38, 41, 42, 44, 48, 48, 50, 50, 51, 53, 53, 56, 56, 58, 60, 60, 63, 63, + 65, 68, 68, 72, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, + 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, + 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, + 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 41, + 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, + 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 51, 49, 49, 48, 47, 47, 48, 48, + 48, 48, 48, 52, 53, 55, 58, 58, 62, 63, 66, 69, 69, 73, 74, 76, 79, 79, + 83, 84, 86, 89, 89, 93, 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, + 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, + 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63, + 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, 65, 63, + 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, + 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109, 65, 63, 62, 61, 59, + 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, + 88, 92, 92, 97, 98, 100, 105, 105, 109, + /* Size 4x16 */ + 31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53, 32, 34, + 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63, 37, 40, 57, 67, + 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79, 50, 50, 71, 86, 54, 53, + 74, 90, 57, 56, 77, 93, 61, 58, 79, 97, + /* Size 16x4 */ + 31, 32, 32, 32, 32, 32, 34, 35, 37, 39, 41, 45, 50, 54, 57, 61, 32, 32, + 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 44, 42, 41, 42, + 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79, 58, 55, 53, 53, 53, 52, + 57, 63, 67, 70, 74, 79, 86, 90, 93, 97, + /* Size 8x32 */ + 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 52, 63, 31, 32, + 32, 35, 38, 42, 51, 62, 31, 32, 32, 34, 37, 41, 50, 61, 31, 32, 33, 34, + 37, 41, 49, 59, 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, + 49, 59, 32, 32, 34, 36, 38, 42, 50, 59, 32, 32, 34, 36, 39, 42, 49, 58, + 32, 33, 35, 37, 40, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 33, 33, + 36, 40, 43, 46, 53, 62, 34, 34, 37, 41, 44, 48, 54, 63, 34, 34, 37, 43, + 46, 50, 56, 65, 36, 34, 38, 46, 50, 54, 60, 68, 36, 34, 38, 46, 50, 54, + 60, 68, 38, 37, 40, 47, 52, 57, 64, 72, 39, 37, 40, 48, 53, 58, 65, 73, + 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, 44, 41, + 43, 51, 57, 63, 71, 79, 47, 44, 45, 53, 59, 66, 75, 84, 48, 45, 46, 54, + 60, 67, 76, 85, 50, 46, 47, 55, 61, 68, 78, 88, 53, 49, 50, 57, 64, 71, + 82, 92, 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97, + 58, 54, 54, 61, 68, 75, 87, 98, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, + 58, 66, 72, 79, 92, 105, 65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62, + 70, 76, 83, 96, 109, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, + 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, + 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, 32, 32, 32, 32, 33, 33, 34, 34, + 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, + 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, + 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63, 66, + 66, 70, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, + 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, 44, 42, + 42, 41, 41, 41, 42, 42, 42, 42, 42, 
46, 48, 50, 54, 54, 57, 58, 60, 63, + 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 53, 52, 51, 50, 49, 49, + 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, + 82, 82, 86, 87, 89, 92, 92, 96, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, + 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, + 100, 105, 105, 109 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 41, 46, 51, 41, 48, 48, 51, 46, 48, 58, 62, 51, 51, 62, 71, + /* Size 8x8 */ + 31, 31, 38, 44, 47, 48, 50, 55, 31, 32, 40, 44, 45, 46, 47, 52, 38, 40, + 47, 47, 46, 46, 47, 50, 44, 44, 47, 50, 51, 51, 52, 54, 47, 45, 46, 51, + 54, 56, 57, 60, 48, 46, 46, 51, 56, 61, 63, 66, 50, 47, 47, 52, 57, 63, + 66, 70, 55, 52, 50, 54, 60, 66, 70, 76, + /* Size 16x16 */ + 32, 31, 30, 33, 34, 36, 41, 49, 48, 49, 49, 50, 52, 54, 55, 57, 31, 31, + 31, 34, 36, 38, 42, 47, 47, 47, 47, 48, 50, 51, 53, 54, 30, 31, 32, 34, + 37, 40, 42, 46, 45, 45, 45, 46, 47, 49, 50, 52, 33, 34, 34, 37, 40, 42, + 44, 47, 46, 46, 45, 46, 47, 49, 50, 51, 34, 36, 37, 40, 42, 45, 46, 47, + 46, 46, 45, 46, 47, 48, 49, 50, 36, 38, 40, 42, 45, 47, 47, 48, 47, 46, + 45, 46, 47, 48, 49, 50, 41, 42, 42, 44, 46, 47, 48, 50, 50, 49, 49, 50, + 50, 51, 52, 53, 49, 47, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 55, + 56, 56, 48, 47, 45, 46, 46, 47, 50, 53, 54, 54, 55, 56, 57, 58, 58, 59, + 49, 47, 45, 46, 46, 46, 49, 53, 54, 55, 57, 58, 59, 60, 60, 61, 49, 47, + 45, 45, 45, 45, 49, 53, 55, 57, 58, 60, 61, 62, 63, 63, 50, 48, 46, 46, + 46, 46, 50, 54, 56, 58, 60, 61, 63, 65, 66, 67, 52, 50, 47, 47, 47, 47, + 50, 54, 57, 59, 61, 63, 66, 68, 69, 70, 54, 51, 49, 49, 48, 48, 51, 55, + 58, 60, 62, 65, 68, 70, 71, 73, 55, 53, 50, 50, 49, 49, 52, 56, 58, 60, + 63, 66, 69, 71, 73, 74, 57, 54, 52, 51, 50, 50, 53, 56, 59, 61, 63, 67, + 70, 73, 74, 76, + /* Size 32x32 */ + 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 48, 48, + 49, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31, + 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 47, 47, 47, 47, 47, 48, + 49, 49, 50, 50, 52, 52, 53, 55, 55, 57, 31, 31, 31, 31, 31, 31, 34, 34, + 36, 38, 38, 41, 42, 44, 47, 47, 47, 47, 47, 47, 47, 48, 48, 49, 50, 50, + 51, 52, 53, 54, 54, 56, 31, 31, 31, 31, 31, 31, 34, 35, 36, 39, 39, 41, + 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, + 53, 55, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, + 45, 45, 45, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 52, 52, 54, 30, 31, + 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, 45, 45, 45, 45, + 45, 46, 46, 47, 47, 47, 49, 49, 50, 52, 52, 54, 33, 33, 34, 34, 34, 34, + 37, 38, 40, 42, 42, 44, 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, + 47, 47, 49, 49, 50, 51, 51, 53, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43, + 43, 44, 45, 46, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, + 50, 51, 51, 53, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46, + 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, + 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46, + 46, 45, 45, 46, 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 36, 38, 38, 39, + 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 46, + 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 40, 41, 41, 41, 42, 42, 44, 44, + 45, 47, 47, 48, 48, 49, 50, 50, 49, 49, 49, 48, 48, 49, 49, 49, 49, 49, + 51, 51, 51, 52, 52, 54, 41, 42, 42, 42, 42, 42, 44, 45, 46, 47, 47, 48, + 48, 49, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 53, + 53, 55, 44, 44, 44, 
44, 44, 44, 45, 46, 46, 47, 47, 49, 49, 50, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 53, 53, 54, 54, 54, 56, 49, 48, + 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, + 53, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56, 58, 49, 48, 47, 47, 46, 46, + 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, + 54, 54, 55, 55, 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, + 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, + 58, 59, 59, 60, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, + 53, 53, 54, 54, 55, 55, 55, 56, 56, 57, 57, 57, 58, 58, 59, 60, 60, 61, + 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 55, + 55, 57, 57, 57, 58, 58, 59, 59, 60, 60, 60, 61, 61, 63, 49, 47, 47, 46, + 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59, + 60, 60, 61, 61, 62, 62, 63, 63, 63, 65, 49, 47, 47, 46, 45, 45, 45, 45, + 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61, + 62, 62, 63, 63, 63, 65, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, + 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 61, 62, 63, 63, 64, 64, 65, 66, + 66, 67, 50, 49, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, + 56, 56, 58, 60, 60, 61, 61, 62, 63, 63, 65, 65, 66, 67, 67, 68, 51, 49, + 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 51, 54, 54, 56, 57, 58, 60, + 60, 62, 62, 63, 65, 65, 66, 66, 67, 68, 68, 70, 52, 50, 50, 49, 47, 47, + 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, + 66, 66, 68, 68, 69, 70, 70, 72, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, + 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 68, 68, + 69, 70, 70, 72, 54, 52, 51, 50, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, + 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, + 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52, 53, 55, 55, 58, 58, + 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 72, 73, 73, 75, 55, 53, 53, 52, + 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60, 63, 63, 65, + 66, 67, 69, 69, 71, 72, 73, 74, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51, + 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, + 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52, + 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76, + 76, 78, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56, 58, 58, + 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78, 80, + /* Size 4x8 */ + 31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53, 46, 47, + 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73, + /* Size 8x4 */ + 31, 32, 39, 44, 46, 47, 48, 53, 38, 40, 47, 47, 47, 46, 47, 50, 47, 45, + 45, 51, 56, 59, 61, 64, 52, 49, 48, 53, 58, 64, 68, 73, + /* Size 8x16 */ + 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54, 30, 32, + 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51, 35, 37, 44, 46, + 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, 42, 43, 47, 49, 50, 49, + 50, 53, 49, 46, 48, 52, 53, 53, 54, 57, 48, 46, 47, 51, 54, 55, 57, 59, + 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, + 46, 52, 56, 59, 64, 67, 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, + 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, + 70, 76, + /* Size 16x8 */ + 32, 31, 30, 33, 35, 37, 42, 49, 48, 48, 49, 50, 52, 54, 55, 57, 31, 31, + 32, 35, 37, 40, 43, 46, 46, 45, 45, 46, 48, 49, 51, 52, 37, 38, 40, 42, + 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 45, 45, 
44, 46, 46, 47, + 49, 52, 51, 51, 51, 52, 53, 54, 54, 55, 48, 47, 45, 46, 46, 47, 50, 53, + 54, 54, 55, 56, 57, 58, 58, 59, 49, 47, 45, 45, 45, 45, 49, 53, 55, 57, + 58, 59, 61, 62, 63, 64, 52, 50, 48, 47, 47, 47, 50, 54, 57, 59, 61, 64, + 66, 68, 69, 70, 57, 54, 52, 51, 51, 50, 53, 57, 59, 61, 64, 67, 71, 73, + 74, 76, + /* Size 16x32 */ + 32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31, 31, + 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31, 31, 34, + 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32, 34, 39, 39, + 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35, 40, 40, 44, 46, + 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40, 40, 44, 46, 45, 45, + 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42, 46, 47, 46, 45, 45, 47, + 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46, 47, 46, 46, 46, 47, 47, 49, + 51, 51, 35, 37, 37, 40, 44, 44, 46, 47, 46, 45, 45, 47, 47, 48, 51, 51, + 37, 39, 40, 43, 47, 47, 47, 47, 47, 45, 45, 46, 47, 48, 50, 50, 37, 39, + 40, 43, 47, 47, 47, 47, 47, 45, 45, 46, 47, 48, 50, 50, 41, 42, 42, 44, + 47, 47, 49, 49, 49, 48, 48, 49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47, + 49, 50, 50, 49, 49, 50, 50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51, + 51, 51, 51, 52, 52, 53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, + 53, 54, 54, 55, 57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, + 54, 55, 57, 57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58, + 59, 59, 48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60, + 48, 46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46, + 45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45, 45, + 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46, 46, 46, + 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46, 46, 52, 54, + 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47, 52, 54, 56, 60, + 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53, 54, 57, 61, 61, 65, + 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54, 57, 61, 61, 65, 66, 68, + 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58, 62, 62, 67, 68, 70, 73, 73, + 54, 51, 50, 49, 49, 49, 54, 55, 58, 62, 62, 67, 68, 70, 73, 73, 55, 51, + 51, 50, 49, 49, 54, 56, 58, 63, 63, 68, 69, 71, 74, 74, 57, 53, 52, 51, + 50, 50, 55, 56, 59, 64, 64, 69, 70, 73, 76, 76, 57, 53, 52, 51, 50, 50, + 55, 56, 59, 64, 64, 69, 70, 73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58, + 61, 65, 65, 70, 72, 74, 78, 78, + /* Size 32x16 */ + 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, + 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31, + 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, + 47, 48, 48, 48, 50, 51, 51, 53, 53, 55, 31, 31, 31, 32, 32, 32, 35, 36, + 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, + 49, 50, 51, 52, 52, 54, 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44, + 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 51, + 51, 53, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, + 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 37, 38, + 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, + 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44, + 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, + 53, 53, 54, 54, 54, 55, 55, 57, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, + 47, 49, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55, + 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 
46, 47, 47, 49, 50, 51, + 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, + 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, + 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 49, 47, 47, 46, + 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, + 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 51, 50, 49, 48, 47, 47, 47, 47, + 47, 46, 46, 49, 50, 52, 54, 54, 56, 57, 58, 61, 61, 62, 63, 64, 65, 65, + 67, 67, 68, 69, 69, 70, 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50, + 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70, + 70, 72, 54, 52, 51, 51, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, + 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, 57, 55, + 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, + 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, + 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, + 71, 71, 73, 73, 74, 76, 76, 78, + /* Size 4x16 */ + 31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49, 37, 44, + 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55, 46, 47, 55, 58, + 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65, 48, 47, 61, 68, 50, 48, + 62, 70, 51, 49, 63, 71, 53, 50, 64, 73, + /* Size 16x4 */ + 31, 31, 32, 34, 37, 39, 42, 47, 46, 46, 46, 47, 48, 50, 51, 53, 37, 38, + 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 49, 47, 45, 45, + 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, 54, 51, 49, 49, 48, 48, + 51, 55, 58, 60, 62, 65, 68, 70, 71, 73, + /* Size 8x32 */ + 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55, 31, 31, + 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53, 30, 32, 40, 44, + 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, + 47, 51, 33, 36, 43, 46, 46, 46, 47, 51, 35, 37, 44, 46, 46, 45, 47, 51, + 37, 40, 47, 47, 47, 45, 47, 50, 37, 40, 47, 47, 47, 45, 47, 50, 41, 42, + 47, 49, 49, 48, 50, 52, 42, 43, 47, 49, 50, 49, 50, 53, 44, 44, 47, 50, + 51, 51, 52, 54, 49, 46, 48, 52, 53, 53, 54, 57, 49, 46, 48, 52, 53, 53, + 54, 57, 48, 46, 47, 51, 54, 55, 57, 59, 48, 46, 47, 51, 54, 56, 57, 60, + 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, 49, 45, + 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 63, 66, 50, 46, 46, 52, + 56, 59, 64, 67, 51, 47, 47, 52, 56, 60, 65, 68, 52, 48, 47, 53, 57, 61, + 66, 71, 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73, + 54, 50, 49, 54, 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, + 50, 55, 59, 64, 70, 76, 57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57, + 61, 65, 72, 78, + /* Size 32x8 */ + 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, + 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 32, + 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, + 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, 37, 38, 38, 39, 40, 40, 42, 43, + 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, + 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, + 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55, + 55, 57, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, + 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, 49, 47, + 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, + 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 52, 50, 50, 49, 48, 48, + 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 
64, 65, + 66, 66, 68, 68, 69, 70, 70, 72, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, + 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, + 74, 76, 76, 78 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 38, 51, 32, 35, 40, 49, 38, 40, 54, 64, 51, 49, 64, 81, + /* Size 8x8 */ + 31, 32, 32, 34, 35, 41, 47, 53, 32, 32, 32, 33, 34, 40, 44, 50, 32, 32, + 34, 35, 37, 41, 45, 51, 34, 33, 35, 39, 42, 47, 51, 55, 35, 34, 37, 42, + 48, 53, 57, 61, 41, 40, 41, 47, 53, 60, 65, 70, 47, 44, 45, 51, 57, 65, + 71, 77, 53, 50, 51, 55, 61, 70, 77, 85, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 31, 32, + 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 31, 32, 32, 32, + 32, 32, 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 31, 32, 32, 32, 32, 33, + 33, 34, 35, 36, 38, 41, 44, 45, 49, 54, 31, 32, 32, 32, 33, 34, 34, 35, + 36, 38, 39, 42, 45, 46, 50, 54, 32, 32, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 42, 45, 46, 49, 53, 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 44, + 47, 48, 51, 55, 34, 34, 33, 34, 35, 37, 38, 39, 42, 44, 45, 47, 50, 51, + 54, 58, 36, 35, 34, 35, 36, 38, 40, 42, 48, 50, 50, 54, 56, 57, 60, 64, + 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 52, 56, 58, 60, 63, 67, 39, 38, + 37, 38, 39, 40, 42, 45, 50, 52, 54, 58, 60, 62, 65, 69, 44, 42, 41, 41, + 42, 42, 44, 47, 54, 56, 58, 63, 66, 68, 71, 75, 47, 45, 44, 44, 45, 45, + 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 49, 47, 46, 45, 46, 46, 48, 51, + 57, 60, 62, 68, 71, 73, 77, 81, 54, 51, 50, 49, 50, 49, 51, 54, 60, 63, + 65, 71, 75, 77, 82, 87, 59, 56, 54, 54, 54, 53, 55, 58, 64, 67, 69, 75, + 79, 81, 87, 92, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 36, + 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 54, 55, 59, 59, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 39, 39, 41, + 43, 43, 46, 47, 48, 51, 52, 53, 57, 57, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42, 43, 45, 46, + 47, 51, 51, 53, 56, 56, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42, 42, 45, 46, 47, 51, 51, 52, + 56, 56, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, + 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 50, 51, 54, 54, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 36, 37, + 37, 40, 41, 41, 44, 44, 45, 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 36, 38, 38, 40, 41, 41, + 44, 45, 45, 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 34, 34, 34, 35, 35, 35, 36, 36, 38, 39, 39, 41, 42, 42, 44, 45, 46, 49, + 50, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, + 35, 36, 36, 36, 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 50, 51, 54, 54, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, + 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, + 32, 32, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42, + 42, 43, 45, 46, 46, 49, 49, 50, 53, 53, 32, 32, 32, 32, 32, 32, 33, 34, + 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42, 42, 43, 45, 46, + 46, 49, 49, 50, 53, 53, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, + 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 44, 45, 47, 47, 48, 51, 51, 52, + 55, 55, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39, 41, + 42, 42, 44, 45, 45, 47, 47, 48, 50, 51, 51, 54, 54, 55, 58, 58, 34, 34, + 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 
39, 39, 41, 42, 42, 44, 45, + 45, 47, 47, 48, 50, 51, 51, 54, 54, 55, 58, 58, 35, 34, 34, 34, 34, 34, + 34, 35, 36, 36, 37, 37, 39, 41, 41, 43, 45, 45, 47, 47, 47, 49, 50, 51, + 53, 53, 54, 57, 57, 58, 61, 61, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, + 38, 38, 40, 42, 42, 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, + 60, 61, 64, 64, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, + 42, 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, 60, 61, 64, 64, + 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47, 50, 50, + 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 39, 39, 38, 38, + 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56, + 58, 58, 60, 61, 62, 64, 65, 66, 69, 69, 39, 39, 38, 38, 37, 37, 38, 39, + 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56, 58, 58, 60, 61, + 62, 64, 65, 66, 69, 69, 42, 41, 41, 41, 40, 40, 40, 41, 41, 41, 42, 42, + 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 61, 62, 64, 65, 66, 69, 69, 70, + 73, 73, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, + 54, 54, 56, 58, 58, 61, 63, 64, 66, 67, 68, 71, 71, 72, 75, 75, 44, 43, + 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45, 48, 48, 51, 54, 54, 56, 58, + 58, 62, 64, 64, 66, 67, 68, 71, 72, 73, 76, 76, 47, 46, 45, 45, 44, 44, + 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, + 69, 70, 71, 74, 75, 76, 79, 79, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, + 46, 46, 47, 51, 51, 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 72, 75, + 76, 77, 80, 80, 49, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 51, + 51, 54, 57, 57, 60, 62, 62, 66, 68, 68, 71, 72, 73, 77, 77, 78, 81, 81, + 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51, 54, 54, 57, 59, 59, + 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 81, 83, 86, 86, 54, 52, 51, 51, + 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, + 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 55, 53, 53, 52, 51, 50, 50, 51, + 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73, 76, 77, + 78, 83, 83, 85, 88, 88, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, + 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, + 92, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, + 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92, + /* Size 4x8 */ + 32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54, 35, 38, + 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83, + /* Size 8x4 */ + 32, 32, 32, 34, 35, 40, 46, 52, 32, 33, 34, 37, 38, 42, 46, 51, 37, 36, + 38, 44, 49, 55, 59, 64, 52, 49, 49, 54, 60, 69, 76, 83, + /* Size 8x16 */ + 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, + 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 34, 34, + 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44, + 47, 51, 34, 34, 36, 38, 42, 48, 50, 54, 36, 34, 37, 40, 48, 54, 56, 60, + 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, 44, 41, + 42, 45, 53, 63, 66, 71, 47, 44, 45, 47, 56, 66, 69, 75, 49, 46, 47, 48, + 57, 67, 71, 77, 53, 49, 50, 51, 60, 71, 75, 82, 58, 54, 54, 55, 63, 75, + 79, 87, + /* Size 16x8 */ + 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 31, 32, + 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 32, 32, 32, 33, + 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 32, 33, 33, 33, 34, 36, + 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 36, 35, 35, 35, 36, 38, 40, 42, + 48, 49, 50, 53, 56, 57, 60, 63, 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, + 58, 
63, 66, 67, 71, 75, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, + 69, 71, 75, 79, 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, + 82, 87, + /* Size 16x32 */ + 32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31, 32, + 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32, 32, 32, + 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, 32, 32, + 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, 32, 32, 33, 34, + 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32, 33, 33, 34, 34, 36, + 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33, 33, 35, 35, 36, 41, 41, + 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 45, 49, + 49, 54, 32, 32, 32, 33, 34, 34, 34, 36, 36, 38, 42, 42, 45, 50, 50, 54, + 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 42, 42, 45, 49, 49, 54, 32, 32, + 33, 33, 35, 35, 36, 38, 38, 39, 42, 42, 45, 49, 49, 53, 32, 32, 33, 33, + 35, 35, 36, 38, 38, 39, 42, 42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36, + 36, 39, 40, 41, 44, 44, 47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42, + 42, 44, 48, 48, 50, 54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, + 48, 48, 50, 54, 54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50, + 53, 57, 57, 61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, + 60, 64, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, + 38, 37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38, + 37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37, 38, + 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40, 42, 42, + 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42, 43, 45, 52, + 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43, 45, 52, 54, 56, + 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47, 54, 56, 58, 66, 66, + 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55, 56, 59, 67, 67, 70, 76, + 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57, 60, 67, 67, 71, 77, 77, 81, + 53, 50, 49, 49, 49, 49, 51, 58, 59, 62, 71, 71, 74, 81, 81, 86, 53, 51, + 49, 49, 50, 50, 51, 59, 60, 63, 71, 71, 75, 82, 82, 87, 55, 52, 51, 51, + 51, 51, 53, 60, 61, 64, 72, 72, 76, 83, 83, 88, 58, 55, 54, 54, 54, 54, + 55, 62, 63, 67, 75, 75, 79, 87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62, + 63, 67, 75, 75, 79, 87, 87, 92, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, + 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, + 42, 42, 45, 46, 47, 50, 51, 52, 55, 55, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, + 46, 49, 49, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 41, 41, 44, 45, 46, 49, 49, 51, + 54, 54, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, + 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 40, 40, + 40, 42, 43, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33, + 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, + 47, 48, 48, 51, 51, 53, 55, 55, 35, 35, 35, 35, 34, 34, 35, 36, 36, 37, + 38, 38, 39, 42, 42, 44, 47, 47, 48, 49, 49, 51, 52, 52, 54, 55, 56, 58, + 59, 60, 62, 62, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, + 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, + 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 46, 49, 49, 
+ 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 44, 43, 42, 42, + 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, + 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 44, 43, 42, 42, 41, 41, 41, 42, + 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, + 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45, + 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, + 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, + 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 53, 52, + 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, + 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 59, 57, 56, 56, 54, 54, + 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, + 79, 80, 81, 86, 87, 88, 92, 92, + /* Size 4x16 */ + 31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49, 32, 34, + 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54, 35, 38, 49, 60, + 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71, 45, 45, 58, 75, 47, 47, + 60, 77, 51, 50, 63, 82, 55, 54, 67, 87, + /* Size 16x4 */ + 31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 55, 32, 32, + 32, 33, 34, 35, 36, 37, 38, 40, 40, 43, 45, 47, 50, 54, 38, 37, 36, 36, + 38, 39, 41, 44, 49, 51, 52, 56, 58, 60, 63, 67, 53, 51, 49, 49, 50, 49, + 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, + /* Size 8x32 */ + 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52, 31, 32, + 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, + 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49, 31, 32, 33, 33, 35, 41, + 44, 49, 32, 32, 33, 34, 36, 42, 45, 49, 32, 32, 34, 34, 36, 42, 45, 50, + 32, 32, 34, 35, 37, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, + 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, + 42, 48, 50, 54, 34, 34, 36, 38, 42, 48, 50, 54, 35, 34, 37, 39, 45, 50, + 53, 57, 36, 34, 37, 40, 48, 54, 56, 60, 36, 34, 37, 40, 48, 54, 56, 60, + 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, 39, 37, + 40, 42, 50, 58, 60, 65, 42, 40, 42, 44, 52, 61, 64, 69, 44, 41, 42, 45, + 53, 63, 66, 71, 44, 41, 43, 45, 54, 63, 66, 72, 47, 44, 45, 47, 56, 66, + 69, 75, 48, 45, 46, 48, 56, 67, 70, 76, 49, 46, 47, 48, 57, 67, 71, 77, + 53, 49, 49, 51, 59, 71, 74, 81, 53, 49, 50, 51, 60, 71, 75, 82, 55, 51, + 51, 53, 61, 72, 76, 83, 58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55, + 63, 75, 79, 87, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, + 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, + 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, 32, 32, 33, 33, + 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, + 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, + 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53, + 55, 55, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, + 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, 44, 43, + 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, + 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, + 44, 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, + 69, 70, 71, 74, 75, 76, 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, + 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, + 82, 83, 87, 87 
}, + { /* Chroma */ + /* Size 4x4 */ + 31, 38, 47, 49, 38, 47, 46, 46, 47, 46, 54, 57, 49, 46, 57, 66, + /* Size 8x8 */ + 31, 31, 35, 42, 48, 47, 49, 51, 31, 32, 36, 42, 46, 45, 46, 48, 35, 36, + 41, 45, 47, 45, 46, 48, 42, 42, 45, 48, 50, 49, 50, 51, 48, 46, 47, 50, + 53, 53, 54, 54, 47, 45, 45, 49, 53, 57, 59, 60, 49, 46, 46, 50, 54, 59, + 61, 64, 51, 48, 48, 51, 54, 60, 64, 68, + /* Size 16x16 */ + 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 31, 31, + 31, 32, 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 30, 31, 31, 32, + 35, 39, 41, 42, 46, 46, 46, 45, 46, 47, 48, 50, 31, 32, 32, 33, 36, 40, + 41, 43, 46, 46, 45, 45, 46, 46, 47, 49, 33, 34, 35, 36, 39, 43, 44, 45, + 47, 46, 46, 45, 46, 47, 47, 49, 36, 38, 39, 40, 43, 47, 47, 47, 48, 47, + 46, 45, 46, 46, 47, 48, 38, 40, 41, 41, 44, 47, 47, 48, 49, 48, 48, 47, + 47, 47, 48, 49, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50, 49, 49, 50, 50, + 50, 52, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54, 54, 55, + 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 48, 47, + 46, 45, 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 49, 47, 45, 45, + 45, 45, 47, 49, 53, 55, 55, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46, + 47, 50, 54, 55, 56, 59, 61, 61, 63, 64, 51, 48, 47, 46, 47, 46, 47, 50, + 54, 55, 56, 60, 61, 62, 64, 66, 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, + 57, 61, 63, 64, 66, 68, 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, + 64, 66, 68, 71, + /* Size 32x32 */ + 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 36, 36, 38, 41, 41, 45, 49, 49, + 49, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31, + 31, 31, 31, 34, 34, 35, 38, 38, 39, 42, 42, 45, 48, 48, 47, 47, 47, 47, + 47, 47, 49, 49, 49, 50, 50, 51, 53, 53, 31, 31, 31, 31, 31, 31, 32, 34, + 34, 35, 38, 38, 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 49, 50, 50, 52, 52, 31, 31, 31, 31, 31, 31, 32, 34, 34, 36, 38, 38, + 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 46, 47, 48, 48, 48, 49, 49, 50, + 52, 52, 30, 31, 31, 31, 31, 31, 32, 35, 35, 36, 39, 39, 41, 42, 42, 44, + 46, 46, 46, 46, 46, 45, 45, 45, 46, 47, 47, 48, 48, 48, 50, 50, 30, 31, + 31, 31, 31, 32, 32, 35, 35, 36, 40, 40, 41, 42, 42, 44, 46, 46, 46, 45, + 45, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 31, 31, 32, 32, 32, 32, + 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 45, 45, 45, 45, 45, + 46, 46, 46, 47, 47, 48, 49, 49, 33, 34, 34, 34, 35, 35, 35, 38, 38, 40, + 43, 43, 43, 44, 44, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, + 47, 48, 49, 49, 33, 34, 34, 34, 35, 35, 36, 38, 39, 40, 43, 43, 44, 45, + 45, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, + 34, 35, 35, 36, 36, 36, 37, 40, 40, 41, 44, 44, 45, 45, 45, 46, 47, 47, + 47, 46, 46, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 36, 38, 38, 38, + 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, + 45, 45, 46, 46, 46, 46, 47, 47, 48, 48, 36, 38, 38, 38, 39, 40, 40, 43, + 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 45, 46, 46, + 46, 46, 47, 47, 48, 48, 38, 39, 40, 40, 41, 41, 41, 43, 44, 45, 47, 47, + 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 48, 48, 48, + 49, 49, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 49, + 50, 50, 50, 49, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 41, 42, + 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 49, 50, 50, 50, 49, + 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 45, 45, 45, 45, 44, 44, + 44, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, + 52, 
52, 52, 52, 52, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, + 48, 48, 49, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, + 54, 54, 55, 55, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, + 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, + 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, + 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 48, 47, 47, 47, + 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55, + 55, 56, 56, 56, 56, 57, 57, 58, 58, 58, 48, 47, 47, 47, 46, 45, 45, 46, + 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, + 56, 57, 57, 58, 58, 58, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, + 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 60, + 61, 61, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, + 53, 53, 55, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61, 61, 62, 62, 49, 47, + 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, + 56, 58, 58, 59, 59, 60, 60, 61, 61, 62, 63, 63, 50, 49, 48, 48, 46, 46, + 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 59, + 61, 61, 61, 63, 63, 63, 64, 64, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, + 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 62, 63, + 63, 64, 65, 65, 51, 49, 48, 48, 47, 46, 46, 47, 47, 46, 46, 46, 47, 50, + 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 62, 62, 64, 64, 64, 66, 66, + 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 52, 54, 54, + 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 52, 50, 50, 49, + 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, + 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 53, 51, 50, 50, 48, 48, 48, 48, + 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62, 63, 64, + 64, 67, 67, 68, 69, 69, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, + 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, + 71, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53, + 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71, 71, + /* Size 4x8 */ + 31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50, 47, 48, + 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67, + /* Size 8x4 */ + 31, 31, 36, 42, 47, 46, 48, 50, 38, 40, 44, 47, 48, 46, 46, 48, 47, 46, + 47, 50, 53, 54, 55, 56, 50, 48, 47, 50, 54, 60, 64, 67, + /* Size 8x16 */ + 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32, + 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48, 33, 36, 41, 44, + 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47, + 47, 48, 42, 43, 46, 48, 50, 49, 50, 50, 49, 46, 48, 49, 53, 53, 54, 54, + 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, + 45, 47, 53, 58, 59, 61, 50, 46, 46, 48, 54, 59, 61, 63, 51, 47, 47, 48, + 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66, 54, 50, 49, 50, 55, 62, + 65, 68, + /* Size 16x8 */ + 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 31, 31, + 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 35, 37, 38, 38, + 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 38, 40, 40, 41, 44, 47, + 47, 48, 49, 48, 48, 47, 48, 48, 48, 50, 48, 47, 46, 46, 47, 47, 48, 50, + 53, 53, 53, 53, 54, 54, 54, 55, 49, 47, 45, 45, 46, 45, 47, 49, 53, 55, + 56, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, + 61, 61, 63, 65, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, + 66, 68, + /* Size 16x32 */ + 32, 31, 31, 
31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31, 31, + 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31, 31, 32, + 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31, 32, 37, 38, + 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32, 38, 39, 40, 45, + 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38, 40, 41, 45, 46, 46, + 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40, 41, 45, 46, 46, 45, 45, + 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43, 46, 47, 46, 45, 45, 46, 47, + 47, 49, 33, 35, 36, 36, 41, 43, 44, 46, 47, 46, 46, 46, 46, 47, 47, 49, + 34, 36, 37, 37, 42, 44, 45, 47, 47, 47, 45, 45, 46, 47, 47, 49, 37, 39, + 40, 41, 45, 47, 47, 47, 47, 47, 45, 45, 46, 47, 47, 48, 37, 39, 40, 41, + 45, 47, 47, 47, 47, 47, 45, 45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47, + 47, 48, 48, 48, 47, 47, 47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50, + 50, 50, 49, 49, 50, 50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, + 49, 49, 50, 50, 50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51, + 52, 52, 52, 54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, + 54, 55, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, + 48, 47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46, + 46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46, 46, + 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45, 46, 46, + 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45, 46, 47, 52, + 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46, 47, 52, 53, 55, + 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48, 53, 54, 55, 59, 59, + 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53, 54, 55, 59, 59, 61, 64, + 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54, 55, 60, 60, 61, 64, 64, 66, + 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61, 61, 63, 66, 66, 68, 52, 49, + 48, 48, 47, 47, 48, 53, 54, 56, 61, 61, 63, 66, 66, 68, 53, 50, 48, 48, + 48, 48, 49, 54, 54, 56, 61, 61, 63, 67, 67, 69, 54, 51, 50, 50, 49, 49, + 50, 55, 55, 57, 62, 62, 65, 68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55, + 55, 57, 62, 62, 65, 68, 68, 71, + /* Size 32x16 */ + 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, + 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31, + 31, 31, 32, 35, 35, 36, 39, 39, 40, 42, 42, 45, 47, 47, 47, 46, 46, 46, + 46, 46, 47, 48, 48, 49, 49, 50, 51, 51, 31, 31, 31, 31, 32, 32, 33, 35, + 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, + 47, 48, 48, 48, 50, 50, 31, 32, 32, 32, 32, 33, 33, 36, 36, 37, 41, 41, + 42, 43, 43, 45, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, + 50, 50, 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47, + 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 37, 38, + 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 47, + 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41, + 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, + 48, 48, 48, 48, 48, 49, 50, 50, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, + 47, 47, 48, 50, 50, 51, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, + 53, 54, 55, 55, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, + 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, + 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, + 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 49, 48, 47, 47, + 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, + 58, 58, 59, 59, 60, 61, 61, 61, 62, 
62, 49, 48, 47, 47, 45, 45, 45, 45, + 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, + 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, + 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63, + 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, + 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 52, 50, + 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, + 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 54, 53, 52, 52, 50, 50, + 50, 49, 49, 49, 48, 48, 50, 52, 52, 54, 55, 55, 57, 59, 59, 61, 62, 63, + 65, 65, 66, 68, 68, 69, 71, 71, + /* Size 4x16 */ + 31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48, 35, 43, + 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50, 47, 48, 53, 54, + 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61, 47, 46, 55, 63, 48, 47, + 55, 64, 49, 47, 56, 66, 51, 49, 57, 68, + /* Size 16x4 */ + 31, 31, 31, 32, 35, 39, 40, 42, 47, 47, 46, 46, 47, 48, 49, 51, 37, 38, + 39, 40, 43, 47, 47, 47, 48, 47, 47, 46, 46, 47, 47, 49, 48, 47, 46, 46, + 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 52, 50, 48, 48, 47, 47, + 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, + /* Size 8x32 */ + 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50, 31, 31, + 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32, 38, 40, + 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, + 46, 48, 33, 35, 41, 43, 47, 45, 46, 47, 33, 36, 41, 44, 47, 46, 46, 47, + 34, 37, 42, 45, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 37, 40, + 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, + 50, 49, 50, 50, 42, 43, 46, 48, 50, 49, 50, 50, 45, 44, 47, 48, 51, 51, + 52, 52, 49, 46, 48, 49, 53, 53, 54, 54, 49, 46, 48, 49, 53, 53, 54, 54, + 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, 48, 46, + 46, 48, 53, 56, 56, 57, 49, 45, 46, 47, 53, 57, 58, 60, 49, 45, 45, 47, + 53, 58, 59, 61, 49, 45, 46, 47, 53, 58, 60, 61, 50, 46, 46, 48, 54, 59, + 61, 63, 50, 46, 46, 48, 54, 59, 61, 64, 51, 47, 47, 48, 54, 60, 61, 64, + 52, 48, 47, 48, 54, 61, 63, 66, 52, 48, 47, 48, 54, 61, 63, 66, 53, 48, + 48, 49, 54, 61, 63, 67, 54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50, + 55, 62, 65, 68, + /* Size 32x8 */ + 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, + 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31, + 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, + 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, 35, 36, 37, 37, 38, 38, 38, 41, + 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, + 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47, + 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49, + 50, 50, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51, + 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, 49, 48, + 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, + 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, + 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, + 61, 61, 61, 63, 63, 63, 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, + 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, + 66, 67, 68, 68 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 35, 43, 32, 34, 37, 43, 35, 37, 48, 54, 43, 43, 54, 65, + /* Size 8x8 */ + 31, 31, 32, 32, 34, 37, 43, 47, 31, 32, 32, 32, 34, 36, 
41, 44, 32, 32, + 33, 34, 35, 38, 42, 45, 32, 32, 34, 35, 37, 39, 42, 46, 34, 34, 35, 37, + 41, 45, 49, 52, 37, 36, 38, 39, 45, 51, 56, 59, 43, 41, 42, 42, 49, 56, + 63, 67, 47, 44, 45, 46, 52, 59, 67, 71, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31, 32, + 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 31, 32, 32, 32, + 32, 32, 32, 33, 34, 34, 35, 38, 39, 42, 45, 45, 31, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 37, 38, 41, 44, 44, 31, 32, 32, 32, 33, 33, 33, 34, + 35, 36, 36, 39, 40, 42, 44, 45, 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, + 36, 39, 40, 42, 45, 45, 32, 32, 32, 32, 33, 34, 35, 36, 37, 38, 38, 40, + 41, 42, 45, 46, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39, 40, 42, 43, 44, + 47, 47, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47, 50, 51, + 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 36, 35, + 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 39, 38, 38, 37, + 39, 39, 40, 42, 45, 49, 50, 54, 55, 58, 60, 61, 41, 40, 39, 38, 40, 40, + 41, 43, 46, 50, 52, 55, 57, 60, 62, 63, 44, 42, 42, 41, 42, 42, 42, 44, + 47, 52, 54, 58, 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, + 56, 60, 62, 66, 69, 70, 48, 46, 45, 44, 45, 45, 46, 47, 51, 55, 57, 61, + 63, 67, 70, 71, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 34, + 35, 36, 36, 38, 39, 39, 41, 44, 44, 45, 47, 48, 48, 51, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, + 39, 39, 40, 43, 43, 44, 46, 47, 47, 50, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, + 42, 43, 45, 46, 46, 49, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, 42, 43, 45, 46, + 46, 49, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, + 34, 34, 34, 35, 35, 36, 38, 38, 39, 42, 42, 42, 45, 45, 45, 48, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, + 34, 36, 37, 37, 38, 41, 41, 41, 44, 44, 44, 47, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, + 38, 41, 41, 41, 44, 44, 44, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 38, 38, 39, 41, 41, 42, + 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, + 36, 36, 36, 38, 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 31, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 38, + 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 39, 40, 40, 41, 42, + 42, 43, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, + 35, 35, 36, 37, 37, 37, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 46, + 46, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 37, + 37, 37, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 46, 46, 48, 32, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, + 40, 41, 42, 42, 43, 44, 44, 45, 47, 47, 47, 50, 34, 34, 34, 34, 34, 33, + 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45, + 46, 47, 47, 48, 50, 51, 51, 53, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, + 35, 36, 37, 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45, 46, 47, 47, 48, + 50, 51, 51, 53, 34, 34, 34, 34, 
34, 34, 34, 34, 35, 35, 35, 36, 37, 37, + 38, 40, 40, 41, 43, 44, 44, 45, 46, 46, 47, 49, 49, 49, 51, 52, 52, 54, + 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42, 42, 43, + 46, 47, 47, 48, 49, 49, 50, 52, 52, 53, 55, 55, 55, 57, 36, 35, 35, 35, + 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50, + 50, 50, 52, 54, 54, 54, 56, 57, 57, 58, 36, 35, 35, 35, 35, 34, 34, 35, + 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50, 50, 50, 52, 54, + 54, 54, 56, 57, 57, 58, 38, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 39, + 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 52, 52, 54, 56, 56, 57, 58, 59, + 59, 61, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, + 45, 46, 49, 50, 50, 52, 54, 54, 55, 58, 58, 58, 60, 61, 61, 63, 39, 39, + 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, 50, + 50, 52, 54, 54, 55, 58, 58, 58, 60, 61, 61, 63, 41, 40, 40, 40, 39, 38, + 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55, 55, + 57, 60, 60, 60, 62, 63, 63, 66, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, + 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, + 66, 67, 67, 69, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, + 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, + 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43, 43, 45, 48, 48, 49, + 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 67, 68, 68, 70, 47, 46, 45, 45, + 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, + 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 48, 47, 46, 46, 45, 44, 44, 45, + 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, + 67, 68, 70, 71, 71, 74, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45, + 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71, + 71, 74, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48, 50, 53, + 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74, 77, + /* Size 4x8 */ + 31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42, 34, 37, + 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67, + /* Size 8x4 */ + 31, 32, 32, 32, 34, 37, 42, 46, 32, 33, 34, 35, 37, 40, 43, 46, 35, 34, + 36, 38, 43, 49, 53, 56, 43, 41, 42, 42, 49, 56, 63, 67, + /* Size 8x16 */ + 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, + 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 33, 34, + 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, 32, 33, 34, 35, 37, 38, + 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50, + 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, 39, 37, + 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, + 51, 53, 63, 66, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, + 67, 70, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31, 32, + 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 31, 32, 32, 32, + 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 32, 32, 32, 33, 34, 34, + 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 35, 35, 34, 34, 35, 36, 37, 39, + 41, 45, 46, 48, 49, 51, 53, 54, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, + 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, + 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, + 69, 70, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31, 31, + 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32, 32, 32, + 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 
45, 51, 31, 32, 32, 32, 32, 32, + 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32, 32, 32, 32, 33, + 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, + 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 38, + 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, 33, 34, 35, 35, 38, 41, 41, + 44, 49, 31, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 39, 42, 42, 44, 49, + 32, 32, 32, 32, 33, 34, 34, 34, 36, 36, 36, 39, 42, 42, 45, 50, 32, 32, + 32, 32, 33, 34, 34, 34, 36, 36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, + 33, 35, 35, 35, 37, 37, 37, 40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, + 35, 36, 37, 38, 38, 41, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, + 37, 38, 38, 41, 42, 42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40, + 40, 42, 44, 44, 47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, + 48, 48, 50, 54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, + 50, 54, 34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55, + 35, 35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35, + 34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34, 34, + 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36, 37, 40, + 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39, 40, 40, 42, + 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40, 40, 42, 48, 50, + 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41, 43, 49, 51, 51, 56, + 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45, 51, 53, 53, 59, 63, 63, + 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51, 53, 53, 59, 63, 63, 66, 71, + 44, 43, 42, 42, 42, 43, 43, 45, 51, 54, 54, 59, 64, 64, 67, 72, 47, 45, + 44, 44, 44, 45, 45, 47, 53, 56, 56, 61, 66, 66, 69, 75, 48, 46, 45, 45, + 45, 46, 46, 48, 54, 56, 56, 62, 67, 67, 70, 76, 48, 46, 45, 45, 45, 46, + 46, 48, 54, 56, 56, 62, 67, 67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50, + 56, 58, 58, 64, 69, 69, 73, 79, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, + 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, + 38, 38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, + 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, + 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, + 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, 32, 32, + 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, + 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48, 32, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, + 41, 43, 43, 43, 45, 46, 46, 48, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 45, 45, + 47, 48, 48, 50, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, + 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, + 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, + 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 36, 35, 35, 35, + 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, + 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 40, 39, 39, 39, 39, 38, 38, 38, + 39, 39, 39, 40, 41, 41, 42, 45, 45, 46, 50, 51, 51, 53, 54, 54, 56, 59, + 59, 59, 61, 62, 62, 64, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 
42, 42, + 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, + 67, 69, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, + 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 47, 46, + 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, + 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 53, 52, 51, 51, 50, 49, + 49, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 55, 59, 60, 60, 63, 65, 65, + 67, 71, 71, 72, 75, 76, 76, 79, + /* Size 4x16 */ + 31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41, 32, 34, + 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44, 34, 37, 42, 48, + 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58, 40, 41, 51, 60, 42, 43, + 53, 63, 45, 45, 56, 66, 46, 46, 56, 67, + /* Size 16x4 */ + 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 32, 32, + 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 36, 35, 35, 34, + 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42, + 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, + /* Size 8x32 */ + 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46, 31, 32, + 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, + 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 34, + 41, 44, 31, 32, 32, 33, 34, 35, 41, 44, 31, 32, 33, 34, 35, 36, 42, 44, + 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, + 33, 35, 37, 37, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 35, + 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, + 48, 50, 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 42, 43, 49, 51, + 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, 36, 34, + 36, 38, 46, 48, 54, 56, 38, 36, 37, 40, 47, 49, 56, 58, 39, 37, 39, 40, + 48, 50, 58, 60, 39, 37, 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51, + 60, 62, 44, 41, 42, 43, 51, 53, 63, 66, 44, 41, 42, 43, 51, 53, 63, 66, + 44, 42, 42, 43, 51, 54, 64, 67, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, + 45, 46, 54, 56, 67, 70, 48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48, + 56, 58, 69, 73, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, + 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, + 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, + 42, 42, 44, 45, 45, 48, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, + 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, + 46, 48, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41, + 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, 36, 35, + 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, + 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 44, 43, 42, 42, 41, 41, + 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, + 60, 63, 63, 64, 66, 67, 67, 69, 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, + 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, + 69, 70, 70, 73 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 37, 47, 47, 37, 44, 47, 45, 47, 47, 53, 53, 47, 45, 53, 59, + /* Size 8x8 */ + 31, 31, 34, 37, 43, 48, 47, 49, 31, 32, 35, 40, 43, 46, 45, 46, 34, 35, + 39, 43, 45, 46, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 43, 43, 45, 47, + 49, 50, 50, 50, 48, 46, 46, 47, 50, 53, 55, 55, 47, 45, 45, 45, 50, 55, + 58, 60, 49, 46, 
46, 46, 50, 55, 60, 61, + /* Size 16x16 */ + 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 31, 31, + 31, 31, 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 31, 31, 31, 31, + 34, 35, 39, 40, 42, 46, 47, 46, 46, 46, 47, 47, 30, 31, 31, 32, 34, 35, + 40, 41, 42, 45, 46, 45, 45, 45, 46, 46, 33, 34, 34, 34, 37, 38, 42, 43, + 44, 46, 47, 46, 46, 45, 46, 46, 33, 34, 35, 35, 38, 39, 43, 44, 45, 47, + 47, 46, 46, 45, 46, 46, 36, 38, 39, 40, 42, 43, 47, 47, 47, 47, 48, 46, + 46, 45, 46, 46, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48, 49, 48, 47, 47, + 47, 47, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49, 50, 50, + 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 49, 47, + 47, 46, 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 48, 47, 46, 45, + 46, 46, 46, 48, 49, 52, 53, 54, 55, 55, 56, 56, 49, 47, 46, 45, 46, 46, + 46, 47, 49, 52, 53, 55, 55, 57, 57, 58, 49, 47, 46, 45, 45, 45, 45, 47, + 49, 52, 53, 55, 57, 58, 59, 60, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, + 54, 56, 57, 59, 61, 61, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, + 58, 60, 61, 61, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 41, 43, + 47, 49, 49, 49, 48, 48, 49, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31, + 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 42, 43, 47, 48, 48, 48, + 47, 47, 47, 47, 47, 48, 49, 49, 49, 50, 31, 31, 31, 31, 31, 31, 31, 32, + 34, 34, 34, 37, 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 48, 48, 48, 49, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 37, + 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 49, 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, + 42, 43, 46, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 48, 30, 31, + 31, 31, 31, 32, 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 42, 43, 45, 46, + 46, 46, 45, 45, 45, 45, 45, 45, 46, 46, 46, 47, 30, 31, 31, 31, 31, 32, + 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 42, 43, 45, 46, 46, 46, 45, 45, + 45, 45, 45, 45, 46, 46, 46, 47, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, + 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, + 46, 46, 46, 47, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42, + 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, + 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45, + 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 33, 34, 34, 34, + 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45, 47, 47, 47, 46, + 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 35, 36, 37, 37, 37, 38, 38, 38, + 41, 41, 41, 44, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 46, 46, 46, 45, + 45, 45, 46, 46, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, + 47, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 46, 45, 45, 45, 46, 46, + 46, 46, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47, 47, 47, 47, + 47, 47, 47, 48, 48, 47, 46, 46, 46, 45, 45, 45, 46, 46, 46, 46, 38, 39, + 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, + 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 48, 41, 42, 42, 42, 42, 42, + 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, + 49, 49, 49, 49, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, + 45, 46, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, + 50, 50, 50, 50, 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, + 48, 49, 49, 49, 50, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51, + 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 
50, + 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 49, 48, 47, 47, + 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 47, 46, 46, 46, + 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, + 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 55, 55, + 55, 56, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, + 49, 50, 52, 53, 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 56, 57, 48, 47, + 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53, + 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 56, 57, 49, 47, 47, 47, 46, 45, + 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55, 55, + 55, 57, 57, 57, 57, 58, 58, 58, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59, + 59, 60, 60, 60, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59, 59, 60, 60, 60, + 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, + 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 60, 60, 60, 61, 50, 49, 48, 48, + 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, + 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 50, 49, 48, 48, 47, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, + 60, 60, 61, 61, 61, 63, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61, + 61, 63, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46, 48, 50, + 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63, 64, + /* Size 4x8 */ + 31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45, 43, 47, + 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59, + /* Size 8x4 */ + 31, 31, 35, 39, 43, 47, 46, 48, 38, 40, 43, 47, 47, 47, 46, 46, 47, 46, + 47, 47, 50, 53, 53, 54, 48, 45, 46, 45, 50, 55, 58, 59, + /* Size 8x16 */ + 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32, + 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 33, 35, 37, 42, + 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46, 37, 40, 43, 47, 47, 47, + 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50, + 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, 48, 46, + 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, + 51, 53, 58, 59, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, + 59, 61, + /* Size 16x8 */ + 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 31, 31, + 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 33, 34, 34, 35, + 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 37, 38, 39, 40, 42, 43, + 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 45, 45, 45, 44, 46, 46, 47, 48, + 49, 51, 52, 51, 51, 51, 52, 52, 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, + 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, + 57, 58, 59, 59, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, + 61, 61, + /* Size 16x32 */ + 32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31, 31, + 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31, 31, 31, + 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31, 31, 34, 38, + 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32, 34, 39, 39, 40, + 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35, 40, 40, 41, 44, 46, + 
46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40, 40, 41, 44, 46, 46, 45, + 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40, 41, 45, 46, 46, 45, 45, 45, + 46, 48, 33, 34, 35, 35, 37, 42, 42, 43, 46, 47, 47, 46, 45, 45, 46, 47, + 33, 35, 36, 36, 38, 43, 43, 44, 46, 47, 47, 46, 46, 46, 46, 47, 33, 35, + 36, 36, 38, 43, 43, 44, 46, 47, 47, 46, 46, 46, 46, 47, 35, 37, 38, 38, + 41, 45, 45, 46, 47, 47, 47, 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, + 47, 47, 47, 47, 47, 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, + 47, 47, 47, 46, 45, 45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48, + 48, 47, 47, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, + 49, 49, 50, 50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, + 50, 50, 43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51, + 47, 46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47, + 46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46, 46, + 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46, 46, 47, + 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46, 47, 47, 48, + 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47, 47, 48, 51, 53, + 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46, 47, 51, 53, 53, 55, + 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47, 51, 53, 53, 56, 58, 58, + 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51, 53, 53, 56, 58, 58, 59, 61, + 49, 47, 45, 45, 45, 46, 46, 47, 52, 53, 53, 56, 58, 58, 60, 62, 50, 48, + 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 63, 50, 48, 46, 46, + 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 64, 50, 48, 46, 46, 46, 46, + 46, 48, 52, 54, 54, 57, 59, 59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48, + 52, 54, 54, 58, 60, 60, 62, 65, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, + 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46, 47, 47, 47, + 47, 47, 47, 46, 46, 47, 48, 48, 48, 49, 31, 31, 31, 31, 32, 32, 32, 33, + 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, + 45, 45, 46, 46, 46, 47, 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, + 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, + 46, 47, 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, + 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 37, 38, + 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, + 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, + 46, 46, 46, 46, 46, 46, 46, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, + 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, + 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, + 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, + 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, + 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 48, 47, 47, 47, + 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45, 45, 45, + 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 56, + 56, 56, 57, 57, 57, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, + 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, + 59, 60, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, + 49, 50, 53, 53, 53, 55, 
56, 56, 57, 58, 58, 58, 59, 59, 59, 60, 50, 49, + 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, + 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 52, 51, 50, 50, 49, 48, + 48, 48, 47, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, + 59, 61, 61, 62, 63, 64, 64, 65, + /* Size 4x16 */ + 31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45, 34, 42, + 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47, 42, 47, 50, 49, + 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56, 47, 46, 53, 57, 46, 46, + 53, 58, 48, 46, 54, 59, 48, 46, 54, 59, + /* Size 16x4 */ + 31, 31, 31, 31, 34, 35, 39, 40, 42, 46, 47, 47, 47, 46, 48, 48, 37, 38, + 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 48, 47, 46, 46, + 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46, + 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, + /* Size 8x32 */ + 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49, 31, 31, + 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32, 34, 39, + 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 30, 32, 35, 40, 44, 46, + 45, 46, 31, 33, 35, 40, 45, 46, 45, 46, 33, 35, 37, 42, 46, 47, 45, 46, + 33, 36, 38, 43, 46, 47, 46, 46, 33, 36, 38, 43, 46, 47, 46, 46, 35, 38, + 41, 45, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 37, 40, 43, 47, + 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, + 49, 50, 42, 43, 44, 47, 49, 50, 49, 50, 43, 43, 45, 47, 50, 50, 50, 50, + 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, 49, 46, + 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 52, 53, 55, 55, 48, 46, 46, 47, + 51, 53, 56, 56, 48, 46, 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53, + 57, 57, 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 51, 53, 58, 59, + 49, 45, 45, 46, 52, 53, 58, 60, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, + 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47, + 52, 54, 60, 62, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, + 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31, + 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, + 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, 33, 33, 34, 34, 34, 35, 35, 35, + 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, + 45, 45, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, + 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, + 46, 47, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49, + 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 48, 47, + 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, + 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45, + 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, + 57, 58, 58, 58, 59, 59, 59, 60, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, + 61, 61, 61, 62 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 34, 38, 32, 33, 35, 39, 34, 35, 39, 45, 38, 39, 45, 54, + /* Size 8x8 */ + 31, 31, 32, 32, 33, 34, 37, 41, 31, 32, 32, 32, 33, 34, 36, 39, 32, 32, + 32, 33, 34, 35, 37, 40, 32, 32, 33, 34, 35, 36, 38, 41, 33, 33, 34, 35, + 37, 39, 41, 44, 34, 34, 35, 36, 39, 43, 46, 49, 37, 36, 37, 38, 41, 46, + 51, 54, 41, 39, 40, 41, 44, 49, 54, 58, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 
31, 32, 32, 32, + 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, + 35, 36, 36, 39, 39, 42, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, + 36, 39, 39, 42, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, + 40, 42, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 42, + 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 34, 34, + 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 36, 35, 35, 34, + 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 36, 35, 35, 34, 34, 36, + 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39, 40, + 40, 45, 45, 50, 50, 54, 54, 58, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, + 45, 50, 50, 54, 54, 58, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, + 54, 58, 58, 63, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, + 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, + 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, + 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, + 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, + 34, 34, 35, 35, 35, 36, 38, 38, 38, 39, 41, 41, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, + 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, + 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, + 34, 34, 34, 35, 35, 35, 35, 37, 38, 38, 38, 40, 41, 41, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, + 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, + 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, + 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, + 34, 35, 36, 36, 36, 36, 37, 37, 37, 38, 40, 40, 40, 41, 42, 42, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, + 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, + 38, 39, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, + 40, 41, 42, 42, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, + 36, 36, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 42, 44, 45, 45, + 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, + 39, 39, 39, 41, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34, + 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, + 42, 42, 42, 44, 45, 
45, 45, 46, 47, 47, 34, 34, 34, 34, 34, 34, 33, 33, + 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 42, 42, 42, 44, + 45, 45, 45, 46, 47, 47, 35, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36, + 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45, 45, 46, 47, 47, 47, 49, + 50, 50, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, + 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 36, 35, + 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, + 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 36, 35, 35, 35, 35, 35, + 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, + 48, 49, 50, 50, 50, 52, 54, 54, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, + 38, 38, 38, 38, 39, 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 52, 52, + 52, 54, 56, 56, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, + 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, + 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, + 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 39, 39, 38, 38, + 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, + 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 41, 41, 40, 40, 40, 39, 39, 39, + 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52, 52, 54, + 56, 56, 56, 58, 60, 60, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, + 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, + 63, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, + 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, + /* Size 4x8 */ + 31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40, 33, 34, + 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56, + /* Size 8x4 */ + 31, 32, 32, 32, 33, 34, 37, 40, 32, 32, 33, 33, 34, 36, 38, 40, 34, 34, + 34, 36, 38, 41, 44, 46, 39, 38, 38, 40, 42, 47, 52, 56, + /* Size 8x16 */ + 32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, + 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, + 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, + 36, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42, + 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 36, 34, + 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 39, 37, 37, 40, + 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 44, 41, 41, 43, 43, 53, + 53, 63, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31, 32, + 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 31, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 32, 32, 32, 33, 33, 34, + 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 32, 32, 32, 33, 33, 34, 34, 35, + 35, 37, 37, 38, 38, 40, 40, 43, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, + 42, 48, 48, 50, 50, 53, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, + 48, 50, 50, 53, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, + 58, 63, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, + 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, + 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 37, + 41, 41, 31, 32, 32, 32, 32, 
32, 33, 33, 33, 33, 34, 34, 34, 37, 41, 41, + 31, 32, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 38, 41, 41, 32, 32, + 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, + 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, + 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, + 34, 36, 37, 37, 37, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, + 38, 38, 38, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, + 38, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, + 42, 42, 33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45, + 34, 34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, + 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34, 34, + 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34, 34, 36, + 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34, 36, 38, 38, + 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43, + 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43, 48, 48, + 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39, 39, 44, 49, 49, 49, 52, + 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, + 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 39, 38, + 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 41, 40, 39, 39, + 39, 40, 42, 42, 42, 46, 52, 52, 52, 56, 60, 60, 44, 42, 41, 41, 41, 42, + 43, 43, 43, 48, 53, 53, 53, 58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43, + 43, 48, 53, 53, 53, 58, 63, 63, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, + 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, + 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, + 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, + 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, + 38, 39, 40, 40, 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, + 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, + 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, + 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, + 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 48, 48, 36, 35, 35, 35, + 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, + 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, + 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, + 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, + 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, + 53, 53, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, + 40, 42, 45, 45, 45, 47, 51, 51, 51, 52, 54, 54, 54, 56, 58, 58, 44, 43, + 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, + 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, 44, 
43, 42, 42, 42, 41, + 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, + 54, 56, 58, 58, 58, 60, 63, 63, + /* Size 4x16 */ + 31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37, 32, 32, + 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40, 32, 34, 37, 40, + 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51, 35, 36, 43, 51, 38, 39, + 45, 54, 38, 39, 45, 54, 42, 42, 48, 58, + /* Size 16x4 */ + 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 34, 34, 34, 33, + 33, 35, 35, 37, 37, 39, 39, 43, 43, 45, 45, 48, 39, 38, 38, 37, 37, 39, + 39, 40, 40, 45, 45, 51, 51, 54, 54, 58, + /* Size 8x32 */ + 32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43, 31, 32, + 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, + 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41, 31, 32, 32, 33, 33, 34, + 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, + 31, 32, 32, 33, 33, 35, 35, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, + 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, + 34, 37, 37, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, + 38, 42, 32, 33, 33, 35, 35, 38, 38, 42, 33, 33, 33, 36, 36, 40, 40, 45, + 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, + 34, 37, 37, 42, 42, 48, 35, 34, 34, 37, 37, 45, 45, 50, 36, 34, 34, 38, + 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, + 48, 54, 37, 36, 36, 39, 39, 49, 49, 56, 39, 37, 37, 40, 40, 50, 50, 58, + 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 41, 39, + 39, 42, 42, 52, 52, 60, 44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43, + 43, 53, 53, 63, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, + 37, 37, 37, 39, 41, 41, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, + 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, + 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, + 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 36, 35, + 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, + 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, + 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, + 48, 49, 50, 50, 50, 52, 53, 53, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, + 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, + 58, 60, 63, 63 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 34, 42, 47, 34, 39, 45, 46, 42, 45, 48, 49, 47, 46, 49, 54, + /* Size 8x8 */ + 31, 31, 32, 35, 39, 45, 48, 48, 31, 31, 33, 37, 41, 44, 46, 46, 32, 33, + 35, 39, 42, 45, 46, 45, 35, 37, 39, 43, 45, 47, 47, 46, 39, 41, 42, 45, + 47, 48, 48, 47, 45, 44, 45, 47, 48, 50, 51, 51, 48, 46, 46, 47, 48, 51, + 53, 54, 48, 46, 45, 46, 47, 51, 54, 56, + /* Size 16x16 */ + 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 31, 31, + 31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 31, 31, 31, 31, + 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 30, 31, 31, 32, 32, 35, + 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 30, 31, 31, 32, 32, 35, 35, 40, + 40, 42, 42, 46, 
46, 45, 45, 45, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, + 45, 47, 47, 46, 46, 45, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, + 47, 46, 46, 45, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, + 46, 45, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, 46, 45, + 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 41, 42, + 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 47, 47, 46, + 46, 47, 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 49, 47, 47, 46, 46, 47, + 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46, + 46, 49, 49, 53, 53, 54, 54, 55, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, + 49, 53, 53, 54, 54, 55, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, + 53, 55, 55, 58, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 36, 36, 36, 39, + 41, 41, 41, 45, 49, 49, 49, 49, 48, 48, 48, 49, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, + 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, + 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 33, 35, 35, 35, 37, 39, 39, 39, 41, 42, 42, + 42, 44, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 30, 31, 31, 31, 31, 31, + 32, 32, 32, 33, 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, + 46, 46, 45, 45, 45, 45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, + 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, + 45, 45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 35, 35, 35, 37, + 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 35, 37, 37, 37, 39, 41, 41, 41, 42, + 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 33, 34, 34, 34, + 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, + 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, + 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, + 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, + 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, + 45, 45, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45, + 45, 45, 46, 46, 46, 47, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 36, 37, + 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, + 47, 47, 48, 48, 48, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, + 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, + 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 46, 46, + 46, 46, 45, 45, 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, + 47, 47, 47, 47, 48, 48, 48, 48, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47, + 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, + 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42, + 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, + 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42, 42, 42, 42, 42, + 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50, 50, 50, 50, + 49, 49, 49, 49, 49, 49, 45, 45, 45, 45, 45, 44, 44, 44, 
44, 45, 46, 46, + 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, + 48, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, + 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, + 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 47, + 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, + 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, + 53, 54, 54, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, + 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, + 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, + 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 48, 48, 47, 47, + 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, + 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 49, 48, 47, 47, 47, 46, 45, 45, + 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53, 53, 54, + 55, 55, 55, 56, 57, 57, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, + 58, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, 58, + /* Size 4x8 */ + 31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46, 40, 44, + 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55, + /* Size 8x4 */ + 31, 31, 33, 36, 40, 45, 47, 47, 34, 35, 37, 41, 44, 46, 47, 46, 42, 42, + 44, 46, 48, 49, 50, 49, 48, 46, 46, 46, 48, 51, 54, 55, + /* Size 8x16 */ + 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, + 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, + 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, + 47, 46, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, + 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 49, 46, + 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, + 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 49, 45, 45, 46, 46, 53, + 53, 58, + /* Size 16x8 */ + 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 31, 31, + 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 31, 31, 31, 32, + 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 37, 38, 38, 40, 40, 43, + 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 37, 38, 38, 40, 40, 43, 43, 47, + 47, 47, 47, 48, 48, 47, 47, 46, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, + 50, 53, 53, 53, 53, 53, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, + 53, 53, 53, 53, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, + 56, 58, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31, 31, + 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31, 31, 31, + 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34, + 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34, 38, 38, + 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32, 35, 39, 39, 39, 42, + 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, + 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45, + 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45, 45, 45, + 32, 33, 34, 34, 34, 37, 41, 41, 41, 44, 46, 46, 46, 46, 45, 45, 33, 34, + 36, 36, 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 33, 34, 
36, 36, + 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, + 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45, + 45, 46, 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, + 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, + 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, + 45, 45, 39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47, + 42, 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, + 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43, 43, + 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44, 44, 46, + 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46, 47, 48, 48, + 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50, + 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50, 53, 53, + 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47, 47, 50, 53, 53, 53, 54, + 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, + 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47, + 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47, 45, 45, + 45, 46, 46, 46, 46, 49, 53, 53, 53, 55, 57, 57, 49, 47, 45, 45, 45, 45, + 46, 46, 46, 49, 53, 53, 53, 56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46, + 46, 49, 53, 53, 53, 56, 58, 58, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, + 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, + 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, + 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, + 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, + 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, + 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 33, 34, + 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, + 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 37, 37, 38, 38, 38, 39, + 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 47, 47, 47, 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, + 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, + 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, + 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 47, 48, + 48, 48, 48, 49, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49, 48, 47, 47, 47, + 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, + 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, + 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, + 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, + 46, 48, 50, 50, 50, 51, 53, 53, 53, 54, 54, 54, 54, 55, 56, 56, 49, 48, + 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, + 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58, 49, 48, 47, 47, 47, 46, + 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, + 53, 54, 56, 56, 56, 57, 58, 58, + /* Size 4x16 */ + 31, 33, 42, 48, 31, 34, 42, 47, 31, 
34, 42, 47, 31, 35, 42, 45, 31, 35, + 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46, 38, 43, 47, 46, + 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53, 48, 47, 50, 53, 47, 46, + 50, 54, 47, 46, 50, 54, 47, 45, 49, 56, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 48, 48, 47, 47, 47, 33, 34, + 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 42, 42, 42, 42, + 42, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 48, 47, 47, 45, 45, 46, + 46, 46, 46, 50, 50, 53, 53, 54, 54, 56, + /* Size 8x32 */ + 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48, 31, 31, + 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, + 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46, 30, 32, 32, 40, 40, 46, + 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, + 32, 34, 34, 41, 41, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, + 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46, 35, 38, 38, 45, + 45, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, + 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 39, 41, 41, 47, 47, 49, 49, 47, + 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, + 43, 47, 47, 50, 50, 49, 45, 44, 44, 47, 47, 51, 51, 51, 49, 46, 46, 48, + 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, + 53, 53, 48, 46, 46, 47, 47, 53, 53, 54, 48, 46, 46, 47, 47, 53, 53, 56, + 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 48, 45, + 45, 46, 46, 53, 53, 57, 49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46, + 46, 53, 53, 58, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, + 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, + 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, + 46, 46, 46, 45, 45, 45, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, + 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, + 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, + 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 48, 47, + 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, + 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, + 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, + 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, + 56, 57, 58, 58 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 32, 35, 32, 32, 33, 35, 32, 33, 35, 38, 35, 35, 38, 46, + /* Size 8x8 */ + 31, 31, 31, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 33, 34, 35, 31, 32, + 32, 32, 32, 33, 33, 34, 32, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 34, + 35, 35, 36, 38, 32, 33, 33, 34, 35, 36, 38, 40, 34, 34, 33, 35, 36, 38, + 39, 42, 35, 35, 34, 36, 38, 40, 42, 48, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, + 35, 35, 36, 36, 31, 32, 32, 32, 32, 32, 33, 33, 
33, 34, 34, 35, 35, 36, + 36, 36, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36, 37, 37, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 33, 33, 33, 33, + 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 34, 34, 34, 34, 33, 33, + 35, 35, 36, 37, 37, 39, 39, 41, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, + 36, 37, 37, 40, 41, 42, 45, 45, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, + 38, 42, 42, 45, 48, 48, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, + 42, 45, 48, 48, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 34, 35, 35, 35, 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, + 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, + 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 35, 35, 35, 35, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, + 36, 37, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, + 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 36, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, + 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 38, 38, 38, 39, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, + 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, + 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, + 37, 38, 38, 38, 38, 39, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39, 40, 40, + 40, 41, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, + 35, 36, 36, 
36, 36, 37, 38, 39, 39, 39, 40, 41, 42, 42, 42, 42, 34, 34, + 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, + 37, 38, 39, 39, 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, + 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, + 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, + 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, + 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, + 36, 36, 36, 37, 37, 37, 37, 38, 40, 41, 41, 41, 42, 44, 45, 45, 45, 45, + 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, + 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 47, 47, 47, 48, 36, 35, 35, 35, + 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, + 42, 42, 42, 42, 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34, + 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, + 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, + 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48, + 48, 49, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37, 37, 37, + 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49, 50, + /* Size 4x8 */ + 31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36, 32, 33, + 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48, + /* Size 8x4 */ + 31, 32, 32, 32, 32, 33, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34, 32, 32, + 33, 34, 35, 36, 37, 38, 35, 35, 34, 36, 38, 40, 42, 48, + /* Size 8x16 */ + 32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, + 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, + 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 34, 34, + 35, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 37, + 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 33, 33, + 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, + 37, 37, 43, 44, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, + 46, 48, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, + 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 34, 34, + 34, 35, 35, 36, 37, 37, 38, 38, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, + 35, 36, 37, 37, 38, 38, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, + 41, 43, 46, 46, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, + 48, 48, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, + 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, + 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, + 34, 34, 34, 35, 36, 
36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, + 34, 35, 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, + 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37, + 37, 37, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38, + 32, 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, + 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32, 33, + 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33, 33, 33, + 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33, 33, 35, 36, + 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, + 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39, + 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39, 41, 42, + 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37, 37, 37, 40, 43, 44, 44, 44, + 35, 35, 34, 34, 34, 34, 36, 37, 38, 38, 38, 41, 45, 47, 47, 47, 36, 35, + 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 36, 35, 35, 34, + 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, + 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38, + 39, 39, 39, 42, 46, 49, 49, 49, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, + 34, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, + 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, + 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, + 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, + 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, 39, 39, 39, + 40, 41, 42, 42, 42, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, + 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46, + 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, + 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35, + 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, + 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, + 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, + 42, 42, 44, 47, 48, 48, 48, 49, + /* Size 4x16 */ + 31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35, 32, 32, + 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36, 32, 32, 34, 37, + 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41, 34, 34, 37, 42, 
34, 34, + 37, 44, 35, 34, 38, 48, 35, 34, 38, 48, + /* Size 16x4 */ + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 32, 32, 32, 32, + 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 36, 35, 35, 35, 34, 34, + 36, 36, 37, 38, 38, 41, 42, 44, 48, 48, + /* Size 8x32 */ + 32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35, 31, 32, + 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, + 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, + 34, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34, + 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, + 32, 33, 33, 33, 35, 35, 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, + 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, + 36, 36, 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 33, 35, 35, 37, 38, + 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, + 33, 34, 35, 35, 37, 38, 32, 33, 33, 34, 36, 36, 39, 40, 33, 33, 33, 35, + 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, + 41, 42, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44, + 35, 34, 34, 36, 38, 38, 45, 47, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, + 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37, + 39, 39, 46, 49, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, + 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, + 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 35, 35, 35, 35, 35, 35, + 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, + 41, 41, 43, 45, 46, 46, 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, + 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, + 48, 48, 48, 49 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 32, 38, 46, 32, 34, 41, 46, 38, 41, 47, 47, 46, 46, 47, 52, + /* Size 8x8 */ + 31, 31, 30, 34, 36, 39, 42, 48, 31, 31, 31, 34, 37, 40, 42, 47, 30, 31, + 32, 35, 39, 41, 42, 46, 34, 34, 35, 39, 42, 44, 45, 47, 36, 37, 39, 42, + 46, 47, 47, 47, 39, 40, 41, 44, 47, 47, 48, 49, 42, 42, 42, 45, 47, 48, + 48, 50, 48, 47, 46, 47, 47, 49, 50, 53, + /* Size 16x16 */ + 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 31, 31, + 31, 31, 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31, 31, 31, + 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 47, 47, 31, 31, 31, 31, 31, 31, + 34, 35, 36, 39, 39, 41, 42, 44, 47, 47, 30, 31, 31, 31, 32, 32, 34, 35, + 37, 40, 40, 42, 42, 44, 46, 46, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40, + 40, 42, 42, 44, 46, 46, 33, 33, 34, 34, 34, 34, 37, 38, 40, 42, 42, 44, + 44, 45, 47, 47, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43, 43, 44, 45, 46, + 47, 47, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46, 47, 47, + 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 36, 38, + 38, 39, 
40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 40, 41, 41, 41, + 42, 42, 44, 44, 45, 47, 47, 48, 48, 49, 50, 50, 41, 42, 42, 42, 42, 42, + 44, 45, 46, 47, 47, 48, 48, 49, 50, 50, 44, 44, 44, 44, 44, 44, 45, 46, + 46, 47, 47, 49, 49, 50, 51, 51, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, + 48, 50, 50, 51, 53, 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, + 50, 51, 53, 53, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 34, 36, + 36, 36, 36, 38, 40, 41, 41, 41, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 35, 36, 37, 37, 37, 39, + 41, 42, 42, 42, 44, 47, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 33, 34, 34, 34, 36, 37, 38, 38, 38, 39, 41, 42, 42, 42, + 44, 46, 48, 48, 48, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, + 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, + 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, + 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 33, 34, 35, 35, 35, 36, 38, 39, 39, 39, 40, 41, 42, + 42, 42, 44, 46, 47, 47, 47, 47, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 41, 42, 42, 42, 42, 44, 46, + 46, 46, 46, 46, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, + 35, 35, 37, 39, 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, + 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39, + 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 30, 30, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39, 40, 40, 40, 41, + 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 31, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 34, 36, 37, 37, 37, 38, 40, 41, 41, 41, 42, 43, 43, 43, 43, + 44, 46, 46, 46, 46, 46, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, + 37, 38, 38, 38, 40, 41, 42, 42, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, + 47, 46, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, + 40, 42, 43, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 33, 34, + 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43, + 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 33, 34, 34, 34, 34, 34, + 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43, 43, 44, 44, 45, + 45, 45, 46, 47, 47, 47, 47, 47, 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, + 37, 38, 40, 40, 40, 40, 42, 44, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47, + 47, 47, 47, 47, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, + 42, 42, 44, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38, 38, 38, 39, 39, + 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 48, 48, 48, 47, 38, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 42, + 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 49, 49, + 49, 48, 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, + 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 49, 50, 50, 50, 49, 41, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, + 47, 48, 48, 48, 48, 48, 49, 50, 50, 50, 50, 50, 
41, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, + 48, 48, 49, 50, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, + 50, 50, 50, 50, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 46, + 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 51, 51, 51, 51, + 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 47, 47, + 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 52, 52, 52, 52, 49, 48, 48, 47, + 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, + 50, 50, 50, 50, 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, + 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, + 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, + 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53, + 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, + 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, + /* Size 4x8 */ + 31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47, 37, 39, + 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53, + /* Size 8x4 */ + 31, 31, 31, 34, 37, 39, 42, 48, 31, 31, 32, 36, 39, 41, 43, 46, 37, 38, + 40, 43, 46, 47, 47, 48, 48, 47, 46, 47, 47, 48, 50, 53, + /* Size 8x16 */ + 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, + 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46, 30, 32, 32, 35, + 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 33, 34, 35, 37, 42, 42, + 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 35, 37, 37, 40, 44, 44, 46, 47, + 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 41, 42, + 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, + 47, 47, 50, 51, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, + 52, 53, + /* Size 16x8 */ + 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 31, 31, + 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 31, 31, 31, 32, + 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 33, 34, 34, 34, 35, 35, + 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 37, 38, 38, 39, 40, 40, 42, 43, + 44, 47, 47, 47, 47, 47, 48, 48, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, + 47, 47, 47, 47, 48, 48, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, + 49, 50, 52, 52, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, + 53, 53, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31, 31, + 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31, 31, 31, + 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, + 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37, + 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37, 38, 38, + 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32, 34, 37, 39, 39, 39, 41, + 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34, 38, 39, 39, 39, 42, 44, 46, + 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, + 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 30, 31, + 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 31, 32, 33, 33, + 33, 33, 36, 39, 41, 41, 41, 43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35, + 37, 40, 42, 42, 42, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, + 43, 43, 43, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, + 43, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, + 46, 47, 47, 47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 
44, 45, 46, 47, + 47, 47, 36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47, + 37, 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, + 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, + 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41, + 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42, 42, 44, 46, + 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, + 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48, + 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48, 49, 50, + 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47, 47, 47, 49, 50, 51, 51, 51, + 47, 46, 46, 46, 46, 46, 46, 47, 48, 48, 48, 49, 51, 52, 52, 52, 49, 48, + 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, + 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, + 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47, + 47, 47, 47, 49, 52, 53, 53, 53, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, + 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 39, + 41, 42, 42, 42, 44, 46, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42, + 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, + 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, + 46, 46, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, + 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, + 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34, + 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, + 44, 44, 45, 46, 47, 47, 47, 47, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, + 38, 39, 40, 41, 41, 41, 43, 44, 45, 45, 45, 46, 46, 46, 46, 46, 47, 47, + 48, 48, 48, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, + 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, + 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 40, 41, 41, 41, 41, 41, 41, 42, + 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, + 49, 49, 50, 50, 50, 49, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, + 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52, + 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48, + 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48, 47, 47, 47, 47, + 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, + 50, 50, 51, 52, 53, 53, 53, 53, + /* Size 4x16 */ + 31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46, 31, 32, + 40, 46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47, 36, 37, 44, 47, + 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49, 42, 43, 47, 50, 44, 44, + 47, 51, 48, 46, 48, 53, 48, 46, 48, 53, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31, + 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 
43, 44, 46, 46, 37, 38, 38, 39, + 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 46, 46, 46, + 47, 47, 47, 47, 47, 49, 50, 51, 53, 53, + /* Size 8x32 */ + 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48, 31, 31, + 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, + 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, + 45, 46, 30, 31, 32, 34, 39, 39, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, + 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 31, 33, + 33, 36, 41, 41, 45, 46, 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, + 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, + 46, 47, 35, 37, 37, 40, 44, 44, 46, 47, 36, 38, 39, 42, 46, 46, 47, 47, + 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, + 40, 43, 47, 47, 47, 47, 39, 40, 41, 43, 47, 47, 48, 48, 41, 42, 42, 44, + 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47, + 49, 50, 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51, + 47, 46, 46, 46, 48, 48, 51, 52, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, + 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, + 47, 47, 52, 53, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, + 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, + 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, + 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, + 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47, + 47, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, + 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, + 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 45, 45, 45, 45, 45, 45, + 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, + 49, 49, 50, 51, 52, 52, 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, + 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, + 53, 53, 53, 53 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 33, 34, 32, 33, 34, 35, + /* Size 8x8 */ + 31, 31, 31, 31, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, + 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, + 33, 33, 34, 35, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 34, 34, + 35, 36, 33, 33, 33, 33, 35, 35, 36, 38, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 35, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 31, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 35, 35, 35, 36, 
37, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 34, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 34, 35, 36, 36, 36, 38, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, + 37, 37, 38, 39, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 34, 34, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, + 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 
32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, + 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, + 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, + 35, 35, 36, 36, 36, 36, 36, 37, 38, 38, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, + 36, 36, 37, 38, 38, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, + 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, + 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, + 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, + /* Size 4x8 */ + 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, + 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36, + /* Size 8x4 */ + 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 33, 34, 34, 35, 32, 32, 32, 33, 34, 34, 35, 36, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, + 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, + 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, + 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 34, 34, 34, + 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, + 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, + 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 34, 34, 34, 34, 35, 37, + 37, 38, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, + 34, 35, 35, 35, 36, 37, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, + 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, + 36, 38, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, + 34, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, + 32, 32, 
32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, + 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, + 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, + 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37, + 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 33, + 33, 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 36, 38, 33, 33, 33, 33, + 33, 33, 33, 34, 34, 35, 36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, + 34, 34, 35, 36, 37, 37, 37, 37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, + 35, 36, 37, 37, 37, 37, 38, 39, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, + 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, + 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, + 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, + 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, + 37, 37, 37, 37, 38, 38, 39, 39, + /* Size 4x16 */ + 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33, 32, 32, 33, 34, + 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35, 32, 33, 34, 35, 32, 33, + 34, 35, 33, 33, 35, 36, 34, 34, 36, 37, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 36, 32, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, + /* Size 8x32 */ + 32, 31, 
31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33, 31, 31, + 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, + 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, + 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, + 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, + 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 33, + 33, 34, 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, + 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, + 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, + 33, 35, 35, 35, 32, 32, 33, 33, 33, 35, 35, 36, 32, 32, 33, 33, 34, 35, + 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, + 32, 32, 33, 33, 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 33, 33, + 33, 33, 34, 36, 36, 37, 34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34, + 35, 37, 37, 38, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 35, 36, 36, 37, 37, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, + 36, 37, 38, 38 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 31, 34, 38, 31, 32, 35, 40, 34, 35, 39, 43, 38, 40, 43, 47, + /* Size 8x8 */ + 31, 31, 31, 30, 34, 35, 37, 40, 31, 31, 31, 31, 34, 35, 38, 41, 31, 31, + 31, 31, 35, 36, 39, 41, 30, 31, 31, 32, 35, 36, 40, 42, 34, 34, 35, 35, + 39, 40, 43, 44, 35, 35, 36, 36, 40, 41, 44, 45, 37, 38, 39, 40, 43, 44, + 47, 47, 40, 41, 41, 42, 44, 45, 47, 48, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, + 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, + 34, 35, 35, 37, 39, 39, 40, 42, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, + 35, 38, 40, 40, 41, 42, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, 35, 38, + 40, 40, 41, 42, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, + 41, 43, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42, 43, 44, + 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 33, 34, + 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 35, 36, 37, 37, + 37, 38, 38, 38, 41, 41, 41, 44, 46, 46, 46, 46, 36, 37, 38, 38, 39, 40, + 40, 40, 42, 43, 43, 46, 47, 47, 47, 47, 36, 37, 38, 38, 39, 40, 40, 40, + 42, 43, 43, 46, 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, + 44, 46, 47, 47, 47, 48, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 
+ 47, 47, 48, 48, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, + 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 38, 40, 41, 41, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 32, 33, 34, 34, 34, 34, 35, + 36, 37, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37, + 37, 38, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 34, 34, 34, 34, 34, 35, 36, 38, 38, 38, 38, 38, 40, 41, + 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, + 34, 34, 34, 34, 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, + 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 35, 37, 38, + 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 38, 39, + 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, + 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, + 35, 35, 35, 36, 37, 39, 39, 39, 39, 40, 40, 41, 42, 42, 30, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, + 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, + 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42, + 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, + 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, + 36, 37, 38, 40, 40, 40, 40, 41, 41, 42, 43, 43, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 38, 39, 41, + 41, 41, 41, 42, 42, 43, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 42, 42, 43, + 43, 44, 44, 44, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, + 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, + 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, + 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34, + 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, + 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34, 34, 34, 34, 34, + 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43, + 43, 43, 44, 44, 45, 45, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, + 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 44, 44, 44, 44, 44, 45, 45, + 45, 45, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, + 41, 41, 41, 41, 41, 42, 44, 45, 46, 46, 46, 46, 46, 46, 46, 46, 36, 37, + 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 41, 42, 43, 43, 43, + 43, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, + 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, + 40, 41, 42, 43, 43, 43, 43, 44, 46, 
47, 47, 47, 47, 47, 47, 47, 47, 47, + 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, + 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, + 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, + 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 40, 40, 40, 41, 41, 41, 41, 41, + 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47, 47, 47, + 47, 47, 48, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, + 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, + 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48, + /* Size 4x8 */ + 31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40, 34, 36, + 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47, + /* Size 8x4 */ + 31, 31, 31, 31, 34, 35, 38, 41, 31, 31, 32, 32, 36, 37, 40, 42, 35, 36, + 37, 37, 40, 42, 45, 45, 37, 38, 39, 40, 43, 44, 47, 47, + /* Size 8x16 */ + 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39, 31, 31, + 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32, + 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, + 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 33, 34, 35, 35, 37, 42, 42, 43, + 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 35, 37, + 38, 38, 41, 45, 45, 46, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, + 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 42, 42, 43, 43, 44, 47, + 47, 48, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 31, 31, + 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 31, 31, 31, 31, + 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 31, 31, 31, 31, 32, 32, + 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 33, 33, 34, 34, 34, 35, 35, 35, + 37, 38, 38, 41, 43, 43, 43, 44, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, + 43, 45, 47, 47, 47, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, + 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, + 47, 48, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31, 31, 31, + 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31, 31, 31, 31, + 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, + 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, + 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38, + 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, + 40, 42, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 39, 39, 39, 39, 40, 42, + 30, 31, 31, 32, 32, 32, 32, 32, 34, 37, 39, 39, 39, 39, 40, 42, 30, 31, + 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, + 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, + 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, + 35, 37, 40, 40, 40, 40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38, + 40, 40, 40, 40, 41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41, + 41, 41, 42, 44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42, + 43, 44, 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, + 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, + 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35, 35, + 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37, 37, 37, + 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 
36, 37, 38, 38, 38, 38, 39, + 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39, 39, 40, 42, 44, + 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, + 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, + 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47, + 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47, 39, 39, + 40, 41, 41, 41, 41, 42, 43, 45, 47, 47, 47, 47, 47, 48, 40, 41, 41, 42, + 42, 42, 42, 42, 44, 45, 47, 47, 47, 47, 47, 48, 42, 42, 42, 43, 43, 43, + 43, 43, 44, 46, 47, 47, 47, 47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43, + 44, 46, 47, 47, 47, 47, 48, 48, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, + 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, + 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, + 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 33, 35, 35, 35, 35, 35, 37, 38, 39, 40, 40, 40, 40, 41, 42, + 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, + 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, + 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, + 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 39, 40, 41, 41, 41, 41, + 42, 42, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, + 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, + 35, 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 39, 40, 40, + 40, 40, 40, 42, 43, 44, 45, 45, 45, 45, 45, 45, 46, 46, 37, 37, 38, 38, + 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, + 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, + 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, + 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, + 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, + 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, + 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 44, 45, 45, 45, 45, 45, 46, 47, + 47, 47, 47, 47, 48, 48, 48, 48, + /* Size 4x16 */ + 31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38, 31, 32, + 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40, 33, 35, 40, 42, + 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45, 38, 40, 45, 47, 38, 40, + 45, 47, 39, 41, 45, 47, 42, 43, 46, 47, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 38, 38, 39, 42, 31, 31, + 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 35, 35, 36, 36, + 36, 37, 37, 38, 40, 40, 40, 43, 45, 45, 45, 46, 37, 38, 38, 38, 39, 40, + 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, + /* Size 8x32 */ + 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39, 31, 31, + 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, + 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 
40, 31, 31, 31, 31, 34, 38, + 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32, 34, 39, 39, 40, + 30, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, + 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, + 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 32, 33, 34, 34, 36, 41, + 41, 42, 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44, + 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, + 36, 36, 38, 43, 43, 44, 34, 36, 37, 37, 39, 44, 44, 45, 35, 37, 38, 38, + 41, 45, 45, 46, 36, 38, 39, 39, 42, 47, 47, 47, 37, 39, 40, 40, 43, 47, + 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, + 37, 39, 40, 40, 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 40, 41, + 42, 42, 44, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43, + 44, 47, 47, 48, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, + 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, + 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, + 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, + 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, + 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, 37, 37, + 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, + 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, + 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, + 47, 47, 48, 48 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, + /* Size 8x8 */ + 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 
31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 
32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + /* Size 4x8 */ + 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, + /* Size 8x4 */ + 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, + 33, 34, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 34, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 
+ 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 34, 34, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 34, 34, 34, 34, 34, + /* Size 4x16 */ + 31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, + 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, + 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, + 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, + 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, + 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, + 32, 32, 33, 34, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 34 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 32, 35, 34, 35, 35, 39, + /* Size 8x8 */ + 31, 31, 31, 31, 30, 31, 33, 33, 31, 31, 31, 31, 31, 32, 34, 34, 31, 31, + 31, 31, 31, 32, 34, 34, 31, 31, 31, 31, 31, 32, 35, 35, 30, 31, 31, 31, + 32, 32, 35, 35, 31, 32, 32, 32, 32, 33, 36, 36, 33, 34, 34, 35, 35, 36, + 39, 39, 33, 34, 34, 35, 35, 36, 39, 39, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, + 34, 35, 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, + 35, 35, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, + 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 30, 30, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 34, 36, 37, 37, 37, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 33, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, + 35, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, + 38, 39, 39, 39, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31, + 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 
31, 30, 30, 30, 30, 30, 30, 31, 32, 32, 33, 34, + 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 34, 34, 34, 34, + 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 34, 34, 34, 34, 34, 35, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, + 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, + 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 35, + 35, 35, 35, 35, 35, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, + 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, + 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, + 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, + 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 36, 36, 36, 36, 36, + 36, 37, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, + 34, 34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 33, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, + 37, 38, 38, 38, 38, 38, 38, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, + 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, + 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, + 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 35, 35, 35, 
35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, + 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, + 39, 40, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, + 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, + /* Size 4x8 */ + 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, + 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40, + /* Size 8x4 */ + 31, 31, 31, 31, 31, 31, 34, 34, 31, 31, 31, 32, 32, 33, 36, 36, 31, 31, + 31, 32, 32, 33, 36, 36, 34, 35, 35, 36, 36, 37, 40, 40, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, + 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, + 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32, + 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 31, 32, + 33, 33, 33, 33, 36, 39, 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, + 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, + 38, 41, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 33, 35, 36, 36, 36, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, + 37, 38, 38, 38, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41, + 41, 41, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, + 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, + 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 33, 34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 33, 34, 36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 33, 34, 36, 38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, + 35, 36, 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, + 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, + 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32, 32, 33, + 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33, 33, 33, 33, + 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34, 34, 34, 34, 34, + 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, + 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, + 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, + 33, 34, 34, 35, 35, 36, 36, 36, 
36, 36, 36, 37, 38, 40, 41, 43, 33, 34, + 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, + 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, + 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36, + 36, 36, 36, 38, 39, 40, 42, 44, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, + 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, + 35, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, + 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 37, + 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38, 38, + 38, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, 35, 35, + 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, + 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42, 37, 37, 37, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, + 42, 43, 43, 43, 43, 43, 43, 44, + /* Size 4x16 */ + 31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 31, + 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 36, + 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37, 33, 35, 35, 39, 34, 36, + 36, 40, 34, 36, 36, 40, 34, 36, 36, 40, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 34, 34, 35, 35, 35, 35, + 36, 36, 36, 36, 36, 37, 39, 40, 40, 40, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35, 31, 31, + 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, + 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, + 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, + 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, + 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37, 31, 31, 31, 32, + 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, + 35, 38, 30, 31, 32, 32, 32, 32, 
35, 38, 30, 31, 32, 32, 32, 32, 35, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, + 32, 32, 32, 32, 35, 38, 31, 31, 32, 33, 33, 33, 35, 38, 31, 32, 33, 33, + 33, 33, 36, 39, 32, 33, 34, 34, 34, 34, 37, 40, 33, 34, 34, 35, 35, 35, + 37, 40, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, + 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, + 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36, + 36, 36, 39, 42, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, + 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, + 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 33, 33, 33, 33, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, + 37, 38, 38, 38, 38, 38, 38, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41, + 41, 41, 41, 42 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + /* Size 8x8 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x8 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + /* Size 8x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 
32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 
31, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + /* Size 8x8 */ + 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, + 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + /* Size 4x8 */ + 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32, + /* Size 8x4 */ + 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, + 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, + 32, 32, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, + 32, 32, 31, 31, 32, 32, 30, 31, 32, 32, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, + 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, + 30, 31, 31, 31, 
31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, + 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, + 32, 32, 32, 32, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32 }, + }, +}; + +static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = { + { + { /* Luma */ + /* Size 4x4 */ + 32, 24, 14, 11, 24, 15, 11, 9, 14, 11, 7, 7, 11, 9, 7, 5, + /* Size 8x8 */ + 32, 32, 27, 20, 15, 12, 11, 9, 32, 29, 26, 21, 16, 13, 12, 10, 27, 26, + 19, 16, 13, 11, 10, 10, 20, 21, 16, 12, 11, 9, 9, 8, 15, 16, 13, 11, 9, + 8, 7, 7, 12, 13, 11, 9, 8, 7, 6, 6, 11, 12, 10, 9, 7, 6, 6, 5, 9, 10, + 10, 8, 7, 6, 5, 5, + /* Size 16x16 */ + 32, 33, 33, 30, 28, 23, 21, 17, 16, 13, 12, 11, 11, 10, 9, 9, 33, 32, + 32, 31, 30, 25, 23, 19, 17, 14, 14, 12, 11, 11, 10, 9, 33, 32, 31, 29, + 28, 24, 23, 19, 17, 14, 14, 13, 12, 11, 10, 10, 30, 31, 29, 26, 24, 22, + 20, 18, 16, 14, 13, 13, 12, 11, 11, 10, 28, 30, 28, 24, 21, 19, 18, 16, + 15, 13, 13, 12, 11, 11, 10, 10, 23, 25, 24, 22, 19, 16, 15, 14, 13, 11, + 11, 11, 10, 10, 9, 9, 21, 23, 23, 20, 18, 15, 14, 13, 12, 11, 10, 10, 9, + 9, 9, 9, 17, 19, 19, 18, 16, 14, 13, 11, 10, 9, 9, 9, 9, 8, 8, 8, 16, + 17, 17, 16, 15, 13, 12, 10, 10, 9, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, + 11, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 12, 14, 14, 13, 13, 11, 10, 9, 8, 7, + 7, 7, 7, 7, 6, 6, 11, 12, 13, 13, 12, 11, 10, 9, 8, 7, 7, 6, 6, 6, 6, 6, + 11, 11, 12, 12, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 10, 11, 11, 11, + 11, 10, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 8, 7, + 6, 6, 5, 5, 5, 5, 9, 9, 10, 10, 10, 9, 9, 8, 7, 7, 6, 6, 5, 5, 5, 4, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 32, 30, 29, 28, 26, 23, 22, 21, 19, 17, 17, 16, 14, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 33, 32, 32, 32, 32, + 32, 30, 30, 29, 27, 24, 23, 22, 20, 18, 17, 17, 15, 13, 13, 13, 12, 12, + 12, 11, 11, 10, 10, 10, 9, 9, 9, 33, 32, 32, 32, 32, 32, 31, 30, 30, 28, + 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12, 12, 11, 11, 11, 10, + 10, 9, 9, 9, 33, 32, 32, 32, 31, 31, 30, 29, 29, 27, 25, 24, 23, 21, 19, + 18, 17, 16, 14, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 33, + 32, 32, 31, 31, 30, 29, 28, 28, 26, 24, 23, 23, 20, 19, 18, 17, 16, 14, + 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 32, 32, 32, 31, 30, + 29, 28, 28, 27, 26, 24, 23, 22, 21, 19, 19, 18, 16, 15, 15, 14, 13, 13, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 30, 30, 31, 30, 29, 28, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, + 11, 11, 
10, 10, 9, 29, 30, 30, 29, 28, 28, 25, 24, 23, 22, 20, 20, 19, + 18, 17, 16, 16, 15, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, + 10, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15, + 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 26, 27, 28, + 27, 26, 26, 23, 22, 20, 19, 18, 17, 17, 16, 15, 14, 14, 13, 12, 12, 12, + 11, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 23, 24, 25, 25, 24, 24, 22, + 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 11, 11, 11, 11, 11, 11, 10, + 10, 10, 10, 9, 9, 9, 9, 22, 23, 24, 24, 23, 23, 21, 20, 19, 17, 16, 15, + 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, + 8, 21, 22, 23, 23, 23, 22, 20, 19, 18, 17, 15, 15, 14, 13, 13, 12, 12, + 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 19, 20, 21, 21, 20, + 21, 19, 18, 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, + 9, 9, 9, 9, 9, 8, 8, 8, 17, 18, 19, 19, 19, 19, 18, 17, 16, 15, 14, 13, + 13, 12, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 17, + 17, 18, 18, 18, 19, 17, 16, 16, 14, 13, 13, 12, 12, 11, 10, 10, 10, 9, + 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 16, 17, 17, 17, 17, 18, 16, 16, + 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 7, 7, 7, 14, 15, 16, 16, 16, 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10, + 9, 9, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, + 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 13, 13, 12, 11, 11, 11, 10, + 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 13, 14, 14, + 14, 14, 13, 13, 13, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 6, 6, 6, 6, 6, 12, 12, 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, + 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, + 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, + 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 11, 11, 11, 12, + 12, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, + 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 12, 12, 12, 11, 11, 10, 10, 10, 9, + 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 10, 11, 11, + 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, + 5, 5, 5, 5, 5, 5, 10, 10, 10, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 9, + 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 9, 10, 10, 10, 10, + 10, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, + 5, 5, 5, 5, 5, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, + 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 4, 9, 9, 9, 10, 10, 10, 10, + 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, + 4, 4, 8, 9, 9, 9, 9, 9, 9, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, + 6, 6, 5, 5, 5, 5, 5, 5, 4, 4, 4, + /* Size 4x8 */ + 32, 24, 14, 11, 31, 24, 15, 12, 28, 18, 12, 11, 21, 14, 10, 9, 16, 12, + 8, 8, 13, 11, 7, 7, 11, 10, 7, 6, 10, 9, 7, 5, + /* Size 8x4 */ + 32, 31, 28, 21, 16, 13, 11, 10, 24, 24, 18, 14, 12, 11, 10, 9, 14, 15, + 12, 10, 8, 7, 7, 7, 11, 12, 11, 9, 8, 7, 6, 5, + /* Size 8x16 */ + 32, 32, 28, 19, 16, 12, 11, 10, 33, 31, 30, 21, 17, 13, 12, 11, 32, 30, + 28, 20, 17, 13, 12, 12, 30, 28, 24, 19, 16, 13, 13, 12, 28, 27, 21, 17, + 15, 12, 12, 11, 23, 24, 19, 14, 13, 11, 11, 11, 21, 22, 18, 13, 12, 10, + 10, 10, 18, 19, 16, 12, 10, 9, 9, 9, 16, 18, 15, 11, 10, 8, 8, 8, 13, + 15, 13, 10, 9, 7, 8, 8, 12, 14, 13, 10, 8, 7, 7, 7, 11, 13, 
12, 10, 8, + 7, 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 6, 9, 10, + 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, + /* Size 16x8 */ + 32, 33, 32, 30, 28, 23, 21, 18, 16, 13, 12, 11, 11, 10, 9, 9, 32, 31, + 30, 28, 27, 24, 22, 19, 18, 15, 14, 13, 12, 11, 10, 10, 28, 30, 28, 24, + 21, 19, 18, 16, 15, 13, 13, 12, 11, 10, 10, 10, 19, 21, 20, 19, 17, 14, + 13, 12, 11, 10, 10, 10, 10, 9, 9, 9, 16, 17, 17, 16, 15, 13, 12, 10, 10, + 9, 8, 8, 8, 8, 7, 8, 12, 13, 13, 13, 12, 11, 10, 9, 8, 7, 7, 7, 7, 7, 6, + 7, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 12, + 12, 11, 11, 10, 9, 8, 8, 7, 6, 6, 6, 5, 5, + /* Size 16x32 */ + 32, 33, 32, 30, 28, 23, 19, 17, 16, 13, 12, 11, 11, 11, 10, 10, 33, 32, + 32, 30, 29, 24, 20, 18, 17, 14, 12, 12, 12, 11, 11, 11, 33, 32, 31, 31, + 30, 25, 21, 19, 17, 14, 13, 12, 12, 11, 11, 11, 33, 32, 31, 30, 29, 25, + 21, 19, 17, 14, 13, 13, 12, 12, 11, 11, 32, 32, 30, 29, 28, 24, 20, 19, + 17, 14, 13, 13, 12, 12, 12, 11, 32, 31, 29, 28, 27, 24, 21, 19, 18, 15, + 14, 13, 12, 12, 12, 11, 30, 30, 28, 26, 24, 21, 19, 18, 16, 14, 13, 13, + 13, 12, 12, 11, 29, 30, 28, 25, 23, 20, 18, 17, 16, 13, 12, 12, 12, 12, + 12, 11, 28, 30, 27, 24, 21, 19, 17, 16, 15, 13, 12, 12, 12, 12, 11, 11, + 26, 28, 26, 23, 20, 18, 16, 15, 14, 12, 12, 12, 11, 11, 11, 11, 23, 25, + 24, 21, 19, 16, 14, 14, 13, 11, 11, 11, 11, 11, 11, 11, 22, 24, 23, 21, + 19, 16, 14, 13, 12, 11, 10, 10, 10, 10, 10, 10, 21, 23, 22, 20, 18, 15, + 13, 13, 12, 11, 10, 10, 10, 10, 10, 10, 19, 21, 20, 19, 17, 14, 12, 12, + 11, 10, 9, 10, 10, 9, 10, 9, 18, 19, 19, 18, 16, 14, 12, 11, 10, 9, 9, + 9, 9, 9, 9, 9, 17, 18, 18, 17, 16, 13, 12, 11, 10, 9, 9, 9, 9, 9, 9, 9, + 16, 17, 18, 16, 15, 13, 11, 10, 10, 9, 8, 8, 8, 8, 8, 8, 14, 16, 16, 15, + 14, 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 8, 13, 14, 15, 14, 13, 11, 10, 9, + 9, 8, 7, 8, 8, 8, 8, 8, 13, 14, 14, 14, 13, 11, 10, 9, 9, 8, 7, 7, 7, 7, + 7, 7, 12, 14, 14, 13, 13, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13, + 13, 12, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 10, 10, 9, + 8, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12, 12, 11, 10, 10, 9, 8, 7, 7, 6, 6, 6, + 6, 6, 11, 12, 12, 12, 11, 10, 10, 8, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 12, + 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 10, 10, 9, 9, + 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 5, + 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 9, 10, 10, 10, + 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 9, 9, 10, 10, 10, 9, 9, 8, 8, 7, 7, + 6, 6, 5, 5, 5, 8, 9, 9, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5, + /* Size 32x16 */ + 32, 33, 33, 33, 32, 32, 30, 29, 28, 26, 23, 22, 21, 19, 18, 17, 16, 14, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 33, 32, 32, 32, 32, + 31, 30, 30, 30, 28, 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12, + 12, 12, 11, 11, 11, 10, 10, 9, 9, 32, 32, 31, 31, 30, 29, 28, 28, 27, + 26, 24, 23, 22, 20, 19, 18, 18, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, + 11, 10, 10, 10, 9, 30, 30, 31, 30, 29, 28, 26, 25, 24, 23, 21, 21, 20, + 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, + 10, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15, + 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 23, 24, 25, + 25, 24, 24, 21, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 11, 11, 11, + 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 19, 20, 21, 21, 20, 21, 19, 18, + 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 10, 10, 10, 9, 9, + 9, 9, 9, 9, 9, 17, 18, 19, 19, 19, 19, 18, 17, 16, 15, 14, 13, 13, 
12, + 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 8, 8, 9, 9, 8, 8, 8, 8, 16, 17, 17, + 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, + 8, 8, 8, 8, 8, 8, 7, 7, 8, 8, 13, 14, 14, 14, 14, 15, 14, 13, 13, 12, + 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 12, 12, 13, 13, 13, 14, 13, 12, 12, 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, 11, 12, 12, 13, 13, 13, 13, 12, 12, + 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 11, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, + 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12, 12, 12, 12, + 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, + 5, 5, 5, 10, 11, 11, 11, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, + 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, + 5, 5, 5, 5, + /* Size 4x16 */ + 33, 23, 13, 11, 32, 25, 14, 11, 32, 24, 14, 12, 30, 21, 14, 12, 30, 19, + 13, 12, 25, 16, 11, 11, 23, 15, 11, 10, 19, 14, 9, 9, 17, 13, 9, 8, 14, + 11, 8, 8, 14, 11, 8, 7, 12, 10, 7, 6, 12, 10, 7, 6, 11, 10, 7, 6, 10, 9, + 7, 5, 9, 9, 7, 5, + /* Size 16x4 */ + 33, 32, 32, 30, 30, 25, 23, 19, 17, 14, 14, 12, 12, 11, 10, 9, 23, 25, + 24, 21, 19, 16, 15, 14, 13, 11, 11, 10, 10, 10, 9, 9, 13, 14, 14, 14, + 13, 11, 11, 9, 9, 8, 8, 7, 7, 7, 7, 7, 11, 11, 12, 12, 12, 11, 10, 9, 8, + 8, 7, 6, 6, 6, 5, 5, + /* Size 8x32 */ + 32, 32, 28, 19, 16, 12, 11, 10, 33, 32, 29, 20, 17, 12, 12, 11, 33, 31, + 30, 21, 17, 13, 12, 11, 33, 31, 29, 21, 17, 13, 12, 11, 32, 30, 28, 20, + 17, 13, 12, 12, 32, 29, 27, 21, 18, 14, 12, 12, 30, 28, 24, 19, 16, 13, + 13, 12, 29, 28, 23, 18, 16, 12, 12, 12, 28, 27, 21, 17, 15, 12, 12, 11, + 26, 26, 20, 16, 14, 12, 11, 11, 23, 24, 19, 14, 13, 11, 11, 11, 22, 23, + 19, 14, 12, 10, 10, 10, 21, 22, 18, 13, 12, 10, 10, 10, 19, 20, 17, 12, + 11, 9, 10, 10, 18, 19, 16, 12, 10, 9, 9, 9, 17, 18, 16, 12, 10, 9, 9, 9, + 16, 18, 15, 11, 10, 8, 8, 8, 14, 16, 14, 11, 9, 8, 8, 8, 13, 15, 13, 10, + 9, 7, 8, 8, 13, 14, 13, 10, 9, 7, 7, 7, 12, 14, 13, 10, 8, 7, 7, 7, 12, + 13, 12, 9, 8, 7, 7, 7, 11, 13, 12, 10, 8, 7, 6, 6, 11, 12, 11, 10, 8, 7, + 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11, 10, + 9, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 5, 9, 10, 10, 9, 7, 6, 6, 5, 9, + 10, 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, 8, 9, 10, 9, 8, 7, 6, + 5, + /* Size 32x8 */ + 32, 33, 33, 33, 32, 32, 30, 29, 28, 26, 23, 22, 21, 19, 18, 17, 16, 14, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 32, 32, 31, 31, 30, + 29, 28, 28, 27, 26, 24, 23, 22, 20, 19, 18, 18, 16, 15, 14, 14, 13, 13, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 28, 29, 30, 29, 28, 27, 24, 23, 21, + 20, 19, 19, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, + 10, 10, 10, 10, 10, 19, 20, 21, 21, 20, 21, 19, 18, 17, 16, 14, 14, 13, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 16, + 17, 17, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, + 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 8, 8, 12, 12, 13, 13, 13, 14, 13, 12, 12, + 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, + 11, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, + 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 12, 12, 12, 12, + 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, + 5, 5 }, + { /* Chroma */ + /* Size 4x4 */ + 29, 22, 18, 16, 22, 17, 15, 14, 18, 15, 11, 11, 
16, 14, 11, 9, + /* Size 8x8 */ + 33, 27, 22, 20, 18, 16, 15, 14, 27, 22, 22, 22, 20, 18, 17, 15, 22, 22, + 19, 18, 17, 16, 15, 15, 20, 22, 18, 16, 14, 13, 14, 14, 18, 20, 17, 14, + 12, 12, 12, 12, 16, 18, 16, 13, 12, 11, 11, 11, 15, 17, 15, 14, 12, 11, + 10, 10, 14, 15, 15, 14, 12, 11, 10, 9, + /* Size 16x16 */ + 32, 34, 31, 25, 21, 21, 20, 19, 18, 16, 16, 15, 15, 14, 14, 13, 34, 32, + 29, 24, 22, 23, 22, 21, 20, 18, 18, 17, 16, 15, 15, 14, 31, 29, 26, 23, + 22, 23, 22, 21, 20, 18, 18, 17, 17, 16, 16, 15, 25, 24, 23, 21, 20, 21, + 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 21, 22, 22, 20, 19, 19, 19, 19, + 18, 17, 17, 16, 16, 16, 16, 16, 21, 23, 23, 21, 19, 18, 17, 17, 16, 15, + 15, 15, 15, 15, 15, 15, 20, 22, 22, 20, 19, 17, 17, 16, 15, 14, 14, 14, + 14, 14, 14, 14, 19, 21, 21, 20, 19, 17, 16, 14, 14, 13, 13, 13, 13, 13, + 13, 13, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, + 16, 18, 18, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 16, 18, + 18, 18, 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, + 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 10, 10, 15, 16, 17, 17, 16, 15, + 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 15, 16, 17, 16, 15, 14, 13, + 12, 12, 11, 10, 10, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 12, 12, + 11, 10, 10, 9, 9, 9, 13, 14, 15, 15, 16, 15, 14, 13, 12, 12, 11, 10, 10, + 9, 9, 9, + /* Size 32x32 */ + 32, 33, 34, 32, 31, 28, 25, 23, 21, 21, 21, 20, 20, 20, 19, 18, 18, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 33, 33, 33, 31, + 30, 27, 24, 23, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 34, 33, 32, 31, 29, 26, 24, 23, + 22, 23, 23, 23, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, + 15, 15, 15, 14, 14, 14, 32, 31, 31, 29, 28, 25, 24, 23, 22, 22, 23, 22, + 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, + 15, 15, 31, 30, 29, 28, 26, 24, 23, 22, 22, 22, 23, 22, 22, 22, 21, 20, + 20, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 28, 27, + 26, 25, 24, 22, 22, 22, 21, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, + 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 25, 24, 24, 24, 23, 22, + 21, 21, 20, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, + 17, 17, 17, 16, 16, 16, 15, 15, 23, 23, 23, 23, 22, 22, 21, 20, 20, 20, + 20, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, + 16, 16, 16, 16, 21, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 21, 22, 23, 22, 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, + 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 21, 22, 23, 23, + 23, 23, 21, 20, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 22, 23, 22, 22, 22, 21, 20, + 19, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15, + 14, 14, 14, 14, 14, 14, 20, 21, 22, 22, 22, 22, 20, 20, 19, 18, 17, 17, + 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 20, 20, 22, 22, 22, 22, 20, 20, 19, 18, 17, 17, 16, 16, 15, 15, + 15, 14, 14, 14, 14, 13, 14, 14, 13, 14, 14, 13, 14, 14, 13, 13, 19, 20, + 21, 21, 21, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 20, 20, 20, 21, + 20, 19, 18, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 13, 12, 13, + 13, 13, 13, 13, 13, 13, 13, 12, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, + 16, 16, 15, 15, 14, 14, 13, 
13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 17, 18, 19, 19, 19, 20, 19, 18, 18, 17, 16, 15, 15, 14, + 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 12, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, + 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 18, 18, 18, 19, 18, 17, + 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, + 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 11, 11, + 11, 11, 15, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 12, + 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 15, 16, + 16, 17, 17, 17, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, + 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17, + 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 16, 16, 16, 17, 17, 16, 16, 15, + 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 16, 16, 15, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 10, 14, + 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, + 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16, + 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, + 10, 10, 9, 9, 9, 9, 9, 9, 14, 14, 14, 15, 15, 15, 16, 16, 16, 15, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, + 9, 9, 13, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 13, 14, 14, + 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, + 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + /* Size 4x8 */ + 33, 22, 17, 16, 26, 23, 19, 17, 22, 18, 16, 16, 21, 17, 14, 14, 19, 16, + 12, 12, 17, 15, 11, 11, 16, 15, 11, 10, 15, 14, 12, 10, + /* Size 8x4 */ + 33, 26, 22, 21, 19, 17, 16, 15, 22, 23, 18, 17, 16, 15, 15, 14, 17, 19, + 16, 14, 12, 11, 11, 12, 16, 17, 16, 14, 12, 11, 10, 10, + /* Size 8x16 */ + 32, 28, 21, 20, 18, 16, 15, 14, 34, 26, 22, 21, 20, 17, 16, 16, 31, 24, + 22, 22, 20, 17, 17, 16, 24, 22, 20, 20, 19, 17, 17, 17, 21, 21, 19, 19, + 18, 17, 17, 17, 21, 22, 19, 17, 16, 15, 16, 16, 20, 22, 19, 16, 15, 14, + 14, 15, 19, 21, 19, 15, 14, 13, 13, 14, 18, 20, 18, 15, 13, 12, 13, 13, + 16, 19, 17, 14, 12, 11, 12, 12, 16, 18, 17, 14, 12, 11, 11, 12, 15, 17, + 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14, + 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 10, 13, 15, 15, 14, 12, 11, + 10, 9, + /* Size 16x8 */ + 32, 34, 31, 24, 21, 21, 20, 19, 18, 16, 16, 15, 15, 14, 14, 13, 28, 26, + 24, 22, 21, 22, 22, 21, 20, 19, 18, 17, 17, 16, 15, 15, 21, 22, 22, 20, + 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 16, 15, 20, 21, 22, 20, 19, 17, + 16, 15, 15, 14, 14, 14, 14, 14, 14, 14, 18, 20, 20, 19, 18, 16, 15, 14, + 13, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 17, 17, 15, 14, 13, 12, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 14, 13, 13, 12, 11, 10, + 10, 10, 10, 10, 14, 16, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, + 10, 9, + /* Size 16x32 */ + 32, 33, 28, 24, 21, 21, 20, 19, 18, 16, 16, 15, 15, 15, 14, 14, 33, 33, + 27, 24, 22, 22, 20, 20, 19, 17, 16, 16, 16, 16, 15, 15, 34, 32, 26, 
24, + 22, 23, 21, 20, 20, 18, 17, 17, 16, 16, 16, 15, 32, 30, 25, 23, 22, 23, + 21, 21, 20, 18, 17, 17, 17, 16, 16, 16, 31, 28, 24, 23, 22, 22, 22, 21, + 20, 18, 17, 17, 17, 17, 16, 16, 28, 26, 22, 22, 22, 23, 22, 21, 20, 19, + 18, 18, 17, 17, 17, 16, 24, 24, 22, 21, 20, 21, 20, 20, 19, 18, 17, 18, + 17, 17, 17, 16, 23, 23, 22, 21, 20, 20, 20, 19, 19, 17, 17, 17, 17, 17, + 17, 17, 21, 22, 21, 20, 19, 19, 19, 19, 18, 17, 17, 16, 17, 16, 17, 17, + 21, 22, 22, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 16, 21, 23, + 22, 21, 19, 18, 17, 17, 16, 15, 15, 15, 16, 16, 16, 16, 21, 22, 22, 21, + 19, 17, 17, 16, 16, 15, 14, 15, 15, 15, 15, 15, 20, 22, 22, 20, 19, 17, + 16, 16, 15, 14, 14, 14, 14, 15, 15, 15, 20, 21, 22, 20, 19, 17, 16, 15, + 14, 14, 13, 14, 14, 14, 14, 14, 19, 20, 21, 20, 19, 17, 15, 14, 14, 13, + 13, 13, 13, 14, 14, 14, 19, 20, 21, 20, 18, 16, 15, 14, 14, 13, 12, 13, + 13, 13, 13, 13, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 13, 13, + 13, 13, 17, 19, 20, 19, 18, 16, 14, 14, 13, 12, 12, 12, 12, 12, 13, 13, + 16, 18, 19, 18, 17, 15, 14, 13, 12, 12, 11, 12, 12, 12, 12, 13, 16, 18, + 19, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 16, 17, 18, 18, + 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 12, 12, 15, 17, 18, 17, 16, 15, + 13, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, 16, 14, 14, 13, + 12, 11, 11, 11, 10, 11, 11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 12, 12, + 11, 10, 10, 10, 11, 11, 15, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, + 10, 10, 10, 11, 14, 16, 16, 17, 15, 15, 14, 13, 12, 11, 11, 10, 10, 10, + 10, 10, 14, 16, 16, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, + 14, 16, 16, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 15, + 15, 16, 16, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 10, 14, 15, 15, 16, + 16, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 13, 15, 15, 16, 15, 14, + 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 13, 15, 15, 15, 15, 14, 14, 13, + 13, 11, 11, 10, 10, 9, 9, 9, + /* Size 32x16 */ + 32, 33, 34, 32, 31, 28, 24, 23, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 33, 33, 32, 30, + 28, 26, 24, 23, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17, + 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 28, 27, 26, 25, 24, 22, 22, 22, + 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 16, 15, 15, 15, 15, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, + 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, + 16, 15, 21, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 16, 16, 16, 16, 15, 15, 21, 22, + 23, 23, 22, 23, 21, 20, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, + 15, 15, 14, 15, 15, 15, 15, 15, 15, 14, 14, 14, 20, 20, 21, 21, 22, 22, + 20, 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 19, 20, 20, 21, 21, 21, 20, 19, 19, 17, + 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 14, + 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, + 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 12, 11, 11, 11, 12, 12, 11, 12, 12, 12, 12, 12, 11, 16, 16, 17, 17, + 17, 18, 17, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17, + 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 11, 10, + 10, 10, 11, 11, 11, 10, 
15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, + 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, + 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 9, 14, 15, + 16, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, + 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 14, 15, 15, 16, 16, 16, 16, + 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, + 10, 10, 10, 10, 9, 9, 9, + /* Size 4x16 */ + 33, 21, 16, 15, 32, 23, 18, 16, 28, 22, 18, 17, 24, 21, 18, 17, 22, 19, + 17, 16, 23, 18, 15, 16, 22, 17, 14, 15, 20, 17, 13, 14, 20, 16, 12, 13, + 18, 15, 12, 12, 17, 15, 11, 11, 17, 14, 11, 11, 16, 15, 12, 10, 16, 15, + 12, 10, 15, 15, 12, 10, 15, 14, 12, 10, + /* Size 16x4 */ + 33, 32, 28, 24, 22, 23, 22, 20, 20, 18, 17, 17, 16, 16, 15, 15, 21, 23, + 22, 21, 19, 18, 17, 17, 16, 15, 15, 14, 15, 15, 15, 14, 16, 18, 18, 18, + 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 15, 16, 17, 17, 16, 16, + 15, 14, 13, 12, 11, 11, 10, 10, 10, 10, + /* Size 8x32 */ + 32, 28, 21, 20, 18, 16, 15, 14, 33, 27, 22, 20, 19, 16, 16, 15, 34, 26, + 22, 21, 20, 17, 16, 16, 32, 25, 22, 21, 20, 17, 17, 16, 31, 24, 22, 22, + 20, 17, 17, 16, 28, 22, 22, 22, 20, 18, 17, 17, 24, 22, 20, 20, 19, 17, + 17, 17, 23, 22, 20, 20, 19, 17, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17, + 21, 22, 19, 18, 17, 16, 16, 16, 21, 22, 19, 17, 16, 15, 16, 16, 21, 22, + 19, 17, 16, 14, 15, 15, 20, 22, 19, 16, 15, 14, 14, 15, 20, 22, 19, 16, + 14, 13, 14, 14, 19, 21, 19, 15, 14, 13, 13, 14, 19, 21, 18, 15, 14, 12, + 13, 13, 18, 20, 18, 15, 13, 12, 13, 13, 17, 20, 18, 14, 13, 12, 12, 13, + 16, 19, 17, 14, 12, 11, 12, 12, 16, 19, 17, 14, 12, 11, 12, 12, 16, 18, + 17, 14, 12, 11, 11, 12, 15, 18, 16, 13, 12, 11, 11, 11, 15, 17, 16, 14, + 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, + 10, 10, 14, 16, 15, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, + 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 10, 14, 15, + 16, 14, 12, 11, 10, 9, 13, 15, 15, 14, 12, 11, 10, 9, 13, 15, 15, 14, + 13, 11, 10, 9, + /* Size 32x8 */ + 32, 33, 34, 32, 31, 28, 24, 23, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 28, 27, 26, 25, + 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, + 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 21, 22, 22, 22, 22, 22, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, + 16, 16, 16, 16, 15, 15, 20, 20, 21, 21, 22, 22, 20, 20, 19, 18, 17, 17, + 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 16, 16, + 17, 17, 17, 18, 17, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, + 17, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 16, 16, 16, 17, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, + 10, 9, 9, 9 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 25, 15, 11, 25, 16, 12, 10, 15, 12, 8, 7, 11, 10, 7, 6, + /* Size 8x8 */ + 32, 32, 28, 22, 17, 13, 11, 10, 32, 29, 26, 22, 18, 14, 12, 11, 28, 26, + 20, 17, 14, 12, 11, 10, 22, 22, 17, 14, 12, 10, 10, 9, 17, 18, 14, 12, + 10, 8, 8, 8, 13, 14, 12, 10, 8, 7, 7, 7, 11, 12, 11, 10, 8, 7, 
6, 6, 10, + 11, 10, 9, 8, 7, 6, 5, + /* Size 16x16 */ + 32, 33, 33, 32, 28, 26, 22, 19, 17, 14, 13, 12, 11, 10, 10, 9, 33, 32, + 32, 31, 30, 28, 23, 20, 18, 16, 14, 13, 12, 11, 10, 10, 33, 32, 31, 30, + 28, 26, 23, 20, 18, 16, 14, 13, 12, 12, 11, 10, 32, 31, 30, 28, 26, 24, + 22, 20, 18, 16, 14, 13, 13, 12, 11, 10, 28, 30, 28, 26, 21, 20, 18, 17, + 16, 14, 13, 12, 12, 11, 11, 10, 26, 28, 26, 24, 20, 19, 17, 16, 15, 13, + 12, 12, 11, 11, 10, 10, 22, 23, 23, 22, 18, 17, 15, 14, 13, 12, 11, 10, + 10, 10, 9, 9, 19, 20, 20, 20, 17, 16, 14, 12, 12, 11, 10, 9, 9, 9, 9, 8, + 17, 18, 18, 18, 16, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 14, 16, 16, + 16, 14, 13, 12, 11, 10, 9, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11, + 10, 9, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 12, 10, 9, 9, 8, 7, 7, + 7, 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10, + 11, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 5, 10, 10, 11, 11, 11, + 10, 9, 9, 8, 8, 7, 6, 6, 6, 5, 5, 9, 10, 10, 10, 10, 10, 9, 8, 8, 7, 7, + 6, 6, 5, 5, 5, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 17, 17, 16, + 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 33, 32, 32, 32, 32, + 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 18, 18, 17, 15, 14, 13, 13, 12, + 12, 12, 11, 11, 11, 10, 10, 10, 9, 33, 32, 32, 32, 32, 32, 31, 31, 30, + 28, 28, 25, 23, 22, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11, + 11, 10, 10, 10, 9, 33, 32, 32, 32, 32, 31, 31, 30, 29, 28, 27, 25, 23, + 23, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, + 10, 33, 32, 32, 32, 31, 30, 30, 29, 28, 27, 26, 24, 23, 22, 20, 19, 18, + 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 32, 32, 32, + 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 21, 19, 19, 18, 16, 16, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 32, 31, 31, 31, 30, 28, 28, + 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 11, 11, 10, 10, 30, 30, 31, 30, 29, 28, 27, 26, 24, 23, 23, + 22, 20, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, + 11, 11, 10, 28, 29, 30, 29, 28, 27, 26, 24, 21, 20, 20, 19, 18, 18, 17, + 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 27, + 28, 28, 28, 27, 26, 25, 23, 20, 20, 20, 18, 18, 17, 16, 15, 15, 14, 13, + 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 26, 27, 28, 27, 26, + 26, 24, 23, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, + 11, 11, 11, 11, 10, 10, 10, 10, 10, 23, 24, 25, 25, 24, 24, 23, 22, 19, + 18, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 10, 10, + 10, 10, 10, 9, 9, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, + 21, 22, 22, 23, 22, 22, 21, 20, 18, 17, 17, 15, 14, 14, 13, 13, 12, 12, + 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 19, 20, 20, 21, 20, + 21, 20, 19, 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, + 9, 9, 9, 9, 9, 9, 8, 8, 8, 17, 18, 19, 19, 19, 19, 19, 18, 16, 15, 15, + 14, 13, 13, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, + 17, 18, 18, 18, 18, 19, 18, 17, 16, 15, 15, 13, 13, 12, 12, 11, 11, 10, + 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 17, + 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 7, 14, 15, 16, 16, 16, 16, 16, 15, 14, 13, 13, 12, 12, 11, + 11, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 13, 14, 15, 15, + 15, 16, 15, 14, 13, 13, 13, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 
+ 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 14, 13, 12, 12, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13, + 13, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 12, 13, 13, 13, 14, 13, 13, 12, 12, + 12, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 7, 6, 6, 6, 6, 6, + 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, + 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 13, 13, 12, + 12, 11, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, + 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, + 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 10, 11, 11, 12, 12, 12, 12, + 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, + 5, 5, 5, 10, 11, 11, 11, 11, 11, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, + 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 10, 10, 10, 11, 11, 11, 11, + 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, + 5, 5, 5, 9, 10, 10, 10, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, + 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 9, 10, 10, 10, 10, 10, 10, + 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, + 5, 5, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, + 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, + /* Size 4x8 */ + 32, 24, 15, 12, 31, 24, 16, 12, 28, 18, 13, 12, 22, 15, 11, 10, 17, 13, + 9, 8, 14, 11, 8, 7, 12, 11, 8, 6, 10, 10, 8, 6, + /* Size 8x4 */ + 32, 31, 28, 22, 17, 14, 12, 10, 24, 24, 18, 15, 13, 11, 11, 10, 15, 16, + 13, 11, 9, 8, 8, 8, 12, 12, 12, 10, 8, 7, 6, 6, + /* Size 8x16 */ + 32, 32, 28, 22, 16, 13, 11, 11, 33, 32, 29, 23, 17, 14, 12, 11, 32, 30, + 28, 23, 17, 14, 13, 12, 32, 29, 26, 22, 17, 14, 13, 12, 28, 28, 21, 18, + 15, 13, 12, 12, 26, 26, 20, 17, 14, 12, 11, 11, 22, 23, 18, 15, 12, 11, + 10, 10, 19, 20, 17, 14, 11, 10, 9, 9, 17, 18, 16, 13, 10, 9, 9, 9, 14, + 16, 14, 12, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 13, 12, 10, 8, + 7, 7, 7, 11, 12, 12, 10, 8, 7, 7, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11, + 11, 9, 8, 7, 6, 6, 9, 10, 10, 9, 8, 7, 6, 5, + /* Size 16x8 */ + 32, 33, 32, 32, 28, 26, 22, 19, 17, 14, 13, 12, 11, 10, 10, 9, 32, 32, + 30, 29, 28, 26, 23, 20, 18, 16, 15, 13, 12, 12, 11, 10, 28, 29, 28, 26, + 21, 20, 18, 17, 16, 14, 13, 12, 12, 11, 11, 10, 22, 23, 23, 22, 18, 17, + 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 16, 17, 17, 17, 15, 14, 12, 11, 10, + 9, 9, 8, 8, 8, 8, 8, 13, 14, 14, 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, + 7, 7, 11, 12, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 6, 6, 6, 11, 11, 12, + 12, 12, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, + /* Size 16x32 */ + 32, 33, 32, 32, 28, 23, 22, 19, 16, 14, 13, 12, 11, 11, 11, 10, 33, 32, + 32, 31, 29, 24, 23, 20, 17, 15, 14, 12, 12, 12, 11, 11, 33, 32, 32, 31, + 29, 25, 23, 21, 17, 15, 14, 13, 12, 12, 11, 11, 33, 32, 31, 31, 29, 25, + 23, 21, 17, 16, 14, 13, 12, 12, 12, 11, 32, 32, 30, 30, 28, 24, 23, 20, + 17, 16, 14, 13, 13, 12, 12, 11, 32, 31, 29, 28, 27, 24, 23, 21, 18, 16, + 15, 13, 13, 12, 12, 12, 32, 31, 29, 28, 26, 23, 22, 20, 17, 16, 14, 13, + 13, 13, 12, 12, 30, 30, 28, 27, 24, 21, 20, 19, 16, 15, 14, 13, 12, 13, + 12, 12, 28, 30, 28, 26, 21, 19, 18, 17, 15, 14, 13, 12, 12, 12, 12, 12, + 27, 28, 26, 25, 21, 18, 18, 16, 14, 13, 13, 12, 12, 12, 11, 11, 26, 28, + 26, 24, 20, 18, 17, 16, 14, 13, 12, 11, 11, 11, 11, 11, 23, 25, 24, 23, + 19, 16, 16, 14, 13, 12, 11, 11, 11, 11, 11, 10, 22, 23, 23, 22, 18, 16, + 15, 14, 12, 11, 11, 10, 10, 10, 10, 10, 21, 22, 22, 21, 
18, 15, 14, 13, + 12, 11, 11, 10, 10, 10, 10, 10, 19, 21, 20, 20, 17, 14, 14, 12, 11, 10, + 10, 9, 9, 10, 9, 10, 18, 19, 19, 19, 16, 14, 13, 12, 10, 10, 9, 9, 9, 9, + 9, 9, 17, 18, 18, 18, 16, 13, 13, 12, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17, + 17, 17, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8, 14, 16, 16, 16, 14, 12, + 12, 11, 9, 9, 8, 8, 8, 8, 8, 8, 13, 15, 15, 15, 13, 12, 11, 10, 9, 8, 8, + 8, 8, 8, 8, 8, 13, 14, 15, 14, 13, 11, 11, 10, 9, 8, 8, 7, 7, 7, 7, 8, + 12, 14, 14, 14, 13, 11, 11, 10, 8, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, + 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 10, 9, 8, + 8, 7, 7, 7, 7, 7, 6, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6, + 6, 11, 12, 12, 12, 11, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6, 10, 12, 12, + 12, 11, 11, 9, 9, 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 11, 12, 11, 10, 9, 9, + 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, + 6, 6, 10, 10, 11, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 9, 10, 10, + 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 6, 5, 5, 9, 10, 10, 10, 10, 9, 9, 8, 8, + 7, 7, 6, 6, 5, 5, 5, + /* Size 32x16 */ + 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 17, 16, + 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 33, 32, 32, 32, + 32, 31, 31, 30, 30, 28, 28, 25, 23, 22, 21, 19, 18, 17, 16, 15, 14, 14, + 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 32, 32, 32, 31, 30, 29, 29, 28, + 28, 26, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, + 12, 11, 11, 11, 10, 10, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23, + 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, + 11, 10, 28, 29, 29, 29, 28, 27, 26, 24, 21, 21, 20, 19, 18, 18, 17, 16, + 16, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 23, 24, + 25, 25, 24, 24, 23, 21, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12, + 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 22, 23, 23, 23, 23, 23, + 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, + 10, 10, 9, 9, 9, 9, 9, 9, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16, + 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, + 8, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, + 10, 9, 9, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16, + 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 7, 7, 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 11, 11, + 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, + 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, + 7, 7, 7, 6, 6, 7, 7, 6, 6, 11, 12, 12, 12, 13, 13, 13, 12, 12, 12, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, + 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, + 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 11, 11, 11, 12, 12, 12, 12, 12, 12, + 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, + 5, 10, 11, 11, 11, 11, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, + 8, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, + /* Size 4x16 */ + 33, 23, 14, 11, 32, 25, 15, 12, 32, 24, 16, 12, 31, 23, 16, 13, 30, 19, + 14, 12, 28, 18, 13, 11, 23, 16, 11, 10, 21, 14, 10, 10, 18, 13, 10, 9, + 16, 12, 9, 8, 14, 11, 8, 7, 13, 11, 8, 7, 12, 11, 8, 6, 12, 11, 8, 6, + 11, 10, 8, 6, 10, 9, 7, 6, + /* Size 16x4 */ + 33, 32, 32, 31, 30, 28, 23, 21, 18, 16, 14, 13, 12, 12, 11, 10, 23, 25, + 24, 23, 19, 18, 16, 14, 13, 12, 11, 11, 11, 11, 10, 9, 14, 15, 16, 16, + 14, 13, 11, 10, 10, 9, 8, 8, 8, 8, 8, 7, 11, 12, 12, 13, 12, 11, 10, 10, + 9, 
8, 7, 7, 6, 6, 6, 6, + /* Size 8x32 */ + 32, 32, 28, 22, 16, 13, 11, 11, 33, 32, 29, 23, 17, 14, 12, 11, 33, 32, + 29, 23, 17, 14, 12, 11, 33, 31, 29, 23, 17, 14, 12, 12, 32, 30, 28, 23, + 17, 14, 13, 12, 32, 29, 27, 23, 18, 15, 13, 12, 32, 29, 26, 22, 17, 14, + 13, 12, 30, 28, 24, 20, 16, 14, 12, 12, 28, 28, 21, 18, 15, 13, 12, 12, + 27, 26, 21, 18, 14, 13, 12, 11, 26, 26, 20, 17, 14, 12, 11, 11, 23, 24, + 19, 16, 13, 11, 11, 11, 22, 23, 18, 15, 12, 11, 10, 10, 21, 22, 18, 14, + 12, 11, 10, 10, 19, 20, 17, 14, 11, 10, 9, 9, 18, 19, 16, 13, 10, 9, 9, + 9, 17, 18, 16, 13, 10, 9, 9, 9, 16, 17, 15, 12, 10, 9, 8, 8, 14, 16, 14, + 12, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, + 12, 14, 13, 11, 8, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 7, 12, 13, 12, 10, + 8, 7, 7, 7, 11, 12, 12, 10, 8, 7, 7, 6, 11, 12, 11, 10, 9, 7, 6, 6, 10, + 12, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7, + 6, 6, 10, 11, 11, 9, 8, 7, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, 9, 10, 10, 9, + 8, 7, 6, 5, + /* Size 32x8 */ + 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 17, 16, + 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 32, 32, 32, 31, + 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 15, 14, + 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 28, 29, 29, 29, 28, 27, 26, 24, + 21, 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 11, + 11, 11, 11, 11, 10, 10, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, + 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + 9, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, + 10, 9, 9, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 15, + 14, 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 11, 12, 12, 12, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, + 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12, + 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, + 6, 6, 6, 6, 5, 5, 5 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 23, 18, 16, 23, 18, 16, 15, 18, 16, 12, 12, 16, 15, 12, 10, + /* Size 8x8 */ + 33, 27, 22, 21, 19, 17, 16, 15, 27, 22, 22, 22, 20, 19, 17, 16, 22, 22, + 19, 19, 18, 16, 16, 16, 21, 22, 19, 17, 15, 14, 14, 14, 19, 20, 18, 15, + 13, 12, 12, 12, 17, 19, 16, 14, 12, 11, 11, 11, 16, 17, 16, 14, 12, 11, + 10, 10, 15, 16, 16, 14, 12, 11, 10, 9, + /* Size 16x16 */ + 32, 34, 31, 27, 21, 21, 20, 20, 19, 17, 16, 16, 15, 15, 14, 14, 34, 33, + 29, 25, 22, 22, 22, 21, 20, 19, 18, 17, 16, 16, 15, 15, 31, 29, 26, 23, + 22, 22, 22, 22, 20, 19, 18, 18, 17, 17, 16, 15, 27, 25, 23, 22, 21, 21, + 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 21, 22, 22, 21, 19, 19, 19, 19, + 18, 18, 17, 17, 17, 16, 16, 16, 21, 22, 22, 21, 19, 19, 18, 18, 17, 17, + 16, 16, 15, 16, 15, 15, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15, 15, 14, + 14, 14, 14, 14, 20, 21, 22, 21, 19, 18, 16, 16, 15, 14, 14, 13, 14, 13, + 13, 13, 19, 20, 20, 20, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, + 17, 19, 19, 19, 18, 17, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18, + 18, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 12, 11, 16, 17, 18, 18, + 17, 16, 14, 13, 13, 12, 11, 11, 11, 11, 11, 11, 15, 16, 17, 18, 17, 15, + 14, 14, 13, 12, 11, 11, 10, 10, 10, 10, 15, 16, 17, 17, 16, 16, 14, 13, + 13, 12, 11, 11, 10, 10, 10, 10, 14, 15, 16, 17, 16, 15, 14, 13, 13, 12, + 12, 11, 10, 10, 10, 9, 14, 15, 15, 16, 16, 15, 14, 13, 13, 12, 11, 11, + 10, 10, 9, 9, + /* Size 32x32 */ + 32, 33, 34, 33, 31, 28, 27, 25, 21, 21, 21, 21, 20, 20, 20, 19, 19, 
18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 33, 33, 33, 32, + 30, 27, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 17, 17, 17, + 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 34, 33, 33, 32, 29, 26, 25, 24, + 22, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, + 16, 15, 15, 15, 15, 14, 33, 32, 32, 31, 28, 26, 25, 24, 22, 22, 23, 23, + 22, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, + 15, 15, 31, 30, 29, 28, 26, 24, 23, 23, 22, 22, 22, 23, 22, 22, 22, 21, + 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 28, 27, + 26, 26, 24, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 27, 26, 25, 25, 23, 22, + 22, 21, 21, 21, 21, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, + 18, 17, 17, 17, 17, 16, 16, 16, 25, 24, 24, 24, 23, 22, 21, 21, 20, 20, + 21, 21, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, + 17, 16, 16, 16, 21, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 17, 17, 16, 16, 16, 16, 16, 16, + 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, + 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 21, 22, 22, 23, + 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 16, 15, 16, 16, 15, 15, 15, 15, 15, 21, 22, 23, 23, 23, 23, 22, 21, + 19, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 14, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, + 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, + 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, + 21, 22, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, + 14, 14, 13, 13, 14, 13, 13, 14, 13, 13, 13, 14, 19, 20, 20, 21, 21, 21, + 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 19, 19, 20, 20, 20, 21, 20, 20, 18, 18, + 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 17, 18, 19, 19, 19, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 17, 18, 18, + 19, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12, + 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18, + 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, + 11, 11, 12, 11, 11, 12, 16, 17, 17, 18, 18, 19, 18, 18, 17, 16, 16, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 16, 16, 17, 17, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, + 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, + 17, 17, 17, 18, 18, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, + 11, 11, 11, 11, 11, 10, 10, 11, 11, 11, 11, 10, 15, 16, 16, 17, 17, 17, + 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, + 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 17, 16, + 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 14, 14, + 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 14, 15, 15, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 
12, + 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 14, 15, 15, 16, + 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 14, 15, 15, 16, 16, 16, 16, 16, + 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, + 10, 10, 10, 9, 9, 9, 14, 15, 15, 15, 15, 16, 16, 16, 16, 15, 15, 15, 14, + 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, + 14, 14, 14, 15, 15, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, + 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + /* Size 4x8 */ + 33, 22, 18, 16, 26, 23, 20, 17, 22, 19, 17, 16, 22, 17, 15, 14, 20, 16, + 13, 13, 17, 15, 12, 11, 16, 16, 12, 10, 16, 15, 12, 10, + /* Size 8x4 */ + 33, 26, 22, 22, 20, 17, 16, 16, 22, 23, 19, 17, 16, 15, 16, 15, 18, 20, + 17, 15, 13, 12, 12, 12, 16, 17, 16, 14, 13, 11, 10, 10, + /* Size 8x16 */ + 32, 29, 21, 20, 18, 16, 15, 15, 34, 27, 22, 22, 20, 18, 16, 16, 31, 25, + 22, 22, 20, 18, 17, 16, 26, 22, 21, 22, 20, 19, 18, 17, 21, 21, 19, 19, + 18, 17, 17, 17, 21, 22, 19, 18, 17, 16, 16, 16, 20, 22, 19, 17, 16, 15, + 14, 15, 20, 22, 19, 16, 14, 14, 14, 14, 19, 21, 18, 16, 14, 13, 13, 13, + 17, 19, 18, 15, 13, 12, 12, 12, 16, 19, 17, 15, 12, 12, 11, 12, 16, 18, + 17, 14, 12, 11, 11, 11, 15, 17, 16, 14, 13, 11, 11, 11, 15, 17, 16, 14, + 13, 12, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 13, 12, + 10, 10, + /* Size 16x8 */ + 32, 34, 31, 26, 21, 21, 20, 20, 19, 17, 16, 16, 15, 15, 14, 14, 29, 27, + 25, 22, 21, 22, 22, 22, 21, 19, 19, 18, 17, 17, 16, 15, 21, 22, 22, 21, + 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 20, 22, 22, 22, 19, 18, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 18, 20, 20, 20, 18, 17, 16, 14, + 14, 13, 12, 12, 13, 13, 12, 13, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12, + 12, 11, 11, 12, 11, 12, 15, 16, 17, 18, 17, 16, 14, 14, 13, 12, 11, 11, + 11, 10, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, + 10, 10, + /* Size 16x32 */ + 32, 33, 29, 27, 21, 21, 20, 20, 18, 17, 16, 15, 15, 15, 15, 14, 33, 33, + 28, 26, 22, 22, 21, 20, 19, 18, 17, 16, 16, 16, 16, 15, 34, 32, 27, 26, + 22, 23, 22, 21, 20, 19, 18, 17, 16, 16, 16, 15, 33, 31, 27, 25, 22, 23, + 22, 21, 20, 19, 18, 17, 17, 17, 16, 16, 31, 28, 25, 23, 22, 22, 22, 22, + 20, 19, 18, 17, 17, 17, 16, 16, 28, 26, 23, 22, 22, 23, 22, 22, 20, 20, + 19, 18, 17, 17, 17, 17, 26, 25, 22, 22, 21, 22, 22, 21, 20, 19, 19, 18, + 18, 17, 17, 17, 24, 24, 22, 21, 20, 21, 20, 20, 19, 18, 18, 17, 17, 17, + 17, 17, 21, 22, 21, 21, 19, 19, 19, 19, 18, 17, 17, 16, 17, 17, 17, 17, + 21, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 22, + 22, 21, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 21, 23, 23, 22, + 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 16, 15, 20, 22, 22, 21, 19, 17, + 17, 16, 16, 15, 15, 14, 14, 15, 15, 15, 20, 22, 22, 21, 19, 17, 17, 16, + 15, 15, 14, 14, 14, 14, 15, 14, 20, 21, 22, 21, 19, 17, 16, 16, 14, 14, + 14, 13, 14, 14, 14, 14, 19, 20, 21, 20, 19, 17, 16, 15, 14, 13, 13, 13, + 13, 13, 14, 14, 19, 20, 21, 20, 18, 16, 16, 15, 14, 13, 13, 13, 13, 13, + 13, 14, 18, 20, 20, 20, 18, 16, 16, 15, 13, 13, 12, 12, 12, 13, 13, 13, + 17, 19, 19, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 13, 17, 18, + 19, 19, 17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18, 19, 18, + 17, 15, 15, 14, 12, 12, 12, 11, 11, 12, 12, 12, 16, 17, 18, 18, 17, 15, + 14, 14, 12, 12, 11, 11, 11, 11, 12, 12, 16, 17, 18, 18, 17, 15, 14, 13, + 12, 12, 11, 11, 11, 11, 11, 12, 15, 17, 17, 18, 16, 15, 14, 13, 12, 12, + 11, 11, 11, 11, 
11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 13, 12, 11, 11, + 11, 10, 11, 11, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 11, 11, 10, 10, + 10, 10, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 12, 11, 10, 10, 10, 10, + 14, 16, 16, 17, 16, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, 10, 14, 16, + 16, 17, 16, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, 10, 14, 16, 16, 16, + 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 15, + 14, 13, 13, 12, 12, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 14, 14, 13, + 13, 12, 12, 11, 11, 10, 10, 9, + /* Size 32x16 */ + 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 33, 33, 32, 31, + 28, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, + 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 29, 28, 27, 27, 25, 23, 22, 22, + 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 18, 18, 17, 17, 17, + 17, 16, 16, 16, 15, 15, 27, 26, 26, 25, 23, 22, 22, 21, 21, 21, 21, 22, + 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16, + 16, 16, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, + 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, + 23, 23, 22, 23, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, + 15, 15, 15, 15, 15, 16, 16, 15, 15, 15, 15, 14, 20, 21, 22, 22, 22, 22, + 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18, + 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 14, + 14, 13, 13, 13, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, + 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 13, 13, 13, 12, 12, 13, 13, 13, + 17, 18, 19, 19, 19, 20, 19, 18, 17, 17, 17, 16, 15, 15, 14, 13, 13, 13, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, + 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 12, 11, 11, 11, 12, 12, 15, 16, 17, 17, 17, 18, 18, 17, + 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, + 10, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, + 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, + 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 17, + 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 10, 10, 10, 10, 10, 10, 9, + /* Size 4x16 */ + 33, 21, 17, 15, 32, 23, 19, 16, 28, 22, 19, 17, 25, 22, 19, 17, 22, 19, + 17, 17, 22, 18, 17, 16, 22, 17, 15, 15, 21, 17, 14, 14, 20, 16, 13, 13, + 19, 16, 12, 12, 18, 15, 12, 12, 17, 15, 12, 11, 17, 15, 12, 10, 16, 16, + 12, 10, 16, 15, 12, 10, 15, 15, 12, 10, + /* Size 16x4 */ + 33, 32, 28, 25, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 15, 21, 23, + 22, 22, 19, 18, 17, 17, 16, 16, 15, 15, 15, 16, 15, 15, 17, 19, 19, 19, + 17, 17, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 15, 16, 17, 17, 17, 16, + 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, + /* Size 8x32 */ + 32, 29, 21, 20, 18, 16, 15, 15, 33, 28, 22, 21, 19, 17, 16, 16, 34, 27, + 22, 22, 20, 18, 16, 16, 33, 27, 22, 22, 20, 18, 17, 16, 31, 25, 22, 22, + 20, 18, 17, 16, 28, 23, 22, 22, 20, 19, 17, 17, 26, 22, 21, 22, 20, 19, + 18, 17, 24, 22, 20, 20, 19, 18, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17, + 21, 22, 19, 19, 
18, 17, 16, 16, 21, 22, 19, 18, 17, 16, 16, 16, 21, 23, + 19, 17, 16, 15, 15, 16, 20, 22, 19, 17, 16, 15, 14, 15, 20, 22, 19, 17, + 15, 14, 14, 15, 20, 22, 19, 16, 14, 14, 14, 14, 19, 21, 19, 16, 14, 13, + 13, 14, 19, 21, 18, 16, 14, 13, 13, 13, 18, 20, 18, 16, 13, 12, 12, 13, + 17, 19, 18, 15, 13, 12, 12, 12, 17, 19, 17, 15, 13, 12, 12, 12, 16, 19, + 17, 15, 12, 12, 11, 12, 16, 18, 17, 14, 12, 11, 11, 12, 16, 18, 17, 14, + 12, 11, 11, 11, 15, 17, 16, 14, 12, 11, 11, 11, 15, 17, 16, 14, 13, 11, + 11, 11, 15, 17, 16, 14, 13, 11, 10, 10, 15, 17, 16, 14, 13, 12, 10, 10, + 14, 16, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 16, + 16, 14, 13, 11, 10, 10, 14, 15, 16, 14, 13, 12, 10, 10, 14, 15, 16, 14, + 13, 12, 11, 10, + /* Size 32x8 */ + 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 29, 28, 27, 27, + 25, 23, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 18, + 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 21, 22, 22, 22, 22, 22, 21, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, + 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, + 14, 13, 13, 13, 12, 12, 12, 12, 13, 13, 13, 12, 12, 13, 13, 13, 16, 17, + 18, 18, 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 11, 12, 11, 11, 11, 12, 12, 15, 16, 16, 17, 17, 17, + 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, + 11, 10, 10, 10, 10, 10, 10, 11, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16, + 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, + 10, 10, 10, 10 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 27, 16, 12, 27, 18, 13, 11, 16, 13, 9, 8, 12, 11, 8, 6, + /* Size 8x8 */ + 32, 32, 29, 22, 18, 13, 12, 11, 32, 30, 28, 23, 19, 15, 13, 11, 29, 28, + 21, 18, 16, 13, 12, 11, 22, 23, 18, 15, 13, 11, 10, 10, 18, 19, 16, 13, + 11, 9, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 6, 11, + 11, 11, 10, 8, 7, 6, 6, + /* Size 16x16 */ + 32, 33, 33, 32, 30, 26, 23, 21, 18, 16, 14, 13, 12, 11, 10, 10, 33, 32, + 32, 32, 30, 27, 25, 22, 19, 17, 16, 14, 13, 12, 11, 10, 33, 32, 31, 30, + 28, 26, 24, 22, 19, 17, 16, 14, 13, 12, 12, 11, 32, 32, 30, 29, 28, 26, + 24, 22, 20, 18, 16, 14, 14, 13, 12, 11, 30, 30, 28, 28, 24, 22, 20, 19, + 17, 16, 15, 13, 12, 12, 12, 11, 26, 27, 26, 26, 22, 19, 18, 17, 15, 14, + 13, 12, 11, 11, 11, 10, 23, 25, 24, 24, 20, 18, 16, 15, 14, 13, 12, 11, + 11, 10, 10, 10, 21, 22, 22, 22, 19, 17, 15, 14, 13, 12, 11, 10, 10, 10, + 9, 9, 18, 19, 19, 20, 17, 15, 14, 13, 11, 11, 10, 9, 9, 9, 9, 8, 16, 17, + 17, 18, 16, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 14, 16, 16, 16, 15, + 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11, 10, 9, + 9, 8, 7, 7, 7, 7, 7, 12, 13, 13, 14, 12, 11, 11, 10, 9, 8, 8, 7, 7, 7, + 6, 6, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 10, 11, + 12, 12, 12, 11, 10, 9, 9, 8, 8, 7, 6, 6, 6, 6, 10, 10, 11, 11, 11, 10, + 10, 9, 8, 8, 7, 7, 6, 6, 6, 5, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 32, 32, 30, 30, 28, 26, 25, 23, 21, 21, 19, 18, 17, + 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 33, 32, 32, 32, + 32, 32, 32, 30, 30, 29, 27, 26, 24, 22, 22, 20, 19, 18, 17, 16, 15, 13, + 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 33, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 27, 26, 25, 23, 22, 20, 19, 19, 17, 
16, 16, 14, 14, 13, 13, 12, + 12, 12, 11, 11, 10, 10, 33, 32, 32, 32, 32, 32, 32, 31, 30, 30, 28, 27, + 25, 23, 23, 21, 19, 19, 17, 16, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11, + 11, 11, 33, 32, 32, 32, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20, + 19, 19, 17, 16, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 32, 32, + 32, 32, 31, 30, 30, 28, 28, 28, 26, 26, 24, 23, 22, 21, 19, 19, 18, 17, + 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 30, 30, + 29, 28, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, + 14, 13, 13, 12, 12, 12, 11, 11, 30, 30, 31, 31, 29, 28, 28, 26, 25, 24, + 23, 22, 22, 20, 20, 19, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, + 12, 12, 11, 11, 30, 30, 30, 30, 28, 28, 28, 25, 24, 23, 22, 21, 20, 19, + 19, 18, 17, 17, 16, 15, 15, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, + 28, 29, 30, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17, 16, 16, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 26, 27, 27, 28, + 26, 26, 26, 23, 22, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, + 12, 12, 11, 12, 11, 11, 11, 11, 10, 10, 25, 26, 26, 27, 26, 26, 25, 22, + 21, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 11, 11, 10, 10, 10, 23, 24, 25, 25, 24, 24, 24, 22, 20, 19, 18, 17, + 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, + 10, 10, 21, 22, 23, 23, 23, 23, 23, 20, 19, 18, 17, 17, 16, 15, 14, 13, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 21, 22, + 22, 23, 22, 22, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, + 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 19, 20, 20, 21, 20, 21, 21, + 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, + 9, 9, 9, 9, 9, 9, 9, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, + 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 9, 17, 18, + 19, 19, 19, 19, 19, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, + 10, 9, 9, 9, 9, 8, 9, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 18, 16, 16, + 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 15, 16, 16, 16, 16, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, + 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 14, 15, 16, 16, 16, + 16, 16, 15, 15, 14, 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, + 8, 8, 7, 8, 7, 7, 7, 13, 13, 14, 14, 14, 15, 15, 14, 13, 13, 12, 12, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, + 14, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 13, 13, 13, 12, + 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, + 12, 13, 13, 13, 13, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, + 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 11, 12, 12, 13, 13, 13, 13, 13, 12, + 12, 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, + 6, 11, 12, 12, 12, 12, 12, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, + 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, + 6, 6, 6, 6, 10, 11, 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, + 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, + 11, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, + 6, 6, 6, 6, 6, 5, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 9, + 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 10, 10, 10, 11, + 11, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 
7, 7, 7, 7, 6, 6, + 6, 6, 6, 6, 5, 5, 5, + /* Size 4x8 */ + 32, 27, 17, 12, 32, 26, 18, 13, 30, 20, 15, 12, 23, 17, 12, 10, 19, 15, + 10, 9, 14, 12, 9, 8, 12, 12, 8, 7, 11, 10, 8, 6, + /* Size 8x4 */ + 32, 32, 30, 23, 19, 14, 12, 11, 27, 26, 20, 17, 15, 12, 12, 10, 17, 18, + 15, 12, 10, 9, 8, 8, 12, 13, 12, 10, 9, 8, 7, 6, + /* Size 8x16 */ + 32, 32, 28, 23, 18, 13, 12, 11, 33, 32, 29, 25, 19, 14, 13, 12, 32, 31, + 28, 24, 19, 14, 13, 12, 32, 30, 27, 24, 20, 15, 13, 12, 30, 28, 23, 20, + 17, 14, 13, 12, 26, 26, 20, 18, 15, 12, 12, 11, 23, 24, 19, 16, 14, 11, + 11, 11, 21, 22, 18, 15, 13, 11, 10, 10, 18, 19, 16, 14, 11, 9, 9, 9, 16, + 17, 15, 13, 11, 9, 8, 8, 14, 16, 14, 12, 10, 8, 8, 8, 13, 14, 13, 11, 9, + 8, 7, 7, 12, 13, 12, 11, 9, 7, 7, 7, 11, 12, 12, 10, 9, 8, 7, 6, 10, 12, + 12, 10, 8, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, + /* Size 16x8 */ + 32, 33, 32, 32, 30, 26, 23, 21, 18, 16, 14, 13, 12, 11, 10, 10, 32, 32, + 31, 30, 28, 26, 24, 22, 19, 17, 16, 14, 13, 12, 12, 11, 28, 29, 28, 27, + 23, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 11, 23, 25, 24, 24, 20, 18, + 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 18, 19, 19, 20, 17, 15, 14, 13, + 11, 11, 10, 9, 9, 9, 8, 9, 13, 14, 14, 15, 14, 12, 11, 11, 9, 9, 8, 8, + 7, 8, 7, 7, 12, 13, 13, 13, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6, 11, + 12, 12, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, + /* Size 16x32 */ + 32, 33, 32, 32, 28, 26, 23, 19, 18, 16, 13, 13, 12, 11, 11, 11, 33, 32, + 32, 32, 29, 27, 24, 20, 19, 17, 14, 13, 12, 12, 12, 11, 33, 32, 32, 32, + 29, 27, 25, 20, 19, 17, 14, 14, 13, 12, 12, 11, 33, 32, 32, 31, 30, 28, + 25, 21, 19, 17, 14, 14, 13, 12, 12, 12, 32, 32, 31, 30, 28, 26, 24, 20, + 19, 17, 14, 14, 13, 13, 12, 12, 32, 32, 30, 30, 28, 26, 24, 21, 19, 18, + 15, 14, 13, 13, 12, 12, 32, 31, 30, 29, 27, 26, 24, 21, 20, 18, 15, 15, + 13, 13, 12, 12, 30, 30, 29, 28, 24, 23, 21, 19, 18, 16, 14, 14, 13, 13, + 13, 12, 30, 30, 28, 28, 23, 22, 20, 18, 17, 16, 14, 13, 13, 12, 12, 12, + 28, 30, 28, 27, 21, 20, 19, 17, 16, 15, 13, 13, 12, 12, 12, 12, 26, 28, + 26, 26, 20, 19, 18, 16, 15, 14, 12, 12, 12, 12, 11, 12, 26, 27, 26, 25, + 20, 19, 17, 15, 15, 14, 12, 12, 11, 11, 11, 11, 23, 25, 24, 24, 19, 18, + 16, 14, 14, 13, 11, 11, 11, 11, 11, 11, 22, 23, 23, 22, 18, 17, 16, 14, + 13, 12, 11, 11, 10, 10, 10, 10, 21, 22, 22, 22, 18, 17, 15, 13, 13, 12, + 11, 10, 10, 10, 10, 10, 19, 21, 20, 20, 17, 16, 14, 12, 12, 11, 10, 10, + 9, 9, 10, 9, 18, 19, 19, 19, 16, 15, 14, 12, 11, 11, 9, 9, 9, 9, 9, 9, + 17, 19, 19, 19, 16, 15, 14, 12, 11, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17, + 18, 15, 14, 13, 11, 11, 10, 9, 9, 8, 8, 8, 9, 15, 16, 17, 17, 14, 13, + 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 14, 16, 16, 16, 14, 13, 12, 11, 10, 9, + 8, 8, 8, 8, 8, 8, 13, 14, 14, 15, 13, 12, 11, 10, 9, 9, 8, 8, 7, 8, 8, + 7, 13, 14, 14, 14, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 7, 12, 14, 14, + 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 11, + 9, 9, 8, 7, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 12, 10, 9, 9, 8, 8, 7, 7, + 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 11, 12, + 12, 12, 12, 11, 10, 10, 9, 8, 7, 7, 7, 6, 6, 6, 10, 12, 12, 12, 12, 11, + 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10, 11, 11, 12, 11, 10, 10, 9, 9, 8, 7, + 7, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6, + 10, 11, 11, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, + /* Size 32x16 */ + 32, 33, 33, 33, 32, 32, 32, 30, 30, 28, 26, 26, 23, 22, 21, 19, 18, 17, + 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 33, 32, 32, 32, + 32, 32, 31, 30, 30, 30, 28, 27, 25, 
23, 22, 21, 19, 19, 17, 16, 16, 14, + 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 31, 30, 30, 29, + 28, 28, 26, 26, 24, 23, 22, 20, 19, 19, 17, 17, 16, 14, 14, 14, 13, 13, + 12, 12, 12, 11, 11, 11, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 26, 25, + 24, 22, 22, 20, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, + 11, 11, 28, 29, 29, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17, + 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 26, 27, + 27, 28, 26, 26, 26, 23, 22, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 13, + 13, 12, 12, 12, 11, 12, 11, 11, 11, 10, 10, 10, 23, 24, 25, 25, 24, 24, + 24, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, + 11, 10, 10, 10, 10, 10, 10, 10, 19, 20, 20, 21, 20, 21, 21, 19, 18, 17, + 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 10, 10, 9, + 9, 9, 9, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12, + 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 16, 17, 17, 17, 17, + 18, 18, 16, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 12, 12, + 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 8, 8, 7, 7, 7, 7, 8, 13, 13, + 14, 14, 14, 14, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, + 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, + 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7, + 11, 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, + 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, + 6, 6, 6, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 9, + 9, 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, + /* Size 4x16 */ + 33, 26, 16, 11, 32, 27, 17, 12, 32, 26, 17, 13, 31, 26, 18, 13, 30, 22, + 16, 12, 28, 19, 14, 12, 25, 18, 13, 11, 22, 17, 12, 10, 19, 15, 11, 9, + 17, 14, 10, 8, 16, 13, 9, 8, 14, 12, 9, 7, 13, 11, 8, 7, 12, 11, 8, 6, + 12, 11, 8, 6, 11, 10, 8, 6, + /* Size 16x4 */ + 33, 32, 32, 31, 30, 28, 25, 22, 19, 17, 16, 14, 13, 12, 12, 11, 26, 27, + 26, 26, 22, 19, 18, 17, 15, 14, 13, 12, 11, 11, 11, 10, 16, 17, 17, 18, + 16, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 11, 12, 13, 13, 12, 12, 11, + 10, 9, 8, 8, 7, 7, 6, 6, 6, + /* Size 8x32 */ + 32, 32, 28, 23, 18, 13, 12, 11, 33, 32, 29, 24, 19, 14, 12, 12, 33, 32, + 29, 25, 19, 14, 13, 12, 33, 32, 30, 25, 19, 14, 13, 12, 32, 31, 28, 24, + 19, 14, 13, 12, 32, 30, 28, 24, 19, 15, 13, 12, 32, 30, 27, 24, 20, 15, + 13, 12, 30, 29, 24, 21, 18, 14, 13, 13, 30, 28, 23, 20, 17, 14, 13, 12, + 28, 28, 21, 19, 16, 13, 12, 12, 26, 26, 20, 18, 15, 12, 12, 11, 26, 26, + 20, 17, 15, 12, 11, 11, 23, 24, 19, 16, 14, 11, 11, 11, 22, 23, 18, 16, + 13, 11, 10, 10, 21, 22, 18, 15, 13, 11, 10, 10, 19, 20, 17, 14, 12, 10, + 9, 10, 18, 19, 16, 14, 11, 9, 9, 9, 17, 19, 16, 14, 11, 9, 9, 9, 16, 17, + 15, 13, 11, 9, 8, 8, 15, 17, 14, 12, 10, 8, 8, 8, 14, 16, 14, 12, 10, 8, + 8, 8, 13, 14, 13, 11, 9, 8, 7, 8, 13, 14, 13, 11, 9, 8, 7, 7, 12, 14, + 13, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 7, 7, 7, 11, 13, 12, 10, 9, 8, 7, + 6, 11, 12, 12, 10, 9, 8, 7, 6, 11, 12, 12, 10, 9, 7, 7, 6, 10, 12, 12, + 10, 8, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, + 10, 11, 11, 10, 9, 8, 7, 6, + /* Size 32x8 */ + 32, 33, 33, 33, 32, 32, 32, 30, 30, 28, 26, 26, 23, 22, 21, 19, 18, 17, + 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 32, 32, 32, 32, + 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 
19, 17, 17, 16, 14, + 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 28, 29, 29, 30, 28, 28, 27, 24, + 23, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, + 12, 12, 12, 11, 11, 11, 23, 24, 25, 25, 24, 24, 24, 21, 20, 19, 18, 17, + 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, + 10, 10, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12, + 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 13, 14, 14, 14, 14, + 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, + 8, 8, 7, 7, 7, 7, 8, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11, + 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12, + 12, 12, 12, 12, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, + 7, 7, 6, 6, 6, 6, 6, 6, 6 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 23, 19, 16, 23, 19, 17, 15, 19, 17, 13, 12, 16, 15, 12, 10, + /* Size 8x8 */ + 33, 28, 22, 21, 20, 17, 16, 15, 28, 24, 22, 22, 21, 19, 17, 16, 22, 22, + 19, 19, 19, 17, 16, 16, 21, 22, 19, 17, 16, 15, 14, 14, 20, 21, 19, 16, + 14, 13, 13, 13, 17, 19, 17, 15, 13, 12, 12, 12, 16, 17, 16, 14, 13, 12, + 11, 10, 15, 16, 16, 14, 13, 12, 10, 10, + /* Size 16x16 */ + 32, 34, 31, 28, 23, 21, 21, 20, 19, 18, 17, 16, 15, 15, 15, 14, 34, 33, + 29, 26, 23, 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 15, 31, 29, 26, 24, + 22, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 28, 26, 24, 22, 22, 22, + 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 23, 23, 22, 22, 20, 20, 20, 20, + 19, 19, 18, 17, 17, 17, 16, 17, 21, 22, 22, 22, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 16, 16, 16, 21, 22, 23, 23, 20, 19, 18, 17, 17, 16, 16, 15, + 15, 15, 15, 15, 20, 22, 22, 22, 20, 18, 17, 17, 16, 15, 15, 14, 14, 14, + 14, 14, 19, 20, 21, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, + 18, 19, 20, 20, 19, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 17, 19, + 19, 20, 18, 17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 16, 17, 18, 19, + 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, + 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 16, 17, 18, 17, 16, 15, 14, + 13, 12, 12, 11, 11, 10, 10, 10, 15, 16, 16, 17, 16, 16, 15, 14, 13, 12, + 12, 11, 11, 10, 10, 10, 14, 15, 16, 16, 17, 16, 15, 14, 13, 12, 12, 11, + 11, 10, 10, 10, + /* Size 32x32 */ + 32, 33, 34, 34, 31, 29, 28, 25, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19, + 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 14, 33, 33, 33, 33, + 30, 28, 27, 24, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 18, 18, 17, + 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 34, 33, 33, 33, 29, 28, 26, 24, + 23, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 34, 33, 33, 32, 29, 28, 26, 24, 23, 22, 23, 23, + 23, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 16, 31, 30, 29, 29, 26, 25, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22, + 21, 21, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 29, 28, + 28, 28, 25, 24, 23, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 28, 27, 26, 26, 24, 23, + 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, + 18, 18, 18, 17, 17, 17, 16, 16, 25, 24, 24, 24, 23, 22, 22, 21, 21, 20, + 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, + 17, 17, 17, 17, 23, 23, 23, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, + 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 17, 17, 17, + 21, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 18, 18, 18, 17, 17, 17, 16, 
16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, + 22, 22, 22, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21, + 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 21, 20, 19, 19, 18, + 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 20, 21, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 21, + 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, + 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22, + 22, 20, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, + 13, 13, 14, 13, 13, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, + 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 20, 19, 19, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 18, 19, 19, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, + 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 17, 18, 19, 19, + 19, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 19, 19, 19, 19, 20, 19, + 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, + 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 12, 11, 12, 11, 12, + 12, 12, 16, 17, 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, + 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, + 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, + 18, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, + 10, 10, 11, 10, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, + 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, + 15, 16, 16, 16, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 15, 15, 16, 16, + 16, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, + 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 17, + 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, + 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15, + 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, + 10, 10, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, + /* Size 4x8 */ + 33, 22, 19, 16, 27, 22, 20, 17, 22, 19, 18, 17, 22, 18, 16, 14, 20, 17, + 14, 13, 18, 16, 12, 12, 17, 16, 12, 11, 16, 15, 12, 10, + /* Size 8x4 */ + 33, 27, 22, 22, 20, 18, 17, 16, 22, 22, 19, 18, 17, 16, 16, 15, 19, 20, + 18, 16, 14, 12, 12, 12, 16, 17, 17, 14, 13, 12, 11, 10, + /* Size 8x16 */ + 32, 30, 21, 21, 19, 16, 15, 15, 33, 28, 22, 22, 20, 18, 17, 16, 31, 26, + 22, 22, 21, 18, 17, 17, 28, 23, 22, 23, 21, 19, 18, 17, 23, 22, 20, 20, + 19, 17, 17, 17, 21, 22, 19, 18, 18, 16, 16, 16, 21, 23, 19, 18, 17, 15, + 15, 15, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 15, 13, 13, 13, + 18, 20, 18, 16, 14, 
12, 12, 13, 17, 19, 18, 16, 14, 12, 12, 12, 16, 18, + 17, 15, 13, 12, 11, 12, 16, 17, 16, 15, 13, 11, 11, 11, 15, 17, 16, 14, + 13, 12, 11, 10, 15, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12, + 11, 10, + /* Size 16x8 */ + 32, 33, 31, 28, 23, 21, 21, 20, 19, 18, 17, 16, 16, 15, 15, 14, 30, 28, + 26, 23, 22, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 21, 22, 22, 22, + 20, 19, 19, 19, 19, 18, 18, 17, 16, 16, 16, 16, 21, 22, 22, 23, 20, 18, + 18, 17, 17, 16, 16, 15, 15, 14, 15, 15, 19, 20, 21, 21, 19, 18, 17, 16, + 15, 14, 14, 13, 13, 13, 13, 13, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12, + 12, 12, 11, 12, 12, 12, 15, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11, + 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 10, + 10, 10, + /* Size 16x32 */ + 32, 33, 30, 28, 21, 21, 21, 20, 19, 18, 16, 16, 15, 15, 15, 15, 33, 33, + 29, 27, 22, 22, 22, 20, 20, 19, 17, 17, 16, 16, 16, 16, 33, 32, 28, 26, + 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 16, 34, 32, 28, 26, 22, 23, + 23, 21, 21, 20, 18, 18, 17, 17, 17, 16, 31, 28, 26, 24, 22, 22, 22, 22, + 21, 20, 18, 18, 17, 17, 17, 16, 29, 27, 24, 23, 22, 22, 23, 22, 21, 20, + 19, 18, 18, 17, 17, 17, 28, 26, 23, 22, 22, 22, 23, 22, 21, 20, 19, 19, + 18, 18, 17, 17, 24, 24, 23, 22, 20, 20, 21, 20, 20, 19, 18, 18, 17, 18, + 17, 17, 23, 23, 22, 22, 20, 20, 20, 20, 19, 19, 17, 17, 17, 17, 17, 17, + 21, 22, 22, 21, 19, 19, 19, 19, 19, 18, 17, 17, 16, 17, 17, 16, 21, 22, + 22, 22, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 16, 16, 21, 23, 22, 22, + 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 21, 23, 23, 22, 19, 18, + 18, 17, 17, 16, 15, 15, 15, 15, 15, 16, 20, 22, 22, 22, 19, 18, 17, 16, + 16, 16, 15, 14, 15, 14, 15, 15, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15, + 14, 14, 14, 14, 14, 15, 20, 21, 22, 22, 19, 18, 17, 16, 15, 14, 14, 14, + 13, 14, 14, 14, 19, 21, 21, 21, 19, 18, 17, 15, 15, 14, 13, 13, 13, 13, + 13, 14, 19, 20, 21, 21, 19, 17, 17, 15, 15, 14, 13, 13, 13, 13, 13, 13, + 18, 20, 20, 20, 18, 17, 16, 15, 14, 13, 12, 12, 12, 12, 13, 13, 17, 19, + 20, 20, 18, 17, 16, 14, 14, 13, 12, 12, 12, 12, 12, 12, 17, 19, 19, 20, + 18, 17, 16, 14, 14, 13, 12, 12, 12, 12, 12, 12, 16, 18, 18, 19, 17, 16, + 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 16, 18, 18, 19, 17, 16, 15, 14, + 13, 12, 12, 11, 11, 11, 12, 12, 16, 17, 18, 18, 17, 16, 15, 14, 13, 12, + 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 16, 16, 15, 13, 13, 12, 11, 11, + 11, 11, 11, 11, 15, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11, + 11, 11, 15, 17, 17, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 11, 10, 11, + 15, 16, 17, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 10, 10, 10, 15, 16, + 16, 17, 16, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 14, 16, 16, 17, + 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 16, 16, 17, 16, 15, + 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 16, 16, 16, 16, 15, 15, 13, + 13, 12, 12, 11, 11, 10, 10, 10, + /* Size 32x16 */ + 32, 33, 33, 34, 31, 29, 28, 24, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19, + 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 33, 33, 32, 32, + 28, 27, 26, 24, 23, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 18, + 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 30, 29, 28, 28, 26, 24, 23, 23, + 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17, + 17, 17, 16, 16, 16, 16, 28, 27, 26, 26, 24, 23, 22, 22, 22, 21, 22, 22, + 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, + 17, 16, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 
16, 21, 22, + 22, 23, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, + 17, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 21, 22, 22, 23, 22, 23, + 23, 21, 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, + 15, 15, 14, 14, 15, 15, 15, 15, 20, 20, 21, 21, 22, 22, 22, 20, 20, 19, + 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 14, 14, 14, + 14, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, + 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 18, 19, 19, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, + 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 16, 17, 18, 18, + 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, + 12, 11, 11, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 18, + 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16, + 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 15, 16, + 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 16, 17, + 17, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 10, 10, 10, 10, 10, + /* Size 4x16 */ + 33, 21, 18, 15, 32, 22, 19, 16, 28, 22, 20, 17, 26, 22, 20, 18, 23, 20, + 19, 17, 22, 19, 17, 16, 23, 18, 16, 15, 22, 18, 15, 14, 21, 18, 14, 13, + 20, 17, 13, 12, 19, 17, 13, 12, 18, 16, 12, 11, 17, 16, 12, 11, 17, 16, + 12, 11, 16, 16, 13, 10, 16, 15, 12, 10, + /* Size 16x4 */ + 33, 32, 28, 26, 23, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 21, 22, + 22, 22, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 18, 19, 20, 20, + 19, 17, 16, 15, 14, 13, 13, 12, 12, 12, 13, 12, 15, 16, 17, 18, 17, 16, + 15, 14, 13, 12, 12, 11, 11, 11, 10, 10, + /* Size 8x32 */ + 32, 30, 21, 21, 19, 16, 15, 15, 33, 29, 22, 22, 20, 17, 16, 16, 33, 28, + 22, 22, 20, 18, 17, 16, 34, 28, 22, 23, 21, 18, 17, 17, 31, 26, 22, 22, + 21, 18, 17, 17, 29, 24, 22, 23, 21, 19, 18, 17, 28, 23, 22, 23, 21, 19, + 18, 17, 24, 23, 20, 21, 20, 18, 17, 17, 23, 22, 20, 20, 19, 17, 17, 17, + 21, 22, 19, 19, 19, 17, 16, 17, 21, 22, 19, 18, 18, 16, 16, 16, 21, 22, + 19, 18, 17, 16, 16, 16, 21, 23, 19, 18, 17, 15, 15, 15, 20, 22, 19, 17, + 16, 15, 15, 15, 20, 22, 19, 17, 16, 14, 14, 14, 20, 22, 19, 17, 15, 14, + 13, 14, 19, 21, 19, 17, 15, 13, 13, 13, 19, 21, 19, 17, 15, 13, 13, 13, + 18, 20, 18, 16, 14, 12, 12, 13, 17, 20, 18, 16, 14, 12, 12, 12, 17, 19, + 18, 16, 14, 12, 12, 12, 16, 18, 17, 15, 13, 12, 11, 12, 16, 18, 17, 15, + 13, 12, 11, 12, 16, 18, 17, 15, 13, 11, 11, 11, 16, 17, 16, 15, 13, 11, + 11, 11, 15, 17, 16, 15, 13, 12, 11, 11, 15, 17, 16, 14, 13, 12, 11, 10, + 15, 17, 16, 14, 13, 12, 11, 10, 15, 16, 16, 15, 13, 12, 11, 10, 14, 16, + 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, + 13, 12, 11, 10, + /* Size 32x8 */ + 32, 33, 33, 34, 31, 29, 28, 24, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19, + 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 30, 29, 28, 28, + 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 18, + 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 22, 22, 22, 20, + 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, + 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 23, 23, 21, 20, 19, 18, 18, + 
18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15, + 15, 15, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, + 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 16, 17, + 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 12, 12, 11, 11, 12, 12, 12, 12, 12, 12, 12, 15, 16, 17, 17, 17, 18, + 18, 17, 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, + 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, + 10, 10, 10, 10 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 28, 18, 13, 28, 19, 14, 11, 18, 14, 10, 8, 13, 11, 8, 7, + /* Size 8x8 */ + 32, 32, 29, 24, 19, 15, 13, 11, 32, 31, 28, 24, 20, 16, 14, 12, 29, 28, + 22, 20, 17, 14, 13, 12, 24, 24, 20, 16, 14, 12, 11, 10, 19, 20, 17, 14, + 12, 10, 9, 9, 15, 16, 14, 12, 10, 9, 8, 8, 13, 14, 13, 11, 9, 8, 7, 7, + 11, 12, 12, 10, 9, 8, 7, 6, + /* Size 16x16 */ + 32, 33, 33, 32, 30, 28, 25, 22, 19, 17, 16, 14, 12, 12, 11, 11, 33, 32, + 32, 32, 30, 29, 26, 23, 20, 19, 17, 15, 13, 13, 12, 11, 33, 32, 31, 31, + 29, 28, 26, 23, 21, 19, 17, 15, 14, 13, 12, 12, 32, 32, 31, 29, 28, 27, + 25, 23, 21, 19, 18, 16, 14, 14, 13, 12, 30, 30, 29, 28, 26, 24, 22, 20, + 19, 18, 16, 15, 13, 13, 12, 12, 28, 29, 28, 27, 24, 21, 20, 18, 17, 16, + 15, 14, 13, 12, 11, 11, 25, 26, 26, 25, 22, 20, 18, 17, 15, 14, 14, 12, + 12, 11, 11, 11, 22, 23, 23, 23, 20, 18, 17, 15, 14, 13, 12, 11, 11, 10, + 10, 10, 19, 20, 21, 21, 19, 17, 15, 14, 12, 12, 11, 10, 10, 9, 9, 9, 17, + 19, 19, 19, 18, 16, 14, 13, 12, 11, 10, 10, 9, 9, 9, 8, 16, 17, 17, 18, + 16, 15, 14, 12, 11, 10, 10, 9, 9, 8, 8, 8, 14, 15, 15, 16, 15, 14, 12, + 11, 10, 10, 9, 8, 8, 8, 7, 7, 12, 13, 14, 14, 13, 13, 12, 11, 10, 9, 9, + 8, 7, 7, 7, 7, 12, 13, 13, 14, 13, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, + 11, 12, 12, 13, 12, 11, 11, 10, 9, 9, 8, 7, 7, 7, 6, 6, 11, 11, 12, 12, + 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 18, + 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 11, 11, 10, 33, 32, 32, 32, + 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 20, 19, 18, 17, 17, 15, + 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 29, 27, 26, 24, 23, 23, 20, 20, 19, 17, 17, 15, 15, 14, 13, 13, + 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, + 27, 25, 23, 23, 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, + 12, 11, 33, 32, 32, 32, 31, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 23, + 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 33, 32, + 32, 32, 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, + 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 32, 32, 32, 32, 31, 30, + 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 18, 16, 16, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 32, 31, 31, 31, 30, 30, 28, 28, 27, 26, + 26, 24, 24, 23, 22, 22, 20, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, + 13, 12, 12, 12, 30, 30, 30, 31, 29, 29, 28, 27, 26, 24, 24, 23, 22, 22, + 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12, 12, + 29, 29, 30, 30, 28, 28, 27, 26, 24, 22, 22, 21, 20, 20, 19, 19, 17, 17, + 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 28, 29, 29, 30, + 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14, + 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 26, 27, 27, 28, 26, 26, 26, 24, + 23, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 14, 
14, 13, 13, 12, 12, 12, + 11, 11, 11, 11, 11, 11, 25, 26, 26, 27, 26, 26, 25, 24, 22, 20, 20, 19, + 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 11, + 11, 10, 23, 24, 24, 25, 24, 24, 24, 23, 22, 20, 19, 18, 17, 16, 16, 15, + 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 22, 23, + 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, + 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 21, 22, 23, 23, 23, 23, + 22, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, + 10, 10, 10, 10, 10, 10, 9, 9, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, + 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, + 9, 9, 9, 18, 19, 20, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 14, 13, 13, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 17, 18, 19, 19, + 19, 19, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, + 10, 9, 9, 9, 9, 9, 9, 8, 8, 9, 16, 17, 17, 18, 18, 18, 18, 17, 17, 16, + 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, + 8, 8, 16, 17, 17, 17, 17, 17, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, + 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, + 16, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, + 8, 8, 8, 8, 8, 8, 8, 7, 14, 14, 15, 15, 15, 15, 16, 15, 15, 14, 14, 13, + 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 13, + 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, + 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, + 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, + 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, + 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, + 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, + 7, 7, 6, 6, 6, 11, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, + 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12, + 12, 12, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, + 7, 7, 7, 7, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, + 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 11, + 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, + 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 12, 12, 12, 12, 11, + 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, + /* Size 4x8 */ + 32, 29, 17, 12, 32, 28, 18, 13, 30, 22, 16, 12, 25, 19, 13, 11, 20, 17, + 11, 9, 16, 14, 9, 8, 14, 13, 9, 7, 12, 11, 9, 7, + /* Size 8x4 */ + 32, 32, 30, 25, 20, 16, 14, 12, 29, 28, 22, 19, 17, 14, 13, 11, 17, 18, + 16, 13, 11, 9, 9, 9, 12, 13, 12, 11, 9, 8, 7, 7, + /* Size 8x16 */ + 32, 33, 29, 23, 19, 16, 12, 11, 33, 32, 30, 25, 20, 17, 13, 12, 33, 31, + 29, 24, 21, 17, 14, 13, 32, 30, 28, 24, 21, 18, 14, 13, 30, 29, 25, 21, + 19, 16, 13, 13, 28, 28, 22, 19, 17, 15, 13, 12, 25, 26, 21, 17, 15, 13, + 12, 11, 22, 23, 19, 16, 14, 12, 11, 10, 19, 20, 18, 14, 12, 11, 10, 9, + 18, 19, 17, 14, 12, 10, 9, 9, 16, 17, 16, 13, 11, 10, 9, 8, 14, 15, 14, + 12, 10, 9, 8, 8, 12, 14, 13, 11, 10, 9, 7, 7, 12, 13, 12, 11, 9, 8, 7, + 7, 11, 12, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 6, + /* Size 16x8 */ + 32, 33, 33, 32, 30, 28, 25, 22, 19, 18, 16, 14, 12, 12, 11, 11, 33, 32, + 31, 30, 29, 28, 26, 23, 20, 19, 17, 15, 14, 13, 12, 12, 29, 30, 29, 28, + 25, 22, 21, 19, 18, 17, 16, 14, 13, 12, 12, 12, 23, 25, 24, 24, 21, 19, + 17, 16, 14, 14, 13, 12, 11, 11, 11, 11, 19, 20, 
21, 21, 19, 17, 15, 14, + 12, 12, 11, 10, 10, 9, 9, 9, 16, 17, 17, 18, 16, 15, 13, 12, 11, 10, 10, + 9, 9, 8, 8, 8, 12, 13, 14, 14, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, + 11, 12, 13, 13, 13, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, + /* Size 16x32 */ + 32, 33, 33, 32, 29, 28, 23, 22, 19, 17, 16, 13, 12, 12, 11, 11, 33, 32, + 32, 32, 29, 29, 24, 23, 20, 17, 17, 14, 13, 12, 12, 12, 33, 32, 32, 32, + 30, 29, 25, 23, 20, 18, 17, 14, 13, 12, 12, 12, 33, 32, 32, 31, 30, 30, + 25, 23, 21, 18, 17, 14, 14, 13, 12, 12, 33, 32, 31, 30, 29, 28, 24, 23, + 21, 18, 17, 14, 14, 13, 13, 12, 32, 32, 31, 30, 28, 28, 24, 23, 20, 18, + 17, 14, 14, 13, 13, 12, 32, 31, 30, 29, 28, 27, 24, 23, 21, 18, 18, 15, + 14, 13, 13, 12, 32, 31, 30, 28, 26, 26, 23, 22, 20, 18, 17, 14, 14, 13, + 13, 13, 30, 30, 29, 28, 25, 24, 21, 20, 19, 17, 16, 14, 13, 13, 13, 13, + 29, 30, 28, 27, 23, 22, 20, 19, 17, 16, 15, 13, 13, 12, 12, 12, 28, 30, + 28, 27, 22, 21, 19, 18, 17, 16, 15, 13, 13, 12, 12, 12, 26, 28, 26, 26, + 21, 20, 18, 17, 16, 14, 14, 12, 12, 12, 12, 11, 25, 26, 26, 25, 21, 20, + 17, 17, 15, 14, 13, 12, 12, 11, 11, 11, 23, 25, 24, 24, 20, 19, 16, 16, + 14, 13, 13, 11, 11, 11, 11, 11, 22, 23, 23, 23, 19, 18, 16, 15, 14, 12, + 12, 11, 11, 10, 10, 10, 21, 23, 23, 22, 19, 18, 15, 15, 13, 12, 12, 11, + 10, 10, 10, 10, 19, 21, 20, 20, 18, 17, 14, 14, 12, 11, 11, 10, 10, 10, + 9, 10, 19, 20, 20, 20, 17, 17, 14, 13, 12, 11, 11, 10, 9, 9, 9, 9, 18, + 19, 19, 19, 17, 16, 14, 13, 12, 11, 10, 9, 9, 9, 9, 9, 16, 18, 18, 18, + 16, 15, 13, 12, 11, 10, 10, 9, 9, 9, 9, 8, 16, 17, 17, 18, 16, 15, 13, + 12, 11, 10, 10, 9, 9, 8, 8, 8, 14, 16, 16, 16, 14, 14, 12, 12, 11, 9, 9, + 8, 8, 8, 8, 8, 14, 15, 15, 16, 14, 14, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8, + 13, 14, 14, 15, 13, 13, 11, 11, 10, 9, 9, 8, 8, 7, 7, 7, 12, 14, 14, 14, + 13, 13, 11, 11, 10, 9, 9, 8, 7, 7, 7, 7, 12, 14, 14, 14, 13, 13, 11, 11, + 10, 9, 8, 8, 7, 7, 7, 7, 12, 13, 13, 13, 12, 12, 11, 10, 9, 9, 8, 7, 7, + 7, 7, 7, 12, 12, 13, 13, 12, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 6, 11, 12, + 12, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, 11, 12, 12, 12, 12, 11, + 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 11, 12, 12, 12, 12, 11, 11, 10, 9, 8, 8, + 7, 7, 6, 6, 6, 10, 11, 11, 12, 12, 11, 11, 9, 9, 8, 8, 7, 7, 6, 6, 6, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 19, + 18, 16, 16, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 10, 33, 32, 32, 32, + 32, 32, 31, 31, 30, 30, 30, 28, 26, 25, 23, 23, 21, 20, 19, 18, 17, 16, + 15, 14, 14, 14, 13, 12, 12, 12, 12, 11, 33, 32, 32, 32, 31, 31, 30, 30, + 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, 17, 16, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 11, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, + 25, 24, 23, 22, 20, 20, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 13, 12, + 12, 12, 29, 29, 30, 30, 29, 28, 28, 26, 25, 23, 22, 21, 21, 20, 19, 19, + 18, 17, 17, 16, 16, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 28, 29, + 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 17, 17, 16, 15, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 23, 24, 25, 25, 24, 24, + 24, 23, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, + 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, + 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14, + 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 17, + 17, 18, 18, 18, 18, 18, 18, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 11, + 10, 10, 9, 9, 9, 9, 9, 9, 9, 
9, 9, 8, 8, 16, 17, 17, 17, 17, 17, 18, 17, + 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, + 8, 8, 8, 8, 13, 14, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 8, 8, 7, 7, 12, 13, 13, 14, + 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, + 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 13, 13, 12, 12, 12, + 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 11, 12, + 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, + 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, 13, 12, + 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, + /* Size 4x16 */ + 33, 28, 17, 12, 32, 29, 18, 12, 32, 28, 18, 13, 31, 27, 18, 13, 30, 24, + 17, 13, 30, 21, 16, 12, 26, 20, 14, 11, 23, 18, 12, 10, 21, 17, 11, 10, + 19, 16, 11, 9, 17, 15, 10, 8, 15, 14, 9, 8, 14, 13, 9, 7, 13, 12, 9, 7, + 12, 12, 9, 7, 12, 11, 8, 6, + /* Size 16x4 */ + 33, 32, 32, 31, 30, 30, 26, 23, 21, 19, 17, 15, 14, 13, 12, 12, 28, 29, + 28, 27, 24, 21, 20, 18, 17, 16, 15, 14, 13, 12, 12, 11, 17, 18, 18, 18, + 17, 16, 14, 12, 11, 11, 10, 9, 9, 9, 9, 8, 12, 12, 13, 13, 13, 12, 11, + 10, 10, 9, 8, 8, 7, 7, 7, 6, + /* Size 8x32 */ + 32, 33, 29, 23, 19, 16, 12, 11, 33, 32, 29, 24, 20, 17, 13, 12, 33, 32, + 30, 25, 20, 17, 13, 12, 33, 32, 30, 25, 21, 17, 14, 12, 33, 31, 29, 24, + 21, 17, 14, 13, 32, 31, 28, 24, 20, 17, 14, 13, 32, 30, 28, 24, 21, 18, + 14, 13, 32, 30, 26, 23, 20, 17, 14, 13, 30, 29, 25, 21, 19, 16, 13, 13, + 29, 28, 23, 20, 17, 15, 13, 12, 28, 28, 22, 19, 17, 15, 13, 12, 26, 26, + 21, 18, 16, 14, 12, 12, 25, 26, 21, 17, 15, 13, 12, 11, 23, 24, 20, 16, + 14, 13, 11, 11, 22, 23, 19, 16, 14, 12, 11, 10, 21, 23, 19, 15, 13, 12, + 10, 10, 19, 20, 18, 14, 12, 11, 10, 9, 19, 20, 17, 14, 12, 11, 9, 9, 18, + 19, 17, 14, 12, 10, 9, 9, 16, 18, 16, 13, 11, 10, 9, 9, 16, 17, 16, 13, + 11, 10, 9, 8, 14, 16, 14, 12, 11, 9, 8, 8, 14, 15, 14, 12, 10, 9, 8, 8, + 13, 14, 13, 11, 10, 9, 8, 7, 12, 14, 13, 11, 10, 9, 7, 7, 12, 14, 13, + 11, 10, 8, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7, + 11, 12, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 6, 11, 12, 12, 11, + 9, 8, 7, 6, 10, 11, 12, 11, 9, 8, 7, 6, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 19, + 18, 16, 16, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 10, 33, 32, 32, 32, + 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, 17, 16, + 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 29, 29, 30, 30, 29, 28, 28, 26, + 25, 23, 22, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 12, 12, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18, + 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14, 14, 13, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17, + 17, 17, 17, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, + 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13, + 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, + 11, 12, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, + 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 23, 20, 17, 23, 19, 17, 16, 20, 17, 14, 13, 17, 16, 13, 11, + /* Size 8x8 */ + 33, 30, 22, 22, 20, 18, 17, 16, 30, 26, 22, 23, 21, 19, 18, 17, 22, 22, + 20, 20, 19, 18, 17, 17, 22, 23, 20, 18, 17, 16, 15, 15, 20, 21, 
19, 17, + 15, 14, 13, 13, 18, 19, 18, 16, 14, 12, 12, 12, 17, 18, 17, 15, 13, 12, + 11, 11, 16, 17, 17, 15, 13, 12, 11, 10, + /* Size 16x16 */ + 32, 33, 31, 28, 25, 21, 21, 20, 20, 19, 18, 17, 16, 15, 15, 15, 33, 33, + 30, 26, 24, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 31, 30, 28, 24, + 23, 22, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 28, 26, 24, 22, 22, 21, + 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 25, 24, 23, 22, 21, 20, 21, 20, + 20, 20, 19, 18, 18, 17, 17, 17, 21, 22, 22, 21, 20, 19, 19, 19, 19, 19, + 18, 17, 17, 16, 16, 16, 21, 22, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16, + 16, 15, 15, 15, 20, 22, 22, 22, 20, 19, 18, 17, 16, 16, 16, 15, 15, 14, + 14, 14, 20, 21, 22, 22, 20, 19, 17, 16, 16, 15, 15, 14, 14, 13, 14, 14, + 19, 20, 21, 21, 20, 19, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 19, + 20, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 17, 18, 19, 19, + 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 16, 17, 18, 19, 18, 17, + 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, 15, 14, + 13, 13, 12, 12, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 14, 13, + 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 14, 13, 12, 12, + 11, 11, 10, 10, + /* Size 32x32 */ + 32, 33, 33, 34, 31, 31, 28, 27, 25, 22, 21, 21, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 33, 33, 33, 33, + 30, 30, 27, 26, 24, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, + 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 33, 33, 33, 33, 30, 29, 26, 26, + 24, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, + 17, 16, 16, 16, 16, 15, 34, 33, 33, 32, 30, 29, 26, 25, 24, 23, 22, 23, + 23, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, + 16, 16, 31, 30, 30, 30, 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, + 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 31, 30, + 29, 29, 27, 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, + 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 28, 27, 26, 26, 24, 24, + 22, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, + 19, 19, 18, 18, 17, 17, 17, 17, 27, 26, 26, 25, 24, 23, 22, 22, 21, 21, + 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, 18, + 18, 17, 17, 17, 25, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 21, + 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, + 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 21, 22, 22, 22, + 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21, + 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, + 16, 16, 16, 16, 16, 15, 21, 22, 22, 23, 22, 22, 22, 22, 21, 20, 19, 19, + 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, + 15, 15, 21, 22, 22, 23, 23, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 17, + 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 21, + 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, + 16, 15, 15, 15, 15, 14, 14, 15, 14, 14, 14, 15, 20, 21, 22, 22, 22, 22, + 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22, 22, 21, 20, 19, + 19, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 14, + 14, 13, 14, 14, 19, 20, 20, 21, 21, 21, 22, 21, 20, 19, 19, 18, 17, 17, + 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 
13, 13, 13, 13, 13, 13, 13, 13, + 19, 20, 20, 21, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, + 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 19, 20, + 20, 20, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, + 13, 13, 12, 12, 12, 13, 12, 13, 13, 12, 18, 19, 19, 20, 20, 20, 20, 20, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 20, 19, 19, 18, 18, 17, + 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 17, 18, 18, 19, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, + 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, + 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 12, 11, 16, 17, 17, 18, 18, 18, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17, + 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 15, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15, + 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, + 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17, + 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 15, 16, 16, 17, 17, 17, 17, 17, + 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, 11, + 11, 10, 10, 10, 10, 10, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, + 10, 10, 14, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, + 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, + /* Size 4x8 */ + 33, 22, 19, 16, 28, 22, 20, 17, 22, 20, 19, 17, 23, 19, 16, 15, 21, 19, + 14, 13, 19, 18, 13, 12, 17, 17, 13, 11, 16, 16, 13, 11, + /* Size 8x4 */ + 33, 28, 22, 23, 21, 19, 17, 16, 22, 22, 20, 19, 19, 18, 17, 16, 19, 20, + 19, 16, 14, 13, 13, 13, 16, 17, 17, 15, 13, 12, 11, 11, + /* Size 8x16 */ + 32, 31, 23, 21, 20, 18, 16, 15, 33, 30, 23, 22, 21, 19, 17, 16, 31, 28, + 22, 23, 22, 20, 18, 17, 28, 24, 22, 23, 22, 20, 19, 17, 24, 23, 21, 21, + 20, 19, 18, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22, 20, 18, 17, 17, + 16, 15, 20, 22, 20, 17, 16, 16, 14, 14, 20, 22, 19, 17, 16, 14, 14, 14, + 19, 21, 19, 17, 15, 14, 13, 13, 18, 20, 19, 16, 15, 13, 12, 12, 17, 19, + 18, 16, 14, 13, 12, 12, 16, 18, 17, 15, 14, 12, 11, 11, 16, 17, 17, 15, + 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 16, 17, 15, 14, 12, + 11, 10, + /* Size 16x8 */ + 32, 33, 31, 28, 24, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 15, 31, 30, + 28, 24, 23, 22, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 23, 23, 22, 22, + 21, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 21, 22, 23, 23, 21, 19, + 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 20, 21, 22, 22, 20, 19, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 14, 18, 19, 20, 20, 19, 18, 17, 16, 14, 14, + 13, 13, 12, 12, 12, 12, 16, 17, 18, 19, 18, 17, 16, 14, 14, 13, 12, 12, + 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, + 11, 10, + /* Size 16x32 */ + 32, 33, 31, 28, 23, 21, 21, 20, 20, 18, 18, 16, 16, 15, 15, 15, 33, 33, + 30, 27, 23, 22, 22, 21, 20, 19, 19, 17, 17, 16, 16, 16, 33, 32, 30, 26, + 23, 22, 22, 22, 21, 20, 19, 17, 17, 17, 16, 16, 34, 32, 29, 26, 23, 22, + 23, 22, 21, 20, 20, 18, 18, 17, 17, 17, 31, 29, 
28, 24, 22, 22, 23, 22, + 22, 20, 20, 18, 18, 17, 17, 17, 31, 28, 27, 24, 22, 22, 22, 22, 22, 20, + 20, 18, 18, 17, 17, 17, 28, 26, 24, 22, 22, 22, 23, 22, 22, 21, 20, 19, + 19, 18, 17, 17, 26, 25, 24, 22, 21, 21, 22, 22, 21, 20, 20, 19, 18, 18, + 18, 17, 24, 24, 23, 22, 21, 20, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, + 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 18, 17, 17, 17, 17, 17, 21, 22, + 22, 21, 20, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 17, 21, 22, 22, 22, + 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 23, 22, 22, 20, 19, + 18, 18, 17, 17, 17, 16, 16, 16, 15, 16, 21, 23, 23, 22, 20, 19, 18, 17, + 17, 16, 16, 15, 15, 15, 15, 15, 20, 22, 22, 22, 20, 19, 17, 17, 16, 16, + 16, 15, 14, 15, 14, 15, 20, 22, 22, 22, 20, 19, 17, 17, 16, 16, 15, 14, + 14, 14, 14, 14, 20, 21, 22, 22, 19, 19, 17, 16, 16, 15, 14, 14, 14, 14, + 14, 14, 19, 21, 21, 21, 19, 19, 17, 16, 15, 14, 14, 13, 13, 13, 14, 13, + 19, 20, 21, 21, 19, 19, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 20, + 20, 20, 19, 18, 16, 16, 15, 14, 13, 13, 12, 13, 13, 13, 18, 20, 20, 20, + 19, 18, 16, 16, 15, 14, 13, 12, 12, 12, 12, 13, 17, 19, 19, 20, 18, 18, + 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 15, + 14, 13, 13, 12, 12, 12, 12, 12, 16, 18, 18, 19, 17, 17, 15, 15, 14, 13, + 12, 12, 11, 11, 12, 12, 16, 18, 18, 18, 17, 17, 15, 14, 14, 13, 12, 11, + 11, 11, 11, 12, 16, 17, 18, 18, 17, 17, 15, 14, 14, 13, 12, 11, 11, 11, + 11, 11, 16, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, + 15, 17, 17, 18, 17, 16, 15, 15, 13, 13, 12, 11, 11, 11, 11, 11, 15, 17, + 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 15, 16, 17, 17, + 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 16, + 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 15, 15, 14, + 14, 12, 12, 11, 11, 10, 10, 10, + /* Size 32x16 */ + 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 33, 33, 32, 32, + 29, 28, 26, 25, 24, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 19, + 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 31, 30, 30, 29, 28, 27, 24, 24, + 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, + 17, 17, 17, 17, 16, 16, 28, 27, 26, 26, 24, 24, 22, 22, 22, 21, 21, 22, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, + 17, 17, 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, + 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, + 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 21, 22, 22, 23, 23, 22, + 23, 22, 21, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, + 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 15, + 14, 14, 14, 14, 20, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 14, 14, + 18, 19, 20, 20, 20, 20, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, + 14, 14, 14, 13, 13, 13, 13, 13, 12, 13, 13, 13, 13, 12, 18, 19, 19, 20, + 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 13, + 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, + 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, + 11, 11, 12, 12, 12, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17, 17, 16, + 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 
+ 11, 11, 15, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, + 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 10, 15, 16, + 16, 17, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17, 17, 17, + 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 10, 10, 10, 10, + /* Size 4x16 */ + 33, 21, 18, 15, 32, 22, 20, 17, 29, 22, 20, 17, 26, 22, 21, 18, 24, 20, + 19, 17, 22, 19, 18, 16, 23, 19, 17, 16, 22, 19, 16, 15, 21, 19, 15, 14, + 20, 19, 14, 13, 20, 18, 14, 12, 18, 17, 13, 12, 18, 17, 13, 11, 17, 16, + 12, 11, 17, 16, 13, 11, 16, 16, 13, 11, + /* Size 16x4 */ + 33, 32, 29, 26, 24, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 16, 21, 22, + 22, 22, 20, 19, 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 18, 20, 20, 21, + 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 13, 13, 15, 17, 17, 18, 17, 16, + 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, + /* Size 8x32 */ + 32, 31, 23, 21, 20, 18, 16, 15, 33, 30, 23, 22, 20, 19, 17, 16, 33, 30, + 23, 22, 21, 19, 17, 16, 34, 29, 23, 23, 21, 20, 18, 17, 31, 28, 22, 23, + 22, 20, 18, 17, 31, 27, 22, 22, 22, 20, 18, 17, 28, 24, 22, 23, 22, 20, + 19, 17, 26, 24, 21, 22, 21, 20, 18, 18, 24, 23, 21, 21, 20, 19, 18, 17, + 22, 22, 20, 19, 19, 18, 17, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22, + 20, 18, 18, 17, 16, 16, 21, 22, 20, 18, 17, 17, 16, 15, 21, 23, 20, 18, + 17, 16, 15, 15, 20, 22, 20, 17, 16, 16, 14, 14, 20, 22, 20, 17, 16, 15, + 14, 14, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 15, 14, 13, 14, + 19, 21, 19, 17, 15, 14, 13, 13, 18, 20, 19, 16, 15, 13, 12, 13, 18, 20, + 19, 16, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 17, 19, 18, 16, + 14, 13, 12, 12, 16, 18, 17, 15, 14, 12, 11, 12, 16, 18, 17, 15, 14, 12, + 11, 11, 16, 18, 17, 15, 14, 12, 11, 11, 16, 17, 17, 15, 13, 12, 11, 11, + 15, 17, 17, 15, 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 17, + 17, 15, 13, 12, 11, 10, 15, 16, 17, 15, 14, 12, 11, 10, 15, 16, 17, 15, + 14, 12, 11, 10, + /* Size 32x8 */ + 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 31, 30, 30, 29, + 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, + 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 23, 23, 23, 23, 22, 22, 22, 21, + 21, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18, + 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 20, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16, + 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 14, 14, 18, 19, + 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, + 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, + 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, + 11, 10, 10, 10 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 30, 19, 14, 30, 21, 16, 13, 19, 16, 11, 9, 14, 13, 9, 7, + /* Size 8x8 */ + 32, 32, 30, 26, 20, 17, 13, 12, 32, 31, 29, 26, 21, 17, 14, 13, 30, 29, + 26, 22, 19, 16, 14, 13, 26, 26, 22, 18, 16, 14, 12, 11, 20, 21, 19, 16, + 13, 11, 10, 10, 17, 17, 16, 14, 11, 10, 9, 8, 13, 14, 14, 12, 10, 9, 8, + 7, 12, 13, 13, 11, 10, 8, 7, 7, + /* Size 16x16 */ + 32, 33, 33, 32, 31, 28, 26, 23, 21, 19, 
17, 16, 14, 13, 12, 11, 33, 32, + 32, 32, 31, 29, 27, 24, 22, 20, 18, 16, 15, 13, 13, 12, 33, 32, 32, 31, + 30, 29, 27, 25, 23, 21, 19, 17, 15, 14, 13, 12, 32, 32, 31, 30, 28, 28, + 26, 24, 23, 21, 19, 17, 16, 14, 14, 13, 31, 31, 30, 28, 27, 24, 23, 22, + 20, 19, 18, 16, 15, 14, 13, 13, 28, 29, 29, 28, 24, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 12, 26, 27, 27, 26, 23, 20, 19, 18, 17, 16, 15, 14, + 13, 12, 12, 11, 23, 24, 25, 24, 22, 19, 18, 16, 15, 14, 14, 13, 12, 11, + 11, 11, 21, 22, 23, 23, 20, 18, 17, 15, 14, 13, 13, 12, 11, 10, 10, 10, + 19, 20, 21, 21, 19, 17, 16, 14, 13, 12, 12, 11, 10, 10, 9, 9, 17, 18, + 19, 19, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 16, 16, 17, 17, 16, + 15, 14, 13, 12, 11, 10, 10, 9, 8, 8, 8, 14, 15, 15, 16, 15, 14, 13, 12, + 11, 10, 10, 9, 8, 8, 8, 7, 13, 13, 14, 14, 14, 13, 12, 11, 10, 10, 9, 8, + 8, 7, 7, 7, 12, 13, 13, 14, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 11, + 12, 12, 13, 13, 12, 11, 11, 10, 9, 9, 8, 7, 7, 7, 6, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 32, 32, 31, 30, 28, 28, 26, 25, 23, 22, 21, 20, + 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 20, 20, 18, 18, 17, + 16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 29, 29, 27, 26, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 15, 14, + 13, 13, 13, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, + 28, 27, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, + 12, 12, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24, + 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32, + 32, 32, 31, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 23, 23, 21, 20, 19, + 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 13, 12, 32, 32, 32, 32, 31, 31, + 30, 29, 28, 28, 28, 27, 26, 26, 24, 23, 23, 21, 21, 19, 19, 18, 17, 16, + 16, 15, 14, 14, 14, 13, 13, 12, 32, 32, 32, 32, 31, 30, 29, 29, 28, 28, + 27, 27, 26, 25, 24, 23, 22, 21, 21, 19, 19, 18, 17, 16, 16, 15, 14, 14, + 14, 13, 13, 13, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23, + 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, + 30, 30, 30, 31, 30, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 21, 20, 19, + 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 28, 29, 29, 30, + 29, 28, 28, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 28, 29, 29, 30, 29, 28, 27, 27, + 24, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, + 13, 13, 12, 12, 12, 11, 26, 27, 27, 28, 27, 26, 26, 26, 23, 23, 20, 20, + 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 25, 26, 26, 27, 26, 26, 26, 25, 23, 22, 20, 20, 19, 18, 17, 17, + 16, 16, 15, 15, 15, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 23, 24, + 24, 25, 25, 24, 24, 24, 22, 22, 19, 19, 18, 17, 16, 16, 15, 15, 14, 14, + 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 22, 23, 23, 24, 24, 23, + 23, 23, 21, 21, 19, 19, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 10, 10, 10, 10, 21, 22, 22, 23, 23, 23, 23, 22, 20, 20, + 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, + 10, 10, 10, 10, 20, 20, 21, 21, 21, 21, 21, 21, 20, 19, 17, 17, 16, 16, + 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, + 19, 20, 20, 21, 21, 20, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 13, 13, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 18, 18, 19, 19, 19, + 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 
13, 13, 12, 12, 11, 11, 11, 10, + 10, 10, 9, 9, 9, 9, 9, 9, 9, 17, 18, 18, 19, 19, 19, 19, 19, 18, 18, 16, + 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + 9, 9, 16, 17, 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, + 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 16, 16, 16, 17, + 17, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, + 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 15, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, + 8, 8, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 12, 12, 12, + 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 8, 13, 13, 14, 14, 14, + 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 7, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 13, 13, + 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 12, + 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, + 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 14, 14, + 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, + 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11, 11, 10, + 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 11, 12, 12, 12, 12, + 13, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, + 7, 7, 7, 7, 7, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11, + 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, + /* Size 4x8 */ + 32, 29, 20, 13, 32, 28, 20, 14, 30, 24, 19, 14, 27, 20, 15, 12, 21, 17, + 13, 10, 17, 15, 11, 9, 14, 13, 10, 8, 13, 12, 9, 7, + /* Size 8x4 */ + 32, 32, 30, 27, 21, 17, 14, 13, 29, 28, 24, 20, 17, 15, 13, 12, 20, 20, + 19, 15, 13, 11, 10, 9, 13, 14, 14, 12, 10, 9, 8, 7, + /* Size 8x16 */ + 32, 33, 31, 26, 20, 16, 13, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32, + 30, 27, 22, 17, 14, 13, 32, 31, 28, 26, 21, 18, 15, 13, 31, 30, 27, 23, + 20, 17, 14, 13, 28, 29, 24, 20, 18, 15, 13, 12, 26, 27, 23, 19, 16, 14, + 12, 12, 23, 25, 22, 17, 15, 13, 11, 11, 21, 23, 20, 17, 14, 12, 11, 10, + 19, 21, 19, 16, 13, 11, 10, 9, 18, 19, 18, 15, 12, 10, 9, 9, 16, 17, 16, + 14, 11, 10, 9, 8, 14, 15, 15, 13, 11, 9, 8, 8, 13, 14, 14, 12, 10, 9, 8, + 7, 12, 13, 13, 11, 10, 8, 7, 7, 11, 12, 13, 11, 10, 9, 7, 7, + /* Size 16x8 */ + 32, 33, 33, 32, 31, 28, 26, 23, 21, 19, 18, 16, 14, 13, 12, 11, 33, 32, + 32, 31, 30, 29, 27, 25, 23, 21, 19, 17, 15, 14, 13, 12, 31, 31, 30, 28, + 27, 24, 23, 22, 20, 19, 18, 16, 15, 14, 13, 13, 26, 26, 27, 26, 23, 20, + 19, 17, 17, 16, 15, 14, 13, 12, 11, 11, 20, 21, 22, 21, 20, 18, 16, 15, + 14, 13, 12, 11, 11, 10, 10, 10, 16, 17, 17, 18, 17, 15, 14, 13, 12, 11, + 10, 10, 9, 9, 8, 9, 13, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8, + 7, 7, 12, 12, 13, 13, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, + /* Size 16x32 */ + 32, 33, 33, 32, 31, 28, 26, 23, 20, 19, 16, 16, 13, 13, 12, 11, 33, 32, + 32, 32, 31, 29, 26, 24, 21, 20, 17, 16, 14, 13, 12, 12, 33, 32, 32, 32, + 31, 29, 26, 24, 21, 20, 17, 17, 14, 13, 12, 12, 33, 32, 32, 31, 31, 30, + 27, 25, 22, 21, 17, 17, 14, 14, 13, 13, 33, 32, 32, 31, 30, 29, 27, 25, + 22, 21, 17, 17, 14, 14, 13, 13, 32, 32, 31, 30, 29, 28, 26, 24, 21, 20, + 17, 17, 14, 14, 13, 13, 32, 32, 31, 29, 28, 28, 26, 24, 21, 21, 18, 17, + 15, 14, 13, 13, 32, 31, 31, 29, 28, 27, 25, 24, 21, 21, 18, 17, 15, 15, + 14, 13, 31, 31, 30, 28, 27, 25, 23, 22, 20, 19, 17, 16, 14, 14, 13, 13, + 30, 30, 30, 28, 26, 24, 23, 21, 19, 19, 16, 16, 14, 14, 13, 12, 28, 30, + 29, 27, 24, 21, 20, 19, 18, 
17, 15, 15, 13, 13, 12, 12, 28, 29, 29, 27, + 24, 21, 20, 19, 17, 17, 15, 15, 13, 13, 12, 12, 26, 28, 27, 26, 23, 20, + 19, 18, 16, 16, 14, 14, 12, 12, 12, 12, 26, 27, 26, 25, 23, 20, 18, 17, + 16, 15, 14, 13, 12, 12, 11, 11, 23, 25, 25, 24, 22, 19, 17, 16, 15, 14, + 13, 13, 11, 11, 11, 11, 22, 24, 24, 23, 21, 19, 17, 16, 14, 14, 12, 12, + 11, 11, 11, 10, 21, 23, 23, 22, 20, 18, 17, 15, 14, 13, 12, 12, 11, 10, + 10, 10, 20, 21, 21, 21, 20, 17, 16, 15, 13, 13, 11, 11, 10, 10, 10, 10, + 19, 21, 21, 20, 19, 17, 16, 14, 13, 12, 11, 11, 10, 10, 9, 10, 18, 19, + 19, 19, 18, 16, 15, 14, 12, 12, 11, 10, 9, 9, 9, 9, 18, 19, 19, 19, 18, + 16, 15, 14, 12, 12, 10, 10, 9, 9, 9, 9, 16, 17, 17, 18, 17, 15, 14, 13, + 12, 11, 10, 10, 9, 9, 8, 8, 16, 17, 17, 17, 16, 15, 14, 13, 11, 11, 10, + 10, 9, 8, 8, 8, 14, 16, 16, 16, 15, 14, 13, 12, 11, 11, 9, 9, 8, 8, 8, + 8, 14, 15, 15, 16, 15, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 13, 14, 14, + 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 8, 7, 7, 13, 14, 14, 14, 14, 13, + 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 12, 14, 14, 14, 14, 13, 12, 11, 10, + 10, 8, 8, 8, 7, 7, 7, 12, 13, 13, 14, 13, 12, 11, 11, 10, 9, 8, 8, 7, 7, + 7, 7, 12, 13, 13, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 7, 11, 12, + 12, 13, 13, 12, 11, 10, 10, 9, 9, 8, 7, 7, 7, 7, 11, 12, 12, 13, 13, 11, + 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 32, 32, 32, 31, 30, 28, 28, 26, 26, 23, 22, 21, 20, + 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 29, 28, 27, 25, 24, 23, 21, 21, 19, 19, 17, + 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32, 32, 32, 32, 31, 31, 31, + 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, + 14, 14, 13, 13, 12, 12, 32, 32, 32, 31, 31, 30, 29, 29, 28, 28, 27, 27, + 26, 25, 24, 23, 22, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, + 13, 13, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23, 22, 21, + 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 28, 29, + 29, 30, 29, 28, 28, 27, 25, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 26, 26, 26, 27, 27, 26, + 26, 25, 23, 23, 20, 20, 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, + 13, 12, 12, 12, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 22, 21, + 19, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, + 11, 10, 10, 10, 20, 21, 21, 22, 22, 21, 21, 21, 20, 19, 18, 17, 16, 16, + 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, + 19, 20, 20, 21, 21, 20, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 13, 13, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 16, 17, 17, 17, 17, + 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 10, + 9, 9, 9, 9, 8, 8, 8, 9, 9, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, + 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, + 8, 13, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, + 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 8, 13, 13, 13, 14, 14, 14, + 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, + 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12, + 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 11, 12, + 12, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, + 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, + /* Size 4x16 */ + 33, 28, 19, 13, 32, 29, 20, 13, 32, 29, 21, 14, 32, 28, 21, 14, 31, 25, + 19, 14, 30, 21, 17, 13, 28, 20, 16, 12, 25, 19, 14, 11, 23, 18, 13, 10, + 21, 
17, 12, 10, 19, 16, 12, 9, 17, 15, 11, 8, 15, 14, 10, 8, 14, 13, 10, + 7, 13, 12, 9, 7, 12, 12, 9, 7, + /* Size 16x4 */ + 33, 32, 32, 32, 31, 30, 28, 25, 23, 21, 19, 17, 15, 14, 13, 12, 28, 29, + 29, 28, 25, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 12, 19, 20, 21, 21, + 19, 17, 16, 14, 13, 12, 12, 11, 10, 10, 9, 9, 13, 13, 14, 14, 14, 13, + 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, + /* Size 8x32 */ + 32, 33, 31, 26, 20, 16, 13, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32, + 31, 26, 21, 17, 14, 12, 33, 32, 31, 27, 22, 17, 14, 13, 33, 32, 30, 27, + 22, 17, 14, 13, 32, 31, 29, 26, 21, 17, 14, 13, 32, 31, 28, 26, 21, 18, + 15, 13, 32, 31, 28, 25, 21, 18, 15, 14, 31, 30, 27, 23, 20, 17, 14, 13, + 30, 30, 26, 23, 19, 16, 14, 13, 28, 29, 24, 20, 18, 15, 13, 12, 28, 29, + 24, 20, 17, 15, 13, 12, 26, 27, 23, 19, 16, 14, 12, 12, 26, 26, 23, 18, + 16, 14, 12, 11, 23, 25, 22, 17, 15, 13, 11, 11, 22, 24, 21, 17, 14, 12, + 11, 11, 21, 23, 20, 17, 14, 12, 11, 10, 20, 21, 20, 16, 13, 11, 10, 10, + 19, 21, 19, 16, 13, 11, 10, 9, 18, 19, 18, 15, 12, 11, 9, 9, 18, 19, 18, + 15, 12, 10, 9, 9, 16, 17, 17, 14, 12, 10, 9, 8, 16, 17, 16, 14, 11, 10, + 9, 8, 14, 16, 15, 13, 11, 9, 8, 8, 14, 15, 15, 13, 11, 9, 8, 8, 13, 14, + 14, 12, 10, 9, 8, 7, 13, 14, 14, 12, 10, 9, 8, 7, 12, 14, 14, 12, 10, 8, + 8, 7, 12, 13, 13, 11, 10, 8, 7, 7, 12, 13, 13, 11, 10, 8, 7, 7, 11, 12, + 13, 11, 10, 9, 7, 7, 11, 12, 13, 11, 10, 9, 8, 7, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 32, 32, 32, 31, 30, 28, 28, 26, 26, 23, 22, 21, 20, + 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, + 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17, + 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 31, 31, 31, 31, 30, 29, 28, 28, + 27, 26, 24, 24, 23, 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, + 14, 14, 13, 13, 13, 13, 26, 26, 26, 27, 27, 26, 26, 25, 23, 23, 20, 20, + 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 11, 20, 21, 21, 22, 22, 21, 21, 21, 20, 19, 18, 17, 16, 16, 15, 14, + 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 16, 17, + 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, + 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 9, 9, 13, 14, 14, 14, 14, 14, 15, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, + 7, 7, 7, 8, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 22, 21, 18, 22, 19, 19, 17, 21, 19, 15, 13, 18, 17, 13, 11, + /* Size 8x8 */ + 33, 30, 24, 22, 21, 19, 17, 16, 30, 26, 23, 22, 22, 20, 18, 17, 24, 23, + 21, 21, 20, 19, 18, 17, 22, 22, 21, 19, 18, 17, 16, 16, 21, 22, 20, 18, + 16, 15, 14, 14, 19, 20, 19, 17, 15, 13, 12, 12, 17, 18, 18, 16, 14, 12, + 12, 11, 16, 17, 17, 16, 14, 12, 11, 11, + /* Size 16x16 */ + 32, 33, 33, 29, 26, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 33, 33, + 32, 28, 25, 22, 22, 22, 21, 21, 20, 19, 18, 17, 17, 16, 33, 32, 30, 26, + 24, 22, 22, 23, 22, 22, 21, 20, 19, 18, 17, 17, 29, 28, 26, 23, 22, 22, + 22, 23, 22, 22, 21, 20, 19, 18, 18, 17, 26, 25, 24, 22, 21, 20, 21, 21, + 21, 21, 20, 19, 19, 18, 17, 17, 21, 22, 22, 22, 20, 19, 19, 19, 19, 19, + 19, 18, 17, 17, 17, 17, 21, 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, 17, + 17, 16, 16, 16, 21, 22, 23, 23, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15, + 15, 15, 20, 21, 22, 22, 21, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, + 20, 21, 22, 22, 21, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 19, 20, + 21, 21, 20, 19, 18, 17, 16, 15, 14, 14, 
13, 13, 13, 13, 18, 19, 20, 20, + 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 17, 18, 19, 19, 19, 17, + 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 16, 17, 18, 18, 18, 17, 16, 15, + 14, 14, 13, 12, 12, 11, 11, 11, 16, 17, 17, 18, 17, 17, 16, 15, 14, 13, + 13, 12, 12, 11, 11, 11, 15, 16, 17, 17, 17, 17, 16, 15, 14, 13, 13, 12, + 12, 11, 11, 10, + /* Size 32x32 */ + 32, 33, 33, 34, 33, 31, 29, 28, 26, 25, 21, 21, 21, 21, 21, 20, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 33, 33, 33, 33, + 32, 30, 28, 27, 25, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 33, 33, 33, 33, 32, 29, 28, 26, + 25, 24, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, + 17, 17, 17, 16, 16, 16, 34, 33, 33, 32, 31, 29, 27, 26, 24, 24, 22, 22, + 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, + 17, 17, 33, 32, 32, 31, 30, 28, 26, 25, 24, 24, 22, 22, 22, 23, 23, 22, + 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 31, 30, + 29, 29, 28, 26, 25, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, + 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 29, 28, 28, 27, 26, 25, + 23, 22, 22, 22, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 19, 19, 18, 18, 18, 18, 17, 17, 28, 27, 26, 26, 25, 24, 22, 22, 22, 22, + 21, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, + 18, 18, 18, 18, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 21, 21, 21, + 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, + 25, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20, + 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22, 22, 22, + 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22, + 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, + 17, 17, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19, + 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, + 16, 16, 21, 22, 22, 23, 23, 22, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, + 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 15, 15, 21, 22, + 22, 23, 23, 23, 23, 23, 21, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, + 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 20, 22, 22, 23, 22, 22, + 22, 22, 21, 21, 19, 19, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, + 14, 14, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 20, 20, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 16, + 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 14, 19, 20, 20, 21, + 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, + 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 19, 20, 20, 21, 21, 21, 21, 21, + 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, + 13, 13, 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 18, 18, + 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, + 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 17, 18, + 18, 19, 19, 19, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, + 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, + 19, 
19, 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18, + 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, + 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, + 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 16, 16, 17, 17, + 17, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, + 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 18, + 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, + 10, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 17, 16, 16, 15, 15, 14, + 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10, + /* Size 4x8 */ + 33, 22, 20, 17, 28, 22, 22, 18, 24, 20, 20, 18, 23, 19, 18, 16, 22, 19, + 16, 14, 20, 18, 15, 12, 18, 17, 14, 11, 17, 16, 13, 11, + /* Size 8x4 */ + 33, 28, 24, 23, 22, 20, 18, 17, 22, 22, 20, 19, 19, 18, 17, 16, 20, 22, + 20, 18, 16, 15, 14, 13, 17, 18, 18, 16, 14, 12, 11, 11, + /* Size 8x16 */ + 32, 32, 26, 21, 20, 18, 16, 15, 33, 31, 25, 22, 21, 19, 17, 16, 33, 29, + 24, 22, 22, 20, 18, 17, 29, 26, 22, 22, 22, 20, 19, 18, 25, 24, 21, 21, + 21, 20, 18, 17, 21, 22, 20, 19, 19, 18, 17, 17, 21, 22, 21, 19, 18, 17, + 16, 16, 21, 23, 21, 18, 17, 16, 15, 15, 20, 22, 21, 18, 16, 15, 14, 14, + 20, 21, 20, 18, 16, 14, 14, 13, 19, 20, 20, 17, 15, 14, 13, 13, 18, 20, + 19, 17, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 16, 18, 18, 16, + 14, 12, 12, 11, 16, 17, 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 13, + 12, 11, + /* Size 16x8 */ + 32, 33, 33, 29, 25, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 32, 31, + 29, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 17, 26, 25, 24, 22, + 21, 20, 21, 21, 21, 20, 20, 19, 18, 18, 17, 17, 21, 22, 22, 22, 21, 19, + 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 20, 21, 22, 22, 21, 19, 18, 17, + 16, 16, 15, 15, 14, 14, 14, 14, 18, 19, 20, 20, 20, 18, 17, 16, 15, 14, + 14, 13, 13, 12, 12, 13, 16, 17, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12, + 12, 12, 11, 12, 15, 16, 17, 18, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, + 11, 11, + /* Size 16x32 */ + 32, 33, 32, 28, 26, 21, 21, 21, 20, 20, 18, 18, 16, 16, 15, 15, 33, 33, + 31, 27, 25, 22, 22, 22, 21, 20, 19, 19, 17, 17, 16, 16, 33, 33, 31, 27, + 25, 22, 22, 22, 21, 21, 19, 19, 17, 17, 16, 16, 34, 32, 31, 26, 24, 22, + 23, 23, 22, 21, 20, 20, 18, 18, 17, 17, 33, 31, 29, 25, 24, 22, 22, 23, + 22, 21, 20, 20, 18, 18, 17, 17, 31, 28, 28, 24, 23, 22, 22, 22, 22, 22, + 20, 20, 18, 18, 17, 17, 29, 27, 26, 23, 22, 22, 22, 23, 22, 22, 20, 20, + 19, 18, 18, 17, 28, 26, 25, 22, 22, 22, 22, 23, 22, 22, 20, 20, 19, 19, + 18, 18, 25, 24, 24, 22, 21, 21, 21, 21, 21, 20, 20, 19, 18, 18, 17, 18, + 24, 24, 24, 22, 21, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 21, 22, + 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 22, 21, + 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 21, 22, 22, 22, 21, 19, + 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 21, 23, 22, 22, 21, 19, 19, 18, + 18, 18, 17, 17, 16, 16, 16, 15, 21, 23, 23, 22, 21, 19, 18, 18, 17, 17, + 16, 16, 15, 15, 15, 15, 21, 22, 22, 22, 21, 19, 18, 17, 17, 17, 16, 16, + 15, 15, 15, 15, 20, 22, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14, + 14, 14, 20, 
22, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, + 20, 21, 21, 22, 20, 19, 18, 17, 16, 16, 14, 14, 14, 14, 13, 14, 19, 20, + 21, 21, 20, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 19, 20, 20, 21, + 20, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 18, 20, 20, 20, 20, 18, + 17, 16, 15, 15, 13, 13, 12, 12, 12, 12, 18, 20, 20, 20, 19, 18, 17, 16, + 15, 14, 13, 13, 12, 12, 12, 12, 17, 19, 19, 20, 19, 18, 17, 16, 14, 14, + 13, 13, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 16, 14, 14, 13, 13, + 12, 12, 12, 12, 16, 18, 18, 19, 18, 17, 16, 15, 14, 14, 12, 12, 12, 11, + 11, 11, 16, 18, 18, 19, 18, 17, 16, 15, 14, 14, 12, 12, 12, 11, 11, 11, + 16, 17, 18, 18, 18, 17, 16, 15, 14, 14, 12, 12, 11, 11, 11, 11, 16, 17, + 17, 18, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, + 17, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, + 16, 14, 14, 13, 13, 12, 12, 11, 11, 11, 15, 17, 17, 17, 17, 16, 16, 14, + 14, 13, 13, 12, 12, 11, 11, 10, + /* Size 32x16 */ + 32, 33, 33, 34, 33, 31, 29, 28, 25, 24, 21, 21, 21, 21, 21, 21, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 33, 33, 33, 32, + 31, 28, 27, 26, 24, 24, 22, 22, 22, 23, 23, 22, 22, 22, 21, 20, 20, 20, + 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 32, 31, 31, 31, 29, 28, 26, 25, + 24, 24, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, + 18, 18, 17, 17, 17, 17, 28, 27, 27, 26, 25, 24, 23, 22, 22, 22, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, + 18, 17, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22, + 22, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 21, 22, 22, 23, 22, 22, + 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, + 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 22, 23, 23, 21, 21, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, + 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 20, 20, 21, 21, 21, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, + 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 18, 19, 19, 20, + 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 12, 12, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, + 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, + 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18, 17, 17, + 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, + 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, + 16, 17, 17, 17, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, + 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, + 17, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 11, 11, 10, + /* Size 4x16 */ + 33, 21, 20, 16, 33, 22, 21, 17, 31, 22, 21, 18, 27, 22, 22, 18, 24, 21, + 20, 18, 22, 19, 19, 17, 22, 19, 18, 16, 23, 19, 17, 15, 22, 19, 16, 14, + 21, 19, 16, 14, 20, 19, 15, 13, 20, 18, 14, 12, 18, 17, 14, 12, 18, 17, + 14, 11, 17, 17, 13, 11, 17, 16, 13, 11, + /* Size 16x4 */ + 33, 33, 31, 27, 24, 22, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 21, 22, + 22, 22, 21, 19, 19, 19, 19, 19, 19, 18, 17, 17, 17, 16, 20, 21, 21, 22, 
+ 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 16, 17, 18, 18, 18, 17, + 16, 15, 14, 14, 13, 12, 12, 11, 11, 11, + /* Size 8x32 */ + 32, 32, 26, 21, 20, 18, 16, 15, 33, 31, 25, 22, 21, 19, 17, 16, 33, 31, + 25, 22, 21, 19, 17, 16, 34, 31, 24, 23, 22, 20, 18, 17, 33, 29, 24, 22, + 22, 20, 18, 17, 31, 28, 23, 22, 22, 20, 18, 17, 29, 26, 22, 22, 22, 20, + 19, 18, 28, 25, 22, 22, 22, 20, 19, 18, 25, 24, 21, 21, 21, 20, 18, 17, + 24, 24, 21, 21, 20, 19, 18, 17, 21, 22, 20, 19, 19, 18, 17, 17, 21, 22, + 20, 19, 19, 18, 17, 16, 21, 22, 21, 19, 18, 17, 16, 16, 21, 22, 21, 19, + 18, 17, 16, 16, 21, 23, 21, 18, 17, 16, 15, 15, 21, 22, 21, 18, 17, 16, + 15, 15, 20, 22, 21, 18, 16, 15, 14, 14, 20, 22, 21, 18, 16, 15, 14, 14, + 20, 21, 20, 18, 16, 14, 14, 13, 19, 21, 20, 17, 15, 14, 13, 13, 19, 20, + 20, 17, 15, 14, 13, 13, 18, 20, 20, 17, 15, 13, 12, 12, 18, 20, 19, 17, + 15, 13, 12, 12, 17, 19, 19, 17, 14, 13, 12, 12, 17, 19, 18, 16, 14, 13, + 12, 12, 16, 18, 18, 16, 14, 12, 12, 11, 16, 18, 18, 16, 14, 12, 12, 11, + 16, 18, 18, 16, 14, 12, 11, 11, 16, 17, 17, 16, 14, 12, 11, 11, 15, 17, + 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 13, 12, 11, 15, 17, 17, 16, + 14, 13, 12, 11, + /* Size 32x8 */ + 32, 33, 33, 34, 33, 31, 29, 28, 25, 24, 21, 21, 21, 21, 21, 21, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 32, 31, 31, 31, + 29, 28, 26, 25, 24, 24, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, + 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 26, 25, 25, 24, 24, 23, 22, 22, + 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, + 18, 18, 17, 17, 17, 17, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19, + 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, + 16, 16, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 18, 19, + 19, 20, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, + 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 13, 13, 16, 17, 17, 18, 18, 18, + 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 12, 12, 11, 11, 11, 12, 12, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, + 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, + 11, 11, 11, 11 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 30, 21, 14, 30, 21, 17, 13, 21, 17, 12, 10, 14, 13, 10, 8, + /* Size 8x8 */ + 32, 32, 30, 27, 22, 18, 15, 13, 32, 31, 29, 26, 23, 19, 16, 14, 30, 29, + 26, 23, 20, 18, 15, 13, 27, 26, 23, 19, 17, 15, 13, 12, 22, 23, 20, 17, + 14, 13, 11, 10, 18, 19, 18, 15, 13, 11, 10, 9, 15, 16, 15, 13, 11, 10, + 9, 8, 13, 14, 13, 12, 10, 9, 8, 7, + /* Size 16x16 */ + 32, 33, 33, 33, 32, 30, 28, 26, 23, 21, 19, 17, 16, 14, 13, 12, 33, 32, + 32, 32, 32, 30, 29, 27, 24, 22, 20, 18, 17, 15, 13, 13, 33, 32, 32, 32, + 32, 31, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 33, 32, 32, 31, 30, 29, + 28, 26, 24, 23, 20, 19, 17, 16, 14, 14, 32, 32, 32, 30, 29, 28, 27, 26, + 24, 22, 21, 19, 18, 16, 15, 14, 30, 30, 31, 29, 28, 26, 24, 23, 22, 20, + 19, 18, 16, 15, 14, 13, 28, 29, 30, 28, 27, 24, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 13, 26, 27, 28, 26, 26, 23, 20, 19, 18, 17, 16, 15, 14, 13, + 12, 12, 23, 24, 25, 24, 24, 22, 19, 18, 16, 15, 14, 14, 13, 12, 11, 11, + 21, 22, 23, 23, 22, 20, 18, 17, 15, 14, 13, 13, 12, 11, 11, 10, 19, 20, + 21, 20, 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 17, 18, 19, 19, + 19, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 16, 17, 17, 17, 18, 16, + 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 14, 15, 16, 16, 16, 15, 14, 13, 12, + 11, 
11, 10, 9, 9, 8, 8, 13, 13, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, + 9, 8, 8, 7, 12, 13, 14, 14, 14, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21, + 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 24, 22, 22, 20, 20, 18, + 18, 17, 17, 15, 15, 13, 13, 13, 13, 12, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 30, 30, 29, 29, 27, 27, 24, 24, 22, 22, 20, 20, 18, 18, 17, 17, 15, + 15, 13, 13, 13, 13, 12, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14, + 14, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, + 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, + 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 20, + 20, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 31, + 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 20, 20, 19, 19, 17, + 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, + 28, 27, 27, 26, 26, 24, 24, 22, 22, 21, 21, 19, 19, 18, 18, 16, 16, 15, + 15, 14, 14, 14, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, + 26, 24, 24, 22, 22, 21, 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, + 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 20, + 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 30, 30, 30, 31, + 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 20, 20, 19, 19, 18, + 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 28, 29, 29, 30, 30, 28, 28, 27, + 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, + 14, 13, 13, 13, 13, 12, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, + 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 13, 12, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, + 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 26, 27, + 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, 18, 17, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 23, 24, 24, 25, 25, 24, + 24, 24, 24, 22, 22, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, + 13, 12, 12, 11, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 22, + 22, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, + 11, 11, 11, 11, 21, 22, 22, 23, 23, 23, 23, 22, 22, 20, 20, 18, 18, 17, + 17, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, + 21, 22, 22, 23, 23, 23, 23, 22, 22, 20, 20, 18, 18, 17, 17, 15, 15, 14, + 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 19, 20, 20, 21, + 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 20, 21, + 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, + 11, 10, 10, 10, 10, 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, + 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, + 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 16, 17, 17, 17, + 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, + 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, + 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, + 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 14, 15, 
15, 16, + 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, + 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14, + 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, + 7, 7, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, + 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 12, 13, 13, 14, 14, + 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13, + 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 12, + 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, + 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, + /* Size 4x8 */ + 32, 29, 20, 14, 32, 28, 20, 14, 30, 24, 19, 14, 28, 20, 16, 12, 23, 18, + 13, 11, 19, 16, 12, 9, 16, 14, 11, 8, 14, 13, 10, 8, + /* Size 8x4 */ + 32, 32, 30, 28, 23, 19, 16, 14, 29, 28, 24, 20, 18, 16, 14, 13, 20, 20, + 19, 16, 13, 12, 11, 10, 14, 14, 14, 12, 11, 9, 8, 8, + /* Size 8x16 */ + 32, 33, 32, 28, 23, 19, 16, 13, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32, + 31, 30, 25, 21, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 31, 29, 27, + 24, 21, 18, 15, 30, 30, 28, 24, 21, 19, 16, 14, 28, 30, 27, 21, 19, 17, + 15, 13, 26, 28, 26, 20, 18, 16, 14, 12, 23, 25, 24, 19, 16, 14, 13, 11, + 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 18, 19, + 19, 16, 14, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 14, 16, 16, 14, + 12, 11, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 12, 14, 14, 13, 11, 10, 8, + 8, + /* Size 16x8 */ + 32, 33, 33, 32, 32, 30, 28, 26, 23, 21, 19, 18, 16, 14, 13, 12, 33, 32, + 32, 32, 31, 30, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 32, 32, 31, 30, + 29, 28, 27, 26, 24, 22, 20, 19, 18, 16, 15, 14, 28, 29, 30, 28, 27, 24, + 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 23, 24, 25, 24, 24, 21, 19, 18, + 16, 15, 14, 14, 13, 12, 11, 11, 19, 20, 21, 20, 21, 19, 17, 16, 14, 13, + 12, 12, 11, 11, 10, 10, 16, 17, 17, 17, 18, 16, 15, 14, 13, 12, 11, 10, + 10, 9, 9, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8, 8, + /* Size 16x32 */ + 32, 33, 33, 32, 32, 28, 28, 23, 23, 19, 19, 16, 16, 13, 13, 12, 33, 32, + 32, 32, 32, 29, 29, 24, 24, 20, 20, 17, 17, 14, 14, 12, 33, 32, 32, 32, + 32, 29, 29, 24, 24, 20, 20, 17, 17, 14, 14, 12, 33, 32, 32, 31, 31, 30, + 30, 25, 25, 21, 21, 17, 17, 14, 14, 13, 33, 32, 32, 31, 31, 30, 30, 25, + 25, 21, 21, 17, 17, 14, 14, 13, 32, 32, 32, 30, 30, 28, 28, 24, 24, 20, + 20, 17, 17, 14, 14, 13, 32, 32, 32, 30, 30, 28, 28, 24, 24, 20, 20, 17, + 17, 14, 14, 13, 32, 31, 31, 29, 29, 27, 27, 24, 24, 21, 21, 18, 18, 15, + 15, 14, 32, 31, 31, 29, 29, 27, 27, 24, 24, 21, 21, 18, 18, 15, 15, 14, + 30, 30, 30, 28, 28, 24, 24, 21, 21, 19, 19, 16, 16, 14, 14, 13, 30, 30, + 30, 28, 28, 24, 24, 21, 21, 19, 19, 16, 16, 14, 14, 13, 28, 30, 30, 27, + 27, 21, 21, 19, 19, 17, 17, 15, 15, 13, 13, 12, 28, 30, 30, 27, 27, 21, + 21, 19, 19, 17, 17, 15, 15, 13, 13, 12, 26, 28, 28, 26, 26, 20, 20, 18, + 18, 16, 16, 14, 14, 12, 12, 12, 26, 28, 28, 26, 26, 20, 20, 18, 18, 16, + 16, 14, 14, 12, 12, 12, 23, 25, 25, 24, 24, 19, 19, 16, 16, 14, 14, 13, + 13, 11, 11, 11, 23, 25, 25, 24, 24, 19, 19, 16, 16, 14, 14, 13, 13, 11, + 11, 11, 21, 23, 23, 22, 22, 18, 18, 15, 15, 13, 13, 12, 12, 11, 11, 10, + 21, 23, 23, 22, 22, 18, 18, 15, 15, 13, 13, 12, 12, 11, 11, 10, 19, 21, + 21, 20, 20, 17, 17, 14, 14, 12, 12, 11, 11, 10, 10, 9, 19, 21, 21, 20, + 20, 17, 17, 14, 14, 12, 12, 11, 11, 10, 10, 9, 18, 19, 19, 19, 19, 16, + 16, 14, 14, 12, 12, 10, 10, 9, 9, 9, 18, 19, 19, 
19, 19, 16, 16, 14, 14, + 12, 12, 10, 10, 9, 9, 9, 16, 17, 17, 18, 18, 15, 15, 13, 13, 11, 11, 10, + 10, 9, 9, 8, 16, 17, 17, 18, 18, 15, 15, 13, 13, 11, 11, 10, 10, 9, 9, + 8, 14, 16, 16, 16, 16, 14, 14, 12, 12, 11, 11, 9, 9, 8, 8, 8, 14, 16, + 16, 16, 16, 14, 14, 12, 12, 11, 11, 9, 9, 8, 8, 8, 13, 14, 14, 15, 15, + 13, 13, 11, 11, 10, 10, 9, 9, 8, 8, 7, 13, 14, 14, 15, 15, 13, 13, 11, + 11, 10, 10, 9, 9, 8, 8, 7, 12, 14, 14, 14, 14, 13, 13, 11, 11, 10, 10, + 8, 8, 8, 8, 7, 12, 14, 14, 14, 14, 13, 13, 11, 11, 10, 10, 8, 8, 8, 8, + 7, 12, 13, 13, 13, 13, 12, 12, 11, 11, 9, 9, 8, 8, 7, 7, 7, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21, + 21, 19, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, + 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, + 16, 14, 14, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, + 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, + 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, + 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 28, 29, + 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 28, 29, 29, 30, 30, 28, + 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 13, 12, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, + 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, + 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, + 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, + 19, 20, 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, + 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 19, 20, 20, 21, + 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 16, 17, 17, 17, 17, 17, 17, 18, + 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, + 9, 9, 9, 8, 8, 8, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 13, + 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, + 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 14, 14, 14, 15, + 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, + 8, 8, 8, 7, 12, 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 12, 12, + 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, + /* Size 4x16 */ + 33, 28, 19, 13, 32, 29, 20, 14, 32, 30, 21, 14, 32, 28, 20, 14, 31, 27, + 21, 15, 30, 24, 19, 14, 30, 21, 17, 13, 28, 20, 16, 12, 25, 19, 14, 11, + 23, 18, 13, 11, 21, 17, 12, 10, 19, 16, 12, 9, 17, 15, 11, 9, 16, 14, + 11, 8, 14, 13, 10, 8, 14, 13, 10, 8, + /* Size 16x4 */ + 33, 32, 32, 32, 31, 30, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 28, 29, + 30, 28, 27, 24, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 19, 20, 21, 20, + 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 13, 14, 14, 14, 15, 14, + 13, 12, 11, 11, 10, 9, 9, 8, 8, 8, + /* Size 8x32 */ + 32, 33, 32, 28, 23, 19, 16, 13, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32, + 32, 29, 24, 20, 17, 14, 33, 32, 31, 30, 25, 21, 17, 14, 33, 32, 31, 30, + 25, 21, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 32, 30, 28, 24, 20, + 17, 14, 32, 31, 29, 27, 24, 21, 18, 15, 32, 31, 29, 27, 24, 21, 18, 15, + 30, 30, 28, 
24, 21, 19, 16, 14, 30, 30, 28, 24, 21, 19, 16, 14, 28, 30, + 27, 21, 19, 17, 15, 13, 28, 30, 27, 21, 19, 17, 15, 13, 26, 28, 26, 20, + 18, 16, 14, 12, 26, 28, 26, 20, 18, 16, 14, 12, 23, 25, 24, 19, 16, 14, + 13, 11, 23, 25, 24, 19, 16, 14, 13, 11, 21, 23, 22, 18, 15, 13, 12, 11, + 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 19, 21, + 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 10, 9, 18, 19, 19, 16, + 14, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 16, 17, 18, 15, 13, 11, + 10, 9, 14, 16, 16, 14, 12, 11, 9, 8, 14, 16, 16, 14, 12, 11, 9, 8, 13, + 14, 15, 13, 11, 10, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 12, 14, 14, 13, + 11, 10, 8, 8, 12, 14, 14, 13, 11, 10, 8, 8, 12, 13, 13, 12, 11, 9, 8, 7, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21, + 21, 19, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, + 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, + 29, 28, 28, 27, 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, + 16, 15, 15, 14, 14, 13, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, + 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 13, 12, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 16, + 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 19, 20, + 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, + 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 16, 17, 17, 17, 17, 17, + 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, + 10, 9, 9, 9, 9, 8, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, + 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 22, 22, 18, 22, 19, 19, 17, 22, 19, 16, 14, 18, 17, 14, 12, + /* Size 8x8 */ + 33, 30, 24, 22, 21, 20, 18, 17, 30, 26, 23, 22, 22, 21, 19, 18, 24, 23, + 21, 21, 20, 20, 19, 18, 22, 22, 21, 19, 18, 18, 17, 16, 21, 22, 20, 18, + 17, 16, 15, 14, 20, 21, 20, 18, 16, 14, 14, 13, 18, 19, 19, 17, 15, 14, + 12, 12, 17, 18, 18, 16, 14, 13, 12, 11, + /* Size 16x16 */ + 32, 33, 34, 31, 28, 25, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 33, 33, + 33, 30, 27, 24, 22, 22, 22, 21, 20, 20, 19, 18, 17, 17, 34, 33, 32, 29, + 26, 24, 22, 23, 23, 22, 22, 21, 20, 19, 18, 18, 31, 30, 29, 26, 24, 23, + 22, 22, 23, 22, 22, 21, 20, 19, 18, 18, 28, 27, 26, 24, 22, 22, 21, 22, + 23, 22, 22, 21, 20, 20, 19, 19, 25, 24, 24, 23, 22, 21, 20, 21, 21, 20, + 20, 20, 19, 19, 18, 18, 21, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, + 18, 18, 17, 17, 21, 22, 23, 22, 22, 21, 19, 19, 19, 18, 18, 18, 17, 17, + 16, 16, 21, 22, 23, 23, 23, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15, + 20, 21, 22, 22, 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 20, 20, + 22, 22, 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 19, 20, 21, 21, + 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 18, 19, 20, 20, 20, 19, + 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 17, 18, 19, 19, 20, 19, 18, 17, + 16, 15, 14, 14, 13, 12, 12, 12, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, + 14, 13, 12, 12, 12, 11, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13, + 12, 12, 11, 11, + /* Size 32x32 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 25, 25, 21, 21, 21, 21, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 33, 33, 33, 33, + 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 17, 17, 16, 33, 33, 33, 33, 33, 30, 30, 27, + 27, 24, 
24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, + 18, 17, 17, 17, 17, 16, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22, + 22, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, + 18, 17, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22, 22, 23, 23, 23, + 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 31, 30, + 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 23, 22, 22, 22, + 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 31, 30, 30, 29, 29, 26, + 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, + 20, 19, 19, 18, 18, 18, 18, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, + 22, 21, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, + 19, 19, 19, 18, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, + 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 18, + 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, + 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 25, 24, 24, 24, + 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, + 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 22, 22, 22, 22, 22, 21, + 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, + 17, 17, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 21, 22, + 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 23, + 23, 23, 23, 21, 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21, + 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, + 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, + 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, + 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 22, + 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 20, 20, 20, 22, 22, 22, 22, 22, + 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, + 14, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, + 13, 13, 19, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 18, 19, + 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, + 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, + 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, + 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18, + 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, + 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, + 19, 18, 
18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, + 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 18, 17, 17, 17, 17, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, + /* Size 4x8 */ + 33, 22, 20, 17, 28, 22, 22, 18, 24, 20, 20, 18, 22, 19, 18, 16, 22, 19, + 16, 14, 20, 19, 15, 13, 19, 18, 14, 12, 17, 17, 14, 11, + /* Size 8x4 */ + 33, 28, 24, 22, 22, 20, 19, 17, 22, 22, 20, 19, 19, 19, 18, 17, 20, 22, + 20, 18, 16, 15, 14, 14, 17, 18, 18, 16, 14, 13, 12, 11, + /* Size 8x16 */ + 32, 33, 28, 21, 21, 20, 18, 16, 33, 33, 27, 22, 22, 20, 19, 17, 34, 32, + 26, 22, 23, 21, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 28, 26, 22, 22, + 23, 22, 20, 19, 24, 24, 22, 20, 21, 20, 19, 18, 21, 22, 21, 19, 19, 19, + 18, 17, 21, 22, 22, 19, 18, 18, 17, 16, 21, 23, 22, 19, 18, 17, 16, 15, + 20, 22, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 14, 14, 19, 20, + 21, 19, 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 13, 12, 17, 19, 20, 18, + 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 17, 18, 17, 15, 14, + 12, 11, + /* Size 16x8 */ + 32, 33, 34, 31, 28, 24, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 33, 33, + 32, 28, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 28, 27, 26, 24, + 22, 22, 21, 22, 22, 22, 22, 21, 20, 20, 19, 18, 21, 22, 22, 22, 22, 20, + 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 23, 22, 23, 21, 19, 18, + 18, 17, 17, 17, 16, 16, 15, 15, 20, 20, 21, 22, 22, 20, 19, 18, 17, 16, + 16, 15, 15, 14, 14, 14, 18, 19, 20, 20, 20, 19, 18, 17, 16, 15, 14, 14, + 13, 13, 12, 12, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12, 12, + 12, 11, + /* Size 16x32 */ + 32, 33, 33, 28, 28, 21, 21, 21, 21, 20, 20, 18, 18, 16, 16, 16, 33, 33, + 33, 27, 27, 22, 22, 22, 22, 20, 20, 19, 19, 17, 17, 16, 33, 33, 33, 27, + 27, 22, 22, 22, 22, 20, 20, 19, 19, 17, 17, 16, 34, 32, 32, 26, 26, 22, + 22, 23, 23, 21, 21, 20, 20, 18, 18, 17, 34, 32, 32, 26, 26, 22, 22, 23, + 23, 21, 21, 20, 20, 18, 18, 17, 31, 28, 28, 24, 24, 22, 22, 22, 22, 22, + 22, 20, 20, 18, 18, 17, 31, 28, 28, 24, 24, 22, 22, 22, 22, 22, 22, 20, + 20, 18, 18, 17, 28, 26, 26, 22, 22, 22, 22, 23, 23, 22, 22, 20, 20, 19, + 19, 18, 28, 26, 26, 22, 22, 22, 22, 23, 23, 22, 22, 20, 20, 19, 19, 18, + 24, 24, 24, 22, 22, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 24, 24, + 24, 22, 22, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 21, 22, 22, 21, + 21, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 21, 22, 22, 21, 21, 19, + 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 21, 22, 22, 22, 22, 19, 19, 18, + 18, 18, 18, 17, 17, 16, 16, 16, 21, 22, 22, 22, 22, 19, 19, 18, 18, 18, + 18, 17, 17, 16, 16, 16, 21, 23, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16, + 16, 15, 15, 15, 21, 23, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16, 16, 15, + 15, 15, 20, 22, 22, 22, 22, 19, 19, 17, 17, 16, 16, 15, 15, 14, 14, 14, + 20, 22, 22, 22, 22, 19, 19, 17, 17, 16, 16, 15, 15, 14, 14, 14, 20, 21, + 21, 22, 22, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 20, 21, 21, 22, + 22, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 19, + 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 19, 20, 20, 21, 21, 19, 19, 17, + 17, 15, 15, 14, 14, 13, 13, 13, 18, 20, 20, 20, 20, 18, 18, 16, 16, 15, + 15, 13, 13, 12, 12, 12, 18, 20, 20, 20, 20, 18, 18, 16, 16, 15, 15, 13, + 13, 12, 12, 12, 17, 19, 19, 20, 20, 18, 18, 16, 16, 14, 14, 13, 13, 12, + 12, 12, 17, 19, 19, 20, 20, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, + 16, 18, 18, 19, 
19, 17, 17, 15, 15, 14, 14, 12, 12, 12, 12, 11, 16, 18, + 18, 19, 19, 17, 17, 15, 15, 14, 14, 12, 12, 12, 12, 11, 16, 17, 17, 18, + 18, 17, 17, 15, 15, 14, 14, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 17, + 17, 15, 15, 14, 14, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 16, 16, 15, + 15, 13, 13, 12, 12, 11, 11, 11, + /* Size 32x16 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 33, 33, 33, 32, + 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 33, 33, 33, 32, 32, 28, 28, 26, + 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, + 18, 18, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, + 21, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, + 20, 20, 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 20, 20, 20, 21, + 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 18, 19, 19, 20, 20, 20, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, + 13, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, + 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, + 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, + 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, + 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, + 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 16, 16, 17, 17, 17, + 17, 18, 18, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, + 12, 12, 12, 11, 11, 11, 11, 11, + /* Size 4x16 */ + 33, 21, 20, 16, 33, 22, 20, 17, 32, 22, 21, 18, 28, 22, 22, 18, 26, 22, + 22, 19, 24, 20, 20, 18, 22, 19, 19, 17, 22, 19, 18, 16, 23, 19, 17, 15, + 22, 19, 16, 14, 21, 19, 16, 14, 20, 19, 15, 13, 20, 18, 15, 12, 19, 18, + 14, 12, 18, 17, 14, 12, 17, 17, 14, 11, + /* Size 16x4 */ + 33, 33, 32, 28, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 21, 22, + 22, 22, 22, 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 20, 20, 21, 22, + 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 16, 17, 18, 18, 19, 18, + 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, + /* Size 8x32 */ + 32, 33, 28, 21, 21, 20, 18, 16, 33, 33, 27, 22, 22, 20, 19, 17, 33, 33, + 27, 22, 22, 20, 19, 17, 34, 32, 26, 22, 23, 21, 20, 18, 34, 32, 26, 22, + 23, 21, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 31, 28, 24, 22, 22, 22, + 20, 18, 28, 26, 22, 22, 23, 22, 20, 19, 28, 26, 22, 22, 23, 22, 20, 19, + 24, 24, 22, 20, 21, 20, 19, 18, 24, 24, 22, 20, 21, 20, 19, 18, 21, 22, + 21, 19, 19, 19, 18, 17, 21, 22, 21, 19, 19, 19, 18, 17, 21, 22, 22, 19, + 18, 18, 17, 16, 21, 22, 22, 19, 18, 18, 17, 16, 21, 23, 22, 19, 18, 17, + 16, 15, 21, 23, 
22, 19, 18, 17, 16, 15, 20, 22, 22, 19, 17, 16, 15, 14, + 20, 22, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 14, 14, 20, 21, + 22, 19, 17, 16, 14, 14, 19, 20, 21, 19, 17, 15, 14, 13, 19, 20, 21, 19, + 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 13, 12, 18, 20, 20, 18, 16, 15, + 13, 12, 17, 19, 20, 18, 16, 14, 13, 12, 17, 19, 20, 18, 16, 14, 13, 12, + 16, 18, 19, 17, 15, 14, 12, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 17, + 18, 17, 15, 14, 12, 11, 16, 17, 18, 17, 15, 14, 12, 11, 16, 17, 18, 16, + 15, 13, 12, 11, + /* Size 32x8 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 33, 33, 33, 32, + 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 28, 27, 27, 26, 26, 24, 24, 22, + 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 20, 19, 19, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, + 17, 16, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, 18, 18, + 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 20, 20, + 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, + 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 18, 19, 19, 20, 20, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, + 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, + 12, 11, 11, 11 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 31, 23, 17, 31, 26, 20, 16, 23, 20, 14, 12, 17, 16, 12, 9, + /* Size 8x8 */ + 33, 32, 32, 29, 24, 20, 17, 15, 32, 32, 31, 29, 25, 21, 18, 16, 32, 31, + 29, 27, 24, 21, 18, 16, 29, 29, 27, 21, 19, 17, 16, 14, 24, 25, 24, 19, + 16, 14, 13, 12, 20, 21, 21, 17, 14, 13, 12, 11, 17, 18, 18, 16, 13, 12, + 10, 9, 15, 16, 16, 14, 12, 11, 9, 9, + /* Size 16x16 */ + 32, 33, 33, 33, 32, 30, 29, 27, 25, 23, 21, 19, 17, 16, 14, 13, 33, 32, + 32, 32, 32, 30, 29, 28, 26, 24, 22, 20, 18, 17, 15, 13, 33, 32, 32, 32, + 32, 31, 30, 28, 27, 25, 23, 21, 19, 17, 16, 14, 33, 32, 32, 31, 30, 29, + 28, 27, 26, 24, 23, 20, 19, 17, 16, 14, 32, 32, 32, 30, 29, 28, 27, 26, + 25, 24, 22, 21, 19, 18, 16, 15, 30, 30, 31, 29, 28, 26, 24, 23, 22, 21, + 20, 19, 18, 16, 15, 14, 29, 29, 30, 28, 27, 24, 22, 21, 20, 19, 19, 17, + 17, 15, 14, 13, 27, 28, 28, 27, 26, 23, 21, 20, 19, 18, 17, 16, 15, 14, + 13, 12, 25, 26, 27, 26, 25, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 12, + 23, 24, 25, 24, 24, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 21, 22, + 23, 23, 22, 20, 19, 17, 16, 15, 14, 13, 13, 12, 11, 11, 19, 20, 21, 20, + 21, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, 17, 18, 19, 19, 19, 18, + 17, 15, 14, 13, 13, 12, 11, 10, 10, 9, 16, 17, 17, 17, 18, 16, 15, 14, + 14, 13, 12, 11, 10, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 13, 12, + 11, 11, 10, 9, 9, 8, 13, 13, 14, 14, 15, 14, 13, 12, 12, 11, 11, 10, 9, + 9, 8, 8, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23, + 23, 21, 21, 19, 19, 18, 17, 17, 16, 15, 14, 14, 13, 13, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 27, 26, 24, 24, 22, 22, 20, + 20, 19, 18, 17, 17, 16, 15, 15, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 30, 30, 29, 29, 28, 27, 26, 24, 24, 22, 22, 20, 20, 19, 18, 17, + 17, 16, 15, 15, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 30, 30, 28, 27, 26, 25, 24, 23, 23, 21, 20, 19, 
19, 18, 17, 17, 16, 16, + 14, 14, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 28, 28, + 27, 25, 25, 23, 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 28, 27, 26, 25, 24, 23, + 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 31, + 31, 31, 30, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 21, 20, 20, + 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, + 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 21, 21, 20, 19, 18, 17, 17, + 16, 16, 15, 15, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 27, 27, + 26, 26, 25, 24, 24, 22, 22, 21, 21, 20, 19, 19, 18, 17, 16, 16, 15, 15, + 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 25, 25, 24, + 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 30, 30, 30, 31, + 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, + 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 30, 30, 30, 31, 31, 30, 29, 29, + 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, + 16, 16, 15, 15, 14, 14, 29, 29, 29, 30, 30, 29, 28, 28, 27, 27, 24, 24, + 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16, 15, 15, 14, 14, + 13, 13, 28, 29, 29, 30, 30, 29, 28, 28, 27, 27, 24, 24, 22, 21, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 27, 28, + 28, 28, 28, 28, 27, 27, 26, 26, 23, 23, 21, 20, 20, 20, 19, 18, 18, 17, + 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 26, 27, 27, 27, 28, 27, + 26, 26, 26, 25, 23, 23, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, + 15, 14, 14, 14, 13, 13, 12, 12, 25, 26, 26, 26, 27, 26, 26, 26, 25, 25, + 22, 22, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, + 13, 13, 12, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 22, 22, 20, 19, + 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, + 23, 24, 24, 24, 25, 24, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 17, 16, + 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 21, 22, 22, 23, + 23, 23, 23, 23, 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14, + 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 21, 22, 22, 23, 23, 23, 23, 23, + 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, + 12, 12, 11, 11, 11, 11, 19, 20, 20, 21, 21, 21, 21, 21, 21, 21, 19, 19, + 18, 17, 17, 16, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, + 10, 10, 19, 20, 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 18, 19, + 19, 19, 20, 20, 20, 20, 20, 20, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13, + 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 17, 18, 18, 19, 19, 19, + 19, 19, 19, 19, 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, + 11, 11, 10, 10, 10, 10, 9, 9, 17, 17, 17, 18, 18, 18, 18, 18, 19, 18, + 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10, + 10, 9, 9, 9, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 15, 16, + 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 14, 15, 15, 16, 16, 16, 16, + 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, + 10, 9, 9, 9, 9, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 16, 15, 15, + 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 8, 8, + 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, + 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 13, 13, 
13, 14, 14, 14, + 14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9, + 9, 9, 9, 9, 8, 8, 8, 8, + /* Size 4x8 */ + 32, 30, 24, 17, 32, 30, 24, 17, 31, 28, 23, 18, 29, 24, 19, 15, 25, 21, + 16, 13, 21, 19, 14, 11, 18, 17, 13, 10, 16, 15, 12, 9, + /* Size 8x4 */ + 32, 32, 31, 29, 25, 21, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 24, 24, + 23, 19, 16, 14, 13, 12, 17, 17, 18, 15, 13, 11, 10, 9, + /* Size 8x16 */ + 32, 33, 32, 28, 23, 19, 17, 14, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32, + 31, 30, 25, 21, 18, 16, 32, 32, 30, 28, 24, 20, 18, 16, 32, 31, 29, 27, + 24, 21, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 29, 30, 27, 22, 20, 17, + 16, 14, 27, 28, 26, 21, 18, 16, 15, 13, 25, 26, 25, 20, 17, 15, 14, 13, + 23, 24, 24, 19, 16, 14, 13, 12, 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, + 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 11, 10, 16, 17, 18, 15, + 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 9, 13, 14, 15, 13, 11, 10, 9, + 8, + /* Size 16x8 */ + 32, 33, 33, 32, 32, 30, 29, 27, 25, 23, 21, 19, 18, 16, 14, 13, 33, 32, + 32, 32, 31, 30, 30, 28, 26, 24, 23, 21, 19, 17, 16, 14, 32, 32, 31, 30, + 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 16, 15, 28, 29, 30, 28, 27, 24, + 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 23, 24, 25, 24, 24, 21, 20, 18, + 17, 16, 15, 14, 14, 13, 12, 11, 19, 20, 21, 20, 21, 19, 17, 16, 15, 14, + 13, 12, 12, 11, 11, 10, 17, 17, 18, 18, 18, 17, 16, 15, 14, 13, 12, 11, + 11, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 13, 12, 11, 10, 10, 9, 9, + 8, + /* Size 16x32 */ + 32, 33, 33, 32, 32, 30, 28, 27, 23, 23, 19, 19, 17, 16, 14, 13, 33, 32, + 32, 32, 32, 30, 29, 28, 24, 24, 20, 20, 17, 17, 15, 14, 33, 32, 32, 32, + 32, 30, 29, 28, 24, 24, 20, 20, 17, 17, 15, 14, 33, 32, 32, 32, 32, 31, + 29, 28, 25, 24, 20, 20, 18, 17, 15, 14, 33, 32, 32, 32, 31, 31, 30, 28, + 25, 25, 21, 21, 18, 17, 16, 14, 33, 32, 32, 31, 31, 30, 29, 28, 25, 24, + 21, 21, 18, 17, 16, 14, 32, 32, 32, 31, 30, 29, 28, 27, 24, 24, 20, 20, + 18, 17, 16, 14, 32, 32, 32, 30, 30, 29, 28, 27, 24, 24, 21, 21, 18, 17, + 16, 15, 32, 32, 31, 30, 29, 28, 27, 26, 24, 24, 21, 21, 18, 18, 16, 15, + 32, 31, 31, 30, 29, 28, 26, 26, 24, 23, 20, 20, 18, 18, 16, 15, 30, 30, + 30, 28, 28, 26, 24, 23, 21, 21, 19, 19, 17, 16, 15, 14, 30, 30, 30, 28, + 28, 26, 24, 23, 21, 21, 19, 19, 17, 16, 15, 14, 29, 30, 30, 28, 27, 24, + 22, 21, 20, 19, 17, 17, 16, 15, 14, 13, 28, 29, 30, 28, 27, 24, 21, 21, + 19, 19, 17, 17, 16, 15, 14, 13, 27, 28, 28, 27, 26, 23, 21, 20, 18, 18, + 16, 16, 15, 14, 13, 13, 26, 27, 28, 26, 26, 23, 20, 20, 18, 18, 16, 16, + 14, 14, 13, 12, 25, 26, 26, 25, 25, 22, 20, 19, 17, 17, 15, 15, 14, 13, + 13, 12, 23, 25, 25, 24, 24, 21, 19, 18, 16, 16, 14, 14, 13, 13, 12, 11, + 23, 24, 24, 24, 24, 21, 19, 18, 16, 16, 14, 14, 13, 13, 12, 11, 21, 23, + 23, 22, 22, 20, 18, 17, 15, 15, 13, 13, 12, 12, 11, 11, 21, 23, 23, 22, + 22, 20, 18, 17, 15, 15, 13, 13, 12, 12, 11, 11, 19, 21, 21, 21, 21, 19, + 17, 17, 14, 14, 13, 13, 12, 11, 10, 10, 19, 20, 21, 20, 20, 19, 17, 16, + 14, 14, 12, 12, 11, 11, 10, 10, 18, 19, 20, 20, 20, 18, 17, 16, 14, 14, + 12, 12, 11, 11, 10, 9, 18, 19, 19, 19, 19, 18, 16, 15, 14, 13, 12, 12, + 11, 10, 10, 9, 17, 18, 18, 18, 18, 17, 16, 15, 13, 13, 12, 12, 10, 10, + 9, 9, 16, 17, 17, 17, 18, 16, 15, 14, 13, 13, 11, 11, 10, 10, 9, 9, 15, + 17, 17, 17, 17, 16, 15, 14, 13, 12, 11, 11, 10, 10, 9, 9, 14, 16, 16, + 16, 16, 15, 14, 13, 12, 12, 11, 11, 9, 9, 9, 8, 14, 16, 16, 16, 16, 15, + 14, 13, 12, 12, 10, 10, 9, 9, 9, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, + 11, 10, 10, 9, 9, 8, 8, 13, 14, 14, 14, 15, 
14, 13, 12, 11, 11, 10, 10, + 9, 9, 8, 8, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23, + 23, 21, 21, 19, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 28, 27, 26, 25, 24, 23, 23, 21, + 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 24, 23, 23, 21, 21, 20, 19, 18, + 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28, + 28, 28, 27, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 17, 17, 16, 16, + 14, 14, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, + 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 30, 30, + 30, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 21, 21, 20, + 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 28, 29, 29, 29, 30, 29, + 28, 28, 27, 26, 24, 24, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, + 16, 16, 15, 15, 14, 14, 13, 13, 27, 28, 28, 28, 28, 28, 27, 27, 26, 26, + 23, 23, 21, 21, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, + 13, 13, 12, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 21, 21, 20, 19, + 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, + 23, 24, 24, 24, 25, 24, 24, 24, 24, 23, 21, 21, 19, 19, 18, 18, 17, 16, + 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 19, 20, 20, 20, + 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 19, 20, 20, 20, 21, 21, 20, 21, + 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, + 11, 11, 11, 10, 10, 10, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 17, 17, + 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, + 9, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 14, 15, 15, 15, + 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, + 10, 10, 10, 9, 9, 9, 9, 9, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, + 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, + 8, 8, + /* Size 4x16 */ + 33, 30, 23, 16, 32, 30, 24, 17, 32, 31, 25, 17, 32, 29, 24, 17, 32, 28, + 24, 18, 30, 26, 21, 16, 30, 24, 19, 15, 28, 23, 18, 14, 26, 22, 17, 13, + 24, 21, 16, 13, 23, 20, 15, 12, 20, 19, 14, 11, 19, 18, 13, 10, 17, 16, + 13, 10, 16, 15, 12, 9, 14, 14, 11, 9, + /* Size 16x4 */ + 33, 32, 32, 32, 32, 30, 30, 28, 26, 24, 23, 20, 19, 17, 16, 14, 30, 30, + 31, 29, 28, 26, 24, 23, 22, 21, 20, 19, 18, 16, 15, 14, 23, 24, 25, 24, + 24, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 16, 17, 17, 17, 18, 16, + 15, 14, 13, 13, 12, 11, 10, 10, 9, 9, + /* Size 8x32 */ + 32, 33, 32, 28, 23, 19, 17, 14, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32, + 32, 29, 24, 20, 17, 15, 33, 32, 32, 29, 25, 20, 18, 15, 33, 32, 31, 30, + 25, 21, 18, 16, 33, 32, 31, 29, 25, 21, 18, 16, 32, 32, 30, 28, 24, 20, + 18, 16, 32, 32, 30, 28, 24, 21, 18, 16, 32, 31, 29, 27, 24, 21, 18, 16, + 32, 31, 29, 26, 24, 20, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 30, 30, + 28, 24, 21, 19, 17, 15, 29, 30, 27, 22, 20, 17, 16, 14, 28, 30, 27, 21, + 19, 17, 16, 14, 27, 28, 26, 21, 18, 16, 15, 13, 26, 28, 26, 20, 18, 16, + 14, 13, 25, 26, 25, 20, 17, 15, 14, 13, 23, 25, 24, 19, 16, 14, 13, 12, + 23, 24, 24, 19, 16, 14, 13, 12, 21, 23, 22, 18, 15, 13, 12, 11, 21, 23, + 22, 18, 15, 13, 12, 11, 19, 21, 21, 17, 14, 13, 12, 10, 19, 21, 20, 17, + 14, 12, 11, 10, 18, 20, 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, + 11, 10, 17, 18, 
18, 16, 13, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, + 15, 17, 17, 15, 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 9, 14, 16, 16, + 14, 12, 10, 9, 9, 13, 14, 15, 13, 11, 10, 9, 8, 13, 14, 15, 13, 11, 10, + 9, 8, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23, + 23, 21, 21, 19, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 33, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 24, 23, 23, 21, + 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 31, 31, 30, 30, + 29, 29, 28, 28, 27, 27, 26, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, + 18, 17, 16, 16, 15, 15, 28, 29, 29, 29, 30, 29, 28, 28, 27, 26, 24, 24, + 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, + 13, 13, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 21, 21, 20, 19, 18, 18, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 19, 20, + 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, + 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 17, 18, 18, 18, + 18, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 10, 10, 10, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16, 16, 16, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + 8, 8 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 24, 22, 19, 24, 21, 20, 19, 22, 20, 17, 15, 19, 19, 15, 13, + /* Size 8x8 */ + 33, 32, 27, 21, 22, 20, 19, 18, 32, 29, 24, 22, 23, 22, 20, 19, 27, 24, + 22, 21, 23, 22, 21, 20, 21, 22, 21, 19, 19, 19, 18, 18, 22, 23, 23, 19, + 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 15, 14, 19, 20, 21, 18, 16, 15, + 14, 13, 18, 19, 20, 18, 16, 14, 13, 12, + /* Size 16x16 */ + 32, 33, 34, 31, 28, 25, 22, 21, 21, 21, 20, 20, 19, 18, 17, 16, 33, 33, + 33, 30, 27, 24, 22, 22, 22, 22, 21, 20, 20, 19, 18, 17, 34, 33, 32, 29, + 26, 24, 23, 22, 23, 23, 22, 22, 21, 20, 19, 18, 31, 30, 29, 26, 24, 23, + 22, 22, 22, 23, 22, 22, 21, 20, 19, 18, 28, 27, 26, 24, 22, 22, 22, 22, + 22, 23, 22, 22, 21, 20, 20, 19, 25, 24, 24, 23, 22, 21, 20, 20, 21, 21, + 20, 20, 20, 19, 19, 18, 22, 22, 23, 22, 22, 20, 20, 20, 20, 20, 19, 19, + 19, 18, 18, 17, 21, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 18, 18, 18, + 17, 17, 21, 22, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, + 21, 22, 23, 23, 23, 21, 20, 19, 18, 17, 17, 17, 16, 16, 16, 15, 20, 21, + 22, 22, 22, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 20, 20, 22, 22, + 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 19, 20, 21, 21, 21, 20, + 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 18, 19, 20, 20, 20, 19, 18, 18, + 17, 16, 15, 15, 14, 13, 13, 12, 17, 18, 19, 19, 20, 19, 18, 17, 16, 16, + 15, 14, 14, 13, 12, 12, 16, 17, 18, 18, 19, 18, 17, 17, 16, 15, 14, 14, + 13, 12, 12, 12, + /* Size 32x32 */ + 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 25, 25, 22, 21, 21, 21, 21, 21, + 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, 33, 33, 33, 33, + 33, 32, 30, 29, 27, 27, 24, 24, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, + 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 33, 33, 33, 33, 33, 31, 30, 29, + 27, 26, 24, 24, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, + 19, 19, 18, 18, 17, 17, 34, 33, 33, 33, 33, 31, 29, 28, 26, 26, 24, 24, + 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, + 18, 18, 34, 33, 33, 33, 32, 31, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23, + 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 32, 32, + 31, 31, 31, 29, 28, 27, 25, 24, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, + 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 31, 30, 30, 29, 
29, 28, + 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 19, 19, 18, 18, 30, 29, 29, 28, 28, 27, 26, 25, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 19, 19, 19, 19, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 21, + 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, + 28, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, + 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 25, 24, 24, 24, + 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, + 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 25, 24, 24, 24, 24, 24, 23, 23, + 22, 22, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, + 19, 19, 19, 19, 18, 18, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, + 17, 17, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 23, 22, + 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, + 18, 17, 17, 17, 17, 17, 16, 16, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, + 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, + 16, 16, 16, 16, 21, 22, 22, 23, 23, 23, 23, 23, 23, 23, 21, 21, 20, 19, + 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, + 21, 22, 22, 22, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 18, 18, 17, + 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 20, 21, 21, 22, + 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, + 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, + 22, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, + 15, 15, 15, 15, 14, 14, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, + 14, 14, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, + 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 19, 20, + 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 19, 20, 20, 20, 21, 21, + 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, + 14, 14, 14, 14, 14, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21, + 20, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, + 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, + 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, + 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 17, 18, 18, 19, + 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, + 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 19, + 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, + 13, 13, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18, + 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, + 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, + /* Size 4x8 */ + 33, 24, 22, 19, 31, 23, 23, 20, 26, 22, 22, 20, 22, 20, 19, 18, 23, 21, + 17, 16, 21, 20, 17, 15, 20, 20, 16, 14, 19, 19, 16, 13, + 
/* Size 8x4 */ + 33, 31, 26, 22, 23, 21, 20, 19, 24, 23, 22, 20, 21, 20, 20, 19, 22, 23, + 22, 19, 17, 17, 16, 16, 19, 20, 20, 18, 16, 15, 14, 13, + /* Size 8x16 */ + 32, 33, 28, 21, 21, 20, 18, 17, 33, 33, 27, 22, 22, 20, 19, 18, 34, 32, + 26, 22, 23, 21, 20, 19, 31, 28, 24, 22, 22, 22, 20, 19, 28, 26, 22, 22, + 23, 22, 21, 20, 24, 24, 22, 20, 21, 20, 19, 18, 22, 22, 21, 20, 19, 19, + 19, 18, 21, 22, 22, 19, 19, 18, 18, 17, 21, 23, 22, 19, 18, 17, 17, 16, + 21, 23, 22, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 16, 15, 20, 21, + 22, 19, 17, 16, 15, 14, 19, 20, 21, 19, 17, 15, 14, 13, 18, 20, 20, 18, + 16, 15, 14, 13, 17, 19, 20, 18, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, + 13, 12, + /* Size 16x8 */ + 32, 33, 34, 31, 28, 24, 22, 21, 21, 21, 20, 20, 19, 18, 17, 16, 33, 33, + 32, 28, 26, 24, 22, 22, 23, 23, 22, 21, 20, 20, 19, 18, 28, 27, 26, 24, + 22, 22, 21, 22, 22, 22, 22, 22, 21, 20, 20, 19, 21, 22, 22, 22, 22, 20, + 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 21, 22, 23, 22, 23, 21, 19, 19, + 18, 18, 17, 17, 17, 16, 16, 15, 20, 20, 21, 22, 22, 20, 19, 18, 17, 17, + 16, 16, 15, 15, 14, 14, 18, 19, 20, 20, 21, 19, 19, 18, 17, 16, 16, 15, + 14, 14, 13, 13, 17, 18, 19, 19, 20, 18, 18, 17, 16, 16, 15, 14, 13, 13, + 12, 12, + /* Size 16x32 */ + 32, 33, 33, 29, 28, 24, 21, 21, 21, 21, 20, 20, 18, 18, 17, 16, 33, 33, + 33, 28, 27, 24, 22, 22, 22, 22, 20, 20, 19, 19, 18, 17, 33, 33, 33, 28, + 27, 24, 22, 22, 22, 22, 20, 20, 19, 19, 18, 17, 34, 32, 32, 28, 26, 24, + 22, 22, 22, 22, 21, 21, 20, 20, 18, 18, 34, 32, 32, 28, 26, 24, 22, 22, + 23, 23, 21, 21, 20, 20, 19, 18, 32, 31, 30, 26, 25, 23, 22, 22, 23, 23, + 21, 21, 20, 20, 19, 18, 31, 29, 28, 26, 24, 23, 22, 22, 22, 22, 22, 22, + 20, 20, 19, 18, 30, 28, 28, 24, 23, 23, 22, 22, 23, 22, 22, 22, 20, 20, + 19, 19, 28, 26, 26, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 19, + 28, 26, 26, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 24, 24, + 24, 22, 22, 21, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 24, 24, 24, 22, + 22, 21, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 22, 22, 22, 22, 21, 20, + 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 21, 22, 22, 22, 21, 20, 19, 19, + 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 22, 22, 22, 20, 19, 19, 19, 19, + 18, 18, 18, 18, 17, 17, 21, 22, 22, 22, 22, 20, 19, 19, 18, 18, 18, 18, + 17, 17, 17, 16, 21, 22, 23, 22, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17, + 16, 16, 21, 23, 23, 23, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, + 21, 22, 23, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 20, 22, + 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 16, 15, 15, 14, 20, 22, 22, 22, + 22, 20, 19, 19, 17, 17, 16, 16, 16, 15, 15, 14, 20, 21, 21, 22, 22, 20, + 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 20, 21, 21, 22, 22, 20, 19, 18, + 17, 17, 16, 16, 15, 14, 14, 14, 19, 20, 21, 21, 21, 20, 19, 18, 17, 17, + 15, 15, 14, 14, 14, 13, 19, 20, 20, 21, 21, 20, 19, 18, 17, 16, 15, 15, + 14, 14, 13, 13, 19, 20, 20, 20, 21, 20, 18, 18, 16, 16, 15, 15, 14, 14, + 13, 13, 18, 20, 20, 20, 20, 19, 18, 18, 16, 16, 15, 15, 14, 13, 13, 12, + 18, 19, 19, 20, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 13, 12, 17, 19, + 19, 19, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 17, 19, 19, 19, + 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 16, 18, 18, 18, 19, 18, + 17, 17, 15, 15, 14, 14, 13, 12, 12, 12, 16, 18, 18, 18, 19, 18, 17, 17, + 15, 15, 14, 14, 13, 12, 12, 12, + /* Size 32x16 */ + 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 24, 24, 22, 21, 21, 21, 21, 21, + 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 33, 33, 33, 32, + 32, 31, 29, 28, 26, 
26, 24, 24, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, + 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 33, 33, 33, 32, 32, 30, 28, 28, + 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20, + 20, 19, 19, 19, 18, 18, 29, 28, 28, 28, 28, 26, 26, 24, 23, 23, 22, 22, + 22, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, + 18, 18, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 24, 24, + 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 21, 21, 21, 20, + 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, + 17, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 23, 23, 22, 21, 21, 19, 19, + 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, + 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 18, 18, 17, + 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 20, 20, 20, 21, + 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 21, 21, 21, 22, 22, + 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, + 15, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21, 19, 19, + 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, + 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, + 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 17, 18, + 18, 18, 19, 19, 19, 19, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, + 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, + 18, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, + 13, 13, 12, 12, 12, 12, 12, 12, + /* Size 4x16 */ + 33, 24, 21, 18, 33, 24, 22, 19, 32, 24, 23, 20, 29, 23, 22, 20, 26, 22, + 22, 20, 24, 21, 21, 19, 22, 20, 19, 18, 22, 20, 19, 18, 22, 21, 18, 17, + 22, 21, 17, 16, 22, 20, 17, 15, 21, 20, 17, 14, 20, 20, 16, 14, 20, 19, + 16, 13, 19, 19, 16, 13, 18, 18, 15, 12, + /* Size 16x4 */ + 33, 33, 32, 29, 26, 24, 22, 22, 22, 22, 22, 21, 20, 20, 19, 18, 24, 24, + 24, 23, 22, 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 18, 21, 22, 23, 22, + 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 18, 19, 20, 20, 20, 19, + 18, 18, 17, 16, 15, 14, 14, 13, 13, 12, + /* Size 8x32 */ + 32, 33, 28, 21, 21, 20, 18, 17, 33, 33, 27, 22, 22, 20, 19, 18, 33, 33, + 27, 22, 22, 20, 19, 18, 34, 32, 26, 22, 22, 21, 20, 18, 34, 32, 26, 22, + 23, 21, 20, 19, 32, 30, 25, 22, 23, 21, 20, 19, 31, 28, 24, 22, 22, 22, + 20, 19, 30, 28, 23, 22, 23, 22, 20, 19, 28, 26, 22, 22, 23, 22, 21, 20, + 28, 26, 22, 21, 22, 22, 21, 19, 24, 24, 22, 20, 21, 20, 19, 18, 24, 24, + 22, 20, 21, 20, 19, 18, 22, 22, 21, 20, 19, 19, 19, 18, 21, 22, 21, 19, + 19, 19, 18, 17, 21, 22, 22, 19, 19, 18, 18, 17, 21, 22, 22, 19, 18, 18, + 17, 17, 21, 23, 22, 19, 18, 17, 17, 16, 21, 23, 22, 19, 18, 17, 16, 16, + 21, 23, 22, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 16, 15, 20, 22, + 22, 19, 17, 16, 16, 15, 20, 21, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, + 17, 16, 15, 14, 19, 21, 21, 19, 17, 15, 14, 14, 19, 20, 21, 19, 17, 15, + 14, 13, 19, 20, 21, 18, 16, 15, 14, 13, 18, 20, 20, 18, 16, 15, 14, 13, + 18, 19, 20, 18, 16, 14, 13, 13, 17, 19, 20, 18, 16, 14, 13, 12, 17, 19, + 19, 17, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 13, 12, 16, 18, 19, 17, 
+ 15, 14, 13, 12, + /* Size 32x8 */ + 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 24, 24, 22, 21, 21, 21, 21, 21, + 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 33, 33, 33, 32, + 32, 30, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 21, + 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 28, 27, 27, 26, 26, 25, 24, 23, + 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, + 20, 20, 20, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, + 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, + 17, 17, 21, 22, 22, 22, 23, 23, 22, 23, 23, 22, 21, 21, 19, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 20, 20, + 20, 21, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20, + 20, 20, 21, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, + 14, 14, 14, 13, 13, 13, 13, 13, 17, 18, 18, 18, 19, 19, 19, 19, 20, 19, + 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, + 12, 12, 12, 12 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 31, 24, 19, 31, 27, 22, 18, 24, 22, 16, 14, 19, 18, 14, 11, + /* Size 8x8 */ + 33, 32, 32, 30, 27, 22, 20, 16, 32, 32, 32, 30, 28, 23, 21, 17, 32, 32, + 29, 28, 26, 23, 21, 18, 30, 30, 28, 24, 22, 20, 18, 16, 27, 28, 26, 22, + 19, 17, 16, 14, 22, 23, 23, 20, 17, 15, 14, 12, 20, 21, 21, 18, 16, 14, + 12, 11, 16, 17, 18, 16, 14, 12, 11, 10, + /* Size 16x16 */ + 32, 33, 33, 33, 32, 32, 30, 28, 27, 25, 23, 21, 19, 18, 17, 16, 33, 32, + 32, 32, 32, 32, 30, 29, 27, 26, 24, 22, 20, 19, 18, 17, 33, 32, 32, 32, + 32, 32, 31, 30, 28, 27, 25, 23, 21, 19, 18, 17, 33, 32, 32, 31, 31, 31, + 29, 28, 27, 26, 24, 23, 21, 19, 18, 17, 32, 32, 32, 31, 30, 30, 28, 28, + 26, 26, 24, 23, 21, 19, 19, 17, 32, 32, 32, 31, 30, 29, 28, 27, 26, 25, + 24, 22, 21, 20, 19, 18, 30, 30, 31, 29, 28, 28, 26, 24, 23, 22, 22, 20, + 19, 18, 17, 16, 28, 29, 30, 28, 28, 27, 24, 21, 20, 20, 19, 18, 17, 16, + 16, 15, 27, 27, 28, 27, 26, 26, 23, 20, 20, 19, 18, 17, 16, 15, 15, 14, + 25, 26, 27, 26, 26, 25, 22, 20, 19, 18, 17, 16, 15, 15, 14, 14, 23, 24, + 25, 24, 24, 24, 22, 19, 18, 17, 16, 15, 14, 14, 13, 13, 21, 22, 23, 23, + 23, 22, 20, 18, 17, 16, 15, 14, 13, 13, 12, 12, 19, 20, 21, 21, 21, 21, + 19, 17, 16, 15, 14, 13, 12, 12, 12, 11, 18, 19, 19, 19, 19, 20, 18, 16, + 15, 15, 14, 13, 12, 11, 11, 11, 17, 18, 18, 18, 19, 19, 17, 16, 15, 14, + 13, 12, 12, 11, 11, 10, 16, 17, 17, 17, 17, 18, 16, 15, 14, 14, 13, 12, + 11, 11, 10, 10, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26, + 25, 23, 23, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 27, 27, 26, 24, 24, 22, + 22, 21, 20, 20, 18, 18, 17, 16, 16, 15, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 30, 30, 29, 29, 27, 27, 26, 24, 24, 23, 22, 21, 20, 20, + 19, 18, 18, 17, 17, 15, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 29, 29, 28, 27, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17, + 17, 16, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, + 28, 28, 27, 25, 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28, 27, 25, + 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, + 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, + 30, 29, 29, 28, 
28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, + 18, 17, 17, 16, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 28, 28, + 28, 28, 26, 26, 26, 24, 24, 23, 23, 22, 21, 21, 19, 19, 19, 17, 17, 16, + 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, + 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 32, 32, 32, 32, + 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23, + 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 31, 31, 31, 31, 31, 31, 30, 29, + 29, 28, 28, 27, 26, 26, 24, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19, + 18, 18, 17, 17, 17, 16, 30, 30, 30, 30, 31, 31, 29, 29, 28, 28, 28, 26, + 26, 25, 24, 24, 23, 23, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 16, + 16, 15, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, 23, 23, + 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 28, 29, + 29, 29, 30, 30, 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 28, 29, 29, 29, 30, 30, + 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, + 17, 17, 16, 16, 16, 15, 15, 14, 27, 27, 27, 28, 28, 28, 27, 27, 26, 26, + 26, 24, 23, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, + 15, 14, 14, 13, 26, 27, 27, 27, 28, 28, 26, 26, 26, 26, 26, 23, 23, 22, + 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, + 25, 26, 26, 26, 27, 27, 26, 26, 26, 25, 25, 23, 22, 21, 20, 20, 19, 19, + 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 23, 24, 24, 24, + 25, 25, 24, 24, 24, 24, 24, 22, 22, 20, 19, 19, 18, 18, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 23, 24, 24, 24, 25, 25, 24, 24, + 24, 24, 24, 22, 22, 20, 19, 19, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, + 14, 14, 13, 13, 13, 12, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 21, + 20, 20, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, + 12, 12, 21, 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 20, 20, 19, 18, 18, + 17, 17, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 20, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 16, 15, + 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 19, 20, 20, 20, 21, 21, + 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, + 12, 12, 12, 12, 12, 11, 11, 11, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21, + 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, + 12, 11, 11, 11, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 18, 18, 17, + 16, 16, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, + 17, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 18, 18, + 18, 18, 18, 18, 19, 19, 19, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 16, 16, 17, 17, 17, 17, 17, 17, + 17, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 10, 10, 10, 10, 9, 16, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 17, + 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, + 10, 9, 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 16, 15, 15, 14, 14, + 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, + /* Size 4x8 */ + 32, 32, 24, 18, 32, 31, 25, 19, 32, 29, 24, 20, 30, 28, 20, 17, 27, 26, + 18, 15, 23, 23, 16, 13, 20, 20, 14, 12, 17, 18, 13, 11, + /* Size 8x4 */ + 32, 32, 32, 30, 27, 23, 20, 17, 32, 31, 29, 28, 26, 23, 20, 18, 24, 25, + 24, 20, 18, 16, 14, 13, 18, 19, 20, 17, 15, 13, 12, 11, + /* Size 8x16 */ + 32, 33, 32, 
29, 26, 23, 19, 16, 33, 32, 32, 29, 27, 24, 20, 17, 33, 32, + 31, 30, 28, 25, 21, 17, 33, 32, 30, 29, 27, 24, 21, 17, 32, 32, 30, 28, + 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 30, 30, 28, 25, 23, 21, + 19, 16, 28, 30, 27, 22, 20, 19, 17, 15, 27, 28, 26, 22, 20, 18, 16, 14, + 25, 26, 25, 21, 19, 17, 15, 13, 23, 25, 24, 20, 18, 16, 14, 13, 21, 23, + 22, 19, 17, 15, 13, 12, 19, 21, 20, 18, 16, 14, 12, 11, 18, 19, 19, 17, + 15, 14, 12, 11, 17, 18, 18, 16, 15, 13, 12, 10, 16, 17, 18, 16, 14, 13, + 11, 10, + /* Size 16x8 */ + 32, 33, 33, 33, 32, 32, 30, 28, 27, 25, 23, 21, 19, 18, 17, 16, 33, 32, + 32, 32, 32, 31, 30, 30, 28, 26, 25, 23, 21, 19, 18, 17, 32, 32, 31, 30, + 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 18, 29, 29, 30, 29, 28, 28, + 25, 22, 22, 21, 20, 19, 18, 17, 16, 16, 26, 27, 28, 27, 26, 26, 23, 20, + 20, 19, 18, 17, 16, 15, 15, 14, 23, 24, 25, 24, 24, 24, 21, 19, 18, 17, + 16, 15, 14, 14, 13, 13, 19, 20, 21, 21, 21, 21, 19, 17, 16, 15, 14, 13, + 12, 12, 12, 11, 16, 17, 17, 17, 18, 18, 16, 15, 14, 13, 13, 12, 11, 11, + 10, 10, + /* Size 16x32 */ + 32, 33, 33, 33, 32, 32, 29, 28, 26, 23, 23, 20, 19, 18, 16, 16, 33, 32, + 32, 32, 32, 32, 29, 29, 27, 24, 24, 21, 20, 18, 16, 16, 33, 32, 32, 32, + 32, 32, 29, 29, 27, 24, 24, 21, 20, 19, 17, 17, 33, 32, 32, 32, 32, 32, + 30, 29, 28, 25, 25, 21, 20, 19, 17, 17, 33, 32, 32, 32, 31, 31, 30, 30, + 28, 25, 25, 22, 21, 19, 17, 17, 33, 32, 32, 32, 31, 31, 30, 30, 28, 25, + 25, 22, 21, 19, 17, 17, 33, 32, 32, 31, 30, 30, 29, 28, 27, 24, 24, 21, + 21, 19, 17, 17, 32, 32, 32, 31, 30, 30, 28, 28, 27, 24, 24, 21, 20, 19, + 17, 17, 32, 32, 32, 31, 30, 30, 28, 28, 26, 24, 24, 21, 21, 19, 18, 18, + 32, 32, 31, 30, 29, 29, 28, 27, 26, 24, 24, 21, 21, 20, 18, 18, 32, 32, + 31, 30, 29, 29, 28, 27, 26, 24, 24, 21, 21, 20, 18, 18, 31, 31, 31, 29, + 28, 28, 26, 25, 24, 22, 22, 20, 19, 18, 17, 17, 30, 30, 30, 29, 28, 28, + 25, 24, 23, 21, 21, 19, 19, 18, 16, 16, 30, 30, 30, 29, 28, 28, 24, 23, + 22, 20, 20, 19, 18, 17, 16, 16, 28, 29, 30, 28, 27, 27, 22, 21, 20, 19, + 19, 18, 17, 16, 15, 15, 28, 29, 30, 28, 27, 27, 22, 21, 20, 19, 19, 18, + 17, 16, 15, 15, 27, 28, 28, 27, 26, 26, 22, 20, 20, 18, 18, 17, 16, 15, + 14, 14, 26, 27, 28, 26, 26, 26, 21, 20, 19, 18, 18, 16, 16, 15, 14, 14, + 25, 26, 26, 26, 25, 25, 21, 20, 19, 17, 17, 16, 15, 15, 13, 13, 23, 25, + 25, 24, 24, 24, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 23, 25, 25, 24, + 24, 24, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 22, 23, 23, 23, 23, 23, + 19, 18, 17, 16, 16, 14, 14, 13, 12, 12, 21, 23, 23, 23, 22, 22, 19, 18, + 17, 15, 15, 14, 13, 13, 12, 12, 20, 22, 22, 22, 22, 22, 19, 18, 17, 15, + 15, 13, 13, 12, 12, 12, 19, 20, 21, 20, 20, 20, 18, 17, 16, 14, 14, 13, + 12, 12, 11, 11, 19, 20, 21, 20, 20, 20, 18, 17, 16, 14, 14, 13, 12, 12, + 11, 11, 18, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 12, 12, 11, 11, 11, + 18, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 12, 12, 11, 10, 10, 17, 18, + 18, 18, 18, 18, 16, 16, 15, 13, 13, 12, 12, 11, 10, 10, 16, 17, 17, 17, + 18, 18, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 16, 17, 17, 17, 18, 18, + 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 15, 16, 16, 16, 17, 17, 15, 14, + 13, 12, 12, 11, 11, 10, 9, 9, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26, + 25, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 27, 26, 25, 25, 23, + 23, 22, 20, 20, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 25, 
23, 23, 22, 21, 21, + 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, + 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17, + 17, 16, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, + 26, 26, 25, 24, 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 32, 32, + 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, + 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 29, 29, 29, 30, 30, 30, + 29, 28, 28, 28, 28, 26, 25, 24, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, + 18, 18, 17, 17, 16, 16, 16, 15, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, + 27, 25, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, + 16, 15, 15, 14, 26, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, + 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, + 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 23, 24, 24, 25, + 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 20, 21, 21, 21, 22, 22, 21, 21, + 21, 21, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 12, 11, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21, 21, 19, + 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, + 11, 11, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 18, 18, 17, 16, 16, + 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 16, 16, + 17, 17, 17, 17, 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, + 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 16, 16, 17, 17, 17, 17, + 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, + 11, 11, 11, 10, 10, 10, 10, 9, + /* Size 4x16 */ + 33, 32, 23, 18, 32, 32, 24, 19, 32, 31, 25, 19, 32, 30, 24, 19, 32, 30, + 24, 19, 32, 29, 24, 20, 30, 28, 21, 18, 29, 27, 19, 16, 28, 26, 18, 15, + 26, 25, 17, 15, 25, 24, 16, 14, 23, 22, 15, 13, 20, 20, 14, 12, 19, 19, + 14, 11, 18, 18, 13, 11, 17, 18, 13, 11, + /* Size 16x4 */ + 33, 32, 32, 32, 32, 32, 30, 29, 28, 26, 25, 23, 20, 19, 18, 17, 32, 32, + 31, 30, 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 18, 23, 24, 25, 24, + 24, 24, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 18, 19, 19, 19, 19, 20, + 18, 16, 15, 15, 14, 13, 12, 11, 11, 11, + /* Size 8x32 */ + 32, 33, 32, 29, 26, 23, 19, 16, 33, 32, 32, 29, 27, 24, 20, 16, 33, 32, + 32, 29, 27, 24, 20, 17, 33, 32, 32, 30, 28, 25, 20, 17, 33, 32, 31, 30, + 28, 25, 21, 17, 33, 32, 31, 30, 28, 25, 21, 17, 33, 32, 30, 29, 27, 24, + 21, 17, 32, 32, 30, 28, 27, 24, 20, 17, 32, 32, 30, 28, 26, 24, 21, 18, + 32, 31, 29, 28, 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 31, 31, + 28, 26, 24, 22, 19, 17, 30, 30, 28, 25, 23, 21, 19, 16, 30, 30, 28, 24, + 22, 20, 18, 16, 28, 30, 27, 22, 20, 19, 17, 15, 28, 30, 27, 22, 20, 19, + 17, 15, 27, 28, 26, 22, 20, 18, 16, 14, 26, 28, 26, 21, 19, 18, 16, 14, + 25, 26, 25, 21, 19, 17, 15, 13, 23, 25, 24, 20, 18, 16, 14, 13, 23, 25, + 24, 20, 18, 16, 14, 13, 22, 23, 23, 19, 17, 16, 14, 12, 21, 23, 22, 19, + 17, 15, 13, 12, 20, 22, 22, 19, 17, 15, 13, 12, 19, 21, 20, 18, 16, 14, + 12, 11, 19, 21, 20, 18, 16, 14, 12, 11, 18, 19, 19, 17, 15, 14, 12, 11, + 18, 19, 19, 17, 15, 14, 12, 10, 17, 18, 18, 16, 15, 13, 12, 10, 16, 17, + 18, 16, 14, 13, 11, 10, 16, 17, 18, 16, 14, 13, 11, 10, 15, 16, 17, 15, + 13, 12, 11, 9, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26, + 25, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 33, 32, 32, 32, 
+ 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 25, 23, + 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 32, 32, 32, 32, 31, 31, 30, 30, + 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23, 22, 22, 20, 20, + 19, 19, 18, 18, 18, 17, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 26, + 25, 24, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, + 16, 15, 26, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 23, 24, + 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16, + 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 19, 20, 20, 20, 21, 21, + 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, + 12, 12, 12, 12, 12, 11, 11, 11, 16, 16, 17, 17, 17, 17, 17, 17, 18, 18, + 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, + 10, 10, 10, 9 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 25, 22, 20, 25, 21, 21, 20, 22, 21, 18, 17, 20, 20, 17, 14, + /* Size 8x8 */ + 33, 33, 27, 23, 22, 21, 20, 19, 33, 32, 26, 23, 23, 22, 22, 20, 27, 26, + 22, 22, 22, 22, 22, 20, 23, 23, 22, 20, 20, 20, 20, 19, 22, 23, 22, 20, + 19, 18, 18, 17, 21, 22, 22, 20, 18, 17, 16, 16, 20, 22, 22, 20, 18, 16, + 16, 15, 19, 20, 20, 19, 17, 16, 15, 13, + /* Size 16x16 */ + 32, 33, 34, 31, 30, 28, 25, 21, 21, 21, 21, 20, 20, 19, 19, 18, 33, 33, + 33, 30, 28, 27, 24, 22, 22, 22, 22, 21, 20, 20, 19, 19, 34, 33, 32, 30, + 28, 26, 24, 22, 23, 23, 23, 22, 22, 21, 20, 20, 31, 30, 30, 28, 26, 24, + 23, 22, 22, 22, 23, 22, 22, 21, 20, 20, 30, 28, 28, 26, 24, 23, 22, 22, + 22, 22, 23, 22, 22, 21, 21, 20, 28, 27, 26, 24, 23, 22, 22, 21, 22, 22, + 23, 22, 22, 21, 21, 20, 25, 24, 24, 23, 22, 22, 21, 20, 20, 21, 21, 20, + 20, 20, 20, 19, 21, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, + 18, 18, 21, 22, 23, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, + 21, 22, 23, 22, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, + 23, 23, 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 21, 22, 22, + 22, 22, 20, 19, 18, 18, 17, 17, 16, 16, 16, 15, 20, 20, 22, 22, 22, 22, + 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 19, 20, 21, 21, 21, 21, 20, 19, + 18, 17, 17, 16, 15, 15, 14, 14, 19, 19, 20, 20, 21, 21, 20, 18, 18, 17, + 16, 16, 15, 14, 14, 14, 18, 19, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, + 15, 14, 14, 13, + /* Size 32x32 */ + 32, 33, 33, 33, 34, 34, 31, 31, 30, 28, 28, 26, 25, 23, 21, 21, 21, 21, + 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 33, + 33, 33, 31, 30, 28, 27, 27, 25, 24, 23, 21, 21, 22, 22, 22, 22, 22, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 19, 18, 33, 33, 33, 33, 33, 33, 30, 30, + 28, 27, 27, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, + 20, 20, 19, 19, 19, 18, 33, 33, 33, 33, 33, 33, 30, 29, 28, 26, 26, 25, + 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, + 19, 19, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, + 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 34, 33, + 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23, 23, 23, 23, + 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 31, 31, 30, 30, 30, 30, + 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, + 22, 22, 21, 21, 20, 20, 20, 19, 31, 30, 30, 29, 29, 29, 27, 26, 26, 24, + 24, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, + 20, 20, 20, 19, 30, 28, 28, 28, 28, 28, 26, 26, 24, 23, 23, 23, 22, 22, + 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 
20, 20, 20, + 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, + 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27, 27, 26, + 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, 23, 22, + 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 26, 25, 25, 25, 24, 24, 23, 23, + 23, 22, 22, 21, 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 20, 20, 20, 20, 20, 19, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 21, + 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 19, + 19, 19, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, + 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, + 18, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, + 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 21, 22, 22, 22, + 23, 23, 23, 23, 23, 23, 23, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 17, + 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, + 17, 17, 16, 16, 16, 16, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, + 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, + 16, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, + 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 21, + 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, + 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 20, 20, 20, 21, 22, 22, + 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 16, 15, 15, 15, 15, 15, 14, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, + 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, + 15, 15, 15, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, + 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, + 17, 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 19, 19, 19, 20, + 20, 20, 20, 20, 21, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, + 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 18, 19, 19, 19, 20, 20, 20, 20, + 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, + 14, 14, 14, 13, 13, 13, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, + 13, 13, 17, 18, 18, 19, 19, 19, 19, 19, 20, 20, 20, 19, 19, 18, 18, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, + /* Size 4x8 */ + 33, 27, 22, 20, 32, 26, 23, 21, 26, 22, 23, 21, 23, 22, 20, 19, 22, 22, + 18, 18, 22, 22, 17, 16, 21, 22, 17, 15, 19, 20, 16, 14, + /* Size 8x4 */ + 33, 32, 26, 23, 22, 22, 21, 19, 27, 26, 22, 22, 22, 22, 22, 20, 22, 23, + 23, 20, 18, 17, 17, 16, 20, 21, 21, 19, 18, 16, 15, 14, + /* Size 8x16 */ + 32, 33, 28, 23, 21, 21, 20, 18, 33, 33, 27, 23, 22, 22, 20, 19, 34, 32, + 26, 23, 23, 23, 21, 20, 31, 29, 24, 22, 22, 23, 22, 20, 29, 28, 23, 22, + 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 
24, 24, 22, 21, 20, 21, + 20, 19, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 22, 20, 19, 19, 18, 17, + 21, 23, 22, 20, 19, 18, 17, 17, 21, 23, 22, 20, 19, 18, 17, 16, 20, 22, + 22, 20, 18, 17, 16, 15, 20, 21, 22, 19, 18, 17, 16, 14, 19, 21, 21, 19, + 18, 17, 15, 14, 19, 20, 21, 19, 18, 16, 15, 14, 18, 20, 20, 19, 17, 16, + 15, 13, + /* Size 16x8 */ + 32, 33, 34, 31, 29, 28, 24, 21, 21, 21, 21, 20, 20, 19, 19, 18, 33, 33, + 32, 29, 28, 26, 24, 22, 22, 23, 23, 22, 21, 21, 20, 20, 28, 27, 26, 24, + 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 23, 23, 23, 22, 22, 22, + 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 22, 23, 22, 22, 22, 20, 19, + 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 23, 23, 23, 23, 21, 19, 19, 18, + 18, 17, 17, 17, 16, 16, 20, 20, 21, 22, 22, 22, 20, 19, 18, 17, 17, 16, + 16, 15, 15, 15, 18, 19, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 14, 14, + 14, 13, + /* Size 16x32 */ + 32, 33, 33, 31, 28, 28, 23, 21, 21, 21, 21, 20, 20, 19, 18, 18, 33, 33, + 33, 30, 27, 27, 23, 22, 22, 22, 22, 20, 20, 20, 19, 19, 33, 33, 33, 30, + 27, 27, 23, 22, 22, 22, 22, 21, 20, 20, 19, 19, 33, 33, 32, 30, 26, 26, + 23, 22, 22, 22, 22, 21, 21, 20, 19, 19, 34, 32, 32, 29, 26, 26, 23, 22, + 23, 23, 23, 22, 21, 21, 20, 20, 34, 32, 32, 29, 26, 26, 23, 22, 23, 23, + 23, 22, 21, 21, 20, 20, 31, 30, 29, 28, 24, 24, 22, 22, 22, 23, 23, 22, + 22, 21, 20, 20, 31, 29, 28, 27, 24, 24, 22, 22, 22, 22, 22, 22, 22, 21, + 20, 20, 29, 28, 28, 26, 23, 23, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, + 28, 26, 26, 24, 22, 22, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 28, 26, + 26, 24, 22, 22, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 25, 24, 24, 23, + 22, 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 24, 24, 24, 23, 22, 22, + 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 23, 23, 23, 23, 22, 22, 20, 20, + 20, 20, 20, 20, 20, 19, 19, 19, 21, 22, 22, 22, 21, 21, 20, 19, 19, 19, + 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, + 19, 19, 18, 18, 21, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18, + 17, 17, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 18, 18, 18, 17, 17, + 21, 22, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 21, 22, + 23, 23, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 21, 22, 23, 23, + 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 22, 22, 22, 22, 22, + 20, 19, 18, 17, 17, 17, 16, 16, 16, 16, 20, 22, 22, 22, 22, 22, 20, 19, + 18, 17, 17, 16, 16, 16, 15, 15, 20, 21, 22, 22, 22, 22, 20, 19, 18, 17, + 17, 16, 16, 16, 15, 15, 20, 21, 21, 22, 22, 22, 19, 19, 18, 17, 17, 16, + 16, 15, 14, 14, 20, 21, 21, 22, 22, 22, 19, 19, 18, 17, 17, 16, 16, 15, + 14, 14, 19, 20, 21, 21, 21, 21, 19, 19, 18, 17, 17, 15, 15, 15, 14, 14, + 19, 20, 20, 21, 21, 21, 19, 19, 18, 17, 17, 15, 15, 15, 14, 14, 19, 20, + 20, 20, 21, 21, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 18, 19, 20, 20, + 20, 20, 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 18, 19, 20, 20, 20, 20, + 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 17, 19, 19, 19, 20, 20, 18, 18, + 17, 16, 16, 15, 14, 14, 13, 13, + /* Size 32x16 */ + 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 21, 21, + 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 33, + 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 33, 33, 33, 32, 32, 32, 29, 28, + 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, + 21, 20, 20, 20, 20, 19, 31, 30, 30, 30, 29, 29, 28, 27, 26, 24, 24, 23, + 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, + 20, 19, 28, 
27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27, + 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, + 21, 22, 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, + 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, + 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 20, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, + 15, 15, 15, 15, 15, 15, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 20, + 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, + 15, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, + 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 18, 19, + 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 18, 19, 19, 19, 20, 20, + 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, + 14, 14, 14, 14, 14, 13, 13, 13, + /* Size 4x16 */ + 33, 28, 21, 19, 33, 27, 22, 20, 32, 26, 23, 21, 30, 24, 23, 21, 28, 23, + 23, 21, 26, 22, 23, 21, 24, 22, 21, 20, 22, 21, 19, 19, 22, 22, 19, 18, + 22, 22, 18, 17, 22, 22, 18, 17, 22, 22, 17, 16, 21, 22, 17, 15, 20, 21, + 17, 15, 20, 21, 16, 14, 19, 20, 16, 14, + /* Size 16x4 */ + 33, 33, 32, 30, 28, 26, 24, 22, 22, 22, 22, 22, 21, 20, 20, 19, 28, 27, + 26, 24, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 21, 22, 23, 23, + 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 19, 20, 21, 21, 21, 21, + 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, + /* Size 8x32 */ + 32, 33, 28, 23, 21, 21, 20, 18, 33, 33, 27, 23, 22, 22, 20, 19, 33, 33, + 27, 23, 22, 22, 20, 19, 33, 32, 26, 23, 22, 22, 21, 19, 34, 32, 26, 23, + 23, 23, 21, 20, 34, 32, 26, 23, 23, 23, 21, 20, 31, 29, 24, 22, 22, 23, + 22, 20, 31, 28, 24, 22, 22, 22, 22, 20, 29, 28, 23, 22, 22, 23, 22, 20, + 28, 26, 22, 22, 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 25, 24, + 22, 21, 21, 21, 20, 20, 24, 24, 22, 21, 20, 21, 20, 19, 23, 23, 22, 20, + 20, 20, 20, 19, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 21, 20, 19, 19, + 19, 18, 21, 22, 22, 20, 19, 19, 18, 17, 21, 22, 22, 20, 19, 18, 18, 17, + 21, 23, 22, 20, 19, 18, 17, 17, 21, 23, 22, 20, 19, 18, 17, 16, 21, 23, + 22, 20, 19, 18, 17, 16, 20, 22, 22, 20, 18, 17, 16, 16, 20, 22, 22, 20, + 18, 17, 16, 15, 20, 22, 22, 20, 18, 17, 16, 15, 20, 21, 22, 19, 18, 17, + 16, 14, 20, 21, 22, 19, 18, 17, 16, 14, 19, 21, 21, 19, 18, 17, 15, 14, + 19, 20, 21, 19, 18, 17, 15, 14, 19, 20, 21, 19, 18, 16, 15, 14, 18, 20, + 20, 19, 17, 16, 15, 13, 18, 20, 20, 19, 17, 16, 15, 13, 17, 19, 20, 18, + 17, 16, 14, 13, + /* Size 32x8 */ + 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 21, 21, + 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 32, + 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 22, + 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 28, 27, 27, 26, 26, 26, 24, 24, + 23, 22, 22, 22, 22, 22, 21, 
21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 21, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, + 19, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 21, 22, + 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18, + 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 20, 20, 21, 21, 21, + 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 16, 15, 15, 15, 15, 15, 14, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, + 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, + 14, 13, 13, 13 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 27, 20, 32, 29, 26, 21, 27, 26, 19, 16, 20, 21, 16, 13, + /* Size 8x8 */ + 33, 32, 32, 30, 29, 25, 22, 19, 32, 32, 32, 31, 30, 26, 23, 20, 32, 32, + 30, 29, 28, 25, 23, 20, 30, 31, 29, 26, 24, 22, 20, 19, 29, 30, 28, 24, + 21, 19, 18, 17, 25, 26, 25, 22, 19, 17, 16, 15, 22, 23, 23, 20, 18, 16, + 14, 13, 19, 20, 20, 19, 17, 15, 13, 12, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 17, 33, 32, + 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 18, 33, 32, 32, 32, + 32, 32, 31, 31, 30, 28, 28, 25, 23, 22, 20, 19, 33, 32, 32, 32, 32, 31, + 31, 30, 29, 28, 27, 25, 23, 23, 21, 19, 33, 32, 32, 32, 31, 30, 30, 29, + 28, 27, 26, 24, 23, 22, 20, 19, 32, 32, 32, 31, 30, 29, 28, 28, 27, 26, + 26, 24, 23, 22, 21, 19, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23, + 22, 21, 20, 19, 30, 30, 31, 30, 29, 28, 27, 26, 24, 23, 23, 22, 20, 20, + 19, 18, 28, 29, 30, 29, 28, 27, 26, 24, 21, 20, 20, 19, 18, 18, 17, 16, + 27, 28, 28, 28, 27, 26, 25, 23, 20, 20, 20, 18, 18, 17, 16, 15, 26, 27, + 28, 27, 26, 26, 24, 23, 20, 20, 19, 18, 17, 17, 16, 15, 23, 24, 25, 25, + 24, 24, 23, 22, 19, 18, 18, 16, 16, 15, 14, 14, 22, 23, 23, 23, 23, 23, + 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 21, 22, 22, 23, 22, 22, 21, 20, + 18, 17, 17, 15, 14, 14, 13, 13, 19, 20, 20, 21, 20, 21, 20, 19, 17, 16, + 16, 14, 14, 13, 12, 12, 17, 18, 19, 19, 19, 19, 19, 18, 16, 15, 15, 14, + 13, 13, 12, 11, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 28, + 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 17, 17, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 26, 26, 25, + 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 25, 24, 24, 23, 22, + 22, 20, 20, 19, 18, 18, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 30, 30, 29, 29, 28, 27, 27, 25, 24, 24, 23, 22, 22, 20, 20, 20, + 18, 18, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, + 30, 30, 28, 28, 28, 26, 25, 25, 23, 23, 22, 21, 20, 20, 19, 19, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 28, 28, + 28, 26, 25, 25, 23, 23, 23, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, + 23, 23, 23, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 30, 30, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, + 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, + 29, 28, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 20, 20, 19, 19, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, + 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, + 32, 32, 
31, 30, 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24, + 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 30, + 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24, 24, 24, 23, 22, + 22, 21, 21, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, + 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20, + 19, 19, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, + 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 30, 30, + 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, + 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 29, 30, 30, 30, 30, 30, + 30, 29, 28, 28, 28, 28, 26, 25, 25, 24, 23, 23, 22, 22, 22, 21, 20, 20, + 19, 19, 19, 18, 18, 18, 17, 17, 28, 29, 29, 29, 30, 30, 29, 28, 28, 28, + 27, 27, 26, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, + 17, 17, 16, 16, 28, 29, 29, 29, 30, 30, 29, 28, 28, 28, 27, 27, 26, 24, + 24, 23, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, + 27, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 25, 23, 23, 22, 20, 20, + 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 26, 26, 27, 27, + 28, 28, 27, 26, 26, 26, 26, 26, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18, + 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 26, 26, 27, 27, 28, 28, 27, 26, + 26, 26, 26, 26, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, + 17, 16, 16, 16, 15, 15, 24, 25, 25, 25, 26, 26, 26, 25, 25, 25, 24, 24, + 23, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, + 14, 14, 23, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 22, 22, 20, + 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 23, 24, + 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, + 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 13, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 14, 14, 14, + 13, 13, 13, 13, 21, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, + 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, + 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, + 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 19, 20, 20, 20, + 20, 21, 21, 20, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, + 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 19, 19, 19, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13, + 13, 12, 12, 12, 12, 12, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, + 11, 11, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, + 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, + /* Size 4x8 */ + 32, 32, 28, 20, 32, 31, 28, 21, 32, 30, 27, 21, 30, 28, 23, 19, 29, 27, + 21, 17, 26, 24, 19, 15, 22, 22, 17, 13, 20, 20, 16, 12, + /* Size 8x4 */ + 32, 32, 32, 30, 29, 26, 22, 20, 32, 31, 30, 28, 27, 24, 22, 20, 28, 28, + 27, 23, 21, 19, 17, 16, 20, 21, 21, 19, 17, 15, 13, 12, + /* Size 8x16 */ + 32, 33, 32, 32, 28, 23, 22, 19, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32, + 32, 31, 29, 25, 23, 21, 33, 32, 31, 31, 29, 25, 23, 21, 32, 32, 30, 30, + 28, 24, 23, 20, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 26, 23, + 22, 20, 30, 30, 28, 27, 24, 21, 20, 19, 28, 30, 28, 26, 21, 19, 18, 17, + 27, 28, 26, 25, 21, 18, 18, 16, 26, 28, 26, 24, 20, 18, 17, 16, 23, 25, 
+ 24, 23, 19, 16, 16, 14, 22, 23, 23, 22, 18, 16, 15, 14, 21, 22, 22, 21, + 18, 15, 14, 13, 19, 21, 20, 20, 17, 14, 14, 12, 18, 19, 19, 19, 16, 14, + 13, 12, + /* Size 16x8 */ + 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 33, 32, + 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 23, 22, 21, 19, 32, 32, 32, 31, + 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 32, 31, 31, 31, 30, 28, + 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 28, 29, 29, 29, 28, 27, 26, 24, + 21, 21, 20, 19, 18, 18, 17, 16, 23, 24, 25, 25, 24, 24, 23, 21, 19, 18, + 18, 16, 16, 15, 14, 14, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, + 15, 14, 14, 13, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16, 14, 14, 13, + 12, 12, + /* Size 16x32 */ + 32, 33, 33, 33, 32, 32, 32, 29, 28, 27, 23, 23, 22, 19, 19, 17, 33, 32, + 32, 32, 32, 32, 31, 29, 29, 28, 24, 24, 22, 20, 20, 18, 33, 32, 32, 32, + 32, 32, 31, 29, 29, 28, 24, 24, 23, 20, 20, 18, 33, 32, 32, 32, 32, 32, + 31, 29, 29, 28, 24, 24, 23, 20, 20, 18, 33, 32, 32, 32, 32, 32, 31, 30, + 29, 28, 25, 25, 23, 21, 21, 19, 33, 32, 32, 32, 32, 31, 31, 30, 30, 28, + 25, 25, 23, 21, 21, 19, 33, 32, 32, 32, 31, 31, 31, 29, 29, 28, 25, 25, + 23, 21, 21, 19, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 24, 24, 23, 21, + 21, 19, 32, 32, 32, 31, 30, 30, 30, 28, 28, 27, 24, 24, 23, 20, 20, 19, + 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 24, 24, 23, 21, 21, 19, 32, 32, + 31, 31, 29, 29, 28, 27, 27, 26, 24, 24, 23, 21, 21, 19, 32, 32, 31, 31, + 29, 29, 28, 27, 27, 26, 24, 24, 23, 21, 21, 19, 32, 31, 31, 31, 29, 28, + 28, 26, 26, 25, 23, 23, 22, 20, 20, 19, 30, 30, 30, 30, 28, 28, 27, 24, + 24, 23, 21, 21, 20, 19, 19, 18, 30, 30, 30, 30, 28, 28, 27, 24, 24, 23, + 21, 21, 20, 19, 19, 18, 29, 30, 30, 30, 28, 28, 26, 23, 23, 22, 20, 20, + 19, 18, 18, 17, 28, 29, 30, 29, 28, 27, 26, 22, 21, 21, 19, 19, 18, 17, + 17, 16, 28, 29, 30, 29, 28, 27, 26, 22, 21, 21, 19, 19, 18, 17, 17, 16, + 27, 28, 28, 28, 26, 26, 25, 21, 21, 20, 18, 18, 18, 16, 16, 15, 26, 27, + 28, 27, 26, 26, 24, 21, 20, 20, 18, 18, 17, 16, 16, 15, 26, 27, 28, 27, + 26, 26, 24, 21, 20, 20, 18, 18, 17, 16, 16, 15, 24, 26, 26, 26, 24, 24, + 23, 20, 20, 19, 17, 17, 16, 15, 15, 14, 23, 24, 25, 25, 24, 24, 23, 20, + 19, 18, 16, 16, 16, 14, 14, 14, 23, 24, 25, 25, 24, 24, 23, 20, 19, 18, + 16, 16, 16, 14, 14, 13, 22, 23, 23, 23, 23, 23, 22, 19, 18, 18, 16, 16, + 15, 14, 14, 13, 21, 22, 23, 23, 22, 22, 21, 19, 18, 17, 15, 15, 15, 13, + 13, 13, 21, 22, 22, 22, 22, 22, 21, 18, 18, 17, 15, 15, 14, 13, 13, 13, + 19, 20, 21, 21, 21, 21, 20, 18, 17, 17, 14, 14, 14, 13, 13, 12, 19, 20, + 21, 21, 20, 20, 20, 17, 17, 16, 14, 14, 14, 12, 12, 12, 19, 20, 20, 20, + 20, 20, 19, 17, 17, 16, 14, 14, 13, 12, 12, 12, 18, 19, 19, 19, 19, 19, + 19, 17, 16, 15, 14, 14, 13, 12, 12, 11, 18, 19, 19, 19, 19, 19, 19, 17, + 16, 15, 14, 14, 13, 12, 12, 11, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 29, 28, 28, + 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, + 24, 24, 23, 22, 22, 20, 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 23, 23, + 22, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 23, 23, 22, 21, 21, 20, + 19, 19, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 28, + 28, 28, 26, 26, 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 29, 
29, 28, 28, 28, 28, 27, 27, 26, 26, + 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 31, 31, 31, 31, 31, + 31, 30, 30, 29, 28, 28, 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, + 22, 21, 21, 20, 20, 19, 19, 19, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, + 27, 27, 26, 24, 24, 23, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, + 17, 17, 17, 17, 28, 29, 29, 29, 29, 30, 29, 28, 28, 28, 27, 27, 26, 24, + 24, 23, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, + 27, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 25, 23, 23, 22, 21, 21, + 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 23, 24, 24, 24, + 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17, + 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 23, 24, 24, 24, 25, 25, 25, 24, + 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, + 15, 14, 14, 14, 14, 14, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, + 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18, + 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 19, 20, + 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, + 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 17, 18, 18, 18, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 11, 11, + /* Size 4x16 */ + 33, 32, 27, 19, 32, 32, 28, 20, 32, 32, 28, 21, 32, 31, 28, 21, 32, 30, + 27, 20, 32, 29, 26, 21, 31, 28, 25, 20, 30, 28, 23, 19, 29, 27, 21, 17, + 28, 26, 20, 16, 27, 26, 20, 16, 24, 24, 18, 14, 23, 23, 18, 14, 22, 22, + 17, 13, 20, 20, 16, 12, 19, 19, 15, 12, + /* Size 16x4 */ + 33, 32, 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 19, 32, 32, + 32, 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 20, 19, 27, 28, 28, 28, + 27, 26, 25, 23, 21, 20, 20, 18, 18, 17, 16, 15, 19, 20, 21, 21, 20, 21, + 20, 19, 17, 16, 16, 14, 14, 13, 12, 12, + /* Size 8x32 */ + 32, 33, 32, 32, 28, 23, 22, 19, 33, 32, 32, 31, 29, 24, 22, 20, 33, 32, + 32, 31, 29, 24, 23, 20, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32, 32, 31, + 29, 25, 23, 21, 33, 32, 32, 31, 30, 25, 23, 21, 33, 32, 31, 31, 29, 25, + 23, 21, 32, 32, 31, 30, 28, 24, 23, 21, 32, 32, 30, 30, 28, 24, 23, 20, + 32, 32, 30, 29, 28, 24, 23, 21, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31, + 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 26, 23, 22, 20, 30, 30, 28, 27, + 24, 21, 20, 19, 30, 30, 28, 27, 24, 21, 20, 19, 29, 30, 28, 26, 23, 20, + 19, 18, 28, 30, 28, 26, 21, 19, 18, 17, 28, 30, 28, 26, 21, 19, 18, 17, + 27, 28, 26, 25, 21, 18, 18, 16, 26, 28, 26, 24, 20, 18, 17, 16, 26, 28, + 26, 24, 20, 18, 17, 16, 24, 26, 24, 23, 20, 17, 16, 15, 23, 25, 24, 23, + 19, 16, 16, 14, 23, 25, 24, 23, 19, 16, 16, 14, 22, 23, 23, 22, 18, 16, + 15, 14, 21, 23, 22, 21, 18, 15, 15, 13, 21, 22, 22, 21, 18, 15, 14, 13, + 19, 21, 21, 20, 17, 14, 14, 13, 19, 21, 20, 20, 17, 14, 14, 12, 19, 20, + 20, 19, 17, 14, 13, 12, 18, 19, 19, 19, 16, 14, 13, 12, 18, 19, 19, 19, + 16, 14, 13, 12, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 29, 28, 28, + 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 28, 28, 28, 26, + 25, 25, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 31, + 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 22, + 22, 21, 20, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, + 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 
21, 21, 20, 20, 19, + 19, 19, 28, 29, 29, 29, 29, 30, 29, 28, 28, 28, 27, 27, 26, 24, 24, 23, + 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 23, 24, + 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, + 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21, + 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, + 12, 12, 12, 12 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 27, 22, 21, 27, 22, 22, 22, 22, 22, 19, 18, 21, 22, 18, 16, + /* Size 8x8 */ + 33, 33, 29, 24, 21, 22, 21, 20, 33, 32, 28, 24, 22, 23, 22, 21, 29, 28, + 25, 23, 22, 23, 22, 21, 24, 24, 23, 21, 20, 21, 20, 20, 21, 22, 22, 20, + 19, 19, 19, 19, 22, 23, 23, 21, 19, 18, 17, 17, 21, 22, 22, 20, 19, 17, + 17, 16, 20, 21, 21, 20, 19, 17, 16, 15, + /* Size 16x16 */ + 32, 33, 34, 33, 31, 28, 27, 25, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33, + 33, 32, 30, 27, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 34, 33, 33, 32, + 29, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 33, 32, 32, 31, 28, 26, + 25, 24, 22, 22, 23, 23, 22, 22, 22, 21, 31, 30, 29, 28, 26, 24, 23, 23, + 22, 22, 22, 23, 22, 22, 22, 21, 28, 27, 26, 26, 24, 22, 22, 22, 21, 22, + 22, 23, 22, 22, 22, 21, 27, 26, 25, 25, 23, 22, 22, 21, 21, 21, 21, 22, + 22, 22, 21, 21, 25, 24, 24, 24, 23, 22, 21, 21, 20, 20, 21, 21, 20, 20, + 20, 20, 21, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 21, 22, + 22, 23, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 23, 23, + 23, 23, 22, 21, 19, 19, 19, 18, 17, 17, 17, 17, 20, 21, 22, 22, 22, 22, + 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 20, 21, 22, 22, 22, 22, 22, 20, + 19, 19, 18, 17, 17, 17, 16, 16, 20, 20, 21, 22, 22, 22, 21, 20, 19, 18, + 18, 17, 16, 16, 16, 15, 19, 20, 20, 21, 21, 21, 21, 20, 19, 18, 18, 17, + 16, 16, 15, 14, + /* Size 32x32 */ + 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 27, 25, 25, 23, 21, 21, + 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33, + 33, 33, 33, 30, 30, 29, 27, 27, 26, 24, 24, 23, 21, 21, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 33, 33, 33, 33, 33, 33, 32, 30, + 30, 29, 27, 27, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, 33, 33, 32, 30, 30, 28, 27, 27, + 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, + 20, 20, 34, 33, 33, 33, 33, 33, 32, 29, 29, 28, 26, 26, 25, 24, 24, 23, + 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 34, 33, + 33, 33, 33, 32, 32, 29, 29, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 23, + 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 33, 33, 32, 32, 32, 32, + 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 21, 21, 21, 31, 30, 30, 30, 29, 29, 29, 27, 27, 26, + 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, + 22, 21, 21, 21, 31, 30, 30, 30, 29, 29, 28, 27, 26, 26, 24, 24, 23, 23, + 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, + 30, 29, 29, 28, 28, 28, 28, 26, 26, 25, 23, 23, 23, 23, 23, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 28, 27, 27, 27, + 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 27, 26, 26, 26, 24, + 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 
22, 22, 22, 23, 23, 23, 22, 22, + 22, 22, 22, 22, 21, 21, 27, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 21, + 21, 21, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21, + 20, 20, 20, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 25, 24, + 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21, 20, 20, 20, 21, + 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 23, 23, 23, 23, 23, 23, + 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, + 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22, + 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 22, + 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, + 17, 17, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, + 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, + 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 20, 21, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, + 17, 17, 17, 16, 16, 16, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, + 16, 16, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, + 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, + 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, + 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 20, 20, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, + 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 19, 20, 20, 20, 21, 21, 21, 21, + 21, 21, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, + 16, 15, 15, 15, 15, 15, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, + 21, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, + 14, 14, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, + 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, + /* Size 4x8 */ + 33, 27, 22, 20, 33, 26, 22, 21, 28, 23, 22, 22, 24, 22, 20, 20, 22, 21, + 19, 19, 22, 22, 19, 17, 21, 22, 19, 16, 20, 21, 18, 15, + /* Size 8x4 */ + 33, 33, 28, 24, 22, 22, 21, 20, 27, 26, 23, 22, 21, 22, 22, 21, 22, 22, + 22, 20, 19, 19, 19, 18, 20, 21, 22, 20, 19, 17, 16, 15, + /* Size 8x16 */ + 32, 33, 29, 27, 21, 21, 20, 20, 33, 33, 28, 26, 22, 22, 21, 20, 34, 32, + 27, 26, 22, 23, 22, 21, 33, 31, 27, 25, 22, 23, 22, 21, 31, 28, 25, 23, + 22, 22, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 26, 25, 22, 22, 21, 22, + 22, 21, 24, 24, 22, 21, 20, 21, 20, 20, 21, 22, 21, 21, 19, 19, 19, 19, + 21, 22, 22, 21, 19, 19, 19, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 23, + 23, 22, 19, 18, 17, 17, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21, + 19, 17, 17, 16, 20, 21, 22, 21, 19, 17, 16, 16, 19, 20, 21, 20, 19, 17, + 16, 15, + /* Size 16x8 */ + 
32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33, + 32, 31, 28, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 29, 28, 27, 27, + 25, 23, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 27, 26, 26, 25, 23, 22, + 22, 21, 21, 21, 21, 22, 21, 21, 21, 20, 21, 22, 22, 22, 22, 22, 21, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 23, 23, 22, 23, 22, 21, 19, 19, + 18, 18, 17, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, + 17, 17, 16, 16, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, + 16, 15, + /* Size 16x32 */ + 32, 33, 33, 33, 29, 28, 27, 22, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33, + 33, 32, 28, 27, 26, 22, 22, 22, 21, 21, 21, 20, 20, 19, 33, 33, 33, 32, + 28, 27, 26, 22, 22, 22, 22, 22, 21, 20, 20, 20, 33, 33, 33, 32, 28, 27, + 26, 22, 22, 22, 22, 22, 21, 20, 20, 20, 34, 33, 32, 32, 27, 26, 26, 23, + 22, 22, 23, 23, 22, 21, 21, 20, 34, 33, 32, 31, 27, 26, 25, 23, 22, 22, + 23, 23, 22, 21, 21, 20, 33, 32, 31, 31, 27, 26, 25, 23, 22, 22, 23, 23, + 22, 21, 21, 20, 31, 29, 29, 28, 25, 24, 24, 22, 22, 22, 23, 23, 22, 22, + 22, 21, 31, 29, 28, 28, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, + 30, 28, 28, 28, 24, 23, 23, 22, 22, 22, 23, 23, 22, 22, 22, 21, 28, 26, + 26, 25, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 21, 28, 26, 26, 25, + 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 21, 26, 26, 25, 24, 22, 22, + 22, 21, 21, 21, 22, 22, 22, 21, 21, 20, 24, 24, 24, 24, 22, 22, 21, 20, + 20, 20, 21, 21, 20, 20, 20, 20, 24, 24, 24, 24, 22, 22, 21, 20, 20, 20, + 21, 21, 20, 20, 20, 20, 23, 23, 23, 23, 22, 22, 21, 20, 20, 20, 20, 20, + 20, 20, 20, 19, 21, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, + 19, 19, 21, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 21, 22, + 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 18, 18, 18, 17, 21, 22, 22, 22, + 22, 22, 21, 20, 19, 19, 18, 18, 18, 18, 18, 17, 21, 22, 23, 23, 22, 22, + 22, 20, 19, 19, 18, 18, 18, 17, 17, 17, 21, 22, 23, 23, 23, 22, 22, 20, + 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 23, 23, 22, 22, 22, 20, 19, 19, + 18, 18, 17, 17, 17, 16, 20, 22, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, + 17, 16, 16, 16, 20, 21, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16, + 16, 16, 20, 21, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16, 16, 16, + 20, 21, 21, 21, 22, 22, 21, 19, 19, 18, 17, 17, 16, 16, 16, 15, 20, 21, + 21, 21, 22, 22, 21, 19, 19, 18, 17, 17, 16, 16, 16, 15, 19, 20, 21, 21, + 21, 21, 21, 19, 19, 18, 17, 17, 16, 15, 15, 15, 19, 20, 20, 20, 21, 21, + 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 19, 20, 20, 20, 21, 21, 20, 19, + 19, 18, 17, 17, 16, 15, 15, 14, + /* Size 32x16 */ + 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 26, 24, 24, 23, 21, 21, + 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33, + 33, 33, 32, 29, 29, 28, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 33, 33, 33, 33, 32, 32, 31, 29, + 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, + 22, 21, 21, 21, 20, 20, 33, 32, 32, 32, 32, 31, 31, 28, 28, 28, 25, 25, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, + 20, 20, 29, 28, 28, 28, 27, 27, 27, 25, 25, 24, 23, 23, 22, 22, 22, 22, + 21, 21, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 28, 27, + 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 27, 26, 26, 26, 26, 25, + 25, 24, 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 
22, + 21, 21, 21, 21, 21, 21, 20, 20, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, + 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, + 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, + 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 21, 22, 22, + 23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18, + 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 21, 22, 22, 23, 23, 23, 23, + 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, + 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, + 16, 16, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20, + 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 20, + 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, + 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 19, 19, 20, 20, 20, 20, + 20, 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 14, 14, + /* Size 4x16 */ + 33, 28, 21, 20, 33, 27, 22, 20, 33, 26, 22, 21, 32, 26, 22, 21, 29, 24, + 22, 22, 26, 22, 22, 22, 26, 22, 21, 21, 24, 22, 20, 20, 22, 21, 19, 19, + 22, 22, 19, 18, 22, 22, 19, 18, 22, 22, 19, 17, 22, 22, 19, 16, 21, 22, + 19, 16, 21, 22, 18, 16, 20, 21, 18, 15, + /* Size 16x4 */ + 33, 33, 33, 32, 29, 26, 26, 24, 22, 22, 22, 22, 22, 21, 21, 20, 28, 27, + 26, 26, 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, + 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 20, 20, 21, 21, 22, 22, + 21, 20, 19, 18, 18, 17, 16, 16, 16, 15, + /* Size 8x32 */ + 32, 33, 29, 27, 21, 21, 20, 20, 33, 33, 28, 26, 22, 21, 21, 20, 33, 33, + 28, 26, 22, 22, 21, 20, 33, 33, 28, 26, 22, 22, 21, 20, 34, 32, 27, 26, + 22, 23, 22, 21, 34, 32, 27, 25, 22, 23, 22, 21, 33, 31, 27, 25, 22, 23, + 22, 21, 31, 29, 25, 24, 22, 23, 22, 22, 31, 28, 25, 23, 22, 22, 22, 22, + 30, 28, 24, 23, 22, 23, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 28, 26, + 23, 22, 22, 23, 22, 22, 26, 25, 22, 22, 21, 22, 22, 21, 24, 24, 22, 21, + 20, 21, 20, 20, 24, 24, 22, 21, 20, 21, 20, 20, 23, 23, 22, 21, 20, 20, + 20, 20, 21, 22, 21, 21, 19, 19, 19, 19, 21, 22, 21, 21, 19, 19, 19, 19, + 21, 22, 22, 21, 19, 19, 19, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 22, + 22, 21, 19, 18, 18, 18, 21, 23, 22, 22, 19, 18, 18, 17, 21, 23, 23, 22, + 19, 18, 17, 17, 21, 23, 22, 22, 19, 18, 17, 17, 20, 22, 22, 21, 19, 17, + 17, 16, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21, 19, 17, 17, 16, + 20, 21, 22, 21, 19, 17, 16, 16, 20, 21, 22, 21, 19, 17, 16, 16, 19, 21, + 21, 21, 19, 17, 16, 15, 19, 20, 21, 20, 19, 17, 16, 15, 19, 20, 21, 20, + 19, 17, 16, 15, + /* Size 32x8 */ + 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 26, 24, 24, 23, 21, 21, + 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33, + 32, 32, 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 29, 28, 28, 28, 27, 27, 27, 25, + 25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 22, 22, 22, + 22, 22, 22, 21, 21, 21, 27, 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 22, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21, + 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, + 22, 22, 
23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, + 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, + 17, 17, 17, 16, 16, 16, 16, 16, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22, + 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, + 16, 15, 15, 15 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 29, 24, 32, 30, 28, 24, 29, 28, 21, 19, 24, 24, 19, 16, + /* Size 8x8 */ + 33, 33, 32, 32, 30, 28, 24, 22, 33, 32, 32, 32, 30, 28, 25, 23, 32, 32, + 31, 30, 29, 27, 24, 23, 32, 32, 30, 29, 28, 26, 24, 22, 30, 30, 29, 28, + 25, 23, 21, 20, 28, 28, 27, 26, 23, 20, 18, 17, 24, 25, 24, 24, 21, 18, + 16, 15, 22, 23, 23, 22, 20, 17, 15, 14, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 33, 32, + 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 33, 32, 32, 32, + 32, 32, 32, 31, 30, 30, 29, 27, 26, 24, 23, 23, 33, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 28, 27, 25, 23, 23, 33, 32, 32, 32, 31, 31, 31, 30, + 29, 28, 28, 26, 26, 24, 23, 23, 33, 32, 32, 32, 31, 31, 30, 30, 29, 28, + 28, 26, 26, 24, 23, 23, 32, 32, 32, 32, 31, 30, 29, 28, 28, 27, 27, 26, + 25, 24, 23, 22, 32, 31, 31, 31, 30, 30, 28, 28, 27, 26, 26, 24, 24, 23, + 22, 22, 30, 30, 30, 31, 29, 29, 28, 27, 26, 24, 24, 23, 22, 22, 20, 20, + 29, 29, 30, 30, 28, 28, 27, 26, 24, 22, 22, 21, 20, 20, 19, 19, 28, 29, + 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 26, 27, 27, 28, + 26, 26, 26, 24, 23, 21, 20, 19, 19, 18, 17, 17, 25, 26, 26, 27, 26, 26, + 25, 24, 22, 20, 20, 19, 18, 17, 17, 16, 23, 24, 24, 25, 24, 24, 24, 23, + 22, 20, 19, 18, 17, 16, 16, 15, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, + 18, 17, 17, 16, 15, 15, 21, 22, 23, 23, 23, 23, 22, 22, 20, 19, 18, 17, + 16, 15, 15, 14, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 30, + 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, + 26, 26, 26, 24, 24, 23, 22, 22, 22, 20, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24, + 24, 24, 23, 22, 22, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 22, + 22, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, + 30, 30, 30, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 23, 23, 21, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, + 30, 28, 28, 28, 27, 25, 25, 25, 23, 23, 23, 22, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, + 27, 25, 25, 25, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 24, + 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, + 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, + 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 33, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27, + 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 28, 28, 26, 26, 26, 25, 24, + 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, + 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 
23, 22, + 22, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, + 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28, 28, 27, 27, 27, 26, 26, + 26, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 20, 30, 30, 30, 30, 30, 31, + 31, 30, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, + 22, 22, 22, 21, 20, 20, 20, 19, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29, + 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 22, 22, 22, 21, + 20, 20, 20, 19, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, + 27, 26, 26, 25, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, + 29, 29, 29, 29, 30, 30, 30, 30, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, + 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 18, 28, 29, 29, 29, + 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 21, 21, 20, + 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 28, 29, 29, 29, 29, 30, 30, 29, + 28, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 21, 21, 20, 20, 20, 20, 19, + 19, 19, 18, 18, 18, 18, 27, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 26, + 26, 26, 25, 23, 23, 23, 21, 20, 20, 20, 20, 20, 19, 18, 18, 18, 18, 17, + 17, 17, 26, 26, 27, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 23, + 23, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 26, 26, + 27, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 23, 23, 22, 21, 20, + 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 25, 26, 26, 26, 26, 27, + 27, 26, 26, 26, 26, 25, 25, 25, 24, 22, 22, 22, 20, 20, 20, 19, 19, 19, + 18, 17, 17, 17, 17, 16, 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 24, 24, + 24, 24, 24, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, + 16, 15, 15, 15, 23, 24, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 24, + 23, 22, 22, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15, + 23, 23, 24, 24, 24, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, + 19, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 15, 15, 22, 22, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18, + 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 21, 22, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 23, 22, 22, 22, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, + 15, 15, 15, 14, 14, 14, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 15, 14, + 14, 14, 20, 20, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, 19, + 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 13, + /* Size 4x8 */ + 33, 32, 29, 24, 32, 31, 30, 25, 32, 30, 28, 24, 32, 29, 27, 24, 30, 28, + 24, 21, 28, 26, 21, 18, 24, 24, 19, 16, 22, 22, 18, 15, + /* Size 8x4 */ + 33, 32, 32, 32, 30, 28, 24, 22, 32, 31, 30, 29, 28, 26, 24, 22, 29, 30, + 28, 27, 24, 21, 19, 18, 24, 25, 24, 24, 21, 18, 16, 15, + /* Size 8x16 */ + 32, 33, 33, 32, 29, 28, 23, 22, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32, + 32, 32, 30, 29, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 31, 30, + 29, 28, 24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 31, 30, 29, 28, 27, + 24, 23, 32, 31, 30, 28, 26, 26, 23, 22, 30, 30, 29, 28, 25, 24, 21, 20, + 29, 30, 28, 27, 23, 22, 20, 19, 28, 30, 28, 27, 22, 21, 19, 18, 26, 28, + 26, 26, 21, 20, 18, 17, 25, 26, 26, 25, 21, 20, 17, 17, 23, 25, 24, 24, + 20, 19, 16, 16, 22, 23, 23, 23, 19, 18, 16, 15, 21, 23, 23, 22, 19, 18, + 15, 15, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 33, 32, + 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 26, 25, 23, 23, 33, 32, 32, 32, + 31, 31, 30, 30, 29, 28, 
28, 26, 26, 24, 23, 23, 32, 32, 32, 31, 30, 30, + 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 29, 29, 30, 30, 29, 28, 28, 26, + 25, 23, 22, 21, 21, 20, 19, 19, 28, 29, 29, 30, 28, 28, 27, 26, 24, 22, + 21, 20, 20, 19, 18, 18, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18, + 17, 16, 16, 15, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16, + 15, 15, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 29, 28, 28, 26, 23, 23, 22, 19, 33, 33, + 32, 32, 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 22, 20, 33, 32, 32, 32, + 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 23, 20, 33, 32, 32, 32, 32, 32, + 32, 31, 29, 29, 29, 26, 24, 24, 23, 20, 33, 32, 32, 32, 32, 32, 32, 31, + 30, 29, 29, 26, 25, 25, 23, 20, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, + 30, 27, 25, 25, 23, 21, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 27, + 25, 25, 23, 21, 33, 32, 32, 32, 32, 31, 31, 31, 30, 29, 29, 27, 25, 25, + 23, 21, 33, 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 26, 24, 24, 23, 21, + 32, 32, 32, 32, 31, 30, 30, 30, 28, 28, 28, 26, 24, 24, 23, 20, 32, 32, + 32, 32, 31, 30, 30, 30, 28, 28, 28, 26, 24, 24, 23, 20, 32, 32, 32, 32, + 31, 29, 29, 29, 28, 28, 28, 26, 24, 24, 23, 21, 32, 32, 31, 31, 30, 29, + 29, 28, 28, 27, 27, 25, 24, 24, 23, 21, 32, 32, 31, 31, 30, 29, 29, 28, + 28, 27, 27, 25, 24, 24, 23, 21, 32, 31, 31, 31, 30, 28, 28, 28, 26, 26, + 26, 24, 23, 23, 22, 20, 30, 30, 30, 30, 29, 28, 28, 27, 25, 24, 24, 23, + 21, 21, 20, 19, 30, 30, 30, 30, 29, 28, 28, 27, 25, 24, 24, 23, 21, 21, + 20, 19, 30, 30, 30, 30, 29, 28, 28, 27, 24, 24, 24, 22, 21, 21, 20, 19, + 29, 29, 30, 30, 28, 27, 27, 26, 23, 22, 22, 20, 20, 20, 19, 17, 28, 29, + 30, 30, 28, 27, 27, 26, 22, 21, 21, 20, 19, 19, 18, 17, 28, 29, 30, 30, + 28, 27, 27, 26, 22, 21, 21, 20, 19, 19, 18, 17, 27, 28, 28, 28, 28, 26, + 26, 25, 22, 21, 21, 19, 18, 18, 18, 16, 26, 27, 28, 28, 26, 26, 26, 24, + 21, 20, 20, 19, 18, 18, 17, 16, 26, 27, 28, 28, 26, 26, 26, 24, 21, 20, + 20, 19, 18, 18, 17, 16, 25, 26, 26, 26, 26, 25, 25, 24, 21, 20, 20, 18, + 17, 17, 17, 15, 23, 24, 25, 25, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, + 16, 14, 23, 24, 25, 25, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 16, 14, + 23, 24, 24, 24, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 15, 14, 22, 23, + 23, 23, 23, 23, 23, 22, 19, 18, 18, 17, 16, 16, 15, 14, 21, 22, 23, 23, + 23, 22, 22, 21, 19, 18, 18, 17, 15, 15, 15, 13, 21, 22, 23, 23, 23, 22, + 22, 21, 19, 18, 18, 17, 15, 15, 15, 13, 20, 21, 22, 22, 21, 21, 21, 20, + 18, 18, 18, 16, 15, 15, 14, 13, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 30, + 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, + 27, 27, 26, 24, 24, 24, 23, 22, 22, 21, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, + 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 24, 23, 23, + 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, + 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32, + 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, + 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 32, 32, 32, 32, 31, + 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, + 25, 24, 24, 24, 23, 22, 22, 21, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 29, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, + 22, 21, 21, 20, 
29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, + 26, 25, 25, 24, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, + 28, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, + 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 28, 29, 29, 29, + 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21, 21, 21, + 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 26, 26, 26, 26, 26, 27, 27, 27, + 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 20, 20, 20, 19, 19, 19, 18, 17, + 17, 17, 17, 17, 17, 16, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24, + 24, 24, 23, 21, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, + 15, 15, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 23, 21, + 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15, 22, 22, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, + 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 19, 20, 20, 20, 20, 21, + 21, 21, 21, 20, 20, 21, 21, 21, 20, 19, 19, 19, 17, 17, 17, 16, 16, 16, + 15, 14, 14, 14, 14, 13, 13, 13, + /* Size 4x16 */ + 33, 32, 28, 23, 32, 32, 29, 24, 32, 32, 29, 25, 32, 31, 30, 25, 32, 30, + 28, 24, 32, 30, 28, 24, 32, 29, 27, 24, 31, 28, 26, 23, 30, 28, 24, 21, + 29, 27, 22, 20, 29, 27, 21, 19, 27, 26, 20, 18, 26, 25, 20, 17, 24, 24, + 19, 16, 23, 23, 18, 16, 22, 22, 18, 15, + /* Size 16x4 */ + 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 32, 32, + 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 28, 29, 29, 30, + 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 23, 24, 25, 25, 24, 24, + 24, 23, 21, 20, 19, 18, 17, 16, 16, 15, + /* Size 8x32 */ + 32, 33, 33, 32, 29, 28, 23, 22, 33, 32, 32, 32, 29, 29, 24, 22, 33, 32, + 32, 32, 29, 29, 24, 23, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32, 32, 32, + 30, 29, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 32, 31, 30, 30, + 25, 23, 33, 32, 32, 31, 30, 29, 25, 23, 33, 32, 31, 30, 29, 28, 24, 23, + 32, 32, 31, 30, 28, 28, 24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 32, + 31, 29, 28, 28, 24, 23, 32, 31, 30, 29, 28, 27, 24, 23, 32, 31, 30, 29, + 28, 27, 24, 23, 32, 31, 30, 28, 26, 26, 23, 22, 30, 30, 29, 28, 25, 24, + 21, 20, 30, 30, 29, 28, 25, 24, 21, 20, 30, 30, 29, 28, 24, 24, 21, 20, + 29, 30, 28, 27, 23, 22, 20, 19, 28, 30, 28, 27, 22, 21, 19, 18, 28, 30, + 28, 27, 22, 21, 19, 18, 27, 28, 28, 26, 22, 21, 18, 18, 26, 28, 26, 26, + 21, 20, 18, 17, 26, 28, 26, 26, 21, 20, 18, 17, 25, 26, 26, 25, 21, 20, + 17, 17, 23, 25, 24, 24, 20, 19, 16, 16, 23, 25, 24, 24, 20, 19, 16, 16, + 23, 24, 24, 24, 20, 19, 16, 15, 22, 23, 23, 23, 19, 18, 16, 15, 21, 23, + 23, 22, 19, 18, 15, 15, 21, 23, 23, 22, 19, 18, 15, 15, 20, 22, 21, 21, + 18, 18, 15, 14, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 30, + 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, + 28, 28, 26, 25, 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, + 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, + 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, + 22, 21, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 26, 25, + 25, 24, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 28, 29, + 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21, + 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 23, 24, 24, 24, 25, 25, + 25, 25, 24, 24, 24, 24, 24, 24, 
23, 21, 21, 21, 20, 19, 19, 18, 18, 18, + 17, 16, 16, 16, 16, 15, 15, 15, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, + 15, 15, 15, 14 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 28, 22, 22, 28, 23, 22, 23, 22, 22, 19, 19, 22, 23, 19, 17, + /* Size 8x8 */ + 33, 33, 30, 28, 24, 21, 22, 21, 33, 32, 29, 26, 24, 22, 23, 22, 30, 29, + 26, 24, 23, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 24, 24, 23, 22, + 21, 20, 20, 20, 21, 22, 22, 22, 20, 19, 19, 19, 22, 23, 23, 23, 20, 19, + 18, 17, 21, 22, 22, 22, 20, 19, 17, 17, + /* Size 16x16 */ + 32, 33, 33, 34, 31, 31, 28, 27, 25, 22, 21, 21, 21, 21, 20, 20, 33, 33, + 33, 33, 30, 30, 27, 26, 24, 22, 22, 22, 22, 22, 21, 21, 33, 33, 33, 33, + 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 34, 33, 33, 32, 30, 29, + 26, 25, 24, 23, 22, 23, 23, 23, 22, 22, 31, 30, 30, 30, 28, 27, 24, 24, + 23, 22, 22, 22, 22, 23, 22, 22, 31, 30, 29, 29, 27, 26, 24, 23, 23, 22, + 22, 22, 22, 23, 22, 22, 28, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 22, + 22, 23, 22, 22, 27, 26, 26, 25, 24, 23, 22, 22, 21, 21, 21, 21, 22, 22, + 22, 22, 25, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 21, 20, 20, + 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 19, 19, 21, 22, + 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, + 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 23, 22, 22, + 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 21, 22, 22, 23, 23, 23, 23, 22, + 21, 20, 19, 19, 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, + 19, 18, 18, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, + 18, 17, 17, 17, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 27, 25, 25, 24, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, + 33, 33, 33, 33, 31, 30, 30, 28, 28, 28, 26, 24, 24, 24, 22, 21, 21, 21, + 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 33, 33, 33, 33, 33, 33, 33, 32, + 30, 30, 30, 28, 27, 27, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 28, + 27, 27, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 21, 33, 33, 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, + 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 34, 33, + 33, 33, 33, 32, 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 24, 24, 23, 22, + 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 34, 33, 33, 33, 33, 32, + 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 23, 23, + 23, 23, 23, 23, 22, 22, 22, 22, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, + 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 25, 24, 24, + 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, + 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 31, 30, 30, 30, + 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 29, 28, 28, 28, 28, 27, 27, 27, + 25, 25, 25, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, + 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 27, 26, + 26, 26, 26, 25, 25, 
25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 25, 24, 24, 24, 24, 24, + 24, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, + 21, 21, 21, 21, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23, + 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, + 20, 20, 20, 20, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, + 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, + 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17, + 17, 17, 17, 17, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, + 21, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, + 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, + 17, 17, 17, 17, 17, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, + 17, 16, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, + 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, + /* Size 4x8 */ + 33, 27, 22, 21, 33, 26, 22, 23, 29, 24, 22, 22, 26, 22, 22, 23, 24, 22, + 20, 20, 22, 22, 19, 19, 22, 22, 19, 18, 21, 22, 19, 17, + /* Size 8x4 */ + 33, 33, 29, 26, 24, 22, 22, 21, 27, 26, 24, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 20, 19, 19, 19, 21, 23, 22, 23, 20, 19, 18, 17, + /* Size 8x16 */ + 32, 33, 31, 28, 23, 21, 21, 20, 33, 33, 30, 27, 23, 22, 22, 21, 33, 32, + 30, 26, 23, 22, 22, 22, 34, 32, 29, 26, 23, 22, 23, 22, 31, 29, 28, 24, + 22, 22, 23, 22, 31, 28, 27, 24, 22, 22, 22, 22, 28, 26, 24, 22, 22, 22, + 23, 22, 26, 25, 24, 22, 21, 21, 22, 22, 24, 24, 23, 22, 21, 20, 21, 20, + 22, 22, 22, 21, 20, 20, 19, 19, 21, 22, 22, 21, 20, 19, 19, 19, 21, 22, + 22, 22, 20, 19, 18, 18, 21, 23, 22, 22, 20, 19, 18, 18, 21, 23, 23, 22, + 20, 19, 18, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19, + 17, 17, + /* Size 16x8 */ + 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 33, 33, + 32, 32, 29, 28, 26, 25, 24, 22, 22, 22, 23, 23, 22, 22, 31, 30, 30, 29, + 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 28, 27, 26, 26, 24, 24, + 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 21, + 21, 20, 20, 20, 20, 20, 20, 20, 21, 22, 22, 22, 22, 22, 
22, 21, 20, 20, + 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18, + 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, + 17, 17, + /* Size 16x32 */ + 32, 33, 33, 33, 31, 28, 28, 27, 23, 21, 21, 21, 21, 21, 20, 20, 33, 33, + 33, 33, 31, 27, 27, 26, 23, 22, 22, 21, 21, 21, 21, 20, 33, 33, 33, 33, + 30, 27, 27, 26, 23, 22, 22, 22, 22, 22, 21, 20, 33, 33, 33, 33, 30, 27, + 27, 26, 23, 22, 22, 22, 22, 22, 21, 20, 33, 33, 32, 32, 30, 26, 26, 26, + 23, 22, 22, 22, 22, 22, 22, 21, 34, 33, 32, 32, 29, 26, 26, 25, 23, 22, + 22, 23, 23, 23, 22, 21, 34, 33, 32, 32, 29, 26, 26, 25, 23, 22, 22, 23, + 23, 23, 22, 21, 33, 32, 31, 31, 29, 26, 26, 25, 23, 22, 22, 23, 23, 23, + 22, 21, 31, 30, 29, 29, 28, 24, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, + 31, 29, 28, 28, 27, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 31, 29, + 28, 28, 27, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 29, 28, 27, 27, + 25, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 28, 26, 26, 26, 24, 22, + 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, + 22, 22, 22, 22, 23, 23, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 21, 21, + 21, 22, 22, 22, 22, 21, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, + 21, 21, 20, 20, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, + 20, 20, 24, 24, 24, 24, 23, 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, + 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 21, 22, + 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, + 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, + 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 22, 22, 22, 21, + 20, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 21, 20, 19, + 19, 19, 18, 18, 18, 18, 21, 22, 23, 23, 22, 22, 22, 22, 20, 19, 19, 19, + 18, 18, 18, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, + 17, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, + 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 20, 21, + 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, + 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22, + 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22, 22, 21, + 20, 19, 19, 18, 17, 17, 17, 16, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 24, 24, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, + 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 32, 32, 32, 31, + 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23, + 23, 23, 22, 22, 22, 22, 33, 33, 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, + 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, + 22, 22, 31, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 25, 24, 24, 24, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 28, 27, + 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, + 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, + 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, + 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, + 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 
22, 22, 21, 20, 20, 20, + 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 23, 23, 23, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 18, 18, 18, 18, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23, + 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, + 17, 17, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23, 23, 23, 22, 21, + 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 20, 21, + 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 20, 20, 20, 21, 21, + 21, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, + 17, 17, 17, 17, 16, 16, 16, 16, + /* Size 4x16 */ + 33, 28, 21, 21, 33, 27, 22, 22, 33, 26, 22, 22, 33, 26, 22, 23, 30, 24, + 22, 23, 29, 24, 22, 22, 26, 22, 22, 23, 26, 22, 21, 22, 24, 22, 20, 21, + 22, 21, 20, 19, 22, 21, 19, 19, 22, 22, 19, 18, 22, 22, 19, 18, 22, 22, + 19, 18, 21, 22, 19, 17, 21, 22, 19, 17, + /* Size 16x4 */ + 33, 33, 33, 33, 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 21, 21, 28, 27, + 26, 26, 24, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 21, 22, 22, 22, + 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, + 23, 22, 21, 19, 19, 18, 18, 18, 17, 17, + /* Size 8x32 */ + 32, 33, 31, 28, 23, 21, 21, 20, 33, 33, 31, 27, 23, 22, 21, 21, 33, 33, + 30, 27, 23, 22, 22, 21, 33, 33, 30, 27, 23, 22, 22, 21, 33, 32, 30, 26, + 23, 22, 22, 22, 34, 32, 29, 26, 23, 22, 23, 22, 34, 32, 29, 26, 23, 22, + 23, 22, 33, 31, 29, 26, 23, 22, 23, 22, 31, 29, 28, 24, 22, 22, 23, 22, + 31, 28, 27, 24, 22, 22, 22, 22, 31, 28, 27, 24, 22, 22, 22, 22, 29, 27, + 25, 23, 22, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 28, 26, 24, 22, + 22, 22, 23, 22, 26, 25, 24, 22, 21, 21, 22, 22, 24, 24, 23, 22, 21, 20, + 21, 20, 24, 24, 23, 22, 21, 20, 21, 20, 24, 24, 23, 22, 20, 20, 20, 20, + 22, 22, 22, 21, 20, 20, 19, 19, 21, 22, 22, 21, 20, 19, 19, 19, 21, 22, + 22, 21, 20, 19, 19, 19, 21, 22, 22, 22, 20, 19, 19, 19, 21, 22, 22, 22, + 20, 19, 18, 18, 21, 22, 22, 22, 20, 19, 18, 18, 21, 23, 22, 22, 20, 19, + 18, 18, 21, 23, 23, 22, 20, 19, 18, 17, 21, 23, 23, 22, 20, 19, 18, 17, + 21, 23, 23, 22, 20, 19, 18, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, + 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, + 20, 19, 17, 17, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 24, 24, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, + 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, + 22, 22, 23, 23, 23, 23, 22, 22, 22, 22, 31, 31, 30, 30, 30, 29, 29, 29, + 28, 27, 27, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 23, + 23, 23, 23, 22, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, + 18, 18, 18, 18, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 
17, 17, + 17, 17, 17, 17 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 30, 27, 32, 31, 29, 26, 30, 29, 26, 23, 27, 26, 23, 19, + /* Size 8x8 */ + 33, 33, 32, 32, 31, 30, 28, 25, 33, 32, 32, 32, 31, 30, 28, 26, 32, 32, + 32, 31, 30, 29, 28, 26, 32, 32, 31, 30, 29, 28, 27, 25, 31, 31, 30, 29, + 28, 26, 25, 23, 30, 30, 29, 28, 26, 24, 22, 21, 28, 28, 28, 27, 25, 22, + 20, 19, 25, 26, 26, 25, 23, 21, 19, 18, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 28, 28, 26, 26, 23, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29, + 29, 28, 28, 26, 26, 24, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, + 28, 26, 26, 24, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, + 26, 24, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, + 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 30, 30, + 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 28, 29, 29, 30, + 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 28, 29, 29, 30, 30, 28, + 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 26, 27, 27, 28, 28, 26, 26, 26, + 26, 23, 23, 20, 20, 19, 19, 18, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23, + 23, 20, 20, 19, 19, 18, 23, 24, 24, 25, 25, 24, 24, 24, 24, 22, 22, 19, + 19, 18, 18, 16, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, + 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, + 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, + 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, + 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, + 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 25, 25, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, + 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, + 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, + 30, 30, 30, 29, 29, 29, 29, 28, 27, 27, 27, 26, 25, 25, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, + 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, + 26, 26, 26, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 26, + 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, + 30, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, + 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 
28, 28, 28, 28, 28, 27, 27, + 27, 26, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, + 26, 25, 24, 24, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, + 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, + 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, + 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 30, 30, 30, 30, + 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25, + 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 30, 30, 30, 30, 30, 30, 31, 31, + 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, + 23, 23, 23, 22, 22, 22, 29, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28, + 28, 28, 28, 28, 28, 26, 25, 25, 25, 24, 23, 23, 23, 22, 22, 22, 22, 21, + 20, 20, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, + 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, + 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, + 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, + 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 27, 27, 27, 27, 26, 26, 26, 25, 23, 23, 23, 22, 21, 21, 21, 20, 20, 20, + 20, 19, 18, 18, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, + 26, 26, 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, + 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24, + 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 26, 26, 27, 27, + 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24, 23, 23, 23, 22, + 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 25, 25, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 25, 25, 25, 25, 23, 22, 22, 22, 21, 20, 20, 20, 19, + 18, 18, 18, 18, 17, 17, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, + 24, 24, 24, 24, 24, 23, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 18, 17, + 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, + 24, 23, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16, + /* Size 4x8 */ + 33, 32, 30, 26, 32, 32, 30, 27, 32, 31, 30, 27, 32, 31, 28, 26, 31, 30, + 27, 24, 30, 28, 25, 22, 28, 27, 23, 20, 26, 26, 22, 18, + /* Size 8x4 */ + 33, 32, 32, 32, 31, 30, 28, 26, 32, 32, 31, 31, 30, 28, 27, 26, 30, 30, + 30, 28, 27, 25, 23, 22, 26, 27, 27, 26, 24, 22, 20, 18, + /* Size 8x16 */ + 32, 33, 33, 32, 32, 28, 28, 23, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32, + 32, 32, 32, 29, 29, 24, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31, + 31, 30, 30, 25, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28, + 28, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27, 27, 24, + 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 28, 30, + 30, 27, 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, 21, 19, 26, 28, 28, 26, + 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 23, 25, 25, 24, 24, 19, + 19, 16, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 33, 32, + 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 33, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 32, 32, 32, 31, 31, 30, + 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, 32, 32, 32, 31, 31, 30, 30, 29, + 29, 28, 28, 27, 27, 26, 26, 24, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, + 24, 21, 21, 20, 20, 19, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, + 21, 20, 20, 19, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, + 18, 
16, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 26, 23, 23, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 30, 29, 29, 29, 26, 24, 24, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, + 29, 29, 29, 27, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, + 30, 28, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28, + 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28, 25, 25, + 33, 32, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 27, 25, 25, 32, 32, + 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, + 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31, + 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31, 30, 30, + 30, 28, 28, 28, 28, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, + 27, 27, 27, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27, + 27, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27, 27, 26, + 24, 24, 31, 31, 31, 31, 31, 30, 28, 28, 28, 27, 26, 26, 26, 24, 23, 23, + 30, 30, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 30, 30, + 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 30, 30, 30, 30, + 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 29, 30, 30, 30, 30, 28, + 28, 28, 28, 25, 23, 23, 23, 22, 20, 20, 28, 29, 30, 30, 30, 28, 27, 27, + 27, 24, 21, 21, 21, 20, 19, 19, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24, + 21, 21, 21, 20, 19, 19, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24, 21, 21, + 21, 20, 19, 19, 28, 28, 28, 28, 28, 27, 26, 26, 26, 23, 21, 21, 21, 20, + 18, 18, 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, + 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 26, 27, + 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 25, 26, 26, 26, + 26, 26, 24, 24, 24, 22, 20, 20, 20, 18, 17, 17, 23, 24, 25, 25, 25, 24, + 24, 24, 24, 21, 19, 19, 19, 18, 16, 16, 23, 24, 25, 25, 25, 24, 24, 24, + 24, 21, 19, 19, 19, 18, 16, 16, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, + 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, + 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26, + 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, + 29, 28, 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, + 27, 26, 26, 26, 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, + 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, + 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, + 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, + 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 21, 21, 28, 29, 29, 29, + 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 
+ 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30, + 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, + 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, + 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, + 19, 19, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, + 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 23, 24, + 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, + 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16, 23, 24, 24, 24, 24, 25, + 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, + 19, 18, 18, 18, 18, 17, 16, 16, + /* Size 4x16 */ + 33, 32, 30, 26, 32, 32, 30, 27, 32, 32, 30, 27, 32, 32, 31, 28, 32, 32, + 31, 28, 32, 31, 29, 26, 32, 31, 29, 26, 32, 30, 28, 26, 32, 30, 28, 26, + 30, 29, 26, 23, 30, 29, 26, 23, 29, 28, 24, 20, 29, 28, 24, 20, 27, 26, + 23, 19, 27, 26, 23, 19, 24, 24, 21, 18, + /* Size 16x4 */ + 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 30, 30, 30, 31, + 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 21, 26, 27, 27, 28, 28, 26, + 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, + /* Size 8x32 */ + 32, 33, 33, 32, 32, 28, 28, 23, 33, 33, 33, 32, 32, 29, 29, 24, 33, 32, + 32, 32, 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32, 32, 32, + 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 25, 33, 32, 32, 31, 31, 30, + 30, 25, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31, 31, 30, 30, 25, + 33, 32, 32, 31, 31, 29, 29, 25, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, + 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, + 30, 28, 28, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27, + 27, 24, 32, 31, 31, 29, 29, 27, 27, 24, 31, 31, 31, 28, 28, 26, 26, 23, + 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, + 30, 28, 28, 24, 24, 21, 29, 30, 30, 28, 28, 23, 23, 20, 28, 30, 30, 27, + 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, + 21, 19, 28, 28, 28, 26, 26, 21, 21, 18, 26, 28, 28, 26, 26, 20, 20, 18, + 26, 28, 28, 26, 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 25, 26, + 26, 24, 24, 20, 20, 17, 23, 25, 25, 24, 24, 19, 19, 16, 23, 25, 25, 24, + 24, 19, 19, 16, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, + 28, 28, 28, 26, 25, 25, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, + 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, + 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, + 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 28, 29, + 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, + 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, + 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, + 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 19, 18, 18, 18, + 18, 17, 16, 16 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 30, 24, 22, 30, 26, 23, 22, 24, 23, 21, 21, 22, 22, 21, 19, + /* Size 8x8 */ + 33, 33, 32, 29, 26, 23, 
21, 21, 33, 33, 31, 28, 25, 23, 22, 22, 32, 31, + 29, 26, 24, 23, 22, 23, 29, 28, 26, 24, 23, 22, 22, 22, 26, 25, 24, 23, + 22, 21, 21, 22, 23, 23, 23, 22, 21, 20, 20, 20, 21, 22, 22, 22, 21, 20, + 19, 19, 21, 22, 23, 22, 22, 20, 19, 18, + /* Size 16x16 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 25, 25, 21, 21, 21, 21, 21, 33, 33, + 33, 33, 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 33, 33, 33, 33, + 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 34, 33, 33, 32, 32, 29, + 29, 26, 26, 24, 24, 22, 22, 23, 23, 23, 34, 33, 33, 32, 32, 29, 29, 26, + 26, 24, 24, 22, 22, 23, 23, 23, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23, + 23, 22, 22, 22, 22, 23, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, + 22, 22, 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, + 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 23, + 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 25, 24, + 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, + 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 22, + 22, 21, 21, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, + 21, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21, 21, 19, + 19, 19, 19, 18, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26, + 25, 25, 25, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, + 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27, + 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, + 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 34, 33, 33, 33, 33, 33, + 32, 32, 32, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, + 22, 22, 23, 23, 23, 23, 23, 23, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, + 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23, + 23, 23, 23, 23, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 29, 29, 29, 28, + 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 29, 28, 28, 28, 26, 25, 25, 25, 24, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 31, 30, 30, 30, + 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29, + 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 26, 26, + 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 29, 29, 28, 28, 28, 28, 28, 28, 28, 26, 25, 25, 25, 24, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 28, + 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, + 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, + 
22, 22, 23, 23, 26, 26, 26, 26, 26, 25, 25, 25, 25, 24, 23, 23, 23, 23, + 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, + 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, + 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 25, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, + 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, + 21, 21, 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, + 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, + 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, + 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, + 19, 19, 19, 18, 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, + /* Size 4x8 */ + 33, 30, 24, 21, 33, 29, 24, 22, 31, 28, 23, 22, 28, 25, 22, 22, 26, 23, + 21, 21, 23, 22, 21, 20, 22, 22, 20, 19, 22, 22, 21, 19, + /* Size 8x4 */ + 33, 33, 31, 28, 26, 23, 22, 22, 30, 29, 28, 25, 23, 22, 22, 22, 24, 24, + 23, 22, 21, 21, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, + /* Size 8x16 */ + 32, 33, 33, 28, 28, 21, 21, 21, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33, + 33, 27, 27, 22, 22, 22, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26, + 26, 22, 22, 23, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22, + 22, 22, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, + 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 21, 22, + 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 22, + 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19, + 19, 18, + /* Size 16x8 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 33, 33, + 33, 32, 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 33, 33, 33, 32, + 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 28, 27, 27, 26, 26, 24, + 24, 22, 22, 22, 22, 21, 21, 22, 22, 22, 28, 27, 27, 26, 26, 24, 24, 22, + 22, 22, 22, 21, 21, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, + 20, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, + 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, + 18, 18, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 31, 28, 28, 28, 24, 21, 21, 21, 21, 21, 21, 33, 33, + 33, 33, 33, 30, 28, 28, 28, 24, 22, 22, 22, 21, 21, 21, 33, 33, 33, 33, + 33, 30, 
27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30, + 27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30, 27, 27, + 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 32, 32, 32, 29, 26, 26, 26, 24, + 22, 22, 22, 22, 22, 22, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, + 22, 23, 23, 23, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 23, + 23, 23, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 23, 23, 23, + 32, 31, 30, 30, 30, 28, 25, 25, 25, 23, 22, 22, 22, 22, 23, 23, 31, 30, + 28, 28, 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 31, 30, 28, 28, + 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 31, 30, 28, 28, 28, 26, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 29, 28, 27, 27, 27, 25, 23, 23, + 23, 22, 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, + 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, + 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 26, 26, 25, 25, 25, 23, 22, 22, 22, 21, 21, 21, 21, 21, 22, 22, + 24, 24, 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 24, 24, + 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 24, 24, 24, 24, + 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 23, 23, 23, 23, 23, 22, + 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 21, 21, + 21, 20, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, + 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, + 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, + 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22, 23, 23, + 23, 22, 22, 22, 22, 21, 19, 19, 19, 19, 18, 18, 21, 22, 23, 23, 23, 23, + 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, 21, 22, 23, 23, 23, 23, 22, 22, + 22, 21, 19, 19, 19, 18, 18, 18, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26, + 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, + 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, + 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, + 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26, + 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 31, 30, + 30, 30, 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, + 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, + 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, + 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 21, + 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 21, 22, 
22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, + 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 21, 21, + 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, + 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 21, 21, 22, 22, 22, 22, + 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, + 19, 19, 18, 18, 18, 18, 18, 18, + /* Size 4x16 */ + 33, 31, 24, 21, 33, 30, 24, 22, 33, 30, 24, 22, 33, 29, 24, 23, 33, 29, + 24, 23, 30, 26, 23, 22, 30, 26, 23, 22, 27, 24, 22, 22, 27, 24, 22, 22, + 24, 23, 21, 20, 24, 23, 21, 20, 21, 22, 20, 19, 21, 22, 20, 19, 22, 22, + 20, 19, 22, 22, 20, 19, 22, 23, 21, 18, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 30, 30, 27, 27, 24, 24, 21, 21, 22, 22, 22, 31, 30, + 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 24, 24, 24, 24, + 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 21, 21, 22, 22, 23, 23, 22, + 22, 22, 22, 20, 20, 19, 19, 19, 19, 18, + /* Size 8x32 */ + 32, 33, 33, 28, 28, 21, 21, 21, 33, 33, 33, 28, 28, 22, 22, 21, 33, 33, + 33, 27, 27, 22, 22, 22, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33, 33, 27, + 27, 22, 22, 22, 33, 32, 32, 26, 26, 22, 22, 22, 34, 32, 32, 26, 26, 22, + 22, 23, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26, 26, 22, 22, 23, + 32, 30, 30, 25, 25, 22, 22, 23, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28, + 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22, 22, 22, 29, 27, 27, 23, + 23, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22, + 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 26, 25, 25, 22, 22, 21, 21, 22, + 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, + 24, 22, 22, 20, 20, 21, 23, 23, 23, 22, 22, 20, 20, 20, 21, 22, 22, 21, + 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, + 19, 19, 21, 22, 22, 22, 22, 19, 19, 19, 21, 22, 22, 22, 22, 19, 19, 18, + 21, 22, 22, 22, 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 23, + 23, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19, 19, 18, 21, 23, 23, 22, + 22, 19, 19, 18, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26, + 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, + 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, + 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, + 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, + 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, + 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 18, 18, + 18, 18, 18, 18 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 32, 29, 32, 32, 31, 29, 32, 31, 29, 27, 29, 29, 27, 22, + /* Size 8x8 */ + 33, 33, 33, 32, 32, 32, 30, 29, 33, 32, 32, 32, 32, 31, 30, 29, 33, 32, + 32, 32, 32, 31, 31, 30, 32, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 30, + 29, 29, 28, 27, 32, 31, 31, 30, 29, 28, 27, 26, 30, 
30, 31, 29, 28, 27, + 26, 24, 29, 29, 30, 28, 27, 26, 24, 21, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, + 29, 29, 28, 28, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 28, + 28, 28, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 28, 28, 28, 28, + 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 31, 31, 31, 31, + 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 30, 30, 30, 30, 31, 31, + 29, 29, 28, 28, 28, 26, 26, 25, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, + 28, 28, 28, 26, 25, 24, 23, 23, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, + 27, 24, 24, 23, 21, 21, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 24, + 24, 23, 21, 21, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, + 30, 29, 29, 29, 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, + 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, + 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 29, 29, 29, 29, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, + 28, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, + 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 28, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, + 28, 28, 28, 27, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, + 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, + 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, + 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, + 28, 27, 27, 27, 27, 26, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 26, + 26, 25, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, + 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 30, 30, + 30, 30, 30, 30, 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, + 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, + 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, + 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, + 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, + 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, + 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 25, 25, 24, 23, 23, 23, 23, 23, + 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 28, + 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 21, 28, 29, 29, 29, + 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, + 24, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 30, + 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24, + 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 29, + 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24, 23, 22, 21, 21, + 21, 21, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 28, 28, 28, 28, 28, + 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 21, 21, 21, 21, 20, + /* Size 4x8 */ + 33, 33, 32, 29, 32, 32, 32, 29, 32, 32, 31, 30, 32, 32, 30, 28, 32, 31, + 29, 27, 31, 31, 28, 26, 30, 30, 28, 24, 29, 30, 27, 21, + /* Size 8x4 */ + 33, 32, 32, 32, 32, 31, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30, 32, 32, + 31, 30, 29, 28, 28, 27, 29, 29, 30, 28, 27, 26, 24, 21, + /* Size 8x16 */ + 32, 33, 33, 33, 32, 32, 29, 28, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, + 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32, + 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 31, 30, 30, + 29, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, + 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 31, 31, + 31, 29, 28, 28, 26, 25, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, + 28, 28, 24, 23, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27, + 22, 21, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32, + 31, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 30, 30, + 30, 29, 29, 28, 28, 28, 27, 27, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, + 29, 28, 28, 28, 27, 27, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 26, + 25, 24, 22, 22, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 25, 24, 23, + 21, 21, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 29, 29, 29, 29, 33, 
32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29, + 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, + 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 30, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, + 31, 31, 30, 30, 30, 30, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, + 30, 30, 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, + 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, + 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, + 28, 28, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 28, 27, 27, 27, + 32, 32, 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32, + 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32, 32, 31, + 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 31, 31, 31, 31, 31, + 30, 29, 28, 28, 28, 28, 26, 26, 26, 26, 31, 31, 31, 31, 31, 31, 29, 28, + 28, 28, 28, 27, 26, 25, 25, 25, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, + 28, 26, 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, + 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, + 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 24, 23, 23, 23, + 29, 29, 30, 30, 30, 30, 28, 28, 27, 27, 27, 25, 23, 22, 22, 22, 28, 29, + 29, 30, 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 29, 29, 30, + 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 29, 29, 30, 30, 30, + 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 28, 28, 28, 28, 28, 28, 27, + 26, 26, 26, 24, 22, 21, 21, 21, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, + 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, + 30, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, + 29, 29, 29, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 27, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, + 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, + 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, + 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, + 26, 25, 24, 24, 24, 24, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29, + 29, 28, 28, 28, 28, 28, 28, 28, 28, 26, 26, 25, 25, 25, 24, 23, 22, 22, + 22, 22, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 
28, 28, 28, + 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, + 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, + 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, + 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, + 24, 24, 23, 22, 21, 21, 21, 21, + /* Size 4x16 */ + 33, 33, 32, 28, 33, 32, 32, 29, 32, 32, 32, 29, 32, 32, 32, 29, 32, 32, + 31, 30, 32, 32, 31, 30, 32, 32, 30, 28, 32, 32, 30, 28, 32, 32, 30, 28, + 32, 31, 29, 27, 32, 31, 29, 27, 31, 31, 28, 25, 30, 30, 28, 24, 30, 30, + 28, 23, 29, 30, 27, 21, 29, 30, 27, 21, + /* Size 16x4 */ + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 32, 32, 32, 32, + 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 28, 29, 29, 29, 30, 30, + 28, 28, 28, 27, 27, 25, 24, 23, 21, 21, + /* Size 8x32 */ + 32, 33, 33, 33, 32, 32, 29, 28, 33, 33, 33, 32, 32, 32, 29, 29, 33, 32, + 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, + 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, + 30, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30, + 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, + 32, 31, 31, 31, 29, 29, 33, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 31, + 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, + 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 31, 31, 29, 29, 28, 27, + 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, + 31, 30, 29, 29, 28, 27, 32, 31, 31, 30, 28, 28, 26, 26, 31, 31, 31, 29, + 28, 28, 26, 25, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28, + 25, 24, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28, 24, 23, + 29, 30, 30, 28, 27, 27, 23, 22, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29, + 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27, 22, 21, 28, 28, 28, 28, + 26, 26, 22, 21, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, + 30, 30, 30, 30, 30, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28, + 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, + 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 29, 29, 29, 29, 29, 29, + 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 26, 26, 25, + 25, 25, 24, 23, 22, 22, 22, 22, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, + 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, + 21, 21, 21, 21 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 32, 27, 22, 32, 30, 25, 22, 27, 25, 22, 22, 22, 22, 22, 20, + /* Size 8x8 */ + 33, 33, 34, 30, 28, 26, 24, 21, 33, 33, 33, 30, 28, 26, 24, 22, 34, 33, + 32, 29, 26, 25, 24, 22, 30, 30, 29, 26, 24, 23, 23, 22, 28, 28, 26, 24, + 22, 22, 22, 22, 26, 26, 25, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 21, + 21, 20, 21, 22, 22, 22, 22, 21, 20, 19, + /* Size 16x16 */ + 32, 33, 33, 33, 34, 34, 31, 31, 30, 28, 28, 26, 25, 23, 21, 21, 33, 33, + 33, 33, 33, 33, 31, 30, 
28, 27, 27, 25, 24, 23, 21, 21, 33, 33, 33, 33, + 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 22, 22, 33, 33, 33, 33, 33, 33, + 30, 29, 28, 26, 26, 25, 24, 23, 22, 22, 34, 33, 33, 33, 32, 32, 30, 29, + 28, 26, 26, 24, 24, 23, 22, 22, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26, + 26, 24, 24, 23, 22, 22, 31, 31, 30, 30, 30, 30, 28, 27, 26, 24, 24, 23, + 23, 23, 22, 22, 31, 30, 30, 29, 29, 29, 27, 26, 26, 24, 24, 23, 23, 22, + 22, 22, 30, 28, 28, 28, 28, 28, 26, 26, 24, 23, 23, 23, 22, 22, 22, 22, + 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 28, 27, + 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 26, 25, 25, 25, + 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20, 25, 24, 24, 24, 24, 24, + 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 23, 23, 23, 23, 23, 23, 23, 22, + 22, 22, 22, 21, 21, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, + 20, 20, 19, 19, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 30, 28, + 28, 28, 28, 27, 26, 25, 25, 25, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 34, 32, 31, 30, 30, 30, 29, 28, 28, 28, 28, 26, + 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 31, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, + 23, 22, 21, 21, 21, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 30, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, + 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, + 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27, + 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 25, 24, + 24, 24, 23, 22, 22, 22, 22, 22, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, + 22, 22, 22, 22, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, + 29, 29, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, + 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26, + 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 34, 34, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26, 26, 26, 26, 25, + 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 33, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 30, 28, 28, 28, 28, 27, 26, 25, 25, 25, 24, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, + 28, 27, 27, 27, 26, 25, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, + 22, 22, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, + 26, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 31, 30, + 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, + 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 31, 30, 30, 30, 30, 30, + 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24, 23, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 27, 26, 26, 26, 26, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26, 26, 26, 25, 24, + 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27, + 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 
22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, + 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 22, 27, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 24, + 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 26, 25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, + 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 21, 25, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, + 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, + 21, 21, 21, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, + 20, 20, 20, 20, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, + 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, + 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, + 20, 20, 19, 19, 19, 19, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 19, 19, + 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, + /* Size 4x8 */ + 33, 33, 28, 21, 33, 33, 27, 22, 33, 32, 26, 22, 30, 28, 24, 22, 28, 26, + 22, 22, 26, 25, 22, 21, 24, 24, 22, 20, 21, 22, 21, 19, + /* Size 8x4 */ + 33, 33, 33, 30, 28, 26, 24, 21, 33, 33, 32, 28, 26, 25, 24, 22, 28, 27, + 26, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 21, 20, 19, + /* Size 8x16 */ + 32, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, + 33, 30, 27, 27, 23, 22, 33, 33, 32, 30, 26, 26, 23, 22, 34, 32, 32, 29, + 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 31, 30, 29, 28, 24, 24, + 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 29, 28, 28, 26, 23, 23, 22, 22, + 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 25, 24, + 24, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 22, 21, 20, 23, 23, 23, 23, + 22, 22, 20, 20, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21, + 20, 19, + /* Size 16x8 */ + 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 33, 33, + 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 33, 33, 33, 32, + 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 31, 30, 30, 30, 29, 29, + 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 28, 27, 27, 26, 26, 26, 24, 24, + 23, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, + 22, 22, 22, 22, 21, 21, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, + 19, 19, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 26, 23, 21, 21, 21, 33, 33, + 33, 33, 33, 33, 31, 28, 28, 28, 28, 25, 23, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, + 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28, + 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28, 27, 27, + 27, 25, 23, 22, 22, 22, 33, 33, 33, 32, 32, 32, 30, 28, 26, 26, 26, 25, + 23, 22, 22, 22, 34, 33, 33, 32, 32, 32, 30, 27, 26, 26, 26, 24, 23, 22, 
+ 22, 22, 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, + 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 34, 33, + 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 33, 32, 31, 31, + 31, 31, 28, 26, 25, 25, 25, 24, 23, 22, 22, 22, 31, 30, 30, 29, 29, 29, + 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, + 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24, + 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24, 24, 23, + 22, 22, 22, 22, 29, 28, 28, 28, 28, 28, 26, 24, 23, 23, 23, 23, 22, 22, + 22, 22, 28, 28, 27, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, + 28, 27, 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, + 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, + 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 26, 26, 26, 25, 25, 25, + 24, 22, 22, 22, 22, 21, 21, 21, 21, 21, 25, 25, 24, 24, 24, 24, 23, 22, + 22, 22, 22, 21, 21, 21, 21, 21, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, + 22, 21, 21, 20, 20, 20, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21, 21, 20, + 20, 20, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 20, 20, 20, 20, + 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, + 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, + 22, 22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22, + 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 20, 19, 19, 19, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 29, 28, + 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27, 27, 26, + 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, + 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, + 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, + 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26, + 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, + 30, 30, 29, 29, 29, 28, 28, 27, 27, 27, 26, 24, 24, 24, 24, 24, 23, 23, + 23, 23, 23, 22, 22, 22, 22, 22, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, + 27, 26, 26, 25, 25, 25, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, + 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, + 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, + 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28, 27, 27, + 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 26, 25, 25, 25, 25, 25, 25, 24, + 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, + 21, 21, 20, 20, 20, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, + 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 21, 21, 20, 20, 
20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, + 20, 20, 20, 20, 19, 19, 19, 19, + /* Size 4x16 */ + 33, 33, 28, 21, 33, 33, 27, 22, 33, 33, 27, 22, 33, 32, 26, 22, 33, 32, + 26, 22, 33, 32, 26, 22, 30, 29, 24, 22, 30, 28, 24, 22, 28, 28, 23, 22, + 27, 26, 22, 22, 27, 26, 22, 22, 25, 24, 22, 21, 24, 24, 22, 20, 23, 23, + 22, 20, 21, 22, 21, 19, 21, 22, 21, 19, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 21, 21, 33, 33, + 33, 32, 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 28, 27, 27, 26, + 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, + /* Size 8x32 */ + 32, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 31, 28, 28, 23, 21, 33, 33, + 33, 30, 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 33, 30, + 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 32, 30, 26, 26, + 23, 22, 34, 33, 32, 30, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, + 34, 32, 32, 29, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 33, 31, + 31, 28, 25, 25, 23, 22, 31, 30, 29, 28, 24, 24, 22, 22, 31, 29, 28, 27, + 24, 24, 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 31, 29, 28, 27, 24, 24, + 22, 22, 29, 28, 28, 26, 23, 23, 22, 22, 28, 27, 26, 24, 22, 22, 22, 22, + 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, + 26, 24, 22, 22, 22, 22, 26, 26, 25, 24, 22, 22, 21, 21, 25, 24, 24, 23, + 22, 22, 21, 21, 24, 24, 24, 23, 22, 22, 21, 20, 24, 24, 24, 23, 22, 22, + 21, 20, 24, 24, 24, 23, 22, 22, 21, 20, 23, 23, 23, 23, 22, 22, 20, 20, + 22, 22, 22, 22, 21, 21, 20, 20, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, + 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, + 22, 22, 20, 19, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 29, 28, + 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, + 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, + 28, 27, 27, 27, 26, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, + 22, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28, + 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, + 21, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, + 19, 19, 19, 19 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 31, 30, 32, 31, 30, 29, + /* Size 8x8 */ + 33, 33, 33, 33, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, + 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, + 31, 31, 30, 29, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 32, 30, 30, + 29, 28, 31, 31, 31, 31, 29, 29, 28, 27, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 
32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 29, + 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 33, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 30, 29, 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 29, 28, 28, 28, 27, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28, + 28, 28, 27, 26, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 30, 30, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, + 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, + 30, 30, 30, 29, 
29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, + 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, + 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, + 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, + 28, 28, 28, 27, 27, 27, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, + 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, + 26, 26, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 30, 30, + 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26, + /* Size 4x8 */ + 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, + 31, 30, 32, 32, 30, 30, 32, 31, 30, 29, 31, 31, 29, 28, + /* Size 8x4 */ + 33, 33, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 31, 30, 30, 29, 32, 32, 32, 31, 30, 30, 29, 28, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 31, 33, 32, + 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, + 32, 32, 32, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, + 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 31, 30, 30, 30, + 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, + 32, 32, 31, 29, 29, 29, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31, + 30, 29, 29, 28, 32, 31, 31, 31, 30, 28, 28, 28, 30, 30, 30, 30, 29, 28, + 28, 27, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 30, 30, 30, 29, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, + 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, + 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28, + 28, 27, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 33, 32, + 32, 32, 32, 32, 32, 32, 
32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, + 30, 29, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 29, 29, 29, 29, 29, 28, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, + 29, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, + 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, + 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, + 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 32, 31, + 31, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, + 31, 31, 31, 30, 30, 29, 28, 28, 28, 28, 28, 27, 30, 30, 30, 30, 30, 30, + 30, 30, 29, 28, 28, 28, 28, 28, 27, 26, 30, 30, 30, 30, 30, 30, 30, 30, + 29, 28, 28, 28, 28, 28, 27, 26, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, + 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, + 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, + 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, + 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 27, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, + 28, 28, 28, 28, 27, 27, 26, 26, + /* Size 4x16 
*/ + 33, 33, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 31, 31, 32, 32, 31, 30, + 32, 32, 31, 30, 32, 32, 31, 30, 32, 32, 30, 29, 32, 31, 30, 29, 32, 31, + 30, 29, 31, 31, 29, 28, 30, 30, 28, 28, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 28, 32, 32, 32, 32, 32, 31, + 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 31, 33, 33, + 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, + 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, + 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, + 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, + 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, + 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 31, 31, + 31, 30, 33, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, + 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, + 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 32, + 31, 29, 29, 29, 32, 32, 31, 31, 31, 29, 29, 28, 32, 32, 31, 31, 30, 29, + 29, 28, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31, 30, 29, 29, 28, + 32, 32, 31, 31, 30, 29, 29, 28, 32, 31, 31, 31, 30, 28, 28, 28, 31, 31, + 31, 31, 30, 28, 28, 28, 30, 30, 30, 30, 29, 28, 28, 27, 30, 30, 30, 30, + 29, 28, 28, 27, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 29, 29, 28, 28, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, + 28, 28, 27, 27 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 33, 30, 27, 33, 32, 29, 26, 30, 29, 26, 24, 27, 26, 24, 22, + /* Size 8x8 */ + 33, 33, 33, 34, 30, 29, 28, 26, 33, 33, 33, 33, 30, 29, 27, 25, 33, 33, + 33, 33, 29, 28, 26, 25, 34, 33, 33, 32, 29, 28, 26, 24, 30, 30, 29, 29, + 26, 26, 24, 23, 29, 29, 28, 28, 26, 25, 23, 23, 28, 27, 26, 26, 24, 23, + 22, 22, 26, 25, 25, 24, 23, 23, 22, 21, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 27, 25, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 28, 28, 26, 24, 33, 33, 33, 33, + 33, 33, 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, + 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, + 30, 29, 29, 28, 26, 26, 26, 24, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29, + 29, 27, 26, 26, 25, 24, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29, 29, 27, + 26, 26, 25, 
24, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, + 25, 24, 31, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 25, 24, 24, 24, 23, + 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 31, 30, + 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 29, 28, 28, 28, + 28, 27, 27, 27, 25, 25, 25, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, + 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, + 24, 24, 24, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, + 23, 22, 22, 22, 22, 21, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, + 22, 22, 21, 21, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31, + 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 25, 25, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 32, 31, 30, 30, 30, 30, 29, + 28, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, + 28, 27, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 27, 26, 25, + 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, + 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, + 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 29, 28, 27, + 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 28, 28, 27, 27, 27, 27, 26, + 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, + 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, + 29, 29, 29, 28, 28, 26, 26, 26, 26, 26, 26, 25, 24, 24, 34, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, + 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, + 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, + 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, + 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, + 28, 28, 27, 26, 26, 26, 26, 25, 25, 24, 24, 24, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 25, + 25, 25, 25, 24, 24, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, + 24, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, + 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, + 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, + 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30, + 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, + 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30, + 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, + 24, 24, 23, 23, 23, 23, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, + 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 
27, 27, 27, 27, 26, + 25, 25, 25, 25, 25, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, + 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, + 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, + 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26, + 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, + 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 28, 28, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 26, 26, 26, 25, 25, 25, 25, 25, + 25, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, + /* Size 4x8 */ + 33, 33, 29, 28, 33, 33, 28, 27, 33, 32, 28, 26, 33, 32, 28, 26, 30, 28, + 26, 24, 29, 28, 24, 23, 27, 26, 23, 22, 25, 24, 23, 22, + /* Size 8x4 */ + 33, 33, 33, 33, 30, 29, 27, 25, 33, 33, 32, 32, 28, 28, 26, 24, 29, 28, + 28, 28, 26, 24, 23, 23, 28, 27, 26, 26, 24, 23, 22, 22, + /* Size 8x16 */ + 32, 33, 33, 33, 31, 28, 28, 27, 33, 33, 33, 33, 31, 27, 27, 26, 33, 33, + 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 32, 32, + 30, 26, 26, 26, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26, + 26, 25, 33, 32, 31, 31, 29, 26, 26, 25, 31, 30, 29, 29, 28, 24, 24, 24, + 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 29, 28, + 27, 27, 25, 23, 23, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, + 24, 22, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 24, 24, 24, 24, 23, 22, + 22, 21, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 33, 33, + 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 33, 33, 33, 33, + 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 33, 33, 33, 33, 32, 32, + 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 31, 31, 30, 30, 30, 29, 29, 29, + 28, 27, 27, 25, 24, 24, 24, 23, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, + 24, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, + 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22, + 22, 21, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 28, 27, 24, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 28, 26, 24, 33, 33, 33, 33, + 33, 33, 33, 32, 31, 29, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, + 33, 32, 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, + 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, + 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27, + 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27, 27, 27, + 26, 24, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 26, 26, 26, 26, 26, 24, + 34, 33, 33, 32, 32, 32, 32, 32, 30, 28, 26, 26, 26, 26, 26, 24, 34, 33, + 33, 32, 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, + 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 32, + 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 
32, 32, 31, + 29, 28, 26, 26, 26, 26, 25, 24, 33, 33, 32, 32, 31, 31, 31, 31, 29, 27, + 26, 26, 26, 26, 25, 24, 32, 32, 31, 31, 30, 30, 30, 30, 28, 26, 25, 25, + 25, 25, 24, 23, 31, 31, 30, 29, 29, 29, 29, 29, 28, 26, 24, 24, 24, 24, + 24, 23, 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, + 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30, + 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30, 29, 29, + 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 30, 29, 28, 28, 28, 28, + 28, 28, 26, 24, 23, 23, 23, 23, 23, 23, 29, 28, 28, 27, 27, 27, 27, 26, + 25, 24, 23, 23, 23, 23, 22, 22, 28, 28, 27, 26, 26, 26, 26, 26, 24, 23, + 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, + 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, + 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22, + 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22, 26, 26, + 26, 25, 25, 25, 25, 24, 24, 23, 22, 22, 22, 22, 22, 21, 26, 25, 25, 24, + 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 21, 24, 24, 24, 24, 24, 24, + 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 24, 24, 24, 24, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 21, 21, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31, + 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 29, + 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26, + 26, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 31, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, + 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, + 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, + 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, + 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 25, 25, 25, 25, + 24, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, + 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, + 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, + 26, 26, 26, 24, 24, 23, 23, 23, 23, 23, 23, 23, 22, 22, 28, 28, 27, 27, + 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, + 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, + 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, + 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, + 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 23, 22, 22, + 22, 22, 22, 22, 21, 21, 21, 21, + /* Size 4x16 */ + 33, 33, 29, 28, 33, 33, 29, 27, 33, 33, 28, 27, 33, 33, 28, 27, 33, 32, + 28, 26, 33, 32, 28, 26, 33, 32, 28, 26, 33, 31, 27, 26, 31, 29, 26, 24, + 30, 28, 26, 24, 30, 28, 26, 24, 
28, 27, 24, 23, 27, 26, 23, 22, 27, 26, + 23, 22, 26, 25, 23, 22, 24, 24, 22, 22, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 27, 27, 26, 24, 33, 33, + 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 29, 29, 28, 28, + 28, 28, 28, 27, 26, 26, 26, 24, 23, 23, 23, 22, 28, 27, 27, 27, 26, 26, + 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, + /* Size 8x32 */ + 32, 33, 33, 33, 31, 28, 28, 27, 33, 33, 33, 33, 31, 28, 28, 26, 33, 33, + 33, 33, 31, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, + 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, + 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 32, 32, 30, 26, 26, 26, + 34, 33, 32, 32, 30, 26, 26, 26, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, + 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, + 29, 26, 26, 25, 33, 32, 31, 31, 29, 26, 26, 25, 32, 31, 30, 30, 28, 25, + 25, 24, 31, 30, 29, 29, 28, 24, 24, 24, 31, 29, 28, 28, 27, 24, 24, 23, + 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, + 28, 28, 27, 24, 24, 23, 30, 28, 28, 28, 26, 23, 23, 23, 29, 28, 27, 27, + 25, 23, 23, 22, 28, 27, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22, + 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, + 28, 26, 26, 26, 24, 22, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 26, 25, + 24, 24, 23, 22, 22, 22, 24, 24, 24, 24, 23, 22, 22, 21, 24, 24, 24, 24, + 23, 22, 22, 21, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31, + 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28, + 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, + 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, + 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, + 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 28, 28, + 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, + 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, + 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 21 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, + /* Size 8x8 */ + 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, + 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 
32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + /* Size 4x8 */ + 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, + /* Size 8x4 */ + 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, + 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, + 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, + 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, + 31, 30, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 30, 30, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 
33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 30, 30, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 30, 30, 30, 30, 30, + /* Size 4x16 */ + 33, 33, 33, 32, 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, + 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, + 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33, + 33, 33, 33, 33, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, + 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, + 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, + 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, + 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, + 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, + 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, + 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, + 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, + 32, 32, 31, 30, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, + 30, 30, 30, 30 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 32, 29, 30, 29, 29, 26, + /* Size 8x8 */ + 33, 33, 33, 33, 34, 33, 31, 31, 33, 33, 33, 33, 33, 32, 30, 30, 33, 33, + 33, 33, 33, 32, 30, 30, 33, 33, 33, 33, 33, 32, 29, 29, 34, 33, 33, 33, + 32, 32, 29, 29, 33, 32, 32, 32, 32, 31, 28, 28, 31, 30, 30, 29, 29, 28, + 26, 26, 31, 30, 30, 29, 29, 28, 26, 26, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 33, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 30, 30, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, + 30, 29, 29, 29, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, + 29, 29, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, + 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 34, 34, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 33, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 30, 28, 28, 28, 28, 31, 31, 31, 30, 30, 30, + 30, 
30, 30, 30, 30, 28, 28, 27, 27, 27, 31, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 29, 28, 27, 26, 26, 26, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, + 29, 28, 27, 26, 26, 26, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, + 27, 26, 26, 26, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, + 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 33, + 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 32, 32, 31, 30, + 30, 30, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 30, 30, 30, 30, + 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 30, 30, 30, 30, 30, 29, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, + 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, + 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 29, + 29, 29, 29, 29, 29, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, + 29, 29, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, + 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, + 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, + 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, + 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 28, 28, 28, 28, 28, + 28, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, + 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 31, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28, + 28, 27, 27, 27, 27, 27, 27, 26, 31, 31, 30, 
30, 30, 30, 30, 30, 30, 30, + 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, + 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, + 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, + 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, + 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, + 26, 26, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 26, + /* Size 4x8 */ + 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 33, 29, 33, 32, 32, 28, 33, 32, + 32, 28, 33, 31, 31, 28, 30, 28, 28, 26, 30, 28, 28, 26, + /* Size 8x4 */ + 33, 33, 33, 33, 33, 33, 30, 30, 33, 33, 33, 32, 32, 31, 28, 28, 33, 33, + 33, 32, 32, 31, 28, 28, 30, 29, 29, 28, 28, 28, 26, 26, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33, + 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, + 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 32, 32, 32, + 30, 28, 34, 33, 33, 32, 32, 32, 30, 27, 34, 33, 32, 32, 32, 32, 29, 27, + 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 33, 32, + 31, 31, 31, 31, 28, 26, 31, 30, 30, 29, 29, 29, 28, 26, 31, 30, 29, 28, + 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, + 27, 25, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 31, 29, 28, 28, 28, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, + 28, 27, 27, 27, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 25, + 25, 25, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 28, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 28, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 28, 28, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 31, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, + 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, + 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 31, 30, 28, 28, 26, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 31, 30, 28, 28, 26, 34, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 31, 30, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, + 29, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, + 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, + 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 
28, 27, 26, 34, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 33, 33, 33, 32, 32, 31, + 31, 31, 31, 31, 31, 30, 29, 28, 27, 26, 33, 32, 32, 31, 31, 31, 31, 31, + 31, 31, 31, 29, 28, 28, 26, 25, 32, 32, 31, 31, 30, 30, 30, 30, 30, 30, + 30, 29, 28, 27, 26, 25, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, + 28, 26, 26, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, + 25, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, + 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, + 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29, + 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29, 29, 28, + 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 30, 30, 29, 29, 28, 28, 28, 28, + 28, 28, 28, 27, 26, 26, 24, 23, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, + 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, + 29, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, + 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 29, 29, 28, 28, + 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 27, 27, 27, 27, + 27, 26, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 26, 29, 29, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 27, + 27, 27, 26, 26, 26, 25, 25, 25, 25, 25, 25, 24, 28, 28, 28, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, + 24, 24, 24, 24, 24, 24, 24, 23, + /* Size 4x16 */ + 33, 33, 33, 30, 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 33, 29, 33, 33, + 33, 29, 33, 33, 33, 29, 33, 32, 32, 28, 33, 32, 32, 28, 33, 32, 32, 28, + 33, 32, 32, 28, 33, 32, 32, 28, 32, 31, 31, 28, 31, 29, 29, 26, 30, 28, + 28, 26, 30, 28, 28, 26, 30, 28, 28, 26, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 30, 30, 29, 29, 29, 29, + 28, 28, 28, 28, 28, 28, 26, 26, 26, 
26, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 29, 33, 33, + 33, 33, 33, 33, 31, 28, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33, 33, 33, + 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, + 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, + 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, + 33, 33, 33, 33, 30, 28, 33, 33, 33, 32, 32, 32, 30, 28, 33, 33, 33, 32, + 32, 32, 30, 28, 34, 33, 33, 32, 32, 32, 30, 27, 34, 33, 32, 32, 32, 32, + 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, + 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, + 32, 32, 32, 32, 29, 27, 33, 33, 32, 31, 31, 31, 29, 27, 33, 32, 31, 31, + 31, 31, 28, 26, 32, 31, 30, 30, 30, 30, 28, 26, 31, 30, 30, 29, 29, 29, + 28, 26, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, + 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, + 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 30, 29, 28, 28, + 28, 28, 26, 24, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, + 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, + 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 31, 31, 31, 31, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 28, 28, + 28, 27, 27, 27, 27, 27, 27, 26, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 27, 27, 27, 27, 27, 27, 27, 27, 26, 26, 26, 25, 25, 25, + 25, 25, 25, 24 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + /* Size 8x8 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 
33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x8 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + /* Size 8x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + /* Size 8x8 */ + 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 
34, 34, 34, 34, 34, 34, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 
33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + /* Size 4x8 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 34, 33, 32, 32, + /* Size 8x4 */ + 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, + 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 32, 32, + 32, 32, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 
33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, + 32, 32, 33, 33, 32, 32, 34, 33, 32, 32, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 
33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32, + 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, + 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, + 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, + 32, 32, 32, 32, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32 }, + }, +}; diff --git a/libs/libaom/src/av1/common/quant_common.h b/libs/libaom/src/av1/common/quant_common.h new file mode 100644 index 000000000..9c30204ff --- /dev/null +++ b/libs/libaom/src/av1/common/quant_common.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_COMMON_QUANT_COMMON_H_
+#define AOM_AV1_COMMON_QUANT_COMMON_H_
+
+#include <stdbool.h>
+#include "aom/aom_codec.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/entropy.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MINQ 0
+#define MAXQ 255
+#define QINDEX_RANGE (MAXQ - MINQ + 1)
+#define QINDEX_BITS 8
+// Total number of QM sets stored
+#define QM_LEVEL_BITS 4
+#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
+/* The range of QM sets runs from the first to the last value, with an offset
+ * applied to inter blocks */
+#define DEFAULT_QM_Y 10
+#define DEFAULT_QM_U 11
+#define DEFAULT_QM_V 12
+#define DEFAULT_QM_FIRST 5
+#define DEFAULT_QM_LAST 9
+
+struct AV1Common;
+struct CommonQuantParams;
+struct macroblockd;
+
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
+
+int av1_get_qindex(const struct segmentation *seg, int segment_id,
+                   int base_qindex);
+
+// Returns true if we are using a quantization matrix.
+bool av1_use_qmatrix(const struct CommonQuantParams *quant_params,
+                     const struct macroblockd *xd, int segment_id);
+
+// Reduce the large number of quantizers to a smaller number of levels for
+// which different matrices may be defined.
+static INLINE int aom_get_qmlevel(int qindex, int first, int last) {
+  return first + (qindex * (last + 1 - first)) / QINDEX_RANGE;
+}
+
+// Initialize all global quant/dequant matrices.
+void av1_qm_init(struct CommonQuantParams *quant_params, int num_planes);
+
+// Get global dequant matrix.
+const qm_val_t *av1_iqmatrix(const struct CommonQuantParams *quant_params,
+                             int qmlevel, int plane, TX_SIZE tx_size);
+// Get global quant matrix.
+const qm_val_t *av1_qmatrix(const struct CommonQuantParams *quant_params,
+                            int qmlevel, int plane, TX_SIZE tx_size);
+
+// Get either the local or the global dequant matrix, as appropriate.
+const qm_val_t *av1_get_iqmatrix(const struct CommonQuantParams *quant_params,
+                                 const struct macroblockd *xd, int plane,
+                                 TX_SIZE tx_size, TX_TYPE tx_type);
+// Get either the local or the global quant matrix, as appropriate.
+const qm_val_t *av1_get_qmatrix(const struct CommonQuantParams *quant_params,
+                                const struct macroblockd *xd, int plane,
+                                TX_SIZE tx_size, TX_TYPE tx_type);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_QUANT_COMMON_H_
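To see what the QM-level mapping in this header works out to in practice, here is a small standalone sketch (illustration only, not part of the patch; it restates aom_get_qmlevel() with the header's default constants hard-coded):

#include <stdio.h>

/* Constants copied from quant_common.h above; get_qmlevel() mirrors the
 * body of aom_get_qmlevel(). */
#define QINDEX_RANGE 256
#define DEFAULT_QM_FIRST 5
#define DEFAULT_QM_LAST 9

static int get_qmlevel(int qindex, int first, int last) {
  /* Linear map of qindex in [0, 255] onto qmlevel in [first, last]. */
  return first + (qindex * (last + 1 - first)) / QINDEX_RANGE;
}

int main(void) {
  /* The default range yields five bands of 51-52 qindex values each:
   * 0..51 -> 5, 52..102 -> 6, 103..153 -> 7, 154..204 -> 8, 205..255 -> 9. */
  for (int qindex = 0; qindex <= 255; qindex += 51) {
    printf("qindex %3d -> qmlevel %d\n", qindex,
           get_qmlevel(qindex, DEFAULT_QM_FIRST, DEFAULT_QM_LAST));
  }
  return 0;
}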
diff --git a/libs/libaom/src/av1/common/reconinter.c b/libs/libaom/src/av1/common/reconinter.c
new file mode 100644
index 000000000..287adddcc
--- /dev/null
+++ b/libs/libaom/src/av1/common/reconinter.c
@@ -0,0 +1,1426 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+// This function will determine whether or not to create a warped
+// prediction.
+int av1_allow_warp(const MB_MODE_INFO *const mbmi,
+                   const WarpTypesAllowed *const warp_types,
+                   const WarpedMotionParams *const gm_params,
+                   int build_for_obmc, const struct scale_factors *const sf,
+                   WarpedMotionParams *final_warp_params) {
+  // Note: As per the spec, we must test the fixed point scales here, which are
+  // at a higher precision (1 << 14) than the xs and ys in subpel_params (that
+  // have 1 << 10 precision).
+  if (av1_is_scaled(sf)) return 0;
+
+  if (final_warp_params != NULL) *final_warp_params = default_warp_params;
+
+  if (build_for_obmc) return 0;
+
+  if (warp_types->local_warp_allowed && !mbmi->wm_params.invalid) {
+    if (final_warp_params != NULL)
+      memcpy(final_warp_params, &mbmi->wm_params, sizeof(*final_warp_params));
+    return 1;
+  } else if (warp_types->global_warp_allowed && !gm_params->invalid) {
+    if (final_warp_params != NULL)
+      memcpy(final_warp_params, gm_params, sizeof(*final_warp_params));
+    return 1;
+  }
+
+  return 0;
+}
+
+void av1_init_inter_params(InterPredParams *inter_pred_params, int block_width,
+                           int block_height, int pix_row, int pix_col,
+                           int subsampling_x, int subsampling_y, int bit_depth,
+                           int use_hbd_buf, int is_intrabc,
+                           const struct scale_factors *sf,
+                           const struct buf_2d *ref_buf,
+                           int_interpfilters interp_filters) {
+  inter_pred_params->block_width = block_width;
+  inter_pred_params->block_height = block_height;
+  inter_pred_params->pix_row = pix_row;
+  inter_pred_params->pix_col = pix_col;
+  inter_pred_params->subsampling_x = subsampling_x;
+  inter_pred_params->subsampling_y = subsampling_y;
+  inter_pred_params->bit_depth = bit_depth;
+  inter_pred_params->use_hbd_buf = use_hbd_buf;
+  inter_pred_params->is_intrabc = is_intrabc;
+  inter_pred_params->scale_factors = sf;
+  inter_pred_params->ref_frame_buf = *ref_buf;
+  inter_pred_params->mode = TRANSLATION_PRED;
+  inter_pred_params->comp_mode = UNIFORM_SINGLE;
+
+  if (is_intrabc) {
+    inter_pred_params->interp_filter_params[0] = &av1_intrabc_filter_params;
+    inter_pred_params->interp_filter_params[1] = &av1_intrabc_filter_params;
+  } else {
+    inter_pred_params->interp_filter_params[0] =
+        av1_get_interp_filter_params_with_block_size(
+            interp_filters.as_filters.x_filter, block_width);
+    inter_pred_params->interp_filter_params[1] =
+        av1_get_interp_filter_params_with_block_size(
+            interp_filters.as_filters.y_filter, block_height);
+  }
+}
+
+void av1_init_comp_mode(InterPredParams *inter_pred_params) {
+  inter_pred_params->comp_mode = UNIFORM_COMP;
+}
+
+void av1_init_warp_params(InterPredParams *inter_pred_params,
+                          const WarpTypesAllowed *warp_types, int ref,
+                          const MACROBLOCKD *xd, const MB_MODE_INFO *mi) {
+  if (inter_pred_params->block_height < 8 ||
+      inter_pred_params->block_width < 8)
+    return;
+
+  if (xd->cur_frame_force_integer_mv) return;
+
+  if (av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], 0,
+                     inter_pred_params->scale_factors,
+                     &inter_pred_params->warp_params))
+    inter_pred_params->mode = WARP_PRED;
+}
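Note the ordering above: av1_init_inter_params() always starts a block in TRANSLATION_PRED mode, and av1_init_warp_params() upgrades it to WARP_PRED only when av1_allow_warp() accepts. The acceptance order itself is fixed: a scaled reference or an OBMC build never warps, and a valid local (per-block) model takes precedence over a valid global (per-frame) model. A minimal standalone sketch of that precedence, using hypothetical stand-in types rather than libaom's (illustration only, not part of the patch):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for a warped-motion parameter set. */
struct warp_model { bool invalid; const char *name; };

/* Mirrors the decision order in av1_allow_warp(): no warp for scaled
 * references or OBMC builds; otherwise local beats global. */
static const struct warp_model *choose_warp(
    bool is_scaled, bool build_for_obmc,
    bool local_allowed, const struct warp_model *local,
    bool global_allowed, const struct warp_model *global) {
  if (is_scaled || build_for_obmc) return NULL;
  if (local_allowed && !local->invalid) return local;
  if (global_allowed && !global->invalid) return global;
  return NULL;  /* fall back to translational prediction */
}

int main(void) {
  const struct warp_model local = { false, "local" };
  const struct warp_model global = { false, "global" };
  const struct warp_model *m =
      choose_warp(false, false, true, &local, true, &global);
  printf("%s\n", m ? m->name : "translation");  /* prints "local" */
  return 0;
}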
+void av1_init_mask_comp(InterPredParams *inter_pred_params, BLOCK_SIZE bsize,
+                        const INTERINTER_COMPOUND_DATA *mask_comp) {
+  inter_pred_params->sb_type = bsize;
+  inter_pred_params->mask_comp = *mask_comp;
+
+  if (inter_pred_params->conv_params.compound_index == 1) {
+    inter_pred_params->conv_params.do_average = 0;
+    inter_pred_params->comp_mode = MASK_COMP;
+  }
+}
+
+void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+                              int dst_stride,
+                              InterPredParams *inter_pred_params,
+                              const SubpelParams *subpel_params) {
+  assert(IMPLIES(inter_pred_params->conv_params.is_compound,
+                 inter_pred_params->conv_params.dst != NULL));
+
+  // TODO(jingning): av1_warp_plane() can be further cleaned up.
+  if (inter_pred_params->mode == WARP_PRED) {
+    av1_warp_plane(
+        &inter_pred_params->warp_params, inter_pred_params->use_hbd_buf,
+        inter_pred_params->bit_depth, inter_pred_params->ref_frame_buf.buf0,
+        inter_pred_params->ref_frame_buf.width,
+        inter_pred_params->ref_frame_buf.height,
+        inter_pred_params->ref_frame_buf.stride, dst,
+        inter_pred_params->pix_col, inter_pred_params->pix_row,
+        inter_pred_params->block_width, inter_pred_params->block_height,
+        dst_stride, inter_pred_params->subsampling_x,
+        inter_pred_params->subsampling_y, &inter_pred_params->conv_params);
+  } else if (inter_pred_params->mode == TRANSLATION_PRED) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (inter_pred_params->use_hbd_buf) {
+      highbd_inter_predictor(
+          src, src_stride, dst, dst_stride, subpel_params,
+          inter_pred_params->scale_factors, inter_pred_params->block_width,
+          inter_pred_params->block_height, &inter_pred_params->conv_params,
+          inter_pred_params->interp_filter_params,
+          inter_pred_params->bit_depth);
+    } else {
+      inter_predictor(
+          src, src_stride, dst, dst_stride, subpel_params,
+          inter_pred_params->scale_factors, inter_pred_params->block_width,
+          inter_pred_params->block_height, &inter_pred_params->conv_params,
+          inter_pred_params->interp_filter_params);
+    }
+#else
+    inter_predictor(
+        src, src_stride, dst, dst_stride, subpel_params,
+        inter_pred_params->scale_factors, inter_pred_params->block_width,
+        inter_pred_params->block_height, &inter_pred_params->conv_params,
+        inter_pred_params->interp_filter_params);
+#endif
+  }
+}
+
+static const uint8_t wedge_master_oblique_odd[MASK_MASTER_SIZE] = {
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  2,  6,  18,
+  37, 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+static const uint8_t wedge_master_oblique_even[MASK_MASTER_SIZE] = {
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  4,  11, 27,
+  46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+static const uint8_t wedge_master_vertical[MASK_MASTER_SIZE] = {
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  7,  21,
+  43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+
+static AOM_INLINE void shift_copy(const uint8_t *src, uint8_t *dst, int shift,
+                                  int width) {
+  if (shift >= 0) {
+    memcpy(dst + shift, src, width - shift);
+    memset(dst, src[0], shift);
+  } else {
+    shift = -shift;
+    memcpy(dst, src + shift, width - shift);
+    memset(dst + width - shift, src[width - 1], shift);
+  }
+}
+
+/* clang-format off */
+DECLARE_ALIGNED(16, static uint8_t,
                wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]) = {
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+};
+/* clang-format on */
+
+// [negative][direction]
+DECLARE_ALIGNED(
+    16, static uint8_t,
+    wedge_mask_obl[2][WEDGE_DIRECTIONS][MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
+
+// 4 * MAX_WEDGE_SQUARE is an easy-to-compute and fairly tight upper bound
+// on the sum of all mask sizes up to and including MAX_WEDGE_SQUARE.
+DECLARE_ALIGNED(16, static uint8_t,
+                wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]);
+
+DECLARE_ALIGNED(16, static uint8_t,
+                smooth_interintra_mask_buf[INTERINTRA_MODES][BLOCK_SIZES_ALL]
+                                          [MAX_WEDGE_SQUARE]);
+
+static wedge_masks_type wedge_masks[BLOCK_SIZES_ALL][2];
+
+static const wedge_code_type wedge_codebook_16_hgtw[16] = {
+  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
+  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+  { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
+  { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
+  { WEDGE_OBLIQUE27, 4, 2 },  { WEDGE_OBLIQUE27, 4, 6 },
+  { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+  { WEDGE_OBLIQUE63, 2, 4 },  { WEDGE_OBLIQUE63, 6, 4 },
+  { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_hltw[16] = {
+  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
+  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+  { WEDGE_VERTICAL, 2, 4 },   { WEDGE_VERTICAL, 4, 4 },
+  { WEDGE_VERTICAL, 6, 4 },   { WEDGE_HORIZONTAL, 4, 4 },
+  { WEDGE_OBLIQUE27, 4, 2 },  { WEDGE_OBLIQUE27, 4, 6 },
+  { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+  { WEDGE_OBLIQUE63, 2, 4 },  { WEDGE_OBLIQUE63, 6, 4 },
+  { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_heqw[16] = {
+  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
+  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+  { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
+  { WEDGE_VERTICAL, 2, 4 },   { WEDGE_VERTICAL, 6, 4 },
+  { WEDGE_OBLIQUE27, 4, 2 },  { WEDGE_OBLIQUE27, 4, 6 },
+  { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+  { WEDGE_OBLIQUE63, 2, 4 },  { WEDGE_OBLIQUE63, 6, 4 },
+  {
WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, +}; + +const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL] = { + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], + wedge_masks[BLOCK_8X8] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], + wedge_masks[BLOCK_8X16] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], + wedge_masks[BLOCK_16X8] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], + wedge_masks[BLOCK_16X16] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], + wedge_masks[BLOCK_16X32] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], + wedge_masks[BLOCK_32X16] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], + wedge_masks[BLOCK_32X32] }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32], + wedge_masks[BLOCK_8X32] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8], + wedge_masks[BLOCK_32X8] }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, +}; + +static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg, + BLOCK_SIZE sb_type) { + const uint8_t *master; + const int bh = block_size_high[sb_type]; + const int bw = block_size_wide[sb_type]; + const wedge_code_type *a = + av1_wedge_params_lookup[sb_type].codebook + wedge_index; + int woff, hoff; + const uint8_t wsignflip = + av1_wedge_params_lookup[sb_type].signflip[wedge_index]; + + assert(wedge_index >= 0 && wedge_index < get_wedge_types_lookup(sb_type)); + woff = (a->x_offset * bw) >> 3; + hoff = (a->y_offset * bh) >> 3; + master = wedge_mask_obl[neg ^ wsignflip][a->direction] + + MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) + + MASK_MASTER_SIZE / 2 - woff; + return master; +} + +const uint8_t *av1_get_compound_type_mask( + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type) { + assert(is_masked_compound_type(comp_data->type)); + (void)sb_type; + switch (comp_data->type) { + case COMPOUND_WEDGE: + return av1_get_contiguous_soft_mask(comp_data->wedge_index, + comp_data->wedge_sign, sb_type); + case COMPOUND_DIFFWTD: return comp_data->seg_mask; + default: assert(0); return NULL; + } +} + +static AOM_INLINE void diffwtd_mask_d16( + uint8_t *mask, int which_inverse, int mask_base, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + int round = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + int i, j, m, diff; + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + diff = abs(src0[i * src0_stride + j] - src1[i * src1_stride + j]); + diff = ROUND_POWER_OF_TWO(diff, round); + m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA); + mask[i * w + j] = which_inverse ? 
AOM_BLEND_A64_MAX_ALPHA - m : m; + } + } +} + +void av1_build_compound_diffwtd_mask_d16_c( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + switch (mask_type) { + case DIFFWTD_38: + diffwtd_mask_d16(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w, + conv_params, bd); + break; + case DIFFWTD_38_INV: + diffwtd_mask_d16(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w, + conv_params, bd); + break; + default: assert(0); + } +} + +static AOM_INLINE void diffwtd_mask(uint8_t *mask, int which_inverse, + int mask_base, const uint8_t *src0, + int src0_stride, const uint8_t *src1, + int src1_stride, int h, int w) { + int i, j, m, diff; + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + diff = + abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]); + m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA); + mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m; + } + } +} + +void av1_build_compound_diffwtd_mask_c(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + int h, int w) { + switch (mask_type) { + case DIFFWTD_38: + diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w); + break; + case DIFFWTD_38_INV: + diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w); + break; + default: assert(0); + } +} + +static AOM_FORCE_INLINE void diffwtd_mask_highbd( + uint8_t *mask, int which_inverse, int mask_base, const uint16_t *src0, + int src0_stride, const uint16_t *src1, int src1_stride, int h, int w, + const unsigned int bd) { + assert(bd >= 8); + if (bd == 8) { + if (which_inverse) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = AOM_BLEND_A64_MAX_ALPHA - m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } + } else { + const unsigned int bd_shift = bd - 8; + if (which_inverse) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = + (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = AOM_BLEND_A64_MAX_ALPHA - m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = + (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } + } +} + +void av1_build_compound_diffwtd_mask_highbd_c( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { + switch (mask_type) { + case DIFFWTD_38: + diffwtd_mask_highbd(mask, 0, 38, CONVERT_TO_SHORTPTR(src0), src0_stride, + 
CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd); + break; + case DIFFWTD_38_INV: + diffwtd_mask_highbd(mask, 1, 38, CONVERT_TO_SHORTPTR(src0), src0_stride, + CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd); + break; + default: assert(0); + } +} + +static AOM_INLINE void init_wedge_master_masks() { + int i, j; + const int w = MASK_MASTER_SIZE; + const int h = MASK_MASTER_SIZE; + const int stride = MASK_MASTER_STRIDE; + // Note: index [0] stores the masters, and [1] its complement. + // Generate prototype by shifting the masters + int shift = h / 4; + for (i = 0; i < h; i += 2) { + shift_copy(wedge_master_oblique_even, + &wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride], shift, + MASK_MASTER_SIZE); + shift--; + shift_copy(wedge_master_oblique_odd, + &wedge_mask_obl[0][WEDGE_OBLIQUE63][(i + 1) * stride], shift, + MASK_MASTER_SIZE); + memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][i * stride], + wedge_master_vertical, + MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0])); + memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][(i + 1) * stride], + wedge_master_vertical, + MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0])); + } + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j]; + wedge_mask_obl[0][WEDGE_OBLIQUE27][j * stride + i] = msk; + wedge_mask_obl[0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] = + wedge_mask_obl[0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = + (1 << WEDGE_WEIGHT_BITS) - msk; + wedge_mask_obl[1][WEDGE_OBLIQUE63][i * stride + j] = + wedge_mask_obl[1][WEDGE_OBLIQUE27][j * stride + i] = + (1 << WEDGE_WEIGHT_BITS) - msk; + wedge_mask_obl[1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] = + wedge_mask_obl[1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = msk; + const int mskx = wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j]; + wedge_mask_obl[0][WEDGE_HORIZONTAL][j * stride + i] = mskx; + wedge_mask_obl[1][WEDGE_VERTICAL][i * stride + j] = + wedge_mask_obl[1][WEDGE_HORIZONTAL][j * stride + i] = + (1 << WEDGE_WEIGHT_BITS) - mskx; + } + } +} + +static AOM_INLINE void init_wedge_masks() { + uint8_t *dst = wedge_mask_buf; + BLOCK_SIZE bsize; + memset(wedge_masks, 0, sizeof(wedge_masks)); + for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) { + const wedge_params_type *wedge_params = &av1_wedge_params_lookup[bsize]; + const int wtypes = wedge_params->wedge_types; + if (wtypes == 0) continue; + const uint8_t *mask; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + int w; + for (w = 0; w < wtypes; ++w) { + mask = get_wedge_mask_inplace(w, 0, bsize); + aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw, + bh); + wedge_params->masks[0][w] = dst; + dst += bw * bh; + + mask = get_wedge_mask_inplace(w, 1, bsize); + aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw, + bh); + wedge_params->masks[1][w] = dst; + dst += bw * bh; + } + assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf)); + } +} + +/* clang-format off */ +static const uint8_t ii_weights1d[MAX_SB_SIZE] = { + 60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32, + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 4, 4, + 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +}; +static uint8_t 
ii_size_scales[BLOCK_SIZES_ALL] = { + 32, 16, 16, 16, 8, 8, 8, 4, + 4, 4, 2, 2, 2, 1, 1, 1, + 8, 8, 4, 4, 2, 2 +}; +/* clang-format on */ + +static AOM_INLINE void build_smooth_interintra_mask(uint8_t *mask, int stride, + BLOCK_SIZE plane_bsize, + INTERINTRA_MODE mode) { + int i, j; + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + const int size_scale = ii_size_scales[plane_bsize]; + + switch (mode) { + case II_V_PRED: + for (i = 0; i < bh; ++i) { + memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0])); + mask += stride; + } + break; + + case II_H_PRED: + for (i = 0; i < bh; ++i) { + for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale]; + mask += stride; + } + break; + + case II_SMOOTH_PRED: + for (i = 0; i < bh; ++i) { + for (j = 0; j < bw; ++j) + mask[j] = ii_weights1d[(i < j ? i : j) * size_scale]; + mask += stride; + } + break; + + case II_DC_PRED: + default: + for (i = 0; i < bh; ++i) { + memset(mask, 32, bw * sizeof(mask[0])); + mask += stride; + } + break; + } +} + +static AOM_INLINE void init_smooth_interintra_masks() { + for (int m = 0; m < INTERINTRA_MODES; ++m) { + for (int bs = 0; bs < BLOCK_SIZES_ALL; ++bs) { + const int bw = block_size_wide[bs]; + const int bh = block_size_high[bs]; + if (bw > MAX_WEDGE_SIZE || bh > MAX_WEDGE_SIZE) continue; + build_smooth_interintra_mask(smooth_interintra_mask_buf[m][bs], bw, bs, + m); + } + } +} + +// Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0 +void av1_init_wedge_masks() { + init_wedge_master_masks(); + init_wedge_masks(); + init_smooth_interintra_masks(); +} + +static AOM_INLINE void build_masked_compound_no_round( + uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, + int w, InterPredParams *inter_pred_params) { + const int ssy = inter_pred_params->subsampling_y; + const int ssx = inter_pred_params->subsampling_x; + const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); + const int mask_stride = block_size_wide[sb_type]; +#if CONFIG_AV1_HIGHBITDEPTH + if (inter_pred_params->use_hbd_buf) { + aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, ssx, + ssy, &inter_pred_params->conv_params, + inter_pred_params->bit_depth); + } else { + aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, ssx, ssy, + &inter_pred_params->conv_params); + } +#else + aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, ssx, ssy, + &inter_pred_params->conv_params); +#endif +} + +void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride, + uint8_t *dst, int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params) { + const INTERINTER_COMPOUND_DATA *comp_data = &inter_pred_params->mask_comp; + BLOCK_SIZE sb_type = inter_pred_params->sb_type; + + // We're going to call av1_make_inter_predictor to generate a prediction into + // a temporary buffer, then will blend that temporary buffer with that from + // the other reference. + DECLARE_ALIGNED(32, uint8_t, tmp_buf[2 * MAX_SB_SQUARE]); + uint8_t *tmp_dst = + inter_pred_params->use_hbd_buf ? 
CONVERT_TO_BYTEPTR(tmp_buf) : tmp_buf; + + const int tmp_buf_stride = MAX_SB_SIZE; + CONV_BUF_TYPE *org_dst = inter_pred_params->conv_params.dst; + int org_dst_stride = inter_pred_params->conv_params.dst_stride; + CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf; + inter_pred_params->conv_params.dst = tmp_buf16; + inter_pred_params->conv_params.dst_stride = tmp_buf_stride; + assert(inter_pred_params->conv_params.do_average == 0); + + // This will generate a prediction in tmp_buf for the second reference + av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, + inter_pred_params, subpel_params); + + if (!inter_pred_params->conv_params.plane && + comp_data->type == COMPOUND_DIFFWTD) { + av1_build_compound_diffwtd_mask_d16( + comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride, + tmp_buf16, tmp_buf_stride, inter_pred_params->block_height, + inter_pred_params->block_width, &inter_pred_params->conv_params, + inter_pred_params->bit_depth); + } + build_masked_compound_no_round( + dst, dst_stride, org_dst, org_dst_stride, tmp_buf16, tmp_buf_stride, + comp_data, sb_type, inter_pred_params->block_height, + inter_pred_params->block_width, inter_pred_params); +} + +void av1_build_one_inter_predictor( + uint8_t *dst, int dst_stride, const MV *const src_mv, + InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y, + int ref, CalcSubpelParamsFunc calc_subpel_params_func) { + SubpelParams subpel_params; + uint8_t *src; + int src_stride; + calc_subpel_params_func(src_mv, inter_pred_params, xd, mi_x, mi_y, ref, &src, + &subpel_params, &src_stride); + + if (inter_pred_params->comp_mode == UNIFORM_SINGLE || + inter_pred_params->comp_mode == UNIFORM_COMP) { + av1_make_inter_predictor(src, src_stride, dst, dst_stride, + inter_pred_params, &subpel_params); + } else { + av1_make_masked_inter_predictor(src, src_stride, dst, dst_stride, + inter_pred_params, &subpel_params); + } +} + +// True if all of the following hold: +// 1. Not intrabc and not build_for_obmc +// 2. A U or V plane (the luma plane is never sub-sampled, so it never takes +// this path) +// 3. The plane is sub-sampled in a dimension where the block is only 4 +// pixels wide or high +// 4. All of the neighboring blocks covered by the sub-sample are inter +// blocks, and none of them uses intrabc +static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize, + int is_intrabc, int build_for_obmc) { + if (is_intrabc || build_for_obmc) { + return false; + } + + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + if ((block_size_wide[bsize] >= 8 || !ss_x) && + (block_size_high[bsize] >= 8 || !ss_y)) { + return false; + } + + // For sub8x8 chroma blocks, we may be covering more than one luma block's + // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for + // the top-left corner of the prediction source - the correct top-left corner + // is at (pre_x, pre_y). + const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0; + const int col_start = (block_size_wide[bsize] == 4) && ss_x ? 
-1 : 0; + + for (int row = row_start; row <= 0; ++row) { + for (int col = col_start; col <= 0; ++col) { + const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + if (!is_inter_block(this_mbmi)) return false; + if (is_intrabc_block(this_mbmi)) return false; + } + } + return true; +} + +static void build_inter_predictors_sub8x8( + const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, + int bw, int bh, int mi_x, int mi_y, + CalcSubpelParamsFunc calc_subpel_params_func) { + const BLOCK_SIZE bsize = mi->sb_type; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const bool ss_x = pd->subsampling_x; + const bool ss_y = pd->subsampling_y; + const int b4_w = block_size_wide[bsize] >> ss_x; + const int b4_h = block_size_high[bsize] >> ss_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + const int b8_w = block_size_wide[plane_bsize]; + const int b8_h = block_size_high[plane_bsize]; + const int is_compound = has_second_ref(mi); + assert(!is_compound); + assert(!is_intrabc_block(mi)); + + // For sub8x8 chroma blocks, we may be covering more than one luma block's + // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for + // the top-left corner of the prediction source - the correct top-left corner + // is at (pre_x, pre_y). + const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0; + const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0; + const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; + + int row = row_start; + for (int y = 0; y < b8_h; y += b4_h) { + int col = col_start; + for (int x = 0; x < b8_w; x += b4_w) { + MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + int tmp_dst_stride = 8; + assert(bw < 8 || bh < 8); + (void)bw; + (void)bh; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x; + int ref = 0; + const RefCntBuffer *ref_buf = + get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]); + const struct scale_factors *ref_scale_factors = + get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]); + const struct scale_factors *const sf = ref_scale_factors; + const struct buf_2d pre_buf = { + NULL, + (plane == 1) ? 
ref_buf->buf.u_buffer : ref_buf->buf.v_buffer, + ref_buf->buf.uv_crop_width, + ref_buf->buf.uv_crop_height, + ref_buf->buf.uv_stride, + }; + + const MV mv = this_mbmi->mv[ref].as_mv; + + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y, + pre_x + x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf, + &pre_buf, this_mbmi->interp_filters); + inter_pred_params.conv_params = get_conv_params_no_round( + ref, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd); + inter_pred_params.conv_params.use_dist_wtd_comp_avg = 0; + + av1_build_one_inter_predictor(dst, dst_buf->stride, &mv, + &inter_pred_params, xd, mi_x + x, mi_y + y, + ref, calc_subpel_params_func); + + ++col; + } + ++row; + } +} + +static void build_inter_predictors_8x8_and_bigger( + const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, int mi_x, int mi_y, + CalcSubpelParamsFunc calc_subpel_params_func) { + const int is_compound = has_second_ref(mi); + const int is_intrabc = is_intrabc_block(mi); + assert(IMPLIES(is_intrabc, !is_compound)); + struct macroblockd_plane *const pd = &xd->plane[plane]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf; + + int is_global[2] = { 0, 0 }; + for (int ref = 0; ref < 1 + is_compound; ++ref) { + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; + is_global[ref] = is_global_mv_block(mi, wm->wmtype); + } + + const BLOCK_SIZE bsize = mi->sb_type; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const int row_start = + (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0; + const int col_start = + (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0; + const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; + + for (int ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref]; + struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; + const MV mv = mi->mv[ref].as_mv; + const WarpTypesAllowed warp_types = { is_global[ref], + mi->motion_mode == WARPED_CAUSAL }; + + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x, + pd->subsampling_x, pd->subsampling_y, xd->bd, + is_cur_buf_hbd(xd), mi->use_intrabc, sf, pre_buf, + mi->interp_filters); + if (is_compound) av1_init_comp_mode(&inter_pred_params); + inter_pred_params.conv_params = get_conv_params_no_round( + ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd); + + av1_dist_wtd_comp_weight_assign( + cm, mi, 0, &inter_pred_params.conv_params.fwd_offset, + &inter_pred_params.conv_params.bck_offset, + &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound); + + if (!build_for_obmc) + av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi); + + if (is_masked_compound_type(mi->interinter_comp.type)) { + av1_init_mask_comp(&inter_pred_params, mi->sb_type, &mi->interinter_comp); + // Assign physical buffer. 
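+      // (Note, inferred from av1_make_masked_inter_predictor above:
+      // xd->seg_mask is the scratch buffer that
+      // av1_build_compound_diffwtd_mask_d16 later fills when the compound
+      // type is COMPOUND_DIFFWTD; mask_comp itself only carries the pointer.)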
+ inter_pred_params.mask_comp.seg_mask = xd->seg_mask; + } + + av1_build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params, + xd, mi_x, mi_y, ref, calc_subpel_params_func); + } +} + +void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, int mi_x, + int mi_y, + CalcSubpelParamsFunc calc_subpel_params_func) { + if (is_sub8x8_inter(xd, plane, mi->sb_type, is_intrabc_block(mi), + build_for_obmc)) { + build_inter_predictors_sub8x8(cm, xd, plane, mi, bw, bh, mi_x, mi_y, + calc_subpel_params_func); + } else { + build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw, + bh, mi_x, mi_y, + calc_subpel_params_func); + } +} + +void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, + const MB_MODE_INFO *mbmi, int order_idx, + int *fwd_offset, int *bck_offset, + int *use_dist_wtd_comp_avg, + int is_compound) { + assert(fwd_offset != NULL && bck_offset != NULL); + if (!is_compound || mbmi->compound_idx) { + *use_dist_wtd_comp_avg = 0; + return; + } + + *use_dist_wtd_comp_avg = 1; + const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]); + const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]); + const int cur_frame_index = cm->cur_frame->order_hint; + int bck_frame_index = 0, fwd_frame_index = 0; + + if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint; + if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint; + + int d0 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info, + fwd_frame_index, cur_frame_index)), + 0, MAX_FRAME_DISTANCE); + int d1 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info, + cur_frame_index, bck_frame_index)), + 0, MAX_FRAME_DISTANCE); + + const int order = d0 <= d1; + + if (d0 == 0 || d1 == 0) { + *fwd_offset = quant_dist_lookup_table[order_idx][3][order]; + *bck_offset = quant_dist_lookup_table[order_idx][3][1 - order]; + return; + } + + int i; + for (i = 0; i < 3; ++i) { + int c0 = quant_dist_weight[i][order]; + int c1 = quant_dist_weight[i][!order]; + int d0_c0 = d0 * c0; + int d1_c1 = d1 * c1; + if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break; + } + + *fwd_offset = quant_dist_lookup_table[order_idx][i][order]; + *bck_offset = quant_dist_lookup_table[order_idx][i][1 - order]; +} + +void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const int plane_start, const int plane_end) { + // We use AOMMIN(plane_end, MAX_MB_PLANE) instead of plane_end to quiet + // the static analysis warnings. + for (int i = plane_start; i < AOMMIN(plane_end, MAX_MB_PLANE); ++i) { + struct macroblockd_plane *const pd = &planes[i]; + const int is_uv = i > 0; + setup_pred_plane(&pd->dst, bsize, src->buffers[i], src->crop_widths[is_uv], + src->crop_heights[is_uv], src->strides[is_uv], mi_row, + mi_col, NULL, pd->subsampling_x, pd->subsampling_y); + } +} + +void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *sf, + const int num_planes) { + if (src != NULL) { + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. 
+ for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + const int is_uv = i > 0; + setup_pred_plane(&pd->pre[idx], xd->mi[0]->sb_type, src->buffers[i], + src->crop_widths[is_uv], src->crop_heights[is_uv], + src->strides[is_uv], mi_row, mi_col, sf, + pd->subsampling_x, pd->subsampling_y); + } + } +} + +// obmc_mask_N[overlap_position] +static const uint8_t obmc_mask_1[1] = { 64 }; +DECLARE_ALIGNED(2, static const uint8_t, obmc_mask_2[2]) = { 45, 64 }; + +DECLARE_ALIGNED(4, static const uint8_t, obmc_mask_4[4]) = { 39, 50, 59, 64 }; + +static const uint8_t obmc_mask_8[8] = { 36, 42, 48, 53, 57, 61, 64, 64 }; + +static const uint8_t obmc_mask_16[16] = { 34, 37, 40, 43, 46, 49, 52, 54, + 56, 58, 60, 61, 64, 64, 64, 64 }; + +static const uint8_t obmc_mask_32[32] = { 33, 35, 36, 38, 40, 41, 43, 44, + 45, 47, 48, 50, 51, 52, 53, 55, + 56, 57, 58, 59, 60, 60, 61, 62, + 64, 64, 64, 64, 64, 64, 64, 64 }; + +static const uint8_t obmc_mask_64[64] = { + 33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44, + 45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56, + 56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62, + 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +}; + +const uint8_t *av1_get_obmc_mask(int length) { + switch (length) { + case 1: return obmc_mask_1; + case 2: return obmc_mask_2; + case 4: return obmc_mask_4; + case 8: return obmc_mask_8; + case 16: return obmc_mask_16; + case 32: return obmc_mask_32; + case 64: return obmc_mask_64; + default: assert(0); return NULL; + } +} + +static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_row, + int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *mi, void *fun_ctxt, + const int num_planes) { + (void)xd; + (void)rel_mi_row; + (void)rel_mi_col; + (void)op_mi_size; + (void)dir; + (void)mi; + ++*(int *)fun_ctxt; + (void)num_planes; +} + +void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd) { + MB_MODE_INFO *mbmi = xd->mi[0]; + + mbmi->overlappable_neighbors[0] = 0; + mbmi->overlappable_neighbors[1] = 0; + + if (!is_motion_variation_allowed_bsize(mbmi->sb_type)) return; + + foreach_overlappable_nb_above(cm, xd, INT_MAX, increment_int_ptr, + &mbmi->overlappable_neighbors[0]); + foreach_overlappable_nb_left(cm, xd, INT_MAX, increment_int_ptr, + &mbmi->overlappable_neighbors[1]); +} + +// HW does not support < 4x4 prediction. To limit the bandwidth requirement, if +// block-size of current plane is smaller than 8x8, always only blend with the +// left neighbor(s) (skip blending with the above side). 
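+// E.g. with 4:2:0 subsampling, an 8x8 luma block has a 4x4 chroma block:
+// with DISABLE_CHROMA_U8X8_OBMC set to 0 below, that chroma block is blended
+// only with its left neighbor(s); set to 1, it is not blended at all.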
+#define DISABLE_CHROMA_U8X8_OBMC 0 // 0: one-sided obmc; 1: disable + +int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, + const struct macroblockd_plane *pd, int dir) { + assert(is_motion_variation_allowed_bsize(bsize)); + + const BLOCK_SIZE bsize_plane = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + switch (bsize_plane) { +#if DISABLE_CHROMA_U8X8_OBMC + case BLOCK_4X4: + case BLOCK_8X4: + case BLOCK_4X8: return 1; break; +#else + case BLOCK_4X4: + case BLOCK_8X4: + case BLOCK_4X8: return dir == 0; break; +#endif + default: return 0; + } +} + +void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) { + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + + return; +} + +struct obmc_inter_pred_ctxt { + uint8_t **adjacent; + int *adjacent_stride; +}; + +static INLINE void build_obmc_inter_pred_above( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *above_mi, void *fun_ctxt, const int num_planes) { + (void)above_mi; + (void)rel_mi_row; + (void)dir; + struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + const int overlap = + AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1; + + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + const int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x; + const int bh = overlap >> pd->subsampling_y; + const int plane_col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x; + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; + + const int dst_stride = pd->dst.stride; + uint8_t *const dst = &pd->dst.buf[plane_col]; + const int tmp_stride = ctxt->adjacent_stride[plane]; + const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col]; + const uint8_t *const mask = av1_get_obmc_mask(bh); +#if CONFIG_AV1_HIGHBITDEPTH + const int is_hbd = is_cur_buf_hbd(xd); + if (is_hbd) + aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, + tmp_stride, mask, bw, bh, xd->bd); + else + aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, + mask, bw, bh); +#else + aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, + bw, bh); +#endif + } +} + +static INLINE void build_obmc_inter_pred_left( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *left_mi, void *fun_ctxt, const int num_planes) { + (void)left_mi; + (void)rel_mi_col; + (void)dir; + struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + const int overlap = + AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1; + + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + const int bw = overlap >> pd->subsampling_x; + const int bh = (op_mi_size * MI_SIZE) >> pd->subsampling_y; + const int plane_row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y; + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; + + const int dst_stride = pd->dst.stride; + uint8_t *const dst = &pd->dst.buf[plane_row * dst_stride]; + const int tmp_stride = ctxt->adjacent_stride[plane]; + const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride]; + const uint8_t *const mask = av1_get_obmc_mask(bw); + +#if CONFIG_AV1_HIGHBITDEPTH + const int is_hbd = is_cur_buf_hbd(xd); + if (is_hbd) + aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, 
tmp, + tmp_stride, mask, bw, bh, xd->bd); + else + aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, + mask, bw, bh); +#else + aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, + bw, bh); +#endif + } +} + +// This function combines motion compensated predictions that are generated by +// top/left neighboring blocks' inter predictors with the regular inter +// prediction. We assume the original prediction (bmc) is stored in +// xd->plane[].dst.buf +void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *above[MAX_MB_PLANE], + int above_stride[MAX_MB_PLANE], + uint8_t *left[MAX_MB_PLANE], + int left_stride[MAX_MB_PLANE]) { + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + + // handle above row + struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride }; + foreach_overlappable_nb_above(cm, xd, + max_neighbor_obmc[mi_size_wide_log2[bsize]], + build_obmc_inter_pred_above, &ctxt_above); + + // handle left column + struct obmc_inter_pred_ctxt ctxt_left = { left, left_stride }; + foreach_overlappable_nb_left(cm, xd, + max_neighbor_obmc[mi_size_high_log2[bsize]], + build_obmc_inter_pred_left, &ctxt_left); +} + +void av1_setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset, + int mi_col_offset, MB_MODE_INFO *ref_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes) { + const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->sb_type); + const int ref_mi_row = xd->mi_row + mi_row_offset; + const int ref_mi_col = xd->mi_col + mi_col_offset; + + for (int plane = 0; plane < num_planes; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane], + ctxt->tmp_width[plane], ctxt->tmp_height[plane], + ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset, + NULL, pd->subsampling_x, pd->subsampling_y); + } + + const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0]; + + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const sf = + get_ref_scale_factors_const(ctxt->cm, frame); + + xd->block_ref_scale_factors[0] = sf; + if ((!av1_is_valid_scale(sf))) + aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + + av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf, + num_planes); +} + +void av1_setup_build_prediction_by_above_pred( + MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, + MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, + const int num_planes) { + const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type); + const int above_mi_col = xd->mi_col + rel_mi_col; + + av1_modify_neighbor_predictor_for_obmc(above_mbmi); + + for (int j = 0; j < num_planes; ++j) { + struct macroblockd_plane *const pd = &xd->plane[j]; + setup_pred_plane(&pd->dst, a_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j], + ctxt->tmp_height[j], ctxt->tmp_stride[j], 0, rel_mi_col, + NULL, pd->subsampling_x, pd->subsampling_y); + } + + const int num_refs = 1 + has_second_ref(above_mbmi); + + for (int ref = 0; ref < num_refs; ++ref) { + const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref]; + + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const sf = + get_ref_scale_factors_const(ctxt->cm, frame); + xd->block_ref_scale_factors[ref] = sf; + if ((!av1_is_valid_scale(sf))) + aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid 
dimensions"); + av1_setup_pre_planes(xd, ref, &ref_buf->buf, xd->mi_row, above_mi_col, sf, + num_planes); + } + + xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col); + xd->mb_to_right_edge = + ctxt->mb_to_far_edge + + (xd->width - rel_mi_col - above_mi_width) * MI_SIZE * 8; +} + +void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, + uint8_t left_mi_height, + MB_MODE_INFO *left_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes) { + const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type); + const int left_mi_row = xd->mi_row + rel_mi_row; + + av1_modify_neighbor_predictor_for_obmc(left_mbmi); + + for (int j = 0; j < num_planes; ++j) { + struct macroblockd_plane *const pd = &xd->plane[j]; + setup_pred_plane(&pd->dst, l_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j], + ctxt->tmp_height[j], ctxt->tmp_stride[j], rel_mi_row, 0, + NULL, pd->subsampling_x, pd->subsampling_y); + } + + const int num_refs = 1 + has_second_ref(left_mbmi); + + for (int ref = 0; ref < num_refs; ++ref) { + const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref]; + + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const ref_scale_factors = + get_ref_scale_factors_const(ctxt->cm, frame); + + xd->block_ref_scale_factors[ref] = ref_scale_factors; + if ((!av1_is_valid_scale(ref_scale_factors))) + aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + av1_setup_pre_planes(xd, ref, &ref_buf->buf, left_mi_row, xd->mi_col, + ref_scale_factors, num_planes); + } + + xd->mb_to_top_edge = GET_MV_SUBPEL(MI_SIZE * (-left_mi_row)); + xd->mb_to_bottom_edge = + ctxt->mb_to_far_edge + + GET_MV_SUBPEL((xd->height - rel_mi_row - left_mi_height) * MI_SIZE); +} + +static AOM_INLINE void combine_interintra( + INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index, + int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, + uint8_t *comppred, int compstride, const uint8_t *interpred, + int interstride, const uint8_t *intrapred, int intrastride) { + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + if (use_wedge_interintra) { + if (av1_is_wedge_used(bsize)) { + const uint8_t *mask = + av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); + const int subw = 2 * mi_size_wide[bsize] == bw; + const int subh = 2 * mi_size_high[bsize] == bh; + aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, + interpred, interstride, mask, block_size_wide[bsize], + bw, bh, subw, subh); + } + return; + } + + const uint8_t *mask = smooth_interintra_mask_buf[mode][plane_bsize]; + aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred, + interstride, mask, bw, bw, bh, 0, 0); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static AOM_INLINE void combine_interintra_highbd( + INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index, + int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, + uint8_t *comppred8, int compstride, const uint8_t *interpred8, + int interstride, const uint8_t *intrapred8, int intrastride, int bd) { + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + if (use_wedge_interintra) { + if (av1_is_wedge_used(bsize)) { + const uint8_t *mask = + av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); + const int subh = 2 * mi_size_high[bsize] == bh; + const int subw = 2 * mi_size_wide[bsize] == bw; + aom_highbd_blend_a64_mask(comppred8, 
compstride, intrapred8, intrastride, + interpred8, interstride, mask, + block_size_wide[bsize], bw, bh, subw, subh, bd); + } + return; + } + + uint8_t mask[MAX_SB_SQUARE]; + build_smooth_interintra_mask(mask, bw, plane_bsize, mode); + aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride, + interpred8, interstride, mask, bw, bw, bh, 0, 0, + bd); +} +#endif + +void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, + MACROBLOCKD *xd, + BLOCK_SIZE bsize, int plane, + const BUFFER_SET *ctx, + uint8_t *dst, int dst_stride) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ssx = xd->plane[plane].subsampling_x; + const int ssy = xd->plane[plane].subsampling_y; + BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy); + PREDICTION_MODE mode = interintra_to_intra_mode[xd->mi[0]->interintra_mode]; + assert(xd->mi[0]->angle_delta[PLANE_TYPE_Y] == 0); + assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0); + assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0); + assert(xd->mi[0]->use_intrabc == 0); + + av1_predict_intra_block(cm, xd, pd->width, pd->height, + max_txsize_rect_lookup[plane_bsize], mode, 0, 0, + FILTER_INTRA_MODES, ctx->plane[plane], + ctx->stride[plane], dst, dst_stride, 0, 0, plane); +} + +void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, + const uint8_t *inter_pred, int inter_stride, + const uint8_t *intra_pred, int intra_stride) { + const int ssx = xd->plane[plane].subsampling_x; + const int ssy = xd->plane[plane].subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy); +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + combine_interintra_highbd( + xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, + xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize, + plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, + inter_pred, inter_stride, intra_pred, intra_stride, xd->bd); + return; + } +#endif + combine_interintra( + xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, + xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize, + plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, + inter_pred, inter_stride, intra_pred, intra_stride); +} + +// build interintra_predictors for one plane +void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *pred, int stride, + const BUFFER_SET *ctx, int plane, + BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + if (is_cur_buf_hbd(xd)) { + DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]); + av1_build_intra_predictors_for_interintra( + cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor), + MAX_SB_SIZE); + av1_combine_interintra(xd, bsize, plane, pred, stride, + CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE); + } else { + DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]); + av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx, + intrapredictor, MAX_SB_SIZE); + av1_combine_interintra(xd, bsize, plane, pred, stride, intrapredictor, + MAX_SB_SIZE); + } +} diff --git a/libs/libaom/src/av1/common/reconinter.h b/libs/libaom/src/av1/common/reconinter.h new file mode 100644 index 000000000..fe3c6a621 --- /dev/null +++ b/libs/libaom/src/av1/common/reconinter.h @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_RECONINTER_H_ +#define AOM_AV1_COMMON_RECONINTER_H_ + +#include "av1/common/av1_common_int.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/warped_motion.h" +#include "aom/aom_integer.h" + +// Work out how many pixels off the edge of a reference frame we're allowed +// to go when forming an inter prediction. +// The outermost row/col of each reference frame is extended by +// (AOM_BORDER_IN_PIXELS >> subsampling) pixels, but we need to keep +// at least AOM_INTERP_EXTEND pixels within that to account for filtering. +// +// We have to break this up into two macros to keep both clang-format and +// tools/lint-hunks.py happy. +#define AOM_LEFT_TOP_MARGIN_PX(subsampling) \ + ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND) +#define AOM_LEFT_TOP_MARGIN_SCALED(subsampling) \ + (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS) + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_WEDGE_TYPES 16 + +#define MAX_WEDGE_SIZE_LOG2 5 // 32x32 +#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2) +#define MAX_WEDGE_SQUARE (MAX_WEDGE_SIZE * MAX_WEDGE_SIZE) + +#define WEDGE_WEIGHT_BITS 6 + +#define WEDGE_NONE -1 + +// Angles are with respect to horizontal anti-clockwise +enum { + WEDGE_HORIZONTAL = 0, + WEDGE_VERTICAL = 1, + WEDGE_OBLIQUE27 = 2, + WEDGE_OBLIQUE63 = 3, + WEDGE_OBLIQUE117 = 4, + WEDGE_OBLIQUE153 = 5, + WEDGE_DIRECTIONS +} UENUM1BYTE(WedgeDirectionType); + +// 3-tuple: {direction, x_offset, y_offset} +typedef struct { + WedgeDirectionType direction; + int x_offset; + int y_offset; +} wedge_code_type; + +typedef uint8_t *wedge_masks_type[MAX_WEDGE_TYPES]; + +typedef struct { + int wedge_types; + const wedge_code_type *codebook; + uint8_t *signflip; + wedge_masks_type *masks; +} wedge_params_type; + +extern const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL]; + +typedef struct SubpelParams { + int xs; + int ys; + int subpel_x; + int subpel_y; +} SubpelParams; + +struct build_prediction_ctxt { + const AV1_COMMON *cm; + uint8_t **tmp_buf; + int *tmp_width; + int *tmp_height; + int *tmp_stride; + int mb_to_far_edge; +}; + +typedef enum InterPredMode { + TRANSLATION_PRED, + WARP_PRED, +} InterPredMode; + +typedef enum InterCompMode { + UNIFORM_SINGLE, + UNIFORM_COMP, + MASK_COMP, +} InterCompMode; + +typedef struct InterPredParams { + InterPredMode mode; + InterCompMode comp_mode; + WarpedMotionParams warp_params; + ConvolveParams conv_params; + const InterpFilterParams *interp_filter_params[2]; + int block_width; + int block_height; + int pix_row; + int pix_col; + struct buf_2d ref_frame_buf; + int subsampling_x; + int subsampling_y; + const struct scale_factors *scale_factors; + int bit_depth; + int use_hbd_buf; + INTERINTER_COMPOUND_DATA mask_comp; + BLOCK_SIZE sb_type; + int is_intrabc; +} InterPredParams; + +void av1_init_inter_params(InterPredParams *inter_pred_params, int block_width, + int block_height, int pix_row, int pix_col, + int subsampling_x, int subsampling_y, int bit_depth, + int use_hbd_buf, int is_intrabc, + const struct scale_factors *sf, + const struct buf_2d *ref_buf, + int_interpfilters interp_filters); + +void av1_init_comp_mode(InterPredParams 
*inter_pred_params); + +void av1_init_warp_params(InterPredParams *inter_pred_params, + const WarpTypesAllowed *warp_types, int ref, + const MACROBLOCKD *xd, const MB_MODE_INFO *mi); + +void av1_init_mask_comp(InterPredParams *inter_pred_params, BLOCK_SIZE bsize, + const INTERINTER_COMPOUND_DATA *mask_comp); + +static INLINE int has_scale(int xs, int ys) { + return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS; +} + +static INLINE void revert_scale_extra_bits(SubpelParams *sp) { + sp->subpel_x >>= SCALE_EXTRA_BITS; + sp->subpel_y >>= SCALE_EXTRA_BITS; + sp->xs >>= SCALE_EXTRA_BITS; + sp->ys >>= SCALE_EXTRA_BITS; + assert(sp->subpel_x < SUBPEL_SHIFTS); + assert(sp->subpel_y < SUBPEL_SHIFTS); + assert(sp->xs <= SUBPEL_SHIFTS); + assert(sp->ys <= SUBPEL_SHIFTS); +} + +static INLINE void inter_predictor( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const SubpelParams *subpel_params, const struct scale_factors *sf, int w, + int h, ConvolveParams *conv_params, + const InterpFilterParams *interp_filters[2]) { + assert(conv_params->do_average == 0 || conv_params->do_average == 1); + assert(sf); + const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); + if (is_scaled) { + av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, subpel_params->subpel_x, + subpel_params->xs, subpel_params->subpel_y, + subpel_params->ys, 1, conv_params, sf); + } else { + SubpelParams sp = *subpel_params; + revert_scale_extra_bits(&sp); + av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, sp.subpel_x, sp.xs, sp.subpel_y, + sp.ys, 0, conv_params, sf); + } +} + +static INLINE void highbd_inter_predictor( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const SubpelParams *subpel_params, const struct scale_factors *sf, int w, + int h, ConvolveParams *conv_params, + const InterpFilterParams *interp_filters[2], int bd) { + assert(conv_params->do_average == 0 || conv_params->do_average == 1); + assert(sf); + const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); + if (is_scaled) { + av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, subpel_params->subpel_x, + subpel_params->xs, subpel_params->subpel_y, + subpel_params->ys, 1, conv_params, sf, bd); + } else { + SubpelParams sp = *subpel_params; + revert_scale_extra_bits(&sp); + av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, sp.subpel_x, sp.xs, + sp.subpel_y, sp.ys, 0, conv_params, sf, bd); + } +} + +void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi); +int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, + const struct macroblockd_plane *pd, int dir); + +static INLINE int is_interinter_compound_used(COMPOUND_TYPE type, + BLOCK_SIZE sb_type) { + const int comp_allowed = is_comp_ref_allowed(sb_type); + switch (type) { + case COMPOUND_AVERAGE: + case COMPOUND_DISTWTD: + case COMPOUND_DIFFWTD: return comp_allowed; + case COMPOUND_WEDGE: + return comp_allowed && av1_wedge_params_lookup[sb_type].wedge_types > 0; + default: assert(0); return 0; + } +} + +static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) { + COMPOUND_TYPE comp_type; + int i; + if (!is_comp_ref_allowed(sb_type)) return 0; + for (i = 0; i < COMPOUND_TYPES; i++) { + comp_type = (COMPOUND_TYPE)i; + if (is_masked_compound_type(comp_type) && + is_interinter_compound_used(comp_type, sb_type)) + return 1; + } + return 0; +} + +static INLINE int get_wedge_types_lookup(BLOCK_SIZE sb_type) 
{ + return av1_wedge_params_lookup[sb_type].wedge_types; +} + +static INLINE int av1_is_wedge_used(BLOCK_SIZE sb_type) { + return av1_wedge_params_lookup[sb_type].wedge_types > 0; +} + +void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params); + +void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride, + uint8_t *dst, int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params); + +typedef void (*CalcSubpelParamsFunc)(const MV *const src_mv, + InterPredParams *const inter_pred_params, + MACROBLOCKD *xd, int mi_x, int mi_y, + int ref, uint8_t **pre, + SubpelParams *subpel_params, + int *src_stride); + +void av1_build_one_inter_predictor( + uint8_t *dst, int dst_stride, const MV *const src_mv, + InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y, + int ref, CalcSubpelParamsFunc calc_subpel_params_func); + +void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, int mi_x, + int mi_y, + CalcSubpelParamsFunc calc_subpel_params_func); + +// TODO(jkoleszar): yet another mv clamping function :-( +static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, + const MV *src_mv, int bw, int bh, + int ss_x, int ss_y) { + // If the MV points so far into the UMV border that no visible pixels + // are used for reconstruction, the subpel part of the MV can be + // discarded and the MV limited to 16 pixels with equivalent results. + const int spel_left = (AOM_INTERP_EXTEND + bw) << SUBPEL_BITS; + const int spel_right = spel_left - SUBPEL_SHIFTS; + const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS; + const int spel_bottom = spel_top - SUBPEL_SHIFTS; + MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))), + (int16_t)(src_mv->col * (1 << (1 - ss_x))) }; + assert(ss_x <= 1); + assert(ss_y <= 1); + const SubpelMvLimits mv_limits = { + xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left, + xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right, + xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top, + xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom + }; + + clamp_mv(&clamped_mv, &mv_limits); + + return clamped_mv; +} + +static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset, + int stride, + const struct scale_factors *sf) { + const int x = + sf ? sf->scale_value_x(x_offset, sf) >> SCALE_EXTRA_BITS : x_offset; + const int y = + sf ? 
sf->scale_value_y(y_offset, sf) >> SCALE_EXTRA_BITS : y_offset; + return (int64_t)y * stride + x; +} + +static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize, + uint8_t *src, int width, int height, + int stride, int mi_row, int mi_col, + const struct scale_factors *scale, + int subsampling_x, int subsampling_y) { + // Offset the buffer pointer + if (subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) + mi_row -= 1; + if (subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) + mi_col -= 1; + + const int x = (MI_SIZE * mi_col) >> subsampling_x; + const int y = (MI_SIZE * mi_row) >> subsampling_y; + dst->buf = src + scaled_buffer_offset(x, y, stride, scale); + dst->buf0 = src; + dst->width = width; + dst->height = height; + dst->stride = stride; +} + +void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const int plane_start, const int plane_end); + +void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *sf, const int num_planes); + +static INLINE void set_default_interp_filters( + MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) { + mbmi->interp_filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(frame_interp_filter)); +} + +static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + if (mbmi->skip_mode) return 0; + if (mbmi->motion_mode == WARPED_CAUSAL) return 0; + if (is_nontrans_global_motion(xd, xd->mi[0])) return 0; + return 1; +} + +void av1_setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset, + int mi_col_offset, MB_MODE_INFO *ref_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes); + +void av1_setup_build_prediction_by_above_pred( + MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, + MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, + const int num_planes); +void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, + uint8_t left_mi_height, + MB_MODE_INFO *left_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes); +void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *above[MAX_MB_PLANE], + int above_stride[MAX_MB_PLANE], + uint8_t *left[MAX_MB_PLANE], + int left_stride[MAX_MB_PLANE]); + +const uint8_t *av1_get_obmc_mask(int length); +void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd); + +#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1) +#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE) + +void av1_init_wedge_masks(); + +static INLINE const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index, + int8_t wedge_sign, + BLOCK_SIZE sb_type) { + return av1_wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index]; +} + +const uint8_t *av1_get_compound_type_mask( + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type); + +// build interintra_predictors for one plane +void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *pred, int stride, + const BUFFER_SET *ctx, int plane, + BLOCK_SIZE bsize); + +void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, + MACROBLOCKD *xd, + BLOCK_SIZE bsize, int plane, + const BUFFER_SET *ctx, + uint8_t *dst, int dst_stride); + +void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, + const uint8_t *inter_pred, int inter_stride, + const uint8_t *intra_pred, int 
intra_stride); + +void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, + const MB_MODE_INFO *mbmi, int order_idx, + int *fwd_offset, int *bck_offset, + int *use_dist_wtd_comp_avg, + int is_compound); +int av1_allow_warp(const MB_MODE_INFO *const mbmi, + const WarpTypesAllowed *const warp_types, + const WarpedMotionParams *const gm_params, + int build_for_obmc, const struct scale_factors *const sf, + WarpedMotionParams *final_warp_params); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_RECONINTER_H_ diff --git a/libs/libaom/src/av1/common/reconintra.c b/libs/libaom/src/av1/common/reconintra.c new file mode 100644 index 000000000..1307a0313 --- /dev/null +++ b/libs/libaom/src/av1/common/reconintra.c @@ -0,0 +1,1704 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <math.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/aom_once.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/cfl.h" +#include "av1/common/reconintra.h" + +enum { + NEED_LEFT = 1 << 1, + NEED_ABOVE = 1 << 2, + NEED_ABOVERIGHT = 1 << 3, + NEED_ABOVELEFT = 1 << 4, + NEED_BOTTOMLEFT = 1 << 5, +}; + +#define INTRA_EDGE_FILT 3 +#define INTRA_EDGE_TAPS 5 +#define MAX_UPSAMPLE_SZ 16 + +static const uint8_t extend_modes[INTRA_MODES] = { + NEED_ABOVE | NEED_LEFT, // DC + NEED_ABOVE, // V + NEED_LEFT, // H + NEED_ABOVE | NEED_ABOVERIGHT, // D45 + NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D135 + NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D113 + NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D157 + NEED_LEFT | NEED_BOTTOMLEFT, // D203 + NEED_ABOVE | NEED_ABOVERIGHT, // D67 + NEED_LEFT | NEED_ABOVE, // SMOOTH + NEED_LEFT | NEED_ABOVE, // SMOOTH_V + NEED_LEFT | NEED_ABOVE, // SMOOTH_H + NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // PAETH +}; + +// Tables to store if the top-right reference pixels are available. The flags +// are represented with bits, packed into 8-bit integers. E.g., for the 32x32 +// blocks in a 128x128 superblock, the index of the "o" block is 10 (in raster +// order), so its flag is stored at the 3rd bit of the 2nd entry in the table, +// i.e. (table[10 / 8] >> (10 % 8)) & 1. +// . . . . +// . . . . +// . . o . +// . . . . 
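+// Worked example: the "o" block above has raster index 10, so its flag is
+// (has_tr_32x32[10 / 8] >> (10 % 8)) & 1 = (87 >> 2) & 1 = 1, i.e. its
+// top-right reference pixels are available (see has_tr_32x32 below).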
+static uint8_t has_tr_4x4[128] = { + 255, 255, 255, 255, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 255, 255, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, +}; +static uint8_t has_tr_4x8[64] = { + 255, 255, 255, 255, 119, 119, 119, 119, 127, 127, 127, 127, 119, + 119, 119, 119, 255, 127, 255, 127, 119, 119, 119, 119, 127, 127, + 127, 127, 119, 119, 119, 119, 255, 255, 255, 127, 119, 119, 119, + 119, 127, 127, 127, 127, 119, 119, 119, 119, 255, 127, 255, 127, + 119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119, +}; +static uint8_t has_tr_8x4[64] = { + 255, 255, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, + 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, + 255, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, + 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, +}; +static uint8_t has_tr_8x8[32] = { + 255, 255, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85, + 255, 127, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85, +}; +static uint8_t has_tr_8x16[16] = { + 255, 255, 119, 119, 127, 127, 119, 119, + 255, 127, 119, 119, 127, 127, 119, 119, +}; +static uint8_t has_tr_16x8[16] = { + 255, 0, 85, 0, 119, 0, 85, 0, 127, 0, 85, 0, 119, 0, 85, 0, +}; +static uint8_t has_tr_16x16[8] = { + 255, 85, 119, 85, 127, 85, 119, 85, +}; +static uint8_t has_tr_16x32[4] = { 255, 119, 127, 119 }; +static uint8_t has_tr_32x16[4] = { 15, 5, 7, 5 }; +static uint8_t has_tr_32x32[2] = { 95, 87 }; +static uint8_t has_tr_32x64[1] = { 127 }; +static uint8_t has_tr_64x32[1] = { 19 }; +static uint8_t has_tr_64x64[1] = { 7 }; +static uint8_t has_tr_64x128[1] = { 3 }; +static uint8_t has_tr_128x64[1] = { 1 }; +static uint8_t has_tr_128x128[1] = { 1 }; +static uint8_t has_tr_4x16[32] = { + 255, 255, 255, 255, 127, 127, 127, 127, 255, 127, 255, + 127, 127, 127, 127, 127, 255, 255, 255, 127, 127, 127, + 127, 127, 255, 127, 255, 127, 127, 127, 127, 127, +}; +static uint8_t has_tr_16x4[32] = { + 255, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0, + 127, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0, +}; +static uint8_t has_tr_8x32[8] = { + 255, 255, 127, 127, 255, 127, 127, 127, +}; +static uint8_t has_tr_32x8[8] = { + 15, 0, 5, 0, 7, 0, 5, 0, +}; +static uint8_t has_tr_16x64[2] = { 255, 127 }; +static uint8_t has_tr_64x16[2] = { 3, 1 }; + +static const uint8_t *const has_tr_tables[BLOCK_SIZES_ALL] = { + // 4X4 + has_tr_4x4, + // 4X8, 8X4, 8X8 + has_tr_4x8, has_tr_8x4, has_tr_8x8, + // 8X16, 16X8, 16X16 + has_tr_8x16, has_tr_16x8, has_tr_16x16, + // 16X32, 32X16, 32X32 + has_tr_16x32, has_tr_32x16, has_tr_32x32, + // 32X64, 64X32, 64X64 + has_tr_32x64, has_tr_64x32, has_tr_64x64, + // 64x128, 128x64, 128x128 + has_tr_64x128, has_tr_128x64, has_tr_128x128, + // 4x16, 16x4, 8x32 + has_tr_4x16, has_tr_16x4, has_tr_8x32, + // 32x8, 16x64, 64x16 + has_tr_32x8, has_tr_16x64, has_tr_64x16 +}; + +static uint8_t has_tr_vert_8x8[32] = { + 255, 255, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0, + 255, 127, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0, +}; +static uint8_t 
has_tr_vert_16x16[8] = { + 255, 0, 119, 0, 127, 0, 119, 0, +}; +static uint8_t has_tr_vert_32x32[2] = { 15, 7 }; +static uint8_t has_tr_vert_64x64[1] = { 3 }; + +// The _vert_* tables are like the ordinary tables above, but describe the +// order we visit square blocks when doing a PARTITION_VERT_A or +// PARTITION_VERT_B. This is the same order as normal except for on the last +// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block +// as a pair of squares, which means that these tables work correctly for both +// mixed vertical partition types. +// +// There are tables for each of the square sizes. Vertical rectangles (like +// BLOCK_16X32) use their respective "non-vert" table +static const uint8_t *const has_tr_vert_tables[BLOCK_SIZES] = { + // 4X4 + NULL, + // 4X8, 8X4, 8X8 + has_tr_4x8, NULL, has_tr_vert_8x8, + // 8X16, 16X8, 16X16 + has_tr_8x16, NULL, has_tr_vert_16x16, + // 16X32, 32X16, 32X32 + has_tr_16x32, NULL, has_tr_vert_32x32, + // 32X64, 64X32, 64X64 + has_tr_32x64, NULL, has_tr_vert_64x64, + // 64x128, 128x64, 128x128 + has_tr_64x128, NULL, has_tr_128x128 +}; + +static const uint8_t *get_has_tr_table(PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + const uint8_t *ret = NULL; + // If this is a mixed vertical partition, look up bsize in orders_vert. + if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) { + assert(bsize < BLOCK_SIZES); + ret = has_tr_vert_tables[bsize]; + } else { + ret = has_tr_tables[bsize]; + } + assert(ret); + return ret; +} + +static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, + int mi_col, int top_available, int right_available, + PARTITION_TYPE partition, TX_SIZE txsz, int row_off, + int col_off, int ss_x, int ss_y) { + if (!top_available || !right_available) return 0; + + const int bw_unit = mi_size_wide[bsize]; + const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1); + const int top_right_count_unit = tx_size_wide_unit[txsz]; + + if (row_off > 0) { // Just need to check if enough pixels on the right. + if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64]) { + // Special case: For 128x128 blocks, the transform unit whose + // top-right corner is at the center of the block does in fact have + // pixels available at its top-right corner. + if (row_off == mi_size_high[BLOCK_64X64] >> ss_y && + col_off + top_right_count_unit == mi_size_wide[BLOCK_64X64] >> ss_x) { + return 1; + } + const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x; + const int col_off_64 = col_off % plane_bw_unit_64; + return col_off_64 + top_right_count_unit < plane_bw_unit_64; + } + return col_off + top_right_count_unit < plane_bw_unit; + } else { + // All top-right pixels are in the block above, which is already available. + if (col_off + top_right_count_unit < plane_bw_unit) return 1; + + const int bw_in_mi_log2 = mi_size_wide_log2[bsize]; + const int bh_in_mi_log2 = mi_size_high_log2[bsize]; + const int sb_mi_size = mi_size_high[cm->seq_params.sb_size]; + const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2; + const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2; + + // Top row of superblock: so top-right pixels are in the top and/or + // top-right superblocks, both of which are already available. + if (blk_row_in_sb == 0) return 1; + + // Rightmost column of superblock (and not the top row): so top-right pixels + // fall in the right superblock, which is not available yet. 
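+    // (E.g. for 32x32 blocks in a 128x128 superblock, sb_mi_size is 32 and
+    // bw_in_mi_log2 is 3, so blk_col_in_sb == 3 gives (3 + 1) << 3 == 32:
+    // the rightmost column.)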
+ if (((blk_col_in_sb + 1) << bw_in_mi_log2) >= sb_mi_size) { + return 0; + } + + // General case (neither top row nor rightmost column): check if the + // top-right block is coded before the current block. + const int this_blk_index = + ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) + + blk_col_in_sb + 0; + const int idx1 = this_blk_index / 8; + const int idx2 = this_blk_index % 8; + const uint8_t *has_tr_table = get_has_tr_table(partition, bsize); + return (has_tr_table[idx1] >> idx2) & 1; + } +} + +// Similar to the has_tr_* tables, but store if the bottom-left reference +// pixels are available. +static uint8_t has_bl_4x4[128] = { + 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, + 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, 0, 84, 85, 85, 85, 16, 17, + 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, + 85, 85, 85, 0, 0, 0, 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, + 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, + 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, + 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 0, 0, +}; +static uint8_t has_bl_4x8[64] = { + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0, + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0, + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0, + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0, +}; +static uint8_t has_bl_8x4[64] = { + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1, + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0, + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1, + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0, +}; +static uint8_t has_bl_8x8[32] = { + 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0, + 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0, +}; +static uint8_t has_bl_8x16[16] = { + 16, 17, 0, 1, 16, 17, 0, 0, 16, 17, 0, 1, 16, 17, 0, 0, +}; +static uint8_t has_bl_16x8[16] = { + 254, 84, 254, 16, 254, 84, 254, 0, 254, 84, 254, 16, 254, 84, 254, 0, +}; +static uint8_t has_bl_16x16[8] = { + 84, 16, 84, 0, 84, 16, 84, 0, +}; +static uint8_t has_bl_16x32[4] = { 16, 0, 16, 0 }; +static uint8_t has_bl_32x16[4] = { 78, 14, 78, 14 }; +static uint8_t has_bl_32x32[2] = { 4, 4 }; +static uint8_t has_bl_32x64[1] = { 0 }; +static uint8_t has_bl_64x32[1] = { 34 }; +static uint8_t has_bl_64x64[1] = { 0 }; +static uint8_t has_bl_64x128[1] = { 0 }; +static uint8_t has_bl_128x64[1] = { 0 }; +static uint8_t has_bl_128x128[1] = { 0 }; +static uint8_t has_bl_4x16[32] = { + 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, + 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, +}; +static uint8_t has_bl_16x4[32] = { + 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0, + 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0, +}; +static uint8_t has_bl_8x32[8] = { + 0, 1, 0, 0, 0, 1, 0, 0, +}; +static uint8_t has_bl_32x8[8] = { + 238, 78, 238, 14, 238, 78, 238, 14, +}; +static uint8_t has_bl_16x64[2] = { 0, 0 }; +static uint8_t has_bl_64x16[2] = { 42, 42 }; + +static const uint8_t *const has_bl_tables[BLOCK_SIZES_ALL] = { + // 4X4 + has_bl_4x4, + // 4X8, 8X4, 8X8 + has_bl_4x8, has_bl_8x4, has_bl_8x8, + // 8X16, 16X8, 16X16 + has_bl_8x16, has_bl_16x8, has_bl_16x16, + // 16X32, 32X16, 32X32 + has_bl_16x32, has_bl_32x16, has_bl_32x32, + // 32X64, 64X32, 64X64 + has_bl_32x64, has_bl_64x32, has_bl_64x64, + // 64x128, 128x64, 128x128 + 
has_bl_64x128, has_bl_128x64, has_bl_128x128, + // 4x16, 16x4, 8x32 + has_bl_4x16, has_bl_16x4, has_bl_8x32, + // 32x8, 16x64, 64x16 + has_bl_32x8, has_bl_16x64, has_bl_64x16 +}; + +static uint8_t has_bl_vert_8x8[32] = { + 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0, + 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0, +}; +static uint8_t has_bl_vert_16x16[8] = { + 254, 16, 254, 0, 254, 16, 254, 0, +}; +static uint8_t has_bl_vert_32x32[2] = { 14, 14 }; +static uint8_t has_bl_vert_64x64[1] = { 2 }; + +// The _vert_* tables are like the ordinary tables above, but describe the +// order we visit square blocks when doing a PARTITION_VERT_A or +// PARTITION_VERT_B. This is the same order as normal except for on the last +// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block +// as a pair of squares, which means that these tables work correctly for both +// mixed vertical partition types. +// +// There are tables for each of the square sizes. Vertical rectangles (like +// BLOCK_16X32) use their respective "non-vert" table +static const uint8_t *const has_bl_vert_tables[BLOCK_SIZES] = { + // 4X4 + NULL, + // 4X8, 8X4, 8X8 + has_bl_4x8, NULL, has_bl_vert_8x8, + // 8X16, 16X8, 16X16 + has_bl_8x16, NULL, has_bl_vert_16x16, + // 16X32, 32X16, 32X32 + has_bl_16x32, NULL, has_bl_vert_32x32, + // 32X64, 64X32, 64X64 + has_bl_32x64, NULL, has_bl_vert_64x64, + // 64x128, 128x64, 128x128 + has_bl_64x128, NULL, has_bl_128x128 +}; + +static const uint8_t *get_has_bl_table(PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + const uint8_t *ret = NULL; + // If this is a mixed vertical partition, look up bsize in orders_vert. + if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) { + assert(bsize < BLOCK_SIZES); + ret = has_bl_vert_tables[bsize]; + } else { + ret = has_bl_tables[bsize]; + } + assert(ret); + return ret; +} + +static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, + int mi_col, int bottom_available, int left_available, + PARTITION_TYPE partition, TX_SIZE txsz, int row_off, + int col_off, int ss_x, int ss_y) { + if (!bottom_available || !left_available) return 0; + + // Special case for 128x* blocks, when col_off is half the block width. + // This is needed because 128x* superblocks are divided into 64x* blocks in + // raster order + if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64] && col_off > 0) { + const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x; + const int col_off_64 = col_off % plane_bw_unit_64; + if (col_off_64 == 0) { + // We are at the left edge of top-right or bottom-right 64x* block. + const int plane_bh_unit_64 = mi_size_high[BLOCK_64X64] >> ss_y; + const int row_off_64 = row_off % plane_bh_unit_64; + const int plane_bh_unit = + AOMMIN(mi_size_high[bsize] >> ss_y, plane_bh_unit_64); + // Check if all bottom-left pixels are in the left 64x* block (which is + // already coded). + return row_off_64 + tx_size_high_unit[txsz] < plane_bh_unit; + } + } + + if (col_off > 0) { + // Bottom-left pixels are in the bottom-left block, which is not available. + return 0; + } else { + const int bh_unit = mi_size_high[bsize]; + const int plane_bh_unit = AOMMAX(bh_unit >> ss_y, 1); + const int bottom_left_count_unit = tx_size_high_unit[txsz]; + + // All bottom-left pixels are in the left block, which is already available. 
+    if (row_off + bottom_left_count_unit < plane_bh_unit) return 1;
+
+    const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
+    const int bh_in_mi_log2 = mi_size_high_log2[bsize];
+    const int sb_mi_size = mi_size_high[cm->seq_params.sb_size];
+    const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
+    const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
+
+    // Leftmost column of superblock: bottom-left pixels may be in the left
+    // and/or bottom-left superblocks. But only the left superblock is
+    // available, so check if all required pixels fall in that superblock.
+    if (blk_col_in_sb == 0) {
+      const int blk_start_row_off =
+          blk_row_in_sb << (bh_in_mi_log2 + MI_SIZE_LOG2 - MI_SIZE_LOG2) >>
+          ss_y;
+      const int row_off_in_sb = blk_start_row_off + row_off;
+      const int sb_height_unit = sb_mi_size >> ss_y;
+      return row_off_in_sb + bottom_left_count_unit < sb_height_unit;
+    }
+
+    // Bottom row of superblock (and not the leftmost column): bottom-left
+    // pixels fall in the bottom superblock, which is not available yet.
+    if (((blk_row_in_sb + 1) << bh_in_mi_log2) >= sb_mi_size) return 0;
+
+    // General case (neither leftmost column nor bottom row): check if the
+    // bottom-left block is coded before the current block.
+    const int this_blk_index =
+        ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
+        blk_col_in_sb + 0;
+    const int idx1 = this_blk_index / 8;
+    const int idx2 = this_blk_index % 8;
+    const uint8_t *has_bl_table = get_has_bl_table(partition, bsize);
+    return (has_bl_table[idx1] >> idx2) & 1;
+  }
+}
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[INTRA_MODES][TX_SIZES_ALL];
+static intra_pred_fn dc_pred[2][2][TX_SIZES_ALL];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,
+                                   const uint16_t *above, const uint16_t *left,
+                                   int bd);
+static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES_ALL];
+static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES_ALL];
+#endif
+
+static void init_intra_predictors_internal(void) {
+  assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES);
+
+#define INIT_RECTANGULAR(p, type)             \
+  p[TX_4X8] = aom_##type##_predictor_4x8;     \
+  p[TX_8X4] = aom_##type##_predictor_8x4;     \
+  p[TX_8X16] = aom_##type##_predictor_8x16;   \
+  p[TX_16X8] = aom_##type##_predictor_16x8;   \
+  p[TX_16X32] = aom_##type##_predictor_16x32; \
+  p[TX_32X16] = aom_##type##_predictor_32x16; \
+  p[TX_32X64] = aom_##type##_predictor_32x64; \
+  p[TX_64X32] = aom_##type##_predictor_64x32; \
+  p[TX_4X16] = aom_##type##_predictor_4x16;   \
+  p[TX_16X4] = aom_##type##_predictor_16x4;   \
+  p[TX_8X32] = aom_##type##_predictor_8x32;   \
+  p[TX_32X8] = aom_##type##_predictor_32x8;   \
+  p[TX_16X64] = aom_##type##_predictor_16x64; \
+  p[TX_64X16] = aom_##type##_predictor_64x16;
+
+#define INIT_NO_4X4(p, type)                  \
+  p[TX_8X8] = aom_##type##_predictor_8x8;     \
+  p[TX_16X16] = aom_##type##_predictor_16x16; \
+  p[TX_32X32] = aom_##type##_predictor_32x32; \
+  p[TX_64X64] = aom_##type##_predictor_64x64; \
+  INIT_RECTANGULAR(p, type)
+
+#define INIT_ALL_SIZES(p, type)           \
+  p[TX_4X4] = aom_##type##_predictor_4x4; \
+  INIT_NO_4X4(p, type)
+
+  INIT_ALL_SIZES(pred[V_PRED], v);
+  INIT_ALL_SIZES(pred[H_PRED], h);
+  INIT_ALL_SIZES(pred[PAETH_PRED], paeth);
+  INIT_ALL_SIZES(pred[SMOOTH_PRED], smooth);
+  INIT_ALL_SIZES(pred[SMOOTH_V_PRED], smooth_v);
+  INIT_ALL_SIZES(pred[SMOOTH_H_PRED], smooth_h);
+  INIT_ALL_SIZES(dc_pred[0][0], dc_128);
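+  // dc_pred is indexed as [have_left][have_above] (see the DC_PRED dispatch
+  // in build_intra_predictors below), so dc_128 serves blocks with neither
+  // neighbor available.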
INIT_ALL_SIZES(dc_pred[0][1], dc_top); + INIT_ALL_SIZES(dc_pred[1][0], dc_left); + INIT_ALL_SIZES(dc_pred[1][1], dc); +#if CONFIG_AV1_HIGHBITDEPTH + INIT_ALL_SIZES(pred_high[V_PRED], highbd_v); + INIT_ALL_SIZES(pred_high[H_PRED], highbd_h); + INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth); + INIT_ALL_SIZES(pred_high[SMOOTH_PRED], highbd_smooth); + INIT_ALL_SIZES(pred_high[SMOOTH_V_PRED], highbd_smooth_v); + INIT_ALL_SIZES(pred_high[SMOOTH_H_PRED], highbd_smooth_h); + INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128); + INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top); + INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left); + INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc); +#endif +#undef intra_pred_allsizes +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int dx, int dy) { + int r, c, x, base, shift, val; + + (void)left; + (void)dy; + assert(dy == 1); + assert(dx > 0); + + const int max_base_x = ((bw + bh) - 1) << upsample_above; + const int frac_bits = 6 - upsample_above; + const int base_inc = 1 << upsample_above; + x = dx; + for (r = 0; r < bh; ++r, dst += stride, x += dx) { + base = x >> frac_bits; + shift = ((x << upsample_above) & 0x3F) >> 1; + + if (base >= max_base_x) { + for (int i = r; i < bh; ++i) { + memset(dst, above[max_base_x], bw * sizeof(dst[0])); + dst += stride; + } + return; + } + + for (c = 0; c < bw; ++c, base += base_inc) { + if (base < max_base_x) { + val = above[base] * (32 - shift) + above[base + 1] * shift; + dst[c] = ROUND_POWER_OF_TWO(val, 5); + } else { + dst[c] = above[max_base_x]; + } + } + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, + int dy) { + assert(dx > 0); + assert(dy > 0); + + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + (void)min_base_y; + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + for (int r = 0; r < bh; ++r) { + for (int c = 0; c < bw; ++c) { + int val; + int y = r + 1; + int x = (c << 6) - y * dx; + const int base_x = x >> frac_bits_x; + if (base_x >= min_base_x) { + const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; + val = above[base_x] * (32 - shift) + above[base_x + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); + } else { + x = c + 1; + y = (r << 6) - x * dy; + const int base_y = y >> frac_bits_y; + assert(base_y >= min_base_y); + const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; + val = left[base_y] * (32 - shift) + left[base_y + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); + } + dst[c] = val; + } + dst += stride; + } +} + +// Directional prediction, zone 3: 180 < angle < 270 +void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_left, int dx, int dy) { + int r, c, y, base, shift, val; + + (void)above; + (void)dx; + + assert(dx == 1); + assert(dy > 0); + + const int max_base_y = (bw + bh - 1) << upsample_left; + const int frac_bits = 6 - upsample_left; + const int base_inc = 1 << upsample_left; + y = dy; + for (c = 0; c < bw; ++c, y += dy) { + base = y >> frac_bits; + shift = ((y << upsample_left) & 0x3F) >> 1; + + for (r = 0; r < bh; ++r, base += base_inc) { + if (base < max_base_y) { + val = 
left[base] * (32 - shift) + left[base + 1] * shift; + dst[r * stride + c] = val = ROUND_POWER_OF_TWO(val, 5); + } else { + for (; r < bh; ++r) dst[r * stride + c] = left[max_base_y]; + break; + } + } + } +} + +static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int angle) { + const int dx = av1_get_dx(angle); + const int dy = av1_get_dy(angle); + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + assert(angle > 0 && angle < 270); + + if (angle > 0 && angle < 90) { + av1_dr_prediction_z1(dst, stride, bw, bh, above, left, upsample_above, dx, + dy); + } else if (angle > 90 && angle < 180) { + av1_dr_prediction_z2(dst, stride, bw, bh, above, left, upsample_above, + upsample_left, dx, dy); + } else if (angle > 180 && angle < 270) { + av1_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, dx, + dy); + } else if (angle == 90) { + pred[V_PRED][tx_size](dst, stride, above, left); + } else if (angle == 180) { + pred[H_PRED][tx_size](dst, stride, above, left); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +// Directional prediction, zone 1: 0 < angle < 90 +void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int dx, int dy, int bd) { + int r, c, x, base, shift, val; + + (void)left; + (void)dy; + (void)bd; + assert(dy == 1); + assert(dx > 0); + + const int max_base_x = ((bw + bh) - 1) << upsample_above; + const int frac_bits = 6 - upsample_above; + const int base_inc = 1 << upsample_above; + x = dx; + for (r = 0; r < bh; ++r, dst += stride, x += dx) { + base = x >> frac_bits; + shift = ((x << upsample_above) & 0x3F) >> 1; + + if (base >= max_base_x) { + for (int i = r; i < bh; ++i) { + aom_memset16(dst, above[max_base_x], bw); + dst += stride; + } + return; + } + + for (c = 0; c < bw; ++c, base += base_inc) { + if (base < max_base_x) { + val = above[base] * (32 - shift) + above[base + 1] * shift; + dst[c] = ROUND_POWER_OF_TWO(val, 5); + } else { + dst[c] = above[max_base_x]; + } + } + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int dx, int dy, int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + (void)min_base_y; + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + for (int r = 0; r < bh; ++r) { + for (int c = 0; c < bw; ++c) { + int val; + int y = r + 1; + int x = (c << 6) - y * dx; + const int base_x = x >> frac_bits_x; + if (base_x >= min_base_x) { + const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; + val = above[base_x] * (32 - shift) + above[base_x + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); + } else { + x = c + 1; + y = (r << 6) - x * dy; + const int base_y = y >> frac_bits_y; + assert(base_y >= min_base_y); + const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; + val = left[base_y] * (32 - shift) + left[base_y + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); + } + dst[c] = val; + } + dst += stride; + } +} + +// Directional prediction, zone 3: 180 < angle < 270 +void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int 
upsample_left, + int dx, int dy, int bd) { + int r, c, y, base, shift, val; + + (void)above; + (void)dx; + (void)bd; + assert(dx == 1); + assert(dy > 0); + + const int max_base_y = (bw + bh - 1) << upsample_left; + const int frac_bits = 6 - upsample_left; + const int base_inc = 1 << upsample_left; + y = dy; + for (c = 0; c < bw; ++c, y += dy) { + base = y >> frac_bits; + shift = ((y << upsample_left) & 0x3F) >> 1; + + for (r = 0; r < bh; ++r, base += base_inc) { + if (base < max_base_y) { + val = left[base] * (32 - shift) + left[base + 1] * shift; + dst[r * stride + c] = ROUND_POWER_OF_TWO(val, 5); + } else { + for (; r < bh; ++r) dst[r * stride + c] = left[max_base_y]; + break; + } + } + } +} + +static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int angle, int bd) { + const int dx = av1_get_dx(angle); + const int dy = av1_get_dy(angle); + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + assert(angle > 0 && angle < 270); + + if (angle > 0 && angle < 90) { + av1_highbd_dr_prediction_z1(dst, stride, bw, bh, above, left, + upsample_above, dx, dy, bd); + } else if (angle > 90 && angle < 180) { + av1_highbd_dr_prediction_z2(dst, stride, bw, bh, above, left, + upsample_above, upsample_left, dx, dy, bd); + } else if (angle > 180 && angle < 270) { + av1_highbd_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, + dx, dy, bd); + } else if (angle == 90) { + pred_high[V_PRED][tx_size](dst, stride, above, left, bd); + } else if (angle == 180) { + pred_high[H_PRED][tx_size](dst, stride, above, left, bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +DECLARE_ALIGNED(16, const int8_t, + av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]) = { + { + { -6, 10, 0, 0, 0, 12, 0, 0 }, + { -5, 2, 10, 0, 0, 9, 0, 0 }, + { -3, 1, 1, 10, 0, 7, 0, 0 }, + { -3, 1, 1, 2, 10, 5, 0, 0 }, + { -4, 6, 0, 0, 0, 2, 12, 0 }, + { -3, 2, 6, 0, 0, 2, 9, 0 }, + { -3, 2, 2, 6, 0, 2, 7, 0 }, + { -3, 1, 2, 2, 6, 3, 5, 0 }, + }, + { + { -10, 16, 0, 0, 0, 10, 0, 0 }, + { -6, 0, 16, 0, 0, 6, 0, 0 }, + { -4, 0, 0, 16, 0, 4, 0, 0 }, + { -2, 0, 0, 0, 16, 2, 0, 0 }, + { -10, 16, 0, 0, 0, 0, 10, 0 }, + { -6, 0, 16, 0, 0, 0, 6, 0 }, + { -4, 0, 0, 16, 0, 0, 4, 0 }, + { -2, 0, 0, 0, 16, 0, 2, 0 }, + }, + { + { -8, 8, 0, 0, 0, 16, 0, 0 }, + { -8, 0, 8, 0, 0, 16, 0, 0 }, + { -8, 0, 0, 8, 0, 16, 0, 0 }, + { -8, 0, 0, 0, 8, 16, 0, 0 }, + { -4, 4, 0, 0, 0, 0, 16, 0 }, + { -4, 0, 4, 0, 0, 0, 16, 0 }, + { -4, 0, 0, 4, 0, 0, 16, 0 }, + { -4, 0, 0, 0, 4, 0, 16, 0 }, + }, + { + { -2, 8, 0, 0, 0, 10, 0, 0 }, + { -1, 3, 8, 0, 0, 6, 0, 0 }, + { -1, 2, 3, 8, 0, 4, 0, 0 }, + { 0, 1, 2, 3, 8, 2, 0, 0 }, + { -1, 4, 0, 0, 0, 3, 10, 0 }, + { -1, 3, 4, 0, 0, 4, 6, 0 }, + { -1, 2, 3, 4, 0, 4, 4, 0 }, + { -1, 2, 2, 3, 4, 3, 3, 0 }, + }, + { + { -12, 14, 0, 0, 0, 14, 0, 0 }, + { -10, 0, 14, 0, 0, 12, 0, 0 }, + { -9, 0, 0, 14, 0, 11, 0, 0 }, + { -8, 0, 0, 0, 14, 10, 0, 0 }, + { -10, 12, 0, 0, 0, 0, 14, 0 }, + { -9, 1, 12, 0, 0, 0, 12, 0 }, + { -8, 0, 0, 12, 0, 1, 11, 0 }, + { -7, 0, 0, 1, 12, 1, 9, 0 }, + }, +}; + +void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, const uint8_t *above, + const uint8_t *left, int mode) { + int r, c; + uint8_t buffer[33][33]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + + assert(bw <= 32 && bh <= 32); + + for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; + memcpy(buffer[0], &above[-1], (bw + 1) * 
sizeof(uint8_t)); + + for (r = 1; r < bh + 1; r += 2) + for (c = 1; c < bw + 1; c += 4) { + const uint8_t p0 = buffer[r - 1][c - 1]; + const uint8_t p1 = buffer[r - 1][c]; + const uint8_t p2 = buffer[r - 1][c + 1]; + const uint8_t p3 = buffer[r - 1][c + 2]; + const uint8_t p4 = buffer[r - 1][c + 3]; + const uint8_t p5 = buffer[r][c - 1]; + const uint8_t p6 = buffer[r + 1][c - 1]; + for (int k = 0; k < 8; ++k) { + int r_offset = k >> 2; + int c_offset = k & 0x03; + buffer[r + r_offset][c + c_offset] = + clip_pixel(ROUND_POWER_OF_TWO_SIGNED( + av1_filter_intra_taps[mode][k][0] * p0 + + av1_filter_intra_taps[mode][k][1] * p1 + + av1_filter_intra_taps[mode][k][2] * p2 + + av1_filter_intra_taps[mode][k][3] * p3 + + av1_filter_intra_taps[mode][k][4] * p4 + + av1_filter_intra_taps[mode][k][5] * p5 + + av1_filter_intra_taps[mode][k][6] * p6, + FILTER_INTRA_SCALE_BITS)); + } + } + + for (r = 0; r < bh; ++r) { + memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t)); + dst += stride; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, + const uint16_t *above, + const uint16_t *left, int mode, + int bd) { + int r, c; + uint16_t buffer[33][33]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + + assert(bw <= 32 && bh <= 32); + + for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; + memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0])); + + for (r = 1; r < bh + 1; r += 2) + for (c = 1; c < bw + 1; c += 4) { + const uint16_t p0 = buffer[r - 1][c - 1]; + const uint16_t p1 = buffer[r - 1][c]; + const uint16_t p2 = buffer[r - 1][c + 1]; + const uint16_t p3 = buffer[r - 1][c + 2]; + const uint16_t p4 = buffer[r - 1][c + 3]; + const uint16_t p5 = buffer[r][c - 1]; + const uint16_t p6 = buffer[r + 1][c - 1]; + for (int k = 0; k < 8; ++k) { + int r_offset = k >> 2; + int c_offset = k & 0x03; + buffer[r + r_offset][c + c_offset] = + clip_pixel_highbd(ROUND_POWER_OF_TWO_SIGNED( + av1_filter_intra_taps[mode][k][0] * p0 + + av1_filter_intra_taps[mode][k][1] * p1 + + av1_filter_intra_taps[mode][k][2] * p2 + + av1_filter_intra_taps[mode][k][3] * p3 + + av1_filter_intra_taps[mode][k][4] * p4 + + av1_filter_intra_taps[mode][k][5] * p5 + + av1_filter_intra_taps[mode][k][6] * p6, + FILTER_INTRA_SCALE_BITS), + bd); + } + } + + for (r = 0; r < bh; ++r) { + memcpy(dst, &buffer[r + 1][1], bw * sizeof(dst[0])); + dst += stride; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static int is_smooth(const MB_MODE_INFO *mbmi, int plane) { + if (plane == 0) { + const PREDICTION_MODE mode = mbmi->mode; + return (mode == SMOOTH_PRED || mode == SMOOTH_V_PRED || + mode == SMOOTH_H_PRED); + } else { + // uv_mode is not set for inter blocks, so need to explicitly + // detect that case. + if (is_inter_block(mbmi)) return 0; + + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + return (uv_mode == UV_SMOOTH_PRED || uv_mode == UV_SMOOTH_V_PRED || + uv_mode == UV_SMOOTH_H_PRED); + } +} + +static int get_filt_type(const MACROBLOCKD *xd, int plane) { + int ab_sm, le_sm; + + if (plane == 0) { + const MB_MODE_INFO *ab = xd->above_mbmi; + const MB_MODE_INFO *le = xd->left_mbmi; + ab_sm = ab ? is_smooth(ab, plane) : 0; + le_sm = le ? is_smooth(le, plane) : 0; + } else { + const MB_MODE_INFO *ab = xd->chroma_above_mbmi; + const MB_MODE_INFO *le = xd->chroma_left_mbmi; + ab_sm = ab ? is_smooth(ab, plane) : 0; + le_sm = le ? is_smooth(le, plane) : 0; + } + + return (ab_sm || le_sm) ? 
1 : 0; +} + +static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) { + const int d = abs(delta); + int strength = 0; + + const int blk_wh = bs0 + bs1; + if (type == 0) { + if (blk_wh <= 8) { + if (d >= 56) strength = 1; + } else if (blk_wh <= 12) { + if (d >= 40) strength = 1; + } else if (blk_wh <= 16) { + if (d >= 40) strength = 1; + } else if (blk_wh <= 24) { + if (d >= 8) strength = 1; + if (d >= 16) strength = 2; + if (d >= 32) strength = 3; + } else if (blk_wh <= 32) { + if (d >= 1) strength = 1; + if (d >= 4) strength = 2; + if (d >= 32) strength = 3; + } else { + if (d >= 1) strength = 3; + } + } else { + if (blk_wh <= 8) { + if (d >= 40) strength = 1; + if (d >= 64) strength = 2; + } else if (blk_wh <= 16) { + if (d >= 20) strength = 1; + if (d >= 48) strength = 2; + } else if (blk_wh <= 24) { + if (d >= 4) strength = 3; + } else { + if (d >= 1) strength = 3; + } + } + return strength; +} + +void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) { + if (!strength) return; + + const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 }, + { 0, 5, 6, 5, 0 }, + { 2, 4, 4, 4, 2 } }; + const int filt = strength - 1; + uint8_t edge[129]; + + memcpy(edge, p, sz * sizeof(*p)); + for (int i = 1; i < sz; i++) { + int s = 0; + for (int j = 0; j < INTRA_EDGE_TAPS; j++) { + int k = i - 2 + j; + k = (k < 0) ? 0 : k; + k = (k > sz - 1) ? sz - 1 : k; + s += edge[k] * kernel[filt][j]; + } + s = (s + 8) >> 4; + p[i] = s; + } +} + +static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) { + const int kernel[3] = { 5, 6, 5 }; + + int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) + + (p_above[0] * kernel[2]); + s = (s + 8) >> 4; + p_above[-1] = s; + p_left[-1] = s; +} + +void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) { + if (!strength) return; + + const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 }, + { 0, 5, 6, 5, 0 }, + { 2, 4, 4, 4, 2 } }; + const int filt = strength - 1; + uint16_t edge[129]; + + memcpy(edge, p, sz * sizeof(*p)); + for (int i = 1; i < sz; i++) { + int s = 0; + for (int j = 0; j < INTRA_EDGE_TAPS; j++) { + int k = i - 2 + j; + k = (k < 0) ? 0 : k; + k = (k > sz - 1) ? 
sz - 1 : k; + s += edge[k] * kernel[filt][j]; + } + s = (s + 8) >> 4; + p[i] = s; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) { + const int kernel[3] = { 5, 6, 5 }; + + int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) + + (p_above[0] * kernel[2]); + s = (s + 8) >> 4; + p_above[-1] = s; + p_left[-1] = s; +} +#endif + +void av1_upsample_intra_edge_c(uint8_t *p, int sz) { + // interpolate half-sample positions + assert(sz <= MAX_UPSAMPLE_SZ); + + uint8_t in[MAX_UPSAMPLE_SZ + 3]; + // copy p[-1..(sz-1)] and extend first and last samples + in[0] = p[-1]; + in[1] = p[-1]; + for (int i = 0; i < sz; i++) { + in[i + 2] = p[i]; + } + in[sz + 2] = p[sz - 1]; + + // interpolate half-sample edge positions + p[-2] = in[0]; + for (int i = 0; i < sz; i++) { + int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3]; + s = clip_pixel((s + 8) >> 4); + p[2 * i - 1] = s; + p[2 * i] = in[i + 2]; + } +} + +void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) { + // interpolate half-sample positions + assert(sz <= MAX_UPSAMPLE_SZ); + + uint16_t in[MAX_UPSAMPLE_SZ + 3]; + // copy p[-1..(sz-1)] and extend first and last samples + in[0] = p[-1]; + in[1] = p[-1]; + for (int i = 0; i < sz; i++) { + in[i + 2] = p[i]; + } + in[sz + 2] = p[sz - 1]; + + // interpolate half-sample edge positions + p[-2] = in[0]; + for (int i = 0; i < sz; i++) { + int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3]; + s = (s + 8) >> 4; + s = clip_pixel_highbd(s, bd); + p[2 * i - 1] = s; + p[2 * i] = in[i + 2]; + } +} +#if CONFIG_AV1_HIGHBITDEPTH +static void build_intra_predictors_high( + const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8, + int dst_stride, PREDICTION_MODE mode, int angle_delta, + FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size, + int disable_edge_filter, int n_top_px, int n_topright_px, int n_left_px, + int n_bottomleft_px, int plane) { + int i; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + DECLARE_ALIGNED(16, uint16_t, left_data[MAX_TX_SIZE * 2 + 32]); + DECLARE_ALIGNED(16, uint16_t, above_data[MAX_TX_SIZE * 2 + 32]); + uint16_t *const above_row = above_data + 16; + uint16_t *const left_col = left_data + 16; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + int need_left = extend_modes[mode] & NEED_LEFT; + int need_above = extend_modes[mode] & NEED_ABOVE; + int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; + const uint16_t *above_ref = ref - ref_stride; + const uint16_t *left_ref = ref - 1; + int p_angle = 0; + const int is_dr_mode = av1_is_directional_mode(mode); + const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; + int base = 128 << (xd->bd - 8); + + // The default values if ref pixels are not available: + // base base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1 + // base+1 A B .. Y Z + // base+1 C D .. W X + // base+1 E F .. U V + // base+1 G H .. 
S T T T T T + + if (is_dr_mode) { + p_angle = mode_to_angle_map[mode] + angle_delta; + if (p_angle <= 90) + need_above = 1, need_left = 0, need_above_left = 1; + else if (p_angle < 180) + need_above = 1, need_left = 1, need_above_left = 1; + else + need_above = 0, need_left = 1, need_above_left = 1; + } + if (use_filter_intra) need_left = need_above = need_above_left = 1; + + assert(n_top_px >= 0); + assert(n_topright_px >= 0); + assert(n_left_px >= 0); + assert(n_bottomleft_px >= 0); + + if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { + int val; + if (need_left) { + val = (n_top_px > 0) ? above_ref[0] : base + 1; + } else { + val = (n_left_px > 0) ? left_ref[0] : base - 1; + } + for (i = 0; i < txhpx; ++i) { + aom_memset16(dst, val, txwpx); + dst += dst_stride; + } + return; + } + + // NEED_LEFT + if (need_left) { + int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT; + if (use_filter_intra) need_bottom = 0; + if (is_dr_mode) need_bottom = p_angle > 180; + const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0); + i = 0; + if (n_left_px > 0) { + for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; + if (need_bottom && n_bottomleft_px > 0) { + assert(i == txhpx); + for (; i < txhpx + n_bottomleft_px; i++) + left_col[i] = left_ref[i * ref_stride]; + } + if (i < num_left_pixels_needed) + aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i); + } else { + if (n_top_px > 0) { + aom_memset16(left_col, above_ref[0], num_left_pixels_needed); + } else { + aom_memset16(left_col, base + 1, num_left_pixels_needed); + } + } + } + + // NEED_ABOVE + if (need_above) { + int need_right = extend_modes[mode] & NEED_ABOVERIGHT; + if (use_filter_intra) need_right = 0; + if (is_dr_mode) need_right = p_angle < 90; + const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0); + if (n_top_px > 0) { + memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0])); + i = n_top_px; + if (need_right && n_topright_px > 0) { + assert(n_top_px == txwpx); + memcpy(above_row + txwpx, above_ref + txwpx, + n_topright_px * sizeof(above_ref[0])); + i += n_topright_px; + } + if (i < num_top_pixels_needed) + aom_memset16(&above_row[i], above_row[i - 1], + num_top_pixels_needed - i); + } else { + if (n_left_px > 0) { + aom_memset16(above_row, left_ref[0], num_top_pixels_needed); + } else { + aom_memset16(above_row, base - 1, num_top_pixels_needed); + } + } + } + + if (need_above_left) { + if (n_top_px > 0 && n_left_px > 0) { + above_row[-1] = above_ref[-1]; + } else if (n_top_px > 0) { + above_row[-1] = above_ref[0]; + } else if (n_left_px > 0) { + above_row[-1] = left_ref[0]; + } else { + above_row[-1] = base; + } + left_col[-1] = above_row[-1]; + } + + if (use_filter_intra) { + highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col, + filter_intra_mode, xd->bd); + return; + } + + if (is_dr_mode) { + int upsample_above = 0; + int upsample_left = 0; + if (!disable_edge_filter) { + const int need_right = p_angle < 90; + const int need_bottom = p_angle > 180; + const int filt_type = get_filt_type(xd, plane); + if (p_angle != 90 && p_angle != 180) { + const int ab_le = need_above_left ? 1 : 0; + if (need_above && need_left && (txwpx + txhpx >= 24)) { + filter_intra_edge_corner_high(above_row, left_col); + } + if (need_above && n_top_px > 0) { + const int strength = + intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type); + const int n_px = n_top_px + ab_le + (need_right ? 
txhpx : 0); + av1_filter_intra_edge_high(above_row - ab_le, n_px, strength); + } + if (need_left && n_left_px > 0) { + const int strength = intra_edge_filter_strength( + txhpx, txwpx, p_angle - 180, filt_type); + const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); + av1_filter_intra_edge_high(left_col - ab_le, n_px, strength); + } + } + upsample_above = + av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type); + if (need_above && upsample_above) { + const int n_px = txwpx + (need_right ? txhpx : 0); + av1_upsample_intra_edge_high(above_row, n_px, xd->bd); + } + upsample_left = + av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type); + if (need_left && upsample_left) { + const int n_px = txhpx + (need_bottom ? txwpx : 0); + av1_upsample_intra_edge_high(left_col, n_px, xd->bd); + } + } + highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col, + upsample_above, upsample_left, p_angle, xd->bd); + return; + } + + // predict + if (mode == DC_PRED) { + dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size]( + dst, dst_stride, above_row, left_col, xd->bd); + } else { + pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, xd->bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, + int ref_stride, uint8_t *dst, int dst_stride, + PREDICTION_MODE mode, int angle_delta, + FILTER_INTRA_MODE filter_intra_mode, + TX_SIZE tx_size, int disable_edge_filter, + int n_top_px, int n_topright_px, + int n_left_px, int n_bottomleft_px, + int plane) { + int i; + const uint8_t *above_ref = ref - ref_stride; + const uint8_t *left_ref = ref - 1; + DECLARE_ALIGNED(16, uint8_t, left_data[MAX_TX_SIZE * 2 + 32]); + DECLARE_ALIGNED(16, uint8_t, above_data[MAX_TX_SIZE * 2 + 32]); + uint8_t *const above_row = above_data + 16; + uint8_t *const left_col = left_data + 16; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + int need_left = extend_modes[mode] & NEED_LEFT; + int need_above = extend_modes[mode] & NEED_ABOVE; + int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; + int p_angle = 0; + const int is_dr_mode = av1_is_directional_mode(mode); + const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; + + // The default values if ref pixels are not available: + // 128 127 127 .. 127 127 127 127 127 127 + // 129 A B .. Y Z + // 129 C D .. W X + // 129 E F .. U V + // 129 G H .. S T T T T T + // .. + + if (is_dr_mode) { + p_angle = mode_to_angle_map[mode] + angle_delta; + if (p_angle <= 90) + need_above = 1, need_left = 0, need_above_left = 1; + else if (p_angle < 180) + need_above = 1, need_left = 1, need_above_left = 1; + else + need_above = 0, need_left = 1, need_above_left = 1; + } + if (use_filter_intra) need_left = need_above = need_above_left = 1; + + assert(n_top_px >= 0); + assert(n_topright_px >= 0); + assert(n_left_px >= 0); + assert(n_bottomleft_px >= 0); + + if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { + int val; + if (need_left) { + val = (n_top_px > 0) ? above_ref[0] : 129; + } else { + val = (n_left_px > 0) ? 
left_ref[0] : 127;
+    }
+    for (i = 0; i < txhpx; ++i) {
+      memset(dst, val, txwpx);
+      dst += dst_stride;
+    }
+    return;
+  }
+
+  // NEED_LEFT
+  if (need_left) {
+    int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT;
+    if (use_filter_intra) need_bottom = 0;
+    if (is_dr_mode) need_bottom = p_angle > 180;
+    // The AVX2 dr_prediction_z2 may read at most 3 extra bytes, because the
+    // AVX2 masked load works at dword granularity. Initialize 3 extra bytes
+    // here to keep valgrind from complaining.
+    const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 3);
+    i = 0;
+    if (n_left_px > 0) {
+      for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
+      if (need_bottom && n_bottomleft_px > 0) {
+        assert(i == txhpx);
+        for (; i < txhpx + n_bottomleft_px; i++)
+          left_col[i] = left_ref[i * ref_stride];
+      }
+      if (i < num_left_pixels_needed)
+        memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
+    } else {
+      if (n_top_px > 0) {
+        memset(left_col, above_ref[0], num_left_pixels_needed);
+      } else {
+        memset(left_col, 129, num_left_pixels_needed);
+      }
+    }
+  }
+
+  // NEED_ABOVE
+  if (need_above) {
+    int need_right = extend_modes[mode] & NEED_ABOVERIGHT;
+    if (use_filter_intra) need_right = 0;
+    if (is_dr_mode) need_right = p_angle < 90;
+    const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0);
+    if (n_top_px > 0) {
+      memcpy(above_row, above_ref, n_top_px);
+      i = n_top_px;
+      if (need_right && n_topright_px > 0) {
+        assert(n_top_px == txwpx);
+        memcpy(above_row + txwpx, above_ref + txwpx, n_topright_px);
+        i += n_topright_px;
+      }
+      if (i < num_top_pixels_needed)
+        memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i);
+    } else {
+      if (n_left_px > 0) {
+        memset(above_row, left_ref[0], num_top_pixels_needed);
+      } else {
+        memset(above_row, 127, num_top_pixels_needed);
+      }
+    }
+  }
+
+  if (need_above_left) {
+    if (n_top_px > 0 && n_left_px > 0) {
+      above_row[-1] = above_ref[-1];
+    } else if (n_top_px > 0) {
+      above_row[-1] = above_ref[0];
+    } else if (n_left_px > 0) {
+      above_row[-1] = left_ref[0];
+    } else {
+      above_row[-1] = 128;
+    }
+    left_col[-1] = above_row[-1];
+  }
+
+  if (use_filter_intra) {
+    av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+                               filter_intra_mode);
+    return;
+  }
+
+  if (is_dr_mode) {
+    int upsample_above = 0;
+    int upsample_left = 0;
+    if (!disable_edge_filter) {
+      const int need_right = p_angle < 90;
+      const int need_bottom = p_angle > 180;
+      const int filt_type = get_filt_type(xd, plane);
+      if (p_angle != 90 && p_angle != 180) {
+        const int ab_le = need_above_left ? 1 : 0;
+        if (need_above && need_left && (txwpx + txhpx >= 24)) {
+          filter_intra_edge_corner(above_row, left_col);
+        }
+        if (need_above && n_top_px > 0) {
+          const int strength =
+              intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type);
+          const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
+          av1_filter_intra_edge(above_row - ab_le, n_px, strength);
+        }
+        if (need_left && n_left_px > 0) {
+          const int strength = intra_edge_filter_strength(
+              txhpx, txwpx, p_angle - 180, filt_type);
+          const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
+          av1_filter_intra_edge(left_col - ab_le, n_px, strength);
+        }
+      }
+      upsample_above =
+          av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+      if (need_above && upsample_above) {
+        const int n_px = txwpx + (need_right ?
txhpx : 0); + av1_upsample_intra_edge(above_row, n_px); + } + upsample_left = + av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type); + if (need_left && upsample_left) { + const int n_px = txhpx + (need_bottom ? txwpx : 0); + av1_upsample_intra_edge(left_col, n_px); + } + } + dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above, + upsample_left, p_angle); + return; + } + + // predict + if (mode == DC_PRED) { + dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride, above_row, + left_col); + } else { + pred[mode][tx_size](dst, dst_stride, above_row, left_col); + } +} + +static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x, + int subsampling_y) { + assert(subsampling_x >= 0 && subsampling_x < 2); + assert(subsampling_y >= 0 && subsampling_y < 2); + BLOCK_SIZE bs = bsize; + switch (bsize) { + case BLOCK_4X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X4; + else if (subsampling_y == 1) + bs = BLOCK_4X8; + break; + case BLOCK_4X8: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X8; + else if (subsampling_y == 1) + bs = BLOCK_4X8; + break; + case BLOCK_8X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X4; + else if (subsampling_y == 1) + bs = BLOCK_8X8; + break; + case BLOCK_4X16: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X16; + else if (subsampling_x == 1) + bs = BLOCK_8X16; + else if (subsampling_y == 1) + bs = BLOCK_4X16; + break; + case BLOCK_16X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_16X8; + else if (subsampling_x == 1) + bs = BLOCK_16X4; + else if (subsampling_y == 1) + bs = BLOCK_16X8; + break; + default: break; + } + return bs; +} + +void av1_predict_intra_block( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx, + TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette, + FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride, int col_off, int row_off, int plane) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + const int x = col_off << MI_SIZE_LOG2; + const int y = row_off << MI_SIZE_LOG2; + + if (use_palette) { + int r, c; + const uint8_t *const map = xd->plane[plane != 0].color_index_map + + xd->color_index_map_offset[plane != 0]; + const uint16_t *const palette = + mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE; + if (is_cur_buf_hbd(xd)) { + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (r = 0; r < txhpx; ++r) { + for (c = 0; c < txwpx; ++c) { + dst16[r * dst_stride + c] = palette[map[(r + y) * wpx + c + x]]; + } + } + } else { + for (r = 0; r < txhpx; ++r) { + for (c = 0; c < txwpx; ++c) { + dst[r * dst_stride + c] = + (uint8_t)palette[map[(r + y) * wpx + c + x]]; + } + } + } + return; + } + + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int txw = tx_size_wide_unit[tx_size]; + const int txh = tx_size_high_unit[tx_size]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const int have_top = + row_off || (ss_y ? xd->chroma_up_available : xd->up_available); + const int have_left = + col_off || (ss_x ? 
xd->chroma_left_available : xd->left_available); + const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); + const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); + const int xr_chr_offset = 0; + const int yd_chr_offset = 0; + + // Distance between the right edge of this prediction block to + // the frame right edge + const int xr = + (xd->mb_to_right_edge >> (3 + ss_x)) + (wpx - x - txwpx) - xr_chr_offset; + // Distance between the bottom edge of this prediction block to + // the frame bottom edge + const int yd = + (xd->mb_to_bottom_edge >> (3 + ss_y)) + (hpx - y - txhpx) - yd_chr_offset; + const int right_available = + mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end; + const int bottom_available = + (yd > 0) && (mi_row + ((row_off + txh) << ss_y) < xd->tile.mi_row_end); + + const PARTITION_TYPE partition = mbmi->partition; + + BLOCK_SIZE bsize = mbmi->sb_type; + // force 4x4 chroma component block size. + if (ss_x || ss_y) { + bsize = scale_chroma_bsize(bsize, ss_x, ss_y); + } + + const int have_top_right = + has_top_right(cm, bsize, mi_row, mi_col, have_top, right_available, + partition, tx_size, row_off, col_off, ss_x, ss_y); + const int have_bottom_left = + has_bottom_left(cm, bsize, mi_row, mi_col, bottom_available, have_left, + partition, tx_size, row_off, col_off, ss_x, ss_y); + + const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + build_intra_predictors_high( + xd, ref, ref_stride, dst, dst_stride, mode, angle_delta, + filter_intra_mode, tx_size, disable_edge_filter, + have_top ? AOMMIN(txwpx, xr + txwpx) : 0, + have_top_right ? AOMMIN(txwpx, xr) : 0, + have_left ? AOMMIN(txhpx, yd + txhpx) : 0, + have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane); + return; + } +#endif + build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, + angle_delta, filter_intra_mode, tx_size, + disable_edge_filter, + have_top ? AOMMIN(txwpx, xr + txwpx) : 0, + have_top_right ? AOMMIN(txwpx, xr) : 0, + have_left ? AOMMIN(txhpx, yd + txhpx) : 0, + have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane); +} + +void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, int blk_col, int blk_row, + TX_SIZE tx_size) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + const PREDICTION_MODE mode = + (plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode); + const int use_palette = mbmi->palette_mode_info.palette_size[plane != 0] > 0; + const FILTER_INTRA_MODE filter_intra_mode = + (plane == AOM_PLANE_Y && mbmi->filter_intra_mode_info.use_filter_intra) + ? 
mbmi->filter_intra_mode_info.filter_intra_mode + : FILTER_INTRA_MODES; + const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP; + + if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) { +#if CONFIG_DEBUG + assert(is_cfl_allowed(xd)); + const BLOCK_SIZE plane_bsize = get_plane_block_size( + mbmi->sb_type, pd->subsampling_x, pd->subsampling_y); + (void)plane_bsize; + assert(plane_bsize < BLOCK_SIZES_ALL); + if (!xd->lossless[mbmi->segment_id]) { + assert(blk_col == 0); + assert(blk_row == 0); + assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]); + assert(block_size_high[plane_bsize] == tx_size_high[tx_size]); + } +#endif + CFL_CTX *const cfl = &xd->cfl; + CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane); + if (cfl->dc_pred_is_cached[pred_plane] == 0) { + av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode, + angle_delta, use_palette, filter_intra_mode, dst, + dst_stride, dst, dst_stride, blk_col, blk_row, + plane); + if (cfl->use_dc_pred_cache) { + cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]); + cfl->dc_pred_is_cached[pred_plane] = 1; + } + } else { + cfl_load_dc_pred(xd, dst, dst_stride, tx_size, pred_plane); + } + cfl_predict_block(xd, dst, dst_stride, tx_size, plane); + return; + } + av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode, + angle_delta, use_palette, filter_intra_mode, dst, + dst_stride, dst, dst_stride, blk_col, blk_row, plane); +} + +void av1_init_intra_predictors(void) { + aom_once(init_intra_predictors_internal); +} diff --git a/libs/libaom/src/av1/common/reconintra.h b/libs/libaom/src/av1/common/reconintra.h new file mode 100644 index 000000000..9d203569c --- /dev/null +++ b/libs/libaom/src/av1/common/reconintra.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_COMMON_RECONINTRA_H_
+#define AOM_AV1_COMMON_RECONINTRA_H_
+
+#include <stdlib.h>
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_init_intra_predictors(void);
+void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                    int plane, int blk_col, int blk_row,
+                                    TX_SIZE tx_size);
+void av1_predict_intra_block(
+    const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx,
+    TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette,
+    FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride,
+    uint8_t *dst, int dst_stride, int col_off, int row_off, int plane);
+
+// Mapping of interintra to intra mode for use in the intra component
+static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = {
+  DC_PRED, V_PRED, H_PRED, SMOOTH_PRED
+};
+
+// Mapping of intra mode to the interintra mode
+static const INTERINTRA_MODE intra_to_interintra_mode[INTRA_MODES] = {
+  II_DC_PRED, II_V_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_V_PRED,
+  II_H_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_SMOOTH_PRED
+};
+
+#define FILTER_INTRA_SCALE_BITS 4
+
+static INLINE int av1_is_directional_mode(PREDICTION_MODE mode) {
+  return mode >= V_PRED && mode <= D67_PRED;
+}
+
+static INLINE int av1_use_angle_delta(BLOCK_SIZE bsize) {
+  return bsize >= BLOCK_8X8;
+}
+
+static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) {
+  return frame_is_intra_only(cm) && cm->features.allow_screen_content_tools &&
+         cm->features.allow_intrabc;
+}
+
+static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm,
+                                                 BLOCK_SIZE bs) {
+  if (!cm->seq_params.enable_filter_intra || bs == BLOCK_INVALID) return 0;
+
+  return block_size_wide[bs] <= 32 && block_size_high[bs] <= 32;
+}
+
+static INLINE int av1_filter_intra_allowed(const AV1_COMMON *const cm,
+                                           const MB_MODE_INFO *mbmi) {
+  return mbmi->mode == DC_PRED &&
+         mbmi->palette_mode_info.palette_size[0] == 0 &&
+         av1_filter_intra_allowed_bsize(cm, mbmi->sb_type);
+}
+
+extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8];
+
+static const int16_t dr_intra_derivative[90] = {
+  // More evenly spread out angles and limited to 10-bit
+  // Values that are 0 will never be used
+  //                    Approx angle
+  0,    0, 0,        //
+  1023, 0, 0,        // 3, ...
+  547,  0, 0,        // 6, ...
+  372,  0, 0, 0, 0,  // 9, ...
+  273,  0, 0,        // 14, ...
+  215,  0, 0,        // 17, ...
+  178,  0, 0,        // 20, ...
+  151,  0, 0,        // 23, ... (113 & 203 are base angles)
+  132,  0, 0,        // 26, ...
+  116,  0, 0,        // 29, ...
+  102,  0, 0, 0,     // 32, ...
+  90,   0, 0,        // 36, ...
+  80,   0, 0,        // 39, ...
+  71,   0, 0,        // 42, ...
+  64,   0, 0,        // 45, ... (45 & 135 are base angles)
+  57,   0, 0,        // 48, ...
+  51,   0, 0,        // 51, ...
+  45,   0, 0, 0,     // 54, ...
+  40,   0, 0,        // 58, ...
+  35,   0, 0,        // 61, ...
+  31,   0, 0,        // 64, ...
+  27,   0, 0,        // 67, ... (67 & 157 are base angles)
+  23,   0, 0,        // 70, ...
+  19,   0, 0,        // 73, ...
+  15,   0, 0, 0, 0,  // 76, ...
+  11,   0, 0,        // 81, ...
+  7,    0, 0,        // 84, ...
+  3,    0, 0,        // 87, ...
+};
+
+// Get the shift (up-scaled by 256) in X w.r.t. a unit change in Y.
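+// (Here t denotes the tangent of the prediction angle; the quantized values
+// come from the dr_intra_derivative table above.)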
+// If angle > 0 && angle < 90, dx = -((int)(256 / t));
+// If angle > 90 && angle < 180, dx = (int)(256 / t);
+// If angle > 180 && angle < 270, dx = 1;
+static INLINE int av1_get_dx(int angle) {
+  if (angle > 0 && angle < 90) {
+    return dr_intra_derivative[angle];
+  } else if (angle > 90 && angle < 180) {
+    return dr_intra_derivative[180 - angle];
+  } else {
+    // In this case, we are not really going to use dx. We may return any value.
+    return 1;
+  }
+}
+
+// Get the shift (up-scaled by 256) in Y w.r.t. a unit change in X.
+// If angle > 0 && angle < 90, dy = 1;
+// If angle > 90 && angle < 180, dy = (int)(256 * t);
+// If angle > 180 && angle < 270, dy = -((int)(256 * t));
+static INLINE int av1_get_dy(int angle) {
+  if (angle > 90 && angle < 180) {
+    return dr_intra_derivative[angle - 90];
+  } else if (angle > 180 && angle < 270) {
+    return dr_intra_derivative[270 - angle];
+  } else {
+    // In this case, we are not really going to use dy. We may return any value.
+    return 1;
+  }
+}
+
+static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta,
+                                              int type) {
+  const int d = abs(delta);
+  const int blk_wh = bs0 + bs1;
+  if (d == 0 || d >= 40) return 0;
+  return type ? (blk_wh <= 8) : (blk_wh <= 16);
+}
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif  // AOM_AV1_COMMON_RECONINTRA_H_
diff --git a/libs/libaom/src/av1/common/resize.c b/libs/libaom/src/av1/common/resize.c
new file mode 100644
index 000000000..98f28f7b5
--- /dev/null
+++ b/libs/libaom/src/av1/common/resize.c
@@ -0,0 +1,1455 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#include "av1/common/common.h"
+#include "av1/common/resize.h"
+
+#include "config/aom_scale_rtcd.h"
+
+// Filters for interpolation (0.5-band) - note this also filters integer pels.
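+// (choose_interp_filter() below selects this kernel set when the resize ratio
+// out_length / in_length is below 9/16.)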
+static const InterpKernel filteredinterp_filters500[(1 << RS_SUBPEL_BITS)] = { + { -3, 0, 35, 64, 35, 0, -3, 0 }, { -3, 0, 34, 64, 36, 0, -3, 0 }, + { -3, -1, 34, 64, 36, 1, -3, 0 }, { -3, -1, 33, 64, 37, 1, -3, 0 }, + { -3, -1, 32, 64, 38, 1, -3, 0 }, { -3, -1, 31, 64, 39, 1, -3, 0 }, + { -3, -1, 31, 63, 39, 2, -3, 0 }, { -2, -2, 30, 63, 40, 2, -3, 0 }, + { -2, -2, 29, 63, 41, 2, -3, 0 }, { -2, -2, 29, 63, 41, 3, -4, 0 }, + { -2, -2, 28, 63, 42, 3, -4, 0 }, { -2, -2, 27, 63, 43, 3, -4, 0 }, + { -2, -3, 27, 63, 43, 4, -4, 0 }, { -2, -3, 26, 62, 44, 5, -4, 0 }, + { -2, -3, 25, 62, 45, 5, -4, 0 }, { -2, -3, 25, 62, 45, 5, -4, 0 }, + { -2, -3, 24, 62, 46, 5, -4, 0 }, { -2, -3, 23, 61, 47, 6, -4, 0 }, + { -2, -3, 23, 61, 47, 6, -4, 0 }, { -2, -3, 22, 61, 48, 7, -4, -1 }, + { -2, -3, 21, 60, 49, 7, -4, 0 }, { -1, -4, 20, 60, 49, 8, -4, 0 }, + { -1, -4, 20, 60, 50, 8, -4, -1 }, { -1, -4, 19, 59, 51, 9, -4, -1 }, + { -1, -4, 19, 59, 51, 9, -4, -1 }, { -1, -4, 18, 58, 52, 10, -4, -1 }, + { -1, -4, 17, 58, 52, 11, -4, -1 }, { -1, -4, 16, 58, 53, 11, -4, -1 }, + { -1, -4, 16, 57, 53, 12, -4, -1 }, { -1, -4, 15, 57, 54, 12, -4, -1 }, + { -1, -4, 15, 56, 54, 13, -4, -1 }, { -1, -4, 14, 56, 55, 13, -4, -1 }, + { -1, -4, 14, 55, 55, 14, -4, -1 }, { -1, -4, 13, 55, 56, 14, -4, -1 }, + { -1, -4, 13, 54, 56, 15, -4, -1 }, { -1, -4, 12, 54, 57, 15, -4, -1 }, + { -1, -4, 12, 53, 57, 16, -4, -1 }, { -1, -4, 11, 53, 58, 16, -4, -1 }, + { -1, -4, 11, 52, 58, 17, -4, -1 }, { -1, -4, 10, 52, 58, 18, -4, -1 }, + { -1, -4, 9, 51, 59, 19, -4, -1 }, { -1, -4, 9, 51, 59, 19, -4, -1 }, + { -1, -4, 8, 50, 60, 20, -4, -1 }, { 0, -4, 8, 49, 60, 20, -4, -1 }, + { 0, -4, 7, 49, 60, 21, -3, -2 }, { -1, -4, 7, 48, 61, 22, -3, -2 }, + { 0, -4, 6, 47, 61, 23, -3, -2 }, { 0, -4, 6, 47, 61, 23, -3, -2 }, + { 0, -4, 5, 46, 62, 24, -3, -2 }, { 0, -4, 5, 45, 62, 25, -3, -2 }, + { 0, -4, 5, 45, 62, 25, -3, -2 }, { 0, -4, 5, 44, 62, 26, -3, -2 }, + { 0, -4, 4, 43, 63, 27, -3, -2 }, { 0, -4, 3, 43, 63, 27, -2, -2 }, + { 0, -4, 3, 42, 63, 28, -2, -2 }, { 0, -4, 3, 41, 63, 29, -2, -2 }, + { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 2, 40, 63, 30, -2, -2 }, + { 0, -3, 2, 39, 63, 31, -1, -3 }, { 0, -3, 1, 39, 64, 31, -1, -3 }, + { 0, -3, 1, 38, 64, 32, -1, -3 }, { 0, -3, 1, 37, 64, 33, -1, -3 }, + { 0, -3, 1, 36, 64, 34, -1, -3 }, { 0, -3, 0, 36, 64, 34, 0, -3 }, +}; + +// Filters for interpolation (0.625-band) - note this also filters integer pels. 
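+// (Selected by choose_interp_filter() when
+// 9/16 <= out_length / in_length < 11/16.)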
+static const InterpKernel filteredinterp_filters625[(1 << RS_SUBPEL_BITS)] = { + { -1, -8, 33, 80, 33, -8, -1, 0 }, { -1, -8, 31, 80, 34, -8, -1, 1 }, + { -1, -8, 30, 80, 35, -8, -1, 1 }, { -1, -8, 29, 80, 36, -7, -2, 1 }, + { -1, -8, 28, 80, 37, -7, -2, 1 }, { -1, -8, 27, 80, 38, -7, -2, 1 }, + { 0, -8, 26, 79, 39, -7, -2, 1 }, { 0, -8, 25, 79, 40, -7, -2, 1 }, + { 0, -8, 24, 79, 41, -7, -2, 1 }, { 0, -8, 23, 78, 42, -6, -2, 1 }, + { 0, -8, 22, 78, 43, -6, -2, 1 }, { 0, -8, 21, 78, 44, -6, -2, 1 }, + { 0, -8, 20, 78, 45, -5, -3, 1 }, { 0, -8, 19, 77, 47, -5, -3, 1 }, + { 0, -8, 18, 77, 48, -5, -3, 1 }, { 0, -8, 17, 77, 49, -5, -3, 1 }, + { 0, -8, 16, 76, 50, -4, -3, 1 }, { 0, -8, 15, 76, 51, -4, -3, 1 }, + { 0, -8, 15, 75, 52, -3, -4, 1 }, { 0, -7, 14, 74, 53, -3, -4, 1 }, + { 0, -7, 13, 74, 54, -3, -4, 1 }, { 0, -7, 12, 73, 55, -2, -4, 1 }, + { 0, -7, 11, 73, 56, -2, -4, 1 }, { 0, -7, 10, 72, 57, -1, -4, 1 }, + { 1, -7, 10, 71, 58, -1, -5, 1 }, { 0, -7, 9, 71, 59, 0, -5, 1 }, + { 1, -7, 8, 70, 60, 0, -5, 1 }, { 1, -7, 7, 69, 61, 1, -5, 1 }, + { 1, -6, 6, 68, 62, 1, -5, 1 }, { 0, -6, 6, 68, 62, 2, -5, 1 }, + { 1, -6, 5, 67, 63, 2, -5, 1 }, { 1, -6, 5, 66, 64, 3, -6, 1 }, + { 1, -6, 4, 65, 65, 4, -6, 1 }, { 1, -6, 3, 64, 66, 5, -6, 1 }, + { 1, -5, 2, 63, 67, 5, -6, 1 }, { 1, -5, 2, 62, 68, 6, -6, 0 }, + { 1, -5, 1, 62, 68, 6, -6, 1 }, { 1, -5, 1, 61, 69, 7, -7, 1 }, + { 1, -5, 0, 60, 70, 8, -7, 1 }, { 1, -5, 0, 59, 71, 9, -7, 0 }, + { 1, -5, -1, 58, 71, 10, -7, 1 }, { 1, -4, -1, 57, 72, 10, -7, 0 }, + { 1, -4, -2, 56, 73, 11, -7, 0 }, { 1, -4, -2, 55, 73, 12, -7, 0 }, + { 1, -4, -3, 54, 74, 13, -7, 0 }, { 1, -4, -3, 53, 74, 14, -7, 0 }, + { 1, -4, -3, 52, 75, 15, -8, 0 }, { 1, -3, -4, 51, 76, 15, -8, 0 }, + { 1, -3, -4, 50, 76, 16, -8, 0 }, { 1, -3, -5, 49, 77, 17, -8, 0 }, + { 1, -3, -5, 48, 77, 18, -8, 0 }, { 1, -3, -5, 47, 77, 19, -8, 0 }, + { 1, -3, -5, 45, 78, 20, -8, 0 }, { 1, -2, -6, 44, 78, 21, -8, 0 }, + { 1, -2, -6, 43, 78, 22, -8, 0 }, { 1, -2, -6, 42, 78, 23, -8, 0 }, + { 1, -2, -7, 41, 79, 24, -8, 0 }, { 1, -2, -7, 40, 79, 25, -8, 0 }, + { 1, -2, -7, 39, 79, 26, -8, 0 }, { 1, -2, -7, 38, 80, 27, -8, -1 }, + { 1, -2, -7, 37, 80, 28, -8, -1 }, { 1, -2, -7, 36, 80, 29, -8, -1 }, + { 1, -1, -8, 35, 80, 30, -8, -1 }, { 1, -1, -8, 34, 80, 31, -8, -1 }, +}; + +// Filters for interpolation (0.75-band) - note this also filters integer pels. 
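+// (Selected by choose_interp_filter() when
+// 11/16 <= out_length / in_length < 13/16.)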
+static const InterpKernel filteredinterp_filters750[(1 << RS_SUBPEL_BITS)] = { + { 2, -11, 25, 96, 25, -11, 2, 0 }, { 2, -11, 24, 96, 26, -11, 2, 0 }, + { 2, -11, 22, 96, 28, -11, 2, 0 }, { 2, -10, 21, 96, 29, -12, 2, 0 }, + { 2, -10, 19, 96, 31, -12, 2, 0 }, { 2, -10, 18, 95, 32, -11, 2, 0 }, + { 2, -10, 17, 95, 34, -12, 2, 0 }, { 2, -9, 15, 95, 35, -12, 2, 0 }, + { 2, -9, 14, 94, 37, -12, 2, 0 }, { 2, -9, 13, 94, 38, -12, 2, 0 }, + { 2, -8, 12, 93, 40, -12, 1, 0 }, { 2, -8, 11, 93, 41, -12, 1, 0 }, + { 2, -8, 9, 92, 43, -12, 1, 1 }, { 2, -8, 8, 92, 44, -12, 1, 1 }, + { 2, -7, 7, 91, 46, -12, 1, 0 }, { 2, -7, 6, 90, 47, -12, 1, 1 }, + { 2, -7, 5, 90, 49, -12, 1, 0 }, { 2, -6, 4, 89, 50, -12, 1, 0 }, + { 2, -6, 3, 88, 52, -12, 0, 1 }, { 2, -6, 2, 87, 54, -12, 0, 1 }, + { 2, -5, 1, 86, 55, -12, 0, 1 }, { 2, -5, 0, 85, 57, -12, 0, 1 }, + { 2, -5, -1, 84, 58, -11, 0, 1 }, { 2, -5, -2, 83, 60, -11, 0, 1 }, + { 2, -4, -2, 82, 61, -11, -1, 1 }, { 1, -4, -3, 81, 63, -10, -1, 1 }, + { 2, -4, -4, 80, 64, -10, -1, 1 }, { 1, -4, -4, 79, 66, -10, -1, 1 }, + { 1, -3, -5, 77, 67, -9, -1, 1 }, { 1, -3, -6, 76, 69, -9, -1, 1 }, + { 1, -3, -6, 75, 70, -8, -2, 1 }, { 1, -2, -7, 74, 71, -8, -2, 1 }, + { 1, -2, -7, 72, 72, -7, -2, 1 }, { 1, -2, -8, 71, 74, -7, -2, 1 }, + { 1, -2, -8, 70, 75, -6, -3, 1 }, { 1, -1, -9, 69, 76, -6, -3, 1 }, + { 1, -1, -9, 67, 77, -5, -3, 1 }, { 1, -1, -10, 66, 79, -4, -4, 1 }, + { 1, -1, -10, 64, 80, -4, -4, 2 }, { 1, -1, -10, 63, 81, -3, -4, 1 }, + { 1, -1, -11, 61, 82, -2, -4, 2 }, { 1, 0, -11, 60, 83, -2, -5, 2 }, + { 1, 0, -11, 58, 84, -1, -5, 2 }, { 1, 0, -12, 57, 85, 0, -5, 2 }, + { 1, 0, -12, 55, 86, 1, -5, 2 }, { 1, 0, -12, 54, 87, 2, -6, 2 }, + { 1, 0, -12, 52, 88, 3, -6, 2 }, { 0, 1, -12, 50, 89, 4, -6, 2 }, + { 0, 1, -12, 49, 90, 5, -7, 2 }, { 1, 1, -12, 47, 90, 6, -7, 2 }, + { 0, 1, -12, 46, 91, 7, -7, 2 }, { 1, 1, -12, 44, 92, 8, -8, 2 }, + { 1, 1, -12, 43, 92, 9, -8, 2 }, { 0, 1, -12, 41, 93, 11, -8, 2 }, + { 0, 1, -12, 40, 93, 12, -8, 2 }, { 0, 2, -12, 38, 94, 13, -9, 2 }, + { 0, 2, -12, 37, 94, 14, -9, 2 }, { 0, 2, -12, 35, 95, 15, -9, 2 }, + { 0, 2, -12, 34, 95, 17, -10, 2 }, { 0, 2, -11, 32, 95, 18, -10, 2 }, + { 0, 2, -12, 31, 96, 19, -10, 2 }, { 0, 2, -12, 29, 96, 21, -10, 2 }, + { 0, 2, -11, 28, 96, 22, -11, 2 }, { 0, 2, -11, 26, 96, 24, -11, 2 }, +}; + +// Filters for interpolation (0.875-band) - note this also filters integer pels. 
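+// (Selected by choose_interp_filter() when 13/16 <= out_length / in_length
+// < 1; ratios of 1 or more use the full-band normative filter.)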
+static const InterpKernel filteredinterp_filters875[(1 << RS_SUBPEL_BITS)] = { + { 3, -8, 13, 112, 13, -8, 3, 0 }, { 2, -7, 12, 112, 15, -8, 3, -1 }, + { 3, -7, 10, 112, 17, -9, 3, -1 }, { 2, -6, 8, 112, 19, -9, 3, -1 }, + { 2, -6, 7, 112, 21, -10, 3, -1 }, { 2, -5, 6, 111, 22, -10, 3, -1 }, + { 2, -5, 4, 111, 24, -10, 3, -1 }, { 2, -4, 3, 110, 26, -11, 3, -1 }, + { 2, -4, 1, 110, 28, -11, 3, -1 }, { 2, -4, 0, 109, 30, -12, 4, -1 }, + { 1, -3, -1, 108, 32, -12, 4, -1 }, { 1, -3, -2, 108, 34, -13, 4, -1 }, + { 1, -2, -4, 107, 36, -13, 4, -1 }, { 1, -2, -5, 106, 38, -13, 4, -1 }, + { 1, -1, -6, 105, 40, -14, 4, -1 }, { 1, -1, -7, 104, 42, -14, 4, -1 }, + { 1, -1, -7, 103, 44, -15, 4, -1 }, { 1, 0, -8, 101, 46, -15, 4, -1 }, + { 1, 0, -9, 100, 48, -15, 4, -1 }, { 1, 0, -10, 99, 50, -15, 4, -1 }, + { 1, 1, -11, 97, 53, -16, 4, -1 }, { 0, 1, -11, 96, 55, -16, 4, -1 }, + { 0, 1, -12, 95, 57, -16, 4, -1 }, { 0, 2, -13, 93, 59, -16, 4, -1 }, + { 0, 2, -13, 91, 61, -16, 4, -1 }, { 0, 2, -14, 90, 63, -16, 4, -1 }, + { 0, 2, -14, 88, 65, -16, 4, -1 }, { 0, 2, -15, 86, 67, -16, 4, 0 }, + { 0, 3, -15, 84, 69, -17, 4, 0 }, { 0, 3, -16, 83, 71, -17, 4, 0 }, + { 0, 3, -16, 81, 73, -16, 3, 0 }, { 0, 3, -16, 79, 75, -16, 3, 0 }, + { 0, 3, -16, 77, 77, -16, 3, 0 }, { 0, 3, -16, 75, 79, -16, 3, 0 }, + { 0, 3, -16, 73, 81, -16, 3, 0 }, { 0, 4, -17, 71, 83, -16, 3, 0 }, + { 0, 4, -17, 69, 84, -15, 3, 0 }, { 0, 4, -16, 67, 86, -15, 2, 0 }, + { -1, 4, -16, 65, 88, -14, 2, 0 }, { -1, 4, -16, 63, 90, -14, 2, 0 }, + { -1, 4, -16, 61, 91, -13, 2, 0 }, { -1, 4, -16, 59, 93, -13, 2, 0 }, + { -1, 4, -16, 57, 95, -12, 1, 0 }, { -1, 4, -16, 55, 96, -11, 1, 0 }, + { -1, 4, -16, 53, 97, -11, 1, 1 }, { -1, 4, -15, 50, 99, -10, 0, 1 }, + { -1, 4, -15, 48, 100, -9, 0, 1 }, { -1, 4, -15, 46, 101, -8, 0, 1 }, + { -1, 4, -15, 44, 103, -7, -1, 1 }, { -1, 4, -14, 42, 104, -7, -1, 1 }, + { -1, 4, -14, 40, 105, -6, -1, 1 }, { -1, 4, -13, 38, 106, -5, -2, 1 }, + { -1, 4, -13, 36, 107, -4, -2, 1 }, { -1, 4, -13, 34, 108, -2, -3, 1 }, + { -1, 4, -12, 32, 108, -1, -3, 1 }, { -1, 4, -12, 30, 109, 0, -4, 2 }, + { -1, 3, -11, 28, 110, 1, -4, 2 }, { -1, 3, -11, 26, 110, 3, -4, 2 }, + { -1, 3, -10, 24, 111, 4, -5, 2 }, { -1, 3, -10, 22, 111, 6, -5, 2 }, + { -1, 3, -10, 21, 112, 7, -6, 2 }, { -1, 3, -9, 19, 112, 8, -6, 2 }, + { -1, 3, -9, 17, 112, 10, -7, 3 }, { -1, 3, -8, 15, 112, 12, -7, 2 }, +}; + +const int16_t av1_resize_filter_normative[( + 1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = { +#if UPSCALE_NORMATIVE_TAPS == 8 + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 }, + { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 }, + { 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 }, + { -1, 2, -8, 125, 13, -5, 2, 0 }, { -1, 3, -9, 124, 15, -6, 2, 0 }, + { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 3, -11, 122, 20, -7, 3, -1 }, + { -1, 4, -12, 121, 22, -8, 3, -1 }, { -1, 4, -13, 120, 25, -9, 3, -1 }, + { -1, 4, -14, 118, 28, -9, 3, -1 }, { -1, 4, -15, 117, 30, -10, 4, -1 }, + { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 }, + { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 }, + { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 }, + { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 }, + { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 }, + { -1, 6, -20, 97, 58, -17, 6, -1 }, { -1, 6, -20, 95, 61, -18, 6, -1 }, + { -2, 7, -20, 93, 64, -18, 6, -2 }, { -2, 7, -20, 91, 66, -19, 6, -1 }, + { -2, 7, -20, 
88, 69, -19, 6, -1 }, { -2, 7, -20, 86, 71, -19, 6, -1 }, + { -2, 7, -20, 84, 74, -20, 7, -2 }, { -2, 7, -20, 81, 76, -20, 7, -1 }, + { -2, 7, -20, 79, 79, -20, 7, -2 }, { -1, 7, -20, 76, 81, -20, 7, -2 }, + { -2, 7, -20, 74, 84, -20, 7, -2 }, { -1, 6, -19, 71, 86, -20, 7, -2 }, + { -1, 6, -19, 69, 88, -20, 7, -2 }, { -1, 6, -19, 66, 91, -20, 7, -2 }, + { -2, 6, -18, 64, 93, -20, 7, -2 }, { -1, 6, -18, 61, 95, -20, 6, -1 }, + { -1, 6, -17, 58, 97, -20, 6, -1 }, { -1, 6, -17, 56, 99, -20, 6, -1 }, + { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 }, + { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 }, + { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 }, + { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 }, + { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 }, + { -1, 3, -9, 28, 118, -14, 4, -1 }, { -1, 3, -9, 25, 120, -13, 4, -1 }, + { -1, 3, -8, 22, 121, -12, 4, -1 }, { -1, 3, -7, 20, 122, -11, 3, -1 }, + { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 2, -6, 15, 124, -9, 3, -1 }, + { 0, 2, -5, 13, 125, -8, 2, -1 }, { 0, 1, -4, 11, 125, -7, 2, 0 }, + { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -3, 6, 127, -4, 1, 0 }, + { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 }, +#else +#error "Invalid value of UPSCALE_NORMATIVE_TAPS" +#endif // UPSCALE_NORMATIVE_TAPS == 8 +}; + +// Filters for interpolation (full-band) - no filtering for integer pixels +#define filteredinterp_filters1000 av1_resize_filter_normative + +// Filters for factor of 2 downsampling. +static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; +static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; + +static const InterpKernel *choose_interp_filter(int in_length, int out_length) { + int out_length16 = out_length * 16; + if (out_length16 >= in_length * 16) + return filteredinterp_filters1000; + else if (out_length16 >= in_length * 13) + return filteredinterp_filters875; + else if (out_length16 >= in_length * 11) + return filteredinterp_filters750; + else if (out_length16 >= in_length * 9) + return filteredinterp_filters625; + else + return filteredinterp_filters500; +} + +static void interpolate_core(const uint8_t *const input, int in_length, + uint8_t *output, int out_length, + const int16_t *interp_filters, int interp_taps) { + const int32_t delta = + (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / + out_length; + const int32_t offset = + in_length > out_length + ? 
(((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + : -(((int32_t)(out_length - in_length) + << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length; + uint8_t *optr = output; + int x, x1, x2, sum, k, int_pel, sub_pel; + int32_t y; + + x = 0; + y = offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { + x++; + y += delta; + } + x1 = x; + x = out_length - 1; + y = delta * x + offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= + in_length) { + x--; + y -= delta; + } + x2 = x; + if (x1 > x2) { + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; + ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) { + const int pk = int_pel - interp_taps / 2 + 1 + k; + sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; + } + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + } else { + // Initial part. + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + // Middle part. + for (; x <= x2; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + // End part. + for (; x < out_length; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * + input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + } +} + +static void interpolate_core_double_prec(const double *const input, + int in_length, double *output, + int out_length, + const int16_t *interp_filters, + int interp_taps) { + const int32_t delta = + (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / + out_length; + const int32_t offset = + in_length > out_length + ? 
(((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + : -(((int32_t)(out_length - in_length) + << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length; + double *optr = output; + int x, x1, x2, k, int_pel, sub_pel; + double sum; + int32_t y; + + x = 0; + y = offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { + x++; + y += delta; + } + x1 = x; + x = out_length - 1; + y = delta * x + offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= + in_length) { + x--; + y -= delta; + } + x2 = x; + if (x1 > x2) { + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; + ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) { + const int pk = int_pel - interp_taps / 2 + 1 + k; + sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; + } + *optr++ = sum / (1 << FILTER_BITS); + } + } else { + // Initial part. + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)]; + *optr++ = sum / (1 << FILTER_BITS); + } + // Middle part. + for (; x <= x2; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k]; + *optr++ = sum / (1 << FILTER_BITS); + } + // End part. 
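+    // (AOMMIN below repeats the last input sample for taps that would read
+    // past in_length - 1, mirroring the AOMMAX clamp used in the initial
+    // part.)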
+ for (; x < out_length; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * + input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; + *optr++ = sum / (1 << FILTER_BITS); + } + } +} + +static void interpolate(const uint8_t *const input, int in_length, + uint8_t *output, int out_length) { + const InterpKernel *interp_filters = + choose_interp_filter(in_length, out_length); + + interpolate_core(input, in_length, output, out_length, &interp_filters[0][0], + SUBPEL_TAPS); +} + +static void interpolate_double_prec(const double *const input, int in_length, + double *output, int out_length) { + const InterpKernel *interp_filters = + choose_interp_filter(in_length, out_length); + + interpolate_core_double_prec(input, in_length, output, out_length, + &interp_filters[0][0], SUBPEL_TAPS); +} + +int32_t av1_get_upscale_convolve_step(int in_length, int out_length) { + return ((in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length; +} + +static int32_t get_upscale_convolve_x0(int in_length, int out_length, + int32_t x_step_qn) { + const int err = out_length * x_step_qn - (in_length << RS_SCALE_SUBPEL_BITS); + const int32_t x0 = + (-((out_length - in_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + + RS_SCALE_EXTRA_OFF - err / 2; + return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK); +} + +static void down2_symeven(const uint8_t *const input, int length, + uint8_t *output) { + // Actual filter len = 2 * filter_len_half. + const int16_t *filter = av1_down2_symeven_half_filter; + const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2; + int i, j; + uint8_t *optr = output; + int l1 = filter_len_half; + int l2 = (length - filter_len_half); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += + (input[AOMMAX(i - j, 0)] + input[AOMMIN(i + 1 + j, length - 1)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[AOMMAX(i - j, 0)] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += + (input[i - j] + input[AOMMIN(i + 1 + j, length - 1)]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } +} + +static void down2_symodd(const uint8_t *const input, int length, + uint8_t *output) { + // Actual filter len = 2 * filter_len_half - 1. + const int16_t *filter = av1_down2_symodd_half_filter; + const int filter_len_half = sizeof(av1_down2_symodd_half_filter) / 2; + int i, j; + uint8_t *optr = output; + int l1 = filter_len_half - 1; + int l2 = (length - filter_len_half + 1); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. 
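+    // (The input is too short to have an unclamped middle section, so both
+    // edge clamps are applied to every output sample.)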
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + j >= length ? length - 1 : i + j)]) *
+               filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  } else {
+    // Initial part.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // Middle part.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // End part.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) *
+               filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  }
+}
+
+static int get_down2_length(int length, int steps) {
+  for (int s = 0; s < steps; ++s) length = (length + 1) >> 1;
+  return length;
+}
+
+static int get_down2_steps(int in_length, int out_length) {
+  int steps = 0;
+  int proj_in_length;
+  while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) {
+    ++steps;
+    in_length = proj_in_length;
+    if (in_length == 1) {
+      // Special case: we break because any further calls to
+      // get_down2_length() will be with length == 1, which returns 1,
+      // resulting in an infinite loop.
+      break;
+    }
+  }
+  return steps;
+}
+
+static void resize_multistep(const uint8_t *const input, int length,
+                             uint8_t *output, int olength, uint8_t *otmp) {
+  if (length == olength) {
+    memcpy(output, input, sizeof(output[0]) * length);
+    return;
+  }
+  const int steps = get_down2_steps(length, olength);
+
+  if (steps > 0) {
+    uint8_t *out = NULL;
+    int filteredlength = length;
+
+    assert(otmp != NULL);
+    uint8_t *otmp2 = otmp + get_down2_length(length, 1);
+    for (int s = 0; s < steps; ++s) {
+      const int proj_filteredlength = get_down2_length(filteredlength, 1);
+      const uint8_t *const in = (s == 0 ? input : out);
+      if (s == steps - 1 && proj_filteredlength == olength)
+        out = output;
+      else
+        out = (s & 1 ?
otmp2 : otmp); + if (filteredlength & 1) + down2_symodd(in, filteredlength, out); + else + down2_symeven(in, filteredlength, out); + filteredlength = proj_filteredlength; + } + if (filteredlength != olength) { + interpolate(out, filteredlength, output, olength); + } + } else { + interpolate(input, length, output, olength); + } +} + +static void upscale_multistep_double_prec(const double *const input, int length, + double *output, int olength) { + assert(length < olength); + interpolate_double_prec(input, length, output, olength); +} + +static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) { + int i; + uint8_t *iptr = img; + uint8_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *aptr++ = *iptr; + } +} + +static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) { + int i; + uint8_t *iptr = img; + uint8_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *iptr = *aptr++; + } +} + +static void fill_col_to_arr_double_prec(double *img, int stride, int len, + double *arr) { + int i; + double *iptr = img; + double *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *aptr++ = *iptr; + } +} + +static void fill_arr_to_col_double_prec(double *img, int stride, int len, + double *arr) { + int i; + double *iptr = img; + double *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *iptr = *aptr++; + } +} + +void av1_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, int width2, + int out_stride) { + int i; + uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * width2 * height); + uint8_t *tmpbuf = + (uint8_t *)aom_malloc(sizeof(uint8_t) * AOMMAX(width, height)); + uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * height); + uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(uint8_t) * height2); + if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) + goto Error; + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + for (i = 0; i < height; ++i) + resize_multistep(input + in_stride * i, width, intbuf + width2 * i, width2, + tmpbuf); + for (i = 0; i < width2; ++i) { + fill_col_to_arr(intbuf + i, width2, height, arrbuf); + resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf); + fill_arr_to_col(output + i, out_stride, height2, arrbuf2); + } + +Error: + aom_free(intbuf); + aom_free(tmpbuf); + aom_free(arrbuf); + aom_free(arrbuf2); +} + +void av1_upscale_plane_double_prec(const double *const input, int height, + int width, int in_stride, double *output, + int height2, int width2, int out_stride) { + int i; + double *intbuf = (double *)aom_malloc(sizeof(double) * width2 * height); + double *arrbuf = (double *)aom_malloc(sizeof(double) * height); + double *arrbuf2 = (double *)aom_malloc(sizeof(double) * height2); + if (intbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error; + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + for (i = 0; i < height; ++i) + upscale_multistep_double_prec(input + in_stride * i, width, + intbuf + width2 * i, width2); + for (i = 0; i < width2; ++i) { + fill_col_to_arr_double_prec(intbuf + i, width2, height, arrbuf); + upscale_multistep_double_prec(arrbuf, height, arrbuf2, height2); + fill_arr_to_col_double_prec(output + i, out_stride, height2, arrbuf2); + } + +Error: + aom_free(intbuf); + aom_free(arrbuf); + aom_free(arrbuf2); +} + +static void upscale_normative_rect(const uint8_t *const input, int height, + int width, int 
in_stride, uint8_t *output, + int height2, int width2, int out_stride, + int x_step_qn, int x0_qn, int pad_left, + int pad_right) { + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + assert(height2 == height); + + // Extend the left/right pixels of the tile column if needed + // (either because we can't sample from other tiles, or because we're at + // a frame edge). + // Save the overwritten pixels into tmp_left and tmp_right. + // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra + // column of border pixels compared to what we'd naively think. + const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1; + uint8_t *tmp_left = + NULL; // Silence spurious "may be used uninitialized" warnings + uint8_t *tmp_right = NULL; + uint8_t *const in_tl = (uint8_t *)(input - border_cols); // Cast off 'const' + uint8_t *const in_tr = (uint8_t *)(input + width); + if (pad_left) { + tmp_left = (uint8_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height); + for (int i = 0; i < height; i++) { + memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_cols); + memset(in_tl + i * in_stride, input[i * in_stride], border_cols); + } + } + if (pad_right) { + tmp_right = + (uint8_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height); + for (int i = 0; i < height; i++) { + memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_cols); + memset(in_tr + i * in_stride, input[i * in_stride + width - 1], + border_cols); + } + } + + av1_convolve_horiz_rs(input - 1, in_stride, output, out_stride, width2, + height2, &av1_resize_filter_normative[0][0], x0_qn, + x_step_qn); + + // Restore the left/right border pixels + if (pad_left) { + for (int i = 0; i < height; i++) { + memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_cols); + } + aom_free(tmp_left); + } + if (pad_right) { + for (int i = 0; i < height; i++) { + memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_cols); + } + aom_free(tmp_right); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_interpolate_core(const uint16_t *const input, int in_length, + uint16_t *output, int out_length, int bd, + const int16_t *interp_filters, + int interp_taps) { + const int32_t delta = + (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / + out_length; + const int32_t offset = + in_length > out_length + ? 
(((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + : -(((int32_t)(out_length - in_length) + << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length; + uint16_t *optr = output; + int x, x1, x2, sum, k, int_pel, sub_pel; + int32_t y; + + x = 0; + y = offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { + x++; + y += delta; + } + x1 = x; + x = out_length - 1; + y = delta * x + offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= + in_length) { + x--; + y -= delta; + } + x2 = x; + if (x1 > x2) { + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; + ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) { + const int pk = int_pel - interp_taps / 2 + 1 + k; + sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; + } + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + } else { + // Initial part. + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)]; + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + // Middle part. + for (; x <= x2; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k]; + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + // End part. + for (; x < out_length; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * + input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + } +} + +static void highbd_interpolate(const uint16_t *const input, int in_length, + uint16_t *output, int out_length, int bd) { + const InterpKernel *interp_filters = + choose_interp_filter(in_length, out_length); + + highbd_interpolate_core(input, in_length, output, out_length, bd, + &interp_filters[0][0], SUBPEL_TAPS); +} + +static void highbd_down2_symeven(const uint16_t *const input, int length, + uint16_t *output, int bd) { + // Actual filter len = 2 * filter_len_half. + static const int16_t *filter = av1_down2_symeven_half_filter; + const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2; + int i, j; + uint16_t *optr = output; + int l1 = filter_len_half; + int l2 = (length - filter_len_half); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += + (input[AOMMAX(0, i - j)] + input[AOMMIN(i + 1 + j, length - 1)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } else { + // Initial part. 
+ for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[AOMMAX(0, i - j)] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += + (input[i - j] + input[AOMMIN(i + 1 + j, length - 1)]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } +} + +static void highbd_down2_symodd(const uint16_t *const input, int length, + uint16_t *output, int bd) { + // Actual filter len = 2 * filter_len_half - 1. + static const int16_t *filter = av1_down2_symodd_half_filter; + const int filter_len_half = sizeof(av1_down2_symodd_half_filter) / 2; + int i, j; + uint16_t *optr = output; + int l1 = filter_len_half - 1; + int l2 = (length - filter_len_half + 1); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[AOMMAX(i - j, 0)] + input[AOMMIN(i + j, length - 1)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[AOMMAX(i - j, 0)] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[AOMMIN(i + j, length - 1)]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } +} + +static void highbd_resize_multistep(const uint16_t *const input, int length, + uint16_t *output, int olength, + uint16_t *otmp, int bd) { + if (length == olength) { + memcpy(output, input, sizeof(output[0]) * length); + return; + } + const int steps = get_down2_steps(length, olength); + + if (steps > 0) { + uint16_t *out = NULL; + int filteredlength = length; + + assert(otmp != NULL); + uint16_t *otmp2 = otmp + get_down2_length(length, 1); + for (int s = 0; s < steps; ++s) { + const int proj_filteredlength = get_down2_length(filteredlength, 1); + const uint16_t *const in = (s == 0 ? input : out); + if (s == steps - 1 && proj_filteredlength == olength) + out = output; + else + out = (s & 1 ? 
otmp2 : otmp); + if (filteredlength & 1) + highbd_down2_symodd(in, filteredlength, out, bd); + else + highbd_down2_symeven(in, filteredlength, out, bd); + filteredlength = proj_filteredlength; + } + if (filteredlength != olength) { + highbd_interpolate(out, filteredlength, output, olength, bd); + } + } else { + highbd_interpolate(input, length, output, olength, bd); + } +} + +static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len, + uint16_t *arr) { + int i; + uint16_t *iptr = img; + uint16_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *aptr++ = *iptr; + } +} + +static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len, + uint16_t *arr) { + int i; + uint16_t *iptr = img; + uint16_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *iptr = *aptr++; + } +} + +void av1_highbd_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride, int bd) { + int i; + uint16_t *intbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * width2 * height); + uint16_t *tmpbuf = + (uint16_t *)aom_malloc(sizeof(uint16_t) * AOMMAX(width, height)); + uint16_t *arrbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * height); + uint16_t *arrbuf2 = (uint16_t *)aom_malloc(sizeof(uint16_t) * height2); + if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) + goto Error; + for (i = 0; i < height; ++i) { + highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width, + intbuf + width2 * i, width2, tmpbuf, bd); + } + for (i = 0; i < width2; ++i) { + highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf); + highbd_resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf, bd); + highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2, + arrbuf2); + } + +Error: + aom_free(intbuf); + aom_free(tmpbuf); + aom_free(arrbuf); + aom_free(arrbuf2); +} + +static void highbd_upscale_normative_rect(const uint8_t *const input, + int height, int width, int in_stride, + uint8_t *output, int height2, + int width2, int out_stride, + int x_step_qn, int x0_qn, + int pad_left, int pad_right, int bd) { + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + assert(height2 == height); + + // Extend the left/right pixels of the tile column if needed + // (either because we can't sample from other tiles, or because we're at + // a frame edge). + // Save the overwritten pixels into tmp_left and tmp_right. + // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra + // column of border pixels compared to what we'd naively think. 
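+  // (With UPSCALE_NORMATIVE_TAPS == 8, this works out to 8 / 2 + 1 = 5
+  // border columns on each side.)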
+ const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1; + const int border_size = border_cols * sizeof(uint16_t); + uint16_t *tmp_left = + NULL; // Silence spurious "may be used uninitialized" warnings + uint16_t *tmp_right = NULL; + uint16_t *const input16 = CONVERT_TO_SHORTPTR(input); + uint16_t *const in_tl = input16 - border_cols; + uint16_t *const in_tr = input16 + width; + if (pad_left) { + tmp_left = (uint16_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height); + for (int i = 0; i < height; i++) { + memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_size); + aom_memset16(in_tl + i * in_stride, input16[i * in_stride], border_cols); + } + } + if (pad_right) { + tmp_right = + (uint16_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height); + for (int i = 0; i < height; i++) { + memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_size); + aom_memset16(in_tr + i * in_stride, input16[i * in_stride + width - 1], + border_cols); + } + } + + av1_highbd_convolve_horiz_rs(CONVERT_TO_SHORTPTR(input - 1), in_stride, + CONVERT_TO_SHORTPTR(output), out_stride, width2, + height2, &av1_resize_filter_normative[0][0], + x0_qn, x_step_qn, bd); + + // Restore the left/right border pixels + if (pad_left) { + for (int i = 0; i < height; i++) { + memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_size); + } + aom_free(tmp_left); + } + if (pad_right) { + for (int i = 0; i < height; i++) { + memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_size); + } + aom_free(tmp_right); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void av1_resize_frame420(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth) { + av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); + av1_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, + owidth / 2, ouv_stride); + av1_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, + owidth / 2, ouv_stride); +} + +void av1_resize_frame422(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth) { + av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); + av1_resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2, + ouv_stride); + av1_resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2, + ouv_stride); +} + +void av1_resize_frame444(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth) { + av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); + av1_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, + ouv_stride); + av1_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, + ouv_stride); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd) { + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, 
bd); + av1_highbd_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, + owidth / 2, ouv_stride, bd); + av1_highbd_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, + owidth / 2, ouv_stride, bd); +} + +void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd) { + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + av1_highbd_resize_plane(u, height, width / 2, uv_stride, ou, oheight, + owidth / 2, ouv_stride, bd); + av1_highbd_resize_plane(v, height, width / 2, uv_stride, ov, oheight, + owidth / 2, ouv_stride, bd); +} + +void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd) { + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + av1_highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, + ouv_stride, bd); + av1_highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, + ouv_stride, bd); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd, + const int num_planes) { + // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t + + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. + for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { + const int is_uv = i > 0; +#if CONFIG_AV1_HIGHBITDEPTH + if (src->flags & YV12_FLAG_HIGHBITDEPTH) + av1_highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv], bd); + else + av1_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv]); +#else + (void)bd; + av1_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv]); +#endif + } + aom_extend_frame_borders(dst, num_planes); +} + +void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int plane, int rows) { + const int is_uv = (plane > 0); + const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x); + const int upscaled_plane_width = + ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x); + const int superres_denom = cm->superres_scale_denominator; + + TileInfo tile_col; + const int32_t x_step_qn = av1_get_upscale_convolve_step( + downscaled_plane_width, upscaled_plane_width); + int32_t x0_qn = get_upscale_convolve_x0(downscaled_plane_width, + upscaled_plane_width, x_step_qn); + + for (int j = 0; j < cm->tiles.cols; j++) { + av1_tile_set_col(&tile_col, cm, j); + // Determine the limits of this tile column in both the source + // and destination images. 
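+    // For example, with SCALE_NUMERATOR == 8 and superres_denom == 16 (a 2x
+    // upscale), x_step_qn is close to (1 << RS_SCALE_SUBPEL_BITS) / 2, i.e.
+    // the source position advances by about half a pixel per output pixel.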
+ // Note: The actual location which we start sampling from is + // (downscaled_x0 - 1 + (x0_qn/2^14)), and this quantity increases + // by exactly dst_width * (x_step_qn/2^14) pixels each iteration. + const int downscaled_x0 = tile_col.mi_col_start << (MI_SIZE_LOG2 - ss_x); + const int downscaled_x1 = tile_col.mi_col_end << (MI_SIZE_LOG2 - ss_x); + const int src_width = downscaled_x1 - downscaled_x0; + + const int upscaled_x0 = (downscaled_x0 * superres_denom) / SCALE_NUMERATOR; + int upscaled_x1; + if (j == cm->tiles.cols - 1) { + // Note that we can't just use AOMMIN here - due to rounding, + // (downscaled_x1 * superres_denom) / SCALE_NUMERATOR may be less than + // upscaled_plane_width. + upscaled_x1 = upscaled_plane_width; + } else { + upscaled_x1 = (downscaled_x1 * superres_denom) / SCALE_NUMERATOR; + } + + const uint8_t *const src_ptr = src + downscaled_x0; + uint8_t *const dst_ptr = dst + upscaled_x0; + const int dst_width = upscaled_x1 - upscaled_x0; + + const int pad_left = (j == 0); + const int pad_right = (j == cm->tiles.cols - 1); + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride, + dst_ptr, rows, dst_width, dst_stride, + x_step_qn, x0_qn, pad_left, pad_right, + cm->seq_params.bit_depth); + else + upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, + rows, dst_width, dst_stride, x_step_qn, x0_qn, + pad_left, pad_right); +#else + upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows, + dst_width, dst_stride, x_step_qn, x0_qn, pad_left, + pad_right); +#endif + // Update the fractional pixel offset to prepare for the next tile column. + x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS); + } +} + +void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm, + const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + const int num_planes = av1_num_planes(cm); + for (int i = 0; i < num_planes; ++i) { + const int is_uv = (i > 0); + av1_upscale_normative_rows(cm, src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], i, + src->crop_heights[is_uv]); + } + + aom_extend_frame_borders(dst, num_planes); +} + +YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *unscaled, + YV12_BUFFER_CONFIG *scaled) { + const int num_planes = av1_num_planes(cm); + if (cm->width != unscaled->y_crop_width || + cm->height != unscaled->y_crop_height) { + av1_resize_and_extend_frame(unscaled, scaled, (int)cm->seq_params.bit_depth, + num_planes); + return scaled; + } else { + return unscaled; + } +} + +// Calculates the scaled dimension given the original dimension and the scale +// denominator. +static void calculate_scaled_size_helper(int *dim, int denom) { + if (denom != SCALE_NUMERATOR) { + // We need to ensure the constraint in "Appendix A" of the spec: + // * FrameWidth is greater than or equal to 16 + // * FrameHeight is greater than or equal to 16 + // For this, we clamp the downscaled dimension to at least 16. One + // exception: if original dimension itself was < 16, then we keep the + // downscaled dimension to be same as the original, to ensure that resizing + // is valid. 
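+    // For example, *dim == 1920 with denom == 12 becomes
+    // (1920 * 8 + 6) / 12 = 1280.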
+    const int min_dim = AOMMIN(16, *dim);
+    // Use this version if we need *dim to be even
+    // *width = (*width * SCALE_NUMERATOR + denom) / (2 * denom);
+    // *width <<= 1;
+    *dim = (*dim * SCALE_NUMERATOR + denom / 2) / (denom);
+    *dim = AOMMAX(*dim, min_dim);
+  }
+}
+
+void av1_calculate_scaled_size(int *width, int *height, int resize_denom) {
+  calculate_scaled_size_helper(width, resize_denom);
+  calculate_scaled_size_helper(height, resize_denom);
+}
+
+void av1_calculate_scaled_superres_size(int *width, int *height,
+                                        int superres_denom) {
+  (void)height;
+  calculate_scaled_size_helper(width, superres_denom);
+}
+
+void av1_calculate_unscaled_superres_size(int *width, int *height, int denom) {
+  if (denom != SCALE_NUMERATOR) {
+    // Note: av1_calculate_scaled_superres_size() rounds *up* after division
+    // when the resulting dimensions are odd. So here, we round *down*.
+    *width = *width * denom / SCALE_NUMERATOR;
+    (void)height;
+  }
+}
+
+// Copy only the config data from 'src' to 'dst'.
+static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src,
+                               YV12_BUFFER_CONFIG *const dst) {
+  dst->bit_depth = src->bit_depth;
+  dst->color_primaries = src->color_primaries;
+  dst->transfer_characteristics = src->transfer_characteristics;
+  dst->matrix_coefficients = src->matrix_coefficients;
+  dst->monochrome = src->monochrome;
+  dst->chroma_sample_position = src->chroma_sample_position;
+  dst->color_range = src->color_range;
+}
+
+// TODO(afergs): Look for in-place upscaling
+// TODO(afergs): aom_ vs av1_ functions? Which can I use?
+// Upscale decoded image.
+void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
+  const int num_planes = av1_num_planes(cm);
+  if (!av1_superres_scaled(cm)) return;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  const int byte_alignment = cm->features.byte_alignment;
+
+  YV12_BUFFER_CONFIG copy_buffer;
+  memset(&copy_buffer, 0, sizeof(copy_buffer));
+
+  YV12_BUFFER_CONFIG *const frame_to_show = &cm->cur_frame->buf;
+
+  const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3);
+  if (aom_alloc_frame_buffer(
+          &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          AOM_BORDER_IN_PIXELS, byte_alignment))
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate copy buffer for superres upscaling");
+
+  // Copy function assumes the frames are the same size.
+  // Note that it does not copy YV12_BUFFER_CONFIG config data.
+  aom_yv12_copy_frame(frame_to_show, &copy_buffer, num_planes);
+
+  assert(copy_buffer.y_crop_width == aligned_width);
+  assert(copy_buffer.y_crop_height == cm->height);
+
+  // Realloc the current frame buffer at a higher resolution in place.
+  if (pool != NULL) {
+    // Use callbacks if on the decoder.
+    aom_codec_frame_buffer_t *fb = &cm->cur_frame->raw_frame_buffer;
+    aom_release_frame_buffer_cb_fn_t release_fb_cb = pool->release_fb_cb;
+    aom_get_frame_buffer_cb_fn_t cb = pool->get_fb_cb;
+    void *cb_priv = pool->cb_priv;
+
+    lock_buffer_pool(pool);
+    // Realloc with callback does not release the frame buffer - release first.
+    if (release_fb_cb(cb_priv, fb)) {
+      unlock_buffer_pool(pool);
+      aom_internal_error(
+          &cm->error, AOM_CODEC_MEM_ERROR,
+          "Failed to free current frame buffer before superres upscaling");
+    }
+    // aom_realloc_frame_buffer() leaves config data for frame_to_show intact
+    if (aom_realloc_frame_buffer(
+            frame_to_show, cm->superres_upscaled_width,
+            cm->superres_upscaled_height, seq_params->subsampling_x,
+            seq_params->subsampling_y, seq_params->use_highbitdepth,
+            AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv)) {
+      unlock_buffer_pool(pool);
+      aom_internal_error(
+          &cm->error, AOM_CODEC_MEM_ERROR,
+          "Failed to allocate current frame buffer for superres upscaling");
+    }
+    unlock_buffer_pool(pool);
+  } else {
+    // Make a copy of the config data for frame_to_show in copy_buffer
+    copy_buffer_config(frame_to_show, &copy_buffer);
+
+    // Don't use callbacks on the encoder.
+    // aom_alloc_frame_buffer() clears the config data for frame_to_show
+    if (aom_alloc_frame_buffer(
+            frame_to_show, cm->superres_upscaled_width,
+            cm->superres_upscaled_height, seq_params->subsampling_x,
+            seq_params->subsampling_y, seq_params->use_highbitdepth,
+            AOM_BORDER_IN_PIXELS, byte_alignment))
+      aom_internal_error(
+          &cm->error, AOM_CODEC_MEM_ERROR,
+          "Failed to reallocate current frame buffer for superres upscaling");
+
+    // Restore config data back to frame_to_show
+    copy_buffer_config(&copy_buffer, frame_to_show);
+  }
+  // TODO(afergs): verify frame_to_show is correct after realloc
+  //               encoder:
+  //               decoder:
+
+  assert(frame_to_show->y_crop_width == cm->superres_upscaled_width);
+  assert(frame_to_show->y_crop_height == cm->superres_upscaled_height);
+
+  // Scale up and back into frame_to_show.
+  assert(frame_to_show->y_crop_width != cm->width);
+  av1_upscale_normative_and_extend_frame(cm, &copy_buffer, frame_to_show);
+
+  // Free the copy buffer
+  aom_free_frame_buffer(&copy_buffer);
+}
diff --git a/libs/libaom/src/av1/common/resize.h b/libs/libaom/src/av1/common/resize.h
new file mode 100644
index 000000000..8ee859e5c
--- /dev/null
+++ b/libs/libaom/src/av1/common/resize.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RESIZE_H_
+#define AOM_AV1_COMMON_RESIZE_H_
+
+#include <stdio.h>
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_resize_plane(const uint8_t *const input, int height, int width,
+                      int in_stride, uint8_t *output, int height2, int width2,
+                      int out_stride);
+void av1_upscale_plane_double_prec(const double *const input, int height,
+                                   int width, int in_stride, double *output,
+                                   int height2, int width2, int out_stride);
+void av1_resize_frame420(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width, uint8_t *oy,
+                         int oy_stride, uint8_t *ou, uint8_t *ov,
+                         int ouv_stride, int oheight, int owidth);
+void av1_resize_frame422(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width, uint8_t *oy,
+                         int oy_stride, uint8_t *ou, uint8_t *ov,
+                         int ouv_stride, int oheight, int owidth);
+void av1_resize_frame444(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width, uint8_t *oy,
+                         int oy_stride, uint8_t *ou, uint8_t *ov,
+                         int ouv_stride, int oheight, int owidth);
+
+void av1_highbd_resize_plane(const uint8_t *const input, int height, int width,
+                             int in_stride, uint8_t *output, int height2,
+                             int width2, int out_stride, int bd);
+void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride, uint8_t *ou,
+                                uint8_t *ov, int ouv_stride, int oheight,
+                                int owidth, int bd);
+void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride, uint8_t *ou,
+                                uint8_t *ov, int ouv_stride, int oheight,
+                                int owidth, int bd);
+void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride, uint8_t *ou,
+                                uint8_t *ov, int ouv_stride, int oheight,
+                                int owidth, int bd);
+void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                                 YV12_BUFFER_CONFIG *dst, int bd,
+                                 const int num_planes);
+
+void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
+                                int src_stride, uint8_t *dst, int dst_stride,
+                                int plane, int rows);
+void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm,
+                                            const YV12_BUFFER_CONFIG *src,
+                                            YV12_BUFFER_CONFIG *dst);
+
+YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
+                                          YV12_BUFFER_CONFIG *unscaled,
+                                          YV12_BUFFER_CONFIG *scaled);
+
+// Calculates the scaled dimensions from the given original dimensions and the
+// resize scale denominator.
+void av1_calculate_scaled_size(int *width, int *height, int resize_denom);
+
+// Similar to above, but calculates scaled dimensions after superres from the
+// given original dimensions and superres scale denominator.
+void av1_calculate_scaled_superres_size(int *width, int *height,
+                                        int superres_denom);
+
+// Inverse of av1_calculate_scaled_superres_size() above: calculates the
+// original dimensions from the given scaled dimensions and the scale
+// denominator.
+void av1_calculate_unscaled_superres_size(int *width, int *height, int denom);
+
+void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool);
+
+// Returns 1 if a superres upscaled frame is scaled and 0 otherwise.
+static INLINE int av1_superres_scaled(const AV1_COMMON *cm) {
+  // Note: for some corner cases (e.g. cm->width of 1), there may be no
+  // scaling required even though
+  // cm->superres_scale_denominator != SCALE_NUMERATOR. So, the following
+  // check is more accurate.
+  return !(cm->width == cm->superres_upscaled_width);
+}
+
+#define UPSCALE_NORMATIVE_TAPS 8
+extern const int16_t av1_resize_filter_normative[1 << RS_SUBPEL_BITS]
+                                                [UPSCALE_NORMATIVE_TAPS];
+
+int32_t av1_get_upscale_convolve_step(int in_length, int out_length);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_RESIZE_H_
diff --git a/libs/libaom/src/av1/common/restoration.c b/libs/libaom/src/av1/common/restoration.c
new file mode 100644
index 000000000..a0f37ad63
--- /dev/null
+++ b/libs/libaom/src/av1/common/restoration.c
@@ -0,0 +1,1566 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ */
+
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/resize.h"
+#include "av1/common/restoration.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+
+#include "aom_ports/mem.h"
+
+// The 's' values are calculated based on original 'r' and 'e' values in the
+// spec using GenSgrprojVtable().
+// Note: Setting r = 0 skips the filter; the corresponding s value is -1
+// (invalid).
+const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
+  { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
+  { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
+  { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
+  { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
+  { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
+  { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
+  { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
+  { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
+};
+
+AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
+  AV1PixelRect rect;
+
+  int ss_x = is_uv && cm->seq_params.subsampling_x;
+  int ss_y = is_uv && cm->seq_params.subsampling_y;
+
+  rect.top = 0;
+  rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
+  rect.left = 0;
+  rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
+  return rect;
+}
+
+// Count horizontal or vertical units per tile (use a width or height for
+// tile_size, respectively). We basically want to divide the tile size by the
+// size of a restoration unit. Rather than rounding up unconditionally as you
+// might expect, we round to nearest, which models the way a right or bottom
+// restoration unit can extend to up to 150% its normal width or height. The
+// max with 1 is to deal with tiles that are smaller than half of a
+// restoration unit.
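+// For example, unit_size = 256 and tile_size = 600 gives
+// (600 + 128) / 256 = 2 units, the second of which covers the remaining
+// 600 - 256 = 344 pixels (~134% of the nominal unit size).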
+int av1_lr_count_units_in_tile(int unit_size, int tile_size) { + return AOMMAX((tile_size + (unit_size >> 1)) / unit_size, 1); +} + +void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi, + int is_uv) { + // We need to allocate enough space for restoration units to cover the + // largest tile. Without CONFIG_MAX_TILE, this is always the tile at the + // top-left and we can use av1_get_tile_rect(). With CONFIG_MAX_TILE, we have + // to do the computation ourselves, iterating over the tiles and keeping + // track of the largest width and height, then upscaling. + const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv); + const int max_tile_w = tile_rect.right - tile_rect.left; + const int max_tile_h = tile_rect.bottom - tile_rect.top; + + // To calculate hpertile and vpertile (horizontal and vertical units per + // tile), we basically want to divide the largest tile width or height by the + // size of a restoration unit. Rather than rounding up unconditionally as you + // might expect, we round to nearest, which models the way a right or bottom + // restoration unit can extend to up to 150% its normal width or height. The + // max with 1 is to deal with tiles that are smaller than half of a + // restoration unit. + const int unit_size = rsi->restoration_unit_size; + const int hpertile = av1_lr_count_units_in_tile(unit_size, max_tile_w); + const int vpertile = av1_lr_count_units_in_tile(unit_size, max_tile_h); + + rsi->units_per_tile = hpertile * vpertile; + rsi->horz_units_per_tile = hpertile; + rsi->vert_units_per_tile = vpertile; + + const int ntiles = 1; + const int nunits = ntiles * rsi->units_per_tile; + + aom_free(rsi->unit_info); + CHECK_MEM_ERROR(cm, rsi->unit_info, + (RestorationUnitInfo *)aom_memalign( + 16, sizeof(*rsi->unit_info) * nunits)); +} + +void av1_free_restoration_struct(RestorationInfo *rst_info) { + aom_free(rst_info->unit_info); + rst_info->unit_info = NULL; +} + +#if 0 +// Pair of values for each sgrproj parameter: +// Index 0 corresponds to r[0], e[0] +// Index 1 corresponds to r[1], e[1] +int sgrproj_mtable[SGRPROJ_PARAMS][2]; + +static void GenSgrprojVtable() { + for (int i = 0; i < SGRPROJ_PARAMS; ++i) { + const sgr_params_type *const params = &av1_sgr_params[i]; + for (int j = 0; j < 2; ++j) { + const int e = params->e[j]; + const int r = params->r[j]; + if (r == 0) { // filter is disabled + sgrproj_mtable[i][j] = -1; // mark invalid + } else { // filter is enabled + const int n = (2 * r + 1) * (2 * r + 1); + const int n2e = n * n * e; + assert(n2e != 0); + sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e); + } + } + } +} +#endif + +void av1_loop_restoration_precal() { +#if 0 + GenSgrprojVtable(); +#endif +} + +static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert) { + uint8_t *data_p; + int i; + for (i = 0; i < height; ++i) { + data_p = data + i * stride; + memset(data_p - border_horz, data_p[0], border_horz); + memset(data_p + width, data_p[width - 1], border_horz); + } + data_p = data - border_horz; + for (i = -border_vert; i < 0; ++i) { + memcpy(data_p + i * stride, data_p, width + 2 * border_horz); + } + for (i = height; i < height + border_vert; ++i) { + memcpy(data_p + i * stride, data_p + (height - 1) * stride, + width + 2 * border_horz); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void extend_frame_highbd(uint16_t *data, int width, int height, + int stride, int border_horz, int border_vert) { + uint16_t *data_p; + int i, j; + for (i = 0; i < 
height; ++i) { + data_p = data + i * stride; + for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0]; + for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1]; + } + data_p = data - border_horz; + for (i = -border_vert; i < 0; ++i) { + memcpy(data_p + i * stride, data_p, + (width + 2 * border_horz) * sizeof(uint16_t)); + } + for (i = height; i < height + border_vert; ++i) { + memcpy(data_p + i * stride, data_p + (height - 1) * stride, + (width + 2 * border_horz) * sizeof(uint16_t)); + } +} + +static void copy_tile_highbd(int width, int height, const uint16_t *src, + int src_stride, uint16_t *dst, int dst_stride) { + for (int i = 0; i < height; ++i) + memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst)); +} +#endif + +void av1_extend_frame(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert, int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride, + border_horz, border_vert); + return; + } +#endif + (void)highbd; + extend_frame_lowbd(data, width, height, stride, border_horz, border_vert); +} + +static void copy_tile_lowbd(int width, int height, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride) { + for (int i = 0; i < height; ++i) + memcpy(dst + i * dst_stride, src + i * src_stride, width); +} + +static void copy_tile(int width, int height, const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride, + CONVERT_TO_SHORTPTR(dst), dst_stride); + return; + } +#endif + (void)highbd; + copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride); +} + +#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d)) + +// With striped loop restoration, the filtering for each 64-pixel stripe gets +// most of its input from the output of CDEF (stored in data8), but we need to +// fill out a border of 3 pixels above/below the stripe according to the +// following +// rules: +// +// * At a frame boundary, we copy the outermost row of CDEF pixels three times. +// This extension is done by a call to av1_extend_frame() at the start of the +// loop restoration process, so the value of copy_above/copy_below doesn't +// strictly matter. However, by setting *copy_above = *copy_below = 1 whenever +// loop filtering across tiles is disabled, we can allow +// {setup,restore}_processing_stripe_boundary to assume that the top/bottom +// data has always been copied, simplifying the behaviour at the left and +// right edges of tiles. +// +// * If we're at a tile boundary and loop filtering across tiles is enabled, +// then there is a logical stripe which is 64 pixels high, but which is split +// into an 8px high and a 56px high stripe so that the processing (and +// coefficient set usage) can be aligned to tiles. +// In this case, we use the 3 rows of CDEF output across the boundary for +// context; this corresponds to leaving the frame buffer as-is. +// +// * If we're at a tile boundary and loop filtering across tiles is disabled, +// then we take the outermost row of CDEF pixels *within the current tile* +// and copy it three times. Thus we behave exactly as if the tile were a full +// frame. +// +// * Otherwise, we're at a stripe boundary within a tile. In that case, we +// take 2 rows of deblocked pixels and extend them to 3 rows of context. 
+// +// The distinction between the latter two cases is handled by the +// av1_loop_restoration_save_boundary_lines() function, so here we just need +// to decide if we're overwriting the above/below boundary pixels or not. +static void get_stripe_boundary_info(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, int ss_y, + int *copy_above, int *copy_below) { + *copy_above = 1; + *copy_below = 1; + + const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; + const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y; + + const int first_stripe_in_tile = (limits->v_start == tile_rect->top); + const int this_stripe_height = + full_stripe_height - (first_stripe_in_tile ? runit_offset : 0); + const int last_stripe_in_tile = + (limits->v_start + this_stripe_height >= tile_rect->bottom); + + if (first_stripe_in_tile) *copy_above = 0; + if (last_stripe_in_tile) *copy_below = 0; +} + +// Overwrite the border pixels around a processing stripe so that the conditions +// listed above get_stripe_boundary_info() are preserved. +// We save the pixels which get overwritten into a temporary buffer, so that +// they can be restored by restore_processing_stripe_boundary() after we've +// processed the stripe. +// +// limits gives the rectangular limits of the remaining stripes for the current +// restoration unit. rsb is the stored stripe boundaries (taken from either +// deblock or CDEF output as necessary). +// +// tile_rect is the limits of the current tile and tile_stripe0 is the index of +// the first stripe in this tile (needed to convert the tile-relative stripe +// index we get from limits into something we can look up in rsb). +static void setup_processing_stripe_boundary( + const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb, + int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride, + RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) { + // Offsets within the line buffers. The buffer logically starts at column + // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ) + // has column x0 in the buffer. + const int buf_stride = rsb->stripe_boundary_stride; + const int buf_x0_off = limits->h_start; + const int line_width = + (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ; + const int line_size = line_width << use_highbd; + + const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ; + + // Replace RESTORATION_BORDER pixels above the top of the stripe + // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above + // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by + // duplicating the topmost of the 2 lines (see the AOMMAX call when + // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1). + // + // Special case: If we're at the top of a tile, which isn't on the topmost + // tile row, and we're allowed to loop filter across tiles, then we have a + // logical 64-pixel-high stripe which has been split into an 8-pixel high + // stripe and a 56-pixel high stripe (the current one). So, in this case, + // we want to leave the boundary alone! 
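+  // As a worked example of the sizes involved here (assuming a typical
+  // 64-pixel-wide stripe in high bit depth, use_highbd == 1):
+  // line_width = 64 + 2 * RESTORATION_EXTRA_HORZ = 72 pixels, so each saved
+  // or replaced border row below copies line_size = 72 << 1 = 144 bytes.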
+ if (!opt) { + if (copy_above) { + uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; + + for (int i = -RESTORATION_BORDER; i < 0; ++i) { + const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0); + const int buf_off = buf_x0_off + buf_row * buf_stride; + const uint8_t *buf = + rsb->stripe_boundary_above + (buf_off << use_highbd); + uint8_t *dst8 = data8_tl + i * data_stride; + // Save old pixels, then replace with data from stripe_boundary_above + memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER], + REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), buf, line_size); + } + } + + // Replace RESTORATION_BORDER pixels below the bottom of the stripe. + // The second buffer row is repeated, so src_row gets the values 0, 1, 1 + // for i = 0, 1, 2. + if (copy_below) { + const int stripe_end = limits->v_start + h; + uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride; + + for (int i = 0; i < RESTORATION_BORDER; ++i) { + const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1); + const int buf_off = buf_x0_off + buf_row * buf_stride; + const uint8_t *src = + rsb->stripe_boundary_below + (buf_off << use_highbd); + + uint8_t *dst8 = data8_bl + i * data_stride; + // Save old pixels, then replace with data from stripe_boundary_below + memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), src, line_size); + } + } + } else { + if (copy_above) { + uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; + + // Only save and overwrite i=-RESTORATION_BORDER line. + uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride; + // Save old pixels, then replace with data from stripe_boundary_above + memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), + REAL_PTR(use_highbd, + data8_tl + (-RESTORATION_BORDER + 1) * data_stride), + line_size); + } + + if (copy_below) { + const int stripe_end = limits->v_start + h; + uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride; + + // Only save and overwrite i=2 line. + uint8_t *dst8 = data8_bl + 2 * data_stride; + // Save old pixels, then replace with data from stripe_boundary_below + memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), + REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size); + } + } +} + +// This function restores the boundary lines modified by +// setup_processing_stripe_boundary. +// +// Note: We need to be careful when handling the corners of the processing +// unit, because (eg.) the top-left corner is considered to be part of +// both the left and top borders. This means that, depending on the +// loop_filter_across_tiles_enabled flag, the corner pixels might get +// overwritten twice, once as part of the "top" border and once as part +// of the "left" border (or similar for other corners). +// +// Everything works out fine as long as we make sure to reverse the order +// when restoring, ie. we need to restore the left/right borders followed +// by the top/bottom borders. 
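+//
+// As an illustration of why this works: suppose the setup pass saves the top
+// border first and the left border second. The left-border save then captures
+// the value that the top-border setup already wrote at the shared corner
+// pixel. Restoring in the reverse order (left/right first, top/bottom last)
+// writes the original pixel value back last, so the corner ends up correct.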
+static void restore_processing_stripe_boundary(
+    const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
+    int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
+    int copy_below, int opt) {
+  const int line_width =
+      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
+  const int line_size = line_width << use_highbd;
+
+  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
+
+  if (!opt) {
+    if (copy_above) {
+      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
+        uint8_t *dst8 = data8_tl + i * data_stride;
+        memcpy(REAL_PTR(use_highbd, dst8),
+               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
+      }
+    }
+
+    if (copy_below) {
+      const int stripe_bottom = limits->v_start + h;
+      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
+
+      for (int i = 0; i < RESTORATION_BORDER; ++i) {
+        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
+
+        uint8_t *dst8 = data8_bl + i * data_stride;
+        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
+      }
+    }
+  } else {
+    if (copy_above) {
+      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+
+      // Only restore i=-RESTORATION_BORDER line.
+      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
+      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
+    }
+
+    if (copy_below) {
+      const int stripe_bottom = limits->v_start + h;
+      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
+
+      // Only restore i=2 line.
+      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
+        uint8_t *dst8 = data8_bl + 2 * data_stride;
+        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
+      }
+    }
+  }
+}
+
+static void wiener_filter_stripe(const RestorationUnitInfo *rui,
+                                 int stripe_width, int stripe_height,
+                                 int procunit_width, const uint8_t *src,
+                                 int src_stride, uint8_t *dst, int dst_stride,
+                                 int32_t *tmpbuf, int bit_depth) {
+  (void)tmpbuf;
+  (void)bit_depth;
+  assert(bit_depth == 8);
+  const ConvolveParams conv_params = get_conv_params_wiener(8);
+
+  for (int j = 0; j < stripe_width; j += procunit_width) {
+    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
+    const uint8_t *src_p = src + j;
+    uint8_t *dst_p = dst + j;
+    av1_wiener_convolve_add_src(
+        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
+        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
+  }
+}
+
+/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
+   over the input. The window is of size (2r + 1)x(2r + 1), and we
+   specialize to r = 1 and r = 2; boxsum() asserts if any other radius
+   is requested.
+
+   Each loop follows the same format: We keep a window's worth of input
+   in individual variables and select data out of that as appropriate.
+*/
+static void boxsum1(int32_t *src, int width, int height, int src_stride,
+                    int sqr, int32_t *dst, int dst_stride) {
+  int i, j, a, b, c;
+  assert(width > 2 * SGRPROJ_BORDER_HORZ);
+  assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+  // Vertical sum over 3-pixel regions, from src into dst.
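+  // As an illustration of the windowing below: for a column with input rows
+  // s0, s1, s2, ..., the code writes dst[0] = s0 + s1 (the window is clipped
+  // at the top edge), dst[1] = s0 + s1 + s2, dst[2] = s1 + s2 + s3, and so
+  // on, with the final two rows clipped symmetrically at the bottom edge.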
+  if (!sqr) {
+    for (j = 0; j < width; ++j) {
+      a = src[j];
+      b = src[src_stride + j];
+      c = src[2 * src_stride + j];
+
+      dst[j] = a + b;
+      for (i = 1; i < height - 2; ++i) {
+        // Loop invariant: At the start of each iteration,
+        // a = src[(i - 1) * src_stride + j]
+        // b = src[(i    ) * src_stride + j]
+        // c = src[(i + 1) * src_stride + j]
+        dst[i * dst_stride + j] = a + b + c;
+        a = b;
+        b = c;
+        c = src[(i + 2) * src_stride + j];
+      }
+      dst[i * dst_stride + j] = a + b + c;
+      dst[(i + 1) * dst_stride + j] = b + c;
+    }
+  } else {
+    for (j = 0; j < width; ++j) {
+      a = src[j] * src[j];
+      b = src[src_stride + j] * src[src_stride + j];
+      c = src[2 * src_stride + j] * src[2 * src_stride + j];
+
+      dst[j] = a + b;
+      for (i = 1; i < height - 2; ++i) {
+        dst[i * dst_stride + j] = a + b + c;
+        a = b;
+        b = c;
+        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
+      }
+      dst[i * dst_stride + j] = a + b + c;
+      dst[(i + 1) * dst_stride + j] = b + c;
+    }
+  }
+
+  // Horizontal sum over 3-pixel regions of dst
+  for (i = 0; i < height; ++i) {
+    a = dst[i * dst_stride];
+    b = dst[i * dst_stride + 1];
+    c = dst[i * dst_stride + 2];
+
+    dst[i * dst_stride] = a + b;
+    for (j = 1; j < width - 2; ++j) {
+      // Loop invariant: At the start of each iteration,
+      // a = dst[i * dst_stride + (j - 1)]
+      // b = dst[i * dst_stride + (j    )]
+      // c = dst[i * dst_stride + (j + 1)]
+      dst[i * dst_stride + j] = a + b + c;
+      a = b;
+      b = c;
+      c = dst[i * dst_stride + (j + 2)];
+    }
+    dst[i * dst_stride + j] = a + b + c;
+    dst[i * dst_stride + (j + 1)] = b + c;
+  }
+}
+
+static void boxsum2(int32_t *src, int width, int height, int src_stride,
+                    int sqr, int32_t *dst, int dst_stride) {
+  int i, j, a, b, c, d, e;
+  assert(width > 2 * SGRPROJ_BORDER_HORZ);
+  assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+  // Vertical sum over 5-pixel regions, from src into dst.
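+  // The edge handling mirrors boxsum1, but with a 5-tap window: for input
+  // rows s0, s1, ... the first two outputs are the clipped sums
+  // dst[0] = s0 + s1 + s2 and dst[1] = s0 + s1 + s2 + s3, and the last two
+  // rows are clipped symmetrically at the bottom edge.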
+  if (!sqr) {
+    for (j = 0; j < width; ++j) {
+      a = src[j];
+      b = src[src_stride + j];
+      c = src[2 * src_stride + j];
+      d = src[3 * src_stride + j];
+      e = src[4 * src_stride + j];
+
+      dst[j] = a + b + c;
+      dst[dst_stride + j] = a + b + c + d;
+      for (i = 2; i < height - 3; ++i) {
+        // Loop invariant: At the start of each iteration,
+        // a = src[(i - 2) * src_stride + j]
+        // b = src[(i - 1) * src_stride + j]
+        // c = src[(i    ) * src_stride + j]
+        // d = src[(i + 1) * src_stride + j]
+        // e = src[(i + 2) * src_stride + j]
+        dst[i * dst_stride + j] = a + b + c + d + e;
+        a = b;
+        b = c;
+        c = d;
+        d = e;
+        e = src[(i + 3) * src_stride + j];
+      }
+      dst[i * dst_stride + j] = a + b + c + d + e;
+      dst[(i + 1) * dst_stride + j] = b + c + d + e;
+      dst[(i + 2) * dst_stride + j] = c + d + e;
+    }
+  } else {
+    for (j = 0; j < width; ++j) {
+      a = src[j] * src[j];
+      b = src[src_stride + j] * src[src_stride + j];
+      c = src[2 * src_stride + j] * src[2 * src_stride + j];
+      d = src[3 * src_stride + j] * src[3 * src_stride + j];
+      e = src[4 * src_stride + j] * src[4 * src_stride + j];
+
+      dst[j] = a + b + c;
+      dst[dst_stride + j] = a + b + c + d;
+      for (i = 2; i < height - 3; ++i) {
+        dst[i * dst_stride + j] = a + b + c + d + e;
+        a = b;
+        b = c;
+        c = d;
+        d = e;
+        e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
+      }
+      dst[i * dst_stride + j] = a + b + c + d + e;
+      dst[(i + 1) * dst_stride + j] = b + c + d + e;
+      dst[(i + 2) * dst_stride + j] = c + d + e;
+    }
+  }
+
+  // Horizontal sum over 5-pixel regions of dst
+  for (i = 0; i < height; ++i) {
+    a = dst[i * dst_stride];
+    b = dst[i * dst_stride + 1];
+    c = dst[i * dst_stride + 2];
+    d = dst[i * dst_stride + 3];
+    e = dst[i * dst_stride + 4];
+
+    dst[i * dst_stride] = a + b + c;
+    dst[i * dst_stride + 1] = a + b + c + d;
+    for (j = 2; j < width - 3; ++j) {
+      // Loop invariant: At the start of each iteration,
+      // a = dst[i * dst_stride + (j - 2)]
+      // b = dst[i * dst_stride + (j - 1)]
+      // c = dst[i * dst_stride + (j    )]
+      // d = dst[i * dst_stride + (j + 1)]
+      // e = dst[i * dst_stride + (j + 2)]
+      dst[i * dst_stride + j] = a + b + c + d + e;
+      a = b;
+      b = c;
+      c = d;
+      d = e;
+      e = dst[i * dst_stride + (j + 3)];
+    }
+    dst[i * dst_stride + j] = a + b + c + d + e;
+    dst[i * dst_stride + (j + 1)] = b + c + d + e;
+    dst[i * dst_stride + (j + 2)] = c + d + e;
+  }
+}
+
+static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
+                   int sqr, int32_t *dst, int dst_stride) {
+  if (r == 1)
+    boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
+  else if (r == 2)
+    boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
+  else
+    assert(0 && "Invalid value of r in self-guided filter");
+}
+
+void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
+  if (params->r[0] == 0) {
+    xq[0] = 0;
+    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
+  } else if (params->r[1] == 0) {
+    xq[0] = xqd[0];
+    xq[1] = 0;
+  } else {
+    xq[0] = xqd[0];
+    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
+  }
+}
+
+const int32_t av1_x_by_xplus1[256] = {
+  // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
+  // instead of 0. 
See comments in selfguided_restoration_internal() for why + 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, + 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, + 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, + 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, + 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, + 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 256, +}; + +const int32_t av1_one_by_x[MAX_NELEM] = { + 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315, + 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, +}; + +static void calculate_intermediate_result(int32_t *dgd, int width, int height, + int dgd_stride, int bit_depth, + int sgr_params_idx, int radius_idx, + int pass, int32_t *A, int32_t *B) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes, for consistency + // with the SIMD version of this function. + int buf_stride = ((width_ext + 3) & ~3) + 16; + const int step = pass == 0 ? 1 : 2; + int i, j; + + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); + + boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, + width_ext, height_ext, dgd_stride, r, 0, B, buf_stride); + boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, + width_ext, height_ext, dgd_stride, r, 1, A, buf_stride); + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie, + // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[]. + for (i = -1; i < height + 1; i += step) { + for (j = -1; j < width + 1; ++j) { + const int k = i * buf_stride + j; + const int n = (2 * r + 1) * (2 * r + 1); + + // a < 2^16 * n < 2^22 regardless of bit depth + uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8)); + // b < 2^8 * n < 2^14 regardless of bit depth + uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8); + + // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28, + // and p itself satisfies p < 2^14 * n^2 < 2^26. 
+      // This bound on p is due to:
+      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
+      //
+      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
+      // This is an artefact of rounding, and can only happen if all pixels
+      // are (almost) identical, so in this case we saturate to p=0.
+      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
+
+      const uint32_t s = params->s[radius_idx];
+
+      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
+      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
+      // (this holds even after accounting for the rounding in s)
+      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
+
+      // Note: We have to be quite careful about the value of A[k].
+      // This is used as a blend factor between individual pixel values and the
+      // local mean. So it logically has a range of [0, 256], including both
+      // endpoints.
+      //
+      // This is a pain for hardware, as we'd like something which can be stored
+      // in exactly 8 bits.
+      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
+      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
+      // slightly above 2^(8 + bit depth), due to rounding in the value of
+      // av1_one_by_x[25-1].
+      //
+      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
+      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
+      // overflow), without significantly affecting the final result: z == 0
+      // implies that the image is essentially "flat", so the local mean and
+      // individual pixel values are very similar.
+      //
+      // Note that saturating on the other side, ie. requiring A[k] <= 255,
+      // would be a bad idea, as that corresponds to the case where the image
+      // is very variable, when we want to preserve the local pixel value as
+      // much as possible.
+      A[k] = av1_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
+
+      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
+      // av1_one_by_x[n - 1] = round(2^12 / n)
+      // => the product here is < 2^(20 + bit_depth) <= 2^32,
+      // and B[k] is set to a value < 2^(8 + bit depth)
+      // This holds even with the rounding in av1_one_by_x and in the overall
+      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
+      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
+                                             (uint32_t)B[k] *
+                                             (uint32_t)av1_one_by_x[n - 1],
+                                         SGRPROJ_RECIP_BITS);
+    }
+  }
+}
+
+static void selfguided_restoration_fast_internal(
+    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
+    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  // Adjusting the stride of A and B here appears to avoid bad cache effects,
+  // leading to a significant speed improvement.
+  // We also align the stride to a multiple of 16 bytes, for consistency
+  // with the SIMD version of this function.
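+  // For example, for a full 64-pixel-wide processing unit: width_ext =
+  // 64 + 2 * SGRPROJ_BORDER_HORZ = 70, which rounds up to 72 int32_t elements
+  // (a multiple of 16 bytes), and the extra 16 elements then give
+  // buf_stride = 88.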
+ int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; + int i, j; + calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth, + sgr_params_idx, radius_idx, 1, A, B); + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + + // Use the A[] and B[] arrays to calculate the filtered image + (void)r; + assert(r == 2); + for (i = 0; i < height; ++i) { + if (!(i & 1)) { // even row + for (j = 0; j < width; ++j) { + const int k = i * buf_stride + j; + const int l = i * dgd_stride + j; + const int m = i * dst_stride + j; + const int nb = 5; + const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 + + (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + + A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * + 5; + const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 + + (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + + B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * + 5; + const int32_t v = a * dgd[l] + b; + dst[m] = + ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } else { // odd row + for (j = 0; j < width; ++j) { + const int k = i * buf_stride + j; + const int l = i * dgd_stride + j; + const int m = i * dst_stride + j; + const int nb = 4; + const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5; + const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5; + const int32_t v = a * dgd[l] + b; + dst[m] = + ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } + } +} + +static void selfguided_restoration_internal(int32_t *dgd, int width, int height, + int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, + int sgr_params_idx, + int radius_idx) { + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes, for consistency + // with the SIMD version of this function. 
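+  // A note on the nb values used in this function and in the fast variant
+  // above: each kernel's weights sum to a power of two, 2^nb, which the
+  // final ROUND_POWER_OF_TWO accounts for. Below, the 3x3 weights are 4
+  // (center and edge taps) and 3 (corner taps), so 5 * 4 + 4 * 3 = 32 = 2^5
+  // and nb = 5. In the fast variant, the even-row kernel sums to
+  // 2 * 6 + 4 * 5 = 32 (nb = 5) and the odd-row kernel to 6 + 2 * 5 = 16 =
+  // 2^4 (nb = 4).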
+ int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; + int i, j; + calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth, + sgr_params_idx, radius_idx, 0, A, B); + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + + // Use the A[] and B[] arrays to calculate the filtered image + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int k = i * buf_stride + j; + const int l = i * dgd_stride + j; + const int m = i * dst_stride + j; + const int nb = 5; + const int32_t a = + (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) * + 4 + + (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + + A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * + 3; + const int32_t b = + (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) * + 4 + + (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + + B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * + 3; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } +} + +int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, + int dgd_stride, int32_t *flt0, int32_t *flt1, + int flt_stride, int sgr_params_idx, + int bit_depth, int highbd) { + int32_t dgd32_[RESTORATION_PROC_UNIT_PELS]; + const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ; + int32_t *dgd32 = + dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; + + if (highbd) { + const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8); + for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) { + for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) { + dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j]; + } + } + } else { + for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) { + for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) { + dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j]; + } + } + } + + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + // If params->r == 0 we skip the corresponding filter. We only allow one of + // the radii to be 0, as having both equal to 0 would be equivalent to + // skipping SGR entirely. 
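+  // For example, a parameter set with r = {2, 1} runs both passes below (the
+  // fast r[0] pass writes flt0, the full r[1] pass writes flt1), while a set
+  // with r = {0, 1} produces only flt1.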
+ assert(!(params->r[0] == 0 && params->r[1] == 0)); + + if (params->r[0] > 0) + selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride, + flt0, flt_stride, bit_depth, + sgr_params_idx, 0); + if (params->r[1] > 0) + selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1, + flt_stride, bit_depth, sgr_params_idx, 1); + return 0; +} + +void av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + + const int ret = av1_selfguided_restoration_c( + dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); + (void)ret; + assert(!ret); + const sgr_params_type *const params = &av1_sgr_params[eps]; + int xq[2]; + av1_decode_xq(xqd, xq, params); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int k = i * width + j; + uint8_t *dst8ij = dst8 + i * dst_stride + j; + const uint8_t *dat8ij = dat8 + i * stride + j; + + const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij; + const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS; + int32_t v = u << SGRPROJ_PRJ_BITS; + // If params->r == 0 then we skipped the filtering in + // av1_selfguided_restoration_c, i.e. flt[k] == u + if (params->r[0] > 0) v += xq[0] * (flt0[k] - u); + if (params->r[1] > 0) v += xq[1] * (flt1[k] - u); + const int16_t w = + (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + + const uint16_t out = clip_pixel_highbd(w, bit_depth); + if (highbd) + *CONVERT_TO_SHORTPTR(dst8ij) = out; + else + *dst8ij = (uint8_t)out; + } + } +} + +static void sgrproj_filter_stripe(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int32_t *tmpbuf, int bit_depth) { + (void)bit_depth; + assert(bit_depth == 8); + + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, stripe_width - j); + av1_apply_selfguided_restoration( + src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep, + rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, 0); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src8, + int src_stride, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth) { + (void)tmpbuf; + const ConvolveParams conv_params = get_conv_params_wiener(bit_depth); + + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15); + const uint8_t *src8_p = src8 + j; + uint8_t *dst8_p = dst8 + j; + av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride, + rui->wiener_info.hfilter, 16, + rui->wiener_info.vfilter, 16, w, + stripe_height, &conv_params, bit_depth); + } +} + +static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, + const uint8_t *src8, int src_stride, + uint8_t *dst8, int dst_stride, + int32_t *tmpbuf, int bit_depth) { + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, stripe_width - j); + av1_apply_selfguided_restoration( + src8 + j, w, stripe_height, src_stride, 
rui->sgrproj_info.ep,
+        rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
+                                  int stripe_width, int stripe_height,
+                                  int procunit_width, const uint8_t *src,
+                                  int src_stride, uint8_t *dst, int dst_stride,
+                                  int32_t *tmpbuf, int bit_depth);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#define NUM_STRIPE_FILTERS 4
+static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
+  wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
+  sgrproj_filter_stripe_highbd
+};
+#else
+#define NUM_STRIPE_FILTERS 2
+static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
+  wiener_filter_stripe, sgrproj_filter_stripe
+};
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+// Filter one restoration unit
+void av1_loop_restoration_filter_unit(
+    const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
+    const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
+    const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
+    int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
+    int dst_stride, int32_t *tmpbuf, int optimized_lr) {
+  RestorationType unit_rtype = rui->restoration_type;
+
+  int unit_h = limits->v_end - limits->v_start;
+  int unit_w = limits->h_end - limits->h_start;
+  uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;
+  uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
+
+  if (unit_rtype == RESTORE_NONE) {
+    copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
+    return;
+  }
+
+  const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
+  assert(filter_idx < NUM_STRIPE_FILTERS);
+  const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
+
+  const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
+
+  // Convolve the whole tile one stripe at a time
+  RestorationTileLimits remaining_stripes = *limits;
+  int i = 0;
+  while (i < unit_h) {
+    int copy_above, copy_below;
+    remaining_stripes.v_start = limits->v_start + i;
+
+    get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above,
+                             &copy_below);
+
+    const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+    const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
+
+    // Work out where this stripe's boundaries are within
+    // rsb->stripe_boundary_{above,below}
+    const int tile_stripe =
+        (remaining_stripes.v_start - tile_rect->top + runit_offset) /
+        full_stripe_height;
+    const int frame_stripe = tile_stripe0 + tile_stripe;
+    const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
+
+    // Calculate this stripe's height, based on two rules:
+    // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
+    // * We can't extend past the end of the current restoration unit
+    const int nominal_stripe_height =
+        full_stripe_height - ((tile_stripe == 0) ? 
runit_offset : 0); + const int h = AOMMIN(nominal_stripe_height, + remaining_stripes.v_end - remaining_stripes.v_start); + + setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd, + h, data8, stride, rlbs, copy_above, + copy_below, optimized_lr); + + stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride, + dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth); + + restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h, + data8, stride, copy_above, copy_below, + optimized_lr); + + i += h; + } +} + +static void filter_frame_on_unit(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, + int rest_unit_idx, void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv; + const RestorationInfo *rsi = ctxt->rsi; + + av1_loop_restoration_filter_unit( + limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, tile_rect, + ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, ctxt->bit_depth, + ctxt->data8, ctxt->data_stride, ctxt->dst8, ctxt->dst_stride, tmpbuf, + rsi->optimized_lr); +} + +void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, + YV12_BUFFER_CONFIG *frame, + AV1_COMMON *cm, int optimized_lr, + int num_planes) { + const SequenceHeader *const seq_params = &cm->seq_params; + const int bit_depth = seq_params->bit_depth; + const int highbd = seq_params->use_highbitdepth; + lr_ctxt->dst = &cm->rst_frame; + + const int frame_width = frame->crop_widths[0]; + const int frame_height = frame->crop_heights[0]; + if (aom_realloc_frame_buffer( + lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x, + seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, + cm->features.byte_alignment, NULL, NULL, NULL) < 0) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate restoration dst buffer"); + + lr_ctxt->on_rest_unit = filter_frame_on_unit; + lr_ctxt->frame = frame; + for (int plane = 0; plane < num_planes; ++plane) { + RestorationInfo *rsi = &cm->rst_info[plane]; + RestorationType rtype = rsi->frame_restoration_type; + rsi->optimized_lr = optimized_lr; + + if (rtype == RESTORE_NONE) { + continue; + } + + const int is_uv = plane > 0; + const int plane_width = frame->crop_widths[is_uv]; + const int plane_height = frame->crop_heights[is_uv]; + FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane]; + + av1_extend_frame(frame->buffers[plane], plane_width, plane_height, + frame->strides[is_uv], RESTORATION_BORDER, + RESTORATION_BORDER, highbd); + + lr_plane_ctxt->rsi = rsi; + lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x; + lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y; + lr_plane_ctxt->highbd = highbd; + lr_plane_ctxt->bit_depth = bit_depth; + lr_plane_ctxt->data8 = frame->buffers[plane]; + lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane]; + lr_plane_ctxt->data_stride = frame->strides[is_uv]; + lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv]; + lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv); + lr_plane_ctxt->tile_stripe0 = 0; + } +} + +void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt, + AV1_COMMON *cm, int num_planes) { + typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, + int vstart, int vend); + static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y, + aom_yv12_partial_coloc_copy_u, + aom_yv12_partial_coloc_copy_v }; + assert(num_planes <= 3); + for (int plane = 0; plane < 
num_planes; ++plane) {
+    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+    AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect;
+    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, tile_rect.left,
+                     tile_rect.right, tile_rect.top, tile_rect.bottom);
+  }
+}
+
+static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
+                                        int num_planes) {
+  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) {
+      continue;
+    }
+
+    av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
+                                   &ctxt[plane], &ctxt[plane].tile_rect,
+                                   cm->rst_tmpbuf, cm->rlbs);
+  }
+}
+
+void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
+                                       AV1_COMMON *cm, int optimized_lr,
+                                       void *lr_ctxt) {
+  assert(!cm->features.all_lossless);
+  const int num_planes = av1_num_planes(cm);
+
+  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
+
+  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
+                                         optimized_lr, num_planes);
+
+  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
+
+  av1_loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
+}
+
+void av1_foreach_rest_unit_in_row(
+    RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
+    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
+    int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
+    void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+    sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
+    struct AV1LrSyncData *const lr_sync) {
+  const int tile_w = tile_rect->right - tile_rect->left;
+  const int ext_size = unit_size * 3 / 2;
+  int x0 = 0, j = 0;
+  while (x0 < tile_w) {
+    int remaining_w = tile_w - x0;
+    int w = (remaining_w < ext_size) ? remaining_w : unit_size;
+
+    limits->h_start = tile_rect->left + x0;
+    limits->h_end = tile_rect->left + x0 + w;
+    assert(limits->h_end <= tile_rect->right);
+
+    const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j;
+
+    // No sync is needed for even numbered rows.
+    // For odd numbered rows, loop restoration of the current block requires
+    // the LR of the top-right and bottom-right blocks to be completed.
+
+    // top-right sync
+    on_sync_read(lr_sync, row_number, j, plane);
+    if ((row_number + 1) < vunits_per_tile)
+      // bottom-right sync
+      on_sync_read(lr_sync, row_number + 2, j, plane);
+
+    on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs);
+
+    on_sync_write(lr_sync, row_number, j, hunits_per_tile, plane);
+
+    x0 += w;
+    ++j;
+  }
+}
+
+void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
+  (void)lr_sync;
+  (void)r;
+  (void)c;
+  (void)plane;
+}
+
+void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
+                             const int sb_cols, int plane) {
+  (void)lr_sync;
+  (void)r;
+  (void)c;
+  (void)sb_cols;
+  (void)plane;
+}
+
+static void foreach_rest_unit_in_tile(
+    const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
+    int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
+    int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
+    int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
+  const int tile_h = tile_rect->bottom - tile_rect->top;
+  const int ext_size = unit_size * 3 / 2;
+
+  const int tile_idx = tile_col + tile_row * tile_cols;
+  const int unit_idx0 = tile_idx * units_per_tile;
+
+  int y0 = 0, i = 0;
+  while (y0 < tile_h) {
+    int remaining_h = tile_h - y0;
+    int h = (remaining_h < ext_size) ? 
remaining_h : unit_size;
+
+    RestorationTileLimits limits;
+    limits.v_start = tile_rect->top + y0;
+    limits.v_end = tile_rect->top + y0 + h;
+    assert(limits.v_end <= tile_rect->bottom);
+    // Offset the tile upwards to align with the restoration processing stripe
+    const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
+    limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
+    if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
+
+    av1_foreach_rest_unit_in_row(
+        &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0,
+        hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
+        av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);
+
+    y0 += h;
+    ++i;
+  }
+}
+
+void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
+                                    rest_unit_visitor_t on_rest_unit,
+                                    void *priv, AV1PixelRect *tile_rect,
+                                    int32_t *tmpbuf,
+                                    RestorationLineBuffers *rlbs) {
+  const int is_uv = plane > 0;
+  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+
+  const RestorationInfo *rsi = &cm->rst_info[plane];
+
+  foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
+                            rsi->horz_units_per_tile, rsi->vert_units_per_tile,
+                            rsi->units_per_tile, rsi->restoration_unit_size,
+                            ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs);
+}
+
+int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
+                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                       int *rcol0, int *rcol1, int *rrow0,
+                                       int *rrow1) {
+  assert(rcol0 && rcol1 && rrow0 && rrow1);
+
+  if (bsize != cm->seq_params.sb_size) return 0;
+  if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0;
+
+  assert(!cm->features.all_lossless);
+
+  const int is_uv = plane > 0;
+
+  const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
+  const int tile_w = tile_rect.right - tile_rect.left;
+  const int tile_h = tile_rect.bottom - tile_rect.top;
+
+  const int mi_top = 0;
+  const int mi_left = 0;
+
+  // Compute the mi-unit corners of the superblock relative to the top-left of
+  // the tile
+  const int mi_rel_row0 = mi_row - mi_top;
+  const int mi_rel_col0 = mi_col - mi_left;
+  const int mi_rel_row1 = mi_rel_row0 + mi_size_high[bsize];
+  const int mi_rel_col1 = mi_rel_col0 + mi_size_wide[bsize];
+
+  const RestorationInfo *rsi = &cm->rst_info[plane];
+  const int size = rsi->restoration_unit_size;
+
+  // Calculate the number of restoration units in this tile (which might be
+  // strictly less than rsi->horz_units_per_tile and rsi->vert_units_per_tile)
+  const int horz_units = av1_lr_count_units_in_tile(size, tile_w);
+  const int vert_units = av1_lr_count_units_in_tile(size, tile_h);
+
+  // The size of an MI-unit on this plane of the image
+  const int ss_x = is_uv && cm->seq_params.subsampling_x;
+  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  const int mi_size_x = MI_SIZE >> ss_x;
+  const int mi_size_y = MI_SIZE >> ss_y;
+
+  // Write m for the relative mi column or row, D for the superres denominator
+  // and N for the superres numerator. If u is the upscaled pixel offset then
+  // we can write the downscaled pixel offset in two ways as:
+  //
+  //   MI_SIZE * m = (N / D) * u
+  //
+  // from which we get u = D * MI_SIZE * m / N
+  const int mi_to_num_x = av1_superres_scaled(cm)
+                              ? mi_size_x * cm->superres_scale_denominator
+                              : mi_size_x;
+  const int mi_to_num_y = mi_size_y;
+  const int denom_x = av1_superres_scaled(cm) ? 
size * SCALE_NUMERATOR : size;
+  const int denom_y = size;
+
+  const int rnd_x = denom_x - 1;
+  const int rnd_y = denom_y - 1;
+
+  // rcol0/rrow0 should be the first column/row of restoration units (relative
+  // to the top-left of the tile) that doesn't start to the left of/above
+  // mi_col/mi_row. For this calculation, we need to round up the division (if
+  // the sb starts at runit column 10.1, the first matching runit has column
+  // index 11)
+  *rcol0 = (mi_rel_col0 * mi_to_num_x + rnd_x) / denom_x;
+  *rrow0 = (mi_rel_row0 * mi_to_num_y + rnd_y) / denom_y;
+
+  // *rcol1/*rrow1 is the equivalent calculation, but for the superblock's
+  // bottom-right corner. If we're at the bottom or right of the tile, this
+  // restoration unit might not exist, in which case we'll clamp accordingly.
+  *rcol1 = AOMMIN((mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
+  *rrow1 = AOMMIN((mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
+
+  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
+}
+
+// Extend to left and right
+static void extend_lines(uint8_t *buf, int width, int height, int stride,
+                         int extend, int use_highbitdepth) {
+  for (int i = 0; i < height; ++i) {
+    if (use_highbitdepth) {
+      uint16_t *buf16 = (uint16_t *)buf;
+      aom_memset16(buf16 - extend, buf16[0], extend);
+      aom_memset16(buf16 + width, buf16[width - 1], extend);
+    } else {
+      memset(buf - extend, buf[0], extend);
+      memset(buf + width, buf[width - 1], extend);
+    }
+    buf += stride;
+  }
+}
+
+static void save_deblock_boundary_lines(
+    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
+    int stripe, int use_highbd, int is_above,
+    RestorationStripeBoundaries *boundaries) {
+  const int is_uv = plane > 0;
+  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
+  const int src_stride = frame->strides[is_uv] << use_highbd;
+  const uint8_t *src_rows = src_buf + row * src_stride;
+
+  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
+                               : boundaries->stripe_boundary_below;
+  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
+  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
+  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
+
+  // There is a rare case in which a processing stripe can end 1px above the
+  // crop border. In this case, we do want to use deblocked pixels from below
+  // the stripe (hence why we ended up in this function), but instead of
+  // fetching 2 "below" rows we need to fetch one and duplicate it. 
+ // This is equivalent to clamping the sample locations against the crop border + const int lines_to_save = + AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row); + assert(lines_to_save == 1 || lines_to_save == 2); + + int upscaled_width; + int line_bytes; + if (av1_superres_scaled(cm)) { + const int ss_x = is_uv && cm->seq_params.subsampling_x; + upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x; + line_bytes = upscaled_width << use_highbd; + if (use_highbd) + av1_upscale_normative_rows( + cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv], + CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride, + plane, lines_to_save); + else + av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows, + boundaries->stripe_boundary_stride, plane, + lines_to_save); + } else { + upscaled_width = frame->crop_widths[is_uv]; + line_bytes = upscaled_width << use_highbd; + for (int i = 0; i < lines_to_save; i++) { + memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride, + line_bytes); + } + } + // If we only saved one line, then copy it into the second line buffer + if (lines_to_save == 1) + memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes); + + extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride, + RESTORATION_EXTRA_HORZ, use_highbd); +} + +static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame, + const AV1_COMMON *cm, int plane, int row, + int stripe, int use_highbd, int is_above, + RestorationStripeBoundaries *boundaries) { + const int is_uv = plane > 0; + const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]); + const int src_stride = frame->strides[is_uv] << use_highbd; + const uint8_t *src_rows = src_buf + row * src_stride; + + uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above + : boundaries->stripe_boundary_below; + uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd); + const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd; + uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride; + const int src_width = frame->crop_widths[is_uv]; + + // At the point where this function is called, we've already applied + // superres. So we don't need to extend the lines here, we can just + // pull directly from the topmost row of the upscaled frame. + const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int upscaled_width = av1_superres_scaled(cm) + ? (cm->superres_upscaled_width + ss_x) >> ss_x + : src_width; + const int line_bytes = upscaled_width << use_highbd; + for (int i = 0; i < RESTORATION_CTX_VERT; i++) { + // Copy the line at 'row' into both context lines. This is because + // we want to (effectively) extend the outermost row of CDEF data + // from this tile to produce a border, rather than using deblocked + // pixels from the tile above/below. 
+    memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
+  }
+  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
+               RESTORATION_EXTRA_HORZ, use_highbd);
+}
+
+static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+                                         int use_highbd, int plane,
+                                         AV1_COMMON *cm, int after_cdef) {
+  const int is_uv = plane > 0;
+  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
+
+  // Get the tile rectangle, with height rounded up to the next multiple of 8
+  // luma pixels (only relevant for the bottom tile of the frame)
+  const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
+  const int stripe0 = 0;
+
+  RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;
+
+  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
+
+  int tile_stripe;
+  for (tile_stripe = 0;; ++tile_stripe) {
+    const int rel_y0 = AOMMAX(0, tile_stripe * stripe_height - stripe_off);
+    const int y0 = tile_rect.top + rel_y0;
+    if (y0 >= tile_rect.bottom) break;
+
+    const int rel_y1 = (tile_stripe + 1) * stripe_height - stripe_off;
+    const int y1 = AOMMIN(tile_rect.top + rel_y1, tile_rect.bottom);
+
+    const int frame_stripe = stripe0 + tile_stripe;
+
+    // We should only use CDEF pixels at the top and bottom of the frame as a
+    // whole; internal tile boundaries can use deblocked pixels from adjacent
+    // tiles for context.
+    const int use_deblock_above = (frame_stripe > 0);
+    const int use_deblock_below = (y1 < plane_height);
+
+    if (!after_cdef) {
+      // Save deblocked context where needed.
+      if (use_deblock_above) {
+        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
+                                    frame_stripe, use_highbd, 1, boundaries);
+      }
+      if (use_deblock_below) {
+        save_deblock_boundary_lines(frame, cm, plane, y1, frame_stripe,
+                                    use_highbd, 0, boundaries);
+      }
+    } else {
+      // Save CDEF context where needed. Note that we need to save the CDEF
+      // context for a particular boundary iff we *didn't* save deblocked
+      // context for that boundary.
+      //
+      // In addition, we need to save copies of the outermost line within
+      // the tile, rather than using data from outside the tile.
+      if (!use_deblock_above) {
+        save_cdef_boundary_lines(frame, cm, plane, y0, frame_stripe, use_highbd,
+                                 1, boundaries);
+      }
+      if (!use_deblock_below) {
+        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, frame_stripe,
+                                 use_highbd, 0, boundaries);
+      }
+    }
+  }
+}
+
+// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan lines
+// (RESTORATION_CTX_VERT each above and below) to be used as boundary context
+// in the loop restoration process. The lines are saved in the
+// RestorationStripeBoundaries of the corresponding cm->rst_info[plane].
+void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+                                              AV1_COMMON *cm, int after_cdef) {
+  const int num_planes = av1_num_planes(cm);
+  const int use_highbd = cm->seq_params.use_highbitdepth;
+  for (int p = 0; p < num_planes; ++p) {
+    save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
+  }
+}
diff --git a/libs/libaom/src/av1/common/restoration.h b/libs/libaom/src/av1/common/restoration.h
new file mode 100644
index 000000000..3b80dd5a9
--- /dev/null
+++ b/libs/libaom/src/av1/common/restoration.h
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RESTORATION_H_
+#define AOM_AV1_COMMON_RESTORATION_H_
+
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Border for Loop restoration buffer
+#define AOM_RESTORATION_FRAME_BORDER 32
+#define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
+#define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5))
+
+#define RESTORATION_PROC_UNIT_SIZE 64
+
+// Filter tile grid offset upwards compared to the superblock grid
+#define RESTORATION_UNIT_OFFSET 8
+
+#define SGRPROJ_BORDER_VERT 3  // Vertical border used for Sgr
+#define SGRPROJ_BORDER_HORZ 3  // Horizontal border used for Sgr
+
+#define WIENER_BORDER_VERT 2  // Vertical border used for Wiener
+#define WIENER_HALFWIN 3
+#define WIENER_BORDER_HORZ (WIENER_HALFWIN)  // Horizontal border for Wiener
+
+// RESTORATION_BORDER_VERT determines line buffer requirement for LR.
+// Should be set at the max of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT.
+// Note the line buffer needed is twice the value of this macro.
+#if SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
+#define RESTORATION_BORDER_VERT (SGRPROJ_BORDER_VERT)
+#else
+#define RESTORATION_BORDER_VERT (WIENER_BORDER_VERT)
+#endif  // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
+
+#if SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ
+#define RESTORATION_BORDER_HORZ (SGRPROJ_BORDER_HORZ)
+#else
+#define RESTORATION_BORDER_HORZ (WIENER_BORDER_HORZ)
+#endif  // SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ
+
+// How many border pixels do we need for each processing unit?
+#define RESTORATION_BORDER 3
+
+// How many rows of deblocked pixels do we save above/below each processing
+// stripe?
+#define RESTORATION_CTX_VERT 2
+
+// Additional pixels to the left and right in above/below buffers
+// It is RESTORATION_BORDER_HORZ rounded up to get nicer buffer alignment
+#define RESTORATION_EXTRA_HORZ 4
+
+// Pad up to 20 more (much less may be needed)
+#define RESTORATION_PADDING 20
+#define RESTORATION_PROC_UNIT_PELS                             \
+  ((RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_HORZ * 2 + \
+    RESTORATION_PADDING) *                                     \
+   (RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_VERT * 2 + \
+    RESTORATION_PADDING))
+
+#define RESTORATION_UNITSIZE_MAX 256
+#define RESTORATION_UNITPELS_HORZ_MAX \
+  (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16)
+#define RESTORATION_UNITPELS_VERT_MAX                                 \
+  ((RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \
+    RESTORATION_UNIT_OFFSET))
+#define RESTORATION_UNITPELS_MAX \
+  (RESTORATION_UNITPELS_HORZ_MAX * RESTORATION_UNITPELS_VERT_MAX)
+
+// Two 32-bit buffers needed for the restored versions from two filters
+// TODO(debargha, rupert): Refactor to not need the large tilesize to be stored
+// on the decoder side. 
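+// Working through the macros above: RESTORATION_UNITPELS_HORZ_MAX =
+// 256 * 3 / 2 + 2 * 3 + 16 = 406 and RESTORATION_UNITPELS_VERT_MAX =
+// 256 * 3 / 2 + 2 * 3 + 8 = 398, so the two int32_t filter buffers sized
+// below come to 406 * 398 * 2 * 4 = 1292704 bytes (roughly 1.2 MiB).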
+#define SGRPROJ_TMPBUF_SIZE (RESTORATION_UNITPELS_MAX * 2 * sizeof(int32_t))
+
+#define SGRPROJ_EXTBUF_SIZE (0)
+#define SGRPROJ_PARAMS_BITS 4
+#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
+
+// Precision bits for projection
+#define SGRPROJ_PRJ_BITS 7
+// Restoration precision bits generated higher than source before projection
+#define SGRPROJ_RST_BITS 4
+// Internal precision bits for core selfguided_restoration
+#define SGRPROJ_SGR_BITS 8
+#define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS)
+
+#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4)
+#define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1)
+#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
+#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
+
+#define SGRPROJ_PRJ_SUBEXP_K 4
+
+#define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS)
+
+#define MAX_RADIUS 2  // Only 1 or 2 allowed (see boxsum() in restoration.c)
+#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
+#define SGRPROJ_MTABLE_BITS 20
+#define SGRPROJ_RECIP_BITS 12
+
+#define WIENER_HALFWIN1 (WIENER_HALFWIN + 1)
+#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
+#define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN))
+#define WIENER_TMPBUF_SIZE (0)
+#define WIENER_EXTBUF_SIZE (0)
+
+// If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for
+// chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN.
+#define WIENER_WIN_CHROMA (WIENER_WIN - 2)
+#define WIENER_WIN_REDUCED (WIENER_WIN - 2)
+#define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA))
+
+#define WIENER_FILT_PREC_BITS 7
+#define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS)
+
+// Central values for the taps
+#define WIENER_FILT_TAP0_MIDV (3)
+#define WIENER_FILT_TAP1_MIDV (-7)
+#define WIENER_FILT_TAP2_MIDV (15)
+#define WIENER_FILT_TAP3_MIDV                                              \
+  (WIENER_FILT_STEP - 2 * (WIENER_FILT_TAP0_MIDV + WIENER_FILT_TAP1_MIDV + \
+                           WIENER_FILT_TAP2_MIDV))
+
+#define WIENER_FILT_TAP0_BITS 4
+#define WIENER_FILT_TAP1_BITS 5
+#define WIENER_FILT_TAP2_BITS 6
+
+#define WIENER_FILT_BITS \
+  ((WIENER_FILT_TAP0_BITS + WIENER_FILT_TAP1_BITS + WIENER_FILT_TAP2_BITS) * 2)
+
+#define WIENER_FILT_TAP0_MINV \
+  (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2)
+#define WIENER_FILT_TAP1_MINV \
+  (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2)
+#define WIENER_FILT_TAP2_MINV \
+  (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2)
+
+#define WIENER_FILT_TAP0_MAXV \
+  (WIENER_FILT_TAP0_MIDV - 1 + (1 << WIENER_FILT_TAP0_BITS) / 2)
+#define WIENER_FILT_TAP1_MAXV \
+  (WIENER_FILT_TAP1_MIDV - 1 + (1 << WIENER_FILT_TAP1_BITS) / 2)
+#define WIENER_FILT_TAP2_MAXV \
+  (WIENER_FILT_TAP2_MIDV - 1 + (1 << WIENER_FILT_TAP2_BITS) / 2)
+
+#define WIENER_FILT_TAP0_SUBEXP_K 1
+#define WIENER_FILT_TAP1_SUBEXP_K 2
+#define WIENER_FILT_TAP2_SUBEXP_K 3
+
+// Max of SGRPROJ_TMPBUF_SIZE, WIENER_TMPBUF_SIZE
+#define RESTORATION_TMPBUF_SIZE (SGRPROJ_TMPBUF_SIZE)
+
+// Max of SGRPROJ_EXTBUF_SIZE, WIENER_EXTBUF_SIZE
+#define RESTORATION_EXTBUF_SIZE (WIENER_EXTBUF_SIZE)
+
+// Check the assumptions of the existing code
+#if SUBPEL_TAPS != WIENER_WIN + 1
+#error "Wiener filter currently only works if SUBPEL_TAPS == WIENER_WIN + 1"
+#endif
+#if WIENER_FILT_PREC_BITS != 7
+#error "Wiener filter currently only works if WIENER_FILT_PREC_BITS == 7"
+#endif
+
+#define LR_TILE_ROW 0
+#define LR_TILE_COL 0
+#define LR_TILE_COLS 1
+
+typedef struct {
+  int r[2];  // radii
+  int s[2];  // sgr parameters for r[0] and r[1], based on 
GenSgrprojVtable() +} sgr_params_type; + +typedef struct { + RestorationType restoration_type; + WienerInfo wiener_info; + SgrprojInfo sgrproj_info; +} RestorationUnitInfo; + +// A restoration line buffer needs space for two lines plus a horizontal filter +// margin of RESTORATION_EXTRA_HORZ on each side. +#define RESTORATION_LINEBUFFER_WIDTH \ + (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_EXTRA_HORZ) + +// Similarly, the column buffers (used when we're at a vertical tile edge +// that we can't filter across) need space for one processing unit's worth +// of pixels, plus the top/bottom border width +#define RESTORATION_COLBUFFER_HEIGHT \ + (RESTORATION_PROC_UNIT_SIZE + 2 * RESTORATION_BORDER) + +typedef struct { + // Temporary buffers to save/restore 3 lines above/below the restoration + // stripe. + uint16_t tmp_save_above[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH]; + uint16_t tmp_save_below[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH]; +} RestorationLineBuffers; + +typedef struct { + uint8_t *stripe_boundary_above; + uint8_t *stripe_boundary_below; + int stripe_boundary_stride; + int stripe_boundary_size; +} RestorationStripeBoundaries; + +typedef struct { + RestorationType frame_restoration_type; + int restoration_unit_size; + + // Fields below here are allocated and initialised by + // av1_alloc_restoration_struct. (horz_)units_per_tile give the number of + // restoration units in (one row of) the largest tile in the frame. The data + // in unit_info is laid out with units_per_tile entries for each tile, which + // have stride horz_units_per_tile. + // + // Even if there are tiles of different sizes, the data in unit_info is laid + // out as if all tiles are of full size. + int units_per_tile; + int vert_units_per_tile, horz_units_per_tile; + RestorationUnitInfo *unit_info; + RestorationStripeBoundaries boundaries; + int optimized_lr; +} RestorationInfo; + +static INLINE void set_default_sgrproj(SgrprojInfo *sgrproj_info) { + sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2; + sgrproj_info->xqd[1] = (SGRPROJ_PRJ_MIN1 + SGRPROJ_PRJ_MAX1) / 2; +} + +static INLINE void set_default_wiener(WienerInfo *wiener_info) { + wiener_info->vfilter[0] = wiener_info->hfilter[0] = WIENER_FILT_TAP0_MIDV; + wiener_info->vfilter[1] = wiener_info->hfilter[1] = WIENER_FILT_TAP1_MIDV; + wiener_info->vfilter[2] = wiener_info->hfilter[2] = WIENER_FILT_TAP2_MIDV; + wiener_info->vfilter[WIENER_HALFWIN] = wiener_info->hfilter[WIENER_HALFWIN] = + -2 * + (WIENER_FILT_TAP2_MIDV + WIENER_FILT_TAP1_MIDV + WIENER_FILT_TAP0_MIDV); + wiener_info->vfilter[4] = wiener_info->hfilter[4] = WIENER_FILT_TAP2_MIDV; + wiener_info->vfilter[5] = wiener_info->hfilter[5] = WIENER_FILT_TAP1_MIDV; + wiener_info->vfilter[6] = wiener_info->hfilter[6] = WIENER_FILT_TAP0_MIDV; +} + +typedef struct { + int h_start, h_end, v_start, v_end; +} RestorationTileLimits; + +typedef void (*rest_unit_visitor_t)(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, + int rest_unit_idx, void *priv, + int32_t *tmpbuf, + RestorationLineBuffers *rlbs); + +typedef struct FilterFrameCtxt { + const RestorationInfo *rsi; + int tile_stripe0; + int ss_x, ss_y; + int highbd, bit_depth; + uint8_t *data8, *dst8; + int data_stride, dst_stride; + AV1PixelRect tile_rect; +} FilterFrameCtxt; + +typedef struct AV1LrStruct { + rest_unit_visitor_t on_rest_unit; + FilterFrameCtxt ctxt[MAX_MB_PLANE]; + YV12_BUFFER_CONFIG *frame; + YV12_BUFFER_CONFIG *dst; +} AV1LrStruct; + +extern const sgr_params_type 
av1_sgr_params[SGRPROJ_PARAMS];
+extern int sgrproj_mtable[SGRPROJ_PARAMS][2];
+extern const int32_t av1_x_by_xplus1[256];
+extern const int32_t av1_one_by_x[MAX_NELEM];
+
+void av1_alloc_restoration_struct(struct AV1Common *cm, RestorationInfo *rsi,
+                                  int is_uv);
+void av1_free_restoration_struct(RestorationInfo *rst_info);
+
+void av1_extend_frame(uint8_t *data, int width, int height, int stride,
+                      int border_horz, int border_vert, int highbd);
+void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params);
+
+// Filter a single loop restoration unit.
+//
+// limits gives the bounds of the unit. rui gives the mode to use for this unit
+// and its coefficients. If striped loop restoration is enabled, rsb contains
+// deblocked pixels to use for stripe boundaries; rlbs is just some space to
+// use as a scratch buffer. tile_rect gives the limits of the tile containing
+// this unit. tile_stripe0 is the index of the first stripe in this tile.
+//
+// ss_x and ss_y are flags which should be 1 if this is a plane with
+// horizontal/vertical subsampling, respectively. highbd is a flag which should
+// be 1 in high bit depth mode, in which case bit_depth is the bit depth.
+//
+// data8 is the frame data (pointing at the top-left corner of the frame, not
+// the restoration unit) and stride is its stride. dst8 is the buffer where the
+// results will be written and has stride dst_stride. Like data8, dst8 should
+// point at the top-left corner of the frame.
+//
+// Finally, tmpbuf is a scratch buffer used by the sgrproj filter which should
+// be at least SGRPROJ_TMPBUF_SIZE bytes.
+void av1_loop_restoration_filter_unit(
+    const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
+    const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
+    const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
+    int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
+    int dst_stride, int32_t *tmpbuf, int optimized_lr);
+
+void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
+                                       struct AV1Common *cm, int optimized_lr,
+                                       void *lr_ctxt);
+void av1_loop_restoration_precal();
+
+typedef void (*rest_tile_start_visitor_t)(int tile_row, int tile_col,
+                                          void *priv);
+struct AV1LrSyncData;
+
+typedef void (*sync_read_fn_t)(void *const lr_sync, int r, int c, int plane);
+
+typedef void (*sync_write_fn_t)(void *const lr_sync, int r, int c,
+                                const int sb_cols, int plane);
+
+// Call on_rest_unit for each loop restoration unit in the plane.
+void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
+                                    rest_unit_visitor_t on_rest_unit,
+                                    void *priv, AV1PixelRect *tile_rect,
+                                    int32_t *tmpbuf,
+                                    RestorationLineBuffers *rlbs);
+
+// Return 1 iff the block at mi_row, mi_col with size bsize is a
+// top-level superblock containing the top-left corner of at least one
+// loop restoration unit.
+//
+// If the block is a top-level superblock, the function writes to
+// *rcol0, *rcol1, *rrow0, *rrow1. The rectangle of restoration unit
+// indices given by [*rcol0, *rcol1) x [*rrow0, *rrow1) is relative
+// to the current tile.
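+//
+// A typical caller (a sketch only; rsi stands for the plane's
+// RestorationInfo and is illustrative, not part of this interface):
+//
+//   int rcol0, rcol1, rrow0, rrow1;
+//   if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
+//                                          &rcol0, &rcol1, &rrow0, &rrow1)) {
+//     for (int rrow = rrow0; rrow < rrow1; ++rrow) {
+//       for (int rcol = rcol0; rcol < rcol1; ++rcol) {
+//         const int runit_idx = rcol + rrow * rsi->horz_units_per_tile;
+//         // ... read or update rsi->unit_info[runit_idx] ...
+//       }
+//     }
+//   }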
+int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int *rcol0, int *rcol1, int *rrow0, + int *rrow1); + +void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, + int after_cdef); +void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, + YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, + int optimized_lr, int num_planes); +void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt, + struct AV1Common *cm, int num_planes); +void av1_foreach_rest_unit_in_row( + RestorationTileLimits *limits, const AV1PixelRect *tile_rect, + rest_unit_visitor_t on_rest_unit, int row_number, int unit_size, + int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane, + void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, + sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write, + struct AV1LrSyncData *const lr_sync); +AV1PixelRect av1_whole_frame_rect(const struct AV1Common *cm, int is_uv); +int av1_lr_count_units_in_tile(int unit_size, int tile_size); +void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane); +void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c, + const int sb_cols, int plane); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_RESTORATION_H_ diff --git a/libs/libaom/src/av1/common/scale.c b/libs/libaom/src/av1/common/scale.c new file mode 100644 index 000000000..3b14c0a2c --- /dev/null +++ b/libs/libaom/src/av1/common/scale.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "av1/common/filter.h" +#include "av1/common/scale.h" +#include "aom_dsp/aom_filter.h" + +// Note: Expect val to be in q4 precision +static INLINE int scaled_x(int val, const struct scale_factors *sf) { + const int off = + (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1)); + const int64_t tval = (int64_t)val * sf->x_scale_fp + off; + return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval, + REF_SCALE_SHIFT - SCALE_EXTRA_BITS); +} + +// Note: Expect val to be in q4 precision +static INLINE int scaled_y(int val, const struct scale_factors *sf) { + const int off = + (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1)); + const int64_t tval = (int64_t)val * sf->y_scale_fp + off; + return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval, + REF_SCALE_SHIFT - SCALE_EXTRA_BITS); +} + +// Note: Expect val to be in q4 precision +static int unscaled_value(int val, const struct scale_factors *sf) { + (void)sf; + return val * (1 << SCALE_EXTRA_BITS); +} + +static int get_fixed_point_scale_factor(int other_size, int this_size) { + // Calculate scaling factor once for each reference frame + // and use fixed point scaling factors in decoding and encoding routines. + // Hardware implementations can calculate scale factor in device driver + // and use multiplication and shifting on hardware instead of division. 
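+  // Illustrative numbers, assuming REF_SCALE_SHIFT == 14 (see scale.h):
+  // scaling a 1920-wide reference to a 960-wide frame gives
+  // ((1920 << 14) + 480) / 960 == 2 << 14, i.e. 2.0 in Q14 fixed point,
+  // while equal sizes give exactly 1 << 14 (REF_NO_SCALE).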
+ return ((other_size << REF_SCALE_SHIFT) + this_size / 2) / this_size; +} + +// Given the fixed point scale, calculate coarse point scale. +static int fixed_point_scale_to_coarse_point_scale(int scale_fp) { + return ROUND_POWER_OF_TWO(scale_fp, REF_SCALE_SHIFT - SCALE_SUBPEL_BITS); +} + +// Note: x and y are integer precision, mvq4 is q4 precision. +MV32 av1_scale_mv(const MV *mvq4, int x, int y, + const struct scale_factors *sf) { + const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf); + const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf); + const MV32 res = { scaled_y((y << SUBPEL_BITS) + mvq4->row, sf) - y_off_q4, + scaled_x((x << SUBPEL_BITS) + mvq4->col, sf) - x_off_q4 }; + return res; +} + +void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, + int other_h, int this_w, int this_h) { + if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) { + sf->x_scale_fp = REF_INVALID_SCALE; + sf->y_scale_fp = REF_INVALID_SCALE; + return; + } + + sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); + sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); + + sf->x_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp); + sf->y_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp); + + if (av1_is_scaled(sf)) { + sf->scale_value_x = scaled_x; + sf->scale_value_y = scaled_y; + } else { + sf->scale_value_x = unscaled_value; + sf->scale_value_y = unscaled_value; + } + + // AV1 convolve functions + // Special case convolve functions should produce the same result as + // av1_convolve_2d. + // subpel_x_qn == 0 && subpel_y_qn == 0 + sf->convolve[0][0][0] = av1_convolve_2d_copy_sr; + // subpel_x_qn == 0 + sf->convolve[0][1][0] = av1_convolve_y_sr; + // subpel_y_qn == 0 + sf->convolve[1][0][0] = av1_convolve_x_sr; + // subpel_x_qn != 0 && subpel_y_qn != 0 + sf->convolve[1][1][0] = av1_convolve_2d_sr; + // subpel_x_qn == 0 && subpel_y_qn == 0 + sf->convolve[0][0][1] = av1_dist_wtd_convolve_2d_copy; + // subpel_x_qn == 0 + sf->convolve[0][1][1] = av1_dist_wtd_convolve_y; + // subpel_y_qn == 0 + sf->convolve[1][0][1] = av1_dist_wtd_convolve_x; + // subpel_x_qn != 0 && subpel_y_qn != 0 + sf->convolve[1][1][1] = av1_dist_wtd_convolve_2d; +#if CONFIG_AV1_HIGHBITDEPTH + // AV1 High BD convolve functions + // Special case convolve functions should produce the same result as + // av1_highbd_convolve_2d. + // subpel_x_qn == 0 && subpel_y_qn == 0 + sf->highbd_convolve[0][0][0] = av1_highbd_convolve_2d_copy_sr; + // subpel_x_qn == 0 + sf->highbd_convolve[0][1][0] = av1_highbd_convolve_y_sr; + // subpel_y_qn == 0 + sf->highbd_convolve[1][0][0] = av1_highbd_convolve_x_sr; + // subpel_x_qn != 0 && subpel_y_qn != 0 + sf->highbd_convolve[1][1][0] = av1_highbd_convolve_2d_sr; + // subpel_x_qn == 0 && subpel_y_qn == 0 + sf->highbd_convolve[0][0][1] = av1_highbd_dist_wtd_convolve_2d_copy; + // subpel_x_qn == 0 + sf->highbd_convolve[0][1][1] = av1_highbd_dist_wtd_convolve_y; + // subpel_y_qn == 0 + sf->highbd_convolve[1][0][1] = av1_highbd_dist_wtd_convolve_x; + // subpel_x_qn != 0 && subpel_y_qn != 0 + sf->highbd_convolve[1][1][1] = av1_highbd_dist_wtd_convolve_2d; +#endif +} diff --git a/libs/libaom/src/av1/common/scale.h b/libs/libaom/src/av1/common/scale.h new file mode 100644 index 000000000..16b40bde8 --- /dev/null +++ b/libs/libaom/src/av1/common/scale.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_SCALE_H_ +#define AOM_AV1_COMMON_SCALE_H_ + +#include "av1/common/convolve.h" +#include "av1/common/mv.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define SCALE_NUMERATOR 8 + +#define REF_SCALE_SHIFT 14 +#define REF_NO_SCALE (1 << REF_SCALE_SHIFT) +#define REF_INVALID_SCALE -1 + +struct scale_factors { + int x_scale_fp; // horizontal fixed point scale factor + int y_scale_fp; // vertical fixed point scale factor + int x_step_q4; + int y_step_q4; + + int (*scale_value_x)(int val, const struct scale_factors *sf); + int (*scale_value_y)(int val, const struct scale_factors *sf); + + // convolve_fn_ptr[subpel_x != 0][subpel_y != 0][is_compound] + aom_convolve_fn_t convolve[2][2][2]; + aom_highbd_convolve_fn_t highbd_convolve[2][2][2]; +}; + +MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf); + +void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, + int other_h, int this_w, int this_h); + +static INLINE int av1_is_valid_scale(const struct scale_factors *sf) { + assert(sf != NULL); + return sf->x_scale_fp != REF_INVALID_SCALE && + sf->y_scale_fp != REF_INVALID_SCALE; +} + +static INLINE int av1_is_scaled(const struct scale_factors *sf) { + assert(sf != NULL); + return av1_is_valid_scale(sf) && + (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE); +} + +static INLINE int valid_ref_frame_size(int ref_width, int ref_height, + int this_width, int this_height) { + return 2 * this_width >= ref_width && 2 * this_height >= ref_height && + this_width <= 16 * ref_width && this_height <= 16 * ref_height; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_SCALE_H_ diff --git a/libs/libaom/src/av1/common/scan.c b/libs/libaom/src/av1/common/scan.c new file mode 100644 index 000000000..c1d4f3581 --- /dev/null +++ b/libs/libaom/src/av1/common/scan.c @@ -0,0 +1,2048 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+
+#include "av1/common/common_data.h"
+#include "av1/common/scan.h"
+
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_4x4[16]) = { 0, 1, 4, 8, 5, 2, 3, 6,
+                                          9, 12, 13, 10, 7, 11, 14, 15 };
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = {
+  0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x4[16]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = {
+  0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+  17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = {
+  0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
+  2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x8[32]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = {
+  0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 4, 26, 19,
+  12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = {
+  0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x4[32]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x16[64]) = {
+  0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+  17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
+  33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
+  49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x4[64]) = {
+  0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 4, 50, 35,
+  20, 5, 51, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 8, 54, 39,
+  24, 9, 55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43,
+  28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x16[64]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x4[64]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x16[64]) = {
+  0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+  1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
+  2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
+  3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x4[64]) = {
+  0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51,
+  4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55,
+  8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+  12, 28, 44,
60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x32[256]) = { + 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, + 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, + 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, + 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, + 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, + 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, + 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, + 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116, + 123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125, + 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134, + 141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143, + 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200, + 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209, + 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, + 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227, + 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, + 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_32x8[256]) = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, + 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, + 162, 131, 100, 69, 38, 7, 225, 194, 163, 132, 101, 70, 39, 8, 226, + 195, 164, 133, 102, 71, 40, 9, 227, 196, 165, 134, 103, 72, 41, 10, + 228, 197, 166, 135, 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43, + 12, 230, 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76, + 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, 140, 109, + 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, 235, 204, 173, 142, + 111, 80, 49, 18, 236, 205, 174, 143, 112, 81, 50, 19, 237, 206, 175, + 144, 113, 82, 51, 20, 238, 207, 176, 145, 114, 83, 52, 21, 239, 208, + 177, 146, 115, 84, 53, 22, 240, 209, 178, 147, 116, 85, 54, 23, 241, + 210, 179, 148, 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25, + 243, 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, 58, + 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, 153, 122, 91, + 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, 248, 217, 186, 155, 124, + 93, 62, 31, 249, 218, 187, 156, 125, 94, 63, 250, 219, 188, 157, 126, + 95, 251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x32[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 
189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x8[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x32[256]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, + 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, + 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, + 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, + 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, + 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, + 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67, + 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, + 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52, + 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, + 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37, + 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, + 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22, + 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, + 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7, + 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, + 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x8[256]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225, + 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227, + 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229, + 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231, + 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233, + 10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235, + 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237, + 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 
143, 175, 207, 239, + 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241, + 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243, + 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245, + 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247, + 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249, + 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251, + 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253, + 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = { + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63 +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x8[64]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, + 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, + 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61, + 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x8[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = { + 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, + 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, + 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, + 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, + 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, + 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, + 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, + 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110, + 117, 124, 111, 118, 125, 119, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x8[128]) = { + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80, + 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67, + 52, 37, 22, 7, 113, 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69, + 54, 39, 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, 71, + 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, 118, 103, 88, 73, + 58, 43, 28, 13, 119, 104, 89, 74, 59, 44, 29, 14, 120, 105, 90, 75, + 60, 45, 30, 15, 121, 106, 91, 76, 61, 46, 31, 122, 107, 92, 77, 62, + 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x16[128]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, + 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, + 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, + 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, + 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, + 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, + 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, + 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x8[128]) = { + 0, 
16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113, + 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115, + 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117, + 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119, + 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121, + 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123, + 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125, + 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x16[128]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x8[128]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x32[512]) = { + 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, + 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, + 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, + 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, + 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, + 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, + 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, + 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, + 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, + 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, + 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, + 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, + 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, + 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230, + 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231, + 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232, + 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233, + 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234, + 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235, + 250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236, + 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237, + 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238, + 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239, + 254, 269, 284, 299, 314, 
329, 344, 359, 374, 389, 404, 419, 434, 449, 464, + 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465, + 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466, + 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467, + 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483, + 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335, + 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396, + 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, + 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444, + 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, + 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_32x16[512]) = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, + 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, + 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, + 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196, + 165, 134, 103, 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104, + 73, 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, + 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13, + 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14, + 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46, + 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78, + 47, 16, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110, + 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142, + 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, + 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206, + 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, 238, + 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270, + 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302, + 271, 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334, + 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366, + 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398, + 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430, + 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 493, 462, + 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 494, + 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, + 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61, + 30, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, + 62, 31, 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, + 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126, + 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500, + 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408, + 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285, + 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411, + 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, + 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, + 479, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x32[512]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 
176, 192, 208, 224, + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, + 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, + 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433, + 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, + 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402, + 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131, + 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, + 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100, + 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, + 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69, + 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, + 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38, + 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, + 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7, + 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487, + 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, + 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456, + 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, + 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425, + 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, + 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394, + 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123, + 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, + 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92, + 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, + 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61, + 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301, + 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30, + 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270, + 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, + 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479, + 495, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x16[512]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, + 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481, + 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, + 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, + 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484, + 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485, + 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, + 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487, + 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488, + 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, + 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, + 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491, + 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 
492, + 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, + 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494, + 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495, + 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, + 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, + 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498, + 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499, + 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, + 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501, + 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502, + 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, + 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, + 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505, + 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506, + 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, + 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508, + 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509, + 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, + 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x32[512]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, + 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, + 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, + 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, + 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 
405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, + 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, + 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, + 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, + 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, + 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x16[512]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, + 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, + 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, + 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, + 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, + 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, + 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, + 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, + 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, + 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = { + 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, + 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22, + 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8, + 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100, + 85, 70, 55, 40, 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131, + 146, 161, 176, 
192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27, + 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, + 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14, + 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, + 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46, + 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242, + 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 79, 94, + 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185, + 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231, + 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203, + 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235, + 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, + 255 +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x16[256]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, + 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, + 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, + 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, + 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, + 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, + 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, + 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, + 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, + 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, + 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, + 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, + 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, + 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x16[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x32[1024]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 
320, 352, 384, 416, + 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, + 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, + 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737, + 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162, + 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610, + 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35, + 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, + 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931, + 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, + 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804, + 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229, + 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677, + 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102, + 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550, + 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998, + 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, + 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871, + 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, + 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744, + 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169, + 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617, + 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42, + 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, + 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938, + 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, + 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811, + 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236, + 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684, + 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109, + 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557, + 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005, + 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, + 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878, + 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, + 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751, + 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176, + 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624, + 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49, + 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, + 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945, + 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, + 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818, + 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243, + 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691, + 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116, + 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564, + 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012, + 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, + 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885, + 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 
310, + 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758, + 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183, + 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631, + 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56, + 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, + 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952, + 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, + 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825, + 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250, + 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698, + 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123, + 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571, + 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019, + 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, + 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892, + 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, + 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765, + 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190, + 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638, + 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63, + 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, + 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959, + 991, 1023, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x32[1024]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, + 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, + 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, + 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, + 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, + 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, + 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, + 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, + 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, + 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, + 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, + 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, + 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, + 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, + 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, + 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, + 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 
389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, + 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, + 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, + 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, + 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, + 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, + 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, + 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, + 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, + 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, + 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, + 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, + 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, + 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, + 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, + 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, + 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, + 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, + 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, + 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, + 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, + 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, + 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, + 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, + 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, + 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, + 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, + 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, + 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, + 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, + 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, + 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, + 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, + 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, + 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, + 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, + 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, + 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, + 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, + 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, + 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, + 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, + 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, + 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, + 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, + 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, + 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, + 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, + 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = { + 0, 1, 32, 64, 33, 2, 3, 34, 65, 96, 128, 97, 66, + 35, 4, 5, 36, 67, 98, 129, 160, 192, 161, 130, 99, 68, + 37, 6, 7, 38, 69, 100, 131, 162, 193, 224, 256, 225, 194, + 163, 
132, 101, 70, 39, 8, 9, 40, 71, 102, 133, 164, 195, + 226, 257, 288, 320, 289, 258, 227, 196, 165, 134, 103, 72, 41, + 10, 11, 42, 73, 104, 135, 166, 197, 228, 259, 290, 321, 352, + 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, + 13, 44, 75, 106, 137, 168, 199, 230, 261, 292, 323, 354, 385, + 416, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, + 76, 45, 14, 15, 46, 77, 108, 139, 170, 201, 232, 263, 294, + 325, 356, 387, 418, 449, 480, 512, 481, 450, 419, 388, 357, 326, + 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 17, 48, 79, + 110, 141, 172, 203, 234, 265, 296, 327, 358, 389, 420, 451, 482, + 513, 544, 576, 545, 514, 483, 452, 421, 390, 359, 328, 297, 266, + 235, 204, 173, 142, 111, 80, 49, 18, 19, 50, 81, 112, 143, + 174, 205, 236, 267, 298, 329, 360, 391, 422, 453, 484, 515, 546, + 577, 608, 640, 609, 578, 547, 516, 485, 454, 423, 392, 361, 330, + 299, 268, 237, 206, 175, 144, 113, 82, 51, 20, 21, 52, 83, + 114, 145, 176, 207, 238, 269, 300, 331, 362, 393, 424, 455, 486, + 517, 548, 579, 610, 641, 672, 704, 673, 642, 611, 580, 549, 518, + 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115, + 84, 53, 22, 23, 54, 85, 116, 147, 178, 209, 240, 271, 302, + 333, 364, 395, 426, 457, 488, 519, 550, 581, 612, 643, 674, 705, + 736, 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, 458, 427, + 396, 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, + 25, 56, 87, 118, 149, 180, 211, 242, 273, 304, 335, 366, 397, + 428, 459, 490, 521, 552, 583, 614, 645, 676, 707, 738, 769, 800, + 832, 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, 491, 460, + 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, + 26, 27, 58, 89, 120, 151, 182, 213, 244, 275, 306, 337, 368, + 399, 430, 461, 492, 523, 554, 585, 616, 647, 678, 709, 740, 771, + 802, 833, 864, 896, 865, 834, 803, 772, 741, 710, 679, 648, 617, + 586, 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214, + 183, 152, 121, 90, 59, 28, 29, 60, 91, 122, 153, 184, 215, + 246, 277, 308, 339, 370, 401, 432, 463, 494, 525, 556, 587, 618, + 649, 680, 711, 742, 773, 804, 835, 866, 897, 928, 960, 929, 898, + 867, 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, 526, 495, + 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, + 61, 30, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341, + 372, 403, 434, 465, 496, 527, 558, 589, 620, 651, 682, 713, 744, + 775, 806, 837, 868, 899, 930, 961, 992, 993, 962, 931, 900, 869, + 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 528, 497, 466, + 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63, + 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467, + 498, 529, 560, 591, 622, 653, 684, 715, 746, 777, 808, 839, 870, + 901, 932, 963, 994, 995, 964, 933, 902, 871, 840, 809, 778, 747, + 716, 685, 654, 623, 592, 561, 530, 499, 468, 437, 406, 375, 344, + 313, 282, 251, 220, 189, 158, 127, 159, 190, 221, 252, 283, 314, + 345, 376, 407, 438, 469, 500, 531, 562, 593, 624, 655, 686, 717, + 748, 779, 810, 841, 872, 903, 934, 965, 996, 997, 966, 935, 904, + 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, 532, 501, + 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 223, 254, 285, + 316, 347, 378, 409, 440, 471, 502, 533, 564, 595, 626, 657, 688, + 719, 750, 781, 812, 843, 874, 905, 936, 967, 998, 999, 968, 937, + 906, 875, 844, 813, 782, 751, 720, 689, 658, 627, 596, 565, 534, + 503, 472, 441, 410, 379, 348, 317, 286, 255, 287, 318, 349, 380, + 411, 442, 473, 504, 535, 566, 597, 628, 659, 690, 721, 752, 783, + 814, 845, 876, 
907, 938, 969, 1000, 1001, 970, 939, 908, 877, 846, + 815, 784, 753, 722, 691, 660, 629, 598, 567, 536, 505, 474, 443, + 412, 381, 350, 319, 351, 382, 413, 444, 475, 506, 537, 568, 599, + 630, 661, 692, 723, 754, 785, 816, 847, 878, 909, 940, 971, 1002, + 1003, 972, 941, 910, 879, 848, 817, 786, 755, 724, 693, 662, 631, + 600, 569, 538, 507, 476, 445, 414, 383, 415, 446, 477, 508, 539, + 570, 601, 632, 663, 694, 725, 756, 787, 818, 849, 880, 911, 942, + 973, 1004, 1005, 974, 943, 912, 881, 850, 819, 788, 757, 726, 695, + 664, 633, 602, 571, 540, 509, 478, 447, 479, 510, 541, 572, 603, + 634, 665, 696, 727, 758, 789, 820, 851, 882, 913, 944, 975, 1006, + 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, 666, 635, + 604, 573, 542, 511, 543, 574, 605, 636, 667, 698, 729, 760, 791, + 822, 853, 884, 915, 946, 977, 1008, 1009, 978, 947, 916, 885, 854, + 823, 792, 761, 730, 699, 668, 637, 606, 575, 607, 638, 669, 700, + 731, 762, 793, 824, 855, 886, 917, 948, 979, 1010, 1011, 980, 949, + 918, 887, 856, 825, 794, 763, 732, 701, 670, 639, 671, 702, 733, + 764, 795, 826, 857, 888, 919, 950, 981, 1012, 1013, 982, 951, 920, + 889, 858, 827, 796, 765, 734, 703, 735, 766, 797, 828, 859, 890, + 921, 952, 983, 1014, 1015, 984, 953, 922, 891, 860, 829, 798, 767, + 799, 830, 861, 892, 923, 954, 985, 1016, 1017, 986, 955, 924, 893, + 862, 831, 863, 894, 925, 956, 987, 1018, 1019, 988, 957, 926, 895, + 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023 +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_default_iscan_4x4[16]) = { 0, 1, 5, 6, 2, 4, 7, 12, + 3, 8, 11, 13, 9, 10, 14, 15 }; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = { + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x4[16]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x8[32]) = { + 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18, + 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 29, 25, 28, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x8[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x8[32]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x4[32]) = { + 0, 2, 5, 9, 13, 17, 21, 25, 1, 4, 8, 12, 16, 20, 24, 28, + 3, 7, 11, 15, 19, 23, 27, 30, 6, 10, 14, 18, 22, 26, 29, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x4[32]) = { + 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29, + 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x4[32]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x16[64]) = { + 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18, + 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 30, 25, 28, 31, 34, + 29, 32, 35, 38, 33, 36, 39, 42, 37, 40, 43, 46, 41, 44, 47, 50, + 45, 48, 51, 54, 49, 52, 55, 58, 53, 56, 59, 61, 57, 60, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x4[64]) = { + 0, 2, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, + 1, 4, 
8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, + 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 62, + 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 61, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x16[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x4[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x16[64]) = { + 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, + 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, + 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, + 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x4[64]) = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, + 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, + 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, + 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x32[256]) = { + 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, + 36, 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38, + 45, 52, 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47, + 54, 61, 68, 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56, + 63, 70, 77, 84, 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65, + 72, 79, 86, 93, 100, 59, 66, 73, 80, 87, 94, 101, 108, 67, 74, + 81, 88, 95, 102, 109, 116, 75, 82, 89, 96, 103, 110, 117, 124, 83, + 90, 97, 104, 111, 118, 125, 132, 91, 98, 105, 112, 119, 126, 133, 140, + 99, 106, 113, 120, 127, 134, 141, 148, 107, 114, 121, 128, 135, 142, 149, + 156, 115, 122, 129, 136, 143, 150, 157, 164, 123, 130, 137, 144, 151, 158, + 165, 172, 131, 138, 145, 152, 159, 166, 173, 180, 139, 146, 153, 160, 167, + 174, 181, 188, 147, 154, 161, 168, 175, 182, 189, 196, 155, 162, 169, 176, + 183, 190, 197, 204, 163, 170, 177, 184, 191, 198, 205, 212, 171, 178, 185, + 192, 199, 206, 213, 220, 179, 186, 193, 200, 207, 214, 221, 228, 187, 194, + 201, 208, 215, 222, 229, 235, 195, 202, 209, 216, 223, 230, 236, 241, 203, + 210, 217, 224, 231, 237, 242, 246, 211, 218, 225, 232, 238, 243, 247, 250, + 219, 226, 233, 239, 244, 248, 251, 253, 227, 234, 240, 245, 249, 252, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x8[256]) = { + 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91, + 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211, + 219, 227, 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82, + 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, + 210, 218, 226, 234, 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73, + 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, + 201, 209, 217, 225, 233, 240, 6, 11, 17, 24, 32, 40, 48, 56, 64, + 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 239, 245, 10, 16, 23, 31, 39, 47, 55, + 63, 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 
151, 159, 167, 175, + 183, 191, 199, 207, 215, 223, 231, 238, 244, 249, 15, 22, 30, 38, 46, + 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166, + 174, 182, 190, 198, 206, 214, 222, 230, 237, 243, 248, 252, 21, 29, 37, + 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, + 165, 173, 181, 189, 197, 205, 213, 221, 229, 236, 242, 247, 251, 254, 28, + 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, + 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 235, 241, 246, 250, 253, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x32[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x8[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x32[256]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225, + 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227, + 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229, + 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231, + 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233, + 
10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235, + 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237, + 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239, + 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241, + 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243, + 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245, + 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247, + 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249, + 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251, + 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253, + 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x8[256]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, + 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, + 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, + 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, + 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, + 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, + 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67, + 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, + 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52, + 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, + 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37, + 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, + 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22, + 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, + 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7, + 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, + 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x8[64]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, + 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, + 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61, + 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x8[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x8[64]) = { + 0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, + 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53, + 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60, + 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63 +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = { + 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, 36, + 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38, 45, 52, + 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47, 54, 61, 68, + 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56, 63, 70, 77, 84, + 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65, 72, 79, 86, 93, 100, + 59, 66, 73, 80, 87, 94, 101, 107, 
67, 74, 81, 88, 95, 102, 108, 113, + 75, 82, 89, 96, 103, 109, 114, 118, 83, 90, 97, 104, 110, 115, 119, 122, + 91, 98, 105, 111, 116, 120, 123, 125, 99, 106, 112, 117, 121, 124, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x8[128]) = { + 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, + 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, + 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 112, + 6, 11, 17, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 111, 117, + 10, 16, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 110, 116, 121, + 15, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 109, 115, 120, 124, + 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 108, 114, 119, 123, 126, + 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 107, 113, 118, 122, 125, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x16[128]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113, + 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115, + 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117, + 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119, + 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121, + 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123, + 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125, + 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x8[128]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, + 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, + 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, + 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, + 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, + 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, + 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, + 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x16[128]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x8[128]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x32[512]) = { + 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, + 120, 2, 4, 7, 11, 16, 22, 29, 37, 46, 56, 67, 79, 92, 106, + 121, 136, 5, 8, 12, 17, 23, 
30, 38, 47, 57, 68, 80, 93, 107, + 122, 137, 152, 9, 13, 18, 24, 31, 39, 48, 58, 69, 81, 94, 108, + 123, 138, 153, 168, 14, 19, 25, 32, 40, 49, 59, 70, 82, 95, 109, + 124, 139, 154, 169, 184, 20, 26, 33, 41, 50, 60, 71, 83, 96, 110, + 125, 140, 155, 170, 185, 200, 27, 34, 42, 51, 61, 72, 84, 97, 111, + 126, 141, 156, 171, 186, 201, 216, 35, 43, 52, 62, 73, 85, 98, 112, + 127, 142, 157, 172, 187, 202, 217, 232, 44, 53, 63, 74, 86, 99, 113, + 128, 143, 158, 173, 188, 203, 218, 233, 248, 54, 64, 75, 87, 100, 114, + 129, 144, 159, 174, 189, 204, 219, 234, 249, 264, 65, 76, 88, 101, 115, + 130, 145, 160, 175, 190, 205, 220, 235, 250, 265, 280, 77, 89, 102, 116, + 131, 146, 161, 176, 191, 206, 221, 236, 251, 266, 281, 296, 90, 103, 117, + 132, 147, 162, 177, 192, 207, 222, 237, 252, 267, 282, 297, 312, 104, 118, + 133, 148, 163, 178, 193, 208, 223, 238, 253, 268, 283, 298, 313, 328, 119, + 134, 149, 164, 179, 194, 209, 224, 239, 254, 269, 284, 299, 314, 329, 344, + 135, 150, 165, 180, 195, 210, 225, 240, 255, 270, 285, 300, 315, 330, 345, + 360, 151, 166, 181, 196, 211, 226, 241, 256, 271, 286, 301, 316, 331, 346, + 361, 376, 167, 182, 197, 212, 227, 242, 257, 272, 287, 302, 317, 332, 347, + 362, 377, 392, 183, 198, 213, 228, 243, 258, 273, 288, 303, 318, 333, 348, + 363, 378, 393, 407, 199, 214, 229, 244, 259, 274, 289, 304, 319, 334, 349, + 364, 379, 394, 408, 421, 215, 230, 245, 260, 275, 290, 305, 320, 335, 350, + 365, 380, 395, 409, 422, 434, 231, 246, 261, 276, 291, 306, 321, 336, 351, + 366, 381, 396, 410, 423, 435, 446, 247, 262, 277, 292, 307, 322, 337, 352, + 367, 382, 397, 411, 424, 436, 447, 457, 263, 278, 293, 308, 323, 338, 353, + 368, 383, 398, 412, 425, 437, 448, 458, 467, 279, 294, 309, 324, 339, 354, + 369, 384, 399, 413, 426, 438, 449, 459, 468, 476, 295, 310, 325, 340, 355, + 370, 385, 400, 414, 427, 439, 450, 460, 469, 477, 484, 311, 326, 341, 356, + 371, 386, 401, 415, 428, 440, 451, 461, 470, 478, 485, 491, 327, 342, 357, + 372, 387, 402, 416, 429, 441, 452, 462, 471, 479, 486, 492, 497, 343, 358, + 373, 388, 403, 417, 430, 442, 453, 463, 472, 480, 487, 493, 498, 502, 359, + 374, 389, 404, 418, 431, 443, 454, 464, 473, 481, 488, 494, 499, 503, 506, + 375, 390, 405, 419, 432, 444, 455, 465, 474, 482, 489, 495, 500, 504, 507, + 509, 391, 406, 420, 433, 445, 456, 466, 475, 483, 490, 496, 501, 505, 508, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x16[512]) = { + 0, 2, 5, 9, 14, 20, 27, 35, 44, 54, 65, 77, 90, 104, 119, + 135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359, + 375, 391, 1, 4, 8, 13, 19, 26, 34, 43, 53, 64, 76, 89, 103, + 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342, + 358, 374, 390, 406, 3, 7, 12, 18, 25, 33, 42, 52, 63, 75, 88, + 102, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325, + 341, 357, 373, 389, 405, 420, 6, 11, 17, 24, 32, 41, 51, 62, 74, + 87, 101, 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, + 324, 340, 356, 372, 388, 404, 419, 433, 10, 16, 23, 31, 40, 50, 61, + 73, 86, 100, 115, 131, 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, + 307, 323, 339, 355, 371, 387, 403, 418, 432, 445, 15, 22, 30, 39, 49, + 60, 72, 85, 99, 114, 130, 146, 162, 178, 194, 210, 226, 242, 258, 274, + 290, 306, 322, 338, 354, 370, 386, 402, 417, 431, 444, 456, 21, 29, 38, + 48, 59, 71, 84, 98, 113, 129, 145, 161, 177, 193, 209, 225, 241, 257, + 273, 289, 305, 321, 337, 353, 369, 385, 401, 416, 430, 443, 455, 466, 28, + 37, 47, 58, 70, 83, 97, 
112, 128, 144, 160, 176, 192, 208, 224, 240, + 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 415, 429, 442, 454, 465, + 475, 36, 46, 57, 69, 82, 96, 111, 127, 143, 159, 175, 191, 207, 223, + 239, 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 414, 428, 441, 453, + 464, 474, 483, 45, 56, 68, 81, 95, 110, 126, 142, 158, 174, 190, 206, + 222, 238, 254, 270, 286, 302, 318, 334, 350, 366, 382, 398, 413, 427, 440, + 452, 463, 473, 482, 490, 55, 67, 80, 94, 109, 125, 141, 157, 173, 189, + 205, 221, 237, 253, 269, 285, 301, 317, 333, 349, 365, 381, 397, 412, 426, + 439, 451, 462, 472, 481, 489, 496, 66, 79, 93, 108, 124, 140, 156, 172, + 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, 348, 364, 380, 396, 411, + 425, 438, 450, 461, 471, 480, 488, 495, 501, 78, 92, 107, 123, 139, 155, + 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, 379, 395, + 410, 424, 437, 449, 460, 470, 479, 487, 494, 500, 505, 91, 106, 122, 138, + 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, + 394, 409, 423, 436, 448, 459, 469, 478, 486, 493, 499, 504, 508, 105, 121, + 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, + 377, 393, 408, 422, 435, 447, 458, 468, 477, 485, 492, 498, 503, 507, 510, + 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344, + 360, 376, 392, 407, 421, 434, 446, 457, 467, 476, 484, 491, 497, 502, 506, + 509, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x32[512]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, + 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481, + 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, + 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, + 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484, + 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485, + 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, + 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487, + 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488, + 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, + 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, + 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491, + 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492, + 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, + 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494, + 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495, + 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, + 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, + 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498, + 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499, + 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, + 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501, + 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502, + 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, + 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, + 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505, + 26, 58, 
90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506, + 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, + 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508, + 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509, + 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, + 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x16[512]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, + 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, + 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433, + 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, + 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402, + 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131, + 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, + 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100, + 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, + 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69, + 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, + 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38, + 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, + 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7, + 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487, + 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, + 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456, + 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, + 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425, + 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, + 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394, + 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123, + 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, + 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92, + 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, + 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61, + 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301, + 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30, + 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270, + 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, + 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479, + 495, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x32[512]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 
101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, + 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, + 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, + 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, + 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, + 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, + 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, + 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, + 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, + 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x16[512]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 
286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, + 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, + 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, + 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, + 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, + 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, + 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, + 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, + 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, + 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x16[256]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, + 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, + 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, + 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, + 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, + 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, + 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, + 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, + 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, + 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, + 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, + 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, + 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, + 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x16[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 
222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x16[256]) = { + 0, 1, 5, 6, 14, 15, 27, 28, 44, 45, 65, 66, 90, 91, 119, + 120, 2, 4, 7, 13, 16, 26, 29, 43, 46, 64, 67, 89, 92, 118, + 121, 150, 3, 8, 12, 17, 25, 30, 42, 47, 63, 68, 88, 93, 117, + 122, 149, 151, 9, 11, 18, 24, 31, 41, 48, 62, 69, 87, 94, 116, + 123, 148, 152, 177, 10, 19, 23, 32, 40, 49, 61, 70, 86, 95, 115, + 124, 147, 153, 176, 178, 20, 22, 33, 39, 50, 60, 71, 85, 96, 114, + 125, 146, 154, 175, 179, 200, 21, 34, 38, 51, 59, 72, 84, 97, 113, + 126, 145, 155, 174, 180, 199, 201, 35, 37, 52, 58, 73, 83, 98, 112, + 127, 144, 156, 173, 181, 198, 202, 219, 36, 53, 57, 74, 82, 99, 111, + 128, 143, 157, 172, 182, 197, 203, 218, 220, 54, 56, 75, 81, 100, 110, + 129, 142, 158, 171, 183, 196, 204, 217, 221, 234, 55, 76, 80, 101, 109, + 130, 141, 159, 170, 184, 195, 205, 216, 222, 233, 235, 77, 79, 102, 108, + 131, 140, 160, 169, 185, 194, 206, 215, 223, 232, 236, 245, 78, 103, 107, + 132, 139, 161, 168, 186, 193, 207, 214, 224, 231, 237, 244, 246, 104, 106, + 133, 138, 162, 167, 187, 192, 208, 213, 225, 230, 238, 243, 247, 252, 105, + 134, 137, 163, 166, 188, 191, 209, 212, 226, 229, 239, 242, 248, 251, 253, + 135, 136, 164, 165, 189, 190, 210, 211, 227, 228, 240, 241, 249, 250, 254, + 255 +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x32[1024]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, + 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, + 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, + 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737, + 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162, + 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610, + 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35, + 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, + 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931, + 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, + 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804, + 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229, + 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677, + 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102, + 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550, + 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998, + 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, + 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871, + 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, + 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744, + 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169, + 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617, + 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42, + 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, + 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938, + 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, + 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811, + 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236, + 268, 300, 332, 364, 396, 428, 460, 492, 
524, 556, 588, 620, 652, 684, + 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109, + 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557, + 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005, + 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, + 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878, + 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, + 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751, + 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176, + 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624, + 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49, + 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, + 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945, + 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, + 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818, + 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243, + 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691, + 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116, + 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564, + 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012, + 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, + 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885, + 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, + 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758, + 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183, + 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631, + 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56, + 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, + 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952, + 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, + 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825, + 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250, + 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698, + 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123, + 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571, + 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019, + 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, + 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892, + 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, + 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765, + 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190, + 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638, + 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63, + 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, + 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959, + 991, 1023, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x32[1024]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + 65, 66, 67, 68, 69, 70, 71, 72, 73, 
74, 75, 76, 77, + 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, + 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, + 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, + 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, + 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, + 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, + 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, + 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, + 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, + 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, + 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, + 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, + 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, + 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, + 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, + 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, + 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, + 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, + 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, + 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, + 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, + 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, + 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, + 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, + 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, + 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, + 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, + 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, + 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, + 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, + 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, + 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, + 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, + 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, + 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, + 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, + 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, + 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, + 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, + 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, + 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, + 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, + 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, + 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, + 767, 
768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, + 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, + 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, + 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, + 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, + 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, + 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, + 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, + 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, + 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, + 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, + 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, + 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, + 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, + 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, + 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, + 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, + 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, + 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, + 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x32[1024]) = { + 0, 1, 5, 6, 14, 15, 27, 28, 44, 45, 65, 66, 90, + 91, 119, 120, 152, 153, 189, 190, 230, 231, 275, 276, 324, 325, + 377, 378, 434, 435, 495, 496, 2, 4, 7, 13, 16, 26, 29, + 43, 46, 64, 67, 89, 92, 118, 121, 151, 154, 188, 191, 229, + 232, 274, 277, 323, 326, 376, 379, 433, 436, 494, 497, 558, 3, + 8, 12, 17, 25, 30, 42, 47, 63, 68, 88, 93, 117, 122, + 150, 155, 187, 192, 228, 233, 273, 278, 322, 327, 375, 380, 432, + 437, 493, 498, 557, 559, 9, 11, 18, 24, 31, 41, 48, 62, + 69, 87, 94, 116, 123, 149, 156, 186, 193, 227, 234, 272, 279, + 321, 328, 374, 381, 431, 438, 492, 499, 556, 560, 617, 10, 19, + 23, 32, 40, 49, 61, 70, 86, 95, 115, 124, 148, 157, 185, + 194, 226, 235, 271, 280, 320, 329, 373, 382, 430, 439, 491, 500, + 555, 561, 616, 618, 20, 22, 33, 39, 50, 60, 71, 85, 96, + 114, 125, 147, 158, 184, 195, 225, 236, 270, 281, 319, 330, 372, + 383, 429, 440, 490, 501, 554, 562, 615, 619, 672, 21, 34, 38, + 51, 59, 72, 84, 97, 113, 126, 146, 159, 183, 196, 224, 237, + 269, 282, 318, 331, 371, 384, 428, 441, 489, 502, 553, 563, 614, + 620, 671, 673, 35, 37, 52, 58, 73, 83, 98, 112, 127, 145, + 160, 182, 197, 223, 238, 268, 283, 317, 332, 370, 385, 427, 442, + 488, 503, 552, 564, 613, 621, 670, 674, 723, 36, 53, 57, 74, + 82, 99, 111, 128, 144, 161, 181, 198, 222, 239, 267, 284, 316, + 333, 369, 386, 426, 443, 487, 504, 551, 565, 612, 622, 669, 675, + 722, 724, 54, 56, 75, 81, 100, 110, 129, 143, 162, 180, 199, + 221, 240, 266, 285, 315, 334, 368, 387, 425, 444, 486, 505, 550, + 566, 611, 623, 668, 676, 721, 725, 770, 55, 76, 80, 101, 109, + 130, 142, 163, 179, 200, 220, 241, 265, 286, 314, 335, 367, 388, + 424, 445, 485, 506, 549, 567, 610, 624, 667, 677, 720, 726, 769, + 771, 77, 79, 102, 108, 131, 141, 164, 178, 201, 219, 242, 264, + 287, 313, 336, 366, 389, 423, 446, 484, 507, 548, 568, 609, 625, + 666, 678, 719, 727, 768, 772, 813, 78, 103, 107, 132, 140, 165, + 177, 202, 218, 243, 263, 288, 312, 337, 365, 390, 422, 447, 483, + 508, 547, 569, 608, 626, 665, 679, 718, 728, 767, 773, 812, 814, + 104, 106, 133, 139, 166, 176, 203, 217, 244, 262, 289, 311, 338, + 364, 391, 421, 448, 482, 
509, 546, 570, 607, 627, 664, 680, 717, + 729, 766, 774, 811, 815, 852, 105, 134, 138, 167, 175, 204, 216, + 245, 261, 290, 310, 339, 363, 392, 420, 449, 481, 510, 545, 571, + 606, 628, 663, 681, 716, 730, 765, 775, 810, 816, 851, 853, 135, + 137, 168, 174, 205, 215, 246, 260, 291, 309, 340, 362, 393, 419, + 450, 480, 511, 544, 572, 605, 629, 662, 682, 715, 731, 764, 776, + 809, 817, 850, 854, 887, 136, 169, 173, 206, 214, 247, 259, 292, + 308, 341, 361, 394, 418, 451, 479, 512, 543, 573, 604, 630, 661, + 683, 714, 732, 763, 777, 808, 818, 849, 855, 886, 888, 170, 172, + 207, 213, 248, 258, 293, 307, 342, 360, 395, 417, 452, 478, 513, + 542, 574, 603, 631, 660, 684, 713, 733, 762, 778, 807, 819, 848, + 856, 885, 889, 918, 171, 208, 212, 249, 257, 294, 306, 343, 359, + 396, 416, 453, 477, 514, 541, 575, 602, 632, 659, 685, 712, 734, + 761, 779, 806, 820, 847, 857, 884, 890, 917, 919, 209, 211, 250, + 256, 295, 305, 344, 358, 397, 415, 454, 476, 515, 540, 576, 601, + 633, 658, 686, 711, 735, 760, 780, 805, 821, 846, 858, 883, 891, + 916, 920, 945, 210, 251, 255, 296, 304, 345, 357, 398, 414, 455, + 475, 516, 539, 577, 600, 634, 657, 687, 710, 736, 759, 781, 804, + 822, 845, 859, 882, 892, 915, 921, 944, 946, 252, 254, 297, 303, + 346, 356, 399, 413, 456, 474, 517, 538, 578, 599, 635, 656, 688, + 709, 737, 758, 782, 803, 823, 844, 860, 881, 893, 914, 922, 943, + 947, 968, 253, 298, 302, 347, 355, 400, 412, 457, 473, 518, 537, + 579, 598, 636, 655, 689, 708, 738, 757, 783, 802, 824, 843, 861, + 880, 894, 913, 923, 942, 948, 967, 969, 299, 301, 348, 354, 401, + 411, 458, 472, 519, 536, 580, 597, 637, 654, 690, 707, 739, 756, + 784, 801, 825, 842, 862, 879, 895, 912, 924, 941, 949, 966, 970, + 987, 300, 349, 353, 402, 410, 459, 471, 520, 535, 581, 596, 638, + 653, 691, 706, 740, 755, 785, 800, 826, 841, 863, 878, 896, 911, + 925, 940, 950, 965, 971, 986, 988, 350, 352, 403, 409, 460, 470, + 521, 534, 582, 595, 639, 652, 692, 705, 741, 754, 786, 799, 827, + 840, 864, 877, 897, 910, 926, 939, 951, 964, 972, 985, 989, 1002, + 351, 404, 408, 461, 469, 522, 533, 583, 594, 640, 651, 693, 704, + 742, 753, 787, 798, 828, 839, 865, 876, 898, 909, 927, 938, 952, + 963, 973, 984, 990, 1001, 1003, 405, 407, 462, 468, 523, 532, 584, + 593, 641, 650, 694, 703, 743, 752, 788, 797, 829, 838, 866, 875, + 899, 908, 928, 937, 953, 962, 974, 983, 991, 1000, 1004, 1013, 406, + 463, 467, 524, 531, 585, 592, 642, 649, 695, 702, 744, 751, 789, + 796, 830, 837, 867, 874, 900, 907, 929, 936, 954, 961, 975, 982, + 992, 999, 1005, 1012, 1014, 464, 466, 525, 530, 586, 591, 643, 648, + 696, 701, 745, 750, 790, 795, 831, 836, 868, 873, 901, 906, 930, + 935, 955, 960, 976, 981, 993, 998, 1006, 1011, 1015, 1020, 465, 526, + 529, 587, 590, 644, 647, 697, 700, 746, 749, 791, 794, 832, 835, + 869, 872, 902, 905, 931, 934, 956, 959, 977, 980, 994, 997, 1007, + 1010, 1016, 1019, 1021, 527, 528, 588, 589, 645, 646, 698, 699, 747, + 748, 792, 793, 833, 834, 870, 871, 903, 904, 932, 933, 957, 958, + 978, 979, 995, 996, 1008, 1009, 1017, 1018, 1022, 1023 +}; + +const SCAN_ORDER av1_default_scan_orders[TX_SIZES] = { + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. 
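+ // More precisely, only the top-left 32x32 quadrant of a 64x64
+ // transform can hold nonzero coefficients, so the 1024-entry 32x32
+ // tables reach every coded position. In each SCAN_ORDER pair, scan[]
+ // maps a scan position to the coefficient's raster index within the
+ // transform block, and iscan[] is the inverse permutation, i.e.
+ // iscan[scan[i]] == i for every i. A minimal sketch of a coefficient
+ // pass in scan order (process_coeff() is a hypothetical helper, not
+ // part of this file):
+ //   const SCAN_ORDER *so = &av1_default_scan_orders[tx_size];
+ //   for (int i = 0; i < n_coeffs; ++i) process_coeff(qcoeff[so->scan[i]]);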
+ { default_scan_32x32, av1_default_iscan_32x32 }, +}; + +const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES] = { + { + // TX_4X4 + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, + { mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, + { mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, + }, + { + // TX_8X8 + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, + }, + { + // TX_16X16 + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, av1_mcol_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, av1_mcol_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, av1_mcol_iscan_16x16 }, + }, + { + // TX_32X32 + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + }, + { + // TX_64X64 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. 
+ { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + }, + { + // TX_4X8 + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { mrow_scan_4x8, av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, + { mrow_scan_4x8, av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, + { mrow_scan_4x8, av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, + }, + { + // TX_8X4 + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, av1_mcol_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, av1_mcol_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, av1_mcol_iscan_8x4 }, + }, + { + // TX_8X16 + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, + }, + { + // TX_16X8 + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { 
mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, + { mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, + { mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, + }, + { + // TX_16X32 + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + }, + { + // TX_32X16 + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + }, + { + // TX_32X64 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + }, + { + // TX_64X32 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. 
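+    // For the rectangular tx64 sizes, the 64-point dimension is truncated
+    // to 32, leaving a square 32x32 region of retained coefficients.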
+ { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + }, + { + // TX_4X16 + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, + }, + { + // TX_16X4 + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, + }, + { + // TX_8X32 + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, + }, + { + // TX_32X8 + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + 
{ default_scan_32x8, av1_default_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, + }, + { + // TX_16X64 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + }, + { + // TX_64X16 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + }, +}; diff --git a/libs/libaom/src/av1/common/scan.h b/libs/libaom/src/av1/common/scan.h new file mode 100644 index 000000000..d9620e1c5 --- /dev/null +++ b/libs/libaom/src/av1/common/scan.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_COMMON_SCAN_H_
+#define AOM_AV1_COMMON_SCAN_H_
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_NEIGHBORS 2
+
+enum {
+  SCAN_MODE_ZIG_ZAG,
+  SCAN_MODE_COL_DIAG,
+  SCAN_MODE_ROW_DIAG,
+  SCAN_MODE_COL_1D,
+  SCAN_MODE_ROW_1D,
+  SCAN_MODES
+} UENUM1BYTE(SCAN_MODE);
+
+extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES];
+extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES];
+
+void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd);
+
+static INLINE const SCAN_ORDER *get_default_scan(TX_SIZE tx_size,
+                                                 TX_TYPE tx_type) {
+  return &av1_scan_orders[tx_size][tx_type];
+}
+
+static INLINE const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
+  return get_default_scan(tx_size, tx_type);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_SCAN_H_
diff --git a/libs/libaom/src/av1/common/seg_common.c b/libs/libaom/src/av1/common/seg_common.c
new file mode 100644
index 000000000..60b185161
--- /dev/null
+++ b/libs/libaom/src/av1/common/seg_common.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/blockd.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/quant_common.h"
+
+static const int seg_feature_data_signed[SEG_LVL_MAX] = {
+  1, 1, 1, 1, 1, 0, 0, 0
+};
+
+static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ,
+                                                       MAX_LOOP_FILTER,
+                                                       MAX_LOOP_FILTER,
+                                                       MAX_LOOP_FILTER,
+                                                       MAX_LOOP_FILTER,
+                                                       7,
+                                                       0,
+                                                       0 };
+
+// These functions provide access to new segment level features.
+// Eventually these functions may be "optimized out" but for the moment,
+// the coding mechanism is still subject to change so these provide a
+// convenient single point of change.
+
+void av1_clearall_segfeatures(struct segmentation *seg) {
+  av1_zero(seg->feature_data);
+  av1_zero(seg->feature_mask);
+}
+
+void av1_calculate_segdata(struct segmentation *seg) {
+  seg->segid_preskip = 0;
+  seg->last_active_segid = 0;
+  for (int i = 0; i < MAX_SEGMENTS; i++) {
+    for (int j = 0; j < SEG_LVL_MAX; j++) {
+      if (seg->feature_mask[i] & (1 << j)) {
+        seg->segid_preskip |= (j >= SEG_LVL_REF_FRAME);
+        seg->last_active_segid = i;
+      }
+    }
+  }
+}
+
+void av1_enable_segfeature(struct segmentation *seg, int segment_id,
+                           SEG_LVL_FEATURES feature_id) {
+  seg->feature_mask[segment_id] |= 1 << feature_id;
+}
+
+int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id) {
+  return seg_feature_data_max[feature_id];
+}
+
+int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
+  return seg_feature_data_signed[feature_id];
+}
+
+// The 'seg_data' given for each segment can be either deltas (from the default
+// value chosen for the frame) or absolute values.
+//
+// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for
+// SEGMENT_ALT_LF).
+// Valid range for delta values is (+/-127 for MB_LVL_ALT_Q), (+/-63 for
+// SEGMENT_ALT_LF).
+//
+// abs_delta = SEGMENT_DELTADATA (the data are deltas);
+// abs_delta = SEGMENT_ABSDATA (use the absolute values given).
+
+void av1_set_segdata(struct segmentation *seg, int segment_id,
+                     SEG_LVL_FEATURES feature_id, int seg_data) {
+  if (seg_data < 0) {
+    assert(seg_feature_data_signed[feature_id]);
+    assert(-seg_data <= seg_feature_data_max[feature_id]);
+  } else {
+    assert(seg_data <= seg_feature_data_max[feature_id]);
+  }
+
+  seg->feature_data[segment_id][feature_id] = seg_data;
+}
+
+// TBD? Functions to read and write segment data with range / validity checking
diff --git a/libs/libaom/src/av1/common/seg_common.h b/libs/libaom/src/av1/common/seg_common.h
new file mode 100644
index 000000000..aeb9c1768
--- /dev/null
+++ b/libs/libaom/src/av1/common/seg_common.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_SEG_COMMON_H_
+#define AOM_AV1_COMMON_SEG_COMMON_H_
+
+#include "aom_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_SEGMENTS 8
+#define SEG_TREE_PROBS (MAX_SEGMENTS - 1)
+
+#define SEG_TEMPORAL_PRED_CTXS 3
+#define SPATIAL_PREDICTION_PROBS 3
+
+enum {
+  SEG_LVL_ALT_Q,       // Use alternate quantizer
+  SEG_LVL_ALT_LF_Y_V,  // Use alternate loop filter value on y plane vertical
+  SEG_LVL_ALT_LF_Y_H,  // Use alternate loop filter value on y plane horizontal
+  SEG_LVL_ALT_LF_U,    // Use alternate loop filter value on u plane
+  SEG_LVL_ALT_LF_V,    // Use alternate loop filter value on v plane
+  SEG_LVL_REF_FRAME,   // Optional segment reference frame
+  SEG_LVL_SKIP,        // Optional segment (0,0) + skip mode
+  SEG_LVL_GLOBALMV,
+  SEG_LVL_MAX
+} UENUM1BYTE(SEG_LVL_FEATURES);
+
+struct segmentation {
+  uint8_t enabled;
+  uint8_t update_map;
+  uint8_t update_data;
+  uint8_t temporal_update;
+
+  int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
+  unsigned int feature_mask[MAX_SEGMENTS];
+  int last_active_segid;  // The highest numbered segment id that has some
+                          // enabled feature.
+  uint8_t segid_preskip;  // Whether the segment id will be read before the
+                          // skip syntax element.
+                          // 1: the segment id will be read first.
+                          // 0: the skip syntax element will be read first.
+}; + +struct segmentation_probs { + aom_cdf_prob tree_cdf[CDF_SIZE(MAX_SEGMENTS)]; + aom_cdf_prob pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)]; + aom_cdf_prob spatial_pred_seg_cdf[SPATIAL_PREDICTION_PROBS] + [CDF_SIZE(MAX_SEGMENTS)]; +}; + +static INLINE int segfeature_active(const struct segmentation *seg, + int segment_id, + SEG_LVL_FEATURES feature_id) { + return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id)); +} + +static INLINE void segfeatures_copy(struct segmentation *dst, + const struct segmentation *src) { + int i, j; + for (i = 0; i < MAX_SEGMENTS; i++) { + dst->feature_mask[i] = src->feature_mask[i]; + for (j = 0; j < SEG_LVL_MAX; j++) { + dst->feature_data[i][j] = src->feature_data[i][j]; + } + } + dst->segid_preskip = src->segid_preskip; + dst->last_active_segid = src->last_active_segid; +} + +void av1_clearall_segfeatures(struct segmentation *seg); + +void av1_enable_segfeature(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id); + +void av1_calculate_segdata(struct segmentation *seg); + +int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id); + +int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id); + +void av1_set_segdata(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id, int seg_data); + +static INLINE int get_segdata(const struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + return seg->feature_data[segment_id][feature_id]; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_SEG_COMMON_H_ diff --git a/libs/libaom/src/av1/common/thread_common.c b/libs/libaom/src/av1/common/thread_common.c new file mode 100644 index 000000000..f3c8795f8 --- /dev/null +++ b/libs/libaom/src/av1/common/thread_common.c @@ -0,0 +1,930 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" +#include "config/aom_scale_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/entropymode.h" +#include "av1/common/thread_common.h" +#include "av1/common/reconinter.h" + +// Set up nsync by width. +static INLINE int get_sync_range(int width) { + // nsync numbers are picked by testing. For example, for 4k + // video, using 4 gives best performance. + if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +} + +static INLINE int get_lr_sync_range(int width) { +#if 0 + // nsync numbers are picked by testing. For example, for 4k + // video, using 4 gives best performance. 
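+  // This width-based table mirrors get_sync_range() above but is compiled
+  // out; the live branch below returns 1, so loop-restoration workers
+  // appear to synchronize after every unit of the row above.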
+ if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +#else + (void)width; + return 1; +#endif +} + +// Allocate memory for lf row synchronization +static void loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows, + int width, int num_workers) { + lf_sync->rows = rows; +#if CONFIG_MULTITHREAD + { + int i, j; + + for (j = 0; j < MAX_MB_PLANE; j++) { + CHECK_MEM_ERROR(cm, lf_sync->mutex_[j], + aom_malloc(sizeof(*(lf_sync->mutex_[j])) * rows)); + if (lf_sync->mutex_[j]) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->mutex_[j][i], NULL); + } + } + + CHECK_MEM_ERROR(cm, lf_sync->cond_[j], + aom_malloc(sizeof(*(lf_sync->cond_[j])) * rows)); + if (lf_sync->cond_[j]) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->cond_[j][i], NULL); + } + } + } + + CHECK_MEM_ERROR(cm, lf_sync->job_mutex, + aom_malloc(sizeof(*(lf_sync->job_mutex)))); + if (lf_sync->job_mutex) { + pthread_mutex_init(lf_sync->job_mutex, NULL); + } + } +#endif // CONFIG_MULTITHREAD + CHECK_MEM_ERROR(cm, lf_sync->lfdata, + aom_malloc(num_workers * sizeof(*(lf_sync->lfdata)))); + lf_sync->num_workers = num_workers; + + for (int j = 0; j < MAX_MB_PLANE; j++) { + CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col[j], + aom_malloc(sizeof(*(lf_sync->cur_sb_col[j])) * rows)); + } + CHECK_MEM_ERROR( + cm, lf_sync->job_queue, + aom_malloc(sizeof(*(lf_sync->job_queue)) * rows * MAX_MB_PLANE * 2)); + // Set up nsync. + lf_sync->sync_range = get_sync_range(width); +} + +// Deallocate lf synchronization related mutex and data +void av1_loop_filter_dealloc(AV1LfSync *lf_sync) { + if (lf_sync != NULL) { + int j; +#if CONFIG_MULTITHREAD + int i; + for (j = 0; j < MAX_MB_PLANE; j++) { + if (lf_sync->mutex_[j] != NULL) { + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->mutex_[j][i]); + } + aom_free(lf_sync->mutex_[j]); + } + if (lf_sync->cond_[j] != NULL) { + for (i = 0; i < lf_sync->rows; ++i) { + pthread_cond_destroy(&lf_sync->cond_[j][i]); + } + aom_free(lf_sync->cond_[j]); + } + } + if (lf_sync->job_mutex != NULL) { + pthread_mutex_destroy(lf_sync->job_mutex); + aom_free(lf_sync->job_mutex); + } +#endif // CONFIG_MULTITHREAD + aom_free(lf_sync->lfdata); + for (j = 0; j < MAX_MB_PLANE; j++) { + aom_free(lf_sync->cur_sb_col[j]); + } + + aom_free(lf_sync->job_queue); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. 
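+    // Zeroing also drops any stale pointers, so a later dealloc cannot
+    // double-free buffers that were already released here.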
+ av1_zero(*lf_sync); + } +} + +static void loop_filter_data_reset(LFWorkerData *lf_data, + YV12_BUFFER_CONFIG *frame_buffer, + struct AV1Common *cm, MACROBLOCKD *xd) { + struct macroblockd_plane *pd = xd->plane; + lf_data->frame_buffer = frame_buffer; + lf_data->cm = cm; + lf_data->xd = xd; + for (int i = 0; i < MAX_MB_PLANE; i++) { + memcpy(&lf_data->planes[i].dst, &pd[i].dst, sizeof(lf_data->planes[i].dst)); + lf_data->planes[i].subsampling_x = pd[i].subsampling_x; + lf_data->planes[i].subsampling_y = pd[i].subsampling_y; + } +} + +static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c, + int plane) { +#if CONFIG_MULTITHREAD + const int nsync = lf_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &lf_sync->mutex_[plane][r - 1]; + pthread_mutex_lock(mutex); + + while (c > lf_sync->cur_sb_col[plane][r - 1] - nsync) { + pthread_cond_wait(&lf_sync->cond_[plane][r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)lf_sync; + (void)r; + (void)c; + (void)plane; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c, + const int sb_cols, int plane) { +#if CONFIG_MULTITHREAD + const int nsync = lf_sync->sync_range; + int cur; + // Only signal when there are enough filtered SB for next row to run. + int sig = 1; + + if (c < sb_cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = sb_cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&lf_sync->mutex_[plane][r]); + + lf_sync->cur_sb_col[plane][r] = cur; + + pthread_cond_broadcast(&lf_sync->cond_[plane][r]); + pthread_mutex_unlock(&lf_sync->mutex_[plane][r]); + } +#else + (void)lf_sync; + (void)r; + (void)c; + (void)sb_cols; + (void)plane; +#endif // CONFIG_MULTITHREAD +} + +static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start, + int stop, +#if CONFIG_LPF_MASK + int is_decoding, +#endif + int plane_start, int plane_end) { + int mi_row, plane, dir; + AV1LfMTInfo *lf_job_queue = lf_sync->job_queue; + lf_sync->jobs_enqueued = 0; + lf_sync->jobs_dequeued = 0; + + for (dir = 0; dir < 2; dir++) { + for (plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) + break; + else if (plane == 1 && !(cm->lf.filter_level_u)) + continue; + else if (plane == 2 && !(cm->lf.filter_level_v)) + continue; +#if CONFIG_LPF_MASK + int step = MAX_MIB_SIZE; + if (is_decoding) { + step = MI_SIZE_64X64; + } + for (mi_row = start; mi_row < stop; mi_row += step) +#else + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) +#endif + { + lf_job_queue->mi_row = mi_row; + lf_job_queue->plane = plane; + lf_job_queue->dir = dir; + lf_job_queue++; + lf_sync->jobs_enqueued++; + } + } + } +} + +static AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) { + AV1LfMTInfo *cur_job_info = NULL; + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(lf_sync->job_mutex); + + if (lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) { + cur_job_info = lf_sync->job_queue + lf_sync->jobs_dequeued; + lf_sync->jobs_dequeued++; + } + + pthread_mutex_unlock(lf_sync->job_mutex); +#else + (void)lf_sync; +#endif + + return cur_job_info; +} + +// Implement row loopfiltering for each thread. 
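+// Each worker repeatedly dequeues a {mi_row, plane, dir} job. Vertical-edge
+// jobs (dir == 0) publish per-superblock-column progress via sync_write();
+// horizontal-edge jobs (dir == 1) block in sync_read() until the vertical
+// pass on this row and the row above has advanced past the needed column,
+// preserving the top and top-right filtering dependencies.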
+static INLINE void thread_loop_filter_rows( + const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm, + struct macroblockd_plane *planes, MACROBLOCKD *xd, + AV1LfSync *const lf_sync) { + const int sb_cols = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2) >> + MAX_MIB_SIZE_LOG2; + int mi_row, mi_col, plane, dir; + int r, c; + + while (1) { + AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync); + + if (cur_job_info != NULL) { + mi_row = cur_job_info->mi_row; + plane = cur_job_info->plane; + dir = cur_job_info->dir; + r = mi_row >> MAX_MIB_SIZE_LOG2; + + if (dir == 0) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += MAX_MIB_SIZE) { + c = mi_col >> MAX_MIB_SIZE_LOG2; + + av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer, + mi_row, mi_col, plane, plane + 1); + + av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row, + mi_col); + sync_write(lf_sync, r, c, sb_cols, plane); + } + } else if (dir == 1) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += MAX_MIB_SIZE) { + c = mi_col >> MAX_MIB_SIZE_LOG2; + + // Wait for vertical edge filtering of the top-right block to be + // completed + sync_read(lf_sync, r, c, plane); + + // Wait for vertical edge filtering of the right block to be + // completed + sync_read(lf_sync, r + 1, c, plane); + + av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer, + mi_row, mi_col, plane, plane + 1); + av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row, + mi_col); + } + } + } else { + break; + } + } +} + +// Row-based multi-threaded loopfilter hook +static int loop_filter_row_worker(void *arg1, void *arg2) { + AV1LfSync *const lf_sync = (AV1LfSync *)arg1; + LFWorkerData *const lf_data = (LFWorkerData *)arg2; + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->xd, lf_sync); + return 1; +} + +#if CONFIG_LPF_MASK +static INLINE void thread_loop_filter_bitmask_rows( + const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm, + struct macroblockd_plane *planes, MACROBLOCKD *xd, + AV1LfSync *const lf_sync) { + const int sb_cols = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MIN_MIB_SIZE_LOG2) >> + MIN_MIB_SIZE_LOG2; + int mi_row, mi_col, plane, dir; + int r, c; + (void)xd; + + while (1) { + AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync); + + if (cur_job_info != NULL) { + mi_row = cur_job_info->mi_row; + plane = cur_job_info->plane; + dir = cur_job_info->dir; + r = mi_row >> MIN_MIB_SIZE_LOG2; + + if (dir == 0) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += MI_SIZE_64X64) { + c = mi_col >> MIN_MIB_SIZE_LOG2; + + av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row, + mi_col, plane, plane + 1); + + av1_filter_block_plane_bitmask_vert(cm, &planes[plane], plane, mi_row, + mi_col); + sync_write(lf_sync, r, c, sb_cols, plane); + } + } else if (dir == 1) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += MI_SIZE_64X64) { + c = mi_col >> MIN_MIB_SIZE_LOG2; + + // Wait for vertical edge filtering of the top-right block to be + // completed + sync_read(lf_sync, r, c, plane); + + // Wait for vertical edge filtering of the right block to be + // completed + sync_read(lf_sync, r + 1, c, plane); + + av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row, + mi_col, plane, plane + 1); + av1_filter_block_plane_bitmask_horz(cm, &planes[plane], plane, mi_row, + mi_col); + } + } + } else { + break; + } + } +} + +// Row-based multi-threaded loopfilter hook +static int 
loop_filter_bitmask_row_worker(void *arg1, void *arg2) { + AV1LfSync *const lf_sync = (AV1LfSync *)arg1; + LFWorkerData *const lf_data = (LFWorkerData *)arg2; + thread_loop_filter_bitmask_rows(lf_data->frame_buffer, lf_data->cm, + lf_data->planes, lf_data->xd, lf_sync); + return 1; +} +#endif // CONFIG_LPF_MASK + +static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, + MACROBLOCKD *xd, int start, int stop, + int plane_start, int plane_end, +#if CONFIG_LPF_MASK + int is_decoding, +#endif + AVxWorker *workers, int nworkers, + AV1LfSync *lf_sync) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); +#if CONFIG_LPF_MASK + int sb_rows; + if (is_decoding) { + sb_rows = ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MIN_MIB_SIZE_LOG2) >> + MIN_MIB_SIZE_LOG2; + } else { + sb_rows = ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2) >> + MAX_MIB_SIZE_LOG2; + } +#else + // Number of superblock rows and cols + const int sb_rows = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2) >> + MAX_MIB_SIZE_LOG2; +#endif + const int num_workers = nworkers; + int i; + + if (!lf_sync->sync_range || sb_rows != lf_sync->rows || + num_workers > lf_sync->num_workers) { + av1_loop_filter_dealloc(lf_sync); + loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); + } + + // Initialize cur_sb_col to -1 for all SB rows. + for (i = 0; i < MAX_MB_PLANE; i++) { + memset(lf_sync->cur_sb_col[i], -1, + sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows); + } + + enqueue_lf_jobs(lf_sync, cm, start, stop, +#if CONFIG_LPF_MASK + is_decoding, +#endif + plane_start, plane_end); + + // Set up loopfilter thread data. + for (i = 0; i < num_workers; ++i) { + AVxWorker *const worker = &workers[i]; + LFWorkerData *const lf_data = &lf_sync->lfdata[i]; + +#if CONFIG_LPF_MASK + if (is_decoding) { + worker->hook = loop_filter_bitmask_row_worker; + } else { + worker->hook = loop_filter_row_worker; + } +#else + worker->hook = loop_filter_row_worker; +#endif + worker->data1 = lf_sync; + worker->data2 = lf_data; + + // Loopfilter data + loop_filter_data_reset(lf_data, frame, cm, xd); + + // Start loopfiltering + if (i == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + // Wait till all rows are finished + for (i = 0; i < num_workers; ++i) { + winterface->sync(&workers[i]); + } +} + +void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, + MACROBLOCKD *xd, int plane_start, int plane_end, + int partial_frame, +#if CONFIG_LPF_MASK + int is_decoding, +#endif + AVxWorker *workers, int num_workers, + AV1LfSync *lf_sync) { + int start_mi_row, end_mi_row, mi_rows_to_filter; + + start_mi_row = 0; + mi_rows_to_filter = cm->mi_params.mi_rows; + if (partial_frame && cm->mi_params.mi_rows > 8) { + start_mi_row = cm->mi_params.mi_rows >> 1; + start_mi_row &= 0xfffffff8; + mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + av1_loop_filter_frame_init(cm, plane_start, plane_end); + +#if CONFIG_LPF_MASK + if (is_decoding) { + cm->is_decoding = is_decoding; + // TODO(chengchen): currently use one thread to build bitmasks for the + // frame. Make it support multi-thread later. 
+ for (int plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) + break; + else if (plane == 1 && !(cm->lf.filter_level_u)) + continue; + else if (plane == 2 && !(cm->lf.filter_level_v)) + continue; + + // TODO(chengchen): can we remove this? + struct macroblockd_plane *pd = xd->plane; + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame, 0, 0, plane, + plane + 1); + + av1_build_bitmask_vert_info(cm, &pd[plane], plane); + av1_build_bitmask_horz_info(cm, &pd[plane], plane); + } + loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, + plane_end, 1, workers, num_workers, lf_sync); + } else { + loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, + plane_end, 0, workers, num_workers, lf_sync); + } +#else + loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, + plane_end, workers, num_workers, lf_sync); +#endif +} + +static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) { +#if CONFIG_MULTITHREAD + AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync; + const int nsync = loop_res_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &loop_res_sync->mutex_[plane][r - 1]; + pthread_mutex_lock(mutex); + + while (c > loop_res_sync->cur_sb_col[plane][r - 1] - nsync) { + pthread_cond_wait(&loop_res_sync->cond_[plane][r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)lr_sync; + (void)r; + (void)c; + (void)plane; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void lr_sync_write(void *const lr_sync, int r, int c, + const int sb_cols, int plane) { +#if CONFIG_MULTITHREAD + AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync; + const int nsync = loop_res_sync->sync_range; + int cur; + // Only signal when there are enough filtered SB for next row to run. 
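+  // When the last column of a row completes, cur is pushed past sb_cols (to
+  // sb_cols + nsync) so that every reader still waiting on this row is
+  // released regardless of its column.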
+ int sig = 1; + + if (c < sb_cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = sb_cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&loop_res_sync->mutex_[plane][r]); + + loop_res_sync->cur_sb_col[plane][r] = cur; + + pthread_cond_broadcast(&loop_res_sync->cond_[plane][r]); + pthread_mutex_unlock(&loop_res_sync->mutex_[plane][r]); + } +#else + (void)lr_sync; + (void)r; + (void)c; + (void)sb_cols; + (void)plane; +#endif // CONFIG_MULTITHREAD +} + +// Allocate memory for loop restoration row synchronization +static void loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm, + int num_workers, int num_rows_lr, + int num_planes, int width) { + lr_sync->rows = num_rows_lr; + lr_sync->num_planes = num_planes; +#if CONFIG_MULTITHREAD + { + int i, j; + + for (j = 0; j < num_planes; j++) { + CHECK_MEM_ERROR(cm, lr_sync->mutex_[j], + aom_malloc(sizeof(*(lr_sync->mutex_[j])) * num_rows_lr)); + if (lr_sync->mutex_[j]) { + for (i = 0; i < num_rows_lr; ++i) { + pthread_mutex_init(&lr_sync->mutex_[j][i], NULL); + } + } + + CHECK_MEM_ERROR(cm, lr_sync->cond_[j], + aom_malloc(sizeof(*(lr_sync->cond_[j])) * num_rows_lr)); + if (lr_sync->cond_[j]) { + for (i = 0; i < num_rows_lr; ++i) { + pthread_cond_init(&lr_sync->cond_[j][i], NULL); + } + } + } + + CHECK_MEM_ERROR(cm, lr_sync->job_mutex, + aom_malloc(sizeof(*(lr_sync->job_mutex)))); + if (lr_sync->job_mutex) { + pthread_mutex_init(lr_sync->job_mutex, NULL); + } + } +#endif // CONFIG_MULTITHREAD + CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata, + aom_malloc(num_workers * sizeof(*(lr_sync->lrworkerdata)))); + + for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) { + if (worker_idx < num_workers - 1) { + CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rst_tmpbuf, + (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE)); + CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rlbs, + aom_malloc(sizeof(RestorationLineBuffers))); + + } else { + lr_sync->lrworkerdata[worker_idx].rst_tmpbuf = cm->rst_tmpbuf; + lr_sync->lrworkerdata[worker_idx].rlbs = cm->rlbs; + } + } + + lr_sync->num_workers = num_workers; + + for (int j = 0; j < num_planes; j++) { + CHECK_MEM_ERROR( + cm, lr_sync->cur_sb_col[j], + aom_malloc(sizeof(*(lr_sync->cur_sb_col[j])) * num_rows_lr)); + } + CHECK_MEM_ERROR( + cm, lr_sync->job_queue, + aom_malloc(sizeof(*(lr_sync->job_queue)) * num_rows_lr * num_planes)); + // Set up nsync. 
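+  // (With get_lr_sync_range() as written, sync_range is currently always 1
+  // for loop restoration.)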
+ lr_sync->sync_range = get_lr_sync_range(width); +} + +// Deallocate loop restoration synchronization related mutex and data +void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers) { + if (lr_sync != NULL) { + int j; +#if CONFIG_MULTITHREAD + int i; + for (j = 0; j < MAX_MB_PLANE; j++) { + if (lr_sync->mutex_[j] != NULL) { + for (i = 0; i < lr_sync->rows; ++i) { + pthread_mutex_destroy(&lr_sync->mutex_[j][i]); + } + aom_free(lr_sync->mutex_[j]); + } + if (lr_sync->cond_[j] != NULL) { + for (i = 0; i < lr_sync->rows; ++i) { + pthread_cond_destroy(&lr_sync->cond_[j][i]); + } + aom_free(lr_sync->cond_[j]); + } + } + if (lr_sync->job_mutex != NULL) { + pthread_mutex_destroy(lr_sync->job_mutex); + aom_free(lr_sync->job_mutex); + } +#endif // CONFIG_MULTITHREAD + for (j = 0; j < MAX_MB_PLANE; j++) { + aom_free(lr_sync->cur_sb_col[j]); + } + + aom_free(lr_sync->job_queue); + + if (lr_sync->lrworkerdata) { + for (int worker_idx = 0; worker_idx < num_workers - 1; worker_idx++) { + LRWorkerData *const workerdata_data = + lr_sync->lrworkerdata + worker_idx; + + aom_free(workerdata_data->rst_tmpbuf); + aom_free(workerdata_data->rlbs); + } + aom_free(lr_sync->lrworkerdata); + } + + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. + av1_zero(*lr_sync); + } +} + +static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt, + AV1_COMMON *cm) { + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + + const int num_planes = av1_num_planes(cm); + AV1LrMTInfo *lr_job_queue = lr_sync->job_queue; + int32_t lr_job_counter[2], num_even_lr_jobs = 0; + lr_sync->jobs_enqueued = 0; + lr_sync->jobs_dequeued = 0; + + for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + num_even_lr_jobs = + num_even_lr_jobs + ((ctxt[plane].rsi->vert_units_per_tile + 1) >> 1); + } + lr_job_counter[0] = 0; + lr_job_counter[1] = num_even_lr_jobs; + + for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + const int is_uv = plane > 0; + const int ss_y = is_uv && cm->seq_params.subsampling_y; + + AV1PixelRect tile_rect = ctxt[plane].tile_rect; + const int unit_size = ctxt[plane].rsi->restoration_unit_size; + + const int tile_h = tile_rect.bottom - tile_rect.top; + const int ext_size = unit_size * 3 / 2; + + int y0 = 0, i = 0; + while (y0 < tile_h) { + int remaining_h = tile_h - y0; + int h = (remaining_h < ext_size) ? 
remaining_h : unit_size; + + RestorationTileLimits limits; + limits.v_start = tile_rect.top + y0; + limits.v_end = tile_rect.top + y0 + h; + assert(limits.v_end <= tile_rect.bottom); + // Offset the tile upwards to align with the restoration processing stripe + const int voffset = RESTORATION_UNIT_OFFSET >> ss_y; + limits.v_start = AOMMAX(tile_rect.top, limits.v_start - voffset); + if (limits.v_end < tile_rect.bottom) limits.v_end -= voffset; + + assert(lr_job_counter[0] <= num_even_lr_jobs); + + lr_job_queue[lr_job_counter[i & 1]].lr_unit_row = i; + lr_job_queue[lr_job_counter[i & 1]].plane = plane; + lr_job_queue[lr_job_counter[i & 1]].v_start = limits.v_start; + lr_job_queue[lr_job_counter[i & 1]].v_end = limits.v_end; + lr_job_queue[lr_job_counter[i & 1]].sync_mode = i & 1; + if ((i & 1) == 0) { + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = + limits.v_start + RESTORATION_BORDER; + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = + limits.v_end - RESTORATION_BORDER; + if (i == 0) { + assert(limits.v_start == tile_rect.top); + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = tile_rect.top; + } + if (i == (ctxt[plane].rsi->vert_units_per_tile - 1)) { + assert(limits.v_end == tile_rect.bottom); + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = tile_rect.bottom; + } + } else { + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = + AOMMAX(limits.v_start - RESTORATION_BORDER, tile_rect.top); + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = + AOMMIN(limits.v_end + RESTORATION_BORDER, tile_rect.bottom); + } + lr_job_counter[i & 1]++; + lr_sync->jobs_enqueued++; + + y0 += h; + ++i; + } + } +} + +static AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) { + AV1LrMTInfo *cur_job_info = NULL; + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(lr_sync->job_mutex); + + if (lr_sync->jobs_dequeued < lr_sync->jobs_enqueued) { + cur_job_info = lr_sync->job_queue + lr_sync->jobs_dequeued; + lr_sync->jobs_dequeued++; + } + + pthread_mutex_unlock(lr_sync->job_mutex); +#else + (void)lr_sync; +#endif + + return cur_job_info; +} + +// Implement row loop restoration for each thread. +static int loop_restoration_row_worker(void *arg1, void *arg2) { + AV1LrSync *const lr_sync = (AV1LrSync *)arg1; + LRWorkerData *lrworkerdata = (LRWorkerData *)arg2; + AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt; + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + int lr_unit_row; + int plane; + const int tile_row = LR_TILE_ROW; + const int tile_col = LR_TILE_COL; + const int tile_cols = LR_TILE_COLS; + const int tile_idx = tile_col + tile_row * tile_cols; + typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, + int vstart, int vend); + static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y, + aom_yv12_partial_coloc_copy_u, + aom_yv12_partial_coloc_copy_v }; + + while (1) { + AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync); + if (cur_job_info != NULL) { + RestorationTileLimits limits; + sync_read_fn_t on_sync_read; + sync_write_fn_t on_sync_write; + limits.v_start = cur_job_info->v_start; + limits.v_end = cur_job_info->v_end; + lr_unit_row = cur_job_info->lr_unit_row; + plane = cur_job_info->plane; + const int unit_idx0 = tile_idx * ctxt[plane].rsi->units_per_tile; + + // sync_mode == 1 implies only sync read is required in LR Multi-threading + // sync_mode == 0 implies only sync write is required. + on_sync_read = + cur_job_info->sync_mode == 1 ? lr_sync_read : av1_lr_sync_read_dummy; + on_sync_write = cur_job_info->sync_mode == 0 ? 
lr_sync_write + : av1_lr_sync_write_dummy; + + av1_foreach_rest_unit_in_row( + &limits, &(ctxt[plane].tile_rect), lr_ctxt->on_rest_unit, lr_unit_row, + ctxt[plane].rsi->restoration_unit_size, unit_idx0, + ctxt[plane].rsi->horz_units_per_tile, + ctxt[plane].rsi->vert_units_per_tile, plane, &ctxt[plane], + lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read, + on_sync_write, lr_sync); + + copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, ctxt[plane].tile_rect.left, + ctxt[plane].tile_rect.right, cur_job_info->v_copy_start, + cur_job_info->v_copy_end); + } else { + break; + } + } + return 1; +} + +static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt, + AVxWorker *workers, int nworkers, + AV1LrSync *lr_sync, AV1_COMMON *cm) { + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + + const int num_planes = av1_num_planes(cm); + + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int num_rows_lr = 0; + + for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + + const AV1PixelRect tile_rect = ctxt[plane].tile_rect; + const int max_tile_h = tile_rect.bottom - tile_rect.top; + + const int unit_size = cm->rst_info[plane].restoration_unit_size; + + num_rows_lr = + AOMMAX(num_rows_lr, av1_lr_count_units_in_tile(unit_size, max_tile_h)); + } + + const int num_workers = nworkers; + int i; + assert(MAX_MB_PLANE == 3); + + if (!lr_sync->sync_range || num_rows_lr != lr_sync->rows || + num_workers > lr_sync->num_workers || num_planes != lr_sync->num_planes) { + av1_loop_restoration_dealloc(lr_sync, num_workers); + loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr, num_planes, + cm->width); + } + + // Initialize cur_sb_col to -1 for all SB rows. + for (i = 0; i < num_planes; i++) { + memset(lr_sync->cur_sb_col[i], -1, + sizeof(*(lr_sync->cur_sb_col[i])) * num_rows_lr); + } + + enqueue_lr_jobs(lr_sync, lr_ctxt, cm); + + // Set up looprestoration thread data. + for (i = 0; i < num_workers; ++i) { + AVxWorker *const worker = &workers[i]; + lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt; + worker->hook = loop_restoration_row_worker; + worker->data1 = lr_sync; + worker->data2 = &lr_sync->lrworkerdata[i]; + + // Start loopfiltering + if (i == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + // Wait till all rows are finished + for (i = 0; i < num_workers; ++i) { + winterface->sync(&workers[i]); + } +} + +void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + AV1_COMMON *cm, int optimized_lr, + AVxWorker *workers, int num_workers, + AV1LrSync *lr_sync, void *lr_ctxt) { + assert(!cm->features.all_lossless); + + const int num_planes = av1_num_planes(cm); + + AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt; + + av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm, + optimized_lr, num_planes); + + foreach_rest_unit_in_planes_mt(loop_rest_ctxt, workers, num_workers, lr_sync, + cm); +} diff --git a/libs/libaom/src/av1/common/thread_common.h b/libs/libaom/src/av1/common/thread_common.h new file mode 100644 index 000000000..7397f1c54 --- /dev/null +++ b/libs/libaom/src/av1/common/thread_common.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_THREAD_COMMON_H_ +#define AOM_AV1_COMMON_THREAD_COMMON_H_ + +#include "config/aom_config.h" + +#include "av1/common/av1_loopfilter.h" +#include "aom_util/aom_thread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Common; + +typedef struct AV1LfMTInfo { + int mi_row; + int plane; + int dir; +} AV1LfMTInfo; + +// Loopfilter row synchronization +typedef struct AV1LfSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_[MAX_MB_PLANE]; + pthread_cond_t *cond_[MAX_MB_PLANE]; +#endif + // Allocate memory to store the loop-filtered superblock index in each row. + int *cur_sb_col[MAX_MB_PLANE]; + // The optimal sync_range for different resolution and platform should be + // determined by testing. Currently, it is chosen to be a power-of-2 number. + int sync_range; + int rows; + + // Row-based parallel loopfilter data + LFWorkerData *lfdata; + int num_workers; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex; +#endif + AV1LfMTInfo *job_queue; + int jobs_enqueued; + int jobs_dequeued; +} AV1LfSync; + +typedef struct AV1LrMTInfo { + int v_start; + int v_end; + int lr_unit_row; + int plane; + int sync_mode; + int v_copy_start; + int v_copy_end; +} AV1LrMTInfo; + +typedef struct LoopRestorationWorkerData { + int32_t *rst_tmpbuf; + void *rlbs; + void *lr_ctxt; +} LRWorkerData; + +// Looprestoration row synchronization +typedef struct AV1LrSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_[MAX_MB_PLANE]; + pthread_cond_t *cond_[MAX_MB_PLANE]; +#endif + // Allocate memory to store the loop-restoration block index in each row. + int *cur_sb_col[MAX_MB_PLANE]; + // The optimal sync_range for different resolution and platform should be + // determined by testing. Currently, it is chosen to be a power-of-2 number. + int sync_range; + int rows; + int num_planes; + + int num_workers; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex; +#endif + // Row-based parallel loopfilter data + LRWorkerData *lrworkerdata; + + AV1LrMTInfo *job_queue; + int jobs_enqueued; + int jobs_dequeued; +} AV1LrSync; + +// Deallocate loopfilter synchronization related mutex and data. +void av1_loop_filter_dealloc(AV1LfSync *lf_sync); + +void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, + struct macroblockd *xd, int plane_start, + int plane_end, int partial_frame, +#if CONFIG_LPF_MASK + int is_decoding, +#endif + AVxWorker *workers, int num_workers, + AV1LfSync *lf_sync); +void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, + int optimized_lr, AVxWorker *workers, + int num_workers, AV1LrSync *lr_sync, + void *lr_ctxt); +void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_THREAD_COMMON_H_ diff --git a/libs/libaom/src/av1/common/tile_common.c b/libs/libaom/src/av1/common/tile_common.c new file mode 100644 index 000000000..1b11bd760 --- /dev/null +++ b/libs/libaom/src/av1/common/tile_common.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/av1_common_int.h" +#include "av1/common/resize.h" +#include "av1/common/tile_common.h" +#include "aom_dsp/aom_dsp_common.h" + +void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) { + av1_tile_set_row(tile, cm, row); + av1_tile_set_col(tile, cm, col); +} + +// Find smallest k>=0 such that (blk_size << k) >= target +static int tile_log2(int blk_size, int target) { + int k; + for (k = 0; (blk_size << k) < target; k++) { + } + return k; +} + +void av1_get_tile_limits(AV1_COMMON *const cm) { + const SequenceHeader *const seq_params = &cm->seq_params; + CommonTileParams *const tiles = &cm->tiles; + const int mi_cols = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2); + const int mi_rows = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2); + const int sb_cols = mi_cols >> seq_params->mib_size_log2; + const int sb_rows = mi_rows >> seq_params->mib_size_log2; + + const int sb_size_log2 = seq_params->mib_size_log2 + MI_SIZE_LOG2; + tiles->max_width_sb = MAX_TILE_WIDTH >> sb_size_log2; + const int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2); + + tiles->min_log2_cols = tile_log2(tiles->max_width_sb, sb_cols); + tiles->max_log2_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS)); + tiles->max_log2_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS)); + tiles->min_log2 = tile_log2(max_tile_area_sb, sb_cols * sb_rows); + tiles->min_log2 = AOMMAX(tiles->min_log2, tiles->min_log2_cols); +} + +void av1_calculate_tile_cols(const SequenceHeader *const seq_params, + int cm_mi_rows, int cm_mi_cols, + CommonTileParams *const tiles) { + int mi_cols = ALIGN_POWER_OF_TWO(cm_mi_cols, seq_params->mib_size_log2); + int mi_rows = ALIGN_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2); + int sb_cols = mi_cols >> seq_params->mib_size_log2; + int sb_rows = mi_rows >> seq_params->mib_size_log2; + int i; + + // This will be overridden if there is at least two columns of tiles + // (otherwise there is no inner tile width) + tiles->min_inner_width = -1; + + if (tiles->uniform_spacing) { + int start_sb; + int size_sb = ALIGN_POWER_OF_TWO(sb_cols, tiles->log2_cols); + size_sb >>= tiles->log2_cols; + assert(size_sb > 0); + for (i = 0, start_sb = 0; start_sb < sb_cols; i++) { + tiles->col_start_sb[i] = start_sb; + start_sb += size_sb; + } + tiles->cols = i; + tiles->col_start_sb[i] = sb_cols; + tiles->min_log2_rows = AOMMAX(tiles->min_log2 - tiles->log2_cols, 0); + tiles->max_height_sb = sb_rows >> tiles->min_log2_rows; + + tiles->width = size_sb << seq_params->mib_size_log2; + tiles->width = AOMMIN(tiles->width, cm_mi_cols); + if (tiles->cols > 1) { + tiles->min_inner_width = tiles->width; + } + } else { + int max_tile_area_sb = (sb_rows * sb_cols); + int widest_tile_sb = 1; + int narrowest_inner_tile_sb = 65536; + tiles->log2_cols = tile_log2(1, tiles->cols); + for (i = 0; i < tiles->cols; i++) { + int size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; + widest_tile_sb = AOMMAX(widest_tile_sb, size_sb); + // ignore the rightmost tile in 
frame for determining the narrowest + if (i < tiles->cols - 1) + narrowest_inner_tile_sb = AOMMIN(narrowest_inner_tile_sb, size_sb); + } + if (tiles->min_log2) { + max_tile_area_sb >>= (tiles->min_log2 + 1); + } + tiles->max_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1); + if (tiles->cols > 1) { + tiles->min_inner_width = narrowest_inner_tile_sb + << seq_params->mib_size_log2; + } + } +} + +void av1_calculate_tile_rows(const SequenceHeader *const seq_params, + int cm_mi_rows, CommonTileParams *const tiles) { + int mi_rows = ALIGN_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2); + int sb_rows = mi_rows >> seq_params->mib_size_log2; + int start_sb, size_sb, i; + + if (tiles->uniform_spacing) { + size_sb = ALIGN_POWER_OF_TWO(sb_rows, tiles->log2_rows); + size_sb >>= tiles->log2_rows; + assert(size_sb > 0); + for (i = 0, start_sb = 0; start_sb < sb_rows; i++) { + tiles->row_start_sb[i] = start_sb; + start_sb += size_sb; + } + tiles->rows = i; + tiles->row_start_sb[i] = sb_rows; + + tiles->height = size_sb << seq_params->mib_size_log2; + tiles->height = AOMMIN(tiles->height, cm_mi_rows); + } else { + tiles->log2_rows = tile_log2(1, tiles->rows); + } +} + +void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) { + assert(row < cm->tiles.rows); + int mi_row_start = cm->tiles.row_start_sb[row] + << cm->seq_params.mib_size_log2; + int mi_row_end = cm->tiles.row_start_sb[row + 1] + << cm->seq_params.mib_size_log2; + tile->tile_row = row; + tile->mi_row_start = mi_row_start; + tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_params.mi_rows); + assert(tile->mi_row_end > tile->mi_row_start); +} + +void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) { + assert(col < cm->tiles.cols); + int mi_col_start = cm->tiles.col_start_sb[col] + << cm->seq_params.mib_size_log2; + int mi_col_end = cm->tiles.col_start_sb[col + 1] + << cm->seq_params.mib_size_log2; + tile->tile_col = col; + tile->mi_col_start = mi_col_start; + tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_params.mi_cols); + assert(tile->mi_col_end > tile->mi_col_start); +} + +int av1_get_sb_rows_in_tile(AV1_COMMON *cm, TileInfo tile) { + int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO( + tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2); + int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2; + + return sb_rows; +} + +int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) { + int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO( + tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2); + int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2; + + return sb_cols; +} + +AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm, + int is_uv) { + AV1PixelRect r; + + // Calculate position in the Y plane + r.left = tile_info->mi_col_start * MI_SIZE; + r.right = tile_info->mi_col_end * MI_SIZE; + r.top = tile_info->mi_row_start * MI_SIZE; + r.bottom = tile_info->mi_row_end * MI_SIZE; + + // If upscaling is enabled, the tile limits need scaling to match the + // upscaled frame where the restoration units live. To do this, scale up the + // top-left and bottom-right of the tile. 
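+  //
+  // For example (illustrative note, not in the upstream source; it assumes
+  // SCALE_NUMERATOR is 8, as defined in av1/common/resize.h): with
+  // superres_scale_denominator = 16 the coded frame is half the output
+  // width, so a tile spanning x = [0, 640) in coded coordinates maps to
+  // roughly x = [0, 1280) after this adjustment.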
+ if (av1_superres_scaled(cm)) { + av1_calculate_unscaled_superres_size(&r.left, &r.top, + cm->superres_scale_denominator); + av1_calculate_unscaled_superres_size(&r.right, &r.bottom, + cm->superres_scale_denominator); + } + + const int frame_w = cm->superres_upscaled_width; + const int frame_h = cm->superres_upscaled_height; + + // Make sure we don't fall off the bottom-right of the frame. + r.right = AOMMIN(r.right, frame_w); + r.bottom = AOMMIN(r.bottom, frame_h); + + // Convert to coordinates in the appropriate plane + const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int ss_y = is_uv && cm->seq_params.subsampling_y; + + r.left = ROUND_POWER_OF_TWO(r.left, ss_x); + r.right = ROUND_POWER_OF_TWO(r.right, ss_x); + r.top = ROUND_POWER_OF_TWO(r.top, ss_y); + r.bottom = ROUND_POWER_OF_TWO(r.bottom, ss_y); + + return r; +} + +void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) { + const CommonTileParams *const tiles = &cm->tiles; + if (tiles->uniform_spacing) { + *w = tiles->width; + *h = tiles->height; + } else { + for (int i = 0; i < tiles->cols; ++i) { + const int tile_width_sb = + tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; + const int tile_w = tile_width_sb * cm->seq_params.mib_size; + assert(i == 0 || tile_w == *w); // ensure all tiles have same dimension + *w = tile_w; + } + + for (int i = 0; i < tiles->rows; ++i) { + const int tile_height_sb = + tiles->row_start_sb[i + 1] - tiles->row_start_sb[i]; + const int tile_h = tile_height_sb * cm->seq_params.mib_size; + assert(i == 0 || tile_h == *h); // ensure all tiles have same dimension + *h = tile_h; + } + } +} + +int av1_is_min_tile_width_satisfied(const AV1_COMMON *cm) { + // Disable check if there is a single tile col in the frame + if (cm->tiles.cols == 1) return 1; + + return ((cm->tiles.min_inner_width << MI_SIZE_LOG2) >= + (64 << av1_superres_scaled(cm))); +} diff --git a/libs/libaom/src/av1/common/tile_common.h b/libs/libaom/src/av1/common/tile_common.h new file mode 100644 index 000000000..ca7c5f496 --- /dev/null +++ b/libs/libaom/src/av1/common/tile_common.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_COMMON_TILE_COMMON_H_
+#define AOM_AV1_COMMON_TILE_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+struct AV1Common;
+struct SequenceHeader;
+struct CommonTileParams;
+
+#define DEFAULT_MAX_NUM_TG 1
+
+typedef struct TileInfo {
+  int mi_row_start, mi_row_end;
+  int mi_col_start, mi_col_end;
+  int tile_row;
+  int tile_col;
+} TileInfo;
+
+// initializes 'tile->mi_(row|col)_(start|end)' for (row, col) based on
+// 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)'
+void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row,
+                   int col);
+
+void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row);
+void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col);
+
+int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile);
+int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile);
+
+typedef struct {
+  int left, top, right, bottom;
+} AV1PixelRect;
+
+// Return the pixel extents of the given tile
+AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info,
+                               const struct AV1Common *cm, int is_uv);
+
+// Define tile maximum width and area
+// There is no maximum height since height is limited by area and width limits
+// The minimum tile width or height is fixed at one superblock
+#define MAX_TILE_WIDTH (4096)        // Max Tile width in pixels
+#define MAX_TILE_AREA (4096 * 2304)  // Maximum tile area in pixels
+
+void av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h);
+void av1_get_tile_limits(struct AV1Common *const cm);
+void av1_calculate_tile_cols(const struct SequenceHeader *const seq_params,
+                             int cm_mi_rows, int cm_mi_cols,
+                             struct CommonTileParams *const tiles);
+void av1_calculate_tile_rows(const struct SequenceHeader *const seq_params,
+                             int cm_mi_rows,
+                             struct CommonTileParams *const tiles);
+
+// Checks if the minimum tile_width requirement is satisfied
+int av1_is_min_tile_width_satisfied(const struct AV1Common *cm);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_TILE_COMMON_H_
diff --git a/libs/libaom/src/av1/common/timing.c b/libs/libaom/src/av1/common/timing.c
new file mode 100644
index 000000000..a959cdf76
--- /dev/null
+++ b/libs/libaom/src/av1/common/timing.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/timing.h"
+
+/* Tables for AV1 max bitrates for different levels of main and high tier.
+ * The tables are given in kbps here, whereas the specification expresses
+ * them in Mbps. Note that depending on the profile, a multiplier is needed.
+ */
+#define UNDEFINED_RATE \
+  (1 << 21)  // Placeholder rate for levels with undefined rate
+#define INVALID_RATE \
+  (0)  // For invalid profile-level configuration, set rate to 0
+
+/* Max Bitrates for levels of Main Tier in kbps. Bitrate in main_kbps[31] */
+/* is a dummy value. The decoder model is not applicable for level 31.
*/ +static int32_t main_kbps[1 << LEVEL_BITS] = { + 1500, 3000, UNDEFINED_RATE, UNDEFINED_RATE, + 6000, 10000, UNDEFINED_RATE, UNDEFINED_RATE, + 12000, 20000, UNDEFINED_RATE, UNDEFINED_RATE, + 30000, 40000, 60000, 60000, + 60000, 100000, 160000, 160000, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE +}; + +/* Max Bitrates for levels of High Tier in kbps. Bitrate in high_kbps [31] */ +/* is a dummy value. The decoder model is not applicable for level 31. */ +static int32_t high_kbps[1 << LEVEL_BITS] = { + INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, + INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, + 30000, 50000, UNDEFINED_RATE, UNDEFINED_RATE, + 100000, 160000, 240000, 240000, + 240000, 480000, 800000, 800000, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE +}; + +/* BitrateProfileFactor */ +static int bitrate_profile_factor[1 << PROFILE_BITS] = { + 1, 2, 3, 0, 0, 0, 0, 0 +}; + +int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, + int seq_tier) { + int64_t bitrate; + + if (seq_tier) { + bitrate = high_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile]; + } else { + bitrate = main_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile]; + } + + return bitrate * 1000; +} + +void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) { + decoder_model->encoder_decoder_buffer_delay_length = 16; + decoder_model->buffer_removal_time_length = 10; + decoder_model->frame_presentation_time_length = 10; +} + +void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) { + op_params->decoder_model_param_present_flag = 1; + op_params->decoder_buffer_delay = 90000 >> 1; // 0.5 s + op_params->encoder_buffer_delay = 90000 >> 1; // 0.5 s + op_params->low_delay_mode_flag = 0; + op_params->display_model_param_present_flag = 1; + op_params->initial_display_delay = 8; // 8 frames delay +} + +void av1_set_resource_availability_parameters( + aom_dec_model_op_parameters_t *op_params) { + op_params->decoder_model_param_present_flag = 0; + op_params->decoder_buffer_delay = + 70000; // Resource availability mode default + op_params->encoder_buffer_delay = + 20000; // Resource availability mode default + op_params->low_delay_mode_flag = 0; // Resource availability mode default + op_params->display_model_param_present_flag = 1; + op_params->initial_display_delay = 8; // 8 frames delay +} diff --git a/libs/libaom/src/av1/common/timing.h b/libs/libaom/src/av1/common/timing.h new file mode 100644 index 000000000..9192124f7 --- /dev/null +++ b/libs/libaom/src/av1/common/timing.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_TIMING_H_ +#define AOM_AV1_COMMON_TIMING_H_ + +#include "aom/aom_integer.h" +#include "av1/common/enums.h" + +#define MAX_NUM_OP_POINTS 32 + +typedef struct aom_timing { + uint32_t num_units_in_display_tick; + uint32_t time_scale; + int equal_picture_interval; + uint32_t num_ticks_per_picture; +} aom_timing_info_t; + +typedef struct aom_dec_model_info { + uint32_t num_units_in_decoding_tick; + int encoder_decoder_buffer_delay_length; + int buffer_removal_time_length; + int frame_presentation_time_length; +} aom_dec_model_info_t; + +typedef struct aom_dec_model_op_parameters { + int decoder_model_param_present_flag; + int64_t bitrate; + int64_t buffer_size; + uint32_t decoder_buffer_delay; + uint32_t encoder_buffer_delay; + int low_delay_mode_flag; + int display_model_param_present_flag; + int initial_display_delay; +} aom_dec_model_op_parameters_t; + +void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model); + +void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params); + +void av1_set_resource_availability_parameters( + aom_dec_model_op_parameters_t *op_params); + +int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, + int seq_tier); + +#endif // AOM_AV1_COMMON_TIMING_H_ diff --git a/libs/libaom/src/av1/common/token_cdfs.h b/libs/libaom/src/av1/common/token_cdfs.h new file mode 100644 index 000000000..f1edda58d --- /dev/null +++ b/libs/libaom/src/av1/common/token_cdfs.h @@ -0,0 +1,3555 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_TOKEN_CDFS_H_ +#define AOM_AV1_COMMON_TOKEN_CDFS_H_ + +#include "config/aom_config.h" + +#include "av1/common/entropy.h" + +static const aom_cdf_prob + av1_default_dc_sign_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][DC_SIGN_CONTEXTS] + [CDF_SIZE(2)] = { + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + }; + +static const aom_cdf_prob + av1_default_txb_skip_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS] + [CDF_SIZE(2)] = { { { { AOM_CDF2(31849) }, + { AOM_CDF2(5892) }, + { AOM_CDF2(12112) }, + { AOM_CDF2(21935) }, + { AOM_CDF2(20289) }, + { AOM_CDF2(27473) }, + { AOM_CDF2(32487) }, + { AOM_CDF2(7654) }, + { AOM_CDF2(19473) }, + { AOM_CDF2(29984) }, + { AOM_CDF2(9961) }, + { AOM_CDF2(30242) }, + { AOM_CDF2(32117) } }, + { { AOM_CDF2(31548) }, + { AOM_CDF2(1549) }, + { AOM_CDF2(10130) }, + { AOM_CDF2(16656) }, + { AOM_CDF2(18591) }, + { AOM_CDF2(26308) }, + { AOM_CDF2(32537) }, + { AOM_CDF2(5403) }, + { AOM_CDF2(18096) }, + { AOM_CDF2(30003) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(29957) }, + { AOM_CDF2(5391) }, + { AOM_CDF2(18039) }, + { AOM_CDF2(23566) }, + { AOM_CDF2(22431) }, + { AOM_CDF2(25822) }, + { AOM_CDF2(32197) }, + { AOM_CDF2(3778) }, + { AOM_CDF2(15336) }, + { AOM_CDF2(28981) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(17920) }, + { AOM_CDF2(1818) }, + { AOM_CDF2(7282) }, + { AOM_CDF2(25273) }, + { AOM_CDF2(10923) }, + { AOM_CDF2(31554) }, + { AOM_CDF2(32624) }, + { AOM_CDF2(1366) }, + { AOM_CDF2(15628) }, + { AOM_CDF2(30462) }, + { AOM_CDF2(146) }, + { AOM_CDF2(5132) }, + { AOM_CDF2(31657) } }, + { { AOM_CDF2(6308) }, + { AOM_CDF2(117) }, + { AOM_CDF2(1638) }, + { AOM_CDF2(2161) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(10923) }, + { AOM_CDF2(30247) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } }, + { { { AOM_CDF2(30371) }, + { AOM_CDF2(7570) }, + { AOM_CDF2(13155) }, + { AOM_CDF2(20751) }, + { AOM_CDF2(20969) }, + { AOM_CDF2(27067) }, + { AOM_CDF2(32013) }, + { AOM_CDF2(5495) }, + { AOM_CDF2(17942) }, + { AOM_CDF2(28280) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31782) }, + { AOM_CDF2(1836) }, + { AOM_CDF2(10689) }, + { AOM_CDF2(17604) }, + { AOM_CDF2(21622) }, + { AOM_CDF2(27518) }, + { AOM_CDF2(32399) }, + { AOM_CDF2(4419) }, + { AOM_CDF2(16294) }, + { AOM_CDF2(28345) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31901) }, + { AOM_CDF2(10311) }, + { AOM_CDF2(18047) }, + { AOM_CDF2(24806) }, + { AOM_CDF2(23288) }, + { AOM_CDF2(27914) }, + { AOM_CDF2(32296) }, + { AOM_CDF2(4215) }, + { AOM_CDF2(15756) }, + { AOM_CDF2(28341) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { 
AOM_CDF2(26726) }, + { AOM_CDF2(1045) }, + { AOM_CDF2(11703) }, + { AOM_CDF2(20590) }, + { AOM_CDF2(18554) }, + { AOM_CDF2(25970) }, + { AOM_CDF2(31938) }, + { AOM_CDF2(5583) }, + { AOM_CDF2(21313) }, + { AOM_CDF2(29390) }, + { AOM_CDF2(641) }, + { AOM_CDF2(22265) }, + { AOM_CDF2(31452) } }, + { { AOM_CDF2(26584) }, + { AOM_CDF2(188) }, + { AOM_CDF2(8847) }, + { AOM_CDF2(24519) }, + { AOM_CDF2(22938) }, + { AOM_CDF2(30583) }, + { AOM_CDF2(32608) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } }, + { { { AOM_CDF2(29614) }, + { AOM_CDF2(9068) }, + { AOM_CDF2(12924) }, + { AOM_CDF2(19538) }, + { AOM_CDF2(17737) }, + { AOM_CDF2(24619) }, + { AOM_CDF2(30642) }, + { AOM_CDF2(4119) }, + { AOM_CDF2(16026) }, + { AOM_CDF2(25657) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31957) }, + { AOM_CDF2(3230) }, + { AOM_CDF2(11153) }, + { AOM_CDF2(18123) }, + { AOM_CDF2(20143) }, + { AOM_CDF2(26536) }, + { AOM_CDF2(31986) }, + { AOM_CDF2(3050) }, + { AOM_CDF2(14603) }, + { AOM_CDF2(25155) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(32363) }, + { AOM_CDF2(10692) }, + { AOM_CDF2(19090) }, + { AOM_CDF2(24357) }, + { AOM_CDF2(24442) }, + { AOM_CDF2(28312) }, + { AOM_CDF2(32169) }, + { AOM_CDF2(3648) }, + { AOM_CDF2(15690) }, + { AOM_CDF2(26815) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(30669) }, + { AOM_CDF2(3832) }, + { AOM_CDF2(11663) }, + { AOM_CDF2(18889) }, + { AOM_CDF2(19782) }, + { AOM_CDF2(23313) }, + { AOM_CDF2(31330) }, + { AOM_CDF2(5124) }, + { AOM_CDF2(18719) }, + { AOM_CDF2(28468) }, + { AOM_CDF2(3082) }, + { AOM_CDF2(20982) }, + { AOM_CDF2(29443) } }, + { { AOM_CDF2(28573) }, + { AOM_CDF2(3183) }, + { AOM_CDF2(17802) }, + { AOM_CDF2(25977) }, + { AOM_CDF2(26677) }, + { AOM_CDF2(27832) }, + { AOM_CDF2(32387) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } }, + { { { AOM_CDF2(26887) }, + { AOM_CDF2(6729) }, + { AOM_CDF2(10361) }, + { AOM_CDF2(17442) }, + { AOM_CDF2(15045) }, + { AOM_CDF2(22478) }, + { AOM_CDF2(29072) }, + { AOM_CDF2(2713) }, + { AOM_CDF2(11861) }, + { AOM_CDF2(20773) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31903) }, + { AOM_CDF2(2044) }, + { AOM_CDF2(7528) }, + { AOM_CDF2(14618) }, + { AOM_CDF2(16182) }, + { AOM_CDF2(24168) }, + { AOM_CDF2(31037) }, + { AOM_CDF2(2786) }, + { AOM_CDF2(11194) }, + { AOM_CDF2(20155) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(32510) }, + { AOM_CDF2(8430) }, + { AOM_CDF2(17318) }, + { AOM_CDF2(24154) }, + { AOM_CDF2(23674) }, + { AOM_CDF2(28789) }, + { AOM_CDF2(32139) }, + { AOM_CDF2(3440) }, + { AOM_CDF2(13117) }, + { AOM_CDF2(22702) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31671) }, + { AOM_CDF2(2056) }, + { AOM_CDF2(11746) }, + { AOM_CDF2(16852) }, + { AOM_CDF2(18635) }, + { AOM_CDF2(24715) }, + { AOM_CDF2(31484) }, + { AOM_CDF2(4656) }, + { AOM_CDF2(16074) }, + { AOM_CDF2(24704) }, + { AOM_CDF2(1806) }, + { AOM_CDF2(14645) }, + { AOM_CDF2(25336) } }, + { { AOM_CDF2(31539) }, + { AOM_CDF2(8433) }, + { AOM_CDF2(20576) }, + { AOM_CDF2(27904) }, + { AOM_CDF2(27852) }, + { AOM_CDF2(30026) }, + { AOM_CDF2(32441) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { 
AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } } }; + +static const aom_cdf_prob + av1_default_eob_extra_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [EOB_COEF_CONTEXTS][CDF_SIZE(2)] = { + { { { + { AOM_CDF2(16961) }, + { AOM_CDF2(17223) }, + { AOM_CDF2(7621) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(19069) }, + { AOM_CDF2(22525) }, + { AOM_CDF2(13377) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(20401) }, + { AOM_CDF2(17025) }, + { AOM_CDF2(12845) }, + { AOM_CDF2(12873) }, + { AOM_CDF2(14094) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20681) }, + { AOM_CDF2(20701) }, + { AOM_CDF2(15250) }, + { AOM_CDF2(15017) }, + { AOM_CDF2(14928) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(23905) }, + { AOM_CDF2(17194) }, + { AOM_CDF2(16170) }, + { AOM_CDF2(17695) }, + { AOM_CDF2(13826) }, + { AOM_CDF2(15810) }, + { AOM_CDF2(12036) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(23959) }, + { AOM_CDF2(20799) }, + { AOM_CDF2(19021) }, + { AOM_CDF2(16203) }, + { AOM_CDF2(17886) }, + { AOM_CDF2(14144) }, + { AOM_CDF2(12010) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(27399) }, + { AOM_CDF2(16327) }, + { AOM_CDF2(18071) }, + { AOM_CDF2(19584) }, + { AOM_CDF2(20721) }, + { AOM_CDF2(18432) }, + { AOM_CDF2(19560) }, + { AOM_CDF2(10150) }, + { AOM_CDF2(8805) }, + }, + { + { AOM_CDF2(24932) }, + { AOM_CDF2(20833) }, + { AOM_CDF2(12027) }, + { AOM_CDF2(16670) }, + { AOM_CDF2(19914) }, + { AOM_CDF2(15106) }, + { AOM_CDF2(17662) }, + { AOM_CDF2(13783) }, + { AOM_CDF2(28756) }, + } }, + { { + { AOM_CDF2(23406) }, + { AOM_CDF2(21845) }, + { AOM_CDF2(18432) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(17096) }, + { AOM_CDF2(12561) }, + { AOM_CDF2(17320) }, + { AOM_CDF2(22395) }, + { AOM_CDF2(21370) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } }, + { { { + { AOM_CDF2(17471) }, + { AOM_CDF2(20223) }, + { AOM_CDF2(11357) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20335) }, + { AOM_CDF2(21667) }, + { AOM_CDF2(14818) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(20430) }, + { AOM_CDF2(20662) }, + { AOM_CDF2(15367) }, + { AOM_CDF2(16970) }, + { AOM_CDF2(14657) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(22117) }, + { AOM_CDF2(22028) }, + { AOM_CDF2(18650) }, + { AOM_CDF2(16042) }, + { AOM_CDF2(15885) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(22409) }, + { AOM_CDF2(21012) }, + { AOM_CDF2(15650) }, + { AOM_CDF2(17395) }, + { AOM_CDF2(15469) }, + { AOM_CDF2(20205) }, + { AOM_CDF2(19511) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(24220) }, + { AOM_CDF2(22480) }, + { AOM_CDF2(17737) 
}, + { AOM_CDF2(18916) }, + { AOM_CDF2(19268) }, + { AOM_CDF2(18412) }, + { AOM_CDF2(18844) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(25991) }, + { AOM_CDF2(20314) }, + { AOM_CDF2(17731) }, + { AOM_CDF2(19678) }, + { AOM_CDF2(18649) }, + { AOM_CDF2(17307) }, + { AOM_CDF2(21798) }, + { AOM_CDF2(17549) }, + { AOM_CDF2(15630) }, + }, + { + { AOM_CDF2(26585) }, + { AOM_CDF2(21469) }, + { AOM_CDF2(20432) }, + { AOM_CDF2(17735) }, + { AOM_CDF2(19280) }, + { AOM_CDF2(15235) }, + { AOM_CDF2(20297) }, + { AOM_CDF2(22471) }, + { AOM_CDF2(28997) }, + } }, + { { + { AOM_CDF2(26605) }, + { AOM_CDF2(11304) }, + { AOM_CDF2(16726) }, + { AOM_CDF2(16560) }, + { AOM_CDF2(20866) }, + { AOM_CDF2(23524) }, + { AOM_CDF2(19878) }, + { AOM_CDF2(13469) }, + { AOM_CDF2(23084) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } }, + { { { + { AOM_CDF2(18983) }, + { AOM_CDF2(20512) }, + { AOM_CDF2(14885) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20090) }, + { AOM_CDF2(19444) }, + { AOM_CDF2(17286) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(19139) }, + { AOM_CDF2(21487) }, + { AOM_CDF2(18959) }, + { AOM_CDF2(20910) }, + { AOM_CDF2(19089) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20536) }, + { AOM_CDF2(20664) }, + { AOM_CDF2(20625) }, + { AOM_CDF2(19123) }, + { AOM_CDF2(14862) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(19833) }, + { AOM_CDF2(21502) }, + { AOM_CDF2(17485) }, + { AOM_CDF2(20267) }, + { AOM_CDF2(18353) }, + { AOM_CDF2(23329) }, + { AOM_CDF2(21478) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(22041) }, + { AOM_CDF2(23434) }, + { AOM_CDF2(20001) }, + { AOM_CDF2(20554) }, + { AOM_CDF2(20951) }, + { AOM_CDF2(20145) }, + { AOM_CDF2(15562) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(23312) }, + { AOM_CDF2(21607) }, + { AOM_CDF2(16526) }, + { AOM_CDF2(18957) }, + { AOM_CDF2(18034) }, + { AOM_CDF2(18934) }, + { AOM_CDF2(24247) }, + { AOM_CDF2(16921) }, + { AOM_CDF2(17080) }, + }, + { + { AOM_CDF2(26579) }, + { AOM_CDF2(24910) }, + { AOM_CDF2(18637) }, + { AOM_CDF2(19800) }, + { AOM_CDF2(20388) }, + { AOM_CDF2(9887) }, + { AOM_CDF2(15642) }, + { AOM_CDF2(30198) }, + { AOM_CDF2(24721) }, + } }, + { { + { AOM_CDF2(26998) }, + { AOM_CDF2(16737) }, + { AOM_CDF2(17838) }, + { AOM_CDF2(18922) }, + { AOM_CDF2(19515) }, + { AOM_CDF2(18636) }, + { AOM_CDF2(17333) }, + { AOM_CDF2(15776) }, + { AOM_CDF2(22658) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } }, + { { { + { AOM_CDF2(20177) }, + { AOM_CDF2(20789) }, + { AOM_CDF2(20262) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(21416) }, + { AOM_CDF2(20855) }, + { AOM_CDF2(23410) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { 
AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(20238) }, + { AOM_CDF2(21057) }, + { AOM_CDF2(19159) }, + { AOM_CDF2(22337) }, + { AOM_CDF2(20159) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20125) }, + { AOM_CDF2(20559) }, + { AOM_CDF2(21707) }, + { AOM_CDF2(22296) }, + { AOM_CDF2(17333) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(19941) }, + { AOM_CDF2(20527) }, + { AOM_CDF2(21470) }, + { AOM_CDF2(22487) }, + { AOM_CDF2(19558) }, + { AOM_CDF2(22354) }, + { AOM_CDF2(20331) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(22752) }, + { AOM_CDF2(25006) }, + { AOM_CDF2(22075) }, + { AOM_CDF2(21576) }, + { AOM_CDF2(17740) }, + { AOM_CDF2(21690) }, + { AOM_CDF2(19211) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(21442) }, + { AOM_CDF2(22358) }, + { AOM_CDF2(18503) }, + { AOM_CDF2(20291) }, + { AOM_CDF2(19945) }, + { AOM_CDF2(21294) }, + { AOM_CDF2(21178) }, + { AOM_CDF2(19400) }, + { AOM_CDF2(10556) }, + }, + { + { AOM_CDF2(24648) }, + { AOM_CDF2(24949) }, + { AOM_CDF2(20708) }, + { AOM_CDF2(23905) }, + { AOM_CDF2(20501) }, + { AOM_CDF2(9558) }, + { AOM_CDF2(9423) }, + { AOM_CDF2(30365) }, + { AOM_CDF2(19253) }, + } }, + { { + { AOM_CDF2(26064) }, + { AOM_CDF2(22098) }, + { AOM_CDF2(19613) }, + { AOM_CDF2(20525) }, + { AOM_CDF2(17595) }, + { AOM_CDF2(16618) }, + { AOM_CDF2(20497) }, + { AOM_CDF2(18989) }, + { AOM_CDF2(15513) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } } + }; + +static const aom_cdf_prob + av1_default_eob_multi16_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 5)] = { { { { AOM_CDF5(840, 1039, 1980, 4895) }, + { AOM_CDF5(370, 671, 1883, 4471) } }, + { { AOM_CDF5(3247, 4950, 9688, 14563) }, + { AOM_CDF5(1904, 3354, 7763, 14647) } } }, + { { { AOM_CDF5(2125, 2551, 5165, 8946) }, + { AOM_CDF5(513, 765, 1859, 6339) } }, + { { AOM_CDF5(7637, 9498, 14259, 19108) }, + { AOM_CDF5(2497, 4096, 8866, 16993) } } }, + { { { AOM_CDF5(4016, 4897, 8881, 14968) }, + { AOM_CDF5(716, 1105, 2646, 10056) } }, + { { AOM_CDF5(11139, 13270, 18241, 23566) }, + { AOM_CDF5(3192, 5032, 10297, 19755) } } }, + { { { AOM_CDF5(6708, 8958, 14746, 22133) }, + { AOM_CDF5(1222, 2074, 4783, 15410) } }, + { { AOM_CDF5(19575, 21766, 26044, 29709) }, + { AOM_CDF5(7297, 10767, 19273, 28194) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi32_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 6)] = { { { { AOM_CDF6(400, 520, 977, 2102, 6542) }, + { AOM_CDF6(210, 405, 1315, 3326, 7537) } }, + { { AOM_CDF6(2636, 4273, 7588, 11794, 20401) }, + { AOM_CDF6(1786, 3179, 6902, 11357, 19054) } } }, + { { { AOM_CDF6(989, 1249, 2019, 4151, 10785) }, + { AOM_CDF6(313, 441, 1099, 2917, 8562) } }, + { { AOM_CDF6(8394, 10352, 13932, 18855, 26014) }, + { AOM_CDF6(2578, 4124, 8181, 13670, 24234) } } }, + { { { AOM_CDF6(2515, 3003, 4452, 8162, 16041) }, + { AOM_CDF6(574, 821, 1836, 5089, 13128) } }, + { { AOM_CDF6(13468, 16303, 20361, 25105, 29281) }, + { AOM_CDF6(3542, 5502, 10415, 16760, 25644) } } }, + { { { AOM_CDF6(4617, 5709, 8446, 13584, 23135) }, + { AOM_CDF6(1156, 1702, 3675, 9274, 20539) } }, + { { AOM_CDF6(22086, 24282, 27010, 29770, 31743) }, + { AOM_CDF6(7699, 10897, 
20891, 26926, 31628) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi64_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 7)] = { { { { AOM_CDF7(329, 498, 1101, 1784, 3265, 7758) }, + { AOM_CDF7(335, 730, 1459, 5494, 8755, 12997) } }, + { { AOM_CDF7(3505, 5304, 10086, 13814, 17684, 23370) }, + { AOM_CDF7(1563, 2700, 4876, 10911, 14706, 22480) } } }, + { { { AOM_CDF7(1260, 1446, 2253, 3712, 6652, 13369) }, + { AOM_CDF7(401, 605, 1029, 2563, 5845, 12626) } }, + { { AOM_CDF7(8609, 10612, 14624, 18714, 22614, 29024) }, + { AOM_CDF7(1923, 3127, 5867, 9703, 14277, 27100) } } }, + { { { AOM_CDF7(2374, 2772, 4583, 7276, 12288, 19706) }, + { AOM_CDF7(497, 810, 1315, 3000, 7004, 15641) } }, + { { AOM_CDF7(15050, 17126, 21410, 24886, 28156, 30726) }, + { AOM_CDF7(4034, 6290, 10235, 14982, 21214, 28491) } } }, + { { { AOM_CDF7(6307, 7541, 12060, 16358, 22553, 27865) }, + { AOM_CDF7(1289, 2320, 3971, 7926, 14153, 24291) } }, + { { AOM_CDF7(24212, 25708, 28268, 30035, 31307, 32049) }, + { AOM_CDF7(8726, 12378, 19409, 26450, 30038, 32462) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi128_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 8)] = { + { { { AOM_CDF8(219, 482, 1140, 2091, 3680, 6028, 12586) }, + { AOM_CDF8(371, 699, 1254, 4830, 9479, 12562, 17497) } }, + { { AOM_CDF8(5245, 7456, 12880, 15852, 20033, 23932, 27608) }, + { AOM_CDF8(2054, 3472, 5869, 14232, 18242, 20590, 26752) } } }, + { { { AOM_CDF8(685, 933, 1488, 2714, 4766, 8562, 19254) }, + { AOM_CDF8(217, 352, 618, 2303, 5261, 9969, 17472) } }, + { { AOM_CDF8(8045, 11200, 15497, 19595, 23948, 27408, 30938) }, + { AOM_CDF8(2310, 4160, 7471, 14997, 17931, 20768, 30240) } } }, + { { { AOM_CDF8(1366, 1738, 2527, 5016, 9355, 15797, 24643) }, + { AOM_CDF8(354, 558, 944, 2760, 7287, 14037, 21779) } }, + { { AOM_CDF8(13627, 16246, 20173, 24429, 27948, 30415, 31863) }, + { AOM_CDF8(6275, 9889, 14769, 23164, 27988, 30493, 32272) } } }, + { { { AOM_CDF8(3472, 4885, 7489, 12481, 18517, 24536, 29635) }, + { AOM_CDF8(886, 1731, 3271, 8469, 15569, 22126, 28383) } }, + { { AOM_CDF8(24313, 26062, 28385, 30107, 31217, 31898, 32345) }, + { AOM_CDF8(9165, 13282, 21150, 30286, 31894, 32571, 32712) } } } + }; + +static const aom_cdf_prob + av1_default_eob_multi256_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 9)] = { + { { { AOM_CDF9(310, 584, 1887, 3589, 6168, 8611, 11352, 15652) }, + { AOM_CDF9(998, 1850, 2998, 5604, 17341, 19888, 22899, 25583) } }, + { { AOM_CDF9(2520, 3240, 5952, 8870, 12577, 17558, 19954, 24168) }, + { AOM_CDF9(2203, 4130, 7435, 10739, 20652, 23681, 25609, 27261) } } }, + { { { AOM_CDF9(1448, 2109, 4151, 6263, 9329, 13260, 17944, 23300) }, + { AOM_CDF9(399, 1019, 1749, 3038, 10444, 15546, 22739, 27294) } }, + { { AOM_CDF9(6402, 8148, 12623, 15072, 18728, 22847, 26447, 29377) }, + { AOM_CDF9(1674, 3252, 5734, 10159, 22397, 23802, 24821, 30940) } } }, + { { { AOM_CDF9(3089, 3920, 6038, 9460, 14266, 19881, 25766, 29176) }, + { AOM_CDF9(1084, 2358, 3488, 5122, 11483, 18103, 26023, 29799) } }, + { { AOM_CDF9(11514, 13794, 17480, 20754, 24361, 27378, 29492, 31277) }, + { AOM_CDF9(6571, 9610, 15516, 21826, 29092, 30829, 31842, + 32708) } } }, + { { { AOM_CDF9(5348, 7113, 11820, 15924, 22106, 26777, 30334, 31757) }, + { AOM_CDF9(2453, 4474, 6307, 8777, 16474, 22975, 29000, 31547) } }, + { { AOM_CDF9(23110, 24597, 27140, 28894, 30167, 30927, 31392, 32094) }, + { AOM_CDF9(9998, 17661, 25178, 28097, 31308, 32038, 32403, + 32695) } } } + }; + +static const aom_cdf_prob + 
av1_default_eob_multi512_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 10)] = { { { { AOM_CDF10(641, 983, 3707, 5430, 10234, 14958, 18788, + 23412, 26061) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(5095, 6446, 9996, 13354, 16017, 17986, 20919, + 26129, 29140) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } }, + { { { AOM_CDF10(1230, 2278, 5035, 7776, 11871, 15346, 19590, + 24584, 28749) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(7265, 9979, 15819, 19250, 21780, 23846, 26478, + 28396, 31811) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } }, + { { { AOM_CDF10(2624, 3936, 6480, 9686, 13979, 17726, 23267, + 28410, 31078) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(12015, 14769, 19588, 22052, 24222, 25812, + 27300, 29219, 32114) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } }, + { { { AOM_CDF10(5927, 7809, 10923, 14597, 19439, 24135, 28456, + 31142, 32060) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(21093, 23043, 25742, 27658, 29097, 29716, + 30073, 30820, 31956) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi1024_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 11)] = { { { { AOM_CDF11(393, 421, 751, 1623, 3160, 6352, 13345, 18047, + 22571, 25830) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(1865, 1988, 2930, 4242, 10533, 16538, 21354, + 27255, 28546, 31784) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } }, + { { { AOM_CDF11(696, 948, 3145, 5702, 9706, 13217, 17851, + 21856, 25692, 28034) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(2672, 3591, 9330, 17084, 22725, 24284, 26527, + 28027, 28377, 30876) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } }, + { { { AOM_CDF11(2784, 3831, 7041, 10521, 14847, 18844, 23155, + 26682, 29229, 31045) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(9577, 12466, 17739, 20750, 22061, 23215, 24601, + 25483, 25843, 32056) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } }, + { { { AOM_CDF11(6698, 8334, 11961, 15762, 20186, 23862, 27434, + 29326, 31082, 32050) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(20569, 22426, 25569, 26859, 28053, 28913, + 29486, 29724, 29807, 32570) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } } }; + +static const aom_cdf_prob av1_default_coeff_lps_multi_cdfs + [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS] + [CDF_SIZE(BR_CDF_SIZE)] = { + { { { { AOM_CDF4(14298, 20718, 24174) }, + { AOM_CDF4(12536, 19601, 23789) }, + { AOM_CDF4(8712, 15051, 19503) }, + { AOM_CDF4(6170, 11327, 15434) }, + { AOM_CDF4(4742, 8926, 12538) }, + { AOM_CDF4(3803, 7317, 10546) }, + { AOM_CDF4(1696, 3317, 4871) }, + { AOM_CDF4(14392, 19951, 22756) }, + { AOM_CDF4(15978, 23218, 26818) }, + { AOM_CDF4(12187, 19474, 23889) }, + { AOM_CDF4(9176, 15640, 20259) }, + { AOM_CDF4(7068, 12655, 17028) }, + { AOM_CDF4(5656, 10442, 
14472) }, + { AOM_CDF4(2580, 4992, 7244) }, + { AOM_CDF4(12136, 18049, 21426) }, + { AOM_CDF4(13784, 20721, 24481) }, + { AOM_CDF4(10836, 17621, 21900) }, + { AOM_CDF4(8372, 14444, 18847) }, + { AOM_CDF4(6523, 11779, 16000) }, + { AOM_CDF4(5337, 9898, 13760) }, + { AOM_CDF4(3034, 5860, 8462) } }, + { { AOM_CDF4(15967, 22905, 26286) }, + { AOM_CDF4(13534, 20654, 24579) }, + { AOM_CDF4(9504, 16092, 20535) }, + { AOM_CDF4(6975, 12568, 16903) }, + { AOM_CDF4(5364, 10091, 14020) }, + { AOM_CDF4(4357, 8370, 11857) }, + { AOM_CDF4(2506, 4934, 7218) }, + { AOM_CDF4(23032, 28815, 30936) }, + { AOM_CDF4(19540, 26704, 29719) }, + { AOM_CDF4(15158, 22969, 27097) }, + { AOM_CDF4(11408, 18865, 23650) }, + { AOM_CDF4(8885, 15448, 20250) }, + { AOM_CDF4(7108, 12853, 17416) }, + { AOM_CDF4(4231, 8041, 11480) }, + { AOM_CDF4(19823, 26490, 29156) }, + { AOM_CDF4(18890, 25929, 28932) }, + { AOM_CDF4(15660, 23491, 27433) }, + { AOM_CDF4(12147, 19776, 24488) }, + { AOM_CDF4(9728, 16774, 21649) }, + { AOM_CDF4(7919, 14277, 19066) }, + { AOM_CDF4(5440, 10170, 14185) } } }, + { { { AOM_CDF4(14406, 20862, 24414) }, + { AOM_CDF4(11824, 18907, 23109) }, + { AOM_CDF4(8257, 14393, 18803) }, + { AOM_CDF4(5860, 10747, 14778) }, + { AOM_CDF4(4475, 8486, 11984) }, + { AOM_CDF4(3606, 6954, 10043) }, + { AOM_CDF4(1736, 3410, 5048) }, + { AOM_CDF4(14430, 20046, 22882) }, + { AOM_CDF4(15593, 22899, 26709) }, + { AOM_CDF4(12102, 19368, 23811) }, + { AOM_CDF4(9059, 15584, 20262) }, + { AOM_CDF4(6999, 12603, 17048) }, + { AOM_CDF4(5684, 10497, 14553) }, + { AOM_CDF4(2822, 5438, 7862) }, + { AOM_CDF4(15785, 21585, 24359) }, + { AOM_CDF4(18347, 25229, 28266) }, + { AOM_CDF4(14974, 22487, 26389) }, + { AOM_CDF4(11423, 18681, 23271) }, + { AOM_CDF4(8863, 15350, 20008) }, + { AOM_CDF4(7153, 12852, 17278) }, + { AOM_CDF4(3707, 7036, 9982) } }, + { { AOM_CDF4(15460, 21696, 25469) }, + { AOM_CDF4(12170, 19249, 23191) }, + { AOM_CDF4(8723, 15027, 19332) }, + { AOM_CDF4(6428, 11704, 15874) }, + { AOM_CDF4(4922, 9292, 13052) }, + { AOM_CDF4(4139, 7695, 11010) }, + { AOM_CDF4(2291, 4508, 6598) }, + { AOM_CDF4(19856, 26920, 29828) }, + { AOM_CDF4(17923, 25289, 28792) }, + { AOM_CDF4(14278, 21968, 26297) }, + { AOM_CDF4(10910, 18136, 22950) }, + { AOM_CDF4(8423, 14815, 19627) }, + { AOM_CDF4(6771, 12283, 16774) }, + { AOM_CDF4(4074, 7750, 11081) }, + { AOM_CDF4(19852, 26074, 28672) }, + { AOM_CDF4(19371, 26110, 28989) }, + { AOM_CDF4(16265, 23873, 27663) }, + { AOM_CDF4(12758, 20378, 24952) }, + { AOM_CDF4(10095, 17098, 21961) }, + { AOM_CDF4(8250, 14628, 19451) }, + { AOM_CDF4(5205, 9745, 13622) } } }, + { { { AOM_CDF4(10563, 16233, 19763) }, + { AOM_CDF4(9794, 16022, 19804) }, + { AOM_CDF4(6750, 11945, 15759) }, + { AOM_CDF4(4963, 9186, 12752) }, + { AOM_CDF4(3845, 7435, 10627) }, + { AOM_CDF4(3051, 6085, 8834) }, + { AOM_CDF4(1311, 2596, 3830) }, + { AOM_CDF4(11246, 16404, 19689) }, + { AOM_CDF4(12315, 18911, 22731) }, + { AOM_CDF4(10557, 17095, 21289) }, + { AOM_CDF4(8136, 14006, 18249) }, + { AOM_CDF4(6348, 11474, 15565) }, + { AOM_CDF4(5196, 9655, 13400) }, + { AOM_CDF4(2349, 4526, 6587) }, + { AOM_CDF4(13337, 18730, 21569) }, + { AOM_CDF4(19306, 26071, 28882) }, + { AOM_CDF4(15952, 23540, 27254) }, + { AOM_CDF4(12409, 19934, 24430) }, + { AOM_CDF4(9760, 16706, 21389) }, + { AOM_CDF4(8004, 14220, 18818) }, + { AOM_CDF4(4138, 7794, 10961) } }, + { { AOM_CDF4(10870, 16684, 20949) }, + { AOM_CDF4(9664, 15230, 18680) }, + { AOM_CDF4(6886, 12109, 15408) }, + { AOM_CDF4(4825, 8900, 12305) }, + { AOM_CDF4(3630, 7162, 10314) }, + { 
AOM_CDF4(3036, 6429, 9387) }, + { AOM_CDF4(1671, 3296, 4940) }, + { AOM_CDF4(13819, 19159, 23026) }, + { AOM_CDF4(11984, 19108, 23120) }, + { AOM_CDF4(10690, 17210, 21663) }, + { AOM_CDF4(7984, 14154, 18333) }, + { AOM_CDF4(6868, 12294, 16124) }, + { AOM_CDF4(5274, 8994, 12868) }, + { AOM_CDF4(2988, 5771, 8424) }, + { AOM_CDF4(19736, 26647, 29141) }, + { AOM_CDF4(18933, 26070, 28984) }, + { AOM_CDF4(15779, 23048, 27200) }, + { AOM_CDF4(12638, 20061, 24532) }, + { AOM_CDF4(10692, 17545, 22220) }, + { AOM_CDF4(9217, 15251, 20054) }, + { AOM_CDF4(5078, 9284, 12594) } } }, + { { { AOM_CDF4(2331, 3662, 5244) }, + { AOM_CDF4(2891, 4771, 6145) }, + { AOM_CDF4(4598, 7623, 9729) }, + { AOM_CDF4(3520, 6845, 9199) }, + { AOM_CDF4(3417, 6119, 9324) }, + { AOM_CDF4(2601, 5412, 7385) }, + { AOM_CDF4(600, 1173, 1744) }, + { AOM_CDF4(7672, 13286, 17469) }, + { AOM_CDF4(4232, 7792, 10793) }, + { AOM_CDF4(2915, 5317, 7397) }, + { AOM_CDF4(2318, 4356, 6152) }, + { AOM_CDF4(2127, 4000, 5554) }, + { AOM_CDF4(1850, 3478, 5275) }, + { AOM_CDF4(977, 1933, 2843) }, + { AOM_CDF4(18280, 24387, 27989) }, + { AOM_CDF4(15852, 22671, 26185) }, + { AOM_CDF4(13845, 20951, 24789) }, + { AOM_CDF4(11055, 17966, 22129) }, + { AOM_CDF4(9138, 15422, 19801) }, + { AOM_CDF4(7454, 13145, 17456) }, + { AOM_CDF4(3370, 6393, 9013) } }, + { { AOM_CDF4(5842, 9229, 10838) }, + { AOM_CDF4(2313, 3491, 4276) }, + { AOM_CDF4(2998, 6104, 7496) }, + { AOM_CDF4(2420, 7447, 9868) }, + { AOM_CDF4(3034, 8495, 10923) }, + { AOM_CDF4(4076, 8937, 10975) }, + { AOM_CDF4(1086, 2370, 3299) }, + { AOM_CDF4(9714, 17254, 20444) }, + { AOM_CDF4(8543, 13698, 17123) }, + { AOM_CDF4(4918, 9007, 11910) }, + { AOM_CDF4(4129, 7532, 10553) }, + { AOM_CDF4(2364, 5533, 8058) }, + { AOM_CDF4(1834, 3546, 5563) }, + { AOM_CDF4(1473, 2908, 4133) }, + { AOM_CDF4(15405, 21193, 25619) }, + { AOM_CDF4(15691, 21952, 26561) }, + { AOM_CDF4(12962, 19194, 24165) }, + { AOM_CDF4(10272, 17855, 22129) }, + { AOM_CDF4(8588, 15270, 20718) }, + { AOM_CDF4(8682, 14669, 19500) }, + { AOM_CDF4(4870, 9636, 13205) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { 
AOM_CDF4(14995, 21341, 24749) }, + { AOM_CDF4(13158, 20289, 24601) }, + { AOM_CDF4(8941, 15326, 19876) }, + { AOM_CDF4(6297, 11541, 15807) }, + { AOM_CDF4(4817, 9029, 12776) }, + { AOM_CDF4(3731, 7273, 10627) }, + { AOM_CDF4(1847, 3617, 5354) }, + { AOM_CDF4(14472, 19659, 22343) }, + { AOM_CDF4(16806, 24162, 27533) }, + { AOM_CDF4(12900, 20404, 24713) }, + { AOM_CDF4(9411, 16112, 20797) }, + { AOM_CDF4(7056, 12697, 17148) }, + { AOM_CDF4(5544, 10339, 14460) }, + { AOM_CDF4(2954, 5704, 8319) }, + { AOM_CDF4(12464, 18071, 21354) }, + { AOM_CDF4(15482, 22528, 26034) }, + { AOM_CDF4(12070, 19269, 23624) }, + { AOM_CDF4(8953, 15406, 20106) }, + { AOM_CDF4(7027, 12730, 17220) }, + { AOM_CDF4(5887, 10913, 15140) }, + { AOM_CDF4(3793, 7278, 10447) } }, + { { AOM_CDF4(15571, 22232, 25749) }, + { AOM_CDF4(14506, 21575, 25374) }, + { AOM_CDF4(10189, 17089, 21569) }, + { AOM_CDF4(7316, 13301, 17915) }, + { AOM_CDF4(5783, 10912, 15190) }, + { AOM_CDF4(4760, 9155, 13088) }, + { AOM_CDF4(2993, 5966, 8774) }, + { AOM_CDF4(23424, 28903, 30778) }, + { AOM_CDF4(20775, 27666, 30290) }, + { AOM_CDF4(16474, 24410, 28299) }, + { AOM_CDF4(12471, 20180, 24987) }, + { AOM_CDF4(9410, 16487, 21439) }, + { AOM_CDF4(7536, 13614, 18529) }, + { AOM_CDF4(5048, 9586, 13549) }, + { AOM_CDF4(21090, 27290, 29756) }, + { AOM_CDF4(20796, 27402, 30026) }, + { AOM_CDF4(17819, 25485, 28969) }, + { AOM_CDF4(13860, 21909, 26462) }, + { AOM_CDF4(11002, 18494, 23529) }, + { AOM_CDF4(8953, 15929, 20897) }, + { AOM_CDF4(6448, 11918, 16454) } } }, + { { { AOM_CDF4(15999, 22208, 25449) }, + { AOM_CDF4(13050, 19988, 24122) }, + { AOM_CDF4(8594, 14864, 19378) }, + { AOM_CDF4(6033, 11079, 15238) }, + { AOM_CDF4(4554, 8683, 12347) }, + { AOM_CDF4(3672, 7139, 10337) }, + { AOM_CDF4(1900, 3771, 5576) }, + { AOM_CDF4(15788, 21340, 23949) }, + { AOM_CDF4(16825, 24235, 27758) }, + { AOM_CDF4(12873, 20402, 24810) }, + { AOM_CDF4(9590, 16363, 21094) }, + { AOM_CDF4(7352, 13209, 17733) }, + { AOM_CDF4(5960, 10989, 15184) }, + { AOM_CDF4(3232, 6234, 9007) }, + { AOM_CDF4(15761, 20716, 23224) }, + { AOM_CDF4(19318, 25989, 28759) }, + { AOM_CDF4(15529, 23094, 26929) }, + { AOM_CDF4(11662, 18989, 23641) }, + { AOM_CDF4(8955, 15568, 20366) }, + { AOM_CDF4(7281, 13106, 17708) }, + { AOM_CDF4(4248, 8059, 11440) } }, + { { AOM_CDF4(14899, 21217, 24503) }, + { AOM_CDF4(13519, 20283, 24047) }, + { AOM_CDF4(9429, 15966, 20365) }, + { AOM_CDF4(6700, 12355, 16652) }, + { AOM_CDF4(5088, 9704, 13716) }, + { AOM_CDF4(4243, 8154, 11731) }, + { AOM_CDF4(2702, 5364, 7861) }, + { AOM_CDF4(22745, 28388, 30454) }, + { AOM_CDF4(20235, 27146, 29922) }, + { AOM_CDF4(15896, 23715, 27637) }, + { AOM_CDF4(11840, 19350, 24131) }, + { AOM_CDF4(9122, 15932, 20880) }, + { AOM_CDF4(7488, 13581, 18362) }, + { AOM_CDF4(5114, 9568, 13370) }, + { AOM_CDF4(20845, 26553, 28932) }, + { AOM_CDF4(20981, 27372, 29884) }, + { AOM_CDF4(17781, 25335, 28785) }, + { AOM_CDF4(13760, 21708, 26297) }, + { AOM_CDF4(10975, 18415, 23365) }, + { AOM_CDF4(9045, 15789, 20686) }, + { AOM_CDF4(6130, 11199, 15423) } } }, + { { { AOM_CDF4(13549, 19724, 23158) }, + { AOM_CDF4(11844, 18382, 22246) }, + { AOM_CDF4(7919, 13619, 17773) }, + { AOM_CDF4(5486, 10143, 13946) }, + { AOM_CDF4(4166, 7983, 11324) }, + { AOM_CDF4(3364, 6506, 9427) }, + { AOM_CDF4(1598, 3160, 4674) }, + { AOM_CDF4(15281, 20979, 23781) }, + { AOM_CDF4(14939, 22119, 25952) }, + { AOM_CDF4(11363, 18407, 22812) }, + { AOM_CDF4(8609, 14857, 19370) }, + { AOM_CDF4(6737, 12184, 16480) }, + { AOM_CDF4(5506, 10263, 14262) }, + { AOM_CDF4(2990, 5786, 
8380) }, + { AOM_CDF4(20249, 25253, 27417) }, + { AOM_CDF4(21070, 27518, 30001) }, + { AOM_CDF4(16854, 24469, 28074) }, + { AOM_CDF4(12864, 20486, 25000) }, + { AOM_CDF4(9962, 16978, 21778) }, + { AOM_CDF4(8074, 14338, 19048) }, + { AOM_CDF4(4494, 8479, 11906) } }, + { { AOM_CDF4(13960, 19617, 22829) }, + { AOM_CDF4(11150, 17341, 21228) }, + { AOM_CDF4(7150, 12964, 17190) }, + { AOM_CDF4(5331, 10002, 13867) }, + { AOM_CDF4(4167, 7744, 11057) }, + { AOM_CDF4(3480, 6629, 9646) }, + { AOM_CDF4(1883, 3784, 5686) }, + { AOM_CDF4(18752, 25660, 28912) }, + { AOM_CDF4(16968, 24586, 28030) }, + { AOM_CDF4(13520, 21055, 25313) }, + { AOM_CDF4(10453, 17626, 22280) }, + { AOM_CDF4(8386, 14505, 19116) }, + { AOM_CDF4(6742, 12595, 17008) }, + { AOM_CDF4(4273, 8140, 11499) }, + { AOM_CDF4(22120, 27827, 30233) }, + { AOM_CDF4(20563, 27358, 29895) }, + { AOM_CDF4(17076, 24644, 28153) }, + { AOM_CDF4(13362, 20942, 25309) }, + { AOM_CDF4(10794, 17965, 22695) }, + { AOM_CDF4(9014, 15652, 20319) }, + { AOM_CDF4(5708, 10512, 14497) } } }, + { { { AOM_CDF4(5705, 10930, 15725) }, + { AOM_CDF4(7946, 12765, 16115) }, + { AOM_CDF4(6801, 12123, 16226) }, + { AOM_CDF4(5462, 10135, 14200) }, + { AOM_CDF4(4189, 8011, 11507) }, + { AOM_CDF4(3191, 6229, 9408) }, + { AOM_CDF4(1057, 2137, 3212) }, + { AOM_CDF4(10018, 17067, 21491) }, + { AOM_CDF4(7380, 12582, 16453) }, + { AOM_CDF4(6068, 10845, 14339) }, + { AOM_CDF4(5098, 9198, 12555) }, + { AOM_CDF4(4312, 8010, 11119) }, + { AOM_CDF4(3700, 6966, 9781) }, + { AOM_CDF4(1693, 3326, 4887) }, + { AOM_CDF4(18757, 24930, 27774) }, + { AOM_CDF4(17648, 24596, 27817) }, + { AOM_CDF4(14707, 22052, 26026) }, + { AOM_CDF4(11720, 18852, 23292) }, + { AOM_CDF4(9357, 15952, 20525) }, + { AOM_CDF4(7810, 13753, 18210) }, + { AOM_CDF4(3879, 7333, 10328) } }, + { { AOM_CDF4(8278, 13242, 15922) }, + { AOM_CDF4(10547, 15867, 18919) }, + { AOM_CDF4(9106, 15842, 20609) }, + { AOM_CDF4(6833, 13007, 17218) }, + { AOM_CDF4(4811, 9712, 13923) }, + { AOM_CDF4(3985, 7352, 11128) }, + { AOM_CDF4(1688, 3458, 5262) }, + { AOM_CDF4(12951, 21861, 26510) }, + { AOM_CDF4(9788, 16044, 20276) }, + { AOM_CDF4(6309, 11244, 14870) }, + { AOM_CDF4(5183, 9349, 12566) }, + { AOM_CDF4(4389, 8229, 11492) }, + { AOM_CDF4(3633, 6945, 10620) }, + { AOM_CDF4(3600, 6847, 9907) }, + { AOM_CDF4(21748, 28137, 30255) }, + { AOM_CDF4(19436, 26581, 29560) }, + { AOM_CDF4(16359, 24201, 27953) }, + { AOM_CDF4(13961, 21693, 25871) }, + { AOM_CDF4(11544, 18686, 23322) }, + { AOM_CDF4(9372, 16462, 20952) }, + { AOM_CDF4(6138, 11210, 15390) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 
16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(16138, 22223, 25509) }, + { AOM_CDF4(15347, 22430, 26332) }, + { AOM_CDF4(9614, 16736, 21332) }, + { AOM_CDF4(6600, 12275, 16907) }, + { AOM_CDF4(4811, 9424, 13547) }, + { AOM_CDF4(3748, 7809, 11420) }, + { AOM_CDF4(2254, 4587, 6890) }, + { AOM_CDF4(15196, 20284, 23177) }, + { AOM_CDF4(18317, 25469, 28451) }, + { AOM_CDF4(13918, 21651, 25842) }, + { AOM_CDF4(10052, 17150, 21995) }, + { AOM_CDF4(7499, 13630, 18587) }, + { AOM_CDF4(6158, 11417, 16003) }, + { AOM_CDF4(4014, 7785, 11252) }, + { AOM_CDF4(15048, 21067, 24384) }, + { AOM_CDF4(18202, 25346, 28553) }, + { AOM_CDF4(14302, 22019, 26356) }, + { AOM_CDF4(10839, 18139, 23166) }, + { AOM_CDF4(8715, 15744, 20806) }, + { AOM_CDF4(7536, 13576, 18544) }, + { AOM_CDF4(5413, 10335, 14498) } }, + { { AOM_CDF4(17394, 24501, 27895) }, + { AOM_CDF4(15889, 23420, 27185) }, + { AOM_CDF4(11561, 19133, 23870) }, + { AOM_CDF4(8285, 14812, 19844) }, + { AOM_CDF4(6496, 12043, 16550) }, + { AOM_CDF4(4771, 9574, 13677) }, + { AOM_CDF4(3603, 6830, 10144) }, + { AOM_CDF4(21656, 27704, 30200) }, + { AOM_CDF4(21324, 27915, 30511) }, + { AOM_CDF4(17327, 25336, 28997) }, + { AOM_CDF4(13417, 21381, 26033) }, + { AOM_CDF4(10132, 17425, 22338) }, + { AOM_CDF4(8580, 15016, 19633) }, + { AOM_CDF4(5694, 11477, 16411) }, + { AOM_CDF4(24116, 29780, 31450) }, + { AOM_CDF4(23853, 29695, 31591) }, + { AOM_CDF4(20085, 27614, 30428) }, + { AOM_CDF4(15326, 24335, 28575) }, + { AOM_CDF4(11814, 19472, 24810) }, + { AOM_CDF4(10221, 18611, 24767) }, + { AOM_CDF4(7689, 14558, 20321) } } }, + { { { AOM_CDF4(16214, 22380, 25770) }, + { AOM_CDF4(14213, 21304, 25295) }, + { AOM_CDF4(9213, 15823, 20455) }, + { AOM_CDF4(6395, 11758, 16139) }, + { AOM_CDF4(4779, 9187, 13066) }, + { AOM_CDF4(3821, 7501, 10953) }, + { AOM_CDF4(2293, 4567, 6795) }, + { AOM_CDF4(15859, 21283, 23820) }, + { AOM_CDF4(18404, 25602, 28726) }, + { AOM_CDF4(14325, 21980, 26206) }, + { AOM_CDF4(10669, 17937, 22720) }, + { AOM_CDF4(8297, 14642, 19447) }, + { AOM_CDF4(6746, 12389, 16893) }, + { AOM_CDF4(4324, 8251, 11770) }, + { AOM_CDF4(16532, 21631, 24475) }, + { AOM_CDF4(20667, 27150, 29668) }, + { AOM_CDF4(16728, 24510, 28175) }, + { AOM_CDF4(12861, 20645, 25332) }, + { AOM_CDF4(10076, 17361, 22417) }, + { AOM_CDF4(8395, 14940, 19963) }, + { AOM_CDF4(5731, 10683, 14912) } }, + { { AOM_CDF4(14433, 21155, 24938) }, + { AOM_CDF4(14658, 21716, 25545) }, + { AOM_CDF4(9923, 16824, 21557) }, + { AOM_CDF4(6982, 13052, 17721) }, + { AOM_CDF4(5419, 10503, 15050) }, + { AOM_CDF4(4852, 9162, 13014) }, + { AOM_CDF4(3271, 6395, 9630) }, + { AOM_CDF4(22210, 27833, 30109) }, + { AOM_CDF4(20750, 27368, 29821) }, + { AOM_CDF4(16894, 24828, 28573) }, + { AOM_CDF4(13247, 21276, 25757) }, + { AOM_CDF4(10038, 17265, 22563) }, + { AOM_CDF4(8587, 14947, 20327) }, + { AOM_CDF4(5645, 11371, 15252) }, + { AOM_CDF4(22027, 27526, 29714) }, + { AOM_CDF4(23098, 29146, 31221) }, + { AOM_CDF4(19886, 27341, 30272) }, + { AOM_CDF4(15609, 23747, 28046) }, + { AOM_CDF4(11993, 20065, 24939) }, + { AOM_CDF4(9637, 18267, 23671) }, + { AOM_CDF4(7625, 13801, 19144) 
} } }, + { { { AOM_CDF4(14438, 20798, 24089) }, + { AOM_CDF4(12621, 19203, 23097) }, + { AOM_CDF4(8177, 14125, 18402) }, + { AOM_CDF4(5674, 10501, 14456) }, + { AOM_CDF4(4236, 8239, 11733) }, + { AOM_CDF4(3447, 6750, 9806) }, + { AOM_CDF4(1986, 3950, 5864) }, + { AOM_CDF4(16208, 22099, 24930) }, + { AOM_CDF4(16537, 24025, 27585) }, + { AOM_CDF4(12780, 20381, 24867) }, + { AOM_CDF4(9767, 16612, 21416) }, + { AOM_CDF4(7686, 13738, 18398) }, + { AOM_CDF4(6333, 11614, 15964) }, + { AOM_CDF4(3941, 7571, 10836) }, + { AOM_CDF4(22819, 27422, 29202) }, + { AOM_CDF4(22224, 28514, 30721) }, + { AOM_CDF4(17660, 25433, 28913) }, + { AOM_CDF4(13574, 21482, 26002) }, + { AOM_CDF4(10629, 17977, 22938) }, + { AOM_CDF4(8612, 15298, 20265) }, + { AOM_CDF4(5607, 10491, 14596) } }, + { { AOM_CDF4(13569, 19800, 23206) }, + { AOM_CDF4(13128, 19924, 23869) }, + { AOM_CDF4(8329, 14841, 19403) }, + { AOM_CDF4(6130, 10976, 15057) }, + { AOM_CDF4(4682, 8839, 12518) }, + { AOM_CDF4(3656, 7409, 10588) }, + { AOM_CDF4(2577, 5099, 7412) }, + { AOM_CDF4(22427, 28684, 30585) }, + { AOM_CDF4(20913, 27750, 30139) }, + { AOM_CDF4(15840, 24109, 27834) }, + { AOM_CDF4(12308, 20029, 24569) }, + { AOM_CDF4(10216, 16785, 21458) }, + { AOM_CDF4(8309, 14203, 19113) }, + { AOM_CDF4(6043, 11168, 15307) }, + { AOM_CDF4(23166, 28901, 30998) }, + { AOM_CDF4(21899, 28405, 30751) }, + { AOM_CDF4(18413, 26091, 29443) }, + { AOM_CDF4(15233, 23114, 27352) }, + { AOM_CDF4(12683, 20472, 25288) }, + { AOM_CDF4(10702, 18259, 23409) }, + { AOM_CDF4(8125, 14464, 19226) } } }, + { { { AOM_CDF4(9040, 14786, 18360) }, + { AOM_CDF4(9979, 15718, 19415) }, + { AOM_CDF4(7913, 13918, 18311) }, + { AOM_CDF4(5859, 10889, 15184) }, + { AOM_CDF4(4593, 8677, 12510) }, + { AOM_CDF4(3820, 7396, 10791) }, + { AOM_CDF4(1730, 3471, 5192) }, + { AOM_CDF4(11803, 18365, 22709) }, + { AOM_CDF4(11419, 18058, 22225) }, + { AOM_CDF4(9418, 15774, 20243) }, + { AOM_CDF4(7539, 13325, 17657) }, + { AOM_CDF4(6233, 11317, 15384) }, + { AOM_CDF4(5137, 9656, 13545) }, + { AOM_CDF4(2977, 5774, 8349) }, + { AOM_CDF4(21207, 27246, 29640) }, + { AOM_CDF4(19547, 26578, 29497) }, + { AOM_CDF4(16169, 23871, 27690) }, + { AOM_CDF4(12820, 20458, 25018) }, + { AOM_CDF4(10224, 17332, 22214) }, + { AOM_CDF4(8526, 15048, 19884) }, + { AOM_CDF4(5037, 9410, 13118) } }, + { { AOM_CDF4(12339, 17329, 20140) }, + { AOM_CDF4(13505, 19895, 23225) }, + { AOM_CDF4(9847, 16944, 21564) }, + { AOM_CDF4(7280, 13256, 18348) }, + { AOM_CDF4(4712, 10009, 14454) }, + { AOM_CDF4(4361, 7914, 12477) }, + { AOM_CDF4(2870, 5628, 7995) }, + { AOM_CDF4(20061, 25504, 28526) }, + { AOM_CDF4(15235, 22878, 26145) }, + { AOM_CDF4(12985, 19958, 24155) }, + { AOM_CDF4(9782, 16641, 21403) }, + { AOM_CDF4(9456, 16360, 20760) }, + { AOM_CDF4(6855, 12940, 18557) }, + { AOM_CDF4(5661, 10564, 15002) }, + { AOM_CDF4(25656, 30602, 31894) }, + { AOM_CDF4(22570, 29107, 31092) }, + { AOM_CDF4(18917, 26423, 29541) }, + { AOM_CDF4(15940, 23649, 27754) }, + { AOM_CDF4(12803, 20581, 25219) }, + { AOM_CDF4(11082, 18695, 23376) }, + { AOM_CDF4(7939, 14373, 19005) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 
16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(18315, 24289, 27551) }, + { AOM_CDF4(16854, 24068, 27835) }, + { AOM_CDF4(10140, 17927, 23173) }, + { AOM_CDF4(6722, 12982, 18267) }, + { AOM_CDF4(4661, 9826, 14706) }, + { AOM_CDF4(3832, 8165, 12294) }, + { AOM_CDF4(2795, 6098, 9245) }, + { AOM_CDF4(17145, 23326, 26672) }, + { AOM_CDF4(20733, 27680, 30308) }, + { AOM_CDF4(16032, 24461, 28546) }, + { AOM_CDF4(11653, 20093, 25081) }, + { AOM_CDF4(9290, 16429, 22086) }, + { AOM_CDF4(7796, 14598, 19982) }, + { AOM_CDF4(6502, 12378, 17441) }, + { AOM_CDF4(21681, 27732, 30320) }, + { AOM_CDF4(22389, 29044, 31261) }, + { AOM_CDF4(19027, 26731, 30087) }, + { AOM_CDF4(14739, 23755, 28624) }, + { AOM_CDF4(11358, 20778, 25511) }, + { AOM_CDF4(10995, 18073, 24190) }, + { AOM_CDF4(9162, 14990, 20617) } }, + { { AOM_CDF4(21425, 27952, 30388) }, + { AOM_CDF4(18062, 25838, 29034) }, + { AOM_CDF4(11956, 19881, 24808) }, + { AOM_CDF4(7718, 15000, 20980) }, + { AOM_CDF4(5702, 11254, 16143) }, + { AOM_CDF4(4898, 9088, 16864) }, + { AOM_CDF4(3679, 6776, 11907) }, + { AOM_CDF4(23294, 30160, 31663) }, + { AOM_CDF4(24397, 29896, 31836) }, + { AOM_CDF4(19245, 27128, 30593) }, + { AOM_CDF4(13202, 19825, 26404) }, + { AOM_CDF4(11578, 19297, 23957) }, + { AOM_CDF4(8073, 13297, 21370) }, + { AOM_CDF4(5461, 10923, 19745) }, + { AOM_CDF4(27367, 30521, 31934) }, + { AOM_CDF4(24904, 30671, 31940) }, + { AOM_CDF4(23075, 28460, 31299) }, + { AOM_CDF4(14400, 23658, 30417) }, + { AOM_CDF4(13885, 23882, 28325) }, + { AOM_CDF4(14746, 22938, 27853) }, + { AOM_CDF4(5461, 16384, 27307) } } }, + { { { AOM_CDF4(18274, 24813, 27890) }, + { AOM_CDF4(15537, 23149, 27003) }, + { AOM_CDF4(9449, 16740, 21827) }, + { AOM_CDF4(6700, 12498, 17261) }, + { AOM_CDF4(4988, 9866, 14198) }, + { AOM_CDF4(4236, 8147, 11902) }, + { AOM_CDF4(2867, 5860, 8654) }, + { AOM_CDF4(17124, 23171, 26101) }, + { AOM_CDF4(20396, 27477, 30148) }, + { AOM_CDF4(16573, 24629, 28492) }, + { AOM_CDF4(12749, 20846, 25674) }, + { AOM_CDF4(10233, 17878, 22818) }, + { AOM_CDF4(8525, 15332, 20363) }, + { AOM_CDF4(6283, 11632, 16255) }, + { AOM_CDF4(20466, 26511, 29286) }, + { AOM_CDF4(23059, 29174, 31191) }, + { AOM_CDF4(19481, 27263, 30241) }, + { AOM_CDF4(15458, 23631, 28137) }, + { AOM_CDF4(12416, 20608, 25693) }, + { AOM_CDF4(10261, 18011, 23261) }, + { AOM_CDF4(8016, 14655, 19666) } }, + { { AOM_CDF4(17616, 24586, 28112) }, + { AOM_CDF4(15809, 23299, 27155) }, + { AOM_CDF4(10767, 18890, 23793) }, + { AOM_CDF4(7727, 14255, 18865) }, + { AOM_CDF4(6129, 11926, 16882) }, + { AOM_CDF4(4482, 9704, 14861) }, + { 
AOM_CDF4(3277, 7452, 11522) }, + { AOM_CDF4(22956, 28551, 30730) }, + { AOM_CDF4(22724, 28937, 30961) }, + { AOM_CDF4(18467, 26324, 29580) }, + { AOM_CDF4(13234, 20713, 25649) }, + { AOM_CDF4(11181, 17592, 22481) }, + { AOM_CDF4(8291, 18358, 24576) }, + { AOM_CDF4(7568, 11881, 14984) }, + { AOM_CDF4(24948, 29001, 31147) }, + { AOM_CDF4(25674, 30619, 32151) }, + { AOM_CDF4(20841, 26793, 29603) }, + { AOM_CDF4(14669, 24356, 28666) }, + { AOM_CDF4(11334, 23593, 28219) }, + { AOM_CDF4(8922, 14762, 22873) }, + { AOM_CDF4(8301, 13544, 20535) } } }, + { { { AOM_CDF4(17113, 23733, 27081) }, + { AOM_CDF4(14139, 21406, 25452) }, + { AOM_CDF4(8552, 15002, 19776) }, + { AOM_CDF4(5871, 11120, 15378) }, + { AOM_CDF4(4455, 8616, 12253) }, + { AOM_CDF4(3469, 6910, 10386) }, + { AOM_CDF4(2255, 4553, 6782) }, + { AOM_CDF4(18224, 24376, 27053) }, + { AOM_CDF4(19290, 26710, 29614) }, + { AOM_CDF4(14936, 22991, 27184) }, + { AOM_CDF4(11238, 18951, 23762) }, + { AOM_CDF4(8786, 15617, 20588) }, + { AOM_CDF4(7317, 13228, 18003) }, + { AOM_CDF4(5101, 9512, 13493) }, + { AOM_CDF4(22639, 28222, 30210) }, + { AOM_CDF4(23216, 29331, 31307) }, + { AOM_CDF4(19075, 26762, 29895) }, + { AOM_CDF4(15014, 23113, 27457) }, + { AOM_CDF4(11938, 19857, 24752) }, + { AOM_CDF4(9942, 17280, 22282) }, + { AOM_CDF4(7167, 13144, 17752) } }, + { { AOM_CDF4(15820, 22738, 26488) }, + { AOM_CDF4(13530, 20885, 25216) }, + { AOM_CDF4(8395, 15530, 20452) }, + { AOM_CDF4(6574, 12321, 16380) }, + { AOM_CDF4(5353, 10419, 14568) }, + { AOM_CDF4(4613, 8446, 12381) }, + { AOM_CDF4(3440, 7158, 9903) }, + { AOM_CDF4(24247, 29051, 31224) }, + { AOM_CDF4(22118, 28058, 30369) }, + { AOM_CDF4(16498, 24768, 28389) }, + { AOM_CDF4(12920, 21175, 26137) }, + { AOM_CDF4(10730, 18619, 25352) }, + { AOM_CDF4(10187, 16279, 22791) }, + { AOM_CDF4(9310, 14631, 22127) }, + { AOM_CDF4(24970, 30558, 32057) }, + { AOM_CDF4(24801, 29942, 31698) }, + { AOM_CDF4(22432, 28453, 30855) }, + { AOM_CDF4(19054, 25680, 29580) }, + { AOM_CDF4(14392, 23036, 28109) }, + { AOM_CDF4(12495, 20947, 26650) }, + { AOM_CDF4(12442, 20326, 26214) } } }, + { { { AOM_CDF4(12162, 18785, 22648) }, + { AOM_CDF4(12749, 19697, 23806) }, + { AOM_CDF4(8580, 15297, 20346) }, + { AOM_CDF4(6169, 11749, 16543) }, + { AOM_CDF4(4836, 9391, 13448) }, + { AOM_CDF4(3821, 7711, 11613) }, + { AOM_CDF4(2228, 4601, 7070) }, + { AOM_CDF4(16319, 24725, 28280) }, + { AOM_CDF4(15698, 23277, 27168) }, + { AOM_CDF4(12726, 20368, 25047) }, + { AOM_CDF4(9912, 17015, 21976) }, + { AOM_CDF4(7888, 14220, 19179) }, + { AOM_CDF4(6777, 12284, 17018) }, + { AOM_CDF4(4492, 8590, 12252) }, + { AOM_CDF4(23249, 28904, 30947) }, + { AOM_CDF4(21050, 27908, 30512) }, + { AOM_CDF4(17440, 25340, 28949) }, + { AOM_CDF4(14059, 22018, 26541) }, + { AOM_CDF4(11288, 18903, 23898) }, + { AOM_CDF4(9411, 16342, 21428) }, + { AOM_CDF4(6278, 11588, 15944) } }, + { { AOM_CDF4(13981, 20067, 23226) }, + { AOM_CDF4(16922, 23580, 26783) }, + { AOM_CDF4(11005, 19039, 24487) }, + { AOM_CDF4(7389, 14218, 19798) }, + { AOM_CDF4(5598, 11505, 17206) }, + { AOM_CDF4(6090, 11213, 15659) }, + { AOM_CDF4(3820, 7371, 10119) }, + { AOM_CDF4(21082, 26925, 29675) }, + { AOM_CDF4(21262, 28627, 31128) }, + { AOM_CDF4(18392, 26454, 30437) }, + { AOM_CDF4(14870, 22910, 27096) }, + { AOM_CDF4(12620, 19484, 24908) }, + { AOM_CDF4(9290, 16553, 22802) }, + { AOM_CDF4(6668, 14288, 20004) }, + { AOM_CDF4(27704, 31055, 31949) }, + { AOM_CDF4(24709, 29978, 31788) }, + { AOM_CDF4(21668, 29264, 31657) }, + { AOM_CDF4(18295, 26968, 30074) }, + { AOM_CDF4(16399, 24422, 29313) 
}, + { AOM_CDF4(14347, 23026, 28104) }, + { AOM_CDF4(12370, 19806, 24477) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } } + }; + +static const aom_cdf_prob av1_default_coeff_base_multi_cdfs + [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] + [CDF_SIZE(NUM_BASE_LEVELS + + 2)] = { { { { { AOM_CDF4(4034, 8930, 12727) }, + { AOM_CDF4(18082, 29741, 31877) }, + { AOM_CDF4(12596, 26124, 30493) }, + { AOM_CDF4(9446, 21118, 27005) }, + { AOM_CDF4(6308, 15141, 21279) }, + { AOM_CDF4(2463, 6357, 9783) }, + { AOM_CDF4(20667, 30546, 31929) }, + { AOM_CDF4(13043, 26123, 30134) }, + { AOM_CDF4(8151, 18757, 24778) }, + { AOM_CDF4(5255, 12839, 18632) }, + { AOM_CDF4(2820, 7206, 11161) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(15736, 27553, 30604) }, + { AOM_CDF4(11210, 23794, 28787) }, + { AOM_CDF4(5947, 13874, 19701) }, + { AOM_CDF4(4215, 9323, 13891) }, + { AOM_CDF4(2833, 6462, 10059) }, + { AOM_CDF4(19605, 30393, 31582) }, + { AOM_CDF4(13523, 26252, 30248) }, + { AOM_CDF4(8446, 18622, 24512) }, + { AOM_CDF4(3818, 10343, 15974) }, + { AOM_CDF4(1481, 4117, 6796) }, + { AOM_CDF4(22649, 31302, 32190) }, + { AOM_CDF4(14829, 27127, 30449) }, + { AOM_CDF4(8313, 17702, 23304) }, + { AOM_CDF4(3022, 8301, 12786) }, + { AOM_CDF4(1536, 4412, 7184) }, + { AOM_CDF4(22354, 29774, 31372) }, + { AOM_CDF4(14723, 25472, 29214) }, + { AOM_CDF4(6673, 13745, 18662) }, + { AOM_CDF4(2068, 5766, 9322) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6302, 16444, 21761) }, + { AOM_CDF4(23040, 31538, 32475) }, + { AOM_CDF4(15196, 28452, 31496) }, + { AOM_CDF4(10020, 22946, 28514) }, + { AOM_CDF4(6533, 16862, 23501) }, + { AOM_CDF4(3538, 9816, 15076) }, + { AOM_CDF4(24444, 31875, 32525) }, + { AOM_CDF4(15881, 28924, 31635) }, + { 
AOM_CDF4(9922, 22873, 28466) }, + { AOM_CDF4(6527, 16966, 23691) }, + { AOM_CDF4(4114, 11303, 17220) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(20201, 30770, 32209) }, + { AOM_CDF4(14754, 28071, 31258) }, + { AOM_CDF4(8378, 20186, 26517) }, + { AOM_CDF4(5916, 15299, 21978) }, + { AOM_CDF4(4268, 11583, 17901) }, + { AOM_CDF4(24361, 32025, 32581) }, + { AOM_CDF4(18673, 30105, 31943) }, + { AOM_CDF4(10196, 22244, 27576) }, + { AOM_CDF4(5495, 14349, 20417) }, + { AOM_CDF4(2676, 7415, 11498) }, + { AOM_CDF4(24678, 31958, 32585) }, + { AOM_CDF4(18629, 29906, 31831) }, + { AOM_CDF4(9364, 20724, 26315) }, + { AOM_CDF4(4641, 12318, 18094) }, + { AOM_CDF4(2758, 7387, 11579) }, + { AOM_CDF4(25433, 31842, 32469) }, + { AOM_CDF4(18795, 29289, 31411) }, + { AOM_CDF4(7644, 17584, 23592) }, + { AOM_CDF4(3408, 9014, 15047) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4536, 10072, 14001) }, + { AOM_CDF4(25459, 31416, 32206) }, + { AOM_CDF4(16605, 28048, 30818) }, + { AOM_CDF4(11008, 22857, 27719) }, + { AOM_CDF4(6915, 16268, 22315) }, + { AOM_CDF4(2625, 6812, 10537) }, + { AOM_CDF4(24257, 31788, 32499) }, + { AOM_CDF4(16880, 29454, 31879) }, + { AOM_CDF4(11958, 25054, 29778) }, + { AOM_CDF4(7916, 18718, 25084) }, + { AOM_CDF4(3383, 8777, 13446) }, + { AOM_CDF4(22720, 31603, 32393) }, + { AOM_CDF4(14960, 28125, 31335) }, + { AOM_CDF4(9731, 22210, 27928) }, + { AOM_CDF4(6304, 15832, 22277) }, + { AOM_CDF4(2910, 7818, 12166) }, + { AOM_CDF4(20375, 30627, 32131) }, + { AOM_CDF4(13904, 27284, 30887) }, + { AOM_CDF4(9368, 21558, 27144) }, + { AOM_CDF4(5937, 14966, 21119) }, + { AOM_CDF4(2667, 7225, 11319) }, + { AOM_CDF4(23970, 31470, 32378) }, + { AOM_CDF4(17173, 29734, 32018) }, + { AOM_CDF4(12795, 25441, 29965) }, + { AOM_CDF4(8981, 19680, 25893) }, + { AOM_CDF4(4728, 11372, 16902) }, + { AOM_CDF4(24287, 31797, 32439) }, + { AOM_CDF4(16703, 29145, 31696) }, + { AOM_CDF4(10833, 23554, 28725) }, + { AOM_CDF4(6468, 16566, 23057) }, + { AOM_CDF4(2415, 6562, 10278) }, + { AOM_CDF4(26610, 32395, 32659) }, + { AOM_CDF4(18590, 30498, 32117) }, + { AOM_CDF4(12420, 25756, 29950) }, + { AOM_CDF4(7639, 18746, 24710) }, + { AOM_CDF4(3001, 8086, 12347) }, + { AOM_CDF4(25076, 32064, 32580) }, + { AOM_CDF4(17946, 30128, 32028) }, + { AOM_CDF4(12024, 24985, 29378) }, + { AOM_CDF4(7517, 18390, 24304) }, + { AOM_CDF4(3243, 8781, 13331) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6037, 16771, 21957) }, + { AOM_CDF4(24774, 31704, 32426) }, + { AOM_CDF4(16830, 28589, 31056) }, + { AOM_CDF4(10602, 22828, 27760) }, + { AOM_CDF4(6733, 16829, 23071) }, + { AOM_CDF4(3250, 8914, 13556) }, + { AOM_CDF4(25582, 32220, 32668) }, + { AOM_CDF4(18659, 30342, 32223) }, + { AOM_CDF4(12546, 26149, 30515) }, + { AOM_CDF4(8420, 20451, 26801) }, + { AOM_CDF4(4636, 12420, 18344) }, + { AOM_CDF4(27581, 32362, 32639) }, + { AOM_CDF4(18987, 30083, 31978) }, + { AOM_CDF4(11327, 24248, 29084) }, + { AOM_CDF4(7264, 17719, 24120) }, + { AOM_CDF4(3995, 10768, 16169) }, + { AOM_CDF4(25893, 31831, 32487) }, + { AOM_CDF4(16577, 28587, 31379) }, + { AOM_CDF4(10189, 22748, 28182) }, + { AOM_CDF4(6832, 17094, 23556) }, + { AOM_CDF4(3708, 10110, 15334) }, + { AOM_CDF4(25904, 
32282, 32656) }, + { AOM_CDF4(19721, 30792, 32276) }, + { AOM_CDF4(12819, 26243, 30411) }, + { AOM_CDF4(8572, 20614, 26891) }, + { AOM_CDF4(5364, 14059, 20467) }, + { AOM_CDF4(26580, 32438, 32677) }, + { AOM_CDF4(20852, 31225, 32340) }, + { AOM_CDF4(12435, 25700, 29967) }, + { AOM_CDF4(8691, 20825, 26976) }, + { AOM_CDF4(4446, 12209, 17269) }, + { AOM_CDF4(27350, 32429, 32696) }, + { AOM_CDF4(21372, 30977, 32272) }, + { AOM_CDF4(12673, 25270, 29853) }, + { AOM_CDF4(9208, 20925, 26640) }, + { AOM_CDF4(5018, 13351, 18732) }, + { AOM_CDF4(27351, 32479, 32713) }, + { AOM_CDF4(21398, 31209, 32387) }, + { AOM_CDF4(12162, 25047, 29842) }, + { AOM_CDF4(7896, 18691, 25319) }, + { AOM_CDF4(4670, 12882, 18881) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(5487, 10460, 13708) }, + { AOM_CDF4(21597, 28303, 30674) }, + { AOM_CDF4(11037, 21953, 26476) }, + { AOM_CDF4(8147, 17962, 22952) }, + { AOM_CDF4(5242, 13061, 18532) }, + { AOM_CDF4(1889, 5208, 8182) }, + { AOM_CDF4(26774, 32133, 32590) }, + { AOM_CDF4(17844, 29564, 31767) }, + { AOM_CDF4(11690, 24438, 29171) }, + { AOM_CDF4(7542, 18215, 24459) }, + { AOM_CDF4(2993, 8050, 12319) }, + { AOM_CDF4(28023, 32328, 32591) }, + { AOM_CDF4(18651, 30126, 31954) }, + { AOM_CDF4(12164, 25146, 29589) }, + { AOM_CDF4(7762, 18530, 24771) }, + { AOM_CDF4(3492, 9183, 13920) }, + { AOM_CDF4(27591, 32008, 32491) }, + { AOM_CDF4(17149, 28853, 31510) }, + { AOM_CDF4(11485, 24003, 28860) }, + { AOM_CDF4(7697, 18086, 24210) }, + { AOM_CDF4(3075, 7999, 12218) }, + { AOM_CDF4(28268, 32482, 32654) }, + { AOM_CDF4(19631, 31051, 32404) }, + { AOM_CDF4(13860, 27260, 31020) }, + { AOM_CDF4(9605, 21613, 27594) }, + { AOM_CDF4(4876, 12162, 17908) }, + { AOM_CDF4(27248, 32316, 32576) }, + { AOM_CDF4(18955, 30457, 32075) }, + { AOM_CDF4(11824, 23997, 28795) }, + { AOM_CDF4(7346, 18196, 24647) }, + { AOM_CDF4(3403, 9247, 14111) }, + { AOM_CDF4(29711, 32655, 32735) }, + { AOM_CDF4(21169, 31394, 32417) }, + { AOM_CDF4(13487, 27198, 30957) }, + { AOM_CDF4(8828, 21683, 27614) }, + { AOM_CDF4(4270, 11451, 17038) }, + { AOM_CDF4(28708, 32578, 32731) }, + { AOM_CDF4(20120, 31241, 32482) }, + { AOM_CDF4(13692, 27550, 31321) }, + { AOM_CDF4(9418, 22514, 28439) }, + { AOM_CDF4(4999, 13283, 19462) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(5673, 14302, 19711) }, + { AOM_CDF4(26251, 30701, 31834) }, + { AOM_CDF4(12782, 23783, 27803) }, + { AOM_CDF4(9127, 20657, 25808) }, + { AOM_CDF4(6368, 16208, 21462) }, + { AOM_CDF4(2465, 7177, 10822) }, + { AOM_CDF4(29961, 32563, 32719) }, + { AOM_CDF4(18318, 29891, 31949) }, + { AOM_CDF4(11361, 24514, 29357) }, + { AOM_CDF4(7900, 19603, 25607) }, + { AOM_CDF4(4002, 10590, 15546) }, + { AOM_CDF4(29637, 32310, 32595) }, + { AOM_CDF4(18296, 29913, 31809) }, + { AOM_CDF4(10144, 21515, 26871) }, + { AOM_CDF4(5358, 14322, 20394) }, + { AOM_CDF4(3067, 8362, 13346) }, + { AOM_CDF4(28652, 32470, 32676) }, + { AOM_CDF4(17538, 30771, 32209) }, + { AOM_CDF4(13924, 26882, 30494) }, + { AOM_CDF4(10496, 22837, 27869) }, + { AOM_CDF4(7236, 16396, 21621) }, + { AOM_CDF4(30743, 32687, 32746) }, + { AOM_CDF4(23006, 31676, 32489) }, + { AOM_CDF4(14494, 27828, 31120) }, + { AOM_CDF4(10174, 22801, 28352) }, + { AOM_CDF4(6242, 15281, 21043) }, + { AOM_CDF4(25817, 32243, 32720) }, + { AOM_CDF4(18618, 31367, 32325) }, + { AOM_CDF4(13997, 28318, 31878) }, + { AOM_CDF4(12255, 26534, 31383) }, + { AOM_CDF4(9561, 21588, 28450) }, + { AOM_CDF4(28188, 32635, 32724) }, + { AOM_CDF4(22060, 32365, 32728) }, + { AOM_CDF4(18102, 30690, 32528) }, + { AOM_CDF4(14196, 
28864, 31999) }, + { AOM_CDF4(12262, 25792, 30865) }, + { AOM_CDF4(24176, 32109, 32628) }, + { AOM_CDF4(18280, 29681, 31963) }, + { AOM_CDF4(10205, 23703, 29664) }, + { AOM_CDF4(7889, 20025, 27676) }, + { AOM_CDF4(6060, 16743, 23970) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(5141, 7096, 8260) }, + { AOM_CDF4(27186, 29022, 29789) }, + { AOM_CDF4(6668, 12568, 15682) }, + { AOM_CDF4(2172, 6181, 8638) }, + { AOM_CDF4(1126, 3379, 4531) }, + { AOM_CDF4(443, 1361, 2254) }, + { AOM_CDF4(26083, 31153, 32436) }, + { AOM_CDF4(13486, 24603, 28483) }, + { AOM_CDF4(6508, 14840, 19910) }, + { AOM_CDF4(3386, 8800, 13286) }, + { AOM_CDF4(1530, 4322, 7054) }, + { AOM_CDF4(29639, 32080, 32548) }, + { AOM_CDF4(15897, 27552, 30290) }, + { AOM_CDF4(8588, 20047, 25383) }, + { AOM_CDF4(4889, 13339, 19269) }, + { AOM_CDF4(2240, 6871, 10498) }, + { AOM_CDF4(28165, 32197, 32517) }, + { AOM_CDF4(20735, 30427, 31568) }, + { AOM_CDF4(14325, 24671, 27692) }, + { AOM_CDF4(5119, 12554, 17805) }, + { AOM_CDF4(1810, 5441, 8261) }, + { AOM_CDF4(31212, 32724, 32748) }, + { AOM_CDF4(23352, 31766, 32545) }, + { AOM_CDF4(14669, 27570, 31059) }, + { AOM_CDF4(8492, 20894, 27272) }, + { AOM_CDF4(3644, 10194, 15204) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(2461, 7013, 9371) }, + { AOM_CDF4(24749, 29600, 30986) }, + { AOM_CDF4(9466, 19037, 22417) }, + { AOM_CDF4(3584, 9280, 14400) }, + { AOM_CDF4(1505, 3929, 5433) }, + { AOM_CDF4(677, 1500, 2736) }, + { AOM_CDF4(23987, 30702, 32117) }, + { AOM_CDF4(13554, 24571, 29263) }, + { AOM_CDF4(6211, 14556, 21155) }, + { AOM_CDF4(3135, 10972, 15625) }, + { AOM_CDF4(2435, 7127, 11427) }, + { AOM_CDF4(31300, 32532, 32550) }, + { AOM_CDF4(14757, 30365, 31954) }, + { AOM_CDF4(4405, 11612, 18553) }, + { AOM_CDF4(580, 4132, 7322) }, + { AOM_CDF4(1695, 10169, 14124) }, + { AOM_CDF4(30008, 32282, 32591) }, + { AOM_CDF4(19244, 30108, 31748) }, + { AOM_CDF4(11180, 24158, 29555) }, + { AOM_CDF4(5650, 14972, 19209) }, + { AOM_CDF4(2114, 5109, 8456) }, + { AOM_CDF4(31856, 32716, 32748) }, + { AOM_CDF4(23012, 31664, 32572) }, + { AOM_CDF4(13694, 26656, 30636) }, + { AOM_CDF4(8142, 19508, 26093) }, + { AOM_CDF4(4253, 10955, 16724) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(601, 983, 1311) }, + { AOM_CDF4(18725, 23406, 28087) }, + { AOM_CDF4(5461, 8192, 10923) }, + { AOM_CDF4(3781, 15124, 21425) }, + { AOM_CDF4(2587, 7761, 12072) }, + { AOM_CDF4(106, 458, 810) }, + { AOM_CDF4(22282, 29710, 31894) }, + 
{ AOM_CDF4(8508, 20926, 25984) }, + { AOM_CDF4(3726, 12713, 18083) }, + { AOM_CDF4(1620, 7112, 10893) }, + { AOM_CDF4(729, 2236, 3495) }, + { AOM_CDF4(30163, 32474, 32684) }, + { AOM_CDF4(18304, 30464, 32000) }, + { AOM_CDF4(11443, 26526, 29647) }, + { AOM_CDF4(6007, 15292, 21299) }, + { AOM_CDF4(2234, 6703, 8937) }, + { AOM_CDF4(30954, 32177, 32571) }, + { AOM_CDF4(17363, 29562, 31076) }, + { AOM_CDF4(9686, 22464, 27410) }, + { AOM_CDF4(8192, 16384, 21390) }, + { AOM_CDF4(1755, 8046, 11264) }, + { AOM_CDF4(31168, 32734, 32748) }, + { AOM_CDF4(22486, 31441, 32471) }, + { AOM_CDF4(12833, 25627, 29738) }, + { AOM_CDF4(6980, 17379, 23122) }, + { AOM_CDF4(3111, 8887, 13479) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(6041, 11854, 15927) }, + { AOM_CDF4(20326, 30905, 32251) }, + { AOM_CDF4(14164, 26831, 30725) }, + { AOM_CDF4(9760, 20647, 26585) }, + { AOM_CDF4(6416, 14953, 21219) }, + { AOM_CDF4(2966, 7151, 10891) }, + { AOM_CDF4(23567, 31374, 32254) }, + { AOM_CDF4(14978, 27416, 30946) }, + { AOM_CDF4(9434, 20225, 26254) }, + { AOM_CDF4(6658, 14558, 20535) }, + { AOM_CDF4(3916, 8677, 12989) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(18088, 29545, 31587) }, + { AOM_CDF4(13062, 25843, 30073) }, + { AOM_CDF4(8940, 16827, 22251) }, + { AOM_CDF4(7654, 13220, 17973) }, + { AOM_CDF4(5733, 10316, 14456) }, + { AOM_CDF4(22879, 31388, 32114) }, + { AOM_CDF4(15215, 27993, 30955) }, + { AOM_CDF4(9397, 19445, 24978) }, + { AOM_CDF4(3442, 9813, 15344) }, + { AOM_CDF4(1368, 3936, 6532) }, + { AOM_CDF4(25494, 32033, 32406) }, + { AOM_CDF4(16772, 27963, 30718) }, + { AOM_CDF4(9419, 18165, 23260) }, + { AOM_CDF4(2677, 7501, 11797) }, + { AOM_CDF4(1516, 4344, 7170) }, + { AOM_CDF4(26556, 31454, 32101) }, + { AOM_CDF4(17128, 27035, 30108) }, + { AOM_CDF4(8324, 15344, 20249) }, + { AOM_CDF4(1903, 5696, 9469) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8455, 19003, 24368) }, + { AOM_CDF4(23563, 32021, 32604) }, + { AOM_CDF4(16237, 29446, 31935) }, + { AOM_CDF4(10724, 23999, 29358) }, + { AOM_CDF4(6725, 17528, 24416) }, + { AOM_CDF4(3927, 10927, 16825) }, + { AOM_CDF4(26313, 32288, 32634) }, + { AOM_CDF4(17430, 30095, 32095) }, + { AOM_CDF4(11116, 24606, 29679) }, + { AOM_CDF4(7195, 18384, 25269) }, + { AOM_CDF4(4726, 12852, 19315) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(22822, 31648, 32483) }, + { AOM_CDF4(16724, 29633, 31929) }, + { AOM_CDF4(10261, 23033, 28725) }, + { AOM_CDF4(7029, 17840, 24528) }, + { AOM_CDF4(4867, 13886, 21502) }, + { AOM_CDF4(25298, 31892, 32491) }, + { AOM_CDF4(17809, 29330, 31512) }, + { AOM_CDF4(9668, 21329, 26579) }, + { AOM_CDF4(4774, 12956, 18976) }, + { AOM_CDF4(2322, 7030, 11540) }, + { AOM_CDF4(25472, 31920, 32543) }, + { AOM_CDF4(17957, 29387, 31632) }, + { AOM_CDF4(9196, 20593, 26400) }, + { AOM_CDF4(4680, 12705, 19202) }, + { AOM_CDF4(2917, 8456, 13436) }, + { AOM_CDF4(26471, 32059, 32574) }, + { AOM_CDF4(18458, 29783, 31909) }, + { AOM_CDF4(8400, 19464, 25956) }, + { AOM_CDF4(3812, 10973, 17206) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(6779, 13743, 17678) }, + { AOM_CDF4(24806, 31797, 32457) }, + { AOM_CDF4(17616, 29047, 31372) }, + { AOM_CDF4(11063, 23175, 28003) }, + { AOM_CDF4(6521, 16110, 22324) }, + { AOM_CDF4(2764, 7504, 11654) }, + { AOM_CDF4(25266, 32367, 32637) }, + { AOM_CDF4(19054, 30553, 32175) }, + { AOM_CDF4(12139, 25212, 29807) }, + { AOM_CDF4(7311, 18162, 24704) }, + { AOM_CDF4(3397, 9164, 14074) }, + { AOM_CDF4(25988, 32208, 32522) }, + { AOM_CDF4(16253, 28912, 31526) }, + { AOM_CDF4(9151, 21387, 27372) }, + { AOM_CDF4(5688, 14915, 21496) }, + { AOM_CDF4(2717, 7627, 12004) }, + { AOM_CDF4(23144, 31855, 32443) }, + { AOM_CDF4(16070, 28491, 31325) }, + { AOM_CDF4(8702, 20467, 26517) }, + { AOM_CDF4(5243, 13956, 20367) }, + { AOM_CDF4(2621, 7335, 11567) }, + { AOM_CDF4(26636, 32340, 32630) }, + { AOM_CDF4(19990, 31050, 32341) }, + { AOM_CDF4(13243, 26105, 30315) }, + { AOM_CDF4(8588, 19521, 25918) }, + { AOM_CDF4(4717, 11585, 17304) }, + { AOM_CDF4(25844, 32292, 32582) }, + { AOM_CDF4(19090, 30635, 32097) }, + { AOM_CDF4(11963, 24546, 28939) }, + { AOM_CDF4(6218, 16087, 22354) }, + { AOM_CDF4(2340, 6608, 10426) }, + { AOM_CDF4(28046, 32576, 32694) }, + { AOM_CDF4(21178, 31313, 32296) }, + { AOM_CDF4(13486, 26184, 29870) }, + { AOM_CDF4(7149, 17871, 
23723) }, + { AOM_CDF4(2833, 7958, 12259) }, + { AOM_CDF4(27710, 32528, 32686) }, + { AOM_CDF4(20674, 31076, 32268) }, + { AOM_CDF4(12413, 24955, 29243) }, + { AOM_CDF4(6676, 16927, 23097) }, + { AOM_CDF4(2966, 8333, 12919) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8639, 19339, 24429) }, + { AOM_CDF4(24404, 31837, 32525) }, + { AOM_CDF4(16997, 29425, 31784) }, + { AOM_CDF4(11253, 24234, 29149) }, + { AOM_CDF4(6751, 17394, 24028) }, + { AOM_CDF4(3490, 9830, 15191) }, + { AOM_CDF4(26283, 32471, 32714) }, + { AOM_CDF4(19599, 31168, 32442) }, + { AOM_CDF4(13146, 26954, 30893) }, + { AOM_CDF4(8214, 20588, 26890) }, + { AOM_CDF4(4699, 13081, 19300) }, + { AOM_CDF4(28212, 32458, 32669) }, + { AOM_CDF4(18594, 30316, 32100) }, + { AOM_CDF4(11219, 24408, 29234) }, + { AOM_CDF4(6865, 17656, 24149) }, + { AOM_CDF4(3678, 10362, 16006) }, + { AOM_CDF4(25825, 32136, 32616) }, + { AOM_CDF4(17313, 29853, 32021) }, + { AOM_CDF4(11197, 24471, 29472) }, + { AOM_CDF4(6947, 17781, 24405) }, + { AOM_CDF4(3768, 10660, 16261) }, + { AOM_CDF4(27352, 32500, 32706) }, + { AOM_CDF4(20850, 31468, 32469) }, + { AOM_CDF4(14021, 27707, 31133) }, + { AOM_CDF4(8964, 21748, 27838) }, + { AOM_CDF4(5437, 14665, 21187) }, + { AOM_CDF4(26304, 32492, 32698) }, + { AOM_CDF4(20409, 31380, 32385) }, + { AOM_CDF4(13682, 27222, 30632) }, + { AOM_CDF4(8974, 21236, 26685) }, + { AOM_CDF4(4234, 11665, 16934) }, + { AOM_CDF4(26273, 32357, 32711) }, + { AOM_CDF4(20672, 31242, 32441) }, + { AOM_CDF4(14172, 27254, 30902) }, + { AOM_CDF4(9870, 21898, 27275) }, + { AOM_CDF4(5164, 13506, 19270) }, + { AOM_CDF4(26725, 32459, 32728) }, + { AOM_CDF4(20991, 31442, 32527) }, + { AOM_CDF4(13071, 26434, 30811) }, + { AOM_CDF4(8184, 20090, 26742) }, + { AOM_CDF4(4803, 13255, 19895) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7555, 14942, 18501) }, + { AOM_CDF4(24410, 31178, 32287) }, + { AOM_CDF4(14394, 26738, 30253) }, + { AOM_CDF4(8413, 19554, 25195) }, + { AOM_CDF4(4766, 12924, 18785) }, + { AOM_CDF4(2029, 5806, 9207) }, + { AOM_CDF4(26776, 32364, 32663) }, + { AOM_CDF4(18732, 29967, 31931) }, + { AOM_CDF4(11005, 23786, 28852) }, + { AOM_CDF4(6466, 16909, 23510) }, + { AOM_CDF4(3044, 8638, 13419) }, + { AOM_CDF4(29208, 32582, 32704) }, + { AOM_CDF4(20068, 30857, 32208) }, + { AOM_CDF4(12003, 25085, 29595) }, + { AOM_CDF4(6947, 17750, 24189) }, + { AOM_CDF4(3245, 9103, 14007) }, + { AOM_CDF4(27359, 32465, 32669) }, + { AOM_CDF4(19421, 30614, 32174) }, + { AOM_CDF4(11915, 25010, 29579) }, + { AOM_CDF4(6950, 17676, 24074) }, + { AOM_CDF4(3007, 8473, 13096) }, + { AOM_CDF4(29002, 32676, 32735) }, + { AOM_CDF4(22102, 31849, 32576) }, + { AOM_CDF4(14408, 28009, 31405) }, + { AOM_CDF4(9027, 21679, 27931) }, + { AOM_CDF4(4694, 12678, 18748) }, + { AOM_CDF4(28216, 32528, 32682) }, + { AOM_CDF4(20849, 31264, 32318) }, + { AOM_CDF4(12756, 25815, 29751) }, + { AOM_CDF4(7565, 18801, 24923) }, + { AOM_CDF4(3509, 9533, 14477) }, + { AOM_CDF4(30133, 32687, 32739) }, + { AOM_CDF4(23063, 31910, 32515) }, + { AOM_CDF4(14588, 28051, 31132) }, + { AOM_CDF4(9085, 21649, 27457) }, + { AOM_CDF4(4261, 11654, 17264) }, + { AOM_CDF4(29518, 32691, 32748) }, + { AOM_CDF4(22451, 31959, 32613) }, + { AOM_CDF4(14864, 28722, 31700) }, + { AOM_CDF4(9695, 22964, 28716) }, + { AOM_CDF4(4932, 13358, 19502) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6465, 16958, 21688) }, + { AOM_CDF4(25199, 31514, 32360) }, + { AOM_CDF4(14774, 27149, 30607) }, + { AOM_CDF4(9257, 21438, 26972) }, + { AOM_CDF4(5723, 15183, 21882) }, + { AOM_CDF4(3150, 8879, 
13731) }, + { AOM_CDF4(26989, 32262, 32682) }, + { AOM_CDF4(17396, 29937, 32085) }, + { AOM_CDF4(11387, 24901, 29784) }, + { AOM_CDF4(7289, 18821, 25548) }, + { AOM_CDF4(3734, 10577, 16086) }, + { AOM_CDF4(29728, 32501, 32695) }, + { AOM_CDF4(17431, 29701, 31903) }, + { AOM_CDF4(9921, 22826, 28300) }, + { AOM_CDF4(5896, 15434, 22068) }, + { AOM_CDF4(3430, 9646, 14757) }, + { AOM_CDF4(28614, 32511, 32705) }, + { AOM_CDF4(19364, 30638, 32263) }, + { AOM_CDF4(13129, 26254, 30402) }, + { AOM_CDF4(8754, 20484, 26440) }, + { AOM_CDF4(4378, 11607, 17110) }, + { AOM_CDF4(30292, 32671, 32744) }, + { AOM_CDF4(21780, 31603, 32501) }, + { AOM_CDF4(14314, 27829, 31291) }, + { AOM_CDF4(9611, 22327, 28263) }, + { AOM_CDF4(4890, 13087, 19065) }, + { AOM_CDF4(25862, 32567, 32733) }, + { AOM_CDF4(20794, 32050, 32567) }, + { AOM_CDF4(17243, 30625, 32254) }, + { AOM_CDF4(13283, 27628, 31474) }, + { AOM_CDF4(9669, 22532, 28918) }, + { AOM_CDF4(27435, 32697, 32748) }, + { AOM_CDF4(24922, 32390, 32714) }, + { AOM_CDF4(21449, 31504, 32536) }, + { AOM_CDF4(16392, 29729, 31832) }, + { AOM_CDF4(11692, 24884, 29076) }, + { AOM_CDF4(24193, 32290, 32735) }, + { AOM_CDF4(18909, 31104, 32563) }, + { AOM_CDF4(12236, 26841, 31403) }, + { AOM_CDF4(8171, 21840, 29082) }, + { AOM_CDF4(7224, 17280, 25275) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(3078, 6839, 9890) }, + { AOM_CDF4(13837, 20450, 24479) }, + { AOM_CDF4(5914, 14222, 19328) }, + { AOM_CDF4(3866, 10267, 14762) }, + { AOM_CDF4(2612, 7208, 11042) }, + { AOM_CDF4(1067, 2991, 4776) }, + { AOM_CDF4(25817, 31646, 32529) }, + { AOM_CDF4(13708, 26338, 30385) }, + { AOM_CDF4(7328, 18585, 24870) }, + { AOM_CDF4(4691, 13080, 19276) }, + { AOM_CDF4(1825, 5253, 8352) }, + { AOM_CDF4(29386, 32315, 32624) }, + { AOM_CDF4(17160, 29001, 31360) }, + { AOM_CDF4(9602, 21862, 27396) }, + { AOM_CDF4(5915, 15772, 22148) }, + { AOM_CDF4(2786, 7779, 12047) }, + { AOM_CDF4(29246, 32450, 32663) }, + { AOM_CDF4(18696, 29929, 31818) }, + { AOM_CDF4(10510, 23369, 28560) }, + { AOM_CDF4(6229, 16499, 23125) }, + { AOM_CDF4(2608, 7448, 11705) }, + { AOM_CDF4(30753, 32710, 32748) }, + { AOM_CDF4(21638, 31487, 32503) }, + { AOM_CDF4(12937, 26854, 30870) }, + { AOM_CDF4(8182, 20596, 26970) }, + { AOM_CDF4(3637, 10269, 15497) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(5244, 12150, 16906) }, + { AOM_CDF4(20486, 26858, 29701) }, + { AOM_CDF4(7756, 18317, 23735) }, + { AOM_CDF4(3452, 9256, 13146) }, + { AOM_CDF4(2020, 5206, 8229) }, + { AOM_CDF4(1801, 4993, 7903) }, + { AOM_CDF4(27051, 31858, 32531) }, + { AOM_CDF4(15988, 27531, 30619) }, + { AOM_CDF4(9188, 21484, 26719) }, + { AOM_CDF4(6273, 17186, 23800) }, + { AOM_CDF4(3108, 9355, 14764) }, + { AOM_CDF4(31076, 32520, 32680) }, + { AOM_CDF4(18119, 30037, 31850) }, + { AOM_CDF4(10244, 22969, 27472) }, + { AOM_CDF4(4692, 14077, 19273) }, + { AOM_CDF4(3694, 11677, 17556) }, + { AOM_CDF4(30060, 32581, 32720) }, + { AOM_CDF4(21011, 30775, 32120) }, + { AOM_CDF4(11931, 24820, 29289) }, + { 
AOM_CDF4(7119, 17662, 24356) }, + { AOM_CDF4(3833, 10706, 16304) }, + { AOM_CDF4(31954, 32731, 32748) }, + { AOM_CDF4(23913, 31724, 32489) }, + { AOM_CDF4(15520, 28060, 31286) }, + { AOM_CDF4(11517, 23008, 28571) }, + { AOM_CDF4(6193, 14508, 20629) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(1035, 2807, 4156) }, + { AOM_CDF4(13162, 18138, 20939) }, + { AOM_CDF4(2696, 6633, 8755) }, + { AOM_CDF4(1373, 4161, 6853) }, + { AOM_CDF4(1099, 2746, 4716) }, + { AOM_CDF4(340, 1021, 1599) }, + { AOM_CDF4(22826, 30419, 32135) }, + { AOM_CDF4(10395, 21762, 26942) }, + { AOM_CDF4(4726, 12407, 17361) }, + { AOM_CDF4(2447, 7080, 10593) }, + { AOM_CDF4(1227, 3717, 6011) }, + { AOM_CDF4(28156, 31424, 31934) }, + { AOM_CDF4(16915, 27754, 30373) }, + { AOM_CDF4(9148, 20990, 26431) }, + { AOM_CDF4(5950, 15515, 21148) }, + { AOM_CDF4(2492, 7327, 11526) }, + { AOM_CDF4(30602, 32477, 32670) }, + { AOM_CDF4(20026, 29955, 31568) }, + { AOM_CDF4(11220, 23628, 28105) }, + { AOM_CDF4(6652, 17019, 22973) }, + { AOM_CDF4(3064, 8536, 13043) }, + { AOM_CDF4(31769, 32724, 32748) }, + { AOM_CDF4(22230, 30887, 32373) }, + { AOM_CDF4(12234, 25079, 29731) }, + { AOM_CDF4(7326, 18816, 25353) }, + { AOM_CDF4(3933, 10907, 16616) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 
16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(8896, 16227, 20630) }, + { AOM_CDF4(23629, 31782, 32527) }, + { AOM_CDF4(15173, 27755, 31321) }, + { AOM_CDF4(10158, 21233, 27382) }, + { AOM_CDF4(6420, 14857, 21558) }, + { AOM_CDF4(3269, 8155, 12646) }, + { AOM_CDF4(24835, 32009, 32496) }, + { AOM_CDF4(16509, 28421, 31579) }, + { AOM_CDF4(10957, 21514, 27418) }, + { AOM_CDF4(7881, 15930, 22096) }, + { AOM_CDF4(5388, 10960, 15918) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(20745, 30773, 32093) }, + { AOM_CDF4(15200, 27221, 30861) }, + { AOM_CDF4(13032, 20873, 25667) }, + { AOM_CDF4(12285, 18663, 23494) }, + { AOM_CDF4(11563, 17481, 21489) }, + { AOM_CDF4(26260, 31982, 32320) }, + { AOM_CDF4(15397, 28083, 31100) }, + { AOM_CDF4(9742, 19217, 24824) }, + { AOM_CDF4(3261, 9629, 15362) }, + { AOM_CDF4(1480, 4322, 7499) }, + { AOM_CDF4(27599, 32256, 32460) }, + { AOM_CDF4(16857, 27659, 30774) }, + { AOM_CDF4(9551, 18290, 23748) }, + { AOM_CDF4(3052, 8933, 14103) }, + { AOM_CDF4(2021, 5910, 9787) }, + { AOM_CDF4(29005, 32015, 32392) }, + { AOM_CDF4(17677, 27694, 30863) }, + { AOM_CDF4(9204, 17356, 23219) }, + { AOM_CDF4(2403, 7516, 12814) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(10808, 22056, 26896) }, + { AOM_CDF4(25739, 32313, 32676) }, + { AOM_CDF4(17288, 30203, 32221) }, + { AOM_CDF4(11359, 24878, 29896) }, + { AOM_CDF4(6949, 17767, 24893) }, + { AOM_CDF4(4287, 11796, 18071) }, + { AOM_CDF4(27880, 32521, 32705) }, + { AOM_CDF4(19038, 31004, 32414) }, + { AOM_CDF4(12564, 26345, 30768) }, + { AOM_CDF4(8269, 19947, 26779) }, + { AOM_CDF4(5674, 14657, 21674) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(25742, 32319, 32671) }, + { AOM_CDF4(19557, 31164, 32454) }, + { AOM_CDF4(13381, 26381, 30755) }, + { AOM_CDF4(10101, 21466, 26722) }, + { AOM_CDF4(9209, 19650, 26825) }, + { AOM_CDF4(27107, 31917, 32432) }, + { AOM_CDF4(18056, 28893, 31203) }, + { AOM_CDF4(10200, 21434, 26764) }, + { AOM_CDF4(4660, 12913, 19502) }, + { AOM_CDF4(2368, 6930, 12504) }, + { AOM_CDF4(26960, 32158, 32613) }, + { AOM_CDF4(18628, 30005, 32031) }, + { AOM_CDF4(10233, 22442, 28232) }, + { AOM_CDF4(5471, 14630, 21516) }, + { AOM_CDF4(3235, 10767, 17109) }, + { AOM_CDF4(27696, 32440, 32692) }, + { AOM_CDF4(20032, 31167, 32438) }, + { AOM_CDF4(8700, 21341, 28442) }, + { AOM_CDF4(5662, 14831, 21795) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(9704, 17294, 21132) }, + { AOM_CDF4(26762, 32278, 32633) }, + { AOM_CDF4(18382, 29620, 31819) }, + { AOM_CDF4(10891, 23475, 28723) }, + { AOM_CDF4(6358, 16583, 
23309) }, + { AOM_CDF4(3248, 9118, 14141) }, + { AOM_CDF4(27204, 32573, 32699) }, + { AOM_CDF4(19818, 30824, 32329) }, + { AOM_CDF4(11772, 25120, 30041) }, + { AOM_CDF4(6995, 18033, 25039) }, + { AOM_CDF4(3752, 10442, 16098) }, + { AOM_CDF4(27222, 32256, 32559) }, + { AOM_CDF4(15356, 28399, 31475) }, + { AOM_CDF4(8821, 20635, 27057) }, + { AOM_CDF4(5511, 14404, 21239) }, + { AOM_CDF4(2935, 8222, 13051) }, + { AOM_CDF4(24875, 32120, 32529) }, + { AOM_CDF4(15233, 28265, 31445) }, + { AOM_CDF4(8605, 20570, 26932) }, + { AOM_CDF4(5431, 14413, 21196) }, + { AOM_CDF4(2994, 8341, 13223) }, + { AOM_CDF4(28201, 32604, 32700) }, + { AOM_CDF4(21041, 31446, 32456) }, + { AOM_CDF4(13221, 26213, 30475) }, + { AOM_CDF4(8255, 19385, 26037) }, + { AOM_CDF4(4930, 12585, 18830) }, + { AOM_CDF4(28768, 32448, 32627) }, + { AOM_CDF4(19705, 30561, 32021) }, + { AOM_CDF4(11572, 23589, 28220) }, + { AOM_CDF4(5532, 15034, 21446) }, + { AOM_CDF4(2460, 7150, 11456) }, + { AOM_CDF4(29874, 32619, 32699) }, + { AOM_CDF4(21621, 31071, 32201) }, + { AOM_CDF4(12511, 24747, 28992) }, + { AOM_CDF4(6281, 16395, 22748) }, + { AOM_CDF4(3246, 9278, 14497) }, + { AOM_CDF4(29715, 32625, 32712) }, + { AOM_CDF4(20958, 31011, 32283) }, + { AOM_CDF4(11233, 23671, 28806) }, + { AOM_CDF4(6012, 16128, 22868) }, + { AOM_CDF4(3427, 9851, 15414) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(11016, 22111, 26794) }, + { AOM_CDF4(25946, 32357, 32677) }, + { AOM_CDF4(17890, 30452, 32252) }, + { AOM_CDF4(11678, 25142, 29816) }, + { AOM_CDF4(6720, 17534, 24584) }, + { AOM_CDF4(4230, 11665, 17820) }, + { AOM_CDF4(28400, 32623, 32747) }, + { AOM_CDF4(21164, 31668, 32575) }, + { AOM_CDF4(13572, 27388, 31182) }, + { AOM_CDF4(8234, 20750, 27358) }, + { AOM_CDF4(5065, 14055, 20897) }, + { AOM_CDF4(28981, 32547, 32705) }, + { AOM_CDF4(18681, 30543, 32239) }, + { AOM_CDF4(10919, 24075, 29286) }, + { AOM_CDF4(6431, 17199, 24077) }, + { AOM_CDF4(3819, 10464, 16618) }, + { AOM_CDF4(26870, 32467, 32693) }, + { AOM_CDF4(19041, 30831, 32347) }, + { AOM_CDF4(11794, 25211, 30016) }, + { AOM_CDF4(6888, 18019, 24970) }, + { AOM_CDF4(4370, 12363, 18992) }, + { AOM_CDF4(29578, 32670, 32744) }, + { AOM_CDF4(23159, 32007, 32613) }, + { AOM_CDF4(15315, 28669, 31676) }, + { AOM_CDF4(9298, 22607, 28782) }, + { AOM_CDF4(6144, 15913, 22968) }, + { AOM_CDF4(28110, 32499, 32669) }, + { AOM_CDF4(21574, 30937, 32015) }, + { AOM_CDF4(12759, 24818, 28727) }, + { AOM_CDF4(6545, 16761, 23042) }, + { AOM_CDF4(3649, 10597, 16833) }, + { AOM_CDF4(28163, 32552, 32728) }, + { AOM_CDF4(22101, 31469, 32464) }, + { AOM_CDF4(13160, 25472, 30143) }, + { AOM_CDF4(7303, 18684, 25468) }, + { AOM_CDF4(5241, 13975, 20955) }, + { AOM_CDF4(28400, 32631, 32744) }, + { AOM_CDF4(22104, 31793, 32603) }, + { AOM_CDF4(13557, 26571, 30846) }, + { AOM_CDF4(7749, 19861, 26675) }, + { AOM_CDF4(4873, 14030, 21234) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(9800, 17635, 21073) }, + { AOM_CDF4(26153, 31885, 32527) }, + { AOM_CDF4(15038, 27852, 31006) }, + { AOM_CDF4(8718, 20564, 26486) }, + { AOM_CDF4(5128, 14076, 20514) }, + { AOM_CDF4(2636, 7566, 11925) }, + { AOM_CDF4(27551, 32504, 32701) }, + { AOM_CDF4(18310, 30054, 32100) }, + { AOM_CDF4(10211, 23420, 29082) }, + { AOM_CDF4(6222, 16876, 23916) }, + { AOM_CDF4(3462, 9954, 15498) }, + { AOM_CDF4(29991, 32633, 32721) }, + { AOM_CDF4(19883, 30751, 32201) }, + { AOM_CDF4(11141, 24184, 29285) }, + { AOM_CDF4(6420, 16940, 23774) }, + { AOM_CDF4(3392, 9753, 15118) }, + { AOM_CDF4(28465, 32616, 32712) }, + { AOM_CDF4(19850, 30702, 
32244) }, + { AOM_CDF4(10983, 24024, 29223) }, + { AOM_CDF4(6294, 16770, 23582) }, + { AOM_CDF4(3244, 9283, 14509) }, + { AOM_CDF4(30023, 32717, 32748) }, + { AOM_CDF4(22940, 32032, 32626) }, + { AOM_CDF4(14282, 27928, 31473) }, + { AOM_CDF4(8562, 21327, 27914) }, + { AOM_CDF4(4846, 13393, 19919) }, + { AOM_CDF4(29981, 32590, 32695) }, + { AOM_CDF4(20465, 30963, 32166) }, + { AOM_CDF4(11479, 23579, 28195) }, + { AOM_CDF4(5916, 15648, 22073) }, + { AOM_CDF4(3031, 8605, 13398) }, + { AOM_CDF4(31146, 32691, 32739) }, + { AOM_CDF4(23106, 31724, 32444) }, + { AOM_CDF4(13783, 26738, 30439) }, + { AOM_CDF4(7852, 19468, 25807) }, + { AOM_CDF4(3860, 11124, 16853) }, + { AOM_CDF4(31014, 32724, 32748) }, + { AOM_CDF4(23629, 32109, 32628) }, + { AOM_CDF4(14747, 28115, 31403) }, + { AOM_CDF4(8545, 21242, 27478) }, + { AOM_CDF4(4574, 12781, 19067) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(9185, 19694, 24688) }, + { AOM_CDF4(26081, 31985, 32621) }, + { AOM_CDF4(16015, 29000, 31787) }, + { AOM_CDF4(10542, 23690, 29206) }, + { AOM_CDF4(6732, 17945, 24677) }, + { AOM_CDF4(3916, 11039, 16722) }, + { AOM_CDF4(28224, 32566, 32744) }, + { AOM_CDF4(19100, 31138, 32485) }, + { AOM_CDF4(12528, 26620, 30879) }, + { AOM_CDF4(7741, 20277, 26885) }, + { AOM_CDF4(4566, 12845, 18990) }, + { AOM_CDF4(29933, 32593, 32718) }, + { AOM_CDF4(17670, 30333, 32155) }, + { AOM_CDF4(10385, 23600, 28909) }, + { AOM_CDF4(6243, 16236, 22407) }, + { AOM_CDF4(3976, 10389, 16017) }, + { AOM_CDF4(28377, 32561, 32738) }, + { AOM_CDF4(19366, 31175, 32482) }, + { AOM_CDF4(13327, 27175, 31094) }, + { AOM_CDF4(8258, 20769, 27143) }, + { AOM_CDF4(4703, 13198, 19527) }, + { AOM_CDF4(31086, 32706, 32748) }, + { AOM_CDF4(22853, 31902, 32583) }, + { AOM_CDF4(14759, 28186, 31419) }, + { AOM_CDF4(9284, 22382, 28348) }, + { AOM_CDF4(5585, 15192, 21868) }, + { AOM_CDF4(28291, 32652, 32746) }, + { AOM_CDF4(19849, 32107, 32571) }, + { AOM_CDF4(14834, 26818, 29214) }, + { AOM_CDF4(10306, 22594, 28672) }, + { AOM_CDF4(6615, 17384, 23384) }, + { AOM_CDF4(28947, 32604, 32745) }, + { AOM_CDF4(25625, 32289, 32646) }, + { AOM_CDF4(18758, 28672, 31403) }, + { AOM_CDF4(10017, 23430, 28523) }, + { AOM_CDF4(6862, 15269, 22131) }, + { AOM_CDF4(23933, 32509, 32739) }, + { AOM_CDF4(19927, 31495, 32631) }, + { AOM_CDF4(11903, 26023, 30621) }, + { AOM_CDF4(7026, 20094, 27252) }, + { AOM_CDF4(5998, 18106, 24437) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4456, 11274, 15533) }, + { AOM_CDF4(21219, 29079, 31616) }, + { AOM_CDF4(11173, 23774, 28567) }, + { AOM_CDF4(7282, 18293, 24263) }, + { AOM_CDF4(4890, 13286, 19115) }, + { AOM_CDF4(1890, 5508, 8659) }, + { AOM_CDF4(26651, 32136, 32647) }, + { AOM_CDF4(14630, 28254, 31455) }, + { AOM_CDF4(8716, 21287, 27395) }, + { AOM_CDF4(5615, 15331, 22008) }, + { AOM_CDF4(2675, 7700, 12150) }, + { AOM_CDF4(29954, 32526, 32690) }, + { AOM_CDF4(16126, 28982, 31633) }, + { AOM_CDF4(9030, 21361, 27352) }, + { AOM_CDF4(5411, 14793, 21271) }, + { AOM_CDF4(2943, 8422, 13163) }, + { AOM_CDF4(29539, 32601, 32730) }, + { AOM_CDF4(18125, 30385, 32201) }, + { AOM_CDF4(10422, 24090, 29468) }, + { AOM_CDF4(6468, 17487, 24438) }, + { AOM_CDF4(2970, 8653, 13531) }, + { AOM_CDF4(30912, 32715, 32748) }, + { AOM_CDF4(20666, 31373, 32497) }, + { AOM_CDF4(12509, 26640, 30917) }, + { AOM_CDF4(8058, 20629, 27290) }, + { AOM_CDF4(4231, 12006, 18052) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) 
}, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(10202, 20633, 25484) }, + { AOM_CDF4(27336, 31445, 32352) }, + { AOM_CDF4(12420, 24384, 28552) }, + { AOM_CDF4(7648, 18115, 23856) }, + { AOM_CDF4(5662, 14341, 19902) }, + { AOM_CDF4(3611, 10328, 15390) }, + { AOM_CDF4(30945, 32616, 32736) }, + { AOM_CDF4(18682, 30505, 32253) }, + { AOM_CDF4(11513, 25336, 30203) }, + { AOM_CDF4(7449, 19452, 26148) }, + { AOM_CDF4(4482, 13051, 18886) }, + { AOM_CDF4(32022, 32690, 32747) }, + { AOM_CDF4(18578, 30501, 32146) }, + { AOM_CDF4(11249, 23368, 28631) }, + { AOM_CDF4(5645, 16958, 22158) }, + { AOM_CDF4(5009, 11444, 16637) }, + { AOM_CDF4(31357, 32710, 32748) }, + { AOM_CDF4(21552, 31494, 32504) }, + { AOM_CDF4(13891, 27677, 31340) }, + { AOM_CDF4(9051, 22098, 28172) }, + { AOM_CDF4(5190, 13377, 19486) }, + { AOM_CDF4(32364, 32740, 32748) }, + { AOM_CDF4(24839, 31907, 32551) }, + { AOM_CDF4(17160, 28779, 31696) }, + { AOM_CDF4(12452, 24137, 29602) }, + { AOM_CDF4(6165, 15389, 22477) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(2575, 7281, 11077) }, + { AOM_CDF4(14002, 20866, 25402) }, + { AOM_CDF4(6343, 15056, 19658) }, + { AOM_CDF4(4474, 11858, 17041) }, + { AOM_CDF4(2865, 8299, 12534) }, + { AOM_CDF4(1344, 3949, 6391) }, + { AOM_CDF4(24720, 31239, 32459) }, + { AOM_CDF4(12585, 25356, 29968) }, + { AOM_CDF4(7181, 18246, 24444) }, + { AOM_CDF4(5025, 13667, 19885) }, + { AOM_CDF4(2521, 7304, 11605) }, + { AOM_CDF4(29908, 32252, 32584) }, + { AOM_CDF4(17421, 29156, 31575) }, + { AOM_CDF4(9889, 22188, 27782) }, + { AOM_CDF4(5878, 15647, 22123) }, + { AOM_CDF4(2814, 8665, 13323) }, + { AOM_CDF4(30183, 32568, 32713) }, + { AOM_CDF4(18528, 30195, 32049) }, + { AOM_CDF4(10982, 24606, 29657) }, + { AOM_CDF4(6957, 18165, 25231) }, + { AOM_CDF4(3508, 10118, 15468) }, + { AOM_CDF4(31761, 32736, 32748) }, + { AOM_CDF4(21041, 31328, 32546) }, + { AOM_CDF4(12568, 26732, 31166) }, + { AOM_CDF4(8052, 20720, 27733) }, + { AOM_CDF4(4336, 12192, 18396) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 
16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(7062, 16472, 22319) }, + { AOM_CDF4(24538, 32261, 32674) }, + { AOM_CDF4(13675, 28041, 31779) }, + { AOM_CDF4(8590, 20674, 27631) }, + { AOM_CDF4(5685, 14675, 22013) }, + { AOM_CDF4(3655, 9898, 15731) }, + { AOM_CDF4(26493, 32418, 32658) }, + { AOM_CDF4(16376, 29342, 32090) }, + { AOM_CDF4(10594, 22649, 28970) }, + { AOM_CDF4(8176, 17170, 24303) }, + { AOM_CDF4(5605, 12694, 19139) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(23888, 31902, 32542) }, + { AOM_CDF4(18612, 29687, 31987) }, + { AOM_CDF4(16245, 24852, 29249) }, + { AOM_CDF4(15765, 22608, 27559) }, + { AOM_CDF4(19895, 24699, 27510) }, + { AOM_CDF4(28401, 32212, 32457) }, + { AOM_CDF4(15274, 27825, 30980) }, + { AOM_CDF4(9364, 18128, 24332) }, + { AOM_CDF4(2283, 8193, 15082) }, + { AOM_CDF4(1228, 3972, 7881) }, + { AOM_CDF4(29455, 32469, 32620) }, + { AOM_CDF4(17981, 28245, 31388) }, + { AOM_CDF4(10921, 20098, 26240) }, + { AOM_CDF4(3743, 11829, 18657) }, + { AOM_CDF4(2374, 9593, 15715) }, + { AOM_CDF4(31068, 32466, 32635) }, + { AOM_CDF4(20321, 29572, 31971) }, + { AOM_CDF4(10771, 20255, 27119) }, + { AOM_CDF4(2795, 10410, 17361) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(9320, 22102, 27840) }, + { AOM_CDF4(27057, 32464, 32724) }, + { AOM_CDF4(16331, 30268, 32309) }, + { AOM_CDF4(10319, 23935, 29720) }, + { AOM_CDF4(6189, 16448, 24106) }, + { AOM_CDF4(3589, 10884, 18808) }, + { AOM_CDF4(29026, 32624, 32748) }, + { AOM_CDF4(19226, 31507, 32587) }, + { AOM_CDF4(12692, 26921, 31203) }, + { AOM_CDF4(7049, 19532, 27635) }, + { AOM_CDF4(7727, 15669, 23252) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(28056, 32625, 32748) }, + { AOM_CDF4(22383, 32075, 32669) }, + { AOM_CDF4(15417, 27098, 31749) }, + { AOM_CDF4(18127, 26493, 27190) }, + { AOM_CDF4(5461, 16384, 21845) }, + { AOM_CDF4(27982, 32091, 32584) }, + { AOM_CDF4(19045, 29868, 31972) }, + { AOM_CDF4(10397, 22266, 27932) }, + { AOM_CDF4(5990, 13697, 21500) }, + { AOM_CDF4(1792, 6912, 15104) }, + { AOM_CDF4(28198, 32501, 32718) }, + { AOM_CDF4(21534, 31521, 32569) }, + { AOM_CDF4(11109, 25217, 30017) }, + { AOM_CDF4(5671, 15124, 26151) }, + { AOM_CDF4(4681, 14043, 18725) }, + { AOM_CDF4(28688, 32580, 32741) }, + { AOM_CDF4(22576, 32079, 32661) }, + { AOM_CDF4(10627, 22141, 28340) }, + { AOM_CDF4(9362, 14043, 28087) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7754, 16948, 22142) }, + { AOM_CDF4(25670, 32330, 32691) }, + { AOM_CDF4(15663, 29225, 31994) }, + { AOM_CDF4(9878, 23288, 29158) }, + { AOM_CDF4(6419, 17088, 24336) }, + { AOM_CDF4(3859, 11003, 17039) }, + { AOM_CDF4(27562, 32595, 32725) }, + { AOM_CDF4(17575, 30588, 32399) }, + { AOM_CDF4(10819, 24838, 30309) }, + { AOM_CDF4(7124, 18686, 25916) }, + { AOM_CDF4(4479, 12688, 19340) }, + { AOM_CDF4(28385, 32476, 32673) }, + { AOM_CDF4(15306, 29005, 31938) }, + { AOM_CDF4(8937, 21615, 28322) }, + { AOM_CDF4(5982, 15603, 22786) }, + { AOM_CDF4(3620, 10267, 16136) }, + { AOM_CDF4(27280, 32464, 32667) }, + { AOM_CDF4(15607, 29160, 32004) }, + { AOM_CDF4(9091, 22135, 28740) }, + { AOM_CDF4(6232, 16632, 24020) }, + { AOM_CDF4(4047, 11377, 17672) }, + { AOM_CDF4(29220, 32630, 32718) }, + { AOM_CDF4(19650, 31220, 32462) }, + { AOM_CDF4(13050, 26312, 30827) }, + { AOM_CDF4(9228, 20870, 27468) }, + { AOM_CDF4(6146, 15149, 21971) }, + { AOM_CDF4(30169, 32481, 32623) }, + { AOM_CDF4(17212, 29311, 31554) }, + { AOM_CDF4(9911, 21311, 26882) }, + { AOM_CDF4(4487, 13314, 20372) }, + { AOM_CDF4(2570, 7772, 12889) }, + { AOM_CDF4(30924, 32613, 32708) }, + { AOM_CDF4(19490, 30206, 32107) }, + { AOM_CDF4(11232, 23998, 29276) }, + { AOM_CDF4(6769, 17955, 25035) }, + { AOM_CDF4(4398, 12623, 19214) }, + { AOM_CDF4(30609, 32627, 32722) }, + { AOM_CDF4(19370, 30582, 32287) }, + { AOM_CDF4(10457, 23619, 29409) }, + { AOM_CDF4(6443, 17637, 24834) }, + { AOM_CDF4(4645, 13236, 20106) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8626, 20271, 26216) }, + { AOM_CDF4(26707, 32406, 32711) }, + { AOM_CDF4(16999, 30329, 32286) }, + { AOM_CDF4(11445, 25123, 30286) }, + { AOM_CDF4(6411, 18828, 25601) }, + { AOM_CDF4(6801, 12458, 20248) }, + { AOM_CDF4(29918, 32682, 32748) }, + { AOM_CDF4(20649, 31739, 32618) }, + { AOM_CDF4(12879, 27773, 31581) }, + { AOM_CDF4(7896, 21751, 28244) }, + { AOM_CDF4(5260, 14870, 23698) }, + { AOM_CDF4(29252, 32593, 32731) }, + { AOM_CDF4(17072, 30460, 32294) }, + { AOM_CDF4(10653, 24143, 29365) }, + { AOM_CDF4(6536, 17490, 23983) }, + { AOM_CDF4(4929, 13170, 20085) }, + { AOM_CDF4(28137, 32518, 32715) }, + { AOM_CDF4(18171, 30784, 32407) }, + { AOM_CDF4(11437, 25436, 30459) }, + { AOM_CDF4(7252, 18534, 26176) }, + { AOM_CDF4(4126, 13353, 20978) }, + { AOM_CDF4(31162, 32726, 32748) }, + { AOM_CDF4(23017, 32222, 32701) }, + { AOM_CDF4(15629, 29233, 32046) }, + { AOM_CDF4(9387, 22621, 29480) }, + { AOM_CDF4(6922, 17616, 25010) }, + { AOM_CDF4(28838, 32265, 32614) }, + { AOM_CDF4(19701, 30206, 31920) }, + { AOM_CDF4(11214, 22410, 27933) }, + { 
AOM_CDF4(5320, 14177, 23034) }, + { AOM_CDF4(5049, 12881, 17827) }, + { AOM_CDF4(27484, 32471, 32734) }, + { AOM_CDF4(21076, 31526, 32561) }, + { AOM_CDF4(12707, 26303, 31211) }, + { AOM_CDF4(8169, 21722, 28219) }, + { AOM_CDF4(6045, 19406, 27042) }, + { AOM_CDF4(27753, 32572, 32745) }, + { AOM_CDF4(20832, 31878, 32653) }, + { AOM_CDF4(13250, 27356, 31674) }, + { AOM_CDF4(7718, 21508, 29858) }, + { AOM_CDF4(7209, 18350, 25559) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7876, 16901, 21741) }, + { AOM_CDF4(24001, 31898, 32625) }, + { AOM_CDF4(14529, 27959, 31451) }, + { AOM_CDF4(8273, 20818, 27258) }, + { AOM_CDF4(5278, 14673, 21510) }, + { AOM_CDF4(2983, 8843, 14039) }, + { AOM_CDF4(28016, 32574, 32732) }, + { AOM_CDF4(17471, 30306, 32301) }, + { AOM_CDF4(10224, 24063, 29728) }, + { AOM_CDF4(6602, 17954, 25052) }, + { AOM_CDF4(4002, 11585, 17759) }, + { AOM_CDF4(30190, 32634, 32739) }, + { AOM_CDF4(17497, 30282, 32270) }, + { AOM_CDF4(10229, 23729, 29538) }, + { AOM_CDF4(6344, 17211, 24440) }, + { AOM_CDF4(3849, 11189, 17108) }, + { AOM_CDF4(28570, 32583, 32726) }, + { AOM_CDF4(17521, 30161, 32238) }, + { AOM_CDF4(10153, 23565, 29378) }, + { AOM_CDF4(6455, 17341, 24443) }, + { AOM_CDF4(3907, 11042, 17024) }, + { AOM_CDF4(30689, 32715, 32748) }, + { AOM_CDF4(21546, 31840, 32610) }, + { AOM_CDF4(13547, 27581, 31459) }, + { AOM_CDF4(8912, 21757, 28309) }, + { AOM_CDF4(5548, 15080, 22046) }, + { AOM_CDF4(30783, 32540, 32685) }, + { AOM_CDF4(17540, 29528, 31668) }, + { AOM_CDF4(10160, 21468, 26783) }, + { AOM_CDF4(4724, 13393, 20054) }, + { AOM_CDF4(2702, 8174, 13102) }, + { AOM_CDF4(31648, 32686, 32742) }, + { AOM_CDF4(20954, 31094, 32337) }, + { AOM_CDF4(12420, 25698, 30179) }, + { AOM_CDF4(7304, 19320, 26248) }, + { AOM_CDF4(4366, 12261, 18864) }, + { AOM_CDF4(31581, 32723, 32748) }, + { AOM_CDF4(21373, 31586, 32525) }, + { AOM_CDF4(12744, 26625, 30885) }, + { AOM_CDF4(7431, 20322, 26950) }, + { AOM_CDF4(4692, 13323, 20111) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(7833, 18369, 24095) }, + { AOM_CDF4(26650, 32273, 32702) }, + { AOM_CDF4(16371, 29961, 32191) }, + { AOM_CDF4(11055, 24082, 29629) }, + { AOM_CDF4(6892, 18644, 25400) }, + { AOM_CDF4(5006, 13057, 19240) }, + { AOM_CDF4(29834, 32666, 32748) }, + { AOM_CDF4(19577, 31335, 32570) }, + { AOM_CDF4(12253, 26509, 31122) }, + { AOM_CDF4(7991, 20772, 27711) }, + { AOM_CDF4(5677, 15910, 23059) }, + { AOM_CDF4(30109, 32532, 32720) }, + { AOM_CDF4(16747, 30166, 32252) }, + { AOM_CDF4(10134, 23542, 29184) }, + { AOM_CDF4(5791, 16176, 23556) }, + { AOM_CDF4(4362, 10414, 17284) }, + { AOM_CDF4(29492, 32626, 32748) }, + { AOM_CDF4(19894, 31402, 32525) }, + { AOM_CDF4(12942, 27071, 30869) }, + { AOM_CDF4(8346, 21216, 27405) }, + { AOM_CDF4(6572, 17087, 23859) }, + { AOM_CDF4(32035, 32735, 32748) }, + { AOM_CDF4(22957, 31838, 32618) }, + { AOM_CDF4(14724, 28572, 31772) }, + { AOM_CDF4(10364, 23999, 29553) }, + { AOM_CDF4(7004, 18433, 25655) }, + { AOM_CDF4(27528, 32277, 32681) }, + { AOM_CDF4(16959, 31171, 32096) }, + { AOM_CDF4(10486, 23593, 27962) }, + { AOM_CDF4(8192, 16384, 23211) }, + { AOM_CDF4(8937, 17873, 20852) }, + { AOM_CDF4(27715, 32002, 32615) }, + { AOM_CDF4(15073, 29491, 31676) }, + { AOM_CDF4(11264, 24576, 28672) }, + { AOM_CDF4(2341, 18725, 23406) }, + { AOM_CDF4(7282, 18204, 25486) }, + { AOM_CDF4(28547, 32213, 32657) }, + { AOM_CDF4(20788, 29773, 32239) }, + { AOM_CDF4(6780, 21469, 30508) }, + { AOM_CDF4(5958, 14895, 23831) }, + { AOM_CDF4(16384, 21845, 27307) }, + { AOM_CDF4(8192, 16384, 24576) } } 
}, + { { { AOM_CDF4(5992, 14304, 19765) }, + { AOM_CDF4(22612, 31238, 32456) }, + { AOM_CDF4(13456, 27162, 31087) }, + { AOM_CDF4(8001, 20062, 26504) }, + { AOM_CDF4(5168, 14105, 20764) }, + { AOM_CDF4(2632, 7771, 12385) }, + { AOM_CDF4(27034, 32344, 32709) }, + { AOM_CDF4(15850, 29415, 31997) }, + { AOM_CDF4(9494, 22776, 28841) }, + { AOM_CDF4(6151, 16830, 23969) }, + { AOM_CDF4(3461, 10039, 15722) }, + { AOM_CDF4(30134, 32569, 32731) }, + { AOM_CDF4(15638, 29422, 31945) }, + { AOM_CDF4(9150, 21865, 28218) }, + { AOM_CDF4(5647, 15719, 22676) }, + { AOM_CDF4(3402, 9772, 15477) }, + { AOM_CDF4(28530, 32586, 32735) }, + { AOM_CDF4(17139, 30298, 32292) }, + { AOM_CDF4(10200, 24039, 29685) }, + { AOM_CDF4(6419, 17674, 24786) }, + { AOM_CDF4(3544, 10225, 15824) }, + { AOM_CDF4(31333, 32726, 32748) }, + { AOM_CDF4(20618, 31487, 32544) }, + { AOM_CDF4(12901, 27217, 31232) }, + { AOM_CDF4(8624, 21734, 28171) }, + { AOM_CDF4(5104, 14191, 20748) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(11206, 21090, 26561) }, + { AOM_CDF4(28759, 32279, 32671) }, + { AOM_CDF4(14171, 27952, 31569) }, + { AOM_CDF4(9743, 22907, 29141) }, + { AOM_CDF4(6871, 17886, 24868) }, + { AOM_CDF4(4960, 13152, 19315) }, + { AOM_CDF4(31077, 32661, 32748) }, + { AOM_CDF4(19400, 31195, 32515) }, + { AOM_CDF4(12752, 26858, 31040) }, + { AOM_CDF4(8370, 22098, 28591) }, + { AOM_CDF4(5457, 15373, 22298) }, + { AOM_CDF4(31697, 32706, 32748) }, + { AOM_CDF4(17860, 30657, 32333) }, + { AOM_CDF4(12510, 24812, 29261) }, + { AOM_CDF4(6180, 19124, 24722) }, + { AOM_CDF4(5041, 13548, 17959) }, + { AOM_CDF4(31552, 32716, 32748) }, + { AOM_CDF4(21908, 31769, 32623) }, + { AOM_CDF4(14470, 28201, 31565) }, + { AOM_CDF4(9493, 22982, 28608) }, + { AOM_CDF4(6858, 17240, 24137) }, + { AOM_CDF4(32543, 32752, 32756) }, + { AOM_CDF4(24286, 32097, 32666) }, + { AOM_CDF4(15958, 29217, 32024) }, + { AOM_CDF4(10207, 24234, 29958) }, + { AOM_CDF4(6929, 18305, 25652) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4137, 10847, 15682) }, + { AOM_CDF4(17824, 27001, 30058) }, + { AOM_CDF4(10204, 22796, 28291) }, + { AOM_CDF4(6076, 15935, 22125) }, + { AOM_CDF4(3852, 10937, 16816) }, + { AOM_CDF4(2252, 6324, 10131) }, + { AOM_CDF4(25840, 32016, 32662) }, + { AOM_CDF4(15109, 28268, 31531) }, + { AOM_CDF4(9385, 22231, 28340) }, + { AOM_CDF4(6082, 16672, 23479) }, + { AOM_CDF4(3318, 9427, 14681) }, + { AOM_CDF4(30594, 32574, 32718) }, + { AOM_CDF4(16836, 29552, 31859) }, + { 
AOM_CDF4(9556, 22542, 28356) }, + { AOM_CDF4(6305, 16725, 23540) }, + { AOM_CDF4(3376, 9895, 15184) }, + { AOM_CDF4(29383, 32617, 32745) }, + { AOM_CDF4(18891, 30809, 32401) }, + { AOM_CDF4(11688, 25942, 30687) }, + { AOM_CDF4(7468, 19469, 26651) }, + { AOM_CDF4(3909, 11358, 17012) }, + { AOM_CDF4(31564, 32736, 32748) }, + { AOM_CDF4(20906, 31611, 32600) }, + { AOM_CDF4(13191, 27621, 31537) }, + { AOM_CDF4(8768, 22029, 28676) }, + { AOM_CDF4(5079, 14109, 20906) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } } }; + +static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs + [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB][CDF_SIZE( + NUM_BASE_LEVELS + 1)] = { { { { { AOM_CDF3(17837, 29055) }, + { AOM_CDF3(29600, 31446) }, + { AOM_CDF3(30844, 31878) }, + { AOM_CDF3(24926, 28948) } }, + { { AOM_CDF3(21365, 30026) }, + { AOM_CDF3(30512, 32423) }, + { AOM_CDF3(31658, 32621) }, + { AOM_CDF3(29630, 31881) } } }, + { { { AOM_CDF3(5717, 26477) }, + { AOM_CDF3(30491, 31703) }, + { AOM_CDF3(31550, 32158) }, + { AOM_CDF3(29648, 31491) } }, + { { AOM_CDF3(12608, 27820) }, + { AOM_CDF3(30680, 32225) }, + { AOM_CDF3(30809, 32335) }, + { AOM_CDF3(31299, 32423) } } }, + { { { AOM_CDF3(1786, 12612) }, + { AOM_CDF3(30663, 31625) }, + { AOM_CDF3(32339, 32468) }, + { AOM_CDF3(31148, 31833) } }, + { { AOM_CDF3(18857, 23865) }, + { AOM_CDF3(31428, 32428) }, + { AOM_CDF3(31744, 32373) }, + { AOM_CDF3(31775, 32526) } } }, + { { { AOM_CDF3(1787, 2532) }, + { AOM_CDF3(30832, 31662) 
}, + { AOM_CDF3(31824, 32682) }, + { AOM_CDF3(32133, 32569) } }, + { { AOM_CDF3(13751, 22235) }, + { AOM_CDF3(32089, 32409) }, + { AOM_CDF3(27084, 27920) }, + { AOM_CDF3(29291, 32594) } } }, + { { { AOM_CDF3(1725, 3449) }, + { AOM_CDF3(31102, 31935) }, + { AOM_CDF3(32457, 32613) }, + { AOM_CDF3(32412, 32649) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } }, + { { { { AOM_CDF3(17560, 29888) }, + { AOM_CDF3(29671, 31549) }, + { AOM_CDF3(31007, 32056) }, + { AOM_CDF3(27286, 30006) } }, + { { AOM_CDF3(26594, 31212) }, + { AOM_CDF3(31208, 32582) }, + { AOM_CDF3(31835, 32637) }, + { AOM_CDF3(30595, 32206) } } }, + { { { AOM_CDF3(15239, 29932) }, + { AOM_CDF3(31315, 32095) }, + { AOM_CDF3(32130, 32434) }, + { AOM_CDF3(30864, 31996) } }, + { { AOM_CDF3(26279, 30968) }, + { AOM_CDF3(31142, 32495) }, + { AOM_CDF3(31713, 32540) }, + { AOM_CDF3(31929, 32594) } } }, + { { { AOM_CDF3(2644, 25198) }, + { AOM_CDF3(32038, 32451) }, + { AOM_CDF3(32639, 32695) }, + { AOM_CDF3(32166, 32518) } }, + { { AOM_CDF3(17187, 27668) }, + { AOM_CDF3(31714, 32550) }, + { AOM_CDF3(32283, 32678) }, + { AOM_CDF3(31930, 32563) } } }, + { { { AOM_CDF3(1044, 2257) }, + { AOM_CDF3(30755, 31923) }, + { AOM_CDF3(32208, 32693) }, + { AOM_CDF3(32244, 32615) } }, + { { AOM_CDF3(21317, 26207) }, + { AOM_CDF3(29133, 30868) }, + { AOM_CDF3(29311, 31231) }, + { AOM_CDF3(29657, 31087) } } }, + { { { AOM_CDF3(478, 1834) }, + { AOM_CDF3(31005, 31987) }, + { AOM_CDF3(32317, 32724) }, + { AOM_CDF3(30865, 32648) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } }, + { { { { AOM_CDF3(20092, 30774) }, + { AOM_CDF3(30695, 32020) }, + { AOM_CDF3(31131, 32103) }, + { AOM_CDF3(28666, 30870) } }, + { { AOM_CDF3(27258, 31095) }, + { AOM_CDF3(31804, 32623) }, + { AOM_CDF3(31763, 32528) }, + { AOM_CDF3(31438, 32506) } } }, + { { { AOM_CDF3(18049, 30489) }, + { AOM_CDF3(31706, 32286) }, + { AOM_CDF3(32163, 32473) }, + { AOM_CDF3(31550, 32184) } }, + { { AOM_CDF3(27116, 30842) }, + { AOM_CDF3(31971, 32598) }, + { AOM_CDF3(32088, 32576) }, + { AOM_CDF3(32067, 32664) } } }, + { { { AOM_CDF3(12854, 29093) }, + { AOM_CDF3(32272, 32558) }, + { AOM_CDF3(32667, 32729) }, + { AOM_CDF3(32306, 32585) } }, + { { AOM_CDF3(25476, 30366) }, + { AOM_CDF3(32169, 32687) }, + { AOM_CDF3(32479, 32689) }, + { AOM_CDF3(31673, 32634) } } }, + { { { AOM_CDF3(2809, 19301) }, + { AOM_CDF3(32205, 32622) }, + { AOM_CDF3(32338, 32730) }, + { AOM_CDF3(31786, 32616) } }, + { { AOM_CDF3(22737, 29105) }, + { AOM_CDF3(30810, 32362) }, + { AOM_CDF3(30014, 32627) }, + { AOM_CDF3(30528, 32574) } } }, + { { { AOM_CDF3(935, 3382) }, + { AOM_CDF3(30789, 31909) }, + { AOM_CDF3(32466, 32756) }, + { AOM_CDF3(30860, 32513) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } }, + { { { { AOM_CDF3(22497, 31198) }, + { AOM_CDF3(31715, 32495) }, + { AOM_CDF3(31606, 32337) }, + { AOM_CDF3(30388, 31990) } }, + { { AOM_CDF3(27877, 31584) }, + { AOM_CDF3(32170, 32728) }, + { AOM_CDF3(32155, 32688) }, + { AOM_CDF3(32219, 32702) } } }, + { { { AOM_CDF3(21457, 31043) }, + { AOM_CDF3(31951, 32483) }, + { AOM_CDF3(32153, 32562) }, + { AOM_CDF3(31473, 32215) } }, + { { AOM_CDF3(27558, 31151) }, + { AOM_CDF3(32020, 32640) }, + { AOM_CDF3(32097, 32575) }, + { AOM_CDF3(32242, 32719) } } }, + { { { AOM_CDF3(19980, 30591) }, + { AOM_CDF3(32219, 32597) }, + { AOM_CDF3(32581, 
32706) }, + { AOM_CDF3(31803, 32287) } }, + { { AOM_CDF3(26473, 30507) }, + { AOM_CDF3(32431, 32723) }, + { AOM_CDF3(32196, 32611) }, + { AOM_CDF3(31588, 32528) } } }, + { { { AOM_CDF3(24647, 30463) }, + { AOM_CDF3(32412, 32695) }, + { AOM_CDF3(32468, 32720) }, + { AOM_CDF3(31269, 32523) } }, + { { AOM_CDF3(28482, 31505) }, + { AOM_CDF3(32152, 32701) }, + { AOM_CDF3(31732, 32598) }, + { AOM_CDF3(31767, 32712) } } }, + { { { AOM_CDF3(12358, 24977) }, + { AOM_CDF3(31331, 32385) }, + { AOM_CDF3(32634, 32756) }, + { AOM_CDF3(30411, 32548) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } } }; + +#endif // AOM_AV1_COMMON_TOKEN_CDFS_H_ diff --git a/libs/libaom/src/av1/common/txb_common.c b/libs/libaom/src/av1/common/txb_common.c new file mode 100644 index 000000000..4eef319cd --- /dev/null +++ b/libs/libaom/src/av1/common/txb_common.c @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/txb_common.h" + +const int8_t av1_coeff_band_4x4[16] = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 }; + +const int8_t av1_coeff_band_8x8[64] = { + 0, 1, 2, 2, 3, 3, 4, 4, 5, 6, 2, 2, 3, 3, 4, 4, + 7, 7, 8, 8, 9, 9, 10, 10, 7, 7, 8, 8, 9, 9, 10, 10, + 11, 11, 12, 12, 13, 13, 14, 14, 11, 11, 12, 12, 13, 13, 14, 14, + 15, 15, 16, 16, 17, 17, 18, 18, 15, 15, 16, 16, 17, 17, 18, 18, +}; + +const int8_t av1_coeff_band_16x16[256] = { + 0, 1, 4, 4, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 2, 3, 4, + 4, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 5, 5, 6, 6, 7, 7, + 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 5, 5, 6, 6, 7, 7, 7, 7, 8, + 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, + 13, 13, 13, 13, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, + 13, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 10, 10, + 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, + 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 14, 14, 14, 14, 15, 15, 15, 15, + 16, 16, 16, 16, 17, 17, 17, 17, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, + 16, 17, 17, 17, 17, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, + 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 18, + 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 18, 18, 18, 18, + 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 18, 18, 18, 18, 19, 19, 19, + 19, 20, 20, 20, 20, 21, 21, 21, 21, +}; + +const int8_t av1_coeff_band_32x32[1024] = { + 0, 1, 4, 4, 7, 7, 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, + 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 2, 3, 4, 4, 7, 7, + 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, + 12, 12, 12, 12, 12, 12, 12, 5, 5, 6, 6, 7, 7, 7, 7, 10, 10, 10, 10, + 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, + 12, 5, 5, 6, 6, 7, 7, 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, + 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 8, 8, 
8, 8, 9, + 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, + 12, 12, 12, 12, 12, 12, 12, 12, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, + 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, + 12, 12, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, + 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 8, 8, 8, 8, + 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, + 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, + 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, + 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, + 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, 13, + 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, + 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, + 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, + 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, + 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, + 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, + 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, + 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, + 16, 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, + 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 17, + 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, + 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, 17, + 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, + 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, + 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, + 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, + 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, + 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, + 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, + 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, + 20, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, + 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, + 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, + 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, + 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, + 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, 21, + 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, + 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, + 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, + 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, + 24, 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 
22, 22, 22, 22, + 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, +}; + +// The ctx offset table when TX is TX_CLASS_2D. +// TX col and row indices are clamped to 4 + +const int8_t av1_nz_map_ctx_offset_4x4[16] = { + 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_8x8[64] = { + 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21, + 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_16x16[256] = { + 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_32x32[1024] = { + 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_8x4[32] = { + 0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, 21, 21, + 16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_8x16[128] = { + 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, + 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_16x8[128] = { + 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 16, 16, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_16x32[512] = { + 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_32x16[512] = { + 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, + 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_32x64[1024] = { + 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_64x32[1024] = { + 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 
21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, + 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_4x16[64] = { + 0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_16x4[64] = { + 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_8x32[256] = { + 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, + 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_32x8[256] = { + 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t *av1_nz_map_ctx_offset[19] = { + av1_nz_map_ctx_offset_4x4, // TX_4x4 + av1_nz_map_ctx_offset_8x8, // TX_8x8 + av1_nz_map_ctx_offset_16x16, // TX_16x16 + av1_nz_map_ctx_offset_32x32, // TX_32x32 + av1_nz_map_ctx_offset_32x32, // TX_32x32 + av1_nz_map_ctx_offset_4x16, // TX_4x8 + av1_nz_map_ctx_offset_8x4, // TX_8x4 + av1_nz_map_ctx_offset_8x32, // TX_8x16 + av1_nz_map_ctx_offset_16x8, // TX_16x8 + av1_nz_map_ctx_offset_16x32, // TX_16x32 + av1_nz_map_ctx_offset_32x16, // TX_32x16 + av1_nz_map_ctx_offset_32x64, // TX_32x64 + av1_nz_map_ctx_offset_64x32, // TX_64x32 + av1_nz_map_ctx_offset_4x16, // TX_4x16 + av1_nz_map_ctx_offset_16x4, // TX_16x4 + av1_nz_map_ctx_offset_8x32, // TX_8x32 + av1_nz_map_ctx_offset_32x8, // TX_32x8 + av1_nz_map_ctx_offset_16x32, // TX_16x64 + av1_nz_map_ctx_offset_64x32, // TX_64x16 +}; + +const int16_t av1_eob_group_start[12] = { 0, 1, 2, 3, 5, 9, + 17, 33, 65, 129, 257, 513 }; +const int16_t av1_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; diff --git a/libs/libaom/src/av1/common/txb_common.h b/libs/libaom/src/av1/common/txb_common.h new file mode 100644 index 000000000..5a62fa89b --- /dev/null +++ b/libs/libaom/src/av1/common/txb_common.h @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_TXB_COMMON_H_ +#define AOM_AV1_COMMON_TXB_COMMON_H_ + +#include "av1/common/av1_common_int.h" + +extern const int16_t av1_eob_group_start[12]; +extern const int16_t av1_eob_offset_bits[12]; + +extern const int8_t av1_coeff_band_4x4[16]; + +extern const int8_t av1_coeff_band_8x8[64]; + +extern const int8_t av1_coeff_band_16x16[256]; + +extern const int8_t av1_coeff_band_32x32[1024]; + +extern const int8_t *av1_nz_map_ctx_offset[TX_SIZES_ALL]; + +typedef struct txb_ctx { + int txb_skip_ctx; + int dc_sign_ctx; +} TXB_CTX; + +static const int base_level_count_to_index[13] = { + 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, +}; + +static const TX_CLASS tx_type_to_class[TX_TYPES] = { + TX_CLASS_2D, // DCT_DCT + TX_CLASS_2D, // ADST_DCT + TX_CLASS_2D, // DCT_ADST + TX_CLASS_2D, // ADST_ADST + TX_CLASS_2D, // FLIPADST_DCT + TX_CLASS_2D, // DCT_FLIPADST + TX_CLASS_2D, // FLIPADST_FLIPADST + TX_CLASS_2D, // ADST_FLIPADST + TX_CLASS_2D, // FLIPADST_ADST + TX_CLASS_2D, // IDTX + TX_CLASS_VERT, // V_DCT + TX_CLASS_HORIZ, // H_DCT + TX_CLASS_VERT, // V_ADST + TX_CLASS_HORIZ, // H_ADST + TX_CLASS_VERT, // V_FLIPADST + TX_CLASS_HORIZ, // H_FLIPADST +}; + +static INLINE int get_txb_bwl(TX_SIZE tx_size) { + tx_size = av1_get_adjusted_tx_size(tx_size); + return tx_size_wide_log2[tx_size]; +} + +static INLINE int get_txb_wide(TX_SIZE tx_size) { + tx_size = av1_get_adjusted_tx_size(tx_size); + return tx_size_wide[tx_size]; +} + +static INLINE int get_txb_high(TX_SIZE tx_size) { + tx_size = av1_get_adjusted_tx_size(tx_size); + return tx_size_high[tx_size]; +} + +static INLINE uint8_t *set_levels(uint8_t *const levels_buf, const int width) { + return levels_buf + TX_PAD_TOP * (width + TX_PAD_HOR); +} + +static INLINE int get_padded_idx(const int idx, const int bwl) { + return idx + ((idx >> bwl) << TX_PAD_HOR_LOG2); +} + +static INLINE int get_base_ctx_from_count_mag(int row, int col, int count, + int sig_mag) { + const int ctx = base_level_count_to_index[count]; + int ctx_idx = -1; + + if (row == 0 && col == 0) { + if (sig_mag >= 2) return ctx_idx = 0; + if (sig_mag == 1) { + if (count >= 2) + ctx_idx = 1; + else + ctx_idx = 2; + + return ctx_idx; + } + + ctx_idx = 3 + ctx; + assert(ctx_idx <= 6); + return ctx_idx; + } else if (row == 0) { + if (sig_mag >= 2) return ctx_idx = 6; + if (sig_mag == 1) { + if (count >= 2) + ctx_idx = 7; + else + ctx_idx = 8; + return ctx_idx; + } + + ctx_idx = 9 + ctx; + assert(ctx_idx <= 11); + return ctx_idx; + } else if (col == 0) { + if (sig_mag >= 2) return ctx_idx = 12; + if (sig_mag == 1) { + if (count >= 2) + ctx_idx = 13; + else + ctx_idx = 14; + + return ctx_idx; + } + + ctx_idx = 15 + ctx; + assert(ctx_idx <= 17); + // TODO(angiebird): turn this on once the optimization is finalized + // assert(ctx_idx < 28); + } else { + if (sig_mag >= 2) return ctx_idx = 18; + if (sig_mag == 1) { + if (count >= 2) + ctx_idx = 19; + else + ctx_idx = 20; + return ctx_idx; + } + + ctx_idx = 21 + ctx; + + assert(ctx_idx <= 24); + } + return ctx_idx; +} + +static INLINE int get_br_ctx_2d(const uint8_t *const levels, + const int c, // raster order + const int bwl) { + assert(c > 0); + const int row = c >> bwl; + const int col = c - (row << bwl); + const int stride = (1 << bwl) + TX_PAD_HOR; + const int pos = row * stride + col; + int mag = AOMMIN(levels[pos + 1], MAX_BASE_BR_RANGE) + + AOMMIN(levels[pos + stride], MAX_BASE_BR_RANGE) + + AOMMIN(levels[pos + 1 + stride], MAX_BASE_BR_RANGE); + mag = AOMMIN((mag + 1) >> 1, 6); + //((row | col) < 2) is equivalent to ((row 
< 2) && (col < 2)) + if ((row | col) < 2) return mag + 7; + return mag + 14; +} + +static AOM_FORCE_INLINE int get_br_ctx_eob(const int c, // raster order + const int bwl, + const TX_CLASS tx_class) { + const int row = c >> bwl; + const int col = c - (row << bwl); + if (c == 0) return 0; + if ((tx_class == TX_CLASS_2D && row < 2 && col < 2) || + (tx_class == TX_CLASS_HORIZ && col == 0) || + (tx_class == TX_CLASS_VERT && row == 0)) + return 7; + return 14; +} + +static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels, + const int c, // raster order + const int bwl, const TX_CLASS tx_class) { + const int row = c >> bwl; + const int col = c - (row << bwl); + const int stride = (1 << bwl) + TX_PAD_HOR; + const int pos = row * stride + col; + int mag = levels[pos + 1]; + mag += levels[pos + stride]; + switch (tx_class) { + case TX_CLASS_2D: + mag += levels[pos + stride + 1]; + mag = AOMMIN((mag + 1) >> 1, 6); + if (c == 0) return mag; + if ((row < 2) && (col < 2)) return mag + 7; + break; + case TX_CLASS_HORIZ: + mag += levels[pos + 2]; + mag = AOMMIN((mag + 1) >> 1, 6); + if (c == 0) return mag; + if (col == 0) return mag + 7; + break; + case TX_CLASS_VERT: + mag += levels[pos + (stride << 1)]; + mag = AOMMIN((mag + 1) >> 1, 6); + if (c == 0) return mag; + if (row == 0) return mag + 7; + break; + default: break; + } + + return mag + 14; +} + +static const uint8_t clip_max3[256] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +}; + +static AOM_FORCE_INLINE int get_nz_mag(const uint8_t *const levels, + const int bwl, const TX_CLASS tx_class) { + int mag; + + // Note: AOMMIN(level, 3) is useless for decoder since level < 3. 
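+ // Each class below sums five clamped neighbor levels; their { row, col } + // offsets from the current coefficient are: + // TX_CLASS_2D: { 0, 1 } { 1, 0 } { 1, 1 } { 0, 2 } { 2, 0 } + // TX_CLASS_VERT: { 0, 1 } { 1, 0 } { 2, 0 } { 3, 0 } { 4, 0 } + // TX_CLASS_HORIZ: { 0, 1 } { 1, 0 } { 0, 2 } { 0, 3 } { 0, 4 }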
+ mag = clip_max3[levels[1]]; // { 0, 1 } + mag += clip_max3[levels[(1 << bwl) + TX_PAD_HOR]]; // { 1, 0 } + + if (tx_class == TX_CLASS_2D) { + mag += clip_max3[levels[(1 << bwl) + TX_PAD_HOR + 1]]; // { 1, 1 } + mag += clip_max3[levels[2]]; // { 0, 2 } + mag += clip_max3[levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)]]; // { 2, 0 } + } else if (tx_class == TX_CLASS_VERT) { + mag += clip_max3[levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)]]; // { 2, 0 } + mag += clip_max3[levels[(3 << bwl) + (3 << TX_PAD_HOR_LOG2)]]; // { 3, 0 } + mag += clip_max3[levels[(4 << bwl) + (4 << TX_PAD_HOR_LOG2)]]; // { 4, 0 } + } else { + mag += clip_max3[levels[2]]; // { 0, 2 } + mag += clip_max3[levels[3]]; // { 0, 3 } + mag += clip_max3[levels[4]]; // { 0, 4 } + } + + return mag; +} + +#define NZ_MAP_CTX_0 SIG_COEF_CONTEXTS_2D +#define NZ_MAP_CTX_5 (NZ_MAP_CTX_0 + 5) +#define NZ_MAP_CTX_10 (NZ_MAP_CTX_0 + 10) + +static const int nz_map_ctx_offset_1d[32] = { + NZ_MAP_CTX_0, NZ_MAP_CTX_5, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, +}; + +static AOM_FORCE_INLINE int get_nz_map_ctx_from_stats( + const int stats, + const int coeff_idx, // raster order + const int bwl, const TX_SIZE tx_size, const TX_CLASS tx_class) { + // tx_class == 0(TX_CLASS_2D) + if ((tx_class | coeff_idx) == 0) return 0; + int ctx = (stats + 1) >> 1; + ctx = AOMMIN(ctx, 4); + switch (tx_class) { + case TX_CLASS_2D: { + // This is the algorithm to generate av1_nz_map_ctx_offset[][] + // const int width = tx_size_wide[tx_size]; + // const int height = tx_size_high[tx_size]; + // if (width < height) { + // if (row < 2) return 11 + ctx; + // } else if (width > height) { + // if (col < 2) return 16 + ctx; + // } + // if (row + col < 2) return ctx + 1; + // if (row + col < 4) return 5 + ctx + 1; + // return 21 + ctx; + return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx]; + } + case TX_CLASS_HORIZ: { + const int row = coeff_idx >> bwl; + const int col = coeff_idx - (row << bwl); + return ctx + nz_map_ctx_offset_1d[col]; + } + case TX_CLASS_VERT: { + const int row = coeff_idx >> bwl; + return ctx + nz_map_ctx_offset_1d[row]; + } + default: break; + } + return 0; +} + +typedef aom_cdf_prob (*base_cdf_arr)[CDF_SIZE(4)]; +typedef aom_cdf_prob (*br_cdf_arr)[CDF_SIZE(BR_CDF_SIZE)]; + +static INLINE int get_lower_levels_ctx_eob(int bwl, int height, int scan_idx) { + if (scan_idx == 0) return 0; + if (scan_idx <= (height << bwl) / 8) return 1; + if (scan_idx <= (height << bwl) / 4) return 2; + return 3; +} + +static INLINE int get_lower_levels_ctx_2d(const uint8_t *levels, int coeff_idx, + int bwl, TX_SIZE tx_size) { + assert(coeff_idx > 0); + int mag; + // Note: AOMMIN(level, 3) is useless for decoder since level < 3. 
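+ // The five taps below form the same 2D template as get_nz_mag() uses for + // TX_CLASS_2D, with the current coefficient at x: + // x o o + // o o . + // o . .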
+ levels = levels + get_padded_idx(coeff_idx, bwl); + mag = AOMMIN(levels[1], 3); // { 0, 1 } + mag += AOMMIN(levels[(1 << bwl) + TX_PAD_HOR], 3); // { 1, 0 } + mag += AOMMIN(levels[(1 << bwl) + TX_PAD_HOR + 1], 3); // { 1, 1 } + mag += AOMMIN(levels[2], 3); // { 0, 2 } + mag += AOMMIN(levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)], 3); // { 2, 0 } + + const int ctx = AOMMIN((mag + 1) >> 1, 4); + return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx]; +} +static AOM_FORCE_INLINE int get_lower_levels_ctx(const uint8_t *levels, + int coeff_idx, int bwl, + TX_SIZE tx_size, + TX_CLASS tx_class) { + const int stats = + get_nz_mag(levels + get_padded_idx(coeff_idx, bwl), bwl, tx_class); + return get_nz_map_ctx_from_stats(stats, coeff_idx, bwl, tx_size, tx_class); +} + +static INLINE int get_lower_levels_ctx_general(int is_last, int scan_idx, + int bwl, int height, + const uint8_t *levels, + int coeff_idx, TX_SIZE tx_size, + TX_CLASS tx_class) { + if (is_last) { + if (scan_idx == 0) return 0; + if (scan_idx <= (height << bwl) >> 3) return 1; + if (scan_idx <= (height << bwl) >> 2) return 2; + return 3; + } + return get_lower_levels_ctx(levels, coeff_idx, bwl, tx_size, tx_class); +} + +static INLINE void set_dc_sign(int *cul_level, int dc_val) { + if (dc_val < 0) + *cul_level |= 1 << COEFF_CONTEXT_BITS; + else if (dc_val > 0) + *cul_level += 2 << COEFF_CONTEXT_BITS; +} + +static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, + const TX_SIZE tx_size, const int plane, + const ENTROPY_CONTEXT *const a, + const ENTROPY_CONTEXT *const l, + TXB_CTX *const txb_ctx) { +#define MAX_TX_SIZE_UNIT 16 + static const int8_t signs[3] = { 0, -1, 1 }; + static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + }; + const int txb_w_unit = tx_size_wide_unit[tx_size]; + const int txb_h_unit = tx_size_high_unit[tx_size]; + int dc_sign = 0; + int k = 0; + + do { + const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS; + assert(sign <= 2); + dc_sign += signs[sign]; + } while (++k < txb_w_unit); + + k = 0; + do { + const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS; + assert(sign <= 2); + dc_sign += signs[sign]; + } while (++k < txb_h_unit); + + txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT]; + + if (plane == 0) { + if (plane_bsize == txsize_to_bsize[tx_size]) { + txb_ctx->txb_skip_ctx = 0; + } else { + // This is the algorithm to generate table skip_contexts[top][left]. + // const int max = AOMMIN(top | left, 4); + // const int min = AOMMIN(AOMMIN(top, left), 4); + // if (!max) + // txb_skip_ctx = 1; + // else if (!min) + // txb_skip_ctx = 2 + (max > 3); + // else if (max <= 3) + // txb_skip_ctx = 4; + // else if (min <= 3) + // txb_skip_ctx = 5; + // else + // txb_skip_ctx = 6; + static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 }, + { 2, 4, 4, 4, 5 }, + { 2, 4, 4, 4, 5 }, + { 2, 4, 4, 4, 5 }, + { 3, 5, 5, 5, 6 } }; + // For top and left, we only care about which of the following three + // categories they belong to: { 0 }, { 1, 2, 3 }, or { 4, 5, ... }. The + // spec calculates top and left with the Max() function. We can calculate + // an approximate max with bitwise OR because the real max and the + // approximate max belong to the same category. 
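+      // For example, with per-unit values 1 and 2 the true max is 2 while
+      // the bitwise OR gives 3; both land in the { 1, 2, 3 } category, and
+      // rows 1..3 of skip_contexts[][] are identical, so the lookup result
+      // is unchanged.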
+      int top = 0;
+      int left = 0;
+
+      k = 0;
+      do {
+        top |= a[k];
+      } while (++k < txb_w_unit);
+      top &= COEFF_CONTEXT_MASK;
+      top = AOMMIN(top, 4);
+
+      k = 0;
+      do {
+        left |= l[k];
+      } while (++k < txb_h_unit);
+      left &= COEFF_CONTEXT_MASK;
+      left = AOMMIN(left, 4);
+
+      txb_ctx->txb_skip_ctx = skip_contexts[top][left];
+    }
+  } else {
+    const int ctx_base = get_entropy_context(tx_size, a, l);
+    const int ctx_offset = (num_pels_log2_lookup[plane_bsize] >
+                            num_pels_log2_lookup[txsize_to_bsize[tx_size]])
+                               ? 10
+                               : 7;
+    txb_ctx->txb_skip_ctx = ctx_base + ctx_offset;
+  }
+#undef MAX_TX_SIZE_UNIT
+}
+
+#endif  // AOM_AV1_COMMON_TXB_COMMON_H_
diff --git a/libs/libaom/src/av1/common/warped_motion.c b/libs/libaom/src/av1/common/warped_motion.c
new file mode 100644
index 000000000..4e9fab9bd
--- /dev/null
+++ b/libs/libaom/src/av1/common/warped_motion.c
@@ -0,0 +1,1073 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
+// at a time. The zoom/rotation/shear in the model are applied to the
+// "fractional" position of each pixel, which therefore varies within
+// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
+// We need an extra 2 taps to fit this in, for a total of 8 taps.
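+// Each row of av1_warped_filter below holds the 8 taps for one fractional
+// position. av1_warp_affine_c looks up a row with
+//   offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+//          WARPEDPIXEL_PREC_SHIFTS;
+//   const int16_t *coeffs = av1_warped_filter[offs];
+// so row 0 corresponds to a fractional position of -1 pixel, row
+// WARPEDPIXEL_PREC_SHIFTS to 0, and row 2 * WARPEDPIXEL_PREC_SHIFTS to +1,
+// covering the [-1, 2) range described above (plus one dummy row).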
+/* clang-format off */ +const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = { +#if WARPEDPIXEL_PREC_BITS == 6 + // [-1, 0) + { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 }, + { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 }, + { 1, - 5, 126, 8, - 3, 1, 0, 0 }, { 1, - 6, 125, 11, - 4, 1, 0, 0 }, + { 1, - 7, 124, 13, - 4, 1, 0, 0 }, { 2, - 8, 123, 15, - 5, 1, 0, 0 }, + { 2, - 9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, - 6, 1, 0, 0 }, + { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, - 8, 2, 0, 0 }, + { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, - 9, 2, 0, 0 }, + { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 }, + { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 }, + { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 }, + { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 }, + { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 }, + { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 }, + { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 }, + { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 }, + { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 }, + { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 }, + { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 }, + { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 }, + { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 }, + { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 }, + { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 }, + { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 }, + { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 }, + { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 }, + { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 }, + { 2, - 8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 }, + { 2, - 7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 }, + { 1, - 6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 }, + { 1, - 4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 }, + { 1, - 3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 }, + { 0, - 1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 }, + + // [0, 1) + { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 0, -1, 127, 2, 0, 0, 0}, + { 0, 1, -3, 127, 4, -2, 1, 0}, { 0, 1, -5, 127, 6, -2, 1, 0}, + { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 2, -7, 126, 11, -4, 2, -1}, + {-1, 3, -8, 125, 13, -5, 2, -1}, {-1, 3, -10, 124, 16, -6, 3, -1}, + {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -12, 122, 20, -7, 3, -1}, + {-1, 4, -13, 121, 23, -8, 3, -1}, {-2, 5, -14, 120, 25, -9, 4, -1}, + {-1, 5, -15, 119, 27, -10, 4, -1}, {-1, 5, -16, 118, 30, -11, 4, -1}, + {-2, 6, -17, 116, 33, -12, 5, -1}, {-2, 6, -17, 114, 35, -12, 5, -1}, + {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 111, 41, -14, 6, -2}, + {-2, 7, -19, 110, 43, -15, 6, -2}, {-2, 7, -20, 108, 46, -15, 6, -2}, + {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 104, 51, -16, 7, -2}, + {-2, 7, -21, 102, 54, -17, 7, -2}, {-2, 8, -21, 100, 56, -18, 7, -2}, + {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 96, 62, -19, 7, -2}, + {-2, 8, -22, 94, 64, -19, 7, -2}, {-2, 8, -22, 91, 67, -20, 8, -2}, + {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -22, 87, 72, -21, 8, -2}, + {-2, 8, -21, 84, 74, -21, 8, -2}, {-2, 8, -22, 82, 77, -21, 8, -2}, + {-2, 8, -21, 79, 
79, -21, 8, -2}, {-2, 8, -21, 77, 82, -22, 8, -2}, + {-2, 8, -21, 74, 84, -21, 8, -2}, {-2, 8, -21, 72, 87, -22, 8, -2}, + {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 8, -20, 67, 91, -22, 8, -2}, + {-2, 7, -19, 64, 94, -22, 8, -2}, {-2, 7, -19, 62, 96, -22, 8, -2}, + {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -18, 56, 100, -21, 8, -2}, + {-2, 7, -17, 54, 102, -21, 7, -2}, {-2, 7, -16, 51, 104, -21, 7, -2}, + {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 46, 108, -20, 7, -2}, + {-2, 6, -15, 43, 110, -19, 7, -2}, {-2, 6, -14, 41, 111, -19, 7, -2}, + {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 35, 114, -17, 6, -2}, + {-1, 5, -12, 33, 116, -17, 6, -2}, {-1, 4, -11, 30, 118, -16, 5, -1}, + {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 4, -9, 25, 120, -14, 5, -2}, + {-1, 3, -8, 23, 121, -13, 4, -1}, {-1, 3, -7, 20, 122, -12, 4, -1}, + {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 3, -6, 16, 124, -10, 3, -1}, + {-1, 2, -5, 13, 125, -8, 3, -1}, {-1, 2, -4, 11, 126, -7, 2, -1}, + { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 6, 127, -5, 1, 0}, + { 0, 1, -2, 4, 127, -3, 1, 0}, { 0, 0, 0, 2, 127, -1, 0, 0}, + + // [1, 2) + { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, - 1, 127, 2, 0, 0 }, + { 0, 0, 1, - 3, 127, 4, - 1, 0 }, { 0, 0, 1, - 4, 126, 6, - 2, 1 }, + { 0, 0, 1, - 5, 126, 8, - 3, 1 }, { 0, 0, 1, - 6, 125, 11, - 4, 1 }, + { 0, 0, 1, - 7, 124, 13, - 4, 1 }, { 0, 0, 2, - 8, 123, 15, - 5, 1 }, + { 0, 0, 2, - 9, 122, 18, - 6, 1 }, { 0, 0, 2, -10, 121, 20, - 6, 1 }, + { 0, 0, 2, -11, 120, 22, - 7, 2 }, { 0, 0, 2, -12, 119, 25, - 8, 2 }, + { 0, 0, 3, -13, 117, 27, - 8, 2 }, { 0, 0, 3, -13, 116, 29, - 9, 2 }, + { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 }, + { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 }, + { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 }, + { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 }, + { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 }, + { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 }, + { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 }, + { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 }, + { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 }, + { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 }, + { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 }, + { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 }, + { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 }, + { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 }, + { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 }, + { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 }, + { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 }, + { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 }, + { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, - 9, 29, 116, -13, 3 }, + { 0, 0, 2, - 8, 27, 117, -13, 3 }, { 0, 0, 2, - 8, 25, 119, -12, 2 }, + { 0, 0, 2, - 7, 22, 120, -11, 2 }, { 0, 0, 1, - 6, 20, 121, -10, 2 }, + { 0, 0, 1, - 6, 18, 122, - 9, 2 }, { 0, 0, 1, - 5, 15, 123, - 8, 2 }, + { 0, 0, 1, - 4, 13, 124, - 7, 1 }, { 0, 0, 1, - 4, 11, 125, - 6, 1 }, + { 0, 0, 1, - 3, 8, 126, - 5, 1 }, { 0, 0, 1, - 2, 6, 126, - 4, 1 }, + { 0, 0, 0, - 1, 4, 127, - 3, 1 }, { 0, 0, 0, 0, 2, 127, - 1, 0 }, + // dummy (replicate row index 191) + { 0, 0, 0, 0, 2, 127, - 1, 0 }, + +#elif WARPEDPIXEL_PREC_BITS == 5 + // [-1, 0) + {0, 0, 127, 1, 0, 0, 0, 0}, {1, -3, 127, 4, -1, 0, 
0, 0}, + {1, -5, 126, 8, -3, 1, 0, 0}, {1, -7, 124, 13, -4, 1, 0, 0}, + {2, -9, 122, 18, -6, 1, 0, 0}, {2, -11, 120, 22, -7, 2, 0, 0}, + {3, -13, 117, 27, -8, 2, 0, 0}, {3, -14, 114, 32, -10, 3, 0, 0}, + {3, -15, 111, 37, -11, 3, 0, 0}, {3, -16, 108, 42, -12, 3, 0, 0}, + {4, -17, 104, 47, -13, 3, 0, 0}, {4, -17, 100, 52, -14, 3, 0, 0}, + {4, -18, 96, 58, -15, 3, 0, 0}, {4, -18, 91, 63, -16, 4, 0, 0}, + {4, -18, 87, 68, -17, 4, 0, 0}, {4, -18, 82, 73, -17, 4, 0, 0}, + {4, -18, 78, 78, -18, 4, 0, 0}, {4, -17, 73, 82, -18, 4, 0, 0}, + {4, -17, 68, 87, -18, 4, 0, 0}, {4, -16, 63, 91, -18, 4, 0, 0}, + {3, -15, 58, 96, -18, 4, 0, 0}, {3, -14, 52, 100, -17, 4, 0, 0}, + {3, -13, 47, 104, -17, 4, 0, 0}, {3, -12, 42, 108, -16, 3, 0, 0}, + {3, -11, 37, 111, -15, 3, 0, 0}, {3, -10, 32, 114, -14, 3, 0, 0}, + {2, -8, 27, 117, -13, 3, 0, 0}, {2, -7, 22, 120, -11, 2, 0, 0}, + {1, -6, 18, 122, -9, 2, 0, 0}, {1, -4, 13, 124, -7, 1, 0, 0}, + {1, -3, 8, 126, -5, 1, 0, 0}, {0, -1, 4, 127, -3, 1, 0, 0}, + // [0, 1) + { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 1, -3, 127, 4, -2, 1, 0}, + { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 3, -8, 125, 13, -5, 2, -1}, + {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -13, 121, 23, -8, 3, -1}, + {-1, 5, -15, 119, 27, -10, 4, -1}, {-2, 6, -17, 116, 33, -12, 5, -1}, + {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 110, 43, -15, 6, -2}, + {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 102, 54, -17, 7, -2}, + {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 94, 64, -19, 7, -2}, + {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -21, 84, 74, -21, 8, -2}, + {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 74, 84, -21, 8, -2}, + {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 7, -19, 64, 94, -22, 8, -2}, + {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -17, 54, 102, -21, 7, -2}, + {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 43, 110, -19, 7, -2}, + {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 33, 116, -17, 6, -2}, + {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 3, -8, 23, 121, -13, 4, -1}, + {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 2, -5, 13, 125, -8, 3, -1}, + { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 4, 127, -3, 1, 0}, + // [1, 2) + {0, 0, 0, 1, 127, 0, 0, 0}, {0, 0, 1, -3, 127, 4, -1, 0}, + {0, 0, 1, -5, 126, 8, -3, 1}, {0, 0, 1, -7, 124, 13, -4, 1}, + {0, 0, 2, -9, 122, 18, -6, 1}, {0, 0, 2, -11, 120, 22, -7, 2}, + {0, 0, 3, -13, 117, 27, -8, 2}, {0, 0, 3, -14, 114, 32, -10, 3}, + {0, 0, 3, -15, 111, 37, -11, 3}, {0, 0, 3, -16, 108, 42, -12, 3}, + {0, 0, 4, -17, 104, 47, -13, 3}, {0, 0, 4, -17, 100, 52, -14, 3}, + {0, 0, 4, -18, 96, 58, -15, 3}, {0, 0, 4, -18, 91, 63, -16, 4}, + {0, 0, 4, -18, 87, 68, -17, 4}, {0, 0, 4, -18, 82, 73, -17, 4}, + {0, 0, 4, -18, 78, 78, -18, 4}, {0, 0, 4, -17, 73, 82, -18, 4}, + {0, 0, 4, -17, 68, 87, -18, 4}, {0, 0, 4, -16, 63, 91, -18, 4}, + {0, 0, 3, -15, 58, 96, -18, 4}, {0, 0, 3, -14, 52, 100, -17, 4}, + {0, 0, 3, -13, 47, 104, -17, 4}, {0, 0, 3, -12, 42, 108, -16, 3}, + {0, 0, 3, -11, 37, 111, -15, 3}, {0, 0, 3, -10, 32, 114, -14, 3}, + {0, 0, 2, -8, 27, 117, -13, 3}, {0, 0, 2, -7, 22, 120, -11, 2}, + {0, 0, 1, -6, 18, 122, -9, 2}, {0, 0, 1, -4, 13, 124, -7, 1}, + {0, 0, 1, -3, 8, 126, -5, 1}, {0, 0, 0, -1, 4, 127, -3, 1}, + // dummy (replicate row index 95) + {0, 0, 0, -1, 4, 127, -3, 1}, + +#endif // WARPEDPIXEL_PREC_BITS == 6 +}; + +/* clang-format on */ + +#define DIV_LUT_PREC_BITS 14 +#define DIV_LUT_BITS 8 +#define DIV_LUT_NUM (1 << DIV_LUT_BITS) + +static const uint16_t div_lut[DIV_LUT_NUM + 1] = { + 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768, + 
15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142, + 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564, + 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028, + 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530, + 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066, + 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633, + 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228, + 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848, + 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491, + 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155, + 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838, + 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538, + 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255, + 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986, + 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732, + 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489, + 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259, + 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039, + 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830, + 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630, + 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439, + 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257, + 8240, 8224, 8208, 8192, +}; + +// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned +// at precision of DIV_LUT_PREC_BITS along with the shift. +static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) { + int64_t f; + *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32 + : get_msb((unsigned int)D)); + // e is obtained from D after resetting the most significant 1 bit. + const int64_t e = D - ((uint64_t)1 << *shift); + // Get the most significant DIV_LUT_BITS (8) bits of e into f + if (*shift > DIV_LUT_BITS) + f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS); + else + f = e << (DIV_LUT_BITS - *shift); + assert(f <= DIV_LUT_NUM); + *shift += DIV_LUT_PREC_BITS; + // Use f as lookup into the precomputed table of multipliers + return div_lut[f]; +} + +static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) { + int32_t f; + *shift = get_msb(D); + // e is obtained from D after resetting the most significant 1 bit. 
+ const int32_t e = D - ((uint32_t)1 << *shift); + // Get the most significant DIV_LUT_BITS (8) bits of e into f + if (*shift > DIV_LUT_BITS) + f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS); + else + f = e << (DIV_LUT_BITS - *shift); + assert(f <= DIV_LUT_NUM); + *shift += DIV_LUT_PREC_BITS; + // Use f as lookup into the precomputed table of multipliers + return div_lut[f]; +} + +static int is_affine_valid(const WarpedMotionParams *const wm) { + const int32_t *mat = wm->wmmat; + return (mat[2] > 0); +} + +static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta) { + if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) || + (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS))) + return 0; + else + return 1; +} + +// Returns 1 on success or 0 on an invalid affine set +int av1_get_shear_params(WarpedMotionParams *wm) { + const int32_t *mat = wm->wmmat; + if (!is_affine_valid(wm)) return 0; + wm->alpha = + clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX); + wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX); + int16_t shift; + int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1); + int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y; + wm->gamma = + clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX); + v = ((int64_t)mat[3] * mat[4]) * y; + wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) - + (1 << WARPEDMODEL_PREC_BITS), + INT16_MIN, INT16_MAX); + + wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + + if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta)) + return 0; + + return 1; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE int highbd_error_measure(int err, int bd) { + const int b = bd - 8; + const int bmask = (1 << b) - 1; + const int v = (1 << b); + err = abs(err); + const int e1 = err >> b; + const int e2 = err & bmask; + return error_measure_lut[255 + e1] * (v - e2) + + error_measure_lut[256 + e1] * e2; +} + +/* Note: For an explanation of the warp algorithm, and some notes on bit widths + for hardware implementations, see the comments above av1_warp_affine_c +*/ +void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, uint16_t *pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, + int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + int32_t tmp[15 * 8]; + const int reduce_bits_horiz = + conv_params->round_0 + + AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0); + const int reduce_bits_vert = conv_params->is_compound + ? 
conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + (void)max_bits_horiz; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + for (int i = p_row; i < p_row + p_height; i += 8) { + for (int j = p_col; j < p_col + p_width; j += 8) { + // Calculate the center of this 8x8 block, + // project to luma coordinates (if in a subsampled chroma plane), + // apply the affine transformation, + // then convert back to the original coordinates (if necessary) + const int32_t src_x = (j + 4) << subsampling_x; + const int32_t src_y = (i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + const int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + const int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4); + sy4 += gamma * (-4) + delta * (-4); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + for (int k = -7; k < 8; ++k) { + const int iy = clamp(iy4 + k, 0, height - 1); + + int sx = sx4 + beta * (k + 4); + for (int l = -4; l < 4; ++l) { + int ix = ix4 + l - 3; + const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_horiz; + for (int m = 0; m < 8; ++m) { + const int sample_x = clamp(ix + m, 0, width - 1); + sum += ref[iy * stride + sample_x] * coeffs[m]; + } + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz); + assert(0 <= sum && sum < (1 << max_bits_horiz)); + tmp[(k + 7) * 8 + (l + 4)] = sum; + sx += alpha; + } + } + + // Vertical filter + for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) { + const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_vert; + for (int m = 0; m < 8; ++m) { + sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; + } + + if (conv_params->is_compound) { + CONV_BUF_TYPE *p = + &conv_params + ->dst[(i - p_row + k + 4) * conv_params->dst_stride + + (j - p_col + l + 4)]; + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + if (conv_params->do_average) { + uint16_t *dst16 = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + int32_t tmp32 = *p; + if (conv_params->use_dist_wtd_comp_avg) { + tmp32 = tmp32 * conv_params->fwd_offset + + sum * conv_params->bck_offset; + tmp32 = tmp32 >> DIST_PRECISION_BITS; + } else { + tmp32 += sum; + tmp32 = tmp32 >> 1; + } + tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1)); + *dst16 = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, 
round_bits), bd); + } else { + *p = sum; + } + } else { + uint16_t *p = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + assert(0 <= sum && sum < (1 << (bd + 2))); + *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd); + } + sy += gamma; + } + } + } + } +} + +void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref, + int width, int height, int stride, uint16_t *const pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, + int bd, ConvolveParams *conv_params) { + assert(wm->wmtype <= AFFINE); + if (wm->wmtype == ROTZOOM) { + wm->wmmat[5] = wm->wmmat[2]; + wm->wmmat[4] = -wm->wmmat[3]; + } + const int32_t *const mat = wm->wmmat; + const int16_t alpha = wm->alpha; + const int16_t beta = wm->beta; + const int16_t gamma = wm->gamma; + const int16_t delta = wm->delta; + + av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, bd, conv_params, alpha, beta, gamma, + delta); +} + +int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride, + const uint16_t *const dst, int p_width, + int p_height, int p_stride, int bd) { + int64_t sum_error = 0; + for (int i = 0; i < p_height; ++i) { + for (int j = 0; j < p_width; ++j) { + sum_error += + highbd_error_measure(dst[j + i * p_stride] - ref[j + i * stride], bd); + } + } + return sum_error; +} + +static int64_t highbd_segmented_frame_error( + const uint16_t *const ref, int stride, const uint16_t *const dst, + int p_width, int p_height, int p_stride, int bd, uint8_t *segment_map, + int segment_map_stride) { + int patch_w, patch_h; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + int64_t sum_error = 0; + for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) { + for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + + // avoid computing error into the frame padding + patch_w = AOMMIN(error_bsize_w, p_width - j); + patch_h = AOMMIN(error_bsize_h, p_height - i); + sum_error += av1_calc_highbd_frame_error(ref + j + i * stride, stride, + dst + j + i * p_stride, patch_w, + patch_h, p_stride, bd); + } + } + return sum_error; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +/* The warp filter for ROTZOOM and AFFINE models works as follows: + * Split the input into 8x8 blocks + * For each block, project the point (4, 4) within the block, to get the + overall block position. Split into integer and fractional coordinates, + maintaining full WARPEDMODEL precision + * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a + variable horizontal offset. This means that, while the rows of the + intermediate buffer align with the rows of the *reference* image, the + columns align with the columns of the *destination* image. + * Filter vertically: Generate the output block (up to 8x8 pixels, but if the + destination is too small we crop the output at this stage). Each pixel has + a variable vertical offset, so that the resulting rows are aligned with + the rows of the destination image. 
+
+  To accomplish these alignments, we factor the warp matrix as a
+  product of two shear / asymmetric zoom matrices:
+     / a b \  = /   1       0    \ * / 1+alpha  beta \
+     \ c d /    \ gamma  1+delta /   \    0      1   /
+  where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
+  The horizontal shear (with alpha and beta) is applied first,
+  then the vertical shear (with gamma and delta) is applied second.
+
+  The only limitation is that, to fit this in a fixed 8-tap filter size,
+  the fractional pixel offsets must be at most +-1. Since the horizontal
+  filter generates 15 rows of 8 columns, and the initial point we project
+  is at (4, 4) within the block, the parameters must satisfy
+     4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
+  for this filter to be applicable.
+
+  Note: This function assumes that the caller has done all of the relevant
+  checks, i.e. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
+  are set appropriately (if using a ROTZOOM model), and that alpha, beta,
+  gamma, delta are all in range.
+
+  TODO(david.barker): Maybe support scaled references?
+*/
+/* A note on hardware implementation:
+   The warp filter is intended to be implementable using the same hardware as
+   the high-precision convolve filters from the loop-restoration and
+   convolve-round experiments.
+
+   For a single filter stage, considering all of the coefficient sets for the
+   warp filter and the regular convolution filter, an input in the range
+   [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
+   before rounding.
+
+   Allowing for some changes to the filter coefficient sets, call the range
+   [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
+   we can replace this by the range [0, 256 * 2^k], which can be stored in an
+   unsigned value with 8 + k bits.
+
+   This allows the derivation of the appropriate bit widths and offsets for
+   the various intermediate values: If
+
+   F := FILTER_BITS = 7 (or else the above ranges need adjusting)
+     So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
+     intermediate value.
+   H := ROUND0_BITS
+   V := VERSHEAR_REDUCE_PREC_BITS
+     (and note that we must have H + V = 2*F for the output to have the same
+     scale as the input)
+
+   then we end up with the following offsets and ranges:
+   Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
+                      uint{bd + F + 1}
+   After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
+   Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
+                    uint{bd + 2*F + 2 - H}
+   After rounding: The final value, before undoing the offset, fits into a
+                   uint{bd + 2}.
+
+   Then we need to undo the offsets before clamping to a pixel. Note that,
+   if we do this at the end, the amount to subtract is actually independent
+   of H and V:
+
+   offset to subtract = (1 << ((bd + F - 1) - H + F - V))
+                      + (1 << ((bd + 2*F - H) - V))
+                     == (1 << (bd - 1)) + (1 << bd)
+
+   This allows us to entirely avoid clamping in both the warp filter and
+   the convolve-round experiment. As of the time of writing, the Wiener filter
+   from loop-restoration can encode a central coefficient up to 216, which
+   leads to a maximum value of about 282 * 2^k after applying the offset.
+   So in that case we still need to clamp.
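+
+   As a concrete check, take bd == 8 and F == 7: the horizontal stage adds
+   an offset of 1 << 14 and its sums fit in a uint16, and at the very end
+   the offset removed is (1 << (bd - 1)) + (1 << bd) == 384, which is
+   exactly the "sum - (1 << (bd - 1)) - (1 << bd)" expression in
+   av1_warp_affine_c below.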
+*/ +void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, int16_t beta, + int16_t gamma, int16_t delta) { + int32_t tmp[15 * 8]; + const int bd = 8; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + (void)max_bits_horiz; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + for (int i = p_row; i < p_row + p_height; i += 8) { + for (int j = p_col; j < p_col + p_width; j += 8) { + // Calculate the center of this 8x8 block, + // project to luma coordinates (if in a subsampled chroma plane), + // apply the affine transformation, + // then convert back to the original coordinates (if necessary) + const int32_t src_x = (j + 4) << subsampling_x; + const int32_t src_y = (i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4); + sy4 += gamma * (-4) + delta * (-4); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + for (int k = -7; k < 8; ++k) { + // Clamp to top/bottom edge of the frame + const int iy = clamp(iy4 + k, 0, height - 1); + + int sx = sx4 + beta * (k + 4); + + for (int l = -4; l < 4; ++l) { + int ix = ix4 + l - 3; + // At this point, sx = sx4 + alpha * l + beta * k + const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_horiz; + for (int m = 0; m < 8; ++m) { + // Clamp to left/right edge of the frame + const int sample_x = clamp(ix + m, 0, width - 1); + + sum += ref[iy * stride + sample_x] * coeffs[m]; + } + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz); + assert(0 <= sum && sum < (1 << max_bits_horiz)); + tmp[(k + 7) * 8 + (l + 4)] = sum; + sx += alpha; + } + } + + // Vertical filter + for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) { + // At this point, sy = sy4 + gamma * l + delta * k + const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_vert; + for (int m = 0; m < 8; ++m) { + sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; + } + 
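+          // The non-compound branch below shifts out reduce_bits_vert and
+          // then subtracts (1 << (bd - 1)) + (1 << bd), i.e. the horizontal
+          // and vertical accumulator offsets; see the bit-width note above
+          // for why this total is independent of the rounding split.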
+ if (conv_params->is_compound) { + CONV_BUF_TYPE *p = + &conv_params + ->dst[(i - p_row + k + 4) * conv_params->dst_stride + + (j - p_col + l + 4)]; + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + if (conv_params->do_average) { + uint8_t *dst8 = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + int32_t tmp32 = *p; + if (conv_params->use_dist_wtd_comp_avg) { + tmp32 = tmp32 * conv_params->fwd_offset + + sum * conv_params->bck_offset; + tmp32 = tmp32 >> DIST_PRECISION_BITS; + } else { + tmp32 += sum; + tmp32 = tmp32 >> 1; + } + tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1)); + *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits)); + } else { + *p = sum; + } + } else { + uint8_t *p = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + assert(0 <= sum && sum < (1 << (bd + 2))); + *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd)); + } + sy += gamma; + } + } + } + } +} + +void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width, + int height, int stride, uint8_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params) { + assert(wm->wmtype <= AFFINE); + if (wm->wmtype == ROTZOOM) { + wm->wmmat[5] = wm->wmmat[2]; + wm->wmmat[4] = -wm->wmmat[3]; + } + const int32_t *const mat = wm->wmmat; + const int16_t alpha = wm->alpha; + const int16_t beta = wm->beta; + const int16_t gamma = wm->gamma; + const int16_t delta = wm->delta; + av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, conv_params, + alpha, beta, gamma, delta); +} + +int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, + const uint8_t *const dst, int p_width, + int p_height, int p_stride) { + int64_t sum_error = 0; + for (int i = 0; i < p_height; ++i) { + for (int j = 0; j < p_width; ++j) { + sum_error += + (int64_t)error_measure(dst[j + i * p_stride] - ref[j + i * stride]); + } + } + return sum_error; +} + +static int64_t segmented_frame_error(const uint8_t *const ref, int stride, + const uint8_t *const dst, int p_width, + int p_height, int p_stride, + uint8_t *segment_map, + int segment_map_stride) { + int patch_w, patch_h; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + int64_t sum_error = 0; + for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) { + for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + + // avoid computing error into the frame padding + patch_w = AOMMIN(error_bsize_w, p_width - j); + patch_h = AOMMIN(error_bsize_h, p_height - i); + sum_error += av1_calc_frame_error(ref + j + i * stride, stride, + dst + j + i * p_stride, patch_w, + patch_h, p_stride); + } + } + return sum_error; +} + +int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride, + uint8_t *dst, int p_width, int p_height, int p_stride) { +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) { + return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride, + CONVERT_TO_SHORTPTR(dst), p_width, + p_height, p_stride, bd); + } +#endif + (void)use_hbd; + (void)bd; + return 
av1_calc_frame_error(ref, stride, dst, p_width, p_height, p_stride);
+}
+
+int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
+                                  int stride, uint8_t *dst, int p_width,
+                                  int p_height, int p_stride,
+                                  uint8_t *segment_map,
+                                  int segment_map_stride) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_hbd) {
+    return highbd_segmented_frame_error(
+        CONVERT_TO_SHORTPTR(ref), stride, CONVERT_TO_SHORTPTR(dst), p_width,
+        p_height, p_stride, bd, segment_map, segment_map_stride);
+  }
+#endif
+  (void)use_hbd;
+  (void)bd;
+  return segmented_frame_error(ref, stride, dst, p_width, p_height, p_stride,
+                               segment_map, segment_map_stride);
+}
+
+void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
+                    const uint8_t *ref, int width, int height, int stride,
+                    uint8_t *pred, int p_col, int p_row, int p_width,
+                    int p_height, int p_stride, int subsampling_x,
+                    int subsampling_y, ConvolveParams *conv_params) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_hbd)
+    highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
+                      CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
+                      p_height, p_stride, subsampling_x, subsampling_y, bd,
+                      conv_params);
+  else
+    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
+               p_height, p_stride, subsampling_x, subsampling_y, conv_params);
+#else
+  (void)use_hbd;
+  (void)bd;
+  warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
+             p_height, p_stride, subsampling_x, subsampling_y, conv_params);
+#endif
+}
+
+#define LS_MV_MAX 256  // max mv in 1/8-pel
+// Use LS_STEP = 8 so that 2 fewer bits are needed for A, Bx, By.
+#define LS_STEP 8
+
+// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
+// the precision needed is:
+//   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
+//   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
+//   1 [for sign] +
+//   LEAST_SQUARES_SAMPLES_MAX_BITS
+//      [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
+// With MAX_SB_SIZE_LOG2 == 7 this comes to 25.
+#define LS_MAT_RANGE_BITS \
+  ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)
+
+// Bit-depth reduction from the full range
+#define LS_MAT_DOWN_BITS 2
+
+// Bit range of A, Bx and By after downshifting
+#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
+#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
+#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
+
+// By setting LS_STEP = 8, the 2 least significant bits of every element in
+// A, Bx, By are 0, so we can safely shift by 2, plus LS_MAT_DOWN_BITS (2)
+// more to reduce the dynamic range.
+#define LS_SQUARE(a)                                          \
+  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
+   (2 + LS_MAT_DOWN_BITS))
+#define LS_PRODUCT1(a, b)                                           \
+  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
+   (2 + LS_MAT_DOWN_BITS))
+#define LS_PRODUCT2(a, b)                                               \
+  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
+   (2 + LS_MAT_DOWN_BITS))
+
+#define USE_LIMITED_PREC_MULT 0
+
+#if USE_LIMITED_PREC_MULT
+
+#define MUL_PREC_BITS 16
+static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
+  int msb = 0;
+  uint16_t mult = 0;
+  *shift = 0;
+  if (D != 0) {
+    msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
+                              : get_msb((unsigned int)D));
+    if (msb >= MUL_PREC_BITS) {
+      mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
+      *shift = msb + 1 - MUL_PREC_BITS;
+    } else {
+      mult = (uint16_t)D;
+      *shift = 0;
+    }
+  }
+  return mult;
+}
+
+static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
+  int16_t mshift;
+  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
+  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
+  shift -= mshift;
+  if (shift > 0) {
+    return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
+                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  } else {
+    return (int32_t)clamp(v * (1 << (-shift)),
+                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  }
+}
+
+static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
+  int16_t mshift;
+  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
+  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
+  shift -= mshift;
+  if (shift > 0) {
+    return (int32_t)clamp(
+        ROUND_POWER_OF_TWO_SIGNED(v, shift),
+        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  } else {
+    return (int32_t)clamp(
+        v * (1 << (-shift)),
+        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  }
+}
+
+#else
+
+static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
+  int64_t v = Px * (int64_t)iDet;
+  return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
+                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+
+static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
+  int64_t v = Px * (int64_t)iDet;
+  return (int32_t)clamp64(
+      ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
+      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+#endif  // USE_LIMITED_PREC_MULT
+
+static int find_affine_int(int np, const int *pts1, const int *pts2,
+                           BLOCK_SIZE bsize, int mvy, int mvx,
+                           WarpedMotionParams *wm, int mi_row, int mi_col) {
+  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
+  int32_t Bx[2] = { 0, 0 };
+  int32_t By[2] = { 0, 0 };
+
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int rsuy = bh / 2 - 1;
+  const int rsux = bw / 2 - 1;
+  const int suy = rsuy * 8;
+  const int sux = rsux * 8;
+  const int duy = suy + mvy;
+  const int dux = sux + mvx;
+
+  // Assume the center pixel of the block has exactly the same motion vector
+  // as transmitted for the block. First shift the origin of the source
+  // points to the block center, and the origin of the destination points to
+  // the block center added to the motion vector transmitted.
+  // Let (xi, yi) denote the source points and (xi', yi') denote destination
+  // points after origin shifting, for i = 0, 1, 2, .... n-1.
+  // Then if    P = [x0, y0,
+  //                 x1, y1,
+  //                 x2, y2,
+  //                  ....
+  //                ]
+  //            q = [x0', x1', x2', ... ]'
+  //            r = [y0', y1', y2', ... ]'
+  // the least squares problems that need to be solved are:
+  //            [h1, h2]' = inv(P'P)P'q and
+  //            [h3, h4]' = inv(P'P)P'r
+  // where the affine transformation is given by:
+  //            x' = h1.x + h2.y
+  //            y' = h3.x + h4.y
+  //
+  // The loop below computes: A = P'P, Bx = P'q, By = P'r
+  // We just need to compute inv(A).Bx and inv(A).By for the solutions.
+  // Contribution from neighbor block
+  for (int i = 0; i < np; i++) {
+    const int dx = pts2[i * 2] - dux;
+    const int dy = pts2[i * 2 + 1] - duy;
+    const int sx = pts1[i * 2] - sux;
+    const int sy = pts1[i * 2 + 1] - suy;
+    // TODO(yunqing): This comparison wouldn't be necessary if the sample
+    // selection is done in find_samples(). Also, the global offset can be
+    // removed while collecting samples.
+    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
+      A[0][0] += LS_SQUARE(sx);
+      A[0][1] += LS_PRODUCT1(sx, sy);
+      A[1][1] += LS_SQUARE(sy);
+      Bx[0] += LS_PRODUCT2(sx, dx);
+      Bx[1] += LS_PRODUCT1(sy, dx);
+      By[0] += LS_PRODUCT1(sx, dy);
+      By[1] += LS_PRODUCT2(sy, dy);
+    }
+  }
+
+  // Just for debugging, and can be removed later.
+  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
+  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
+  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
+  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
+  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
+  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
+  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);
+
+  // Compute the determinant of A
+  const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
+  if (Det == 0) return 1;
+
+  int16_t shift;
+  int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
+  shift -= WARPEDMODEL_PREC_BITS;
+  if (shift < 0) {
+    iDet <<= (-shift);
+    shift = 0;
+  }
+
+  int64_t Px[2], Py[2];
+  // These, divided by Det, are the least squares solutions
+  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
+  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
+  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
+  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];
+
+  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
+  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
+  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
+  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);
+
+  const int isuy = (mi_row * MI_SIZE + rsuy);
+  const int isux = (mi_col * MI_SIZE + rsux);
+  // Note: In the vx, vy expressions below, the max value of each of the
+  // 2nd and 3rd terms is (2^16 - 1) * (2^13 - 1). That leaves enough room
+  // for the first term so that the overall sum in the worst case fits
+  // within 32 bits overall.
+  const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+                     (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
+                      isuy * wm->wmmat[3]);
+  const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+                     (isux * wm->wmmat[4] +
+                      isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
+  wm->wmmat[0] =
+      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
+  wm->wmmat[1] =
+      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
+
+  wm->wmmat[6] = wm->wmmat[7] = 0;
+  return 0;
+}
+
+int av1_find_projection(int np, const int *pts1, const int *pts2,
+                        BLOCK_SIZE bsize, int mvy, int mvx,
+                        WarpedMotionParams *wm_params, int mi_row,
+                        int mi_col) {
+  assert(wm_params->wmtype == AFFINE);
+
+  if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
+                      mi_col))
+    return 1;
+
+  // Check compatibility with the fast warp filter
+  if (!av1_get_shear_params(wm_params)) return 1;
+
+  return 0;
+}
diff --git a/libs/libaom/src/av1/common/warped_motion.h b/libs/libaom/src/av1/common/warped_motion.h
new file mode 100644
index 000000000..14dc0fe47
--- /dev/null
+++ b/libs/libaom/src/av1/common/warped_motion.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_WARPED_MOTION_H_
+#define AOM_AV1_COMMON_WARPED_MOTION_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/mv.h"
+#include "av1/common/convolve.h"
+
+#define MAX_PARAMDIM 9
+#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
+#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
+#define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2)
+#define WARPED_MOTION_DEBUG 0
+#define DEFAULT_WMTYPE AFFINE
+#define WARP_ERROR_BLOCK_LOG 5
+#define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG)
+
+extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
+
+DECLARE_ALIGNED(8, extern const int8_t,
+                av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]);
+
+/* clang-format off */
+static const int error_measure_lut[512] = {
+    // pow 0.7
+    16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
+    16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
+    15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
+    15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
+    14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
+    14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
+    14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
+    13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
+    13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
+    12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
+    12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
+    12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
+    11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
+    11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
+    10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
+    10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
+    10058,
10002, 9947, 9891, 9835, 9779, 9723, 9666, + 9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211, + 9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745, + 8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269, + 8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780, + 7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278, + 7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760, + 6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225, + 6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670, + 5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090, + 5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480, + 4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832, + 3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133, + 3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359, + 2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452, + 1323, 1187, 1045, 894, 731, 550, 339, 0, + 339, 550, 731, 894, 1045, 1187, 1323, 1452, + 1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359, + 2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133, + 3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832, + 3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480, + 4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090, + 5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670, + 5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225, + 6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760, + 6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278, + 7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780, + 7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269, + 8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745, + 8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211, + 9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666, + 9723, 9779, 9835, 9891, 9947, 10002, 10058, 10113, + 10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552, + 10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982, + 11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406, + 11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823, + 11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234, + 12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639, + 12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038, + 13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432, + 13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822, + 13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206, + 14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587, + 14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963, + 15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335, + 15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703, + 15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068, + 16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384, +}; +/* clang-format on */ + +static const uint8_t warp_pad_left[14][16] = { + { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 3, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 6, 6, 6, 6, 6, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 7, 7, 7, 7, 7, 7, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 11, 12, 13, 14, 15 }, + { 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 12, 13, 14, 15 }, + { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 13, 14, 15 }, + { 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15 }, + { 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 15 }, + { 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15 }, +}; + +static const uint8_t warp_pad_right[14][16] = { + { 0, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11, 11 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9, 9, 9, 9 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 }, + { 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 }, + { 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }, + { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }, + { 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }, + { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, + { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 } +}; + +static INLINE int error_measure(int err) { + return error_measure_lut[255 + err]; +} + +// Returns the error between the frame described by 'ref' and the frame +// described by 'dst'. +int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride, + uint8_t *dst, int p_width, int p_height, int p_stride); + +int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref, + int stride, uint8_t *dst, int p_width, + int p_height, int p_stride, + uint8_t *segment_map, int segment_map_stride); + +int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride, + const uint16_t *const dst, int p_width, + int p_height, int p_stride, int bd); + +void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref, + int width, int height, int stride, uint16_t *const pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, + int bd, ConvolveParams *conv_params); + +void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width, + int height, int stride, uint8_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params); + +void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd, + const uint8_t *ref, int width, int height, int stride, + uint8_t *pred, int p_col, int p_row, int p_width, + int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params); + +int av1_find_projection(int np, const int *pts1, const int *pts2, + BLOCK_SIZE bsize, int mvy, int mvx, + WarpedMotionParams *wm_params, int mi_row, int mi_col); + +int av1_get_shear_params(WarpedMotionParams *wm); +#endif // AOM_AV1_COMMON_WARPED_MOTION_H_ diff --git a/libs/libaom/src/av1/common/x86/av1_convolve_horiz_rs_sse4.c b/libs/libaom/src/av1/common/x86/av1_convolve_horiz_rs_sse4.c new file mode 100644 index 000000000..8aa14696f --- /dev/null +++ b/libs/libaom/src/av1/common/x86/av1_convolve_horiz_rs_sse4.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Note: If the crop width is not a multiple of 4, then, unlike the C version,
+// this function will overwrite some of the padding on the right hand side of
+// the frame. This padding appears to be trashed anyway, so this should not
+// affect the running of the decoder.
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const int16_t *x_filters, int x0_qn,
+ int x_step_qn) {
+ assert(UPSCALE_NORMATIVE_TAPS == 8);
+
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+ const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+
+ const uint8_t *src_y;
+ uint8_t *dst_y;
+ int x_qn = x0_qn;
+ for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
+ const int x_filter_idx0 =
+ ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx1 =
+ ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx2 =
+ ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx3 =
+ ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+ assert(x_filter_idx0 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx1 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx2 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx3 <= RS_SUBPEL_MASK);
+
+ const int16_t *const x_filter0 =
+ &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter1 =
+ &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter2 =
+ &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter3 =
+ &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
+
+ const __m128i fil0_16 = xx_loadu_128(x_filter0);
+ const __m128i fil1_16 = xx_loadu_128(x_filter1);
+ const __m128i fil2_16 = xx_loadu_128(x_filter2);
+ const __m128i fil3_16 = xx_loadu_128(x_filter3);
+
+ src_y = src;
+ dst_y = dst;
+ for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
+ const uint8_t *const src_x0 =
+ &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x1 =
+ &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x2 =
+ &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x3 =
+ &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+ // Load up the source data. This is 8-bit input data, so each load
+ // gets 8 pixels.
+ const __m128i src0_8 = xx_loadl_64(src_x0);
+ const __m128i src1_8 = xx_loadl_64(src_x1);
+ const __m128i src2_8 = xx_loadl_64(src_x2);
+ const __m128i src3_8 = xx_loadl_64(src_x3);
+
+ // Now zero-extend up to 16-bit precision, i.e.
+ // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ]
+ const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8);
+ const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8);
+ const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8);
+ const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8);
+
+ // Multiply by filter coefficients (results in a 32-bit value),
+ // and add adjacent pairs, i.e.
+ // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ]) + // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ] + const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16); + const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16); + const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16); + const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16); + + // Reduce horizontally and add, i.e. + // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ] + const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32); + const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32); + + const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32); + + // Divide down by (1 << FILTER_BITS), rounding to nearest. + const __m128i shifted_32 = + _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS); + + // Pack 32-bit values into 16-bit values, i.e. + // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ] + const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero); + + // Pack 16-bit values into 8-bit values, i.e. + // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ]) + // -> [ 0 0 0 0 0 0 DC BA ] + const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero); + + // Write to the output + xx_storel_32(&dst_y[x], shifted_8); + } + } +} + +// Note: If the crop width is not a multiple of 4, then, unlike the C version, +// this function will overwrite some of the padding on the right hand side of +// the frame. This padding appears to be trashed anyway, so this should not +// affect the running of the decoder. +void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filters, + int x0_qn, int x_step_qn, int bd) { + assert(UPSCALE_NORMATIVE_TAPS == 8); + assert(bd == 8 || bd == 10 || bd == 12); + + src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; + + const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1); + + const uint16_t *src_y; + uint16_t *dst_y; + int x_qn = x0_qn; + for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) { + const int x_filter_idx0 = + ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx1 = + ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx2 = + ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx3 = + ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + + assert(x_filter_idx0 <= RS_SUBPEL_MASK); + assert(x_filter_idx1 <= RS_SUBPEL_MASK); + assert(x_filter_idx2 <= RS_SUBPEL_MASK); + assert(x_filter_idx3 <= RS_SUBPEL_MASK); + + const int16_t *const x_filter0 = + &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter1 = + &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter2 = + &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter3 = + &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS]; + + const __m128i fil0_16 = xx_loadu_128(x_filter0); + const __m128i fil1_16 = xx_loadu_128(x_filter1); + const __m128i fil2_16 = xx_loadu_128(x_filter2); + const __m128i fil3_16 = xx_loadu_128(x_filter3); + + src_y = src; + dst_y = dst; + for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) { + const uint16_t *const src_x0 = + &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint16_t *const src_x1 = + 
&src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x2 =
+ &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x3 =
+ &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+ // Load up the source data. This is 16-bit input data, so each load
+ // gets 8 pixels.
+ const __m128i src0_16 = xx_loadu_128(src_x0);
+ const __m128i src1_16 = xx_loadu_128(src_x1);
+ const __m128i src2_16 = xx_loadu_128(src_x2);
+ const __m128i src3_16 = xx_loadu_128(src_x3);
+
+ // Multiply by filter coefficients (results in a 32-bit value),
+ // and add adjacent pairs, i.e.
+ // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
+ // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
+ const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
+ const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
+ const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
+ const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
+
+ // Reduce horizontally and add, i.e.
+ // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
+ const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
+ const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
+
+ const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
+
+ // Divide down by (1 << FILTER_BITS), rounding to nearest.
+ const __m128i shifted_32 =
+ _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
+
+ // Pack 32-bit values into 16-bit values, i.e.
+ // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
+ const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
+
+ // Clip the values at (1 << bd) - 1
+ const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum);
+
+ // Write to the output
+ xx_storel_64(&dst_y[x], clipped_16);
+ }
+ }
+}
diff --git a/libs/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c b/libs/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c
new file mode 100644
index 000000000..196618176
--- /dev/null
+++ b/libs/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+// A specialised version of hfilter, the horizontal filter for
+// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
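+// The output of this pass is written transposed (dst[y + x * h] rather than
+// dst[x + y * w]) so that the vertical pass can read each column of the
+// intermediate as one contiguous row. Per output sample it computes, exactly
+// as in the scalar tail inside the function:
+//   sum = (1 << (bd + FILTER_BITS - 1)) + sum over k of filter[k] * src_row[k]
+//   dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round)
+// with the SIMD body handling four rows of output per loop iteration.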
+static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w, + int h, int subpel_x_qn, int x_step_qn, + const InterpFilterParams *filter_params, unsigned round) { + const int bd = 8; + const int ntaps = 8; + + src -= ntaps / 2 - 1; + + int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); + const __m128i round_add = _mm_set1_epi32(round_add32); + const __m128i round_shift = _mm_cvtsi32_si128(round); + + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); + + // Load the filter coefficients + const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); + const __m128i zero = _mm_castps_si128(_mm_setzero_ps()); + + int y; + for (y = 0; y <= h - 4; y += 4) { + const uint8_t *const src0 = src_col + y * src_stride; + const uint8_t *const src1 = src0 + 1 * src_stride; + const uint8_t *const src2 = src0 + 2 * src_stride; + const uint8_t *const src3 = src0 + 3 * src_stride; + + // Load up source data. This is 8-bit input data; each load is just + // loading the lower half of the register and gets 8 pixels + const __m128i data08 = _mm_loadl_epi64((__m128i *)src0); + const __m128i data18 = _mm_loadl_epi64((__m128i *)src1); + const __m128i data28 = _mm_loadl_epi64((__m128i *)src2); + const __m128i data38 = _mm_loadl_epi64((__m128i *)src3); + + // Now zero-extend up to 16-bit precision by interleaving with + // zeros. Drop the upper half of each register (which just had zeros) + const __m128i data0lo = _mm_unpacklo_epi8(data08, zero); + const __m128i data1lo = _mm_unpacklo_epi8(data18, zero); + const __m128i data2lo = _mm_unpacklo_epi8(data28, zero); + const __m128i data3lo = _mm_unpacklo_epi8(data38, zero); + + // Multiply by coefficients + const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); + const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); + const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); + const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); + + // Reduce horizontally and add + const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); + const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); + const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo); + + // Divide down by (1 << round), rounding to nearest. + __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); + + shifted = _mm_packus_epi32(shifted, shifted); + // Write transposed to the output + _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted); + } + for (; y < h; ++y) { + const uint8_t *const src_row = src_col + y * src_stride; + + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < ntaps; ++k) { + sum += filter[k] * src_row[k]; + } + + dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); + } + } +} + +static __m128i convolve_16_8(const int16_t *src, __m128i coeff) { + __m128i data = _mm_loadu_si128((__m128i *)src); + return _mm_madd_epi16(data, coeff); +} + +// A specialised version of vfilter, the vertical filter for +// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters. 
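+// The input here is the transposed intermediate produced by hfilter8, so
+// src_stride is the intermediate height and each 8-tap load inside
+// convolve_16_8 reads one column of the original image as a contiguous run
+// of 16-bit samples.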
+static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, int subpel_y_qn, + int y_step_qn, const InterpFilterParams *filter_params, + const ConvolveParams *conv_params, int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int ntaps = 8; + + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + const __m128i sub = _mm_set1_epi16(sub32); + + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i bits_shift = _mm_cvtsi32_si128(bits); + const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1)); + const __m128i round_shift_add = + _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16((short)w0); + const __m128i wt1 = _mm_set1_epi16((short)w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + int y_qn = subpel_y_qn; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); + + const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); + int x; + for (x = 0; x <= w - 4; x += 4) { + const int16_t *const src0 = src_y + x * src_stride; + const int16_t *const src1 = src0 + 1 * src_stride; + const int16_t *const src2 = src0 + 2 * src_stride; + const int16_t *const src3 = src0 + 3 * src_stride; + + // Load the source data for the three rows, adding the three registers of + // convolved products to one as we go (conv0..conv3) to avoid the + // register pressure getting too high. + const __m128i conv0 = convolve_16_8(src0, coeff0716); + const __m128i conv1 = convolve_16_8(src1, coeff0716); + const __m128i conv2 = convolve_16_8(src2, coeff0716); + const __m128i conv3 = convolve_16_8(src3, coeff0716); + + // Now reduce horizontally to get one lane for each result + const __m128i conv01 = _mm_hadd_epi32(conv0, conv1); + const __m128i conv23 = _mm_hadd_epi32(conv2, conv3); + __m128i conv = _mm_hadd_epi32(conv01, conv23); + + conv = _mm_add_epi32(conv, res_add_const); + // Divide down by (1 << round_1), rounding to nearest and subtract sub32. 
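+ // The sub32 offset itself is only removed further down, after any
+ // compound averaging, so that the average is taken in the offset domain.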
+ __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift); + + uint8_t *dst_x = dst + y * dst_stride + x; + CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x; + __m128i result; + __m128i shifted_16 = _mm_packus_epi32(shifted, shifted); + + if (conv_params->is_compound) { + if (conv_params->do_average) { + const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x); + if (conv_params->use_dist_wtd_comp_avg) { + const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16); + const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + shifted_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1); + } + const __m128i subbed = _mm_sub_epi16(shifted_16, sub); + result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift); + const __m128i result_8 = _mm_packus_epi16(result, result); + *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8); + } else { + _mm_storel_epi64((__m128i *)dst_16_x, shifted_16); + } + } else { + const __m128i subbed = _mm_sub_epi16(shifted_16, sub); + result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift); + const __m128i result_8 = _mm_packus_epi16(result, result); + *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8); + } + } + for (; x < w; ++x) { + const int16_t *src_x = src_y + x * src_stride; + int32_t sum = 1 << offset_bits; + for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k]; + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + + if (conv_params->is_compound) { + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + /* Subtract round offset and convolve round */ + tmp = tmp - sub32; + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } else { + /* Subtract round offset and convolve round */ + int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } + } + } +} +void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params) { + int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + + const int xtaps = filter_params_x->taps; + const int ytaps = filter_params_y->taps; + const int fo_vert = ytaps / 2 - 1; + assert((xtaps == 8) && (ytaps == 8)); + (void)xtaps; + + // horizontal filter + hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn, + x_step_qn, filter_params_x, conv_params->round_0); + + // vertical filter (input is transposed) + vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn, + filter_params_y, conv_params, 8); +} + +// A specialised version of hfilter, the horizontal filter for +// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap +// filters. 
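+// Identical in structure to hfilter8 above, except that the source samples
+// are 16 bits wide, so each row's eight taps are loaded directly with
+// _mm_loadu_si128 instead of a load-and-zero-extend pair, and the rounding
+// bias (1 << (bd + FILTER_BITS - 1)) uses the real bit depth rather than 8.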
+static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst, + int w, int h, int subpel_x_qn, int x_step_qn, + const InterpFilterParams *filter_params, + unsigned round, int bd) { + const int ntaps = 8; + + src -= ntaps / 2 - 1; + + int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); + const __m128i round_add = _mm_set1_epi32(round_add32); + const __m128i round_shift = _mm_cvtsi32_si128(round); + + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); + + // Load the filter coefficients + const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); + + int y; + for (y = 0; y <= h - 4; y += 4) { + const uint16_t *const src0 = src_col + y * src_stride; + const uint16_t *const src1 = src0 + 1 * src_stride; + const uint16_t *const src2 = src0 + 2 * src_stride; + const uint16_t *const src3 = src0 + 3 * src_stride; + + // Load up source data. This is 16-bit input data, so each load gets the 8 + // pixels we need. + const __m128i data0lo = _mm_loadu_si128((__m128i *)src0); + const __m128i data1lo = _mm_loadu_si128((__m128i *)src1); + const __m128i data2lo = _mm_loadu_si128((__m128i *)src2); + const __m128i data3lo = _mm_loadu_si128((__m128i *)src3); + + // Multiply by coefficients + const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); + const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); + const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); + const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); + + // Reduce horizontally and add + const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); + const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); + const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo); + + // Divide down by (1 << round), rounding to nearest. + __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); + + shifted = _mm_packus_epi32(shifted, shifted); + // Write transposed to the output + _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted); + } + for (; y < h; ++y) { + const uint16_t *const src_row = src_col + y * src_stride; + + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < ntaps; ++k) { + sum += filter[k] * src_row[k]; + } + + dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); + } + } +} +// A specialised version of vfilter, the vertical filter for +// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap +// filters. +static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst, + int dst_stride, int w, int h, int subpel_y_qn, + int y_step_qn, + const InterpFilterParams *filter_params, + const ConvolveParams *conv_params, int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int ntaps = 8; + + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + const __m128i sub = _mm_set1_epi32(sub32); + + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const __m128i clip_pixel_ = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i bits_shift = _mm_cvtsi32_si128(bits); + const __m128i bits_const = _mm_set1_epi32(((1 << bits) >> 1)); + const __m128i round_shift_add = + _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); + __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + + int y_qn = subpel_y_qn; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); + + const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); + int x; + for (x = 0; x <= w - 4; x += 4) { + const int16_t *const src0 = src_y + x * src_stride; + const int16_t *const src1 = src0 + 1 * src_stride; + const int16_t *const src2 = src0 + 2 * src_stride; + const int16_t *const src3 = src0 + 3 * src_stride; + + // Load the source data for the three rows, adding the three registers of + // convolved products to one as we go (conv0..conv3) to avoid the + // register pressure getting too high. + const __m128i conv0 = convolve_16_8(src0, coeff0716); + const __m128i conv1 = convolve_16_8(src1, coeff0716); + const __m128i conv2 = convolve_16_8(src2, coeff0716); + const __m128i conv3 = convolve_16_8(src3, coeff0716); + + // Now reduce horizontally to get one lane for each result + const __m128i conv01 = _mm_hadd_epi32(conv0, conv1); + const __m128i conv23 = _mm_hadd_epi32(conv2, conv3); + __m128i conv = _mm_hadd_epi32(conv01, conv23); + conv = _mm_add_epi32(conv, res_add_const); + + // Divide down by (1 << round_1), rounding to nearest and subtract sub32. 
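+ // sub32 (subtracted below, after any compound averaging) removes both the
+ // (1 << offset_bits) bias added via res_add_const and the rounding bias
+ // that highbd_hfilter8 folded into the intermediate values.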
+ __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift); + + uint16_t *dst_x = dst + y * dst_stride + x; + CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x; + + __m128i result; + if (conv_params->is_compound) { + if (conv_params->do_average) { + __m128i p_32 = + _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x)); + + if (conv_params->use_dist_wtd_comp_avg) { + shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), + _mm_mullo_epi32(shifted, wt1)); + shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS); + } else { + shifted = _mm_srai_epi32(_mm_add_epi32(p_32, shifted), 1); + } + __m128i res32 = _mm_sub_epi32(shifted, sub); + res32 = _mm_sra_epi32(_mm_add_epi32(res32, round_bits_const), + round_bits_shift); + + __m128i res16 = _mm_packus_epi32(res32, res32); + res16 = _mm_min_epi16(res16, clip_pixel_); + _mm_storel_epi64((__m128i *)dst_x, res16); + } else { + __m128i shifted_16 = _mm_packus_epi32(shifted, shifted); + _mm_storel_epi64((__m128i *)dst_16_x, shifted_16); + } + } else { + const __m128i subbed = _mm_sub_epi32(shifted, sub); + result = _mm_sra_epi16(_mm_add_epi32(subbed, bits_const), bits_shift); + result = _mm_packus_epi32(result, result); + result = _mm_min_epi16(result, clip_pixel_); + _mm_storel_epi64((__m128i *)dst_x, result); + } + } + + for (; x < w; ++x) { + const int16_t *src_x = src_y + x * src_stride; + int32_t sum = 1 << offset_bits; + for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k]; + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->is_compound) { + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + /* Subtract round offset and convolve round */ + tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } else { + /* Subtract round offset and convolve round */ + int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } + } + } +} + +void av1_highbd_convolve_2d_scale_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int x_step_qn, const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params, int bd) { + // TODO(yaowu): Move this out of stack + DECLARE_ALIGNED(16, int16_t, + tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + const int xtaps = filter_params_x->taps; + const int ytaps = filter_params_y->taps; + const int fo_vert = ytaps / 2 - 1; + + memset(tmp, 0, sizeof(tmp)); + assert((xtaps == 8) && (ytaps == 8)); + (void)xtaps; + + // horizontal filter + highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, + subpel_x_qn, x_step_qn, filter_params_x, conv_params->round_0, + bd); + + // vertical filter (input is transposed) + highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn, + filter_params_y, conv_params, 
bd); +} diff --git a/libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c b/libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c new file mode 100644 index 000000000..0fbd5eae4 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c @@ -0,0 +1,1949 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" + +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/av1_inv_txfm_avx2.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" + +// TODO(venkatsanampudi@ittiam.com): move this to header file + +// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 +static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + +static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); +} + +static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); +} + +static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) { + btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]); + btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]); + btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]); + btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]); + btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]); + btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]); + btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]); + btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]); +} + +static void idct16_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); + __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); + __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); + __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); + 
__m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); + __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); + __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); + __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); + __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[0]; + x1[1] = input[8]; + x1[2] = input[4]; + x1[3] = input[12]; + x1[4] = input[2]; + x1[5] = input[10]; + x1[6] = input[6]; + x1[7] = input[14]; + x1[8] = input[1]; + x1[9] = input[9]; + x1[10] = input[5]; + x1[11] = input[13]; + x1[12] = input[3]; + x1[13] = input[11]; + x1[14] = input[7]; + x1[15] = input[15]; + + // stage 2 + btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit); + + // stage 3 + btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + + // stage 4 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + + idct16_stage5_avx2(x1, cospi, _r, cos_bit); + idct16_stage6_avx2(x1, cospi, _r, cos_bit); + idct16_stage7_avx2(output, x1); +} + +static void idct16_low8_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[0]; + x1[2] = input[4]; + x1[4] = input[2]; + x1[6] = input[6]; + x1[8] = input[1]; + x1[10] = input[5]; + x1[12] = input[3]; + x1[14] = input[7]; + + // stage 2 + btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]); + btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]); + btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]); + + // stage 3 + 
btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]); + btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + + // stage 4 + btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); + btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + + idct16_stage5_avx2(x1, cospi, _r, cos_bit); + idct16_stage6_avx2(x1, cospi, _r, cos_bit); + idct16_stage7_avx2(output, x1); +} + +static void idct16_low1_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m256i x1[2]; + x1[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); + + // stage 5 + // stage 6 + output[0] = x1[0]; + output[1] = x1[1]; + output[2] = x1[1]; + output[3] = x1[0]; + output[4] = x1[0]; + output[5] = x1[1]; + output[6] = x1[1]; + output[7] = x1[0]; + output[8] = x1[0]; + output[9] = x1[1]; + output[10] = x1[1]; + output[11] = x1[0]; + output[12] = x1[0]; + output[13] = x1[1]; + output[14] = x1[1]; + output[15] = x1[0]; +} + +static INLINE void iadst16_stage3_avx2(__m256i *x) { + btf_16_adds_subs_avx2(&x[0], &x[8]); + btf_16_adds_subs_avx2(&x[1], &x[9]); + btf_16_adds_subs_avx2(&x[2], &x[10]); + btf_16_adds_subs_avx2(&x[3], &x[11]); + btf_16_adds_subs_avx2(&x[4], &x[12]); + btf_16_adds_subs_avx2(&x[5], &x[13]); + btf_16_adds_subs_avx2(&x[6], &x[14]); + btf_16_adds_subs_avx2(&x[7], &x[15]); +} + +static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); + const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit); +} + +static INLINE void iadst16_stage5_avx2(__m256i *x) { + btf_16_adds_subs_avx2(&x[0], &x[4]); + btf_16_adds_subs_avx2(&x[1], &x[5]); + btf_16_adds_subs_avx2(&x[2], &x[6]); + btf_16_adds_subs_avx2(&x[3], &x[7]); + btf_16_adds_subs_avx2(&x[8], &x[12]); + btf_16_adds_subs_avx2(&x[9], &x[13]); + btf_16_adds_subs_avx2(&x[10], &x[14]); + btf_16_adds_subs_avx2(&x[11], &x[15]); +} + +static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit); + 
btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit); +} + +static INLINE void iadst16_stage7_avx2(__m256i *x) { + btf_16_adds_subs_avx2(&x[0], &x[2]); + btf_16_adds_subs_avx2(&x[1], &x[3]); + btf_16_adds_subs_avx2(&x[4], &x[6]); + btf_16_adds_subs_avx2(&x[5], &x[7]); + btf_16_adds_subs_avx2(&x[8], &x[10]); + btf_16_adds_subs_avx2(&x[9], &x[11]); + btf_16_adds_subs_avx2(&x[12], &x[14]); + btf_16_adds_subs_avx2(&x[13], &x[15]); +} + +static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); +} + +static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) { + const __m256i __zero = _mm256_setzero_si256(); + output[0] = x1[0]; + output[1] = _mm256_subs_epi16(__zero, x1[8]); + output[2] = x1[12]; + output[3] = _mm256_subs_epi16(__zero, x1[4]); + output[4] = x1[6]; + output[5] = _mm256_subs_epi16(__zero, x1[14]); + output[6] = x1[10]; + output[7] = _mm256_subs_epi16(__zero, x1[2]); + output[8] = x1[3]; + output[9] = _mm256_subs_epi16(__zero, x1[11]); + output[10] = x1[15]; + output[11] = _mm256_subs_epi16(__zero, x1[7]); + output[12] = x1[5]; + output[13] = _mm256_subs_epi16(__zero, x1[13]); + output[14] = x1[9]; + output[15] = _mm256_subs_epi16(__zero, x1[1]); +} + +static void iadst16_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); + __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); + __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); + __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); + __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); + __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); + __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); + __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); + __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); + __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); + __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); + __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); + __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); + __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); + __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); + __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[15]; + x1[1] = input[0]; + x1[2] = input[13]; + x1[3] = input[2]; + x1[4] = input[11]; + x1[5] = input[4]; + x1[6] = input[9]; + x1[7] = input[6]; + x1[8] = input[7]; + x1[9] = input[8]; + x1[10] = input[5]; + x1[11] = input[10]; + x1[12] = input[3]; + x1[13] = input[12]; + x1[14] = input[1]; + x1[15] = 
input[14]; + + // stage 2 + btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit); + btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit); + + iadst16_stage3_avx2(x1); + iadst16_stage4_avx2(x1, cospi, _r, cos_bit); + iadst16_stage5_avx2(x1); + iadst16_stage6_avx2(x1, cospi, _r, cos_bit); + iadst16_stage7_avx2(x1); + iadst16_stage8_avx2(x1, cospi, _r, cos_bit); + iadst16_stage9_avx2(output, x1); +} + +static void iadst16_low8_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m256i x1[16]; + x1[1] = input[0]; + x1[3] = input[2]; + x1[5] = input[4]; + x1[7] = input[6]; + x1[8] = input[7]; + x1[10] = input[5]; + x1[12] = input[3]; + x1[14] = input[1]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); + btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]); + btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]); + btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]); + btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]); + btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]); + btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]); + btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]); + + iadst16_stage3_avx2(x1); + iadst16_stage4_avx2(x1, cospi, _r, cos_bit); + iadst16_stage5_avx2(x1); + iadst16_stage6_avx2(x1, cospi, _r, cos_bit); + iadst16_stage7_avx2(x1); + iadst16_stage8_avx2(x1, cospi, _r, cos_bit); + iadst16_stage9_avx2(output, x1); +} + +static void iadst16_low1_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + + // stage 1 + __m256i x1[16]; + x1[1] = input[0]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); + + // stage 3 + x1[8] = x1[0]; + x1[9] = x1[1]; + + // stage 4 + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit); + + // stage 5 + x1[4] = x1[0]; + x1[5] = x1[1]; + + x1[12] = x1[8]; + x1[13] = x1[9]; + + // stage 6 + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit); + + // stage 7 + x1[2] = x1[0]; + x1[3] = x1[1]; + x1[6] = x1[4]; + x1[7] = x1[5]; + x1[10] = x1[8]; + x1[11] = x1[9]; + x1[14] = x1[12]; + x1[15] = x1[13]; + + iadst16_stage8_avx2(x1, cospi, _r, cos_bit); + iadst16_stage9_avx2(output, x1); +} + +static INLINE void idct32_high16_stage3_avx2(__m256i *x) { + btf_16_adds_subs_avx2(&x[16], &x[17]); + 
btf_16_adds_subs_avx2(&x[19], &x[18]); + btf_16_adds_subs_avx2(&x[20], &x[21]); + btf_16_adds_subs_avx2(&x[23], &x[22]); + btf_16_adds_subs_avx2(&x[24], &x[25]); + btf_16_adds_subs_avx2(&x[27], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[29]); + btf_16_adds_subs_avx2(&x[31], &x[30]); +} + +static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); +} + +static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x[16], &x[19]); + btf_16_adds_subs_avx2(&x[17], &x[18]); + btf_16_adds_subs_avx2(&x[23], &x[20]); + btf_16_adds_subs_avx2(&x[22], &x[21]); + btf_16_adds_subs_avx2(&x[24], &x[27]); + btf_16_adds_subs_avx2(&x[25], &x[26]); + btf_16_adds_subs_avx2(&x[31], &x[28]); + btf_16_adds_subs_avx2(&x[30], &x[29]); +} + +static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[11]); + btf_16_adds_subs_avx2(&x[9], &x[10]); + btf_16_adds_subs_avx2(&x[15], &x[12]); + btf_16_adds_subs_avx2(&x[14], &x[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); +} + +static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + 
btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x[16], &x[23]); + btf_16_adds_subs_avx2(&x[17], &x[22]); + btf_16_adds_subs_avx2(&x[18], &x[21]); + btf_16_adds_subs_avx2(&x[19], &x[20]); + btf_16_adds_subs_avx2(&x[31], &x[24]); + btf_16_adds_subs_avx2(&x[30], &x[25]); + btf_16_adds_subs_avx2(&x[29], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[27]); +} + +static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[15]); + btf_16_adds_subs_avx2(&x[1], &x[14]); + btf_16_adds_subs_avx2(&x[2], &x[13]); + btf_16_adds_subs_avx2(&x[3], &x[12]); + btf_16_adds_subs_avx2(&x[4], &x[11]); + btf_16_adds_subs_avx2(&x[5], &x[10]); + btf_16_adds_subs_avx2(&x[6], &x[9]); + btf_16_adds_subs_avx2(&x[7], &x[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); +} + +static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) { + btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]); + btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]); + btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]); + btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]); + btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]); + btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]); + btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]); + btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]); + btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]); + btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]); + btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]); + btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]); + btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]); + btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]); + btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]); + btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]); +} + +static void idct32_low1_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m256i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + output[0] = x[0]; + output[31] = x[0]; + output[1] = x[1]; + output[30] = x[1]; + output[2] = x[1]; + output[29] = x[1]; + output[3] = x[0]; + output[28] = x[0]; + output[4] = x[0]; + output[27] = x[0]; + output[5] = x[1]; + output[26] = x[1]; + output[6] = x[1]; + output[25] = x[1]; + output[7] = x[0]; + output[24] = x[0]; + output[8] = x[0]; + output[23] = x[0]; + output[9] = x[1]; + output[22] = x[1]; + output[10] = x[1]; + output[21] = x[1]; + output[11] = x[0]; + output[20] = x[0]; + output[12] = x[0]; + output[19] = x[0]; + output[13] = x[1]; + output[18] = x[1]; + output[14] = x[1]; + output[17] = x[1]; + output[15] = x[0]; + output[16] = x[0]; +} + +static void idct32_low8_avx2(const __m256i 
*input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m256i x[32]; + x[0] = input[0]; + x[4] = input[4]; + x[8] = input[2]; + x[12] = input[6]; + x[16] = input[1]; + x[20] = input[5]; + x[24] = input[3]; + x[28] = input[7]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + + // stage 3 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + + // stage 4 + btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct32_high16_stage4_avx2(x, cospi, _r, cos_bit); + + // stage 5 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + idct32_high24_stage5_avx2(x, cospi, _r, cos_bit); + // stage 6 + x[3] = x[0]; + x[2] = x[1]; + idct32_high28_stage6_avx2(x, cospi, _r, cos_bit); + + idct32_stage7_avx2(x, cospi, _r, cos_bit); + idct32_stage8_avx2(x, cospi, _r, cos_bit); + idct32_stage9_avx2(output, x); +} + +static void idct32_low16_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m256i x[32]; + x[0] = input[0]; + x[2] = input[8]; + x[4] = input[4]; + x[6] = input[12]; + x[8] = input[2]; + x[10] = input[10]; + x[12] = input[6]; + x[14] = input[14]; + x[16] = input[1]; + x[18] = input[9]; + x[20] = input[5]; + x[22] = input[13]; + x[24] = input[3]; + x[26] = input[11]; + x[28] = input[7]; + x[30] = input[15]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + + // stage 3 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + idct32_high16_stage3_avx2(x); + + // stage 4 + btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_avx2(&x[8], &x[9]); + btf_16_adds_subs_avx2(&x[11], &x[10]); + btf_16_adds_subs_avx2(&x[12], &x[13]); + btf_16_adds_subs_avx2(&x[15], &x[14]); + idct32_high16_stage4_avx2(x, cospi, _r, cos_bit); + + // stage 5 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_avx2(&x[4], &x[5]); + btf_16_adds_subs_avx2(&x[7], &x[6]); + idct32_high24_stage5_avx2(x, cospi, _r, cos_bit); + + btf_16_adds_subs_avx2(&x[0], &x[3]); + 
btf_16_adds_subs_avx2(&x[1], &x[2]); + idct32_high28_stage6_avx2(x, cospi, _r, cos_bit); + + idct32_stage7_avx2(x, cospi, _r, cos_bit); + idct32_stage8_avx2(x, cospi, _r, cos_bit); + idct32_stage9_avx2(output, x); +} + +static void idct32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); + __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); + __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); + __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); + __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); + __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); + __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); + __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); + __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); + __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); + __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); + __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); + __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); + __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); + __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); + __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); + __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); + __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); + __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); + __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); + __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); + __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); + __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); + __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); + __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + + // stage 1 + __m256i x1[32]; + x1[0] = input[0]; + x1[1] = input[16]; + x1[2] = input[8]; + x1[3] = input[24]; + x1[4] = input[4]; + x1[5] = input[20]; + x1[6] = input[12]; + x1[7] = input[28]; + x1[8] = input[2]; + x1[9] = input[18]; + x1[10] = input[10]; + x1[11] = input[26]; + x1[12] = input[6]; + x1[13] = input[22]; + x1[14] = input[14]; + x1[15] = input[30]; + x1[16] = input[1]; + x1[17] = input[17]; + x1[18] = input[9]; + x1[19] = input[25]; + x1[20] = input[5]; + x1[21] = input[21]; + x1[22] = input[13]; + x1[23] = input[29]; + x1[24] = input[3]; + x1[25] = input[19]; + x1[26] = input[11]; + x1[27] = input[27]; + x1[28] = input[7]; + x1[29] = input[23]; + x1[30] = input[15]; + x1[31] = input[31]; + + // stage 2 + btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, cos_bit); + btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_p46_m18, 
cospi_p18_p46, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, cos_bit); + + // stage 3 + btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit); + idct32_high16_stage3_avx2(x1); + + // stage 4 + btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + idct32_high16_stage4_avx2(x1, cospi, _r, cos_bit); + + // stage 5 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + idct32_high24_stage5_avx2(x1, cospi, _r, cos_bit); + + // stage 6 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + idct32_high28_stage6_avx2(x1, cospi, _r, cos_bit); + + idct32_stage7_avx2(x1, cospi, _r, cos_bit); + idct32_stage8_avx2(x1, cospi, _r, cos_bit); + idct32_stage9_avx2(output, x1); +} + +static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]); + const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); + const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]); + const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit); + btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit); +} + +static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i 
_r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[35]); + btf_16_adds_subs_avx2(&x[33], &x[34]); + btf_16_adds_subs_avx2(&x[39], &x[36]); + btf_16_adds_subs_avx2(&x[38], &x[37]); + btf_16_adds_subs_avx2(&x[40], &x[43]); + btf_16_adds_subs_avx2(&x[41], &x[42]); + btf_16_adds_subs_avx2(&x[47], &x[44]); + btf_16_adds_subs_avx2(&x[46], &x[45]); + btf_16_adds_subs_avx2(&x[48], &x[51]); + btf_16_adds_subs_avx2(&x[49], &x[50]); + btf_16_adds_subs_avx2(&x[55], &x[52]); + btf_16_adds_subs_avx2(&x[54], &x[53]); + btf_16_adds_subs_avx2(&x[56], &x[59]); + btf_16_adds_subs_avx2(&x[57], &x[58]); + btf_16_adds_subs_avx2(&x[63], &x[60]); + btf_16_adds_subs_avx2(&x[62], &x[61]); +} + +static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit); +} + +static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + btf_16_adds_subs_avx2(&x[16], &x[19]); + btf_16_adds_subs_avx2(&x[17], &x[18]); + btf_16_adds_subs_avx2(&x[23], &x[20]); + btf_16_adds_subs_avx2(&x[22], &x[21]); + btf_16_adds_subs_avx2(&x[24], &x[27]); + btf_16_adds_subs_avx2(&x[25], &x[26]); + btf_16_adds_subs_avx2(&x[31], &x[28]); + btf_16_adds_subs_avx2(&x[30], &x[29]); + idct64_stage6_high32_avx2(x, cospi, _r, cos_bit); +} + +static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + 
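+  // Rotate the four middle pairs (x[18..21] against x[26..29]), then run
+  // plain add/sub butterflies across the x[32..63] half.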
btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[39]); + btf_16_adds_subs_avx2(&x[33], &x[38]); + btf_16_adds_subs_avx2(&x[34], &x[37]); + btf_16_adds_subs_avx2(&x[35], &x[36]); + btf_16_adds_subs_avx2(&x[47], &x[40]); + btf_16_adds_subs_avx2(&x[46], &x[41]); + btf_16_adds_subs_avx2(&x[45], &x[42]); + btf_16_adds_subs_avx2(&x[44], &x[43]); + btf_16_adds_subs_avx2(&x[48], &x[55]); + btf_16_adds_subs_avx2(&x[49], &x[54]); + btf_16_adds_subs_avx2(&x[50], &x[53]); + btf_16_adds_subs_avx2(&x[51], &x[52]); + btf_16_adds_subs_avx2(&x[63], &x[56]); + btf_16_adds_subs_avx2(&x[62], &x[57]); + btf_16_adds_subs_avx2(&x[61], &x[58]); + btf_16_adds_subs_avx2(&x[60], &x[59]); +} + +static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_adds_subs_avx2(&x[16], &x[23]); + btf_16_adds_subs_avx2(&x[17], &x[22]); + btf_16_adds_subs_avx2(&x[18], &x[21]); + btf_16_adds_subs_avx2(&x[19], &x[20]); + btf_16_adds_subs_avx2(&x[31], &x[24]); + btf_16_adds_subs_avx2(&x[30], &x[25]); + btf_16_adds_subs_avx2(&x[29], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[27]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit); +} + +static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[15]); + btf_16_adds_subs_avx2(&x[1], &x[14]); + btf_16_adds_subs_avx2(&x[2], &x[13]); + btf_16_adds_subs_avx2(&x[3], &x[12]); + btf_16_adds_subs_avx2(&x[4], &x[11]); + btf_16_adds_subs_avx2(&x[5], &x[10]); + btf_16_adds_subs_avx2(&x[6], &x[9]); + btf_16_adds_subs_avx2(&x[7], &x[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[47]); + btf_16_adds_subs_avx2(&x[33], &x[46]); + btf_16_adds_subs_avx2(&x[34], &x[45]); + btf_16_adds_subs_avx2(&x[35], &x[44]); + btf_16_adds_subs_avx2(&x[36], &x[43]); + btf_16_adds_subs_avx2(&x[37], &x[42]); + btf_16_adds_subs_avx2(&x[38], &x[41]); + btf_16_adds_subs_avx2(&x[39], &x[40]); + btf_16_adds_subs_avx2(&x[63], 
&x[48]); + btf_16_adds_subs_avx2(&x[62], &x[49]); + btf_16_adds_subs_avx2(&x[61], &x[50]); + btf_16_adds_subs_avx2(&x[60], &x[51]); + btf_16_adds_subs_avx2(&x[59], &x[52]); + btf_16_adds_subs_avx2(&x[58], &x[53]); + btf_16_adds_subs_avx2(&x[57], &x[54]); + btf_16_adds_subs_avx2(&x[56], &x[55]); +} + +static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[31]); + btf_16_adds_subs_avx2(&x[1], &x[30]); + btf_16_adds_subs_avx2(&x[2], &x[29]); + btf_16_adds_subs_avx2(&x[3], &x[28]); + btf_16_adds_subs_avx2(&x[4], &x[27]); + btf_16_adds_subs_avx2(&x[5], &x[26]); + btf_16_adds_subs_avx2(&x[6], &x[25]); + btf_16_adds_subs_avx2(&x[7], &x[24]); + btf_16_adds_subs_avx2(&x[8], &x[23]); + btf_16_adds_subs_avx2(&x[9], &x[22]); + btf_16_adds_subs_avx2(&x[10], &x[21]); + btf_16_adds_subs_avx2(&x[11], &x[20]); + btf_16_adds_subs_avx2(&x[12], &x[19]); + btf_16_adds_subs_avx2(&x[13], &x[18]); + btf_16_adds_subs_avx2(&x[14], &x[17]); + btf_16_adds_subs_avx2(&x[15], &x[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit); +} + +static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) { + btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]); + btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]); + btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]); + btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]); + btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]); + btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]); + btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]); + btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]); + btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]); + btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]); + btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]); + btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]); + btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]); + btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]); + btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]); + btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]); + btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]); + btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]); + btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]); + btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]); + btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]); + btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]); + btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]); + btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], 
x[40]); + btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]); + btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]); + btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]); + btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]); + btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]); + btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]); + btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]); + btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]); +} + +static void idct64_low1_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m256i x[32]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 7 + // stage 8 + // stage 9 + // stage 10 + // stage 11 + output[0] = x[0]; + output[63] = x[0]; + output[1] = x[1]; + output[62] = x[1]; + output[2] = x[1]; + output[61] = x[1]; + output[3] = x[0]; + output[60] = x[0]; + output[4] = x[0]; + output[59] = x[0]; + output[5] = x[1]; + output[58] = x[1]; + output[6] = x[1]; + output[57] = x[1]; + output[7] = x[0]; + output[56] = x[0]; + output[8] = x[0]; + output[55] = x[0]; + output[9] = x[1]; + output[54] = x[1]; + output[10] = x[1]; + output[53] = x[1]; + output[11] = x[0]; + output[52] = x[0]; + output[12] = x[0]; + output[51] = x[0]; + output[13] = x[1]; + output[50] = x[1]; + output[14] = x[1]; + output[49] = x[1]; + output[15] = x[0]; + output[48] = x[0]; + output[16] = x[0]; + output[47] = x[0]; + output[17] = x[1]; + output[46] = x[1]; + output[18] = x[1]; + output[45] = x[1]; + output[19] = x[0]; + output[44] = x[0]; + output[20] = x[0]; + output[43] = x[0]; + output[21] = x[1]; + output[42] = x[1]; + output[22] = x[1]; + output[41] = x[1]; + output[23] = x[0]; + output[40] = x[0]; + output[24] = x[0]; + output[39] = x[0]; + output[25] = x[1]; + output[38] = x[1]; + output[26] = x[1]; + output[37] = x[1]; + output[27] = x[0]; + output[36] = x[0]; + output[28] = x[0]; + output[35] = x[0]; + output[29] = x[1]; + output[34] = x[1]; + output[30] = x[1]; + output[33] = x[1]; + output[31] = x[0]; + output[32] = x[0]; +} + +static void idct64_low8_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); + const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = 
pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m256i x[64]; + x[0] = input[0]; + x[8] = input[4]; + x[16] = input[2]; + x[24] = input[6]; + x[32] = input[1]; + x[40] = input[5]; + x[48] = input[3]; + x[56] = input[7]; + + // stage 2 + btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[38] = x[39]; + x[41] = x[40]; + x[46] = x[47]; + x[49] = x[48]; + x[54] = x[55]; + x[57] = x[56]; + x[62] = x[63]; + + // stage 4 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + x[17] = x[16]; + x[22] = x[23]; + x[25] = x[24]; + x[30] = x[31]; + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit); + + // stage 5 + x[9] = x[8]; + x[14] = x[15]; + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); + x[35] = x[32]; + x[34] = x[33]; + x[36] = x[39]; + x[37] = x[38]; + x[43] = x[40]; + x[42] = x[41]; + x[44] = x[47]; + x[45] = x[46]; + x[51] = x[48]; + x[50] = x[49]; + x[52] = x[55]; + x[53] = x[54]; + x[59] = x[56]; + x[58] = x[57]; + x[60] = x[63]; + x[61] = x[62]; + + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); + x[19] = x[16]; + x[18] = x[17]; + x[20] = x[23]; + x[21] = x[22]; + x[27] = x[24]; + x[26] = x[25]; + x[28] = x[31]; + x[29] = x[30]; + idct64_stage6_high32_avx2(x, cospi, _r, cos_bit); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + x[11] = x[8]; + x[10] = x[9]; + x[12] = x[15]; + x[13] = x[14]; + idct64_stage7_high48_avx2(x, cospi, _r, cos_bit); + + // stage 8 + x[7] = x[0]; + x[6] = x[1]; + x[5] = x[2]; + x[4] = x[3]; + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); + + idct64_stage9_avx2(x, cospi, _r, cos_bit); + idct64_stage10_avx2(x, cospi, _r, cos_bit); + idct64_stage11_avx2(output, x); +} + +static void idct64_low16_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m256i x[64]; + x[0] = input[0]; + x[4] = input[8]; + x[8] = input[4]; + x[12] = input[12]; + x[16] = input[2]; + x[20] = input[10]; + x[24] = input[6]; + x[28] 
= input[14]; + x[32] = input[1]; + x[36] = input[9]; + x[40] = input[5]; + x[44] = input[13]; + x[48] = input[3]; + x[52] = input[11]; + x[56] = input[7]; + x[60] = input[15]; + + // stage 2 + btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[34] = x[35]; + x[37] = x[36]; + x[38] = x[39]; + x[41] = x[40]; + x[42] = x[43]; + x[45] = x[44]; + x[46] = x[47]; + x[49] = x[48]; + x[50] = x[51]; + x[53] = x[52]; + x[54] = x[55]; + x[57] = x[56]; + x[58] = x[59]; + x[61] = x[60]; + x[62] = x[63]; + + // stage 4 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + idct64_stage4_high32_avx2(x, cospi, _r, cos_bit); + + // stage 5 + btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct64_stage5_high48_avx2(x, cospi, _r, cos_bit); + + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); + idct64_stage6_high48_avx2(x, cospi, _r, cos_bit); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[11]); + btf_16_adds_subs_avx2(&x[9], &x[10]); + btf_16_adds_subs_avx2(&x[15], &x[12]); + btf_16_adds_subs_avx2(&x[14], &x[13]); + idct64_stage7_high48_avx2(x, cospi, _r, cos_bit); + + // stage 8 + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); + + idct64_stage9_avx2(x, cospi, _r, cos_bit); + idct64_stage10_avx2(x, cospi, _r, cos_bit); + idct64_stage11_avx2(output, x); +} + +static void idct64_low32_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m256i x[64]; 
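+  // Only the even lanes of x[] are loaded in stage 1; the odd lanes are
+  // synthesized later by the half-zero butterflies (btf_16_w16_0_avx2).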
+ x[0] = input[0]; + x[2] = input[16]; + x[4] = input[8]; + x[6] = input[24]; + x[8] = input[4]; + x[10] = input[20]; + x[12] = input[12]; + x[14] = input[28]; + x[16] = input[2]; + x[18] = input[18]; + x[20] = input[10]; + x[22] = input[26]; + x[24] = input[6]; + x[26] = input[22]; + x[28] = input[14]; + x[30] = input[30]; + x[32] = input[1]; + x[34] = input[17]; + x[36] = input[9]; + x[38] = input[25]; + x[40] = input[5]; + x[42] = input[21]; + x[44] = input[13]; + x[46] = input[29]; + x[48] = input[3]; + x[50] = input[19]; + x[52] = input[11]; + x[54] = input[27]; + x[56] = input[7]; + x[58] = input[23]; + x[60] = input[15]; + x[62] = input[31]; + + // stage 2 + btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]); + btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]); + btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]); + btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]); + btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]); + btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]); + btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]); + btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]); + btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + btf_16_adds_subs_avx2(&x[32], &x[33]); + btf_16_adds_subs_avx2(&x[35], &x[34]); + btf_16_adds_subs_avx2(&x[36], &x[37]); + btf_16_adds_subs_avx2(&x[39], &x[38]); + btf_16_adds_subs_avx2(&x[40], &x[41]); + btf_16_adds_subs_avx2(&x[43], &x[42]); + btf_16_adds_subs_avx2(&x[44], &x[45]); + btf_16_adds_subs_avx2(&x[47], &x[46]); + btf_16_adds_subs_avx2(&x[48], &x[49]); + btf_16_adds_subs_avx2(&x[51], &x[50]); + btf_16_adds_subs_avx2(&x[52], &x[53]); + btf_16_adds_subs_avx2(&x[55], &x[54]); + btf_16_adds_subs_avx2(&x[56], &x[57]); + btf_16_adds_subs_avx2(&x[59], &x[58]); + btf_16_adds_subs_avx2(&x[60], &x[61]); + btf_16_adds_subs_avx2(&x[63], &x[62]); + + // stage 4 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + btf_16_adds_subs_avx2(&x[16], &x[17]); + btf_16_adds_subs_avx2(&x[19], &x[18]); + btf_16_adds_subs_avx2(&x[20], &x[21]); + btf_16_adds_subs_avx2(&x[23], &x[22]); + btf_16_adds_subs_avx2(&x[24], &x[25]); + btf_16_adds_subs_avx2(&x[27], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[29]); + btf_16_adds_subs_avx2(&x[31], &x[30]); + idct64_stage4_high32_avx2(x, cospi, _r, cos_bit); + + // stage 5 + 
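+  // From stage 5 onward the low-32 path follows the full 64-point schedule,
+  // delegating to the shared idct64_stage5..stage11 helpers.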
btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_avx2(&x[8], &x[9]); + btf_16_adds_subs_avx2(&x[11], &x[10]); + btf_16_adds_subs_avx2(&x[12], &x[13]); + btf_16_adds_subs_avx2(&x[15], &x[14]); + idct64_stage5_high48_avx2(x, cospi, _r, cos_bit); + + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_avx2(&x[4], &x[5]); + btf_16_adds_subs_avx2(&x[7], &x[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); + idct64_stage6_high48_avx2(x, cospi, _r, cos_bit); + + // stage 7 + btf_16_adds_subs_avx2(&x[0], &x[3]); + btf_16_adds_subs_avx2(&x[1], &x[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[11]); + btf_16_adds_subs_avx2(&x[9], &x[10]); + btf_16_adds_subs_avx2(&x[15], &x[12]); + btf_16_adds_subs_avx2(&x[14], &x[13]); + idct64_stage7_high48_avx2(x, cospi, _r, cos_bit); + + // stage 8 + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); + + // stage 9~11 + idct64_stage9_avx2(x, cospi, _r, cos_bit); + idct64_stage10_avx2(x, cospi, _r, cos_bit); + idct64_stage11_avx2(output, x); +} + +typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, + int8_t cos_bit); + +// 1D functions process 16 pixels at one time. 
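+// The table below is indexed as [tx_size][1-D transform type][eob class]; the
+// eob class (via lowbd_txfm_all_1d_zeros_idx) picks the cheapest kernel that
+// still covers every nonzero coefficient. Illustrative selection for a
+// 32-point pass:
+//   eobx == 0  -> idx 0 -> idct32_low1_avx2   (DC only)
+//   eobx <  8  -> idx 1 -> idct32_low8_avx2
+//   eobx < 16  -> idx 2 -> idct32_low16_avx2
+//   otherwise  -> idx 3 -> idct32_avx2        (full transform)
+// NULL entries are size/type combinations routed to the SSSE3 path instead.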
+static const transform_1d_avx2 + lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { + { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL }, + { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, + idct64_low32_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +// only process w >= 16 h >= 16 +static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m256i buf1[64 * 16]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div16 = txfm_size_col >> 4; + const int buf_size_nonzero_w_div16 = (eobx + 16) >> 4; + const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_avx2 row_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_avx2 col_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0])); + for (int i = 0; i < buf_size_nonzero_h_div16; i++) { + __m256i buf0[64]; + const int32_t *input_row = input + (i << 4) * input_stride; + for (int j = 0; j < buf_size_nonzero_w_div16; ++j) { + __m256i *buf0_cur = buf0 + j * 16; + const int32_t *input_cur = input_row + j * 16; + load_buffer_32bit_to_16bit_w16_avx2(input_cur, input_stride, buf0_cur, + 16); + transpose_16bit_16x16_avx2(buf0_cur, buf0_cur); + } + if (rect_type == 1 || rect_type == -1) { + round_shift_avx2(buf0, buf0, input_stride); // rect special code + } + row_txfm(buf0, buf0, cos_bit_row); + for (int j = 0; j < txfm_size_col; ++j) { + buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0); + } + + __m256i *buf1_cur = buf1 + (i << 4); + if (lr_flip) { + for (int j = 0; j < buf_size_w_div16; ++j) { + __m256i temp[16]; + flip_buf_avx2(buf0 + 16 * j, temp, 16); + int offset = txfm_size_row * (buf_size_w_div16 - 1 - j); + transpose_16bit_16x16_avx2(temp, buf1_cur + offset); + } + } else { + for (int j = 0; j < buf_size_w_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j); + } + } + } + const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1])); + for (int i = 0; i < buf_size_w_div16; i++) { + __m256i *buf1_cur = buf1 + i * txfm_size_row; + col_txfm(buf1_cur, buf1_cur, cos_bit_col); + for 
(int j = 0; j < txfm_size_row; ++j) { + buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1); + } + } + for (int i = 0; i < buf_size_w_div16; i++) { + lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i, + stride, ud_flip, txfm_size_row); + } +} + +static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input, + int stride, int shift, int height, + int txw_idx, int rect_type) { + const int32_t *input_row = input; + const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]); + const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) + + (1 << (NewSqrt2Bits - shift - 1))); + const __m256i one = _mm256_set1_epi16(1); + const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r); + if (rect_type != 1 && rect_type != -1) { + for (int i = 0; i < height; ++i) { + const __m256i src = load_32bit_to_16bit_w16_avx2(input_row); + input_row += stride; + __m256i lo = _mm256_unpacklo_epi16(src, one); + __m256i hi = _mm256_unpackhi_epi16(src, one); + lo = _mm256_madd_epi16(lo, scale__r); + hi = _mm256_madd_epi16(hi, scale__r); + lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm256_packs_epi32(lo, hi); + } + } else { + const __m256i rect_scale = + _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits)); + for (int i = 0; i < height; ++i) { + __m256i src = load_32bit_to_16bit_w16_avx2(input_row); + src = _mm256_mulhrs_epi16(src, rect_scale); + input_row += stride; + __m256i lo = _mm256_unpacklo_epi16(src, one); + __m256i hi = _mm256_unpackhi_epi16(src, one); + lo = _mm256_madd_epi16(lo, scale__r); + hi = _mm256_madd_epi16(hi, scale__r); + lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm256_packs_epi32(lo, hi); + } + } +} + +static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride, + __m256i *buf, int shift, int height, + int txh_idx) { + const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]); + const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1)); + const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1)); + const __m256i one = _mm256_set1_epi16(1); + const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r); + for (int h = 0; h < height; ++h) { + __m256i lo = _mm256_unpacklo_epi16(buf[h], one); + __m256i hi = _mm256_unpackhi_epi16(buf[h], one); + lo = _mm256_madd_epi16(lo, scale_coeff); + hi = _mm256_madd_epi16(hi, scale_coeff); + lo = _mm256_srai_epi32(lo, NewSqrt2Bits); + hi = _mm256_srai_epi32(hi, NewSqrt2Bits); + lo = _mm256_add_epi32(lo, shift__r); + hi = _mm256_add_epi32(hi, shift__r); + lo = _mm256_srai_epi32(lo, -shift); + hi = _mm256_srai_epi32(hi, -shift); + const __m256i x = _mm256_packs_epi32(lo, hi); + write_recon_w16_avx2(x, output); + output += stride; + } +} + +static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input, + uint8_t *output, int stride, + TX_SIZE tx_size, + int32_t eob) { + (void)eob; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int row_max = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + __m256i buf[32]; + for (int i = 0; i < input_stride; i += 16) { + iidentity_row_16xn_avx2(buf, input + i, input_stride, shift[0], row_max, 
+ txw_idx, rect_type); + iidentity_col_16xn_avx2(output + i, stride, buf, shift[1], row_max, + txh_idx); + } +} + +static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col); + const int input_stride = txfm_size_col_notzero; + const int buf_size_w_div16 = (eobx + 16) >> 4; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_avx2 col_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_w_div16; i++) { + __m256i buf0[64]; + iidentity_row_16xn_avx2(buf0, input + (i << 4), input_stride, shift[0], + eoby + 1, txw_idx, rect_type); + col_txfm(buf0, buf0, cos_bit_col); + __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1])); + int k = ud_flip ? (txfm_size_row - 1) : 0; + const int step = ud_flip ? -1 : 1; + for (int j = 0; j < txfm_size_row; ++j, k += step) { + __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift); + write_recon_w16_avx2(res, output + (i << 4) + j * stride); + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m256i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div16 = txfm_size_col >> 4; + const int buf_size_h_div16 = (eoby + 16) >> 4; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_avx2 row_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + + assert(row_txfm != NULL); + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_h_div16; i++) { + __m256i buf0[64]; + const int32_t *input_row = input + i * input_stride * 16; + for (int j = 0; j < AOMMIN(4, buf_size_w_div16); ++j) { + __m256i *buf0_cur = buf0 + j * 16; + load_buffer_32bit_to_16bit_w16_avx2(input_row + j * 16, input_stride, + buf0_cur, 16); + transpose_16bit_16x16_avx2(buf0_cur, buf0_cur); + } + if (rect_type == 1 || rect_type == -1) { + round_shift_avx2(buf0, buf0, input_stride); // rect special code + } + row_txfm(buf0, buf0, cos_bit_row); + round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]); + __m256i *_buf1 = buf1; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div16; ++j) { + __m256i temp[16]; + flip_buf_avx2(buf0 + 16 * j, temp, 16); + transpose_16bit_16x16_avx2(temp, + 
_buf1 + 16 * (buf_size_w_div16 - 1 - j)); + } + } else { + for (int j = 0; j < buf_size_w_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j); + } + } + for (int j = 0; j < buf_size_w_div16; ++j) { + iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride, + buf1 + j * 16, shift[1], 16, txh_idx); + } + } +} + +// for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64 +static INLINE void lowbd_inv_txfm2d_add_universe_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + (void)eob; + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: // ADST in vertical, DCT in horizontal + case DCT_ADST: // DCT in vertical, ADST in horizontal + case ADST_ADST: // ADST in both directions + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + case IDTX: + lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + default: + av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + } +} + +void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + switch (tx_size) { + case TX_4X4: + case TX_8X8: + case TX_4X8: + case TX_8X4: + case TX_8X16: + case TX_16X8: + case TX_4X16: + case TX_16X4: + case TX_8X32: + case TX_32X8: + av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_16X16: + case TX_32X32: + case TX_64X64: + case TX_16X32: + case TX_32X16: + case TX_32X64: + case TX_64X32: + case TX_16X64: + case TX_64X16: + default: + lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + const TX_TYPE tx_type = txfm_param->tx_type; + if (!txfm_param->lossless) { + av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + } else { + av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + } +} diff --git a/libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.h b/libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.h new file mode 100644 index 000000000..f74cbaeaa --- /dev/null +++ b/libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ +#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ + +#include <immintrin.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// half input is zero +#define btf_16_w16_0_avx2(w0, w1, in, out0, out1) \ + { \ + const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \ + const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \ + const __m256i _in = in; \ + out0 = _mm256_mulhrs_epi16(_in, _w0); \ + out1 = _mm256_mulhrs_epi16(_in, _w1); \ + } + +static INLINE void round_shift_avx2(const __m256i *input, __m256i *output, + int size) { + const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8); + for (int i = 0; i < size; ++i) { + output[i] = _mm256_mulhrs_epi16(input[i], scale); + } +} + +static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) { + __m128i pred = _mm_loadu_si128((__m128i const *)(output)); + __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res); + __m128i y = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(u, u), 168)); + _mm_storeu_si128((__m128i *)(output), y); +} + +static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output, + int stride, int flipud, + int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + write_recon_w16_avx2(in[j], output + i * stride); + } +} + +void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob); +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ diff --git a/libs/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c b/libs/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c new file mode 100644 index 000000000..46c051ff8 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c @@ -0,0 +1,2956 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/av1_txfm_sse2.h" + +// TODO(venkatsanampudi@ittiam.com): move this to header file + +// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 +static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + +// TODO(binpengsmail@gmail.com): replace some for loop with do {} while + +static void idct4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + + // stage 1 + __m128i x[4]; + x[0] = input[0]; + x[1] = input[2]; + x[2] = input[1]; + x[3] = input[3]; + + // stage 2 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + + // stage 3 + btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]); + btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); +} + +static void idct4_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + + // stage 1 + __m128i x[4]; + x[0] = input[0]; + x[1] = input[2]; + x[2] = input[1]; + x[3] = input[3]; + + // stage 2 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + + // stage 3 + btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]); + btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); +} + +static void idct8_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 4 + // stage 5 + output[0] = x[0]; + output[7] = x[0]; + output[1] = x[1]; + output[6] = x[1]; + output[2] = x[1]; + output[5] = x[1]; + output[3] = x[0]; + output[4] = x[0]; +} + +static void idct8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = 
pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[0]; + x[1] = input[4]; + x[2] = input[2]; + x[3] = input[6]; + x[4] = input[1]; + x[5] = input[5]; + x[6] = input[3]; + x[7] = input[7]; + + // stage 2 + btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + + // stage 3 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + + // stage 4 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + + // stage 5 + btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]); + btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]); + btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]); + btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]); +} + +static void idct8_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[0]; + x[1] = input[4]; + x[2] = input[2]; + x[3] = input[6]; + x[4] = input[1]; + x[5] = input[5]; + x[6] = input[3]; + x[7] = input[7]; + + // stage 2 + btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + + // stage 3 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + + // stage 4 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + + // stage 5 + btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]); + btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]); + btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]); + btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]); +} + +static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); +} + +static INLINE void 
idct16_stage6_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); +} + +static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) { + btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]); + btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]); + btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]); + btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]); + btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]); + btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]); + btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]); + btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]); +} + +static void idct16_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 5 + // stage 6 + // stage 7 + output[0] = x[0]; + output[15] = x[0]; + output[1] = x[1]; + output[14] = x[1]; + output[2] = x[1]; + output[13] = x[1]; + output[3] = x[0]; + output[12] = x[0]; + output[4] = x[0]; + output[11] = x[0]; + output[5] = x[1]; + output[10] = x[1]; + output[6] = x[1]; + output[9] = x[1]; + output[7] = x[0]; + output[8] = x[0]; +} + +static void idct16_low8_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m128i x[16]; + x[0] = input[0]; + x[2] = input[4]; + x[4] = input[2]; + x[6] = input[6]; + x[8] = input[1]; + x[10] = input[5]; + x[12] = input[3]; + x[14] = input[7]; + + // stage 2 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + + // stage 3 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + + // stage 4 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + + idct16_stage5_sse2(x, cospi, __rounding, cos_bit); + idct16_stage6_sse2(x, cospi, __rounding, cos_bit); + idct16_stage7_sse2(output, x); +} + +static void idct16_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + 
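+  // cos_bit is unused here: the low-bitdepth kernels always evaluate the
+  // cosine table at the fixed INV_COS_BIT precision.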
const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m128i x[16]; + x[0] = input[0]; + x[1] = input[8]; + x[2] = input[4]; + x[3] = input[12]; + x[4] = input[2]; + x[5] = input[10]; + x[6] = input[6]; + x[7] = input[14]; + x[8] = input[1]; + x[9] = input[9]; + x[10] = input[5]; + x[11] = input[13]; + x[12] = input[3]; + x[13] = input[11]; + x[14] = input[7]; + x[15] = input[15]; + + // stage 2 + btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); + btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); + + // stage 3 + btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + + // stage 4 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + + // stage 5~7 + idct16_stage5_sse2(x, cospi, __rounding, cos_bit); + idct16_stage6_sse2(x, cospi, __rounding, cos_bit); + idct16_stage7_sse2(output, x); +} + +static void idct16_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p20_p44 = 
pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[16]; + x[0] = input[0]; + x[1] = input[8]; + x[2] = input[4]; + x[3] = input[12]; + x[4] = input[2]; + x[5] = input[10]; + x[6] = input[6]; + x[7] = input[14]; + x[8] = input[1]; + x[9] = input[9]; + x[10] = input[5]; + x[11] = input[13]; + x[12] = input[3]; + x[13] = input[11]; + x[14] = input[7]; + x[15] = input[15]; + + // stage 2 + btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); + btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); + btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); + btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); + + // stage 3 + btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + + // stage 4 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + + // stage 5 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + + // stage 6 + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + + // stage 7 + idct16_stage7_sse2(output, x); +} + +static INLINE void idct32_high16_stage3_sse2(__m128i *x) { + btf_16_adds_subs_sse2(x[16], x[17]); + btf_16_subs_adds_sse2(x[19], x[18]); + btf_16_adds_subs_sse2(x[20], x[21]); + btf_16_subs_adds_sse2(x[23], x[22]); + btf_16_adds_subs_sse2(x[24], x[25]); + btf_16_subs_adds_sse2(x[27], x[26]); + btf_16_adds_subs_sse2(x[28], x[29]); + btf_16_subs_adds_sse2(x[31], x[30]); +} + +static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + 
int8_t cos_bit) { + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); +} + +static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + btf_16_adds_subs_sse2(x[16], x[19]); + btf_16_adds_subs_sse2(x[17], x[18]); + btf_16_subs_adds_sse2(x[23], x[20]); + btf_16_subs_adds_sse2(x[22], x[21]); + btf_16_adds_subs_sse2(x[24], x[27]); + btf_16_adds_subs_sse2(x[25], x[26]); + btf_16_subs_adds_sse2(x[31], x[28]); + btf_16_subs_adds_sse2(x[30], x[29]); +} + +static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); +} + +static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + btf_16_adds_subs_sse2(x[16], x[23]); + btf_16_adds_subs_sse2(x[17], x[22]); + btf_16_adds_subs_sse2(x[18], x[21]); + btf_16_adds_subs_sse2(x[19], x[20]); + btf_16_subs_adds_sse2(x[31], x[24]); + btf_16_subs_adds_sse2(x[30], x[25]); + btf_16_subs_adds_sse2(x[29], x[26]); + btf_16_subs_adds_sse2(x[28], x[27]); +} + +static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i 
cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[15]); + btf_16_adds_subs_sse2(x[1], x[14]); + btf_16_adds_subs_sse2(x[2], x[13]); + btf_16_adds_subs_sse2(x[3], x[12]); + btf_16_adds_subs_sse2(x[4], x[11]); + btf_16_adds_subs_sse2(x[5], x[10]); + btf_16_adds_subs_sse2(x[6], x[9]); + btf_16_adds_subs_sse2(x[7], x[8]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); +} + +static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) { + btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]); + btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]); + btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]); + btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]); + btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]); + btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]); + btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]); + btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]); + btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]); + btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]); + btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]); + btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]); + btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]); + btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]); + btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]); + btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]); +} + +static void idct32_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + output[0] = x[0]; + output[31] = x[0]; + output[1] = x[1]; + output[30] = x[1]; + output[2] = x[1]; + output[29] = x[1]; + output[3] = x[0]; + output[28] = x[0]; + output[4] = x[0]; + output[27] = x[0]; + output[5] = x[1]; + output[26] = x[1]; + output[6] = x[1]; + output[25] = x[1]; + output[7] = x[0]; + output[24] = x[0]; + output[8] = x[0]; + output[23] = x[0]; + output[9] = x[1]; + output[22] = x[1]; + output[10] = x[1]; + output[21] = x[1]; + output[11] = x[0]; + output[20] = x[0]; + output[12] = x[0]; + output[19] = x[0]; + output[13] = x[1]; + output[18] = x[1]; + output[14] = x[1]; + output[17] = x[1]; + output[15] = x[0]; + output[16] = x[0]; +} + +static void idct32_low8_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m128i x[32]; + x[0] = input[0]; + x[4] = input[4]; + x[8] = input[2]; + x[12] = input[6]; + x[16] = input[1]; + x[20] = input[5]; + x[24] = input[3]; + x[28] = input[7]; + + // stage 2 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + 
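// The low1/low8/low16 variants rely on the eob-based dispatch in their
+ // callers: only the first 1, 8, or 16 input coefficients can be nonzero
+ // here. Rotations whose partner lane is known to be zero reduce to the
+ // one-input btf_16_ssse3() form above, and add/sub butterflies against a
+ // zero operand reduce to plain copies such as x[17] = x[16] below.
+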
+ // stage 3 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + + // stage 4 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + x[3] = x[0]; + x[2] = x[1]; + idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); + + idct32_stage7_sse2(x, cospi, __rounding, cos_bit); + idct32_stage8_sse2(x, cospi, __rounding, cos_bit); + idct32_stage9_sse2(output, x); +} + +static void idct32_low16_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m128i x[32]; + x[0] = input[0]; + x[2] = input[8]; + x[4] = input[4]; + x[6] = input[12]; + x[8] = input[2]; + x[10] = input[10]; + x[12] = input[6]; + x[14] = input[14]; + x[16] = input[1]; + x[18] = input[9]; + x[20] = input[5]; + x[22] = input[13]; + x[24] = input[3]; + x[26] = input[11]; + x[28] = input[7]; + x[30] = input[15]; + + // stage 2 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + + // stage 3 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + idct32_high16_stage3_sse2(x); + + // stage 4 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); + + idct32_stage7_sse2(x, cospi, __rounding, cos_bit); + idct32_stage8_sse2(x, cospi, __rounding, cos_bit); + idct32_stage9_sse2(output, x); +} + +static void idct32_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + const __m128i
cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + + // stage 1 + __m128i x[32]; + x[0] = input[0]; + x[1] = input[16]; + x[2] = input[8]; + x[3] = input[24]; + x[4] = input[4]; + x[5] = input[20]; + x[6] = input[12]; + x[7] = input[28]; + x[8] = input[2]; + x[9] = input[18]; + x[10] = input[10]; + x[11] = input[26]; + x[12] = input[6]; + x[13] = input[22]; + x[14] = input[14]; + x[15] = input[30]; + x[16] = input[1]; + x[17] = input[17]; + x[18] = input[9]; + x[19] = input[25]; + x[20] = input[5]; + x[21] = input[21]; + x[22] = input[13]; + x[23] = input[29]; + x[24] = input[3]; + x[25] = input[19]; + x[26] = input[11]; + x[27] = input[27]; + x[28] = input[7]; + x[29] = input[23]; + x[30] = input[15]; + x[31] = input[31]; + + // stage 2 + btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]); + btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]); + btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]); + btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]); + btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]); + + // stage 3 + btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); + btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); + 
btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); + idct32_high16_stage3_sse2(x); + + // stage 4 + btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); + + // stage 7~9 + idct32_stage7_sse2(x, cospi, __rounding, cos_bit); + idct32_stage8_sse2(x, cospi, __rounding, cos_bit); + idct32_stage9_sse2(output, x); +} + +static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]); + const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); + const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]); + const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); + btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); + btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]); + btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]); + btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); + btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]); + btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]); + btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); +} + +static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); + btf_16_adds_subs_sse2(x[32], x[35]); + btf_16_adds_subs_sse2(x[33], x[34]);
+ btf_16_subs_adds_sse2(x[39], x[36]); + btf_16_subs_adds_sse2(x[38], x[37]); + btf_16_adds_subs_sse2(x[40], x[43]); + btf_16_adds_subs_sse2(x[41], x[42]); + btf_16_subs_adds_sse2(x[47], x[44]); + btf_16_subs_adds_sse2(x[46], x[45]); + btf_16_adds_subs_sse2(x[48], x[51]); + btf_16_adds_subs_sse2(x[49], x[50]); + btf_16_subs_adds_sse2(x[55], x[52]); + btf_16_subs_adds_sse2(x[54], x[53]); + btf_16_adds_subs_sse2(x[56], x[59]); + btf_16_adds_subs_sse2(x[57], x[58]); + btf_16_subs_adds_sse2(x[63], x[60]); + btf_16_subs_adds_sse2(x[62], x[61]); +} + +static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]); +} + +static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + btf_16_adds_subs_sse2(x[16], x[19]); + btf_16_adds_subs_sse2(x[17], x[18]); + btf_16_subs_adds_sse2(x[23], x[20]); + btf_16_subs_adds_sse2(x[22], x[21]); + btf_16_adds_subs_sse2(x[24], x[27]); + btf_16_adds_subs_sse2(x[25], x[26]); + btf_16_subs_adds_sse2(x[31], x[28]); + btf_16_subs_adds_sse2(x[30], x[29]); + idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit); +} + +static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); + btf_16_adds_subs_sse2(x[32], x[39]); + btf_16_adds_subs_sse2(x[33], x[38]); + btf_16_adds_subs_sse2(x[34], x[37]); + btf_16_adds_subs_sse2(x[35], x[36]); + btf_16_subs_adds_sse2(x[47], x[40]); + btf_16_subs_adds_sse2(x[46], x[41]); + btf_16_subs_adds_sse2(x[45], x[42]); + btf_16_subs_adds_sse2(x[44], x[43]); + btf_16_adds_subs_sse2(x[48], x[55]); + btf_16_adds_subs_sse2(x[49], x[54]); + btf_16_adds_subs_sse2(x[50], x[53]); + btf_16_adds_subs_sse2(x[51], x[52]); + btf_16_subs_adds_sse2(x[63], x[56]); + btf_16_subs_adds_sse2(x[62], x[57]); + btf_16_subs_adds_sse2(x[61], x[58]); + btf_16_subs_adds_sse2(x[60], x[59]); +} + +static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi, + const __m128i 
__rounding, + int8_t cos_bit) { + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + btf_16_adds_subs_sse2(x[16], x[23]); + btf_16_adds_subs_sse2(x[17], x[22]); + btf_16_adds_subs_sse2(x[18], x[21]); + btf_16_adds_subs_sse2(x[19], x[20]); + btf_16_subs_adds_sse2(x[31], x[24]); + btf_16_subs_adds_sse2(x[30], x[25]); + btf_16_subs_adds_sse2(x[29], x[26]); + btf_16_subs_adds_sse2(x[28], x[27]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]); +} + +static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[15]); + btf_16_adds_subs_sse2(x[1], x[14]); + btf_16_adds_subs_sse2(x[2], x[13]); + btf_16_adds_subs_sse2(x[3], x[12]); + btf_16_adds_subs_sse2(x[4], x[11]); + btf_16_adds_subs_sse2(x[5], x[10]); + btf_16_adds_subs_sse2(x[6], x[9]); + btf_16_adds_subs_sse2(x[7], x[8]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); + btf_16_adds_subs_sse2(x[32], x[47]); + btf_16_adds_subs_sse2(x[33], x[46]); + btf_16_adds_subs_sse2(x[34], x[45]); + btf_16_adds_subs_sse2(x[35], x[44]); + btf_16_adds_subs_sse2(x[36], x[43]); + btf_16_adds_subs_sse2(x[37], x[42]); + btf_16_adds_subs_sse2(x[38], x[41]); + btf_16_adds_subs_sse2(x[39], x[40]); + btf_16_subs_adds_sse2(x[63], x[48]); + btf_16_subs_adds_sse2(x[62], x[49]); + btf_16_subs_adds_sse2(x[61], x[50]); + btf_16_subs_adds_sse2(x[60], x[51]); + btf_16_subs_adds_sse2(x[59], x[52]); + btf_16_subs_adds_sse2(x[58], x[53]); + btf_16_subs_adds_sse2(x[57], x[54]); + btf_16_subs_adds_sse2(x[56], x[55]); +} + +static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[31]); + btf_16_adds_subs_sse2(x[1], x[30]); + btf_16_adds_subs_sse2(x[2], x[29]); + btf_16_adds_subs_sse2(x[3], x[28]); + btf_16_adds_subs_sse2(x[4], x[27]); + btf_16_adds_subs_sse2(x[5], x[26]); + btf_16_adds_subs_sse2(x[6], x[25]); + btf_16_adds_subs_sse2(x[7], x[24]); + btf_16_adds_subs_sse2(x[8], x[23]); + btf_16_adds_subs_sse2(x[9], x[22]); + btf_16_adds_subs_sse2(x[10], x[21]); + btf_16_adds_subs_sse2(x[11], x[20]); + btf_16_adds_subs_sse2(x[12], x[19]); + btf_16_adds_subs_sse2(x[13], x[18]); + btf_16_adds_subs_sse2(x[14], x[17]); + btf_16_adds_subs_sse2(x[15], x[16]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, 
x[40], x[55], x[40], x[55]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]); +} + +static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) { + btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]); + btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]); + btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]); + btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]); + btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]); + btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]); + btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]); + btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]); + btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]); + btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]); + btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]); + btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]); + btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]); + btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]); + btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]); + btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]); + btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]); + btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]); + btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]); + btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]); + btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]); + btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]); + btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]); + btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]); + btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]); + btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]); + btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]); + btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]); + btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]); + btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]); + btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]); + btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]); +} + +static void idct64_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[32]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 7 + // stage 8 + // stage 9 + // stage 10 + // stage 11 + output[0] = x[0]; + output[63] = x[0]; + output[1] = x[1]; + output[62] = x[1]; + output[2] = x[1]; + output[61] = x[1]; + output[3] = x[0]; + output[60] = x[0]; + output[4] = x[0]; + output[59] = x[0]; + output[5] = x[1]; + output[58] = x[1]; + output[6] = x[1]; + output[57] = x[1]; + output[7] = x[0]; + output[56] = x[0]; + output[8] = x[0]; + output[55] = x[0]; + output[9] = x[1]; + output[54] = x[1]; 
+ output[10] = x[1]; + output[53] = x[1]; + output[11] = x[0]; + output[52] = x[0]; + output[12] = x[0]; + output[51] = x[0]; + output[13] = x[1]; + output[50] = x[1]; + output[14] = x[1]; + output[49] = x[1]; + output[15] = x[0]; + output[48] = x[0]; + output[16] = x[0]; + output[47] = x[0]; + output[17] = x[1]; + output[46] = x[1]; + output[18] = x[1]; + output[45] = x[1]; + output[19] = x[0]; + output[44] = x[0]; + output[20] = x[0]; + output[43] = x[0]; + output[21] = x[1]; + output[42] = x[1]; + output[22] = x[1]; + output[41] = x[1]; + output[23] = x[0]; + output[40] = x[0]; + output[24] = x[0]; + output[39] = x[0]; + output[25] = x[1]; + output[38] = x[1]; + output[26] = x[1]; + output[37] = x[1]; + output[27] = x[0]; + output[36] = x[0]; + output[28] = x[0]; + output[35] = x[0]; + output[29] = x[1]; + output[34] = x[1]; + output[30] = x[1]; + output[33] = x[1]; + output[31] = x[0]; + output[32] = x[0]; +} + +static void idct64_low8_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); + const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[64]; + x[0] = input[0]; + x[8] = input[4]; + x[16] = input[2]; + x[24] = input[6]; + x[32] = input[1]; + x[40] = input[5]; + x[48] = input[3]; + x[56] = input[7]; + + // stage 2 + btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[38] = x[39]; + x[41] = x[40]; + x[46] = x[47]; + x[49] = x[48]; + x[54] = x[55]; + x[57] = x[56]; + x[62] = x[63]; + + // stage 4 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + x[17] = x[16]; + x[22] = x[23]; + x[25] = x[24]; + x[30] = x[31]; + btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); + btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); + btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); + + // stage 5 + x[9] = x[8]; + x[14] = x[15]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + 
btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); + x[35] = x[32]; + x[34] = x[33]; + x[36] = x[39]; + x[37] = x[38]; + x[43] = x[40]; + x[42] = x[41]; + x[44] = x[47]; + x[45] = x[46]; + x[51] = x[48]; + x[50] = x[49]; + x[52] = x[55]; + x[53] = x[54]; + x[59] = x[56]; + x[58] = x[57]; + x[60] = x[63]; + x[61] = x[62]; + + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + x[19] = x[16]; + x[18] = x[17]; + x[20] = x[23]; + x[21] = x[22]; + x[27] = x[24]; + x[26] = x[25]; + x[28] = x[31]; + x[29] = x[30]; + idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + x[11] = x[8]; + x[10] = x[9]; + x[12] = x[15]; + x[13] = x[14]; + idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 8 + x[7] = x[0]; + x[6] = x[1]; + x[5] = x[2]; + x[4] = x[3]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); + + idct64_stage9_sse2(x, cospi, __rounding, cos_bit); + idct64_stage10_sse2(x, cospi, __rounding, cos_bit); + idct64_stage11_sse2(output, x); +} + +static void idct64_low16_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[64]; + x[0] = input[0]; + x[4] = input[8]; + x[8] = input[4]; + x[12] = input[12]; + x[16] = input[2]; + x[20] = input[10]; + x[24] = input[6]; + x[28] = input[14]; + x[32] = input[1]; + x[36] = input[9]; + x[40] = input[5]; + x[44] = input[13]; + x[48] = input[3]; + x[52] = input[11]; + x[56] = input[7]; + x[60] = input[15]; + + // stage 2 + btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[34] = x[35]; + x[37] = x[36]; + x[38] = x[39]; + x[41] = x[40]; + x[42] = x[43]; + x[45] = x[44]; + x[46] = x[47]; + x[49] = x[48]; + x[50] = x[51]; + x[53] = x[52]; + x[54] = x[55]; + x[57] = x[56]; + x[58] = x[59]; + x[61] = x[60]; + x[62] = x[63]; + + // stage 4 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit); 
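+
+ // The 64-point inverse DCT runs eleven stages; the idct64_stage*_high32
+ // and idct64_stage*_high48 helpers factor out the upper coefficient ranges
+ // that are shared across the low8/low16/low32 variants, so each variant
+ // open-codes only the lanes that differ.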
+ + // stage 5 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 8 + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); + + idct64_stage9_sse2(x, cospi, __rounding, cos_bit); + idct64_stage10_sse2(x, cospi, __rounding, cos_bit); + idct64_stage11_sse2(output, x); +} + +static void idct64_low32_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[64]; + x[0] = input[0]; + x[2] = input[16]; + x[4] = input[8]; + x[6] = input[24]; + x[8] = input[4]; + x[10] = input[20]; + x[12] = input[12]; + x[14] = input[28]; + x[16] = input[2]; + x[18] = input[18]; + x[20] = input[10]; + x[22] = input[26]; + x[24] = input[6]; + x[26] = input[22]; + x[28] = input[14]; + x[30] = input[30]; + x[32] = input[1]; + x[34] = input[17]; + x[36] = input[9]; + x[38] = input[25]; + x[40] = input[5]; + x[42] = input[21]; + x[44] = input[13]; + x[46] = input[29]; + x[48] = input[3]; + x[50] = input[19]; + x[52] = input[11]; + x[54] = input[27]; + x[56] = input[7]; + x[58] = input[23]; + x[60] = input[15]; + x[62] = input[31]; + + // stage 2 + btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]); + btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]); + btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]); + btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]); + btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]); + btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]); + btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]); + btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]); + btf_16_ssse3(-cospi[61], 
cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + btf_16_adds_subs_sse2(x[32], x[33]); + btf_16_subs_adds_sse2(x[35], x[34]); + btf_16_adds_subs_sse2(x[36], x[37]); + btf_16_subs_adds_sse2(x[39], x[38]); + btf_16_adds_subs_sse2(x[40], x[41]); + btf_16_subs_adds_sse2(x[43], x[42]); + btf_16_adds_subs_sse2(x[44], x[45]); + btf_16_subs_adds_sse2(x[47], x[46]); + btf_16_adds_subs_sse2(x[48], x[49]); + btf_16_subs_adds_sse2(x[51], x[50]); + btf_16_adds_subs_sse2(x[52], x[53]); + btf_16_subs_adds_sse2(x[55], x[54]); + btf_16_adds_subs_sse2(x[56], x[57]); + btf_16_subs_adds_sse2(x[59], x[58]); + btf_16_adds_subs_sse2(x[60], x[61]); + btf_16_subs_adds_sse2(x[63], x[62]); + + // stage 4 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + btf_16_adds_subs_sse2(x[16], x[17]); + btf_16_subs_adds_sse2(x[19], x[18]); + btf_16_adds_subs_sse2(x[20], x[21]); + btf_16_subs_adds_sse2(x[23], x[22]); + btf_16_adds_subs_sse2(x[24], x[25]); + btf_16_subs_adds_sse2(x[27], x[26]); + btf_16_adds_subs_sse2(x[28], x[29]); + btf_16_subs_adds_sse2(x[31], x[30]); + idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 7 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 8 + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 9~11 + idct64_stage9_sse2(x, cospi, __rounding, cos_bit); + idct64_stage10_sse2(x, cospi, __rounding, cos_bit); + idct64_stage11_sse2(output, x); +} + +static void iadst4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *sinpi = 
sinpi_arr(INV_COS_BIT); + const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); + const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]); + const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]); + const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]); + const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]); + const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]); + const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]); + const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]); + __m128i x0[4]; + x0[0] = input[0]; + x0[1] = input[1]; + x0[2] = input[2]; + x0[3] = input[3]; + + __m128i u[4]; + u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); + u[1] = _mm_unpackhi_epi16(x0[0], x0[2]); + u[2] = _mm_unpacklo_epi16(x0[1], x0[3]); + u[3] = _mm_unpackhi_epi16(x0[1], x0[3]); + + __m128i x1[16]; + x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4 + x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04); + x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 + x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01); + x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02); // x1*sin3 + x3*sin2 + x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02); + x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04); // x1*sin3 - x3*sin4 + x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04); + x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3 + x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03); + x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03); // x3*sin3 + x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03); + x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2 + x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02); + x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01); // -x1*sin3 - x3*sin1 + x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01); + + __m128i x2[8]; + x2[0] = _mm_add_epi32(x1[0], x1[4]); // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2 + x2[1] = _mm_add_epi32(x1[1], x1[5]); + x2[2] = _mm_add_epi32(x1[2], x1[6]); // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4 + x2[3] = _mm_add_epi32(x1[3], x1[7]); + x2[4] = _mm_add_epi32(x1[8], x1[10]); // x0*sin3 -x2*sin3 +x3*sin3 + x2[5] = _mm_add_epi32(x1[9], x1[11]); + x2[6] = _mm_add_epi32(x1[12], x1[14]); // x0*sin4 +x2*sin2 -x1*sin3 -x3*sin1 + x2[7] = _mm_add_epi32(x1[13], x1[15]); + + const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + for (int i = 0; i < 4; ++i) { + __m128i out0 = _mm_add_epi32(x2[2 * i], rounding); + __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding); + out0 = _mm_srai_epi32(out0, INV_COS_BIT); + out1 = _mm_srai_epi32(out1, INV_COS_BIT); + output[i] = _mm_packs_epi32(out0, out1); + } +} + +static void iadst4_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *sinpi = sinpi_arr(INV_COS_BIT); + const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); + const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]); + const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]); + const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]); + const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]); + const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]); + const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]); + const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]); + __m128i x0[4]; + x0[0] = input[0]; + x0[1] = input[1]; + x0[2] = input[2]; + x0[3] = input[3]; + + __m128i u[2]; + u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); + u[1] = _mm_unpacklo_epi16(x0[1], x0[3]); + + __m128i x1[8]; + x1[0] =
_mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4 + x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 + x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02); // x1*sin3 + x3*sin2 + x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04); // x1*sin3 - x3*sin4 + x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3 + x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03); // x3*sin3 + x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2 + x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01); // -x1*sin3 - x3*sin1 + + __m128i x2[4]; + x2[0] = _mm_add_epi32(x1[0], x1[2]); // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2 + x2[1] = _mm_add_epi32(x1[1], x1[3]); // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4 + x2[2] = _mm_add_epi32(x1[4], x1[5]); // x0*sin3 - x2*sin3 + x3*sin3 + x2[3] = _mm_add_epi32(x1[6], x1[7]); // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1 + + const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + for (int i = 0; i < 4; ++i) { + __m128i out0 = _mm_add_epi32(x2[i], rounding); + out0 = _mm_srai_epi32(out0, INV_COS_BIT); + output[i] = _mm_packs_epi32(out0, out0); + } +} + +static void iadst8_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[8]; + x[1] = input[0]; + + // stage 2 + btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]); + + // stage 3 + x[4] = x[0]; + x[5] = x[1]; + + // stage 4 + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + + // stage 5 + x[2] = x[0]; + x[3] = x[1]; + x[6] = x[4]; + x[7] = x[5]; + + // stage 6 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + + // stage 7 + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[4]); + output[2] = x[6]; + output[3] = _mm_subs_epi16(__zero, x[2]); + output[4] = x[3]; + output[5] = _mm_subs_epi16(__zero, x[7]); + output[6] = x[5]; + output[7] = _mm_subs_epi16(__zero, x[1]); +} + +static void iadst8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 =
pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[7]; + x[1] = input[0]; + x[2] = input[5]; + x[3] = input[2]; + x[4] = input[3]; + x[5] = input[4]; + x[6] = input[1]; + x[7] = input[6]; + + // stage 2 + btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]); + + // stage 3 + btf_16_adds_subs_sse2(x[0], x[4]); + btf_16_adds_subs_sse2(x[1], x[5]); + btf_16_adds_subs_sse2(x[2], x[6]); + btf_16_adds_subs_sse2(x[3], x[7]); + + // stage 4 + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + + // stage 5 + btf_16_adds_subs_sse2(x[0], x[2]); + btf_16_adds_subs_sse2(x[1], x[3]); + btf_16_adds_subs_sse2(x[4], x[6]); + btf_16_adds_subs_sse2(x[5], x[7]); + + // stage 6 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + + // stage 7 + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[4]); + output[2] = x[6]; + output[3] = _mm_subs_epi16(__zero, x[2]); + output[4] = x[3]; + output[5] = _mm_subs_epi16(__zero, x[7]); + output[6] = x[5]; + output[7] = _mm_subs_epi16(__zero, x[1]); +} + +static void iadst8_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[7]; + x[1] = input[0]; + x[2] = input[5]; + x[3] = input[2]; + x[4] = input[3]; + x[5] = input[4]; + x[6] = input[1]; + x[7] = input[6]; + + // stage 2 + btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]); + + // stage 3 + btf_16_adds_subs_sse2(x[0], x[4]); + btf_16_adds_subs_sse2(x[1], x[5]); + btf_16_adds_subs_sse2(x[2], x[6]); + btf_16_adds_subs_sse2(x[3], x[7]); + + // stage 4 + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + + // stage 5 + btf_16_adds_subs_sse2(x[0], x[2]); + btf_16_adds_subs_sse2(x[1], x[3]); + btf_16_adds_subs_sse2(x[4], x[6]); + 
btf_16_adds_subs_sse2(x[5], x[7]); + + // stage 6 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + + // stage 7 + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[4]); + output[2] = x[6]; + output[3] = _mm_subs_epi16(__zero, x[2]); + output[4] = x[3]; + output[5] = _mm_subs_epi16(__zero, x[7]); + output[6] = x[5]; + output[7] = _mm_subs_epi16(__zero, x[1]); +} + +static INLINE void iadst16_stage3_ssse3(__m128i *x) { + btf_16_adds_subs_sse2(x[0], x[8]); + btf_16_adds_subs_sse2(x[1], x[9]); + btf_16_adds_subs_sse2(x[2], x[10]); + btf_16_adds_subs_sse2(x[3], x[11]); + btf_16_adds_subs_sse2(x[4], x[12]); + btf_16_adds_subs_sse2(x[5], x[13]); + btf_16_adds_subs_sse2(x[6], x[14]); + btf_16_adds_subs_sse2(x[7], x[15]); +} + +static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); + btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); + btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage5_ssse3(__m128i *x) { + btf_16_adds_subs_sse2(x[0], x[4]); + btf_16_adds_subs_sse2(x[1], x[5]); + btf_16_adds_subs_sse2(x[2], x[6]); + btf_16_adds_subs_sse2(x[3], x[7]); + btf_16_adds_subs_sse2(x[8], x[12]); + btf_16_adds_subs_sse2(x[9], x[13]); + btf_16_adds_subs_sse2(x[10], x[14]); + btf_16_adds_subs_sse2(x[11], x[15]); +} + +static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage7_ssse3(__m128i *x) { + btf_16_adds_subs_sse2(x[0], x[2]); + btf_16_adds_subs_sse2(x[1], x[3]); + btf_16_adds_subs_sse2(x[4], x[6]); + btf_16_adds_subs_sse2(x[5], x[7]); + btf_16_adds_subs_sse2(x[8], x[10]); + btf_16_adds_subs_sse2(x[9], x[11]); + btf_16_adds_subs_sse2(x[12], x[14]); + btf_16_adds_subs_sse2(x[13], x[15]); +} + +static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], 
x[14], x[15]); +} + +static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) { + const __m128i __zero = _mm_setzero_si128(); + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[8]); + output[2] = x[12]; + output[3] = _mm_subs_epi16(__zero, x[4]); + output[4] = x[6]; + output[5] = _mm_subs_epi16(__zero, x[14]); + output[6] = x[10]; + output[7] = _mm_subs_epi16(__zero, x[2]); + output[8] = x[3]; + output[9] = _mm_subs_epi16(__zero, x[11]); + output[10] = x[15]; + output[11] = _mm_subs_epi16(__zero, x[7]); + output[12] = x[5]; + output[13] = _mm_subs_epi16(__zero, x[13]); + output[14] = x[9]; + output[15] = _mm_subs_epi16(__zero, x[1]); +} + +static void iadst16_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + + // stage 1 + __m128i x[16]; + x[1] = input[0]; + + // stage 2 + btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]); + + // stage 3 + x[8] = x[0]; + x[9] = x[1]; + + // stage 4 + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + + // stage 5 + x[4] = x[0]; + x[5] = x[1]; + x[12] = x[8]; + x[13] = x[9]; + + // stage 6 + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + + // stage 7 + x[2] = x[0]; + x[3] = x[1]; + x[6] = x[4]; + x[7] = x[5]; + x[10] = x[8]; + x[11] = x[9]; + x[14] = x[12]; + x[15] = x[13]; + + iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage9_ssse3(output, x); +} + +static void iadst16_low8_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m128i x[16]; + x[1] = input[0]; + x[3] = input[2]; + x[5] = input[4]; + x[7] = input[6]; + x[8] = input[7]; + x[10] = input[5]; + x[12] = input[3]; + x[14] = input[1]; + + // stage 2 + btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]); + btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]); + btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]); + btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]); + btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]); + btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]); + btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]); + btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]); + + // stage 3 + iadst16_stage3_ssse3(x); + iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage5_ssse3(x); + iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage7_ssse3(x); + iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage9_ssse3(output, x); +} +static void iadst16_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + const __m128i cospi_p54_m10 = 
pair_set_epi16(cospi[54], -cospi[10]); + const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m128i x[16]; + x[0] = input[15]; + x[1] = input[0]; + x[2] = input[13]; + x[3] = input[2]; + x[4] = input[11]; + x[5] = input[4]; + x[6] = input[9]; + x[7] = input[6]; + x[8] = input[7]; + x[9] = input[8]; + x[10] = input[5]; + x[11] = input[10]; + x[12] = input[3]; + x[13] = input[12]; + x[14] = input[1]; + x[15] = input[14]; + + // stage 2 + btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]); + btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]); + btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]); + btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]); + btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]); + + // stage 3~9 + iadst16_stage3_ssse3(x); + iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage5_ssse3(x); + iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage7_ssse3(x); + iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage9_ssse3(output, x); +} + +static void iadst16_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i 
cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[16]; + x[0] = input[15]; + x[1] = input[0]; + x[2] = input[13]; + x[3] = input[2]; + x[4] = input[11]; + x[5] = input[4]; + x[6] = input[9]; + x[7] = input[6]; + x[8] = input[7]; + x[9] = input[8]; + x[10] = input[5]; + x[11] = input[10]; + x[12] = input[3]; + x[13] = input[12]; + x[14] = input[1]; + x[15] = input[14]; + + // stage 2 + btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]); + btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]); + btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]); + btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]); + btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]); + + // stage 3 + iadst16_stage3_ssse3(x); + + // stage 4 + btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); + btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); + btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); + + // stage 5 + iadst16_stage5_ssse3(x); + + // stage 6 + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); + + // stage 7 + iadst16_stage7_ssse3(x); + + // stage 8 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]); + + // stage 9 + iadst16_stage9_ssse3(output, x); +} + +static void iidentity4_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits)); + const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits)); + for (int i = 0; i < 4; ++i) { + __m128i x = _mm_mulhrs_epi16(input[i], scale); + output[i] = _mm_adds_epi16(x, input[i]); + } +} + +static void iidentity8_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + for (int i = 0; i < 8; ++i) { + output[i] = _mm_adds_epi16(input[i], input[i]); + } +} + +static void iidentity16_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits)); + const __m128i scale = 
    _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
+  for (int i = 0; i < 16; ++i) {
+    __m128i x = _mm_mulhrs_epi16(input[i], scale);
+    __m128i srcx2 = _mm_adds_epi16(input[i], input[i]);
+    output[i] = _mm_adds_epi16(x, srcx2);
+  }
+}
+
+static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
+                                               __m128i res) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
+  return _mm_packus_epi16(x0, x0);
+}
+
+static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
+                                               int stride, int flipud,
+                                               const int height) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  const __m128i zero = _mm_setzero_si128();
+  for (int i = 0; i < height; ++i, j += step) {
+    const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output + i * stride)));
+    __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
+    u = _mm_packus_epi16(u, zero);
+    *((uint32_t *)(output + i * stride)) = _mm_cvtsi128_si32(u);
+  }
+}
+
+static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
+                                               int stride, int flipud,
+                                               const int height) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  for (int i = 0; i < height; ++i, j += step) {
+    const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
+    const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
+    _mm_storel_epi64((__m128i *)(output + i * stride), u);
+  }
+}
+
+// 1D functions process 8 pixels at one time.
+static const transform_1d_ssse3
+    lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
+      { idct4_sse2, iadst4_sse2, iidentity4_ssse3 },
+      { idct8_sse2, iadst8_sse2, iidentity8_sse2 },
+      { idct16_sse2, iadst16_sse2, iidentity16_ssse3 },
+      { idct32_sse2, NULL, NULL },
+      { idct64_low32_ssse3, NULL, NULL },
+    };
+
+// Functions for blocks whose eob is at DC or within the
+// top-left 8x8, 16x16, or 32x32 corner.
+static const transform_1d_ssse3
+    lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+      {
+          { idct4_sse2, idct4_sse2, NULL, NULL },
+          { iadst4_sse2, iadst4_sse2, NULL, NULL },
+          { iidentity4_ssse3, iidentity4_ssse3, NULL, NULL },
+      },
+      { { idct8_low1_ssse3, idct8_sse2, NULL, NULL },
+        { iadst8_low1_ssse3, iadst8_sse2, NULL, NULL },
+        { iidentity8_sse2, iidentity8_sse2, NULL, NULL } },
+      {
+          { idct16_low1_ssse3, idct16_low8_ssse3, idct16_sse2, NULL },
+          { iadst16_low1_ssse3, iadst16_low8_ssse3, iadst16_sse2, NULL },
+          { NULL, NULL, NULL, NULL },
+      },
+      { { idct32_low1_ssse3, idct32_low8_ssse3, idct32_low16_ssse3,
+          idct32_sse2 },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+      { { idct64_low1_ssse3, idct64_low8_ssse3, idct64_low16_ssse3,
+          idct64_low32_ssse3 },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } }
+    };
+
+// 1D functions process 4 pixels at one time.
+// used in 4x4, 4x8, 4x16, 8x4, 16x4 +static const transform_1d_ssse3 + lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = { + { idct4_w4_sse2, iadst4_w4_sse2, iidentity4_ssse3 }, + { idct8_w4_sse2, iadst8_w4_sse2, iidentity8_sse2 }, + { idct16_w4_sse2, iadst16_w4_sse2, iidentity16_ssse3 }, + { NULL, NULL, NULL }, + { NULL, NULL, NULL }, + }; + +static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input, + int stride, int shift, int height, + int txw_idx, int rect_type) { + const int32_t *input_row = input; + const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]); + const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) + + (1 << (NewSqrt2Bits - shift - 1))); + const __m128i one = _mm_set1_epi16(1); + const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding); + if (rect_type != 1 && rect_type != -1) { + for (int i = 0; i < height; ++i) { + const __m128i src = load_32bit_to_16bit(input_row); + input_row += stride; + __m128i lo = _mm_unpacklo_epi16(src, one); + __m128i hi = _mm_unpackhi_epi16(src, one); + lo = _mm_madd_epi16(lo, scale_rounding); + hi = _mm_madd_epi16(hi, scale_rounding); + lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm_packs_epi32(lo, hi); + } + } else { + const __m128i rect_scale = + _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits)); + for (int i = 0; i < height; ++i) { + __m128i src = load_32bit_to_16bit(input_row); + src = _mm_mulhrs_epi16(src, rect_scale); + input_row += stride; + __m128i lo = _mm_unpacklo_epi16(src, one); + __m128i hi = _mm_unpackhi_epi16(src, one); + lo = _mm_madd_epi16(lo, scale_rounding); + hi = _mm_madd_epi16(hi, scale_rounding); + lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm_packs_epi32(lo, hi); + } + } +} + +static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride, + __m128i *buf, int shift, int height, + int txh_idx) { + const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]); + const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1)); + const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1)); + const __m128i one = _mm_set1_epi16(1); + const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding); + const __m128i zero = _mm_setzero_si128(); + for (int h = 0; h < height; ++h) { + __m128i lo = _mm_unpacklo_epi16(buf[h], one); + __m128i hi = _mm_unpackhi_epi16(buf[h], one); + lo = _mm_madd_epi16(lo, scale_coeff); + hi = _mm_madd_epi16(hi, scale_coeff); + lo = _mm_srai_epi32(lo, NewSqrt2Bits); + hi = _mm_srai_epi32(hi, NewSqrt2Bits); + lo = _mm_add_epi32(lo, shift_rounding); + hi = _mm_add_epi32(hi, shift_rounding); + lo = _mm_srai_epi32(lo, -shift); + hi = _mm_srai_epi32(hi, -shift); + __m128i x = _mm_packs_epi32(lo, hi); + + const __m128i pred = _mm_loadl_epi64((__m128i const *)(output)); + x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero)); + const __m128i u = _mm_packus_epi16(x, x); + _mm_storel_epi64((__m128i *)(output), u); + output += stride; + } +} + +static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_SIZE tx_size) { + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int row_max = AOMMIN(32, txfm_size_row); + 
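  // Only 2:1 aspect ratios (rect_type == +/-1) take the extra 1/sqrt(2)
+  // scaling, which iidentity_row_8xn_ssse3 applies while loading each row.
+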
const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + __m128i buf[32]; + + for (int i = 0; i < (input_stride >> 3); ++i) { + iidentity_row_8xn_ssse3(buf, input + 8 * i, input_stride, shift[0], row_max, + txw_idx, rect_type); + iidentity_col_8xn_ssse3(output + 8 * i, stride, buf, shift[1], row_max, + txh_idx); + } +} + +static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[4]; + const TX_SIZE tx_size = TX_4X4; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row); + transpose_16bit_4x4(buf, buf); + row_txfm(buf, buf, cos_bit_row); + if (lr_flip) { + __m128i temp[4]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_4x4(temp, buf); + } else { + transpose_16bit_4x4(buf, buf); + } + col_txfm(buf, buf, cos_bit_col); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred, + __m128i res0, __m128i res1) { + const __m128i zero = _mm_setzero_si128(); + __m128i x0 = _mm_unpacklo_epi8(pred, zero); + __m128i x1 = _mm_unpackhi_epi8(pred, zero); + x0 = _mm_adds_epi16(res0, x0); + x1 = _mm_adds_epi16(res1, x1); + return _mm_packus_epi16(x0, x1); +} + +static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output, + int stride, int flipud, + int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? 
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]); + _mm_storeu_si128((__m128i *)(output + i * stride), u); + } +} + +static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output, + int size) { + const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8); + for (int i = 0; i < size; ++i) { + output[i] = _mm_mulhrs_epi16(input[i], scale); + } +} + +static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m128i buf1[64 * 8]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + __m128i buf0[64]; + const int32_t *input_row = input + i * input_stride * 8; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + __m128i *buf0_cur = buf0 + j * 8; + load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8); + transpose_16bit_8x8(buf0_cur, buf0_cur); + } + if (rect_type == 1 || rect_type == -1) { + round_shift_ssse3(buf0, buf0, input_stride); // rect special code + } + row_txfm(buf0, buf0, cos_bit_row); + round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]); + __m128i *_buf1 = buf1 + i * 8; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + __m128i temp[8]; + flip_buf_sse2(buf0 + 8 * j, temp, 8); + transpose_16bit_8x8(temp, + _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j)); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j); + } + } + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col); + round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]); + } + + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2, + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + const int8_t *shift = 
av1_inv_txfm_shift_ls[tx_size]; + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = (eobx + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; + assert(fun_idx < 5); + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; + + assert(col_txfm != NULL); + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_w_div8; i++) { + __m128i buf0[64]; + iidentity_row_8xn_ssse3(buf0, input + 8 * i, input_stride, shift[0], + eoby + 1, txw_idx, rect_type); + col_txfm(buf0, buf0, cos_bit_col); + __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1])); + int k = ud_flip ? (txfm_size_row - 1) : 0; + const int step = ud_flip ? -1 : 1; + uint8_t *out = output + 8 * i; + for (int j = 0; j < txfm_size_row; ++j, k += step) { + const __m128i v = _mm_loadl_epi64((__m128i const *)(out)); + __m128i res = _mm_mulhrs_epi16(buf0[k], mshift); + const __m128i u = lowbd_get_recon_8x8_sse2(v, res); + _mm_storel_epi64((__m128i *)(out), u); + out += stride; + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; + + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_h_div8; i++) { + __m128i buf0[64]; + const int32_t *input_row = input + i * input_stride * 8; + for (int j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) { + __m128i *buf0_cur = buf0 + j * 8; + load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8); + transpose_16bit_8x8(buf0_cur, buf0_cur); + } + if (rect_type == 1 || rect_type == -1) { + round_shift_ssse3(buf0, buf0, input_stride); // rect special code + } + row_txfm(buf0, buf0, cos_bit_row); + round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]); + __m128i *_buf1 = buf1; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + __m128i temp[8]; + flip_buf_sse2(buf0 + 8 * j, temp, 8); + transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j)); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j); + } + } + + for (int j = 0; j < buf_size_w_div8; ++j) { + iidentity_col_8xn_ssse3(output + i * 8 * 
stride + j * 8, stride, + buf1 + j * 8, shift[1], 8, txh_idx); + } + } +} + +// for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64 +static INLINE void lowbd_inv_txfm2d_add_universe_ssse3( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_type) { + case DCT_DCT: + lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + case IDTX: + lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + default: + lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[8]; + const TX_SIZE tx_size = TX_4X8; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row); + transpose_16bit_4x8(buf, buf); + round_shift_ssse3(buf, buf, txfm_size_col); // rect special code + row_txfm(buf, buf, cos_bit_row); + // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0 + if (lr_flip) { + __m128i temp[4]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_8x4(temp, buf); + } else { + transpose_16bit_8x4(buf, buf); + } + col_txfm(buf, buf, cos_bit_col); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[8]; + const TX_SIZE tx_size = TX_8X4; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row); + transpose_16bit_8x4(buf, buf); + round_shift_ssse3(buf, buf, txfm_size_col); // rect special code + row_txfm(buf, buf, cos_bit_row); + // round_shift_16bit_ssse3(buf, 
txfm_size_col, shift[0]); // shift[0] is 0 + if (lr_flip) { + __m128i temp[8]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_4x8(temp, buf); + } else { + transpose_16bit_4x8(buf, buf); + } + col_txfm(buf, buf, cos_bit_col); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[16]; + const TX_SIZE tx_size = TX_4X16; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + const int row_one_loop = 8; + for (int i = 0; i < 2; ++i) { + const int32_t *input_cur = input + i * txfm_size_col * row_one_loop; + __m128i *buf_cur = buf + i * row_one_loop; + load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur, + row_one_loop); + transpose_16bit_4x8(buf_cur, buf_cur); + if (row_txfm == iidentity4_ssse3) { + const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1)); + const __m128i ones = _mm_set1_epi16(1); + for (int j = 0; j < 4; ++j) { + const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones); + const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones); + const __m128i buf_32_lo = + _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1)); + const __m128i buf_32_hi = + _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1)); + buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi); + } + } else { + row_txfm(buf_cur, buf_cur, cos_bit_row); + round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]); + } + if (lr_flip) { + __m128i temp[8]; + flip_buf_sse2(buf_cur, temp, txfm_size_col); + transpose_16bit_8x4(temp, buf_cur); + } else { + transpose_16bit_8x4(buf_cur, buf_cur); + } + } + col_txfm(buf, buf, cos_bit_col); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[16]; + const TX_SIZE tx_size = TX_16X4; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const int row_one_loop = 8; + 
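  // Load the 16-wide block as 8x4 tiles: narrow the 32-bit coefficients to
+  // 16-bit and transpose each tile ahead of the row transform.
+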
for (int i = 0; i < buf_size_w_div8; ++i) { + const int32_t *input_cur = input + i * row_one_loop; + __m128i *buf_cur = buf + i * row_one_loop; + load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur, + txfm_size_row); + transpose_16bit_8x4(buf_cur, buf_cur); + } + if (row_txfm == iidentity16_ssse3) { + const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1)); + const __m128i ones = _mm_set1_epi16(1); + for (int j = 0; j < 16; ++j) { + const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones); + const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones); + const __m128i buf_32_lo = + _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1)); + const __m128i buf_32_hi = + _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1)); + buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi); + } + } else { + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); + } + if (lr_flip) { + __m128i temp[16]; + flip_buf_sse2(buf, temp, 16); + transpose_16bit_4x8(temp, buf); + transpose_16bit_4x8(temp + 8, buf + 8); + } else { + transpose_16bit_4x8(buf, buf); + transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop); + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col); + round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]); + } + lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4); + lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4); +} + +void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_size) { + case TX_4X4: + lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_4X8: + lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_8X4: + lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_4X16: + lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_16X4: + lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + default: + lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + if (!txfm_param->lossless) { + const TX_TYPE tx_type = txfm_param->tx_type; + av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + + } else { + av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + } +} diff --git a/libs/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h b/libs/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h new file mode 100644 index 000000000..7d5055deb --- /dev/null +++ b/libs/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+
+#include <emmintrin.h>  // SSE2
+#include <tmmintrin.h>  // SSSE3
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define btf_16_ssse3(w0, w1, in, out0, out1)    \
+  do {                                          \
+    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
+    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
+    const __m128i _in = in;                     \
+    out0 = _mm_mulhrs_epi16(_in, _w0);          \
+    out1 = _mm_mulhrs_epi16(_in, _w1);          \
+  } while (0)
+
+#define btf_16_adds_subs_sse2(in0, in1) \
+  do {                                  \
+    const __m128i _in0 = in0;           \
+    const __m128i _in1 = in1;           \
+    in0 = _mm_adds_epi16(_in0, _in1);   \
+    in1 = _mm_subs_epi16(_in0, _in1);   \
+  } while (0)
+
+#define btf_16_subs_adds_sse2(in0, in1) \
+  do {                                  \
+    const __m128i _in0 = in0;           \
+    const __m128i _in1 = in1;           \
+    in1 = _mm_subs_epi16(_in0, _in1);   \
+    in0 = _mm_adds_epi16(_in0, _in1);   \
+  } while (0)
+
+#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \
+  do {                                                  \
+    const __m128i _in0 = in0;                           \
+    const __m128i _in1 = in1;                           \
+    out0 = _mm_adds_epi16(_in0, _in1);                  \
+    out1 = _mm_subs_epi16(_in0, _in1);                  \
+  } while (0)
+
+static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
+  if (bit < 0) {
+    const __m128i scale = _mm_set1_epi16(1 << (15 + bit));
+    for (int i = 0; i < size; ++i) {
+      in[i] = _mm_mulhrs_epi16(in[i], scale);
+    }
+  } else if (bit > 0) {
+    for (int i = 0; i < size; ++i) {
+      in[i] = _mm_slli_epi16(in[i], bit);
+    }
+  }
+}
+
+// 1D itx types
+enum {
+  IDCT_1D,
+  IADST_1D,
+  IFLIPADST_1D = IADST_1D,
+  IIDENTITY_1D,
+  ITX_TYPES_1D,
+} UENUM1BYTE(ITX_TYPE_1D);
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
+  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
+  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
+  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
+  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_16x16_default[16]) = {
+  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+  0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_32x32_default[32]) = {
+  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+  0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_16x32_default[32]) = {
+  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+  0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f,
0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x16_default[16]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, + 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = { + 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f, +}; + +DECLARE_ALIGNED(16, static const int16_t *, + av1_eob_to_eobxy_default[TX_SIZES_ALL]) = { + NULL, + av1_eob_to_eobxy_8x8_default, + av1_eob_to_eobxy_16x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x16_default, + av1_eob_to_eobxy_16x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x32_default, + av1_eob_to_eobxy_32x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, +}; + +static const int lowbd_txfm_all_1d_zeros_idx[32] = { + 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +// Transform block width in log2 for eob (size of 64 map to 32) +static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = { + 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5, +}; + +static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + if (eob == 1) { + *eobx = 0; + *eoby = 0; + return; + } + + const int tx_w_log2 = tx_size_wide_log2_eob[tx_size]; + const int eob_row = (eob - 1) >> tx_w_log2; + const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row]; + *eobx = eobxy & 0xFF; + *eoby = eobxy >> 8; +} + +static int eob_fill[32] = { + 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, +}; + +static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_col = tx_size_wide[tx_size]; + const int eobx_max = AOMMIN(32, txfm_size_col) - 1; + *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob]; + const int temp_eoby = eob / (eobx_max + 1); + assert(temp_eoby < 32); + *eoby = eob_fill[temp_eoby]; +} + +static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_row = tx_size_high[tx_size]; + const int eoby_max = AOMMIN(32, txfm_size_row) - 1; + *eobx = eob / (eoby_max + 1); + *eoby = (eob >= eoby_max) ? 
      eoby_max : eob_fill[eob];
+}
+
+typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output,
+                                   int8_t cos_bit);
+
+void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type,
+                                    TX_SIZE tx_size, int eob);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
diff --git a/libs/libaom/src/av1/common/x86/av1_txfm_sse2.h b/libs/libaom/src/av1/common/x86/av1_txfm_sse2.h
new file mode 100644
index 000000000..77aeb6eb1
--- /dev/null
+++ b/libs/libaom/src/av1/common/x86/av1_txfm_sse2.h
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE void btf_16_w4_sse2(
+    const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
+    const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
+    __m128i *const out0, __m128i *const out1) {
+  const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1);
+  const __m128i u0 = _mm_madd_epi16(t0, *w0);
+  const __m128i v0 = _mm_madd_epi16(t0, *w1);
+  const __m128i a0 = _mm_add_epi32(u0, __rounding);
+  const __m128i b0 = _mm_add_epi32(v0, __rounding);
+  const __m128i c0 = _mm_srai_epi32(a0, cos_bit);
+  const __m128i d0 = _mm_srai_epi32(b0, cos_bit);
+
+  *out0 = _mm_packs_epi32(c0, c0);
+  *out1 = _mm_packs_epi32(d0, c0);
+}
+
+#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \
+  {                                                  \
+    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
+    __m128i u0 = _mm_madd_epi16(t0, w0);             \
+    __m128i v0 = _mm_madd_epi16(t0, w1);             \
+                                                     \
+    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
+    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
+                                                     \
+    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
+    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
+                                                     \
+    out0 = _mm_packs_epi32(c0, c0);                  \
+    out1 = _mm_packs_epi32(d0, d0);                  \
+  }
+
+#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
+  {                                               \
+    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
+    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
+    __m128i u0 = _mm_madd_epi16(t0, w0);          \
+    __m128i u1 = _mm_madd_epi16(t1, w0);          \
+    __m128i v0 = _mm_madd_epi16(t0, w1);          \
+    __m128i v1 = _mm_madd_epi16(t1, w1);          \
+                                                  \
+    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
+    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
+    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
+    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
+                                                  \
+    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
+    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
+    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
+    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
+                                                  \
+    out0 = _mm_packs_epi32(c0, c1);               \
+    out1 = _mm_packs_epi32(d0, d1);               \
+  }
+
+static INLINE __m128i load_16bit_to_16bit(const int16_t *a) {
+  return _mm_load_si128((const __m128i *)a);
+}
+
+static INLINE
__m128i load_32bit_to_16bit(const int32_t *a) { + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); +} + +static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) { + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, a_low); +} + +// Store 4 16 bit values. Sign extend the values. +static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) { + const __m128i a_lo = _mm_unpacklo_epi16(a, a); + const __m128i a_1 = _mm_srai_epi32(a_lo, 16); + _mm_store_si128((__m128i *)b, a_1); +} + +// Store 8 16 bit values. Sign extend the values. +static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) { + const __m128i a_lo = _mm_unpacklo_epi16(a, a); + const __m128i a_hi = _mm_unpackhi_epi16(a, a); + const __m128i a_1 = _mm_srai_epi32(a_lo, 16); + const __m128i a_2 = _mm_srai_epi32(a_hi, 16); + _mm_store_si128((__m128i *)b, a_1); + _mm_store_si128((__m128i *)(b + 4), a_2); +} + +static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) { + const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1)); + const __m128i b = _mm_madd_epi16(a, scale_rounding); + return _mm_srai_epi32(b, NewSqrt2Bits); +} + +static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a, + int32_t *const b) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a_lo = _mm_unpacklo_epi16(a, one); + const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); + _mm_store_si128((__m128i *)b, b_lo); +} + +static INLINE void store_rect_16bit_to_32bit(const __m128i a, + int32_t *const b) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a_lo = _mm_unpacklo_epi16(a, one); + const __m128i a_hi = _mm_unpackhi_epi16(a, one); + const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); + const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2); + _mm_store_si128((__m128i *)b, b_lo); + _mm_store_si128((__m128i *)(b + 4), b_hi); +} + +static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in, + const int stride, + __m128i *const out, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride)); + } +} + +static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in, + const int stride, + __m128i *const out, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride)); + } +} + +static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_16bit_to_16bit(in + i * stride); + } +} + +static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in, + int stride, __m128i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride); + } +} + +static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit(in + i * stride); + } +} + +static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit_w4(in + i * stride); + } +} + +static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in, + int stride, __m128i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = 
load_32bit_to_16bit(in + i * stride); + } +} + +static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_16bit_to_32bit_w4(in[i], out + i * stride); + } +} + +static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_16bit_to_32bit(in[i], out + i * stride); + } +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_w4(in[i], out + i * stride); + } +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit(in[i], out + i * stride); + } +} + +static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in, + uint16_t *out, + const int stride) { + for (int i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)(out + i * stride), in[i]); + } +} + +static INLINE void round_shift_16bit(__m128i *in, int size, int bit) { + if (bit < 0) { + bit = -bit; + __m128i rounding = _mm_set1_epi16(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[i] = _mm_adds_epi16(in[i], rounding); + in[i] = _mm_srai_epi16(in[i], bit); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[i] = _mm_slli_epi16(in[i], bit); + } + } +} + +static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + +void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void 
av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output, + int8_t cos_bit); + +typedef struct { + transform_1d_sse2 col, row; // vertical and horizontal +} transform_2d_sse2; + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ diff --git a/libs/libaom/src/av1/common/x86/av1_txfm_sse4.c b/libs/libaom/src/av1/common/x86/av1_txfm_sse4.c new file mode 100644 index 000000000..65ccd1952 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/av1_txfm_sse4.c @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" + +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/av1_txfm_sse4.h" + +void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit) { + __m128i *const vec = (__m128i *)arr; + const int vec_size = size >> 2; + av1_round_shift_array_32_sse4_1(vec, vec, vec_size, bit); +} diff --git a/libs/libaom/src/av1/common/x86/av1_txfm_sse4.h b/libs/libaom/src/av1/common/x86/av1_txfm_sse4.h new file mode 100644 index 000000000..6cad821b1 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/av1_txfm_sse4.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ +#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ + +#include <smmintrin.h> + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) { + __m128i tmp, round; + round = _mm_set1_epi32(1 << (bit - 1)); + tmp = _mm_add_epi32(vec, round); + return _mm_srai_epi32(tmp, bit); +} + +static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input, + __m128i *output, + const int size, + const int bit) { + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + output[i] = av1_round_shift_32_sse4_1(input[i], bit); + } + } else { + int i; + for (i = 0; i < size; i++) { + output[i] = _mm_slli_epi32(input[i], -bit); + } + } +} + +static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input, + __m128i *output, + const int size, + const int bit, + const int val) { + const __m128i sqrt2 = _mm_set1_epi32(val); + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + const __m128i r0 = av1_round_shift_32_sse4_1(input[i], bit); + const __m128i r1 = _mm_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits); + } + } else { + int i; + for (i = 0; i < size; i++) { + const __m128i r0 = _mm_slli_epi32(input[i], -bit); + const __m128i r1 = _mm_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits); + } + } +} + +#ifdef __cplusplus +} +#endif + +#endif  // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ diff --git a/libs/libaom/src/av1/common/x86/cfl_avx2.c b/libs/libaom/src/av1/common/x86/cfl_avx2.c new file mode 100644 index 000000000..d9c6f99d5 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/cfl_avx2.c @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include <immintrin.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/cfl.h" + +#include "av1/common/x86/cfl_simd.h" + +#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \ + cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \ + TX_SIZE tx_size) { \ + static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ + cfl_subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \ + cfl_subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \ + cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \ + cfl_subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \ + cfl_subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \ + cfl_subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \ + cfl_subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \ + cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \ + cfl_subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \ + cfl_subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \ + cfl_subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \ + cfl_subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; \ + return subfn_##sub[tx_size]; \ + } + +/** + * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more + * precise version of a box filter 4:2:0 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + * + * Note: For 4:2:0 luma subsampling, the width will never be greater than 16. + */ +static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width;  // Forever 32 + const __m256i twos = _mm256_set1_epi8(2);  // Thirty two twos + const int luma_stride = input_stride << 1; + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride)); + + __m256i top_16x16 = _mm256_maddubs_epi16(top, twos); + __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos); + __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16); + + _mm256_storeu_si256(row, sum_16x16); + + input += luma_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd) + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. 
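+ * + * For example, two horizontally adjacent luma pixels a and b produce + * (a + b) * 4 == ((a + b) / 2) << 3, i.e. the average of the pair expressed + * in Q3 with no intermediate rounding. 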
+ */ +static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + const __m256i fours = _mm256_set1_epi8(4); // Thirty two fours + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i top_16x16 = _mm256_maddubs_epi16(top, fours); + _mm256_storeu_si256(row, top_16x16); + input += input_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd) + +/** + * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only + * performed on block of width 32. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + */ +static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + const __m256i zeros = _mm256_setzero_si256(); + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i row_lo = _mm256_unpacklo_epi8(top, zeros); + row_lo = _mm256_slli_epi16(row_lo, 3); + __m256i row_hi = _mm256_unpackhi_epi8(top, zeros); + row_hi = _mm256_slli_epi16(row_hi, 3); + + _mm256_storeu_si256(row, row_lo); + _mm256_storeu_si256(row + 1, row_hi); + + input += input_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd) + +#if CONFIG_AV1_HIGHBITDEPTH +/** + * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more + * precise version of a box filter 4:2:0 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + * + * Note: For 4:2:0 luma subsampling, the width will never be greater than 16. + */ +static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + const int luma_stride = input_stride << 1; + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride)); + __m256i sum = _mm256_add_epi16(top, bot); + + __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); + __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride)); + __m256i sum_1 = _mm256_add_epi16(top_1, bot_1); + + __m256i hsum = _mm256_hadd_epi16(sum, sum_1); + hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0)); + hsum = _mm256_add_epi16(hsum, hsum); + + _mm256_storeu_si256(row, hsum); + + input += luma_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd) + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. 
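+ * + * For example, a 2x2 grid of luma pixels a, b, c, d produces + * (a + b + c + d) * 2 == ((a + b + c + d) / 4) << 3, i.e. the average of the + * grid expressed in Q3 with no intermediate rounding. 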
+ * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + * + */ +static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); + __m256i hsum = _mm256_hadd_epi16(top, top_1); + hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0)); + hsum = _mm256_slli_epi16(hsum, 2); + + _mm256_storeu_si256(row, hsum); + + input += input_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, hbd) + +static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); + _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3)); + _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3)); + input += input_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12, + __m256i alpha_sign, __m256i dc_q0) { + __m256i ac_q3 = _mm256_loadu_si256(input); + __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3); + __m256i scaled_luma_q0 = + _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12); + scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign); + return _mm256_add_epi16(scaled_luma_q0, dc_q0); +} + +static INLINE void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3, + uint8_t *dst, int dst_stride, + int alpha_q3, int width, int height) { + (void)width; + const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3); + const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9); + const __m256i dc_q0 = _mm256_set1_epi16(*dst); + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + + do { + __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); + res = _mm256_packus_epi16(res, next); + res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_storeu_si256((__m256i *)dst, res); + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_PREDICT_X(avx2, 32, 8, lbd); +CFL_PREDICT_X(avx2, 32, 16, lbd); +CFL_PREDICT_X(avx2, 32, 32, lbd); + +cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) { + static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = { + cfl_predict_lbd_4x4_ssse3, /* 4x4 */ + cfl_predict_lbd_8x8_ssse3, /* 8x8 */ + cfl_predict_lbd_16x16_ssse3, /* 16x16 */ + cfl_predict_lbd_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_predict_lbd_4x8_ssse3, /* 4x8 */ + cfl_predict_lbd_8x4_ssse3, /* 8x4 */ + cfl_predict_lbd_8x16_ssse3, /* 8x16 */ + cfl_predict_lbd_16x8_ssse3, /* 16x8 */ + cfl_predict_lbd_16x32_ssse3, /* 16x32 */ + 
cfl_predict_lbd_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_predict_lbd_4x16_ssse3, /* 4x16 */ + cfl_predict_lbd_16x4_ssse3, /* 16x4 */ + cfl_predict_lbd_8x32_ssse3, /* 8x32 */ + cfl_predict_lbd_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the + // function pointer array out of bounds. + return pred[tx_size % TX_SIZES_ALL]; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static __m256i highbd_max_epi16(int bd) { + const __m256i neg_one = _mm256_set1_epi16(-1); + // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) + return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one); +} + +static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) { + return _mm256_max_epi16(_mm256_min_epi16(u, max), zero); +} + +static INLINE void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3, + uint16_t *dst, int dst_stride, + int alpha_q3, int bd, int width, + int height) { + // Use SSSE3 version for smaller widths + assert(width == 16 || width == 32); + const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3); + const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9); + const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst); + const __m256i max = highbd_max_epi16(bd); + + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + do { + const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + _mm256_storeu_si256((__m256i *)dst, + highbd_clamp_epi16(res, _mm256_setzero_si256(), max)); + if (width == 32) { + const __m256i res_1 = + predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); + _mm256_storeu_si256( + (__m256i *)(dst + 16), + highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max)); + } + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_PREDICT_X(avx2, 16, 4, hbd) +CFL_PREDICT_X(avx2, 16, 8, hbd) +CFL_PREDICT_X(avx2, 16, 16, hbd) +CFL_PREDICT_X(avx2, 16, 32, hbd) +CFL_PREDICT_X(avx2, 32, 8, hbd) +CFL_PREDICT_X(avx2, 32, 16, hbd) +CFL_PREDICT_X(avx2, 32, 32, hbd) + +cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) { + static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = { + cfl_predict_hbd_4x4_ssse3, /* 4x4 */ + cfl_predict_hbd_8x8_ssse3, /* 8x8 */ + cfl_predict_hbd_16x16_avx2, /* 16x16 */ + cfl_predict_hbd_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_predict_hbd_4x8_ssse3, /* 4x8 */ + cfl_predict_hbd_8x4_ssse3, /* 8x4 */ + cfl_predict_hbd_8x16_ssse3, /* 8x16 */ + cfl_predict_hbd_16x8_avx2, /* 16x8 */ + cfl_predict_hbd_16x32_avx2, /* 16x32 */ + cfl_predict_hbd_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_predict_hbd_4x16_ssse3, /* 4x16 */ + cfl_predict_hbd_16x4_avx2, /* 16x4 */ + cfl_predict_hbd_8x32_ssse3, /* 8x32 */ + cfl_predict_hbd_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the + // function pointer array out of bounds. + return pred[tx_size % TX_SIZES_ALL]; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// Returns a vector where all the (32-bits) elements are the sum of all the +// lanes in a. 
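+// For reference, treating a as eight int32_t lanes a[0..7], the shuffle +// sequence below computes s = a[0] + a[1] + ... + a[7] and returns a vector +// with every lane equal to s; in scalar form: +//   int32_t s = a[0] + a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7]; +//   for (int i = 0; i < 8; ++i) out[i] = s; +// (a[] and out[] are hypothetical int32_t views of the input and result +// vectors, shown only to make the intent of the shuffles explicit.) 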
+static INLINE __m256i fill_sum_epi32(__m256i a) { + // Given that a == [A, B, C, D, E, F, G, H] + a = _mm256_hadd_epi32(a, a); + // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H + // a == [A', C', A', C', E', G', E', G'] + a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)); + // a == [A', C', E', G', A', C', E', G'] + a = _mm256_hadd_epi32(a, a); + // Given that A'' == A' + C' and E'' == E' + G' + // a == [A'', E'', A'', E'', A'', E'', A'', E''] + return _mm256_hadd_epi32(a, a); + // Given that A''' == A'' + E'' + // a == [A''', A''', A''', A''', A''', A''', A''', A'''] +} + +static INLINE __m256i _mm256_addl_epi16(__m256i a) { + return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()), + _mm256_unpackhi_epi16(a, _mm256_setzero_si256())); +} + +static INLINE void subtract_average_avx2(const uint16_t *src_ptr, + int16_t *dst_ptr, int width, + int height, int round_offset, + int num_pel_log2) { + // Use SSE2 version for smaller widths + assert(width == 16 || width == 32); + + const __m256i *src = (__m256i *)src_ptr; + const __m256i *const end = src + height * CFL_BUF_LINE_I256; + // To maximize usage of the AVX2 registers, we sum two rows per loop + // iteration + const int step = 2 * CFL_BUF_LINE_I256; + + __m256i sum = _mm256_setzero_si256(); + // For width 32, we use a second sum accumulator to reduce accumulator + // dependencies in the loop. + __m256i sum2; + if (width == 32) sum2 = _mm256_setzero_si256(); + + do { + // Add top row to the bottom row + __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src), + _mm256_loadu_si256(src + CFL_BUF_LINE_I256)); + sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0)); + if (width == 32) { /* Don't worry: this "if" gets optimized out. */ + // Add the second part of the top row to the second part of the bottom row + __m256i l1 = + _mm256_add_epi16(_mm256_loadu_si256(src + 1), + _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256)); + sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1)); + } + src += step; + } while (src < end); + // Combine both sum accumulators + if (width == 32) sum = _mm256_add_epi32(sum, sum2); + + __m256i fill = fill_sum_epi32(sum); + + __m256i avg_epi16 = _mm256_srli_epi32( + _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2); + avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16); + + // Store and subtract loop + src = (__m256i *)src_ptr; + __m256i *dst = (__m256i *)dst_ptr; + do { + _mm256_storeu_si256(dst, + _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16)); + if (width == 32) { + _mm256_storeu_si256( + dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16)); + } + src += CFL_BUF_LINE_I256; + dst += CFL_BUF_LINE_I256; + } while (src < end); +} + +// Declare wrappers for AVX2 sizes +CFL_SUB_AVG_X(avx2, 16, 4, 32, 6) +CFL_SUB_AVG_X(avx2, 16, 8, 64, 7) +CFL_SUB_AVG_X(avx2, 16, 16, 128, 8) +CFL_SUB_AVG_X(avx2, 16, 32, 256, 9) +CFL_SUB_AVG_X(avx2, 32, 8, 128, 8) +CFL_SUB_AVG_X(avx2, 32, 16, 256, 9) +CFL_SUB_AVG_X(avx2, 32, 32, 512, 10) + +// Based on the observation that for small blocks AVX2 does not outperform +// SSE2, we call the SSE2 code for block widths 4 and 8. 
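+// For reference, each wrapper above expands subtract_average_avx2 so that +// the block average is avg = (sum + round_offset) >> num_pel_log2, with +// round_offset equal to 1 << (num_pel_log2 - 1); e.g. a 16x4 block has 64 +// pixels, so num_pel_log2 == 6 and round_offset == 32, giving a rounded +// rather than truncated mean that is then subtracted from every sample. 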
+cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) { + static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { + cfl_subtract_average_4x4_sse2, /* 4x4 */ + cfl_subtract_average_8x8_sse2, /* 8x8 */ + cfl_subtract_average_16x16_avx2, /* 16x16 */ + cfl_subtract_average_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_subtract_average_4x8_sse2, /* 4x8 */ + cfl_subtract_average_8x4_sse2, /* 8x4 */ + cfl_subtract_average_8x16_sse2, /* 8x16 */ + cfl_subtract_average_16x8_avx2, /* 16x8 */ + cfl_subtract_average_16x32_avx2, /* 16x32 */ + cfl_subtract_average_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_subtract_average_4x16_sse2, /* 4x16 */ + cfl_subtract_average_16x4_avx2, /* 16x4 */ + cfl_subtract_average_8x32_sse2, /* 8x32 */ + cfl_subtract_average_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to + // index the function pointer array out of bounds. + return sub_avg[tx_size % TX_SIZES_ALL]; +} diff --git a/libs/libaom/src/av1/common/x86/cfl_simd.h b/libs/libaom/src/av1/common/x86/cfl_simd.h new file mode 100644 index 000000000..03ae02a92 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/cfl_simd.h @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_ +#define AOM_AV1_COMMON_X86_CFL_SIMD_H_ + +#include "av1/common/blockd.h" + +// SSSE3 version is optimal for width == 4, we reuse them in AVX2 +void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for width == 8, we reuse it in AVX2 +void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for width == 16, we reuse it in AVX2 +void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for width == 4, we reuse them in AVX2 +void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for width == 8, we reuse it in AVX2 +void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for width == 16, we reuse it in AVX2 +void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for width == 4, we reuse them in AVX2 +void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for width == 8, we reuse it in AVX2 +void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for width == 16, we reuse it in AVX2 +void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); + +#if CONFIG_AV1_HIGHBITDEPTH +void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for width == 8, we reuse it in AVX2 +void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is faster for width == 16, we reuse it in AVX2 +void cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for width == 8, we reuse it in AVX2 +void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is faster for width == 16, we reuse it in AVX2 +void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for width == 8, we reuse it in AVX2 +void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is faster for width == 16, we reuse it in AVX2 +void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +#endif  // CONFIG_AV1_HIGHBITDEPTH + +// SSE2 version is optimal for width == 4, we reuse them in AVX2 +void cfl_subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst); + +// SSE2 version is optimal for width == 8, we reuse them in AVX2 +void cfl_subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst); + +void cfl_predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +void cfl_predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +void cfl_predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +#if CONFIG_AV1_HIGHBITDEPTH +void cfl_predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); + +void cfl_predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int 
dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); + +void cfl_predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +#endif  // CONFIG_AV1_HIGHBITDEPTH +#endif  // AOM_AV1_COMMON_X86_CFL_SIMD_H_ diff --git a/libs/libaom/src/av1/common/x86/cfl_sse2.c b/libs/libaom/src/av1/common/x86/cfl_sse2.c new file mode 100644 index 000000000..4783fe098 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/cfl_sse2.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "av1/common/cfl.h" +#include "config/av1_rtcd.h" + +static INLINE __m128i fill_sum_epi32(__m128i l0) { + l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2))); + return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1))); +} + +static INLINE void subtract_average_sse2(const uint16_t *src_ptr, + int16_t *dst_ptr, int width, + int height, int round_offset, + int num_pel_log2) { + const __m128i zeros = _mm_setzero_si128(); + const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset); + const __m128i *src = (__m128i *)src_ptr; + const __m128i *const end = src + height * CFL_BUF_LINE_I128; + const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4)); + + __m128i sum = zeros; + do { + __m128i l0; + if (width == 4) { + l0 = _mm_add_epi16(_mm_loadl_epi64(src), + _mm_loadl_epi64(src + CFL_BUF_LINE_I128)); + __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128), + _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128)); + sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), + _mm_unpacklo_epi16(l1, zeros))); + } else { + if (width == 8) { + l0 = _mm_add_epi16(_mm_loadu_si128(src), + _mm_loadu_si128(src + CFL_BUF_LINE_I128)); + } else { + l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1)); + } + sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), + _mm_unpackhi_epi16(l0, zeros))); + if (width == 32) { + l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3)); + sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), + _mm_unpackhi_epi16(l0, zeros))); + } + } + src += step; + } while (src < end); + + sum = fill_sum_epi32(sum); + + __m128i avg_epi16 = + _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2); + avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16); + + src = (__m128i *)src_ptr; + __m128i *dst = (__m128i *)dst_ptr; + do { + if (width == 4) { + _mm_storel_epi64(dst, 
_mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16)); + } else { + _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16)); + if (width > 8) { + _mm_storeu_si128(dst + 1, + _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16)); + if (width == 32) { + _mm_storeu_si128(dst + 2, + _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16)); + _mm_storeu_si128(dst + 3, + _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16)); + } + } + } + src += CFL_BUF_LINE_I128; + dst += CFL_BUF_LINE_I128; + } while (src < end); +} + +CFL_SUB_AVG_FN(sse2) diff --git a/libs/libaom/src/av1/common/x86/cfl_ssse3.c b/libs/libaom/src/av1/common/x86/cfl_ssse3.c new file mode 100644 index 000000000..476b6609a --- /dev/null +++ b/libs/libaom/src/av1/common/x86/cfl_ssse3.c @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tmmintrin.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/cfl.h" + +#include "av1/common/x86/cfl_simd.h" + +// Load 32-bit integer from memory into the first element of dst. +static INLINE __m128i _mm_loadh_epi32(__m128i const *mem_addr) { + return _mm_cvtsi32_si128(*((int *)mem_addr)); +} + +// Store 32-bit integer from the first element of a into memory. +static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) { + *((int *)mem_addr) = _mm_cvtsi128_si32(a); +} + +/** + * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more + * precise version of a box filter 4:2:0 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. 
+ */ +static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const __m128i twos = _mm_set1_epi8(2); + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + __m128i top = _mm_loadh_epi32((__m128i *)input); + top = _mm_maddubs_epi16(top, twos); + __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride)); + bot = _mm_maddubs_epi16(bot, twos); + const __m128i sum = _mm_add_epi16(top, bot); + _mm_storeh_epi32(pred_buf_m128i, sum); + } else if (width == 8) { + __m128i top = _mm_loadl_epi64((__m128i *)input); + top = _mm_maddubs_epi16(top, twos); + __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride)); + bot = _mm_maddubs_epi16(bot, twos); + const __m128i sum = _mm_add_epi16(top, bot); + _mm_storel_epi64(pred_buf_m128i, sum); + } else { + __m128i top = _mm_loadu_si128((__m128i *)input); + top = _mm_maddubs_epi16(top, twos); + __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride)); + bot = _mm_maddubs_epi16(bot, twos); + const __m128i sum = _mm_add_epi16(top, bot); + _mm_storeu_si128(pred_buf_m128i, sum); + if (width == 32) { + __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + __m128i bot_1 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1); + top_1 = _mm_maddubs_epi16(top_1, twos); + bot_1 = _mm_maddubs_epi16(bot_1, twos); + __m128i sum_1 = _mm_add_epi16(top_1, bot_1); + _mm_storeu_si128(pred_buf_m128i + 1, sum_1); + } + } + input += luma_stride; + pred_buf_m128i += CFL_BUF_LINE_I128; + } while (pred_buf_m128i < end); +} + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + */ +static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const __m128i fours = _mm_set1_epi8(4); + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; + do { + if (width == 4) { + __m128i top = _mm_loadh_epi32((__m128i *)input); + top = _mm_maddubs_epi16(top, fours); + _mm_storeh_epi32(pred_buf_m128i, top); + } else if (width == 8) { + __m128i top = _mm_loadl_epi64((__m128i *)input); + top = _mm_maddubs_epi16(top, fours); + _mm_storel_epi64(pred_buf_m128i, top); + } else { + __m128i top = _mm_loadu_si128((__m128i *)input); + top = _mm_maddubs_epi16(top, fours); + _mm_storeu_si128(pred_buf_m128i, top); + if (width == 32) { + __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + top_1 = _mm_maddubs_epi16(top_1, fours); + _mm_storeu_si128(pred_buf_m128i + 1, top_1); + } + } + input += input_stride; + pred_buf_m128i += CFL_BUF_LINE_I128; + } while (pred_buf_m128i < end); +} + +/** + * Multiplies the pixels by 8 (scaling in Q3). + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. 
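+ * + * For example, a single luma pixel a is stored as a << 3 == 8 * a, i.e. the + * value of a represented exactly in Q3. 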
+ */ +static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const __m128i zeros = _mm_setzero_si128(); + const int luma_stride = input_stride; + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; + do { + if (width == 4) { + __m128i row = _mm_loadh_epi32((__m128i *)input); + row = _mm_unpacklo_epi8(row, zeros); + _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3)); + } else if (width == 8) { + __m128i row = _mm_loadl_epi64((__m128i *)input); + row = _mm_unpacklo_epi8(row, zeros); + _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3)); + } else { + __m128i row = _mm_loadu_si128((__m128i *)input); + const __m128i row_lo = _mm_unpacklo_epi8(row, zeros); + const __m128i row_hi = _mm_unpackhi_epi8(row, zeros); + _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3)); + _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3)); + if (width == 32) { + __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1); + const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros); + const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros); + _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3)); + _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3)); + } + } + input += luma_stride; + pred_buf_m128i += CFL_BUF_LINE_I128; + } while (pred_buf_m128i < end); +} + +#if CONFIG_AV1_HIGHBITDEPTH +/** + * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more + * precise version of a box filter 4:2:0 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. 
+ */ +static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + const __m128i top = _mm_loadl_epi64((__m128i *)input); + const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride)); + __m128i sum = _mm_add_epi16(top, bot); + sum = _mm_hadd_epi16(sum, sum); + *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum)); + } else { + const __m128i top = _mm_loadu_si128((__m128i *)input); + const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride)); + __m128i sum = _mm_add_epi16(top, bot); + if (width == 8) { + sum = _mm_hadd_epi16(sum, sum); + _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum)); + } else { + const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + const __m128i bot_1 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1); + sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1)); + _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum)); + if (width == 32) { + const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2); + const __m128i bot_2 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2); + const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3); + const __m128i bot_3 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3); + const __m128i sum_2 = _mm_add_epi16(top_2, bot_2); + const __m128i sum_3 = _mm_add_epi16(top_3, bot_3); + __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, + _mm_add_epi16(next_sum, next_sum)); + } + } + } + input += luma_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. 
+ */ +static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; + do { + if (width == 4) { + const __m128i top = _mm_loadl_epi64((__m128i *)input); + const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2); + _mm_storeh_epi32(pred_buf_m128i, sum); + } else { + const __m128i top = _mm_loadu_si128((__m128i *)input); + if (width == 8) { + const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2); + _mm_storel_epi64(pred_buf_m128i, sum); + } else { + const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2); + _mm_storeu_si128(pred_buf_m128i, sum); + if (width == 32) { + const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2); + const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3); + const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2); + _mm_storeu_si128(pred_buf_m128i + 1, sum_1); + } + } + } + pred_buf_m128i += CFL_BUF_LINE_I128; + input += input_stride; + } while (pred_buf_m128i < end); +} + +static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const __m128i row = _mm_slli_epi16(_mm_loadl_epi64((__m128i *)input), 3); + _mm_storel_epi64((__m128i *)pred_buf_q3, row); + } else { + const __m128i row = _mm_slli_epi16(_mm_loadu_si128((__m128i *)input), 3); + _mm_storeu_si128((__m128i *)pred_buf_q3, row); + if (width >= 16) { + __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1); + row_1 = _mm_slli_epi16(row_1, 3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, row_1); + if (width == 32) { + __m128i row_2 = _mm_loadu_si128(((__m128i *)input) + 2); + row_2 = _mm_slli_epi16(row_2, 3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 2, row_2); + __m128i row_3 = _mm_loadu_si128(((__m128i *)input) + 3); + row_3 = _mm_slli_epi16(row_3, 3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 3, row_3); + } + } + } + input += input_stride; + pred_buf_q3 += CFL_BUF_LINE; + } while (pred_buf_q3 < end); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +CFL_GET_SUBSAMPLE_FUNCTION(ssse3) + +static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12, + __m128i alpha_sign, __m128i dc_q0) { + __m128i ac_q3 = _mm_loadu_si128(input); + __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); + __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); + scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); + return _mm_add_epi16(scaled_luma_q0, dc_q0); +} + +static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3, + uint8_t *dst, int dst_stride, + int alpha_q3, int width, int height) { + const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); + const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); + const __m128i dc_q0 = _mm_set1_epi16(*dst); + __m128i *row = (__m128i *)pred_buf_q3; + const __m128i *row_end = row + height * CFL_BUF_LINE_I128; + do { + __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + if (width < 16) { + res = _mm_packus_epi16(res, res); + if (width == 4) + _mm_storeh_epi32((__m128i *)dst, res); + else + _mm_storel_epi64((__m128i *)dst, res); + } else { + __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, 
dc_q0); + res = _mm_packus_epi16(res, next); + _mm_storeu_si128((__m128i *)dst, res); + if (width == 32) { + res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); + next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); + res = _mm_packus_epi16(res, next); + _mm_storeu_si128((__m128i *)(dst + 16), res); + } + } + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I128) < row_end); +} + +CFL_PREDICT_FN(ssse3, lbd) + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE __m128i highbd_max_epi16(int bd) { + const __m128i neg_one = _mm_set1_epi16(-1); + // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) + return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one); +} + +static INLINE __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) { + return _mm_max_epi16(_mm_min_epi16(u, max), zero); +} + +static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3, + uint16_t *dst, int dst_stride, + int alpha_q3, int bd, int width, + int height) { + const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); + const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); + const __m128i dc_q0 = _mm_set1_epi16(*dst); + const __m128i max = highbd_max_epi16(bd); + const __m128i zeros = _mm_setzero_si128(); + __m128i *row = (__m128i *)pred_buf_q3; + const __m128i *row_end = row + height * CFL_BUF_LINE_I128; + do { + __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + res = highbd_clamp_epi16(res, zeros, max); + if (width == 4) { + _mm_storel_epi64((__m128i *)dst, res); + } else { + _mm_storeu_si128((__m128i *)dst, res); + } + if (width >= 16) { + const __m128i res_1 = + predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); + _mm_storeu_si128(((__m128i *)dst) + 1, + highbd_clamp_epi16(res_1, zeros, max)); + } + if (width == 32) { + const __m128i res_2 = + predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); + _mm_storeu_si128((__m128i *)(dst + 16), + highbd_clamp_epi16(res_2, zeros, max)); + const __m128i res_3 = + predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); + _mm_storeu_si128((__m128i *)(dst + 24), + highbd_clamp_epi16(res_3, zeros, max)); + } + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I128) < row_end); +} + +CFL_PREDICT_FN(ssse3, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/av1/common/x86/convolve_2d_avx2.c b/libs/libaom/src/av1/common/x86/convolve_2d_avx2.c new file mode 100644 index 000000000..e19575d72 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/convolve_2d_avx2.c @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/convolve.h" + +void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int bd = 8; + int im_stride = 8; + int i, is_horiz_4tap = 0, is_vert_4tap = 0; + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + + assert(conv_params->round_0 > 0); + + const __m256i round_const_h = _mm256_set1_epi16( + ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); + const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); + + const __m256i sum_round_v = _mm256_set1_epi32( + (1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); + const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + const __m256i round_const_v = _mm256_set1_epi32( + ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - + ((1 << (offset_bits - conv_params->round_1)) >> 1)); + const __m128i round_shift_v = _mm_cvtsi32_si128(bits); + + __m256i filt[4], coeffs_h[4], coeffs_v[4]; + + filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_h[0], coeffs_h[3]), 0))) + is_horiz_4tap = 1; + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_v[0], coeffs_v[3]), 0))) + is_vert_4tap = 1; + + // horz_filt as 4 tap and vert_filt as 8 tap + if (is_horiz_4tap) { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // horz-filter + for (int j = 0; j < w; j += 8) { + for (i = 0; i < (im_h - 2); i += 2) { + __m256i data = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + + // Load the next line + data = _mm256_inserti128_si256( + data, + _mm_loadu_si128( + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), + 1); + __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt); + + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), + round_shift_h); + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + + __m256i data_1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + + __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt); + res = + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + + // vert filter + CONVOLVE_SR_VERTICAL_FILTER_8TAP; + } + } else if (is_vert_4tap) { + int im_h = h + 3; + const int fo_vert = 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * 
src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + // horz_filter + CONVOLVE_SR_HORIZONTAL_FILTER_8TAP; + // vert_filter + __m256i s[6]; + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + + s[0] = _mm256_unpacklo_epi16(src_0, src_1); + s[1] = _mm256_unpacklo_epi16(src_2, src_3); + s[3] = _mm256_unpackhi_epi16(src_0, src_1); + s[4] = _mm256_unpackhi_epi16(src_2, src_3); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s4 = + _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); + const __m256i s5 = + _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); + + s[2] = _mm256_unpacklo_epi16(s4, s5); + s[5] = _mm256_unpackhi_epi16(s4, s5); + + __m256i res_a = convolve_4tap(s, coeffs_v + 1); + __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1); + + // Combine V round and 2F-H-V round into a single rounding + res_a = + _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); + res_b = + _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); + + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_v), round_shift_v); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_v), round_shift_v); + + /* rounding code */ + // 16 bit conversion + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + // 8 bit conversion and saturation to uint8 + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + + // Store values into the destination buffer + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; + if (w - j > 4) { + _mm_storel_epi64(p_0, res_0); + _mm_storel_epi64(p_1, res_1); + } else if (w == 4) { + xx_storel_32(p_0, res_0); + xx_storel_32(p_1, res_1); + } else { + *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + } + + s[0] = s[1]; + s[1] = s[2]; + s[3] = s[4]; + s[4] = s[5]; + } + } + } else { + int j; + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (j = 0; j < w; j += 8) { + CONVOLVE_SR_HORIZONTAL_FILTER_8TAP; + + CONVOLVE_SR_VERTICAL_FILTER_8TAP; + } + } +} + +static INLINE void copy_128(const uint8_t *src, uint8_t *dst) { + __m256i s[4]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); + s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32)); + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]); + _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]); + 
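For reference, every width-specialized branch of the copy kernels in this file reduces to the same scalar loop: two rows per iteration (h is always even here), each row moved verbatim. A minimal sketch under those assumptions -- the name convolve_2d_copy_ref is hypothetical, not a libaom symbol:

#include <stdint.h>
#include <string.h>

// Scalar behavior of the copy kernels: the "convolve" is a pure copy.
static void convolve_2d_copy_ref(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int w, int h) {
  do {
    memcpy(dst, src, w * sizeof(*src));  // first row of the pair
    src += src_stride;
    dst += dst_stride;
    memcpy(dst, src, w * sizeof(*src));  // second row of the pair
    src += src_stride;
    dst += dst_stride;
    h -= 2;
  } while (h);
}

The SIMD branches exist only to replace memcpy with the widest load/store pairs available for each block width.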
_mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]); +} + +void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + (void)conv_params; + + if (w >= 16) { + assert(!((intptr_t)dst % 16)); + assert(!(dst_stride % 16)); + } + + if (w == 2) { + do { + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 4) { + do { + memmove(dst, src, 4 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memmove(dst, src, 4 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 8) { + do { + __m128i s[2]; + s[0] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + s[1] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + _mm_storel_epi64((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 16) { + do { + __m128i s[2]; + s[0] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + s[1] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + _mm_store_si128((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_store_si128((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 32) { + do { + __m256i s[2]; + s[0] = _mm256_loadu_si256((__m256i *)src); + src += src_stride; + s[1] = _mm256_loadu_si256((__m256i *)src); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, s[0]); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 64) { + do { + __m256i s[4]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); + src += src_stride; + s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); + src += src_stride; + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]); + dst += dst_stride; + h -= 2; + } while (h); + } else { + do { + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } +} diff --git a/libs/libaom/src/av1/common/x86/convolve_2d_sse2.c b/libs/libaom/src/av1/common/x86/convolve_2d_sse2.c new file mode 100644 index 000000000..5376ea79b --- /dev/null +++ b/libs/libaom/src/av1/common/x86/convolve_2d_sse2.c @@ -0,0 +1,471 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" +#include "av1/common/convolve.h" + +void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + + assert(conv_params->round_0 > 0); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1)); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + // Filter even-index pixels + const __m128i src_0 = _mm_unpacklo_epi8(data, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, 
res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i sum_round = + _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); + const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - + ((1 << (offset_bits - conv_params->round_1)) >> 1)); + const __m128i round_shift = _mm_cvtsi32_si128(bits); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift); + __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift); + + res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), + round_shift); + res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), + round_shift); + + const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res = _mm_packus_epi16(res16, res16); + + // Accumulate values into the destination buffer + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + + if (w == 2) { + *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res); + } else if (w == 4) { + *(uint32_t *)p = _mm_cvtsi128_si32(res); + } else { + _mm_storel_epi64(p, res); + } + } + } + } +} + +static INLINE void copy_128(const uint8_t *src, uint8_t *dst) { + __m128i s[8]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); + s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16)); + s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16)); + s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16)); + s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16)); + _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]); + _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]); + _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]); + _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]); + _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]); +} + +void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + (void)conv_params; + + if (w >= 16) { + assert(!((intptr_t)dst % 16)); + assert(!(dst_stride % 16)); + } + + if (w == 2) { + do { + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 4) { + do { + memmove(dst, src, 4 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memmove(dst, src, 4 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 8) { + do { + __m128i s[2]; + s[0] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + s[1] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + _mm_storel_epi64((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 16) { + do { + __m128i s[2]; + s[0] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + s[1] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + _mm_store_si128((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_store_si128((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 32) { + do { + __m128i s[4]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 
16)); + src += src_stride; + s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + src += src_stride; + _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); + dst += dst_stride; + _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 64) { + do { + __m128i s[8]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); + src += src_stride; + s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); + s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); + src += src_stride; + _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]); + dst += dst_stride; + _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]); + _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]); + _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]); + dst += dst_stride; + h -= 2; + } while (h); + } else { + do { + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } +} + +void av1_dist_wtd_convolve_2d_copy_sse2( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { + const int bd = 8; + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const __m128i zero = _mm_setzero_si128(); + const __m128i left_shift = _mm_cvtsi32_si128(bits); + int i, j; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + + assert((w % 4) == 0); + + if (!(w % 16)) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]); + + const __m128i d16_lo = _mm_unpacklo_epi8(d8, zero); + const __m128i d16_hi = _mm_unpackhi_epi8(d8, zero); + + const __m128i res_lo = _mm_sll_epi16(d16_lo, left_shift); + const __m128i res_unsigned_lo = _mm_add_epi16(res_lo, offset_const); + + const __m128i res_hi = _mm_sll_epi16(d16_hi, left_shift); + const __m128i res_unsigned_hi = _mm_add_epi16(res_hi, 
offset_const); + + if (do_average) { + const __m128i data_ref_0_lo = _mm_loadu_si128((__m128i *)(&dst[j])); + const __m128i data_ref_0_hi = + _mm_loadu_si128((__m128i *)(&dst[j + 8])); + + const __m128i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_unsigned_lo, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result_lo = convolve_rounding( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + + const __m128i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_unsigned_hi, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result_hi = convolve_rounding( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = + _mm_packus_epi16(round_result_lo, round_result_hi); + + _mm_store_si128((__m128i *)(&dst0[j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo); + _mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi); + } + } + src += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]); + const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero); + + const __m128i res = _mm_sll_epi16(d16_0, left_shift); + const __m128i res_unsigned = _mm_add_epi16(res, offset_const); + + if (do_average) { + const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + + if (w > 4) + _mm_storel_epi64((__m128i *)(&dst0[j]), res_8); + else + *(uint32_t *)(&dst0[j]) = _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[j]), res_unsigned); + } + } + src += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + } + } +} diff --git a/libs/libaom/src/av1/common/x86/convolve_avx2.c b/libs/libaom/src/av1/common/x86/convolve_avx2.c new file mode 100644 index 000000000..1d5bc6fbd --- /dev/null +++ b/libs/libaom/src/av1/common/x86/convolve_avx2.c @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" + +void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + int i, j, is_vert_4tap = 0; + // right shift is F-1 because we are already dividing + // filter coefficients by 2 + const int right_shift_bits = (FILTER_BITS - 1); + const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits); + const __m256i right_shift_const = + _mm256_set1_epi16((1 << right_shift_bits) >> 1); + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + __m256i coeffs[4], s[8]; + __m128i d[6]; + + prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_vert_4tap = 1; + + // vert_filt as 4 tap + if (is_vert_4tap) { + const int fo_vert = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + + // Load lines a and b. 
Line a to lower 128, line b to upper 128 + const __m256i src_01a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); + + const __m256i src_12a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); + + const __m256i src_23a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); + + const __m256i src_34a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); + + s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); + s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); + + s[3] = _mm256_unpackhi_epi8(src_01a, src_12a); + s[4] = _mm256_unpackhi_epi8(src_23a, src_34a); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + const __m256i src_45a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); + + d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); + const __m256i src_56a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20); + + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); + + const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_lo = _mm256_sra_epi16( + _mm256_add_epi16(res_lo, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + + if (w - j > 8) { + const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_hi = _mm256_sra_epi16( + _mm256_add_epi16(res_hi, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); + + __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); + + const __m128i res_0 = _mm256_castsi256_si128(res_a); + const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else if (w - j > 2) { + xx_storel_32(&dst[i * dst_stride + j], res_0); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&dst[i * dst_stride + j + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + + s[3] = s[4]; + s[4] = s[5]; + } + } + } else { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src6; + + d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + d[4] = _mm_loadu_si128((__m128i 
*)(data + 4 * src_stride)); + d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + // Load lines a and b. Line a to lower 128, line b to upper 128 + const __m256i src_01a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); + + const __m256i src_12a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); + + const __m256i src_23a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); + + const __m256i src_34a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); + + const __m256i src_45a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + const __m256i src_56a = + _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20); + + s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); + s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + + s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); + s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); + s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + const __m256i src_67a = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); + s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + + const __m256i res_lo = convolve_lowbd(s, coeffs); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_lo = _mm256_sra_epi16( + _mm256_add_epi16(res_lo, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + + if (w - j > 8) { + const __m256i res_hi = convolve_lowbd(s + 4, coeffs); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_hi = _mm256_sra_epi16( + _mm256_add_epi16(res_hi, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); + + __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); + + const __m128i res_0 = _mm256_castsi256_si128(res_a); + const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else if (w - j > 2) { + xx_storel_32(&dst[i * dst_stride + j], res_0); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&dst[i * dst_stride + j + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = 
s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int bits = FILTER_BITS - conv_params->round_0; + + const __m256i round_0_const = + _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); + const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); + const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(bits); + int i, is_horiz_4tap = 0; + (void)filter_params_y; + (void)subpel_y_qn; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + assert(conv_params->round_0 > 0); + + __m256i coeffs[4], filt[4]; + filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_horiz_4tap = 1; + + // horz_filt as 4 tap + if (is_horiz_4tap) { + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_horiz; + if (w <= 8) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); + + __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 + // 18 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination 
buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } + } + } + } else { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + if (w <= 8) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); + + __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 + // 18 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/convolve_sse2.c b/libs/libaom/src/av1/common/x86/convolve_sse2.c new file mode 100644 index 000000000..4323ac4d1 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/convolve_sse2.c @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "av1/common/convolve.h" + +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m128i *const coeffs /* [4] */) { + const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 0 1 0 1 0 1 0 1 + coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 +} + +static INLINE __m128i convolve(const __m128i *const s, + const __m128i *const coeffs) { + const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]); + const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]); + const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]); + const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]); + const __m128i d = _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3)); + return d; +} + +static INLINE __m128i convolve_lo_x(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_lo_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_hi_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_vert * src_stride; + const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS); + __m128i coeffs[4]; + + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs); + + if (w <= 4) { + __m128i s[8], src6, res, res_round, res16; + uint32_t res_int; + src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * 
src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6); + + do { + s[6] = _mm_unpacklo_epi8( + src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride))); + src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6); + + res = convolve_lo_y(s + 0, coeffs); + res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift); + res16 = _mm_packs_epi32(res_round, res_round); + res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); + + if (w == 2) + *(uint16_t *)dst = (uint16_t)res_int; + else + *(uint32_t *)dst = res_int; + + src_ptr += src_stride; + dst += dst_stride; + + res = convolve_lo_y(s + 1, coeffs); + res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift); + res16 = _mm_packs_epi32(res_round, res_round); + res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); + + if (w == 2) + *(uint16_t *)dst = (uint16_t)res_int; + else + *(uint32_t *)dst = res_int; + + src_ptr += src_stride; + dst += dst_stride; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + h -= 2; + } while (h); + } else { + assert(!(w % 8)); + int j = 0; + do { + __m128i s[8], src6, res_lo, res_hi; + __m128i res_lo_round, res_hi_round, res16, res; + const uint8_t *data = &src_ptr[j]; + + src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6); + + int i = 0; + do { + data = &src_ptr[i * src_stride + j]; + s[6] = _mm_unpacklo_epi8( + src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); + src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6); + + res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels + + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + 
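The two statements around this point implement the standard single-stage convolve rounding: add half of 1 << FILTER_BITS, arithmetic-shift right, then saturate while packing down to bytes. A scalar sketch of one output pixel of this vertical pass, assuming libaom's FILTER_BITS value of 7 and a hypothetical helper name:

#include <stdint.h>

#define REF_FILTER_BITS 7  // assumed: libaom's FILTER_BITS

static uint8_t convolve_y_ref_pixel(const uint8_t *src, int src_stride,
                                    const int16_t filter[8]) {
  int32_t sum = 0;
  // 8-tap dot product down a column, as convolve_lo_y/convolve_hi_y compute.
  for (int k = 0; k < 8; ++k) sum += filter[k] * src[k * src_stride];
  // Round by (1 << FILTER_BITS) >> 1, shift, then clamp to [0, 255] --
  // the job done here by _mm_packs_epi32 followed by _mm_packus_epi16.
  sum = (sum + ((1 << REF_FILTER_BITS) >> 1)) >> REF_FILTER_BITS;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}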
res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + i++; + + res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels + + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + i++; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + } while (i < h); + j += 8; + } while (j < w); + } +} + +void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i round_0_const = + _mm_set1_epi32((1 << conv_params->round_0) >> 1); + const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1); + const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift = _mm_cvtsi32_si128(bits); + __m128i coeffs[4]; + + (void)filter_params_y; + (void)subpel_y_qn; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs); + + if (w <= 4) { + do { + const __m128i data = _mm_loadu_si128((__m128i *)src_ptr); + __m128i s[4]; + + s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1)); + s[1] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3)); + s[2] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5)); + s[3] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7)); + const __m128i res_lo = convolve_lo_x(s, coeffs); + __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift); + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift); + + const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round); + const __m128i res = _mm_packus_epi16(res16, res16); + + uint32_t r = _mm_cvtsi128_si32(res); + if (w == 2) + *(uint16_t *)dst = (uint16_t)r; + else + *(uint32_t *)dst = r; + + src_ptr += src_stride; + dst += dst_stride; + } while (--h); + } else { + assert(!(w % 8)); + int i = 0; + do { + int j = 0; + do { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + __m128i s[4]; + + // Filter even-index pixels + s[0] = data; + s[1] = _mm_srli_si128(data, 2); + s[2] = _mm_srli_si128(data, 4); + s[3] = _mm_srli_si128(data, 6); + const __m128i res_even = convolve_lo_x(s, coeffs); + + // Filter odd-index pixels + s[0] = _mm_srli_si128(data, 1); + s[1] = _mm_srli_si128(data, 3); + s[2] = _mm_srli_si128(data, 5); + s[3] = _mm_srli_si128(data, 7); + const __m128i res_odd = convolve_lo_x(s, coeffs); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift); + res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), + round_shift); + __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_0_const), round_0_shift); + res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), + round_shift); + + const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + j += 8; + } while (j < w); + } while (++i < h); + } +} diff --git a/libs/libaom/src/av1/common/x86/filterintra_sse4.c b/libs/libaom/src/av1/common/x86/filterintra_sse4.c new file mode 100644 index 000000000..99f4d9967 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/filterintra_sse4.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <smmintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/enums.h" +#include "av1/common/reconintra.h" + +void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, const uint8_t *above, + const uint8_t *left, int mode) { + int r, c; + uint8_t buffer[33][33]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + + assert(bw <= 32 && bh <= 32); + + for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; + memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t)); + + const __m128i f1f0 = xx_load_128(av1_filter_intra_taps[mode][0]); + const __m128i f3f2 = xx_load_128(av1_filter_intra_taps[mode][2]); + const __m128i f5f4 = xx_load_128(av1_filter_intra_taps[mode][4]); + const __m128i f7f6 = xx_load_128(av1_filter_intra_taps[mode][6]); + const __m128i filter_intra_scale_bits = + _mm_set1_epi16(1 << (15 - FILTER_INTRA_SCALE_BITS)); + + for (r = 1; r < bh + 1; r += 2) { + for (c = 1; c < bw + 1; c += 4) { + DECLARE_ALIGNED(16, uint8_t, p[8]); + memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t)); + p[5] = buffer[r][c - 1]; + p[6] = buffer[r + 1][c - 1]; + p[7] = 0; + const __m128i p_b = xx_loadl_64(p); + const __m128i in = _mm_unpacklo_epi64(p_b, p_b); + const __m128i out_01 = _mm_maddubs_epi16(in, f1f0); + const __m128i out_23 = _mm_maddubs_epi16(in, f3f2); + const __m128i out_45 = _mm_maddubs_epi16(in, f5f4); + const __m128i out_67 = _mm_maddubs_epi16(in, f7f6); + const __m128i out_0123 = _mm_hadd_epi16(out_01, out_23); + const __m128i out_4567 = _mm_hadd_epi16(out_45, out_67); + const __m128i out_01234567 = _mm_hadd_epi16(out_0123, out_4567); + // Rounding + const __m128i round_w = + _mm_mulhrs_epi16(out_01234567, filter_intra_scale_bits); + const __m128i out_r = _mm_packus_epi16(round_w, round_w); + const __m128i out_r1 = _mm_srli_si128(out_r, 4); + // Storing + xx_storel_32(&buffer[r][c], out_r); + xx_storel_32(&buffer[r + 1][c], out_r1); + } + } + + for (r = 0; r < bh; ++r) { + 
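For the patch computation in the loops above: each 4x2 output block is a signed 7-tap combination of its neighbors p[0..6] (top-left, four above, two to the left), rounded by FILTER_INTRA_SCALE_BITS and clamped to 8 bits. A scalar sketch under those assumptions -- the helper name is hypothetical, and taps stands in for one av1_filter_intra_taps[mode] table (8 outputs x 8 taps; the 8th tap is ignored because p[7] is zeroed above):

#include <stdint.h>

#define REF_FI_SCALE_BITS 4  // assumed: libaom's FILTER_INTRA_SCALE_BITS

static void filter_intra_patch_ref(const uint8_t p[7], const int8_t taps[8][8],
                                   uint8_t out[8] /* 4x2 patch, row-major */) {
  for (int k = 0; k < 8; ++k) {
    int32_t sum = 0;
    // Signed 7-tap combination of the neighbors, one row of taps per output.
    for (int i = 0; i < 7; ++i) sum += taps[k][i] * p[i];
    // Round-to-nearest shift, as _mm_mulhrs_epi16 does above, then clamp.
    const int32_t v =
        (sum + (1 << (REF_FI_SCALE_BITS - 1))) >> REF_FI_SCALE_BITS;
    out[k] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}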
memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t)); + dst += stride; + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c b/libs/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c new file mode 100644 index 000000000..396aed01b --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" + +void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m256i s[8], coeffs_y[4], coeffs_x[4]; + + const __m256i round_const_x = _mm256_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const __m256i round_const_y = _mm256_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = _mm256_set1_epi16(0); + if (i + 1 < im_h) + row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); + __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); + __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); + + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + } + + /* Vertical filter */ + { + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); + __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + s[0] = _mm256_unpacklo_epi16(s0, s1); + s[1] = _mm256_unpacklo_epi16(s2, s3); + s[2] = _mm256_unpacklo_epi16(s4, s5); + + s[4] = _mm256_unpackhi_epi16(s0, s1); + s[5] = _mm256_unpackhi_epi16(s2, s3); + s[6] = _mm256_unpackhi_epi16(s4, s5); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + const __m256i res_a = convolve(s, coeffs_y); + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_y), round_shift_y); + + res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m256i res_b = convolve(s + 4, coeffs_y); + __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_y), round_shift_y); + res_b_round = + _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits), + round_shift_bits); + + __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); + res_16bit = _mm256_max_epi16(res_16bit, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_16bit)); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_16bit, 1)); + } else if (w == 4) { + res_a_round = _mm256_packs_epi32(res_a_round, 
res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } else { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + xx_storel_32((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +static INLINE void copy_64(const uint16_t *src, uint16_t *dst) { + __m256i s[4]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); + s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16)); + _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); + _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]); +} + +static INLINE void copy_128(const uint16_t *src, uint16_t *dst) { + __m256i s[8]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); + s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16)); + s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16)); + s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16)); + s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16)); + s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16)); + + _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); + _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]); + _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]); + _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]); + _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]); + _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]); +} + +void av1_highbd_convolve_2d_copy_sr_avx2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + (void)conv_params; + (void)bd; + + if (w >= 16) { + assert(!((intptr_t)dst % 16)); + assert(!(dst_stride % 16)); + } + + if (w == 2) { + do { + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 4) { + do { + __m128i s[2]; + s[0] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + s[1] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + _mm_storel_epi64((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 8) { + do { + __m128i s[2]; + s[0] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + s[1] = 
_mm_loadu_si128((__m128i *)src); + src += src_stride; + _mm_store_si128((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_store_si128((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 16) { + do { + __m256i s[2]; + s[0] = _mm256_loadu_si256((__m256i *)src); + src += src_stride; + s[1] = _mm256_loadu_si256((__m256i *)src); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, s[0]); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 32) { + do { + __m256i s[4]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); + src += src_stride; + s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); + src += src_stride; + _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 64) { + do { + copy_64(src, dst); + src += src_stride; + dst += dst_stride; + copy_64(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else { + do { + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c b/libs/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c new file mode 100644 index 000000000..f758775ee --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include <emmintrin.h> +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_filter.h" + +static INLINE void copy_64(const uint16_t *src, uint16_t *dst) { + __m128i s[8]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); + s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8)); + s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8)); + s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8)); + s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8)); + _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); + _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]); + _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]); + _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]); + _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]); +} + +static INLINE void copy_128(const uint16_t *src, uint16_t *dst) { + __m128i s[16]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); + s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8)); + s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8)); + s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8)); + s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8)); + s[8] = _mm_loadu_si128((__m128i *)(src + 8 * 8)); + s[9] = _mm_loadu_si128((__m128i *)(src + 9 * 8)); + s[10] = _mm_loadu_si128((__m128i *)(src + 10 * 8)); + s[11] = _mm_loadu_si128((__m128i *)(src + 11 * 8)); + s[12] = _mm_loadu_si128((__m128i *)(src + 12 * 8)); + s[13] = _mm_loadu_si128((__m128i *)(src + 13 * 8)); + s[14] = _mm_loadu_si128((__m128i *)(src + 14 * 8)); + s[15] = _mm_loadu_si128((__m128i *)(src + 15 * 8)); + _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); + _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]); + _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]); + _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]); + _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]); + _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]); + _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]); + _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]); + _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]); + _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]); + _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]); + _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]); + _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]); +} + +void av1_highbd_convolve_2d_copy_sr_sse2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + (void)conv_params; + (void)bd; + if (w >= 16) { + assert(!((intptr_t)dst % 16)); + assert(!(dst_stride % 16)); + } + + if (w == 2) { + do { + __m128i s = _mm_loadl_epi64((__m128i *)src); + *(uint32_t *)dst = _mm_cvtsi128_si32(s); + src += src_stride; + dst += dst_stride; + s = _mm_loadl_epi64((__m128i *)src); + *(uint32_t *)dst = _mm_cvtsi128_si32(s); + src += src_stride; + dst +=
dst_stride; + h -= 2; + } while (h); + } else if (w == 4) { + do { + __m128i s[2]; + s[0] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + s[1] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + _mm_storel_epi64((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 8) { + do { + __m128i s[2]; + s[0] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + s[1] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + _mm_store_si128((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_store_si128((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 16) { + do { + __m128i s[4]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + src += src_stride; + s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + src += src_stride; + _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); + dst += dst_stride; + _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 32) { + do { + __m128i s[8]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); + src += src_stride; + s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); + src += src_stride; + _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); + dst += dst_stride; + _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]); + _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]); + _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 64) { + do { + copy_64(src, dst); + src += src_stride; + dst += dst_stride; + copy_64(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else { + do { + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c b/libs/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c new file mode 100644 index 000000000..d2ff47c1f --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c @@ -0,0 +1,425 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> +#include <smmintrin.h> +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" +#include "aom_dsp/x86/convolve_sse4_1.h" +#include "av1/common/convolve.h" + +void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + const __m128i zero = _mm_setzero_si128(); + int i, j; + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi32(offset); + const __m128i offset_const_16b = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); + const __m128i clip_pixel_to_bd = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + + assert(bits <= 4); + + if (!(w % 8)) { + for (i = 0; i < h; i += 1) { + for (j = 0; j < w; j += 8) { + const __m128i src_16bit = + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])); + const __m128i res = _mm_sll_epi16(src_16bit, left_shift); + if (do_average) { + const __m128i data_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero); + + const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero); + const __m128i res_unsigned_lo = + _mm_add_epi32(res_32b_lo, offset_const); + + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero); + const __m128i res_unsigned_hi = + _mm_add_epi32(res_32b_hi, offset_const); + + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_lo = highbd_convolve_rounding_sse2( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m128i round_result_hi = highbd_convolve_rounding_sse2( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m128i res_unsigned_16b = + _mm_adds_epu16(res, offset_const_16b); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), + res_unsigned_16b); + } + } + } + } else if (!(w % 4)) { + for (i = 0; i < h; i += 2) { + for (j = 0; j
< w; j += 4) { + const __m128i src_row_0 = + _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j])); + const __m128i src_row_1 = + _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride])); + const __m128i src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1); + + const __m128i res = _mm_sll_epi16(src_10, left_shift); + + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_1 = _mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride])); + + const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero); + + const __m128i res_32b = _mm_unpacklo_epi16(res, zero); + const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const); + + const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero); + const __m128i res_unsigned_hi = + _mm_add_epi32(res_32b_hi, offset_const); + + const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( + &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_lo = highbd_convolve_rounding_sse2( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m128i round_result_hi = highbd_convolve_rounding_sse2( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_1 = _mm_srli_si128(res_clip, 8); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + const __m128i res_unsigned_16b = + _mm_adds_epu16(res, offset_const_16b); + + const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), + res_unsigned_16b); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } +} + +void av1_highbd_dist_wtd_convolve_2d_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi32((1 
<< rounding_shift) >> 1); + const __m128i clip_pixel_to_bd = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i data2 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(data, coeff_01); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << 
conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + + const __m128i res_unsigned_lo = + _mm_add_epi32(res_lo_round, offset_const); + + if (w < 8) { + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0); + + const __m128i comp_avg_res = + highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result = highbd_convolve_rounding_sse2( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result, round_result); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m128i res_16b = + _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_16b); + } + } else { + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + const __m128i res_unsigned_hi = + _mm_add_epi32(res_hi_round, offset_const); + + if (do_average) { + const __m128i data_lo = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_hi = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j + 4])); + + const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo); + const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi); + + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_lo = + highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_hi = + highbd_convolve_rounding_sse2(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m128i res_16b = + _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b); + } + } + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c b/libs/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c new file mode 100644 index 000000000..5318fcaa8 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" +#include "av1/common/convolve.h" + +void av1_highbd_convolve_2d_sr_ssse3( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + __m128i coeffs_x[4], coeffs_y[4], s[16]; + + const __m128i round_const_x = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const __m128i round_const_y = + _mm_set1_epi32(((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ?
4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + + __m128i res_even = convolve(s, coeffs_x); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + + __m128i res_odd = convolve(s, coeffs_x); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); + + __m128i res_even1 = _mm_packs_epi32(res_even, res_even); + __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); + __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); + + _mm_store_si128((__m128i *)&im_block[i * im_stride], res); + } + } + /* Vertical filter */ + { + __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride)); + __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride)); + __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride)); + __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride)); + __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride)); + __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride)); + __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride)); + + s[0] = _mm_unpacklo_epi16(s0, s1); + s[1] = _mm_unpacklo_epi16(s2, s3); + s[2] = _mm_unpacklo_epi16(s4, s5); + + s[4] = _mm_unpackhi_epi16(s0, s1); + s[5] = _mm_unpackhi_epi16(s2, s3); + s[6] = _mm_unpackhi_epi16(s4, s5); + + s[0 + 8] = _mm_unpacklo_epi16(s1, s2); + s[1 + 8] = _mm_unpacklo_epi16(s3, s4); + s[2 + 8] = _mm_unpacklo_epi16(s5, s6); + + s[4 + 8] = _mm_unpackhi_epi16(s1, s2); + s[5 + 8] = _mm_unpackhi_epi16(s3, s4); + s[6 + 8] = _mm_unpackhi_epi16(s5, s6); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * im_stride)); + __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * im_stride)); + + s[3] = _mm_unpacklo_epi16(s6, s7); + s[7] = _mm_unpackhi_epi16(s6, s7); + + s[3 + 8] = _mm_unpacklo_epi16(s7, s8); + s[7 + 8] = _mm_unpackhi_epi16(s7, s8); + + const __m128i res_a0 = convolve(s, coeffs_y); + __m128i res_a_round0 = + _mm_sra_epi32(_mm_add_epi32(res_a0, round_const_y), round_shift_y); + res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits); + + const __m128i res_a1 = convolve(s + 8, coeffs_y); + __m128i res_a_round1 = + _mm_sra_epi32(_mm_add_epi32(res_a1, round_const_y), round_shift_y); + res_a_round1 = _mm_sra_epi32( + _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m128i res_b0 = convolve(s + 4, coeffs_y); + __m128i res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b0, round_const_y), round_shift_y); + res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b_round0, round_const_bits), round_shift_bits); + + const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); + __m128i 
res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b1, round_const_y), round_shift_y); + res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b_round1, round_const_bits), round_shift_bits); + + __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); + res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); + res_16bit0 = _mm_max_epi16(res_16bit0, zero); + + __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); + res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); + res_16bit1 = _mm_max_epi16(res_16bit1, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16bit1); + } else if (w == 4) { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_a_round1); + } else { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + *((uint32_t *)(&dst[i * dst_stride + j])) = + _mm_cvtsi128_si32(res_a_round0); + + *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) = + _mm_cvtsi128_si32(res_a_round1); + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + + s[0 + 8] = s[1 + 8]; + s[1 + 8] = s[2 + 8]; + s[2 + 8] = s[3 + 8]; + + s[4 + 8] = s[5 + 8]; + s[5 + 8] = s[6 + 8]; + s[6 + 8] = s[7 + 8]; + + s6 = s8; + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c b/libs/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c new file mode 100644 index 000000000..93e98e4b3 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c @@ -0,0 +1,4246 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <immintrin.h> +#include <assert.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/idct.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +// Note: +// 32x4 = 128 registers in total represent the 32x32 block of coefficients. +// For high bit depth, each coefficient is 4 bytes. +// Each __m256i register holds 8 coefficients. +// So each "row" needs 4 registers, and there are 32 rows in total. +// Register layout: +// v0, v1, v2, v3, +// v4, v5, v6, v7, +// ... ...
+// v124, v125, v126, v127 + +static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) { + const __m256i zero = _mm256_setzero_si256(); + const __m256i one = _mm256_set1_epi16(1); + const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one); + __m256i clamped, mask; + + mask = _mm256_cmpgt_epi16(u, max); + clamped = _mm256_andnot_si256(mask, u); + mask = _mm256_and_si256(mask, max); + clamped = _mm256_or_si256(mask, clamped); + mask = _mm256_cmpgt_epi16(clamped, zero); + clamped = _mm256_and_si256(clamped, mask); + + return clamped; +} + +static INLINE void round_shift_4x4_avx2(__m256i *in, int shift) { + if (shift != 0) { + __m256i rnding = _mm256_set1_epi32(1 << (shift - 1)); + in[0] = _mm256_add_epi32(in[0], rnding); + in[1] = _mm256_add_epi32(in[1], rnding); + in[2] = _mm256_add_epi32(in[2], rnding); + in[3] = _mm256_add_epi32(in[3], rnding); + + in[0] = _mm256_srai_epi32(in[0], shift); + in[1] = _mm256_srai_epi32(in[1], shift); + in[2] = _mm256_srai_epi32(in[2], shift); + in[3] = _mm256_srai_epi32(in[3], shift); + } +} + +static INLINE void round_shift_8x8_avx2(__m256i *in, int shift) { + round_shift_4x4_avx2(in, shift); + round_shift_4x4_avx2(in + 4, shift); + round_shift_4x4_avx2(in + 8, shift); + round_shift_4x4_avx2(in + 12, shift); +} + +static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out, + const __m256i *clamp_lo, + const __m256i *clamp_hi, int size) { + __m256i a0, a1; + for (int i = 0; i < size; i += 4) { + a0 = _mm256_max_epi32(in[i], *clamp_lo); + out[i] = _mm256_min_epi32(a0, *clamp_hi); + + a1 = _mm256_max_epi32(in[i + 1], *clamp_lo); + out[i + 1] = _mm256_min_epi32(a1, *clamp_hi); + + a0 = _mm256_max_epi32(in[i + 2], *clamp_lo); + out[i + 2] = _mm256_min_epi32(a0, *clamp_hi); + + a1 = _mm256_max_epi32(in[i + 3], *clamp_lo); + out[i + 3] = _mm256_min_epi32(a1, *clamp_hi); + } +} + +static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred, + __m256i res0, __m256i res1, + const int bd) { + __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred)); + __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1)); + + x0 = _mm256_add_epi32(res0, x0); + x1 = _mm256_add_epi32(res1, x1); + x0 = _mm256_packus_epi32(x0, x1); + x0 = _mm256_permute4x64_epi64(x0, 0xd8); + x0 = highbd_clamp_epi16_avx2(x0, bd); + return x0; +} + +static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride)); + __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd); + + _mm256_storeu_si256((__m256i *)(output + i * stride), u); + } +} +static INLINE __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res, + const int bd) { + __m256i x0 = pred; + x0 = _mm256_add_epi32(res, x0); + x0 = _mm256_packus_epi32(x0, x0); + x0 = _mm256_permute4x64_epi64(x0, 0xd8); + x0 = highbd_clamp_epi16_avx2(x0, bd); + return x0; +} + +static INLINE void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + __m128i temp; + const int step = flipud ? 
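/* editor's note: with flipud set, j starts at height - 1 and step is -1, so the transform output is consumed bottom-up and vertically flipped transforms reuse the same writeback loop. */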
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + temp = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m256i v = _mm256_cvtepi16_epi32(temp); + __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd); + __m128i u1 = _mm256_castsi256_si128(u); + _mm_storeu_si128((__m128i *)(output + i * stride), u1); + } +} +static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0, + __m256i *out1, const __m256i *clamp_lo, + const __m256i *clamp_hi, int shift) { + __m256i offset = _mm256_set1_epi32((1 << shift) >> 1); + __m256i a0 = _mm256_add_epi32(offset, in0); + __m256i a1 = _mm256_sub_epi32(offset, in1); + + a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + + a0 = _mm256_max_epi32(a0, *clamp_lo); + a0 = _mm256_min_epi32(a0, *clamp_hi); + a1 = _mm256_max_epi32(a1, *clamp_lo); + a1 = _mm256_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static void transpose_8x8_avx2(const __m256i *in, __m256i *out) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i x0, x1; + + u0 = _mm256_unpacklo_epi32(in[0], in[1]); + u1 = _mm256_unpackhi_epi32(in[0], in[1]); + + u2 = _mm256_unpacklo_epi32(in[2], in[3]); + u3 = _mm256_unpackhi_epi32(in[2], in[3]); + + u4 = _mm256_unpacklo_epi32(in[4], in[5]); + u5 = _mm256_unpackhi_epi32(in[4], in[5]); + + u6 = _mm256_unpacklo_epi32(in[6], in[7]); + u7 = _mm256_unpackhi_epi32(in[6], in[7]); + + x0 = _mm256_unpacklo_epi64(u0, u2); + x1 = _mm256_unpacklo_epi64(u4, u6); + out[0] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[4] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u0, u2); + x1 = _mm256_unpackhi_epi64(u4, u6); + out[1] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpacklo_epi64(u1, u3); + x1 = _mm256_unpacklo_epi64(u5, u7); + out[2] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u1, u3); + x1 = _mm256_unpackhi_epi64(u5, u7); + out[3] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); +} + +static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i x0, x1; + + u0 = _mm256_unpacklo_epi32(in[7], in[6]); + u1 = _mm256_unpackhi_epi32(in[7], in[6]); + + u2 = _mm256_unpacklo_epi32(in[5], in[4]); + u3 = _mm256_unpackhi_epi32(in[5], in[4]); + + u4 = _mm256_unpacklo_epi32(in[3], in[2]); + u5 = _mm256_unpackhi_epi32(in[3], in[2]); + + u6 = _mm256_unpacklo_epi32(in[1], in[0]); + u7 = _mm256_unpackhi_epi32(in[1], in[0]); + + x0 = _mm256_unpacklo_epi64(u0, u2); + x1 = _mm256_unpacklo_epi64(u4, u6); + out[0] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[4] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u0, u2); + x1 = _mm256_unpackhi_epi64(u4, u6); + out[1] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpacklo_epi64(u1, u3); + x1 = _mm256_unpacklo_epi64(u5, u7); + out[2] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u1, u3); + x1 = _mm256_unpackhi_epi64(u5, u7); + out[3] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); +} + +static void load_buffer_32x32(const int32_t *coeff, __m256i *in, + int input_stiride, int size) { + int i; + for (i = 0; i < size; ++i) { + in[i] = 
_mm256_loadu_si256((const __m256i *)(coeff + i * input_stiride)); + } +} + +static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *rounding, int bit) { + __m256i x; + x = _mm256_mullo_epi32(*w0, *n0); + x = _mm256_add_epi32(x, *rounding); + x = _mm256_srai_epi32(x, bit); + return x; +} + +static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *w1, const __m256i *n1, + const __m256i *rounding, int bit) { + __m256i x, y; + + x = _mm256_mullo_epi32(*w0, *n0); + y = _mm256_mullo_epi32(*w1, *n1); + x = _mm256_add_epi32(x, y); + x = _mm256_add_epi32(x, *rounding); + x = _mm256_srai_epi32(x, bit); + return x; +} + +static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0, + __m256i *out1, const __m256i *clamp_lo, + const __m256i *clamp_hi) { + __m256i a0 = _mm256_add_epi32(in0, in1); + __m256i a1 = _mm256_sub_epi32(in0, in1); + + a0 = _mm256_max_epi32(a0, *clamp_lo); + a0 = _mm256_min_epi32(a0, *clamp_hi); + a1 = _mm256_max_epi32(a1, *clamp_lo); + a1 = _mm256_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static INLINE void idct32_stage4_avx2( + __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56, + const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40, + const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit); + bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit); + bf1[17] = temp1; + + temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit); + bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit); + bf1[18] = temp2; + + temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit); + bf1[21] = temp1; + + temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit); + bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit); + bf1[22] = temp2; +} + +static INLINE void idct32_stage5_avx2( + __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48, + const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo, + const __m256i *clamp_hi, const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit); + bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit); + bf1[9] = temp1; + + temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit); + bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit); + bf1[10] = temp2; + + addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); + addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); + addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); + addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); + addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage6_avx2( + __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32, + const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, 
+ const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[5] = temp1; + + addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); + addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); + addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); + addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); + + temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit); + bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit); + bf1[18] = temp1; + temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit); + bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit); + bf1[19] = temp2; + temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit); + bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit); + bf1[21] = temp2; +} + +static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); + addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); + + temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[10] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[11] = temp2; + + addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); + addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); + addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); + addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); + addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); + addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); + addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); + addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); + addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); + addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); + 
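/* editor's note: half_btf_avx2(w0, n0, w1, n1, r, bit) returns (w0 * n0 + w1 * n1 + r) >> bit per 32-bit lane, so each temp1/temp2 pair below is one butterfly rotation by cospi[32] over the pairs (bf1[20], bf1[27]) through (bf1[23], bf1[24]); the temporary keeps the first input live until its second multiply. */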
+ temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[21] = temp2; + temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[22] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[23] = temp2; +} + +static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out, + const int do_cols, const int bd, + const int out_shift, + const __m256i *clamp_lo, + const __m256i *clamp_hi) { + addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); + addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); + addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); + addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); + addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi); + addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); + addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); + addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); + addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); + addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); + addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} + +static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
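/* editor's note: intermediate dynamic range for clamping: bd + 6 bits when do_cols is set (the final column pass), bd + 8 for the row pass, never below 16. */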
6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i x; + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + x = _mm256_mullo_epi32(in[0], cospi32); + x = _mm256_add_epi32(x, rounding); + x = _mm256_srai_epi32(x, bit); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + x = _mm256_add_epi32(offset, x); + x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; + out[8] = x; + out[9] = x; + out[10] = x; + out[11] = x; + out[12] = x; + out[13] = x; + out[14] = x; + out[15] = x; + out[16] = x; + out[17] = x; + out[18] = x; + out[19] = x; + out[20] = x; + out[21] = x; + out[22] = x; + out[23] = x; + out[24] = x; + out[25] = x; + out[26] = x; + out[27] = x; + out[28] = x; + out[29] = x; + out[30] = x; + out[31] = x; +} + +static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i bf1[32]; + + { + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[4] = in[4]; + bf1[8] = in[2]; + bf1[12] = in[6]; + bf1[16] = in[1]; + bf1[20] = in[5]; + bf1[24] = in[3]; + bf1[28] = in[7]; + + // stage 2 + bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit); + bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit); + bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit); + + bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit); + bf1[17] = bf1[16]; + bf1[18] = bf1[19]; + bf1[21] = bf1[20]; + bf1[22] = bf1[23]; + bf1[25] = bf1[24]; + bf1[26] = bf1[27]; + bf1[29] = bf1[28]; + bf1[30] = bf1[31]; + + // stage 4 + bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit); + + bf1[9] = bf1[8]; + bf1[10] = bf1[11]; + bf1[13] = bf1[12]; + bf1[14] = bf1[15]; + + idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[5] = bf1[4]; + bf1[6] = bf1[7]; + + idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + bf1[3] = bf1[0]; + bf1[2] = bf1[1]; + + idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi28 = 
_mm256_set1_epi32(cospi[28]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i bf1[32]; + + { + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[2] = in[8]; + bf1[4] = in[4]; + bf1[6] = in[12]; + bf1[8] = in[2]; + bf1[10] = in[10]; + bf1[12] = in[6]; + bf1[14] = in[14]; + bf1[16] = in[1]; + bf1[18] = in[9]; + bf1[20] = in[5]; + bf1[22] = in[13]; + bf1[24] = in[3]; + bf1[26] = in[11]; + bf1[28] = in[7]; + bf1[30] = in[15]; + + // stage 2 + bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit); + bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit); + bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit); + bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit); + bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit); + bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit); + bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit); + bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit); + bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit); + bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit); + bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit); + bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit); + bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit); + bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit); + bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit); + bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit); + + addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[20], 
bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + + // stage 4 + bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit); + bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit); + bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit); + + addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); + + idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit); + bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit); + + addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + + idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); + + idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, + int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim18 = 
_mm256_set1_epi32(-cospi[18]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i bf1[32], bf0[32]; + + { + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[1] = in[16]; + bf1[2] = in[8]; + bf1[3] = in[24]; + bf1[4] = in[4]; + bf1[5] = in[20]; + bf1[6] = in[12]; + bf1[7] = in[28]; + bf1[8] = in[2]; + bf1[9] = in[18]; + bf1[10] = in[10]; + bf1[11] = in[26]; + bf1[12] = in[6]; + bf1[13] = in[22]; + bf1[14] = in[14]; + bf1[15] = in[30]; + bf1[16] = in[1]; + bf1[17] = in[17]; + bf1[18] = in[9]; + bf1[19] = in[25]; + bf1[20] = in[5]; + bf1[21] = in[21]; + bf1[22] = in[13]; + bf1[23] = in[29]; + bf1[24] = in[3]; + bf1[25] = in[19]; + bf1[26] = in[11]; + bf1[27] = in[27]; + bf1[28] = in[7]; + bf1[29] = in[23]; + bf1[30] = in[15]; + bf1[31] = in[31]; + + // stage 2 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = bf1[4]; + bf0[5] = bf1[5]; + bf0[6] = bf1[6]; + bf0[7] = bf1[7]; + bf0[8] = bf1[8]; + bf0[9] = bf1[9]; + bf0[10] = bf1[10]; + bf0[11] = bf1[11]; + bf0[12] = bf1[12]; + bf0[13] = bf1[13]; + bf0[14] = bf1[14]; + bf0[15] = bf1[15]; + bf0[16] = + half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit); + bf0[17] = + half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_avx2(&cospi6, &bf1[23], &cospim58, 
&bf1[24], &rounding, bit); + bf0[24] = + half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit); + bf0[31] = + half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit); + + // stage 3 + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = + half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit); + bf1[9] = + half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); + bf1[15] = + half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); + + addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + + // stage 4 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = + half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit); + bf0[5] = + half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit); + bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); + + addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); + + bf0[16] = bf1[16]; + bf0[17] = + half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit); + bf0[19] = bf1[19]; + bf0[20] = bf1[20]; + bf0[21] = + half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit); + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = + half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit); + bf0[27] = bf1[27]; + bf0[28] = bf1[28]; + bf0[29] = + 
+        half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
+    bf0[30] =
+        half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
+    bf0[31] = bf1[31];
+
+    // stage 5
+    bf1[0] =
+        half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
+    bf1[1] =
+        half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
+    bf1[2] =
+        half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
+    bf1[3] =
+        half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
+    addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+    bf1[8] = bf0[8];
+    bf1[9] =
+        half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
+    bf1[10] =
+        half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
+    bf1[11] = bf0[11];
+    bf1[12] = bf0[12];
+    bf1[13] =
+        half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
+    bf1[14] =
+        half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
+    bf1[15] = bf0[15];
+    addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
+
+    // stage 6
+    addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+    bf0[4] = bf1[4];
+    bf0[5] =
+        half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+    bf0[6] =
+        half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+    bf0[7] = bf1[7];
+    addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+    bf0[16] = bf1[16];
+    bf0[17] = bf1[17];
+    bf0[18] =
+        half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
+    bf0[19] =
+        half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
+    bf0[20] =
+        half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
+    bf0[21] =
+        half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
+    bf0[22] = bf1[22];
+    bf0[23] = bf1[23];
+    bf0[24] = bf1[24];
+    bf0[25] = bf1[25];
+    bf0[26] =
+        half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
+    bf0[27] =
+        half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
+    bf0[28] =
+        half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
+    bf0[29] =
+        half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
+    bf0[30] = bf1[30];
+    bf0[31] = bf1[31];
+
+    // stage 7
+    addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+    bf1[8] = bf0[8];
+    bf1[9] = bf0[9];
+    bf1[10] =
+        half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+    bf1[11] =
+        half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+    bf1[12] =
+        half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+    bf1[13] =
+        half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+    bf1[14] = bf0[14];
+    bf1[15] = bf0[15];
+    addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+    // stage 8
+    addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+    bf0[16] = bf1[16];
+    bf0[17] = bf1[17];
+    bf0[18] = bf1[18];
+    bf0[19] = bf1[19];
+    bf0[20] =
+        half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+    bf0[21] =
+        half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+    bf0[22] =
+        half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+    bf0[23] =
+        half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+    bf0[24] =
+        half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+    bf0[25] =
+        half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+    bf0[26] =
+        half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+    bf0[27] =
+        half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+    bf0[28] = bf1[28];
+    bf0[29] = bf1[29];
+    bf0[30] = bf1[30];
+    bf0[31] = bf1[31];
+
+    // stage 9
+    addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m256i clamp_lo_out =
+          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m256i clamp_hi_out =
+          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+      round_shift_8x8_avx2(out, out_shift);
+      round_shift_8x8_avx2(out + 16, out_shift);
+      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+    }
+  }
+}
+// Fast path: only the DC coefficient in[0] is nonzero, so the 16-point
+// IDCT reduces to a single scale by cospi[32] broadcast to all outputs.
+static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                             int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+
+  {
+    // stage 0
+    // stage 1
+    // stage 2
+    // stage 3
+    // stage 4
+    in[0] = _mm256_mullo_epi32(in[0], cospi32);
+    in[0] = _mm256_add_epi32(in[0], rnding);
+    in[0] = _mm256_srai_epi32(in[0], bit);
+
+    // stage 5
+    // stage 6
+    // stage 7
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+      __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+      in[0] = _mm256_add_epi32(in[0], offset);
+      in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
+    }
+    in[0] = _mm256_max_epi32(in[0], clamp_lo);
+    in[0] = _mm256_min_epi32(in[0], clamp_hi);
+    out[0] = in[0];
+    out[1] = in[0];
+    out[2] = in[0];
+    out[3] = in[0];
+    out[4] = in[0];
+    out[5] = in[0];
+    out[6] = in[0];
+    out[7] = in[0];
+    out[8] = in[0];
+    out[9] = in[0];
+    out[10] = in[0];
+    out[11] = in[0];
+    out[12] = in[0];
+    out[13] = in[0];
+    out[14] = in[0];
+    out[15] = in[0];
+  }
+}
+
+// Reduced case: only the first 8 input coefficients are nonzero.
+static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                             int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i u[16], x, y;
+
+  {
+    // stage 0
+    // stage 1
+    u[0] = in[0];
+    u[2] = in[4];
+    u[4] = in[2];
+    u[6] = in[6];
+    u[8] = in[1];
+    u[10] = in[5];
+    u[12] = in[3];
+    u[14] = in[7];
+
+    // stage 2
+    u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
+    u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
+
+    u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
+    u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
+
+    u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
+    u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
+
+    u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
+    u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
+
+    // stage 3
+    u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
+    u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
+    u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit);
+    u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit);
+
+    addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+    // stage 4
+    x = _mm256_mullo_epi32(u[0], cospi32);
+    u[0] = _mm256_add_epi32(x, rnding);
+    u[0] = _mm256_srai_epi32(u[0], bit);
+    u[1] = u[0];
+
+    u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
+    u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
+
+    addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
+
+    x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+    u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+    u[9] = x;
+    y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+    u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+    u[10] = y;
+
+    // stage 5
+    addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+    x = _mm256_mullo_epi32(u[5], cospi32);
+    y = _mm256_mullo_epi32(u[6], cospi32);
+    u[5] = _mm256_sub_epi32(y, x);
+    u[5] = _mm256_add_epi32(u[5], rnding);
+    u[5] = _mm256_srai_epi32(u[5], bit);
+
+    u[6] = _mm256_add_epi32(y, x);
+    u[6] = _mm256_add_epi32(u[6], rnding);
+    u[6] = _mm256_srai_epi32(u[6], bit);
+
+    addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    // stage 6
+    addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+
+    x = _mm256_mullo_epi32(u[10], cospi32);
+    y = _mm256_mullo_epi32(u[13], cospi32);
+    u[10] = _mm256_sub_epi32(y, x);
+    u[10] = _mm256_add_epi32(u[10], rnding);
+    u[10] = _mm256_srai_epi32(u[10], bit);
+
+    u[13] = _mm256_add_epi32(x, y);
+    u[13] = _mm256_add_epi32(u[13], rnding);
+    u[13] = _mm256_srai_epi32(u[13], bit);
+
+    x = _mm256_mullo_epi32(u[11], cospi32);
+    y = _mm256_mullo_epi32(u[12], cospi32);
+    u[11] = _mm256_sub_epi32(y, x);
+    u[11] = _mm256_add_epi32(u[11], rnding);
+    u[11] = _mm256_srai_epi32(u[11], bit);
+
+    u[12] = _mm256_add_epi32(x, y);
+    u[12] = _mm256_add_epi32(u[12], rnding);
+    u[12] = _mm256_srai_epi32(u[12], bit);
+    // stage 7
+    addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m256i clamp_lo_out =
+          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m256i clamp_hi_out =
+          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+      round_shift_8x8_avx2(out, out_shift);
+      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+    }
+  }
+}
+
+// Full 16-point inverse DCT over all 16 input coefficients.
+static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
+                        int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i u[16], v[16], x, y;
+
+  {
+    // stage 0
+    // stage 1
+    u[0] = in[0];
+    u[1] = in[8];
+    u[2] = in[4];
+    u[3] = in[12];
+    u[4] = in[2];
+    u[5] = in[10];
+    u[6] = in[6];
+    u[7] = in[14];
+    u[8] = in[1];
+    u[9] = in[9];
+    u[10] = in[5];
+    u[11] = in[13];
+    u[12] = in[3];
+    u[13] = in[11];
+    u[14] = in[7];
+    u[15] = in[15];
+
+    // stage 2
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
+    v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
+    v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
+    v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
+    v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
+    v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
+    v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
+    v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
+
+    // stage 3
+    u[0] = v[0];
+    u[1] = v[1];
+    u[2] = v[2];
+    u[3] = v[3];
+    u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
+    u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
+    u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
+    u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
+    addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+    // stage 4
+    x = _mm256_mullo_epi32(u[0], cospi32);
+    y = _mm256_mullo_epi32(u[1], cospi32);
+    v[0] = _mm256_add_epi32(x, y);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    v[0] = _mm256_srai_epi32(v[0], bit);
+
+    v[1] = _mm256_sub_epi32(x, y);
+    v[1] = _mm256_add_epi32(v[1], rnding);
+    v[1] = _mm256_srai_epi32(v[1], bit);
+
+    v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
+    v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
+    addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+    v[8] = u[8];
+    v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+    v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+    v[11] = u[11];
+    v[12] = u[12];
+    v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+    v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+    v[15] = u[15];
+
+    // stage 5
+    addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+    u[4] = v[4];
+
+    x = _mm256_mullo_epi32(v[5], cospi32);
+    y = _mm256_mullo_epi32(v[6], cospi32);
+    u[5] = _mm256_sub_epi32(y, x);
+    u[5] = _mm256_add_epi32(u[5], rnding);
+    u[5] = _mm256_srai_epi32(u[5], bit);
+
+    u[6] = _mm256_add_epi32(y, x);
+    u[6] = _mm256_add_epi32(u[6], rnding);
+    u[6] = _mm256_srai_epi32(u[6], bit);
+
+    u[7] = v[7];
+    addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    // stage 6
+    addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
+    v[8] = u[8];
+    v[9] = u[9];
+
+    x = _mm256_mullo_epi32(u[10], cospi32);
+    y = _mm256_mullo_epi32(u[13], cospi32);
+    v[10] = _mm256_sub_epi32(y, x);
+    v[10] = _mm256_add_epi32(v[10], rnding);
+    v[10] = _mm256_srai_epi32(v[10], bit);
+
+    v[13] = _mm256_add_epi32(x, y);
+    v[13] = _mm256_add_epi32(v[13], rnding);
+    v[13] = _mm256_srai_epi32(v[13], bit);
+
+    x = _mm256_mullo_epi32(u[11], cospi32);
+    y = _mm256_mullo_epi32(u[12], cospi32);
+    v[11] = _mm256_sub_epi32(y, x);
+    v[11] = _mm256_add_epi32(v[11], rnding);
+    v[11] = _mm256_srai_epi32(v[11], bit);
+
+    v[12] = _mm256_add_epi32(x, y);
+    v[12] = _mm256_add_epi32(v[12], rnding);
+    v[12] = _mm256_srai_epi32(v[12], bit);
+
+    v[14] = u[14];
+    v[15] = u[15];
+
+    // stage 7
+    addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m256i clamp_lo_out =
+          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m256i clamp_hi_out =
+          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+      round_shift_8x8_avx2(out, out_shift);
+      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+    }
+  }
+}
+
+// Fast path: only in[0] is nonzero, so the 16-point IADST is computed
+// from a single input column.
+static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                              int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i v[16], x, y, temp1, temp2;
+
+  // Calculate the column 0, 1, 2, 3
+  {
+    // stage 0
+    // stage 1
+    // stage 2
+    x = _mm256_mullo_epi32(in[0], cospi62);
+    v[0] = _mm256_add_epi32(x, rnding);
+    v[0] = _mm256_srai_epi32(v[0], bit);
+
+    x = _mm256_mullo_epi32(in[0], cospi2);
+    v[1] = _mm256_sub_epi32(zero, x);
+    v[1] = _mm256_add_epi32(v[1], rnding);
+    v[1] = _mm256_srai_epi32(v[1], bit);
+
+    // stage 3
+    v[8] = v[0];
+    v[9] = v[1];
+
+    // stage 4
+    temp1 = _mm256_mullo_epi32(v[8], cospi8);
+    x = _mm256_mullo_epi32(v[9], cospi56);
+    temp1 = _mm256_add_epi32(temp1, x);
+    temp1 = _mm256_add_epi32(temp1, rnding);
+    temp1 = _mm256_srai_epi32(temp1, bit);
+
+    temp2 = _mm256_mullo_epi32(v[8], cospi56);
+    x = _mm256_mullo_epi32(v[9], cospi8);
+    temp2 = _mm256_sub_epi32(temp2, x);
+    temp2 = _mm256_add_epi32(temp2, rnding);
+    temp2 = _mm256_srai_epi32(temp2, bit);
+    v[8] = temp1;
+    v[9] = temp2;
+
+    // stage 5
+    v[4] = v[0];
+    v[5] = v[1];
+    v[12] = v[8];
+    v[13] = v[9];
+
+    // stage 6
+    temp1 = _mm256_mullo_epi32(v[4], cospi16);
+    x = _mm256_mullo_epi32(v[5], cospi48);
+    temp1 = _mm256_add_epi32(temp1, x);
+    temp1 = _mm256_add_epi32(temp1, rnding);
+    temp1 = _mm256_srai_epi32(temp1, bit);
+
+    temp2 = _mm256_mullo_epi32(v[4], cospi48);
+    x = _mm256_mullo_epi32(v[5], cospi16);
+    temp2 = _mm256_sub_epi32(temp2, x);
+    temp2 = _mm256_add_epi32(temp2, rnding);
+    temp2 = _mm256_srai_epi32(temp2, bit);
+    v[4] = temp1;
+    v[5] = temp2;
+
+    temp1 = _mm256_mullo_epi32(v[12], cospi16);
+    x = _mm256_mullo_epi32(v[13], cospi48);
+    temp1 = _mm256_add_epi32(temp1, x);
+    temp1 = _mm256_add_epi32(temp1, rnding);
+    temp1 = _mm256_srai_epi32(temp1, bit);
+
+    temp2 = _mm256_mullo_epi32(v[12], cospi48);
+    x = _mm256_mullo_epi32(v[13], cospi16);
+    temp2 = _mm256_sub_epi32(temp2, x);
+    temp2 = _mm256_add_epi32(temp2, rnding);
+    temp2 = _mm256_srai_epi32(temp2, bit);
+    v[12] = temp1;
+    v[13] = temp2;
+
+    // stage 7
+    v[2] = v[0];
+    v[3] = v[1];
+    v[6] = v[4];
+    v[7] = v[5];
+    v[10] = v[8];
+    v[11] = v[9];
+    v[14] = v[12];
+    v[15] = v[13];
+
+    // stage 8
+    y = _mm256_mullo_epi32(v[2], cospi32);
+    x = _mm256_mullo_epi32(v[3], cospi32);
+    v[2] = _mm256_add_epi32(y, x);
+    v[2] = _mm256_add_epi32(v[2], rnding);
+    v[2] = _mm256_srai_epi32(v[2], bit);
+
+    v[3] = _mm256_sub_epi32(y, x);
+    v[3] = _mm256_add_epi32(v[3], rnding);
+    v[3] = _mm256_srai_epi32(v[3], bit);
+
+    y = _mm256_mullo_epi32(v[6], cospi32);
+    x = _mm256_mullo_epi32(v[7], cospi32);
+    v[6] = _mm256_add_epi32(y, x);
+    v[6] = _mm256_add_epi32(v[6], rnding);
+    v[6] = _mm256_srai_epi32(v[6], bit);
+
+    v[7] = _mm256_sub_epi32(y, x);
+    v[7] = _mm256_add_epi32(v[7], rnding);
+    v[7] = _mm256_srai_epi32(v[7], bit);
+
+    y = _mm256_mullo_epi32(v[10], cospi32);
+    x = _mm256_mullo_epi32(v[11], cospi32);
+    v[10] = _mm256_add_epi32(y, x);
+    v[10] = _mm256_add_epi32(v[10], rnding);
+    v[10] = _mm256_srai_epi32(v[10], bit);
+
+    v[11] = _mm256_sub_epi32(y, x);
+    v[11] = _mm256_add_epi32(v[11], rnding);
+    v[11] = _mm256_srai_epi32(v[11], bit);
+
+    y = _mm256_mullo_epi32(v[14], cospi32);
+    x = _mm256_mullo_epi32(v[15], cospi32);
+    v[14] = _mm256_add_epi32(y, x);
+    v[14] = _mm256_add_epi32(v[14], rnding);
+    v[14] = _mm256_srai_epi32(v[14], bit);
+
+    v[15] = _mm256_sub_epi32(y, x);
+    v[15] = _mm256_add_epi32(v[15], rnding);
+    v[15] = _mm256_srai_epi32(v[15], bit);
+
+    // stage 9
+    if (do_cols) {
+      out[0] = v[0];
+      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
+      out[2] = v[12];
+      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
+      out[4] = v[6];
+      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
+      out[6] = v[10];
+      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
+      out[8] = v[3];
+      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
+      out[10] = v[15];
+      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
+      out[12] = v[5];
+      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
+      out[14] = v[9];
+      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m256i clamp_lo_out =
+          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m256i clamp_hi_out =
+          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+      neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+      neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    }
+  }
+}
+
+// Reduced case: only the first 8 input coefficients are nonzero.
+static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                              int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i u[16], x, y;
+
+  {
+    // stage 0
+    // stage 1
+    // stage 2
+    __m256i zero = _mm256_setzero_si256();
+    x = _mm256_mullo_epi32(in[0], cospi62);
+    u[0] = _mm256_add_epi32(x, rnding);
+    u[0] = _mm256_srai_epi32(u[0], bit);
+
+    x = _mm256_mullo_epi32(in[0], cospi2);
+    u[1] = _mm256_sub_epi32(zero, x);
+    u[1] = _mm256_add_epi32(u[1], rnding);
+    u[1] = _mm256_srai_epi32(u[1], bit);
+
+    x = _mm256_mullo_epi32(in[2], cospi54);
+    u[2] = _mm256_add_epi32(x, rnding);
+    u[2] = _mm256_srai_epi32(u[2], bit);
+
+    x = _mm256_mullo_epi32(in[2], cospi10);
+    u[3] = _mm256_sub_epi32(zero, x);
+    u[3] = _mm256_add_epi32(u[3], rnding);
+    u[3] = _mm256_srai_epi32(u[3], bit);
+
+    x = _mm256_mullo_epi32(in[4], cospi46);
+    u[4] = _mm256_add_epi32(x, rnding);
+    u[4] = _mm256_srai_epi32(u[4], bit);
+
+    x = _mm256_mullo_epi32(in[4], cospi18);
+    u[5] = _mm256_sub_epi32(zero, x);
+    u[5] = _mm256_add_epi32(u[5], rnding);
+    u[5] = _mm256_srai_epi32(u[5], bit);
+
+    x = _mm256_mullo_epi32(in[6], cospi38);
+    u[6] = _mm256_add_epi32(x, rnding);
+    u[6] = _mm256_srai_epi32(u[6], bit);
+
+    x = _mm256_mullo_epi32(in[6], cospi26);
+    u[7] = _mm256_sub_epi32(zero, x);
+    u[7] = _mm256_add_epi32(u[7], rnding);
+    u[7] = _mm256_srai_epi32(u[7], bit);
+
+    u[8] = _mm256_mullo_epi32(in[7], cospi34);
+    u[8] = _mm256_add_epi32(u[8], rnding);
+    u[8] = _mm256_srai_epi32(u[8], bit);
+
+    u[9] = _mm256_mullo_epi32(in[7], cospi30);
+    u[9] = _mm256_add_epi32(u[9], rnding);
+    u[9] = _mm256_srai_epi32(u[9], bit);
+
+    u[10] = _mm256_mullo_epi32(in[5], cospi42);
+    u[10] = _mm256_add_epi32(u[10], rnding);
+    u[10] = _mm256_srai_epi32(u[10], bit);
+
+    u[11] = _mm256_mullo_epi32(in[5], cospi22);
+    u[11] = _mm256_add_epi32(u[11], rnding);
+    u[11] = _mm256_srai_epi32(u[11], bit);
+
+    u[12] = _mm256_mullo_epi32(in[3], cospi50);
+    u[12] = _mm256_add_epi32(u[12], rnding);
+    u[12] = _mm256_srai_epi32(u[12], bit);
+
+    u[13] = _mm256_mullo_epi32(in[3], cospi14);
+    u[13] = _mm256_add_epi32(u[13], rnding);
+    u[13] = _mm256_srai_epi32(u[13], bit);
+
+    u[14] = _mm256_mullo_epi32(in[1], cospi58);
+    u[14] = _mm256_add_epi32(u[14], rnding);
+    u[14] = _mm256_srai_epi32(u[14], bit);
+
+    u[15] = _mm256_mullo_epi32(in[1], cospi6);
+    u[15] = _mm256_add_epi32(u[15], rnding);
+    u[15] = _mm256_srai_epi32(u[15], bit);
+
+    // stage 3
+    addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 4
+    y = _mm256_mullo_epi32(u[8], cospi56);
+    x = _mm256_mullo_epi32(u[9], cospi56);
+    u[8] = _mm256_mullo_epi32(u[8], cospi8);
+    u[8] = _mm256_add_epi32(u[8], x);
+    u[8] = _mm256_add_epi32(u[8], rnding);
+    u[8] = _mm256_srai_epi32(u[8], bit);
+
+    x = _mm256_mullo_epi32(u[9], cospi8);
+    u[9] = _mm256_sub_epi32(y, x);
+    u[9] = _mm256_add_epi32(u[9], rnding);
+    u[9] = _mm256_srai_epi32(u[9], bit);
+
+    x = _mm256_mullo_epi32(u[11], cospi24);
+    y = _mm256_mullo_epi32(u[10], cospi24);
+    u[10] = _mm256_mullo_epi32(u[10], cospi40);
+    u[10] = _mm256_add_epi32(u[10], x);
+    u[10] = _mm256_add_epi32(u[10], rnding);
+    u[10] = _mm256_srai_epi32(u[10], bit);
+
+    x = _mm256_mullo_epi32(u[11], cospi40);
+    u[11] = _mm256_sub_epi32(y, x);
+    u[11] = _mm256_add_epi32(u[11], rnding);
+    u[11] = _mm256_srai_epi32(u[11], bit);
+
+    x = _mm256_mullo_epi32(u[13], cospi8);
+    y = _mm256_mullo_epi32(u[12], cospi8);
+    u[12] = _mm256_mullo_epi32(u[12], cospim56);
+    u[12] = _mm256_add_epi32(u[12], x);
+    u[12] = _mm256_add_epi32(u[12], rnding);
+    u[12] = _mm256_srai_epi32(u[12], bit);
+
+    x = _mm256_mullo_epi32(u[13], cospim56);
+    u[13] = _mm256_sub_epi32(y, x);
+    u[13] = _mm256_add_epi32(u[13], rnding);
+    u[13] = _mm256_srai_epi32(u[13], bit);
+
+    x = _mm256_mullo_epi32(u[15], cospi40);
+    y = _mm256_mullo_epi32(u[14], cospi40);
+    u[14] = _mm256_mullo_epi32(u[14], cospim24);
+    u[14] = _mm256_add_epi32(u[14], x);
+    u[14] = _mm256_add_epi32(u[14], rnding);
+    u[14] = _mm256_srai_epi32(u[14], bit);
+
+    x = _mm256_mullo_epi32(u[15], cospim24);
+    u[15] = _mm256_sub_epi32(y, x);
+    u[15] = _mm256_add_epi32(u[15], rnding);
+    u[15] = _mm256_srai_epi32(u[15], bit);
+
+    // stage 5
+    addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 6
+    x = _mm256_mullo_epi32(u[5], cospi48);
+    y = _mm256_mullo_epi32(u[4], cospi48);
+    u[4] = _mm256_mullo_epi32(u[4], cospi16);
+    u[4] = _mm256_add_epi32(u[4], x);
+    u[4] = _mm256_add_epi32(u[4], rnding);
+    u[4] = _mm256_srai_epi32(u[4], bit);
+
+    x = _mm256_mullo_epi32(u[5], cospi16);
+    u[5] = _mm256_sub_epi32(y, x);
+    u[5] = _mm256_add_epi32(u[5], rnding);
+    u[5] = _mm256_srai_epi32(u[5], bit);
+
+    x = _mm256_mullo_epi32(u[7], cospi16);
+    y = _mm256_mullo_epi32(u[6], cospi16);
+    u[6] = _mm256_mullo_epi32(u[6], cospim48);
+    u[6] = _mm256_add_epi32(u[6], x);
+    u[6] = _mm256_add_epi32(u[6], rnding);
+    u[6] = _mm256_srai_epi32(u[6], bit);
+
+    x = _mm256_mullo_epi32(u[7], cospim48);
+    u[7] = _mm256_sub_epi32(y, x);
+    u[7] = _mm256_add_epi32(u[7], rnding);
+    u[7] = _mm256_srai_epi32(u[7], bit);
+
+    x = _mm256_mullo_epi32(u[13], cospi48);
+    y = _mm256_mullo_epi32(u[12], cospi48);
+    u[12] = _mm256_mullo_epi32(u[12], cospi16);
+    u[12] = _mm256_add_epi32(u[12], x);
+    u[12] = _mm256_add_epi32(u[12], rnding);
+    u[12] = _mm256_srai_epi32(u[12], bit);
+
+    x = _mm256_mullo_epi32(u[13], cospi16);
+    u[13] = _mm256_sub_epi32(y, x);
+    u[13] = _mm256_add_epi32(u[13], rnding);
+    u[13] = _mm256_srai_epi32(u[13], bit);
+
+    x = _mm256_mullo_epi32(u[15], cospi16);
+    y = _mm256_mullo_epi32(u[14], cospi16);
+    u[14] = _mm256_mullo_epi32(u[14], cospim48);
+    u[14] = _mm256_add_epi32(u[14], x);
+    u[14] = _mm256_add_epi32(u[14], rnding);
+    u[14] = _mm256_srai_epi32(u[14], bit);
+
+    x = _mm256_mullo_epi32(u[15], cospim48);
+    u[15] = _mm256_sub_epi32(y, x);
+    u[15] = _mm256_add_epi32(u[15], rnding);
+    u[15] = _mm256_srai_epi32(u[15], bit);
+
+    // stage 7
+    addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 8
+    y = _mm256_mullo_epi32(u[2], cospi32);
+    x = _mm256_mullo_epi32(u[3], cospi32);
+    u[2] = _mm256_add_epi32(y, x);
+    u[2] = _mm256_add_epi32(u[2], rnding);
+    u[2] = _mm256_srai_epi32(u[2], bit);
+
+    u[3] = _mm256_sub_epi32(y, x);
+    u[3] = _mm256_add_epi32(u[3], rnding);
+    u[3] = _mm256_srai_epi32(u[3], bit);
+    y = _mm256_mullo_epi32(u[6], cospi32);
+    x = _mm256_mullo_epi32(u[7], cospi32);
+    u[6] = _mm256_add_epi32(y, x);
+    u[6] = _mm256_add_epi32(u[6], rnding);
+    u[6] = _mm256_srai_epi32(u[6], bit);
+
+    u[7] = _mm256_sub_epi32(y, x);
+    u[7] = _mm256_add_epi32(u[7], rnding);
+    u[7] = _mm256_srai_epi32(u[7], bit);
+
+    y = _mm256_mullo_epi32(u[10], cospi32);
+    x = _mm256_mullo_epi32(u[11], cospi32);
+    u[10] = _mm256_add_epi32(y, x);
+    u[10] = _mm256_add_epi32(u[10], rnding);
+    u[10] = _mm256_srai_epi32(u[10], bit);
+
+    u[11] = _mm256_sub_epi32(y, x);
+    u[11] = _mm256_add_epi32(u[11], rnding);
+    u[11] = _mm256_srai_epi32(u[11], bit);
+
+    y = _mm256_mullo_epi32(u[14], cospi32);
+    x = _mm256_mullo_epi32(u[15], cospi32);
+    u[14] = _mm256_add_epi32(y, x);
+    u[14] = _mm256_add_epi32(u[14], rnding);
+    u[14] = _mm256_srai_epi32(u[14], bit);
+
+    u[15] = _mm256_sub_epi32(y, x);
+    u[15] = _mm256_add_epi32(u[15], rnding);
+    u[15] = _mm256_srai_epi32(u[15], bit);
+
+    // stage 9
+    if (do_cols) {
+      out[0] = u[0];
+      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]);
+      out[2] = u[12];
+      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]);
+      out[4] = u[6];
+      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]);
+      out[6] = u[10];
+      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]);
+      out[8] = u[3];
+      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]);
+      out[10] = u[15];
+      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]);
+      out[12] = u[5];
+      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]);
+      out[14] = u[9];
+      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]);
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m256i clamp_lo_out =
+          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m256i clamp_hi_out =
+          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+      neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+      neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    }
+  }
+}
+
+// Full 16-point inverse ADST over all 16 input coefficients.
+static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                         int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i u[16], v[16], x, y;
+
+  {
+    // stage 0
+    // stage 1
+    // stage 2
+    v[0] = _mm256_mullo_epi32(in[15], cospi2);
+    x = _mm256_mullo_epi32(in[0], cospi62);
+    v[0] = _mm256_add_epi32(v[0], x);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    v[0] = _mm256_srai_epi32(v[0], bit);
+
+    v[1] = _mm256_mullo_epi32(in[15], cospi62);
+    x = _mm256_mullo_epi32(in[0], cospi2);
+    v[1] = _mm256_sub_epi32(v[1], x);
+    v[1] = _mm256_add_epi32(v[1], rnding);
+    v[1] = _mm256_srai_epi32(v[1], bit);
+
+    v[2] = _mm256_mullo_epi32(in[13], cospi10);
+    x = _mm256_mullo_epi32(in[2], cospi54);
+    v[2] = _mm256_add_epi32(v[2], x);
+    v[2] = _mm256_add_epi32(v[2], rnding);
+    v[2] = _mm256_srai_epi32(v[2], bit);
+
+    v[3] = _mm256_mullo_epi32(in[13], cospi54);
+    x = _mm256_mullo_epi32(in[2], cospi10);
+    v[3] = _mm256_sub_epi32(v[3], x);
+    v[3] = _mm256_add_epi32(v[3], rnding);
+    v[3] = _mm256_srai_epi32(v[3], bit);
+
+    v[4] = _mm256_mullo_epi32(in[11], cospi18);
+    x = _mm256_mullo_epi32(in[4], cospi46);
+    v[4] = _mm256_add_epi32(v[4], x);
+    v[4] = _mm256_add_epi32(v[4], rnding);
+    v[4] = _mm256_srai_epi32(v[4], bit);
+
+    v[5] = _mm256_mullo_epi32(in[11], cospi46);
+    x = _mm256_mullo_epi32(in[4], cospi18);
+    v[5] = _mm256_sub_epi32(v[5], x);
+    v[5] = _mm256_add_epi32(v[5], rnding);
+    v[5] = _mm256_srai_epi32(v[5], bit);
+
+    v[6] = _mm256_mullo_epi32(in[9], cospi26);
+    x = _mm256_mullo_epi32(in[6], cospi38);
+    v[6] = _mm256_add_epi32(v[6], x);
+    v[6] = _mm256_add_epi32(v[6], rnding);
+    v[6] = _mm256_srai_epi32(v[6], bit);
+
+    v[7] = _mm256_mullo_epi32(in[9], cospi38);
+    x = _mm256_mullo_epi32(in[6], cospi26);
+    v[7] = _mm256_sub_epi32(v[7], x);
+    v[7] = _mm256_add_epi32(v[7], rnding);
+    v[7] = _mm256_srai_epi32(v[7], bit);
+
+    v[8] = _mm256_mullo_epi32(in[7], cospi34);
+    x = _mm256_mullo_epi32(in[8], cospi30);
+    v[8] = _mm256_add_epi32(v[8], x);
+    v[8] = _mm256_add_epi32(v[8], rnding);
+    v[8] = _mm256_srai_epi32(v[8], bit);
+
+    v[9] = _mm256_mullo_epi32(in[7], cospi30);
+    x = _mm256_mullo_epi32(in[8], cospi34);
+    v[9] = _mm256_sub_epi32(v[9], x);
+    v[9] = _mm256_add_epi32(v[9], rnding);
+    v[9] = _mm256_srai_epi32(v[9], bit);
+
+    v[10] = _mm256_mullo_epi32(in[5], cospi42);
+    x = _mm256_mullo_epi32(in[10], cospi22);
+    v[10] = _mm256_add_epi32(v[10], x);
+    v[10] = _mm256_add_epi32(v[10], rnding);
+    v[10] = _mm256_srai_epi32(v[10], bit);
+
+    v[11] = _mm256_mullo_epi32(in[5], cospi22);
+    x = _mm256_mullo_epi32(in[10], cospi42);
+    v[11] = _mm256_sub_epi32(v[11], x);
+    v[11] = _mm256_add_epi32(v[11], rnding);
+    v[11] = _mm256_srai_epi32(v[11], bit);
+
+    v[12] = _mm256_mullo_epi32(in[3], cospi50);
+    x = _mm256_mullo_epi32(in[12], cospi14);
+    v[12] = _mm256_add_epi32(v[12], x);
+    v[12] = _mm256_add_epi32(v[12], rnding);
+    v[12] = _mm256_srai_epi32(v[12], bit);
+
+    v[13] = _mm256_mullo_epi32(in[3], cospi14);
+    x = _mm256_mullo_epi32(in[12], cospi50);
+    v[13] = _mm256_sub_epi32(v[13], x);
+    v[13] = _mm256_add_epi32(v[13], rnding);
+    v[13] = _mm256_srai_epi32(v[13], bit);
+
+    v[14] = _mm256_mullo_epi32(in[1], cospi58);
+    x = _mm256_mullo_epi32(in[14], cospi6);
+    v[14] = _mm256_add_epi32(v[14], x);
+    v[14] = _mm256_add_epi32(v[14], rnding);
+    v[14] = _mm256_srai_epi32(v[14], bit);
+
+    v[15] = _mm256_mullo_epi32(in[1], cospi6);
+    x = _mm256_mullo_epi32(in[14], cospi58);
+    v[15] = _mm256_sub_epi32(v[15], x);
+    v[15] = _mm256_add_epi32(v[15], rnding);
+    v[15] = _mm256_srai_epi32(v[15], bit);
+
+    // stage 3
+    addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 4
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = _mm256_mullo_epi32(u[8], cospi8);
+    x = _mm256_mullo_epi32(u[9], cospi56);
+    v[8] = _mm256_add_epi32(v[8], x);
+    v[8] = _mm256_add_epi32(v[8], rnding);
+    v[8] = _mm256_srai_epi32(v[8], bit);
+
+    v[9] = _mm256_mullo_epi32(u[8], cospi56);
+    x = _mm256_mullo_epi32(u[9], cospi8);
+    v[9] = _mm256_sub_epi32(v[9], x);
+    v[9] = _mm256_add_epi32(v[9], rnding);
+    v[9] = _mm256_srai_epi32(v[9], bit);
+
+    v[10] = _mm256_mullo_epi32(u[10], cospi40);
+    x = _mm256_mullo_epi32(u[11], cospi24);
+    v[10] = _mm256_add_epi32(v[10], x);
+    v[10] = _mm256_add_epi32(v[10], rnding);
+    v[10] = _mm256_srai_epi32(v[10], bit);
+
+    v[11] = _mm256_mullo_epi32(u[10], cospi24);
+    x = _mm256_mullo_epi32(u[11], cospi40);
+    v[11] = _mm256_sub_epi32(v[11], x);
+    v[11] = _mm256_add_epi32(v[11], rnding);
+    v[11] = _mm256_srai_epi32(v[11], bit);
+
+    v[12] = _mm256_mullo_epi32(u[12], cospim56);
+    x = _mm256_mullo_epi32(u[13], cospi8);
+    v[12] = _mm256_add_epi32(v[12], x);
+    v[12] = _mm256_add_epi32(v[12], rnding);
+    v[12] = _mm256_srai_epi32(v[12], bit);
+
+    v[13] = _mm256_mullo_epi32(u[12], cospi8);
+    x = _mm256_mullo_epi32(u[13], cospim56);
+    v[13] = _mm256_sub_epi32(v[13], x);
+    v[13] = _mm256_add_epi32(v[13], rnding);
+    v[13] = _mm256_srai_epi32(v[13], bit);
+
+    v[14] = _mm256_mullo_epi32(u[14], cospim24);
+    x = _mm256_mullo_epi32(u[15], cospi40);
+    v[14] = _mm256_add_epi32(v[14], x);
+    v[14] = _mm256_add_epi32(v[14], rnding);
+    v[14] = _mm256_srai_epi32(v[14], bit);
+
+    v[15] = _mm256_mullo_epi32(u[14], cospi40);
+    x = _mm256_mullo_epi32(u[15], cospim24);
+    v[15]
= _mm256_sub_epi32(v[15], x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 5 + addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = _mm256_mullo_epi32(u[4], cospi16); + x = _mm256_mullo_epi32(u[5], cospi48); + v[4] = _mm256_add_epi32(v[4], x); + v[4] = _mm256_add_epi32(v[4], rnding); + v[4] = _mm256_srai_epi32(v[4], bit); + + v[5] = _mm256_mullo_epi32(u[4], cospi48); + x = _mm256_mullo_epi32(u[5], cospi16); + v[5] = _mm256_sub_epi32(v[5], x); + v[5] = _mm256_add_epi32(v[5], rnding); + v[5] = _mm256_srai_epi32(v[5], bit); + + v[6] = _mm256_mullo_epi32(u[6], cospim48); + x = _mm256_mullo_epi32(u[7], cospi16); + v[6] = _mm256_add_epi32(v[6], x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_mullo_epi32(u[6], cospi16); + x = _mm256_mullo_epi32(u[7], cospim48); + v[7] = _mm256_sub_epi32(v[7], x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = _mm256_mullo_epi32(u[12], cospi16); + x = _mm256_mullo_epi32(u[13], cospi48); + v[12] = _mm256_add_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[13] = _mm256_mullo_epi32(u[12], cospi48); + x = _mm256_mullo_epi32(u[13], cospi16); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[14] = _mm256_mullo_epi32(u[14], cospim48); + x = _mm256_mullo_epi32(u[15], cospi16); + v[14] = _mm256_add_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_mullo_epi32(u[14], cospi16); + x = _mm256_mullo_epi32(u[15], cospim48); + v[15] = _mm256_sub_epi32(v[15], x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 7 + addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = _mm256_mullo_epi32(u[2], cospi32); + x = _mm256_mullo_epi32(u[3], cospi32); + v[2] = _mm256_add_epi32(y, x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_sub_epi32(y, x); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = _mm256_mullo_epi32(u[6], cospi32); + x = _mm256_mullo_epi32(u[7], cospi32); + v[6] = _mm256_add_epi32(y, x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = 
_mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_sub_epi32(y, x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = _mm256_mullo_epi32(u[10], cospi32); + x = _mm256_mullo_epi32(u[11], cospi32); + v[10] = _mm256_add_epi32(y, x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_sub_epi32(y, x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = _mm256_mullo_epi32(u[14], cospi32); + x = _mm256_mullo_epi32(u[15], cospi32); + v[14] = _mm256_add_epi32(y, x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_sub_epi32(y, x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]); + out[2] = v[12]; + out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]); + out[4] = v[6]; + out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]); + out[6] = v[10]; + out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]); + out[8] = v[3]; + out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]); + out[10] = v[15]; + out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]); + out[12] = v[5]; + out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]); + out[14] = v[9]; + out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} +static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i x; + + // stage 0 + // stage 1 + // stage 2 + // stage 3 + x = _mm256_mullo_epi32(in[0], cospi32); + x = _mm256_add_epi32(x, rnding); + x = _mm256_srai_epi32(x, bit); + + // stage 4 + // stage 5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + x = _mm256_add_epi32(x, offset); + x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; +} +static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i v0, v1, v2, v3, v4, v5, v6, v7; + __m256i x, y; + + // stage 0 + // stage 1 + // stage 2 + u0 = in[0]; + u1 = in[4]; + u2 = in[2]; + u3 = in[6]; + + x = _mm256_mullo_epi32(in[1], cospi56); + y = _mm256_mullo_epi32(in[7], cospim8); + u4 = _mm256_add_epi32(x, y); + u4 = _mm256_add_epi32(u4, rnding); + u4 = _mm256_srai_epi32(u4, bit); + + x = _mm256_mullo_epi32(in[1], cospi8); + y = _mm256_mullo_epi32(in[7], cospi56); + u7 = _mm256_add_epi32(x, y); + u7 = _mm256_add_epi32(u7, rnding); + u7 = _mm256_srai_epi32(u7, bit); + + x = _mm256_mullo_epi32(in[5], cospi24); + y = _mm256_mullo_epi32(in[3], cospim40); + u5 = _mm256_add_epi32(x, y); + u5 = _mm256_add_epi32(u5, rnding); + u5 = _mm256_srai_epi32(u5, bit); + + x = _mm256_mullo_epi32(in[5], cospi40); + y = _mm256_mullo_epi32(in[3], cospi24); + u6 = _mm256_add_epi32(x, y); + u6 = _mm256_add_epi32(u6, rnding); + u6 = _mm256_srai_epi32(u6, bit); + + // stage 3 + x = _mm256_mullo_epi32(u0, cospi32); + y = _mm256_mullo_epi32(u1, cospi32); + v0 = _mm256_add_epi32(x, y); + v0 = _mm256_add_epi32(v0, rnding); + v0 = _mm256_srai_epi32(v0, bit); + + v1 = _mm256_sub_epi32(x, y); + v1 = _mm256_add_epi32(v1, rnding); + v1 = _mm256_srai_epi32(v1, bit); + + x = _mm256_mullo_epi32(u2, cospi48); + y = _mm256_mullo_epi32(u3, cospim16); + v2 = _mm256_add_epi32(x, y); + v2 = _mm256_add_epi32(v2, rnding); + v2 = _mm256_srai_epi32(v2, bit); + + x = _mm256_mullo_epi32(u2, cospi16); + y = _mm256_mullo_epi32(u3, cospi48); + v3 = _mm256_add_epi32(x, y); + v3 = _mm256_add_epi32(v3, rnding); + v3 = _mm256_srai_epi32(v3, bit); + + addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_avx2(u7, u6, 
&v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; + u7 = v7; + + x = _mm256_mullo_epi32(v5, cospi32); + y = _mm256_mullo_epi32(v6, cospi32); + u6 = _mm256_add_epi32(y, x); + u6 = _mm256_add_epi32(u6, rnding); + u6 = _mm256_srai_epi32(u6, bit); + + u5 = _mm256_sub_epi32(y, x); + u5 = _mm256_add_epi32(u5, rnding); + u5 = _mm256_srai_epi32(u5, bit); + + addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi); + addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi); + addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi); + addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi); + // stage 5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_4x4_avx2(out, out_shift); + round_shift_4x4_avx2(out + 4, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8); + } +} +static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i kZero = _mm256_setzero_si256(); + __m256i u[8], x; + + // stage 0 + // stage 1 + // stage 2 + + x = _mm256_mullo_epi32(in[0], cospi60); + u[0] = _mm256_add_epi32(x, rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + x = _mm256_mullo_epi32(in[0], cospi4); + u[1] = _mm256_sub_epi32(kZero, x); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + // stage 3 + // stage 4 + __m256i temp1, temp2; + temp1 = _mm256_mullo_epi32(u[0], cospi16); + x = _mm256_mullo_epi32(u[1], cospi48); + temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + u[4] = temp1; + + temp2 = _mm256_mullo_epi32(u[0], cospi48); + x = _mm256_mullo_epi32(u[1], cospi16); + u[5] = _mm256_sub_epi32(temp2, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + // stage 5 + // stage 6 + temp1 = _mm256_mullo_epi32(u[0], cospi32); + x = _mm256_mullo_epi32(u[1], cospi32); + u[2] = _mm256_add_epi32(temp1, x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_sub_epi32(temp1, x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + temp1 = _mm256_mullo_epi32(u[4], cospi32); + x = _mm256_mullo_epi32(u[5], cospi32); + u[6] = _mm256_add_epi32(temp1, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_sub_epi32(temp1, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm256_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm256_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm256_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm256_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << 
(log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} + +static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i kZero = _mm256_setzero_si256(); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[8], v[8], x; + + // stage 0 + // stage 1 + // stage 2 + + u[0] = _mm256_mullo_epi32(in[7], cospi4); + x = _mm256_mullo_epi32(in[0], cospi60); + u[0] = _mm256_add_epi32(u[0], x); + u[0] = _mm256_add_epi32(u[0], rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + u[1] = _mm256_mullo_epi32(in[7], cospi60); + x = _mm256_mullo_epi32(in[0], cospi4); + u[1] = _mm256_sub_epi32(u[1], x); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + u[2] = _mm256_mullo_epi32(in[5], cospi20); + x = _mm256_mullo_epi32(in[2], cospi44); + u[2] = _mm256_add_epi32(u[2], x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_mullo_epi32(in[5], cospi44); + x = _mm256_mullo_epi32(in[2], cospi20); + u[3] = _mm256_sub_epi32(u[3], x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + u[4] = _mm256_mullo_epi32(in[3], cospi36); + x = _mm256_mullo_epi32(in[4], cospi28); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + u[5] = _mm256_mullo_epi32(in[3], cospi28); + x = _mm256_mullo_epi32(in[4], cospi36); + u[5] = _mm256_sub_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(in[1], cospi52); + x = _mm256_mullo_epi32(in[6], cospi12); + u[6] = _mm256_add_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_mullo_epi32(in[1], cospi12); + x = _mm256_mullo_epi32(in[6], cospi52); + u[7] = _mm256_sub_epi32(u[7], x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 3 + addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[7], &v[3], 
&v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm256_mullo_epi32(v[4], cospi16); + x = _mm256_mullo_epi32(v[5], cospi48); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + u[5] = _mm256_mullo_epi32(v[4], cospi48); + x = _mm256_mullo_epi32(v[5], cospi16); + u[5] = _mm256_sub_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(v[6], cospim48); + x = _mm256_mullo_epi32(v[7], cospi16); + u[6] = _mm256_add_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_mullo_epi32(v[6], cospi16); + x = _mm256_mullo_epi32(v[7], cospim48); + u[7] = _mm256_sub_epi32(u[7], x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 5 + addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm256_mullo_epi32(v[2], cospi32); + x = _mm256_mullo_epi32(v[3], cospi32); + u[2] = _mm256_add_epi32(v[0], x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_sub_epi32(v[0], x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + v[0] = _mm256_mullo_epi32(v[6], cospi32); + x = _mm256_mullo_epi32(v[7], cospi32); + u[6] = _mm256_add_epi32(v[0], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_sub_epi32(v[0], x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm256_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm256_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm256_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm256_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} +static INLINE void idct64_stage8_avx2( + __m256i *u, const __m256i *cospim32, const __m256i *cospi32, + const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, + const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, + const __m256i *rnding, int bit) { + int i; + __m256i temp1, temp2, temp3, temp4; + temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit); + u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit); + u[10] = temp1; + temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit); + u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit); + u[11] = temp2; + + for (i = 16; i < 20; ++i) { + addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); + addsub_avx2(u[i ^ 15], 
u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi); + } + + temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit); + temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit); + temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit); + temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit); + u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit); + u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit); + u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit); + u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit); + u[36] = temp1; + u[37] = temp2; + u[38] = temp3; + u[39] = temp4; + + temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit); + temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit); + temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit); + temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit); + u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit); + u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit); + u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit); + u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; +} + +static INLINE void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rnding, int bit) { + int i; + __m256i temp1, temp2, temp3, temp4; + for (i = 0; i < 8; ++i) { + addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit); + temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit); + temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit); + temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit); + u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit); + u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit); + u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit); + u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit); + u[20] = temp1; + u[21] = temp2; + u[22] = temp3; + u[23] = temp4; + for (i = 32; i < 40; i++) { + addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); + } +} + +static INLINE void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rnding, int bit) { + __m256i temp1, temp2, temp3, temp4; + for (int i = 0; i < 16; i++) { + addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit); + temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit); + temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit); + temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit); + u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit); + u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit); + u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit); + u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + 
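+ // NOTE (illustrative, an editorial gloss rather than upstream code):
+ // half_btf_avx2(w0, a, w1, b, rnd, bit) evaluates the scalar butterfly
+ // (w0 * a + w1 * b + (1 << (bit - 1))) >> bit in each of the eight 32-bit
+ // lanes. The temp1..temp4 staging lets the u[52]..u[55] updates read the
+ // original u[40]..u[43] before those write-backs land.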
u[43] = temp4; + + temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit); + temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit); + temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit); + temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit); + u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit); + u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit); + u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit); + u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit); + u[44] = temp1; + u[45] = temp2; + u[46] = temp3; + u[47] = temp4; +} + +static INLINE void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols, + int bd, int out_shift, + const __m256i *clamp_lo, + const __m256i *clamp_hi) { + for (int i = 0; i < 32; i++) { + addsub_avx2(u[i], u[63 - i], &out[i], &out[63 - i], clamp_lo, clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + round_shift_8x8_avx2(out + 32, out_shift); + round_shift_8x8_avx2(out + 48, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64); + } +} + +static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + + { + __m256i x; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit); + + // stage 7 + // stage 8 + // stage 9 + // stage 10 + // stage 11 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + x = _mm256_add_epi32(x, offset); + x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + } + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); + for (int i = 0; i < 64; i++) { + out[i] = x; + } + } +} +static void idct64_low8_avx2(__m256i *in,
__m256i *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); + const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); + const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + + { + __m256i u[64]; + + // stage 1 + u[0] = in[0]; + u[8] = in[4]; + u[16] = in[2]; + u[24] = in[6]; + u[32] = in[1]; + u[40] = in[5]; + u[48] = in[3]; + u[56] = in[7]; + + // stage 2 + u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[38] = u[39]; + u[41] = u[40]; + u[46] = u[47]; + u[49] = u[48]; + u[54] = u[55]; + u[57] = u[56]; + 
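+ // NOTE (illustrative, an editorial gloss rather than upstream code): only
+ // eight input coefficients are nonzero in this low8 path, so the stage-3
+ // add/sub butterflies collapse to the plain copies above (and the final
+ // u[62] = u[63] below) -- adding or subtracting a zero operand leaves the
+ // stage-2 value in both output slots.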
u[62] = u[63]; + + // stage 4 + __m256i temp1, temp2; + u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + u[17] = u[16]; + u[22] = u[23]; + u[25] = u[24]; + u[30] = u[31]; + + temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = temp1; + + temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = temp2; + + temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = temp1; + + temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[46] = temp2; + + // stage 5 + u[9] = u[8]; + u[14] = u[15]; + + temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = temp1; + + temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[22] = temp2; + + u[35] = u[32]; + u[34] = u[33]; + u[36] = u[39]; + u[37] = u[38]; + u[43] = u[40]; + u[42] = u[41]; + u[44] = u[47]; + u[45] = u[46]; + u[51] = u[48]; + u[50] = u[49]; + u[52] = u[55]; + u[53] = u[54]; + u[59] = u[56]; + u[58] = u[57]; + u[60] = u[63]; + u[61] = u[62]; + + // stage 6 + temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[0] = temp1; + + temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = temp2; + u[19] = u[16]; + u[18] = u[17]; + u[20] = u[23]; + u[21] = u[22]; + u[27] = u[24]; + u[26] = u[25]; + u[28] = u[31]; + u[29] = u[30]; + + temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = temp1; + temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[35] = temp2; + temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[36] = temp1; + temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[37] = temp2; + temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = temp1; + temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[43] = temp2; + temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[44] = temp1; + temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[45] = temp2; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + u[11] = u[8]; + u[10] = u[9]; + u[12] = u[15]; + u[13] = u[14]; + + temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], 
&rnding, bit); + u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = temp1; + temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); + u[19] = temp2; + temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[20] = temp1; + temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[21] = temp2; + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + u[7] = u[0]; + u[6] = u[1]; + u[5] = u[2]; + u[4] = u[3]; + + idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} +static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi9 = _mm256_set1_epi32(cospi[9]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi11 = _mm256_set1_epi32(cospi[11]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi13 = _mm256_set1_epi32(cospi[13]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi15 = _mm256_set1_epi32(cospi[15]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi51 = _mm256_set1_epi32(cospi[51]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi55 = _mm256_set1_epi32(cospi[55]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const 
__m256i cospi63 = _mm256_set1_epi32(cospi[63]); + + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + + { + __m256i u[64]; + __m256i tmp1, tmp2, tmp3, tmp4; + // stage 1 + u[0] = in[0]; + u[32] = in[1]; + u[36] = in[9]; + u[40] = in[5]; + u[44] = in[13]; + u[48] = in[3]; + u[52] = in[11]; + u[56] = in[7]; + u[60] = in[15]; + u[16] = in[2]; + u[20] = in[10]; + u[24] = in[6]; + u[28] = in[14]; + u[4] = in[8]; + u[8] = in[4]; + u[12] = in[12]; + + // stage 2 + u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit); + u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit); + u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit); + u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit); + u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit); + u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit); + u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit); + u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit); + u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit); + u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit); + u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit); + u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[34] = u[35]; + u[37] = u[36]; + u[38] = u[39]; + u[41] = u[40]; + u[42] = u[43]; + u[45] = u[44]; + u[46] = u[47]; + u[49] = u[48]; + u[50] = u[51]; + u[53] = u[52]; + u[54] = u[55]; + u[57] = u[56]; + u[58] = u[59]; + u[61] = u[60]; + u[62] = u[63]; + + // stage 4 + u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_avx2(&cospi12, 
&u[12], &rnding, bit); + + u[17] = u[16]; + u[18] = u[19]; + u[21] = u[20]; + u[22] = u[23]; + u[25] = u[24]; + u[26] = u[27]; + u[29] = u[28]; + u[30] = u[31]; + + tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = tmp1; + u[34] = tmp2; + u[37] = tmp3; + u[38] = tmp4; + + tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = tmp1; + u[42] = tmp2; + u[45] = tmp3; + u[46] = tmp4; + + // stage 5 + u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit); + + u[9] = u[8]; + u[10] = u[11]; + u[13] = u[12]; + u[14] = u[15]; + + tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit); + tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit); + tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit); + u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = tmp1; + u[18] = tmp2; + u[21] = tmp3; + u[22] = tmp4; + + for (i = 32; i < 64; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[0] = tmp1; + u[5] = u[4]; + u[6] = u[7]; + + tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = tmp1; + tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = tmp2; + + for (i = 16; i < 32; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &u[i + 
6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = tmp1; + u[35] = tmp2; + u[36] = tmp3; + u[37] = tmp4; + + tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = tmp1; + u[43] = tmp2; + u[44] = tmp3; + u[45] = tmp4; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit); + u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit); + u[5] = tmp1; + addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); + u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = tmp1; + u[19] = tmp2; + u[20] = tmp3; + u[21] = tmp4; + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); + } + + idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} +static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, + int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols 
? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi9 = _mm256_set1_epi32(cospi[9]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi11 = _mm256_set1_epi32(cospi[11]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi13 = _mm256_set1_epi32(cospi[13]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi15 = _mm256_set1_epi32(cospi[15]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi17 = _mm256_set1_epi32(cospi[17]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi19 = _mm256_set1_epi32(cospi[19]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi21 = _mm256_set1_epi32(cospi[21]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi23 = _mm256_set1_epi32(cospi[23]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi25 = _mm256_set1_epi32(cospi[25]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi27 = _mm256_set1_epi32(cospi[27]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi29 = _mm256_set1_epi32(cospi[29]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi31 = _mm256_set1_epi32(cospi[31]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi35 = _mm256_set1_epi32(cospi[35]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi39 = _mm256_set1_epi32(cospi[39]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi43 = _mm256_set1_epi32(cospi[43]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi47 = _mm256_set1_epi32(cospi[47]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi51 = _mm256_set1_epi32(cospi[51]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi55 = _mm256_set1_epi32(cospi[55]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); + + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]); + const __m256i cospim34 = 
_mm256_set1_epi32(-cospi[34]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]); + const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + + { + __m256i u[64], v[64]; + + // stage 1 + u[32] = in[1]; + u[34] = in[17]; + u[36] = in[9]; + u[38] = in[25]; + u[40] = in[5]; + u[42] = in[21]; + u[44] = in[13]; + u[46] = in[29]; + u[48] = in[3]; + u[50] = in[19]; + u[52] = in[11]; + u[54] = in[27]; + u[56] = in[7]; + u[58] = in[23]; + u[60] = in[15]; + u[62] = in[31]; + + v[16] = in[2]; + v[18] = in[18]; + v[20] = in[10]; + v[22] = in[26]; + v[24] = in[6]; + v[26] = in[22]; + v[28] = in[14]; + v[30] = in[30]; + + u[8] = in[4]; + u[10] = in[20]; + u[12] = in[12]; + u[14] = in[28]; + + v[4] = in[8]; + v[6] = in[24]; + + u[0] = in[0]; + u[2] = in[16]; + + // stage 2 + v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit); + v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit); + v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit); + v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit); + v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit); + v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit); + v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit); + v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit); + v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit); + v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit); + v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit); + v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit); + v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit); + v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit); + v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit); + v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit); + v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit); + v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit); + v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit); + v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit); + v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit); + v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit); + v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit); + v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit); + v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + + // stage 3 + 
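+ // Throughout these stages, half_btf_0_avx2(&w, &n, &rnding, bit) computes + // round_shift(w * n, bit), and half_btf_avx2(&w0, &n0, &w1, &n1, &rnding, bit) + // computes round_shift(w0 * n0 + w1 * n1, bit), i.e. one output of a + // butterfly rotation; rnding holds the 1 << (bit - 1) rounding offset.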
u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit); + u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit); + u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit); + u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit); + u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit); + u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit); + u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit); + u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit); + u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit); + u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit); + u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit); + u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit); + u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit); + u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit); + + for (i = 32; i < 64; i += 4) { + addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + // stage 4 + v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit); + v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit); + v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); + v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit); + v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit); + v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit); + v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + + for (i = 16; i < 32; i += 4) { + addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + + // stage 5 + u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit); + u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit); + u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit); + u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit); + + for (i = 8; i < 16; i += 4) { + addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, 
+ &clamp_hi); + } + + for (i = 16; i < 32; i += 4) { + u[i + 0] = v[i + 0]; + u[i + 3] = v[i + 3]; + } + + u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); + u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); + u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); + u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); + u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); + + for (i = 32; i < 64; i += 8) { + addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit); + v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit); + + addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + + for (i = 8; i < 16; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + + for (i = 16; i < 32; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 8) { + v[i + 0] = u[i + 0]; + v[i + 1] = u[i + 1]; + v[i + 6] = u[i + 6]; + v[i + 7] = u[i + 7]; + } + + v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + + // stage 7 + 
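+ // stage 7 recombines the even half: a 4-point butterfly on v[0..3], a + // +/-cospi32 rotation of v[5]/v[6], and add/sub pairs over v[8..15]; the + // cospi16/cospi48 rotations update the 16-point half, and the j ^ 7 / + // j ^ 15 indexing below pairs mirrored taps in each group of 16 of v[32..63].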
addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + u[4] = v[4]; + u[7] = v[7]; + u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); + u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); + + addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + for (i = 16; i < 32; i += 8) { + u[i + 0] = v[i + 0]; + u[i + 1] = v[i + 1]; + u[i + 6] = v[i + 6]; + u[i + 7] = v[i + 7]; + } + + u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); + u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); + u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); + u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); + u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); + } + + v[8] = u[8]; + v[9] = u[9]; + v[14] = u[14]; + v[15] = u[15]; + + v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); + v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); + v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); + v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); + + for (i = 16; i < 20; ++i) { + addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 36; ++i) { + v[i] = u[i]; + v[i + 12] = u[i + 12]; + v[i + 16] = u[i + 16]; + v[i + 28] = u[i + 28]; + } + + v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); + v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); + v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); + v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); + v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); + v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); + v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); + v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); + v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); + v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); + v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); + v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit); + v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, 
bit); + + // stage 9 + for (i = 0; i < 8; ++i) { + addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); + } + + for (i = 16; i < 20; ++i) { + u[i] = v[i]; + u[i + 12] = v[i + 12]; + } + + u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); + u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); + u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); + u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); + u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); + u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); + u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); + + for (i = 32; i < 40; i++) { + addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); + } + + // stage 10 + for (i = 0; i < 16; i++) { + addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); + } + + for (i = 32; i < 40; i++) v[i] = u[i]; + + v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); + v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); + v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); + v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); + v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); + v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); + v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); + v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); + v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); + v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); + v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); + v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); + v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); + + for (i = 56; i < 64; i++) v[i] = u[i]; + + // stage 11 + for (i = 0; i < 32; i++) { + addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo, + &clamp_hi); + } + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + round_shift_8x8_avx2(out + 32, out_shift); + round_shift_8x8_avx2(out + 48, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64); + } + } +} +typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit, + int do_cols, int bd, int out_shift); + +static const transform_1d_avx2 + highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { + { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL }, + { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { + { idct16_low1_avx2, 
idct16_low8_avx2, idct16_avx2, NULL }, + { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + + { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m256i buf1[64 * 8]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_avx2 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_avx2 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: row transform + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + __m256i buf0[64]; + const int32_t *input_row = input + i * input_stride * 8; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + __m256i *buf0_cur = buf0 + j * 8; + load_buffer_32x32(input_row + j * 8, buf0_cur, input_stride, 8); + + transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_avx2( + buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + __m256i *_buf1 = buf1 + i * 8; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_8x8_flip_avx2( + &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]); + } + } + } + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2, + output + 16 * i, stride, ud_flip, + txfm_size_row, bd); + } + } else if (txfm_size_col == 8) { + highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row, + bd); + } +} + +void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case 
ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, eob, bd); + break; + case IDTX: + case H_DCT: + case H_ADST: + case H_FLIPADST: + case V_DCT: + case V_ADST: + case V_FLIPADST: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type, + tx_size, eob, bd); + break; + default: assert(0); break; + } +} +void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_4X8: + av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_8X4: + av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X4: + av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X4: + av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X16: + av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); + break; + default: + av1_highbd_inv_txfm2d_add_universe_avx2( + input, dest, stride, txfm_param->tx_type, txfm_param->tx_size, + txfm_param->eob, txfm_param->bd); + break; + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c b/libs/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c new file mode 100644 index 000000000..03eaef832 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c @@ -0,0 +1,5821 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include <assert.h> +#include <smmintrin.h> /* SSE4.1 */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/idct.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/av1_txfm_sse4.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" + +static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i clamped, mask; + + mask = _mm_cmpgt_epi16(u, max); + clamped = _mm_andnot_si128(mask, u); + mask = _mm_and_si128(mask, max); + clamped = _mm_or_si128(mask, clamped); + mask = _mm_cmpgt_epi16(clamped, zero); + clamped = _mm_and_si128(clamped, mask); + + return clamped; +} + +static INLINE void round_shift_4x4(__m128i *in, int shift) { + if (shift != 0) { + __m128i rnding = _mm_set1_epi32(1 << (shift - 1)); + in[0] = _mm_add_epi32(in[0], rnding); + in[1] = _mm_add_epi32(in[1], rnding); + in[2] = _mm_add_epi32(in[2], rnding); + in[3] = _mm_add_epi32(in[3], rnding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); + } +} + +static void round_shift_8x8(__m128i *in, int shift) { + round_shift_4x4(&in[0], shift); + round_shift_4x4(&in[4], shift); + round_shift_4x4(&in[8], shift); + round_shift_4x4(&in[12], shift); +} + +static void highbd_clamp_epi32_sse4_1(__m128i *in, __m128i *out, + const __m128i *clamp_lo, + const __m128i *clamp_hi, int size) { + __m128i a0, a1; + for (int i = 0; i < size; i += 4) { + a0 = _mm_max_epi32(in[i], *clamp_lo); + out[i] = _mm_min_epi32(a0, *clamp_hi); + + a1 = _mm_max_epi32(in[i + 1], *clamp_lo); + out[i + 1] = _mm_min_epi32(a1, *clamp_hi); + + a0 = _mm_max_epi32(in[i + 2], *clamp_lo); + out[i + 2] = _mm_min_epi32(a0, *clamp_hi); + + a1 = _mm_max_epi32(in[i + 3], *clamp_lo); + out[i + 3] = _mm_min_epi32(a1, *clamp_hi); + } +} + +static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred, + __m128i res0, __m128i res1, + const int bd) { + __m128i x0 = _mm_cvtepi16_epi32(pred); + __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8)); + __m128i min_clip_val = _mm_setzero_si128(); + __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1); + x0 = _mm_add_epi32(res0, x0); + x1 = _mm_add_epi32(res1, x1); + x0 = _mm_max_epi32(x0, min_clip_val); + x0 = _mm_min_epi32(x0, max_clip_val); + x1 = _mm_max_epi32(x1, min_clip_val); + x1 = _mm_min_epi32(x1, max_clip_val); + x0 = _mm_packus_epi32(x0, x1); + return x0; +} + +static INLINE __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred, + __m128i res0, const int bd) { + __m128i x0 = _mm_cvtepi16_epi32(pred); + + x0 = _mm_add_epi32(res0, x0); + x0 = _mm_packus_epi32(x0, x0); + x0 = highbd_clamp_epi16(x0, bd); + return x0; +} + +static INLINE void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride)); + __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd); + + _mm_storel_epi64((__m128i *)(output + i * stride), u); + } +} + +static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? 
(height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd); + + _mm_storeu_si128((__m128i *)(output + i * stride), u); + } +} + +static INLINE void load_buffer_32bit_input(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride)); + } +} + +static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(coeff + 0)); + in[1] = _mm_load_si128((const __m128i *)(coeff + 4)); + in[2] = _mm_load_si128((const __m128i *)(coeff + 8)); + in[3] = _mm_load_si128((const __m128i *)(coeff + 12)); +} + +static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0, + __m128i *out1, const __m128i *clamp_lo, + const __m128i *clamp_hi) { + __m128i a0 = _mm_add_epi32(in0, in1); + __m128i a1 = _mm_sub_epi32(in0, in1); + + a0 = _mm_max_epi32(a0, *clamp_lo); + a0 = _mm_min_epi32(a0, *clamp_hi); + a1 = _mm_max_epi32(a1, *clamp_lo); + a1 = _mm_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static void shift_and_clamp_sse4_1(__m128i *in0, __m128i *in1, + const __m128i *clamp_lo, + const __m128i *clamp_hi, int shift) { + __m128i offset = _mm_set1_epi32((1 << shift) >> 1); + __m128i in0_w_offset = _mm_add_epi32(*in0, offset); + __m128i in1_w_offset = _mm_add_epi32(*in1, offset); + + in0_w_offset = _mm_sra_epi32(in0_w_offset, _mm_cvtsi32_si128(shift)); + in1_w_offset = _mm_sra_epi32(in1_w_offset, _mm_cvtsi32_si128(shift)); + + in0_w_offset = _mm_max_epi32(in0_w_offset, *clamp_lo); + in0_w_offset = _mm_min_epi32(in0_w_offset, *clamp_hi); + in1_w_offset = _mm_max_epi32(in1_w_offset, *clamp_lo); + in1_w_offset = _mm_min_epi32(in1_w_offset, *clamp_hi); + + *in0 = in0_w_offset; + *in1 = in1_w_offset; +} + +static INLINE void idct32_stage4_sse4_1( + __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56, + const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40, + const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit); + bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit); + bf1[17] = temp1; + + temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit); + bf1[29] = + half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit); + bf1[18] = temp2; + + temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit); + bf1[21] = temp1; + + temp2 = + half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit); + bf1[25] = + half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit); + bf1[22] = temp2; +} + +static INLINE void idct32_stage5_sse4_1( + __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48, + const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo, + const __m128i *clamp_hi, const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit); + bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit); + bf1[9] = temp1; + + temp2 = + half_btf_sse4_1(cospim48, &bf1[10], cospim16, 
&bf1[13], rounding, bit); + bf1[13] = + half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit); + bf1[10] = temp2; + + addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage6_sse4_1( + __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32, + const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, + const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[5] = temp1; + + addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit); + bf1[29] = + half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit); + bf1[18] = temp1; + temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit); + bf1[28] = + half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit); + bf1[19] = temp2; + temp1 = + half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit); + bf1[27] = + half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = + half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit); + bf1[21] = temp2; +} + +static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[13] = + half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[10] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[12] = + half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[11] = temp2; + + addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); + 
addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[27] = + half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[21] = temp2; + temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[25] = + half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[22] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[24] = + half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[23] = temp2; +} + +static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out, + const int do_cols, const int bd, + const int out_shift, + const __m128i *clamp_lo, + const __m128i *clamp_hi) { + addsub_sse4_1(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + for (int i = 0; i < 32; i += 8) { + round_shift_4x4(out + i, out_shift); + round_shift_4x4(out + i + 4, out_shift); + } + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); 
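+ // The row pass (!do_cols) computes in a bd + 8 bit intermediate range but + // hands results to the column pass in the max(16, bd + 6) bit range, hence + // the final round-shift followed by this narrower clamp.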
+ } +} + +static void neg_shift_sse4_1(const __m128i in0, const __m128i in1, + __m128i *out0, __m128i *out1, + const __m128i *clamp_lo, const __m128i *clamp_hi, + int shift) { + __m128i offset = _mm_set1_epi32((1 << shift) >> 1); + __m128i a0 = _mm_add_epi32(offset, in0); + __m128i a1 = _mm_sub_epi32(offset, in1); + + a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + + a0 = _mm_max_epi32(a0, *clamp_lo); + a0 = _mm_min_epi32(a0, *clamp_hi); + a1 = _mm_max_epi32(a1, *clamp_lo); + a1 = _mm_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3, x, y; + + // Stage 0 + // Stage 1 + // Stage 2 + v0 = _mm_unpacklo_epi32(in[0], in[1]); + v1 = _mm_unpackhi_epi32(in[0], in[1]); + v2 = _mm_unpacklo_epi32(in[2], in[3]); + v3 = _mm_unpackhi_epi32(in[2], in[3]); + + u0 = _mm_unpacklo_epi64(v0, v2); + u1 = _mm_unpackhi_epi64(v0, v2); + u2 = _mm_unpacklo_epi64(v1, v3); + u3 = _mm_unpackhi_epi64(v1, v3); + + x = _mm_mullo_epi32(u0, cospi32); + y = _mm_mullo_epi32(u2, cospi32); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + v1 = _mm_sub_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u1, cospi48); + y = _mm_mullo_epi32(u3, cospim16); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u1, cospi16); + y = _mm_mullo_epi32(u3, cospi48); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + // Stage 3 + addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi); + + if (!do_cols) { + log_range = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + shift_and_clamp_sse4_1(out + 0, out + 3, &clamp_lo, &clamp_hi, out_shift); + shift_and_clamp_sse4_1(out + 1, out + 2, &clamp_lo, &clamp_hi, out_shift); + } +} + +static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *sinpi = sinpi_arr(bit); + const __m128i zero = _mm_set1_epi32(0); + __m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1)); + rnding = _mm_unpacklo_epi32(rnding, zero); + const __m128i mul = _mm_set1_epi32(1 << 4); + const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); + const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]); + const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]); + const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]); + __m128i t; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i x0, x1, x2, x3; + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + __m128i u0_low, u1_low, u2_low, u3_low; + __m128i u0_high, u1_high, u2_high, u3_high; + + v0 = _mm_unpacklo_epi32(in[0], in[1]); + v1 = _mm_unpackhi_epi32(in[0], in[1]); + v2 = 
_mm_unpacklo_epi32(in[2], in[3]); + v3 = _mm_unpackhi_epi32(in[2], in[3]); + + x0 = _mm_unpacklo_epi64(v0, v2); + x1 = _mm_unpackhi_epi64(v0, v2); + x2 = _mm_unpacklo_epi64(v1, v3); + x3 = _mm_unpackhi_epi64(v1, v3); + + s0 = _mm_mullo_epi32(x0, sinpi1); + s1 = _mm_mullo_epi32(x0, sinpi2); + s2 = _mm_mullo_epi32(x1, sinpi3); + s3 = _mm_mullo_epi32(x2, sinpi4); + s4 = _mm_mullo_epi32(x2, sinpi1); + s5 = _mm_mullo_epi32(x3, sinpi2); + s6 = _mm_mullo_epi32(x3, sinpi4); + t = _mm_sub_epi32(x0, x2); + s7 = _mm_add_epi32(t, x3); + + t = _mm_add_epi32(s0, s3); + s0 = _mm_add_epi32(t, s5); + t = _mm_sub_epi32(s1, s4); + s1 = _mm_sub_epi32(t, s6); + s3 = s2; + s2 = _mm_mullo_epi32(s7, sinpi3); + + u0 = _mm_add_epi32(s0, s3); + u1 = _mm_add_epi32(s1, s3); + u2 = s2; + t = _mm_add_epi32(s0, s1); + u3 = _mm_sub_epi32(t, s3); + + // u0 + u0_low = _mm_mul_epi32(u0, mul); + u0_low = _mm_add_epi64(u0_low, rnding); + + u0 = _mm_srli_si128(u0, 4); + u0_high = _mm_mul_epi32(u0, mul); + u0_high = _mm_add_epi64(u0_high, rnding); + + u0_low = _mm_srli_si128(u0_low, 2); + u0_high = _mm_srli_si128(u0_high, 2); + + u0 = _mm_unpacklo_epi32(u0_low, u0_high); + u0_high = _mm_unpackhi_epi32(u0_low, u0_high); + u0 = _mm_unpacklo_epi64(u0, u0_high); + + // u1 + u1_low = _mm_mul_epi32(u1, mul); + u1_low = _mm_add_epi64(u1_low, rnding); + + u1 = _mm_srli_si128(u1, 4); + u1_high = _mm_mul_epi32(u1, mul); + u1_high = _mm_add_epi64(u1_high, rnding); + + u1_low = _mm_srli_si128(u1_low, 2); + u1_high = _mm_srli_si128(u1_high, 2); + + u1 = _mm_unpacklo_epi32(u1_low, u1_high); + u1_high = _mm_unpackhi_epi32(u1_low, u1_high); + u1 = _mm_unpacklo_epi64(u1, u1_high); + + // u2 + u2_low = _mm_mul_epi32(u2, mul); + u2_low = _mm_add_epi64(u2_low, rnding); + + u2 = _mm_srli_si128(u2, 4); + u2_high = _mm_mul_epi32(u2, mul); + u2_high = _mm_add_epi64(u2_high, rnding); + + u2_low = _mm_srli_si128(u2_low, 2); + u2_high = _mm_srli_si128(u2_high, 2); + + u2 = _mm_unpacklo_epi32(u2_low, u2_high); + u2_high = _mm_unpackhi_epi32(u2_low, u2_high); + u2 = _mm_unpacklo_epi64(u2, u2_high); + + // u3 + u3_low = _mm_mul_epi32(u3, mul); + u3_low = _mm_add_epi64(u3_low, rnding); + + u3 = _mm_srli_si128(u3, 4); + u3_high = _mm_mul_epi32(u3, mul); + u3_high = _mm_add_epi64(u3_high, rnding); + + u3_low = _mm_srli_si128(u3_low, 2); + u3_high = _mm_srli_si128(u3_high, 2); + + u3 = _mm_unpacklo_epi32(u3_low, u3_high); + u3_high = _mm_unpackhi_epi32(u3_low, u3_high); + u3 = _mm_unpacklo_epi64(u3, u3_high); + + out[0] = u0; + out[1] = u1; + out[2] = u2; + out[3] = u3; + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4); + } +} + +static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, + int fliplr, int flipud, int shift, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + + round_shift_4x4(in, shift); + + v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride)); + v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride)); + v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride)); + v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride)); + + v0 = _mm_unpacklo_epi16(v0, zero); + v1 = _mm_unpacklo_epi16(v1, zero); + v2 = _mm_unpacklo_epi16(v2, zero); + v3 = _mm_unpacklo_epi16(v3, zero); + + if (fliplr) { + in[0] = 
_mm_shuffle_epi32(in[0], 0x1B); + in[1] = _mm_shuffle_epi32(in[1], 0x1B); + in[2] = _mm_shuffle_epi32(in[2], 0x1B); + in[3] = _mm_shuffle_epi32(in[3], 0x1B); + } + + if (flipud) { + u0 = _mm_add_epi32(in[3], v0); + u1 = _mm_add_epi32(in[2], v1); + u2 = _mm_add_epi32(in[1], v2); + u3 = _mm_add_epi32(in[0], v3); + } else { + u0 = _mm_add_epi32(in[0], v0); + u1 = _mm_add_epi32(in[1], v1); + u2 = _mm_add_epi32(in[2], v2); + u3 = _mm_add_epi32(in[3], v3); + } + + v0 = _mm_packus_epi32(u0, u1); + v2 = _mm_packus_epi32(u2, u3); + + u0 = highbd_clamp_epi16(v0, bd); + u2 = highbd_clamp_epi16(v2, bd); + + v0 = _mm_unpacklo_epi64(u0, u0); + v1 = _mm_unpackhi_epi64(u0, u0); + v2 = _mm_unpacklo_epi64(u2, u2); + v3 = _mm_unpackhi_epi64(u2, u2); + + _mm_storel_epi64((__m128i *)(output + 0 * stride), v0); + _mm_storel_epi64((__m128i *)(output + 1 * stride), v1); + _mm_storel_epi64((__m128i *)(output + 2 * stride), v2); + _mm_storel_epi64((__m128i *)(output + 3 * stride), v3); +} + +static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + __m128i v[4]; + __m128i zero = _mm_set1_epi32(0); + __m128i fact = _mm_set1_epi32(NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a0_low, a1_low; + __m128i a0_high, a1_high; + + offset = _mm_unpacklo_epi32(offset, zero); + + for (int i = 0; i < 4; i++) { + a0_low = _mm_mul_epi32(in[i], fact); + a0_low = _mm_add_epi32(a0_low, offset); + a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits); + + a0_high = _mm_srli_si128(in[i], 4); + a0_high = _mm_mul_epi32(a0_high, fact); + a0_high = _mm_add_epi32(a0_high, offset); + a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits); + + a1_low = _mm_unpacklo_epi32(a0_low, a0_high); + a1_high = _mm_unpackhi_epi32(a0_low, a0_high); + out[i] = _mm_unpacklo_epi64(a1_low, a1_high); + } + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4); + } + + // Transpose for 4x4 + v[0] = _mm_unpacklo_epi32(out[0], out[1]); + v[1] = _mm_unpackhi_epi32(out[0], out[1]); + v[2] = _mm_unpacklo_epi32(out[2], out[3]); + v[3] = _mm_unpackhi_epi32(out[2], out[3]); + + out[0] = _mm_unpacklo_epi64(v[0], v[2]); + out[1] = _mm_unpackhi_epi64(v[0], v[2]); + out[2] = _mm_unpacklo_epi64(v[1], v[3]); + out[3] = _mm_unpackhi_epi64(v[1], v[3]); +} +void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[4]; + const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4]; + const int txw_idx = get_txw_idx(TX_4X4); + const int txh_idx = get_txh_idx(TX_4X4); + + switch (tx_type) { + case DCT_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case DCT_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 
1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case FLIPADST_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case DCT_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd); + break; + case ADST_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case IDTX: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + 0); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, + 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_DCT: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + 0); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, + 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_ADST: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, + 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_FLIPADST: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case H_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, + 0); 
+ write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + default: assert(0); + } +} + +// 8x8 +static void load_buffer_8x8(const int32_t *coeff, __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(coeff + 0)); + in[1] = _mm_load_si128((const __m128i *)(coeff + 4)); + in[2] = _mm_load_si128((const __m128i *)(coeff + 8)); + in[3] = _mm_load_si128((const __m128i *)(coeff + 12)); + in[4] = _mm_load_si128((const __m128i *)(coeff + 16)); + in[5] = _mm_load_si128((const __m128i *)(coeff + 20)); + in[6] = _mm_load_si128((const __m128i *)(coeff + 24)); + in[7] = _mm_load_si128((const __m128i *)(coeff + 28)); + in[8] = _mm_load_si128((const __m128i *)(coeff + 32)); + in[9] = _mm_load_si128((const __m128i *)(coeff + 36)); + in[10] = _mm_load_si128((const __m128i *)(coeff + 40)); + in[11] = _mm_load_si128((const __m128i *)(coeff + 44)); + in[12] = _mm_load_si128((const __m128i *)(coeff + 48)); + in[13] = _mm_load_si128((const __m128i *)(coeff + 52)); + in[14] = _mm_load_si128((const __m128i *)(coeff + 56)); + in[15] = _mm_load_si128((const __m128i *)(coeff + 60)); +} + +static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + int col; + + // Note: + // Even column: 0, 2, ..., 14 + // Odd column: 1, 3, ..., 15 + // one even column plus one odd column constructs one row (8 coeffs) + // total we have 8 rows (8x8). 
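+ // in[] holds the 8x8 block as 16 consecutive groups of four coefficients + // (see load_buffer_8x8), so row r sits in in[2 * r] (columns 0-3) and + // in[2 * r + 1] (columns 4-7); hence the in[k * 2 + col] indexing below.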
+ for (col = 0; col < 2; ++col) { + // stage 0 + // stage 1 + // stage 2 + u0 = in[0 * 2 + col]; + u1 = in[4 * 2 + col]; + u2 = in[2 * 2 + col]; + u3 = in[6 * 2 + col]; + + x = _mm_mullo_epi32(in[1 * 2 + col], cospi56); + y = _mm_mullo_epi32(in[7 * 2 + col], cospim8); + u4 = _mm_add_epi32(x, y); + u4 = _mm_add_epi32(u4, rnding); + u4 = _mm_srai_epi32(u4, bit); + + x = _mm_mullo_epi32(in[1 * 2 + col], cospi8); + y = _mm_mullo_epi32(in[7 * 2 + col], cospi56); + u7 = _mm_add_epi32(x, y); + u7 = _mm_add_epi32(u7, rnding); + u7 = _mm_srai_epi32(u7, bit); + + x = _mm_mullo_epi32(in[5 * 2 + col], cospi24); + y = _mm_mullo_epi32(in[3 * 2 + col], cospim40); + u5 = _mm_add_epi32(x, y); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + x = _mm_mullo_epi32(in[5 * 2 + col], cospi40); + y = _mm_mullo_epi32(in[3 * 2 + col], cospi24); + u6 = _mm_add_epi32(x, y); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + // stage 3 + x = _mm_mullo_epi32(u0, cospi32); + y = _mm_mullo_epi32(u1, cospi32); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + v1 = _mm_sub_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi48); + y = _mm_mullo_epi32(u3, cospim16); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi16); + y = _mm_mullo_epi32(u3, cospi48); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; + u7 = v7; + + x = _mm_mullo_epi32(v5, cospi32); + y = _mm_mullo_epi32(v6, cospi32); + u6 = _mm_add_epi32(y, x); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + u5 = _mm_sub_epi32(y, x); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + // stage 5 + addsub_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo, + &clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } +} + +static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = 
_mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[8], v[8], x; + + // Even 8 points: 0, 2, ..., 14 + // stage 0 + // stage 1 + // stage 2 + // (1) + u[0] = _mm_mullo_epi32(in[14], cospi4); + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[14], cospi60); + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[10], cospi20); + x = _mm_mullo_epi32(in[4], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[10], cospi44); + x = _mm_mullo_epi32(in[4], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[6], cospi36); + x = _mm_mullo_epi32(in[8], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[6], cospi28); + x = _mm_mullo_epi32(in[8], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[2], cospi52); + x = _mm_mullo_epi32(in[12], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[2], cospi12); + x = _mm_mullo_epi32(in[12], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 3 + addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); 
+ u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[2] = _mm_sub_epi32(kZero, u[4]); + out[4] = u[6]; + out[6] = _mm_sub_epi32(kZero, u[2]); + out[8] = u[3]; + out[10] = _mm_sub_epi32(kZero, u[7]); + out[12] = u[5]; + out[14] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + + // Odd 8 points: 1, 3, ..., 15 + // stage 0 + // stage 1 + // stage 2 + // (1) + u[0] = _mm_mullo_epi32(in[15], cospi4); + x = _mm_mullo_epi32(in[1], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[15], cospi60); + x = _mm_mullo_epi32(in[1], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[11], cospi20); + x = _mm_mullo_epi32(in[5], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[11], cospi44); + x = _mm_mullo_epi32(in[5], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[7], cospi36); + x = _mm_mullo_epi32(in[9], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[7], cospi28); + x = _mm_mullo_epi32(in[9], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[3], cospi52); + x = _mm_mullo_epi32(in[13], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[3], cospi12); + x = _mm_mullo_epi32(in[13], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 3 + addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = 
_mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[1] = u[0]; + out[3] = _mm_sub_epi32(kZero, u[4]); + out[5] = u[6]; + out[7] = _mm_sub_epi32(kZero, u[2]); + out[9] = u[3]; + out[11] = _mm_sub_epi32(kZero, u[7]); + out[13] = u[5]; + out[15] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} + +static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + out[0] = _mm_add_epi32(in[0], in[0]); + out[1] = _mm_add_epi32(in[1], in[1]); + out[2] = _mm_add_epi32(in[2], in[2]); + out[3] = _mm_add_epi32(in[3], in[3]); + out[4] = _mm_add_epi32(in[4], in[4]); + out[5] = _mm_add_epi32(in[5], in[5]); + out[6] = _mm_add_epi32(in[6], in[6]); + out[7] = _mm_add_epi32(in[7], in[7]); + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + round_shift_4x4(out + 4, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 8); + } +} + +static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi, + int fliplr, int bd) { + __m128i x0, x1; + const __m128i zero = _mm_setzero_si128(); + + x0 = _mm_unpacklo_epi16(pred, zero); + x1 = _mm_unpackhi_epi16(pred, zero); + + if (fliplr) { + res_lo = _mm_shuffle_epi32(res_lo, 0x1B); + res_hi = _mm_shuffle_epi32(res_hi, 0x1B); + x0 = _mm_add_epi32(res_hi, x0); + x1 = _mm_add_epi32(res_lo, x1); + + } else { + x0 = 
_mm_add_epi32(res_lo, x0); + x1 = _mm_add_epi32(res_hi, x1); + } + + x0 = _mm_packus_epi32(x0, x1); + return highbd_clamp_epi16(x0, bd); +} + +static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride, + int fliplr, int flipud, int shift, int bd) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + + round_shift_8x8(in, shift); + + v0 = _mm_load_si128((__m128i const *)(output + 0 * stride)); + v1 = _mm_load_si128((__m128i const *)(output + 1 * stride)); + v2 = _mm_load_si128((__m128i const *)(output + 2 * stride)); + v3 = _mm_load_si128((__m128i const *)(output + 3 * stride)); + v4 = _mm_load_si128((__m128i const *)(output + 4 * stride)); + v5 = _mm_load_si128((__m128i const *)(output + 5 * stride)); + v6 = _mm_load_si128((__m128i const *)(output + 6 * stride)); + v7 = _mm_load_si128((__m128i const *)(output + 7 * stride)); + + if (flipud) { + u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd); + u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd); + u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd); + u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd); + u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd); + u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd); + u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd); + u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd); + } else { + u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd); + u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd); + u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd); + u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd); + u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd); + u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd); + u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd); + u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd); + } + + _mm_store_si128((__m128i *)(output + 0 * stride), u0); + _mm_store_si128((__m128i *)(output + 1 * stride), u1); + _mm_store_si128((__m128i *)(output + 2 * stride), u2); + _mm_store_si128((__m128i *)(output + 3 * stride), u3); + _mm_store_si128((__m128i *)(output + 4 * stride), u4); + _mm_store_si128((__m128i *)(output + 5 * stride), u5); + _mm_store_si128((__m128i *)(output + 6 * stride), u6); + _mm_store_si128((__m128i *)(output + 7 * stride), u7); +} + +void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[16], out[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x8(input, in); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); + break; + case DCT_ADST: + load_buffer_8x8(input, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_DCT: + load_buffer_8x8(input, in); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_ADST: + 
load_buffer_8x8(input, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); + break; + case FLIPADST_DCT: + load_buffer_8x8(input, in); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd); + break; + case DCT_FLIPADST: + load_buffer_8x8(input, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd); + break; + case ADST_FLIPADST: + load_buffer_8x8(input, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8(input, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd); + break; + case FLIPADST_ADST: + load_buffer_8x8(input, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd); + break; + default: assert(0); + } +} + +static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i x; + + // stage 0 + // stage 1 + // stage 2 + // stage 3 + x = _mm_mullo_epi32(in[0], cospi32); + x = _mm_add_epi32(x, rnding); + x = _mm_srai_epi32(x, bit); + + // stage 4 + // stage 5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + x = _mm_add_epi32(x, offset); + x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + + x = _mm_max_epi32(x, clamp_lo); + x = _mm_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; +} + +static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + + // stage 0 + // stage 1 + // stage 2 + u0 = in[0]; + u1 = in[4]; + u2 = in[2]; + u3 = in[6]; + + x = _mm_mullo_epi32(in[1], cospi56); + y = _mm_mullo_epi32(in[7], cospim8); + u4 = _mm_add_epi32(x, y); + u4 = _mm_add_epi32(u4, rnding); + u4 = _mm_srai_epi32(u4, bit); + + x = _mm_mullo_epi32(in[1], cospi8); + y = _mm_mullo_epi32(in[7], cospi56); + u7 = _mm_add_epi32(x, y); + u7 = _mm_add_epi32(u7, rnding); + u7 = _mm_srai_epi32(u7, bit); + + x = _mm_mullo_epi32(in[5], cospi24); + y = _mm_mullo_epi32(in[3], cospim40); + u5 = _mm_add_epi32(x, y); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + x = _mm_mullo_epi32(in[5], cospi40); + y = _mm_mullo_epi32(in[3], cospi24); + u6 = _mm_add_epi32(x, y); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + // stage 3 + x = _mm_mullo_epi32(u0, cospi32); + y = _mm_mullo_epi32(u1, cospi32); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + v1 = _mm_sub_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi48); + y = _mm_mullo_epi32(u3, cospim16); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi16); + y = _mm_mullo_epi32(u3, cospi48); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; 
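+ // u4 and u7 pass through this stage unchanged; the x/y products below
+ // realize the stage-4 rotation by cospi[32], i.e. per 32-bit lane
+ //   u6 = ((v5 + v6) * cospi[32] + rnding) >> bit
+ //   u5 = ((v6 - v5) * cospi[32] + rnding) >> bit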
+ u7 = v7; + + x = _mm_mullo_epi32(v5, cospi32); + y = _mm_mullo_epi32(v6, cospi32); + u6 = _mm_add_epi32(y, x); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + u5 = _mm_sub_epi32(y, x); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + // stage 5 + addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi); + addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi); + addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_4x4(out, out_shift); + round_shift_4x4(out + 4, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8); + } +} + +static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + __m128i u[8], x; + + // stage 0 + // stage 1 + // stage 2 + + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(kZero, x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // stage 3 + // stage 4 + __m128i temp1, temp2; + temp1 = _mm_mullo_epi32(u[0], cospi16); + x = _mm_mullo_epi32(u[1], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + u[4] = temp1; + + temp2 = _mm_mullo_epi32(u[0], cospi48); + x = _mm_mullo_epi32(u[1], cospi16); + u[5] = _mm_sub_epi32(temp2, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // stage 5 + // stage 6 + temp1 = _mm_mullo_epi32(u[0], cospi32); + x = _mm_mullo_epi32(u[1], cospi32); + u[2] = _mm_add_epi32(temp1, x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(temp1, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + temp1 = _mm_mullo_epi32(u[4], cospi32); + x = _mm_mullo_epi32(u[5], cospi32); + u[6] = _mm_add_epi32(temp1, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(temp1, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, 
&clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} + +static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[8], v[8], x; + + // stage 0 + // stage 1 + // stage 2 + + u[0] = _mm_mullo_epi32(in[7], cospi4); + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[7], cospi60); + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[5], cospi20); + x = _mm_mullo_epi32(in[2], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[5], cospi44); + x = _mm_mullo_epi32(in[2], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[3], cospi36); + x = _mm_mullo_epi32(in[4], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[3], cospi28); + x = _mm_mullo_epi32(in[4], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[1], cospi52); + x = _mm_mullo_epi32(in[6], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[1], cospi12); + x = _mm_mullo_epi32(in[6], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 3 + addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = 
_mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} + +static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + in[0] = _mm_mullo_epi32(in[0], cospi32); + in[0] = _mm_add_epi32(in[0], rnding); + in[0] = _mm_srai_epi32(in[0], bit); + + // stage 5 + // stage 6 + // stage 7 + if (!do_cols) { + log_range = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + if (out_shift != 0) { + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + in[0] = _mm_add_epi32(in[0], offset); + in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift)); + } + } + + in[0] = _mm_max_epi32(in[0], clamp_lo); + in[0] = _mm_min_epi32(in[0], clamp_hi); + out[0] = in[0]; + out[1] = in[0]; + out[2] = in[0]; + out[3] = in[0]; + out[4] = in[0]; + out[5] = in[0]; + out[6] = in[0]; + out[7] = in[0]; + out[8] = in[0]; + out[9] = in[0]; + out[10] = in[0]; + out[11] = in[0]; + out[12] = in[0]; + out[13] = in[0]; + out[14] = in[0]; + out[15] = in[0]; +} + +static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[16], x, y; + // stage 0 + // stage 1 + u[0] = in[0]; + u[2] = in[4]; + u[4] = in[2]; + u[6] = in[6]; + u[8] = in[1]; + u[10] = in[5]; + u[12] = in[3]; + u[14] = in[7]; + + // stage 2 + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + + u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); + u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); + + u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); + u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); + + u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + + // stage 3 + u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); + u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit); + u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit); + + addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm_mullo_epi32(u[0], cospi32); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + u[1] = u[0]; + + u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); + + addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); + + x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = x; + y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = y; + + // stage 5 + addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + x = _mm_mullo_epi32(u[5], cospi32); + y = _mm_mullo_epi32(u[6], cospi32); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[13], cospi32); + u[10] = _mm_sub_epi32(y, x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[13] = _mm_add_epi32(x, y); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(u[12], cospi32); + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + 
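+ // u[12] reuses the x and y products computed for u[11] just above: the
+ // pair is the usual cospi[32] rotation, (y - x) for u[11] and (x + y)
+ // for u[12], each rounded with rnding and arithmetically shifted by bit.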
u[12] = _mm_add_epi32(x, y); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + // stage 7 + addsub_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } +} + +static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + __m128i v[16], x, y, temp1, temp2; + // stage 0 + // stage 1 + // stage 2 + x = _mm_mullo_epi32(in[0], cospi62); + v[0] = _mm_add_epi32(x, rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + x = _mm_mullo_epi32(in[0], cospi2); + v[1] = _mm_sub_epi32(zero, x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + // stage 3 + v[8] = v[0]; + v[9] = v[1]; + + // stage 4 + temp1 = _mm_mullo_epi32(v[8], cospi8); + x = _mm_mullo_epi32(v[9], cospi56); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[8], cospi56); + x = _mm_mullo_epi32(v[9], cospi8); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[8] = temp1; + v[9] = temp2; + + // stage 5 + v[4] = v[0]; + v[5] = v[1]; + v[12] = v[8]; + v[13] = v[9]; + + // stage 6 + temp1 = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[4] = temp1; + v[5] = temp2; + + temp1 = _mm_mullo_epi32(v[12], cospi16); + x = _mm_mullo_epi32(v[13], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[12], cospi48); + x = _mm_mullo_epi32(v[13], cospi16); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[12] = temp1; + v[13] = temp2; + + // stage 7 + v[2] = v[0]; + v[3] = v[1]; + v[6] = v[4]; + v[7] = v[5]; + v[10] = v[8]; + v[11] = v[9]; + v[14] = v[12]; + v[15] = v[13]; + + // stage 8 + 
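+ // Stage 8 rotates each pair (v[2k], v[2k + 1]), k in {1, 3, 5, 7}, by
+ // cospi[32]: writing a = v[2k] and b = v[2k + 1], per 32-bit lane the
+ // even element becomes
+ //   ((a + b) * cospi[32] + rnding) >> bit
+ // and the odd element
+ //   ((a - b) * cospi[32] + rnding) >> bit.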
y = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + v[2] = _mm_add_epi32(y, x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(y, x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + y = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + v[6] = _mm_add_epi32(y, x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(y, x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + y = _mm_mullo_epi32(v[10], cospi32); + x = _mm_mullo_epi32(v[11], cospi32); + v[10] = _mm_add_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + y = _mm_mullo_epi32(v[14], cospi32); + x = _mm_mullo_epi32(v[15], cospi32); + v[14] = _mm_add_epi32(y, x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(y, x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm_sub_epi32(zero, v[8]); + out[2] = v[12]; + out[3] = _mm_sub_epi32(zero, v[4]); + out[4] = v[6]; + out[5] = _mm_sub_epi32(zero, v[14]); + out[6] = v[10]; + out[7] = _mm_sub_epi32(zero, v[2]); + out[8] = v[3]; + out[9] = _mm_sub_epi32(zero, v[11]); + out[10] = v[15]; + out[11] = _mm_sub_epi32(zero, v[7]); + out[12] = v[5]; + out[13] = _mm_sub_epi32(zero, v[13]); + out[14] = v[9]; + out[15] = _mm_sub_epi32(zero, v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} + +static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi58 
= _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i zero = _mm_setzero_si128(); + __m128i u[16], x, y; + + // stage 0 + // stage 1 + // stage 2 + x = _mm_mullo_epi32(in[0], cospi62); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + x = _mm_mullo_epi32(in[0], cospi2); + u[1] = _mm_sub_epi32(zero, x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + x = _mm_mullo_epi32(in[2], cospi54); + u[2] = _mm_add_epi32(x, rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + x = _mm_mullo_epi32(in[2], cospi10); + u[3] = _mm_sub_epi32(zero, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + x = _mm_mullo_epi32(in[4], cospi46); + u[4] = _mm_add_epi32(x, rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + x = _mm_mullo_epi32(in[4], cospi18); + u[5] = _mm_sub_epi32(zero, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + x = _mm_mullo_epi32(in[6], cospi38); + u[6] = _mm_add_epi32(x, rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + x = _mm_mullo_epi32(in[6], cospi26); + u[7] = _mm_sub_epi32(zero, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + u[8] = _mm_mullo_epi32(in[7], cospi34); + u[8] = _mm_add_epi32(u[8], rnding); + u[8] = _mm_srai_epi32(u[8], bit); + + u[9] = _mm_mullo_epi32(in[7], cospi30); + u[9] = _mm_add_epi32(u[9], rnding); + u[9] = _mm_srai_epi32(u[9], bit); + + u[10] = _mm_mullo_epi32(in[5], cospi42); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[11] = _mm_mullo_epi32(in[5], cospi22); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + u[12] = _mm_mullo_epi32(in[3], cospi50); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + u[13] = _mm_mullo_epi32(in[3], cospi14); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + u[14] = _mm_mullo_epi32(in[1], cospi58); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + u[15] = _mm_mullo_epi32(in[1], cospi6); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 3 + addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + y = 
_mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi56); + u[8] = _mm_mullo_epi32(u[8], cospi8); + u[8] = _mm_add_epi32(u[8], x); + u[8] = _mm_add_epi32(u[8], rnding); + u[8] = _mm_srai_epi32(u[8], bit); + + x = _mm_mullo_epi32(u[9], cospi8); + u[9] = _mm_sub_epi32(y, x); + u[9] = _mm_add_epi32(u[9], rnding); + u[9] = _mm_srai_epi32(u[9], bit); + + x = _mm_mullo_epi32(u[11], cospi24); + y = _mm_mullo_epi32(u[10], cospi24); + u[10] = _mm_mullo_epi32(u[10], cospi40); + u[10] = _mm_add_epi32(u[10], x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + x = _mm_mullo_epi32(u[11], cospi40); + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + x = _mm_mullo_epi32(u[13], cospi8); + y = _mm_mullo_epi32(u[12], cospi8); + u[12] = _mm_mullo_epi32(u[12], cospim56); + u[12] = _mm_add_epi32(u[12], x); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + x = _mm_mullo_epi32(u[13], cospim56); + u[13] = _mm_sub_epi32(y, x); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[15], cospi40); + y = _mm_mullo_epi32(u[14], cospi40); + u[14] = _mm_mullo_epi32(u[14], cospim24); + u[14] = _mm_add_epi32(u[14], x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + x = _mm_mullo_epi32(u[15], cospim24); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 5 + addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + x = _mm_mullo_epi32(u[5], cospi48); + y = _mm_mullo_epi32(u[4], cospi48); + u[4] = _mm_mullo_epi32(u[4], cospi16); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + x = _mm_mullo_epi32(u[5], cospi16); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + x = _mm_mullo_epi32(u[7], cospi16); + y = _mm_mullo_epi32(u[6], cospi16); + u[6] = _mm_mullo_epi32(u[6], cospim48); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + x = _mm_mullo_epi32(u[7], cospim48); + u[7] = _mm_sub_epi32(y, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + x = _mm_mullo_epi32(u[13], cospi48); + y = _mm_mullo_epi32(u[12], cospi48); + u[12] = _mm_mullo_epi32(u[12], cospi16); + u[12] = _mm_add_epi32(u[12], x); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + x = _mm_mullo_epi32(u[13], cospi16); + u[13] = _mm_sub_epi32(y, x); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[15], cospi16); + y = _mm_mullo_epi32(u[14], cospi16); + u[14] = _mm_mullo_epi32(u[14], cospim48); + u[14] = _mm_add_epi32(u[14], x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + x = _mm_mullo_epi32(u[15], cospim48); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], 
rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 7 + addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + u[2] = _mm_add_epi32(y, x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(y, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + y = _mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(y, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + u[10] = _mm_add_epi32(y, x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + u[14] = _mm_add_epi32(y, x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 9 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(zero, u[8]); + out[2] = u[12]; + out[3] = _mm_sub_epi32(zero, u[4]); + out[4] = u[6]; + out[5] = _mm_sub_epi32(zero, u[14]); + out[6] = u[10]; + out[7] = _mm_sub_epi32(zero, u[2]); + out[8] = u[3]; + out[9] = _mm_sub_epi32(zero, u[11]); + out[10] = u[15]; + out[11] = _mm_sub_epi32(zero, u[7]); + out[12] = u[5]; + out[13] = _mm_sub_epi32(zero, u[13]); + out[14] = u[9]; + out[15] = _mm_sub_epi32(zero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} + +static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + 
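+ // By the av1_txfm convention, cospi[i] holds (approximately)
+ // cos(i * PI / 128) scaled by 1 << bit, and the cospimNN constants are
+ // simply the negated entries, broadcast once so every butterfly below
+ // reduces to a mullo/add/round/shift sequence.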
const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[16], v[16], x, y; + + { + // stage 0 + // stage 1 + u[0] = in[0]; + u[1] = in[8]; + u[2] = in[4]; + u[3] = in[12]; + u[4] = in[2]; + u[5] = in[10]; + u[6] = in[6]; + u[7] = in[14]; + u[8] = in[1]; + u[9] = in[9]; + u[10] = in[5]; + u[11] = in[13]; + u[12] = in[3]; + u[13] = in[11]; + u[14] = in[7]; + u[15] = in[15]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); + + // stage 3 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); + u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); + u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); + addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm_mullo_epi32(u[0], cospi32); + y = _mm_mullo_epi32(u[1], cospi32); + v[0] = _mm_add_epi32(x, y); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_sub_epi32(x, y); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); + addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, 
&clamp_hi); + addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[11] = u[11]; + v[12] = u[12]; + v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + v[15] = u[15]; + + // stage 5 + addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + u[4] = v[4]; + + x = _mm_mullo_epi32(v[5], cospi32); + y = _mm_mullo_epi32(v[6], cospi32); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = v[7]; + addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = u[9]; + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[13], cospi32); + v[10] = _mm_sub_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_add_epi32(x, y); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(u[12], cospi32); + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_add_epi32(x, y); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[14] = u[14]; + v[15] = u[15]; + + // stage 7 + addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } + } +} + +static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi26 = 
_mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + const __m128i zero = _mm_setzero_si128(); + __m128i u[16], v[16], x, y; + // Calculate the column 0, 1, 2, 3 + // stage 0 + // stage 1 + // stage 2 + v[0] = _mm_mullo_epi32(in[15], cospi2); + x = _mm_mullo_epi32(in[0], cospi62); + v[0] = _mm_add_epi32(v[0], x); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_mullo_epi32(in[15], cospi62); + x = _mm_mullo_epi32(in[0], cospi2); + v[1] = _mm_sub_epi32(v[1], x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = _mm_mullo_epi32(in[13], cospi10); + x = _mm_mullo_epi32(in[2], cospi54); + v[2] = _mm_add_epi32(v[2], x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_mullo_epi32(in[13], cospi54); + x = _mm_mullo_epi32(in[2], cospi10); + v[3] = _mm_sub_epi32(v[3], x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = _mm_mullo_epi32(in[11], cospi18); + x = _mm_mullo_epi32(in[4], cospi46); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(in[11], cospi46); + x = _mm_mullo_epi32(in[4], cospi18); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(in[9], cospi26); + x = _mm_mullo_epi32(in[6], cospi38); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(in[9], cospi38); + x = _mm_mullo_epi32(in[6], cospi26); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = _mm_mullo_epi32(in[7], cospi34); + x = _mm_mullo_epi32(in[8], cospi30); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(in[7], cospi30); + x = _mm_mullo_epi32(in[8], cospi34); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(in[5], cospi42); + x = _mm_mullo_epi32(in[10], cospi22); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = 
_mm_mullo_epi32(in[5], cospi22); + x = _mm_mullo_epi32(in[10], cospi42); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(in[3], cospi50); + x = _mm_mullo_epi32(in[12], cospi14); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(in[3], cospi14); + x = _mm_mullo_epi32(in[12], cospi50); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(in[1], cospi58); + x = _mm_mullo_epi32(in[14], cospi6); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(in[1], cospi6); + x = _mm_mullo_epi32(in[14], cospi58); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 3 + addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm_mullo_epi32(u[8], cospi8); + x = _mm_mullo_epi32(u[9], cospi56); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi8); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(u[10], cospi40); + x = _mm_mullo_epi32(u[11], cospi24); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_mullo_epi32(u[10], cospi24); + x = _mm_mullo_epi32(u[11], cospi40); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[12], cospim56); + x = _mm_mullo_epi32(u[13], cospi8); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi8); + x = _mm_mullo_epi32(u[13], cospim56); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim24); + x = _mm_mullo_epi32(u[15], cospi40); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi40); + x = _mm_mullo_epi32(u[15], cospim24); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 5 + addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, 
&clamp_hi); + addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = _mm_mullo_epi32(u[4], cospi16); + x = _mm_mullo_epi32(u[5], cospi48); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(u[4], cospi48); + x = _mm_mullo_epi32(u[5], cospi16); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(u[6], cospim48); + x = _mm_mullo_epi32(u[7], cospi16); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(u[6], cospi16); + x = _mm_mullo_epi32(u[7], cospim48); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = _mm_mullo_epi32(u[12], cospi16); + x = _mm_mullo_epi32(u[13], cospi48); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi48); + x = _mm_mullo_epi32(u[13], cospi16); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim48); + x = _mm_mullo_epi32(u[15], cospi16); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi16); + x = _mm_mullo_epi32(u[15], cospim48); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 7 + addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(y, x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(y, x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = _mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(y, x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(y, x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + 
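+  // Here y = cospi32 * u[14] and x = cospi32 * u[15], so the add/sub pair
+  // below evaluates round_shift(cospi32 * (u[14] +/- u[15]), bit): the
+  // final pi/4 rotation of the iadst16 flow, with the rnding add and
+  // arithmetic shift giving round-to-nearest.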
v[14] = _mm_add_epi32(y, x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(y, x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm_sub_epi32(zero, v[8]); + out[2] = v[12]; + out[3] = _mm_sub_epi32(zero, v[4]); + out[4] = v[6]; + out[5] = _mm_sub_epi32(zero, v[14]); + out[6] = v[10]; + out[7] = _mm_sub_epi32(zero, v[2]); + out[8] = v[3]; + out[9] = _mm_sub_epi32(zero, v[11]); + out[10] = v[15]; + out[11] = _mm_sub_epi32(zero, v[7]); + out[12] = v[5]; + out[13] = _mm_sub_epi32(zero, v[13]); + out[14] = v[9]; + out[15] = _mm_sub_epi32(zero, v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} +static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + __m128i fact = _mm_set1_epi32(2 * NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a0_low, a0_high, a1_low, a1_high; + __m128i zero = _mm_set1_epi32(0); + offset = _mm_unpacklo_epi32(offset, zero); + + for (int i = 0; i < 16; i++) { + a0_low = _mm_mul_epi32(in[i], fact); + a0_low = _mm_add_epi32(a0_low, offset); + a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits); + + a0_high = _mm_srli_si128(in[i], 4); + a0_high = _mm_mul_epi32(a0_high, fact); + a0_high = _mm_add_epi32(a0_high, offset); + a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits); + + a1_low = _mm_unpacklo_epi32(a0_low, a0_high); + a1_high = _mm_unpackhi_epi32(a0_low, a0_high); + out[i] = _mm_unpacklo_epi64(a1_low, a1_high); + } + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16); + } +} +static INLINE void idct64_stage8_sse4_1( + __m128i *u, const __m128i *cospim32, const __m128i *cospi32, + const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, + const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + int i; + __m128i temp1, temp2, temp3, temp4; + temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit); + u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit); + u[10] = temp1; + temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit); + u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit); + u[11] = temp2; + + for (i = 16; i < 20; ++i) { + 
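+    // For i in [16, 20): i ^ 7 mirrors i within u[16..23], while i ^ 15 and
+    // i ^ 8 mirror it within u[24..31], so each iteration performs two
+    // add/sub butterflies across the 16..31 block without a second index.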
addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); + addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, + clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit); + temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit); + temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit); + temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit); + u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit); + u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit); + u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit); + u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit); + u[36] = temp1; + u[37] = temp2; + u[38] = temp3; + u[39] = temp4; + + temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit); + temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit); + temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit); + temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit); + u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit); + u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit); + u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit); + u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; +} + +static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + int i; + __m128i temp1, temp2, temp3, temp4; + for (i = 0; i < 8; ++i) { + addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit); + u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit); + u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit); + u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit); + u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit); + u[20] = temp1; + u[21] = temp2; + u[22] = temp3; + u[23] = temp4; + for (i = 32; i < 40; i++) { + addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); + } +} + +static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + __m128i temp1, temp2, temp3, temp4; + for (int i = 0; i < 16; i++) { + addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit); + u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit); + u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit); + u[54] = 
half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit); + u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; + + temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit); + u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit); + u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit); + u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit); + u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit); + u[44] = temp1; + u[45] = temp2; + u[46] = temp3; + u[47] = temp4; +} + +static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols, + int bd, int out_shift, + const __m128i *clamp_lo, + const __m128i *clamp_hi) { + for (int i = 0; i < 32; i++) { + addsub_sse4_1(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + for (int i = 0; i < 64; i += 4) { + round_shift_4x4(out + i, out_shift); + highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, &clamp_hi_out, + 4); + } + } +} + +static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8));
+  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+
+  {
+    __m128i x;
+
+    // stage 1
+    // stage 2
+    // stage 3
+    // stage 4
+    // stage 5
+    // stage 6
+    x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);
+
+    // stage 7
+    // stage 8
+    // stage 9
+    // stage 10
+    // stage 11
+    // With only the DC coefficient non-zero, every stage other than the
+    // single rotation above reduces to a pass-through.
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+      clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+      if (out_shift != 0) {
+        __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+        x = _mm_add_epi32(x, offset);
+        x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+      }
+    }
+    x = _mm_max_epi32(x, clamp_lo);
+    x = _mm_min_epi32(x, clamp_hi);
+    // Broadcast the clamped DC value to all 64 output rows.
+    for (int i = 0; i < 64; ++i) out[i] = x;
+  }
+}
+
+static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ?
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + + { + __m128i u[64]; + + // stage 1 + u[0] = in[0]; + u[8] = in[4]; + u[16] = in[2]; + u[24] = in[6]; + u[32] = in[1]; + u[40] = in[5]; + u[48] = in[3]; + u[56] = in[7]; + + // stage 2 + u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit); + u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[38] = u[39]; + u[41] = u[40]; + u[46] = u[47]; + u[49] = u[48]; + u[54] = u[55]; + u[57] = u[56]; + u[62] = u[63]; + + // stage 4 + __m128i temp1, temp2; + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + u[17] = u[16]; + u[22] = u[23]; + u[25] = u[24]; + u[30] = u[31]; + + temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + u[62] 
= half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = temp1; + + temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = temp2; + + temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = temp1; + + temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[46] = temp2; + + // stage 5 + u[9] = u[8]; + u[14] = u[15]; + + temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = temp1; + + temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[22] = temp2; + + u[35] = u[32]; + u[34] = u[33]; + u[36] = u[39]; + u[37] = u[38]; + u[43] = u[40]; + u[42] = u[41]; + u[44] = u[47]; + u[45] = u[46]; + u[51] = u[48]; + u[50] = u[49]; + u[52] = u[55]; + u[53] = u[54]; + u[59] = u[56]; + u[58] = u[57]; + u[60] = u[63]; + u[61] = u[62]; + + // stage 6 + temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[0] = temp1; + + temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = temp2; + u[19] = u[16]; + u[18] = u[17]; + u[20] = u[23]; + u[21] = u[22]; + u[27] = u[24]; + u[26] = u[25]; + u[28] = u[31]; + u[29] = u[30]; + + temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = temp1; + temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[35] = temp2; + temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[36] = temp1; + temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[37] = temp2; + temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = temp1; + temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[43] = temp2; + temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[44] = temp1; + temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[45] = temp2; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + u[11] = u[8]; + u[10] = u[9]; + u[12] = u[15]; + u[13] = u[14]; + + temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = temp1; + temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); 
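+    // Each rotation pair reads both original u[] lanes before either is
+    // overwritten: the new low-lane value is parked in temp1/temp2 and
+    // committed only after the high lane has been recomputed.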
+ u[19] = temp2; + temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[20] = temp1; + temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[21] = temp2; + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + u[7] = u[0]; + u[6] = u[1]; + u[5] = u[2]; + u[4] = u[3]; + + idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi9 = _mm_set1_epi32(cospi[9]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi11 = _mm_set1_epi32(cospi[11]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi13 = _mm_set1_epi32(cospi[13]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi15 = _mm_set1_epi32(cospi[15]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi51 = _mm_set1_epi32(cospi[51]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi55 = _mm_set1_epi32(cospi[55]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i 
cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + + { + __m128i u[64]; + __m128i tmp1, tmp2, tmp3, tmp4; + // stage 1 + u[0] = in[0]; + u[32] = in[1]; + u[36] = in[9]; + u[40] = in[5]; + u[44] = in[13]; + u[48] = in[3]; + u[52] = in[11]; + u[56] = in[7]; + u[60] = in[15]; + u[16] = in[2]; + u[20] = in[10]; + u[24] = in[6]; + u[28] = in[14]; + u[4] = in[8]; + u[8] = in[4]; + u[12] = in[12]; + + // stage 2 + u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); + u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); + u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); + u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); + u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); + u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); + u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); + u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit); + u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit); + u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit); + u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit); + u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit); + u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[34] = u[35]; + u[37] = u[36]; + u[38] = u[39]; + u[41] = u[40]; + u[42] = u[43]; + u[45] = u[44]; + u[46] = u[47]; + u[49] = u[48]; + u[50] = u[51]; + u[53] = u[52]; + u[54] = u[55]; + u[57] = u[56]; + u[58] = u[59]; + u[61] = u[60]; + u[62] = u[63]; + + // stage 4 + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + + u[17] = u[16]; + u[18] = u[19]; + u[21] = u[20]; + u[22] = u[23]; + u[25] = u[24]; + u[26] = u[27]; + u[29] = u[28]; + u[30] = u[31]; + + tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + tmp3 = 
half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = tmp1; + u[34] = tmp2; + u[37] = tmp3; + u[38] = tmp4; + + tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = tmp1; + u[42] = tmp2; + u[45] = tmp3; + u[46] = tmp4; + + // stage 5 + u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); + + u[9] = u[8]; + u[10] = u[11]; + u[13] = u[12]; + u[14] = u[15]; + + tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit); + u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = tmp1; + u[18] = tmp2; + u[21] = tmp3; + u[22] = tmp4; + + for (i = 32; i < 64; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[0] = tmp1; + u[5] = u[4]; + u[6] = u[7]; + + tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = tmp1; + tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = tmp2; + + for (i = 16; i < 32; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim56, &u[36], 
&cospim8, &u[59], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = tmp1; + u[35] = tmp2; + u[36] = tmp3; + u[37] = tmp4; + + tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = tmp1; + u[43] = tmp2; + u[44] = tmp3; + u[45] = tmp4; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit); + u[5] = tmp1; + addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = tmp1; + u[19] = tmp2; + u[20] = tmp3; + u[21] = tmp4; + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); + } + + idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi9 = _mm_set1_epi32(cospi[9]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi11 = _mm_set1_epi32(cospi[11]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi13 = _mm_set1_epi32(cospi[13]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi15 = _mm_set1_epi32(cospi[15]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi17 = _mm_set1_epi32(cospi[17]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi19 = _mm_set1_epi32(cospi[19]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi21 = _mm_set1_epi32(cospi[21]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi23 = _mm_set1_epi32(cospi[23]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi25 = _mm_set1_epi32(cospi[25]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi27 = _mm_set1_epi32(cospi[27]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi29 = _mm_set1_epi32(cospi[29]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi31 = _mm_set1_epi32(cospi[31]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi35 = _mm_set1_epi32(cospi[35]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi39 = _mm_set1_epi32(cospi[39]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi43 = _mm_set1_epi32(cospi[43]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi47 = _mm_set1_epi32(cospi[47]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi51 = _mm_set1_epi32(cospi[51]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi55 = _mm_set1_epi32(cospi[55]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospim33 = _mm_set1_epi32(-cospi[33]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim37 = _mm_set1_epi32(-cospi[37]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + 
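+  // Negated cosines are splatted up front because the SSE4.1 butterfly
+  // helper only multiplies and accumulates; folding the sign into the
+  // constant avoids a per-butterfly subtract. As a scalar sketch (not the
+  // exact library helper; it assumes the products are widened to 64 bits),
+  // each lane of half_btf_sse4_1() computes:
+  //   int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
+  //                    int bit) {
+  //     const int64_t t = (int64_t)w0 * in0 + (int64_t)w1 * in1;
+  //     return (int32_t)((t + ((int64_t)1 << (bit - 1))) >> bit);
+  //   }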
const __m128i cospim41 = _mm_set1_epi32(-cospi[41]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); + const __m128i cospim45 = _mm_set1_epi32(-cospi[45]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + + { + __m128i u[64], v[64]; + + // stage 1 + u[32] = in[1]; + u[34] = in[17]; + u[36] = in[9]; + u[38] = in[25]; + u[40] = in[5]; + u[42] = in[21]; + u[44] = in[13]; + u[46] = in[29]; + u[48] = in[3]; + u[50] = in[19]; + u[52] = in[11]; + u[54] = in[27]; + u[56] = in[7]; + u[58] = in[23]; + u[60] = in[15]; + u[62] = in[31]; + + v[16] = in[2]; + v[18] = in[18]; + v[20] = in[10]; + v[22] = in[26]; + v[24] = in[6]; + v[26] = in[22]; + v[28] = in[14]; + v[30] = in[30]; + + u[8] = in[4]; + u[10] = in[20]; + u[12] = in[12]; + u[14] = in[28]; + + v[4] = in[8]; + v[6] = in[24]; + + u[0] = in[0]; + u[2] = in[16]; + + // stage 2 + v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit); + v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit); + v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); + v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); + v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit); + v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit); + v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit); + v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit); + v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); + v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); + v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit); + v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit); + v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit); + v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit); + v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); + v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); + v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit); + v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit); + v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit); + v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit); + v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); + v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); + v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit); + v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit); + v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + + // stage 3 + u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit); + u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit); + u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit); + 
u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit); + u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit); + u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit); + u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit); + u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit); + u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit); + u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit); + u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit); + u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit); + u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit); + u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit); + u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit); + u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit); + + for (i = 32; i < 64; i += 4) { + addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + // stage 4 + v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); + v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); + v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); + v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); + v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + + for (i = 16; i < 32; i += 4) { + addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + + // stage 5 + u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit); + u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit); + u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit); + u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit); + + for (i = 8; i < 16; i += 4) { + addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 16; i < 32; i += 4) { + u[i + 0] = v[i + 0]; + u[i + 
3] = v[i + 3]; + } + + u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); + u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); + u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); + u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); + + for (i = 32; i < 64; i += 8) { + addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); + v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + + addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + + for (i = 8; i < 16; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + + for (i = 16; i < 32; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 8) { + v[i + 0] = u[i + 0]; + v[i + 1] = u[i + 1]; + v[i + 6] = u[i + 6]; + v[i + 7] = u[i + 7]; + } + + v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + + // stage 7 + 
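+    // Stage 7 finishes the 8-point core (butterflies on v[0..3], a pi/4
+    // rotation of v[5]/v[6]) and the 16-point half (butterflies on
+    // v[8..15]), rotates lanes 18..21 against 26..29, and applies the
+    // XOR-indexed butterflies to the 32..63 block below.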
addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + u[4] = v[4]; + u[7] = v[7]; + u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); + + addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + for (i = 16; i < 32; i += 8) { + u[i + 0] = v[i + 0]; + u[i + 1] = v[i + 1]; + u[i + 6] = v[i + 6]; + u[i + 7] = v[i + 7]; + } + + u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); + u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); + u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); + } + + v[8] = u[8]; + v[9] = u[9]; + v[14] = u[14]; + v[15] = u[15]; + + v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); + v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); + + for (i = 16; i < 20; ++i) { + addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 36; ++i) { + v[i] = u[i]; + v[i + 12] = u[i + 12]; + v[i + 16] = u[i + 16]; + v[i + 28] = u[i + 28]; + } + + v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); + v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); + v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); + v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); + v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); + v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); + v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); + v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); + v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], 
&rnding, bit); + v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit); + + // stage 9 + for (i = 0; i < 8; ++i) { + addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); + } + + for (i = 16; i < 20; ++i) { + u[i] = v[i]; + u[i + 12] = v[i + 12]; + } + + u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); + u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); + u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); + u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); + u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); + + for (i = 32; i < 40; i++) { + addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); + } + + // stage 10 + for (i = 0; i < 16; i++) { + addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); + } + + for (i = 32; i < 40; i++) v[i] = u[i]; + + v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); + v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); + v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); + v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); + v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); + v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); + v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); + v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); + v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); + v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); + + for (i = 56; i < 64; i++) v[i] = u[i]; + + // stage 11 + for (i = 0; i < 32; i++) { + addsub_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo, + &clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + for (i = 0; i < 64; i += 4) { + round_shift_4x4(out + i, out_shift); + highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, + &clamp_hi_out, 4); + } + } + } +} + +static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1; + + // stage 0 + // stage 1 + bf1 = in[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + if (do_cols) { + bf1 = _mm_max_epi32(bf1, clamp_lo); + bf1 = _mm_min_epi32(bf1, clamp_hi); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + bf1 = _mm_add_epi32(bf1, offset); + bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift)); + } + } + + bf1 = _mm_max_epi32(bf1, clamp_lo); + bf1 = _mm_min_epi32(bf1, clamp_hi); + out[0] = bf1; + out[1] = bf1; + out[2] = bf1; + out[3] = bf1; + out[4] = bf1; + out[5] = bf1; + out[6] = bf1; + out[7] = bf1; + out[8] = bf1; + out[9] = bf1; + out[10] = bf1; + out[11] = bf1; + out[12] = bf1; + out[13] = bf1; + out[14] = bf1; + out[15] = bf1; + out[16] = bf1; + out[17] = bf1; + out[18] = bf1; + out[19] = bf1; + out[20] = bf1; + out[21] = bf1; + out[22] = bf1; + out[23] = bf1; + out[24] = bf1; + out[25] = bf1; + out[26] = bf1; + out[27] = bf1; + out[28] = bf1; + out[29] = bf1; + out[30] = bf1; + out[31] = bf1; +} + +static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32]; + + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[4] = in[4]; + bf1[8] = in[2]; + bf1[12] = in[6]; + bf1[16] = in[1]; + bf1[20] = in[5]; + bf1[24] = in[3]; + bf1[28] = in[7]; + + // stage 2 + bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit); + bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit); + bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit); + + bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit); + bf1[17] = bf1[16]; + bf1[18] = bf1[19]; + bf1[21] = bf1[20]; + bf1[22] = bf1[23]; + bf1[25] = bf1[24]; + bf1[26] = bf1[27]; + bf1[29] = bf1[28]; + bf1[30] = bf1[31]; + + // stage 4 : + bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit); + + bf1[9] = bf1[8]; + bf1[10] = bf1[11]; + bf1[13] = bf1[12]; + bf1[14] = bf1[15]; + + idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[5] = bf1[4]; + bf1[6] = bf1[7]; + + idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + bf1[3] = bf1[0]; + bf1[2] = bf1[1]; + + idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); +} + +static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = 
_mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32]; + + // stage 0 + // stage 1 + + bf1[0] = in[0]; + bf1[2] = in[8]; + bf1[4] = in[4]; + bf1[6] = in[12]; + bf1[8] = in[2]; + bf1[10] = in[10]; + bf1[12] = in[6]; + bf1[14] = in[14]; + bf1[16] = in[1]; + bf1[18] = in[9]; + bf1[20] = in[5]; + bf1[22] = in[13]; + bf1[24] = in[3]; + bf1[26] = in[11]; + bf1[28] = in[7]; + bf1[30] = in[15]; + + // stage 2 + bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit); + bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit); + bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit); + bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit); + bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit); + bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit); + bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit); + bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit); + bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit); + bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit); + bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit); + bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit); + bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit); + bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit); + bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit); + bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit); + + addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 
21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + // stage 4 + bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit); + bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit); + bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit); + + addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); + + idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit); + bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit); + + addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + + idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); + + idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + // stage 9 + idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); +} + +static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); + const __m128i 
cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32], bf0[32]; + + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[1] = in[16]; + bf1[2] = in[8]; + bf1[3] = in[24]; + bf1[4] = in[4]; + bf1[5] = in[20]; + bf1[6] = in[12]; + bf1[7] = in[28]; + bf1[8] = in[2]; + bf1[9] = in[18]; + bf1[10] = in[10]; + bf1[11] = in[26]; + bf1[12] = in[6]; + bf1[13] = in[22]; + bf1[14] = in[14]; + bf1[15] = in[30]; + bf1[16] = in[1]; + bf1[17] = in[17]; + bf1[18] = in[9]; + bf1[19] = in[25]; + bf1[20] = in[5]; + bf1[21] = in[21]; + bf1[22] = in[13]; + bf1[23] = in[29]; + bf1[24] = in[3]; + bf1[25] = in[19]; + bf1[26] = in[11]; + bf1[27] = in[27]; + bf1[28] = in[7]; + bf1[29] = in[23]; + bf1[30] = in[15]; + bf1[31] = in[31]; + + // stage 2 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = bf1[4]; + bf0[5] = bf1[5]; + bf0[6] = bf1[6]; + bf0[7] = bf1[7]; + bf0[8] = bf1[8]; + bf0[9] = bf1[9]; + bf0[10] = bf1[10]; + bf0[11] = bf1[11]; + bf0[12] = bf1[12]; + bf0[13] = bf1[13]; + bf0[14] = bf1[14]; + bf0[15] = bf1[15]; + bf0[16] = + half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit); + bf0[17] = + half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit); + bf0[25] = + 
half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit); + bf0[31] = + half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit); + + // stage 3 + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = + half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit); + bf1[9] = + half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); + bf1[15] = + half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); + + addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + + // stage 4 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = + half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit); + bf0[5] = + half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit); + bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); + + addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); + + bf0[16] = bf1[16]; + bf0[17] = + half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit); + bf0[19] = bf1[19]; + bf0[20] = bf1[20]; + bf0[21] = + half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit); + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = + half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit); + bf0[27] = bf1[27]; + bf0[28] = bf1[28]; + bf0[29] = + half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], 
&rounding, bit); + bf0[30] = + half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit); + bf0[31] = bf1[31]; + + // stage 5 + bf1[0] = + half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit); + bf1[1] = + half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit); + bf1[2] = + half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit); + bf1[3] = + half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit); + addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = + half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = + half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit); + bf1[15] = bf0[15]; + addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi); + + // stage 6 + addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi); + bf0[4] = bf1[4]; + bf0[5] = + half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[7] = bf1[7]; + addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = + half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit); + bf0[22] = bf1[22]; + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = bf1[25]; + bf0[26] = + half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit); + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 7 + addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = 
bf0[9]; + bf1[10] = + half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi); + + // stage 8 + addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = bf1[18]; + bf0[19] = bf1[19]; + bf0[20] = + half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[28] = bf1[28]; + bf0[29] = bf1[29]; + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 9 + addsub_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[13], bf0[18], out + 13, out + 18, 
&clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + round_shift_8x8(out + 16, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} + +void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + case IDTX: + case H_DCT: + case H_ADST: + case H_FLIPADST: + case V_DCT: + case V_ADST: + case V_FLIPADST: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, + txfm_param->tx_size, + txfm_param->eob, bd); + break; + default: + av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); + break; + } +} +void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + int eob = txfm_param->eob; + int bd = txfm_param->bd; + int lossless = txfm_param->lossless; + const int32_t *src = cast_to_int32(input); + const TX_TYPE tx_type = txfm_param->tx_type; + if (lossless) { + assert(tx_type == DCT_DCT); + av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); + return; + } + av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} +static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + for (int i = 0; i < 32; i += 16) { + out[i] = _mm_slli_epi32(in[i], 2); + out[i + 1] = _mm_slli_epi32(in[i + 1], 2); + out[i + 2] = _mm_slli_epi32(in[i + 2], 2); + out[i + 3] = _mm_slli_epi32(in[i + 3], 2); + out[i + 4] = _mm_slli_epi32(in[i + 4], 2); + out[i + 5] = _mm_slli_epi32(in[i + 5], 2); + out[i + 6] = _mm_slli_epi32(in[i + 6], 2); + out[i + 7] = _mm_slli_epi32(in[i + 7], 2); + out[i + 8] = _mm_slli_epi32(in[i + 8], 2); + out[i + 9] = _mm_slli_epi32(in[i + 9], 2); + out[i + 10] = _mm_slli_epi32(in[i + 10], 2); + out[i + 11] = _mm_slli_epi32(in[i + 11], 2); + out[i + 12] = _mm_slli_epi32(in[i + 12], 2); + out[i + 13] = _mm_slli_epi32(in[i + 13], 2); + out[i + 14] = _mm_slli_epi32(in[i + 14], 2); + out[i + 15] = _mm_slli_epi32(in[i + 15], 2); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + round_shift_8x8(out + 16, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} +static const transform_1d_sse4_1 + highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { idct4x4_sse4_1, NULL, NULL, NULL }, + { iadst4x4_sse4_1, NULL, NULL, NULL }, + { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL }, + }, + { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL }, + { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL }, + { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } }, + { + { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1, + 
NULL }, + { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1, + NULL }, + { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL }, + }, + { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1, + idct32x32_sse4_1 }, + { NULL, NULL, NULL, NULL }, + { iidentity32_sse4_1, NULL, NULL, NULL } }, + { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1, + idct64x64_sse4_1 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; +static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int buf_size_w_div4 = input_stride >> 2; + const int buf_size_h_div8 = (eoby + 8) >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (buf_size_h_div8 << 1); ++i) { + __m128i buf0[16]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < buf_size_w_div4; ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + + for (int j = 0; j < buf_size_w_div4; ++j) { + _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0]; + _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1]; + _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2]; + _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3]; + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, ud_flip, txfm_size_row, bd); + } +} +static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int buf_size_w_div8 = input_stride >> 2; + const int row_max = AOMMIN(32, txfm_size_row); + const int 
buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (row_max >> 2); ++i) { + __m128i buf0[16]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1( + buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + } + } + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } + } +} +static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[64 * 4]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int row_max = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + for (int i = 0; i < (row_max >> 2); ++i) { + __m128i buf0[32]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < (input_stride >> 2); ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0, + 
NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + for (int j = 0; j < (input_stride >> 2); ++j) { + _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0]; + _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1]; + _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2]; + _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3]; + } + } + for (int i = 0; i < (input_stride >> 2); i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, 0, txfm_size_row, + bd); + } + } +} +static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64 * 16]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 2; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) { + __m128i buf0[64]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1( + buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + 
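// Each transposed 4x4 tile lands in buf1 in column order; once this loop
+ // finishes, every run of txfm_size_row registers feeds col_txfm directly.
+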
} + } + } + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } + } +} + +static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[8]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1]; + const int input_stride = AOMMIN(32, txfm_size_col); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[8]; + const int32_t *input_row = input; + __m128i *buf0_cur = buf0; + load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row); + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0, + NewInvSqrt2); + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + row_txfm(buf0 + 4, buf0 + 4, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + if (lr_flip) { + TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2], + buf1[3]); + + TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6], + buf1[7]); + } else { + TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2], + buf1[3]); + + TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6], + buf1[7]); + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[8]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[8]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + TRANSPOSE_4X4(buf0[0], buf0[2], buf0[4], buf0[6], buf1[0], buf1[1], buf1[2], + buf1[3]); 
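+ // Transpose the right 4x4 tile (columns 4..7): the odd-indexed buf0
+ // registers hold the right half of each 8-wide coefficient row.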
+ TRANSPOSE_4X4(buf0[1], buf0[3], buf0[5], buf0[7], buf1[4], buf1[5], buf1[6], + buf1[7]); + + av1_round_shift_rect_array_32_sse4_1(buf1, buf0, txfm_size_col, 0, + NewInvSqrt2); + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + + __m128i *buf1_ptr; + if (lr_flip) { + flip_buf_sse2(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < 2; i++) { + col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + } + av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + // write to buffer + highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip, + txfm_size_row, bd); +} + +static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_h_div8 = txfm_size_row >> 2; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2]; + const int input_stride = AOMMIN(32, txfm_size_col); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[16]; + const int32_t *input_row = input; + __m128i *buf0_cur = buf0; + load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row); + for (int i = 0; i < (txfm_size_row >> 2); i++) { + row_txfm(buf0 + (i << 2), buf0 + (i << 2), + av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + } + + if (lr_flip) { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2], + buf1[4 * j + 3]); + } + } else { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2], + buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1], + buf1[4 * j + 2], buf1[4 * j + 3]); + } + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 2; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + 
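// FLIPADST row/column variants mirror the residual; get_flip_cfg() reports
+ // which axes (ud_flip / lr_flip) need flipping at write-out.
+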
get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[16]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + for (int j = 0; j < buf_size_w_div8; j++) { + TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j], + buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]); + } + row_txfm(buf1, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + + __m128i *buf1_ptr; + if (lr_flip) { + flip_buf_sse2(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + } + av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } +} + +void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + highbd_inv_txfm2d_add_no_identity_sse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + highbd_inv_txfm2d_add_h_identity_ssse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + highbd_inv_txfm2d_add_v_identity_ssse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case IDTX: + highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, eob, bd); + break; + default: assert(0); break; + } +} + +void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = 
txfm_param->eob; + highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_8X8: + av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X8: + av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_8X4: + av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X4: + av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X4: + av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X16: + av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); + break; + default: + av1_highbd_inv_txfm2d_add_universe_sse4_1( + input, dest, stride, txfm_param->tx_type, tx_size, txfm_param->eob, + txfm_param->bd); + break; + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c b/libs/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c new file mode 100644 index 000000000..70f1ec709 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c @@ -0,0 +1,859 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_dist_wtd_convolve_2d_copy_avx2(
+    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_qn;
+  (void)subpel_y_qn;
+
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+  const __m128i left_shift = _mm_cvtsi32_si128(bits);
+  const int do_average = conv_params->do_average;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m256i wt0 = _mm256_set1_epi32(w0);
+  const __m256i wt1 = _mm256_set1_epi32(w1);
+  const __m256i zero = _mm256_setzero_si256();
+  int i, j;
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m256i offset_const = _mm256_set1_epi32(offset);
+  const __m256i offset_const_16b = _mm256_set1_epi16(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+  const __m256i clip_pixel_to_bd =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ?
4095 : 255)); + + assert(bits <= 4); + + if (!(w % 16)) { + for (i = 0; i < h; i += 1) { + for (j = 0; j < w; j += 16) { + const __m256i src_16bit = + _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j])); + + const __m256i res = _mm256_sll_epi16(src_16bit, left_shift); + + if (do_average) { + const __m256i data_0 = + _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j])); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero); + + const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero); + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_32b_lo, offset_const); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); + const __m256i res_unsigned_hi = + _mm256_add_epi32(res_32b_hi, offset_const); + + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = highbd_convolve_rounding( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m256i round_result_hi = highbd_convolve_rounding( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m256i res_unsigned_16b = + _mm256_adds_epu16(res, offset_const_16b); + + _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]), + res_unsigned_16b); + } + } + } + } else if (!(w % 4)) { + for (i = 0; i < h; i += 2) { + for (j = 0; j < w; j += 8) { + const __m128i src_row_0 = + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])); + const __m128i src_row_1 = + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride])); + // since not all compilers yet support _mm256_set_m128i() + const __m256i src_10 = _mm256_insertf128_si256( + _mm256_castsi128_si256(src_row_0), src_row_1, 1); + + const __m256i res = _mm256_sll_epi16(src_10, left_shift); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i res_32b = _mm256_unpacklo_epi16(res, zero); + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_32b, offset_const); + + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + const __m256i res_unsigned_16b = + _mm256_adds_epu16(res, 
offset_const_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero); + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_32b_lo, offset_const); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); + const __m256i res_unsigned_hi = + _mm256_add_epi32(res_32b_hi, offset_const); + + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m256i round_result_hi = + highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + const __m256i res_unsigned_16b = + _mm256_adds_epu16(res, offset_const_16b); + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } + } +} + +void av1_highbd_dist_wtd_convolve_2d_avx2( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. 
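+ // For example, with bd == 12, FILTER_BITS == 7 and round_0 == 5 (the
+ // usual 12-bit configuration), 12 + 7 + 2 - 5 == 16, so the horizontal
+ // stage just fits.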
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m256i s[8], coeffs_y[4], coeffs_x[4]; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i zero = _mm256_setzero_si256(); + + const __m256i round_const_x = _mm256_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const __m256i round_const_y = _mm256_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = _mm256_set1_epi16(0); + if (i + 1 < im_h) + row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); + __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); + __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); + + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + } + + /* Vertical filter */ + { + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); + __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + s[0] = _mm256_unpacklo_epi16(s0, s1); + s[1] = _mm256_unpacklo_epi16(s2, s3); + s[2] = _mm256_unpacklo_epi16(s4, s5); + + s[4] = _mm256_unpackhi_epi16(s0, s1); + s[5] = _mm256_unpackhi_epi16(s2, s3); + s[6] = _mm256_unpackhi_epi16(s4, s5); + + for (i = 
0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + const __m256i res_a = convolve(s, coeffs_y); + + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_y), round_shift_y); + + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_a_round, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + const __m256i res_b = convolve(s + 4, coeffs_y); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_y), round_shift_y); + + __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const); + + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m256i round_result_hi = + highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * 
dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_x_avx2( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_1; + (void)filter_params_y; + (void)subpel_y_qn; + + int i, j; + __m256i s[4], coeffs_x[4]; + + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i zero = _mm256_setzero_si256(); + + const __m256i round_const_x = + _mm256_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + + assert(bits >= 0); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + res_even = _mm256_sll_epi32(res_even, round_shift_bits); + res_odd = _mm256_sll_epi32(res_odd, round_shift_bits); + + __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd); + + __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i comp_avg_res = highbd_comp_avg( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd); + __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const); + + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + 
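// highbd_comp_avg() blends the accumulated reference with the new result:
+ // roughly (ref * w0 + res * w1) >> DIST_PRECISION_BITS when distance
+ // weighting is enabled, otherwise a plain (ref + res) >> 1.
+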
const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = highbd_convolve_rounding( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m256i round_result_hi = highbd_convolve_rounding( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), + res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } +} + +void av1_highbd_dist_wtd_convolve_y_avx2( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + const int bits = FILTER_BITS - conv_params->round_0; + (void)filter_params_x; + (void)subpel_x_qn; + + assert(bits >= 0); + int i, j; + __m256i s[8], coeffs_y[4]; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i round_const_y = + _mm256_set1_epi32(((1 << conv_params->round_1) >> 1)); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m256i src6; + __m256i s01 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + 0x20); + __m256i s12 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + 0x20); + __m256i s23 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + 0x20); + __m256i s34 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + 0x20); + __m256i s45 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + 0x20); + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + __m256i s56 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + src6, 0x20); + + s[0] = _mm256_unpacklo_epi16(s01, s12); + s[1] = _mm256_unpacklo_epi16(s23, s34); + s[2] = _mm256_unpacklo_epi16(s45, s56); + + s[4] = _mm256_unpackhi_epi16(s01, s12); + s[5] = _mm256_unpackhi_epi16(s23, s34); + s[6] = _mm256_unpackhi_epi16(s45, s56); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + const __m256i s67 = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + + const __m256i s78 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi16(s67, s78); + s[7] = _mm256_unpackhi_epi16(s67, s78); + + const __m256i res_a = convolve(s, coeffs_y); + + __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits); + res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a_round, round_const_y), round_shift_y); + + __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = 
_mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + const __m256i res_b = convolve(s + 4, coeffs_y); + __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits); + res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b_round, round_const_y), round_shift_y); + + __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const); + + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m256i round_result_hi = + highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c b/libs/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c new file mode 100644 index 000000000..f033a6f94 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+
+void av1_highbd_dist_wtd_convolve_y_sse4_1(
+    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_vert * src_stride;
+  const int bits = FILTER_BITS - conv_params->round_0;
+  (void)filter_params_x;
+  (void)subpel_x_qn;
+
+  assert(bits >= 0);
+  int i, j;
+  const int do_average = conv_params->do_average;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi32(w0);
+  const __m128i wt1 = _mm_set1_epi32(w1);
+  const __m128i round_const_y =
+      _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m128i offset_const = _mm_set1_epi32(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+  const __m128i clip_pixel_to_bd =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i zero = _mm_setzero_si128();
+  __m128i s[16], coeffs_y[4];
+
+  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+  for (j = 0; j < w; j += 8) {
+    const uint16_t *data = &src_ptr[j];
+    /* Vertical filter */
+    {
+      __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+      __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+      __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+      __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+      __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+
+      s[0] = _mm_unpacklo_epi16(s0, s1);
+      s[1] = _mm_unpacklo_epi16(s2, s3);
+      s[2] = _mm_unpacklo_epi16(s4, s5);
+
+      s[4] = _mm_unpackhi_epi16(s0, s1);
+      s[5] = _mm_unpackhi_epi16(s2, s3);
+      s[6] = _mm_unpackhi_epi16(s4, s5);
+
+      s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+      s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+      s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+      s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+      s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+      s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+
+        __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+        __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+        s[3] = _mm_unpacklo_epi16(s6, s7);
+        s[7] = _mm_unpackhi_epi16(s6, s7);
+
+        s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+        s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+        const __m128i res_a0 = convolve(s, coeffs_y);
+        __m128i res_a_round0 = _mm_sll_epi32(res_a0, round_shift_bits);
+        res_a_round0 = _mm_sra_epi32(_mm_add_epi32(res_a_round0, round_const_y),
+                                     round_shift_y);
+
+
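// s[0..7] holds the interleaved taps for output row i (unpacklo covers
+ // pixels 0..3, unpackhi pixels 4..7); s[8..15] is the same window
+ // advanced one source row, so each loop iteration emits two output rows.
+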
const __m128i res_a1 = convolve(s + 8, coeffs_y); + __m128i res_a_round1 = _mm_sll_epi32(res_a1, round_shift_bits); + res_a_round1 = _mm_sra_epi32(_mm_add_epi32(res_a_round1, round_const_y), + round_shift_y); + + __m128i res_unsigned_lo_0 = _mm_add_epi32(res_a_round0, offset_const); + __m128i res_unsigned_lo_1 = _mm_add_epi32(res_a_round1, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_1 = _mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride])); + + const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero); + + const __m128i comp_avg_res_0 = + highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo_0, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_1 = + highbd_comp_avg_sse4_1(&data_ref_1, &res_unsigned_lo_1, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_0 = + highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_1 = + highbd_convolve_rounding_sse2(&comp_avg_res_1, &offset_const, + &rounding_const, rounding_shift); + + const __m128i res_16b_0 = + _mm_packus_epi32(round_result_0, round_result_0); + const __m128i res_clip_0 = + _mm_min_epi16(res_16b_0, clip_pixel_to_bd); + const __m128i res_16b_1 = + _mm_packus_epi32(round_result_1, round_result_1); + const __m128i res_clip_1 = + _mm_min_epi16(res_16b_1, clip_pixel_to_bd); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), + res_clip_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), + res_clip_1); + + } else { + __m128i res_16b_0 = + _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_lo_0); + + __m128i res_16b_1 = + _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_lo_1); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16b_1); + } + } else { + const __m128i res_b0 = convolve(s + 4, coeffs_y); + __m128i res_b_round0 = _mm_sll_epi32(res_b0, round_shift_bits); + res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b_round0, round_const_y), round_shift_y); + + const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); + __m128i res_b_round1 = _mm_sll_epi32(res_b1, round_shift_bits); + res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b_round1, round_const_y), round_shift_y); + + __m128i res_unsigned_hi_0 = _mm_add_epi32(res_b_round0, offset_const); + __m128i res_unsigned_hi_1 = _mm_add_epi32(res_b_round1, offset_const); + + if (do_average) { + const __m128i data_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_1 = _mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride])); + const __m128i data_ref_0_lo_0 = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_0_lo_1 = _mm_unpacklo_epi16(data_1, zero); + + const __m128i data_ref_0_hi_0 = _mm_unpackhi_epi16(data_0, zero); + const __m128i data_ref_0_hi_1 = _mm_unpackhi_epi16(data_1, zero); + + const __m128i comp_avg_res_lo_0 = + highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0, + &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_lo_1 = + highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1, + &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi_0 = + highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0, + &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i 
comp_avg_res_hi_1 = + highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1, + &wt0, &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_lo_0 = + highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_lo_1 = + highbd_convolve_rounding_sse2(&comp_avg_res_lo_1, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_hi_0 = + highbd_convolve_rounding_sse2(&comp_avg_res_hi_0, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_hi_1 = + highbd_convolve_rounding_sse2(&comp_avg_res_hi_1, &offset_const, + &rounding_const, rounding_shift); + + const __m128i res_16b_0 = + _mm_packus_epi32(round_result_lo_0, round_result_hi_0); + const __m128i res_clip_0 = + _mm_min_epi16(res_16b_0, clip_pixel_to_bd); + + const __m128i res_16b_1 = + _mm_packus_epi32(round_result_lo_1, round_result_hi_1); + const __m128i res_clip_1 = + _mm_min_epi16(res_16b_1, clip_pixel_to_bd); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), + res_clip_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), + res_clip_1); + } else { + __m128i res_16bit0 = + _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_hi_0); + __m128i res_16bit1 = + _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_hi_1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16bit0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_16bit1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + + s[0 + 8] = s[1 + 8]; + s[1 + 8] = s[2 + 8]; + s[2 + 8] = s[3 + 8]; + + s[4 + 8] = s[5 + 8]; + s[5 + 8] = s[6 + 8]; + s[6 + 8] = s[7 + 8]; + + s6 = s8; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_x_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_1; + (void)filter_params_y; + (void)subpel_y_qn; + + int i, j; + __m128i s[4], coeffs_x[4]; + + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + const __m128i zero = _mm_setzero_si128(); + + const __m128i round_const_x = + _mm_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); + const __m128i clip_pixel_to_bd = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + + assert(bits >= 0); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + + __m128i res_even = convolve(s, coeffs_x); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + + __m128i res_odd = convolve(s, coeffs_x); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); + + res_even = _mm_sll_epi32(res_even, round_shift_bits); + res_odd = _mm_sll_epi32(res_odd, round_shift_bits); + + __m128i res1 = _mm_unpacklo_epi32(res_even, res_odd); + __m128i res_unsigned_lo = _mm_add_epi32(res1, offset_const); + if (w - j < 8) { + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); + + const __m128i comp_avg_res = highbd_comp_avg_sse4_1( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i round_result = highbd_convolve_rounding_sse2( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = _mm_packus_epi32(round_result, round_result); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b); + } + } else { + __m128i res2 = _mm_unpackhi_epi32(res_even, res_odd); + __m128i res_unsigned_hi = _mm_add_epi32(res2, offset_const); + if (do_average) { + const __m128i data_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero); + + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_lo = highbd_convolve_rounding_sse2( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m128i round_result_hi = highbd_convolve_rounding_sse2( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b); + } + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h b/libs/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h new file mode 100644 index 000000000..5734810f5 --- /dev/null +++ 
b/libs/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+#define AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
+  do {                                                \
+    __m128i u0, u1, u2, u3;                           \
+    u0 = _mm_unpacklo_epi32(x0, x1);                  \
+    u1 = _mm_unpackhi_epi32(x0, x1);                  \
+    u2 = _mm_unpacklo_epi32(x2, x3);                  \
+    u3 = _mm_unpackhi_epi32(x2, x3);                  \
+    y0 = _mm_unpacklo_epi64(u0, u2);                  \
+    y1 = _mm_unpackhi_epi64(u0, u2);                  \
+    y2 = _mm_unpacklo_epi64(u1, u3);                  \
+    y3 = _mm_unpackhi_epi64(u1, u3);                  \
+  } while (0)
+
+static INLINE void transpose_8x8(const __m128i *in, __m128i *out) {
+  TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
+  TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
+  TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
+  TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
+                out[15]);
+}
+
+static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
+  // Upper left 8x8
+  TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]);
+  TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24],
+                out[28]);
+  TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9],
+                out[13]);
+  TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25],
+                out[29]);
+
+  // Upper right 8x8
+  TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40],
+                out[44]);
+  TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56],
+                out[60]);
+  TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41],
+                out[45]);
+  TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57],
+                out[61]);
+
+  // Lower left 8x8
+  TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10],
+                out[14]);
+  TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26],
+                out[30]);
+  TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11],
+                out[15]);
+  TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27],
+                out[31]);
+  // Lower right 8x8
+  TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42],
+                out[46]);
+  TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58],
+                out[62]);
+  TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43],
+                out[47]);
+  TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59],
+                out[63]);
+}
+
+static INLINE void transpose_8nx8n(const __m128i *input, __m128i *output,
+                                   const int width, const int height) {
+  const int numcol = height >> 2;
+  const int numrow = width >> 2;
+  for (int j = 0; j < numrow; j++) {
+    for (int i = 0; i < numcol; i++) {
+      TRANSPOSE_4X4(input[i * width + j + (numrow * 0)],
+                    input[i * width + j + (numrow * 1)],
+                    input[i * width + j + (numrow * 2)],
+                    input[i * width + j + (numrow * 3)],
+                    output[j * height + i + (numcol *
0)], + output[j * height + i + (numcol * 1)], + output[j * height + i + (numcol * 2)], + output[j * height + i + (numcol * 3)]); + } + } +} + +// Note: +// rounding = 1 << (bit - 1) +static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0, + const __m128i *w1, const __m128i *n1, + const __m128i *rounding, int bit) { + __m128i x, y; + + x = _mm_mullo_epi32(*w0, *n0); + y = _mm_mullo_epi32(*w1, *n1); + x = _mm_add_epi32(x, y); + x = _mm_add_epi32(x, *rounding); + x = _mm_srai_epi32(x, bit); + return x; +} + +static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0, + const __m128i *rounding, int bit) { + __m128i x; + + x = _mm_mullo_epi32(*w0, *n0); + x = _mm_add_epi32(x, *rounding); + x = _mm_srai_epi32(x, bit); + return x; +} + +typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift); + +typedef void (*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit, + const int num_cols); + +void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd); + +#endif // AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ diff --git a/libs/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c b/libs/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c new file mode 100644 index 000000000..60a819308 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c @@ -0,0 +1,632 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+
+static const uint8_t warp_highbd_arrange_bytes[16] = { 0, 2,  4,  6, 8, 10,
+                                                       12, 14, 1, 3, 5, 7,
+                                                       9, 11, 13, 15 };
+
+static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
+  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
+static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
+  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
+static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8, 9, 10, 11, 8, 9,
+                                                         10, 11, 8, 9, 10, 11,
+                                                         8, 9, 10, 11 };
+static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13,
+                                                         14, 15, 12, 13, 14, 15,
+                                                         12, 13, 14, 15 };
+
+static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
+                                                          __m128i *coeff) {
+  // Filter even-index pixels
+  const __m128i tmp_0 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_2 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_4 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_6 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+  // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
+  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+  // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
+  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+  // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
+  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+  // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
+  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+  // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
+  coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
+  // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
+  coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
+  // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
+  coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
+  // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
+  coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+  // Filter odd-index pixels
+  const __m128i tmp_1 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_3 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_5 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_7 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+  coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
+  coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
+  coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
+  coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
+
+static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
+    int sx, __m128i *coeff) {
+  // Filter coeff
+  const __m128i tmp_0 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+  coeff[0] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
+  coeff[2] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
+  coeff[4] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i
*)highbd_shuffle_alpha0_mask2)); + coeff[6] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3)); + + coeff[1] = coeff[0]; + coeff[3] = coeff[2]; + coeff[5] = coeff[4]; + coeff[7] = coeff[6]; +} + +static INLINE void highbd_filter_src_pixels( + const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff, + const int offset_bits_horiz, const int reduce_bits_horiz, int k) { + const __m128i src_1 = *src; + const __m128i src2_1 = *src2; + + const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) + + ((1 << reduce_bits_horiz) >> 1)); + + const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]); + + __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), + _mm_cvtsi32_si128(reduce_bits_horiz)); + + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]); + + __m128i res_odd = + _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), + _mm_cvtsi32_si128(reduce_bits_horiz)); + + // Combine results into one register. + // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7 + // as this order helps with the vertical filter. 
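+ // (res_even carries columns 0 2 4 6 and res_odd carries 1 3 5 7, so the
+ // pack below produces exactly that layout and the vertical pass needs no
+ // shuffle back.)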
+ tmp[k + 7] = _mm_packs_epi32(res_even, res_odd); +} + +static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2, + __m128i *tmp, int sx, int alpha, int k, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff); + highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); +} + +static INLINE void highbd_warp_horizontal_filter_alpha0_beta0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + (void)alpha; + int k; + + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter_alpha0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)alpha; + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter_beta0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + int k; + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + 
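+      // src holds the eight pixels starting at ix4 - 7 and src2 the eight + // starting at ix4 + 1; highbd_horiz_filter() derives every shifted 8-pixel + // window from this single pair of loads via _mm_alignr_epi8(src2, src, + // 2 * n) instead of reloading for each tap offset.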
highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } +} + +static INLINE void highbd_prepare_warp_horizontal_filter( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + if (alpha == 0 && beta == 0) + highbd_warp_horizontal_filter_alpha0_beta0( + ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + + else if (alpha == 0 && beta != 0) + highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + + else if (alpha != 0 && beta == 0) + highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + else + highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); +} + +void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + __m128i tmp[15]; + int i, j, k; + const int reduce_bits_horiz = + conv_params->round_0 + + AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0); + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + assert(!(bd == 12 && reduce_bits_horiz < 5)); + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert); + const __m128i reduce_bits_vert_const = + _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const __m128i res_sub_const = + _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); + __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + + /* Note: For this code to work, the left/right frame borders need to be + extended by at least 13 pixels each. By the time we get here, other + code will have set up this border, but we allow an explicit check + for debugging purposes. 
+ */ + /*for (i = 0; i < height; ++i) { + for (j = 0; j < 13; ++j) { + assert(ref[i * stride - 13 + j] == ref[i * stride]); + assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); + } + }*/ + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. + if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = + _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + const __m128i src_01 = _mm_shuffle_epi8( + src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes)); + const __m128i src2_01 = _mm_shuffle_epi8( + src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes)); + + __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01); + __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01); + + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left); + src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left); + } + + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = _mm_loadu_si128( + (__m128i *)warp_pad_right[out_of_boundary_right]); + src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right); + src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right); + } + + const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi); + const 
__m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi); + + highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k, + offset_bits_horiz, reduce_bits_horiz); + } + } else { + highbd_prepare_warp_horizontal_filter( + ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + } + + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + // Load from tmp and rearrange pairs of consecutive rows into the + // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 + const __m128i *src = tmp + (k + 4); + const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + + // Filter even-index pixels + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); + 
const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + if (conv_params->is_compound) { + __m128i *const p = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j]; + res_lo = _mm_add_epi32(res_lo, res_add_const); + res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const), + reduce_bits_vert_shift); + + if (conv_params->do_average) { + __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p)); + + if (conv_params->use_dist_wtd_comp_avg) { + res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), + _mm_mullo_epi32(res_lo, wt1)); + res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS); + } else { + res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1); + } + + __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const); + res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const), + round_bits_shift); + + __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo); + res16_lo = _mm_min_epi16(res16_lo, clip_pixel); + _mm_storel_epi64(dst16, res16_lo); + } else { + res_lo = _mm_packus_epi32(res_lo, res_lo); + _mm_storel_epi64(p, res_lo); + } + if (p_width > 4) { + __m128i *const p4 = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + + res_hi = _mm_add_epi32(res_hi, res_add_const); + res_hi = + _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const), + reduce_bits_vert_shift); + if (conv_params->do_average) { + __m128i *const dst16_4 = + (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; + __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4)); + + if (conv_params->use_dist_wtd_comp_avg) { + res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0), + _mm_mullo_epi32(res_hi, wt1)); + res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS); + } else { + res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1); + } + + __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const); + res32_hi = _mm_sra_epi32( + _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift); + __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi); + res16_hi = _mm_min_epi16(res16_hi, clip_pixel); + _mm_storel_epi64(dst16_4, res16_hi); + } else { + res_hi = _mm_packus_epi32(res_hi, res_hi); + _mm_storel_epi64(p4, res_hi); + } + } + } else { + // Round and pack into 16 bits + const __m128i round_const = + _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), reduce_bits_vert); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), reduce_bits_vert); + + __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + // Clamp res_16bit to the range [0, 2^bd - 1] + const __m128i max_val = _mm_set1_epi16((1 << bd) - 1); + const __m128i zero = _mm_setzero_si128(); + res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very careful + // to only output 4 pixels at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. 
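+        // _mm_storel_epi64 writes only the low 64 bits of the register + // (exactly four uint16_t pixels), while _mm_storeu_si128 writes all eight.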
+ if (p_width == 4) { + _mm_storel_epi64(p, res_16bit); + } else { + _mm_storeu_si128(p, res_16bit); + } + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_wiener_convolve_avx2.c b/libs/libaom/src/av1/common/x86/highbd_wiener_convolve_avx2.c new file mode 100644 index 000000000..0c8a8505b --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_wiener_convolve_avx2.c @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +// 128-bit xmmwords are written as [ ... ] with the MSB on the left. +// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB +// on the left. +// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be +// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ]. +void av1_highbd_wiener_convolve_add_src_avx2( + const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, + const ConvolveParams *conv_params, int bd) { + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); + (void)x_step_q4; + (void)y_step_q4; + + const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); + + DECLARE_ALIGNED(32, uint16_t, + temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + int intermediate_height = h + SUBPEL_TAPS - 1; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero_128 = _mm_setzero_si128(); + const __m256i zero_256 = _mm256_setzero_si256(); + + // Add an offset to account for the "add_src" part of the convolve function. 
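+  // Placing 1 << FILTER_BITS in 16-bit lane 3 (the center tap of the 7-tap + // Wiener kernel) folds the source pixel, scaled to filter precision, into + // the same multiply-accumulate pass as the filter itself, so no separate + // addition of the source is needed.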
+ const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); + + const __m256i clamp_low = zero_256; + + /* Horizontal filter */ + { + const __m256i clamp_high_ep = + _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); + + // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] + const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset); + + // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] + const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] + const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); + + const __m256i round_const = _mm256_set1_epi32( + (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); + + for (int i = 0; i < intermediate_height; ++i) { + for (int j = 0; j < w; j += 16) { + const uint16_t *src_ij = src_ptr + i * src_stride + j; + + // Load 16-bit src data + const __m256i src_0 = yy_loadu_256(src_ij + 0); + const __m256i src_1 = yy_loadu_256(src_ij + 1); + const __m256i src_2 = yy_loadu_256(src_ij + 2); + const __m256i src_3 = yy_loadu_256(src_ij + 3); + const __m256i src_4 = yy_loadu_256(src_ij + 4); + const __m256i src_5 = yy_loadu_256(src_ij + 5); + const __m256i src_6 = yy_loadu_256(src_ij + 6); + const __m256i src_7 = yy_loadu_256(src_ij + 7); + + // Multiply src data by filter coeffs and sum pairs + const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); + const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); + const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); + const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); + const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); + const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); + const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); + const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); + + // Calculate scalar product for even- and odd-indices separately, + // increasing to 32-bit precision + const __m256i res_even_sum = _mm256_add_epi32( + _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6)); + const __m256i res_even = _mm256_srai_epi32( + _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0); + + const __m256i res_odd_sum = _mm256_add_epi32( + _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7)); + const __m256i res_odd = _mm256_srai_epi32( + _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0); + + // Reduce to 16-bit precision and pack even- and odd-index results + // back into one register. 
The _mm256_packs_epi32 intrinsic returns + // a register with the pixels ordered as follows: + // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] + const __m256i res = _mm256_packs_epi32(res_even, res_odd); + const __m256i res_clamped = + _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep); + + // Store in a temporary array + yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped); + } + } + } + + /* Vertical filter */ + { + const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1); + + // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] + const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset); + + // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] + const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] + const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); + + const __m256i round_const = + _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j; + + // Load 16-bit data from the output of the horizontal filter in + // which the pixels are ordered as follows: + // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] + const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE); + const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE); + const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE); + const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE); + const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE); + const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE); + const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE); + const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE); + + // Filter the even-indices, increasing to 32-bit precision + const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1); + const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3); + const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5); + const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7); + + const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); + const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); + const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); + const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); + + const __m256i res_even = _mm256_add_epi32( + _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6)); + + // Filter the odd-indices, increasing to 32-bit 
precision + const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1); + const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3); + const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5); + const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7); + + const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); + const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); + const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); + const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); + + const __m256i res_odd = _mm256_add_epi32( + _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7)); + + // Pixels are currently in the following order: + // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ] + // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ] + // + // Rearrange the pixels into the following order: + // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ] + // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ] + const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd); + const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd); + + const __m256i res_lo_round = _mm256_srai_epi32( + _mm256_add_epi32(res_lo, round_const), conv_params->round_1); + const __m256i res_hi_round = _mm256_srai_epi32( + _mm256_add_epi32(res_hi, round_const), conv_params->round_1); + + // Reduce to 16-bit precision and pack into the correct order: + // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ] + const __m256i res_16bit = + _mm256_packs_epi32(res_lo_round, res_hi_round); + const __m256i res_16bit_clamped = _mm256_min_epi16( + _mm256_max_epi16(res_16bit, clamp_low), clamp_high); + + // Store in the dst array + yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped); + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_wiener_convolve_ssse3.c b/libs/libaom/src/av1/common/x86/highbd_wiener_convolve_ssse3.c new file mode 100644 index 000000000..818b1099c --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_wiener_convolve_ssse3.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <tmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" + +void av1_highbd_wiener_convolve_add_src_ssse3( + const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, + const ConvolveParams *conv_params, int bd) { + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); + (void)x_step_q4; + (void)y_step_q4; + + const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); + + DECLARE_ALIGNED(16, uint16_t, + temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + int intermediate_height = h + SUBPEL_TAPS - 1; + int i, j; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero = _mm_setzero_si128(); + // Add an offset to account for the "add_src" part of the convolve function. + const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); + + /* Horizontal filter */ + { + const __m128i coeffs_x = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); + + for (i = 0; i < intermediate_height; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i data2 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(data, coeff_01); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), + conv_params->round_0); + + // Filter odd-index pixels + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), + conv_params->round_0); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + const __m128i maxval = + _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1); + __m128i res = _mm_packs_epi32(res_even, res_odd); + res = 
_mm_min_epi16(_mm_max_epi16(res, zero), maxval); + _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); + } + } + } + + /* Vertical filter */ + { + const __m128i coeffs_y = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), conv_params->round_1); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), conv_params->round_1); + + const __m128i maxval = _mm_set1_epi16((1 << bd) - 1); + __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval); + + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storeu_si128(p, res_16bit); + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/intra_edge_sse4.c b/libs/libaom/src/av1/common/x86/intra_edge_sse4.c new file mode 100644 index 000000000..fc69f41d7 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/intra_edge_sse4.c @@ -0,0 +1,318 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) { + if (!strength) return; + + DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = { + { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 }, // strength 1: 4,8,4 + { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 }, // strength 2: 5,6,5 + { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 } // strength 3: 2,4,4,4,2 + }; + + DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = { + { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, + { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + }; + + // Extend the first and last samples to simplify the loop for the 5-tap case + p[-1] = p[0]; + __m128i last = _mm_set1_epi8(p[sz - 1]); + _mm_storeu_si128((__m128i *)&p[sz], last); + + // Adjust input pointer for filter support area + uint8_t *in = (strength == 3) ? p - 1 : p; + + // Avoid modifying first sample + uint8_t *out = p + 1; + int len = sz - 1; + + const int use_3tap_filter = (strength < 3); + + if (use_3tap_filter) { + __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); + __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]); + __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]); + __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); + __m128i in0 = _mm_lddqu_si128((__m128i *)in); + while (len > 0) { + int n_out = (len < 8) ? 
len : 8; + __m128i d0 = _mm_shuffle_epi8(in0, shuf0); + __m128i d1 = _mm_shuffle_epi8(in0, shuf1); + d0 = _mm_maddubs_epi16(d0, coef0); + d1 = _mm_maddubs_epi16(d1, coef0); + d0 = _mm_hadd_epi16(d0, d1); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d0 = _mm_srai_epi16(d0, 4); + d0 = _mm_packus_epi16(d0, d0); + __m128i out0 = _mm_lddqu_si128((__m128i *)out); + __m128i n0 = _mm_set1_epi8(n_out); + __m128i mask = _mm_cmpgt_epi8(n0, iden); + out0 = _mm_blendv_epi8(out0, d0, mask); + _mm_storel_epi64((__m128i *)out, out0); + __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); + in0 = _mm_alignr_epi8(in1, in0, 8); + in += 8; + out += 8; + len -= n_out; + } + } else { // 5-tap filter + __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); + __m128i two = _mm_set1_epi8(2); + __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]); + __m128i shuf_b = _mm_add_epi8(shuf_a, two); + __m128i shuf_c = _mm_add_epi8(shuf_b, two); + __m128i shuf_d = _mm_add_epi8(shuf_c, two); + __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); + __m128i in0 = _mm_lddqu_si128((__m128i *)in); + while (len > 0) { + int n_out = (len < 8) ? len : 8; + __m128i d0 = _mm_shuffle_epi8(in0, shuf_a); + __m128i d1 = _mm_shuffle_epi8(in0, shuf_b); + __m128i d2 = _mm_shuffle_epi8(in0, shuf_c); + __m128i d3 = _mm_shuffle_epi8(in0, shuf_d); + d0 = _mm_maddubs_epi16(d0, coef0); + d1 = _mm_maddubs_epi16(d1, coef0); + d2 = _mm_maddubs_epi16(d2, coef0); + d3 = _mm_maddubs_epi16(d3, coef0); + d0 = _mm_hadd_epi16(d0, d1); + d2 = _mm_hadd_epi16(d2, d3); + d0 = _mm_hadd_epi16(d0, d2); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d0 = _mm_srai_epi16(d0, 4); + d0 = _mm_packus_epi16(d0, d0); + __m128i out0 = _mm_lddqu_si128((__m128i *)out); + __m128i n0 = _mm_set1_epi8(n_out); + __m128i mask = _mm_cmpgt_epi8(n0, iden); + out0 = _mm_blendv_epi8(out0, d0, mask); + _mm_storel_epi64((__m128i *)out, out0); + __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); + in0 = _mm_alignr_epi8(in1, in0, 8); + in += 8; + out += 8; + len -= n_out; + } + } +} + +void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) { + if (!strength) return; + + DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = { + { 4, 8, 4, 8, 4, 8, 4, 8 }, // strength 1: 4,8,4 + { 5, 6, 5, 6, 5, 6, 5, 6 }, // strength 2: 5,6,5 + { 2, 4, 2, 4, 2, 4, 2, 4 } // strength 3: 2,4,4,4,2 + }; + + DECLARE_ALIGNED(16, static const int16_t, + v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } }; + + // Extend the first and last samples to simplify the loop for the 5-tap case + p[-1] = p[0]; + __m128i last = _mm_set1_epi16(p[sz - 1]); + _mm_storeu_si128((__m128i *)&p[sz], last); + + // Adjust input pointer for filter support area + uint16_t *in = (strength == 3) ? p - 1 : p; + + // Avoid modifying first sample + uint16_t *out = p + 1; + int len = sz - 1; + + const int use_3tap_filter = (strength < 3); + + if (use_3tap_filter) { + __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); + __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]); + __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); + __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); + while (len > 0) { + int n_out = (len < 8) ? 
len : 8; + __m128i in1 = _mm_alignr_epi8(in8, in0, 2); + __m128i in2 = _mm_alignr_epi8(in8, in0, 4); + __m128i in02 = _mm_add_epi16(in0, in2); + __m128i d0 = _mm_unpacklo_epi16(in02, in1); + __m128i d1 = _mm_unpackhi_epi16(in02, in1); + d0 = _mm_mullo_epi16(d0, coef0); + d1 = _mm_mullo_epi16(d1, coef0); + d0 = _mm_hadd_epi16(d0, d1); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d0 = _mm_srli_epi16(d0, 4); + __m128i out0 = _mm_lddqu_si128((__m128i *)out); + __m128i n0 = _mm_set1_epi16(n_out); + __m128i mask = _mm_cmpgt_epi16(n0, iden); + out0 = _mm_blendv_epi8(out0, d0, mask); + _mm_storeu_si128((__m128i *)out, out0); + in += 8; + in0 = in8; + in8 = _mm_lddqu_si128((__m128i *)&in[8]); + out += 8; + len -= n_out; + } + } else { // 5-tap filter + __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); + __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]); + __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); + __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); + while (len > 0) { + int n_out = (len < 8) ? len : 8; + __m128i in1 = _mm_alignr_epi8(in8, in0, 2); + __m128i in2 = _mm_alignr_epi8(in8, in0, 4); + __m128i in3 = _mm_alignr_epi8(in8, in0, 6); + __m128i in4 = _mm_alignr_epi8(in8, in0, 8); + __m128i in04 = _mm_add_epi16(in0, in4); + __m128i in123 = _mm_add_epi16(in1, in2); + in123 = _mm_add_epi16(in123, in3); + __m128i d0 = _mm_unpacklo_epi16(in04, in123); + __m128i d1 = _mm_unpackhi_epi16(in04, in123); + d0 = _mm_mullo_epi16(d0, coef0); + d1 = _mm_mullo_epi16(d1, coef0); + d0 = _mm_hadd_epi16(d0, d1); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d0 = _mm_srli_epi16(d0, 4); + __m128i out0 = _mm_lddqu_si128((__m128i *)out); + __m128i n0 = _mm_set1_epi16(n_out); + __m128i mask = _mm_cmpgt_epi16(n0, iden); + out0 = _mm_blendv_epi8(out0, d0, mask); + _mm_storeu_si128((__m128i *)out, out0); + in += 8; + in0 = in8; + in8 = _mm_lddqu_si128((__m128i *)&in[8]); + out += 8; + len -= n_out; + } + } +} + +void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) { + // interpolate half-sample positions + assert(sz <= 24); + + DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = { + { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 } + }; + + DECLARE_ALIGNED( + 16, static const int8_t, + v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, + { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } }; + + // Extend first/last samples (upper-left p[-1], last p[sz-1]) + // to support 4-tap filter + p[-2] = p[-1]; + p[sz] = p[sz - 1]; + + uint8_t *in = &p[-2]; + uint8_t *out = &p[-2]; + + int n = sz + 1; // Input length including upper-left sample + + __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); + __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); + + __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); + __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]); + __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]); + + while (n > 0) { + __m128i in8 = _mm_alignr_epi8(in16, in0, 8); + __m128i d0 = _mm_shuffle_epi8(in0, shuf0); + __m128i d1 = _mm_shuffle_epi8(in0, shuf1); + __m128i d2 = _mm_shuffle_epi8(in8, shuf0); + __m128i d3 = _mm_shuffle_epi8(in8, shuf1); + d0 = _mm_maddubs_epi16(d0, coef0); + d1 = _mm_maddubs_epi16(d1, coef0); + d2 = _mm_maddubs_epi16(d2, coef0); + d3 = _mm_maddubs_epi16(d3, coef0); + d0 = _mm_hadd_epi16(d0, d1); + d2 = _mm_hadd_epi16(d2, d3); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d2 = _mm_add_epi16(d2, eight); + d0 = _mm_srai_epi16(d0, 4); + d2 = 
_mm_srai_epi16(d2, 4); + d0 = _mm_packus_epi16(d0, d2); + __m128i in1 = _mm_alignr_epi8(in16, in0, 1); + __m128i out0 = _mm_unpacklo_epi8(in1, d0); + __m128i out1 = _mm_unpackhi_epi8(in1, d0); + _mm_storeu_si128((__m128i *)&out[0], out0); + _mm_storeu_si128((__m128i *)&out[16], out1); + in0 = in16; + in16 = _mm_setzero_si128(); + out += 32; + n -= 16; + } +} + +void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd) { + // interpolate half-sample positions + assert(sz <= 24); + + DECLARE_ALIGNED(16, static const int16_t, + kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } }; + + // Extend first/last samples (upper-left p[-1], last p[sz-1]) + // to support 4-tap filter + p[-2] = p[-1]; + p[sz] = p[sz - 1]; + + uint16_t *in = &p[-2]; + uint16_t *out = in; + int n = sz + 1; + + __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); + __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); + __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); + __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]); + + while (n > 0) { + __m128i in1 = _mm_alignr_epi8(in8, in0, 2); + __m128i in2 = _mm_alignr_epi8(in8, in0, 4); + __m128i in3 = _mm_alignr_epi8(in8, in0, 6); + __m128i sum0 = _mm_add_epi16(in0, in3); + __m128i sum1 = _mm_add_epi16(in1, in2); + __m128i d0 = _mm_unpacklo_epi16(sum0, sum1); + __m128i d1 = _mm_unpackhi_epi16(sum0, sum1); + __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); + d0 = _mm_madd_epi16(d0, coef0); + d1 = _mm_madd_epi16(d1, coef0); + __m128i eight = _mm_set1_epi32(8); + d0 = _mm_add_epi32(d0, eight); + d1 = _mm_add_epi32(d1, eight); + d0 = _mm_srai_epi32(d0, 4); + d1 = _mm_srai_epi32(d1, 4); + d0 = _mm_packus_epi32(d0, d1); + __m128i max0 = _mm_set1_epi16((1 << bd) - 1); + d0 = _mm_min_epi16(d0, max0); + __m128i out0 = _mm_unpacklo_epi16(in1, d0); + __m128i out1 = _mm_unpackhi_epi16(in1, d0); + _mm_storeu_si128((__m128i *)&out[0], out0); + _mm_storeu_si128((__m128i *)&out[8], out1); + in0 = in8; + in8 = in16; + in16 = in24; + in24 = _mm_setzero_si128(); + out += 16; + n -= 8; + } +} diff --git a/libs/libaom/src/av1/common/x86/jnt_convolve_avx2.c b/libs/libaom/src/av1/common/x86/jnt_convolve_avx2.c new file mode 100644 index 000000000..6de61573e --- /dev/null +++ b/libs/libaom/src/av1/common/x86/jnt_convolve_avx2.c @@ -0,0 +1,917 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "aom_dsp/x86/convolve_sse4_1.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" + +static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) { + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi16((int16_t)w0); + const __m256i wt1 = _mm256_set1_epi16((int16_t)w1); + const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + return wt; +} + +static INLINE __m256i load_line2_avx2(const void *a, const void *b) { + return _mm256_permute2x128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)), + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); +} + +void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + int i, j, is_horiz_4tap = 0; + const int bits = FILTER_BITS - conv_params->round_1; + const __m256i wt = unpack_weights_avx2(conv_params); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + + assert(bits >= 0); + assert(conv_params->round_0 > 0); + + const __m256i round_const = + _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); + + (void)filter_params_y; + (void)subpel_y_qn; + + __m256i filt[4], coeffs[4]; + + filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_horiz_4tap = 1; + + // horz_filt as 4 tap + if (is_horiz_4tap) { + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_horiz; + for (i = 0; i < h; i += 2) { + const uint8_t *src_data = src_ptr + i * src_stride; + CONV_BUF_TYPE *dst_data = dst + i * dst_stride; + for (j = 0; j < w; j += 8) { + const __m256i data = + load_line2_avx2(&src_data[j], &src_data[j + src_stride]); + + __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + res = _mm256_slli_epi16(res, bits); + + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, 
rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } else { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + for (i = 0; i < h; i += 2) { + const uint8_t *src_data = src_ptr + i * src_stride; + CONV_BUF_TYPE *dst_data = dst + i * dst_stride; + for (j = 0; j < w; j += 8) { + const __m256i data = + load_line2_avx2(&src_data[j], &src_data[j + src_stride]); + + __m256i res = convolve_lowbd_x(data, coeffs, filt); + + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + + res = _mm256_slli_epi16(res, bits); + + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } +} + +void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + int i, j, is_vert_4tap = 0; + // +1 to compensate for dividing the filter coeffs by 2 + const int left_shift = FILTER_BITS - conv_params->round_0 + 1; + const __m256i round_const = + _mm256_set1_epi32((1 << conv_params->round_1) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + const 
__m256i wt = unpack_weights_avx2(conv_params); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int offset_1 = (1 << (bd + FILTER_BITS - 2)); + const __m256i offset_const_1 = _mm256_set1_epi16(offset_1); + const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0)); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + const __m256i zero = _mm256_setzero_si256(); + __m256i coeffs[4], s[8]; + + assert((FILTER_BITS - conv_params->round_0) >= 0); + + prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); + + (void)conv_params; + (void)filter_params_x; + (void)subpel_x_qn; + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_vert_4tap = 1; + + if (is_vert_4tap) { + const int fo_vert = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src4; + // Load lines a and b. Line a to lower 128, line b to upper 128 + { + __m256i src_ab[4]; + __m256i src_a[5]; + src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + for (int kk = 0; kk < 4; ++kk) { + data += src_stride; + src_a[kk + 1] = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + src_ab[kk] = + _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + } + src4 = src_a[4]; + s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); + s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); + + s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); + s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); + } + + for (i = 0; i < h; i += 2) { + data = &src_ptr[(i + 5) * src_stride + j]; + const __m256i src5 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20); + + src4 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + src_stride))); + const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20); + + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); + + __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); + + res_lo = _mm256_add_epi16(res_lo, offset_const_1); + + const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); + const __m256i res_lo_0_shift = + _mm256_slli_epi32(res_lo_0_32b, left_shift); + const __m256i res_lo_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); + + const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); + const __m256i res_lo_1_shift = + _mm256_slli_epi32(res_lo_1_32b, left_shift); + const __m256i res_lo_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); + + const __m256i res_lo_round = + _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); + + const __m256i res_lo_unsigned = + _mm256_add_epi16(res_lo_round, offset_const_2); + + if (w - j < 16) { + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, + &wt, use_dist_wtd_comp_avg); + + const 
__m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w - j > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), + res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); + + res_hi = _mm256_add_epi16(res_hi, offset_const_1); + + const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); + const __m256i res_hi_0_shift = + _mm256_slli_epi32(res_hi_0_32b, left_shift); + const __m256i res_hi_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + + const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); + const __m256i res_hi_1_shift = + _mm256_slli_epi32(res_hi_1_32b, left_shift); + const __m256i res_hi_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + + const __m256i res_hi_round = + _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); + + const __m256i res_hi_unsigned = + _mm256_add_epi16(res_hi_round, offset_const_2); + + if (do_average) { + const __m256i data_ref_0_lo = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + + const __m256i data_ref_0_hi = + load_line2_avx2(&dst[i * dst_stride + j + 8], + &dst[i * dst_stride + j + 8 + dst_stride]); + + const __m256i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + + const __m256i round_result_hi = + convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result_lo, round_result_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + + } else { + const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + + const __m128i res_lo_1 = + _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_lo_1); + + const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), + res_hi_0); + + const __m128i res_hi_1 = + _mm256_extracti128_si256(res_hi_unsigned, 1); + _mm_store_si128( + (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), + res_hi_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + + s[3] = s[4]; + s[4] = 
s[5]; + } + } + } else { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src6; + // Load lines a and b. Line a to lower 128, line b to upper 128 + { + __m256i src_ab[7]; + __m256i src_a[7]; + src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + for (int kk = 0; kk < 6; ++kk) { + data += src_stride; + src_a[kk + 1] = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + src_ab[kk] = + _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + } + src6 = src_a[6]; + s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); + s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); + s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); + s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); + s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); + s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); + } + + for (i = 0; i < h; i += 2) { + data = &src_ptr[(i + 7) * src_stride + j]; + const __m256i src7 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); + + s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); + s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + + __m256i res_lo = convolve_lowbd(s, coeffs); + + res_lo = _mm256_add_epi16(res_lo, offset_const_1); + + const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); + const __m256i res_lo_0_shift = + _mm256_slli_epi32(res_lo_0_32b, left_shift); + const __m256i res_lo_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); + + const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); + const __m256i res_lo_1_shift = + _mm256_slli_epi32(res_lo_1_32b, left_shift); + const __m256i res_lo_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); + + const __m256i res_lo_round = + _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); + + const __m256i res_lo_unsigned = + _mm256_add_epi16(res_lo_round, offset_const_2); + + if (w - j < 16) { + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, + &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w - j > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), + res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + __m256i res_hi = convolve_lowbd(s + 4, coeffs); + + res_hi = 
_mm256_add_epi16(res_hi, offset_const_1); + + const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); + const __m256i res_hi_0_shift = + _mm256_slli_epi32(res_hi_0_32b, left_shift); + const __m256i res_hi_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + + const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); + const __m256i res_hi_1_shift = + _mm256_slli_epi32(res_hi_1_32b, left_shift); + const __m256i res_hi_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + + const __m256i res_hi_round = + _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); + + const __m256i res_hi_unsigned = + _mm256_add_epi16(res_hi_round, offset_const_2); + + if (do_average) { + const __m256i data_ref_0_lo = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + + const __m256i data_ref_0_hi = + load_line2_avx2(&dst[i * dst_stride + j + 8], + &dst[i * dst_stride + j + 8 + dst_stride]); + + const __m256i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + + const __m256i round_result_hi = + convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result_lo, round_result_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + + } else { + const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + + const __m128i res_lo_1 = + _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_lo_1); + + const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), + res_hi_0); + + const __m128i res_hi_1 = + _mm256_extracti128_si256(res_hi_unsigned, 1); + _mm_store_si128( + (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), + res_hi_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + + int im_stride = 8; + int i, is_horiz_4tap = 0, is_vert_4tap = 0; + const __m256i wt = unpack_weights_avx2(conv_params); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS 
- conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + + assert(conv_params->round_0 > 0); + + const __m256i round_const_h = _mm256_set1_epi16( + ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); + const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); + + const __m256i round_const_v = _mm256_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + __m256i filt[4], coeffs_x[4], coeffs_y[4]; + + filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0))) + is_horiz_4tap = 1; + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0))) + is_vert_4tap = 1; + + if (is_horiz_4tap) { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ + const uint8_t *src_h = src_ptr + j; + for (i = 0; i < im_h; i += 2) { + __m256i data = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); + if (i + 1 < im_h) + data = _mm256_inserti128_si256( + data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); + src_h += (src_stride << 1); + __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt); + + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), + round_shift_h); + + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; + } + } else if (is_vert_4tap) { + int im_h = h + 3; + const int fo_vert = 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ + const uint8_t *src_h = src_ptr + j; + DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; + + /* Vertical filter */ + __m256i s[6]; + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + + s[0] = _mm256_unpacklo_epi16(s0, s1); + s[1] = _mm256_unpacklo_epi16(s2, s3); + + s[3] = _mm256_unpackhi_epi16(s0, s1); + s[4] = _mm256_unpackhi_epi16(s2, s3); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s4 = + _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); + const __m256i s5 = + _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); + + s[2] = _mm256_unpacklo_epi16(s4, s5); + s[5] = _mm256_unpackhi_epi16(s4, s5); + + const __m256i res_a = convolve_4tap(s, coeffs_y + 1); + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_v), round_shift_v); + + if (w - j > 
4) { + const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_v), round_shift_v); + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); + + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, + &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); + + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, + &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[3] = s[4]; + s[4] = s[5]; + } + } + } else { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ + const uint8_t *src_h = src_ptr + j; + DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; + + DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; + } + } +} + +void av1_dist_wtd_convolve_2d_copy_avx2( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { + const int bd = 8; + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + 
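+  // Worked example of the shift/offset constants set up below, assuming a
+  // typical 8-bit compound configuration (FILTER_BITS == 7,
+  // conv_params->round_0 == 3, conv_params->round_1 == 7):
+  //   bits           = 14 - 7 - 3 = 4   (left shift applied to each pixel)
+  //   offset_0       = 8 + 14 - 3 - 7 = 12
+  //   offset         = (1 << 12) + (1 << 11) = 6144
+  //   rounding_shift = 14 - 3 - 7 = 4,  rounding_const = 8
+  // i.e. the copy kernel stores (pixel << 4) + 6144 into the 16-bit
+  // CONV_BUF_TYPE buffer, matching the intermediate range of the filtered
+  // convolve paths so either can be averaged against it uniformly.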
(void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const __m256i wt = unpack_weights_avx2(conv_params); + const __m256i zero = _mm256_setzero_si256(); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + int i, j; + + if (!(w % 16)) { + for (i = 0; i < h; i += 1) { + for (j = 0; j < w; j += 16) { + const __m256i src_16bit = _mm256_cvtepu8_epi16( + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]))); + + const __m256i res = _mm256_sll_epi16(src_16bit, left_shift); + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + + if (do_average) { + const __m256i data_ref_0 = + _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j])); + + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m256i res_0 = _mm256_permute4x64_epi64(res_8, 0xD8); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), + _mm256_castsi256_si128(res_0)); + } else { + _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]), + res_unsigned); + } + } + } + } else if (!(w % 4)) { + for (i = 0; i < h; i += 2) { + for (j = 0; j < w; j += 8) { + const __m128i src_row_0 = + _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j])); + const __m128i src_row_1 = + _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride])); + // since not all compilers yet support _mm256_set_m128i() + const __m256i src_10 = _mm256_insertf128_si256( + _mm256_castsi128_si256(src_row_0), src_row_1, 1); + + const __m256i src_16bit = _mm256_unpacklo_epi8(src_10, zero); + + const __m256i res = _mm256_sll_epi16(src_16bit, left_shift); + + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = load_line2_avx2( + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), 
res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/jnt_convolve_sse2.c b/libs/libaom/src/av1/common/x86/jnt_convolve_sse2.c new file mode 100644 index 000000000..f8f640a11 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/jnt_convolve_sse2.c @@ -0,0 +1,615 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" + +void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int bd = 8; + CONV_BUF_TYPE *dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_1; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + __m128i coeffs[4]; + + (void)filter_params_y; + (void)subpel_y_qn; + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs); + + if (w == 4) { + do { + const __m128i data = _mm_loadu_si128((__m128i *)src_ptr); + __m128i s[4]; + + s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1)); + s[1] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3)); + s[2] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5)); + s[3] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7)); + const __m128i res_lo = convolve_lo_x(s, coeffs); + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_lo_shift); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i 
data_ref_0 = _mm_loadu_si128((__m128i *)dst); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[0]), res_unsigned); + } + src_ptr += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + } while (--h); + } else { + assert(!(w % 8)); + int i = 0; + do { + int j = 0; + do { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + __m128i s[4]; + + // Filter even-index pixels + s[0] = data; + s[1] = _mm_srli_si128(data, 2); + s[2] = _mm_srli_si128(data, 4); + s[3] = _mm_srli_si128(data, 6); + const __m128i res_even = convolve_lo_x(s, coeffs); + + // Filter odd-index pixels + s[0] = _mm_srli_si128(data, 1); + s[1] = _mm_srli_si128(data, 3); + s[2] = _mm_srli_si128(data, 5); + s[3] = _mm_srli_si128(data, 7); + const __m128i res_odd = convolve_lo_x(s, coeffs); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift); + const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + j += 8; + } while (j < w); + } while (++i < h); + } +} + +void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int bd = 8; + CONV_BUF_TYPE *dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_vert * src_stride; + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const __m128i wt0 = _mm_set1_epi16(conv_params->fwd_offset); + const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i 
offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + __m128i coeffs[4]; + + (void)filter_params_x; + (void)subpel_x_qn; + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs); + + if (w == 4) { + __m128i s[8], src6, res, res_shift; + src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6); + + do { + s[6] = _mm_unpacklo_epi8( + src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride))); + src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6); + + res = convolve_lo_y(s + 0, coeffs); + res_shift = _mm_sll_epi32(res, left_shift); + res_shift = + _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift); + + __m128i res_16b = _mm_packs_epi32(res_shift, res_shift); + __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); + + } else { + _mm_store_si128((__m128i *)dst, res_unsigned); + } + + src_ptr += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + + res = convolve_lo_y(s + 1, coeffs); + res_shift = _mm_sll_epi32(res, left_shift); + res_shift = + _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift); + + res_16b = _mm_packs_epi32(res_shift, res_shift); + res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); + + } else { + _mm_store_si128((__m128i *)dst, res_unsigned); + } + + src_ptr += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + + 
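+      // Slide the 8-tap window down two rows: discard the two oldest
+      // unpacked row pairs and shift the remaining six so the next
+      // iteration only has to load two fresh rows.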
s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + h -= 2; + } while (h); + } else { + assert(!(w % 8)); + int j = 0; + do { + __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift; + const uint8_t *data = &src_ptr[j]; + + src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6); + + int i = 0; + do { + data = &src_ptr[i * src_stride + j]; + s[6] = _mm_unpacklo_epi8( + src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); + src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6); + + res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels + res_lo_shift = _mm_sll_epi32(res_lo, left_shift); + res_hi_shift = _mm_sll_epi32(res_hi, left_shift); + res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const), + round_shift); + res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const), + round_shift); + + __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); + __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + i++; + + res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels + res_lo_shift = _mm_sll_epi32(res_lo, left_shift); + res_hi_shift = _mm_sll_epi32(res_hi, left_shift); + res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const), + round_shift); + res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const), + round_shift); + res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); + res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + 
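+          // _mm_packus_epi16 saturates the averaged 16-bit results to
+          // uint8; only the low 64 bits hold the 8 distinct pixels, so the
+          // 8-byte store below suffices.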
_mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + i++; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + } while (i < h); + j += 8; + } while (j < w); + } +} + +void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + __m128i temp_lo, temp_hi; + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + const __m128i src_lo = _mm_unpacklo_epi8(data, zero); + const __m128i src_hi = _mm_unpackhi_epi8(data, zero); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01); + temp_lo = _mm_srli_si128(src_lo, 4); + temp_hi = _mm_slli_si128(src_hi, 12); + const __m128i src_2 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + temp_lo = _mm_srli_si128(src_lo, 8); + temp_hi = _mm_slli_si128(src_hi, 8); + const __m128i src_4 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + temp_lo 
= _mm_srli_si128(src_lo, 12); + temp_hi = _mm_slli_si128(src_hi, 4); + const __m128i src_6 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + temp_lo = _mm_srli_si128(src_lo, 2); + temp_hi = _mm_slli_si128(src_hi, 14); + const __m128i src_1 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + temp_lo = _mm_srli_si128(src_lo, 6); + temp_hi = _mm_slli_si128(src_hi, 10); + const __m128i src_3 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + temp_lo = _mm_srli_si128(src_lo, 10); + temp_hi = _mm_slli_si128(src_hi, 6); + const __m128i src_5 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + temp_lo = _mm_srli_si128(src_lo, 14); + temp_hi = _mm_slli_si128(src_hi, 2); + const __m128i src_7 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * 
im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + + if (w > 4) + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + else + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/jnt_convolve_ssse3.c b/libs/libaom/src/av1/common/x86/jnt_convolve_ssse3.c new file mode 100644 index 000000000..f45e3b267 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/jnt_convolve_ssse3.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" + +void av1_dist_wtd_convolve_2d_ssse3( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + const __m128i src_lo = _mm_unpacklo_epi8(data, zero); + const __m128i src_hi = _mm_unpackhi_epi8(data, zero); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01); + const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i src_1 = 
_mm_alignr_epi8(src_hi, src_lo, 2); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + 
_mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + + if (w > 4) + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + else + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/reconinter_avx2.c b/libs/libaom/src/av1/common/x86/reconinter_avx2.c new file mode 100644 index 000000000..a38bd8317 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/reconinter_avx2.c @@ -0,0 +1,620 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "av1/common/blockd.h" + +static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0, + const __m256i s1) { + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)); + return _mm256_abs_epi16( + _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4))); + // clamp(diff, 0, 64) can be skipped because diff is always in the range (38, 54) +} +void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + int h, int w) { + const int mb = (mask_type == DIFFWTD_38_INV) ? 
AOM_BLEND_A64_MAX_ALPHA : 0; + const __m256i y_mask_base = _mm256_set1_epi16(38 - mb); + int i = 0; + if (4 == w) { + do { + const __m128i s0A = xx_loadl_32(src0); + const __m128i s0B = xx_loadl_32(src0 + src0_stride); + const __m128i s0C = xx_loadl_32(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_32(src0 + src0_stride * 3); + const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); + const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D); + const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD); + const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD); + + const __m128i s1A = xx_loadl_32(src1); + const __m128i s1B = xx_loadl_32(src1 + src1_stride); + const __m128i s1C = xx_loadl_32(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_32(src1 + src1_stride * 3); + const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); + const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D); + const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD); + const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD); + const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + const __m128i x_m8 = + _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)); + xx_storeu_128(mask, x_m8); + src0 += (src0_stride << 2); + src1 += (src1_stride << 2); + mask += 16; + i += 4; + } while (i < h); + } else if (8 == w) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); + const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C)); + const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D)); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); + const __m256i s1AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C)); + const __m256i s1BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D)); + const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AC_w); + const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1BD_w); + const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD); + yy_storeu_256(mask, m8); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (16 == w) { + do { + const __m128i s0A = xx_load_128(src0); + const __m128i s0B = xx_load_128(src0 + src0_stride); + const __m128i s1A = xx_load_128(src1); + const __m128i s1B = xx_load_128(src1 + src1_stride); + const __m256i s0AL = _mm256_cvtepu8_epi16(s0A); + const __m256i s0BL = _mm256_cvtepu8_epi16(s0B); + const __m256i s1AL = _mm256_cvtepu8_epi16(s1A); + const __m256i s1BL = _mm256_cvtepu8_epi16(s1B); + + const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL); + const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL); + + const __m256i m8 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8); + yy_storeu_256(mask, m8); + src0 += src0_stride << 1; + src1 += src1_stride << 1; + mask += 32; + i += 2; + } while (i < h); + } else { + do { + int j = 0; + do { + const __m256i s0 = yy_loadu_256(src0 + j); + const __m256i s1 = yy_loadu_256(src1 + j); + const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0)); + const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); + const __m256i s0H = + 
_mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1)); + const __m256i s1H = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); + const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L); + const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H); + const __m256i m8 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8); + yy_storeu_256(mask + j, m8); + j += 32; + } while (j < w); + src0 += src0_stride; + src1 += src1_stride; + mask += w; + i += 1; + } while (i < h); + } +} + +static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0, + const __m256i *data_src1, + const __m256i *round_const, + const __m256i *mask_base_16, + const __m256i *clip_diff, int round) { + const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); + const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); + const __m256i diff = _mm256_max_epu16(diffa, diffb); + const __m256i diff_round = + _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); + const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); + const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); + return diff_clamp; +} + +static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0, + const __m256i *data_src1, + const __m256i *round_const, + const __m256i *mask_base_16, + const __m256i *clip_diff, + int round) { + const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); + const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); + const __m256i diff = _mm256_max_epu16(diffa, diffb); + const __m256i diff_round = + _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); + const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); + const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); + const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp); + return diff_const_16; +} + +static INLINE void build_compound_diffwtd_mask_d16_avx2( + uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { + const int mask_base = 38; + const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); + const __m256i y38 = _mm256_set1_epi16(mask_base); + const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + int i = 0; + if (w == 4) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); + const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), + _mm_unpacklo_epi64(s0A, s0B)); + const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), + _mm_unpacklo_epi64(s1A, s1B)); + const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + xx_storeu_128(mask, + _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 16; + i += 4; + } while (i < h); + } else if (w == 8) { + do { + const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); + const __m256i s0CD 
= + yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); + const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1); + const __m256i s1CD = + yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); + const __m256i m16AB = + calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); + const __m256i m16CD = + calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (w == 16) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + src0_stride); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + src1_stride); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 1; + src1 += src1_stride << 1; + mask += 32; + i += 2; + } while (i < h); + } else if (w == 32) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 32; + i += 1; + } while (i < h); + } else if (w == 64) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 64; + i += 1; + } while (i < h); + } else { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s0E = yy_loadu_256(src0 + 64); + const __m256i s0F = yy_loadu_256(src0 + 80); + const __m256i s0G = yy_loadu_256(src0 + 96); + const __m256i s0H = yy_loadu_256(src0 + 112); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i s1E = yy_loadu_256(src1 + 64); + const __m256i s1F = yy_loadu_256(src1 + 80); + const __m256i s1G = yy_loadu_256(src1 + 96); + const __m256i 
s1H = yy_loadu_256(src1 + 112); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m16E = + calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); + const __m256i m16F = + calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); + const __m256i m16G = + calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift); + const __m256i m16H = + calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); + const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); + yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 128; + i += 1; + } while (i < h); + } +} + +static INLINE void build_compound_diffwtd_mask_d16_inv_avx2( + uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { + const int mask_base = 38; + const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); + const __m256i y38 = _mm256_set1_epi16(mask_base); + const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + int i = 0; + if (w == 4) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); + const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), + _mm_unpacklo_epi64(s0A, s0B)); + const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), + _mm_unpacklo_epi64(s1A, s1B)); + const __m256i m16 = + calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + xx_storeu_128(mask, + _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 16; + i += 4; + } while (i < h); + } else if (w == 8) { + do { + const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); + const __m256i s0CD = + yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); + const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1); + const __m256i s1CD = + yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); + const __m256i m16AB = + calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); + const __m256i m16CD = + calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (w == 16) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + src0_stride); + const __m256i s1A = yy_loadu_256(src1); + 
const __m256i s1B = yy_loadu_256(src1 + src1_stride); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 1; + src1 += src1_stride << 1; + mask += 32; + i += 2; + } while (i < h); + } else if (w == 32) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 32; + i += 1; + } while (i < h); + } else if (w == 64) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 64; + i += 1; + } while (i < h); + } else { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s0E = yy_loadu_256(src0 + 64); + const __m256i s0F = yy_loadu_256(src0 + 80); + const __m256i s0G = yy_loadu_256(src0 + 96); + const __m256i s0H = yy_loadu_256(src0 + 112); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i s1E = yy_loadu_256(src1 + 64); + const __m256i s1F = yy_loadu_256(src1 + 80); + const __m256i s1G = yy_loadu_256(src1 + 96); + const __m256i s1H = yy_loadu_256(src1 + 112); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m16E = + calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); + const __m256i m16F = + calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); + const __m256i m16G = + calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift); + const __m256i m16H = + calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); + const __m256i m8AB = 
_mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); + const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); + yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 128; + i += 1; + } while (i < h); + } +} + +void av1_build_compound_diffwtd_mask_d16_avx2( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + const int shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + // When rounding constant is added, there is a possibility of overflow. + // However that much precision is not required. Code should very well work for + // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But + // there is a possibility of corner case bugs. + assert(DIFF_FACTOR_LOG2 == 4); + assert(AOM_BLEND_A64_MAX_ALPHA == 64); + + if (mask_type == DIFFWTD_38) { + build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1, + src1_stride, h, w, shift); + } else { + build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1, + src1_stride, h, w, shift); + } +} + +void av1_build_compound_diffwtd_mask_highbd_avx2( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { + if (w < 16) { + av1_build_compound_diffwtd_mask_highbd_ssse3( + mask, mask_type, src0, src0_stride, src1, src1_stride, h, w, bd); + } else { + assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV); + assert(bd >= 8); + assert((w % 16) == 0); + const __m256i y0 = _mm256_setzero_si256(); + const __m256i yAOM_BLEND_A64_MAX_ALPHA = + _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const int mask_base = 38; + const __m256i ymask_base = _mm256_set1_epi16(mask_base); + const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0); + const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1); + if (bd == 8) { + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_srai_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_srai_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, 
_MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } else { + const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2); + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_sra_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_sra_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/reconinter_sse4.c b/libs/libaom/src/av1/common/x86/reconinter_sse4.c new file mode 100644 index 000000000..5171ca493 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/reconinter_sse4.c @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> // SSE2 +#include <smmintrin.h> /* SSE4.1 */ + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "av1/common/blockd.h" + +static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0, + const __m128i s1) { + const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(s0, s1)); + return _mm_abs_epi16(_mm_add_epi16(mask_base, _mm_srli_epi16(diff, 4))); + // clamp(diff, 0, 64) can be skipped since diff is always in the range (38, 54) +} + +void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int stride0, + const uint8_t *src1, int stride1, + int h, int w) { + const int mb = (mask_type == DIFFWTD_38_INV) ?
AOM_BLEND_A64_MAX_ALPHA : 0; + const __m128i mask_base = _mm_set1_epi16(38 - mb); + int i = 0; + if (4 == w) { + do { + const __m128i s0A = _mm_cvtsi32_si128(*(uint32_t *)src0); + const __m128i s0B = _mm_cvtsi32_si128(*(uint32_t *)(src0 + stride0)); + const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); + const __m128i s0 = _mm_cvtepu8_epi16(s0AB); + + const __m128i s1A = _mm_cvtsi32_si128(*(uint32_t *)src1); + const __m128i s1B = _mm_cvtsi32_si128(*(uint32_t *)(src1 + stride1)); + const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); + const __m128i s1 = _mm_cvtepu8_epi16(s1AB); + + const __m128i m16 = calc_mask(mask_base, s0, s1); + const __m128i m8 = _mm_packus_epi16(m16, m16); + + *(uint32_t *)mask = _mm_cvtsi128_si32(m8); + *(uint32_t *)(mask + w) = _mm_extract_epi32(m8, 1); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += 8; + i += 2; + } while (i < h); + } else if (8 == w) { + do { + __m128i s0 = _mm_loadl_epi64((__m128i const *)src0); + __m128i s1 = _mm_loadl_epi64((__m128i const *)src1); + s0 = _mm_cvtepu8_epi16(s0); + s1 = _mm_cvtepu8_epi16(s1); + const __m128i m16 = calc_mask(mask_base, s0, s1); + const __m128i m8 = _mm_packus_epi16(m16, m16); + _mm_storel_epi64((__m128i *)mask, m8); + src0 += stride0; + src1 += stride1; + mask += 8; + i += 1; + } while (i < h); + } else { + const __m128i zero = _mm_setzero_si128(); + do { + int j = 0; + do { + const __m128i s0 = _mm_load_si128((__m128i const *)(src0 + j)); + const __m128i s1 = _mm_load_si128((__m128i const *)(src1 + j)); + const __m128i s0L = _mm_cvtepu8_epi16(s0); + const __m128i s1L = _mm_cvtepu8_epi16(s1); + const __m128i s0H = _mm_unpackhi_epi8(s0, zero); + const __m128i s1H = _mm_unpackhi_epi8(s1, zero); + + const __m128i m16L = calc_mask(mask_base, s0L, s1L); + const __m128i m16H = calc_mask(mask_base, s0H, s1H); + + const __m128i m8 = _mm_packus_epi16(m16L, m16H); + _mm_store_si128((__m128i *)(mask + j), m8); + j += 16; + } while (j < w); + src0 += stride0; + src1 += stride1; + mask += w; + i += 1; + } while (i < h); + } +} + +void av1_build_compound_diffwtd_mask_d16_sse4_1( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + const int which_inverse = (mask_type == DIFFWTD_38) ? 0 : 1; + const int mask_base = 38; + int round = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + const __m128i round_const = _mm_set1_epi16((1 << round) >> 1); + const __m128i mask_base_16 = _mm_set1_epi16(mask_base); + const __m128i clip_diff = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i add_const = + _mm_set1_epi16((which_inverse ? AOM_BLEND_A64_MAX_ALPHA : 0)); + const __m128i add_sign = _mm_set1_epi16((which_inverse ? -1 : 1)); + + int i, j; + // When rounding constant is added, there is a possibility of overflow. + // However that much precision is not required. Code should very well work for + // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But + // there is a possibility of corner case bugs. 
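+  // For reference, a scalar sketch of the per-pixel mask computed below
+  // (illustrative only; p0/p1 stand for one pair of CONV_BUF_TYPE samples):
+  //   int d = (abs(p0 - p1) + ((1 << round) >> 1)) >> round;
+  //   int m = AOMMIN(mask_base + (d >> DIFF_FACTOR_LOG2),
+  //                  AOM_BLEND_A64_MAX_ALPHA);
+  //   if (which_inverse) m = AOM_BLEND_A64_MAX_ALPHA - m;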
+ assert(DIFF_FACTOR_LOG2 == 4); + assert(AOM_BLEND_A64_MAX_ALPHA == 64); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data_src0 = + _mm_loadu_si128((__m128i *)&src0[(i * src0_stride) + j]); + const __m128i data_src1 = + _mm_loadu_si128((__m128i *)&src1[(i * src1_stride) + j]); + + const __m128i diffa = _mm_subs_epu16(data_src0, data_src1); + const __m128i diffb = _mm_subs_epu16(data_src1, data_src0); + const __m128i diff = _mm_max_epu16(diffa, diffb); + const __m128i diff_round = + _mm_srli_epi16(_mm_adds_epu16(diff, round_const), round); + const __m128i diff_factor = _mm_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m128i diff_mask = _mm_adds_epi16(diff_factor, mask_base_16); + __m128i diff_clamp = _mm_min_epi16(diff_mask, clip_diff); + // clamp to 0 can be skipped since we are using add and saturate + // instruction + + const __m128i diff_sign = _mm_sign_epi16(diff_clamp, add_sign); + const __m128i diff_const_16 = _mm_add_epi16(diff_sign, add_const); + + // 8 bit conversion and saturation to uint8 + const __m128i res_8 = _mm_packus_epi16(diff_const_16, diff_const_16); + + // Store values into the destination buffer + __m128i *const dst = (__m128i *)&mask[i * w + j]; + + if ((w - j) > 4) { + _mm_storel_epi64(dst, res_8); + } else { // w==4 + *(uint32_t *)dst = _mm_cvtsi128_si32(res_8); + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/reconinter_ssse3.c b/libs/libaom/src/av1/common/x86/reconinter_ssse3.c new file mode 100644 index 000000000..cf684447c --- /dev/null +++ b/libs/libaom/src/av1/common/x86/reconinter_ssse3.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/blockd.h" + +void av1_build_compound_diffwtd_mask_highbd_ssse3( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { + if (w < 8) { + av1_build_compound_diffwtd_mask_highbd_c(mask, mask_type, src0, src0_stride, + src1, src1_stride, h, w, bd); + } else { + assert(bd >= 8); + assert((w % 8) == 0); + assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV); + const __m128i x0 = _mm_setzero_si128(); + const __m128i xAOM_BLEND_A64_MAX_ALPHA = + _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const int mask_base = 38; + const __m128i xmask_base = _mm_set1_epi16(mask_base); + const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0); + const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1); + if (bd == 8) { + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), + DIFF_FACTOR_LOG2); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), + DIFF_FACTOR_LOG2); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } else { + const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2); + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = + _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = + _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/selfguided_avx2.c b/libs/libaom/src/av1/common/x86/selfguided_avx2.c new file mode 100644 index 000000000..3c5558dda --- /dev/null +++
b/libs/libaom/src/av1/common/x86/selfguided_avx2.c @@ -0,0 +1,724 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/restoration.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +// Load 8 bytes from the possibly-misaligned pointer p, extend each byte to +// 32-bit precision and return them in an AVX2 register. +static __m256i yy256_load_extend_8_32(const void *p) { + return _mm256_cvtepu8_epi32(xx_loadl_64(p)); +} + +// Load 8 halfwords from the possibly-misaligned pointer p, extend each +// halfword to 32-bit precision and return them in an AVX2 register. +static __m256i yy256_load_extend_16_32(const void *p) { + return _mm256_cvtepu16_epi32(xx_loadu_128(p)); +} + +// Compute the scan of an AVX2 register holding 8 32-bit integers. If the +// register holds x0..x7 then the scan will hold x0, x0+x1, x0+x1+x2, ..., +// x0+x1+...+x7 +// +// Let [...] represent a 128-bit block, and let a, ..., h be 32-bit integers +// (assumed small enough to be able to add them without overflow). +// +// Use -> as shorthand for summing, i.e. h->a = h + g + f + e + d + c + b + a. +// +// x = [h g f e][d c b a] +// x01 = [g f e 0][c b a 0] +// x02 = [g+h f+g e+f e][c+d b+c a+b a] +// x03 = [e+f e 0 0][a+b a 0 0] +// x04 = [e->h e->g e->f e][a->d a->c a->b a] +// s = a->d +// s01 = [a->d a->d a->d a->d] +// s02 = [a->d a->d a->d a->d][0 0 0 0] +// ret = [a->h a->g a->f a->e][a->d a->c a->b a] +static __m256i scan_32(__m256i x) { + const __m256i x01 = _mm256_slli_si256(x, 4); + const __m256i x02 = _mm256_add_epi32(x, x01); + const __m256i x03 = _mm256_slli_si256(x02, 8); + const __m256i x04 = _mm256_add_epi32(x02, x03); + const int32_t s = _mm256_extract_epi32(x04, 3); + const __m128i s01 = _mm_set1_epi32(s); + const __m256i s02 = _mm256_insertf128_si256(_mm256_setzero_si256(), s01, 1); + return _mm256_add_epi32(x04, s02); +} + +// Compute two integral images from src. B sums elements; A sums their +// squares. The images are offset by one pixel, so will have width and height +// equal to width + 1, height + 1 and the first row and column will be zero. +// +// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple +// of 8.
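+// Concretely, once integral_images() below has run, B[(i + 1) * buf_stride +
+// (j + 1)] holds the sum of src[y][x] over all y <= i, x <= j (and A the
+// matching sum of squares), so any box sum can be read back with four lookups.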
+ +static void *memset_zero_avx(int32_t *dest, const __m256i *zero, size_t count) { + unsigned int i = 0; + for (i = 0; i < (count & 0xffffffe0); i += 32) { + _mm256_storeu_si256((__m256i *)(dest + i), *zero); + _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero); + _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero); + _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero); + } + for (; i < (count & 0xfffffff8); i += 8) { + _mm256_storeu_si256((__m256i *)(dest + i), *zero); + } + for (; i < count; i++) { + dest[i] = 0; + } + return dest; +} + +static void integral_images(const uint8_t *src, int src_stride, int width, + int height, int32_t *A, int32_t *B, + int buf_stride) { + const __m256i zero = _mm256_setzero_si256(); + // Write out the zero top row + memset_zero_avx(A, &zero, (width + 8)); + memset_zero_avx(B, &zero, (width + 8)); + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the eight lanes. + __m256i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 8) { + const int ABj = 1 + j; + + const __m256i above1 = yy_load_256(B + ABj + i * buf_stride); + const __m256i above2 = yy_load_256(A + ABj + i * buf_stride); + + const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride); + const __m256i x2 = _mm256_madd_epi16(x1, x1); + + const __m256i sc1 = scan_32(x1); + const __m256i sc2 = scan_32(x2); + + const __m256i row1 = + _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1); + const __m256i row2 = + _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2); + + yy_store_256(B + ABj + (i + 1) * buf_stride, row1); + yy_store_256(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7)); + ldiff2 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7)); + } + } +} + +// Compute two integral images from src. B sums elements; A sums their squares +// +// A and B should be aligned to 32 bytes. buf_stride should be a multiple of 8. +static void integral_images_highbd(const uint16_t *src, int src_stride, + int width, int height, int32_t *A, + int32_t *B, int buf_stride) { + const __m256i zero = _mm256_setzero_si256(); + // Write out the zero top row + memset_zero_avx(A, &zero, (width + 8)); + memset_zero_avx(B, &zero, (width + 8)); + + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the eight lanes. 
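+    // Each 8-lane block below is then computed as row = scan(x) + above +
+    // ldiff: the prefix sum within the current block, plus the entry one row
+    // up, plus the running total carried over from the blocks to its left.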
+ __m256i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 8) { + const int ABj = 1 + j; + + const __m256i above1 = yy_load_256(B + ABj + i * buf_stride); + const __m256i above2 = yy_load_256(A + ABj + i * buf_stride); + + const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride); + const __m256i x2 = _mm256_madd_epi16(x1, x1); + + const __m256i sc1 = scan_32(x1); + const __m256i sc2 = scan_32(x2); + + const __m256i row1 = + _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1); + const __m256i row2 = + _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2); + + yy_store_256(B + ABj + (i + 1) * buf_stride, row1); + yy_store_256(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7)); + ldiff2 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7)); + } + } +} + +// Compute 8 values of boxsum from the given integral image. ii should point +// at the middle of the box (for the first value). r is the box radius. +static INLINE __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) { + const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride); + const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride); + const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride); + const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride); + const __m256i u = _mm256_sub_epi32(tr, tl); + const __m256i v = _mm256_sub_epi32(br, bl); + return _mm256_sub_epi32(v, u); +} + +static __m256i round_for_shift(unsigned shift) { + return _mm256_set1_epi32((1 << shift) >> 1); +} + +static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) { + __m256i an, bb; + if (bit_depth > 8) { + const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8)); + const __m256i rounding_b = round_for_shift(bit_depth - 8); + const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8)); + const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); + const __m256i a = + _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a); + const __m256i b = + _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b); + // b < 2^14, so we can use a 16-bit madd rather than a 32-bit + // mullo to square it + bb = _mm256_madd_epi16(b, b); + an = _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb); + } else { + bb = _mm256_madd_epi16(sum1, sum1); + an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n)); + } + return _mm256_sub_epi32(an, bb); +} + +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). 
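+// For each box, compute_p() above returns p = n * sum(x^2) - sum(x)^2, i.e.
+// n^2 times the sample variance of the n pixels in the box (for bit_depth > 8
+// both sums are downshifted first so the products fit in 32 bits).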
+static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, + int width, int height, int buf_stride, int bit_depth, + int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m256i s = _mm256_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]); + + const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); + __m256i mask[8]; + for (int idx = 0; idx < 8; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx)); + mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); + } + + for (int i = -1; i < height + 1; ++i) { + for (int j = -1; j < width + 1; j += 8) { + const int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; + + __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r); + + // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. + int idx = AOMMIN(8, width + 1 - j); + assert(idx >= 1); + + if (idx < 8) { + sum1 = _mm256_and_si256(mask[idx], sum1); + sum2 = _mm256_and_si256(mask[idx], sum2); + } + + const __m256i p = compute_p(sum1, sum2, bit_depth, n); + + const __m256i z = _mm256_min_epi32( + _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm256_set1_epi32(255)); + + const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4); + + yy_storeu_256(A + i * buf_stride + j, a_res); + + const __m256i a_complement = + _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res); + + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n); + const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1); + const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res), + SGRPROJ_RECIP_BITS); + + yy_storeu_256(B + i * buf_stride + j, b_res); + } + } +} + +// Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter +// where the outer four corners have weight 3 and all other pixels have weight +// 4. 
+// +// Pixels are indexed as follows: +// xtl xt xtr +// xl x xr +// xbl xb xbr +// +// buf points to x +// +// fours = xl + xt + xr + xb + x +// threes = xtl + xtr + xbr + xbl +// cross_sum = 4 * fours + 3 * threes +// = 4 * (fours + threes) - threes +// = (fours + threes) << 2 - threes +static INLINE __m256i cross_sum(const int32_t *buf, int stride) { + const __m256i xtl = yy_loadu_256(buf - 1 - stride); + const __m256i xt = yy_loadu_256(buf - stride); + const __m256i xtr = yy_loadu_256(buf + 1 - stride); + const __m256i xl = yy_loadu_256(buf - 1); + const __m256i x = yy_loadu_256(buf); + const __m256i xr = yy_loadu_256(buf + 1); + const __m256i xbl = yy_loadu_256(buf - 1 + stride); + const __m256i xb = yy_loadu_256(buf + stride); + const __m256i xbr = yy_loadu_256(buf + 1 + stride); + + const __m256i fours = _mm256_add_epi32( + xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x)))); + const __m256i threes = + _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl))); + + return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2), + threes); +} + +// The final filter for self-guided restoration. Computes a weighted average +// across A, B with "cross sums" (see cross_sum implementation above). +static void final_filter(int32_t *dst, int dst_stride, const int32_t *A, + const int32_t *B, int buf_stride, const void *dgd8, + int dgd_stride, int width, int height, int highbd) { + const int nb = 5; + const __m256i rounding = + round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + const uint8_t *dgd_real = + highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride); + const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride); + + const __m128i raw = + xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m256i src = + highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); + + __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); + __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding), + SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + + yy_storeu_256(dst + i * dst_stride + j, w); + } + } +} + +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). 
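+// Unlike calc_ab(), this variant evaluates A and B on every other row only
+// (note the i += 2 loop below); final_filter_fast() then reaches the computed
+// rows either directly (odd rows) or through the vertical neighbours (even
+// rows), roughly halving the work spent on calc_ab.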
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, + const int32_t *D, int width, int height, + int buf_stride, int bit_depth, int sgr_params_idx, + int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m256i s = _mm256_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]); + + const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); + __m256i mask[8]; + for (int idx = 0; idx < 8; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx)); + mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); + } + + for (int i = -1; i < height + 1; i += 2) { + for (int j = -1; j < width + 1; j += 8) { + const int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; + + __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r); + + // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. + int idx = AOMMIN(8, width + 1 - j); + assert(idx >= 1); + + if (idx < 8) { + sum1 = _mm256_and_si256(mask[idx], sum1); + sum2 = _mm256_and_si256(mask[idx], sum2); + } + + const __m256i p = compute_p(sum1, sum2, bit_depth, n); + + const __m256i z = _mm256_min_epi32( + _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm256_set1_epi32(255)); + + const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4); + + yy_storeu_256(A + i * buf_stride + j, a_res); + + const __m256i a_complement = + _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res); + + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n); + const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1); + const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res), + SGRPROJ_RECIP_BITS); + + yy_storeu_256(B + i * buf_stride + j, b_res); + } + } +} + +// Calculate 8 values of the "cross sum" starting at buf. 
+// +// Pixels are indexed like this: +// xtl xt xtr +// - buf - +// xbl xb xbr +// +// Pixels are weighted like this: +// 5 6 5 +// 0 0 0 +// 5 6 5 +// +// fives = xtl + xtr + xbl + xbr +// sixes = xt + xb +// cross_sum = 6 * sixes + 5 * fives +// = 5 * (fives + sixes) + sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) { + const __m256i xtl = yy_loadu_256(buf - 1 - stride); + const __m256i xt = yy_loadu_256(buf - stride); + const __m256i xtr = yy_loadu_256(buf + 1 - stride); + const __m256i xbl = yy_loadu_256(buf - 1 + stride); + const __m256i xb = yy_loadu_256(buf + stride); + const __m256i xbr = yy_loadu_256(buf + 1 + stride); + + const __m256i fives = + _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl))); + const __m256i sixes = _mm256_add_epi32(xt, xb); + const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes); + + return _mm256_add_epi32( + _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2), + fives_plus_sixes), + sixes); +} + +// Calculate 8 values of the "cross sum" starting at buf. +// +// Pixels are indexed like this: +// xl x xr +// +// Pixels are weighted like this: +// 5 6 5 +// +// buf points to x +// +// fives = xl + xr +// sixes = x +// cross_sum = 5 * fives + 6 * sixes +// = 4 * (fives + sixes) + (fives + sixes) + sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) { + const __m256i xl = yy_loadu_256(buf - 1); + const __m256i x = yy_loadu_256(buf); + const __m256i xr = yy_loadu_256(buf + 1); + + const __m256i fives = _mm256_add_epi32(xl, xr); + const __m256i sixes = x; + + const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes); + + return _mm256_add_epi32( + _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2), + fives_plus_sixes), + sixes); +} + +// The final filter for the self-guided restoration. Computes a +// weighted average across A, B with "cross sums" (see cross_sum_... +// implementations above). +static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A, + const int32_t *B, int buf_stride, + const void *dgd8, int dgd_stride, int width, + int height, int highbd) { + const int nb0 = 5; + const int nb1 = 4; + + const __m256i rounding0 = + round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); + const __m256i rounding1 = + round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); + + const uint8_t *dgd_real = + highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; + + for (int i = 0; i < height; ++i) { + if (!(i & 1)) { // even row + for (int j = 0; j < width; j += 8) { + const __m256i a = + cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride); + const __m256i b = + cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride); + + const __m128i raw = + xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m256i src = + highbd ?
_mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); + + __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); + __m256i w = + _mm256_srai_epi32(_mm256_add_epi32(v, rounding0), + SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); + + yy_storeu_256(dst + i * dst_stride + j, w); + } + } else { // odd row + for (int j = 0; j < width; j += 8) { + const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j); + const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j); + + const __m128i raw = + xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m256i src = + highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); + + __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); + __m256i w = + _mm256_srai_epi32(_mm256_add_epi32(v, rounding1), + SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); + + yy_storeu_256(dst + i * dst_stride + j, w); + } + } + } +} + +int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height, + int dgd_stride, int32_t *flt0, + int32_t *flt1, int flt_stride, + int sgr_params_idx, int bit_depth, + int highbd) { + // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl, + // Ctl and Dtl is 32-byte aligned. + const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3); + + int32_t *buf = aom_memalign( + 32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)); + if (!buf) return -1; + + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 32 bytes for efficiency. + int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3); + + // The "tl" pointers point at the top-left of the initialised data for the + // array. + int32_t *Atl = buf + 0 * buf_elts + 7; + int32_t *Btl = buf + 1 * buf_elts + 7; + int32_t *Ctl = buf + 2 * buf_elts + 7; + int32_t *Dtl = buf + 3 * buf_elts + 7; + + // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note + // there's a zero row and column in A, B (integral images), so we move down + // and right one for them. + const int buf_diag_border = + SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT; + + int32_t *A0 = Atl + 1 + buf_stride; + int32_t *B0 = Btl + 1 + buf_stride; + int32_t *C0 = Ctl + 1 + buf_stride; + int32_t *D0 = Dtl + 1 + buf_stride; + + // Finally, A, B, C, D point at position (0, 0). + int32_t *A = A0 + buf_diag_border; + int32_t *B = B0 + buf_diag_border; + int32_t *C = C0 + buf_diag_border; + int32_t *D = D0 + buf_diag_border; + + const int dgd_diag_border = + SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT; + const uint8_t *dgd0 = dgd8 - dgd_diag_border; + + // Generate integral images from the input. C will contain sums of squares; D + // will contain just sums + if (highbd) + integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext, + height_ext, Ctl, Dtl, buf_stride); + else + integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl, + buf_stride); + + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + // Write to flt0 and flt1 + // If params->r == 0 we skip the corresponding filter. We only allow one of + // the radii to be 0, as having both equal to 0 would be equivalent to + // skipping SGR entirely. 
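+  // In the code below, a non-zero r[0] is handled by the subsampled fast path
+  // (calc_ab_fast / final_filter_fast) writing flt0, while a non-zero r[1]
+  // goes through the full-resolution path (calc_ab / final_filter) into flt1.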
+ assert(!(params->r[0] == 0 && params->r[1] == 0)); + assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); + assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); + + if (params->r[0] > 0) { + calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth, + sgr_params_idx, 0); + final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride, + width, height, highbd); + } + + if (params->r[1] > 0) { + calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx, + 1); + final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, + height, highbd); + } + aom_free(buf); + return 0; +} + +void av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + const int ret = av1_selfguided_restoration_avx2( + dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); + (void)ret; + assert(!ret); + const sgr_params_type *const params = &av1_sgr_params[eps]; + int xq[2]; + av1_decode_xq(xqd, xq, params); + + __m256i xq0 = _mm256_set1_epi32(xq[0]); + __m256i xq1 = _mm256_set1_epi32(xq[1]); + + for (int i = 0; i < height; ++i) { + // Calculate output in batches of 16 pixels + for (int j = 0; j < width; j += 16) { + const int k = i * width + j; + const int m = i * dst_stride + j; + + const uint8_t *dat8ij = dat8 + i * stride + j; + __m256i ep_0, ep_1; + __m128i src_0, src_1; + if (highbd) { + src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij)); + src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8)); + ep_0 = _mm256_cvtepu16_epi32(src_0); + ep_1 = _mm256_cvtepu16_epi32(src_1); + } else { + src_0 = xx_loadu_128(dat8ij); + ep_0 = _mm256_cvtepu8_epi32(src_0); + ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8)); + } + + const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS); + const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS); + + __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS); + __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS); + + if (params->r[0] > 0) { + const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0); + v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0)); + + const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1); + v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1)); + } + + if (params->r[1] > 0) { + const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0); + v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0)); + + const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1); + v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1)); + } + + const __m256i rounding = + round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + const __m256i w_0 = _mm256_srai_epi32( + _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + const __m256i w_1 = _mm256_srai_epi32( + _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + + if (highbd) { + // Pack into 16 bits and clamp to [0, 2^bit_depth) + // Note that packing into 16 bits messes up the order of the bits, + // so we use a permute function to correct this + const __m256i tmp = _mm256_packus_epi32(w_0, w_1); + const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8); + const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1); + const __m256i res = 
_mm256_min_epi16(tmp2, max); + yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res); + } else { + // Pack into 8 bits and clamp to [0, 256) + // Note that each pack messes up the order of the bits, + // so we use a permute function to correct this + const __m256i tmp = _mm256_packs_epi32(w_0, w_1); + const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8); + const __m256i res = + _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */); + const __m128i res2 = + _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8)); + xx_storeu_128(dst8 + m, res2); + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/selfguided_sse4.c b/libs/libaom/src/av1/common/x86/selfguided_sse4.c new file mode 100644 index 000000000..72c7708f1 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/selfguided_sse4.c @@ -0,0 +1,662 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <smmintrin.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/restoration.h" +#include "aom_dsp/x86/synonyms.h" + +// Load 4 bytes from the possibly-misaligned pointer p, extend each byte to +// 32-bit precision and return them in an SSE register. +static __m128i xx_load_extend_8_32(const void *p) { + return _mm_cvtepu8_epi32(xx_loadl_32(p)); +} + +// Load 4 halfwords from the possibly-misaligned pointer p, extend each +// halfword to 32-bit precision and return them in an SSE register. +static __m128i xx_load_extend_16_32(const void *p) { + return _mm_cvtepu16_epi32(xx_loadl_64(p)); +} + +// Compute the scan of an SSE register holding 4 32-bit integers. If the +// register holds x0..x3 then the scan will hold x0, x0+x1, x0+x1+x2, +// x0+x1+x2+x3 +static __m128i scan_32(__m128i x) { + const __m128i x01 = _mm_add_epi32(x, _mm_slli_si128(x, 4)); + return _mm_add_epi32(x01, _mm_slli_si128(x01, 8)); +} + +// Compute two integral images from src. B sums elements; A sums their +// squares. The images are offset by one pixel, so will have width and height +// equal to width + 1, height + 1 and the first row and column will be zero. +// +// A+1 and B+1 should be aligned to 16 bytes. buf_stride should be a multiple +// of 4. +static void integral_images(const uint8_t *src, int src_stride, int width, + int height, int32_t *A, int32_t *B, + int buf_stride) { + // Write out the zero top row + memset(A, 0, sizeof(*A) * (width + 1)); + memset(B, 0, sizeof(*B) * (width + 1)); + + const __m128i zero = _mm_setzero_si128(); + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the four lanes.
+ __m128i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 4) { + const int ABj = 1 + j; + + const __m128i above1 = xx_load_128(B + ABj + i * buf_stride); + const __m128i above2 = xx_load_128(A + ABj + i * buf_stride); + + const __m128i x1 = xx_load_extend_8_32(src + j + i * src_stride); + const __m128i x2 = _mm_madd_epi16(x1, x1); + + const __m128i sc1 = scan_32(x1); + const __m128i sc2 = scan_32(x2); + + const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1); + const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2); + + xx_store_128(B + ABj + (i + 1) * buf_stride, row1); + xx_store_128(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff); + ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff); + } + } +} + +// Compute two integral images from src. B sums elements; A sums their squares +// +// A and B should be aligned to 16 bytes. buf_stride should be a multiple of 4. +static void integral_images_highbd(const uint16_t *src, int src_stride, + int width, int height, int32_t *A, + int32_t *B, int buf_stride) { + // Write out the zero top row + memset(A, 0, sizeof(*A) * (width + 1)); + memset(B, 0, sizeof(*B) * (width + 1)); + + const __m128i zero = _mm_setzero_si128(); + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the four lanes. + __m128i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 4) { + const int ABj = 1 + j; + + const __m128i above1 = xx_load_128(B + ABj + i * buf_stride); + const __m128i above2 = xx_load_128(A + ABj + i * buf_stride); + + const __m128i x1 = xx_load_extend_16_32(src + j + i * src_stride); + const __m128i x2 = _mm_madd_epi16(x1, x1); + + const __m128i sc1 = scan_32(x1); + const __m128i sc2 = scan_32(x2); + + const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1); + const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2); + + xx_store_128(B + ABj + (i + 1) * buf_stride, row1); + xx_store_128(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff); + ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff); + } + } +} + +// Compute 4 values of boxsum from the given integral image. ii should point +// at the middle of the box (for the first value). r is the box radius. 
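+// By inclusion-exclusion on the integral image, the box sum reduces to
+// br - bl - tr + tl using the four corner samples loaded below; with r = 1,
+// for example, this gives a 3x3 sum at each of four consecutive positions.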
+static INLINE __m128i boxsum_from_ii(const int32_t *ii, int stride, int r) { + const __m128i tl = xx_loadu_128(ii - (r + 1) - (r + 1) * stride); + const __m128i tr = xx_loadu_128(ii + (r + 0) - (r + 1) * stride); + const __m128i bl = xx_loadu_128(ii - (r + 1) + r * stride); + const __m128i br = xx_loadu_128(ii + (r + 0) + r * stride); + const __m128i u = _mm_sub_epi32(tr, tl); + const __m128i v = _mm_sub_epi32(br, bl); + return _mm_sub_epi32(v, u); +} + +static __m128i round_for_shift(unsigned shift) { + return _mm_set1_epi32((1 << shift) >> 1); +} + +static __m128i compute_p(__m128i sum1, __m128i sum2, int bit_depth, int n) { + __m128i an, bb; + if (bit_depth > 8) { + const __m128i rounding_a = round_for_shift(2 * (bit_depth - 8)); + const __m128i rounding_b = round_for_shift(bit_depth - 8); + const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8)); + const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); + const __m128i a = _mm_srl_epi32(_mm_add_epi32(sum2, rounding_a), shift_a); + const __m128i b = _mm_srl_epi32(_mm_add_epi32(sum1, rounding_b), shift_b); + // b < 2^14, so we can use a 16-bit madd rather than a 32-bit + // mullo to square it + bb = _mm_madd_epi16(b, b); + an = _mm_max_epi32(_mm_mullo_epi32(a, _mm_set1_epi32(n)), bb); + } else { + bb = _mm_madd_epi16(sum1, sum1); + an = _mm_mullo_epi32(sum2, _mm_set1_epi32(n)); + } + return _mm_sub_epi32(an, bb); +} + +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). +static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, + int width, int height, int buf_stride, int bit_depth, + int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m128i s = _mm_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]); + + const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); + __m128i mask[4]; + for (int idx = 0; idx < 4; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx)); + mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); + } + + for (int i = -1; i < height + 1; ++i) { + for (int j = -1; j < width + 1; j += 4) { + const int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; + + __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r); + + // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. + int idx = AOMMIN(4, width + 1 - j); + assert(idx >= 1); + + if (idx < 4) { + sum1 = _mm_and_si128(mask[idx], sum1); + sum2 = _mm_and_si128(mask[idx], sum2); + } + + const __m128i p = compute_p(sum1, sum2, bit_depth, n); + + const __m128i z = _mm_min_epi32( + _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm_set1_epi32(255)); + + // 'Gather' type instructions are not available pre-AVX2, so synthesize a + // gather using scalar loads. 
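+ // Every lane of z was clamped to [0, 255] above, so four scalar extracts
+ // plus table reads are a safe stand-in for a hardware gather (a single
+ // _mm_i32gather_epi32 on AVX2).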
+ const __m128i a_res = + _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)], + av1_x_by_xplus1[_mm_extract_epi32(z, 2)], + av1_x_by_xplus1[_mm_extract_epi32(z, 1)], + av1_x_by_xplus1[_mm_extract_epi32(z, 0)]); + + xx_storeu_128(A + i * buf_stride + j, a_res); + + const __m128i a_complement = + _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res); + + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n); + const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1); + const __m128i b_res = + _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS); + + xx_storeu_128(B + i * buf_stride + j, b_res); + } + } +} + +// Calculate 4 values of the "cross sum" starting at buf. This is a 3x3 filter +// where the outer four corners have weight 3 and all other pixels have weight +// 4. +// +// Pixels are indexed like this: +// xtl xt xtr +// xl x xr +// xbl xb xbr +// +// buf points to x +// +// fours = xl + xt + xr + xb + x +// threes = xtl + xtr + xbr + xbl +// cross_sum = 4 * fours + 3 * threes +// = 4 * (fours + threes) - threes +// = (fours + threes) << 2 - threes +static INLINE __m128i cross_sum(const int32_t *buf, int stride) { + const __m128i xtl = xx_loadu_128(buf - 1 - stride); + const __m128i xt = xx_loadu_128(buf - stride); + const __m128i xtr = xx_loadu_128(buf + 1 - stride); + const __m128i xl = xx_loadu_128(buf - 1); + const __m128i x = xx_loadu_128(buf); + const __m128i xr = xx_loadu_128(buf + 1); + const __m128i xbl = xx_loadu_128(buf - 1 + stride); + const __m128i xb = xx_loadu_128(buf + stride); + const __m128i xbr = xx_loadu_128(buf + 1 + stride); + + const __m128i fours = _mm_add_epi32( + xl, _mm_add_epi32(xt, _mm_add_epi32(xr, _mm_add_epi32(xb, x)))); + const __m128i threes = + _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl))); + + return _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(fours, threes), 2), threes); +} + +// The final filter for self-guided restoration. Computes a weighted average +// across A, B with "cross sums" (see cross_sum implementation above). +static void final_filter(int32_t *dst, int dst_stride, const int32_t *A, + const int32_t *B, int buf_stride, const void *dgd8, + int dgd_stride, int width, int height, int highbd) { + const int nb = 5; + const __m128i rounding = + round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + const uint8_t *dgd_real = + highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 4) { + const __m128i a = cross_sum(A + i * buf_stride + j, buf_stride); + const __m128i b = cross_sum(B + i * buf_stride + j, buf_stride); + const __m128i raw = + xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m128i src = + highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); + + __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); + __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding), + SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + + xx_storeu_128(dst + i * dst_stride + j, w); + } + } +} + +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). 
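+// Unlike calc_ab, this "fast" variant fills A and B on every second row only
+// (the loop steps i += 2); final_filter_fast compensates by applying separate
+// even-row and odd-row cross-sum kernels.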
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, + const int32_t *D, int width, int height, + int buf_stride, int bit_depth, int sgr_params_idx, + int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m128i s = _mm_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]); + + const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); + __m128i mask[4]; + for (int idx = 0; idx < 4; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx)); + mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); + } + + for (int i = -1; i < height + 1; i += 2) { + for (int j = -1; j < width + 1; j += 4) { + const int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; + + __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r); + + // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. + int idx = AOMMIN(4, width + 1 - j); + assert(idx >= 1); + + if (idx < 4) { + sum1 = _mm_and_si128(mask[idx], sum1); + sum2 = _mm_and_si128(mask[idx], sum2); + } + + const __m128i p = compute_p(sum1, sum2, bit_depth, n); + + const __m128i z = _mm_min_epi32( + _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm_set1_epi32(255)); + + // 'Gather' type instructions are not available pre-AVX2, so synthesize a + // gather using scalar loads. + const __m128i a_res = + _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)], + av1_x_by_xplus1[_mm_extract_epi32(z, 2)], + av1_x_by_xplus1[_mm_extract_epi32(z, 1)], + av1_x_by_xplus1[_mm_extract_epi32(z, 0)]); + + xx_storeu_128(A + i * buf_stride + j, a_res); + + const __m128i a_complement = + _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res); + + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n); + const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1); + const __m128i b_res = + _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS); + + xx_storeu_128(B + i * buf_stride + j, b_res); + } + } +} + +// Calculate 4 values of the "cross sum" starting at buf. 
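+// This is the even-row kernel of the fast filter: the middle row is weighted
+// 0 because calc_ab_fast produced A/B values on every second row only.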
+// +// Pixels are indexed like this: +// xtl xt xtr +// - buf - +// xbl xb xbr +// +// Pixels are weighted like this: +// 5 6 5 +// 0 0 0 +// 5 6 5 +// +// fives = xtl + xtr + xbl + xbr +// sixes = xt + xb +// cross_sum = 6 * sixes + 5 * fives +// = 5 * (fives + sixes) + sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) { + const __m128i xtl = xx_loadu_128(buf - 1 - stride); + const __m128i xt = xx_loadu_128(buf - stride); + const __m128i xtr = xx_loadu_128(buf + 1 - stride); + const __m128i xbl = xx_loadu_128(buf - 1 + stride); + const __m128i xb = xx_loadu_128(buf + stride); + const __m128i xbr = xx_loadu_128(buf + 1 + stride); + + const __m128i fives = + _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl))); + const __m128i sixes = _mm_add_epi32(xt, xb); + const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes); + + return _mm_add_epi32( + _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes), + sixes); +} + +// Calculate 4 values of the "cross sum" starting at buf. +// +// Pixels are indexed like this: +// xl x xr +// +// Pixels are weighted like this: +// 5 6 5 +// +// buf points to x +// +// fives = xl + xr +// sixes = x +// cross_sum = 5 * fives + 6 * sixes +// = 4 * (fives + sixes) + (fives + sixes) + sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m128i cross_sum_fast_odd_row(const int32_t *buf) { + const __m128i xl = xx_loadu_128(buf - 1); + const __m128i x = xx_loadu_128(buf); + const __m128i xr = xx_loadu_128(buf + 1); + + const __m128i fives = _mm_add_epi32(xl, xr); + const __m128i sixes = x; + + const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes); + + return _mm_add_epi32( + _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes), + sixes); +} + +// The final filter for the self-guided restoration. Computes a +// weighted average across A, B with "cross sums" (see cross_sum_... +// implementations above). +static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A, + const int32_t *B, int buf_stride, + const void *dgd8, int dgd_stride, int width, + int height, int highbd) { + const int nb0 = 5; + const int nb1 = 4; + + const __m128i rounding0 = + round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); + const __m128i rounding1 = + round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); + + const uint8_t *dgd_real = + highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; + + for (int i = 0; i < height; ++i) { + if (!(i & 1)) { // even row + for (int j = 0; j < width; j += 4) { + const __m128i a = + cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride); + const __m128i b = + cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride); + const __m128i raw = + xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m128i src = + highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); + + __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); + __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0), + SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); + + xx_storeu_128(dst + i * dst_stride + j, w); + } + } else { // odd row + for (int j = 0; j < width; j += 4) { + const __m128i a = cross_sum_fast_odd_row(A + i * buf_stride + j); + const __m128i b = cross_sum_fast_odd_row(B + i * buf_stride + j); + const __m128i raw = + xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m128i src = + highbd ?
_mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); + + __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); + __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1), + SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); + + xx_storeu_128(dst + i * dst_stride + j, w); + } + } + } +} + +int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, + int height, int dgd_stride, int32_t *flt0, + int32_t *flt1, int flt_stride, + int sgr_params_idx, int bit_depth, + int highbd) { + int32_t *buf = (int32_t *)aom_memalign( + 16, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS); + if (!buf) return -1; + memset(buf, 0, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS); + + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes for efficiency. + int buf_stride = ((width_ext + 3) & ~3) + 16; + + // The "tl" pointers point at the top-left of the initialised data for the + // array. Adding 3 here ensures that column 1 is 16-byte aligned. + int32_t *Atl = buf + 0 * RESTORATION_PROC_UNIT_PELS + 3; + int32_t *Btl = buf + 1 * RESTORATION_PROC_UNIT_PELS + 3; + int32_t *Ctl = buf + 2 * RESTORATION_PROC_UNIT_PELS + 3; + int32_t *Dtl = buf + 3 * RESTORATION_PROC_UNIT_PELS + 3; + + // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note + // there's a zero row and column in A, B (integral images), so we move down + // and right one for them. + const int buf_diag_border = + SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT; + + int32_t *A0 = Atl + 1 + buf_stride; + int32_t *B0 = Btl + 1 + buf_stride; + int32_t *C0 = Ctl + 1 + buf_stride; + int32_t *D0 = Dtl + 1 + buf_stride; + + // Finally, A, B, C, D point at position (0, 0). + int32_t *A = A0 + buf_diag_border; + int32_t *B = B0 + buf_diag_border; + int32_t *C = C0 + buf_diag_border; + int32_t *D = D0 + buf_diag_border; + + const int dgd_diag_border = + SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT; + const uint8_t *dgd0 = dgd8 - dgd_diag_border; + + // Generate integral images from the input. C will contain sums of squares; D + // will contain just sums + if (highbd) + integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext, + height_ext, Ctl, Dtl, buf_stride); + else + integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl, + buf_stride); + + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + // Write to flt0 and flt1 + // If params->r == 0 we skip the corresponding filter. We only allow one of + // the radii to be 0, as having both equal to 0 would be equivalent to + // skipping SGR entirely. 
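+ // The radius asserts below additionally keep every boxsum_from_ii access
+ // within the SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ padding of the
+ // integral images.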
+ assert(!(params->r[0] == 0 && params->r[1] == 0)); + assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); + assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); + + if (params->r[0] > 0) { + calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth, + sgr_params_idx, 0); + final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride, + width, height, highbd); + } + + if (params->r[1] > 0) { + calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx, + 1); + final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, + height, highbd); + } + aom_free(buf); + return 0; +} + +void av1_apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + const int ret = av1_selfguided_restoration_sse4_1( + dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); + (void)ret; + assert(!ret); + const sgr_params_type *const params = &av1_sgr_params[eps]; + int xq[2]; + av1_decode_xq(xqd, xq, params); + + __m128i xq0 = _mm_set1_epi32(xq[0]); + __m128i xq1 = _mm_set1_epi32(xq[1]); + + for (int i = 0; i < height; ++i) { + // Calculate output in batches of 8 pixels + for (int j = 0; j < width; j += 8) { + const int k = i * width + j; + const int m = i * dst_stride + j; + + const uint8_t *dat8ij = dat8 + i * stride + j; + __m128i src; + if (highbd) { + src = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij)); + } else { + src = _mm_cvtepu8_epi16(xx_loadl_64(dat8ij)); + } + + const __m128i u = _mm_slli_epi16(src, SGRPROJ_RST_BITS); + const __m128i u_0 = _mm_cvtepu16_epi32(u); + const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(u, 8)); + + __m128i v_0 = _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS); + __m128i v_1 = _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS); + + if (params->r[0] > 0) { + const __m128i f1_0 = _mm_sub_epi32(xx_loadu_128(&flt0[k]), u_0); + v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq0, f1_0)); + + const __m128i f1_1 = _mm_sub_epi32(xx_loadu_128(&flt0[k + 4]), u_1); + v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq0, f1_1)); + } + + if (params->r[1] > 0) { + const __m128i f2_0 = _mm_sub_epi32(xx_loadu_128(&flt1[k]), u_0); + v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq1, f2_0)); + + const __m128i f2_1 = _mm_sub_epi32(xx_loadu_128(&flt1[k + 4]), u_1); + v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq1, f2_1)); + } + + const __m128i rounding = + round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding), + SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding), + SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + + if (highbd) { + // Pack into 16 bits and clamp to [0, 2^bit_depth) + const __m128i tmp = _mm_packus_epi32(w_0, w_1); + const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1); + const __m128i res = _mm_min_epi16(tmp, max); + xx_storeu_128(CONVERT_TO_SHORTPTR(dst8 + m), res); + } else { + // Pack into 8 bits and clamp to [0, 256) + const __m128i tmp = _mm_packs_epi32(w_0, w_1); + const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */); + xx_storel_64(dst8 + m, res); + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/warp_plane_avx2.c b/libs/libaom/src/av1/common/x86/warp_plane_avx2.c new file mode 100644 index 
000000000..53a928d76 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/warp_plane_avx2.c @@ -0,0 +1,1318 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include "config/av1_rtcd.h" +#include "av1/common/warped_motion.h" +#include "aom_dsp/x86/synonyms.h" + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = { + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = { + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = { + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = { + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = { + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = { + 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = { + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 +}; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3, + 5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6, + 6, 8, 1, 3, 3, 5, 5, 7, 7, 9 }; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src1[32]) = { 4, 6, 6, 8, 8, 10, 10, 12, 5, 7, 7, + 9, 9, 11, 11, 13, 4, 6, 6, 8, 8, 10, + 10, 12, 5, 7, 7, 9, 9, 11, 11, 13 }; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src2[32]) = { 1, 3, 3, 5, 5, 7, 7, 9, 2, 4, 4, + 6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7, + 7, 9, 2, 4, 4, 6, 6, 8, 8, 10 }; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src3[32]) = { 5, 7, 7, 9, 9, 11, 11, 13, 6, 8, 8, + 10, 10, 12, 12, 14, 5, 7, 7, 9, 9, 11, + 11, 13, 6, 8, 8, 10, 10, 12, 12, 14 }; + +static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out, + __m256i *coeff, + const __m256i *shuffle_src, + const __m256i *round_const, + const __m128i *shift, int row) { + const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]); + const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]); + const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]); + const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]); + + const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]); + const __m256i res_46 =
_mm256_maddubs_epi16(src_1, coeff[1]); + const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]); + const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]); + + const __m256i res_even = _mm256_add_epi16(res_02, res_46); + const __m256i res_odd = _mm256_add_epi16(res_13, res_57); + const __m256i res = + _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const); + horz_out[row] = _mm256_srl_epi16(res, *shift); +} + +static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta, + int sx, + __m256i *coeff) { + __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + + __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + + __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0); + __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2); + __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1); + __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3); + + __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4); + __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6); + __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5); + __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7); + + __m128i tmp_8 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1); + + __m128i tmp_9 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp1_256 = _mm256_inserti128_si256(tmp1_256, tmp_9, 1); + + __m128i tmp_10 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1); + + __m128i tmp_11 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1); + + tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1); + + tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1); + + tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1); + + tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1); + + const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256); + const __m256i tmp_13 = 
_mm256_unpacklo_epi16(tmp1_256, tmp3_256); + const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256); + const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256); + + const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14); + const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14); + const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15); + const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15); + + coeff[0] = _mm256_unpacklo_epi64(res_0, res_2); + coeff[1] = _mm256_unpackhi_epi64(res_0, res_2); + coeff[2] = _mm256_unpacklo_epi64(res_1, res_3); + coeff[3] = _mm256_unpackhi_epi64(res_1, res_3); +} + +static INLINE void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx, + __m256i *coeff) { + __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); + + tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2); + tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3); + tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6); + tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7); + + const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0); + const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1); + const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4); + const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5); + + const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14); + const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14); + const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15); + const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15); + + coeff[0] = _mm256_unpacklo_epi64(res_0, res_2); + coeff[1] = _mm256_unpackhi_epi64(res_0, res_2); + coeff[2] = _mm256_unpacklo_epi64(res_1, res_3); + coeff[3] = _mm256_unpackhi_epi64(res_1, res_3); +} + +static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx, + __m256i *coeff) { + const __m128i tmp_0 = + _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]); + + const __m256i res_0 = + _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1); + + coeff[0] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2)); + coeff[1] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2)); + coeff[2] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2)); + coeff[3] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2)); +} + +static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out, + int sx, int alpha, int beta, int row, + const __m256i *shuffle_src, + const __m256i *round_const, + const __m128i *shift) { + __m256i coeff[4]; + 
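// Coefficients are built for two rows at once (sx in the low 128-bit lane,
+ // sx + beta in the high lane), so the single filtering pass below yields
+ // eight horizontal outputs for each of the two packed source rows.
+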
prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff); + filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift, + row); +} +static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, + __m256i *coeff) { + const __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); + + const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2); + const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3); + const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6); + const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7); + + const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10); + const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11); + + coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14)); + coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14)); + coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15)); + coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15)); +} + +static INLINE void warp_horizontal_filter_avx2( + const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const __m256i *round_const, const __m128i *shift, + const __m256i *shuffle_src) { + int k, iy, sx, row = 0; + __m256i coeff[4]; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m128i src_0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m128i src_1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1); + sx = sx4 + beta * (k + 4); + horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src, + round_const, shift); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i src_01 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); + sx = sx4 + beta * (k + 4); + prepare_horizontal_filter_coeff(alpha, sx, coeff); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); +} + +static INLINE void warp_horizontal_filter_alpha0_avx2( + const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const __m256i *round_const, const __m128i *shift, + const __m256i *shuffle_src) { + (void)alpha; + int k, iy, sx, row = 0; + __m256i coeff[4]; + for (k = -7; k <= 
(AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m128i src_0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m128i src_1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1); + sx = sx4 + beta * (k + 4); + prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i src_01 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); + sx = sx4 + beta * (k + 4); + prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); +} + +static INLINE void warp_horizontal_filter_beta0_avx2( + const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const __m256i *round_const, const __m128i *shift, + const __m256i *shuffle_src) { + (void)beta; + int k, iy, row = 0; + __m256i coeff[4]; + prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff); + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m128i src_0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m128i src_1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i src_01 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); +} + +static INLINE void warp_horizontal_filter_alpha0_beta0_avx2( + const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const __m256i *round_const, const __m128i *shift, + const __m256i *shuffle_src) { + (void)alpha; + int k, iy, row = 0; + __m256i coeff[4]; + prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff); + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m128i src0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m128i src1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i src_01 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); +} + +static INLINE void unpack_weights_and_set_round_const_avx2( + ConvolveParams *conv_params, const int round_bits, const int offset_bits, + __m256i *res_sub_const, __m256i *round_bits_const, 
__m256i *wt) { + *res_sub_const = + _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi16((short)w0); + const __m256i wt1 = _mm256_set1_epi16((short)w1); + *wt = _mm256_unpacklo_epi16(wt0, wt1); +} + +static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta, + int sy, + __m256i *coeffs) { + __m128i filt_00 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_01 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_02 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_03 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + __m128i filt_10 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_11 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_12 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_13 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + __m256i filt_0 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1); + __m256i filt_1 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1); + __m256i filt_2 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1); + __m256i filt_3 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1); + + __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); + __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); + __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); + __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); + + coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1); + coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1); + coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3); + coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3); + + filt_00 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_01 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_02 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_03 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + filt_10 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_11 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_12 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_13 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + filt_0 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1); + filt_1 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1); + filt_2 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1); + 
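// The (sy + delta) row's filters are inserted into the upper 128-bit lanes
+ // here, so each 256-bit multiply-add later covers two output rows at once.
+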
filt_3 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1); + + res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); + res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); + res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); + res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); + + coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1); + coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1); + coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3); + coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3); +} + +static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy, + __m256i *coeffs) { + __m128i filt_00 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_01 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_02 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_03 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00); + __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01); + __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02); + __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03); + + __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); + __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); + __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); + __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); + + coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1); + coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1); + coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3); + coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3); + + filt_00 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_01 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_02 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_03 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + filt_0 = _mm256_broadcastsi128_si256(filt_00); + filt_1 = _mm256_broadcastsi128_si256(filt_01); + filt_2 = _mm256_broadcastsi128_si256(filt_02); + filt_3 = _mm256_broadcastsi128_si256(filt_03); + + res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); + res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); + res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); + res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); + + coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1); + coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1); + coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3); + coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3); +} + +static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy, + __m256i *coeffs) { + const __m128i filt_0 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + const __m128i filt_1 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS))); + + __m256i res_0 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1); + + coeffs[0] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2)); + coeffs[1] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2)); + coeffs[2] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2)); + coeffs[3] = _mm256_shuffle_epi8( + res_0, 
_mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2)); + + coeffs[4] = coeffs[0]; + coeffs[5] = coeffs[1]; + coeffs[6] = coeffs[2]; + coeffs[7] = coeffs[3]; +} + +static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out, + __m256i *src, + __m256i *coeffs, + __m256i *res_lo, + __m256i *res_hi, int row) { + const __m256i src_6 = horz_out[row + 3]; + const __m256i src_7 = + _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21); + + src[6] = _mm256_unpacklo_epi16(src_6, src_7); + + const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]); + const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]); + const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]); + const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]); + + const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_2), + _mm256_add_epi32(res_4, res_6)); + + src[7] = _mm256_unpackhi_epi16(src_6, src_7); + + const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]); + const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]); + const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]); + const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]); + + const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3), + _mm256_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + *res_lo = _mm256_unpacklo_epi32(res_even, res_odd); + *res_hi = _mm256_unpackhi_epi32(res_even, res_odd); +} + +static INLINE void store_vertical_filter_output_avx2( + const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const, + const __m256i *wt, const __m256i *res_sub_const, + const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params, + int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width, + const int round_bits) { + __m256i res_lo_1 = *res_lo; + __m256i res_hi_1 = *res_hi; + + if (conv_params->is_compound) { + __m128i *const p_0 = + (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&conv_params + ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j]; + + res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const), + reduce_bits_vert); + + const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1); + __m256i res_lo_16; + if (conv_params->do_average) { + __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i *const dst8_1 = + (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j]; + const __m128i p_16_0 = _mm_loadl_epi64(p_0); + const __m128i p_16_1 = _mm_loadl_epi64(p_1); + const __m256i p_16 = + _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1); + if (conv_params->use_dist_wtd_comp_avg) { + const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16); + const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt); + const __m256i shifted_32 = + _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32); + } else { + res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1); + } + res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const); + res_lo_16 = _mm256_srai_epi16( + _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits); + const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16); + const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo); + const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1); + *(uint32_t *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0); + *(uint32_t *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1); + } 
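+ // (when not averaging, the else branch keeps the rounded 16-bit partial
+ // result in conv_params->dst for the second compound pass)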
else { + const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16); + const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1); + _mm_storel_epi64(p_0, temp_lo_16_0); + _mm_storel_epi64(p_1, temp_lo_16_1); + } + if (p_width > 4) { + __m128i *const p4_0 = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + __m128i *const p4_1 = + (__m128i *)&conv_params + ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4]; + res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const), + reduce_bits_vert); + const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1); + __m256i res_hi_16; + if (conv_params->do_average) { + __m128i *const dst8_4_0 = + (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; + __m128i *const dst8_4_1 = + (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4]; + const __m128i p4_16_0 = _mm_loadl_epi64(p4_0); + const __m128i p4_16_1 = _mm_loadl_epi64(p4_1); + const __m256i p4_16 = _mm256_inserti128_si256( + _mm256_castsi128_si256(p4_16_0), p4_16_1, 1); + if (conv_params->use_dist_wtd_comp_avg) { + const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16); + const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt); + const __m256i shifted_32 = + _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32); + } else { + res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1); + } + res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const); + res_hi_16 = _mm256_srai_epi16( + _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits); + __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16); + const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi); + const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1); + *(uint32_t *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0); + *(uint32_t *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1); + } else { + const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16); + const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1); + _mm_storel_epi64(p4_0, temp_hi_16_0); + _mm_storel_epi64(p4_1, temp_hi_16_1); + } + } + } else { + const __m256i res_lo_round = _mm256_srai_epi32( + _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert); + const __m256i res_hi_round = _mm256_srai_epi32( + _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert); + + const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round); + const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit); + const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit); + const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j]; + + if (p_width == 4) { + *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit0); + *(uint32_t *)p1 = _mm_cvtsi128_si32(res_8bit1); + } else { + _mm_storel_epi64(p, res_8bit0); + _mm_storel_epi64(p1, res_8bit1); + } + } +} + +static INLINE void warp_vertical_filter_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + int k, row = 0; + __m256i src[8]; + const __m256i src_0 = horz_out[0]; + const __m256i src_1 = + 
_mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); + const __m256i src_2 = horz_out[1]; + const __m256i src_3 = + _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); + const __m256i src_4 = horz_out[2]; + const __m256i src_5 = + _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); + + src[0] = _mm256_unpacklo_epi16(src_0, src_1); + src[2] = _mm256_unpacklo_epi16(src_2, src_3); + src[4] = _mm256_unpacklo_epi16(src_4, src_5); + + src[1] = _mm256_unpackhi_epi16(src_0, src_1); + src[3] = _mm256_unpackhi_epi16(src_2, src_3); + src[5] = _mm256_unpackhi_epi16(src_4, src_5); + + for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { + int sy = sy4 + delta * (k + 4); + __m256i coeffs[8]; + prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs); + __m256i res_lo, res_hi; + filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, + row); + store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, + res_sub_const, round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + src[0] = src[2]; + src[2] = src[4]; + src[4] = src[6]; + src[1] = src[3]; + src[3] = src[5]; + src[5] = src[7]; + + row += 1; + } +} + +static INLINE void warp_vertical_filter_gamma0_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + (void)gamma; + int k, row = 0; + __m256i src[8]; + const __m256i src_0 = horz_out[0]; + const __m256i src_1 = + _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); + const __m256i src_2 = horz_out[1]; + const __m256i src_3 = + _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); + const __m256i src_4 = horz_out[2]; + const __m256i src_5 = + _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); + + src[0] = _mm256_unpacklo_epi16(src_0, src_1); + src[2] = _mm256_unpacklo_epi16(src_2, src_3); + src[4] = _mm256_unpacklo_epi16(src_4, src_5); + + src[1] = _mm256_unpackhi_epi16(src_0, src_1); + src[3] = _mm256_unpackhi_epi16(src_2, src_3); + src[5] = _mm256_unpackhi_epi16(src_4, src_5); + + for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { + int sy = sy4 + delta * (k + 4); + __m256i coeffs[8]; + prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs); + __m256i res_lo, res_hi; + filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, + row); + store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, + res_sub_const, round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + src[0] = src[2]; + src[2] = src[4]; + src[4] = src[6]; + src[1] = src[3]; + src[3] = src[5]; + src[5] = src[7]; + row += 1; + } +} + +static INLINE void warp_vertical_filter_delta0_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + (void)delta; + int k, row = 0; + __m256i src[8], coeffs[8]; + const __m256i src_0 = horz_out[0]; + const __m256i src_1 = + _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); + const __m256i src_2 = horz_out[1]; + const __m256i src_3 = + 
_mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); + const __m256i src_4 = horz_out[2]; + const __m256i src_5 = + _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); + + src[0] = _mm256_unpacklo_epi16(src_0, src_1); + src[2] = _mm256_unpacklo_epi16(src_2, src_3); + src[4] = _mm256_unpacklo_epi16(src_4, src_5); + + src[1] = _mm256_unpackhi_epi16(src_0, src_1); + src[3] = _mm256_unpackhi_epi16(src_2, src_3); + src[5] = _mm256_unpackhi_epi16(src_4, src_5); + + prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs); + + for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { + __m256i res_lo, res_hi; + filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, + row); + store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, + res_sub_const, round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + src[0] = src[2]; + src[2] = src[4]; + src[4] = src[6]; + src[1] = src[3]; + src[3] = src[5]; + src[5] = src[7]; + row += 1; + } +} + +static INLINE void warp_vertical_filter_gamma0_delta0_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + (void)gamma; + int k, row = 0; + __m256i src[8], coeffs[8]; + const __m256i src_0 = horz_out[0]; + const __m256i src_1 = + _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); + const __m256i src_2 = horz_out[1]; + const __m256i src_3 = + _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); + const __m256i src_4 = horz_out[2]; + const __m256i src_5 = + _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); + + src[0] = _mm256_unpacklo_epi16(src_0, src_1); + src[2] = _mm256_unpacklo_epi16(src_2, src_3); + src[4] = _mm256_unpacklo_epi16(src_4, src_5); + + src[1] = _mm256_unpackhi_epi16(src_0, src_1); + src[3] = _mm256_unpackhi_epi16(src_2, src_3); + src[5] = _mm256_unpackhi_epi16(src_4, src_5); + + prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs); + + for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { + __m256i res_lo, res_hi; + filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, + row); + store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, + res_sub_const, round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + src[0] = src[2]; + src[2] = src[4]; + src[4] = src[6]; + src[1] = src[3]; + src[3] = src[5]; + src[5] = src[7]; + row += 1; + } +} + +static INLINE void prepare_warp_vertical_filter_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + if (gamma == 0 && delta == 0) + warp_vertical_filter_gamma0_delta0_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, + i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const, + round_bits_const, wt); + else if (gamma == 0 && delta != 0) + warp_vertical_filter_gamma0_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, + i, j, sy4, reduce_bits_vert, res_add_const, round_bits, 
res_sub_const, + round_bits_const, wt); + else if (gamma != 0 && delta == 0) + warp_vertical_filter_delta0_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, + i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const, + round_bits_const, wt); + else + warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta, + p_height, p_stride, p_width, i, j, sy4, + reduce_bits_vert, res_add_const, round_bits, + res_sub_const, round_bits_const, wt); +} + +static INLINE void prepare_warp_horizontal_filter_avx2( + const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const __m256i *round_const, const __m128i *shift, + const __m256i *shuffle_src) { + if (alpha == 0 && beta == 0) + warp_horizontal_filter_alpha0_beta0_avx2( + ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, + round_const, shift, shuffle_src); + else if (alpha == 0 && beta != 0) + warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4, + alpha, beta, p_height, height, i, + round_const, shift, shuffle_src); + else if (alpha != 0 && beta == 0) + warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4, + alpha, beta, p_height, height, i, + round_const, shift, shuffle_src); + else + warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, round_const, shift, + shuffle_src); +} + +int64_t av1_calc_frame_error_avx2(const uint8_t *const ref, int ref_stride, + const uint8_t *const dst, int p_width, + int p_height, int dst_stride) { + int64_t sum_error = 0; + int i, j; + __m256i row_error, col_error; + __m256i zero = _mm256_set1_epi16(0); + __m256i dup_255 = _mm256_set1_epi16(255); + col_error = zero; + + for (i = 0; i < (p_height / 4); i++) { + row_error = _mm256_set1_epi16(0); + for (j = 0; j < (p_width / 16); j++) { + __m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128( + (__m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride)))); + __m256i dst_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128( + (__m128i *)(dst + (j * 16) + (((i * 4) + 0) * dst_stride)))); + __m256i ref_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128( + (__m128i *)(ref + (j * 16) + (((i * 4) + 1) * ref_stride)))); + __m256i dst_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128( + (__m128i *)(dst + (j * 16) + (((i * 4) + 1) * dst_stride)))); + __m256i ref_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128( + (__m128i *)(ref + (j * 16) + (((i * 4) + 2) * ref_stride)))); + __m256i dst_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128( + (__m128i *)(dst + (j * 16) + (((i * 4) + 2) * dst_stride)))); + __m256i ref_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128( + (__m128i *)(ref + (j * 16) + (((i * 4) + 3) * ref_stride)))); + __m256i dst_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128( + (__m128i *)(dst + (j * 16) + (((i * 4) + 3) * dst_stride)))); + + __m256i diff_1 = + _mm256_add_epi16(_mm256_sub_epi16(dst_1_16, ref_1_16), dup_255); + __m256i diff_2 = + _mm256_add_epi16(_mm256_sub_epi16(dst_2_16, ref_2_16), dup_255); + __m256i diff_3 = + _mm256_add_epi16(_mm256_sub_epi16(dst_3_16, ref_3_16), dup_255); + __m256i diff_4 = + _mm256_add_epi16(_mm256_sub_epi16(dst_4_16, ref_4_16), dup_255); + + __m256i diff_1_lo = _mm256_unpacklo_epi16(diff_1, zero); + __m256i diff_1_hi = _mm256_unpackhi_epi16(diff_1, zero); + __m256i diff_2_lo = _mm256_unpacklo_epi16(diff_2, zero); + __m256i diff_2_hi = _mm256_unpackhi_epi16(diff_2, zero); + __m256i diff_3_lo = _mm256_unpacklo_epi16(diff_3, zero); + 
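+      // The unpacklo/unpackhi with 'zero' above and below zero-extend each
+      // 16-bit diff to a 32-bit lane, as required by the 4-byte-scale LUT
+      // gathers that follow. Per pixel this conceptually computes
+      //   sum += error_measure_lut[255 + dst[x] - ref[x]];
+      // the lane reordering that the unpacks introduce is harmless because
+      // the per-pixel errors are simply summed afterwards.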
__m256i diff_3_hi = _mm256_unpackhi_epi16(diff_3, zero); + __m256i diff_4_lo = _mm256_unpacklo_epi16(diff_4, zero); + __m256i diff_4_hi = _mm256_unpackhi_epi16(diff_4, zero); + + __m256i error_1_lo = + _mm256_i32gather_epi32(error_measure_lut, diff_1_lo, 4); + __m256i error_1_hi = + _mm256_i32gather_epi32(error_measure_lut, diff_1_hi, 4); + __m256i error_2_lo = + _mm256_i32gather_epi32(error_measure_lut, diff_2_lo, 4); + __m256i error_2_hi = + _mm256_i32gather_epi32(error_measure_lut, diff_2_hi, 4); + __m256i error_3_lo = + _mm256_i32gather_epi32(error_measure_lut, diff_3_lo, 4); + __m256i error_3_hi = + _mm256_i32gather_epi32(error_measure_lut, diff_3_hi, 4); + __m256i error_4_lo = + _mm256_i32gather_epi32(error_measure_lut, diff_4_lo, 4); + __m256i error_4_hi = + _mm256_i32gather_epi32(error_measure_lut, diff_4_hi, 4); + + __m256i error_1 = _mm256_add_epi32(error_1_lo, error_1_hi); + __m256i error_2 = _mm256_add_epi32(error_2_lo, error_2_hi); + __m256i error_3 = _mm256_add_epi32(error_3_lo, error_3_hi); + __m256i error_4 = _mm256_add_epi32(error_4_lo, error_4_hi); + + __m256i error_1_2 = _mm256_add_epi32(error_1, error_2); + __m256i error_3_4 = _mm256_add_epi32(error_3, error_4); + + __m256i error_1_2_3_4 = _mm256_add_epi32(error_1_2, error_3_4); + row_error = _mm256_add_epi32(row_error, error_1_2_3_4); + } + __m256i col_error_lo = _mm256_unpacklo_epi32(row_error, zero); + __m256i col_error_hi = _mm256_unpackhi_epi32(row_error, zero); + __m256i col_error_temp = _mm256_add_epi64(col_error_lo, col_error_hi); + col_error = _mm256_add_epi64(col_error, col_error_temp); + // Error summation for remaining width, which is not multiple of 16 + if (p_width & 0xf) { + for (int k = 0; k < 4; ++k) { + for (int l = j * 16; l < p_width; ++l) { + sum_error += + (int64_t)error_measure(dst[l + ((i * 4) + k) * dst_stride] - + ref[l + ((i * 4) + k) * ref_stride]); + } + } + } + } + __m128i sum_error_q_0 = _mm256_castsi256_si128(col_error); + __m128i sum_error_q_1 = _mm256_extracti128_si256(col_error, 1); + sum_error_q_0 = _mm_add_epi64(sum_error_q_0, sum_error_q_1); + int64_t sum_error_d_0, sum_error_d_1; + xx_storel_64(&sum_error_d_0, sum_error_q_0); + xx_storel_64(&sum_error_d_1, _mm_srli_si128(sum_error_q_0, 8)); + sum_error = (sum_error + sum_error_d_0 + sum_error_d_1); + // Error summation for remaining height, which is not multiple of 4 + if (p_height & 0x3) { + for (int k = i * 4; k < p_height; ++k) { + for (int l = 0; l < p_width; ++l) { + sum_error += (int64_t)error_measure(dst[l + k * dst_stride] - + ref[l + k * ref_stride]); + } + } + } + return sum_error; +} + +void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + __m256i horz_out[8]; + int i, j, k; + const int bd = 8; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? 
conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const __m256i reduce_bits_vert_const = + _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + const __m256i round_const = _mm256_set1_epi16( + (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1)); + const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz); + + __m256i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, + &wt); + + __m256i res_add_const_1; + if (conv_params->is_compound == 1) { + res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const); + } else { + res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + } + const int32_t const1 = alpha * (-4) + beta * (-4) + + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + const int32_t const2 = gamma * (-4) + delta * (-4) + + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1); + const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)); + const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz)); + + __m256i shuffle_src[4]; + shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0); + shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1); + shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2); + shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3); + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += const1; + sy4 += const2; + + sx4 &= ~const3; + sy4 &= ~const3; + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
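+      // (Each output pixel reads source columns ix4 - 7 .. ix4 + 7 before
+      // clamping, so ix4 <= -7 pins every tap to column 0 and
+      // ix4 >= width + 6 pins every tap to column width - 1. Also note that
+      // horz_out[] packs two filtered rows per 256-bit register, one row per
+      // 128-bit half, which is why the row loops below step by k += 2 and
+      // handle the odd 15th row separately after the loop.)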
+ + if (ix4 <= -7) { + int iy, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i temp_0 = + _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m256i temp_1 = + _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + } else if (ix4 >= width + 6) { + int iy, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i temp_0 = _mm256_set1_epi16( + const4 + ref[iy * stride + (width - 1)] * const5); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m256i temp_1 = _mm256_set1_epi16( + const4 + ref[iy * stride + (width - 1)] * const5); + horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + horz_out[row] = + _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5); + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + int iy, sx, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + __m128i src0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + __m128i src1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src0 = _mm_shuffle_epi8(src0, shuffle_reg_left); + src1 = _mm_shuffle_epi8(src1, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = _mm_loadu_si128( + (__m128i *)warp_pad_right[out_of_boundary_right]); + src0 = _mm_shuffle_epi8(src0, shuffle_reg_right); + src1 = _mm_shuffle_epi8(src1, shuffle_reg_right); + } + sx = sx4 + beta * (k + 4); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1); + horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, + shuffle_src, &round_const, &shift); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src = _mm_shuffle_epi8(src, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = + _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]); + src = _mm_shuffle_epi8(src, shuffle_reg_right); + } + sx = sx4 + beta * (k + 4); + const __m256i src_01 = _mm256_castsi128_si256(src); + __m256i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx, coeff); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, + &round_const, &shift, row); + } else { + prepare_warp_horizontal_filter_avx2( + ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, + i, &round_const, &shift, shuffle_src); + } + + // Vertical filter + prepare_warp_vertical_filter_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, + p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, + 
&res_sub_const, &round_bits_const, &wt); + } + } +} diff --git a/libs/libaom/src/av1/common/x86/warp_plane_sse2.c b/libs/libaom/src/av1/common/x86/warp_plane_sse2.c new file mode 100644 index 000000000..6ff666518 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/warp_plane_sse2.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/warped_motion.h" +#include "config/av1_rtcd.h" + +int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int ref_stride, + const uint8_t *const dst, int p_width, + int p_height, int dst_stride) { + int64_t sum_error = 0; + int i, j; + __m128i row_error, col_error; + __m128i zero = _mm_set1_epi16(0); + __m128i dup_255 = _mm_set1_epi16(255); + col_error = zero; + for (i = 0; i < (p_height); i++) { + row_error = zero; + for (j = 0; j < (p_width / 16); j++) { + __m128i ref_8 = + _mm_load_si128((__m128i *)(ref + (j * 16) + (i * ref_stride))); + __m128i dst_8 = + _mm_load_si128((__m128i *)(dst + (j * 16) + (i * dst_stride))); + __m128i ref_16_lo = _mm_unpacklo_epi8(ref_8, zero); + __m128i ref_16_hi = _mm_unpackhi_epi8(ref_8, zero); + __m128i dst_16_lo = _mm_unpacklo_epi8(dst_8, zero); + __m128i dst_16_hi = _mm_unpackhi_epi8(dst_8, zero); + + __m128i diff_1 = + _mm_add_epi16(_mm_sub_epi16(dst_16_lo, ref_16_lo), dup_255); + __m128i diff_2 = + _mm_add_epi16(_mm_sub_epi16(dst_16_hi, ref_16_hi), dup_255); + + __m128i error_1_lo = + _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 3)], + error_measure_lut[_mm_extract_epi16(diff_1, 2)], + error_measure_lut[_mm_extract_epi16(diff_1, 1)], + error_measure_lut[_mm_extract_epi16(diff_1, 0)]); + __m128i error_1_hi = + _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 7)], + error_measure_lut[_mm_extract_epi16(diff_1, 6)], + error_measure_lut[_mm_extract_epi16(diff_1, 5)], + error_measure_lut[_mm_extract_epi16(diff_1, 4)]); + __m128i error_2_lo = + _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 3)], + error_measure_lut[_mm_extract_epi16(diff_2, 2)], + error_measure_lut[_mm_extract_epi16(diff_2, 1)], + error_measure_lut[_mm_extract_epi16(diff_2, 0)]); + __m128i error_2_hi = + _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 7)], + error_measure_lut[_mm_extract_epi16(diff_2, 6)], + error_measure_lut[_mm_extract_epi16(diff_2, 5)], + error_measure_lut[_mm_extract_epi16(diff_2, 4)]); + + __m128i error_1 = _mm_add_epi32(error_1_lo, error_1_hi); + __m128i error_2 = _mm_add_epi32(error_2_lo, error_2_hi); + __m128i error_1_2 = _mm_add_epi32(error_1, error_2); + + row_error = _mm_add_epi32(row_error, error_1_2); + } + __m128i col_error_lo = _mm_unpacklo_epi32(row_error, zero); + __m128i col_error_hi = _mm_unpackhi_epi32(row_error, zero); + __m128i col_error_temp = _mm_add_epi64(col_error_lo, col_error_hi); + col_error = _mm_add_epi64(col_error, col_error_temp); + // Error summation for remaining width, which is not multiple of 16 + if (p_width & 0xf) { + for (int l = j * 16; l < p_width; ++l) { + sum_error += 
(int64_t)error_measure(dst[l + i * dst_stride] - + ref[l + i * ref_stride]); + } + } + } + int64_t sum_error_d_0, sum_error_d_1; + xx_storel_64(&sum_error_d_0, col_error); + xx_storel_64(&sum_error_d_1, _mm_srli_si128(col_error, 8)); + sum_error = (sum_error + sum_error_d_0 + sum_error_d_1); + return sum_error; +} diff --git a/libs/libaom/src/av1/common/x86/warp_plane_sse4.c b/libs/libaom/src/av1/common/x86/warp_plane_sse4.c new file mode 100644 index 000000000..10ddf92d0 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/warp_plane_sse4.c @@ -0,0 +1,963 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> +#include <smmintrin.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/warped_motion.h" + +/* This is a modified version of 'av1_warped_filter' from warped_motion.c: + * Each coefficient is stored in 8 bits instead of 16 bits + * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7 + + This is done in order to avoid overflow: Since the tap with the largest + coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation + order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular + convolve functions. + + Instead, we use the summation order + ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)). + The rearrangement of coefficients in this table is so that we can get the + coefficients into the correct order more quickly.
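+
+   As a concrete example, the table row { 0, -3, 4, 1, 1, 127, -2, 0} below
+   stores the taps in the order 0, 2, 4, 6, 1, 3, 5, 7, i.e. c2 = -3 and
+   c3 = 127. With the adjacent pairing (2 + 3), |c2| + |c3| = 130, so a
+   255-valued pixel could saturate the signed 16-bit result of
+   _mm_maddubs_epi16 (255 * 130 > 32767). With the (1 + 3) pairing used
+   here, |c1| + |c3| = 128 and the pair of products still fits.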
+*/ +/* clang-format off */ +DECLARE_ALIGNED(8, const int8_t, + av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { +#if WARPEDPIXEL_PREC_BITS == 6 + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0}, + { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0}, + { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0}, + { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0}, + { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0}, + { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0}, + { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0}, + { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0}, + { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0}, + { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0}, + { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0}, + { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0}, + { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0}, + { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0}, + { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0}, + { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0}, + { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0}, + { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0}, + { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1}, + {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1}, + {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1}, + {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2}, + {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2}, + {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2}, + {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2}, + {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2}, + {-2, -21, 84, 8, 8, 74, -21, 
-2}, {-2, -21, 87, 8, 8, 72, -22, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2}, + {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2}, + {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2}, + {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2}, + {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2}, + {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1}, + {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0}, + { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0}, + { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1}, + { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1}, + { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2}, + { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3}, + { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3}, + { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4}, + { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4}, + { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4}, + { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4}, + { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4}, + { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4}, + { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3}, + { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2}, + { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2}, + { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1}, + { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0}, + // dummy (replicate row index 191) + { 0, 0, 2, -1, 0, 0, 127, 0}, + +#else + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0}, + { 3, 
111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1}, + // dummy (replicate row index 95) + { 0, 0, 4, -3, 0, -1, 127, 1}, +#endif // WARPEDPIXEL_PREC_BITS == 6 +}; +/* clang-format on */ + +// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15 +// in an SSE register into two sequences: +// 0, 2, 2, 4, ..., 12, 12, 14, +// 1, 3, 3, 5, ..., 13, 13, 15, +DECLARE_ALIGNED(16, static const uint8_t, + even_mask[16]) = { 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 0 }; + +DECLARE_ALIGNED(16, static const uint8_t, + odd_mask[16]) = { 1, 3, 3, 5, 5, 
7, 7, 9, + 9, 11, 11, 13, 13, 15, 15, 0 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3, + 2, 3, 2, 3, 2, 3, 2, 3 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5, + 4, 5, 4, 5, 4, 5, 4, 5 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7, + 6, 7, 6, 7, 6, 7, 6, 7 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7, + 4, 5, 6, 7, 4, 5, 6, 7 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11, 8, 9, 10, 11 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15, + 12, 13, 14, 15, 12, 13, 14, 15 }; + +static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff, + const int offset_bits_horiz, + const int reduce_bits_horiz, int k) { + const __m128i src_even = + _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask)); + const __m128i src_odd = + _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask)); + // The pixel order we need for 'src' is: + // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9 + const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd); + const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]); + // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13 + const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4), + _mm_srli_si128(src_odd, 4)); + const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]); + // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10 + const __m128i src_13 = + _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2)); + const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]); + // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14 + const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4), + _mm_srli_si128(src_even, 6)); + const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]); + + const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) + + ((1 << reduce_bits_horiz) >> 1)); + + // Note: The values res_02 + res_46 and res_13 + res_57 both + // fit into int16s at this point, but their sum may be too wide to fit + // into an int16. However, once we also add round_const, the sum of + // all of these fits into a uint16. + // + // The wrapping behaviour of _mm_add_* is used here to make sure we + // get the correct result despite converting between different + // (implicit) types. 
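+  // (With bd == 8, offset_bits_horiz == bd + FILTER_BITS - 1 == 14, so
+  // round_const >= 1 << 14, which is enough to bring the most negative
+  // possible partial sum back into the non-negative range.)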
+ const __m128i res_even = _mm_add_epi16(res_02, res_46); + const __m128i res_odd = _mm_add_epi16(res_13, res_57); + const __m128i res = + _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const); + tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz)); +} + +static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, + __m128i *coeff) { + // Filter even-index pixels + const __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); + + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2 + const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2); + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3 + const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3); + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6 + const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6); + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7 + const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7); + + // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6 + const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10); + // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6 + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10); + // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7 + const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11); + // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7 + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11); + + // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 + coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14); + // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 + coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14); + // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 + coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15); + // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 + coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15); +} + +static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx, + __m128i *coeff) { + // Filter even-index pixels + const __m128i tmp_0 = + _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); + + // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 + coeff[0] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01)); + // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 + coeff[1] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23)); + // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 + coeff[2] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45)); + // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 + coeff[3] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67)); +} + +static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx, + int alpha, int k, + const int offset_bits_horiz, + const int 
reduce_bits_horiz) { + __m128i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx, coeff); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); +} + +static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp, + int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, + int p_height, int height, int i, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } +} + +static INLINE void warp_horizontal_filter_alpha0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)alpha; + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + + __m128i coeff[4]; + prepare_horizontal_filter_coeff_alpha0(sx, coeff); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void warp_horizontal_filter_beta0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + int k; + __m128i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void warp_horizontal_filter_alpha0_beta0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + (void)alpha; + int k; + + __m128i coeff[4]; + prepare_horizontal_filter_coeff_alpha0(sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void unpack_weights_and_set_round_const( + ConvolveParams *conv_params, const int round_bits, const int offset_bits, + __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) { + *res_sub_const = + _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16((int16_t)w0); + const __m128i wt1 = _mm_set1_epi16((int16_t)w1); + *wt = 
_mm_unpacklo_epi16(wt0, wt1); +} + +static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy, + __m128i *coeffs) { + const __m128i tmp_0 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + // even coeffs + coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10); + coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10); + coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14); + coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i tmp_1 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + // odd coeffs + coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11); + coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11); + coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15); + coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15); +} + +static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy, + __m128i *coeffs) { + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + // even coeffs + coeffs[0] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0)); + coeffs[1] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1)); + coeffs[2] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2)); + coeffs[3] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3)); + + // odd coeffs + coeffs[4] = coeffs[0]; + coeffs[5] = coeffs[1]; + coeffs[6] = coeffs[2]; + coeffs[7] = coeffs[3]; +} + +static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs, + __m128i *res_lo, __m128i *res_hi, + int k) { + // Load from tmp and rearrange pairs of consecutive rows into the + // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 + const __m128i *src = tmp + (k + 4); + const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]); + const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]); + const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]); + const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]); + + const __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_0, res_2), 
_mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]); + const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]); + const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]); + const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]); + + const __m128i res_odd = + _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + *res_lo = _mm_unpacklo_epi32(res_even, res_odd); + *res_hi = _mm_unpackhi_epi32(res_even, res_odd); +} + +static INLINE void store_vertical_filter_output( + __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const, + const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const, + uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k, + const int reduce_bits_vert, int p_stride, int p_width, + const int round_bits) { + __m128i res_lo_1 = *res_lo; + __m128i res_hi_1 = *res_hi; + + if (conv_params->is_compound) { + __m128i *const p = + (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j]; + res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const), + reduce_bits_vert); + const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1); + __m128i res_lo_16; + if (conv_params->do_average) { + __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + const __m128i p_16 = _mm_loadl_epi64(p); + + if (conv_params->use_dist_wtd_comp_avg) { + const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16); + const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1); + } + + res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const); + + res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const), + round_bits); + __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16); + *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo); + } else { + _mm_storel_epi64(p, temp_lo_16); + } + if (p_width > 4) { + __m128i *const p4 = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const), + reduce_bits_vert); + const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1); + __m128i res_hi_16; + + if (conv_params->do_average) { + __m128i *const dst8_4 = + (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; + const __m128i p4_16 = _mm_loadl_epi64(p4); + + if (conv_params->use_dist_wtd_comp_avg) { + const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16); + const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1); + } + res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const); + + res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const), + round_bits); + __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16); + *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi); + + } else { + _mm_storel_epi64(p4, temp_hi_16); + } + } + } else { + const __m128i 
res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert); + + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very careful + // to only output 4 pixels at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. + if (p_width == 4) { + *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); + } else { + _mm_storel_epi64(p, res_8bit); + } + } +} + +static INLINE void warp_vertical_filter( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs(gamma, sy, coeffs); + + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_gamma0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + int k; + (void)gamma; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs_gamma0(sy, coeffs); + + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_delta0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + (void)delta; + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs(gamma, sy4, coeffs); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, 
+ &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_gamma0_delta0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + (void)delta; + (void)gamma; + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs_gamma0(sy4, coeffs); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void prepare_warp_vertical_filter( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + if (gamma == 0 && delta == 0) + warp_vertical_filter_gamma0_delta0( + pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, + sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits); + else if (gamma == 0 && delta != 0) + warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); + else if (gamma != 0 && delta == 0) + warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); + else + warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); +} + +static INLINE void prepare_warp_horizontal_filter( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + if (alpha == 0 && beta == 0) + warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + else if (alpha == 0 && beta != 0) + warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); + else if (alpha != 0 && beta == 0) + warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); + else + warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); +} + +void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + __m128i tmp[15]; + int i, j, k; + const int bd = 8; + const int 
reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const __m128i reduce_bits_vert_const = + _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + /* Note: For this code to work, the left/right frame borders need to be + extended by at least 13 pixels each. By the time we get here, other + code will have set up this border, but we allow an explicit check + for debugging purposes. + */ + /*for (i = 0; i < height; ++i) { + for (j = 0; j < 13; ++j) { + assert(ref[i * stride - 13 + j] == ref[i * stride]); + assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); + } + }*/ + __m128i res_add_const_1; + if (conv_params->is_compound == 1) { + res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const); + } else { + res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + } + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
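In that degenerate case every tap reads the same edge pixel, and since the warp taps sum to 1 << FILTER_BITS the whole intermediate row collapses to a single constant. A minimal scalar sketch of that constant (the helper and parameter names are illustrative; it mirrors the _mm_set1_epi16 expression used in the two boundary branches below):

#include <stdint.h>

// Constant produced by the horizontal stage when all taps clamp to one
// column: the rounding offset plus the edge pixel scaled by the taps'
// total weight, both pre-shifted down by reduce_bits_horiz.
static int16_t boundary_row_constant(int bd, int filter_bits,
                                     int reduce_bits_horiz,
                                     uint8_t edge_pixel) {
  const int offset = 1 << (bd + filter_bits - reduce_bits_horiz - 1);
  return (int16_t)(offset +
                   edge_pixel * (1 << (filter_bits - reduce_bits_horiz)));
}

The ix4 <= -7 and ix4 >= width + 6 branches below fill tmp[] with exactly this value instead of running the 8-tap filter.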
+ if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = + _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src = _mm_shuffle_epi8(src, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = _mm_loadu_si128( + (__m128i *)warp_pad_right[out_of_boundary_right]); + src = _mm_shuffle_epi8(src, shuffle_reg_right); + } + horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } + } else { + prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + } + + // Vertical filter + prepare_warp_vertical_filter( + pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, + j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits); + } + } +} diff --git a/libs/libaom/src/av1/common/x86/wiener_convolve_avx2.c b/libs/libaom/src/av1/common/x86/wiener_convolve_avx2.c new file mode 100644 index 000000000..b7ac68383 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/wiener_convolve_avx2.c @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include <assert.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +// 128-bit xmmwords are written as [ ... ] with the MSB on the left. +// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB +// on the left. +// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be +// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ]. + +// Exploiting the range of wiener filter coefficients, +// horizontal filtering can be done in 16 bit intermediate precision.
+// The details are as follows : +// Consider the horizontal wiener filter coefficients of the following form : +// [C0, C1, C2, 2^(FILTER_BITS) -2 * (C0 + C1 + C2), C2, C1, C0] +// Subtracting 2^(FILTER_BITS) from the centre tap we get the following : +// [C0, C1, C2, -2 * (C0 + C1 + C2), C2, C1, C0] +// The sum of the product "C0 * p0 + C1 * p1 + C2 * p2 -2 * (C0 + C1 + C2) * p3 +// + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16 bit +// precision. Finally, after rounding the above result by round_0, we multiply +// the centre pixel by 2^(FILTER_BITS - round_0) and add it to get the +// horizontal filter output. + +void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const ConvolveParams *conv_params) { + const int bd = 8; + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + (void)x_step_q4; + (void)y_step_q4; + + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]); + int im_h = h + SUBPEL_TAPS - 2; + int im_stride = 8; + memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE); + int i, j; + const int center_tap = (SUBPEL_TAPS - 1) / 2; + const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; + + __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center; + + assert(conv_params->round_0 > 0); + + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2); + + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x); + const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs_h[0] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u)); + // coeffs 2 3 2 3 2 3 2 3 + coeffs_h[1] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u)); + // coeffs 4 5 4 5 4 5 4 5 + coeffs_h[2] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u)); + // coeffs 6 7 6 7 6 7 6 7 + coeffs_h[3] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu)); + + const __m256i round_const_h = + _mm256_set1_epi16((1 << (conv_params->round_0 - 1))); + const __m256i round_const_horz = + _mm256_set1_epi16((1 << (bd + FILTER_BITS - conv_params->round_0 - 1))); + const __m256i clamp_low = _mm256_setzero_si256(); + const __m256i clamp_high = + _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); + const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0); + + // Add an offset to account for the "add_src" part of the convolve function. 
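The 16-bit headroom claimed at the top of this file can be spot-checked by brute force: the sum is linear in every tap and every pixel, so evaluating the corner values is sufficient. A standalone sketch follows (the tap ranges are assumptions taken from libaom's restoration.h, worth re-checking there; with these bounds the worst case comes out at ±32640, just inside int16):

#include <stdio.h>

// Brute-force the extremes of
//   C0*(p0+p6) + C1*(p1+p5) + C2*(p2+p4) - 2*(C0+C1+C2)*p3
// over the assumed wiener tap ranges and 8-bit pixel values.
int main(void) {
  const int c0s[2] = { -5, 10 };   // WIENER_FILT_TAP0_{MINV,MAXV} (assumed)
  const int c1s[2] = { -23, 8 };   // WIENER_FILT_TAP1_{MINV,MAXV} (assumed)
  const int c2s[2] = { -17, 46 };  // WIENER_FILT_TAP2_{MINV,MAXV} (assumed)
  int vmin = 0, vmax = 0;
  for (int a = 0; a < 2; ++a) {
    for (int b = 0; b < 2; ++b) {
      for (int c = 0; c < 2; ++c) {
        for (int p = 0; p < 128; ++p) {  // 7 pixels, each 0 or 255
          int px[7];
          for (int t = 0; t < 7; ++t) px[t] = ((p >> t) & 1) ? 255 : 0;
          const int c0 = c0s[a], c1 = c1s[b], c2 = c2s[c];
          const int v = c0 * (px[0] + px[6]) + c1 * (px[1] + px[5]) +
                        c2 * (px[2] + px[4]) - 2 * (c0 + c1 + c2) * px[3];
          if (v < vmin) vmin = v;
          if (v > vmax) vmax = v;
        }
      }
    }
  }
  printf("worst case [%d, %d]; int16_t holds [-32768, 32767]\n", vmin, vmax);
  return 0;
}

Returning to the kernel: the offset_0 vector built just below restores the 1 << FILTER_BITS bias that the stored centre tap omits, which is what the "add_src" comment above refers to.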
+ const __m128i zero_128 = _mm_setzero_si128(); + const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); + const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0); + + const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff); + + const __m256i round_const_v = + _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + for (j = 0; j < w; j += 8) { + for (i = 0; i < im_h; i += 2) { + __m256i data = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + + // Load the next line + if (i + 1 < im_h) + data = _mm256_inserti128_si256( + data, + _mm_loadu_si128( + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), + 1); + + __m256i res = convolve_lowbd_x(data, coeffs_h, filt); + + res = + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); + + __m256i data_0 = _mm256_shuffle_epi8(data, filt_center); + + // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to + // the result + data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0); + res = _mm256_add_epi16(res, data_0); + res = _mm256_add_epi16(res, round_const_horz); + const __m256i res_clamped = + _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high); + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped); + } + + /* Vertical filter */ + { + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); + __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + __m256i s[8]; + s[0] = _mm256_unpacklo_epi16(src_0, src_1); + s[1] = _mm256_unpacklo_epi16(src_2, src_3); + s[2] = _mm256_unpacklo_epi16(src_4, src_5); + + s[4] = _mm256_unpackhi_epi16(src_0, src_1); + s[5] = _mm256_unpackhi_epi16(src_2, src_3); + s[6] = _mm256_unpackhi_epi16(src_4, src_5); + + for (i = 0; i < h - 1; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + __m256i res_a = convolve(s, coeffs_v); + __m256i res_b = convolve(s + 4, coeffs_v); + + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_v), round_shift_v); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_v), round_shift_v); + + /* rounding code */ + // 16 bit conversion + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + // 8 bit conversion and saturation to uint8 + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = 
_mm256_extracti128_si256(res_8b, 1); + + // Store values into the destination buffer + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; + + _mm_storel_epi64(p_0, res_0); + _mm_storel_epi64(p_1, res_1); + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + if (h - i) { + s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20); + s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20); + s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20); + + const int16_t *data = &im_block[i * im_stride]; + const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride)); + const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride)); + + __m128i s3 = _mm_unpacklo_epi16(s6_, s7_); + __m128i s7 = _mm_unpackhi_epi16(s6_, s7_); + + s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1); + __m256i convolveres = convolve(s, coeffs_v); + + const __m256i res_round = _mm256_sra_epi32( + _mm256_add_epi32(convolveres, round_const_v), round_shift_v); + + /* rounding code */ + // 16 bit conversion + __m128i reslo = _mm256_castsi256_si128(res_round); + __m128i reshi = _mm256_extracti128_si256(res_round, 1); + const __m128i res_16bit = _mm_packus_epi32(reslo, reshi); + + // 8 bit conversion and saturation to uint8 + const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit); + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + _mm_storel_epi64(p_0, res_8b); + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/wiener_convolve_sse2.c b/libs/libaom/src/av1/common/x86/wiener_convolve_sse2.c new file mode 100644 index 000000000..f9d00b733 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/wiener_convolve_sse2.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> +#include <assert.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" + +void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const ConvolveParams *conv_params) { + const int bd = 8; + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + (void)x_step_q4; + (void)y_step_q4; + + DECLARE_ALIGNED(16, uint16_t, + temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + int intermediate_height = h + SUBPEL_TAPS - 2; + memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); + int i, j; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero = _mm_setzero_si128(); + // Add an offset to account for the "add_src" part of the convolve function.
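In this SSE2 version the offset is folded into both the horizontal and the vertical tap sets. A scalar view of the adjustment (the helper name is illustrative; FILTER_BITS is 7 in aom_dsp/aom_filter.h):

#include <stdint.h>

#define FILTER_BITS 7  // as defined in aom_dsp/aom_filter.h

// The stored Wiener taps carry the centre tap minus 1 << FILTER_BITS so
// that intermediate sums stay narrow; adding it back makes one convolution
// pass return the source pixel plus the filtered correction ("add_src").
static void restore_center_tap(int16_t taps[8]) {
  taps[3] += (int16_t)(1 << FILTER_BITS);  // lane 3 = centre of the 7-tap kernel
}

The _mm_insert_epi16 below builds exactly this bias as a vector, placing 1 << FILTER_BITS in lane 3 and zeros elsewhere.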
+ const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); + + /* Horizontal filter */ + { + const __m128i coeffs_x = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); + + for (i = 0; i < intermediate_height; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + // Filter even-index pixels + const __m128i src_0 = _mm_unpacklo_epi8(data, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), + conv_params->round_0); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), + conv_params->round_0); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + res = _mm_min_epi16( + _mm_max_epi16(res, zero), + _mm_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1)); + _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); + } + } + } + + /* Vertical filter */ + { + const __m128i coeffs_y = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + 
conv_params->round_1 - 1))); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), conv_params->round_1); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), conv_params->round_1); + + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); + + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storel_epi64(p, res_8bit); + } + } + } +} diff --git a/libs/libaom/src/av1/decoder/accounting.c b/libs/libaom/src/av1/decoder/accounting.c new file mode 100644 index 000000000..2e58d09e0 --- /dev/null +++ b/libs/libaom/src/av1/decoder/accounting.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +#include "aom/aom_integer.h" +#include "av1/decoder/accounting.h" + +static int accounting_hash(const char *str) { + uint32_t val; + const unsigned char *ustr; + val = 0; + ustr = (const unsigned char *)str; + /* This is about the worst hash one can design, but it should be good enough + here. */ + while (*ustr) val += *ustr++; + return val % AOM_ACCOUNTING_HASH_SIZE; +} + +/* Dictionary lookup based on an open-addressing hash table. */ +int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str) { + int hash; + size_t len; + AccountingDictionary *dictionary; + dictionary = &accounting->syms.dictionary; + hash = accounting_hash(str); + while (accounting->hash_dictionary[hash] != -1) { + if (strcmp(dictionary->strs[accounting->hash_dictionary[hash]], str) == 0) { + return accounting->hash_dictionary[hash]; + } + hash++; + if (hash == AOM_ACCOUNTING_HASH_SIZE) hash = 0; + } + /* No match found. */ + assert(dictionary->num_strs + 1 < MAX_SYMBOL_TYPES); + accounting->hash_dictionary[hash] = dictionary->num_strs; + len = strlen(str); + dictionary->strs[dictionary->num_strs] = malloc(len + 1); + snprintf(dictionary->strs[dictionary->num_strs], len + 1, "%s", str); + dictionary->num_strs++; + return dictionary->num_strs - 1; +} + +void aom_accounting_init(Accounting *accounting) { + int i; + accounting->num_syms_allocated = 1000; + accounting->syms.syms = + malloc(sizeof(AccountingSymbol) * accounting->num_syms_allocated); + accounting->syms.dictionary.num_strs = 0; + assert(AOM_ACCOUNTING_HASH_SIZE > 2 * MAX_SYMBOL_TYPES); + for (i = 0; i < AOM_ACCOUNTING_HASH_SIZE; i++) + accounting->hash_dictionary[i] = -1; + aom_accounting_reset(accounting); +} + +void aom_accounting_reset(Accounting *accounting) { + accounting->syms.num_syms = 0; + accounting->syms.num_binary_syms = 0; + accounting->syms.num_multi_syms = 0; + accounting->context.x = -1; + accounting->context.y = -1; + accounting->last_tell_frac = 0; +} + +void aom_accounting_clear(Accounting *accounting) { + int i; + AccountingDictionary *dictionary; + free(accounting->syms.syms); + dictionary = &accounting->syms.dictionary; + for (i = 0; i < dictionary->num_strs; i++) { + free(dictionary->strs[i]); + } +} + +void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y) { + accounting->context.x = x; + accounting->context.y = y; +} + +void aom_accounting_record(Accounting *accounting, const char *str, + uint32_t bits) { + AccountingSymbol sym; + // Reuse previous symbol if it has the same context and symbol id.
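Taken together, the functions in this file form a small bit-accounting API. A hypothetical usage sketch (the "read_mv" label is made up for illustration; the bits argument is in 1/8-bit units, matching AOM_ACCT_BITRES):

#include "av1/decoder/accounting.h"

void example_accounting_session(void) {
  Accounting acct;
  aom_accounting_init(&acct);
  aom_accounting_set_context(&acct, /*x=*/0, /*y=*/0);  // current block
  // Record 12.5 bits spent on a hypothetical "read_mv" symbol:
  // 12.5 * 8 == 100 eighth-bit units.
  aom_accounting_record(&acct, "read_mv", 100);
  aom_accounting_dump(&acct);   // prints per-symbol totals as bits / 8.0
  aom_accounting_clear(&acct);  // frees the symbol array and dictionary
}

The merging logic just below implements the reuse check named in the comment above: consecutive records with the same context and symbol id are accumulated into one entry.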
+ if (accounting->syms.num_syms) { + AccountingSymbol *last_sym; + last_sym = &accounting->syms.syms[accounting->syms.num_syms - 1]; + if (memcmp(&last_sym->context, &accounting->context, + sizeof(AccountingSymbolContext)) == 0) { + uint32_t id; + id = aom_accounting_dictionary_lookup(accounting, str); + if (id == last_sym->id) { + last_sym->bits += bits; + last_sym->samples++; + return; + } + } + } + sym.context = accounting->context; + sym.samples = 1; + sym.bits = bits; + sym.id = aom_accounting_dictionary_lookup(accounting, str); + assert(sym.id <= 255); + if (accounting->syms.num_syms == accounting->num_syms_allocated) { + accounting->num_syms_allocated *= 2; + accounting->syms.syms = + realloc(accounting->syms.syms, + sizeof(AccountingSymbol) * accounting->num_syms_allocated); + assert(accounting->syms.syms != NULL); + } + accounting->syms.syms[accounting->syms.num_syms++] = sym; +} + +void aom_accounting_dump(Accounting *accounting) { + int i; + AccountingSymbol *sym; + printf("\n----- Number of recorded syntax elements = %d -----\n", + accounting->syms.num_syms); + printf("----- Total number of symbol calls = %d (%d binary) -----\n", + accounting->syms.num_multi_syms + accounting->syms.num_binary_syms, + accounting->syms.num_binary_syms); + for (i = 0; i < accounting->syms.num_syms; i++) { + sym = &accounting->syms.syms[i]; + printf("%s x: %d, y: %d bits: %f samples: %d\n", + accounting->syms.dictionary.strs[sym->id], sym->context.x, + sym->context.y, (float)sym->bits / 8.0, sym->samples); + } +} diff --git a/libs/libaom/src/av1/decoder/accounting.h b/libs/libaom/src/av1/decoder/accounting.h new file mode 100644 index 000000000..ad2e8b6cf --- /dev/null +++ b/libs/libaom/src/av1/decoder/accounting.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_DECODER_ACCOUNTING_H_ +#define AOM_AV1_DECODER_ACCOUNTING_H_ +#include <stdlib.h> +#include "aom/aomdx.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#define AOM_ACCOUNTING_HASH_SIZE (1021) + +/* Max number of entries for symbol types in the dictionary (increase as + necessary). */ +#define MAX_SYMBOL_TYPES (256) + +/*The resolution of fractional-precision bit usage measurements, i.e., + 3 => 1/8th bits.*/ +#define AOM_ACCT_BITRES (3) + +typedef struct { + int16_t x; + int16_t y; +} AccountingSymbolContext; + +typedef struct { + AccountingSymbolContext context; + uint32_t id; + /** Number of bits in units of 1/8 bit. */ + uint32_t bits; + uint32_t samples; +} AccountingSymbol; + +/** Dictionary for translating strings into id. */ +typedef struct { + char *strs[MAX_SYMBOL_TYPES]; + int num_strs; +} AccountingDictionary; + +typedef struct { + /** All recorded symbols decoded. */ + AccountingSymbol *syms; + /** Number of syntax actually recorded. */ + int num_syms; + /** Raw symbol decoding calls for non-binary values. */ + int num_multi_syms; + /** Raw binary symbol decoding calls. */ + int num_binary_syms; + /** Dictionary for translating strings into id.
*/ + AccountingDictionary dictionary; +} AccountingSymbols; + +struct Accounting { + AccountingSymbols syms; + /** Size allocated for symbols (not all may be used). */ + int num_syms_allocated; + int16_t hash_dictionary[AOM_ACCOUNTING_HASH_SIZE]; + AccountingSymbolContext context; + uint32_t last_tell_frac; +}; + +void aom_accounting_init(Accounting *accounting); +void aom_accounting_reset(Accounting *accounting); +void aom_accounting_clear(Accounting *accounting); +void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y); +int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str); +void aom_accounting_record(Accounting *accounting, const char *str, + uint32_t bits); +void aom_accounting_dump(Accounting *accounting); +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // AOM_AV1_DECODER_ACCOUNTING_H_ diff --git a/libs/libaom/src/av1/decoder/decodeframe.c b/libs/libaom/src/av1/decoder/decodeframe.c new file mode 100644 index 000000000..7abfac4aa --- /dev/null +++ b/libs/libaom/src/av1/decoder/decodeframe.c @@ -0,0 +1,5326 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <stddef.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_codec.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/binary_codes_reader.h" +#include "aom_dsp/bitreader.h" +#include "aom_dsp/bitreader_buffer.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" +#include "aom_ports/mem_ops.h" +#include "aom_scale/aom_scale.h" +#include "aom_util/aom_thread.h" + +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + +#include "av1/common/alloccommon.h" +#include "av1/common/cdef.h" +#include "av1/common/cfl.h" +#if CONFIG_INSPECTION +#include "av1/decoder/inspection.h" +#endif +#include "av1/common/common.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/entropymv.h" +#include "av1/common/frame_buffers.h" +#include "av1/common/idct.h" +#include "av1/common/mvref_common.h" +#include "av1/common/pred_common.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/resize.h" +#include "av1/common/seg_common.h" +#include "av1/common/thread_common.h" +#include "av1/common/tile_common.h" +#include "av1/common/warped_motion.h" +#include "av1/common/obmc.h" +#include "av1/decoder/decodeframe.h" +#include "av1/decoder/decodemv.h" +#include "av1/decoder/decoder.h" +#include "av1/decoder/decodetxb.h" +#include "av1/decoder/detokenize.h" + +#define ACCT_STR __func__ + +#define AOM_MIN_THREADS_PER_TILE 1 +#define AOM_MAX_THREADS_PER_TILE 2 + +// This is needed by ext_tile related unit tests.
+#define EXT_TILE_DEBUG 1 +#define MC_TEMP_BUF_PELS \ + (((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2) * \ + ((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2)) + +// Checks that the remaining bits start with a 1 and ends with 0s. +// It consumes an additional byte, if already byte aligned before the check. +int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) { + AV1_COMMON *const cm = &pbi->common; + // bit_offset is set to 0 (mod 8) when the reader is already byte aligned + int bits_before_alignment = 8 - rb->bit_offset % 8; + int trailing = aom_rb_read_literal(rb, bits_before_alignment); + if (trailing != (1 << (bits_before_alignment - 1))) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + return 0; +} + +// Use only_chroma = 1 to only set the chroma planes +static AOM_INLINE void set_planes_to_neutral_grey( + const SequenceHeader *const seq_params, const YV12_BUFFER_CONFIG *const buf, + int only_chroma) { + if (seq_params->use_highbitdepth) { + const int val = 1 << (seq_params->bit_depth - 1); + for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) { + const int is_uv = plane > 0; + uint16_t *const base = CONVERT_TO_SHORTPTR(buf->buffers[plane]); + // Set the first row to neutral grey. Then copy the first row to all + // subsequent rows. + if (buf->crop_heights[is_uv] > 0) { + aom_memset16(base, val, buf->crop_widths[is_uv]); + for (int row_idx = 1; row_idx < buf->crop_heights[is_uv]; row_idx++) { + memcpy(&base[row_idx * buf->strides[is_uv]], base, + sizeof(*base) * buf->crop_widths[is_uv]); + } + } + } + } else { + for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) { + const int is_uv = plane > 0; + for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) { + memset(&buf->buffers[plane][row_idx * buf->uv_stride], 1 << 7, + buf->crop_widths[is_uv]); + } + } + } +} + +static AOM_INLINE void loop_restoration_read_sb_coeffs( + const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane, + int runit_idx); + +static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) { + return len != 0 && len <= (size_t)(end - start); +} + +static TX_MODE read_tx_mode(struct aom_read_bit_buffer *rb, + int coded_lossless) { + if (coded_lossless) return ONLY_4X4; + return aom_rb_read_bit(rb) ? TX_MODE_SELECT : TX_MODE_LARGEST; +} + +static REFERENCE_MODE read_frame_reference_mode( + const AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { + if (frame_is_intra_only(cm)) { + return SINGLE_REFERENCE; + } else { + return aom_rb_read_bit(rb) ? 
REFERENCE_MODE_SELECT : SINGLE_REFERENCE; + } +} + +static AOM_INLINE void inverse_transform_block(MACROBLOCKD *xd, int plane, + const TX_TYPE tx_type, + const TX_SIZE tx_size, + uint8_t *dst, int stride, + int reduced_tx_set) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const dqcoeff = pd->dqcoeff_block + xd->cb_offset[plane]; + eob_info *eob_data = pd->eob_data + xd->txb_offset[plane]; + uint16_t scan_line = eob_data->max_scan_line; + uint16_t eob = eob_data->eob; + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, stride, + eob, reduced_tx_set); + memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0])); +} + +static AOM_INLINE void read_coeffs_tx_intra_block( + const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r, + const int plane, const int row, const int col, const TX_SIZE tx_size) { + MB_MODE_INFO *mbmi = xd->mi[0]; + if (!mbmi->skip) { +#if TXCOEFF_TIMER + struct aom_usec_timer timer; + aom_usec_timer_start(&timer); +#endif + av1_read_coeffs_txb_facade(cm, xd, r, plane, row, col, tx_size); +#if TXCOEFF_TIMER + aom_usec_timer_mark(&timer); + const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); + cm->txcoeff_timer += elapsed_time; + ++cm->txb_count; +#endif + } +} + +static AOM_INLINE void decode_block_void(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, + aom_reader *const r, const int plane, + const int row, const int col, + const TX_SIZE tx_size) { + (void)cm; + (void)xd; + (void)r; + (void)plane; + (void)row; + (void)col; + (void)tx_size; +} + +static AOM_INLINE void predict_inter_block_void(AV1_COMMON *const cm, + MACROBLOCKD *const xd, + BLOCK_SIZE bsize) { + (void)cm; + (void)xd; + (void)bsize; +} + +static AOM_INLINE void cfl_store_inter_block_void(AV1_COMMON *const cm, + MACROBLOCKD *const xd) { + (void)cm; + (void)xd; +} + +static AOM_INLINE void predict_and_reconstruct_intra_block( + const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r, + const int plane, const int row, const int col, const TX_SIZE tx_size) { + (void)r; + MB_MODE_INFO *mbmi = xd->mi[0]; + PLANE_TYPE plane_type = get_plane_type(plane); + + av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size); + + if (!mbmi->skip) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + eob_info *eob_data = pd->eob_data + xd->txb_offset[plane]; + if (eob_data->eob) { + const bool reduced_tx_set_used = cm->features.reduced_tx_set_used; + // tx_type was read out in av1_read_coeffs_txb. + const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size, + reduced_tx_set_used); + uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2]; + inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride, + reduced_tx_set_used); + } + } + if (plane == AOM_PLANE_Y && store_cfl_required(cm, xd)) { + cfl_store_tx(xd, row, col, tx_size, mbmi->sb_type); + } +} + +static AOM_INLINE void inverse_transform_inter_block( + const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r, + const int plane, const int blk_row, const int blk_col, + const TX_SIZE tx_size) { + (void)r; + PLANE_TYPE plane_type = get_plane_type(plane); + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const bool reduced_tx_set_used = cm->features.reduced_tx_set_used; + // tx_type was read out in av1_read_coeffs_txb. 
+ const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, + tx_size, reduced_tx_set_used); + + uint8_t *dst = + &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2]; + inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride, + reduced_tx_set_used); +#if CONFIG_MISMATCH_DEBUG + int pixel_c, pixel_r; + BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; + int blk_w = block_size_wide[bsize]; + int blk_h = block_size_high[bsize]; + const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); + const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row, + pd->subsampling_x, pd->subsampling_y); + mismatch_check_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint, + plane, pixel_c, pixel_r, blk_w, blk_h, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); +#endif +} + +static AOM_INLINE void set_cb_buffer_offsets(MACROBLOCKD *const xd, + TX_SIZE tx_size, int plane) { + xd->cb_offset[plane] += tx_size_wide[tx_size] * tx_size_high[tx_size]; + xd->txb_offset[plane] = + xd->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); +} + +static AOM_INLINE void decode_reconstruct_tx( + AV1_COMMON *cm, ThreadData *const td, aom_reader *r, + MB_MODE_INFO *const mbmi, int plane, BLOCK_SIZE plane_bsize, int blk_row, + int blk_col, int block, TX_SIZE tx_size, int *eob_total) { + MACROBLOCKD *const xd = &td->xd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE plane_tx_size = + plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; + // Scale to match transform block unit. + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + if (tx_size == plane_tx_size || plane) { + td->read_coeffs_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col, + tx_size); + + td->inverse_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col, + tx_size); + eob_info *eob_data = pd->eob_data + xd->txb_offset[plane]; + *eob_total += eob_data->eob; + set_cb_buffer_offsets(xd, tx_size, plane); + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size)); + assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size)); + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int sub_step = bsw * bsh; + + assert(bsw > 0 && bsh > 0); + + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + const int offsetr = blk_row + row; + const int offsetc = blk_col + col; + + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + + decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, offsetr, + offsetc, block, sub_txs, eob_total); + block += sub_step; + } + } + } +} + +static AOM_INLINE void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int bw, int bh, int x_mis, int y_mis) { + const int num_planes = av1_num_planes(cm); + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const TileInfo *const tile = &xd->tile; + + set_mi_offsets(mi_params, xd, mi_row, mi_col); + xd->mi[0]->sb_type = bsize; +#if CONFIG_RD_DEBUG + xd->mi[0]->mi_row = mi_row; + xd->mi[0]->mi_col = 
mi_col; +#endif + + assert(x_mis && y_mis); + for (int x = 1; x < x_mis; ++x) xd->mi[x] = xd->mi[0]; + int idx = mi_params->mi_stride; + for (int y = 1; y < y_mis; ++y) { + memcpy(&xd->mi[idx], &xd->mi[0], x_mis * sizeof(xd->mi[0])); + idx += mi_params->mi_stride; + } + + set_plane_n4(xd, bw, bh, num_planes); + set_entropy_context(xd, mi_row, mi_col, num_planes); + + // Distance of Mb to the various image edges. These are specified to 8th pel + // as they are always compared to values that are in 1/8th pel units + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows, + mi_params->mi_cols); + + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, + num_planes); +} + +static AOM_INLINE void decode_mbmi_block(AV1Decoder *const pbi, + MACROBLOCKD *const xd, int mi_row, + int mi_col, aom_reader *r, + PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &pbi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int x_mis = AOMMIN(bw, cm->mi_params.mi_cols - mi_col); + const int y_mis = AOMMIN(bh, cm->mi_params.mi_rows - mi_row); + +#if CONFIG_ACCOUNTING + aom_accounting_set_context(&pbi->accounting, mi_col, mi_row); +#endif + set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis); + xd->mi[0]->partition = partition; + av1_read_mode_info(pbi, xd, r, x_mis, y_mis); + if (bsize >= BLOCK_8X8 && + (seq_params->subsampling_x || seq_params->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + ss_size_lookup[bsize][seq_params->subsampling_x] + [seq_params->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid block size."); + } +} + +typedef struct PadBlock { + int x0; + int x1; + int y0; + int y1; +} PadBlock; + +#if CONFIG_AV1_HIGHBITDEPTH +static AOM_INLINE void highbd_build_mc_border(const uint8_t *src8, + int src_stride, uint8_t *dst8, + int dst_stride, int x, int y, + int b_w, int b_h, int w, int h) { + // Get a pointer to the start of the real data for this row. + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *ref_row = src - x - y * src_stride; + + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; + + do { + int right = 0, copy; + int left = x < 0 ? -x : 0; + + if (left > b_w) left = b_w; + + if (x + b_w > w) right = x + b_w - w; + + if (right > b_w) right = b_w; + + copy = b_w - left - right; + + if (left) aom_memset16(dst, ref_row[0], left); + + if (copy) memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t)); + + if (right) aom_memset16(dst + left + copy, ref_row[w - 1], right); + + dst += dst_stride; + ++y; + + if (y > 0 && y < h) ref_row += src_stride; + } while (--b_h); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static AOM_INLINE void build_mc_border(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int x, + int y, int b_w, int b_h, int w, int h) { + // Get a pointer to the start of the real data for this row. + const uint8_t *ref_row = src - x - y * src_stride; + + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; + + do { + int right = 0, copy; + int left = x < 0 ? 
-x : 0; + + if (left > b_w) left = b_w; + + if (x + b_w > w) right = x + b_w - w; + + if (right > b_w) right = b_w; + + copy = b_w - left - right; + + if (left) memset(dst, ref_row[0], left); + + if (copy) memcpy(dst + left, ref_row + x + left, copy); + + if (right) memset(dst + left + copy, ref_row[w - 1], right); + + dst += dst_stride; + ++y; + + if (y > 0 && y < h) ref_row += src_stride; + } while (--b_h); +} + +static INLINE int update_extend_mc_border_params( + const struct scale_factors *const sf, struct buf_2d *const pre_buf, + MV32 scaled_mv, PadBlock *block, int subpel_x_mv, int subpel_y_mv, + int do_warp, int is_intrabc, int *x_pad, int *y_pad) { + const int is_scaled = av1_is_scaled(sf); + // Get reference width and height. + int frame_width = pre_buf->width; + int frame_height = pre_buf->height; + + // Do border extension if there is motion or + // width/height is not a multiple of 8 pixels. + if ((!is_intrabc) && (!do_warp) && + (is_scaled || scaled_mv.col || scaled_mv.row || (frame_width & 0x7) || + (frame_height & 0x7))) { + if (subpel_x_mv || (sf->x_step_q4 != SUBPEL_SHIFTS)) { + block->x0 -= AOM_INTERP_EXTEND - 1; + block->x1 += AOM_INTERP_EXTEND; + *x_pad = 1; + } + + if (subpel_y_mv || (sf->y_step_q4 != SUBPEL_SHIFTS)) { + block->y0 -= AOM_INTERP_EXTEND - 1; + block->y1 += AOM_INTERP_EXTEND; + *y_pad = 1; + } + + // Skip border extension if block is inside the frame. + if (block->x0 < 0 || block->x1 > frame_width - 1 || block->y0 < 0 || + block->y1 > frame_height - 1) { + return 1; + } + } + return 0; +} + +static INLINE void extend_mc_border(const struct scale_factors *const sf, + struct buf_2d *const pre_buf, + MV32 scaled_mv, PadBlock block, + int subpel_x_mv, int subpel_y_mv, + int do_warp, int is_intrabc, int highbd, + uint8_t *mc_buf, uint8_t **pre, + int *src_stride) { + int x_pad = 0, y_pad = 0; + if (update_extend_mc_border_params(sf, pre_buf, scaled_mv, &block, + subpel_x_mv, subpel_y_mv, do_warp, + is_intrabc, &x_pad, &y_pad)) { + // Get reference block pointer. + const uint8_t *const buf_ptr = + pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0; + int buf_stride = pre_buf->stride; + const int b_w = block.x1 - block.x0; + const int b_h = block.y1 - block.y0; + +#if CONFIG_AV1_HIGHBITDEPTH + // Extend the border. 
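A worked instance of the padding bounds computed in update_extend_mc_border_params above, assuming AOM_INTERP_EXTEND is 4 (its usual libaom value for the 8-tap subpel filters; the literal numbers here are illustrative only):

// An 8x8 reference window starting at x0 = -2 (so x1 = 6) with a nonzero
// subpel x component grows by the filter support: 3 taps left of the first
// sample and 4 right of the last.
PadBlock block = { /*x0=*/-2, /*x1=*/6, /*y0=*/0, /*y1=*/8 };
block.x0 -= AOM_INTERP_EXTEND - 1;  // -5: the window now leaves the frame
block.x1 += AOM_INTERP_EXTEND;      // 10
// Since block.x0 < 0, update_extend_mc_border_params() returns 1 and the
// branch below replicates frame edges into mc_buf.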
+ if (highbd) { + highbd_build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, + block.y0, b_w, b_h, pre_buf->width, + pre_buf->height); + } else { + build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w, + b_h, pre_buf->width, pre_buf->height); + } +#else + (void)highbd; + build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w, + b_h, pre_buf->width, pre_buf->height); +#endif + *src_stride = b_w; + *pre = mc_buf + y_pad * (AOM_INTERP_EXTEND - 1) * b_w + + x_pad * (AOM_INTERP_EXTEND - 1); + } +} + +static void dec_calc_subpel_params(const MV *const src_mv, + InterPredParams *const inter_pred_params, + const MACROBLOCKD *const xd, int mi_x, + int mi_y, uint8_t **pre, + SubpelParams *subpel_params, int *src_stride, + PadBlock *block, MV32 *scaled_mv, + int *subpel_x_mv, int *subpel_y_mv) { + const struct scale_factors *sf = inter_pred_params->scale_factors; + struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf; + const int bw = inter_pred_params->block_width; + const int bh = inter_pred_params->block_height; + const int is_scaled = av1_is_scaled(sf); + if (is_scaled) { + int ssx = inter_pred_params->subsampling_x; + int ssy = inter_pred_params->subsampling_y; + int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS; + orig_pos_y += src_mv->row * (1 << (1 - ssy)); + int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS; + orig_pos_x += src_mv->col * (1 << (1 - ssx)); + int pos_y = sf->scale_value_y(orig_pos_y, sf); + int pos_x = sf->scale_value_x(orig_pos_x, sf); + pos_x += SCALE_EXTRA_OFF; + pos_y += SCALE_EXTRA_OFF; + + const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); + const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); + const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS; + pos_y = clamp(pos_y, top, bottom); + pos_x = clamp(pos_x, left, right); + + subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK; + subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK; + subpel_params->xs = sf->x_step_q4; + subpel_params->ys = sf->y_step_q4; + + // Get reference block top left coordinate. + block->x0 = pos_x >> SCALE_SUBPEL_BITS; + block->y0 = pos_y >> SCALE_SUBPEL_BITS; + + // Get reference block bottom right coordinate. + block->x1 = + ((pos_x + (bw - 1) * subpel_params->xs) >> SCALE_SUBPEL_BITS) + 1; + block->y1 = + ((pos_y + (bh - 1) * subpel_params->ys) >> SCALE_SUBPEL_BITS) + 1; + + MV temp_mv; + temp_mv = clamp_mv_to_umv_border_sb(xd, src_mv, bw, bh, + inter_pred_params->subsampling_x, + inter_pred_params->subsampling_y); + *scaled_mv = av1_scale_mv(&temp_mv, mi_x, mi_y, sf); + scaled_mv->row += SCALE_EXTRA_OFF; + scaled_mv->col += SCALE_EXTRA_OFF; + + *subpel_x_mv = scaled_mv->col & SCALE_SUBPEL_MASK; + *subpel_y_mv = scaled_mv->row & SCALE_SUBPEL_MASK; + } else { + // Get block position in current frame. + int pos_x = inter_pred_params->pix_col << SUBPEL_BITS; + int pos_y = inter_pred_params->pix_row << SUBPEL_BITS; + + const MV mv_q4 = clamp_mv_to_umv_border_sb( + xd, src_mv, bw, bh, inter_pred_params->subsampling_x, + inter_pred_params->subsampling_y); + subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS; + subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; + subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; + + // Get reference block top left coordinate. 
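For the unscaled path just above, a worked example of the q4 split (assuming SUBPEL_BITS == 4 and SUBPEL_MASK == 15, i.e. 1/16-pel positions, consistent with the x_step_q4 == 16 asserts elsewhere in this patch; the variable names are illustrative):

// mv_q4.col = 35 in 1/16-pel units:
const int col_q4 = 35;
const int full_pel = col_q4 >> 4;  // 2 whole pixels added to the block x0
const int phase = col_q4 & 15;     // 3 -> selects the 3/16-pel filter phase
// subpel_x stores the phase left-shifted by SCALE_EXTRA_BITS so the scaled
// and unscaled paths share one convolution interface.

The code below then derives the reference block's top-left corner from the integer part.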
+ pos_x += mv_q4.col; + pos_y += mv_q4.row; + block->x0 = pos_x >> SUBPEL_BITS; + block->y0 = pos_y >> SUBPEL_BITS; + + // Get reference block bottom right coordinate. + block->x1 = (pos_x >> SUBPEL_BITS) + (bw - 1) + 1; + block->y1 = (pos_y >> SUBPEL_BITS) + (bh - 1) + 1; + + scaled_mv->row = mv_q4.row; + scaled_mv->col = mv_q4.col; + *subpel_x_mv = scaled_mv->col & SUBPEL_MASK; + *subpel_y_mv = scaled_mv->row & SUBPEL_MASK; + } + *pre = pre_buf->buf0 + block->y0 * pre_buf->stride + block->x0; + *src_stride = pre_buf->stride; +} + +static void dec_calc_subpel_params_and_extend( + const MV *const src_mv, InterPredParams *const inter_pred_params, + MACROBLOCKD *xd, int mi_x, int mi_y, int ref, uint8_t **pre, + SubpelParams *subpel_params, int *src_stride) { + PadBlock block; + MV32 scaled_mv; + int subpel_x_mv, subpel_y_mv; + dec_calc_subpel_params(src_mv, inter_pred_params, xd, mi_x, mi_y, pre, + subpel_params, src_stride, &block, &scaled_mv, + &subpel_x_mv, &subpel_y_mv); + extend_mc_border( + inter_pred_params->scale_factors, &inter_pred_params->ref_frame_buf, + scaled_mv, block, subpel_x_mv, subpel_y_mv, + inter_pred_params->mode == WARP_PRED, inter_pred_params->is_intrabc, + inter_pred_params->use_hbd_buf, xd->mc_buf[ref], pre, src_stride); +} + +static void dec_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, + int mi_x, int mi_y) { + av1_build_inter_predictors(cm, xd, plane, mi, build_for_obmc, bw, bh, mi_x, + mi_y, dec_calc_subpel_params_and_extend); +} + +static AOM_INLINE void dec_build_inter_predictor(const AV1_COMMON *cm, + MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + dec_build_inter_predictors(cm, xd, plane, xd->mi[0], 0, + xd->plane[plane].width, xd->plane[plane].height, + mi_x, mi_y); + if (is_interintra_pred(xd->mi[0])) { + BUFFER_SET ctx = { { xd->plane[0].dst.buf, xd->plane[1].dst.buf, + xd->plane[2].dst.buf }, + { xd->plane[0].dst.stride, xd->plane[1].dst.stride, + xd->plane[2].dst.stride } }; + av1_build_interintra_predictor(cm, xd, xd->plane[plane].dst.buf, + xd->plane[plane].dst.stride, &ctx, plane, + bsize); + } + } +} + +static INLINE void dec_build_prediction_by_above_pred( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + const int above_mi_col = xd->mi_col + rel_mi_col; + int mi_x, mi_y; + MB_MODE_INFO backup_mbmi = *above_mbmi; + + (void)rel_mi_row; + (void)dir; + + av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, op_mi_size, + &backup_mbmi, ctxt, num_planes); + mi_x = above_mi_col << MI_SIZE_LOG2; + mi_y = xd->mi_row << MI_SIZE_LOG2; + + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + + for (int j = 0; j < num_planes; ++j) { + const struct macroblockd_plane *pd = &xd->plane[j]; + int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x; + int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4, + block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1)); + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; + dec_build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x, + mi_y); + } +} + +static AOM_INLINE void 
dec_build_prediction_by_above_preds( + const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]) { + if (!xd->up_available) return; + + // Adjust mb_to_bottom_edge to have the correct value for the OBMC + // prediction block. This is half the height of the original block, + // except for 128-wide blocks, where we only use a height of 32. + const int this_height = xd->height * MI_SIZE; + const int pred_height = AOMMIN(this_height / 2, 32); + xd->mb_to_bottom_edge += GET_MV_SUBPEL(this_height - pred_height); + struct build_prediction_ctxt ctxt = { cm, tmp_buf, + tmp_width, tmp_height, + tmp_stride, xd->mb_to_right_edge }; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + foreach_overlappable_nb_above(cm, xd, + max_neighbor_obmc[mi_size_wide_log2[bsize]], + dec_build_prediction_by_above_pred, &ctxt); + + xd->mb_to_left_edge = -GET_MV_SUBPEL(xd->mi_col * MI_SIZE); + xd->mb_to_right_edge = ctxt.mb_to_far_edge; + xd->mb_to_bottom_edge -= GET_MV_SUBPEL(this_height - pred_height); +} + +static INLINE void dec_build_prediction_by_left_pred( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + const int left_mi_row = xd->mi_row + rel_mi_row; + int mi_x, mi_y; + MB_MODE_INFO backup_mbmi = *left_mbmi; + + (void)rel_mi_col; + (void)dir; + + av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, op_mi_size, + &backup_mbmi, ctxt, num_planes); + mi_x = xd->mi_col << MI_SIZE_LOG2; + mi_y = left_mi_row << MI_SIZE_LOG2; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + + for (int j = 0; j < num_planes; ++j) { + const struct macroblockd_plane *pd = &xd->plane[j]; + int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4, + block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1)); + int bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y; + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; + dec_build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x, + mi_y); + } +} + +static AOM_INLINE void dec_build_prediction_by_left_preds( + const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]) { + if (!xd->left_available) return; + + // Adjust mb_to_right_edge to have the correct value for the OBMC + // prediction block. This is half the width of the original block, + // except for 128-wide blocks, where we only use a width of 32. 
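The left-neighbour pass below applies the same halving rule as the above-neighbour pass. A small sketch of the overlap size (an illustrative helper mirroring the AOMMIN expressions in both passes):

// OBMC blends over half the block dimension, capped at 32 samples:
// 8 -> 4, 16 -> 8, 32 -> 16, 64 -> 32, 128 -> 32 (the 128-wide exception
// noted in the comment above).
static int obmc_overlap_dim(int block_dim) {
  const int half = block_dim / 2;
  return half < 32 ? half : 32;
}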
+ const int this_width = xd->width * MI_SIZE; + const int pred_width = AOMMIN(this_width / 2, 32); + xd->mb_to_right_edge += GET_MV_SUBPEL(this_width - pred_width); + + struct build_prediction_ctxt ctxt = { cm, tmp_buf, + tmp_width, tmp_height, + tmp_stride, xd->mb_to_bottom_edge }; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + foreach_overlappable_nb_left(cm, xd, + max_neighbor_obmc[mi_size_high_log2[bsize]], + dec_build_prediction_by_left_pred, &ctxt); + + xd->mb_to_top_edge = -GET_MV_SUBPEL(xd->mi_row * MI_SIZE); + xd->mb_to_right_edge -= GET_MV_SUBPEL(this_width - pred_width); + xd->mb_to_bottom_edge = ctxt.mb_to_far_edge; +} + +static void set_dst_buf(MACROBLOCKD *xd, uint8_t **dst_buf1, + uint8_t **dst_buf2) { + dst_buf1[0] = xd->tmp_obmc_bufs[0]; + dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE; + dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2; + dst_buf2[0] = xd->tmp_obmc_bufs[1]; + dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE; + dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void set_dst_buf_highbd(MACROBLOCKD *xd, uint8_t **dst_buf1, + uint8_t **dst_buf2) { + int len = sizeof(uint16_t); + dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]); + dst_buf1[1] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len); + dst_buf1[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len); + dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]); + dst_buf2[1] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len); + dst_buf2[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len); +} +#endif + +static AOM_INLINE void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, + MACROBLOCKD *xd) { + const int num_planes = av1_num_planes(cm); + uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; + int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + set_dst_buf_highbd(xd, dst_buf1, dst_buf2); + } else { + set_dst_buf(xd, dst_buf1, dst_buf2); + } +#else + set_dst_buf(xd, dst_buf1, dst_buf2); +#endif + + dec_build_prediction_by_above_preds(cm, xd, dst_buf1, dst_width1, dst_height1, + dst_stride1); + dec_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2, + dst_stride2); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, &cm->cur_frame->buf, + mi_row, mi_col, 0, num_planes); + av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2, + dst_stride2); +} + +static AOM_INLINE void cfl_store_inter_block(AV1_COMMON *const cm, + MACROBLOCKD *const xd) { + MB_MODE_INFO *mbmi = xd->mi[0]; + if (store_cfl_required(cm, xd)) { + cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size); + } +} + +static AOM_INLINE void predict_inter_block(AV1_COMMON *const cm, + MACROBLOCKD *const xd, + BLOCK_SIZE bsize) { + MB_MODE_INFO *mbmi = xd->mi[0]; + const int num_planes = av1_num_planes(cm); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + const 
MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; + if (frame < LAST_FRAME) { + assert(is_intrabc_block(mbmi)); + assert(frame == INTRA_FRAME); + assert(ref == 0); + } else { + const RefCntBuffer *ref_buf = get_ref_frame_buf(cm, frame); + const struct scale_factors *ref_scale_factors = + get_ref_scale_factors_const(cm, frame); + + xd->block_ref_scale_factors[ref] = ref_scale_factors; + av1_setup_pre_planes(xd, ref, &ref_buf->buf, mi_row, mi_col, + ref_scale_factors, num_planes); + } + } + + dec_build_inter_predictor(cm, xd, mi_row, mi_col, bsize); + if (mbmi->motion_mode == OBMC_CAUSAL) { + dec_build_obmc_inter_predictors_sb(cm, xd); + } +#if CONFIG_MISMATCH_DEBUG + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, pd->subsampling_x, + pd->subsampling_y); + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; + mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, + cm->current_frame.order_hint, plane, pixel_c, + pixel_r, pd->width, pd->height, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } +#endif +} + +static AOM_INLINE void set_color_index_map_offset(MACROBLOCKD *const xd, + int plane, aom_reader *r) { + (void)r; + Av1ColorMapParam params; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + av1_get_block_dimensions(mbmi->sb_type, plane, xd, &params.plane_width, + &params.plane_height, NULL, NULL); + xd->color_index_map_offset[plane] += params.plane_width * params.plane_height; +} + +static AOM_INLINE void decode_token_recon_block(AV1Decoder *const pbi, + ThreadData *const td, + aom_reader *r, + BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &td->xd; + const int num_planes = av1_num_planes(cm); + MB_MODE_INFO *mbmi = xd->mi[0]; + + if (!is_inter_block(mbmi)) { + int row, col; + assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int max_blocks_high = max_block_high(xd, bsize, 0); + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); + mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); + + for (row = 0; row < max_blocks_high; row += mu_blocks_high) { + for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) { + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const int stepr = tx_size_high_unit[tx_size]; + const int stepc = tx_size_wide_unit[tx_size]; + + const int unit_height = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y); + const int unit_width = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x); + + for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height; + blk_row += stepr) { + for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width; + blk_col += stepc) { + td->read_coeffs_tx_intra_block_visit(cm, xd, r, plane, blk_row, + blk_col, tx_size); + td->predict_and_recon_intra_block_visit(cm, xd, r, plane, blk_row, + blk_col, tx_size); + set_cb_buffer_offsets(xd, tx_size, plane); + } + } + } + } + } + } else { + 
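+    // Inter path: build the motion-compensated prediction first; unless the
+    // block is coded as skip, the transform residue is then read and added
+    // below, walking the block in 64x64 processing units that are further
+    // split by the variable transform-size partitioning.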
td->predict_inter_block_visit(cm, xd, bsize); + // Reconstruction + if (!mbmi->skip) { + int eobtotal = 0; + + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int max_blocks_high = max_block_high(xd, bsize, 0); + int row, col; + + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + assert(max_unit_bsize == + get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + + mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); + mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); + + for (row = 0; row < max_blocks_high; row += mu_blocks_high) { + for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) { + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, ss_x, ss_y); + const TX_SIZE max_tx_size = + get_vartx_max_txsize(xd, plane_bsize, plane); + const int bh_var_tx = tx_size_high_unit[max_tx_size]; + const int bw_var_tx = tx_size_wide_unit[max_tx_size]; + int block = 0; + int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + int blk_row, blk_col; + const int unit_height = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_high + row, max_blocks_high), ss_y); + const int unit_width = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_wide + col, max_blocks_wide), ss_x); + + for (blk_row = row >> ss_y; blk_row < unit_height; + blk_row += bh_var_tx) { + for (blk_col = col >> ss_x; blk_col < unit_width; + blk_col += bw_var_tx) { + decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, + blk_row, blk_col, block, max_tx_size, + &eobtotal); + block += step; + } + } + } + } + } + } + td->cfl_store_inter_block_visit(cm, xd); + } + + av1_visit_palette(pbi, xd, r, set_color_index_map_offset); +} + +static AOM_INLINE void set_inter_tx_size(MB_MODE_INFO *mbmi, int stride_log2, + int tx_w_log2, int tx_h_log2, + int min_txs, int split_size, int txs, + int blk_row, int blk_col) { + for (int idy = 0; idy < tx_size_high_unit[split_size]; + idy += tx_size_high_unit[min_txs]) { + for (int idx = 0; idx < tx_size_wide_unit[split_size]; + idx += tx_size_wide_unit[min_txs]) { + const int index = (((blk_row + idy) >> tx_h_log2) << stride_log2) + + ((blk_col + idx) >> tx_w_log2); + mbmi->inter_tx_size[index] = txs; + } + } +} + +static AOM_INLINE void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi, + TX_SIZE tx_size, int depth, +#if CONFIG_LPF_MASK + AV1_COMMON *cm, int mi_row, + int mi_col, int store_bitmask, +#endif + int blk_row, int blk_col, + aom_reader *r) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + int is_split = 0; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int max_blocks_high = max_block_high(xd, bsize, 0); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + assert(tx_size > TX_4X4); + TX_SIZE txs = max_txsize_rect_lookup[bsize]; + for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level) + txs = sub_tx_size_map[txs]; + const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + const int bw_log2 = mi_size_wide_log2[bsize]; + const int stride_log2 = bw_log2 - tx_w_log2; + + if (depth == MAX_VARTX_DEPTH) { + set_inter_tx_size(mbmi, 
stride_log2, tx_w_log2, tx_h_log2, txs, tx_size, + tx_size, blk_row, blk_col); + mbmi->tx_size = tx_size; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + return; + } + + const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, + mbmi->sb_type, tx_size); + is_split = aom_read_symbol(r, ec_ctx->txfm_partition_cdf[ctx], 2, ACCT_STR); + + if (is_split) { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + + if (sub_txs == TX_4X4) { + set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size, + sub_txs, blk_row, blk_col); + mbmi->tx_size = sub_txs; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, sub_txs, tx_size); +#if CONFIG_LPF_MASK + if (store_bitmask) { + av1_store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col, + txsize_to_bsize[tx_size], TX_4X4, mbmi); + } +#endif + return; + } +#if CONFIG_LPF_MASK + if (depth + 1 == MAX_VARTX_DEPTH && store_bitmask) { + av1_store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col, + txsize_to_bsize[tx_size], sub_txs, mbmi); + store_bitmask = 0; + } +#endif + + assert(bsw > 0 && bsh > 0); + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + int offsetr = blk_row + row; + int offsetc = blk_col + col; + read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, +#if CONFIG_LPF_MASK + cm, mi_row, mi_col, store_bitmask, +#endif + offsetr, offsetc, r); + } + } + } else { + set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size, + tx_size, blk_row, blk_col); + mbmi->tx_size = tx_size; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); +#if CONFIG_LPF_MASK + if (store_bitmask) { + av1_store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col, + txsize_to_bsize[tx_size], tx_size, mbmi); + } +#endif + } +} + +static TX_SIZE read_selected_tx_size(const MACROBLOCKD *const xd, + aom_reader *r) { + // TODO(debargha): Clean up the logic here. This function should only + // be called for intra. 
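+  // The coded symbol is a depth, not a size: depth 0 selects the largest
+  // transform allowed for the block, and each additional depth level selects
+  // the next smaller transform (e.g. for BLOCK_32X32, depths 0..2 map to
+  // 32x32, 16x16 and 8x8).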
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int max_depths = bsize_to_max_depth(bsize); + const int ctx = get_tx_size_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int depth = aom_read_symbol(r, ec_ctx->tx_size_cdf[tx_size_cat][ctx], + max_depths + 1, ACCT_STR); + assert(depth >= 0 && depth <= max_depths); + const TX_SIZE tx_size = depth_to_tx_size(depth, bsize); + return tx_size; +} + +static TX_SIZE read_tx_size(const MACROBLOCKD *const xd, TX_MODE tx_mode, + int is_inter, int allow_select_inter, + aom_reader *r) { + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4; + + if (block_signals_txsize(bsize)) { + if ((!is_inter || allow_select_inter) && tx_mode == TX_MODE_SELECT) { + const TX_SIZE coded_tx_size = read_selected_tx_size(xd, r); + return coded_tx_size; + } else { + return tx_size_from_tx_mode(bsize, tx_mode); + } + } else { + assert(IMPLIES(tx_mode == ONLY_4X4, bsize == BLOCK_4X4)); + return max_txsize_rect_lookup[bsize]; + } +} + +static AOM_INLINE void parse_decode_block(AV1Decoder *const pbi, + ThreadData *const td, int mi_row, + int mi_col, aom_reader *r, + PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &td->xd; + decode_mbmi_block(pbi, xd, mi_row, mi_col, r, partition, bsize); + + av1_visit_palette(pbi, xd, r, av1_decode_palette_tokens); + + AV1_COMMON *cm = &pbi->common; + const int num_planes = av1_num_planes(cm); + MB_MODE_INFO *mbmi = xd->mi[0]; + int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi); + if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) && + !mbmi->skip && inter_block_tx && !xd->lossless[mbmi->segment_id]) { + const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + const int width = mi_size_wide[bsize]; + const int height = mi_size_high[bsize]; + + for (int idy = 0; idy < height; idy += bh) + for (int idx = 0; idx < width; idx += bw) + read_tx_size_vartx(xd, mbmi, max_tx_size, 0, +#if CONFIG_LPF_MASK + cm, mi_row, mi_col, 1, +#endif + idy, idx, r); + } else { + mbmi->tx_size = + read_tx_size(xd, cm->features.tx_mode, inter_block_tx, !mbmi->skip, r); + if (inter_block_tx) + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, + mbmi->skip && is_inter_block(mbmi), xd); +#if CONFIG_LPF_MASK + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) { + av1_store_bitmask_univariant_tx(cm, mi_row, mi_col, bsize, mbmi); + } else { + for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) { + for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) { + av1_store_bitmask_univariant_tx(cm, mi_row + row, mi_col + col, + BLOCK_64X64, mbmi); + } + } + } +#endif + } +#if CONFIG_LPF_MASK + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) { + av1_store_bitmask_other_info(cm, mi_row, mi_col, bsize, mbmi, 1, 1); + } else { + for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) { + for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) { + av1_store_bitmask_other_info(cm, mi_row + row, mi_col + col, + BLOCK_64X64, mbmi, row == 0, col == 0); + } + } + } +#endif + + if 
(cm->delta_q_info.delta_q_present_flag) { + for (int i = 0; i < MAX_SEGMENTS; i++) { + const int current_qindex = + av1_get_qindex(&cm->seg, i, xd->current_qindex); + const CommonQuantParams *const quant_params = &cm->quant_params; + for (int j = 0; j < num_planes; ++j) { + const int dc_delta_q = j == 0 ? quant_params->y_dc_delta_q + : (j == 1 ? quant_params->u_dc_delta_q + : quant_params->v_dc_delta_q); + const int ac_delta_q = j == 0 ? 0 + : (j == 1 ? quant_params->u_ac_delta_q + : quant_params->v_ac_delta_q); + xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX( + current_qindex, dc_delta_q, cm->seq_params.bit_depth); + xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX( + current_qindex, ac_delta_q, cm->seq_params.bit_depth); + } + } + } + if (mbmi->skip) av1_reset_entropy_context(xd, bsize, num_planes); + + decode_token_recon_block(pbi, td, r, bsize); +} + +static AOM_INLINE void set_offsets_for_pred_and_recon(AV1Decoder *const pbi, + ThreadData *const td, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &pbi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCKD *const xd = &td->xd; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int num_planes = av1_num_planes(cm); + + const int offset = mi_row * mi_params->mi_stride + mi_col; + const TileInfo *const tile = &xd->tile; + + xd->mi = mi_params->mi_grid_base + offset; + xd->tx_type_map = + &mi_params->tx_type_map[mi_row * mi_params->mi_stride + mi_col]; + xd->tx_type_map_stride = mi_params->mi_stride; + + set_plane_n4(xd, bw, bh, num_planes); + + // Distance of Mb to the various image edges. These are specified to 8th pel + // as they are always compared to values that are in 1/8th pel units + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows, + mi_params->mi_cols); + + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, + num_planes); +} + +static AOM_INLINE void decode_block(AV1Decoder *const pbi, ThreadData *const td, + int mi_row, int mi_col, aom_reader *r, + PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + (void)partition; + set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize); + decode_token_recon_block(pbi, td, r, bsize); +} + +static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col, + aom_reader *r, int has_rows, int has_cols, + BLOCK_SIZE bsize) { + const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (!has_rows && !has_cols) return PARTITION_SPLIT; + + assert(ctx >= 0); + aom_cdf_prob *partition_cdf = ec_ctx->partition_cdf[ctx]; + if (has_rows && has_cols) { + return (PARTITION_TYPE)aom_read_symbol( + r, partition_cdf, partition_cdf_length(bsize), ACCT_STR); + } else if (!has_rows && has_cols) { + assert(bsize > BLOCK_8X8); + aom_cdf_prob cdf[2]; + partition_gather_vert_alike(cdf, partition_cdf, bsize); + assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP)); + return aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_HORZ; + } else { + assert(has_rows && !has_cols); + assert(bsize > BLOCK_8X8); + aom_cdf_prob cdf[2]; + partition_gather_horz_alike(cdf, partition_cdf, bsize); + assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP)); + return aom_read_cdf(r, cdf, 2, ACCT_STR) ? 
PARTITION_SPLIT : PARTITION_VERT; + } +} + +// TODO(slavarnway): eliminate bsize and subsize in future commits +static AOM_INLINE void decode_partition(AV1Decoder *const pbi, + ThreadData *const td, int mi_row, + int mi_col, aom_reader *reader, + BLOCK_SIZE bsize, + int parse_decode_flag) { + assert(bsize < BLOCK_SIZES_ALL); + AV1_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &td->xd; + const int bw = mi_size_wide[bsize]; + const int hbs = bw >> 1; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + const int quarter_step = bw / 4; + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); + const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols; + + if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) + return; + + // parse_decode_flag takes the following values : + // 01 - do parse only + // 10 - do decode only + // 11 - do parse and decode + static const block_visitor_fn_t block_visit[4] = { NULL, parse_decode_block, + decode_block, + parse_decode_block }; + + if (parse_decode_flag & 1) { + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + int rcol0, rcol1, rrow0, rrow1; + if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, + &rcol0, &rcol1, &rrow0, &rrow1)) { + const int rstride = cm->rst_info[plane].horz_units_per_tile; + for (int rrow = rrow0; rrow < rrow1; ++rrow) { + for (int rcol = rcol0; rcol < rcol1; ++rcol) { + const int runit_idx = rcol + rrow * rstride; + loop_restoration_read_sb_coeffs(cm, xd, reader, plane, runit_idx); + } + } + } + } + + partition = (bsize < BLOCK_8X8) ? PARTITION_NONE + : read_partition(xd, mi_row, mi_col, reader, + has_rows, has_cols, bsize); + } else { + partition = get_partition(cm, mi_row, mi_col, bsize); + } + subsize = get_partition_subsize(bsize, partition); + if (subsize == BLOCK_INVALID) { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Partition is invalid for block size %dx%d", + block_size_wide[bsize], block_size_high[bsize]); + } + // Check the bitstream is conformant: if there is subsampling on the + // chroma planes, subsize must subsample to a valid block size. 
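+  // For example, a 4x8 luma subsize in a 4:2:2 stream would need a 2x8
+  // chroma block, which does not exist, so get_plane_block_size() returns
+  // BLOCK_INVALID and the frame is rejected as corrupt.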
+ const struct macroblockd_plane *const pd_u = &xd->plane[1]; + if (get_plane_block_size(subsize, pd_u->subsampling_x, pd_u->subsampling_y) == + BLOCK_INVALID) { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Block size %dx%d invalid with this subsampling mode", + block_size_wide[subsize], block_size_high[subsize]); + } + +#define DEC_BLOCK_STX_ARG +#define DEC_BLOCK_EPT_ARG partition, +#define DEC_BLOCK(db_r, db_c, db_subsize) \ + block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \ + reader, DEC_BLOCK_EPT_ARG(db_subsize)) +#define DEC_PARTITION(db_r, db_c, db_subsize) \ + decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \ + (db_subsize), parse_decode_flag) + + switch (partition) { + case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break; + case PARTITION_HORZ: + DEC_BLOCK(mi_row, mi_col, subsize); + if (has_rows) DEC_BLOCK(mi_row + hbs, mi_col, subsize); + break; + case PARTITION_VERT: + DEC_BLOCK(mi_row, mi_col, subsize); + if (has_cols) DEC_BLOCK(mi_row, mi_col + hbs, subsize); + break; + case PARTITION_SPLIT: + DEC_PARTITION(mi_row, mi_col, subsize); + DEC_PARTITION(mi_row, mi_col + hbs, subsize); + DEC_PARTITION(mi_row + hbs, mi_col, subsize); + DEC_PARTITION(mi_row + hbs, mi_col + hbs, subsize); + break; + case PARTITION_HORZ_A: + DEC_BLOCK(mi_row, mi_col, bsize2); + DEC_BLOCK(mi_row, mi_col + hbs, bsize2); + DEC_BLOCK(mi_row + hbs, mi_col, subsize); + break; + case PARTITION_HORZ_B: + DEC_BLOCK(mi_row, mi_col, subsize); + DEC_BLOCK(mi_row + hbs, mi_col, bsize2); + DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2); + break; + case PARTITION_VERT_A: + DEC_BLOCK(mi_row, mi_col, bsize2); + DEC_BLOCK(mi_row + hbs, mi_col, bsize2); + DEC_BLOCK(mi_row, mi_col + hbs, subsize); + break; + case PARTITION_VERT_B: + DEC_BLOCK(mi_row, mi_col, subsize); + DEC_BLOCK(mi_row, mi_col + hbs, bsize2); + DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2); + break; + case PARTITION_HORZ_4: + for (int i = 0; i < 4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= cm->mi_params.mi_rows) break; + DEC_BLOCK(this_mi_row, mi_col, subsize); + } + break; + case PARTITION_VERT_4: + for (int i = 0; i < 4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= cm->mi_params.mi_cols) break; + DEC_BLOCK(mi_row, this_mi_col, subsize); + } + break; + default: assert(0 && "Invalid partition type"); + } + +#undef DEC_PARTITION +#undef DEC_BLOCK +#undef DEC_BLOCK_EPT_ARG +#undef DEC_BLOCK_STX_ARG + + if (parse_decode_flag & 1) + update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); +} + +static AOM_INLINE void setup_bool_decoder( + const uint8_t *data, const uint8_t *data_end, const size_t read_size, + struct aom_internal_error_info *error_info, aom_reader *r, + uint8_t allow_update_cdf) { + // Validate the calculated partition length. If the buffer + // described by the partition can't be fully read, then restrict + // it to the portion that can be (for EC mode) or throw an error. 
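+  // The bounds check below runs before aom_reader_init() so that a
+  // truncated tile is rejected without ever initializing the reader;
+  // allow_update_cdf is then latched on the reader so callers can disable
+  // per-tile CDF adaptation in one place.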
+ if (!read_is_valid(data, read_size, data_end)) + aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile length"); + + if (aom_reader_init(r, data, read_size)) + aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate bool decoder %d", 1); + + r->allow_update_cdf = allow_update_cdf; +} + +static AOM_INLINE void setup_segmentation(AV1_COMMON *const cm, + struct aom_read_bit_buffer *rb) { + struct segmentation *const seg = &cm->seg; + + seg->update_map = 0; + seg->update_data = 0; + seg->temporal_update = 0; + + seg->enabled = aom_rb_read_bit(rb); + if (!seg->enabled) { + if (cm->cur_frame->seg_map) + memset(cm->cur_frame->seg_map, 0, + (cm->mi_params.mi_rows * cm->mi_params.mi_cols)); + + memset(seg, 0, sizeof(*seg)); + segfeatures_copy(&cm->cur_frame->seg, seg); + return; + } + if (cm->seg.enabled && cm->prev_frame && + (cm->mi_params.mi_rows == cm->prev_frame->mi_rows) && + (cm->mi_params.mi_cols == cm->prev_frame->mi_cols)) { + cm->last_frame_seg_map = cm->prev_frame->seg_map; + } else { + cm->last_frame_seg_map = NULL; + } + // Read update flags + if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { + // These frames can't use previous frames, so must signal map + features + seg->update_map = 1; + seg->temporal_update = 0; + seg->update_data = 1; + } else { + seg->update_map = aom_rb_read_bit(rb); + if (seg->update_map) { + seg->temporal_update = aom_rb_read_bit(rb); + } else { + seg->temporal_update = 0; + } + seg->update_data = aom_rb_read_bit(rb); + } + + // Segmentation data update + if (seg->update_data) { + av1_clearall_segfeatures(seg); + + for (int i = 0; i < MAX_SEGMENTS; i++) { + for (int j = 0; j < SEG_LVL_MAX; j++) { + int data = 0; + const int feature_enabled = aom_rb_read_bit(rb); + if (feature_enabled) { + av1_enable_segfeature(seg, i, j); + + const int data_max = av1_seg_feature_data_max(j); + const int data_min = -data_max; + const int ubits = get_unsigned_bits(data_max); + + if (av1_is_segfeature_signed(j)) { + data = aom_rb_read_inv_signed_literal(rb, ubits); + } else { + data = aom_rb_read_literal(rb, ubits); + } + + data = clamp(data, data_min, data_max); + } + av1_set_segdata(seg, i, j, data); + } + } + av1_calculate_segdata(seg); + } else if (cm->prev_frame) { + segfeatures_copy(seg, &cm->prev_frame->seg); + } + segfeatures_copy(&cm->cur_frame->seg, seg); +} + +static AOM_INLINE void decode_restoration_mode(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + assert(!cm->features.all_lossless); + const int num_planes = av1_num_planes(cm); + if (cm->features.allow_intrabc) return; + int all_none = 1, chroma_none = 1; + for (int p = 0; p < num_planes; ++p) { + RestorationInfo *rsi = &cm->rst_info[p]; + if (aom_rb_read_bit(rb)) { + rsi->frame_restoration_type = + aom_rb_read_bit(rb) ? RESTORE_SGRPROJ : RESTORE_WIENER; + } else { + rsi->frame_restoration_type = + aom_rb_read_bit(rb) ? RESTORE_SWITCHABLE : RESTORE_NONE; + } + if (rsi->frame_restoration_type != RESTORE_NONE) { + all_none = 0; + chroma_none &= p == 0; + } + } + if (!all_none) { + assert(cm->seq_params.sb_size == BLOCK_64X64 || + cm->seq_params.sb_size == BLOCK_128X128); + const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 
128 : 64; + + for (int p = 0; p < num_planes; ++p) + cm->rst_info[p].restoration_unit_size = sb_size; + + RestorationInfo *rsi = &cm->rst_info[0]; + + if (sb_size == 64) { + rsi->restoration_unit_size <<= aom_rb_read_bit(rb); + } + if (rsi->restoration_unit_size > 64) { + rsi->restoration_unit_size <<= aom_rb_read_bit(rb); + } + } else { + const int size = RESTORATION_UNITSIZE_MAX; + for (int p = 0; p < num_planes; ++p) + cm->rst_info[p].restoration_unit_size = size; + } + + if (num_planes > 1) { + int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y); + if (s && !chroma_none) { + cm->rst_info[1].restoration_unit_size = + cm->rst_info[0].restoration_unit_size >> (aom_rb_read_bit(rb) * s); + } else { + cm->rst_info[1].restoration_unit_size = + cm->rst_info[0].restoration_unit_size; + } + cm->rst_info[2].restoration_unit_size = + cm->rst_info[1].restoration_unit_size; + } +} + +static AOM_INLINE void read_wiener_filter(int wiener_win, + WienerInfo *wiener_info, + WienerInfo *ref_wiener_info, + aom_reader *rb) { + memset(wiener_info->vfilter, 0, sizeof(wiener_info->vfilter)); + memset(wiener_info->hfilter, 0, sizeof(wiener_info->hfilter)); + + if (wiener_win == WIENER_WIN) + wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, ACCT_STR) + + WIENER_FILT_TAP0_MINV; + else + wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] = 0; + wiener_info->vfilter[1] = wiener_info->vfilter[WIENER_WIN - 2] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) + + WIENER_FILT_TAP1_MINV; + wiener_info->vfilter[2] = wiener_info->vfilter[WIENER_WIN - 3] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) + + WIENER_FILT_TAP2_MINV; + // The central element has an implicit +WIENER_FILT_STEP + wiener_info->vfilter[WIENER_HALFWIN] = + -2 * (wiener_info->vfilter[0] + wiener_info->vfilter[1] + + wiener_info->vfilter[2]); + + if (wiener_win == WIENER_WIN) + wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, ACCT_STR) + + WIENER_FILT_TAP0_MINV; + else + wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] = 0; + wiener_info->hfilter[1] = wiener_info->hfilter[WIENER_WIN - 2] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) + + WIENER_FILT_TAP1_MINV; + wiener_info->hfilter[2] = wiener_info->hfilter[WIENER_WIN - 3] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) + + WIENER_FILT_TAP2_MINV; + // The central element has an implicit +WIENER_FILT_STEP + wiener_info->hfilter[WIENER_HALFWIN] = + -2 * (wiener_info->hfilter[0] + wiener_info->hfilter[1] + + wiener_info->hfilter[2]); + memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info)); +} + +static AOM_INLINE void 
read_sgrproj_filter(SgrprojInfo *sgrproj_info, + SgrprojInfo *ref_sgrproj_info, + aom_reader *rb) { + sgrproj_info->ep = aom_read_literal(rb, SGRPROJ_PARAMS_BITS, ACCT_STR); + const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep]; + + if (params->r[0] == 0) { + sgrproj_info->xqd[0] = 0; + sgrproj_info->xqd[1] = + aom_read_primitive_refsubexpfin( + rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) + + SGRPROJ_PRJ_MIN1; + } else if (params->r[1] == 0) { + sgrproj_info->xqd[0] = + aom_read_primitive_refsubexpfin( + rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) + + SGRPROJ_PRJ_MIN0; + sgrproj_info->xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - sgrproj_info->xqd[0], + SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1); + } else { + sgrproj_info->xqd[0] = + aom_read_primitive_refsubexpfin( + rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) + + SGRPROJ_PRJ_MIN0; + sgrproj_info->xqd[1] = + aom_read_primitive_refsubexpfin( + rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) + + SGRPROJ_PRJ_MIN1; + } + + memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info)); +} + +static AOM_INLINE void loop_restoration_read_sb_coeffs( + const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane, + int runit_idx) { + const RestorationInfo *rsi = &cm->rst_info[plane]; + RestorationUnitInfo *rui = &rsi->unit_info[runit_idx]; + if (rsi->frame_restoration_type == RESTORE_NONE) return; + + assert(!cm->features.all_lossless); + + const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN; + WienerInfo *wiener_info = xd->wiener_info + plane; + SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane; + + if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) { + rui->restoration_type = + aom_read_symbol(r, xd->tile_ctx->switchable_restore_cdf, + RESTORE_SWITCHABLE_TYPES, ACCT_STR); + switch (rui->restoration_type) { + case RESTORE_WIENER: + read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r); + break; + case RESTORE_SGRPROJ: + read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r); + break; + default: assert(rui->restoration_type == RESTORE_NONE); break; + } + } else if (rsi->frame_restoration_type == RESTORE_WIENER) { + if (aom_read_symbol(r, xd->tile_ctx->wiener_restore_cdf, 2, ACCT_STR)) { + rui->restoration_type = RESTORE_WIENER; + read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r); + } else { + rui->restoration_type = RESTORE_NONE; + } + } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) { + if (aom_read_symbol(r, xd->tile_ctx->sgrproj_restore_cdf, 2, ACCT_STR)) { + rui->restoration_type = RESTORE_SGRPROJ; + read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r); + } else { + rui->restoration_type = RESTORE_NONE; + } + } +} + +static AOM_INLINE void setup_loopfilter(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + const int num_planes = av1_num_planes(cm); + struct loopfilter *lf = &cm->lf; + + if (cm->features.allow_intrabc || cm->features.coded_lossless) { + // write default deltas to frame buffer + av1_set_default_ref_deltas(cm->cur_frame->ref_deltas); + av1_set_default_mode_deltas(cm->cur_frame->mode_deltas); + return; + } + assert(!cm->features.coded_lossless); + if (cm->prev_frame) { + // write deltas to frame buffer + memcpy(lf->ref_deltas, 
cm->prev_frame->ref_deltas, REF_FRAMES); + memcpy(lf->mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS); + } else { + av1_set_default_ref_deltas(lf->ref_deltas); + av1_set_default_mode_deltas(lf->mode_deltas); + } + lf->filter_level[0] = aom_rb_read_literal(rb, 6); + lf->filter_level[1] = aom_rb_read_literal(rb, 6); + if (num_planes > 1) { + if (lf->filter_level[0] || lf->filter_level[1]) { + lf->filter_level_u = aom_rb_read_literal(rb, 6); + lf->filter_level_v = aom_rb_read_literal(rb, 6); + } + } + lf->sharpness_level = aom_rb_read_literal(rb, 3); + + // Read in loop filter deltas applied at the MB level based on mode or ref + // frame. + lf->mode_ref_delta_update = 0; + + lf->mode_ref_delta_enabled = aom_rb_read_bit(rb); + if (lf->mode_ref_delta_enabled) { + lf->mode_ref_delta_update = aom_rb_read_bit(rb); + if (lf->mode_ref_delta_update) { + for (int i = 0; i < REF_FRAMES; i++) + if (aom_rb_read_bit(rb)) + lf->ref_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6); + + for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) + if (aom_rb_read_bit(rb)) + lf->mode_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6); + } + } + + // write deltas to frame buffer + memcpy(cm->cur_frame->ref_deltas, lf->ref_deltas, REF_FRAMES); + memcpy(cm->cur_frame->mode_deltas, lf->mode_deltas, MAX_MODE_LF_DELTAS); +} + +static AOM_INLINE void setup_cdef(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + const int num_planes = av1_num_planes(cm); + CdefInfo *const cdef_info = &cm->cdef_info; + + if (cm->features.allow_intrabc) return; + cdef_info->cdef_damping = aom_rb_read_literal(rb, 2) + 3; + cdef_info->cdef_bits = aom_rb_read_literal(rb, 2); + cdef_info->nb_cdef_strengths = 1 << cdef_info->cdef_bits; + for (int i = 0; i < cdef_info->nb_cdef_strengths; i++) { + cdef_info->cdef_strengths[i] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS); + cdef_info->cdef_uv_strengths[i] = + num_planes > 1 ? aom_rb_read_literal(rb, CDEF_STRENGTH_BITS) : 0; + } +} + +static INLINE int read_delta_q(struct aom_read_bit_buffer *rb) { + return aom_rb_read_bit(rb) ? 
aom_rb_read_inv_signed_literal(rb, 6) : 0; +} + +static AOM_INLINE void setup_quantization(CommonQuantParams *quant_params, + int num_planes, + bool separate_uv_delta_q, + struct aom_read_bit_buffer *rb) { + quant_params->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS); + quant_params->y_dc_delta_q = read_delta_q(rb); + if (num_planes > 1) { + int diff_uv_delta = 0; + if (separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb); + quant_params->u_dc_delta_q = read_delta_q(rb); + quant_params->u_ac_delta_q = read_delta_q(rb); + if (diff_uv_delta) { + quant_params->v_dc_delta_q = read_delta_q(rb); + quant_params->v_ac_delta_q = read_delta_q(rb); + } else { + quant_params->v_dc_delta_q = quant_params->u_dc_delta_q; + quant_params->v_ac_delta_q = quant_params->u_ac_delta_q; + } + } else { + quant_params->u_dc_delta_q = 0; + quant_params->u_ac_delta_q = 0; + quant_params->v_dc_delta_q = 0; + quant_params->v_ac_delta_q = 0; + } + quant_params->using_qmatrix = aom_rb_read_bit(rb); + if (quant_params->using_qmatrix) { + quant_params->qmatrix_level_y = aom_rb_read_literal(rb, QM_LEVEL_BITS); + quant_params->qmatrix_level_u = aom_rb_read_literal(rb, QM_LEVEL_BITS); + if (!separate_uv_delta_q) + quant_params->qmatrix_level_v = quant_params->qmatrix_level_u; + else + quant_params->qmatrix_level_v = aom_rb_read_literal(rb, QM_LEVEL_BITS); + } else { + quant_params->qmatrix_level_y = 0; + quant_params->qmatrix_level_u = 0; + quant_params->qmatrix_level_v = 0; + } +} + +// Build y/uv dequant values based on segmentation. +static AOM_INLINE void setup_segmentation_dequant(AV1_COMMON *const cm, + MACROBLOCKD *const xd) { + const int bit_depth = cm->seq_params.bit_depth; + // When segmentation is disabled, only the first value is used. The + // remaining are don't cares. + const int max_segments = cm->seg.enabled ? MAX_SEGMENTS : 1; + CommonQuantParams *const quant_params = &cm->quant_params; + for (int i = 0; i < max_segments; ++i) { + const int qindex = xd->qindex[i]; + quant_params->y_dequant_QTX[i][0] = + av1_dc_quant_QTX(qindex, quant_params->y_dc_delta_q, bit_depth); + quant_params->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, bit_depth); + quant_params->u_dequant_QTX[i][0] = + av1_dc_quant_QTX(qindex, quant_params->u_dc_delta_q, bit_depth); + quant_params->u_dequant_QTX[i][1] = + av1_ac_quant_QTX(qindex, quant_params->u_ac_delta_q, bit_depth); + quant_params->v_dequant_QTX[i][0] = + av1_dc_quant_QTX(qindex, quant_params->v_dc_delta_q, bit_depth); + quant_params->v_dequant_QTX[i][1] = + av1_ac_quant_QTX(qindex, quant_params->v_ac_delta_q, bit_depth); + const int use_qmatrix = av1_use_qmatrix(quant_params, xd, i); + // NB: depends on base index so there is only 1 set per frame + // No quant weighting when lossless or signalled not using QM + const int qmlevel_y = + use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1; + for (int j = 0; j < TX_SIZES_ALL; ++j) { + quant_params->y_iqmatrix[i][j] = + av1_iqmatrix(quant_params, qmlevel_y, AOM_PLANE_Y, j); + } + const int qmlevel_u = + use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1; + for (int j = 0; j < TX_SIZES_ALL; ++j) { + quant_params->u_iqmatrix[i][j] = + av1_iqmatrix(quant_params, qmlevel_u, AOM_PLANE_U, j); + } + const int qmlevel_v = + use_qmatrix ? 
quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1; + for (int j = 0; j < TX_SIZES_ALL; ++j) { + quant_params->v_iqmatrix[i][j] = + av1_iqmatrix(quant_params, qmlevel_v, AOM_PLANE_V, j); + } + } +} + +static InterpFilter read_frame_interp_filter(struct aom_read_bit_buffer *rb) { + return aom_rb_read_bit(rb) ? SWITCHABLE + : aom_rb_read_literal(rb, LOG_SWITCHABLE_FILTERS); +} + +static AOM_INLINE void setup_render_size(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + cm->render_width = cm->superres_upscaled_width; + cm->render_height = cm->superres_upscaled_height; + if (aom_rb_read_bit(rb)) + av1_read_frame_size(rb, 16, 16, &cm->render_width, &cm->render_height); +} + +// TODO(afergs): make "struct aom_read_bit_buffer *const rb"? +static AOM_INLINE void setup_superres(AV1_COMMON *const cm, + struct aom_read_bit_buffer *rb, + int *width, int *height) { + cm->superres_upscaled_width = *width; + cm->superres_upscaled_height = *height; + + const SequenceHeader *const seq_params = &cm->seq_params; + if (!seq_params->enable_superres) return; + + if (aom_rb_read_bit(rb)) { + cm->superres_scale_denominator = + (uint8_t)aom_rb_read_literal(rb, SUPERRES_SCALE_BITS); + cm->superres_scale_denominator += SUPERRES_SCALE_DENOMINATOR_MIN; + // Don't edit cm->width or cm->height directly, or the buffers won't get + // resized correctly + av1_calculate_scaled_superres_size(width, height, + cm->superres_scale_denominator); + } else { + // 1:1 scaling - ie. no scaling, scale not provided + cm->superres_scale_denominator = SCALE_NUMERATOR; + } +} + +static AOM_INLINE void resize_context_buffers(AV1_COMMON *cm, int width, + int height) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Dimensions of %dx%d beyond allowed size of %dx%d.", + width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT); +#endif + if (cm->width != width || cm->height != height) { + const int new_mi_rows = + ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2; + const int new_mi_cols = + ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2) >> MI_SIZE_LOG2; + + // Allocations in av1_alloc_context_buffers() depend on individual + // dimensions as well as the overall size. + if (new_mi_cols > cm->mi_params.mi_cols || + new_mi_rows > cm->mi_params.mi_rows) { + if (av1_alloc_context_buffers(cm, width, height)) { + // The cm->mi_* values have been cleared and any existing context + // buffers have been freed. Clear cm->width and cm->height to be + // consistent and to force a realloc next time. 
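+        // Zeroing the dimensions makes the next resize_context_buffers()
+        // call see a size change and take the allocation path again rather
+        // than reuse the freed buffers.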
+ cm->width = 0; + cm->height = 0; + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + } else { + cm->mi_params.set_mb_mi(&cm->mi_params, width, height); + } + av1_init_mi_buffers(&cm->mi_params); + cm->width = width; + cm->height = height; + } + + ensure_mv_buffer(cm->cur_frame, cm); + cm->cur_frame->width = cm->width; + cm->cur_frame->height = cm->height; +} + +static AOM_INLINE void setup_buffer_pool(AV1_COMMON *cm) { + BufferPool *const pool = cm->buffer_pool; + const SequenceHeader *const seq_params = &cm->seq_params; + + lock_buffer_pool(pool); + if (aom_realloc_frame_buffer( + &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_DEC_BORDER_IN_PIXELS, cm->features.byte_alignment, + &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { + unlock_buffer_pool(pool); + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + unlock_buffer_pool(pool); + + cm->cur_frame->buf.bit_depth = (unsigned int)seq_params->bit_depth; + cm->cur_frame->buf.color_primaries = seq_params->color_primaries; + cm->cur_frame->buf.transfer_characteristics = + seq_params->transfer_characteristics; + cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients; + cm->cur_frame->buf.monochrome = seq_params->monochrome; + cm->cur_frame->buf.chroma_sample_position = + seq_params->chroma_sample_position; + cm->cur_frame->buf.color_range = seq_params->color_range; + cm->cur_frame->buf.render_width = cm->render_width; + cm->cur_frame->buf.render_height = cm->render_height; +} + +static AOM_INLINE void setup_frame_size(AV1_COMMON *cm, + int frame_size_override_flag, + struct aom_read_bit_buffer *rb) { + const SequenceHeader *const seq_params = &cm->seq_params; + int width, height; + + if (frame_size_override_flag) { + int num_bits_width = seq_params->num_bits_width; + int num_bits_height = seq_params->num_bits_height; + av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height); + if (width > seq_params->max_frame_width || + height > seq_params->max_frame_height) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Frame dimensions are larger than the maximum values"); + } + } else { + width = seq_params->max_frame_width; + height = seq_params->max_frame_height; + } + + setup_superres(cm, rb, &width, &height); + resize_context_buffers(cm, width, height); + setup_render_size(cm, rb); + setup_buffer_pool(cm); +} + +static AOM_INLINE void setup_sb_size(SequenceHeader *seq_params, + struct aom_read_bit_buffer *rb) { + set_sb_size(seq_params, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64); +} + +static INLINE int valid_ref_frame_img_fmt(aom_bit_depth_t ref_bit_depth, + int ref_xss, int ref_yss, + aom_bit_depth_t this_bit_depth, + int this_xss, int this_yss) { + return ref_bit_depth == this_bit_depth && ref_xss == this_xss && + ref_yss == this_yss; +} + +static AOM_INLINE void setup_frame_size_with_refs( + AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { + int width, height; + int found = 0; + int has_valid_ref_frame = 0; + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + if (aom_rb_read_bit(rb)) { + const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i); + // This will never be NULL in a normal stream, as streams are required to + // have a shown keyframe before any inter frames, which would refresh all + // the reference buffers. 
However, it might be null if we're starting in + // the middle of a stream, and static analysis will error if we don't do + // a null check here. + if (ref_buf == NULL) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid condition: invalid reference buffer"); + } else { + const YV12_BUFFER_CONFIG *const buf = &ref_buf->buf; + width = buf->y_crop_width; + height = buf->y_crop_height; + cm->render_width = buf->render_width; + cm->render_height = buf->render_height; + setup_superres(cm, rb, &width, &height); + resize_context_buffers(cm, width, height); + found = 1; + break; + } + } + } + + const SequenceHeader *const seq_params = &cm->seq_params; + if (!found) { + int num_bits_width = seq_params->num_bits_width; + int num_bits_height = seq_params->num_bits_height; + + av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height); + setup_superres(cm, rb, &width, &height); + resize_context_buffers(cm, width, height); + setup_render_size(cm, rb); + } + + if (width <= 0 || height <= 0) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid frame size"); + + // Check to make sure at least one of the frames that this frame references + // has valid dimensions. + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i); + has_valid_ref_frame |= + valid_ref_frame_size(ref_frame->buf.y_crop_width, + ref_frame->buf.y_crop_height, width, height); + } + if (!has_valid_ref_frame) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Referenced frame has invalid size"); + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i); + if (!valid_ref_frame_img_fmt( + ref_frame->buf.bit_depth, ref_frame->buf.subsampling_x, + ref_frame->buf.subsampling_y, seq_params->bit_depth, + seq_params->subsampling_x, seq_params->subsampling_y)) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Referenced frame has incompatible color format"); + } + setup_buffer_pool(cm); +} + +// Same function as av1_read_uniform but reading from the uncompressed header rb +static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) { + const int l = get_unsigned_bits(n); + const int m = (1 << l) - n; + const int v = aom_rb_read_literal(rb, l - 1); + assert(l != 0); + if (v < m) + return v; + else + return (v << 1) - m + aom_rb_read_bit(rb); +} + +static AOM_INLINE void read_tile_info_max_tile( + AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) { + const SequenceHeader *const seq_params = &cm->seq_params; + CommonTileParams *const tiles = &cm->tiles; + int width_mi = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2); + int height_mi = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2); + int width_sb = width_mi >> seq_params->mib_size_log2; + int height_sb = height_mi >> seq_params->mib_size_log2; + + av1_get_tile_limits(cm); + tiles->uniform_spacing = aom_rb_read_bit(rb); + + // Read tile columns + if (tiles->uniform_spacing) { + tiles->log2_cols = tiles->min_log2_cols; + while (tiles->log2_cols < tiles->max_log2_cols) { + if (!aom_rb_read_bit(rb)) { + break; + } + tiles->log2_cols++; + } + } else { + int i; + int start_sb; + for (i = 0, start_sb = 0; width_sb > 0 && i < MAX_TILE_COLS; i++) { + const int size_sb = + 1 + rb_read_uniform(rb, AOMMIN(width_sb, tiles->max_width_sb)); + tiles->col_start_sb[i] = start_sb; + start_sb += size_sb; + width_sb -= size_sb; + } + tiles->cols = i; + 
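+    // Worked example: with width_sb = 10 and coded widths 4 and 6, the loop
+    // leaves col_start_sb = { 0, 4 } and cols = 2; the assignment below then
+    // stores the sentinel 10, marking the right frame edge in superblocks.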
tiles->col_start_sb[i] = start_sb + width_sb; + } + av1_calculate_tile_cols(seq_params, cm->mi_params.mi_rows, + cm->mi_params.mi_cols, tiles); + + // Read tile rows + if (tiles->uniform_spacing) { + tiles->log2_rows = tiles->min_log2_rows; + while (tiles->log2_rows < tiles->max_log2_rows) { + if (!aom_rb_read_bit(rb)) { + break; + } + tiles->log2_rows++; + } + } else { + int i; + int start_sb; + for (i = 0, start_sb = 0; height_sb > 0 && i < MAX_TILE_ROWS; i++) { + const int size_sb = + 1 + rb_read_uniform(rb, AOMMIN(height_sb, tiles->max_height_sb)); + tiles->row_start_sb[i] = start_sb; + start_sb += size_sb; + height_sb -= size_sb; + } + tiles->rows = i; + tiles->row_start_sb[i] = start_sb + height_sb; + } + av1_calculate_tile_rows(seq_params, cm->mi_params.mi_rows, tiles); +} + +void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm) { + cm->tiles.single_tile_decoding = 0; + if (cm->tiles.large_scale) { + struct loopfilter *lf = &cm->lf; + RestorationInfo *const rst_info = cm->rst_info; + const CdefInfo *const cdef_info = &cm->cdef_info; + + // Figure out single_tile_decoding by loopfilter_level. + const int no_loopfilter = !(lf->filter_level[0] || lf->filter_level[1]); + const int no_cdef = cdef_info->cdef_bits == 0 && + cdef_info->cdef_strengths[0] == 0 && + cdef_info->cdef_uv_strengths[0] == 0; + const int no_restoration = + rst_info[0].frame_restoration_type == RESTORE_NONE && + rst_info[1].frame_restoration_type == RESTORE_NONE && + rst_info[2].frame_restoration_type == RESTORE_NONE; + assert(IMPLIES(cm->features.coded_lossless, no_loopfilter && no_cdef)); + assert(IMPLIES(cm->features.all_lossless, no_restoration)); + cm->tiles.single_tile_decoding = no_loopfilter && no_cdef && no_restoration; + } +} + +static AOM_INLINE void read_tile_info(AV1Decoder *const pbi, + struct aom_read_bit_buffer *const rb) { + AV1_COMMON *const cm = &pbi->common; + + read_tile_info_max_tile(cm, rb); + + pbi->context_update_tile_id = 0; + if (cm->tiles.rows * cm->tiles.cols > 1) { + // tile to use for cdf update + pbi->context_update_tile_id = + aom_rb_read_literal(rb, cm->tiles.log2_rows + cm->tiles.log2_cols); + if (pbi->context_update_tile_id >= cm->tiles.rows * cm->tiles.cols) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid context_update_tile_id"); + } + // tile size magnitude + pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1; + } +} + +#if EXT_TILE_DEBUG +static AOM_INLINE void read_ext_tile_info( + AV1Decoder *const pbi, struct aom_read_bit_buffer *const rb) { + AV1_COMMON *const cm = &pbi->common; + + // This information is stored as a separate byte. + int mod = rb->bit_offset % CHAR_BIT; + if (mod > 0) aom_rb_read_literal(rb, CHAR_BIT - mod); + assert(rb->bit_offset % CHAR_BIT == 0); + + if (cm->tiles.cols * cm->tiles.rows > 1) { + // Read the number of bytes used to store tile size + pbi->tile_col_size_bytes = aom_rb_read_literal(rb, 2) + 1; + pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1; + } +} +#endif // EXT_TILE_DEBUG + +static size_t mem_get_varsize(const uint8_t *src, int sz) { + switch (sz) { + case 1: return src[0]; + case 2: return mem_get_le16(src); + case 3: return mem_get_le24(src); + case 4: return mem_get_le32(src); + default: assert(0 && "Invalid size"); return -1; + } +} + +#if EXT_TILE_DEBUG +// Reads the next tile returning its size and adjusting '*data' accordingly +// based on 'is_last'. On return, '*data' is updated to point to the end of the +// raw tile buffer in the bit stream. 
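+// When tile_copy_mode is set, a size word whose top bit is set is not a
+// length: the low 7 bits of its top byte give a row offset, and the tile
+// reuses the coded data of the tile that many rows above in the same column.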
+static AOM_INLINE void get_ls_tile_buffer( + const uint8_t *const data_end, struct aom_internal_error_info *error_info, + const uint8_t **data, TileBufferDec (*const tile_buffers)[MAX_TILE_COLS], + int tile_size_bytes, int col, int row, int tile_copy_mode) { + size_t size; + + size_t copy_size = 0; + const uint8_t *copy_data = NULL; + + if (!read_is_valid(*data, tile_size_bytes, data_end)) + aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile length"); + size = mem_get_varsize(*data, tile_size_bytes); + + // If tile_copy_mode = 1, then the top bit of the tile header indicates copy + // mode. + if (tile_copy_mode && (size >> (tile_size_bytes * 8 - 1)) == 1) { + // The remaining bits in the top byte signal the row offset + int offset = (size >> (tile_size_bytes - 1) * 8) & 0x7f; + + // Currently, only use tiles in same column as reference tiles. + copy_data = tile_buffers[row - offset][col].data; + copy_size = tile_buffers[row - offset][col].size; + size = 0; + } else { + size += AV1_MIN_TILE_SIZE_BYTES; + } + + *data += tile_size_bytes; + + if (size > (size_t)(data_end - *data)) + aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile size"); + + if (size > 0) { + tile_buffers[row][col].data = *data; + tile_buffers[row][col].size = size; + } else { + tile_buffers[row][col].data = copy_data; + tile_buffers[row][col].size = copy_size; + } + + *data += size; +} + +// Returns the end of the last tile buffer +// (tile_buffers[cm->tiles.rows - 1][cm->tiles.cols - 1]). +static const uint8_t *get_ls_tile_buffers( + AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, + TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) { + AV1_COMMON *const cm = &pbi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + const int have_tiles = tile_cols * tile_rows > 1; + const uint8_t *raw_data_end; // The end of the last tile buffer + + if (!have_tiles) { + const size_t tile_size = data_end - data; + tile_buffers[0][0].data = data; + tile_buffers[0][0].size = tile_size; + raw_data_end = NULL; + } else { + // We locate only the tile buffers that are required, which are the ones + // specified by pbi->dec_tile_col and pbi->dec_tile_row. Also, we always + // need the last (bottom right) tile buffer, as we need to know where the + // end of the compressed frame buffer is for proper superframe decoding. + + const uint8_t *tile_col_data_end[MAX_TILE_COLS] = { NULL }; + const uint8_t *const data_start = data; + + const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); + const int single_row = pbi->dec_tile_row >= 0; + const int tile_rows_start = single_row ? dec_tile_row : 0; + const int tile_rows_end = single_row ? tile_rows_start + 1 : tile_rows; + const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); + const int single_col = pbi->dec_tile_col >= 0; + const int tile_cols_start = single_col ? dec_tile_col : 0; + const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols; + + const int tile_col_size_bytes = pbi->tile_col_size_bytes; + const int tile_size_bytes = pbi->tile_size_bytes; + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + const int tile_copy_mode = + ((AOMMAX(tile_width, tile_height) << MI_SIZE_LOG2) <= 256) ? 
1 : 0; + // Read tile column sizes for all columns (we need the last tile buffer) + for (int c = 0; c < tile_cols; ++c) { + const int is_last = c == tile_cols - 1; + size_t tile_col_size; + + if (!is_last) { + tile_col_size = mem_get_varsize(data, tile_col_size_bytes); + data += tile_col_size_bytes; + tile_col_data_end[c] = data + tile_col_size; + } else { + tile_col_size = data_end - data; + tile_col_data_end[c] = data_end; + } + data += tile_col_size; + } + + data = data_start; + + // Read the required tile sizes. + for (int c = tile_cols_start; c < tile_cols_end; ++c) { + const int is_last = c == tile_cols - 1; + + if (c > 0) data = tile_col_data_end[c - 1]; + + if (!is_last) data += tile_col_size_bytes; + + // Get the whole of the last column, otherwise stop at the required tile. + for (int r = 0; r < (is_last ? tile_rows : tile_rows_end); ++r) { + get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data, + tile_buffers, tile_size_bytes, c, r, tile_copy_mode); + } + } + + // If we have not read the last column, then read it to get the last tile. + if (tile_cols_end != tile_cols) { + const int c = tile_cols - 1; + + data = tile_col_data_end[c - 1]; + + for (int r = 0; r < tile_rows; ++r) { + get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data, + tile_buffers, tile_size_bytes, c, r, tile_copy_mode); + } + } + raw_data_end = data; + } + return raw_data_end; +} +#endif // EXT_TILE_DEBUG + +static const uint8_t *get_ls_single_tile_buffer( + AV1Decoder *pbi, const uint8_t *data, + TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) { + assert(pbi->dec_tile_row >= 0 && pbi->dec_tile_col >= 0); + tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].data = data; + tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].size = + (size_t)pbi->coded_tile_data_size; + return data + pbi->coded_tile_data_size; +} + +// Reads the next tile returning its size and adjusting '*data' accordingly +// based on 'is_last'.
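+// For illustration (hypothetical header bytes): with tile_size_bytes == 2,
+// the bytes { 0x2a, 0x01 } decode little-endian to 0x012a == 298, so the
+// tile payload occupies 298 + AV1_MIN_TILE_SIZE_BYTES bytes after the size
+// field. The last tile carries no size field; its payload simply runs to
+// data_end.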
+static AOM_INLINE void get_tile_buffer( + const uint8_t *const data_end, const int tile_size_bytes, int is_last, + struct aom_internal_error_info *error_info, const uint8_t **data, + TileBufferDec *const buf) { + size_t size; + + if (!is_last) { + if (!read_is_valid(*data, tile_size_bytes, data_end)) + aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, + "Not enough data to read tile size"); + + size = mem_get_varsize(*data, tile_size_bytes) + AV1_MIN_TILE_SIZE_BYTES; + *data += tile_size_bytes; + + if (size > (size_t)(data_end - *data)) + aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile size"); + } else { + size = data_end - *data; + } + + buf->data = *data; + buf->size = size; + + *data += size; +} + +static AOM_INLINE void get_tile_buffers( + AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, + TileBufferDec (*const tile_buffers)[MAX_TILE_COLS], int start_tile, + int end_tile) { + AV1_COMMON *const cm = &pbi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int tc = 0; + + for (int r = 0; r < tile_rows; ++r) { + for (int c = 0; c < tile_cols; ++c, ++tc) { + TileBufferDec *const buf = &tile_buffers[r][c]; + + const int is_last = (tc == end_tile); + const size_t hdr_offset = 0; + + if (tc < start_tile || tc > end_tile) continue; + + if (data + hdr_offset >= data_end) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Data ended before all tiles were read."); + data += hdr_offset; + get_tile_buffer(data_end, pbi->tile_size_bytes, is_last, + &pbi->common.error, &data, buf); + } + } +} + +static AOM_INLINE void set_cb_buffer(AV1Decoder *pbi, MACROBLOCKD *const xd, + CB_BUFFER *cb_buffer_base, + const int num_planes, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &pbi->common; + int mib_size_log2 = cm->seq_params.mib_size_log2; + int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1; + int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); + CB_BUFFER *cb_buffer = cb_buffer_base + offset; + + for (int plane = 0; plane < num_planes; ++plane) { + xd->plane[plane].dqcoeff_block = cb_buffer->dqcoeff[plane]; + xd->plane[plane].eob_data = cb_buffer->eob_data[plane]; + xd->cb_offset[plane] = 0; + xd->txb_offset[plane] = 0; + } + xd->plane[0].color_index_map = cb_buffer->color_index_map[0]; + xd->plane[1].color_index_map = cb_buffer->color_index_map[1]; + xd->color_index_map_offset[0] = 0; + xd->color_index_map_offset[1] = 0; +} + +static AOM_INLINE void decoder_alloc_tile_data(AV1Decoder *pbi, + const int n_tiles) { + AV1_COMMON *const cm = &pbi->common; + aom_free(pbi->tile_data); + CHECK_MEM_ERROR(cm, pbi->tile_data, + aom_memalign(32, n_tiles * sizeof(*pbi->tile_data))); + pbi->allocated_tiles = n_tiles; + for (int i = 0; i < n_tiles; i++) { + TileDataDec *const tile_data = pbi->tile_data + i; + av1_zero(tile_data->dec_row_mt_sync); + } + pbi->allocated_row_mt_sync_rows = 0; +} + +// Set up nsync by width. +static INLINE int get_sync_range(int width) { +// nsync numbers are picked by testing. 
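+// For illustration: with the disabled table below, a 1920-pixel-wide frame
+// would use nsync == 4, i.e. a superblock row could only advance once the
+// row above it is at least four superblocks ahead (see sync_read() below);
+// the shipped code always returns 1.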
+#if 0 + if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +#else + (void)width; +#endif + return 1; +} + +// Allocate memory for decoder row synchronization +static AOM_INLINE void dec_row_mt_alloc(AV1DecRowMTSync *dec_row_mt_sync, + AV1_COMMON *cm, int rows) { + dec_row_mt_sync->allocated_sb_rows = rows; +#if CONFIG_MULTITHREAD + { + int i; + + CHECK_MEM_ERROR(cm, dec_row_mt_sync->mutex_, + aom_malloc(sizeof(*(dec_row_mt_sync->mutex_)) * rows)); + if (dec_row_mt_sync->mutex_) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&dec_row_mt_sync->mutex_[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, dec_row_mt_sync->cond_, + aom_malloc(sizeof(*(dec_row_mt_sync->cond_)) * rows)); + if (dec_row_mt_sync->cond_) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&dec_row_mt_sync->cond_[i], NULL); + } + } + } +#endif // CONFIG_MULTITHREAD + + CHECK_MEM_ERROR(cm, dec_row_mt_sync->cur_sb_col, + aom_malloc(sizeof(*(dec_row_mt_sync->cur_sb_col)) * rows)); + + // Set up nsync. + dec_row_mt_sync->sync_range = get_sync_range(cm->width); +} + +// Deallocate decoder row synchronization related mutex and data +void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync) { + if (dec_row_mt_sync != NULL) { +#if CONFIG_MULTITHREAD + int i; + if (dec_row_mt_sync->mutex_ != NULL) { + for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) { + pthread_mutex_destroy(&dec_row_mt_sync->mutex_[i]); + } + aom_free(dec_row_mt_sync->mutex_); + } + if (dec_row_mt_sync->cond_ != NULL) { + for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) { + pthread_cond_destroy(&dec_row_mt_sync->cond_[i]); + } + aom_free(dec_row_mt_sync->cond_); + } +#endif // CONFIG_MULTITHREAD + aom_free(dec_row_mt_sync->cur_sb_col); + + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail.
+ av1_zero(*dec_row_mt_sync); + } +} + +static INLINE void sync_read(AV1DecRowMTSync *const dec_row_mt_sync, int r, + int c) { +#if CONFIG_MULTITHREAD + const int nsync = dec_row_mt_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &dec_row_mt_sync->mutex_[r - 1]; + pthread_mutex_lock(mutex); + + while (c > dec_row_mt_sync->cur_sb_col[r - 1] - nsync) { + pthread_cond_wait(&dec_row_mt_sync->cond_[r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)dec_row_mt_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r, + int c, const int sb_cols) { +#if CONFIG_MULTITHREAD + const int nsync = dec_row_mt_sync->sync_range; + int cur; + int sig = 1; + + if (c < sb_cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = sb_cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&dec_row_mt_sync->mutex_[r]); + + dec_row_mt_sync->cur_sb_col[r] = cur; + + pthread_cond_signal(&dec_row_mt_sync->cond_[r]); + pthread_mutex_unlock(&dec_row_mt_sync->mutex_[r]); + } +#else + (void)dec_row_mt_sync; + (void)r; + (void)c; + (void)sb_cols; +#endif // CONFIG_MULTITHREAD +} + +static AOM_INLINE void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td, + TileInfo tile_info, + const int mi_row) { + AV1_COMMON *const cm = &pbi->common; + const int num_planes = av1_num_planes(cm); + TileDataDec *const tile_data = + pbi->tile_data + tile_info.tile_row * cm->tiles.cols + tile_info.tile_col; + const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info); + const int sb_row_in_tile = + (mi_row - tile_info.mi_row_start) >> cm->seq_params.mib_size_log2; + int sb_col_in_tile = 0; + + for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; + mi_col += cm->seq_params.mib_size, sb_col_in_tile++) { + set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row, + mi_col); + + sync_read(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile); + + // Decoding of the super-block + decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, + cm->seq_params.sb_size, 0x2); + + sync_write(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile, + sb_cols_in_tile); + } +} + +static int check_trailing_bits_after_symbol_coder(aom_reader *r) { + if (aom_reader_has_overflowed(r)) return -1; + + uint32_t nb_bits = aom_reader_tell(r); + uint32_t nb_bytes = (nb_bits + 7) >> 3; + const uint8_t *p = aom_reader_find_begin(r) + nb_bytes; + + // aom_reader_tell() returns 1 for a newly initialized decoder, and the + // return value only increases as values are decoded. So nb_bits > 0, and + // thus p > p_begin. Therefore accessing p[-1] is safe. + uint8_t last_byte = p[-1]; + uint8_t pattern = 128 >> ((nb_bits - 1) & 7); + if ((last_byte & (2 * pattern - 1)) != pattern) return -1; + + // Make sure that all padding bytes are zero as required by the spec. 
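+  // For illustration: if aom_reader_tell() reported 11 bits consumed,
+  // nb_bytes would be 2 and pattern 128 >> 2 == 0x20, so the check above
+  // requires the low six bits of the last byte to be exactly 0b100000 (the
+  // trailing one bit followed by zeros), and every byte from here up to
+  // aom_reader_find_end() must be 0x00.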
+ const uint8_t *p_end = aom_reader_find_end(r); + while (p < p_end) { + if (*p != 0) return -1; + p++; + } + return 0; +} + +static AOM_INLINE void set_decode_func_pointers(ThreadData *td, + int parse_decode_flag) { + td->read_coeffs_tx_intra_block_visit = decode_block_void; + td->predict_and_recon_intra_block_visit = decode_block_void; + td->read_coeffs_tx_inter_block_visit = decode_block_void; + td->inverse_tx_inter_block_visit = decode_block_void; + td->predict_inter_block_visit = predict_inter_block_void; + td->cfl_store_inter_block_visit = cfl_store_inter_block_void; + + if (parse_decode_flag & 0x1) { + td->read_coeffs_tx_intra_block_visit = read_coeffs_tx_intra_block; + td->read_coeffs_tx_inter_block_visit = av1_read_coeffs_txb_facade; + } + if (parse_decode_flag & 0x2) { + td->predict_and_recon_intra_block_visit = + predict_and_reconstruct_intra_block; + td->inverse_tx_inter_block_visit = inverse_transform_inter_block; + td->predict_inter_block_visit = predict_inter_block; + td->cfl_store_inter_block_visit = cfl_store_inter_block; + } +} + +static AOM_INLINE void decode_tile(AV1Decoder *pbi, ThreadData *const td, + int tile_row, int tile_col) { + TileInfo tile_info; + + AV1_COMMON *const cm = &pbi->common; + const int num_planes = av1_num_planes(cm); + + av1_tile_set_row(&tile_info, cm, tile_row); + av1_tile_set_col(&tile_info, cm, tile_col); + av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start, + tile_info.mi_col_end, tile_row); + av1_reset_loop_filter_delta(&td->xd, num_planes); + av1_reset_loop_restoration(&td->xd, num_planes); + + for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; + mi_row += cm->seq_params.mib_size) { + av1_zero_left_context(&td->xd); + + for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; + mi_col += cm->seq_params.mib_size) { + set_cb_buffer(pbi, &td->xd, &td->cb_buffer_base, num_planes, 0, 0); + + // Bit-stream parsing and decoding of the superblock + decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, + cm->seq_params.sb_size, 0x3); + + if (aom_reader_has_overflowed(td->bit_reader)) { + aom_merge_corrupted_flag(&td->xd.corrupted, 1); + return; + } + } + } + + int corrupted = + (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0; + aom_merge_corrupted_flag(&td->xd.corrupted, corrupted); +} + +static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, int start_tile, + int end_tile) { + AV1_COMMON *const cm = &pbi->common; + ThreadData *const td = &pbi->td; + CommonTileParams *const tiles = &cm->tiles; + const int tile_cols = tiles->cols; + const int tile_rows = tiles->rows; + const int n_tiles = tile_cols * tile_rows; + TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers; + const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); + const int single_row = pbi->dec_tile_row >= 0; + const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); + const int single_col = pbi->dec_tile_col >= 0; + int tile_rows_start; + int tile_rows_end; + int tile_cols_start; + int tile_cols_end; + int inv_col_order; + int inv_row_order; + int tile_row, tile_col; + uint8_t allow_update_cdf; + const uint8_t *raw_data_end = NULL; + + if (tiles->large_scale) { + tile_rows_start = single_row ? dec_tile_row : 0; + tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows; + tile_cols_start = single_col ? dec_tile_col : 0; + tile_cols_end = single_col ? 
tile_cols_start + 1 : tile_cols; + inv_col_order = pbi->inv_tile_order && !single_col; + inv_row_order = pbi->inv_tile_order && !single_row; + allow_update_cdf = 0; + } else { + tile_rows_start = 0; + tile_rows_end = tile_rows; + tile_cols_start = 0; + tile_cols_end = tile_cols; + inv_col_order = pbi->inv_tile_order; + inv_row_order = pbi->inv_tile_order; + allow_update_cdf = 1; + } + + // No tiles to decode. + if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start || + // First tile is larger than end_tile. + tile_rows_start * tiles->cols + tile_cols_start > end_tile || + // Last tile is smaller than start_tile. + (tile_rows_end - 1) * tiles->cols + tile_cols_end - 1 < start_tile) + return data; + + allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update; + + assert(tile_rows <= MAX_TILE_ROWS); + assert(tile_cols <= MAX_TILE_COLS); + +#if EXT_TILE_DEBUG + if (tiles->large_scale && !pbi->ext_tile_debug) + raw_data_end = get_ls_single_tile_buffer(pbi, data, tile_buffers); + else if (tiles->large_scale && pbi->ext_tile_debug) + raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers); + else +#endif // EXT_TILE_DEBUG + get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); + + if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { + decoder_alloc_tile_data(pbi, n_tiles); + } +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + aom_accounting_reset(&pbi->accounting); + } +#endif + + set_decode_func_pointers(&pbi->td, 0x3); + + // Load all tile information into thread_data. + td->xd = pbi->mb; + td->xd.corrupted = 0; + td->xd.mc_buf[0] = td->mc_buf[0]; + td->xd.mc_buf[1] = td->mc_buf[1]; + td->xd.tmp_conv_dst = td->tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + td->xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j]; + } + + for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) { + const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row; + + for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) { + const int col = inv_col_order ? 
tile_cols - 1 - tile_col : tile_col; + TileDataDec *const tile_data = pbi->tile_data + row * tiles->cols + col; + const TileBufferDec *const tile_bs_buf = &tile_buffers[row][col]; + + if (row * tiles->cols + col < start_tile || + row * tiles->cols + col > end_tile) + continue; + + td->bit_reader = &tile_data->bit_reader; + av1_zero(td->cb_buffer_base.dqcoeff); + av1_tile_init(&td->xd.tile, cm, row, col); + td->xd.current_qindex = cm->quant_params.base_qindex; + setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size, + &cm->error, td->bit_reader, allow_update_cdf); +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + td->bit_reader->accounting = &pbi->accounting; + td->bit_reader->accounting->last_tell_frac = + aom_reader_tell_frac(td->bit_reader); + } else { + td->bit_reader->accounting = NULL; + } +#endif + av1_init_macroblockd(cm, &td->xd, NULL); + av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), row, + &td->xd); + + // Initialise the tile context from the frame context + tile_data->tctx = *cm->fc; + td->xd.tile_ctx = &tile_data->tctx; + + // decode tile + decode_tile(pbi, td, row, col); + aom_merge_corrupted_flag(&pbi->mb.corrupted, td->xd.corrupted); + if (pbi->mb.corrupted) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + } + } + + if (tiles->large_scale) { + if (n_tiles == 1) { + // Find the end of the single tile buffer + return aom_reader_find_end(&pbi->tile_data->bit_reader); + } + // Return the end of the last tile buffer + return raw_data_end; + } + TileDataDec *const tile_data = pbi->tile_data + end_tile; + + return aom_reader_find_end(&tile_data->bit_reader); +} + +static TileJobsDec *get_dec_job_info(AV1DecTileMT *tile_mt_info) { + TileJobsDec *cur_job_info = NULL; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(tile_mt_info->job_mutex); + + if (tile_mt_info->jobs_dequeued < tile_mt_info->jobs_enqueued) { + cur_job_info = tile_mt_info->job_queue + tile_mt_info->jobs_dequeued; + tile_mt_info->jobs_dequeued++; + } + + pthread_mutex_unlock(tile_mt_info->job_mutex); +#else + (void)tile_mt_info; +#endif + return cur_job_info; +} + +static AOM_INLINE void tile_worker_hook_init( + AV1Decoder *const pbi, DecWorkerData *const thread_data, + const TileBufferDec *const tile_buffer, TileDataDec *const tile_data, + uint8_t allow_update_cdf) { + AV1_COMMON *cm = &pbi->common; + ThreadData *const td = thread_data->td; + int tile_row = tile_data->tile_info.tile_row; + int tile_col = tile_data->tile_info.tile_col; + + td->bit_reader = &tile_data->bit_reader; + av1_zero(td->cb_buffer_base.dqcoeff); + av1_tile_init(&td->xd.tile, cm, tile_row, tile_col); + td->xd.current_qindex = cm->quant_params.base_qindex; + setup_bool_decoder(tile_buffer->data, thread_data->data_end, + tile_buffer->size, &thread_data->error_info, + td->bit_reader, allow_update_cdf); +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + td->bit_reader->accounting = &pbi->accounting; + td->bit_reader->accounting->last_tell_frac = + aom_reader_tell_frac(td->bit_reader); + } else { + td->bit_reader->accounting = NULL; + } +#endif + av1_init_macroblockd(cm, &td->xd, NULL); + td->xd.error_info = &thread_data->error_info; + av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, + &td->xd); + + // Initialise the tile context from the frame context + tile_data->tctx = *cm->fc; + td->xd.tile_ctx = &tile_data->tctx; +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + tile_data->bit_reader.accounting->last_tell_frac = + 
aom_reader_tell_frac(&tile_data->bit_reader); + } +#endif +} + +static int tile_worker_hook(void *arg1, void *arg2) { + DecWorkerData *const thread_data = (DecWorkerData *)arg1; + AV1Decoder *const pbi = (AV1Decoder *)arg2; + AV1_COMMON *cm = &pbi->common; + ThreadData *const td = thread_data->td; + uint8_t allow_update_cdf; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(thread_data->error_info.jmp)) { + thread_data->error_info.setjmp = 0; + thread_data->td->xd.corrupted = 1; + return 0; + } + thread_data->error_info.setjmp = 1; + + allow_update_cdf = cm->tiles.large_scale ? 0 : 1; + allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update; + + set_decode_func_pointers(td, 0x3); + + assert(cm->tiles.cols > 0); + while (!td->xd.corrupted) { + TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info); + + if (cur_job_info != NULL) { + const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer; + TileDataDec *const tile_data = cur_job_info->tile_data; + tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data, + allow_update_cdf); + // decode tile + int tile_row = tile_data->tile_info.tile_row; + int tile_col = tile_data->tile_info.tile_col; + decode_tile(pbi, td, tile_row, tile_col); + } else { + break; + } + } + thread_data->error_info.setjmp = 0; + return !td->xd.corrupted; +} + +static INLINE int get_max_row_mt_workers_per_tile(AV1_COMMON *cm, + TileInfo tile) { + // NOTE: Currently the value of max workers is calculated based + // on the parse and decode time. As per the theoretical estimate, + // when the percentage of parse time is equal to the percentage of decode + // time, the number of workers needed to parse + decode a tile cannot + // exceed 2. + // TODO(any): Modify this value if parsing is optimized in future. + int sb_rows = av1_get_sb_rows_in_tile(cm, tile); + int max_workers = + sb_rows == 1 ? AOM_MIN_THREADS_PER_TILE : AOM_MAX_THREADS_PER_TILE; + return max_workers; +} + +// The caller must hold pbi->row_mt_mutex_ when calling this function. +// Returns 1 if either the next job is stored in *next_job_info or 1 is stored +// in *end_of_frame. +// NOTE: The caller waits on pbi->row_mt_cond_ if this function returns 0. +// The return value of this function depends on the following variables: +// - frame_row_mt_info->mi_rows_parse_done +// - frame_row_mt_info->mi_rows_decode_started +// - frame_row_mt_info->row_mt_exit +// Therefore we may need to signal or broadcast pbi->row_mt_cond_ if any of +// these variables is modified.
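+// For illustration (hypothetical counters): if mi_rows_parse_done is 64
+// while mi_rows_decode_started is 32, one superblock row's worth of decode
+// work can be handed out and mi_rows_decode_started then advances by
+// sb_mi_size; once the two counters are equal, 0 is returned and the caller
+// blocks on pbi->row_mt_cond_ until signal_parse_sb_row_done() signals it.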
+static int get_next_job_info(AV1Decoder *const pbi, + AV1DecRowMTJobInfo *next_job_info, + int *end_of_frame) { + AV1_COMMON *cm = &pbi->common; + TileDataDec *tile_data; + AV1DecRowMTSync *dec_row_mt_sync; + AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; + TileInfo tile_info; + const int tile_rows_start = frame_row_mt_info->tile_rows_start; + const int tile_rows_end = frame_row_mt_info->tile_rows_end; + const int tile_cols_start = frame_row_mt_info->tile_cols_start; + const int tile_cols_end = frame_row_mt_info->tile_cols_end; + const int start_tile = frame_row_mt_info->start_tile; + const int end_tile = frame_row_mt_info->end_tile; + const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size]; + int num_mis_to_decode, num_threads_working; + int num_mis_waiting_for_decode; + int min_threads_working = INT_MAX; + int max_mis_to_decode = 0; + int tile_row_idx, tile_col_idx; + int tile_row = -1; + int tile_col = -1; + + memset(next_job_info, 0, sizeof(*next_job_info)); + + // Frame decode is completed or error is encountered. + *end_of_frame = (frame_row_mt_info->mi_rows_decode_started == + frame_row_mt_info->mi_rows_to_decode) || + (frame_row_mt_info->row_mt_exit == 1); + if (*end_of_frame) { + return 1; + } + + // Decoding cannot start as bit-stream parsing is not complete. + assert(frame_row_mt_info->mi_rows_parse_done >= + frame_row_mt_info->mi_rows_decode_started); + if (frame_row_mt_info->mi_rows_parse_done == + frame_row_mt_info->mi_rows_decode_started) + return 0; + + // Choose the tile to decode. + for (tile_row_idx = tile_rows_start; tile_row_idx < tile_rows_end; + ++tile_row_idx) { + for (tile_col_idx = tile_cols_start; tile_col_idx < tile_cols_end; + ++tile_col_idx) { + if (tile_row_idx * cm->tiles.cols + tile_col_idx < start_tile || + tile_row_idx * cm->tiles.cols + tile_col_idx > end_tile) + continue; + + tile_data = pbi->tile_data + tile_row_idx * cm->tiles.cols + tile_col_idx; + dec_row_mt_sync = &tile_data->dec_row_mt_sync; + + num_threads_working = dec_row_mt_sync->num_threads_working; + num_mis_waiting_for_decode = (dec_row_mt_sync->mi_rows_parse_done - + dec_row_mt_sync->mi_rows_decode_started) * + dec_row_mt_sync->mi_cols; + num_mis_to_decode = + (dec_row_mt_sync->mi_rows - dec_row_mt_sync->mi_rows_decode_started) * + dec_row_mt_sync->mi_cols; + + assert(num_mis_to_decode >= num_mis_waiting_for_decode); + + // Pick the tile which has minimum number of threads working on it. 
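+      // For illustration (hypothetical tiles): if tile 0 already has one
+      // thread working while tile 1 has none, tile 1 is preferred; among
+      // tiles with equally few threads, the one with more MIs left to
+      // decode wins, and a tile is skipped once it already holds
+      // get_max_row_mt_workers_per_tile() threads.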
+ if (num_mis_waiting_for_decode > 0) { + if (num_threads_working < min_threads_working) { + min_threads_working = num_threads_working; + max_mis_to_decode = 0; + } + if (num_threads_working == min_threads_working && + num_mis_to_decode > max_mis_to_decode && + num_threads_working < + get_max_row_mt_workers_per_tile(cm, tile_data->tile_info)) { + max_mis_to_decode = num_mis_to_decode; + tile_row = tile_row_idx; + tile_col = tile_col_idx; + } + } + } + } + // No job found to process + if (tile_row == -1 || tile_col == -1) return 0; + + tile_data = pbi->tile_data + tile_row * cm->tiles.cols + tile_col; + tile_info = tile_data->tile_info; + dec_row_mt_sync = &tile_data->dec_row_mt_sync; + + next_job_info->tile_row = tile_row; + next_job_info->tile_col = tile_col; + next_job_info->mi_row = + dec_row_mt_sync->mi_rows_decode_started + tile_info.mi_row_start; + + dec_row_mt_sync->num_threads_working++; + dec_row_mt_sync->mi_rows_decode_started += sb_mi_size; + frame_row_mt_info->mi_rows_decode_started += sb_mi_size; + assert(frame_row_mt_info->mi_rows_parse_done >= + frame_row_mt_info->mi_rows_decode_started); +#if CONFIG_MULTITHREAD + if (frame_row_mt_info->mi_rows_decode_started == + frame_row_mt_info->mi_rows_to_decode) { + pthread_cond_broadcast(pbi->row_mt_cond_); + } +#endif + + return 1; +} + +static INLINE void signal_parse_sb_row_done(AV1Decoder *const pbi, + TileDataDec *const tile_data, + const int sb_mi_size) { + AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + assert(frame_row_mt_info->mi_rows_parse_done >= + frame_row_mt_info->mi_rows_decode_started); + tile_data->dec_row_mt_sync.mi_rows_parse_done += sb_mi_size; + frame_row_mt_info->mi_rows_parse_done += sb_mi_size; +#if CONFIG_MULTITHREAD + // A new decode job is available. Wake up one worker thread to handle the + // new decode job. + // NOTE: This assumes we bump mi_rows_parse_done and mi_rows_decode_started + // by the same increment (sb_mi_size). + pthread_cond_signal(pbi->row_mt_cond_); + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif +} + +// This function is very similar to decode_tile(). It would be good to figure +// out how to share code. +static AOM_INLINE void parse_tile_row_mt(AV1Decoder *pbi, ThreadData *const td, + TileDataDec *const tile_data) { + AV1_COMMON *const cm = &pbi->common; + const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size]; + const int num_planes = av1_num_planes(cm); + TileInfo tile_info = tile_data->tile_info; + int tile_row = tile_info.tile_row; + + av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start, + tile_info.mi_col_end, tile_row); + av1_reset_loop_filter_delta(&td->xd, num_planes); + av1_reset_loop_restoration(&td->xd, num_planes); + + for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; + mi_row += cm->seq_params.mib_size) { + av1_zero_left_context(&td->xd); + + for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; + mi_col += cm->seq_params.mib_size) { + set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row, + mi_col); + + // Bit-stream parsing of the superblock + decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, + cm->seq_params.sb_size, 0x1); + + if (aom_reader_has_overflowed(td->bit_reader)) { + aom_merge_corrupted_flag(&td->xd.corrupted, 1); + return; + } + } + signal_parse_sb_row_done(pbi, tile_data, sb_mi_size); + } + + int corrupted = + (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 
1 : 0; + aom_merge_corrupted_flag(&td->xd.corrupted, corrupted); +} + +static int row_mt_worker_hook(void *arg1, void *arg2) { + DecWorkerData *const thread_data = (DecWorkerData *)arg1; + AV1Decoder *const pbi = (AV1Decoder *)arg2; + AV1_COMMON *cm = &pbi->common; + ThreadData *const td = thread_data->td; + uint8_t allow_update_cdf; + AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; + td->xd.corrupted = 0; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(thread_data->error_info.jmp)) { + thread_data->error_info.setjmp = 0; + thread_data->td->xd.corrupted = 1; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + frame_row_mt_info->row_mt_exit = 1; +#if CONFIG_MULTITHREAD + pthread_cond_broadcast(pbi->row_mt_cond_); + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + return 0; + } + thread_data->error_info.setjmp = 1; + + allow_update_cdf = cm->tiles.large_scale ? 0 : 1; + allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update; + + set_decode_func_pointers(td, 0x1); + + assert(cm->tiles.cols > 0); + while (!td->xd.corrupted) { + TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info); + + if (cur_job_info != NULL) { + const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer; + TileDataDec *const tile_data = cur_job_info->tile_data; + tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data, + allow_update_cdf); +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + tile_data->dec_row_mt_sync.num_threads_working++; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + // decode tile + parse_tile_row_mt(pbi, td, tile_data); +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + tile_data->dec_row_mt_sync.num_threads_working--; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + } else { + break; + } + } + + if (td->xd.corrupted) { + thread_data->error_info.setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + frame_row_mt_info->row_mt_exit = 1; +#if CONFIG_MULTITHREAD + pthread_cond_broadcast(pbi->row_mt_cond_); + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + return 0; + } + + set_decode_func_pointers(td, 0x2); + + while (1) { + AV1DecRowMTJobInfo next_job_info; + int end_of_frame = 0; + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + while (!get_next_job_info(pbi, &next_job_info, &end_of_frame)) { +#if CONFIG_MULTITHREAD + pthread_cond_wait(pbi->row_mt_cond_, pbi->row_mt_mutex_); +#endif + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + + if (end_of_frame) break; + + int tile_row = next_job_info.tile_row; + int tile_col = next_job_info.tile_col; + int mi_row = next_job_info.mi_row; + + TileDataDec *tile_data = + pbi->tile_data + tile_row * cm->tiles.cols + tile_col; + AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync; + TileInfo tile_info = tile_data->tile_info; + + av1_tile_init(&td->xd.tile, cm, tile_row, tile_col); + av1_init_macroblockd(cm, &td->xd, NULL); + td->xd.error_info = &thread_data->error_info; + + decode_tile_sb_row(pbi, td, tile_info, mi_row); + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + dec_row_mt_sync->num_threads_working--; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + } + 
thread_data->error_info.setjmp = 0; + return !td->xd.corrupted; +} + +// sorts in descending order +static int compare_tile_buffers(const void *a, const void *b) { + const TileJobsDec *const buf1 = (const TileJobsDec *)a; + const TileJobsDec *const buf2 = (const TileJobsDec *)b; + return (((int)buf2->tile_buffer->size) - ((int)buf1->tile_buffer->size)); +} + +static AOM_INLINE void enqueue_tile_jobs(AV1Decoder *pbi, AV1_COMMON *cm, + int tile_rows_start, int tile_rows_end, + int tile_cols_start, int tile_cols_end, + int start_tile, int end_tile) { + AV1DecTileMT *tile_mt_info = &pbi->tile_mt_info; + TileJobsDec *tile_job_queue = tile_mt_info->job_queue; + tile_mt_info->jobs_enqueued = 0; + tile_mt_info->jobs_dequeued = 0; + + for (int row = tile_rows_start; row < tile_rows_end; row++) { + for (int col = tile_cols_start; col < tile_cols_end; col++) { + if (row * cm->tiles.cols + col < start_tile || + row * cm->tiles.cols + col > end_tile) + continue; + tile_job_queue->tile_buffer = &pbi->tile_buffers[row][col]; + tile_job_queue->tile_data = pbi->tile_data + row * cm->tiles.cols + col; + tile_job_queue++; + tile_mt_info->jobs_enqueued++; + } + } +} + +static AOM_INLINE void alloc_dec_jobs(AV1DecTileMT *tile_mt_info, + AV1_COMMON *cm, int tile_rows, + int tile_cols) { + tile_mt_info->alloc_tile_rows = tile_rows; + tile_mt_info->alloc_tile_cols = tile_cols; + int num_tiles = tile_rows * tile_cols; +#if CONFIG_MULTITHREAD + { + CHECK_MEM_ERROR(cm, tile_mt_info->job_mutex, + aom_malloc(sizeof(*tile_mt_info->job_mutex) * num_tiles)); + + for (int i = 0; i < num_tiles; i++) { + pthread_mutex_init(&tile_mt_info->job_mutex[i], NULL); + } + } +#endif + CHECK_MEM_ERROR(cm, tile_mt_info->job_queue, + aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles)); +} + +void av1_free_mc_tmp_buf(ThreadData *thread_data) { + int ref; + for (ref = 0; ref < 2; ref++) { + if (thread_data->mc_buf_use_highbd) + aom_free(CONVERT_TO_SHORTPTR(thread_data->mc_buf[ref])); + else + aom_free(thread_data->mc_buf[ref]); + thread_data->mc_buf[ref] = NULL; + } + thread_data->mc_buf_size = 0; + thread_data->mc_buf_use_highbd = 0; + + aom_free(thread_data->tmp_conv_dst); + thread_data->tmp_conv_dst = NULL; + for (int i = 0; i < 2; ++i) { + aom_free(thread_data->tmp_obmc_bufs[i]); + thread_data->tmp_obmc_bufs[i] = NULL; + } +} + +static AOM_INLINE void allocate_mc_tmp_buf(AV1_COMMON *const cm, + ThreadData *thread_data, + int buf_size, int use_highbd) { + for (int ref = 0; ref < 2; ref++) { + if (use_highbd) { + uint16_t *hbd_mc_buf; + CHECK_MEM_ERROR(cm, hbd_mc_buf, (uint16_t *)aom_memalign(16, buf_size)); + thread_data->mc_buf[ref] = CONVERT_TO_BYTEPTR(hbd_mc_buf); + } else { + CHECK_MEM_ERROR(cm, thread_data->mc_buf[ref], + (uint8_t *)aom_memalign(16, buf_size)); + } + } + thread_data->mc_buf_size = buf_size; + thread_data->mc_buf_use_highbd = use_highbd; + + CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * + sizeof(*thread_data->tmp_conv_dst))); + for (int i = 0; i < 2; ++i) { + CHECK_MEM_ERROR( + cm, thread_data->tmp_obmc_bufs[i], + aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->tmp_obmc_bufs[i]))); + } +} + +static AOM_INLINE void reset_dec_workers(AV1Decoder *pbi, + AVxWorkerHook worker_hook, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + + // Reset tile decoding hook + for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx]; + 
DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + thread_data->td->xd = pbi->mb; + thread_data->td->xd.corrupted = 0; + thread_data->td->xd.mc_buf[0] = thread_data->td->mc_buf[0]; + thread_data->td->xd.mc_buf[1] = thread_data->td->mc_buf[1]; + thread_data->td->xd.tmp_conv_dst = thread_data->td->tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + thread_data->td->xd.tmp_obmc_bufs[j] = thread_data->td->tmp_obmc_bufs[j]; + } + winterface->sync(worker); + + worker->hook = worker_hook; + worker->data1 = thread_data; + worker->data2 = pbi; + } +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + aom_accounting_reset(&pbi->accounting); + } +#endif +} + +static AOM_INLINE void launch_dec_workers(AV1Decoder *pbi, + const uint8_t *data_end, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + + for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx]; + DecWorkerData *const thread_data = (DecWorkerData *)worker->data1; + + thread_data->data_end = data_end; + + worker->had_error = 0; + if (worker_idx == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } +} + +static AOM_INLINE void sync_dec_workers(AV1Decoder *pbi, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int corrupted = 0; + + for (int worker_idx = num_workers; worker_idx > 0; --worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx - 1]; + aom_merge_corrupted_flag(&corrupted, !winterface->sync(worker)); + } + + pbi->mb.corrupted = corrupted; +} + +static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int worker_idx; + + // Create workers and thread_data + if (pbi->num_workers == 0) { + const int num_threads = pbi->max_threads; + CHECK_MEM_ERROR(cm, pbi->tile_workers, + aom_malloc(num_threads * sizeof(*pbi->tile_workers))); + CHECK_MEM_ERROR(cm, pbi->thread_data, + aom_malloc(num_threads * sizeof(*pbi->thread_data))); + + for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx]; + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + ++pbi->num_workers; + + winterface->init(worker); + worker->thread_name = "aom tile worker"; + if (worker_idx < num_threads - 1 && !winterface->reset(worker)) { + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Tile decoder thread creation failed"); + } + + if (worker_idx < num_threads - 1) { + // Allocate thread data. 
+ CHECK_MEM_ERROR(cm, thread_data->td, + aom_memalign(32, sizeof(*thread_data->td))); + av1_zero(*thread_data->td); + } else { + // Main thread acts as a worker and uses the thread data in pbi + thread_data->td = &pbi->td; + } + thread_data->error_info.error_code = AOM_CODEC_OK; + thread_data->error_info.setjmp = 0; + } + } + const int use_highbd = cm->seq_params.use_highbitdepth; + const int buf_size = MC_TEMP_BUF_PELS << use_highbd; + for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) { + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + if (thread_data->td->mc_buf_size != buf_size) { + av1_free_mc_tmp_buf(thread_data->td); + allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd); + } + } +} + +static AOM_INLINE void tile_mt_queue(AV1Decoder *pbi, int tile_cols, + int tile_rows, int tile_rows_start, + int tile_rows_end, int tile_cols_start, + int tile_cols_end, int start_tile, + int end_tile) { + AV1_COMMON *const cm = &pbi->common; + if (pbi->tile_mt_info.alloc_tile_cols != tile_cols || + pbi->tile_mt_info.alloc_tile_rows != tile_rows) { + av1_dealloc_dec_jobs(&pbi->tile_mt_info); + alloc_dec_jobs(&pbi->tile_mt_info, cm, tile_rows, tile_cols); + } + enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start, + tile_cols_end, start_tile, end_tile); + qsort(pbi->tile_mt_info.job_queue, pbi->tile_mt_info.jobs_enqueued, + sizeof(pbi->tile_mt_info.job_queue[0]), compare_tile_buffers); +} + +static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, int start_tile, + int end_tile) { + AV1_COMMON *const cm = &pbi->common; + CommonTileParams *const tiles = &cm->tiles; + const int tile_cols = tiles->cols; + const int tile_rows = tiles->rows; + const int n_tiles = tile_cols * tile_rows; + TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers; + const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); + const int single_row = pbi->dec_tile_row >= 0; + const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); + const int single_col = pbi->dec_tile_col >= 0; + int tile_rows_start; + int tile_rows_end; + int tile_cols_start; + int tile_cols_end; + int tile_count_tg; + int num_workers; + const uint8_t *raw_data_end = NULL; + + if (tiles->large_scale) { + tile_rows_start = single_row ? dec_tile_row : 0; + tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows; + tile_cols_start = single_col ? dec_tile_col : 0; + tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols; + } else { + tile_rows_start = 0; + tile_rows_end = tile_rows; + tile_cols_start = 0; + tile_cols_end = tile_cols; + } + tile_count_tg = end_tile - start_tile + 1; + num_workers = AOMMIN(pbi->max_threads, tile_count_tg); + + // No tiles to decode. + if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start || + // First tile is larger than end_tile. + tile_rows_start * tile_cols + tile_cols_start > end_tile || + // Last tile is smaller than start_tile. 
+ (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile) + return data; + + assert(tile_rows <= MAX_TILE_ROWS); + assert(tile_cols <= MAX_TILE_COLS); + assert(tile_count_tg > 0); + assert(num_workers > 0); + assert(start_tile <= end_tile); + assert(start_tile >= 0 && end_tile < n_tiles); + + decode_mt_init(pbi); + + // get tile size in tile group +#if EXT_TILE_DEBUG + if (tiles->large_scale) assert(pbi->ext_tile_debug == 1); + if (tiles->large_scale) + raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers); + else +#endif // EXT_TILE_DEBUG + get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); + + if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { + decoder_alloc_tile_data(pbi, n_tiles); + } + + for (int row = 0; row < tile_rows; row++) { + for (int col = 0; col < tile_cols; col++) { + TileDataDec *tile_data = pbi->tile_data + row * tiles->cols + col; + av1_tile_init(&tile_data->tile_info, cm, row, col); + } + } + + tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end, + tile_cols_start, tile_cols_end, start_tile, end_tile); + + reset_dec_workers(pbi, tile_worker_hook, num_workers); + launch_dec_workers(pbi, data_end, num_workers); + sync_dec_workers(pbi, num_workers); + + if (pbi->mb.corrupted) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + if (tiles->large_scale) { + if (n_tiles == 1) { + // Find the end of the single tile buffer + return aom_reader_find_end(&pbi->tile_data->bit_reader); + } + // Return the end of the last tile buffer + return raw_data_end; + } + TileDataDec *const tile_data = pbi->tile_data + end_tile; + + return aom_reader_find_end(&tile_data->bit_reader); +} + +static AOM_INLINE void dec_alloc_cb_buf(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + int size = ((cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1) * + ((cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1); + + if (pbi->cb_buffer_alloc_size < size) { + av1_dec_free_cb_buf(pbi); + CHECK_MEM_ERROR(cm, pbi->cb_buffer_base, + aom_memalign(32, sizeof(*pbi->cb_buffer_base) * size)); + memset(pbi->cb_buffer_base, 0, sizeof(*pbi->cb_buffer_base) * size); + pbi->cb_buffer_alloc_size = size; + } +} + +static AOM_INLINE void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start, + int tile_rows_end, int tile_cols_start, + int tile_cols_end, int start_tile, + int end_tile, int max_sb_rows) { + AV1_COMMON *const cm = &pbi->common; + AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; + + frame_row_mt_info->tile_rows_start = tile_rows_start; + frame_row_mt_info->tile_rows_end = tile_rows_end; + frame_row_mt_info->tile_cols_start = tile_cols_start; + frame_row_mt_info->tile_cols_end = tile_cols_end; + frame_row_mt_info->start_tile = start_tile; + frame_row_mt_info->end_tile = end_tile; + frame_row_mt_info->mi_rows_to_decode = 0; + frame_row_mt_info->mi_rows_parse_done = 0; + frame_row_mt_info->mi_rows_decode_started = 0; + frame_row_mt_info->row_mt_exit = 0; + + for (int tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) { + for (int tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) { + if (tile_row * cm->tiles.cols + tile_col < start_tile || + tile_row * cm->tiles.cols + tile_col > end_tile) + continue; + + TileDataDec *const tile_data = + pbi->tile_data + tile_row * cm->tiles.cols + tile_col; + TileInfo tile_info = tile_data->tile_info; + + tile_data->dec_row_mt_sync.mi_rows_parse_done = 0; + 
tile_data->dec_row_mt_sync.mi_rows_decode_started = 0; + tile_data->dec_row_mt_sync.num_threads_working = 0; + tile_data->dec_row_mt_sync.mi_rows = + ALIGN_POWER_OF_TWO(tile_info.mi_row_end - tile_info.mi_row_start, + cm->seq_params.mib_size_log2); + tile_data->dec_row_mt_sync.mi_cols = + ALIGN_POWER_OF_TWO(tile_info.mi_col_end - tile_info.mi_col_start, + cm->seq_params.mib_size_log2); + + frame_row_mt_info->mi_rows_to_decode += + tile_data->dec_row_mt_sync.mi_rows; + + // Initialize cur_sb_col to -1 for all SB rows. + memset(tile_data->dec_row_mt_sync.cur_sb_col, -1, + sizeof(*tile_data->dec_row_mt_sync.cur_sb_col) * max_sb_rows); + } + } + +#if CONFIG_MULTITHREAD + if (pbi->row_mt_mutex_ == NULL) { + CHECK_MEM_ERROR(cm, pbi->row_mt_mutex_, + aom_malloc(sizeof(*(pbi->row_mt_mutex_)))); + if (pbi->row_mt_mutex_) { + pthread_mutex_init(pbi->row_mt_mutex_, NULL); + } + } + + if (pbi->row_mt_cond_ == NULL) { + CHECK_MEM_ERROR(cm, pbi->row_mt_cond_, + aom_malloc(sizeof(*(pbi->row_mt_cond_)))); + if (pbi->row_mt_cond_) { + pthread_cond_init(pbi->row_mt_cond_, NULL); + } + } +#endif +} + +static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + int start_tile, int end_tile) { + AV1_COMMON *const cm = &pbi->common; + CommonTileParams *const tiles = &cm->tiles; + const int tile_cols = tiles->cols; + const int tile_rows = tiles->rows; + const int n_tiles = tile_cols * tile_rows; + TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers; + const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); + const int single_row = pbi->dec_tile_row >= 0; + const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); + const int single_col = pbi->dec_tile_col >= 0; + int tile_rows_start; + int tile_rows_end; + int tile_cols_start; + int tile_cols_end; + int tile_count_tg; + int num_workers = 0; + int max_threads; + const uint8_t *raw_data_end = NULL; + int max_sb_rows = 0; + + if (tiles->large_scale) { + tile_rows_start = single_row ? dec_tile_row : 0; + tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows; + tile_cols_start = single_col ? dec_tile_col : 0; + tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols; + } else { + tile_rows_start = 0; + tile_rows_end = tile_rows; + tile_cols_start = 0; + tile_cols_end = tile_cols; + } + tile_count_tg = end_tile - start_tile + 1; + max_threads = pbi->max_threads; + + // No tiles to decode. + if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start || + // First tile is larger than end_tile. + tile_rows_start * tile_cols + tile_cols_start > end_tile || + // Last tile is smaller than start_tile. 
+ (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile) + return data; + + assert(tile_rows <= MAX_TILE_ROWS); + assert(tile_cols <= MAX_TILE_COLS); + assert(tile_count_tg > 0); + assert(max_threads > 0); + assert(start_tile <= end_tile); + assert(start_tile >= 0 && end_tile < n_tiles); + + (void)tile_count_tg; + + decode_mt_init(pbi); + + // get tile size in tile group +#if EXT_TILE_DEBUG + if (tiles->large_scale) assert(pbi->ext_tile_debug == 1); + if (tiles->large_scale) + raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers); + else +#endif // EXT_TILE_DEBUG + get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); + + if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { + if (pbi->tile_data != NULL) { + for (int i = 0; i < pbi->allocated_tiles; i++) { + TileDataDec *const tile_data = pbi->tile_data + i; + av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync); + } + } + decoder_alloc_tile_data(pbi, n_tiles); + } + + for (int row = 0; row < tile_rows; row++) { + for (int col = 0; col < tile_cols; col++) { + TileDataDec *tile_data = pbi->tile_data + row * tiles->cols + col; + av1_tile_init(&tile_data->tile_info, cm, row, col); + + max_sb_rows = AOMMAX(max_sb_rows, + av1_get_sb_rows_in_tile(cm, tile_data->tile_info)); + num_workers += get_max_row_mt_workers_per_tile(cm, tile_data->tile_info); + } + } + num_workers = AOMMIN(num_workers, max_threads); + + if (pbi->allocated_row_mt_sync_rows != max_sb_rows) { + for (int i = 0; i < n_tiles; ++i) { + TileDataDec *const tile_data = pbi->tile_data + i; + av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync); + dec_row_mt_alloc(&tile_data->dec_row_mt_sync, cm, max_sb_rows); + } + pbi->allocated_row_mt_sync_rows = max_sb_rows; + } + + tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end, + tile_cols_start, tile_cols_end, start_tile, end_tile); + + dec_alloc_cb_buf(pbi); + + row_mt_frame_init(pbi, tile_rows_start, tile_rows_end, tile_cols_start, + tile_cols_end, start_tile, end_tile, max_sb_rows); + + reset_dec_workers(pbi, row_mt_worker_hook, num_workers); + launch_dec_workers(pbi, data_end, num_workers); + sync_dec_workers(pbi, num_workers); + + if (pbi->mb.corrupted) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + if (tiles->large_scale) { + if (n_tiles == 1) { + // Find the end of the single tile buffer + return aom_reader_find_end(&pbi->tile_data->bit_reader); + } + // Return the end of the last tile buffer + return raw_data_end; + } + TileDataDec *const tile_data = pbi->tile_data + end_tile; + + return aom_reader_find_end(&tile_data->bit_reader); +} + +static AOM_INLINE void error_handler(void *data) { + AV1_COMMON *const cm = (AV1_COMMON *)data; + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet"); +} + +// Reads the high_bitdepth and twelve_bit fields in color_config() and sets +// seq_params->bit_depth based on the values of those fields and +// seq_params->profile. Reports errors by calling rb->error_handler() or +// aom_internal_error(). +static AOM_INLINE void read_bitdepth( + struct aom_read_bit_buffer *rb, SequenceHeader *seq_params, + struct aom_internal_error_info *error_info) { + const int high_bitdepth = aom_rb_read_bit(rb); + if (seq_params->profile == PROFILE_2 && high_bitdepth) { + const int twelve_bit = aom_rb_read_bit(rb); + seq_params->bit_depth = twelve_bit ? AOM_BITS_12 : AOM_BITS_10; + } else if (seq_params->profile <= PROFILE_2) { + seq_params->bit_depth = high_bitdepth ? 
AOM_BITS_10 : AOM_BITS_8; + } else { + aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Unsupported profile/bit-depth combination"); + } +#if !CONFIG_AV1_HIGHBITDEPTH + if (seq_params->bit_depth > AOM_BITS_8) { + aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Bit-depth %d not supported", seq_params->bit_depth); + } +#endif +} + +void av1_read_film_grain_params(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + aom_film_grain_t *pars = &cm->film_grain_params; + const SequenceHeader *const seq_params = &cm->seq_params; + + pars->apply_grain = aom_rb_read_bit(rb); + if (!pars->apply_grain) { + memset(pars, 0, sizeof(*pars)); + return; + } + + pars->random_seed = aom_rb_read_literal(rb, 16); + if (cm->current_frame.frame_type == INTER_FRAME) + pars->update_parameters = aom_rb_read_bit(rb); + else + pars->update_parameters = 1; + + pars->bit_depth = seq_params->bit_depth; + + if (!pars->update_parameters) { + // inherit parameters from a previous reference frame + int film_grain_params_ref_idx = aom_rb_read_literal(rb, 3); + // Section 6.8.20: It is a requirement of bitstream conformance that + // film_grain_params_ref_idx is equal to ref_frame_idx[ j ] for some value + // of j in the range 0 to REFS_PER_FRAME - 1. + int found = 0; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + if (film_grain_params_ref_idx == cm->remapped_ref_idx[i]) { + found = 1; + break; + } + } + if (!found) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Invalid film grain reference idx %d. ref_frame_idx = " + "{%d, %d, %d, %d, %d, %d, %d}", + film_grain_params_ref_idx, cm->remapped_ref_idx[0], + cm->remapped_ref_idx[1], cm->remapped_ref_idx[2], + cm->remapped_ref_idx[3], cm->remapped_ref_idx[4], + cm->remapped_ref_idx[5], cm->remapped_ref_idx[6]); + } + RefCntBuffer *const buf = cm->ref_frame_map[film_grain_params_ref_idx]; + if (buf == NULL) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Invalid Film grain reference idx"); + } + if (!buf->film_grain_params_present) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Film grain reference parameters not available"); + } + uint16_t random_seed = pars->random_seed; + *pars = buf->film_grain_params; // inherit parameters + pars->random_seed = random_seed; // with new random seed + return; + } + + // Scaling functions parameters + pars->num_y_points = aom_rb_read_literal(rb, 4); // max 14 + if (pars->num_y_points > 14) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Number of points for film grain luma scaling function " + "exceeds the maximum value."); + for (int i = 0; i < pars->num_y_points; i++) { + pars->scaling_points_y[i][0] = aom_rb_read_literal(rb, 8); + if (i && pars->scaling_points_y[i - 1][0] >= pars->scaling_points_y[i][0]) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "First coordinate of the scaling function points " + "shall be increasing."); + pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8); + } + + if (!seq_params->monochrome) + pars->chroma_scaling_from_luma = aom_rb_read_bit(rb); + else + pars->chroma_scaling_from_luma = 0; + + if (seq_params->monochrome || pars->chroma_scaling_from_luma || + ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) && + (pars->num_y_points == 0))) { + pars->num_cb_points = 0; + pars->num_cr_points = 0; + } else { + pars->num_cb_points = aom_rb_read_literal(rb, 4); // max 10 + if (pars->num_cb_points > 10) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Number of points for film grain cb scaling function " + "exceeds the maximum value."); + for (int i = 0; i < pars->num_cb_points; i++) { + pars->scaling_points_cb[i][0] = aom_rb_read_literal(rb, 8); + if (i && + pars->scaling_points_cb[i - 1][0] >= pars->scaling_points_cb[i][0]) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "First coordinate of the scaling function points " + "shall be increasing."); + pars->scaling_points_cb[i][1] = aom_rb_read_literal(rb, 8); + } + + pars->num_cr_points = aom_rb_read_literal(rb, 4); // max 10 + if (pars->num_cr_points > 10) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Number of points for film grain cr scaling function " + "exceeds the maximum value."); + for (int i = 0; i < pars->num_cr_points; i++) { + pars->scaling_points_cr[i][0] = aom_rb_read_literal(rb, 8); + if (i && + pars->scaling_points_cr[i - 1][0] >= pars->scaling_points_cr[i][0]) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "First coordinate of the scaling function points " + "shall be increasing."); + pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8); + } + + if ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) && + (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) || + ((pars->num_cb_points != 0) && (pars->num_cr_points == 0)))) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "In YCbCr 4:2:0, film grain shall be applied " + "to both chroma components or neither."); + } + + pars->scaling_shift = aom_rb_read_literal(rb, 2) + 8; // 8 + value + + // AR coefficients + // Only sent if the corresponding scaling function has + // more than 0 points + + pars->ar_coeff_lag = aom_rb_read_literal(rb, 2); + + int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (pars->num_y_points > 0) ++num_pos_chroma; + + if (pars->num_y_points) + for (int i = 0; i < num_pos_luma; i++) + pars->ar_coeffs_y[i] = aom_rb_read_literal(rb, 8) - 128; + + if (pars->num_cb_points || pars->chroma_scaling_from_luma) + for (int i = 0; i < num_pos_chroma; i++) + pars->ar_coeffs_cb[i] = aom_rb_read_literal(rb, 8) - 128; + + if (pars->num_cr_points || pars->chroma_scaling_from_luma) + for (int i = 0; i < num_pos_chroma; i++) + pars->ar_coeffs_cr[i] = aom_rb_read_literal(rb, 8) - 128; + + pars->ar_coeff_shift = aom_rb_read_literal(rb, 2) + 6; // 6 + value + + pars->grain_scale_shift = aom_rb_read_literal(rb, 2); + + if (pars->num_cb_points) { + pars->cb_mult = aom_rb_read_literal(rb, 8); + pars->cb_luma_mult = aom_rb_read_literal(rb, 8); + pars->cb_offset = aom_rb_read_literal(rb, 9); + } + + if (pars->num_cr_points) { + pars->cr_mult = aom_rb_read_literal(rb, 8); + pars->cr_luma_mult = aom_rb_read_literal(rb, 8); + pars->cr_offset = aom_rb_read_literal(rb, 9); + } + + pars->overlap_flag = aom_rb_read_bit(rb); + + pars->clip_to_restricted_range = aom_rb_read_bit(rb); +} + +static AOM_INLINE void read_film_grain(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + if (cm->seq_params.film_grain_params_present && + (cm->show_frame || cm->showable_frame)) { + av1_read_film_grain_params(cm, rb); + } else { + memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); + } + cm->film_grain_params.bit_depth = cm->seq_params.bit_depth; + memcpy(&cm->cur_frame->film_grain_params, &cm->film_grain_params, + sizeof(aom_film_grain_t)); +} + +void av1_read_color_config(struct aom_read_bit_buffer *rb, + int allow_lowbitdepth, SequenceHeader *seq_params, + struct aom_internal_error_info *error_info) { +
read_bitdepth(rb, seq_params, error_info); + + seq_params->use_highbitdepth = + seq_params->bit_depth > AOM_BITS_8 || !allow_lowbitdepth; + // monochrome bit (not needed for PROFILE_1) + const int is_monochrome = + seq_params->profile != PROFILE_1 ? aom_rb_read_bit(rb) : 0; + seq_params->monochrome = is_monochrome; + int color_description_present_flag = aom_rb_read_bit(rb); + if (color_description_present_flag) { + seq_params->color_primaries = aom_rb_read_literal(rb, 8); + seq_params->transfer_characteristics = aom_rb_read_literal(rb, 8); + seq_params->matrix_coefficients = aom_rb_read_literal(rb, 8); + } else { + seq_params->color_primaries = AOM_CICP_CP_UNSPECIFIED; + seq_params->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED; + seq_params->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED; + } + if (is_monochrome) { + // [16,235] (including xvycc) vs [0,255] range + seq_params->color_range = aom_rb_read_bit(rb); + seq_params->subsampling_y = seq_params->subsampling_x = 1; + seq_params->chroma_sample_position = AOM_CSP_UNKNOWN; + seq_params->separate_uv_delta_q = 0; + return; + } + if (seq_params->color_primaries == AOM_CICP_CP_BT_709 && + seq_params->transfer_characteristics == AOM_CICP_TC_SRGB && + seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + seq_params->subsampling_y = seq_params->subsampling_x = 0; + seq_params->color_range = 1; // assume full color-range + if (!(seq_params->profile == PROFILE_1 || + (seq_params->profile == PROFILE_2 && + seq_params->bit_depth == AOM_BITS_12))) { + aom_internal_error( + error_info, AOM_CODEC_UNSUP_BITSTREAM, + "sRGB colorspace not compatible with specified profile"); + } + } else { + // [16,235] (including xvycc) vs [0,255] range + seq_params->color_range = aom_rb_read_bit(rb); + if (seq_params->profile == PROFILE_0) { + // 420 only + seq_params->subsampling_x = seq_params->subsampling_y = 1; + } else if (seq_params->profile == PROFILE_1) { + // 444 only + seq_params->subsampling_x = seq_params->subsampling_y = 0; + } else { + assert(seq_params->profile == PROFILE_2); + if (seq_params->bit_depth == AOM_BITS_12) { + seq_params->subsampling_x = aom_rb_read_bit(rb); + if (seq_params->subsampling_x) + seq_params->subsampling_y = aom_rb_read_bit(rb); // 422 or 420 + else + seq_params->subsampling_y = 0; // 444 + } else { + // 422 + seq_params->subsampling_x = 1; + seq_params->subsampling_y = 0; + } + } + if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY && + (seq_params->subsampling_x || seq_params->subsampling_y)) { + aom_internal_error( + error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Identity CICP Matrix incompatible with non 4:4:4 color sampling"); + } + if (seq_params->subsampling_x && seq_params->subsampling_y) { + seq_params->chroma_sample_position = aom_rb_read_literal(rb, 2); + } + } + seq_params->separate_uv_delta_q = aom_rb_read_bit(rb); +} + +void av1_read_timing_info_header(aom_timing_info_t *timing_info, + struct aom_internal_error_info *error, + struct aom_read_bit_buffer *rb) { + timing_info->num_units_in_display_tick = + aom_rb_read_unsigned_literal(rb, + 32); // Number of units in a display tick + timing_info->time_scale = aom_rb_read_unsigned_literal(rb, 32); // Time scale + if (timing_info->num_units_in_display_tick == 0 || + timing_info->time_scale == 0) { + aom_internal_error( + error, AOM_CODEC_UNSUP_BITSTREAM, + "num_units_in_display_tick and time_scale must be greater than 0."); + } + timing_info->equal_picture_interval = + aom_rb_read_bit(rb); // Equal picture interval bit + if 
(timing_info->equal_picture_interval) { + const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb); + if (num_ticks_per_picture_minus_1 == UINT32_MAX) { + aom_internal_error( + error, AOM_CODEC_UNSUP_BITSTREAM, + "num_ticks_per_picture_minus_1 cannot be (1 << 32) - 1."); + } + timing_info->num_ticks_per_picture = num_ticks_per_picture_minus_1 + 1; + } +} + +void av1_read_decoder_model_info(aom_dec_model_info_t *decoder_model_info, + struct aom_read_bit_buffer *rb) { + decoder_model_info->encoder_decoder_buffer_delay_length = + aom_rb_read_literal(rb, 5) + 1; + decoder_model_info->num_units_in_decoding_tick = + aom_rb_read_unsigned_literal(rb, + 32); // Number of units in a decoding tick + decoder_model_info->buffer_removal_time_length = + aom_rb_read_literal(rb, 5) + 1; + decoder_model_info->frame_presentation_time_length = + aom_rb_read_literal(rb, 5) + 1; +} + +void av1_read_op_parameters_info(aom_dec_model_op_parameters_t *op_params, + int buffer_delay_length, + struct aom_read_bit_buffer *rb) { + op_params->decoder_buffer_delay = + aom_rb_read_unsigned_literal(rb, buffer_delay_length); + op_params->encoder_buffer_delay = + aom_rb_read_unsigned_literal(rb, buffer_delay_length); + op_params->low_delay_mode_flag = aom_rb_read_bit(rb); +} + +static AOM_INLINE void read_temporal_point_info( + AV1_COMMON *const cm, struct aom_read_bit_buffer *rb) { + cm->frame_presentation_time = aom_rb_read_unsigned_literal( + rb, cm->seq_params.decoder_model_info.frame_presentation_time_length); +} + +void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, + SequenceHeader *seq_params) { + const int num_bits_width = aom_rb_read_literal(rb, 4) + 1; + const int num_bits_height = aom_rb_read_literal(rb, 4) + 1; + const int max_frame_width = aom_rb_read_literal(rb, num_bits_width) + 1; + const int max_frame_height = aom_rb_read_literal(rb, num_bits_height) + 1; + + seq_params->num_bits_width = num_bits_width; + seq_params->num_bits_height = num_bits_height; + seq_params->max_frame_width = max_frame_width; + seq_params->max_frame_height = max_frame_height; + + if (seq_params->reduced_still_picture_hdr) { + seq_params->frame_id_numbers_present_flag = 0; + } else { + seq_params->frame_id_numbers_present_flag = aom_rb_read_bit(rb); + } + if (seq_params->frame_id_numbers_present_flag) { + // We must always have delta_frame_id_length < frame_id_length, + // in order for a frame to be referenced with a unique delta. + // Avoid wasting bits by using a coding that enforces this restriction.
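+    // Worked example of the coding read below: delta_frame_id_length is a
+    // 4-bit value plus 2 (so in [2, 17]) and frame_id_length is a 3-bit
+    // value plus delta_frame_id_length plus 1, so frame_id_length always
+    // exceeds delta_frame_id_length. E.g. raw values 12 and 1 give
+    // delta_frame_id_length = 14 and frame_id_length = 16, the largest
+    // length accepted by the range check that follows.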
+ seq_params->delta_frame_id_length = aom_rb_read_literal(rb, 4) + 2; + seq_params->frame_id_length = + aom_rb_read_literal(rb, 3) + seq_params->delta_frame_id_length + 1; + if (seq_params->frame_id_length > 16) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid frame_id_length"); + } + + setup_sb_size(seq_params, rb); + + seq_params->enable_filter_intra = aom_rb_read_bit(rb); + seq_params->enable_intra_edge_filter = aom_rb_read_bit(rb); + + if (seq_params->reduced_still_picture_hdr) { + seq_params->enable_interintra_compound = 0; + seq_params->enable_masked_compound = 0; + seq_params->enable_warped_motion = 0; + seq_params->enable_dual_filter = 0; + seq_params->order_hint_info.enable_order_hint = 0; + seq_params->order_hint_info.enable_dist_wtd_comp = 0; + seq_params->order_hint_info.enable_ref_frame_mvs = 0; + seq_params->force_screen_content_tools = 2; // SELECT_SCREEN_CONTENT_TOOLS + seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV + seq_params->order_hint_info.order_hint_bits_minus_1 = -1; + } else { + seq_params->enable_interintra_compound = aom_rb_read_bit(rb); + seq_params->enable_masked_compound = aom_rb_read_bit(rb); + seq_params->enable_warped_motion = aom_rb_read_bit(rb); + seq_params->enable_dual_filter = aom_rb_read_bit(rb); + + seq_params->order_hint_info.enable_order_hint = aom_rb_read_bit(rb); + seq_params->order_hint_info.enable_dist_wtd_comp = + seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0; + seq_params->order_hint_info.enable_ref_frame_mvs = + seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0; + + if (aom_rb_read_bit(rb)) { + seq_params->force_screen_content_tools = + 2; // SELECT_SCREEN_CONTENT_TOOLS + } else { + seq_params->force_screen_content_tools = aom_rb_read_bit(rb); + } + + if (seq_params->force_screen_content_tools > 0) { + if (aom_rb_read_bit(rb)) { + seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV + } else { + seq_params->force_integer_mv = aom_rb_read_bit(rb); + } + } else { + seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV + } + seq_params->order_hint_info.order_hint_bits_minus_1 = + seq_params->order_hint_info.enable_order_hint + ? aom_rb_read_literal(rb, 3) + : -1; + } + + seq_params->enable_superres = aom_rb_read_bit(rb); + seq_params->enable_cdef = aom_rb_read_bit(rb); + seq_params->enable_restoration = aom_rb_read_bit(rb); +} + +static int read_global_motion_params(WarpedMotionParams *params, + const WarpedMotionParams *ref_params, + struct aom_read_bit_buffer *rb, + int allow_hp) { + TransformationType type = aom_rb_read_bit(rb); + if (type != IDENTITY) { + if (aom_rb_read_bit(rb)) + type = ROTZOOM; + else + type = aom_rb_read_bit(rb) ? 
TRANSLATION : AFFINE; + } + + *params = default_warp_params; + params->wmtype = type; + + if (type >= ROTZOOM) { + params->wmmat[2] = aom_rb_read_signed_primitive_refsubexpfin( + rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS)) * + GM_ALPHA_DECODE_FACTOR + + (1 << WARPEDMODEL_PREC_BITS); + params->wmmat[3] = aom_rb_read_signed_primitive_refsubexpfin( + rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF)) * + GM_ALPHA_DECODE_FACTOR; + } + + if (type >= AFFINE) { + params->wmmat[4] = aom_rb_read_signed_primitive_refsubexpfin( + rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF)) * + GM_ALPHA_DECODE_FACTOR; + params->wmmat[5] = aom_rb_read_signed_primitive_refsubexpfin( + rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS)) * + GM_ALPHA_DECODE_FACTOR + + (1 << WARPEDMODEL_PREC_BITS); + } else { + params->wmmat[4] = -params->wmmat[3]; + params->wmmat[5] = params->wmmat[2]; + } + + if (type >= TRANSLATION) { + const int trans_bits = (type == TRANSLATION) + ? GM_ABS_TRANS_ONLY_BITS - !allow_hp + : GM_ABS_TRANS_BITS; + const int trans_dec_factor = + (type == TRANSLATION) ? GM_TRANS_ONLY_DECODE_FACTOR * (1 << !allow_hp) + : GM_TRANS_DECODE_FACTOR; + const int trans_prec_diff = (type == TRANSLATION) + ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp + : GM_TRANS_PREC_DIFF; + params->wmmat[0] = aom_rb_read_signed_primitive_refsubexpfin( + rb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[0] >> trans_prec_diff)) * + trans_dec_factor; + params->wmmat[1] = aom_rb_read_signed_primitive_refsubexpfin( + rb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[1] >> trans_prec_diff)) * + trans_dec_factor; + } + + if (params->wmtype <= AFFINE) { + int good_shear_params = av1_get_shear_params(params); + if (!good_shear_params) return 0; + } + + return 1; +} + +static AOM_INLINE void read_global_motion(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + for (int frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { + const WarpedMotionParams *ref_params = + cm->prev_frame ? &cm->prev_frame->global_motion[frame] + : &default_warp_params; + int good_params = + read_global_motion_params(&cm->global_motion[frame], ref_params, rb, + cm->features.allow_high_precision_mv); + if (!good_params) { +#if WARPED_MOTION_DEBUG + printf("Warning: unexpected global motion shear params from aomenc\n"); +#endif + cm->global_motion[frame].invalid = 1; + } + + // TODO(sarahparker, debargha): The logic in the commented out code below + // does not work currently and causes mismatches when resize is on. Fix it + // before turning the optimization back on. 
+ /* + YV12_BUFFER_CONFIG *ref_buf = get_ref_frame(cm, frame); + if (cm->width == ref_buf->y_crop_width && + cm->height == ref_buf->y_crop_height) { + read_global_motion_params(&cm->global_motion[frame], + &cm->prev_frame->global_motion[frame], rb, + cm->features.allow_high_precision_mv); + } else { + cm->global_motion[frame] = default_warp_params; + } + */ + /* + printf("Dec Ref %d [%d/%d]: %d %d %d %d\n", + frame, cm->current_frame.frame_number, cm->show_frame, + cm->global_motion[frame].wmmat[0], + cm->global_motion[frame].wmmat[1], + cm->global_motion[frame].wmmat[2], + cm->global_motion[frame].wmmat[3]); + */ + } + memcpy(cm->cur_frame->global_motion, cm->global_motion, + REF_FRAMES * sizeof(WarpedMotionParams)); +} + +// Release the references to the frame buffers in cm->ref_frame_map and reset +// all elements of cm->ref_frame_map to NULL. +static AOM_INLINE void reset_ref_frame_map(AV1_COMMON *const cm) { + BufferPool *const pool = cm->buffer_pool; + + for (int i = 0; i < REF_FRAMES; i++) { + decrease_ref_count(cm->ref_frame_map[i], pool); + cm->ref_frame_map[i] = NULL; + } +} + +// If the refresh_frame_flags bitmask is set, update reference frame id values +// and mark frames as valid for reference. +static AOM_INLINE void update_ref_frame_id(AV1Decoder *const pbi) { + AV1_COMMON *const cm = &pbi->common; + int refresh_frame_flags = cm->current_frame.refresh_frame_flags; + for (int i = 0; i < REF_FRAMES; i++) { + if ((refresh_frame_flags >> i) & 1) { + cm->ref_frame_id[i] = cm->current_frame_id; + pbi->valid_for_referencing[i] = 1; + } + } +} + +static AOM_INLINE void show_existing_frame_reset(AV1Decoder *const pbi, + int existing_frame_idx) { + AV1_COMMON *const cm = &pbi->common; + + assert(cm->show_existing_frame); + + cm->current_frame.frame_type = KEY_FRAME; + + cm->current_frame.refresh_frame_flags = (1 << REF_FRAMES) - 1; + + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + cm->remapped_ref_idx[i] = INVALID_IDX; + } + + if (pbi->need_resync) { + reset_ref_frame_map(cm); + pbi->need_resync = 0; + } + + // Note that the displayed frame must be valid for referencing in order to + // have been selected. + cm->current_frame_id = cm->ref_frame_id[existing_frame_idx]; + update_ref_frame_id(pbi); + + cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; +} + +static INLINE void reset_frame_buffers(AV1_COMMON *cm) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + int i; + + lock_buffer_pool(cm->buffer_pool); + reset_ref_frame_map(cm); + assert(cm->cur_frame->ref_count == 1); + for (i = 0; i < FRAME_BUFFERS; ++i) { + // Reset all unreferenced frame buffers. We can also reset cm->cur_frame + // because we are the sole owner of cm->cur_frame. + if (frame_bufs[i].ref_count > 0 && &frame_bufs[i] != cm->cur_frame) { + continue; + } + frame_bufs[i].order_hint = 0; + av1_zero(frame_bufs[i].ref_order_hints); + } + av1_zero_unused_internal_frame_buffers(&cm->buffer_pool->int_frame_buffers); + unlock_buffer_pool(cm->buffer_pool); +} + +// On success, returns 0. On failure, calls aom_internal_error and does not +// return. 
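+// ("Does not return" because aom_internal_error() reports the error through
+// the decoder's error handler, a setjmp/longjmp unwind, rather than
+// returning control to this function.)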
+static int read_uncompressed_header(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb) { + AV1_COMMON *const cm = &pbi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + CurrentFrame *const current_frame = &cm->current_frame; + FeatureFlags *const features = &cm->features; + MACROBLOCKD *const xd = &pbi->mb; + BufferPool *const pool = cm->buffer_pool; + RefCntBuffer *const frame_bufs = pool->frame_bufs; + + if (!pbi->sequence_header_ready) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "No sequence header"); + } + + if (seq_params->reduced_still_picture_hdr) { + cm->show_existing_frame = 0; + cm->show_frame = 1; + current_frame->frame_type = KEY_FRAME; + if (pbi->sequence_header_changed) { + // This is the start of a new coded video sequence. + pbi->sequence_header_changed = 0; + pbi->decoding_first_frame = 1; + reset_frame_buffers(cm); + } + features->error_resilient_mode = 1; + } else { + cm->show_existing_frame = aom_rb_read_bit(rb); + pbi->reset_decoder_state = 0; + + if (cm->show_existing_frame) { + if (pbi->sequence_header_changed) { + aom_internal_error( + &cm->error, AOM_CODEC_CORRUPT_FRAME, + "New sequence header starts with a show_existing_frame."); + } + // Show an existing frame directly. + const int existing_frame_idx = aom_rb_read_literal(rb, 3); + RefCntBuffer *const frame_to_show = cm->ref_frame_map[existing_frame_idx]; + if (frame_to_show == NULL) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Buffer does not contain a decoded frame"); + } + if (seq_params->decoder_model_info_present_flag && + seq_params->timing_info.equal_picture_interval == 0) { + read_temporal_point_info(cm, rb); + } + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_length = seq_params->frame_id_length; + int display_frame_id = aom_rb_read_literal(rb, frame_id_length); + /* Compare display_frame_id with ref_frame_id and check valid for + * referencing */ + if (display_frame_id != cm->ref_frame_id[existing_frame_idx] || + pbi->valid_for_referencing[existing_frame_idx] == 0) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Reference buffer frame ID mismatch"); + } + lock_buffer_pool(pool); + assert(frame_to_show->ref_count > 0); + // cm->cur_frame should be the buffer referenced by the return value + // of the get_free_fb() call in assign_cur_frame_new_fb() (called by + // av1_receive_compressed_data()), so the ref_count should be 1. + assert(cm->cur_frame->ref_count == 1); + // assign_frame_buffer_p() decrements ref_count directly rather than + // calling decrease_ref_count(). If cm->cur_frame->raw_frame_buffer has + // already been allocated, it will not be released by + // assign_frame_buffer_p()! + assert(!cm->cur_frame->raw_frame_buffer.data); + assign_frame_buffer_p(&cm->cur_frame, frame_to_show); + pbi->reset_decoder_state = frame_to_show->frame_type == KEY_FRAME; + unlock_buffer_pool(pool); + + cm->lf.filter_level[0] = 0; + cm->lf.filter_level[1] = 0; + cm->show_frame = 1; + + // Section 6.8.2: It is a requirement of bitstream conformance that when + // show_existing_frame is used to show a previous frame, the value + // of showable_frame for the previous frame was equal to 1.
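+      // For example, a frame originally coded as a shown keyframe has
+      // showable_frame equal to 0 (see the assignment to cm->showable_frame
+      // further below), so an attempt to re-show it via show_existing_frame
+      // is rejected here.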
+ if (!frame_to_show->showable_frame) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Buffer does not contain a showable frame"); + } + // Section 6.8.2: It is a requirement of bitstream conformance that when + // show_existing_frame is used to show a previous frame with + // RefFrameType[ frame_to_show_map_idx ] equal to KEY_FRAME, the + // frame is output via the show_existing_frame mechanism at most once. + if (pbi->reset_decoder_state) frame_to_show->showable_frame = 0; + + cm->film_grain_params = frame_to_show->film_grain_params; + + if (pbi->reset_decoder_state) { + show_existing_frame_reset(pbi, existing_frame_idx); + } else { + current_frame->refresh_frame_flags = 0; + } + + return 0; + } + + current_frame->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2); + if (pbi->sequence_header_changed) { + if (current_frame->frame_type == KEY_FRAME) { + // This is the start of a new coded video sequence. + pbi->sequence_header_changed = 0; + pbi->decoding_first_frame = 1; + reset_frame_buffers(cm); + } else { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Sequence header has changed without a keyframe."); + } + } + + cm->show_frame = aom_rb_read_bit(rb); + if (seq_params->still_picture && + (current_frame->frame_type != KEY_FRAME || !cm->show_frame)) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Still pictures must be coded as shown keyframes"); + } + cm->showable_frame = current_frame->frame_type != KEY_FRAME; + if (cm->show_frame) { + if (seq_params->decoder_model_info_present_flag && + seq_params->timing_info.equal_picture_interval == 0) + read_temporal_point_info(cm, rb); + } else { + // See if this frame can be used as show_existing_frame in the future + cm->showable_frame = aom_rb_read_bit(rb); + } + cm->cur_frame->showable_frame = cm->showable_frame; + features->error_resilient_mode = + frame_is_sframe(cm) || + (current_frame->frame_type == KEY_FRAME && cm->show_frame) + ?
1 + : aom_rb_read_bit(rb); + } + + if (current_frame->frame_type == KEY_FRAME && cm->show_frame) { + /* All frames need to be marked as not valid for referencing */ + for (int i = 0; i < REF_FRAMES; i++) { + pbi->valid_for_referencing[i] = 0; + } + } + features->disable_cdf_update = aom_rb_read_bit(rb); + if (seq_params->force_screen_content_tools == 2) { + features->allow_screen_content_tools = aom_rb_read_bit(rb); + } else { + features->allow_screen_content_tools = + seq_params->force_screen_content_tools; + } + + if (features->allow_screen_content_tools) { + if (seq_params->force_integer_mv == 2) { + features->cur_frame_force_integer_mv = aom_rb_read_bit(rb); + } else { + features->cur_frame_force_integer_mv = seq_params->force_integer_mv; + } + } else { + features->cur_frame_force_integer_mv = 0; + } + + int frame_size_override_flag = 0; + features->allow_intrabc = 0; + features->primary_ref_frame = PRIMARY_REF_NONE; + + if (!seq_params->reduced_still_picture_hdr) { + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_length = seq_params->frame_id_length; + int diff_len = seq_params->delta_frame_id_length; + int prev_frame_id = 0; + int have_prev_frame_id = + !pbi->decoding_first_frame && + !(current_frame->frame_type == KEY_FRAME && cm->show_frame); + if (have_prev_frame_id) { + prev_frame_id = cm->current_frame_id; + } + cm->current_frame_id = aom_rb_read_literal(rb, frame_id_length); + + if (have_prev_frame_id) { + int diff_frame_id; + if (cm->current_frame_id > prev_frame_id) { + diff_frame_id = cm->current_frame_id - prev_frame_id; + } else { + diff_frame_id = + (1 << frame_id_length) + cm->current_frame_id - prev_frame_id; + } + /* Check current_frame_id for conformance */ + if (prev_frame_id == cm->current_frame_id || + diff_frame_id >= (1 << (frame_id_length - 1))) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid value of current_frame_id"); + } + } + /* Check if some frames need to be marked as not valid for referencing */ + for (int i = 0; i < REF_FRAMES; i++) { + if (cm->current_frame_id - (1 << diff_len) > 0) { + if (cm->ref_frame_id[i] > cm->current_frame_id || + cm->ref_frame_id[i] < cm->current_frame_id - (1 << diff_len)) + pbi->valid_for_referencing[i] = 0; + } else { + if (cm->ref_frame_id[i] > cm->current_frame_id && + cm->ref_frame_id[i] < (1 << frame_id_length) + + cm->current_frame_id - (1 << diff_len)) + pbi->valid_for_referencing[i] = 0; + } + } + } + + frame_size_override_flag = frame_is_sframe(cm) ? 
1 : aom_rb_read_bit(rb); + + current_frame->order_hint = aom_rb_read_literal( + rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1); + current_frame->frame_number = current_frame->order_hint; + + if (!features->error_resilient_mode && !frame_is_intra_only(cm)) { + features->primary_ref_frame = aom_rb_read_literal(rb, PRIMARY_REF_BITS); + } + } + + if (seq_params->decoder_model_info_present_flag) { + cm->buffer_removal_time_present = aom_rb_read_bit(rb); + if (cm->buffer_removal_time_present) { + for (int op_num = 0; + op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) { + if (seq_params->op_params[op_num].decoder_model_param_present_flag) { + if ((((seq_params->operating_point_idc[op_num] >> + cm->temporal_layer_id) & + 0x1) && + ((seq_params->operating_point_idc[op_num] >> + (cm->spatial_layer_id + 8)) & + 0x1)) || + seq_params->operating_point_idc[op_num] == 0) { + cm->buffer_removal_times[op_num] = aom_rb_read_unsigned_literal( + rb, seq_params->decoder_model_info.buffer_removal_time_length); + } else { + cm->buffer_removal_times[op_num] = 0; + } + } else { + cm->buffer_removal_times[op_num] = 0; + } + } + } + } + if (current_frame->frame_type == KEY_FRAME) { + if (!cm->show_frame) { // unshown keyframe (forward keyframe) + current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); + } else { // shown keyframe + current_frame->refresh_frame_flags = (1 << REF_FRAMES) - 1; + } + + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + cm->remapped_ref_idx[i] = INVALID_IDX; + } + if (pbi->need_resync) { + reset_ref_frame_map(cm); + pbi->need_resync = 0; + } + } else { + if (current_frame->frame_type == INTRA_ONLY_FRAME) { + current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); + if (current_frame->refresh_frame_flags == 0xFF) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Intra only frames cannot have refresh flags 0xFF"); + } + if (pbi->need_resync) { + reset_ref_frame_map(cm); + pbi->need_resync = 0; + } + } else if (pbi->need_resync != 1) { /* Skip if need resync */ + current_frame->refresh_frame_flags = + frame_is_sframe(cm) ? 0xFF : aom_rb_read_literal(rb, REF_FRAMES); + } + } + + if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xFF) { + // Read all ref frame order hints if error_resilient_mode == 1 + if (features->error_resilient_mode && + seq_params->order_hint_info.enable_order_hint) { + for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { + // Read order hint from bit stream + unsigned int order_hint = aom_rb_read_literal( + rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1); + // Get buffer + RefCntBuffer *buf = cm->ref_frame_map[ref_idx]; + if (buf == NULL || order_hint != buf->order_hint) { + if (buf != NULL) { + lock_buffer_pool(pool); + decrease_ref_count(buf, pool); + unlock_buffer_pool(pool); + cm->ref_frame_map[ref_idx] = NULL; + } + // If no corresponding buffer exists, allocate a new buffer with all + // pixels set to neutral grey. 
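+        // ("Neutral grey" is the mid-range sample value, 1 << (bit_depth -
+        // 1) in each plane; see the note on set_planes_to_neutral_grey()
+        // below.)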
+ int buf_idx = get_free_fb(cm); + if (buf_idx == INVALID_IDX) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Unable to find free frame buffer"); + } + buf = &frame_bufs[buf_idx]; + lock_buffer_pool(pool); + if (aom_realloc_frame_buffer( + &buf->buf, seq_params->max_frame_width, + seq_params->max_frame_height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_BORDER_IN_PIXELS, features->byte_alignment, + &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { + decrease_ref_count(buf, pool); + unlock_buffer_pool(pool); + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + unlock_buffer_pool(pool); + // According to the specification, valid bitstreams are required to + // never use missing reference frames so the filling process for + // missing frames is not normatively defined and RefValid for missing + // frames is set to 0. + + // To make libaom more robust when the bitstream has been corrupted + // by the loss of some frames of data, this code adds a neutral grey + // buffer in place of missing frames, i.e. + // + set_planes_to_neutral_grey(seq_params, &buf->buf, 0); + // + // and allows the frames to be used for referencing, i.e. + // + pbi->valid_for_referencing[ref_idx] = 1; + // + // Please note such behavior is not normative and other decoders may + // use a different approach. + cm->ref_frame_map[ref_idx] = buf; + buf->order_hint = order_hint; + } + } + } + } + + if (current_frame->frame_type == KEY_FRAME) { + setup_frame_size(cm, frame_size_override_flag, rb); + + if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) + features->allow_intrabc = aom_rb_read_bit(rb); + features->allow_ref_frame_mvs = 0; + cm->prev_frame = NULL; + } else { + features->allow_ref_frame_mvs = 0; + + if (current_frame->frame_type == INTRA_ONLY_FRAME) { + cm->cur_frame->film_grain_params_present = + seq_params->film_grain_params_present; + setup_frame_size(cm, frame_size_override_flag, rb); + if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) + features->allow_intrabc = aom_rb_read_bit(rb); + + } else if (pbi->need_resync != 1) { /* Skip if need resync */ + int frame_refs_short_signaling = 0; + // Frame refs short signaling is off when error resilient mode is on. + if (seq_params->order_hint_info.enable_order_hint) + frame_refs_short_signaling = aom_rb_read_bit(rb); + + if (frame_refs_short_signaling) { + // == LAST_FRAME == + const int lst_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); + const RefCntBuffer *const lst_buf = cm->ref_frame_map[lst_ref]; + + // == GOLDEN_FRAME == + const int gld_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); + const RefCntBuffer *const gld_buf = cm->ref_frame_map[gld_ref]; + + // Most of the time, streams start with a keyframe. In that case, + // ref_frame_map will have been filled in at that point and will not + // contain any NULLs. However, streams are explicitly allowed to start + // with an intra-only frame, so long as they don't then signal a + // reference to a slot that hasn't been set yet. That's what we are + // checking here. 
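+        // For example, a stream whose first coded frame is intra-only and
+        // whose second frame signals a reference through a slot that no
+        // frame has refreshed yet would be caught by the NULL checks below.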
+ if (lst_buf == NULL) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests nonexistent reference"); + if (gld_buf == NULL) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests nonexistent reference"); + + av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_ref, gld_ref); + } + + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + int ref = 0; + if (!frame_refs_short_signaling) { + ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); + + // Most of the time, streams start with a keyframe. In that case, + // ref_frame_map will have been filled in at that point and will not + // contain any NULLs. However, streams are explicitly allowed to start + // with an intra-only frame, so long as they don't then signal a + // reference to a slot that hasn't been set yet. That's what we are + // checking here. + if (cm->ref_frame_map[ref] == NULL) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests nonexistent reference"); + cm->remapped_ref_idx[i] = ref; + } else { + ref = cm->remapped_ref_idx[i]; + } + // Check valid for referencing + if (pbi->valid_for_referencing[ref] == 0) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Reference frame not valid for referencing"); + + cm->ref_frame_sign_bias[LAST_FRAME + i] = 0; + + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_length = seq_params->frame_id_length; + int diff_len = seq_params->delta_frame_id_length; + int delta_frame_id_minus_1 = aom_rb_read_literal(rb, diff_len); + int ref_frame_id = + ((cm->current_frame_id - (delta_frame_id_minus_1 + 1) + + (1 << frame_id_length)) % + (1 << frame_id_length)); + // Compare values derived from delta_frame_id_minus_1 and + // refresh_frame_flags. + if (ref_frame_id != cm->ref_frame_id[ref]) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Reference buffer frame ID mismatch"); + } + } + + if (!features->error_resilient_mode && frame_size_override_flag) { + setup_frame_size_with_refs(cm, rb); + } else { + setup_frame_size(cm, frame_size_override_flag, rb); + } + + if (features->cur_frame_force_integer_mv) { + features->allow_high_precision_mv = 0; + } else { + features->allow_high_precision_mv = aom_rb_read_bit(rb); + } + features->interp_filter = read_frame_interp_filter(rb); + features->switchable_motion_mode = aom_rb_read_bit(rb); + } + + cm->prev_frame = get_primary_ref_frame_buf(cm); + if (features->primary_ref_frame != PRIMARY_REF_NONE && + get_primary_ref_frame_buf(cm) == NULL) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Reference frame containing this frame's initial " + "frame context is unavailable."); + } + + if (!(current_frame->frame_type == INTRA_ONLY_FRAME) && + pbi->need_resync != 1) { + if (frame_might_allow_ref_frame_mvs(cm)) + features->allow_ref_frame_mvs = aom_rb_read_bit(rb); + else + features->allow_ref_frame_mvs = 0; + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i); + struct scale_factors *const ref_scale_factors = + get_ref_scale_factors(cm, i); + av1_setup_scale_factors_for_frame( + ref_scale_factors, ref_buf->buf.y_crop_width, + ref_buf->buf.y_crop_height, cm->width, cm->height); + if ((!av1_is_valid_scale(ref_scale_factors))) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + } + } + } + + av1_setup_frame_buf_refs(cm); + + av1_setup_frame_sign_bias(cm); + + cm->cur_frame->frame_type = current_frame->frame_type; + + 
update_ref_frame_id(pbi); + + const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) && + !(features->disable_cdf_update); + if (might_bwd_adapt) { + features->refresh_frame_context = aom_rb_read_bit(rb) + ? REFRESH_FRAME_CONTEXT_DISABLED + : REFRESH_FRAME_CONTEXT_BACKWARD; + } else { + features->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; + } + + cm->cur_frame->buf.bit_depth = seq_params->bit_depth; + cm->cur_frame->buf.color_primaries = seq_params->color_primaries; + cm->cur_frame->buf.transfer_characteristics = + seq_params->transfer_characteristics; + cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients; + cm->cur_frame->buf.monochrome = seq_params->monochrome; + cm->cur_frame->buf.chroma_sample_position = + seq_params->chroma_sample_position; + cm->cur_frame->buf.color_range = seq_params->color_range; + cm->cur_frame->buf.render_width = cm->render_width; + cm->cur_frame->buf.render_height = cm->render_height; + + if (pbi->need_resync) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Keyframe / intra-only frame required to reset decoder" + " state"); + } + + if (features->allow_intrabc) { + // Set parameters corresponding to no filtering. + struct loopfilter *lf = &cm->lf; + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; + cm->cdef_info.cdef_bits = 0; + cm->cdef_info.cdef_strengths[0] = 0; + cm->cdef_info.nb_cdef_strengths = 1; + cm->cdef_info.cdef_uv_strengths[0] = 0; + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; + } + + read_tile_info(pbi, rb); + if (!av1_is_min_tile_width_satisfied(cm)) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Minimum tile width requirement not satisfied"); + } + + CommonQuantParams *const quant_params = &cm->quant_params; + setup_quantization(quant_params, av1_num_planes(cm), + cm->seq_params.separate_uv_delta_q, rb); + xd->bd = (int)seq_params->bit_depth; + + CommonContexts *const above_contexts = &cm->above_contexts; + if (above_contexts->num_planes < av1_num_planes(cm) || + above_contexts->num_mi_cols < cm->mi_params.mi_cols || + above_contexts->num_tile_rows < cm->tiles.rows) { + av1_free_above_context_buffers(above_contexts); + if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows, + cm->mi_params.mi_cols, + av1_num_planes(cm))) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + } + + if (features->primary_ref_frame == PRIMARY_REF_NONE) { + av1_setup_past_independence(cm); + } + + setup_segmentation(cm, rb); + + cm->delta_q_info.delta_q_res = 1; + cm->delta_q_info.delta_lf_res = 1; + cm->delta_q_info.delta_lf_present_flag = 0; + cm->delta_q_info.delta_lf_multi = 0; + cm->delta_q_info.delta_q_present_flag = + quant_params->base_qindex > 0 ? 
aom_rb_read_bit(rb) : 0; + if (cm->delta_q_info.delta_q_present_flag) { + xd->current_qindex = quant_params->base_qindex; + cm->delta_q_info.delta_q_res = 1 << aom_rb_read_literal(rb, 2); + if (!features->allow_intrabc) + cm->delta_q_info.delta_lf_present_flag = aom_rb_read_bit(rb); + if (cm->delta_q_info.delta_lf_present_flag) { + cm->delta_q_info.delta_lf_res = 1 << aom_rb_read_literal(rb, 2); + cm->delta_q_info.delta_lf_multi = aom_rb_read_bit(rb); + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); + } + } + + xd->cur_frame_force_integer_mv = features->cur_frame_force_integer_mv; + + for (int i = 0; i < MAX_SEGMENTS; ++i) { + const int qindex = av1_get_qindex(&cm->seg, i, quant_params->base_qindex); + xd->lossless[i] = + qindex == 0 && quant_params->y_dc_delta_q == 0 && + quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 && + quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0; + xd->qindex[i] = qindex; + } + features->coded_lossless = is_coded_lossless(cm, xd); + features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm); + setup_segmentation_dequant(cm, xd); + if (features->coded_lossless) { + cm->lf.filter_level[0] = 0; + cm->lf.filter_level[1] = 0; + } + if (features->coded_lossless || !seq_params->enable_cdef) { + cm->cdef_info.cdef_bits = 0; + cm->cdef_info.cdef_strengths[0] = 0; + cm->cdef_info.cdef_uv_strengths[0] = 0; + } + if (features->all_lossless || !seq_params->enable_restoration) { + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; + } + setup_loopfilter(cm, rb); + + if (!features->coded_lossless && seq_params->enable_cdef) { + setup_cdef(cm, rb); + } + if (!features->all_lossless && seq_params->enable_restoration) { + decode_restoration_mode(cm, rb); + } + + features->tx_mode = read_tx_mode(rb, features->coded_lossless); + current_frame->reference_mode = read_frame_reference_mode(cm, rb); + + av1_setup_skip_mode_allowed(cm); + current_frame->skip_mode_info.skip_mode_flag = + current_frame->skip_mode_info.skip_mode_allowed ? 
aom_rb_read_bit(rb) : 0; + + if (frame_might_allow_warped_motion(cm)) + features->allow_warped_motion = aom_rb_read_bit(rb); + else + features->allow_warped_motion = 0; + + features->reduced_tx_set_used = aom_rb_read_bit(rb); + + if (features->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Frame wrongly requests reference frame MVs"); + } + + if (!frame_is_intra_only(cm)) read_global_motion(cm, rb); + + cm->cur_frame->film_grain_params_present = + seq_params->film_grain_params_present; + read_film_grain(cm, rb); + +#if EXT_TILE_DEBUG + if (pbi->ext_tile_debug && cm->tiles.large_scale) { + read_ext_tile_info(pbi, rb); + av1_set_single_tile_decoding_mode(cm); + } +#endif // EXT_TILE_DEBUG + return 0; +} + +struct aom_read_bit_buffer *av1_init_read_bit_buffer( + AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data, + const uint8_t *data_end) { + rb->bit_offset = 0; + rb->error_handler = error_handler; + rb->error_handler_data = &pbi->common; + rb->bit_buffer = data; + rb->bit_buffer_end = data_end; + return rb; +} + +void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width, + int num_bits_height, int *width, int *height) { + *width = aom_rb_read_literal(rb, num_bits_width) + 1; + *height = aom_rb_read_literal(rb, num_bits_height) + 1; +} + +BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb) { + int profile = aom_rb_read_literal(rb, PROFILE_BITS); + return (BITSTREAM_PROFILE)profile; +} + +static AOM_INLINE void superres_post_decode(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + BufferPool *const pool = cm->buffer_pool; + + if (!av1_superres_scaled(cm)) return; + assert(!cm->features.all_lossless); + + av1_superres_upscale(cm, pool); +} + +uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t **p_data_end, + int trailing_bits_present) { + AV1_COMMON *const cm = &pbi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &pbi->mb; + +#if CONFIG_BITSTREAM_DEBUG + aom_bitstream_queue_set_frame_read(cm->current_frame.frame_number * 2 + + cm->show_frame); +#endif +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_r(); +#endif + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + cm->global_motion[i] = default_warp_params; + cm->cur_frame->global_motion[i] = default_warp_params; + } + xd->global_motion = cm->global_motion; + + read_uncompressed_header(pbi, rb); + + if (trailing_bits_present) av1_check_trailing_bits(pbi, rb); + + if (!cm->tiles.single_tile_decoding && + (pbi->dec_tile_row >= 0 || pbi->dec_tile_col >= 0)) { + pbi->dec_tile_row = -1; + pbi->dec_tile_col = -1; + } + + const uint32_t uncomp_hdr_size = + (uint32_t)aom_rb_bytes_read(rb); // Size of the uncompressed header + YV12_BUFFER_CONFIG *new_fb = &cm->cur_frame->buf; + xd->cur_buf = new_fb; + if (av1_allow_intrabc(cm)) { + av1_setup_scale_factors_for_frame( + &cm->sf_identity, xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height, + xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height); + } + + if (cm->show_existing_frame) { + // showing a frame directly + *p_data_end = data + uncomp_hdr_size; + if (pbi->reset_decoder_state) { + // Use the default frame context values. 
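+      // (pbi->reset_decoder_state is set when the re-shown frame is a
+      // keyframe; this acts like a decoder reset, so the CDFs are reloaded
+      // from the default tables rather than from a stored frame context.)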
+ *cm->fc = *cm->default_frame_context; + if (!cm->fc->initialized) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Uninitialized entropy context."); + } + return uncomp_hdr_size; + } + + cm->mi_params.setup_mi(&cm->mi_params); + + av1_setup_motion_field(cm); + + av1_setup_block_planes(xd, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y, num_planes); + if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { + // use the default frame context values + *cm->fc = *cm->default_frame_context; + } else { + *cm->fc = get_primary_ref_frame_buf(cm)->frame_context; + } + if (!cm->fc->initialized) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Uninitialized entropy context."); + + xd->corrupted = 0; + return uncomp_hdr_size; +} + +// Once-per-frame initialization +static AOM_INLINE void setup_frame_info(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + + if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { + av1_alloc_restoration_buffers(cm); + } + const int use_highbd = cm->seq_params.use_highbitdepth; + const int buf_size = MC_TEMP_BUF_PELS << use_highbd; + if (pbi->td.mc_buf_size != buf_size) { + av1_free_mc_tmp_buf(&pbi->td); + allocate_mc_tmp_buf(cm, &pbi->td, buf_size, use_highbd); + } +} + +void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end, int start_tile, + int end_tile, int initialize_flag) { + AV1_COMMON *const cm = &pbi->common; + CommonTileParams *const tiles = &cm->tiles; + MACROBLOCKD *const xd = &pbi->mb; + const int tile_count_tg = end_tile - start_tile + 1; + + if (initialize_flag) setup_frame_info(pbi); + const int num_planes = av1_num_planes(cm); +#if CONFIG_LPF_MASK + av1_loop_filter_frame_init(cm, 0, num_planes); +#endif + + if (pbi->max_threads > 1 && !(tiles->large_scale && !pbi->ext_tile_debug) && + pbi->row_mt) + *p_data_end = + decode_tiles_row_mt(pbi, data, data_end, start_tile, end_tile); + else if (pbi->max_threads > 1 && tile_count_tg > 1 && + !(tiles->large_scale && !pbi->ext_tile_debug)) + *p_data_end = decode_tiles_mt(pbi, data, data_end, start_tile, end_tile); + else + *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile); + + // If the bit stream is monochrome, set the U and V buffers to a constant. 
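+  // (The constant is neutral grey; the final argument of
+  // set_planes_to_neutral_grey() distinguishes this chroma-only fill from
+  // the all-planes fill used above for missing reference buffers.)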
+ if (num_planes < 3) { + set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1); + } + + if (end_tile != tiles->rows * tiles->cols - 1) { + return; + } + + if (!cm->features.allow_intrabc && !tiles->single_tile_decoding) { + if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) { + if (pbi->num_workers > 1) { + av1_loop_filter_frame_mt( + &cm->cur_frame->buf, cm, &pbi->mb, 0, num_planes, 0, +#if CONFIG_LPF_MASK + 1, +#endif + pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync); + } else { + av1_loop_filter_frame(&cm->cur_frame->buf, cm, &pbi->mb, +#if CONFIG_LPF_MASK + 1, +#endif + 0, num_planes, 0); + } + } + + const int do_loop_restoration = + cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE; + const int do_cdef = + !pbi->skip_loop_filter && !cm->features.coded_lossless && + (cm->cdef_info.cdef_bits || cm->cdef_info.cdef_strengths[0] || + cm->cdef_info.cdef_uv_strengths[0]); + const int do_superres = av1_superres_scaled(cm); + const int optimized_loop_restoration = !do_cdef && !do_superres; + + if (!optimized_loop_restoration) { + if (do_loop_restoration) + av1_loop_restoration_save_boundary_lines(&pbi->common.cur_frame->buf, + cm, 0); + + if (do_cdef) av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->mb); + + superres_post_decode(pbi); + + if (do_loop_restoration) { + av1_loop_restoration_save_boundary_lines(&pbi->common.cur_frame->buf, + cm, 1); + if (pbi->num_workers > 1) { + av1_loop_restoration_filter_frame_mt( + (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration, + pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync, + &pbi->lr_ctxt); + } else { + av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf, + cm, optimized_loop_restoration, + &pbi->lr_ctxt); + } + } + } else { + // In the no-CDEF, no-superres case, provide an optimized version of + // the loop restoration filter. + if (do_loop_restoration) { + if (pbi->num_workers > 1) { + av1_loop_restoration_filter_frame_mt( + (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration, + pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync, + &pbi->lr_ctxt); + } else { + av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf, + cm, optimized_loop_restoration, + &pbi->lr_ctxt); + } + } + } + } +#if CONFIG_LPF_MASK + av1_zero_array(cm->lf.lfm, cm->lf.lfm_num); +#endif + + if (!xd->corrupted) { + if (cm->features.refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { + assert(pbi->context_update_tile_id < pbi->allocated_tiles); + *cm->fc = pbi->tile_data[pbi->context_update_tile_id].tctx; + av1_reset_cdf_symbol_counters(cm->fc); + } + } else { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data is corrupted."); + } + +#if CONFIG_INSPECTION + if (pbi->inspect_cb != NULL) { + (*pbi->inspect_cb)(pbi, pbi->inspect_ctx); + } +#endif + + // Non-frame-parallel case: update the frame context here. + if (!tiles->large_scale) { + cm->cur_frame->frame_context = *cm->fc; + } +} diff --git a/libs/libaom/src/av1/decoder/decodeframe.h b/libs/libaom/src/av1/decoder/decodeframe.h new file mode 100644 index 000000000..95b3c9f22 --- /dev/null +++ b/libs/libaom/src/av1/decoder/decodeframe.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_DECODEFRAME_H_ +#define AOM_AV1_DECODER_DECODEFRAME_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Decoder; +struct aom_read_bit_buffer; +struct ThreadData; + +// Reads the middle part of the sequence header OBU (from +// frame_width_bits_minus_1 to enable_restoration) into seq_params. +// Reports errors by calling rb->error_handler() or aom_internal_error(). +void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, + SequenceHeader *seq_params); + +void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width, + int num_bits_height, int *width, int *height); +BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb); + +// Returns 0 on success. Sets pbi->common.error.error_code and returns -1 on +// failure. +int av1_check_trailing_bits(struct AV1Decoder *pbi, + struct aom_read_bit_buffer *rb); + +// On success, returns the frame header size. On failure, calls +// aom_internal_error and does not return. +// TODO(wtc): Figure out and document the p_data_end parameter. +uint32_t av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t **p_data_end, + int trailing_bits_present); + +void av1_decode_tg_tiles_and_wrapup(struct AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end, int start_tile, + int end_tile, int initialize_flag); + +// Implements the color_config() function in the spec. Reports errors by +// calling rb->error_handler() or aom_internal_error(). +void av1_read_color_config(struct aom_read_bit_buffer *rb, + int allow_lowbitdepth, SequenceHeader *seq_params, + struct aom_internal_error_info *error_info); + +// Implements the timing_info() function in the spec. Reports errors by calling +// rb->error_handler() or aom_internal_error(). +void av1_read_timing_info_header(aom_timing_info_t *timing_info, + struct aom_internal_error_info *error, + struct aom_read_bit_buffer *rb); + +// Implements the decoder_model_info() function in the spec. Reports errors by +// calling rb->error_handler(). +void av1_read_decoder_model_info(aom_dec_model_info_t *decoder_model_info, + struct aom_read_bit_buffer *rb); + +// Implements the operating_parameters_info() function in the spec. Reports +// errors by calling rb->error_handler(). +void av1_read_op_parameters_info(aom_dec_model_op_parameters_t *op_params, + int buffer_delay_length, + struct aom_read_bit_buffer *rb); + +struct aom_read_bit_buffer *av1_init_read_bit_buffer( + struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data, + const uint8_t *data_end); + +void av1_free_mc_tmp_buf(struct ThreadData *thread_data); + +void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_DECODER_DECODEFRAME_H_ diff --git a/libs/libaom/src/av1/decoder/decodemv.c b/libs/libaom/src/av1/decoder/decodemv.c new file mode 100644 index 000000000..e97cec42c --- /dev/null +++ b/libs/libaom/src/av1/decoder/decodemv.c @@ -0,0 +1,1575 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> + +#include "av1/common/cfl.h" +#include "av1/common/common.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/entropymv.h" +#include "av1/common/mvref_common.h" +#include "av1/common/pred_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/seg_common.h" +#include "av1/common/warped_motion.h" + +#include "av1/decoder/decodeframe.h" +#include "av1/decoder/decodemv.h" + +#include "aom_dsp/aom_dsp_common.h" + +#define ACCT_STR __func__ + +#define DEC_MISMATCH_DEBUG 0 + +static PREDICTION_MODE read_intra_mode(aom_reader *r, aom_cdf_prob *cdf) { + return (PREDICTION_MODE)aom_read_symbol(r, cdf, INTRA_MODES, ACCT_STR); +} + +static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) { + const int skip = xd->mi[0]->skip; + if (cm->features.coded_lossless) return; + if (cm->features.allow_intrabc) { + assert(cm->cdef_info.cdef_bits == 0); + return; + } + + // At the start of a superblock, mark that we haven't yet read CDEF strengths + // for any of the CDEF units contained in this superblock. + const int sb_mask = (cm->seq_params.mib_size - 1); + const int mi_row_in_sb = (xd->mi_row & sb_mask); + const int mi_col_in_sb = (xd->mi_col & sb_mask); + if (mi_row_in_sb == 0 && mi_col_in_sb == 0) { + xd->cdef_transmitted[0] = xd->cdef_transmitted[1] = + xd->cdef_transmitted[2] = xd->cdef_transmitted[3] = false; + } + + // CDEF unit size is 64x64 irrespective of the superblock size. + const int cdef_size = 1 << (6 - MI_SIZE_LOG2); + + // Find index of this CDEF unit in this superblock. + const int index_mask = cdef_size; + const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0); + const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0); + const int index = (cm->seq_params.sb_size == BLOCK_128X128) + ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb + : 0; + + // Read CDEF strength from the first non-skip coding block in this CDEF unit. + if (!xd->cdef_transmitted[index] && !skip) { + // CDEF strength for this CDEF unit needs to be read into the MB_MODE_INFO + // of the 1st block in this CDEF unit. 
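+    // Worked example: with MI_SIZE_LOG2 == 2, cdef_size is 16 mi units
+    // (64 px), so a block at (mi_row, mi_col) = (21, 35) reads and stores
+    // the strength at the unit origin (21 & ~15, 35 & ~15) = (16, 32).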
+ const int first_block_mask = ~(cdef_size - 1); + CommonModeInfoParams *const mi_params = &cm->mi_params; + const int grid_idx = + get_mi_grid_idx(mi_params, xd->mi_row & first_block_mask, + xd->mi_col & first_block_mask); + MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx]; + mbmi->cdef_strength = + aom_read_literal(r, cm->cdef_info.cdef_bits, ACCT_STR); + xd->cdef_transmitted[index] = true; + } +} + +static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd, + aom_reader *r, MB_MODE_INFO *const mbmi) { + int sign, abs, reduced_delta_qindex = 0; + BLOCK_SIZE bsize = mbmi->sb_type; + const int b_col = xd->mi_col & (cm->seq_params.mib_size - 1); + const int b_row = xd->mi_row & (cm->seq_params.mib_size - 1); + const int read_delta_q_flag = (b_col == 0 && b_row == 0); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) && + read_delta_q_flag) { + abs = aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR); + const int smallval = (abs < DELTA_Q_SMALL); + + if (!smallval) { + const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1; + const int thr = (1 << rem_bits) + 1; + abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr; + } + + if (abs) { + sign = aom_read_bit(r, ACCT_STR); + } else { + sign = 1; + } + + reduced_delta_qindex = sign ? -abs : abs; + } + return reduced_delta_qindex; +} +static int read_delta_lflevel(const AV1_COMMON *const cm, aom_reader *r, + aom_cdf_prob *const cdf, + const MB_MODE_INFO *const mbmi, int mi_col, + int mi_row) { + int reduced_delta_lflevel = 0; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int b_col = mi_col & (cm->seq_params.mib_size - 1); + const int b_row = mi_row & (cm->seq_params.mib_size - 1); + const int read_delta_lf_flag = (b_col == 0 && b_row == 0); + + if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) && + read_delta_lf_flag) { + int abs = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR); + const int smallval = (abs < DELTA_LF_SMALL); + if (!smallval) { + const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1; + const int thr = (1 << rem_bits) + 1; + abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr; + } + const int sign = abs ? aom_read_bit(r, ACCT_STR) : 1; + reduced_delta_lflevel = sign ? 
-abs : abs; + } + return reduced_delta_lflevel; +} + +static UV_PREDICTION_MODE read_intra_mode_uv(FRAME_CONTEXT *ec_ctx, + aom_reader *r, + CFL_ALLOWED_TYPE cfl_allowed, + PREDICTION_MODE y_mode) { + const UV_PREDICTION_MODE uv_mode = + aom_read_symbol(r, ec_ctx->uv_mode_cdf[cfl_allowed][y_mode], + UV_INTRA_MODES - !cfl_allowed, ACCT_STR); + return uv_mode; +} + +static uint8_t read_cfl_alphas(FRAME_CONTEXT *const ec_ctx, aom_reader *r, + int8_t *signs_out) { + const int8_t joint_sign = + aom_read_symbol(r, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS, "cfl:signs"); + uint8_t idx = 0; + // Magnitudes are only coded for nonzero values + if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; + idx = (uint8_t)aom_read_symbol(r, cdf_u, CFL_ALPHABET_SIZE, "cfl:alpha_u") + << CFL_ALPHABET_SIZE_LOG2; + } + if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; + idx += (uint8_t)aom_read_symbol(r, cdf_v, CFL_ALPHABET_SIZE, "cfl:alpha_v"); + } + *signs_out = joint_sign; + return idx; +} + +static INTERINTRA_MODE read_interintra_mode(MACROBLOCKD *xd, aom_reader *r, + int size_group) { + const INTERINTRA_MODE ii_mode = (INTERINTRA_MODE)aom_read_symbol( + r, xd->tile_ctx->interintra_mode_cdf[size_group], INTERINTRA_MODES, + ACCT_STR); + return ii_mode; +} + +static PREDICTION_MODE read_inter_mode(FRAME_CONTEXT *ec_ctx, aom_reader *r, + int16_t ctx) { + int16_t mode_ctx = ctx & NEWMV_CTX_MASK; + int is_newmv, is_zeromv, is_refmv; + is_newmv = aom_read_symbol(r, ec_ctx->newmv_cdf[mode_ctx], 2, ACCT_STR) == 0; + if (is_newmv) return NEWMV; + + mode_ctx = (ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + is_zeromv = + aom_read_symbol(r, ec_ctx->zeromv_cdf[mode_ctx], 2, ACCT_STR) == 0; + if (is_zeromv) return GLOBALMV; + + mode_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; + is_refmv = aom_read_symbol(r, ec_ctx->refmv_cdf[mode_ctx], 2, ACCT_STR) == 0; + if (is_refmv) + return NEARESTMV; + else + return NEARMV; +} + +static void read_drl_idx(FRAME_CONTEXT *ec_ctx, MACROBLOCKD *xd, + MB_MODE_INFO *mbmi, aom_reader *r) { + uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + mbmi->ref_mv_idx = 0; + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { + for (int idx = 0; idx < 2; ++idx) { + if (xd->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx); + int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR); + mbmi->ref_mv_idx = idx + drl_idx; + if (!drl_idx) return; + } + } + } + if (have_nearmv_in_inter_mode(mbmi->mode)) { + // Offset the NEARESTMV mode. + // TODO(jingning): Unify the two syntax decoding loops after the NEARESTMV + // mode is factored in. 
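+    // The "- 1" below keeps ref_mv_idx zero-based for the NEARMV-style
+    // modes: stack entry 0 belongs to NEARESTMV, so stack entry idx maps to
+    // ref_mv_idx = idx - 1 plus any extra steps signalled by the drl bits.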
+ for (int idx = 1; idx < 3; ++idx) { + if (xd->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx); + int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR); + mbmi->ref_mv_idx = idx + drl_idx - 1; + if (!drl_idx) return; + } + } + } +} + +static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd, + MB_MODE_INFO *mbmi, aom_reader *r) { + if (cm->features.switchable_motion_mode == 0) return SIMPLE_TRANSLATION; + if (mbmi->skip_mode) return SIMPLE_TRANSLATION; + + const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed( + xd->global_motion, xd, mbmi, cm->features.allow_warped_motion); + int motion_mode; + + if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return SIMPLE_TRANSLATION; + + if (last_motion_mode_allowed == OBMC_CAUSAL) { + motion_mode = + aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2, ACCT_STR); + return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode); + } else { + motion_mode = + aom_read_symbol(r, xd->tile_ctx->motion_mode_cdf[mbmi->sb_type], + MOTION_MODES, ACCT_STR); + return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode); + } +} + +static PREDICTION_MODE read_inter_compound_mode(MACROBLOCKD *xd, aom_reader *r, + int16_t ctx) { + const int mode = + aom_read_symbol(r, xd->tile_ctx->inter_compound_mode_cdf[ctx], + INTER_COMPOUND_MODES, ACCT_STR); + assert(is_inter_compound_mode(NEAREST_NEARESTMV + mode)); + return NEAREST_NEARESTMV + mode; +} + +int av1_neg_deinterleave(int diff, int ref, int max) { + if (!ref) return diff; + if (ref >= (max - 1)) return max - diff - 1; + if (2 * ref < max) { + if (diff <= 2 * ref) { + if (diff & 1) + return ref + ((diff + 1) >> 1); + else + return ref - (diff >> 1); + } + return diff; + } else { + if (diff <= 2 * (max - ref - 1)) { + if (diff & 1) + return ref + ((diff + 1) >> 1); + else + return ref - (diff >> 1); + } + return max - (diff + 1); + } +} + +static int read_segment_id(AV1_COMMON *const cm, const MACROBLOCKD *const xd, + aom_reader *r, int skip) { + int cdf_num; + const int pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num); + if (skip) return pred; + + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; + aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num]; + const int coded_id = aom_read_symbol(r, pred_cdf, MAX_SEGMENTS, ACCT_STR); + const int segment_id = + av1_neg_deinterleave(coded_id, pred, seg->last_active_segid + 1); + + if (segment_id < 0 || segment_id > seg->last_active_segid) { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Corrupted segment_ids"); + } + return segment_id; +} + +static int dec_get_segment_id(const AV1_COMMON *cm, const uint8_t *segment_ids, + int mi_offset, int x_mis, int y_mis) { + int segment_id = INT_MAX; + + for (int y = 0; y < y_mis; y++) + for (int x = 0; x < x_mis; x++) + segment_id = AOMMIN( + segment_id, segment_ids[mi_offset + y * cm->mi_params.mi_cols + x]); + + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); + return segment_id; +} + +static void set_segment_id(AV1_COMMON *cm, int mi_offset, int x_mis, int y_mis, + int segment_id) { + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); + + for (int y = 0; y < y_mis; y++) + for (int x = 0; x < x_mis; x++) + cm->cur_frame->seg_map[mi_offset + y * cm->mi_params.mi_cols + x] = + segment_id; +} + +static int read_intra_segment_id(AV1_COMMON *const cm, + const MACROBLOCKD *const xd, int bsize, + aom_reader *r, int skip) { 
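+  // Read one segment id for the whole intra block, then propagate it to + // every mi unit the block covers (clamped to the frame boundary below).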
+ struct segmentation *const seg = &cm->seg; + if (!seg->enabled) return 0; // Default for disabled segmentation + assert(seg->update_map && !seg->temporal_update); + + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int mi_offset = mi_row * mi_params->mi_cols + mi_col; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw); + const int y_mis = AOMMIN(mi_params->mi_rows - mi_row, bh); + const int segment_id = read_segment_id(cm, xd, r, skip); + set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id); + return segment_id; +} + +static void copy_segment_id(const CommonModeInfoParams *const mi_params, + const uint8_t *last_segment_ids, + uint8_t *current_segment_ids, int mi_offset, + int x_mis, int y_mis) { + for (int y = 0; y < y_mis; y++) + for (int x = 0; x < x_mis; x++) + current_segment_ids[mi_offset + y * mi_params->mi_cols + x] = + last_segment_ids + ? last_segment_ids[mi_offset + y * mi_params->mi_cols + x] + : 0; +} + +static int get_predicted_segment_id(AV1_COMMON *const cm, int mi_offset, + int x_mis, int y_mis) { + return cm->last_frame_seg_map ? dec_get_segment_id(cm, cm->last_frame_seg_map, + mi_offset, x_mis, y_mis) + : 0; +} + +static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd, + int preskip, aom_reader *r) { + struct segmentation *const seg = &cm->seg; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int mi_offset = mi_row * mi_params->mi_cols + mi_col; + const int bw = mi_size_wide[mbmi->sb_type]; + const int bh = mi_size_high[mbmi->sb_type]; + + // TODO(slavarnway): move x_mis, y_mis into xd ????? 
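+  // Clamp the block extent to the frame boundary so that segment ids are + // only written for mi units that actually exist.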
+ const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw); + const int y_mis = AOMMIN(mi_params->mi_rows - mi_row, bh); + + if (!seg->enabled) return 0; // Default for disabled segmentation + + if (!seg->update_map) { + copy_segment_id(mi_params, cm->last_frame_seg_map, cm->cur_frame->seg_map, + mi_offset, x_mis, y_mis); + return get_predicted_segment_id(cm, mi_offset, x_mis, y_mis); + } + + int segment_id; + if (preskip) { + if (!seg->segid_preskip) return 0; + } else { + if (mbmi->skip) { + if (seg->temporal_update) { + mbmi->seg_id_predicted = 0; + } + segment_id = read_segment_id(cm, xd, r, 1); + set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id); + return segment_id; + } + } + + if (seg->temporal_update) { + const int ctx = av1_get_pred_context_seg_id(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + struct segmentation_probs *const segp = &ec_ctx->seg; + aom_cdf_prob *pred_cdf = segp->pred_cdf[ctx]; + mbmi->seg_id_predicted = aom_read_symbol(r, pred_cdf, 2, ACCT_STR); + if (mbmi->seg_id_predicted) { + segment_id = get_predicted_segment_id(cm, mi_offset, x_mis, y_mis); + } else { + segment_id = read_segment_id(cm, xd, r, 0); + } + } else { + segment_id = read_segment_id(cm, xd, r, 0); + } + set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id); + return segment_id; +} + +static int read_skip_mode(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, + aom_reader *r) { + if (!cm->current_frame.skip_mode_info.skip_mode_flag) return 0; + + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 0; + } + + if (!is_comp_ref_allowed(xd->mi[0]->sb_type)) return 0; + + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + // These features imply single-reference mode, while skip mode implies + // compound reference. Hence, the two are mutually exclusive. + // In other words, skip_mode is implicitly 0 here. + return 0; + } + + const int ctx = av1_get_skip_mode_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int skip_mode = + aom_read_symbol(r, ec_ctx->skip_mode_cdfs[ctx], 2, ACCT_STR); + return skip_mode; +} + +static int read_skip(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, + aom_reader *r) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 1; + } else { + const int ctx = av1_get_skip_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int skip = aom_read_symbol(r, ec_ctx->skip_cdfs[ctx], 2, ACCT_STR); + return skip; + } +} + +// Merge the sorted list of cached colors(cached_colors[0...n_cached_colors-1]) +// and the sorted list of transmitted colors(colors[n_cached_colors...n-1]) into +// one single sorted list(colors[...]). 
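+// Both inputs must already be sorted in increasing order; the merge is done +// in place, filling colors[0...n_colors-1] front to back, which is safe +// because the write index can never overtake the read index of the +// transmitted list.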
+static void merge_colors(uint16_t *colors, uint16_t *cached_colors, + int n_colors, int n_cached_colors) { + if (n_cached_colors == 0) return; + int cache_idx = 0, trans_idx = n_cached_colors; + for (int i = 0; i < n_colors; ++i) { + if (cache_idx < n_cached_colors && + (trans_idx >= n_colors || + cached_colors[cache_idx] <= colors[trans_idx])) { + colors[i] = cached_colors[cache_idx++]; + } else { + assert(trans_idx < n_colors); + colors[i] = colors[trans_idx++]; + } + } +} + +static void read_palette_colors_y(MACROBLOCKD *const xd, int bit_depth, + PALETTE_MODE_INFO *const pmi, aom_reader *r) { + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + uint16_t cached_colors[PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); + const int n = pmi->palette_size[0]; + int idx = 0; + for (int i = 0; i < n_cache && idx < n; ++i) + if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i]; + if (idx < n) { + const int n_cached_colors = idx; + pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR); + if (idx < n) { + const int min_bits = bit_depth - 3; + int bits = min_bits + aom_read_literal(r, 2, ACCT_STR); + int range = (1 << bit_depth) - pmi->palette_colors[idx - 1] - 1; + for (; idx < n; ++idx) { + assert(range >= 0); + const int delta = aom_read_literal(r, bits, ACCT_STR) + 1; + pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta, + 0, (1 << bit_depth) - 1); + range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]); + bits = AOMMIN(bits, av1_ceil_log2(range)); + } + } + merge_colors(pmi->palette_colors, cached_colors, n, n_cached_colors); + } else { + memcpy(pmi->palette_colors, cached_colors, n * sizeof(cached_colors[0])); + } +} + +static void read_palette_colors_uv(MACROBLOCKD *const xd, int bit_depth, + PALETTE_MODE_INFO *const pmi, + aom_reader *r) { + const int n = pmi->palette_size[1]; + // U channel colors. + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + uint16_t cached_colors[PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 1, color_cache); + int idx = 0; + for (int i = 0; i < n_cache && idx < n; ++i) + if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i]; + if (idx < n) { + const int n_cached_colors = idx; + idx += PALETTE_MAX_SIZE; + pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR); + if (idx < PALETTE_MAX_SIZE + n) { + const int min_bits = bit_depth - 3; + int bits = min_bits + aom_read_literal(r, 2, ACCT_STR); + int range = (1 << bit_depth) - pmi->palette_colors[idx - 1]; + for (; idx < PALETTE_MAX_SIZE + n; ++idx) { + assert(range >= 0); + const int delta = aom_read_literal(r, bits, ACCT_STR); + pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta, + 0, (1 << bit_depth) - 1); + range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]); + bits = AOMMIN(bits, av1_ceil_log2(range)); + } + } + merge_colors(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors, n, + n_cached_colors); + } else { + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors, + n * sizeof(cached_colors[0])); + } + + // V channel colors. + if (aom_read_bit(r, ACCT_STR)) { // Delta encoding. 
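+    // V colors are coded as signed deltas from the previous V color, with + // wrap-around arithmetic modulo 2^bit_depth.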
+ const int min_bits_v = bit_depth - 4; + const int max_val = 1 << bit_depth; + int bits = min_bits_v + aom_read_literal(r, 2, ACCT_STR); + pmi->palette_colors[2 * PALETTE_MAX_SIZE] = + aom_read_literal(r, bit_depth, ACCT_STR); + for (int i = 1; i < n; ++i) { + int delta = aom_read_literal(r, bits, ACCT_STR); + if (delta && aom_read_bit(r, ACCT_STR)) delta = -delta; + int val = (int)pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1] + delta; + if (val < 0) val += max_val; + if (val >= max_val) val -= max_val; + pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] = val; + } + } else { + for (int i = 0; i < n; ++i) { + pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] = + aom_read_literal(r, bit_depth, ACCT_STR); + } + } +} + +static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *r) { + const int num_planes = av1_num_planes(cm); + MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize)); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + + if (mbmi->mode == DC_PRED) { + const int palette_mode_ctx = av1_get_palette_mode_ctx(xd); + const int modev = aom_read_symbol( + r, xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_mode_ctx], 2, + ACCT_STR); + if (modev) { + pmi->palette_size[0] = + aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx], + PALETTE_SIZES, ACCT_STR) + + 2; + read_palette_colors_y(xd, cm->seq_params.bit_depth, pmi, r); + } + } + if (num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref) { + const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); + const int modev = aom_read_symbol( + r, xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2, ACCT_STR); + if (modev) { + pmi->palette_size[1] = + aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], + PALETTE_SIZES, ACCT_STR) + + 2; + read_palette_colors_uv(xd, cm->seq_params.bit_depth, pmi, r); + } + } +} + +static int read_angle_delta(aom_reader *r, aom_cdf_prob *cdf) { + const int sym = aom_read_symbol(r, cdf, 2 * MAX_ANGLE_DELTA + 1, ACCT_STR); + return sym - MAX_ANGLE_DELTA; +} + +static void read_filter_intra_mode_info(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, aom_reader *r) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + FILTER_INTRA_MODE_INFO *filter_intra_mode_info = + &mbmi->filter_intra_mode_info; + + if (av1_filter_intra_allowed(cm, mbmi)) { + filter_intra_mode_info->use_filter_intra = aom_read_symbol( + r, xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2, ACCT_STR); + if (filter_intra_mode_info->use_filter_intra) { + filter_intra_mode_info->filter_intra_mode = aom_read_symbol( + r, xd->tile_ctx->filter_intra_mode_cdf, FILTER_INTRA_MODES, ACCT_STR); + } + } else { + filter_intra_mode_info->use_filter_intra = 0; + } +} + +void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row, + int blk_col, TX_SIZE tx_size, aom_reader *r) { + MB_MODE_INFO *mbmi = xd->mi[0]; + uint8_t *tx_type = + &xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; + *tx_type = DCT_DCT; + + // No need to read transform type if block is skipped. + if (mbmi->skip || segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) + return; + + // No need to read transform type for lossless mode(qindex==0). 
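+  // (xd->qindex[] already folds in any segment-level delta q, so zero here + // means the block is lossless and keeps the DCT_DCT default set above.)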
+ const int qindex = xd->qindex[mbmi->segment_id]; + if (qindex == 0) return; + + const int inter_block = is_inter_block(mbmi); + if (get_ext_tx_types(tx_size, inter_block, cm->features.reduced_tx_set_used) > + 1) { + const TxSetType tx_set_type = av1_get_ext_tx_set_type( + tx_size, inter_block, cm->features.reduced_tx_set_used); + const int eset = + get_ext_tx_set(tx_size, inter_block, cm->features.reduced_tx_set_used); + // eset == 0 should correspond to a set with only DCT_DCT and + // there is no need to read the tx_type + assert(eset != 0); + + const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + if (inter_block) { + *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( + r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], + av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; + } else { + const PREDICTION_MODE intra_mode = + mbmi->filter_intra_mode_info.use_filter_intra + ? fimode_to_intradir[mbmi->filter_intra_mode_info + .filter_intra_mode] + : mbmi->mode; + *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( + r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_mode], + av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; + } + } +} + +static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref, + nmv_context *ctx, MvSubpelPrecision precision); + +static INLINE int is_mv_valid(const MV *mv); + +static INLINE int assign_dv(AV1_COMMON *cm, MACROBLOCKD *xd, int_mv *mv, + const int_mv *ref_mv, int mi_row, int mi_col, + BLOCK_SIZE bsize, aom_reader *r) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + read_mv(r, &mv->as_mv, &ref_mv->as_mv, &ec_ctx->ndvc, MV_SUBPEL_NONE); + // DV should not have sub-pel. + assert((mv->as_mv.col & 7) == 0); + assert((mv->as_mv.row & 7) == 0); + mv->as_mv.col = (mv->as_mv.col >> 3) * 8; + mv->as_mv.row = (mv->as_mv.row >> 3) * 8; + int valid = is_mv_valid(&mv->as_mv) && + av1_is_dv_valid(mv->as_mv, cm, xd, mi_row, mi_col, bsize, + cm->seq_params.mib_size_log2); + return valid; +} + +static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *r) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + mbmi->use_intrabc = aom_read_symbol(r, ec_ctx->intrabc_cdf, 2, ACCT_STR); + if (mbmi->use_intrabc) { + BLOCK_SIZE bsize = mbmi->sb_type; + mbmi->mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; + mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR); + mbmi->motion_mode = SIMPLE_TRANSLATION; + + int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES]; + int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES]; + + av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, xd->ref_mv_count, + xd->ref_mv_stack, xd->weight, ref_mvs, /*global_mvs=*/NULL, + inter_mode_ctx); + + int_mv nearestmv, nearmv; + + av1_find_best_ref_mvs(0, ref_mvs[INTRA_FRAME], &nearestmv, &nearmv, 0); + int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv; + if (dv_ref.as_int == 0) + av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params.mib_size, xd->mi_row); + // Ref DV should not have sub-pel. + int valid_dv = (dv_ref.as_mv.col & 7) == 0 && (dv_ref.as_mv.row & 7) == 0; + dv_ref.as_mv.col = (dv_ref.as_mv.col >> 3) * 8; + dv_ref.as_mv.row = (dv_ref.as_mv.row >> 3) * 8; + valid_dv = valid_dv && assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, xd->mi_row, + xd->mi_col, bsize, r); + if (!valid_dv) { + // Intra bc motion vectors are not valid - signal corrupt frame + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid intrabc dv"); + } + } +} + +// If delta q is present, reads delta_q index. 
+// Also reads delta_q loop filter levels, if present. +static void read_delta_q_params(AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *r) { + DeltaQInfo *const delta_q_info = &cm->delta_q_info; + + if (delta_q_info->delta_q_present_flag) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + xd->current_qindex += + read_delta_qindex(cm, xd, r, mbmi) * delta_q_info->delta_q_res; + /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */ + xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ); + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; + if (delta_q_info->delta_lf_present_flag) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + if (delta_q_info->delta_lf_multi) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + const int tmp_lvl = + xd->delta_lf[lf_id] + + read_delta_lflevel(cm, r, ec_ctx->delta_lf_multi_cdf[lf_id], mbmi, + mi_col, mi_row) * + delta_q_info->delta_lf_res; + mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] = + clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); + } + } else { + const int tmp_lvl = xd->delta_lf_from_base + + read_delta_lflevel(cm, r, ec_ctx->delta_lf_cdf, + mbmi, mi_col, mi_row) * + delta_q_info->delta_lf_res; + mbmi->delta_lf_from_base = xd->delta_lf_from_base = + clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); + } + } + } +} + +static void read_intra_frame_mode_info(AV1_COMMON *const cm, + MACROBLOCKD *const xd, aom_reader *r) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + const MB_MODE_INFO *above_mi = xd->above_mbmi; + const MB_MODE_INFO *left_mi = xd->left_mbmi; + const BLOCK_SIZE bsize = mbmi->sb_type; + struct segmentation *const seg = &cm->seg; + + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (seg->segid_preskip) + mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, 0); + + mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); + + if (!seg->segid_preskip) + mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, mbmi->skip); + + read_cdef(cm, r, xd); + + read_delta_q_params(cm, xd, r); + + mbmi->current_qindex = xd->current_qindex; + + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + xd->above_txfm_context = cm->above_contexts.txfm[xd->tile.tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + if (av1_allow_intrabc(cm)) { + read_intrabc_info(cm, xd, r); + if (is_intrabc_block(mbmi)) return; + } + + mbmi->mode = read_intra_mode(r, get_y_mode_cdf(ec_ctx, above_mi, left_mi)); + + const int use_angle_delta = av1_use_angle_delta(bsize); + mbmi->angle_delta[PLANE_TYPE_Y] = + (use_angle_delta && av1_is_directional_mode(mbmi->mode)) + ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED]) + : 0; + + if (!cm->seq_params.monochrome && xd->is_chroma_ref) { + mbmi->uv_mode = + read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode); + if (mbmi->uv_mode == UV_CFL_PRED) { + mbmi->cfl_alpha_idx = read_cfl_alphas(ec_ctx, r, &mbmi->cfl_alpha_signs); + } + mbmi->angle_delta[PLANE_TYPE_UV] = + (use_angle_delta && av1_is_directional_mode(get_uv_mode(mbmi->uv_mode))) + ? 
read_angle_delta(r, + ec_ctx->angle_delta_cdf[mbmi->uv_mode - V_PRED]) + : 0; + } else { + // Avoid decoding angle_info if there is no chroma prediction + mbmi->uv_mode = UV_DC_PRED; + } + xd->cfl.store_y = store_cfl_required(cm, xd); + + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) + read_palette_mode_info(cm, xd, r); + + read_filter_intra_mode_info(cm, xd, r); +} + +static int read_mv_component(aom_reader *r, nmv_component *mvcomp, + int use_subpel, int usehp) { + int mag, d, fr, hp; + const int sign = aom_read_symbol(r, mvcomp->sign_cdf, 2, ACCT_STR); + const int mv_class = + aom_read_symbol(r, mvcomp->classes_cdf, MV_CLASSES, ACCT_STR); + const int class0 = mv_class == MV_CLASS_0; + + // Integer part + if (class0) { + d = aom_read_symbol(r, mvcomp->class0_cdf, CLASS0_SIZE, ACCT_STR); + mag = 0; + } else { + const int n = mv_class + CLASS0_BITS - 1; // number of bits + d = 0; + for (int i = 0; i < n; ++i) + d |= aom_read_symbol(r, mvcomp->bits_cdf[i], 2, ACCT_STR) << i; + mag = CLASS0_SIZE << (mv_class + 2); + } + + if (use_subpel) { + // Fractional part + fr = aom_read_symbol(r, class0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf, + MV_FP_SIZE, ACCT_STR); + + // High precision part (if hp is not used, the default value is 1) + hp = usehp ? aom_read_symbol( + r, class0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, 2, + ACCT_STR) + : 1; + } else { + fr = 3; + hp = 1; + } + + // Result + mag += ((d << 3) | (fr << 1) | hp) + 1; + return sign ? -mag : mag; +} + +static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref, + nmv_context *ctx, MvSubpelPrecision precision) { + MV diff = kZeroMv; + const MV_JOINT_TYPE joint_type = + (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joints_cdf, MV_JOINTS, ACCT_STR); + + if (mv_joint_vertical(joint_type)) + diff.row = read_mv_component(r, &ctx->comps[0], precision > MV_SUBPEL_NONE, + precision > MV_SUBPEL_LOW_PRECISION); + + if (mv_joint_horizontal(joint_type)) + diff.col = read_mv_component(r, &ctx->comps[1], precision > MV_SUBPEL_NONE, + precision > MV_SUBPEL_LOW_PRECISION); + + mv->row = ref->row + diff.row; + mv->col = ref->col + diff.col; +} + +static REFERENCE_MODE read_block_reference_mode(AV1_COMMON *cm, + const MACROBLOCKD *xd, + aom_reader *r) { + if (!is_comp_ref_allowed(xd->mi[0]->sb_type)) return SINGLE_REFERENCE; + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { + const int ctx = av1_get_reference_mode_context(xd); + const REFERENCE_MODE mode = (REFERENCE_MODE)aom_read_symbol( + r, xd->tile_ctx->comp_inter_cdf[ctx], 2, ACCT_STR); + return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE + } else { + assert(cm->current_frame.reference_mode == SINGLE_REFERENCE); + return cm->current_frame.reference_mode; + } +} + +#define READ_REF_BIT(pname) \ + aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR) + +static COMP_REFERENCE_TYPE read_comp_reference_type(const MACROBLOCKD *xd, + aom_reader *r) { + const int ctx = av1_get_comp_reference_type_context(xd); + const COMP_REFERENCE_TYPE comp_ref_type = + (COMP_REFERENCE_TYPE)aom_read_symbol( + r, xd->tile_ctx->comp_ref_type_cdf[ctx], 2, ACCT_STR); + return comp_ref_type; // UNIDIR_COMP_REFERENCE or BIDIR_COMP_REFERENCE +} + +static void set_ref_frames_for_skip_mode(AV1_COMMON *const cm, + MV_REFERENCE_FRAME ref_frame[2]) { + ref_frame[0] = LAST_FRAME + cm->current_frame.skip_mode_info.ref_frame_idx_0; + ref_frame[1] = LAST_FRAME + cm->current_frame.skip_mode_info.ref_frame_idx_1; +} + +// Read the reference frame +static void
read_ref_frames(AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *r, int segment_id, + MV_REFERENCE_FRAME ref_frame[2]) { + if (xd->mi[0]->skip_mode) { + set_ref_frames_for_skip_mode(cm, ref_frame); + return; + } + + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id, + SEG_LVL_REF_FRAME); + ref_frame[1] = NONE_FRAME; + } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + ref_frame[0] = LAST_FRAME; + ref_frame[1] = NONE_FRAME; + } else { + const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); + + if (mode == COMPOUND_REFERENCE) { + const COMP_REFERENCE_TYPE comp_ref_type = read_comp_reference_type(xd, r); + + if (comp_ref_type == UNIDIR_COMP_REFERENCE) { + const int bit = READ_REF_BIT(uni_comp_ref_p); + if (bit) { + ref_frame[0] = BWDREF_FRAME; + ref_frame[1] = ALTREF_FRAME; + } else { + const int bit1 = READ_REF_BIT(uni_comp_ref_p1); + if (bit1) { + const int bit2 = READ_REF_BIT(uni_comp_ref_p2); + if (bit2) { + ref_frame[0] = LAST_FRAME; + ref_frame[1] = GOLDEN_FRAME; + } else { + ref_frame[0] = LAST_FRAME; + ref_frame[1] = LAST3_FRAME; + } + } else { + ref_frame[0] = LAST_FRAME; + ref_frame[1] = LAST2_FRAME; + } + } + + return; + } + + assert(comp_ref_type == BIDIR_COMP_REFERENCE); + + const int idx = 1; + const int bit = READ_REF_BIT(comp_ref_p); + // Decode forward references. + if (!bit) { + const int bit1 = READ_REF_BIT(comp_ref_p1); + ref_frame[!idx] = bit1 ? LAST2_FRAME : LAST_FRAME; + } else { + const int bit2 = READ_REF_BIT(comp_ref_p2); + ref_frame[!idx] = bit2 ? GOLDEN_FRAME : LAST3_FRAME; + } + + // Decode backward references. + const int bit_bwd = READ_REF_BIT(comp_bwdref_p); + if (!bit_bwd) { + const int bit1_bwd = READ_REF_BIT(comp_bwdref_p1); + ref_frame[idx] = bit1_bwd ? ALTREF2_FRAME : BWDREF_FRAME; + } else { + ref_frame[idx] = ALTREF_FRAME; + } + } else if (mode == SINGLE_REFERENCE) { + const int bit0 = READ_REF_BIT(single_ref_p1); + if (bit0) { + const int bit1 = READ_REF_BIT(single_ref_p2); + if (!bit1) { + const int bit5 = READ_REF_BIT(single_ref_p6); + ref_frame[0] = bit5 ? ALTREF2_FRAME : BWDREF_FRAME; + } else { + ref_frame[0] = ALTREF_FRAME; + } + } else { + const int bit2 = READ_REF_BIT(single_ref_p3); + if (bit2) { + const int bit4 = READ_REF_BIT(single_ref_p5); + ref_frame[0] = bit4 ? GOLDEN_FRAME : LAST3_FRAME; + } else { + const int bit3 = READ_REF_BIT(single_ref_p4); + ref_frame[0] = bit3 ? 
LAST2_FRAME : LAST_FRAME; + } + } + + ref_frame[1] = NONE_FRAME; + } else { + assert(0 && "Invalid prediction mode."); + } + } +} + +static INLINE void read_mb_interp_filter(const MACROBLOCKD *const xd, + InterpFilter interp_filter, + bool enable_dual_filter, + MB_MODE_INFO *const mbmi, + aom_reader *r) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (!av1_is_interp_needed(xd)) { + set_default_interp_filters(mbmi, interp_filter); + return; + } + + if (interp_filter != SWITCHABLE) { + mbmi->interp_filters = av1_broadcast_interp_filter(interp_filter); + } else { + InterpFilter ref0_filter[2] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR }; + for (int dir = 0; dir < 2; ++dir) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + ref0_filter[dir] = (InterpFilter)aom_read_symbol( + r, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS, ACCT_STR); + if (!enable_dual_filter) { + ref0_filter[1] = ref0_filter[0]; + break; + } + } + // The index system works as: (0, 1) -> (vertical, horizontal) filter types + mbmi->interp_filters.as_filters.x_filter = ref0_filter[1]; + mbmi->interp_filters.as_filters.y_filter = ref0_filter[0]; + } +} + +static void read_intra_block_mode_info(AV1_COMMON *const cm, + MACROBLOCKD *const xd, + MB_MODE_INFO *const mbmi, + aom_reader *r) { + const BLOCK_SIZE bsize = mbmi->sb_type; + const int use_angle_delta = av1_use_angle_delta(bsize); + + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + mbmi->mode = read_intra_mode(r, ec_ctx->y_mode_cdf[size_group_lookup[bsize]]); + + mbmi->angle_delta[PLANE_TYPE_Y] = + use_angle_delta && av1_is_directional_mode(mbmi->mode) + ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED]) + : 0; + if (!cm->seq_params.monochrome && xd->is_chroma_ref) { + mbmi->uv_mode = + read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode); + if (mbmi->uv_mode == UV_CFL_PRED) { + mbmi->cfl_alpha_idx = + read_cfl_alphas(xd->tile_ctx, r, &mbmi->cfl_alpha_signs); + } + mbmi->angle_delta[PLANE_TYPE_UV] = + use_angle_delta && av1_is_directional_mode(get_uv_mode(mbmi->uv_mode)) + ? 
read_angle_delta(r, + ec_ctx->angle_delta_cdf[mbmi->uv_mode - V_PRED]) + : 0; + } else { + // Avoid decoding angle_info if there is no chroma prediction + mbmi->uv_mode = UV_DC_PRED; + } + xd->cfl.store_y = store_cfl_required(cm, xd); + + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) + read_palette_mode_info(cm, xd, r); + + read_filter_intra_mode_info(cm, xd, r); +} + +static INLINE int is_mv_valid(const MV *mv) { + return mv->row > MV_LOW && mv->row < MV_UPP && mv->col > MV_LOW && + mv->col < MV_UPP; +} + +static INLINE int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd, + PREDICTION_MODE mode, + MV_REFERENCE_FRAME ref_frame[2], int_mv mv[2], + int_mv ref_mv[2], int_mv nearest_mv[2], + int_mv near_mv[2], int is_compound, int allow_hp, + aom_reader *r) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + MB_MODE_INFO *mbmi = xd->mi[0]; + BLOCK_SIZE bsize = mbmi->sb_type; + FeatureFlags *const features = &cm->features; + if (features->cur_frame_force_integer_mv) { + allow_hp = MV_SUBPEL_NONE; + } + switch (mode) { + case NEWMV: { + nmv_context *const nmvc = &ec_ctx->nmvc; + read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp); + break; + } + case NEARESTMV: { + mv[0].as_int = nearest_mv[0].as_int; + break; + } + case NEARMV: { + mv[0].as_int = near_mv[0].as_int; + break; + } + case GLOBALMV: { + mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]], + features->allow_high_precision_mv, + bsize, xd->mi_col, xd->mi_row, + features->cur_frame_force_integer_mv) + .as_int; + break; + } + case NEW_NEWMV: { + assert(is_compound); + for (int i = 0; i < 2; ++i) { + nmv_context *const nmvc = &ec_ctx->nmvc; + read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, nmvc, allow_hp); + } + break; + } + case NEAREST_NEARESTMV: { + assert(is_compound); + mv[0].as_int = nearest_mv[0].as_int; + mv[1].as_int = nearest_mv[1].as_int; + break; + } + case NEAR_NEARMV: { + assert(is_compound); + mv[0].as_int = near_mv[0].as_int; + mv[1].as_int = near_mv[1].as_int; + break; + } + case NEW_NEARESTMV: { + nmv_context *const nmvc = &ec_ctx->nmvc; + read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp); + assert(is_compound); + mv[1].as_int = nearest_mv[1].as_int; + break; + } + case NEAREST_NEWMV: { + nmv_context *const nmvc = &ec_ctx->nmvc; + mv[0].as_int = nearest_mv[0].as_int; + read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp); + assert(is_compound); + break; + } + case NEAR_NEWMV: { + nmv_context *const nmvc = &ec_ctx->nmvc; + mv[0].as_int = near_mv[0].as_int; + read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp); + assert(is_compound); + break; + } + case NEW_NEARMV: { + nmv_context *const nmvc = &ec_ctx->nmvc; + read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp); + assert(is_compound); + mv[1].as_int = near_mv[1].as_int; + break; + } + case GLOBAL_GLOBALMV: { + assert(is_compound); + mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]], + features->allow_high_precision_mv, + bsize, xd->mi_col, xd->mi_row, + features->cur_frame_force_integer_mv) + .as_int; + mv[1].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[1]], + features->allow_high_precision_mv, + bsize, xd->mi_col, xd->mi_row, + features->cur_frame_force_integer_mv) + .as_int; + break; + } + default: { return 0; } + } + + int ret = is_mv_valid(&mv[0].as_mv); + if (is_compound) { + ret = ret && is_mv_valid(&mv[1].as_mv); + } + return ret; +} + +static int read_is_inter_block(AV1_COMMON
*const cm, MACROBLOCKD *const xd, + int segment_id, aom_reader *r) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + const int frame = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + if (frame < LAST_FRAME) return 0; + return frame != INTRA_FRAME; + } + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + return 1; + } + const int ctx = av1_get_intra_inter_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int is_inter = + aom_read_symbol(r, ec_ctx->intra_inter_cdf[ctx], 2, ACCT_STR); + return is_inter; +} + +#if DEC_MISMATCH_DEBUG +static void dec_dump_logs(AV1_COMMON *cm, MB_MODE_INFO *const mbmi, int mi_row, + int mi_col, int16_t mode_ctx) { + int_mv mv[2] = { { 0 } }; + for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) + mv[ref].as_mv = mbmi->mv[ref].as_mv; + + const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; + int16_t zeromv_ctx = -1; + int16_t refmv_ctx = -1; + if (mbmi->mode != NEWMV) { + zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + if (mbmi->mode != GLOBALMV) + refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; + } + +#define FRAME_TO_CHECK 11 + if (cm->current_frame.frame_number == FRAME_TO_CHECK && cm->show_frame == 1) { + printf( + "=== DECODER ===: " + "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, " + "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, " + "ref[1]=%d, motion_mode=%d, mode_ctx=%d, " + "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n", + cm->current_frame.frame_number, mi_row, mi_col, mbmi->skip_mode, + mbmi->mode, mbmi->sb_type, cm->show_frame, mv[0].as_mv.row, + mv[0].as_mv.col, mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0], + mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx, zeromv_ctx, + refmv_ctx, mbmi->tx_size); + } +} +#endif // DEC_MISMATCH_DEBUG + +static void read_inter_block_mode_info(AV1Decoder *const pbi, + MACROBLOCKD *const xd, + MB_MODE_INFO *const mbmi, + aom_reader *r) { + AV1_COMMON *const cm = &pbi->common; + FeatureFlags *const features = &cm->features; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int allow_hp = features->allow_high_precision_mv; + int_mv nearestmv[2], nearmv[2]; + int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } }; + int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES]; + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + mbmi->uv_mode = UV_DC_PRED; + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; + + av1_collect_neighbors_ref_counts(xd); + + read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame); + const int is_compound = has_second_ref(mbmi); + + const MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame); + av1_find_mv_refs(cm, xd, mbmi, ref_frame, xd->ref_mv_count, xd->ref_mv_stack, + xd->weight, ref_mvs, /*global_mvs=*/NULL, inter_mode_ctx); + + mbmi->ref_mv_idx = 0; + + if (mbmi->skip_mode) { + assert(is_compound); + mbmi->mode = NEAREST_NEARESTMV; + } else { + if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) || + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_GLOBALMV)) { + mbmi->mode = GLOBALMV; + } else { + const int mode_ctx = + av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame); + if (is_compound) + mbmi->mode = read_inter_compound_mode(xd, r, mode_ctx); + else + mbmi->mode = read_inter_mode(ec_ctx, r, mode_ctx); + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV || + 
have_nearmv_in_inter_mode(mbmi->mode)) + read_drl_idx(ec_ctx, xd, mbmi, r); + } + } + + if (is_compound != is_inter_compound_mode(mbmi->mode)) { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Prediction mode %d invalid with ref frame %d %d", + mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + } + + if (!is_compound && mbmi->mode != GLOBALMV) { + av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[0]], &nearestmv[0], + &nearmv[0], features->cur_frame_force_integer_mv); + } + + if (is_compound && mbmi->mode != GLOBAL_GLOBALMV) { + const int ref_mv_idx = mbmi->ref_mv_idx + 1; + nearestmv[0] = xd->ref_mv_stack[ref_frame][0].this_mv; + nearestmv[1] = xd->ref_mv_stack[ref_frame][0].comp_mv; + nearmv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv; + nearmv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv; + lower_mv_precision(&nearestmv[0].as_mv, allow_hp, + features->cur_frame_force_integer_mv); + lower_mv_precision(&nearestmv[1].as_mv, allow_hp, + features->cur_frame_force_integer_mv); + lower_mv_precision(&nearmv[0].as_mv, allow_hp, + features->cur_frame_force_integer_mv); + lower_mv_precision(&nearmv[1].as_mv, allow_hp, + features->cur_frame_force_integer_mv); + } else if (mbmi->ref_mv_idx > 0 && mbmi->mode == NEARMV) { + nearmv[0] = + xd->ref_mv_stack[mbmi->ref_frame[0]][1 + mbmi->ref_mv_idx].this_mv; + } + + int_mv ref_mv[2] = { nearestmv[0], nearestmv[1] }; + + if (is_compound) { + int ref_mv_idx = mbmi->ref_mv_idx; + // Special case: NEAR_NEWMV and NEW_NEARMV modes use + // 1 + mbmi->ref_mv_idx (like NEARMV) instead of + // mbmi->ref_mv_idx (like NEWMV) + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) + ref_mv_idx = 1 + mbmi->ref_mv_idx; + + // TODO(jingning, yunqing): Do we need a lower_mv_precision() call here? 
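+    // For each NEWMV component of a compound mode, replace the default + // (nearest) predictor with the DRL stack entry selected above.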
+ if (compound_ref0_mode(mbmi->mode) == NEWMV) + ref_mv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv; + + if (compound_ref1_mode(mbmi->mode) == NEWMV) + ref_mv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv; + } else { + if (mbmi->mode == NEWMV) { + if (xd->ref_mv_count[ref_frame] > 1) + ref_mv[0] = xd->ref_mv_stack[ref_frame][mbmi->ref_mv_idx].this_mv; + } + } + + if (mbmi->skip_mode) assert(mbmi->mode == NEAREST_NEARESTMV); + + const int mv_corrupted_flag = + !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, mbmi->mv, ref_mv, + nearestmv, nearmv, is_compound, allow_hp, r); + aom_merge_corrupted_flag(&xd->corrupted, mv_corrupted_flag); + + mbmi->use_wedge_interintra = 0; + if (cm->seq_params.enable_interintra_compound && !mbmi->skip_mode && + is_interintra_allowed(mbmi)) { + const int bsize_group = size_group_lookup[bsize]; + const int interintra = + aom_read_symbol(r, ec_ctx->interintra_cdf[bsize_group], 2, ACCT_STR); + assert(mbmi->ref_frame[1] == NONE_FRAME); + if (interintra) { + const INTERINTRA_MODE interintra_mode = + read_interintra_mode(xd, r, bsize_group); + mbmi->ref_frame[1] = INTRA_FRAME; + mbmi->interintra_mode = interintra_mode; + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + mbmi->angle_delta[PLANE_TYPE_UV] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + if (av1_is_wedge_used(bsize)) { + mbmi->use_wedge_interintra = aom_read_symbol( + r, ec_ctx->wedge_interintra_cdf[bsize], 2, ACCT_STR); + if (mbmi->use_wedge_interintra) { + mbmi->interintra_wedge_index = (int8_t)aom_read_symbol( + r, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES, ACCT_STR); + } + } + } + } + + for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; + xd->block_ref_scale_factors[ref] = get_ref_scale_factors_const(cm, frame); + } + + mbmi->motion_mode = SIMPLE_TRANSLATION; + if (is_motion_variation_allowed_bsize(mbmi->sb_type) && !mbmi->skip_mode && + !has_second_ref(mbmi)) { + mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref); + } + av1_count_overlappable_neighbors(cm, xd); + + if (mbmi->ref_frame[1] != INTRA_FRAME) + mbmi->motion_mode = read_motion_mode(cm, xd, mbmi, r); + + // init + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + + if (has_second_ref(mbmi) && !mbmi->skip_mode) { + // Read idx to indicate current compound inter prediction mode group + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params.enable_masked_compound; + + if (masked_compound_used) { + const int ctx_comp_group_idx = get_comp_group_idx_context(xd); + mbmi->comp_group_idx = (uint8_t)aom_read_symbol( + r, ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2, ACCT_STR); + } + + if (mbmi->comp_group_idx == 0) { + if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) { + const int comp_index_ctx = get_comp_index_context(cm, xd); + mbmi->compound_idx = (uint8_t)aom_read_symbol( + r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR); + mbmi->interinter_comp.type = + mbmi->compound_idx ? 
COMPOUND_AVERAGE : COMPOUND_DISTWTD; + } else { + // Distance-weighted compound is disabled, so always use average + mbmi->compound_idx = 1; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + } + } else { + assert(cm->current_frame.reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION); + assert(masked_compound_used); + + // compound_diffwtd, wedge + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { + mbmi->interinter_comp.type = + COMPOUND_WEDGE + aom_read_symbol(r, + ec_ctx->compound_type_cdf[bsize], + MASKED_COMPOUND_TYPES, ACCT_STR); + } else { + mbmi->interinter_comp.type = COMPOUND_DIFFWTD; + } + + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); + mbmi->interinter_comp.wedge_index = (int8_t)aom_read_symbol( + r, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES, ACCT_STR); + mbmi->interinter_comp.wedge_sign = (int8_t)aom_read_bit(r, ACCT_STR); + } else { + assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD); + mbmi->interinter_comp.mask_type = + aom_read_literal(r, MAX_DIFFWTD_MASK_BITS, ACCT_STR); + } + } + } + + read_mb_interp_filter(xd, features->interp_filter, + cm->seq_params.enable_dual_filter, mbmi, r); + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + if (mbmi->motion_mode == WARPED_CAUSAL) { + mbmi->wm_params.wmtype = DEFAULT_WMTYPE; + mbmi->wm_params.invalid = 0; + + if (mbmi->num_proj_ref > 1) { + mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, + mbmi->num_proj_ref, bsize); + } + + if (av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, + mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, + &mbmi->wm_params, mi_row, mi_col)) { +#if WARPED_MOTION_DEBUG + printf("Warning: unexpected warped model from aomenc\n"); +#endif + mbmi->wm_params.invalid = 1; + } + } + + xd->cfl.store_y = store_cfl_required(cm, xd); + +#if DEC_MISMATCH_DEBUG + dec_dump_logs(cm, mi, mi_row, mi_col, mode_ctx); +#endif // DEC_MISMATCH_DEBUG +} + +static void read_inter_frame_mode_info(AV1Decoder *const pbi, + MACROBLOCKD *const xd, aom_reader *r) { + AV1_COMMON *const cm = &pbi->common; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int inter_block = 1; + + mbmi->mv[0].as_int = 0; + mbmi->mv[1].as_int = 0; + mbmi->segment_id = read_inter_segment_id(cm, xd, 1, r); + + mbmi->skip_mode = read_skip_mode(cm, xd, mbmi->segment_id, r); + + if (mbmi->skip_mode) + mbmi->skip = 1; + else + mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); + + if (!cm->seg.segid_preskip) + mbmi->segment_id = read_inter_segment_id(cm, xd, 0, r); + + read_cdef(cm, r, xd); + + read_delta_q_params(cm, xd, r); + + if (!mbmi->skip_mode) + inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r); + + mbmi->current_qindex = xd->current_qindex; + + xd->above_txfm_context = + cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK); + + if (inter_block) + read_inter_block_mode_info(pbi, xd, mbmi, r); + else + read_intra_block_mode_info(cm, xd, mbmi, r); +} + +static void intra_copy_frame_mvs(AV1_COMMON *const cm, int mi_row, int mi_col, + int x_mis, int y_mis) { + const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1); + MV_REF *frame_mvs = + cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1); + x_mis = ROUND_POWER_OF_TWO(x_mis, 1); + y_mis = ROUND_POWER_OF_TWO(y_mis, 1); + + for (int h = 0; h < y_mis; h++) { + MV_REF *mv = 
frame_mvs; + for (int w = 0; w < x_mis; w++) { + mv->ref_frame = NONE_FRAME; + mv++; + } + frame_mvs += frame_mvs_stride; + } +} + +void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, aom_reader *r, + int x_mis, int y_mis) { + AV1_COMMON *const cm = &pbi->common; + MB_MODE_INFO *const mi = xd->mi[0]; + mi->use_intrabc = 0; + + if (frame_is_intra_only(cm)) { + read_intra_frame_mode_info(cm, xd, r); + if (pbi->common.seq_params.order_hint_info.enable_ref_frame_mvs) + intra_copy_frame_mvs(cm, xd->mi_row, xd->mi_col, x_mis, y_mis); + } else { + read_inter_frame_mode_info(pbi, xd, r); + if (pbi->common.seq_params.order_hint_info.enable_ref_frame_mvs) + av1_copy_frame_mvs(cm, mi, xd->mi_row, xd->mi_col, x_mis, y_mis); + } +} diff --git a/libs/libaom/src/av1/decoder/decodemv.h b/libs/libaom/src/av1/decoder/decodemv.h new file mode 100644 index 000000000..289e66ae1 --- /dev/null +++ b/libs/libaom/src/av1/decoder/decodemv.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_DECODEMV_H_ +#define AOM_AV1_DECODER_DECODEMV_H_ + +#include "aom_dsp/bitreader.h" + +#include "av1/decoder/decoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, aom_reader *r, + int x_mis, int y_mis); + +#ifdef __cplusplus +} // extern "C" +#endif + +void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row, + int blk_col, TX_SIZE tx_size, aom_reader *r); + +#endif // AOM_AV1_DECODER_DECODEMV_H_ diff --git a/libs/libaom/src/av1/decoder/decoder.c b/libs/libaom/src/av1/decoder/decoder.c new file mode 100644 index 000000000..fc5f2cd20 --- /dev/null +++ b/libs/libaom/src/av1/decoder/decoder.c @@ -0,0 +1,539 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/system_state.h" +#include "aom_ports/aom_once.h" +#include "aom_ports/aom_timer.h" +#include "aom_scale/aom_scale.h" +#include "aom_util/aom_thread.h" + +#include "av1/common/alloccommon.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" + +#include "av1/decoder/decodeframe.h" +#include "av1/decoder/decoder.h" +#include "av1/decoder/detokenize.h" +#include "av1/decoder/obu.h" + +static void initialize_dec(void) { + av1_rtcd(); + aom_dsp_rtcd(); + aom_scale_rtcd(); + av1_init_intra_predictors(); + av1_init_wedge_masks(); +} + +static void dec_set_mb_mi(CommonModeInfoParams *mi_params, int width, + int height) { + // Ensure that the decoded width and height are both multiples of + // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if + // subsampling is used). + // This simplifies the implementation of various experiments, + // e.g. cdef, which operates on units of 8x8 luma pixels. + const int aligned_width = ALIGN_POWER_OF_TWO(width, 3); + const int aligned_height = ALIGN_POWER_OF_TWO(height, 3); + + mi_params->mi_cols = aligned_width >> MI_SIZE_LOG2; + mi_params->mi_rows = aligned_height >> MI_SIZE_LOG2; + mi_params->mi_stride = calc_mi_size(mi_params->mi_cols); + + mi_params->mb_cols = (mi_params->mi_cols + 2) >> 2; + mi_params->mb_rows = (mi_params->mi_rows + 2) >> 2; + mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols; + + mi_params->mi_alloc_bsize = BLOCK_4X4; + mi_params->mi_alloc_stride = mi_params->mi_stride; + + assert(mi_size_wide[mi_params->mi_alloc_bsize] == + mi_size_high[mi_params->mi_alloc_bsize]); + +#if CONFIG_LPF_MASK + av1_alloc_loop_filter_mask(mi_params); +#endif +} + +static void dec_setup_mi(CommonModeInfoParams *mi_params) { + const int mi_grid_size = + mi_params->mi_stride * calc_mi_size(mi_params->mi_rows); + memset(mi_params->mi_grid_base, 0, + mi_grid_size * sizeof(*mi_params->mi_grid_base)); +} + +static void dec_free_mi(CommonModeInfoParams *mi_params) { + aom_free(mi_params->mi_alloc); + mi_params->mi_alloc = NULL; + aom_free(mi_params->mi_grid_base); + mi_params->mi_grid_base = NULL; + mi_params->mi_alloc_size = 0; + aom_free(mi_params->tx_type_map); + mi_params->tx_type_map = NULL; +} + +AV1Decoder *av1_decoder_create(BufferPool *const pool) { + AV1Decoder *volatile const pbi = aom_memalign(32, sizeof(*pbi)); + if (!pbi) return NULL; + av1_zero(*pbi); + + AV1_COMMON *volatile const cm = &pbi->common; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(cm->error.jmp)) { + cm->error.setjmp = 0; + av1_decoder_remove(pbi); + return NULL; + } + + cm->error.setjmp = 1; + + CHECK_MEM_ERROR(cm, cm->fc, + (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc))); + CHECK_MEM_ERROR( + cm, cm->default_frame_context, + (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context))); + memset(cm->fc, 0, sizeof(*cm->fc)); + memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context)); + + pbi->need_resync = 1; + aom_once(initialize_dec); + + // Initialize the references to not point to any frame buffers.
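+  // A NULL entry means the slot holds no buffer; slots are populated as + // decoded frames are assigned via refresh_frame_flags.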
+ for (int i = 0; i < REF_FRAMES; i++) { + cm->ref_frame_map[i] = NULL; + } + + cm->current_frame.frame_number = 0; + pbi->decoding_first_frame = 1; + pbi->common.buffer_pool = pool; + + cm->seq_params.bit_depth = AOM_BITS_8; + + cm->mi_params.free_mi = dec_free_mi; + cm->mi_params.setup_mi = dec_setup_mi; + cm->mi_params.set_mb_mi = dec_set_mb_mi; + + av1_loop_filter_init(cm); + + av1_qm_init(&cm->quant_params, av1_num_planes(cm)); + av1_loop_restoration_precal(); +#if CONFIG_ACCOUNTING + pbi->acct_enabled = 1; + aom_accounting_init(&pbi->accounting); +#endif + + cm->error.setjmp = 0; + + aom_get_worker_interface()->init(&pbi->lf_worker); + pbi->lf_worker.thread_name = "aom lf worker"; + + return pbi; +} + +void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info) { + if (tile_mt_info != NULL) { +#if CONFIG_MULTITHREAD + if (tile_mt_info->job_mutex != NULL) { + pthread_mutex_destroy(tile_mt_info->job_mutex); + aom_free(tile_mt_info->job_mutex); + } +#endif + aom_free(tile_mt_info->job_queue); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. + av1_zero(*tile_mt_info); + } +} + +void av1_dec_free_cb_buf(AV1Decoder *pbi) { + aom_free(pbi->cb_buffer_base); + pbi->cb_buffer_base = NULL; + pbi->cb_buffer_alloc_size = 0; +} + +void av1_decoder_remove(AV1Decoder *pbi) { + int i; + + if (!pbi) return; + + // Free the tile list output buffer. + aom_free_frame_buffer(&pbi->tile_list_outbuf); + + aom_get_worker_interface()->end(&pbi->lf_worker); + aom_free(pbi->lf_worker.data1); + + if (pbi->thread_data) { + for (int worker_idx = 0; worker_idx < pbi->max_threads - 1; worker_idx++) { + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + av1_free_mc_tmp_buf(thread_data->td); + aom_free(thread_data->td); + } + aom_free(pbi->thread_data); + } + + for (i = 0; i < pbi->num_workers; ++i) { + AVxWorker *const worker = &pbi->tile_workers[i]; + aom_get_worker_interface()->end(worker); + } +#if CONFIG_MULTITHREAD + if (pbi->row_mt_mutex_ != NULL) { + pthread_mutex_destroy(pbi->row_mt_mutex_); + aom_free(pbi->row_mt_mutex_); + } + if (pbi->row_mt_cond_ != NULL) { + pthread_cond_destroy(pbi->row_mt_cond_); + aom_free(pbi->row_mt_cond_); + } +#endif + for (i = 0; i < pbi->allocated_tiles; i++) { + TileDataDec *const tile_data = pbi->tile_data + i; + av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync); + } + aom_free(pbi->tile_data); + aom_free(pbi->tile_workers); + + if (pbi->num_workers > 0) { + av1_loop_filter_dealloc(&pbi->lf_row_sync); + av1_loop_restoration_dealloc(&pbi->lr_row_sync, pbi->num_workers); + av1_dealloc_dec_jobs(&pbi->tile_mt_info); + } + + av1_dec_free_cb_buf(pbi); +#if CONFIG_ACCOUNTING + aom_accounting_clear(&pbi->accounting); +#endif + av1_free_mc_tmp_buf(&pbi->td); + aom_img_metadata_array_free(pbi->metadata); + aom_free(pbi); +} + +void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, + aom_reader *r, palette_visitor_fn_t visit) { + if (!is_inter_block(xd->mi[0])) { + for (int plane = 0; plane < AOMMIN(2, av1_num_planes(&pbi->common)); + ++plane) { + if (plane == 0 || xd->is_chroma_ref) { + if (xd->mi[0]->palette_mode_info.palette_size[plane]) + visit(xd, plane, r); + } else { + assert(xd->mi[0]->palette_mode_info.palette_size[plane] == 0); + } + } + } +} + +static int equal_dimensions(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + return a->y_height == b->y_height && a->y_width == b->y_width && + a->uv_height == b->uv_height && a->uv_width 
== b->uv_width; +} + +aom_codec_err_t av1_copy_reference_dec(AV1Decoder *pbi, int idx, + YV12_BUFFER_CONFIG *sd) { + AV1_COMMON *cm = &pbi->common; + const int num_planes = av1_num_planes(cm); + + const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, idx); + if (cfg == NULL) { + aom_internal_error(&cm->error, AOM_CODEC_ERROR, "No reference frame"); + return AOM_CODEC_ERROR; + } + if (!equal_dimensions(cfg, sd)) + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + aom_yv12_copy_frame(cfg, sd, num_planes); + + return cm->error.error_code; +} + +static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + return a->y_height == b->y_height && a->y_width == b->y_width && + a->uv_height == b->uv_height && a->uv_width == b->uv_width && + a->y_stride == b->y_stride && a->uv_stride == b->uv_stride && + a->border == b->border && + (a->flags & YV12_FLAG_HIGHBITDEPTH) == + (b->flags & YV12_FLAG_HIGHBITDEPTH); +} + +aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, + int use_external_ref, + YV12_BUFFER_CONFIG *sd) { + const int num_planes = av1_num_planes(cm); + YV12_BUFFER_CONFIG *ref_buf = NULL; + + // Get the destination reference buffer. + ref_buf = get_ref_frame(cm, idx); + + if (ref_buf == NULL) { + aom_internal_error(&cm->error, AOM_CODEC_ERROR, "No reference frame"); + return AOM_CODEC_ERROR; + } + + if (!use_external_ref) { + if (!equal_dimensions(ref_buf, sd)) { + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + } else { + // Overwrite the reference frame buffer. + aom_yv12_copy_frame(sd, ref_buf, num_planes); + } + } else { + if (!equal_dimensions_and_border(ref_buf, sd)) { + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + } else { + // Overwrite the reference frame buffer pointers. + // Once we no longer need the external reference buffer, these pointers + // are restored. + ref_buf->store_buf_adr[0] = ref_buf->y_buffer; + ref_buf->store_buf_adr[1] = ref_buf->u_buffer; + ref_buf->store_buf_adr[2] = ref_buf->v_buffer; + ref_buf->y_buffer = sd->y_buffer; + ref_buf->u_buffer = sd->u_buffer; + ref_buf->v_buffer = sd->v_buffer; + ref_buf->use_external_reference_buffers = 1; + } + } + + return cm->error.error_code; +} + +aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd) { + const int num_planes = av1_num_planes(cm); + + if (!equal_dimensions_and_border(new_frame, sd)) + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + aom_yv12_copy_frame(new_frame, sd, num_planes); + + return cm->error.error_code; +} + +static void release_current_frame(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + BufferPool *const pool = cm->buffer_pool; + + cm->cur_frame->buf.corrupted = 1; + lock_buffer_pool(pool); + decrease_ref_count(cm->cur_frame, pool); + unlock_buffer_pool(pool); + cm->cur_frame = NULL; +} + +// If any buffer updating is signaled it should be done here. +// Consumes a reference to cm->cur_frame. +// +// This function returns void. It reports failure by setting +// cm->error.error_code. +static void update_frame_buffers(AV1Decoder *pbi, int frame_decoded) { + int ref_index = 0, mask; + AV1_COMMON *const cm = &pbi->common; + BufferPool *const pool = cm->buffer_pool; + + if (frame_decoded) { + lock_buffer_pool(pool); + + // In ext-tile decoding, the camera frame header is only decoded once.
So, + // we don't update the references here. + if (!pbi->camera_frame_header_ready) { + // The following for loop needs to release the reference stored in + // cm->ref_frame_map[ref_index] before storing a reference to + // cm->cur_frame in cm->ref_frame_map[ref_index]. + for (mask = cm->current_frame.refresh_frame_flags; mask; mask >>= 1) { + if (mask & 1) { + decrease_ref_count(cm->ref_frame_map[ref_index], pool); + cm->ref_frame_map[ref_index] = cm->cur_frame; + ++cm->cur_frame->ref_count; + } + ++ref_index; + } + } + + if (cm->show_existing_frame || cm->show_frame) { + if (pbi->output_all_layers) { + // Append this frame to the output queue + if (pbi->num_output_frames >= MAX_NUM_SPATIAL_LAYERS) { + // We can't store the new frame anywhere, so drop it and return an + // error + cm->cur_frame->buf.corrupted = 1; + decrease_ref_count(cm->cur_frame, pool); + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + } else { + pbi->output_frames[pbi->num_output_frames] = cm->cur_frame; + pbi->num_output_frames++; + } + } else { + // Replace any existing output frame + assert(pbi->num_output_frames == 0 || pbi->num_output_frames == 1); + if (pbi->num_output_frames > 0) { + decrease_ref_count(pbi->output_frames[0], pool); + } + pbi->output_frames[0] = cm->cur_frame; + pbi->num_output_frames = 1; + } + } else { + decrease_ref_count(cm->cur_frame, pool); + } + + unlock_buffer_pool(pool); + } else { + // Nothing was decoded, so just drop this frame buffer + lock_buffer_pool(pool); + decrease_ref_count(cm->cur_frame, pool); + unlock_buffer_pool(pool); + } + cm->cur_frame = NULL; + + if (!pbi->camera_frame_header_ready) { + // Invalidate these references until the next frame starts. + for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) { + cm->remapped_ref_idx[ref_index] = INVALID_IDX; + } + } +} + +int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, + const uint8_t **psource) { + AV1_COMMON *volatile const cm = &pbi->common; + const uint8_t *source = *psource; + cm->error.error_code = AOM_CODEC_OK; + cm->error.has_detail = 0; + + if (size == 0) { + // This is used to signal that we are missing frames. + // We do not know if the missing frame(s) were supposed to update + // any of the reference buffers, but we act conservatively and + // mark only the last buffer as corrupted. + // + // TODO(jkoleszar): Error concealment is undefined and non-normative + // at this point, but if it becomes so, [0] may not always be the correct + // thing to do here. + RefCntBuffer *ref_buf = get_ref_frame_buf(cm, LAST_FRAME); + if (ref_buf != NULL) ref_buf->buf.corrupted = 1; + } + + if (assign_cur_frame_new_fb(cm) == NULL) { + cm->error.error_code = AOM_CODEC_MEM_ERROR; + return 1; + } + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(cm->error.jmp)) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int i; + + cm->error.setjmp = 0; + + // Synchronize all threads immediately as a subsequent decode call may + // cause a resize invalidating some allocations.
+ winterface->sync(&pbi->lf_worker); + for (i = 0; i < pbi->num_workers; ++i) { + winterface->sync(&pbi->tile_workers[i]); + } + + release_current_frame(pbi); + aom_clear_system_state(); + return -1; + } + + cm->error.setjmp = 1; + + int frame_decoded = + aom_decode_frame_from_obus(pbi, source, source + size, psource); + + if (frame_decoded < 0) { + assert(cm->error.error_code != AOM_CODEC_OK); + release_current_frame(pbi); + cm->error.setjmp = 0; + return 1; + } + +#if TXCOEFF_TIMER + cm->cum_txcoeff_timer += cm->txcoeff_timer; + fprintf(stderr, + "txb coeff block number: %d, frame time: %ld, cum time %ld in us\n", + cm->txb_count, cm->txcoeff_timer, cm->cum_txcoeff_timer); + cm->txcoeff_timer = 0; + cm->txb_count = 0; +#endif + + // Note: At this point, this function holds a reference to cm->cur_frame + // in the buffer pool. This reference is consumed by update_frame_buffers(). + update_frame_buffers(pbi, frame_decoded); + + if (frame_decoded) { + pbi->decoding_first_frame = 0; + } + + if (cm->error.error_code != AOM_CODEC_OK) { + cm->error.setjmp = 0; + return 1; + } + + aom_clear_system_state(); + + if (!cm->show_existing_frame) { + if (cm->seg.enabled) { + if (cm->prev_frame && + (cm->mi_params.mi_rows == cm->prev_frame->mi_rows) && + (cm->mi_params.mi_cols == cm->prev_frame->mi_cols)) { + cm->last_frame_seg_map = cm->prev_frame->seg_map; + } else { + cm->last_frame_seg_map = NULL; + } + } + } + + // Update progress in frame parallel decode. + cm->error.setjmp = 0; + + return 0; +} + +// Get the frame at a particular index in the output queue +int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd, + aom_film_grain_t **grain_params) { + if (index >= pbi->num_output_frames) return -1; + *sd = &pbi->output_frames[index]->buf; + *grain_params = &pbi->output_frames[index]->film_grain_params; + aom_clear_system_state(); + return 0; +} + +// Get the highest-spatial-layer output +// TODO(david.barker): What should this do? +int av1_get_frame_to_show(AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame) { + if (pbi->num_output_frames == 0) return -1; + + *frame = pbi->output_frames[pbi->num_output_frames - 1]->buf; + return 0; +} diff --git a/libs/libaom/src/av1/decoder/decoder.h b/libs/libaom/src/av1/decoder/decoder.h new file mode 100644 index 000000000..4580de2ac --- /dev/null +++ b/libs/libaom/src/av1/decoder/decoder.h @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_DECODER_DECODER_H_ +#define AOM_AV1_DECODER_DECODER_H_ + +#include "config/aom_config.h" + +#include "aom/aom_codec.h" +#include "aom_dsp/bitreader.h" +#include "aom_scale/yv12config.h" +#include "aom_util/aom_thread.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/thread_common.h" +#include "av1/decoder/dthread.h" +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#endif +#if CONFIG_INSPECTION +#include "av1/decoder/inspection.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*decode_block_visitor_fn_t)(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, + aom_reader *const r, const int plane, + const int row, const int col, + const TX_SIZE tx_size); + +typedef void (*predict_inter_block_visitor_fn_t)(AV1_COMMON *const cm, + MACROBLOCKD *const xd, + BLOCK_SIZE bsize); + +typedef void (*cfl_store_inter_block_visitor_fn_t)(AV1_COMMON *const cm, + MACROBLOCKD *const xd); + +typedef struct ThreadData { + DECLARE_ALIGNED(32, MACROBLOCKD, xd); + CB_BUFFER cb_buffer_base; + aom_reader *bit_reader; + uint8_t *mc_buf[2]; + int32_t mc_buf_size; + int mc_buf_use_highbd; // Boolean: whether the byte pointers stored in + // mc_buf were converted from highbd pointers. + + CONV_BUF_TYPE *tmp_conv_dst; + uint8_t *tmp_obmc_bufs[2]; + + decode_block_visitor_fn_t read_coeffs_tx_intra_block_visit; + decode_block_visitor_fn_t predict_and_recon_intra_block_visit; + decode_block_visitor_fn_t read_coeffs_tx_inter_block_visit; + decode_block_visitor_fn_t inverse_tx_inter_block_visit; + predict_inter_block_visitor_fn_t predict_inter_block_visit; + cfl_store_inter_block_visitor_fn_t cfl_store_inter_block_visit; +} ThreadData; + +typedef struct AV1DecRowMTJobInfo { + int tile_row; + int tile_col; + int mi_row; +} AV1DecRowMTJobInfo; + +typedef struct AV1DecRowMTSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_; + pthread_cond_t *cond_; +#endif + int allocated_sb_rows; + int *cur_sb_col; + int sync_range; + int mi_rows; + int mi_cols; + int mi_rows_parse_done; + int mi_rows_decode_started; + int num_threads_working; +} AV1DecRowMTSync; + +typedef struct AV1DecRowMTInfo { + int tile_rows_start; + int tile_rows_end; + int tile_cols_start; + int tile_cols_end; + int start_tile; + int end_tile; + int mi_rows_to_decode; + + // Invariant: + // mi_rows_parse_done >= mi_rows_decode_started. + // mi_rows_parse_done and mi_rows_decode_started are both initialized to 0. + // mi_rows_parse_done is incremented freely. mi_rows_decode_started may only + // be incremented to catch up with mi_rows_parse_done but is not allowed to + // surpass mi_rows_parse_done. + // + // When mi_rows_decode_started reaches mi_rows_to_decode, there are no more + // decode jobs. + + // Indicates the progress of the bit-stream parsing of superblocks. + // Initialized to 0. Incremented by sb_mi_size when parse sb row is done. + int mi_rows_parse_done; + // Indicates the progress of the decoding of superblocks. + // Initialized to 0. Incremented by sb_mi_size when decode sb row is started. + int mi_rows_decode_started; + // Boolean: Initialized to 0 (false). Set to 1 (true) on error to abort + // decoding. 
+ int row_mt_exit; +} AV1DecRowMTInfo; + +typedef struct TileDataDec { + TileInfo tile_info; + aom_reader bit_reader; + DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); + AV1DecRowMTSync dec_row_mt_sync; +} TileDataDec; + +typedef struct TileBufferDec { + const uint8_t *data; + size_t size; +} TileBufferDec; + +typedef struct DataBuffer { + const uint8_t *data; + size_t size; +} DataBuffer; + +typedef struct EXTERNAL_REFERENCES { + YV12_BUFFER_CONFIG refs[MAX_EXTERNAL_REFERENCES]; + int num; +} EXTERNAL_REFERENCES; + +typedef struct TileJobsDec { + TileBufferDec *tile_buffer; + TileDataDec *tile_data; +} TileJobsDec; + +typedef struct AV1DecTileMTData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex; +#endif + TileJobsDec *job_queue; + int jobs_enqueued; + int jobs_dequeued; + int alloc_tile_rows; + int alloc_tile_cols; +} AV1DecTileMT; + +typedef struct AV1Decoder { + DECLARE_ALIGNED(32, MACROBLOCKD, mb); + + DECLARE_ALIGNED(32, AV1_COMMON, common); + + AVxWorker lf_worker; + AV1LfSync lf_row_sync; + AV1LrSync lr_row_sync; + AV1LrStruct lr_ctxt; + AVxWorker *tile_workers; + int num_workers; + DecWorkerData *thread_data; + ThreadData td; + TileDataDec *tile_data; + int allocated_tiles; + + TileBufferDec tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; + AV1DecTileMT tile_mt_info; + + // Each time the decoder is called, we expect to receive a full temporal unit. + // This can contain up to one shown frame per spatial layer in the current + // operating point (note that some layers may be entirely omitted). + // If the 'output_all_layers' option is true, we save all of these shown + // frames so that they can be returned to the application. If the + // 'output_all_layers' option is false, then we only output one image per + // temporal unit. + // + // Note: The saved buffers are released at the start of the next time the + // application calls aom_codec_decode(). + int output_all_layers; + RefCntBuffer *output_frames[MAX_NUM_SPATIAL_LAYERS]; + size_t num_output_frames; // How many frames are queued up so far? + + // In order to properly support random-access decoding, we need + // to behave slightly differently for the very first frame we decode. + // So we track whether this is the first frame or not. + int decoding_first_frame; + + int allow_lowbitdepth; + int max_threads; + int inv_tile_order; + int need_resync; // wait for key/intra-only frame. + int reset_decoder_state; + + int tile_size_bytes; + int tile_col_size_bytes; + int dec_tile_row, dec_tile_col; // always -1 for non-VR tile encoding +#if CONFIG_ACCOUNTING + int acct_enabled; + Accounting accounting; +#endif + int sequence_header_ready; + int sequence_header_changed; +#if CONFIG_INSPECTION + aom_inspect_cb inspect_cb; + void *inspect_ctx; +#endif + int operating_point; + int current_operating_point; + int seen_frame_header; + // The expected start_tile (tg_start syntax element) of the next tile group. + int next_start_tile; + + // State if the camera frame header is already decoded while + // large_scale_tile = 1. 
+ int camera_frame_header_ready; + size_t frame_header_size; + DataBuffer obu_size_hdr; + int output_frame_width_in_tiles_minus_1; + int output_frame_height_in_tiles_minus_1; + int tile_count_minus_1; + uint32_t coded_tile_data_size; + unsigned int ext_tile_debug; // for ext-tile software debug & testing + unsigned int row_mt; + EXTERNAL_REFERENCES ext_refs; + YV12_BUFFER_CONFIG tile_list_outbuf; + + CB_BUFFER *cb_buffer_base; + int cb_buffer_alloc_size; + + int allocated_row_mt_sync_rows; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *row_mt_mutex_; + pthread_cond_t *row_mt_cond_; +#endif + + AV1DecRowMTInfo frame_row_mt_info; + aom_metadata_array_t *metadata; + + int context_update_tile_id; + int skip_loop_filter; + int skip_film_grain; + int is_annexb; + int valid_for_referencing[REF_FRAMES]; +} AV1Decoder; + +// Returns 0 on success. Sets pbi->common.error.error_code to a nonzero error +// code and returns a nonzero value on failure. +int av1_receive_compressed_data(struct AV1Decoder *pbi, size_t size, + const uint8_t **psource); + +// Get the frame at a particular index in the output queue +int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd, + aom_film_grain_t **grain_params); + +int av1_get_frame_to_show(struct AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame); + +aom_codec_err_t av1_copy_reference_dec(struct AV1Decoder *pbi, int idx, + YV12_BUFFER_CONFIG *sd); + +aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, + int use_external_ref, + YV12_BUFFER_CONFIG *sd); +aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd); + +struct AV1Decoder *av1_decoder_create(BufferPool *const pool); + +void av1_decoder_remove(struct AV1Decoder *pbi); +void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info); + +void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync); + +void av1_dec_free_cb_buf(AV1Decoder *pbi); + +static INLINE void decrease_ref_count(RefCntBuffer *const buf, + BufferPool *const pool) { + if (buf != NULL) { + --buf->ref_count; + // Reference counts should never become negative. If this assertion fails, + // there is a bug in our reference count management. + assert(buf->ref_count >= 0); + // A worker may only get a free framebuffer index when calling get_free_fb. + // But the raw frame buffer is not set up until we finish decoding header. + // So if any error happens during decoding header, frame_bufs[idx] will not + // have a valid raw frame buffer. 
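  // Informative sketch of the ownership convention implied by the callers
  // (an editorial reading, not part of the original change): every pointer
  // stored in cm->ref_frame_map[], pbi->output_frames[] or cm->cur_frame
  // owns exactly one count, taken and dropped in matched pairs, e.g.
  //   cm->ref_frame_map[i] = cm->cur_frame;
  //   ++cm->cur_frame->ref_count;                      // take
  //   ...
  //   decrease_ref_count(cm->ref_frame_map[i], pool);  // drop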
+ if (buf->ref_count == 0 && buf->raw_frame_buffer.data) { + pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer); + buf->raw_frame_buffer.data = NULL; + buf->raw_frame_buffer.size = 0; + buf->raw_frame_buffer.priv = NULL; + } + } +} + +#define ACCT_STR __func__ +static INLINE int av1_read_uniform(aom_reader *r, int n) { + const int l = get_unsigned_bits(n); + const int m = (1 << l) - n; + const int v = aom_read_literal(r, l - 1, ACCT_STR); + assert(l != 0); + if (v < m) + return v; + else + return (v << 1) - m + aom_read_literal(r, 1, ACCT_STR); +} + +typedef void (*palette_visitor_fn_t)(MACROBLOCKD *const xd, int plane, + aom_reader *r); + +void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, + aom_reader *r, palette_visitor_fn_t visit); + +typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td, + int mi_row, int mi_col, aom_reader *r, + PARTITION_TYPE partition, BLOCK_SIZE bsize); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_DECODER_DECODER_H_ diff --git a/libs/libaom/src/av1/decoder/decodetxb.c b/libs/libaom/src/av1/decoder/decodetxb.c new file mode 100644 index 000000000..541f4c984 --- /dev/null +++ b/libs/libaom/src/av1/decoder/decodetxb.c @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/decoder/decodetxb.h" + +#include "aom_ports/mem.h" +#include "av1/common/idct.h" +#include "av1/common/scan.h" +#include "av1/common/txb_common.h" +#include "av1/decoder/decodemv.h" + +#define ACCT_STR __func__ + +static int read_golomb(MACROBLOCKD *xd, aom_reader *r) { + int x = 1; + int length = 0; + int i = 0; + + while (!i) { + i = aom_read_bit(r, ACCT_STR); + ++length; + if (length > 20) { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid length in read_golomb"); + break; + } + } + + for (i = 0; i < length - 1; ++i) { + x <<= 1; + x += aom_read_bit(r, ACCT_STR); + } + + return x - 1; +} + +static INLINE int rec_eob_pos(const int eob_token, const int extra) { + int eob = av1_eob_group_start[eob_token]; + if (eob > 2) { + eob += extra; + } + return eob; +} + +static INLINE int get_dqv(const int16_t *dequant, int coeff_idx, + const qm_val_t *iqmatrix) { + int dqv = dequant[!!coeff_idx]; + if (iqmatrix != NULL) + dqv = + ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + return dqv; +} + +static INLINE void read_coeffs_reverse_2d(aom_reader *r, TX_SIZE tx_size, + int start_si, int end_si, + const int16_t *scan, int bwl, + uint8_t *levels, + base_cdf_arr base_cdf, + br_cdf_arr br_cdf) { + for (int c = end_si; c >= start_si; --c) { + const int pos = scan[c]; + const int coeff_ctx = get_lower_levels_ctx_2d(levels, pos, bwl, tx_size); + const int nsymbs = 4; + int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR); + if (level > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx_2d(levels, pos, bwl); + aom_cdf_prob *cdf = br_cdf[br_ctx]; + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR); + level += k; + if (k < BR_CDF_SIZE - 1) break; + } + } + levels[get_padded_idx(pos, bwl)] = level; + } +} + +static INLINE void read_coeffs_reverse(aom_reader *r, TX_SIZE tx_size, + TX_CLASS tx_class, int start_si, + int end_si, const int16_t *scan, int bwl, + uint8_t *levels, base_cdf_arr base_cdf, + br_cdf_arr br_cdf) { + for (int c = end_si; c >= start_si; --c) { + const int pos = scan[c]; + const int coeff_ctx = + get_lower_levels_ctx(levels, pos, bwl, tx_size, tx_class); + const int nsymbs = 4; + int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR); + if (level > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class); + aom_cdf_prob *cdf = br_cdf[br_ctx]; + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR); + level += k; + if (k < BR_CDF_SIZE - 1) break; + } + } + levels[get_padded_idx(pos, bwl)] = level; + } +} + +uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *const r, const int blk_row, + const int blk_col, const int plane, + const TXB_CTX *const txb_ctx, + const TX_SIZE tx_size) { + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; + const int32_t max_value = (1 << (7 + xd->bd)) - 1; + const int32_t min_value = -(1 << (7 + xd->bd)); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const PLANE_TYPE plane_type = get_plane_type(plane); + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int16_t *const dequant = pd->seg_dequant_QTX[mbmi->segment_id]; + tran_low_t *const tcoeffs = pd->dqcoeff_block + xd->cb_offset[plane]; + const int shift = av1_get_tx_scale(tx_size); + const int bwl = get_txb_bwl(tx_size); + const 
int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + int cul_level = 0; + int dc_val = 0; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + const int all_zero = aom_read_symbol( + r, ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2, ACCT_STR); + eob_info *eob_data = pd->eob_data + xd->txb_offset[plane]; + uint16_t *const eob = &(eob_data->eob); + uint16_t *const max_scan_line = &(eob_data->max_scan_line); + *max_scan_line = 0; + *eob = 0; + +#if CONFIG_INSPECTION + if (plane == 0) { + const int txk_type_idx = + av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col); + mbmi->tx_skip[txk_type_idx] = all_zero; + } +#endif + + if (all_zero) { + *max_scan_line = 0; + if (plane == 0) { + xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] = DCT_DCT; + } + return 0; + } + + if (plane == AOM_PLANE_Y) { + // only y plane's tx_type is transmitted + av1_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r); + } + const TX_TYPE tx_type = + av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const qm_val_t *iqmatrix = + av1_get_iqmatrix(&cm->quant_params, xd, plane, tx_size, tx_type); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; + int eob_extra = 0; + int eob_pt = 1; + + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1; + switch (eob_multi_size) { + case 0: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], + 5, ACCT_STR) + + 1; + break; + case 1: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], + 6, ACCT_STR) + + 1; + break; + case 2: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], + 7, ACCT_STR) + + 1; + break; + case 3: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], + 8, ACCT_STR) + + 1; + break; + case 4: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], + 9, ACCT_STR) + + 1; + break; + case 5: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], + 10, ACCT_STR) + + 1; + break; + case 6: + default: + eob_pt = aom_read_symbol( + r, ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11, + ACCT_STR) + + 1; + break; + } + + const int eob_offset_bits = av1_eob_offset_bits[eob_pt]; + if (eob_offset_bits > 0) { + const int eob_ctx = eob_pt - 3; + int bit = aom_read_symbol( + r, ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2, ACCT_STR); + if (bit) { + eob_extra += (1 << (eob_offset_bits - 1)); + } + + for (int i = 1; i < eob_offset_bits; i++) { + bit = aom_read_bit(r, ACCT_STR); + if (bit) { + eob_extra += (1 << (eob_offset_bits - 1 - i)); + } + } + } + *eob = rec_eob_pos(eob_pt, eob_extra); + + if (*eob > 1) { + memset(levels_buf, 0, + sizeof(*levels_buf) * + ((width + TX_PAD_HOR) * (height + TX_PAD_VER) + TX_PAD_END)); + } + + { + // Read the non-zero coefficient with scan index eob-1 + // TODO(angiebird): Put this into a function + const int c = *eob - 1; + const int pos = scan[c]; + const int coeff_ctx = get_lower_levels_ctx_eob(bwl, height, c); + const int nsymbs = 3; + aom_cdf_prob *cdf = + ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx]; + int level = aom_read_symbol(r, cdf, nsymbs, ACCT_STR) + 1; + if (level > NUM_BASE_LEVELS) { + const int br_ctx = 
get_br_ctx_eob(pos, bwl, tx_class); + cdf = ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx]; + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR); + level += k; + if (k < BR_CDF_SIZE - 1) break; + } + } + levels[get_padded_idx(pos, bwl)] = level; + } + if (*eob > 1) { + base_cdf_arr base_cdf = ec_ctx->coeff_base_cdf[txs_ctx][plane_type]; + br_cdf_arr br_cdf = + ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type]; + if (tx_class == TX_CLASS_2D) { + read_coeffs_reverse_2d(r, tx_size, 1, *eob - 1 - 1, scan, bwl, levels, + base_cdf, br_cdf); + read_coeffs_reverse(r, tx_size, tx_class, 0, 0, scan, bwl, levels, + base_cdf, br_cdf); + } else { + read_coeffs_reverse(r, tx_size, tx_class, 0, *eob - 1 - 1, scan, bwl, + levels, base_cdf, br_cdf); + } + } + + for (int c = 0; c < *eob; ++c) { + const int pos = scan[c]; + uint8_t sign; + tran_low_t level = levels[get_padded_idx(pos, bwl)]; + if (level) { + *max_scan_line = AOMMAX(*max_scan_line, pos); + if (c == 0) { + const int dc_sign_ctx = txb_ctx->dc_sign_ctx; + sign = aom_read_symbol(r, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], + 2, ACCT_STR); + } else { + sign = aom_read_bit(r, ACCT_STR); + } + if (level >= MAX_BASE_BR_RANGE) { + level += read_golomb(xd, r); + } + + if (c == 0) dc_val = sign ? -level : level; + + // Bitmasking to clamp level to valid range: + // The valid range for 8/10/12 bit video is at most 14/16/18 bit + level &= 0xfffff; + cul_level += level; + tran_low_t dq_coeff; + // Bitmasking to clamp dq_coeff to valid range: + // The valid range for 8/10/12 bit video is at most 17/19/21 bit + dq_coeff = (tran_low_t)( + (int64_t)level * get_dqv(dequant, scan[c], iqmatrix) & 0xffffff); + dq_coeff = dq_coeff >> shift; + if (sign) { + dq_coeff = -dq_coeff; + } + tcoeffs[pos] = clamp(dq_coeff, min_value, max_value); + } + } + + cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level); + + // DC value + set_dc_sign(&cul_level, dc_val); + + return cul_level; +} + +void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, aom_reader *const r, + const int plane, const int row, const int col, + const TX_SIZE tx_size) { +#if TXCOEFF_TIMER + struct aom_usec_timer timer; + aom_usec_timer_start(&timer); +#endif + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + + const BLOCK_SIZE bsize = mbmi->sb_type; + assert(bsize < BLOCK_SIZES_ALL); + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, pd->above_entropy_context + col, + pd->left_entropy_context + row, &txb_ctx); + const uint8_t cul_level = + av1_read_coeffs_txb(cm, xd, r, row, col, plane, &txb_ctx, tx_size); + av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col, + row); + + if (is_inter_block(mbmi)) { + const PLANE_TYPE plane_type = get_plane_type(plane); + // tx_type was already read inside av1_read_coeffs_txb above; + // av1_get_tx_type() just looks up the stored value here. + const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size, + cm->features.reduced_tx_set_used); + + if (plane == 0) { + const int txw = tx_size_wide_unit[tx_size]; + const int txh = tx_size_high_unit[tx_size]; + // The 16x16 unit is due to the constraint from tx_64x64 which sets the + // maximum tx size for chroma as 32x32. Coupled with 4x1 transform block + // size, the constraint takes effect in 32x16 / 16x32 size too.
To solve + // the intricacy, cover all the 16x16 units inside a 64 level transform. + if (txw == tx_size_wide_unit[TX_64X64] || + txh == tx_size_high_unit[TX_64X64]) { + const int tx_unit = tx_size_wide_unit[TX_16X16]; + const int stride = xd->tx_type_map_stride; + for (int idy = 0; idy < txh; idy += tx_unit) { + for (int idx = 0; idx < txw; idx += tx_unit) { + xd->tx_type_map[(row + idy) * stride + col + idx] = tx_type; + } + } + } + } + } + +#if TXCOEFF_TIMER + aom_usec_timer_mark(&timer); + const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); + cm->txcoeff_timer += elapsed_time; + ++cm->txb_count; +#endif +} diff --git a/libs/libaom/src/av1/decoder/decodetxb.h b/libs/libaom/src/av1/decoder/decodetxb.h new file mode 100644 index 000000000..39bf0bf78 --- /dev/null +++ b/libs/libaom/src/av1/decoder/decodetxb.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_DECODETXB_H_ +#define AOM_AV1_DECODER_DECODETXB_H_ + +#include "config/aom_config.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/txb_common.h" +#include "aom_dsp/bitreader.h" + +uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *const r, const int blk_row, + const int blk_col, const int plane, + const TXB_CTX *const txb_ctx, + const TX_SIZE tx_size); + +void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, aom_reader *const r, + const int plane, const int row, const int col, + const TX_SIZE tx_size); +#endif // AOM_AV1_DECODER_DECODETXB_H_ diff --git a/libs/libaom/src/av1/decoder/detokenize.c b/libs/libaom/src/av1/decoder/detokenize.c new file mode 100644 index 000000000..9d54bd13d --- /dev/null +++ b/libs/libaom/src/av1/decoder/detokenize.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" + +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "av1/common/blockd.h" +#include "av1/decoder/detokenize.h" + +#define ACCT_STR __func__ + +#include "av1/common/common.h" +#include "av1/common/entropy.h" +#include "av1/common/idct.h" + +static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) { + uint8_t color_order[PALETTE_MAX_SIZE]; + const int n = param->n_colors; + uint8_t *const color_map = param->color_map; + MapCdf color_map_cdf = param->map_cdf; + int plane_block_width = param->plane_width; + int plane_block_height = param->plane_height; + int rows = param->rows; + int cols = param->cols; + + // The first color index. 
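  // Informative sketch of the wavefront order used a few lines below:
  // entries are visited along anti-diagonals i = row + col, so the above and
  // left neighbours consumed by av1_get_palette_color_index_context() are
  // always decoded before the entry that needs them. Visit order for a 3x3
  // map, counting the first color index read next as 0:
  //   0 1 3
  //   2 4 6
  //   5 7 8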
+ color_map[0] = av1_read_uniform(r, n); + assert(color_map[0] < n); + + // Run wavefront on the palette map index decoding. + for (int i = 1; i < rows + cols - 1; ++i) { + for (int j = AOMMIN(i, cols - 1); j >= AOMMAX(0, i - rows + 1); --j) { + const int color_ctx = av1_get_palette_color_index_context( + color_map, plane_block_width, (i - j), j, n, color_order, NULL); + const int color_idx = aom_read_symbol( + r, color_map_cdf[n - PALETTE_MIN_SIZE][color_ctx], n, ACCT_STR); + assert(color_idx >= 0 && color_idx < n); + color_map[(i - j) * plane_block_width + j] = color_order[color_idx]; + } + } + // Copy last column to extra columns. + if (cols < plane_block_width) { + for (int i = 0; i < rows; ++i) { + memset(color_map + i * plane_block_width + cols, + color_map[i * plane_block_width + cols - 1], + (plane_block_width - cols)); + } + } + // Copy last row to extra rows. + for (int i = rows; i < plane_block_height; ++i) { + memcpy(color_map + i * plane_block_width, + color_map + (rows - 1) * plane_block_width, plane_block_width); + } +} + +void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, + aom_reader *r) { + assert(plane == 0 || plane == 1); + Av1ColorMapParam params; + params.color_map = + xd->plane[plane].color_index_map + xd->color_index_map_offset[plane]; + params.map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf + : xd->tile_ctx->palette_y_color_index_cdf; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + params.n_colors = mbmi->palette_mode_info.palette_size[plane]; + av1_get_block_dimensions(mbmi->sb_type, plane, xd, ¶ms.plane_width, + ¶ms.plane_height, ¶ms.rows, ¶ms.cols); + decode_color_map_tokens(¶ms, r); +} diff --git a/libs/libaom/src/av1/decoder/detokenize.h b/libs/libaom/src/av1/decoder/detokenize.h new file mode 100644 index 000000000..173b437a9 --- /dev/null +++ b/libs/libaom/src/av1/decoder/detokenize.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_DETOKENIZE_H_ +#define AOM_AV1_DECODER_DETOKENIZE_H_ + +#include "config/aom_config.h" + +#include "av1/common/scan.h" +#include "av1/decoder/decoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AV1_DECODER_DETOKENIZE_H_ diff --git a/libs/libaom/src/av1/decoder/dthread.h b/libs/libaom/src/av1/decoder/dthread.h new file mode 100644 index 000000000..f82b9d8cc --- /dev/null +++ b/libs/libaom/src/av1/decoder/dthread.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_DTHREAD_H_ +#define AOM_AV1_DECODER_DTHREAD_H_ + +#include "config/aom_config.h" + +#include "aom_util/aom_thread.h" +#include "aom/internal/aom_codec_internal.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Common; +struct AV1Decoder; +struct ThreadData; + +typedef struct DecWorkerData { + struct ThreadData *td; + const uint8_t *data_end; + struct aom_internal_error_info error_info; +} DecWorkerData; + +// WorkerData for the FrameWorker thread. It contains all the information of +// the worker and decode structures for decoding a frame. +typedef struct FrameWorkerData { + struct AV1Decoder *pbi; + const uint8_t *data; + const uint8_t *data_end; + size_t data_size; + void *user_priv; + int received_frame; + int frame_context_ready; // Current frame's context is ready to read. + int frame_decoded; // Finished decoding current frame. +} FrameWorkerData; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_DECODER_DTHREAD_H_ diff --git a/libs/libaom/src/av1/decoder/inspection.c b/libs/libaom/src/av1/decoder/inspection.c new file mode 100644 index 000000000..d121a7034 --- /dev/null +++ b/libs/libaom/src/av1/decoder/inspection.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "av1/decoder/decoder.h" +#include "av1/decoder/inspection.h" +#include "av1/common/enums.h" +#include "av1/common/cdef.h" + +static void ifd_init_mi_rc(insp_frame_data *fd, int mi_cols, int mi_rows) { + fd->mi_cols = mi_cols; + fd->mi_rows = mi_rows; + fd->mi_grid = (insp_mi_data *)aom_malloc(sizeof(insp_mi_data) * fd->mi_rows * + fd->mi_cols); +} + +void ifd_init(insp_frame_data *fd, int frame_width, int frame_height) { + int mi_cols = ALIGN_POWER_OF_TWO(frame_width, 3) >> MI_SIZE_LOG2; + int mi_rows = ALIGN_POWER_OF_TWO(frame_height, 3) >> MI_SIZE_LOG2; + ifd_init_mi_rc(fd, mi_cols, mi_rows); +} + +void ifd_clear(insp_frame_data *fd) { + aom_free(fd->mi_grid); + fd->mi_grid = NULL; +} + +/* TODO(negge) This function may be called by more than one thread when using + a multi-threaded decoder and this may cause a data race. 
*/ +int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform) { + struct AV1Decoder *pbi = (struct AV1Decoder *)decoder; + AV1_COMMON *const cm = &pbi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const CommonQuantParams *quant_params = &cm->quant_params; + + if (fd->mi_rows != mi_params->mi_rows || fd->mi_cols != mi_params->mi_cols) { + ifd_clear(fd); + ifd_init_mi_rc(fd, mi_params->mi_rows, mi_params->mi_cols); + } + fd->show_existing_frame = cm->show_existing_frame; + fd->frame_number = cm->current_frame.frame_number; + fd->show_frame = cm->show_frame; + fd->frame_type = cm->current_frame.frame_type; + fd->base_qindex = quant_params->base_qindex; + // Set width and height of the first tile until generic support can be added + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, 0); + av1_tile_set_col(&tile_info, cm, 0); + fd->tile_mi_cols = tile_info.mi_col_end - tile_info.mi_col_start; + fd->tile_mi_rows = tile_info.mi_row_end - tile_info.mi_row_start; + fd->delta_q_present_flag = cm->delta_q_info.delta_q_present_flag; + fd->delta_q_res = cm->delta_q_info.delta_q_res; +#if CONFIG_ACCOUNTING + fd->accounting = &pbi->accounting; +#endif + // TODO(negge): copy per frame CDEF data + int i, j; + for (i = 0; i < MAX_SEGMENTS; i++) { + for (j = 0; j < 2; j++) { + fd->y_dequant[i][j] = quant_params->y_dequant_QTX[i][j]; + fd->u_dequant[i][j] = quant_params->u_dequant_QTX[i][j]; + fd->v_dequant[i][j] = quant_params->v_dequant_QTX[i][j]; + } + } + for (j = 0; j < mi_params->mi_rows; j++) { + for (i = 0; i < mi_params->mi_cols; i++) { + const MB_MODE_INFO *mbmi = + mi_params->mi_grid_base[j * mi_params->mi_stride + i]; + insp_mi_data *mi = &fd->mi_grid[j * mi_params->mi_cols + i]; + // Segment + mi->segment_id = mbmi->segment_id; + // Motion Vectors + mi->mv[0].row = mbmi->mv[0].as_mv.row; + mi->mv[0].col = mbmi->mv[0].as_mv.col; + mi->mv[1].row = mbmi->mv[1].as_mv.row; + mi->mv[1].col = mbmi->mv[1].as_mv.col; + // Reference Frames + mi->ref_frame[0] = mbmi->ref_frame[0]; + mi->ref_frame[1] = mbmi->ref_frame[1]; + // Prediction Mode + mi->mode = mbmi->mode; + mi->intrabc = (int16_t)mbmi->use_intrabc; + mi->palette = (int16_t)mbmi->palette_mode_info.palette_size[0]; + mi->uv_palette = (int16_t)mbmi->palette_mode_info.palette_size[1]; + // Prediction Mode for Chromatic planes + if (mi->mode < INTRA_MODES) { + mi->uv_mode = mbmi->uv_mode; + } else { + mi->uv_mode = UV_MODE_INVALID; + } + + mi->motion_mode = mbmi->motion_mode; + mi->compound_type = mbmi->interinter_comp.type; + + // Block Size + mi->sb_type = mbmi->sb_type; + // Skip Flag + mi->skip = mbmi->skip; + mi->filter[0] = av1_extract_interp_filter(mbmi->interp_filters, 0); + mi->filter[1] = av1_extract_interp_filter(mbmi->interp_filters, 1); + mi->dual_filter_type = mi->filter[0] * 3 + mi->filter[1]; + + // Transform + // TODO(anyone): extract tx type info from mbmi->txk_type[]. 
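      // Informative note: c and r below are this 4x4 mi unit's offsets
      // within its coding block, in mi units, so the tx_size / tx_type
      // lookups that follow resolve per transform unit rather than once
      // per block.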
+ + const BLOCK_SIZE bsize = mbmi->sb_type; + const int c = i % mi_size_wide[bsize]; + const int r = j % mi_size_high[bsize]; + if (is_inter_block(mbmi) || is_intrabc_block(mbmi)) + mi->tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(bsize, r, c)]; + else + mi->tx_size = mbmi->tx_size; + + if (skip_not_transform && mi->skip) mi->tx_size = -1; + + if (mi->skip) { + const int tx_type_row = j - j % tx_size_high_unit[mi->tx_size]; + const int tx_type_col = i - i % tx_size_wide_unit[mi->tx_size]; + const int tx_type_map_idx = + tx_type_row * mi_params->mi_stride + tx_type_col; + mi->tx_type = mi_params->tx_type_map[tx_type_map_idx]; + } else { + mi->tx_type = 0; + } + + if (skip_not_transform && + (mi->skip || mbmi->tx_skip[av1_get_txk_type_index(bsize, r, c)])) + mi->tx_type = -1; + + mi->cdef_level = cm->cdef_info.cdef_strengths[mbmi->cdef_strength] / + CDEF_SEC_STRENGTHS; + mi->cdef_strength = cm->cdef_info.cdef_strengths[mbmi->cdef_strength] % + CDEF_SEC_STRENGTHS; + + mi->cdef_strength += mi->cdef_strength == 3; + if (mbmi->uv_mode == UV_CFL_PRED) { + mi->cfl_alpha_idx = mbmi->cfl_alpha_idx; + mi->cfl_alpha_sign = mbmi->cfl_alpha_signs; + } else { + mi->cfl_alpha_idx = 0; + mi->cfl_alpha_sign = 0; + } + // delta_q + mi->current_qindex = mbmi->current_qindex; + } + } + return 1; +} diff --git a/libs/libaom/src/av1/decoder/inspection.h b/libs/libaom/src/av1/decoder/inspection.h new file mode 100644 index 000000000..b963f6ac6 --- /dev/null +++ b/libs/libaom/src/av1/decoder/inspection.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AV1_DECODER_INSPECTION_H_ +#define AOM_AV1_DECODER_INSPECTION_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#include "av1/common/seg_common.h" +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#endif + +#ifndef AOM_AOM_AOMDX_H_ +typedef void (*aom_inspect_cb)(void *decoder, void *data); +#endif + +typedef struct insp_mv insp_mv; + +struct insp_mv { + int16_t row; + int16_t col; +}; + +typedef struct insp_mi_data insp_mi_data; + +struct insp_mi_data { + insp_mv mv[2]; + int16_t ref_frame[2]; + int16_t mode; + int16_t uv_mode; + int16_t sb_type; + int16_t skip; + int16_t segment_id; + int16_t dual_filter_type; + int16_t filter[2]; + int16_t tx_type; + int16_t tx_size; + int16_t cdef_level; + int16_t cdef_strength; + int16_t cfl_alpha_idx; + int16_t cfl_alpha_sign; + int16_t current_qindex; + int16_t compound_type; + int16_t motion_mode; + int16_t intrabc; + int16_t palette; + int16_t uv_palette; +}; + +typedef struct insp_frame_data insp_frame_data; + +struct insp_frame_data { +#if CONFIG_ACCOUNTING + Accounting *accounting; +#endif + insp_mi_data *mi_grid; + int16_t frame_number; + int show_frame; + int frame_type; + int base_qindex; + int mi_rows; + int mi_cols; + int tile_mi_rows; + int tile_mi_cols; + int16_t y_dequant[MAX_SEGMENTS][2]; + int16_t u_dequant[MAX_SEGMENTS][2]; + int16_t v_dequant[MAX_SEGMENTS][2]; + // TODO(negge): add per frame CDEF data + int delta_q_present_flag; + int delta_q_res; + int show_existing_frame; +}; + +void ifd_init(insp_frame_data *fd, int frame_width, int frame_height); +void ifd_clear(insp_frame_data *fd); +int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // AOM_AV1_DECODER_INSPECTION_H_ diff --git a/libs/libaom/src/av1/decoder/obu.c b/libs/libaom/src/av1/decoder/obu.c new file mode 100644 index 000000000..791e5965b --- /dev/null +++ b/libs/libaom/src/av1/decoder/obu.c @@ -0,0 +1,1085 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> + +#include "config/aom_config.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_codec.h" +#include "aom_dsp/bitreader_buffer.h" +#include "aom_ports/mem_ops.h" + +#include "av1/common/common.h" +#include "av1/common/obu_util.h" +#include "av1/common/timing.h" +#include "av1/decoder/decoder.h" +#include "av1/decoder/decodeframe.h" +#include "av1/decoder/obu.h" + +aom_codec_err_t aom_get_num_layers_from_operating_point_idc( + int operating_point_idc, unsigned int *number_spatial_layers, + unsigned int *number_temporal_layers) { + // derive number of spatial/temporal layers from operating_point_idc + + if (!number_spatial_layers || !number_temporal_layers) + return AOM_CODEC_INVALID_PARAM; + + if (operating_point_idc == 0) { + *number_temporal_layers = 1; + *number_spatial_layers = 1; + } else { + *number_spatial_layers = 0; + *number_temporal_layers = 0; + for (int j = 0; j < MAX_NUM_SPATIAL_LAYERS; j++) { + *number_spatial_layers += + (operating_point_idc >> (j + MAX_NUM_TEMPORAL_LAYERS)) & 0x1; + } + for (int j = 0; j < MAX_NUM_TEMPORAL_LAYERS; j++) { + *number_temporal_layers += (operating_point_idc >> j) & 0x1; + } + } + + return AOM_CODEC_OK; +} + +static int is_obu_in_current_operating_point(AV1Decoder *pbi, + ObuHeader obu_header) { + if (!pbi->current_operating_point) { + return 1; + } + + if ((pbi->current_operating_point >> obu_header.temporal_layer_id) & 0x1 && + (pbi->current_operating_point >> (obu_header.spatial_layer_id + 8)) & + 0x1) { + return 1; + } + return 0; +} + +static int byte_alignment(AV1_COMMON *const cm, + struct aom_read_bit_buffer *const rb) { + while (rb->bit_offset & 7) { + if (aom_rb_read_bit(rb)) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + } + return 0; +} + +static uint32_t read_temporal_delimiter_obu() { return 0; } + +// Returns a boolean that indicates success. +static int read_bitstream_level(AV1_LEVEL *seq_level_idx, + struct aom_read_bit_buffer *rb) { + *seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS); + if (!is_valid_seq_level_idx(*seq_level_idx)) return 0; + return 1; +} + +// Returns whether two sequence headers are consistent with each other. +// Note that the 'op_params' field is not compared per Section 7.5 in the spec: +// Within a particular coded video sequence, the contents of +// sequence_header_obu must be bit-identical each time the sequence header +// appears except for the contents of operating_parameters_info. +static int are_seq_headers_consistent(const SequenceHeader *seq_params_old, + const SequenceHeader *seq_params_new) { + return !memcmp(seq_params_old, seq_params_new, + offsetof(SequenceHeader, op_params)); +} + +// On success, sets pbi->sequence_header_ready to 1 and returns the number of +// bytes read from 'rb'. +// On failure, sets pbi->common.error.error_code and returns 0. +static uint32_t read_sequence_header_obu(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb) { + AV1_COMMON *const cm = &pbi->common; + const uint32_t saved_bit_offset = rb->bit_offset; + + // Verify rb has been configured to report errors. + assert(rb->error_handler); + + // Use a local variable to store the information as we decode. At the end, + // if no errors have occurred, cm->seq_params is updated.
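  // The commit-on-success idiom used by this function, in miniature (an
  // informative sketch, not code from the patch):
  //   SequenceHeader tmp = cm->seq_params;  // start from the current state
  //   if (/* any parse step fails */) return 0;  // cm->seq_params untouched
  //   cm->seq_params = tmp;  // single commit once everything has succeeded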
+ SequenceHeader sh = cm->seq_params; + SequenceHeader *const seq_params = &sh; + + seq_params->profile = av1_read_profile(rb); + if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return 0; + } + + // Still picture or not + seq_params->still_picture = aom_rb_read_bit(rb); + seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb); + // Video must have reduced_still_picture_hdr = 0 + if (!seq_params->still_picture && seq_params->reduced_still_picture_hdr) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return 0; + } + + if (seq_params->reduced_still_picture_hdr) { + seq_params->timing_info_present = 0; + seq_params->decoder_model_info_present_flag = 0; + seq_params->display_model_info_present_flag = 0; + seq_params->operating_points_cnt_minus_1 = 0; + seq_params->operating_point_idc[0] = 0; + if (!read_bitstream_level(&seq_params->seq_level_idx[0], rb)) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return 0; + } + seq_params->tier[0] = 0; + seq_params->op_params[0].decoder_model_param_present_flag = 0; + seq_params->op_params[0].display_model_param_present_flag = 0; + } else { + seq_params->timing_info_present = aom_rb_read_bit(rb); + if (seq_params->timing_info_present) { + av1_read_timing_info_header(&seq_params->timing_info, &cm->error, rb); + + seq_params->decoder_model_info_present_flag = aom_rb_read_bit(rb); + if (seq_params->decoder_model_info_present_flag) + av1_read_decoder_model_info(&seq_params->decoder_model_info, rb); + } else { + seq_params->decoder_model_info_present_flag = 0; + } + seq_params->display_model_info_present_flag = aom_rb_read_bit(rb); + seq_params->operating_points_cnt_minus_1 = + aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS); + for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) { + seq_params->operating_point_idc[i] = + aom_rb_read_literal(rb, OP_POINTS_IDC_BITS); + if (!read_bitstream_level(&seq_params->seq_level_idx[i], rb)) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return 0; + } + // This is the seq_level_idx[i] > 7 check in the spec. seq_level_idx 7 + // is equivalent to level 3.3. 
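      // Informative: the spec encodes seq_level_idx as
      // ((major_level - 2) << 2) | minor_level, so 3.3 maps to
      // (1 << 2) + 3 = 7 and 4.0 maps to (2 << 2) + 0 = 8; the
      // >= SEQ_LEVEL_4_0 test below is therefore exactly "seq_level_idx > 7".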
+ if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0) + seq_params->tier[i] = aom_rb_read_bit(rb); + else + seq_params->tier[i] = 0; + if (seq_params->decoder_model_info_present_flag) { + seq_params->op_params[i].decoder_model_param_present_flag = + aom_rb_read_bit(rb); + if (seq_params->op_params[i].decoder_model_param_present_flag) + av1_read_op_parameters_info(&seq_params->op_params[i], + seq_params->decoder_model_info + .encoder_decoder_buffer_delay_length, + rb); + } else { + seq_params->op_params[i].decoder_model_param_present_flag = 0; + } + if (seq_params->timing_info_present && + (seq_params->timing_info.equal_picture_interval || + seq_params->op_params[i].decoder_model_param_present_flag)) { + seq_params->op_params[i].bitrate = av1_max_level_bitrate( + seq_params->profile, seq_params->seq_level_idx[i], + seq_params->tier[i]); + // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass + // the check + if (seq_params->op_params[i].bitrate == 0) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "AV1 does not support this combination of " + "profile, level, and tier."); + // Buffer size in bits is bitrate in bits/s * 1 s + seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate; + } + if (seq_params->timing_info_present && + seq_params->timing_info.equal_picture_interval && + !seq_params->op_params[i].decoder_model_param_present_flag) { + // When the decoder_model_parameters are not sent for this op, set + // the default ones that can be used with the resource availability mode + seq_params->op_params[i].decoder_buffer_delay = 70000; + seq_params->op_params[i].encoder_buffer_delay = 20000; + seq_params->op_params[i].low_delay_mode_flag = 0; + } + + if (seq_params->display_model_info_present_flag) { + seq_params->op_params[i].display_model_param_present_flag = + aom_rb_read_bit(rb); + if (seq_params->op_params[i].display_model_param_present_flag) { + seq_params->op_params[i].initial_display_delay = + aom_rb_read_literal(rb, 4) + 1; + if (seq_params->op_params[i].initial_display_delay > 10) + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "AV1 does not support more than 10 decoded frames delay"); + } else { + seq_params->op_params[i].initial_display_delay = 10; + } + } else { + seq_params->op_params[i].display_model_param_present_flag = 0; + seq_params->op_params[i].initial_display_delay = 10; + } + } + } + // This decoder supports all levels.
Choose operating point provided by + // external means + int operating_point = pbi->operating_point; + if (operating_point < 0 || + operating_point > seq_params->operating_points_cnt_minus_1) + operating_point = 0; + pbi->current_operating_point = + seq_params->operating_point_idc[operating_point]; + if (aom_get_num_layers_from_operating_point_idc( + pbi->current_operating_point, &cm->number_spatial_layers, + &cm->number_temporal_layers) != AOM_CODEC_OK) { + cm->error.error_code = AOM_CODEC_ERROR; + return 0; + } + + av1_read_sequence_header(cm, rb, seq_params); + + av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &cm->error); + if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) && + !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) && + !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported, " + "%d %d subsampling is not supported.\n", + seq_params->subsampling_x, seq_params->subsampling_y); + } + + seq_params->film_grain_params_present = aom_rb_read_bit(rb); + + if (av1_check_trailing_bits(pbi, rb) != 0) { + // cm->error.error_code is already set. + return 0; + } + + // If a sequence header has been decoded before, we check if the new + // one is consistent with the old one. + if (pbi->sequence_header_ready) { + if (!are_seq_headers_consistent(&cm->seq_params, seq_params)) + pbi->sequence_header_changed = 1; + } + + cm->seq_params = *seq_params; + pbi->sequence_header_ready = 1; + + return ((rb->bit_offset - saved_bit_offset + 7) >> 3); +} + +// On success, returns the frame header size. On failure, calls +// aom_internal_error and does not return. +static uint32_t read_frame_header_obu(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t **p_data_end, + int trailing_bits_present) { + return av1_decode_frame_headers_and_setup(pbi, rb, data, p_data_end, + trailing_bits_present); +} + +// On success, returns the tile group header size. On failure, calls +// aom_internal_error() and returns -1. 
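// (Informative: the size these readers return is computed as
// ((rb->bit_offset - saved_bit_offset + 7) >> 3), i.e. ceil(bits_read / 8);
// for example, a header that consumed 17 bits is reported as 3 bytes.)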
+static int32_t read_tile_group_header(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + int *start_tile, int *end_tile, + int tile_start_implicit) { + AV1_COMMON *const cm = &pbi->common; + CommonTileParams *const tiles = &cm->tiles; + uint32_t saved_bit_offset = rb->bit_offset; + int tile_start_and_end_present_flag = 0; + const int num_tiles = tiles->rows * tiles->cols; + + if (!tiles->large_scale && num_tiles > 1) { + tile_start_and_end_present_flag = aom_rb_read_bit(rb); + if (tile_start_implicit && tile_start_and_end_present_flag) { + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0"); + return -1; + } + } + if (tiles->large_scale || num_tiles == 1 || + !tile_start_and_end_present_flag) { + *start_tile = 0; + *end_tile = num_tiles - 1; + } else { + int tile_bits = tiles->log2_rows + tiles->log2_cols; + *start_tile = aom_rb_read_literal(rb, tile_bits); + *end_tile = aom_rb_read_literal(rb, tile_bits); + } + if (*start_tile != pbi->next_start_tile) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "tg_start (%d) must be equal to %d", *start_tile, + pbi->next_start_tile); + return -1; + } + if (*start_tile > *end_tile) { + aom_internal_error( + &cm->error, AOM_CODEC_CORRUPT_FRAME, + "tg_end (%d) must be greater than or equal to tg_start (%d)", *end_tile, + *start_tile); + return -1; + } + if (*end_tile >= num_tiles) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "tg_end (%d) must be less than NumTiles (%d)", *end_tile, + num_tiles); + return -1; + } + pbi->next_start_tile = (*end_tile == num_tiles - 1) ? 0 : *end_tile + 1; + + return ((rb->bit_offset - saved_bit_offset + 7) >> 3); +} + +// On success, returns the tile group OBU size. On failure, sets +// pbi->common.error.error_code and returns 0. +static uint32_t read_one_tile_group_obu( + AV1Decoder *pbi, struct aom_read_bit_buffer *rb, int is_first_tg, + const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end, + int *is_last_tg, int tile_start_implicit) { + AV1_COMMON *const cm = &pbi->common; + int start_tile, end_tile; + int32_t header_size, tg_payload_size; + + assert((rb->bit_offset & 7) == 0); + assert(rb->bit_buffer + aom_rb_bytes_read(rb) == data); + + header_size = read_tile_group_header(pbi, rb, &start_tile, &end_tile, + tile_start_implicit); + if (header_size == -1 || byte_alignment(cm, rb)) return 0; + data += header_size; + av1_decode_tg_tiles_and_wrapup(pbi, data, data_end, p_data_end, start_tile, + end_tile, is_first_tg); + + tg_payload_size = (uint32_t)(*p_data_end - data); + + *is_last_tg = end_tile == cm->tiles.rows * cm->tiles.cols - 1; + return header_size + tg_payload_size; +} + +static void alloc_tile_list_buffer(AV1Decoder *pbi) { + // The resolution of the output frame is read out from the bitstream. The data + // are stored in the order of Y plane, U plane and V plane. As an example, for + // image format 4:2:0, the output frame of U plane and V plane is 1/4 of the + // output frame. 
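  // Worked example (informative): with 4:2:0 subsampling and a 2x2 grid of
  // 64x64-pixel tiles, the output Y plane is 128x128 pixels while U and V
  // are each 64x64, i.e. one quarter of the Y-plane area.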
+ AV1_COMMON *const cm = &pbi->common; + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + const int tile_width_in_pixels = tile_width * MI_SIZE; + const int tile_height_in_pixels = tile_height * MI_SIZE; + const int output_frame_width = + (pbi->output_frame_width_in_tiles_minus_1 + 1) * tile_width_in_pixels; + const int output_frame_height = + (pbi->output_frame_height_in_tiles_minus_1 + 1) * tile_height_in_pixels; + // The output frame is used to store the decoded tile list. The decoded tile + // list has to fit into 1 output frame. + assert((pbi->tile_count_minus_1 + 1) <= + (pbi->output_frame_width_in_tiles_minus_1 + 1) * + (pbi->output_frame_height_in_tiles_minus_1 + 1)); + + // Allocate the tile list output buffer. + // Note: if cm->seq_params.use_highbitdepth is 1 and cm->seq_params.bit_depth + // is 8, we could allocate less memory, namely, 8 bits/pixel. + if (aom_alloc_frame_buffer(&pbi->tile_list_outbuf, output_frame_width, + output_frame_height, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y, + (cm->seq_params.use_highbitdepth && + (cm->seq_params.bit_depth > AOM_BITS_8)), + 0, cm->features.byte_alignment)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate the tile list output buffer"); +} + +static void yv12_tile_copy(const YV12_BUFFER_CONFIG *src, int hstart1, + int hend1, int vstart1, int vend1, + YV12_BUFFER_CONFIG *dst, int hstart2, int vstart2, + int plane) { + const int src_stride = (plane > 0) ? src->strides[1] : src->strides[0]; + const int dst_stride = (plane > 0) ? dst->strides[1] : dst->strides[0]; + int row, col; + + assert(src->flags & YV12_FLAG_HIGHBITDEPTH); + assert(!(dst->flags & YV12_FLAG_HIGHBITDEPTH)); + + const uint16_t *src16 = + CONVERT_TO_SHORTPTR(src->buffers[plane] + vstart1 * src_stride + hstart1); + uint8_t *dst8 = dst->buffers[plane] + vstart2 * dst_stride + hstart2; + + for (row = vstart1; row < vend1; ++row) { + for (col = 0; col < (hend1 - hstart1); ++col) *dst8++ = (uint8_t)(*src16++); + src16 += src_stride - (hend1 - hstart1); + dst8 += dst_stride - (hend1 - hstart1); + } + return; +} + +static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi, + int tile_idx) { + AV1_COMMON *const cm = &pbi->common; + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + const int tile_width_in_pixels = tile_width * MI_SIZE; + const int tile_height_in_pixels = tile_height * MI_SIZE; + const int ssy = cm->seq_params.subsampling_y; + const int ssx = cm->seq_params.subsampling_x; + const int num_planes = av1_num_planes(cm); + + YV12_BUFFER_CONFIG *cur_frame = &cm->cur_frame->buf; + const int tr = tile_idx / (pbi->output_frame_width_in_tiles_minus_1 + 1); + const int tc = tile_idx % (pbi->output_frame_width_in_tiles_minus_1 + 1); + int plane; + + // Copy decoded tile to the tile list output buffer. + for (plane = 0; plane < num_planes; ++plane) { + const int shift_x = plane > 0 ? ssx : 0; + const int shift_y = plane > 0 ? 
ssy : 0; + const int h = tile_height_in_pixels >> shift_y; + const int w = tile_width_in_pixels >> shift_x; + + // src offset + int vstart1 = pbi->dec_tile_row * h; + int vend1 = vstart1 + h; + int hstart1 = pbi->dec_tile_col * w; + int hend1 = hstart1 + w; + // dst offset + int vstart2 = tr * h; + int hstart2 = tc * w; + + if (cm->seq_params.use_highbitdepth && + cm->seq_params.bit_depth == AOM_BITS_8) { + yv12_tile_copy(cur_frame, hstart1, hend1, vstart1, vend1, + &pbi->tile_list_outbuf, hstart2, vstart2, plane); + } else { + switch (plane) { + case 0: + aom_yv12_partial_copy_y(cur_frame, hstart1, hend1, vstart1, vend1, + &pbi->tile_list_outbuf, hstart2, vstart2); + break; + case 1: + aom_yv12_partial_copy_u(cur_frame, hstart1, hend1, vstart1, vend1, + &pbi->tile_list_outbuf, hstart2, vstart2); + break; + case 2: + aom_yv12_partial_copy_v(cur_frame, hstart1, hend1, vstart1, vend1, + &pbi->tile_list_outbuf, hstart2, vstart2); + break; + default: assert(0); + } + } + } +} + +// Only called while large_scale_tile = 1. +// +// On success, returns the tile list OBU size. On failure, sets +// pbi->common.error.error_code and returns 0. +static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end, + int *frame_decoding_finished) { + AV1_COMMON *const cm = &pbi->common; + uint32_t tile_list_payload_size = 0; + const int num_tiles = cm->tiles.cols * cm->tiles.rows; + const int start_tile = 0; + const int end_tile = num_tiles - 1; + int i = 0; + + // Process the tile list info. + pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); + pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); + pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16); + if (pbi->tile_count_minus_1 > MAX_TILES - 1) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + + // Allocate output frame buffer for the tile list. + alloc_tile_list_buffer(pbi); + + uint32_t tile_list_info_bytes = 4; + tile_list_payload_size += tile_list_info_bytes; + data += tile_list_info_bytes; + + int tile_idx = 0; + for (i = 0; i <= pbi->tile_count_minus_1; i++) { + // Process 1 tile. + // Reset the bit reader. + rb->bit_offset = 0; + rb->bit_buffer = data; + + // Read out the tile info. + uint32_t tile_info_bytes = 5; + // Set reference for each tile. + int ref_idx = aom_rb_read_literal(rb, 8); + if (ref_idx >= MAX_EXTERNAL_REFERENCES) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + av1_set_reference_dec(cm, cm->remapped_ref_idx[0], 1, + &pbi->ext_refs.refs[ref_idx]); + + pbi->dec_tile_row = aom_rb_read_literal(rb, 8); + pbi->dec_tile_col = aom_rb_read_literal(rb, 8); + if (pbi->dec_tile_row < 0 || pbi->dec_tile_col < 0 || + pbi->dec_tile_row >= cm->tiles.rows || + pbi->dec_tile_col >= cm->tiles.cols) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + + pbi->coded_tile_data_size = aom_rb_read_literal(rb, 16) + 1; + data += tile_info_bytes; + if ((size_t)(data_end - data) < pbi->coded_tile_data_size) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + + av1_decode_tg_tiles_and_wrapup(pbi, data, data + pbi->coded_tile_data_size, + p_data_end, start_tile, end_tile, 0); + uint32_t tile_payload_size = (uint32_t)(*p_data_end - data); + + tile_list_payload_size += tile_info_bytes + tile_payload_size; + + // Update data ptr for next tile decoding. 
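+ // (Each tile entry in the list is 5 bytes of tile info -- 8-bit ref_idx,
+ // 8-bit dec_tile_row, 8-bit dec_tile_col, and a 16-bit field coding
+ // coded_tile_data_size - 1 -- followed by coded_tile_data_size bytes of
+ // coded tile payload; the pointer advance below skips that payload.)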
+ data = *p_data_end; + assert(data <= data_end); + + // Copy the decoded tile to the tile list output buffer. + copy_decoded_tile_to_tile_list_buffer(pbi, tile_idx); + tile_idx++; + } + + *frame_decoding_finished = 1; + return tile_list_payload_size; +} + +// Returns the last nonzero byte index in 'data'. If there is no nonzero byte in +// 'data', returns -1. +static int get_last_nonzero_byte_index(const uint8_t *data, size_t sz) { + // Scan backward and return on the first nonzero byte. + int i = (int)sz - 1; + while (i >= 0 && data[i] == 0) { + --i; + } + return i; +} + +// Allocates metadata that was read and adds it to the decoder's metadata array. +static void alloc_read_metadata(AV1Decoder *const pbi, + OBU_METADATA_TYPE metadata_type, + const uint8_t *data, size_t sz, + aom_metadata_insert_flags_t insert_flag) { + AV1_COMMON *const cm = &pbi->common; + aom_metadata_t *metadata = + aom_img_metadata_alloc(metadata_type, data, sz, insert_flag); + if (!metadata) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating metadata"); + } + if (!pbi->metadata) { + pbi->metadata = aom_img_metadata_array_alloc(1); + if (!pbi->metadata) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate metadata array"); + } + } else { + aom_metadata_t **metadata_array = + (aom_metadata_t **)realloc(pbi->metadata->metadata_array, + (pbi->metadata->sz + 1) * sizeof(metadata)); + if (!metadata_array) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating metadata"); + } + pbi->metadata->metadata_array = metadata_array; + pbi->metadata->sz++; + } + pbi->metadata->metadata_array[pbi->metadata->sz - 1] = metadata; +} + +// On success, returns the number of bytes read from 'data'. On failure, calls +// aom_internal_error and does not return. +static size_t read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data, + size_t sz) { + const int kMinItuT35PayloadSize = 2; + AV1_COMMON *const cm = &pbi->common; + if (sz == 0) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "itu_t_t35_country_code is missing"); + } + int bytes_read = get_last_nonzero_byte_index(data, sz); + if (bytes_read < 0) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "No trailing bits found on metadata"); + } + if (*data == 0xFF && bytes_read < kMinItuT35PayloadSize) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "itu_t_t35_country_code_extension_byte is missing"); + } + alloc_read_metadata(pbi, OBU_METADATA_TYPE_ITUT_T35, data, (size_t)bytes_read, + AOM_MIF_ANY_FRAME); + return (size_t)bytes_read; +} + +// On success, returns the number of bytes read from 'data'. On failure, calls +// aom_internal_error() and does not return.
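+// (Per the AV1 spec, the HDR CLL payload is two 16-bit fields, max_cll and
+// max_fall, i.e. exactly 4 bytes; kHdrCllPayloadSize below enforces this.)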
+static size_t read_metadata_hdr_cll(AV1Decoder *const pbi, const uint8_t *data, + size_t sz) { + const int kHdrCllPayloadSize = 4; + AV1_COMMON *const cm = &pbi->common; + if (sz == 0) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "HDR CLL metadata payload is missing"); + } + int bytes_read = get_last_nonzero_byte_index(data, sz); + if (bytes_read < 0) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "No trailing bits found on metadata"); + } + if (bytes_read != kHdrCllPayloadSize) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Incorrect HDR CLL metadata payload size"); + } + alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_CLL, data, (size_t)bytes_read, + AOM_MIF_ANY_FRAME); + return (size_t)bytes_read; +} + +// On success, returns the number of bytes read from 'data'. On failure, calls +// aom_internal_error() and does not return. +static size_t read_metadata_hdr_mdcv(AV1Decoder *const pbi, const uint8_t *data, + size_t sz) { + const int kMdcvPayloadSize = 24; + AV1_COMMON *const cm = &pbi->common; + if (sz == 0) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "HDR MDCV metadata payload is missing"); + } + int bytes_read = get_last_nonzero_byte_index(data, sz); + if (bytes_read < 0) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "No trailing bits found on HDR MDCV metadata"); + } + if (bytes_read != kMdcvPayloadSize) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Incorrect HDR MDCV metadata payload size"); + } + alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_MDCV, data, (size_t)bytes_read, + AOM_MIF_ANY_FRAME); + return (size_t)bytes_read; +} + +static void scalability_structure(struct aom_read_bit_buffer *rb) { + const int spatial_layers_cnt_minus_1 = aom_rb_read_literal(rb, 2); + const int spatial_layer_dimensions_present_flag = aom_rb_read_bit(rb); + const int spatial_layer_description_present_flag = aom_rb_read_bit(rb); + const int temporal_group_description_present_flag = aom_rb_read_bit(rb); + aom_rb_read_literal(rb, 3); // reserved + + if (spatial_layer_dimensions_present_flag) { + for (int i = 0; i <= spatial_layers_cnt_minus_1; i++) { + aom_rb_read_literal(rb, 16); + aom_rb_read_literal(rb, 16); + } + } + if (spatial_layer_description_present_flag) { + for (int i = 0; i <= spatial_layers_cnt_minus_1; i++) { + aom_rb_read_literal(rb, 8); + } + } + if (temporal_group_description_present_flag) { + const int temporal_group_size = aom_rb_read_literal(rb, 8); + for (int i = 0; i < temporal_group_size; i++) { + aom_rb_read_literal(rb, 3); + aom_rb_read_bit(rb); + aom_rb_read_bit(rb); + const int temporal_group_ref_cnt = aom_rb_read_literal(rb, 3); + for (int j = 0; j < temporal_group_ref_cnt; j++) { + aom_rb_read_literal(rb, 8); + } + } + } +} + +static void read_metadata_scalability(struct aom_read_bit_buffer *rb) { + const int scalability_mode_idc = aom_rb_read_literal(rb, 8); + if (scalability_mode_idc == SCALABILITY_SS) { + scalability_structure(rb); + } +} + +static void read_metadata_timecode(struct aom_read_bit_buffer *rb) { + aom_rb_read_literal(rb, 5); // counting_type f(5) + const int full_timestamp_flag = + aom_rb_read_bit(rb); // full_timestamp_flag f(1) + aom_rb_read_bit(rb); // discontinuity_flag f(1) + aom_rb_read_bit(rb); // cnt_dropped_flag f(1) + aom_rb_read_literal(rb, 9); // n_frames f(9) + if (full_timestamp_flag) { + aom_rb_read_literal(rb, 6); // seconds_value f(6) + aom_rb_read_literal(rb, 6); // minutes_value f(6) + aom_rb_read_literal(rb, 5); // hours_value f(5) + } else {
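+ // When full_timestamp_flag is 0, each time component is gated by its own
+ // flag, and an inner value can be coded only if the outer one is present;
+ // e.g., seconds_flag = 1 with minutes_flag = 0 codes just the 6-bit
+ // seconds_value and nothing further.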
+ const int seconds_flag = aom_rb_read_bit(rb); // seconds_flag f(1) + if (seconds_flag) { + aom_rb_read_literal(rb, 6); // seconds_value f(6) + const int minutes_flag = aom_rb_read_bit(rb); // minutes_flag f(1) + if (minutes_flag) { + aom_rb_read_literal(rb, 6); // minutes_value f(6) + const int hours_flag = aom_rb_read_bit(rb); // hours_flag f(1) + if (hours_flag) { + aom_rb_read_literal(rb, 5); // hours_value f(5) + } + } + } + } + // time_offset_length f(5) + const int time_offset_length = aom_rb_read_literal(rb, 5); + if (time_offset_length) { + // time_offset_value f(time_offset_length) + aom_rb_read_literal(rb, time_offset_length); + } +} + +// Returns the last nonzero byte in 'data'. If there is no nonzero byte in +// 'data', returns 0. +// +// Call this function to check the following requirement in the spec: +// This implies that when any payload data is present for this OBU type, at +// least one byte of the payload data (including the trailing bit) shall not +// be equal to 0. +static uint8_t get_last_nonzero_byte(const uint8_t *data, size_t sz) { + // Scan backward and return on the first nonzero byte. + size_t i = sz; + while (i != 0) { + --i; + if (data[i] != 0) return data[i]; + } + return 0; +} + +// Checks the metadata for correct syntax but ignores the parsed metadata. +// +// On success, returns the number of bytes read from 'data'. On failure, sets +// pbi->common.error.error_code and returns 0, or calls aom_internal_error() +// and does not return. +static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) { + AV1_COMMON *const cm = &pbi->common; + size_t type_length; + uint64_t type_value; + if (aom_uleb_decode(data, sz, &type_value, &type_length) < 0) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + const OBU_METADATA_TYPE metadata_type = (OBU_METADATA_TYPE)type_value; + if (metadata_type == 0 || metadata_type >= 6) { + // If metadata_type is reserved for future use or a user private value, + // ignore the entire OBU and just check trailing bits. + if (get_last_nonzero_byte(data + type_length, sz - type_length) == 0) { + pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + return sz; + } + if (metadata_type == OBU_METADATA_TYPE_ITUT_T35) { + size_t bytes_read = + type_length + + read_metadata_itut_t35(pbi, data + type_length, sz - type_length); + // itu_t_t35_payload_bytes is byte aligned and the first + // trailing byte should be 0x80. 
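+ // (0x80 is a single 1 bit followed by seven 0 bits -- the byte-aligned
+ // trailing-bits pattern. Illustrative check: a payload ending
+ // { 0xAB, 0x80 } passes, while { 0xAB, 0x40 } fails because the last
+ // nonzero byte is not 0x80.)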
+ if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) { + pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + return sz; + } else if (metadata_type == OBU_METADATA_TYPE_HDR_CLL) { + size_t bytes_read = + type_length + + read_metadata_hdr_cll(pbi, data + type_length, sz - type_length); + if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) { + pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + return sz; + } else if (metadata_type == OBU_METADATA_TYPE_HDR_MDCV) { + size_t bytes_read = + type_length + + read_metadata_hdr_mdcv(pbi, data + type_length, sz - type_length); + if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) { + pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + return sz; + } + + struct aom_read_bit_buffer rb; + av1_init_read_bit_buffer(pbi, &rb, data + type_length, data + sz); + if (metadata_type == OBU_METADATA_TYPE_SCALABILITY) { + read_metadata_scalability(&rb); + } else { + assert(metadata_type == OBU_METADATA_TYPE_TIMECODE); + read_metadata_timecode(&rb); + } + if (av1_check_trailing_bits(pbi, &rb) != 0) { + // cm->error.error_code is already set. + return 0; + } + assert((rb.bit_offset & 7) == 0); + return type_length + (rb.bit_offset >> 3); +} + +// On success, returns 'sz'. On failure, sets pbi->common.error.error_code and +// returns 0. +static size_t read_padding(AV1_COMMON *const cm, const uint8_t *data, + size_t sz) { + // The spec allows a padding OBU to be header-only (i.e., obu_size = 0). So + // check trailing bits only if sz > 0. + if (sz > 0) { + // The payload of a padding OBU is byte aligned. Therefore the first + // trailing byte should be 0x80. See https://crbug.com/aomedia/2393. + const uint8_t last_nonzero_byte = get_last_nonzero_byte(data, sz); + if (last_nonzero_byte != 0x80) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + } + return sz; +} + +// On success, returns a boolean that indicates whether the decoding of the +// current frame is finished. On failure, sets cm->error.error_code and +// returns -1. +int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end) { + AV1_COMMON *const cm = &pbi->common; + int frame_decoding_finished = 0; + int is_first_tg_obu_received = 1; + uint32_t frame_header_size = 0; + ObuHeader obu_header; + memset(&obu_header, 0, sizeof(obu_header)); + pbi->seen_frame_header = 0; + pbi->next_start_tile = 0; + + if (data_end < data) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + + // Reset pbi->camera_frame_header_ready to 0 if cm->tiles.large_scale = 0. + if (!cm->tiles.large_scale) pbi->camera_frame_header_ready = 0; + + // decode frame as a series of OBUs + while (!frame_decoding_finished && cm->error.error_code == AOM_CODEC_OK) { + struct aom_read_bit_buffer rb; + size_t payload_size = 0; + size_t decoded_payload_size = 0; + size_t obu_payload_offset = 0; + size_t bytes_read = 0; + const size_t bytes_available = data_end - data; + + if (bytes_available == 0 && !pbi->seen_frame_header) { + *p_data_end = data; + cm->error.error_code = AOM_CODEC_OK; + break; + } + + aom_codec_err_t status = + aom_read_obu_header_and_size(data, bytes_available, pbi->is_annexb, + &obu_header, &payload_size, &bytes_read); + + if (status != AOM_CODEC_OK) { + cm->error.error_code = status; + return -1; + } + + // Record obu size header information. 
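+ // (obu_header.size is the length of the OBU header itself -- 1 byte, or 2
+ // with the extension byte -- so the span recorded here is exactly the
+ // leb128 obu_size field; e.g., a 1-byte header with a 2-byte leb128 size
+ // gives bytes_read = 3 and a 2-byte obu_size_hdr.)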
+ pbi->obu_size_hdr.data = data + obu_header.size; + pbi->obu_size_hdr.size = bytes_read - obu_header.size; + + // Note: aom_read_obu_header_and_size() takes care of checking that this + // doesn't cause 'data' to advance past 'data_end'. + data += bytes_read; + + if ((size_t)(data_end - data) < payload_size) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + + cm->temporal_layer_id = obu_header.temporal_layer_id; + cm->spatial_layer_id = obu_header.spatial_layer_id; + + if (obu_header.type != OBU_TEMPORAL_DELIMITER && + obu_header.type != OBU_SEQUENCE_HEADER && + obu_header.type != OBU_PADDING) { + // don't decode obu if it's not in current operating mode + if (!is_obu_in_current_operating_point(pbi, obu_header)) { + data += payload_size; + continue; + } + } + + av1_init_read_bit_buffer(pbi, &rb, data, data + payload_size); + + switch (obu_header.type) { + case OBU_TEMPORAL_DELIMITER: + decoded_payload_size = read_temporal_delimiter_obu(); + pbi->seen_frame_header = 0; + pbi->next_start_tile = 0; + break; + case OBU_SEQUENCE_HEADER: + decoded_payload_size = read_sequence_header_obu(pbi, &rb); + if (cm->error.error_code != AOM_CODEC_OK) return -1; + // The sequence header should not change in the middle of a frame. + if (pbi->sequence_header_changed && pbi->seen_frame_header) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + break; + case OBU_FRAME_HEADER: + case OBU_REDUNDANT_FRAME_HEADER: + case OBU_FRAME: + if (obu_header.type == OBU_REDUNDANT_FRAME_HEADER) { + if (!pbi->seen_frame_header) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + } else { + // OBU_FRAME_HEADER or OBU_FRAME. + if (pbi->seen_frame_header) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + } + // Only decode first frame header received + if (!pbi->seen_frame_header || + (cm->tiles.large_scale && !pbi->camera_frame_header_ready)) { + frame_header_size = read_frame_header_obu( + pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME); + pbi->seen_frame_header = 1; + if (!pbi->ext_tile_debug && cm->tiles.large_scale) + pbi->camera_frame_header_ready = 1; + } else { + // TODO(wtc): Verify that the frame_header_obu is identical to the + // original frame_header_obu. For now just skip frame_header_size + // bytes in the bit buffer. + if (frame_header_size > payload_size) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + assert(rb.bit_offset == 0); + rb.bit_offset = 8 * frame_header_size; + } + + decoded_payload_size = frame_header_size; + pbi->frame_header_size = frame_header_size; + + if (cm->show_existing_frame) { + if (obu_header.type == OBU_FRAME) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return -1; + } + frame_decoding_finished = 1; + pbi->seen_frame_header = 0; + break; + } + + // In large scale tile coding, decode the common camera frame header + // before any tile list OBU. + if (!pbi->ext_tile_debug && pbi->camera_frame_header_ready) { + frame_decoding_finished = 1; + // Skip the rest of the frame data. + decoded_payload_size = payload_size; + // Update data_end. + *p_data_end = data_end; + break; + } + + if (obu_header.type != OBU_FRAME) break; + obu_payload_offset = frame_header_size; + // Byte align the reader before reading the tile group. + // byte_alignment() has set cm->error.error_code if it returns -1. + if (byte_alignment(cm, &rb)) return -1; + AOM_FALLTHROUGH_INTENDED; // fall through to read tile group. 
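+ // (For OBU_FRAME the frame header and tile group share one OBU payload:
+ // obu_payload_offset was set to frame_header_size above, so the
+ // tile-group parse below starts right after the byte-aligned frame
+ // header.)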
+ case OBU_TILE_GROUP: + if (!pbi->seen_frame_header) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + if (obu_payload_offset > payload_size) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + decoded_payload_size += read_one_tile_group_obu( + pbi, &rb, is_first_tg_obu_received, data + obu_payload_offset, + data + payload_size, p_data_end, &frame_decoding_finished, + obu_header.type == OBU_FRAME); + if (cm->error.error_code != AOM_CODEC_OK) return -1; + is_first_tg_obu_received = 0; + if (frame_decoding_finished) pbi->seen_frame_header = 0; + break; + case OBU_METADATA: + decoded_payload_size = read_metadata(pbi, data, payload_size); + if (cm->error.error_code != AOM_CODEC_OK) return -1; + break; + case OBU_TILE_LIST: + if (CONFIG_NORMAL_TILE_MODE) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return -1; + } + + // This OBU type is purely for the large scale tile coding mode. + // The common camera frame header has to be already decoded. + if (!pbi->camera_frame_header_ready) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + + cm->tiles.large_scale = 1; + av1_set_single_tile_decoding_mode(cm); + decoded_payload_size = + read_and_decode_one_tile_list(pbi, &rb, data, data + payload_size, + p_data_end, &frame_decoding_finished); + if (cm->error.error_code != AOM_CODEC_OK) return -1; + break; + case OBU_PADDING: + decoded_payload_size = read_padding(&pbi->common, data, payload_size); + if (cm->error.error_code != AOM_CODEC_OK) return -1; + break; + default: + // Skip unrecognized OBUs + if (payload_size > 0 && + get_last_nonzero_byte(data, payload_size) == 0) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + decoded_payload_size = payload_size; + break; + } + + // Check that the signalled OBU size matches the actual amount of data read + if (decoded_payload_size > payload_size) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + + // If there are extra padding bytes, they should all be zero + while (decoded_payload_size < payload_size) { + uint8_t padding_byte = data[decoded_payload_size++]; + if (padding_byte != 0) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + } + + data += payload_size; + } + + if (cm->error.error_code != AOM_CODEC_OK) return -1; + return frame_decoding_finished; +} diff --git a/libs/libaom/src/av1/decoder/obu.h b/libs/libaom/src/av1/decoder/obu.h new file mode 100644 index 000000000..d8ebe368e --- /dev/null +++ b/libs/libaom/src/av1/decoder/obu.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_OBU_H_ +#define AOM_AV1_DECODER_OBU_H_ + +#include "aom/aom_codec.h" +#include "av1/decoder/decoder.h" + +// Try to decode one frame from a buffer. 
+// Returns 1 if we decoded a frame, +// 0 if we didn't decode a frame but that's okay +// (e.g., if there was a frame but we skipped it), +// or -1 on error +int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end); + +aom_codec_err_t aom_get_num_layers_from_operating_point_idc( + int operating_point_idc, unsigned int *number_spatial_layers, + unsigned int *number_temporal_layers); + +#endif // AOM_AV1_DECODER_OBU_H_ diff --git a/libs/libaom/src/av1/encoder/aq_complexity.c b/libs/libaom/src/av1/encoder/aq_complexity.c new file mode 100644 index 000000000..36580063d --- /dev/null +++ b/libs/libaom/src/av1/encoder/aq_complexity.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <limits.h> +#include <math.h> + +#include "av1/encoder/aq_complexity.h" +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/encodeframe.h" +#include "av1/common/seg_common.h" +#include "av1/encoder/segmentation.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/system_state.h" + +#define AQ_C_SEGMENTS 5 +#define DEFAULT_AQ2_SEG 3 // Neutral Q segment +#define AQ_C_STRENGTHS 3 +static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { + { 1.75, 1.25, 1.05, 1.00, 0.90 }, + { 2.00, 1.50, 1.15, 1.00, 0.85 }, + { 2.50, 1.75, 1.25, 1.00, 0.80 } +}; +static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { + { 0.15, 0.30, 0.55, 2.00, 100.0 }, + { 0.20, 0.40, 0.65, 2.00, 100.0 }, + { 0.25, 0.50, 0.75, 2.00, 100.0 } }; +static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { + { -4.0, -3.0, -2.0, 100.00, 100.0 }, + { -3.5, -2.5, -1.5, 100.00, 100.0 }, + { -3.0, -2.0, -1.0, 100.00, 100.0 } }; + +static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) { + // Approximate base quantizer (truncated to int) + const int base_quant = av1_ac_quant_QTX(q_index, 0, bit_depth) / 4; + return (base_quant > 10) + (base_quant > 25); +} + +static bool is_frame_aq_enabled(const AV1_COMP *const cpi) { + const AV1_COMMON *const cm = &cpi->common; + + return frame_is_intra_only(cm) || cm->features.error_resilient_mode || + cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); +} + +// Segmentation only makes sense if the target bits per SB is above a threshold. +// Below this the overheads will usually outweigh any benefit. +static bool is_sb_aq_enabled(const AV1_COMP *const cpi) { + return cpi->rc.sb64_target_rate >= 256; +} + +void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int base_qindex = cm->quant_params.base_qindex; + struct segmentation *const seg = &cm->seg; + const int resolution_change = + cm->prev_frame && (cm->width != cm->prev_frame->width || + cm->height != cm->prev_frame->height); + + // Make SURE use of floating point in this function is safe.
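+ // (aom_clear_system_state() resets the x87/MMX register state on x86
+ // builds so the double arithmetic below behaves as expected.)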
+ aom_clear_system_state(); + + if (resolution_change) { + memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + av1_clearall_segfeatures(seg); + av1_disable_segmentation(seg); + return; + } + + if (is_frame_aq_enabled(cpi)) { + int segment; + const int aq_strength = + get_aq_c_strength(base_qindex, cm->seq_params.bit_depth); + + // Clear down the segment map. + memset(cpi->enc_seg.map, DEFAULT_AQ2_SEG, + cm->mi_params.mi_rows * cm->mi_params.mi_cols); + + av1_clearall_segfeatures(seg); + + if (!is_sb_aq_enabled(cpi)) { + av1_disable_segmentation(seg); + return; + } + + av1_enable_segmentation(seg); + + // Default segment "Q" feature is disabled so it defaults to the baseline Q. + av1_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q); + + // Use some of the segments for in-frame Q adjustment. + for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) { + int qindex_delta; + + if (segment == DEFAULT_AQ2_SEG) continue; + + qindex_delta = av1_compute_qdelta_by_rate( + &cpi->rc, cm->current_frame.frame_type, base_qindex, + aq_c_q_adj_factor[aq_strength][segment], cm->seq_params.bit_depth); + + // For AQ complexity mode, we don't allow Q0 in a segment if the base + // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment + // Q delta is sometimes applied without going back around the rd loop. + // This could lead to an illegal combination of partition size and q. + if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) { + qindex_delta = -base_qindex + 1; + } + if ((base_qindex + qindex_delta) > 0) { + av1_enable_segfeature(seg, segment, SEG_LVL_ALT_Q); + av1_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta); + } + } + } +} + +#define DEFAULT_LV_THRESH 10.0 +#define MIN_DEFAULT_LV_THRESH 8.0 +// Select a segment for the current block. +// The choice of segment for a block depends on the ratio of the projected +// bits for the block vs a target average and its spatial complexity. +void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, + int mi_row, int mi_col, int projected_rate) { + if ((!is_frame_aq_enabled(cpi)) || (!is_sb_aq_enabled(cpi))) return; + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + + const int mi_offset = mi_row * cm->mi_params.mi_cols + mi_col; + const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_size_wide[bs]); + const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_size_high[bs]); + int x, y; + int i; + unsigned char segment; + + if (0) { + segment = DEFAULT_AQ2_SEG; + } else { + // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh). + // It is converted to bits << AV1_PROB_COST_SHIFT units. + const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis) + << AV1_PROB_COST_SHIFT; + const int denom = cm->seq_params.mib_size * cm->seq_params.mib_size; + const int target_rate = (int)(num / denom); + double logvar; + double low_var_thresh; + const int aq_strength = get_aq_c_strength(cm->quant_params.base_qindex, + cm->seq_params.bit_depth); + + aom_clear_system_state(); + low_var_thresh = + (is_stat_consumption_stage_twopass(cpi)) + ? AOMMAX(exp(cpi->twopass.mb_av_energy), MIN_DEFAULT_LV_THRESH) + : DEFAULT_LV_THRESH; + + av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes, bs); + logvar = av1_log_block_var(cpi, mb, bs); + + segment = AQ_C_SEGMENTS - 1; // Just in case no break out below. + for (i = 0; i < AQ_C_SEGMENTS; ++i) { + // Test rate against a threshold value and variance against a threshold.
+ // Increasing segment number (higher variance and complexity) = higher Q. + if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) && + (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) { + segment = i; + break; + } + } + } + + // Fill in the entries in the segment map corresponding to this SB64. + for (y = 0; y < ymis; y++) { + for (x = 0; x < xmis; x++) { + cpi->enc_seg.map[mi_offset + y * cm->mi_params.mi_cols + x] = segment; + } + } +} diff --git a/libs/libaom/src/av1/encoder/aq_complexity.h b/libs/libaom/src/av1/encoder/aq_complexity.h new file mode 100644 index 000000000..3421d74c9 --- /dev/null +++ b/libs/libaom/src/av1/encoder/aq_complexity.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ +#define AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/common/enums.h" + +struct AV1_COMP; +struct macroblock; + +// Select a segment for the current block. +void av1_caq_select_segment(const struct AV1_COMP *cpi, struct macroblock *, + BLOCK_SIZE bs, int mi_row, int mi_col, + int projected_rate); + +// This function sets up a set of segments with delta Q values around +// the baseline frame quantizer. +void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ diff --git a/libs/libaom/src/av1/encoder/aq_cyclicrefresh.c b/libs/libaom/src/av1/encoder/aq_cyclicrefresh.c new file mode 100644 index 000000000..b8884942a --- /dev/null +++ b/libs/libaom/src/av1/encoder/aq_cyclicrefresh.c @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <limits.h> +#include <math.h> + +#include "av1/common/seg_common.h" +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/segmentation.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/system_state.h" + +CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) { + size_t last_coded_q_map_size; + CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr)); + if (cr == NULL) return NULL; + + cr->map = aom_calloc(mi_rows * mi_cols, sizeof(*cr->map)); + if (cr->map == NULL) { + av1_cyclic_refresh_free(cr); + return NULL; + } + last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map); + cr->last_coded_q_map = aom_malloc(last_coded_q_map_size); + if (cr->last_coded_q_map == NULL) { + av1_cyclic_refresh_free(cr); + return NULL; + } + assert(MAXQ <= 255); + memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size); + cr->avg_frame_low_motion = 0.0; + return cr; +} + +void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) { + if (cr != NULL) { + aom_free(cr->map); + aom_free(cr->last_coded_q_map); + aom_free(cr); + } +} + +// Check if this coding block, of size bsize, should be considered for refresh +// (lower-qp coding). Decision can be based on various factors, such as +// size of the coding block (i.e., below min_block size rejected), coding +// mode, and rate/distortion. +static int candidate_refresh_aq(const CYCLIC_REFRESH *cr, + const MB_MODE_INFO *mbmi, int64_t rate, + int64_t dist, int bsize) { + MV mv = mbmi->mv[0].as_mv; + // Reject the block for lower-qp coding if projected distortion + // is above the threshold, and any of the following is true: + // 1) mode uses large mv + // 2) mode is an intra-mode + // Otherwise accept for refresh. + if (dist > cr->thresh_dist_sb && + (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh || + mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh || + !is_inter_block(mbmi))) + return CR_SEGMENT_ID_BASE; + else if (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb && + is_inter_block(mbmi) && mbmi->mv[0].as_int == 0 && + cr->rate_boost_fac > 10) + // More aggressive delta-q for bigger blocks with zero motion. + return CR_SEGMENT_ID_BOOST2; + else + return CR_SEGMENT_ID_BOOST1; +} + +// Compute delta-q for the segment. +static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) { + const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const RATE_CONTROL *const rc = &cpi->rc; + int deltaq = + av1_compute_qdelta_by_rate(rc, cpi->common.current_frame.frame_type, q, + rate_factor, cpi->common.seq_params.bit_depth); + if ((-deltaq) > cr->max_qdelta_perc * q / 100) { + deltaq = -cr->max_qdelta_perc * q / 100; + } + return deltaq; +} + +// For the just encoded frame, estimate the bits, incorporating the delta-q +// from the non-base segments. For now ignore the effect of multiple segments +// (with different delta-q). Note this function is called in the postencode +// (called from rc_update_rate_correction_factors()). +int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi, + double correction_factor) { + const AV1_COMMON *const cm = &cpi->common; + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + const int base_qindex = cm->quant_params.base_qindex; + const int bit_depth = cm->seq_params.bit_depth; + const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const int mbs = cm->mi_params.MBs; + const int num4x4bl = mbs << 4; + // Weight for non-base segments: use actual number of blocks refreshed in + // previous/just encoded frame.
Note: the number of blocks here is in 4x4 units. + const double weight_segment1 = (double)cr->actual_num_seg1_blocks / num4x4bl; + const double weight_segment2 = (double)cr->actual_num_seg2_blocks / num4x4bl; + // Take segment weighted average for estimated bits. + const int estimated_bits = + (int)((1.0 - weight_segment1 - weight_segment2) * + av1_estimate_bits_at_q(frame_type, base_qindex, mbs, + correction_factor, bit_depth) + + weight_segment1 * av1_estimate_bits_at_q( + frame_type, base_qindex + cr->qindex_delta[1], + mbs, correction_factor, bit_depth) + + weight_segment2 * av1_estimate_bits_at_q( + frame_type, base_qindex + cr->qindex_delta[2], + mbs, correction_factor, bit_depth)); + return estimated_bits; +} + +// Prior to encoding the frame, estimate the bits per mb, for a given q = i and +// a corresponding delta-q (for segment 1). This function is called from +// rc_regulate_q() to set the base qp index. +// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or +// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding. +int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i, + double correction_factor) { + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int bits_per_mb; + int num4x4bl = cm->mi_params.MBs << 4; + // Weight for segment prior to encoding: take the average of the target + // number for the frame to be encoded and the actual from the previous frame. + double weight_segment = + (double)((cr->target_num_seg_blocks + cr->actual_num_seg1_blocks + + cr->actual_num_seg2_blocks) >> + 1) / + num4x4bl; + // Compute delta-q corresponding to qindex i. + int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); + // Take segment weighted average for bits per mb. + bits_per_mb = + (int)((1.0 - weight_segment) * + av1_rc_bits_per_mb(cm->current_frame.frame_type, i, + correction_factor, + cm->seq_params.bit_depth) + + weight_segment * av1_rc_bits_per_mb(cm->current_frame.frame_type, + i + deltaq, correction_factor, + cm->seq_params.bit_depth)); + return bits_per_mb; +} + +// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col), +// check if we should reset the segment_id, and update the cyclic_refresh map +// and segmentation map. +void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, + MB_MODE_INFO *const mbmi, int mi_row, + int mi_col, BLOCK_SIZE bsize, + int64_t rate, int64_t dist, int skip) { + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw); + const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh); + const int block_index = mi_row * cm->mi_params.mi_cols + mi_col; + const int refresh_this_block = + candidate_refresh_aq(cr, mbmi, rate, dist, bsize); + // Default is to not update the refresh map. + int new_map_value = cr->map[block_index]; + + // If this block is labeled for refresh, check if we should reset the + // segment_id. + if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) { + mbmi->segment_id = refresh_this_block; + // Reset segment_id if the block will be skipped. + if (skip) mbmi->segment_id = CR_SEGMENT_ID_BASE; + } + + // Update the cyclic refresh map, to be used for setting segmentation map + // for the next frame. If the block will be refreshed this frame, mark it + // as clean.
The magnitude of the negative value influences how long before we consider + // it for refresh again. + if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) { + new_map_value = -cr->time_for_refresh; + } else if (refresh_this_block) { + // Else if it is accepted as a candidate for refresh, and has not already + // been refreshed (marked as 1) then mark it as a candidate for cleanup + // for future time (marked as 0), otherwise don't update it. + if (cr->map[block_index] == 1) new_map_value = 0; + } else { + // Leave it marked as a block that is not a candidate for refresh. + new_map_value = 1; + } + + // Update entries in the cyclic refresh map with new_map_value, and + // copy mbmi->segment_id into global segmentation map. + for (int y = 0; y < ymis; y++) + for (int x = 0; x < xmis; x++) { + int map_offset = block_index + y * cm->mi_params.mi_cols + x; + cr->map[map_offset] = new_map_value; + cpi->enc_seg.map[map_offset] = mbmi->segment_id; + } +} + +// Update some stats after frame encoding is done. +void av1_cyclic_refresh_postencode(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + unsigned char *const seg_map = cpi->enc_seg.map; + cr->cnt_zeromv = 0; + cr->actual_num_seg1_blocks = 0; + cr->actual_num_seg2_blocks = 0; + for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row++) { + for (int mi_col = 0; mi_col < mi_params->mi_cols; mi_col++) { + MB_MODE_INFO **mi = + mi_params->mi_grid_base + mi_row * mi_params->mi_stride + mi_col; + MV mv = mi[0]->mv[0].as_mv; + if (cm->seg.enabled) { + int map_index = mi_row * mi_params->mi_cols + mi_col; + if (cyclic_refresh_segment_id(seg_map[map_index]) == + CR_SEGMENT_ID_BOOST1) + cr->actual_num_seg1_blocks++; + else if (cyclic_refresh_segment_id(seg_map[map_index]) == + CR_SEGMENT_ID_BOOST2) + cr->actual_num_seg2_blocks++; + } + // Count blocks with (near) zero motion. + if (is_inter_block(mi[0]) && abs(mv.row) < 16 && abs(mv.col) < 16) + cr->cnt_zeromv++; + } + } + cr->cnt_zeromv = + 100 * cr->cnt_zeromv / (mi_params->mi_rows * mi_params->mi_cols); + cr->avg_frame_low_motion = + (3 * cr->avg_frame_low_motion + (double)cr->cnt_zeromv) / 4; +} + +// Set golden frame update interval, for 1 pass CBR mode. +void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) { + RATE_CONTROL *const rc = &cpi->rc; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + // Set minimum gf_interval for GF update to a multiple of the refresh period, + // with some max limit. Depending on past encoding stats, GF flag may be + // reset and update may not occur until next baseline_gf_interval. + if (cr->percent_refresh > 0) + rc->baseline_gf_interval = AOMMIN(2 * (100 / cr->percent_refresh), 40); + else + rc->baseline_gf_interval = 20; + if (cr->avg_frame_low_motion < 40) rc->baseline_gf_interval = 8; +} + +// Update the segmentation map, and related quantities: cyclic refresh map, +// refresh sb_index, and target number of blocks to be refreshed. +// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to +// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock. +// Blocks labeled as BOOST1 may later get set to BOOST2 (during the +// encoding of the superblock).
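+// Refresh-map value convention (from the update logic above): 1 means the
+// block is not a refresh candidate, 0 means it is a candidate, and a
+// negative value means it was recently refreshed and counts up by one per
+// map update until it becomes a candidate again.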
+static void cyclic_refresh_update_map(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + unsigned char *const seg_map = cpi->enc_seg.map; + int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame; + int xmis, ymis, x, y; + memset(seg_map, CR_SEGMENT_ID_BASE, mi_params->mi_rows * mi_params->mi_cols); + sb_cols = (mi_params->mi_cols + cm->seq_params.mib_size - 1) / + cm->seq_params.mib_size; + sb_rows = (mi_params->mi_rows + cm->seq_params.mib_size - 1) / + cm->seq_params.mib_size; + sbs_in_frame = sb_cols * sb_rows; + // Number of target blocks to get the q delta (segment 1). + block_count = + cr->percent_refresh * mi_params->mi_rows * mi_params->mi_cols / 100; + // Set the segmentation map: cycle through the superblocks, starting at + // cr->sb_index, and stopping when either block_count blocks have been found + // to be refreshed, or we have passed through the whole frame. + if (cr->sb_index >= sbs_in_frame) cr->sb_index = 0; + assert(cr->sb_index < sbs_in_frame); + i = cr->sb_index; + cr->target_num_seg_blocks = 0; + do { + int sum_map = 0; + // Get the mi_row/mi_col corresponding to superblock index i. + int sb_row_index = (i / sb_cols); + int sb_col_index = i - sb_row_index * sb_cols; + int mi_row = sb_row_index * cm->seq_params.mib_size; + int mi_col = sb_col_index * cm->seq_params.mib_size; + // TODO(any): Ensure the population of + // cpi->common.features.allow_screen_content_tools and use the same instead + // of cpi->oxcf.content == AOM_CONTENT_SCREEN + int qindex_thresh = cpi->oxcf.content == AOM_CONTENT_SCREEN + ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, + cm->quant_params.base_qindex) + : 0; + assert(mi_row >= 0 && mi_row < mi_params->mi_rows); + assert(mi_col >= 0 && mi_col < mi_params->mi_cols); + bl_index = mi_row * mi_params->mi_cols + mi_col; + // Loop through all MI blocks in superblock and update map. + xmis = AOMMIN(mi_params->mi_cols - mi_col, cm->seq_params.mib_size); + ymis = AOMMIN(mi_params->mi_rows - mi_row, cm->seq_params.mib_size); + for (y = 0; y < ymis; y++) { + for (x = 0; x < xmis; x++) { + const int bl_index2 = bl_index + y * mi_params->mi_cols + x; + // If the block is a candidate for cleanup then mark it + // for possible boost/refresh (segment 1). The segment id may get + // reset to 0 later if the block gets coded as anything other than + // GLOBALMV. + if (cr->map[bl_index2] == 0) { + if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map++; + } else if (cr->map[bl_index2] < 0) { + cr->map[bl_index2]++; + } + } + } + // Enforce constant segment over superblock. + // If segment is at least half of superblock, set to 1. + if (sum_map >= xmis * ymis / 2) { + for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) { + seg_map[bl_index + y * mi_params->mi_cols + x] = CR_SEGMENT_ID_BOOST1; + } + cr->target_num_seg_blocks += xmis * ymis; + } + i++; + if (i == sbs_in_frame) { + i = 0; + } + } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index); + cr->sb_index = i; +} + +// Set cyclic refresh parameters. +void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) { + // TODO(marpan): Parameters need to be tuned.
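+ // Rough worked example of the defaults below: percent_refresh = 10
+ // targets ~10% of the frame per cycle, so with zero motion a block is
+ // revisited about every 100 / percent_refresh = 10 frames.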
+ const RATE_CONTROL *const rc = &cpi->rc; + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int num4x4bl = cm->mi_params.MBs << 4; + int target_refresh = 0; + double weight_segment_target = 0; + double weight_segment = 0; + int qp_thresh = AOMMIN(20, rc->best_quality << 1); + int qp_max_thresh = 118 * MAXQ >> 7; + cr->apply_cyclic_refresh = 1; + if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf) || + cpi->svc.temporal_layer_id > 0 || + rc->avg_frame_qindex[INTER_FRAME] < qp_thresh || + (rc->frames_since_key > 20 && + rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) || + (cr->avg_frame_low_motion < 45 && rc->frames_since_key > 40)) { + cr->apply_cyclic_refresh = 0; + return; + } + cr->percent_refresh = 10; + cr->max_qdelta_perc = 60; + cr->time_for_refresh = 0; + cr->motion_thresh = 32; + cr->rate_boost_fac = 15; + // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4) + // periods of the refresh cycle, after a key frame. + // Account for larger interval on base layer for temporal layers. + if (cr->percent_refresh > 0 && + rc->frames_since_key < 400 / cr->percent_refresh) { + cr->rate_ratio_qdelta = 3.0; + } else { + cr->rate_ratio_qdelta = 2.0; + } + // Adjust some parameters for low resolutions. + if (cm->width * cm->height <= 352 * 288) { + if (rc->avg_frame_bandwidth < 3000) { + cr->motion_thresh = 16; + cr->rate_boost_fac = 13; + } else { + cr->max_qdelta_perc = 70; + cr->rate_ratio_qdelta = AOMMAX(cr->rate_ratio_qdelta, 2.5); + } + } + if (cpi->oxcf.rc_mode == AOM_VBR) { + // To be adjusted for VBR mode, e.g., based on gf period and boost. + // For now use smaller qp-delta (than CBR), no second boosted seg, and + // turn off refresh on a golden refresh (since it's already boosted). + cr->percent_refresh = 10; + cr->rate_ratio_qdelta = 1.5; + cr->rate_boost_fac = 10; + if (cpi->refresh_golden_frame == 1) { + cr->percent_refresh = 0; + cr->rate_ratio_qdelta = 1.0; + } + } + // Weight for segment prior to encoding: take the average of the target + // number for the frame to be encoded and the actual from the previous frame. + // Use the target if it's less. To be used for setting the base qp for the + // frame in av1_rc_regulate_q. + target_refresh = + cr->percent_refresh * cm->mi_params.mi_rows * cm->mi_params.mi_cols / 100; + weight_segment_target = (double)(target_refresh) / num4x4bl; + weight_segment = (double)((target_refresh + cr->actual_num_seg1_blocks + + cr->actual_num_seg2_blocks) >> + 1) / + num4x4bl; + if (weight_segment_target < 7 * weight_segment / 8) + weight_segment = weight_segment_target; + cr->weight_segment = weight_segment; +} + +// Setup cyclic background refresh: set delta q and segmentation map. +void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + struct segmentation *const seg = &cm->seg; + int resolution_change = + cm->prev_frame && (cm->width != cm->prev_frame->width || + cm->height != cm->prev_frame->height); + if (resolution_change) av1_cyclic_refresh_reset_resize(cpi); + if (cm->current_frame.frame_number == 0) cr->low_content_avg = 0.0; + if (!cr->apply_cyclic_refresh) { + // Set segmentation map to 0 and disable.
+ unsigned char *const seg_map = cpi->enc_seg.map; + memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + av1_disable_segmentation(&cm->seg); + if (cm->current_frame.frame_type == KEY_FRAME) { + memset(cr->last_coded_q_map, MAXQ, + cm->mi_params.mi_rows * cm->mi_params.mi_cols * + sizeof(*cr->last_coded_q_map)); + cr->sb_index = 0; + } + return; + } else { + const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex, + cm->seq_params.bit_depth); + aom_clear_system_state(); + // Set rate threshold to some multiple (set to 2 for now) of the target + // rate (target is given by sb64_target_rate and scaled by 256). + cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2; + // Distortion threshold, quadratic in Q, scale factor to be adjusted. + // q will not exceed 457, so (q * q) is within 32bit; see: + // av1_convert_qindex_to_q(), av1_ac_quant(), ac_qlookup*[]. + cr->thresh_dist_sb = ((int64_t)(q * q)) << 2; + + // Set up segmentation. + // Clear down the segment map. + av1_enable_segmentation(&cm->seg); + av1_clearall_segfeatures(seg); + + // Note: setting temporal_update has no effect, as the seg-map coding method + // (temporal or spatial) is determined in + // av1_choose_segmap_coding_method(), + // based on the coding cost of each method. When error_resilient mode is + // on, the last_frame_seg_map is set to 0, so if temporal coding is used, + // it is relative to an all-zero previous map. + // seg->temporal_update = 0; + + // Segment BASE "Q" feature is disabled so it defaults to the baseline Q. + av1_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q); + // Use segment BOOST1 for in-frame Q adjustment. + av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q); + // Use segment BOOST2 for more aggressive in-frame Q adjustment. + av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q); + + // Set the q delta for segment BOOST1. + const CommonQuantParams *const quant_params = &cm->quant_params; + int qindex_delta = + compute_deltaq(cpi, quant_params->base_qindex, cr->rate_ratio_qdelta); + cr->qindex_delta[1] = qindex_delta; + + // Compute rd-mult for segment BOOST1. + const int qindex2 = clamp( + quant_params->base_qindex + quant_params->y_dc_delta_q + qindex_delta, + 0, MAXQ); + cr->rdmult = av1_compute_rd_mult(cpi, qindex2); + + av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta); + + // Set a more aggressive (higher) q delta for segment BOOST2. + qindex_delta = compute_deltaq( + cpi, quant_params->base_qindex, + AOMMIN(CR_MAX_RATE_TARGET_RATIO, + 0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta)); + cr->qindex_delta[2] = qindex_delta; + av1_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta); + + // Update the segmentation and refresh map. + cyclic_refresh_update_map(cpi); + } +} + +int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) { + return cr->rdmult; +} + +void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) { + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + memset(cr->map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + cr->sb_index = 0; + cpi->refresh_golden_frame = 1; +} diff --git a/libs/libaom/src/av1/encoder/aq_cyclicrefresh.h b/libs/libaom/src/av1/encoder/aq_cyclicrefresh.h new file mode 100644 index 000000000..ee62f6aaa --- /dev/null +++ b/libs/libaom/src/av1/encoder/aq_cyclicrefresh.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2016, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ +#define AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ + +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// The segment ids used in cyclic refresh: from base (no boost) to increasing +// boost (higher delta-qp). +#define CR_SEGMENT_ID_BASE 0 +#define CR_SEGMENT_ID_BOOST1 1 +#define CR_SEGMENT_ID_BOOST2 2 + +// Maximum rate target ratio for setting segment delta-qp. +#define CR_MAX_RATE_TARGET_RATIO 4.0 + +struct CYCLIC_REFRESH { + // Percentage of blocks per frame that are targeted as candidates + // for cyclic refresh. + int percent_refresh; + // Maximum q-delta as percentage of base q. + int max_qdelta_perc; + // Superblock starting index for cycling through the frame. + int sb_index; + // Controls how long a block will need to wait to be refreshed again, in + // excess of the cycle time, i.e., in the case of all zero motion, a block + // will be refreshed every (100/percent_refresh + time_for_refresh) frames. + int time_for_refresh; + // Target number of (4x4) blocks that are set for delta-q. + int target_num_seg_blocks; + // Actual number of (4x4) blocks that were applied delta-q. + int actual_num_seg1_blocks; + int actual_num_seg2_blocks; + // RD multiplier for segment 1. + int rdmult; + // Cyclic refresh map. + int8_t *map; + // Map of the last q a block was coded at. + uint8_t *last_coded_q_map; + // Thresholds applied to the projected rate/distortion of the coding block, + // when deciding whether the block should be refreshed. + int64_t thresh_rate_sb; + int64_t thresh_dist_sb; + // Threshold applied to the motion vector (in units of 1/8 pel) of the + // coding block, when deciding whether the block should be refreshed. + int16_t motion_thresh; + // Rate target ratio to set q delta. + double rate_ratio_qdelta; + // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2. + int rate_boost_fac; + double low_content_avg; + int qindex_delta[3]; + double weight_segment; + int apply_cyclic_refresh; + int cnt_zeromv; + double avg_frame_low_motion; +}; + +struct AV1_COMP; + +typedef struct CYCLIC_REFRESH CYCLIC_REFRESH; + +CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols); + +void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr); + +// Estimate the bits, incorporating the delta-q from segment 1, after encoding +// the frame. +int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi, + double correction_factor); + +// Estimate the bits per mb, for a given q = i and a corresponding delta-q +// (for segment 1), prior to encoding the frame. +int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i, + double correction_factor); + +// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col), +// check if we should reset the segment_id, and update the cyclic_refresh map +// and segmentation map.
+void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi,
+                                       MB_MODE_INFO *const mbmi, int mi_row,
+                                       int mi_col, BLOCK_SIZE bsize,
+                                       int64_t rate, int64_t dist, int skip);
+
+// Update some stats after the frame has been encoded.
+void av1_cyclic_refresh_postencode(struct AV1_COMP *const cpi);
+
+// Set golden frame update interval, for 1 pass CBR mode.
+void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi);
+
+// Set/update global/frame level refresh parameters.
+void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi);
+
+// Set up cyclic background refresh: set delta q and segmentation map.
+void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi);
+
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+
+void av1_cyclic_refresh_reset_resize(struct AV1_COMP *const cpi);
+
+static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
+  return segment_id == CR_SEGMENT_ID_BOOST1 ||
+         segment_id == CR_SEGMENT_ID_BOOST2;
+}
+
+static INLINE int cyclic_refresh_segment_id(int segment_id) {
+  if (segment_id == CR_SEGMENT_ID_BOOST1)
+    return CR_SEGMENT_ID_BOOST1;
+  else if (segment_id == CR_SEGMENT_ID_BOOST2)
+    return CR_SEGMENT_ID_BOOST2;
+  else
+    return CR_SEGMENT_ID_BASE;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
diff --git a/libs/libaom/src/av1/encoder/aq_variance.c b/libs/libaom/src/av1/encoder/aq_variance.c
new file mode 100644
index 000000000..4176da292
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/aq_variance.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "aom_ports/mem.h"
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/dwt.h"
+#include "aom_ports/system_state.h"
+
+static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0,
+                                                 0.9, .8,  .7,  .6 };
+
+static const double deltaq_rate_ratio[MAX_SEGMENTS] = { 2.5,  2.0, 1.5, 1.0,
+                                                        0.75, 1.0, 1.0, 1.0 };
+#define ENERGY_MIN (-4)
+#define ENERGY_MAX (1)
+#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
+#define ENERGY_IN_BOUNDS(energy) \
+  assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
+
+DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+DECLARE_ALIGNED(16, static const uint16_t,
+                av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
+
+#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
+
+void av1_vaq_frame_setup(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  const int base_qindex = cm->quant_params.base_qindex;
+  struct segmentation *seg = &cm->seg;
+  int i;
+
+  int resolution_change =
+      cm->prev_frame && (cm->width != cm->prev_frame->width ||
+                         cm->height != cm->prev_frame->height);
+  int avg_energy = (int)(cpi->twopass.mb_av_energy - 2);
+  double avg_ratio;
+  if (avg_energy > 7) avg_energy = 7;
+  if (avg_energy < 0) avg_energy = 0;
+  avg_ratio = rate_ratio[avg_energy];
+
+  if (resolution_change) {
+    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+    av1_clearall_segfeatures(seg);
+    aom_clear_system_state();
+    av1_disable_segmentation(seg);
+    return;
+  }
+  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+      cpi->refresh_alt_ref_frame ||
+      (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+    cpi->vaq_refresh = 1;
+
+    av1_enable_segmentation(seg);
+    av1_clearall_segfeatures(seg);
+
+    aom_clear_system_state();
+
+    for (i = 0; i < MAX_SEGMENTS; ++i) {
+      // Set the average segment's rate ratio to 1.0 and adjust the other
+      // segments around it.
+      int qindex_delta = av1_compute_qdelta_by_rate(
+          &cpi->rc, cm->current_frame.frame_type, base_qindex,
+          rate_ratio[i] / avg_ratio, cm->seq_params.bit_depth);
+
+      // We don't allow qindex 0 in a segment if the base value is not 0.
+      // Q index 0 (lossless) implies 4x4 encoding only, and in AQ mode a
+      // segment Q delta is sometimes applied without going back around the
+      // rd loop. This could lead to an illegal combination of partition size
+      // and q.
+      if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
+        qindex_delta = -base_qindex + 1;
+      }
+
+      av1_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta);
+      av1_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+    }
+  }
+}
+
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  // This function returns a score for the block's local variance, calculated
+  // as: the sum of the log of the (4x4) variances of each subblock of the
+  // current block (x, bs), multiplied by 32 and divided by the number of
+  // pixels in the block. It is used for segmentation to avoid situations in
+  // which a large block with a gentle gradient gets marked as high variance
+  // even though each subblock has a low variance. This allows us to assign
+  // the same segment number to the same sorts of area regardless of how the
+  // partitioning goes.
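+  //
+  // Illustrative example (hypothetical numbers): a smooth 64x64 ramp whose
+  // 4x4 subblocks each have a per-pixel variance near 0 contributes roughly
+  // log(1.0 + 0) = 0 per subblock, so the averaged score stays low, whereas
+  // a single variance taken over the whole ramp would be large.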
+
+  MACROBLOCKD *xd = &x->e_mbd;
+  double var = 0;
+  unsigned int sse;
+  int i, j;
+
+  int right_overflow =
+      (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+  int bottom_overflow =
+      (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+
+  const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+  const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+
+  aom_clear_system_state();
+
+  for (i = 0; i < bh; i += 4) {
+    for (j = 0; j < bw; j += 4) {
+      if (is_cur_buf_hbd(xd)) {
+        var +=
+            log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+                          x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+                          x->plane[0].src.stride,
+                          CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) /
+                          16);
+      } else {
+        var +=
+            log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+                          x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+                          x->plane[0].src.stride, av1_all_zeros, 0, &sse) /
+                          16);
+      }
+    }
+  }
+  // Use the average of the 4x4 log variances. For 8-bit input the range is
+  // 0 - 9.704121561.
+  var /= (bw / 4 * bh / 4);
+  if (var > 7) var = 7;
+
+  aom_clear_system_state();
+  return (int)(var);
+}
+
+#define DEFAULT_E_MIDPOINT 10.0
+
+static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  int stride = x->plane[0].src.stride;
+  uint8_t *buf = x->plane[0].src.buf;
+  const int bw = MI_SIZE * mi_size_wide[bs];
+  const int bh = MI_SIZE * mi_size_high[bs];
+  const int hbd = is_cur_buf_hbd(xd);
+
+  int var = 0;
+  for (int r = 0; r < bh; r += 8)
+    for (int c = 0; c < bw; c += 8) {
+      var += av1_haar_ac_sad_8x8_uint8_input(buf + c + r * stride, stride, hbd);
+    }
+
+  return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs];
+}
+
+double av1_log_block_wavelet_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+  unsigned int haar_sad = haar_ac_energy(x, bs);
+  aom_clear_system_state();
+  return log(haar_sad + 1.0);
+}
+
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+                                   BLOCK_SIZE bs) {
+  double energy, energy_midpoint;
+  aom_clear_system_state();
+  energy_midpoint = (is_stat_consumption_stage_twopass(cpi))
+                        ? cpi->twopass.frame_avg_haar_energy
+                        : DEFAULT_E_MIDPOINT;
+  energy = av1_log_block_wavelet_energy(x, bs) - energy_midpoint;
+  return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
+}
+
+int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
+                                                int block_var_level) {
+  int rate_level;
+  const AV1_COMMON *const cm = &cpi->common;
+
+  if (DELTA_Q_PERCEPTUAL_MODULATION == 1) {
+    ENERGY_IN_BOUNDS(block_var_level);
+    rate_level = SEGMENT_ID(block_var_level);
+  } else {
+    rate_level = block_var_level;
+  }
+  const int base_qindex = cm->quant_params.base_qindex;
+  int qindex_delta = av1_compute_qdelta_by_rate(
+      &cpi->rc, cm->current_frame.frame_type, base_qindex,
+      deltaq_rate_ratio[rate_level], cm->seq_params.bit_depth);
+
+  if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
+    qindex_delta = -base_qindex + 1;
+  }
+  return base_qindex + qindex_delta;
+}
diff --git a/libs/libaom/src/av1/encoder/aq_variance.h b/libs/libaom/src/av1/encoder/aq_variance.h
new file mode 100644
index 000000000..543eb0b51
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/aq_variance.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_VARIANCE_H_
+#define AOM_AV1_ENCODER_AQ_VARIANCE_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_vaq_frame_setup(AV1_COMP *cpi);
+
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
+                                                int block_var_level);
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+                                   BLOCK_SIZE bs);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_AQ_VARIANCE_H_
diff --git a/libs/libaom/src/av1/encoder/arm/neon/av1_error_neon.c b/libs/libaom/src/av1/encoder/arm/neon/av1_error_neon.c
new file mode 100644
index 000000000..22da1a8d6
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/arm/neon/av1_error_neon.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "av1/common/arm/mem_neon.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                             intptr_t block_size, int64_t *ssz) {
+  int64x2_t error = vdupq_n_s64(0);
+  int64x2_t sqcoeff = vdupq_n_s64(0);
+
+  assert(block_size >= 8);
+  assert((block_size % 8) == 0);
+
+  do {
+    const int16x8_t c = load_tran_low_to_s16q(coeff);
+    const int16x8_t d = load_tran_low_to_s16q(dqcoeff);
+    const int16x8_t diff = vsubq_s16(c, d);
+    const int16x4_t diff_lo = vget_low_s16(diff);
+    const int16x4_t diff_hi = vget_high_s16(diff);
+    // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
+    // accumulating them in 64-bits.
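+    // (Worked through: |diff| < 2^15, so each square is < 2^30; vmlal_s16
+    // adds two such squares per 32-bit lane, staying below 2^31, and
+    // vaddl_s32 widens to 64 bits before the running sum can overflow.)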
+    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
+    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
+    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
+    error = vaddq_s64(error, err2);
+
+    const int16x4_t coeff_lo = vget_low_s16(c);
+    const int16x4_t coeff_hi = vget_high_s16(c);
+    const int32x4_t sqcoeff0 = vmull_s16(coeff_lo, coeff_lo);
+    const int32x4_t sqcoeff1 = vmlal_s16(sqcoeff0, coeff_hi, coeff_hi);
+    const int64x2_t sqcoeff2 =
+        vaddl_s32(vget_low_s32(sqcoeff1), vget_high_s32(sqcoeff1));
+    sqcoeff = vaddq_s64(sqcoeff, sqcoeff2);
+
+    coeff += 8;
+    dqcoeff += 8;
+    block_size -= 8;
+  } while (block_size != 0);
+
+#if defined(__aarch64__)
+  *ssz = vaddvq_s64(sqcoeff);
+  return vaddvq_s64(error);
+#else
+  *ssz = vgetq_lane_s64(sqcoeff, 0) + vgetq_lane_s64(sqcoeff, 1);
+  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
+#endif
+}
+
+int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff,
+                                int block_size) {
+  int64x2_t error = vdupq_n_s64(0);
+
+  assert(block_size >= 8);
+  assert((block_size % 8) == 0);
+
+  do {
+    const int16x8_t c = vld1q_s16(coeff);
+    const int16x8_t d = vld1q_s16(dqcoeff);
+    const int16x8_t diff = vsubq_s16(c, d);
+    const int16x4_t diff_lo = vget_low_s16(diff);
+    const int16x4_t diff_hi = vget_high_s16(diff);
+    // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
+    // accumulating them in 64-bits.
+    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
+    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
+    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
+    error = vaddq_s64(error, err2);
+    coeff += 8;
+    dqcoeff += 8;
+    block_size -= 8;
+  } while (block_size != 0);
+
+  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
+}
diff --git a/libs/libaom/src/av1/encoder/arm/neon/quantize_neon.c b/libs/libaom/src/av1/encoder/arm/neon/quantize_neon.c
new file mode 100644
index 000000000..c2f50a217
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/arm/neon/quantize_neon.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include <math.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/arm/mem_neon.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
+                          const int16_t *zbin_ptr, const int16_t *round_ptr,
+                          const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
+  // TODO(jingning): Decide whether these arguments are needed once the
+  // quantization process is finalized.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)scan;
+
+  // Quantization pass: All coefficients with index >= zero_flag are
+  // skippable. Note: zero_flag can be zero.
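+  // In scalar terms, each lane below computes, per coefficient:
+  //   sign    = coeff >> 15          (all ones if negative)
+  //   abs_q   = ((|coeff| + round) * quant) >> 16
+  //   qcoeff  = (abs_q ^ sign) - sign
+  //   dqcoeff = qcoeff * dequant
+  // and eob is tracked as 1 + the largest iscan position with qcoeff != 0.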
+ int i; + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_one = vdupq_n_s16(1); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + int16x8_t v_round = vmovq_n_s16(round_ptr[1]); + int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); + int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); + // adjust for dc + v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); + v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); + v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + // process dc and the first seven ac coeffs + { + const int16x8_t v_iscan = vld1q_s16(&iscan[0]); + const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + store_s16q_to_tran_low(&qcoeff_ptr[0], v_qcoeff); + store_s16q_to_tran_low(&dqcoeff_ptr[0], v_dqcoeff); + v_round = vmovq_n_s16(round_ptr[1]); + v_quant = vmovq_n_s16(quant_ptr[1]); + v_dequant = vmovq_n_s16(dequant_ptr[1]); + } + // now process the rest of the ac coeffs + for (i = 8; i < count; i += 8) { + const int16x8_t v_iscan = vld1q_s16(&iscan[i]); + const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + store_s16q_to_tran_low(&qcoeff_ptr[i], v_qcoeff); + store_s16q_to_tran_low(&dqcoeff_ptr[i], v_dqcoeff); + } +#ifdef __aarch64__ + *eob_ptr = vmaxvq_s16(v_eobmax_76543210); +#else + { + const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210), + vget_high_s16(v_eobmax_76543210)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + + *eob_ptr = 
(uint16_t)vget_lane_s16(v_eobmax_final, 0); + } +#endif // __aarch64__ +} + +static INLINE void calculate_dqcoeff_lp_and_store(const int16x8_t qcoeff, + const int16x8_t dequant, + int16_t *dqcoeff) { + const int32x4_t dqcoeff_0 = + vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + const int32x4_t dqcoeff_1 = + vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + + vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1))); +} + +void av1_quantize_lp_neon(const int16_t *coeff_ptr, intptr_t count, + const int16_t *round_ptr, const int16_t *quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_one = vdupq_n_s16(1); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + int16x8_t v_round = vmovq_n_s16(round_ptr[1]); + int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); + int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); + + // adjust for dc + v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); + v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); + v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + // process dc and the first seven ac coeffs + { + const int16x8_t v_iscan = vld1q_s16(&scan[0]); + const int16x8_t v_coeff = vld1q_s16(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + calculate_dqcoeff_lp_and_store(v_qcoeff, v_dequant, dqcoeff_ptr); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + vst1q_s16(qcoeff_ptr, v_qcoeff); + v_round = vmovq_n_s16(round_ptr[1]); + v_quant = vmovq_n_s16(quant_ptr[1]); + v_dequant = vmovq_n_s16(dequant_ptr[1]); + } + // now process the rest of the ac coeffs + for (int i = 8; i < count; i += 8) { + const int16x8_t v_iscan = vld1q_s16(&scan[i]); + const int16x8_t v_coeff = vld1q_s16(coeff_ptr + i); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + calculate_dqcoeff_lp_and_store(v_qcoeff, v_dequant, dqcoeff_ptr + i); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, 
v_nz_iscan);
+    vst1q_s16(qcoeff_ptr + i, v_qcoeff);
+  }
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_s16(v_eobmax_76543210);
+#else
+  {
+    const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
+                                             vget_high_s16(v_eobmax_76543210));
+    const int64x1_t v_eobmax_xx32 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+    const int16x4_t v_eobmax_tmp =
+        vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+    const int64x1_t v_eobmax_xxx3 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+    const int16x4_t v_eobmax_final =
+        vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+    *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+  }
+#endif  // __aarch64__
+}
diff --git a/libs/libaom/src/av1/encoder/av1_fwd_txfm1d.c b/libs/libaom/src/av1/encoder/av1_fwd_txfm1d.c
new file mode 100644
index 000000000..6601c19ab
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/av1_fwd_txfm1d.c
@@ -0,0 +1,1885 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "av1/common/av1_txfm.h"
+
+void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range) {
+  const int32_t size = 4;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[4];
+
+  // stage 0;
+  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+  // stage 1;
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0] + input[3];
+  bf1[1] = input[1] + input[2];
+  bf1[2] = -input[2] + input[1];
+  bf1[3] = -input[3] + input[0];
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr(cos_bit);
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[2];
+  bf1[2] = bf0[1];
+  bf1[3] = bf0[3];
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range) {
+  const int32_t size = 8;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[8];
+
+  // stage 0;
+  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+  // stage 1;
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0] + input[7];
+  bf1[1] = input[1] + input[6];
+  bf1[2] = input[2] + input[5];
+  bf1[3] = input[3] + input[4];
+  bf1[4] = -input[4] + input[3];
+  bf1[5] = -input[5] + input[2];
+  bf1[6] = -input[6] + input[1];
+  bf1[7] = -input[7] + input[0];
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr(cos_bit);
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] +
bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[4]; + bf1[2] = bf0[2]; + bf1[3] = bf0[6]; + bf1[4] = bf0[1]; + bf1[5] = bf0[5]; + bf1[6] = bf0[3]; + bf1[7] = bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 16; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[15]; + bf1[1] = input[1] + input[14]; + bf1[2] = input[2] + input[13]; + bf1[3] = input[3] + input[12]; + bf1[4] = input[4] + input[11]; + bf1[5] = input[5] + input[10]; + bf1[6] = input[6] + input[9]; + bf1[7] = input[7] + input[8]; + bf1[8] = -input[8] + input[7]; + bf1[9] = -input[9] + input[6]; + bf1[10] = -input[10] + input[5]; + bf1[11] = -input[11] + input[4]; + bf1[12] = -input[12] + input[3]; + bf1[13] = -input[13] + input[2]; + bf1[14] = -input[14] + input[1]; + bf1[15] = -input[15] + input[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; 
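+  // (Note: half_btf(w0, in0, w1, in1, bit), from av1/common/av1_txfm.h, is
+  // the rotation butterfly round_shift(w0 * in0 + w1 * in1, bit), with
+  // cospi[] holding cosine constants scaled by 2^cos_bit.)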
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[8]; + bf1[2] = bf0[4]; + bf1[3] = bf0[12]; + bf1[4] = bf0[2]; + bf1[5] = bf0[10]; + bf1[6] = bf0[6]; + bf1[7] = bf0[14]; + bf1[8] = bf0[1]; + bf1[9] = bf0[9]; + bf1[10] = bf0[5]; + bf1[11] = bf0[13]; + bf1[12] = bf0[3]; + bf1[13] = bf0[11]; + bf1[14] = bf0[7]; + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const 
int32_t size = 32; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[32]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[31]; + bf1[1] = input[1] + input[30]; + bf1[2] = input[2] + input[29]; + bf1[3] = input[3] + input[28]; + bf1[4] = input[4] + input[27]; + bf1[5] = input[5] + input[26]; + bf1[6] = input[6] + input[25]; + bf1[7] = input[7] + input[24]; + bf1[8] = input[8] + input[23]; + bf1[9] = input[9] + input[22]; + bf1[10] = input[10] + input[21]; + bf1[11] = input[11] + input[20]; + bf1[12] = input[12] + input[19]; + bf1[13] = input[13] + input[18]; + bf1[14] = input[14] + input[17]; + bf1[15] = input[15] + input[16]; + bf1[16] = -input[16] + input[15]; + bf1[17] = -input[17] + input[14]; + bf1[18] = -input[18] + input[13]; + bf1[19] = -input[19] + input[12]; + bf1[20] = -input[20] + input[11]; + bf1[21] = -input[21] + input[10]; + bf1[22] = -input[22] + input[9]; + bf1[23] = -input[23] + input[8]; + bf1[24] = -input[24] + input[7]; + bf1[25] = -input[25] + input[6]; + bf1[26] = -input[26] + input[5]; + bf1[27] = -input[27] + input[4]; + bf1[28] = -input[28] + input[3]; + bf1[29] = -input[29] + input[2]; + bf1[30] = -input[30] + input[1]; + bf1[31] = -input[31] + input[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[15]; + bf1[1] = bf0[1] + bf0[14]; + bf1[2] = bf0[2] + bf0[13]; + bf1[3] = bf0[3] + bf0[12]; + bf1[4] = bf0[4] + bf0[11]; + bf1[5] = bf0[5] + bf0[10]; + bf1[6] = bf0[6] + bf0[9]; + bf1[7] = bf0[7] + bf0[8]; + bf1[8] = -bf0[8] + bf0[7]; + bf1[9] = -bf0[9] + bf0[6]; + bf1[10] = -bf0[10] + bf0[5]; + bf1[11] = -bf0[11] + bf0[4]; + bf1[12] = -bf0[12] + bf0[3]; + bf1[13] = -bf0[13] + bf0[2]; + bf1[14] = -bf0[14] + bf0[1]; + bf1[15] = -bf0[15] + bf0[0]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[23]; + bf1[17] = bf0[17] + bf0[22]; + bf1[18] = bf0[18] + bf0[21]; + 
bf1[19] = bf0[19] + bf0[20]; + bf1[20] = -bf0[20] + bf0[19]; + bf1[21] = -bf0[21] + bf0[18]; + bf1[22] = -bf0[22] + bf0[17]; + bf1[23] = -bf0[23] + bf0[16]; + bf1[24] = -bf0[24] + bf0[31]; + bf1[25] = -bf0[25] + bf0[30]; + bf1[26] = -bf0[26] + bf0[29]; + bf1[27] = -bf0[27] + bf0[28]; + bf1[28] = bf0[28] + bf0[27]; + bf1[29] = bf0[29] + bf0[26]; + bf1[30] = bf0[30] + bf0[25]; + bf1[31] = bf0[31] + bf0[24]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[19]; + bf1[17] = bf0[17] + bf0[18]; + bf1[18] = -bf0[18] + bf0[17]; + bf1[19] = -bf0[19] + bf0[16]; + bf1[20] = -bf0[20] + bf0[23]; + bf1[21] = -bf0[21] + bf0[22]; + bf1[22] = bf0[22] + bf0[21]; + bf1[23] = bf0[23] + bf0[20]; + bf1[24] = bf0[24] + bf0[27]; + bf1[25] = bf0[25] + bf0[26]; + bf1[26] = -bf0[26] + bf0[25]; + bf1[27] = -bf0[27] + bf0[24]; + bf1[28] = -bf0[28] + bf0[31]; + bf1[29] = -bf0[29] + bf0[30]; + bf1[30] = bf0[30] + bf0[29]; + bf1[31] = bf0[31] + bf0[28]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + 
bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + bf1[16] = bf0[16] + bf0[17]; + bf1[17] = -bf0[17] + bf0[16]; + bf1[18] = -bf0[18] + bf0[19]; + bf1[19] = bf0[19] + bf0[18]; + bf1[20] = bf0[20] + bf0[21]; + bf1[21] = -bf0[21] + bf0[20]; + bf1[22] = -bf0[22] + bf0[23]; + bf1[23] = bf0[23] + bf0[22]; + bf1[24] = bf0[24] + bf0[25]; + bf1[25] = -bf0[25] + bf0[24]; + bf1[26] = -bf0[26] + bf0[27]; + bf1[27] = bf0[27] + bf0[26]; + bf1[28] = bf0[28] + bf0[29]; + bf1[29] = -bf0[29] + bf0[28]; + bf1[30] = -bf0[30] + bf0[31]; + bf1[31] = bf0[31] + bf0[30]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); + bf1[23] = 
half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); + bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[16]; + bf1[2] = bf0[8]; + bf1[3] = bf0[24]; + bf1[4] = bf0[4]; + bf1[5] = bf0[20]; + bf1[6] = bf0[12]; + bf1[7] = bf0[28]; + bf1[8] = bf0[2]; + bf1[9] = bf0[18]; + bf1[10] = bf0[10]; + bf1[11] = bf0[26]; + bf1[12] = bf0[6]; + bf1[13] = bf0[22]; + bf1[14] = bf0[14]; + bf1[15] = bf0[30]; + bf1[16] = bf0[1]; + bf1[17] = bf0[17]; + bf1[18] = bf0[9]; + bf1[19] = bf0[25]; + bf1[20] = bf0[5]; + bf1[21] = bf0[21]; + bf1[22] = bf0[13]; + bf1[23] = bf0[29]; + bf1[24] = bf0[3]; + bf1[25] = bf0[19]; + bf1[26] = bf0[11]; + bf1[27] = bf0[27]; + bf1[28] = bf0[7]; + bf1[29] = bf0[23]; + bf1[30] = bf0[15]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + int bit = cos_bit; + const int32_t *sinpi = sinpi_arr(bit); + int32_t x0, x1, x2, x3; + int32_t s0, s1, s2, s3, s4, s5, s6, s7; + + // stage 0 + av1_range_check_buf(0, input, input, 4, stage_range[0]); + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + // stage 1 + s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]); + s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]); + s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]); + s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]); + s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]); + s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]); + s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]); + s7 = range_check_value(x0 + x1, stage_range[1]); + + // stage 2 + s7 = range_check_value(s7 - x3, stage_range[2]); + + // stage 3 + x0 = range_check_value(s0 + s2, bit + stage_range[3]); + x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]); + x2 = range_check_value(s1 - s3, bit + stage_range[3]); + x3 = range_check_value(s4, bit + stage_range[3]); + + // stage 4 + x0 = range_check_value(x0 + s5, bit + stage_range[4]); + x2 = range_check_value(x2 + s6, bit + stage_range[4]); + + // stage 5 + s0 = range_check_value(x0 + x3, bit + stage_range[5]); + s1 = range_check_value(x1, bit + stage_range[5]); + s2 = range_check_value(x2 - x3, bit + stage_range[5]); + s3 = range_check_value(x2 - x0, bit + stage_range[5]); + + // stage 6 + s3 = range_check_value(s3 + x3, bit + stage_range[6]); + + // 1-D transform scaling factor is sqrt(2). 
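+  // (round_shift(x, bit) computes (x + 2^(bit - 1)) >> bit, a rounded right
+  // shift that removes the sinpi[] fixed-point scaling applied above.)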
+ output[0] = round_shift(s0, bit); + output[1] = round_shift(s1, bit); + output[2] = round_shift(s2, bit); + output[3] = round_shift(s3, bit); + av1_range_check_buf(6, input, output, 4, stage_range[6]); +} + +void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 8; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[8]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + assert(output != input); + bf1 = output; + bf1[0] = input[0]; + bf1[1] = -input[7]; + bf1[2] = -input[3]; + bf1[3] = input[4]; + bf1[4] = -input[1]; + bf1[5] = input[6]; + bf1[6] = input[2]; + bf1[7] = -input[5]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[2]; + bf1[1] = bf0[1] + bf0[3]; + bf1[2] = bf0[0] - bf0[2]; + bf1[3] = bf0[1] - bf0[3]; + bf1[4] = bf0[4] + bf0[6]; + bf1[5] = bf0[5] + bf0[7]; + bf1[6] = bf0[4] - bf0[6]; + bf1[7] = bf0[5] - bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[4]; + bf1[1] = bf0[1] + bf0[5]; + bf1[2] = bf0[2] + bf0[6]; + bf1[3] = bf0[3] + bf0[7]; + bf1[4] = bf0[0] - bf0[4]; + bf1[5] = bf0[1] - bf0[5]; + bf1[6] = bf0[2] - bf0[6]; + bf1[7] = bf0[3] - bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[1]; + bf1[1] = bf0[6]; + bf1[2] = bf0[3]; + bf1[3] = bf0[4]; + bf1[4] = bf0[5]; + bf1[5] = bf0[2]; + bf1[6] = bf0[7]; + bf1[7] = bf0[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fadst16(const int32_t *input, int32_t 
*output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 16; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + assert(output != input); + bf1 = output; + bf1[0] = input[0]; + bf1[1] = -input[15]; + bf1[2] = -input[7]; + bf1[3] = input[8]; + bf1[4] = -input[3]; + bf1[5] = input[12]; + bf1[6] = input[4]; + bf1[7] = -input[11]; + bf1[8] = -input[1]; + bf1[9] = input[14]; + bf1[10] = input[6]; + bf1[11] = -input[9]; + bf1[12] = input[2]; + bf1[13] = -input[13]; + bf1[14] = -input[5]; + bf1[15] = input[10]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit); + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[2]; + bf1[1] = bf0[1] + bf0[3]; + bf1[2] = bf0[0] - bf0[2]; + bf1[3] = bf0[1] - bf0[3]; + bf1[4] = bf0[4] + bf0[6]; + bf1[5] = bf0[5] + bf0[7]; + bf1[6] = bf0[4] - bf0[6]; + bf1[7] = bf0[5] - bf0[7]; + bf1[8] = bf0[8] + bf0[10]; + bf1[9] = bf0[9] + bf0[11]; + bf1[10] = bf0[8] - bf0[10]; + bf1[11] = bf0[9] - bf0[11]; + bf1[12] = bf0[12] + bf0[14]; + bf1[13] = bf0[13] + bf0[15]; + bf1[14] = bf0[12] - bf0[14]; + bf1[15] = bf0[13] - bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[4]; + bf1[1] = bf0[1] + bf0[5]; + bf1[2] = bf0[2] + bf0[6]; + bf1[3] = bf0[3] + bf0[7]; + bf1[4] = bf0[0] - bf0[4]; + bf1[5] = bf0[1] - bf0[5]; + bf1[6] = bf0[2] - bf0[6]; + bf1[7] = bf0[3] - bf0[7]; + bf1[8] = bf0[8] + bf0[12]; + bf1[9] = bf0[9] + bf0[13]; + bf1[10] = bf0[10] + bf0[14]; + bf1[11] = bf0[11] + bf0[15]; + bf1[12] = bf0[8] - bf0[12]; + bf1[13] = bf0[9] - bf0[13]; + bf1[14] = 
bf0[10] - bf0[14]; + bf1[15] = bf0[11] - bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit); + bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[8]; + bf1[1] = bf0[1] + bf0[9]; + bf1[2] = bf0[2] + bf0[10]; + bf1[3] = bf0[3] + bf0[11]; + bf1[4] = bf0[4] + bf0[12]; + bf1[5] = bf0[5] + bf0[13]; + bf1[6] = bf0[6] + bf0[14]; + bf1[7] = bf0[7] + bf0[15]; + bf1[8] = bf0[0] - bf0[8]; + bf1[9] = bf0[1] - bf0[9]; + bf1[10] = bf0[2] - bf0[10]; + bf1[11] = bf0[3] - bf0[11]; + bf1[12] = bf0[4] - bf0[12]; + bf1[13] = bf0[5] - bf0[13]; + bf1[14] = bf0[6] - bf0[14]; + bf1[15] = bf0[7] - bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit); + bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit); + bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[1]; + bf1[1] = bf0[14]; + bf1[2] = bf0[3]; + bf1[3] = bf0[12]; + bf1[4] = bf0[5]; + bf1[5] = bf0[10]; + bf1[6] = bf0[7]; + bf1[7] = bf0[8]; + bf1[8] = bf0[9]; + bf1[9] = bf0[6]; + bf1[10] = bf0[11]; + bf1[11] = bf0[4]; + bf1[12] = bf0[13]; + bf1[13] = bf0[2]; + bf1[14] = bf0[15]; + bf1[15] = bf0[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 4; ++i) + output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits); + assert(stage_range[0] + NewSqrt2Bits <= 
32); + av1_range_check_buf(0, input, output, 4, stage_range[0]); +} + +void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 8; ++i) output[i] = input[i] * 2; + av1_range_check_buf(0, input, output, 8, stage_range[0]); +} + +void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 16; ++i) + output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits); + assert(stage_range[0] + NewSqrt2Bits <= 32); + av1_range_check_buf(0, input, output, 16, stage_range[0]); +} + +void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) output[i] = input[i] * 4; + av1_range_check_buf(0, input, output, 32, stage_range[0]); +} + +void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 64; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[64]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[63]; + bf1[1] = input[1] + input[62]; + bf1[2] = input[2] + input[61]; + bf1[3] = input[3] + input[60]; + bf1[4] = input[4] + input[59]; + bf1[5] = input[5] + input[58]; + bf1[6] = input[6] + input[57]; + bf1[7] = input[7] + input[56]; + bf1[8] = input[8] + input[55]; + bf1[9] = input[9] + input[54]; + bf1[10] = input[10] + input[53]; + bf1[11] = input[11] + input[52]; + bf1[12] = input[12] + input[51]; + bf1[13] = input[13] + input[50]; + bf1[14] = input[14] + input[49]; + bf1[15] = input[15] + input[48]; + bf1[16] = input[16] + input[47]; + bf1[17] = input[17] + input[46]; + bf1[18] = input[18] + input[45]; + bf1[19] = input[19] + input[44]; + bf1[20] = input[20] + input[43]; + bf1[21] = input[21] + input[42]; + bf1[22] = input[22] + input[41]; + bf1[23] = input[23] + input[40]; + bf1[24] = input[24] + input[39]; + bf1[25] = input[25] + input[38]; + bf1[26] = input[26] + input[37]; + bf1[27] = input[27] + input[36]; + bf1[28] = input[28] + input[35]; + bf1[29] = input[29] + input[34]; + bf1[30] = input[30] + input[33]; + bf1[31] = input[31] + input[32]; + bf1[32] = -input[32] + input[31]; + bf1[33] = -input[33] + input[30]; + bf1[34] = -input[34] + input[29]; + bf1[35] = -input[35] + input[28]; + bf1[36] = -input[36] + input[27]; + bf1[37] = -input[37] + input[26]; + bf1[38] = -input[38] + input[25]; + bf1[39] = -input[39] + input[24]; + bf1[40] = -input[40] + input[23]; + bf1[41] = -input[41] + input[22]; + bf1[42] = -input[42] + input[21]; + bf1[43] = -input[43] + input[20]; + bf1[44] = -input[44] + input[19]; + bf1[45] = -input[45] + input[18]; + bf1[46] = -input[46] + input[17]; + bf1[47] = -input[47] + input[16]; + bf1[48] = -input[48] + input[15]; + bf1[49] = -input[49] + input[14]; + bf1[50] = -input[50] + input[13]; + bf1[51] = -input[51] + input[12]; + bf1[52] = -input[52] + input[11]; + bf1[53] = -input[53] + input[10]; + bf1[54] = -input[54] + input[9]; + bf1[55] = -input[55] + input[8]; + bf1[56] = -input[56] + input[7]; + bf1[57] = -input[57] + input[6]; + bf1[58] = -input[58] + input[5]; + bf1[59] = -input[59] + input[4]; + bf1[60] = -input[60] + input[3]; + bf1[61] = -input[61] + input[2]; + bf1[62] = -input[62] + input[1]; + bf1[63] = -input[63] + input[0]; + av1_range_check_buf(stage, input, bf1, size, 
stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[31]; + bf1[1] = bf0[1] + bf0[30]; + bf1[2] = bf0[2] + bf0[29]; + bf1[3] = bf0[3] + bf0[28]; + bf1[4] = bf0[4] + bf0[27]; + bf1[5] = bf0[5] + bf0[26]; + bf1[6] = bf0[6] + bf0[25]; + bf1[7] = bf0[7] + bf0[24]; + bf1[8] = bf0[8] + bf0[23]; + bf1[9] = bf0[9] + bf0[22]; + bf1[10] = bf0[10] + bf0[21]; + bf1[11] = bf0[11] + bf0[20]; + bf1[12] = bf0[12] + bf0[19]; + bf1[13] = bf0[13] + bf0[18]; + bf1[14] = bf0[14] + bf0[17]; + bf1[15] = bf0[15] + bf0[16]; + bf1[16] = -bf0[16] + bf0[15]; + bf1[17] = -bf0[17] + bf0[14]; + bf1[18] = -bf0[18] + bf0[13]; + bf1[19] = -bf0[19] + bf0[12]; + bf1[20] = -bf0[20] + bf0[11]; + bf1[21] = -bf0[21] + bf0[10]; + bf1[22] = -bf0[22] + bf0[9]; + bf1[23] = -bf0[23] + bf0[8]; + bf1[24] = -bf0[24] + bf0[7]; + bf1[25] = -bf0[25] + bf0[6]; + bf1[26] = -bf0[26] + bf0[5]; + bf1[27] = -bf0[27] + bf0[4]; + bf1[28] = -bf0[28] + bf0[3]; + bf1[29] = -bf0[29] + bf0[2]; + bf1[30] = -bf0[30] + bf0[1]; + bf1[31] = -bf0[31] + bf0[0]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = bf0[37]; + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); + bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit); + bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit); + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = bf0[58]; + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[15]; + bf1[1] = bf0[1] + bf0[14]; + bf1[2] = bf0[2] + bf0[13]; + bf1[3] = bf0[3] + bf0[12]; + bf1[4] = bf0[4] + bf0[11]; + bf1[5] = bf0[5] + bf0[10]; + bf1[6] = bf0[6] + bf0[9]; + bf1[7] = bf0[7] + bf0[8]; + bf1[8] = -bf0[8] + bf0[7]; + bf1[9] = -bf0[9] + bf0[6]; + bf1[10] = -bf0[10] + bf0[5]; + bf1[11] = -bf0[11] + bf0[4]; + bf1[12] = -bf0[12] + bf0[3]; + bf1[13] = -bf0[13] + bf0[2]; + bf1[14] = -bf0[14] + bf0[1]; + bf1[15] = -bf0[15] + bf0[0]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], 
cos_bit); + bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[47]; + bf1[33] = bf0[33] + bf0[46]; + bf1[34] = bf0[34] + bf0[45]; + bf1[35] = bf0[35] + bf0[44]; + bf1[36] = bf0[36] + bf0[43]; + bf1[37] = bf0[37] + bf0[42]; + bf1[38] = bf0[38] + bf0[41]; + bf1[39] = bf0[39] + bf0[40]; + bf1[40] = -bf0[40] + bf0[39]; + bf1[41] = -bf0[41] + bf0[38]; + bf1[42] = -bf0[42] + bf0[37]; + bf1[43] = -bf0[43] + bf0[36]; + bf1[44] = -bf0[44] + bf0[35]; + bf1[45] = -bf0[45] + bf0[34]; + bf1[46] = -bf0[46] + bf0[33]; + bf1[47] = -bf0[47] + bf0[32]; + bf1[48] = -bf0[48] + bf0[63]; + bf1[49] = -bf0[49] + bf0[62]; + bf1[50] = -bf0[50] + bf0[61]; + bf1[51] = -bf0[51] + bf0[60]; + bf1[52] = -bf0[52] + bf0[59]; + bf1[53] = -bf0[53] + bf0[58]; + bf1[54] = -bf0[54] + bf0[57]; + bf1[55] = -bf0[55] + bf0[56]; + bf1[56] = bf0[56] + bf0[55]; + bf1[57] = bf0[57] + bf0[54]; + bf1[58] = bf0[58] + bf0[53]; + bf1[59] = bf0[59] + bf0[52]; + bf1[60] = bf0[60] + bf0[51]; + bf1[61] = bf0[61] + bf0[50]; + bf1[62] = bf0[62] + bf0[49]; + bf1[63] = bf0[63] + bf0[48]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[23]; + bf1[17] = bf0[17] + bf0[22]; + bf1[18] = bf0[18] + bf0[21]; + bf1[19] = bf0[19] + bf0[20]; + bf1[20] = -bf0[20] + bf0[19]; + bf1[21] = -bf0[21] + bf0[18]; + bf1[22] = -bf0[22] + bf0[17]; + bf1[23] = -bf0[23] + bf0[16]; + bf1[24] = -bf0[24] + bf0[31]; + bf1[25] = -bf0[25] + bf0[30]; + bf1[26] = -bf0[26] + bf0[29]; + bf1[27] = -bf0[27] + bf0[28]; + bf1[28] = bf0[28] + bf0[27]; + bf1[29] = bf0[29] + bf0[26]; + bf1[30] = bf0[30] + bf0[25]; + bf1[31] = bf0[31] + bf0[24]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit); + bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit); + bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit); + bf1[44] = bf0[44]; + bf1[45] = bf0[45]; + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = bf0[50]; + bf1[51] = bf0[51]; + bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit); + bf1[53] = 
half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit); + bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit); + bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit); + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[39]; + bf1[33] = bf0[33] + bf0[38]; + bf1[34] = bf0[34] + bf0[37]; + bf1[35] = bf0[35] + bf0[36]; + bf1[36] = -bf0[36] + bf0[35]; + bf1[37] = -bf0[37] + bf0[34]; + bf1[38] = -bf0[38] + bf0[33]; + bf1[39] = -bf0[39] + bf0[32]; + bf1[40] = -bf0[40] + bf0[47]; + bf1[41] = -bf0[41] + bf0[46]; + bf1[42] = -bf0[42] + bf0[45]; + bf1[43] = -bf0[43] + bf0[44]; + bf1[44] = bf0[44] + bf0[43]; + bf1[45] = bf0[45] + bf0[42]; + bf1[46] = bf0[46] + bf0[41]; + bf1[47] = bf0[47] + bf0[40]; + bf1[48] = bf0[48] + bf0[55]; + bf1[49] = bf0[49] + bf0[54]; + bf1[50] = bf0[50] + bf0[53]; + bf1[51] = bf0[51] + bf0[52]; + bf1[52] = -bf0[52] + bf0[51]; + bf1[53] = -bf0[53] + bf0[50]; + bf1[54] = -bf0[54] + bf0[49]; + bf1[55] = -bf0[55] + bf0[48]; + bf1[56] = -bf0[56] + bf0[63]; + bf1[57] = -bf0[57] + bf0[62]; + bf1[58] = -bf0[58] + bf0[61]; + bf1[59] = -bf0[59] + bf0[60]; + bf1[60] = bf0[60] + bf0[59]; + bf1[61] = bf0[61] + bf0[58]; + bf1[62] = bf0[62] + bf0[57]; + bf1[63] = bf0[63] + bf0[56]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; 
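
A quick aside on the primitive doing the heavy lifting in every stage here: half_btf() (defined upstream in av1/common/av1_txfm.h) is a rounded fixed-point rotation, with cospi_arr(cos_bit) supplying weights cospi[j] = round(cos(PI * j / 128) * 2^cos_bit). A minimal sketch of the arithmetic, assuming round_shift() rounds to nearest as upstream does (half_btf_sketch is an illustrative name, not part of the patch):

    #include <stdint.h>

    /* Weighted butterfly: multiply two inputs by cosine weights scaled
     * by 2^bit, then shift the sum back down with round-to-nearest
     * (bit is the cos_bit fixed-point precision, always > 0 here). */
    static int32_t half_btf_sketch(int32_t w0, int32_t in0, int32_t w1,
                                   int32_t in1, int bit) {
      const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
      return (int32_t)((sum + ((int64_t)1 << (bit - 1))) >> bit);
    }
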
+ bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[19]; + bf1[17] = bf0[17] + bf0[18]; + bf1[18] = -bf0[18] + bf0[17]; + bf1[19] = -bf0[19] + bf0[16]; + bf1[20] = -bf0[20] + bf0[23]; + bf1[21] = -bf0[21] + bf0[22]; + bf1[22] = bf0[22] + bf0[21]; + bf1[23] = bf0[23] + bf0[20]; + bf1[24] = bf0[24] + bf0[27]; + bf1[25] = bf0[25] + bf0[26]; + bf1[26] = -bf0[26] + bf0[25]; + bf1[27] = -bf0[27] + bf0[24]; + bf1[28] = -bf0[28] + bf0[31]; + bf1[29] = -bf0[29] + bf0[30]; + bf1[30] = bf0[30] + bf0[29]; + bf1[31] = bf0[31] + bf0[28]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit); + bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit); + bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit); + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = bf0[41]; + bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit); + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit); + bf1[54] = bf0[54]; + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit); + bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit); + bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit); + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], 
cos_bit); + bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[35]; + bf1[33] = bf0[33] + bf0[34]; + bf1[34] = -bf0[34] + bf0[33]; + bf1[35] = -bf0[35] + bf0[32]; + bf1[36] = -bf0[36] + bf0[39]; + bf1[37] = -bf0[37] + bf0[38]; + bf1[38] = bf0[38] + bf0[37]; + bf1[39] = bf0[39] + bf0[36]; + bf1[40] = bf0[40] + bf0[43]; + bf1[41] = bf0[41] + bf0[42]; + bf1[42] = -bf0[42] + bf0[41]; + bf1[43] = -bf0[43] + bf0[40]; + bf1[44] = -bf0[44] + bf0[47]; + bf1[45] = -bf0[45] + bf0[46]; + bf1[46] = bf0[46] + bf0[45]; + bf1[47] = bf0[47] + bf0[44]; + bf1[48] = bf0[48] + bf0[51]; + bf1[49] = bf0[49] + bf0[50]; + bf1[50] = -bf0[50] + bf0[49]; + bf1[51] = -bf0[51] + bf0[48]; + bf1[52] = -bf0[52] + bf0[55]; + bf1[53] = -bf0[53] + bf0[54]; + bf1[54] = bf0[54] + bf0[53]; + bf1[55] = bf0[55] + bf0[52]; + bf1[56] = bf0[56] + bf0[59]; + bf1[57] = bf0[57] + bf0[58]; + bf1[58] = -bf0[58] + bf0[57]; + bf1[59] = -bf0[59] + bf0[56]; + bf1[60] = -bf0[60] + bf0[63]; + bf1[61] = -bf0[61] + bf0[62]; + bf1[62] = bf0[62] + bf0[61]; + bf1[63] = bf0[63] + bf0[60]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + bf1[16] = bf0[16] + bf0[17]; + bf1[17] = -bf0[17] + bf0[16]; + bf1[18] = -bf0[18] + bf0[19]; + bf1[19] = bf0[19] + bf0[18]; + bf1[20] = bf0[20] + bf0[21]; + bf1[21] = -bf0[21] + bf0[20]; + bf1[22] = -bf0[22] + bf0[23]; + bf1[23] = bf0[23] + bf0[22]; + bf1[24] = bf0[24] + bf0[25]; + bf1[25] = -bf0[25] + bf0[24]; + bf1[26] = -bf0[26] + bf0[27]; + bf1[27] = bf0[27] + bf0[26]; + bf1[28] = bf0[28] + bf0[29]; + bf1[29] = -bf0[29] + bf0[28]; + bf1[30] = -bf0[30] + bf0[31]; + bf1[31] = bf0[31] + bf0[30]; + bf1[32] = bf0[32]; + bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit); + bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit); + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit); + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit); + bf1[43] = bf0[43]; + bf1[44] = bf0[44]; + bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit); + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[52], bf0[50], 
cospi[12], bf0[45], cos_bit); + bf1[51] = bf0[51]; + bf1[52] = bf0[52]; + bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit); + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit); + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit); + bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit); + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); + bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit); + bf1[32] = bf0[32] + bf0[33]; + bf1[33] = -bf0[33] + bf0[32]; + bf1[34] = -bf0[34] + bf0[35]; + bf1[35] = bf0[35] + bf0[34]; + bf1[36] = bf0[36] + bf0[37]; + bf1[37] = -bf0[37] + bf0[36]; + bf1[38] = -bf0[38] + bf0[39]; + bf1[39] = bf0[39] + bf0[38]; + bf1[40] = bf0[40] + bf0[41]; + bf1[41] = -bf0[41] + bf0[40]; + bf1[42] = -bf0[42] + bf0[43]; + bf1[43] = bf0[43] + bf0[42]; + bf1[44] = bf0[44] + bf0[45]; + bf1[45] = -bf0[45] + bf0[44]; + bf1[46] = -bf0[46] + bf0[47]; + bf1[47] = bf0[47] + bf0[46]; + bf1[48] = bf0[48] + bf0[49]; + bf1[49] = -bf0[49] + bf0[48]; + bf1[50] = -bf0[50] + bf0[51]; + bf1[51] = bf0[51] + bf0[50]; + bf1[52] = bf0[52] + bf0[53]; + bf1[53] = -bf0[53] + bf0[52]; + bf1[54] = -bf0[54] + bf0[55]; + bf1[55] = bf0[55] + bf0[54]; + bf1[56] = bf0[56] + bf0[57]; + bf1[57] = -bf0[57] + bf0[56]; + bf1[58] = -bf0[58] + bf0[59]; + bf1[59] = bf0[59] + bf0[58]; + bf1[60] = bf0[60] + bf0[61]; + bf1[61] = -bf0[61] + bf0[60]; + bf1[62] = -bf0[62] + bf0[63]; + bf1[63] = bf0[63] + bf0[62]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 10 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + 
bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = bf0[21]; + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = bf0[26]; + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit); + bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit); + bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit); + bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit); + bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit); + bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit); + bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit); + bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit); + bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit); + bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit); + bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit); + bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit); + bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit); + bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit); + bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit); + bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit); + bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit); + bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit); + bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit); + bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit); + bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit); + bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit); + bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 11 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[32]; + bf1[2] = bf0[16]; + bf1[3] = bf0[48]; + bf1[4] = bf0[8]; + bf1[5] = bf0[40]; + bf1[6] = bf0[24]; + bf1[7] = bf0[56]; + bf1[8] = bf0[4]; + bf1[9] = bf0[36]; + bf1[10] = bf0[20]; + bf1[11] = bf0[52]; + bf1[12] = bf0[12]; + bf1[13] = bf0[44]; + bf1[14] = bf0[28]; + bf1[15] = bf0[60]; + bf1[16] = bf0[2]; + bf1[17] = bf0[34]; + bf1[18] = bf0[18]; + bf1[19] = bf0[50]; + bf1[20] = bf0[10]; + bf1[21] = bf0[42]; + bf1[22] = bf0[26]; + bf1[23] = bf0[58]; + bf1[24] = bf0[6]; + bf1[25] = bf0[38]; + bf1[26] = bf0[22]; + bf1[27] = bf0[54]; + bf1[28] = bf0[14]; + bf1[29] = bf0[46]; + bf1[30] = bf0[30]; + bf1[31] = bf0[62]; + bf1[32] = 
bf0[1]; + bf1[33] = bf0[33]; + bf1[34] = bf0[17]; + bf1[35] = bf0[49]; + bf1[36] = bf0[9]; + bf1[37] = bf0[41]; + bf1[38] = bf0[25]; + bf1[39] = bf0[57]; + bf1[40] = bf0[5]; + bf1[41] = bf0[37]; + bf1[42] = bf0[21]; + bf1[43] = bf0[53]; + bf1[44] = bf0[13]; + bf1[45] = bf0[45]; + bf1[46] = bf0[29]; + bf1[47] = bf0[61]; + bf1[48] = bf0[3]; + bf1[49] = bf0[35]; + bf1[50] = bf0[19]; + bf1[51] = bf0[51]; + bf1[52] = bf0[11]; + bf1[53] = bf0[43]; + bf1[54] = bf0[27]; + bf1[55] = bf0[59]; + bf1[56] = bf0[7]; + bf1[57] = bf0[39]; + bf1[58] = bf0[23]; + bf1[59] = bf0[55]; + bf1[60] = bf0[15]; + bf1[61] = bf0[47]; + bf1[62] = bf0[31]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} diff --git a/libs/libaom/src/av1/encoder/av1_fwd_txfm1d.h b/libs/libaom/src/av1/encoder/av1_fwd_txfm1d.h new file mode 100644 index 000000000..9ef54fe4d --- /dev/null +++ b/libs/libaom/src/av1/encoder/av1_fwd_txfm1d.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ +#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ + +#include "av1/common/av1_txfm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ diff --git a/libs/libaom/src/av1/encoder/av1_fwd_txfm1d_cfg.h b/libs/libaom/src/av1/encoder/av1_fwd_txfm1d_cfg.h new file mode 100644 index 000000000..2777cc25b --- /dev/null +++ b/libs/libaom/src/av1/encoder/av1_fwd_txfm1d_cfg.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ +#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ +#include "av1/common/enums.h" +#include "av1/encoder/av1_fwd_txfm1d.h" +extern const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL]; +extern const int8_t av1_fwd_cos_bit_col[5][5]; +extern const int8_t av1_fwd_cos_bit_row[5][5]; +#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ diff --git a/libs/libaom/src/av1/encoder/av1_fwd_txfm2d.c b/libs/libaom/src/av1/encoder/av1_fwd_txfm2d.c new file mode 100644 index 000000000..bcb829d79 --- /dev/null +++ b/libs/libaom/src/av1/encoder/av1_fwd_txfm2d.c @@ -0,0 +1,429 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/txfm_common.h" +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/av1_fwd_txfm1d.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" + +static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT4: return av1_fdct4; + case TXFM_TYPE_DCT8: return av1_fdct8; + case TXFM_TYPE_DCT16: return av1_fdct16; + case TXFM_TYPE_DCT32: return av1_fdct32; + case TXFM_TYPE_DCT64: return av1_fdct64; + case TXFM_TYPE_ADST4: return av1_fadst4; + case TXFM_TYPE_ADST8: return av1_fadst8; + case TXFM_TYPE_ADST16: return av1_fadst16; + case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c; + case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c; + case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c; + case TXFM_TYPE_IDENTITY32: return av1_fidentity32_c; + default: assert(0); return NULL; + } +} + +void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, + const TXFM_2D_FLIP_CFG *cfg, int bd) { + // Take the shift from the larger dimension in the rectangular case. + const int8_t *shift = cfg->shift; + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) { + stage_range_col[i] = cfg->stage_range_col[i] + shift[0] + bd + 1; + } + + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { + stage_range_row[i] = cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1; + } +} + +static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output, + const int stride, const TXFM_2D_FLIP_CFG *cfg, + int32_t *buf, int bd) { + int c, r; + // Note when assigning txfm_size_col, we use the txfm_size from the + // row configuration and vice versa. This is intentionally done to + // accurately perform rectangular transforms.
When the transform is + // rectangular, the number of columns will be the same as the + // txfm_size stored in the row cfg struct. It will make no difference + // for square transforms. + const int txfm_size_col = tx_size_wide[cfg->tx_size]; + const int txfm_size_row = tx_size_high[cfg->tx_size]; + // Take the shift from the larger dimension in the rectangular case. + const int8_t *shift = cfg->shift; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); + assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); + av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bd); + + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); + + // use output buffer as temp buffer + int32_t *temp_in = output; + int32_t *temp_out = output + txfm_size_row; + + // Columns + for (c = 0; c < txfm_size_col; ++c) { + if (cfg->ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * stride + c]; + } else { + for (r = 0; r < txfm_size_row; ++r) + // flip upside down + temp_in[r] = input[(txfm_size_row - r - 1) * stride + c]; + } + av1_round_shift_array(temp_in, txfm_size_row, -shift[0]); + txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + if (cfg->lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + buf[r * txfm_size_col + c] = temp_out[r]; + } else { + for (r = 0; r < txfm_size_row; ++r) + // flip from left to right + buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r]; + } + } + + // Rows + for (r = 0; r < txfm_size_row; ++r) { + txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col, + cos_bit_row, stage_range_row); + av1_round_shift_array(output + r * txfm_size_col, txfm_size_col, -shift[2]); + if (abs(rect_type) == 1) { + // Multiply everything by Sqrt2 if the transform is rectangular and the + // size difference is a factor of 2. 
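
Two clarifying notes on the rectangular handling (constants as defined in av1/common/av1_txfm.h). First, the col/row swap described at the top of fwd_txfm2d_c() means that, e.g., TX_4X8 (4 samples wide, 8 tall) runs a length-8 transform down each of its 4 columns and then a length-4 transform across each of its 8 rows. Second, NewSqrt2Bits is 12 and NewSqrt2 is 5793 = round(sqrt(2) * 2^12), so the loop below computes (v * 5793 + 2048) >> 12 per coefficient; e.g. v = 1000 becomes (5793000 + 2048) >> 12 = 1414, about 1000 * sqrt(2), which keeps the overall transform scale at a power of two when the two 1-D lengths differ by a factor of 2.
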
+ for (c = 0; c < txfm_size_col; ++c) { + output[r * txfm_size_col + c] = round_shift( + (int64_t)output[r * txfm_size_col + c] * NewSqrt2, NewSqrt2Bits); + } + } + } +} + +void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[8 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[4 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[8 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void 
av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 32]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 64]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + + // Zero out top-right 32x32 area. + for (int row = 0; row < 32; ++row) { + memset(output + row * 64 + 32, 0, 32 * sizeof(*output)); + } + // Zero out the bottom 64x32 area. + memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output)); + // Re-pack non-zero coeffs in the first 32x32 indices. + for (int row = 1; row < 32; ++row) { + memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output)); + } +} + +void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 64]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out the bottom 32x32 area. + memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output)); + // Note: no repacking needed here. +} + +void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 32]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + + // Zero out right 32x32 area. + for (int row = 0; row < 32; ++row) { + memset(output + row * 64 + 32, 0, 32 * sizeof(*output)); + } + // Re-pack non-zero coeffs in the first 32x32 indices. + for (int row = 1; row < 32; ++row) { + memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output)); + } +} + +void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out the bottom 16x32 area. + memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); + // Note: no repacking needed here. +} + +void av1_fwd_txfm2d_64x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out right 32x16 area. + for (int row = 0; row < 16; ++row) { + memset(output + row * 64 + 32, 0, 32 * sizeof(*output)); + } + // Re-pack non-zero coeffs in the first 32x16 indices. 
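
Why the in-place re-pack in these 64-wide cases is safe to run top-down: for each row r >= 1 the destination span [32r, 32r + 32) ends exactly where the source span [64r, 64r + 32) begins, and row 0 already sits at offset 0, so no memcpy() call reads data that an earlier iteration has overwritten.
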
+ for (int row = 1; row < 16; ++row) { + memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output)); + } +} + +static const int8_t fwd_shift_4x4[3] = { 2, 0, 0 }; +static const int8_t fwd_shift_8x8[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_16x16[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_32x32[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_64x64[3] = { 0, -2, -2 }; +static const int8_t fwd_shift_4x8[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x4[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x16[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x8[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x32[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_32x16[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_32x64[3] = { 0, -2, -2 }; +static const int8_t fwd_shift_64x32[3] = { 2, -4, -2 }; +static const int8_t fwd_shift_4x16[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_16x4[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x32[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_32x8[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x64[3] = { 0, -2, 0 }; +static const int8_t fwd_shift_64x16[3] = { 2, -4, 0 }; + +const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL] = { + fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32, + fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16, + fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64, + fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32, + fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16, +}; + +const int8_t av1_fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/] + [MAX_TXWH_IDX /*txh_idx*/] = { + { 13, 13, 13, 0, 0 }, + { 13, 13, 13, 12, 0 }, + { 13, 13, 13, 12, 13 }, + { 0, 13, 13, 12, 13 }, + { 0, 0, 13, 12, 13 } + }; + +const int8_t av1_fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/] + [MAX_TXWH_IDX /*txh_idx*/] = { + { 13, 13, 12, 0, 0 }, + { 13, 13, 13, 12, 0 }, + { 13, 13, 12, 13, 12 }, + { 0, 12, 13, 12, 11 }, + { 0, 0, 12, 11, 10 } + }; + +static const int8_t fdct4_range_mult2[4] = { 0, 2, 3, 3 }; +static const int8_t fdct8_range_mult2[6] = { 0, 2, 4, 5, 5, 5 }; +static const int8_t fdct16_range_mult2[8] = { 0, 2, 4, 6, 7, 7, 7, 7 }; +static const int8_t fdct32_range_mult2[10] = { 0, 2, 4, 6, 8, 9, 9, 9, 9, 9 }; +static const int8_t fdct64_range_mult2[12] = { 0, 2, 4, 6, 8, 10, + 11, 11, 11, 11, 11, 11 }; + +static const int8_t fadst4_range_mult2[7] = { 0, 2, 4, 3, 3, 3, 3 }; +static const int8_t fadst8_range_mult2[8] = { 0, 0, 1, 3, 3, 5, 5, 5 }; +static const int8_t fadst16_range_mult2[10] = { 0, 0, 1, 3, 3, 5, 5, 7, 7, 7 }; + +static const int8_t fidtx4_range_mult2[1] = { 1 }; +static const int8_t fidtx8_range_mult2[1] = { 2 }; +static const int8_t fidtx16_range_mult2[1] = { 3 }; +static const int8_t fidtx32_range_mult2[1] = { 4 }; + +#if 0 +const int8_t fwd_idtx_range_row[MAX_TXWH_IDX /*txw_idx*/] + [MAX_TXWH_IDX /*txh_idx*/] = { { 2, 4, 5, 0, 0 }, + { 3, 4, 5, 6, 0 }, + { 4, 5, 6, 7, 8 }, + { 0, 5, 6, 7, 8 }, + { 0, 0, 7, 8, + 9 } }; +#endif + +static const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = { + fdct4_range_mult2, fdct8_range_mult2, fdct16_range_mult2, + fdct32_range_mult2, fdct64_range_mult2, fadst4_range_mult2, + fadst8_range_mult2, fadst16_range_mult2, fidtx4_range_mult2, + fidtx8_range_mult2, fidtx16_range_mult2, fidtx32_range_mult2 +}; + +static INLINE void set_fwd_txfm_non_scale_range(TXFM_2D_FLIP_CFG *cfg) { + av1_zero(cfg->stage_range_col); + av1_zero(cfg->stage_range_row); + + const int8_t *range_mult2_col = 
fwd_txfm_range_mult2_list[cfg->txfm_type_col]; + if (cfg->txfm_type_col != TXFM_TYPE_INVALID) { + int stage_num_col = cfg->stage_num_col; + for (int i = 0; i < stage_num_col; ++i) + cfg->stage_range_col[i] = (range_mult2_col[i] + 1) >> 1; + } + + if (cfg->txfm_type_row != TXFM_TYPE_INVALID) { + int stage_num_row = cfg->stage_num_row; + const int8_t *range_mult2_row = + fwd_txfm_range_mult2_list[cfg->txfm_type_row]; + for (int i = 0; i < stage_num_row; ++i) { + cfg->stage_range_row[i] = + (range_mult2_col[cfg->stage_num_col - 1] + range_mult2_row[i] + 1) >> + 1; + } + } +} + +void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg) { + assert(cfg != NULL); + cfg->tx_size = tx_size; + set_flip_cfg(tx_type, cfg); + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + cfg->shift = av1_fwd_txfm_shift_ls[tx_size]; + cfg->cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + cfg->cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; + cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row]; + cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col]; + cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row]; + set_fwd_txfm_non_scale_range(cfg); +} diff --git a/libs/libaom/src/av1/encoder/av1_multi_thread.c b/libs/libaom/src/av1/encoder/av1_multi_thread.c new file mode 100644 index 000000000..d170b0c28 --- /dev/null +++ b/libs/libaom/src/av1/encoder/av1_multi_thread.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
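
Backing up to the range tables just above: set_fwd_txfm_non_scale_range() halves the doubled per-stage growth with (x + 1) >> 1, the *_mult2 tables storing range growth in half-bit steps (each value is twice the growth in bits). For an 8-point DCT column pass, fdct8_range_mult2[] = { 0, 2, 4, 5, 5, 5 } yields stage ranges { 0, 1, 2, 3, 3, 3 } extra bits, and each row-pass entry folds in the column pass's final growth before halving, so an 8x8 DCT row pass gets (5 + { 0, 2, 4, 5, 5, 5 } + 1) >> 1 = { 3, 4, 5, 5, 5, 5 }.
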
+ */ + +#include + +#include "av1/encoder/encoder.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/av1_multi_thread.h" + +void av1_row_mt_mem_alloc(AV1_COMP *cpi, int max_sb_rows) { + struct AV1Common *cm = &cpi->common; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int tile_row, tile_col; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + + multi_thread_ctxt->allocated_tile_cols = tile_cols; + multi_thread_ctxt->allocated_tile_rows = tile_rows; + multi_thread_ctxt->allocated_sb_rows = max_sb_rows; + + // Allocate memory for row based multi-threading + for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows; + tile_row++) { + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + TileDataEnc *this_tile = + &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols + + tile_col]; + av1_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_sb_rows); + if (cpi->oxcf.cdf_update_mode) + CHECK_MEM_ERROR( + cm, this_tile->row_ctx, + (FRAME_CONTEXT *)aom_memalign( + 16, + AOMMAX(1, (av1_get_sb_cols_in_tile(cm, this_tile->tile_info) - + 1)) * + sizeof(*this_tile->row_ctx))); + } + } +} + +void av1_row_mt_mem_dealloc(AV1_COMP *cpi) { + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int tile_col; + int tile_row; + + // Free row based multi-threading sync memory + for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows; + tile_row++) { + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + TileDataEnc *this_tile = + &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols + + tile_col]; + av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync); + if (cpi->oxcf.cdf_update_mode) aom_free(this_tile->row_ctx); + } + } + multi_thread_ctxt->allocated_sb_rows = 0; + multi_thread_ctxt->allocated_tile_cols = 0; + multi_thread_ctxt->allocated_tile_rows = 0; +} diff --git a/libs/libaom/src/av1/encoder/av1_multi_thread.h b/libs/libaom/src/av1/encoder/av1_multi_thread.h new file mode 100644 index 000000000..2a1cc7d6d --- /dev/null +++ b/libs/libaom/src/av1/encoder/av1_multi_thread.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_AV1_MULTI_THREAD_H +#define AV1_ENCODER_AV1_MULTI_THREAD_H + +#include "av1/encoder/encoder.h" + +void av1_row_mt_mem_alloc(AV1_COMP *cpi, int max_sb_rows); + +void av1_row_mt_mem_dealloc(AV1_COMP *cpi); + +#endif // AV1_ENCODER_AV1_MULTI_THREAD_H diff --git a/libs/libaom/src/av1/encoder/av1_quantize.c b/libs/libaom/src/av1/encoder/av1_quantize.c new file mode 100644 index 000000000..569784a2a --- /dev/null +++ b/libs/libaom/src/av1/encoder/av1_quantize.c @@ -0,0 +1,789 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/quantize.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#include "av1/common/idct.h" +#include "av1/common/quant_common.h" +#include "av1/common/scan.h" +#include "av1/common/seg_common.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/rd.h" + +void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + *eob_ptr = 0; +} + +static void quantize_fp_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, int log_scale) { + int i, eob = -1; + const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale), + ROUND_POWER_OF_TWO(round_ptr[1], log_scale) }; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (qm_ptr == NULL && iqm_ptr == NULL) { + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]); + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32 = 0; + if ((abs_coeff << (1 + log_scale)) >= thresh) { + abs_coeff = + clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX); + tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale)); + if (tmp32) { + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const tran_low_t abs_dqcoeff = + (tmp32 * dequant_ptr[rc != 0]) >> log_scale; + dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; + } + } + if (tmp32) eob = i; + } + } else { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const qm_val_t wt = qm_ptr ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr ? 
iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const int coeff_sign = AOMSIGN(coeff); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32 = 0; + if (abs_coeff * wt >= + (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) { + abs_coeff += rounding[rc != 0]; + abs_coeff = clamp64(abs_coeff, INT16_MIN, INT16_MAX); + tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; + } + + if (tmp32) eob = i; + } + } + *eob_ptr = eob + 1; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_quantize_fp_helper_c( + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, int log_scale) { + int i; + int eob = -1; + const int shift = 16 - log_scale; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)iscan; + + if (qm_ptr || iqm_ptr) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const int coeff_sign = AOMSIGN(coeff); + const int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int abs_qcoeff = 0; + if (abs_coeff * wt >= + (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) { + const int64_t tmp = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + abs_qcoeff = + (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) eob = i; + } else { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + } + } else { + const int log_scaled_round_arr[2] = { + ROUND_POWER_OF_TWO(round_ptr[0], log_scale), + ROUND_POWER_OF_TWO(round_ptr[1], log_scale), + }; + for (i = 0; i < count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int rc01 = (rc != 0); + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int log_scaled_round = log_scaled_round_arr[rc01]; + if ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) { + const int quant = quant_ptr[rc01]; + const int dequant = dequant_ptr[rc01]; + const int64_t tmp = (int64_t)abs_coeff + log_scaled_round; + const int abs_qcoeff = (int)((tmp * quant) >> shift); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + if (abs_qcoeff) eob = i; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + } else { + 
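
The guard a few lines up, (abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01], is equivalent to abs_coeff >= dequant / 2^(1 + log_scale): a coefficient smaller than half a (scaled) dequantization step is expected to quantize to zero, so this branch writes the zeros directly instead of running the multiply-and-shift.
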
qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + } + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 0); +} + +void av1_quantize_lp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan) { + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (int i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 16; + + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + + if (tmp) eob = i; + } + *eob_ptr = eob + 1; +} + +void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 1); +} + +void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 2); +} + +void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + if (qm_ptr != NULL && iqm_ptr != NULL) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 1: + av1_quantize_fp_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, 
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 2: + av1_quantize_fp_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + default: assert(0); + } + } +} + +void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + if (qparam->use_quant_b_adapt) { + // TODO(sarahparker) These quantize_b optimizations need SIMD + // implementations + if (qm_ptr != NULL && iqm_ptr != NULL) { + aom_quantize_b_adaptive_helper_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + aom_quantize_b_adaptive(coeff_ptr, n_coeffs, p->zbin_QTX, + p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); + break; + case 1: + aom_quantize_b_32x32_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 2: + aom_quantize_b_64x64_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + default: assert(0); + } + } + } else { + if (qm_ptr != NULL && iqm_ptr != NULL) { + aom_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 1: + aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 2: + aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + default: assert(0); + } + } + } +} + +static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int64_t tmp; + int eob = -1; + int32_t tmp32; + int dequant; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int iwt = iqm_ptr != NULL ? 
iqm_ptr[rc] : (1 << AOM_QM_BITS); + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); + tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (tmp32) eob = 0; + } + *eob_ptr = eob + 1; +} + +void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; + (void)sc; + assert(qparam->log_scale >= 0 && qparam->log_scale < (3)); + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX, + p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX[0], + eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + if (qm_ptr != NULL && iqm_ptr != NULL) { + highbd_quantize_fp_helper_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + av1_highbd_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qparam->log_scale); + } +} + +void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + if (qparam->use_quant_b_adapt) { + if (qm_ptr != NULL && iqm_ptr != NULL) { + aom_highbd_quantize_b_adaptive_helper_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + aom_highbd_quantize_b_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 1: + aom_highbd_quantize_b_32x32_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 2: + aom_highbd_quantize_b_64x64_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + default: assert(0); + } + } + } else { + if (qm_ptr != NULL && iqm_ptr != NULL) { + aom_highbd_quantize_b_helper_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + 
p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 1: + aom_highbd_quantize_b_32x32( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 2: + aom_highbd_quantize_b_64x64( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + default: assert(0); + } + } + } +} + +static INLINE void highbd_quantize_dc( + const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) { + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[0] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[0] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[0]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], log_scale); + const int64_t tmpw = tmp * wt; + const int abs_qcoeff = + (int)((tmpw * quant) >> (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const int dequant = + (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[0] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) eob = 0; + } + *eob_ptr = eob + 1; +} + +void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + (void)sc; + + highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX, + p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX[0], eob_ptr, qm_ptr, iqm_ptr, + qparam->log_scale); +} + +void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + int log_scale) { + highbd_quantize_fp_helper_c(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, + log_scale); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static void invert_quant(int16_t *quant, int16_t *shift, int d) { + uint32_t t; + int l, m; + t = d; + for (l = 0; t > 1; l++) t >>= 1; + m = 1 + (1 << (16 + l)) / d; + *quant = (int16_t)(m - (1 << 16)); + *shift = 1 << (16 - l); 
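+  // Worked example (values chosen for illustration): for d = 12 the loop
+  // leaves l = 3, so m = 1 + (1 << 19) / 12 = 43691, giving
+  // *quant = (int16_t)(43691 - 65536) = -21845 and *shift = 1 << 13 = 8192.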
+} + +static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) { + const int quant = av1_dc_quant_QTX(q, 0, bit_depth); + switch (bit_depth) { + case AOM_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80); + case AOM_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80); + case AOM_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80); + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } +} + +void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, + int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q, + int v_ac_delta_q, QUANTS *const quants, + Dequants *const deq) { + int i, q, quant_QTX; + + for (q = 0; q < QINDEX_RANGE; q++) { + const int qzbin_factor = get_qzbin_factor(q, bit_depth); + const int qrounding_factor = q == 0 ? 64 : 48; + + for (i = 0; i < 2; ++i) { + int qrounding_factor_fp = 64; + // y quantizer with TX scale + quant_QTX = i == 0 ? av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth) + : av1_ac_quant_QTX(q, 0, bit_depth); + invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], + quant_QTX); + quants->y_quant_fp[q][i] = (1 << 16) / quant_QTX; + quants->y_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; + quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); + quants->y_round[q][i] = (qrounding_factor * quant_QTX) >> 7; + deq->y_dequant_QTX[q][i] = quant_QTX; + + // u quantizer with TX scale + quant_QTX = i == 0 ? av1_dc_quant_QTX(q, u_dc_delta_q, bit_depth) + : av1_ac_quant_QTX(q, u_ac_delta_q, bit_depth); + invert_quant(&quants->u_quant[q][i], &quants->u_quant_shift[q][i], + quant_QTX); + quants->u_quant_fp[q][i] = (1 << 16) / quant_QTX; + quants->u_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; + quants->u_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); + quants->u_round[q][i] = (qrounding_factor * quant_QTX) >> 7; + deq->u_dequant_QTX[q][i] = quant_QTX; + + // v quantizer with TX scale + quant_QTX = i == 0 ? 
av1_dc_quant_QTX(q, v_dc_delta_q, bit_depth)
+                         : av1_ac_quant_QTX(q, v_ac_delta_q, bit_depth);
+      invert_quant(&quants->v_quant[q][i], &quants->v_quant_shift[q][i],
+                   quant_QTX);
+      quants->v_quant_fp[q][i] = (1 << 16) / quant_QTX;
+      quants->v_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+      quants->v_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+      quants->v_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+      deq->v_dequant_QTX[q][i] = quant_QTX;
+    }
+
+    for (i = 2; i < 8; i++) {  // 8: SIMD width
+      quants->y_quant[q][i] = quants->y_quant[q][1];
+      quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
+      quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
+      quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
+      quants->y_zbin[q][i] = quants->y_zbin[q][1];
+      quants->y_round[q][i] = quants->y_round[q][1];
+      deq->y_dequant_QTX[q][i] = deq->y_dequant_QTX[q][1];
+
+      quants->u_quant[q][i] = quants->u_quant[q][1];
+      quants->u_quant_fp[q][i] = quants->u_quant_fp[q][1];
+      quants->u_round_fp[q][i] = quants->u_round_fp[q][1];
+      quants->u_quant_shift[q][i] = quants->u_quant_shift[q][1];
+      quants->u_zbin[q][i] = quants->u_zbin[q][1];
+      quants->u_round[q][i] = quants->u_round[q][1];
+      deq->u_dequant_QTX[q][i] = deq->u_dequant_QTX[q][1];
+      quants->v_quant[q][i] = quants->v_quant[q][1];
+      quants->v_quant_fp[q][i] = quants->v_quant_fp[q][1];
+      quants->v_round_fp[q][i] = quants->v_round_fp[q][1];
+      quants->v_quant_shift[q][i] = quants->v_quant_shift[q][1];
+      quants->v_zbin[q][i] = quants->v_zbin[q][1];
+      quants->v_round[q][i] = quants->v_round[q][1];
+      deq->v_dequant_QTX[q][i] = deq->v_dequant_QTX[q][1];
+    }
+  }
+}
+
+void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params,
+                        const CommonQuantParams *quant_params,
+                        aom_bit_depth_t bit_depth) {
+  QUANTS *const quants = &enc_quant_dequant_params->quants;
+  Dequants *const dequants = &enc_quant_dequant_params->dequants;
+  av1_build_quantizer(bit_depth, quant_params->y_dc_delta_q,
+                      quant_params->u_dc_delta_q, quant_params->u_ac_delta_q,
+                      quant_params->v_dc_delta_q, quant_params->v_ac_delta_q,
+                      quants, dequants);
+}
+
+void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
+                               int segment_id) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonQuantParams *const quant_params = &cm->quant_params;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const QUANTS *const quants = &cpi->enc_quant_dequant_params.quants;
+  const Dequants *const dequants = &cpi->enc_quant_dequant_params.dequants;
+
+  const int current_qindex =
+      AOMMAX(0, AOMMIN(QINDEX_RANGE - 1,
+                       cm->delta_q_info.delta_q_present_flag
+                           ? quant_params->base_qindex + xd->delta_qindex
+                           : quant_params->base_qindex));
+  const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex);
+  const int rdmult =
+      av1_compute_rd_mult(cpi, qindex + quant_params->y_dc_delta_q);
+  const int use_qmatrix = av1_use_qmatrix(quant_params, xd, segment_id);
+
+  // Y
+  const int qmlevel_y =
+      use_qmatrix ?
quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1; + x->plane[0].quant_QTX = quants->y_quant[qindex]; + x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex]; + x->plane[0].round_fp_QTX = quants->y_round_fp[qindex]; + x->plane[0].quant_shift_QTX = quants->y_quant_shift[qindex]; + x->plane[0].zbin_QTX = quants->y_zbin[qindex]; + x->plane[0].round_QTX = quants->y_round[qindex]; + x->plane[0].dequant_QTX = dequants->y_dequant_QTX[qindex]; + memcpy(&xd->plane[0].seg_qmatrix[segment_id], + quant_params->gqmatrix[qmlevel_y][0], + sizeof(quant_params->gqmatrix[qmlevel_y][0])); + memcpy(&xd->plane[0].seg_iqmatrix[segment_id], + quant_params->giqmatrix[qmlevel_y][0], + sizeof(quant_params->giqmatrix[qmlevel_y][0])); + + // U + const int qmlevel_u = + use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1; + x->plane[1].quant_QTX = quants->u_quant[qindex]; + x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex]; + x->plane[1].round_fp_QTX = quants->u_round_fp[qindex]; + x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex]; + x->plane[1].zbin_QTX = quants->u_zbin[qindex]; + x->plane[1].round_QTX = quants->u_round[qindex]; + x->plane[1].dequant_QTX = dequants->u_dequant_QTX[qindex]; + memcpy(&xd->plane[1].seg_qmatrix[segment_id], + quant_params->gqmatrix[qmlevel_u][1], + sizeof(quant_params->gqmatrix[qmlevel_u][1])); + memcpy(&xd->plane[1].seg_iqmatrix[segment_id], + quant_params->giqmatrix[qmlevel_u][1], + sizeof(quant_params->giqmatrix[qmlevel_u][1])); + // V + const int qmlevel_v = + use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1; + x->plane[2].quant_QTX = quants->v_quant[qindex]; + x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex]; + x->plane[2].round_fp_QTX = quants->v_round_fp[qindex]; + x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex]; + x->plane[2].zbin_QTX = quants->v_zbin[qindex]; + x->plane[2].round_QTX = quants->v_round[qindex]; + x->plane[2].dequant_QTX = dequants->v_dequant_QTX[qindex]; + memcpy(&xd->plane[2].seg_qmatrix[segment_id], + quant_params->gqmatrix[qmlevel_v][2], + sizeof(quant_params->gqmatrix[qmlevel_v][2])); + memcpy(&xd->plane[2].seg_iqmatrix[segment_id], + quant_params->giqmatrix[qmlevel_v][2], + sizeof(quant_params->giqmatrix[qmlevel_v][2])); + x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); + x->qindex = qindex; + + set_error_per_bit(x, rdmult); + + av1_initialize_me_consts(cpi, x, qindex); +} + +void av1_frame_init_quantizer(AV1_COMP *cpi) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id); +} + +void av1_set_quantizer(AV1_COMMON *const cm, int min_qmlevel, int max_qmlevel, + int q) { + // quantizer has to be reinitialized with av1_init_quantizer() if any + // delta_q changes. 
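+  // Note: the u/v qmatrix levels below are derived from base_qindex plus the
+  // per-plane AC deltas, which this function currently forces to zero.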
+ CommonQuantParams *quant_params = &cm->quant_params; + quant_params->base_qindex = AOMMAX(cm->delta_q_info.delta_q_present_flag, q); + quant_params->y_dc_delta_q = 0; + quant_params->u_dc_delta_q = 0; + quant_params->u_ac_delta_q = 0; + quant_params->v_dc_delta_q = 0; + quant_params->v_ac_delta_q = 0; + quant_params->qmatrix_level_y = + aom_get_qmlevel(quant_params->base_qindex, min_qmlevel, max_qmlevel); + quant_params->qmatrix_level_u = + aom_get_qmlevel(quant_params->base_qindex + quant_params->u_ac_delta_q, + min_qmlevel, max_qmlevel); + + if (!cm->seq_params.separate_uv_delta_q) + quant_params->qmatrix_level_v = quant_params->qmatrix_level_u; + else + quant_params->qmatrix_level_v = + aom_get_qmlevel(quant_params->base_qindex + quant_params->v_ac_delta_q, + min_qmlevel, max_qmlevel); +} + +// Table that converts 0-63 Q-range values passed in outside to the Qindex +// range used internally. +static const int quantizer_to_qindex[] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, + 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, + 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, + 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, + 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255, +}; + +int av1_quantizer_to_qindex(int quantizer) { + return quantizer_to_qindex[quantizer]; +} + +int av1_qindex_to_quantizer(int qindex) { + int quantizer; + + for (quantizer = 0; quantizer < 64; ++quantizer) + if (quantizer_to_qindex[quantizer] >= qindex) return quantizer; + + return 63; +} diff --git a/libs/libaom/src/av1/encoder/av1_quantize.h b/libs/libaom/src/av1/encoder/av1_quantize.h new file mode 100644 index 000000000..40fb4bee8 --- /dev/null +++ b/libs/libaom/src/av1/encoder/av1_quantize.h @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AV1_QUANTIZE_H_ +#define AOM_AV1_ENCODER_AV1_QUANTIZE_H_ + +#include "config/aom_config.h" + +#include "av1/common/quant_common.h" +#include "av1/common/scan.h" +#include "av1/encoder/block.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define EOB_FACTOR 325 +#define SKIP_EOB_FACTOR_ADJUST 200 + +typedef struct QUANT_PARAM { + int log_scale; + TX_SIZE tx_size; + const qm_val_t *qmatrix; + const qm_val_t *iqmatrix; + int use_quant_b_adapt; + int use_optimize_b; + int xform_quant_idx; +} QUANT_PARAM; + +typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +// The QUANTS structure is used only for internal quantizer setup in +// av1_quantize.c. +// All of its fields use the same coefficient shift/scaling at TX. 
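+// Each [QINDEX_RANGE][8] row is laid out as { dc, ac, ac, ... }: entry 0 is
+// the DC value, entry 1 the AC value, and av1_build_quantizer() replicates
+// the AC value into entries 2..7 so SIMD quantizers can load a full vector
+// of multipliers with a single aligned read.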
+typedef struct { + // 0: dc 1: ac 2-8: ac repeated to SIMD width + DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]); + + // TODO(jingning): in progress of re-working the quantization. will decide + // if we want to deprecate the current use of y_quant. + DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_round_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_round_fp[QINDEX_RANGE][8]); + + DECLARE_ALIGNED(16, int16_t, u_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_round[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_round[QINDEX_RANGE][8]); +} QUANTS; + +// The Dequants structure is used only for internal quantizer setup in +// av1_quantize.c. +// Fields are suffixed according to whether or not they're expressed in +// the same coefficient shift/precision as TX or a fixed Q3 format. +typedef struct { + DECLARE_ALIGNED(16, int16_t, + y_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, + u_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, + v_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width +} Dequants; + +typedef struct { + // Quantization parameters for internal quantizer setup. + QUANTS quants; + // Dequantization parameters for internal quantizer setup. 
+ Dequants dequants; +} EncQuantDequantParams; + +struct AV1_COMP; +struct AV1Common; + +void av1_frame_init_quantizer(struct AV1_COMP *cpi); + +void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x, + int segment_id); + +void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, + int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q, + int v_ac_delta_q, QUANTS *const quants, + Dequants *const deq); + +void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params, + const CommonQuantParams *quant_params, + aom_bit_depth_t bit_depth); + +void av1_set_quantizer(struct AV1Common *const cm, int min_qmlevel, + int max_qmlevel, int q); + +int av1_quantizer_to_qindex(int quantizer); + +int av1_qindex_to_quantizer(int qindex); + +void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr); + +void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); + +void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); + +void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AV1_QUANTIZE_H_ diff --git a/libs/libaom/src/av1/encoder/bitstream.c b/libs/libaom/src/av1/encoder/bitstream.c new file mode 100644 index 000000000..daa8ce1fc --- /dev/null +++ b/libs/libaom/src/av1/encoder/bitstream.c @@ -0,0 +1,3925 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "aom/aom_encoder.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_ports/system_state.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_BITSTREAM_DEBUG
+
+#include "av1/common/cdef.h"
+#include "av1/common/cfl.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+
+#define ENC_MISMATCH_DEBUG 0
+
+static INLINE void write_uniform(aom_writer *w, int n, int v) {
+  const int l = get_unsigned_bits(n);
+  const int m = (1 << l) - n;
+  if (l == 0) return;
+  if (v < m) {
+    aom_write_literal(w, v, l - 1);
+  } else {
+    aom_write_literal(w, m + ((v - m) >> 1), l - 1);
+    aom_write_literal(w, (v - m) & 1, 1);
+  }
+}
+
+static AOM_INLINE void loop_restoration_write_sb_coeffs(
+    const AV1_COMMON *const cm, MACROBLOCKD *xd, const RestorationUnitInfo *rui,
+    aom_writer *const w, int plane, FRAME_COUNTS *counts);
+
+static AOM_INLINE void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx,
+                                             const MB_MODE_INFO *mi,
+                                             const MB_MODE_INFO *above_mi,
+                                             const MB_MODE_INFO *left_mi,
+                                             PREDICTION_MODE mode,
+                                             aom_writer *w) {
+  assert(!is_intrabc_block(mi));
+  (void)mi;
+  aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi),
+                   INTRA_MODES);
+}
+
+static AOM_INLINE void write_inter_mode(aom_writer *w, PREDICTION_MODE mode,
+                                        FRAME_CONTEXT *ec_ctx,
+                                        const int16_t mode_ctx) {
+  const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+
+  aom_write_symbol(w, mode != NEWMV, ec_ctx->newmv_cdf[newmv_ctx], 2);
+
+  if (mode != NEWMV) {
+    const int16_t zeromv_ctx =
+        (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+    aom_write_symbol(w, mode != GLOBALMV, ec_ctx->zeromv_cdf[zeromv_ctx], 2);
+
+    if (mode != GLOBALMV) {
+      int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+      aom_write_symbol(w, mode != NEARESTMV, ec_ctx->refmv_cdf[refmv_ctx], 2);
+    }
+  }
+}
+
+static AOM_INLINE void write_drl_idx(
+    FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi,
+    const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) {
+  assert(mbmi->ref_mv_idx < 3);
+
+  const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
+  if (new_mv) {
+    int idx;
+    for (idx = 0; idx < 2; ++idx) {
+      if (mbmi_ext_frame->ref_mv_count > idx + 1) {
+        uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, idx);
+
+        aom_write_symbol(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_cdf[drl_ctx],
+                         2);
+        if (mbmi->ref_mv_idx == idx) return;
+      }
+    }
+    return;
+  }
+
+  if (have_nearmv_in_inter_mode(mbmi->mode)) {
+    int idx;
+    // TODO(jingning): Temporary solution to compensate the NEARESTMV offset.
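+    // Illustrative trace (assumed values): with ref_mv_count = 4 and
+    // ref_mv_idx = 1, the loop below writes a 1 at idx = 1 and a 0 at
+    // idx = 2, telling the decoder to stop at the second NEARMV candidate.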
+ for (idx = 1; idx < 3; ++idx) { + if (mbmi_ext_frame->ref_mv_count > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, idx); + aom_write_symbol(w, mbmi->ref_mv_idx != (idx - 1), + ec_ctx->drl_cdf[drl_ctx], 2); + if (mbmi->ref_mv_idx == (idx - 1)) return; + } + } + return; + } +} + +static AOM_INLINE void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w, + PREDICTION_MODE mode, + const int16_t mode_ctx) { + assert(is_inter_compound_mode(mode)); + aom_write_symbol(w, INTER_COMPOUND_OFFSET(mode), + xd->tile_ctx->inter_compound_mode_cdf[mode_ctx], + INTER_COMPOUND_MODES); +} + +static AOM_INLINE void write_tx_size_vartx(MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, + TX_SIZE tx_size, int depth, + int blk_row, int blk_col, + aom_writer *w) { + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; + const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0); + const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + if (depth == MAX_VARTX_DEPTH) { + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + return; + } + + const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, + mbmi->sb_type, tx_size); + const int txb_size_index = + av1_get_txb_size_index(mbmi->sb_type, blk_row, blk_col); + const int write_txfm_partition = + tx_size == mbmi->inter_tx_size[txb_size_index]; + if (write_txfm_partition) { + aom_write_symbol(w, 0, ec_ctx->txfm_partition_cdf[ctx], 2); + + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + // TODO(yuec): set correct txfm partition update for qttx + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + + aom_write_symbol(w, 1, ec_ctx->txfm_partition_cdf[ctx], 2); + + if (sub_txs == TX_4X4) { + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, sub_txs, tx_size); + return; + } + + assert(bsw > 0 && bsh > 0); + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + int offsetr = blk_row + row; + int offsetc = blk_col + col; + write_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, w); + } + } +} + +static AOM_INLINE void write_selected_tx_size(const MACROBLOCKD *xd, + aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + if (block_signals_txsize(bsize)) { + const TX_SIZE tx_size = mbmi->tx_size; + const int tx_size_ctx = get_tx_size_context(xd); + const int depth = tx_size_to_depth(tx_size, bsize); + const int max_depths = bsize_to_max_depth(bsize); + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + + assert(depth >= 0 && depth <= max_depths); + assert(!is_inter_block(mbmi)); + assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi))); + + aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], + max_depths + 1); + } +} + +static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, const MB_MODE_INFO *mi, aom_writer *w) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 1; + } else { + const int skip = mi->skip; + const int ctx = av1_get_skip_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + 
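+    // The skip flag costs one binary symbol coded against a context-selected
+    // CDF; segments with SEG_LVL_SKIP active return 1 above without spending
+    // any bits.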
aom_write_symbol(w, skip, ec_ctx->skip_cdfs[ctx], 2); + return skip; + } +} + +static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, const MB_MODE_INFO *mi, + aom_writer *w) { + if (!cm->current_frame.skip_mode_info.skip_mode_flag) return 0; + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 0; + } + const int skip_mode = mi->skip_mode; + if (!is_comp_ref_allowed(mi->sb_type)) { + assert(!skip_mode); + return 0; + } + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + // These features imply single-reference mode, while skip mode implies + // compound reference. Hence, the two are mutually exclusive. + // In other words, skip_mode is implicitly 0 here. + assert(!skip_mode); + return 0; + } + const int ctx = av1_get_skip_mode_context(xd); + aom_write_symbol(w, skip_mode, xd->tile_ctx->skip_mode_cdfs[ctx], 2); + return skip_mode; +} + +static AOM_INLINE void write_is_inter(const AV1_COMMON *cm, + const MACROBLOCKD *xd, int segment_id, + aom_writer *w, const int is_inter) { + if (!segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + assert(is_inter); + return; + } + const int ctx = av1_get_intra_inter_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, is_inter, ec_ctx->intra_inter_cdf[ctx], 2); + } +} + +static AOM_INLINE void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, + aom_writer *w) { + MOTION_MODE last_motion_mode_allowed = + cm->features.switchable_motion_mode + ? motion_mode_allowed(cm->global_motion, xd, mbmi, + cm->features.allow_warped_motion) + : SIMPLE_TRANSLATION; + assert(mbmi->motion_mode <= last_motion_mode_allowed); + switch (last_motion_mode_allowed) { + case SIMPLE_TRANSLATION: break; + case OBMC_CAUSAL: + aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL, + xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2); + break; + default: + aom_write_symbol(w, mbmi->motion_mode, + xd->tile_ctx->motion_mode_cdf[mbmi->sb_type], + MOTION_MODES); + } +} + +static AOM_INLINE void write_delta_qindex(const MACROBLOCKD *xd, + int delta_qindex, aom_writer *w) { + int sign = delta_qindex < 0; + int abs = sign ? -delta_qindex : delta_qindex; + int rem_bits, thr; + int smallval = abs < DELTA_Q_SMALL ? 1 : 0; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + aom_write_symbol(w, AOMMIN(abs, DELTA_Q_SMALL), ec_ctx->delta_q_cdf, + DELTA_Q_PROBS + 1); + + if (!smallval) { + rem_bits = get_msb(abs - 1); + thr = (1 << rem_bits) + 1; + aom_write_literal(w, rem_bits - 1, 3); + aom_write_literal(w, abs - thr, rem_bits); + } + if (abs > 0) { + aom_write_bit(w, sign); + } +} + +static AOM_INLINE void write_delta_lflevel(const AV1_COMMON *cm, + const MACROBLOCKD *xd, int lf_id, + int delta_lflevel, aom_writer *w) { + int sign = delta_lflevel < 0; + int abs = sign ? -delta_lflevel : delta_lflevel; + int rem_bits, thr; + int smallval = abs < DELTA_LF_SMALL ? 1 : 0; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (cm->delta_q_info.delta_lf_multi) { + assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT + : FRAME_LF_COUNT - 2)); + aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), + ec_ctx->delta_lf_multi_cdf[lf_id], DELTA_LF_PROBS + 1); + } else { + aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf, + DELTA_LF_PROBS + 1); + } + + if (!smallval) { + rem_bits = get_msb(abs - 1); + thr = (1 << rem_bits) + 1; + aom_write_literal(w, rem_bits - 1, 3); + aom_write_literal(w, abs - thr, rem_bits); + } + if (abs > 0) { + aom_write_bit(w, sign); + } +} + +static AOM_INLINE void pack_map_tokens(aom_writer *w, const TOKENEXTRA **tp, + int n, int num) { + const TOKENEXTRA *p = *tp; + write_uniform(w, n, p->token); // The first color index. + ++p; + --num; + for (int i = 0; i < num; ++i) { + aom_write_symbol(w, p->token, p->color_map_cdf, n); + ++p; + } + *tp = p; +} + +static AOM_INLINE void pack_txb_tokens( + aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, const TOKENEXTRA **tp, + const TOKENEXTRA *const tok_end, MACROBLOCKD *xd, MB_MODE_INFO *mbmi, + int plane, BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, int block, + int blk_row, int blk_col, TX_SIZE tx_size, TOKEN_STATS *token_stats) { + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE plane_tx_size = + plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; + + if (tx_size == plane_tx_size || plane) { + av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane, block, tx_size); +#if CONFIG_RD_DEBUG + TOKEN_STATS tmp_token_stats; + init_token_stats(&tmp_token_stats); + token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost; + token_stats->cost += tmp_token_stats.cost; +#endif + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int step = bsh * bsw; + + assert(bsw > 0 && bsh > 0); + + for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) { + for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw) { + const int offsetr = blk_row + r; + const int offsetc = blk_col + c; + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + pack_txb_tokens(w, cm, x, tp, tok_end, xd, mbmi, plane, plane_bsize, + bit_depth, block, offsetr, offsetc, sub_txs, + token_stats); + block += step; + } + } + } +} + +static INLINE void set_spatial_segment_id( + const CommonModeInfoParams *const mi_params, uint8_t *segment_ids, + BLOCK_SIZE bsize, int mi_row, int mi_col, int segment_id) { + const int mi_offset = mi_row * mi_params->mi_cols + mi_col; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw); + const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh); + + for (int y = 0; y < ymis; ++y) { + for (int x = 0; x < xmis; ++x) { + segment_ids[mi_offset + y * mi_params->mi_cols + x] = segment_id; + } + } +} + +int av1_neg_interleave(int x, int ref, int max) { + assert(x < max); + const int diff = x - ref; + if (!ref) return x; + if (ref >= (max - 1)) return -x + max - 1; + if (2 * ref < max) { + if (abs(diff) <= ref) { + if (diff > 0) + return (diff << 1) - 1; + else + return ((-diff) << 1); + } + return x; + } else { + if (abs(diff) < (max - ref)) { + if 
(diff > 0) + return (diff << 1) - 1; + else + return ((-diff) << 1); + } + return (max - x) - 1; + } +} + +static AOM_INLINE void write_segment_id( + AV1_COMP *cpi, const MB_MODE_INFO *const mbmi, aom_writer *w, + const struct segmentation *seg, struct segmentation_probs *segp, int skip) { + if (!seg->enabled || !seg->update_map) return; + + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + int cdf_num; + const int pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + if (skip) { + // Still need to transmit tx size for intra blocks even if skip is + // true. Changing segment_id may make the tx size become invalid, e.g + // changing from lossless to lossy. + assert(is_inter_block(mbmi) || !cpi->enc_seg.has_lossless_segment); + + set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, + mbmi->sb_type, mi_row, mi_col, pred); + set_spatial_segment_id(&cm->mi_params, cpi->enc_seg.map, mbmi->sb_type, + mi_row, mi_col, pred); + /* mbmi is read only but we need to update segment_id */ + ((MB_MODE_INFO *)mbmi)->segment_id = pred; + return; + } + + const int coded_id = + av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1); + aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num]; + aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS); + set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->sb_type, + mi_row, mi_col, mbmi->segment_id); +} + +#define WRITE_REF_BIT(bname, pname) \ + aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(xd), 2) + +// This function encodes the reference frame +static AOM_INLINE void write_ref_frames(const AV1_COMMON *cm, + const MACROBLOCKD *xd, aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_compound = has_second_ref(mbmi); + const int segment_id = mbmi->segment_id; + + // If segment level coding of this signal is disabled... + // or the segment allows multiple reference frame options + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + assert(!is_compound); + assert(mbmi->ref_frame[0] == + get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME)); + } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + assert(!is_compound); + assert(mbmi->ref_frame[0] == LAST_FRAME); + } else { + // does the feature use compound prediction or not + // (if not specified at the frame/segment level) + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { + if (is_comp_ref_allowed(mbmi->sb_type)) + aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(xd), 2); + } else { + assert((!is_compound) == + (cm->current_frame.reference_mode == SINGLE_REFERENCE)); + } + + if (is_compound) { + const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi) + ? 
UNIDIR_COMP_REFERENCE + : BIDIR_COMP_REFERENCE; + aom_write_symbol(w, comp_ref_type, av1_get_comp_reference_type_cdf(xd), + 2); + + if (comp_ref_type == UNIDIR_COMP_REFERENCE) { + const int bit = mbmi->ref_frame[0] == BWDREF_FRAME; + WRITE_REF_BIT(bit, uni_comp_ref_p); + + if (!bit) { + assert(mbmi->ref_frame[0] == LAST_FRAME); + const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME || + mbmi->ref_frame[1] == GOLDEN_FRAME; + WRITE_REF_BIT(bit1, uni_comp_ref_p1); + if (bit1) { + const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME; + WRITE_REF_BIT(bit2, uni_comp_ref_p2); + } + } else { + assert(mbmi->ref_frame[1] == ALTREF_FRAME); + } + + return; + } + + assert(comp_ref_type == BIDIR_COMP_REFERENCE); + + const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME || + mbmi->ref_frame[0] == LAST3_FRAME); + WRITE_REF_BIT(bit, comp_ref_p); + + if (!bit) { + const int bit1 = mbmi->ref_frame[0] == LAST2_FRAME; + WRITE_REF_BIT(bit1, comp_ref_p1); + } else { + const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME; + WRITE_REF_BIT(bit2, comp_ref_p2); + } + + const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME; + WRITE_REF_BIT(bit_bwd, comp_bwdref_p); + + if (!bit_bwd) { + WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, comp_bwdref_p1); + } + + } else { + const int bit0 = (mbmi->ref_frame[0] <= ALTREF_FRAME && + mbmi->ref_frame[0] >= BWDREF_FRAME); + WRITE_REF_BIT(bit0, single_ref_p1); + + if (bit0) { + const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME; + WRITE_REF_BIT(bit1, single_ref_p2); + + if (!bit1) { + WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6); + } + } else { + const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME || + mbmi->ref_frame[0] == GOLDEN_FRAME); + WRITE_REF_BIT(bit2, single_ref_p3); + + if (!bit2) { + const int bit3 = mbmi->ref_frame[0] != LAST_FRAME; + WRITE_REF_BIT(bit3, single_ref_p4); + } else { + const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME; + WRITE_REF_BIT(bit4, single_ref_p5); + } + } + } + } +} + +static AOM_INLINE void write_filter_intra_mode_info( + const AV1_COMMON *cm, const MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, + aom_writer *w) { + if (av1_filter_intra_allowed(cm, mbmi)) { + aom_write_symbol(w, mbmi->filter_intra_mode_info.use_filter_intra, + xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2); + if (mbmi->filter_intra_mode_info.use_filter_intra) { + const FILTER_INTRA_MODE mode = + mbmi->filter_intra_mode_info.filter_intra_mode; + aom_write_symbol(w, mode, xd->tile_ctx->filter_intra_mode_cdf, + FILTER_INTRA_MODES); + } + } +} + +static AOM_INLINE void write_angle_delta(aom_writer *w, int angle_delta, + aom_cdf_prob *cdf) { + aom_write_symbol(w, angle_delta + MAX_ANGLE_DELTA, cdf, + 2 * MAX_ANGLE_DELTA + 1); +} + +static AOM_INLINE void write_mb_interp_filter(AV1_COMMON *const cm, + const MACROBLOCKD *xd, + aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (!av1_is_interp_needed(xd)) { + int_interpfilters filters = av1_broadcast_interp_filter( + av1_unswitchable_filter(cm->features.interp_filter)); + assert(mbmi->interp_filters.as_int == filters.as_int); + (void)filters; + return; + } + if (cm->features.interp_filter == SWITCHABLE) { + int dir; + for (dir = 0; dir < 2; ++dir) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + InterpFilter filter = + av1_extract_interp_filter(mbmi->interp_filters, dir); + aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx], + SWITCHABLE_FILTERS); + ++cm->cur_frame->interp_filter_selected[filter]; + if 
(cm->seq_params.enable_dual_filter == 0) return; + } + } +} + +// Transmit color values with delta encoding. Write the first value as +// literal, and the deltas between each value and the previous one. "min_val" is +// the smallest possible value of the deltas. +static AOM_INLINE void delta_encode_palette_colors(const int *colors, int num, + int bit_depth, int min_val, + aom_writer *w) { + if (num <= 0) return; + assert(colors[0] < (1 << bit_depth)); + aom_write_literal(w, colors[0], bit_depth); + if (num == 1) return; + int max_delta = 0; + int deltas[PALETTE_MAX_SIZE]; + memset(deltas, 0, sizeof(deltas)); + for (int i = 1; i < num; ++i) { + assert(colors[i] < (1 << bit_depth)); + const int delta = colors[i] - colors[i - 1]; + deltas[i - 1] = delta; + assert(delta >= min_val); + if (delta > max_delta) max_delta = delta; + } + const int min_bits = bit_depth - 3; + int bits = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits); + assert(bits <= bit_depth); + int range = (1 << bit_depth) - colors[0] - min_val; + aom_write_literal(w, bits - min_bits, 2); + for (int i = 0; i < num - 1; ++i) { + aom_write_literal(w, deltas[i] - min_val, bits); + range -= deltas[i]; + bits = AOMMIN(bits, av1_ceil_log2(range)); + } +} + +// Transmit luma palette color values. First signal if each color in the color +// cache is used. Those colors that are not in the cache are transmitted with +// delta encoding. +static AOM_INLINE void write_palette_colors_y( + const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi, + int bit_depth, aom_writer *w) { + const int n = pmi->palette_size[0]; + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); + int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = + av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n, + cache_color_found, out_cache_colors); + int n_in_cache = 0; + for (int i = 0; i < n_cache && n_in_cache < n; ++i) { + const int found = cache_color_found[i]; + aom_write_bit(w, found); + n_in_cache += found; + } + assert(n_in_cache + n_out_cache == n); + delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 1, w); +} + +// Write chroma palette color values. U channel is handled similarly to the luma +// channel. For v channel, either use delta encoding or transmit raw values +// directly, whichever costs less. +static AOM_INLINE void write_palette_colors_uv( + const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi, + int bit_depth, aom_writer *w) { + const int n = pmi->palette_size[1]; + const uint16_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE; + const uint16_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE; + // U channel colors. + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 1, color_cache); + int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = av1_index_color_cache( + color_cache, n_cache, colors_u, n, cache_color_found, out_cache_colors); + int n_in_cache = 0; + for (int i = 0; i < n_cache && n_in_cache < n; ++i) { + const int found = cache_color_found[i]; + aom_write_bit(w, found); + n_in_cache += found; + } + delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 0, w); + + // V channel colors. Don't use color cache as the colors are not sorted. 
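+  // Illustrative cost check (assumed values): with bit_depth = 8, n = 4,
+  // bits_v = 3 and zero_count = 1, delta coding costs
+  // 2 + 8 + (3 + 1) * (4 - 1) - 1 = 21 bits versus 8 * 4 = 32 bits raw,
+  // so the delta path below wins.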
+ const int max_val = 1 << bit_depth; + int zero_count = 0, min_bits_v = 0; + int bits_v = + av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v); + const int rate_using_delta = + 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count; + const int rate_using_raw = bit_depth * n; + if (rate_using_delta < rate_using_raw) { // delta encoding + assert(colors_v[0] < (1 << bit_depth)); + aom_write_bit(w, 1); + aom_write_literal(w, bits_v - min_bits_v, 2); + aom_write_literal(w, colors_v[0], bit_depth); + for (int i = 1; i < n; ++i) { + assert(colors_v[i] < (1 << bit_depth)); + if (colors_v[i] == colors_v[i - 1]) { // No need to signal sign bit. + aom_write_literal(w, 0, bits_v); + continue; + } + const int delta = abs((int)colors_v[i] - colors_v[i - 1]); + const int sign_bit = colors_v[i] < colors_v[i - 1]; + if (delta <= max_val - delta) { + aom_write_literal(w, delta, bits_v); + aom_write_bit(w, sign_bit); + } else { + aom_write_literal(w, max_val - delta, bits_v); + aom_write_bit(w, !sign_bit); + } + } + } else { // Transmit raw values. + aom_write_bit(w, 0); + for (int i = 0; i < n; ++i) { + assert(colors_v[i] < (1 << bit_depth)); + aom_write_literal(w, colors_v[i], bit_depth); + } + } +} + +static AOM_INLINE void write_palette_mode_info(const AV1_COMMON *cm, + const MACROBLOCKD *xd, + const MB_MODE_INFO *const mbmi, + aom_writer *w) { + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE bsize = mbmi->sb_type; + assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize)); + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + + if (mbmi->mode == DC_PRED) { + const int n = pmi->palette_size[0]; + const int palette_y_mode_ctx = av1_get_palette_mode_ctx(xd); + aom_write_symbol( + w, n > 0, + xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_y_mode_ctx], 2); + if (n > 0) { + aom_write_symbol(w, n - PALETTE_MIN_SIZE, + xd->tile_ctx->palette_y_size_cdf[bsize_ctx], + PALETTE_SIZES); + write_palette_colors_y(xd, pmi, cm->seq_params.bit_depth, w); + } + } + + const int uv_dc_pred = + num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref; + if (uv_dc_pred) { + const int n = pmi->palette_size[1]; + const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); + aom_write_symbol(w, n > 0, + xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2); + if (n > 0) { + aom_write_symbol(w, n - PALETTE_MIN_SIZE, + xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], + PALETTE_SIZES); + write_palette_colors_uv(xd, pmi, cm->seq_params.bit_depth, w); + } + } +} + +void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, + TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w) { + MB_MODE_INFO *mbmi = xd->mi[0]; + const FeatureFlags *const features = &cm->features; + const int is_inter = is_inter_block(mbmi); + if (get_ext_tx_types(tx_size, is_inter, features->reduced_tx_set_used) > 1 && + ((!cm->seg.enabled && cm->quant_params.base_qindex > 0) || + (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && + !mbmi->skip && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + const TxSetType tx_set_type = av1_get_ext_tx_set_type( + tx_size, is_inter, features->reduced_tx_set_used); + const int eset = + get_ext_tx_set(tx_size, is_inter, features->reduced_tx_set_used); + // eset == 0 should correspond to a set with only DCT_DCT and there + // is no need to send the tx_type + 
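+    // A tx_type symbol is therefore only coded when the active set offers a
+    // real choice; what is written below is the index of tx_type within that
+    // set, not the raw TX_TYPE value.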
assert(eset > 0); + assert(av1_ext_tx_used[tx_set_type][tx_type]); + if (is_inter) { + aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type], + ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], + av1_num_ext_tx_set[tx_set_type]); + } else { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = + fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]; + else + intra_dir = mbmi->mode; + aom_write_symbol( + w, av1_ext_tx_ind[tx_set_type][tx_type], + ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir], + av1_num_ext_tx_set[tx_set_type]); + } + } +} + +static AOM_INLINE void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx, + BLOCK_SIZE bsize, + PREDICTION_MODE mode, + aom_writer *w) { + aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group_lookup[bsize]], + INTRA_MODES); +} + +static AOM_INLINE void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx, + UV_PREDICTION_MODE uv_mode, + PREDICTION_MODE y_mode, + CFL_ALLOWED_TYPE cfl_allowed, + aom_writer *w) { + aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[cfl_allowed][y_mode], + UV_INTRA_MODES - !cfl_allowed); +} + +static AOM_INLINE void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, + uint8_t idx, int8_t joint_sign, + aom_writer *w) { + aom_write_symbol(w, joint_sign, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS); + // Magnitudes are only signaled for nonzero codes. + if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; + aom_write_symbol(w, CFL_IDX_U(idx), cdf_u, CFL_ALPHABET_SIZE); + } + if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; + aom_write_symbol(w, CFL_IDX_V(idx), cdf_v, CFL_ALPHABET_SIZE); + } +} + +static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, + aom_writer *w, int skip) { + if (cm->features.coded_lossless || cm->features.allow_intrabc) return; + + // At the start of a superblock, mark that we haven't yet written CDEF + // strengths for any of the CDEF units contained in this superblock. + const int sb_mask = (cm->seq_params.mib_size - 1); + const int mi_row_in_sb = (xd->mi_row & sb_mask); + const int mi_col_in_sb = (xd->mi_col & sb_mask); + if (mi_row_in_sb == 0 && mi_col_in_sb == 0) { + xd->cdef_transmitted[0] = xd->cdef_transmitted[1] = + xd->cdef_transmitted[2] = xd->cdef_transmitted[3] = false; + } + + // CDEF unit size is 64x64 irrespective of the superblock size. + const int cdef_size = 1 << (6 - MI_SIZE_LOG2); + + // Find index of this CDEF unit in this superblock. + const int index_mask = cdef_size; + const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0); + const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0); + const int index = (cm->seq_params.sb_size == BLOCK_128X128) + ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb + : 0; + + // Write CDEF strength to the first non-skip coding block in this CDEF unit. + if (!xd->cdef_transmitted[index] && !skip) { + // CDEF strength for this CDEF unit needs to be stored in the MB_MODE_INFO + // of the 1st block in this CDEF unit. 
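+    // Worked example of the masking below (assuming MI_SIZE_LOG2 == 2):
+    // cdef_size is 1 << 4 == 16 mi units (64 luma pixels), first_block_mask
+    // is ~15, and e.g. mi_row == 21 maps to 21 & ~15 == 16, the row of the
+    // first block in this CDEF unit.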
+ const int first_block_mask = ~(cdef_size - 1); + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int grid_idx = + get_mi_grid_idx(mi_params, xd->mi_row & first_block_mask, + xd->mi_col & first_block_mask); + const MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx]; + aom_write_literal(w, mbmi->cdef_strength, cm->cdef_info.cdef_bits); + xd->cdef_transmitted[index] = true; + } +} + +static AOM_INLINE void write_inter_segment_id( + AV1_COMP *cpi, aom_writer *w, const struct segmentation *const seg, + struct segmentation_probs *const segp, int skip, int preskip) { + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + AV1_COMMON *const cm = &cpi->common; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + if (seg->update_map) { + if (preskip) { + if (!seg->segid_preskip) return; + } else { + if (seg->segid_preskip) return; + if (skip) { + write_segment_id(cpi, mbmi, w, seg, segp, 1); + if (seg->temporal_update) mbmi->seg_id_predicted = 0; + return; + } + } + if (seg->temporal_update) { + const int pred_flag = mbmi->seg_id_predicted; + aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd); + aom_write_symbol(w, pred_flag, pred_cdf, 2); + if (!pred_flag) { + write_segment_id(cpi, mbmi, w, seg, segp, 0); + } + if (pred_flag) { + set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, + mbmi->sb_type, mi_row, mi_col, mbmi->segment_id); + } + } else { + write_segment_id(cpi, mbmi, w, seg, segp, 0); + } + } +} + +// If delta q is present, writes delta_q index. +// Also writes delta_q loop filter levels, if present. +static AOM_INLINE void write_delta_q_params(AV1_COMP *cpi, int skip, + aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + + if (delta_q_info->delta_q_present_flag) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int super_block_upper_left = + ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) && + ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0); + + if ((bsize != cm->seq_params.sb_size || skip == 0) && + super_block_upper_left) { + assert(mbmi->current_qindex > 0); + const int reduced_delta_qindex = + (mbmi->current_qindex - xd->current_qindex) / + delta_q_info->delta_q_res; + write_delta_qindex(xd, reduced_delta_qindex, w); + xd->current_qindex = mbmi->current_qindex; + if (delta_q_info->delta_lf_present_flag) { + if (delta_q_info->delta_lf_multi) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + int reduced_delta_lflevel = + (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / + delta_q_info->delta_lf_res; + write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w); + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; + } + } else { + int reduced_delta_lflevel = + (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / + delta_q_info->delta_lf_res; + write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w); + xd->delta_lf_from_base = mbmi->delta_lf_from_base; + } + } + } + } +} + +static AOM_INLINE void write_intra_prediction_modes(AV1_COMP *cpi, + int is_keyframe, + aom_writer *w) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const PREDICTION_MODE mode = mbmi->mode; + const BLOCK_SIZE bsize = mbmi->sb_type; + + // Y mode. + if (is_keyframe) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + write_intra_y_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w); + } else { + write_intra_y_mode_nonkf(ec_ctx, bsize, mode, w); + } + + // Y angle delta. + const int use_angle_delta = av1_use_angle_delta(bsize); + if (use_angle_delta && av1_is_directional_mode(mode)) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y], + ec_ctx->angle_delta_cdf[mode - V_PRED]); + } + + // UV mode and UV angle delta. + if (!cm->seq_params.monochrome && xd->is_chroma_ref) { + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); + if (uv_mode == UV_CFL_PRED) + write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); + if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV], + ec_ctx->angle_delta_cdf[uv_mode - V_PRED]); + } + } + + // Palette. + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) { + write_palette_mode_info(cm, xd, mbmi, w); + } + + // Filter intra. + write_filter_intra_mode_info(cm, xd, mbmi, w); +} + +static INLINE int16_t mode_context_analyzer( + const int16_t mode_context, const MV_REFERENCE_FRAME *const rf) { + if (rf[1] <= INTRA_FRAME) return mode_context; + + const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK; + const int16_t refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; + + const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN( + newmv_ctx, COMP_NEWMV_CTXS - 1)]; + return comp_ctx; +} + +static INLINE int_mv get_ref_mv_from_stack( + int ref_idx, const MV_REFERENCE_FRAME *ref_frame, int ref_mv_idx, + const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame) { + const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext_frame->ref_mv_stack; + + if (ref_frame[1] > INTRA_FRAME) { + assert(ref_idx == 0 || ref_idx == 1); + return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv + : curr_ref_mv_stack[ref_mv_idx].this_mv; + } + + assert(ref_idx == 0); + return ref_mv_idx < mbmi_ext_frame->ref_mv_count + ? 
curr_ref_mv_stack[ref_mv_idx].this_mv + : mbmi_ext_frame->global_mvs[ref_frame_type]; +} + +static INLINE int_mv get_ref_mv(const MACROBLOCK *x, int ref_idx) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int ref_mv_idx = mbmi->ref_mv_idx; + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { + assert(has_second_ref(mbmi)); + ref_mv_idx += 1; + } + return get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx, + x->mbmi_ext_frame); +} + +static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = x->mbmi_ext_frame; + const PREDICTION_MODE mode = mbmi->mode; + const int segment_id = mbmi->segment_id; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int allow_hp = cm->features.allow_high_precision_mv; + const int is_inter = is_inter_block(mbmi); + const int is_compound = has_second_ref(mbmi); + int ref; + + write_inter_segment_id(cpi, w, seg, segp, 0, 1); + + write_skip_mode(cm, xd, segment_id, mbmi, w); + + assert(IMPLIES(mbmi->skip_mode, mbmi->skip)); + const int skip = + mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w); + + write_inter_segment_id(cpi, w, seg, segp, skip, 0); + + write_cdef(cm, xd, w, skip); + + write_delta_q_params(cpi, skip, w); + + if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter); + + if (mbmi->skip_mode) return; + + if (!is_inter) { + write_intra_prediction_modes(cpi, 0, w); + } else { + int16_t mode_ctx; + + av1_collect_neighbors_ref_counts(xd); + + write_ref_frames(cm, xd, w); + + mode_ctx = + mode_context_analyzer(mbmi_ext_frame->mode_context, mbmi->ref_frame); + + // If segment skip is not enabled code the mode. 
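+    // For intuition: compound modes such as NEW_NEWMV take the
+    // write_inter_compound_mode path below, single-reference modes such as
+    // NEWMV take write_inter_mode, and a DRL index is only coded for modes
+    // that can choose among multiple reference MV candidates.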
+ if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { + if (is_inter_compound_mode(mode)) + write_inter_compound_mode(xd, w, mode, mode_ctx); + else if (is_inter_singleref_mode(mode)) + write_inter_mode(w, mode, ec_ctx, mode_ctx); + + if (mode == NEWMV || mode == NEW_NEWMV || have_nearmv_in_inter_mode(mode)) + write_drl_idx(ec_ctx, mbmi, mbmi_ext_frame, w); + else + assert(mbmi->ref_mv_idx == 0); + } + + if (mode == NEWMV || mode == NEW_NEWMV) { + for (ref = 0; ref < 1 + is_compound; ++ref) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = get_ref_mv(x, ref); + av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc, + allow_hp); + } + } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = get_ref_mv(x, 1); + av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, allow_hp); + } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = get_ref_mv(x, 0); + av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, allow_hp); + } + + if (cpi->common.current_frame.reference_mode != COMPOUND_REFERENCE && + cpi->common.seq_params.enable_interintra_compound && + is_interintra_allowed(mbmi)) { + const int interintra = mbmi->ref_frame[1] == INTRA_FRAME; + const int bsize_group = size_group_lookup[bsize]; + aom_write_symbol(w, interintra, ec_ctx->interintra_cdf[bsize_group], 2); + if (interintra) { + aom_write_symbol(w, mbmi->interintra_mode, + ec_ctx->interintra_mode_cdf[bsize_group], + INTERINTRA_MODES); + if (av1_is_wedge_used(bsize)) { + aom_write_symbol(w, mbmi->use_wedge_interintra, + ec_ctx->wedge_interintra_cdf[bsize], 2); + if (mbmi->use_wedge_interintra) { + aom_write_symbol(w, mbmi->interintra_wedge_index, + ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES); + } + } + } + } + + if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mbmi, w); + + // First write idx to indicate current compound inter prediction mode group + // Group A (0): dist_wtd_comp, compound_average + // Group B (1): interintra, compound_diffwtd, wedge + if (has_second_ref(mbmi)) { + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params.enable_masked_compound; + + if (masked_compound_used) { + const int ctx_comp_group_idx = get_comp_group_idx_context(xd); + aom_write_symbol(w, mbmi->comp_group_idx, + ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2); + } else { + assert(mbmi->comp_group_idx == 0); + } + + if (mbmi->comp_group_idx == 0) { + if (mbmi->compound_idx) + assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE); + + if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) { + const int comp_index_ctx = get_comp_index_context(cm, xd); + aom_write_symbol(w, mbmi->compound_idx, + ec_ctx->compound_index_cdf[comp_index_ctx], 2); + } else { + assert(mbmi->compound_idx == 1); + } + } else { + assert(cpi->common.current_frame.reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION); + assert(masked_compound_used); + // compound_diffwtd, wedge + assert(mbmi->interinter_comp.type == COMPOUND_WEDGE || + mbmi->interinter_comp.type == COMPOUND_DIFFWTD); + + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) + aom_write_symbol(w, mbmi->interinter_comp.type - COMPOUND_WEDGE, + ec_ctx->compound_type_cdf[bsize], + MASKED_COMPOUND_TYPES); + + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); + 
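+          // Illustrative note: the wedge index selects one of
+          // MAX_WEDGE_TYPES (16) predefined oblique partitions of the
+          // block, and the sign bit written after it swaps which predictor
+          // covers which side of the wedge.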
aom_write_symbol(w, mbmi->interinter_comp.wedge_index, + ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES); + aom_write_bit(w, mbmi->interinter_comp.wedge_sign); + } else { + assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD); + aom_write_literal(w, mbmi->interinter_comp.mask_type, + MAX_DIFFWTD_MASK_BITS); + } + } + } + write_mb_interp_filter(cm, xd, w); + } +} + +static AOM_INLINE void write_intrabc_info( + MACROBLOCKD *xd, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, + aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + int use_intrabc = is_intrabc_block(mbmi); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, use_intrabc, ec_ctx->intrabc_cdf, 2); + if (use_intrabc) { + assert(mbmi->mode == DC_PRED); + assert(mbmi->uv_mode == UV_DC_PRED); + assert(mbmi->motion_mode == SIMPLE_TRANSLATION); + int_mv dv_ref = mbmi_ext_frame->ref_mv_stack[0].this_mv; + av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc); + } +} + +static AOM_INLINE void write_mb_modes_kf( + AV1_COMP *cpi, MACROBLOCKD *xd, + const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + + if (seg->segid_preskip && seg->update_map) + write_segment_id(cpi, mbmi, w, seg, segp, 0); + + const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w); + + if (!seg->segid_preskip && seg->update_map) + write_segment_id(cpi, mbmi, w, seg, segp, skip); + + write_cdef(cm, xd, w, skip); + + write_delta_q_params(cpi, skip, w); + + if (av1_allow_intrabc(cm)) { + write_intrabc_info(xd, mbmi_ext_frame, w); + if (is_intrabc_block(mbmi)) return; + } + + write_intra_prediction_modes(cpi, 1, w); +} + +#if CONFIG_RD_DEBUG +static AOM_INLINE void dump_mode_info(MB_MODE_INFO *mi) { + printf("\nmi->mi_row == %d\n", mi->mi_row); + printf("&& mi->mi_col == %d\n", mi->mi_col); + printf("&& mi->sb_type == %d\n", mi->sb_type); + printf("&& mi->tx_size == %d\n", mi->tx_size); + printf("&& mi->mode == %d\n", mi->mode); +} + +static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, + int plane) { + if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) { + int r, c; + printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n", + plane, rd_stats->txb_coeff_cost[plane], token_stats->cost); + printf("rd txb_coeff_cost_map\n"); + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { + printf("%d ", rd_stats->txb_coeff_cost_map[plane][r][c]); + } + printf("\n"); + } + + printf("pack txb_coeff_cost_map\n"); + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { + printf("%d ", token_stats->txb_coeff_cost_map[r][c]); + } + printf("\n"); + } + return 1; + } + return 0; +} +#endif + +#if ENC_MISMATCH_DEBUG +static AOM_INLINE void enc_dump_logs( + const AV1_COMMON *const cm, + const MBMIExtFrameBufferInfo *const mbmi_ext_info, int mi_row, int mi_col) { + const MB_MODE_INFO *const mbmi = *( + cm->mi_params.mi_grid_base + (mi_row * cm->mi_params.mi_stride + mi_col)); + const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = + mbmi_ext_info->frame_base + get_mi_ext_idx(mi_row, mi_col, + cm->mi_params.mi_alloc_bsize, + mbmi_ext_info->stride); + if (is_inter_block(mbmi)) { +#define FRAME_TO_CHECK 11 + if (cm->current_frame.frame_number == FRAME_TO_CHECK && + cm->show_frame == 1) 
{ + const BLOCK_SIZE bsize = mbmi->sb_type; + + int_mv mv[2] = { 0 }; + const int is_comp_ref = has_second_ref(mbmi); + + for (int ref = 0; ref < 1 + is_comp_ref; ++ref) + mv[ref].as_mv = mbmi->mv[ref].as_mv; + + if (!is_comp_ref) { + mv[1].as_int = 0; + } + + const int16_t mode_ctx = + is_comp_ref ? 0 + : mode_context_analyzer(mbmi_ext_frame->mode_context, + mbmi->ref_frame); + + const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; + int16_t zeromv_ctx = -1; + int16_t refmv_ctx = -1; + + if (mbmi->mode != NEWMV) { + zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + if (mbmi->mode != GLOBALMV) + refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; + } + + printf( + "=== ENCODER ===: " + "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, " + "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, " + "ref[1]=%d, motion_mode=%d, mode_ctx=%d, " + "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n", + cm->current_frame.frame_number, mi_row, mi_col, mbmi->skip_mode, + mbmi->mode, bsize, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col, + mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0], + mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx, + zeromv_ctx, refmv_ctx, mbmi->tx_size); + } + } +} +#endif // ENC_MISMATCH_DEBUG + +static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + MB_MODE_INFO *m = xd->mi[0]; + + if (frame_is_intra_only(cm)) { + write_mb_modes_kf(cpi, xd, cpi->td.mb.mbmi_ext_frame, w); + } else { + // has_subpel_mv_component needs the ref frame buffers set up to look + // up if they are scaled. has_subpel_mv_component is in turn needed by + // write_switchable_interp_filter, which is called by pack_inter_mode_mvs. 
+ set_ref_ptrs(cm, xd, m->ref_frame[0], m->ref_frame[1]); + +#if ENC_MISMATCH_DEBUG + enc_dump_logs(cm, &cpi->mbmi_ext_info, xd->mi_row, xd->mi_col); +#endif // ENC_MISMATCH_DEBUG + + pack_inter_mode_mvs(cpi, w); + } +} + +static AOM_INLINE void write_inter_txb_coeff( + AV1_COMMON *const cm, MACROBLOCK *const x, MB_MODE_INFO *const mbmi, + aom_writer *w, const TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, + TOKEN_STATS *token_stats, const int row, const int col, int *block, + const int plane) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bsize = mbmi->sb_type; + assert(bsize < BLOCK_SIZES_ALL); + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); + const int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + const int bkw = tx_size_wide_unit[max_tx_size]; + const int bkh = tx_size_high_unit[max_tx_size]; + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, ss_x, ss_y); + const int num_4x4_w = mi_size_wide[plane_bsize]; + const int num_4x4_h = mi_size_high[plane_bsize]; + const int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + const int mu_blocks_high = mi_size_high[max_unit_bsize]; + const int unit_height = AOMMIN(mu_blocks_high + (row >> ss_y), num_4x4_h); + const int unit_width = AOMMIN(mu_blocks_wide + (col >> ss_x), num_4x4_w); + for (int blk_row = row >> ss_y; blk_row < unit_height; blk_row += bkh) { + for (int blk_col = col >> ss_x; blk_col < unit_width; blk_col += bkw) { + pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize, + cm->seq_params.bit_depth, *block, blk_row, blk_col, + max_tx_size, token_stats); + *block += step; + } + } +} + +static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, aom_writer *w, + const TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + + assert(!mbmi->skip); + + const int is_inter = is_inter_block(mbmi); + if (!is_inter) { + av1_write_coeffs_mb(cm, x, w, bsize); + } else { + int block[MAX_MB_PLANE] = { 0 }; + assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + const int num_4x4_w = mi_size_wide[bsize]; + const int num_4x4_h = mi_size_high[bsize]; + TOKEN_STATS token_stats; + init_token_stats(&token_stats); + + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + assert(max_unit_bsize == get_plane_block_size(BLOCK_64X64, + xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + mu_blocks_wide = AOMMIN(num_4x4_w, mu_blocks_wide); + mu_blocks_high = AOMMIN(num_4x4_h, mu_blocks_high); + + const int num_planes = av1_num_planes(cm); + for (int row = 0; row < num_4x4_h; row += mu_blocks_high) { + for (int col = 0; col < num_4x4_w; col += mu_blocks_wide) { + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + write_inter_txb_coeff(cm, x, mbmi, w, tok, tok_end, &token_stats, row, + col, &block[plane], plane); + } + } + } +#if CONFIG_RD_DEBUG + for (int plane = 0; plane < num_planes; ++plane) { + if (mbmi->sb_type 
>= BLOCK_8X8 && + rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) { + dump_mode_info(mbmi); + assert(0); + } + } +#endif // CONFIG_RD_DEBUG + } +} + +static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, + aom_writer *w, const TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end, + int mi_row, int mi_col) { + const AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + const int grid_idx = mi_row * mi_params->mi_stride + mi_col; + xd->mi = mi_params->mi_grid_base + grid_idx; + cpi->td.mb.mbmi_ext_frame = + cpi->mbmi_ext_info.frame_base + + get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize, + cpi->mbmi_ext_info.stride); + xd->tx_type_map = mi_params->tx_type_map + grid_idx; + xd->tx_type_map_stride = mi_params->mi_stride; + + const MB_MODE_INFO *mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + assert(bsize <= cm->seq_params.sb_size || + (bsize >= BLOCK_SIZES && bsize < BLOCK_SIZES_ALL)); + + const int bh = mi_size_high[bsize]; + const int bw = mi_size_wide[bsize]; + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows, + mi_params->mi_cols); + + xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + write_mbmi_b(cpi, w); + + for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) { + const uint8_t palette_size_plane = + mbmi->palette_mode_info.palette_size[plane]; + assert(!mbmi->skip_mode || !palette_size_plane); + if (palette_size_plane > 0) { + assert(mbmi->use_intrabc == 0); + assert(av1_allow_palette(cm->features.allow_screen_content_tools, + mbmi->sb_type)); + assert(!plane || xd->is_chroma_ref); + int rows, cols; + av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows, + &cols); + assert(*tok < tok_end); + pack_map_tokens(w, tok, palette_size_plane, rows * cols); + } + } + + const int is_inter_tx = is_inter_block(mbmi); + const int skip = mbmi->skip; + const int segment_id = mbmi->segment_id; + if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) && + !(is_inter_tx && skip) && !xd->lossless[segment_id]) { + if (is_inter_tx) { // This implies skip flag is 0. 
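+      // For intuition (hypothetical shape): if the maximum transform unit
+      // covered a quarter of the block, the double loop below would visit
+      // four units, and write_tx_size_vartx would recursively signal any
+      // further splits inside each unit.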
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0); + const int txbh = tx_size_high_unit[max_tx_size]; + const int txbw = tx_size_wide_unit[max_tx_size]; + const int width = mi_size_wide[bsize]; + const int height = mi_size_high[bsize]; + for (int idy = 0; idy < height; idy += txbh) { + for (int idx = 0; idx < width; idx += txbw) { + write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w); + } + } + } else { + write_selected_tx_size(xd, w); + set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, 0, xd); + } + } else { + set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, skip && is_inter_tx, + xd); + } + + if (!mbmi->skip) { + write_tokens_b(cpi, w, tok, tok_end); + } +} + +static AOM_INLINE void write_partition(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, int hbs, + int mi_row, int mi_col, PARTITION_TYPE p, + BLOCK_SIZE bsize, aom_writer *w) { + const int is_partition_point = bsize >= BLOCK_8X8; + + if (!is_partition_point) return; + + const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols; + const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (!has_rows && !has_cols) { + assert(p == PARTITION_SPLIT); + return; + } + + if (has_rows && has_cols) { + aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], + partition_cdf_length(bsize)); + } else if (!has_rows && has_cols) { + assert(p == PARTITION_SPLIT || p == PARTITION_HORZ); + assert(bsize > BLOCK_8X8); + aom_cdf_prob cdf[2]; + partition_gather_vert_alike(cdf, ec_ctx->partition_cdf[ctx], bsize); + aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2); + } else { + assert(has_rows && !has_cols); + assert(p == PARTITION_SPLIT || p == PARTITION_VERT); + assert(bsize > BLOCK_8X8); + aom_cdf_prob cdf[2]; + partition_gather_horz_alike(cdf, ec_ctx->partition_cdf[ctx], bsize); + aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2); + } +} + +static AOM_INLINE void write_modes_sb( + AV1_COMP *const cpi, const TileInfo *const tile, aom_writer *const w, + const TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + assert(bsize < BLOCK_SIZES_ALL); + const int hbs = mi_size_wide[bsize] / 2; + const int quarter_step = mi_size_wide[bsize] / 4; + int i; + const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return; + + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + int rcol0, rcol1, rrow0, rrow1; + if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, + &rcol0, &rcol1, &rrow0, &rrow1)) { + const int rstride = cm->rst_info[plane].horz_units_per_tile; + for (int rrow = rrow0; rrow < rrow1; ++rrow) { + for (int rcol = rcol0; rcol < rcol1; ++rcol) { + const int runit_idx = rcol + rrow * rstride; + const RestorationUnitInfo *rui = + &cm->rst_info[plane].unit_info[runit_idx]; + loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane, + cpi->td.counts); + } + } + } + } + + write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w); + switch (partition) { + case PARTITION_NONE: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + break; + case PARTITION_HORZ: + write_modes_b(cpi, 
tile, w, tok, tok_end, mi_row, mi_col); + if (mi_row + hbs < mi_params->mi_rows) + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + break; + case PARTITION_VERT: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + if (mi_col + hbs < mi_params->mi_cols) + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + break; + case PARTITION_SPLIT: + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs, subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col, subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs, + subsize); + break; + case PARTITION_HORZ_A: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + break; + case PARTITION_HORZ_B: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_VERT_A: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + break; + case PARTITION_VERT_B: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_HORZ_4: + for (i = 0; i < 4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= mi_params->mi_rows) break; + + write_modes_b(cpi, tile, w, tok, tok_end, this_mi_row, mi_col); + } + break; + case PARTITION_VERT_4: + for (i = 0; i < 4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= mi_params->mi_cols) break; + + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, this_mi_col); + } + break; + default: assert(0); + } + + // update partition context + update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); +} + +static AOM_INLINE void write_modes(AV1_COMP *const cpi, + const TileInfo *const tile, + aom_writer *const w, int tile_row, + int tile_col) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + const int mi_row_start = tile->mi_row_start; + const int mi_row_end = tile->mi_row_end; + const int mi_col_start = tile->mi_col_start; + const int mi_col_end = tile->mi_col_end; + const int num_planes = av1_num_planes(cm); + + av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row); + av1_init_above_context(&cm->above_contexts, num_planes, tile->tile_row, xd); + + if (cpi->common.delta_q_info.delta_q_present_flag) { + xd->current_qindex = cpi->common.quant_params.base_qindex; + if (cpi->common.delta_q_info.delta_lf_present_flag) { + av1_reset_loop_filter_delta(xd, num_planes); + } + } + + for (int mi_row = mi_row_start; mi_row < mi_row_end; + mi_row += cm->seq_params.mib_size) { + const int sb_row_in_tile = + (mi_row - tile->mi_row_start) >> cm->seq_params.mib_size_log2; + const TOKENEXTRA *tok = + cpi->tplist[tile_row][tile_col][sb_row_in_tile].start; + const TOKENEXTRA *tok_end = + tok + cpi->tplist[tile_row][tile_col][sb_row_in_tile].count; + + av1_zero_left_context(xd); + + for (int mi_col = mi_col_start; mi_col < mi_col_end; + mi_col += 
cm->seq_params.mib_size) { + cpi->td.mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col); + write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col, + cm->seq_params.sb_size); + } + assert(tok == cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop); + } +} + +static AOM_INLINE void encode_restoration_mode( + AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { + assert(!cm->features.all_lossless); + if (!cm->seq_params.enable_restoration) return; + if (cm->features.allow_intrabc) return; + const int num_planes = av1_num_planes(cm); + int all_none = 1, chroma_none = 1; + for (int p = 0; p < num_planes; ++p) { + RestorationInfo *rsi = &cm->rst_info[p]; + if (rsi->frame_restoration_type != RESTORE_NONE) { + all_none = 0; + chroma_none &= p == 0; + } + switch (rsi->frame_restoration_type) { + case RESTORE_NONE: + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, 0); + break; + case RESTORE_WIENER: + aom_wb_write_bit(wb, 1); + aom_wb_write_bit(wb, 0); + break; + case RESTORE_SGRPROJ: + aom_wb_write_bit(wb, 1); + aom_wb_write_bit(wb, 1); + break; + case RESTORE_SWITCHABLE: + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, 1); + break; + default: assert(0); + } + } + if (!all_none) { + assert(cm->seq_params.sb_size == BLOCK_64X64 || + cm->seq_params.sb_size == BLOCK_128X128); + const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64; + + RestorationInfo *rsi = &cm->rst_info[0]; + + assert(rsi->restoration_unit_size >= sb_size); + assert(RESTORATION_UNITSIZE_MAX == 256); + + if (sb_size == 64) { + aom_wb_write_bit(wb, rsi->restoration_unit_size > 64); + } + if (rsi->restoration_unit_size > 64) { + aom_wb_write_bit(wb, rsi->restoration_unit_size > 128); + } + } + + if (num_planes > 1) { + int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y); + if (s && !chroma_none) { + aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size != + cm->rst_info[0].restoration_unit_size); + assert(cm->rst_info[1].restoration_unit_size == + cm->rst_info[0].restoration_unit_size || + cm->rst_info[1].restoration_unit_size == + (cm->rst_info[0].restoration_unit_size >> s)); + assert(cm->rst_info[2].restoration_unit_size == + cm->rst_info[1].restoration_unit_size); + } else if (!s) { + assert(cm->rst_info[1].restoration_unit_size == + cm->rst_info[0].restoration_unit_size); + assert(cm->rst_info[2].restoration_unit_size == + cm->rst_info[1].restoration_unit_size); + } + } +} + +static AOM_INLINE void write_wiener_filter(int wiener_win, + const WienerInfo *wiener_info, + WienerInfo *ref_wiener_info, + aom_writer *wb) { + if (wiener_win == WIENER_WIN) + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV); + else + assert(wiener_info->vfilter[0] == 0 && + wiener_info->vfilter[WIENER_WIN - 1] == 0); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV); + if (wiener_win == WIENER_WIN) + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + 
ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV); + else + assert(wiener_info->hfilter[0] == 0 && + wiener_info->hfilter[WIENER_WIN - 1] == 0); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV); + memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info)); +} + +static AOM_INLINE void write_sgrproj_filter(const SgrprojInfo *sgrproj_info, + SgrprojInfo *ref_sgrproj_info, + aom_writer *wb) { + aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS); + const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep]; + + if (params->r[0] == 0) { + assert(sgrproj_info->xqd[0] == 0); + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + } else if (params->r[1] == 0) { + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + } else { + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + } + + memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info)); +} + +static AOM_INLINE void loop_restoration_write_sb_coeffs( + const AV1_COMMON *const cm, MACROBLOCKD *xd, const RestorationUnitInfo *rui, + aom_writer *const w, int plane, FRAME_COUNTS *counts) { + const RestorationInfo *rsi = cm->rst_info + plane; + RestorationType frame_rtype = rsi->frame_restoration_type; + if (frame_rtype == RESTORE_NONE) return; + + (void)counts; + assert(!cm->features.all_lossless); + + const int wiener_win = (plane > 0) ? 
WIENER_WIN_CHROMA : WIENER_WIN; + WienerInfo *ref_wiener_info = &xd->wiener_info[plane]; + SgrprojInfo *ref_sgrproj_info = &xd->sgrproj_info[plane]; + RestorationType unit_rtype = rui->restoration_type; + + if (frame_rtype == RESTORE_SWITCHABLE) { + aom_write_symbol(w, unit_rtype, xd->tile_ctx->switchable_restore_cdf, + RESTORE_SWITCHABLE_TYPES); +#if CONFIG_ENTROPY_STATS + ++counts->switchable_restore[unit_rtype]; +#endif + switch (unit_rtype) { + case RESTORE_WIENER: + write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w); + break; + case RESTORE_SGRPROJ: + write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w); + break; + default: assert(unit_rtype == RESTORE_NONE); break; + } + } else if (frame_rtype == RESTORE_WIENER) { + aom_write_symbol(w, unit_rtype != RESTORE_NONE, + xd->tile_ctx->wiener_restore_cdf, 2); +#if CONFIG_ENTROPY_STATS + ++counts->wiener_restore[unit_rtype != RESTORE_NONE]; +#endif + if (unit_rtype != RESTORE_NONE) { + write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w); + } + } else if (frame_rtype == RESTORE_SGRPROJ) { + aom_write_symbol(w, unit_rtype != RESTORE_NONE, + xd->tile_ctx->sgrproj_restore_cdf, 2); +#if CONFIG_ENTROPY_STATS + ++counts->sgrproj_restore[unit_rtype != RESTORE_NONE]; +#endif + if (unit_rtype != RESTORE_NONE) { + write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w); + } + } +} + +// Only write out the ref delta section if any of the elements +// will signal a delta. +static bool is_mode_ref_delta_meaningful(AV1_COMMON *cm) { + struct loopfilter *lf = &cm->lf; + if (!lf->mode_ref_delta_update) { + return 0; + } + const RefCntBuffer *buf = get_primary_ref_frame_buf(cm); + int8_t last_ref_deltas[REF_FRAMES]; + int8_t last_mode_deltas[MAX_MODE_LF_DELTAS]; + if (buf == NULL) { + av1_set_default_ref_deltas(last_ref_deltas); + av1_set_default_mode_deltas(last_mode_deltas); + } else { + memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES); + memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS); + } + for (int i = 0; i < REF_FRAMES; i++) { + if (lf->ref_deltas[i] != last_ref_deltas[i]) { + return true; + } + } + for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) { + if (lf->mode_deltas[i] != last_mode_deltas[i]) { + return true; + } + } + return false; +} + +static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { + assert(!cm->features.coded_lossless); + if (cm->features.allow_intrabc) return; + const int num_planes = av1_num_planes(cm); + struct loopfilter *lf = &cm->lf; + + // Encode the loop filter level and type + aom_wb_write_literal(wb, lf->filter_level[0], 6); + aom_wb_write_literal(wb, lf->filter_level[1], 6); + if (num_planes > 1) { + if (lf->filter_level[0] || lf->filter_level[1]) { + aom_wb_write_literal(wb, lf->filter_level_u, 6); + aom_wb_write_literal(wb, lf->filter_level_v, 6); + } + } + aom_wb_write_literal(wb, lf->sharpness_level, 3); + + aom_wb_write_bit(wb, lf->mode_ref_delta_enabled); + + // Write out loop filter deltas applied at the MB level based on mode or + // ref frame (if they are enabled), only if there is information to write. 
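+  // Hypothetical example: if only lf->ref_deltas[LAST_FRAME] differs from
+  // the inherited value, the loops below write one "changed" bit plus a
+  // 6-bit inverse-signed literal for that entry, and a single zero bit for
+  // every unchanged ref/mode delta.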
+ int meaningful = is_mode_ref_delta_meaningful(cm); + aom_wb_write_bit(wb, meaningful); + if (!meaningful) { + return; + } + + const RefCntBuffer *buf = get_primary_ref_frame_buf(cm); + int8_t last_ref_deltas[REF_FRAMES]; + int8_t last_mode_deltas[MAX_MODE_LF_DELTAS]; + if (buf == NULL) { + av1_set_default_ref_deltas(last_ref_deltas); + av1_set_default_mode_deltas(last_mode_deltas); + } else { + memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES); + memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS); + } + for (int i = 0; i < REF_FRAMES; i++) { + const int delta = lf->ref_deltas[i]; + const int changed = delta != last_ref_deltas[i]; + aom_wb_write_bit(wb, changed); + if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6); + } + for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) { + const int delta = lf->mode_deltas[i]; + const int changed = delta != last_mode_deltas[i]; + aom_wb_write_bit(wb, changed); + if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6); + } +} + +static AOM_INLINE void encode_cdef(const AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { + assert(!cm->features.coded_lossless); + if (!cm->seq_params.enable_cdef) return; + if (cm->features.allow_intrabc) return; + const int num_planes = av1_num_planes(cm); + int i; + aom_wb_write_literal(wb, cm->cdef_info.cdef_damping - 3, 2); + aom_wb_write_literal(wb, cm->cdef_info.cdef_bits, 2); + for (i = 0; i < cm->cdef_info.nb_cdef_strengths; i++) { + aom_wb_write_literal(wb, cm->cdef_info.cdef_strengths[i], + CDEF_STRENGTH_BITS); + if (num_planes > 1) + aom_wb_write_literal(wb, cm->cdef_info.cdef_uv_strengths[i], + CDEF_STRENGTH_BITS); + } +} + +static AOM_INLINE void write_delta_q(struct aom_write_bit_buffer *wb, + int delta_q) { + if (delta_q != 0) { + aom_wb_write_bit(wb, 1); + aom_wb_write_inv_signed_literal(wb, delta_q, 6); + } else { + aom_wb_write_bit(wb, 0); + } +} + +static AOM_INLINE void encode_quantization( + const CommonQuantParams *const quant_params, int num_planes, + bool separate_uv_delta_q, struct aom_write_bit_buffer *wb) { + aom_wb_write_literal(wb, quant_params->base_qindex, QINDEX_BITS); + write_delta_q(wb, quant_params->y_dc_delta_q); + if (num_planes > 1) { + int diff_uv_delta = + (quant_params->u_dc_delta_q != quant_params->v_dc_delta_q) || + (quant_params->u_ac_delta_q != quant_params->v_ac_delta_q); + if (separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta); + write_delta_q(wb, quant_params->u_dc_delta_q); + write_delta_q(wb, quant_params->u_ac_delta_q); + if (diff_uv_delta) { + write_delta_q(wb, quant_params->v_dc_delta_q); + write_delta_q(wb, quant_params->v_ac_delta_q); + } + } + aom_wb_write_bit(wb, quant_params->using_qmatrix); + if (quant_params->using_qmatrix) { + aom_wb_write_literal(wb, quant_params->qmatrix_level_y, QM_LEVEL_BITS); + aom_wb_write_literal(wb, quant_params->qmatrix_level_u, QM_LEVEL_BITS); + if (!separate_uv_delta_q) + assert(quant_params->qmatrix_level_u == quant_params->qmatrix_level_v); + else + aom_wb_write_literal(wb, quant_params->qmatrix_level_v, QM_LEVEL_BITS); + } +} + +static AOM_INLINE void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd, + struct aom_write_bit_buffer *wb) { + int i, j; + struct segmentation *seg = &cm->seg; + + aom_wb_write_bit(wb, seg->enabled); + if (!seg->enabled) return; + + // Write update flags + if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { + assert(seg->update_map == 1); + seg->temporal_update = 0; + assert(seg->update_data == 1); + } else { + aom_wb_write_bit(wb, seg->update_map); + if (seg->update_map) 
{
+      // Select the coding strategy (temporal or spatial)
+      av1_choose_segmap_coding_method(cm, xd);
+      aom_wb_write_bit(wb, seg->temporal_update);
+    }
+    aom_wb_write_bit(wb, seg->update_data);
+  }
+
+  // Segmentation data
+  if (seg->update_data) {
+    for (i = 0; i < MAX_SEGMENTS; i++) {
+      for (j = 0; j < SEG_LVL_MAX; j++) {
+        const int active = segfeature_active(seg, i, j);
+        aom_wb_write_bit(wb, active);
+        if (active) {
+          const int data_max = av1_seg_feature_data_max(j);
+          const int data_min = -data_max;
+          const int ubits = get_unsigned_bits(data_max);
+          const int data = clamp(get_segdata(seg, i, j), data_min, data_max);
+
+          if (av1_is_segfeature_signed(j)) {
+            aom_wb_write_inv_signed_literal(wb, data, ubits);
+          } else {
+            aom_wb_write_literal(wb, data, ubits);
+          }
+        }
+      }
+    }
+  }
+}
+
+static AOM_INLINE void write_frame_interp_filter(
+    InterpFilter filter, struct aom_write_bit_buffer *wb) {
+  aom_wb_write_bit(wb, filter == SWITCHABLE);
+  if (filter != SWITCHABLE)
+    aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS);
+}
+
+// Same function as write_uniform but writing to the uncompressed header wb
+static AOM_INLINE void wb_write_uniform(struct aom_write_bit_buffer *wb, int n,
+                                        int v) {
+  const int l = get_unsigned_bits(n);
+  const int m = (1 << l) - n;
+  if (l == 0) return;
+  if (v < m) {
+    aom_wb_write_literal(wb, v, l - 1);
+  } else {
+    aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1);
+    aom_wb_write_literal(wb, (v - m) & 1, 1);
+  }
+}
+
+static AOM_INLINE void write_tile_info_max_tile(
+    const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
+  int width_mi =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params.mib_size_log2);
+  int height_mi =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params.mib_size_log2);
+  int width_sb = width_mi >> cm->seq_params.mib_size_log2;
+  int height_sb = height_mi >> cm->seq_params.mib_size_log2;
+  int size_sb, i;
+  const CommonTileParams *const tiles = &cm->tiles;
+
+  aom_wb_write_bit(wb, tiles->uniform_spacing);
+
+  if (tiles->uniform_spacing) {
+    int ones = tiles->log2_cols - tiles->min_log2_cols;
+    while (ones--) {
+      aom_wb_write_bit(wb, 1);
+    }
+    if (tiles->log2_cols < tiles->max_log2_cols) {
+      aom_wb_write_bit(wb, 0);
+    }
+
+    // rows
+    ones = tiles->log2_rows - tiles->min_log2_rows;
+    while (ones--) {
+      aom_wb_write_bit(wb, 1);
+    }
+    if (tiles->log2_rows < tiles->max_log2_rows) {
+      aom_wb_write_bit(wb, 0);
+    }
+  } else {
+    // Explicit tiles with configurable tile widths and heights
+    // columns
+    for (i = 0; i < tiles->cols; i++) {
+      size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
+      wb_write_uniform(wb, AOMMIN(width_sb, tiles->max_width_sb), size_sb - 1);
+      width_sb -= size_sb;
+    }
+    assert(width_sb == 0);
+
+    // rows
+    for (i = 0; i < tiles->rows; i++) {
+      size_sb = tiles->row_start_sb[i + 1] - tiles->row_start_sb[i];
+      wb_write_uniform(wb, AOMMIN(height_sb, tiles->max_height_sb),
+                       size_sb - 1);
+      height_sb -= size_sb;
+    }
+    assert(height_sb == 0);
+  }
+}
+
+static AOM_INLINE void write_tile_info(const AV1_COMMON *const cm,
+                                       struct aom_write_bit_buffer *saved_wb,
+                                       struct aom_write_bit_buffer *wb) {
+  write_tile_info_max_tile(cm, wb);
+
+  *saved_wb = *wb;
+  if (cm->tiles.rows * cm->tiles.cols > 1) {
+    // tile id used for cdf update
+    aom_wb_write_literal(wb, 0, cm->tiles.log2_cols + cm->tiles.log2_rows);
+    // Number of bytes in tile size - 1
+    aom_wb_write_literal(wb, 3, 2);
+  }
+}
+
+static AOM_INLINE void write_ext_tile_info(
+    const AV1_COMMON *const cm, struct aom_write_bit_buffer *saved_wb,
+    struct aom_write_bit_buffer *wb) {
+  // This information is stored as a separate byte.
+  int mod = wb->bit_offset % CHAR_BIT;
+  if (mod > 0) aom_wb_write_literal(wb, 0, CHAR_BIT - mod);
+  assert(aom_wb_is_byte_aligned(wb));
+
+  *saved_wb = *wb;
+  if (cm->tiles.rows * cm->tiles.cols > 1) {
+    // Note that the last item in the uncompressed header is the data
+    // describing tile configuration.
+    // Number of bytes in tile column size - 1
+    aom_wb_write_literal(wb, 0, 2);
+    // Number of bytes in tile size - 1
+    aom_wb_write_literal(wb, 0, 2);
+  }
+}
+
+// Stores the location and size of a tile's data in the bitstream. Used for
+// later identifying identical tiles
+typedef struct TileBufferEnc {
+  uint8_t *data;
+  size_t size;
+} TileBufferEnc;
+
+static INLINE int find_identical_tile(
+    const int tile_row, const int tile_col,
+    TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) {
+  const MV32 candidate_offset[1] = { { 1, 0 } };
+  const uint8_t *const cur_tile_data =
+      tile_buffers[tile_row][tile_col].data + 4;
+  const size_t cur_tile_size = tile_buffers[tile_row][tile_col].size;
+
+  int i;
+
+  if (tile_row == 0) return 0;
+
+  // TODO(yunqingwang): For now, only the above tile is checked and used.
+  // More candidates, such as the left tile, can be added later.
+  for (i = 0; i < 1; i++) {
+    int row_offset = candidate_offset[0].row;
+    int col_offset = candidate_offset[0].col;
+    int row = tile_row - row_offset;
+    int col = tile_col - col_offset;
+    const uint8_t *tile_data;
+    TileBufferEnc *candidate;
+
+    if (row < 0 || col < 0) continue;
+
+    const uint32_t tile_hdr = mem_get_le32(tile_buffers[row][col].data);
+
+    // Read out tile-copy-mode bit:
+    if ((tile_hdr >> 31) == 1) {
+      // The candidate is a copy tile itself: the offset is stored in bits
+      // 30 through 24 inclusive.
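+      // Hypothetical example: tile_hdr == 0x83000040 has bit 31 set, so the
+      // candidate is itself a copy tile, and (tile_hdr >> 24) & 0x7f == 3
+      // redirects the search three tile rows further up.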
+      row_offset += (tile_hdr >> 24) & 0x7f;
+      row = tile_row - row_offset;
+    }
+
+    candidate = &tile_buffers[row][col];
+
+    if (row_offset >= 128 || candidate->size != cur_tile_size) continue;
+
+    tile_data = candidate->data + 4;
+
+    if (memcmp(tile_data, cur_tile_data, cur_tile_size) != 0) continue;
+
+    // Identical tile found
+    assert(row_offset > 0);
+    return row_offset;
+  }
+
+  // No identical tile found
+  return 0;
+}
+
+static AOM_INLINE void write_render_size(const AV1_COMMON *cm,
+                                         struct aom_write_bit_buffer *wb) {
+  const int scaling_active = av1_resize_scaled(cm);
+  aom_wb_write_bit(wb, scaling_active);
+  if (scaling_active) {
+    aom_wb_write_literal(wb, cm->render_width - 1, 16);
+    aom_wb_write_literal(wb, cm->render_height - 1, 16);
+  }
+}
+
+static AOM_INLINE void write_superres_scale(const AV1_COMMON *const cm,
+                                            struct aom_write_bit_buffer *wb) {
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  if (!seq_params->enable_superres) {
+    assert(cm->superres_scale_denominator == SCALE_NUMERATOR);
+    return;
+  }
+
+  // First bit is whether to scale or not
+  if (cm->superres_scale_denominator == SCALE_NUMERATOR) {
+    aom_wb_write_bit(wb, 0);  // no scaling
+  } else {
+    aom_wb_write_bit(wb, 1);  // scaling, write scale factor
+    assert(cm->superres_scale_denominator >= SUPERRES_SCALE_DENOMINATOR_MIN);
+    assert(cm->superres_scale_denominator <
+           SUPERRES_SCALE_DENOMINATOR_MIN + (1 << SUPERRES_SCALE_BITS));
+    aom_wb_write_literal(
+        wb, cm->superres_scale_denominator - SUPERRES_SCALE_DENOMINATOR_MIN,
+        SUPERRES_SCALE_BITS);
+  }
+}
+
+static AOM_INLINE void write_frame_size(const AV1_COMMON *cm,
+                                        int frame_size_override,
+                                        struct aom_write_bit_buffer *wb) {
+  const int coded_width = cm->superres_upscaled_width - 1;
+  const int coded_height = cm->superres_upscaled_height - 1;
+
+  if (frame_size_override) {
+    const SequenceHeader *seq_params = &cm->seq_params;
+    int num_bits_width = seq_params->num_bits_width;
+    int num_bits_height = seq_params->num_bits_height;
+    aom_wb_write_literal(wb, coded_width, num_bits_width);
+    aom_wb_write_literal(wb, coded_height, num_bits_height);
+  }
+
+  write_superres_scale(cm, wb);
+  write_render_size(cm, wb);
+}
+
+static AOM_INLINE void write_frame_size_with_refs(
+    const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
+  int found = 0;
+
+  MV_REFERENCE_FRAME ref_frame;
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame);
+
+    if (cfg != NULL) {
+      found = cm->superres_upscaled_width == cfg->y_crop_width &&
+              cm->superres_upscaled_height == cfg->y_crop_height;
+      found &= cm->render_width == cfg->render_width &&
+               cm->render_height == cfg->render_height;
+    }
+    aom_wb_write_bit(wb, found);
+    if (found) {
+      write_superres_scale(cm, wb);
+      break;
+    }
+  }
+
+  if (!found) {
+    int frame_size_override = 1;  // Always equal to 1 in this function
+    write_frame_size(cm, frame_size_override, wb);
+  }
+}
+
+static AOM_INLINE void write_profile(BITSTREAM_PROFILE profile,
+                                     struct aom_write_bit_buffer *wb) {
+  assert(profile >= PROFILE_0 && profile < MAX_PROFILES);
+  aom_wb_write_literal(wb, profile, PROFILE_BITS);
+}
+
+static AOM_INLINE void write_bitdepth(const SequenceHeader *const seq_params,
+                                      struct aom_write_bit_buffer *wb) {
+  // Profile 0/1: [0] for 8-bit, [1] for 10-bit
+  // Profile 2: [0] for 8-bit, [10] for 10-bit, [11] for 12-bit
+  aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_8 ?
0 : 1); + if (seq_params->profile == PROFILE_2 && seq_params->bit_depth != AOM_BITS_8) { + aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_10 ? 0 : 1); + } +} + +static AOM_INLINE void write_color_config( + const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { + write_bitdepth(seq_params, wb); + const int is_monochrome = seq_params->monochrome; + // monochrome bit + if (seq_params->profile != PROFILE_1) + aom_wb_write_bit(wb, is_monochrome); + else + assert(!is_monochrome); + if (seq_params->color_primaries == AOM_CICP_CP_UNSPECIFIED && + seq_params->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED && + seq_params->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) { + aom_wb_write_bit(wb, 0); // No color description present + } else { + aom_wb_write_bit(wb, 1); // Color description present + aom_wb_write_literal(wb, seq_params->color_primaries, 8); + aom_wb_write_literal(wb, seq_params->transfer_characteristics, 8); + aom_wb_write_literal(wb, seq_params->matrix_coefficients, 8); + } + if (is_monochrome) { + // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] + aom_wb_write_bit(wb, seq_params->color_range); + return; + } + if (seq_params->color_primaries == AOM_CICP_CP_BT_709 && + seq_params->transfer_characteristics == AOM_CICP_TC_SRGB && + seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); + assert(seq_params->profile == PROFILE_1 || + (seq_params->profile == PROFILE_2 && + seq_params->bit_depth == AOM_BITS_12)); + } else { + // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] + aom_wb_write_bit(wb, seq_params->color_range); + if (seq_params->profile == PROFILE_0) { + // 420 only + assert(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1); + } else if (seq_params->profile == PROFILE_1) { + // 444 only + assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); + } else if (seq_params->profile == PROFILE_2) { + if (seq_params->bit_depth == AOM_BITS_12) { + // 420, 444 or 422 + aom_wb_write_bit(wb, seq_params->subsampling_x); + if (seq_params->subsampling_x == 0) { + assert(seq_params->subsampling_y == 0 && + "4:4:0 subsampling not allowed in AV1"); + } else { + aom_wb_write_bit(wb, seq_params->subsampling_y); + } + } else { + // 422 only + assert(seq_params->subsampling_x == 1 && + seq_params->subsampling_y == 0); + } + } + if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); + } + if (seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) { + aom_wb_write_literal(wb, seq_params->chroma_sample_position, 2); + } + } + aom_wb_write_bit(wb, seq_params->separate_uv_delta_q); +} + +static AOM_INLINE void write_timing_info_header( + const aom_timing_info_t *const timing_info, + struct aom_write_bit_buffer *wb) { + aom_wb_write_unsigned_literal(wb, timing_info->num_units_in_display_tick, 32); + aom_wb_write_unsigned_literal(wb, timing_info->time_scale, 32); + aom_wb_write_bit(wb, timing_info->equal_picture_interval); + if (timing_info->equal_picture_interval) { + aom_wb_write_uvlc(wb, timing_info->num_ticks_per_picture - 1); + } +} + +static AOM_INLINE void write_decoder_model_info( + const aom_dec_model_info_t *const decoder_model_info, + struct aom_write_bit_buffer *wb) { + aom_wb_write_literal( + wb, decoder_model_info->encoder_decoder_buffer_delay_length - 1, 5); + aom_wb_write_unsigned_literal( + wb, decoder_model_info->num_units_in_decoding_tick, 32); + 
aom_wb_write_literal(wb, decoder_model_info->buffer_removal_time_length - 1, + 5); + aom_wb_write_literal( + wb, decoder_model_info->frame_presentation_time_length - 1, 5); +} + +static AOM_INLINE void write_dec_model_op_parameters( + const aom_dec_model_op_parameters_t *op_params, int buffer_delay_length, + struct aom_write_bit_buffer *wb) { + aom_wb_write_unsigned_literal(wb, op_params->decoder_buffer_delay, + buffer_delay_length); + aom_wb_write_unsigned_literal(wb, op_params->encoder_buffer_delay, + buffer_delay_length); + aom_wb_write_bit(wb, op_params->low_delay_mode_flag); +} + +static AOM_INLINE void write_tu_pts_info(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + aom_wb_write_unsigned_literal( + wb, cm->frame_presentation_time, + cm->seq_params.decoder_model_info.frame_presentation_time_length); +} + +static AOM_INLINE void write_film_grain_params( + const AV1_COMP *const cpi, struct aom_write_bit_buffer *wb) { + const AV1_COMMON *const cm = &cpi->common; + const aom_film_grain_t *const pars = &cm->cur_frame->film_grain_params; + + aom_wb_write_bit(wb, pars->apply_grain); + if (!pars->apply_grain) return; + + aom_wb_write_literal(wb, pars->random_seed, 16); + + if (cm->current_frame.frame_type == INTER_FRAME) + aom_wb_write_bit(wb, pars->update_parameters); + + if (!pars->update_parameters) { + int ref_frame, ref_idx; + for (ref_frame = LAST_FRAME; ref_frame < REF_FRAMES; ref_frame++) { + ref_idx = get_ref_frame_map_idx(cm, ref_frame); + assert(ref_idx != INVALID_IDX); + const RefCntBuffer *const buf = cm->ref_frame_map[ref_idx]; + if (buf->film_grain_params_present && + av1_check_grain_params_equiv(pars, &buf->film_grain_params)) { + break; + } + } + assert(ref_frame < REF_FRAMES); + aom_wb_write_literal(wb, ref_idx, 3); + return; + } + + // Scaling function parameters + aom_wb_write_literal(wb, pars->num_y_points, 4); // max 14 + for (int i = 0; i < pars->num_y_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_y[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8); + } + + if (!cm->seq_params.monochrome) { + aom_wb_write_bit(wb, pars->chroma_scaling_from_luma); + } else { + assert(!pars->chroma_scaling_from_luma); + } + + if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma || + ((cm->seq_params.subsampling_x == 1) && + (cm->seq_params.subsampling_y == 1) && (pars->num_y_points == 0))) { + assert(pars->num_cb_points == 0 && pars->num_cr_points == 0); + } else { + aom_wb_write_literal(wb, pars->num_cb_points, 4); // max 10 + for (int i = 0; i < pars->num_cb_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_cb[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_cb[i][1], 8); + } + + aom_wb_write_literal(wb, pars->num_cr_points, 4); // max 10 + for (int i = 0; i < pars->num_cr_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_cr[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_cr[i][1], 8); + } + } + + aom_wb_write_literal(wb, pars->scaling_shift - 8, 2); // 8 + value + + // AR coefficients + // Only sent if the corresponding scaling function has + // more than 0 points + + aom_wb_write_literal(wb, pars->ar_coeff_lag, 2); + + int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (pars->num_y_points > 0) ++num_pos_chroma; + + if (pars->num_y_points) + for (int i = 0; i < num_pos_luma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_y[i] + 128, 8); + + if (pars->num_cb_points || pars->chroma_scaling_from_luma) + for (int i = 0;
i < num_pos_chroma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_cb[i] + 128, 8); + + if (pars->num_cr_points || pars->chroma_scaling_from_luma) + for (int i = 0; i < num_pos_chroma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_cr[i] + 128, 8); + + aom_wb_write_literal(wb, pars->ar_coeff_shift - 6, 2); // 6 + value + + aom_wb_write_literal(wb, pars->grain_scale_shift, 2); + + if (pars->num_cb_points) { + aom_wb_write_literal(wb, pars->cb_mult, 8); + aom_wb_write_literal(wb, pars->cb_luma_mult, 8); + aom_wb_write_literal(wb, pars->cb_offset, 9); + } + + if (pars->num_cr_points) { + aom_wb_write_literal(wb, pars->cr_mult, 8); + aom_wb_write_literal(wb, pars->cr_luma_mult, 8); + aom_wb_write_literal(wb, pars->cr_offset, 9); + } + + aom_wb_write_bit(wb, pars->overlap_flag); + + aom_wb_write_bit(wb, pars->clip_to_restricted_range); +} + +static AOM_INLINE void write_sb_size(const SequenceHeader *const seq_params, + struct aom_write_bit_buffer *wb) { + (void)seq_params; + (void)wb; + assert(seq_params->mib_size == mi_size_wide[seq_params->sb_size]); + assert(seq_params->mib_size == 1 << seq_params->mib_size_log2); + assert(seq_params->sb_size == BLOCK_128X128 || + seq_params->sb_size == BLOCK_64X64); + aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0); +} + +static AOM_INLINE void write_sequence_header( + const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { + aom_wb_write_literal(wb, seq_params->num_bits_width - 1, 4); + aom_wb_write_literal(wb, seq_params->num_bits_height - 1, 4); + aom_wb_write_literal(wb, seq_params->max_frame_width - 1, + seq_params->num_bits_width); + aom_wb_write_literal(wb, seq_params->max_frame_height - 1, + seq_params->num_bits_height); + + if (!seq_params->reduced_still_picture_hdr) { + aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag); + if (seq_params->frame_id_numbers_present_flag) { + // We must always have delta_frame_id_length < frame_id_length, + // in order for a frame to be referenced with a unique delta. + // Avoid wasting bits by using a coding that enforces this restriction.
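+      // Example: delta_frame_id_length = 14 is coded as 14 - 2 = 12 in 4 bits,
+      // and frame_id_length = 15 as 15 - 14 - 1 = 0 in 3 bits, so any pair a
+      // decoder can parse already satisfies the restriction.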
+ aom_wb_write_literal(wb, seq_params->delta_frame_id_length - 2, 4); + aom_wb_write_literal( + wb, + seq_params->frame_id_length - seq_params->delta_frame_id_length - 1, + 3); + } + } + + write_sb_size(seq_params, wb); + + aom_wb_write_bit(wb, seq_params->enable_filter_intra); + aom_wb_write_bit(wb, seq_params->enable_intra_edge_filter); + + if (!seq_params->reduced_still_picture_hdr) { + aom_wb_write_bit(wb, seq_params->enable_interintra_compound); + aom_wb_write_bit(wb, seq_params->enable_masked_compound); + aom_wb_write_bit(wb, seq_params->enable_warped_motion); + aom_wb_write_bit(wb, seq_params->enable_dual_filter); + + aom_wb_write_bit(wb, seq_params->order_hint_info.enable_order_hint); + + if (seq_params->order_hint_info.enable_order_hint) { + aom_wb_write_bit(wb, seq_params->order_hint_info.enable_dist_wtd_comp); + aom_wb_write_bit(wb, seq_params->order_hint_info.enable_ref_frame_mvs); + } + if (seq_params->force_screen_content_tools == 2) { + aom_wb_write_bit(wb, 1); + } else { + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, seq_params->force_screen_content_tools); + } + if (seq_params->force_screen_content_tools > 0) { + if (seq_params->force_integer_mv == 2) { + aom_wb_write_bit(wb, 1); + } else { + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, seq_params->force_integer_mv); + } + } else { + assert(seq_params->force_integer_mv == 2); + } + if (seq_params->order_hint_info.enable_order_hint) + aom_wb_write_literal( + wb, seq_params->order_hint_info.order_hint_bits_minus_1, 3); + } + + aom_wb_write_bit(wb, seq_params->enable_superres); + aom_wb_write_bit(wb, seq_params->enable_cdef); + aom_wb_write_bit(wb, seq_params->enable_restoration); +} + +static AOM_INLINE void write_global_motion_params( + const WarpedMotionParams *params, const WarpedMotionParams *ref_params, + struct aom_write_bit_buffer *wb, int allow_hp) { + const TransformationType type = params->wmtype; + + aom_wb_write_bit(wb, type != IDENTITY); + if (type != IDENTITY) { + aom_wb_write_bit(wb, type == ROTZOOM); + if (type != ROTZOOM) aom_wb_write_bit(wb, type == TRANSLATION); + } + + if (type >= ROTZOOM) { + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF), + (params->wmmat[3] >> GM_ALPHA_PREC_DIFF)); + } + + if (type >= AFFINE) { + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF), + (params->wmmat[4] >> GM_ALPHA_PREC_DIFF)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + } + + if (type >= TRANSLATION) { + const int trans_bits = (type == TRANSLATION) + ? GM_ABS_TRANS_ONLY_BITS - !allow_hp + : GM_ABS_TRANS_BITS; + const int trans_prec_diff = (type == TRANSLATION) + ? 
GM_TRANS_ONLY_PREC_DIFF + !allow_hp + : GM_TRANS_PREC_DIFF; + aom_wb_write_signed_primitive_refsubexpfin( + wb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[0] >> trans_prec_diff), + (params->wmmat[0] >> trans_prec_diff)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[1] >> trans_prec_diff), + (params->wmmat[1] >> trans_prec_diff)); + } +} + +static AOM_INLINE void write_global_motion(AV1_COMP *cpi, + struct aom_write_bit_buffer *wb) { + AV1_COMMON *const cm = &cpi->common; + int frame; + for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { + const WarpedMotionParams *ref_params = + cm->prev_frame ? &cm->prev_frame->global_motion[frame] + : &default_warp_params; + write_global_motion_params(&cm->global_motion[frame], ref_params, wb, + cm->features.allow_high_precision_mv); + // TODO(sarahparker, debargha): The logic in the commented out code below + // does not work currently and causes mismatches when resize is on. + // Fix it before turning the optimization back on. + /* + YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_yv12_buf(cpi, frame); + if (cpi->source->y_crop_width == ref_buf->y_crop_width && + cpi->source->y_crop_height == ref_buf->y_crop_height) { + write_global_motion_params(&cm->global_motion[frame], + &cm->prev_frame->global_motion[frame], wb, + cm->features.allow_high_precision_mv); + } else { + assert(cm->global_motion[frame].wmtype == IDENTITY && + "Invalid warp type for frames of different resolutions"); + } + */ + /* + printf("Frame %d/%d: Enc Ref %d: %d %d %d %d\n", + cm->current_frame.frame_number, cm->show_frame, frame, + cm->global_motion[frame].wmmat[0], + cm->global_motion[frame].wmmat[1], cm->global_motion[frame].wmmat[2], + cm->global_motion[frame].wmmat[3]); + */ + } +} + +static int check_frame_refs_short_signaling(AV1_COMMON *const cm) { + // Check whether all references are distinct frames. + const RefCntBuffer *seen_bufs[FRAME_BUFFERS] = { NULL }; + int num_refs = 0; + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (buf != NULL) { + int seen = 0; + for (int i = 0; i < num_refs; i++) { + if (seen_bufs[i] == buf) { + seen = 1; + break; + } + } + if (!seen) seen_bufs[num_refs++] = buf; + } + } + + // We only turn on frame_refs_short_signaling when all references are + // distinct. + if (num_refs < INTER_REFS_PER_FRAME) { + // This indicates that more than one reference frame points to the same + // reference buffer, i.e. two or more references are duplicates. + return 0; + } + + // Check whether the encoder side ref frame choices are aligned with those + // to be derived at the decoder side. + int remapped_ref_idx_decoder[REF_FRAMES]; + + const int lst_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME); + const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME); + + // Set up the frame refs mapping indexes according to the + // frame_refs_short_signaling policy. + av1_set_frame_refs(cm, remapped_ref_idx_decoder, lst_map_idx, gld_map_idx); + + // We only turn on frame_refs_short_signaling when the encoder side decision + // on ref frames is identical to that at the decoder side. + int frame_refs_short_signaling = 1; + for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ++ref_idx) { + // Compare the buffer index between two reference frames indexed + // respectively by the encoder and the decoder side decisions.
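+    // Any mismatch below means the decoder-derived mapping would differ from
+    // the encoder's choice, so short signaling must stay off for this frame.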
+ RefCntBuffer *ref_frame_buf_new = NULL; + if (remapped_ref_idx_decoder[ref_idx] != INVALID_IDX) { + ref_frame_buf_new = cm->ref_frame_map[remapped_ref_idx_decoder[ref_idx]]; + } + if (get_ref_frame_buf(cm, LAST_FRAME + ref_idx) != ref_frame_buf_new) { + frame_refs_short_signaling = 0; + break; + } + } + +#if 0 // For debug + printf("\nFrame=%d: \n", cm->current_frame.frame_number); + printf("***frame_refs_short_signaling=%d\n", frame_refs_short_signaling); + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + printf("enc_ref(map_idx=%d)=%d, vs. " + "dec_ref(map_idx=%d)=%d\n", + get_ref_frame_map_idx(cm, ref_frame), ref_frame, + cm->remapped_ref_idx[ref_frame - LAST_FRAME], + ref_frame); + } +#endif // 0 + + return frame_refs_short_signaling; +} + +// New function based on HLS R18 +static AOM_INLINE void write_uncompressed_header_obu( + AV1_COMP *cpi, struct aom_write_bit_buffer *saved_wb, + struct aom_write_bit_buffer *wb) { + AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + const CommonQuantParams *quant_params = &cm->quant_params; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + CurrentFrame *const current_frame = &cm->current_frame; + FeatureFlags *const features = &cm->features; + + current_frame->frame_refs_short_signaling = 0; + + if (seq_params->still_picture) { + assert(cm->show_existing_frame == 0); + assert(cm->show_frame == 1); + assert(current_frame->frame_type == KEY_FRAME); + } + if (!seq_params->reduced_still_picture_hdr) { + if (encode_show_existing_frame(cm)) { + aom_wb_write_bit(wb, 1); // show_existing_frame + aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3); + + if (seq_params->decoder_model_info_present_flag && + seq_params->timing_info.equal_picture_interval == 0) { + write_tu_pts_info(cm, wb); + } + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_len = seq_params->frame_id_length; + int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; + aom_wb_write_literal(wb, display_frame_id, frame_id_len); + } + return; + } else { + aom_wb_write_bit(wb, 0); // show_existing_frame + } + + aom_wb_write_literal(wb, current_frame->frame_type, 2); + + aom_wb_write_bit(wb, cm->show_frame); + if (cm->show_frame) { + if (seq_params->decoder_model_info_present_flag && + seq_params->timing_info.equal_picture_interval == 0) + write_tu_pts_info(cm, wb); + } else { + aom_wb_write_bit(wb, cm->showable_frame); + } + if (frame_is_sframe(cm)) { + assert(features->error_resilient_mode); + } else if (!(current_frame->frame_type == KEY_FRAME && cm->show_frame)) { + aom_wb_write_bit(wb, features->error_resilient_mode); + } + } + aom_wb_write_bit(wb, features->disable_cdf_update); + + if (seq_params->force_screen_content_tools == 2) { + aom_wb_write_bit(wb, features->allow_screen_content_tools); + } else { + assert(features->allow_screen_content_tools == + seq_params->force_screen_content_tools); + } + + if (features->allow_screen_content_tools) { + if (seq_params->force_integer_mv == 2) { + aom_wb_write_bit(wb, features->cur_frame_force_integer_mv); + } else { + assert(features->cur_frame_force_integer_mv == + seq_params->force_integer_mv); + } + } else { + assert(features->cur_frame_force_integer_mv == 0); + } + + int frame_size_override_flag = 0; + + if (seq_params->reduced_still_picture_hdr) { + assert(cm->superres_upscaled_width == seq_params->max_frame_width && + cm->superres_upscaled_height == seq_params->max_frame_height); + } else { + if 
(seq_params->frame_id_numbers_present_flag) { + int frame_id_len = seq_params->frame_id_length; + aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len); + } + + if (cm->superres_upscaled_width > seq_params->max_frame_width || + cm->superres_upscaled_height > seq_params->max_frame_height) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Frame dimensions are larger than the maximum values"); + } + + frame_size_override_flag = + frame_is_sframe(cm) + ? 1 + : (cm->superres_upscaled_width != seq_params->max_frame_width || + cm->superres_upscaled_height != seq_params->max_frame_height); + if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag); + + if (seq_params->order_hint_info.enable_order_hint) + aom_wb_write_literal( + wb, current_frame->order_hint, + seq_params->order_hint_info.order_hint_bits_minus_1 + 1); + + if (!features->error_resilient_mode && !frame_is_intra_only(cm)) { + aom_wb_write_literal(wb, features->primary_ref_frame, PRIMARY_REF_BITS); + } + } + + if (seq_params->decoder_model_info_present_flag) { + aom_wb_write_bit(wb, cm->buffer_removal_time_present); + if (cm->buffer_removal_time_present) { + for (int op_num = 0; + op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) { + if (seq_params->op_params[op_num].decoder_model_param_present_flag) { + if (((seq_params->operating_point_idc[op_num] >> + cm->temporal_layer_id) & + 0x1 && + (seq_params->operating_point_idc[op_num] >> + (cm->spatial_layer_id + 8)) & + 0x1) || + seq_params->operating_point_idc[op_num] == 0) { + aom_wb_write_unsigned_literal( + wb, cm->buffer_removal_times[op_num], + seq_params->decoder_model_info.buffer_removal_time_length); + cm->buffer_removal_times[op_num]++; + if (cm->buffer_removal_times[op_num] == 0) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "buffer_removal_time overflowed"); + } + } + } + } + } + } + + // Shown keyframes and switch-frames automatically refresh all reference + // frames. For all other frame types, we need to write refresh_frame_flags. + if ((current_frame->frame_type == KEY_FRAME && !cm->show_frame) || + current_frame->frame_type == INTER_FRAME || + current_frame->frame_type == INTRA_ONLY_FRAME) + aom_wb_write_literal(wb, current_frame->refresh_frame_flags, REF_FRAMES); + + if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xff) { + // Write all ref frame order hints if error_resilient_mode == 1 + if (features->error_resilient_mode && + seq_params->order_hint_info.enable_order_hint) { + for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { + aom_wb_write_literal( + wb, cm->ref_frame_map[ref_idx]->order_hint, + seq_params->order_hint_info.order_hint_bits_minus_1 + 1); + } + } + } + + if (current_frame->frame_type == KEY_FRAME) { + write_frame_size(cm, frame_size_override_flag, wb); + assert(!av1_superres_scaled(cm) || !features->allow_intrabc); + if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) + aom_wb_write_bit(wb, features->allow_intrabc); + } else { + if (current_frame->frame_type == INTRA_ONLY_FRAME) { + write_frame_size(cm, frame_size_override_flag, wb); + assert(!av1_superres_scaled(cm) || !features->allow_intrabc); + if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) + aom_wb_write_bit(wb, features->allow_intrabc); + } else if (current_frame->frame_type == INTER_FRAME || + frame_is_sframe(cm)) { + MV_REFERENCE_FRAME ref_frame; + + // NOTE: Error resilient mode turns off frame_refs_short_signaling + // automatically.
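+      // The short-signaling path is compiled out by default; set the macro
+      // below to 1 to re-enable the experiment.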
+#define FRAME_REFS_SHORT_SIGNALING 0 +#if FRAME_REFS_SHORT_SIGNALING + current_frame->frame_refs_short_signaling = + seq_params->order_hint_info.enable_order_hint; +#endif // FRAME_REFS_SHORT_SIGNALING + + if (current_frame->frame_refs_short_signaling) { + // NOTE(zoeliu@google.com): + // An example solution for encoder-side implementation on frame refs + // short signaling, which is only turned on when the encoder side + // decision on ref frames is identical to that at the decoder side. + current_frame->frame_refs_short_signaling = + check_frame_refs_short_signaling(cm); + } + + if (seq_params->order_hint_info.enable_order_hint) + aom_wb_write_bit(wb, current_frame->frame_refs_short_signaling); + + if (current_frame->frame_refs_short_signaling) { + const int lst_ref = get_ref_frame_map_idx(cm, LAST_FRAME); + aom_wb_write_literal(wb, lst_ref, REF_FRAMES_LOG2); + + const int gld_ref = get_ref_frame_map_idx(cm, GOLDEN_FRAME); + aom_wb_write_literal(wb, gld_ref, REF_FRAMES_LOG2); + } + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + assert(get_ref_frame_map_idx(cm, ref_frame) != INVALID_IDX); + if (!current_frame->frame_refs_short_signaling) + aom_wb_write_literal(wb, get_ref_frame_map_idx(cm, ref_frame), + REF_FRAMES_LOG2); + if (seq_params->frame_id_numbers_present_flag) { + int i = get_ref_frame_map_idx(cm, ref_frame); + int frame_id_len = seq_params->frame_id_length; + int diff_len = seq_params->delta_frame_id_length; + int delta_frame_id_minus_1 = + ((cm->current_frame_id - cm->ref_frame_id[i] + + (1 << frame_id_len)) % + (1 << frame_id_len)) - + 1; + if (delta_frame_id_minus_1 < 0 || + delta_frame_id_minus_1 >= (1 << diff_len)) { + aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, + "Invalid delta_frame_id_minus_1"); + } + aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len); + } + } + + if (!features->error_resilient_mode && frame_size_override_flag) { + write_frame_size_with_refs(cm, wb); + } else { + write_frame_size(cm, frame_size_override_flag, wb); + } + + if (!features->cur_frame_force_integer_mv) + aom_wb_write_bit(wb, features->allow_high_precision_mv); + write_frame_interp_filter(features->interp_filter, wb); + aom_wb_write_bit(wb, features->switchable_motion_mode); + if (frame_might_allow_ref_frame_mvs(cm)) { + aom_wb_write_bit(wb, features->allow_ref_frame_mvs); + } else { + assert(features->allow_ref_frame_mvs == 0); + } + } + } + + const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) && + !(features->disable_cdf_update); + if (cm->tiles.large_scale) + assert(features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED); + + if (might_bwd_adapt) { + aom_wb_write_bit( + wb, features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED); + } + + write_tile_info(cm, saved_wb, wb); + encode_quantization(quant_params, av1_num_planes(cm), + cm->seq_params.separate_uv_delta_q, wb); + encode_segmentation(cm, xd, wb); + + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + if (delta_q_info->delta_q_present_flag) assert(quant_params->base_qindex > 0); + if (quant_params->base_qindex > 0) { + aom_wb_write_bit(wb, delta_q_info->delta_q_present_flag); + if (delta_q_info->delta_q_present_flag) { + aom_wb_write_literal(wb, get_msb(delta_q_info->delta_q_res), 2); + xd->current_qindex = quant_params->base_qindex; + if (features->allow_intrabc) + assert(delta_q_info->delta_lf_present_flag == 0); + else + aom_wb_write_bit(wb, delta_q_info->delta_lf_present_flag); + if (delta_q_info->delta_lf_present_flag) { + 
aom_wb_write_literal(wb, get_msb(delta_q_info->delta_lf_res), 2); + aom_wb_write_bit(wb, delta_q_info->delta_lf_multi); + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); + } + } + } + + if (features->all_lossless) { + assert(!av1_superres_scaled(cm)); + } else { + if (!features->coded_lossless) { + encode_loopfilter(cm, wb); + encode_cdef(cm, wb); + } + encode_restoration_mode(cm, wb); + } + + // Write TX mode + if (features->coded_lossless) + assert(features->tx_mode == ONLY_4X4); + else + aom_wb_write_bit(wb, features->tx_mode == TX_MODE_SELECT); + + if (!frame_is_intra_only(cm)) { + const int use_hybrid_pred = + current_frame->reference_mode == REFERENCE_MODE_SELECT; + + aom_wb_write_bit(wb, use_hybrid_pred); + } + + if (current_frame->skip_mode_info.skip_mode_allowed) + aom_wb_write_bit(wb, current_frame->skip_mode_info.skip_mode_flag); + + if (frame_might_allow_warped_motion(cm)) + aom_wb_write_bit(wb, features->allow_warped_motion); + else + assert(!features->allow_warped_motion); + + aom_wb_write_bit(wb, features->reduced_tx_set_used); + + if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb); + + if (seq_params->film_grain_params_present && + (cm->show_frame || cm->showable_frame)) + write_film_grain_params(cpi, wb); + + if (cm->tiles.large_scale) write_ext_tile_info(cm, saved_wb, wb); +} + +static int choose_size_bytes(uint32_t size, int spare_msbs) { + // Choose the number of bytes required to represent size, without + // using the 'spare_msbs' number of most significant bits. + + // Make sure we will fit in 4 bytes to start with. + if (spare_msbs > 0 && size >> (32 - spare_msbs) != 0) return -1; + + // Normalise to 32 bits + size <<= spare_msbs; + + if (size >> 24 != 0) + return 4; + else if (size >> 16 != 0) + return 3; + else if (size >> 8 != 0) + return 2; + else + return 1; +} + +static AOM_INLINE void mem_put_varsize(uint8_t *const dst, const int sz, + const int val) { + switch (sz) { + case 1: dst[0] = (uint8_t)(val & 0xff); break; + case 2: mem_put_le16(dst, val); break; + case 3: mem_put_le24(dst, val); break; + case 4: mem_put_le32(dst, val); break; + default: assert(0 && "Invalid size"); break; + } +} + +static int remux_tiles(const CommonTileParams *const tiles, uint8_t *dst, + const uint32_t data_size, const uint32_t max_tile_size, + const uint32_t max_tile_col_size, + int *const tile_size_bytes, + int *const tile_col_size_bytes) { + // Choose the tile size bytes (tsb) and tile column size bytes (tcsb) + int tsb; + int tcsb; + + if (tiles->large_scale) { + // The top bit in the tile size field indicates tile copy mode, so we + // have 1 less bit to code the tile size + tsb = choose_size_bytes(max_tile_size, 1); + tcsb = choose_size_bytes(max_tile_col_size, 0); + } else { + tsb = choose_size_bytes(max_tile_size, 0); + tcsb = 4; // This is ignored + (void)max_tile_col_size; + } + + assert(tsb > 0); + assert(tcsb > 0); + + *tile_size_bytes = tsb; + *tile_col_size_bytes = tcsb; + if (tsb == 4 && tcsb == 4) return data_size; + + uint32_t wpos = 0; + uint32_t rpos = 0; + + if (tiles->large_scale) { + int tile_row; + int tile_col; + + for (tile_col = 0; tile_col < tiles->cols; tile_col++) { + // Every column but the last has a column header + if (tile_col < tiles->cols - 1) { + uint32_t tile_col_size = mem_get_le32(dst + rpos); + rpos += 4; + + // Adjust the tile column size by the number of bytes removed + // from the tile size fields.
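+        // Example: with tsb = 2 and 4 tile rows, each size field shrinks from
+        // 4 to 2 bytes, so the column size drops by (4 - 2) * 4 = 8 bytes.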
+ tile_col_size -= (4 - tsb) * tiles->rows; + + mem_put_varsize(dst + wpos, tcsb, tile_col_size); + wpos += tcsb; + } + + for (tile_row = 0; tile_row < tiles->rows; tile_row++) { + // Every row, including the last, has a header + uint32_t tile_header = mem_get_le32(dst + rpos); + rpos += 4; + + // If this is a copy tile, we need to shift the MSB to the + // top bit of the new, narrower size field, and there is no data to copy. + if (tile_header >> 31 != 0) { + if (tsb < 4) tile_header >>= 32 - 8 * tsb; + mem_put_varsize(dst + wpos, tsb, tile_header); + wpos += tsb; + } else { + mem_put_varsize(dst + wpos, tsb, tile_header); + wpos += tsb; + + tile_header += AV1_MIN_TILE_SIZE_BYTES; + memmove(dst + wpos, dst + rpos, tile_header); + rpos += tile_header; + wpos += tile_header; + } + } + } + + assert(rpos > wpos); + assert(rpos == data_size); + + return wpos; + } + const int n_tiles = tiles->cols * tiles->rows; + int n; + + for (n = 0; n < n_tiles; n++) { + int tile_size; + + if (n == n_tiles - 1) { + tile_size = data_size - rpos; + } else { + tile_size = mem_get_le32(dst + rpos); + rpos += 4; + mem_put_varsize(dst + wpos, tsb, tile_size); + tile_size += AV1_MIN_TILE_SIZE_BYTES; + wpos += tsb; + } + + memmove(dst + wpos, dst + rpos, tile_size); + + rpos += tile_size; + wpos += tile_size; + } + + assert(rpos > wpos); + assert(rpos == data_size); + + return wpos; +} + +uint32_t av1_write_obu_header(AV1LevelParams *const level_params, + OBU_TYPE obu_type, int obu_extension, + uint8_t *const dst) { + if (level_params->keep_level_stats && + (obu_type == OBU_FRAME || obu_type == OBU_FRAME_HEADER)) + ++level_params->frame_header_count; + + struct aom_write_bit_buffer wb = { dst, 0 }; + uint32_t size = 0; + + aom_wb_write_literal(&wb, 0, 1); // forbidden bit. + aom_wb_write_literal(&wb, (int)obu_type, 4); + aom_wb_write_literal(&wb, obu_extension ?
1 : 0, 1); + aom_wb_write_literal(&wb, 1, 1); // obu_has_payload_length_field + aom_wb_write_literal(&wb, 0, 1); // reserved + + if (obu_extension) { + aom_wb_write_literal(&wb, obu_extension & 0xFF, 8); + } + + size = aom_wb_bytes_written(&wb); + return size; +} + +int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size, + uint8_t *dest) { + const size_t offset = obu_header_size; + size_t coded_obu_size = 0; + const uint32_t obu_size = (uint32_t)obu_payload_size; + assert(obu_size == obu_payload_size); + + if (aom_uleb_encode(obu_size, sizeof(obu_size), dest + offset, + &coded_obu_size) != 0) { + return AOM_CODEC_ERROR; + } + + return AOM_CODEC_OK; +} + +static size_t obu_memmove(size_t obu_header_size, size_t obu_payload_size, + uint8_t *data) { + const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size); + const size_t move_dst_offset = length_field_size + obu_header_size; + const size_t move_src_offset = obu_header_size; + const size_t move_size = obu_payload_size; + memmove(data + move_dst_offset, data + move_src_offset, move_size); + return length_field_size; +} + +static AOM_INLINE void add_trailing_bits(struct aom_write_bit_buffer *wb) { + if (aom_wb_is_byte_aligned(wb)) { + aom_wb_write_literal(wb, 0x80, 8); + } else { + // assumes that the other bits are already 0s + aom_wb_write_bit(wb, 1); + } +} + +static AOM_INLINE void write_bitstream_level(AV1_LEVEL seq_level_idx, + struct aom_write_bit_buffer *wb) { + assert(is_valid_seq_level_idx(seq_level_idx)); + aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS); +} + +uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params, + uint8_t *const dst) { + struct aom_write_bit_buffer wb = { dst, 0 }; + uint32_t size = 0; + + write_profile(seq_params->profile, &wb); + + // Still picture or not + aom_wb_write_bit(&wb, seq_params->still_picture); + assert(IMPLIES(!seq_params->still_picture, + !seq_params->reduced_still_picture_hdr)); + // whether to use reduced still picture header + aom_wb_write_bit(&wb, seq_params->reduced_still_picture_hdr); + + if (seq_params->reduced_still_picture_hdr) { + assert(seq_params->timing_info_present == 0); + assert(seq_params->decoder_model_info_present_flag == 0); + assert(seq_params->display_model_info_present_flag == 0); + write_bitstream_level(seq_params->seq_level_idx[0], &wb); + } else { + aom_wb_write_bit( + &wb, seq_params->timing_info_present); // timing info present flag + + if (seq_params->timing_info_present) { + // timing_info + write_timing_info_header(&seq_params->timing_info, &wb); + aom_wb_write_bit(&wb, seq_params->decoder_model_info_present_flag); + if (seq_params->decoder_model_info_present_flag) { + write_decoder_model_info(&seq_params->decoder_model_info, &wb); + } + } + aom_wb_write_bit(&wb, seq_params->display_model_info_present_flag); + aom_wb_write_literal(&wb, seq_params->operating_points_cnt_minus_1, + OP_POINTS_CNT_MINUS_1_BITS); + int i; + for (i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) { + aom_wb_write_literal(&wb, seq_params->operating_point_idc[i], + OP_POINTS_IDC_BITS); + write_bitstream_level(seq_params->seq_level_idx[i], &wb); + if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0) + aom_wb_write_bit(&wb, seq_params->tier[i]); + if (seq_params->decoder_model_info_present_flag) { + aom_wb_write_bit( + &wb, seq_params->op_params[i].decoder_model_param_present_flag); + if (seq_params->op_params[i].decoder_model_param_present_flag) { + write_dec_model_op_parameters( + &seq_params->op_params[i], + 
seq_params->decoder_model_info + .encoder_decoder_buffer_delay_length, + &wb); + } + } + if (seq_params->display_model_info_present_flag) { + aom_wb_write_bit( + &wb, seq_params->op_params[i].display_model_param_present_flag); + if (seq_params->op_params[i].display_model_param_present_flag) { + assert(seq_params->op_params[i].initial_display_delay <= 10); + aom_wb_write_literal( + &wb, seq_params->op_params[i].initial_display_delay - 1, 4); + } + } + } + } + write_sequence_header(seq_params, &wb); + + write_color_config(seq_params, &wb); + + aom_wb_write_bit(&wb, seq_params->film_grain_params_present); + + add_trailing_bits(&wb); + + size = aom_wb_bytes_written(&wb); + return size; +} + +static uint32_t write_frame_header_obu(AV1_COMP *cpi, + struct aom_write_bit_buffer *saved_wb, + uint8_t *const dst, + int append_trailing_bits) { + struct aom_write_bit_buffer wb = { dst, 0 }; + write_uncompressed_header_obu(cpi, saved_wb, &wb); + if (append_trailing_bits) add_trailing_bits(&wb); + return aom_wb_bytes_written(&wb); +} + +static uint32_t write_tile_group_header(uint8_t *const dst, int start_tile, + int end_tile, int tiles_log2, + int tile_start_and_end_present_flag) { + struct aom_write_bit_buffer wb = { dst, 0 }; + uint32_t size = 0; + + if (!tiles_log2) return size; + + aom_wb_write_bit(&wb, tile_start_and_end_present_flag); + + if (tile_start_and_end_present_flag) { + aom_wb_write_literal(&wb, start_tile, tiles_log2); + aom_wb_write_literal(&wb, end_tile, tiles_log2); + } + + size = aom_wb_bytes_written(&wb); + return size; +} + +typedef struct { + uint8_t *frame_header; + size_t obu_header_byte_offset; + size_t total_length; +} FrameHeaderInfo; + +extern void av1_print_uncompressed_frame_header(const uint8_t *data, int size, + const char *filename); + +static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, + struct aom_write_bit_buffer *saved_wb, + uint8_t obu_extension_header, + const FrameHeaderInfo *fh_info, + int *const largest_tile_id) { + AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + AV1LevelParams *const level_params = &cpi->level_params; + aom_writer mode_bc; + int tile_row, tile_col; + // Store the location and size of each tile's data in the bitstream: + TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; + uint32_t total_size = 0; + const int tile_cols = tiles->cols; + const int tile_rows = tiles->rows; + unsigned int tile_size = 0; + unsigned int max_tile_size = 0; + unsigned int max_tile_col_size = 0; + const int n_log2_tiles = tiles->log2_rows + tiles->log2_cols; + // Fixed size tile groups for the moment + const int num_tg_hdrs = cpi->num_tg; + const int tg_size = + (tiles->large_scale) + ? 1 + : (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs; + int tile_count = 0; + int curr_tg_data_size = 0; + uint8_t *data = dst; + int new_tg = 1; + const int have_tiles = tile_cols * tile_rows > 1; + int first_tg = 1; + + *largest_tile_id = 0; + + if (tiles->large_scale) { + // For large_scale_tile case, we always have only one tile group, so it can + // be written as an OBU_FRAME. + const OBU_TYPE obu_type = OBU_FRAME; + const uint32_t tg_hdr_size = + av1_write_obu_header(level_params, obu_type, 0, data); + data += tg_hdr_size; + + const uint32_t frame_header_size = + write_frame_header_obu(cpi, saved_wb, data, 0); + data += frame_header_size; + total_size += frame_header_size; + + // (yunqing) This test ensures the correctness of large scale tile coding. 
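+    // With ext_tile_debug on, the uncompressed frame header is dumped to
+    // ./fhNNN (NNN = frame number) for offline inspection.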
+ if (cpi->oxcf.ext_tile_debug) { + char fn[20] = "./fh"; + fn[4] = cm->current_frame.frame_number / 100 + '0'; + fn[5] = (cm->current_frame.frame_number % 100) / 10 + '0'; + fn[6] = (cm->current_frame.frame_number % 10) + '0'; + fn[7] = '\0'; + av1_print_uncompressed_frame_header(data - frame_header_size, + frame_header_size, fn); + } + + int tile_size_bytes = 0; + int tile_col_size_bytes = 0; + + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileInfo tile_info; + const int is_last_col = (tile_col == tile_cols - 1); + const uint32_t col_offset = total_size; + + av1_tile_set_col(&tile_info, cm, tile_col); + + // The last column does not have a column header + if (!is_last_col) total_size += 4; + + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; + const int data_offset = have_tiles ? 4 : 0; + const int tile_idx = tile_row * tile_cols + tile_col; + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; + av1_tile_set_row(&tile_info, cm, tile_row); + + buf->data = dst + total_size + tg_hdr_size; + + // If CONFIG_EXT_TILE = 1, every tile in the row has a header, + // even for the last one, unless no tiling is used at all. + total_size += data_offset; + cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; + mode_bc.allow_update_cdf = !tiles->large_scale; + mode_bc.allow_update_cdf = + mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; + aom_start_encode(&mode_bc, buf->data + data_offset); + write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col); + aom_stop_encode(&mode_bc); + tile_size = mode_bc.pos; + buf->size = tile_size; + + // Record the maximum tile size we see, so we can compact headers later. + if (tile_size > max_tile_size) { + max_tile_size = tile_size; + *largest_tile_id = tile_cols * tile_row + tile_col; + } + + if (have_tiles) { + // tile header: size of this tile, or copy offset + uint32_t tile_header = tile_size - AV1_MIN_TILE_SIZE_BYTES; + const int tile_copy_mode = + ((AOMMAX(tiles->width, tiles->height) << MI_SIZE_LOG2) <= 256) + ? 1 + : 0; + + // If tile_copy_mode = 1, check if this tile is a copy tile. + // Copy tiles are very unlikely on key frames, so skip the search + // there to avoid unnecessary work. + if (cm->current_frame.frame_type != KEY_FRAME && tile_copy_mode) { + const int identical_tile_offset = + find_identical_tile(tile_row, tile_col, tile_buffers); + + // Indicate a copy-tile by setting the most significant bit. + // The row-offset to copy from is stored in the highest byte. + // remux_tiles will move these around later + if (identical_tile_offset > 0) { + tile_size = 0; + tile_header = identical_tile_offset | 0x80; + tile_header <<= 24; + } + } + + mem_put_le32(buf->data, tile_header); + } + + total_size += tile_size; + } + + if (!is_last_col) { + uint32_t col_size = total_size - col_offset - 4; + mem_put_le32(dst + col_offset + tg_hdr_size, col_size); + + // Record the maximum tile column size we see. + max_tile_col_size = AOMMAX(max_tile_col_size, col_size); + } + } + + if (have_tiles) { + total_size = remux_tiles(tiles, data, total_size - frame_header_size, + max_tile_size, max_tile_col_size, + &tile_size_bytes, &tile_col_size_bytes); + total_size += frame_header_size; + } + + // In EXT_TILE case, only use 1 tile group. Follow the obu syntax, write the + // current tile group size before the tile data (including tile column headers). + // Tile group size doesn't include the bytes storing tg size.
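+  // obu_memmove() shifts the payload right by the size of the uleb-encoded
+  // length field, which is then written between the OBU header and the payload.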
+ total_size += tg_hdr_size; + const uint32_t obu_payload_size = total_size - tg_hdr_size; + const size_t length_field_size = + obu_memmove(tg_hdr_size, obu_payload_size, dst); + if (av1_write_uleb_obu_size(tg_hdr_size, obu_payload_size, dst) != + AOM_CODEC_OK) { + assert(0); + } + total_size += (uint32_t)length_field_size; + saved_wb->bit_buffer += length_field_size; + + // Now fill in the gaps in the uncompressed header. + if (have_tiles) { + assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4); + aom_wb_overwrite_literal(saved_wb, tile_col_size_bytes - 1, 2); + + assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); + aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2); + } + return total_size; + } + + uint32_t obu_header_size = 0; + uint8_t *tile_data_start = dst + total_size; + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, tile_row); + + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + const int tile_idx = tile_row * tile_cols + tile_col; + TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; + int is_last_tile_in_tg = 0; + + if (new_tg) { + data = dst + total_size; + + // A new tile group begins at this tile. Write the obu header and + // tile group header + const OBU_TYPE obu_type = + (num_tg_hdrs == 1) ? OBU_FRAME : OBU_TILE_GROUP; + curr_tg_data_size = av1_write_obu_header(level_params, obu_type, + obu_extension_header, data); + obu_header_size = curr_tg_data_size; + + if (num_tg_hdrs == 1) { + curr_tg_data_size += write_frame_header_obu( + cpi, saved_wb, data + curr_tg_data_size, 0); + } + curr_tg_data_size += write_tile_group_header( + data + curr_tg_data_size, tile_idx, + AOMMIN(tile_idx + tg_size - 1, tile_cols * tile_rows - 1), + n_log2_tiles, cpi->num_tg > 1); + total_size += curr_tg_data_size; + tile_data_start += curr_tg_data_size; + new_tg = 0; + tile_count = 0; + } + tile_count++; + av1_tile_set_col(&tile_info, cm, tile_col); + + if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) { + is_last_tile_in_tg = 1; + new_tg = 1; + } else { + is_last_tile_in_tg = 0; + } + + buf->data = dst + total_size; + + // The last tile of the tile group does not have a header. + if (!is_last_tile_in_tg) total_size += 4; + + cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; + mode_bc.allow_update_cdf = 1; + mode_bc.allow_update_cdf = + mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; + const int num_planes = av1_num_planes(cm); + av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes); + + aom_start_encode(&mode_bc, dst + total_size); + write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col); + aom_stop_encode(&mode_bc); + tile_size = mode_bc.pos; + assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES); + + curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 
0 : 4)); + buf->size = tile_size; + if (tile_size > max_tile_size) { + *largest_tile_id = tile_cols * tile_row + tile_col; + max_tile_size = tile_size; + } + + if (!is_last_tile_in_tg) { + // size of this tile + mem_put_le32(buf->data, tile_size - AV1_MIN_TILE_SIZE_BYTES); + } else { + // write current tile group size + const uint32_t obu_payload_size = curr_tg_data_size - obu_header_size; + const size_t length_field_size = + obu_memmove(obu_header_size, obu_payload_size, data); + if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != + AOM_CODEC_OK) { + assert(0); + } + curr_tg_data_size += (int)length_field_size; + total_size += (uint32_t)length_field_size; + tile_data_start += length_field_size; + if (num_tg_hdrs == 1) { + // if this tg is combined with the frame header then update the saved + // frame header base offset according to the length field size + saved_wb->bit_buffer += length_field_size; + } + + if (!first_tg && cm->features.error_resilient_mode) { + // Make room for a duplicate Frame Header OBU. + memmove(data + fh_info->total_length, data, curr_tg_data_size); + + // Insert a copy of the Frame Header OBU. + memcpy(data, fh_info->frame_header, fh_info->total_length); + + // Force context update tile to be the first tile in error + // resilient mode as the duplicate frame headers will have + // context_update_tile_id set to 0 + *largest_tile_id = 0; + + // Rewrite the OBU header to change the OBU type to Redundant Frame + // Header. + av1_write_obu_header(level_params, OBU_REDUNDANT_FRAME_HEADER, + obu_extension_header, + &data[fh_info->obu_header_byte_offset]); + + data += fh_info->total_length; + + curr_tg_data_size += (int)(fh_info->total_length); + total_size += (uint32_t)(fh_info->total_length); + } + first_tg = 0; + } + + total_size += tile_size; + } + } + + if (have_tiles) { + // Fill in context_update_tile_id indicating the tile to use for the + // cdf update. The encoder currently sets it to the largest tile + // (though this is up to the encoder) + aom_wb_overwrite_literal(saved_wb, *largest_tile_id, + tiles->log2_cols + tiles->log2_rows); + // If there is more than one tile group, tile_size_bytes takes the default + // value 4 and does not need to be set. For a single tile group it is set + // in the section below. + if (num_tg_hdrs == 1) { + int tile_size_bytes = 4, unused; + const uint32_t tile_data_offset = (uint32_t)(tile_data_start - dst); + const uint32_t tile_data_size = total_size - tile_data_offset; + + total_size = + remux_tiles(tiles, tile_data_start, tile_data_size, max_tile_size, + max_tile_col_size, &tile_size_bytes, &unused); + total_size += tile_data_offset; + assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); + + aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2); + + // Update the OBU length if remux_tiles() reduced the size.
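+      // Decode the old uleb length field, re-encode the new payload size in
+      // place, and close any byte gap left behind with a memmove.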
+ uint64_t payload_size; + size_t length_field_size; + int res = + aom_uleb_decode(dst + obu_header_size, total_size - obu_header_size, + &payload_size, &length_field_size); + assert(res == 0); + (void)res; + + const uint64_t new_payload_size = + total_size - obu_header_size - length_field_size; + if (new_payload_size != payload_size) { + size_t new_length_field_size; + res = aom_uleb_encode(new_payload_size, length_field_size, + dst + obu_header_size, &new_length_field_size); + assert(res == 0); + if (new_length_field_size < length_field_size) { + const size_t src_offset = obu_header_size + length_field_size; + const size_t dst_offset = obu_header_size + new_length_field_size; + memmove(dst + dst_offset, dst + src_offset, (size_t)payload_size); + total_size -= (int)(length_field_size - new_length_field_size); + } + } + } + } + return total_size; +} + +static size_t av1_write_metadata_obu(const aom_metadata_t *metadata, + uint8_t *const dst) { + size_t coded_metadata_size = 0; + const uint64_t metadata_type = (uint64_t)metadata->type; + if (aom_uleb_encode(metadata_type, sizeof(metadata_type), dst, + &coded_metadata_size) != 0) { + return 0; + } + memcpy(dst + coded_metadata_size, metadata->payload, metadata->sz); + // Add trailing bits. + dst[coded_metadata_size + metadata->sz] = 0x80; + return (uint32_t)(coded_metadata_size + metadata->sz + 1); +} + +static size_t av1_write_metadata_array(AV1_COMP *const cpi, uint8_t *dst) { + if (!cpi->source) return 0; + AV1_COMMON *const cm = &cpi->common; + aom_metadata_array_t *arr = cpi->source->metadata; + if (!arr) return 0; + size_t obu_header_size = 0; + size_t obu_payload_size = 0; + size_t total_bytes_written = 0; + size_t length_field_size = 0; + for (size_t i = 0; i < arr->sz; i++) { + aom_metadata_t *current_metadata = arr->metadata_array[i]; + if (current_metadata && current_metadata->payload) { + if ((cm->current_frame.frame_type == KEY_FRAME && + current_metadata->insert_flag == AOM_MIF_KEY_FRAME) || + (cm->current_frame.frame_type != KEY_FRAME && + current_metadata->insert_flag == AOM_MIF_NON_KEY_FRAME) || + current_metadata->insert_flag == AOM_MIF_ANY_FRAME) { + obu_header_size = + av1_write_obu_header(&cpi->level_params, OBU_METADATA, 0, dst); + obu_payload_size = + av1_write_metadata_obu(current_metadata, dst + obu_header_size); + length_field_size = obu_memmove(obu_header_size, obu_payload_size, dst); + if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, dst) == + AOM_CODEC_OK) { + const size_t obu_size = obu_header_size + obu_payload_size; + dst += obu_size + length_field_size; + total_bytes_written += obu_size + length_field_size; + } else { + aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, + "Error writing metadata OBU size"); + } + } + } + } + return total_bytes_written; +} + +int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, + int *const largest_tile_id) { + uint8_t *data = dst; + uint32_t data_size; + AV1_COMMON *const cm = &cpi->common; + AV1LevelParams *const level_params = &cpi->level_params; + uint32_t obu_header_size = 0; + uint32_t obu_payload_size = 0; + FrameHeaderInfo fh_info = { NULL, 0, 0 }; + const uint8_t obu_extension_header = + cm->temporal_layer_id << 5 | cm->spatial_layer_id << 3 | 0; + + // If no non-zero delta_q has been used, reset delta_q_present_flag + if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) { + cm->delta_q_info.delta_q_present_flag = 0; + } + +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_reset_write(); +#endif + + 
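+  // frame_header_count feeds the level stats; av1_write_obu_header()
+  // increments it for every frame header OBU written below.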
level_params->frame_header_count = 0; + + // The TD is now written outside the frame encode loop + + // write sequence header obu if KEY_FRAME, preceded by its uleb-encoded size + if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) { + obu_header_size = + av1_write_obu_header(level_params, OBU_SEQUENCE_HEADER, 0, data); + + obu_payload_size = + av1_write_sequence_header_obu(&cm->seq_params, data + obu_header_size); + const size_t length_field_size = + obu_memmove(obu_header_size, obu_payload_size, data); + if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + data += obu_header_size + obu_payload_size + length_field_size; + } + + // write metadata obus before the frame obu that has the show_frame flag set + if (cm->show_frame) data += av1_write_metadata_array(cpi, data); + + const int write_frame_header = + (cpi->num_tg > 1 || encode_show_existing_frame(cm)); + struct aom_write_bit_buffer saved_wb; + if (write_frame_header) { + // Write Frame Header OBU. + fh_info.frame_header = data; + obu_header_size = av1_write_obu_header(level_params, OBU_FRAME_HEADER, + obu_extension_header, data); + obu_payload_size = + write_frame_header_obu(cpi, &saved_wb, data + obu_header_size, 1); + + const size_t length_field_size = + obu_memmove(obu_header_size, obu_payload_size, data); + if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + fh_info.obu_header_byte_offset = 0; + fh_info.total_length = + obu_header_size + obu_payload_size + length_field_size; + data += fh_info.total_length; + + // Since length_field_size is determined adaptively after frame header + // encoding, saved_wb must be adjusted accordingly. + saved_wb.bit_buffer += length_field_size; + } + + if (encode_show_existing_frame(cm)) { + data_size = 0; + } else { + // Each tile group obu will be preceded by the uleb-encoded size of the + // tile group obu + data_size = write_tiles_in_tg_obus( + cpi, data, &saved_wb, obu_extension_header, &fh_info, largest_tile_id); + } + data += data_size; + *size = data - dst; + return AOM_CODEC_OK; +} diff --git a/libs/libaom/src/av1/encoder/bitstream.h b/libs/libaom/src/av1/encoder/bitstream.h new file mode 100644 index 000000000..45151e25e --- /dev/null +++ b/libs/libaom/src/av1/encoder/bitstream.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_BITSTREAM_H_ +#define AOM_AV1_ENCODER_BITSTREAM_H_ + +#ifdef __cplusplus extern "C" { +#endif + +#include "av1/encoder/encoder.h" + +struct aom_write_bit_buffer; + +// Writes only the OBU Sequence Header payload, and returns the size of the +// payload written to 'dst'. This function does not write the OBU header, the +// optional extension, or the OBU size to 'dst'. +uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params, + uint8_t *const dst); + +// Writes the OBU header byte, and the OBU header extension byte when +// 'obu_extension' is non-zero. Returns number of bytes written to 'dst'.
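+// When 'obu_extension' is non-zero, its low byte is written as the extension
+// byte: temporal_layer_id in the top 3 bits and spatial_layer_id in the next
+// 2, as packed in av1_pack_bitstream().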
+uint32_t av1_write_obu_header(AV1LevelParams *const level_params, + OBU_TYPE obu_type, int obu_extension, + uint8_t *const dst); + +int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size, + uint8_t *dest); + +int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, + int *const largest_tile_id); + +void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, + TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_BITSTREAM_H_ diff --git a/libs/libaom/src/av1/encoder/block.h b/libs/libaom/src/av1/encoder/block.h new file mode 100644 index 000000000..5a74567a4 --- /dev/null +++ b/libs/libaom/src/av1/encoder/block.h @@ -0,0 +1,575 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_BLOCK_H_ +#define AOM_AV1_ENCODER_BLOCK_H_ + +#include "av1/common/entropymv.h" +#include "av1/common/entropy.h" +#include "av1/common/mvref_common.h" + +#include "av1/encoder/enc_enums.h" +#if !CONFIG_REALTIME_ONLY +#include "av1/encoder/partition_cnn_weights.h" +#endif + +#include "av1/encoder/hash.h" + +#ifdef __cplusplus extern "C" { +#endif + +#define MC_FLOW_BSIZE_1D 16 +#define MC_FLOW_NUM_PELS (MC_FLOW_BSIZE_1D * MC_FLOW_BSIZE_1D) +#define MAX_MC_FLOW_BLK_IN_SB (MAX_SB_SIZE / MC_FLOW_BSIZE_1D) +#define MAX_WINNER_MODE_COUNT_INTRA 3 +#define MAX_WINNER_MODE_COUNT_INTER 1 +typedef struct { + MB_MODE_INFO mbmi; + RD_STATS rd_cost; + int64_t rd; + int rate_y; + int rate_uv; + uint8_t color_index_map[64 * 64]; + THR_MODES mode_index; +} WinnerModeStats; + +typedef struct { + unsigned int sse; + int sum; + unsigned int var; +} DIFF; + +enum { + NO_TRELLIS_OPT, // No trellis optimization + FULL_TRELLIS_OPT, // Trellis optimization in all stages + FINAL_PASS_TRELLIS_OPT, // Trellis optimization in only the final encode pass + NO_ESTIMATE_YRD_TRELLIS_OPT // Disable trellis in estimate_yrd_for_sb +} UENUM1BYTE(TRELLIS_OPT_TYPE); + +typedef struct macroblock_plane { + DECLARE_ALIGNED(32, int16_t, src_diff[MAX_SB_SQUARE]); + tran_low_t *qcoeff; + tran_low_t *coeff; + uint16_t *eobs; + uint8_t *txb_entropy_ctx; + struct buf_2d src; + + // Quantizer settings + // These are used/accessed only in the quantization process + // RDO does not / must not depend on any of these values + // All values below share the coefficient scale/shift used in TX + const int16_t *quant_fp_QTX; + const int16_t *round_fp_QTX; + const int16_t *quant_QTX; + const int16_t *quant_shift_QTX; + const int16_t *zbin_QTX; + const int16_t *round_QTX; + const int16_t *dequant_QTX; +} MACROBLOCK_PLANE; + +typedef struct { + int txb_skip_cost[TXB_SKIP_CONTEXTS][2]; + int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3]; + int base_cost[SIG_COEF_CONTEXTS][8]; + int eob_extra_cost[EOB_COEF_CONTEXTS][2]; + int dc_sign_cost[DC_SIGN_CONTEXTS][2]; + int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1]; +} LV_MAP_COEFF_COST; + +typedef struct { + int eob_cost[2][11]; +} LV_MAP_EOB_COST; + +typedef struct { + tran_low_t
tcoeff[MAX_MB_PLANE][MAX_SB_SQUARE]; + uint16_t eobs[MAX_MB_PLANE][MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; + // Transform block entropy contexts. + // Bits 0~3: txb_skip_ctx; bits 4~5: dc_sign_ctx. + uint8_t entropy_ctx[MAX_MB_PLANE] + [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; +} CB_COEFF_BUFFER; + +typedef struct { + // TODO(angiebird): Reduce the buffer size according to sb_type + CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE]; + uint16_t weight[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE]; + int_mv global_mvs[REF_FRAMES]; + int16_t mode_context[MODE_CTX_REF_FRAMES]; + uint8_t ref_mv_count[MODE_CTX_REF_FRAMES]; +} MB_MODE_INFO_EXT; + +// Structure to store best mode information at frame level. This +// frame level information will be used during bitstream preparation stage. +typedef struct { + CANDIDATE_MV ref_mv_stack[USABLE_REF_MV_STACK_SIZE]; + uint16_t weight[USABLE_REF_MV_STACK_SIZE]; + // TODO(Ravi/Remya): Reduce the buffer size of global_mvs + int_mv global_mvs[REF_FRAMES]; + int cb_offset; + int16_t mode_context; + uint8_t ref_mv_count; +} MB_MODE_INFO_EXT_FRAME; + +typedef struct { + uint8_t best_palette_color_map[MAX_PALETTE_SQUARE]; + int kmeans_data_buf[2 * MAX_PALETTE_SQUARE]; +} PALETTE_BUFFER; + +typedef struct { + TX_SIZE tx_size; + TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN]; + uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + RD_STATS rd_stats; + uint32_t hash_value; +} MB_RD_INFO; + +#define RD_RECORD_BUFFER_LEN 8 +typedef struct { + MB_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN]; // Circular buffer. + int index_start; + int num; + CRC32C crc_calculator; // Hash function. +} MB_RD_RECORD; + +typedef struct { + int64_t dist; + int64_t sse; + int rate; + uint16_t eob; + TX_TYPE tx_type; + uint16_t entropy_context; + uint8_t txb_entropy_ctx; + uint8_t valid; + uint8_t fast; // This is not being used now. + uint8_t perform_block_coeff_opt; +} TXB_RD_INFO; + +#define TX_SIZE_RD_RECORD_BUFFER_LEN 256 +typedef struct { + uint32_t hash_vals[TX_SIZE_RD_RECORD_BUFFER_LEN]; + TXB_RD_INFO tx_rd_info[TX_SIZE_RD_RECORD_BUFFER_LEN]; + int index_start; + int num; +} TXB_RD_RECORD; + +typedef struct tx_size_rd_info_node { + TXB_RD_INFO *rd_info_array; // Points to array of size TX_TYPES. + struct tx_size_rd_info_node *children[4]; +} TXB_RD_INFO_NODE; + +// Simple translation rd state for prune_comp_search_by_single_result +typedef struct { + RD_STATS rd_stats; + RD_STATS rd_stats_y; + RD_STATS rd_stats_uv; + uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t skip; + uint8_t disable_skip; + uint8_t early_skipped; +} SimpleRDState; + +// 4: NEAREST, NEW, NEAR, GLOBAL +#define SINGLE_REF_MODES ((REF_FRAMES - 1) * 4) + +#define MAX_COMP_RD_STATS 64 +typedef struct { + int32_t rate[COMPOUND_TYPES]; + int64_t dist[COMPOUND_TYPES]; + int32_t model_rate[COMPOUND_TYPES]; + int64_t model_dist[COMPOUND_TYPES]; + int comp_rs2[COMPOUND_TYPES]; + int_mv mv[2]; + MV_REFERENCE_FRAME ref_frames[2]; + PREDICTION_MODE mode; + int_interpfilters filter; + int ref_mv_idx; + int is_global[2]; + INTERINTER_COMPOUND_DATA interinter_comp; +} COMP_RD_STATS; + +// Struct for buffers used by av1_compound_type_rd() function. +// For sizes and alignment of these arrays, refer to +// alloc_compound_type_rd_buffers() function. 
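+// residual1 (src - pred1) and diff10 (pred1 - pred0) hold signed pixel
+// differences, hence int16_t rather than uint8_t.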
+typedef struct { + uint8_t *pred0; + uint8_t *pred1; + int16_t *residual1; // src - pred1 + int16_t *diff10; // pred1 - pred0 + uint8_t *tmp_best_mask_buf; // backup of the best segmentation mask +} CompoundTypeRdBuffers; + +enum { + MV_COST_ENTROPY, // Use the entropy rate of the mv as the cost + MV_COST_L1_LOWRES, // Use the l1 norm of the mv as the cost (<480p) + MV_COST_L1_MIDRES, // Use the l1 norm of the mv as the cost (>=480p) + MV_COST_L1_HDRES, // Use the l1 norm of the mv as the cost (>=720p) + MV_COST_NONE // Use 0 as the cost irrespective of the current mv +} UENUM1BYTE(MV_COST_TYPE); + +struct inter_modes_info; +typedef struct macroblock MACROBLOCK; +struct macroblock { + struct macroblock_plane plane[MAX_MB_PLANE]; + + // Determine if one would go with reduced complexity transform block + // search model to select prediction modes, or full complexity model + // to select transform kernel. + int rd_model; + + // prune_comp_search_by_single_result (3:MAX_REF_MV_SEARCH) + SimpleRDState simple_rd_state[SINGLE_REF_MODES][3]; + + // Inter macroblock RD search info. + MB_RD_RECORD mb_rd_record; + + // Inter transform block RD search info. for square TX sizes. + TXB_RD_RECORD txb_rd_record_8X8[(MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)]; + TXB_RD_RECORD txb_rd_record_16X16[(MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)]; + TXB_RD_RECORD txb_rd_record_32X32[(MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)]; + TXB_RD_RECORD txb_rd_record_64X64[(MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)]; + + // Intra transform block RD search info. for square TX sizes. + TXB_RD_RECORD txb_rd_record_intra; + + MACROBLOCKD e_mbd; + MB_MODE_INFO_EXT *mbmi_ext; + MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame; + // Array of mode stats for winner mode processing + WinnerModeStats winner_mode_stats[AOMMAX(MAX_WINNER_MODE_COUNT_INTRA, + MAX_WINNER_MODE_COUNT_INTER)]; + int winner_mode_count; + int skip_block; + int qindex; + + // The equivalent error at the current rdmult of one whole bit (not one + // bitcost unit). + int errorperbit; + // The equivalent SAD error of one (whole) bit at the current quantizer + // for large blocks. + int sadperbit; + int rdmult; + int mb_energy; + int sb_energy_level; + + unsigned int txb_split_count; +#if CONFIG_SPEED_STATS + unsigned int tx_search_count; +#endif // CONFIG_SPEED_STATS + + // These are set to their default values at the beginning, and then adjusted + // further in the encoding process. + BLOCK_SIZE min_partition_size; + BLOCK_SIZE max_partition_size; + + unsigned int max_mv_context[REF_FRAMES]; + unsigned int source_variance; + unsigned int simple_motion_pred_sse; + unsigned int pred_sse[REF_FRAMES]; + int pred_mv_sad[REF_FRAMES]; + int best_pred_mv_sad; + + int nmv_vec_cost[MV_JOINTS]; + int nmv_costs[2][MV_VALS]; + int nmv_costs_hp[2][MV_VALS]; + int *nmvcost[2]; + int *nmvcost_hp[2]; + int **mv_cost_stack; + + int32_t *wsrc_buf; + int32_t *mask_buf; + uint8_t *above_pred_buf; + uint8_t *left_pred_buf; + + PALETTE_BUFFER *palette_buffer; + CompoundTypeRdBuffers comp_rd_buffer; + + CONV_BUF_TYPE *tmp_conv_dst; + uint8_t *tmp_obmc_bufs[2]; + + FRAME_CONTEXT *row_ctx; + // This context will be used to update color_map_cdf pointer which would be + // used during pack bitstream. For single thread and tile-multithreading case + // this pointer will be the same as xd->tile_ctx, but for the case of row-mt: + // xd->tile_ctx will point to a temporary context while tile_pb_ctx will point + // to the accurate tile context.
+ FRAME_CONTEXT *tile_pb_ctx; + + struct inter_modes_info *inter_modes_info; + + // Contains the hash table, hash function, and buffer used for intrabc + IntraBCHashInfo intrabc_hash_info; + + // These define limits to motion vector components to prevent them + // from extending outside the UMV borders + FullMvLimits mv_limits; + + uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + + // Force the coding block to skip transform and quantization. + int force_skip; + int skip_cost[SKIP_CONTEXTS][2]; + + int skip_mode; // 0: off; 1: on + int skip_mode_cost[SKIP_CONTEXTS][2]; + + LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES]; + LV_MAP_EOB_COST eob_costs[7][2]; + uint16_t cb_offset; + + // mode costs + int intra_inter_cost[INTRA_INTER_CONTEXTS][2]; + + int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES]; + int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2]; + int zeromv_mode_cost[GLOBALMV_MODE_CONTEXTS][2]; + int refmv_mode_cost[REFMV_MODE_CONTEXTS][2]; + int drl_mode_cost0[DRL_MODE_CONTEXTS][2]; + + int comp_inter_cost[COMP_INTER_CONTEXTS][2]; + int single_ref_cost[REF_CONTEXTS][SINGLE_REFS - 1][2]; + int comp_ref_type_cost[COMP_REF_TYPE_CONTEXTS] + [CDF_SIZE(COMP_REFERENCE_TYPES)]; + int uni_comp_ref_cost[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1] + [CDF_SIZE(2)]; + // Cost for signaling ref_frame[0] (LAST_FRAME, LAST2_FRAME, LAST3_FRAME or + // GOLDEN_FRAME) in bidir-comp mode. + int comp_ref_cost[REF_CONTEXTS][FWD_REFS - 1][2]; + // Cost for signaling ref_frame[1] (ALTREF_FRAME, ALTREF2_FRAME, or + // BWDREF_FRAME) in bidir-comp mode. + int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2]; + int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES]; + int compound_type_cost[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES]; + int wedge_idx_cost[BLOCK_SIZES_ALL][16]; + int interintra_cost[BLOCK_SIZE_GROUPS][2]; + int wedge_interintra_cost[BLOCK_SIZES_ALL][2]; + int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; + int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES]; + int motion_mode_cost1[BLOCK_SIZES_ALL][2]; + int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES]; + int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; + int filter_intra_cost[BLOCK_SIZES_ALL][2]; + int filter_intra_mode_cost[FILTER_INTRA_MODES]; + int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; + int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES]; + int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2]; + int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2]; + // The rate associated with each alpha codeword + int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE]; + int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES]; + int txfm_partition_cost[TXFM_PARTITION_CONTEXTS][2]; + int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES]; + int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] + [TX_TYPES]; + int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1]; + int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES]; + int wiener_restore_cost[2]; + int sgrproj_restore_cost[2]; + int intrabc_cost[2]; + + // Used to store sub partition's 
choices. + MV pred_mv[REF_FRAMES]; + + // Ref frames that are selected by square partition blocks within a super- + // block, in MI resolution. They can be used to prune ref frames for + // rectangular blocks. + int picked_ref_frames_mask[32 * 32]; + + // use default transform and skip transform type search for intra modes + int use_default_intra_tx_type; + // use default transform and skip transform type search for inter modes + int use_default_inter_tx_type; + int comp_idx_cost[COMP_INDEX_CONTEXTS][2]; + int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2]; + int must_find_valid_partition; + int recalc_luma_mc_data; // Flag to indicate recalculation of MC data during + // interpolation filter search + int prune_mode; + uint32_t tx_domain_dist_threshold; + int use_transform_domain_distortion; + // The likelihood of an edge existing in the block (using partial Canny edge + // detection). For reference, 556 is the value returned for a solid + // vertical black/white edge. + uint16_t edge_strength; + // The strongest edge strength seen along the x/y axis. + uint16_t edge_strength_x; + uint16_t edge_strength_y; + uint8_t compound_idx; + + // [Saved stat index] + COMP_RD_STATS comp_rd_stats[MAX_COMP_RD_STATS]; + int comp_rd_stats_idx; + + CB_COEFF_BUFFER *cb_coef_buff; + + // Threshold used to decide the applicability of R-D optimization of + // quantized coeffs + uint32_t coeff_opt_dist_threshold; + +#if !CONFIG_REALTIME_ONLY + int quad_tree_idx; + int cnn_output_valid; + float cnn_buffer[CNN_OUT_BUF_SIZE]; + float log_q; +#endif + int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES]; + // 0 - 128x128 + // 1-2 - 128x64 + // 3-4 - 64x128 + // 5-8 - 64x64 + // 9-16 - 64x32 + // 17-24 - 32x64 + // 25-40 - 32x32 + // 41-104 - 16x16 + uint8_t variance_low[105]; + uint8_t content_state_sb; + // Strong color activity detection. Used in REALTIME coding mode to enhance + // the visual quality at the boundary of moving color objects. + uint8_t color_sensitivity[2]; + int nonrd_prune_ref_frame_search; + + // Used to control the tx size search evaluation for mode processing + // (normal/winner mode) + int tx_size_search_method; + // This tx_mode_search_type is used internally by the encoder, and is not + // written to the bitstream. It determines what kind of tx_mode should be + // searched. For example, we might set it to TX_MODE_LARGEST to find a good + // candidate, then use TX_MODE_SELECT on it + TX_MODE tx_mode_search_type; + + // Used to control aggressiveness of skip flag prediction for mode processing + // (normal/winner mode) + unsigned int predict_skip_level; + + // Copy out this SB's TPL block stats. + int valid_cost_b; + int64_t inter_cost_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB]; + int64_t intra_cost_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB]; + int_mv mv_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB] + [INTER_REFS_PER_FRAME]; + int cost_stride; + + // The type of mv cost used during motion search + MV_COST_TYPE mv_cost_type; + + uint8_t search_ref_frame[REF_FRAMES]; + +#if CONFIG_AV1_HIGHBITDEPTH + void (*fwd_txfm4x4)(const int16_t *input, tran_low_t *output, int stride); + void (*inv_txfm_add)(const tran_low_t *input, uint8_t *dest, int stride, + int eob); +#else + void (*fwd_txfm4x4)(const int16_t *input, int16_t *output, int stride); + void (*inv_txfm_add)(const int16_t *input, uint8_t *dest, int stride, + int eob); +#endif +}; + +// Only consider full SB, MC_FLOW_BSIZE_1D = 16. 
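+// e.g. a full 64x64 superblock holds (64 / 16) * (64 / 16) = 16 such blocks +// and a 128x128 superblock holds (128 / 16) * (128 / 16) = 64, matching the +// constants returned below.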
+static INLINE int tpl_blocks_in_sb(BLOCK_SIZE bsize) { + switch (bsize) { + case BLOCK_64X64: return 16; + case BLOCK_128X128: return 64; + default: assert(0); + } + return -1; +} + +static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) { + static const char LUT[BLOCK_SIZES_ALL] = { + 0, // BLOCK_4X4 + 1, // BLOCK_4X8 + 1, // BLOCK_8X4 + 0, // BLOCK_8X8 + 1, // BLOCK_8X16 + 1, // BLOCK_16X8 + 0, // BLOCK_16X16 + 1, // BLOCK_16X32 + 1, // BLOCK_32X16 + 0, // BLOCK_32X32 + 1, // BLOCK_32X64 + 1, // BLOCK_64X32 + 0, // BLOCK_64X64 + 0, // BLOCK_64X128 + 0, // BLOCK_128X64 + 0, // BLOCK_128X128 + 1, // BLOCK_4X16 + 1, // BLOCK_16X4 + 1, // BLOCK_8X32 + 1, // BLOCK_32X8 + 1, // BLOCK_16X64 + 1, // BLOCK_64X16 + }; + + return LUT[bsize]; +} + +static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + return is_rect_tx_allowed_bsize(mbmi->sb_type) && + !xd->lossless[mbmi->segment_id]; +} + +static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) { + TX_SIZE ctx_size = max_txsize_rect_lookup[bsize]; + int depth = 0; + while (tx_size != ctx_size) { + depth++; + ctx_size = sub_tx_size_map[ctx_size]; + assert(depth <= MAX_TX_DEPTH); + } + return depth; +} + +static INLINE void set_blk_skip(MACROBLOCK *x, int plane, int blk_idx, + int skip) { + if (skip) + x->blk_skip[blk_idx] |= 1UL << plane; + else + x->blk_skip[blk_idx] &= ~(1UL << plane); +#ifndef NDEBUG + // Set chroma planes to uninitialized states when luma is set to check if + // it will be set later + if (plane == 0) { + x->blk_skip[blk_idx] |= 1UL << (1 + 4); + x->blk_skip[blk_idx] |= 1UL << (2 + 4); + } + + // Clear the initialization checking bit + x->blk_skip[blk_idx] &= ~(1UL << (plane + 4)); +#endif +} + +static INLINE int is_blk_skip(MACROBLOCK *x, int plane, int blk_idx) { +#ifndef NDEBUG + // Check if this is initialized + assert(!(x->blk_skip[blk_idx] & (1UL << (plane + 4)))); + + // The magic number is 0x77, this is to test if there is garbage data + assert((x->blk_skip[blk_idx] & 0x88) == 0); +#endif + return (x->blk_skip[blk_idx] >> plane) & 1; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_BLOCK_H_ diff --git a/libs/libaom/src/av1/encoder/blockiness.c b/libs/libaom/src/av1/encoder/blockiness.c new file mode 100644 index 000000000..f7cff9e53 --- /dev/null +++ b/libs/libaom/src/av1/encoder/blockiness.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/av1_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/common.h" +#include "av1/common/filter.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" + +static int horizontal_filter(const uint8_t *s) { + return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6; +} + +static int vertical_filter(const uint8_t *s, int p) { + return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6; +} + +static int variance(int sum, int sum_squared, int size) { + return sum_squared / size - (sum / size) * (sum / size); +} +// Calculate a blockiness level for a vertical block edge. +// This function returns a new blockiness metric that's defined as + +// p0 p1 p2 p3 +// q0 q1 q2 q3 +// block edge -> +// r0 r1 r2 r3 +// s0 s1 s2 s3 + +// blockiness = p0*-2+q0*6+r0*-6+s0*2 + +// p1*-2+q1*6+r1*-6+s1*2 + +// p2*-2+q2*6+r2*-6+s2*2 + +// p3*-2+q3*6+r3*-6+s3*2 ; + +// reconstructed_blockiness = abs(blockiness from reconstructed buffer - +// blockiness from source buffer,0) +// +// I make the assumption that flat blocks are much more visible than high +// contrast blocks. As such, I scale the result of the blockiness calc +// by dividing the blockiness by the variance of the pixels on either side +// of the edge as follows: +// var_0 = (q0^2+q1^2+q2^2+q3^2) - ((q0 + q1 + q2 + q3) / 4 )^2 +// var_1 = (r0^2+r1^2+r2^2+r3^2) - ((r0 + r1 + r2 + r3) / 4 )^2 +// The returned blockiness is the scaled value +// Reconstructed blockiness / ( 1 + var_0 + var_1 ) ; +static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r, + int rp, int size) { + int s_blockiness = 0; + int r_blockiness = 0; + int sum_0 = 0; + int sum_sq_0 = 0; + int sum_1 = 0; + int sum_sq_1 = 0; + int i; + int var_0; + int var_1; + for (i = 0; i < size; ++i, s += sp, r += rp) { + s_blockiness += horizontal_filter(s); + r_blockiness += horizontal_filter(r); + sum_0 += s[0]; + sum_sq_0 += s[0] * s[0]; + sum_1 += s[-1]; + sum_sq_1 += s[-1] * s[-1]; + } + var_0 = variance(sum_0, sum_sq_0, size); + var_1 = variance(sum_1, sum_sq_1, size); + r_blockiness = abs(r_blockiness); + s_blockiness = abs(s_blockiness); + + if (r_blockiness > s_blockiness) + return (r_blockiness - s_blockiness) / (1 + var_0 + var_1); + else + return 0; +} + +// Calculate a blockiness level for a horizontal block edge +// same as above. +static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r, + int rp, int size) { + int s_blockiness = 0; + int r_blockiness = 0; + int sum_0 = 0; + int sum_sq_0 = 0; + int sum_1 = 0; + int sum_sq_1 = 0; + int i; + int var_0; + int var_1; + for (i = 0; i < size; ++i, ++s, ++r) { + s_blockiness += vertical_filter(s, sp); + r_blockiness += vertical_filter(r, rp); + sum_0 += s[0]; + sum_sq_0 += s[0] * s[0]; + sum_1 += s[-sp]; + sum_sq_1 += s[-sp] * s[-sp]; + } + var_0 = variance(sum_0, sum_sq_0, size); + var_1 = variance(sum_1, sum_sq_1, size); + r_blockiness = abs(r_blockiness); + s_blockiness = abs(s_blockiness); + + if (r_blockiness > s_blockiness) + return (r_blockiness - s_blockiness) / (1 + var_0 + var_1); + else + return 0; +} + +// This function returns the blockiness for the entire frame currently by +// looking at all borders in steps of 4. 
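+// As a hypothetical worked example (values invented for illustration): if the +// source rows are flat at 100 across a 4-sample vertical edge, each row's +// filter output is 0 and both source variance terms are 0; if the +// reconstruction steps from 90 to 110 at that edge, each row contributes +// (110 - 90) * 2 + (90 - 110) * 6 = -80, so |r_blockiness| = 320 over the 4 +// rows and the edge scores (320 - 0) / (1 + 0 + 0) = 320 before the final +// per-frame normalization below.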
+double av1_get_blockiness(const unsigned char *img1, int img1_pitch, + const unsigned char *img2, int img2_pitch, int width, + int height) { + double blockiness = 0; + int i, j; + aom_clear_system_state(); + for (i = 0; i < height; + i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { + for (j = 0; j < width; j += 4) { + if (i > 0 && i < height && j > 0 && j < width) { + blockiness += + blockiness_vertical(img1 + j, img1_pitch, img2 + j, img2_pitch, 4); + blockiness += blockiness_horizontal(img1 + j, img1_pitch, img2 + j, + img2_pitch, 4); + } + } + } + blockiness /= width * height / 16; + return blockiness; +} diff --git a/libs/libaom/src/av1/encoder/cnn.c b/libs/libaom/src/av1/encoder/cnn.c new file mode 100644 index 000000000..5d8a236a0 --- /dev/null +++ b/libs/libaom/src/av1/encoder/cnn.c @@ -0,0 +1,1144 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <math.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "av1/encoder/cnn.h" +#include "av1/common/av1_common_int.h" + +#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a))) + +typedef struct { + const float **input; + int in_width; + int in_height; + int in_stride; + const CNN_LAYER_CONFIG *layer_config; + float **output; + int out_stride; + int start_idx; + int th_step; +} CONVOLVE_OPS; + +typedef float (*activation_fn)(float); + +static float softsign(float x) { return x / (float)(fabsf(x) + 1.0); } + +static float relu(float x) { return (x < 0) ?
0 : x; } + +static float identity(float x) { return x; } + +typedef struct { + int allocsize; + int channels; + int width, height, stride; + float *buf[CNN_MAX_CHANNELS]; +} TENSOR; + +static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(*tensor)); } + +static void free_tensor(TENSOR *tensor) { + if (tensor->allocsize) { + aom_free(tensor->buf[0]); + tensor->buf[0] = NULL; + tensor->allocsize = 0; + } +} + +static void realloc_tensor(TENSOR *tensor, int channels, int width, + int height) { + const int newallocsize = channels * width * height; + if (tensor->allocsize < newallocsize) { + free_tensor(tensor); + tensor->buf[0] = + (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize); + tensor->allocsize = newallocsize; + } + tensor->width = width; + tensor->height = height; + tensor->stride = width; + tensor->channels = channels; + for (int c = 1; c < channels; ++c) + tensor->buf[c] = &tensor->buf[0][c * width * height]; +} + +static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset, + TENSOR *dst) { + assert(src->width == dst->width); + assert(src->height == dst->height); + assert(copy_channels <= src->channels); + if (src->stride == dst->width && dst->stride == dst->width) { + for (int c = 0; c < copy_channels; ++c) { + memcpy(dst->buf[dst_offset + c], src->buf[c], + sizeof(*dst->buf[0]) * src->width * src->height); + } + } else { + for (int c = 0; c < copy_channels; ++c) { + for (int r = 0; r < dst->height; ++r) { + memcpy(&dst->buf[dst_offset + c][r * dst->stride], + &src->buf[c][r * src->stride], + dst->width * sizeof(*dst->buf[c])); + } + } + } +} + +static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS], + int channels, int width, int height, int stride) { + tensor->allocsize = 0; + tensor->channels = channels; + tensor->width = width; + tensor->height = height; + tensor->stride = stride; + if (buf) { + for (int c = 0; c < channels; ++c) tensor->buf[c] = buf[c]; + } else { + for (int c = 0; c < channels; ++c) tensor->buf[c] = NULL; + } +} + +static void swap_tensor(TENSOR *t1, TENSOR *t2) { + TENSOR t = *t1; + *t1 = *t2; + *t2 = t; +} + +// The concatenated tensor goes into dst with first the channels in +// original dst followed by the channels in the src +static void concat_tensor(const TENSOR *src, TENSOR *dst) { + assert(src->width == dst->width); + assert(src->height == dst->height); + + const int dst_channels = dst->channels; + const int channels = dst->channels + src->channels; + const int newallocsize = channels * dst->width * dst->height; + if (dst->allocsize < newallocsize) { + TENSOR t; + init_tensor(&t); + // allocate new buffers and copy first the dst channels + realloc_tensor(&t, channels, dst->width, dst->height); + copy_tensor(dst, dst->channels, 0, &t); + // Swap the tensors and free the old buffers + swap_tensor(dst, &t); + free_tensor(&t); + } + for (int c = 1; c < channels; ++c) + dst->buf[c] = &dst->buf[0][c * dst->width * dst->height]; + // Copy the channels in src after the first dst_channels channels. 
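+ // For example (hypothetical sizes): concatenating a 2-channel src into a + // 3-channel dst yields 5 channels, laid out as dst[0..2] followed by + // src[0] and src[1] at channel indices 3 and 4.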
+ copy_tensor(src, src->channels, dst_channels, dst); +} + +int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) { + return (t1->width == t2->width && t1->height == t2->height); +} + +int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) { + return (t1->channels == t2->channels && t1->width == t2->width && + t1->height == t2->height); +} + +static void find_layer_output_size(int in_width, int in_height, + const CNN_LAYER_CONFIG *layer_config, + int *out_width, int *out_height) { + if (!layer_config->deconvolve) { + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + case PADDING_SAME_REPLICATE: + *out_width = (in_width + layer_config->skip_width - 1) / + layer_config->skip_width; + *out_height = (in_height + layer_config->skip_height - 1) / + layer_config->skip_height; + break; + case PADDING_VALID: + *out_width = + (in_width - layer_config->filter_width + layer_config->skip_width) / + layer_config->skip_width; + *out_height = (in_height - layer_config->filter_height + + layer_config->skip_height) / + layer_config->skip_height; + break; + default: assert(0 && "Unknown padding type"); + } + } else { + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + case PADDING_SAME_REPLICATE: + *out_width = in_width * layer_config->skip_width; + *out_height = in_height * layer_config->skip_height; + break; + case PADDING_VALID: + *out_width = (in_width - 1) * layer_config->skip_width + + layer_config->filter_width; + *out_height = (in_height - 1) * layer_config->skip_height + + layer_config->filter_height; + break; + default: assert(0 && "Unknown padding type"); + } + } +} + +void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config, + int channels_per_branch[]) { + int branch = layer_config->branch; + const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->input_to_branches & (1 << b)) && b != branch) { + if (layer_config->branch_copy_type == BRANCH_INPUT) { + channels_per_branch[b] = layer_config->in_channels; + } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) { + channels_per_branch[b] = layer_config->out_channels; + } else if (layer_config->branch_copy_type == BRANCH_COMBINED) { + channels_per_branch[b] = layer_config->out_channels; + for (int c = 0; c < CNN_MAX_BRANCHES; ++c) { + if ((branch_config->branches_to_combine & (1 << c)) && c != branch) { + assert(channels_per_branch[c] > 0); + channels_per_branch[b] += channels_per_branch[c]; + } + } + } + } + } + channels_per_branch[branch] = layer_config->out_channels; + for (int c = 0; c < CNN_MAX_BRANCHES; ++c) { + if ((branch_config->branches_to_combine & (1 << c)) && c != branch) { + assert(channels_per_branch[c] > 0); + channels_per_branch[branch] += channels_per_branch[c]; + } + } +} + +#if CONFIG_DEBUG +static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) { + const int num_layers = cnn_config->num_layers; + const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config; + + for (int idx = 0; idx < num_layers; idx++) { + if (layer_configs[idx].output_num != -1) { + return 1; + } + } + return 0; +} +#endif + +void av1_find_cnn_output_size(int in_width, int in_height, + const CNN_CONFIG *cnn_config, int *out_width, + int *out_height, int *out_channels) { + int channels_per_branch[CNN_MAX_BRANCHES] = { 0 }; + int i_width[CNN_MAX_BRANCHES] = { 0 }; + int i_height[CNN_MAX_BRANCHES] = { 0 }; + i_width[0] = in_width + cnn_config->ext_width * 2; + i_height[0] = in_height + cnn_config->ext_height * 2; + +#if CONFIG_DEBUG + 
assert(cnn_has_at_least_one_output(cnn_config)); +#endif + + for (int i = 0; i < cnn_config->num_layers; ++i) { + const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i]; + const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; + const int branch = layer_config->branch; + int o_width = 0, o_height = 0; + + if (layer_config->branch_copy_type == BRANCH_INPUT) { + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->input_to_branches & (1 << b)) && b != branch) { + assert(i_width[branch] > 0 && i_height[branch] > 0); + i_width[b] = i_width[branch]; + i_height[b] = i_height[branch]; + } + } + } + + find_layer_output_size(i_width[branch], i_height[branch], layer_config, + &o_width, &o_height); + i_width[branch] = o_width; + i_height[branch] = o_height; + + if (layer_config->branch_copy_type == BRANCH_OUTPUT) { + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->input_to_branches & (1 << b)) && b != branch) { + i_width[b] = o_width; + i_height[b] = o_height; + } + } + } + + find_cnn_out_channels(layer_config, channels_per_branch); + + const int output_num = layer_config->output_num; + if (output_num != -1) { // Current layer is an output layer + out_width[output_num] = o_width; + out_height[output_num] = o_height; + out_channels[output_num] = channels_per_branch[layer_config->branch]; + } + } +} + +activation_fn get_activation(ACTIVATION layer_activation) { + switch (layer_activation) { + case NONE: return identity; + case RELU: return relu; + case SOFTSIGN: return softsign; + case SIGMOID: + assert(0 && "Sigmoid has not been supported in CNN."); // TO DO + return NULL; + default: assert(0 && "Unknown activation type"); return NULL; + } +} + +static INLINE int get_start_shift_convolve(int width, int filt_width, + int stride) { + const int mod = (width % stride); + const int filt_off = (filt_width - 1) / 2; + const int dif = (mod ? mod - 1 : stride - 1); + return AOMMIN((dif + (filt_width % 2)) / 2, filt_off); +} + +void av1_cnn_add_c(float **output, int channels, int width, int height, + int stride, const float **add) { + for (int c = 0; c < channels; ++c) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + output[c][i * stride + j] += add[c][i * stride + j]; + } +} + +void av1_cnn_activate_c(float **output, int channels, int width, int height, + int stride, ACTIVATION layer_activation) { + activation_fn activation = get_activation(layer_activation); + for (int c = 0; c < channels; ++c) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + output[c][i * stride + j] = activation(output[c][i * stride + j]); + } +} + +static void copy_active_tensor_to_branches(const TENSOR *layer_active_tensor, + const CNN_LAYER_CONFIG *layer_config, + int branch, TENSOR branch_output[]) { + const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->input_to_branches & (1 << b)) && b != branch) { + // Copy layer's active tensor to output tensor of branch b if set in + // mask. The output becomes the input of the first layer of the branch + // because the layer of the branch is not the first layer. + int copy_channels = branch_config->channels_to_copy > 0 + ? 
branch_config->channels_to_copy + : layer_active_tensor->channels; + realloc_tensor(&branch_output[b], copy_channels, + layer_active_tensor->width, layer_active_tensor->height); + copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]); + } + } +} + +static int convolve_layer(void *arg1, void *arg2) { + const CONVOLVE_OPS *convolve_ops = arg1; + (void)arg2; + av1_cnn_convolve( + convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height, + convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output, + convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step); + return 1; +} + +static void convolve_layer_mt(const float **input, int in_width, int in_height, + int in_stride, + const CNN_LAYER_CONFIG *layer_config, + const CNN_THREAD_DATA *thread_data, + float **output, int out_stride) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + const int num_workers = thread_data->num_workers; + + CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS]; + for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) { + AVxWorker *const worker = &thread_data->workers[th]; + winterface->reset(worker); + + CONVOLVE_OPS convolve_op = { input, in_width, in_height, + in_stride, layer_config, output, + out_stride, th, num_workers }; + convolve_ops[th] = convolve_op; + worker->hook = convolve_layer; + worker->data1 = &(convolve_ops[th]); + worker->data2 = NULL; + + // Start convolving. + if (th == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + // Wait until all workers have finished. + for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) { + winterface->sync(&thread_data->workers[th]); + } +} + +void av1_cnn_convolve_c(const float **input, int in_width, int in_height, + int in_stride, const CNN_LAYER_CONFIG *layer_config, + float **output, int out_stride, int start_idx, + int step) { + assert(!layer_config->deconvolve); + const int cstep = layer_config->in_channels * layer_config->out_channels; + const int filter_height_half = layer_config->filter_height >> 1; + const int filter_width_half = layer_config->filter_width >> 1; + const int channel_step = AOMMAX(step, 1); + + if (layer_config->maxpool && + (layer_config->skip_height > 1 || layer_config->skip_width > 1)) { + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int h = 0, u = 0; h < in_height; + h += layer_config->skip_height, ++u) { + for (int w = 0, v = 0; w < in_width; + w += layer_config->skip_width, ++v) { + for (int hh = h; + hh < AOMMIN(in_height, h + layer_config->skip_height); + ++hh) { + for (int ww = w; + ww < AOMMIN(in_width, w + layer_config->skip_width); + ++ww) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int ii = hh + l - filter_height_half; + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int jj = ww + m - filter_width_half; + if (ii < 0 || ii >= in_height || jj < 0 || + jj >= in_width) + continue; + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + const float a = sum; + if (h == hh && w == ww) + output[i][u * out_stride + v] = a; + else + output[i][u * out_stride + v] = + AOMMAX(output[i][u * out_stride + v], a); + } + } + } + } + } + break; + case PADDING_SAME_REPLICATE: + for (int i = 0; i < 
layer_config->out_channels; ++i) { + for (int h = 0, u = 0; h < in_height; + h += layer_config->skip_height, ++u) { + for (int w = 0, v = 0; w < in_width; + w += layer_config->skip_width, ++v) { + for (int hh = h; + hh < AOMMIN(in_height, h + layer_config->skip_height); + ++hh) { + for (int ww = w; + ww < AOMMIN(in_width, w + layer_config->skip_width); + ++ww) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int ii = + CLAMPINDEX(hh + l - filter_height_half, in_height); + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int jj = + CLAMPINDEX(ww + m - filter_width_half, in_width); + assert(ii >= 0 && ii < in_height && jj >= 0 && + jj < in_width); + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + const float a = sum; + if (h == hh && w == ww) + output[i][u * out_stride + v] = a; + else + output[i][u * out_stride + v] = + AOMMAX(output[i][u * out_stride + v], a); + } + } + } + } + } + break; + case PADDING_VALID: + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int h = 0, u = 0; + h < in_height - layer_config->filter_height + 1; + h += layer_config->skip_height, ++u) { + for (int w = 0, v = 0; + w < in_width - layer_config->filter_width + 1; + w += layer_config->skip_width, ++v) { + for (int hh = h; + hh < AOMMIN(in_height, h + layer_config->skip_height); + ++hh) { + for (int ww = w; + ww < AOMMIN(in_width, w + layer_config->skip_width); + ++ww) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int ii = hh + l; + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int jj = ww + m; + assert(ii >= 0 && ii < in_height && jj >= 0 && + jj < in_width); + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + const float a = sum; + if (h == hh && w == ww) + output[i][u * out_stride + v] = a; + else + output[i][u * out_stride + v] = + AOMMAX(output[i][u * out_stride + v], a); + } + } + } + } + } + break; + default: assert(0 && "Unknown padding type"); + } + } else { + // Results in element-wise matrix multiplication. 
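+ // i.e. with a 1x1 filter each output pixel reduces to + // out[i] = bias[i] + sum over k of weights[k * out_channels + i] * in[k], + // the (in_channels x out_channels) weight matrix applied independently at + // every spatial position, which the fast path below exploits.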
+ if (layer_config->filter_height == 1 && layer_config->filter_width == 1) { + const int start_h = get_start_shift_convolve( + in_height, layer_config->filter_height, layer_config->skip_height); + const int start_w = + get_start_shift_convolve(in_width, layer_config->filter_width, + layer_config->skip_width) + + start_idx * layer_config->skip_width; + const int out_w_step = AOMMAX(step, 1); + const int in_w_step = layer_config->skip_width * out_w_step; + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int h = start_h, u = 0; h < in_height; + h += layer_config->skip_height, ++u) { + const int in_h = h * in_stride; + const int out_h = u * out_stride + start_idx; + for (int w = start_w, out_index = out_h; w < in_width; + w += in_w_step, out_index += out_w_step) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + sum += layer_config->weights[k * layer_config->out_channels + i] * + input[k][in_h + w]; + } + output[i][out_index] = sum; + } + } + } + return; + } + const int ii_shift = + filter_height_half - (layer_config->filter_height - 1) % 2; + const int jj_shift = + filter_width_half - (layer_config->filter_width - 1) % 2; + switch (layer_config->pad) { + case PADDING_SAME_ZERO: { + const int start_h = get_start_shift_convolve( + in_height, layer_config->filter_height, layer_config->skip_height); + const int start_w = get_start_shift_convolve( + in_width, layer_config->filter_width, layer_config->skip_width); + const int end_ii_shift = filter_height_half + 1; + const int end_jj_shift = filter_width_half + 1; + // *_filter_margin stores the number of pixels along a dimension in the + // intersection of the complement of the image in the extended image + // and the filter. + const int top_filter_margin = layer_config->filter_width * ii_shift; + const int right_filter_margin = end_jj_shift - in_width; + for (int i = start_idx; i < layer_config->out_channels; + i += channel_step) { + for (int h = start_h, u = 0; h < in_height; + h += layer_config->skip_height, ++u) { + const int out_h = u * out_stride; + const int top_cstep = + AOMMAX(0, top_filter_margin - h * layer_config->filter_width) * + cstep + + i; + const int start_ii = AOMMAX(0, h - ii_shift); + const int end_ii = AOMMIN(in_height, h + end_ii_shift); + for (int w = start_w, out_index = out_h; w < in_width; + w += layer_config->skip_width, ++out_index) { + const int left_cstep = AOMMAX(0, jj_shift - w) * cstep; + const int right_cstep = + AOMMAX(0, right_filter_margin + w) * cstep; + const int start_jj = AOMMAX(0, w - jj_shift); + const int end_jj = AOMMIN(in_width, w + end_jj_shift); + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + top_cstep; + for (int ii = start_ii; ii < end_ii; ++ii) { + off += left_cstep; + for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) { + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + off += right_cstep; + } + } + output[i][out_index] = sum; + } + } + } + break; + } + case PADDING_SAME_REPLICATE: { + // h and w are shifted to an offset coordinate system to reduce in-loop + // computation. 
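+ // (Shifting start_h/start_w by ii_shift/jj_shift up front lets the tap + // loops below run directly over [h, h + filter_height) and + // [w, w + filter_width), with CLAMPINDEX supplying the edge replication, + // rather than re-deriving the filter origin for every output pixel.)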
+ const int start_h = + get_start_shift_convolve(in_height, layer_config->filter_height, + layer_config->skip_height) - + ii_shift; + const int start_w = + get_start_shift_convolve(in_width, layer_config->filter_width, + layer_config->skip_width) - + jj_shift; + const int end_h = in_height - ii_shift; + const int end_w = in_width - jj_shift; + for (int i = start_idx; i < layer_config->out_channels; + i += channel_step) { + for (int h = start_h, u = 0; h < end_h; + h += layer_config->skip_height, ++u) { + const int out_h = u * out_stride; + const int upper_ii_index = layer_config->filter_height + h; + for (int w = start_w, out_index = out_h; w < end_w; + w += layer_config->skip_width, ++out_index) { + const int upper_jj_index = layer_config->filter_width + w; + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int ii = h; ii < upper_ii_index; ++ii) { + const int clamped_ii = CLAMPINDEX(ii, in_height); + for (int jj = w; jj < upper_jj_index; ++jj) { + const int clamped_jj = CLAMPINDEX(jj, in_width); + assert(clamped_ii >= 0 && clamped_ii < in_height && + clamped_jj >= 0 && clamped_jj < in_width); + sum += layer_config->weights[off] * + input[k][clamped_ii * in_stride + clamped_jj]; + off += cstep; + } + } + } + output[i][out_index] = sum; + } + } + } + break; + } + case PADDING_VALID: { + for (int i = start_idx; i < layer_config->out_channels; + i += channel_step) { + for (int h = 0, u = 0; + h < in_height - layer_config->filter_height + 1; + h += layer_config->skip_height, ++u) { + const int out_h = u * out_stride; + const int upper_ii_index = layer_config->filter_height + h; + for (int w = 0, out_index = out_h; + w < in_width - layer_config->filter_width + 1; + w += layer_config->skip_width, ++out_index) { + const int upper_jj_index = layer_config->filter_width + w; + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int ii = h; ii < upper_ii_index; ++ii) { + for (int jj = w; jj < upper_jj_index; ++jj) { + assert(ii >= 0 && ii < in_height && jj >= 0 && + jj < in_width); + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + off += cstep; + } + } + } + output[i][out_index] = sum; + } + } + } + break; + } + default: assert(0 && "Unknown padding type"); + } + } +} + +static INLINE int get_start_shift_deconvolve(int filt_width, int stride) { + const int dif = AOMMAX(filt_width - stride, 0); + return dif / 2; +} + +void av1_cnn_batchnorm_c(float **image, int channels, int width, int height, + int stride, const float *gamma, const float *beta, + const float *mean, const float *std) { + assert(gamma && beta && mean && std && "batchnorm has null parameter!"); + for (int ch = 0; ch < channels; ch++) { + const float ch_gamma = gamma[ch]; + const float ch_beta = beta[ch]; + const float ch_mean = mean[ch]; + const float ch_std = std[ch]; + float *image_row = image[ch]; + + for (int row = 0; row < height; row++) { + for (int col = 0; col < width; col++) { + image_row[col] = + ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta; + } + image_row += stride; + } + } +} + +void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height, + int in_stride, const CNN_LAYER_CONFIG *layer_config, + float **output, int out_stride) { + assert(layer_config->deconvolve); + + const int cstep = layer_config->in_channels * layer_config->out_channels; + + int out_width = 0; + int out_height = 0; +
find_layer_output_size(in_width, in_height, layer_config, &out_width, + &out_height); + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int u = 0; u < out_height; ++u) { + for (int v = 0; v < out_width; ++v) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int h = + u - l + + get_start_shift_deconvolve(layer_config->filter_height, + layer_config->skip_height); + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int w = + v - m + + get_start_shift_deconvolve(layer_config->filter_width, + layer_config->skip_width); + if ((h % layer_config->skip_height) != 0 || + (w % layer_config->skip_width) != 0) + continue; + const int ii = h / layer_config->skip_height; + const int jj = w / layer_config->skip_width; + if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width) + continue; + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + output[i][u * out_stride + v] = sum; + } + } + } + break; + case PADDING_SAME_REPLICATE: + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int u = 0; u < out_height; ++u) { + for (int v = 0; v < out_width; ++v) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int h = + u - l + + get_start_shift_deconvolve(layer_config->filter_height, + layer_config->skip_height); + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int w = + v - m + + get_start_shift_deconvolve(layer_config->filter_width, + layer_config->skip_width); + if ((h % layer_config->skip_height) != 0 || + (w % layer_config->skip_width) != 0) + continue; + const int ii = + CLAMPINDEX(h / layer_config->skip_height, in_height); + const int jj = + CLAMPINDEX(w / layer_config->skip_width, in_width); + assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width); + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + output[i][u * out_stride + v] = sum; + } + } + } + break; + case PADDING_VALID: + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int u = 0; u < out_height; ++u) { + for (int v = 0; v < out_width; ++v) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int h = u - l; + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int w = v - m; + if ((h % layer_config->skip_height) != 0 || + (w % layer_config->skip_width) != 0) + continue; + const int ii = h / layer_config->skip_height; + const int jj = w / layer_config->skip_width; + if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width) + continue; + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + output[i][u * out_stride + v] = sum; + } + } + } + break; + default: assert(0 && "Unknown padding type"); + } +} + +void av1_cnn_predict_c(const float **input, int in_width, int in_height, + int in_stride, const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + CNN_MULTI_OUT *output_struct) { + TENSOR tensor1[CNN_MAX_BRANCHES] = { 0 }; + TENSOR tensor2[CNN_MAX_BRANCHES] = { 0 }; + + float
**output[CNN_MAX_BRANCHES]; + const int *out_chs = output_struct->output_channels; + output[0] = output_struct->output_buffer; + for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) { + output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1]; + } + + int i_width = in_width; + int i_height = in_height; + int o_width = 0, o_height = 0; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + init_tensor(&tensor1[b]); + init_tensor(&tensor2[b]); + } + + const int *out_stride = output_struct->output_strides; + for (int layer = 0; layer < cnn_config->num_layers; ++layer) { + const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer]; + const int branch = layer_config->branch; + const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; + + // Allocate input tensor + if (layer == 0) { // First layer + assert(branch == 0); // First layer must be primary branch + assign_tensor(&tensor1[branch], (float **)input, + layer_config->in_channels, in_width, in_height, in_stride); + } else { // Non-first layer + // Swap tensor1 and tensor2 + swap_tensor(&tensor1[branch], &tensor2[branch]); + + i_width = tensor1[branch].width; + i_height = tensor1[branch].height; + } + + // Allocate output tensor + find_layer_output_size(i_width, i_height, layer_config, &o_width, + &o_height); + const int output_num = layer_config->output_num; + if (output_num == -1) { // Non-output layer + realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width, + o_height); + } else { // Output layer + free_tensor(&tensor2[branch]); + assign_tensor(&tensor2[branch], output[output_num], + layer_config->out_channels, o_width, o_height, + out_stride[output_num]); + } + + // If we are combining branches make sure that the branch to combine + // is different from the current branch. 
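+ // e.g. if branch == 1 and branch_combine_type is BRANCH_ADD or BRANCH_CAT, + // bit 1 of branches_to_combine must be clear.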
+ assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC, + !(branch_config->branches_to_combine & (1 << branch)))); + + if (layer_config->branch_copy_type == BRANCH_INPUT) { + copy_active_tensor_to_branches(&tensor1[branch], layer_config, branch, + tensor2); + } + // Check consistency of input and output channels + assert(tensor1[branch].channels == layer_config->in_channels); + assert(tensor2[branch].channels == layer_config->out_channels); + + // Convolve/Deconvolve + if (!cnn_config->layer_config[layer].deconvolve) { + if (thread_data->num_workers > 1) { + convolve_layer_mt((const float **)tensor1[branch].buf, + tensor1[branch].width, tensor1[branch].height, + tensor1[branch].stride, layer_config, thread_data, + tensor2[branch].buf, tensor2[branch].stride); + } else { + av1_cnn_convolve((const float **)tensor1[branch].buf, + tensor1[branch].width, tensor1[branch].height, + tensor1[branch].stride, layer_config, + tensor2[branch].buf, tensor2[branch].stride, 0, 1); + } + } else { + av1_cnn_deconvolve((const float **)tensor1[branch].buf, + tensor1[branch].width, tensor1[branch].height, + tensor1[branch].stride, layer_config, + tensor2[branch].buf, tensor2[branch].stride); + } + + if (layer_config->branch_copy_type == BRANCH_OUTPUT) { + copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch, + tensor2); + } + + // Add tensors from other branches if needed + if (layer_config->branch_combine_type == BRANCH_ADD) { + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { + assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch])); + av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels, + tensor2[branch].width, tensor2[branch].height, + tensor2[branch].stride, (const float **)tensor2[b].buf); + } + } + } + + // Non-linearity + if (layer_config->activation != IDENTITY) + av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels, + tensor2[branch].width, tensor2[branch].height, + tensor2[branch].stride, layer_config->activation); + + if (layer_config->bn_params.bn_gamma) { + av1_cnn_batchnorm( + tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width, + tensor2[branch].height, tensor2[branch].stride, + layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta, + layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std); + } + + // Concatenate tensors + if (layer_config->branch_combine_type == BRANCH_CAT) { + if (output_num == -1) { // Non-output layer + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { + assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch])); + assert(tensor2[b].channels > 0); + concat_tensor(&tensor2[b], &tensor2[branch]); + } + } + } else { // Output layer + const int existing_channels = tensor2[branch].channels; + int num_chs = existing_channels; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { + assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch])); + // Needed only to assign the new channel buffers + num_chs += tensor2[b].channels; + } + } + assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width, + o_height, out_stride[output_num]); + + num_chs = existing_channels; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { + assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch])); + // Needed only to assign the new 
channel buffers + copy_tensor(&tensor2[b], tensor2[b].channels, num_chs, + &tensor2[branch]); + num_chs += tensor2[b].channels; + } + } + } + } + + if (layer_config->branch_copy_type == BRANCH_COMBINED) { + copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch, + tensor2); + } + } + + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + free_tensor(&tensor1[b]); + free_tensor(&tensor2[b]); + } +} + +// Assume output already has proper allocation +// Assume input image buffers all have same resolution and strides +void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height, + int stride, const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + CNN_MULTI_OUT *output) { + const float max_val = 255.0; + + const int in_width = width + 2 * cnn_config->ext_width; + const int in_height = height + 2 * cnn_config->ext_height; + const int in_channels = cnn_config->layer_config[0].in_channels; + float *inputs[CNN_MAX_CHANNELS]; + float *input_ = + (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_)); + const int in_stride = in_width; + + for (int c = 0; c < in_channels; ++c) { + inputs[c] = input_ + c * in_stride * in_height; + float *input = + inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width; + + if (cnn_config->strict_bounds) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; + // extend left and right + for (int i = 0; i < height; ++i) { + for (int j = -cnn_config->ext_width; j < 0; ++j) + input[i * in_stride + j] = input[i * in_stride]; + for (int j = width; j < width + cnn_config->ext_width; ++j) + input[i * in_stride + j] = input[i * in_stride + width - 1]; + } + // extend top and bottom + for (int i = -cnn_config->ext_height; i < 0; ++i) + memcpy(&input[i * in_stride - cnn_config->ext_width], + &input[-cnn_config->ext_width], in_width * sizeof(*input)); + for (int i = height; i < height + cnn_config->ext_height; ++i) + memcpy(&input[i * in_stride - cnn_config->ext_width], + &input[(height - 1) * in_stride - cnn_config->ext_width], + in_width * sizeof(*input)); + } else { + for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height; + ++i) + for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width; + ++j) + input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; + } + } + av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride, + cnn_config, thread_data, output); + + aom_free(input_); +} + +// Assume output already has proper allocation +// Assume input image buffers all have same resolution and strides +void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height, + int stride, + const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + int bit_depth, + CNN_MULTI_OUT *output) { + const float max_val = (float)((1 << bit_depth) - 1); + + const int in_width = width + 2 * cnn_config->ext_width; + const int in_height = height + 2 * cnn_config->ext_height; + const int in_channels = cnn_config->layer_config[0].in_channels; + float *inputs[CNN_MAX_CHANNELS]; + float *input_ = + (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_)); + const int in_stride = in_width; + + for (int c = 0; c < in_channels; ++c) { + inputs[c] = input_ + c * in_stride * in_height; + float *input = + inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width; + + if (cnn_config->strict_bounds) { + for (int i = 0; i < height; ++i) + for (int j = 
0; j < width; ++j) + input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; + // extend left and right + for (int i = 0; i < height; ++i) { + for (int j = -cnn_config->ext_width; j < 0; ++j) + input[i * in_stride + j] = input[i * in_stride]; + for (int j = width; j < width + cnn_config->ext_width; ++j) + input[i * in_stride + j] = input[i * in_stride + width - 1]; + } + // extend top and bottom + for (int i = -cnn_config->ext_height; i < 0; ++i) + memcpy(&input[i * in_stride - cnn_config->ext_width], + &input[-cnn_config->ext_width], in_width * sizeof(*input)); + for (int i = height; i < height + cnn_config->ext_height; ++i) + memcpy(&input[i * in_stride - cnn_config->ext_width], + &input[(height - 1) * in_stride - cnn_config->ext_width], + in_width * sizeof(*input)); + } else { + for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height; + ++i) + for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width; + ++j) + input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; + } + } + + av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride, + cnn_config, thread_data, output); + + aom_free(input_); +} + +// Assume output already has proper allocation +// Assume input image buffers all have same resolution and strides +void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride, + const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, float **output, + int out_stride) { + int out_width = 0, out_height = 0, out_channels = 0; + av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height, + &out_channels); + const int output_chs[1] = { out_channels }; + const int output_strides[1] = { out_stride }; + CNN_MULTI_OUT output_struct = { .output_channels = output_chs, + .output_strides = output_strides, + .output_buffer = output }; + av1_cnn_predict_img_multi_out(dgd, width, height, stride, cnn_config, + thread_data, &output_struct); +} + +// Assume output already has proper allocation +// Assume input image buffers all have same resolution and strides +void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height, + int stride, const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + int bit_depth, float **output, int out_stride) { + int out_width = 0, out_height = 0, out_channels = 0; + av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height, + &out_channels); + const int output_chs[1] = { out_channels }; + const int output_strides[1] = { out_stride }; + CNN_MULTI_OUT output_struct = { .output_channels = output_chs, + .output_strides = output_strides, + .output_buffer = output }; + av1_cnn_predict_img_multi_out_highbd(dgd, width, height, stride, cnn_config, + thread_data, bit_depth, &output_struct); +} diff --git a/libs/libaom/src/av1/encoder/cnn.h b/libs/libaom/src/av1/encoder/cnn.h new file mode 100644 index 000000000..706be4447 --- /dev/null +++ b/libs/libaom/src/av1/encoder/cnn.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_COMMON_CNN_H_
+#define AOM_AV1_COMMON_CNN_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <math.h>
+
+#include "aom_util/aom_thread.h"
+#include "config/av1_rtcd.h"
+
+struct AV1Common;
+
+#define CNN_MAX_HIDDEN_LAYERS 64
+#define CNN_MAX_LAYERS (CNN_MAX_HIDDEN_LAYERS + 1)
+#define CNN_MAX_CHANNELS 256
+#define CNN_MAX_BRANCHES 4
+#define CNN_MAX_THREADS 32
+
+#define NO_BRANCH_CONFIG \
+  { 0, 0, 0 }
+#define NO_BN_PARAMS \
+  { NULL, NULL, NULL, NULL }
+
+enum {
+  PADDING_SAME_ZERO,       // tensorflow's SAME padding with pixels outside
+                           // the image area assumed to be 0 (default)
+  PADDING_SAME_REPLICATE,  // tensorflow's SAME padding with pixels outside
+                           // the image area replicated from the closest edge
+  PADDING_VALID            // tensorflow's VALID padding
+} UENUM1BYTE(PADDING_TYPE);
+
+// enum { NONE, RELU, SOFTSIGN } UENUM1BYTE(ACTIVATION);
+
+// Times when the input tensor may be copied to the branches given in
+// input_to_branches.
+// BRANCH_NO_COPY: doesn't copy any tensor.
+// BRANCH_INPUT: copies the input tensor to branches.
+// BRANCH_OUTPUT: copies the convolved tensor to branches.
+// BRANCH_COMBINED: copies the combined (after convolving and branch combining)
+//   tensor. If no combinations happen at this layer, then this option
+//   has the same effect as BRANCH_OUTPUT.
+enum {
+  BRANCH_NO_COPY,
+  BRANCH_INPUT,
+  BRANCH_OUTPUT,
+  BRANCH_COMBINED
+} UENUM1BYTE(BRANCH_COPY);
+
+// Types of combining branches with the output of the current layer:
+// BRANCH_NOC: no branch combining
+// BRANCH_ADD: Add the previously stored branch tensor to the output of the
+//   layer
+// BRANCH_CAT: Concatenate the branch tensor to the output of the layer
+enum { BRANCH_NOC, BRANCH_ADD, BRANCH_CAT } UENUM1BYTE(BRANCH_COMBINE);
+
+// The parameters used to scale each channel in batch
+// normalization. The processing is done on a per-channel basis.
+// e.g. bn_mean[c] is the mean for all pixels in channel c. This
+// is always applied after activation. The output is given by
+// out[c,i,j] = norm[c,i,j] * bn_gamma[c] + bn_beta[c] where
+// norm[c,i,j] = (in[c,i,j] - bn_mean[c]) / bn_std[c]
+// here we assume that the effect of variance_epsilon is already
+// taken into account when bn_std is calculated. The pointers
+// need to be either all zero or all valid. If all zero, then
+// batchnorm is disabled, else batchnorm is applied.
+struct CNN_BATCHNORM_PARAMS {
+  const float *bn_gamma;
+  const float *bn_beta;
+  const float *bn_mean;
+  const float *bn_std;
+};
+
+struct CNN_BRANCH_CONFIG {
+  int input_to_branches;  // If nonzero, copy the active tensor to the current
+                          // layer and store for future use in branches
+                          // specified in the field as a binary mask. For
+                          // example, if input_to_branches = 0x06, it means the
+                          // input tensor to the current branch is copied to
+                          // branches 1 and 2 (where 0 represents the primary
+                          // branch). One restriction is that the mask
+                          // cannot indicate copying to the current branch.
+  int channels_to_copy;   // Number of channels to copy to the branches given
+                          // in input_to_branches. If greater than 0, only
+                          // copies the channels up to the given index;
+                          // otherwise all channels of the active tensor are
+                          // copied.
+  int branches_to_combine;  // mask of branches to combine with the output of
+                            // the current layer, if
+                            // branch_combine_type != BRANCH_NOC
+                            // For example, if branches_to_combine = 0x0A,
+                            // it means that branches 1 and 3 are combined
+                            // with the current branch.
+};
+
+struct CNN_LAYER_CONFIG {
+  int in_channels;
+  int filter_width;
+  int filter_height;
+  int out_channels;
+  int skip_width;
+  int skip_height;
+  int maxpool;            // whether to use maxpool or not (only effective when
+                          // skip_width or skip_height is > 1)
+  const float *weights;   // array of length filter_height x filter_width x
+                          // in_channels x out_channels where the innermost
+                          // scan is out_channels and the outermost scan is
+                          // filter_height.
+  const float *bias;      // array of length out_channels
+  PADDING_TYPE pad;       // padding type
+  ACTIVATION activation;  // the activation function to use after convolution
+  int deconvolve;         // whether this is a deconvolution layer.
+                          // 0: If skip_width or skip_height is > 1, then we
+                          // reduce resolution
+                          // 1: If skip_width or skip_height is > 1, then we
+                          // increase resolution
+  int branch;             // branch index in [0, CNN_MAX_BRANCHES - 1], where
+                          // 0 refers to the primary branch.
+  BRANCH_COPY branch_copy_type;
+  BRANCH_COMBINE branch_combine_type;
+  struct CNN_BRANCH_CONFIG branch_config;
+  struct CNN_BATCHNORM_PARAMS
+      bn_params;   // A struct that contains the parameters
+                   // used for batch normalization.
+  int output_num;  // The output buffer idx to which the layer output is
+                   // written. Set to -1 to disable writing it to the output. In
+                   // the case that branch_combine_type is BRANCH_CAT, all
+                   // concatenated channels will be written to output. In the
+                   // case of BRANCH_ADD, the output will be the result of
+                   // summation.
+};
+
+struct CNN_CONFIG {
+  int num_layers;  // number of CNN layers ( = number of hidden layers + 1)
+  int is_residue;  // whether the output activation is a residue
+  int ext_width, ext_height;  // extension horizontally and vertically
+  int strict_bounds;          // whether the input bounds are strict or not.
+                              // If strict, the extension area is filled by
+                              // replication; if not strict, image data is
+                              // assumed available beyond the bounds.
+  CNN_LAYER_CONFIG layer_config[CNN_MAX_LAYERS];
+};
+
+struct CNN_THREAD_DATA {
+  int num_workers;
+  AVxWorker *workers;
+};
+
+struct CNN_MULTI_OUT {
+  int num_outputs;
+  const int *output_channels;
+  const int *output_strides;
+  float **output_buffer;
+};
+
+// Function to return the size of the output
+void av1_find_cnn_output_size(int in_width, int in_height,
+                              const CNN_CONFIG *cnn_config, int *out_width,
+                              int *out_height, int *out_channels);
+
+// Prediction functions from a set of input image buffers. These functions
+// support CNNs with multiple outputs.
+void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
+                                   int stride, const CNN_CONFIG *cnn_config,
+                                   const CNN_THREAD_DATA *thread_data,
+                                   struct CNN_MULTI_OUT *output);
+void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
+                                          int stride,
+                                          const CNN_CONFIG *cnn_config,
+                                          const CNN_THREAD_DATA *thread_data,
+                                          int bit_depth, CNN_MULTI_OUT *output);
+
+// Prediction functions from a set of input image buffers. These functions
+// only support a single output.
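+//
+// Editor's note: a minimal, hypothetical usage sketch for the single-output
+// entry point below (not part of the library; buffer names and the
+// single-threaded CNN_THREAD_DATA value are assumptions):
+//
+//   int ow = 0, oh = 0, oc = 0;
+//   av1_find_cnn_output_size(width, height, cnn_config, &ow, &oh, &oc);
+//   float *buf = (float *)aom_malloc(sizeof(*buf) * ow * oh * oc);
+//   float *out_planes[CNN_MAX_CHANNELS];  // one plane pointer per channel
+//   for (int c = 0; c < oc; ++c) out_planes[c] = buf + c * ow * oh;
+//   const CNN_THREAD_DATA td = { 1, NULL };  // no worker pool
+//   av1_cnn_predict_img(dgd, width, height, stride, cnn_config, &td,
+//                       out_planes, /*out_stride=*/ow);
+//   aom_free(buf);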
+void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
+                         const CNN_CONFIG *cnn_config,
+                         const CNN_THREAD_DATA *thread_data, float **output,
+                         int out_stride);
+void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
+                                int stride, const CNN_CONFIG *cnn_config,
+                                const CNN_THREAD_DATA *thread_data,
+                                int bit_depth, float **output, int out_stride);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_CNN_H_
diff --git a/libs/libaom/src/av1/encoder/compound_type.c b/libs/libaom/src/av1/encoder/compound_type.c
new file mode 100644
index 000000000..42095b79e
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/compound_type.c
@@ -0,0 +1,1508 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/pred_common.h"
+#include "av1/encoder/compound_type.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tx_search.h"
+
+typedef int64_t (*pick_interinter_mask_type)(
+    const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
+    const uint8_t *const p0, const uint8_t *const p1,
+    const int16_t *const residual1, const int16_t *const diff10,
+    uint64_t *best_sse);
+
+// Checks if the characteristics of the search match
+static INLINE int is_comp_rd_match(const AV1_COMP *const cpi,
+                                   const MACROBLOCK *const x,
+                                   const COMP_RD_STATS *st,
+                                   const MB_MODE_INFO *const mi,
+                                   int32_t *comp_rate, int64_t *comp_dist,
+                                   int32_t *comp_model_rate,
+                                   int64_t *comp_model_dist, int *comp_rs2) {
+  // TODO(ranjit): Ensure that compound type search always uses the regular
+  // filter and check if the following check can be removed
+  // Check if the interp filter matches with the previous case
+  if (st->filter.as_int != mi->interp_filters.as_int) return 0;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  // Match MV and reference indices
+  for (int i = 0; i < 2; ++i) {
+    if ((st->ref_frames[i] != mi->ref_frame[i]) ||
+        (st->mv[i].as_int != mi->mv[i].as_int)) {
+      return 0;
+    }
+    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[i]];
+    if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0;
+  }
+
+  // Store the stats for COMPOUND_AVERAGE and COMPOUND_DISTWTD
+  for (int comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD;
+       comp_type++) {
+    comp_rate[comp_type] = st->rate[comp_type];
+    comp_dist[comp_type] = st->dist[comp_type];
+    comp_model_rate[comp_type] = st->model_rate[comp_type];
+    comp_model_dist[comp_type] = st->model_dist[comp_type];
+    comp_rs2[comp_type] = st->comp_rs2[comp_type];
+  }
+
+  // For compound wedge/segment, reuse data only if NEWMV is not present in
+  // either of the directions
+  if ((!have_newmv_in_inter_mode(mi->mode) &&
+       !have_newmv_in_inter_mode(st->mode)) ||
+      (cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)) {
+    memcpy(&comp_rate[COMPOUND_WEDGE], &st->rate[COMPOUND_WEDGE],
+           sizeof(comp_rate[COMPOUND_WEDGE]) * 2);
+    memcpy(&comp_dist[COMPOUND_WEDGE], &st->dist[COMPOUND_WEDGE],
+           sizeof(comp_dist[COMPOUND_WEDGE]) * 2);
+    memcpy(&comp_model_rate[COMPOUND_WEDGE], &st->model_rate[COMPOUND_WEDGE],
+           sizeof(comp_model_rate[COMPOUND_WEDGE]) * 2);
+    memcpy(&comp_model_dist[COMPOUND_WEDGE], &st->model_dist[COMPOUND_WEDGE],
+           sizeof(comp_model_dist[COMPOUND_WEDGE]) * 2);
+    memcpy(&comp_rs2[COMPOUND_WEDGE], &st->comp_rs2[COMPOUND_WEDGE],
+           sizeof(comp_rs2[COMPOUND_WEDGE]) * 2);
+  }
+  return 1;
+}
+
+// Checks if a similar compound type search case was accounted for earlier
+// If found, returns the relevant rd data
+static INLINE int find_comp_rd_in_stats(const AV1_COMP *const cpi,
+                                        const MACROBLOCK *x,
+                                        const MB_MODE_INFO *const mbmi,
+                                        int32_t *comp_rate, int64_t *comp_dist,
+                                        int32_t *comp_model_rate,
+                                        int64_t *comp_model_dist, int *comp_rs2,
+                                        int *match_index) {
+  for (int j = 0; j < x->comp_rd_stats_idx; ++j) {
+    if (is_comp_rd_match(cpi, x, &x->comp_rd_stats[j], mbmi, comp_rate,
+                         comp_dist, comp_model_rate, comp_model_dist,
+                         comp_rs2)) {
+      *match_index = j;
+      return 1;
+    }
+  }
+  return 0;  // no match result found
+}
+
+static INLINE bool enable_wedge_search(MACROBLOCK *const x,
+                                       const AV1_COMP *const cpi) {
+  // Enable wedge search if source variance and edge strength are above
+  // the thresholds.
+  return x->source_variance >
+             cpi->sf.inter_sf.disable_wedge_search_var_thresh &&
+         x->edge_strength > cpi->sf.inter_sf.disable_wedge_search_edge_thresh;
+}
+
+static INLINE bool enable_wedge_interinter_search(MACROBLOCK *const x,
+                                                  const AV1_COMP *const cpi) {
+  return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interinter_wedge &&
+         !cpi->sf.inter_sf.disable_interinter_wedge;
+}
+
+static INLINE bool enable_wedge_interintra_search(MACROBLOCK *const x,
+                                                  const AV1_COMP *const cpi) {
+  return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interintra_wedge &&
+         !cpi->sf.inter_sf.disable_wedge_interintra_search;
+}
+
+static int8_t estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
+                                  const BLOCK_SIZE bsize, const uint8_t *pred0,
+                                  int stride0, const uint8_t *pred1,
+                                  int stride1) {
+  static const BLOCK_SIZE split_qtr[BLOCK_SIZES_ALL] = {
+    // 4X4
+    BLOCK_INVALID,
+    // 4X8, 8X4, 8X8
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+    // 8X16, 16X8, 16X16
+    BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
+    // 16X32, 32X16, 32X32
+    BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
+    // 32X64, 64X32, 64X64
+    BLOCK_16X32, BLOCK_32X16, BLOCK_32X32,
+    // 64x128, 128x64, 128x128
+    BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
+    // 4X16, 16X4, 8X32
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+    // 32X8, 16X64, 64X16
+    BLOCK_16X4, BLOCK_8X32, BLOCK_32X8
+  };
+  const struct macroblock_plane *const p = &x->plane[0];
+  const uint8_t *src = p->src.buf;
+  int src_stride = p->src.stride;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int bw_by2 = bw >> 1;
+  const int bh_by2 = bh >> 1;
+  uint32_t esq[2][2];
+  int64_t tl, br;
+
+  const BLOCK_SIZE f_index = split_qtr[bsize];
+  assert(f_index != BLOCK_INVALID);
+
+  if (is_cur_buf_hbd(&x->e_mbd)) {
+    pred0 = CONVERT_TO_BYTEPTR(pred0);
+    pred1 = CONVERT_TO_BYTEPTR(pred1);
+  }
+
+  // Residual variance computation over relevant quadrants in order to
+  // find TL + BR, TL = sum(1st,2nd,3rd) quadrants of (pred0 - pred1),
+  // BR = sum(2nd,3rd,4th) quadrants of (pred1 - pred0)
+  // The 2nd and 3rd quadrants cancel out in TL + BR
+  // Hence TL + BR = 1st quadrant of (pred0-pred1) + 4th of (pred1-pred0)
+  // TODO(nithya): Sign estimation assumes 45 degrees (1st and 4th quadrants)
+  // for all codebooks; experiment with other quadrant combinations for
+  // 0, 90 and 135 degrees also.
+  cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
+  cpi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+                          pred0 + bh_by2 * stride0 + bw_by2, stride0,
+                          &esq[0][1]);
+  cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
+  cpi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+                          pred1 + bh_by2 * stride1 + bw_by2, stride1,
+                          &esq[1][1]);
+
+  tl = ((int64_t)esq[0][0]) - ((int64_t)esq[1][0]);
+  br = ((int64_t)esq[1][1]) - ((int64_t)esq[0][1]);
+  return (tl + br > 0);
+}
+
+// Choose the best wedge index and sign
+static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
+                          const BLOCK_SIZE bsize, const uint8_t *const p0,
+                          const int16_t *const residual1,
+                          const int16_t *const diff10,
+                          int8_t *const best_wedge_sign,
+                          int8_t *const best_wedge_index, uint64_t *best_sse) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const src = &x->plane[0].src;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int N = bw * bh;
+  assert(N >= 64);
+  int rate;
+  int64_t dist;
+  int64_t rd, best_rd = INT64_MAX;
+  int8_t wedge_index;
+  int8_t wedge_sign;
+  const int8_t wedge_types = get_wedge_types_lookup(bsize);
+  const uint8_t *mask;
+  uint64_t sse;
+  const int hbd = is_cur_buf_hbd(xd);
+  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+
+  DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]);  // src - pred0
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (hbd) {
+    aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride,
+                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+  } else {
+    aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
+  }
+#else
+  (void)hbd;
+  aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
+#endif
+
+  int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) -
+                        (int64_t)aom_sum_squares_i16(residual1, N)) *
+                       (1 << WEDGE_WEIGHT_BITS) / 2;
+  int16_t *ds = residual0;
+
+  av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
+
+  for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+    mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
+
+    wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
+
+    mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+    sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+    sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+    model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+                                                  &rate, &dist);
+    // int rate2;
+    // int64_t dist2;
+    // model_rd_with_curvfit(cpi, x, bsize, 0, sse, N, &rate2, &dist2);
+    // printf("sse %"PRId64": legacy: %d %"PRId64", curvfit %d %"PRId64"\n",
+    //        sse, rate, dist, rate2, dist2); dist = dist2;
+    // rate = rate2;
+
+    rate += x->wedge_idx_cost[bsize][wedge_index];
+    rd = RDCOST(x->rdmult, rate, dist);
+
+    if (rd < best_rd) {
+      *best_wedge_index = wedge_index;
+      *best_wedge_sign = wedge_sign;
+      best_rd = rd;
+      *best_sse = sse;
+    }
+  }
+
+  return best_rd -
+         RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
+}
+
+// Choose the best wedge index for the specified sign
+static int64_t pick_wedge_fixed_sign(
+    const AV1_COMP *const cpi, const MACROBLOCK *const x,
+    const BLOCK_SIZE bsize, const int16_t *const residual1,
+    const int16_t *const diff10, const int8_t wedge_sign,
+    int8_t *const best_wedge_index, uint64_t *best_sse) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+
+  const int bw =
block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int N = bw * bh; + assert(N >= 64); + int rate; + int64_t dist; + int64_t rd, best_rd = INT64_MAX; + int8_t wedge_index; + const int8_t wedge_types = get_wedge_types_lookup(bsize); + const uint8_t *mask; + uint64_t sse; + const int hbd = is_cur_buf_hbd(xd); + const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; + for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { + mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); + sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); + sse = ROUND_POWER_OF_TWO(sse, bd_round); + + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); + rate += x->wedge_idx_cost[bsize][wedge_index]; + rd = RDCOST(x->rdmult, rate, dist); + + if (rd < best_rd) { + *best_wedge_index = wedge_index; + best_rd = rd; + *best_sse = sse; + } + } + return best_rd - + RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0); +} + +static int64_t pick_interinter_wedge( + const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, + const uint8_t *const p0, const uint8_t *const p1, + const int16_t *const residual1, const int16_t *const diff10, + uint64_t *best_sse) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int bw = block_size_wide[bsize]; + + int64_t rd; + int8_t wedge_index = -1; + int8_t wedge_sign = 0; + + assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); + assert(cpi->common.seq_params.enable_masked_compound); + + if (cpi->sf.inter_sf.fast_wedge_sign_estimate) { + wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw); + rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, wedge_sign, + &wedge_index, best_sse); + } else { + rd = pick_wedge(cpi, x, bsize, p0, residual1, diff10, &wedge_sign, + &wedge_index, best_sse); + } + + mbmi->interinter_comp.wedge_sign = wedge_sign; + mbmi->interinter_comp.wedge_index = wedge_index; + return rd; +} + +static int64_t pick_interinter_seg(const AV1_COMP *const cpi, + MACROBLOCK *const x, const BLOCK_SIZE bsize, + const uint8_t *const p0, + const uint8_t *const p1, + const int16_t *const residual1, + const int16_t *const diff10, + uint64_t *best_sse) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int N = 1 << num_pels_log2_lookup[bsize]; + int rate; + int64_t dist; + DIFFWTD_MASK_TYPE cur_mask_type; + int64_t best_rd = INT64_MAX; + DIFFWTD_MASK_TYPE best_mask_type = 0; + const int hbd = is_cur_buf_hbd(xd); + const int bd_round = hbd ? 
(xd->bd - 8) * 2 : 0; + DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]); + uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask }; + // try each mask type and its inverse + for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) { + // build mask and inverse + if (hbd) + av1_build_compound_diffwtd_mask_highbd( + tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw, + CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd); + else + av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type, + p0, bw, p1, bw, bh, bw); + + // compute rd for mask + uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10, + tmp_mask[cur_mask_type], N); + sse = ROUND_POWER_OF_TWO(sse, bd_round); + + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); + const int64_t rd0 = RDCOST(x->rdmult, rate, dist); + + if (rd0 < best_rd) { + best_mask_type = cur_mask_type; + best_rd = rd0; + *best_sse = sse; + } + } + mbmi->interinter_comp.mask_type = best_mask_type; + if (best_mask_type == DIFFWTD_38_INV) { + memcpy(xd->seg_mask, seg_mask, N * 2); + } + return best_rd; +} + +static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + const BLOCK_SIZE bsize, + const uint8_t *const p0, + const uint8_t *const p1) { + const MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(av1_is_wedge_used(bsize)); + assert(cpi->common.seq_params.enable_interintra_compound); + + const struct buf_2d *const src = &x->plane[0].src; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1 + DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0 +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(p1), bw, xd->bd); + aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw, + CONVERT_TO_BYTEPTR(p0), bw, xd->bd); + } else { + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw); + aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw); + } +#else + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw); + aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw); +#endif + int8_t wedge_index = -1; + uint64_t sse; + int64_t rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, 0, + &wedge_index, &sse); + + mbmi->interintra_wedge_index = wedge_index; + return rd; +} + +static AOM_INLINE void get_inter_predictors_masked_compound( + MACROBLOCK *x, const BLOCK_SIZE bsize, uint8_t **preds0, uint8_t **preds1, + int16_t *residual1, int16_t *diff10, int *strides) { + MACROBLOCKD *xd = &x->e_mbd; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + // get inter predictors to use for masked compound modes + av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0, preds0, + strides); + av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1, preds1, + strides); + const struct buf_2d *const src = &x->plane[0].src; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(*preds1), bw, xd->bd); + aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1), + bw, CONVERT_TO_BYTEPTR(*preds0), bw, xd->bd); + } else { + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, 
+ bw); + aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw); + } +#else + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, bw); + aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw); +#endif +} + +// Computes the rd cost for the given interintra mode and updates the best +static INLINE void compute_best_interintra_mode( + const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd, + MACROBLOCK *const x, const int *const interintra_mode_cost, + const BUFFER_SET *orig_dst, uint8_t *intrapred, const uint8_t *tmp_buf, + INTERINTRA_MODE *best_interintra_mode, int64_t *best_interintra_rd, + INTERINTRA_MODE interintra_mode, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + int rate, skip_txfm_sb; + int64_t dist, skip_sse_sb; + const int bw = block_size_wide[bsize]; + mbmi->interintra_mode = interintra_mode; + int rmode = interintra_mode_cost[interintra_mode]; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](cpi, bsize, x, xd, 0, 0, &rate, &dist, + &skip_txfm_sb, &skip_sse_sb, NULL, + NULL, NULL); + int64_t rd = RDCOST(x->rdmult, rate + rmode, dist); + if (rd < *best_interintra_rd) { + *best_interintra_rd = rd; + *best_interintra_mode = mbmi->interintra_mode; + } +} + +static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs, + MACROBLOCK *x, int64_t ref_best_rd, + RD_STATS *rd_stats) { + MACROBLOCKD *const xd = &x->e_mbd; + if (ref_best_rd < 0) return INT64_MAX; + av1_subtract_plane(x, bs, 0); + x->rd_model = LOW_TXFM_RD; + const int skip_trellis = (cpi->optimize_seg_arr[xd->mi[0]->segment_id] == + NO_ESTIMATE_YRD_TRELLIS_OPT); + const int64_t rd = + av1_uniform_txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, + max_txsize_rect_lookup[bs], FTXS_NONE, skip_trellis); + x->rd_model = FULL_TXFM_RD; + if (rd != INT64_MAX) { + const int skip_ctx = av1_get_skip_context(xd); + if (rd_stats->skip) { + const int s1 = x->skip_cost[skip_ctx][1]; + rd_stats->rate = s1; + } else { + const int s0 = x->skip_cost[skip_ctx][0]; + rd_stats->rate += s0; + } + } + return rd; +} + +// Computes the rd_threshold for smooth interintra rd search. 
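+// The budget is the best rd seen so far, loosened by the
+// INTER_INTRA_RD_THRESH_SHIFT / INTER_INTRA_RD_THRESH_SCALE pair via
+// get_rd_thresh_from_best_rd(), minus the rate-only cost of signaling the
+// mode, RDCOST(x->rdmult, total_mode_rate, 0), so that what remains is the
+// budget available to the transform search alone.
+//
+// Editor's sketch of the underlying Lagrangian arithmetic (an assumption
+// about intent, not the exact macro internals): with rd(rate, dist) roughly
+// dist + lambda * rate, a total budget T leaves T - lambda * total_mode_rate
+// for residual coding.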
+static AOM_INLINE int64_t compute_rd_thresh(MACROBLOCK *const x, + int total_mode_rate, + int64_t ref_best_rd) { + const int64_t rd_thresh = get_rd_thresh_from_best_rd( + ref_best_rd, (1 << INTER_INTRA_RD_THRESH_SHIFT), + INTER_INTRA_RD_THRESH_SCALE); + const int64_t mode_rd = RDCOST(x->rdmult, total_mode_rate, 0); + return (rd_thresh - mode_rd); +} + +// Computes the best wedge interintra mode +static AOM_INLINE int64_t compute_best_wedge_interintra( + const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd, + MACROBLOCK *const x, const int *const interintra_mode_cost, + const BUFFER_SET *orig_dst, uint8_t *intrapred_, uint8_t *tmp_buf_, + int *best_mode, int *best_wedge_index, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const int bw = block_size_wide[bsize]; + int64_t best_interintra_rd_wedge = INT64_MAX; + int64_t best_total_rd = INT64_MAX; + uint8_t *intrapred = get_buf_by_bd(xd, intrapred_); + for (INTERINTRA_MODE mode = 0; mode < INTERINTRA_MODES; ++mode) { + mbmi->interintra_mode = mode; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + int64_t rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + const int rate_overhead = + interintra_mode_cost[mode] + + x->wedge_idx_cost[bsize][mbmi->interintra_wedge_index]; + const int64_t total_rd = rd + RDCOST(x->rdmult, rate_overhead, 0); + if (total_rd < best_total_rd) { + best_total_rd = total_rd; + best_interintra_rd_wedge = rd; + *best_mode = mbmi->interintra_mode; + *best_wedge_index = mbmi->interintra_wedge_index; + } + } + return best_interintra_rd_wedge; +} + +int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, + HandleInterModeArgs *args, int64_t ref_best_rd, + int *rate_mv, int *tmp_rate2, + const BUFFER_SET *orig_dst) { + const int try_smooth_interintra = cpi->oxcf.enable_smooth_interintra && + !cpi->sf.inter_sf.disable_smooth_interintra; + const int is_wedge_used = av1_is_wedge_used(bsize); + const int try_wedge_interintra = + is_wedge_used && enable_wedge_interintra_search(x, cpi); + if (!try_smooth_interintra && !try_wedge_interintra) return -1; + + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + int64_t rd = INT64_MAX; + const int bw = block_size_wide[bsize]; + DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]); + uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_); + uint8_t *intrapred = get_buf_by_bd(xd, intrapred_); + const int *const interintra_mode_cost = + x->interintra_mode_cost[size_group_lookup[bsize]]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // Single reference inter prediction + mbmi->ref_frame[1] = NONE_FRAME; + xd->plane[0].dst.buf = tmp_buf; + xd->plane[0].dst.stride = bw; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + const int num_planes = av1_num_planes(cm); + + // Restore the buffers for intra prediction + restore_dst_buf(xd, *orig_dst, num_planes); + mbmi->ref_frame[1] = INTRA_FRAME; + INTERINTRA_MODE best_interintra_mode = + args->inter_intra_mode[mbmi->ref_frame[0]]; + + // Compute smooth_interintra + int64_t best_interintra_rd_nowedge = INT64_MAX; + int best_mode_rate = INT_MAX; + if (try_smooth_interintra) { + mbmi->use_wedge_interintra = 0; + int interintra_mode_reuse = 1; + if (cpi->sf.inter_sf.reuse_inter_intra_mode == 0 || + best_interintra_mode == 
INTERINTRA_MODES) { + interintra_mode_reuse = 0; + int64_t best_interintra_rd = INT64_MAX; + for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES; + ++cur_mode) { + if ((!cpi->oxcf.enable_smooth_intra || + cpi->sf.intra_sf.disable_smooth_intra) && + cur_mode == II_SMOOTH_PRED) + continue; + compute_best_interintra_mode(cpi, mbmi, xd, x, interintra_mode_cost, + orig_dst, intrapred, tmp_buf, + &best_interintra_mode, &best_interintra_rd, + cur_mode, bsize); + } + args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode; + } + assert(IMPLIES(!cpi->oxcf.enable_smooth_interintra || + cpi->sf.inter_sf.disable_smooth_interintra, + best_interintra_mode != II_SMOOTH_PRED)); + // Recompute prediction if required + if (interintra_mode_reuse || best_interintra_mode != INTERINTRA_MODES - 1) { + mbmi->interintra_mode = best_interintra_mode; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + + // Compute rd cost for best smooth_interintra + RD_STATS rd_stats; + const int rmode = interintra_mode_cost[best_interintra_mode] + + (is_wedge_used ? x->wedge_interintra_cost[bsize][0] : 0); + const int total_mode_rate = rmode + *rate_mv; + const int64_t rd_thresh = + compute_rd_thresh(x, total_mode_rate, ref_best_rd); + rd = estimate_yrd_for_sb(cpi, bsize, x, rd_thresh, &rd_stats); + if (rd != INT64_MAX) { + rd = RDCOST(x->rdmult, total_mode_rate + rd_stats.rate, rd_stats.dist); + } else { + return -1; + } + best_interintra_rd_nowedge = rd; + best_mode_rate = rmode; + // Return early if best_interintra_rd_nowedge not good enough + if (ref_best_rd < INT64_MAX && + (best_interintra_rd_nowedge >> INTER_INTRA_RD_THRESH_SHIFT) * + INTER_INTRA_RD_THRESH_SCALE > + ref_best_rd) { + return -1; + } + } + + // Compute wedge interintra + int64_t best_interintra_rd_wedge = INT64_MAX; + if (try_wedge_interintra) { + mbmi->use_wedge_interintra = 1; + if (!cpi->sf.inter_sf.fast_interintra_wedge_search) { + // Exhaustive search of all wedge and mode combinations. 
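+      // Editor's note: the helper below scans every INTERINTRA_MODE and, for
+      // each mode, every wedge index at the fixed sign used by interintra
+      // (see pick_interintra_wedge), i.e. on the order of INTERINTRA_MODES x
+      // get_wedge_types_lookup(bsize) candidates (16 wedge shapes for the
+      // eligible block sizes), keeping the pair with the lowest rd plus
+      // signaling overhead.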
+ int best_mode = 0; + int best_wedge_index = 0; + best_interintra_rd_wedge = compute_best_wedge_interintra( + cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred_, + tmp_buf_, &best_mode, &best_wedge_index, bsize); + mbmi->interintra_mode = best_mode; + mbmi->interintra_wedge_index = best_wedge_index; + if (best_mode != INTERINTRA_MODES - 1) { + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + } + } else if (!try_smooth_interintra) { + if (best_interintra_mode == INTERINTRA_MODES) { + mbmi->interintra_mode = INTERINTRA_MODES - 1; + best_interintra_mode = INTERINTRA_MODES - 1; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + // Pick wedge mask based on INTERINTRA_MODES - 1 + best_interintra_rd_wedge = + pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + // Find the best interintra mode for the chosen wedge mask + for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES; + ++cur_mode) { + compute_best_interintra_mode( + cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred, + tmp_buf, &best_interintra_mode, &best_interintra_rd_wedge, + cur_mode, bsize); + } + args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode; + mbmi->interintra_mode = best_interintra_mode; + + // Recompute prediction if required + if (best_interintra_mode != INTERINTRA_MODES - 1) { + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + } + } else { + // Pick wedge mask for the best interintra mode (reused) + mbmi->interintra_mode = best_interintra_mode; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + best_interintra_rd_wedge = + pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + } + } else { + // Pick wedge mask for the best interintra mode from smooth_interintra + best_interintra_rd_wedge = + pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + } + + const int rate_overhead = + interintra_mode_cost[mbmi->interintra_mode] + + x->wedge_idx_cost[bsize][mbmi->interintra_wedge_index] + + x->wedge_interintra_cost[bsize][1]; + best_interintra_rd_wedge += RDCOST(x->rdmult, rate_overhead + *rate_mv, 0); + + const int_mv mv0 = mbmi->mv[0]; + int_mv tmp_mv = mv0; + rd = INT64_MAX; + int tmp_rate_mv = 0; + // Refine motion vector for NEWMV case. + if (have_newmv_in_inter_mode(mbmi->mode)) { + int rate_sum, skip_txfm_sb; + int64_t dist_sum, skip_sse_sb; + // get negative of mask + const uint8_t *mask = + av1_get_contiguous_soft_mask(mbmi->interintra_wedge_index, 1, bsize); + av1_compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, intrapred, + mask, bw, &tmp_rate_mv, 0); + if (mbmi->mv[0].as_int != tmp_mv.as_int) { + mbmi->mv[0].as_int = tmp_mv.as_int; + // Set ref_frame[1] to NONE_FRAME temporarily so that the intra + // predictor is not calculated again in av1_enc_build_inter_predictor(). 
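+      // Editor's note: with ref_frame[1] == INTRA_FRAME the builder would
+      // regenerate the intra predictor and blend it in; flipping to
+      // NONE_FRAME yields the pure inter prediction, which is then blended
+      // explicitly with the cached intrapred via av1_combine_interintra().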
+ mbmi->ref_frame[1] = NONE_FRAME; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + mbmi->ref_frame[1] = INTRA_FRAME; + av1_combine_interintra(xd, bsize, 0, xd->plane[AOM_PLANE_Y].dst.buf, + xd->plane[AOM_PLANE_Y].dst.stride, intrapred, + bw); + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &skip_txfm_sb, + &skip_sse_sb, NULL, NULL, NULL); + rd = + RDCOST(x->rdmult, tmp_rate_mv + rate_overhead + rate_sum, dist_sum); + } + } + if (rd >= best_interintra_rd_wedge) { + tmp_mv.as_int = mv0.as_int; + tmp_rate_mv = *rate_mv; + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + // Evaluate closer to true rd + RD_STATS rd_stats; + const int64_t mode_rd = RDCOST(x->rdmult, rate_overhead + tmp_rate_mv, 0); + const int64_t tmp_rd_thresh = best_interintra_rd_nowedge - mode_rd; + rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats); + if (rd != INT64_MAX) { + rd = RDCOST(x->rdmult, rate_overhead + tmp_rate_mv + rd_stats.rate, + rd_stats.dist); + } else { + if (best_interintra_rd_nowedge == INT64_MAX) return -1; + } + best_interintra_rd_wedge = rd; + if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { + mbmi->mv[0].as_int = tmp_mv.as_int; + *tmp_rate2 += tmp_rate_mv - *rate_mv; + *rate_mv = tmp_rate_mv; + best_mode_rate = rate_overhead; + } else { + mbmi->use_wedge_interintra = 0; + mbmi->interintra_mode = best_interintra_mode; + mbmi->mv[0].as_int = mv0.as_int; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + } + } + + if (best_interintra_rd_nowedge == INT64_MAX && + best_interintra_rd_wedge == INT64_MAX) { + return -1; + } + + *tmp_rate2 += best_mode_rate; + + if (num_planes > 1) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_U, num_planes - 1); + } + return 0; +} + +static void alloc_compound_type_rd_buffers_no_check( + CompoundTypeRdBuffers *const bufs) { + bufs->pred0 = + (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)); + bufs->pred1 = + (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)); + bufs->residual1 = + (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)); + bufs->diff10 = + (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)); + bufs->tmp_best_mask_buf = (uint8_t *)aom_malloc( + 2 * MAX_SB_SQUARE * sizeof(*bufs->tmp_best_mask_buf)); +} + +// Computes the valid compound_types to be evaluated +static INLINE int compute_valid_comp_types( + MACROBLOCK *x, const AV1_COMP *const cpi, int *try_average_and_distwtd_comp, + BLOCK_SIZE bsize, int masked_compound_used, int mode_search_mask, + COMPOUND_TYPE *valid_comp_types) { + const AV1_COMMON *cm = &cpi->common; + int valid_type_count = 0; + int comp_type, valid_check; + int8_t enable_masked_type[MASKED_COMPOUND_TYPES] = { 0, 0 }; + + const int try_average_comp = (mode_search_mask & (1 << COMPOUND_AVERAGE)); + const int try_distwtd_comp = + ((mode_search_mask & (1 << COMPOUND_DISTWTD)) && + cm->seq_params.order_hint_info.enable_dist_wtd_comp == 1 && + cpi->sf.inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED); + *try_average_and_distwtd_comp = try_average_comp && try_distwtd_comp; + + // Check if COMPOUND_AVERAGE and COMPOUND_DISTWTD are valid cases + for (comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD; + comp_type++) { + valid_check = + (comp_type == COMPOUND_AVERAGE) ? 
try_average_comp : try_distwtd_comp; + if (!*try_average_and_distwtd_comp && valid_check && + is_interinter_compound_used(comp_type, bsize)) + valid_comp_types[valid_type_count++] = comp_type; + } + // Check if COMPOUND_WEDGE and COMPOUND_DIFFWTD are valid cases + if (masked_compound_used) { + // enable_masked_type[0] corresponds to COMPOUND_WEDGE + // enable_masked_type[1] corresponds to COMPOUND_DIFFWTD + enable_masked_type[0] = enable_wedge_interinter_search(x, cpi); + enable_masked_type[1] = cpi->oxcf.enable_diff_wtd_comp; + for (comp_type = COMPOUND_WEDGE; comp_type <= COMPOUND_DIFFWTD; + comp_type++) { + if ((mode_search_mask & (1 << comp_type)) && + is_interinter_compound_used(comp_type, bsize) && + enable_masked_type[comp_type - COMPOUND_WEDGE]) + valid_comp_types[valid_type_count++] = comp_type; + } + } + return valid_type_count; +} + +// Calculates the cost for compound type mask +static INLINE void calc_masked_type_cost(MACROBLOCK *x, BLOCK_SIZE bsize, + int comp_group_idx_ctx, + int comp_index_ctx, + int masked_compound_used, + int *masked_type_cost) { + av1_zero_array(masked_type_cost, COMPOUND_TYPES); + // Account for group index cost when wedge and/or diffwtd prediction are + // enabled + if (masked_compound_used) { + // Compound group index of average and distwtd is 0 + // Compound group index of wedge and diffwtd is 1 + masked_type_cost[COMPOUND_AVERAGE] += + x->comp_group_idx_cost[comp_group_idx_ctx][0]; + masked_type_cost[COMPOUND_DISTWTD] += masked_type_cost[COMPOUND_AVERAGE]; + masked_type_cost[COMPOUND_WEDGE] += + x->comp_group_idx_cost[comp_group_idx_ctx][1]; + masked_type_cost[COMPOUND_DIFFWTD] += masked_type_cost[COMPOUND_WEDGE]; + } + + // Compute the cost to signal compound index/type + masked_type_cost[COMPOUND_AVERAGE] += x->comp_idx_cost[comp_index_ctx][1]; + masked_type_cost[COMPOUND_DISTWTD] += x->comp_idx_cost[comp_index_ctx][0]; + masked_type_cost[COMPOUND_WEDGE] += x->compound_type_cost[bsize][0]; + masked_type_cost[COMPOUND_DIFFWTD] += x->compound_type_cost[bsize][1]; +} + +// Updates mbmi structure with the relevant compound type info +static INLINE void update_mbmi_for_compound_type(MB_MODE_INFO *mbmi, + COMPOUND_TYPE cur_type) { + mbmi->interinter_comp.type = cur_type; + mbmi->comp_group_idx = (cur_type >= COMPOUND_WEDGE); + mbmi->compound_idx = (cur_type != COMPOUND_DISTWTD); +} + +// When match is found, populate the compound type data +// and calculate the rd cost using the stored stats and +// update the mbmi appropriately. 
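+//
+// For reference, the mapping applied by update_mbmi_for_compound_type()
+// above (and relied on below) puts the four compound types onto the two
+// signaled syntax elements as follows:
+//
+//   type              comp_group_idx  compound_idx
+//   COMPOUND_AVERAGE        0              1
+//   COMPOUND_DISTWTD        0              0
+//   COMPOUND_WEDGE          1              1
+//   COMPOUND_DIFFWTD        1              1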
+static INLINE int populate_reuse_comp_type_data( + const MACROBLOCK *x, MB_MODE_INFO *mbmi, + BEST_COMP_TYPE_STATS *best_type_stats, int_mv *cur_mv, int32_t *comp_rate, + int64_t *comp_dist, int *comp_rs2, int *rate_mv, int64_t *rd, + int match_index) { + const int winner_comp_type = + x->comp_rd_stats[match_index].interinter_comp.type; + if (comp_rate[winner_comp_type] == INT_MAX) + return best_type_stats->best_compmode_interinter_cost; + update_mbmi_for_compound_type(mbmi, winner_comp_type); + mbmi->interinter_comp = x->comp_rd_stats[match_index].interinter_comp; + *rd = RDCOST( + x->rdmult, + comp_rs2[winner_comp_type] + *rate_mv + comp_rate[winner_comp_type], + comp_dist[winner_comp_type]); + mbmi->mv[0].as_int = cur_mv[0].as_int; + mbmi->mv[1].as_int = cur_mv[1].as_int; + return comp_rs2[winner_comp_type]; +} + +// Updates rd cost and relevant compound type data for the best compound type +static INLINE void update_best_info(const MB_MODE_INFO *const mbmi, int64_t *rd, + BEST_COMP_TYPE_STATS *best_type_stats, + int64_t best_rd_cur, + int64_t comp_model_rd_cur, int rs2) { + *rd = best_rd_cur; + best_type_stats->comp_best_model_rd = comp_model_rd_cur; + best_type_stats->best_compound_data = mbmi->interinter_comp; + best_type_stats->best_compmode_interinter_cost = rs2; +} + +// Updates best_mv for masked compound types +static INLINE void update_mask_best_mv(const MB_MODE_INFO *const mbmi, + int_mv *best_mv, int_mv *cur_mv, + const COMPOUND_TYPE cur_type, + int *best_tmp_rate_mv, int tmp_rate_mv, + const SPEED_FEATURES *const sf) { + if (cur_type == COMPOUND_WEDGE || + (sf->inter_sf.enable_interinter_diffwtd_newmv_search && + cur_type == COMPOUND_DIFFWTD)) { + *best_tmp_rate_mv = tmp_rate_mv; + best_mv[0].as_int = mbmi->mv[0].as_int; + best_mv[1].as_int = mbmi->mv[1].as_int; + } else { + best_mv[0].as_int = cur_mv[0].as_int; + best_mv[1].as_int = cur_mv[1].as_int; + } +} + +// Choose the better of the two COMPOUND_AVERAGE, +// COMPOUND_DISTWTD based on modeled cost +static int find_best_avg_distwtd_comp_type(MACROBLOCK *x, int *comp_model_rate, + int64_t *comp_model_dist, + int rate_mv, int64_t *best_rd) { + int64_t est_rd[2]; + est_rd[COMPOUND_AVERAGE] = + RDCOST(x->rdmult, comp_model_rate[COMPOUND_AVERAGE] + rate_mv, + comp_model_dist[COMPOUND_AVERAGE]); + est_rd[COMPOUND_DISTWTD] = + RDCOST(x->rdmult, comp_model_rate[COMPOUND_DISTWTD] + rate_mv, + comp_model_dist[COMPOUND_DISTWTD]); + int best_type = (est_rd[COMPOUND_AVERAGE] <= est_rd[COMPOUND_DISTWTD]) + ? 
COMPOUND_AVERAGE + : COMPOUND_DISTWTD; + *best_rd = est_rd[best_type]; + return best_type; +} + +static INLINE void save_comp_rd_search_stat( + MACROBLOCK *x, const MB_MODE_INFO *const mbmi, const int32_t *comp_rate, + const int64_t *comp_dist, const int32_t *comp_model_rate, + const int64_t *comp_model_dist, const int_mv *cur_mv, const int *comp_rs2) { + const int offset = x->comp_rd_stats_idx; + if (offset < MAX_COMP_RD_STATS) { + COMP_RD_STATS *const rd_stats = x->comp_rd_stats + offset; + memcpy(rd_stats->rate, comp_rate, sizeof(rd_stats->rate)); + memcpy(rd_stats->dist, comp_dist, sizeof(rd_stats->dist)); + memcpy(rd_stats->model_rate, comp_model_rate, sizeof(rd_stats->model_rate)); + memcpy(rd_stats->model_dist, comp_model_dist, sizeof(rd_stats->model_dist)); + memcpy(rd_stats->comp_rs2, comp_rs2, sizeof(rd_stats->comp_rs2)); + memcpy(rd_stats->mv, cur_mv, sizeof(rd_stats->mv)); + memcpy(rd_stats->ref_frames, mbmi->ref_frame, sizeof(rd_stats->ref_frames)); + rd_stats->mode = mbmi->mode; + rd_stats->filter = mbmi->interp_filters; + rd_stats->ref_mv_idx = mbmi->ref_mv_idx; + const MACROBLOCKD *const xd = &x->e_mbd; + for (int i = 0; i < 2; ++i) { + const WarpedMotionParams *const wm = + &xd->global_motion[mbmi->ref_frame[i]]; + rd_stats->is_global[i] = is_global_mv_block(mbmi, wm->wmtype); + } + memcpy(&rd_stats->interinter_comp, &mbmi->interinter_comp, + sizeof(rd_stats->interinter_comp)); + ++x->comp_rd_stats_idx; + } +} + +static INLINE int get_interinter_compound_mask_rate( + const MACROBLOCK *const x, const MB_MODE_INFO *const mbmi) { + const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; + // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD + if (compound_type == COMPOUND_WEDGE) { + return av1_is_wedge_used(mbmi->sb_type) + ? 
av1_cost_literal(1) + + x->wedge_idx_cost[mbmi->sb_type] + [mbmi->interinter_comp.wedge_index] + : 0; + } else { + assert(compound_type == COMPOUND_DIFFWTD); + return av1_cost_literal(1); + } +} + +// Takes a backup of rate, distortion and model_rd for future reuse +static INLINE void backup_stats(COMPOUND_TYPE cur_type, int32_t *comp_rate, + int64_t *comp_dist, int32_t *comp_model_rate, + int64_t *comp_model_dist, int rate_sum, + int64_t dist_sum, RD_STATS *rd_stats, + int *comp_rs2, int rs2) { + comp_rate[cur_type] = rd_stats->rate; + comp_dist[cur_type] = rd_stats->dist; + comp_model_rate[cur_type] = rate_sum; + comp_model_dist[cur_type] = dist_sum; + comp_rs2[cur_type] = rs2; +} + +static int64_t masked_compound_type_rd( + const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, + const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2, + int rate_mv, const BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, + uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides, + int mode_rate, int64_t rd_thresh, int *calc_pred_masked_compound, + int32_t *comp_rate, int64_t *comp_dist, int32_t *comp_model_rate, + int64_t *comp_model_dist, const int64_t comp_best_model_rd, + int64_t *const comp_model_rd_cur, int *comp_rs2, int64_t ref_skip_rd) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int64_t best_rd_cur = INT64_MAX; + int64_t rd = INT64_MAX; + const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; + // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD + assert(compound_type == COMPOUND_WEDGE || compound_type == COMPOUND_DIFFWTD); + int rate_sum, tmp_skip_txfm_sb; + int64_t dist_sum, tmp_skip_sse_sb; + pick_interinter_mask_type pick_interinter_mask[2] = { pick_interinter_wedge, + pick_interinter_seg }; + + // TODO(any): Save pred and mask calculation as well into records. However + // this may increase memory requirements as compound segment mask needs to be + // stored in each record. 
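+  // Editor's note on the trade-off: caching the mask as well would add
+  // 2 * MAX_SB_SQUARE bytes per record (2 * 128 * 128 = 32 KiB at the
+  // largest superblock size) for every COMP_RD_STATS entry kept per block,
+  // which is why only rates, distortions and model costs are cached today.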
+  if (*calc_pred_masked_compound) {
+    get_inter_predictors_masked_compound(x, bsize, preds0, preds1, residual1,
+                                         diff10, strides);
+    *calc_pred_masked_compound = 0;
+  }
+  if (cpi->sf.inter_sf.prune_wedge_pred_diff_based &&
+      compound_type == COMPOUND_WEDGE) {
+    unsigned int sse;
+    if (is_cur_buf_hbd(xd))
+      (void)cpi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides,
+                                  CONVERT_TO_BYTEPTR(*preds1), *strides, &sse);
+    else
+      (void)cpi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides, &sse);
+    const unsigned int mse =
+        ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]);
+    // If the two predictors are very similar, skip wedge compound mode search
+    if (mse < 8 || (!have_newmv_in_inter_mode(this_mode) && mse < 64)) {
+      *comp_model_rd_cur = INT64_MAX;
+      return INT64_MAX;
+    }
+  }
+  // Function pointer to pick the appropriate mask
+  // compound_type == COMPOUND_WEDGE, calls pick_interinter_wedge()
+  // compound_type == COMPOUND_DIFFWTD, calls pick_interinter_seg()
+  uint64_t cur_sse = UINT64_MAX;
+  best_rd_cur = pick_interinter_mask[compound_type - COMPOUND_WEDGE](
+      cpi, x, bsize, *preds0, *preds1, residual1, diff10, &cur_sse);
+  *rs2 += get_interinter_compound_mask_rate(x, mbmi);
+  best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
+  assert(cur_sse != UINT64_MAX);
+  int64_t skip_rd_cur = RDCOST(x->rdmult, *rs2 + rate_mv, (cur_sse << 4));
+
+  // Although the true rate_mv might be different after motion search, it is
+  // unlikely to be the best mode considering the transform rd cost and other
+  // mode overhead cost
+  int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0);
+  if (mode_rd > rd_thresh) {
+    *comp_model_rd_cur = INT64_MAX;
+    return INT64_MAX;
+  }
+
+  // Check if the mode is good enough based on skip rd
+  // TODO(nithya): Handle wedge_newmv_search if extending for lower speed
+  // setting
+  if (cpi->sf.inter_sf.txfm_rd_gate_level) {
+    int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd_cur,
+                                    cpi->sf.inter_sf.txfm_rd_gate_level, 1);
+    if (!eval_txfm) {
+      *comp_model_rd_cur = INT64_MAX;
+      return INT64_MAX;
+    }
+  }
+
+  // Compute the cost if a matching record is not found; else, reuse data
+  if (comp_rate[compound_type] == INT_MAX) {
+    // Check whether a new MV search for wedge is to be done
+    int wedge_newmv_search =
+        have_newmv_in_inter_mode(this_mode) &&
+        (compound_type == COMPOUND_WEDGE) &&
+        (!cpi->sf.inter_sf.disable_interinter_wedge_newmv_search);
+    int diffwtd_newmv_search =
+        cpi->sf.inter_sf.enable_interinter_diffwtd_newmv_search &&
+        compound_type == COMPOUND_DIFFWTD &&
+        have_newmv_in_inter_mode(this_mode);
+
+    // Search for a new MV if needed and build the predictor
+    if (wedge_newmv_search) {
+      *out_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+                                                           bsize, this_mode);
+      const int mi_row = xd->mi_row;
+      const int mi_col = xd->mi_col;
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, ctx, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
+    } else if (diffwtd_newmv_search) {
+      *out_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+                                                           bsize, this_mode);
+      // we need to update the mask according to the new motion vector
+      CompoundTypeRdBuffers tmp_buf;
+      int64_t tmp_rd = INT64_MAX;
+      alloc_compound_type_rd_buffers_no_check(&tmp_buf);
+
+      uint8_t *tmp_preds0[1] = { tmp_buf.pred0 };
+      uint8_t *tmp_preds1[1] = { tmp_buf.pred1 };
+
+      get_inter_predictors_masked_compound(x, bsize, tmp_preds0, tmp_preds1,
+                                           tmp_buf.residual1, tmp_buf.diff10,
+                                           strides);
+
+      tmp_rd = pick_interinter_mask[compound_type - COMPOUND_WEDGE](
+          cpi, x, bsize, *tmp_preds0, *tmp_preds1, tmp_buf.residual1,
+          tmp_buf.diff10, &cur_sse);
+      // we can reuse rs2 here
+      tmp_rd += RDCOST(x->rdmult, *rs2 + *out_rate_mv, 0);
+
+      if (tmp_rd >= best_rd_cur) {
+        // restore the motion vector
+        mbmi->mv[0].as_int = cur_mv[0].as_int;
+        mbmi->mv[1].as_int = cur_mv[1].as_int;
+        *out_rate_mv = rate_mv;
+        av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+                                                 strides, preds1, strides);
+      } else {
+        // build the final prediction using the updated mv
+        av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, tmp_preds0,
+                                                 strides, tmp_preds1, strides);
+      }
+      av1_release_compound_type_rd_buffers(&tmp_buf);
+    } else {
+      *out_rate_mv = rate_mv;
+      av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
+                                               preds1, strides);
+    }
+    // Get the RD cost from model RD
+    model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+        cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb,
+        &tmp_skip_sse_sb, NULL, NULL, NULL);
+    rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
+    *comp_model_rd_cur = rd;
+    // Override with the best if the current is worse than the best for new MV
+    if (wedge_newmv_search) {
+      if (rd >= best_rd_cur) {
+        mbmi->mv[0].as_int = cur_mv[0].as_int;
+        mbmi->mv[1].as_int = cur_mv[1].as_int;
+        *out_rate_mv = rate_mv;
+        av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+                                                 strides, preds1, strides);
+        *comp_model_rd_cur = best_rd_cur;
+      }
+    }
+    if (cpi->sf.inter_sf.prune_comp_type_by_model_rd &&
+        (*comp_model_rd_cur > comp_best_model_rd) &&
+        comp_best_model_rd != INT64_MAX) {
+      *comp_model_rd_cur = INT64_MAX;
+      return INT64_MAX;
+    }
+    // Compute RD cost for the current type
+    RD_STATS rd_stats;
+    const int64_t tmp_mode_rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv, 0);
+    const int64_t tmp_rd_thresh = rd_thresh - tmp_mode_rd;
+    rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats);
+    if (rd != INT64_MAX) {
+      rd =
+          RDCOST(x->rdmult, *rs2 + *out_rate_mv + rd_stats.rate, rd_stats.dist);
+      // Backup rate and distortion for future reuse
+      backup_stats(compound_type, comp_rate, comp_dist, comp_model_rate,
+                   comp_model_dist, rate_sum, dist_sum, &rd_stats, comp_rs2,
+                   *rs2);
+    }
+  } else {
+    // Reuse data as a matching record is found
+    assert(comp_dist[compound_type] != INT64_MAX);
+    // When disable_interinter_wedge_newmv_search is set, motion refinement is
+    // disabled.
Hence rate and distortion can be reused in this case as well + assert(IMPLIES(have_newmv_in_inter_mode(this_mode), + cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)); + assert(mbmi->mv[0].as_int == cur_mv[0].as_int); + assert(mbmi->mv[1].as_int == cur_mv[1].as_int); + *out_rate_mv = rate_mv; + // Calculate RD cost based on stored stats + rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_rate[compound_type], + comp_dist[compound_type]); + // Recalculate model rdcost with the updated rate + *comp_model_rd_cur = + RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_model_rate[compound_type], + comp_model_dist[compound_type]); + } + return rd; +} + +// scaling values to be used for gating wedge/compound segment based on best +// approximate rd +static int comp_type_rd_threshold_mul[3] = { 1, 11, 12 }; +static int comp_type_rd_threshold_div[3] = { 3, 16, 16 }; + +int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int_mv *cur_mv, int mode_search_mask, + int masked_compound_used, const BUFFER_SET *orig_dst, + const BUFFER_SET *tmp_dst, + const CompoundTypeRdBuffers *buffers, int *rate_mv, + int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd, + int64_t ref_skip_rd, int *is_luma_interp_done, + int64_t rd_thresh) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + const int bw = block_size_wide[bsize]; + int rs2; + int_mv best_mv[2]; + int best_tmp_rate_mv = *rate_mv; + BEST_COMP_TYPE_STATS best_type_stats; + // Initializing BEST_COMP_TYPE_STATS + best_type_stats.best_compound_data.type = COMPOUND_AVERAGE; + best_type_stats.best_compmode_interinter_cost = 0; + best_type_stats.comp_best_model_rd = INT64_MAX; + + uint8_t *preds0[1] = { buffers->pred0 }; + uint8_t *preds1[1] = { buffers->pred1 }; + int strides[1] = { bw }; + int tmp_rate_mv; + const int num_pix = 1 << num_pels_log2_lookup[bsize]; + const int mask_len = 2 * num_pix * sizeof(uint8_t); + COMPOUND_TYPE cur_type; + // Local array to store the mask cost for different compound types + int masked_type_cost[COMPOUND_TYPES]; + + int calc_pred_masked_compound = 1; + int64_t comp_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX, + INT64_MAX }; + int32_t comp_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + int comp_rs2[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + int32_t comp_model_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, + INT_MAX }; + int64_t comp_model_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX, + INT64_MAX }; + int match_index = 0; + const int match_found = + find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist, comp_model_rate, + comp_model_dist, comp_rs2, &match_index); + best_mv[0].as_int = cur_mv[0].as_int; + best_mv[1].as_int = cur_mv[1].as_int; + *rd = INT64_MAX; + int rate_sum, tmp_skip_txfm_sb; + int64_t dist_sum, tmp_skip_sse_sb; + + // Local array to store the valid compound types to be evaluated in the core + // loop + COMPOUND_TYPE valid_comp_types[COMPOUND_TYPES] = { + COMPOUND_AVERAGE, COMPOUND_DISTWTD, COMPOUND_WEDGE, COMPOUND_DIFFWTD + }; + int valid_type_count = 0; + int try_average_and_distwtd_comp = 0; + // compute_valid_comp_types() returns the number of valid compound types to be + // evaluated and populates the same in the local array valid_comp_types[]. 
+ // It also sets the flag 'try_average_and_distwtd_comp' + valid_type_count = compute_valid_comp_types( + x, cpi, &try_average_and_distwtd_comp, bsize, masked_compound_used, + mode_search_mask, valid_comp_types); + + // The following context indices are independent of compound type + const int comp_group_idx_ctx = get_comp_group_idx_context(xd); + const int comp_index_ctx = get_comp_index_context(cm, xd); + + // Populates masked_type_cost local array for the 4 compound types + calc_masked_type_cost(x, bsize, comp_group_idx_ctx, comp_index_ctx, + masked_compound_used, masked_type_cost); + + int64_t comp_model_rd_cur = INT64_MAX; + int64_t best_rd_cur = INT64_MAX; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // If the match is found, calculate the rd cost using the + // stored stats and update the mbmi appropriately. + if (match_found && cpi->sf.inter_sf.reuse_compound_type_decision) { + return populate_reuse_comp_type_data(x, mbmi, &best_type_stats, cur_mv, + comp_rate, comp_dist, comp_rs2, + rate_mv, rd, match_index); + } + // Special handling if both compound_average and compound_distwtd + // are to be searched. In this case, first estimate between the two + // modes and then call estimate_yrd_for_sb() only for the better of + // the two. + if (try_average_and_distwtd_comp) { + int est_rate[2]; + int64_t est_dist[2], est_rd; + COMPOUND_TYPE best_type; + // Since modelled rate and dist are separately stored, + // compute better of COMPOUND_AVERAGE and COMPOUND_DISTWTD + // using the stored stats. + if ((comp_model_rate[COMPOUND_AVERAGE] != INT_MAX) && + comp_model_rate[COMPOUND_DISTWTD] != INT_MAX) { + // Choose the better of the COMPOUND_AVERAGE, + // COMPOUND_DISTWTD on modeled cost. + best_type = find_best_avg_distwtd_comp_type( + x, comp_model_rate, comp_model_dist, *rate_mv, &est_rd); + update_mbmi_for_compound_type(mbmi, best_type); + if (comp_rate[best_type] != INT_MAX) + best_rd_cur = RDCOST( + x->rdmult, + masked_type_cost[best_type] + *rate_mv + comp_rate[best_type], + comp_dist[best_type]); + comp_model_rd_cur = est_rd; + // Update stats for best compound type + if (best_rd_cur < *rd) { + update_best_info(mbmi, rd, &best_type_stats, best_rd_cur, + comp_model_rd_cur, masked_type_cost[best_type]); + } + restore_dst_buf(xd, *tmp_dst, 1); + } else { + int64_t sse_y[COMPOUND_DISTWTD + 1]; + // Calculate model_rd for COMPOUND_AVERAGE and COMPOUND_DISTWTD + for (int comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD; + comp_type++) { + update_mbmi_for_compound_type(mbmi, comp_type); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + model_rd_sb_fn[MODELRD_CURVFIT]( + cpi, bsize, x, xd, 0, 0, &est_rate[comp_type], &est_dist[comp_type], + NULL, NULL, NULL, NULL, NULL); + est_rate[comp_type] += masked_type_cost[comp_type]; + comp_model_rate[comp_type] = est_rate[comp_type]; + comp_model_dist[comp_type] = est_dist[comp_type]; + sse_y[comp_type] = x->pred_sse[xd->mi[0]->ref_frame[0]]; + if (comp_type == COMPOUND_AVERAGE) { + *is_luma_interp_done = 1; + restore_dst_buf(xd, *tmp_dst, 1); + } + } + // Choose the better of the two based on modeled cost and call + // estimate_yrd_for_sb() for that one. 
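+      // Editor's illustration with made-up numbers: if the modeled costs are
+      // 9000 for COMPOUND_AVERAGE and 9500 for COMPOUND_DISTWTD, only
+      // COMPOUND_AVERAGE proceeds to the far more expensive transform search
+      // in estimate_yrd_for_sb(); the loser is pruned on the model alone.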
+ best_type = find_best_avg_distwtd_comp_type( + x, comp_model_rate, comp_model_dist, *rate_mv, &est_rd); + update_mbmi_for_compound_type(mbmi, best_type); + if (best_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *orig_dst, 1); + rs2 = masked_type_cost[best_type]; + RD_STATS est_rd_stats; + const int64_t mode_rd = RDCOST(x->rdmult, rs2 + *rate_mv, 0); + const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh) - mode_rd; + int64_t est_rd_ = INT64_MAX; + int eval_txfm = 1; + // Check if the mode is good enough based on skip rd + if (cpi->sf.inter_sf.txfm_rd_gate_level) { + int64_t skip_rd = + RDCOST(x->rdmult, rs2 + *rate_mv, (sse_y[best_type] << 4)); + eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd, + cpi->sf.inter_sf.txfm_rd_gate_level, 1); + } + // Evaluate further if skip rd is low enough + if (eval_txfm) { + est_rd_ = + estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &est_rd_stats); + } + + if (est_rd_ != INT64_MAX) { + best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + // Backup rate and distortion for future reuse + backup_stats(best_type, comp_rate, comp_dist, comp_model_rate, + comp_model_dist, est_rate[best_type], est_dist[best_type], + &est_rd_stats, comp_rs2, rs2); + comp_model_rd_cur = est_rd; + } + if (best_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1); + // Update stats for best compound type + if (best_rd_cur < *rd) { + update_best_info(mbmi, rd, &best_type_stats, best_rd_cur, + comp_model_rd_cur, rs2); + } + } + } + + // If COMPOUND_AVERAGE is not valid, use the spare buffer + if (valid_comp_types[0] != COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1); + + // Loop over valid compound types + for (int i = 0; i < valid_type_count; i++) { + cur_type = valid_comp_types[i]; + comp_model_rd_cur = INT64_MAX; + tmp_rate_mv = *rate_mv; + best_rd_cur = INT64_MAX; + + // Case COMPOUND_AVERAGE and COMPOUND_DISTWTD + if (cur_type < COMPOUND_WEDGE) { + update_mbmi_for_compound_type(mbmi, cur_type); + rs2 = masked_type_cost[cur_type]; + const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0); + if (mode_rd < ref_best_rd) { + // Reuse data if matching record is found + if (comp_rate[cur_type] == INT_MAX) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1; + + // Compute RD cost for the current type + RD_STATS est_rd_stats; + const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh) - mode_rd; + int64_t est_rd = INT64_MAX; + int eval_txfm = 1; + // Check if the mode is good enough based on skip rd + if (cpi->sf.inter_sf.txfm_rd_gate_level) { + int64_t sse_y = compute_sse_plane(x, xd, PLANE_TYPE_Y, bsize); + int64_t skip_rd = RDCOST(x->rdmult, rs2 + *rate_mv, (sse_y << 4)); + eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd, + cpi->sf.inter_sf.txfm_rd_gate_level, 1); + } + // Evaluate further if skip rd is low enough + if (eval_txfm) { + est_rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, + &est_rd_stats); + } + + if (est_rd != INT64_MAX) { + best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + comp_model_rd_cur = + RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum); + + // Backup rate and distortion for future reuse + backup_stats(cur_type, comp_rate, comp_dist, comp_model_rate, + comp_model_dist, 
rate_sum, dist_sum, &est_rd_stats, + comp_rs2, rs2); + } + } else { + // Calculate RD cost based on stored stats + assert(comp_dist[cur_type] != INT64_MAX); + best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + comp_rate[cur_type], + comp_dist[cur_type]); + // Recalculate model rdcost with the updated rate + comp_model_rd_cur = + RDCOST(x->rdmult, rs2 + *rate_mv + comp_model_rate[cur_type], + comp_model_dist[cur_type]); + } + } + // use spare buffer for following compound type try + if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1); + } else { + // Handle masked compound types + update_mbmi_for_compound_type(mbmi, cur_type); + rs2 = masked_type_cost[cur_type]; + // Factors to control gating of compound type selection based on best + // approximate rd so far + const int max_comp_type_rd_threshold_mul = + comp_type_rd_threshold_mul[cpi->sf.inter_sf + .prune_comp_type_by_comp_avg]; + const int max_comp_type_rd_threshold_div = + comp_type_rd_threshold_div[cpi->sf.inter_sf + .prune_comp_type_by_comp_avg]; + // Evaluate COMPOUND_WEDGE / COMPOUND_DIFFWTD if approximated cost is + // within threshold + int64_t approx_rd = ((*rd / max_comp_type_rd_threshold_div) * + max_comp_type_rd_threshold_mul); + + if (approx_rd < ref_best_rd) { + const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh); + best_rd_cur = masked_compound_type_rd( + cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst, + &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10, + strides, rd_stats->rate, tmp_rd_thresh, &calc_pred_masked_compound, + comp_rate, comp_dist, comp_model_rate, comp_model_dist, + best_type_stats.comp_best_model_rd, &comp_model_rd_cur, comp_rs2, + ref_skip_rd); + } + } + // Update stats for best compound type + if (best_rd_cur < *rd) { + update_best_info(mbmi, rd, &best_type_stats, best_rd_cur, + comp_model_rd_cur, rs2); + if (masked_compound_used && cur_type >= COMPOUND_WEDGE) { + memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len); + if (have_newmv_in_inter_mode(this_mode)) + update_mask_best_mv(mbmi, best_mv, cur_mv, cur_type, + &best_tmp_rate_mv, tmp_rate_mv, &cpi->sf); + } + } + // reset to original mvs for next iteration + mbmi->mv[0].as_int = cur_mv[0].as_int; + mbmi->mv[1].as_int = cur_mv[1].as_int; + } + if (mbmi->interinter_comp.type != best_type_stats.best_compound_data.type) { + mbmi->comp_group_idx = + (best_type_stats.best_compound_data.type < COMPOUND_WEDGE) ? 0 : 1; + mbmi->compound_idx = + !(best_type_stats.best_compound_data.type == COMPOUND_DISTWTD); + mbmi->interinter_comp = best_type_stats.best_compound_data; + memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len); + } + if (have_newmv_in_inter_mode(this_mode)) { + mbmi->mv[0].as_int = best_mv[0].as_int; + mbmi->mv[1].as_int = best_mv[1].as_int; + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + rd_stats->rate += best_tmp_rate_mv - *rate_mv; + *rate_mv = best_tmp_rate_mv; + } + } + restore_dst_buf(xd, *orig_dst, 1); + if (!match_found) + save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rate, + comp_model_dist, cur_mv, comp_rs2); + return best_type_stats.best_compmode_interinter_cost; +} diff --git a/libs/libaom/src/av1/encoder/compound_type.h b/libs/libaom/src/av1/encoder/compound_type.h new file mode 100644 index 000000000..f2bd857c9 --- /dev/null +++ b/libs/libaom/src/av1/encoder/compound_type.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_COMPOUND_TYPE_H_ +#define AOM_AV1_ENCODER_COMPOUND_TYPE_H_ + +#include "av1/encoder/encoder.h" +#include "av1/encoder/interp_search.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Structure to store the compound type related stats for best compound type +typedef struct { + INTERINTER_COMPOUND_DATA best_compound_data; + int64_t comp_best_model_rd; + int best_compmode_interinter_cost; +} BEST_COMP_TYPE_STATS; + +int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, + HandleInterModeArgs *args, int64_t ref_best_rd, + int *rate_mv, int *tmp_rate2, + const BUFFER_SET *orig_dst); + +int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int_mv *cur_mv, int mode_search_mask, + int masked_compound_used, const BUFFER_SET *orig_dst, + const BUFFER_SET *tmp_dst, + const CompoundTypeRdBuffers *buffers, int *rate_mv, + int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd, + int64_t ref_skip_rd, int *is_luma_interp_done, + int64_t rd_thresh); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_COMPOUND_TYPE_H_ diff --git a/libs/libaom/src/av1/encoder/context_tree.c b/libs/libaom/src/av1/encoder/context_tree.c new file mode 100644 index 000000000..9b5b1cbf9 --- /dev/null +++ b/libs/libaom/src/av1/encoder/context_tree.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/encoder/context_tree.h" +#include "av1/encoder/encoder.h" + +static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = { + BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128, +}; + +typedef struct { + tran_low_t *coeff_buf[MAX_MB_PLANE]; + tran_low_t *qcoeff_buf[MAX_MB_PLANE]; + tran_low_t *dqcoeff_buf[MAX_MB_PLANE]; +} PC_TREE_SHARED_BUFFERS; + +static AOM_INLINE void alloc_mode_context(AV1_COMMON *cm, int num_pix, + PICK_MODE_CONTEXT *ctx, + PC_TREE_SHARED_BUFFERS *shared_bufs) { + const int num_planes = av1_num_planes(cm); + int i; + const int num_blk = num_pix / 16; + ctx->num_4x4_blk = num_blk; + + CHECK_MEM_ERROR(cm, ctx->blk_skip, + aom_calloc(num_blk, sizeof(*ctx->blk_skip))); + CHECK_MEM_ERROR(cm, ctx->tx_type_map, + aom_calloc(num_blk, sizeof(*ctx->tx_type_map))); + for (i = 0; i < num_planes; ++i) { + ctx->coeff[i] = shared_bufs->coeff_buf[i]; + ctx->qcoeff[i] = shared_bufs->qcoeff_buf[i]; + ctx->dqcoeff[i] = shared_bufs->dqcoeff_buf[i]; + CHECK_MEM_ERROR(cm, ctx->eobs[i], + aom_memalign(32, num_blk * sizeof(*ctx->eobs[i]))); + CHECK_MEM_ERROR( + cm, ctx->txb_entropy_ctx[i], + aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i]))); + } + + if (num_pix <= MAX_PALETTE_SQUARE) { + for (i = 0; i < 2; ++i) { + CHECK_MEM_ERROR( + cm, ctx->color_index_map[i], + aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i]))); + } + } +} + +static AOM_INLINE void free_mode_context(PICK_MODE_CONTEXT *ctx, + const int num_planes) { + int i; + aom_free(ctx->blk_skip); + ctx->blk_skip = 0; + aom_free(ctx->tx_type_map); + ctx->tx_type_map = 0; + for (i = 0; i < num_planes; ++i) { + ctx->coeff[i] = 0; + ctx->qcoeff[i] = 0; + ctx->dqcoeff[i] = 0; + aom_free(ctx->eobs[i]); + ctx->eobs[i] = 0; + aom_free(ctx->txb_entropy_ctx[i]); + ctx->txb_entropy_ctx[i] = 0; + } + + for (i = 0; i < 2; ++i) { + aom_free(ctx->color_index_map[i]); + ctx->color_index_map[i] = 0; + } +} + +static AOM_INLINE void alloc_tree_contexts( + AV1_COMMON *cm, PC_TREE *tree, int num_pix, int is_leaf, + PC_TREE_SHARED_BUFFERS *shared_bufs) { + alloc_mode_context(cm, num_pix, &tree->none, shared_bufs); + + if (is_leaf) return; + + alloc_mode_context(cm, num_pix / 2, &tree->horizontal[0], shared_bufs); + alloc_mode_context(cm, num_pix / 2, &tree->vertical[0], shared_bufs); + + alloc_mode_context(cm, num_pix / 2, &tree->horizontal[1], shared_bufs); + alloc_mode_context(cm, num_pix / 2, &tree->vertical[1], shared_bufs); + + alloc_mode_context(cm, num_pix / 4, &tree->horizontala[0], shared_bufs); + alloc_mode_context(cm, num_pix / 4, &tree->horizontala[1], shared_bufs); + alloc_mode_context(cm, num_pix / 2, &tree->horizontala[2], shared_bufs); + + alloc_mode_context(cm, num_pix / 2, &tree->horizontalb[0], shared_bufs); + alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[1], shared_bufs); + alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[2], shared_bufs); + + alloc_mode_context(cm, num_pix / 4, &tree->verticala[0], shared_bufs); + alloc_mode_context(cm, num_pix / 4, &tree->verticala[1], shared_bufs); + alloc_mode_context(cm, num_pix / 2, &tree->verticala[2], shared_bufs); + + alloc_mode_context(cm, num_pix / 2, &tree->verticalb[0], shared_bufs); + alloc_mode_context(cm, num_pix / 4, &tree->verticalb[1], shared_bufs); + alloc_mode_context(cm, num_pix / 4, &tree->verticalb[2], shared_bufs); + + for (int i = 0; i < 4; ++i) { + alloc_mode_context(cm, num_pix / 4, &tree->horizontal4[i], shared_bufs); + alloc_mode_context(cm, num_pix / 4, &tree->vertical4[i], shared_bufs); + } +} 
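+
+// For example (the sizes follow directly from alloc_tree_contexts() above),
+// a non-leaf 64x64 node with num_pix = 4096 allocates:
+//   none:                          4096 pixels (the full block)
+//   horizontal[2] / vertical[2]:   2048 pixels each (the two halves)
+//   horizontala/b, verticala/b:    one half plus two quarters
+//                                  (2048 + 1024 + 1024 pixels)
+//   horizontal4[4] / vertical4[4]: 1024 pixels each (64x16 / 16x64 strips)
+// Each context in turn sizes its per-4x4 arrays as num_pix / 16 entries
+// (see alloc_mode_context() above).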
+
+static AOM_INLINE void free_tree_contexts(PC_TREE *tree, const int num_planes) {
+  int i;
+  for (i = 0; i < 3; i++) {
+    free_mode_context(&tree->horizontala[i], num_planes);
+    free_mode_context(&tree->horizontalb[i], num_planes);
+    free_mode_context(&tree->verticala[i], num_planes);
+    free_mode_context(&tree->verticalb[i], num_planes);
+  }
+  for (i = 0; i < 4; ++i) {
+    free_mode_context(&tree->horizontal4[i], num_planes);
+    free_mode_context(&tree->vertical4[i], num_planes);
+  }
+  free_mode_context(&tree->none, num_planes);
+  free_mode_context(&tree->horizontal[0], num_planes);
+  free_mode_context(&tree->horizontal[1], num_planes);
+  free_mode_context(&tree->vertical[0], num_planes);
+  free_mode_context(&tree->vertical[1], num_planes);
+}
+
+// Computes the number of pc_tree nodes to be allocated or freed, as
+// determined by the superblock size (BLOCK_128X128 or BLOCK_64X64).
+static AOM_INLINE int get_pc_tree_nodes(const int is_sb_size_128,
+                                        int stat_generation_stage) {
+  const int tree_nodes_inc = is_sb_size_128 ? 1024 : 0;
+  const int tree_nodes =
+      stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
+  return tree_nodes;
+}
+
+// Sets up a tree of contexts. At each square partition level there are
+// contexts for none, horizontal, vertical, and split, along with a
+// block_size value and a selected block_size that represents the state of
+// the search.
+void av1_setup_pc_tree(AV1_COMP *const cpi, ThreadData *td) {
+  AV1_COMMON *const cm = &cpi->common;
+  int i, j, stat_generation_stage = is_stat_generation_stage(cpi);
+  const int is_sb_size_128 = cm->seq_params.sb_size == BLOCK_128X128;
+  const int tree_nodes =
+      get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+  int pc_tree_index = 0;
+  PC_TREE *this_pc;
+  PC_TREE_SHARED_BUFFERS shared_bufs;
+  int square_index = 1;
+  int nodes;
+
+  aom_free(td->pc_tree);
+  CHECK_MEM_ERROR(cm, td->pc_tree,
+                  aom_calloc(tree_nodes, sizeof(*td->pc_tree)));
+  this_pc = &td->pc_tree[0];
+
+  for (i = 0; i < 3; i++) {
+    const int max_num_pix = MAX_SB_SIZE * MAX_SB_SIZE;
+    CHECK_MEM_ERROR(cm, td->tree_coeff_buf[i],
+                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    CHECK_MEM_ERROR(cm, td->tree_qcoeff_buf[i],
+                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    CHECK_MEM_ERROR(cm, td->tree_dqcoeff_buf[i],
+                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    shared_bufs.coeff_buf[i] = td->tree_coeff_buf[i];
+    shared_bufs.qcoeff_buf[i] = td->tree_qcoeff_buf[i];
+    shared_bufs.dqcoeff_buf[i] = td->tree_dqcoeff_buf[i];
+  }
+
+  if (!stat_generation_stage) {
+    const int leaf_factor = is_sb_size_128 ? 4 : 1;
+    const int leaf_nodes = 256 * leaf_factor;
+
+    // Sets up all the leaf nodes in the tree.
+    for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
+      PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+      tree->block_size = square[0];
+      alloc_tree_contexts(cm, tree, 16, 1, &shared_bufs);
+    }
+
+    // Each node has 4 leaf nodes; fill each block_size level of the tree
+    // from the leaves to the root.
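+    // For example, a 64x64 superblock gives 256 + 64 + 16 + 4 + 1 = 341
+    // nodes (the sum of 4^k for k = 0..4), and a 128x128 superblock adds
+    // one more leaf level of 4^5 = 1024 nodes for 1365 in total, matching
+    // get_pc_tree_nodes() above.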
+ for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) { + for (i = 0; i < nodes; ++i) { + PC_TREE *const tree = &td->pc_tree[pc_tree_index]; + alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0, + &shared_bufs); + tree->block_size = square[square_index]; + for (j = 0; j < 4; j++) tree->split[j] = this_pc++; + ++pc_tree_index; + } + ++square_index; + } + } else { + // Allocation for firstpass/LAP stage + // TODO(Mufaddal): refactor square_index to use a common block_size macro + // from firstpass.c + PC_TREE *const tree = &td->pc_tree[pc_tree_index]; + square_index = 2; + alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 1, &shared_bufs); + tree->block_size = square[square_index]; + } + + // Set up the root node for the applicable superblock size + td->pc_root = &td->pc_tree[tree_nodes - 1]; +#if CONFIG_INTERNAL_STATS + td->pc_root->none.best_mode_index = THR_INVALID; +#endif // CONFIG_INTERNAL_STATS +} + +void av1_free_pc_tree(const AV1_COMP *const cpi, ThreadData *td, + const int num_planes, BLOCK_SIZE sb_size) { + int stat_generation_stage = is_stat_generation_stage(cpi); + if (td->pc_tree != NULL) { + const int is_sb_size_128 = sb_size == BLOCK_128X128; + const int tree_nodes = + get_pc_tree_nodes(is_sb_size_128, stat_generation_stage); + for (int i = 0; i < tree_nodes; ++i) { + free_tree_contexts(&td->pc_tree[i], num_planes); + } + for (int i = 0; i < 3; ++i) { + aom_free(td->tree_coeff_buf[i]); + aom_free(td->tree_qcoeff_buf[i]); + aom_free(td->tree_dqcoeff_buf[i]); + td->tree_coeff_buf[i] = NULL; + td->tree_qcoeff_buf[i] = NULL; + td->tree_dqcoeff_buf[i] = NULL; + } + aom_free(td->pc_tree); + td->pc_tree = NULL; + } +} + +void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx, + PICK_MODE_CONTEXT *src_ctx) { + dst_ctx->mic = src_ctx->mic; + dst_ctx->mbmi_ext_best = src_ctx->mbmi_ext_best; + + dst_ctx->num_4x4_blk = src_ctx->num_4x4_blk; + dst_ctx->skippable = src_ctx->skippable; +#if CONFIG_INTERNAL_STATS + dst_ctx->best_mode_index = src_ctx->best_mode_index; +#endif // CONFIG_INTERNAL_STATS + + memcpy(dst_ctx->blk_skip, src_ctx->blk_skip, + sizeof(uint8_t) * src_ctx->num_4x4_blk); + av1_copy_array(dst_ctx->tx_type_map, src_ctx->tx_type_map, + src_ctx->num_4x4_blk); + + dst_ctx->hybrid_pred_diff = src_ctx->hybrid_pred_diff; + dst_ctx->comp_pred_diff = src_ctx->comp_pred_diff; + dst_ctx->single_pred_diff = src_ctx->single_pred_diff; + + dst_ctx->rd_stats = src_ctx->rd_stats; + dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready; + + memcpy(dst_ctx->pred_mv, src_ctx->pred_mv, sizeof(MV) * REF_FRAMES); + + dst_ctx->partition = src_ctx->partition; +} diff --git a/libs/libaom/src/av1/encoder/context_tree.h b/libs/libaom/src/av1/encoder/context_tree.h new file mode 100644 index 000000000..a39979413 --- /dev/null +++ b/libs/libaom/src/av1/encoder/context_tree.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_CONTEXT_TREE_H_ +#define AOM_AV1_ENCODER_CONTEXT_TREE_H_ + +#include "config/aom_config.h" + +#include "av1/common/blockd.h" +#include "av1/encoder/block.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct AV1Common; +struct ThreadData; + +// Structure to hold snapshot of coding context during the mode picking process +typedef struct { + MB_MODE_INFO mic; + MB_MODE_INFO_EXT_FRAME mbmi_ext_best; + uint8_t *color_index_map[2]; + uint8_t *blk_skip; + + tran_low_t *coeff[MAX_MB_PLANE]; + tran_low_t *qcoeff[MAX_MB_PLANE]; + tran_low_t *dqcoeff[MAX_MB_PLANE]; + uint16_t *eobs[MAX_MB_PLANE]; + uint8_t *txb_entropy_ctx[MAX_MB_PLANE]; + uint8_t *tx_type_map; + + int num_4x4_blk; + // For current partition, only if all Y, U, and V transform blocks' + // coefficients are quantized to 0, skippable is set to 1. + int skippable; +#if CONFIG_INTERNAL_STATS + THR_MODES best_mode_index; +#endif // CONFIG_INTERNAL_STATS + int hybrid_pred_diff; + int comp_pred_diff; + int single_pred_diff; + + RD_STATS rd_stats; + + int rd_mode_is_ready; // Flag to indicate whether rd pick mode decision has + // been made. + + // motion vector cache for adaptive motion search control in partition + // search loop + MV pred_mv[REF_FRAMES]; + PARTITION_TYPE partition; +} PICK_MODE_CONTEXT; + +typedef struct PC_TREE { + PARTITION_TYPE partitioning; + BLOCK_SIZE block_size; + PICK_MODE_CONTEXT none; + PICK_MODE_CONTEXT horizontal[2]; + PICK_MODE_CONTEXT vertical[2]; + PICK_MODE_CONTEXT horizontala[3]; + PICK_MODE_CONTEXT horizontalb[3]; + PICK_MODE_CONTEXT verticala[3]; + PICK_MODE_CONTEXT verticalb[3]; + PICK_MODE_CONTEXT horizontal4[4]; + PICK_MODE_CONTEXT vertical4[4]; + struct PC_TREE *split[4]; + int index; + + // Simple motion search_features + FULLPEL_MV start_mvs[REF_FRAMES]; + unsigned int sms_none_feat[2]; + unsigned int sms_rect_feat[8]; + int sms_none_valid; + int sms_rect_valid; +} PC_TREE; + +void av1_setup_pc_tree(struct AV1_COMP *const cpi, struct ThreadData *td); +void av1_free_pc_tree(const struct AV1_COMP *const cpi, struct ThreadData *td, + const int num_planes, BLOCK_SIZE sb_size); +void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx, + PICK_MODE_CONTEXT *src_ctx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_CONTEXT_TREE_H_ diff --git a/libs/libaom/src/av1/encoder/corner_detect.c b/libs/libaom/src/av1/encoder/corner_detect.c new file mode 100644 index 000000000..597bb30fc --- /dev/null +++ b/libs/libaom/src/av1/encoder/corner_detect.c @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "third_party/fastfeat/fast.h"
+
+#include "av1/encoder/corner_detect.h"
+
+// Fast_9 wrapper
+#define FAST_BARRIER 18
+int av1_fast_corner_detect(unsigned char *buf, int width, int height,
+                           int stride, int *points, int max_points) {
+  int num_points;
+  xy *const frm_corners_xy = aom_fast9_detect_nonmax(buf, width, height, stride,
+                                                     FAST_BARRIER, &num_points);
+  num_points = (num_points <= max_points ? num_points : max_points);
+  if (num_points > 0 && frm_corners_xy) {
+    memcpy(points, frm_corners_xy, sizeof(*frm_corners_xy) * num_points);
+    free(frm_corners_xy);
+    return num_points;
+  }
+  free(frm_corners_xy);
+  return 0;
+}
diff --git a/libs/libaom/src/av1/encoder/corner_detect.h b/libs/libaom/src/av1/encoder/corner_detect.h
new file mode 100644
index 000000000..15062f265
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/corner_detect.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_CORNER_DETECT_H_
+#define AOM_AV1_ENCODER_CORNER_DETECT_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+int av1_fast_corner_detect(unsigned char *buf, int width, int height,
+                           int stride, int *points, int max_points);
+
+#endif  // AOM_AV1_ENCODER_CORNER_DETECT_H_
diff --git a/libs/libaom/src/av1/encoder/corner_match.c b/libs/libaom/src/av1/encoder/corner_match.c
new file mode 100644
index 000000000..12f633b4f
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/corner_match.c
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/system_state.h"
+#include "av1/encoder/corner_match.h"
+
+#define SEARCH_SZ 9
+#define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2)
+
+#define THRESHOLD_NCC 0.75
+
+/* Compute var(im) * MATCH_SZ_SQ over a MATCH_SZ by MATCH_SZ window of im,
+   centered at (x, y).
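+
+   In effect, with N = MATCH_SZ_SQ samples x_i this returns
+     N * sum(x_i^2) - (sum(x_i))^2 = N^2 * var(x),
+   kept in integer form so that no division is needed. Its square root is
+   what av1_determine_correspondence() below multiplies with THRESHOLD_NCC,
+   compensating for the partial normalization in
+   av1_compute_cross_correlation().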
+*/ +static double compute_variance(unsigned char *im, int stride, int x, int y) { + int sum = 0; + int sumsq = 0; + int var; + int i, j; + for (i = 0; i < MATCH_SZ; ++i) + for (j = 0; j < MATCH_SZ; ++j) { + sum += im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; + sumsq += im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)] * + im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; + } + var = sumsq * MATCH_SZ_SQ - sum * sum; + return (double)var; +} + +/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the + correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows + of each image, centered at (x1, y1) and (x2, y2) respectively. +*/ +double av1_compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, + int y1, unsigned char *im2, int stride2, + int x2, int y2) { + int v1, v2; + int sum1 = 0; + int sum2 = 0; + int sumsq2 = 0; + int cross = 0; + int var2, cov; + int i, j; + for (i = 0; i < MATCH_SZ; ++i) + for (j = 0; j < MATCH_SZ; ++j) { + v1 = im1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)]; + v2 = im2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)]; + sum1 += v1; + sum2 += v2; + sumsq2 += v2 * v2; + cross += v1 * v2; + } + var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; + cov = cross * MATCH_SZ_SQ - sum1 * sum2; + aom_clear_system_state(); + return cov / sqrt((double)var2); +} + +static int is_eligible_point(int pointx, int pointy, int width, int height) { + return (pointx >= MATCH_SZ_BY2 && pointy >= MATCH_SZ_BY2 && + pointx + MATCH_SZ_BY2 < width && pointy + MATCH_SZ_BY2 < height); +} + +static int is_eligible_distance(int point1x, int point1y, int point2x, + int point2y, int width, int height) { + const int thresh = (width < height ? height : width) >> 4; + return ((point1x - point2x) * (point1x - point2x) + + (point1y - point2y) * (point1y - point2y)) <= thresh * thresh; +} + +static void improve_correspondence(unsigned char *frm, unsigned char *ref, + int width, int height, int frm_stride, + int ref_stride, + Correspondence *correspondences, + int num_correspondences) { + int i; + for (i = 0; i < num_correspondences; ++i) { + int x, y, best_x = 0, best_y = 0; + double best_match_ncc = 0.0; + for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) { + for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { + double match_ncc; + if (!is_eligible_point(correspondences[i].rx + x, + correspondences[i].ry + y, width, height)) + continue; + if (!is_eligible_distance(correspondences[i].x, correspondences[i].y, + correspondences[i].rx + x, + correspondences[i].ry + y, width, height)) + continue; + match_ncc = av1_compute_cross_correlation( + frm, frm_stride, correspondences[i].x, correspondences[i].y, ref, + ref_stride, correspondences[i].rx + x, correspondences[i].ry + y); + if (match_ncc > best_match_ncc) { + best_match_ncc = match_ncc; + best_y = y; + best_x = x; + } + } + } + correspondences[i].rx += best_x; + correspondences[i].ry += best_y; + } + for (i = 0; i < num_correspondences; ++i) { + int x, y, best_x = 0, best_y = 0; + double best_match_ncc = 0.0; + for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) + for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { + double match_ncc; + if (!is_eligible_point(correspondences[i].x + x, + correspondences[i].y + y, width, height)) + continue; + if (!is_eligible_distance( + correspondences[i].x + x, correspondences[i].y + y, + correspondences[i].rx, correspondences[i].ry, width, height)) + continue; + match_ncc = av1_compute_cross_correlation( + ref, ref_stride, 
correspondences[i].rx, correspondences[i].ry, frm, + frm_stride, correspondences[i].x + x, correspondences[i].y + y); + if (match_ncc > best_match_ncc) { + best_match_ncc = match_ncc; + best_y = y; + best_x = x; + } + } + correspondences[i].x += best_x; + correspondences[i].y += best_y; + } +} + +int av1_determine_correspondence(unsigned char *frm, int *frm_corners, + int num_frm_corners, unsigned char *ref, + int *ref_corners, int num_ref_corners, + int width, int height, int frm_stride, + int ref_stride, int *correspondence_pts) { + // TODO(sarahparker) Improve this to include 2-way match + int i, j; + Correspondence *correspondences = (Correspondence *)correspondence_pts; + int num_correspondences = 0; + for (i = 0; i < num_frm_corners; ++i) { + double best_match_ncc = 0.0; + double template_norm; + int best_match_j = -1; + if (!is_eligible_point(frm_corners[2 * i], frm_corners[2 * i + 1], width, + height)) + continue; + for (j = 0; j < num_ref_corners; ++j) { + double match_ncc; + if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width, + height)) + continue; + if (!is_eligible_distance(frm_corners[2 * i], frm_corners[2 * i + 1], + ref_corners[2 * j], ref_corners[2 * j + 1], + width, height)) + continue; + match_ncc = av1_compute_cross_correlation( + frm, frm_stride, frm_corners[2 * i], frm_corners[2 * i + 1], ref, + ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]); + if (match_ncc > best_match_ncc) { + best_match_ncc = match_ncc; + best_match_j = j; + } + } + // Note: We want to test if the best correlation is >= THRESHOLD_NCC, + // but need to account for the normalization in + // av1_compute_cross_correlation. + template_norm = compute_variance(frm, frm_stride, frm_corners[2 * i], + frm_corners[2 * i + 1]); + if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) { + correspondences[num_correspondences].x = frm_corners[2 * i]; + correspondences[num_correspondences].y = frm_corners[2 * i + 1]; + correspondences[num_correspondences].rx = ref_corners[2 * best_match_j]; + correspondences[num_correspondences].ry = + ref_corners[2 * best_match_j + 1]; + num_correspondences++; + } + } + improve_correspondence(frm, ref, width, height, frm_stride, ref_stride, + correspondences, num_correspondences); + return num_correspondences; +} diff --git a/libs/libaom/src/av1/encoder/corner_match.h b/libs/libaom/src/av1/encoder/corner_match.h new file mode 100644 index 000000000..3cf6de159 --- /dev/null +++ b/libs/libaom/src/av1/encoder/corner_match.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#ifndef AOM_AV1_ENCODER_CORNER_MATCH_H_
+#define AOM_AV1_ENCODER_CORNER_MATCH_H_
+
+#include <stdio.h>
+#include <memory.h>
+#include <stdlib.h>
+
+#define MATCH_SZ 13
+#define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2)
+#define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ)
+
+typedef struct {
+  int x, y;
+  int rx, ry;
+} Correspondence;
+
+int av1_determine_correspondence(unsigned char *frm, int *frm_corners,
+                                 int num_frm_corners, unsigned char *ref,
+                                 int *ref_corners, int num_ref_corners,
+                                 int width, int height, int frm_stride,
+                                 int ref_stride, int *correspondence_pts);
+
+#endif  // AOM_AV1_ENCODER_CORNER_MATCH_H_
diff --git a/libs/libaom/src/av1/encoder/cost.c b/libs/libaom/src/av1/encoder/cost.c
new file mode 100644
index 000000000..323e2aed5
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/cost.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "av1/encoder/cost.h"
+#include "av1/common/entropy.h"
+
+// round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT)); i = 128~255.
+const uint16_t av1_prob_cost[128] = {
+  512, 506, 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435,
+  430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, 366, 361,
+  356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, 307, 302, 298, 294,
+  289, 285, 281, 277, 273, 268, 264, 260, 256, 252, 248, 244, 240, 236, 232,
+  228, 224, 220, 216, 212, 209, 205, 201, 197, 194, 190, 186, 182, 179, 175,
+  171, 168, 164, 161, 157, 153, 150, 146, 143, 139, 136, 132, 129, 125, 122,
+  119, 115, 112, 109, 105, 102, 99,  95,  92,  89,  86,  82,  79,  76,  73,
+  70,  66,  63,  60,  57,  54,  51,  48,  45,  42,  38,  35,  32,  29,  26,
+  23,  20,  18,  15,  12,  9,   6,   3,
+};
+
+void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
+                              const int *inv_map) {
+  int i;
+  aom_cdf_prob prev_cdf = 0;
+  for (i = 0;; ++i) {
+    // Probability of symbol i in units of 1/2^15, from the CDF difference.
+    aom_cdf_prob p15 = AOM_ICDF(cdf[i]) - prev_cdf;
+    p15 = (p15 < EC_MIN_PROB) ? EC_MIN_PROB : p15;
+    prev_cdf = AOM_ICDF(cdf[i]);
+
+    if (inv_map)
+      costs[inv_map[i]] = av1_cost_symbol(p15);
+    else
+      costs[i] = av1_cost_symbol(p15);
+
+    // Stop once we reach the end of the CDF
+    if (cdf[i] == AOM_ICDF(CDF_PROB_TOP)) break;
+  }
+}
diff --git a/libs/libaom/src/av1/encoder/cost.h b/libs/libaom/src/av1/encoder/cost.h
new file mode 100644
index 000000000..be0241a82
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/cost.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AV1_ENCODER_COST_H_ +#define AOM_AV1_ENCODER_COST_H_ + +#include "aom_dsp/prob.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern const uint16_t av1_prob_cost[128]; + +// The factor to scale from cost in bits to cost in av1_prob_cost units. +#define AV1_PROB_COST_SHIFT 9 + +// Cost of coding an n bit literal, using 128 (i.e. 50%) probability +// for each bit. +#define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT)) + +// Calculate the cost of a symbol with probability p15 / 2^15 +static INLINE int av1_cost_symbol(aom_cdf_prob p15) { + // p15 can be out of range [1, CDF_PROB_TOP - 1]. Clamping it, so that the + // following cost calculation works correctly. Otherwise, if p15 = + // CDF_PROB_TOP, shift would be -1, and "p15 << shift" would be wrong. + p15 = (aom_cdf_prob)clamp(p15, 1, CDF_PROB_TOP - 1); + assert(0 < p15 && p15 < CDF_PROB_TOP); + const int shift = CDF_PROB_BITS - 1 - get_msb(p15); + const int prob = get_prob(p15 << shift, CDF_PROB_TOP); + assert(prob >= 128); + return av1_prob_cost[prob - 128] + av1_cost_literal(shift); +} + +void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf, + const int *inv_map); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_COST_H_ diff --git a/libs/libaom/src/av1/encoder/dwt.c b/libs/libaom/src/av1/encoder/dwt.c new file mode 100644 index 000000000..04088b25f --- /dev/null +++ b/libs/libaom/src/av1/encoder/dwt.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/dwt.h"
+
+// Note: block length must be even for this implementation
+static void analysis_53_row(int length, tran_low_t *x, tran_low_t *lowpass,
+                            tran_low_t *highpass) {
+  int n;
+  tran_low_t r, *a, *b;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  // (5,3) lifting, predict step: highpass = odd sample minus the rounded
+  // mean of its even neighbors; the lowpass lane temporarily holds 2x the
+  // even samples.
+  while (--n) {
+    *a++ = (r = *x++) * 2;
+    *b++ = *x - ((r + x[1] + 1) >> 1);
+    x++;
+  }
+  *a = (r = *x++) * 2;
+  *b = *x - r;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  // Update step: fold the rounded mean of adjacent highpass samples back
+  // into the lowpass lane.
+  while (n--) {
+    *a++ += (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+}
+
+static void analysis_53_col(int length, tran_low_t *x, tran_low_t *lowpass,
+                            tran_low_t *highpass) {
+  int n;
+  tran_low_t r, *a, *b;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    *a++ = (r = *x++);
+    *b++ = (((*x) * 2) - (r + x[1]) + 2) >> 2;
+    x++;
+  }
+  *a = (r = *x++);
+  *b = (*x - r + 1) >> 1;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ += (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+}
+
+static void dyadic_analyze_53_uint8_input(int levels, int width, int height,
+                                          uint8_t *x, int pitch_x,
+                                          tran_low_t *c, int pitch_c,
+                                          int dwt_scale_bits, int hbd) {
+  int lv, i, j, nh, nw, hh = height, hw = width;
+  tran_low_t buffer[2 * DWT_MAX_LENGTH];
+
+  if (hbd) {
+    uint16_t *x16 = CONVERT_TO_SHORTPTR(x);
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j++) {
+        c[i * pitch_c + j] = x16[i * pitch_x + j] << dwt_scale_bits;
+      }
+    }
+  } else {
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j++) {
+        c[i * pitch_c + j] = x[i * pitch_x + j] << dwt_scale_bits;
+      }
+    }
+  }
+
+  for (lv = 0; lv < levels; lv++) {
+    nh = hh;
+    hh = (hh + 1) >> 1;
+    nw = hw;
+    hw = (hw + 1) >> 1;
+    if ((nh < 2) || (nw < 2)) return;
+    for (i = 0; i < nh; i++) {
+      memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t));
+      analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
+    }
+    for (j = 0; j < nw; j++) {
+      for (i = 0; i < nh; i++) buffer[i + nh] = c[i * pitch_c + j];
+      analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
+      for (i = 0; i < nh; i++) c[i * pitch_c + j] = buffer[i];
+    }
+  }
+}
+
+void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride,
+                               int hbd) {
+  dyadic_analyze_53_uint8_input(4, 8, 8, input, stride, output, 8, 2, hbd);
+}
+
+int av1_haar_ac_sad(tran_low_t *output, int bw, int bh, int stride) {
+  int acsad = 0;
+
+  for (int r = 0; r < bh; ++r)
+    for (int c = 0; c < bw; ++c) {
+      if (r >= bh / 2 || c >= bw / 2) acsad += abs(output[r * stride + c]);
+    }
+  return acsad;
+}
+
+uint64_t av1_dct_ac_sad(tran_low_t *output, int bw, int bh, int stride) {
+  uint64_t acsad = 0;
+
+  for (int r = 0; r < bh; ++r)
+    for (int c = 0; c < bw; ++c) {
+      if (r > 0 || c > 0) acsad += abs(output[r * stride + c]);
+    }
+
+  return acsad;
+}
+
+uint32_t av1_variance(uint8_t *input, int bw, int bh, int stride) {
+  int sum = 0;
+  uint32_t sse = 0;
+
+  for (int r = 0; r < bh; ++r)
+    for (int c = 0; c < bw; ++c) {
+      sum += input[r * stride + c];
+      sse += input[r * stride + c] * input[r * stride + c];
+    }
+  return sse - (uint32_t)(((int64_t)sum * sum) / (bw * bh));
+}
+
+int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd) {
+  tran_low_t output[64];
+
+  av1_fdwt8x8_uint8_input_c(input, output, stride, hbd);
+  return av1_haar_ac_sad(output, 8, 8, 8);
+}
diff --git a/libs/libaom/src/av1/encoder/dwt.h b/libs/libaom/src/av1/encoder/dwt.h
new file mode 100644
index 000000000..37306c6a5
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/dwt.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_DWT_H_ +#define AOM_AV1_ENCODER_DWT_H_ + +#include "av1/common/common.h" +#include "av1/common/enums.h" + +#define DWT_MAX_LENGTH 64 + +void av1_fdwt8x8(tran_low_t *input, tran_low_t *output, int stride); +void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride, + int hbd); +int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd); + +#endif // AOM_AV1_ENCODER_DWT_H_ diff --git a/libs/libaom/src/av1/encoder/enc_enums.h b/libs/libaom/src/av1/encoder/enc_enums.h new file mode 100644 index 000000000..5a0651483 --- /dev/null +++ b/libs/libaom/src/av1/encoder/enc_enums.h @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENC_ENUMS_H_ +#define AOM_AV1_ENCODER_ENC_ENUMS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// This enumerator type needs to be kept aligned with the mode order in +// const MODE_DEFINITION av1_mode_defs[MAX_MODES] used in the rd code. 
+enum { + THR_NEARESTMV, + THR_NEARESTL2, + THR_NEARESTL3, + THR_NEARESTB, + THR_NEARESTA2, + THR_NEARESTA, + THR_NEARESTG, + + THR_NEWMV, + THR_NEWL2, + THR_NEWL3, + THR_NEWB, + THR_NEWA2, + THR_NEWA, + THR_NEWG, + + THR_NEARMV, + THR_NEARL2, + THR_NEARL3, + THR_NEARB, + THR_NEARA2, + THR_NEARA, + THR_NEARG, + + THR_GLOBALMV, + THR_GLOBALL2, + THR_GLOBALL3, + THR_GLOBALB, + THR_GLOBALA2, + THR_GLOBALA, + THR_GLOBALG, + + THR_COMP_NEAREST_NEARESTLA, + THR_COMP_NEAREST_NEARESTL2A, + THR_COMP_NEAREST_NEARESTL3A, + THR_COMP_NEAREST_NEARESTGA, + THR_COMP_NEAREST_NEARESTLB, + THR_COMP_NEAREST_NEARESTL2B, + THR_COMP_NEAREST_NEARESTL3B, + THR_COMP_NEAREST_NEARESTGB, + THR_COMP_NEAREST_NEARESTLA2, + THR_COMP_NEAREST_NEARESTL2A2, + THR_COMP_NEAREST_NEARESTL3A2, + THR_COMP_NEAREST_NEARESTGA2, + THR_COMP_NEAREST_NEARESTLL2, + THR_COMP_NEAREST_NEARESTLL3, + THR_COMP_NEAREST_NEARESTLG, + THR_COMP_NEAREST_NEARESTBA, + + THR_COMP_NEAR_NEARLA, + THR_COMP_NEW_NEARESTLA, + THR_COMP_NEAREST_NEWLA, + THR_COMP_NEW_NEARLA, + THR_COMP_NEAR_NEWLA, + THR_COMP_NEW_NEWLA, + THR_COMP_GLOBAL_GLOBALLA, + + THR_COMP_NEAR_NEARL2A, + THR_COMP_NEW_NEARESTL2A, + THR_COMP_NEAREST_NEWL2A, + THR_COMP_NEW_NEARL2A, + THR_COMP_NEAR_NEWL2A, + THR_COMP_NEW_NEWL2A, + THR_COMP_GLOBAL_GLOBALL2A, + + THR_COMP_NEAR_NEARL3A, + THR_COMP_NEW_NEARESTL3A, + THR_COMP_NEAREST_NEWL3A, + THR_COMP_NEW_NEARL3A, + THR_COMP_NEAR_NEWL3A, + THR_COMP_NEW_NEWL3A, + THR_COMP_GLOBAL_GLOBALL3A, + + THR_COMP_NEAR_NEARGA, + THR_COMP_NEW_NEARESTGA, + THR_COMP_NEAREST_NEWGA, + THR_COMP_NEW_NEARGA, + THR_COMP_NEAR_NEWGA, + THR_COMP_NEW_NEWGA, + THR_COMP_GLOBAL_GLOBALGA, + + THR_COMP_NEAR_NEARLB, + THR_COMP_NEW_NEARESTLB, + THR_COMP_NEAREST_NEWLB, + THR_COMP_NEW_NEARLB, + THR_COMP_NEAR_NEWLB, + THR_COMP_NEW_NEWLB, + THR_COMP_GLOBAL_GLOBALLB, + + THR_COMP_NEAR_NEARL2B, + THR_COMP_NEW_NEARESTL2B, + THR_COMP_NEAREST_NEWL2B, + THR_COMP_NEW_NEARL2B, + THR_COMP_NEAR_NEWL2B, + THR_COMP_NEW_NEWL2B, + THR_COMP_GLOBAL_GLOBALL2B, + + THR_COMP_NEAR_NEARL3B, + THR_COMP_NEW_NEARESTL3B, + THR_COMP_NEAREST_NEWL3B, + THR_COMP_NEW_NEARL3B, + THR_COMP_NEAR_NEWL3B, + THR_COMP_NEW_NEWL3B, + THR_COMP_GLOBAL_GLOBALL3B, + + THR_COMP_NEAR_NEARGB, + THR_COMP_NEW_NEARESTGB, + THR_COMP_NEAREST_NEWGB, + THR_COMP_NEW_NEARGB, + THR_COMP_NEAR_NEWGB, + THR_COMP_NEW_NEWGB, + THR_COMP_GLOBAL_GLOBALGB, + + THR_COMP_NEAR_NEARLA2, + THR_COMP_NEW_NEARESTLA2, + THR_COMP_NEAREST_NEWLA2, + THR_COMP_NEW_NEARLA2, + THR_COMP_NEAR_NEWLA2, + THR_COMP_NEW_NEWLA2, + THR_COMP_GLOBAL_GLOBALLA2, + + THR_COMP_NEAR_NEARL2A2, + THR_COMP_NEW_NEARESTL2A2, + THR_COMP_NEAREST_NEWL2A2, + THR_COMP_NEW_NEARL2A2, + THR_COMP_NEAR_NEWL2A2, + THR_COMP_NEW_NEWL2A2, + THR_COMP_GLOBAL_GLOBALL2A2, + + THR_COMP_NEAR_NEARL3A2, + THR_COMP_NEW_NEARESTL3A2, + THR_COMP_NEAREST_NEWL3A2, + THR_COMP_NEW_NEARL3A2, + THR_COMP_NEAR_NEWL3A2, + THR_COMP_NEW_NEWL3A2, + THR_COMP_GLOBAL_GLOBALL3A2, + + THR_COMP_NEAR_NEARGA2, + THR_COMP_NEW_NEARESTGA2, + THR_COMP_NEAREST_NEWGA2, + THR_COMP_NEW_NEARGA2, + THR_COMP_NEAR_NEWGA2, + THR_COMP_NEW_NEWGA2, + THR_COMP_GLOBAL_GLOBALGA2, + + THR_COMP_NEAR_NEARLL2, + THR_COMP_NEW_NEARESTLL2, + THR_COMP_NEAREST_NEWLL2, + THR_COMP_NEW_NEARLL2, + THR_COMP_NEAR_NEWLL2, + THR_COMP_NEW_NEWLL2, + THR_COMP_GLOBAL_GLOBALLL2, + + THR_COMP_NEAR_NEARLL3, + THR_COMP_NEW_NEARESTLL3, + THR_COMP_NEAREST_NEWLL3, + THR_COMP_NEW_NEARLL3, + THR_COMP_NEAR_NEWLL3, + THR_COMP_NEW_NEWLL3, + THR_COMP_GLOBAL_GLOBALLL3, + + THR_COMP_NEAR_NEARLG, + THR_COMP_NEW_NEARESTLG, + THR_COMP_NEAREST_NEWLG, + THR_COMP_NEW_NEARLG, + 
THR_COMP_NEAR_NEWLG,
+  THR_COMP_NEW_NEWLG,
+  THR_COMP_GLOBAL_GLOBALLG,
+
+  THR_COMP_NEAR_NEARBA,
+  THR_COMP_NEW_NEARESTBA,
+  THR_COMP_NEAREST_NEWBA,
+  THR_COMP_NEW_NEARBA,
+  THR_COMP_NEAR_NEWBA,
+  THR_COMP_NEW_NEWBA,
+  THR_COMP_GLOBAL_GLOBALBA,
+
+  THR_DC,
+  THR_PAETH,
+  THR_SMOOTH,
+  THR_SMOOTH_V,
+  THR_SMOOTH_H,
+  THR_H_PRED,
+  THR_V_PRED,
+  THR_D135_PRED,
+  THR_D203_PRED,
+  THR_D157_PRED,
+  THR_D67_PRED,
+  THR_D113_PRED,
+  THR_D45_PRED,
+
+  MAX_MODES,
+  SINGLE_REF_MODE_START = THR_NEARESTMV,
+  SINGLE_REF_MODE_END = THR_COMP_NEAREST_NEARESTLA,
+  NUM_SINGLE_REF_MODES = SINGLE_REF_MODE_END - SINGLE_REF_MODE_START,
+  THR_MODE_START = THR_NEARESTMV,
+  THR_MODE_END = MAX_MODES,
+  THR_INVALID = 255
+} UENUM1BYTE(THR_MODES);
+
+enum {
+  THR_LAST,
+  THR_LAST2,
+  THR_LAST3,
+  THR_BWDR,
+  THR_ALTR2,
+  THR_GOLD,
+  THR_ALTR,
+
+  THR_COMP_LA,
+  THR_COMP_L2A,
+  THR_COMP_L3A,
+  THR_COMP_GA,
+
+  THR_COMP_LB,
+  THR_COMP_L2B,
+  THR_COMP_L3B,
+  THR_COMP_GB,
+
+  THR_COMP_LA2,
+  THR_COMP_L2A2,
+  THR_COMP_L3A2,
+  THR_COMP_GA2,
+
+  THR_INTRA,
+
+  MAX_REFS
+} UENUM1BYTE(THR_MODES_SUB8X8);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ENC_ENUMS_H_
diff --git a/libs/libaom/src/av1/encoder/encode_strategy.c b/libs/libaom/src/av1/encoder/encode_strategy.c
new file mode 100644
index 000000000..8eb73d8d3
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/encode_strategy.c
@@ -0,0 +1,1322 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "aom_ports/system_state.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/tpl_model.h"
+
+#define TEMPORAL_FILTER_KEY_FRAME (CONFIG_REALTIME_ONLY ? 0 : 1)
+
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+                                  EncodeFrameParams *const frame_params,
+                                  const FRAME_UPDATE_TYPE type,
+                                  int force_refresh_all) {
+  // NOTE(weitinglin): Should we define another function to take care of
+  // cpi->rc.is_$Source_Type to make this function as it is in the comment?
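+  //
+  // Summary of the switch below (refresh_golden / refresh_bwd / refresh_alt):
+  //   KF_UPDATE:            1 1 1      LF_UPDATE:        0 0 0
+  //   GF_UPDATE:            1 0 0      OVERLAY_UPDATE:   1 0 0 (*)
+  //   ARF_UPDATE:           0 0 1      INTNL_ARF_UPDATE: 0 1 0
+  //   INTNL_OVERLAY_UPDATE: 0 0 0 (*)
+  //   (*) also sets cpi->rc.is_src_frame_alt_ref = 1.
+  // External refresh flags and force_refresh_all may override the result.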
+
+  const ExternalFlags *const ext_flags = &cpi->ext_flags;
+  cpi->rc.is_src_frame_alt_ref = 0;
+
+  switch (type) {
+    case KF_UPDATE:
+      frame_params->refresh_golden_frame = 1;
+      frame_params->refresh_bwd_ref_frame = 1;
+      frame_params->refresh_alt_ref_frame = 1;
+      break;
+
+    case LF_UPDATE:
+      frame_params->refresh_golden_frame = 0;
+      frame_params->refresh_bwd_ref_frame = 0;
+      frame_params->refresh_alt_ref_frame = 0;
+      break;
+
+    case GF_UPDATE:
+      frame_params->refresh_golden_frame = 1;
+      frame_params->refresh_bwd_ref_frame = 0;
+      frame_params->refresh_alt_ref_frame = 0;
+      break;
+
+    case OVERLAY_UPDATE:
+      frame_params->refresh_golden_frame = 1;
+      frame_params->refresh_bwd_ref_frame = 0;
+      frame_params->refresh_alt_ref_frame = 0;
+
+      cpi->rc.is_src_frame_alt_ref = 1;
+      break;
+
+    case ARF_UPDATE:
+      frame_params->refresh_golden_frame = 0;
+      // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
+      frame_params->refresh_bwd_ref_frame = 0;
+      frame_params->refresh_alt_ref_frame = 1;
+      break;
+
+    case INTNL_OVERLAY_UPDATE:
+      frame_params->refresh_golden_frame = 0;
+      frame_params->refresh_bwd_ref_frame = 0;
+      frame_params->refresh_alt_ref_frame = 0;
+
+      cpi->rc.is_src_frame_alt_ref = 1;
+      break;
+
+    case INTNL_ARF_UPDATE:
+      frame_params->refresh_golden_frame = 0;
+      frame_params->refresh_bwd_ref_frame = 1;
+      frame_params->refresh_alt_ref_frame = 0;
+      break;
+
+    default: assert(0); break;
+  }
+
+  if (ext_flags->refresh_frame_flags_pending &&
+      (!is_stat_generation_stage(cpi))) {
+    frame_params->refresh_golden_frame = ext_flags->refresh_golden_frame;
+    frame_params->refresh_alt_ref_frame = ext_flags->refresh_alt_ref_frame;
+    frame_params->refresh_bwd_ref_frame = ext_flags->refresh_bwd_ref_frame;
+  }
+
+  if (force_refresh_all) {
+    frame_params->refresh_golden_frame = 1;
+    frame_params->refresh_bwd_ref_frame = 1;
+    frame_params->refresh_alt_ref_frame = 1;
+  }
+}
+
+static void set_additional_frame_flags(const AV1_COMMON *const cm,
+                                       unsigned int *const frame_flags) {
+  if (frame_is_intra_only(cm)) {
+    *frame_flags |= FRAMEFLAGS_INTRAONLY;
+  }
+  if (frame_is_sframe(cm)) {
+    *frame_flags |= FRAMEFLAGS_SWITCH;
+  }
+  if (cm->features.error_resilient_mode) {
+    *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT;
+  }
+}
+
+static INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+  if (cpi->common.show_frame) {
+    if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
+        cpi->common.current_frame.frame_type == KEY_FRAME) {
+      // If this is a show_existing_frame with a source other than altref,
+      // or if it is not a displayed forward keyframe, the keyframe update
+      // counters were incremented when it was originally encoded.
+      cpi->rc.frames_since_key++;
+      cpi->rc.frames_to_key--;
+    }
+  }
+}
+
+static INLINE int is_frame_droppable(const SVC *const svc,
+                                     const ExternalFlags *const ext_flags) {
+  // A droppable frame is only used with external refresh flags; a VoD
+  // setting won't trigger this use case.
+  if (svc->external_ref_frame_config)
+    return svc->non_reference_frame;
+  else if (ext_flags->refresh_frame_flags_pending)
+    return !(ext_flags->refresh_alt_ref_frame ||
+             ext_flags->refresh_alt2_ref_frame ||
+             ext_flags->refresh_bwd_ref_frame ||
+             ext_flags->refresh_golden_frame || ext_flags->refresh_last_frame);
+  else
+    return 0;
+}
+
+static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+  // TODO(weitinglin): Updating this counter for is_frame_droppable
+  // is a work-around to handle the condition when a frame is dropped.
+ // We should fix the cpi->common.show_frame flag + // instead of checking the other condition to update the counter properly. + if (cpi->common.show_frame || + is_frame_droppable(&cpi->svc, &cpi->ext_flags)) { + // Decrement count down till next gf + if (cpi->rc.frames_till_gf_update_due > 0) + cpi->rc.frames_till_gf_update_due--; + } +} + +static INLINE void update_gf_group_index(AV1_COMP *cpi) { + // Increment the gf group index ready for the next frame. If this is + // a show_existing_frame with a source other than altref, or if it is not + // a displayed forward keyframe, the index was incremented when it was + // originally encoded. + if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref || + cpi->common.current_frame.frame_type == KEY_FRAME) { + ++cpi->gf_group.index; + } +} + +static void update_rc_counts(AV1_COMP *cpi) { + update_keyframe_counters(cpi); + update_frames_till_gf_update(cpi); + update_gf_group_index(cpi); +} + +static void set_ext_overrides(AV1_COMMON *const cm, + EncodeFrameParams *const frame_params, + ExternalFlags *const ext_flags) { + // Overrides the defaults with the externally supplied values with + // av1_update_reference() and av1_update_entropy() calls + // Note: The overrides are valid only for the next frame passed + // to av1_encode_lowlevel() + + if (ext_flags->use_s_frame) { + frame_params->frame_type = S_FRAME; + } + + if (ext_flags->refresh_frame_context_pending) { + cm->features.refresh_frame_context = ext_flags->refresh_frame_context; + ext_flags->refresh_frame_context_pending = 0; + } + cm->features.allow_ref_frame_mvs = ext_flags->use_ref_frame_mvs; + + frame_params->error_resilient_mode = ext_flags->use_error_resilient; + // A keyframe is already error resilient and keyframes with + // error_resilient_mode interferes with the use of show_existing_frame + // when forward reference keyframes are enabled. + frame_params->error_resilient_mode &= frame_params->frame_type != KEY_FRAME; + // For bitstream conformance, s-frames must be error-resilient + frame_params->error_resilient_mode |= frame_params->frame_type == S_FRAME; +} + +static int get_current_frame_ref_type( + const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) { + // We choose the reference "type" of this frame from the flags which indicate + // which reference frames will be refreshed by it. More than one of these + // flags may be set, so the order here implies an order of precedence. This is + // just used to choose the primary_ref_frame (as the most recent reference + // buffer of the same reference-type as the current frame) + + (void)frame_params; + // TODO(jingning): This table should be a lot simpler with the new + // ARF system in place. Keep frame_params for the time being as we are + // still evaluating a few design options. + switch (cpi->gf_group.layer_depth[cpi->gf_group.index]) { + case 0: return 0; + case 1: return 1; + case MAX_ARF_LAYERS: + case MAX_ARF_LAYERS + 1: return 4; + default: return 7; + } +} + +static int choose_primary_ref_frame( + const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) { + const AV1_COMMON *const cm = &cpi->common; + + const int intra_only = frame_params->frame_type == KEY_FRAME || + frame_params->frame_type == INTRA_ONLY_FRAME; + if (intra_only || frame_params->error_resilient_mode || cpi->use_svc || + cpi->ext_flags.use_primary_ref_none) { + return PRIMARY_REF_NONE; + } + + // In large scale case, always use Last frame's frame contexts. 
+ // Note(yunqing): In other cases, primary_ref_frame is chosen based on + // cpi->gf_group.layer_depth[cpi->gf_group.index], which also controls + // frame bit allocation. + if (cm->tiles.large_scale) return (LAST_FRAME - LAST_FRAME); + + // Find the most recent reference frame with the same reference type as the + // current frame + const int current_ref_type = get_current_frame_ref_type(cpi, frame_params); + int wanted_fb = cpi->fb_of_context_type[current_ref_type]; + + int primary_ref_frame = PRIMARY_REF_NONE; + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) { + primary_ref_frame = ref_frame - LAST_FRAME; + } + } + + return primary_ref_frame; +} + +static void update_fb_of_context_type( + const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params, + int *const fb_of_context_type) { + const AV1_COMMON *const cm = &cpi->common; + const int current_frame_ref_type = + get_current_frame_ref_type(cpi, frame_params); + + if (frame_is_intra_only(cm) || cm->features.error_resilient_mode || + cpi->ext_flags.use_primary_ref_none) { + for (int i = 0; i < REF_FRAMES; i++) { + fb_of_context_type[i] = -1; + } + fb_of_context_type[current_frame_ref_type] = + cm->show_frame ? get_ref_frame_map_idx(cm, GOLDEN_FRAME) + : get_ref_frame_map_idx(cm, ALTREF_FRAME); + } + + if (!encode_show_existing_frame(cm)) { + // Refresh fb_of_context_type[]: see encoder.h for explanation + if (cm->current_frame.frame_type == KEY_FRAME) { + // All ref frames are refreshed, pick one that will live long enough + fb_of_context_type[current_frame_ref_type] = 0; + } else { + // If more than one frame is refreshed, it doesn't matter which one we + // pick so pick the first. LST sometimes doesn't refresh any: this is ok + + for (int i = 0; i < REF_FRAMES; i++) { + if (cm->current_frame.refresh_frame_flags & (1 << i)) { + fb_of_context_type[current_frame_ref_type] = i; + break; + } + } + } + } +} + +static int get_order_offset(const GF_GROUP *const gf_group, + const EncodeFrameParams *const frame_params) { + // shown frame by definition has order offset 0 + // show_existing_frame ignores order_offset and simply takes the order_hint + // from the reference frame being shown. + if (frame_params->show_frame || frame_params->show_existing_frame) return 0; + + const int arf_offset = + AOMMIN((MAX_GF_INTERVAL - 1), gf_group->arf_src_offset[gf_group->index]); + return AOMMIN((MAX_GF_INTERVAL - 1), arf_offset); +} + +static void adjust_frame_rate(AV1_COMP *cpi, int64_t ts_start, int64_t ts_end) { + TimeStamps *time_stamps = &cpi->time_stamps; + int64_t this_duration; + int step = 0; + + // Clear down mmx registers + aom_clear_system_state(); + + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) { + cpi->framerate = cpi->svc.base_framerate; + av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height); + return; + } + + if (ts_start == time_stamps->first_ever) { + this_duration = ts_end - ts_start; + step = 1; + } else { + int64_t last_duration = + time_stamps->prev_end_seen - time_stamps->prev_start_seen; + + this_duration = ts_end - time_stamps->prev_end_seen; + + // do a step update if the duration changes by 10% + if (last_duration) + step = (int)((this_duration - last_duration) * 10 / last_duration); + } + + if (this_duration) { + if (step) { + av1_new_framerate(cpi, 10000000.0 / this_duration); + } else { + // Average this frame's rate into the last second's average + // frame rate. 
If we haven't seen 1 second yet, then average
+      // over the whole interval seen.
+      const double interval =
+          AOMMIN((double)(ts_end - time_stamps->first_ever), 10000000.0);
+      double avg_duration = 10000000.0 / cpi->framerate;
+      avg_duration *= (interval - avg_duration + this_duration);
+      avg_duration /= interval;
+
+      av1_new_framerate(cpi, 10000000.0 / avg_duration);
+    }
+  }
+  time_stamps->prev_start_seen = ts_start;
+  time_stamps->prev_end_seen = ts_end;
+}
+
+// If this is an alt-ref, returns the offset of the source frame used
+// as the arf midpoint. Otherwise, returns 0.
+static int get_arf_src_index(GF_GROUP *gf_group, int pass) {
+  int arf_src_index = 0;
+  if (pass != 1) arf_src_index = gf_group->arf_src_offset[gf_group->index];
+  return arf_src_index;
+}
+
+// Called if this frame is an ARF or internal ARF; forward keyframes are also
+// handled here. *code_arf is set to 1 if the ARF was temporally filtered, so
+// that the correct post-filter buffer (cpi->alt_ref_buffer) can be used.
+static struct lookahead_entry *setup_arf_frame(
+    AV1_COMP *const cpi, const int arf_src_index, int *code_arf,
+    EncodeFrameParams *const frame_params, int *show_existing_alt_ref) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+#if !CONFIG_REALTIME_ONLY
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+#endif
+
+  assert(arf_src_index <= rc->frames_to_key);
+  *code_arf = 0;
+
+  struct lookahead_entry *source =
+      av1_lookahead_peek(cpi->lookahead, arf_src_index, cpi->compressor_stage);
+
+  if (source != NULL) {
+    cm->showable_frame = 1;
+
+    // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf
+    if (arf_src_index == rc->frames_to_key) {
+      // Skip temporal filtering and mark as intra_only if we have a fwd_kf
+      cpi->no_show_kf = 1;
+    } else {
+#if !CONFIG_REALTIME_ONLY
+      if (oxcf->arnr_max_frames > 0) {
+        // Produce the filtered ARF frame.
+        cm->current_frame.frame_type = INTER_FRAME;
+        FRAME_UPDATE_TYPE frame_update_type =
+            get_frame_update_type(&cpi->gf_group);
+        av1_configure_buffer_updates(cpi, frame_params, frame_update_type, 0);
+        *code_arf =
+            av1_temporal_filter(cpi, arf_src_index, show_existing_alt_ref);
+        if (*code_arf) {
+          aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm));
+        }
+      }
+#else
+      (void)show_existing_alt_ref;
+#endif
+    }
+    frame_params->show_frame = 0;
+  }
+  rc->source_alt_ref_pending = 0;
+  return source;
+}
+
+// Determine whether there is a forced keyframe pending in the lookahead buffer
+int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+                               const int up_to_index,
+                               const COMPRESSOR_STAGE compressor_stage) {
+  for (int i = 0; i <= up_to_index; i++) {
+    const struct lookahead_entry *e =
+        av1_lookahead_peek(lookahead, i, compressor_stage);
+    if (e == NULL) {
+      // We have reached the end of the lookahead buffer and not early-returned
+      // so there isn't a forced key-frame pending.
+      return -1;
+    } else if (e->flags == AOM_EFLAG_FORCE_KF) {
+      return (i + 1);
+    } else {
+      continue;
+    }
+  }
+  return -1;  // No forced key-frame found within up_to_index entries.
+}
+
+// Check if we should encode an ARF or internal ARF. If not, try a LAST.
+// Do some setup associated with the chosen source.
+// code_arf, flush, last_source and show_existing_alt_ref are outputs.
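+// Example: if the GF group schedules an ARF 16 frames ahead but the caller
+// forced a keyframe (AOM_EFLAG_FORCE_KF) within that range, the ARF is
+// abandoned (arf_src_index is reset to 0) and the lookahead is flushed
+// instead, unless rc_mode is AOM_Q.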
+// Return the frame source, or NULL if we couldn't find one +static struct lookahead_entry *choose_frame_source( + AV1_COMP *const cpi, int *const code_arf, int *const flush, + struct lookahead_entry **last_source, EncodeFrameParams *const frame_params, + int *show_existing_alt_ref) { + AV1_COMMON *const cm = &cpi->common; + struct lookahead_entry *source = NULL; + *code_arf = 0; + + // Should we encode an alt-ref frame. + int arf_src_index = get_arf_src_index(&cpi->gf_group, cpi->oxcf.pass); + // TODO(Aasaipriya): Forced key frames need to be fixed when rc_mode != AOM_Q + if (arf_src_index && + (is_forced_keyframe_pending(cpi->lookahead, arf_src_index, + cpi->compressor_stage) != -1) && + cpi->oxcf.rc_mode != AOM_Q) { + arf_src_index = 0; + *flush = 1; + } + + if (arf_src_index) + source = setup_arf_frame(cpi, arf_src_index, code_arf, frame_params, + show_existing_alt_ref); + + if (!source) { + // Get last frame source. + if (cm->current_frame.frame_number > 0) { + *last_source = + av1_lookahead_peek(cpi->lookahead, -1, cpi->compressor_stage); + } + // Read in the source frame. + source = av1_lookahead_pop(cpi->lookahead, *flush, cpi->compressor_stage); + if (source == NULL) return NULL; + frame_params->show_frame = 1; + } + return source; +} + +// Don't allow a show_existing_frame to coincide with an error resilient or +// S-Frame. An exception can be made in the case of a keyframe, since it does +// not depend on any previous frames. +static int allow_show_existing(const AV1_COMP *const cpi, + unsigned int frame_flags) { + if (cpi->common.current_frame.frame_number == 0) return 0; + + const struct lookahead_entry *lookahead_src = + av1_lookahead_peek(cpi->lookahead, 0, cpi->compressor_stage); + if (lookahead_src == NULL) return 1; + + const int is_error_resilient = + cpi->oxcf.error_resilient_mode || + (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT); + const int is_s_frame = + cpi->oxcf.s_frame_mode || (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME); + const int is_key_frame = + (cpi->rc.frames_to_key == 0) || (frame_flags & FRAMEFLAGS_KEY); + return !(is_error_resilient || is_s_frame) || is_key_frame; +} + +// Update frame_flags to tell the encoder's caller what sort of frame was +// encoded. 
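+// For example, a shown key frame reports FRAMEFLAGS_KEY, a frame refreshing
+// the golden and alt-ref buffers reports FRAMEFLAGS_GOLDEN | FRAMEFLAGS_ALTREF,
+// and a show_existing_frame clears all four flags.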
+static void update_frame_flags(AV1_COMP *cpi, unsigned int *frame_flags) { + if (encode_show_existing_frame(&cpi->common)) { + *frame_flags &= ~FRAMEFLAGS_GOLDEN; + *frame_flags &= ~FRAMEFLAGS_BWDREF; + *frame_flags &= ~FRAMEFLAGS_ALTREF; + *frame_flags &= ~FRAMEFLAGS_KEY; + return; + } + + if (cpi->refresh_golden_frame == 1) { + *frame_flags |= FRAMEFLAGS_GOLDEN; + } else { + *frame_flags &= ~FRAMEFLAGS_GOLDEN; + } + + if (cpi->refresh_alt_ref_frame == 1) { + *frame_flags |= FRAMEFLAGS_ALTREF; + } else { + *frame_flags &= ~FRAMEFLAGS_ALTREF; + } + + if (cpi->refresh_bwd_ref_frame == 1) { + *frame_flags |= FRAMEFLAGS_BWDREF; + } else { + *frame_flags &= ~FRAMEFLAGS_BWDREF; + } + + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + *frame_flags |= FRAMEFLAGS_KEY; + } else { + *frame_flags &= ~FRAMEFLAGS_KEY; + } +} + +#define DUMP_REF_FRAME_IMAGES 0 + +#if DUMP_REF_FRAME_IMAGES == 1 +static int dump_one_image(AV1_COMMON *cm, + const YV12_BUFFER_CONFIG *const ref_buf, + char *file_name) { + int h; + FILE *f_ref = NULL; + + if (ref_buf == NULL) { + printf("Frame data buffer is NULL.\n"); + return AOM_CODEC_MEM_ERROR; + } + + if ((f_ref = fopen(file_name, "wb")) == NULL) { + printf("Unable to open file %s to write.\n", file_name); + return AOM_CODEC_MEM_ERROR; + } + + // --- Y --- + for (h = 0; h < cm->height; ++h) { + fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref); + } + // --- U --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1), + f_ref); + } + // --- V --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1), + f_ref); + } + + fclose(f_ref); + + return AOM_CODEC_OK; +} + +static void dump_ref_frame_images(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MV_REFERENCE_FRAME ref_frame; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + char file_name[256] = ""; + snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv", + cm->current_frame.frame_number, ref_frame); + dump_one_image(cm, get_ref_frame_yv12_buf(cpi, ref_frame), file_name); + } +} +#endif // DUMP_REF_FRAME_IMAGES == 1 + +int av1_get_refresh_ref_frame_map(int refresh_frame_flags) { + int ref_map_index = INVALID_IDX; + + for (ref_map_index = 0; ref_map_index < REF_FRAMES; ++ref_map_index) + if ((refresh_frame_flags >> ref_map_index) & 1) break; + + return ref_map_index; +} + +static void update_arf_stack(int ref_map_index, + RefBufferStack *ref_buffer_stack) { + if (ref_buffer_stack->arf_stack_size >= 0) { + if (ref_buffer_stack->arf_stack[0] == ref_map_index) + stack_pop(ref_buffer_stack->arf_stack, &ref_buffer_stack->arf_stack_size); + } + + if (ref_buffer_stack->lst_stack_size) { + for (int i = ref_buffer_stack->lst_stack_size - 1; i >= 0; --i) { + if (ref_buffer_stack->lst_stack[i] == ref_map_index) { + for (int idx = i; idx < ref_buffer_stack->lst_stack_size - 1; ++idx) + ref_buffer_stack->lst_stack[idx] = + ref_buffer_stack->lst_stack[idx + 1]; + ref_buffer_stack->lst_stack[ref_buffer_stack->lst_stack_size - 1] = + INVALID_IDX; + --ref_buffer_stack->lst_stack_size; + } + } + } + + if (ref_buffer_stack->gld_stack_size) { + for (int i = ref_buffer_stack->gld_stack_size - 1; i >= 0; --i) { + if (ref_buffer_stack->gld_stack[i] == ref_map_index) { + for (int idx = i; idx < ref_buffer_stack->gld_stack_size - 1; ++idx) + ref_buffer_stack->gld_stack[idx] = + ref_buffer_stack->gld_stack[idx + 1]; + 
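+        // The matching entry was shifted out above; invalidate the duplicated
+        // tail slot and shrink the stack.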
ref_buffer_stack->gld_stack[ref_buffer_stack->gld_stack_size - 1] = + INVALID_IDX; + --ref_buffer_stack->gld_stack_size; + } + } + } +} + +// Update reference frame stack info. +void av1_update_ref_frame_map(AV1_COMP *cpi, + FRAME_UPDATE_TYPE frame_update_type, + int show_existing_frame, int ref_map_index, + RefBufferStack *ref_buffer_stack) { + AV1_COMMON *const cm = &cpi->common; + // TODO(jingning): Consider the S-frame same as key frame for the + // reference frame tracking purpose. The logic might be better + // expressed than converting the frame update type. + if (frame_is_sframe(cm)) frame_update_type = KEY_FRAME; + + if (is_frame_droppable(&cpi->svc, &cpi->ext_flags)) return; + + switch (frame_update_type) { + case KEY_FRAME: + if (show_existing_frame) + ref_map_index = stack_pop(ref_buffer_stack->arf_stack, + &ref_buffer_stack->arf_stack_size); + stack_reset(ref_buffer_stack->lst_stack, + &ref_buffer_stack->lst_stack_size); + stack_reset(ref_buffer_stack->gld_stack, + &ref_buffer_stack->gld_stack_size); + stack_reset(ref_buffer_stack->arf_stack, + &ref_buffer_stack->arf_stack_size); + stack_push(ref_buffer_stack->gld_stack, &ref_buffer_stack->gld_stack_size, + ref_map_index); + break; + case GF_UPDATE: + update_arf_stack(ref_map_index, ref_buffer_stack); + stack_push(ref_buffer_stack->gld_stack, &ref_buffer_stack->gld_stack_size, + ref_map_index); + // For nonrd_mode: update LAST as well on GF_UPDATE frame. + if (cpi->sf.rt_sf.use_nonrd_pick_mode) + stack_push(ref_buffer_stack->lst_stack, + &ref_buffer_stack->lst_stack_size, ref_map_index); + break; + case LF_UPDATE: + update_arf_stack(ref_map_index, ref_buffer_stack); + stack_push(ref_buffer_stack->lst_stack, &ref_buffer_stack->lst_stack_size, + ref_map_index); + break; + case ARF_UPDATE: + case INTNL_ARF_UPDATE: + update_arf_stack(ref_map_index, ref_buffer_stack); + stack_push(ref_buffer_stack->arf_stack, &ref_buffer_stack->arf_stack_size, + ref_map_index); + break; + case OVERLAY_UPDATE: + ref_map_index = stack_pop(ref_buffer_stack->arf_stack, + &ref_buffer_stack->arf_stack_size); + stack_push(ref_buffer_stack->gld_stack, &ref_buffer_stack->gld_stack_size, + ref_map_index); + break; + case INTNL_OVERLAY_UPDATE: + ref_map_index = stack_pop(ref_buffer_stack->arf_stack, + &ref_buffer_stack->arf_stack_size); + stack_push(ref_buffer_stack->lst_stack, &ref_buffer_stack->lst_stack_size, + ref_map_index); + break; + default: assert(0 && "unknown type"); + } + return; +} + +static int get_free_ref_map_index(const RefBufferStack *ref_buffer_stack) { + for (int idx = 0; idx < REF_FRAMES; ++idx) { + int is_free = 1; + for (int i = 0; i < ref_buffer_stack->arf_stack_size; ++i) { + if (ref_buffer_stack->arf_stack[i] == idx) { + is_free = 0; + break; + } + } + + for (int i = 0; i < ref_buffer_stack->lst_stack_size; ++i) { + if (ref_buffer_stack->lst_stack[i] == idx) { + is_free = 0; + break; + } + } + + for (int i = 0; i < ref_buffer_stack->gld_stack_size; ++i) { + if (ref_buffer_stack->gld_stack[i] == idx) { + is_free = 0; + break; + } + } + + if (is_free) return idx; + } + return INVALID_IDX; +} + +int av1_get_refresh_frame_flags(const AV1_COMP *const cpi, + const EncodeFrameParams *const frame_params, + FRAME_UPDATE_TYPE frame_update_type, + const RefBufferStack *const ref_buffer_stack) { + const AV1_COMMON *const cm = &cpi->common; + const ExternalFlags *const ext_flags = &cpi->ext_flags; + const SVC *const svc = &cpi->svc; + // Switch frames and shown key-frames overwrite all reference slots + if ((frame_params->frame_type == KEY_FRAME 
&& frame_params->show_frame) || + frame_params->frame_type == S_FRAME) + return 0xFF; + + // show_existing_frames don't actually send refresh_frame_flags so set the + // flags to 0 to keep things consistent. + if (frame_params->show_existing_frame && + (!frame_params->error_resilient_mode || + frame_params->frame_type == KEY_FRAME)) { + return 0; + } + + if (is_frame_droppable(svc, ext_flags)) return 0; + + int refresh_mask = 0; + + if (ext_flags->refresh_frame_flags_pending) { + if (svc->external_ref_frame_config) { + for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { + int ref_frame_map_idx = svc->ref_idx[i]; + refresh_mask |= svc->refresh[ref_frame_map_idx] << ref_frame_map_idx; + } + return refresh_mask; + } + // Unfortunately the encoder interface reflects the old refresh_*_frame + // flags so we have to replicate the old refresh_frame_flags logic here in + // order to preserve the behaviour of the flag overrides. + int ref_frame_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_flags->refresh_last_frame << ref_frame_map_idx; + + ref_frame_map_idx = get_ref_frame_map_idx(cm, EXTREF_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_flags->refresh_bwd_ref_frame << ref_frame_map_idx; + + ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF2_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_flags->refresh_alt2_ref_frame << ref_frame_map_idx; + + if (frame_update_type == OVERLAY_UPDATE) { + ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_flags->refresh_golden_frame << ref_frame_map_idx; + } else { + ref_frame_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_flags->refresh_golden_frame << ref_frame_map_idx; + + ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_flags->refresh_alt_ref_frame << ref_frame_map_idx; + } + return refresh_mask; + } + + // Search for the open slot to store the current frame. 
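+  // If no slot is free, the switch below falls back to a slot that is already
+  // held by one of the reference stacks, chosen according to the update type.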
+ int free_fb_index = get_free_ref_map_index(ref_buffer_stack); + switch (frame_update_type) { + case KF_UPDATE: + case GF_UPDATE: + if (free_fb_index != INVALID_IDX) { + refresh_mask = 1 << free_fb_index; + } else { + if (ref_buffer_stack->gld_stack_size) + refresh_mask = + 1 << ref_buffer_stack + ->gld_stack[ref_buffer_stack->gld_stack_size - 1]; + else + refresh_mask = + 1 << ref_buffer_stack + ->lst_stack[ref_buffer_stack->lst_stack_size - 1]; + } + break; + case LF_UPDATE: + if (free_fb_index != INVALID_IDX) { + refresh_mask = 1 << free_fb_index; + } else { + if (ref_buffer_stack->lst_stack_size >= 2) + refresh_mask = + 1 << ref_buffer_stack + ->lst_stack[ref_buffer_stack->lst_stack_size - 1]; + else if (ref_buffer_stack->gld_stack_size >= 2) + refresh_mask = + 1 << ref_buffer_stack + ->gld_stack[ref_buffer_stack->gld_stack_size - 1]; + else + assert(0 && "No ref map index found"); + } + break; + case ARF_UPDATE: + if (free_fb_index != INVALID_IDX) { + refresh_mask = 1 << free_fb_index; + } else { + if (ref_buffer_stack->gld_stack_size >= 3) + refresh_mask = + 1 << ref_buffer_stack + ->gld_stack[ref_buffer_stack->gld_stack_size - 1]; + else if (ref_buffer_stack->lst_stack_size >= 2) + refresh_mask = + 1 << ref_buffer_stack + ->lst_stack[ref_buffer_stack->lst_stack_size - 1]; + else + assert(0 && "No ref map index found"); + } + break; + case INTNL_ARF_UPDATE: + if (free_fb_index != INVALID_IDX) { + refresh_mask = 1 << free_fb_index; + } else { + refresh_mask = + 1 << ref_buffer_stack + ->lst_stack[ref_buffer_stack->lst_stack_size - 1]; + } + break; + case OVERLAY_UPDATE: break; + case INTNL_OVERLAY_UPDATE: break; + default: assert(0); break; + } + + return refresh_mask; +} + +#if !CONFIG_REALTIME_ONLY +void setup_mi(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *src) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + + av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params.sb_size); + + av1_setup_block_planes(xd, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y, num_planes); + + set_mi_offsets(&cm->mi_params, xd, 0, 0); +} + +// Apply temporal filtering to key frames and encode the filtered frame. +// If the current frame is not key frame, this function is identical to +// av1_encode(). +static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, + EncodeFrameInput *const frame_input, + EncodeFrameParams *const frame_params, + EncodeFrameResults *const frame_results) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + AV1_COMMON *const cm = &cpi->common; + + // Decide whether to apply temporal filtering to the source frame. + int apply_filtering = + frame_params->frame_type == KEY_FRAME && + oxcf->enable_keyframe_filtering && !is_stat_generation_stage(cpi) && + !frame_params->show_existing_frame && + cpi->rc.frames_to_key > TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME && + !is_lossless_requested(oxcf) && oxcf->arnr_max_frames > 0; + if (apply_filtering) { + const double y_noise_level = av1_estimate_noise_from_single_plane( + frame_input->source, 0, cm->seq_params.bit_depth); + apply_filtering = y_noise_level > 0; + } + + // Save the pointer to the original source image. + YV12_BUFFER_CONFIG *source_kf_buffer = frame_input->source; + + // Apply filtering to key frame. + if (apply_filtering) { + // Initialization for frame motion estimation. 
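+    // (Temporal filtering below performs a motion search against neighboring
+    // source frames, so mi buffers, speed features, the quantizer and frame
+    // sign bias must be initialized as if a frame were being encoded.)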
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + av1_init_mi_buffers(&cm->mi_params); + setup_mi(cpi, frame_input->source); + av1_init_macroblockd(cm, xd, NULL); + memset( + cpi->mbmi_ext_info.frame_base, 0, + cpi->mbmi_ext_info.alloc_size * sizeof(*cpi->mbmi_ext_info.frame_base)); + + av1_set_speed_features_framesize_independent(cpi, oxcf->speed); + av1_set_speed_features_framesize_dependent(cpi, oxcf->speed); + av1_set_rd_speed_thresholds(cpi); + av1_setup_frame_buf_refs(cm); + av1_setup_frame_sign_bias(cm); + av1_frame_init_quantizer(cpi); + av1_setup_past_independence(cm); + + if (!frame_params->show_frame) { + int arf_src_index = get_arf_src_index(&cpi->gf_group, cpi->oxcf.pass); + av1_temporal_filter(cpi, -1 * arf_src_index, NULL); + } else { + av1_temporal_filter(cpi, -1, NULL); + } + aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm)); + // Use the filtered frame for encoding. + frame_input->source = &cpi->alt_ref_buffer; + // Copy metadata info to alt-ref buffer. + aom_remove_metadata_from_frame_buffer(frame_input->source); + aom_copy_metadata_to_frame_buffer(frame_input->source, + source_kf_buffer->metadata); + } + + if (frame_params->frame_type == KEY_FRAME && !is_stat_generation_stage(cpi) && + oxcf->enable_tpl_model && oxcf->lag_in_frames > 0 && + frame_params->show_frame) { + av1_tpl_setup_stats(cpi, 0, frame_params, frame_input); + } + + if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + // Set frame_input source to true source for psnr calculation. + if (apply_filtering) { + cpi->source = source_kf_buffer; + cpi->unscaled_source = source_kf_buffer; + } + + return AOM_CODEC_OK; +} +#endif // !CONFIG_REALTIME_ONLY + +static INLINE int find_unused_ref_frame(const int *used_ref_frames, + const int *stack, int stack_size) { + for (int i = 0; i < stack_size; ++i) { + const int this_ref = stack[i]; + int ref_idx = 0; + for (ref_idx = 0; ref_idx <= ALTREF_FRAME - LAST_FRAME; ++ref_idx) { + if (this_ref == used_ref_frames[ref_idx]) break; + } + + // not in use + if (ref_idx > ALTREF_FRAME - LAST_FRAME) return this_ref; + } + + return INVALID_IDX; +} + +void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack) { + AV1_COMMON *cm = &cpi->common; + int *const remapped_ref_idx = cm->remapped_ref_idx; + int *const arf_stack = ref_buffer_stack->arf_stack; + int *const lst_stack = ref_buffer_stack->lst_stack; + int *const gld_stack = ref_buffer_stack->gld_stack; + const int arf_stack_size = ref_buffer_stack->arf_stack_size; + const int lst_stack_size = ref_buffer_stack->lst_stack_size; + const int gld_stack_size = ref_buffer_stack->gld_stack_size; + + // Initialization + for (int i = 0; i < REF_FRAMES; ++i) remapped_ref_idx[i] = INVALID_IDX; + + if (arf_stack_size) { + remapped_ref_idx[ALTREF_FRAME - LAST_FRAME] = arf_stack[arf_stack_size - 1]; + + if (arf_stack_size > 1) + remapped_ref_idx[BWDREF_FRAME - LAST_FRAME] = arf_stack[0]; + + if (arf_stack_size > 2) + remapped_ref_idx[ALTREF2_FRAME - LAST_FRAME] = arf_stack[1]; + } + + if (lst_stack_size) { + remapped_ref_idx[LAST_FRAME - LAST_FRAME] = lst_stack[0]; + + if (lst_stack_size > 1) + remapped_ref_idx[LAST2_FRAME - LAST_FRAME] = lst_stack[1]; + } + + if (gld_stack_size) { + remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] = gld_stack[0]; + + if (gld_stack_size > 1) { + if (arf_stack_size <= 1) + remapped_ref_idx[BWDREF_FRAME - LAST_FRAME] = gld_stack[1]; + else + remapped_ref_idx[LAST3_FRAME - LAST_FRAME] = gld_stack[1]; + } + } + + 
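+  // Fill any remaining unmapped slots with buffers that are not yet in use,
+  // searching the arf, then gld, then lst stacks; as a last resort reuse
+  // gld_stack[0].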
for (int idx = ALTREF_FRAME - LAST_FRAME; idx >= 0; --idx) { + int ref_map_index = remapped_ref_idx[idx]; + + if (ref_map_index != INVALID_IDX) continue; + + ref_map_index = + find_unused_ref_frame(remapped_ref_idx, arf_stack, arf_stack_size); + + if (ref_map_index == INVALID_IDX) { + ref_map_index = + find_unused_ref_frame(remapped_ref_idx, gld_stack, gld_stack_size); + } + + if (ref_map_index == INVALID_IDX) { + ref_map_index = + find_unused_ref_frame(remapped_ref_idx, lst_stack, lst_stack_size); + } + + if (ref_map_index != INVALID_IDX) + remapped_ref_idx[idx] = ref_map_index; + else + remapped_ref_idx[idx] = ref_buffer_stack->gld_stack[0]; + } +} + +int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, + uint8_t *const dest, unsigned int *frame_flags, + int64_t *const time_stamp, int64_t *const time_end, + const aom_rational64_t *const timestamp_ratio, + int flush) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + AV1_COMMON *const cm = &cpi->common; + GF_GROUP *gf_group = &cpi->gf_group; + ExternalFlags *const ext_flags = &cpi->ext_flags; + + EncodeFrameInput frame_input; + EncodeFrameParams frame_params; + EncodeFrameResults frame_results; + memset(&frame_input, 0, sizeof(frame_input)); + memset(&frame_params, 0, sizeof(frame_params)); + memset(&frame_results, 0, sizeof(frame_results)); + + // TODO(sarahparker) finish bit allocation for one pass pyramid + if (has_no_stats_stage(cpi) && oxcf->rc_mode != AOM_Q) { + cpi->oxcf.gf_max_pyr_height = + AOMMIN(cpi->oxcf.gf_max_pyr_height, USE_ALTREF_FOR_ONE_PASS); + cpi->oxcf.gf_min_pyr_height = + AOMMIN(cpi->oxcf.gf_min_pyr_height, cpi->oxcf.gf_max_pyr_height); + } + + if (!is_stat_generation_stage(cpi)) { + // If this is a forward keyframe, mark as a show_existing_frame + if (cpi->oxcf.fwd_kf_enabled && (gf_group->index == gf_group->size) && + gf_group->update_type[1] == ARF_UPDATE && cpi->rc.frames_to_key == 0) { + frame_params.show_existing_frame = 1; + } else { + frame_params.show_existing_frame = + ((oxcf->enable_overlay == 0 || cpi->sf.hl_sf.disable_overlay_frames || + cpi->show_existing_alt_ref) && + gf_group->update_type[gf_group->index] == OVERLAY_UPDATE) || + gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE; + } + frame_params.show_existing_frame &= allow_show_existing(cpi, *frame_flags); + + // Reset show_existing_alt_ref decision to 0 after it is used. + if (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE) { + cpi->show_existing_alt_ref = 0; + } + } else { + frame_params.show_existing_frame = 0; + } + + int code_arf = 0; + struct lookahead_entry *source = NULL; + struct lookahead_entry *last_source = NULL; + if (frame_params.show_existing_frame) { + source = av1_lookahead_pop(cpi->lookahead, flush, cpi->compressor_stage); + frame_params.show_frame = 1; + } else { + int show_existing_alt_ref = 0; + source = choose_frame_source(cpi, &code_arf, &flush, &last_source, + &frame_params, &show_existing_alt_ref); + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) + cpi->show_existing_alt_ref = show_existing_alt_ref; + } + + if (source == NULL) { // If no source was found, we can't encode a frame. +#if !CONFIG_REALTIME_ONLY + if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { + av1_end_first_pass(cpi); /* get last stats packet */ + cpi->twopass.first_pass_done = 1; + } +#endif + return -1; + } + + frame_input.source = code_arf ? &cpi->alt_ref_buffer : &source->img; + frame_input.last_source = last_source != NULL ? 
&last_source->img : NULL; + frame_input.ts_duration = source->ts_end - source->ts_start; + // Save unfiltered source. It is used in av1_get_second_pass_params(). + cpi->unfiltered_source = frame_input.source; + + *time_stamp = source->ts_start; + *time_end = source->ts_end; + if (source->ts_start < cpi->time_stamps.first_ever) { + cpi->time_stamps.first_ever = source->ts_start; + cpi->time_stamps.prev_end_seen = source->ts_start; + } + + av1_apply_encoding_flags(cpi, source->flags); + if (!frame_params.show_existing_frame) + *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; + + // Shown frames and arf-overlay frames need frame-rate considering + if (frame_params.show_frame) + adjust_frame_rate(cpi, source->ts_start, source->ts_end); + + if (!frame_params.show_existing_frame) { + if (cpi->film_grain_table) { + cm->cur_frame->film_grain_params_present = aom_film_grain_table_lookup( + cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */, + &cm->film_grain_params); + } else { + cm->cur_frame->film_grain_params_present = + cm->seq_params.film_grain_params_present; + } + // only one operating point supported now + const int64_t pts64 = ticks_to_timebase_units(timestamp_ratio, *time_stamp); + if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR; + cm->frame_presentation_time = (uint32_t)pts64; + } + +#if CONFIG_REALTIME_ONLY + av1_get_one_pass_rt_params(cpi, &frame_params, *frame_flags); +#else + if (has_no_stats_stage(cpi) && oxcf->mode == REALTIME && + oxcf->lag_in_frames == 0) + av1_get_one_pass_rt_params(cpi, &frame_params, *frame_flags); + else if (!is_stat_generation_stage(cpi)) + av1_get_second_pass_params(cpi, &frame_params, &frame_input, *frame_flags); +#endif + FRAME_UPDATE_TYPE frame_update_type = get_frame_update_type(gf_group); + + if (frame_params.show_existing_frame && + frame_params.frame_type != KEY_FRAME) { + // Force show-existing frames to be INTER, except forward keyframes + frame_params.frame_type = INTER_FRAME; + } + + // TODO(david.turner@argondesign.com): Move all the encode strategy + // (largely near av1_get_compressed_data) in here + + // TODO(david.turner@argondesign.com): Change all the encode strategy to + // modify frame_params instead of cm or cpi. + + // Per-frame encode speed. In theory this can vary, but things may have been + // written assuming speed-level will not change within a sequence, so this + // parameter should be used with caution. 
+ frame_params.speed = oxcf->speed; + + // Work out some encoding parameters specific to the pass: + if (has_no_stats_stage(cpi) && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + av1_cyclic_refresh_update_parameters(cpi); + } else if (is_stat_generation_stage(cpi)) { + cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&cpi->oxcf); + const int kf_requested = (cm->current_frame.frame_number == 0 || + (*frame_flags & FRAMEFLAGS_KEY)); + if (kf_requested && frame_update_type != OVERLAY_UPDATE && + frame_update_type != INTNL_OVERLAY_UPDATE) { + frame_params.frame_type = KEY_FRAME; + } else { + frame_params.frame_type = INTER_FRAME; + } + } else if (is_stat_consumption_stage(cpi)) { +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_w(); +#endif +#if TXCOEFF_COST_TIMER + cm->txcoeff_cost_timer = 0; + cm->txcoeff_cost_count = 0; +#endif + } + + if (!is_stat_generation_stage(cpi)) + set_ext_overrides(cm, &frame_params, ext_flags); + + // Shown keyframes and S frames refresh all reference buffers + const int force_refresh_all = + ((frame_params.frame_type == KEY_FRAME && frame_params.show_frame) || + frame_params.frame_type == S_FRAME) && + !frame_params.show_existing_frame; + + av1_configure_buffer_updates(cpi, &frame_params, frame_update_type, + force_refresh_all); + + if (!is_stat_generation_stage(cpi)) { + const RefCntBuffer *ref_frames[INTER_REFS_PER_FRAME]; + const YV12_BUFFER_CONFIG *ref_frame_buf[INTER_REFS_PER_FRAME]; + + if (!ext_flags->refresh_frame_flags_pending) { + av1_get_ref_frames(cpi, &cpi->ref_buffer_stack); + } else if (cpi->svc.external_ref_frame_config) { + for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) + cm->remapped_ref_idx[i] = cpi->svc.ref_idx[i]; + } + + // Get the reference frames + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + ref_frames[i] = get_ref_frame_buf(cm, ref_frame_priority_order[i]); + ref_frame_buf[i] = ref_frames[i] != NULL ? &ref_frames[i]->buf : NULL; + } + // Work out which reference frame slots may be used. + frame_params.ref_frame_flags = get_ref_frame_flags( + &cpi->sf, ref_frame_buf, ext_flags->ref_frame_flags); + + frame_params.primary_ref_frame = + choose_primary_ref_frame(cpi, &frame_params); + frame_params.order_offset = get_order_offset(&cpi->gf_group, &frame_params); + + frame_params.refresh_frame_flags = av1_get_refresh_frame_flags( + cpi, &frame_params, frame_update_type, &cpi->ref_buffer_stack); + + frame_params.existing_fb_idx_to_show = + frame_params.show_existing_frame + ? (frame_update_type == INTNL_OVERLAY_UPDATE + ? get_ref_frame_map_idx(cm, BWDREF_FRAME) + : get_ref_frame_map_idx(cm, ALTREF_FRAME)) + : INVALID_IDX; + } + + // The way frame_params->remapped_ref_idx is setup is a placeholder. + // Currently, reference buffer assignment is done by update_ref_frame_map() + // which is called by high-level strategy AFTER encoding a frame. It modifies + // cm->remapped_ref_idx. If you want to use an alternative method to + // determine reference buffer assignment, just put your assignments into + // frame_params->remapped_ref_idx here and they will be used when encoding + // this frame. If frame_params->remapped_ref_idx is setup independently of + // cm->remapped_ref_idx then update_ref_frame_map() will have no effect. 
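+  // A hypothetical override (identifiers illustrative only) would be:
+  //   frame_params.remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] = my_gld_idx;
+  // in place of the plain copy below.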
+ memcpy(frame_params.remapped_ref_idx, cm->remapped_ref_idx, + REF_FRAMES * sizeof(*cm->remapped_ref_idx)); + + cpi->td.mb.e_mbd.delta_qindex = 0; + + if (!frame_params.show_existing_frame) { + cm->quant_params.using_qmatrix = cpi->oxcf.using_qm; +#if !CONFIG_REALTIME_ONLY + if (oxcf->lag_in_frames > 0 && !is_stat_generation_stage(cpi)) { + if (cpi->gf_group.index == 1 && cpi->oxcf.enable_tpl_model) { + av1_configure_buffer_updates(cpi, &frame_params, frame_update_type, 0); + av1_set_frame_size(cpi, cm->width, cm->height); + av1_tpl_setup_stats(cpi, 0, &frame_params, &frame_input); + assert(cpi->num_gf_group_show_frames == 1); + } + } +#endif + } + +#if CONFIG_REALTIME_ONLY + if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } +#else + if (denoise_and_encode(cpi, dest, &frame_input, &frame_params, + &frame_results) != AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } +#endif // CONFIG_REALTIME_ONLY + if (!is_stat_generation_stage(cpi)) + cpi->num_gf_group_show_frames += frame_params.show_frame; + + if (!is_stat_generation_stage(cpi)) { + // First pass doesn't modify reference buffer assignment or produce frame + // flags + update_frame_flags(cpi, frame_flags); + if (!ext_flags->refresh_frame_flags_pending) { + int ref_map_index = + av1_get_refresh_ref_frame_map(cm->current_frame.refresh_frame_flags); + av1_update_ref_frame_map(cpi, frame_update_type, cm->show_existing_frame, + ref_map_index, &cpi->ref_buffer_stack); + } + } + +#if !CONFIG_REALTIME_ONLY + if (!is_stat_generation_stage(cpi)) { +#if TXCOEFF_COST_TIMER + cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer; + fprintf(stderr, + "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld " + "in us\n", + cm->txcoeff_cost_count, cm->txcoeff_cost_timer, + cm->cum_txcoeff_cost_timer); +#endif + av1_twopass_postencode_update(cpi); + } +#endif // !CONFIG_REALTIME_ONLY + + if (!is_stat_generation_stage(cpi)) { + update_fb_of_context_type(cpi, &frame_params, cpi->fb_of_context_type); + set_additional_frame_flags(cm, frame_flags); + update_rc_counts(cpi); + } + + // Unpack frame_results: + *size = frame_results.size; + + // Leave a signal for a higher level caller about if this frame is droppable + if (*size > 0) { + cpi->droppable = is_frame_droppable(&cpi->svc, ext_flags); + } + + if (cpi->use_svc) av1_save_layer_context(cpi); + + return AOM_CODEC_OK; +} diff --git a/libs/libaom/src/av1/encoder/encode_strategy.h b/libs/libaom/src/av1/encoder/encode_strategy.h new file mode 100644 index 000000000..b05224ba1 --- /dev/null +++ b/libs/libaom/src/av1/encoder/encode_strategy.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+#define AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#include "aom/aom_encoder.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+
+// This function will implement high-level encode strategy, choosing frame type,
+// frame placement, etc. It populates an EncodeFrameParams struct with the
+// results of these decisions and then calls av1_encode()
+int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
+                        uint8_t *const dest, unsigned int *frame_flags,
+                        int64_t *const time_stamp, int64_t *const time_end,
+                        const aom_rational64_t *const timestamp_ratio,
+                        int flush);
+
+// Set individual buffer update flags based on frame reference type.
+// force_refresh_all is used when we have a KEY_FRAME or S_FRAME. It forces all
+// refresh_*_frame flags to be set, because we refresh all buffers in this case.
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+                                  EncodeFrameParams *const frame_params,
+                                  const FRAME_UPDATE_TYPE type,
+                                  int force_refresh_all);
+
+int av1_get_refresh_frame_flags(const AV1_COMP *const cpi,
+                                const EncodeFrameParams *const frame_params,
+                                FRAME_UPDATE_TYPE frame_update_type,
+                                const RefBufferStack *const ref_buffer_stack);
+
+int av1_get_refresh_ref_frame_map(int refresh_frame_flags);
+
+void av1_update_ref_frame_map(AV1_COMP *cpi,
+                              FRAME_UPDATE_TYPE frame_update_type,
+                              int show_existing_frame, int ref_map_index,
+                              RefBufferStack *ref_buffer_stack);
+
+void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack);
+
+int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+                               const int up_to_index,
+                               const COMPRESSOR_STAGE compressor_stage);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
diff --git a/libs/libaom/src/av1/encoder/encodeframe.c b/libs/libaom/src/av1/encoder/encodeframe.c
new file mode 100644
index 000000000..53b47d49e
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/encodeframe.c
@@ -0,0 +1,6475 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/system_state.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/cfl.h"
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/corner_detect.h"
+#include "av1/encoder/global_motion.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/partition_strategy.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/partition_model_weights.h"
+#endif
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/var_based_part.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+static AOM_INLINE void encode_superblock(const AV1_COMP *const cpi,
+                                         TileDataEnc *tile_data, ThreadData *td,
+                                         TOKENEXTRA **t, RUN_TYPE dry_run,
+                                         BLOCK_SIZE bsize, int *rate);
+
+// This is used as a reference when computing the source variance for the
+// purposes of activity masking.
+// Eventually this should be replaced by custom no-reference routines,
+// which will be faster.
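+// (A buffer of constant mid-gray, 128: the variance of a source block against
+// this flat reference measures the source block's own activity.)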
+const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 +}; + +static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 +}; + +static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = { + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4 +}; + +static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = { + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 
16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16 +}; + +typedef struct { + ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE]; + ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE]; + PARTITION_CONTEXT sa[MAX_MIB_SIZE]; + PARTITION_CONTEXT sl[MAX_MIB_SIZE]; + TXFM_CONTEXT *p_ta; + TXFM_CONTEXT *p_tl; + TXFM_CONTEXT ta[MAX_MIB_SIZE]; + TXFM_CONTEXT tl[MAX_MIB_SIZE]; +} RD_SEARCH_MACROBLOCK_CONTEXT; + +enum { PICK_MODE_RD = 0, PICK_MODE_NONRD }; + +enum { + SB_SINGLE_PASS, // Single pass encoding: all ctxs get updated normally + SB_DRY_PASS, // First pass of multi-pass: does not update the ctxs + SB_WET_PASS // Second pass of multi-pass: finalize and update the ctx +} UENUM1BYTE(SB_MULTI_PASS_MODE); + +// This struct is used to store the statistics used by sb-level multi-pass +// encoding. Currently, this is only used to make a copy of the state before we +// perform the first pass +typedef struct SB_FIRST_PASS_STATS { + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + RD_COUNTS rd_count; + + int split_count; + FRAME_COUNTS fc; + InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; + int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES]; + int current_qindex; + +#if CONFIG_INTERNAL_STATS + unsigned int mode_chosen_counts[MAX_MODES]; +#endif // CONFIG_INTERNAL_STATS +} SB_FIRST_PASS_STATS; + +unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs) { + unsigned int sse; + const unsigned int var = + cpi->fn_ptr[bs].vf(ref->buf, ref->stride, AV1_VAR_OFFS, 0, &sse); + return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); +} + +unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs, int bd) { + unsigned int var, sse; + assert(bd == 8 || bd == 10 || bd == 12); + const int off_index = (bd - 8) >> 1; + const uint16_t *high_var_offs[3] = { AV1_HIGH_VAR_OFFS_8, + AV1_HIGH_VAR_OFFS_10, + AV1_HIGH_VAR_OFFS_12 }; + var = + cpi->fn_ptr[bs].vf(ref->buf, ref->stride, + CONVERT_TO_BYTEPTR(high_var_offs[off_index]), 0, &sse); + return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); +} + +static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi, + const struct buf_2d *ref, + int mi_row, int mi_col, + BLOCK_SIZE bs) { + unsigned int sse, var; + uint8_t *last_y; + const YV12_BUFFER_CONFIG *last = + get_ref_frame_yv12_buf(&cpi->common, LAST_FRAME); + + assert(last != NULL); + last_y = + &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE]; + var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse); + return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); +} + +static BLOCK_SIZE get_rd_var_based_fixed_partition(AV1_COMP *cpi, MACROBLOCK *x, + int mi_row, int mi_col) { + unsigned int var = get_sby_perpixel_diff_variance( + cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64); + if (var < 8) + return BLOCK_64X64; + else if (var < 128) + return BLOCK_32X32; + else if (var < 2048) + return BLOCK_16X16; + else + return BLOCK_8X8; +} + +static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) { + 
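+  // The multiplier tracks the quantizer actually in effect for this
+  // superblock: base qindex plus the superblock's delta-q and the luma DC
+  // delta.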
const AV1_COMMON *const cm = &cpi->common; + const CommonQuantParams *quant_params = &cm->quant_params; + return av1_compute_rd_mult(cpi, quant_params->base_qindex + xd->delta_qindex + + quant_params->y_dc_delta_q); +} + +static AOM_INLINE void set_ssim_rdmult(const AV1_COMP *const cpi, + MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int *const rdmult) { + const AV1_COMMON *const cm = &cpi->common; + + const int bsize_base = BLOCK_16X16; + const int num_mi_w = mi_size_wide[bsize_base]; + const int num_mi_h = mi_size_high[bsize_base]; + const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; + int row, col; + double num_of_mi = 0.0; + double geom_mean_of_scale = 0.0; + + assert(cpi->oxcf.tuning == AOM_TUNE_SSIM); + + aom_clear_system_state(); + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col / num_mi_h; + col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + geom_mean_of_scale += log(cpi->ssim_rdmult_scaling_factors[index]); + num_of_mi += 1.0; + } + } + geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi); + + *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5); + *rdmult = AOMMAX(*rdmult, 0); + set_error_per_bit(x, *rdmult); + aom_clear_system_state(); +} + +static int get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int orig_rdmult) { + const AV1_COMMON *const cm = &cpi->common; + assert(IMPLIES(cpi->gf_group.size > 0, + cpi->gf_group.index < cpi->gf_group.size)); + const int tpl_idx = cpi->gf_group.index; + const TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx]; + MACROBLOCKD *const xd = &x->e_mbd; + const int deltaq_rdmult = set_deltaq_rdmult(cpi, xd); + if (tpl_frame->is_valid == 0) return deltaq_rdmult; + if (!is_frame_tpl_eligible((AV1_COMP *)cpi)) return deltaq_rdmult; + if (tpl_idx >= MAX_LAG_BUFFERS) return deltaq_rdmult; + if (cpi->superres_mode != SUPERRES_NONE) return deltaq_rdmult; + if (cpi->oxcf.aq_mode != NO_AQ) return deltaq_rdmult; + + const int bsize_base = BLOCK_16X16; + const int num_mi_w = mi_size_wide[bsize_base]; + const int num_mi_h = mi_size_high[bsize_base]; + const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; + int row, col; + double base_block_count = 0.0; + double geom_mean_of_scale = 0.0; + aom_clear_system_state(); + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col / num_mi_h; + col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + geom_mean_of_scale += log(cpi->tpl_sb_rdmult_scaling_factors[index]); + base_block_count += 1.0; + } + } + geom_mean_of_scale = exp(geom_mean_of_scale / base_block_count); + int rdmult = (int)((double)orig_rdmult * geom_mean_of_scale + 0.5); + rdmult = AOMMAX(rdmult, 0); + set_error_per_bit(x, rdmult); + aom_clear_system_state(); + if (bsize == cm->seq_params.sb_size) { + const 
int rdmult_sb = set_deltaq_rdmult(cpi, xd); + assert(rdmult_sb == rdmult); + (void)rdmult_sb; + } + return rdmult; +} + +static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + int8_t segment_id) { + const AV1_COMMON *const cm = &cpi->common; + av1_init_plane_quantizers(cpi, x, segment_id); + aom_clear_system_state(); + const int segment_qindex = + av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex); + return av1_compute_rd_mult(cpi, + segment_qindex + cm->quant_params.y_dc_delta_q); +} + +static AOM_INLINE void setup_block_rdmult(const AV1_COMP *const cpi, + MACROBLOCK *const x, int mi_row, + int mi_col, BLOCK_SIZE bsize, + AQ_MODE aq_mode, MB_MODE_INFO *mbmi) { + x->rdmult = cpi->rd.RDMULT; + + if (aq_mode != NO_AQ) { + assert(mbmi != NULL); + if (aq_mode == VARIANCE_AQ) { + if (cpi->vaq_refresh) { + const int energy = bsize <= BLOCK_16X16 + ? x->mb_energy + : av1_log_block_var(cpi, x, bsize); + mbmi->segment_id = energy; + } + x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id); + } else if (aq_mode == COMPLEXITY_AQ) { + x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id); + } else if (aq_mode == CYCLIC_REFRESH_AQ) { + // If segment is boosted, use rdmult for that segment. + if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) + x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + } + } + + const AV1_COMMON *const cm = &cpi->common; + if (cm->delta_q_info.delta_q_present_flag && + !cpi->sf.rt_sf.use_nonrd_pick_mode) { + x->rdmult = get_hier_tpl_rdmult(cpi, x, bsize, mi_row, mi_col, x->rdmult); + } + + if (cpi->oxcf.tuning == AOM_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } +#if CONFIG_TUNE_VMAF + if (cpi->oxcf.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || + cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) { + av1_set_vmaf_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } +#endif +} + +static AOM_INLINE void set_offsets_without_segment_id( + const AV1_COMP *const cpi, const TileInfo *const tile, MACROBLOCK *const x, + int mi_row, int mi_col, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + assert(bsize < BLOCK_SIZES_ALL); + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + + set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd, + mi_row, mi_col); + + set_entropy_context(xd, mi_row, mi_col, num_planes); + xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + // Set up destination pointers. + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, + num_planes); + + // Set up limit values for MV components. + // Mv beyond the range do not produce new/different prediction block. + av1_set_mv_limits(&cm->mi_params, &x->mv_limits, mi_row, mi_col, mi_height, + mi_width, cpi->oxcf.border_in_pixels); + + set_plane_n4(xd, mi_width, mi_height, num_planes); + + // Set up distance of MB to edge of frame in 1/8th pel units. + assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); + set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, + cm->mi_params.mi_rows, cm->mi_params.mi_cols); + + // Set up source buffers. 
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); + + // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs() + xd->tile = *tile; +} + +static AOM_INLINE void set_offsets(const AV1_COMP *const cpi, + const TileInfo *const tile, + MACROBLOCK *const x, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi; + + set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); + + // Setup segment ID. + mbmi = xd->mi[0]; + mbmi->segment_id = 0; + if (seg->enabled) { + if (seg->enabled && !cpi->vaq_refresh) { + const uint8_t *const map = + seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map; + mbmi->segment_id = + map ? get_segment_id(&cm->mi_params, map, bsize, mi_row, mi_col) : 0; + } + av1_init_plane_quantizers(cpi, x, mbmi->segment_id); + } +} + +static AOM_INLINE void update_filter_type_count(FRAME_COUNTS *counts, + const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + int dir; + for (dir = 0; dir < 2; ++dir) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir); + ++counts->switchable_interp[ctx][filter]; + } +} + +static AOM_INLINE void update_filter_type_cdf(const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + int dir; + for (dir = 0; dir < 2; ++dir) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir); + update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter, + SWITCHABLE_FILTERS); + } +} + +static AOM_INLINE void update_global_motion_used(PREDICTION_MODE mode, + BLOCK_SIZE bsize, + const MB_MODE_INFO *mbmi, + RD_COUNTS *rdc) { + if (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) { + const int num_4x4s = mi_size_wide[bsize] * mi_size_high[bsize]; + int ref; + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + rdc->global_motion_used[mbmi->ref_frame[ref]] += num_4x4s; + } + } +} + +static AOM_INLINE void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi, + const TX_MODE tx_mode) { + MACROBLOCKD *const xd = &x->e_mbd; + if (xd->lossless[mbmi->segment_id]) { + mbmi->tx_size = TX_4X4; + } else if (tx_mode != TX_MODE_SELECT) { + mbmi->tx_size = tx_size_from_tx_mode(mbmi->sb_type, tx_mode); + } else { + BLOCK_SIZE bsize = mbmi->sb_type; + TX_SIZE min_tx_size = depth_to_tx_size(MAX_TX_DEPTH, bsize); + mbmi->tx_size = (TX_SIZE)TXSIZEMAX(mbmi->tx_size, min_tx_size); + } + if (is_inter_block(mbmi)) { + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + } + const int stride = xd->tx_type_map_stride; + const int bw = mi_size_wide[mbmi->sb_type]; + for (int row = 0; row < mi_size_high[mbmi->sb_type]; ++row) { + memset(xd->tx_type_map + row * stride, DCT_DCT, + bw * sizeof(xd->tx_type_map[0])); + } + av1_zero(x->blk_skip); + x->force_skip = 0; +} + +// This function will copy the best reference mode information from +// MB_MODE_INFO_EXT_FRAME to MB_MODE_INFO_EXT. 
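+// Only the entries for the given ref_frame_type are restored; the frame-level
+// struct keeps just the winning reference's MV stack (plus global MVs) to
+// save memory.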
+static INLINE void copy_mbmi_ext_frame_to_mbmi_ext( + MB_MODE_INFO_EXT *mbmi_ext, + const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_best, uint8_t ref_frame_type) { + memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack, + sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE])); + memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight, + sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE])); + mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context; + mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count; + memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs, + sizeof(mbmi_ext->global_mvs)); +} + +static AOM_INLINE void update_state(const AV1_COMP *const cpi, ThreadData *td, + const PICK_MODE_CONTEXT *const ctx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + RUN_TYPE dry_run) { + int i, x_idx, y; + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int num_planes = av1_num_planes(cm); + RD_COUNTS *const rdc = &td->rd_counts; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + const MB_MODE_INFO *const mi = &ctx->mic; + MB_MODE_INFO *const mi_addr = xd->mi[0]; + const struct segmentation *const seg = &cm->seg; + const int bw = mi_size_wide[mi->sb_type]; + const int bh = mi_size_high[mi->sb_type]; + const int mis = mi_params->mi_stride; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + + assert(mi->sb_type == bsize); + + *mi_addr = *mi; + copy_mbmi_ext_frame_to_mbmi_ext(x->mbmi_ext, &ctx->mbmi_ext_best, + av1_ref_frame_type(ctx->mic.ref_frame)); + + memcpy(x->blk_skip, ctx->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + + x->force_skip = ctx->rd_stats.skip; + + xd->tx_type_map = ctx->tx_type_map; + xd->tx_type_map_stride = mi_size_wide[bsize]; + // If not dry_run, copy the transform type data into the frame level buffer. + // Encoder will fetch tx types when writing bitstream. + if (!dry_run) { + const int grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col); + uint8_t *const tx_type_map = mi_params->tx_type_map + grid_idx; + const int mi_stride = mi_params->mi_stride; + for (int blk_row = 0; blk_row < bh; ++blk_row) { + av1_copy_array(tx_type_map + blk_row * mi_stride, + xd->tx_type_map + blk_row * xd->tx_type_map_stride, bw); + } + xd->tx_type_map = tx_type_map; + xd->tx_type_map_stride = mi_stride; + } + + // If segmentation in use + if (seg->enabled) { + // For in frame complexity AQ copy the segment id from the segment map. + if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + const uint8_t *const map = + seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map; + mi_addr->segment_id = + map ? get_segment_id(mi_params, map, bsize, mi_row, mi_col) : 0; + reset_tx_size(x, mi_addr, x->tx_mode_search_type); + } + // Else for cyclic refresh mode update the segment map, set the segment id + // and then update the quantizer. 
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+      av1_cyclic_refresh_update_segment(cpi, mi_addr, mi_row, mi_col, bsize,
+                                        ctx->rd_stats.rate, ctx->rd_stats.dist,
+                                        x->force_skip);
+    }
+    if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd))
+      mi_addr->uv_mode = UV_DC_PRED;
+  }
+
+  for (i = 0; i < num_planes; ++i) {
+    p[i].coeff = ctx->coeff[i];
+    p[i].qcoeff = ctx->qcoeff[i];
+    pd[i].dqcoeff = ctx->dqcoeff[i];
+    p[i].eobs = ctx->eobs[i];
+    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+  }
+  for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+  // Restore the coding context of the MB to that which was in place
+  // when the mode was picked for it.
+  for (y = 0; y < mi_height; y++) {
+    for (x_idx = 0; x_idx < mi_width; x_idx++) {
+      if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx &&
+          (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
+        xd->mi[x_idx + y * mis] = mi_addr;
+      }
+    }
+  }
+
+  if (cpi->oxcf.aq_mode) av1_init_plane_quantizers(cpi, x, mi_addr->segment_id);
+
+  if (dry_run) return;
+
+#if CONFIG_INTERNAL_STATS
+  {
+    unsigned int *const mode_chosen_counts =
+        (unsigned int *)cpi->mode_chosen_counts;  // Cast const away.
+    if (frame_is_intra_only(cm)) {
+      static const int kf_mode_index[] = {
+        THR_DC /*DC_PRED*/,
+        THR_V_PRED /*V_PRED*/,
+        THR_H_PRED /*H_PRED*/,
+        THR_D45_PRED /*D45_PRED*/,
+        THR_D135_PRED /*D135_PRED*/,
+        THR_D113_PRED /*D113_PRED*/,
+        THR_D157_PRED /*D157_PRED*/,
+        THR_D203_PRED /*D203_PRED*/,
+        THR_D67_PRED /*D67_PRED*/,
+        THR_SMOOTH,   /*SMOOTH_PRED*/
+        THR_SMOOTH_V, /*SMOOTH_V_PRED*/
+        THR_SMOOTH_H, /*SMOOTH_H_PRED*/
+        THR_PAETH /*PAETH_PRED*/,
+      };
+      ++mode_chosen_counts[kf_mode_index[mi_addr->mode]];
+    } else {
+      // Note how often each mode is chosen as best.
+      ++mode_chosen_counts[ctx->best_mode_index];
+    }
+  }
+#endif
+  if (!frame_is_intra_only(cm)) {
+    if (is_inter_block(mi_addr)) {
+      // TODO(sarahparker): global motion stats need to be handled per-tile
+      // to be compatible with tile-based threading.
+      update_global_motion_used(mi_addr->mode, bsize, mi_addr, rdc);
+    }
+
+    if (cm->features.interp_filter == SWITCHABLE &&
+        mi_addr->motion_mode != WARPED_CAUSAL &&
+        !is_nontrans_global_motion(xd, xd->mi[0])) {
+      update_filter_type_count(td->counts, xd, mi_addr);
+    }
+
+    rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+    rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+    rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+  }
+
+  const int x_mis = AOMMIN(bw, mi_params->mi_cols - mi_col);
+  const int y_mis = AOMMIN(bh, mi_params->mi_rows - mi_row);
+  if (cm->seq_params.order_hint_info.enable_ref_frame_mvs)
+    av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
+}
+
+void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col, const int num_planes,
+                          BLOCK_SIZE bsize) {
+  // Set current frame pointer.
+  x->e_mbd.cur_buf = src;
+
+  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+  // the static analysis warnings.
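+  // Editorial sketch (assumed helper, not the libaom API, compiled out):
+  // setup_pred_plane() below resolves a mi position to a per-plane pixel
+  // pointer; the essential arithmetic is mi units -> pixels (MI_SIZE == 4),
+  // then a shift by the plane's chroma subsampling, e.g.:
+#if 0
+  static const uint8_t *sketch_plane_ptr(const uint8_t *buf, int stride,
+                                         int mi_row, int mi_col, int ss_x,
+                                         int ss_y) {
+    const int x = (mi_col * 4) >> ss_x;  // mi units -> pixels -> plane coords
+    const int y = (mi_row * 4) >> ss_y;
+    return buf + y * stride + x;
+  }
+#endif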
+ for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); i++) { + const int is_uv = i > 0; + setup_pred_plane( + &x->plane[i].src, bsize, src->buffers[i], src->crop_widths[is_uv], + src->crop_heights[is_uv], src->strides[is_uv], mi_row, mi_col, NULL, + x->e_mbd.plane[i].subsampling_x, x->e_mbd.plane[i].subsampling_y); + } +} + +static EdgeInfo edge_info(const struct buf_2d *ref, const BLOCK_SIZE bsize, + const bool high_bd, const int bd) { + const int width = block_size_wide[bsize]; + const int height = block_size_high[bsize]; + // Implementation requires width to be a multiple of 8. It also requires + // height to be a multiple of 4, but this is always the case. + assert(height % 4 == 0); + if (width % 8 != 0) { + EdgeInfo ei = { .magnitude = 0, .x = 0, .y = 0 }; + return ei; + } + return av1_edge_exists(ref->buf, ref->stride, width, height, high_bd, bd); +} + +static int use_pb_simple_motion_pred_sse(const AV1_COMP *const cpi) { + // TODO(debargha, yuec): Not in use, need to implement a speed feature + // utilizing this data point, and replace '0' by the corresponding speed + // feature flag. + return 0 && !frame_is_intra_only(&cpi->common); +} + +static void hybrid_intra_mode_search(AV1_COMP *cpi, MACROBLOCK *const x, + RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { + // TODO(jianj): Investigate the failure of ScalabilityTest in AOM_Q mode, + // which sets base_qindex to 0 on keyframe. + if (cpi->oxcf.rc_mode != AOM_CBR || !cpi->sf.rt_sf.hybrid_intra_pickmode || + bsize < BLOCK_16X16) + av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); + else + av1_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); +} + +static AOM_INLINE void pick_sb_modes(AV1_COMP *const cpi, + TileDataEnc *tile_data, + MACROBLOCK *const x, int mi_row, + int mi_col, RD_STATS *rd_cost, + PARTITION_TYPE partition, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, RD_STATS best_rd, + int pick_mode_type) { + if (best_rd.rdcost < 0) { + ctx->rd_stats.rdcost = INT64_MAX; + ctx->rd_stats.skip = 0; + av1_invalid_rd_stats(rd_cost); + return; + } + + set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize); + + if (ctx->rd_mode_is_ready) { + assert(ctx->mic.sb_type == bsize); + assert(ctx->mic.partition == partition); + rd_cost->rate = ctx->rd_stats.rate; + rd_cost->dist = ctx->rd_stats.dist; + rd_cost->rdcost = ctx->rd_stats.rdcost; + return; + } + + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + const AQ_MODE aq_mode = cpi->oxcf.aq_mode; + int i; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_sb_modes_time); +#endif + + aom_clear_system_state(); + + mbmi = xd->mi[0]; + mbmi->sb_type = bsize; + mbmi->partition = partition; + +#if CONFIG_RD_DEBUG + mbmi->mi_row = mi_row; + mbmi->mi_col = mi_col; +#endif + + xd->tx_type_map = x->tx_type_map; + xd->tx_type_map_stride = mi_size_wide[bsize]; + + for (i = 0; i < num_planes; ++i) { + p[i].coeff = ctx->coeff[i]; + p[i].qcoeff = ctx->qcoeff[i]; + pd[i].dqcoeff = ctx->dqcoeff[i]; + p[i].eobs = ctx->eobs[i]; + p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; + } + + for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; + + ctx->skippable = 0; + // Set to zero to make sure we do not use the previous encoded frame stats + mbmi->skip = 0; + // Reset skip mode flag. 
+ mbmi->skip_mode = 0; + + if (is_cur_buf_hbd(xd)) { + x->source_variance = av1_high_get_sby_perpixel_variance( + cpi, &x->plane[0].src, bsize, xd->bd); + } else { + x->source_variance = + av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + } + if (use_pb_simple_motion_pred_sse(cpi)) { + const FULLPEL_MV start_mv = kZeroFullMv; + unsigned int var = 0; + av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, start_mv, 0, + &x->simple_motion_pred_sse, &var); + } + + // If the threshold for disabling wedge search is zero, it means the feature + // should not be used. Use a value that will always succeed in the check. + if (cpi->sf.inter_sf.disable_wedge_search_edge_thresh == 0) { + x->edge_strength = UINT16_MAX; + x->edge_strength_x = UINT16_MAX; + x->edge_strength_y = UINT16_MAX; + } else { + EdgeInfo ei = + edge_info(&x->plane[0].src, bsize, is_cur_buf_hbd(xd), xd->bd); + x->edge_strength = ei.magnitude; + x->edge_strength_x = ei.x; + x->edge_strength_y = ei.y; + } + + // Initialize default mode evaluation params + set_mode_eval_params(cpi, x, DEFAULT_EVAL); + + // Save rdmult before it might be changed, so it can be restored later. + const int orig_rdmult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi); + // Set error per bit for current rdmult + set_error_per_bit(x, x->rdmult); + av1_rd_cost_update(x->rdmult, &best_rd); + + // Find best coding mode & reconstruct the MB so it is available + // as a predictor for MBs that follow in the SB + if (frame_is_intra_only(cm)) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_rd_pick_intra_mode_sb_time); +#endif + switch (pick_mode_type) { + case PICK_MODE_RD: + av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd.rdcost); + break; + case PICK_MODE_NONRD: + hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx); + break; + default: assert(0 && "Unknown pick mode type."); + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_rd_pick_intra_mode_sb_time); +#endif + } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_rd_pick_inter_mode_sb_time); +#endif + if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col, + rd_cost, bsize, ctx, best_rd.rdcost); + } else { + // TODO(kyslov): do the same for pick_inter_mode_sb_seg_skip + switch (pick_mode_type) { + case PICK_MODE_RD: + av1_rd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx, + best_rd.rdcost); + break; + case PICK_MODE_NONRD: + av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx, + best_rd.rdcost); + break; + default: assert(0 && "Unknown pick mode type."); + } + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_rd_pick_inter_mode_sb_time); +#endif + } + + // Examine the resulting rate and for AQ mode 2 make a segment choice. + if (rd_cost->rate != INT_MAX && aq_mode == COMPLEXITY_AQ && + bsize >= BLOCK_16X16) { + av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate); + } + + x->rdmult = orig_rdmult; + + // TODO(jingning) The rate-distortion optimization flow needs to be + // refactored to provide proper exit/return handle. 
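+  // Editorial note: a failed search is signalled with sentinel stats
+  // (rate == INT_MAX, rdcost == INT64_MAX) rather than an error path, as the
+  // TODO above observes. A minimal sketch of the convention, with lambda
+  // scaling and rounding simplified (compiled out):
+#if 0
+  static int64_t sketch_rdcost(int rdmult, int rate, int64_t dist) {
+    if (rate == INT_MAX) return INT64_MAX;  // invalid: loses every comparison
+    return (int64_t)rdmult * rate + dist;   // simplified, no rounding shift
+  }
+#endif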
+ if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX; + + ctx->rd_stats.rate = rd_cost->rate; + ctx->rd_stats.dist = rd_cost->dist; + ctx->rd_stats.rdcost = rd_cost->rdcost; + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_sb_modes_time); +#endif +} + +static AOM_INLINE void update_inter_mode_stats(FRAME_CONTEXT *fc, + FRAME_COUNTS *counts, + PREDICTION_MODE mode, + int16_t mode_context) { + (void)counts; + + int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; + if (mode == NEWMV) { +#if CONFIG_ENTROPY_STATS + ++counts->newmv_mode[mode_ctx][0]; +#endif + update_cdf(fc->newmv_cdf[mode_ctx], 0, 2); + return; + } + +#if CONFIG_ENTROPY_STATS + ++counts->newmv_mode[mode_ctx][1]; +#endif + update_cdf(fc->newmv_cdf[mode_ctx], 1, 2); + + mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + if (mode == GLOBALMV) { +#if CONFIG_ENTROPY_STATS + ++counts->zeromv_mode[mode_ctx][0]; +#endif + update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2); + return; + } + +#if CONFIG_ENTROPY_STATS + ++counts->zeromv_mode[mode_ctx][1]; +#endif + update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2); + + mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; +#if CONFIG_ENTROPY_STATS + ++counts->refmv_mode[mode_ctx][mode != NEARESTMV]; +#endif + update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2); +} + +static AOM_INLINE void update_palette_cdf(MACROBLOCKD *xd, + const MB_MODE_INFO *const mbmi, + FRAME_COUNTS *counts) { + FRAME_CONTEXT *fc = xd->tile_ctx; + const BLOCK_SIZE bsize = mbmi->sb_type; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int palette_bsize_ctx = av1_get_palette_bsize_ctx(bsize); + + (void)counts; + + if (mbmi->mode == DC_PRED) { + const int n = pmi->palette_size[0]; + const int palette_mode_ctx = av1_get_palette_mode_ctx(xd); + +#if CONFIG_ENTROPY_STATS + ++counts->palette_y_mode[palette_bsize_ctx][palette_mode_ctx][n > 0]; +#endif + update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx], + n > 0, 2); + if (n > 0) { +#if CONFIG_ENTROPY_STATS + ++counts->palette_y_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE]; +#endif + update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx], + n - PALETTE_MIN_SIZE, PALETTE_SIZES); + } + } + + if (mbmi->uv_mode == UV_DC_PRED) { + const int n = pmi->palette_size[1]; + const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); + +#if CONFIG_ENTROPY_STATS + ++counts->palette_uv_mode[palette_uv_mode_ctx][n > 0]; +#endif + update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2); + + if (n > 0) { +#if CONFIG_ENTROPY_STATS + ++counts->palette_uv_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE]; +#endif + update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx], + n - PALETTE_MIN_SIZE, PALETTE_SIZES); + } + } +} + +static AOM_INLINE void sum_intra_stats(const AV1_COMMON *const cm, + FRAME_COUNTS *counts, MACROBLOCKD *xd, + const MB_MODE_INFO *const mbmi, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi, + const int intraonly) { + FRAME_CONTEXT *fc = xd->tile_ctx; + const PREDICTION_MODE y_mode = mbmi->mode; + (void)counts; + const BLOCK_SIZE bsize = mbmi->sb_type; + + if (intraonly) { +#if CONFIG_ENTROPY_STATS + const PREDICTION_MODE above = av1_above_block_mode(above_mi); + const PREDICTION_MODE left = av1_left_block_mode(left_mi); + const int above_ctx = intra_mode_context[above]; + const int left_ctx = intra_mode_context[left]; + ++counts->kf_y_mode[above_ctx][left_ctx][y_mode]; +#endif // CONFIG_ENTROPY_STATS + update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES); + } 
else { +#if CONFIG_ENTROPY_STATS + ++counts->y_mode[size_group_lookup[bsize]][y_mode]; +#endif // CONFIG_ENTROPY_STATS + update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES); + } + + if (av1_filter_intra_allowed(cm, mbmi)) { + const int use_filter_intra_mode = + mbmi->filter_intra_mode_info.use_filter_intra; +#if CONFIG_ENTROPY_STATS + ++counts->filter_intra[mbmi->sb_type][use_filter_intra_mode]; + if (use_filter_intra_mode) { + ++counts + ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode]; + } +#endif // CONFIG_ENTROPY_STATS + update_cdf(fc->filter_intra_cdfs[mbmi->sb_type], use_filter_intra_mode, 2); + if (use_filter_intra_mode) { + update_cdf(fc->filter_intra_mode_cdf, + mbmi->filter_intra_mode_info.filter_intra_mode, + FILTER_INTRA_MODES); + } + } + if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->angle_delta[mbmi->mode - V_PRED] + [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA]; +#endif + update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED], + mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA, + 2 * MAX_ANGLE_DELTA + 1); + } + + if (!xd->is_chroma_ref) return; + + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd); +#if CONFIG_ENTROPY_STATS + ++counts->uv_mode[cfl_allowed][y_mode][uv_mode]; +#endif // CONFIG_ENTROPY_STATS + update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode, + UV_INTRA_MODES - !cfl_allowed); + if (uv_mode == UV_CFL_PRED) { + const int8_t joint_sign = mbmi->cfl_alpha_signs; + const uint8_t idx = mbmi->cfl_alpha_idx; + +#if CONFIG_ENTROPY_STATS + ++counts->cfl_sign[joint_sign]; +#endif + update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS); + if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; + +#if CONFIG_ENTROPY_STATS + ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)]; +#endif + update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE); + } + if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; + +#if CONFIG_ENTROPY_STATS + ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)]; +#endif + update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE); + } + } + if (av1_is_directional_mode(get_uv_mode(uv_mode)) && + av1_use_angle_delta(bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->angle_delta[uv_mode - UV_V_PRED] + [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA]; +#endif + update_cdf(fc->angle_delta_cdf[uv_mode - UV_V_PRED], + mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA, + 2 * MAX_ANGLE_DELTA + 1); + } + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) { + update_palette_cdf(xd, mbmi, counts); + } +} + +static AOM_INLINE void update_stats(const AV1_COMMON *const cm, + ThreadData *td) { + MACROBLOCK *x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const CurrentFrame *const current_frame = &cm->current_frame; + const BLOCK_SIZE bsize = mbmi->sb_type; + FRAME_CONTEXT *fc = xd->tile_ctx; + const int seg_ref_active = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + + if (current_frame->skip_mode_info.skip_mode_flag && !seg_ref_active && + is_comp_ref_allowed(bsize)) { + const int skip_mode_ctx = av1_get_skip_mode_context(xd); +#if CONFIG_ENTROPY_STATS + td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++; 
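+    // Note (editorial): the raw counters kept under CONFIG_ENTROPY_STATS feed
+    // offline statistics tooling only; the update_cdf() calls are what adapt
+    // the symbol probabilities actually used by the entropy coder.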
+#endif + update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2); + } + + if (!mbmi->skip_mode && !seg_ref_active) { + const int skip_ctx = av1_get_skip_context(xd); +#if CONFIG_ENTROPY_STATS + td->counts->skip[skip_ctx][mbmi->skip]++; +#endif + update_cdf(fc->skip_cdfs[skip_ctx], mbmi->skip, 2); + } + +#if CONFIG_ENTROPY_STATS + // delta quant applies to both intra and inter + const int super_block_upper_left = + ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) && + ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0); + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + if (delta_q_info->delta_q_present_flag && + (bsize != cm->seq_params.sb_size || !mbmi->skip) && + super_block_upper_left) { + const int dq = + (mbmi->current_qindex - xd->current_qindex) / delta_q_info->delta_q_res; + const int absdq = abs(dq); + for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) { + td->counts->delta_q[i][1]++; + } + if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++; + if (delta_q_info->delta_lf_present_flag) { + if (delta_q_info->delta_lf_multi) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + const int delta_lf = (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / + delta_q_info->delta_lf_res; + const int abs_delta_lf = abs(delta_lf); + for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { + td->counts->delta_lf_multi[lf_id][i][1]++; + } + if (abs_delta_lf < DELTA_LF_SMALL) + td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++; + } + } else { + const int delta_lf = + (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / + delta_q_info->delta_lf_res; + const int abs_delta_lf = abs(delta_lf); + for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { + td->counts->delta_lf[i][1]++; + } + if (abs_delta_lf < DELTA_LF_SMALL) + td->counts->delta_lf[abs_delta_lf][0]++; + } + } + } +#endif + + if (!is_inter_block(mbmi)) { + sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi, + frame_is_intra_only(cm)); + } + + if (av1_allow_intrabc(cm)) { + update_cdf(fc->intrabc_cdf, is_intrabc_block(mbmi), 2); +#if CONFIG_ENTROPY_STATS + ++td->counts->intrabc[is_intrabc_block(mbmi)]; +#endif // CONFIG_ENTROPY_STATS + } + + if (frame_is_intra_only(cm) || mbmi->skip_mode) return; + + FRAME_COUNTS *const counts = td->counts; + const int inter_block = is_inter_block(mbmi); + + if (!seg_ref_active) { +#if CONFIG_ENTROPY_STATS + counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++; +#endif + update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)], + inter_block, 2); + // If the segment reference feature is enabled we have only a single + // reference frame allowed for the segment so exclude it from + // the reference frame counts used to work out probabilities. + if (inter_block) { + const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1]; + if (current_frame->reference_mode == REFERENCE_MODE_SELECT) { + if (is_comp_ref_allowed(bsize)) { +#if CONFIG_ENTROPY_STATS + counts->comp_inter[av1_get_reference_mode_context(xd)] + [has_second_ref(mbmi)]++; +#endif // CONFIG_ENTROPY_STATS + update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi), 2); + } + } + + if (has_second_ref(mbmi)) { + const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi) + ? 
UNIDIR_COMP_REFERENCE + : BIDIR_COMP_REFERENCE; + update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type, + COMP_REFERENCE_TYPES); +#if CONFIG_ENTROPY_STATS + counts->comp_ref_type[av1_get_comp_reference_type_context(xd)] + [comp_ref_type]++; +#endif // CONFIG_ENTROPY_STATS + + if (comp_ref_type == UNIDIR_COMP_REFERENCE) { + const int bit = (ref0 == BWDREF_FRAME); + update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2); +#if CONFIG_ENTROPY_STATS + counts + ->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (!bit) { + const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME); + update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2); +#if CONFIG_ENTROPY_STATS + counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1] + [bit1]++; +#endif // CONFIG_ENTROPY_STATS + if (bit1) { + update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd), + ref1 == GOLDEN_FRAME, 2); +#if CONFIG_ENTROPY_STATS + counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)][2] + [ref1 == GOLDEN_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + } + } else { + const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME); + update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2); +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (!bit) { + update_cdf(av1_get_pred_cdf_comp_ref_p1(xd), ref0 == LAST2_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1] + [ref0 == LAST2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } else { + update_cdf(av1_get_pred_cdf_comp_ref_p2(xd), ref0 == GOLDEN_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2] + [ref0 == GOLDEN_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd), ref1 == ALTREF_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0] + [ref1 == ALTREF_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + if (ref1 != ALTREF_FRAME) { + update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd), + ref1 == ALTREF2_FRAME, 2); +#if CONFIG_ENTROPY_STATS + counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1] + [ref1 == ALTREF2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + } + } else { + const int bit = (ref0 >= BWDREF_FRAME); + update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (bit) { + assert(ref0 <= ALTREF_FRAME); + update_cdf(av1_get_pred_cdf_single_ref_p2(xd), ref0 == ALTREF_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1] + [ref0 == ALTREF_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + if (ref0 != ALTREF_FRAME) { + update_cdf(av1_get_pred_cdf_single_ref_p6(xd), + ref0 == ALTREF2_FRAME, 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5] + [ref0 == ALTREF2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + } else { + const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME); + update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++; +#endif // CONFIG_ENTROPY_STATS + if (!bit1) { + update_cdf(av1_get_pred_cdf_single_ref_p4(xd), ref0 != LAST_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3] + 
[ref0 != LAST_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } else { + update_cdf(av1_get_pred_cdf_single_ref_p5(xd), ref0 != LAST3_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4] + [ref0 != LAST3_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + } + } + + if (cm->seq_params.enable_interintra_compound && + is_interintra_allowed(mbmi)) { + const int bsize_group = size_group_lookup[bsize]; + if (mbmi->ref_frame[1] == INTRA_FRAME) { +#if CONFIG_ENTROPY_STATS + counts->interintra[bsize_group][1]++; +#endif + update_cdf(fc->interintra_cdf[bsize_group], 1, 2); +#if CONFIG_ENTROPY_STATS + counts->interintra_mode[bsize_group][mbmi->interintra_mode]++; +#endif + update_cdf(fc->interintra_mode_cdf[bsize_group], + mbmi->interintra_mode, INTERINTRA_MODES); + if (av1_is_wedge_used(bsize)) { +#if CONFIG_ENTROPY_STATS + counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++; +#endif + update_cdf(fc->wedge_interintra_cdf[bsize], + mbmi->use_wedge_interintra, 2); + if (mbmi->use_wedge_interintra) { +#if CONFIG_ENTROPY_STATS + counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++; +#endif + update_cdf(fc->wedge_idx_cdf[bsize], mbmi->interintra_wedge_index, + 16); + } + } + } else { +#if CONFIG_ENTROPY_STATS + counts->interintra[bsize_group][0]++; +#endif + update_cdf(fc->interintra_cdf[bsize_group], 0, 2); + } + } + + const MOTION_MODE motion_allowed = + cm->features.switchable_motion_mode + ? motion_mode_allowed(xd->global_motion, xd, mbmi, + cm->features.allow_warped_motion) + : SIMPLE_TRANSLATION; + if (mbmi->ref_frame[1] != INTRA_FRAME) { + if (motion_allowed == WARPED_CAUSAL) { +#if CONFIG_ENTROPY_STATS + counts->motion_mode[bsize][mbmi->motion_mode]++; +#endif + update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode, + MOTION_MODES); + } else if (motion_allowed == OBMC_CAUSAL) { +#if CONFIG_ENTROPY_STATS + counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++; +#endif + update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL, 2); + } + } + + if (has_second_ref(mbmi)) { + assert(current_frame->reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION); + + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params.enable_masked_compound; + if (masked_compound_used) { + const int comp_group_idx_ctx = get_comp_group_idx_context(xd); +#if CONFIG_ENTROPY_STATS + ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx]; +#endif + update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx], + mbmi->comp_group_idx, 2); + } + + if (mbmi->comp_group_idx == 0) { + const int comp_index_ctx = get_comp_index_context(cm, xd); +#if CONFIG_ENTROPY_STATS + ++counts->compound_index[comp_index_ctx][mbmi->compound_idx]; +#endif + update_cdf(fc->compound_index_cdf[comp_index_ctx], mbmi->compound_idx, + 2); + } else { + assert(masked_compound_used); + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->compound_type[bsize][mbmi->interinter_comp.type - + COMPOUND_WEDGE]; +#endif + update_cdf(fc->compound_type_cdf[bsize], + mbmi->interinter_comp.type - COMPOUND_WEDGE, + MASKED_COMPOUND_TYPES); + } + } + } + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { +#if CONFIG_ENTROPY_STATS + counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++; +#endif + update_cdf(fc->wedge_idx_cdf[bsize], + mbmi->interinter_comp.wedge_index, 16); + } + } + } + } + + if 
(inter_block && cm->features.interp_filter == SWITCHABLE && + mbmi->motion_mode != WARPED_CAUSAL && + !is_nontrans_global_motion(xd, mbmi)) { + update_filter_type_cdf(xd, mbmi); + } + if (inter_block && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const PREDICTION_MODE mode = mbmi->mode; + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); + if (has_second_ref(mbmi)) { +#if CONFIG_ENTROPY_STATS + ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)]; +#endif + update_cdf(fc->inter_compound_mode_cdf[mode_ctx], + INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES); + } else { + update_inter_mode_stats(fc, counts, mode, mode_ctx); + } + + const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV; + if (new_mv) { + const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + for (int idx = 0; idx < 2; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + const uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx, 2); +#if CONFIG_ENTROPY_STATS + ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx]; +#endif + if (mbmi->ref_mv_idx == idx) break; + } + } + } + + if (have_nearmv_in_inter_mode(mbmi->mode)) { + const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + for (int idx = 1; idx < 3; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + const uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx - 1, 2); +#if CONFIG_ENTROPY_STATS + ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1]; +#endif + if (mbmi->ref_mv_idx == idx - 1) break; + } + } + } + if (have_newmv_in_inter_mode(mbmi->mode)) { + const int allow_hp = cm->features.cur_frame_force_integer_mv + ? 
MV_SUBPEL_NONE + : cm->features.allow_high_precision_mv; + if (new_mv) { + for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + const int_mv ref_mv = av1_get_ref_mv(x, ref); + av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc, + allow_hp); + } + } else if (mbmi->mode == NEAREST_NEWMV || mbmi->mode == NEAR_NEWMV) { + const int ref = 1; + const int_mv ref_mv = av1_get_ref_mv(x, ref); + av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc, + allow_hp); + } else if (mbmi->mode == NEW_NEARESTMV || mbmi->mode == NEW_NEARMV) { + const int ref = 0; + const int_mv ref_mv = av1_get_ref_mv(x, ref); + av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc, + allow_hp); + } + } + } +} + +static AOM_INLINE void restore_context(MACROBLOCK *x, + const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + const int num_planes) { + MACROBLOCKD *xd = &x->e_mbd; + int p; + const int num_4x4_blocks_wide = mi_size_wide[bsize]; + const int num_4x4_blocks_high = mi_size_high[bsize]; + int mi_width = mi_size_wide[bsize]; + int mi_height = mi_size_high[bsize]; + for (p = 0; p < num_planes; p++) { + int tx_col = mi_col; + int tx_row = mi_row & MAX_MIB_MASK; + memcpy( + xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x), + ctx->a + num_4x4_blocks_wide * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> + xd->plane[p].subsampling_x); + memcpy(xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y), + ctx->l + num_4x4_blocks_high * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> + xd->plane[p].subsampling_y); + } + memcpy(xd->above_partition_context + mi_col, ctx->sa, + sizeof(*xd->above_partition_context) * mi_width); + memcpy(xd->left_partition_context + (mi_row & MAX_MIB_MASK), ctx->sl, + sizeof(xd->left_partition_context[0]) * mi_height); + xd->above_txfm_context = ctx->p_ta; + xd->left_txfm_context = ctx->p_tl; + memcpy(xd->above_txfm_context, ctx->ta, + sizeof(*xd->above_txfm_context) * mi_width); + memcpy(xd->left_txfm_context, ctx->tl, + sizeof(*xd->left_txfm_context) * mi_height); +} + +static AOM_INLINE void save_context(const MACROBLOCK *x, + RD_SEARCH_MACROBLOCK_CONTEXT *ctx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + const int num_planes) { + const MACROBLOCKD *xd = &x->e_mbd; + int p; + int mi_width = mi_size_wide[bsize]; + int mi_height = mi_size_high[bsize]; + + // buffer the above/left context information of the block in search. 
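+  // Note on the shifts below (editorial): a subsampled chroma plane spans the
+  // same mi range with half the entropy-context entries per axis, so the byte
+  // count saved per plane is (sizeof(ENTROPY_CONTEXT) * n) >> subsampling,
+  // e.g. 16 luma entries correspond to 8 chroma entries in 4:2:0.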
+ for (p = 0; p < num_planes; ++p) { + int tx_col = mi_col; + int tx_row = mi_row & MAX_MIB_MASK; + memcpy( + ctx->a + mi_width * p, + xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x), + (sizeof(ENTROPY_CONTEXT) * mi_width) >> xd->plane[p].subsampling_x); + memcpy(ctx->l + mi_height * p, + xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y), + (sizeof(ENTROPY_CONTEXT) * mi_height) >> xd->plane[p].subsampling_y); + } + memcpy(ctx->sa, xd->above_partition_context + mi_col, + sizeof(*xd->above_partition_context) * mi_width); + memcpy(ctx->sl, xd->left_partition_context + (mi_row & MAX_MIB_MASK), + sizeof(xd->left_partition_context[0]) * mi_height); + memcpy(ctx->ta, xd->above_txfm_context, + sizeof(*xd->above_txfm_context) * mi_width); + memcpy(ctx->tl, xd->left_txfm_context, + sizeof(*xd->left_txfm_context) * mi_height); + ctx->p_ta = xd->above_txfm_context; + ctx->p_tl = xd->left_txfm_context; +} + +static AOM_INLINE void encode_b(const AV1_COMP *const cpi, + TileDataEnc *tile_data, ThreadData *td, + TOKENEXTRA **tp, int mi_row, int mi_col, + RUN_TYPE dry_run, BLOCK_SIZE bsize, + PARTITION_TYPE partition, + PICK_MODE_CONTEXT *const ctx, int *rate) { + TileInfo *const tile = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + + set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); + const int origin_mult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); + MB_MODE_INFO *mbmi = xd->mi[0]; + mbmi->partition = partition; + update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run); + + if (!dry_run) { + x->mbmi_ext_frame->cb_offset = x->cb_offset; + assert(x->cb_offset < + (1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size])); + } + + encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate); + + if (!dry_run) { + const AV1_COMMON *const cm = &cpi->common; + x->cb_offset += block_size_wide[bsize] * block_size_high[bsize]; + if (bsize == cpi->common.seq_params.sb_size && mbmi->skip == 1 && + cm->delta_q_info.delta_lf_present_flag) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) + mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id]; + mbmi->delta_lf_from_base = xd->delta_lf_from_base; + } + if (has_second_ref(mbmi)) { + if (mbmi->compound_idx == 0 || + mbmi->interinter_comp.type == COMPOUND_AVERAGE) + mbmi->comp_group_idx = 0; + else + mbmi->comp_group_idx = 1; + } + + // delta quant applies to both intra and inter + const int super_block_upper_left = + ((mi_row & (cm->seq_params.mib_size - 1)) == 0) && + ((mi_col & (cm->seq_params.mib_size - 1)) == 0); + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + if (delta_q_info->delta_q_present_flag && + (bsize != cm->seq_params.sb_size || !mbmi->skip) && + super_block_upper_left) { + xd->current_qindex = mbmi->current_qindex; + if (delta_q_info->delta_lf_present_flag) { + if (delta_q_info->delta_lf_multi) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; + } + } else { + xd->delta_lf_from_base = mbmi->delta_lf_from_base; + } + } + } + + RD_COUNTS *rdc = &td->rd_counts; + if (mbmi->skip_mode) { + assert(!frame_is_intra_only(cm)); + rdc->skip_mode_used_flag = 1; + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { + assert(has_second_ref(mbmi)); + rdc->compound_ref_used_flag = 1; + } + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + } else { + const int seg_ref_active = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + if (!seg_ref_active) { + // If the segment reference feature is enabled we have only a single + // reference frame allowed for the segment so exclude it from + // the reference frame counts used to work out probabilities. + if (is_inter_block(mbmi)) { + av1_collect_neighbors_ref_counts(xd); + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { + if (has_second_ref(mbmi)) { + // This flag is also updated for 4x4 blocks + rdc->compound_ref_used_flag = 1; + } + } + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + } + } + } + + if (tile_data->allow_update_cdf) update_stats(&cpi->common, td); + + // Gather obmc and warped motion count to update the probability. + if ((!cpi->sf.inter_sf.disable_obmc && + cpi->sf.inter_sf.prune_obmc_prob_thresh > 0) || + (cm->features.allow_warped_motion && + cpi->sf.inter_sf.prune_warped_prob_thresh > 0)) { + const int inter_block = is_inter_block(mbmi); + const int seg_ref_active = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + if (!seg_ref_active && inter_block) { + const MOTION_MODE motion_allowed = + cm->features.switchable_motion_mode + ? motion_mode_allowed(xd->global_motion, xd, mbmi, + cm->features.allow_warped_motion) + : SIMPLE_TRANSLATION; + + if (mbmi->ref_frame[1] != INTRA_FRAME) { + if (motion_allowed >= OBMC_CAUSAL) { + td->rd_counts.obmc_used[bsize][mbmi->motion_mode == OBMC_CAUSAL]++; + } + if (motion_allowed == WARPED_CAUSAL) { + td->rd_counts.warped_used[mbmi->motion_mode == WARPED_CAUSAL]++; + } + } + } + } + } + // TODO(Ravi/Remya): Move this copy function to a better logical place + // This function will copy the best mode information from block + // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This + // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during + // bitstream preparation. + av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, x->mbmi_ext, + av1_ref_frame_type(xd->mi[0]->ref_frame)); + x->rdmult = origin_mult; +} + +static AOM_INLINE void encode_sb(const AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + int mi_row, int mi_col, RUN_TYPE dry_run, + BLOCK_SIZE bsize, PC_TREE *pc_tree, + int *rate) { + assert(bsize < BLOCK_SIZES_ALL); + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + assert(bsize < BLOCK_SIZES_ALL); + const int hbs = mi_size_wide[bsize] / 2; + const int is_partition_root = bsize >= BLOCK_8X8; + const int ctx = is_partition_root + ? 
partition_plane_context(xd, mi_row, mi_col, bsize) + : -1; + const PARTITION_TYPE partition = pc_tree->partitioning; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + int quarter_step = mi_size_wide[bsize] / 4; + int i; + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); + + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return; + + if (!dry_run && ctx >= 0) { + const int has_rows = (mi_row + hbs) < mi_params->mi_rows; + const int has_cols = (mi_col + hbs) < mi_params->mi_cols; + + if (has_rows && has_cols) { +#if CONFIG_ENTROPY_STATS + td->counts->partition[ctx][partition]++; +#endif + + if (tile_data->allow_update_cdf) { + FRAME_CONTEXT *fc = xd->tile_ctx; + update_cdf(fc->partition_cdf[ctx], partition, + partition_cdf_length(bsize)); + } + } + } + + switch (partition) { + case PARTITION_NONE: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->none, rate); + break; + case PARTITION_VERT: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->vertical[0], rate); + if (mi_col + hbs < mi_params->mi_cols) { + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize, + partition, &pc_tree->vertical[1], rate); + } + break; + case PARTITION_HORZ: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->horizontal[0], rate); + if (mi_row + hbs < mi_params->mi_rows) { + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize, + partition, &pc_tree->horizontal[1], rate); + } + break; + case PARTITION_SPLIT: + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize, + pc_tree->split[0], rate); + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize, + pc_tree->split[1], rate); + encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize, + pc_tree->split[2], rate); + encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + hbs, dry_run, + subsize, pc_tree->split[3], rate); + break; + + case PARTITION_HORZ_A: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2, + partition, &pc_tree->horizontala[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, + partition, &pc_tree->horizontala[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize, + partition, &pc_tree->horizontala[2], rate); + break; + case PARTITION_HORZ_B: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->horizontalb[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, + partition, &pc_tree->horizontalb[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run, + bsize2, partition, &pc_tree->horizontalb[2], rate); + break; + case PARTITION_VERT_A: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2, + partition, &pc_tree->verticala[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, + partition, &pc_tree->verticala[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize, + partition, &pc_tree->verticala[2], rate); + + break; + case PARTITION_VERT_B: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->verticalb[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, + partition, &pc_tree->verticalb[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run, + 
bsize2, partition, &pc_tree->verticalb[2], rate); + break; + case PARTITION_HORZ_4: + for (i = 0; i < 4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= mi_params->mi_rows) break; + + encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->horizontal4[i], rate); + } + break; + case PARTITION_VERT_4: + for (i = 0; i < 4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= mi_params->mi_cols) break; + encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize, + partition, &pc_tree->vertical4[i], rate); + } + break; + default: assert(0 && "Invalid partition type."); break; + } + + update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); +} + +static AOM_INLINE void set_partial_sb_partition( + const AV1_COMMON *const cm, MB_MODE_INFO *mi, int bh_in, int bw_in, + int mi_rows_remaining, int mi_cols_remaining, BLOCK_SIZE bsize, + MB_MODE_INFO **mib) { + int bh = bh_in; + int r, c; + for (r = 0; r < cm->seq_params.mib_size; r += bh) { + int bw = bw_in; + for (c = 0; c < cm->seq_params.mib_size; c += bw) { + const int grid_index = get_mi_grid_idx(&cm->mi_params, r, c); + const int mi_index = get_alloc_mi_idx(&cm->mi_params, r, c); + mib[grid_index] = mi + mi_index; + mib[grid_index]->sb_type = find_partition_size( + bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw); + } + } +} + +// This function attempts to set all mode info entries in a given superblock +// to the same block partition size. +// However, at the bottom and right borders of the image the requested size +// may not be allowed in which case this code attempts to choose the largest +// allowable partition. +static AOM_INLINE void set_fixed_partitioning(AV1_COMP *cpi, + const TileInfo *const tile, + MB_MODE_INFO **mib, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int mi_rows_remaining = tile->mi_row_end - mi_row; + const int mi_cols_remaining = tile->mi_col_end - mi_col; + MB_MODE_INFO *const mi_upper_left = + mi_params->mi_alloc + get_alloc_mi_idx(mi_params, mi_row, mi_col); + int bh = mi_size_high[bsize]; + int bw = mi_size_wide[bsize]; + + assert(bsize >= mi_params->mi_alloc_bsize && + "Attempted to use bsize < mi_params->mi_alloc_bsize"); + assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0)); + + // Apply the requested partition size to the SB if it is all "in image" + if ((mi_cols_remaining >= cm->seq_params.mib_size) && + (mi_rows_remaining >= cm->seq_params.mib_size)) { + for (int block_row = 0; block_row < cm->seq_params.mib_size; + block_row += bh) { + for (int block_col = 0; block_col < cm->seq_params.mib_size; + block_col += bw) { + const int grid_index = get_mi_grid_idx(mi_params, block_row, block_col); + const int mi_index = get_alloc_mi_idx(mi_params, block_row, block_col); + mib[grid_index] = mi_upper_left + mi_index; + mib[grid_index]->sb_type = bsize; + } + } + } else { + // Else this is a partial SB. 
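+    // Editorial sketch (not the real find_partition_size(), compiled out):
+    // the call below walks the grid assigning, per position, the largest size
+    // whose mi footprint still fits the remaining rows/cols. Assuming square
+    // sizes only and MI_SIZE == 4, the fitting rule is roughly:
+#if 0
+    static BLOCK_SIZE sketch_fit(int mi_rows_left, int mi_cols_left) {
+      const int m = AOMMIN(mi_rows_left, mi_cols_left);
+      if (m >= 32) return BLOCK_128X128;  // 128 / MI_SIZE == 32 mi units
+      if (m >= 16) return BLOCK_64X64;
+      if (m >= 8) return BLOCK_32X32;
+      if (m >= 4) return BLOCK_16X16;
+      return BLOCK_8X8;
+    }
+#endif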
+    set_partial_sb_partition(cm, mi_upper_left, bh, bw, mi_rows_remaining,
+                             mi_cols_remaining, bsize, mib);
+  }
+}
+
+static AOM_INLINE void rd_use_partition(
+    AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, MB_MODE_INFO **mib,
+    TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate,
+    int64_t *dist, int do_recon, PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int num_planes = av1_num_planes(cm);
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  int i;
+  const int pl = (bsize >= BLOCK_8X8)
+                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
+                     : 0;
+  const PARTITION_TYPE partition =
+      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+                           : PARTITION_NONE;
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  RD_STATS last_part_rdc, none_rdc, chosen_rdc, invalid_rdc;
+  BLOCK_SIZE sub_subsize = BLOCK_4X4;
+  int splits_below = 0;
+  BLOCK_SIZE bs_type = mib[0]->sb_type;
+  PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+  av1_invalid_rd_stats(&last_part_rdc);
+  av1_invalid_rd_stats(&none_rdc);
+  av1_invalid_rd_stats(&chosen_rdc);
+  av1_invalid_rd_stats(&invalid_rdc);
+
+  pc_tree->partitioning = partition;
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
+    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+    x->mb_energy = av1_log_block_var(cpi, x, bsize);
+  }
+
+  // Save rdmult before it might be changed, so it can be restored later.
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+  if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION &&
+      (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 2 ||
+       (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 1 &&
+        cm->quant_params.base_qindex > 190 && bsize <= BLOCK_32X32 &&
+        !frame_is_intra_only(cm)))) {
+    // Check if any of the sub blocks are further split.
+    if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
+      sub_subsize = get_partition_subsize(subsize, PARTITION_SPLIT);
+      splits_below = 1;
+      for (i = 0; i < 4; i++) {
+        int jj = i >> 1, ii = i & 0x01;
+        MB_MODE_INFO *this_mi = mib[jj * hbs * mi_params->mi_stride + ii * hbs];
+        if (this_mi && this_mi->sb_type >= sub_subsize) {
+          splits_below = 0;
+        }
+      }
+    }
+
+    // If the partition is not none, try none unless each of the 4 splits is
+    // split even further.
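+    // Editorial note: the PARTITION_NONE probe below is skipped only when
+    // splits_below stayed 1, i.e. all four quadrants themselves chose sizes
+    // below sub_subsize, in which case re-merging is very unlikely to win.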
+ if (partition != PARTITION_NONE && !splits_below && + mi_row + hbs < mi_params->mi_rows && + mi_col + hbs < mi_params->mi_cols) { + pc_tree->partitioning = PARTITION_NONE; + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, + PARTITION_NONE, bsize, ctx_none, invalid_rdc, PICK_MODE_RD); + + if (none_rdc.rate < INT_MAX) { + none_rdc.rate += x->partition_cost[pl][PARTITION_NONE]; + none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist); + } + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + mib[0]->sb_type = bs_type; + pc_tree->partitioning = partition; + } + } + + switch (partition) { + case PARTITION_NONE: + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_NONE, bsize, ctx_none, invalid_rdc, PICK_MODE_RD); + break; + case PARTITION_HORZ: + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_HORZ, subsize, &pc_tree->horizontal[0], + invalid_rdc, PICK_MODE_RD); + if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && + mi_row + hbs < mi_params->mi_rows) { + RD_STATS tmp_rdc; + const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0]; + av1_init_rd_stats(&tmp_rdc); + update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, + NULL); + pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc, + PARTITION_HORZ, subsize, &pc_tree->horizontal[1], + invalid_rdc, PICK_MODE_RD); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + last_part_rdc.rdcost += tmp_rdc.rdcost; + } + break; + case PARTITION_VERT: + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_VERT, subsize, &pc_tree->vertical[0], invalid_rdc, + PICK_MODE_RD); + if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && + mi_col + hbs < mi_params->mi_cols) { + RD_STATS tmp_rdc; + const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0]; + av1_init_rd_stats(&tmp_rdc); + update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, + NULL); + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc, + PARTITION_VERT, subsize, + &pc_tree->vertical[bsize > BLOCK_8X8], invalid_rdc, + PICK_MODE_RD); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + last_part_rdc.rdcost += tmp_rdc.rdcost; + } + break; + case PARTITION_SPLIT: + if (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 1 && + none_rdc.rate < INT_MAX && none_rdc.skip == 1) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate = 0; + last_part_rdc.dist = 0; + last_part_rdc.rdcost = 0; + for (i = 0; i < 4; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + int jj = i >> 1, ii = i & 0x01; + RD_STATS tmp_rdc; + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + + av1_init_rd_stats(&tmp_rdc); + rd_use_partition(cpi, td, tile_data, + mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp, + mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate, + &tmp_rdc.dist, i != 3, pc_tree->split[i]); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + 
last_part_rdc.dist += tmp_rdc.dist; + } + break; + case PARTITION_VERT_A: + case PARTITION_VERT_B: + case PARTITION_HORZ_A: + case PARTITION_HORZ_B: + case PARTITION_HORZ_4: + case PARTITION_VERT_4: + assert(0 && "Cannot handle extended partition types"); + default: assert(0); break; + } + + if (last_part_rdc.rate < INT_MAX) { + last_part_rdc.rate += x->partition_cost[pl][partition]; + last_part_rdc.rdcost = + RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist); + } + + if ((cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION && + cpi->sf.part_sf.adjust_var_based_rd_partitioning > 2) && + partition != PARTITION_SPLIT && bsize > BLOCK_8X8 && + (mi_row + bs < mi_params->mi_rows || + mi_row + hbs == mi_params->mi_rows) && + (mi_col + bs < mi_params->mi_cols || + mi_col + hbs == mi_params->mi_cols)) { + BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + chosen_rdc.rate = 0; + chosen_rdc.dist = 0; + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + pc_tree->partitioning = PARTITION_SPLIT; + + // Split partition. + for (i = 0; i < 4; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + RD_STATS tmp_rdc; + + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + + save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + pc_tree->split[i]->partitioning = PARTITION_NONE; + pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc, + PARTITION_SPLIT, split_subsize, &pc_tree->split[i]->none, + invalid_rdc, PICK_MODE_RD); + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&chosen_rdc); + break; + } + + chosen_rdc.rate += tmp_rdc.rate; + chosen_rdc.dist += tmp_rdc.dist; + + if (i != 3) + encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, + OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL); + + chosen_rdc.rate += x->partition_cost[pl][PARTITION_NONE]; + } + if (chosen_rdc.rate < INT_MAX) { + chosen_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT]; + chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist); + } + } + + // If last_part is better set the partitioning to that. + if (last_part_rdc.rdcost < chosen_rdc.rdcost) { + mib[0]->sb_type = bsize; + if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition; + chosen_rdc = last_part_rdc; + } + // If none was better set the partitioning to that. + if (none_rdc.rdcost < chosen_rdc.rdcost) { + if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; + chosen_rdc = none_rdc; + } + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + // We must have chosen a partitioning and encoding or we'll fail later on. + // No other opportunities for success. 
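+  // Editorial note: the assertion below holds because every candidate that
+  // failed the search carries the INT_MAX/INT64_MAX sentinels from
+  // av1_invalid_rd_stats(), so a finite chosen_rdc implies a complete,
+  // encodable partitioning was found for the superblock.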
+  if (bsize == cm->seq_params.sb_size)
+    assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
+
+  if (do_recon) {
+    if (bsize == cm->seq_params.sb_size) {
+      // NOTE: To get estimate for rate due to the tokens, use:
+      // int rate_coeffs = 0;
+      // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+      //           bsize, pc_tree, &rate_coeffs);
+      x->cb_offset = 0;
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+                pc_tree, NULL);
+    } else {
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
+  }
+
+  *rate = chosen_rdc.rate;
+  *dist = chosen_rdc.dist;
+  x->rdmult = orig_rdmult;
+}
+
+static int is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize) {
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  assert(bsize >= BLOCK_8X8);
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+  for (int i = 0; i < 4; i++) {
+    int x_idx = (i & 1) * hbs;
+    int y_idx = (i >> 1) * hbs;
+    if ((mi_row + y_idx >= cm->mi_params.mi_rows) ||
+        (mi_col + x_idx >= cm->mi_params.mi_cols))
+      return 0;
+    if (get_partition(cm, mi_row + y_idx, mi_col + x_idx, subsize) !=
+            PARTITION_NONE &&
+        subsize != BLOCK_8X8)
+      return 0;
+  }
+  return 1;
+}
+
+static AOM_INLINE int do_split_check(BLOCK_SIZE bsize) {
+  return (bsize == BLOCK_16X16 || bsize == BLOCK_32X32);
+}
+
+static AOM_INLINE void nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
+                                           TileDataEnc *tile_data,
+                                           MB_MODE_INFO **mib, TOKENEXTRA **tp,
+                                           int mi_row, int mi_col,
+                                           BLOCK_SIZE bsize,
+                                           PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  // Only square blocks from 8x8 to 128x128 are supported
+  assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128);
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  const PARTITION_TYPE partition =
+      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+                           : PARTITION_NONE;
+  BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  assert(subsize <= BLOCK_LARGEST);
+  const int pl = (bsize >= BLOCK_8X8)
+                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
+                     : 0;
+
+  RD_STATS dummy_cost;
+  av1_invalid_rd_stats(&dummy_cost);
+  RD_STATS invalid_rd;
+  av1_invalid_rd_stats(&invalid_rd);
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+  pc_tree->partitioning = partition;
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+  switch (partition) {
+    case PARTITION_NONE:
+      if (cpi->sf.rt_sf.nonrd_check_partition_split && do_split_check(bsize) &&
+          !frame_is_intra_only(cm)) {
+        RD_STATS split_rdc, none_rdc, block_rdc;
+        RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+
+        av1_init_rd_stats(&split_rdc);
+        av1_invalid_rd_stats(&none_rdc);
+
+        save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+        subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
+                      PARTITION_NONE, bsize, &pc_tree->none, invalid_rd,
+                      PICK_MODE_NONRD);
+        none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
+        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+        restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+        for (int i = 0; i < 4; i++) {
+          av1_invalid_rd_stats(&block_rdc);
+          const int x_idx = (i & 1) * hbs;
+          const int y_idx = (i >> 1) * hbs;
+          if (mi_row + y_idx >= mi_params->mi_rows ||
+              mi_col + x_idx >= mi_params->mi_cols)
+            continue;
+          xd->above_txfm_context =
+              cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+          xd->left_txfm_context =
+              xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
+          pc_tree->split[i]->partitioning = PARTITION_NONE;
+          pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+                        &block_rdc, PARTITION_NONE, subsize,
+                        &pc_tree->split[i]->none, invalid_rd, PICK_MODE_NONRD);
+          split_rdc.rate += block_rdc.rate;
+          split_rdc.dist += block_rdc.dist;
+
+          encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1,
+                   subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
+        }
+        split_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
+        split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+        restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+        if (none_rdc.rdcost < split_rdc.rdcost) {
+          mib[0]->sb_type = bsize;
+          pc_tree->partitioning = PARTITION_NONE;
+          encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
+                   &pc_tree->none, NULL);
+        } else {
+          mib[0]->sb_type = subsize;
+          pc_tree->partitioning = PARTITION_SPLIT;
+          for (int i = 0; i < 4; i++) {
+            const int x_idx = (i & 1) * hbs;
+            const int y_idx = (i >> 1) * hbs;
+            if (mi_row + y_idx >= mi_params->mi_rows ||
+                mi_col + x_idx >= mi_params->mi_cols)
+              continue;
+
+            encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0,
+                     subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
+          }
+        }
+
+      } else {
+        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+                      PARTITION_NONE, bsize, &pc_tree->none, invalid_rd,
+                      PICK_MODE_NONRD);
+        encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
+                 &pc_tree->none, NULL);
+      }
+      break;
+    case PARTITION_VERT:
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+                    PARTITION_VERT, subsize, &pc_tree->vertical[0], invalid_rd,
+                    PICK_MODE_NONRD);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
+               PARTITION_VERT, &pc_tree->vertical[0], NULL);
+      if (mi_col + hbs < mi_params->mi_cols && bsize > BLOCK_8X8) {
+
pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &dummy_cost, + PARTITION_VERT, subsize, &pc_tree->vertical[1], + invalid_rd, PICK_MODE_NONRD); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, 0, subsize, + PARTITION_VERT, &pc_tree->vertical[1], NULL); + } + break; + case PARTITION_HORZ: + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost, + PARTITION_HORZ, subsize, &pc_tree->horizontal[0], + invalid_rd, PICK_MODE_NONRD); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize, + PARTITION_HORZ, &pc_tree->horizontal[0], NULL); + + if (mi_row + hbs < mi_params->mi_rows && bsize > BLOCK_8X8) { + pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &dummy_cost, + PARTITION_HORZ, subsize, &pc_tree->horizontal[1], + invalid_rd, PICK_MODE_NONRD); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, 0, subsize, + PARTITION_HORZ, &pc_tree->horizontal[1], NULL); + } + break; + case PARTITION_SPLIT: + if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode && + is_leaf_split_partition(cm, mi_row, mi_col, bsize) && + !frame_is_intra_only(cm) && bsize <= BLOCK_32X32) { + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + RD_STATS split_rdc, none_rdc; + av1_invalid_rd_stats(&split_rdc); + av1_invalid_rd_stats(&none_rdc); + save_context(x, &x_ctx, mi_row, mi_col, bsize, 3); + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + pc_tree->partitioning = PARTITION_NONE; + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, + PARTITION_NONE, bsize, &pc_tree->none, invalid_rd, + PICK_MODE_NONRD); + none_rdc.rate += x->partition_cost[pl][PARTITION_NONE]; + none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist); + restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3); + if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode != 2 || + none_rdc.skip != 1 || pc_tree->none.mic.mode == NEWMV) { + av1_init_rd_stats(&split_rdc); + for (int i = 0; i < 4; i++) { + RD_STATS block_rdc; + av1_invalid_rd_stats(&block_rdc); + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx; + xd->left_txfm_context = xd->left_txfm_context_buffer + + ((mi_row + y_idx) & MAX_MIB_MASK); + pc_tree->split[i]->partitioning = PARTITION_NONE; + pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, + &block_rdc, PARTITION_NONE, subsize, + &pc_tree->split[i]->none, invalid_rd, + PICK_MODE_NONRD); + split_rdc.rate += block_rdc.rate; + split_rdc.dist += block_rdc.dist; + + encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1, + subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL); + } + restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3); + split_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT]; + split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist); + } + if (none_rdc.rdcost < split_rdc.rdcost) { + mib[0]->sb_type = bsize; + pc_tree->partitioning = PARTITION_NONE; + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition, + &pc_tree->none, NULL); + } else { + mib[0]->sb_type = subsize; + pc_tree->partitioning = PARTITION_SPLIT; + for (int i = 0; i < 4; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + + encode_b(cpi, tile_data, td, 
tp, mi_row + y_idx, mi_col + x_idx, 0,
+                     subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
+          }
+        }
+      } else {
+        for (int i = 0; i < 4; i++) {
+          int x_idx = (i & 1) * hbs;
+          int y_idx = (i >> 1) * hbs;
+          int jj = i >> 1, ii = i & 0x01;
+          if ((mi_row + y_idx >= mi_params->mi_rows) ||
+              (mi_col + x_idx >= mi_params->mi_cols))
+            continue;
+          nonrd_use_partition(cpi, td, tile_data,
+                              mib + jj * hbs * mi_params->mi_stride + ii * hbs,
+                              tp, mi_row + y_idx, mi_col + x_idx, subsize,
+                              pc_tree->split[i]);
+        }
+      }
+      break;
+    case PARTITION_VERT_A:
+    case PARTITION_VERT_B:
+    case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B:
+    case PARTITION_HORZ_4:
+    case PARTITION_VERT_4:
+      assert(0 && "Cannot handle extended partition types");
+    default: assert(0); break;
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p, int frm) {
+  assert(frm >= 0);
+  if (frm < 0 ||
+      p->stats_buf_ctx->stats_in_start + frm > p->stats_buf_ctx->stats_in_end) {
+    return NULL;
+  }
+
+  return &p->stats_buf_ctx->stats_in_start[frm];
+}
+// Checks to see if a super block is on a horizontal image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+static int active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
+  int top_edge = 0;
+  int bottom_edge = cpi->common.mi_params.mi_rows;
+  int is_active_h_edge = 0;
+
+  // For two-pass, account for any formatting bars detected.
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    const AV1_COMMON *const cm = &cpi->common;
+    const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
+        &cpi->twopass, cm->current_frame.display_order_hint);
+    if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB row.
+    top_edge += (int)(this_frame_stats->inactive_zone_rows * 4);
+
+    bottom_edge -= (int)(this_frame_stats->inactive_zone_rows * 4);
+    bottom_edge = AOMMAX(top_edge, bottom_edge);
+  }
+
+  if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
+      ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
+    is_active_h_edge = 1;
+  }
+  return is_active_h_edge;
+}
+
+// Checks to see if a super block is on a vertical image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+static int active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
+  int left_edge = 0;
+  int right_edge = cpi->common.mi_params.mi_cols;
+  int is_active_v_edge = 0;
+
+  // For two-pass, account for any formatting bars detected.
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    const AV1_COMMON *const cm = &cpi->common;
+    const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
+        &cpi->twopass, cm->current_frame.display_order_hint);
+    if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB column.
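+    // (The first-pass inactive zone is reported in 16x16 MB units; one MB
+    // spans four 4-sample mi units, hence the factor of 4 below.)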
+ left_edge += (int)(this_frame_stats->inactive_zone_cols * 4); + + right_edge -= (int)(this_frame_stats->inactive_zone_cols * 4); + right_edge = AOMMAX(left_edge, right_edge); + } + + if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) || + ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) { + is_active_v_edge = 1; + } + return is_active_v_edge; +} +#endif // !CONFIG_REALTIME_ONLY + +static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { + memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); +} + +static INLINE void load_pred_mv(MACROBLOCK *x, + const PICK_MODE_CONTEXT *const ctx) { + memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); +} + +#if !CONFIG_REALTIME_ONLY +// Try searching for an encoding for the given subblock. Returns zero if the +// rdcost is already too high (to tell the caller not to bother searching for +// encodings of further subblocks) +static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, int is_last, + int mi_row, int mi_col, BLOCK_SIZE subsize, + RD_STATS best_rdcost, RD_STATS *sum_rdc, + PARTITION_TYPE partition, + PICK_MODE_CONTEXT *prev_ctx, + PICK_MODE_CONTEXT *this_ctx) { + MACROBLOCK *const x = &td->mb; + const int orig_mult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, subsize, NO_AQ, NULL); + + av1_rd_cost_update(x->rdmult, &best_rdcost); + if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, prev_ctx); + + RD_STATS rdcost_remaining; + av1_rd_stats_subtraction(x->rdmult, &best_rdcost, sum_rdc, &rdcost_remaining); + RD_STATS this_rdc; + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, partition, + subsize, this_ctx, rdcost_remaining, PICK_MODE_RD); + + if (this_rdc.rate == INT_MAX) { + sum_rdc->rdcost = INT64_MAX; + } else { + sum_rdc->rate += this_rdc.rate; + sum_rdc->dist += this_rdc.dist; + av1_rd_cost_update(x->rdmult, sum_rdc); + } + + if (sum_rdc->rdcost >= best_rdcost.rdcost) { + x->rdmult = orig_mult; + return 0; + } + + if (!is_last) { + update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL); + } + + x->rdmult = orig_mult; + return 1; +} + +static bool rd_test_partition3(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + PC_TREE *pc_tree, RD_STATS *best_rdc, + PICK_MODE_CONTEXT ctxs[3], + PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, + BLOCK_SIZE bsize, PARTITION_TYPE partition, + int mi_row0, int mi_col0, BLOCK_SIZE subsize0, + int mi_row1, int mi_col1, BLOCK_SIZE subsize1, + int mi_row2, int mi_col2, BLOCK_SIZE subsize2) { + const MACROBLOCK *const x = &td->mb; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + RD_STATS sum_rdc; + av1_init_rd_stats(&sum_rdc); + sum_rdc.rate = x->partition_cost[pl][partition]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); + if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row0, mi_col0, subsize0, + *best_rdc, &sum_rdc, partition, ctx, &ctxs[0])) + return false; + + if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row1, mi_col1, subsize1, + *best_rdc, &sum_rdc, partition, &ctxs[0], &ctxs[1])) + return false; + + if (!rd_try_subblock(cpi, td, tile_data, tp, 1, mi_row2, mi_col2, subsize2, + *best_rdc, &sum_rdc, partition, &ctxs[1], &ctxs[2])) + return false; + + av1_rd_cost_update(x->rdmult, &sum_rdc); + if (sum_rdc.rdcost >= best_rdc->rdcost) return false; + sum_rdc.rdcost = RDCOST(x->rdmult, 
sum_rdc.rate, sum_rdc.dist);
+  if (sum_rdc.rdcost >= best_rdc->rdcost) return false;
+
+  *best_rdc = sum_rdc;
+  pc_tree->partitioning = partition;
+  return true;
+}
+
+static AOM_INLINE void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
+  pc_tree->partitioning = PARTITION_NONE;
+  pc_tree->none.rd_stats.skip = 0;
+
+  if (bsize >= BLOCK_8X8) {
+    BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+    for (int idx = 0; idx < 4; ++idx)
+      reset_partition(pc_tree->split[idx], subsize);
+  }
+}
+
+// Record the ref frames that have been selected by square partition blocks.
+static AOM_INLINE void update_picked_ref_frames_mask(MACROBLOCK *const x,
+                                                     int ref_type,
+                                                     BLOCK_SIZE bsize,
+                                                     int mib_size, int mi_row,
+                                                     int mi_col) {
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+  const int sb_size_mask = mib_size - 1;
+  const int mi_row_in_sb = mi_row & sb_size_mask;
+  const int mi_col_in_sb = mi_col & sb_size_mask;
+  const int mi_size = mi_size_wide[bsize];
+  for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_size; ++i) {
+    for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_size; ++j) {
+      x->picked_ref_frames_mask[i * 32 + j] |= 1 << ref_type;
+    }
+  }
+}
+
+// Structure to keep win flags for HORZ and VERT partition evaluations.
+typedef struct {
+  bool horz_win;
+  bool vert_win;
+} RD_RECT_PART_WIN_INFO;
+
+// Decide whether to evaluate the AB partition implied by rect_part and the
+// given split indices, based on the results of the split and HORZ/VERT
+// evaluations.
+int evaluate_ab_partition_based_on_split(
+    PC_TREE *pc_tree, PARTITION_TYPE rect_part,
+    RD_RECT_PART_WIN_INFO *rect_part_win_info, int qindex, int split_idx1,
+    int split_idx2) {
+  int num_win = 0;
+  // Threshold for the number of winners.
+  // Conservative pruning for high quantizers.
+  const int num_win_thresh = AOMMIN(3 * (2 * (MAXQ - qindex) / MAXQ), 3);
+  bool sub_part_win = (rect_part_win_info == NULL)
+                          ? (pc_tree->partitioning == rect_part)
+                          : (rect_part == PARTITION_HORZ)
+                                ? rect_part_win_info->horz_win
+                                : rect_part_win_info->vert_win;
+  num_win += (sub_part_win) ? 1 : 0;
+  num_win +=
+      (pc_tree->split[split_idx1]->partitioning == PARTITION_NONE) ? 1 : 0;
+  num_win +=
+      (pc_tree->split[split_idx2]->partitioning == PARTITION_NONE) ? 1 : 0;
+  if (num_win < num_win_thresh) {
+    return 0;
+  }
+  return 1;
+}
+
+// Searches for the best partition pattern for a block based on the
+// rate-distortion cost, and returns a bool value to indicate whether a valid
+// partition pattern is found. The partition can recursively go down to
+// the smallest block size.
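+// The candidates are evaluated in the order PARTITION_NONE, PARTITION_SPLIT
+// (recursively), the rectangular partitions (HORZ/VERT), the AB partitions
+// (HORZ_A/B, VERT_A/B) and the 1:4 partitions (HORZ_4/VERT_4), with speed
+// features optionally pruning candidates or terminating the search between
+// stages.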
+// +// Inputs: +// cpi: the global compressor setting +// td: thread data +// tile_data: tile data +// tp: the pointer to the start token +// mi_row: row coordinate of the block in a step size of MI_SIZE +// mi_col: column coordinate of the block in a step size of MI_SIZE +// bsize: block size +// max_sq_part: the largest square block size for prediction blocks +// min_sq_part: the smallest square block size for prediction blocks +// rd_cost: the pointer to the final rd cost of the current block +// best_rdc: the upper bound of rd cost for a valid partition +// pc_tree: the pointer to the PC_TREE node storing the picked partitions +// and mode info for the current block +// none_rd: the pointer to the rd cost in the case of not splitting the +// current block +// multi_pass_mode: SB_SINGLE_PASS/SB_DRY_PASS/SB_WET_PASS +// rect_part_win_info: the pointer to a struct storing whether horz/vert +// partition outperforms previously tested partitions +// +// Output: +// a bool value indicating whether a valid partition is found +static bool rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + int mi_row, int mi_col, BLOCK_SIZE bsize, + BLOCK_SIZE max_sq_part, BLOCK_SIZE min_sq_part, + RD_STATS *rd_cost, RD_STATS best_rdc, + PC_TREE *pc_tree, int64_t *none_rd, + SB_MULTI_PASS_MODE multi_pass_mode, + RD_RECT_PART_WIN_INFO *rect_part_win_info) { + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int num_planes = av1_num_planes(cm); + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_step = mi_size_wide[bsize] / 2; + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + const TOKENEXTRA *const tp_orig = *tp; + PICK_MODE_CONTEXT *ctx_none = &pc_tree->none; + int tmp_partition_cost[PARTITION_TYPES]; + BLOCK_SIZE subsize; + RD_STATS this_rdc, sum_rdc; + const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8); + int do_square_split = bsize_at_least_8x8; + const int pl = bsize_at_least_8x8 + ? 
partition_plane_context(xd, mi_row, mi_col, bsize) + : 0; + const int *partition_cost = x->partition_cost[pl]; + + int do_rectangular_split = cpi->oxcf.enable_rect_partitions; + int64_t cur_none_rd = 0; + int64_t split_rd[4] = { 0, 0, 0, 0 }; + int64_t horz_rd[2] = { 0, 0 }; + int64_t vert_rd[2] = { 0, 0 }; + int prune_horz = 0; + int prune_vert = 0; + int terminate_partition_search = 0; + + int split_ctx_is_ready[2] = { 0, 0 }; + int horz_ctx_is_ready = 0; + int vert_ctx_is_ready = 0; + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); + // Initialise HORZ and VERT win flags as true for all split partitions + RD_RECT_PART_WIN_INFO split_part_rect_win[4] = { + { true, true }, { true, true }, { true, true }, { true, true } + }; + + bool found_best_partition = false; + if (best_rdc.rdcost < 0) { + av1_invalid_rd_stats(rd_cost); + return found_best_partition; + } + + if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) { + x->quad_tree_idx = 0; + x->cnn_output_valid = 0; + } + + if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0; + + // Override skipping rectangular partition operations for edge blocks + const int has_rows = (mi_row + mi_step < mi_params->mi_rows); + const int has_cols = (mi_col + mi_step < mi_params->mi_cols); + const int xss = x->e_mbd.plane[1].subsampling_x; + const int yss = x->e_mbd.plane[1].subsampling_y; + + if (none_rd) *none_rd = 0; + int partition_none_allowed = has_rows && has_cols; + int partition_horz_allowed = + has_cols && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions && + get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ), xss, + yss) != BLOCK_INVALID; + int partition_vert_allowed = + has_rows && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions && + get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT), xss, + yss) != BLOCK_INVALID; + + (void)*tp_orig; + +#if CONFIG_COLLECT_PARTITION_STATS + int partition_decisions[EXT_PARTITION_TYPES] = { 0 }; + int partition_attempts[EXT_PARTITION_TYPES] = { 0 }; + int64_t partition_times[EXT_PARTITION_TYPES] = { 0 }; + struct aom_usec_timer partition_timer = { 0 }; + int partition_timer_on = 0; +#if CONFIG_COLLECT_PARTITION_STATS == 2 + PartitionStats *part_stats = &cpi->partition_stats; +#endif +#endif + + // Override partition costs at the edges of the frame in the same + // way as in read_partition (see decodeframe.c) + if (!(has_rows && has_cols)) { + assert(bsize_at_least_8x8 && pl >= 0); + const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[pl]; + const int max_cost = av1_cost_symbol(0); + for (int i = 0; i < PARTITION_TYPES; ++i) tmp_partition_cost[i] = max_cost; + if (has_cols) { + // At the bottom, the two possibilities are HORZ and SPLIT + aom_cdf_prob bot_cdf[2]; + partition_gather_vert_alike(bot_cdf, partition_cdf, bsize); + static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT }; + av1_cost_tokens_from_cdf(tmp_partition_cost, bot_cdf, bot_inv_map); + } else if (has_rows) { + // At the right, the two possibilities are VERT and SPLIT + aom_cdf_prob rhs_cdf[2]; + partition_gather_horz_alike(rhs_cdf, partition_cdf, bsize); + static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT }; + av1_cost_tokens_from_cdf(tmp_partition_cost, rhs_cdf, rhs_inv_map); + } else { + // At the bottom right, we always split + tmp_partition_cost[PARTITION_SPLIT] = 0; + } + + partition_cost = tmp_partition_cost; + } + +#ifndef NDEBUG + // Nothing should rely on the default value of this array (which is just + // leftover from 
encoding the previous block). Set it to a fixed pattern
+  // when debugging.
+  // bit 0, 1, 2 are blk_skip of each plane
+  // bit 4, 5, 6 are initialization checking of each plane
+  memset(x->blk_skip, 0x77, sizeof(x->blk_skip));
+#endif  // NDEBUG
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+  av1_init_rd_stats(&this_rdc);
+
+  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+  // Save rdmult before it might be changed, so it can be restored later.
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+  av1_rd_cost_update(x->rdmult, &best_rdc);
+
+  if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+    x->mb_energy = av1_log_block_var(cpi, x, bsize);
+
+  if (bsize > cpi->sf.part_sf.use_square_partition_only_threshold) {
+    partition_horz_allowed &= !has_rows;
+    partition_vert_allowed &= !has_cols;
+  }
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  const int try_intra_cnn_split =
+      !cpi->is_screen_content_type && frame_is_intra_only(cm) &&
+      cpi->sf.part_sf.intra_cnn_split &&
+      cm->seq_params.sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 &&
+      bsize >= BLOCK_8X8 &&
+      mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
+      mi_col + mi_size_wide[bsize] <= mi_params->mi_cols;
+
+  if (try_intra_cnn_split) {
+    av1_intra_mode_cnn_partition(
+        &cpi->common, x, bsize, x->quad_tree_idx, &partition_none_allowed,
+        &partition_horz_allowed, &partition_vert_allowed, &do_rectangular_split,
+        &do_square_split);
+  }
+
+  // Use simple_motion_search to prune partitions. This must be done prior to
+  // PARTITION_SPLIT to propagate the initial mvs to a smaller blocksize.
+  const int try_split_only =
+      !cpi->is_screen_content_type &&
+      cpi->sf.part_sf.simple_motion_search_split && do_square_split &&
+      bsize >= BLOCK_8X8 &&
+      mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
+      mi_col + mi_size_wide[bsize] <= mi_params->mi_cols &&
+      !frame_is_intra_only(cm) && !av1_superres_scaled(cm);
+
+  if (try_split_only) {
+    av1_simple_motion_search_based_split(
+        cpi, x, pc_tree, mi_row, mi_col, bsize, &partition_none_allowed,
+        &partition_horz_allowed, &partition_vert_allowed, &do_rectangular_split,
+        &do_square_split);
+  }
+
+  const int try_prune_rect =
+      !cpi->is_screen_content_type &&
+      cpi->sf.part_sf.simple_motion_search_prune_rect &&
+      !frame_is_intra_only(cm) && do_rectangular_split &&
+      (do_square_split || partition_none_allowed ||
+       (prune_horz && prune_vert)) &&
+      (partition_horz_allowed || partition_vert_allowed) && bsize >= BLOCK_8X8;
+
+  if (try_prune_rect) {
+    av1_simple_motion_search_prune_rect(
+        cpi, x, pc_tree, mi_row, mi_col, bsize, &partition_horz_allowed,
+        &partition_vert_allowed, &prune_horz, &prune_vert);
+  }
+
+  // Max and min square partition levels are defined as the partition nodes that
+  // the recursive function rd_pick_partition() can reach. To implement this:
+  // only PARTITION_NONE is allowed if the current node equals min_sq_part,
+  // only PARTITION_SPLIT is allowed if the current node exceeds max_sq_part.
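+  // For example, with min_sq_part == BLOCK_8X8 and max_sq_part == BLOCK_64X64
+  // a 128x128 node is forced to split, while an 8x8 node may only use
+  // PARTITION_NONE (unless it straddles the picture boundary, in which case
+  // the split flag is inherited below).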
+  assert(block_size_wide[min_sq_part] == block_size_high[min_sq_part]);
+  assert(block_size_wide[max_sq_part] == block_size_high[max_sq_part]);
+  assert(min_sq_part <= max_sq_part);
+  assert(block_size_wide[bsize] == block_size_high[bsize]);
+  const int max_partition_size = block_size_wide[max_sq_part];
+  const int min_partition_size = block_size_wide[min_sq_part];
+  const int blksize = block_size_wide[bsize];
+  assert(min_partition_size <= max_partition_size);
+  const int is_le_min_sq_part = blksize <= min_partition_size;
+  const int is_gt_max_sq_part = blksize > max_partition_size;
+  if (is_gt_max_sq_part) {
+    // If the current block size is larger than max, only allow split.
+    partition_none_allowed = 0;
+    partition_horz_allowed = 0;
+    partition_vert_allowed = 0;
+    do_square_split = 1;
+  } else if (is_le_min_sq_part) {
+    // If the current block size is less than or equal to min, only allow none
+    // when the block is valid (fully inside the picture); only allow split
+    // otherwise.
+    partition_horz_allowed = 0;
+    partition_vert_allowed = 0;
+    // Only disable square split when the current block is not at the picture
+    // boundary; otherwise, inherit the square split flag from the previous
+    // logic.
+    if (has_rows && has_cols) do_square_split = 0;
+    partition_none_allowed = !do_square_split;
+  }
+
+BEGIN_PARTITION_SEARCH:
+  if (x->must_find_valid_partition) {
+    do_square_split = bsize_at_least_8x8 && (blksize > min_partition_size);
+    partition_none_allowed =
+        has_rows && has_cols && (blksize >= min_partition_size);
+    partition_horz_allowed =
+        has_cols && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions &&
+        (blksize > min_partition_size) &&
+        get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ), xss,
+                             yss) != BLOCK_INVALID;
+    partition_vert_allowed =
+        has_rows && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions &&
+        (blksize > min_partition_size) &&
+        get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT), xss,
+                             yss) != BLOCK_INVALID;
+    terminate_partition_search = 0;
+  }
+
+  // Partition block source pixel variance.
+  unsigned int pb_source_variance = UINT_MAX;
+
+  // Partition block SSE after simple motion compensation; not in use now,
+  // but will be used for upcoming speed features.
+  unsigned int pb_simple_motion_pred_sse = UINT_MAX;
+  (void)pb_simple_motion_pred_sse;
+
+  // PARTITION_NONE
+  if (is_le_min_sq_part && has_rows && has_cols) partition_none_allowed = 1;
+  assert(terminate_partition_search == 0);
+  int64_t part_none_rd = INT64_MAX;
+  if (cpi->is_screen_content_type)
+    partition_none_allowed = has_rows && has_cols;
+  if (partition_none_allowed && !is_gt_max_sq_part) {
+    int pt_cost = 0;
+    if (bsize_at_least_8x8) {
+      pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
+                    ? partition_cost[PARTITION_NONE]
+                    : 0;
+    }
+    RD_STATS partition_rdcost;
+    av1_init_rd_stats(&partition_rdcost);
+    partition_rdcost.rate = pt_cost;
+    av1_rd_cost_update(x->rdmult, &partition_rdcost);
+    RD_STATS best_remain_rdcost;
+    av1_rd_stats_subtraction(x->rdmult, &best_rdc, &partition_rdcost,
+                             &best_remain_rdcost);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_remain_rdcost.rdcost >= 0) {
+      partition_attempts[PARTITION_NONE] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_NONE,
+                  bsize, ctx_none, best_remain_rdcost, PICK_MODE_RD);
+    av1_rd_cost_update(x->rdmult, &this_rdc);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_NONE] += time;
+      partition_timer_on = 0;
+    }
+#endif
+    pb_source_variance = x->source_variance;
+    pb_simple_motion_pred_sse = x->simple_motion_pred_sse;
+    if (none_rd) *none_rd = this_rdc.rdcost;
+    cur_none_rd = this_rdc.rdcost;
+    if (this_rdc.rate != INT_MAX) {
+      if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions) {
+        const int ref_type = av1_ref_frame_type(ctx_none->mic.ref_frame);
+        update_picked_ref_frames_mask(x, ref_type, bsize,
+                                      cm->seq_params.mib_size, mi_row, mi_col);
+      }
+      if (bsize_at_least_8x8) {
+        this_rdc.rate += pt_cost;
+        this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+      }
+
+      part_none_rd = this_rdc.rdcost;
+      if (this_rdc.rdcost < best_rdc.rdcost) {
+        // Adjust the dist breakout threshold according to the partition size.
+        const int64_t dist_breakout_thr =
+            cpi->sf.part_sf.partition_search_breakout_dist_thr >>
+            ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
+             (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
+        const int rate_breakout_thr =
+            cpi->sf.part_sf.partition_search_breakout_rate_thr *
+            num_pels_log2_lookup[bsize];
+
+        best_rdc = this_rdc;
+        found_best_partition = true;
+        if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
+
+        if (!frame_is_intra_only(cm) &&
+            (do_square_split || do_rectangular_split) &&
+            !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
+          const int use_ml_based_breakout =
+              bsize <= cpi->sf.part_sf.use_square_partition_only_threshold &&
+              bsize > BLOCK_4X4 && xd->bd == 8;
+          if (use_ml_based_breakout) {
+            if (av1_ml_predict_breakout(cpi, bsize, x, &this_rdc,
+                                        pb_source_variance)) {
+              do_square_split = 0;
+              do_rectangular_split = 0;
+            }
+          }
+
+          // If all y, u, v transform blocks in this partition are skippable,
+          // and the dist & rate are within the thresholds, the partition
+          // search is terminated for the current branch of the partition
+          // search tree. The dist & rate thresholds are set to 0 at speed 0
+          // to disable the early termination at that speed.
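+          // (The shift applied to dist_breakout_thr above scales it with
+          // block area: assuming MAX_SB_SIZE_LOG2 == 7, a 128x128 block keeps
+          // the full threshold and each quartering of the area shifts it
+          // right by two more bits, down to a shift of 8 for an 8x8 block.)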
+ if (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr) { + do_square_split = 0; + do_rectangular_split = 0; + } + } + + if (cpi->sf.part_sf.simple_motion_search_early_term_none && + cm->show_frame && !frame_is_intra_only(cm) && + bsize >= BLOCK_16X16 && mi_row + mi_step < mi_params->mi_rows && + mi_col + mi_step < mi_params->mi_cols && + this_rdc.rdcost < INT64_MAX && this_rdc.rdcost >= 0 && + this_rdc.rate < INT_MAX && this_rdc.rate >= 0 && + (do_square_split || do_rectangular_split)) { + av1_simple_motion_search_early_term_none(cpi, x, pc_tree, mi_row, + mi_col, bsize, &this_rdc, + &terminate_partition_search); + } + } + } + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } + + // store estimated motion vector + if (cpi->sf.mv_sf.adaptive_motion_search) store_pred_mv(x, ctx_none); + + // PARTITION_SPLIT + int64_t part_split_rd = INT64_MAX; + if ((!terminate_partition_search && do_square_split) || is_gt_max_sq_part) { + av1_init_rd_stats(&sum_rdc); + subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + sum_rdc.rate = partition_cost[PARTITION_SPLIT]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); + + int idx; +#if CONFIG_COLLECT_PARTITION_STATS + if (best_rdc.rdcost - sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_SPLIT] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; + } +#endif + for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) { + const int x_idx = (idx & 1) * mi_step; + const int y_idx = (idx >> 1) * mi_step; + + if (mi_row + y_idx >= mi_params->mi_rows || + mi_col + x_idx >= mi_params->mi_cols) + continue; + + if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none); + + pc_tree->split[idx]->index = idx; + int64_t *p_split_rd = &split_rd[idx]; + + RD_STATS best_remain_rdcost; + av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc, + &best_remain_rdcost); + + int curr_quad_tree_idx = 0; + if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) { + curr_quad_tree_idx = x->quad_tree_idx; + x->quad_tree_idx = 4 * curr_quad_tree_idx + idx + 1; + } + if (!rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, + mi_col + x_idx, subsize, max_sq_part, min_sq_part, + &this_rdc, best_remain_rdcost, pc_tree->split[idx], + p_split_rd, multi_pass_mode, + &split_part_rect_win[idx])) { + av1_invalid_rd_stats(&sum_rdc); + break; + } + if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) { + x->quad_tree_idx = curr_quad_tree_idx; + } + + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + av1_rd_cost_update(x->rdmult, &sum_rdc); + if (idx <= 1 && (bsize <= BLOCK_8X8 || + pc_tree->split[idx]->partitioning == PARTITION_NONE)) { + const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none.mic; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + // Neither palette mode nor cfl predicted + if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) { + if (mbmi->uv_mode != UV_CFL_PRED) split_ctx_is_ready[idx] = 1; + } + } + } +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_SPLIT] += time; + partition_timer_on = 0; + } +#endif + const int reached_last_index = (idx == 4); + + part_split_rd = sum_rdc.rdcost; + if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) { + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; + 
found_best_partition = true;
+        pc_tree->partitioning = PARTITION_SPLIT;
+      }
+    } else if (cpi->sf.part_sf.less_rectangular_check_level > 0) {
+      // Skip the rectangular partition test when partition type none gives
+      // better rd than partition type split.
+      if (cpi->sf.part_sf.less_rectangular_check_level == 2 || idx <= 2) {
+        const int partition_none_valid = cur_none_rd > 0;
+        const int partition_none_better = cur_none_rd < sum_rdc.rdcost;
+        do_rectangular_split &=
+            !(partition_none_valid && partition_none_better);
+      }
+    }
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+  }  // if (do_split)
+
+  if (cpi->sf.part_sf.ml_early_term_after_part_split_level &&
+      !frame_is_intra_only(cm) && !terminate_partition_search &&
+      do_rectangular_split &&
+      (partition_horz_allowed || partition_vert_allowed)) {
+    av1_ml_early_term_after_split(cpi, x, pc_tree, bsize, best_rdc.rdcost,
+                                  part_none_rd, part_split_rd, split_rd, mi_row,
+                                  mi_col, &terminate_partition_search);
+  }
+
+  if (!cpi->sf.part_sf.ml_early_term_after_part_split_level &&
+      cpi->sf.part_sf.ml_prune_rect_partition && !frame_is_intra_only(cm) &&
+      (partition_horz_allowed || partition_vert_allowed) &&
+      !(prune_horz || prune_vert) && !terminate_partition_search) {
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+    av1_ml_prune_rect_partition(cpi, x, bsize, best_rdc.rdcost, cur_none_rd,
+                                split_rd, &prune_horz, &prune_vert);
+  }
+
+  // PARTITION_HORZ
+  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_horz_allowed));
+  if (!terminate_partition_search && partition_horz_allowed && !prune_horz &&
+      (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step)) &&
+      !is_gt_max_sq_part) {
+    av1_init_rd_stats(&sum_rdc);
+    subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+    if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+    sum_rdc.rate = partition_cost[PARTITION_HORZ];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+    RD_STATS best_remain_rdcost;
+    av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
+                             &best_remain_rdcost);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_remain_rdcost.rdcost >= 0) {
+      partition_attempts[PARTITION_HORZ] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_HORZ,
+                  subsize, &pc_tree->horizontal[0], best_remain_rdcost,
+                  PICK_MODE_RD);
+    av1_rd_cost_update(x->rdmult, &this_rdc);
+
+    if (this_rdc.rate == INT_MAX) {
+      sum_rdc.rdcost = INT64_MAX;
+    } else {
+      sum_rdc.rate += this_rdc.rate;
+      sum_rdc.dist += this_rdc.dist;
+      av1_rd_cost_update(x->rdmult, &sum_rdc);
+    }
+    horz_rd[0] = this_rdc.rdcost;
+
+    if (sum_rdc.rdcost < best_rdc.rdcost && has_rows) {
+      const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
+      const MB_MODE_INFO *const mbmi = &pc_tree->horizontal[0].mic;
+      const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+      // Neither palette mode nor cfl predicted.
+      if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+        if (mbmi->uv_mode != UV_CFL_PRED) horz_ctx_is_ready = 1;
+      }
+      update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
+      encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
+
+      if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_h);
+
+      av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
+                               &best_remain_rdcost);
+
+      pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
+                    PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+                    best_remain_rdcost, PICK_MODE_RD);
+      av1_rd_cost_update(x->rdmult, &this_rdc);
+      horz_rd[1] = this_rdc.rdcost;
+
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        av1_rd_cost_update(x->rdmult, &sum_rdc);
+      }
+    }
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_HORZ] += time;
+      partition_timer_on = 0;
+    }
+#endif
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
+        found_best_partition = true;
+        pc_tree->partitioning = PARTITION_HORZ;
+      }
+    } else {
+      // Update the HORZ win flag.
+      if (rect_part_win_info != NULL) {
+        rect_part_win_info->horz_win = false;
+      }
+    }
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+  }
+
+  // PARTITION_VERT
+  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_vert_allowed));
+  if (!terminate_partition_search && partition_vert_allowed && !prune_vert &&
+      (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step)) &&
+      !is_gt_max_sq_part) {
+    av1_init_rd_stats(&sum_rdc);
+    subsize = get_partition_subsize(bsize, PARTITION_VERT);
+
+    if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+    sum_rdc.rate = partition_cost[PARTITION_VERT];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+    RD_STATS best_remain_rdcost;
+    av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
+                             &best_remain_rdcost);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_remain_rdcost.rdcost >= 0) {
+      partition_attempts[PARTITION_VERT] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_VERT,
+                  subsize, &pc_tree->vertical[0], best_remain_rdcost,
+                  PICK_MODE_RD);
+    av1_rd_cost_update(x->rdmult, &this_rdc);
+
+    if (this_rdc.rate == INT_MAX) {
+      sum_rdc.rdcost = INT64_MAX;
+    } else {
+      sum_rdc.rate += this_rdc.rate;
+      sum_rdc.dist += this_rdc.dist;
+      av1_rd_cost_update(x->rdmult, &sum_rdc);
+    }
+    vert_rd[0] = this_rdc.rdcost;
+    if (sum_rdc.rdcost < best_rdc.rdcost && has_cols) {
+      const MB_MODE_INFO *const mbmi = &pc_tree->vertical[0].mic;
+      const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+      // Neither palette mode nor cfl predicted.
+      if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+        if (mbmi->uv_mode != UV_CFL_PRED) vert_ctx_is_ready = 1;
+      }
+      update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1);
+      encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
+
+      if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+      av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
+                               &best_remain_rdcost);
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
+                    PARTITION_VERT, subsize, &pc_tree->vertical[1],
+                    best_remain_rdcost, PICK_MODE_RD);
+      av1_rd_cost_update(x->rdmult, &this_rdc);
+      vert_rd[1] = this_rdc.rdcost;
+
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        av1_rd_cost_update(x->rdmult, &sum_rdc);
+      }
+    }
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_VERT] += time;
+
partition_timer_on = 0; + } +#endif + + av1_rd_cost_update(x->rdmult, &sum_rdc); + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; + found_best_partition = true; + pc_tree->partitioning = PARTITION_VERT; + } else { + // Update VERT win flag + if (rect_part_win_info != NULL) { + rect_part_win_info->vert_win = false; + } + } + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } + + if (pb_source_variance == UINT_MAX) { + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); + if (is_cur_buf_hbd(xd)) { + pb_source_variance = av1_high_get_sby_perpixel_variance( + cpi, &x->plane[0].src, bsize, xd->bd); + } else { + pb_source_variance = + av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + } + } + + if (use_pb_simple_motion_pred_sse(cpi) && + pb_simple_motion_pred_sse == UINT_MAX) { + const FULLPEL_MV start_mv = kZeroFullMv; + unsigned int var = 0; + + av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, start_mv, 0, + &pb_simple_motion_pred_sse, &var); + } + + assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !do_rectangular_split)); + + const int ext_partition_allowed = + do_rectangular_split && + bsize > cpi->sf.part_sf.ext_partition_eval_thresh && has_rows && has_cols; + + // The standard AB partitions are allowed whenever ext-partition-types are + // allowed + int horzab_partition_allowed = + ext_partition_allowed & cpi->oxcf.enable_ab_partitions; + int vertab_partition_allowed = + ext_partition_allowed & cpi->oxcf.enable_ab_partitions; + + if (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 1) { + // TODO(debargha,huisu@google.com): may need to tune the threshold for + // pb_source_variance. + horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || + (pc_tree->partitioning == PARTITION_NONE && + pb_source_variance < 32) || + pc_tree->partitioning == PARTITION_SPLIT); + vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT || + (pc_tree->partitioning == PARTITION_NONE && + pb_source_variance < 32) || + pc_tree->partitioning == PARTITION_SPLIT); + } else { + horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || + pc_tree->partitioning == PARTITION_SPLIT); + vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT || + pc_tree->partitioning == PARTITION_SPLIT); + } + horz_rd[0] = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0); + horz_rd[1] = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0); + vert_rd[0] = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0); + vert_rd[1] = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0); + split_rd[0] = (split_rd[0] < INT64_MAX ? split_rd[0] : 0); + split_rd[1] = (split_rd[1] < INT64_MAX ? split_rd[1] : 0); + split_rd[2] = (split_rd[2] < INT64_MAX ? split_rd[2] : 0); + split_rd[3] = (split_rd[3] < INT64_MAX ? 
split_rd[3] : 0); + } + int horza_partition_allowed = horzab_partition_allowed; + int horzb_partition_allowed = horzab_partition_allowed; + if (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1]; + const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3]; + switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + case 1: + horza_partition_allowed &= (horz_a_rd / 16 * 14 < best_rdc.rdcost); + horzb_partition_allowed &= (horz_b_rd / 16 * 14 < best_rdc.rdcost); + break; + case 2: + default: + horza_partition_allowed &= (horz_a_rd / 16 * 15 < best_rdc.rdcost); + horzb_partition_allowed &= (horz_b_rd / 16 * 15 < best_rdc.rdcost); + break; + } + } + + int verta_partition_allowed = vertab_partition_allowed; + int vertb_partition_allowed = vertab_partition_allowed; + if (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2]; + const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3]; + switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + case 1: + verta_partition_allowed &= (vert_a_rd / 16 * 14 < best_rdc.rdcost); + vertb_partition_allowed &= (vert_b_rd / 16 * 14 < best_rdc.rdcost); + break; + case 2: + default: + verta_partition_allowed &= (vert_a_rd / 16 * 15 < best_rdc.rdcost); + vertb_partition_allowed &= (vert_b_rd / 16 * 15 < best_rdc.rdcost); + break; + } + } + + if (cpi->sf.part_sf.ml_prune_ab_partition && ext_partition_allowed && + partition_horz_allowed && partition_vert_allowed) { + // TODO(huisu@google.com): x->source_variance may not be the current + // block's variance. The correct one to use is pb_source_variance. Need to + // re-train the model to fix it. + av1_ml_prune_ab_partition( + bsize, pc_tree->partitioning, get_unsigned_bits(x->source_variance), + best_rdc.rdcost, horz_rd, vert_rd, split_rd, &horza_partition_allowed, + &horzb_partition_allowed, &verta_partition_allowed, + &vertb_partition_allowed); + } + + horza_partition_allowed &= cpi->oxcf.enable_ab_partitions; + horzb_partition_allowed &= cpi->oxcf.enable_ab_partitions; + verta_partition_allowed &= cpi->oxcf.enable_ab_partitions; + vertb_partition_allowed &= cpi->oxcf.enable_ab_partitions; + + if (cpi->sf.part_sf.prune_ab_partition_using_split_info && + horza_partition_allowed) { + horza_partition_allowed &= evaluate_ab_partition_based_on_split( + pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 0, 1); + } + + // PARTITION_HORZ_A + if (!terminate_partition_search && partition_horz_allowed && + horza_partition_allowed && !is_gt_max_sq_part) { + subsize = get_partition_subsize(bsize, PARTITION_HORZ_A); + pc_tree->horizontala[0].rd_mode_is_ready = 0; + pc_tree->horizontala[1].rd_mode_is_ready = 0; + pc_tree->horizontala[2].rd_mode_is_ready = 0; + if (split_ctx_is_ready[0]) { + av1_copy_tree_context(&pc_tree->horizontala[0], &pc_tree->split[0]->none); + pc_tree->horizontala[0].mic.partition = PARTITION_HORZ_A; + pc_tree->horizontala[0].rd_mode_is_ready = 1; + if (split_ctx_is_ready[1]) { + av1_copy_tree_context(&pc_tree->horizontala[1], + &pc_tree->split[1]->none); + pc_tree->horizontala[1].mic.partition = PARTITION_HORZ_A; + pc_tree->horizontala[1].rd_mode_is_ready = 1; + } + } +#if CONFIG_COLLECT_PARTITION_STATS + { + RD_STATS tmp_sum_rdc; + av1_init_rd_stats(&tmp_sum_rdc); + tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_HORZ_A]; + tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0); + if (best_rdc.rdcost - 
tmp_sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_HORZ_A] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; + } + } +#endif + found_best_partition |= rd_test_partition3( + cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontala, + ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_A, mi_row, mi_col, + bsize2, mi_row, mi_col + mi_step, bsize2, mi_row + mi_step, mi_col, + subsize); +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_HORZ_A] += time; + partition_timer_on = 0; + } +#endif + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } + + if (cpi->sf.part_sf.prune_ab_partition_using_split_info && + horzb_partition_allowed) { + horzb_partition_allowed &= evaluate_ab_partition_based_on_split( + pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 2, 3); + } + + // PARTITION_HORZ_B + if (!terminate_partition_search && partition_horz_allowed && + horzb_partition_allowed && !is_gt_max_sq_part) { + subsize = get_partition_subsize(bsize, PARTITION_HORZ_B); + pc_tree->horizontalb[0].rd_mode_is_ready = 0; + pc_tree->horizontalb[1].rd_mode_is_ready = 0; + pc_tree->horizontalb[2].rd_mode_is_ready = 0; + if (horz_ctx_is_ready) { + av1_copy_tree_context(&pc_tree->horizontalb[0], &pc_tree->horizontal[0]); + pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B; + pc_tree->horizontalb[0].rd_mode_is_ready = 1; + } +#if CONFIG_COLLECT_PARTITION_STATS + { + RD_STATS tmp_sum_rdc; + av1_init_rd_stats(&tmp_sum_rdc); + tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_HORZ_B]; + tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0); + if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_HORZ_B] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; + } + } +#endif + found_best_partition |= rd_test_partition3( + cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontalb, + ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_B, mi_row, mi_col, + subsize, mi_row + mi_step, mi_col, bsize2, mi_row + mi_step, + mi_col + mi_step, bsize2); + +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_HORZ_B] += time; + partition_timer_on = 0; + } +#endif + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } + + if (cpi->sf.part_sf.prune_ab_partition_using_split_info && + verta_partition_allowed) { + verta_partition_allowed &= evaluate_ab_partition_based_on_split( + pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 0, 2); + } + + // PARTITION_VERT_A + if (!terminate_partition_search && partition_vert_allowed && + verta_partition_allowed && !is_gt_max_sq_part) { + subsize = get_partition_subsize(bsize, PARTITION_VERT_A); + pc_tree->verticala[0].rd_mode_is_ready = 0; + pc_tree->verticala[1].rd_mode_is_ready = 0; + pc_tree->verticala[2].rd_mode_is_ready = 0; + if (split_ctx_is_ready[0]) { + av1_copy_tree_context(&pc_tree->verticala[0], &pc_tree->split[0]->none); + pc_tree->verticala[0].mic.partition = PARTITION_VERT_A; + pc_tree->verticala[0].rd_mode_is_ready = 1; + } +#if CONFIG_COLLECT_PARTITION_STATS + { + RD_STATS tmp_sum_rdc; + av1_init_rd_stats(&tmp_sum_rdc); + tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_VERT_A]; + tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0); + if (best_rdc.rdcost - 
tmp_sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_VERT_A] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; + } + } +#endif + found_best_partition |= rd_test_partition3( + cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticala, + ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_A, mi_row, mi_col, + bsize2, mi_row + mi_step, mi_col, bsize2, mi_row, mi_col + mi_step, + subsize); +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_VERT_A] += time; + partition_timer_on = 0; + } +#endif + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } + + if (cpi->sf.part_sf.prune_ab_partition_using_split_info && + vertb_partition_allowed) { + vertb_partition_allowed &= evaluate_ab_partition_based_on_split( + pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 1, 3); + } + + // PARTITION_VERT_B + if (!terminate_partition_search && partition_vert_allowed && + vertb_partition_allowed && !is_gt_max_sq_part) { + subsize = get_partition_subsize(bsize, PARTITION_VERT_B); + pc_tree->verticalb[0].rd_mode_is_ready = 0; + pc_tree->verticalb[1].rd_mode_is_ready = 0; + pc_tree->verticalb[2].rd_mode_is_ready = 0; + if (vert_ctx_is_ready) { + av1_copy_tree_context(&pc_tree->verticalb[0], &pc_tree->vertical[0]); + pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B; + pc_tree->verticalb[0].rd_mode_is_ready = 1; + } +#if CONFIG_COLLECT_PARTITION_STATS + { + RD_STATS tmp_sum_rdc; + av1_init_rd_stats(&tmp_sum_rdc); + tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_VERT_B]; + tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0); + if (!frame_is_intra_only(cm) && + best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_VERT_B] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; + } + } +#endif + found_best_partition |= rd_test_partition3( + cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticalb, + ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_B, mi_row, mi_col, + subsize, mi_row, mi_col + mi_step, bsize2, mi_row + mi_step, + mi_col + mi_step, bsize2); +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_VERT_B] += time; + partition_timer_on = 0; + } +#endif + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } + + // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or + // PARTITION_VERT_4 for this block. This is almost the same as + // ext_partition_allowed, except that we don't allow 128x32 or 32x128 + // blocks, so we require that bsize is not BLOCK_128X128. 
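+  // The get_plane_block_size() checks below additionally rule out 1:4 shapes
+  // whose chroma block would be invalid under the active subsampling (e.g.
+  // with 4:2:0 sampling, a 16x4 luma block would map to 8x2 chroma, which is
+  // not a valid block size).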
+  const int partition4_allowed = cpi->oxcf.enable_1to4_partitions &&
+                                 ext_partition_allowed &&
+                                 bsize != BLOCK_128X128;
+
+  int partition_horz4_allowed =
+      partition4_allowed && partition_horz_allowed &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ_4), xss,
+                           yss) != BLOCK_INVALID;
+  int partition_vert4_allowed =
+      partition4_allowed && partition_vert_allowed &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT_4), xss,
+                           yss) != BLOCK_INVALID;
+  if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 2) {
+    partition_horz4_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+                                pc_tree->partitioning == PARTITION_HORZ_A ||
+                                pc_tree->partitioning == PARTITION_HORZ_B ||
+                                pc_tree->partitioning == PARTITION_SPLIT ||
+                                pc_tree->partitioning == PARTITION_NONE);
+    partition_vert4_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+                                pc_tree->partitioning == PARTITION_VERT_A ||
+                                pc_tree->partitioning == PARTITION_VERT_B ||
+                                pc_tree->partitioning == PARTITION_SPLIT ||
+                                pc_tree->partitioning == PARTITION_NONE);
+  }
+  if (cpi->sf.part_sf.ml_prune_4_partition && partition4_allowed &&
+      partition_horz_allowed && partition_vert_allowed) {
+    av1_ml_prune_4_partition(cpi, x, bsize, pc_tree->partitioning,
+                             best_rdc.rdcost, horz_rd, vert_rd, split_rd,
+                             &partition_horz4_allowed, &partition_vert4_allowed,
+                             pb_source_variance, mi_row, mi_col);
+  }
+
+  if (blksize < (min_partition_size << 2)) {
+    partition_horz4_allowed = 0;
+    partition_vert4_allowed = 0;
+  }
+
+  if (cpi->sf.part_sf.prune_4_partition_using_split_info &&
+      (partition_horz4_allowed || partition_vert4_allowed)) {
+    // Count of child blocks in which HORZ or VERT partition has won.
+    int num_child_horz_win = 0, num_child_vert_win = 0;
+    for (int idx = 0; idx < 4; idx++) {
+      num_child_horz_win += (split_part_rect_win[idx].horz_win) ? 1 : 0;
+      num_child_vert_win += (split_part_rect_win[idx].vert_win) ? 1 : 0;
+    }
+
+    // Prune HORZ4/VERT4 partitions based on the number of HORZ/VERT winners
+    // of split partitions.
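+    // (num_win_thresh below ranges from 3 at qindex 0 down to 1 at
+    // qindex == MAXQ.)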
+    // Conservative pruning for high quantizers.
+    const int num_win_thresh = AOMMIN(3 * (MAXQ - x->qindex) / MAXQ + 1, 3);
+    if (num_child_horz_win < num_win_thresh) {
+      partition_horz4_allowed = 0;
+    }
+    if (num_child_vert_win < num_win_thresh) {
+      partition_vert4_allowed = 0;
+    }
+  }
+
+  // PARTITION_HORZ_4
+  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_horz4_allowed));
+  if (!terminate_partition_search && partition_horz4_allowed && has_rows &&
+      (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step)) &&
+      !is_gt_max_sq_part) {
+    av1_init_rd_stats(&sum_rdc);
+    const int quarter_step = mi_size_high[bsize] / 4;
+    PICK_MODE_CONTEXT *ctx_prev = ctx_none;
+
+    subsize = get_partition_subsize(bsize, PARTITION_HORZ_4);
+    sum_rdc.rate = partition_cost[PARTITION_HORZ_4];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_rdc.rdcost - sum_rdc.rdcost >= 0) {
+      partition_attempts[PARTITION_HORZ_4] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
+    for (int i = 0; i < 4; ++i) {
+      const int this_mi_row = mi_row + i * quarter_step;
+
+      if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
+
+      PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i];
+
+      ctx_this->rd_mode_is_ready = 0;
+      if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), this_mi_row,
+                           mi_col, subsize, best_rdc, &sum_rdc,
+                           PARTITION_HORZ_4, ctx_prev, ctx_this)) {
+        av1_invalid_rd_stats(&sum_rdc);
+        break;
+      }
+
+      ctx_prev = ctx_this;
+    }
+
+    av1_rd_cost_update(x->rdmult, &sum_rdc);
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
+      found_best_partition = true;
+      pc_tree->partitioning = PARTITION_HORZ_4;
+    }
+
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_HORZ_4] += time;
+      partition_timer_on = 0;
+    }
+#endif
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+  }
+
+  // PARTITION_VERT_4
+  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_vert4_allowed));
+  if (!terminate_partition_search && partition_vert4_allowed && has_cols &&
+      (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step)) &&
+      !is_gt_max_sq_part) {
+    av1_init_rd_stats(&sum_rdc);
+    const int quarter_step = mi_size_wide[bsize] / 4;
+    PICK_MODE_CONTEXT *ctx_prev = ctx_none;
+
+    subsize = get_partition_subsize(bsize, PARTITION_VERT_4);
+    sum_rdc.rate = partition_cost[PARTITION_VERT_4];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_rdc.rdcost - sum_rdc.rdcost >= 0) {
+      partition_attempts[PARTITION_VERT_4] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
+    for (int i = 0; i < 4; ++i) {
+      const int this_mi_col = mi_col + i * quarter_step;
+
+      if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
+
+      PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i];
+
+      ctx_this->rd_mode_is_ready = 0;
+      if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), mi_row,
+                           this_mi_col, subsize, best_rdc, &sum_rdc,
+                           PARTITION_VERT_4, ctx_prev, ctx_this)) {
+        av1_invalid_rd_stats(&sum_rdc);
+        break;
+      }
+
+      ctx_prev = ctx_this;
+    }
+
+    av1_rd_cost_update(x->rdmult, &sum_rdc);
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
+      found_best_partition = true;
+      pc_tree->partitioning = PARTITION_VERT_4;
+    }
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_VERT_4] += time;
+      partition_timer_on = 0;
+    }
+#endif
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+  }
+
+  if (bsize == cm->seq_params.sb_size && !found_best_partition) {
+    // Did not find a valid partition; go back and search again with fewer
+    // constraints on which partition types to search.
+    x->must_find_valid_partition = 1;
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+    part_stats->partition_redo += 1;
+#endif
+    goto BEGIN_PARTITION_SEARCH;
+  }
+
+  *rd_cost = best_rdc;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) {
+    partition_decisions[pc_tree->partitioning] += 1;
+  }
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS == 1
+  // If CONFIG_COLLECT_PARTITION_STATS is 1, then print out the stats for each
+  // prediction block.
+  FILE *f = fopen("data.csv", "a");
+  fprintf(f, "%d,%d,%d,", bsize, cm->show_frame, frame_is_intra_only(cm));
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%d,", partition_decisions[idx]);
+  }
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%d,", partition_attempts[idx]);
+  }
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%" PRId64 ",", partition_times[idx]);
+  }
+  fprintf(f, "\n");
+  fclose(f);
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+  // If CONFIG_COLLECT_PARTITION_STATS is 2, then we print out the stats for
+  // the whole clip, so we need to pass the information upstream to the
+  // encoder.
+  const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize);
+  int *agg_attempts = part_stats->partition_attempts[bsize_idx];
+  int *agg_decisions = part_stats->partition_decisions[bsize_idx];
+  int64_t *agg_times = part_stats->partition_times[bsize_idx];
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    agg_attempts[idx] += partition_attempts[idx];
+    agg_decisions[idx] += partition_decisions[idx];
+    agg_times[idx] += partition_times[idx];
+  }
+#endif
+
+  if (found_best_partition && pc_tree->index != 3) {
+    if (bsize == cm->seq_params.sb_size) {
+      const int emit_output = multi_pass_mode != SB_DRY_PASS;
+      const RUN_TYPE run_type = emit_output ?
OUTPUT_ENABLED : DRY_RUN_NORMAL; + + x->cb_offset = 0; + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize, + pc_tree, NULL); + } else { + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } + } + + if (bsize == cm->seq_params.sb_size) { + assert(best_rdc.rate < INT_MAX); + assert(best_rdc.dist < INT64_MAX); + } else { + assert(tp_orig == *tp); + } + + x->rdmult = orig_rdmult; + return found_best_partition; +} +#endif // !CONFIG_REALTIME_ONLY +#undef NUM_SIMPLE_MOTION_FEATURES + +#if !CONFIG_REALTIME_ONLY + +static int get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int analysis_type, + int mi_row, int mi_col, int orig_rdmult) { + AV1_COMMON *const cm = &cpi->common; + assert(IMPLIES(cpi->gf_group.size > 0, + cpi->gf_group.index < cpi->gf_group.size)); + const int tpl_idx = cpi->gf_group.index; + TplParams *const tpl_data = &cpi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + int tpl_stride = tpl_frame->stride; + int64_t intra_cost = 0; + int64_t mc_dep_cost = 0; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + if (tpl_frame->is_valid == 0) return orig_rdmult; + + if (!is_frame_tpl_eligible(cpi)) return orig_rdmult; + + if (cpi->gf_group.index >= MAX_LAG_BUFFERS) return orig_rdmult; + + int64_t mc_count = 0, mc_saved = 0; + int mi_count = 0; + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + const int step = 1 << block_mis_log2; + for (int row = mi_row; row < mi_row + mi_high; row += step) { + for (int col = mi_col_sr; col < mi_col_end_sr; col += step) { + if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue; + TplDepStats *this_stats = + &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)]; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + intra_cost += this_stats->recrf_dist << RDDIV_BITS; + mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta; + mc_count += this_stats->mc_count; + mc_saved += this_stats->mc_saved; + mi_count++; + } + } + + aom_clear_system_state(); + + double beta = 1.0; + if (analysis_type == 0) { + if (mc_dep_cost > 0 && intra_cost > 0) { + const double r0 = cpi->rd.r0; + const double rk = (double)intra_cost / mc_dep_cost; + beta = (r0 / rk); + } + } else if (analysis_type == 1) { + const double mc_count_base = (mi_count * cpi->rd.mc_count_base); + beta = (mc_count + 1.0) / (mc_count_base + 1.0); + beta = pow(beta, 0.5); + } else if (analysis_type == 2) { + const double mc_saved_base = (mi_count * cpi->rd.mc_saved_base); + beta = (mc_saved + 1.0) / (mc_saved_base + 1.0); + beta = pow(beta, 0.5); + } + + int rdmult = av1_get_adaptive_rdmult(cpi, beta); + + aom_clear_system_state(); + + rdmult = AOMMIN(rdmult, orig_rdmult * 3 / 2); + rdmult = AOMMAX(rdmult, orig_rdmult * 1 / 2); + + rdmult = AOMMAX(1, rdmult); + + return rdmult; +} + +static int get_tpl_stats_b(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col, int64_t *intra_cost_b, + int64_t *inter_cost_b, + int_mv mv_b[][INTER_REFS_PER_FRAME], int *stride) { + if (!cpi->oxcf.enable_tpl_model) return 0; + if (cpi->superres_mode != SUPERRES_NONE) return 0; + 
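// TPL stats are also unavailable for key frames and for overlay updates, + // so those frame types bail out below as well. + 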
if (cpi->common.current_frame.frame_type == KEY_FRAME) return 0; + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + if (update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE) + return 0; + assert(IMPLIES(cpi->gf_group.size > 0, + cpi->gf_group.index < cpi->gf_group.size)); + + AV1_COMMON *const cm = &cpi->common; + const int gf_group_index = cpi->gf_group.index; + TplParams *const tpl_data = &cpi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + if (tpl_frame->is_valid == 0) return 0; + if (gf_group_index >= MAX_LAG_BUFFERS) return 0; + + int mi_count = 0; + int count = 0; + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); + // mi_cols_sr is mi_cols in the superres case. + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + // The TPL store unit size is not the same as the motion estimation unit + // size. Here we always use the motion estimation size to avoid picking up + // repeated inter/intra costs. + const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D); + const int step = mi_size_wide[tpl_bsize]; + assert(mi_size_wide[tpl_bsize] == mi_size_high[tpl_bsize]); + + // Stride is only based on the SB size, and we fill in values for every + // 16x16 block in an SB. + *stride = (mi_col_end_sr - mi_col_sr) / step; + + for (int row = mi_row; row < mi_row + mi_high; row += step) { + for (int col = mi_col_sr; col < mi_col_end_sr; col += step) { + // Handle partial SBs, so that no invalid values are used later.
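+ // Out-of-frame positions are filled with INT64_MAX costs and INVALID_MV + // markers below so they are never mistaken for real stats.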
+ if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) { + inter_cost_b[count] = INT64_MAX; + intra_cost_b[count] = INT64_MAX; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + mv_b[count][i].as_int = INVALID_MV; + } + count++; + continue; + } + + TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + inter_cost_b[count] = this_stats->inter_cost; + intra_cost_b[count] = this_stats->intra_cost; + memcpy(mv_b[count], this_stats->mv, sizeof(this_stats->mv)); + mi_count++; + count++; + } + } + + return mi_count; +} + +// analysis_type 0: Use mc_dep_cost and intra_cost +// analysis_type 1: Use count of best inter predictor chosen +// analysis_type 2: Use cost reduction from intra to inter for best inter +// predictor chosen +static int get_q_for_deltaq_objective(AV1_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + assert(IMPLIES(cpi->gf_group.size > 0, + cpi->gf_group.index < cpi->gf_group.size)); + const int tpl_idx = cpi->gf_group.index; + TplParams *const tpl_data = &cpi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + int tpl_stride = tpl_frame->stride; + int64_t intra_cost = 0; + int64_t mc_dep_cost = 0; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + const int base_qindex = cm->quant_params.base_qindex; + + if (tpl_frame->is_valid == 0) return base_qindex; + + if (!is_frame_tpl_eligible(cpi)) return base_qindex; + + if (cpi->gf_group.index >= MAX_LAG_BUFFERS) return base_qindex; + + int64_t mc_count = 0, mc_saved = 0; + int mi_count = 0; + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + const int step = 1 << block_mis_log2; + for (int row = mi_row; row < mi_row + mi_high; row += step) { + for (int col = mi_col_sr; col < mi_col_end_sr; col += step) { + if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue; + TplDepStats *this_stats = + &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)]; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + intra_cost += this_stats->recrf_dist << RDDIV_BITS; + mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta; + mc_count += this_stats->mc_count; + mc_saved += this_stats->mc_saved; + mi_count++; + } + } + + aom_clear_system_state(); + + int offset = 0; + double beta = 1.0; + if (mc_dep_cost > 0 && intra_cost > 0) { + const double r0 = cpi->rd.r0; + const double rk = (double)intra_cost / mc_dep_cost; + beta = (r0 / rk); + assert(beta > 0.0); + } + offset = av1_get_deltaq_offset(cpi, base_qindex, beta); + aom_clear_system_state(); + + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1); + offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1); + int qindex = cm->quant_params.base_qindex + offset; + qindex = AOMMIN(qindex, MAXQ); + qindex = AOMMAX(qindex, MINQ); + + return qindex; +} + +static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td, + MACROBLOCK *const x, + const TileInfo *const tile_info, + int mi_row, int mi_col, int num_planes) { + AV1_COMMON *const 
cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + assert(delta_q_info->delta_q_present_flag); + + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + // Delta-q modulation based on variance + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size); + + int current_qindex = cm->quant_params.base_qindex; + if (cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL) { + if (DELTA_Q_PERCEPTUAL_MODULATION == 1) { + const int block_wavelet_energy_level = + av1_block_wavelet_energy_level(cpi, x, sb_size); + x->sb_energy_level = block_wavelet_energy_level; + current_qindex = av1_compute_q_from_energy_level_deltaq_mode( + cpi, block_wavelet_energy_level); + } else { + const int block_var_level = av1_log_block_var(cpi, x, sb_size); + x->sb_energy_level = block_var_level; + current_qindex = + av1_compute_q_from_energy_level_deltaq_mode(cpi, block_var_level); + } + } else if (cpi->oxcf.deltaq_mode == DELTA_Q_OBJECTIVE && + cpi->oxcf.enable_tpl_model) { + // Setup deltaq based on tpl stats + current_qindex = get_q_for_deltaq_objective(cpi, sb_size, mi_row, mi_col); + } + + const int delta_q_res = delta_q_info->delta_q_res; + // Right now aq only works with tpl model. So if tpl is disabled, we set the + // current_qindex to base_qindex. + if (cpi->oxcf.enable_tpl_model && cpi->oxcf.deltaq_mode != NO_DELTA_Q) { + current_qindex = + clamp(current_qindex, delta_q_res, 256 - delta_q_info->delta_q_res); + } else { + current_qindex = cm->quant_params.base_qindex; + } + + MACROBLOCKD *const xd = &x->e_mbd; + const int sign_deltaq_index = + current_qindex - xd->current_qindex >= 0 ? 1 : -1; + const int deltaq_deadzone = delta_q_res / 4; + const int qmask = ~(delta_q_res - 1); + int abs_deltaq_index = abs(current_qindex - xd->current_qindex); + abs_deltaq_index = (abs_deltaq_index + deltaq_deadzone) & qmask; + current_qindex = xd->current_qindex + sign_deltaq_index * abs_deltaq_index; + current_qindex = AOMMAX(current_qindex, MINQ + 1); + assert(current_qindex > 0); + + xd->delta_qindex = current_qindex - cm->quant_params.base_qindex; + set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + xd->mi[0]->current_qindex = current_qindex; + av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id); + + // keep track of any non-zero delta-q used + td->deltaq_used |= (xd->delta_qindex != 0); + + if (cpi->oxcf.deltalf_mode) { + const int delta_lf_res = delta_q_info->delta_lf_res; + const int lfmask = ~(delta_lf_res - 1); + const int delta_lf_from_base = + ((xd->delta_qindex / 2 + delta_lf_res / 2) & lfmask); + const int8_t delta_lf = + (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + const int mib_size = cm->seq_params.mib_size; + + // pre-set the delta lf for loop filter. 
Note that this value is set + // before mi is assigned for each block in the current superblock. + for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) { + for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) { + const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k); + mi_params->mi_grid_base[grid_idx]->delta_lf_from_base = delta_lf; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + mi_params->mi_grid_base[grid_idx]->delta_lf[lf_id] = delta_lf; + } + } + } + } +} +#endif // !CONFIG_REALTIME_ONLY + +#define AVG_CDF_WEIGHT_LEFT 3 +#define AVG_CDF_WEIGHT_TOP_RIGHT 1 + +static AOM_INLINE void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left, + aom_cdf_prob *cdf_ptr_tr, int num_cdfs, + int cdf_stride, int nsymbs, int wt_left, + int wt_tr) { + for (int i = 0; i < num_cdfs; i++) { + for (int j = 0; j <= nsymbs; j++) { + cdf_ptr_left[i * cdf_stride + j] = + (aom_cdf_prob)(((int)cdf_ptr_left[i * cdf_stride + j] * wt_left + + (int)cdf_ptr_tr[i * cdf_stride + j] * wt_tr + + ((wt_left + wt_tr) / 2)) / + (wt_left + wt_tr)); + assert(cdf_ptr_left[i * cdf_stride + j] >= 0 && + cdf_ptr_left[i * cdf_stride + j] < CDF_PROB_TOP); + } + } +} + +#define AVERAGE_CDF(cname_left, cname_tr, nsymbs) \ + AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, CDF_SIZE(nsymbs)) + +#define AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, cdf_stride) \ + do { \ + aom_cdf_prob *cdf_ptr_left = (aom_cdf_prob *)cname_left; \ + aom_cdf_prob *cdf_ptr_tr = (aom_cdf_prob *)cname_tr; \ + int array_size = (int)sizeof(cname_left) / sizeof(aom_cdf_prob); \ + int num_cdfs = array_size / cdf_stride; \ + avg_cdf_symbol(cdf_ptr_left, cdf_ptr_tr, num_cdfs, cdf_stride, nsymbs, \ + wt_left, wt_tr); \ + } while (0) + +static AOM_INLINE void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr, + int wt_left, int wt_tr) { + AVERAGE_CDF(nmv_left->joints_cdf, nmv_tr->joints_cdf, 4); + for (int i = 0; i < 2; i++) { + AVERAGE_CDF(nmv_left->comps[i].classes_cdf, nmv_tr->comps[i].classes_cdf, + MV_CLASSES); + AVERAGE_CDF(nmv_left->comps[i].class0_fp_cdf, + nmv_tr->comps[i].class0_fp_cdf, MV_FP_SIZE); + AVERAGE_CDF(nmv_left->comps[i].fp_cdf, nmv_tr->comps[i].fp_cdf, MV_FP_SIZE); + AVERAGE_CDF(nmv_left->comps[i].sign_cdf, nmv_tr->comps[i].sign_cdf, 2); + AVERAGE_CDF(nmv_left->comps[i].class0_hp_cdf, + nmv_tr->comps[i].class0_hp_cdf, 2); + AVERAGE_CDF(nmv_left->comps[i].hp_cdf, nmv_tr->comps[i].hp_cdf, 2); + AVERAGE_CDF(nmv_left->comps[i].class0_cdf, nmv_tr->comps[i].class0_cdf, + CLASS0_SIZE); + AVERAGE_CDF(nmv_left->comps[i].bits_cdf, nmv_tr->comps[i].bits_cdf, 2); + } +} + +// In case of row-based multi-threading of the encoder, since we always +// keep a top-right sync, we can average the top-right SB's CDFs and +// the left SB's CDFs and use the result for the current SB's encoding to +// improve performance. This function performs the averaging of the CDFs +// and is used only when row-mt is enabled in the encoder.
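+// For example, with the weights defined above (AVG_CDF_WEIGHT_LEFT = 3, +// AVG_CDF_WEIGHT_TOP_RIGHT = 1), avg_cdf_symbol() maps each CDF entry to +// (3 * left + tr + 2) / 4, a rounded weighted mean biased toward the left SB.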
+static AOM_INLINE void avg_cdf_symbols(FRAME_CONTEXT *ctx_left, + FRAME_CONTEXT *ctx_tr, int wt_left, + int wt_tr) { + AVERAGE_CDF(ctx_left->txb_skip_cdf, ctx_tr->txb_skip_cdf, 2); + AVERAGE_CDF(ctx_left->eob_extra_cdf, ctx_tr->eob_extra_cdf, 2); + AVERAGE_CDF(ctx_left->dc_sign_cdf, ctx_tr->dc_sign_cdf, 2); + AVERAGE_CDF(ctx_left->eob_flag_cdf16, ctx_tr->eob_flag_cdf16, 5); + AVERAGE_CDF(ctx_left->eob_flag_cdf32, ctx_tr->eob_flag_cdf32, 6); + AVERAGE_CDF(ctx_left->eob_flag_cdf64, ctx_tr->eob_flag_cdf64, 7); + AVERAGE_CDF(ctx_left->eob_flag_cdf128, ctx_tr->eob_flag_cdf128, 8); + AVERAGE_CDF(ctx_left->eob_flag_cdf256, ctx_tr->eob_flag_cdf256, 9); + AVERAGE_CDF(ctx_left->eob_flag_cdf512, ctx_tr->eob_flag_cdf512, 10); + AVERAGE_CDF(ctx_left->eob_flag_cdf1024, ctx_tr->eob_flag_cdf1024, 11); + AVERAGE_CDF(ctx_left->coeff_base_eob_cdf, ctx_tr->coeff_base_eob_cdf, 3); + AVERAGE_CDF(ctx_left->coeff_base_cdf, ctx_tr->coeff_base_cdf, 4); + AVERAGE_CDF(ctx_left->coeff_br_cdf, ctx_tr->coeff_br_cdf, BR_CDF_SIZE); + AVERAGE_CDF(ctx_left->newmv_cdf, ctx_tr->newmv_cdf, 2); + AVERAGE_CDF(ctx_left->zeromv_cdf, ctx_tr->zeromv_cdf, 2); + AVERAGE_CDF(ctx_left->refmv_cdf, ctx_tr->refmv_cdf, 2); + AVERAGE_CDF(ctx_left->drl_cdf, ctx_tr->drl_cdf, 2); + AVERAGE_CDF(ctx_left->inter_compound_mode_cdf, + ctx_tr->inter_compound_mode_cdf, INTER_COMPOUND_MODES); + AVERAGE_CDF(ctx_left->compound_type_cdf, ctx_tr->compound_type_cdf, + MASKED_COMPOUND_TYPES); + AVERAGE_CDF(ctx_left->wedge_idx_cdf, ctx_tr->wedge_idx_cdf, 16); + AVERAGE_CDF(ctx_left->interintra_cdf, ctx_tr->interintra_cdf, 2); + AVERAGE_CDF(ctx_left->wedge_interintra_cdf, ctx_tr->wedge_interintra_cdf, 2); + AVERAGE_CDF(ctx_left->interintra_mode_cdf, ctx_tr->interintra_mode_cdf, + INTERINTRA_MODES); + AVERAGE_CDF(ctx_left->motion_mode_cdf, ctx_tr->motion_mode_cdf, MOTION_MODES); + AVERAGE_CDF(ctx_left->obmc_cdf, ctx_tr->obmc_cdf, 2); + AVERAGE_CDF(ctx_left->palette_y_size_cdf, ctx_tr->palette_y_size_cdf, + PALETTE_SIZES); + AVERAGE_CDF(ctx_left->palette_uv_size_cdf, ctx_tr->palette_uv_size_cdf, + PALETTE_SIZES); + for (int j = 0; j < PALETTE_SIZES; j++) { + int nsymbs = j + PALETTE_MIN_SIZE; + AVG_CDF_STRIDE(ctx_left->palette_y_color_index_cdf[j], + ctx_tr->palette_y_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + AVG_CDF_STRIDE(ctx_left->palette_uv_color_index_cdf[j], + ctx_tr->palette_uv_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + } + AVERAGE_CDF(ctx_left->palette_y_mode_cdf, ctx_tr->palette_y_mode_cdf, 2); + AVERAGE_CDF(ctx_left->palette_uv_mode_cdf, ctx_tr->palette_uv_mode_cdf, 2); + AVERAGE_CDF(ctx_left->comp_inter_cdf, ctx_tr->comp_inter_cdf, 2); + AVERAGE_CDF(ctx_left->single_ref_cdf, ctx_tr->single_ref_cdf, 2); + AVERAGE_CDF(ctx_left->comp_ref_type_cdf, ctx_tr->comp_ref_type_cdf, 2); + AVERAGE_CDF(ctx_left->uni_comp_ref_cdf, ctx_tr->uni_comp_ref_cdf, 2); + AVERAGE_CDF(ctx_left->comp_ref_cdf, ctx_tr->comp_ref_cdf, 2); + AVERAGE_CDF(ctx_left->comp_bwdref_cdf, ctx_tr->comp_bwdref_cdf, 2); + AVERAGE_CDF(ctx_left->txfm_partition_cdf, ctx_tr->txfm_partition_cdf, 2); + AVERAGE_CDF(ctx_left->compound_index_cdf, ctx_tr->compound_index_cdf, 2); + AVERAGE_CDF(ctx_left->comp_group_idx_cdf, ctx_tr->comp_group_idx_cdf, 2); + AVERAGE_CDF(ctx_left->skip_mode_cdfs, ctx_tr->skip_mode_cdfs, 2); + AVERAGE_CDF(ctx_left->skip_cdfs, ctx_tr->skip_cdfs, 2); + AVERAGE_CDF(ctx_left->intra_inter_cdf, ctx_tr->intra_inter_cdf, 2); + avg_nmv(&ctx_left->nmvc, &ctx_tr->nmvc, wt_left, wt_tr); + avg_nmv(&ctx_left->ndvc, &ctx_tr->ndvc, wt_left, wt_tr); + 
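// nmvc above holds the regular motion vector CDFs and ndvc the intra + // block copy DV CDFs; the intra-BC enable flag CDF is averaged next. + 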
AVERAGE_CDF(ctx_left->intrabc_cdf, ctx_tr->intrabc_cdf, 2); + AVERAGE_CDF(ctx_left->seg.tree_cdf, ctx_tr->seg.tree_cdf, MAX_SEGMENTS); + AVERAGE_CDF(ctx_left->seg.pred_cdf, ctx_tr->seg.pred_cdf, 2); + AVERAGE_CDF(ctx_left->seg.spatial_pred_seg_cdf, + ctx_tr->seg.spatial_pred_seg_cdf, MAX_SEGMENTS); + AVERAGE_CDF(ctx_left->filter_intra_cdfs, ctx_tr->filter_intra_cdfs, 2); + AVERAGE_CDF(ctx_left->filter_intra_mode_cdf, ctx_tr->filter_intra_mode_cdf, + FILTER_INTRA_MODES); + AVERAGE_CDF(ctx_left->switchable_restore_cdf, ctx_tr->switchable_restore_cdf, + RESTORE_SWITCHABLE_TYPES); + AVERAGE_CDF(ctx_left->wiener_restore_cdf, ctx_tr->wiener_restore_cdf, 2); + AVERAGE_CDF(ctx_left->sgrproj_restore_cdf, ctx_tr->sgrproj_restore_cdf, 2); + AVERAGE_CDF(ctx_left->y_mode_cdf, ctx_tr->y_mode_cdf, INTRA_MODES); + AVG_CDF_STRIDE(ctx_left->uv_mode_cdf[0], ctx_tr->uv_mode_cdf[0], + UV_INTRA_MODES - 1, CDF_SIZE(UV_INTRA_MODES)); + AVERAGE_CDF(ctx_left->uv_mode_cdf[1], ctx_tr->uv_mode_cdf[1], UV_INTRA_MODES); + for (int i = 0; i < PARTITION_CONTEXTS; i++) { + if (i < 4) { + AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 4, + CDF_SIZE(10)); + } else if (i < 16) { + AVERAGE_CDF(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 10); + } else { + AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 8, + CDF_SIZE(10)); + } + } + AVERAGE_CDF(ctx_left->switchable_interp_cdf, ctx_tr->switchable_interp_cdf, + SWITCHABLE_FILTERS); + AVERAGE_CDF(ctx_left->kf_y_cdf, ctx_tr->kf_y_cdf, INTRA_MODES); + AVERAGE_CDF(ctx_left->angle_delta_cdf, ctx_tr->angle_delta_cdf, + 2 * MAX_ANGLE_DELTA + 1); + AVG_CDF_STRIDE(ctx_left->tx_size_cdf[0], ctx_tr->tx_size_cdf[0], MAX_TX_DEPTH, + CDF_SIZE(MAX_TX_DEPTH + 1)); + AVERAGE_CDF(ctx_left->tx_size_cdf[1], ctx_tr->tx_size_cdf[1], + MAX_TX_DEPTH + 1); + AVERAGE_CDF(ctx_left->tx_size_cdf[2], ctx_tr->tx_size_cdf[2], + MAX_TX_DEPTH + 1); + AVERAGE_CDF(ctx_left->tx_size_cdf[3], ctx_tr->tx_size_cdf[3], + MAX_TX_DEPTH + 1); + AVERAGE_CDF(ctx_left->delta_q_cdf, ctx_tr->delta_q_cdf, DELTA_Q_PROBS + 1); + AVERAGE_CDF(ctx_left->delta_lf_cdf, ctx_tr->delta_lf_cdf, DELTA_LF_PROBS + 1); + for (int i = 0; i < FRAME_LF_COUNT; i++) { + AVERAGE_CDF(ctx_left->delta_lf_multi_cdf[i], ctx_tr->delta_lf_multi_cdf[i], + DELTA_LF_PROBS + 1); + } + AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[1], ctx_tr->intra_ext_tx_cdf[1], 7, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[2], ctx_tr->intra_ext_tx_cdf[2], 5, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[1], ctx_tr->inter_ext_tx_cdf[1], 16, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[2], ctx_tr->inter_ext_tx_cdf[2], 12, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[3], ctx_tr->inter_ext_tx_cdf[3], 2, + CDF_SIZE(TX_TYPES)); + AVERAGE_CDF(ctx_left->cfl_sign_cdf, ctx_tr->cfl_sign_cdf, CFL_JOINT_SIGNS); + AVERAGE_CDF(ctx_left->cfl_alpha_cdf, ctx_tr->cfl_alpha_cdf, + CFL_ALPHABET_SIZE); +} + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x, + int mi_row, int mi_col) { + const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size; + const int orig_rdmult = cpi->rd.RDMULT; + + assert(IMPLIES(cpi->gf_group.size > 0, + cpi->gf_group.index < cpi->gf_group.size)); + const int gf_group_index = cpi->gf_group.index; + if (cpi->oxcf.enable_tpl_model && cpi->oxcf.aq_mode == NO_AQ && + cpi->oxcf.deltaq_mode == NO_DELTA_Q && gf_group_index > 0 && + cpi->gf_group.update_type[gf_group_index] == ARF_UPDATE) { + 
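// Analysis type 0 (the third argument below) derives the adjusted rdmult + // from the TPL ratio r0 / rk in get_rdmult_delta(). + 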
const int dr = + get_rdmult_delta(cpi, sb_size, 0, mi_row, mi_col, orig_rdmult); + x->rdmult = dr; + } +} +#endif + +static void source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int shift) { + unsigned int tmp_sse; + unsigned int tmp_variance; + const BLOCK_SIZE bsize = BLOCK_64X64; + uint8_t *src_y = cpi->source->y_buffer; + int src_ystride = cpi->source->y_stride; + uint8_t *last_src_y = cpi->last_source->y_buffer; + int last_src_ystride = cpi->last_source->y_stride; + uint64_t avg_source_sse_threshold = 100000; // ~5*5*(64*64) + uint64_t avg_source_sse_threshold_high = 1000000; // ~15*15*(64*64) + uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / (64*64)) ~1.5 +#if CONFIG_AV1_HIGHBITDEPTH + MACROBLOCKD *xd = &x->e_mbd; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) return; +#endif + src_y += shift; + last_src_y += shift; + tmp_variance = cpi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y, + last_src_ystride, &tmp_sse); + // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12) + // Detect large lighting change. + if (tmp_variance < (tmp_sse >> 1) && (tmp_sse - tmp_variance) > sum_sq_thresh) + x->content_state_sb = kLowVarHighSumdiff; + else if (tmp_sse < avg_source_sse_threshold) + x->content_state_sb = kLowSad; + else if (tmp_sse > avg_source_sse_threshold_high) + x->content_state_sb = kHighSad; +} + +static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, + PC_TREE *const pc_root, TOKENEXTRA **tp, + const int mi_row, const int mi_col, + const int seg_skip) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + const SPEED_FEATURES *const sf = &cpi->sf; + const TileInfo *const tile_info = &tile_data->tile_info; + MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + if (sf->rt_sf.source_metrics_sb_nonrd && sb_size == BLOCK_64X64 && + cpi->svc.number_spatial_layers <= 1 && + cm->current_frame.frame_type != KEY_FRAME) { + int shift = cpi->source->y_stride * (mi_row << 2) + (mi_col << 2); + source_content_sb(cpi, x, shift); + } + if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) { + set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + const BLOCK_SIZE bsize = + seg_skip ? 
sb_size : sf->part_sf.always_this_block_size; + set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + } else if (cpi->partition_search_skippable_frame) { + set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + const BLOCK_SIZE bsize = + get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col); + set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + } else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) { + set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, sb_size); + av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col); + } + assert(sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip || + cpi->partition_search_skippable_frame || + sf->part_sf.partition_search_type == VAR_BASED_PARTITION); + td->mb.cb_offset = 0; + nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, + pc_root); +} + +// Memset the mbmis in the current superblock to 0 +static INLINE void reset_mbmi(CommonModeInfoParams *const mi_params, + BLOCK_SIZE sb_size, int mi_row, int mi_col) { + // size of the sb in units of mi (BLOCK_4X4) + const int sb_size_mi = mi_size_wide[sb_size]; + const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + // size of the sb in units of the allocated mi size + const int sb_size_alloc_mi = mi_size_wide[sb_size] / mi_alloc_size_1d; + assert(mi_params->mi_alloc_stride % sb_size_alloc_mi == 0 && + "mi is not allocated as a multiple of sb!"); + assert(mi_params->mi_stride % sb_size_mi == 0 && + "mi_grid_base is not allocated as a multiple of sb!"); + + const int mi_rows = mi_size_high[sb_size]; + for (int cur_mi_row = 0; cur_mi_row < mi_rows; cur_mi_row++) { + assert(get_mi_grid_idx(mi_params, 0, mi_col + mi_alloc_size_1d) < + mi_params->mi_stride); + const int mi_grid_idx = + get_mi_grid_idx(mi_params, mi_row + cur_mi_row, mi_col); + const int alloc_mi_idx = + get_alloc_mi_idx(mi_params, mi_row + cur_mi_row, mi_col); + memset(&mi_params->mi_grid_base[mi_grid_idx], 0, + sb_size_mi * sizeof(*mi_params->mi_grid_base)); + memset(&mi_params->tx_type_map[mi_grid_idx], 0, + sb_size_mi * sizeof(*mi_params->tx_type_map)); + if (cur_mi_row % mi_alloc_size_1d == 0) { + memset(&mi_params->mi_alloc[alloc_mi_idx], 0, + sb_size_alloc_mi * sizeof(*mi_params->mi_alloc)); + } + } +} + +static INLINE void backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, + const AV1_COMP *cpi, ThreadData *td, + const TileDataEnc *tile_data, int mi_row, + int mi_col) { + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + const TileInfo *tile_info = &tile_data->tile_info; + + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + save_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes); + + sb_fp_stats->rd_count = cpi->td.rd_counts; + sb_fp_stats->split_count = cpi->td.mb.txb_split_count; + + sb_fp_stats->fc = *td->counts; + + memcpy(sb_fp_stats->inter_mode_rd_models, tile_data->inter_mode_rd_models, + sizeof(sb_fp_stats->inter_mode_rd_models)); + + memcpy(sb_fp_stats->thresh_freq_fact, x->thresh_freq_fact, + sizeof(sb_fp_stats->thresh_freq_fact)); + + const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col); + sb_fp_stats->current_qindex = + cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex; + +#if CONFIG_INTERNAL_STATS + 
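// Also snapshot the mode decision counters so a dry pass does not + // distort the internal statistics. + 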
memcpy(sb_fp_stats->mode_chosen_counts, cpi->mode_chosen_counts, + sizeof(sb_fp_stats->mode_chosen_counts)); +#endif // CONFIG_INTERNAL_STATS +} + +static INLINE void restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, + AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, int mi_row, + int mi_col) { + MACROBLOCK *x = &td->mb; + + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + + restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes); + + cpi->td.rd_counts = sb_fp_stats->rd_count; + cpi->td.mb.txb_split_count = sb_fp_stats->split_count; + + *td->counts = sb_fp_stats->fc; + + memcpy(tile_data->inter_mode_rd_models, sb_fp_stats->inter_mode_rd_models, + sizeof(sb_fp_stats->inter_mode_rd_models)); + memcpy(x->thresh_freq_fact, sb_fp_stats->thresh_freq_fact, + sizeof(sb_fp_stats->thresh_freq_fact)); + + const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col); + cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex = + sb_fp_stats->current_qindex; + +#if CONFIG_INTERNAL_STATS + memcpy(cpi->mode_chosen_counts, sb_fp_stats->mode_chosen_counts, + sizeof(sb_fp_stats->mode_chosen_counts)); +#endif // CONFIG_INTERNAL_STATS +} + +#if !CONFIG_REALTIME_ONLY +static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row, + int mi_col) { + const AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCK *x = &td->mb; + const int frame_idx = cpi->gf_group.index; + TplParams *const tpl_data = &cpi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + + av1_zero(x->search_ref_frame); + + if (tpl_frame->is_valid == 0) return; + if (!is_frame_tpl_eligible(cpi)) return; + if (frame_idx >= MAX_LAG_BUFFERS) return; + if (cpi->superres_mode != SUPERRES_NONE) return; + if (cpi->oxcf.aq_mode != NO_AQ) return; + + const int is_overlay = cpi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE; + if (is_overlay) { + memset(x->search_ref_frame, 1, sizeof(x->search_ref_frame)); + return; + } + + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const int tpl_stride = tpl_frame->stride; + int64_t inter_cost[INTER_REFS_PER_FRAME] = { 0 }; + const int step = 1 << block_mis_log2; + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const int mi_row_end = + AOMMIN(mi_size_high[sb_size] + mi_row, mi_params->mi_rows); + const int mi_col_end = + AOMMIN(mi_size_wide[sb_size] + mi_col, mi_params->mi_cols); + + for (int row = mi_row; row < mi_row_end; row += step) { + for (int col = mi_col; col < mi_col_end; col += step) { + const TplDepStats *this_stats = + &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)]; + int64_t tpl_pred_error[INTER_REFS_PER_FRAME] = { 0 }; + // Find the winner ref frame idx for the current block + int64_t best_inter_cost = this_stats->pred_error[0]; + int best_rf_idx = 0; + for (int idx = 1; idx < INTER_REFS_PER_FRAME; ++idx) { + if ((this_stats->pred_error[idx] < best_inter_cost) && + (this_stats->pred_error[idx] != 0)) { + best_inter_cost = this_stats->pred_error[idx]; + best_rf_idx = idx; + } + } + // tpl_pred_error is the pred_error reduction of best_ref w.r.t. + // LAST_FRAME. 
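+ // Since best_rf_idx has the smallest pred_error, the difference computed + // below is non-positive; more negative totals rank a reference higher.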
+ tpl_pred_error[best_rf_idx] = this_stats->pred_error[best_rf_idx] - + this_stats->pred_error[LAST_FRAME - 1]; + + for (int rf_idx = 1; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx) + inter_cost[rf_idx] += tpl_pred_error[rf_idx]; + } + } + + int rank_index[INTER_REFS_PER_FRAME - 1]; + for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) { + rank_index[idx] = idx + 1; + for (int i = idx; i > 0; --i) { + if (inter_cost[rank_index[i - 1]] > inter_cost[rank_index[i]]) { + const int tmp = rank_index[i - 1]; + rank_index[i - 1] = rank_index[i]; + rank_index[i] = tmp; + } + } + } + + x->search_ref_frame[INTRA_FRAME] = 1; + x->search_ref_frame[LAST_FRAME] = 1; + + int cutoff_ref = 0; + for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) { + x->search_ref_frame[rank_index[idx] + LAST_FRAME] = 1; + if (idx > 2) { + if (!cutoff_ref) { + // If the predictive coding gain is smaller than that of the previous, + // more relevant frame by a certain amount, discard this frame and all + // the frames after it. + if (llabs(inter_cost[rank_index[idx]]) < + llabs(inter_cost[rank_index[idx - 1]]) / 8 || + inter_cost[rank_index[idx]] == 0) + cutoff_ref = 1; + } + + if (cutoff_ref) x->search_ref_frame[rank_index[idx] + LAST_FRAME] = 0; + } + } +} +#endif // !CONFIG_REALTIME_ONLY + +// This function initializes the stats for encode_rd_sb. +static INLINE void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td, + const TileDataEnc *tile_data, + PC_TREE *pc_root, RD_STATS *rd_cost, + int mi_row, int mi_col, + int gather_tpl_data) { + const AV1_COMMON *cm = &cpi->common; + const TileInfo *tile_info = &tile_data->tile_info; + MACROBLOCK *x = &td->mb; + + const SPEED_FEATURES *sf = &cpi->sf; + const int use_simple_motion_search = + (sf->part_sf.simple_motion_search_split || + sf->part_sf.simple_motion_search_prune_rect || + sf->part_sf.simple_motion_search_early_term_none || + sf->part_sf.ml_early_term_after_part_split_level) && + !frame_is_intra_only(cm); + if (use_simple_motion_search) { + init_simple_motion_search_mvs(pc_root); + } + +#if !CONFIG_REALTIME_ONLY + init_ref_frame_space(cpi, td, mi_row, mi_col); + x->sb_energy_level = 0; + x->cnn_output_valid = 0; + if (gather_tpl_data) { + if (cm->delta_q_info.delta_q_present_flag) { + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + setup_delta_q(cpi, td, x, tile_info, mi_row, mi_col, num_planes); + av1_tpl_rdmult_setup_sb(cpi, x, sb_size, mi_row, mi_col); + } + if (cpi->oxcf.enable_tpl_model) { + adjust_rdmult_tpl_model(cpi, x, mi_row, mi_col); + } + } +#else + (void)tile_info; + (void)mi_row; + (void)mi_col; + (void)gather_tpl_data; +#endif + + // Reset hash state for transform/mode rd hash information + reset_hash_records(x, cpi->sf.tx_sf.use_inter_txb_hash); + av1_zero(x->picked_ref_frames_mask); + av1_zero(x->pred_mv); + av1_invalid_rd_stats(rd_cost); +} + +static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, + PC_TREE *const pc_root, TOKENEXTRA **tp, + const int mi_row, const int mi_col, + const int seg_skip) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + const SPEED_FEATURES *const sf = &cpi->sf; + const TileInfo *const tile_info = &tile_data->tile_info; + MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + int dummy_rate; + int64_t dummy_dist; + RD_STATS dummy_rdc; + +#if CONFIG_REALTIME_ONLY + (void)seg_skip; +#endif // CONFIG_REALTIME_ONLY + + 
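// The trailing 1 below asks init_encode_rd_sb() to gather TPL data for + // this superblock (delta-q setup and rdmult adjustment, where enabled). + 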
init_encode_rd_sb(cpi, td, tile_data, pc_root, &dummy_rdc, mi_row, mi_col, 1); + + if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) { + set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, sb_size); + av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col); + rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, + &dummy_rate, &dummy_dist, 1, pc_root); + } +#if !CONFIG_REALTIME_ONLY + else if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) { + set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + const BLOCK_SIZE bsize = + seg_skip ? sb_size : sf->part_sf.always_this_block_size; + set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, + &dummy_rate, &dummy_dist, 1, pc_root); + } else if (cpi->partition_search_skippable_frame) { + set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + const BLOCK_SIZE bsize = + get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col); + set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, + &dummy_rate, &dummy_dist, 1, pc_root); + } else { + // No stats for overlay frames. Exclude key frame. + x->valid_cost_b = + get_tpl_stats_b(cpi, sb_size, mi_row, mi_col, x->intra_cost_b, + x->inter_cost_b, x->mv_b, &x->cost_stride); + + reset_partition(pc_root, sb_size); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_partition_time); +#endif + BLOCK_SIZE max_sq_size = x->max_partition_size; + BLOCK_SIZE min_sq_size = x->min_partition_size; + + if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) { + float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f }; + + av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features); + max_sq_size = AOMMAX( + AOMMIN(av1_predict_max_partition(cpi, x, features), max_sq_size), + min_sq_size); + } + + const int num_passes = cpi->oxcf.sb_multipass_unit_test ? 2 : 1; + + if (num_passes == 1) { + rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, + max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc, + pc_root, NULL, SB_SINGLE_PASS, NULL); + } else { + // First pass + SB_FIRST_PASS_STATS sb_fp_stats; + backup_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col); + rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, + max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc, + pc_root, NULL, SB_DRY_PASS, NULL); + + // Second pass + init_encode_rd_sb(cpi, td, tile_data, pc_root, &dummy_rdc, mi_row, mi_col, + 0); + reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col); + reset_partition(pc_root, sb_size); + + restore_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col); + + rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, + max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc, + pc_root, NULL, SB_WET_PASS, NULL); + } + // Reset to 0 so that it wouldn't be used elsewhere mistakenly. + x->valid_cost_b = 0; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_partition_time); +#endif + } +#endif // !CONFIG_REALTIME_ONLY + + // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile. 
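+ // Until then, the data fit below runs only when the frame has exactly one + // tile.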
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 && + cm->tiles.cols == 1 && cm->tiles.rows == 1) { + av1_inter_mode_data_fit(tile_data, x->rdmult); + } +} + +static AOM_INLINE void set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, + const TileInfo *const tile_info, + const int mi_row, const int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + + switch (cpi->oxcf.coeff_cost_upd_freq) { + case COST_UPD_TILE: // Tile level + if (mi_row != tile_info->mi_row_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SBROW: // SB row level in tile + if (mi_col != tile_info->mi_col_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SB: // SB level + if (cpi->sf.inter_sf.disable_sb_level_coeff_cost_upd && + mi_col != tile_info->mi_col_start) + break; + av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes); + break; + default: assert(0); + } + + switch (cpi->oxcf.mode_cost_upd_freq) { + case COST_UPD_TILE: // Tile level + if (mi_row != tile_info->mi_row_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SBROW: // SB row level in tile + if (mi_col != tile_info->mi_col_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SB: // SB level + av1_fill_mode_rates(cm, x, xd->tile_ctx); + break; + default: assert(0); + } + switch (cpi->oxcf.mv_cost_upd_freq) { + case COST_UPD_OFF: break; + case COST_UPD_TILE: // Tile level + if (mi_row != tile_info->mi_row_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SBROW: // SB row level in tile + if (mi_col != tile_info->mi_col_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SB: // SB level + if (cpi->sf.inter_sf.disable_sb_level_mv_cost_upd && + mi_col != tile_info->mi_col_start) + break; + av1_fill_mv_costs(xd->tile_ctx, cm->features.cur_frame_force_integer_mv, + cm->features.allow_high_precision_mv, x); + break; + default: assert(0); + } +} + +static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, int mi_row, + TOKENEXTRA **tp) { + AV1_COMMON *const cm = &cpi->common; + const TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_data->tile_info); + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const int mib_size = cm->seq_params.mib_size; + const int mib_size_log2 = cm->seq_params.mib_size_log2; + const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2; + const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_sb_time); +#endif + + // Initialize the left context for the new SB row + av1_zero_left_context(xd); + + // Reset delta for every tile + if (mi_row == tile_info->mi_row_start || cpi->row_mt) { + if (cm->delta_q_info.delta_q_present_flag) + xd->current_qindex = cm->quant_params.base_qindex; + if (cm->delta_q_info.delta_lf_present_flag) { + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); + } + } + reset_thresh_freq_fact(x); + + // Code each SB in the row + for (int mi_col = tile_info->mi_col_start, sb_col_in_tile = 0; + mi_col < tile_info->mi_col_end; mi_col += mib_size, sb_col_in_tile++) { + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile); + if (tile_data->allow_update_cdf && (cpi->row_mt == 1) && + (tile_info->mi_row_start != mi_row)) { + if ((tile_info->mi_col_start == mi_col)) { + // restore frame 
context of 1st column sb + memcpy(xd->tile_ctx, x->row_ctx, sizeof(*xd->tile_ctx)); + } else { + int wt_left = AVG_CDF_WEIGHT_LEFT; + int wt_tr = AVG_CDF_WEIGHT_TOP_RIGHT; + if (tile_info->mi_col_end > (mi_col + mib_size)) + avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile, wt_left, + wt_tr); + else + avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile - 1, + wt_left, wt_tr); + } + } + + set_cost_upd_freq(cpi, td, tile_info, mi_row, mi_col); + + x->color_sensitivity[0] = 0; + x->color_sensitivity[1] = 0; + x->content_state_sb = 0; + + PC_TREE *const pc_root = td->pc_root; + pc_root->index = 0; + + xd->cur_frame_force_integer_mv = cm->features.cur_frame_force_integer_mv; + td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col); + x->source_variance = UINT_MAX; + x->simple_motion_pred_sse = UINT_MAX; + + const struct segmentation *const seg = &cm->seg; + int seg_skip = 0; + if (seg->enabled) { + const uint8_t *const map = + seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map; + const int segment_id = + map ? get_segment_id(&cm->mi_params, map, sb_size, mi_row, mi_col) + : 0; + seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); + } + + if (use_nonrd_mode) { + encode_nonrd_sb(cpi, td, tile_data, pc_root, tp, mi_row, mi_col, + seg_skip); + } else { + encode_rd_sb(cpi, td, tile_data, pc_root, tp, mi_row, mi_col, seg_skip); + } + + if (tile_data->allow_update_cdf && (cpi->row_mt == 1) && + (tile_info->mi_row_end > (mi_row + mib_size))) { + if (sb_cols_in_tile == 1) + memcpy(x->row_ctx, xd->tile_ctx, sizeof(*xd->tile_ctx)); + else if (sb_col_in_tile >= 1) + memcpy(x->row_ctx + sb_col_in_tile - 1, xd->tile_ctx, + sizeof(*xd->tile_ctx)); + } + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile, sb_cols_in_tile); + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_sb_time); +#endif +} + +static AOM_INLINE void init_encode_frame_mb_context(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + + // Copy data over into macro block data structures. 
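+ // Source planes are attached at mi position (0, 0); the block planes pick + // up the per-plane subsampling from the sequence parameters.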
+ av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, + cm->seq_params.sb_size); + + av1_setup_block_planes(xd, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y, num_planes); +} + +void av1_alloc_tile_data(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + + if (cpi->tile_data != NULL) aom_free(cpi->tile_data); + CHECK_MEM_ERROR( + cm, cpi->tile_data, + aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data))); + + cpi->allocated_tiles = tile_cols * tile_rows; +} + +void av1_init_tile_data(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int tile_col, tile_row; + TOKENEXTRA *pre_tok = cpi->tile_tok[0][0]; + TOKENLIST *tplist = cpi->tplist[0][0]; + unsigned int tile_tok = 0; + int tplist_count = 0; + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *const tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileInfo *const tile_info = &tile_data->tile_info; + av1_tile_init(tile_info, cm, tile_row, tile_col); + + cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; + pre_tok = cpi->tile_tok[tile_row][tile_col]; + tile_tok = allocated_tokens( + *tile_info, cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes); + cpi->tplist[tile_row][tile_col] = tplist + tplist_count; + tplist = cpi->tplist[tile_row][tile_col]; + tplist_count = av1_get_sb_rows_in_tile(cm, tile_data->tile_info); + tile_data->allow_update_cdf = !cm->tiles.large_scale; + tile_data->allow_update_cdf = + tile_data->allow_update_cdf && !cm->features.disable_cdf_update; + tile_data->tctx = *cm->fc; + } + } +} + +void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row, + int tile_col, int mi_row) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const int tile_cols = cm->tiles.cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + TOKENEXTRA *tok = NULL; + const int sb_row_in_tile = + (mi_row - tile_info->mi_row_start) >> cm->seq_params.mib_size_log2; + const int tile_mb_cols = + (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2; + const int num_mb_rows_in_sb = + ((1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4; + + get_start_tok(cpi, tile_row, tile_col, mi_row, &tok, + cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes); + cpi->tplist[tile_row][tile_col][sb_row_in_tile].start = tok; + + encode_sb_row(cpi, td, this_tile, mi_row, &tok); + + cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop = tok; + cpi->tplist[tile_row][tile_col][sb_row_in_tile].count = + (unsigned int)(cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop - + cpi->tplist[tile_row][tile_col][sb_row_in_tile].start); + + assert( + (unsigned int)(tok - + cpi->tplist[tile_row][tile_col][sb_row_in_tile].start) <= + get_token_alloc(num_mb_rows_in_sb, tile_mb_cols, + cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes)); + + (void)tile_mb_cols; + (void)num_mb_rows_in_sb; +} + +void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, + int tile_col) { + AV1_COMMON *const cm = &cpi->common; + TileDataEnc *const this_tile = + &cpi->tile_data[tile_row * cm->tiles.cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + + if 
(!cpi->sf.rt_sf.use_nonrd_pick_mode) av1_inter_mode_data_init(this_tile); + + av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start, + tile_info->mi_col_end, tile_row); + av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, + &td->mb.e_mbd); + + if (cpi->oxcf.enable_cfl_intra) cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params); + + av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator); + + for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; + mi_row += cm->seq_params.mib_size) { + av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row); + } +} + +static AOM_INLINE void encode_tiles(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int tile_col, tile_row; + + if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) + av1_alloc_tile_data(cpi); + + av1_init_tile_data(cpi); + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *const this_tile = + &cpi->tile_data[tile_row * cm->tiles.cols + tile_col]; + cpi->td.intrabc_used = 0; + cpi->td.deltaq_used = 0; + cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; + cpi->td.mb.tile_pb_ctx = &this_tile->tctx; + av1_encode_tile(cpi, &cpi->td, tile_row, tile_col); + cpi->intrabc_used |= cpi->td.intrabc_used; + cpi->deltaq_used |= cpi->td.deltaq_used; + } + } +} + +#define GLOBAL_TRANS_TYPES_ENC 3 // highest motion model to search +static int gm_get_params_cost(const WarpedMotionParams *gm, + const WarpedMotionParams *ref_gm, int allow_hp) { + int params_cost = 0; + int trans_bits, trans_prec_diff; + switch (gm->wmtype) { + case AFFINE: + case ROTZOOM: + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS), + (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF), + (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF)); + if (gm->wmtype >= AFFINE) { + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF), + (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF)); + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + } + AOM_FALLTHROUGH_INTENDED; + case TRANSLATION: + trans_bits = (gm->wmtype == TRANSLATION) + ? GM_ABS_TRANS_ONLY_BITS - !allow_hp + : GM_ABS_TRANS_BITS; + trans_prec_diff = (gm->wmtype == TRANSLATION) + ? 
GM_TRANS_ONLY_PREC_DIFF + !allow_hp + : GM_TRANS_PREC_DIFF; + params_cost += aom_count_signed_primitive_refsubexpfin( + (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_gm->wmmat[0] >> trans_prec_diff), + (gm->wmmat[0] >> trans_prec_diff)); + params_cost += aom_count_signed_primitive_refsubexpfin( + (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_gm->wmmat[1] >> trans_prec_diff), + (gm->wmmat[1] >> trans_prec_diff)); + AOM_FALLTHROUGH_INTENDED; + case IDENTITY: break; + default: assert(0); + } + return (params_cost << AV1_PROB_COST_SHIFT); +} + +static int do_gm_search_logic(SPEED_FEATURES *const sf, int frame) { + (void)frame; + switch (sf->gm_sf.gm_search_type) { + case GM_FULL_SEARCH: return 1; + case GM_REDUCED_REF_SEARCH_SKIP_L2_L3: + return !(frame == LAST2_FRAME || frame == LAST3_FRAME); + case GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2: + return !(frame == LAST2_FRAME || frame == LAST3_FRAME || + (frame == ALTREF2_FRAME)); + case GM_DISABLE_SEARCH: return 0; + default: assert(0); + } + return 1; +} + +// Set the relative distance of a reference frame w.r.t. current frame +static AOM_INLINE void set_rel_frame_dist(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info; + MV_REFERENCE_FRAME ref_frame; + int min_past_dist = INT32_MAX, min_future_dist = INT32_MAX; + cpi->nearest_past_ref = NONE_FRAME; + cpi->nearest_future_ref = NONE_FRAME; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + cpi->ref_relative_dist[ref_frame - LAST_FRAME] = 0; + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { + int dist = av1_encoder_get_relative_dist( + order_hint_info, + cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME], + cm->current_frame.display_order_hint); + cpi->ref_relative_dist[ref_frame - LAST_FRAME] = dist; + // Get the nearest ref_frame in the past + if (abs(dist) < min_past_dist && dist < 0) { + cpi->nearest_past_ref = ref_frame; + min_past_dist = abs(dist); + } + // Get the nearest ref_frame in the future + if (dist < min_future_dist && dist > 0) { + cpi->nearest_future_ref = ref_frame; + min_future_dist = dist; + } + } + } +} + +static INLINE int refs_are_one_sided(const AV1_COMMON *cm) { + assert(!frame_is_intra_only(cm)); + + int one_sided_refs = 1; + for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref); + if (buf == NULL) continue; + + const int ref_display_order_hint = buf->display_order_hint; + if (av1_encoder_get_relative_dist( + &cm->seq_params.order_hint_info, ref_display_order_hint, + (int)cm->current_frame.display_order_hint) > 0) { + one_sided_refs = 0; // bwd reference + break; + } + } + return one_sided_refs; +} + +static INLINE void get_skip_mode_ref_offsets(const AV1_COMMON *cm, + int ref_order_hint[2]) { + const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; + ref_order_hint[0] = ref_order_hint[1] = 0; + if (!skip_mode_info->skip_mode_allowed) return; + + const RefCntBuffer *const buf_0 = + get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_0); + const RefCntBuffer *const buf_1 = + get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_1); + assert(buf_0 != NULL && buf_1 != NULL); + + ref_order_hint[0] = buf_0->order_hint; + ref_order_hint[1] = buf_1->order_hint; +} + +static int check_skip_mode_enabled(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + + av1_setup_skip_mode_allowed(cm); + if 
(!cm->current_frame.skip_mode_info.skip_mode_allowed) return 0; + + // Turn off skip mode if the temporal distances of the reference pair to the + // current frame are different by more than 1 frame. + const int cur_offset = (int)cm->current_frame.order_hint; + int ref_offset[2]; + get_skip_mode_ref_offsets(cm, ref_offset); + const int cur_to_ref0 = get_relative_dist(&cm->seq_params.order_hint_info, + cur_offset, ref_offset[0]); + const int cur_to_ref1 = abs(get_relative_dist(&cm->seq_params.order_hint_info, + cur_offset, ref_offset[1])); + if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0; + + // High Latency: Turn off skip mode if all refs are fwd. + if (cpi->all_one_sided_refs && cpi->oxcf.lag_in_frames > 0) return 0; + + static const int flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + const int ref_frame[2] = { + cm->current_frame.skip_mode_info.ref_frame_idx_0 + LAST_FRAME, + cm->current_frame.skip_mode_info.ref_frame_idx_1 + LAST_FRAME + }; + if (!(cpi->ref_frame_flags & flag_list[ref_frame[0]]) || + !(cpi->ref_frame_flags & flag_list[ref_frame[1]])) + return 0; + + return 1; +} + +// Function to decide if we can skip the global motion parameter computation +// for a particular ref frame +static INLINE int skip_gm_frame(AV1_COMMON *const cm, int ref_frame) { + if ((ref_frame == LAST3_FRAME || ref_frame == LAST2_FRAME) && + cm->global_motion[GOLDEN_FRAME].wmtype != IDENTITY) { + return get_relative_dist( + &cm->seq_params.order_hint_info, + cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME], + cm->cur_frame->ref_order_hints[GOLDEN_FRAME - LAST_FRAME]) <= 0; + } + return 0; +} + +static AOM_INLINE void set_default_interp_skip_flags( + const AV1_COMMON *cm, InterpSearchFlags *interp_search_flags) { + const int num_planes = av1_num_planes(cm); + interp_search_flags->default_interp_skip_flags = + (num_planes == 1) ? 
INTERP_SKIP_LUMA_EVAL_CHROMA + : INTERP_SKIP_LUMA_SKIP_CHROMA; +} + +// TODO(Remya): Can include erroradv_prod_tr[] for threshold calculation +static INLINE int64_t calc_erroradv_threshold(AV1_COMP *cpi, + int64_t ref_frame_error) { + if (!cpi->sf.gm_sf.disable_adaptive_warp_error_thresh) + return (int64_t)( + ref_frame_error * erroradv_tr[cpi->sf.gm_sf.gm_erroradv_type] + 0.5); + else + return INT64_MAX; +} + +static void compute_global_motion_for_ref_frame( + AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame, + int *num_frm_corners, int *frm_corners, unsigned char *frm_buffer, + MotionModel *params_by_motion, uint8_t *segment_map, + const int segment_map_w, const int segment_map_h, + const WarpedMotionParams *ref_params) { + ThreadData *const td = &cpi->td; + MACROBLOCK *const x = &td->mb; + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + int i; + // clang-format off + static const double kIdentityParams[MAX_PARAMDIM - 1] = { + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0 + }; + // clang-format on + WarpedMotionParams tmp_wm_params; + const double *params_this_motion; + int inliers_by_motion[RANSAC_NUM_MOTIONS]; + assert(ref_buf[frame] != NULL); + if (*num_frm_corners < 0) { + // compute interest points using FAST features + *num_frm_corners = av1_fast_corner_detect( + frm_buffer, cpi->source->y_width, cpi->source->y_height, + cpi->source->y_stride, frm_corners, MAX_CORNERS); + } + TransformationType model; + + aom_clear_system_state(); + + // TODO(sarahparker, debargha): Explore do_adaptive_gm_estimation = 1 + const int do_adaptive_gm_estimation = 0; + + const int ref_frame_dist = get_relative_dist( + &cm->seq_params.order_hint_info, cm->current_frame.order_hint, + cm->cur_frame->ref_order_hints[frame - LAST_FRAME]); + const GlobalMotionEstimationType gm_estimation_type = + cm->seq_params.order_hint_info.enable_order_hint && + abs(ref_frame_dist) <= 2 && do_adaptive_gm_estimation + ? GLOBAL_MOTION_DISFLOW_BASED + : GLOBAL_MOTION_FEATURE_BASED; + for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) { + int64_t best_warp_error = INT64_MAX; + // Initially set all params to identity. 
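+ // Resetting num_inliers as well ensures that a stale candidate from the + // previous model type cannot be mistaken for a valid fit.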
+ for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) { + memcpy(params_by_motion[i].params, kIdentityParams, + (MAX_PARAMDIM - 1) * sizeof(*(params_by_motion[i].params))); + params_by_motion[i].num_inliers = 0; + } + + av1_compute_global_motion( + model, frm_buffer, cpi->source->y_width, cpi->source->y_height, + cpi->source->y_stride, frm_corners, *num_frm_corners, ref_buf[frame], + cpi->common.seq_params.bit_depth, gm_estimation_type, inliers_by_motion, + params_by_motion, RANSAC_NUM_MOTIONS); + int64_t ref_frame_error = 0; + for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) { + if (inliers_by_motion[i] == 0) continue; + + params_this_motion = params_by_motion[i].params; + av1_convert_model_to_params(params_this_motion, &tmp_wm_params); + + if (tmp_wm_params.wmtype != IDENTITY) { + av1_compute_feature_segmentation_map( + segment_map, segment_map_w, segment_map_h, + params_by_motion[i].inliers, params_by_motion[i].num_inliers); + + ref_frame_error = av1_segmented_frame_error( + is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer, + ref_buf[frame]->y_stride, cpi->source->y_buffer, + cpi->source->y_width, cpi->source->y_height, cpi->source->y_stride, + segment_map, segment_map_w); + + int64_t erroradv_threshold = + calc_erroradv_threshold(cpi, ref_frame_error); + + const int64_t warp_error = av1_refine_integerized_param( + &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), xd->bd, + ref_buf[frame]->y_buffer, ref_buf[frame]->y_width, + ref_buf[frame]->y_height, ref_buf[frame]->y_stride, + cpi->source->y_buffer, cpi->source->y_width, cpi->source->y_height, + cpi->source->y_stride, GM_REFINEMENT_COUNT, best_warp_error, + segment_map, segment_map_w, erroradv_threshold); + + if (warp_error < best_warp_error) { + best_warp_error = warp_error; + // Save the wm_params modified by + // av1_refine_integerized_param() rather than motion index to + // avoid rerunning refine() below. + memcpy(&(cm->global_motion[frame]), &tmp_wm_params, + sizeof(WarpedMotionParams)); + } + } + } + if (cm->global_motion[frame].wmtype <= AFFINE) + if (!av1_get_shear_params(&cm->global_motion[frame])) + cm->global_motion[frame] = default_warp_params; + + if (cm->global_motion[frame].wmtype == TRANSLATION) { + cm->global_motion[frame].wmmat[0] = + convert_to_trans_prec(cm->features.allow_high_precision_mv, + cm->global_motion[frame].wmmat[0]) * + GM_TRANS_ONLY_DECODE_FACTOR; + cm->global_motion[frame].wmmat[1] = + convert_to_trans_prec(cm->features.allow_high_precision_mv, + cm->global_motion[frame].wmmat[1]) * + GM_TRANS_ONLY_DECODE_FACTOR; + } + + if (cm->global_motion[frame].wmtype == IDENTITY) continue; + + if (ref_frame_error == 0) continue; + + // If the best error advantage found doesn't meet the threshold for + // this motion type, revert to IDENTITY. 
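+    // [Editorial note, not part of the original change] Illustration of the
+    // threshold test below, with hypothetical numbers: the ratio
+    // best_warp_error / ref_frame_error measures how much warping improves on
+    // the unwarped segmented error (smaller is better), traded off against
+    // gm_get_params_cost(). For example, if best_warp_error = 900 and
+    // ref_frame_error = 1000, the ratio 0.9 must clear the per-type criteria
+    // inside av1_is_enough_erroradvantage() or the model is reverted to
+    // default_warp_params.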
+ if (!av1_is_enough_erroradvantage( + (double)best_warp_error / ref_frame_error, + gm_get_params_cost(&cm->global_motion[frame], ref_params, + cm->features.allow_high_precision_mv), + cpi->sf.gm_sf.gm_erroradv_type)) { + cm->global_motion[frame] = default_warp_params; + } + + if (cm->global_motion[frame].wmtype != IDENTITY) break; + } + + aom_clear_system_state(); +} + +typedef struct { + int distance; + MV_REFERENCE_FRAME frame; +} FrameDistPair; + +static INLINE void update_valid_ref_frames_for_gm( + AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], + FrameDistPair *past_ref_frame, FrameDistPair *future_ref_frame, + int *num_past_ref_frames, int *num_future_ref_frames) { + AV1_COMMON *const cm = &cpi->common; + const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info; + for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) { + const MV_REFERENCE_FRAME ref_frame[2] = { frame, NONE_FRAME }; + RefCntBuffer *buf = get_ref_frame_buf(cm, frame); + const int ref_disabled = + !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]); + ref_buf[frame] = NULL; + cm->global_motion[frame] = default_warp_params; + // Skip global motion estimation for invalid ref frames + if (buf == NULL || + (ref_disabled && cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE)) { + cpi->gm_info.params_cost[frame] = 0; + continue; + } else { + ref_buf[frame] = &buf->buf; + } + + if (ref_buf[frame]->y_crop_width == cpi->source->y_crop_width && + ref_buf[frame]->y_crop_height == cpi->source->y_crop_height && + do_gm_search_logic(&cpi->sf, frame) && + !prune_ref_by_selective_ref_frame( + cpi, NULL, ref_frame, cm->cur_frame->ref_display_order_hint) && + !(cpi->sf.gm_sf.selective_ref_gm && skip_gm_frame(cm, frame))) { + assert(ref_buf[frame] != NULL); + int relative_frame_dist = av1_encoder_get_relative_dist( + order_hint_info, buf->display_order_hint, + cm->cur_frame->display_order_hint); + // Populate past and future ref frames + if (relative_frame_dist <= 0) { + past_ref_frame[*num_past_ref_frames].distance = + abs(relative_frame_dist); + past_ref_frame[*num_past_ref_frames].frame = frame; + (*num_past_ref_frames)++; + } else { + future_ref_frame[*num_future_ref_frames].distance = + abs(relative_frame_dist); + future_ref_frame[*num_future_ref_frames].frame = frame; + (*num_future_ref_frames)++; + } + } + } +} + +static INLINE void compute_gm_for_valid_ref_frames( + AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame, + int *num_frm_corners, int *frm_corners, unsigned char *frm_buffer, + MotionModel *params_by_motion, uint8_t *segment_map, + const int segment_map_w, const int segment_map_h) { + AV1_COMMON *const cm = &cpi->common; + GlobalMotionInfo *const gm_info = &cpi->gm_info; + const WarpedMotionParams *ref_params = + cm->prev_frame ? 
&cm->prev_frame->global_motion[frame] + : &default_warp_params; + + compute_global_motion_for_ref_frame( + cpi, ref_buf, frame, num_frm_corners, frm_corners, frm_buffer, + params_by_motion, segment_map, segment_map_w, segment_map_h, ref_params); + + gm_info->params_cost[frame] = + gm_get_params_cost(&cm->global_motion[frame], ref_params, + cm->features.allow_high_precision_mv) + + gm_info->type_cost[cm->global_motion[frame].wmtype] - + gm_info->type_cost[IDENTITY]; +} + +static int compare_distance(const void *a, const void *b) { + const int diff = + ((FrameDistPair *)a)->distance - ((FrameDistPair *)b)->distance; + if (diff > 0) + return 1; + else if (diff < 0) + return -1; + return 0; +} + +static INLINE void compute_global_motion_for_references( + AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], + FrameDistPair reference_frame[REF_FRAMES - 1], int num_ref_frames, + int *num_frm_corners, int *frm_corners, unsigned char *frm_buffer, + MotionModel *params_by_motion, uint8_t *segment_map, + const int segment_map_w, const int segment_map_h) { + AV1_COMMON *const cm = &cpi->common; + // Compute global motion w.r.t. reference frames starting from the nearest ref + // frame in a given direction + for (int frame = 0; frame < num_ref_frames; frame++) { + int ref_frame = reference_frame[frame].frame; + compute_gm_for_valid_ref_frames(cpi, ref_buf, ref_frame, num_frm_corners, + frm_corners, frm_buffer, params_by_motion, + segment_map, segment_map_w, segment_map_h); + // If global motion w.r.t. current ref frame is + // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t + // the remaining ref frames in that direction. The below exit is disabled + // when ref frame distance w.r.t. current frame is zero. E.g.: + // source_alt_ref_frame w.r.t. ARF frames + if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search && + reference_frame[frame].distance != 0 && + cm->global_motion[ref_frame].wmtype != ROTZOOM) + break; + } +} + +static AOM_INLINE void setup_prune_ref_frame_mask(AV1_COMP *cpi) { + if (!cpi->sf.rt_sf.use_nonrd_pick_mode && + cpi->sf.inter_sf.selective_ref_frame >= 2) { + AV1_COMMON *const cm = &cpi->common; + const OrderHintInfo *const order_hint_info = + &cm->seq_params.order_hint_info; + const int cur_frame_display_order_hint = + cm->current_frame.display_order_hint; + unsigned int *ref_display_order_hint = + cm->cur_frame->ref_display_order_hint; + const int arf2_dist = av1_encoder_get_relative_dist( + order_hint_info, ref_display_order_hint[ALTREF2_FRAME - LAST_FRAME], + cur_frame_display_order_hint); + const int bwd_dist = av1_encoder_get_relative_dist( + order_hint_info, ref_display_order_hint[BWDREF_FRAME - LAST_FRAME], + cur_frame_display_order_hint); + + for (int ref_idx = REF_FRAMES; ref_idx < MODE_CTX_REF_FRAMES; ++ref_idx) { + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_idx); + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) || + !(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]])) { + continue; + } + + if (!cpi->all_one_sided_refs) { + int ref_dist[2]; + for (int i = 0; i < 2; ++i) { + ref_dist[i] = av1_encoder_get_relative_dist( + order_hint_info, ref_display_order_hint[rf[i] - LAST_FRAME], + cur_frame_display_order_hint); + } + + // One-sided compound is used only when all reference frames are + // one-sided. 
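+        // [Editorial note, not part of the original change] The sign test
+        // below appears to read: ref_dist[i] > 0 means a future reference and
+        // ref_dist[i] <= 0 a past one, so equal signs mean both references
+        // lie on the same side of the current frame. E.g. ref_dist = { -2, -1 }
+        // (both past) gets the pair pruned, while ref_dist = { -1, +1 }
+        // (one past, one future) keeps it.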
+ if ((ref_dist[0] > 0) == (ref_dist[1] > 0)) { + cpi->prune_ref_frame_mask |= 1 << ref_idx; + } + } + + if (cpi->sf.inter_sf.selective_ref_frame >= 4 && + (rf[0] == ALTREF2_FRAME || rf[1] == ALTREF2_FRAME) && + (cpi->ref_frame_flags & av1_ref_frame_flag_list[BWDREF_FRAME])) { + // Check if both ALTREF2_FRAME and BWDREF_FRAME are future references. + if (arf2_dist > 0 && bwd_dist > 0 && bwd_dist <= arf2_dist) { + // Drop ALTREF2_FRAME as a reference if BWDREF_FRAME is a closer + // reference to the current frame than ALTREF2_FRAME + cpi->prune_ref_frame_mask |= 1 << ref_idx; + } + } + } + } +} + +#define CHECK_PRECOMPUTED_REF_FRAME_MAP 0 + +static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { + ThreadData *const td = &cpi->td; + MACROBLOCK *const x = &td->mb; + AV1_COMMON *const cm = &cpi->common; + CommonModeInfoParams *const mi_params = &cm->mi_params; + FeatureFlags *const features = &cm->features; + MACROBLOCKD *const xd = &x->e_mbd; + RD_COUNTS *const rdc = &cpi->td.rd_counts; + GlobalMotionInfo *const gm_info = &cpi->gm_info; + FrameProbInfo *const frame_probs = &cpi->frame_probs; + IntraBCHashInfo *const intrabc_hash_info = &x->intrabc_hash_info; + int i; + + if (!cpi->sf.rt_sf.use_nonrd_pick_mode) { + mi_params->setup_mi(mi_params); + } + + set_mi_offsets(mi_params, xd, 0, 0); + +#if CONFIG_AV1_HIGHBITDEPTH + x->fwd_txfm4x4 = aom_fdct4x4; +#else + x->fwd_txfm4x4 = aom_fdct4x4_lp; +#endif + + av1_zero(*td->counts); + av1_zero(rdc->comp_pred_diff); + av1_zero(rdc->tx_type_used); + av1_zero(rdc->obmc_used); + av1_zero(rdc->warped_used); + + // Reset the flag. + cpi->intrabc_used = 0; + // Need to disable intrabc when superres is selected + if (av1_superres_scaled(cm)) { + features->allow_intrabc = 0; + } + + features->allow_intrabc &= (cpi->oxcf.enable_intrabc); + + if (features->allow_warped_motion && + cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + if (frame_probs->warped_probs[update_type] < + cpi->sf.inter_sf.prune_warped_prob_thresh) + features->allow_warped_motion = 0; + } + + int hash_table_created = 0; + if (!is_stat_generation_stage(cpi) && av1_use_hash_me(cpi) && + !cpi->sf.rt_sf.use_nonrd_pick_mode) { + // TODO(any): move this outside of the recoding loop to avoid recalculating + // the hash table. 
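+    // [Editorial note, not part of the original change] Reading aid for the
+    // buffers allocated below: hash values are built bottom-up (2x2, then
+    // 4x4, ... up to the superblock size), and block_hash_values[k] /
+    // is_block_same[k] act as ping-pong source/destination pairs via src_idx
+    // and dst_idx, so only two pyramid levels are resident at a time.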
+ // add to hash table + const int pic_width = cpi->source->y_crop_width; + const int pic_height = cpi->source->y_crop_height; + uint32_t *block_hash_values[2][2]; + int8_t *is_block_same[2][3]; + int k, j; + + for (k = 0; k < 2; k++) { + for (j = 0; j < 2; j++) { + CHECK_MEM_ERROR(cm, block_hash_values[k][j], + aom_malloc(sizeof(uint32_t) * pic_width * pic_height)); + } + + for (j = 0; j < 3; j++) { + CHECK_MEM_ERROR(cm, is_block_same[k][j], + aom_malloc(sizeof(int8_t) * pic_width * pic_height)); + } + } + + av1_hash_table_init(intrabc_hash_info); + av1_hash_table_create(&intrabc_hash_info->intrabc_hash_table); + hash_table_created = 1; + av1_generate_block_2x2_hash_value(intrabc_hash_info, cpi->source, + block_hash_values[0], is_block_same[0]); + // Hash data generated for screen contents is used for intraBC ME + const int min_alloc_size = block_size_wide[mi_params->mi_alloc_bsize]; + const int max_sb_size = + (1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2)); + int src_idx = 0; + for (int size = 4; size <= max_sb_size; size *= 2, src_idx = !src_idx) { + const int dst_idx = !src_idx; + av1_generate_block_hash_value( + intrabc_hash_info, cpi->source, size, block_hash_values[src_idx], + block_hash_values[dst_idx], is_block_same[src_idx], + is_block_same[dst_idx]); + if (size >= min_alloc_size) { + av1_add_to_hash_map_by_row_with_precal_data( + &intrabc_hash_info->intrabc_hash_table, block_hash_values[dst_idx], + is_block_same[dst_idx][2], pic_width, pic_height, size); + } + } + + for (k = 0; k < 2; k++) { + for (j = 0; j < 2; j++) { + aom_free(block_hash_values[k][j]); + } + + for (j = 0; j < 3; j++) { + aom_free(is_block_same[k][j]); + } + } + } + + const CommonQuantParams *quant_params = &cm->quant_params; + for (i = 0; i < MAX_SEGMENTS; ++i) { + const int qindex = + cm->seg.enabled ? av1_get_qindex(&cm->seg, i, quant_params->base_qindex) + : quant_params->base_qindex; + xd->lossless[i] = + qindex == 0 && quant_params->y_dc_delta_q == 0 && + quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 && + quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0; + if (xd->lossless[i]) cpi->enc_seg.has_lossless_segment = 1; + xd->qindex[i] = qindex; + if (xd->lossless[i]) { + cpi->optimize_seg_arr[i] = NO_TRELLIS_OPT; + } else { + cpi->optimize_seg_arr[i] = cpi->sf.rd_sf.optimize_coefficients; + } + } + features->coded_lossless = is_coded_lossless(cm, xd); + features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm); + + // Fix delta q resolution for the moment + cm->delta_q_info.delta_q_res = 0; + if (cpi->oxcf.deltaq_mode == DELTA_Q_OBJECTIVE) + cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_OBJECTIVE; + else if (cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL) + cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL; + // Set delta_q_present_flag before it is used for the first time + cm->delta_q_info.delta_lf_res = DEFAULT_DELTA_LF_RES; + cm->delta_q_info.delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q; + + // Turn off cm->delta_q_info.delta_q_present_flag if objective delta_q is used + // for ineligible frames. That effectively will turn off row_mt usage. + // Note objective delta_q and tpl eligible frames are only altref frames + // currently. 
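+  // [Editorial note, not part of the original change] In other words, the
+  // gate below appears to clear the flag unless this frame actually went
+  // through TPL (currently only altref frames do), so per-superblock delta-q
+  // that would never be exercised is not signaled.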
+  if (cm->delta_q_info.delta_q_present_flag) {
+    if (cpi->oxcf.deltaq_mode == DELTA_Q_OBJECTIVE &&
+        !is_frame_tpl_eligible(cpi))
+      cm->delta_q_info.delta_q_present_flag = 0;
+  }
+
+  // Reset delta_q_used flag
+  cpi->deltaq_used = 0;
+
+  cm->delta_q_info.delta_lf_present_flag =
+      cm->delta_q_info.delta_q_present_flag && cpi->oxcf.deltalf_mode;
+  cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
+
+  // update delta_q_present_flag and delta_lf_present_flag based on
+  // base_qindex
+  cm->delta_q_info.delta_q_present_flag &= quant_params->base_qindex > 0;
+  cm->delta_q_info.delta_lf_present_flag &= quant_params->base_qindex > 0;
+
+  av1_frame_init_quantizer(cpi);
+  av1_initialize_rd_consts(cpi);
+  av1_initialize_me_consts(cpi, x, quant_params->base_qindex);
+
+  init_encode_frame_mb_context(cpi);
+  set_default_interp_skip_flags(cm, &cpi->interp_search_flags);
+  if (cm->prev_frame && cm->prev_frame->seg.enabled)
+    cm->last_frame_seg_map = cm->prev_frame->seg_map;
+  else
+    cm->last_frame_seg_map = NULL;
+  if (features->allow_intrabc || features->coded_lossless) {
+    av1_set_default_ref_deltas(cm->lf.ref_deltas);
+    av1_set_default_mode_deltas(cm->lf.mode_deltas);
+  } else if (cm->prev_frame) {
+    memcpy(cm->lf.ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES);
+    memcpy(cm->lf.mode_deltas, cm->prev_frame->mode_deltas,
+           MAX_MODE_LF_DELTAS);
+  }
+  memcpy(cm->cur_frame->ref_deltas, cm->lf.ref_deltas, REF_FRAMES);
+  memcpy(cm->cur_frame->mode_deltas, cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
+
+  cpi->all_one_sided_refs =
+      frame_is_intra_only(cm) ? 0 : refs_are_one_sided(cm);
+
+  cpi->prune_ref_frame_mask = 0;
+  // Figure out which ref frames can be skipped at frame level.
+  setup_prune_ref_frame_mask(cpi);
+
+  x->txb_split_count = 0;
+#if CONFIG_SPEED_STATS
+  x->tx_search_count = 0;
+#endif  // CONFIG_SPEED_STATS
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_compute_global_motion_time);
+#endif
+  av1_zero(rdc->global_motion_used);
+  av1_zero(gm_info->params_cost);
+  if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source &&
+      cpi->oxcf.enable_global_motion && !gm_info->search_done) {
+    YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
+    MotionModel params_by_motion[RANSAC_NUM_MOTIONS];
+    for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+      memset(&params_by_motion[m], 0, sizeof(params_by_motion[m]));
+      params_by_motion[m].inliers =
+          aom_malloc(sizeof(*(params_by_motion[m].inliers)) * 2 * MAX_CORNERS);
+    }
+
+    int num_frm_corners = -1;
+    int frm_corners[2 * MAX_CORNERS];
+    unsigned char *frm_buffer = cpi->source->y_buffer;
+    if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) {
+      // The frame buffer is 16-bit, so we need to convert to 8 bits for the
+      // following code. We cache the result until the frame is released.
+ frm_buffer = + av1_downconvert_frame(cpi->source, cpi->common.seq_params.bit_depth); + } + const int segment_map_w = + (cpi->source->y_width + WARP_ERROR_BLOCK) >> WARP_ERROR_BLOCK_LOG; + const int segment_map_h = + (cpi->source->y_height + WARP_ERROR_BLOCK) >> WARP_ERROR_BLOCK_LOG; + + uint8_t *segment_map = + aom_malloc(sizeof(*segment_map) * segment_map_w * segment_map_h); + memset(segment_map, 0, + sizeof(*segment_map) * segment_map_w * segment_map_h); + + FrameDistPair future_ref_frame[REF_FRAMES - 1] = { + { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME }, + { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME }, + { -1, NONE_FRAME } + }; + FrameDistPair past_ref_frame[REF_FRAMES - 1] = { + { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME }, + { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME }, + { -1, NONE_FRAME } + }; + int num_past_ref_frames = 0; + int num_future_ref_frames = 0; + // Populate ref_buf for valid ref frames in global motion + update_valid_ref_frames_for_gm(cpi, ref_buf, past_ref_frame, + future_ref_frame, &num_past_ref_frames, + &num_future_ref_frames); + + // Sort the ref frames in the ascending order of their distance from the + // current frame + qsort(past_ref_frame, num_past_ref_frames, sizeof(past_ref_frame[0]), + compare_distance); + qsort(future_ref_frame, num_future_ref_frames, sizeof(future_ref_frame[0]), + compare_distance); + + // Compute global motion w.r.t. past reference frames + if (num_past_ref_frames > 0) + compute_global_motion_for_references( + cpi, ref_buf, past_ref_frame, num_past_ref_frames, &num_frm_corners, + frm_corners, frm_buffer, params_by_motion, segment_map, segment_map_w, + segment_map_h); + + // Compute global motion w.r.t. future reference frames + if (num_future_ref_frames > 0) + compute_global_motion_for_references( + cpi, ref_buf, future_ref_frame, num_future_ref_frames, + &num_frm_corners, frm_corners, frm_buffer, params_by_motion, + segment_map, segment_map_w, segment_map_h); + + aom_free(segment_map); + + gm_info->search_done = 1; + for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) { + aom_free(params_by_motion[m].inliers); + } + } + memcpy(cm->cur_frame->global_motion, cm->global_motion, + REF_FRAMES * sizeof(WarpedMotionParams)); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_compute_global_motion_time); +#endif + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_setup_motion_field_time); +#endif + if (features->allow_ref_frame_mvs) av1_setup_motion_field(cm); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_setup_motion_field_time); +#endif + + cm->current_frame.skip_mode_info.skip_mode_flag = + check_skip_mode_enabled(cpi); + + cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read_dummy; + cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write_dummy; + cpi->row_mt = 0; + + if (cpi->oxcf.row_mt && (cpi->oxcf.max_threads > 1)) { + cpi->row_mt = 1; + cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read; + cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write; + av1_encode_tiles_row_mt(cpi); + } else { + if (AOMMIN(cpi->oxcf.max_threads, cm->tiles.cols * cm->tiles.rows) > 1) + av1_encode_tiles_mt(cpi); + else + encode_tiles(cpi); + } + + // If intrabc is allowed but never selected, reset the allow_intrabc flag. 
+ if (features->allow_intrabc && !cpi->intrabc_used) { + features->allow_intrabc = 0; + } + if (features->allow_intrabc) { + cm->delta_q_info.delta_lf_present_flag = 0; + } + + if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) { + cm->delta_q_info.delta_q_present_flag = 0; + } + + // Set the transform size appropriately before bitstream creation + const MODE_EVAL_TYPE eval_type = + cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch + ? WINNER_MODE_EVAL + : DEFAULT_EVAL; + const TX_SIZE_SEARCH_METHOD tx_search_type = + cpi->winner_mode_params.tx_size_search_methods[eval_type]; + assert(cpi->oxcf.enable_tx64 || tx_search_type != USE_LARGESTALL); + features->tx_mode = select_tx_mode(cm, tx_search_type); + + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + + for (i = 0; i < TX_SIZES_ALL; i++) { + int sum = 0; + int j; + int left = 1024; + + for (j = 0; j < TX_TYPES; j++) + sum += cpi->td.rd_counts.tx_type_used[i][j]; + + for (j = TX_TYPES - 1; j >= 0; j--) { + const int new_prob = + sum ? 1024 * cpi->td.rd_counts.tx_type_used[i][j] / sum + : (j ? 0 : 1024); + int prob = + (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1; + left -= prob; + if (j == 0) prob += left; + frame_probs->tx_type_probs[update_type][i][j] = prob; + } + } + } + + if (!cpi->sf.inter_sf.disable_obmc && + cpi->sf.inter_sf.prune_obmc_prob_thresh > 0) { + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + + for (i = 0; i < BLOCK_SIZES_ALL; i++) { + int sum = 0; + for (int j = 0; j < 2; j++) sum += cpi->td.rd_counts.obmc_used[i][j]; + + const int new_prob = + sum ? 128 * cpi->td.rd_counts.obmc_used[i][1] / sum : 0; + frame_probs->obmc_probs[update_type][i] = + (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1; + } + } + + if (features->allow_warped_motion && + cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + int sum = 0; + for (i = 0; i < 2; i++) sum += cpi->td.rd_counts.warped_used[i]; + const int new_prob = sum ? 128 * cpi->td.rd_counts.warped_used[1] / sum : 0; + frame_probs->warped_probs[update_type] = + (frame_probs->warped_probs[update_type] + new_prob) >> 1; + } + + if (cm->current_frame.frame_type != KEY_FRAME && + cpi->sf.interp_sf.adaptive_interp_filter_search == 2 && + features->interp_filter == SWITCHABLE) { + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { + int sum = 0; + int j; + int left = 1536; + + for (j = 0; j < SWITCHABLE_FILTERS; j++) { + sum += cpi->td.counts->switchable_interp[i][j]; + } + + for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) { + const int new_prob = + sum ? 1536 * cpi->td.counts->switchable_interp[i][j] / sum + : (j ? 
0 : 1536);
+          int prob = (frame_probs->switchable_interp_probs[update_type][i][j] +
+                      new_prob) >>
+                     1;
+          left -= prob;
+          if (j == 0) prob += left;
+          frame_probs->switchable_interp_probs[update_type][i][j] = prob;
+        }
+      }
+    }
+
+  if ((!is_stat_generation_stage(cpi) && av1_use_hash_me(cpi) &&
+       !cpi->sf.rt_sf.use_nonrd_pick_mode) ||
+      hash_table_created) {
+    av1_hash_table_destroy(&intrabc_hash_info->intrabc_hash_table);
+  }
+}
+
+void av1_encode_frame(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+  FeatureFlags *const features = &cm->features;
+  const int num_planes = av1_num_planes(cm);
+  // Indicates whether or not to use a default reduced set for ext-tx
+  // rather than the potential full set of 16 transforms
+  features->reduced_tx_set_used = cpi->oxcf.reduced_tx_type_set;
+
+  // Make sure segment_id is no larger than last_active_segid.
+  if (cm->seg.enabled && cm->seg.update_map) {
+    const int mi_rows = cm->mi_params.mi_rows;
+    const int mi_cols = cm->mi_params.mi_cols;
+    const int last_active_segid = cm->seg.last_active_segid;
+    uint8_t *map = cpi->enc_seg.map;
+    for (int mi_row = 0; mi_row < mi_rows; ++mi_row) {
+      for (int mi_col = 0; mi_col < mi_cols; ++mi_col) {
+        map[mi_col] = AOMMIN(map[mi_col], last_active_segid);
+      }
+      map += mi_cols;
+    }
+  }
+
+  av1_setup_frame_buf_refs(cm);
+  enforce_max_ref_frames(cpi, &cpi->ref_frame_flags);
+  set_rel_frame_dist(cpi);
+  av1_setup_frame_sign_bias(cm);
+
+#if CHECK_PRECOMPUTED_REF_FRAME_MAP
+  GF_GROUP *gf_group = &cpi->gf_group;
+  // TODO(yuec): The check is disabled on OVERLAY frames for now, because info
+  // in cpi->gf_group has been refreshed for the next GOP when the check is
+  // performed for OVERLAY frames. Since we do not yet support inter-GOP ref
+  // frame map computation, the precomputed ref map for an OVERLAY frame is
+  // all -1 at this point (although it is meaningful before gf_group is
+  // refreshed).
+ if (!frame_is_intra_only(cm) && gf_group->index != 0) { + const RefCntBuffer *const golden_buf = get_ref_frame_buf(cm, GOLDEN_FRAME); + + if (golden_buf) { + const int golden_order_hint = golden_buf->order_hint; + + for (int ref = LAST_FRAME; ref < EXTREF_FRAME; ++ref) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref); + const int ref_disp_idx_precomputed = + gf_group->ref_frame_disp_idx[gf_group->index][ref - LAST_FRAME]; + + (void)ref_disp_idx_precomputed; + + if (buf != NULL) { + const int ref_disp_idx = + get_relative_dist(&cm->seq_params.order_hint_info, + buf->order_hint, golden_order_hint); + + if (ref_disp_idx >= 0) + assert(ref_disp_idx == ref_disp_idx_precomputed); + else + assert(ref_disp_idx_precomputed == -1); + } else { + assert(ref_disp_idx_precomputed == -1); + } + } + } + } +#endif + +#if CONFIG_MISMATCH_DEBUG + mismatch_reset_frame(num_planes); +#else + (void)num_planes; +#endif + + if (cpi->sf.hl_sf.frame_parameter_update) { + RD_COUNTS *const rdc = &cpi->td.rd_counts; + + if (frame_is_intra_only(cm)) + current_frame->reference_mode = SINGLE_REFERENCE; + else + current_frame->reference_mode = REFERENCE_MODE_SELECT; + + features->interp_filter = SWITCHABLE; + if (cm->tiles.large_scale) features->interp_filter = EIGHTTAP_REGULAR; + + features->switchable_motion_mode = 1; + + rdc->compound_ref_used_flag = 0; + rdc->skip_mode_used_flag = 0; + + encode_frame_internal(cpi); + + if (current_frame->reference_mode == REFERENCE_MODE_SELECT) { + // Use a flag that includes 4x4 blocks + if (rdc->compound_ref_used_flag == 0) { + current_frame->reference_mode = SINGLE_REFERENCE; +#if CONFIG_ENTROPY_STATS + av1_zero(cpi->td.counts->comp_inter); +#endif // CONFIG_ENTROPY_STATS + } + } + // Re-check on the skip mode status as reference mode may have been + // changed. 
+    SkipModeInfo *const skip_mode_info = &current_frame->skip_mode_info;
+    if (frame_is_intra_only(cm) ||
+        current_frame->reference_mode == SINGLE_REFERENCE) {
+      skip_mode_info->skip_mode_allowed = 0;
+      skip_mode_info->skip_mode_flag = 0;
+    }
+    if (skip_mode_info->skip_mode_flag && rdc->skip_mode_used_flag == 0)
+      skip_mode_info->skip_mode_flag = 0;
+
+    if (!cm->tiles.large_scale) {
+      if (features->tx_mode == TX_MODE_SELECT &&
+          cpi->td.mb.txb_split_count == 0)
+        features->tx_mode = TX_MODE_LARGEST;
+    }
+  } else {
+    encode_frame_internal(cpi);
+  }
+}
+
+static AOM_INLINE void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
+                                         FRAME_COUNTS *counts, TX_SIZE tx_size,
+                                         int depth, int blk_row, int blk_col,
+                                         uint8_t allow_update_cdf) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+                                   xd->left_txfm_context + blk_row,
+                                   mbmi->sb_type, tx_size);
+  const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+  assert(tx_size > TX_4X4);
+
+  if (depth == MAX_VARTX_DEPTH) {
+    // Don't add to counts in this case
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+    return;
+  }
+
+  if (tx_size == plane_tx_size) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->txfm_partition[ctx][0];
+#endif
+    if (allow_update_cdf)
+      update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 0, 2);
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+  } else {
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->txfm_partition[ctx][1];
+#endif
+    if (allow_update_cdf)
+      update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 1, 2);
+    ++x->txb_split_count;
+
+    if (sub_txs == TX_4X4) {
+      mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + blk_col,
+                            xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+      return;
+    }
+
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        int offsetr = row;
+        int offsetc = col;
+
+        update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr,
+                          blk_col + offsetc, allow_update_cdf);
+      }
+    }
+  }
+}
+
+static AOM_INLINE void tx_partition_count_update(const AV1_COMMON *const cm,
+                                                 MACROBLOCK *x,
+                                                 BLOCK_SIZE plane_bsize,
+                                                 FRAME_COUNTS *td_counts,
+                                                 uint8_t allow_update_cdf) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int mi_width = mi_size_wide[plane_bsize];
+  const int mi_height = mi_size_high[plane_bsize];
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+
+  for (int idy = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
+      update_txfm_count(x, xd, td_counts, max_tx_size, 0, idy, idx,
+                        allow_update_cdf);
+    }
+  }
+}
+
+static AOM_INLINE void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size,
+                                        int blk_row, int blk_col) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+  if (tx_size == plane_tx_size) {
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+
+  } else {
+    if (tx_size == TX_8X8) {
+      mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + blk_col,
+                            xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+      return;
+    }
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        const int offsetr = blk_row + row;
+        const int offsetc = blk_col + col;
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+        set_txfm_context(xd, sub_txs, offsetr, offsetc);
+      }
+    }
+  }
+}
+
+static AOM_INLINE void tx_partition_set_contexts(const AV1_COMMON *const cm,
+                                                 MACROBLOCKD *xd,
+                                                 BLOCK_SIZE plane_bsize) {
+  const int mi_width = mi_size_wide[plane_bsize];
+  const int mi_height = mi_size_high[plane_bsize];
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+
+  for (int idy = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
+      set_txfm_context(xd, max_tx_size, idy, idx);
+    }
+  }
+}
+
+static AOM_INLINE void encode_superblock(const AV1_COMP *const cpi,
+                                         TileDataEnc *tile_data, ThreadData *td,
+                                         TOKENEXTRA **t, RUN_TYPE dry_run,
+                                         BLOCK_SIZE bsize, int *rate) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO **mi_4x4 = xd->mi;
+  MB_MODE_INFO *mbmi = mi_4x4[0];
+  const int seg_skip =
+      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+  const int mis = cm->mi_params.mi_stride;
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const int is_inter = is_inter_block(mbmi);
+
+  // Initialize tx_mode and tx_size_search_method
+  set_tx_size_search_method(
+      cm, &cpi->winner_mode_params, x,
+      cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  if (!is_inter) {
+    xd->cfl.store_y = store_cfl_required(cm, xd);
+    mbmi->skip = 1;
+    for (int plane = 0; plane < num_planes; ++plane) {
+      av1_encode_intra_block_plane(cpi, x, bsize, plane, dry_run,
+                                   cpi->optimize_seg_arr[mbmi->segment_id]);
+    }
+
+    // If there is at least one lossless segment, force the skip for intra
+    // block to be 0, in order to avoid the segment_id being changed in
+    // write_segment_id().
+ if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map && + cpi->enc_seg.has_lossless_segment) + mbmi->skip = 0; + + xd->cfl.store_y = 0; + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) { + for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) { + if (mbmi->palette_mode_info.palette_size[plane] > 0) { + if (!dry_run) { + av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size, + PALETTE_MAP, tile_data->allow_update_cdf, + td->counts); + } else if (dry_run == DRY_RUN_COSTCOEFFS) { + rate += + av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP); + } + } + } + } + + av1_update_txb_context(cpi, td, dry_run, bsize, + tile_data->allow_update_cdf); + } else { + int ref; + const int is_compound = has_second_ref(mbmi); + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + for (ref = 0; ref < 1 + is_compound; ++ref) { + const YV12_BUFFER_CONFIG *cfg = + get_ref_frame_yv12_buf(cm, mbmi->ref_frame[ref]); + assert(IMPLIES(!is_intrabc_block(mbmi), cfg)); + av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, + xd->block_ref_scale_factors[ref], num_planes); + } + int start_plane = (cpi->sf.rt_sf.reuse_inter_pred_nonrd) ? 1 : 0; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + start_plane, av1_num_planes(cm) - 1); + if (mbmi->motion_mode == OBMC_CAUSAL) { + assert(cpi->oxcf.enable_obmc == 1); + av1_build_obmc_inter_predictors_sb(cm, xd); + } + +#if CONFIG_MISMATCH_DEBUG + if (dry_run == OUTPUT_ENABLED) { + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, + pd->subsampling_x, pd->subsampling_y); + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; + mismatch_record_block_pre(pd->dst.buf, pd->dst.stride, + cm->current_frame.order_hint, plane, pixel_c, + pixel_r, pd->width, pd->height, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } + } +#else + (void)num_planes; +#endif + + av1_encode_sb(cpi, x, bsize, dry_run); + av1_tokenize_sb_vartx(cpi, td, dry_run, bsize, rate, + tile_data->allow_update_cdf); + } + + if (!dry_run) { + if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi)) td->intrabc_used = 1; + if (x->tx_mode_search_type == TX_MODE_SELECT && + !xd->lossless[mbmi->segment_id] && mbmi->sb_type > BLOCK_4X4 && + !(is_inter && (mbmi->skip || seg_skip))) { + if (is_inter) { + tx_partition_count_update(cm, x, bsize, td->counts, + tile_data->allow_update_cdf); + } else { + if (mbmi->tx_size != max_txsize_rect_lookup[bsize]) + ++x->txb_split_count; + if (block_signals_txsize(bsize)) { + const int tx_size_ctx = get_tx_size_context(xd); + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int depth = tx_size_to_depth(mbmi->tx_size, bsize); + const int max_depths = bsize_to_max_depth(bsize); + + if (tile_data->allow_update_cdf) + update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], + depth, max_depths + 1); +#if CONFIG_ENTROPY_STATS + ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth]; +#endif + } + } + assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi))); + } else { + int i, j; + TX_SIZE intra_tx_size; + // The new intra coding scheme requires no change of transform size + if (is_inter) { + if (xd->lossless[mbmi->segment_id]) { + intra_tx_size = TX_4X4; + } else { + intra_tx_size = tx_size_from_tx_mode(bsize, x->tx_mode_search_type); + } + } else { + intra_tx_size = 
mbmi->tx_size; + } + + for (j = 0; j < mi_height; j++) + for (i = 0; i < mi_width; i++) + if (mi_col + i < cm->mi_params.mi_cols && + mi_row + j < cm->mi_params.mi_rows) + mi_4x4[mis * j + i]->tx_size = intra_tx_size; + + if (intra_tx_size != max_txsize_rect_lookup[bsize]) ++x->txb_split_count; + } + } + + if (x->tx_mode_search_type == TX_MODE_SELECT && + block_signals_txsize(mbmi->sb_type) && is_inter && + !(mbmi->skip || seg_skip) && !xd->lossless[mbmi->segment_id]) { + if (dry_run) tx_partition_set_contexts(cm, xd, bsize); + } else { + TX_SIZE tx_size = mbmi->tx_size; + // The new intra coding scheme requires no change of transform size + if (is_inter) { + if (xd->lossless[mbmi->segment_id]) { + tx_size = TX_4X4; + } else { + tx_size = tx_size_from_tx_mode(bsize, x->tx_mode_search_type); + } + } else { + tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4; + } + mbmi->tx_size = tx_size; + set_txfm_ctxs(tx_size, xd->width, xd->height, + (mbmi->skip || seg_skip) && is_inter_block(mbmi), xd); + } + + if (is_inter_block(mbmi) && !xd->is_chroma_ref && is_cfl_allowed(xd)) { + cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size); + } +} diff --git a/libs/libaom/src/av1/encoder/encodeframe.h b/libs/libaom/src/av1/encoder/encodeframe.h new file mode 100644 index 000000000..e4c484105 --- /dev/null +++ b/libs/libaom/src/av1/encoder/encodeframe.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENCODEFRAME_H_ +#define AOM_AV1_ENCODER_ENCODEFRAME_H_ + +#include "aom/aom_integer.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define DELTA_Q_PERCEPTUAL_MODULATION \ + 1 // 0: variance based + // 1: wavelet AC energy based + +struct macroblock; +struct yv12_buffer_config; +struct AV1_COMP; +struct ThreadData; + +void av1_setup_src_planes(struct macroblock *x, + const struct yv12_buffer_config *src, int mi_row, + int mi_col, const int num_planes, BLOCK_SIZE bsize); + +void av1_encode_frame(struct AV1_COMP *cpi); + +void av1_alloc_tile_data(struct AV1_COMP *cpi); +void av1_init_tile_data(struct AV1_COMP *cpi); +void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row, + int tile_col); +void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td, + int tile_row, int tile_col, int mi_row); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODEFRAME_H_ diff --git a/libs/libaom/src/av1/encoder/encodemb.c b/libs/libaom/src/av1/encoder/encodemb.c new file mode 100644 index 000000000..ec3336229 --- /dev/null +++ b/libs/libaom/src/av1/encoder/encodemb.c @@ -0,0 +1,805 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/bitwriter.h" +#include "aom_dsp/quantize.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + +#include "av1/common/cfl.h" +#include "av1/common/idct.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/scan.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/rdopt.h" + +void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols, + int16_t *diff, ptrdiff_t diff_stride, + const uint8_t *src8, ptrdiff_t src_stride, + const uint8_t *pred8, ptrdiff_t pred_stride) { + assert(rows >= 4 && cols >= 4); +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, + pred8, pred_stride, xd->bd); + return; + } +#endif + (void)xd; + aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8, + pred_stride); +} + +void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, + int blk_col, int blk_row, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; + const int diff_stride = block_size_wide[plane_bsize]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + const int tx1d_width = tx_size_wide[tx_size]; + const int tx1d_height = tx_size_high[tx_size]; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; + int16_t *src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2]; + av1_subtract_block(xd, tx1d_height, tx1d_width, src_diff, diff_stride, src, + src_stride, dst, dst_stride); +} + +void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) { + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; + assert(plane_bsize < BLOCK_SIZES_ALL); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + const MACROBLOCKD *xd = &x->e_mbd; + + av1_subtract_block(xd, bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); +} + +int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int fast_mode, + int *rate_cost) { + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + const int eob = p->eobs[block]; + const int segment_id = xd->mi[0]->segment_id; + + if (eob == 0 || !cpi->optimize_seg_arr[segment_id] || + xd->lossless[segment_id]) { + *rate_cost = av1_cost_skip_txb(x, txb_ctx, plane, tx_size); + return eob; + } + + return av1_optimize_txb_new(cpi, x, plane, block, tx_size, tx_type, txb_ctx, + rate_cost, cpi->oxcf.sharpness, fast_mode); +} + +// Hyper-parameters for dropout optimization, based on 
the following logic.
+// TODO(yjshen): These settings are tuned by experiments. They may still be
+// optimized for better performance.
+// (1) Coefficients which are large enough will ALWAYS be kept.
+const tran_low_t DROPOUT_COEFF_MAX = 2;  // Max dropout-able coefficient.
+// (2) Continuous coefficients will ALWAYS be kept. Here rigorous continuity is
+//     NOT required. For example, `5 0 0 0 7` is treated as two continuous
+//     coefficients if three zeros do not fulfill the dropout condition.
+const int DROPOUT_CONTINUITY_MAX = 2;  // Max dropout-able continuous coeff.
+// (3) Dropout operation is NOT applicable to blocks with large or small
+//     quantization index.
+const int DROPOUT_Q_MAX = 128;
+const int DROPOUT_Q_MIN = 16;
+// (4) Recall that dropout optimization will forcibly set some quantized
+//     coefficients to zero. The key logic for determining whether a
+//     coefficient should be dropped is to check the number of continuous
+//     zeros before AND after this coefficient. The exact number of zeros for
+//     judgement depends on block size and quantization index. More
+//     concretely, block size determines the base number of zeros, while
+//     quantization index determines the multiplier. Intuitively, a larger
+//     block requires more zeros, and a larger quantization index also
+//     requires more zeros (more information is lost when using a larger
+//     quantization index).
+const int DROPOUT_BEFORE_BASE_MAX = 32;  // Max base number for leading zeros.
+const int DROPOUT_BEFORE_BASE_MIN = 16;  // Min base number for leading zeros.
+const int DROPOUT_AFTER_BASE_MAX = 32;   // Max base number for trailing zeros.
+const int DROPOUT_AFTER_BASE_MIN = 16;   // Min base number for trailing zeros.
+const int DROPOUT_MULTIPLIER_MAX = 8;    // Max multiplier on number of zeros.
+const int DROPOUT_MULTIPLIER_MIN = 2;    // Min multiplier on number of zeros.
+const int DROPOUT_MULTIPLIER_Q_BASE = 32;  // Base Q to compute multiplier.
+
+void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
+                        TX_TYPE tx_type, int qindex) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  const struct macroblock_plane *const p = &mb->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+  tran_low_t *const dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+  const int tx_width = tx_size_wide[tx_size];
+  const int tx_height = tx_size_high[tx_size];
+  const int max_eob = av1_get_max_eob(tx_size);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+
+  // Early return if `qindex` is out of range.
+  if (qindex > DROPOUT_Q_MAX || qindex < DROPOUT_Q_MIN) {
+    return;
+  }
+
+  // Compute number of zeros used for dropout judgement.
+  const int base_size = AOMMAX(tx_width, tx_height);
+  const int multiplier = CLIP(qindex / DROPOUT_MULTIPLIER_Q_BASE,
+                              DROPOUT_MULTIPLIER_MIN, DROPOUT_MULTIPLIER_MAX);
+  const int dropout_num_before =
+      multiplier *
+      CLIP(base_size, DROPOUT_BEFORE_BASE_MIN, DROPOUT_BEFORE_BASE_MAX);
+  const int dropout_num_after =
+      multiplier *
+      CLIP(base_size, DROPOUT_AFTER_BASE_MIN, DROPOUT_AFTER_BASE_MAX);
+
+  // Early return if there are not enough non-zero coefficients.
+  if (p->eobs[block] == 0 || p->eobs[block] <= dropout_num_before) {
+    return;
+  }
+
+  int count_zeros_before = 0;
+  int count_zeros_after = 0;
+  int count_nonzeros = 0;
+  // Index of the first non-zero coefficient after a sufficient number of
+  // continuous zeros. If it equals `-1`, the number of leading zeros hasn't
+  // reached `dropout_num_before`.
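+  // [Editorial note, not part of the original change] Worked example of the
+  // thresholds computed above: with qindex = 64 and a 16x16 transform,
+  // multiplier = CLIP(64 / 32, 2, 8) = 2 and base_size = 16, giving
+  // dropout_num_before = 2 * CLIP(16, 16, 32) = 32 and dropout_num_after =
+  // 32, so a small coefficient survives unless roughly 32 continuous zeros
+  // surround it on each side.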
+ int idx = -1; + int eob = 0; // New end of block. + + for (int i = 0; i < p->eobs[block]; ++i) { + const int scan_idx = scan_order->scan[i]; + if (qcoeff[scan_idx] > DROPOUT_COEFF_MAX) { // Keep large coefficients. + count_zeros_before = 0; + count_zeros_after = 0; + idx = -1; + eob = i + 1; + } else if (qcoeff[scan_idx] == 0) { // Count zeros. + if (idx == -1) { + ++count_zeros_before; + } else { + ++count_zeros_after; + } + } else { // Count non-zeros. + if (count_zeros_before >= dropout_num_before) { + idx = (idx == -1) ? i : idx; + ++count_nonzeros; + } else { + count_zeros_before = 0; + eob = i + 1; + } + } + + // Handle continuity. + if (count_nonzeros > DROPOUT_CONTINUITY_MAX) { + count_zeros_before = 0; + count_zeros_after = 0; + idx = -1; + eob = i + 1; + } + + // Handle the trailing zeros after original end of block. + if (idx != -1 && i == p->eobs[block] - 1) { + count_zeros_after += (max_eob - p->eobs[block]); + } + + // Set redundant coefficients to zeros if needed. + if (count_zeros_after >= dropout_num_after) { + for (int j = idx; j <= i; ++j) { + qcoeff[scan_order->scan[j]] = 0; + dqcoeff[scan_order->scan[j]] = 0; + } + count_zeros_before += (i - idx + 1); + count_zeros_after = 0; + count_nonzeros = 0; + } else if (i == p->eobs[block] - 1) { + eob = i + 1; + } + } + + if (eob != p->eobs[block]) { + p->eobs[block] = eob; + p->txb_entropy_ctx[block] = + (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, eob); + } +} + +// Settings for optimization type. NOTE: To set optimization type for all intra +// frames, both `KEY_BLOCK_OPT_TYPE` and `INTRA_BLOCK_OPT_TYPE` should be set. +// TODO(yjshen): These settings are hard-coded and look okay for now. They +// should be made configurable later. +// Blocks of key frames ONLY. +const OPT_TYPE KEY_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; +// Blocks of intra frames (key frames EXCLUSIVE). +const OPT_TYPE INTRA_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; +// Blocks of inter frames. (NOTE: Dropout optimization is DISABLED by default +// if trellis optimization is on for inter frames.) 
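+// [Editorial note, not part of the original change] For orientation, these
+// constants are consumed in encode_block() later in this file, roughly as
+// sketched here (disabled, illustrative only):
+#if 0
+  const bool do_trellis = INTER_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+                          INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT;
+  const bool do_dropout = INTER_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+                          INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT;
+  // Dropout then runs only when trellis (use_optimize_b) is off, which is
+  // what the NOTE above means by "disabled by default".
+#endif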
+const OPT_TYPE INTER_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT;
+
+enum {
+  QUANT_FUNC_LOWBD = 0,
+  QUANT_FUNC_HIGHBD = 1,
+  QUANT_FUNC_TYPES = 2
+} UENUM1BYTE(QUANT_FUNC);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static AV1_QUANT_FACADE
+    quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = {
+      { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade },
+      { av1_quantize_b_facade, av1_highbd_quantize_b_facade },
+      { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade },
+      { NULL, NULL }
+    };
+#else
+static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES] = {
+  av1_quantize_fp_facade, av1_quantize_b_facade, av1_quantize_dc_facade, NULL
+};
+#endif
+
+void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
+                     int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
+                     QUANT_PARAM *qparam) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const SCAN_ORDER *const scan_order =
+      get_scan(txfm_param->tx_size, txfm_param->tx_type);
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *const coeff = p->coeff + block_offset;
+  tran_low_t *const qcoeff = p->qcoeff + block_offset;
+  tran_low_t *const dqcoeff = pd->dqcoeff + block_offset;
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = block_size_wide[plane_bsize];
+
+  const int src_offset = (blk_row * diff_stride + blk_col);
+  const int16_t *src_diff = &p->src_diff[src_offset << MI_SIZE_LOG2];
+
+  av1_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+
+  if (qparam->xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+    const int n_coeffs = av1_get_max_eob(txfm_param->tx_size);
+    if (LIKELY(!x->skip_block)) {
+#if CONFIG_AV1_HIGHBITDEPTH
+      quant_func_list[qparam->xform_quant_idx][txfm_param->is_hbd](
+          coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam);
+#else
+      quant_func_list[qparam->xform_quant_idx](
+          coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam);
+#endif
+    } else {
+      av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob);
+    }
+  }
+  // If use_optimize_b is true, av1_optimize_b will be called, so the entropy
+  // ctx cannot be updated now (that is performed in optimize_b).
+  if (qparam->use_optimize_b) {
+    p->txb_entropy_ctx[block] = 0;
+  } else {
+    p->txb_entropy_ctx[block] =
+        (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
+  }
+  return;
+}
+
+void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
+                     TX_TYPE tx_type, TxfmParam *txfm_param) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  txfm_param->tx_type = tx_type;
+  txfm_param->tx_size = tx_size;
+  txfm_param->lossless = xd->lossless[mbmi->segment_id];
+  txfm_param->tx_set_type = av1_get_ext_tx_set_type(
+      tx_size, is_inter_block(mbmi), cm->features.reduced_tx_set_used);
+
+  txfm_param->bd = xd->bd;
+  txfm_param->is_hbd = is_cur_buf_hbd(xd);
+}
+void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx,
+                     int use_quant_b_adapt, QUANT_PARAM *qparam) {
+  qparam->log_scale = av1_get_tx_scale(tx_size);
+  qparam->tx_size = tx_size;
+
+  qparam->use_quant_b_adapt = use_quant_b_adapt;
+
+  // TODO(bohanli): optimize_b and the quantization idx are related, but the
+  // relationship is buried and complicated across different encoding stages.
+ // Should have a unified function to derive quant_idx, rather than + // determine and pass in the quant_idx + qparam->use_optimize_b = use_optimize_b; + qparam->xform_quant_idx = xform_quant_idx; + + qparam->qmatrix = NULL; + qparam->iqmatrix = NULL; +} +void av1_setup_qmatrix(const CommonQuantParams *quant_params, + const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, + TX_TYPE tx_type, QUANT_PARAM *qparam) { + qparam->qmatrix = av1_get_qmatrix(quant_params, xd, plane, tx_size, tx_type); + qparam->iqmatrix = + av1_get_iqmatrix(quant_params, xd, plane, tx_size, tx_type); +} + +static void encode_block(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg, + RUN_TYPE dry_run) { + (void)dry_run; + struct encode_b_args *const args = arg; + const AV1_COMP *const cpi = args->cpi; + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block); + uint8_t *dst; + ENTROPY_CONTEXT *a, *l; + int dummy_rate_cost = 0; + + const int bw = mi_size_wide[plane_bsize]; + dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2]; + + a = &args->ta[blk_col]; + l = &args->tl[blk_row]; + + TX_TYPE tx_type = DCT_DCT; + if (!is_blk_skip(x, plane, blk_row * bw + blk_col) && !mbmi->skip_mode) { + tx_type = av1_get_tx_type(xd, pd->plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + TxfmParam txfm_param; + QUANT_PARAM quant_param; + const int use_trellis = is_trellis_used(args->enable_optimize_b, dry_run); + int quant_idx; + if (use_trellis) + quant_idx = AV1_XFORM_QUANT_FP; + else + quant_idx = + USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP; + av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param); + av1_setup_quant(tx_size, use_trellis, quant_idx, cpi->oxcf.quant_b_adapt, + &quant_param); + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, + &quant_param); + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + + // Whether trellis or dropout optimization is required for inter frames. + const bool do_trellis = INTER_BLOCK_OPT_TYPE == TRELLIS_OPT || + INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT; + const bool do_dropout = INTER_BLOCK_OPT_TYPE == DROPOUT_OPT || + INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT; + + if (quant_param.use_optimize_b && do_trellis) { + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, + args->cpi->sf.rd_sf.trellis_eob_fast, &dummy_rate_cost); + } + if (!quant_param.use_optimize_b && do_dropout) { + av1_dropout_qcoeff(x, plane, block, tx_size, tx_type, + cm->quant_params.base_qindex); + } + } else { + p->eobs[block] = 0; + p->txb_entropy_ctx[block] = 0; + } + + av1_set_txb_context(x, plane, block, tx_size, a, l); + + if (p->eobs[block]) { + *(args->skip) = 0; + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, + pd->dst.stride, p->eobs[block], + cm->features.reduced_tx_set_used); + } + + // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0 + // case. It is possible that certain collision in hash index would cause + // the assertion failure. 
To further optimize the rate-distortion
+  // performance, we need to re-visit this part and enable this assert
+  // again.
+  if (p->eobs[block] == 0 && plane == 0) {
+#if 0
+    if (args->cpi->oxcf.aq_mode == NO_AQ &&
+        args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
+      // TODO(jingning,angiebird,huisu@google.com): enable txk_check when
+      // enable_optimize_b is true to detect potential RD bug.
+      const uint8_t disable_txk_check = args->enable_optimize_b;
+      if (!disable_txk_check) {
+        assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] ==
+               DCT_DCT);
+      }
+    }
+#endif
+    update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+  }
+
+#if CONFIG_MISMATCH_DEBUG
+  if (dry_run == OUTPUT_ENABLED) {
+    int pixel_c, pixel_r;
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int blk_w = block_size_wide[bsize];
+    int blk_h = block_size_high[bsize];
+    mi_to_pixel_loc(&pixel_c, &pixel_r, xd->mi_col, xd->mi_row, blk_col,
+                    blk_row, pd->subsampling_x, pd->subsampling_y);
+    mismatch_record_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint,
+                             plane, pixel_c, pixel_r, blk_w, blk_h,
+                             xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+  }
+#endif
+}
+
+static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
+                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                               void *arg, RUN_TYPE dry_run) {
+  struct encode_b_args *const args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+  const TX_SIZE plane_tx_size =
+      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+                                    pd->subsampling_y)
+            : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+                                                         blk_col)];
+  if (!plane) {
+    assert(tx_size_wide[tx_size] >= tx_size_wide[plane_tx_size] &&
+           tx_size_high[tx_size] >= tx_size_high[plane_tx_size]);
+  }
+
+  if (tx_size == plane_tx_size || plane) {
+    encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg,
+                 dry_run);
+  } else {
+    assert(tx_size < TX_SIZES_ALL);
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size));
+    assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size));
+    // This is the square transform block partition entry point.
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    const int step = bsh * bsw;
+    assert(bsw > 0 && bsh > 0);
+
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        const int offsetr = blk_row + row;
+        const int offsetc = blk_col + col;
+
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+        encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs,
+                           arg, dry_run);
+        block += step;
+      }
+    }
+  }
+}
+
+void av1_foreach_transformed_block_in_plane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane,
+    foreach_transformed_block_visitor visit, void *arg) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  // Block and transform sizes, in units of 4x4 blocks, log 2 ("*_b"):
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8.
+  // The transform size varies per plane; look it up in a common way.
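
The body that follows walks the plane in two levels: the outer loops step through 64x64 processing units, and the inner loops step one transform block at a time, both clipped to the visible part of the plane. A minimal standalone sketch of the same walk follows, with all sizes in 4x4 units; every toy_* name is invented for illustration and is not a libaom API.

#include <stdio.h>

#define TOY_MIN(a, b) ((a) < (b) ? (a) : (b))

/* Visit every transform block in a plane: outer loops advance in
   processing units (mu_rows x mu_cols), inner loops advance one
   transform block (txh x txw) at a time, clipped to the visible area. */
static void toy_foreach_tx_block(int max_rows, int max_cols, int mu_rows,
                                 int mu_cols, int txh, int txw) {
  int block = 0;
  const int step = txw * txh; /* coefficient-offset advance per block */
  for (int r = 0; r < max_rows; r += mu_rows) {
    const int unit_h = TOY_MIN(r + mu_rows, max_rows);
    for (int c = 0; c < max_cols; c += mu_cols) {
      const int unit_w = TOY_MIN(c + mu_cols, max_cols);
      for (int br = r; br < unit_h; br += txh) {
        for (int bc = c; bc < unit_w; bc += txw) {
          printf("visit block %d at (%d,%d)\n", block, br, bc);
          block += step;
        }
      }
    }
  }
}

int main(void) {
  /* A 32x32-pixel plane (8x8 in 4x4 units), 16x16 processing units,
     8x8 transform blocks. */
  toy_foreach_tx_block(8, 8, 4, 4, 2, 2);
  return 0;
}
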
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const uint8_t txw_unit = tx_size_wide_unit[tx_size]; + const uint8_t txh_unit = tx_size_high_unit[tx_size]; + const int step = txw_unit * txh_unit; + + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y); + const int mu_blocks_wide = + AOMMIN(mi_size_wide[max_unit_bsize], max_blocks_wide); + const int mu_blocks_high = + AOMMIN(mi_size_high[max_unit_bsize], max_blocks_high); + + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + int i = 0; + for (int r = 0; r < max_blocks_high; r += mu_blocks_high) { + const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high); + // Skip visiting the sub blocks that are wholly within the UMV. + for (int c = 0; c < max_blocks_wide; c += mu_blocks_wide) { + const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide); + for (int blk_row = r; blk_row < unit_height; blk_row += txh_unit) { + for (int blk_col = c; blk_col < unit_width; blk_col += txw_unit) { + visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg); + i += step; + } + } + } + } +} + +typedef struct encode_block_pass1_args { + AV1_COMP *cpi; + MACROBLOCK *x; +} encode_block_pass1_args; + +static void encode_block_pass1(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + encode_block_pass1_args *args = (encode_block_pass1_args *)arg; + AV1_COMP *cpi = args->cpi; + AV1_COMMON *cm = &cpi->common; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block); + + uint8_t *dst; + dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2]; + + TxfmParam txfm_param; + QUANT_PARAM quant_param; + + av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); + av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, cpi->oxcf.quant_b_adapt, + &quant_param); + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, DCT_DCT, + &quant_param); + + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + + if (p->eobs[block] > 0) { + txfm_param.eob = p->eobs[block]; + if (txfm_param.is_hbd) { + av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param); + return; + } + av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param); + } +} + +void av1_encode_sby_pass1(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize) { + encode_block_pass1_args args = { cpi, x }; + av1_subtract_plane(x, bsize, 0); + av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, + encode_block_pass1, &args); +} + +void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + RUN_TYPE dry_run) { + assert(bsize < BLOCK_SIZES_ALL); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + mbmi->skip = 1; + if (x->force_skip) return; + + struct optimize_ctx ctx; + struct encode_b_args arg = { + cpi, x, &ctx, &mbmi->skip, + NULL, NULL, dry_run, cpi->optimize_seg_arr[mbmi->segment_id] + }; + const AV1_COMMON 
*const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int subsampling_x = pd->subsampling_x; + const int subsampling_y = pd->subsampling_y; + if (plane && !xd->is_chroma_ref) break; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, subsampling_x, subsampling_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const int mi_width = mi_size_wide[plane_bsize]; + const int mi_height = mi_size_high[plane_bsize]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); + const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size]; + const int bw = mi_size_wide[txb_size]; + const int bh = mi_size_high[txb_size]; + int block = 0; + const int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + av1_get_entropy_contexts(plane_bsize, pd, ctx.ta[plane], ctx.tl[plane]); + av1_subtract_plane(x, plane_bsize, plane); + arg.ta = ctx.ta[plane]; + arg.tl = ctx.tl[plane]; + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, subsampling_x, subsampling_y); + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide); + mu_blocks_high = AOMMIN(mi_height, mu_blocks_high); + + for (int idy = 0; idy < mi_height; idy += mu_blocks_high) { + for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) { + int blk_row, blk_col; + const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height); + const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width); + for (blk_row = idy; blk_row < unit_height; blk_row += bh) { + for (blk_col = idx; blk_col < unit_width; blk_col += bw) { + encode_block_inter(plane, block, blk_row, blk_col, plane_bsize, + max_tx_size, &arg, dry_run); + block += step; + } + } + } + } + } +} + +static void encode_block_intra_and_set_context(int plane, int block, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, + arg); + + struct encode_b_args *const args = arg; + MACROBLOCK *x = args->x; + ENTROPY_CONTEXT *a = &args->ta[blk_col]; + ENTROPY_CONTEXT *l = &args->tl[blk_row]; + av1_set_txb_context(x, plane, block, tx_size, a, l); +} + +void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + struct encode_b_args *const args = arg; + const AV1_COMP *const cpi = args->cpi; + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block); + PLANE_TYPE plane_type = get_plane_type(plane); + uint16_t *eob = &p->eobs[block]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + int dummy_rate_cost = 0; + + av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); + + TX_TYPE tx_type = DCT_DCT; + const int bw = mi_size_wide[plane_bsize]; + if (plane == 0 && is_blk_skip(x, plane, blk_row * bw + blk_col)) { + *eob = 0; + p->txb_entropy_ctx[block] = 0; + } else { + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); + + const ENTROPY_CONTEXT *a = &args->ta[blk_col]; + const ENTROPY_CONTEXT *l = 
&args->tl[blk_row];
+    tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+                              cm->features.reduced_tx_set_used);
+    TxfmParam txfm_param;
+    QUANT_PARAM quant_param;
+    const int use_trellis =
+        is_trellis_used(args->enable_optimize_b, args->dry_run);
+    int quant_idx;
+    if (use_trellis)
+      quant_idx = AV1_XFORM_QUANT_FP;
+    else
+      quant_idx =
+          USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
+
+    av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param);
+    av1_setup_quant(tx_size, use_trellis, quant_idx, cpi->oxcf.quant_b_adapt,
+                    &quant_param);
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                      &quant_param);
+
+    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+                    &quant_param);
+
+    // Whether trellis or dropout optimization is required: KEY_BLOCK_OPT_TYPE
+    // governs blocks in intra-only frames, INTRA_BLOCK_OPT_TYPE governs intra
+    // blocks in inter frames.
+    const bool do_trellis = (frame_is_intra_only(cm) &&
+                             (KEY_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+                              KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) ||
+                            (!frame_is_intra_only(cm) &&
+                             (INTRA_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+                              INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT));
+    const bool do_dropout = (frame_is_intra_only(cm) &&
+                             (KEY_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+                              KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) ||
+                            (!frame_is_intra_only(cm) &&
+                             (INTRA_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+                              INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT));
+
+    if (quant_param.use_optimize_b && do_trellis) {
+      TXB_CTX txb_ctx;
+      get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+      av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
+                     args->cpi->sf.rd_sf.trellis_eob_fast, &dummy_rate_cost);
+    }
+    if (do_dropout) {
+      av1_dropout_qcoeff(x, plane, block, tx_size, tx_type,
+                         cm->quant_params.base_qindex);
+    }
+  }
+
+  if (*eob) {
+    av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+                                dst_stride, *eob,
+                                cm->features.reduced_tx_set_used);
+  }
+
+  // TODO(jingning): Temporarily disable txk_type check for eob=0 case.
+  // It is possible that certain collision in hash index would cause
+  // the assertion failure. To further optimize the rate-distortion
+  // performance, we need to re-visit this part and enable this assert
+  // again.
+  if (*eob == 0 && plane == 0) {
+#if 0
+    if (args->cpi->oxcf.aq_mode == NO_AQ
+        && args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
+      assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] ==
+             DCT_DCT);
+    }
+#endif
+    update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+  }
+
+  // For intra mode, skipped blocks are so rare that transmitting skip=1 is
+  // very expensive.
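
As the do_trellis/do_dropout selection above shows, the OPT_TYPE policy constants act as a pair of flags: TRELLIS_OPT enables only trellis, DROPOUT_OPT only dropout, and TRELLIS_DROPOUT_OPT both, with dropout running after trellis to prune stray nonzero coefficients trellis may leave behind. A minimal standalone sketch of that decoding, using a toy mirror of the OPT_TYPE enum declared in encodemb.h (all toy_* names are invented):

#include <stdbool.h>
#include <stdio.h>

/* Toy mirror of the OPT_TYPE policy enum from encodemb.h. */
enum toy_opt_type { TOY_NONE, TOY_TRELLIS, TOY_DROPOUT, TOY_TRELLIS_DROPOUT };

int main(void) {
  for (int t = TOY_NONE; t <= TOY_TRELLIS_DROPOUT; ++t) {
    const bool do_trellis = (t == TOY_TRELLIS || t == TOY_TRELLIS_DROPOUT);
    const bool do_dropout = (t == TOY_DROPOUT || t == TOY_TRELLIS_DROPOUT);
    printf("policy %d: trellis=%d dropout=%d\n", t, do_trellis, do_dropout);
  }
  return 0;
}
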
+ *(args->skip) = 0; + + if (plane == AOM_PLANE_Y && xd->cfl.store_y) { + cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize); + } +} + +void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run, + TRELLIS_OPT_TYPE enable_optimize_b) { + assert(bsize < BLOCK_SIZES_ALL); + const MACROBLOCKD *const xd = &x->e_mbd; + if (plane && !xd->is_chroma_ref) return; + + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 }; + ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 }; + struct encode_b_args arg = { cpi, x, NULL, &(xd->mi[0]->skip), + ta, tl, dry_run, enable_optimize_b }; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + if (enable_optimize_b) { + av1_get_entropy_contexts(plane_bsize, pd, ta, tl); + } + av1_foreach_transformed_block_in_plane( + xd, plane_bsize, plane, encode_block_intra_and_set_context, &arg); +} diff --git a/libs/libaom/src/av1/encoder/encodemb.h b/libs/libaom/src/av1/encoder/encodemb.h new file mode 100644 index 000000000..a337c83db --- /dev/null +++ b/libs/libaom/src/av1/encoder/encodemb.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENCODEMB_H_ +#define AOM_AV1_ENCODER_ENCODEMB_H_ + +#include "config/aom_config.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/txb_common.h" +#include "av1/encoder/block.h" +#include "av1/encoder/tokenize.h" +#ifdef __cplusplus +extern "C" { +#endif + +struct optimize_ctx { + ENTROPY_CONTEXT ta[MAX_MB_PLANE][MAX_MIB_SIZE]; + ENTROPY_CONTEXT tl[MAX_MB_PLANE][MAX_MIB_SIZE]; +}; + +struct encode_b_args { + const struct AV1_COMP *cpi; + MACROBLOCK *x; + struct optimize_ctx *ctx; + int8_t *skip; + ENTROPY_CONTEXT *ta; + ENTROPY_CONTEXT *tl; + RUN_TYPE dry_run; + TRELLIS_OPT_TYPE enable_optimize_b; +}; + +enum { + AV1_XFORM_QUANT_FP = 0, + AV1_XFORM_QUANT_B = 1, + AV1_XFORM_QUANT_DC = 2, + AV1_XFORM_QUANT_SKIP_QUANT, + AV1_XFORM_QUANT_TYPES, +} UENUM1BYTE(AV1_XFORM_QUANT); + +// Available optimization types to optimize the quantized coefficients. +enum { + NONE_OPT = 0, // No optimization. + TRELLIS_OPT = 1, // Trellis optimization. See `av1_optimize_b()`. + DROPOUT_OPT = 2, // Dropout optimization. See `av1_dropout_qcoeff()`. + TRELLIS_DROPOUT_OPT = 3 // Perform dropout after trellis optimization. 
+} UENUM1BYTE(OPT_TYPE);
+
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                   RUN_TYPE dry_run);
+
+void av1_foreach_transformed_block_in_plane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane,
+    foreach_transformed_block_visitor visit, void *arg);
+
+void av1_encode_sby_pass1(struct AV1_COMP *cpi, MACROBLOCK *x,
+                          BLOCK_SIZE bsize);
+
+void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
+                     TX_TYPE tx_type, TxfmParam *txfm_param);
+void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx,
+                     int use_quant_b_adapt, QUANT_PARAM *qparam);
+void av1_setup_qmatrix(const CommonQuantParams *quant_params,
+                       const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+                       TX_TYPE tx_type, QUANT_PARAM *qparam);
+
+void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
+                     int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
+                     QUANT_PARAM *qparam);
+
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
+                   int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                   const TXB_CTX *const txb_ctx, int fast_mode, int *rate_cost);
+
+// This function can be used as (i) a further optimization to reduce the
+// redundancy of quantized coefficients (a.k.a. `qcoeff`) after trellis
+// optimization, or (ii) an alternative to trellis optimization in high-speed
+// compression mode (e.g., real-time mode under speed-6) due to its low time
+// complexity. The rationale is to drop quantized coefficients that are likely
+// redundant because they sit isolated among runs of zeros. NOTE: This
+// algorithm is not as accurate as trellis optimization, since the
+// hyper-parameters are hard-coded rather than searched dynamically. More
+// adaptive logic may improve the performance. This function can be applied to
+// all or part of the block cells.
+// Inputs:
+//   mb: Pointer to the MACROBLOCK to perform dropout on.
+//   plane: Index of the plane to which the target block belongs.
+//   block: Index of the target block.
+//   tx_size: Transform size of the target block.
+//   tx_type: Transform type of the target block. This field is particularly
+//            used to find out the scan order of the block.
+//   qindex: Quantization index used for the target block. In general, all
+//           blocks in the same plane share the same quantization index. This
+//           field is particularly used to determine how many zeros should be
+//           used to drop out a coefficient.
+// Returns:
+//   Nothing will be returned, but `qcoeff`, `dqcoeff`, `eob`, as well as
+//   `txb_entropy_ctx`, which `mb` points to, may be modified by this function.
+void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, + TX_TYPE tx_type, int qindex); + +void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols, + int16_t *diff, ptrdiff_t diff_stride, + const uint8_t *src8, ptrdiff_t src_stride, + const uint8_t *pred8, ptrdiff_t pred_stride); + +void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, + int blk_col, int blk_row, TX_SIZE tx_size); + +void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane); + +static INLINE void av1_set_txb_context(MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l) { + const uint8_t ctx = x->plane[plane].txb_entropy_ctx[block]; + memset(a, ctx, tx_size_wide_unit[tx_size] * sizeof(*a)); + memset(l, ctx, tx_size_high_unit[tx_size] * sizeof(*l)); +} + +void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); + +void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run, + TRELLIS_OPT_TYPE enable_optimize_b); + +static INLINE int is_trellis_used(TRELLIS_OPT_TYPE optimize_b, + RUN_TYPE dry_run) { + if (optimize_b == NO_TRELLIS_OPT) return false; + if (optimize_b == FINAL_PASS_TRELLIS_OPT && dry_run != OUTPUT_ENABLED) + return false; + return true; +} +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODEMB_H_ diff --git a/libs/libaom/src/av1/encoder/encodemv.c b/libs/libaom/src/av1/encoder/encodemv.c new file mode 100644 index 000000000..167e9c0a3 --- /dev/null +++ b/libs/libaom/src/av1/encoder/encodemv.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/common.h" +#include "av1/common/entropymode.h" + +#include "av1/encoder/cost.h" +#include "av1/encoder/encodemv.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/bitops.h" + +static void update_mv_component_stats(int comp, nmv_component *mvcomp, + MvSubpelPrecision precision) { + assert(comp != 0); + int offset; + const int sign = comp < 0; + const int mag = sign ? -comp : comp; + const int mv_class = av1_get_mv_class(mag - 1, &offset); + const int d = offset >> 3; // int mv data + const int fr = (offset >> 1) & 3; // fractional mv data + const int hp = offset & 1; // high precision mv data + + // Sign + update_cdf(mvcomp->sign_cdf, sign, 2); + + // Class + update_cdf(mvcomp->classes_cdf, mv_class, MV_CLASSES); + + // Integer bits + if (mv_class == MV_CLASS_0) { + update_cdf(mvcomp->class0_cdf, d, CLASS0_SIZE); + } else { + const int n = mv_class + CLASS0_BITS - 1; // number of bits + for (int i = 0; i < n; ++i) + update_cdf(mvcomp->bits_cdf[i], (d >> i) & 1, 2); + } + // Fractional bits + if (precision > MV_SUBPEL_NONE) { + aom_cdf_prob *fp_cdf = + mv_class == MV_CLASS_0 ? 
mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf; + update_cdf(fp_cdf, fr, MV_FP_SIZE); + } + + // High precision bit + if (precision > MV_SUBPEL_LOW_PRECISION) { + aom_cdf_prob *hp_cdf = + mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf; + update_cdf(hp_cdf, hp, 2); + } +} + +void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx, + MvSubpelPrecision precision) { + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); + + update_cdf(mvctx->joints_cdf, j, MV_JOINTS); + + if (mv_joint_vertical(j)) + update_mv_component_stats(diff.row, &mvctx->comps[0], precision); + + if (mv_joint_horizontal(j)) + update_mv_component_stats(diff.col, &mvctx->comps[1], precision); +} + +static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp, + MvSubpelPrecision precision) { + assert(comp != 0); + int offset; + const int sign = comp < 0; + const int mag = sign ? -comp : comp; + const int mv_class = av1_get_mv_class(mag - 1, &offset); + const int d = offset >> 3; // int mv data + const int fr = (offset >> 1) & 3; // fractional mv data + const int hp = offset & 1; // high precision mv data + + // Sign + aom_write_symbol(w, sign, mvcomp->sign_cdf, 2); + + // Class + aom_write_symbol(w, mv_class, mvcomp->classes_cdf, MV_CLASSES); + + // Integer bits + if (mv_class == MV_CLASS_0) { + aom_write_symbol(w, d, mvcomp->class0_cdf, CLASS0_SIZE); + } else { + int i; + const int n = mv_class + CLASS0_BITS - 1; // number of bits + for (i = 0; i < n; ++i) + aom_write_symbol(w, (d >> i) & 1, mvcomp->bits_cdf[i], 2); + } + // Fractional bits + if (precision > MV_SUBPEL_NONE) { + aom_write_symbol( + w, fr, + mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf, + MV_FP_SIZE); + } + + // High precision bit + if (precision > MV_SUBPEL_LOW_PRECISION) + aom_write_symbol( + w, hp, mv_class == MV_CLASS_0 ? 
mvcomp->class0_hp_cdf : mvcomp->hp_cdf, + 2); +} + +static void build_nmv_component_cost_table(int *mvcost, + const nmv_component *const mvcomp, + MvSubpelPrecision precision) { + int i, v; + int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE]; + int bits_cost[MV_OFFSET_BITS][2]; + int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE]; + int class0_hp_cost[2], hp_cost[2]; + + av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL); + av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL); + av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, NULL); + for (i = 0; i < MV_OFFSET_BITS; ++i) { + av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], NULL); + } + + for (i = 0; i < CLASS0_SIZE; ++i) + av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i], NULL); + av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL); + + if (precision > MV_SUBPEL_LOW_PRECISION) { + av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL); + av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL); + } + mvcost[0] = 0; + for (v = 1; v <= MV_MAX; ++v) { + int z, c, o, d, e, f, cost = 0; + z = v - 1; + c = av1_get_mv_class(z, &o); + cost += class_cost[c]; + d = (o >> 3); /* int mv data */ + f = (o >> 1) & 3; /* fractional pel mv data */ + e = (o & 1); /* high precision mv data */ + if (c == MV_CLASS_0) { + cost += class0_cost[d]; + } else { + const int b = c + CLASS0_BITS - 1; /* number of bits */ + for (i = 0; i < b; ++i) cost += bits_cost[i][((d >> i) & 1)]; + } + if (precision > MV_SUBPEL_NONE) { + if (c == MV_CLASS_0) { + cost += class0_fp_cost[d][f]; + } else { + cost += fp_cost[f]; + } + if (precision > MV_SUBPEL_LOW_PRECISION) { + if (c == MV_CLASS_0) { + cost += class0_hp_cost[e]; + } else { + cost += hp_cost[e]; + } + } + } + mvcost[v] = cost + sign_cost[0]; + mvcost[-v] = cost + sign_cost[1]; + } +} + +void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, + nmv_context *mvctx, int usehp) { + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); + // If the mv_diff is zero, then we should have used near or nearest instead. + assert(j != MV_JOINT_ZERO); + if (cpi->common.features.cur_frame_force_integer_mv) { + usehp = MV_SUBPEL_NONE; + } + aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS); + if (mv_joint_vertical(j)) + encode_mv_component(w, diff.row, &mvctx->comps[0], usehp); + + if (mv_joint_horizontal(j)) + encode_mv_component(w, diff.col, &mvctx->comps[1], usehp); + + // If auto_mv_step_size is enabled then keep track of the largest + // motion vector component used. + if (cpi->sf.mv_sf.auto_mv_step_size) { + int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3; + cpi->mv_search_params.max_mv_magnitude = + AOMMAX(maxv, cpi->mv_search_params.max_mv_magnitude); + } +} + +void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, + nmv_context *mvctx) { + // DV and ref DV should not have sub-pel. 
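
The component coding above (encode_mv_component() and the cost-table builder) shares one magnitude decomposition: a signed component in 1/8-pel units is split into a sign, a class, and an offset whose bit 0 is the 1/8-pel bit, bits 1-2 the fractional bits, and bits 3+ the integer part. A standalone round-trip sketch of that decomposition follows; the toy_* helpers are invented, and the constants assume CLASS0_SIZE == 2 as in AV1 (the class-10 cap is omitted for brevity).

#include <stdio.h>

/* floor(log2(n)) for n > 0. */
static int toy_msb(unsigned int n) {
  int b = 0;
  while (n >>= 1) ++b;
  return b;
}

/* First magnitude-minus-one value covered by class c. */
static int toy_class_base(int c) { return c ? (2 << (c + 2)) : 0; }

int main(void) {
  const int comp = -37; /* one MV-difference component, 1/8-pel units */
  const int sign = comp < 0;
  const int mag = sign ? -comp : comp;
  const int z = mag - 1; /* magnitude 0 is signalled by the joint type */
  const int c = (z >> 3) ? toy_msb(z >> 3) : 0; /* MV class */
  const int offset = z - toy_class_base(c);
  printf("sign=%d class=%d int=%d frac=%d hp=%d\n", sign, c, offset >> 3,
         (offset >> 1) & 3, offset & 1);
  /* Reassemble to show the decomposition is lossless. */
  const int back = toy_class_base(c) + offset + 1;
  printf("reassembled: %d\n", sign ? -back : back);
  return 0;
}
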
+ assert((mv->col & 7) == 0); + assert((mv->row & 7) == 0); + assert((ref->col & 7) == 0); + assert((ref->row & 7) == 0); + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); + + aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS); + if (mv_joint_vertical(j)) + encode_mv_component(w, diff.row, &mvctx->comps[0], MV_SUBPEL_NONE); + + if (mv_joint_horizontal(j)) + encode_mv_component(w, diff.col, &mvctx->comps[1], MV_SUBPEL_NONE); +} + +void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context *ctx, + MvSubpelPrecision precision) { + av1_cost_tokens_from_cdf(mvjoint, ctx->joints_cdf, NULL); + build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], precision); + build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], precision); +} + +int_mv av1_get_ref_mv_from_stack(int ref_idx, + const MV_REFERENCE_FRAME *ref_frame, + int ref_mv_idx, + const MB_MODE_INFO_EXT *mbmi_ext) { + const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const CANDIDATE_MV *curr_ref_mv_stack = + mbmi_ext->ref_mv_stack[ref_frame_type]; + + if (ref_frame[1] > INTRA_FRAME) { + assert(ref_idx == 0 || ref_idx == 1); + return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv + : curr_ref_mv_stack[ref_mv_idx].this_mv; + } + + assert(ref_idx == 0); + return ref_mv_idx < mbmi_ext->ref_mv_count[ref_frame_type] + ? curr_ref_mv_stack[ref_mv_idx].this_mv + : mbmi_ext->global_mvs[ref_frame_type]; +} + +int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int ref_mv_idx = mbmi->ref_mv_idx; + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { + assert(has_second_ref(mbmi)); + ref_mv_idx += 1; + } + return av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx, + x->mbmi_ext); +} + +void av1_find_best_ref_mvs_from_stack(int allow_hp, + const MB_MODE_INFO_EXT *mbmi_ext, + MV_REFERENCE_FRAME ref_frame, + int_mv *nearest_mv, int_mv *near_mv, + int is_integer) { + const int ref_idx = 0; + MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME }; + *nearest_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 0, mbmi_ext); + lower_mv_precision(&nearest_mv->as_mv, allow_hp, is_integer); + *near_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 1, mbmi_ext); + lower_mv_precision(&near_mv->as_mv, allow_hp, is_integer); +} diff --git a/libs/libaom/src/av1/encoder/encodemv.h b/libs/libaom/src/av1/encoder/encodemv.h new file mode 100644 index 000000000..0d130143e --- /dev/null +++ b/libs/libaom/src/av1/encoder/encodemv.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_ENCODEMV_H_ +#define AOM_AV1_ENCODER_ENCODEMV_H_ + +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, + nmv_context *mvctx, int usehp); + +void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx, + MvSubpelPrecision precision); + +void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context *mvctx, + MvSubpelPrecision precision); + +void av1_update_mv_count(ThreadData *td); + +void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, + nmv_context *mvctx); +int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx); +int_mv av1_get_ref_mv_from_stack(int ref_idx, + const MV_REFERENCE_FRAME *ref_frame, + int ref_mv_idx, + const MB_MODE_INFO_EXT *mbmi_ext); +void av1_find_best_ref_mvs_from_stack(int allow_hp, + const MB_MODE_INFO_EXT *mbmi_ext, + MV_REFERENCE_FRAME ref_frame, + int_mv *nearest_mv, int_mv *near_mv, + int is_integer); + +static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) { + // row: Z col: Z | MV_JOINT_ZERO (0) + // row: Z col: NZ | MV_JOINT_HNZVZ (1) + // row: NZ col: Z | MV_JOINT_HZVNZ (2) + // row: NZ col: NZ | MV_JOINT_HNZVNZ (3) + return (!!mv->col) | ((!!mv->row) << 1); +} + +static INLINE int av1_mv_class_base(MV_CLASS_TYPE c) { + return c ? CLASS0_SIZE << (c + 2) : 0; +} + +// If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0. +static INLINE uint8_t av1_log_in_base_2(unsigned int n) { + // get_msb() is only valid when n != 0. + return n == 0 ? 0 : get_msb(n); +} + +static INLINE MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) { + const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096) + ? MV_CLASS_10 + : (MV_CLASS_TYPE)av1_log_in_base_2(z >> 3); + if (offset) *offset = z - av1_mv_class_base(c); + return c; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODEMV_H_ diff --git a/libs/libaom/src/av1/encoder/encoder.c b/libs/libaom/src/av1/encoder/encoder.c new file mode 100644 index 000000000..6406afd4a --- /dev/null +++ b/libs/libaom/src/av1/encoder/encoder.c @@ -0,0 +1,7187 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#if CONFIG_DENOISE +#include "aom_dsp/grain_table.h" +#include "aom_dsp/noise_util.h" +#include "aom_dsp/noise_model.h" +#endif +#include "aom_dsp/psnr.h" +#if CONFIG_INTERNAL_STATS +#include "aom_dsp/ssim.h" +#endif +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" +#include "aom_scale/aom_scale.h" +#if CONFIG_BITSTREAM_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#include "av1/common/alloccommon.h" +#include "av1/common/cdef.h" +#include "av1/common/filter.h" +#include "av1/common/idct.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/resize.h" +#include "av1/common/tile_common.h" + +#include "av1/encoder/av1_multi_thread.h" +#include "av1/encoder/aq_complexity.h" +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/bitstream.h" +#include "av1/encoder/context_tree.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encode_strategy.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/grain_test_vectors.h" +#include "av1/encoder/hash_motion.h" +#include "av1/encoder/mv_prec.h" +#include "av1/encoder/pass2_strategy.h" +#include "av1/encoder/picklpf.h" +#include "av1/encoder/pickrst.h" +#include "av1/encoder/random.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/segmentation.h" +#include "av1/encoder/speed_features.h" +#include "av1/encoder/tpl_model.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/var_based_part.h" + +#if CONFIG_TUNE_VMAF +#include "av1/encoder/tune_vmaf.h" +#endif + +#define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7 + +#if CONFIG_ENTROPY_STATS +FRAME_COUNTS aggregate_fc; +#endif // CONFIG_ENTROPY_STATS + +#define AM_SEGMENT_ID_INACTIVE 7 +#define AM_SEGMENT_ID_ACTIVE 0 + +// #define OUTPUT_YUV_REC +#ifdef OUTPUT_YUV_SKINMAP +FILE *yuv_skinmap_file = NULL; +#endif +#ifdef OUTPUT_YUV_REC +FILE *yuv_rec_file; +#define FILE_NAME_LEN 100 +#endif + +const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES] = { + { { 221, 189, 214, 292, 0, 0, 0, 0, 0, 2, 38, 68, 0, 0, 0, 0 }, + { 262, 203, 216, 239, 0, 0, 0, 0, 0, 1, 37, 66, 0, 0, 0, 0 }, + { 315, 231, 239, 226, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 222, 188, 214, 287, 0, 0, 0, 0, 0, 2, 50, 61, 0, 0, 0, 0 }, + { 256, 182, 205, 282, 0, 0, 0, 0, 0, 2, 21, 76, 0, 0, 0, 0 }, + { 281, 214, 217, 222, 0, 0, 0, 0, 0, 1, 48, 41, 0, 0, 0, 0 }, + { 263, 194, 225, 225, 0, 0, 0, 0, 0, 2, 15, 100, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 170, 192, 242, 293, 0, 0, 0, 0, 0, 1, 68, 58, 0, 0, 0, 0 }, + { 199, 210, 213, 291, 0, 0, 0, 0, 0, 1, 14, 96, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { { 106, 69, 107, 278, 9, 15, 20, 45, 49, 23, 23, 88, 36, 74, 25, 57 }, + { 105, 72, 81, 98, 45, 49, 47, 50, 56, 72, 30, 81, 33, 95, 27, 83 }, + { 211, 105, 109, 120, 57, 62, 43, 49, 52, 58, 42, 116, 0, 0, 0, 0 }, + { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 131, 57, 98, 172, 19, 40, 37, 64, 69, 22, 41, 52, 51, 77, 35, 59 }, + { 176, 83, 93, 202, 22, 24, 28, 47, 50, 16, 12, 93, 26, 76, 17, 59 }, + { 136, 72, 89, 95, 46, 59, 47, 56, 61, 68, 35, 51, 32, 82, 26, 69 }, + { 122, 80, 87, 105, 49, 47, 46, 46, 57, 52, 13, 90, 19, 103, 15, 93 }, + { 1009, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0 }, + { 1011, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 202, 20, 84, 114, 14, 60, 41, 79, 99, 21, 41, 15, 50, 84, 34, 66 }, + { 196, 44, 23, 72, 30, 22, 28, 57, 67, 13, 4, 165, 15, 148, 9, 131 }, + { 882, 0, 0, 0, 0, 0, 0, 0, 0, 142, 0, 0, 0, 0, 0, 0 }, + { 840, 0, 0, 0, 0, 0, 0, 0, 0, 184, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } }, + { { 213, 110, 141, 269, 12, 16, 15, 19, 21, 11, 38, 68, 22, 29, 16, 24 }, + { 216, 119, 128, 143, 38, 41, 26, 30, 31, 30, 42, 70, 23, 36, 19, 32 }, + { 367, 149, 154, 154, 38, 35, 17, 21, 21, 10, 22, 36, 0, 0, 0, 0 }, + { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 219, 96, 127, 191, 21, 40, 25, 32, 34, 18, 45, 45, 33, 39, 26, 33 }, + { 296, 99, 122, 198, 23, 21, 19, 24, 25, 13, 20, 64, 23, 32, 18, 27 }, + { 275, 128, 142, 143, 35, 48, 23, 30, 29, 18, 42, 36, 18, 23, 14, 20 }, + { 239, 132, 166, 175, 36, 27, 19, 21, 24, 14, 13, 85, 9, 31, 8, 25 }, + { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, + { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 
}, + { 309, 25, 79, 59, 25, 80, 34, 53, 61, 25, 49, 23, 43, 64, 36, 59 }, + { 270, 57, 40, 54, 50, 42, 41, 53, 56, 28, 17, 81, 45, 86, 34, 70 }, + { 1005, 0, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0 }, + { 992, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { { 133, 63, 55, 83, 57, 87, 58, 72, 68, 16, 24, 35, 29, 105, 25, 114 }, + { 131, 75, 74, 60, 71, 77, 65, 66, 73, 33, 21, 79, 20, 83, 18, 78 }, + { 276, 95, 82, 58, 86, 93, 63, 60, 64, 17, 38, 92, 0, 0, 0, 0 }, + { 1006, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 147, 49, 75, 78, 50, 97, 60, 67, 76, 17, 42, 35, 31, 93, 27, 80 }, + { 157, 49, 58, 75, 61, 52, 56, 67, 69, 12, 15, 79, 24, 119, 11, 120 }, + { 178, 69, 83, 77, 69, 85, 72, 77, 77, 20, 35, 40, 25, 48, 23, 46 }, + { 174, 55, 64, 57, 73, 68, 62, 61, 75, 15, 12, 90, 17, 99, 16, 86 }, + { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 }, + { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 266, 31, 63, 64, 21, 52, 39, 54, 63, 30, 52, 31, 48, 89, 46, 75 }, + { 272, 26, 32, 44, 29, 31, 32, 53, 51, 13, 13, 88, 22, 153, 16, 149 }, + { 923, 0, 0, 0, 0, 0, 0, 0, 0, 101, 0, 0, 0, 0, 0, 0 }, + { 969, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } }, + { { 158, 92, 125, 298, 12, 15, 20, 29, 31, 12, 29, 67, 34, 44, 23, 35 }, + { 147, 94, 103, 123, 45, 48, 38, 41, 46, 48, 37, 78, 33, 63, 27, 53 }, + { 268, 126, 125, 136, 54, 53, 31, 38, 38, 33, 35, 87, 0, 0, 0, 0 }, + { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 159, 72, 103, 194, 20, 35, 37, 50, 56, 21, 39, 40, 51, 61, 38, 48 }, + { 259, 86, 95, 188, 32, 20, 25, 34, 37, 13, 12, 85, 25, 53, 17, 43 }, + { 189, 99, 113, 123, 45, 59, 37, 46, 48, 44, 39, 41, 31, 47, 26, 37 }, + { 175, 110, 113, 128, 58, 38, 33, 33, 43, 29, 13, 100, 14, 68, 12, 57 }, + 
{ 1017, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0 }, + { 1019, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 208, 22, 84, 101, 21, 59, 44, 70, 90, 25, 59, 13, 64, 67, 49, 48 }, + { 277, 52, 32, 63, 43, 26, 33, 48, 54, 11, 6, 130, 18, 119, 11, 101 }, + { 963, 0, 0, 0, 0, 0, 0, 0, 0, 61, 0, 0, 0, 0, 0, 0 }, + { 979, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } +}; + +const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 106, 90, 90, 97, 67, 59, 70, 28, + 30, 38, 16, 16, 16, 0, 0, 44, 50, 26, 25 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 98, 93, 97, 68, 82, 85, 33, 30, + 33, 16, 16, 16, 16, 0, 0, 43, 37, 26, 16 }, + { 0, 0, 0, 91, 80, 76, 78, 55, 49, 24, 16, + 16, 16, 16, 16, 16, 0, 0, 29, 45, 16, 38 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 103, 89, 89, 89, 62, 63, 76, 34, + 35, 32, 19, 16, 16, 0, 0, 49, 55, 29, 19 } +}; + +const int default_warped_probs[FRAME_UPDATE_TYPES] = { 64, 64, 64, 64, + 64, 64, 64 }; + +// TODO(yunqing): the default probs can be trained later from better +// performance. +const int default_switchable_interp_probs[FRAME_UPDATE_TYPES] + [SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS] = { + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 
}, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } } + }; + +static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) { + switch (mode) { + case NORMAL: + *hr = 1; + *hs = 1; + break; + case FOURFIVE: + *hr = 4; + *hs = 5; + break; + case THREEFIVE: + *hr = 3; + *hs = 5; + break; + case ONETWO: + *hr = 1; + *hs = 2; + break; + default: + *hr = 1; + *hs = 1; + assert(0); + break; + } +} + +// Mark all inactive blocks as active. Other segmentation features may be set +// so memset cannot be used, instead only inactive blocks should be reset. +static void suppress_active_map(AV1_COMP *cpi) { + unsigned char *const seg_map = cpi->enc_seg.map; + int i; + if (cpi->active_map.enabled || cpi->active_map.update) + for (i = 0; + i < cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; ++i) + if (seg_map[i] == AM_SEGMENT_ID_INACTIVE) + seg_map[i] = AM_SEGMENT_ID_ACTIVE; +} + +static void apply_active_map(AV1_COMP *cpi) { + struct segmentation *const seg = &cpi->common.seg; + unsigned char *const seg_map = cpi->enc_seg.map; + const unsigned char *const active_map = cpi->active_map.map; + int i; + + assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE); + + if (frame_is_intra_only(&cpi->common)) { + cpi->active_map.enabled = 0; + cpi->active_map.update = 1; + } + + if (cpi->active_map.update) { + if (cpi->active_map.enabled) { + for (i = 0; + i < cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; + ++i) + if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i]; + av1_enable_segmentation(seg); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V); + + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H, + -MAX_LOOP_FILTER); + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V, + -MAX_LOOP_FILTER); + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U, + -MAX_LOOP_FILTER); + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V, + -MAX_LOOP_FILTER); + } else { + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V); + if (seg->enabled) { + seg->update_data = 1; + seg->update_map = 1; + } + } + cpi->active_map.update = 0; + } +} + +int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols) { + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + if (rows == mi_params->mb_rows && cols == mi_params->mb_cols) { + unsigned char *const active_map_8x8 = cpi->active_map.map; + const int mi_rows = mi_params->mi_rows; + const int mi_cols = mi_params->mi_cols; + const int row_scale = mi_size_high[BLOCK_16X16] == 2 ? 
1 : 2;
+    const int col_scale = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2;
+    cpi->active_map.update = 1;
+    if (new_map_16x16) {
+      int r, c;
+      for (r = 0; r < mi_rows; ++r) {
+        for (c = 0; c < mi_cols; ++c) {
+          active_map_8x8[r * mi_cols + c] =
+              new_map_16x16[(r >> row_scale) * cols + (c >> col_scale)]
+                  ? AM_SEGMENT_ID_ACTIVE
+                  : AM_SEGMENT_ID_INACTIVE;
+        }
+      }
+      cpi->active_map.enabled = 1;
+    } else {
+      cpi->active_map.enabled = 0;
+    }
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+                       int cols) {
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  if (rows == mi_params->mb_rows && cols == mi_params->mb_cols &&
+      new_map_16x16) {
+    unsigned char *const seg_map_8x8 = cpi->enc_seg.map;
+    const int mi_rows = mi_params->mi_rows;
+    const int mi_cols = mi_params->mi_cols;
+    const int row_scale = mi_size_high[BLOCK_16X16] == 2 ? 1 : 2;
+    const int col_scale = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2;
+
+    memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);
+    if (cpi->active_map.enabled) {
+      int r, c;
+      for (r = 0; r < mi_rows; ++r) {
+        for (c = 0; c < mi_cols; ++c) {
+          // Cyclic refresh segments are considered active despite not having
+          // AM_SEGMENT_ID_ACTIVE
+          new_map_16x16[(r >> row_scale) * cols + (c >> col_scale)] |=
+              seg_map_8x8[r * mi_cols + c] != AM_SEGMENT_ID_INACTIVE;
+        }
+      }
+    }
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+// Compute the energy of the horizontal frequency components in a frame
+// by calculating the 16x4 horizontal DCT. This is used to decide the
+// superresolution parameters.
+static void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
+  uint64_t freq_energy[16] = { 0 };
+  const YV12_BUFFER_CONFIG *buf = cpi->source;
+  const int bd = cpi->td.mb.e_mbd.bd;
+  const int width = buf->y_crop_width;
+  const int height = buf->y_crop_height;
+  DECLARE_ALIGNED(16, int32_t, coeff[16 * 4]);
+  int n = 0;
+  memset(freq_energy, 0, sizeof(freq_energy));
+  if (buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const int16_t *src16 = (const int16_t *)CONVERT_TO_SHORTPTR(buf->y_buffer);
+    for (int i = 0; i < height - 4; i += 4) {
+      for (int j = 0; j < width - 16; j += 16) {
+        av1_fwd_txfm2d_16x4(src16 + i * buf->y_stride + j, coeff, buf->y_stride,
+                            H_DCT, bd);
+        for (int k = 1; k < 16; ++k) {
+          const uint64_t this_energy =
+              ((int64_t)coeff[k] * coeff[k]) +
+              ((int64_t)coeff[k + 16] * coeff[k + 16]) +
+              ((int64_t)coeff[k + 32] * coeff[k + 32]) +
+              ((int64_t)coeff[k + 48] * coeff[k + 48]);
+          freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8));
+        }
+        n++;
+      }
+    }
+  } else {
+    assert(bd == 8);
+    DECLARE_ALIGNED(16, int16_t, src16[16 * 4]);
+    for (int i = 0; i < height - 4; i += 4) {
+      for (int j = 0; j < width - 16; j += 16) {
+        for (int ii = 0; ii < 4; ++ii)
+          for (int jj = 0; jj < 16; ++jj)
+            src16[ii * 16 + jj] =
+                buf->y_buffer[(i + ii) * buf->y_stride + (j + jj)];
+        av1_fwd_txfm2d_16x4(src16, coeff, 16, H_DCT, bd);
+        for (int k = 1; k < 16; ++k) {
+          const uint64_t this_energy =
+              ((int64_t)coeff[k] * coeff[k]) +
+              ((int64_t)coeff[k + 16] * coeff[k + 16]) +
+              ((int64_t)coeff[k + 32] * coeff[k + 32]) +
+              ((int64_t)coeff[k + 48] * coeff[k + 48]);
+          freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2);
+        }
+        n++;
+      }
+    }
+  }
+  if (n) {
+    for (int k = 1; k < 16; ++k) energy[k] = (double)freq_energy[k] / n;
+    // Convert to cumulative energy
+    for (int k = 14; k > 0; --k) energy[k] += energy[k + 1];
+  } else {
+    for (int k = 1; k < 16; ++k) energy[k] = 1e+20;
+  }
+}
+
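analyze_hor_freq() finishes by converting per-frequency averages into cumulative tail sums: after the backward loop, energy[k] holds the total energy at horizontal frequency k and above, which the superres logic can test against a threshold. A standalone sketch of that accumulation step, with made-up energy values rather than real encoder output:

#include <stdio.h>

int main(void) {
  /* energy[k]: mean energy of horizontal frequency k (toy numbers). */
  double energy[16] = { 0,   9.0, 7.5, 6.0, 4.0, 3.0, 2.0, 1.5,
                        1.0, 0.8, 0.6, 0.4, 0.3, 0.2, 0.1, 0.05 };
  /* Same backward loop as above: energy[k] becomes the sum over >= k. */
  for (int k = 14; k > 0; --k) energy[k] += energy[k + 1];
  /* Little energy in the high tail suggests the frame can be downscaled
     (superres) without losing much detail. */
  for (int k = 1; k < 16; ++k) printf("E[>=%2d] = %.2f\n", k, energy[k]);
  return 0;
}
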
+static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) { + const AV1_COMMON *const cm = &cpi->common; + + if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_64X64) + return BLOCK_64X64; + if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128) + return BLOCK_128X128; + + assert(cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC); + + if (cpi->svc.number_spatial_layers > 1) { + // Use the configured size (top resolution) for spatial layers. + return AOMMIN(cpi->oxcf.width, cpi->oxcf.height) > 480 ? BLOCK_128X128 + : BLOCK_64X64; + } + + // TODO(any): Possibly could improve this with a heuristic. + // When superres / resize is on, 'cm->width / height' can change between + // calls, so we don't apply this heuristic there. + // Things break if superblock size changes between the first pass and second + // pass encoding, which is why this heuristic is not configured as a + // speed-feature. + if (cpi->oxcf.superres_mode == SUPERRES_NONE && + cpi->oxcf.resize_mode == RESIZE_NONE && cpi->oxcf.speed >= 1) { + return AOMMIN(cm->width, cm->height) > 480 ? BLOCK_128X128 : BLOCK_64X64; + } + + return BLOCK_128X128; +} + +static void setup_frame(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + // Set up entropy context depending on frame type. The decoder mandates + // the use of the default context, index 0, for keyframes and inter + // frames where the error_resilient_mode or intra_only flag is set. For + // other inter-frames the encoder currently uses only two contexts; + // context 1 for ALTREF frames and context 0 for the others. + + if (frame_is_intra_only(cm) || cm->features.error_resilient_mode || + cpi->ext_flags.use_primary_ref_none) { + av1_setup_past_independence(cm); + } + + if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) || + frame_is_sframe(cm)) { + if (!cpi->seq_params_locked) { + set_sb_size(&cm->seq_params, select_sb_size(cpi)); + } + } else { + const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm); + if (primary_ref_buf == NULL) { + av1_setup_past_independence(cm); + cm->seg.update_map = 1; + cm->seg.update_data = 1; + } else { + *cm->fc = primary_ref_buf->frame_context; + } + } + + av1_zero(cm->cur_frame->interp_filter_selected); + cm->prev_frame = get_primary_ref_frame_buf(cm); + cpi->vaq_refresh = 0; +} + +static void set_mb_mi(CommonModeInfoParams *mi_params, int width, int height) { + // Ensure that the decoded width and height are both multiples of + // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if + // subsampling is used). + // This simplifies the implementation of various experiments, + // eg. cdef, which operates on units of 8x8 luma pixels. 
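
The alignment performed just below rounds the coded dimensions up to multiples of 8 luma pixels and then converts them to mode-info units of 4x4 pixels (MI_SIZE_LOG2 == 2). A standalone sketch of that arithmetic; the macro body is reproduced as assumed from aom_dsp/aom_dsp_common.h, so verify against the tree:

#include <stdio.h>

/* Round `value` up to a multiple of 2^n (assumed aom definition). */
#define ALIGN_POWER_OF_TWO(value, n) \
  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

int main(void) {
  const int width = 1918, height = 817; /* odd-ball frame size */
  const int aligned_w = ALIGN_POWER_OF_TWO(width, 3);  /* -> 1920 */
  const int aligned_h = ALIGN_POWER_OF_TWO(height, 3); /* -> 824 */
  /* One mode-info unit covers 4x4 luma pixels. */
  printf("mi_cols=%d mi_rows=%d\n", aligned_w >> 2, aligned_h >> 2);
  return 0;
}
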
+ const int aligned_width = ALIGN_POWER_OF_TWO(width, 3); + const int aligned_height = ALIGN_POWER_OF_TWO(height, 3); + + mi_params->mi_cols = aligned_width >> MI_SIZE_LOG2; + mi_params->mi_rows = aligned_height >> MI_SIZE_LOG2; + mi_params->mi_stride = calc_mi_size(mi_params->mi_cols); + + mi_params->mb_cols = (mi_params->mi_cols + 2) >> 2; + mi_params->mb_rows = (mi_params->mi_rows + 2) >> 2; + mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols; + + const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + mi_params->mi_alloc_stride = + (mi_params->mi_stride + mi_alloc_size_1d - 1) / mi_alloc_size_1d; + + assert(mi_size_wide[mi_params->mi_alloc_bsize] == + mi_size_high[mi_params->mi_alloc_bsize]); + +#if CONFIG_LPF_MASK + av1_alloc_loop_filter_mask(mi_params); +#endif +} + +static void enc_set_mb_mi(CommonModeInfoParams *mi_params, int width, + int height) { + const int is_4k_or_larger = AOMMIN(width, height) >= 2160; + mi_params->mi_alloc_bsize = is_4k_or_larger ? BLOCK_8X8 : BLOCK_4X4; + + set_mb_mi(mi_params, width, height); +} + +static void stat_stage_set_mb_mi(CommonModeInfoParams *mi_params, int width, + int height) { + mi_params->mi_alloc_bsize = BLOCK_16X16; + + set_mb_mi(mi_params, width, height); +} + +static void enc_setup_mi(CommonModeInfoParams *mi_params) { + const int mi_grid_size = + mi_params->mi_stride * calc_mi_size(mi_params->mi_rows); + memset(mi_params->mi_alloc, 0, + mi_params->mi_alloc_size * sizeof(*mi_params->mi_alloc)); + memset(mi_params->mi_grid_base, 0, + mi_grid_size * sizeof(*mi_params->mi_grid_base)); + memset(mi_params->tx_type_map, 0, + mi_grid_size * sizeof(*mi_params->tx_type_map)); +} + +static void enc_free_mi(CommonModeInfoParams *mi_params) { + aom_free(mi_params->mi_alloc); + mi_params->mi_alloc = NULL; + aom_free(mi_params->mi_grid_base); + mi_params->mi_grid_base = NULL; + mi_params->mi_alloc_size = 0; + aom_free(mi_params->tx_type_map); + mi_params->tx_type_map = NULL; +} + +void av1_initialize_enc(void) { + av1_rtcd(); + aom_dsp_rtcd(); + aom_scale_rtcd(); + av1_init_intra_predictors(); + av1_init_me_luts(); + av1_rc_init_minq_luts(); + av1_init_wedge_masks(); +} + +static void dealloc_context_buffers_ext(MBMIExtFrameBufferInfo *mbmi_ext_info) { + if (mbmi_ext_info->frame_base) { + aom_free(mbmi_ext_info->frame_base); + mbmi_ext_info->frame_base = NULL; + mbmi_ext_info->alloc_size = 0; + } +} + +static void alloc_context_buffers_ext(AV1_COMMON *cm, + MBMIExtFrameBufferInfo *mbmi_ext_info) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + const int mi_alloc_rows = + (mi_params->mi_rows + mi_alloc_size_1d - 1) / mi_alloc_size_1d; + const int mi_alloc_cols = + (mi_params->mi_cols + mi_alloc_size_1d - 1) / mi_alloc_size_1d; + const int new_ext_mi_size = mi_alloc_rows * mi_alloc_cols; + + if (new_ext_mi_size > mbmi_ext_info->alloc_size) { + dealloc_context_buffers_ext(mbmi_ext_info); + CHECK_MEM_ERROR( + cm, mbmi_ext_info->frame_base, + aom_calloc(new_ext_mi_size, sizeof(*mbmi_ext_info->frame_base))); + mbmi_ext_info->alloc_size = new_ext_mi_size; + } + // The stride needs to be updated regardless of whether new allocation + // happened or not. 
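
alloc_context_buffers_ext() above follows a grow-only pattern: it reallocates only when the required element count exceeds the recorded capacity, while the stride assignment that follows runs on every call because a frame can change shape without growing. A standalone sketch of the same pattern using plain calloc/free (the ext_buf type and names are invented, not libaom APIs):

#include <stdlib.h>

typedef struct {
  int *base;
  int alloc_size; /* capacity, in elements */
  int stride;     /* elements per row for the current frame shape */
} ext_buf;

static int ext_buf_resize(ext_buf *buf, int rows, int cols) {
  const int needed = rows * cols;
  if (needed > buf->alloc_size) {
    free(buf->base);
    buf->base = calloc(needed, sizeof(*buf->base));
    if (!buf->base) {
      buf->alloc_size = 0;
      return -1;
    }
    buf->alloc_size = needed;
  }
  buf->stride = cols; /* refreshed even when no reallocation happened */
  return 0;
}

int main(void) {
  ext_buf buf = { 0, 0, 0 };
  ext_buf_resize(&buf, 30, 40); /* allocates 1200 elements */
  ext_buf_resize(&buf, 40, 30); /* same area: no realloc, stride changes */
  free(buf.base);
  return 0;
}
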
+  mbmi_ext_info->stride = mi_alloc_cols;
+}
+
+static void reset_film_grain_chroma_params(aom_film_grain_t *pars) {
+  pars->num_cr_points = 0;
+  pars->cr_mult = 0;
+  pars->cr_luma_mult = 0;
+  memset(pars->scaling_points_cr, 0, sizeof(pars->scaling_points_cr));
+  memset(pars->ar_coeffs_cr, 0, sizeof(pars->ar_coeffs_cr));
+  pars->num_cb_points = 0;
+  pars->cb_mult = 0;
+  pars->cb_luma_mult = 0;
+  pars->chroma_scaling_from_luma = 0;
+  memset(pars->scaling_points_cb, 0, sizeof(pars->scaling_points_cb));
+  memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb));
+}
+
+static void update_film_grain_parameters(struct AV1_COMP *cpi,
+                                         const AV1EncoderConfig *oxcf) {
+  AV1_COMMON *const cm = &cpi->common;
+  cpi->oxcf = *oxcf;
+
+  if (cpi->film_grain_table) {
+    aom_film_grain_table_free(cpi->film_grain_table);
+    aom_free(cpi->film_grain_table);
+    cpi->film_grain_table = NULL;
+  }
+
+  if (oxcf->film_grain_test_vector) {
+    cm->seq_params.film_grain_params_present = 1;
+    if (cm->current_frame.frame_type == KEY_FRAME) {
+      memcpy(&cm->film_grain_params,
+             film_grain_test_vectors + oxcf->film_grain_test_vector - 1,
+             sizeof(cm->film_grain_params));
+      if (oxcf->monochrome)
+        reset_film_grain_chroma_params(&cm->film_grain_params);
+      cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
+      if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) {
+        cm->film_grain_params.clip_to_restricted_range = 0;
+      }
+    }
+  } else if (oxcf->film_grain_table_filename) {
+    cm->seq_params.film_grain_params_present = 1;
+
+    cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
+    memset(cpi->film_grain_table, 0, sizeof(aom_film_grain_table_t));
+
+    aom_film_grain_table_read(cpi->film_grain_table,
+                              oxcf->film_grain_table_filename, &cm->error);
+  } else {
+#if CONFIG_DENOISE
+    cm->seq_params.film_grain_params_present = (cpi->oxcf.noise_level > 0);
+#else
+    cm->seq_params.film_grain_params_present = 0;
+#endif
+    memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
+  }
+}
+
+static void dealloc_compressor_data(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+
+  dealloc_context_buffers_ext(&cpi->mbmi_ext_info);
+
+  aom_free(cpi->tile_data);
+  cpi->tile_data = NULL;
+
+  // Delete segmentation map
+  aom_free(cpi->enc_seg.map);
+  cpi->enc_seg.map = NULL;
+
+  av1_cyclic_refresh_free(cpi->cyclic_refresh);
+  cpi->cyclic_refresh = NULL;
+
+  aom_free(cpi->active_map.map);
+  cpi->active_map.map = NULL;
+
+  aom_free(cpi->ssim_rdmult_scaling_factors);
+  cpi->ssim_rdmult_scaling_factors = NULL;
+
+  aom_free(cpi->tpl_rdmult_scaling_factors);
+  cpi->tpl_rdmult_scaling_factors = NULL;
+
+  aom_free(cpi->tpl_sb_rdmult_scaling_factors);
+  cpi->tpl_sb_rdmult_scaling_factors = NULL;
+
+#if CONFIG_TUNE_VMAF
+  aom_free(cpi->vmaf_rdmult_scaling_factors);
+  cpi->vmaf_rdmult_scaling_factors = NULL;
+#endif
+
+  aom_free(cpi->td.mb.above_pred_buf);
+  cpi->td.mb.above_pred_buf = NULL;
+
+  aom_free(cpi->td.mb.left_pred_buf);
+  cpi->td.mb.left_pred_buf = NULL;
+
+  aom_free(cpi->td.mb.wsrc_buf);
+  cpi->td.mb.wsrc_buf = NULL;
+
+  aom_free(cpi->td.mb.inter_modes_info);
+  cpi->td.mb.inter_modes_info = NULL;
+
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++) {
+      aom_free(cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j]);
+      cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j] = NULL;
+    }
+  aom_free(cpi->td.mb.mask_buf);
+  cpi->td.mb.mask_buf = NULL;
+
+  aom_free(cm->tpl_mvs);
+  cm->tpl_mvs = NULL;
+
+  aom_free(cpi->td.mb.mbmi_ext);
+  cpi->td.mb.mbmi_ext = NULL;
+
+  if
(cpi->td.vt64x64) { + aom_free(cpi->td.vt64x64); + cpi->td.vt64x64 = NULL; + } + + av1_free_ref_frame_buffers(cm->buffer_pool); + av1_free_txb_buf(cpi); + av1_free_context_buffers(cm); + + aom_free_frame_buffer(&cpi->last_frame_uf); + av1_free_restoration_buffers(cm); + aom_free_frame_buffer(&cpi->trial_frame_rst); + aom_free_frame_buffer(&cpi->scaled_source); + aom_free_frame_buffer(&cpi->scaled_last_source); + aom_free_frame_buffer(&cpi->alt_ref_buffer); + av1_lookahead_destroy(cpi->lookahead); + + aom_free(cpi->tile_tok[0][0]); + cpi->tile_tok[0][0] = 0; + + aom_free(cpi->tplist[0][0]); + cpi->tplist[0][0] = NULL; + + av1_free_pc_tree(cpi, &cpi->td, num_planes, cm->seq_params.sb_size); + + aom_free(cpi->td.mb.palette_buffer); + av1_release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer); + aom_free(cpi->td.mb.tmp_conv_dst); + for (int j = 0; j < 2; ++j) { + aom_free(cpi->td.mb.tmp_obmc_bufs[j]); + } + +#if CONFIG_DENOISE + if (cpi->denoise_and_model) { + aom_denoise_and_model_free(cpi->denoise_and_model); + cpi->denoise_and_model = NULL; + } +#endif + if (cpi->film_grain_table) { + aom_film_grain_table_free(cpi->film_grain_table); + cpi->film_grain_table = NULL; + } + + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + aom_free(cpi->level_params.level_info[i]); + } + + if (cpi->use_svc) av1_free_svc_cyclic_refresh(cpi); +} + +static void configure_static_seg_features(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + struct segmentation *const seg = &cm->seg; + + int high_q = (int)(rc->avg_q > 48.0); + int qi_delta; + + // Disable and clear down for KF + if (cm->current_frame.frame_type == KEY_FRAME) { + // Clear down the global segmentation map + memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + seg->update_map = 0; + seg->update_data = 0; + + // Disable segmentation + av1_disable_segmentation(seg); + + // Clear down the segment features. + av1_clearall_segfeatures(seg); + } else if (cpi->refresh_alt_ref_frame) { + // If this is an alt ref frame + // Clear down the global segmentation map + memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + seg->update_map = 0; + seg->update_data = 0; + + // Disable segmentation and individual segment features by default + av1_disable_segmentation(seg); + av1_clearall_segfeatures(seg); + + // If segmentation was enabled set those features needed for the + // arf itself. 
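+    // (The 0.875 factor below asks for roughly a 12.5% lower average Q for
+    // the ARF itself; av1_compute_qdelta maps that target back onto a
+    // quantizer-index delta for SEG_LVL_ALT_Q.)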
+ if (seg->enabled) { + seg->update_map = 1; + seg->update_data = 1; + + qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, + cm->seq_params.bit_depth); + av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2); + + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V); + + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); + } + } else if (seg->enabled) { + // All other frames if segmentation has been enabled + + // First normal frame in a valid gf or alt ref group + if (rc->frames_since_golden == 0) { + // Set up segment features for normal frames in an arf group + if (rc->source_alt_ref_active) { + seg->update_map = 0; + seg->update_data = 1; + + qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125, + cm->seq_params.bit_depth); + av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); + + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2); + + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V); + + // Segment coding disabled for compred testing + if (high_q) { + av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); + av1_enable_segfeature(seg, 1, SEG_LVL_SKIP); + } + } else { + // Disable segmentation and clear down features if alt ref + // is not active for this group + + av1_disable_segmentation(seg); + + memset(cpi->enc_seg.map, 0, + cm->mi_params.mi_rows * cm->mi_params.mi_cols); + + seg->update_map = 0; + seg->update_data = 0; + + av1_clearall_segfeatures(seg); + } + } else if (rc->is_src_frame_alt_ref) { + // Special case where we are coding over the top of a previous + // alt ref frame. + // Segment coding disabled for compred testing + + // Enable ref frame features for segment 0 as well + av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME); + av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); + + // All mbs should use ALTREF_FRAME + av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME); + av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME); + av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME); + av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + + // Skip all MBs if high Q (0,0 mv and skip coeffs) + if (high_q) { + av1_enable_segfeature(seg, 0, SEG_LVL_SKIP); + av1_enable_segfeature(seg, 1, SEG_LVL_SKIP); + } + // Enable data update + seg->update_data = 1; + } else { + // All other frames. + + // No updates.. leave things as they are. 
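+      // (With update_map == 0 the segment map is inherited from the primary
+      // reference frame rather than re-coded, and update_data == 0 keeps the
+      // previously signalled segment feature data.)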
+ seg->update_map = 0; + seg->update_data = 0; + } + } +} + +static void update_reference_segmentation_map(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MB_MODE_INFO **mi_4x4_ptr = mi_params->mi_grid_base; + uint8_t *cache_ptr = cm->cur_frame->seg_map; + + for (int row = 0; row < mi_params->mi_rows; row++) { + MB_MODE_INFO **mi_4x4 = mi_4x4_ptr; + uint8_t *cache = cache_ptr; + for (int col = 0; col < mi_params->mi_cols; col++, mi_4x4++, cache++) + cache[0] = mi_4x4[0]->segment_id; + mi_4x4_ptr += mi_params->mi_stride; + cache_ptr += mi_params->mi_cols; + } +} + +static void alloc_altref_frame_buffer(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + const AV1EncoderConfig *oxcf = &cpi->oxcf; + + // TODO(agrange) Check if ARF is enabled and skip allocation if not. + if (aom_realloc_frame_buffer( + &cpi->alt_ref_buffer, oxcf->width, oxcf->height, + seq_params->subsampling_x, seq_params->subsampling_y, + seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, NULL, NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate altref buffer"); +} + +static void alloc_util_frame_buffers(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + const int byte_alignment = cm->features.byte_alignment; + if (aom_realloc_frame_buffer( + &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate last frame buffer"); + + if (aom_realloc_frame_buffer( + &cpi->trial_frame_rst, cm->superres_upscaled_width, + cm->superres_upscaled_height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_RESTORATION_FRAME_BORDER, byte_alignment, NULL, NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate trial restored frame buffer"); + + if (aom_realloc_frame_buffer( + &cpi->scaled_source, cm->width, cm->height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate scaled source buffer"); + + if (aom_realloc_frame_buffer( + &cpi->scaled_last_source, cm->width, cm->height, + seq_params->subsampling_x, seq_params->subsampling_y, + seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + byte_alignment, NULL, NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate scaled last source buffer"); +} + +static void alloc_compressor_data(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + + if (av1_alloc_context_buffers(cm, cm->width, cm->height)) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + + int mi_rows_aligned_to_sb = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params.mib_size_log2); + int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2; + + if (!is_stat_generation_stage(cpi)) { + av1_alloc_txb_buf(cpi); + + alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info); + } + + aom_free(cpi->tile_tok[0][0]); + aom_free(cpi->tplist[0][0]); + + if 
(!is_stat_generation_stage(cpi)) { + unsigned int tokens = + get_token_alloc(cm->mi_params.mb_rows, cm->mi_params.mb_cols, + MAX_SB_SIZE_LOG2, num_planes); + CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0], + aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0]))); + + CHECK_MEM_ERROR(cm, cpi->tplist[0][0], + aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS, + sizeof(*cpi->tplist[0][0]))); + } + + av1_setup_pc_tree(cpi, &cpi->td); +} + +void av1_new_framerate(AV1_COMP *cpi, double framerate) { + cpi->framerate = framerate < 0.1 ? 30 : framerate; + av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height); +} + +double av1_get_compression_ratio(const AV1_COMMON *const cm, + size_t encoded_frame_size) { + const int upscaled_width = cm->superres_upscaled_width; + const int height = cm->height; + const int luma_pic_size = upscaled_width * height; + const SequenceHeader *const seq_params = &cm->seq_params; + const BITSTREAM_PROFILE profile = seq_params->profile; + const int pic_size_profile_factor = + profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36); + encoded_frame_size = + (encoded_frame_size > 129 ? encoded_frame_size - 128 : 1); + const size_t uncompressed_frame_size = + (luma_pic_size * pic_size_profile_factor) >> 3; + return uncompressed_frame_size / (double)encoded_frame_size; +} + +static void set_tile_info(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const SequenceHeader *const seq_params = &cm->seq_params; + CommonTileParams *const tiles = &cm->tiles; + int i, start_sb; + + av1_get_tile_limits(cm); + + // configure tile columns + if (cpi->oxcf.tile_width_count == 0 || cpi->oxcf.tile_height_count == 0) { + tiles->uniform_spacing = 1; + tiles->log2_cols = AOMMAX(cpi->oxcf.tile_columns, tiles->min_log2_cols); + tiles->log2_cols = AOMMIN(tiles->log2_cols, tiles->max_log2_cols); + } else { + int mi_cols = + ALIGN_POWER_OF_TWO(mi_params->mi_cols, seq_params->mib_size_log2); + int sb_cols = mi_cols >> seq_params->mib_size_log2; + int size_sb, j = 0; + tiles->uniform_spacing = 0; + for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) { + tiles->col_start_sb[i] = start_sb; + size_sb = cpi->oxcf.tile_widths[j++]; + if (j >= cpi->oxcf.tile_width_count) j = 0; + start_sb += AOMMIN(size_sb, tiles->max_width_sb); + } + tiles->cols = i; + tiles->col_start_sb[i] = sb_cols; + } + av1_calculate_tile_cols(seq_params, mi_params->mi_rows, mi_params->mi_cols, + tiles); + + // configure tile rows + if (tiles->uniform_spacing) { + tiles->log2_rows = AOMMAX(cpi->oxcf.tile_rows, tiles->min_log2_rows); + tiles->log2_rows = AOMMIN(tiles->log2_rows, tiles->max_log2_rows); + } else { + int mi_rows = + ALIGN_POWER_OF_TWO(mi_params->mi_rows, seq_params->mib_size_log2); + int sb_rows = mi_rows >> seq_params->mib_size_log2; + int size_sb, j = 0; + for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) { + tiles->row_start_sb[i] = start_sb; + size_sb = cpi->oxcf.tile_heights[j++]; + if (j >= cpi->oxcf.tile_height_count) j = 0; + start_sb += AOMMIN(size_sb, tiles->max_height_sb); + } + tiles->rows = i; + tiles->row_start_sb[i] = sb_rows; + } + av1_calculate_tile_rows(seq_params, mi_params->mi_rows, tiles); +} + +static void update_frame_size(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + + // We need to reallocate the context buffers here in case we need more mis. 
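+  // ("More mis" = more 4x4 mode-info units: a mid-stream resize to a larger
+  // frame needs a larger mi grid, so the buffers are re-sized below from the
+  // new cm->width / cm->height.)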
+ if (av1_alloc_context_buffers(cm, cm->width, cm->height)) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + av1_init_mi_buffers(&cm->mi_params); + + av1_init_macroblockd(cm, xd, NULL); + + if (!is_stat_generation_stage(cpi)) + alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info); + set_tile_info(cpi); +} + +static void init_buffer_indices(ForceIntegerMVInfo *const force_intpel_info, + int *const remapped_ref_idx) { + int fb_idx; + for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx) + remapped_ref_idx[fb_idx] = fb_idx; + force_intpel_info->rate_index = 0; + force_intpel_info->rate_size = 0; +} + +static INLINE int does_level_match(int width, int height, double fps, + int lvl_width, int lvl_height, + double lvl_fps, int lvl_dim_mult) { + const int64_t lvl_luma_pels = lvl_width * lvl_height; + const double lvl_display_sample_rate = lvl_luma_pels * lvl_fps; + const int64_t luma_pels = width * height; + const double display_sample_rate = luma_pels * fps; + return luma_pels <= lvl_luma_pels && + display_sample_rate <= lvl_display_sample_rate && + width <= lvl_width * lvl_dim_mult && + height <= lvl_height * lvl_dim_mult; +} + +static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm, + const AV1EncoderConfig *oxcf) { + // TODO(any): This is a placeholder function that only addresses dimensions + // and max display sample rates. + // Need to add checks for max bit rate, max decoded luma sample rate, header + // rate, etc. that are not covered by this function. + AV1_LEVEL level = SEQ_LEVEL_MAX; + if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 512, + 288, 30.0, 4)) { + level = SEQ_LEVEL_2_0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 704, 396, 30.0, 4)) { + level = SEQ_LEVEL_2_1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 1088, 612, 30.0, 4)) { + level = SEQ_LEVEL_3_0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 1376, 774, 30.0, 4)) { + level = SEQ_LEVEL_3_1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 2048, 1152, 30.0, 3)) { + level = SEQ_LEVEL_4_0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 2048, 1152, 60.0, 3)) { + level = SEQ_LEVEL_4_1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 4096, 2176, 30.0, 2)) { + level = SEQ_LEVEL_5_0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 4096, 2176, 60.0, 2)) { + level = SEQ_LEVEL_5_1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 4096, 2176, 120.0, 2)) { + level = SEQ_LEVEL_5_2; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 8192, 4352, 30.0, 2)) { + level = SEQ_LEVEL_6_0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 8192, 4352, 60.0, 2)) { + level = SEQ_LEVEL_6_1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 8192, 4352, 120.0, 2)) { + level = SEQ_LEVEL_6_2; + } + + SequenceHeader *const seq_params = &cm->seq_params; + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + seq->seq_level_idx[i] = level; + // Set the maximum parameters for bitrate and buffer size for this profile, + // level, and tier + seq_params->op_params[i].bitrate = av1_max_level_bitrate( + cm->seq_params.profile, seq->seq_level_idx[i], seq->tier[i]); + // Level with seq_level_idx = 31 returns a high 
"dummy" bitrate to pass the + // check + if (seq_params->op_params[i].bitrate == 0) + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "AV1 does not support this combination of profile, level, and tier."); + // Buffer size in bits/s is bitrate in bits/s * 1 s + seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate; + } +} + +static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm, + const AV1EncoderConfig *oxcf, int use_svc) { + seq->still_picture = (oxcf->force_video_mode == 0) && (oxcf->limit == 1); + seq->reduced_still_picture_hdr = seq->still_picture; + seq->reduced_still_picture_hdr &= !oxcf->full_still_picture_hdr; + seq->force_screen_content_tools = (oxcf->mode == REALTIME) ? 0 : 2; + seq->force_integer_mv = 2; + seq->order_hint_info.enable_order_hint = oxcf->enable_order_hint; + seq->frame_id_numbers_present_flag = + !(seq->still_picture && seq->reduced_still_picture_hdr) && + !oxcf->large_scale_tile && oxcf->error_resilient_mode && !use_svc; + if (seq->still_picture && seq->reduced_still_picture_hdr) { + seq->order_hint_info.enable_order_hint = 0; + seq->force_screen_content_tools = 2; + seq->force_integer_mv = 2; + } + seq->order_hint_info.order_hint_bits_minus_1 = + seq->order_hint_info.enable_order_hint + ? DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1 + : -1; + + seq->max_frame_width = + oxcf->forced_max_frame_width ? oxcf->forced_max_frame_width : oxcf->width; + seq->max_frame_height = oxcf->forced_max_frame_height + ? oxcf->forced_max_frame_height + : oxcf->height; + seq->num_bits_width = + (seq->max_frame_width > 1) ? get_msb(seq->max_frame_width - 1) + 1 : 1; + seq->num_bits_height = + (seq->max_frame_height > 1) ? get_msb(seq->max_frame_height - 1) + 1 : 1; + assert(seq->num_bits_width <= 16); + assert(seq->num_bits_height <= 16); + + seq->frame_id_length = FRAME_ID_LENGTH; + seq->delta_frame_id_length = DELTA_FRAME_ID_LENGTH; + + seq->enable_dual_filter = oxcf->enable_dual_filter; + seq->order_hint_info.enable_dist_wtd_comp = oxcf->enable_dist_wtd_comp; + seq->order_hint_info.enable_dist_wtd_comp &= + seq->order_hint_info.enable_order_hint; + seq->order_hint_info.enable_ref_frame_mvs = oxcf->enable_ref_frame_mvs; + seq->order_hint_info.enable_ref_frame_mvs &= + seq->order_hint_info.enable_order_hint; + seq->enable_superres = oxcf->enable_superres; + seq->enable_cdef = oxcf->enable_cdef; + seq->enable_restoration = oxcf->enable_restoration; + seq->enable_warped_motion = oxcf->enable_warped_motion; + seq->enable_interintra_compound = oxcf->enable_interintra_comp; + seq->enable_masked_compound = oxcf->enable_masked_comp; + seq->enable_intra_edge_filter = oxcf->enable_intra_edge_filter; + seq->enable_filter_intra = oxcf->enable_filter_intra; + + set_bitstream_level_tier(seq, cm, oxcf); + + if (seq->operating_points_cnt_minus_1 == 0) { + seq->operating_point_idc[0] = 0; + } else { + // Set operating_point_idc[] such that the i=0 point corresponds to the + // highest quality operating point (all layers), and subsequent + // operarting points (i > 0) are lower quality corresponding to + // skip decoding enhancement layers (temporal first). 
+ int i = 0; + assert(seq->operating_points_cnt_minus_1 == + (int)(cm->number_spatial_layers * cm->number_temporal_layers - 1)); + for (unsigned int sl = 0; sl < cm->number_spatial_layers; sl++) { + for (unsigned int tl = 0; tl < cm->number_temporal_layers; tl++) { + seq->operating_point_idc[i] = + (~(~0u << (cm->number_spatial_layers - sl)) << 8) | + ~(~0u << (cm->number_temporal_layers - tl)); + i++; + } + } + } +} + +static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = &cm->seq_params; + ResizePendingParams *resize_pending_params = &cpi->resize_pending_params; + + cpi->oxcf = *oxcf; + cpi->framerate = oxcf->init_framerate; + + seq_params->profile = oxcf->profile; + seq_params->bit_depth = oxcf->bit_depth; + seq_params->use_highbitdepth = oxcf->use_highbitdepth; + seq_params->color_primaries = oxcf->color_primaries; + seq_params->transfer_characteristics = oxcf->transfer_characteristics; + seq_params->matrix_coefficients = oxcf->matrix_coefficients; + seq_params->monochrome = oxcf->monochrome; + seq_params->chroma_sample_position = oxcf->chroma_sample_position; + seq_params->color_range = oxcf->color_range; + seq_params->timing_info_present = oxcf->timing_info_present; + seq_params->timing_info.num_units_in_display_tick = + oxcf->timing_info.num_units_in_display_tick; + seq_params->timing_info.time_scale = oxcf->timing_info.time_scale; + seq_params->timing_info.equal_picture_interval = + oxcf->timing_info.equal_picture_interval; + seq_params->timing_info.num_ticks_per_picture = + oxcf->timing_info.num_ticks_per_picture; + + seq_params->display_model_info_present_flag = + oxcf->display_model_info_present_flag; + seq_params->decoder_model_info_present_flag = + oxcf->decoder_model_info_present_flag; + if (oxcf->decoder_model_info_present_flag) { + // set the decoder model parameters in schedule mode + seq_params->decoder_model_info.num_units_in_decoding_tick = + oxcf->buffer_model.num_units_in_decoding_tick; + cm->buffer_removal_time_present = 1; + av1_set_aom_dec_model_info(&seq_params->decoder_model_info); + av1_set_dec_model_op_parameters(&seq_params->op_params[0]); + } else if (seq_params->timing_info_present && + seq_params->timing_info.equal_picture_interval && + !seq_params->decoder_model_info_present_flag) { + // set the decoder model parameters in resource availability mode + av1_set_resource_availability_parameters(&seq_params->op_params[0]); + } else { + seq_params->op_params[0].initial_display_delay = + 10; // Default value (not signaled) + } + + if (seq_params->monochrome) { + seq_params->subsampling_x = 1; + seq_params->subsampling_y = 1; + } else if (seq_params->color_primaries == AOM_CICP_CP_BT_709 && + seq_params->transfer_characteristics == AOM_CICP_TC_SRGB && + seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + seq_params->subsampling_x = 0; + seq_params->subsampling_y = 0; + } else { + if (seq_params->profile == 0) { + seq_params->subsampling_x = 1; + seq_params->subsampling_y = 1; + } else if (seq_params->profile == 1) { + seq_params->subsampling_x = 0; + seq_params->subsampling_y = 0; + } else { + if (seq_params->bit_depth == AOM_BITS_12) { + seq_params->subsampling_x = oxcf->chroma_subsampling_x; + seq_params->subsampling_y = oxcf->chroma_subsampling_y; + } else { + seq_params->subsampling_x = 1; + seq_params->subsampling_y = 0; + } + } + } + + cm->width = oxcf->width; + cm->height = oxcf->height; + set_sb_size(seq_params, + select_sb_size(cpi)); // set sb size 
before allocations + alloc_compressor_data(cpi); + + update_film_grain_parameters(cpi, oxcf); + + // Single thread case: use counts in common. + cpi->td.counts = &cpi->counts; + + // Set init SVC parameters. + cpi->use_svc = 0; + cpi->svc.external_ref_frame_config = 0; + cpi->svc.non_reference_frame = 0; + cpi->svc.number_spatial_layers = 1; + cpi->svc.number_temporal_layers = 1; + cm->number_spatial_layers = 1; + cm->number_temporal_layers = 1; + cm->spatial_layer_id = 0; + cm->temporal_layer_id = 0; + + // change includes all joint functionality + av1_change_config(cpi, oxcf); + + cpi->ref_frame_flags = 0; + + // Reset resize pending flags + resize_pending_params->width = 0; + resize_pending_params->height = 0; + + init_buffer_indices(&cpi->force_intpel_info, cm->remapped_ref_idx); +} + +static void set_rc_buffer_sizes(RATE_CONTROL *rc, + const AV1EncoderConfig *oxcf) { + const int64_t bandwidth = oxcf->target_bandwidth; + const int64_t starting = oxcf->starting_buffer_level_ms; + const int64_t optimal = oxcf->optimal_buffer_level_ms; + const int64_t maximum = oxcf->maximum_buffer_size_ms; + + rc->starting_buffer_level = starting * bandwidth / 1000; + rc->optimal_buffer_level = + (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000; + rc->maximum_buffer_size = + (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000; +} + +#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].jsdaf = JSDAF; \ + cpi->fn_ptr[BT].jsvaf = JSVAF; + +#define MAKE_BFP_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ + int source_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \ + } + +#define MAKE_BFP_SADAVG_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \ + 4; \ + } + +#define MAKE_BFP_SAD4D_WRAPPER(fnname) \ + static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + } \ + static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + 
unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 4; i++) sad_array[i] >>= 2; \ + } \ + static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 4; i++) sad_array[i] >>= 4; \ + } + +#define MAKE_BFP_JSADAVG_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const DIST_WTD_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const DIST_WTD_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const DIST_WTD_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param) >> \ + 4; \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d) 
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d) + +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d) + +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x128_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg) +#endif // CONFIG_AV1_HIGHBITDEPTH + +#define HIGHBD_MBFP(BT, MCSDF, MCSVF) \ + cpi->fn_ptr[BT].msdf = MCSDF; \ + cpi->fn_ptr[BT].msvf = MCSVF; + +#define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ + int m_stride, int invert_mask) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ + second_pred_ptr, m, m_stride, invert_mask); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ + int m_stride, int invert_mask) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ + second_pred_ptr, m, m_stride, invert_mask) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ + int m_stride, int invert_mask) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ + second_pred_ptr, m, m_stride, invert_mask) >> \ + 4; \ + } + +#if CONFIG_AV1_HIGHBITDEPTH 
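+// As with the plain SAD wrappers above, the generated _bits10/_bits12
+// variants shift the masked SAD down by 2 or 4 bits so that high-bitdepth
+// costs stay on roughly the same scale as their 8-bit counterparts during
+// rate-distortion comparisons.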
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16) +#endif + +#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \ + cpi->fn_ptr[BT].osdf = OSDF; \ + cpi->fn_ptr[BT].ovf = OVF; \ + cpi->fn_ptr[BT].osvf = OSVF; + +#define MAKE_OBFP_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *msk) { \ + return fnname(ref, ref_stride, wsrc, msk); \ + } \ + static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *msk) { \ + return fnname(ref, ref_stride, wsrc, msk) >> 2; \ + } \ + static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *msk) { \ + return fnname(ref, ref_stride, wsrc, msk) >> 4; \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16) + +static void highbd_set_var_fns(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + if (cm->seq_params.use_highbitdepth) { + switch (cm->seq_params.bit_depth) { + case AOM_BITS_8: + HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits8, + aom_highbd_sad64x16_avg_bits8, aom_highbd_8_variance64x16, + aom_highbd_8_sub_pixel_variance64x16, + aom_highbd_8_sub_pixel_avg_variance64x16, + 
aom_highbd_sad64x16x4d_bits8, + aom_highbd_dist_wtd_sad64x16_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16) + + HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits8, + aom_highbd_sad16x64_avg_bits8, aom_highbd_8_variance16x64, + aom_highbd_8_sub_pixel_variance16x64, + aom_highbd_8_sub_pixel_avg_variance16x64, + aom_highbd_sad16x64x4d_bits8, + aom_highbd_dist_wtd_sad16x64_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64) + + HIGHBD_BFP( + BLOCK_32X8, aom_highbd_sad32x8_bits8, aom_highbd_sad32x8_avg_bits8, + aom_highbd_8_variance32x8, aom_highbd_8_sub_pixel_variance32x8, + aom_highbd_8_sub_pixel_avg_variance32x8, + aom_highbd_sad32x8x4d_bits8, aom_highbd_dist_wtd_sad32x8_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8) + + HIGHBD_BFP( + BLOCK_8X32, aom_highbd_sad8x32_bits8, aom_highbd_sad8x32_avg_bits8, + aom_highbd_8_variance8x32, aom_highbd_8_sub_pixel_variance8x32, + aom_highbd_8_sub_pixel_avg_variance8x32, + aom_highbd_sad8x32x4d_bits8, aom_highbd_dist_wtd_sad8x32_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32) + + HIGHBD_BFP( + BLOCK_16X4, aom_highbd_sad16x4_bits8, aom_highbd_sad16x4_avg_bits8, + aom_highbd_8_variance16x4, aom_highbd_8_sub_pixel_variance16x4, + aom_highbd_8_sub_pixel_avg_variance16x4, + aom_highbd_sad16x4x4d_bits8, aom_highbd_dist_wtd_sad16x4_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4) + + HIGHBD_BFP( + BLOCK_4X16, aom_highbd_sad4x16_bits8, aom_highbd_sad4x16_avg_bits8, + aom_highbd_8_variance4x16, aom_highbd_8_sub_pixel_variance4x16, + aom_highbd_8_sub_pixel_avg_variance4x16, + aom_highbd_sad4x16x4d_bits8, aom_highbd_dist_wtd_sad4x16_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16) + + HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits8, + aom_highbd_sad32x16_avg_bits8, aom_highbd_8_variance32x16, + aom_highbd_8_sub_pixel_variance32x16, + aom_highbd_8_sub_pixel_avg_variance32x16, + aom_highbd_sad32x16x4d_bits8, + aom_highbd_dist_wtd_sad32x16_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16) + + HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits8, + aom_highbd_sad16x32_avg_bits8, aom_highbd_8_variance16x32, + aom_highbd_8_sub_pixel_variance16x32, + aom_highbd_8_sub_pixel_avg_variance16x32, + aom_highbd_sad16x32x4d_bits8, + aom_highbd_dist_wtd_sad16x32_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32) + + HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits8, + aom_highbd_sad64x32_avg_bits8, aom_highbd_8_variance64x32, + aom_highbd_8_sub_pixel_variance64x32, + aom_highbd_8_sub_pixel_avg_variance64x32, + aom_highbd_sad64x32x4d_bits8, + aom_highbd_dist_wtd_sad64x32_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32) + + HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits8, + aom_highbd_sad32x64_avg_bits8, aom_highbd_8_variance32x64, + aom_highbd_8_sub_pixel_variance32x64, + aom_highbd_8_sub_pixel_avg_variance32x64, + aom_highbd_sad32x64x4d_bits8, + aom_highbd_dist_wtd_sad32x64_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64) + + HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits8, + aom_highbd_sad32x32_avg_bits8, aom_highbd_8_variance32x32, + aom_highbd_8_sub_pixel_variance32x32, + aom_highbd_8_sub_pixel_avg_variance32x32, + aom_highbd_sad32x32x4d_bits8, + aom_highbd_dist_wtd_sad32x32_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32) + + HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits8, + aom_highbd_sad64x64_avg_bits8, aom_highbd_8_variance64x64, + aom_highbd_8_sub_pixel_variance64x64, + aom_highbd_8_sub_pixel_avg_variance64x64, 
+ aom_highbd_sad64x64x4d_bits8, + aom_highbd_dist_wtd_sad64x64_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64) + + HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits8, + aom_highbd_sad16x16_avg_bits8, aom_highbd_8_variance16x16, + aom_highbd_8_sub_pixel_variance16x16, + aom_highbd_8_sub_pixel_avg_variance16x16, + aom_highbd_sad16x16x4d_bits8, + aom_highbd_dist_wtd_sad16x16_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16) + + HIGHBD_BFP( + BLOCK_16X8, aom_highbd_sad16x8_bits8, aom_highbd_sad16x8_avg_bits8, + aom_highbd_8_variance16x8, aom_highbd_8_sub_pixel_variance16x8, + aom_highbd_8_sub_pixel_avg_variance16x8, + aom_highbd_sad16x8x4d_bits8, aom_highbd_dist_wtd_sad16x8_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8) + + HIGHBD_BFP( + BLOCK_8X16, aom_highbd_sad8x16_bits8, aom_highbd_sad8x16_avg_bits8, + aom_highbd_8_variance8x16, aom_highbd_8_sub_pixel_variance8x16, + aom_highbd_8_sub_pixel_avg_variance8x16, + aom_highbd_sad8x16x4d_bits8, aom_highbd_dist_wtd_sad8x16_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16) + + HIGHBD_BFP( + BLOCK_8X8, aom_highbd_sad8x8_bits8, aom_highbd_sad8x8_avg_bits8, + aom_highbd_8_variance8x8, aom_highbd_8_sub_pixel_variance8x8, + aom_highbd_8_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x4d_bits8, + aom_highbd_dist_wtd_sad8x8_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8) + + HIGHBD_BFP( + BLOCK_8X4, aom_highbd_sad8x4_bits8, aom_highbd_sad8x4_avg_bits8, + aom_highbd_8_variance8x4, aom_highbd_8_sub_pixel_variance8x4, + aom_highbd_8_sub_pixel_avg_variance8x4, aom_highbd_sad8x4x4d_bits8, + aom_highbd_dist_wtd_sad8x4_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4) + + HIGHBD_BFP( + BLOCK_4X8, aom_highbd_sad4x8_bits8, aom_highbd_sad4x8_avg_bits8, + aom_highbd_8_variance4x8, aom_highbd_8_sub_pixel_variance4x8, + aom_highbd_8_sub_pixel_avg_variance4x8, aom_highbd_sad4x8x4d_bits8, + aom_highbd_dist_wtd_sad4x8_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8) + + HIGHBD_BFP( + BLOCK_4X4, aom_highbd_sad4x4_bits8, aom_highbd_sad4x4_avg_bits8, + aom_highbd_8_variance4x4, aom_highbd_8_sub_pixel_variance4x4, + aom_highbd_8_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x4d_bits8, + aom_highbd_dist_wtd_sad4x4_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4) + + HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits8, + aom_highbd_sad128x128_avg_bits8, + aom_highbd_8_variance128x128, + aom_highbd_8_sub_pixel_variance128x128, + aom_highbd_8_sub_pixel_avg_variance128x128, + aom_highbd_sad128x128x4d_bits8, + aom_highbd_dist_wtd_sad128x128_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128) + + HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits8, + aom_highbd_sad128x64_avg_bits8, aom_highbd_8_variance128x64, + aom_highbd_8_sub_pixel_variance128x64, + aom_highbd_8_sub_pixel_avg_variance128x64, + aom_highbd_sad128x64x4d_bits8, + aom_highbd_dist_wtd_sad128x64_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64) + + HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits8, + aom_highbd_sad64x128_avg_bits8, aom_highbd_8_variance64x128, + aom_highbd_8_sub_pixel_variance64x128, + aom_highbd_8_sub_pixel_avg_variance64x128, + aom_highbd_sad64x128x4d_bits8, + aom_highbd_dist_wtd_sad64x128_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128) + + HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8, + aom_highbd_8_masked_sub_pixel_variance128x128) + HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits8, + 
aom_highbd_8_masked_sub_pixel_variance128x64) + HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits8, + aom_highbd_8_masked_sub_pixel_variance64x128) + HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits8, + aom_highbd_8_masked_sub_pixel_variance64x64) + HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits8, + aom_highbd_8_masked_sub_pixel_variance64x32) + HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits8, + aom_highbd_8_masked_sub_pixel_variance32x64) + HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits8, + aom_highbd_8_masked_sub_pixel_variance32x32) + HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits8, + aom_highbd_8_masked_sub_pixel_variance32x16) + HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits8, + aom_highbd_8_masked_sub_pixel_variance16x32) + HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits8, + aom_highbd_8_masked_sub_pixel_variance16x16) + HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits8, + aom_highbd_8_masked_sub_pixel_variance8x16) + HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits8, + aom_highbd_8_masked_sub_pixel_variance16x8) + HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits8, + aom_highbd_8_masked_sub_pixel_variance8x8) + HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits8, + aom_highbd_8_masked_sub_pixel_variance4x8) + HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits8, + aom_highbd_8_masked_sub_pixel_variance8x4) + HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8, + aom_highbd_8_masked_sub_pixel_variance4x4) + HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits8, + aom_highbd_8_masked_sub_pixel_variance64x16) + HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits8, + aom_highbd_8_masked_sub_pixel_variance16x64) + HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits8, + aom_highbd_8_masked_sub_pixel_variance32x8) + HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits8, + aom_highbd_8_masked_sub_pixel_variance8x32) + HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits8, + aom_highbd_8_masked_sub_pixel_variance16x4) + HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits8, + aom_highbd_8_masked_sub_pixel_variance4x16) + HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits8, + aom_highbd_obmc_variance128x128, + aom_highbd_obmc_sub_pixel_variance128x128) + HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits8, + aom_highbd_obmc_variance128x64, + aom_highbd_obmc_sub_pixel_variance128x64) + HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits8, + aom_highbd_obmc_variance64x128, + aom_highbd_obmc_sub_pixel_variance64x128) + HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits8, + aom_highbd_obmc_variance64x64, + aom_highbd_obmc_sub_pixel_variance64x64) + HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits8, + aom_highbd_obmc_variance64x32, + aom_highbd_obmc_sub_pixel_variance64x32) + HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits8, + aom_highbd_obmc_variance32x64, + aom_highbd_obmc_sub_pixel_variance32x64) + HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits8, + aom_highbd_obmc_variance32x32, + aom_highbd_obmc_sub_pixel_variance32x32) + HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits8, + aom_highbd_obmc_variance32x16, + aom_highbd_obmc_sub_pixel_variance32x16) + HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits8, + aom_highbd_obmc_variance16x32, + aom_highbd_obmc_sub_pixel_variance16x32) + HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits8, + aom_highbd_obmc_variance16x16, + aom_highbd_obmc_sub_pixel_variance16x16) + 
HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits8,
+ aom_highbd_obmc_variance8x16,
+ aom_highbd_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits8,
+ aom_highbd_obmc_variance16x8,
+ aom_highbd_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits8,
+ aom_highbd_obmc_variance8x8,
+ aom_highbd_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits8,
+ aom_highbd_obmc_variance4x8,
+ aom_highbd_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits8,
+ aom_highbd_obmc_variance8x4,
+ aom_highbd_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits8,
+ aom_highbd_obmc_variance4x4,
+ aom_highbd_obmc_sub_pixel_variance4x4)
+ HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits8,
+ aom_highbd_obmc_variance64x16,
+ aom_highbd_obmc_sub_pixel_variance64x16)
+ HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits8,
+ aom_highbd_obmc_variance16x64,
+ aom_highbd_obmc_sub_pixel_variance16x64)
+ HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits8,
+ aom_highbd_obmc_variance32x8,
+ aom_highbd_obmc_sub_pixel_variance32x8)
+ HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits8,
+ aom_highbd_obmc_variance8x32,
+ aom_highbd_obmc_sub_pixel_variance8x32)
+ HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits8,
+ aom_highbd_obmc_variance16x4,
+ aom_highbd_obmc_sub_pixel_variance16x4)
+ HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits8,
+ aom_highbd_obmc_variance4x16,
+ aom_highbd_obmc_sub_pixel_variance4x16)
+ break;
+
+ case AOM_BITS_10:
+ HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits10,
+ aom_highbd_sad64x16_avg_bits10, aom_highbd_10_variance64x16,
+ aom_highbd_10_sub_pixel_variance64x16,
+ aom_highbd_10_sub_pixel_avg_variance64x16,
+ aom_highbd_sad64x16x4d_bits10,
+ aom_highbd_dist_wtd_sad64x16_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16);
+
+ HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits10,
+ aom_highbd_sad16x64_avg_bits10, aom_highbd_10_variance16x64,
+ aom_highbd_10_sub_pixel_variance16x64,
+ aom_highbd_10_sub_pixel_avg_variance16x64,
+ aom_highbd_sad16x64x4d_bits10,
+ aom_highbd_dist_wtd_sad16x64_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64);
+
+ HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits10,
+ aom_highbd_sad32x8_avg_bits10, aom_highbd_10_variance32x8,
+ aom_highbd_10_sub_pixel_variance32x8,
+ aom_highbd_10_sub_pixel_avg_variance32x8,
+ aom_highbd_sad32x8x4d_bits10,
+ aom_highbd_dist_wtd_sad32x8_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8);
+
+ HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits10,
+ aom_highbd_sad8x32_avg_bits10, aom_highbd_10_variance8x32,
+ aom_highbd_10_sub_pixel_variance8x32,
+ aom_highbd_10_sub_pixel_avg_variance8x32,
+ aom_highbd_sad8x32x4d_bits10,
+ aom_highbd_dist_wtd_sad8x32_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32);
+
+ HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits10,
+ aom_highbd_sad16x4_avg_bits10, aom_highbd_10_variance16x4,
+ aom_highbd_10_sub_pixel_variance16x4,
+ aom_highbd_10_sub_pixel_avg_variance16x4,
+ aom_highbd_sad16x4x4d_bits10,
+ aom_highbd_dist_wtd_sad16x4_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4);
+
+ HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits10,
+ aom_highbd_sad4x16_avg_bits10, aom_highbd_10_variance4x16,
+ aom_highbd_10_sub_pixel_variance4x16,
+ aom_highbd_10_sub_pixel_avg_variance4x16,
+ aom_highbd_sad4x16x4d_bits10,
+ aom_highbd_dist_wtd_sad4x16_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16);
+
+ HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits10,
+ aom_highbd_sad32x16_avg_bits10, aom_highbd_10_variance32x16,
+ aom_highbd_10_sub_pixel_variance32x16,
+ aom_highbd_10_sub_pixel_avg_variance32x16,
+ aom_highbd_sad32x16x4d_bits10,
+ aom_highbd_dist_wtd_sad32x16_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16);
+
+ HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits10,
+ aom_highbd_sad16x32_avg_bits10, aom_highbd_10_variance16x32,
+ aom_highbd_10_sub_pixel_variance16x32,
+ aom_highbd_10_sub_pixel_avg_variance16x32,
+ aom_highbd_sad16x32x4d_bits10,
+ aom_highbd_dist_wtd_sad16x32_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32);
+
+ HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits10,
+ aom_highbd_sad64x32_avg_bits10, aom_highbd_10_variance64x32,
+ aom_highbd_10_sub_pixel_variance64x32,
+ aom_highbd_10_sub_pixel_avg_variance64x32,
+ aom_highbd_sad64x32x4d_bits10,
+ aom_highbd_dist_wtd_sad64x32_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32);
+
+ HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits10,
+ aom_highbd_sad32x64_avg_bits10, aom_highbd_10_variance32x64,
+ aom_highbd_10_sub_pixel_variance32x64,
+ aom_highbd_10_sub_pixel_avg_variance32x64,
+ aom_highbd_sad32x64x4d_bits10,
+ aom_highbd_dist_wtd_sad32x64_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64);
+
+ HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits10,
+ aom_highbd_sad32x32_avg_bits10, aom_highbd_10_variance32x32,
+ aom_highbd_10_sub_pixel_variance32x32,
+ aom_highbd_10_sub_pixel_avg_variance32x32,
+ aom_highbd_sad32x32x4d_bits10,
+ aom_highbd_dist_wtd_sad32x32_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32);
+
+ HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits10,
+ aom_highbd_sad64x64_avg_bits10, aom_highbd_10_variance64x64,
+ aom_highbd_10_sub_pixel_variance64x64,
+ aom_highbd_10_sub_pixel_avg_variance64x64,
+ aom_highbd_sad64x64x4d_bits10,
+ aom_highbd_dist_wtd_sad64x64_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64);
+
+ HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits10,
+ aom_highbd_sad16x16_avg_bits10, aom_highbd_10_variance16x16,
+ aom_highbd_10_sub_pixel_variance16x16,
+ aom_highbd_10_sub_pixel_avg_variance16x16,
+ aom_highbd_sad16x16x4d_bits10,
+ aom_highbd_dist_wtd_sad16x16_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16);
+
+ HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits10,
+ aom_highbd_sad16x8_avg_bits10, aom_highbd_10_variance16x8,
+ aom_highbd_10_sub_pixel_variance16x8,
+ aom_highbd_10_sub_pixel_avg_variance16x8,
+ aom_highbd_sad16x8x4d_bits10,
+ aom_highbd_dist_wtd_sad16x8_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8);
+
+ HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits10,
+ aom_highbd_sad8x16_avg_bits10, aom_highbd_10_variance8x16,
+ aom_highbd_10_sub_pixel_variance8x16,
+ aom_highbd_10_sub_pixel_avg_variance8x16,
+ aom_highbd_sad8x16x4d_bits10,
+ aom_highbd_dist_wtd_sad8x16_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16);
+
+ HIGHBD_BFP(
+ BLOCK_8X8, aom_highbd_sad8x8_bits10, aom_highbd_sad8x8_avg_bits10,
+ aom_highbd_10_variance8x8, aom_highbd_10_sub_pixel_variance8x8,
+ aom_highbd_10_sub_pixel_avg_variance8x8,
+ aom_highbd_sad8x8x4d_bits10, aom_highbd_dist_wtd_sad8x8_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8);
+
+ HIGHBD_BFP(
+ BLOCK_8X4, aom_highbd_sad8x4_bits10, aom_highbd_sad8x4_avg_bits10,
+ aom_highbd_10_variance8x4, aom_highbd_10_sub_pixel_variance8x4,
+ aom_highbd_10_sub_pixel_avg_variance8x4,
+ aom_highbd_sad8x4x4d_bits10, aom_highbd_dist_wtd_sad8x4_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4);
+
+ HIGHBD_BFP(
+ BLOCK_4X8, aom_highbd_sad4x8_bits10, aom_highbd_sad4x8_avg_bits10,
+ aom_highbd_10_variance4x8, aom_highbd_10_sub_pixel_variance4x8,
+ aom_highbd_10_sub_pixel_avg_variance4x8,
+ aom_highbd_sad4x8x4d_bits10, aom_highbd_dist_wtd_sad4x8_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8);
+
+ HIGHBD_BFP(
+ BLOCK_4X4, aom_highbd_sad4x4_bits10, aom_highbd_sad4x4_avg_bits10,
+ aom_highbd_10_variance4x4, aom_highbd_10_sub_pixel_variance4x4,
+ aom_highbd_10_sub_pixel_avg_variance4x4,
+ aom_highbd_sad4x4x4d_bits10, aom_highbd_dist_wtd_sad4x4_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4);
+
+ HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits10,
+ aom_highbd_sad128x128_avg_bits10,
+ aom_highbd_10_variance128x128,
+ aom_highbd_10_sub_pixel_variance128x128,
+ aom_highbd_10_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x4d_bits10,
+ aom_highbd_dist_wtd_sad128x128_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128);
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits10,
+ aom_highbd_sad128x64_avg_bits10,
+ aom_highbd_10_variance128x64,
+ aom_highbd_10_sub_pixel_variance128x64,
+ aom_highbd_10_sub_pixel_avg_variance128x64,
+ aom_highbd_sad128x64x4d_bits10,
+ aom_highbd_dist_wtd_sad128x64_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64);
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits10,
+ aom_highbd_sad64x128_avg_bits10,
+ aom_highbd_10_variance64x128,
+ aom_highbd_10_sub_pixel_variance64x128,
+ aom_highbd_10_sub_pixel_avg_variance64x128,
+ aom_highbd_sad64x128x4d_bits10,
+ aom_highbd_dist_wtd_sad64x128_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128);
+
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10,
+ aom_highbd_10_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits10,
+ aom_highbd_10_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits10,
+ aom_highbd_10_masked_sub_pixel_variance64x128)
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits10,
+ aom_highbd_10_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits10,
+ aom_highbd_10_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits10,
+ aom_highbd_10_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits10,
+ aom_highbd_10_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits10,
+ aom_highbd_10_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits10,
+ aom_highbd_10_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits10,
+ aom_highbd_10_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10,
+ aom_highbd_10_masked_sub_pixel_variance4x4)
+ HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance64x16)
+ HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x64)
+ HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits10,
+ aom_highbd_10_masked_sub_pixel_variance32x8)
+ HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits10,
+ aom_highbd_10_masked_sub_pixel_variance8x32)
+ HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x4)
+ HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance4x16)
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits10,
+ aom_highbd_10_obmc_variance128x128,
+ aom_highbd_10_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits10,
+ aom_highbd_10_obmc_variance128x64,
+ aom_highbd_10_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits10,
+ aom_highbd_10_obmc_variance64x128,
+ aom_highbd_10_obmc_sub_pixel_variance64x128)
+ HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits10,
+ aom_highbd_10_obmc_variance64x64,
+ aom_highbd_10_obmc_sub_pixel_variance64x64)
+ HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits10,
+ aom_highbd_10_obmc_variance64x32,
+ aom_highbd_10_obmc_sub_pixel_variance64x32)
+ HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits10,
+ aom_highbd_10_obmc_variance32x64,
+ aom_highbd_10_obmc_sub_pixel_variance32x64)
+ HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits10,
+ aom_highbd_10_obmc_variance32x32,
+ aom_highbd_10_obmc_sub_pixel_variance32x32)
+ HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits10,
+ aom_highbd_10_obmc_variance32x16,
+ aom_highbd_10_obmc_sub_pixel_variance32x16)
+ HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits10,
+ aom_highbd_10_obmc_variance16x32,
+ aom_highbd_10_obmc_sub_pixel_variance16x32)
+ HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits10,
+ aom_highbd_10_obmc_variance16x16,
+ aom_highbd_10_obmc_sub_pixel_variance16x16)
+ HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits10,
+ aom_highbd_10_obmc_variance8x16,
+ aom_highbd_10_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits10,
+ aom_highbd_10_obmc_variance16x8,
+ aom_highbd_10_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits10,
+ aom_highbd_10_obmc_variance8x8,
+ aom_highbd_10_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits10,
+ aom_highbd_10_obmc_variance4x8,
+ aom_highbd_10_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits10,
+ aom_highbd_10_obmc_variance8x4,
+ aom_highbd_10_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits10,
+ aom_highbd_10_obmc_variance4x4,
+ aom_highbd_10_obmc_sub_pixel_variance4x4)
+
+ HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits10,
+ aom_highbd_10_obmc_variance64x16,
+ aom_highbd_10_obmc_sub_pixel_variance64x16)
+
+ HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits10,
+ aom_highbd_10_obmc_variance16x64,
+ aom_highbd_10_obmc_sub_pixel_variance16x64)
+
+ HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits10,
+ aom_highbd_10_obmc_variance32x8,
+ aom_highbd_10_obmc_sub_pixel_variance32x8)
+
+ HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits10,
+ aom_highbd_10_obmc_variance8x32,
+ aom_highbd_10_obmc_sub_pixel_variance8x32)
+
+ HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits10,
+ aom_highbd_10_obmc_variance16x4,
+ aom_highbd_10_obmc_sub_pixel_variance16x4)
+
+ HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits10,
+ aom_highbd_10_obmc_variance4x16,
+ aom_highbd_10_obmc_sub_pixel_variance4x16)
+ break;
+
+ case AOM_BITS_12:
+ HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits12,
+ aom_highbd_sad64x16_avg_bits12, aom_highbd_12_variance64x16,
+ aom_highbd_12_sub_pixel_variance64x16,
+ aom_highbd_12_sub_pixel_avg_variance64x16,
+ aom_highbd_sad64x16x4d_bits12,
+ aom_highbd_dist_wtd_sad64x16_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16);
+
+ HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits12,
+ aom_highbd_sad16x64_avg_bits12, aom_highbd_12_variance16x64,
+ aom_highbd_12_sub_pixel_variance16x64,
+ aom_highbd_12_sub_pixel_avg_variance16x64,
+ aom_highbd_sad16x64x4d_bits12,
+ aom_highbd_dist_wtd_sad16x64_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64);
+
+ HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits12,
+ aom_highbd_sad32x8_avg_bits12, aom_highbd_12_variance32x8,
+ aom_highbd_12_sub_pixel_variance32x8,
+ aom_highbd_12_sub_pixel_avg_variance32x8,
+ aom_highbd_sad32x8x4d_bits12,
+ aom_highbd_dist_wtd_sad32x8_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8);
+
+ HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits12,
+ aom_highbd_sad8x32_avg_bits12, aom_highbd_12_variance8x32,
+ aom_highbd_12_sub_pixel_variance8x32,
+ aom_highbd_12_sub_pixel_avg_variance8x32,
+ aom_highbd_sad8x32x4d_bits12,
+ aom_highbd_dist_wtd_sad8x32_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32);
+
+ HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits12,
+ aom_highbd_sad16x4_avg_bits12, aom_highbd_12_variance16x4,
+ aom_highbd_12_sub_pixel_variance16x4,
+ aom_highbd_12_sub_pixel_avg_variance16x4,
+ aom_highbd_sad16x4x4d_bits12,
+ aom_highbd_dist_wtd_sad16x4_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4);
+
+ HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits12,
+ aom_highbd_sad4x16_avg_bits12, aom_highbd_12_variance4x16,
+ aom_highbd_12_sub_pixel_variance4x16,
+ aom_highbd_12_sub_pixel_avg_variance4x16,
+ aom_highbd_sad4x16x4d_bits12,
+ aom_highbd_dist_wtd_sad4x16_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16);
+
+ HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits12,
+ aom_highbd_sad32x16_avg_bits12, aom_highbd_12_variance32x16,
+ aom_highbd_12_sub_pixel_variance32x16,
+ aom_highbd_12_sub_pixel_avg_variance32x16,
+ aom_highbd_sad32x16x4d_bits12,
+ aom_highbd_dist_wtd_sad32x16_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16);
+
+ HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits12,
+ aom_highbd_sad16x32_avg_bits12, aom_highbd_12_variance16x32,
+ aom_highbd_12_sub_pixel_variance16x32,
+ aom_highbd_12_sub_pixel_avg_variance16x32,
+ aom_highbd_sad16x32x4d_bits12,
+ aom_highbd_dist_wtd_sad16x32_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32);
+
+ HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits12,
+ aom_highbd_sad64x32_avg_bits12, aom_highbd_12_variance64x32,
+ aom_highbd_12_sub_pixel_variance64x32,
+ aom_highbd_12_sub_pixel_avg_variance64x32,
+ aom_highbd_sad64x32x4d_bits12,
+ aom_highbd_dist_wtd_sad64x32_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32);
+
+ HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits12,
+ aom_highbd_sad32x64_avg_bits12, aom_highbd_12_variance32x64,
+ aom_highbd_12_sub_pixel_variance32x64,
+ aom_highbd_12_sub_pixel_avg_variance32x64,
+ aom_highbd_sad32x64x4d_bits12,
+ aom_highbd_dist_wtd_sad32x64_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64);
+
+ HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits12,
+ aom_highbd_sad32x32_avg_bits12, aom_highbd_12_variance32x32,
+ aom_highbd_12_sub_pixel_variance32x32,
+ aom_highbd_12_sub_pixel_avg_variance32x32,
+ aom_highbd_sad32x32x4d_bits12,
+ aom_highbd_dist_wtd_sad32x32_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32);
+
+ HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits12,
+ aom_highbd_sad64x64_avg_bits12, aom_highbd_12_variance64x64,
+ aom_highbd_12_sub_pixel_variance64x64,
+ aom_highbd_12_sub_pixel_avg_variance64x64,
+ aom_highbd_sad64x64x4d_bits12,
+ aom_highbd_dist_wtd_sad64x64_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64);
+
+ HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits12,
+ aom_highbd_sad16x16_avg_bits12, aom_highbd_12_variance16x16,
+ aom_highbd_12_sub_pixel_variance16x16,
+ aom_highbd_12_sub_pixel_avg_variance16x16,
+ aom_highbd_sad16x16x4d_bits12,
+ aom_highbd_dist_wtd_sad16x16_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16);
+
+ HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits12,
+ aom_highbd_sad16x8_avg_bits12, aom_highbd_12_variance16x8,
+ aom_highbd_12_sub_pixel_variance16x8,
+ aom_highbd_12_sub_pixel_avg_variance16x8,
+ aom_highbd_sad16x8x4d_bits12,
+ aom_highbd_dist_wtd_sad16x8_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8);
+
+ HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits12,
+ aom_highbd_sad8x16_avg_bits12, aom_highbd_12_variance8x16,
+ aom_highbd_12_sub_pixel_variance8x16,
+ aom_highbd_12_sub_pixel_avg_variance8x16,
+ aom_highbd_sad8x16x4d_bits12,
+ aom_highbd_dist_wtd_sad8x16_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16);
+
+ HIGHBD_BFP(
+ BLOCK_8X8, aom_highbd_sad8x8_bits12, aom_highbd_sad8x8_avg_bits12,
+ aom_highbd_12_variance8x8, aom_highbd_12_sub_pixel_variance8x8,
+ aom_highbd_12_sub_pixel_avg_variance8x8,
+ aom_highbd_sad8x8x4d_bits12, aom_highbd_dist_wtd_sad8x8_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8);
+
+ HIGHBD_BFP(
+ BLOCK_8X4, aom_highbd_sad8x4_bits12, aom_highbd_sad8x4_avg_bits12,
+ aom_highbd_12_variance8x4, aom_highbd_12_sub_pixel_variance8x4,
+ aom_highbd_12_sub_pixel_avg_variance8x4,
+ aom_highbd_sad8x4x4d_bits12, aom_highbd_dist_wtd_sad8x4_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4);
+
+ HIGHBD_BFP(
+ BLOCK_4X8, aom_highbd_sad4x8_bits12, aom_highbd_sad4x8_avg_bits12,
+ aom_highbd_12_variance4x8, aom_highbd_12_sub_pixel_variance4x8,
+ aom_highbd_12_sub_pixel_avg_variance4x8,
+ aom_highbd_sad4x8x4d_bits12, aom_highbd_dist_wtd_sad4x8_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8);
+
+ HIGHBD_BFP(
+ BLOCK_4X4, aom_highbd_sad4x4_bits12, aom_highbd_sad4x4_avg_bits12,
+ aom_highbd_12_variance4x4, aom_highbd_12_sub_pixel_variance4x4,
+ aom_highbd_12_sub_pixel_avg_variance4x4,
+ aom_highbd_sad4x4x4d_bits12, aom_highbd_dist_wtd_sad4x4_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4);
+
+ HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits12,
+ aom_highbd_sad128x128_avg_bits12,
+ aom_highbd_12_variance128x128,
+ aom_highbd_12_sub_pixel_variance128x128,
+ aom_highbd_12_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x4d_bits12,
+ aom_highbd_dist_wtd_sad128x128_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128);
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits12,
+ aom_highbd_sad128x64_avg_bits12,
+ aom_highbd_12_variance128x64,
+ aom_highbd_12_sub_pixel_variance128x64,
+ aom_highbd_12_sub_pixel_avg_variance128x64,
+ aom_highbd_sad128x64x4d_bits12,
+ aom_highbd_dist_wtd_sad128x64_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64);
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits12,
+ aom_highbd_sad64x128_avg_bits12,
+ aom_highbd_12_variance64x128,
+ aom_highbd_12_sub_pixel_variance64x128,
+ aom_highbd_12_sub_pixel_avg_variance64x128,
+ aom_highbd_sad64x128x4d_bits12,
+ aom_highbd_dist_wtd_sad64x128_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128);
+
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12,
+ aom_highbd_12_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits12,
+ aom_highbd_12_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits12,
+ aom_highbd_12_masked_sub_pixel_variance64x128)
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits12,
+ aom_highbd_12_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits12,
+ aom_highbd_12_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits12,
+ aom_highbd_12_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits12,
+ aom_highbd_12_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits12,
+ aom_highbd_12_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits12,
+ aom_highbd_12_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits12,
+ aom_highbd_12_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12,
+ aom_highbd_12_masked_sub_pixel_variance4x4)
+ HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance64x16)
+ HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x64)
+ HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits12,
+ aom_highbd_12_masked_sub_pixel_variance32x8)
+ HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits12,
+ aom_highbd_12_masked_sub_pixel_variance8x32)
+ HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x4)
+ HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance4x16)
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits12,
+ aom_highbd_12_obmc_variance128x128,
+ aom_highbd_12_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits12,
+ aom_highbd_12_obmc_variance128x64,
+ aom_highbd_12_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits12,
+ aom_highbd_12_obmc_variance64x128,
+ aom_highbd_12_obmc_sub_pixel_variance64x128)
+ HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits12,
+ aom_highbd_12_obmc_variance64x64,
+ aom_highbd_12_obmc_sub_pixel_variance64x64)
+ HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits12,
+ aom_highbd_12_obmc_variance64x32,
+ aom_highbd_12_obmc_sub_pixel_variance64x32)
+ HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits12,
+ aom_highbd_12_obmc_variance32x64,
+ aom_highbd_12_obmc_sub_pixel_variance32x64)
+ HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits12,
+ aom_highbd_12_obmc_variance32x32,
+ aom_highbd_12_obmc_sub_pixel_variance32x32)
+ HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits12,
+ aom_highbd_12_obmc_variance32x16,
+ aom_highbd_12_obmc_sub_pixel_variance32x16)
+ HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits12,
+ aom_highbd_12_obmc_variance16x32,
+ aom_highbd_12_obmc_sub_pixel_variance16x32)
+ HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits12,
+ aom_highbd_12_obmc_variance16x16,
+ aom_highbd_12_obmc_sub_pixel_variance16x16)
+ HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits12,
+ aom_highbd_12_obmc_variance8x16,
+ aom_highbd_12_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits12,
+ aom_highbd_12_obmc_variance16x8,
+ aom_highbd_12_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits12,
+ aom_highbd_12_obmc_variance8x8,
+ aom_highbd_12_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits12,
+ aom_highbd_12_obmc_variance4x8,
+ aom_highbd_12_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits12,
+ aom_highbd_12_obmc_variance8x4,
+ aom_highbd_12_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits12,
+ aom_highbd_12_obmc_variance4x4,
+ aom_highbd_12_obmc_sub_pixel_variance4x4)
+ HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits12,
+ aom_highbd_12_obmc_variance64x16,
+ aom_highbd_12_obmc_sub_pixel_variance64x16)
+ HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits12,
+ aom_highbd_12_obmc_variance16x64,
+ aom_highbd_12_obmc_sub_pixel_variance16x64)
+ HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits12,
+ aom_highbd_12_obmc_variance32x8,
+ aom_highbd_12_obmc_sub_pixel_variance32x8)
+ HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits12,
+ aom_highbd_12_obmc_variance8x32,
+ aom_highbd_12_obmc_sub_pixel_variance8x32)
+ HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits12,
+ aom_highbd_12_obmc_variance16x4,
+ aom_highbd_12_obmc_sub_pixel_variance16x4)
+ HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits12,
+ aom_highbd_12_obmc_variance4x16,
+ aom_highbd_12_obmc_sub_pixel_variance4x16)
+ break;
+
+ default:
+ assert(0 &&
+ "cm->seq_params.bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ }
+ }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+static void realloc_segmentation_maps(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ // Create the encoder segmentation map and set all entries to 0
+ aom_free(cpi->enc_seg.map);
+ CHECK_MEM_ERROR(cm, cpi->enc_seg.map,
+ aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
+
+ // Create a map used for cyclic background refresh.
+ if (cpi->cyclic_refresh) av1_cyclic_refresh_free(cpi->cyclic_refresh);
+ CHECK_MEM_ERROR(
+ cm, cpi->cyclic_refresh,
+ av1_cyclic_refresh_alloc(mi_params->mi_rows, mi_params->mi_cols));
+
+ // Create a map used to mark inactive areas.
+ aom_free(cpi->active_map.map);
+ CHECK_MEM_ERROR(cm, cpi->active_map.map,
+ aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
+}
+
+static AOM_INLINE void set_tpl_stats_block_size(int width, int height,
+ uint8_t *block_mis_log2) {
+ const int is_720p_or_larger = AOMMIN(width, height) >= 720;
+
+ // 0: 4x4, 1: 8x8, 2: 16x16
+ *block_mis_log2 = is_720p_or_larger ? 2 : 1;
+}
+
+void av1_alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
+ CompoundTypeRdBuffers *const bufs) {
+ CHECK_MEM_ERROR(
+ cm, bufs->pred0,
+ (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)));
+ CHECK_MEM_ERROR(
+ cm, bufs->pred1,
+ (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)));
+ CHECK_MEM_ERROR(
+ cm, bufs->residual1,
+ (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)));
+ CHECK_MEM_ERROR(
+ cm, bufs->diff10,
+ (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)));
+ CHECK_MEM_ERROR(cm, bufs->tmp_best_mask_buf,
+ (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE *
+ sizeof(*bufs->tmp_best_mask_buf)));
+}
+
+void av1_release_compound_type_rd_buffers(CompoundTypeRdBuffers *const bufs) {
+ aom_free(bufs->pred0);
+ aom_free(bufs->pred1);
+ aom_free(bufs->residual1);
+ aom_free(bufs->diff10);
+ aom_free(bufs->tmp_best_mask_buf);
+ av1_zero(*bufs);  // Set all pointers to NULL for safety.
+}
+
+static void config_target_level(AV1_COMP *const cpi, AV1_LEVEL target_level,
+ int tier) {
+ aom_clear_system_state();
+
+ AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ SequenceHeader *const seq_params = &cpi->common.seq_params;
+
+ // Adjust target bitrate to be no larger than 70% of level limit.
+ const BITSTREAM_PROFILE profile = seq_params->profile;
+ const double level_bitrate_limit =
+ av1_get_max_bitrate_for_level(target_level, tier, profile);
+ const int64_t max_bitrate = (int64_t)(level_bitrate_limit * 0.70);
+ oxcf->target_bandwidth = AOMMIN(oxcf->target_bandwidth, max_bitrate);
+ // Also need to update cpi->twopass.bits_left.
+ TWO_PASS *const twopass = &cpi->twopass;
+ FIRSTPASS_STATS *stats = twopass->stats_buf_ctx->total_stats;
+ if (stats != NULL)
+ cpi->twopass.bits_left =
+ (int64_t)(stats->duration * cpi->oxcf.target_bandwidth / 10000000.0);
+
+ // Adjust max over-shoot percentage.
+ oxcf->over_shoot_pct = 0;
+
+ // Adjust max quantizer.
+ oxcf->worst_allowed_q = 255;
+
+ // Adjust number of tiles and tile columns to be under level limit.
+ int max_tiles, max_tile_cols;
+ av1_get_max_tiles_for_level(target_level, &max_tiles, &max_tile_cols);
+ while (oxcf->tile_columns > 0 && (1 << oxcf->tile_columns) > max_tile_cols) {
+ --oxcf->tile_columns;
+ }
+ const int tile_cols = (1 << oxcf->tile_columns);
+ while (oxcf->tile_rows > 0 &&
+ tile_cols * (1 << oxcf->tile_rows) > max_tiles) {
+ --oxcf->tile_rows;
+ }
+
+ // Adjust min compression ratio.
+ const int still_picture = seq_params->still_picture;
+ const double min_cr =
+ av1_get_min_cr_for_level(target_level, tier, still_picture);
+ oxcf->min_cr = AOMMAX(oxcf->min_cr, (unsigned int)(min_cr * 100));
+}
+
+void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = &cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ RATE_CONTROL *const rc = &cpi->rc;
+ MACROBLOCK *const x = &cpi->td.mb;
+ AV1LevelParams *const level_params = &cpi->level_params;
+
+ if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile;
+ seq_params->bit_depth = oxcf->bit_depth;
+ seq_params->color_primaries = oxcf->color_primaries;
+ seq_params->transfer_characteristics = oxcf->transfer_characteristics;
+ seq_params->matrix_coefficients = oxcf->matrix_coefficients;
+ seq_params->monochrome = oxcf->monochrome;
+ seq_params->chroma_sample_position = oxcf->chroma_sample_position;
+ seq_params->color_range = oxcf->color_range;
+
+ assert(IMPLIES(seq_params->profile <= PROFILE_1,
+ seq_params->bit_depth <= AOM_BITS_10));
+
+ seq_params->timing_info_present = oxcf->timing_info_present;
+ seq_params->timing_info.num_units_in_display_tick =
+ oxcf->timing_info.num_units_in_display_tick;
+ seq_params->timing_info.time_scale = oxcf->timing_info.time_scale;
+ seq_params->timing_info.equal_picture_interval =
+ oxcf->timing_info.equal_picture_interval;
+ seq_params->timing_info.num_ticks_per_picture =
+ oxcf->timing_info.num_ticks_per_picture;
+
+ seq_params->display_model_info_present_flag =
+ oxcf->display_model_info_present_flag;
+ seq_params->decoder_model_info_present_flag =
+ oxcf->decoder_model_info_present_flag;
+ if (oxcf->decoder_model_info_present_flag) {
+ // set the decoder model parameters in schedule mode
+ seq_params->decoder_model_info.num_units_in_decoding_tick =
+ oxcf->buffer_model.num_units_in_decoding_tick;
+ cm->buffer_removal_time_present = 1;
+ av1_set_aom_dec_model_info(&seq_params->decoder_model_info);
+ av1_set_dec_model_op_parameters(&seq_params->op_params[0]);
+ } else if (seq_params->timing_info_present &&
+ seq_params->timing_info.equal_picture_interval &&
+ !seq_params->decoder_model_info_present_flag) {
+ // set the decoder model parameters in resource availability mode
+ av1_set_resource_availability_parameters(&seq_params->op_params[0]);
+ } else {
+ seq_params->op_params[0].initial_display_delay =
+ 10;  // Default value (not signaled)
+ }
+
+ update_film_grain_parameters(cpi, oxcf);
+
+ cpi->oxcf = *oxcf;
+ cpi->superres_mode = oxcf->superres_mode;  // default
+ x->e_mbd.bd = (int)seq_params->bit_depth;
+ x->e_mbd.global_motion = cm->global_motion;
+
+ memcpy(level_params->target_seq_level_idx, cpi->oxcf.target_seq_level_idx,
+ sizeof(level_params->target_seq_level_idx));
+ level_params->keep_level_stats = 0;
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+ if (level_params->target_seq_level_idx[i] <= SEQ_LEVELS) {
+ level_params->keep_level_stats |= 1u << i;
+ if (!level_params->level_info[i]) {
+ CHECK_MEM_ERROR(cm, level_params->level_info[i],
+ aom_calloc(1, sizeof(*level_params->level_info[i])));
+ }
+ }
+ }
+
+ // TODO(huisu@): level targeting currently only works for the 0th operating
+ // point, so scalable coding is not supported yet.
+ if (level_params->target_seq_level_idx[0] < SEQ_LEVELS) {
+ // Adjust encoder config in order to meet target level.
+ config_target_level(cpi, level_params->target_seq_level_idx[0],
+ seq_params->tier[0]);
+ }
+
+ if ((has_no_stats_stage(cpi)) && (oxcf->rc_mode == AOM_Q)) {
+ rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+ } else {
+ rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+ }
+
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+
+ cm->features.refresh_frame_context = (oxcf->frame_parallel_decoding_mode)
+ ? REFRESH_FRAME_CONTEXT_DISABLED
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ if (oxcf->large_scale_tile)
+ cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+ if (x->palette_buffer == NULL) {
+ CHECK_MEM_ERROR(cm, x->palette_buffer,
+ aom_memalign(16, sizeof(*x->palette_buffer)));
+ }
+
+ if (x->comp_rd_buffer.pred0 == NULL) {
+ av1_alloc_compound_type_rd_buffers(cm, &x->comp_rd_buffer);
+ }
+
+ if (x->tmp_conv_dst == NULL) {
+ CHECK_MEM_ERROR(
+ cm, x->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst)));
+ x->e_mbd.tmp_conv_dst = x->tmp_conv_dst;
+ }
+ for (int i = 0; i < 2; ++i) {
+ if (x->tmp_obmc_bufs[i] == NULL) {
+ CHECK_MEM_ERROR(cm, x->tmp_obmc_bufs[i],
+ aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*x->tmp_obmc_bufs[i])));
+ x->e_mbd.tmp_obmc_bufs[i] = x->tmp_obmc_bufs[i];
+ }
+ }
+
+ av1_reset_segment_features(cm);
+
+ av1_set_high_precision_mv(cpi, 1, 0);
+
+ set_rc_buffer_sizes(rc, &cpi->oxcf);
+
+ // Under a configuration change, where maximum_buffer_size may change,
+ // keep buffer level clipped to the maximum allowed buffer size.
+ rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size);
+ rc->buffer_level = AOMMIN(rc->buffer_level, rc->maximum_buffer_size);
+
+ // Set up frame rate and related parameters rate control values.
+ av1_new_framerate(cpi, cpi->framerate);
+
+ // Set absolute upper and lower quality limits
+ rc->worst_quality = cpi->oxcf.worst_allowed_q;
+ rc->best_quality = cpi->oxcf.best_allowed_q;
+
+ cm->features.interp_filter =
+ oxcf->large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
+ cm->features.switchable_motion_mode = 1;
+
+ if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) {
+ cm->render_width = cpi->oxcf.render_width;
+ cm->render_height = cpi->oxcf.render_height;
+ } else {
+ cm->render_width = cpi->oxcf.width;
+ cm->render_height = cpi->oxcf.height;
+ }
+ cm->width = cpi->oxcf.width;
+ cm->height = cpi->oxcf.height;
+
+ int sb_size = seq_params->sb_size;
+ // Superblock size should not be updated after the first key frame.
+ if (!cpi->seq_params_locked) {
+ set_sb_size(&cm->seq_params, select_sb_size(cpi));
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i)
+ seq_params->tier[i] = (oxcf->tier_mask >> i) & 1;
+ }
+
+ if (cpi->initial_width || sb_size != seq_params->sb_size) {
+ if (cm->width > cpi->initial_width || cm->height > cpi->initial_height ||
+ seq_params->sb_size != sb_size) {
+ av1_free_context_buffers(cm);
+ av1_free_pc_tree(cpi, &cpi->td, num_planes, (BLOCK_SIZE)sb_size);
+ alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->initial_width = cpi->initial_height = 0;
+ }
+ }
+ update_frame_size(cpi);
+
+ rc->is_src_frame_alt_ref = 0;
+
+ set_tile_info(cpi);
+
+ if (!cpi->svc.external_ref_frame_config)
+ cpi->ext_flags.refresh_frame_flags_pending = 0;
+ cpi->ext_flags.refresh_frame_context_pending = 0;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ highbd_set_var_fns(cpi);
+#endif
+
+ // Init sequence level coding tools
+ // This should not be called after the first key frame.
+ if (!cpi->seq_params_locked) {
+ seq_params->operating_points_cnt_minus_1 =
+ (cm->number_spatial_layers > 1 || cm->number_temporal_layers > 1)
+ ? cm->number_spatial_layers * cm->number_temporal_layers - 1
+ : 0;
+ init_seq_coding_tools(&cm->seq_params, cm, oxcf, cpi->use_svc);
+ }
+
+ if (cpi->use_svc)
+ av1_update_layer_context_change_config(cpi, oxcf->target_bandwidth);
+}
+
+static INLINE void setup_tpl_buffers(AV1_COMMON *const cm,
+ TplParams *const tpl_data) {
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+ set_tpl_stats_block_size(cm->width, cm->height,
+ &tpl_data->tpl_stats_block_mis_log2);
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+
+ for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) {
+ const int mi_cols =
+ ALIGN_POWER_OF_TWO(mi_params->mi_cols, MAX_MIB_SIZE_LOG2);
+ const int mi_rows =
+ ALIGN_POWER_OF_TWO(mi_params->mi_rows, MAX_MIB_SIZE_LOG2);
+
+ tpl_data->tpl_stats_buffer[frame].is_valid = 0;
+ tpl_data->tpl_stats_buffer[frame].width = mi_cols >> block_mis_log2;
+ tpl_data->tpl_stats_buffer[frame].height = mi_rows >> block_mis_log2;
+ tpl_data->tpl_stats_buffer[frame].stride =
+ tpl_data->tpl_stats_buffer[frame].width;
+ tpl_data->tpl_stats_buffer[frame].mi_rows = mi_params->mi_rows;
+ tpl_data->tpl_stats_buffer[frame].mi_cols = mi_params->mi_cols;
+ }
+
+ for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
+ CHECK_MEM_ERROR(
+ cm, tpl_data->tpl_stats_pool[frame],
+ aom_calloc(tpl_data->tpl_stats_buffer[frame].width *
+ tpl_data->tpl_stats_buffer[frame].height,
+ sizeof(*tpl_data->tpl_stats_buffer[frame].tpl_stats_ptr)));
+ if (aom_alloc_frame_buffer(
+ &tpl_data->tpl_rec_pool[frame], cm->width, cm->height,
+ cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
+ cm->seq_params.use_highbitdepth, AOM_ENC_NO_SCALE_BORDER,
+ cm->features.byte_alignment))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+
+ tpl_data->tpl_frame = &tpl_data->tpl_stats_buffer[REF_FRAMES + 1];
+}
+
+static INLINE void init_frame_info(FRAME_INFO *frame_info,
+ const AV1_COMMON *const cm) {
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ frame_info->frame_width = cm->width;
+ frame_info->frame_height = cm->height;
+ frame_info->mi_cols = mi_params->mi_cols;
+ frame_info->mi_rows = mi_params->mi_rows;
+ frame_info->mb_cols = mi_params->mb_cols;
+ frame_info->mb_rows = mi_params->mb_rows;
+ frame_info->num_mbs = mi_params->MBs;
+ frame_info->bit_depth = seq_params->bit_depth;
+ frame_info->subsampling_x = seq_params->subsampling_x;
+ frame_info->subsampling_y = seq_params->subsampling_y;
+}
+
+AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, BufferPool *const pool,
+ FIRSTPASS_STATS *frame_stats_buf,
+ COMPRESSOR_STAGE stage, int num_lap_buffers,
+ int lap_lag_in_frames,
+ STATS_BUFFER_CTX *stats_buf_context) {
+ AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP));
+ AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
+
+ if (!cm) return NULL;
+
+ av1_zero(*cpi);
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(cm->error.jmp)) {
+ cm->error.setjmp = 0;
+ av1_remove_compressor(cpi);
+ return 0;
+ }
+
+ cm->error.setjmp = 1;
+ cpi->lap_enabled = num_lap_buffers > 0;
+ cpi->compressor_stage = stage;
+
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+ mi_params->free_mi = enc_free_mi;
+ mi_params->setup_mi = enc_setup_mi;
+ mi_params->set_mb_mi = (oxcf->pass == 1 || cpi->compressor_stage == LAP_STAGE)
+ ? stat_stage_set_mb_mi
+ : enc_set_mb_mi;
+
+ mi_params->mi_alloc_bsize = BLOCK_4X4;
+
+ CHECK_MEM_ERROR(cm, cm->fc,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(
+ cm, cm->default_frame_context,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context)));
+ memset(cm->fc, 0, sizeof(*cm->fc));
+ memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context));
+
+ cpi->common.buffer_pool = pool;
+
+ init_config(cpi, oxcf);
+ if (cpi->compressor_stage == LAP_STAGE) {
+ cpi->oxcf.lag_in_frames = lap_lag_in_frames;
+ }
+
+ av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
+
+ cpi->rc.enable_scenecut_detection = 1;
+ if (cpi->lap_enabled &&
+ (num_lap_buffers < (MAX_GF_LENGTH_LAP + SCENE_CUT_KEY_TEST_INTERVAL + 1)))
+ cpi->rc.enable_scenecut_detection = 0;
+ init_frame_info(&cpi->frame_info, cm);
+
+ cm->current_frame.frame_number = 0;
+ cm->current_frame_id = -1;
+ cpi->seq_params_locked = 0;
+ cpi->partition_search_skippable_frame = 0;
+ cpi->tile_data = NULL;
+ cpi->last_show_frame_buf = NULL;
+ realloc_segmentation_maps(cpi);
+
+ cpi->refresh_alt_ref_frame = 0;
+
+ cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+#if CONFIG_INTERNAL_STATS
+ cpi->b_calculate_blockiness = 1;
+ cpi->b_calculate_consistency = 1;
+ cpi->total_inconsistency = 0;
+ cpi->psnr.worst = 100.0;
+ cpi->worst_ssim = 100.0;
+
+ cpi->count = 0;
+ cpi->bytes = 0;
+#if CONFIG_SPEED_STATS
+ cpi->tx_search_count = 0;
+#endif  // CONFIG_SPEED_STATS
+
+ if (cpi->b_calculate_psnr) {
+ cpi->total_sq_error = 0;
+ cpi->total_samples = 0;
+ cpi->tot_recode_hits = 0;
+ cpi->summed_quality = 0;
+ cpi->summed_weights = 0;
+ }
+
+ cpi->fastssim.worst = 100.0;
+ cpi->psnrhvs.worst = 100.0;
+
+ if (cpi->b_calculate_blockiness) {
+ cpi->total_blockiness = 0;
+ cpi->worst_blockiness = 0.0;
+ }
+
+ if (cpi->b_calculate_consistency) {
+ CHECK_MEM_ERROR(
+ cm, cpi->ssim_vars,
+ aom_malloc(sizeof(*cpi->ssim_vars) * 4 * cpi->common.mi_params.mi_rows *
+ cpi->common.mi_params.mi_cols));
+ cpi->worst_consistency = 100.0;
+ }
+#endif
+#if CONFIG_ENTROPY_STATS
+ av1_zero(aggregate_fc);
+#endif  // CONFIG_ENTROPY_STATS
+
+ cpi->time_stamps.first_ever = INT64_MAX;
+
+#ifdef OUTPUT_YUV_SKINMAP
+ yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+#endif
+#ifdef OUTPUT_YUV_REC
+ yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+
+ assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS);
+ int size = get_stats_buf_size(num_lap_buffers, MAX_LAG_BUFFERS);
+ for (int i = 0; i < size; i++)
+ cpi->twopass.frame_stats_arr[i] = &frame_stats_buf[i];
+
+ cpi->twopass.stats_buf_ctx = stats_buf_context;
+ cpi->twopass.stats_in = cpi->twopass.stats_buf_ctx->stats_in_start;
+
+#if !CONFIG_REALTIME_ONLY
+ if (is_stat_consumption_stage(cpi)) {
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
+
+ if (!cpi->lap_enabled) {
+ /*Re-initialize to stats buffer, populated by application in the case of
+ * two pass*/
+ cpi->twopass.stats_buf_ctx->stats_in_start = oxcf->two_pass_stats_in.buf;
+ cpi->twopass.stats_in = cpi->twopass.stats_buf_ctx->stats_in_start;
+ cpi->twopass.stats_buf_ctx->stats_in_end =
+ &cpi->twopass.stats_buf_ctx->stats_in_start[packets - 1];
+
+ av1_init_second_pass(cpi);
+ } else {
+ av1_init_single_pass_lap(cpi);
+ }
+ }
+#endif
+
+ int sb_mi_size = av1_get_sb_mi_size(cm);
+
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.above_pred_buf,
+ (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*cpi->td.mb.above_pred_buf)));
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.left_pred_buf,
+ (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*cpi->td.mb.left_pred_buf)));
+
+ CHECK_MEM_ERROR(cm, cpi->td.mb.wsrc_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf)));
+
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.inter_modes_info,
+ (InterModesInfo *)aom_malloc(sizeof(*cpi->td.mb.inter_modes_info)));
+
+ for (int x = 0; x < 2; x++)
+ for (int y = 0; y < 2; y++)
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y],
+ (uint32_t *)aom_malloc(
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0])));
+
+ cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0;
+
+ CHECK_MEM_ERROR(cm, cpi->td.mb.mask_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf)));
+
+ CHECK_MEM_ERROR(cm, cpi->td.mb.mbmi_ext,
+ aom_calloc(sb_mi_size, sizeof(*cpi->td.mb.mbmi_ext)));
+
+ av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
+ av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
+
+ {
+ const int bsize = BLOCK_16X16;
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ const int num_cols = (mi_params->mi_cols + w - 1) / w;
+ const int num_rows = (mi_params->mi_rows + h - 1) / h;
+ CHECK_MEM_ERROR(cm, cpi->tpl_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->tpl_rdmult_scaling_factors)));
+ CHECK_MEM_ERROR(cm, cpi->tpl_sb_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->tpl_sb_rdmult_scaling_factors)));
+ }
+
+ {
+ const int bsize = BLOCK_16X16;
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ const int num_cols = (mi_params->mi_cols + w - 1) / w;
+ const int num_rows = (mi_params->mi_rows + h - 1) / h;
+ CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->ssim_rdmult_scaling_factors)));
+ }
+
+#if CONFIG_TUNE_VMAF
+ {
+ const int bsize = BLOCK_64X64;
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ const int num_cols = (mi_params->mi_cols + w - 1) / w;
+ const int num_rows = (mi_params->mi_rows + h - 1) / h;
+ CHECK_MEM_ERROR(cm, cpi->vmaf_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->vmaf_rdmult_scaling_factors)));
+ cpi->last_frame_unsharp_amount = 0.0;
+ }
+#endif
+
+ if (!is_stat_generation_stage(cpi)) {
+ setup_tpl_buffers(cm, &cpi->tpl_data);
+ }
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+ av1_zero(cpi->partition_stats);
+#endif
+
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF; \
+ cpi->fn_ptr[BT].jsdaf = JSDAF; \
+ cpi->fn_ptr[BT].jsvaf = JSVAF;
+
+ BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16,
+ aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16,
+ aom_sad4x16x4d, aom_dist_wtd_sad4x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance4x16)
+
+ BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4,
+ aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4,
+ aom_sad16x4x4d, aom_dist_wtd_sad16x4_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x4)
+
+ BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32,
+ aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32,
+ aom_sad8x32x4d, aom_dist_wtd_sad8x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x32)
+
+ BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8,
+ aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8,
+ aom_sad32x8x4d, aom_dist_wtd_sad32x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x8)
+
+ BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64,
+ aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64,
+ aom_sad16x64x4d, aom_dist_wtd_sad16x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x64)
+
+ BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16,
+ aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16,
+ aom_sad64x16x4d, aom_dist_wtd_sad64x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x16)
+
+ BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
+ aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
+ aom_sad128x128x4d, aom_dist_wtd_sad128x128_avg,
+ aom_dist_wtd_sub_pixel_avg_variance128x128)
+
+ BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
+ aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64,
+ aom_sad128x64x4d, aom_dist_wtd_sad128x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance128x64)
+
+ BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
+ aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128,
+ aom_sad64x128x4d, aom_dist_wtd_sad64x128_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x128)
+
+ BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
+ aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16,
+ aom_sad32x16x4d, aom_dist_wtd_sad32x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x16)
+
+ BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32,
+ aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32,
+ aom_sad16x32x4d, aom_dist_wtd_sad16x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x32)
+
+ BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32,
+ aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32,
+ aom_sad64x32x4d, aom_dist_wtd_sad64x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x32)
+
+ BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64,
+ aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64,
+ aom_sad32x64x4d, aom_dist_wtd_sad32x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x64)
+
+ BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32,
+ aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32,
+ aom_sad32x32x4d, aom_dist_wtd_sad32x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x32)
+
+ BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64,
+ aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64,
+ aom_sad64x64x4d, aom_dist_wtd_sad64x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x64)
+
+ BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16,
+ aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16,
+ aom_sad16x16x4d, aom_dist_wtd_sad16x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x16)
+
+ BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8,
+ aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8,
+ aom_sad16x8x4d, aom_dist_wtd_sad16x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x8)
+
+ BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16,
+ aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16,
+ aom_sad8x16x4d, aom_dist_wtd_sad8x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x16)
+
+ BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8,
+ aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d,
+ aom_dist_wtd_sad8x8_avg, aom_dist_wtd_sub_pixel_avg_variance8x8)
+
+ BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4,
+ aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d,
+ aom_dist_wtd_sad8x4_avg, aom_dist_wtd_sub_pixel_avg_variance8x4)
+
+ BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8,
+ aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d,
+ aom_dist_wtd_sad4x8_avg, aom_dist_wtd_sub_pixel_avg_variance4x8)
+
+ BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4,
+ aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d,
+ aom_dist_wtd_sad4x4_avg, aom_dist_wtd_sub_pixel_avg_variance4x4)
+
+#define OBFP(BT, OSDF, OVF, OSVF) \
+ cpi->fn_ptr[BT].osdf = OSDF; \
+ cpi->fn_ptr[BT].ovf = OVF; \
+ cpi->fn_ptr[BT].osvf = OSVF;
+
+ OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128,
+ aom_obmc_sub_pixel_variance128x128)
+ OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64,
+ aom_obmc_sub_pixel_variance128x64)
+ OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128,
+ aom_obmc_sub_pixel_variance64x128)
+ OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64,
+ aom_obmc_sub_pixel_variance64x64)
+ OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32,
+ aom_obmc_sub_pixel_variance64x32)
+ OBFP(BLOCK_32X64, aom_obmc_sad32x64, aom_obmc_variance32x64,
+ aom_obmc_sub_pixel_variance32x64)
+ OBFP(BLOCK_32X32, aom_obmc_sad32x32, aom_obmc_variance32x32,
+ aom_obmc_sub_pixel_variance32x32)
+ OBFP(BLOCK_32X16, aom_obmc_sad32x16, aom_obmc_variance32x16,
+ aom_obmc_sub_pixel_variance32x16)
+ OBFP(BLOCK_16X32, aom_obmc_sad16x32, aom_obmc_variance16x32,
+ aom_obmc_sub_pixel_variance16x32)
+ OBFP(BLOCK_16X16, aom_obmc_sad16x16, aom_obmc_variance16x16,
+ aom_obmc_sub_pixel_variance16x16)
+ OBFP(BLOCK_16X8, aom_obmc_sad16x8, aom_obmc_variance16x8,
+ aom_obmc_sub_pixel_variance16x8)
+ OBFP(BLOCK_8X16, aom_obmc_sad8x16, aom_obmc_variance8x16,
+ aom_obmc_sub_pixel_variance8x16)
+ OBFP(BLOCK_8X8, aom_obmc_sad8x8, aom_obmc_variance8x8,
+ aom_obmc_sub_pixel_variance8x8)
+ OBFP(BLOCK_4X8, aom_obmc_sad4x8, aom_obmc_variance4x8,
+ aom_obmc_sub_pixel_variance4x8)
+ OBFP(BLOCK_8X4, aom_obmc_sad8x4, aom_obmc_variance8x4,
+ aom_obmc_sub_pixel_variance8x4)
+ OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4,
+ aom_obmc_sub_pixel_variance4x4)
+ OBFP(BLOCK_4X16, aom_obmc_sad4x16, aom_obmc_variance4x16,
+ aom_obmc_sub_pixel_variance4x16)
+ OBFP(BLOCK_16X4, aom_obmc_sad16x4, aom_obmc_variance16x4,
+ aom_obmc_sub_pixel_variance16x4)
+ OBFP(BLOCK_8X32, aom_obmc_sad8x32, aom_obmc_variance8x32,
+ aom_obmc_sub_pixel_variance8x32)
+ OBFP(BLOCK_32X8, aom_obmc_sad32x8, aom_obmc_variance32x8,
+ aom_obmc_sub_pixel_variance32x8)
+ OBFP(BLOCK_16X64, aom_obmc_sad16x64, aom_obmc_variance16x64,
+ aom_obmc_sub_pixel_variance16x64)
+ OBFP(BLOCK_64X16, aom_obmc_sad64x16, aom_obmc_variance64x16,
+ aom_obmc_sub_pixel_variance64x16)
+
+#define MBFP(BT, MCSDF, MCSVF) \
+ cpi->fn_ptr[BT].msdf = MCSDF; \
+ cpi->fn_ptr[BT].msvf = MCSVF;
+
+ MBFP(BLOCK_128X128, aom_masked_sad128x128,
+ aom_masked_sub_pixel_variance128x128)
+ MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_sub_pixel_variance128x64)
+ MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_sub_pixel_variance64x128)
+ MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_sub_pixel_variance64x64)
+ MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_sub_pixel_variance64x32)
+ MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_sub_pixel_variance32x64)
+ MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_sub_pixel_variance32x32)
+ MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_sub_pixel_variance32x16)
+ MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_sub_pixel_variance16x32)
+ MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_sub_pixel_variance16x16)
+ MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_sub_pixel_variance16x8)
+ MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_sub_pixel_variance8x16)
+ MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_sub_pixel_variance8x8)
+ MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_sub_pixel_variance4x8)
+ MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4)
+ MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4)
+
+ MBFP(BLOCK_4X16, aom_masked_sad4x16, aom_masked_sub_pixel_variance4x16)
+
+ MBFP(BLOCK_16X4, aom_masked_sad16x4, aom_masked_sub_pixel_variance16x4)
+
+ MBFP(BLOCK_8X32, aom_masked_sad8x32, aom_masked_sub_pixel_variance8x32)
+
+ MBFP(BLOCK_32X8, aom_masked_sad32x8, aom_masked_sub_pixel_variance32x8)
+
+ MBFP(BLOCK_16X64, aom_masked_sad16x64, aom_masked_sub_pixel_variance16x64)
+
+ MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ highbd_set_var_fns(cpi);
+#endif
+
+ /* av1_init_quantizer() is first called here. Add check in
+ * av1_frame_init_quantizer() so that av1_init_quantizer is only
+ * called later when needed. This will avoid unnecessary calls of
+ * av1_init_quantizer() for every frame.
+ */
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params.bit_depth);
+ av1_qm_init(&cm->quant_params, av1_num_planes(cm));
+
+ av1_loop_filter_init(cm);
+ cm->superres_scale_denominator = SCALE_NUMERATOR;
+ cm->superres_upscaled_width = oxcf->width;
+ cm->superres_upscaled_height = oxcf->height;
+ av1_loop_restoration_precal();
+
+ cm->error.setjmp = 0;
+
+ return cpi;
+}
+
+#if CONFIG_INTERNAL_STATS
+#define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
+
+#define SNPRINT2(H, T, V) \
+ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
+#endif  // CONFIG_INTERNAL_STATS
+
+void av1_remove_compressor(AV1_COMP *cpi) {
+ AV1_COMMON *cm;
+ TplParams *const tpl_data = &cpi->tpl_data;
+ int t;
+
+ if (!cpi) return;
+
+ cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ if (cm->current_frame.frame_number > 0) {
+#if CONFIG_ENTROPY_STATS
+ if (!is_stat_generation_stage(cpi)) {
+ fprintf(stderr, "Writing counts.stt\n");
+ FILE *f = fopen("counts.stt", "wb");
+ fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f);
+ fclose(f);
+ }
+#endif  // CONFIG_ENTROPY_STATS
+#if CONFIG_INTERNAL_STATS
+ aom_clear_system_state();
+
+ if (!is_stat_generation_stage(cpi)) {
+ char headings[512] = { 0 };
+ char results[512] = { 0 };
+ FILE *f = fopen("opsnr.stt", "a");
+ double time_encoded =
+ (cpi->time_stamps.prev_end_seen - cpi->time_stamps.first_ever) /
+ 10000000.000;
+ double total_encode_time =
+ (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
+ const double dr =
+ (double)cpi->bytes * (double)8 / (double)1000 / time_encoded;
+ const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+ const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
+ const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
+
+ if (cpi->b_calculate_psnr) {
+ const double total_psnr = aom_sse_to_psnr(
+ (double)cpi->total_samples, peak, (double)cpi->total_sq_error);
+ const double total_ssim =
+ 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
+ snprintf(headings, sizeof(headings),
+ "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+ "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+ "WstPsnr\tWstSsim\tWstFast\tWstHVS\t"
+ "AVPsrnY\tAPsnrCb\tAPsnrCr");
+ snprintf(results, sizeof(results),
+ "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f",
+ dr, cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr,
+ cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr, total_ssim,
+ total_ssim, cpi->fastssim.stat[STAT_ALL] / cpi->count,
+ cpi->psnrhvs.stat[STAT_ALL] / cpi->count, cpi->psnr.worst,
+ cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst,
+ cpi->psnr.stat[STAT_Y] / cpi->count,
+ cpi->psnr.stat[STAT_U] / cpi->count,
+ cpi->psnr.stat[STAT_V] / cpi->count);
+
+ if (cpi->b_calculate_blockiness) {
+ SNPRINT(headings, "\t Block\tWstBlck");
+ SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count);
+ SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness);
+ }
+
+ if (cpi->b_calculate_consistency) {
+ double consistency =
+ aom_sse_to_psnr((double)cpi->total_samples, peak,
+ (double)cpi->total_inconsistency);
+
+ SNPRINT(headings, "\tConsist\tWstCons");
+ SNPRINT2(results, "\t%7.3f", consistency);
+ SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
+ }
+
+ SNPRINT(headings, "\t Time\tRcErr\tAbsErr");
+ SNPRINT2(results, "\t%8.0f", total_encode_time);
+ SNPRINT2(results, "\t%7.2f", rate_err);
+ SNPRINT2(results, "\t%7.2f", fabs(rate_err));
+
+ fprintf(f, "%s\tAPsnr611\n", headings);
+ fprintf(f, "%s\t%7.3f\n", results,
+ (6 * cpi->psnr.stat[STAT_Y] + cpi->psnr.stat[STAT_U] +
+ cpi->psnr.stat[STAT_V]) /
+ (cpi->count * 8));
+ }
+
+ fclose(f);
+ }
+#endif  // CONFIG_INTERNAL_STATS
+#if CONFIG_SPEED_STATS
+ if (!is_stat_generation_stage(cpi)) {
+ fprintf(stdout, "tx_search_count = %d\n", cpi->tx_search_count);
+ }
+#endif  // CONFIG_SPEED_STATS
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+ if (!is_stat_generation_stage(cpi)) {
+ av1_print_partition_stats(&cpi->partition_stats);
+ }
+#endif
+ }
+
+ for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
+ aom_free(tpl_data->tpl_stats_pool[frame]);
+ aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]);
+ }
+
+ for (t = cpi->num_workers - 1; t >= 0; --t) {
+ AVxWorker *const worker = &cpi->workers[t];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
+
+ // Deallocate allocated threads.
+ aom_get_worker_interface()->end(worker);
+
+ // Deallocate allocated thread data.
+ aom_free(thread_data->td->tctx);
+ if (t > 0) {
+ aom_free(thread_data->td->palette_buffer);
+ aom_free(thread_data->td->tmp_conv_dst);
+ av1_release_compound_type_rd_buffers(&thread_data->td->comp_rd_buffer);
+ for (int j = 0; j < 2; ++j) {
+ aom_free(thread_data->td->tmp_obmc_bufs[j]);
+ }
+ aom_free(thread_data->td->above_pred_buf);
+ aom_free(thread_data->td->left_pred_buf);
+ aom_free(thread_data->td->wsrc_buf);
+ aom_free(thread_data->td->vt64x64);
+
+ aom_free(thread_data->td->inter_modes_info);
+ for (int x = 0; x < 2; x++) {
+ for (int y = 0; y < 2; y++) {
+ aom_free(thread_data->td->hash_value_buffer[x][y]);
+ thread_data->td->hash_value_buffer[x][y] = NULL;
+ }
+ }
+ aom_free(thread_data->td->mask_buf);
+ aom_free(thread_data->td->counts);
+ av1_free_pc_tree(cpi, thread_data->td, num_planes,
+ cm->seq_params.sb_size);
+ aom_free(thread_data->td->mbmi_ext);
+ aom_free(thread_data->td);
+ }
+ }
+#if CONFIG_MULTITHREAD
+ if (cpi->row_mt_mutex_ != NULL) {
+ pthread_mutex_destroy(cpi->row_mt_mutex_);
+ aom_free(cpi->row_mt_mutex_);
+ }
+#endif
+ av1_row_mt_mem_dealloc(cpi);
+ aom_free(cpi->tile_thr_data);
+ aom_free(cpi->workers);
+
+ if (cpi->num_workers > 1) {
+ av1_loop_filter_dealloc(&cpi->lf_row_sync);
+ av1_loop_restoration_dealloc(&cpi->lr_row_sync, cpi->num_workers);
+ }
+
+ dealloc_compressor_data(cpi);
+
+#if CONFIG_INTERNAL_STATS
+ aom_free(cpi->ssim_vars);
+ cpi->ssim_vars = NULL;
+#endif  // CONFIG_INTERNAL_STATS
+
+ av1_remove_common(cm);
+#if CONFIG_HTB_TRELLIS
+ if (cpi->sf.use_hash_based_trellis) hbt_destroy();
+#endif  // CONFIG_HTB_TRELLIS
+ av1_free_ref_frame_buffers(cm->buffer_pool);
+
+ aom_free(cpi);
+
+#ifdef OUTPUT_YUV_SKINMAP
+ fclose(yuv_skinmap_file);
+#endif
+#ifdef OUTPUT_YUV_REC
+ fclose(yuv_rec_file);
+#endif
+}
+
+static void generate_psnr_packet(AV1_COMP *cpi) {
+ struct aom_codec_cx_pkt pkt;
+ int i;
+ PSNR_STATS psnr;
+#if CONFIG_AV1_HIGHBITDEPTH
+ const uint32_t in_bit_depth = cpi->oxcf.input_bit_depth;
+ const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+ aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr,
+ bit_depth, in_bit_depth);
+#else
+ aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+#endif
+
+ for (i = 0; i < 4; ++i) {
+ pkt.data.psnr.samples[i] = psnr.samples[i];
+ pkt.data.psnr.sse[i] = psnr.sse[i];
+ pkt.data.psnr.psnr[i] = psnr.psnr[i];
+ }
+ pkt.kind = AOM_CODEC_PSNR_PKT;
+ aom_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+}
+
+int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags) {
+ if (ref_frame_flags > ((1 << INTER_REFS_PER_FRAME) - 1)) return -1;
+
+ *ext_ref_frame_flags = ref_frame_flags;
+ return 0;
+}
+
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx);
+ if (cfg) {
+ aom_yv12_copy_frame(cfg, sd, num_planes);
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx);
+ if (cfg) {
+ aom_yv12_copy_frame(sd, cfg, num_planes);
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int av1_update_entropy(bool *ext_refresh_frame_context,
+ bool *ext_refresh_frame_context_pending, bool update) {
+ *ext_refresh_frame_context = update;
+ *ext_refresh_frame_context_pending = 1;
+ return 0;
+}
+
+#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
+// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
+// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
+// not denoise the UV channels at this time. If ever we implement UV channel
+// denoising we will have to modify this.
+void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
+ uint8_t *src = s->y_buffer;
+ int h = s->y_height;
+
+ do {
+ fwrite(src, s->y_width, 1, f);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, f);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, f);
+ src += s->uv_stride;
+ } while (--h);
+}
+#endif
+
+#ifdef OUTPUT_YUV_REC
+void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
+ uint8_t *src = s->y_buffer;
+ int h = cm->height;
+ if (yuv_rec_file == NULL) return;
+ if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
+
+ do {
+ fwrite(src16, s->y_width, 2, yuv_rec_file);
+ src16 += s->y_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->u_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->v_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);
+ return;
+ }
+
+ do {
+ fwrite(src, s->y_width, 1, yuv_rec_file);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);
+}
+#endif  // OUTPUT_YUV_REC
+
+#define GM_RECODE_LOOP_NUM4X4_FACTOR 192
+static int recode_loop_test_global_motion(
+ WarpedMotionParams *const global_motion,
+ const int *const global_motion_used, int *const gm_params_cost) {
+ int i;
+ int recode = 0;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ if (global_motion[i].wmtype != IDENTITY &&
+ global_motion_used[i] * GM_RECODE_LOOP_NUM4X4_FACTOR <
+ gm_params_cost[i]) {
+ global_motion[i] = default_warp_params;
+ assert(global_motion[i].wmtype == IDENTITY);
+ gm_params_cost[i] = 0;
+ recode = 1;
+ // TODO(sarahparker): The earlier condition for recoding here was:
+ //
"recode |= (rdc->global_motion_used[i] > 0);". Can we bring something + // similar to that back to speed up global motion? + } + } + return recode; +} + +// Function to test for conditions that indicate we should loop +// back and recode a frame. +static int recode_loop_test(AV1_COMP *cpi, int high_limit, int low_limit, int q, + int maxq, int minq) { + const RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi); + int force_recode = 0; + + if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE) || + (frame_is_kfgfarf && + (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE_KFARFGF))) { + // TODO(agrange) high_limit could be greater than the scale-down threshold. + if ((rc->projected_frame_size > high_limit && q < maxq) || + (rc->projected_frame_size < low_limit && q > minq)) { + force_recode = 1; + } else if (cpi->oxcf.rc_mode == AOM_CQ) { + // Deal with frame undershoot and whether or not we are + // below the automatically set cq level. + if (q > oxcf->cq_level && + rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) { + force_recode = 1; + } + } + } + return force_recode; +} + +static void scale_references(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MV_REFERENCE_FRAME ref_frame; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1). + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { + BufferPool *const pool = cm->buffer_pool; + const YV12_BUFFER_CONFIG *const ref = + get_ref_frame_yv12_buf(cm, ref_frame); + + if (ref == NULL) { + cpi->scaled_ref_buf[ref_frame - 1] = NULL; + continue; + } + + if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { + // Replace the reference buffer with a copy having a thicker border, + // if the reference buffer is higher resolution than the current + // frame, and the border is thin. + if ((ref->y_crop_width > cm->width || + ref->y_crop_height > cm->height) && + ref->border < AOM_BORDER_IN_PIXELS) { + RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame); + if (aom_yv12_realloc_with_new_border( + &ref_fb->buf, AOM_BORDER_IN_PIXELS, + cm->features.byte_alignment, num_planes) != 0) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + } + int force_scaling = 0; + RefCntBuffer *new_fb = cpi->scaled_ref_buf[ref_frame - 1]; + if (new_fb == NULL) { + const int new_fb_idx = get_free_fb(cm); + if (new_fb_idx == INVALID_IDX) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Unable to find free frame buffer"); + } + force_scaling = 1; + new_fb = &pool->frame_bufs[new_fb_idx]; + } + + if (force_scaling || new_fb->buf.y_crop_width != cm->width || + new_fb->buf.y_crop_height != cm->height) { + if (aom_realloc_frame_buffer( + &new_fb->buf, cm->width, cm->height, + cm->seq_params.subsampling_x, cm->seq_params.subsampling_y, + cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->features.byte_alignment, NULL, NULL, NULL)) { + if (force_scaling) { + // Release the reference acquired in the get_free_fb() call above. 
+ --new_fb->ref_count; + } + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + av1_resize_and_extend_frame( + ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes); + cpi->scaled_ref_buf[ref_frame - 1] = new_fb; + alloc_frame_mvs(cm, new_fb); + } + } else { + RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame); + buf->buf.y_crop_width = ref->y_crop_width; + buf->buf.y_crop_height = ref->y_crop_height; + cpi->scaled_ref_buf[ref_frame - 1] = buf; + ++buf->ref_count; + } + } else { + if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL; + } + } +} + +static void release_scaled_references(AV1_COMP *cpi) { + // TODO(isbs): only refresh the necessary frames, rather than all of them + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + RefCntBuffer *const buf = cpi->scaled_ref_buf[i]; + if (buf != NULL) { + --buf->ref_count; + cpi->scaled_ref_buf[i] = NULL; + } + } +} + +static void set_mv_search_params(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params; + const int max_mv_def = AOMMAX(cm->width, cm->height); + + // Default based on max resolution. + mv_search_params->mv_step_param = av1_init_search_range(max_mv_def); + + if (cpi->sf.mv_sf.auto_mv_step_size) { + if (frame_is_intra_only(cm)) { + // Initialize max_mv_magnitude for use in the first INTER frame + // after a key/intra-only frame. + mv_search_params->max_mv_magnitude = max_mv_def; + } else { + // Use cpi->max_mv_magnitude == -1 to exclude first pass case. + if (cm->show_frame && mv_search_params->max_mv_magnitude != -1) { + // Allow mv_steps to correspond to twice the max mv magnitude found + // in the previous frame, capped by the default max_mv_magnitude based + // on resolution. + mv_search_params->mv_step_param = av1_init_search_range( + AOMMIN(max_mv_def, 2 * mv_search_params->max_mv_magnitude)); + } + mv_search_params->max_mv_magnitude = -1; + } + } +} + +void av1_set_screen_content_options(const AV1_COMP *cpi, + FeatureFlags *features) { + const AV1_COMMON *const cm = &cpi->common; + + if (cm->seq_params.force_screen_content_tools != 2) { + features->allow_screen_content_tools = features->allow_intrabc = + cm->seq_params.force_screen_content_tools; + return; + } + + if (cpi->oxcf.mode == REALTIME) { + assert(cm->seq_params.reduced_still_picture_hdr); + features->allow_screen_content_tools = features->allow_intrabc = 0; + return; + } + + if (cpi->oxcf.content == AOM_CONTENT_SCREEN) { + features->allow_screen_content_tools = features->allow_intrabc = 1; + return; + } + + // Estimate if the source frame is screen content, based on the portion of + // blocks that have few luma colors. + const uint8_t *src = cpi->unfiltered_source->y_buffer; + assert(src != NULL); + const int use_hbd = cpi->unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH; + const int stride = cpi->unfiltered_source->y_stride; + const int width = cpi->unfiltered_source->y_width; + const int height = cpi->unfiltered_source->y_height; + const int bd = cm->seq_params.bit_depth; + const int blk_w = 16; + const int blk_h = 16; + // These threshold values are selected experimentally. + const int color_thresh = 4; + const unsigned int var_thresh = 0; + // Counts of blocks with no more than color_thresh colors. + int counts_1 = 0; + // Counts of blocks with no more than color_thresh colors and variance larger + // than var_thresh. 
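+  // (With var_thresh == 0 this counts low-color blocks that are not
+  // completely flat, e.g. anti-aliased text on a solid background.)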
+  int counts_2 = 0;
+
+  for (int r = 0; r + blk_h <= height; r += blk_h) {
+    for (int c = 0; c + blk_w <= width; c += blk_w) {
+      int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
+      const uint8_t *const this_src = src + r * stride + c;
+      const int n_colors =
+          use_hbd ? av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd,
+                                            count_buf)
+                  : av1_count_colors(this_src, stride, blk_w, blk_h, count_buf);
+      if (n_colors > 1 && n_colors <= color_thresh) {
+        ++counts_1;
+        struct buf_2d buf;
+        buf.stride = stride;
+        buf.buf = (uint8_t *)this_src;
+        const unsigned int var =
+            use_hbd
+                ? av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_16X16, bd)
+                : av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_16X16);
+        if (var > var_thresh) ++counts_2;
+      }
+    }
+  }
+
+  // The threshold values are selected experimentally.
+  features->allow_screen_content_tools =
+      counts_1 * blk_h * blk_w * 10 > width * height;
+  // IntraBC would force loop filters off, so we use more strict rules that
+  // also require that the block has high variance.
+  features->allow_intrabc = features->allow_screen_content_tools &&
+                            counts_2 * blk_h * blk_w * 12 > width * height;
+}
+
+static void set_size_independent_vars(AV1_COMP *cpi) {
+  int i;
+  AV1_COMMON *const cm = &cpi->common;
+  for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+    cm->global_motion[i] = default_warp_params;
+  }
+  cpi->gm_info.search_done = 0;
+
+  av1_set_speed_features_framesize_independent(cpi, cpi->speed);
+  av1_set_rd_speed_thresholds(cpi);
+  cm->features.interp_filter = SWITCHABLE;
+  cm->features.switchable_motion_mode = 1;
+}
+
+#if !CONFIG_REALTIME_ONLY
+double av1_get_gfu_boost_projection_factor(double min_factor, double max_factor,
+                                           int frame_count) {
+  double factor = sqrt((double)frame_count);
+  factor = AOMMIN(factor, max_factor);
+  factor = AOMMAX(factor, min_factor);
+  factor = (200.0 + 10.0 * factor);
+  return factor;
+}
+
+static int get_gfu_boost_from_r0_lap(double min_factor, double max_factor,
+                                     double r0, int frames_to_key) {
+  double factor = av1_get_gfu_boost_projection_factor(min_factor, max_factor,
+                                                      frames_to_key);
+  const int boost = (int)rint(factor / r0);
+  return boost;
+}
+
+double av1_get_kf_boost_projection_factor(int frame_count) {
+  double factor = sqrt((double)frame_count);
+  factor = AOMMIN(factor, 10.0);
+  factor = AOMMAX(factor, 4.0);
+  factor = (75.0 + 14.0 * factor);
+  return factor;
+}
+
+static int get_kf_boost_from_r0(double r0, int frames_to_key) {
+  double factor = av1_get_kf_boost_projection_factor(frames_to_key);
+  const int boost = (int)rint(factor / r0);
+  return boost;
+}
+#endif
+
+#define MIN_BOOST_COMBINE_FACTOR 4.0
+#define MAX_BOOST_COMBINE_FACTOR 12.0
+int combine_prior_with_tpl_boost(double min_factor, double max_factor,
+                                 int prior_boost, int tpl_boost,
+                                 int frames_to_key) {
+  double factor = sqrt((double)frames_to_key);
+  double range = max_factor - min_factor;
+  factor = AOMMIN(factor, max_factor);
+  factor = AOMMAX(factor, min_factor);
+  factor -= min_factor;
+  int boost =
+      (int)((factor * prior_boost + (range - factor) * tpl_boost) / range);
+  return boost;
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void process_tpl_stats_frame(AV1_COMP *cpi) {
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  AV1_COMMON *const cm = &cpi->common;
+
+  assert(IMPLIES(gf_group->size > 0, gf_group->index < gf_group->size));
+
+  const int tpl_idx = gf_group->index;
+  TplParams *const tpl_data = &cpi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+  TplDepStats *tpl_stats =
tpl_frame->tpl_stats_ptr; + + if (tpl_frame->is_valid) { + int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t mc_dep_cost_base = 0; + int64_t mc_saved_base = 0; + int64_t mc_count_base = 0; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + for (int row = 0; row < cm->mi_params.mi_rows; row += step) { + for (int col = 0; col < mi_cols_sr; col += step) { + TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + intra_cost_base += (this_stats->recrf_dist << RDDIV_BITS); + mc_dep_cost_base += + (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta; + mc_count_base += this_stats->mc_count; + mc_saved_base += this_stats->mc_saved; + } + } + + if (mc_dep_cost_base == 0) { + tpl_frame->is_valid = 0; + } else { + aom_clear_system_state(); + cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base; + if (is_frame_arf_and_tpl_eligible(gf_group)) { + cpi->rd.arf_r0 = cpi->rd.r0; + if (cpi->lap_enabled) { + double min_boost_factor = sqrt(cpi->rc.baseline_gf_interval); + const int gfu_boost = get_gfu_boost_from_r0_lap( + min_boost_factor, MAX_GFUBOOST_FACTOR, cpi->rd.arf_r0, + cpi->rc.num_stats_required_for_gfu_boost); + // printf("old boost %d new boost %d\n", cpi->rc.gfu_boost, + // gfu_boost); + cpi->rc.gfu_boost = combine_prior_with_tpl_boost( + min_boost_factor, MAX_BOOST_COMBINE_FACTOR, cpi->rc.gfu_boost, + gfu_boost, cpi->rc.num_stats_used_for_gfu_boost); + } else { + const int gfu_boost = (int)(200.0 / cpi->rd.r0); + cpi->rc.gfu_boost = combine_prior_with_tpl_boost( + MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR, + cpi->rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key); + } + } else if (frame_is_intra_only(cm)) { + // TODO(debargha): Turn off q adjustment for kf temporarily to + // reduce impact on speed of encoding. Need to investigate how + // to mitigate the issue. + if (cpi->oxcf.rc_mode == AOM_Q) { + const int kf_boost = + get_kf_boost_from_r0(cpi->rd.r0, cpi->rc.frames_to_key); + if (cpi->lap_enabled) { + cpi->rc.kf_boost = combine_prior_with_tpl_boost( + MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR, + cpi->rc.kf_boost, kf_boost, + cpi->rc.num_stats_used_for_kf_boost); + } else { + cpi->rc.kf_boost = combine_prior_with_tpl_boost( + MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR, + cpi->rc.kf_boost, kf_boost, cpi->rc.frames_to_key); + } + } + } + cpi->rd.mc_count_base = (double)mc_count_base / + (cm->mi_params.mi_rows * cm->mi_params.mi_cols); + cpi->rd.mc_saved_base = (double)mc_saved_base / + (cm->mi_params.mi_rows * cm->mi_params.mi_cols); + aom_clear_system_state(); + } + } +} +#endif // !CONFIG_REALTIME_ONLY + +static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, + int *top_index) { + AV1_COMMON *const cm = &cpi->common; + + // Setup variables that depend on the dimensions of the frame. + av1_set_speed_features_framesize_dependent(cpi, cpi->speed); + +#if !CONFIG_REALTIME_ONLY + if (cpi->oxcf.enable_tpl_model && is_frame_tpl_eligible(cpi)) { + process_tpl_stats_frame(cpi); + av1_tpl_rdmult_setup(cpi); + } +#endif + + // Decide q and q bounds. + *q = av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height, + cpi->gf_group.index, bottom_index, top_index); + + // Configure experimental use of segmentation for enhanced coding of + // static regions if indicated. 
+ // Only allowed in the second pass of a two pass encode, as it requires + // lagged coding, and if the relevant speed feature flag is set. + if (is_stat_consumption_stage_twopass(cpi) && + cpi->sf.hl_sf.static_segmentation) + configure_static_seg_features(cpi); +} + +static void init_motion_estimation(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params; + const int y_stride = cpi->scaled_source.y_stride; + const int y_stride_src = + ((cpi->oxcf.width != cm->width || cpi->oxcf.height != cm->height) || + av1_superres_scaled(cm)) + ? y_stride + : cpi->lookahead->buf->img.y_stride; + int fpf_y_stride = cm->cur_frame != NULL ? cm->cur_frame->buf.y_stride + : cpi->scaled_source.y_stride; + + // Update if ss_cfg is uninitialized or the current frame has a new stride + const int should_update = + !mv_search_params->ss_cfg[SS_CFG_SRC].stride || + !mv_search_params->ss_cfg[SS_CFG_LOOKAHEAD].stride || + (y_stride != mv_search_params->ss_cfg[SS_CFG_SRC].stride); + + if (!should_update) { + return; + } + + if (cpi->sf.mv_sf.search_method == DIAMOND) { + av1_init_dsmotion_compensation(&mv_search_params->ss_cfg[SS_CFG_SRC], + y_stride); + av1_init_dsmotion_compensation(&mv_search_params->ss_cfg[SS_CFG_LOOKAHEAD], + y_stride_src); + } else { + av1_init3smotion_compensation(&mv_search_params->ss_cfg[SS_CFG_SRC], + y_stride); + av1_init3smotion_compensation(&mv_search_params->ss_cfg[SS_CFG_LOOKAHEAD], + y_stride_src); + } + av1_init_motion_fpf(&mv_search_params->ss_cfg[SS_CFG_FPF], fpf_y_stride); +} + +#define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0 +static void set_restoration_unit_size(int width, int height, int sx, int sy, + RestorationInfo *rst) { + (void)width; + (void)height; + (void)sx; + (void)sy; +#if COUPLED_CHROMA_FROM_LUMA_RESTORATION + int s = AOMMIN(sx, sy); +#else + int s = 0; +#endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION + + if (width * height > 352 * 288) + rst[0].restoration_unit_size = RESTORATION_UNITSIZE_MAX; + else + rst[0].restoration_unit_size = (RESTORATION_UNITSIZE_MAX >> 1); + rst[1].restoration_unit_size = rst[0].restoration_unit_size >> s; + rst[2].restoration_unit_size = rst[1].restoration_unit_size; +} + +static void init_ref_frame_bufs(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + int i; + BufferPool *const pool = cm->buffer_pool; + cm->cur_frame = NULL; + for (i = 0; i < REF_FRAMES; ++i) { + cm->ref_frame_map[i] = NULL; + } + for (i = 0; i < FRAME_BUFFERS; ++i) { + pool->frame_bufs[i].ref_count = 0; + } +} + +void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, + int subsampling_x, int subsampling_y) { + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = &cm->seq_params; + + if (!cpi->initial_width || seq_params->use_highbitdepth != use_highbitdepth || + seq_params->subsampling_x != subsampling_x || + seq_params->subsampling_y != subsampling_y) { + seq_params->subsampling_x = subsampling_x; + seq_params->subsampling_y = subsampling_y; + seq_params->use_highbitdepth = use_highbitdepth; + + av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed); + av1_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed); + + if (!is_stat_generation_stage(cpi)) { + alloc_altref_frame_buffer(cpi); + alloc_util_frame_buffers(cpi); + } + init_ref_frame_bufs(cpi); + + init_motion_estimation(cpi); // TODO(agrange) This can be removed. 
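+    // (Note: the fields below cache the dimensions the compressor was first
+    // configured with; av1_set_size_literal() validates later resolution
+    // changes against them and reallocates when a larger size is requested.)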
+ + cpi->initial_width = cm->width; + cpi->initial_height = cm->height; + cpi->initial_mbs = cm->mi_params.MBs; + } +} + +// Returns 1 if the assigned width or height was <= 0. +int av1_set_size_literal(AV1_COMP *cpi, int width, int height) { + AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + av1_check_initial_width(cpi, cm->seq_params.use_highbitdepth, + cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y); + + if (width <= 0 || height <= 0) return 1; + + cm->width = width; + cm->height = height; + + if (cpi->initial_width && cpi->initial_height && + (cm->width > cpi->initial_width || cm->height > cpi->initial_height)) { + av1_free_context_buffers(cm); + av1_free_pc_tree(cpi, &cpi->td, num_planes, cm->seq_params.sb_size); + alloc_compressor_data(cpi); + realloc_segmentation_maps(cpi); + cpi->initial_width = cpi->initial_height = 0; + } + update_frame_size(cpi); + + return 0; +} + +void av1_set_frame_size(AV1_COMP *cpi, int width, int height) { + AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + int ref_frame; + + if (width != cm->width || height != cm->height) { + // There has been a change in the encoded frame size + av1_set_size_literal(cpi, width, height); + // Recalculate 'all_lossless' in case super-resolution was (un)selected. + cm->features.all_lossless = + cm->features.coded_lossless && !av1_superres_scaled(cm); + } + set_mv_search_params(cpi); + + if (is_stat_consumption_stage(cpi)) { + av1_set_target_rate(cpi, cm->width, cm->height); + } + + alloc_frame_mvs(cm, cm->cur_frame); + + // Allocate above context buffers + CommonContexts *const above_contexts = &cm->above_contexts; + if (above_contexts->num_planes < av1_num_planes(cm) || + above_contexts->num_mi_cols < cm->mi_params.mi_cols || + above_contexts->num_tile_rows < cm->tiles.rows) { + av1_free_above_context_buffers(above_contexts); + if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows, + cm->mi_params.mi_cols, + av1_num_planes(cm))) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + + // Reset the frame pointers to the current frame size. 
+  if (aom_realloc_frame_buffer(
+          &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+          NULL))
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate frame buffer");
+
+  const int frame_width = cm->superres_upscaled_width;
+  const int frame_height = cm->superres_upscaled_height;
+  set_restoration_unit_size(frame_width, frame_height,
+                            seq_params->subsampling_x,
+                            seq_params->subsampling_y, cm->rst_info);
+  for (int i = 0; i < num_planes; ++i)
+    cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
+
+  av1_alloc_restoration_buffers(cm);
+  if (!is_stat_generation_stage(cpi)) alloc_util_frame_buffers(cpi);
+  init_motion_estimation(cpi);
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+    if (buf != NULL) {
+      struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame);
+      av1_setup_scale_factors_for_frame(sf, buf->buf.y_crop_width,
+                                        buf->buf.y_crop_height, cm->width,
+                                        cm->height);
+      if (av1_is_scaled(sf)) aom_extend_frame_borders(&buf->buf, num_planes);
+    }
+  }
+
+  av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height,
+                                    cm->width, cm->height);
+
+  set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+}
+
+static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) {
+  // Choose an arbitrary random number
+  static unsigned int seed = 56789;
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
+  uint8_t new_denom = SCALE_NUMERATOR;
+
+  if (cpi->common.seq_params.reduced_still_picture_hdr) return SCALE_NUMERATOR;
+  switch (oxcf->resize_mode) {
+    case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break;
+    case RESIZE_FIXED:
+      if (cpi->common.current_frame.frame_type == KEY_FRAME)
+        new_denom = oxcf->resize_kf_scale_denominator;
+      else
+        new_denom = oxcf->resize_scale_denominator;
+      break;
+    case RESIZE_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
+    default: assert(0);
+  }
+  return new_denom;
+}
+
+#if CONFIG_SUPERRES_IN_RECODE
+static int superres_in_recode_allowed(const AV1_COMP *const cpi) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  // Empirically found to not be beneficial for AOM_Q mode and image coding.
+  return oxcf->superres_mode == SUPERRES_AUTO &&
+         (oxcf->rc_mode == AOM_VBR || oxcf->rc_mode == AOM_CQ) &&
+         cpi->rc.frames_to_key > 1;
+}
+#endif  // CONFIG_SUPERRES_IN_RECODE
+
+#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO 0.012
+#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME 0.008
+#define SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME 0.008
+#define SUPERRES_ENERGY_BY_AC_THRESH 0.2
+
+static double get_energy_by_q2_thresh(const GF_GROUP *gf_group,
+                                      const RATE_CONTROL *rc) {
+  // TODO(now): Return keyframe thresh * factor based on frame type / pyramid
+  // level.
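+  // (These thresholds are compared against horizontal-frequency energy
+  // scaled by q^2 in get_superres_denom_from_qindex_energy(); a key frame
+  // that forms a group of its own (frames_to_key <= 1) gets the more
+  // permissive _SOLO value.)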
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + return SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME; + } else if (gf_group->update_type[gf_group->index] == KF_UPDATE) { + if (rc->frames_to_key <= 1) + return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO; + else + return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME; + } else { + assert(0); + } + return 0; +} + +static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy, + double threshq, + double threshp) { + const double q = av1_convert_qindex_to_q(qindex, AOM_BITS_8); + const double tq = threshq * q * q; + const double tp = threshp * energy[1]; + const double thresh = AOMMIN(tq, tp); + int k; + for (k = SCALE_NUMERATOR * 2; k > SCALE_NUMERATOR; --k) { + if (energy[k - 1] > thresh) break; + } + return 3 * SCALE_NUMERATOR - k; +} + +static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex, + int sr_kf, int sr_arf) { + // Use superres for Key-frames and Alt-ref frames only. + const GF_GROUP *gf_group = &cpi->gf_group; + if (gf_group->update_type[gf_group->index] != KF_UPDATE && + gf_group->update_type[gf_group->index] != ARF_UPDATE) { + return SCALE_NUMERATOR; + } + if (gf_group->update_type[gf_group->index] == KF_UPDATE && !sr_kf) { + return SCALE_NUMERATOR; + } + if (gf_group->update_type[gf_group->index] == ARF_UPDATE && !sr_arf) { + return SCALE_NUMERATOR; + } + + double energy[16]; + analyze_hor_freq(cpi, energy); + + const double energy_by_q2_thresh = + get_energy_by_q2_thresh(gf_group, &cpi->rc); + int denom = get_superres_denom_from_qindex_energy( + qindex, energy, energy_by_q2_thresh, SUPERRES_ENERGY_BY_AC_THRESH); + /* + printf("\nenergy = ["); + for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]); + printf("]\n"); + printf("boost = %d\n", + (gf_group->update_type[gf_group->index] == KF_UPDATE) + ? cpi->rc.kf_boost + : cpi->rc.gfu_boost); + printf("denom = %d\n", denom); + */ +#if CONFIG_SUPERRES_IN_RECODE + if (superres_in_recode_allowed(cpi)) { + assert(cpi->superres_mode != SUPERRES_NONE); + // Force superres to be tried in the recode loop, as full-res is also going + // to be tried anyway. + denom = AOMMAX(denom, SCALE_NUMERATOR + 1); + } +#endif // CONFIG_SUPERRES_IN_RECODE + return denom; +} + +// If true, SUPERRES_AUTO mode will exhaustively search over all superres +// denominators for all frames (except overlay and internal overlay frames). +#define SUPERRES_RECODE_ALL_RATIOS 0 + +static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { + // Choose an arbitrary random number + static unsigned int seed = 34567; + const AV1EncoderConfig *oxcf = &cpi->oxcf; + if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR; + uint8_t new_denom = SCALE_NUMERATOR; + + // Make sure that superres mode of the frame is consistent with the + // sequence-level flag. + assert(IMPLIES(oxcf->superres_mode != SUPERRES_NONE, + cpi->common.seq_params.enable_superres)); + assert(IMPLIES(!cpi->common.seq_params.enable_superres, + oxcf->superres_mode == SUPERRES_NONE)); + // Make sure that superres mode for current encoding is consistent with user + // provided superres mode. + assert(IMPLIES(oxcf->superres_mode != SUPERRES_AUTO, + cpi->superres_mode == oxcf->superres_mode)); + + // Note: we must look at the current superres_mode to be tried in 'cpi' here, + // not the user given mode in 'oxcf'. 
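+  // (Denominators run from SCALE_NUMERATOR (8, i.e. unscaled) up to 16 (a 2x
+  // horizontal downscale): coded_width = width * SCALE_NUMERATOR / denom, so
+  // for example denom == 12 codes a 1920-wide frame at 1280.)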
+ switch (cpi->superres_mode) { + case SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break; + case SUPERRES_FIXED: + if (cpi->common.current_frame.frame_type == KEY_FRAME) + new_denom = oxcf->superres_kf_scale_denominator; + else + new_denom = oxcf->superres_scale_denominator; + break; + case SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break; + case SUPERRES_QTHRESH: { + // Do not use superres when screen content tools are used. + if (cpi->common.features.allow_screen_content_tools) break; + if (oxcf->rc_mode == AOM_VBR || oxcf->rc_mode == AOM_CQ) + av1_set_target_rate(cpi, cpi->oxcf.width, cpi->oxcf.height); + + // Now decide the use of superres based on 'q'. + int bottom_index, top_index; + const int q = av1_rc_pick_q_and_bounds( + cpi, &cpi->rc, cpi->oxcf.width, cpi->oxcf.height, cpi->gf_group.index, + &bottom_index, &top_index); + + const int qthresh = (frame_is_intra_only(&cpi->common)) + ? oxcf->superres_kf_qthresh + : oxcf->superres_qthresh; + if (q <= qthresh) { + new_denom = SCALE_NUMERATOR; + } else { + new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1); + } + break; + } + case SUPERRES_AUTO: { + // Do not use superres when screen content tools are used. + if (cpi->common.features.allow_screen_content_tools) break; + if (oxcf->rc_mode == AOM_VBR || oxcf->rc_mode == AOM_CQ) + av1_set_target_rate(cpi, cpi->oxcf.width, cpi->oxcf.height); + + // Now decide the use of superres based on 'q'. + int bottom_index, top_index; + const int q = av1_rc_pick_q_and_bounds( + cpi, &cpi->rc, cpi->oxcf.width, cpi->oxcf.height, cpi->gf_group.index, + &bottom_index, &top_index); + + const int qthresh = 128; + if (q <= qthresh) { + new_denom = SCALE_NUMERATOR; + } else { +#if SUPERRES_RECODE_ALL_RATIOS + if (cpi->common.current_frame.frame_type == KEY_FRAME) + new_denom = oxcf->superres_kf_scale_denominator; + else + new_denom = oxcf->superres_scale_denominator; +#else + new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1); +#endif // SUPERRES_RECODE_ALL_RATIOS + } + break; + } + default: assert(0); + } + return new_denom; +} + +static int dimension_is_ok(int orig_dim, int resized_dim, int denom) { + return (resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2); +} + +static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) { + // Only need to check the width, as scaling is horizontal only. + (void)oheight; + return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom); +} + +static int validate_size_scales(RESIZE_MODE resize_mode, + SUPERRES_MODE superres_mode, int owidth, + int oheight, size_params_type *rsz) { + if (dimensions_are_ok(owidth, oheight, rsz)) { // Nothing to do. + return 1; + } + + // Calculate current resize scale. + int resize_denom = + AOMMAX(DIVIDE_AND_ROUND(owidth * SCALE_NUMERATOR, rsz->resize_width), + DIVIDE_AND_ROUND(oheight * SCALE_NUMERATOR, rsz->resize_height)); + + if (resize_mode != RESIZE_RANDOM && superres_mode == SUPERRES_RANDOM) { + // Alter superres scale as needed to enforce conformity. + rsz->superres_denom = + (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / resize_denom; + if (!dimensions_are_ok(owidth, oheight, rsz)) { + if (rsz->superres_denom > SCALE_NUMERATOR) --rsz->superres_denom; + } + } else if (resize_mode == RESIZE_RANDOM && superres_mode != SUPERRES_RANDOM) { + // Alter resize scale as needed to enforce conformity. 
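+    // (Conformity is the dimension_is_ok() rule above: after resize and
+    // superres combined, the coded width must stay at least half the
+    // original, i.e. resize_width * SCALE_NUMERATOR >= owidth * denom / 2.)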
+ resize_denom = + (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / rsz->superres_denom; + rsz->resize_width = owidth; + rsz->resize_height = oheight; + av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height, + resize_denom); + if (!dimensions_are_ok(owidth, oheight, rsz)) { + if (resize_denom > SCALE_NUMERATOR) { + --resize_denom; + rsz->resize_width = owidth; + rsz->resize_height = oheight; + av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height, + resize_denom); + } + } + } else if (resize_mode == RESIZE_RANDOM && superres_mode == SUPERRES_RANDOM) { + // Alter both resize and superres scales as needed to enforce conformity. + do { + if (resize_denom > rsz->superres_denom) + --resize_denom; + else + --rsz->superres_denom; + rsz->resize_width = owidth; + rsz->resize_height = oheight; + av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height, + resize_denom); + } while (!dimensions_are_ok(owidth, oheight, rsz) && + (resize_denom > SCALE_NUMERATOR || + rsz->superres_denom > SCALE_NUMERATOR)); + } else { // We are allowed to alter neither resize scale nor superres + // scale. + return 0; + } + return dimensions_are_ok(owidth, oheight, rsz); +} + +// Calculates resize and superres params for next frame +static size_params_type calculate_next_size_params(AV1_COMP *cpi) { + const AV1EncoderConfig *oxcf = &cpi->oxcf; + ResizePendingParams *resize_pending_params = &cpi->resize_pending_params; + size_params_type rsz = { oxcf->width, oxcf->height, SCALE_NUMERATOR }; + int resize_denom = SCALE_NUMERATOR; + if (has_no_stats_stage(cpi) && cpi->use_svc && + cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) { + rsz.resize_width = cpi->common.width; + rsz.resize_height = cpi->common.height; + return rsz; + } + if (is_stat_generation_stage(cpi)) return rsz; + if (resize_pending_params->width && resize_pending_params->height) { + rsz.resize_width = resize_pending_params->width; + rsz.resize_height = resize_pending_params->height; + resize_pending_params->width = resize_pending_params->height = 0; + } else { + resize_denom = calculate_next_resize_scale(cpi); + rsz.resize_width = oxcf->width; + rsz.resize_height = oxcf->height; + av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height, + resize_denom); + } + rsz.superres_denom = calculate_next_superres_scale(cpi); + if (!validate_size_scales(oxcf->resize_mode, cpi->superres_mode, oxcf->width, + oxcf->height, &rsz)) + assert(0 && "Invalid scale parameters"); + return rsz; +} + +static void setup_frame_size_from_params(AV1_COMP *cpi, + const size_params_type *rsz) { + int encode_width = rsz->resize_width; + int encode_height = rsz->resize_height; + + AV1_COMMON *cm = &cpi->common; + cm->superres_upscaled_width = encode_width; + cm->superres_upscaled_height = encode_height; + cm->superres_scale_denominator = rsz->superres_denom; + av1_calculate_scaled_superres_size(&encode_width, &encode_height, + rsz->superres_denom); + av1_set_frame_size(cpi, encode_width, encode_height); +} + +void av1_setup_frame_size(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + // Reset superres params from previous frame. 
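+  // (Start from the no-superres default so calculate_next_size_params()
+  // works from a clean state rather than the previous frame's denominator.)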
+ cm->superres_scale_denominator = SCALE_NUMERATOR; + const size_params_type rsz = calculate_next_size_params(cpi); + setup_frame_size_from_params(cpi, &rsz); + + assert(av1_is_min_tile_width_satisfied(cm)); +} + +static void superres_post_encode(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + + if (!av1_superres_scaled(cm)) return; + + assert(cpi->oxcf.enable_superres); + assert(!is_lossless_requested(&cpi->oxcf)); + assert(!cm->features.all_lossless); + + av1_superres_upscale(cm, NULL); + + // If regular resizing is occurring the source will need to be downscaled to + // match the upscaled superres resolution. Otherwise the original source is + // used. + if (!av1_resize_scaled(cm)) { + cpi->source = cpi->unscaled_source; + if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source; + } else { + assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width); + assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height); + // Do downscale. cm->(width|height) has been updated by + // av1_superres_upscale + if (aom_realloc_frame_buffer( + &cpi->scaled_source, cm->superres_upscaled_width, + cm->superres_upscaled_height, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y, cm->seq_params.use_highbitdepth, + AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, NULL, NULL, + NULL)) + aom_internal_error( + &cm->error, AOM_CODEC_MEM_ERROR, + "Failed to reallocate scaled source buffer for superres"); + assert(cpi->scaled_source.y_crop_width == cm->superres_upscaled_width); + assert(cpi->scaled_source.y_crop_height == cm->superres_upscaled_height); + av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source, + (int)cm->seq_params.bit_depth, num_planes); + cpi->source = &cpi->scaled_source; + } +} + +static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm, + MACROBLOCKD *xd, int use_restoration, + int use_cdef) { + if (use_restoration) + av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 0); + + if (use_cdef) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, cdef_time); +#endif + // Find CDEF parameters + av1_cdef_search(&cm->cur_frame->buf, cpi->source, cm, xd, + cpi->sf.lpf_sf.cdef_pick_method, cpi->td.mb.rdmult); + + // Apply the filter + av1_cdef_frame(&cm->cur_frame->buf, cm, xd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, cdef_time); +#endif + } else { + cm->cdef_info.cdef_bits = 0; + cm->cdef_info.cdef_strengths[0] = 0; + cm->cdef_info.nb_cdef_strengths = 1; + cm->cdef_info.cdef_uv_strengths[0] = 0; + } + + superres_post_encode(cpi); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, loop_restoration_time); +#endif + if (use_restoration) { + av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1); + av1_pick_filter_restoration(cpi->source, cpi); + if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { + if (cpi->num_workers > 1) + av1_loop_restoration_filter_frame_mt(&cm->cur_frame->buf, cm, 0, + cpi->workers, cpi->num_workers, + &cpi->lr_row_sync, &cpi->lr_ctxt); + else + av1_loop_restoration_filter_frame(&cm->cur_frame->buf, cm, 0, + &cpi->lr_ctxt); + } + } else { + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, 
                    loop_restoration_time);
+#endif
+}
+
+static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+
+  assert(IMPLIES(is_lossless_requested(&cpi->oxcf),
+                 cm->features.coded_lossless && cm->features.all_lossless));
+
+  const int use_loopfilter =
+      !cm->features.coded_lossless && !cm->tiles.large_scale;
+  const int use_cdef = cm->seq_params.enable_cdef &&
+                       !cm->features.coded_lossless && !cm->tiles.large_scale;
+  const int use_restoration = cm->seq_params.enable_restoration &&
+                              !cm->features.all_lossless &&
+                              !cm->tiles.large_scale;
+
+  struct loopfilter *lf = &cm->lf;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, loop_filter_time);
+#endif
+  if (use_loopfilter) {
+    aom_clear_system_state();
+    av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_sf.lpf_pick);
+  } else {
+    lf->filter_level[0] = 0;
+    lf->filter_level[1] = 0;
+  }
+
+  if (lf->filter_level[0] || lf->filter_level[1]) {
+    if (cpi->num_workers > 1)
+      av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0,
+#if CONFIG_LPF_MASK
+                               0,
+#endif
+                               cpi->workers, cpi->num_workers,
+                               &cpi->lf_row_sync);
+    else
+      av1_loop_filter_frame(&cm->cur_frame->buf, cm, xd,
+#if CONFIG_LPF_MASK
+                            0,
+#endif
+                            0, num_planes, 0);
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, loop_filter_time);
+#endif
+
+  cdef_restoration_frame(cpi, cm, xd, use_restoration, use_cdef);
+}
+
+static void fix_interp_filter(InterpFilter *const interp_filter,
+                              const FRAME_COUNTS *const counts) {
+  if (*interp_filter == SWITCHABLE) {
+    // Check to see if only one of the filters is actually used
+    int count[SWITCHABLE_FILTERS] = { 0 };
+    int num_filters_used = 0;
+    for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+      for (int j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+        count[i] += counts->switchable_interp[j][i];
+      num_filters_used += (count[i] > 0);
+    }
+    if (num_filters_used == 1) {
+      // Only one filter is used. So set the filter at frame level
+      for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+        if (count[i]) {
+          if (i == EIGHTTAP_REGULAR) *interp_filter = i;
+          break;
+        }
+      }
+    }
+  }
+}
+
+static void finalize_encoded_frame(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+
+  if (!cm->seq_params.reduced_still_picture_hdr &&
+      encode_show_existing_frame(cm)) {
+    RefCntBuffer *const frame_to_show =
+        cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+
+    if (frame_to_show == NULL) {
+      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Buffer does not contain a reconstructed frame");
+    }
+    assert(frame_to_show->ref_count > 0);
+    assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
+  }
+
+  if (!encode_show_existing_frame(cm) &&
+      cm->seq_params.film_grain_params_present &&
+      (cm->show_frame || cm->showable_frame)) {
+    // Copy the current frame's film grain params to its corresponding
+    // RefCntBuffer slot.
+    cm->cur_frame->film_grain_params = cm->film_grain_params;
+
+    // We must update the parameters if this is not an INTER_FRAME
+    if (current_frame->frame_type != INTER_FRAME)
+      cm->cur_frame->film_grain_params.update_parameters = 1;
+
+    // Iterate the random seed for the next frame.
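+    // (The increment is odd, so repeated addition cycles through all 2^16
+    // seed values; a seed that wraps to 0 is remapped below to stay nonzero.)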
+ cm->film_grain_params.random_seed += 3381; + if (cm->film_grain_params.random_seed == 0) + cm->film_grain_params.random_seed = 7391; + } + + // Initialise all tiles' contexts from the global frame context + for (int tile_col = 0; tile_col < cm->tiles.cols; tile_col++) { + for (int tile_row = 0; tile_row < cm->tiles.rows; tile_row++) { + const int tile_idx = tile_row * cm->tiles.cols + tile_col; + cpi->tile_data[tile_idx].tctx = *cm->fc; + } + } + + fix_interp_filter(&cm->features.interp_filter, cpi->td.counts); +} + +static int get_regulated_q_overshoot(AV1_COMP *const cpi, int q_low, int q_high, + int top_index, int bottom_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + + av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); + + int q_regulated = + av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + AOMMAX(q_high, top_index), cm->width, cm->height); + + int retries = 0; + while (q_regulated < q_low && retries < 10) { + av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); + q_regulated = + av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + AOMMAX(q_high, top_index), cm->width, cm->height); + retries++; + } + return q_regulated; +} + +static int get_regulated_q_undershoot(AV1_COMP *const cpi, int q_high, + int top_index, int bottom_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + + av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); + int q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + top_index, cm->width, cm->height); + + int retries = 0; + while (q_regulated > q_high && retries < 10) { + av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); + q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + top_index, cm->width, cm->height); + retries++; + } + return q_regulated; +} + +// Called after encode_with_recode_loop() has just encoded a frame and packed +// its bitstream. This function works out whether we under- or over-shot +// our bitrate target and adjusts q as appropriate. 
Also decides whether +// or not we should do another recode loop, indicated by *loop +static void recode_loop_update_q( + AV1_COMP *const cpi, int *const loop, int *const q, int *const q_low, + int *const q_high, const int top_index, const int bottom_index, + int *const undershoot_seen, int *const overshoot_seen, + int *const low_cr_seen, const int loop_at_this_size) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + *loop = 0; + + const int min_cr = cpi->oxcf.min_cr; + if (min_cr > 0) { + aom_clear_system_state(); + const double compression_ratio = + av1_get_compression_ratio(cm, rc->projected_frame_size >> 3); + const double target_cr = min_cr / 100.0; + if (compression_ratio < target_cr) { + *low_cr_seen = 1; + if (*q < rc->worst_quality) { + const double cr_ratio = target_cr / compression_ratio; + const int projected_q = AOMMAX(*q + 1, (int)(*q * cr_ratio * cr_ratio)); + *q = AOMMIN(AOMMIN(projected_q, *q + 32), rc->worst_quality); + *q_low = AOMMAX(*q, *q_low); + *q_high = AOMMAX(*q, *q_high); + *loop = 1; + } + } + if (*low_cr_seen) return; + } + + if (cpi->oxcf.rc_mode == AOM_Q) return; + + const int last_q = *q; + int frame_over_shoot_limit = 0, frame_under_shoot_limit = 0; + av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, + &frame_under_shoot_limit, + &frame_over_shoot_limit); + if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; + + if (cm->current_frame.frame_type == KEY_FRAME && rc->this_key_frame_forced && + rc->projected_frame_size < rc->max_frame_bandwidth) { + int64_t kf_err; + const int64_t high_err_target = cpi->ambient_err; + const int64_t low_err_target = cpi->ambient_err >> 1; + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) { + kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf); + } else { + kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); + } +#else + kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); +#endif + // Prevent possible divide by zero error below for perfect KF + kf_err += !kf_err; + + // The key frame is not good enough or we can afford + // to make it better without undue risk of popping. + if ((kf_err > high_err_target && + rc->projected_frame_size <= frame_over_shoot_limit) || + (kf_err > low_err_target && + rc->projected_frame_size <= frame_under_shoot_limit)) { + // Lower q_high + *q_high = AOMMAX(*q - 1, *q_low); + + // Adjust Q + *q = (int)((*q * high_err_target) / kf_err); + *q = AOMMIN(*q, (*q_high + *q_low) >> 1); + } else if (kf_err < low_err_target && + rc->projected_frame_size >= frame_under_shoot_limit) { + // The key frame is much better than the previous frame + // Raise q_low + *q_low = AOMMIN(*q + 1, *q_high); + + // Adjust Q + *q = (int)((*q * low_err_target) / kf_err); + *q = AOMMIN(*q, (*q_high + *q_low + 1) >> 1); + } + + // Clamp Q to upper and lower limits: + *q = clamp(*q, *q_low, *q_high); + *loop = (*q != last_q); + return; + } + + if (recode_loop_test(cpi, frame_over_shoot_limit, frame_under_shoot_limit, *q, + AOMMAX(*q_high, top_index), bottom_index)) { + // Is the projected frame size out of range and are we allowed + // to attempt to recode. + + // Frame size out of permitted range: + // Update correction factor & compute new Q to try... + // Frame is too large + if (rc->projected_frame_size > rc->this_frame_target) { + // Special case if the projected size is > the max allowed. 
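+      // (q_high is scaled in the q domain by the overshoot ratio: a frame
+      // projected at, say, 2x max_frame_bandwidth roughly doubles
+      // q_val_high before it is mapped back to a q-index.)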
+      if (*q == *q_high &&
+          rc->projected_frame_size >= rc->max_frame_bandwidth) {
+        const double q_val_high_current =
+            av1_convert_qindex_to_q(*q_high, cm->seq_params.bit_depth);
+        const double q_val_high_new =
+            q_val_high_current *
+            ((double)rc->projected_frame_size / rc->max_frame_bandwidth);
+        *q_high = av1_find_qindex(q_val_high_new, cm->seq_params.bit_depth,
+                                  rc->best_quality, rc->worst_quality);
+      }
+
+      // Raise Qlow to at least the current value
+      *q_low = AOMMIN(*q + 1, *q_high);
+
+      if (*undershoot_seen || loop_at_this_size > 2 ||
+          (loop_at_this_size == 2 && !frame_is_intra_only(cm))) {
+        av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+
+        *q = (*q_high + *q_low + 1) / 2;
+      } else if (loop_at_this_size == 2 && frame_is_intra_only(cm)) {
+        const int q_mid = (*q_high + *q_low + 1) / 2;
+        const int q_regulated = get_regulated_q_overshoot(
+            cpi, *q_low, *q_high, top_index, bottom_index);
+        // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+        // transition between loop_at_this_size < 2 and loop_at_this_size > 2.
+        *q = (q_mid + q_regulated + 1) / 2;
+      } else {
+        *q = get_regulated_q_overshoot(cpi, *q_low, *q_high, top_index,
+                                       bottom_index);
+      }
+
+      *overshoot_seen = 1;
+    } else {
+      // Frame is too small
+      *q_high = AOMMAX(*q - 1, *q_low);
+
+      if (*overshoot_seen || loop_at_this_size > 2 ||
+          (loop_at_this_size == 2 && !frame_is_intra_only(cm))) {
+        av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+        *q = (*q_high + *q_low) / 2;
+      } else if (loop_at_this_size == 2 && frame_is_intra_only(cm)) {
+        const int q_mid = (*q_high + *q_low) / 2;
+        const int q_regulated =
+            get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index);
+        // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+        // transition between loop_at_this_size < 2 and loop_at_this_size > 2.
+        *q = (q_mid + q_regulated) / 2;
+
+        // Special case reset for qlow for constrained quality.
+        // This should only trigger where there is very substantial
+        // undershoot on a frame and the auto cq level is above
+        // the user passed in value.
+        if (cpi->oxcf.rc_mode == AOM_CQ && q_regulated < *q_low) {
+          *q_low = *q;
+        }
+      } else {
+        *q = get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index);
+
+        // Special case reset for qlow for constrained quality.
+        // This should only trigger where there is very substantial
+        // undershoot on a frame and the auto cq level is above
+        // the user passed in value.
+ if (cpi->oxcf.rc_mode == AOM_CQ && *q < *q_low) { + *q_low = *q; + } + } + + *undershoot_seen = 1; + } + + // Clamp Q to upper and lower limits: + *q = clamp(*q, *q_low, *q_high); + } + + *loop = (*q != last_q); +} + +static int get_interp_filter_selected(const AV1_COMMON *const cm, + MV_REFERENCE_FRAME ref, + InterpFilter ifilter) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref); + if (buf == NULL) return 0; + return buf->interp_filter_selected[ifilter]; +} + +static uint16_t setup_interp_filter_search_mask(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + int ref_total[REF_FRAMES] = { 0 }; + uint16_t mask = ALLOW_ALL_INTERP_FILT_MASK; + + if (cpi->last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame) + return mask; + + for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) { + for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP; + ++ifilter) { + ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter); + } + } + int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] + + ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] + + ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]); + + for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP; + ++ifilter) { + int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30; + if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) { + int filter_score = + get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 + + get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 + + get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 + + get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 + + get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 + + get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10; + if (filter_score < ref_total_total) { + DUAL_FILTER_TYPE filt_type = ifilter + SWITCHABLE_FILTERS * ifilter; + reset_interp_filter_allowed_mask(&mask, filt_type); + } + } + } + return mask; +} + +#if !CONFIG_REALTIME_ONLY +#define STRICT_PSNR_DIFF_THRESH 0.9 +// Encode key frame with/without screen content tools to determine whether +// screen content tools should be enabled for this key frame group or not. +// The first encoding is without screen content tools. +// The second encoding is with screen content tools. +// We compare the psnr and frame size to make the decision. +static void screen_content_tools_determination( + AV1_COMP *cpi, const int allow_screen_content_tools_orig_decision, + const int allow_intrabc_orig_decision, + const int is_screen_content_type_orig_decision, const int pass, + int *projected_size_pass, PSNR_STATS *psnr) { + AV1_COMMON *const cm = &cpi->common; + FeatureFlags *const features = &cm->features; + projected_size_pass[pass] = cpi->rc.projected_frame_size; +#if CONFIG_AV1_HIGHBITDEPTH + const uint32_t in_bit_depth = cpi->oxcf.input_bit_depth; + const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; + aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass], + bit_depth, in_bit_depth); +#else + aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass]); +#endif + if (pass != 1) return; + + const double psnr_diff = psnr[1].psnr[0] - psnr[0].psnr[0]; + const int is_sc_encoding_much_better = psnr_diff > STRICT_PSNR_DIFF_THRESH; + if (is_sc_encoding_much_better) { + // Use screen content tools, if we get coding gain. 
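+    // ("Much better" here means the screen-content pass beat the regular
+    // pass by more than STRICT_PSNR_DIFF_THRESH (0.9 dB) in overall PSNR.)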
+    features->allow_screen_content_tools = 1;
+    features->allow_intrabc = cpi->intrabc_used;
+    cpi->is_screen_content_type = 1;
+  } else {
+    // Use original screen content decision.
+    features->allow_screen_content_tools =
+        allow_screen_content_tools_orig_decision;
+    features->allow_intrabc = allow_intrabc_orig_decision;
+    cpi->is_screen_content_type = is_screen_content_type_orig_decision;
+  }
+}
+
+// Set some encoding parameters to make the encoding process fast.
+// A fixed block partition size and a large q are used.
+static void set_encoding_params_for_screen_content(AV1_COMP *cpi,
+                                                   const int pass) {
+  AV1_COMMON *const cm = &cpi->common;
+  if (pass == 0) {
+    // In the first pass, encode without screen content tools.
+    // Use a high q, and a fixed block size for fast encoding.
+    cm->features.allow_screen_content_tools = 0;
+    cm->features.allow_intrabc = 0;
+    cpi->is_screen_content_type = 0;
+    cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+    cpi->sf.part_sf.always_this_block_size = BLOCK_32X32;
+    return;
+  }
+  assert(pass == 1);
+  // In the second pass, encode with screen content tools.
+  // Use a high q, and a fixed block size for fast encoding.
+  cm->features.allow_screen_content_tools = 1;
+  // TODO(chengchen): turning intrabc on could lead to a data race issue.
+  // cm->allow_intrabc = 1;
+  cpi->is_screen_content_type = 1;
+  cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+  cpi->sf.part_sf.always_this_block_size = BLOCK_32X32;
+}
+
+// Determines whether to use screen content tools for the key frame group.
+// This function modifies "cm->features.allow_screen_content_tools",
+// "cm->features.allow_intrabc" and "cpi->is_screen_content_type".
+static void determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig) {
+  AV1_COMMON *const cm = &cpi->common;
+  // Variables to help determine if we should allow screen content tools.
+  int projected_size_pass[3] = { 0 };
+  PSNR_STATS psnr[3];
+  const int is_key_frame = cm->current_frame.frame_type == KEY_FRAME;
+  const int allow_screen_content_tools_orig_decision =
+      cm->features.allow_screen_content_tools;
+  const int allow_intrabc_orig_decision = cm->features.allow_intrabc;
+  const int is_screen_content_type_orig_decision = cpi->is_screen_content_type;
+  // Turn off the encoding trial for forward key frame and superres.
+  if (cpi->sf.rt_sf.use_nonrd_pick_mode || cpi->oxcf.fwd_kf_enabled ||
+      cpi->superres_mode != SUPERRES_NONE || cpi->oxcf.mode == REALTIME ||
+      is_screen_content_type_orig_decision || !is_key_frame) {
+    return;
+  }
+
+  // TODO(chengchen): multiple encoding passes for the lossless mode are time
+  // consuming. Find a better way to determine whether screen content tools
+  // should be used for lossless coding.
+  // Use a high q and a fixed partition to do quick encoding.
+  const int q_for_screen_content_quick_run =
+      is_lossless_requested(&cpi->oxcf) ? q_orig : AOMMAX(q_orig, 244);
+  const int partition_search_type_orig = cpi->sf.part_sf.partition_search_type;
+  const BLOCK_SIZE fixed_partition_block_size_orig =
+      cpi->sf.part_sf.always_this_block_size;
+
+  // Setup necessary params for encoding, including frame source, etc.
+ aom_clear_system_state(); + + cpi->source = + av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source); + if (cpi->unscaled_last_source != NULL) { + cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source, + &cpi->scaled_last_source); + } + + setup_frame(cpi); + + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + cm->seg.enabled = cm->prev_frame->seg.enabled; + } else { + av1_calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); + cm->cur_frame->seg.enabled = cm->seg.enabled; + + // The two encoding passes aim to help determine whether to use screen + // content tools, with a high q and fixed partition. + for (int pass = 0; pass < 2; ++pass) { + set_encoding_params_for_screen_content(cpi, pass); +#if CONFIG_TUNE_VMAF + if (cpi->oxcf.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING || + cpi->oxcf.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || + cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) { + av1_set_quantizer( + cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel, + av1_get_vmaf_base_qindex(cpi, q_for_screen_content_quick_run)); + } else { +#endif + av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel, + q_for_screen_content_quick_run); +#if CONFIG_TUNE_VMAF + } +#endif + av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed); + if (cpi->oxcf.deltaq_mode != NO_DELTA_Q) + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params.bit_depth); + + av1_set_variance_partition_thresholds(cpi, q_for_screen_content_quick_run, + 0); + // transform / motion compensation build reconstruction frame + av1_encode_frame(cpi); + // Screen content decision + screen_content_tools_determination( + cpi, allow_screen_content_tools_orig_decision, + allow_intrabc_orig_decision, is_screen_content_type_orig_decision, pass, + projected_size_pass, psnr); + } + + // Set partition speed feature back. + cpi->sf.part_sf.partition_search_type = partition_search_type_orig; + cpi->sf.part_sf.always_this_block_size = fixed_partition_block_size_orig; +} +#endif // CONFIG_REALTIME_ONLY + +static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + GlobalMotionInfo *const gm_info = &cpi->gm_info; + const int allow_recode = (cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE); + // Must allow recode if minimum compression ratio is set. + assert(IMPLIES(cpi->oxcf.min_cr > 0, allow_recode)); + + set_size_independent_vars(cpi); + if (is_stat_consumption_stage_twopass(cpi) && + cpi->sf.interp_sf.adaptive_interp_filter_search) + cpi->interp_search_flags.interp_filter_search_mask = + setup_interp_filter_search_mask(cpi); + cpi->source->buf_8bit_valid = 0; + + av1_setup_frame_size(cpi); + +#if CONFIG_SUPERRES_IN_RECODE + if (superres_in_recode_allowed(cpi) && cpi->superres_mode != SUPERRES_NONE && + cm->superres_scale_denominator == SCALE_NUMERATOR) { + // Superres mode is currently enabled, but the denominator selected will + // disable superres. So no need to continue, as we will go through another + // recode loop for full-resolution after this anyway. 
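+    // The caller (encode_with_recode_loop_and_filter) treats this -1 as
+    // "encoding intentionally skipped", not as a codec error.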
+ return -1; + } +#endif // CONFIG_SUPERRES_IN_RECODE + + int top_index = 0, bottom_index = 0; + int q = 0, q_low = 0, q_high = 0; + set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + q_low = bottom_index; + q_high = top_index; + if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) { + const int num_64x64_blocks = + (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4; + if (cpi->td.vt64x64) { + if (num_64x64_blocks != cpi->td.num_64x64_blocks) { + aom_free(cpi->td.vt64x64); + cpi->td.vt64x64 = NULL; + } + } + if (!cpi->td.vt64x64) { + CHECK_MEM_ERROR(cm, cpi->td.vt64x64, + aom_malloc(sizeof(*cpi->td.vt64x64) * num_64x64_blocks)); + cpi->td.num_64x64_blocks = num_64x64_blocks; + } + } + + if (cm->current_frame.frame_type == KEY_FRAME) { + FrameProbInfo *const frame_probs = &cpi->frame_probs; + + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { + av1_copy(frame_probs->tx_type_probs, default_tx_type_probs); + } + + if (!cpi->sf.inter_sf.disable_obmc && + cpi->sf.inter_sf.prune_obmc_prob_thresh > 0) { + av1_copy(frame_probs->obmc_probs, default_obmc_probs); + } + + if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { + av1_copy(frame_probs->warped_probs, default_warped_probs); + } + + if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { + av1_copy(frame_probs->switchable_interp_probs, + default_switchable_interp_probs); + } + } +#if !CONFIG_REALTIME_ONLY + // Determine whether to use screen content tools using two fast encoding. + determine_sc_tools_with_encoding(cpi, q); +#endif // CONFIG_REALTIME_ONLY + +#if CONFIG_COLLECT_COMPONENT_TIMING + printf("\n Encoding a frame:"); +#endif + + // Loop variables + int loop = 0; + int loop_count = 0; + int loop_at_this_size = 0; + int overshoot_seen = 0; + int undershoot_seen = 0; + int low_cr_seen = 0; + int last_loop_allow_hp = 0; + + do { + loop = 0; + aom_clear_system_state(); + + // if frame was scaled calculate global_motion_search again if already + // done + if (loop_count > 0 && cpi->source && gm_info->search_done) { + if (cpi->source->y_crop_width != cm->width || + cpi->source->y_crop_height != cm->height) { + gm_info->search_done = 0; + } + } + cpi->source = + av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source); + if (cpi->unscaled_last_source != NULL) { + cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source, + &cpi->scaled_last_source); + } + + if (!frame_is_intra_only(cm)) { + if (loop_count > 0) { + release_scaled_references(cpi); + } + scale_references(cpi); + } +#if CONFIG_TUNE_VMAF + if (cpi->oxcf.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING || + cpi->oxcf.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || + cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) { + av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel, + av1_get_vmaf_base_qindex(cpi, q)); + } else { +#endif + av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel, q); +#if CONFIG_TUNE_VMAF + } +#endif + av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed); + + if (cpi->oxcf.deltaq_mode != NO_DELTA_Q) + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params.bit_depth); + + av1_set_variance_partition_thresholds(cpi, q, 0); + + // printf("Frame %d/%d: q = %d, frame_type = %d superres_denom = %d\n", + // cm->current_frame.frame_number, cm->show_frame, q, + // cm->current_frame.frame_type, cm->superres_scale_denominator); + + if (loop_count == 0) { + setup_frame(cpi); + } else if (get_primary_ref_frame_buf(cm) == NULL) { + // Base q-index may have 
changed, so we need to assign proper default coef + // probs before every iteration. + av1_default_coef_probs(cm); + av1_setup_frame_contexts(cm); + } + + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + av1_vaq_frame_setup(cpi); + } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + av1_setup_in_frame_q_adj(cpi); + } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && !allow_recode) { + suppress_active_map(cpi); + av1_cyclic_refresh_setup(cpi); + apply_active_map(cpi); + } + + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + cm->seg.enabled = cm->prev_frame->seg.enabled; + } else { + av1_calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); + cm->cur_frame->seg.enabled = cm->seg.enabled; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_encode_frame_time); +#endif + // Set the motion vector precision based on mv stats from the last coded + // frame. + if (!frame_is_intra_only(cm)) { + av1_pick_and_set_high_precision_mv(cpi, q); + + // If the precision has changed during different iteration of the loop, + // then we need to reset the global motion vectors + if (loop_count > 0 && + cm->features.allow_high_precision_mv != last_loop_allow_hp) { + gm_info->search_done = 0; + } + last_loop_allow_hp = cm->features.allow_high_precision_mv; + } + + // transform / motion compensation build reconstruction frame + av1_encode_frame(cpi); +#if !CONFIG_REALTIME_ONLY + // Reset the mv_stats in case we are interrupted by an intraframe or an + // overlay frame. + if (cpi->mv_stats.valid) { + av1_zero(cpi->mv_stats); + } + // Gather the mv_stats for the next frame + if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA && + av1_frame_allows_smart_mv(cpi)) { + av1_collect_mv_stats(cpi, q); + } +#endif // !CONFIG_REALTIME_ONLY + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_encode_frame_time); +#endif + + aom_clear_system_state(); + + // Dummy pack of the bitstream using up to date stats to get an + // accurate estimate of output frame size to determine if we need + // to recode. + const int do_dummy_pack = + (cpi->sf.hl_sf.recode_loop >= ALLOW_RECODE_KFARFGF && + cpi->oxcf.rc_mode != AOM_Q) || + cpi->oxcf.min_cr > 0; + if (do_dummy_pack) { + finalize_encoded_frame(cpi); + int largest_tile_id = 0; // Output from bitstream: unused here + if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + rc->projected_frame_size = (int)(*size) << 3; + } + + if (allow_recode) { + // Update q and decide whether to do a recode loop + recode_loop_update_q(cpi, &loop, &q, &q_low, &q_high, top_index, + bottom_index, &undershoot_seen, &overshoot_seen, + &low_cr_seen, loop_at_this_size); + } + + // Special case for overlay frame. + if (loop && rc->is_src_frame_alt_ref && + rc->projected_frame_size < rc->max_frame_bandwidth) { + loop = 0; + } + + if (allow_recode && !cpi->sf.gm_sf.gm_disable_recode && + recode_loop_test_global_motion(cm->global_motion, + cpi->td.rd_counts.global_motion_used, + gm_info->params_cost)) { + loop = 1; + } + + if (loop) { + ++loop_count; + ++loop_at_this_size; + +#if CONFIG_INTERNAL_STATS + ++cpi->tot_recode_hits; +#endif + } +#if CONFIG_COLLECT_COMPONENT_TIMING + if (loop) printf("\n Recoding:"); +#endif + } while (loop); + + // Update some stats from cyclic refresh. 
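+  // Running this after the recode loop means the update is based on the
+  // final, accepted encode of the frame rather than a rejected trial.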
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && !frame_is_intra_only(cm)) + av1_cyclic_refresh_postencode(cpi); + + return AOM_CODEC_OK; +} + +static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size, + uint8_t *dest, int64_t *sse, + int64_t *rate, + int *largest_tile_id) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_with_recode_loop_time); +#endif + int err = encode_with_recode_loop(cpi, size, dest); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_with_recode_loop_time); +#endif + if (err != AOM_CODEC_OK) { + if (err == -1) { + // special case as described in encode_with_recode_loop(). + // Encoding was skipped. + err = AOM_CODEC_OK; + if (sse != NULL) *sse = INT64_MAX; + if (rate != NULL) *rate = INT64_MAX; + *largest_tile_id = 0; + } + return err; + } + +#ifdef OUTPUT_YUV_SKINMAP + if (cpi->common.current_frame.frame_number > 1) { + av1_compute_skin_map(cpi, yuv_skinmap_file); + } +#endif // OUTPUT_YUV_SKINMAP + + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = &cm->seq_params; + + // Special case code to reduce pulsing when key frames are forced at a + // fixed interval. Note the reconstruction error if it is the frame before + // the force key frame + if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) { +#if CONFIG_AV1_HIGHBITDEPTH + if (seq_params->use_highbitdepth) { + cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf); + } else { + cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); + } +#else + cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); +#endif + } + + cm->cur_frame->buf.color_primaries = seq_params->color_primaries; + cm->cur_frame->buf.transfer_characteristics = + seq_params->transfer_characteristics; + cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients; + cm->cur_frame->buf.monochrome = seq_params->monochrome; + cm->cur_frame->buf.chroma_sample_position = + seq_params->chroma_sample_position; + cm->cur_frame->buf.color_range = seq_params->color_range; + cm->cur_frame->buf.render_width = cm->render_width; + cm->cur_frame->buf.render_height = cm->render_height; + + // TODO(zoeliu): For non-ref frames, loop filtering may need to be turned + // off. + + // Pick the loop filter level for the frame. + if (!cm->features.allow_intrabc) { + loopfilter_frame(cpi, cm); + } else { + cm->lf.filter_level[0] = 0; + cm->lf.filter_level[1] = 0; + cm->cdef_info.cdef_bits = 0; + cm->cdef_info.cdef_strengths[0] = 0; + cm->cdef_info.nb_cdef_strengths = 1; + cm->cdef_info.cdef_uv_strengths[0] = 0; + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; + } + + // TODO(debargha): Fix mv search range on encoder side + // aom_extend_frame_inner_borders(&cm->cur_frame->buf, av1_num_planes(cm)); + aom_extend_frame_borders(&cm->cur_frame->buf, av1_num_planes(cm)); + +#ifdef OUTPUT_YUV_REC + aom_write_one_yuv_frame(cm, &cm->cur_frame->buf); +#endif + + finalize_encoded_frame(cpi); + // Build the bitstream +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_pack_bitstream_final_time); +#endif + if (av1_pack_bitstream(cpi, dest, size, largest_tile_id) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_pack_bitstream_final_time); +#endif + + // Compute sse and rate. + if (sse != NULL) { +#if CONFIG_AV1_HIGHBITDEPTH + *sse = (seq_params->use_highbitdepth) + ? 
aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf)
+               : aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#else
+    *sse = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+  }
+  if (rate != NULL) {
+    const int64_t bits = (*size << 3);
+    *rate = (bits << 5);  // To match scale.
+  }
+  return AOM_CODEC_OK;
+}
+
+#if CONFIG_SUPERRES_IN_RECODE
+
+static void save_cur_buf(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+  const YV12_BUFFER_CONFIG *ybf = &cm->cur_frame->buf;
+  memset(&cc->copy_buffer, 0, sizeof(cc->copy_buffer));
+  if (aom_alloc_frame_buffer(&cc->copy_buffer, ybf->y_crop_width,
+                             ybf->y_crop_height, ybf->subsampling_x,
+                             ybf->subsampling_y,
+                             ybf->flags & YV12_FLAG_HIGHBITDEPTH, ybf->border,
+                             cm->features.byte_alignment) != AOM_CODEC_OK) {
+    aom_internal_error(
+        &cm->error, AOM_CODEC_MEM_ERROR,
+        "Failed to allocate copy buffer for saving coding context");
+  }
+  aom_yv12_copy_frame(ybf, &cc->copy_buffer, av1_num_planes(cm));
+}
+
+// Coding context that only needs to be saved when the recode loop includes
+// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+// restoration).
+static void save_extra_coding_context(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+
+  cc->lf = cm->lf;
+  cc->cdef_info = cm->cdef_info;
+  cc->rc = cpi->rc;
+}
+
+static void save_all_coding_context(AV1_COMP *cpi) {
+  save_cur_buf(cpi);
+  save_extra_coding_context(cpi);
+  if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
+}
+
+static void restore_cur_buf(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+  aom_yv12_copy_frame(&cc->copy_buffer, &cm->cur_frame->buf,
+                      av1_num_planes(cm));
+}
+
+// Coding context that only needs to be restored when the recode loop includes
+// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+// restoration).
+static void restore_extra_coding_context(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+  cm->lf = cc->lf;
+  cm->cdef_info = cc->cdef_info;
+  cpi->rc = cc->rc;
+}
+
+static void restore_all_coding_context(AV1_COMP *cpi) {
+  restore_cur_buf(cpi);
+  restore_extra_coding_context(cpi);
+  if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
+}
+
+static void release_copy_buffer(CODING_CONTEXT *cc) {
+  aom_free_frame_buffer(&cc->copy_buffer);
+}
+
+static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size,
+                                            uint8_t *dest,
+                                            int *largest_tile_id) {
+  const AV1_COMMON *const cm = &cpi->common;
+  assert(cm->seq_params.enable_superres);
+  assert(superres_in_recode_allowed(cpi));
+  aom_codec_err_t err = AOM_CODEC_OK;
+  save_all_coding_context(cpi);
+
+  // Encode with superres.
+#if SUPERRES_RECODE_ALL_RATIOS
+  AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  int64_t superres_sses[SCALE_NUMERATOR];
+  int64_t superres_rates[SCALE_NUMERATOR];
+  int superres_largest_tile_ids[SCALE_NUMERATOR];
+  // Use superres for Key-frames and Alt-ref frames only.
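+  // Overlay and internal-overlay updates are excluded below; every other
+  // update type gets one trial encode per denominator in
+  // (SCALE_NUMERATOR, 2 * SCALE_NUMERATOR].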
+ const GF_GROUP *const gf_group = &cpi->gf_group; + if (gf_group->update_type[gf_group->index] != OVERLAY_UPDATE && + gf_group->update_type[gf_group->index] != INTNL_OVERLAY_UPDATE) { + for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; + ++denom) { + oxcf->superres_scale_denominator = denom; + oxcf->superres_kf_scale_denominator = denom; + const int this_index = denom - (SCALE_NUMERATOR + 1); + err = encode_with_recode_loop_and_filter( + cpi, size, dest, &superres_sses[this_index], + &superres_rates[this_index], &superres_largest_tile_ids[this_index]); + if (err != AOM_CODEC_OK) return err; + restore_all_coding_context(cpi); + } + // Reset. + oxcf->superres_scale_denominator = SCALE_NUMERATOR; + oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR; + } else { + for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; + ++denom) { + const int this_index = denom - (SCALE_NUMERATOR + 1); + superres_sses[this_index] = INT64_MAX; + superres_rates[this_index] = INT64_MAX; + } + } +#else + int64_t sse1 = INT64_MAX; + int64_t rate1 = INT64_MAX; + int largest_tile_id1; + err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse1, &rate1, + &largest_tile_id1); + if (err != AOM_CODEC_OK) return err; + restore_all_coding_context(cpi); +#endif // SUPERRES_RECODE_ALL_RATIOS + + // Encode without superres. + int64_t sse2 = INT64_MAX; + int64_t rate2 = INT64_MAX; + int largest_tile_id2; + cpi->superres_mode = SUPERRES_NONE; // To force full-res. + err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2, + &largest_tile_id2); + cpi->superres_mode = cpi->oxcf.superres_mode; // Reset. + assert(cpi->oxcf.superres_mode == SUPERRES_AUTO); + if (err != AOM_CODEC_OK) return err; + + // Note: Both use common rdmult based on base qindex of fullres. + const int64_t rdmult = + av1_compute_rd_mult_based_on_qindex(cpi, cm->quant_params.base_qindex); + +#if SUPERRES_RECODE_ALL_RATIOS + // Find the best rdcost among all superres denoms. + double proj_rdcost1 = DBL_MAX; + int64_t sse1 = INT64_MAX; + int64_t rate1 = INT64_MAX; + int largest_tile_id1 = 0; + (void)sse1; + (void)rate1; + (void)largest_tile_id1; + int best_denom = -1; + for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; ++denom) { + const int this_index = denom - (SCALE_NUMERATOR + 1); + const int64_t this_sse = superres_sses[this_index]; + const int64_t this_rate = superres_rates[this_index]; + const int this_largest_tile_id = superres_largest_tile_ids[this_index]; + const double this_rdcost = RDCOST_DBL(rdmult, this_rate, this_sse); + if (this_rdcost < proj_rdcost1) { + sse1 = this_sse; + rate1 = this_rate; + largest_tile_id1 = this_largest_tile_id; + proj_rdcost1 = this_rdcost; + best_denom = denom; + } + } +#else + const double proj_rdcost1 = RDCOST_DBL(rdmult, rate1, sse1); +#endif // SUPERRES_RECODE_ALL_RATIOS + const double proj_rdcost2 = RDCOST_DBL(rdmult, rate2, sse2); + + // Re-encode with superres if it's better. + if (proj_rdcost1 < proj_rdcost2) { + restore_all_coding_context(cpi); + // TODO(urvang): We should avoid rerunning the recode loop by saving + // previous output+state, or running encode only for the selected 'q' in + // previous step. +#if SUPERRES_RECODE_ALL_RATIOS + // Again, temporarily force the best denom. 
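+    // (Both fields are reset to SCALE_NUMERATOR again right after the
+    // re-encode below.)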
+ oxcf->superres_scale_denominator = best_denom; + oxcf->superres_kf_scale_denominator = best_denom; +#endif // SUPERRES_RECODE_ALL_RATIOS + int64_t sse3 = INT64_MAX; + int64_t rate3 = INT64_MAX; + err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3, + largest_tile_id); + assert(sse1 == sse3); + assert(rate1 == rate3); + assert(largest_tile_id1 == *largest_tile_id); +#if SUPERRES_RECODE_ALL_RATIOS + // Reset. + oxcf->superres_scale_denominator = SCALE_NUMERATOR; + oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR; +#endif // SUPERRES_RECODE_ALL_RATIOS + } else { + *largest_tile_id = largest_tile_id2; + } + + release_copy_buffer(&cpi->coding_context); + + return err; +} +#endif // CONFIG_SUPERRES_IN_RECODE + +#define DUMP_RECON_FRAMES 0 + +#if DUMP_RECON_FRAMES == 1 +// NOTE(zoeliu): For debug - Output the filtered reconstructed video. +static void dump_filtered_recon_frames(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const CurrentFrame *const current_frame = &cm->current_frame; + const YV12_BUFFER_CONFIG *recon_buf = &cm->cur_frame->buf; + + if (recon_buf == NULL) { + printf("Frame %d is not ready.\n", current_frame->frame_number); + return; + } + + static const int flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + printf( + "\n***Frame=%d (frame_offset=%d, show_frame=%d, " + "show_existing_frame=%d) " + "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[", + current_frame->frame_number, current_frame->order_hint, cm->show_frame, + cm->show_existing_frame); + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + const int ref_offset = buf != NULL ? (int)buf->order_hint : -1; + printf(" %d(%c)", ref_offset, + (cpi->ref_frame_flags & flag_list[ref_frame]) ? 
'Y' : 'N'); + } + printf(" ]\n"); + + if (!cm->show_frame) { + printf("Frame %d is a no show frame, so no image dump.\n", + current_frame->frame_number); + return; + } + + int h; + char file_name[256] = "/tmp/enc_filtered_recon.yuv"; + FILE *f_recon = NULL; + + if (current_frame->frame_number == 0) { + if ((f_recon = fopen(file_name, "wb")) == NULL) { + printf("Unable to open file %s to write.\n", file_name); + return; + } + } else { + if ((f_recon = fopen(file_name, "ab")) == NULL) { + printf("Unable to open file %s to append.\n", file_name); + return; + } + } + printf( + "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, " + "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, " + "refresh_alt_ref_frame=%d, " + "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n", + current_frame->frame_number, cpi->gf_group.index, + cpi->gf_group.update_type[cpi->gf_group.index], current_frame->order_hint, + cm->show_frame, cm->show_existing_frame, cpi->rc.source_alt_ref_active, + cpi->refresh_alt_ref_frame, recon_buf->y_stride, recon_buf->uv_stride, + cm->width, cm->height); +#if 0 + int ref_frame; + printf("get_ref_frame_map_idx: ["); + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) + printf(" %d", get_ref_frame_map_idx(cm, ref_frame)); + printf(" ]\n"); +#endif // 0 + + // --- Y --- + for (h = 0; h < cm->height; ++h) { + fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width, + f_recon); + } + // --- U --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1), + f_recon); + } + // --- V --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1), + f_recon); + } + + fclose(f_recon); +} +#endif // DUMP_RECON_FRAMES + +static int is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture, + const YV12_BUFFER_CONFIG *last_picture, + ForceIntegerMVInfo *const force_intpel_info) { + aom_clear_system_state(); + // check use hash ME + int k; + + const int block_size = FORCE_INT_MV_DECISION_BLOCK_SIZE; + const double threshold_current = 0.8; + const double threshold_average = 0.95; + const int max_history_size = 32; + int T = 0; // total block + int C = 0; // match with collocated block + int S = 0; // smooth region but not match with collocated block + + const int pic_width = cur_picture->y_width; + const int pic_height = cur_picture->y_height; + for (int i = 0; i + block_size <= pic_height; i += block_size) { + for (int j = 0; j + block_size <= pic_width; j += block_size) { + const int x_pos = j; + const int y_pos = i; + int match = 1; + T++; + + // check whether collocated block match with current + uint8_t *p_cur = cur_picture->y_buffer; + uint8_t *p_ref = last_picture->y_buffer; + int stride_cur = cur_picture->y_stride; + int stride_ref = last_picture->y_stride; + p_cur += (y_pos * stride_cur + x_pos); + p_ref += (y_pos * stride_ref + x_pos); + + if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur); + uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref); + for (int tmpY = 0; tmpY < block_size && match; tmpY++) { + for (int tmpX = 0; tmpX < block_size && match; tmpX++) { + if (p16_cur[tmpX] != p16_ref[tmpX]) { + match = 0; + } + } + p16_cur += stride_cur; + p16_ref += stride_ref; + } + } else { + for (int tmpY = 0; tmpY < block_size && match; tmpY++) { + for (int tmpX = 0; tmpX < block_size && match; tmpX++) { + if (p_cur[tmpX] != p_ref[tmpX]) { + match = 0; + } + } + p_cur 
+= stride_cur; + p_ref += stride_ref; + } + } + + if (match) { + C++; + continue; + } + + if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos, + y_pos) || + av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) { + S++; + continue; + } + } + } + + assert(T > 0); + double cs_rate = ((double)(C + S)) / ((double)(T)); + + force_intpel_info->cs_rate_array[force_intpel_info->rate_index] = cs_rate; + + force_intpel_info->rate_index = + (force_intpel_info->rate_index + 1) % max_history_size; + force_intpel_info->rate_size++; + force_intpel_info->rate_size = + AOMMIN(force_intpel_info->rate_size, max_history_size); + + if (cs_rate < threshold_current) { + return 0; + } + + if (C == T) { + return 1; + } + + double cs_average = 0.0; + + for (k = 0; k < force_intpel_info->rate_size; k++) { + cs_average += force_intpel_info->cs_rate_array[k]; + } + cs_average /= force_intpel_info->rate_size; + + if (cs_average < threshold_average) { + return 0; + } + + if ((T - C - S) < 0) { + return 1; + } + + if (cs_average > 1.01) { + return 1; + } + + return 0; +} + +// Refresh reference frame buffers according to refresh_frame_flags. +static void refresh_reference_frames(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + // All buffers are refreshed for shown keyframes and S-frames. + + for (int ref_frame = 0; ref_frame < REF_FRAMES; ref_frame++) { + if (((cm->current_frame.refresh_frame_flags >> ref_frame) & 1) == 1) { + assign_frame_buffer_p(&cm->ref_frame_map[ref_frame], cm->cur_frame); + } + } +} + +static void set_mb_ssim_rdmult_scaling(AV1_COMP *cpi) { + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + uint8_t *y_buffer = cpi->source->y_buffer; + const int y_stride = cpi->source->y_stride; + const int block_size = BLOCK_16X16; + + const int num_mi_w = mi_size_wide[block_size]; + const int num_mi_h = mi_size_high[block_size]; + const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h; + double log_sum = 0.0; + const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH; + + // Loop through each 16x16 block. + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + double var = 0.0, num_of_var = 0.0; + const int index = row * num_cols + col; + + // Loop through each 8x8 block. + for (int mi_row = row * num_mi_h; + mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h; + mi_row += 2) { + for (int mi_col = col * num_mi_w; + mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w; + mi_col += 2) { + struct buf_2d buf; + const int row_offset_y = mi_row << 2; + const int col_offset_y = mi_col << 2; + + buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y; + buf.stride = y_stride; + + if (use_hbd) { + var += av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8, + xd->bd); + } else { + var += av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8); + } + + num_of_var += 1.0; + } + } + var = var / num_of_var; + + // Curve fitting with an exponential model on all 16x16 blocks from the + // midres dataset. 
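+      // The constants below come from a saturating-exponential fit of the
+      // form a * (1 - exp(-b * var)) + c, so the scaling factor levels off
+      // for high-variance blocks.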
+ var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222; + cpi->ssim_rdmult_scaling_factors[index] = var; + log_sum += log(var); + } + } + log_sum = exp(log_sum / (double)(num_rows * num_cols)); + + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + cpi->ssim_rdmult_scaling_factors[index] /= log_sum; + } + } +} + +extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc, + const char *filename); + +static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, + uint8_t *dest) { + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = &cm->seq_params; + CurrentFrame *const current_frame = &cm->current_frame; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + struct segmentation *const seg = &cm->seg; + FeatureFlags *const features = &cm->features; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_frame_to_data_rate_time); +#endif + + // frame type has been decided outside of this function call + cm->cur_frame->frame_type = current_frame->frame_type; + + cm->tiles.large_scale = cpi->oxcf.large_scale_tile; + cm->tiles.single_tile_decoding = cpi->oxcf.single_tile_decoding; + + features->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm); + // features->allow_ref_frame_mvs needs to be written into the frame header + // while cm->tiles.large_scale is 1, therefore, "cm->tiles.large_scale=1" case + // is separated from frame_might_allow_ref_frame_mvs(). + features->allow_ref_frame_mvs &= !cm->tiles.large_scale; + + features->allow_warped_motion = + cpi->oxcf.allow_warped_motion && frame_might_allow_warped_motion(cm); + + cpi->last_frame_type = current_frame->frame_type; + + if (encode_show_existing_frame(cm)) { + finalize_encoded_frame(cpi); + // Build the bitstream + int largest_tile_id = 0; // Output from bitstream: unused here + if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; + + if (seq_params->frame_id_numbers_present_flag && + current_frame->frame_type == KEY_FRAME) { + // Displaying a forward key-frame, so reset the ref buffer IDs + int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; + for (int i = 0; i < REF_FRAMES; i++) + cm->ref_frame_id[i] = display_frame_id; + } + + cpi->seq_params_locked = 1; + +#if DUMP_RECON_FRAMES == 1 + // NOTE(zoeliu): For debug - Output the filtered reconstructed video. + dump_filtered_recon_frames(cpi); +#endif // DUMP_RECON_FRAMES + + // NOTE: Save the new show frame buffer index for --test-code=warn, i.e., + // for the purpose to verify no mismatch between encoder and decoder. + if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame; + + refresh_reference_frames(cpi); + + // Since we allocate a spot for the OVERLAY frame in the gf group, we need + // to do post-encoding update accordingly. 
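+  // The show_existing_frame path returns early just below, so this is the
+  // only point where the OVERLAY frame's size is fed back to rate control.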
+ if (cpi->rc.is_src_frame_alt_ref) { + av1_set_target_rate(cpi, cm->width, cm->height); + av1_rc_postencode_update(cpi, *size); + } + + ++current_frame->frame_number; + + return AOM_CODEC_OK; + } + + // Work out whether to force_integer_mv this frame + if (!is_stat_generation_stage(cpi) && + cpi->common.features.allow_screen_content_tools && + !frame_is_intra_only(cm)) { + if (cpi->common.seq_params.force_integer_mv == 2) { + // Adaptive mode: see what previous frame encoded did + if (cpi->unscaled_last_source != NULL) { + features->cur_frame_force_integer_mv = is_integer_mv( + cpi->source, cpi->unscaled_last_source, &cpi->force_intpel_info); + } else { + cpi->common.features.cur_frame_force_integer_mv = 0; + } + } else { + cpi->common.features.cur_frame_force_integer_mv = + cpi->common.seq_params.force_integer_mv; + } + } else { + cpi->common.features.cur_frame_force_integer_mv = 0; + } + + // Set default state for segment based loop filter update flags. + cm->lf.mode_ref_delta_update = 0; + + // Set various flags etc to special state if it is a key frame. + if (frame_is_intra_only(cm) || frame_is_sframe(cm)) { + // Reset the loop filter deltas and segmentation map. + av1_reset_segment_features(cm); + + // If segmentation is enabled force a map update for key frames. + if (seg->enabled) { + seg->update_map = 1; + seg->update_data = 1; + } + + // The alternate reference frame cannot be active for a key frame. + cpi->rc.source_alt_ref_active = 0; + } + if (cpi->oxcf.mtu == 0) { + cpi->num_tg = cpi->oxcf.num_tile_groups; + } else { + // Use a default value for the purposes of weighting costs in probability + // updates + cpi->num_tg = DEFAULT_MAX_NUM_TG; + } + + // For 1 pass CBR, check if we are dropping this frame. + // Never drop on key frame. + if (has_no_stats_stage(cpi) && oxcf->rc_mode == AOM_CBR && + current_frame->frame_type != KEY_FRAME) { + if (av1_rc_drop_frame(cpi)) { + av1_setup_frame_size(cpi); + av1_rc_postencode_update_drop_frame(cpi); + release_scaled_references(cpi); + return AOM_CODEC_OK; + } + } + + if (oxcf->tuning == AOM_TUNE_SSIM) set_mb_ssim_rdmult_scaling(cpi); + +#if CONFIG_TUNE_VMAF + if (oxcf->tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || + oxcf->tuning == AOM_TUNE_VMAF_MAX_GAIN) { + av1_set_mb_vmaf_rdmult_scaling(cpi); + } +#endif + + aom_clear_system_state(); + +#if CONFIG_INTERNAL_STATS + memset(cpi->mode_chosen_counts, 0, + MAX_MODES * sizeof(*cpi->mode_chosen_counts)); +#endif + + if (seq_params->frame_id_numbers_present_flag) { + /* Non-normative definition of current_frame_id ("frame counter" with + * wraparound) */ + if (cm->current_frame_id == -1) { + int lsb, msb; + /* quasi-random initialization of current_frame_id for a key frame */ + if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) { + lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff; + msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff; + } else { + lsb = cpi->source->y_buffer[0] & 0xff; + msb = cpi->source->y_buffer[1] & 0xff; + } + cm->current_frame_id = + ((msb << 8) + lsb) % (1 << seq_params->frame_id_length); + + // S_frame is meant for stitching different streams of different + // resolutions together, so current_frame_id must be the + // same across different streams of the same content current_frame_id + // should be the same and not random. 
0x37 is a chosen number as start + // point + if (cpi->oxcf.sframe_enabled) cm->current_frame_id = 0x37; + } else { + cm->current_frame_id = + (cm->current_frame_id + 1 + (1 << seq_params->frame_id_length)) % + (1 << seq_params->frame_id_length); + } + } + + switch (cpi->oxcf.cdf_update_mode) { + case 0: // No CDF update for any frames(4~6% compression loss). + features->disable_cdf_update = 1; + break; + case 1: // Enable CDF update for all frames. + features->disable_cdf_update = 0; + break; + case 2: + // Strategically determine at which frames to do CDF update. + // Currently only enable CDF update for all-intra and no-show frames(1.5% + // compression loss). + // TODO(huisu@google.com): design schemes for various trade-offs between + // compression quality and decoding speed. + features->disable_cdf_update = + (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1; + break; + } + seq_params->timing_info_present &= !seq_params->reduced_still_picture_hdr; + + int largest_tile_id = 0; +#if CONFIG_SUPERRES_IN_RECODE + if (superres_in_recode_allowed(cpi)) { + if (encode_with_and_without_superres(cpi, size, dest, &largest_tile_id) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + } else { +#endif // CONFIG_SUPERRES_IN_RECODE + if (encode_with_recode_loop_and_filter(cpi, size, dest, NULL, NULL, + &largest_tile_id) != AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } +#if CONFIG_SUPERRES_IN_RECODE + } +#endif // CONFIG_SUPERRES_IN_RECODE + + cpi->seq_params_locked = 1; + + // Update reference frame ids for reference frames this frame will overwrite + if (seq_params->frame_id_numbers_present_flag) { + for (int i = 0; i < REF_FRAMES; i++) { + if ((current_frame->refresh_frame_flags >> i) & 1) { + cm->ref_frame_id[i] = cm->current_frame_id; + } + } + } + +#if DUMP_RECON_FRAMES == 1 + // NOTE(zoeliu): For debug - Output the filtered reconstructed video. + dump_filtered_recon_frames(cpi); +#endif // DUMP_RECON_FRAMES + + if (cm->seg.enabled) { + if (cm->seg.update_map) { + update_reference_segmentation_map(cpi); + } else if (cm->last_frame_seg_map) { + memcpy(cm->cur_frame->seg_map, cm->last_frame_seg_map, + cm->mi_params.mi_cols * cm->mi_params.mi_rows * sizeof(uint8_t)); + } + } + + if (frame_is_intra_only(cm) == 0) { + release_scaled_references(cpi); + } + + // NOTE: Save the new show frame buffer index for --test-code=warn, i.e., + // for the purpose to verify no mismatch between encoder and decoder. + if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame; + + refresh_reference_frames(cpi); + +#if CONFIG_ENTROPY_STATS + av1_accumulate_frame_counts(&aggregate_fc, &cpi->counts); +#endif // CONFIG_ENTROPY_STATS + + if (features->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { + *cm->fc = cpi->tile_data[largest_tile_id].tctx; + av1_reset_cdf_symbol_counters(cm->fc); + } + if (!cm->tiles.large_scale) { + cm->cur_frame->frame_context = *cm->fc; + } + + if (cpi->oxcf.ext_tile_debug) { + // (yunqing) This test ensures the correctness of large scale tile coding. + if (cm->tiles.large_scale && is_stat_consumption_stage(cpi)) { + char fn[20] = "./fc"; + fn[4] = current_frame->frame_number / 100 + '0'; + fn[5] = (current_frame->frame_number % 100) / 10 + '0'; + fn[6] = (current_frame->frame_number % 10) + '0'; + fn[7] = '\0'; + av1_print_frame_contexts(cm->fc, fn); + } + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_frame_to_data_rate_time); + + // Print out timing information. 
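+  // Accumulate this frame's component times into the running totals, then
+  // clear the per-frame counters for the next frame.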
+  int i;
+  fprintf(stderr, "\n Frame number: %d, Frame type: %s, Show Frame: %d\n",
+          cm->current_frame.frame_number,
+          get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame);
+  for (i = 0; i < kTimingComponents; i++) {
+    cpi->component_time[i] += cpi->frame_component_time[i];
+    fprintf(stderr, " %s: %" PRId64 " us (total: %" PRId64 " us)\n",
+            get_component_name(i), cpi->frame_component_time[i],
+            cpi->component_time[i]);
+    cpi->frame_component_time[i] = 0;
+  }
+#endif
+
+  cpi->last_frame_type = current_frame->frame_type;
+
+  av1_rc_postencode_update(cpi, *size);
+
+  // Clear the one-shot update flags for the segmentation map and mode/ref
+  // loop filter deltas.
+  cm->seg.update_map = 0;
+  cm->seg.update_data = 0;
+  cm->lf.mode_ref_delta_update = 0;
+
+  // A droppable frame might not be shown, but it always takes a spot in the
+  // gf group. Therefore, even when it is not shown, we still need to update
+  // the countdown.
+
+  if (cm->show_frame) {
+    // Don't increment frame counters if this was an altref buffer update,
+    // not a real frame.
+    ++current_frame->frame_number;
+  }
+
+  return AOM_CODEC_OK;
+}
+
+int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
+               const EncodeFrameInput *const frame_input,
+               const EncodeFrameParams *const frame_params,
+               EncodeFrameResults *const frame_results) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+
+  cpi->unscaled_source = frame_input->source;
+  cpi->source = frame_input->source;
+  cpi->unscaled_last_source = frame_input->last_source;
+
+  current_frame->refresh_frame_flags = frame_params->refresh_frame_flags;
+  cm->features.error_resilient_mode = frame_params->error_resilient_mode;
+  cm->features.primary_ref_frame = frame_params->primary_ref_frame;
+  cm->current_frame.frame_type = frame_params->frame_type;
+  cm->show_frame = frame_params->show_frame;
+  cpi->ref_frame_flags = frame_params->ref_frame_flags;
+  cpi->speed = frame_params->speed;
+  cm->show_existing_frame = frame_params->show_existing_frame;
+  cpi->existing_fb_idx_to_show = frame_params->existing_fb_idx_to_show;
+
+  memcpy(cm->remapped_ref_idx, frame_params->remapped_ref_idx,
+         REF_FRAMES * sizeof(*cm->remapped_ref_idx));
+
+  cpi->refresh_golden_frame = frame_params->refresh_golden_frame;
+  cpi->refresh_bwd_ref_frame = frame_params->refresh_bwd_ref_frame;
+  cpi->refresh_alt_ref_frame = frame_params->refresh_alt_ref_frame;
+
+  if (current_frame->frame_type == KEY_FRAME && cm->show_frame)
+    current_frame->frame_number = 0;
+
+  current_frame->order_hint =
+      current_frame->frame_number + frame_params->order_offset;
+  current_frame->display_order_hint = current_frame->order_hint;
+  current_frame->order_hint %=
+      (1 << (cm->seq_params.order_hint_info.order_hint_bits_minus_1 + 1));
+
+  if (is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+    av1_first_pass(cpi, frame_input->ts_duration);
+#endif
+  } else if (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) {
+    if (encode_frame_to_data_rate(cpi, &frame_results->size, dest) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+  } else {
+    return AOM_CODEC_ERROR;
+  }
+
+  return AOM_CODEC_OK;
+}
+
+#if CONFIG_DENOISE
+static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd,
+                            int block_size, float noise_level,
+                            int64_t time_stamp, int64_t end_time) {
+  AV1_COMMON *const cm = &cpi->common;
+  if (!cpi->denoise_and_model) {
+    cpi->denoise_and_model = aom_denoise_and_model_alloc(
+        cm->seq_params.bit_depth, block_size, noise_level);
+    if (!cpi->denoise_and_model) {
+
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating denoise and model"); + return -1; + } + } + if (!cpi->film_grain_table) { + cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table)); + if (!cpi->film_grain_table) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating grain table"); + return -1; + } + memset(cpi->film_grain_table, 0, sizeof(*cpi->film_grain_table)); + } + if (aom_denoise_and_model_run(cpi->denoise_and_model, sd, + &cm->film_grain_params)) { + if (cm->film_grain_params.apply_grain) { + aom_film_grain_table_append(cpi->film_grain_table, time_stamp, end_time, + &cm->film_grain_params); + } + } + return 0; +} +#endif + +int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, + YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + int64_t end_time) { + AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + int res = 0; + const int subsampling_x = sd->subsampling_x; + const int subsampling_y = sd->subsampling_y; + const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; + +#if CONFIG_TUNE_VMAF + if (!is_stat_generation_stage(cpi) && + cpi->oxcf.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING) { + av1_vmaf_frame_preprocessing(cpi, sd); + } + if (!is_stat_generation_stage(cpi) && + cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) { + av1_vmaf_blk_preprocessing(cpi, sd); + } +#endif + +#if CONFIG_INTERNAL_STATS + struct aom_usec_timer timer; + aom_usec_timer_start(&timer); +#endif +#if CONFIG_DENOISE + if (cpi->oxcf.noise_level > 0) + if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size, + cpi->oxcf.noise_level, time_stamp, end_time) < 0) + res = -1; +#endif // CONFIG_DENOISE + + if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, + use_highbitdepth, frame_flags)) + res = -1; +#if CONFIG_INTERNAL_STATS + aom_usec_timer_mark(&timer); + cpi->time_receive_data += aom_usec_timer_elapsed(&timer); +#endif + if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome && + (subsampling_x != 1 || subsampling_y != 1)) { + aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, + "Non-4:2:0 color format requires profile 1 or 2"); + res = -1; + } + if ((seq_params->profile == PROFILE_1) && + !(subsampling_x == 0 && subsampling_y == 0)) { + aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, + "Profile 1 requires 4:4:4 color format"); + res = -1; + } + if ((seq_params->profile == PROFILE_2) && + (seq_params->bit_depth <= AOM_BITS_10) && + !(subsampling_x == 1 && subsampling_y == 0)) { + aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, + "Profile 2 bit-depth < 10 requires 4:2:2 color format"); + res = -1; + } + + return res; +} + +#if CONFIG_INTERNAL_STATS +extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch, + const unsigned char *img2, int img2_pitch, + int width, int height); + +static void adjust_image_stat(double y, double u, double v, double all, + ImageStat *s) { + s->stat[STAT_Y] += y; + s->stat[STAT_U] += u; + s->stat[STAT_V] += v; + s->stat[STAT_ALL] += all; + s->worst = AOMMIN(s->worst, all); +} + +static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { + AV1_COMMON *const cm = &cpi->common; + double samples = 0.0; + const uint32_t in_bit_depth = cpi->oxcf.input_bit_depth; + const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; + +#if CONFIG_INTER_STATS_ONLY + if (cm->current_frame.frame_type == KEY_FRAME) return; // skip key frame +#endif + cpi->bytes += frame_bytes; + if (cm->show_frame) { + const 
YV12_BUFFER_CONFIG *orig = cpi->source; + const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; + double y, u, v, frame_all; + + cpi->count++; + if (cpi->b_calculate_psnr) { + PSNR_STATS psnr; + double frame_ssim2 = 0.0, weight = 0.0; + aom_clear_system_state(); +#if CONFIG_AV1_HIGHBITDEPTH + aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth); +#else + aom_calc_psnr(orig, recon, &psnr); +#endif + adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0], + &cpi->psnr); + cpi->total_sq_error += psnr.sse[0]; + cpi->total_samples += psnr.samples[0]; + samples = psnr.samples[0]; + // TODO(yaowu): unify these two versions into one. + if (cm->seq_params.use_highbitdepth) + frame_ssim2 = + aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth); + else + frame_ssim2 = aom_calc_ssim(orig, recon, &weight); + + cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2); + cpi->summed_quality += frame_ssim2 * weight; + cpi->summed_weights += weight; + +#if 0 + { + FILE *f = fopen("q_used.stt", "a"); + double y2 = psnr.psnr[1]; + double u2 = psnr.psnr[2]; + double v2 = psnr.psnr[3]; + double frame_psnr2 = psnr.psnr[0]; + fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n", + cm->current_frame.frame_number, y2, u2, v2, + frame_psnr2, frame_ssim2); + fclose(f); + } +#endif + } + if (cpi->b_calculate_blockiness) { + if (!cm->seq_params.use_highbitdepth) { + const double frame_blockiness = + av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer, + recon->y_stride, orig->y_width, orig->y_height); + cpi->worst_blockiness = AOMMAX(cpi->worst_blockiness, frame_blockiness); + cpi->total_blockiness += frame_blockiness; + } + + if (cpi->b_calculate_consistency) { + if (!cm->seq_params.use_highbitdepth) { + const double this_inconsistency = aom_get_ssim_metrics( + orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, + orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1); + + const double peak = (double)((1 << in_bit_depth) - 1); + const double consistency = + aom_sse_to_psnr(samples, peak, cpi->total_inconsistency); + if (consistency > 0.0) + cpi->worst_consistency = + AOMMIN(cpi->worst_consistency, consistency); + cpi->total_inconsistency += this_inconsistency; + } + } + } + + frame_all = + aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth); + adjust_image_stat(y, u, v, frame_all, &cpi->fastssim); + frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth); + adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs); + } +} +#endif // CONFIG_INTERNAL_STATS +int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, + size_t *size, uint8_t *dest, int64_t *time_stamp, + int64_t *time_end, int flush, + const aom_rational64_t *timestamp_ratio) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + AV1_COMMON *const cm = &cpi->common; + +#if CONFIG_BITSTREAM_DEBUG + assert(cpi->oxcf.max_threads == 0 && + "bitstream debug tool does not support multithreading"); + bitstream_queue_record_write(); + aom_bitstream_queue_set_frame_write(cm->current_frame.frame_number * 2 + + cm->show_frame); +#endif + if (cpi->use_svc && cm->number_spatial_layers > 1) { + av1_one_pass_cbr_svc_start_layer(cpi); + } + + cm->showable_frame = 0; + *size = 0; +#if CONFIG_INTERNAL_STATS + struct aom_usec_timer cmptimer; + aom_usec_timer_start(&cmptimer); +#endif + av1_set_high_precision_mv(cpi, 1, 0); + + // Normal defaults + cm->features.refresh_frame_context = oxcf->frame_parallel_decoding_mode + ? 
REFRESH_FRAME_CONTEXT_DISABLED + : REFRESH_FRAME_CONTEXT_BACKWARD; + if (oxcf->large_scale_tile) + cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; + + // Initialize fields related to forward keyframes + cpi->no_show_kf = 0; + + if (assign_cur_frame_new_fb(cm) == NULL) return AOM_CODEC_ERROR; + + const int result = + av1_encode_strategy(cpi, size, dest, frame_flags, time_stamp, time_end, + timestamp_ratio, flush); + if (result != AOM_CODEC_OK && result != -1) { + return AOM_CODEC_ERROR; + } else if (result == -1) { + // Returning -1 indicates no frame encoded; more input is required + return -1; + } +#if CONFIG_INTERNAL_STATS + aom_usec_timer_mark(&cmptimer); + cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer); +#endif // CONFIG_INTERNAL_STATS + if (cpi->b_calculate_psnr) { + if (cm->show_existing_frame || + (!is_stat_generation_stage(cpi) && cm->show_frame)) { + generate_psnr_packet(cpi); + } + } + +#if CONFIG_TUNE_VMAF + if (!is_stat_generation_stage(cpi) && + (oxcf->tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING || + oxcf->tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || + oxcf->tuning == AOM_TUNE_VMAF_MAX_GAIN)) { + av1_update_vmaf_curve(cpi, cpi->source, &cpi->common.cur_frame->buf); + } +#endif + + if (cpi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) { + // Initialize level info. at the beginning of each sequence. + if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) { + av1_init_level_info(cpi); + } + av1_update_level_info(cpi, *size, *time_stamp, *time_end); + } + +#if CONFIG_INTERNAL_STATS + if (!is_stat_generation_stage(cpi)) { + compute_internal_stats(cpi, (int)(*size)); + } +#endif // CONFIG_INTERNAL_STATS +#if CONFIG_SPEED_STATS + if (!is_stat_generation_stage(cpi) && !cm->show_existing_frame) { + cpi->tx_search_count += cpi->td.mb.tx_search_count; + cpi->td.mb.tx_search_count = 0; + } +#endif // CONFIG_SPEED_STATS + + aom_clear_system_state(); + + return AOM_CODEC_OK; +} + +int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) { + AV1_COMMON *cm = &cpi->common; + if (!cm->show_frame) { + return -1; + } else { + int ret; + if (cm->cur_frame != NULL) { + *dest = cm->cur_frame->buf; + dest->y_width = cm->width; + dest->y_height = cm->height; + dest->uv_width = cm->width >> cm->seq_params.subsampling_x; + dest->uv_height = cm->height >> cm->seq_params.subsampling_y; + ret = 0; + } else { + ret = -1; + } + aom_clear_system_state(); + return ret; + } +} + +int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) { + if (cpi->last_show_frame_buf == NULL) return -1; + + *frame = cpi->last_show_frame_buf->buf; + return 0; +} + +static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + return a->y_height == b->y_height && a->y_width == b->y_width && + a->uv_height == b->uv_height && a->uv_width == b->uv_width && + a->y_stride == b->y_stride && a->uv_stride == b->uv_stride && + a->border == b->border && + (a->flags & YV12_FLAG_HIGHBITDEPTH) == + (b->flags & YV12_FLAG_HIGHBITDEPTH); +} + +aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd) { + const int num_planes = av1_num_planes(cm); + if (!equal_dimensions_and_border(new_frame, sd)) + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + aom_yv12_copy_frame(new_frame, sd, num_planes); + + return cm->error.error_code; +} + +int av1_set_internal_size(AV1EncoderConfig *const oxcf, + ResizePendingParams 
*resize_pending_params, + AOM_SCALING horiz_mode, AOM_SCALING vert_mode) { + int hr = 0, hs = 0, vr = 0, vs = 0; + + if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1; + + Scale2Ratio(horiz_mode, &hr, &hs); + Scale2Ratio(vert_mode, &vr, &vs); + + // always go to the next whole number + resize_pending_params->width = (hs - 1 + oxcf->width * hr) / hs; + resize_pending_params->height = (vs - 1 + oxcf->height * vr) / vs; + + return 0; +} + +int av1_get_quantizer(AV1_COMP *cpi) { + return cpi->common.quant_params.base_qindex; +} + +int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) { + size_t output_size = 0; + size_t total_bytes_read = 0; + size_t remaining_size = *frame_size; + uint8_t *buff_ptr = buffer; + + // go through each OBUs + while (total_bytes_read < *frame_size) { + uint8_t saved_obu_header[2]; + uint64_t obu_payload_size; + size_t length_of_payload_size; + size_t length_of_obu_size; + uint32_t obu_header_size = (buff_ptr[0] >> 2) & 0x1 ? 2 : 1; + size_t obu_bytes_read = obu_header_size; // bytes read for current obu + + // save the obu header (1 or 2 bytes) + memmove(saved_obu_header, buff_ptr, obu_header_size); + // clear the obu_has_size_field + saved_obu_header[0] = saved_obu_header[0] & (~0x2); + + // get the payload_size and length of payload_size + if (aom_uleb_decode(buff_ptr + obu_header_size, remaining_size, + &obu_payload_size, &length_of_payload_size) != 0) { + return AOM_CODEC_ERROR; + } + obu_bytes_read += length_of_payload_size; + + // calculate the length of size of the obu header plus payload + length_of_obu_size = + aom_uleb_size_in_bytes((uint64_t)(obu_header_size + obu_payload_size)); + + // move the rest of data to new location + memmove(buff_ptr + length_of_obu_size + obu_header_size, + buff_ptr + obu_bytes_read, remaining_size - obu_bytes_read); + obu_bytes_read += (size_t)obu_payload_size; + + // write the new obu size + const uint64_t obu_size = obu_header_size + obu_payload_size; + size_t coded_obu_size; + if (aom_uleb_encode(obu_size, sizeof(obu_size), buff_ptr, + &coded_obu_size) != 0) { + return AOM_CODEC_ERROR; + } + + // write the saved (modified) obu_header following obu size + memmove(buff_ptr + length_of_obu_size, saved_obu_header, obu_header_size); + + total_bytes_read += obu_bytes_read; + remaining_size -= obu_bytes_read; + buff_ptr += length_of_obu_size + obu_size; + output_size += length_of_obu_size + (size_t)obu_size; + } + + *frame_size = output_size; + return AOM_CODEC_OK; +} + +static void svc_set_updates_external_ref_frame_config( + ExternalFlags *const ext_flags, SVC *const svc) { + ext_flags->refresh_frame_flags_pending = 1; + ext_flags->refresh_last_frame = svc->refresh[svc->ref_idx[0]]; + ext_flags->refresh_golden_frame = svc->refresh[svc->ref_idx[3]]; + ext_flags->refresh_bwd_ref_frame = svc->refresh[svc->ref_idx[4]]; + ext_flags->refresh_alt2_ref_frame = svc->refresh[svc->ref_idx[5]]; + ext_flags->refresh_alt_ref_frame = svc->refresh[svc->ref_idx[6]]; + svc->non_reference_frame = 1; + for (int i = 0; i < REF_FRAMES; i++) { + if (svc->refresh[i] == 1) { + svc->non_reference_frame = 0; + break; + } + } +} + +static int svc_set_references_external_ref_frame_config(AV1_COMP *cpi) { + // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), + // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). 
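+  // Start from "all references allowed" and clear the bit for each slot the
+  // external ref-frame config marked as unused.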
+ int ref = AOM_REFFRAME_ALL; + for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { + if (!cpi->svc.reference[i]) ref ^= (1 << i); + } + return ref; +} + +void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) { + // TODO(yunqingwang): For what references to use, external encoding flags + // should be consistent with internal reference frame selection. Need to + // ensure that there is not conflict between the two. In AV1 encoder, the + // priority rank for 7 reference frames are: LAST, ALTREF, LAST2, LAST3, + // GOLDEN, BWDREF, ALTREF2. + + ExternalFlags *const ext_flags = &cpi->ext_flags; + ext_flags->ref_frame_flags = AOM_REFFRAME_ALL; + if (flags & + (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | + AOM_EFLAG_NO_REF_ARF2)) { + int ref = AOM_REFFRAME_ALL; + + if (flags & AOM_EFLAG_NO_REF_LAST) ref ^= AOM_LAST_FLAG; + if (flags & AOM_EFLAG_NO_REF_LAST2) ref ^= AOM_LAST2_FLAG; + if (flags & AOM_EFLAG_NO_REF_LAST3) ref ^= AOM_LAST3_FLAG; + + if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG; + + if (flags & AOM_EFLAG_NO_REF_ARF) { + ref ^= AOM_ALT_FLAG; + ref ^= AOM_BWD_FLAG; + ref ^= AOM_ALT2_FLAG; + } else { + if (flags & AOM_EFLAG_NO_REF_BWD) ref ^= AOM_BWD_FLAG; + if (flags & AOM_EFLAG_NO_REF_ARF2) ref ^= AOM_ALT2_FLAG; + } + + av1_use_as_reference(&ext_flags->ref_frame_flags, ref); + } else { + if (cpi->svc.external_ref_frame_config) { + int ref = svc_set_references_external_ref_frame_config(cpi); + av1_use_as_reference(&ext_flags->ref_frame_flags, ref); + } + } + + if (flags & + (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF)) { + int upd = AOM_REFFRAME_ALL; + + // Refreshing LAST/LAST2/LAST3 is handled by 1 common flag. 
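+    // Clearing AOM_LAST_FLAG here therefore suppresses LAST2/LAST3 updates
+    // as well.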
+ if (flags & AOM_EFLAG_NO_UPD_LAST) upd ^= AOM_LAST_FLAG; + + if (flags & AOM_EFLAG_NO_UPD_GF) upd ^= AOM_GOLD_FLAG; + + if (flags & AOM_EFLAG_NO_UPD_ARF) { + upd ^= AOM_ALT_FLAG; + upd ^= AOM_BWD_FLAG; + upd ^= AOM_ALT2_FLAG; + } + + ext_flags->refresh_last_frame = (upd & AOM_LAST_FLAG) != 0; + ext_flags->refresh_golden_frame = (upd & AOM_GOLD_FLAG) != 0; + ext_flags->refresh_alt_ref_frame = (upd & AOM_ALT_FLAG) != 0; + ext_flags->refresh_bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0; + ext_flags->refresh_alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0; + ext_flags->refresh_frame_flags_pending = 1; + } else { + if (cpi->svc.external_ref_frame_config) + svc_set_updates_external_ref_frame_config(ext_flags, &cpi->svc); + else + ext_flags->refresh_frame_flags_pending = 0; + } + + ext_flags->use_ref_frame_mvs = cpi->oxcf.allow_ref_frame_mvs & + ((flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0); + ext_flags->use_error_resilient = cpi->oxcf.error_resilient_mode | + ((flags & AOM_EFLAG_ERROR_RESILIENT) != 0); + ext_flags->use_s_frame = + cpi->oxcf.s_frame_mode | ((flags & AOM_EFLAG_SET_S_FRAME) != 0); + ext_flags->use_primary_ref_none = + (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0; + + if (flags & AOM_EFLAG_NO_UPD_ENTROPY) { + av1_update_entropy(&ext_flags->refresh_frame_context, + &ext_flags->refresh_frame_context_pending, 0); + } +} + +aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) { + if (!cpi) return NULL; + + uint8_t header_buf[512] = { 0 }; + const uint32_t sequence_header_size = + av1_write_sequence_header_obu(&cpi->common.seq_params, &header_buf[0]); + assert(sequence_header_size <= sizeof(header_buf)); + if (sequence_header_size == 0) return NULL; + + const size_t obu_header_size = 1; + const size_t size_field_size = aom_uleb_size_in_bytes(sequence_header_size); + const size_t payload_offset = obu_header_size + size_field_size; + + if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL; + memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size); + + if (av1_write_obu_header(&cpi->level_params, OBU_SEQUENCE_HEADER, 0, + &header_buf[0]) != obu_header_size) { + return NULL; + } + + size_t coded_size_field_size = 0; + if (aom_uleb_encode(sequence_header_size, size_field_size, + &header_buf[obu_header_size], + &coded_size_field_size) != 0) { + return NULL; + } + assert(coded_size_field_size == size_field_size); + + aom_fixed_buf_t *global_headers = + (aom_fixed_buf_t *)malloc(sizeof(*global_headers)); + if (!global_headers) return NULL; + + const size_t global_header_buf_size = + obu_header_size + size_field_size + sequence_header_size; + + global_headers->buf = malloc(global_header_buf_size); + if (!global_headers->buf) { + free(global_headers); + return NULL; + } + + memcpy(global_headers->buf, &header_buf[0], global_header_buf_size); + global_headers->sz = global_header_buf_size; + return global_headers; +} diff --git a/libs/libaom/src/av1/encoder/encoder.h b/libs/libaom/src/av1/encoder/encoder.h new file mode 100644 index 000000000..82d00cb76 --- /dev/null +++ b/libs/libaom/src/av1/encoder/encoder.h @@ -0,0 +1,1965 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODER_H_
+#define AOM_AV1_ENCODER_ENCODER_H_
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aomcx.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/enums.h"
+#include "av1/common/resize.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/timing.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/level.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/svc_layercontext.h"
+#include "av1/encoder/tokenize.h"
+
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_dsp/variance.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/noise_model.h"
+#endif
+#include "aom/internal/aom_codec_internal.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Number of frames required to test for scene cut detection
+#define SCENE_CUT_KEY_TEST_INTERVAL 16
+
+// Rational number with an int64 numerator
+// This structure holds a fractional value
+typedef struct aom_rational64 {
+  int64_t num;       // fraction numerator
+  int den;           // fraction denominator
+} aom_rational64_t;  // alias for struct aom_rational64
+
+typedef struct {
+#if CONFIG_SUPERRES_IN_RECODE
+  struct loopfilter lf;
+  CdefInfo cdef_info;
+  YV12_BUFFER_CONFIG copy_buffer;
+  RATE_CONTROL rc;
+#endif  // CONFIG_SUPERRES_IN_RECODE
+} CODING_CONTEXT;
+
+enum {
+  NORMAL = 0,
+  FOURFIVE = 1,
+  THREEFIVE = 2,
+  ONETWO = 3
+} UENUM1BYTE(AOM_SCALING);
+
+enum {
+  // Good Quality Fast Encoding. The encoder balances quality with the amount of
+  // time it takes to encode the output. Speed setting controls how fast.
+  GOOD,
+  // Realtime Fast Encoding. Will force some restrictions on bitrate
+  // constraints.
+  REALTIME
+} UENUM1BYTE(MODE);
+
+enum {
+  FRAMEFLAGS_KEY = 1 << 0,
+  FRAMEFLAGS_GOLDEN = 1 << 1,
+  FRAMEFLAGS_BWDREF = 1 << 2,
+  // TODO(zoeliu): To determine whether a frame flag is needed for ALTREF2_FRAME
+  FRAMEFLAGS_ALTREF = 1 << 3,
+  FRAMEFLAGS_INTRAONLY = 1 << 4,
+  FRAMEFLAGS_SWITCH = 1 << 5,
+  FRAMEFLAGS_ERROR_RESILIENT = 1 << 6,
+} UENUM1BYTE(FRAMETYPE_FLAGS);
+
+enum {
+  NO_AQ = 0,
+  VARIANCE_AQ = 1,
+  COMPLEXITY_AQ = 2,
+  CYCLIC_REFRESH_AQ = 3,
+  AQ_MODE_COUNT  // This should always be the last member of the enum
+} UENUM1BYTE(AQ_MODE);
+enum {
+  NO_DELTA_Q = 0,
+  DELTA_Q_OBJECTIVE = 1,   // Modulation to improve objective quality
+  DELTA_Q_PERCEPTUAL = 2,  // Modulation to improve perceptual quality
+  DELTA_Q_MODE_COUNT       // This should always be the last member of the enum
+} UENUM1BYTE(DELTAQ_MODE);
+
+enum {
+  RESIZE_NONE = 0,    // No frame resizing allowed.
+  RESIZE_FIXED = 1,   // All frames are coded at the specified scale.
+  RESIZE_RANDOM = 2,  // All frames are coded at a random scale.
+  RESIZE_MODES
+} UENUM1BYTE(RESIZE_MODE);
+
+enum {
+  SUPERRES_NONE,    // No frame superres allowed.
+  SUPERRES_FIXED,   // All frames are coded at the specified scale,
+                    // and super-resolved.
+  SUPERRES_RANDOM,   // All frames are coded at a random scale,
+                     // and super-resolved.
+  SUPERRES_QTHRESH,  // Superres scale for a frame is determined based on
+                     // q_index.
+  SUPERRES_AUTO,     // Automatically select superres for appropriate frames.
+  SUPERRES_MODES
+} UENUM1BYTE(SUPERRES_MODE);
+
+typedef enum {
+  kInvalid = 0,
+  kLowSad = 1,
+  kHighSad = 2,
+  kLowVarHighSumdiff = 3,
+} CONTENT_STATE_SB;
+
+enum {
+  SS_CFG_SRC = 0,
+  SS_CFG_LOOKAHEAD = 1,
+  SS_CFG_FPF = 2,
+  SS_CFG_TOTAL = 3
+} UENUM1BYTE(SS_CFG_OFFSET);
+
+// TODO(jingning): This needs to be cleaned up next.
+#define MAX_LENGTH_TPL_FRAME_STATS (MAX_TOTAL_BUFFERS + REF_FRAMES + 1)
+
+typedef struct TplDepStats {
+  int64_t intra_cost;
+  int64_t inter_cost;
+  int64_t srcrf_dist;
+  int64_t recrf_dist;
+  int64_t srcrf_rate;
+  int64_t recrf_rate;
+  int64_t mc_dep_rate;
+  int64_t mc_dep_dist;
+  int_mv mv[INTER_REFS_PER_FRAME];
+  int ref_frame_index;
+  int64_t pred_error[INTER_REFS_PER_FRAME];
+  int64_t mc_count;
+  int64_t mc_saved;
+} TplDepStats;
+
+typedef struct TplDepFrame {
+  uint8_t is_valid;
+  TplDepStats *tpl_stats_ptr;
+  const YV12_BUFFER_CONFIG *gf_picture;
+  YV12_BUFFER_CONFIG *rec_picture;
+  int ref_map_index[REF_FRAMES];
+  int stride;
+  int width;
+  int height;
+  int mi_rows;
+  int mi_cols;
+  unsigned int frame_display_index;
+  int base_rdmult;
+} TplDepFrame;
+
+typedef struct TplParams {
+  // Block granularity of tpl score storage.
+  uint8_t tpl_stats_block_mis_log2;
+
+  // Buffer to store the frame level tpl information for each frame in a gf
+  // group. tpl_stats_buffer[i] stores the tpl information of ith frame in a gf
+  // group
+  TplDepFrame tpl_stats_buffer[MAX_LENGTH_TPL_FRAME_STATS];
+
+  // Buffer to store tpl stats at block granularity.
+  // tpl_stats_pool[i][j] stores the tpl stats of jth block of ith frame in a gf
+  // group.
+  TplDepStats *tpl_stats_pool[MAX_LAG_BUFFERS];
+
+  // Buffer to store tpl reconstructed frame.
+  // tpl_rec_pool[i] stores the reconstructed frame of ith frame in a gf group.
+  YV12_BUFFER_CONFIG tpl_rec_pool[MAX_LAG_BUFFERS];
+
+  // Pointer to tpl_stats_buffer.
+  TplDepFrame *tpl_frame;
+} TplParams;
+
+typedef enum {
+  COST_UPD_SB,
+  COST_UPD_SBROW,
+  COST_UPD_TILE,
+  COST_UPD_OFF,
+} COST_UPDATE_TYPE;
+
+#define TPL_DEP_COST_SCALE_LOG2 4
+
+typedef struct AV1EncoderConfig {
+  BITSTREAM_PROFILE profile;
+  aom_bit_depth_t bit_depth;     // Codec bit-depth.
+  int width;                     // width of data passed to the compressor
+  int height;                    // height of data passed to the compressor
+  int forced_max_frame_width;    // forced maximum width of frame (if != 0)
+  int forced_max_frame_height;   // forced maximum height of frame (if != 0)
+  unsigned int input_bit_depth;  // Input bit depth.
+  double init_framerate;         // set to passed in framerate
+  int64_t target_bandwidth;      // bandwidth to be used in bits per second
+
+  int noise_sensitivity;  // pre-processing blur: recommendation 0
+  int sharpness;          // sharpening output: recommendation 0
+  int speed;
+  // maximum allowed bitrate for any intra frame in % of bitrate target.
+  unsigned int rc_max_intra_bitrate_pct;
+  // maximum allowed bitrate for any inter frame in % of bitrate target.
+  unsigned int rc_max_inter_bitrate_pct;
+  // percent of rate boost for golden frame in CBR mode.
+  unsigned int gf_cbr_boost_pct;
+
+  MODE mode;
+  int pass;
+
+  // Key Framing Operations
+  int auto_key;  // autodetect cut scenes and set the keyframes
+  int key_freq;  // maximum distance to key frame.
+ int sframe_dist; + int sframe_mode; + int sframe_enabled; + int lag_in_frames; // how many frames lag before we start encoding + int fwd_kf_enabled; + + // ---------------------------------------------------------------- + // DATARATE CONTROL OPTIONS + + // vbr, cbr, constrained quality or constant quality + enum aom_rc_mode rc_mode; + + // buffer targeting aggressiveness + int under_shoot_pct; + int over_shoot_pct; + + // buffering parameters + int64_t starting_buffer_level_ms; + int64_t optimal_buffer_level_ms; + int64_t maximum_buffer_size_ms; + + // Frame drop threshold. + int drop_frames_water_mark; + + // controlling quality + int fixed_q; + int worst_allowed_q; + int best_allowed_q; + int cq_level; + int enable_chroma_deltaq; + AQ_MODE aq_mode; // Adaptive Quantization mode + DELTAQ_MODE deltaq_mode; + int deltalf_mode; + int enable_cdef; + int enable_restoration; + int force_video_mode; + int enable_obmc; + int disable_trellis_quant; + int using_qm; + int qm_y; + int qm_u; + int qm_v; + int qm_minlevel; + int qm_maxlevel; + unsigned int num_tile_groups; + unsigned int mtu; + + // Internal frame size scaling. + RESIZE_MODE resize_mode; + uint8_t resize_scale_denominator; + uint8_t resize_kf_scale_denominator; + + // Frame Super-Resolution size scaling. + SUPERRES_MODE superres_mode; + uint8_t superres_scale_denominator; + uint8_t superres_kf_scale_denominator; + int superres_qthresh; + int superres_kf_qthresh; + + // Enable feature to reduce the frame quantization every x frames. + int frame_periodic_boost; + + // two pass datarate control + int two_pass_vbrbias; // two pass datarate control tweaks + int two_pass_vbrmin_section; + int two_pass_vbrmax_section; + // END DATARATE CONTROL OPTIONS + // ---------------------------------------------------------------- + + int enable_auto_arf; + int enable_auto_brf; // (b)ackward (r)ef (f)rame + + /* Bitfield defining the error resiliency features to enable. + * Can provide decodable frames after losses in previous + * frames and decodable partitions after losses in the same frame. + */ + unsigned int error_resilient_mode; + + unsigned int s_frame_mode; + + /* Bitfield defining the parallel decoding mode where the + * decoding in successive frames may be conducted in parallel + * just by decoding the frame headers. 
+ */ + unsigned int frame_parallel_decoding_mode; + + unsigned int limit; + + int arnr_max_frames; + int arnr_strength; + + int min_gf_interval; + int max_gf_interval; + int gf_min_pyr_height; + int gf_max_pyr_height; + + int row_mt; + int tile_columns; + int tile_rows; + int tile_width_count; + int tile_height_count; + int tile_widths[MAX_TILE_COLS]; + int tile_heights[MAX_TILE_ROWS]; + + int enable_tpl_model; + int enable_keyframe_filtering; + + int max_threads; + + aom_fixed_buf_t two_pass_stats_in; + + aom_tune_metric tuning; + const char *vmaf_model_path; + aom_tune_content content; + int use_highbitdepth; + aom_color_primaries_t color_primaries; + aom_transfer_characteristics_t transfer_characteristics; + aom_matrix_coefficients_t matrix_coefficients; + aom_chroma_sample_position_t chroma_sample_position; + int color_range; + int render_width; + int render_height; + int timing_info_present; + aom_timing_info_t timing_info; + int decoder_model_info_present_flag; + int display_model_info_present_flag; + int buffer_removal_time_present; + aom_dec_model_info_t buffer_model; + int film_grain_test_vector; + const char *film_grain_table_filename; + + uint8_t cdf_update_mode; + aom_superblock_size_t superblock_size; + unsigned int large_scale_tile; + unsigned int single_tile_decoding; + uint8_t monochrome; + unsigned int full_still_picture_hdr; + int enable_dual_filter; + unsigned int motion_vector_unit_test; + unsigned int sb_multipass_unit_test; + unsigned int ext_tile_debug; + int enable_rect_partitions; + int enable_ab_partitions; + int enable_1to4_partitions; + int min_partition_size; + int max_partition_size; + int enable_intra_edge_filter; + int enable_tx64; + int enable_flip_idtx; + int enable_order_hint; + int enable_dist_wtd_comp; + int enable_ref_frame_mvs; + unsigned int max_reference_frames; + int enable_reduced_reference_set; + unsigned int allow_ref_frame_mvs; + int enable_masked_comp; + int enable_onesided_comp; + int enable_interintra_comp; + int enable_smooth_interintra; + int enable_diff_wtd_comp; + int enable_interinter_wedge; + int enable_interintra_wedge; + int enable_global_motion; + int enable_warped_motion; + int allow_warped_motion; + int enable_filter_intra; + int enable_smooth_intra; + int enable_paeth_intra; + int enable_cfl_intra; + int enable_superres; + int enable_overlay; + int enable_palette; + int enable_intrabc; + int enable_angle_delta; + unsigned int save_as_annexb; + +#if CONFIG_DENOISE + float noise_level; + int noise_block_size; +#endif + + unsigned int chroma_subsampling_x; + unsigned int chroma_subsampling_y; + int reduced_tx_type_set; + int use_intra_dct_only; + int use_inter_dct_only; + int use_intra_default_tx_only; + int quant_b_adapt; + COST_UPDATE_TYPE coeff_cost_upd_freq; + COST_UPDATE_TYPE mode_cost_upd_freq; + COST_UPDATE_TYPE mv_cost_upd_freq; + int border_in_pixels; + AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS]; + // Bit mask to specify which tier each of the 32 possible operating points + // conforms to. + unsigned int tier_mask; + // If true, encoder will use fixed QP offsets, that are either: + // - Given by the user, and stored in 'fixed_qp_offsets' array, OR + // - Picked automatically from cq_level. + int use_fixed_qp_offsets; + // List of QP offsets for: keyframe, ALTREF, and 3 levels of internal ARFs. + // If any of these values are negative, fixed offsets are disabled. + // Uses internal q range. 
+ double fixed_qp_offsets[FIXED_QP_OFFSET_COUNT]; + // min_cr / 100 is the target minimum compression ratio for each frame. + unsigned int min_cr; + const cfg_options_t *encoder_cfg; +} AV1EncoderConfig; + +static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) { + return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0; +} + +typedef struct { + // obmc_probs[i][j] is the probability of OBMC being the best motion mode for + // jth block size and ith frame update type, averaged over past frames. If + // obmc_probs[i][j] < thresh, then OBMC search is pruned. + int obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL]; + + // warped_probs[i] is the probability of warped motion being the best motion + // mode for ith frame update type, averaged over past frames. If + // warped_probs[i] < thresh, then warped motion search is pruned. + int warped_probs[FRAME_UPDATE_TYPES]; + + // tx_type_probs[i][j][k] is the probability of kth tx_type being the best + // for jth transform size and ith frame update type, averaged over past + // frames. If tx_type_probs[i][j][k] < thresh, then transform search for that + // type is pruned. + int tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES]; + + // switchable_interp_probs[i][j][k] is the probability of kth interpolation + // filter being the best for jth filter context and ith frame update type, + // averaged over past frames. If switchable_interp_probs[i][j][k] < thresh, + // then interpolation filter search is pruned for that case. + int switchable_interp_probs[FRAME_UPDATE_TYPES][SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS]; +} FrameProbInfo; + +typedef struct FRAME_COUNTS { +// Note: This structure should only contain 'unsigned int' fields, or +// aggregates built solely from 'unsigned int' fields/elements +#if CONFIG_ENTROPY_STATS + unsigned int kf_y_mode[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][INTRA_MODES]; + unsigned int angle_delta[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1]; + unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; + unsigned int uv_mode[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES]; + unsigned int cfl_sign[CFL_JOINT_SIGNS]; + unsigned int cfl_alpha[CFL_ALPHA_CONTEXTS][CFL_ALPHABET_SIZE]; + unsigned int palette_y_mode[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2]; + unsigned int palette_uv_mode[PALETTE_UV_MODE_CONTEXTS][2]; + unsigned int palette_y_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + unsigned int palette_uv_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + unsigned int palette_y_color_index[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + unsigned int palette_uv_color_index[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES]; + unsigned int txb_skip[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS][2]; + unsigned int eob_extra[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [EOB_COEF_CONTEXTS][2]; + unsigned int dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS][2]; + unsigned int coeff_lps[TX_SIZES][PLANE_TYPES][BR_CDF_SIZE - 1][LEVEL_CONTEXTS] + [2]; + unsigned int eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS][2]; + unsigned int eob_multi16[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][5]; + unsigned int eob_multi32[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][6]; + unsigned int eob_multi64[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][7]; + unsigned int eob_multi128[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][8]; + unsigned int eob_multi256[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][9]; + unsigned int eob_multi512[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][10]; + unsigned int 
eob_multi1024[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][11];
+  unsigned int coeff_lps_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+                              [LEVEL_CONTEXTS][BR_CDF_SIZE];
+  unsigned int coeff_base_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+                               [SIG_COEF_CONTEXTS][NUM_BASE_LEVELS + 2];
+  unsigned int coeff_base_eob_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+                                   [SIG_COEF_CONTEXTS_EOB][NUM_BASE_LEVELS + 1];
+  unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2];
+  unsigned int zeromv_mode[GLOBALMV_MODE_CONTEXTS][2];
+  unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2];
+  unsigned int drl_mode[DRL_MODE_CONTEXTS][2];
+  unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+  unsigned int wedge_idx[BLOCK_SIZES_ALL][16];
+  unsigned int interintra[BLOCK_SIZE_GROUPS][2];
+  unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+  unsigned int wedge_interintra[BLOCK_SIZES_ALL][2];
+  unsigned int compound_type[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES];
+  unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES];
+  unsigned int obmc[BLOCK_SIZES_ALL][2];
+  unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
+  unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
+  unsigned int comp_ref_type[COMP_REF_TYPE_CONTEXTS][2];
+  unsigned int uni_comp_ref[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1][2];
+  unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2];
+  unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2];
+  unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2];
+  unsigned int intrabc[2];
+
+  unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
+  unsigned int intra_tx_size[MAX_TX_CATS][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1];
+  unsigned int skip_mode[SKIP_MODE_CONTEXTS][2];
+  unsigned int skip[SKIP_CONTEXTS][2];
+  unsigned int compound_index[COMP_INDEX_CONTEXTS][2];
+  unsigned int comp_group_idx[COMP_GROUP_IDX_CONTEXTS][2];
+  unsigned int delta_q[DELTA_Q_PROBS][2];
+  unsigned int delta_lf_multi[FRAME_LF_COUNT][DELTA_LF_PROBS][2];
+  unsigned int delta_lf[DELTA_LF_PROBS][2];
+
+  unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+  unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+                           [TX_TYPES];
+  unsigned int filter_intra_mode[FILTER_INTRA_MODES];
+  unsigned int filter_intra[BLOCK_SIZES_ALL][2];
+  unsigned int switchable_restore[RESTORE_SWITCHABLE_TYPES];
+  unsigned int wiener_restore[2];
+  unsigned int sgrproj_restore[2];
+#endif  // CONFIG_ENTROPY_STATS
+
+  unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
+                                [SWITCHABLE_FILTERS];
+} FRAME_COUNTS;
+
+#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
+
+typedef struct {
+  int ready;
+  double a;
+  double b;
+  double dist_mean;
+  double ld_mean;
+  double sse_mean;
+  double sse_sse_mean;
+  double sse_ld_mean;
+  int num;
+  double dist_sum;
+  double ld_sum;
+  double sse_sum;
+  double sse_sse_sum;
+  double sse_ld_sum;
+} InterModeRdModel;
+
+typedef struct {
+  int idx;
+  int64_t rd;
+} RdIdxPair;
+// TODO(angiebird): This is an estimated size. We still need to figure out the
+// maximum number of modes.
+#define MAX_INTER_MODES 1024 +typedef struct inter_modes_info { + int num; + MB_MODE_INFO mbmi_arr[MAX_INTER_MODES]; + int mode_rate_arr[MAX_INTER_MODES]; + int64_t sse_arr[MAX_INTER_MODES]; + int64_t est_rd_arr[MAX_INTER_MODES]; + RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES]; + RD_STATS rd_cost_arr[MAX_INTER_MODES]; + RD_STATS rd_cost_y_arr[MAX_INTER_MODES]; + RD_STATS rd_cost_uv_arr[MAX_INTER_MODES]; +} InterModesInfo; + +// Encoder row synchronization +typedef struct AV1RowMTSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_; + pthread_cond_t *cond_; +#endif + // Allocate memory to store the sb/mb block index in each row. + int *cur_col; + int sync_range; + int rows; +} AV1RowMTSync; + +typedef struct AV1RowMTInfo { + int current_mi_row; + int num_threads_working; +} AV1RowMTInfo; + +typedef struct { + // TODO(kyslov): consider changing to 64bit + + // This struct is used for computing variance in choose_partitioning(), where + // the max number of samples within a superblock is 32x32 (with 4x4 avg). + // With 8bit bitdepth, uint32_t is enough for sum_square_error (2^8 * 2^8 * 32 + // * 32 = 2^26). For high bitdepth we need to consider changing this to 64 bit + uint32_t sum_square_error; + int32_t sum_error; + int log2_count; + int variance; +} VPartVar; + +typedef struct { + VPartVar none; + VPartVar horz[2]; + VPartVar vert[2]; +} VPVariance; + +typedef struct { + VPVariance part_variances; + VPartVar split[4]; +} VP4x4; + +typedef struct { + VPVariance part_variances; + VP4x4 split[4]; +} VP8x8; + +typedef struct { + VPVariance part_variances; + VP8x8 split[4]; +} VP16x16; + +typedef struct { + VPVariance part_variances; + VP16x16 split[4]; +} VP32x32; + +typedef struct { + VPVariance part_variances; + VP32x32 split[4]; +} VP64x64; + +typedef struct { + VPVariance part_variances; + VP64x64 *split; +} VP128x128; + +typedef struct { + // Thresholds for variance based partitioning. If block variance > threshold, + // then that block is forced to split. + // thresholds[0] - threshold for 128x128; + // thresholds[1] - threshold for 64x64; + // thresholds[2] - threshold for 32x32; + // thresholds[3] - threshold for 16x16; + // thresholds[4] - threshold for 8x8; + int64_t thresholds[5]; + + // MinMax variance threshold for 8x8 sub blocks of a 16x16 block. If actual + // minmax > threshold_minmax, the 16x16 is forced to split. + int64_t threshold_minmax; +} VarBasedPartitionInfo; + +// TODO(jingning) All spatially adaptive variables should go to TileDataEnc. +typedef struct TileDataEnc { + TileInfo tile_info; + CFL_CTX cfl; + DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); + FRAME_CONTEXT *row_ctx; + uint8_t allow_update_cdf; + InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; + AV1RowMTSync row_mt_sync; + AV1RowMTInfo row_mt_info; +} TileDataEnc; + +typedef struct { + TOKENEXTRA *start; + TOKENEXTRA *stop; + unsigned int count; +} TOKENLIST; + +typedef struct MultiThreadHandle { + int allocated_tile_rows; + int allocated_tile_cols; + int allocated_sb_rows; + int thread_id_to_tile_id[MAX_NUM_THREADS]; // Mapping of threads to tiles +} MultiThreadHandle; + +typedef struct RD_COUNTS { + int64_t comp_pred_diff[REFERENCE_MODES]; + // Stores number of 4x4 blocks using global motion per reference frame. 
+ int global_motion_used[REF_FRAMES]; + int compound_ref_used_flag; + int skip_mode_used_flag; + int tx_type_used[TX_SIZES_ALL][TX_TYPES]; + int obmc_used[BLOCK_SIZES_ALL][2]; + int warped_used[2]; +} RD_COUNTS; + +typedef struct ThreadData { + MACROBLOCK mb; + RD_COUNTS rd_counts; + FRAME_COUNTS *counts; + PC_TREE *pc_tree; + PC_TREE *pc_root; + tran_low_t *tree_coeff_buf[MAX_MB_PLANE]; + tran_low_t *tree_qcoeff_buf[MAX_MB_PLANE]; + tran_low_t *tree_dqcoeff_buf[MAX_MB_PLANE]; + InterModesInfo *inter_modes_info; + uint32_t *hash_value_buffer[2][2]; + int32_t *wsrc_buf; + int32_t *mask_buf; + uint8_t *above_pred_buf; + uint8_t *left_pred_buf; + PALETTE_BUFFER *palette_buffer; + CompoundTypeRdBuffers comp_rd_buffer; + CONV_BUF_TYPE *tmp_conv_dst; + uint8_t *tmp_obmc_bufs[2]; + int intrabc_used; + int deltaq_used; + FRAME_CONTEXT *tctx; + MB_MODE_INFO_EXT *mbmi_ext; + VP64x64 *vt64x64; + int32_t num_64x64_blocks; +} ThreadData; + +struct EncWorkerData; + +typedef struct ActiveMap { + int enabled; + int update; + unsigned char *map; +} ActiveMap; + +typedef struct { + // cs_rate_array[i] is the fraction of blocks in a frame which either match + // with the collocated block or are smooth, where i is the rate_index. + double cs_rate_array[32]; + // rate_index is used to index cs_rate_array. + int rate_index; + // rate_size is the total number of entries populated in cs_rate_array. + int rate_size; +} ForceIntegerMVInfo; + +#if CONFIG_INTERNAL_STATS +// types of stats +enum { + STAT_Y, + STAT_U, + STAT_V, + STAT_ALL, + NUM_STAT_TYPES // This should always be the last member of the enum +} UENUM1BYTE(StatType); + +typedef struct IMAGE_STAT { + double stat[NUM_STAT_TYPES]; + double worst; +} ImageStat; +#endif // CONFIG_INTERNAL_STATS + +typedef struct { + int ref_count; + YV12_BUFFER_CONFIG buf; +} EncRefCntBuffer; + +typedef struct { + // Buffer to store mode information at mi_alloc_bsize (4x4 or 8x8) level for + // use in bitstream preparation. frame_base[mi_row * stride + mi_col] stores + // the mode information of block (mi_row,mi_col). + MB_MODE_INFO_EXT_FRAME *frame_base; + // Size of frame_base buffer. + int alloc_size; + // Stride of frame_base buffer. + int stride; +} MBMIExtFrameBufferInfo; + +#if CONFIG_COLLECT_PARTITION_STATS == 2 +typedef struct PartitionStats { + int partition_decisions[6][EXT_PARTITION_TYPES]; + int partition_attempts[6][EXT_PARTITION_TYPES]; + int64_t partition_times[6][EXT_PARTITION_TYPES]; + + int partition_redo; +} PartitionStats; +#endif + +#if CONFIG_COLLECT_COMPONENT_TIMING +#include "aom_ports/aom_timer.h" +// Adjust the following to add new components. 
+enum {
+  encode_frame_to_data_rate_time,
+  encode_with_recode_loop_time,
+  loop_filter_time,
+  cdef_time,
+  loop_restoration_time,
+  av1_pack_bitstream_final_time,
+  av1_encode_frame_time,
+  av1_compute_global_motion_time,
+  av1_setup_motion_field_time,
+  encode_sb_time,
+  rd_pick_partition_time,
+  rd_pick_sb_modes_time,
+  av1_rd_pick_intra_mode_sb_time,
+  av1_rd_pick_inter_mode_sb_time,
+  handle_intra_mode_time,
+  do_tx_search_time,
+  handle_newmv_time,
+  compound_type_rd_time,
+  interpolation_filter_search_time,
+  motion_mode_rd_time,
+  kTimingComponents,
+} UENUM1BYTE(TIMING_COMPONENT);
+
+static INLINE char const *get_component_name(int index) {
+  switch (index) {
+    case encode_frame_to_data_rate_time:
+      return "encode_frame_to_data_rate_time";
+    case encode_with_recode_loop_time: return "encode_with_recode_loop_time";
+    case loop_filter_time: return "loop_filter_time";
+    case cdef_time: return "cdef_time";
+    case loop_restoration_time: return "loop_restoration_time";
+    case av1_pack_bitstream_final_time: return "av1_pack_bitstream_final_time";
+    case av1_encode_frame_time: return "av1_encode_frame_time";
+    case av1_compute_global_motion_time:
+      return "av1_compute_global_motion_time";
+    case av1_setup_motion_field_time: return "av1_setup_motion_field_time";
+    case encode_sb_time: return "encode_sb_time";
+    case rd_pick_partition_time: return "rd_pick_partition_time";
+    case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time";
+    case av1_rd_pick_intra_mode_sb_time:
+      return "av1_rd_pick_intra_mode_sb_time";
+    case av1_rd_pick_inter_mode_sb_time:
+      return "av1_rd_pick_inter_mode_sb_time";
+    case handle_intra_mode_time: return "handle_intra_mode_time";
+    case do_tx_search_time: return "do_tx_search_time";
+    case handle_newmv_time: return "handle_newmv_time";
+    case compound_type_rd_time: return "compound_type_rd_time";
+    case interpolation_filter_search_time:
+      return "interpolation_filter_search_time";
+    case motion_mode_rd_time: return "motion_mode_rd_time";
+    default: assert(0);
+  }
+  return "error";
+}
+#endif
+
+// The maximum number of internal ARFs except ALTREF_FRAME
+#define MAX_INTERNAL_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
+
+typedef struct {
+  // Array to store the cost for signalling each global motion model.
+  // type_cost[i] stores the cost of signalling the ith Global Motion model.
+  int type_cost[TRANS_TYPES];
+
+  // Array to store the cost for signalling a particular global motion model
+  // for each reference frame. params_cost[i] stores the cost of signalling
+  // global motion for the ith reference frame.
+  int params_cost[REF_FRAMES];
+
+  // Flag to indicate if global motion search needs to be rerun.
+  bool search_done;
+} GlobalMotionInfo;
+
+typedef struct {
+  // Stores the default value of skip flag depending on chroma format
+  // Set as 1 for monochrome and 3 for other color formats
+  int default_interp_skip_flags;
+  // Filter mask to allow certain interp_filter type.
+  uint16_t interp_filter_search_mask;
+} InterpSearchFlags;
+
+typedef struct {
+  // Largest MV component used in a frame.
+  // The value from the previous frame is used to set the full pixel search
+  // range for the current frame.
+  int max_mv_magnitude;
+  // Parameter indicating initial search window to be used in full-pixel search.
+  // Range [0, MAX_MVSEARCH_STEPS-2]. Lower value indicates larger window.
+  int mv_step_param;
+  // Pointer to sub-pixel search function.
+  // In encoder: av1_find_best_sub_pixel_tree
+  //             av1_find_best_sub_pixel_tree_pruned
+  //             av1_find_best_sub_pixel_tree_pruned_more
+  //             av1_find_best_sub_pixel_tree_pruned_evenmore
+  // In MV unit test: av1_return_max_sub_pixel_mv
+  //                  av1_return_min_sub_pixel_mv
+  fractional_mv_step_fp *find_fractional_mv_step;
+  // Search site configuration for full-pel MV search.
+  // ss_cfg[SS_CFG_SRC]: Used in tpl, rd/non-rd inter mode loop, simple motion
+  // search.
+  // ss_cfg[SS_CFG_LOOKAHEAD]: Used in intraBC, temporal filter
+  // ss_cfg[SS_CFG_FPF]: Used during first pass and lookahead
+  search_site_config ss_cfg[SS_CFG_TOTAL];
+} MotionVectorSearchParams;
+
+typedef struct {
+  // When resize is triggered externally, the desired dimensions are stored in
+  // this struct until used in the next frame to be coded. These values are
+  // effective only for one frame and are reset after they are used.
+  int width;
+  int height;
+} ResizePendingParams;
+
+typedef struct {
+  // Threshold of transform domain distortion
+  // Index 0: Default mode evaluation, Winner mode processing is not applicable
+  // (Eg : IntraBc).
+  // Index 1: Mode evaluation.
+  // Index 2: Winner mode evaluation.
+  // Index 1 and 2 are applicable when enable_winner_mode_for_use_tx_domain_dist
+  // speed feature is ON
+  unsigned int tx_domain_dist_threshold[MODE_EVAL_TYPES];
+
+  // Factor to control R-D optimization of coeffs based on block
+  // mse.
+  // Index 0: Default mode evaluation, Winner mode processing is not applicable
+  // (Eg : IntraBc). Index 1: Mode evaluation.
+  // Index 2: Winner mode evaluation
+  // Index 1 and 2 are applicable when enable_winner_mode_for_coeff_opt speed
+  // feature is ON
+  unsigned int coeff_opt_dist_threshold[MODE_EVAL_TYPES];
+
+  // Transform size to be used in transform search
+  // Index 0: Default mode evaluation, Winner mode processing is not applicable
+  // (Eg : IntraBc).
+  // Index 1: Mode evaluation. Index 2: Winner mode evaluation
+  // Index 1 and 2 are applicable when enable_winner_mode_for_tx_size_srch speed
+  // feature is ON
+  TX_SIZE_SEARCH_METHOD tx_size_search_methods[MODE_EVAL_TYPES];
+
+  // Transform domain distortion levels
+  // Index 0: Default mode evaluation, Winner mode processing is not applicable
+  // (Eg : IntraBc).
+  // Index 1: Mode evaluation. Index 2: Winner mode evaluation
+  // Index 1 and 2 are applicable when enable_winner_mode_for_use_tx_domain_dist
+  // speed feature is ON
+  unsigned int use_transform_domain_distortion[MODE_EVAL_TYPES];
+
+  // Predict transform skip levels to be used for default, mode and winner mode
+  // evaluation. Index 0: Default mode evaluation, Winner mode processing is not
+  // applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation
+  unsigned int predict_skip_level[MODE_EVAL_TYPES];
+} WinnerModeParams;
+
+typedef struct {
+  // Bit mask to disable certain reference frame types.
+  int ref_frame_flags;
+
+  // Flags to determine which reference buffers are refreshed by this frame.
+  // When set, the encoder will update the particular reference frame buffer
+  // with the contents of the current frame.
+  bool refresh_last_frame;
+  bool refresh_golden_frame;
+  bool refresh_bwd_ref_frame;
+  bool refresh_alt2_ref_frame;
+  bool refresh_alt_ref_frame;
+
+  // Flag to indicate that an update of the refresh frame flags from the
+  // external interface is pending.
+  bool refresh_frame_flags_pending;
+
+  // Flag to enable updating of frame contexts at the end of a frame decode.
+  bool refresh_frame_context;
+
+  // Flag to indicate that an update of refresh_frame_context from the external
+  // interface is pending.
+  bool refresh_frame_context_pending;
+
+  // Flag to enable temporal MV prediction.
+  bool use_ref_frame_mvs;
+
+  // Flag to code the frame as error-resilient.
+  bool use_error_resilient;
+
+  // Flag to code the frame as an s-frame.
+  bool use_s_frame;
+
+  // Flag to set the frame's primary_ref_frame to PRIMARY_REF_NONE.
+  bool use_primary_ref_none;
+} ExternalFlags;
+
+typedef struct {
+  int arf_stack[FRAME_BUFFERS];
+  int arf_stack_size;
+  int lst_stack[FRAME_BUFFERS];
+  int lst_stack_size;
+  int gld_stack[FRAME_BUFFERS];
+  int gld_stack_size;
+} RefBufferStack;
+
+typedef struct {
+  // Some misc info
+  int high_prec;
+  int q;
+  int order;
+
+  // MV counters
+  int inter_count;
+  int intra_count;
+  int default_mvs;
+  int mv_joint_count[4];
+  int last_bit_zero;
+  int last_bit_nonzero;
+
+  // Keep track of the rates
+  int total_mv_rate;
+  int hp_total_mv_rate;
+  int lp_total_mv_rate;
+
+  // Texture info
+  int horz_text;
+  int vert_text;
+  int diag_text;
+
+  // Whether the current struct contains valid data
+  int valid;
+} MV_STATS;
+
+typedef struct {
+  int frame_width;
+  int frame_height;
+  int mi_rows;
+  int mi_cols;
+  int mb_rows;
+  int mb_cols;
+  int num_mbs;
+  aom_bit_depth_t bit_depth;
+  int subsampling_x;
+  int subsampling_y;
+} FRAME_INFO;
+
+typedef struct {
+  // 3-bit number containing the segment affiliation for each 4x4 block in the
+  // frame. map[y * stride + x] contains the segment id of the 4x4 block at
+  // (x,y) position.
+  uint8_t *map;
+  // Flag to indicate if current frame has lossless segments or not.
+  // 1: frame has at least one lossless segment.
+  // 0: frame has no lossless segments.
+  bool has_lossless_segment;
+} EncSegmentationInfo;
+
+typedef struct {
+  // Start time stamp of the previous frame
+  int64_t prev_start_seen;
+  // End time stamp of the previous frame
+  int64_t prev_end_seen;
+  // Start time stamp of the first frame
+  int64_t first_ever;
+} TimeStamps;
+
+typedef struct AV1_COMP {
+  // Quantization and dequantization parameters for internal quantizer setup
+  // in the encoder.
+  EncQuantDequantParams enc_quant_dequant_params;
+  ThreadData td;
+  FRAME_COUNTS counts;
+
+  // Holds buffer storing mode information at 4x4/8x8 level.
+  MBMIExtFrameBufferInfo mbmi_ext_info;
+
+  CB_COEFF_BUFFER *coeff_buffer_base;
+  AV1_COMMON common;
+  AV1EncoderConfig oxcf;
+  struct lookahead_ctx *lookahead;
+  int no_show_kf;
+
+  TRELLIS_OPT_TYPE optimize_seg_arr[MAX_SEGMENTS];
+
+  YV12_BUFFER_CONFIG *source;
+  YV12_BUFFER_CONFIG *last_source;  // NULL for first frame and alt_ref frames
+  YV12_BUFFER_CONFIG *unscaled_source;
+  YV12_BUFFER_CONFIG scaled_source;
+  YV12_BUFFER_CONFIG *unscaled_last_source;
+  YV12_BUFFER_CONFIG scaled_last_source;
+  YV12_BUFFER_CONFIG *unfiltered_source;
+
+  TplParams tpl_data;
+
+  // For a still frame, this flag is set to 1 to skip partition search.
+  int partition_search_skippable_frame;
+
+  // Variables related to forcing integer mv decisions for the current frame.
+  ForceIntegerMVInfo force_intpel_info;
+
+  unsigned int row_mt;
+  RefCntBuffer *scaled_ref_buf[INTER_REFS_PER_FRAME];
+
+  RefCntBuffer *last_show_frame_buf;  // last show frame buffer
+
+  // refresh_*_frame are boolean flags. If 'refresh_xyz_frame' is true, then
+  // after the current frame is encoded, the XYZ reference frame gets refreshed
+  // (updated) to be the current frame.
+  //
+  // Note: Usually at most one of these refresh flags is true at a time.
+  // But a key-frame is special, for which all the flags are true at once.
+  int refresh_golden_frame;
+  int refresh_bwd_ref_frame;
+  int refresh_alt_ref_frame;
+
+  // For each type of reference frame, this contains the index of a reference
+  // frame buffer for a reference frame of the same type. We use this to
+  // choose our primary reference frame (which is the most recent reference
+  // frame of the same type as the current frame).
+  int fb_of_context_type[REF_FRAMES];
+
+  // Flags signalled by the external interface at frame level.
+  ExternalFlags ext_flags;
+
+  YV12_BUFFER_CONFIG last_frame_uf;
+  YV12_BUFFER_CONFIG trial_frame_rst;
+
+  // Ambient reconstruction error target for forced key frames
+  int64_t ambient_err;
+
+  RD_OPT rd;
+
+  CODING_CONTEXT coding_context;
+
+  // Parameters related to global motion search.
+  GlobalMotionInfo gm_info;
+
+  // Parameters related to winner mode processing.
+  WinnerModeParams winner_mode_params;
+
+  // Frame time stamps
+  TimeStamps time_stamps;
+
+  RATE_CONTROL rc;
+  double framerate;
+
+  struct aom_codec_pkt_list *output_pkt_list;
+
+  int ref_frame_flags;
+
+  // speed is passed as a per-frame parameter into the encoder
+  int speed;
+  // sf contains fine-grained config set internally based on speed
+  SPEED_FEATURES sf;
+
+  // Parameters for motion vector search process.
+  MotionVectorSearchParams mv_search_params;
+
+  int all_one_sided_refs;
+
+  // Segmentation related information for current frame.
+  EncSegmentationInfo enc_seg;
+
+  CYCLIC_REFRESH *cyclic_refresh;
+  ActiveMap active_map;
+
+  aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
+
+#if CONFIG_INTERNAL_STATS
+  uint64_t time_receive_data;
+  uint64_t time_compress_data;
+#endif
+
+  // number of show frames encoded in current gf_group
+  int num_gf_group_show_frames;
+
+  TWO_PASS twopass;
+
+  GF_GROUP gf_group;
+
+  // To control the reference frame buffer and selection.
+  RefBufferStack ref_buffer_stack;
+
+  YV12_BUFFER_CONFIG alt_ref_buffer;
+
+  // Tells whether the OVERLAY frame shows an existing alt_ref frame.
+  int show_existing_alt_ref;
+
+#if CONFIG_INTERNAL_STATS
+  unsigned int mode_chosen_counts[MAX_MODES];
+
+  int count;
+  uint64_t total_sq_error;
+  uint64_t total_samples;
+  ImageStat psnr;
+
+  double total_blockiness;
+  double worst_blockiness;
+
+  int bytes;
+  double summed_quality;
+  double summed_weights;
+  unsigned int tot_recode_hits;
+  double worst_ssim;
+
+  ImageStat fastssim;
+  ImageStat psnrhvs;
+
+  int b_calculate_blockiness;
+  int b_calculate_consistency;
+
+  double total_inconsistency;
+  double worst_consistency;
+  Ssimv *ssim_vars;
+  Metrics metrics;
+#endif
+  int b_calculate_psnr;
+#if CONFIG_SPEED_STATS
+  unsigned int tx_search_count;
+#endif  // CONFIG_SPEED_STATS
+
+  int droppable;
+
+  FRAME_INFO frame_info;
+
+  int initial_width;
+  int initial_height;
+  int initial_mbs;  // Number of MBs in the full-size frame; to be used to
+                    // normalize the firstpass stats. This will differ from the
+                    // number of MBs in the current frame when the frame is
+                    // scaled.
+  // Resize related parameters
+  ResizePendingParams resize_pending_params;
+
+  TileDataEnc *tile_data;
+  int allocated_tiles;  // Keep track of memory allocated for tiles.
+
+  TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
+  TOKENLIST *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
+
+  // Whether sequence parameters have already been transmitted and locked.
+  // Once locked, av1_change_config cannot change the seq
+  // parameters.
+  int seq_params_locked;
+
+  // VARIANCE_AQ segment map refresh
+  int vaq_refresh;
+
+  // Thresholds for variance based partitioning.
+  VarBasedPartitionInfo vbp_info;
+
+  // Probabilities for pruning of various AV1 tools.
+  FrameProbInfo frame_probs;
+
+  // Multi-threading
+  int num_workers;
+  AVxWorker *workers;
+  struct EncWorkerData *tile_thr_data;
+  int existing_fb_idx_to_show;
+  int internal_altref_allowed;
+  // A flag to indicate if intrabc is ever used in current frame.
+  int intrabc_used;
+
+  // Tables to calculate IntraBC MV cost.
+  IntraBCMVCosts dv_costs;
+
+  // Mark which ref frames can be skipped for encoding current frame during RDO.
+  int prune_ref_frame_mask;
+
+  AV1LfSync lf_row_sync;
+  AV1LrSync lr_row_sync;
+  AV1LrStruct lr_ctxt;
+
+  aom_film_grain_table_t *film_grain_table;
+#if CONFIG_DENOISE
+  struct aom_denoise_and_model_t *denoise_and_model;
+#endif
+
+  // Flags related to interpolation filter search.
+  InterpSearchFlags interp_search_flags;
+
+  MultiThreadHandle multi_thread_ctxt;
+  void (*row_mt_sync_read_ptr)(AV1RowMTSync *const, int, int);
+  void (*row_mt_sync_write_ptr)(AV1RowMTSync *const, int, int, const int);
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *row_mt_mutex_;
+#endif
+  // Set if screen content is detected or relevant tools are enabled
+  int is_screen_content_type;
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+  PartitionStats partition_stats;
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  // component_time[] are initialized to zero when the encoder starts.
+  uint64_t component_time[kTimingComponents];
+  struct aom_usec_timer component_timer[kTimingComponents];
+  // frame_component_time[] are initialized to zero at the beginning of each
+  // frame.
+  uint64_t frame_component_time[kTimingComponents];
+#endif
+
+  // Parameters for AV1 bitstream levels.
+  AV1LevelParams level_params;
+
+  // whether any non-zero delta_q was actually used
+  int deltaq_used;
+
+  // Indicates the true relative distance of ref frame w.r.t. current frame
+  int ref_relative_dist[INTER_REFS_PER_FRAME];
+
+  // Indicates the nearest references w.r.t. current frame in past and future
+  int8_t nearest_past_ref;
+  int8_t nearest_future_ref;
+
+  // TODO(sdeng): consider merging the following arrays.
+  double *tpl_rdmult_scaling_factors;
+  double *tpl_sb_rdmult_scaling_factors;
+  double *ssim_rdmult_scaling_factors;
+
+#if CONFIG_TUNE_VMAF
+  double *vmaf_rdmult_scaling_factors;
+  double last_frame_ysse;
+  double last_frame_vmaf;
+  double last_frame_unsharp_amount;
+#endif
+
+  int use_svc;
+  SVC svc;
+
+  int lap_enabled;
+  COMPRESSOR_STAGE compressor_stage;
+
+  // Some motion vector stats from the last encoded frame to help us decide what
+  // precision to use to encode the current frame.
+  MV_STATS mv_stats;
+
+  // Frame type of the last frame. May be used in some heuristics for speeding
+  // up the encoding.
+  FRAME_TYPE last_frame_type;
+  int num_tg;
+
+  // Super-resolution mode currently being used by the encoder.
+  // This may / may not be same as user-supplied mode in oxcf->superres_mode
+  // (when we are recoding to try multiple options for example).
+  SUPERRES_MODE superres_mode;
+} AV1_COMP;
+
+typedef struct {
+  YV12_BUFFER_CONFIG *source;
+  YV12_BUFFER_CONFIG *last_source;
+  int64_t ts_duration;
+} EncodeFrameInput;
+
+// EncodeFrameParams contains per-frame encoding parameters decided upon by
+// av1_encode_strategy() and passed down to av1_encode()
+struct EncodeFrameParams {
+  int error_resilient_mode;
+  FRAME_TYPE frame_type;
+  int primary_ref_frame;
+  int order_offset;
+  int show_frame;
+  int refresh_frame_flags;
+
+  int show_existing_frame;
+  int existing_fb_idx_to_show;
+
+  // Bitmask of which reference buffers may be referenced by this frame
+  int ref_frame_flags;
+
+  // Reference buffer assignment for this frame.
+  int remapped_ref_idx[REF_FRAMES];
+
+  // Flags which determine which reference buffers are refreshed by this frame
+  int refresh_golden_frame;
+  int refresh_bwd_ref_frame;
+  int refresh_alt_ref_frame;
+
+  // Speed level to use for this frame: a bigger number means faster.
+  int speed;
+};
+typedef struct EncodeFrameParams EncodeFrameParams;
+
+// EncodeFrameResults contains information about the result of encoding a
+// single frame
+typedef struct {
+  size_t size;  // Size of resulting bitstream
+} EncodeFrameResults;
+
+// Must not be called more than once.
+void av1_initialize_enc(void);
+
+struct AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
+                                       BufferPool *const pool,
+                                       FIRSTPASS_STATS *frame_stats_buf,
+                                       COMPRESSOR_STAGE stage,
+                                       int num_lap_buffers,
+                                       int lap_lag_in_frames,
+                                       STATS_BUFFER_CTX *stats_buf_context);
+void av1_remove_compressor(AV1_COMP *cpi);
+
+void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf);
+
+void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
+                             int subsampling_x, int subsampling_y);
+
+// Receives a frame's worth of data. The caller can assume that a copy of this
+// frame is made and not just a copy of the pointer.
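+// An illustrative, non-normative call sequence follows; 'raw', 'pts', 'dur',
+// 'buf', 'buf_size' and 'timebase' are hypothetical caller-side variables,
+// and a zero return is assumed here to mean success:
+//   av1_receive_raw_frame(cpi, 0, &raw, pts, pts + dur);
+//   int64_t t0, t1;
+//   unsigned int fflags = 0;
+//   size_t sz = buf_size;
+//   if (av1_get_compressed_data(cpi, &fflags, &sz, buf, &t0, &t1,
+//                               /*flush=*/0, timebase) == 0) {
+//     // 'sz' now holds the coded size in bytes; 'buf' holds the bitstream.
+//   }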
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                          int64_t end_time_stamp);
+
+int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
+                            size_t *size, uint8_t *dest, int64_t *time_stamp,
+                            int64_t *time_end, int flush,
+                            const aom_rational64_t *timebase);
+
+int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
+               const EncodeFrameInput *const frame_input,
+               const EncodeFrameParams *const frame_params,
+               EncodeFrameResults *const frame_results);
+
+int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest);
+
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame);
+
+aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
+                                       YV12_BUFFER_CONFIG *new_frame,
+                                       YV12_BUFFER_CONFIG *sd);
+
+int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags);
+
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+
+int av1_set_size_literal(AV1_COMP *cpi, int width, int height);
+
+void av1_set_frame_size(AV1_COMP *cpi, int width, int height);
+
+int av1_update_entropy(bool *ext_refresh_frame_context,
+                       bool *ext_refresh_frame_context_pending, bool update);
+
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_set_internal_size(AV1EncoderConfig *const oxcf,
+                          ResizePendingParams *resize_pending_params,
+                          AOM_SCALING horiz_mode, AOM_SCALING vert_mode);
+
+int av1_get_quantizer(struct AV1_COMP *cpi);
+
+int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);
+
+void av1_alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
+                                        CompoundTypeRdBuffers *const bufs);
+void av1_release_compound_type_rd_buffers(CompoundTypeRdBuffers *const bufs);
+
+// Set screen content options.
+// This function estimates whether to use screen content tools, by counting
+// the portion of blocks that have few luma colors.
+// Modifies:
+//   cpi->common.allow_screen_content_tools
+//   cpi->common.allow_intrabc
+// However, the estimation is not accurate and may misclassify videos.
+// A slower but more accurate approach that determines whether to use screen
+// content tools is employed later. See determine_sc_tools_with_encoding().
+void av1_set_screen_content_options(const struct AV1_COMP *cpi,
+                                    FeatureFlags *features);
+
+// TODO(jingning): Move these functions as primitive members for the new cpi
+// class.
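+
+// The stack helpers below treat an int array as a stack whose top is index 0:
+// stack_push() shifts existing entries up and writes the new item at [0],
+// stack_pop() removes from the front, and stack_pop_end() from the back.
+// A minimal illustrative sketch (the local array 's' is hypothetical, not
+// part of this header):
+//   int s[FRAME_BUFFERS], n = 0;
+//   stack_push(s, &n, 7);  // s = {7},    n = 1
+//   stack_push(s, &n, 9);  // s = {9, 7}, n = 2
+//   stack_pop(s, &n);      // returns 9;  s = {7}, n = 1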
+static INLINE void stack_push(int *stack, int *stack_size, int item) {
+  for (int i = *stack_size - 1; i >= 0; --i) stack[i + 1] = stack[i];
+  stack[0] = item;
+  ++*stack_size;
+}
+
+static INLINE int stack_pop(int *stack, int *stack_size) {
+  if (*stack_size <= 0) return -1;
+
+  int item = stack[0];
+  // Shift only the remaining valid entries; reading stack[*stack_size] would
+  // step one past the last valid element when the stack is full.
+  for (int i = 0; i < *stack_size - 1; ++i) stack[i] = stack[i + 1];
+  --*stack_size;
+
+  return item;
+}
+
+static INLINE int stack_pop_end(int *stack, int *stack_size) {
+  int item = stack[*stack_size - 1];
+  stack[*stack_size - 1] = -1;
+  --*stack_size;
+
+  return item;
+}
+
+static INLINE void stack_reset(int *stack, int *stack_size) {
+  for (int i = 0; i < *stack_size; ++i) stack[i] = INVALID_IDX;
+  *stack_size = 0;
+}
+
+// av1 uses 10,000,000 ticks/second as its time stamp resolution
+#define TICKS_PER_SEC 10000000LL
+
+static INLINE int64_t
+timebase_units_to_ticks(const aom_rational64_t *timestamp_ratio, int64_t n) {
+  return n * timestamp_ratio->num / timestamp_ratio->den;
+}
+
+static INLINE int64_t
+ticks_to_timebase_units(const aom_rational64_t *timestamp_ratio, int64_t n) {
+  int64_t round = timestamp_ratio->num / 2;
+  if (round > 0) --round;
+  return (n * timestamp_ratio->den + round) / timestamp_ratio->num;
+}
+
+static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
+
+  return frame_is_intra_only(&cpi->common) || update_type == ARF_UPDATE ||
+         update_type == GF_UPDATE;
+}
+
+// TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD.
+static INLINE int av1_use_hash_me(const AV1_COMP *const cpi) {
+  return (cpi->common.features.allow_screen_content_tools &&
+          cpi->common.features.allow_intrabc &&
+          frame_is_intra_only(&cpi->common));
+}
+
+static INLINE const YV12_BUFFER_CONFIG *get_ref_frame_yv12_buf(
+    const AV1_COMMON *const cm, MV_REFERENCE_FRAME ref_frame) {
+  const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+  return buf != NULL ? &buf->buf : NULL;
+}
+
+static INLINE int enc_is_ref_frame_buf(const AV1_COMMON *const cm,
+                                       const RefCntBuffer *const frame_buf) {
+  MV_REFERENCE_FRAME ref_frame;
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+    if (buf == NULL) continue;
+    if (frame_buf == buf) break;
+  }
+  return (ref_frame <= ALTREF_FRAME);
+}
+
+static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) {
+  assert(buf != NULL);
+  ensure_mv_buffer(buf, cm);
+  buf->width = cm->width;
+  buf->height = cm->height;
+}
+
+// Token buffer is only used for palette tokens.
+static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols,
+                                           int sb_size_log2,
+                                           const int num_planes) {
+  // Calculate the maximum number of superblocks in the image.
+  const int shift = sb_size_log2 - 4;
+  const int sb_size = 1 << sb_size_log2;
+  const int sb_size_square = sb_size * sb_size;
+  const int sb_rows = ALIGN_POWER_OF_TWO(mb_rows, shift) >> shift;
+  const int sb_cols = ALIGN_POWER_OF_TWO(mb_cols, shift) >> shift;
+
+  // One palette token for each pixel. There can be palettes on two planes.
+  const int sb_palette_toks = AOMMIN(2, num_planes) * sb_size_square;
+
+  return sb_rows * sb_cols * sb_palette_toks;
+}
+
+// Get the allocated token size for a tile. It does the same calculation as in
+// the frame token allocation.
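+// Illustrative arithmetic (values assumed, not normative): with 128x128
+// superblocks, sb_size_log2 is 7, so a superblock covers 128 * 128 = 16384
+// pixels and, when at least two planes are coded, reserves
+// 2 * 16384 = 32768 palette tokens; the total allocation is that count times
+// the number of superblock rows and columns covering the tile.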
+static INLINE unsigned int allocated_tokens(TileInfo tile, int sb_size_log2,
+                                            int num_planes) {
+  int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 2) >> 2;
+  int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 2) >> 2;
+
+  return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes);
+}
+
+static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col,
+                                 int mi_row, TOKENEXTRA **tok, int sb_size_log2,
+                                 int num_planes) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int tile_cols = cm->tiles.cols;
+  TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+  const TileInfo *const tile_info = &this_tile->tile_info;
+
+  const int tile_mb_cols =
+      (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+  const int tile_mb_row = (mi_row - tile_info->mi_row_start + 2) >> 2;
+
+  *tok = cpi->tile_tok[tile_row][tile_col] +
+         get_token_alloc(tile_mb_row, tile_mb_cols, sb_size_log2, num_planes);
+}
+
+void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags);
+
+#define ALT_MIN_LAG 3
+static INLINE int is_altref_enabled(const AV1_COMP *const cpi) {
+  return cpi->oxcf.lag_in_frames >= ALT_MIN_LAG && cpi->oxcf.enable_auto_arf;
+}
+
+// Check if we are in the statistics generation stage
+static INLINE int is_stat_generation_stage(const AV1_COMP *const cpi) {
+  assert(IMPLIES(cpi->compressor_stage == LAP_STAGE,
+                 cpi->oxcf.pass == 0 && cpi->lap_enabled));
+  return (cpi->oxcf.pass == 1 || (cpi->compressor_stage == LAP_STAGE));
+}
+// Check if we are in the two-pass statistics consumption stage
+static INLINE int is_stat_consumption_stage_twopass(const AV1_COMP *const cpi) {
+  return (cpi->oxcf.pass == 2);
+}
+
+// Check if we are in the statistics consumption stage
+static INLINE int is_stat_consumption_stage(const AV1_COMP *const cpi) {
+  return (is_stat_consumption_stage_twopass(cpi) ||
+          (cpi->oxcf.pass == 0 && (cpi->compressor_stage == ENCODE_STAGE) &&
+           cpi->lap_enabled));
+}
+
+// Check if the current stage has statistics
+static INLINE int has_no_stats_stage(const AV1_COMP *const cpi) {
+  assert(IMPLIES(!cpi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE));
+  return (cpi->oxcf.pass == 0 && !cpi->lap_enabled);
+}
+
+// Returns the size of the frame stats buffer
+static INLINE int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) {
+  /* if lookahead is enabled return num_lap_buffers else num_lag_buffers */
+  return (num_lap_buffer > 0 ? num_lap_buffer + 1 : num_lag_buffer);
+}
+
+// TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf
+
+static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                MV_REFERENCE_FRAME ref0,
+                                MV_REFERENCE_FRAME ref1) {
+  xd->block_ref_scale_factors[0] =
+      get_ref_scale_factors_const(cm, ref0 >= LAST_FRAME ? ref0 : 1);
+  xd->block_ref_scale_factors[1] =
+      get_ref_scale_factors_const(cm, ref1 >= LAST_FRAME ? ref1 : 1);
+}
+
+static INLINE int get_chessboard_index(int frame_index) {
+  return frame_index & 0x1;
+}
+
+static INLINE const int *cond_cost_list_const(const struct AV1_COMP *cpi,
+                                              const int *cost_list) {
+  const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE &&
+                            cpi->sf.mv_sf.use_fullpel_costlist;
+  return use_cost_list ? cost_list : NULL;
+}
+
+static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) {
+  const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE &&
+                            cpi->sf.mv_sf.use_fullpel_costlist;
+  return use_cost_list ? cost_list : NULL;
+}
+
+// Compression ratio of the current frame.
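+// Assumed semantics (see the definition for the exact formula): the raw frame
+// size implied by the frame geometry and bit depth divided by
+// 'encoded_frame_size', so larger values mean stronger compression. E.g., a
+// 1920x1080 4:2:0 8-bit frame is about 3.1 MB raw, so a 31 kB encoding would
+// give a ratio of roughly 100.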
+double av1_get_compression_ratio(const AV1_COMMON *const cm, + size_t encoded_frame_size); + +void av1_new_framerate(AV1_COMP *cpi, double framerate); + +void av1_setup_frame_size(AV1_COMP *cpi); + +#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) + +// Returns 1 if a frame is scaled and 0 otherwise. +static INLINE int av1_resize_scaled(const AV1_COMMON *cm) { + return !(cm->superres_upscaled_width == cm->render_width && + cm->superres_upscaled_height == cm->render_height); +} + +static INLINE int av1_frame_scaled(const AV1_COMMON *cm) { + return !av1_superres_scaled(cm) && av1_resize_scaled(cm); +} + +// Don't allow a show_existing_frame to coincide with an error resilient +// frame. An exception can be made for a forward keyframe since it has no +// previous dependencies. +static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) { + return cm->show_existing_frame && (!cm->features.error_resilient_mode || + cm->current_frame.frame_type == KEY_FRAME); +} + +// Get index into the 'cpi->mbmi_ext_info.frame_base' array for the given +// 'mi_row' and 'mi_col'. +static INLINE int get_mi_ext_idx(const int mi_row, const int mi_col, + const BLOCK_SIZE mi_alloc_bsize, + const int mbmi_ext_stride) { + const int mi_ext_size_1d = mi_size_wide[mi_alloc_bsize]; + const int mi_ext_row = mi_row / mi_ext_size_1d; + const int mi_ext_col = mi_col / mi_ext_size_1d; + return mi_ext_row * mbmi_ext_stride + mi_ext_col; +} + +// Lighter version of set_offsets that only sets the mode info +// pointers. +static INLINE void set_mode_info_offsets( + const CommonModeInfoParams *const mi_params, + const MBMIExtFrameBufferInfo *const mbmi_ext_info, MACROBLOCK *const x, + MACROBLOCKD *const xd, int mi_row, int mi_col) { + set_mi_offsets(mi_params, xd, mi_row, mi_col); + const int ext_idx = get_mi_ext_idx(mi_row, mi_col, mi_params->mi_alloc_bsize, + mbmi_ext_info->stride); + x->mbmi_ext_frame = mbmi_ext_info->frame_base + ext_idx; +} + +// Check to see if the given partition size is allowed for a specified number +// of mi block rows and columns remaining in the image. +// If not then return the largest allowed partition size +static INLINE BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left, + int cols_left, int *bh, int *bw) { + int int_size = (int)bsize; + if (rows_left <= 0 || cols_left <= 0) { + return AOMMIN(bsize, BLOCK_8X8); + } else { + for (; int_size > 0; int_size -= 3) { + *bh = mi_size_high[int_size]; + *bw = mi_size_wide[int_size]; + if ((*bh <= rows_left) && (*bw <= cols_left)) { + break; + } + } + } + return (BLOCK_SIZE)int_size; +} + +static const uint8_t av1_ref_frame_flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + +// When more than 'max_allowed_refs' are available, we reduce the number of +// reference frames one at a time based on this order. +static const MV_REFERENCE_FRAME disable_order[] = { + LAST3_FRAME, + LAST2_FRAME, + ALTREF2_FRAME, + GOLDEN_FRAME, +}; + +static INLINE int get_max_allowed_ref_frames(const AV1_COMP *cpi) { + const unsigned int max_allowed_refs_for_given_speed = + (cpi->sf.inter_sf.selective_ref_frame >= 3) ? 
INTER_REFS_PER_FRAME - 1 + : INTER_REFS_PER_FRAME; + return AOMMIN(max_allowed_refs_for_given_speed, + cpi->oxcf.max_reference_frames); +} + +static const MV_REFERENCE_FRAME + ref_frame_priority_order[INTER_REFS_PER_FRAME] = { + LAST_FRAME, ALTREF_FRAME, BWDREF_FRAME, GOLDEN_FRAME, + ALTREF2_FRAME, LAST2_FRAME, LAST3_FRAME, + }; + +static INLINE int get_ref_frame_flags(const SPEED_FEATURES *const sf, + const YV12_BUFFER_CONFIG **ref_frames, + const int ext_ref_frame_flags) { + // cpi->ext_flags.ref_frame_flags allows certain reference types to be + // disabled by the external interface. These are set by + // av1_apply_encoding_flags(). Start with what the external interface allows, + // then suppress any reference types which we have found to be duplicates. + int flags = ext_ref_frame_flags; + + for (int i = 1; i < INTER_REFS_PER_FRAME; ++i) { + const YV12_BUFFER_CONFIG *const this_ref = ref_frames[i]; + // If this_ref has appeared before, mark the corresponding ref frame as + // invalid. For nonrd mode, only disable GOLDEN_FRAME if it's the same + // as LAST_FRAME or ALTREF_FRAME (if ALTREF is being used in nonrd). + int index = (sf->rt_sf.use_nonrd_pick_mode && + ref_frame_priority_order[i] == GOLDEN_FRAME) + ? (1 + sf->rt_sf.use_nonrd_altref_frame) + : i; + for (int j = 0; j < index; ++j) { + if (this_ref == ref_frames[j]) { + flags &= ~(1 << (ref_frame_priority_order[i] - 1)); + break; + } + } + } + return flags; +} + +// Enforce the number of references for each arbitrary frame based on user +// options and speed. +static AOM_INLINE void enforce_max_ref_frames(AV1_COMP *cpi, + int *ref_frame_flags) { + MV_REFERENCE_FRAME ref_frame; + int total_valid_refs = 0; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + if (*ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { + total_valid_refs++; + } + } + + const int max_allowed_refs = get_max_allowed_ref_frames(cpi); + + for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) { + const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i]; + + if (!(*ref_frame_flags & av1_ref_frame_flag_list[ref_frame_to_disable])) { + continue; + } + + switch (ref_frame_to_disable) { + case LAST3_FRAME: *ref_frame_flags &= ~AOM_LAST3_FLAG; break; + case LAST2_FRAME: *ref_frame_flags &= ~AOM_LAST2_FLAG; break; + case ALTREF2_FRAME: *ref_frame_flags &= ~AOM_ALT2_FLAG; break; + case GOLDEN_FRAME: *ref_frame_flags &= ~AOM_GOLD_FLAG; break; + default: assert(0); + } + --total_valid_refs; + } + assert(total_valid_refs <= max_allowed_refs); +} + +// Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon +// failure. When a non-NULL aom_fixed_buf_t pointer is returned by this +// function, the memory must be freed by the caller. Both the buf member of the +// aom_fixed_buf_t, and the aom_fixed_buf_t pointer itself must be freed. Memory +// returned must be freed via call to free(). +// +// Note: The OBU returned is in Low Overhead Bitstream Format. Specifically, +// the obu_has_size_field bit is set, and the buffer contains the obu_size +// field. 
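+//
+// A minimal usage sketch (write_to_container() is a hypothetical sink for the
+// header bytes; error handling omitted):
+//   aom_fixed_buf_t *global_headers = av1_get_global_headers(cpi);
+//   if (global_headers != NULL) {
+//     write_to_container(global_headers->buf, global_headers->sz);
+//     free(global_headers->buf);
+//     free(global_headers);
+//   }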
+aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi); + +#define MAX_GFUBOOST_FACTOR 10.0 +#define MIN_GFUBOOST_FACTOR 4.0 +double av1_get_gfu_boost_projection_factor(double min_factor, double max_factor, + int frame_count); +double av1_get_kf_boost_projection_factor(int frame_count); + +#define ENABLE_KF_TPL 1 +#define MAX_PYR_LEVEL_FROMTOP_DELTAQ 0 + +static INLINE int is_frame_kf_and_tpl_eligible(AV1_COMP *const cpi) { + AV1_COMMON *cm = &cpi->common; + return (cm->current_frame.frame_type == KEY_FRAME) && cm->show_frame && + (cpi->rc.frames_to_key > 1); +} + +static INLINE int is_frame_arf_and_tpl_eligible(const GF_GROUP *gf_group) { + const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; + return update_type == ARF_UPDATE || update_type == GF_UPDATE; +} + +static INLINE int is_frame_tpl_eligible(AV1_COMP *const cpi) { +#if ENABLE_KF_TPL + return is_frame_kf_and_tpl_eligible(cpi) || + is_frame_arf_and_tpl_eligible(&cpi->gf_group); +#else + return is_frame_arf_and_tpl_eligible(&cpi->gf_group); +#endif // ENABLE_KF_TPL +} + +// Get update type of the current frame. +static INLINE FRAME_UPDATE_TYPE +get_frame_update_type(const GF_GROUP *gf_group) { + return gf_group->update_type[gf_group->index]; +} + +static INLINE int av1_pixels_to_mi(int pixels) { + return ALIGN_POWER_OF_TWO(pixels, 3) >> MI_SIZE_LOG2; +} + +#if CONFIG_COLLECT_PARTITION_STATS == 2 +static INLINE void av1_print_partition_stats(PartitionStats *part_stats) { + FILE *f = fopen("partition_stats.csv", "w"); + if (!f) { + return; + } + + fprintf(f, "bsize,redo,"); + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "decision_%d,", part); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "attempt_%d,", part); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "time_%d,", part); + } + fprintf(f, "\n"); + + const int bsizes[6] = { 128, 64, 32, 16, 8, 4 }; + + for (int bsize_idx = 0; bsize_idx < 6; bsize_idx++) { + fprintf(f, "%d,%d,", bsizes[bsize_idx], part_stats->partition_redo); + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "%d,", part_stats->partition_decisions[bsize_idx][part]); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "%d,", part_stats->partition_attempts[bsize_idx][part]); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "%ld,", part_stats->partition_times[bsize_idx][part]); + } + fprintf(f, "\n"); + } + fclose(f); +} + +static INLINE int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) { + assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 || + bsize == BLOCK_32X32 || bsize == BLOCK_16X16 || bsize == BLOCK_8X8 || + bsize == BLOCK_4X4); + switch (bsize) { + case BLOCK_128X128: return 0; + case BLOCK_64X64: return 1; + case BLOCK_32X32: return 2; + case BLOCK_16X16: return 3; + case BLOCK_8X8: return 4; + case BLOCK_4X4: return 5; + default: assert(0 && "Invalid bsize for partition_stats."); return -1; + } +} +#endif + +#if CONFIG_COLLECT_COMPONENT_TIMING +static INLINE void start_timing(AV1_COMP *cpi, int component) { + aom_usec_timer_start(&cpi->component_timer[component]); +} +static INLINE void end_timing(AV1_COMP *cpi, int component) { + aom_usec_timer_mark(&cpi->component_timer[component]); + cpi->frame_component_time[component] += + aom_usec_timer_elapsed(&cpi->component_timer[component]); +} +static INLINE char const *get_frame_type_enum(int type) { + switch (type) { + case 0: return "KEY_FRAME"; + case 1: return "INTER_FRAME"; 
+ case 2: return "INTRA_ONLY_FRAME"; + case 3: return "S_FRAME"; + default: assert(0); + } + return "error"; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODER_H_ diff --git a/libs/libaom/src/av1/encoder/encodetxb.c b/libs/libaom/src/av1/encoder/encodetxb.c new file mode 100644 index 000000000..825d52a7a --- /dev/null +++ b/libs/libaom/src/av1/encoder/encodetxb.c @@ -0,0 +1,2261 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/encodetxb.h" + +#include "aom_ports/mem.h" +#include "av1/common/blockd.h" +#include "av1/common/idct.h" +#include "av1/common/pred_common.h" +#include "av1/common/scan.h" +#include "av1/encoder/bitstream.h" +#include "av1/encoder/cost.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/hash.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/tokenize.h" + +#if CONFIG_HTB_TRELLIS +static int hbt_needs_init = 1; +static CRC32C crc_calculator; +static const int HBT_EOB = 16; // also the length in opt_qcoeff +static const int HBT_TABLE_SIZE = 65536; // 16 bit: holds 65536 'arrays' +static const int HBT_ARRAY_LENGTH = 256; // 8 bit: 256 entries +// If removed in hbt_create_hashes or increased beyond int8_t, widen deltas type +static const int HBT_KICKOUT = 3; + +typedef struct OptTxbQcoeff { + // Use larger type if larger/no kickout value is used in hbt_create_hashes + int8_t deltas[16]; + uint32_t hbt_qc_hash; + uint32_t hbt_ctx_hash; + int init; + int rate_cost; +} OptTxbQcoeff; + +OptTxbQcoeff *hbt_hash_table; +#endif // CONFIG_HTB_TRELLIS + +typedef struct LevelDownStats { + int update; + tran_low_t low_qc; + tran_low_t low_dqc; + int64_t dist0; + int rate; + int rate_low; + int64_t dist; + int64_t dist_low; + int64_t rd; + int64_t rd_low; + int64_t nz_rd; + int64_t rd_diff; + int cost_diff; + int64_t dist_diff; + int new_eob; +} LevelDownStats; + +static INLINE int get_dqv(const int16_t *dequant, int coeff_idx, + const qm_val_t *iqmatrix) { + int dqv = dequant[!!coeff_idx]; + if (iqmatrix != NULL) + dqv = + ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + return dqv; +} + +void av1_alloc_txb_buf(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + int size = ((cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1) * + ((cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1); + + av1_free_txb_buf(cpi); + // TODO(jingning): This should be further reduced. + CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base, + aom_memalign(32, sizeof(*cpi->coeff_buffer_base) * size)); +} + +void av1_free_txb_buf(AV1_COMP *cpi) { aom_free(cpi->coeff_buffer_base); } + +static void write_golomb(aom_writer *w, int level) { + int x = level + 1; + int i = x; + int length = 0; + + while (i) { + i >>= 1; + ++length; + } + assert(length > 0); + + for (i = 0; i < length - 1; ++i) aom_write_bit(w, 0); + + for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01); +} + +static INLINE tran_low_t get_lower_coeff(tran_low_t qc) { + if (qc == 0) { + return 0; + } + return qc > 0 ? 
qc - 1 : qc + 1; +} + +static INLINE tran_low_t qcoeff_to_dqcoeff(tran_low_t qc, int coeff_idx, + int dqv, int shift, + const qm_val_t *iqmatrix) { + int sign = qc < 0 ? -1 : 1; + if (iqmatrix != NULL) + dqv = + ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + return sign * ((abs(qc) * dqv) >> shift); +} + +static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff, + int shift) { + const int64_t diff = (tcoeff - dqcoeff) * (1 << shift); + const int64_t error = diff * diff; + return error; +} + +static const int8_t eob_to_pos_small[33] = { + 0, 1, 2, // 0-2 + 3, 3, // 3-4 + 4, 4, 4, 4, // 5-8 + 5, 5, 5, 5, 5, 5, 5, 5, // 9-16 + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32 +}; + +static const int8_t eob_to_pos_large[17] = { + 6, // place holder + 7, // 33-64 + 8, 8, // 65-128 + 9, 9, 9, 9, // 129-256 + 10, 10, 10, 10, 10, 10, 10, 10, // 257-512 + 11 // 513- +}; + +static INLINE int get_eob_pos_token(const int eob, int *const extra) { + int t; + + if (eob < 33) { + t = eob_to_pos_small[eob]; + } else { + const int e = AOMMIN((eob - 1) >> 5, 16); + t = eob_to_pos_large[e]; + } + + *extra = eob - av1_eob_group_start[t]; + + return t; +} + +#if CONFIG_ENTROPY_STATS +void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size, + TX_CLASS tx_class, PLANE_TYPE plane, + FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts, + uint8_t allow_update_cdf) { +#else +void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class, + PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx, + uint8_t allow_update_cdf) { +#endif + int eob_extra; + const int eob_pt = get_eob_pos_token(eob, &eob_extra); + TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 
0 : 1; + + switch (eob_multi_size) { + case 0: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi16[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf16[plane][eob_multi_ctx], eob_pt - 1, 5); + break; + case 1: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi32[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf32[plane][eob_multi_ctx], eob_pt - 1, 6); + break; + case 2: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi64[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf64[plane][eob_multi_ctx], eob_pt - 1, 7); + break; + case 3: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi128[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf128[plane][eob_multi_ctx], eob_pt - 1, + 8); + } + break; + case 4: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi256[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf256[plane][eob_multi_ctx], eob_pt - 1, + 9); + } + break; + case 5: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi512[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf512[plane][eob_multi_ctx], eob_pt - 1, + 10); + } + break; + case 6: + default: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi1024[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf1024[plane][eob_multi_ctx], eob_pt - 1, + 11); + } + break; + } + + if (av1_eob_offset_bits[eob_pt] > 0) { + int eob_ctx = eob_pt - 3; + int eob_shift = av1_eob_offset_bits[eob_pt] - 1; + int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; +#if CONFIG_ENTROPY_STATS + counts->eob_extra[cdf_idx][txs_ctx][plane][eob_pt][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) + update_cdf(ec_ctx->eob_extra_cdf[txs_ctx][plane][eob_ctx], bit, 2); + } +} + +static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs, + const LV_MAP_COEFF_COST *txb_costs, TX_CLASS tx_class) { + int eob_extra; + const int eob_pt = get_eob_pos_token(eob, &eob_extra); + int eob_cost = 0; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1; + eob_cost = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1]; + + if (av1_eob_offset_bits[eob_pt] > 0) { + const int eob_ctx = eob_pt - 3; + const int eob_shift = av1_eob_offset_bits[eob_pt] - 1; + const int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; + eob_cost += txb_costs->eob_extra_cost[eob_ctx][bit]; + const int offset_bits = av1_eob_offset_bits[eob_pt]; + if (offset_bits > 1) eob_cost += av1_cost_literal(offset_bits - 1); + } + return eob_cost; +} + +static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx, + const int (*dc_sign_cost)[2], + int dc_sign_ctx) { + if (coeff_idx == 0) { + const int sign = (qc < 0) ? 
1 : 0; + return dc_sign_cost[dc_sign_ctx][sign]; + } + return av1_cost_literal(1); +} + +static const int golomb_bits_cost[32] = { + 0, 512, 512 * 3, 512 * 3, 512 * 5, 512 * 5, 512 * 5, 512 * 5, + 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, + 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, + 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9 +}; +static const int golomb_cost_diff[32] = { + 0, 512, 512 * 2, 0, 512 * 2, 0, 0, 0, 512 * 2, 0, 0, 0, 0, 0, 0, 0, + 512 * 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static INLINE int get_golomb_cost(int abs_qc) { + if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { + const int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS; + const int length = get_msb(r) + 1; + return av1_cost_literal(2 * length - 1); + } + return 0; +} + +static INLINE int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps, + int *diff) { + const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); + int golomb_bits = 0; + if (level <= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) + *diff += coeff_lps[base_range + COEFF_BASE_RANGE + 1]; + + if (level >= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) { + int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS; + if (r < 32) { + golomb_bits = golomb_bits_cost[r]; + *diff += golomb_cost_diff[r]; + } else { + golomb_bits = get_golomb_cost(level); + *diff += (r & (r - 1)) == 0 ? 1024 : 0; + } + } + + return coeff_lps[base_range] + golomb_bits; +} + +static INLINE int get_br_cost(tran_low_t level, const int *coeff_lps) { + const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); + return coeff_lps[base_range] + get_golomb_cost(level); +} + +static int get_coeff_cost(const tran_low_t qc, const int scan_idx, + const int is_eob, const TxbInfo *const txb_info, + const LV_MAP_COEFF_COST *const txb_costs, + const int coeff_ctx, const TX_CLASS tx_class) { + const TXB_CTX *const txb_ctx = txb_info->txb_ctx; + const int is_nz = (qc != 0); + const tran_low_t abs_qc = abs(qc); + int cost = 0; + const int16_t *const scan = txb_info->scan_order->scan; + const int pos = scan[scan_idx]; + + if (is_eob) { + cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; + } else { + cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; + } + if (is_nz) { + cost += get_sign_bit_cost(qc, scan_idx, txb_costs->dc_sign_cost, + txb_ctx->dc_sign_ctx); + + if (abs_qc > NUM_BASE_LEVELS) { + const int ctx = + get_br_ctx(txb_info->levels, pos, txb_info->bwl, tx_class); + cost += get_br_cost(abs_qc, txb_costs->lps_cost[ctx]); + } + } + return cost; +} + +static INLINE int get_nz_map_ctx(const uint8_t *const levels, + const int coeff_idx, const int bwl, + const int height, const int scan_idx, + const int is_eob, const TX_SIZE tx_size, + const TX_CLASS tx_class) { + if (is_eob) { + if (scan_idx == 0) return 0; + if (scan_idx <= (height << bwl) / 8) return 1; + if (scan_idx <= (height << bwl) / 4) return 2; + return 3; + } + const int stats = + get_nz_mag(levels + get_padded_idx(coeff_idx, bwl), bwl, tx_class); + return get_nz_map_ctx_from_stats(stats, coeff_idx, bwl, tx_size, tx_class); +} + +static void get_dist_cost_stats(LevelDownStats *const stats, const int scan_idx, + const int is_eob, + const LV_MAP_COEFF_COST *const txb_costs, + const TxbInfo *const txb_info, + const TX_CLASS tx_class) { + const int16_t *const scan = txb_info->scan_order->scan; + const int coeff_idx = scan[scan_idx]; + const tran_low_t qc = 
txb_info->qcoeff[coeff_idx]; + const uint8_t *const levels = txb_info->levels; + stats->new_eob = -1; + stats->update = 0; + stats->rd_low = 0; + stats->rd = 0; + stats->nz_rd = 0; + stats->dist_low = 0; + stats->rate_low = 0; + stats->low_qc = 0; + + const tran_low_t tqc = txb_info->tcoeff[coeff_idx]; + const int dqv = txb_info->dequant[coeff_idx != 0]; + const int coeff_ctx = + get_nz_map_ctx(levels, coeff_idx, txb_info->bwl, txb_info->height, + scan_idx, is_eob, txb_info->tx_size, tx_class); + const int qc_cost = get_coeff_cost(qc, scan_idx, is_eob, txb_info, txb_costs, + coeff_ctx, tx_class); + assert(qc != 0); + const tran_low_t dqc = qcoeff_to_dqcoeff(qc, coeff_idx, dqv, txb_info->shift, + txb_info->iqmatrix); + const int64_t dqc_dist = get_coeff_dist(tqc, dqc, txb_info->shift); + + // distortion difference when coefficient is quantized to 0 + const tran_low_t dqc0 = + qcoeff_to_dqcoeff(0, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix); + + stats->dist0 = get_coeff_dist(tqc, dqc0, txb_info->shift); + stats->dist = dqc_dist - stats->dist0; + stats->rate = qc_cost; + + stats->rd = RDCOST(txb_info->rdmult, stats->rate, stats->dist); + + stats->low_qc = get_lower_coeff(qc); + + if (is_eob && stats->low_qc == 0) { + stats->rd_low = stats->rd; // disable selection of low_qc in this case. + } else { + if (stats->low_qc == 0) { + stats->dist_low = 0; + } else { + stats->low_dqc = qcoeff_to_dqcoeff(stats->low_qc, coeff_idx, dqv, + txb_info->shift, txb_info->iqmatrix); + const int64_t low_dqc_dist = + get_coeff_dist(tqc, stats->low_dqc, txb_info->shift); + stats->dist_low = low_dqc_dist - stats->dist0; + } + const int low_qc_cost = + get_coeff_cost(stats->low_qc, scan_idx, is_eob, txb_info, txb_costs, + coeff_ctx, tx_class); + stats->rate_low = low_qc_cost; + stats->rd_low = RDCOST(txb_info->rdmult, stats->rate_low, stats->dist_low); + } +} + +static void get_dist_cost_stats_with_eob( + LevelDownStats *const stats, const int scan_idx, + const LV_MAP_COEFF_COST *const txb_costs, const TxbInfo *const txb_info, + const TX_CLASS tx_class) { + const int is_eob = 0; + get_dist_cost_stats(stats, scan_idx, is_eob, txb_costs, txb_info, tx_class); + + const int16_t *const scan = txb_info->scan_order->scan; + const int coeff_idx = scan[scan_idx]; + const tran_low_t qc = txb_info->qcoeff[coeff_idx]; + const int coeff_ctx_temp = get_nz_map_ctx( + txb_info->levels, coeff_idx, txb_info->bwl, txb_info->height, scan_idx, 1, + txb_info->tx_size, tx_class); + const int qc_eob_cost = get_coeff_cost(qc, scan_idx, 1, txb_info, txb_costs, + coeff_ctx_temp, tx_class); + int64_t rd_eob = RDCOST(txb_info->rdmult, qc_eob_cost, stats->dist); + if (stats->low_qc != 0) { + const int low_qc_eob_cost = + get_coeff_cost(stats->low_qc, scan_idx, 1, txb_info, txb_costs, + coeff_ctx_temp, tx_class); + int64_t rd_eob_low = + RDCOST(txb_info->rdmult, low_qc_eob_cost, stats->dist_low); + rd_eob = (rd_eob > rd_eob_low) ? 
rd_eob_low : rd_eob; + } + + stats->nz_rd = AOMMIN(stats->rd_low, stats->rd) - rd_eob; +} + +static INLINE void update_qcoeff(const int coeff_idx, const tran_low_t qc, + const TxbInfo *const txb_info) { + txb_info->qcoeff[coeff_idx] = qc; + txb_info->levels[get_padded_idx(coeff_idx, txb_info->bwl)] = + (uint8_t)clamp(abs(qc), 0, INT8_MAX); +} + +static INLINE void update_coeff(const int coeff_idx, const tran_low_t qc, + const TxbInfo *const txb_info) { + update_qcoeff(coeff_idx, qc, txb_info); + const int dqv = txb_info->dequant[coeff_idx != 0]; + txb_info->dqcoeff[coeff_idx] = qcoeff_to_dqcoeff( + qc, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix); +} + +void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width, + const int height, uint8_t *const levels) { + const int stride = width + TX_PAD_HOR; + uint8_t *ls = levels; + + memset(levels + stride * height, 0, + sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); + + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + *ls++ = (uint8_t)clamp(abs(coeff[i * width + j]), 0, INT8_MAX); + } + for (int j = 0; j < TX_PAD_HOR; j++) { + *ls++ = 0; + } + } +} + +void av1_get_nz_map_contexts_c(const uint8_t *const levels, + const int16_t *const scan, const uint16_t eob, + const TX_SIZE tx_size, const TX_CLASS tx_class, + int8_t *const coeff_contexts) { + const int bwl = get_txb_bwl(tx_size); + const int height = get_txb_high(tx_size); + for (int i = 0; i < eob; ++i) { + const int pos = scan[i]; + coeff_contexts[pos] = get_nz_map_ctx(levels, pos, bwl, height, i, + i == eob - 1, tx_size, tx_class); + } +} + +void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x, + aom_writer *w, int blk_row, int blk_col, int plane, + int block, TX_SIZE tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + const CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff; + const int txb_offset = + x->mbmi_ext_frame->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); + const uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset; + const uint16_t eob = eob_txb[block]; + const uint8_t *entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset; + const int txb_skip_ctx = entropy_ctx[block] & TXB_SKIP_CTX_MASK; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, eob == 0, ec_ctx->txb_skip_cdf[txs_ctx][txb_skip_ctx], 2); + if (eob == 0) return; + + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_TYPE tx_type = + av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + // Only y plane's tx_type is transmitted + if (plane == 0) { + av1_write_tx_type(cm, xd, tx_type, tx_size, w); + } + + int eob_extra; + const int eob_pt = get_eob_pos_token(eob, &eob_extra); + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 
0 : 1; + switch (eob_multi_size) { + case 0: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], 5); + break; + case 1: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], 6); + break; + case 2: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], 7); + break; + case 3: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], 8); + break; + case 4: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], 9); + break; + case 5: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], 10); + break; + default: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11); + break; + } + + const int eob_offset_bits = av1_eob_offset_bits[eob_pt]; + if (eob_offset_bits > 0) { + const int eob_ctx = eob_pt - 3; + int eob_shift = eob_offset_bits - 1; + int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; + aom_write_symbol(w, bit, + ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2); + for (int i = 1; i < eob_offset_bits; i++) { + eob_shift = eob_offset_bits - 1 - i; + bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; + aom_write_bit(w, bit); + } + } + + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + const tran_low_t *tcoeff_txb = + cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset; + const tran_low_t *tcoeff = tcoeff_txb + BLOCK_OFFSET(block); + av1_txb_init_levels(tcoeff, width, height, levels); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); + + const int bwl = get_txb_bwl(tx_size); + for (int c = eob - 1; c >= 0; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = tcoeff[pos]; + const tran_low_t level = abs(v); + + if (c == eob - 1) { + aom_write_symbol( + w, AOMMIN(level, 3) - 1, + ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx], 3); + } else { + aom_write_symbol(w, AOMMIN(level, 3), + ec_ctx->coeff_base_cdf[txs_ctx][plane_type][coeff_ctx], + 4); + } + if (level > NUM_BASE_LEVELS) { + // level is above 1. + const int base_range = level - 1 - NUM_BASE_LEVELS; + const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class); + aom_cdf_prob *cdf = + ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx]; + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); + aom_write_symbol(w, k, cdf, BR_CDF_SIZE); + if (k < BR_CDF_SIZE - 1) break; + } + } + } + + // Loop to code all signs in the transform block, + // starting with the sign of DC (if applicable) + for (int c = 0; c < eob; ++c) { + const tran_low_t v = tcoeff[scan[c]]; + const tran_low_t level = abs(v); + const int sign = (v < 0) ? 
1 : 0; + if (level) { + if (c == 0) { + const int dc_sign_ctx = + (entropy_ctx[block] >> DC_SIGN_CTX_SHIFT) & DC_SIGN_CTX_MASK; + aom_write_symbol(w, sign, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], + 2); + } else { + aom_write_bit(w, sign); + } + if (level > COEFF_BASE_RANGE + NUM_BASE_LEVELS) + write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS); + } + } +} + +typedef struct encode_txb_args { + const AV1_COMMON *cm; + MACROBLOCK *x; + aom_writer *w; +} ENCODE_TXB_ARGS; + +void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, + aom_writer *w, BLOCK_SIZE bsize) { + MACROBLOCKD *xd = &x->e_mbd; + const int num_planes = av1_num_planes(cm); + int block[MAX_MB_PLANE] = { 0 }; + int row, col; + assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int max_blocks_high = max_block_high(xd, bsize, 0); + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); + mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); + + for (row = 0; row < max_blocks_high; row += mu_blocks_high) { + for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) { + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const int stepr = tx_size_high_unit[tx_size]; + const int stepc = tx_size_wide_unit[tx_size]; + const int step = stepr * stepc; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int unit_height = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y); + const int unit_width = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x); + for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height; + blk_row += stepr) { + for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width; + blk_col += stepc) { + av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane, + block[plane], tx_size); + block[plane] += step; + } + } + } + } + } +} + +// TODO(angiebird): use this function whenever it's possible +static int get_tx_type_cost(const MACROBLOCK *x, const MACROBLOCKD *xd, + int plane, TX_SIZE tx_size, TX_TYPE tx_type, + int reduced_tx_set_used) { + if (plane > 0) return 0; + + const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + + const MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 && + !xd->lossless[xd->mi[0]->segment_id]) { + const int ext_tx_set = + get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used); + if (is_inter) { + if (ext_tx_set > 0) + return x->inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type]; + } else { + if (ext_tx_set > 0) { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info + .filter_intra_mode]; + else + intra_dir = mbmi->mode; + return x->intra_tx_type_costs[ext_tx_set][square_tx_size][intra_dir] + [tx_type]; + } + } + } + return 0; +} + +static INLINE void update_coeff_eob_fast(int *eob, int shift, + const int16_t *dequant_ptr, + const int16_t *scan, + const tran_low_t *coeff_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr) { + // TODO(sarahparker) make this work for aomqm + int eob_out = *eob; 
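+  // zbin below is a "zero bin" threshold of dequant * (1 + 70/128), roughly
+  // 1.55x the dequant step. Trailing coefficients whose scaled magnitude falls
+  // inside this bin (or that are already zero) are zeroed out and the
+  // end-of-block position is shortened accordingly.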
+ int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7), + dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) }; + + for (int i = *eob - 1; i >= 0; i--) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) { + eob_out--; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } else { + break; + } + } + + *eob = eob_out; +} + +static AOM_FORCE_INLINE int warehouse_efficients_txb( + const MACROBLOCK *x, const int plane, const int block, + const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, + const struct macroblock_plane *p, const int eob, + const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs, + const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class, + int reduced_tx_set_used) { + const tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block); + const int txb_skip_ctx = txb_ctx->txb_skip_ctx; + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST *const eob_costs = + &x->eob_costs[eob_multi_size][plane_type]; + int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0]; + + av1_txb_init_levels(qcoeff, width, height, levels); + + cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used); + + cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class); + + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); + + const int(*lps_cost)[COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1] = + coeff_costs->lps_cost; + int c = eob - 1; + { + const int pos = scan[c]; + const tran_low_t v = qcoeff[pos]; + const int sign = AOMSIGN(v); + const int level = (v ^ sign) - sign; + const int coeff_ctx = coeff_contexts[pos]; + cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1]; + + if (v) { + // sign bit cost + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx_eob(pos, bwl, tx_class); + cost += get_br_cost(level, lps_cost[ctx]); + } + if (c) { + cost += av1_cost_literal(1); + } else { + const int sign01 = (sign ^ sign) - sign; + const int dc_sign_ctx = txb_ctx->dc_sign_ctx; + cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01]; + return cost; + } + } + } + const int(*base_cost)[8] = coeff_costs->base_cost; + for (c = eob - 2; c >= 1; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = qcoeff[pos]; + const int level = abs(v); + cost += base_cost[coeff_ctx][AOMMIN(level, 3)]; + if (v) { + // sign bit cost + cost += av1_cost_literal(1); + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx(levels, pos, bwl, tx_class); + cost += get_br_cost(level, lps_cost[ctx]); + } + } + } + // c == 0 after previous loop + { + const int pos = scan[c]; + const tran_low_t v = qcoeff[pos]; + const int coeff_ctx = coeff_contexts[pos]; + const int sign = AOMSIGN(v); + const int level = (v ^ sign) - sign; + cost += base_cost[coeff_ctx][AOMMIN(level, 3)]; + + if (v) { + // sign bit cost + const int sign01 = (sign ^ sign) - 
sign; + const int dc_sign_ctx = txb_ctx->dc_sign_ctx; + cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01]; + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx(levels, pos, bwl, tx_class); + cost += get_br_cost(level, lps_cost[ctx]); + } + } + } + return cost; +} + +static AOM_FORCE_INLINE int warehouse_efficients_txb_laplacian( + const MACROBLOCK *x, const int plane, const int block, + const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, const int eob, + const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs, + const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class, + int reduced_tx_set_used) { + const int txb_skip_ctx = txb_ctx->txb_skip_ctx; + + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST *const eob_costs = + &x->eob_costs[eob_multi_size][plane_type]; + int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0]; + + cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used); + + cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class); + + cost += av1_cost_coeffs_txb_estimate(x, plane, block, tx_size, tx_type); + return cost; +} + +// Look up table of individual cost of coefficient by its quantization level. +// determined based on Laplacian distribution conditioned on estimated context +static const int costLUT[15] = { -1143, 53, 545, 825, 1031, + 1209, 1393, 1577, 1763, 1947, + 2132, 2317, 2501, 2686, 2871 }; +static const int const_term = (1 << AV1_PROB_COST_SHIFT); +static const int loge_par = ((14427 << AV1_PROB_COST_SHIFT) + 5000) / 10000; +int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane, + const int block, const TX_SIZE tx_size, + const TX_TYPE tx_type) { + assert(plane == 0); + + int cost = 0; + const struct macroblock_plane *p = &x->plane[plane]; + const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type); + const int16_t *scan = scan_order->scan; + tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block); + + int eob = p->eobs[block]; + + // coeffs + int c = eob - 1; + // eob + { + const int pos = scan[c]; + const tran_low_t v = abs(qcoeff[pos]) - 1; + cost += (v << (AV1_PROB_COST_SHIFT + 2)); + } + // other coeffs + for (c = eob - 2; c >= 0; c--) { + const int pos = scan[c]; + const tran_low_t v = abs(qcoeff[pos]); + const int idx = AOMMIN(v, 14); + + cost += costLUT[idx]; + } + + // const_term does not contain DC, and log(e) does not contain eob, so both + // (eob-1) + cost += (const_term + loge_par) * (eob - 1); + + return cost; +} + +int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block, + const TX_SIZE tx_size, const TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int reduced_tx_set_used) { + const struct macroblock_plane *p = &x->plane[plane]; + const int eob = p->eobs[block]; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const PLANE_TYPE plane_type = get_plane_type(plane); + const LV_MAP_COEFF_COST *const coeff_costs = + &x->coeff_costs[txs_ctx][plane_type]; + if (eob == 0) { + return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; + } + + const MACROBLOCKD *const xd = &x->e_mbd; + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + + return warehouse_efficients_txb(x, plane, block, tx_size, txb_ctx, p, eob, + plane_type, coeff_costs, xd, tx_type, + tx_class, reduced_tx_set_used); +} + +int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane, + const int block, const TX_SIZE tx_size, + const TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, + const int reduced_tx_set_used, + const int 
adjust_eob) { + const struct macroblock_plane *p = &x->plane[plane]; + int eob = p->eobs[block]; + + if (adjust_eob) { + const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type); + const int16_t *scan = scan_order->scan; + tran_low_t *tcoeff = p->coeff + BLOCK_OFFSET(block); + tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block); + const MACROBLOCKD *xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block); + update_coeff_eob_fast(&eob, av1_get_tx_scale(tx_size), p->dequant_QTX, scan, + tcoeff, qcoeff, dqcoeff); + p->eobs[block] = eob; + } + + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const PLANE_TYPE plane_type = get_plane_type(plane); + const LV_MAP_COEFF_COST *const coeff_costs = + &x->coeff_costs[txs_ctx][plane_type]; + if (eob == 0) { + return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; + } + + const MACROBLOCKD *const xd = &x->e_mbd; + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + + return warehouse_efficients_txb_laplacian( + x, plane, block, tx_size, txb_ctx, eob, plane_type, coeff_costs, xd, + tx_type, tx_class, reduced_tx_set_used); +} + +static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs, + const LV_MAP_EOB_COST *txb_eob_costs, int *rate_cost) { + int update = 0; + if (txb_info->eob == 0) return update; + const int16_t *const scan = txb_info->scan_order->scan; + // forward optimize the nz_map` + const int init_eob = txb_info->eob; + const TX_CLASS tx_class = tx_type_to_class[txb_info->tx_type]; + const int eob_cost = + get_eob_cost(init_eob, txb_eob_costs, txb_costs, tx_class); + + // backward optimize the level-k map + int accu_rate = eob_cost; + int64_t accu_dist = 0; + int64_t prev_eob_rd_cost = INT64_MAX; + int64_t cur_eob_rd_cost = 0; + + { + const int si = init_eob - 1; + const int coeff_idx = scan[si]; + LevelDownStats stats; + get_dist_cost_stats(&stats, si, si == init_eob - 1, txb_costs, txb_info, + tx_class); + if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) { + update = 1; + update_coeff(coeff_idx, stats.low_qc, txb_info); + accu_rate += stats.rate_low; + accu_dist += stats.dist_low; + } else { + accu_rate += stats.rate; + accu_dist += stats.dist; + } + } + + int si = init_eob - 2; + int8_t has_nz_tail = 0; + // eob is not fixed + for (; si >= 0 && has_nz_tail < 2; --si) { + assert(si != init_eob - 1); + const int coeff_idx = scan[si]; + tran_low_t qc = txb_info->qcoeff[coeff_idx]; + + if (qc == 0) { + const int coeff_ctx = + get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl, + txb_info->tx_size, tx_class); + accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + LevelDownStats stats; + get_dist_cost_stats_with_eob(&stats, si, txb_costs, txb_info, tx_class); + // check if it is better to make this the last significant coefficient + int cur_eob_rate = + get_eob_cost(si + 1, txb_eob_costs, txb_costs, tx_class); + cur_eob_rd_cost = RDCOST(txb_info->rdmult, cur_eob_rate, 0); + prev_eob_rd_cost = + RDCOST(txb_info->rdmult, accu_rate, accu_dist) + stats.nz_rd; + if (cur_eob_rd_cost <= prev_eob_rd_cost) { + update = 1; + for (int j = si + 1; j < txb_info->eob; j++) { + const int coeff_pos_j = scan[j]; + update_coeff(coeff_pos_j, 0, txb_info); + } + txb_info->eob = si + 1; + + // rerun cost calculation due to change of eob + accu_rate = cur_eob_rate; + accu_dist = 0; + get_dist_cost_stats(&stats, si, 1, txb_costs, txb_info, tx_class); + if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) { + update = 1; + 
update_coeff(coeff_idx, stats.low_qc, txb_info); + accu_rate += stats.rate_low; + accu_dist += stats.dist_low; + } else { + accu_rate += stats.rate; + accu_dist += stats.dist; + } + + // reset non zero tail when new eob is found + has_nz_tail = 0; + } else { + int bUpdCoeff = 0; + if (stats.rd_low < stats.rd) { + if ((si < txb_info->eob - 1)) { + bUpdCoeff = 1; + update = 1; + } + } else { + ++has_nz_tail; + } + + if (bUpdCoeff) { + update_coeff(coeff_idx, stats.low_qc, txb_info); + accu_rate += stats.rate_low; + accu_dist += stats.dist_low; + } else { + accu_rate += stats.rate; + accu_dist += stats.dist; + } + } + } + } // for (si) + + // eob is fixed + for (; si >= 0; --si) { + assert(si != init_eob - 1); + const int coeff_idx = scan[si]; + tran_low_t qc = txb_info->qcoeff[coeff_idx]; + + if (qc == 0) { + const int coeff_ctx = + get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl, + txb_info->tx_size, tx_class); + accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + LevelDownStats stats; + get_dist_cost_stats(&stats, si, 0, txb_costs, txb_info, tx_class); + + int bUpdCoeff = 0; + if (stats.rd_low < stats.rd) { + if ((si < txb_info->eob - 1)) { + bUpdCoeff = 1; + update = 1; + } + } + if (bUpdCoeff) { + update_coeff(coeff_idx, stats.low_qc, txb_info); + accu_rate += stats.rate_low; + accu_dist += stats.dist_low; + } else { + accu_rate += stats.rate; + accu_dist += stats.dist; + } + } + } // for (si) + + int non_zero_blk_rate = + txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][0]; + prev_eob_rd_cost = + RDCOST(txb_info->rdmult, accu_rate + non_zero_blk_rate, accu_dist); + + int zero_blk_rate = + txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][1]; + int64_t zero_blk_rd_cost = RDCOST(txb_info->rdmult, zero_blk_rate, 0); + if (zero_blk_rd_cost <= prev_eob_rd_cost) { + update = 1; + for (int j = 0; j < txb_info->eob; j++) { + const int coeff_pos_j = scan[j]; + update_coeff(coeff_pos_j, 0, txb_info); + } + txb_info->eob = 0; + } + + // record total rate cost + *rate_cost = zero_blk_rd_cost <= prev_eob_rd_cost + ? zero_blk_rate + : accu_rate + non_zero_blk_rate; + + if (txb_info->eob > 0) { + *rate_cost += txb_info->tx_type_cost; + } + + return update; +} + +#if CONFIG_HTB_TRELLIS +static void hbt_init() { + hbt_hash_table = + aom_malloc(sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH); + memset(hbt_hash_table, 0, + sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH); + av1_crc32c_calculator_init(&crc_calculator); // 31 bit: qc & ctx + + hbt_needs_init = 0; +} + +void hbt_destroy() { aom_free(hbt_hash_table); } + +static int hbt_hash_miss(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash, + TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs, + const LV_MAP_EOB_COST *txb_eob_costs, + const struct macroblock_plane *p, int block, + int fast_mode, int *rate_cost) { + (void)fast_mode; + const int16_t *scan = txb_info->scan_order->scan; + int prev_eob = txb_info->eob; + assert(HBT_EOB <= 16); // Lengthen array if allowing longer eob. + int32_t prev_coeff[16]; + for (int i = 0; i < prev_eob; i++) { + prev_coeff[i] = txb_info->qcoeff[scan[i]]; + } + for (int i = prev_eob; i < HBT_EOB; i++) { + prev_coeff[i] = 0; // For compiler piece of mind. 
+ } + + av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height, + txb_info->levels); + + const int update = + optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost); + + // Overwrite old entry + uint16_t hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE; + uint16_t hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH; + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .rate_cost = *rate_cost; + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index].init = 1; + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .hbt_qc_hash = hbt_qc_hash; + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .hbt_ctx_hash = hbt_ctx_hash; + assert(prev_eob >= txb_info->eob); // eob can't get longer + for (int i = 0; i < txb_info->eob; i++) { + // Record how coeff changed. Convention: towards zero is negative. + if (txb_info->qcoeff[scan[i]] > 0) + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = txb_info->qcoeff[scan[i]] - prev_coeff[i]; + else + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = prev_coeff[i] - txb_info->qcoeff[scan[i]]; + } + for (int i = txb_info->eob; i < prev_eob; i++) { + // If eob got shorter, record that all after it changed to zero. + if (prev_coeff[i] > 0) + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = -prev_coeff[i]; + else + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = prev_coeff[i]; + } + for (int i = prev_eob; i < HBT_EOB; i++) { + // Record 'no change' after optimized coefficients run out. + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = 0; + } + + if (update) { + p->eobs[block] = txb_info->eob; + p->txb_entropy_ctx[block] = av1_get_txb_entropy_context( + txb_info->qcoeff, txb_info->scan_order, txb_info->eob); + } + return txb_info->eob; +} + +static int hbt_hash_hit(uint32_t hbt_table_index, int hbt_array_index, + TxbInfo *txb_info, const struct macroblock_plane *p, + int block, int *rate_cost) { + const int16_t *scan = txb_info->scan_order->scan; + int new_eob = 0; + int update = 0; + + for (int i = 0; i < txb_info->eob; i++) { + // Delta convention is negatives go towards zero, so only apply those ones. 
+ if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] < 0) { + if (txb_info->qcoeff[scan[i]] > 0) + txb_info->qcoeff[scan[i]] += + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i]; + else + txb_info->qcoeff[scan[i]] -= + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i]; + + update = 1; + update_coeff(scan[i], txb_info->qcoeff[scan[i]], txb_info); + } + if (txb_info->qcoeff[scan[i]]) new_eob = i + 1; + } + + // Rate_cost can be calculated here instead (av1_cost_coeffs_txb), but + // it is expensive and gives little benefit as long as qc_hash is high bit + *rate_cost = + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .rate_cost; + + if (update) { + txb_info->eob = new_eob; + p->eobs[block] = txb_info->eob; + p->txb_entropy_ctx[block] = av1_get_txb_entropy_context( + txb_info->qcoeff, txb_info->scan_order, txb_info->eob); + } + + return txb_info->eob; +} + +static int hbt_search_match(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash, + TxbInfo *txb_info, + const LV_MAP_COEFF_COST *txb_costs, + const LV_MAP_EOB_COST *txb_eob_costs, + const struct macroblock_plane *p, int block, + int fast_mode, int *rate_cost) { + // Check for qcoeff match + int hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH; + int hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE; + + if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .hbt_qc_hash == hbt_qc_hash && + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .hbt_ctx_hash == hbt_ctx_hash && + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .init) { + return hbt_hash_hit(hbt_table_index, hbt_array_index, txb_info, p, block, + rate_cost); + } else { + return hbt_hash_miss(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs, + txb_eob_costs, p, block, fast_mode, rate_cost); + } +} + +static int hbt_create_hashes(TxbInfo *txb_info, + const LV_MAP_COEFF_COST *txb_costs, + const LV_MAP_EOB_COST *txb_eob_costs, + const struct macroblock_plane *p, int block, + int fast_mode, int *rate_cost) { + // Initialize hash table if needed. + if (hbt_needs_init) { + hbt_init(); + } + + //// Hash creation + uint8_t txb_hash_data[256]; // Asserts below to ensure enough space. + const int16_t *scan = txb_info->scan_order->scan; + uint8_t chunk = 0; + int hash_data_index = 0; + + // Make qc_hash. + int packing_index = 0; // needed for packing. + for (int i = 0; i < txb_info->eob; i++) { + tran_low_t prechunk = txb_info->qcoeff[scan[i]]; + + // Softening: Improves speed. Aligns with signed deltas. + if (prechunk < 0) prechunk *= -1; + + // Early kick out: Don't apply feature if there are large coeffs: + // If this kickout value is removed or raised beyond int8_t, + // widen deltas type in OptTxbQcoeff struct. + assert((int8_t)HBT_KICKOUT == HBT_KICKOUT); // If not, widen types. 
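+    // A coefficient magnitude above HBT_KICKOUT would not fit the 2-bit
+    // packing used for the hash below, so fall back to the full trellis
+    // optimization and return.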
+ if (prechunk > HBT_KICKOUT) { + av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height, + txb_info->levels); + + const int update = + optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost); + + if (update) { + p->eobs[block] = txb_info->eob; + p->txb_entropy_ctx[block] = av1_get_txb_entropy_context( + txb_info->qcoeff, txb_info->scan_order, txb_info->eob); + } + return txb_info->eob; + } + + // Since coeffs are 0 to 3, only 2 bits are needed: pack into bytes + if (packing_index == 0) txb_hash_data[hash_data_index] = 0; + chunk = prechunk << packing_index; + packing_index += 2; + txb_hash_data[hash_data_index] |= chunk; + + // Full byte: + if (packing_index == 8) { + packing_index = 0; + hash_data_index++; + } + } + // Needed when packing_index != 0, to include final byte. + hash_data_index++; + assert(hash_data_index <= 64); + // 31 bit qc_hash: index to array + uint32_t hbt_qc_hash = + av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index); + + // Make ctx_hash. + hash_data_index = 0; + tran_low_t prechunk; + + for (int i = 0; i < txb_info->eob; i++) { + // Save as magnitudes towards or away from zero. + if (txb_info->tcoeff[scan[i]] >= 0) + prechunk = txb_info->tcoeff[scan[i]] - txb_info->dqcoeff[scan[i]]; + else + prechunk = txb_info->dqcoeff[scan[i]] - txb_info->tcoeff[scan[i]]; + + chunk = prechunk & 0xff; + txb_hash_data[hash_data_index++] = chunk; + } + + // Extra ctx data: + // Include dequants. + txb_hash_data[hash_data_index++] = txb_info->dequant[0] & 0xff; + txb_hash_data[hash_data_index++] = txb_info->dequant[1] & 0xff; + chunk = txb_info->txb_ctx->txb_skip_ctx & 0xff; + txb_hash_data[hash_data_index++] = chunk; + chunk = txb_info->txb_ctx->dc_sign_ctx & 0xff; + txb_hash_data[hash_data_index++] = chunk; + // eob + chunk = txb_info->eob & 0xff; + txb_hash_data[hash_data_index++] = chunk; + // rdmult (int64) + chunk = txb_info->rdmult & 0xff; + txb_hash_data[hash_data_index++] = chunk; + // tx_type + chunk = txb_info->tx_type & 0xff; + txb_hash_data[hash_data_index++] = chunk; + // base_eob_cost + for (int i = 1; i < 3; i++) { // i = 0 are softened away + for (int j = 0; j < SIG_COEF_CONTEXTS_EOB; j++) { + chunk = (txb_costs->base_eob_cost[j][i] & 0xff00) >> 8; + txb_hash_data[hash_data_index++] = chunk; + } + } + // eob_cost + for (int i = 0; i < 11; i++) { + for (int j = 0; j < 2; j++) { + chunk = (txb_eob_costs->eob_cost[j][i] & 0xff00) >> 8; + txb_hash_data[hash_data_index++] = chunk; + } + } + // dc_sign_cost + for (int i = 0; i < 2; i++) { + for (int j = 0; j < DC_SIGN_CONTEXTS; j++) { + chunk = (txb_costs->dc_sign_cost[j][i] & 0xff00) >> 8; + txb_hash_data[hash_data_index++] = chunk; + } + } + + assert(hash_data_index <= 256); + // 31 bit ctx_hash: used to index table + uint32_t hbt_ctx_hash = + av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index); + //// End hash creation + + return hbt_search_match(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs, + txb_eob_costs, p, block, fast_mode, rate_cost); +} +#endif // CONFIG_HTB_TRELLIS + +static AOM_FORCE_INLINE int get_two_coeff_cost_simple( + int ci, tran_low_t abs_qc, int coeff_ctx, + const LV_MAP_COEFF_COST *txb_costs, int bwl, TX_CLASS tx_class, + const uint8_t *levels, int *cost_low) { + // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0) + // and not the last (scan_idx != eob - 1) + assert(ci > 0); + int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; + int diff = 0; + if (abs_qc <= 3) diff = txb_costs->base_cost[coeff_ctx][abs_qc 
+ 4]; + if (abs_qc) { + cost += av1_cost_literal(1); + if (abs_qc > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class); + int brcost_diff = 0; + cost += get_br_cost_with_diff(abs_qc, txb_costs->lps_cost[br_ctx], + &brcost_diff); + diff += brcost_diff; + } + } + *cost_low = cost - diff; + + return cost; +} + +static INLINE int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign, + int coeff_ctx, int dc_sign_ctx, + const LV_MAP_COEFF_COST *txb_costs, + int bwl, TX_CLASS tx_class) { + int cost = 0; + cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; + if (abs_qc != 0) { + if (ci == 0) { + cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign]; + } else { + cost += av1_cost_literal(1); + } + if (abs_qc > NUM_BASE_LEVELS) { + int br_ctx; + br_ctx = get_br_ctx_eob(ci, bwl, tx_class); + cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]); + } + } + return cost; +} + +static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc, + int sign, int coeff_ctx, + int dc_sign_ctx, + const LV_MAP_COEFF_COST *txb_costs, + int bwl, TX_CLASS tx_class, + const uint8_t *levels) { + int cost = 0; + if (is_last) { + cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; + } else { + cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; + } + if (abs_qc != 0) { + if (ci == 0) { + cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign]; + } else { + cost += av1_cost_literal(1); + } + if (abs_qc > NUM_BASE_LEVELS) { + int br_ctx; + if (is_last) + br_ctx = get_br_ctx_eob(ci, bwl, tx_class); + else + br_ctx = get_br_ctx(levels, ci, bwl, tx_class); + cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]); + } + } + return cost; +} + +static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv, + int shift, tran_low_t *qc_low, + tran_low_t *dqc_low) { + tran_low_t abs_qc_low = abs_qc - 1; + *qc_low = (-sign ^ abs_qc_low) + sign; + assert((sign ? -abs_qc_low : abs_qc_low) == *qc_low); + tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift; + *dqc_low = (-sign ^ abs_dqc_low) + sign; + assert((sign ? -abs_dqc_low : abs_dqc_low) == *dqc_low); +} + +static INLINE void update_coeff_general( + int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size, + TX_CLASS tx_class, int bwl, int height, int64_t rdmult, int shift, + int dc_sign_ctx, const int16_t *dequant, const int16_t *scan, + const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, + const qm_val_t *iqmatrix) { + const int dqv = get_dqv(dequant, scan[si], iqmatrix); + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int is_last = si == (eob - 1); + const int coeff_ctx = get_lower_levels_ctx_general( + is_last, si, bwl, height, levels, ci, tx_size, tx_class); + if (qc == 0) { + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + const int sign = (qc < 0) ? 
1 : 0; + const tran_low_t abs_qc = abs(qc); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int64_t dist = get_coeff_dist(tqc, dqc, shift); + const int64_t dist0 = get_coeff_dist(tqc, 0, shift); + const int rate = + get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bwl, tx_class, levels); + const int64_t rd = RDCOST(rdmult, rate, dist); + + tran_low_t qc_low, dqc_low; + tran_low_t abs_qc_low; + int64_t dist_low, rd_low; + int rate_low; + if (abs_qc == 1) { + abs_qc_low = qc_low = dqc_low = 0; + dist_low = dist0; + rate_low = txb_costs->base_cost[coeff_ctx][0]; + } else { + get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); + abs_qc_low = abs_qc - 1; + dist_low = get_coeff_dist(tqc, dqc_low, shift); + rate_low = + get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bwl, tx_class, levels); + } + + rd_low = RDCOST(rdmult, rate_low, dist_low); + if (rd_low < rd) { + qcoeff[ci] = qc_low; + dqcoeff[ci] = dqc_low; + levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX); + *accu_rate += rate_low; + *accu_dist += dist_low - dist0; + } else { + *accu_rate += rate; + *accu_dist += dist - dist0; + } + } +} + +static AOM_FORCE_INLINE void update_coeff_simple( + int *accu_rate, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class, + int bwl, int64_t rdmult, int shift, const int16_t *dequant, + const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs, + const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, + uint8_t *levels, const qm_val_t *iqmatrix) { + const int dqv = get_dqv(dequant, scan[si], iqmatrix); + (void)eob; + // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0) + // and not the last (scan_idx != eob - 1) + assert(si != eob - 1); + assert(si > 0); + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int coeff_ctx = + get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class); + if (qc == 0) { + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + const tran_low_t abs_qc = abs(qc); + const tran_low_t abs_tqc = abs(tcoeff[ci]); + const tran_low_t abs_dqc = abs(dqcoeff[ci]); + int rate_low = 0; + const int rate = get_two_coeff_cost_simple( + ci, abs_qc, coeff_ctx, txb_costs, bwl, tx_class, levels, &rate_low); + if (abs_dqc < abs_tqc) { + *accu_rate += rate; + return; + } + + const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift); + const int64_t rd = RDCOST(rdmult, rate, dist); + + const tran_low_t abs_qc_low = abs_qc - 1; + const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift; + const int64_t dist_low = get_coeff_dist(abs_tqc, abs_dqc_low, shift); + const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low); + + if (rd_low < rd) { + const int sign = (qc < 0) ? 
1 : 0; + qcoeff[ci] = (-sign ^ abs_qc_low) + sign; + dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign; + levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX); + *accu_rate += rate_low; + } else { + *accu_rate += rate; + } + } +} + +static AOM_FORCE_INLINE void update_coeff_eob( + int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci, + int si, TX_SIZE tx_size, TX_CLASS tx_class, int bwl, int height, + int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant, + const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs, + const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness, + const qm_val_t *iqmatrix) { + const int dqv = get_dqv(dequant, scan[si], iqmatrix); + assert(si != *eob - 1); + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int coeff_ctx = + get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class); + if (qc == 0) { + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + int lower_level = 0; + const tran_low_t abs_qc = abs(qc); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int sign = (qc < 0) ? 1 : 0; + const int64_t dist0 = get_coeff_dist(tqc, 0, shift); + int64_t dist = get_coeff_dist(tqc, dqc, shift) - dist0; + int rate = + get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx, + txb_costs, bwl, tx_class, levels); + int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist); + + tran_low_t qc_low, dqc_low; + tran_low_t abs_qc_low; + int64_t dist_low, rd_low; + int rate_low; + if (abs_qc == 1) { + abs_qc_low = 0; + dqc_low = qc_low = 0; + dist_low = 0; + rate_low = txb_costs->base_cost[coeff_ctx][0]; + rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist); + } else { + get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); + abs_qc_low = abs_qc - 1; + dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0; + rate_low = + get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bwl, tx_class, levels); + rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low); + } + + int lower_level_new_eob = 0; + const int new_eob = si + 1; + const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bwl, height, si); + const int new_eob_cost = + get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class); + int rate_coeff_eob = + new_eob_cost + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx_new_eob, + dc_sign_ctx, txb_costs, bwl, + tx_class); + int64_t dist_new_eob = dist; + int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob); + + if (abs_qc_low > 0) { + const int rate_coeff_eob_low = + new_eob_cost + get_coeff_cost_eob(ci, abs_qc_low, sign, + coeff_ctx_new_eob, dc_sign_ctx, + txb_costs, bwl, tx_class); + const int64_t dist_new_eob_low = dist_low; + const int64_t rd_new_eob_low = + RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low); + if (rd_new_eob_low < rd_new_eob) { + lower_level_new_eob = 1; + rd_new_eob = rd_new_eob_low; + rate_coeff_eob = rate_coeff_eob_low; + dist_new_eob = dist_new_eob_low; + } + } + + if (rd_low < rd) { + lower_level = 1; + rd = rd_low; + rate = rate_low; + dist = dist_low; + } + + if (sharpness == 0 && rd_new_eob < rd) { + for (int ni = 0; ni < *nz_num; ++ni) { + int last_ci = nz_ci[ni]; + levels[get_padded_idx(last_ci, bwl)] = 0; + qcoeff[last_ci] = 0; + dqcoeff[last_ci] = 0; + } + *eob = new_eob; + *nz_num = 0; + *accu_rate = rate_coeff_eob; + *accu_dist = dist_new_eob; + lower_level = lower_level_new_eob; + } 
else { + *accu_rate += rate; + *accu_dist += dist; + } + + if (lower_level) { + qcoeff[ci] = qc_low; + dqcoeff[ci] = dqc_low; + levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX); + } + if (qcoeff[ci]) { + nz_ci[*nz_num] = ci; + ++*nz_num; + } + } +} + +static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob, + int nz_num, int *nz_ci, int64_t rdmult, + int skip_cost, int non_skip_cost, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + int sharpness) { + const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist); + const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0); + if (sharpness == 0 && rd_new_eob < rd) { + for (int i = 0; i < nz_num; ++i) { + const int ci = nz_ci[i]; + qcoeff[ci] = 0; + dqcoeff[ci] = 0; + // no need to set up levels because this is the last step + // levels[get_padded_idx(ci, bwl)] = 0; + } + *accu_rate = 0; + *eob = 0; + } +} + +int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int *rate_cost, + int sharpness, int fast_mode) { + MACROBLOCKD *xd = &x->e_mbd; + struct macroblockd_plane *pd = &xd->plane[plane]; + const struct macroblock_plane *p = &x->plane[plane]; + const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type); + const int16_t *scan = scan_order->scan; + const int shift = av1_get_tx_scale(tx_size); + int eob = p->eobs[block]; + const int16_t *dequant = p->dequant_QTX; + const qm_val_t *iqmatrix = + av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type); + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *qcoeff = p->qcoeff + block_offset; + tran_low_t *dqcoeff = pd->dqcoeff + block_offset; + const tran_low_t *tcoeff = p->coeff + block_offset; + + // This function is not called if eob = 0. + assert(eob > 0); + + if (fast_mode) { + update_coeff_eob_fast(&eob, shift, dequant, scan, tcoeff, qcoeff, dqcoeff); + p->eobs[block] = eob; + if (eob == 0) { + *rate_cost = av1_cost_skip_txb(x, txb_ctx, plane, tx_size); + return eob; + } + } + + const AV1_COMMON *cm = &cpi->common; + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const MB_MODE_INFO *mbmi = xd->mi[0]; + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + assert(width == (1 << bwl)); + const int is_inter = is_inter_block(mbmi); + const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type]; + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST *txb_eob_costs = + &x->eob_costs[eob_multi_size][plane_type]; + + const int rshift = + (sharpness + + (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4 + ? 7 - mbmi->segment_id + : 2) + + (cpi->oxcf.aq_mode != VARIANCE_AQ && + cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL && + cm->delta_q_info.delta_q_present_flag && x->sb_energy_level < 0 + ? 
(3 - x->sb_energy_level) + : 0)); + const int64_t rdmult = + (((int64_t)x->rdmult * + (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) + + 2) >> + rshift; + + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + + if (eob > 1) av1_txb_init_levels(qcoeff, width, height, levels); + + // TODO(angirbird): check iqmatrix + + const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0]; + const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; + const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class); + int accu_rate = eob_cost; + int64_t accu_dist = 0; + int si = eob - 1; + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const tran_low_t abs_qc = abs(qc); + const int sign = qc < 0; + const int max_nz_num = 2; + int nz_num = 1; + int nz_ci[3] = { ci, 0, 0 }; + if (abs_qc >= 2) { + update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class, + bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx, + dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff, + levels, iqmatrix); + --si; + } else { + assert(abs_qc == 1); + const int coeff_ctx = get_lower_levels_ctx_eob(bwl, height, si); + accu_rate += + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx, txb_ctx->dc_sign_ctx, + txb_costs, bwl, tx_class); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int64_t dist = get_coeff_dist(tqc, dqc, shift); + const int64_t dist0 = get_coeff_dist(tqc, 0, shift); + accu_dist += dist - dist0; + --si; + } + +#define UPDATE_COEFF_EOB_CASE(tx_class_literal) \ + case tx_class_literal: \ + for (; si >= 0 && nz_num <= max_nz_num && !fast_mode; --si) { \ + update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si, \ + tx_size, tx_class_literal, bwl, height, \ + txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \ + txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff, \ + levels, sharpness, iqmatrix); \ + } \ + break; + switch (tx_class) { + UPDATE_COEFF_EOB_CASE(TX_CLASS_2D); + UPDATE_COEFF_EOB_CASE(TX_CLASS_HORIZ); + UPDATE_COEFF_EOB_CASE(TX_CLASS_VERT); +#undef UPDATE_COEFF_EOB_CASE + default: assert(false); + } + + if (si == -1 && nz_num <= max_nz_num) { + update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost, + non_skip_cost, qcoeff, dqcoeff, sharpness); + } + +#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal) \ + case tx_class_literal: \ + for (; si >= 1; --si) { \ + update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bwl, \ + rdmult, shift, dequant, scan, txb_costs, tcoeff, \ + qcoeff, dqcoeff, levels, iqmatrix); \ + } \ + break; + switch (tx_class) { + UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D); + UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_HORIZ); + UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_VERT); +#undef UPDATE_COEFF_SIMPLE_CASE + default: assert(false); + } + + // DC position + if (si == 0) { + // no need to update accu_dist because it's not used after this point + int64_t dummy_dist = 0; + update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class, + bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx, + dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff, + levels, iqmatrix); + } + + const int tx_type_cost = get_tx_type_cost(x, xd, plane, tx_size, tx_type, + cm->features.reduced_tx_set_used); + if (eob == 0) + accu_rate += skip_cost; + else + accu_rate += non_skip_cost + tx_type_cost; + + p->eobs[block] = eob; + p->txb_entropy_ctx[block] = + av1_get_txb_entropy_context(qcoeff, scan_order, p->eobs[block]); + + *rate_cost = 
accu_rate; + return eob; +} + +// This function is deprecated, but we keep it here because hash trellis +// is not integrated with av1_optimize_txb_new yet +int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, + int blk_row, int blk_col, int block, TX_SIZE tx_size, + TXB_CTX *txb_ctx, int fast_mode, int *rate_cost) { + const AV1_COMMON *cm = &cpi->common; + const int reduced_tx_set_used = cm->features.reduced_tx_set_used; + MACROBLOCKD *const xd = &x->e_mbd; + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, + tx_size, reduced_tx_set_used); + const MB_MODE_INFO *mbmi = xd->mi[0]; + const struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const int eob = p->eobs[block]; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *qcoeff = p->qcoeff + block_offset; + tran_low_t *dqcoeff = pd->dqcoeff + block_offset; + const tran_low_t *tcoeff = p->coeff + block_offset; + const int16_t *dequant = p->dequant_QTX; + const int seg_eob = av1_get_max_eob(tx_size); + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const int is_inter = is_inter_block(mbmi); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type]; + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST txb_eob_costs = + x->eob_costs[eob_multi_size][plane_type]; + + const int shift = av1_get_tx_scale(tx_size); + const int64_t rdmult = + (((int64_t)x->rdmult * plane_rd_mult[is_inter][plane_type] + << (2 * (xd->bd - 8))) + + 2) >> + 2; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + const qm_val_t *iqmatrix = + av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type); + assert(width == (1 << bwl)); + const int tx_type_cost = + get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used); + TxbInfo txb_info = { + qcoeff, levels, dqcoeff, tcoeff, dequant, shift, tx_size, + txs_ctx, tx_type, bwl, width, height, eob, seg_eob, + scan_order, txb_ctx, rdmult, iqmatrix, tx_type_cost, + }; + +#if CONFIG_HTB_TRELLIS + // Hash based trellis (hbt) speed feature: avoid expensive optimize_txb calls + // by storing the coefficient deltas in a hash table. 
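+  // hbt_create_hashes() (defined above) derives two 31-bit CRC32-C keys: a
+  // qc_hash over the quantized coefficients (clamped to the 0..3 range and
+  // packed four per byte, two bits each), and a ctx_hash over the
+  // coefficient deltas plus the surrounding context (dequants, skip/sign
+  // contexts, eob, rdmult, tx_type, and slices of the cost tables). As an
+  // illustration (not from the source): coefficients {1, 3, 0, 2} pack into
+  // the single byte 1 | (3 << 2) | (0 << 4) | (2 << 6) = 0x8D before
+  // hashing.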
+ // Currently disabled in speedfeatures.c + if (eob <= HBT_EOB && eob > 0 && cpi->sf.use_hash_based_trellis) { + return hbt_create_hashes(&txb_info, txb_costs, &txb_eob_costs, p, block, + fast_mode, rate_cost); + } +#else + (void)fast_mode; +#endif // CONFIG_HTB_TRELLIS + av1_txb_init_levels(qcoeff, width, height, levels); + + const int update = + optimize_txb(&txb_info, txb_costs, &txb_eob_costs, rate_cost); + + if (update) { + p->eobs[block] = txb_info.eob; + p->txb_entropy_ctx[block] = + av1_get_txb_entropy_context(qcoeff, scan_order, txb_info.eob); + } + return txb_info.eob; +} + +int av1_get_txb_entropy_context(const tran_low_t *qcoeff, + const SCAN_ORDER *scan_order, int eob) { + const int16_t *const scan = scan_order->scan; + int cul_level = 0; + int c; + + if (eob == 0) return 0; + for (c = 0; c < eob; ++c) { + cul_level += abs(qcoeff[scan[c]]); + if (cul_level > COEFF_CONTEXT_MASK) break; + } + + cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level); + set_dc_sign(&cul_level, qcoeff[0]); + + return cul_level; +} + +static void update_tx_type_count(const AV1_COMP *cpi, const AV1_COMMON *cm, + MACROBLOCKD *xd, int blk_row, int blk_col, + int plane, TX_SIZE tx_size, + FRAME_COUNTS *counts, + uint8_t allow_update_cdf) { + MB_MODE_INFO *mbmi = xd->mi[0]; + int is_inter = is_inter_block(mbmi); + const int reduced_tx_set_used = cm->features.reduced_tx_set_used; + FRAME_CONTEXT *fc = xd->tile_ctx; +#if !CONFIG_ENTROPY_STATS + (void)counts; +#endif // !CONFIG_ENTROPY_STATS + + // Only y plane's tx_type is updated + if (plane > 0) return; + const TX_TYPE tx_type = av1_get_tx_type(xd, PLANE_TYPE_Y, blk_row, blk_col, + tx_size, reduced_tx_set_used); + if (is_inter) { + if (cpi->oxcf.use_inter_dct_only) { + assert(tx_type == DCT_DCT); + } + } else { + if (cpi->oxcf.use_intra_dct_only) { + assert(tx_type == DCT_DCT); + } else if (cpi->oxcf.use_intra_default_tx_only) { + const TX_TYPE default_type = get_default_tx_type( + PLANE_TYPE_Y, xd, tx_size, cpi->is_screen_content_type); + (void)default_type; + assert(tx_type == default_type); + } + } + + if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 && + cm->quant_params.base_qindex > 0 && !mbmi->skip && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const int eset = get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used); + if (eset > 0) { + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, is_inter, reduced_tx_set_used); + if (is_inter) { + if (allow_update_cdf) { + update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]], + av1_ext_tx_ind[tx_set_type][tx_type], + av1_num_ext_tx_set[tx_set_type]); + } +#if CONFIG_ENTROPY_STATS + ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]] + [av1_ext_tx_ind[tx_set_type][tx_type]]; +#endif // CONFIG_ENTROPY_STATS + } else { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info + .filter_intra_mode]; + else + intra_dir = mbmi->mode; +#if CONFIG_ENTROPY_STATS + ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][intra_dir] + [av1_ext_tx_ind[tx_set_type][tx_type]]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) { + update_cdf( + fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][intra_dir], + av1_ext_tx_ind[tx_set_type][tx_type], + av1_num_ext_tx_set[tx_set_type]); + } + } + } + } +} + +void av1_update_and_record_txb_context(int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct tokenize_b_args 
*const args = arg; + const AV1_COMP *cpi = args->cpi; + const AV1_COMMON *cm = &cpi->common; + ThreadData *const td = args->td; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const int eob = p->eobs[block]; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *qcoeff = p->qcoeff + block_offset; + const PLANE_TYPE plane_type = pd->plane_type; + const TX_TYPE tx_type = + av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + tran_low_t *tcoeff; + assert(args->dry_run != DRY_RUN_COSTCOEFFS); + if (args->dry_run == OUTPUT_ENABLED) { + MB_MODE_INFO *mbmi = xd->mi[0]; + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, + pd->above_entropy_context + blk_col, + pd->left_entropy_context + blk_row, &txb_ctx); + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const uint8_t allow_update_cdf = args->allow_update_cdf; + const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; +#if CONFIG_ENTROPY_STATS + int cdf_idx = cm->coef_cdf_category; + ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) { + update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx], + eob == 0, 2); + } + + CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff; + const int txb_offset = + x->mbmi_ext_frame->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); + uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset; + uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset; + entropy_ctx[block] = txb_ctx.txb_skip_ctx; + eob_txb[block] = eob; + + if (eob == 0) { + av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, + blk_row); + return; + } + const int segment_id = mbmi->segment_id; + const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size); + tran_low_t *tcoeff_txb = + cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset; + tcoeff = tcoeff_txb + block_offset; + memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob); + + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + av1_txb_init_levels(tcoeff, width, height, levels); + update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size, + td->counts, allow_update_cdf); + + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const int16_t *const scan = scan_order->scan; + + // record tx type usage + td->rd_counts.tx_type_used[tx_size][tx_type]++; + +#if CONFIG_ENTROPY_STATS + av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, + td->counts, allow_update_cdf); +#else + av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx, + allow_update_cdf); +#endif + + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, + coeff_contexts); + + for (int c = eob - 1; c >= 0; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = qcoeff[pos]; + const tran_low_t level = abs(v); + + if (allow_update_cdf) { + if (c == eob - 1) { + assert(coeff_ctx < 4); + update_cdf( + ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx], + AOMMIN(level, 3) - 1, 3); + } else { + 
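+          // Positions before the last significant coefficient may be zero,
+          // so the base CDF below has four symbols (clamped levels 0..3);
+          // at eob - 1 the level is known to be nonzero, which is why the
+          // EOB variant above codes level - 1 over three symbols.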
update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx], + AOMMIN(level, 3), 4); + } + } + if (c == eob - 1) { + assert(coeff_ctx < 4); +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type] + [coeff_ctx][AOMMIN(level, 3) - 1]; + } else { + ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type] + [coeff_ctx][AOMMIN(level, 3)]; +#endif + } + if (level > NUM_BASE_LEVELS) { + const int base_range = level - 1 - NUM_BASE_LEVELS; + const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class); + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); + if (allow_update_cdf) { + update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)] + [plane_type][br_ctx], + k, BR_CDF_SIZE); + } + for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) { +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type] + [lps][br_ctx][lps == k]; +#endif // CONFIG_ENTROPY_STATS + if (lps == k) break; + } +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)] + [plane_type][br_ctx][k]; +#endif + if (k < BR_CDF_SIZE - 1) break; + } + } + } + // Update the context needed to code the DC sign (if applicable) + if (tcoeff[0] != 0) { + const int dc_sign = (tcoeff[0] < 0) ? 1 : 0; + const int dc_sign_ctx = txb_ctx.dc_sign_ctx; +#if CONFIG_ENTROPY_STATS + ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) + update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2); + entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT; + } + } else { + tcoeff = qcoeff; + } + const int cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob); + av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, + blk_col, blk_row); +} + +void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td, + RUN_TYPE dry_run, BLOCK_SIZE bsize, + uint8_t allow_update_cdf) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run }; + if (mbmi->skip) { + av1_reset_entropy_context(xd, bsize, num_planes); + return; + } + + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + av1_foreach_transformed_block_in_plane( + xd, plane_bsize, plane, av1_update_and_record_txb_context, &arg); + } +} + +CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row, + int mi_col) { + const AV1_COMMON *const cm = &cpi->common; + const int mib_size_log2 = cm->seq_params.mib_size_log2; + const int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1; + const int offset = + (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); + return cpi->coeff_buffer_base + offset; +} diff --git a/libs/libaom/src/av1/encoder/encodetxb.h b/libs/libaom/src/av1/encoder/encodetxb.h new file mode 100644 index 000000000..7122895d1 --- /dev/null +++ b/libs/libaom/src/av1/encoder/encodetxb.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENCODETXB_H_ +#define AOM_AV1_ENCODER_ENCODETXB_H_ + +#include "config/aom_config.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/txb_common.h" +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" +#include "aom_dsp/bitwriter.h" +#ifdef __cplusplus +extern "C" { +#endif + +#define TXB_SKIP_CTX_MASK 15 +#define DC_SIGN_CTX_SHIFT 4 +#define DC_SIGN_CTX_MASK 3 + +typedef struct TxbInfo { + tran_low_t *qcoeff; + uint8_t *levels; // absolute values and clamped to 255. + tran_low_t *dqcoeff; + const tran_low_t *tcoeff; + const int16_t *dequant; + int shift; + TX_SIZE tx_size; + TX_SIZE txs_ctx; + TX_TYPE tx_type; + int bwl; + int width; + int height; + int eob; + int seg_eob; + const SCAN_ORDER *scan_order; + TXB_CTX *txb_ctx; + int64_t rdmult; + const qm_val_t *iqmatrix; + int tx_type_cost; +} TxbInfo; + +void av1_alloc_txb_buf(AV1_COMP *cpi); +void av1_free_txb_buf(AV1_COMP *cpi); +int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block, + const TX_SIZE tx_size, const TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int reduced_tx_set_used); +int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane, + const int block, const TX_SIZE tx_size, + const TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, + const int reduced_tx_set_used, + const int adjust_eob); +int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane, + const int block, const TX_SIZE tx_size, + const TX_TYPE tx_type); +void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x, + aom_writer *w, int blk_row, int blk_col, int plane, + int block, TX_SIZE tx_size); +void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, + aom_writer *w, BLOCK_SIZE bsize); +int av1_get_txb_entropy_context(const tran_low_t *qcoeff, + const SCAN_ORDER *scan_order, int eob); +void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td, + RUN_TYPE dry_run, BLOCK_SIZE bsize, + uint8_t allow_update_cdf); +void av1_update_and_record_txb_context(int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg); +#if CONFIG_HTB_TRELLIS +void hbt_destroy(); +#endif // CONFIG_HTB_TRELLIS +int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int *rate_cost, + int sharpness, int fast_mode); + +CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row, + int mi_col); + +// These numbers are empirically obtained. +static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { + { 17, 13 }, + { 16, 10 }, +}; + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_ENCODER_ENCODETXB_H_ diff --git a/libs/libaom/src/av1/encoder/ethread.c b/libs/libaom/src/av1/encoder/ethread.c new file mode 100644 index 000000000..693270b87 --- /dev/null +++ b/libs/libaom/src/av1/encoder/ethread.c @@ -0,0 +1,729 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/av1_multi_thread.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/rdopt.h" +#include "aom_dsp/aom_dsp_common.h" + +static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { + for (int i = 0; i < REFERENCE_MODES; i++) + td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i]; + + for (int i = 0; i < REF_FRAMES; i++) + td->rd_counts.global_motion_used[i] += + td_t->rd_counts.global_motion_used[i]; + + td->rd_counts.compound_ref_used_flag |= + td_t->rd_counts.compound_ref_used_flag; + td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag; + + for (int i = 0; i < TX_SIZES_ALL; i++) { + for (int j = 0; j < TX_TYPES; j++) + td->rd_counts.tx_type_used[i][j] += td_t->rd_counts.tx_type_used[i][j]; + } + + for (int i = 0; i < BLOCK_SIZES_ALL; i++) { + for (int j = 0; j < 2; j++) { + td->rd_counts.obmc_used[i][j] += td_t->rd_counts.obmc_used[i][j]; + } + } + + for (int i = 0; i < 2; i++) { + td->rd_counts.warped_used[i] += td_t->rd_counts.warped_used[i]; + } +} + +static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + const int mib_size = cm->seq_params.mib_size; + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int row = 0; row < cm->tiles.rows; row++) { + for (int col = 0; col < cm->tiles.cols; col++) { + TileDataEnc *tile_data = &cpi->tile_data[row * cm->tiles.cols + col]; + const TileInfo *const tile_info = &tile_data->tile_info; + for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; + mi_row += mib_size) { + if (mi_row == tile_info->mi_row_start) + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); + for (int mi_col = tile_info->mi_col_start; + mi_col < tile_info->mi_col_end; mi_col += mib_size) { + const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col; + MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str; + MB_MODE_INFO *mbmi = mi[0]; + if (mbmi->skip == 1 && (mbmi->sb_type == cm->seq_params.sb_size)) { + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) + mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id]; + mbmi->delta_lf_from_base = xd->delta_lf_from_base; + } else { + if (cm->delta_q_info.delta_lf_multi) { + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; + } else { + xd->delta_lf_from_base = mbmi->delta_lf_from_base; + } + } + } + } + } + } +} + +void av1_row_mt_sync_read_dummy(struct AV1RowMTSyncData *const row_mt_sync, + int r, int c) { + (void)row_mt_sync; + (void)r; + (void)c; + return; +} + +void av1_row_mt_sync_write_dummy(struct AV1RowMTSyncData *const row_mt_sync, + int r, int c, const int cols) { + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; + return; +} + +void av1_row_mt_sync_read(AV1RowMTSync *const row_mt_sync, int r, int c) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + + if (r) { + pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1]; + pthread_mutex_lock(mutex); + + while (c > row_mt_sync->cur_col[r - 1] - nsync) { + pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +void av1_row_mt_sync_write(AV1RowMTSync *const row_mt_sync, int r, int c, + const int cols) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + int cur; + // Only signal when there are enough encoded blocks for next row to run. + int sig = 1; + + if (c < cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&row_mt_sync->mutex_[r]); + + row_mt_sync->cur_col[r] = cur; + + pthread_cond_signal(&row_mt_sync->cond_[r]); + pthread_mutex_unlock(&row_mt_sync->mutex_[r]); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; +#endif // CONFIG_MULTITHREAD +} + +// Allocate memory for row synchronization +void av1_row_mt_sync_mem_alloc(AV1RowMTSync *row_mt_sync, AV1_COMMON *cm, + int rows) { + row_mt_sync->rows = rows; +#if CONFIG_MULTITHREAD + { + int i; + + CHECK_MEM_ERROR(cm, row_mt_sync->mutex_, + aom_malloc(sizeof(*row_mt_sync->mutex_) * rows)); + if (row_mt_sync->mutex_) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&row_mt_sync->mutex_[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, row_mt_sync->cond_, + aom_malloc(sizeof(*row_mt_sync->cond_) * rows)); + if (row_mt_sync->cond_) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&row_mt_sync->cond_[i], NULL); + } + } + } +#endif // CONFIG_MULTITHREAD + + CHECK_MEM_ERROR(cm, row_mt_sync->cur_col, + aom_malloc(sizeof(*row_mt_sync->cur_col) * rows)); + + // Set up nsync. 
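+  // With sync_range == 1, av1_row_mt_sync_read() lets a thread start
+  // superblock column c of row r only once row r - 1 has signalled column
+  // c + 1, i.e. each row trails the one above it by at least one
+  // superblock (the usual top-right dependency).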
+ row_mt_sync->sync_range = 1; +} + +// Deallocate row based multi-threading synchronization related mutex and data +void av1_row_mt_sync_mem_dealloc(AV1RowMTSync *row_mt_sync) { + if (row_mt_sync != NULL) { +#if CONFIG_MULTITHREAD + int i; + + if (row_mt_sync->mutex_ != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_mutex_destroy(&row_mt_sync->mutex_[i]); + } + aom_free(row_mt_sync->mutex_); + } + if (row_mt_sync->cond_ != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_cond_destroy(&row_mt_sync->cond_[i]); + } + aom_free(row_mt_sync->cond_); + } +#endif // CONFIG_MULTITHREAD + aom_free(row_mt_sync->cur_col); + // clear the structure as the source of this call may be dynamic change + // in tiles in which case this call will be followed by an _alloc() + // which may fail. + av1_zero(*row_mt_sync); + } +} + +static AOM_INLINE void assign_tile_to_thread( + MultiThreadHandle *multi_thread_ctxt, int num_tiles, int num_workers) { + int tile_id = 0; + int i; + + for (i = 0; i < num_workers; i++) { + multi_thread_ctxt->thread_id_to_tile_id[i] = tile_id++; + if (tile_id == num_tiles) tile_id = 0; + } +} + +static int get_next_job(AV1_COMP *const cpi, int *current_mi_row, + int cur_tile_id) { + AV1_COMMON *const cm = &cpi->common; + TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id]; + AV1RowMTInfo *row_mt_info = &this_tile->row_mt_info; + + if (row_mt_info->current_mi_row < this_tile->tile_info.mi_row_end) { + *current_mi_row = row_mt_info->current_mi_row; + row_mt_info->num_threads_working++; + row_mt_info->current_mi_row += cm->seq_params.mib_size; + return 1; + } + return 0; +} + +static AOM_INLINE void switch_tile_and_get_next_job(AV1_COMP *const cpi, + int *cur_tile_id, + int *current_mi_row, + int *end_of_frame) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + + int tile_id = -1; // Stores the tile ID with minimum proc done + int max_mis_to_encode = 0; + int min_num_threads_working = INT_MAX; + + for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + for (int tile_col = 0; tile_col < tile_cols; tile_col++) { + int tile_index = tile_row * tile_cols + tile_col; + TileDataEnc *this_tile = &cpi->tile_data[tile_index]; + AV1RowMTInfo *row_mt_info = &this_tile->row_mt_info; + int num_sb_rows_in_tile = + av1_get_sb_rows_in_tile(cm, this_tile->tile_info); + int num_sb_cols_in_tile = + av1_get_sb_cols_in_tile(cm, this_tile->tile_info); + int theoretical_limit_on_threads = + AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile); + int num_threads_working = row_mt_info->num_threads_working; + if (num_threads_working < theoretical_limit_on_threads) { + int num_mis_to_encode = + this_tile->tile_info.mi_row_end - row_mt_info->current_mi_row; + + // Tile to be processed by this thread is selected on the basis of + // availability of jobs: + // 1) If jobs are available, tile to be processed is chosen on the + // basis of minimum number of threads working for that tile. If two or + // more tiles have same number of threads working for them, then the + // tile with maximum number of jobs available will be chosen. + // 2) If no jobs are available, then end_of_frame is reached. 
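+        // For example (illustrative numbers, not from the source): if
+        // tile 0 has two threads working with eight SB rows left and
+        // tile 1 has one thread working with three rows left, tile 1 is
+        // chosen; the smaller thread count decides, and the remaining job
+        // count only breaks ties.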
+ if (num_mis_to_encode > 0) { + if (num_threads_working < min_num_threads_working) { + min_num_threads_working = num_threads_working; + max_mis_to_encode = 0; + } + if (num_threads_working == min_num_threads_working && + num_mis_to_encode > max_mis_to_encode) { + tile_id = tile_index; + max_mis_to_encode = num_mis_to_encode; + } + } + } + } + } + if (tile_id == -1) { + *end_of_frame = 1; + } else { + // Update the cur ID to the next tile ID that will be processed, + // which will be the least processed tile + *cur_tile_id = tile_id; + get_next_job(cpi, current_mi_row, *cur_tile_id); + } +} + +static int enc_row_mt_worker_hook(void *arg1, void *unused) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + AV1_COMP *const cpi = thread_data->cpi; + AV1_COMMON *const cm = &cpi->common; + + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + (void)unused; + + assert(cur_tile_id != -1); + + int end_of_frame = 0; + while (1) { + int current_mi_row = -1; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(cpi->row_mt_mutex_); +#endif + if (!get_next_job(cpi, ¤t_mi_row, cur_tile_id)) { + // No jobs are available for the current tile. Query for the status of + // other tiles and get the next job if available + switch_tile_and_get_next_job(cpi, &cur_tile_id, ¤t_mi_row, + &end_of_frame); + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(cpi->row_mt_mutex_); +#endif + if (end_of_frame == 1) break; + + TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id]; + int tile_row = this_tile->tile_info.tile_row; + int tile_col = this_tile->tile_info.tile_col; + + assert(current_mi_row != -1 && + current_mi_row <= this_tile->tile_info.mi_row_end); + + ThreadData *td = thread_data->td; + + td->mb.e_mbd.tile_ctx = td->tctx; + td->mb.tile_pb_ctx = &this_tile->tctx; + if (this_tile->allow_update_cdf) { + td->mb.row_ctx = this_tile->row_ctx; + if (current_mi_row == this_tile->tile_info.mi_row_start) + memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT)); + } else { + memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT)); + } + + av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, + &td->mb.e_mbd); + + cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params); + av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator); + + av1_encode_sb_row(cpi, td, tile_row, tile_col, current_mi_row); +#if CONFIG_MULTITHREAD + pthread_mutex_lock(cpi->row_mt_mutex_); +#endif + this_tile->row_mt_info.num_threads_working--; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(cpi->row_mt_mutex_); +#endif + } + + return 1; +} + +static int enc_worker_hook(void *arg1, void *unused) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + AV1_COMP *const cpi = thread_data->cpi; + const AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int t; + + (void)unused; + + for (t = thread_data->start; t < tile_rows * tile_cols; + t += cpi->num_workers) { + int tile_row = t / tile_cols; + int tile_col = t % tile_cols; + + TileDataEnc *const this_tile = + &cpi->tile_data[tile_row * cm->tiles.cols + tile_col]; + thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx; + thread_data->td->mb.tile_pb_ctx = &this_tile->tctx; + av1_encode_tile(cpi, thread_data->td, tile_row, tile_col); + } + + return 1; +} + +static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) { + AV1_COMMON *const 
cm = &cpi->common; + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int sb_mi_size = av1_get_sb_mi_size(cm); + + CHECK_MEM_ERROR(cm, cpi->workers, + aom_malloc(num_workers * sizeof(*cpi->workers))); + + CHECK_MEM_ERROR(cm, cpi->tile_thr_data, + aom_calloc(num_workers, sizeof(*cpi->tile_thr_data))); + +#if CONFIG_MULTITHREAD + if (cpi->oxcf.row_mt == 1) { + if (cpi->row_mt_mutex_ == NULL) { + CHECK_MEM_ERROR(cm, cpi->row_mt_mutex_, + aom_malloc(sizeof(*(cpi->row_mt_mutex_)))); + if (cpi->row_mt_mutex_) pthread_mutex_init(cpi->row_mt_mutex_, NULL); + } + } +#endif + + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; + + ++cpi->num_workers; + winterface->init(worker); + worker->thread_name = "aom enc worker"; + + thread_data->cpi = cpi; + thread_data->thread_id = i; + + if (i > 0) { + // Allocate thread data. + CHECK_MEM_ERROR(cm, thread_data->td, + aom_memalign(32, sizeof(*thread_data->td))); + av1_zero(*thread_data->td); + + // Set up pc_tree. + thread_data->td->pc_tree = NULL; + av1_setup_pc_tree(cpi, thread_data->td); + + CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf, + (uint8_t *)aom_memalign( + 16, MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->above_pred_buf))); + CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf, + (uint8_t *)aom_memalign( + 16, MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->left_pred_buf))); + + CHECK_MEM_ERROR( + cm, thread_data->td->wsrc_buf, + (int32_t *)aom_memalign( + 16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf))); + + CHECK_MEM_ERROR(cm, thread_data->td->inter_modes_info, + (InterModesInfo *)aom_malloc( + sizeof(*thread_data->td->inter_modes_info))); + + for (int x = 0; x < 2; x++) + for (int y = 0; y < 2; y++) + CHECK_MEM_ERROR( + cm, thread_data->td->hash_value_buffer[x][y], + (uint32_t *)aom_malloc( + AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*thread_data->td->hash_value_buffer[0][0]))); + + CHECK_MEM_ERROR( + cm, thread_data->td->mask_buf, + (int32_t *)aom_memalign( + 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf))); + // Allocate frame counters in thread data. + CHECK_MEM_ERROR(cm, thread_data->td->counts, + aom_calloc(1, sizeof(*thread_data->td->counts))); + + // Allocate buffers used by palette coding mode. + CHECK_MEM_ERROR( + cm, thread_data->td->palette_buffer, + aom_memalign(16, sizeof(*thread_data->td->palette_buffer))); + + av1_alloc_compound_type_rd_buffers(cm, &thread_data->td->comp_rd_buffer); + + CHECK_MEM_ERROR( + cm, thread_data->td->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * + sizeof(*thread_data->td->tmp_conv_dst))); + for (int j = 0; j < 2; ++j) { + CHECK_MEM_ERROR( + cm, thread_data->td->tmp_obmc_bufs[j], + aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->tmp_obmc_bufs[j]))); + } + + CHECK_MEM_ERROR( + cm, thread_data->td->mbmi_ext, + aom_calloc(sb_mi_size, sizeof(*thread_data->td->mbmi_ext))); + + if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) { + const int num_64x64_blocks = + (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4; + CHECK_MEM_ERROR( + cm, thread_data->td->vt64x64, + aom_malloc(sizeof(*thread_data->td->vt64x64) * num_64x64_blocks)); + } + + // Create threads + if (!winterface->reset(worker)) + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Tile encoder thread creation failed"); + } else { + // Main thread acts as a worker and uses the thread data in cpi. 
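+      // Worker 0 therefore needs no separate allocation;
+      // launch_enc_workers() below runs it synchronously via
+      // winterface->execute() rather than spawning a thread with
+      // winterface->launch().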
+ thread_data->td = &cpi->td; + } + if (cpi->oxcf.row_mt == 1) + CHECK_MEM_ERROR( + cm, thread_data->td->tctx, + (FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx))); + winterface->sync(worker); + } +} + +static AOM_INLINE void launch_enc_workers(AV1_COMP *cpi, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + // Encode a frame + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + + // Set the starting tile for each thread. + thread_data->start = i; + + if (i == 0) + winterface->execute(worker); + else + winterface->launch(worker); + } +} + +static AOM_INLINE void sync_enc_workers(AV1_COMP *cpi, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int had_error = 0; + + // Encoding ends. + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &cpi->workers[i]; + had_error |= !winterface->sync(worker); + } + + if (had_error) + aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, + "Failed to encode tile data"); +} + +static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi, + int num_workers) { + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + cpi->intrabc_used |= thread_data->td->intrabc_used; + cpi->deltaq_used |= thread_data->td->deltaq_used; + + // Accumulate counters. + if (i > 0) { + av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts); + accumulate_rd_opt(&cpi->td, thread_data->td); + cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count; +#if CONFIG_SPEED_STATS + cpi->td.mb.tx_search_count += thread_data->td->mb.tx_search_count; +#endif // CONFIG_SPEED_STATS + } + } +} + +static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; + + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = NULL; + + thread_data->td->intrabc_used = 0; + thread_data->td->deltaq_used = 0; + + // Before encoding a frame, copy the thread data from cpi. 
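+    // The struct assignment below copies cpi->td.mb wholesale, which would
+    // leave its scratch-buffer pointers aimed at the main thread's
+    // allocations; the stores that follow re-point them at this worker's
+    // own buffers.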
+ if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + thread_data->td->rd_counts = cpi->td.rd_counts; + thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf; + thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf; + thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf; + + thread_data->td->mb.inter_modes_info = thread_data->td->inter_modes_info; + for (int x = 0; x < 2; x++) { + for (int y = 0; y < 2; y++) { + memcpy(thread_data->td->hash_value_buffer[x][y], + cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y], + AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*thread_data->td->hash_value_buffer[0][0])); + thread_data->td->mb.intrabc_hash_info.hash_value_buffer[x][y] = + thread_data->td->hash_value_buffer[x][y]; + } + } + thread_data->td->mb.mask_buf = thread_data->td->mask_buf; + thread_data->td->mb.mbmi_ext = thread_data->td->mbmi_ext; + } + if (thread_data->td->counts != &cpi->counts) { + memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts)); + } + + if (i > 0) { + thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer; + thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer; + thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + thread_data->td->mb.tmp_obmc_bufs[j] = + thread_data->td->tmp_obmc_bufs[j]; + } + + thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] = + thread_data->td->mb.tmp_obmc_bufs[j]; + } + } + } +} + +void av1_encode_tiles_mt(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols * tile_rows); + + if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) + av1_alloc_tile_data(cpi); + + av1_init_tile_data(cpi); + // Only run once to create threads and allocate thread data. + if (cpi->num_workers == 0) { + create_enc_workers(cpi, num_workers); + } else { + num_workers = AOMMIN(num_workers, cpi->num_workers); + } + prepare_enc_workers(cpi, enc_worker_hook, num_workers); + launch_enc_workers(cpi, num_workers); + sync_enc_workers(cpi, num_workers); + accumulate_counters_enc_workers(cpi, num_workers); +} + +// Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int' +// members, so we treat it as an array, and sum over the whole length. 
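+// This flattening is sound only while FRAME_COUNTS really holds nothing but
+// unsigned int members (and thus no internal padding). A static assertion
+// that sizeof(FRAME_COUNTS) is a multiple of sizeof(unsigned int) would be
+// a cheap compile-time guard (a sketch of an extra check, not present in
+// the source).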
+void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts, + const FRAME_COUNTS *counts) { + unsigned int *const acc = (unsigned int *)acc_counts; + const unsigned int *const cnt = (const unsigned int *)counts; + + const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int); + + for (unsigned int i = 0; i < n_counts; i++) acc[i] += cnt[i]; +} + +void av1_encode_tiles_row_mt(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int num_workers = 0; + int total_num_threads_row_mt = 0; + int max_sb_rows = 0; + + if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { + av1_row_mt_mem_dealloc(cpi); + av1_alloc_tile_data(cpi); + } + + av1_init_tile_data(cpi); + + for (int row = 0; row < tile_rows; row++) { + for (int col = 0; col < tile_cols; col++) { + TileDataEnc *tile_data = &cpi->tile_data[row * cm->tiles.cols + col]; + int num_sb_rows_in_tile = + av1_get_sb_rows_in_tile(cm, tile_data->tile_info); + int num_sb_cols_in_tile = + av1_get_sb_cols_in_tile(cm, tile_data->tile_info); + total_num_threads_row_mt += + AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile); + max_sb_rows = AOMMAX(max_sb_rows, num_sb_rows_in_tile); + } + } + // TODO(ravi.chaudhary@ittiam.com): Currently the percentage of + // post-processing stages in encoder is quiet low, so limiting the number of + // threads to the theoretical limit in row-mt does not have much impact on + // post-processing multi-threading stage. Need to revisit this when + // post-processing time starts shooting up. + num_workers = AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt); + + if (multi_thread_ctxt->allocated_tile_cols != tile_cols || + multi_thread_ctxt->allocated_tile_rows != tile_rows || + multi_thread_ctxt->allocated_sb_rows != max_sb_rows) { + av1_row_mt_mem_dealloc(cpi); + av1_row_mt_mem_alloc(cpi, max_sb_rows); + } + + memset(multi_thread_ctxt->thread_id_to_tile_id, -1, + sizeof(*multi_thread_ctxt->thread_id_to_tile_id) * MAX_NUM_THREADS); + + for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + for (int tile_col = 0; tile_col < tile_cols; tile_col++) { + int tile_id = tile_row * tile_cols + tile_col; + TileDataEnc *this_tile = &cpi->tile_data[tile_id]; + + // Initialize cur_col to -1 for all rows. + memset(this_tile->row_mt_sync.cur_col, -1, + sizeof(*this_tile->row_mt_sync.cur_col) * max_sb_rows); + this_tile->row_mt_info.current_mi_row = this_tile->tile_info.mi_row_start; + this_tile->row_mt_info.num_threads_working = 0; + + av1_inter_mode_data_init(this_tile); + av1_zero_above_context(cm, &cpi->td.mb.e_mbd, + this_tile->tile_info.mi_col_start, + this_tile->tile_info.mi_col_end, tile_row); + } + } + + // Only run once to create threads and allocate thread data. 
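+  // On later frames the pool created here is reused, so num_workers is
+  // clamped below to the number of workers actually created on the first
+  // call.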
+ if (cpi->num_workers == 0) { + create_enc_workers(cpi, num_workers); + } else { + num_workers = AOMMIN(num_workers, cpi->num_workers); + } + assign_tile_to_thread(multi_thread_ctxt, tile_cols * tile_rows, num_workers); + prepare_enc_workers(cpi, enc_row_mt_worker_hook, num_workers); + launch_enc_workers(cpi, num_workers); + sync_enc_workers(cpi, num_workers); + if (cm->delta_q_info.delta_lf_present_flag) update_delta_lf_for_row_mt(cpi); + accumulate_counters_enc_workers(cpi, num_workers); +} diff --git a/libs/libaom/src/av1/encoder/ethread.h b/libs/libaom/src/av1/encoder/ethread.h new file mode 100644 index 000000000..183075950 --- /dev/null +++ b/libs/libaom/src/av1/encoder/ethread.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ETHREAD_H_ +#define AOM_AV1_ENCODER_ETHREAD_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct ThreadData; +struct AV1RowMTSyncData; + +typedef struct EncWorkerData { + struct AV1_COMP *cpi; + struct ThreadData *td; + int start; + int thread_id; +} EncWorkerData; + +void av1_row_mt_sync_read(AV1RowMTSync *const row_mt_sync, int r, int c); +void av1_row_mt_sync_write(AV1RowMTSync *const row_mt_sync, int r, int c, + const int cols); + +void av1_row_mt_sync_read_dummy(struct AV1RowMTSyncData *const row_mt_sync, + int r, int c); +void av1_row_mt_sync_write_dummy(struct AV1RowMTSyncData *const row_mt_sync, + int r, int c, const int cols); + +void av1_row_mt_sync_mem_dealloc(AV1RowMTSync *row_mt_sync); +// Allocate memory for row based multi-threading synchronization. +void av1_row_mt_sync_mem_alloc(AV1RowMTSync *row_mt_sync, struct AV1Common *cm, + int rows); + +void av1_encode_tiles_mt(struct AV1_COMP *cpi); +void av1_encode_tiles_row_mt(struct AV1_COMP *cpi); + +void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts, + const struct FRAME_COUNTS *counts); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ETHREAD_H_ diff --git a/libs/libaom/src/av1/encoder/extend.c b/libs/libaom/src/av1/encoder/extend.c new file mode 100644 index 000000000..934cf5644 --- /dev/null +++ b/libs/libaom/src/av1/encoder/extend.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#include "av1/common/common.h" +#include "av1/encoder/extend.h" + +static void copy_and_extend_plane(const uint8_t *src, int src_pitch, + uint8_t *dst, int dst_pitch, int w, int h, + int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i, linesize; + + // copy the left and right most columns out + const uint8_t *src_ptr1 = src; + const uint8_t *src_ptr2 = src + w - 1; + uint8_t *dst_ptr1 = dst - extend_left; + uint8_t *dst_ptr2 = dst + w; + + for (i = 0; i < h; i++) { + memset(dst_ptr1, src_ptr1[0], extend_left); + memcpy(dst_ptr1 + extend_left, src_ptr1, w); + memset(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_pitch; + src_ptr2 += src_pitch; + dst_ptr1 += dst_pitch; + dst_ptr2 += dst_pitch; + } + + // Now copy the top and bottom lines into each line of the respective + // borders + src_ptr1 = dst - extend_left; + src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; + dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; + dst_ptr2 = dst + dst_pitch * h - extend_left; + linesize = extend_left + extend_right + w; + + for (i = 0; i < extend_top; i++) { + memcpy(dst_ptr1, src_ptr1, linesize); + dst_ptr1 += dst_pitch; + } + + for (i = 0; i < extend_bottom; i++) { + memcpy(dst_ptr2, src_ptr2, linesize); + dst_ptr2 += dst_pitch; + } +} + +static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch, + uint8_t *dst8, int dst_pitch, int w, + int h, int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i, linesize; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + // copy the left and right most columns out + const uint16_t *src_ptr1 = src; + const uint16_t *src_ptr2 = src + w - 1; + uint16_t *dst_ptr1 = dst - extend_left; + uint16_t *dst_ptr2 = dst + w; + + for (i = 0; i < h; i++) { + aom_memset16(dst_ptr1, src_ptr1[0], extend_left); + memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(src_ptr1[0])); + aom_memset16(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_pitch; + src_ptr2 += src_pitch; + dst_ptr1 += dst_pitch; + dst_ptr2 += dst_pitch; + } + + // Now copy the top and bottom lines into each line of the respective + // borders + src_ptr1 = dst - extend_left; + src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; + dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; + dst_ptr2 = dst + dst_pitch * h - extend_left; + linesize = extend_left + extend_right + w; + + for (i = 0; i < extend_top; i++) { + memcpy(dst_ptr1, src_ptr1, linesize * sizeof(src_ptr1[0])); + dst_ptr1 += dst_pitch; + } + + for (i = 0; i < extend_bottom; i++) { + memcpy(dst_ptr2, src_ptr2, linesize * sizeof(src_ptr2[0])); + dst_ptr2 += dst_pitch; + } +} + +void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + // Extend src frame in buffer + const int et_y = dst->border; + const int el_y = dst->border; + const int er_y = + AOMMAX(src->y_width + dst->border, ALIGN_POWER_OF_TWO(src->y_width, 6)) - + src->y_crop_width; + const int eb_y = AOMMAX(src->y_height + dst->border, + ALIGN_POWER_OF_TWO(src->y_height, 6)) - + src->y_crop_height; + const int uv_width_subsampling = (src->uv_width != src->y_width); + const int uv_height_subsampling = (src->uv_height != src->y_height); + const int et_uv = et_y >> uv_height_subsampling; + const int el_uv = el_y >> uv_width_subsampling; + const int eb_uv = eb_y >> uv_height_subsampling; + const int er_uv = er_y >>
uv_width_subsampling; + + if (src->flags & YV12_FLAG_HIGHBITDEPTH) { + highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, src->y_crop_width, + src->y_crop_height, et_y, el_y, eb_y, er_y); + if (src->u_buffer) { + highbd_copy_and_extend_plane( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); + } + if (src->v_buffer) { + highbd_copy_and_extend_plane( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); + } + return; + } + + copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, src->y_crop_width, src->y_crop_height, + et_y, el_y, eb_y, er_y); + if (src->u_buffer) { + copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, src->uv_crop_width, + src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); + } + if (src->v_buffer) { + copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, src->uv_crop_width, + src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); + } +} diff --git a/libs/libaom/src/av1/encoder/extend.h b/libs/libaom/src/av1/encoder/extend.h new file mode 100644 index 000000000..b8cc5b9d2 --- /dev/null +++ b/libs/libaom/src/av1/encoder/extend.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_EXTEND_H_ +#define AOM_AV1_ENCODER_EXTEND_H_ + +#include "aom_scale/yv12config.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_EXTEND_H_ diff --git a/libs/libaom/src/av1/encoder/firstpass.c b/libs/libaom/src/av1/encoder/firstpass.c new file mode 100644 index 000000000..0955510ca --- /dev/null +++ b/libs/libaom/src/av1/encoder/firstpass.c @@ -0,0 +1,1065 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <limits.h> +#include <math.h> +#include <stdio.h> + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/variance.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" +#include "aom_scale/aom_scale.h" +#include "aom_scale/yv12config.h" + +#include "av1/common/entropymv.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" // av1_setup_dst_planes() +#include "av1/common/txb_common.h" +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/block.h" +#include "av1/encoder/dwt.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encode_strategy.h" +#include "av1/encoder/extend.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/reconinter_enc.h" + +#define OUTPUT_FPF 0 + +#define FIRST_PASS_Q 10.0 +#define INTRA_MODE_PENALTY 1024 +#define NEW_MV_MODE_PENALTY 32 +#define DARK_THRESH 64 + +#define NCOUNT_INTRA_THRESH 8192 +#define NCOUNT_INTRA_FACTOR 3 + +static AOM_INLINE void output_stats(FIRSTPASS_STATS *stats, + struct aom_codec_pkt_list *pktlist) { + struct aom_codec_cx_pkt pkt; + pkt.kind = AOM_CODEC_STATS_PKT; + pkt.data.twopass_stats.buf = stats; + pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS); + if (pktlist != NULL) aom_codec_pkt_list_add(pktlist, &pkt); + +// TEMP debug code +#if OUTPUT_FPF + { + FILE *fpfile; + fpfile = fopen("firstpass.stt", "a"); + + fprintf(fpfile, + "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf" + "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" + "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf\n", + stats->frame, stats->weight, stats->intra_error, stats->coded_error, + stats->sr_coded_error, stats->pcnt_inter, stats->pcnt_motion, + stats->pcnt_second_ref, stats->pcnt_neutral, stats->intra_skip_pct, + stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr, + stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv, + stats->MVcv, stats->mv_in_out_count, stats->new_mv_count, + stats->count, stats->duration); + fclose(fpfile); + } +#endif +} + +void av1_twopass_zero_stats(FIRSTPASS_STATS *section) { + section->frame = 0.0; + section->weight = 0.0; + section->intra_error = 0.0; + section->frame_avg_wavelet_energy = 0.0; + section->coded_error = 0.0; + section->sr_coded_error = 0.0; + section->pcnt_inter = 0.0; + section->pcnt_motion = 0.0; + section->pcnt_second_ref = 0.0; + section->pcnt_neutral = 0.0; + section->intra_skip_pct = 0.0; + section->inactive_zone_rows = 0.0; + section->inactive_zone_cols = 0.0; + section->MVr = 0.0; + section->mvr_abs = 0.0; + section->MVc = 0.0; + section->mvc_abs = 0.0; + section->MVrv = 0.0; + section->MVcv = 0.0; + section->mv_in_out_count = 0.0; + section->new_mv_count = 0.0; + section->count = 0.0; + section->duration = 1.0; +} + +static AOM_INLINE void accumulate_stats(FIRSTPASS_STATS *section, + const FIRSTPASS_STATS *frame) { + section->frame += frame->frame; + section->weight += frame->weight; + section->intra_error += frame->intra_error; + section->frame_avg_wavelet_energy += frame->frame_avg_wavelet_energy; + section->coded_error += frame->coded_error; + section->sr_coded_error += frame->sr_coded_error; + section->pcnt_inter += frame->pcnt_inter; + section->pcnt_motion += frame->pcnt_motion; + section->pcnt_second_ref +=
frame->pcnt_second_ref; + section->pcnt_neutral += frame->pcnt_neutral; + section->intra_skip_pct += frame->intra_skip_pct; + section->inactive_zone_rows += frame->inactive_zone_rows; + section->inactive_zone_cols += frame->inactive_zone_cols; + section->MVr += frame->MVr; + section->mvr_abs += frame->mvr_abs; + section->MVc += frame->MVc; + section->mvc_abs += frame->mvc_abs; + section->MVrv += frame->MVrv; + section->MVcv += frame->MVcv; + section->mv_in_out_count += frame->mv_in_out_count; + section->new_mv_count += frame->new_mv_count; + section->count += frame->count; + section->duration += frame->duration; +} + +void av1_end_first_pass(AV1_COMP *cpi) { + if (cpi->twopass.stats_buf_ctx->total_stats) + output_stats(cpi->twopass.stats_buf_ctx->total_stats, cpi->output_pkt_list); +} + +static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { + switch (bsize) { + case BLOCK_8X8: return aom_mse8x8; + case BLOCK_16X8: return aom_mse16x8; + case BLOCK_8X16: return aom_mse8x16; + default: return aom_mse16x16; + } +} + +static unsigned int get_prediction_error(BLOCK_SIZE bsize, + const struct buf_2d *src, + const struct buf_2d *ref) { + unsigned int sse; + const aom_variance_fn_t fn = get_block_variance_fn(bsize); + fn(src->buf, src->stride, ref->buf, ref->stride, &sse); + return sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, + int bd) { + switch (bd) { + default: + switch (bsize) { + case BLOCK_8X8: return aom_highbd_8_mse8x8; + case BLOCK_16X8: return aom_highbd_8_mse16x8; + case BLOCK_8X16: return aom_highbd_8_mse8x16; + default: return aom_highbd_8_mse16x16; + } + break; + case 10: + switch (bsize) { + case BLOCK_8X8: return aom_highbd_10_mse8x8; + case BLOCK_16X8: return aom_highbd_10_mse16x8; + case BLOCK_8X16: return aom_highbd_10_mse8x16; + default: return aom_highbd_10_mse16x16; + } + break; + case 12: + switch (bsize) { + case BLOCK_8X8: return aom_highbd_12_mse8x8; + case BLOCK_16X8: return aom_highbd_12_mse16x8; + case BLOCK_8X16: return aom_highbd_12_mse8x16; + default: return aom_highbd_12_mse16x16; + } + break; + } +} + +static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize, + const struct buf_2d *src, + const struct buf_2d *ref, + int bd) { + unsigned int sse; + const aom_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd); + fn(src->buf, src->stride, ref->buf, ref->stride, &sse); + return sse; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// Refine the motion search range according to the frame dimension +// for first pass test. 
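+// (Illustrative: the loop below doubles dim until it reaches +// MAX_FULL_PEL_VAL, so sr grows by one each time the frame's smaller +// dimension halves; smaller frames therefore end up with a larger +// step_param = 3 + sr in first_pass_motion_search, coarsening the initial +// search step.)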
+static int get_search_range(const AV1_COMP *cpi) { + int sr = 0; + const int dim = AOMMIN(cpi->initial_width, cpi->initial_height); + + while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr; + return sr; +} + +static AOM_INLINE void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, + const MV *ref_mv, + FULLPEL_MV *best_mv, + int *best_motion_err) { + MACROBLOCKD *const xd = &x->e_mbd; + FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv); + int tmp_err; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; + const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; + const int sr = get_search_range(cpi); + const int step_param = 3 + sr; + + const search_site_config *first_pass_search_sites = + &cpi->mv_search_params.ss_cfg[SS_CFG_FPF]; + FULLPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv, + first_pass_search_sites); + ms_params.search_method = NSTEP; + + FULLPEL_MV this_best_mv; + tmp_err = av1_full_pixel_search(start_mv, &ms_params, step_param, NULL, + &this_best_mv, NULL); + + if (tmp_err < INT_MAX) { + tmp_err = av1_get_mvpred_sse(x, &this_best_mv, ref_mv, &v_fn_ptr) + + new_mv_mode_penalty; + } + + if (tmp_err < *best_motion_err) { + *best_motion_err = tmp_err; + *best_mv = this_best_mv; + } +} + +static BLOCK_SIZE get_bsize(const CommonModeInfoParams *const mi_params, + int mb_row, int mb_col) { + if (mi_size_wide[BLOCK_16X16] * mb_col + mi_size_wide[BLOCK_8X8] < + mi_params->mi_cols) { + return mi_size_wide[BLOCK_16X16] * mb_row + mi_size_wide[BLOCK_8X8] < + mi_params->mi_rows + ? BLOCK_16X16 + : BLOCK_16X8; + } else { + return mi_size_wide[BLOCK_16X16] * mb_row + mi_size_wide[BLOCK_8X8] < + mi_params->mi_rows + ? BLOCK_8X16 + : BLOCK_8X8; + } +} + +static int find_fp_qindex(aom_bit_depth_t bit_depth) { + return av1_find_qindex(FIRST_PASS_Q, bit_depth, 0, QINDEX_RANGE - 1); +} + +static double raw_motion_error_stdev(int *raw_motion_err_list, + int raw_motion_err_counts) { + int64_t sum_raw_err = 0; + double raw_err_avg = 0; + double raw_err_stdev = 0; + if (raw_motion_err_counts == 0) return 0; + + int i; + for (i = 0; i < raw_motion_err_counts; i++) { + sum_raw_err += raw_motion_err_list[i]; + } + raw_err_avg = (double)sum_raw_err / raw_motion_err_counts; + for (i = 0; i < raw_motion_err_counts; i++) { + raw_err_stdev += (raw_motion_err_list[i] - raw_err_avg) * + (raw_motion_err_list[i] - raw_err_avg); + } + // Calculate the standard deviation for the motion error of all the inter + // blocks of the 0,0 motion using the last source + // frame as the reference. + raw_err_stdev = sqrt(raw_err_stdev / raw_motion_err_counts); + return raw_err_stdev; +} + +// This structure contains several key parameters to be accumulated for this +// frame. +typedef struct { + // Intra prediction error. + int64_t intra_error; + // Average wavelet energy computed using Discrete Wavelet Transform (DWT). + int64_t frame_avg_wavelet_energy; + // Best of intra pred error and inter pred error using last frame as ref. + int64_t coded_error; + // Best of intra pred error and inter pred error using golden frame as ref. + int64_t sr_coded_error; + // Best of intra pred error and inter pred error using altref frame as ref. + int64_t tr_coded_error; + // Count of motion vectors. + int mv_count; + // Count of blocks that pick inter prediction (inter pred error is smaller + // than intra pred error). + int inter_count; + // Count of blocks that pick second ref (golden frame).
+ int second_ref_count; + // Count of blocks that pick third ref (altref frame). + int third_ref_count; + // Count of blocks where the inter and intra are very close and very low. + double neutral_count; + // Count of blocks where intra error is very small. + int intra_skip_count; + // Start row. + int image_data_start_row; + // Count of unique non-zero motion vectors. + int new_mv_count; + // Sum of inward motion vectors. + int sum_in_vectors; + // Sum of motion vector row. + int sum_mvr; + // Sum of motion vector column. + int sum_mvc; + // Sum of absolute value of motion vector row. + int sum_mvr_abs; + // Sum of absolute value of motion vector column. + int sum_mvc_abs; + // Sum of the square of motion vector row. + int64_t sum_mvrs; + // Sum of the square of motion vector column. + int64_t sum_mvcs; + // A factor calculated using intra pred error. + double intra_factor; + // A factor that measures brightness. + double brightness_factor; +} FRAME_STATS; + +#define UL_INTRA_THRESH 50 +#define INVALID_ROW -1 +// Computes and returns the intra pred error of a block. +// intra pred error: sum of squared error of the intra predicted residual. +// Inputs: +// cpi: the encoder setting. Only a few params in it will be used. +// this_frame: the current frame buffer. +// tile: tile information (not used in first pass, already init to zero) +// mb_row: row index in the unit of first pass block size. +// mb_col: column index in the unit of first pass block size. +// y_offset: the offset of y frame buffer, indicating the starting point of +// the current block. +// uv_offset: the offset of u and v frame buffer, indicating the starting +// point of the current block. +// fp_block_size: first pass block size. +// qindex: quantization step size to encode the frame. +// stats: frame encoding stats. +// Modifies: +// stats->intra_skip_count +// stats->image_data_start_row +// stats->intra_factor +// stats->brightness_factor +// stats->intra_error +// stats->frame_avg_wavelet_energy +// Returns: +// this_intra_error. 
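+// (Note on the function below: use_dc_pred is true exactly when one of mb_row +// and mb_col is zero and the other is not, i.e. for blocks on the top row or +// leftmost column other than the top-left corner; only those blocks get the +// larger TX_16X16/TX_8X8 transform size, all others use TX_4X4.)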
+static int firstpass_intra_prediction( + AV1_COMP *cpi, YV12_BUFFER_CONFIG *const this_frame, + const TileInfo *const tile, const int mb_row, const int mb_col, + const int y_offset, const int uv_offset, const BLOCK_SIZE fp_block_size, + const int qindex, FRAME_STATS *const stats) { + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const SequenceHeader *const seq_params = &cm->seq_params; + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int mb_scale = mi_size_wide[fp_block_size]; + const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE bsize = get_bsize(mi_params, mb_row, mb_col); + + aom_clear_system_state(); + set_mi_offsets(mi_params, xd, mb_row * mb_scale, mb_col * mb_scale); + xd->plane[0].dst.buf = this_frame->y_buffer + y_offset; + xd->plane[1].dst.buf = this_frame->u_buffer + uv_offset; + xd->plane[2].dst.buf = this_frame->v_buffer + uv_offset; + xd->left_available = (mb_col != 0); + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + set_mi_row_col(xd, tile, mb_row * mb_scale, mi_size_high[bsize], + mb_col * mb_scale, mi_size_wide[bsize], mi_params->mi_rows, + mi_params->mi_cols); + set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes); + xd->mi[0]->segment_id = 0; + xd->lossless[xd->mi[0]->segment_id] = (qindex == 0); + xd->mi[0]->mode = DC_PRED; + xd->mi[0]->tx_size = + use_dc_pred ? (bsize >= fp_block_size ? TX_16X16 : TX_8X8) : TX_4X4; + + av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0); + int this_intra_error = aom_get_mb_ss(x->plane[0].src_diff); + + if (this_intra_error < UL_INTRA_THRESH) { + ++stats->intra_skip_count; + } else if ((mb_col > 0) && (stats->image_data_start_row == INVALID_ROW)) { + stats->image_data_start_row = mb_row; + } + + if (seq_params->use_highbitdepth) { + switch (seq_params->bit_depth) { + case AOM_BITS_8: break; + case AOM_BITS_10: this_intra_error >>= 4; break; + case AOM_BITS_12: this_intra_error >>= 8; break; + default: + assert(0 && + "seq_params->bit_depth should be AOM_BITS_8, " + "AOM_BITS_10 or AOM_BITS_12"); + return -1; + } + } + + aom_clear_system_state(); + double log_intra = log(this_intra_error + 1.0); + if (log_intra < 10.0) { + stats->intra_factor += 1.0 + ((10.0 - log_intra) * 0.05); + } else { + stats->intra_factor += 1.0; + } + + int level_sample; + if (seq_params->use_highbitdepth) { + level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; + } else { + level_sample = x->plane[0].src.buf[0]; + } + if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) { + stats->brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample)); + } else { + stats->brightness_factor += 1.0; + } + + // Intrapenalty below deals with situations where the intra and inter + // error scores are very low (e.g. a plain black frame). + // We do not have special cases in first pass for 0,0 and nearest etc so + // all inter modes carry an overhead cost estimate for the mv. + // When the error score is very low this causes us to pick all or lots of + // INTRA modes and throw lots of key frames. + // This penalty adds a cost matching that of a 0,0 mv to the intra case. + this_intra_error += INTRA_MODE_PENALTY; + + // Accumulate the intra error. 
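+ // (The (int64_t) accumulation below matters: a single 16x16 block can + // contribute an SSE near 255^2 * 256, about 16.6 million, so a 32-bit + // sum could overflow after only a few hundred worst-case blocks.)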
+ stats->intra_error += (int64_t)this_intra_error; + + const int hbd = is_cur_buf_hbd(xd); + const int stride = x->plane[0].src.stride; + uint8_t *buf = x->plane[0].src.buf; + for (int r8 = 0; r8 < 2; ++r8) { + for (int c8 = 0; c8 < 2; ++c8) { + stats->frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input( + buf + c8 * 8 + r8 * 8 * stride, stride, hbd); + } + } + + return this_intra_error; +} + +// Returns the sum of squared error between source and reference blocks. +static int get_prediction_error_bitdepth(const int is_high_bitdepth, + const int bitdepth, + const BLOCK_SIZE block_size, + const struct buf_2d *src, + const struct buf_2d *ref) { + (void)is_high_bitdepth; + (void)bitdepth; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_high_bitdepth) { + return highbd_get_prediction_error(block_size, src, ref, bitdepth); + } +#endif // CONFIG_AV1_HIGHBITDEPTH + return get_prediction_error(block_size, src, ref); +} + +// Accumulates motion vector stats. +// Modifies member variables of "stats". +static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv, + const int mb_row, const int mb_col, + const int mb_rows, const int mb_cols, + MV *last_mv, FRAME_STATS *stats) { + if (is_zero_mv(&best_mv)) return; + + ++stats->mv_count; + // Non-zero vector, was it different from the last non zero vector? + if (!is_equal_mv(&best_mv, last_mv)) ++stats->new_mv_count; + *last_mv = best_mv; + + // Does the row vector point inwards or outwards? + if (mb_row < mb_rows / 2) { + if (mv.row > 0) { + --stats->sum_in_vectors; + } else if (mv.row < 0) { + ++stats->sum_in_vectors; + } + } else if (mb_row > mb_rows / 2) { + if (mv.row > 0) { + ++stats->sum_in_vectors; + } else if (mv.row < 0) { + --stats->sum_in_vectors; + } + } + + // Does the col vector point inwards or outwards? + if (mb_col < mb_cols / 2) { + if (mv.col > 0) { + --stats->sum_in_vectors; + } else if (mv.col < 0) { + ++stats->sum_in_vectors; + } + } else if (mb_col > mb_cols / 2) { + if (mv.col > 0) { + ++stats->sum_in_vectors; + } else if (mv.col < 0) { + --stats->sum_in_vectors; + } + } +} + +#define LOW_MOTION_ERROR_THRESH 25 +// Computes and returns the inter prediction error from the last frame. +// Computes inter prediction errors from the golden and alt ref frames and +// updates stats accordingly. +// Inputs: +// cpi: the encoder setting. Only a few params in it will be used. +// last_frame: the frame buffer of the last frame. +// golden_frame: the frame buffer of the golden frame. +// alt_ref_frame: the frame buffer of the alt ref frame. +// mb_row: row index in the unit of first pass block size. +// mb_col: column index in the unit of first pass block size. +// recon_yoffset: the y offset of the reconstructed frame buffer, +// indicating the starting point of the current block. +// recon_uvoffset: the u/v offset of the reconstructed frame buffer, +// indicating the starting point of the current block. +// src_yoffset: the y offset of the source frame buffer. +// alt_ref_frame_yoffset: the y offset of the alt ref frame buffer. +// fp_block_size: first pass block size. +// this_intra_error: the intra prediction error of this block. +// raw_motion_err_counts: the count of raw motion vectors. +// raw_motion_err_list: the array that records the raw motion error. +// best_ref_mv: best reference mv found so far. +// last_mv: last mv. +// stats: frame encoding stats. +// Modifies: +// raw_motion_err_list +// best_ref_mv +// last_mv +// stats: many member params in it.
+// Returns: +// this_inter_error +static int firstpass_inter_prediction( + AV1_COMP *cpi, const YV12_BUFFER_CONFIG *const last_frame, + const YV12_BUFFER_CONFIG *const golden_frame, + const YV12_BUFFER_CONFIG *const alt_ref_frame, const int mb_row, + const int mb_col, const int recon_yoffset, const int recon_uvoffset, + const int src_yoffset, const int alt_ref_frame_yoffset, + const BLOCK_SIZE fp_block_size, const int this_intra_error, + const int raw_motion_err_counts, int *raw_motion_err_list, MV *best_ref_mv, + MV *last_mv, FRAME_STATS *stats) { + int this_inter_error = this_intra_error; + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + CurrentFrame *const current_frame = &cm->current_frame; + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int is_high_bitdepth = is_cur_buf_hbd(xd); + const int bitdepth = xd->bd; + const int mb_scale = mi_size_wide[fp_block_size]; + const BLOCK_SIZE bsize = get_bsize(mi_params, mb_row, mb_col); + const int fp_block_size_height = block_size_wide[fp_block_size]; + // Assume 0,0 motion with no mv overhead. + FULLPEL_MV mv = kZeroFullMv; + FULLPEL_MV tmp_mv = kZeroFullMv; + xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset; + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + av1_set_mv_col_limits(mi_params, &x->mv_limits, (mb_col << 2), + (fp_block_size_height >> MI_SIZE_LOG2), + cpi->oxcf.border_in_pixels); + + int motion_error = + get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize, + &x->plane[0].src, &xd->plane[0].pre[0]); + + // Compute the motion error of the 0,0 motion using the last source + // frame as the reference. Skip the further motion search on + // reconstructed frame if this error is small. + struct buf_2d unscaled_last_source_buf_2d; + unscaled_last_source_buf_2d.buf = + cpi->unscaled_last_source->y_buffer + src_yoffset; + unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; + const int raw_motion_error = get_prediction_error_bitdepth( + is_high_bitdepth, bitdepth, bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); + raw_motion_err_list[raw_motion_err_counts] = raw_motion_error; + + // TODO(pengchong): Replace the hard-coded threshold + if (raw_motion_error > LOW_MOTION_ERROR_THRESH) { + // Test last reference frame using the previous best mv as the + // starting point (best reference) for the search. + first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error); + + // If the current best reference mv is not centered on 0,0 then do a + // 0,0 based search as well. + if (!is_zero_mv(best_ref_mv)) { + int tmp_err = INT_MAX; + first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err); + + if (tmp_err < motion_error) { + motion_error = tmp_err; + mv = tmp_mv; + } + } + + // Motion search in 2nd reference frame. + int gf_motion_error = motion_error; + if ((current_frame->frame_number > 1) && golden_frame != NULL) { + // Assume 0,0 motion with no mv overhead. 
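+ // (Note: unlike the last-frame search above, which starts from + // best_ref_mv, this golden-frame search always starts from the zero mv.)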
+ xd->plane[0].pre[0].buf = golden_frame->y_buffer + recon_yoffset; + xd->plane[0].pre[0].stride = golden_frame->y_stride; + gf_motion_error = + get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize, + &x->plane[0].src, &xd->plane[0].pre[0]); + first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &gf_motion_error); + } + if (gf_motion_error < motion_error && gf_motion_error < this_intra_error) { + ++stats->second_ref_count; + } + // In accumulating a score for the 2nd reference frame take the + // best of the motion predicted score and the intra coded error + // (just as is done in the accumulation of "coded_error" for + // the last frame). + if ((current_frame->frame_number > 1) && golden_frame != NULL) { + stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error); + } else { + // TODO(chengchen): I believe logically this should also be changed to + // stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error). + stats->sr_coded_error += motion_error; + } + + // Motion search in 3rd reference frame. + int alt_motion_error = motion_error; + if (alt_ref_frame != NULL) { + xd->plane[0].pre[0].buf = alt_ref_frame->y_buffer + alt_ref_frame_yoffset; + xd->plane[0].pre[0].stride = alt_ref_frame->y_stride; + alt_motion_error = + get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize, + &x->plane[0].src, &xd->plane[0].pre[0]); + first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &alt_motion_error); + } + if (alt_motion_error < motion_error && alt_motion_error < gf_motion_error && + alt_motion_error < this_intra_error) { + ++stats->third_ref_count; + } + // In accumulating a score for the 3rd reference frame take the + // best of the motion predicted score and the intra coded error + // (just as is done in the accumulation of "coded_error" for + // the last frame). + if (alt_ref_frame != NULL) { + stats->tr_coded_error += AOMMIN(alt_motion_error, this_intra_error); + } else { + // TODO(chengchen): I believe logically this should also be changed to + // stats->tr_coded_error += AOMMIN(alt_motion_error, this_intra_error). + stats->tr_coded_error += motion_error; + } + + // Reset to last frame as reference buffer. + xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset; + xd->plane[1].pre[0].buf = last_frame->u_buffer + recon_uvoffset; + xd->plane[2].pre[0].buf = last_frame->v_buffer + recon_uvoffset; + } else { + stats->sr_coded_error += motion_error; + stats->tr_coded_error += motion_error; + } + + // Start by assuming that intra mode is best. + best_ref_mv->row = 0; + best_ref_mv->col = 0; + + if (motion_error <= this_intra_error) { + aom_clear_system_state(); + + // Keep a count of cases where the inter and intra were very close + // and very low. This helps with scene cut detection for example in + // cropped clips with black bars at the sides or top and bottom. + if (((this_intra_error - INTRA_MODE_PENALTY) * 9 <= motion_error * 10) && + (this_intra_error < (2 * INTRA_MODE_PENALTY))) { + stats->neutral_count += 1.0; + // Also track cases where the intra is not much worse than the inter + // and use this in limiting the GF/arf group length.
+ } else if ((this_intra_error > NCOUNT_INTRA_THRESH) && + (this_intra_error < (NCOUNT_INTRA_FACTOR * motion_error))) { + stats->neutral_count += + (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_intra_error); + } + + const MV best_mv = get_mv_from_fullmv(&mv); + this_inter_error = motion_error; + xd->mi[0]->mode = NEWMV; + xd->mi[0]->mv[0].as_mv = best_mv; + xd->mi[0]->tx_size = TX_4X4; + xd->mi[0]->ref_frame[0] = LAST_FRAME; + xd->mi[0]->ref_frame[1] = NONE_FRAME; + av1_enc_build_inter_predictor(cm, xd, mb_row * mb_scale, mb_col * mb_scale, + NULL, bsize, AOM_PLANE_Y, AOM_PLANE_Y); + av1_encode_sby_pass1(cpi, x, bsize); + stats->sum_mvr += best_mv.row; + stats->sum_mvr_abs += abs(best_mv.row); + stats->sum_mvc += best_mv.col; + stats->sum_mvc_abs += abs(best_mv.col); + stats->sum_mvrs += best_mv.row * best_mv.row; + stats->sum_mvcs += best_mv.col * best_mv.col; + ++stats->inter_count; + + *best_ref_mv = best_mv; + accumulate_mv_stats(best_mv, mv, mb_row, mb_col, mi_params->mb_rows, + mi_params->mb_cols, last_mv, stats); + } + + return this_inter_error; +} + +// Updates the first pass stats of this frame. +// Input: +// cpi: the encoder setting. Only a few params in it will be used. +// stats: stats accumulated for this frame. +// raw_err_stdev: the standard deviation for the motion error of all the +// inter blocks of the (0,0) motion using the last source +// frame as the reference. +// frame_number: current frame number. +// ts_duration: Duration of the frame / collection of frames. +// Updates: +// twopass->total_stats: the accumulated stats. +// twopass->stats_buf_ctx->stats_in_end: the pointer to the current stats, +// update its value and its position +// in the buffer. +static void update_firstpass_stats(AV1_COMP *cpi, + const FRAME_STATS *const stats, + const double raw_err_stdev, + const int frame_number, + const int64_t ts_duration) { + TWO_PASS *twopass = &cpi->twopass; + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end; + FIRSTPASS_STATS fps; + // The minimum error here ensures some bit allocation to frames even + // in static regions. The allocation per MB declines for larger formats + // where the typical "real" energy per MB also falls. + // Initial estimate here uses sqrt(mbs) to define the min_err, where the + // number of mbs is proportional to the image area. + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ?
cpi->initial_mbs + : mi_params->MBs; + const double min_err = 200 * sqrt(num_mbs); + + fps.weight = stats->intra_factor * stats->brightness_factor; + fps.frame = frame_number; + fps.coded_error = (double)(stats->coded_error >> 8) + min_err; + fps.sr_coded_error = (double)(stats->sr_coded_error >> 8) + min_err; + fps.tr_coded_error = (double)(stats->tr_coded_error >> 8) + min_err; + fps.intra_error = (double)(stats->intra_error >> 8) + min_err; + fps.frame_avg_wavelet_energy = (double)stats->frame_avg_wavelet_energy; + fps.count = 1.0; + fps.pcnt_inter = (double)stats->inter_count / num_mbs; + fps.pcnt_second_ref = (double)stats->second_ref_count / num_mbs; + fps.pcnt_third_ref = (double)stats->third_ref_count / num_mbs; + fps.pcnt_neutral = (double)stats->neutral_count / num_mbs; + fps.intra_skip_pct = (double)stats->intra_skip_count / num_mbs; + fps.inactive_zone_rows = (double)stats->image_data_start_row; + fps.inactive_zone_cols = (double)0; // TODO(paulwilkins): fix + fps.raw_error_stdev = raw_err_stdev; + + if (stats->mv_count > 0) { + fps.MVr = (double)stats->sum_mvr / stats->mv_count; + fps.mvr_abs = (double)stats->sum_mvr_abs / stats->mv_count; + fps.MVc = (double)stats->sum_mvc / stats->mv_count; + fps.mvc_abs = (double)stats->sum_mvc_abs / stats->mv_count; + fps.MVrv = ((double)stats->sum_mvrs - + ((double)stats->sum_mvr * stats->sum_mvr / stats->mv_count)) / + stats->mv_count; + fps.MVcv = ((double)stats->sum_mvcs - + ((double)stats->sum_mvc * stats->sum_mvc / stats->mv_count)) / + stats->mv_count; + fps.mv_in_out_count = (double)stats->sum_in_vectors / (stats->mv_count * 2); + fps.new_mv_count = stats->new_mv_count; + fps.pcnt_motion = (double)stats->mv_count / num_mbs; + } else { + fps.MVr = 0.0; + fps.mvr_abs = 0.0; + fps.MVc = 0.0; + fps.mvc_abs = 0.0; + fps.MVrv = 0.0; + fps.MVcv = 0.0; + fps.mv_in_out_count = 0.0; + fps.new_mv_count = 0.0; + fps.pcnt_motion = 0.0; + } + + // TODO(paulwilkins): Handle the case when duration is set to 0, or + // something less than the full time between subsequent values of + // cpi->source_time_stamp. + fps.duration = (double)ts_duration; + + // We will store the stats inside the persistent twopass struct (and NOT the + // local variable 'fps'), and then cpi->output_pkt_list will point to it. 
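+ // (The MVrv/MVcv computations above use the one-pass variance identity + // Var(x) = E[x^2] - E[x]^2 on the running sums, avoiding a second pass + // over the motion vectors.)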
+ *this_frame_stats = fps; + output_stats(this_frame_stats, cpi->output_pkt_list); + if (cpi->twopass.stats_buf_ctx->total_stats != NULL) { + accumulate_stats(cpi->twopass.stats_buf_ctx->total_stats, &fps); + } + /*In the case of two pass, first pass uses it as a circular buffer, + * when LAP is enabled it is used as a linear buffer*/ + twopass->stats_buf_ctx->stats_in_end++; + if ((cpi->oxcf.pass == 1) && (twopass->stats_buf_ctx->stats_in_end >= + twopass->stats_buf_ctx->stats_in_buf_end)) { + twopass->stats_buf_ctx->stats_in_end = + twopass->stats_buf_ctx->stats_in_start; + } +} + +static void print_reconstruction_frame( + const YV12_BUFFER_CONFIG *const last_frame, int frame_number, + int do_print) { + if (!do_print) return; + + char filename[512]; + FILE *recon_file; + snprintf(filename, sizeof(filename), "enc%04d.yuv", frame_number); + + if (frame_number == 0) { + recon_file = fopen(filename, "wb"); + } else { + recon_file = fopen(filename, "ab"); + } + + fwrite(last_frame->buffer_alloc, last_frame->frame_size, 1, recon_file); + fclose(recon_file); +} + +#define FIRST_PASS_ALT_REF_DISTANCE 16 +void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) { + MACROBLOCK *const x = &cpi->td.mb; + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + CurrentFrame *const current_frame = &cm->current_frame; + const SequenceHeader *const seq_params = &cm->seq_params; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none; + MV last_mv = kZeroMv; + const int qindex = find_fp_qindex(seq_params->bit_depth); + // Detect if the key frame is screen content type. + if (frame_is_intra_only(cm)) { + FeatureFlags *const features = &cm->features; + av1_set_screen_content_options(cpi, features); + cpi->is_screen_content_type = features->allow_screen_content_tools; + } + // First pass coding proceeds in raster scan order with unit size of 16x16. + const BLOCK_SIZE fp_block_size = BLOCK_16X16; + const int fp_block_size_width = block_size_high[fp_block_size]; + const int fp_block_size_height = block_size_wide[fp_block_size]; + int *raw_motion_err_list; + int raw_motion_err_counts = 0; + CHECK_MEM_ERROR(cm, raw_motion_err_list, + aom_calloc(mi_params->mb_rows * mi_params->mb_cols, + sizeof(*raw_motion_err_list))); + // Tiling is ignored in the first pass. + TileInfo tile; + av1_tile_init(&tile, cm, 0, 0); + FRAME_STATS stats = { 0 }; + stats.image_data_start_row = INVALID_ROW; + + const YV12_BUFFER_CONFIG *const last_frame = + get_ref_frame_yv12_buf(cm, LAST_FRAME); + const YV12_BUFFER_CONFIG *golden_frame = + get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); + const YV12_BUFFER_CONFIG *alt_ref_frame = NULL; + const int alt_ref_offset = + FIRST_PASS_ALT_REF_DISTANCE - + (current_frame->frame_number % FIRST_PASS_ALT_REF_DISTANCE); + if (alt_ref_offset < FIRST_PASS_ALT_REF_DISTANCE) { + const struct lookahead_entry *const alt_ref_frame_buffer = + av1_lookahead_peek(cpi->lookahead, alt_ref_offset, + cpi->compressor_stage); + if (alt_ref_frame_buffer != NULL) { + alt_ref_frame = &alt_ref_frame_buffer->img; + } + } + YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf; + // First pass code requires valid last and new frame buffers. 
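+ // (An intra-only frame has no last reference, hence the weaker assertion + // on last_frame below.)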
+ assert(this_frame != NULL); + assert(frame_is_intra_only(cm) || (last_frame != NULL)); + + av1_setup_frame_size(cpi); + aom_clear_system_state(); + + set_mi_offsets(mi_params, xd, 0, 0); + xd->mi[0]->sb_type = fp_block_size; + + // Do not use periodic key frames. + cpi->rc.frames_to_key = INT_MAX; + + av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel, qindex); + + av1_setup_block_planes(xd, seq_params->subsampling_x, + seq_params->subsampling_y, num_planes); + + av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, fp_block_size); + av1_setup_dst_planes(xd->plane, seq_params->sb_size, this_frame, 0, 0, 0, + num_planes); + + if (!frame_is_intra_only(cm)) { + av1_setup_pre_planes(xd, 0, last_frame, 0, 0, NULL, num_planes); + } + + set_mi_offsets(mi_params, xd, 0, 0); + + // Don't store luma on the first pass since chroma is not computed + xd->cfl.store_y = 0; + av1_frame_init_quantizer(cpi); + + for (int i = 0; i < num_planes; ++i) { + x->plane[i].coeff = ctx->coeff[i]; + x->plane[i].qcoeff = ctx->qcoeff[i]; + x->plane[i].eobs = ctx->eobs[i]; + x->plane[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; + xd->plane[i].dqcoeff = ctx->dqcoeff[i]; + } + + av1_init_mv_probs(cm); + av1_initialize_rd_consts(cpi); + + const int src_y_stride = cpi->source->y_stride; + const int recon_y_stride = this_frame->y_stride; + const int recon_uv_stride = this_frame->uv_stride; + const int uv_mb_height = + fp_block_size_height >> (this_frame->y_height > this_frame->uv_height); + + for (int mb_row = 0; mb_row < mi_params->mb_rows; ++mb_row) { + MV best_ref_mv = kZeroMv; + + // Reset above block coeffs. + xd->up_available = (mb_row != 0); + int recon_yoffset = (mb_row * recon_y_stride * fp_block_size_height); + int src_yoffset = (mb_row * src_y_stride * fp_block_size_height); + int recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height); + int alt_ref_frame_yoffset = + (alt_ref_frame != NULL) + ? mb_row * alt_ref_frame->y_stride * fp_block_size_height + : -1; + + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + av1_set_mv_row_limits(mi_params, &x->mv_limits, (mb_row << 2), + (fp_block_size_height >> MI_SIZE_LOG2), + cpi->oxcf.border_in_pixels); + + for (int mb_col = 0; mb_col < mi_params->mb_cols; ++mb_col) { + int this_intra_error = firstpass_intra_prediction( + cpi, this_frame, &tile, mb_row, mb_col, recon_yoffset, recon_uvoffset, + fp_block_size, qindex, &stats); + + if (!frame_is_intra_only(cm)) { + const int this_inter_error = firstpass_inter_prediction( + cpi, last_frame, golden_frame, alt_ref_frame, mb_row, mb_col, + recon_yoffset, recon_uvoffset, src_yoffset, alt_ref_frame_yoffset, + fp_block_size, this_intra_error, raw_motion_err_counts, + raw_motion_err_list, &best_ref_mv, &last_mv, &stats); + stats.coded_error += this_inter_error; + ++raw_motion_err_counts; + } else { + stats.sr_coded_error += this_intra_error; + stats.tr_coded_error += this_intra_error; + stats.coded_error += this_intra_error; + } + + // Adjust to the next column of MBs. + x->plane[0].src.buf += fp_block_size_width; + x->plane[1].src.buf += uv_mb_height; + x->plane[2].src.buf += uv_mb_height; + + recon_yoffset += fp_block_size_width; + src_yoffset += fp_block_size_width; + recon_uvoffset += uv_mb_height; + alt_ref_frame_yoffset += fp_block_size_width; + } + // Adjust to the next row of MBs.
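+ // (The inner loop advanced each plane's src pointer by one block width per + // column, so adding block_height * stride and subtracting block_width * + // mb_cols rewinds each pointer to column 0 of the next block row. Reusing + // plane[1].src.stride for plane[2] below is benign: u and v share + // uv_stride in YV12_BUFFER_CONFIG.)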
+ x->plane[0].src.buf += fp_block_size_height * x->plane[0].src.stride - + fp_block_size_width * mi_params->mb_cols; + x->plane[1].src.buf += uv_mb_height * x->plane[1].src.stride - + uv_mb_height * mi_params->mb_cols; + x->plane[2].src.buf += uv_mb_height * x->plane[1].src.stride - + uv_mb_height * mi_params->mb_cols; + } + const double raw_err_stdev = + raw_motion_error_stdev(raw_motion_err_list, raw_motion_err_counts); + aom_free(raw_motion_err_list); + + // Clamp the image start to rows/2. This number of rows is discarded top + // and bottom as dead data so rows / 2 means the frame is blank. + if ((stats.image_data_start_row > mi_params->mb_rows / 2) || + (stats.image_data_start_row == INVALID_ROW)) { + stats.image_data_start_row = mi_params->mb_rows / 2; + } + // Exclude any image dead zone + if (stats.image_data_start_row > 0) { + stats.intra_skip_count = + AOMMAX(0, stats.intra_skip_count - + (stats.image_data_start_row * mi_params->mb_cols * 2)); + } + + TWO_PASS *twopass = &cpi->twopass; + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs + : mi_params->MBs; + stats.intra_factor = stats.intra_factor / (double)num_mbs; + stats.brightness_factor = stats.brightness_factor / (double)num_mbs; + FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end; + update_firstpass_stats(cpi, &stats, raw_err_stdev, + current_frame->frame_number, ts_duration); + + // Copy the previous Last Frame back into gf buffer if the prediction is good + // enough... but also don't allow it to lag too far. + if ((twopass->sr_update_lag > 3) || + ((current_frame->frame_number > 0) && + (this_frame_stats->pcnt_inter > 0.20) && + ((this_frame_stats->intra_error / + DOUBLE_DIVIDE_CHECK(this_frame_stats->coded_error)) > 2.0))) { + if (golden_frame != NULL) { + assign_frame_buffer_p( + &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)], + cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]); + } + twopass->sr_update_lag = 1; + } else { + ++twopass->sr_update_lag; + } + + aom_extend_frame_borders(this_frame, num_planes); + + // The frame we just compressed now becomes the last frame. + assign_frame_buffer_p( + &cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)], cm->cur_frame); + + // Special case for the first frame. Copy into the GF buffer as a second + // reference. + if (current_frame->frame_number == 0 && + get_ref_frame_map_idx(cm, GOLDEN_FRAME) != INVALID_IDX) { + assign_frame_buffer_p( + &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)], + cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]); + } + + print_reconstruction_frame(last_frame, current_frame->frame_number, + /*do_print=*/0); + + ++current_frame->frame_number; +} diff --git a/libs/libaom/src/av1/encoder/firstpass.h b/libs/libaom/src/av1/encoder/firstpass.h new file mode 100644 index 000000000..99d444539 --- /dev/null +++ b/libs/libaom/src/av1/encoder/firstpass.h @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_FIRSTPASS_H_ +#define AOM_AV1_ENCODER_FIRSTPASS_H_ + +#include "av1/common/av1_common_int.h" +#include "av1/common/enums.h" +#include "av1/encoder/lookahead.h" +#include "av1/encoder/ratectrl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001) + +#define MIN_ZERO_MOTION 0.95 +#define MAX_SR_CODED_ERROR 40 +#define MAX_RAW_ERR_VAR 2000 +#define MIN_MV_IN_OUT 0.4 + +#define VLOW_MOTION_THRESHOLD 950 + +typedef struct { + // Frame number in display order, if stats are for a single frame. + // No real meaning for a collection of frames. + double frame; + // Weight assigned to this frame (or total weight for the collection of + // frames) currently based on intra factor and brightness factor. This is used + // to distribute bits between easier and harder frames. + double weight; + // Intra prediction error. + double intra_error; + // Average wavelet energy computed using Discrete Wavelet Transform (DWT). + double frame_avg_wavelet_energy; + // Best of intra pred error and inter pred error using last frame as ref. + double coded_error; + // Best of intra pred error and inter pred error using golden frame as ref. + double sr_coded_error; + // Best of intra pred error and inter pred error using altref frame as ref. + double tr_coded_error; + // Percentage of blocks with inter pred error < intra pred error. + double pcnt_inter; + // Percentage of blocks using (inter prediction and) non-zero motion vectors. + double pcnt_motion; + // Percentage of blocks where golden frame was better than last or intra: + // inter pred error using golden frame < inter pred error using last frame and + // inter pred error using golden frame < intra pred error + double pcnt_second_ref; + // Percentage of blocks where altref frame was better than intra, last, golden + double pcnt_third_ref; + // Percentage of blocks where intra and inter prediction errors were very + // close. Note that this is a 'weighted count', that is, the blocks may be + // weighted by how close the two errors were. + double pcnt_neutral; + // Percentage of blocks that have almost no intra error residual + // (i.e. are in effect completely flat and untextured in the intra + // domain). In natural videos this is uncommon, but it is much more + // common in animations, graphics and screen content, so may be used + // as a signal to detect these types of content. + double intra_skip_pct; + // Image mask rows top and bottom. + double inactive_zone_rows; + // Image mask columns at left and right edges. + double inactive_zone_cols; + // Average of row motion vectors. + double MVr; + // Mean of absolute value of row motion vectors. + double mvr_abs; + // Mean of column motion vectors. + double MVc; + // Mean of absolute value of column motion vectors. + double mvc_abs; + // Variance of row motion vectors. + double MVrv; + // Variance of column motion vectors. + double MVcv; + // Value in range [-1,1] indicating fraction of row and column motion vectors + // that point inwards (negative MV value) or outwards (positive MV value). + // For example, a value of 1 indicates all row/column MVs are inwards. + double mv_in_out_count; + // Count of unique non-zero motion vectors. + double new_mv_count; + // Duration of the frame / collection of frames. + double duration; + // 1.0 if stats are for a single frame, OR + // Number of frames in this collection for which the stats are accumulated.
+ double count; + // standard deviation for (0, 0) motion prediction error + double raw_error_stdev; +} FIRSTPASS_STATS; + +#define FC_ANIMATION_THRESH 0.15 +enum { + FC_NORMAL = 0, + FC_GRAPHICS_ANIMATION = 1, + FRAME_CONTENT_TYPES = 2 +} UENUM1BYTE(FRAME_CONTENT_TYPE); + +typedef struct { + unsigned char index; + FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH]; + unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH]; + // The number of frames displayed so far within the GOP at a given coding + // frame. + unsigned char cur_frame_idx[MAX_STATIC_GF_GROUP_LENGTH]; + unsigned char frame_disp_idx[MAX_STATIC_GF_GROUP_LENGTH]; + int ref_frame_disp_idx[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES]; + int ref_frame_gop_idx[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES]; + + // TODO(jingning): Unify the data structure used here after the new control + // mechanism is in place. + int layer_depth[MAX_STATIC_GF_GROUP_LENGTH]; + int arf_boost[MAX_STATIC_GF_GROUP_LENGTH]; + int max_layer_depth; + int max_layer_depth_allowed; + // This is currently only populated for AOM_Q mode + unsigned char q_val[MAX_STATIC_GF_GROUP_LENGTH]; + int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH]; + int size; +} GF_GROUP; + +typedef struct { + FIRSTPASS_STATS *stats_in_start; + FIRSTPASS_STATS *stats_in_end; + FIRSTPASS_STATS *stats_in_buf_end; + FIRSTPASS_STATS *total_stats; + FIRSTPASS_STATS *total_left_stats; +} STATS_BUFFER_CTX; + +typedef struct { + unsigned int section_intra_rating; + // Circular queue of first pass stats stored for most recent frames. + // cpi->output_pkt_list[i].data.twopass_stats.buf points to actual data stored + // here. + FIRSTPASS_STATS *frame_stats_arr[MAX_LAP_BUFFERS + 1]; + int frame_stats_next_idx; // Index to next unused element in frame_stats_arr. + const FIRSTPASS_STATS *stats_in; + STATS_BUFFER_CTX *stats_buf_ctx; + int first_pass_done; + int64_t bits_left; + double modified_error_min; + double modified_error_max; + double modified_error_left; + double mb_av_energy; + double frame_avg_haar_energy; + + // An indication of the content type of the current frame + FRAME_CONTENT_TYPE fr_content_type; + + // Projected total bits available for a key frame group of frames + int64_t kf_group_bits; + + // Error score of frames still to be coded in kf group + int64_t kf_group_error_left; + + // Over time correction for bits per macro block estimation + double bpm_factor; + + // Record of target and actual bits spent in current ARF group + int rolling_arf_group_target_bits; + int rolling_arf_group_actual_bits; + + int sr_update_lag; + + int kf_zeromotion_pct; + int last_kfgroup_zeromotion_pct; + int extend_minq; + int extend_maxq; + int extend_minq_fast; +} TWO_PASS; + +struct AV1_COMP; +struct EncodeFrameParams; +struct AV1EncoderConfig; + +void av1_rc_get_first_pass_params(struct AV1_COMP *cpi); +void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration); +void av1_end_first_pass(struct AV1_COMP *cpi); + +void av1_twopass_zero_stats(FIRSTPASS_STATS *section); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_FIRSTPASS_H_ diff --git a/libs/libaom/src/av1/encoder/global_motion.c b/libs/libaom/src/av1/encoder/global_motion.c new file mode 100644 index 000000000..9623ec301 --- /dev/null +++ b/libs/libaom/src/av1/encoder/global_motion.c @@ -0,0 +1,1014 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <math.h> +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" + +#include "av1/encoder/global_motion.h" + +#include "av1/common/convolve.h" +#include "av1/common/resize.h" +#include "av1/common/warped_motion.h" + +#include "av1/encoder/segmentation.h" +#include "av1/encoder/corner_detect.h" +#include "av1/encoder/corner_match.h" +#include "av1/encoder/ransac.h" + +#define MIN_INLIER_PROB 0.1 + +#define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR) + +// Border over which to compute the global motion +#define ERRORADV_BORDER 0 + +// Number of pyramid levels in disflow computation +#define N_LEVELS 2 +// Size of square patches in the disflow dense grid +#define PATCH_SIZE 8 +// Center point of square patch +#define PATCH_CENTER ((PATCH_SIZE + 1) >> 1) +// Step size between patches, lower value means greater patch overlap +#define PATCH_STEP 1 +// Minimum size of border padding for disflow +#define MIN_PAD 7 +// Warp error convergence threshold for disflow +#define DISFLOW_ERROR_TR 0.01 +// Max number of iterations if warp convergence is not found +#define DISFLOW_MAX_ITR 10 + +// Struct for an image pyramid +typedef struct { + int n_levels; + int pad_size; + int has_gradient; + int widths[N_LEVELS]; + int heights[N_LEVELS]; + int strides[N_LEVELS]; + int level_loc[N_LEVELS]; + unsigned char *level_buffer; + double *level_dx_buffer; + double *level_dy_buffer; +} ImagePyramid; + +int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost, + int erroradv_type) { + assert(erroradv_type < GM_ERRORADV_TR_TYPES); + return best_erroradvantage < erroradv_tr[erroradv_type] && + best_erroradvantage * params_cost < erroradv_prod_tr[erroradv_type]; +} + +static void convert_to_params(const double *params, int32_t *model) { + int i; + int alpha_present = 0; + model[0] = (int32_t)floor(params[0] * (1 << GM_TRANS_PREC_BITS) + 0.5); + model[1] = (int32_t)floor(params[1] * (1 << GM_TRANS_PREC_BITS) + 0.5); + model[0] = (int32_t)clamp(model[0], GM_TRANS_MIN, GM_TRANS_MAX) * + GM_TRANS_DECODE_FACTOR; + model[1] = (int32_t)clamp(model[1], GM_TRANS_MIN, GM_TRANS_MAX) * + GM_TRANS_DECODE_FACTOR; + + for (i = 2; i < 6; ++i) { + const int diag_value = ((i == 2 || i == 5) ?
(1 << GM_ALPHA_PREC_BITS) : 0); + model[i] = (int32_t)floor(params[i] * (1 << GM_ALPHA_PREC_BITS) + 0.5); + model[i] = + (int32_t)clamp(model[i] - diag_value, GM_ALPHA_MIN, GM_ALPHA_MAX); + alpha_present |= (model[i] != 0); + model[i] = (model[i] + diag_value) * GM_ALPHA_DECODE_FACTOR; + } + for (; i < 8; ++i) { + model[i] = (int32_t)floor(params[i] * (1 << GM_ROW3HOMO_PREC_BITS) + 0.5); + model[i] = (int32_t)clamp(model[i], GM_ROW3HOMO_MIN, GM_ROW3HOMO_MAX) * + GM_ROW3HOMO_DECODE_FACTOR; + alpha_present |= (model[i] != 0); + } + + if (!alpha_present) { + if (abs(model[0]) < MIN_TRANS_THRESH && abs(model[1]) < MIN_TRANS_THRESH) { + model[0] = 0; + model[1] = 0; + } + } +} + +void av1_convert_model_to_params(const double *params, + WarpedMotionParams *model) { + convert_to_params(params, model->wmmat); + model->wmtype = get_wmtype(model); + model->invalid = 0; +} + +// Adds some offset to a global motion parameter and handles +// all of the necessary precision shifts, clamping, and +// zero-centering. +static int32_t add_param_offset(int param_index, int32_t param_value, + int32_t offset) { + const int scale_vals[3] = { GM_TRANS_PREC_DIFF, GM_ALPHA_PREC_DIFF, + GM_ROW3HOMO_PREC_DIFF }; + const int clamp_vals[3] = { GM_TRANS_MAX, GM_ALPHA_MAX, GM_ROW3HOMO_MAX }; + // type of param: 0 - translation, 1 - affine, 2 - homography + const int param_type = (param_index < 2 ? 0 : (param_index < 6 ? 1 : 2)); + const int is_one_centered = (param_index == 2 || param_index == 5); + + // Make parameter zero-centered and offset the shift that was done to make + // it compatible with the warped model + param_value = (param_value - (is_one_centered << WARPEDMODEL_PREC_BITS)) >> + scale_vals[param_type]; + // Add desired offset to the rescaled/zero-centered parameter + param_value += offset; + // Clamp the parameter so it does not overflow the number of bits allotted + // to it in the bitstream + param_value = (int32_t)clamp(param_value, -clamp_vals[param_type], + clamp_vals[param_type]); + // Rescale the parameter to WARPEDMODEL_PRECISION_BITS so it is compatible + // with the warped motion library + param_value *= (1 << scale_vals[param_type]); + + // Undo the zero-centering step if necessary + return param_value + (is_one_centered << WARPEDMODEL_PREC_BITS); +} + +static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) { + switch (wmtype) { + case IDENTITY: + wm->wmmat[0] = 0; + wm->wmmat[1] = 0; + AOM_FALLTHROUGH_INTENDED; + case TRANSLATION: + wm->wmmat[2] = 1 << WARPEDMODEL_PREC_BITS; + wm->wmmat[3] = 0; + AOM_FALLTHROUGH_INTENDED; + case ROTZOOM: + wm->wmmat[4] = -wm->wmmat[3]; + wm->wmmat[5] = wm->wmmat[2]; + AOM_FALLTHROUGH_INTENDED; + case AFFINE: wm->wmmat[6] = wm->wmmat[7] = 0; break; + default: assert(0); + } + wm->wmtype = wmtype; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static int64_t highbd_warp_error( + WarpedMotionParams *wm, const uint16_t *const ref, int width, int height, + int stride, const uint16_t *const dst, int p_col, int p_row, int p_width, + int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, + int64_t best_error, uint8_t *segment_map, int segment_map_stride) { + int64_t gm_sumerr = 0; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]; + + ConvolveParams conv_params = get_conv_params(0, 0, bd); + conv_params.use_dist_wtd_comp_avg = 0; + for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { + for (int j = 
p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + // avoid warping extra 8x8 blocks in the padded region of the frame + // when p_width and p_height are not multiples of WARP_ERROR_BLOCK + const int warp_w = AOMMIN(error_bsize_w, p_col + p_width - j); + const int warp_h = AOMMIN(error_bsize_h, p_row + p_height - i); + highbd_warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, + warp_h, WARP_ERROR_BLOCK, subsampling_x, subsampling_y, + bd, &conv_params); + gm_sumerr += av1_calc_highbd_frame_error(tmp, WARP_ERROR_BLOCK, + dst + j + i * p_stride, warp_w, + warp_h, p_stride, bd); + if (gm_sumerr > best_error) return INT64_MAX; + } + } + return gm_sumerr; +} +#endif + +static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref, + int width, int height, int stride, + const uint8_t *const dst, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + int64_t best_error, uint8_t *segment_map, + int segment_map_stride) { + int64_t gm_sumerr = 0; + int warp_w, warp_h; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + DECLARE_ALIGNED(16, uint8_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]); + ConvolveParams conv_params = get_conv_params(0, 0, 8); + conv_params.use_dist_wtd_comp_avg = 0; + + for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { + for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + // avoid warping extra 8x8 blocks in the padded region of the frame + // when p_width and p_height are not multiples of WARP_ERROR_BLOCK + warp_w = AOMMIN(error_bsize_w, p_col + p_width - j); + warp_h = AOMMIN(error_bsize_h, p_row + p_height - i); + warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h, + WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params); + + gm_sumerr += + av1_calc_frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride, + warp_w, warp_h, p_stride); + if (gm_sumerr > best_error) return INT64_MAX; + } + } + return gm_sumerr; +} + +int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd, + const uint8_t *ref, int width, int height, int stride, + uint8_t *dst, int p_col, int p_row, int p_width, + int p_height, int p_stride, int subsampling_x, + int subsampling_y, int64_t best_error, + uint8_t *segment_map, int segment_map_stride) { + if (wm->wmtype <= AFFINE) + if (!av1_get_shear_params(wm)) return INT64_MAX; +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) + return highbd_warp_error(wm, CONVERT_TO_SHORTPTR(ref), width, height, + stride, CONVERT_TO_SHORTPTR(dst), p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, bd, best_error, segment_map, + segment_map_stride); +#endif + (void)use_hbd; + (void)bd; + return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, + best_error, segment_map, segment_map_stride); +} + +// Factors used to calculate the thresholds for av1_warp_error +static double 
thresh_factors[GM_REFINEMENT_COUNT] = { 1.25, 1.20, 1.15, 1.10, + 1.05 }; + +static INLINE int64_t calc_approx_erroradv_threshold( + double scaling_factor, int64_t erroradv_threshold) { + return erroradv_threshold < + (int64_t)(((double)INT64_MAX / scaling_factor) + 0.5) + ? (int64_t)(scaling_factor * erroradv_threshold + 0.5) + : INT64_MAX; +} + +int64_t av1_refine_integerized_param( + WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd, + uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst, + int d_width, int d_height, int d_stride, int n_refinements, + int64_t best_frame_error, uint8_t *segment_map, int segment_map_stride, + int64_t erroradv_threshold) { + static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 }; + const int border = ERRORADV_BORDER; + int i = 0, p; + int n_params = max_trans_model_params[wmtype]; + int32_t *param_mat = wm->wmmat; + int64_t step_error, best_error; + int32_t step; + int32_t *param; + int32_t curr_param; + int32_t best_param; + + force_wmtype(wm, wmtype); + best_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, d_stride, 0, + 0, best_frame_error, segment_map, segment_map_stride); + best_error = AOMMIN(best_error, best_frame_error); + step = 1 << (n_refinements - 1); + for (i = 0; i < n_refinements; i++, step >>= 1) { + int64_t error_adv_thresh = + calc_approx_erroradv_threshold(thresh_factors[i], erroradv_threshold); + for (p = 0; p < n_params; ++p) { + int step_dir = 0; + // Skip searches for parameters that are forced to be 0 + param = param_mat + p; + curr_param = *param; + best_param = curr_param; + // look to the left + *param = add_param_offset(p, curr_param, -step); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, d_stride, + 0, 0, AOMMIN(best_error, error_adv_thresh), + segment_map, segment_map_stride); + if (step_error < best_error) { + best_error = step_error; + best_param = *param; + step_dir = -1; + } + + // look to the right + *param = add_param_offset(p, curr_param, step); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, d_stride, + 0, 0, AOMMIN(best_error, error_adv_thresh), + segment_map, segment_map_stride); + if (step_error < best_error) { + best_error = step_error; + best_param = *param; + step_dir = 1; + } + *param = best_param; + + // look to the direction chosen above repeatedly until error increases + // for the biggest step size + while (step_dir) { + *param = add_param_offset(p, best_param, step * step_dir); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, + d_stride, 0, 0, AOMMIN(best_error, error_adv_thresh), + segment_map, segment_map_stride); + if (step_error < best_error) { + best_error = step_error; + best_param = *param; + } else { + *param = best_param; + step_dir = 0; + } + } + } + } + force_wmtype(wm, wmtype); + wm->wmtype = get_wmtype(wm); + return best_error; +} + +unsigned char *av1_downconvert_frame(YV12_BUFFER_CONFIG *frm, int bit_depth) { + int i, j; + uint16_t *orig_buf = CONVERT_TO_SHORTPTR(frm->y_buffer); + uint8_t *buf_8bit = frm->y_buffer_8bit; + 
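// A quick sketch of the mapping performed below (descriptive note, not
+  // upstream): with bit_depth == 10, each 16-bit sample is narrowed by a
+  // plain right shift, e.g. mid-gray 512 becomes 512 >> (10 - 8) == 128.
+  // The 8-bit copy is cached in y_buffer_8bit and reused until
+  // buf_8bit_valid is cleared.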
+  assert(buf_8bit);
+  if (!frm->buf_8bit_valid) {
+    for (i = 0; i < frm->y_height; ++i) {
+      for (j = 0; j < frm->y_width; ++j) {
+        buf_8bit[i * frm->y_stride + j] =
+            orig_buf[i * frm->y_stride + j] >> (bit_depth - 8);
+      }
+    }
+    frm->buf_8bit_valid = 1;
+  }
+  return buf_8bit;
+}
+
+static void get_inliers_from_indices(MotionModel *params,
+                                     int *correspondences) {
+  int *inliers_tmp = (int *)aom_malloc(2 * MAX_CORNERS * sizeof(*inliers_tmp));
+  memset(inliers_tmp, 0, 2 * MAX_CORNERS * sizeof(*inliers_tmp));
+
+  for (int i = 0; i < params->num_inliers; i++) {
+    int index = params->inliers[i];
+    inliers_tmp[2 * i] = correspondences[4 * index];
+    inliers_tmp[2 * i + 1] = correspondences[4 * index + 1];
+  }
+  memcpy(params->inliers, inliers_tmp, sizeof(*inliers_tmp) * 2 * MAX_CORNERS);
+  aom_free(inliers_tmp);
+}
+
+#define FEAT_COUNT_TR 3
+#define SEG_COUNT_TR 0.40
+void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
+                                          int height, int *inliers,
+                                          int num_inliers) {
+  int seg_count = 0;
+  memset(segment_map, 0, sizeof(*segment_map) * width * height);
+
+  for (int i = 0; i < num_inliers; i++) {
+    int x = inliers[i * 2];
+    int y = inliers[i * 2 + 1];
+    int seg_x = x >> WARP_ERROR_BLOCK_LOG;
+    int seg_y = y >> WARP_ERROR_BLOCK_LOG;
+    segment_map[seg_y * width + seg_x] += 1;
+  }
+
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      uint8_t feat_count = segment_map[i * width + j];
+      segment_map[i * width + j] = (feat_count >= FEAT_COUNT_TR);
+      seg_count += (segment_map[i * width + j]);
+    }
+  }
+
+  // If this motion does not make up a large enough portion of the frame,
+  // use the unsegmented version of the error metric
+  if (seg_count < (width * height * SEG_COUNT_TR))
+    memset(segment_map, 1, width * height * sizeof(*segment_map));
+}
+
+static int compute_global_motion_feature_based(
+    TransformationType type, unsigned char *frm_buffer, int frm_width,
+    int frm_height, int frm_stride, int *frm_corners, int num_frm_corners,
+    YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion,
+    MotionModel *params_by_motion, int num_motions) {
+  int i;
+  int num_ref_corners;
+  int num_correspondences;
+  int *correspondences;
+  int ref_corners[2 * MAX_CORNERS];
+  unsigned char *ref_buffer = ref->y_buffer;
+  RansacFunc ransac = av1_get_ransac_type(type);
+
+  if (ref->flags & YV12_FLAG_HIGHBITDEPTH) {
+    ref_buffer = av1_downconvert_frame(ref, bit_depth);
+  }
+
+  num_ref_corners =
+      av1_fast_corner_detect(ref_buffer, ref->y_width, ref->y_height,
+                             ref->y_stride, ref_corners, MAX_CORNERS);
+
+  // find correspondences between the two images
+  correspondences =
+      (int *)malloc(num_frm_corners * 4 * sizeof(*correspondences));
+  num_correspondences = av1_determine_correspondence(
+      frm_buffer, (int *)frm_corners, num_frm_corners, ref_buffer,
+      (int *)ref_corners, num_ref_corners, frm_width, frm_height, frm_stride,
+      ref->y_stride, correspondences);
+
+  ransac(correspondences, num_correspondences, num_inliers_by_motion,
+         params_by_motion, num_motions);
+
+  // Set num_inliers = 0 for motions with too few inliers so they are ignored.
+  for (i = 0; i < num_motions; ++i) {
+    if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences ||
+        num_correspondences == 0) {
+      num_inliers_by_motion[i] = 0;
+    } else {
+      get_inliers_from_indices(&params_by_motion[i], correspondences);
+    }
+  }
+
+  free(correspondences);
+
+  // Return true if any one of the motions has inliers.
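+  // (Descriptive note: 1 means at least one candidate model kept enough
+  // inliers to be usable; 0 means no usable global motion model was found
+  // for this reference frame.)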
+ for (i = 0; i < num_motions; ++i) { + if (num_inliers_by_motion[i] > 0) return 1; + } + return 0; +} + +// Don't use points around the frame border since they are less reliable +static INLINE int valid_point(int x, int y, int width, int height) { + return (x > (PATCH_SIZE + PATCH_CENTER)) && + (x < (width - PATCH_SIZE - PATCH_CENTER)) && + (y > (PATCH_SIZE + PATCH_CENTER)) && + (y < (height - PATCH_SIZE - PATCH_CENTER)); +} + +static int determine_disflow_correspondence(int *frm_corners, + int num_frm_corners, double *flow_u, + double *flow_v, int width, + int height, int stride, + double *correspondences) { + int num_correspondences = 0; + int x, y; + for (int i = 0; i < num_frm_corners; ++i) { + x = frm_corners[2 * i]; + y = frm_corners[2 * i + 1]; + if (valid_point(x, y, width, height)) { + correspondences[4 * num_correspondences] = x; + correspondences[4 * num_correspondences + 1] = y; + correspondences[4 * num_correspondences + 2] = x + flow_u[y * stride + x]; + correspondences[4 * num_correspondences + 3] = y + flow_v[y * stride + x]; + num_correspondences++; + } + } + return num_correspondences; +} + +static double getCubicValue(double p[4], double x) { + return p[1] + 0.5 * x * + (p[2] - p[0] + + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + + x * (3.0 * (p[1] - p[2]) + p[3] - p[0]))); +} + +static void get_subcolumn(unsigned char *ref, double col[4], int stride, int x, + int y_start) { + int i; + for (i = 0; i < 4; ++i) { + col[i] = ref[(i + y_start) * stride + x]; + } +} + +static double bicubic(unsigned char *ref, double x, double y, int stride) { + double arr[4]; + int k; + int i = (int)x; + int j = (int)y; + for (k = 0; k < 4; ++k) { + double arr_temp[4]; + get_subcolumn(ref, arr_temp, stride, i + k - 1, j - 1); + arr[k] = getCubicValue(arr_temp, y - j); + } + return getCubicValue(arr, x - i); +} + +// Interpolate a warped block using bicubic interpolation when possible +static unsigned char interpolate(unsigned char *ref, double x, double y, + int width, int height, int stride) { + if (x < 0 && y < 0) + return ref[0]; + else if (x < 0 && y > height - 1) + return ref[(height - 1) * stride]; + else if (x > width - 1 && y < 0) + return ref[width - 1]; + else if (x > width - 1 && y > height - 1) + return ref[(height - 1) * stride + (width - 1)]; + else if (x < 0) { + int v; + int i = (int)y; + double a = y - i; + if (y > 1 && y < height - 2) { + double arr[4]; + get_subcolumn(ref, arr, stride, 0, i - 1); + return clamp((int)(getCubicValue(arr, a) + 0.5), 0, 255); + } + v = (int)(ref[i * stride] * (1 - a) + ref[(i + 1) * stride] * a + 0.5); + return clamp(v, 0, 255); + } else if (y < 0) { + int v; + int j = (int)x; + double b = x - j; + if (x > 1 && x < width - 2) { + double arr[4] = { ref[j - 1], ref[j], ref[j + 1], ref[j + 2] }; + return clamp((int)(getCubicValue(arr, b) + 0.5), 0, 255); + } + v = (int)(ref[j] * (1 - b) + ref[j + 1] * b + 0.5); + return clamp(v, 0, 255); + } else if (x > width - 1) { + int v; + int i = (int)y; + double a = y - i; + if (y > 1 && y < height - 2) { + double arr[4]; + get_subcolumn(ref, arr, stride, width - 1, i - 1); + return clamp((int)(getCubicValue(arr, a) + 0.5), 0, 255); + } + v = (int)(ref[i * stride + width - 1] * (1 - a) + + ref[(i + 1) * stride + width - 1] * a + 0.5); + return clamp(v, 0, 255); + } else if (y > height - 1) { + int v; + int j = (int)x; + double b = x - j; + if (x > 1 && x < width - 2) { + int row = (height - 1) * stride; + double arr[4] = { ref[row + j - 1], ref[row + j], ref[row + j + 1], + ref[row + j + 2] }; + 
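// Descriptive note: cubic fit along x through the four bottom-row
+      // samples gathered above, clamped to the 8-bit range below.
+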
return clamp((int)(getCubicValue(arr, b) + 0.5), 0, 255); + } + v = (int)(ref[(height - 1) * stride + j] * (1 - b) + + ref[(height - 1) * stride + j + 1] * b + 0.5); + return clamp(v, 0, 255); + } else if (x > 1 && y > 1 && x < width - 2 && y < height - 2) { + return clamp((int)(bicubic(ref, x, y, stride) + 0.5), 0, 255); + } else { + int i = (int)y; + int j = (int)x; + double a = y - i; + double b = x - j; + int v = (int)(ref[i * stride + j] * (1 - a) * (1 - b) + + ref[i * stride + j + 1] * (1 - a) * b + + ref[(i + 1) * stride + j] * a * (1 - b) + + ref[(i + 1) * stride + j + 1] * a * b); + return clamp(v, 0, 255); + } +} + +// Warps a block using flow vector [u, v] and computes the mse +static double compute_warp_and_error(unsigned char *ref, unsigned char *frm, + int width, int height, int stride, int x, + int y, double u, double v, int16_t *dt) { + int i, j; + unsigned char warped; + double x_w, y_w; + double mse = 0; + int16_t err = 0; + for (i = y; i < y + PATCH_SIZE; ++i) + for (j = x; j < x + PATCH_SIZE; ++j) { + x_w = (double)j + u; + y_w = (double)i + v; + warped = interpolate(ref, x_w, y_w, width, height, stride); + err = warped - frm[j + i * stride]; + mse += err * err; + dt[(i - y) * PATCH_SIZE + (j - x)] = err; + } + + mse /= (PATCH_SIZE * PATCH_SIZE); + return mse; +} + +// Computes the components of the system of equations used to solve for +// a flow vector. This includes: +// 1.) The hessian matrix for optical flow. This matrix is in the +// form of: +// +// M = |sum(dx * dx) sum(dx * dy)| +// |sum(dx * dy) sum(dy * dy)| +// +// 2.) b = |sum(dx * dt)| +// |sum(dy * dt)| +// Where the sums are computed over a square window of PATCH_SIZE. +static INLINE void compute_flow_system(const double *dx, int dx_stride, + const double *dy, int dy_stride, + const int16_t *dt, int dt_stride, + double *M, double *b) { + for (int i = 0; i < PATCH_SIZE; i++) { + for (int j = 0; j < PATCH_SIZE; j++) { + M[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j]; + M[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j]; + M[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j]; + + b[0] += dx[i * dx_stride + j] * dt[i * dt_stride + j]; + b[1] += dy[i * dy_stride + j] * dt[i * dt_stride + j]; + } + } + + M[2] = M[1]; +} + +// Solves a general Mx = b where M is a 2x2 matrix and b is a 2x1 matrix +static INLINE void solve_2x2_system(const double *M, const double *b, + double *output_vec) { + double M_0 = M[0]; + double M_3 = M[3]; + double det = (M_0 * M_3) - (M[1] * M[2]); + if (det < 1e-5) { + // Handle singular matrix + // TODO(sarahparker) compare results using pseudo inverse instead + M_0 += 1e-10; + M_3 += 1e-10; + det = (M_0 * M_3) - (M[1] * M[2]); + } + const double det_inv = 1 / det; + const double mult_b0 = det_inv * b[0]; + const double mult_b1 = det_inv * b[1]; + output_vec[0] = M_3 * mult_b0 - M[1] * mult_b1; + output_vec[1] = -M[2] * mult_b0 + M_0 * mult_b1; +} + +/* +static INLINE void image_difference(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int16_t *dst, int dst_stride, int height, + int width) { + const int block_unit = 8; + // Take difference in 8x8 blocks to make use of optimized diff function + for (int i = 0; i < height; i += block_unit) { + for (int j = 0; j < width; j += block_unit) { + aom_subtract_block(block_unit, block_unit, dst + i * dst_stride + j, + dst_stride, src + i * src_stride + j, src_stride, + ref + i * ref_stride + j, ref_stride); + } + } +} +*/ + +// Compute an image gradient using a sobel filter. 
+// If dir == 1, compute the x gradient. If dir == 0, compute y. This function +// assumes the images have been padded so that they can be processed in units +// of 8. +static INLINE void sobel_xy_image_gradient(const uint8_t *src, int src_stride, + double *dst, int dst_stride, + int height, int width, int dir) { + double norm = 1.0; + // TODO(sarahparker) experiment with doing this over larger block sizes + const int block_unit = 8; + // Filter in 8x8 blocks to eventually make use of optimized convolve function + for (int i = 0; i < height; i += block_unit) { + for (int j = 0; j < width; j += block_unit) { + av1_convolve_2d_sobel_y_c(src + i * src_stride + j, src_stride, + dst + i * dst_stride + j, dst_stride, + block_unit, block_unit, dir, norm); + } + } +} + +static ImagePyramid *alloc_pyramid(int width, int height, int pad_size, + int compute_gradient) { + ImagePyramid *pyr = aom_malloc(sizeof(*pyr)); + pyr->has_gradient = compute_gradient; + // 2 * width * height is the upper bound for a buffer that fits + // all pyramid levels + padding for each level + const int buffer_size = sizeof(*pyr->level_buffer) * 2 * width * height + + (width + 2 * pad_size) * 2 * pad_size * N_LEVELS; + pyr->level_buffer = aom_malloc(buffer_size); + memset(pyr->level_buffer, 0, buffer_size); + + if (compute_gradient) { + const int gradient_size = + sizeof(*pyr->level_dx_buffer) * 2 * width * height + + (width + 2 * pad_size) * 2 * pad_size * N_LEVELS; + pyr->level_dx_buffer = aom_malloc(gradient_size); + pyr->level_dy_buffer = aom_malloc(gradient_size); + memset(pyr->level_dx_buffer, 0, gradient_size); + memset(pyr->level_dy_buffer, 0, gradient_size); + } + return pyr; +} + +static void free_pyramid(ImagePyramid *pyr) { + aom_free(pyr->level_buffer); + if (pyr->has_gradient) { + aom_free(pyr->level_dx_buffer); + aom_free(pyr->level_dy_buffer); + } + aom_free(pyr); +} + +static INLINE void update_level_dims(ImagePyramid *frm_pyr, int level) { + frm_pyr->widths[level] = frm_pyr->widths[level - 1] >> 1; + frm_pyr->heights[level] = frm_pyr->heights[level - 1] >> 1; + frm_pyr->strides[level] = frm_pyr->widths[level] + 2 * frm_pyr->pad_size; + // Point the beginning of the next level buffer to the correct location inside + // the padded border + frm_pyr->level_loc[level] = + frm_pyr->level_loc[level - 1] + + frm_pyr->strides[level - 1] * + (2 * frm_pyr->pad_size + frm_pyr->heights[level - 1]); +} + +// Compute coarse to fine pyramids for a frame +static void compute_flow_pyramids(unsigned char *frm, const int frm_width, + const int frm_height, const int frm_stride, + int n_levels, int pad_size, int compute_grad, + ImagePyramid *frm_pyr) { + int cur_width, cur_height, cur_stride, cur_loc; + assert((frm_width >> n_levels) > 0); + assert((frm_height >> n_levels) > 0); + + // Initialize first level + frm_pyr->n_levels = n_levels; + frm_pyr->pad_size = pad_size; + frm_pyr->widths[0] = frm_width; + frm_pyr->heights[0] = frm_height; + frm_pyr->strides[0] = frm_width + 2 * frm_pyr->pad_size; + // Point the beginning of the level buffer to the location inside + // the padded border + frm_pyr->level_loc[0] = + frm_pyr->strides[0] * frm_pyr->pad_size + frm_pyr->pad_size; + // This essentially copies the original buffer into the pyramid buffer + // without the original padding + av1_resize_plane(frm, frm_height, frm_width, frm_stride, + frm_pyr->level_buffer + frm_pyr->level_loc[0], + frm_pyr->heights[0], frm_pyr->widths[0], + frm_pyr->strides[0]); + + if (compute_grad) { + cur_width = frm_pyr->widths[0]; + cur_height = 
frm_pyr->heights[0];
+    cur_stride = frm_pyr->strides[0];
+    cur_loc = frm_pyr->level_loc[0];
+    assert(frm_pyr->has_gradient && frm_pyr->level_dx_buffer != NULL &&
+           frm_pyr->level_dy_buffer != NULL);
+    // Compute the x gradient
+    sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+                            frm_pyr->level_dx_buffer + cur_loc, cur_stride,
+                            cur_height, cur_width, 1);
+
+    // Compute the y gradient
+    sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+                            frm_pyr->level_dy_buffer + cur_loc, cur_stride,
+                            cur_height, cur_width, 0);
+  }
+
+  // Start at the finest level and resize down to the coarsest level
+  for (int level = 1; level < n_levels; ++level) {
+    update_level_dims(frm_pyr, level);
+    cur_width = frm_pyr->widths[level];
+    cur_height = frm_pyr->heights[level];
+    cur_stride = frm_pyr->strides[level];
+    cur_loc = frm_pyr->level_loc[level];
+
+    av1_resize_plane(frm_pyr->level_buffer + frm_pyr->level_loc[level - 1],
+                     frm_pyr->heights[level - 1], frm_pyr->widths[level - 1],
+                     frm_pyr->strides[level - 1],
+                     frm_pyr->level_buffer + cur_loc, cur_height, cur_width,
+                     cur_stride);
+
+    if (compute_grad) {
+      assert(frm_pyr->has_gradient && frm_pyr->level_dx_buffer != NULL &&
+             frm_pyr->level_dy_buffer != NULL);
+      // Compute the x gradient
+      sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+                              frm_pyr->level_dx_buffer + cur_loc, cur_stride,
+                              cur_height, cur_width, 1);
+
+      // Compute the y gradient
+      sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+                              frm_pyr->level_dy_buffer + cur_loc, cur_stride,
+                              cur_height, cur_width, 0);
+    }
+  }
+}
+
+static INLINE void compute_flow_at_point(unsigned char *frm, unsigned char *ref,
+                                         double *dx, double *dy, int x, int y,
+                                         int width, int height, int stride,
+                                         double *u, double *v) {
+  double M[4] = { 0 };
+  double b[2] = { 0 };
+  double tmp_output_vec[2] = { 0 };
+  double error = 0;
+  int16_t dt[PATCH_SIZE * PATCH_SIZE];
+  double o_u = *u;
+  double o_v = *v;
+
+  for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
+    error = compute_warp_and_error(ref, frm, width, height, stride, x, y, *u,
+                                   *v, dt);
+    if (error <= DISFLOW_ERROR_TR) break;
+    compute_flow_system(dx, stride, dy, stride, dt, PATCH_SIZE, M, b);
+    solve_2x2_system(M, b, tmp_output_vec);
+    *u += tmp_output_vec[0];
+    *v += tmp_output_vec[1];
+  }
+  // Reject the solve if either flow component drifted by more than a patch
+  // width from its starting value.
+  if (fabs(*u - o_u) > PATCH_SIZE || fabs(*v - o_v) > PATCH_SIZE) {
+    *u = o_u;
+    *v = o_v;
+  }
+}
+
+// Note: flow_u and flow_v must be zero-initialized by the caller.
+static void compute_flow_field(ImagePyramid *frm_pyr, ImagePyramid *ref_pyr,
+                               double *flow_u, double *flow_v) {
+  int cur_width, cur_height, cur_stride, cur_loc, patch_loc, patch_center;
+  double *u_upscale =
+      aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
+  double *v_upscale =
+      aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
+
+  assert(frm_pyr->n_levels == ref_pyr->n_levels);
+
+  // Compute flow field from coarsest to finest level of the pyramid
+  for (int level = frm_pyr->n_levels - 1; level >= 0; --level) {
+    cur_width = frm_pyr->widths[level];
+    cur_height = frm_pyr->heights[level];
+    cur_stride = frm_pyr->strides[level];
+    cur_loc = frm_pyr->level_loc[level];
+
+    for (int i = PATCH_SIZE; i < cur_height - PATCH_SIZE; i += PATCH_STEP) {
+      for (int j = PATCH_SIZE; j < cur_width - PATCH_SIZE; j += PATCH_STEP) {
+        patch_loc = i * cur_stride + j;
+        patch_center = patch_loc + PATCH_CENTER * cur_stride + PATCH_CENTER;
+        compute_flow_at_point(frm_pyr->level_buffer + cur_loc,
+                              ref_pyr->level_buffer + cur_loc,
+
frm_pyr->level_dx_buffer + cur_loc + patch_loc, + frm_pyr->level_dy_buffer + cur_loc + patch_loc, j, + i, cur_width, cur_height, cur_stride, + flow_u + patch_center, flow_v + patch_center); + } + } + // TODO(sarahparker) Replace this with upscale function in resize.c + if (level > 0) { + int h_upscale = frm_pyr->heights[level - 1]; + int w_upscale = frm_pyr->widths[level - 1]; + int s_upscale = frm_pyr->strides[level - 1]; + for (int i = 0; i < h_upscale; ++i) { + for (int j = 0; j < w_upscale; ++j) { + u_upscale[j + i * s_upscale] = + flow_u[(int)(j >> 1) + (int)(i >> 1) * cur_stride]; + v_upscale[j + i * s_upscale] = + flow_v[(int)(j >> 1) + (int)(i >> 1) * cur_stride]; + } + } + memcpy(flow_u, u_upscale, + frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u)); + memcpy(flow_v, v_upscale, + frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v)); + } + } + aom_free(u_upscale); + aom_free(v_upscale); +} + +static int compute_global_motion_disflow_based( + TransformationType type, unsigned char *frm_buffer, int frm_width, + int frm_height, int frm_stride, int *frm_corners, int num_frm_corners, + YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion, + MotionModel *params_by_motion, int num_motions) { + unsigned char *ref_buffer = ref->y_buffer; + const int ref_width = ref->y_width; + const int ref_height = ref->y_height; + const int pad_size = AOMMAX(PATCH_SIZE, MIN_PAD); + int num_correspondences; + double *correspondences; + RansacFuncDouble ransac = av1_get_ransac_double_prec_type(type); + assert(frm_width == ref_width); + assert(frm_height == ref_height); + + // Ensure the number of pyramid levels will work with the frame resolution + const int msb = + frm_width < frm_height ? get_msb(frm_width) : get_msb(frm_height); + const int n_levels = AOMMIN(msb, N_LEVELS); + + if (ref->flags & YV12_FLAG_HIGHBITDEPTH) { + ref_buffer = av1_downconvert_frame(ref, bit_depth); + } + + // TODO(sarahparker) We will want to do the source pyramid computation + // outside of this function so it doesn't get recomputed for every + // reference. We also don't need to compute every pyramid level for the + // reference in advance, since lower levels can be overwritten once their + // flow field is computed and upscaled. I'll add these optimizations + // once the full implementation is working. 
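+  //
+  // In outline, the code below (descriptive summary): (1) builds image and
+  // gradient pyramids for the source frame and an image pyramid for the
+  // reference, (2) runs the iterative patch-based flow solve from the
+  // coarsest level down, upscaling the field between levels, (3) turns the
+  // per-corner flow vectors into point correspondences, and (4) fits the
+  // requested motion model to the correspondences with RANSAC.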
+ // Allocate frm image pyramids + int compute_gradient = 1; + ImagePyramid *frm_pyr = + alloc_pyramid(frm_width, frm_height, pad_size, compute_gradient); + compute_flow_pyramids(frm_buffer, frm_width, frm_height, frm_stride, n_levels, + pad_size, compute_gradient, frm_pyr); + // Allocate ref image pyramids + compute_gradient = 0; + ImagePyramid *ref_pyr = + alloc_pyramid(ref_width, ref_height, pad_size, compute_gradient); + compute_flow_pyramids(ref_buffer, ref_width, ref_height, ref->y_stride, + n_levels, pad_size, compute_gradient, ref_pyr); + + double *flow_u = + aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u)); + double *flow_v = + aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v)); + + memset(flow_u, 0, + frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u)); + memset(flow_v, 0, + frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v)); + + compute_flow_field(frm_pyr, ref_pyr, flow_u, flow_v); + + // find correspondences between the two images using the flow field + correspondences = aom_malloc(num_frm_corners * 4 * sizeof(*correspondences)); + num_correspondences = determine_disflow_correspondence( + frm_corners, num_frm_corners, flow_u, flow_v, frm_width, frm_height, + frm_pyr->strides[0], correspondences); + ransac(correspondences, num_correspondences, num_inliers_by_motion, + params_by_motion, num_motions); + + free_pyramid(frm_pyr); + free_pyramid(ref_pyr); + aom_free(correspondences); + aom_free(flow_u); + aom_free(flow_v); + // Set num_inliers = 0 for motions with too few inliers so they are ignored. + for (int i = 0; i < num_motions; ++i) { + if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences) { + num_inliers_by_motion[i] = 0; + } + } + + // Return true if any one of the motions has inliers. + for (int i = 0; i < num_motions; ++i) { + if (num_inliers_by_motion[i] > 0) return 1; + } + return 0; +} + +int av1_compute_global_motion(TransformationType type, + unsigned char *frm_buffer, int frm_width, + int frm_height, int frm_stride, int *frm_corners, + int num_frm_corners, YV12_BUFFER_CONFIG *ref, + int bit_depth, + GlobalMotionEstimationType gm_estimation_type, + int *num_inliers_by_motion, + MotionModel *params_by_motion, int num_motions) { + switch (gm_estimation_type) { + case GLOBAL_MOTION_FEATURE_BASED: + return compute_global_motion_feature_based( + type, frm_buffer, frm_width, frm_height, frm_stride, frm_corners, + num_frm_corners, ref, bit_depth, num_inliers_by_motion, + params_by_motion, num_motions); + case GLOBAL_MOTION_DISFLOW_BASED: + return compute_global_motion_disflow_based( + type, frm_buffer, frm_width, frm_height, frm_stride, frm_corners, + num_frm_corners, ref, bit_depth, num_inliers_by_motion, + params_by_motion, num_motions); + default: assert(0 && "Unknown global motion estimation type"); + } + return 0; +} diff --git a/libs/libaom/src/av1/encoder/global_motion.h b/libs/libaom/src/av1/encoder/global_motion.h new file mode 100644 index 000000000..0a6d0ecac --- /dev/null +++ b/libs/libaom/src/av1/encoder/global_motion.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+#define AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+
+#include "aom/aom_integer.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/mv.h"
+#include "av1/common/warped_motion.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_CORNERS 4096
+#define RANSAC_NUM_MOTIONS 1
+#define GM_REFINEMENT_COUNT 5
+
+typedef enum {
+  GLOBAL_MOTION_FEATURE_BASED,
+  GLOBAL_MOTION_DISFLOW_BASED,
+} GlobalMotionEstimationType;
+
+unsigned char *av1_downconvert_frame(YV12_BUFFER_CONFIG *frm, int bit_depth);
+
+typedef struct {
+  double params[MAX_PARAMDIM - 1];
+  int *inliers;
+  int num_inliers;
+} MotionModel;
+
+void av1_convert_model_to_params(const double *params,
+                                 WarpedMotionParams *model);
+
+// TODO(sarahparker) These need to be retuned for speed 0 and 1 to
+// maximize gains from segmented error metric
+static const double erroradv_tr[] = { 0.65, 0.60, 0.65 };
+static const double erroradv_prod_tr[] = { 20000, 18000, 16000 };
+
+int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost,
+                                 int erroradv_type);
+
+void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
+                                          int height, int *inliers,
+                                          int num_inliers);
+
+// Returns the error between the result of applying motion 'wm' to the frame
+// described by 'ref' and the frame described by 'dst'.
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
+                       const uint8_t *ref, int width, int height, int stride,
+                       uint8_t *dst, int p_col, int p_row, int p_width,
+                       int p_height, int p_stride, int subsampling_x,
+                       int subsampling_y, int64_t best_error,
+                       uint8_t *segment_map, int segment_map_stride);
+
+// Returns the av1_warp_error between "dst" and the result of applying the
+// motion params that result from fine-tuning "wm" to "ref". Note that "wm" is
+// modified in place.
+int64_t av1_refine_integerized_param(
+    WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd,
+    uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst,
+    int d_width, int d_height, int d_stride, int n_refinements,
+    int64_t best_frame_error, uint8_t *segment_map, int segment_map_stride,
+    int64_t erroradv_threshold);
+
+/*
+  Computes "num_motions" candidate global motion parameters between two frames.
+  The array "params_by_motion" should be length 8 * "num_motions". The ordering
+  of each set of parameters is best described by the homography:
+
+        [x'     (m2 m3 m0   [x
+    z .  y'  =   m4 m5 m1 *  y
+         1]      m6 m7 1)    1]
+
+  where m{i} represents the ith value in any given set of parameters.
+
+  "num_inliers" should be length "num_motions", and will be populated with the
+  number of inlier feature points for each motion. Params for which the
+  num_inliers entry is 0 should be ignored by the caller.
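+
+  As a purely illustrative example (not taken from the library), the set
+  { m0, m1, m2, m3, m4, m5, m6, m7 } = { 2, 3, 1, 0, 0, 1, 0, 0 } gives
+  z = m6 * x + m7 * y + 1 = 1, so x' = (m2 * x + m3 * y + m0) / z = x + 2
+  and y' = (m4 * x + m5 * y + m1) / z = y + 3, i.e. a pure translation by
+  (2, 3).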
+*/
+int av1_compute_global_motion(TransformationType type,
+                              unsigned char *frm_buffer, int frm_width,
+                              int frm_height, int frm_stride, int *frm_corners,
+                              int num_frm_corners, YV12_BUFFER_CONFIG *ref,
+                              int bit_depth,
+                              GlobalMotionEstimationType gm_estimation_type,
+                              int *num_inliers_by_motion,
+                              MotionModel *params_by_motion, int num_motions);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif  // AOM_AV1_ENCODER_GLOBAL_MOTION_H_
diff --git a/libs/libaom/src/av1/encoder/gop_structure.c b/libs/libaom/src/av1/encoder/gop_structure.c
new file mode 100644
index 000000000..1ed71a0f9
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/gop_structure.c
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "aom_ports/system_state.h"
+
+#include "av1/common/av1_common_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+
+// Set parameters for frames between 'start' and 'end' (excluding both
+// endpoints).
+static void set_multi_layer_params(const TWO_PASS *twopass,
+                                   GF_GROUP *const gf_group, RATE_CONTROL *rc,
+                                   FRAME_INFO *frame_info, int start, int end,
+                                   int *cur_frame_idx, int *frame_ind,
+                                   int arf_ind, int layer_depth) {
+  const int num_frames_to_process = end - start - 1;
+  assert(num_frames_to_process >= 0);
+  if (num_frames_to_process == 0) return;
+
+  // Either we are at the last level of the pyramid, or we don't have enough
+  // frames between 'start' and 'end' to create one more level.
+  if (layer_depth > gf_group->max_layer_depth_allowed ||
+      num_frames_to_process < 3) {
+    // Leaf nodes.
+    while (++start < end) {
+      gf_group->update_type[*frame_ind] = LF_UPDATE;
+      gf_group->arf_src_offset[*frame_ind] = 0;
+      ++*cur_frame_idx;
+      gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+      gf_group->frame_disp_idx[*frame_ind] = start;
+      gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS;
+      gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost(
+          twopass, rc, frame_info, start, end - start, 0, NULL, NULL);
+      gf_group->max_layer_depth =
+          AOMMAX(gf_group->max_layer_depth, layer_depth);
+      ++(*frame_ind);
+    }
+  } else {
+    const int m = (start + end) / 2;
+
+    // Internal ARF.
+    gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+    gf_group->arf_src_offset[*frame_ind] = m - start - 1;
+    gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+    gf_group->frame_disp_idx[*frame_ind] = m;
+    gf_group->layer_depth[*frame_ind] = layer_depth;
+
+    // Get the boost factor for intermediate ARF frames.
+    gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost(
+        twopass, rc, frame_info, m, end - m, m - start, NULL, NULL);
+    ++(*frame_ind);
+
+    // Frames displayed before this internal ARF.
+    set_multi_layer_params(twopass, gf_group, rc, frame_info, start, m,
+                           cur_frame_idx, frame_ind, 1, layer_depth + 1);
+
+    // Overlay for internal ARF.
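+    // (Descriptive note: this entry is emitted after the subtree of frames
+    // displayed before the internal ARF; it shares display index m with
+    // that ARF and marks the point where the frame is actually shown.)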
+ gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE; + gf_group->arf_src_offset[*frame_ind] = 0; + gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; + gf_group->frame_disp_idx[*frame_ind] = m; + gf_group->arf_boost[*frame_ind] = 0; + gf_group->layer_depth[*frame_ind] = layer_depth; + ++(*frame_ind); + + // Frames displayed after this internal ARF. + set_multi_layer_params(twopass, gf_group, rc, frame_info, m, end, + cur_frame_idx, frame_ind, arf_ind, layer_depth + 1); + } +} + +static int construct_multi_layer_gf_structure( + AV1_COMP *cpi, TWO_PASS *twopass, GF_GROUP *const gf_group, + RATE_CONTROL *rc, FRAME_INFO *const frame_info, int gf_interval, + FRAME_UPDATE_TYPE first_frame_update_type) { + int frame_index = 0; + + // Keyframe / Overlay frame / Golden frame. + assert(gf_interval >= 1); + assert(first_frame_update_type == KF_UPDATE || + first_frame_update_type == OVERLAY_UPDATE || + first_frame_update_type == GF_UPDATE); + + gf_group->update_type[frame_index] = first_frame_update_type; + gf_group->arf_src_offset[frame_index] = 0; + gf_group->cur_frame_idx[frame_index] = 0; + gf_group->layer_depth[frame_index] = + first_frame_update_type == OVERLAY_UPDATE ? MAX_ARF_LAYERS + 1 : 0; + gf_group->max_layer_depth = 0; + ++frame_index; + + // ALTREF. + const int use_altref = gf_group->max_layer_depth_allowed > 0; + if (use_altref) { + gf_group->update_type[frame_index] = ARF_UPDATE; + gf_group->arf_src_offset[frame_index] = gf_interval - 1; + gf_group->cur_frame_idx[frame_index] = 0; + gf_group->frame_disp_idx[frame_index] = gf_interval; + gf_group->layer_depth[frame_index] = 1; + gf_group->arf_boost[frame_index] = cpi->rc.gfu_boost; + gf_group->max_layer_depth = 1; + ++frame_index; + } + + int cur_frame_index = 0; + // Rest of the frames. + set_multi_layer_params(twopass, gf_group, rc, frame_info, 0, gf_interval, + &cur_frame_index, &frame_index, 0, use_altref + 1); + + // The end frame will be Overlay frame for an ARF GOP; otherwise set it to + // be GF, for consistency, which will be updated in the next GOP. + gf_group->update_type[frame_index] = use_altref ? 
OVERLAY_UPDATE : GF_UPDATE; + gf_group->arf_src_offset[frame_index] = 0; + return frame_index; +} + +#define CHECK_GF_PARAMETER 0 +#if CHECK_GF_PARAMETER +void check_frame_params(GF_GROUP *const gf_group, int gf_interval) { + static const char *update_type_strings[FRAME_UPDATE_TYPES] = { + "KF_UPDATE", "LF_UPDATE", "GF_UPDATE", + "ARF_UPDATE", "OVERLAY_UPDATE", "INTNL_OVERLAY_UPDATE", + "INTNL_ARF_UPDATE" + }; + FILE *fid = fopen("GF_PARAMS.txt", "a"); + + fprintf(fid, "\ngf_interval = {%d}\n", gf_interval); + for (int i = 0; i < gf_group->size; ++i) { + fprintf(fid, "#%2d : %s %d %d %d %d\n", i, + update_type_strings[gf_group->update_type[i]], + gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i], + gf_group->arf_update_idx[i], gf_group->pyramid_level[i]); + } + + fprintf(fid, "number of nodes in each level: \n"); + for (int i = 0; i < gf_group->pyramid_height; ++i) { + fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]); + } + fprintf(fid, "\n"); + fclose(fid); +} +#endif // CHECK_GF_PARAMETER + +#define REF_IDX(ref) ((ref)-LAST_FRAME) + +static INLINE void reset_ref_frame_idx(int *ref_idx, int reset_value) { + for (int i = 0; i < REF_FRAMES; ++i) ref_idx[i] = reset_value; +} + +static INLINE void set_ref_frame_disp_idx(GF_GROUP *const gf_group) { + for (int i = 0; i < gf_group->size; ++i) { + for (int ref = 0; ref < INTER_REFS_PER_FRAME + 1; ++ref) { + int ref_gop_idx = gf_group->ref_frame_gop_idx[i][ref]; + if (ref_gop_idx == -1) { + gf_group->ref_frame_disp_idx[i][ref] = -1; + } else { + gf_group->ref_frame_disp_idx[i][ref] = + gf_group->frame_disp_idx[ref_gop_idx]; + } + } + } +} + +static void set_gop_ref_frame_map(GF_GROUP *const gf_group) { + // Initialize the reference slots as all -1. + for (int frame_idx = 0; frame_idx < gf_group->size; ++frame_idx) + reset_ref_frame_idx(gf_group->ref_frame_gop_idx[frame_idx], -1); + + // Set the map for frames in the current gop + for (int frame_idx = 0; frame_idx < gf_group->size; ++frame_idx) { + const FRAME_UPDATE_TYPE update_type = gf_group->update_type[frame_idx]; + // TODO(yuec): need to figure out how to determine + // (1) whether a KEY_FRAME has show_frame on + // (2) whether a frame with INTNL_OVERLAY_UPDATE type has + // show_existing_frame on + const int show_frame = + update_type != ARF_UPDATE && update_type != INTNL_ARF_UPDATE; + const int show_existing_frame = + update_type == OVERLAY_UPDATE || update_type == INTNL_OVERLAY_UPDATE; + + int this_ref_map[INTER_REFS_PER_FRAME + 1]; + memcpy(this_ref_map, gf_group->ref_frame_gop_idx[frame_idx], + sizeof(this_ref_map)); + int *next_ref_map = &gf_group->ref_frame_gop_idx[frame_idx + 1][0]; + + switch (update_type) { + case KF_UPDATE: + if (show_frame) { + reset_ref_frame_idx(this_ref_map, frame_idx); + } else { + this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx; + this_ref_map[REF_IDX(EXTREF_FRAME)] = frame_idx; + this_ref_map[REF_IDX(ALTREF2_FRAME)] = frame_idx; + this_ref_map[REF_IDX(GOLDEN_FRAME)] = frame_idx; + this_ref_map[REF_IDX(ALTREF_FRAME)] = frame_idx; + } + break; + case LF_UPDATE: this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx; break; + case GF_UPDATE: + this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx; + this_ref_map[REF_IDX(GOLDEN_FRAME)] = frame_idx; + break; + case OVERLAY_UPDATE: + this_ref_map[REF_IDX(ALTREF_FRAME)] = frame_idx; + break; + case ARF_UPDATE: this_ref_map[REF_IDX(ALTREF_FRAME)] = frame_idx; break; + case INTNL_OVERLAY_UPDATE: + if (!show_existing_frame) + this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx; + break; + case INTNL_ARF_UPDATE: + 
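// Descriptive note: an internal ARF parks its reconstruction in the
+        // EXTREF slot; the rotation toward BWDREF/ALTREF2 for subsequent
+        // frames happens in the second switch below.
+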
this_ref_map[REF_IDX(EXTREF_FRAME)] = frame_idx; + break; + default: assert(0); break; + } + + memcpy(next_ref_map, this_ref_map, sizeof(this_ref_map)); + + switch (update_type) { + case LF_UPDATE: + case GF_UPDATE: + next_ref_map[REF_IDX(LAST3_FRAME)] = this_ref_map[REF_IDX(LAST2_FRAME)]; + next_ref_map[REF_IDX(LAST2_FRAME)] = this_ref_map[REF_IDX(LAST_FRAME)]; + next_ref_map[REF_IDX(LAST_FRAME)] = this_ref_map[REF_IDX(LAST3_FRAME)]; + break; + case INTNL_OVERLAY_UPDATE: + if (!show_existing_frame) { + next_ref_map[REF_IDX(LAST3_FRAME)] = + this_ref_map[REF_IDX(LAST2_FRAME)]; + next_ref_map[REF_IDX(LAST2_FRAME)] = + this_ref_map[REF_IDX(LAST_FRAME)]; + next_ref_map[REF_IDX(LAST_FRAME)] = + this_ref_map[REF_IDX(LAST3_FRAME)]; + } else { + next_ref_map[REF_IDX(LAST_FRAME)] = + this_ref_map[REF_IDX(BWDREF_FRAME)]; + next_ref_map[REF_IDX(LAST2_FRAME)] = + this_ref_map[REF_IDX(LAST_FRAME)]; + next_ref_map[REF_IDX(LAST3_FRAME)] = + this_ref_map[REF_IDX(LAST2_FRAME)]; + next_ref_map[REF_IDX(BWDREF_FRAME)] = + this_ref_map[REF_IDX(ALTREF2_FRAME)]; + next_ref_map[REF_IDX(ALTREF2_FRAME)] = + this_ref_map[REF_IDX(EXTREF_FRAME)]; + next_ref_map[REF_IDX(EXTREF_FRAME)] = + this_ref_map[REF_IDX(LAST3_FRAME)]; + } + break; + case INTNL_ARF_UPDATE: + if (!show_existing_frame) { + next_ref_map[REF_IDX(BWDREF_FRAME)] = + this_ref_map[REF_IDX(EXTREF_FRAME)]; + next_ref_map[REF_IDX(ALTREF2_FRAME)] = + this_ref_map[REF_IDX(BWDREF_FRAME)]; + next_ref_map[REF_IDX(EXTREF_FRAME)] = + this_ref_map[REF_IDX(ALTREF2_FRAME)]; + } + break; + case OVERLAY_UPDATE: + next_ref_map[REF_IDX(ALTREF_FRAME)] = + this_ref_map[REF_IDX(GOLDEN_FRAME)]; + next_ref_map[REF_IDX(GOLDEN_FRAME)] = + this_ref_map[REF_IDX(ALTREF_FRAME)]; + break; + default: break; + } + } + + // Set the map in display order index by converting from gop indices in the + // above map + set_ref_frame_disp_idx(gf_group); +} + +void av1_gop_setup_structure(AV1_COMP *cpi, + const EncodeFrameParams *const frame_params) { + RATE_CONTROL *const rc = &cpi->rc; + GF_GROUP *const gf_group = &cpi->gf_group; + TWO_PASS *const twopass = &cpi->twopass; + FRAME_INFO *const frame_info = &cpi->frame_info; + const int key_frame = (frame_params->frame_type == KEY_FRAME); + const FRAME_UPDATE_TYPE first_frame_update_type = + key_frame ? KF_UPDATE + : rc->source_alt_ref_active ? OVERLAY_UPDATE : GF_UPDATE; + gf_group->size = construct_multi_layer_gf_structure( + cpi, twopass, gf_group, rc, frame_info, rc->baseline_gf_interval, + first_frame_update_type); + + set_gop_ref_frame_map(gf_group); + +#if CHECK_GF_PARAMETER + check_frame_params(gf_group, rc->baseline_gf_interval); +#endif +} diff --git a/libs/libaom/src/av1/encoder/gop_structure.h b/libs/libaom/src/av1/encoder/gop_structure.h new file mode 100644 index 000000000..0c775c7b4 --- /dev/null +++ b/libs/libaom/src/av1/encoder/gop_structure.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_GOP_STRUCTURE_H_ +#define AOM_AV1_ENCODER_GOP_STRUCTURE_H_ + +#include "av1/common/av1_common_int.h" +#include "av1/encoder/ratectrl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct EncodeFrameParams; + +#define MIN_ARF_GF_BOOST 240 +#define NORMAL_BOOST 100 + +// Set up the Group-Of-Pictures structure for this GF_GROUP. This involves +// deciding where to place the various FRAME_UPDATE_TYPEs in the group. It does +// this primarily by setting the contents of +// cpi->twopass.gf_group.update_type[]. +void av1_gop_setup_structure( + struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params); + +int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc, + FRAME_INFO *frame_info, int offset, int f_frames, + int b_frames, int *num_fpstats_used, + int *num_fpstats_required); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_GOP_STRUCTURE_H_ diff --git a/libs/libaom/src/av1/encoder/grain_test_vectors.h b/libs/libaom/src/av1/encoder/grain_test_vectors.h new file mode 100644 index 000000000..945dc3733 --- /dev/null +++ b/libs/libaom/src/av1/encoder/grain_test_vectors.h @@ -0,0 +1,781 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ +#define AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ + +/* Test vectors for emulation of different film grain types. + * Note that bit depth would be derived from the bitstream and + * not signaled in film grain metadata. The parameters are valid + * for any bit depth. 
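+ *
+ * Each entry below is an aom_film_grain_t initializer; the inline
+ * comments name the fields (scaling points per plane, AR coefficients,
+ * chroma multipliers/offsets, overlap and clipping flags, random seed).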
+ */ +static aom_film_grain_t film_grain_test_vectors[16] = { + /* Test 1 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 16, 0 }, + { 25, 136 }, + { 33, 144 }, + { 41, 160 }, + { 48, 168 }, + { 56, 136 }, + { 67, 128 }, + { 82, 144 }, + { 97, 152 }, + { 113, 144 }, + { 128, 176 }, + { 143, 168 }, + { 158, 176 }, + { 178, 184 } }, + 14 /* num_points_y */, + { { 16, 0 }, + { 20, 64 }, + { 28, 88 }, + { 60, 104 }, + { 90, 136 }, + { 105, 160 }, + { 134, 168 }, + { 168, 208 } }, + 8 /* num_cb_points */, + { { 16, 0 }, + { 28, 96 }, + { 56, 80 }, + { 66, 96 }, + { 80, 104 }, + { 108, 96 }, + { 122, 112 }, + { 137, 112 }, + { 169, 176 } }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 }, + { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 }, + { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 }, + 8 /* ar_coeff_shift */, + 247 /* cb_mult */, + 192 /* cb_luma_mult */, + 18 /* cb_offset */, + 229 /* cr_mult */, + 192 /* cr_luma_mult */, + 54 /* cr_offset */, + 0 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /* chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 2 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 2 /* num_points_y */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_cb_points */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 3 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 192 }, { 255, 192 } }, + 2 /* num_points_y */, + { { 0, 128 }, { 255, 128 } }, + 2 /* num_cb_points */, + { { 0, 128 }, { 255, 128 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19, + -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0, + }, + { + 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19, + -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 1 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 4 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 24, 137 }, + { 53, 146 }, + { 63, 155 }, + { 78, 155 }, + { 107, 150 }, + { 122, 147 }, + { 136, 147 }, + { 166, 153 }, + }, + 9 /* num_points_y */, + { + { 16, 0 }, + { 20, 72 }, + { 27, 82 }, + { 33, 91 }, + { 69, 121 }, + { 95, 143 }, + { 108, 154 }, + { 134, 169 }, + { 147, 177 }, + }, + 9 /* num_cb_points */, + { + { 16, 0 }, + { 24, 95 }, + { 54, 93 
}, + { 65, 94 }, + { 79, 98 }, + { 109, 107 }, + { 124, 119 }, + { 139, 136 }, + { 169, 170 }, + }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42, + 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113, + }, + { + -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5, + -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0, + }, + { + 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2, + -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0, + }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 5 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_points_y */, + { + { 0, 96 }, + { 32, 90 }, + { 64, 83 }, + { 96, 76 }, + { 128, 68 }, + { 159, 59 }, + { 191, 48 }, + { 223, 34 }, + { 255, 0 }, + }, + 9 /* num_cb_points */, + { + { 0, 0 }, + { 32, 34 }, + { 64, 48 }, + { 96, 59 }, + { 128, 68 }, + { 159, 76 }, + { 191, 83 }, + { 223, 90 }, + { 255, 96 }, + }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + -2, 2, -5, 7, -6, 4, -2, -1, 1, -2, 0, -2, 2, + -3, -5, 13, -13, 6, -14, 8, -1, 18, -36, 58, 0, + }, + { + -2, -1, -3, 14, -4, -1, -3, 0, -1, 7, -31, 7, 2, + 0, 1, 0, -7, 50, -8, -2, 2, 2, 2, -4, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 1063 /* random_seed */ + }, + /* Test 6 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 96 }, + { 20, 92 }, + { 39, 88 }, + { 59, 84 }, + { 78, 80 }, + { 98, 75 }, + { 118, 70 }, + { 137, 65 }, + { 157, 60 }, + { 177, 53 }, + { 196, 46 }, + { 216, 38 }, + { 235, 27 }, + { 255, 0 }, + }, + 14 /* num_points_y */, + { { 0, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 } }, + 0 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 2754 /* random_seed */ + }, + /* Test 7 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 0 }, + { 20, 27 }, + { 39, 38 }, + { 59, 46 }, + { 78, 53 }, + { 98, 60 }, + { 118, 65 }, + { 137, 70 }, + { 157, 75 }, + { 177, 80 }, + { 196, 84 }, + { 216, 88 }, + { 235, 92 }, + { 255, 96 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 2 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, 
-3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 8 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 2 /* num_points_y */, + { { 0, 62 }, { 255, 62 } }, + 2 /* num_cb_points */, + { { 0, 62 }, { 255, 62 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6, + -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69, + }, + { + 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8, + -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 9 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 48 }, { 255, 48 } }, + 2 /* num_points_y */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cb_points */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 10 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 48 }, { 255, 48 } }, + 2 /* num_points_y */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cb_points */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 }, + { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 11 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_points_y */, + { + { 0, 48 }, + { 32, 45 }, + { 64, 42 }, + { 96, 38 }, + { 128, 34 }, + { 159, 29 }, + { 191, 24 }, + { 223, 17 }, + { 255, 0 }, + }, + 9 /* num_cb_points */, + { + { 0, 0 }, + { 32, 17 }, + { 64, 24 }, + { 96, 29 }, + { 128, 34 }, + { 159, 38 }, + { 191, 42 }, + { 223, 45 }, + { 255, 48 }, + }, + 
9 /* num_cr_points */, + 10 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42, + 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113, + }, + { + -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5, + -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0, + }, + { + 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2, + -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0, + }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 1357 /* random_seed */ + }, + /* Test 12 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 24, 49 }, + { 39, 69 }, + { 46, 84 }, + { 53, 91 }, + { 63, 100 }, + { 78, 114 }, + { 92, 134 }, + { 164, 139 }, + }, + 9 /* num_points_y */, + { + { 16, 0 }, + { 20, 31 }, + { 26, 42 }, + { 33, 54 }, + { 40, 65 }, + { 47, 72 }, + { 56, 85 }, + { 84, 123 }, + { 152, 157 }, + }, + 9 /* num_cb_points */, + { + { 16, 0 }, + { 25, 14 }, + { 39, 33 }, + { 47, 40 }, + { 54, 47 }, + { 64, 62 }, + { 79, 76 }, + { 94, 83 }, + { 167, 101 }, + }, + 9 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 }, + { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 }, + { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 0 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 13 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 48 }, + { 20, 46 }, + { 39, 44 }, + { 59, 42 }, + { 78, 40 }, + { 98, 38 }, + { 118, 35 }, + { 137, 33 }, + { 157, 30 }, + { 177, 27 }, + { 196, 23 }, + { 216, 19 }, + { 235, 13 }, + { 255, 0 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 14 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 0 }, + { 20, 13 }, + { 39, 19 }, + { 59, 23 }, + { 78, 27 }, + { 98, 30 }, + { 118, 33 }, + { 137, 35 }, + { 157, 38 }, + { 177, 40 }, + { 196, 42 }, + { 216, 44 }, + { 235, 46 }, + { 255, 48 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 
/* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 15 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 1 /* num_points_y */, + { { 0, 96 }, { 255, 96 } }, + 0 /* num_cb_points */, + { { 0, 96 }, { 255, 96 } }, + 0 /* num_cr_points */, + 11 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 5, -15, -10, -19, 0, -12, 6, 51, 30, -5, -12, 56 }, + { 2, 2, -24, -5, 1, 1, -18, 37, -2, 0, -15, 39, -70 }, + { 2, 3, -24, -5, -1, 0, -18, 38, -2, 0, -15, 39, -55 }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 1 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 16 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 58, 126 }, + { 87, 120 }, + { 97, 122 }, + { 112, 125 }, + { 126, 131 }, + { 141, 139 }, + { 199, 153 }, + }, + 8 /* num_points_y */, + { + { 16, 0 }, + { 59, 68 }, + { 66, 76 }, + { 73, 82 }, + { 79, 85 }, + { 86, 86 }, + { 151, 95 }, + { 192, 101 }, + }, + 8 /* num_cb_points */, + { + { 16, 0 }, + { 59, 64 }, + { 89, 80 }, + { 99, 86 }, + { 114, 90 }, + { 129, 93 }, + { 144, 97 }, + { 203, 85 }, + }, + 8 /* num_cr_points */, + 10 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6, + -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69, + }, + { + 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8, + -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 2 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, +}; +#endif // AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ diff --git a/libs/libaom/src/av1/encoder/hash.c b/libs/libaom/src/av1/encoder/hash.c new file mode 100644 index 000000000..3091037eb --- /dev/null +++ b/libs/libaom/src/av1/encoder/hash.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/encoder/hash.h" + +static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator, + uint8_t *pData, uint32_t dataLength) { + for (uint32_t i = 0; i < dataLength; i++) { + const uint8_t index = (uint8_t)( + (p_crc_calculator->remainder >> (p_crc_calculator->bits - 8)) ^ + pData[i]); + p_crc_calculator->remainder <<= 8; + p_crc_calculator->remainder ^= p_crc_calculator->table[index]; + } +} + +static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) { + p_crc_calculator->remainder = 0; +} + +static uint32_t crc_calculator_get_crc(CRC_CALCULATOR *p_crc_calculator) { + return p_crc_calculator->remainder & p_crc_calculator->final_result_mask; +} + +static void crc_calculator_init_table(CRC_CALCULATOR *p_crc_calculator) { + const uint32_t high_bit = 1 << (p_crc_calculator->bits - 1); + const uint32_t byte_high_bit = 1 << (8 - 1); + + for (uint32_t value = 0; value < 256; value++) { + uint32_t remainder = 0; + for (uint8_t mask = byte_high_bit; mask != 0; mask >>= 1) { + if (value & mask) { + remainder ^= high_bit; + } + + if (remainder & high_bit) { + remainder <<= 1; + remainder ^= p_crc_calculator->trunc_poly; + } else { + remainder <<= 1; + } + } + p_crc_calculator->table[value] = remainder; + } +} + +void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits, + uint32_t truncPoly) { + p_crc_calculator->remainder = 0; + p_crc_calculator->bits = bits; + p_crc_calculator->trunc_poly = truncPoly; + p_crc_calculator->final_result_mask = (1 << bits) - 1; + crc_calculator_init_table(p_crc_calculator); +} + +uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p, + int length) { + crc_calculator_reset(p_crc_calculator); + crc_calculator_process_data(p_crc_calculator, p, length); + return crc_calculator_get_crc(p_crc_calculator); +} + +/* CRC-32C (iSCSI) polynomial in reversed bit order. */ +#define POLY 0x82f63b78 + +/* Construct table for software CRC-32C calculation. */ +void av1_crc32c_calculator_init(CRC32C *p_crc32c) { + uint32_t crc; + + for (int n = 0; n < 256; n++) { + crc = n; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + p_crc32c->table[0][n] = crc; + } + for (int n = 0; n < 256; n++) { + crc = p_crc32c->table[0][n]; + for (int k = 1; k < 8; k++) { + crc = p_crc32c->table[0][crc & 0xff] ^ (crc >> 8); + p_crc32c->table[k][n] = crc; + } + } +} + +/* Table-driven software version as a fall-back. This is about 15 times slower + than using the hardware instructions. This assumes little-endian integers, + as is the case on Intel processors that the assembler code here is for. 
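For reference, a minimal standalone sketch of driving the table-driven CRC defined above; the 24-bit width and the 0x5D6DCB polynomial mirror what av1_hash_table_init() in hash_motion.c passes in, and the pixel buffer is arbitrary illustration data:

#include <stdint.h>
#include <stdio.h>

#include "av1/encoder/hash.h"

int main(void) {
  CRC_CALCULATOR calc;
  // 24-bit CRC using the first of the two intra-block-copy polynomials.
  av1_crc_calculator_init(&calc, 24, 0x5D6DCB);
  uint8_t block[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
  // av1_get_crc_value() resets the remainder, feeds the bytes through the
  // lookup table, and masks the result to 24 bits via final_result_mask.
  const uint32_t crc = av1_get_crc_value(&calc, block, (int)sizeof(block));
  printf("crc24 = 0x%06x\n", crc);
  return 0;
}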
*/ +uint32_t av1_get_crc32c_value_c(void *c, uint8_t *buf, size_t len) { + const uint8_t *next = (const uint8_t *)(buf); + uint64_t crc; + CRC32C *p = (CRC32C *)c; + crc = 0 ^ 0xffffffff; + while (len && ((uintptr_t)next & 7) != 0) { + crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + while (len >= 8) { + crc ^= *(uint64_t *)next; + crc = p->table[7][crc & 0xff] ^ p->table[6][(crc >> 8) & 0xff] ^ + p->table[5][(crc >> 16) & 0xff] ^ p->table[4][(crc >> 24) & 0xff] ^ + p->table[3][(crc >> 32) & 0xff] ^ p->table[2][(crc >> 40) & 0xff] ^ + p->table[1][(crc >> 48) & 0xff] ^ p->table[0][crc >> 56]; + next += 8; + len -= 8; + } + while (len) { + crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + return (uint32_t)crc ^ 0xffffffff; +} diff --git a/libs/libaom/src/av1/encoder/hash.h b/libs/libaom/src/av1/encoder/hash.h new file mode 100644 index 000000000..d8e8cc3a0 --- /dev/null +++ b/libs/libaom/src/av1/encoder/hash.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_HASH_H_ +#define AOM_AV1_ENCODER_HASH_H_ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _crc_calculator { + uint32_t remainder; + uint32_t trunc_poly; + uint32_t bits; + uint32_t table[256]; + uint32_t final_result_mask; +} CRC_CALCULATOR; + +// Initialize the crc calculator. It must be executed at least once before +// calling av1_get_crc_value(). +void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits, + uint32_t truncPoly); +uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p, + int length); + +// CRC32C: POLY = 0x82f63b78; +typedef struct _CRC32C { + /* Table for a quadword-at-a-time software crc. */ + uint32_t table[8][256]; +} CRC32C; + +// init table for software version crc32c +void av1_crc32c_calculator_init(CRC32C *p_crc32c); + +#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096) + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_HASH_H_ diff --git a/libs/libaom/src/av1/encoder/hash_motion.c b/libs/libaom/src/av1/encoder/hash_motion.c new file mode 100644 index 000000000..310cde886 --- /dev/null +++ b/libs/libaom/src/av1/encoder/hash_motion.c @@ -0,0 +1,491 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
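The software CRC-32C fallback above can be exercised directly; a sketch, with the prototype written out by hand since av1_get_crc32c_value_c() is normally reached through the rtcd dispatch. "123456789" is the conventional CRC check string; its CRC-32C check value is 0xE3069283:

#include <stdint.h>
#include <stdio.h>

#include "av1/encoder/hash.h"

// Normally resolved via config/av1_rtcd.h; declared directly for this sketch.
uint32_t av1_get_crc32c_value_c(void *c, uint8_t *buf, size_t len);

int main(void) {
  CRC32C ctx;
  av1_crc32c_calculator_init(&ctx);  // builds the 8x256 slicing tables
  const uint8_t buf[] = "123456789";
  const uint32_t v = av1_get_crc32c_value_c(&ctx, (uint8_t *)buf, 9);
  printf("crc32c = 0x%08x\n", v);  // expected 0xe3069283 on little-endian
  return 0;
}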
 */ + +#include <assert.h> + +#include "config/av1_rtcd.h" + +#include "av1/encoder/block.h" +#include "av1/encoder/hash.h" +#include "av1/encoder/hash_motion.h" + +#define kSrcBits 16 +#define kBlockSizeBits 3 +#define kMaxAddr (1 << (kSrcBits + kBlockSizeBits)) + +// TODO(youzhou@microsoft.com): is screen content with bit depth higher than 8 +// supported? If yes, fix this function +static void get_pixels_in_1D_char_array_by_block_2x2(const uint8_t *y_src, + int stride, + uint8_t *p_pixels_in1D) { + const uint8_t *p_pel = y_src; + int index = 0; + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + p_pixels_in1D[index++] = p_pel[j]; + } + p_pel += stride; + } +} + +static void get_pixels_in_1D_short_array_by_block_2x2(const uint16_t *y_src, + int stride, + uint16_t *p_pixels_in1D) { + const uint16_t *p_pel = y_src; + int index = 0; + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + p_pixels_in1D[index++] = p_pel[j]; + } + p_pel += stride; + } +} + +static int is_block_2x2_row_same_value(const uint8_t *p) { + if (p[0] != p[1] || p[2] != p[3]) { + return 0; + } + return 1; +} + +static int is_block16_2x2_row_same_value(const uint16_t *p) { + if (p[0] != p[1] || p[2] != p[3]) { + return 0; + } + return 1; +} + +static int is_block_2x2_col_same_value(const uint8_t *p) { + if ((p[0] != p[2]) || (p[1] != p[3])) { + return 0; + } + return 1; +} + +static int is_block16_2x2_col_same_value(const uint16_t *p) { + if ((p[0] != p[2]) || (p[1] != p[3])) { + return 0; + } + return 1; +} + +// The hash value hash_value1 consists of two parts: the first 3 bits relate +// to the block size and the remaining 16 bits are the CRC value. This +// function is used to get the first 3 bits. +static int hash_block_size_to_index(int block_size) { + switch (block_size) { + case 4: return 0; + case 8: return 1; + case 16: return 2; + case 32: return 3; + case 64: return 4; + case 128: return 5; + default: return -1; + } +} + +void av1_hash_table_init(IntraBCHashInfo *intrabc_hash_info) { + if (!intrabc_hash_info->g_crc_initialized) { + av1_crc_calculator_init(&intrabc_hash_info->crc_calculator1, 24, 0x5D6DCB); + av1_crc_calculator_init(&intrabc_hash_info->crc_calculator2, 24, 0x864CFB); + intrabc_hash_info->g_crc_initialized = 1; + } + intrabc_hash_info->intrabc_hash_table.p_lookup_table = NULL; +} + +void av1_hash_table_clear_all(hash_table *p_hash_table) { + if (p_hash_table->p_lookup_table == NULL) { + return; + } + for (int i = 0; i < kMaxAddr; i++) { + if (p_hash_table->p_lookup_table[i] != NULL) { + aom_vector_destroy(p_hash_table->p_lookup_table[i]); + aom_free(p_hash_table->p_lookup_table[i]); + p_hash_table->p_lookup_table[i] = NULL; + } + } +} + +void av1_hash_table_destroy(hash_table *p_hash_table) { + av1_hash_table_clear_all(p_hash_table); + aom_free(p_hash_table->p_lookup_table); + p_hash_table->p_lookup_table = NULL; +} + +void av1_hash_table_create(hash_table *p_hash_table) { + if (p_hash_table->p_lookup_table != NULL) { + av1_hash_table_clear_all(p_hash_table); + return; + } + p_hash_table->p_lookup_table = + (Vector **)aom_malloc(sizeof(p_hash_table->p_lookup_table[0]) * kMaxAddr); + memset(p_hash_table->p_lookup_table, 0, + sizeof(p_hash_table->p_lookup_table[0]) * kMaxAddr); +} + +static void hash_table_add_to_table(hash_table *p_hash_table, + uint32_t hash_value, + block_hash *curr_block_hash) { + if (p_hash_table->p_lookup_table[hash_value] == NULL) { + p_hash_table->p_lookup_table[hash_value] = + aom_malloc(sizeof(p_hash_table->p_lookup_table[0][0])); +
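+    // Buckets are created lazily: the first time a hash_value lands in this
+    // slot, a Vector sized for 10 entries is set up and later hits simply
+    // append. The kMaxAddr (1 << 19) slots are indexed by hash_value1, i.e.
+    // the 3-bit block-size index concatenated with the low 16 CRC bits.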
aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10, + sizeof(curr_block_hash[0])); + aom_vector_push_back(p_hash_table->p_lookup_table[hash_value], + curr_block_hash); + } else { + aom_vector_push_back(p_hash_table->p_lookup_table[hash_value], + curr_block_hash); + } +} + +int32_t av1_hash_table_count(const hash_table *p_hash_table, + uint32_t hash_value) { + if (p_hash_table->p_lookup_table[hash_value] == NULL) { + return 0; + } else { + return (int32_t)(p_hash_table->p_lookup_table[hash_value]->size); + } +} + +Iterator av1_hash_get_first_iterator(hash_table *p_hash_table, + uint32_t hash_value) { + assert(av1_hash_table_count(p_hash_table, hash_value) > 0); + return aom_vector_begin(p_hash_table->p_lookup_table[hash_value]); +} + +int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1, + uint32_t hash_value2) { + if (p_hash_table->p_lookup_table[hash_value1] == NULL) { + return 0; + } + Iterator iterator = + aom_vector_begin(p_hash_table->p_lookup_table[hash_value1]); + Iterator last = aom_vector_end(p_hash_table->p_lookup_table[hash_value1]); + for (; !aom_iterator_equals(&iterator, &last); + aom_iterator_increment(&iterator)) { + if ((*(block_hash *)aom_iterator_get(&iterator)).hash_value2 == + hash_value2) { + return 1; + } + } + return 0; +} + +void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intrabc_hash_info, + const YV12_BUFFER_CONFIG *picture, + uint32_t *pic_block_hash[2], + int8_t *pic_block_same_info[3]) { + const int width = 2; + const int height = 2; + const int x_end = picture->y_crop_width - width + 1; + const int y_end = picture->y_crop_height - height + 1; + CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1; + CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2; + + const int length = width * 2; + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t p[4]; + int pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + get_pixels_in_1D_short_array_by_block_2x2( + CONVERT_TO_SHORTPTR(picture->y_buffer) + y_pos * picture->y_stride + + x_pos, + picture->y_stride, p); + pic_block_same_info[0][pos] = is_block16_2x2_row_same_value(p); + pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p); + + pic_block_hash[0][pos] = + av1_get_crc_value(calc_1, (uint8_t *)p, length * sizeof(p[0])); + pic_block_hash[1][pos] = + av1_get_crc_value(calc_2, (uint8_t *)p, length * sizeof(p[0])); + pos++; + } + pos += width - 1; + } + } else { + uint8_t p[4]; + int pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + get_pixels_in_1D_char_array_by_block_2x2( + picture->y_buffer + y_pos * picture->y_stride + x_pos, + picture->y_stride, p); + pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p); + pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p); + + pic_block_hash[0][pos] = + av1_get_crc_value(calc_1, p, length * sizeof(p[0])); + pic_block_hash[1][pos] = + av1_get_crc_value(calc_2, p, length * sizeof(p[0])); + pos++; + } + pos += width - 1; + } + } +} + +void av1_generate_block_hash_value(IntraBCHashInfo *intrabc_hash_info, + const YV12_BUFFER_CONFIG *picture, + int block_size, + uint32_t *src_pic_block_hash[2], + uint32_t *dst_pic_block_hash[2], + int8_t *src_pic_block_same_info[3], + int8_t *dst_pic_block_same_info[3]) { + CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1; + CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2; + + const int pic_width = picture->y_crop_width; + const int 
x_end = picture->y_crop_width - block_size + 1; + const int y_end = picture->y_crop_height - block_size + 1; + + const int src_size = block_size >> 1; + const int quad_size = block_size >> 2; + + uint32_t p[4]; + const int length = sizeof(p); + + int pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + p[0] = src_pic_block_hash[0][pos]; + p[1] = src_pic_block_hash[0][pos + src_size]; + p[2] = src_pic_block_hash[0][pos + src_size * pic_width]; + p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size]; + dst_pic_block_hash[0][pos] = + av1_get_crc_value(calc_1, (uint8_t *)p, length); + + p[0] = src_pic_block_hash[1][pos]; + p[1] = src_pic_block_hash[1][pos + src_size]; + p[2] = src_pic_block_hash[1][pos + src_size * pic_width]; + p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size]; + dst_pic_block_hash[1][pos] = + av1_get_crc_value(calc_2, (uint8_t *)p, length); + + dst_pic_block_same_info[0][pos] = + src_pic_block_same_info[0][pos] && + src_pic_block_same_info[0][pos + quad_size] && + src_pic_block_same_info[0][pos + src_size] && + src_pic_block_same_info[0][pos + src_size * pic_width] && + src_pic_block_same_info[0][pos + src_size * pic_width + quad_size] && + src_pic_block_same_info[0][pos + src_size * pic_width + src_size]; + + dst_pic_block_same_info[1][pos] = + src_pic_block_same_info[1][pos] && + src_pic_block_same_info[1][pos + src_size] && + src_pic_block_same_info[1][pos + quad_size * pic_width] && + src_pic_block_same_info[1][pos + quad_size * pic_width + src_size] && + src_pic_block_same_info[1][pos + src_size * pic_width] && + src_pic_block_same_info[1][pos + src_size * pic_width + src_size]; + pos++; + } + pos += block_size - 1; + } + + if (block_size >= 4) { + const int size_minus_1 = block_size - 1; + pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + dst_pic_block_same_info[2][pos] = + (!dst_pic_block_same_info[0][pos] && + !dst_pic_block_same_info[1][pos]) || + (((x_pos & size_minus_1) == 0) && ((y_pos & size_minus_1) == 0)); + pos++; + } + pos += block_size - 1; + } + } +} + +void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table, + uint32_t *pic_hash[2], + int8_t *pic_is_same, + int pic_width, int pic_height, + int block_size) { + const int x_end = pic_width - block_size + 1; + const int y_end = pic_height - block_size + 1; + + const int8_t *src_is_added = pic_is_same; + const uint32_t *src_hash[2] = { pic_hash[0], pic_hash[1] }; + + int add_value = hash_block_size_to_index(block_size); + assert(add_value >= 0); + add_value <<= kSrcBits; + const int crc_mask = (1 << kSrcBits) - 1; + + for (int x_pos = 0; x_pos < x_end; x_pos++) { + for (int y_pos = 0; y_pos < y_end; y_pos++) { + const int pos = y_pos * pic_width + x_pos; + // valid data + if (src_is_added[pos]) { + block_hash curr_block_hash; + curr_block_hash.x = x_pos; + curr_block_hash.y = y_pos; + + const uint32_t hash_value1 = (src_hash[0][pos] & crc_mask) + add_value; + curr_block_hash.hash_value2 = src_hash[1][pos]; + + hash_table_add_to_table(p_hash_table, hash_value1, &curr_block_hash); + } + } + } +} + +int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture, + int block_size, int x_start, int y_start) { + const int stride = picture->y_stride; + const uint8_t *p = picture->y_buffer + y_start * stride + x_start; + + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *p16 = CONVERT_TO_SHORTPTR(p); + for (int i = 0; i < block_size; i++) 
{ + for (int j = 1; j < block_size; j++) { + if (p16[j] != p16[0]) { + return 0; + } + } + p16 += stride; + } + } else { + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p[j] != p[0]) { + return 0; + } + } + p += stride; + } + } + + return 1; +} + +int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, + int block_size, int x_start, int y_start) { + const int stride = picture->y_stride; + const uint8_t *p = picture->y_buffer + y_start * stride + x_start; + + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *p16 = CONVERT_TO_SHORTPTR(p); + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p16[j * stride + i] != p16[i]) { + return 0; + } + } + } + } else { + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p[j * stride + i] != p[i]) { + return 0; + } + } + } + } + return 1; +} + +void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info, + const uint8_t *y_src, int stride, int block_size, + uint32_t *hash_value1, uint32_t *hash_value2, + int use_highbitdepth) { + int add_value = hash_block_size_to_index(block_size); + assert(add_value >= 0); + add_value <<= kSrcBits; + const int crc_mask = (1 << kSrcBits) - 1; + + CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1; + CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2; + uint32_t **buf_1 = intrabc_hash_info->hash_value_buffer[0]; + uint32_t **buf_2 = intrabc_hash_info->hash_value_buffer[1]; + + // 2x2 subblock hash values in current CU + int sub_block_in_width = (block_size >> 1); + if (use_highbitdepth) { + uint16_t pixel_to_hash[4]; + uint16_t *y16_src = CONVERT_TO_SHORTPTR(y_src); + for (int y_pos = 0; y_pos < block_size; y_pos += 2) { + for (int x_pos = 0; x_pos < block_size; x_pos += 2) { + int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); + get_pixels_in_1D_short_array_by_block_2x2( + y16_src + y_pos * stride + x_pos, stride, pixel_to_hash); + assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + buf_1[0][pos] = av1_get_crc_value(calc_1, (uint8_t *)pixel_to_hash, + sizeof(pixel_to_hash)); + buf_2[0][pos] = av1_get_crc_value(calc_2, (uint8_t *)pixel_to_hash, + sizeof(pixel_to_hash)); + } + } + } else { + uint8_t pixel_to_hash[4]; + for (int y_pos = 0; y_pos < block_size; y_pos += 2) { + for (int x_pos = 0; x_pos < block_size; x_pos += 2) { + int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); + get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos, + stride, pixel_to_hash); + assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + buf_1[0][pos] = + av1_get_crc_value(calc_1, pixel_to_hash, sizeof(pixel_to_hash)); + buf_2[0][pos] = + av1_get_crc_value(calc_2, pixel_to_hash, sizeof(pixel_to_hash)); + } + } + } + + int src_sub_block_in_width = sub_block_in_width; + sub_block_in_width >>= 1; + + int src_idx = 1; + int dst_idx = 0; + + // 4x4 subblock hash values to current block hash values + uint32_t to_hash[4]; + for (int sub_width = 4; sub_width <= block_size; sub_width *= 2) { + src_idx = 1 - src_idx; + dst_idx = 1 - dst_idx; + + int dst_pos = 0; + for (int y_pos = 0; y_pos < sub_block_in_width; y_pos++) { + for (int x_pos = 0; x_pos < sub_block_in_width; x_pos++) { + int srcPos = (y_pos << 1) * src_sub_block_in_width + (x_pos << 1); + + assert(srcPos + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + assert(srcPos + src_sub_block_in_width + 1 < + AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + to_hash[0] = 
buf_1[src_idx][srcPos]; + to_hash[1] = buf_1[src_idx][srcPos + 1]; + to_hash[2] = buf_1[src_idx][srcPos + src_sub_block_in_width]; + to_hash[3] = buf_1[src_idx][srcPos + src_sub_block_in_width + 1]; + + buf_1[dst_idx][dst_pos] = + av1_get_crc_value(calc_1, (uint8_t *)to_hash, sizeof(to_hash)); + + to_hash[0] = buf_2[src_idx][srcPos]; + to_hash[1] = buf_2[src_idx][srcPos + 1]; + to_hash[2] = buf_2[src_idx][srcPos + src_sub_block_in_width]; + to_hash[3] = buf_2[src_idx][srcPos + src_sub_block_in_width + 1]; + buf_2[dst_idx][dst_pos] = + av1_get_crc_value(calc_2, (uint8_t *)to_hash, sizeof(to_hash)); + dst_pos++; + } + } + + src_sub_block_in_width = sub_block_in_width; + sub_block_in_width >>= 1; + } + + *hash_value1 = (buf_1[dst_idx][0] & crc_mask) + add_value; + *hash_value2 = buf_2[dst_idx][0]; +} diff --git a/libs/libaom/src/av1/encoder/hash_motion.h b/libs/libaom/src/av1/encoder/hash_motion.h new file mode 100644 index 000000000..e4ea1f394 --- /dev/null +++ b/libs/libaom/src/av1/encoder/hash_motion.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_HASH_MOTION_H_ +#define AOM_AV1_ENCODER_HASH_MOTION_H_ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_scale/yv12config.h" +#include "av1/encoder/hash.h" +#include "third_party/vector/vector.h" +#ifdef __cplusplus +extern "C" { +#endif + +// Block size used for force_integer_mv decisions +#define FORCE_INT_MV_DECISION_BLOCK_SIZE 8 + +// store a block's hash info. 
+// x and y are the position from the top left of the picture +// hash_value2 is used to store the second hash value +typedef struct _block_hash { + int16_t x; + int16_t y; + uint32_t hash_value2; +} block_hash; + +typedef struct _hash_table { + Vector **p_lookup_table; +} hash_table; + +struct intrabc_hash_info; + +typedef struct intrabc_hash_info { + // buffer for hash value calculation of a block + // used only in av1_get_block_hash_value() + // [first hash/second hash] + // [two buffers used ping-pong] + uint32_t *hash_value_buffer[2][2]; + hash_table intrabc_hash_table; + + CRC_CALCULATOR crc_calculator1; + CRC_CALCULATOR crc_calculator2; + int g_crc_initialized; +} IntraBCHashInfo; + +void av1_hash_table_init(IntraBCHashInfo *intra_bc_hash_info); +void av1_hash_table_clear_all(hash_table *p_hash_table); +void av1_hash_table_destroy(hash_table *p_hash_table); +void av1_hash_table_create(hash_table *p_hash_table); +int32_t av1_hash_table_count(const hash_table *p_hash_table, + uint32_t hash_value); +Iterator av1_hash_get_first_iterator(hash_table *p_hash_table, + uint32_t hash_value); +int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1, + uint32_t hash_value2); +void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intra_bc_hash_info, + const YV12_BUFFER_CONFIG *picture, + uint32_t *pic_block_hash[2], + int8_t *pic_block_same_info[3]); +void av1_generate_block_hash_value(IntraBCHashInfo *intra_bc_hash_info, + const YV12_BUFFER_CONFIG *picture, + int block_size, + uint32_t *src_pic_block_hash[2], + uint32_t *dst_pic_block_hash[2], + int8_t *src_pic_block_same_info[3], + int8_t *dst_pic_block_same_info[3]); +void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table, + uint32_t *pic_hash[2], + int8_t *pic_is_same, + int pic_width, int pic_height, + int block_size); + +// check whether the block starting at (x_start, y_start) with size +// block_size x block_size has the same color in all rows +int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture, + int block_size, int x_start, int y_start); +// check whether the block starting at (x_start, y_start) with size +// block_size x block_size has the same color in all columns +int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, + int block_size, int x_start, int y_start); + +void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info, + const uint8_t *y_src, int stride, int block_size, + uint32_t *hash_value1, uint32_t *hash_value2, + int use_highbitdepth); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_HASH_MOTION_H_ diff --git a/libs/libaom/src/av1/encoder/hybrid_fwd_txfm.c b/libs/libaom/src/av1/encoder/hybrid_fwd_txfm.c new file mode 100644 index 000000000..06990857a --- /dev/null +++ b/libs/libaom/src/av1/encoder/hybrid_fwd_txfm.c @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
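Taken together, the header implies the following call order; a rough sketch only (the encoder additionally allocates the hash_value_buffer arrays and feeds whole frames through av1_generate_block_hash_value(), both elided here):

#include "av1/encoder/hash_motion.h"

// 'info' is assumed zero-initialized; y_src/stride point at valid luma pixels.
static void intrabc_hash_sketch(IntraBCHashInfo *info, const uint8_t *y_src,
                                int stride) {
  av1_hash_table_init(info);  // seeds the two 24-bit CRC calculators once
  av1_hash_table_create(&info->intrabc_hash_table);  // allocates the buckets
  uint32_t h1, h2;
  // Hash one 8x8 candidate block (low bit depth path); requires
  // info->hash_value_buffer to have been allocated beforehand.
  av1_get_block_hash_value(info, y_src, stride, 8, &h1, &h2,
                           /*use_highbitdepth=*/0);
  if (av1_has_exact_match(&info->intrabc_hash_table, h1, h2)) {
    // A previously inserted block with identical content exists.
  }
  av1_hash_table_destroy(&info->intrabc_hash_table);
}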
+ */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/idct.h" +#include "av1/encoder/hybrid_fwd_txfm.h" + +/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per + pixel. */ +void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { + int i; + tran_high_t a1, b1, c1, d1, e1; + const int16_t *ip_pass0 = input; + const tran_low_t *ip = NULL; + tran_low_t *op = output; + + for (i = 0; i < 4; i++) { + a1 = ip_pass0[0 * stride]; + b1 = ip_pass0[1 * stride]; + c1 = ip_pass0[2 * stride]; + d1 = ip_pass0[3 * stride]; + + a1 += b1; + d1 = d1 - c1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= c1; + d1 += b1; + op[0] = (tran_low_t)a1; + op[4] = (tran_low_t)c1; + op[8] = (tran_low_t)d1; + op[12] = (tran_low_t)b1; + + ip_pass0++; + op++; + } + ip = output; + op = output; + + for (i = 0; i < 4; i++) { + a1 = ip[0]; + b1 = ip[1]; + c1 = ip[2]; + d1 = ip[3]; + + a1 += b1; + d1 -= c1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= c1; + d1 += b1; + op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR); + op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR); + op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR); + op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR); + + ip += 4; + op += 4; + } +} + +void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, + int stride) { + av1_fwht4x4_c(input, output, stride); +} + +static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + if (txfm_param->lossless) { + assert(tx_type == DCT_DCT); + av1_highbd_fwht4x4(src_diff, coeff, diff_stride); + return; + } + av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_4x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_8x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_16x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_32x16(src_diff, dst_coeff, diff_stride, 
txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_16x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_4x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_32x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_8x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_32x64(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + bd); +} + +static void highbd_fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_64x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + bd); +} + +static void highbd_fwd_txfm_16x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_16x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); +} + +static void highbd_fwd_txfm_64x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_64x16(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); +} + +static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + 
const int bd = txfm_param->bd; + av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); +} + +void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, + TxfmParam *txfm_param) { + if (txfm_param->bd == 8) + av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); + else + av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); +} + +void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); +} + +void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_64X64: + highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_32X64: + highbd_fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_64X32: + highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X64: + highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_64X16: + highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_32X32: + highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X16: + highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X8: + highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_4X8: + highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X4: + highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X16: + highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X8: + highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X32: + highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_32X16: + highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_4X4: + highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_4X16: + highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X4: + highbd_fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X32: + highbd_fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_32X8: + highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param); + break; + default: assert(0); break; + } +} diff --git a/libs/libaom/src/av1/encoder/hybrid_fwd_txfm.h b/libs/libaom/src/av1/encoder/hybrid_fwd_txfm.h new file mode 100644 index 000000000..daabc7119 --- /dev/null +++ b/libs/libaom/src/av1/encoder/hybrid_fwd_txfm.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
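As a quick sanity check on av1_fwht4x4_c() above: for a constant input block both passes fold all of the energy into the DC term. A standalone sketch (assuming tran_low_t is int32_t, as in libaom's AV1 build, and UNIT_QUANT_FACTOR is 4, i.e. 1 << UNIT_QUANT_SHIFT):

#include <stdint.h>
#include <stdio.h>

typedef int32_t tran_low_t;  // assumed to match libaom's AV1 definition
void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);

int main(void) {
  int16_t in[16];
  tran_low_t out[16];
  for (int i = 0; i < 16; ++i) in[i] = 1;  // constant 4x4 block
  av1_fwht4x4_c(in, out, /*stride=*/4);
  // The column pass yields [2 0 0 0] per column; the row pass then leaves a
  // lone DC of 4 * UNIT_QUANT_FACTOR = 16, with the other 15 coefficients 0.
  printf("DC = %d\n", (int)out[0]);
  return 0;
}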
+ */ + +#ifndef AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ +#define AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ + +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, + TxfmParam *txfm_param); + +void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ diff --git a/libs/libaom/src/av1/encoder/interp_search.c b/libs/libaom/src/av1/encoder/interp_search.c new file mode 100644 index 000000000..6b7317be7 --- /dev/null +++ b/libs/libaom/src/av1/encoder/interp_search.c @@ -0,0 +1,753 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/pred_common.h" +#include "av1/encoder/interp_search.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/rdopt_utils.h" +#include "av1/encoder/reconinter_enc.h" + +// return mv_diff +static INLINE int is_interp_filter_good_match( + const INTERPOLATION_FILTER_STATS *st, MB_MODE_INFO *const mi, + int skip_level) { + const int is_comp = has_second_ref(mi); + int i; + + for (i = 0; i < 1 + is_comp; ++i) { + if (st->ref_frames[i] != mi->ref_frame[i]) return INT_MAX; + } + + if (skip_level == 1 && is_comp) { + if (st->comp_type != mi->interinter_comp.type) return INT_MAX; + if (st->compound_idx != mi->compound_idx) return INT_MAX; + } + + int mv_diff = 0; + for (i = 0; i < 1 + is_comp; ++i) { + mv_diff += abs(st->mv[i].as_mv.row - mi->mv[i].as_mv.row) + + abs(st->mv[i].as_mv.col - mi->mv[i].as_mv.col); + } + return mv_diff; +} + +static INLINE int save_interp_filter_search_stat( + MB_MODE_INFO *const mbmi, int64_t rd, unsigned int pred_sse, + INTERPOLATION_FILTER_STATS *interp_filter_stats, + int interp_filter_stats_idx) { + if (interp_filter_stats_idx < MAX_INTERP_FILTER_STATS) { + INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters, + { mbmi->mv[0], mbmi->mv[1] }, + { mbmi->ref_frame[0], + mbmi->ref_frame[1] }, + mbmi->interinter_comp.type, + mbmi->compound_idx, + rd, + pred_sse }; + interp_filter_stats[interp_filter_stats_idx] = stat; + interp_filter_stats_idx++; + } + return interp_filter_stats_idx; +} + +static INLINE int find_interp_filter_in_stats( + MB_MODE_INFO *const mbmi, INTERPOLATION_FILTER_STATS *interp_filter_stats, + int interp_filter_stats_idx, int skip_level) { + // [skip_levels][single or comp] + const int thr[2][2] = { { 0, 0 }, { 3, 7 } }; + const int is_comp = has_second_ref(mbmi); + + // Find good enough match. + // TODO(yunqing): Separate single-ref mode and comp mode stats for fast + // search. + int best = INT_MAX; + int match = -1; + for (int j = 0; j < interp_filter_stats_idx; ++j) { + const INTERPOLATION_FILTER_STATS *st = &interp_filter_stats[j]; + const int mv_diff = is_interp_filter_good_match(st, mbmi, skip_level); + // Exact match is found. 
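+    // A zero mv_diff means the stored stat was produced with identical
+    // reference frames and MVs, so its filter decision can be reused as is;
+    // otherwise the closest stat within the skip-level threshold is taken.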
+ if (mv_diff == 0) { + match = j; + break; + } else if (mv_diff < best && mv_diff <= thr[skip_level - 1][is_comp]) { + best = mv_diff; + match = j; + } + } + + if (match != -1) { + mbmi->interp_filters = interp_filter_stats[match].filters; + return match; + } + return -1; // no match result found +} + +int av1_find_interp_filter_match( + MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi, + const InterpFilter assign_filter, const int need_search, + INTERPOLATION_FILTER_STATS *interp_filter_stats, + int interp_filter_stats_idx) { + int match_found_idx = -1; + if (cpi->sf.interp_sf.use_interp_filter && need_search) + match_found_idx = find_interp_filter_in_stats( + mbmi, interp_filter_stats, interp_filter_stats_idx, + cpi->sf.interp_sf.use_interp_filter); + + if (!need_search || match_found_idx == -1) + set_default_interp_filters(mbmi, assign_filter); + return match_found_idx; +} + +static INLINE void swap_dst_buf(MACROBLOCKD *xd, const BUFFER_SET *dst_bufs[2], + int num_planes) { + const BUFFER_SET *buf0 = dst_bufs[0]; + dst_bufs[0] = dst_bufs[1]; + dst_bufs[1] = buf0; + restore_dst_buf(xd, *dst_bufs[0], num_planes); +} + +static INLINE int get_switchable_rate(MACROBLOCK *const x, + const int_interpfilters filters, + const int ctx[2]) { + int inter_filter_cost; + const InterpFilter filter0 = filters.as_filters.y_filter; + const InterpFilter filter1 = filters.as_filters.x_filter; + inter_filter_cost = x->switchable_interp_costs[ctx[0]][filter0]; + inter_filter_cost += x->switchable_interp_costs[ctx[1]][filter1]; + return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost; +} + +// Build inter predictor and calculate model rd +// for a given plane. +static INLINE void interp_model_rd_eval( + MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int plane_from, int plane_to, + RD_STATS *rd_stats, int is_skip_build_pred) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + RD_STATS tmp_rd_stats; + av1_init_rd_stats(&tmp_rd_stats); + + // Skip building the inter predictor if it is already available. + if (!is_skip_build_pred) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + plane_from, plane_to); + } + + model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model + ? MODELRD_LEGACY + : MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, plane_from, plane_to, &tmp_rd_stats.rate, + &tmp_rd_stats.dist, &tmp_rd_stats.skip, &tmp_rd_stats.sse, NULL, NULL, + NULL); + + av1_merge_rd_stats(rd_stats, &tmp_rd_stats); +} + +// Calculate the rdcost of a given interpolation filter. +static INLINE int64_t interpolation_filter_rd( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t *const rd, + RD_STATS *rd_stats_luma, RD_STATS *rd_stats, int *const switchable_rate, + const BUFFER_SET *dst_bufs[2], int filter_idx, const int switchable_ctx[2], + const int skip_pred) { + const AV1_COMMON *cm = &cpi->common; + const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + RD_STATS this_rd_stats_luma, this_rd_stats; + + // Initialize rd_stats structures to default values.
+ av1_init_rd_stats(&this_rd_stats_luma); + this_rd_stats = *rd_stats_luma; + const int_interpfilters last_best = mbmi->interp_filters; + mbmi->interp_filters = filter_sets[filter_idx]; + const int tmp_rs = + get_switchable_rate(x, mbmi->interp_filters, switchable_ctx); + + int64_t min_rd = RDCOST(x->rdmult, tmp_rs, 0); + if (min_rd > *rd) { + mbmi->interp_filters = last_best; + return 0; + } + + (void)tile_data; + + assert(skip_pred != 2); + assert((rd_stats_luma->rate >= 0) && (rd_stats->rate >= 0)); + assert((rd_stats_luma->dist >= 0) && (rd_stats->dist >= 0)); + assert((rd_stats_luma->sse >= 0) && (rd_stats->sse >= 0)); + assert((rd_stats_luma->skip == 0) || (rd_stats_luma->skip == 1)); + assert((rd_stats->skip == 0) || (rd_stats->skip == 1)); + assert((skip_pred >= 0) && + (skip_pred <= interp_search_flags->default_interp_skip_flags)); + + // When skip pred is equal to default_interp_skip_flags, + // skip both luma and chroma MC. + // For mono-chrome images: + // num_planes = 1 and cpi->default_interp_skip_flags = 1, + // skip_pred = 1: skip both luma and chroma + // skip_pred = 0: Evaluate luma and as num_planes=1, + // skip chroma evaluation + int tmp_skip_pred = + (skip_pred == interp_search_flags->default_interp_skip_flags) + ? INTERP_SKIP_LUMA_SKIP_CHROMA + : skip_pred; + + switch (tmp_skip_pred) { + case INTERP_EVAL_LUMA_EVAL_CHROMA: + // skip_pred = 0: Evaluate both luma and chroma. + // Luma MC + interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y, + &this_rd_stats_luma, 0); + this_rd_stats = this_rd_stats_luma; +#if CONFIG_COLLECT_RD_STATS == 3 + RD_STATS rd_stats_y; + av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, + INT64_MAX); + PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 3 + AOM_FALLTHROUGH_INTENDED; + case INTERP_SKIP_LUMA_EVAL_CHROMA: + // skip_pred = 1: skip luma evaluation (retain previous best luma stats) + // and do chroma evaluation. 
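+      // The loop below re-costs the accumulated rate/distortion against the
+      // running best *rd before each additional plane, abandoning this
+      // filter as soon as it can no longer win.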
+ for (int plane = 1; plane < num_planes; ++plane) { + int64_t tmp_rd = + RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist); + if (tmp_rd >= *rd) { + mbmi->interp_filters = last_best; + return 0; + } + interp_model_rd_eval(x, cpi, bsize, orig_dst, plane, plane, + &this_rd_stats, 0); + } + break; + case INTERP_SKIP_LUMA_SKIP_CHROMA: + // both luma and chroma evaluation is skipped + this_rd_stats = *rd_stats; + break; + case INTERP_EVAL_INVALID: + default: assert(0); return 0; + } + int64_t tmp_rd = + RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist); + + if (tmp_rd < *rd) { + *rd = tmp_rd; + *switchable_rate = tmp_rs; + if (skip_pred != interp_search_flags->default_interp_skip_flags) { + if (skip_pred == INTERP_EVAL_LUMA_EVAL_CHROMA) { + // Overwrite the data as current filter is the best one + *rd_stats_luma = this_rd_stats_luma; + *rd_stats = this_rd_stats; + // As luma MC data is computed, no need to recompute after the search + x->recalc_luma_mc_data = 0; + } else if (skip_pred == INTERP_SKIP_LUMA_EVAL_CHROMA) { + // As luma MC data is not computed, update of luma data can be skipped + *rd_stats = this_rd_stats; + // As luma MC data is not recomputed and current filter is the best, + // indicate the possibility of recomputing MC data + // If current buffer contains valid MC data, toggle to indicate that + // luma MC data needs to be recomputed + x->recalc_luma_mc_data ^= 1; + } + swap_dst_buf(xd, dst_bufs, num_planes); + } + return 1; + } + mbmi->interp_filters = last_best; + return 0; +} + +static INLINE INTERP_PRED_TYPE is_pred_filter_search_allowed( + const AV1_COMP *const cpi, MACROBLOCKD *xd, BLOCK_SIZE bsize, + int_interpfilters *af, int_interpfilters *lf) { + const AV1_COMMON *cm = &cpi->common; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int bsl = mi_size_wide_log2[bsize]; + int is_horiz_eq = 0, is_vert_eq = 0; + + if (above_mbmi && is_inter_block(above_mbmi)) + *af = above_mbmi->interp_filters; + + if (left_mbmi && is_inter_block(left_mbmi)) *lf = left_mbmi->interp_filters; + + if (af->as_filters.x_filter != INTERP_INVALID) + is_horiz_eq = af->as_filters.x_filter == lf->as_filters.x_filter; + if (af->as_filters.y_filter != INTERP_INVALID) + is_vert_eq = af->as_filters.y_filter == lf->as_filters.y_filter; + + INTERP_PRED_TYPE pred_filter_type = (is_vert_eq << 1) + is_horiz_eq; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int pred_filter_enable = + cpi->sf.interp_sf.cb_pred_filter_search + ? 
(((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_frame.frame_number)) & + 0x1 + : 0; + pred_filter_enable &= is_horiz_eq || is_vert_eq; + // pred_filter_search = 0: pred_filter is disabled + // pred_filter_search = 1: pred_filter is enabled and only horz pred matching + // pred_filter_search = 2: pred_filter is enabled and only vert pred matching + // pred_filter_search = 3: pred_filter is enabled and + // both vert, horz pred matching + return pred_filter_enable * pred_filter_type; +} + +static DUAL_FILTER_TYPE find_best_interp_rd_facade( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, + RD_STATS *rd_stats, int *const switchable_rate, + const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], + const int skip_pred, uint16_t allow_interp_mask, int is_w4_or_h4) { + int tmp_skip_pred = skip_pred; + DUAL_FILTER_TYPE best_filt_type = REG_REG; + + // If no filters are set to be evaluated, return from the function. + if (allow_interp_mask == 0x0) return best_filt_type; + // When the block width or height is 4, skip the pred evaluation of SHARP_SHARP + tmp_skip_pred = is_w4_or_h4 + ? cpi->interp_search_flags.default_interp_skip_flags + : skip_pred; + + // Loop over all filter types and evaluate only the allowed filter types + for (int filt_type = SHARP_SHARP; filt_type >= REG_REG; --filt_type) { + const int is_filter_allowed = + get_interp_filter_allowed_mask(allow_interp_mask, filt_type); + if (is_filter_allowed) + if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, + rd_stats_y, rd_stats, switchable_rate, + dst_bufs, filt_type, switchable_ctx, + tmp_skip_pred)) + best_filt_type = filt_type; + tmp_skip_pred = skip_pred; + } + return best_filt_type; +} + +static INLINE void pred_dual_interp_filter_rd( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, + RD_STATS *rd_stats, int *const switchable_rate, + const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], + const int skip_pred, INTERP_PRED_TYPE pred_filt_type, int_interpfilters *af, + int_interpfilters *lf) { + (void)lf; + assert(pred_filt_type > INTERP_HORZ_NEQ_VERT_NEQ); + assert(pred_filt_type < INTERP_PRED_TYPE_ALL); + uint16_t allowed_interp_mask = 0; + + if (pred_filt_type == INTERP_HORZ_EQ_VERT_NEQ) { + // pred_filter_search = 1: Only horizontal filter is matching + allowed_interp_mask = + av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.x_filter]; + } else if (pred_filt_type == INTERP_HORZ_NEQ_VERT_EQ) { + // pred_filter_search = 2: Only vertical filter is matching + allowed_interp_mask = + av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.y_filter]; + } else { + // pred_filter_search = 3: Both horizontal and vertical filter are matching + int filt_type = + af->as_filters.x_filter + af->as_filters.y_filter * SWITCHABLE_FILTERS; + set_interp_filter_allowed_mask(&allowed_interp_mask, filt_type); + } + // REG_REG has already been evaluated at the beginning + reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG); + find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd, rd_stats_y, + rd_stats, switchable_rate, dst_bufs, + switchable_ctx, skip_pred, allowed_interp_mask, 0); +} +// Evaluate dual filter type +// a) Using above, left block interp filter +// b) Find the best horizontal filter and +// then evaluate
+// Evaluate the dual filter type:
+// a) Using the above and left blocks' interp filters.
+// b) Find the best horizontal filter and then evaluate the
+//    corresponding vertical filters.
+static INLINE void fast_dual_interp_filter_rd(
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+    const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+    RD_STATS *rd_stats, int *const switchable_rate,
+    const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+    const int skip_hor, const int skip_ver) {
+  const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ;
+  int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID);
+  int_interpfilters lf = af;
+
+  if (!have_newmv_in_inter_mode(mbmi->mode)) {
+    pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf);
+  }
+
+  if (pred_filter_type) {
+    pred_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                               rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+                               switchable_ctx, (skip_hor & skip_ver),
+                               pred_filter_type, &af, &lf);
+  } else {
+    const int bw = block_size_wide[bsize];
+    const int bh = block_size_high[bsize];
+    int best_dual_mode = 0;
+    int skip_pred =
+        bw <= 4 ? interp_search_flags->default_interp_skip_flags : skip_hor;
+    // TODO(any): Make use of find_best_interp_rd_facade()
+    // if the speed impact is negligible.
+    for (int i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {
+      if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                                  rd_stats_y, rd_stats, switchable_rate,
+                                  dst_bufs, i, switchable_ctx, skip_pred)) {
+        best_dual_mode = i;
+      }
+      skip_pred = skip_hor;
+    }
+    // From the best horizontal mode, check the corresponding vertical modes.
+    skip_pred =
+        bh <= 4 ? interp_search_flags->default_interp_skip_flags : skip_ver;
+    for (int i = (best_dual_mode + (SWITCHABLE_FILTERS * 2));
+         i >= (best_dual_mode + SWITCHABLE_FILTERS); i -= SWITCHABLE_FILTERS) {
+      interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                              rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+                              i, switchable_ctx, skip_pred);
+      skip_pred = skip_ver;
+    }
+  }
+}
+
+// Find the best interp filter if dual_interp_filter = 0.
+static INLINE void find_best_non_dual_interp_filter(
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+    const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+    RD_STATS *rd_stats, int *const switchable_rate,
+    const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+    const int skip_ver, const int skip_hor) {
+  const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+  int8_t i;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  uint16_t interp_filter_search_mask =
+      interp_search_flags->interp_filter_search_mask;
+
+  if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+    const int ctx0 = av1_get_pred_context_switchable_interp(xd, 0);
+    const int ctx1 = av1_get_pred_context_switchable_interp(xd, 1);
+    const int *switchable_interp_p0 =
+        cpi->frame_probs.switchable_interp_probs[update_type][ctx0];
+    const int *switchable_interp_p1 =
+        cpi->frame_probs.switchable_interp_probs[update_type][ctx1];
+
+    static const int thr[7] = { 0, 8, 8, 8, 8, 0, 8 };
+    const int thresh = thr[update_type];
+    for (i = 0; i < SWITCHABLE_FILTERS; i++) {
+      // For the non-dual case, the probabilities for the two directions
+      // should be identical.
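+      // For reference (an illustrative note, not upstream documentation):
+      // since i + SWITCHABLE_FILTERS * i == 4 * i when SWITCHABLE_FILTERS is
+      // 3, this pruning only ever touches the diagonal dual filter types
+      //   0 (REG_REG), 4 (SMOOTH_SMOOTH) and 8 (SHARP_SHARP),
+      // i.e. the pairs with equal horizontal and vertical filters, which are
+      // the only candidates evaluated in the non-dual search below.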
+      assert(switchable_interp_p0[i] == switchable_interp_p1[i]);
+      if (switchable_interp_p0[i] < thresh &&
+          switchable_interp_p1[i] < thresh) {
+        DUAL_FILTER_TYPE filt_type = i + SWITCHABLE_FILTERS * i;
+        reset_interp_filter_allowed_mask(&interp_filter_search_mask, filt_type);
+      }
+    }
+  }
+
+  // Regular filter evaluation should have been done already, and hence it
+  // should be the winner so far.
+  assert(x->e_mbd.mi[0]->interp_filters.as_int == filter_sets[0].as_int);
+  if ((skip_hor & skip_ver) !=
+      interp_search_flags->default_interp_skip_flags) {
+    INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ;
+    int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID);
+    int_interpfilters lf = af;
+
+    pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf);
+    if (pred_filter_type) {
+      assert(af.as_filters.x_filter != INTERP_INVALID);
+      int filter_idx = SWITCHABLE * af.as_filters.x_filter;
+      // This assert checks that (filter_x == filter_y) for the non-dual
+      // filter case.
+      assert(filter_sets[filter_idx].as_filters.x_filter ==
+             filter_sets[filter_idx].as_filters.y_filter);
+      if (cpi->sf.interp_sf.adaptive_interp_filter_search &&
+          !(get_interp_filter_allowed_mask(interp_filter_search_mask,
+                                           filter_idx))) {
+        return;
+      }
+      if (filter_idx) {
+        interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                                rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+                                filter_idx, switchable_ctx,
+                                (skip_hor & skip_ver));
+      }
+      return;
+    }
+  }
+  // Reuse the regular filter's modeled rd data for the sharp filter in the
+  // following cases:
+  // 1) When bsize is 4x4.
+  // 2) When block width is 4 (i.e. 4x8/4x16 blocks) and the MV in the
+  //    vertical direction is full-pel.
+  // 3) When block height is 4 (i.e. 8x4/16x4 blocks) and the MV in the
+  //    horizontal direction is full-pel.
+  // TODO(any): Optimize cases 2 and 3 further if the luma MV in the relevant
+  // direction alone is full-pel.
+
+  if ((bsize == BLOCK_4X4) ||
+      (block_size_wide[bsize] == 4 &&
+       skip_ver == interp_search_flags->default_interp_skip_flags) ||
+      (block_size_high[bsize] == 4 &&
+       skip_hor == interp_search_flags->default_interp_skip_flags)) {
+    int skip_pred = skip_hor & skip_ver;
+    uint16_t allowed_interp_mask = 0;
+
+    // The REG_REG filter type is evaluated beforehand, hence skip it.
+    set_interp_filter_allowed_mask(&allowed_interp_mask, SHARP_SHARP);
+    set_interp_filter_allowed_mask(&allowed_interp_mask, SMOOTH_SMOOTH);
+    if (cpi->sf.interp_sf.adaptive_interp_filter_search)
+      allowed_interp_mask &= interp_filter_search_mask;
+
+    find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd,
+                               rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+                               switchable_ctx, skip_pred, allowed_interp_mask,
+                               1);
+  } else {
+    int skip_pred = (skip_hor & skip_ver);
+    for (i = (SWITCHABLE_FILTERS + 1); i < DUAL_FILTER_SET_SIZE;
+         i += (SWITCHABLE_FILTERS + 1)) {
+      // This assert checks that (filter_x == filter_y) for the non-dual
+      // filter case.
+      assert(filter_sets[i].as_filters.x_filter ==
+             filter_sets[i].as_filters.y_filter);
+      if (cpi->sf.interp_sf.adaptive_interp_filter_search &&
+          !(get_interp_filter_allowed_mask(interp_filter_search_mask, i))) {
+        continue;
+      }
+      interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                              rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+                              i, switchable_ctx, skip_pred);
+      // In the first iteration, the smooth filter is evaluated. If the smooth
+      // filter (which is less sharp) is the winner among the regular and
+      // smooth filters, the sharp filter evaluation is skipped.
+      // TODO(any): Refine this gating based on modelled rd only (i.e., by not
+      // accounting for the switchable filter rate).
+      if (cpi->sf.interp_sf.skip_sharp_interp_filter_search &&
+          skip_pred != interp_search_flags->default_interp_skip_flags) {
+        if (mbmi->interp_filters.as_int == filter_sets[SMOOTH_SMOOTH].as_int)
+          break;
+      }
+    }
+  }
+}
+
+static INLINE void calc_interp_skip_pred_flag(MACROBLOCK *const x,
+                                              const AV1_COMP *const cpi,
+                                              int *skip_hor, int *skip_ver) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int num_planes = av1_num_planes(cm);
+  const int is_compound = has_second_ref(mbmi);
+  assert(is_intrabc_block(mbmi) == 0);
+  for (int ref = 0; ref < 1 + is_compound; ++ref) {
+    const struct scale_factors *const sf =
+        get_ref_scale_factors_const(cm, mbmi->ref_frame[ref]);
+    // TODO(any): Refine the skip flag calculation considering scaling.
+    if (av1_is_scaled(sf)) {
+      *skip_hor = 0;
+      *skip_ver = 0;
+      break;
+    }
+    const MV mv = mbmi->mv[ref].as_mv;
+    int skip_hor_plane = 0;
+    int skip_ver_plane = 0;
+    for (int plane_idx = 0; plane_idx < AOMMAX(1, (num_planes - 1));
+         ++plane_idx) {
+      struct macroblockd_plane *const pd = &xd->plane[plane_idx];
+      const int bw = pd->width;
+      const int bh = pd->height;
+      const MV mv_q4 = clamp_mv_to_umv_border_sb(
+          xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+      const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+      const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+      skip_hor_plane |= ((sub_x == 0) << plane_idx);
+      skip_ver_plane |= ((sub_y == 0) << plane_idx);
+    }
+    *skip_hor &= skip_hor_plane;
+    *skip_ver &= skip_ver_plane;
+    // It is not valid that "luma MV is sub-pel, whereas chroma MV is not".
+    assert(*skip_hor != 2);
+    assert(*skip_ver != 2);
+  }
+  // When the compound prediction type is compound segment wedge, luma MC and
+  // chroma MC need to go hand in hand, as the mask generated during luma MC
+  // is required for chroma MC. If skip_hor = 0 and skip_ver = 1, the mask
+  // used for chroma MC during the vertical filter decision may be incorrect,
+  // as the temporary MC evaluation overwrites the mask.
+  // Set skip_ver to 0 in this case so that the mask is populated during
+  // luma MC.
+  if (is_compound && mbmi->compound_idx == 1 &&
+      mbmi->interinter_comp.type == COMPOUND_DIFFWTD) {
+    assert(mbmi->comp_group_idx == 1);
+    if (*skip_hor == 0 && *skip_ver == 1) *skip_ver = 0;
+  }
+}
+
+int64_t av1_interpolation_filter_search(
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+    const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst,
+    int64_t *const rd, int *const switchable_rate, int *skip_build_pred,
+    HandleInterModeArgs *args, int64_t ref_best_rd) {
+  const AV1_COMMON *cm = &cpi->common;
+  const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int need_search =
+      av1_is_interp_needed(xd) && !cpi->sf.rt_sf.skip_interp_filter_search;
+  const int ref_frame = xd->mi[0]->ref_frame[0];
+  RD_STATS rd_stats_luma, rd_stats;
+
+  // Initialization of the rd_stats structures with default values.
+  av1_init_rd_stats(&rd_stats_luma);
+  av1_init_rd_stats(&rd_stats);
+
+  int match_found_idx = -1;
+  const InterpFilter assign_filter = cm->features.interp_filter;
+
+  match_found_idx = av1_find_interp_filter_match(
+      mbmi, cpi, assign_filter, need_search, args->interp_filter_stats,
+      args->interp_filter_stats_idx);
+
+  if (match_found_idx != -1) {
+    *rd = args->interp_filter_stats[match_found_idx].rd;
+    x->pred_sse[ref_frame] =
+        args->interp_filter_stats[match_found_idx].pred_sse;
+    return 0;
+  }
+
+  int switchable_ctx[2];
+  switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0);
+  switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
+  *switchable_rate =
+      get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
+
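+  // For reference (a sketch, not the exact macro): RDCOST() combines a rate
+  // term and a distortion term into a single lambda-weighted cost, roughly
+  //   int64_t rdcost(int rdmult, int rate, int64_t dist) {
+  //     return (((int64_t)rate * rdmult) >> 9) + (dist << 7);
+  //   }
+  // with the exact definition living in av1/encoder/rd.h; *switchable_rate
+  // computed above enters the rate term of every candidate evaluated below.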
+  // Do the MC evaluation for the default filter_type.
+  // Luma MC
+  interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y,
+                       &rd_stats_luma, *skip_build_pred);
+
+#if CONFIG_COLLECT_RD_STATS == 3
+  RD_STATS rd_stats_y;
+  av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+  PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize);
+#endif  // CONFIG_COLLECT_RD_STATS == 3
+  // Chroma MC
+  if (num_planes > 1) {
+    interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_U, AOM_PLANE_V,
+                         &rd_stats, *skip_build_pred);
+  }
+  *skip_build_pred = 1;
+
+  av1_merge_rd_stats(&rd_stats, &rd_stats_luma);
+
+  assert(rd_stats.rate >= 0);
+
+  *rd = RDCOST(x->rdmult, *switchable_rate + rd_stats.rate, rd_stats.dist);
+  x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4);
+
+  if (assign_filter != SWITCHABLE || match_found_idx != -1) {
+    return 0;
+  }
+  if (!need_search) {
+    int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+    assert(mbmi->interp_filters.as_int == filters.as_int);
+    (void)filters;
+    return 0;
+  }
+  if (args->modelled_rd != NULL) {
+    if (has_second_ref(mbmi)) {
+      const int ref_mv_idx = mbmi->ref_mv_idx;
+      MV_REFERENCE_FRAME *refs = mbmi->ref_frame;
+      const int mode0 = compound_ref0_mode(mbmi->mode);
+      const int mode1 = compound_ref1_mode(mbmi->mode);
+      const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+                                 args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+      if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) {
+        return INT64_MAX;
+      }
+    }
+  }
+
+  x->recalc_luma_mc_data = 0;
+  // skip_flag is a 2-bit value (in binary form): setting the 0th bit
+  // corresponds to skipping luma MC, and setting the 1st bit corresponds to
+  // skipping chroma MC.
+  // skip_flag = 0 corresponds to "Don't skip luma and chroma MC".
+  // skip_flag = 1 corresponds to "Skip luma MC only".
+  // skip_flag = 2 is not a valid case.
+  // skip_flag = 3 corresponds to "Skip both luma and chroma MC".
+  int skip_hor = interp_search_flags->default_interp_skip_flags;
+  int skip_ver = interp_search_flags->default_interp_skip_flags;
+  calc_interp_skip_pred_flag(x, cpi, &skip_hor, &skip_ver);
+
+  // Do the interp_filter search.
+  restore_dst_buf(xd, *tmp_dst, num_planes);
+  const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst };
+  // Evaluate dual interp filters.
+  if (cm->seq_params.enable_dual_filter) {
+    if (cpi->sf.interp_sf.use_fast_interpolation_filter_search) {
+      fast_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                                 &rd_stats_luma, &rd_stats, switchable_rate,
+                                 dst_bufs, switchable_ctx, skip_hor, skip_ver);
+    } else {
+      // Use the full interpolation filter search.
+      uint16_t allowed_interp_mask = ALLOW_ALL_INTERP_FILT_MASK;
+      // The REG_REG filter type is evaluated beforehand, so the loop is
+      // repeated over REG_SMOOTH to SHARP_SHARP for the full interpolation
+      // filter search.
+      reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG);
+      find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd,
+                                 &rd_stats_luma, &rd_stats, switchable_rate,
+                                 dst_bufs, switchable_ctx,
+                                 (skip_hor & skip_ver), allowed_interp_mask, 0);
+    }
+  } else {
+    // Evaluate non-dual interp filters.
+    find_best_non_dual_interp_filter(
+        x, cpi, tile_data, bsize, orig_dst, rd, &rd_stats_luma, &rd_stats,
+        switchable_rate, dst_bufs, switchable_ctx, skip_ver, skip_hor);
+  }
+  swap_dst_buf(xd, dst_bufs, num_planes);
+  // Recompute the final MC data if required.
+  if (x->recalc_luma_mc_data == 1) {
+    // Recomputing the final luma MC data is required only if it was skipped
+    // in either of the directions. The condition below is necessary, but not
+    // sufficient.
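+    // A hedged note on the invariant (illustrative, not upstream
+    // documentation): recalc_luma_mc_data is only toggled on the
+    // INTERP_SKIP_LUMA_EVAL_CHROMA path of interpolation_filter_rd(), which
+    // is reachable only when luma MC was skippable in at least one
+    // direction, i.e. something like
+    //   int may_need_recalc(int skip_hor, int skip_ver) {
+    //     return (skip_hor == 1) || (skip_ver == 1);
+    //   }
+    // must hold, which is exactly what the assert below checks.
+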
assert((skip_hor == 1) || (skip_ver == 1)); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + } + x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4); + + // save search results + if (cpi->sf.interp_sf.use_interp_filter) { + assert(match_found_idx == -1); + args->interp_filter_stats_idx = save_interp_filter_search_stat( + mbmi, *rd, x->pred_sse[ref_frame], args->interp_filter_stats, + args->interp_filter_stats_idx); + } + return 0; +} diff --git a/libs/libaom/src/av1/encoder/interp_search.h b/libs/libaom/src/av1/encoder/interp_search.h new file mode 100644 index 000000000..401e14f5b --- /dev/null +++ b/libs/libaom/src/av1/encoder/interp_search.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_ +#define AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_ + +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/rdopt_utils.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_INTERP_FILTER_STATS 128 +#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS) + +typedef struct { + int_interpfilters filters; + int_mv mv[2]; + int8_t ref_frames[2]; + COMPOUND_TYPE comp_type; + int compound_idx; + int64_t rd; + unsigned int pred_sse; +} INTERPOLATION_FILTER_STATS; + +typedef struct { + // OBMC secondary prediction buffers and respective strides + uint8_t *above_pred_buf[MAX_MB_PLANE]; + int above_pred_stride[MAX_MB_PLANE]; + uint8_t *left_pred_buf[MAX_MB_PLANE]; + int left_pred_stride[MAX_MB_PLANE]; + int_mv (*single_newmv)[REF_FRAMES]; + // Pointer to array of motion vectors to use for each ref and their rates + // Should point to first of 2 arrays in 2D array + int (*single_newmv_rate)[REF_FRAMES]; + int (*single_newmv_valid)[REF_FRAMES]; + // Pointer to array of predicted rate-distortion + // Should point to first of 2 arrays in 2D array + int64_t (*modelled_rd)[MAX_REF_MV_SEARCH][REF_FRAMES]; + int ref_frame_cost; + int single_comp_cost; + int64_t (*simple_rd)[MAX_REF_MV_SEARCH][REF_FRAMES]; + int skip_motion_mode; + INTERINTRA_MODE *inter_intra_mode; + int single_ref_first_pass; + SimpleRDState *simple_rd_state; + // [comp_idx][saved stat_idx] + INTERPOLATION_FILTER_STATS interp_filter_stats[MAX_INTERP_FILTER_STATS]; + int interp_filter_stats_idx; +} HandleInterModeArgs; + +static const int_interpfilters filter_sets[DUAL_FILTER_SET_SIZE] = { + { 0x00000000 }, { 0x00010000 }, { 0x00020000 }, // y = 0 + { 0x00000001 }, { 0x00010001 }, { 0x00020001 }, // y = 1 + { 0x00000002 }, { 0x00010002 }, { 0x00020002 }, // y = 2 +}; + +int av1_find_interp_filter_match( + MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi, + const InterpFilter assign_filter, const int need_search, + INTERPOLATION_FILTER_STATS *interp_filter_stats, + int interp_filter_stats_idx); + +int64_t av1_interpolation_filter_search( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, 
BLOCK_SIZE bsize, + const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst, + int64_t *const rd, int *const switchable_rate, int *skip_build_pred, + HandleInterModeArgs *args, int64_t ref_best_rd); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_ diff --git a/libs/libaom/src/av1/encoder/intra_mode_search.c b/libs/libaom/src/av1/encoder/intra_mode_search.c new file mode 100644 index 000000000..43192a945 --- /dev/null +++ b/libs/libaom/src/av1/encoder/intra_mode_search.c @@ -0,0 +1,2132 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/intra_mode_search.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/palette.h" +#include "av1/common/pred_common.h" +#include "av1/common/reconintra.h" +#include "av1/encoder/tx_search.h" + +static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = { + DC_PRED, H_PRED, V_PRED, SMOOTH_PRED, PAETH_PRED, + SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED, D157_PRED, + D67_PRED, D113_PRED, D45_PRED, +}; + +static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = { + UV_DC_PRED, UV_CFL_PRED, UV_H_PRED, UV_V_PRED, + UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED, + UV_D135_PRED, UV_D203_PRED, UV_D157_PRED, UV_D67_PRED, + UV_D113_PRED, UV_D45_PRED, +}; + +#define BINS 32 +static const float intra_hog_model_bias[DIRECTIONAL_MODES] = { + 0.450578f, 0.695518f, -0.717944f, -0.639894f, + -0.602019f, -0.453454f, 0.055857f, -0.465480f, +}; + +static const float intra_hog_model_weights[BINS * DIRECTIONAL_MODES] = { + -3.076402f, -3.757063f, -3.275266f, -3.180665f, -3.452105f, -3.216593f, + -2.871212f, -3.134296f, -1.822324f, -2.401411f, -1.541016f, -1.195322f, + -0.434156f, 0.322868f, 2.260546f, 3.368715f, 3.989290f, 3.308487f, + 2.277893f, 0.923793f, 0.026412f, -0.385174f, -0.718622f, -1.408867f, + -1.050558f, -2.323941f, -2.225827f, -2.585453f, -3.054283f, -2.875087f, + -2.985709f, -3.447155f, 3.758139f, 3.204353f, 2.170998f, 0.826587f, + -0.269665f, -0.702068f, -1.085776f, -2.175249f, -1.623180f, -2.975142f, + -2.779629f, -3.190799f, -3.521900f, -3.375480f, -3.319355f, -3.897389f, + -3.172334f, -3.594528f, -2.879132f, -2.547777f, -2.921023f, -2.281844f, + -1.818988f, -2.041771f, -0.618268f, -1.396458f, -0.567153f, -0.285868f, + -0.088058f, 0.753494f, 2.092413f, 3.215266f, -3.300277f, -2.748658f, + -2.315784f, -2.423671f, -2.257283f, -2.269583f, -2.196660f, -2.301076f, + -2.646516f, -2.271319f, -2.254366f, -2.300102f, -2.217960f, -2.473300f, + -2.116866f, -2.528246f, -3.314712f, -1.701010f, -0.589040f, -0.088077f, + 0.813112f, 1.702213f, 2.653045f, 3.351749f, 3.243554f, 3.199409f, + 2.437856f, 1.468854f, 0.533039f, -0.099065f, -0.622643f, -2.200732f, + -4.228861f, -2.875263f, -1.273956f, -0.433280f, 0.803771f, 1.975043f, + 3.179528f, 3.939064f, 3.454379f, 3.689386f, 3.116411f, 1.970991f, + 0.798406f, -0.628514f, -1.252546f, -2.825176f, -4.090178f, -3.777448f, + -3.227314f, -3.479403f, -3.320569f, -3.159372f, -2.729202f, -2.722341f, + 
-3.054913f, -2.742923f, -2.612703f, -2.662632f, -2.907314f, -3.117794f,
+  -3.102660f, -3.970972f, -4.891357f, -3.935582f, -3.347758f, -2.721924f,
+  -2.219011f, -1.702391f, -0.866529f, -0.153743f, 0.107733f,  1.416882f,
+  2.572884f,  3.607755f,  3.974820f,  3.997783f,  2.970459f,  0.791687f,
+  -1.478921f, -1.228154f, -1.216955f, -1.765932f, -1.951003f, -1.985301f,
+  -1.975881f, -1.985593f, -2.422371f, -2.419978f, -2.531288f, -2.951853f,
+  -3.071380f, -3.277027f, -3.373539f, -4.462010f, -0.967888f, 0.805524f,
+  2.794130f,  3.685984f,  3.745195f,  3.252444f,  2.316108f,  1.399146f,
+  -0.136519f, -0.162811f, -1.004357f, -1.667911f, -1.964662f, -2.937579f,
+  -3.019533f, -3.942766f, -5.102767f, -3.882073f, -3.532027f, -3.451956f,
+  -2.944015f, -2.643064f, -2.529872f, -2.077290f, -2.809965f, -1.803734f,
+  -1.783593f, -1.662585f, -1.415484f, -1.392673f, -0.788794f, -1.204819f,
+  -1.998864f, -1.182102f, -0.892110f, -1.317415f, -1.359112f, -1.522867f,
+  -1.468552f, -1.779072f, -2.332959f, -2.160346f, -2.329387f, -2.631259f,
+  -2.744936f, -3.052494f, -2.787363f, -3.442548f, -4.245075f, -3.032172f,
+  -2.061609f, -1.768116f, -1.286072f, -0.706587f, -0.192413f, 0.386938f,
+  0.716997f,  1.481393f,  2.216702f,  2.737986f,  3.109809f,  3.226084f,
+  2.490098f,  -0.095827f, -3.864816f, -3.507248f, -3.128925f, -2.908251f,
+  -2.883836f, -2.881411f, -2.524377f, -2.624478f, -2.399573f, -2.367718f,
+  -1.918255f, -1.926277f, -1.694584f, -1.723790f, -0.966491f, -1.183115f,
+  -1.430687f, 0.872896f,  2.766550f,  3.610080f,  3.578041f,  3.334928f,
+  2.586680f,  1.895721f,  1.122195f,  0.488519f,  -0.140689f, -0.799076f,
+  -1.222860f, -1.502437f, -1.900969f, -3.206816f,
+};
+
+static void generate_hog(const uint8_t *src, int stride, int rows, int cols,
+                         float *hist) {
+  const float step = (float)PI / BINS;
+  float total = 0.1f;
+  src += stride;
+  for (int r = 1; r < rows - 1; ++r) {
+    for (int c = 1; c < cols - 1; ++c) {
+      const uint8_t *above = &src[c - stride];
+      const uint8_t *below = &src[c + stride];
+      const uint8_t *left = &src[c - 1];
+      const uint8_t *right = &src[c + 1];
+      // Calculate the gradient using Sobel filters.
+      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+                     (left[-stride] + 2 * left[0] + left[stride]);
+      const int dy = (below[-1] + 2 * below[0] + below[1]) -
+                     (above[-1] + 2 * above[0] + above[1]);
+      if (dx == 0 && dy == 0) continue;
+      const int temp = abs(dx) + abs(dy);
+      if (!temp) continue;
+      total += temp;
+      if (dx == 0) {
+        hist[0] += temp / 2;
+        hist[BINS - 1] += temp / 2;
+      } else {
+        const float angle = atanf(dy * 1.0f / dx);
+        int idx = (int)roundf(angle / step) + BINS / 2;
+        idx = AOMMIN(idx, BINS - 1);
+        idx = AOMMAX(idx, 0);
+        hist[idx] += temp;
+      }
+    }
+    src += stride;
+  }
+
+  for (int i = 0; i < BINS; ++i) hist[i] /= total;
+}
+
+static void generate_hog_hbd(const uint8_t *src8, int stride, int rows,
+                             int cols, float *hist) {
+  const float step = (float)PI / BINS;
+  float total = 0.1f;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  src += stride;
+  for (int r = 1; r < rows - 1; ++r) {
+    for (int c = 1; c < cols - 1; ++c) {
+      const uint16_t *above = &src[c - stride];
+      const uint16_t *below = &src[c + stride];
+      const uint16_t *left = &src[c - 1];
+      const uint16_t *right = &src[c + 1];
+      // Calculate the gradient using Sobel filters.
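+      // For reference (standard 3x3 Sobel kernels; an illustrative note):
+      //        [ -1  0 +1 ]            [ -1 -2 -1 ]
+      //   dx = [ -2  0 +2 ] * I,  dy = [  0  0  0 ] * I
+      //        [ -1  0 +1 ]            [ +1 +2 +1 ]
+      // The orientation atanf(dy / dx), in (-PI/2, PI/2), is quantized into
+      // one of BINS histogram bins and weighted by the magnitude |dx| + |dy|.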
+ const int dx = (right[-stride] + 2 * right[0] + right[stride]) - + (left[-stride] + 2 * left[0] + left[stride]); + const int dy = (below[-1] + 2 * below[0] + below[1]) - + (above[-1] + 2 * above[0] + above[1]); + if (dx == 0 && dy == 0) continue; + const int temp = abs(dx) + abs(dy); + if (!temp) continue; + total += temp; + if (dx == 0) { + hist[0] += temp / 2; + hist[BINS - 1] += temp / 2; + } else { + const float angle = atanf(dy * 1.0f / dx); + int idx = (int)roundf(angle / step) + BINS / 2; + idx = AOMMIN(idx, BINS - 1); + idx = AOMMAX(idx, 0); + hist[idx] += temp; + } + } + src += stride; + } + + for (int i = 0; i < BINS; ++i) hist[i] /= total; +} + +static void prune_intra_mode_with_hog(const MACROBLOCK *x, BLOCK_SIZE bsize, + float th, + uint8_t *directional_mode_skip_mask) { + aom_clear_system_state(); + + const int bh = block_size_high[bsize]; + const int bw = block_size_wide[bsize]; + const MACROBLOCKD *xd = &x->e_mbd; + const int rows = + (xd->mb_to_bottom_edge >= 0) ? bh : (xd->mb_to_bottom_edge >> 3) + bh; + const int cols = + (xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw; + const int src_stride = x->plane[0].src.stride; + const uint8_t *src = x->plane[0].src.buf; + float hist[BINS] = { 0.0f }; + if (is_cur_buf_hbd(xd)) { + generate_hog_hbd(src, src_stride, rows, cols, hist); + } else { + generate_hog(src, src_stride, rows, cols, hist); + } + + for (int i = 0; i < DIRECTIONAL_MODES; ++i) { + float this_score = intra_hog_model_bias[i]; + const float *weights = &intra_hog_model_weights[i * BINS]; + for (int j = 0; j < BINS; ++j) { + this_score += weights[j] * hist[j]; + } + if (this_score < th) directional_mode_skip_mask[i + 1] = 1; + } + + aom_clear_system_state(); +} + +#undef BINS + +// Model based RD estimation for luma intra blocks. +static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mode_cost) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + RD_STATS this_rd_stats; + int row, col; + int64_t temp_sse, this_rd; + TX_SIZE tx_size = tx_size_from_tx_mode(bsize, x->tx_mode_search_type); + const int stepr = tx_size_high_unit[tx_size]; + const int stepc = tx_size_wide_unit[tx_size]; + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int max_blocks_high = max_block_high(xd, bsize, 0); + mbmi->tx_size = tx_size; + // Prediction. + for (row = 0; row < max_blocks_high; row += stepr) { + for (col = 0; col < max_blocks_wide; col += stepc) { + av1_predict_intra_block_facade(cm, xd, 0, col, row, tx_size); + } + } + // RD estimation. + model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model ? 
MODELRD_LEGACY
+                                                   : MODELRD_TYPE_INTRA](
+      cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, &this_rd_stats.dist,
+      &this_rd_stats.skip, &temp_sse, NULL, NULL, NULL);
+  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+    mode_cost +=
+        x->angle_delta_cost[mbmi->mode - V_PRED]
+                           [MAX_ANGLE_DELTA + mbmi->angle_delta[PLANE_TYPE_Y]];
+  }
+  if (mbmi->mode == DC_PRED &&
+      av1_filter_intra_allowed_bsize(cm, mbmi->sb_type)) {
+    if (mbmi->filter_intra_mode_info.use_filter_intra) {
+      const int mode = mbmi->filter_intra_mode_info.filter_intra_mode;
+      mode_cost += x->filter_intra_cost[mbmi->sb_type][1] +
+                   x->filter_intra_mode_cost[mode];
+    } else {
+      mode_cost += x->filter_intra_cost[mbmi->sb_type][0];
+    }
+  }
+  this_rd =
+      RDCOST(x->rdmult, this_rd_stats.rate + mode_cost, this_rd_stats.dist);
+  return this_rd;
+}
+
+// Update the intra model yrd and prune the current mode if the new estimate
+// y_rd > 1.5 * best_model_rd.
+static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi,
+                                                MACROBLOCK *x, BLOCK_SIZE bsize,
+                                                int mode_info_cost,
+                                                int64_t *best_model_rd) {
+  const int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, mode_info_cost);
+  if (*best_model_rd != INT64_MAX &&
+      this_model_rd > *best_model_rd + (*best_model_rd >> 1)) {
+    return 1;
+  } else if (this_model_rd < *best_model_rd) {
+    *best_model_rd = this_model_rd;
+  }
+  return 0;
+}
+
+// Run the RD calculation with a given luma intra prediction angle and return
+// the RD cost. Update the best mode info if the RD cost is the best so far.
+static int64_t calc_rd_given_intra_angle(
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost,
+    int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate,
+    RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size,
+    int64_t *best_rd, int64_t *best_model_rd, uint8_t *best_tx_type_map,
+    uint8_t *best_blk_skip, int skip_model_rd) {
+  RD_STATS tokenonly_rd_stats;
+  int64_t this_rd;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int n4 = bsize_to_num_blk(bsize);
+  assert(!is_inter_block(mbmi));
+  mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta;
+  if (!skip_model_rd) {
+    if (model_intra_yrd_and_prune(cpi, x, bsize, mode_cost, best_model_rd)) {
+      return INT64_MAX;
+    }
+  }
+  av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+                                    best_rd_in);
+  if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX;
+
+  int this_rate =
+      mode_cost + tokenonly_rd_stats.rate +
+      x->angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta];
+  this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+  if (this_rd < *best_rd) {
+    memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4);
+    av1_copy_array(best_tx_type_map, xd->tx_type_map, n4);
+    *best_rd = this_rd;
+    *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_Y];
+    *best_tx_size = mbmi->tx_size;
+    *rate = this_rate;
+    rd_stats->rate = tokenonly_rd_stats.rate;
+    rd_stats->dist = tokenonly_rd_stats.dist;
+    rd_stats->skip = tokenonly_rd_stats.skip;
+  }
+  return this_rd;
+}
+
+static INLINE int write_uniform_cost(int n, int v) {
+  const int l = get_unsigned_bits(n);
+  const int m = (1 << l) - n;
+  if (l == 0) return 0;
+  if (v < m)
+    return av1_cost_literal(l - 1);
+  else
+    return av1_cost_literal(l);
+}
+
+// Return the rate cost for the luma prediction mode info of intra blocks.
+static int intra_mode_info_cost_y(const AV1_COMP *cpi, const MACROBLOCK *x, + const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, + int mode_cost) { + int total_rate = mode_cost; + const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0; + const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra; + const int use_intrabc = mbmi->use_intrabc; + // Can only activate one mode. + assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc + + use_filter_intra) <= 1); + const int try_palette = av1_allow_palette( + cpi->common.features.allow_screen_content_tools, mbmi->sb_type); + if (try_palette && mbmi->mode == DC_PRED) { + const MACROBLOCKD *xd = &x->e_mbd; + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + const int mode_ctx = av1_get_palette_mode_ctx(xd); + total_rate += x->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette]; + if (use_palette) { + const uint8_t *const color_map = xd->plane[0].color_index_map; + int block_width, block_height, rows, cols; + av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, + &cols); + const int plt_size = mbmi->palette_mode_info.palette_size[0]; + int palette_mode_cost = + x->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] + + write_uniform_cost(plt_size, color_map[0]); + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); + palette_mode_cost += + av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache, + n_cache, cpi->common.seq_params.bit_depth); + palette_mode_cost += + av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP); + total_rate += palette_mode_cost; + } + } + if (av1_filter_intra_allowed(&cpi->common, mbmi)) { + total_rate += x->filter_intra_cost[mbmi->sb_type][use_filter_intra]; + if (use_filter_intra) { + total_rate += x->filter_intra_mode_cost[mbmi->filter_intra_mode_info + .filter_intra_mode]; + } + } + if (av1_is_directional_mode(mbmi->mode)) { + if (av1_use_angle_delta(bsize)) { + total_rate += x->angle_delta_cost[mbmi->mode - V_PRED] + [MAX_ANGLE_DELTA + + mbmi->angle_delta[PLANE_TYPE_Y]]; + } + } + if (av1_allow_intrabc(&cpi->common)) + total_rate += x->intrabc_cost[use_intrabc]; + return total_rate; +} + +// Return the rate cost for chroma prediction mode info. of intra blocks. +static int intra_mode_info_cost_uv(const AV1_COMP *cpi, const MACROBLOCK *x, + const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, + int mode_cost) { + int total_rate = mode_cost; + const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0; + const UV_PREDICTION_MODE mode = mbmi->uv_mode; + // Can only activate one mode. 
+  assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
+
+  const int try_palette = av1_allow_palette(
+      cpi->common.features.allow_screen_content_tools, mbmi->sb_type);
+  if (try_palette && mode == UV_DC_PRED) {
+    const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
+    total_rate +=
+        x->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette];
+    if (use_palette) {
+      const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+      const int plt_size = pmi->palette_size[1];
+      const MACROBLOCKD *xd = &x->e_mbd;
+      const uint8_t *const color_map = xd->plane[1].color_index_map;
+      int palette_mode_cost =
+          x->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+          write_uniform_cost(plt_size, color_map[0]);
+      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+      const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+      palette_mode_cost += av1_palette_color_cost_uv(
+          pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth);
+      palette_mode_cost +=
+          av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
+      total_rate += palette_mode_cost;
+    }
+  }
+  if (av1_is_directional_mode(get_uv_mode(mode))) {
+    if (av1_use_angle_delta(bsize)) {
+      total_rate +=
+          x->angle_delta_cost[mode - V_PRED][mbmi->angle_delta[PLANE_TYPE_UV] +
+                                             MAX_ANGLE_DELTA];
+    }
+  }
+  return total_rate;
+}
+
+// Return 1 if a filter intra mode is selected; return 0 otherwise.
+static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    int *rate, int *rate_tokenonly,
+                                    int64_t *distortion, int *skippable,
+                                    BLOCK_SIZE bsize, int mode_cost,
+                                    int64_t *best_rd, int64_t *best_model_rd,
+                                    PICK_MODE_CONTEXT *ctx) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  int filter_intra_selected_flag = 0;
+  FILTER_INTRA_MODE mode;
+  TX_SIZE best_tx_size = TX_8X8;
+  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  (void)ctx;
+  av1_zero(filter_intra_mode_info);
+  mbmi->filter_intra_mode_info.use_filter_intra = 1;
+  mbmi->mode = DC_PRED;
+  mbmi->palette_mode_info.palette_size[0] = 0;
+
+  for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+    int64_t this_rd;
+    RD_STATS tokenonly_rd_stats;
+    mbmi->filter_intra_mode_info.filter_intra_mode = mode;
+
+    if (model_intra_yrd_and_prune(cpi, x, bsize, mode_cost, best_model_rd)) {
+      continue;
+    }
+    av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+                                      *best_rd);
+    if (tokenonly_rd_stats.rate == INT_MAX) continue;
+    const int this_rate =
+        tokenonly_rd_stats.rate +
+        intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
+    this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+    // Collect mode stats for multiwinner mode processing.
+    const int txfm_search_done = 1;
+    store_winner_mode_stats(
+        &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
+        cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
+        txfm_search_done);
+    if (this_rd < *best_rd) {
+      *best_rd = this_rd;
+      best_tx_size = mbmi->tx_size;
+      filter_intra_mode_info = mbmi->filter_intra_mode_info;
+      av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+      memcpy(ctx->blk_skip, x->blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      *rate = this_rate;
+      *rate_tokenonly = tokenonly_rd_stats.rate;
+      *distortion = tokenonly_rd_stats.dist;
+      *skippable = tokenonly_rd_stats.skip;
+      filter_intra_selected_flag = 1;
+    }
+  }
+
+  if (filter_intra_selected_flag) {
+    mbmi->mode = DC_PRED;
+    mbmi->tx_size = best_tx_size;
+    mbmi->filter_intra_mode_info =
filter_intra_mode_info; + av1_copy_array(ctx->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); + return 1; + } else { + return 0; + } +} + +int av1_count_colors(const uint8_t *src, int stride, int rows, int cols, + int *val_count) { + const int max_pix_val = 1 << 8; + memset(val_count, 0, max_pix_val * sizeof(val_count[0])); + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + const int this_val = src[r * stride + c]; + assert(this_val < max_pix_val); + ++val_count[this_val]; + } + } + int n = 0; + for (int i = 0; i < max_pix_val; ++i) { + if (val_count[i]) ++n; + } + return n; +} + +int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, + int bit_depth, int *val_count) { + assert(bit_depth <= 12); + const int max_pix_val = 1 << bit_depth; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + memset(val_count, 0, max_pix_val * sizeof(val_count[0])); + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + const int this_val = src[r * stride + c]; + assert(this_val < max_pix_val); + if (this_val >= max_pix_val) return 0; + ++val_count[this_val]; + } + } + int n = 0; + for (int i = 0; i < max_pix_val; ++i) { + if (val_count[i]) ++n; + } + return n; +} + +// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x +// new_height'. Extra rows and columns are filled in by copying last valid +// row/column. +static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map, + int orig_width, int orig_height, + int new_width, int new_height) { + int j; + assert(new_width >= orig_width); + assert(new_height >= orig_height); + if (new_width == orig_width && new_height == orig_height) return; + + for (j = orig_height - 1; j >= 0; --j) { + memmove(color_map + j * new_width, color_map + j * orig_width, orig_width); + // Copy last column to extra columns. + memset(color_map + j * new_width + orig_width, + color_map[j * new_width + orig_width - 1], new_width - orig_width); + } + // Copy last row to extra rows. + for (j = orig_height; j < new_height; ++j) { + memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width, + new_width); + } +} + +// Bias toward using colors in the cache. +// TODO(huisu): Try other schemes to improve compression. +static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache, + int n_cache, int n_colors, + int stride, int *centroids) { + if (n_cache <= 0) return; + for (int i = 0; i < n_colors * stride; i += stride) { + int min_diff = abs(centroids[i] - (int)color_cache[0]); + int idx = 0; + for (int j = 1; j < n_cache; ++j) { + const int this_diff = abs(centroids[i] - color_cache[j]); + if (this_diff < min_diff) { + min_diff = this_diff; + idx = j; + } + } + if (min_diff <= 1) centroids[i] = color_cache[idx]; + } +} + +// Given the base colors as specified in centroids[], calculate the RD cost +// of palette mode. 
+static AOM_INLINE void palette_rd_y( + const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, + BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n, + uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi, + uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd, + int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, + int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip, + uint8_t *tx_type_map, int *beat_best_pallette_rd) { + optimize_palette_colors(color_cache, n_cache, n, 1, centroids); + const int num_unique_colors = av1_remove_duplicates(centroids, n); + if (num_unique_colors < PALETTE_MIN_SIZE) { + // Too few unique colors to create a palette. And DC_PRED will work + // well for that case anyway. So skip. + return; + } + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + if (cpi->common.seq_params.use_highbitdepth) { + for (int i = 0; i < num_unique_colors; ++i) { + pmi->palette_colors[i] = clip_pixel_highbd( + (int)centroids[i], cpi->common.seq_params.bit_depth); + } + } else { + for (int i = 0; i < num_unique_colors; ++i) { + pmi->palette_colors[i] = clip_pixel(centroids[i]); + } + } + pmi->palette_size[0] = num_unique_colors; + MACROBLOCKD *const xd = &x->e_mbd; + uint8_t *const color_map = xd->plane[0].color_index_map; + int block_width, block_height, rows, cols; + av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, + &cols); + av1_calc_indices(data, centroids, color_map, rows * cols, num_unique_colors, + 1); + extend_palette_color_map(color_map, cols, rows, block_width, block_height); + + const int palette_mode_cost = + intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost); + if (model_intra_yrd_and_prune(cpi, x, bsize, palette_mode_cost, + best_model_rd)) { + return; + } + + RD_STATS tokenonly_rd_stats; + av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize, + *best_rd); + if (tokenonly_rd_stats.rate == INT_MAX) return; + int this_rate = tokenonly_rd_stats.rate + palette_mode_cost; + int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) { + tokenonly_rd_stats.rate -= tx_size_cost(x, bsize, mbmi->tx_size); + } + // Collect mode stats for multiwinner mode processing + const int txfm_search_done = 1; + store_winner_mode_stats( + &cpi->common, x, mbmi, NULL, NULL, NULL, THR_DC, color_map, bsize, + this_rd, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, + txfm_search_done); + if (this_rd < *best_rd) { + *best_rd = this_rd; + // Setting beat_best_rd flag because current mode rd is better than best_rd. 
+    // This flag needs to be updated only for palette evaluation in key
+    // frames.
+    if (beat_best_rd) *beat_best_rd = 1;
+    memcpy(best_palette_color_map, color_map,
+           block_width * block_height * sizeof(color_map[0]));
+    *best_mbmi = *mbmi;
+    memcpy(blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+    av1_copy_array(tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+    if (rate) *rate = this_rate;
+    if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
+    if (distortion) *distortion = tokenonly_rd_stats.dist;
+    if (skippable) *skippable = tokenonly_rd_stats.skip;
+    if (beat_best_pallette_rd) *beat_best_pallette_rd = 1;
+  }
+}
+
+static AOM_INLINE int perform_top_color_coarse_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data,
+    const int *const top_colors, int start_n, int end_n, int step_size,
+    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
+    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
+    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
+    uint8_t *tx_type_map) {
+  int centroids[PALETTE_MAX_SIZE];
+  int n = start_n;
+  int top_color_winner = end_n + 1;
+  while (1) {
+    int beat_best_pallette_rd = 0;
+    for (int i = 0; i < n; ++i) centroids[i] = top_colors[i];
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_pallette_rd);
+    // Record n as the winner if the current palette colors win.
+    if (beat_best_pallette_rd) top_color_winner = n;
+    n += step_size;
+    if (n > end_n) break;
+  }
+  return top_color_winner;
+}
+
+static AOM_INLINE int perform_k_means_coarse_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lb, int ub,
+    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
+    int data_points) {
+  int centroids[PALETTE_MAX_SIZE];
+  const int max_itr = 50;
+  int n = start_n;
+  int k_means_winner = end_n + 1;
+  while (1) {
+    int beat_best_pallette_rd = 0;
+    for (int i = 0; i < n; ++i) {
+      centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
+    }
+    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_pallette_rd);
+    // Record n as the winner if the current palette colors win.
+    if (beat_best_pallette_rd) k_means_winner = n;
+    n += step_size;
+    if (n > end_n) break;
+  }
+  return k_means_winner;
+}
+
+// Perform a palette search over the top colors, from the minimum (or maximum)
+// number of palette colors with a step size of +1 (or -1).
+static AOM_INLINE int perform_top_color_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *top_colors,
+    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map) {
+  int centroids[PALETTE_MAX_SIZE];
+  int n = start_n;
+  assert((step_size == -1) || (step_size == 1) || (step_size == 0) ||
+         (step_size == 2));
+  assert(IMPLIES(step_size == -1, start_n > end_n));
+  assert(IMPLIES(step_size == 1, start_n < end_n));
+  while (1) {
+    int beat_best_pallette_rd = 0;
+    for (int i = 0; i < n; ++i) centroids[i] = top_colors[i];
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_pallette_rd);
+    // Return early if the current palette colors are not winning.
+    if ((cpi->sf.intra_sf.prune_palette_search_level == 2) &&
+        !beat_best_pallette_rd)
+      return n;
+    n += step_size;
+    if (n == end_n) break;
+  }
+  return n;
+}
+// Perform a k-means based palette search, from the minimum (or maximum)
+// number of palette colors with a step size of +1 (or -1).
+static AOM_INLINE int perform_k_means_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lb, int ub,
+    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
+    int data_points) {
+  int centroids[PALETTE_MAX_SIZE];
+  const int max_itr = 50;
+  int n = start_n;
+  assert((step_size == -1) || (step_size == 1) || (step_size == 0) ||
+         (step_size == 2));
+  assert(IMPLIES(step_size == -1, start_n > end_n));
+  assert(IMPLIES(step_size == 1, start_n < end_n));
+  while (1) {
+    int beat_best_pallette_rd = 0;
+    for (int i = 0; i < n; ++i) {
+      centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
+    }
+    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_pallette_rd);
+    // Return early if the current palette colors are not winning.
+    if ((cpi->sf.intra_sf.prune_palette_search_level == 2) &&
+        !beat_best_pallette_rd)
+      return n;
+    n += step_size;
+    if (n == end_n) break;
+  }
+  return n;
+}
+
+#define START_N_STAGE2(x)                         \
+  ((x == PALETTE_MIN_SIZE) ? PALETTE_MIN_SIZE + 1 \
+                           : AOMMAX(x - 1, PALETTE_MIN_SIZE));
+#define END_N_STAGE2(x, end_n) \
+  ((x == end_n) ? x - 1 : AOMMIN(x + 1, PALETTE_MAX_SIZE));
+
+static AOM_INLINE void update_start_end_stage_2(int *start_n_stage2,
+                                                int *end_n_stage2,
+                                                int *step_size_stage2,
+                                                int winner, int end_n) {
+  *start_n_stage2 = START_N_STAGE2(winner);
+  *end_n_stage2 = END_N_STAGE2(winner, end_n);
+  *step_size_stage2 = *end_n_stage2 - *start_n_stage2;
+}
+
+// The start index and step size below are chosen to evaluate unique
+// candidates in the neighbor search, in case a winner candidate is found in
+// the coarse search. For example:
+// 1) 8 colors (end_n = 8): 2,3,4,5,6,7,8. start_n is chosen as 2 and the
+//    step size is chosen as 3. Therefore, the coarse search will evaluate 2,
+//    5 and 8. If the winner is found at 5, then 4 and 6 are evaluated.
+//    Similarly, for 2 (3) and 8 (7).
+// 2) 7 colors (end_n = 7): 2,3,4,5,6,7. If start_n is chosen as 2 (the same
+//    as for 8 colors) then the step size should also be 2, to cover all
+//    candidates. The coarse search will evaluate 2, 4 and 6. If the winner
+//    is either 2 or 4, 3 will be evaluated. Instead, if start_n = 3 and
+//    step_size = 3, the coarse search will evaluate 3 and 6. For the winner,
+//    unique neighbors (3: 2,4 or 6: 5,7) would be evaluated.
+
+// Start index for the coarse palette search for dominant colors and k-means.
+static const uint8_t start_n_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+                                                                    3, 3, 2,
+                                                                    3, 3, 2 };
+// Step size for the coarse palette search for dominant colors and k-means.
+static const uint8_t step_size_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+                                                                      3, 3, 3,
+                                                                      3, 3, 3 };
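+// For reference (derived from the two tables above; an illustrative note):
+// the coarse stage evaluates the following palette sizes per end_n,
+//   end_n = 3 or 4: { 3 }      end_n = 5: { 2, 5 }
+//   end_n = 6 or 7: { 3, 6 }   end_n = 8: { 2, 5, 8 }
+// after which update_start_end_stage_2() probes the +/-1 neighbors of the
+// winner; e.g. a winner of 5 with end_n = 8 re-runs sizes 4 and 6.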
+static void rd_pick_palette_intra_sby(
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+    int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+    int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly,
+    int64_t *distortion, int *skippable, int *beat_best_rd,
+    PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, uint8_t *tx_type_map) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+                           bsize));
+
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *const src = x->plane[0].src.buf;
+  int block_width, block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                           &cols);
+  const SequenceHeader *const seq_params = &cpi->common.seq_params;
+  const int is_hbd = seq_params->use_highbitdepth;
+  const int bit_depth = seq_params->bit_depth;
+  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
+  int colors;
+  if (is_hbd) {
+    colors = av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth,
+                                     count_buf);
+  } else {
+    colors = av1_count_colors(src, src_stride, rows, cols, count_buf);
+  }
+
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  if (colors > 1 && colors <= 64) {
+    int *const data = x->palette_buffer->kmeans_data_buf;
+    int centroids[PALETTE_MAX_SIZE];
+    int lb, ub;
+    if (is_hbd) {
+      int *data_pt = data;
+      const uint16_t *src_pt = CONVERT_TO_SHORTPTR(src);
+      lb = ub = src_pt[0];
+      for (int r = 0; r < rows; ++r) {
+        for (int c = 0; c < cols; ++c) {
+          const int val = src_pt[c];
+          data_pt[c] = val;
+          lb = AOMMIN(lb, val);
+          ub = AOMMAX(ub, val);
+        }
+        src_pt += src_stride;
+        data_pt += cols;
+      }
+    } else {
+      int *data_pt = data;
+      const uint8_t *src_pt = src;
+      lb = ub = src[0];
+      for (int r = 0; r < rows; ++r) {
+        for (int c = 0; c < cols; ++c) {
+          const int val = src_pt[c];
+          data_pt[c] = val;
+          lb = AOMMIN(lb, val);
+          ub = AOMMAX(ub, val);
+        }
+        src_pt += src_stride;
+        data_pt += cols;
+      }
+    }
+
+    mbmi->mode = DC_PRED;
+    mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+    uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+    const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+
+    // Find the dominant colors, stored in top_colors[].
+ int top_colors[PALETTE_MAX_SIZE] = { 0 }; + for (int i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) { + int max_count = 0; + for (int j = 0; j < (1 << bit_depth); ++j) { + if (count_buf[j] > max_count) { + max_count = count_buf[j]; + top_colors[i] = j; + } + } + assert(max_count > 0); + count_buf[top_colors[i]] = 0; + } + + // Try the dominant colors directly. + // TODO(huisu@google.com): Try to avoid duplicate computation in cases + // where the dominant colors and the k-means results are similar. + if ((cpi->sf.intra_sf.prune_palette_search_level == 1) && + (colors > PALETTE_MIN_SIZE)) { + const int end_n = AOMMIN(colors, PALETTE_MAX_SIZE); + assert(PALETTE_MAX_SIZE == 8); + assert(PALETTE_MIN_SIZE == 2); + // Choose the start index and step size for coarse search based on number + // of colors + const int start_n = start_n_lookup_table[end_n]; + const int step_size = step_size_lookup_table[end_n]; + // Perform top color coarse palette search to find the winner candidate + const int top_color_winner = perform_top_color_coarse_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n, end_n, + step_size, color_cache, n_cache, best_mbmi, best_palette_color_map, + best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable, + beat_best_rd, ctx, best_blk_skip, tx_type_map); + // Evaluate neighbors for the winner color (if winner is found) in the + // above coarse search for dominant colors + if (top_color_winner <= end_n) { + int start_n_stage2, end_n_stage2, step_size_stage2; + update_start_end_stage_2(&start_n_stage2, &end_n_stage2, + &step_size_stage2, top_color_winner, end_n); + // perform finer search for the winner candidate + perform_top_color_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n_stage2, + end_n_stage2 + step_size_stage2, step_size_stage2, color_cache, + n_cache, best_mbmi, best_palette_color_map, best_rd, best_model_rd, + rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map); + } + // K-means clustering. 
+ // Perform k-means coarse palette search to find the winner candidate + const int k_means_winner = perform_k_means_coarse_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n, end_n, + step_size, color_cache, n_cache, best_mbmi, best_palette_color_map, + best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable, + beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map, + rows * cols); + // Evaluate neighbors for the winner color (if winner is found) in the + // above coarse search for k-means + if (k_means_winner <= end_n) { + int start_n_stage2, end_n_stage2, step_size_stage2; + update_start_end_stage_2(&start_n_stage2, &end_n_stage2, + &step_size_stage2, k_means_winner, end_n); + // perform finer search for the winner candidate + perform_k_means_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n_stage2, + end_n_stage2 + step_size_stage2, step_size_stage2, color_cache, + n_cache, best_mbmi, best_palette_color_map, best_rd, best_model_rd, + rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map, color_map, rows * cols); + } + } else { + const int start_n = AOMMIN(colors, PALETTE_MAX_SIZE), + end_n = PALETTE_MIN_SIZE; + // Perform top color palette search from start_n + const int top_color_winner = perform_top_color_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n, + end_n - 1, -1, color_cache, n_cache, best_mbmi, + best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map); + + if (top_color_winner > end_n) { + // Perform top color palette search in reverse order for the remaining + // colors + perform_top_color_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, end_n, + top_color_winner, 1, color_cache, n_cache, best_mbmi, + best_palette_color_map, best_rd, best_model_rd, rate, + rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map); + } + // K-means clustering. + if (colors == PALETTE_MIN_SIZE) { + // Special case: These colors automatically become the centroids. 
+ assert(colors == 2); + centroids[0] = lb; + centroids[1] = ub; + palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors, + color_cache, n_cache, best_mbmi, best_palette_color_map, + best_rd, best_model_rd, rate, rate_tokenonly, distortion, + skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, + NULL); + } else { + // Perform k-means palette search from start_n + const int k_means_winner = perform_k_means_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n, end_n - 1, + -1, color_cache, n_cache, best_mbmi, best_palette_color_map, + best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable, + beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map, + rows * cols); + if (k_means_winner > end_n) { + // Perform k-means palette search in reverse order for the remaining + // colors + perform_k_means_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, end_n, + k_means_winner, 1, color_cache, n_cache, best_mbmi, + best_palette_color_map, best_rd, best_model_rd, rate, + rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map, color_map, rows * cols); + } + } + } + } + + if (best_mbmi->palette_mode_info.palette_size[0] > 0) { + memcpy(color_map, best_palette_color_map, + block_width * block_height * sizeof(best_palette_color_map[0])); + } + *mbmi = *best_mbmi; +} + +static AOM_INLINE void rd_pick_palette_intra_sbuv( + const AV1_COMP *const cpi, MACROBLOCK *x, int dc_mode_cost, + uint8_t *best_palette_color_map, MB_MODE_INFO *const best_mbmi, + int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion, + int *skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools, + mbmi->sb_type)); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const BLOCK_SIZE bsize = mbmi->sb_type; + const SequenceHeader *const seq_params = &cpi->common.seq_params; + int this_rate; + int64_t this_rd; + int colors_u, colors_v, colors; + const int src_stride = x->plane[1].src.stride; + const uint8_t *const src_u = x->plane[1].src.buf; + const uint8_t *const src_v = x->plane[2].src.buf; + uint8_t *const color_map = xd->plane[1].color_index_map; + RD_STATS tokenonly_rd_stats; + int plane_block_width, plane_block_height, rows, cols; + av1_get_block_dimensions(bsize, 1, xd, &plane_block_width, + &plane_block_height, &rows, &cols); + + mbmi->uv_mode = UV_DC_PRED; + + int count_buf[1 << 12]; // Maximum (1 << 12) color levels. + if (seq_params->use_highbitdepth) { + colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols, + seq_params->bit_depth, count_buf); + colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols, + seq_params->bit_depth, count_buf); + } else { + colors_u = av1_count_colors(src_u, src_stride, rows, cols, count_buf); + colors_v = av1_count_colors(src_v, src_stride, rows, cols, count_buf); + } + + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 1, color_cache); + + colors = colors_u > colors_v ? 
colors_u : colors_v; + if (colors > 1 && colors <= 64) { + int r, c, n, i, j; + const int max_itr = 50; + int lb_u, ub_u, val_u; + int lb_v, ub_v, val_v; + int *const data = x->palette_buffer->kmeans_data_buf; + int centroids[2 * PALETTE_MAX_SIZE]; + + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u); + uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v); + if (seq_params->use_highbitdepth) { + lb_u = src_u16[0]; + ub_u = src_u16[0]; + lb_v = src_v16[0]; + ub_v = src_v16[0]; + } else { + lb_u = src_u[0]; + ub_u = src_u[0]; + lb_v = src_v[0]; + ub_v = src_v[0]; + } + + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) { + if (seq_params->use_highbitdepth) { + val_u = src_u16[r * src_stride + c]; + val_v = src_v16[r * src_stride + c]; + data[(r * cols + c) * 2] = val_u; + data[(r * cols + c) * 2 + 1] = val_v; + } else { + val_u = src_u[r * src_stride + c]; + val_v = src_v[r * src_stride + c]; + data[(r * cols + c) * 2] = val_u; + data[(r * cols + c) * 2 + 1] = val_v; + } + if (val_u < lb_u) + lb_u = val_u; + else if (val_u > ub_u) + ub_u = val_u; + if (val_v < lb_v) + lb_v = val_v; + else if (val_v > ub_v) + ub_v = val_v; + } + } + + for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2; + --n) { + for (i = 0; i < n; ++i) { + centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2; + centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2; + } + av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr); + optimize_palette_colors(color_cache, n_cache, n, 2, centroids); + // Sort the U channel colors in ascending order. + for (i = 0; i < 2 * (n - 1); i += 2) { + int min_idx = i; + int min_val = centroids[i]; + for (j = i + 2; j < 2 * n; j += 2) + if (centroids[j] < min_val) min_val = centroids[j], min_idx = j; + if (min_idx != i) { + int temp_u = centroids[i], temp_v = centroids[i + 1]; + centroids[i] = centroids[min_idx]; + centroids[i + 1] = centroids[min_idx + 1]; + centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v; + } + } + av1_calc_indices(data, centroids, color_map, rows * cols, n, 2); + extend_palette_color_map(color_map, cols, rows, plane_block_width, + plane_block_height); + pmi->palette_size[1] = n; + for (i = 1; i < 3; ++i) { + for (j = 0; j < n; ++j) { + if (seq_params->use_highbitdepth) + pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd( + (int)centroids[j * 2 + i - 1], seq_params->bit_depth); + else + pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = + clip_pixel((int)centroids[j * 2 + i - 1]); + } + } + + av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); + if (tokenonly_rd_stats.rate == INT_MAX) continue; + this_rate = tokenonly_rd_stats.rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); + if (this_rd < *best_rd) { + *best_rd = this_rd; + *best_mbmi = *mbmi; + memcpy(best_palette_color_map, color_map, + plane_block_width * plane_block_height * + sizeof(best_palette_color_map[0])); + *rate = this_rate; + *distortion = tokenonly_rd_stats.dist; + *rate_tokenonly = tokenonly_rd_stats.rate; + *skippable = tokenonly_rd_stats.skip; + } + } + } + if (best_mbmi->palette_mode_info.palette_size[1] > 0) { + memcpy(color_map, best_palette_color_map, + plane_block_width * plane_block_height * + sizeof(best_palette_color_map[0])); + } +} + +void av1_restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + PALETTE_MODE_INFO *const pmi = 
&mbmi->palette_mode_info;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  int src_stride = x->plane[1].src.stride;
+  const uint8_t *const src_u = x->plane[1].src.buf;
+  const uint8_t *const src_v = x->plane[2].src.buf;
+  int *const data = x->palette_buffer->kmeans_data_buf;
+  int centroids[2 * PALETTE_MAX_SIZE];
+  uint8_t *const color_map = xd->plane[1].color_index_map;
+  int r, c;
+  const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
+  const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
+  int plane_block_width, plane_block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+                           &plane_block_height, &rows, &cols);
+
+  for (r = 0; r < rows; ++r) {
+    for (c = 0; c < cols; ++c) {
+      if (cpi->common.seq_params.use_highbitdepth) {
+        data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
+        data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
+      } else {
+        data[(r * cols + c) * 2] = src_u[r * src_stride + c];
+        data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
+      }
+    }
+  }
+
+  for (r = 1; r < 3; ++r) {
+    for (c = 0; c < pmi->palette_size[1]; ++c) {
+      centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
+    }
+  }
+
+  av1_calc_indices(data, centroids, color_map, rows * cols,
+                   pmi->palette_size[1], 2);
+  extend_palette_color_map(color_map, cols, rows, plane_block_width,
+                           plane_block_height);
+}
+
+static AOM_INLINE void choose_intra_uv_mode(
+    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+    TX_SIZE max_tx_size, int *rate_uv, int *rate_uv_tokenonly, int64_t *dist_uv,
+    int *skip_uv, UV_PREDICTION_MODE *mode_uv) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  // Use an estimated rd for uv_intra based on DC_PRED if the
+  // appropriate speed flag is set.
+  init_sbuv_mode(mbmi);
+  if (!xd->is_chroma_ref) {
+    *rate_uv = 0;
+    *rate_uv_tokenonly = 0;
+    *dist_uv = 0;
+    *skip_uv = 1;
+    *mode_uv = UV_DC_PRED;
+    return;
+  }
+
+  // Only store reconstructed luma when there's chroma RDO. When there's no
+  // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+  xd->cfl.store_y = store_cfl_required_rdo(cm, x);
+  if (xd->cfl.store_y) {
+    // Restore reconstructed luma values.
+    av1_encode_intra_block_plane(cpi, x, mbmi->sb_type, AOM_PLANE_Y,
+                                 DRY_RUN_NORMAL,
+                                 cpi->optimize_seg_arr[mbmi->segment_id]);
+    xd->cfl.store_y = 0;
+  }
+  av1_rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv,
+                              skip_uv, bsize, max_tx_size);
+  *mode_uv = mbmi->uv_mode;
+}
+
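// Editorial aside: the two routines below (and their luma counterparts later
// in this file) share a two-pass sweep over angle offsets: pass 1 evaluates
// the even deltas 0, +/-2, ..., and pass 2 evaluates an odd delta only when
// one of its even neighbors came close to the running best. A self-contained
// toy sketch of that control flow; toy_cost is a hypothetical stand-in for
// the real RD evaluation, and TOY_MAX_DELTA mirrors MAX_ANGLE_DELTA (= 3):
#include <stdint.h>

#define TOY_MAX_DELTA 3

static int64_t toy_cost(int delta) {
  // Convex toy cost with its minimum at an odd delta, so pass 2 matters.
  return 100 + (int64_t)(delta - 1) * (delta - 1);
}

static int toy_pick_angle_delta(void) {
  // Padded by one slot on each side so the neighbor lookups in pass 2 never
  // index out of bounds (same trick as rd_cost[2 * (MAX_ANGLE_DELTA + 2)]).
  int64_t cost[2 * TOY_MAX_DELTA + 3];
  const int off = TOY_MAX_DELTA + 1;
  for (int i = 0; i < 2 * TOY_MAX_DELTA + 3; ++i) cost[i] = INT64_MAX;
  int64_t best = INT64_MAX;
  int best_delta = 0;
  // Pass 1: even offsets, including zero.
  for (int d = -TOY_MAX_DELTA; d <= TOY_MAX_DELTA; ++d) {
    if (d & 1) continue;
    cost[d + off] = toy_cost(d);
    if (cost[d + off] < best) {
      best = cost[d + off];
      best_delta = d;
    }
  }
  // Pass 2: odd offsets, pruned when both even neighbors were clearly worse
  // than the running best.
  for (int d = -TOY_MAX_DELTA; d <= TOY_MAX_DELTA; ++d) {
    if (!(d & 1)) continue;
    const int64_t thresh = best + (best >> 5);
    if (cost[d - 1 + off] > thresh && cost[d + 1 + off] > thresh) continue;
    const int64_t c = toy_cost(d);
    if (c < best) {
      best = c;
      best_delta = d;
    }
  }
  return best_delta;  // 1 for this toy cost
}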
+// Run RD calculation with given chroma intra prediction angle, and return
+// the RD cost. Update the best mode info if the RD cost is the best so far.
+static int64_t pick_intra_angle_routine_sbuv(
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+    int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats,
+    int *best_angle_delta, int64_t *best_rd) {
+  MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
+  assert(!is_inter_block(mbmi));
+  int this_rate;
+  int64_t this_rd;
+  RD_STATS tokenonly_rd_stats;
+
+  if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in))
+    return INT64_MAX;
+  this_rate = tokenonly_rd_stats.rate +
+              intra_mode_info_cost_uv(cpi, x, mbmi, bsize, rate_overhead);
+  this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+  if (this_rd < *best_rd) {
+    *best_rd = this_rd;
+    *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+    *rate = this_rate;
+    rd_stats->rate = tokenonly_rd_stats.rate;
+    rd_stats->dist = tokenonly_rd_stats.dist;
+    rd_stats->skip = tokenonly_rd_stats.skip;
+  }
+  return this_rd;
+}
+
+// With given chroma directional intra prediction mode, pick the best angle
+// delta. Return true if an RD cost that is smaller than the input one is
+// found.
+static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE bsize, int rate_overhead,
+                                    int64_t best_rd, int *rate,
+                                    RD_STATS *rd_stats) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  int i, angle_delta, best_angle_delta = 0;
+  int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+
+  rd_stats->rate = INT_MAX;
+  rd_stats->skip = 0;
+  rd_stats->dist = INT64_MAX;
+  for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+
+  for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+    for (i = 0; i < 2; ++i) {
+      best_rd_in = (best_rd == INT64_MAX)
+                       ? INT64_MAX
+                       : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));
+      mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+      this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead,
                                              best_rd_in, rate, rd_stats,
+                                              &best_angle_delta, &best_rd);
+      rd_cost[2 * angle_delta + i] = this_rd;
+      if (angle_delta == 0) {
+        if (this_rd == INT64_MAX) return 0;
+        rd_cost[1] = this_rd;
+        break;
+      }
+    }
+  }
+
+  assert(best_rd != INT64_MAX);
+  for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+    int64_t rd_thresh;
+    for (i = 0; i < 2; ++i) {
+      int skip_search = 0;
+      rd_thresh = best_rd + (best_rd >> 5);
+      if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
+          rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
+        skip_search = 1;
+      if (!skip_search) {
+        mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+        pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+                                      rate, rd_stats, &best_angle_delta,
+                                      &best_rd);
+      }
+    }
+  }
+
+  mbmi->angle_delta[PLANE_TYPE_UV] = best_angle_delta;
+  return rd_stats->rate != INT_MAX;
+}
+
+#define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \
+  (plane == CFL_PRED_U ?
a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1) +static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi, + TX_SIZE tx_size, int64_t best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const MACROBLOCKD_PLANE *pd = &xd->plane[AOM_PLANE_U]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(mbmi->sb_type, pd->subsampling_x, pd->subsampling_y); + + assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra); + assert(plane_bsize < BLOCK_SIZES_ALL); + if (!xd->lossless[mbmi->segment_id]) { + assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]); + assert(block_size_high[plane_bsize] == tx_size_high[tx_size]); + } + + xd->cfl.use_dc_pred_cache = 1; + const int64_t mode_rd = + RDCOST(x->rdmult, + x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED], 0); + int64_t best_rd_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES]; + int best_c[CFL_JOINT_SIGNS][CFL_PRED_PLANES]; +#if CONFIG_DEBUG + int best_rate_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES]; +#endif // CONFIG_DEBUG + + const int skip_trellis = 0; + for (int plane = 0; plane < CFL_PRED_PLANES; plane++) { + RD_STATS rd_stats; + av1_init_rd_stats(&rd_stats); + for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) { + best_rd_uv[joint_sign][plane] = INT64_MAX; + best_c[joint_sign][plane] = 0; + } + // Collect RD stats for an alpha value of zero in this plane. + // Skip i == CFL_SIGN_ZERO as (0, 0) is invalid. + for (int i = CFL_SIGN_NEG; i < CFL_SIGNS; i++) { + const int8_t joint_sign = + PLANE_SIGN_TO_JOINT_SIGN(plane, CFL_SIGN_ZERO, i); + if (i == CFL_SIGN_NEG) { + mbmi->cfl_alpha_idx = 0; + mbmi->cfl_alpha_signs = joint_sign; + av1_txfm_rd_in_plane( + x, cpi, &rd_stats, best_rd, 0, plane + 1, plane_bsize, tx_size, + cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE, skip_trellis); + if (rd_stats.rate == INT_MAX) break; + } + const int alpha_rate = x->cfl_cost[joint_sign][plane][0]; + best_rd_uv[joint_sign][plane] = + RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist); +#if CONFIG_DEBUG + best_rate_uv[joint_sign][plane] = rd_stats.rate; +#endif // CONFIG_DEBUG + } + } + + int8_t best_joint_sign = -1; + + for (int plane = 0; plane < CFL_PRED_PLANES; plane++) { + for (int pn_sign = CFL_SIGN_NEG; pn_sign < CFL_SIGNS; pn_sign++) { + int progress = 0; + for (int c = 0; c < CFL_ALPHABET_SIZE; c++) { + int flag = 0; + RD_STATS rd_stats; + if (c > 2 && progress < c) break; + av1_init_rd_stats(&rd_stats); + for (int i = 0; i < CFL_SIGNS; i++) { + const int8_t joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, pn_sign, i); + if (i == 0) { + mbmi->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c; + mbmi->cfl_alpha_signs = joint_sign; + av1_txfm_rd_in_plane( + x, cpi, &rd_stats, best_rd, 0, plane + 1, plane_bsize, tx_size, + cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE, skip_trellis); + if (rd_stats.rate == INT_MAX) break; + } + const int alpha_rate = x->cfl_cost[joint_sign][plane][c]; + int64_t this_rd = + RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist); + if (this_rd >= best_rd_uv[joint_sign][plane]) continue; + best_rd_uv[joint_sign][plane] = this_rd; + best_c[joint_sign][plane] = c; +#if CONFIG_DEBUG + best_rate_uv[joint_sign][plane] = rd_stats.rate; +#endif // CONFIG_DEBUG + flag = 2; + if (best_rd_uv[joint_sign][!plane] == INT64_MAX) continue; + this_rd += mode_rd + best_rd_uv[joint_sign][!plane]; + if (this_rd >= best_rd) continue; + best_rd = this_rd; + best_joint_sign = joint_sign; + } + progress += flag; + } + } + } + + int best_rate_overhead = INT_MAX; + 
uint8_t ind = 0; + if (best_joint_sign >= 0) { + const int u = best_c[best_joint_sign][CFL_PRED_U]; + const int v = best_c[best_joint_sign][CFL_PRED_V]; + ind = (u << CFL_ALPHABET_SIZE_LOG2) + v; + best_rate_overhead = x->cfl_cost[best_joint_sign][CFL_PRED_U][u] + + x->cfl_cost[best_joint_sign][CFL_PRED_V][v]; +#if CONFIG_DEBUG + xd->cfl.rate = x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED] + + best_rate_overhead + + best_rate_uv[best_joint_sign][CFL_PRED_U] + + best_rate_uv[best_joint_sign][CFL_PRED_V]; +#endif // CONFIG_DEBUG + } else { + best_joint_sign = 0; + } + + mbmi->cfl_alpha_idx = ind; + mbmi->cfl_alpha_signs = best_joint_sign; + xd->cfl.use_dc_pred_cache = 0; + xd->cfl.dc_pred_is_cached[0] = 0; + xd->cfl.dc_pred_is_cached[1] = 0; + return best_rate_overhead; +} + +int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, + int *rate, int *rate_tokenonly, + int64_t *distortion, int *skippable, + BLOCK_SIZE bsize, TX_SIZE max_tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + MB_MODE_INFO best_mbmi = *mbmi; + int64_t best_rd = INT64_MAX, this_rd; + + for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) { + int this_rate; + RD_STATS tokenonly_rd_stats; + UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx]; + const int is_directional_mode = av1_is_directional_mode(get_uv_mode(mode)); + if (!(cpi->sf.intra_sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] & + (1 << mode))) + continue; + if (!cpi->oxcf.enable_smooth_intra && mode >= UV_SMOOTH_PRED && + mode <= UV_SMOOTH_H_PRED) + continue; + + if (!cpi->oxcf.enable_paeth_intra && mode == UV_PAETH_PRED) continue; + + mbmi->uv_mode = mode; + int cfl_alpha_rate = 0; + if (mode == UV_CFL_PRED) { + if (!is_cfl_allowed(xd) || !cpi->oxcf.enable_cfl_intra) continue; + assert(!is_directional_mode); + const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); + cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd); + if (cfl_alpha_rate == INT_MAX) continue; + } + mbmi->angle_delta[PLANE_TYPE_UV] = 0; + if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type) && + cpi->oxcf.enable_angle_delta) { + const int rate_overhead = + x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode]; + if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd, + &this_rate, &tokenonly_rd_stats)) + continue; + } else { + if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) { + continue; + } + } + const int mode_cost = + x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode] + + cfl_alpha_rate; + this_rate = tokenonly_rd_stats.rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost); + if (mode == UV_CFL_PRED) { + assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra); +#if CONFIG_DEBUG + if (!xd->lossless[mbmi->segment_id]) + assert(xd->cfl.rate == tokenonly_rd_stats.rate + mode_cost); +#endif // CONFIG_DEBUG + } + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); + + if (this_rd < best_rd) { + best_mbmi = *mbmi; + best_rd = this_rd; + *rate = this_rate; + *rate_tokenonly = tokenonly_rd_stats.rate; + *distortion = tokenonly_rd_stats.dist; + *skippable = tokenonly_rd_stats.skip; + } + } + + const int try_palette = + cpi->oxcf.enable_palette && + av1_allow_palette(cpi->common.features.allow_screen_content_tools, + mbmi->sb_type); + if (try_palette) { + uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map; + rd_pick_palette_intra_sbuv( + cpi, x, + 
x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_DC_PRED], + best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly, + distortion, skippable); + } + + *mbmi = best_mbmi; + // Make sure we actually chose a mode + assert(best_rd < INT64_MAX); + return best_rd; +} + +int av1_search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *this_rd_cost, PICK_MODE_CONTEXT *ctx, + BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi, + PALETTE_MODE_INFO *const pmi, + unsigned int *ref_costs_single, + IntraModeSearchState *intra_search_state, + int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + int rate2 = 0; + int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd, + best_model_rd_palette = INT64_MAX; + int skippable = 0; + TX_SIZE uv_tx = TX_4X4; + uint8_t *const best_palette_color_map = + x->palette_buffer->best_palette_color_map; + uint8_t *const color_map = xd->plane[0].color_index_map; + MB_MODE_INFO best_mbmi_palette = *mbmi; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]]; + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; + + mbmi->mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + RD_STATS rd_stats_y; + av1_invalid_rd_stats(&rd_stats_y); + rd_pick_palette_intra_sby( + cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette, + best_palette_color_map, &best_rd_palette, &best_model_rd_palette, + &rd_stats_y.rate, NULL, &rd_stats_y.dist, &rd_stats_y.skip, NULL, ctx, + best_blk_skip, best_tx_type_map); + if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) { + this_rd_cost->rdcost = INT64_MAX; + return skippable; + } + + memcpy(x->blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize)); + av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); + memcpy(color_map, best_palette_color_map, + rows * cols * sizeof(best_palette_color_map[0])); + + skippable = rd_stats_y.skip; + distortion2 = rd_stats_y.dist; + rate2 = rd_stats_y.rate + ref_costs_single[INTRA_FRAME]; + if (num_planes > 1) { + uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); + if (intra_search_state->rate_uv_intra == INT_MAX) { + choose_intra_uv_mode( + cpi, x, bsize, uv_tx, &intra_search_state->rate_uv_intra, + &intra_search_state->rate_uv_tokenonly, &intra_search_state->dist_uvs, + &intra_search_state->skip_uvs, &intra_search_state->mode_uv); + intra_search_state->pmi_uv = *pmi; + intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV]; + } + mbmi->uv_mode = intra_search_state->mode_uv; + pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1]; + if (pmi->palette_size[1] > 0) { + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, + intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE, + 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); + } + mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta; + skippable = skippable && intra_search_state->skip_uvs; + distortion2 += intra_search_state->dist_uvs; + rate2 += intra_search_state->rate_uv_intra; + } + + if (skippable) { + rate2 -= rd_stats_y.rate; + if (num_planes > 1) rate2 -= intra_search_state->rate_uv_tokenonly; + rate2 += x->skip_cost[av1_get_skip_context(xd)][1]; + } else { + rate2 += x->skip_cost[av1_get_skip_context(xd)][0]; + } + 
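// Editorial aside: the branch above is the usual skip-flag accounting --
// when the block is coded as skip, the token rates are removed and replaced
// by the cost of signaling skip = 1; otherwise the skip = 0 cost is added.
// The resulting rate then enters the Lagrangian rate-distortion cost that
// RDCOST computes just below. A hedged sketch of that accounting in floating
// point (the real RDCOST macro uses fixed-point arithmetic, and all names
// here are illustrative, not libaom API):
static double toy_rd_cost(double lambda, int rate, double dist) {
  return lambda * rate + dist;  // lower is better
}

static double toy_palette_mode_cost(double lambda, int token_rate,
                                    int overhead_rate, int skip0_rate,
                                    int skip1_rate, int is_skip, double dist) {
  // Token rate is only paid when the block is not coded as skip.
  const int rate = is_skip ? overhead_rate + skip1_rate
                           : overhead_rate + token_rate + skip0_rate;
  return toy_rd_cost(lambda, rate, dist);
}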
this_rd = RDCOST(x->rdmult, rate2, distortion2); + this_rd_cost->rate = rate2; + this_rd_cost->dist = distortion2; + this_rd_cost->rdcost = this_rd; + return skippable; +} + +// Given selected prediction mode, search for the best tx type and size. +static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, const int *bmode_costs, + int64_t *best_rd, int *rate, + int *rate_tokenonly, int64_t *distortion, + int *skippable, MB_MODE_INFO *best_mbmi, + PICK_MODE_CONTEXT *ctx) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + RD_STATS rd_stats; + // In order to improve txfm search avoid rd based breakouts during winner + // mode evaluation. Hence passing ref_best_rd as a maximum value + av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats, bsize, INT64_MAX); + if (rd_stats.rate == INT_MAX) return 0; + int this_rate_tokenonly = rd_stats.rate; + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) { + // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size + // in the tokenonly rate, but for intra blocks, tx_size is always coded + // (prediction granularity), so we account for it in the full rate, + // not the tokenonly rate. + this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size); + } + const int this_rate = + rd_stats.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]); + const int64_t this_rd = RDCOST(x->rdmult, this_rate, rd_stats.dist); + if (this_rd < *best_rd) { + *best_mbmi = *mbmi; + *best_rd = this_rd; + *rate = this_rate; + *rate_tokenonly = this_rate_tokenonly; + *distortion = rd_stats.dist; + *skippable = rd_stats.skip; + av1_copy_array(ctx->blk_skip, x->blk_skip, ctx->num_4x4_blk); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + return 1; + } + return 0; +} + +// With given luma directional intra prediction mode, pick the best angle delta +// Return the RD cost corresponding to the best angle delta. +static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, + int *rate, RD_STATS *rd_stats, + BLOCK_SIZE bsize, int mode_cost, + int64_t best_rd, int64_t *best_model_rd, + int skip_model_rd_for_zero_deg) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + + int best_angle_delta = 0; + int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; + TX_SIZE best_tx_size = mbmi->tx_size; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + + for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; + + int first_try = 1; + for (int angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { + for (int i = 0; i < 2; ++i) { + const int64_t best_rd_in = + (best_rd == INT64_MAX) ? INT64_MAX + : (best_rd + (best_rd >> (first_try ? 
3 : 5))); + const int64_t this_rd = calc_rd_given_intra_angle( + cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta, + MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, + &best_rd, best_model_rd, best_tx_type_map, best_blk_skip, + (skip_model_rd_for_zero_deg & !angle_delta)); + rd_cost[2 * angle_delta + i] = this_rd; + if (first_try && this_rd == INT64_MAX) return best_rd; + first_try = 0; + if (angle_delta == 0) { + rd_cost[1] = this_rd; + break; + } + } + } + + assert(best_rd != INT64_MAX); + for (int angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { + for (int i = 0; i < 2; ++i) { + int skip_search = 0; + const int64_t rd_thresh = best_rd + (best_rd >> 5); + if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh && + rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) + skip_search = 1; + if (!skip_search) { + calc_rd_given_intra_angle( + cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta, + MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, + &best_rd, best_model_rd, best_tx_type_map, best_blk_skip, 0); + } + } + } + + if (rd_stats->rate != INT_MAX) { + mbmi->tx_size = best_tx_size; + mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta; + const int n4 = bsize_to_num_blk(bsize); + memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4); + av1_copy_array(xd->tx_type_map, best_tx_type_map, n4); + } + return best_rd; +} + +int64_t av1_handle_intra_mode(IntraModeSearchState *intra_search_state, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int ref_frame_cost, + const PICK_MODE_CONTEXT *ctx, int disable_skip, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, int64_t best_rd, + int64_t *best_intra_rd, int8_t best_mbmode_skip) { + const AV1_COMMON *cm = &cpi->common; + const SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(mbmi->ref_frame[0] == INTRA_FRAME); + const PREDICTION_MODE mode = mbmi->mode; + const int mode_cost = + x->mbmode_cost[size_group_lookup[bsize]][mode] + ref_frame_cost; + const int intra_cost_penalty = av1_get_intra_cost_penalty( + cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q, + cm->seq_params.bit_depth); + const int skip_ctx = av1_get_skip_context(xd); + + int known_rate = mode_cost; + known_rate += ref_frame_cost; + if (mode != DC_PRED && mode != PAETH_PRED) known_rate += intra_cost_penalty; + known_rate += AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]); + const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0); + if (known_rd > best_rd) { + intra_search_state->skip_intra_modes = 1; + return INT64_MAX; + } + + const int is_directional_mode = av1_is_directional_mode(mode); + if (is_directional_mode && av1_use_angle_delta(bsize) && + cpi->oxcf.enable_angle_delta) { + if (sf->intra_sf.intra_pruning_with_hog && + !intra_search_state->angle_stats_ready) { + prune_intra_mode_with_hog(x, bsize, + cpi->sf.intra_sf.intra_pruning_with_hog_thresh, + intra_search_state->directional_mode_skip_mask); + intra_search_state->angle_stats_ready = 1; + } + if (intra_search_state->directional_mode_skip_mask[mode]) return INT64_MAX; + av1_init_rd_stats(rd_stats_y); + rd_stats_y->rate = INT_MAX; + int64_t model_rd = INT64_MAX; + int rate_dummy; + rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize, mode_cost, + best_rd, &model_rd, 0); + + } else { + av1_init_rd_stats(rd_stats_y); + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, 
bsize, best_rd); + } + + // Pick filter intra modes. + if (mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { + int try_filter_intra = 0; + int64_t best_rd_so_far = INT64_MAX; + if (rd_stats_y->rate != INT_MAX) { + const int tmp_rate = + rd_stats_y->rate + x->filter_intra_cost[bsize][0] + mode_cost; + best_rd_so_far = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist); + try_filter_intra = (best_rd_so_far / 2) <= best_rd; + } else { + try_filter_intra = !best_mbmode_skip; + } + + if (try_filter_intra) { + RD_STATS rd_stats_y_fi; + int filter_intra_selected_flag = 0; + TX_SIZE best_tx_size = mbmi->tx_size; + FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + memcpy(best_blk_skip, x->blk_skip, + sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + mbmi->filter_intra_mode_info.use_filter_intra = 1; + for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED; + fi_mode < FILTER_INTRA_MODES; ++fi_mode) { + mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode; + av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y_fi, bsize, + best_rd); + if (rd_stats_y_fi.rate == INT_MAX) continue; + const int this_rate_tmp = + rd_stats_y_fi.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost); + const int64_t this_rd_tmp = + RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist); + + if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > best_rd) { + break; + } + if (this_rd_tmp < best_rd_so_far) { + best_tx_size = mbmi->tx_size; + av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + memcpy(best_blk_skip, x->blk_skip, + sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); + best_fi_mode = fi_mode; + *rd_stats_y = rd_stats_y_fi; + filter_intra_selected_flag = 1; + best_rd_so_far = this_rd_tmp; + } + } + + mbmi->tx_size = best_tx_size; + av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); + memcpy(x->blk_skip, best_blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + + if (filter_intra_selected_flag) { + mbmi->filter_intra_mode_info.use_filter_intra = 1; + mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode; + } else { + mbmi->filter_intra_mode_info.use_filter_intra = 0; + } + } + } + + if (rd_stats_y->rate == INT_MAX) return INT64_MAX; + + const int mode_cost_y = + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost); + av1_init_rd_stats(rd_stats); + av1_init_rd_stats(rd_stats_uv); + const int num_planes = av1_num_planes(cm); + if (num_planes > 1) { + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int try_palette = + cpi->oxcf.enable_palette && + av1_allow_palette(cm->features.allow_screen_content_tools, + mbmi->sb_type); + const TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); + if (intra_search_state->rate_uv_intra == INT_MAX) { + const int rate_y = + rd_stats_y->skip ? 
x->skip_cost[skip_ctx][1] : rd_stats_y->rate; + const int64_t rdy = + RDCOST(x->rdmult, rate_y + mode_cost_y, rd_stats_y->dist); + if (best_rd < (INT64_MAX / 2) && rdy > (best_rd + (best_rd >> 2))) { + intra_search_state->skip_intra_modes = 1; + return INT64_MAX; + } + choose_intra_uv_mode( + cpi, x, bsize, uv_tx, &intra_search_state->rate_uv_intra, + &intra_search_state->rate_uv_tokenonly, &intra_search_state->dist_uvs, + &intra_search_state->skip_uvs, &intra_search_state->mode_uv); + if (try_palette) intra_search_state->pmi_uv = *pmi; + intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV]; + + const int uv_rate = intra_search_state->rate_uv_tokenonly; + const int64_t uv_dist = intra_search_state->dist_uvs; + const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist); + if (uv_rd > best_rd) { + intra_search_state->skip_intra_modes = 1; + return INT64_MAX; + } + } + + rd_stats_uv->rate = intra_search_state->rate_uv_tokenonly; + rd_stats_uv->dist = intra_search_state->dist_uvs; + rd_stats_uv->skip = intra_search_state->skip_uvs; + rd_stats->skip = rd_stats_y->skip && rd_stats_uv->skip; + mbmi->uv_mode = intra_search_state->mode_uv; + if (try_palette) { + pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1]; + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, + intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE, + 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); + } + mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta; + } + + rd_stats->rate = rd_stats_y->rate + mode_cost_y; + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) { + // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size + // in the tokenonly rate, but for intra blocks, tx_size is always coded + // (prediction granularity), so we account for it in the full rate, + // not the tokenonly rate. + rd_stats_y->rate -= tx_size_cost(x, bsize, mbmi->tx_size); + } + if (num_planes > 1 && xd->is_chroma_ref) { + const int uv_mode_cost = + x->intra_uv_mode_cost[is_cfl_allowed(xd)][mode][mbmi->uv_mode]; + rd_stats->rate += + rd_stats_uv->rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost); + } + if (mode != DC_PRED && mode != PAETH_PRED) { + rd_stats->rate += intra_cost_penalty; + } + + // Intra block is always coded as non-skip + rd_stats->skip = 0; + rd_stats->dist = rd_stats_y->dist + rd_stats_uv->dist; + // Add in the cost of the no skip flag. + rd_stats->rate += x->skip_cost[skip_ctx][0]; + // Calculate the final RD estimate for this mode. 
+ const int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + // Keep record of best intra rd + if (this_rd < *best_intra_rd) { + *best_intra_rd = this_rd; + intra_search_state->best_intra_mode = mode; + } + + if (sf->intra_sf.skip_intra_in_interframe) { + if (best_rd < (INT64_MAX / 2) && this_rd > (best_rd + (best_rd >> 1))) + intra_search_state->skip_intra_modes = 1; + } + + if (!disable_skip) { + for (int i = 0; i < REFERENCE_MODES; ++i) { + intra_search_state->best_pred_rd[i] = + AOMMIN(intra_search_state->best_pred_rd[i], this_rd); + } + } + return this_rd; +} + +// This function is used only for intra_only frames +int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, + int *rate, int *rate_tokenonly, + int64_t *distortion, int *skippable, + BLOCK_SIZE bsize, int64_t best_rd, + PICK_MODE_CONTEXT *ctx) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + int64_t best_model_rd = INT64_MAX; + int is_directional_mode; + uint8_t directional_mode_skip_mask[INTRA_MODES] = { 0 }; + // Flag to check rd of any intra mode is better than best_rd passed to this + // function + int beat_best_rd = 0; + const int *bmode_costs; + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int try_palette = + cpi->oxcf.enable_palette && + av1_allow_palette(cpi->common.features.allow_screen_content_tools, + mbmi->sb_type); + uint8_t *best_palette_color_map = + try_palette ? x->palette_buffer->best_palette_color_map : NULL; + const MB_MODE_INFO *above_mi = xd->above_mbmi; + const MB_MODE_INFO *left_mi = xd->left_mbmi; + const PREDICTION_MODE A = av1_above_block_mode(above_mi); + const PREDICTION_MODE L = av1_left_block_mode(left_mi); + const int above_ctx = intra_mode_context[A]; + const int left_ctx = intra_mode_context[L]; + bmode_costs = x->y_mode_costs[above_ctx][left_ctx]; + + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + if (cpi->sf.intra_sf.intra_pruning_with_hog) { + prune_intra_mode_with_hog(x, bsize, + cpi->sf.intra_sf.intra_pruning_with_hog_thresh, + directional_mode_skip_mask); + } + mbmi->filter_intra_mode_info.use_filter_intra = 0; + pmi->palette_size[0] = 0; + + // Set params for mode evaluation + set_mode_eval_params(cpi, x, MODE_EVAL); + + MB_MODE_INFO best_mbmi = *mbmi; + av1_zero(x->winner_mode_stats); + x->winner_mode_count = 0; + + /* Y Search for intra prediction mode */ + for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) { + RD_STATS this_rd_stats; + int this_rate, this_rate_tokenonly, s; + int64_t this_distortion, this_rd; + mbmi->mode = intra_rd_search_mode_order[mode_idx]; + if ((!cpi->oxcf.enable_smooth_intra || + cpi->sf.intra_sf.disable_smooth_intra) && + (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED || + mbmi->mode == SMOOTH_V_PRED)) + continue; + if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue; + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + + if (model_intra_yrd_and_prune(cpi, x, bsize, bmode_costs[mbmi->mode], + &best_model_rd)) { + continue; + } + + is_directional_mode = av1_is_directional_mode(mbmi->mode); + if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue; + if (is_directional_mode && av1_use_angle_delta(bsize) && + cpi->oxcf.enable_angle_delta) { + this_rd_stats.rate = INT_MAX; + rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize, + bmode_costs[mbmi->mode], best_rd, &best_model_rd, + 1); + } else { + av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd); 
+    }
+    this_rate_tokenonly = this_rd_stats.rate;
+    this_distortion = this_rd_stats.dist;
+    s = this_rd_stats.skip;
+
+    if (this_rate_tokenonly == INT_MAX) continue;
+
+    if (!xd->lossless[mbmi->segment_id] &&
+        block_signals_txsize(mbmi->sb_type)) {
+      // av1_pick_uniform_tx_size_type_yrd above includes the cost of the
+      // tx_size in the tokenonly rate, but for intra blocks, tx_size is always
+      // coded (prediction granularity), so we account for it in the full rate,
+      // not the tokenonly rate.
+      this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size);
+    }
+    this_rate =
+        this_rd_stats.rate +
+        intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]);
+    this_rd = RDCOST(x->rdmult, this_rate, this_distortion);
+    // Collect mode stats for multiwinner mode processing
+    const int txfm_search_done = 1;
+    store_winner_mode_stats(
+        &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
+        cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
+        txfm_search_done);
+    if (this_rd < best_rd) {
+      best_mbmi = *mbmi;
+      best_rd = this_rd;
+      // Setting beat_best_rd flag because current mode rd is better than
+      // best_rd passed to this function
+      beat_best_rd = 1;
+      *rate = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion = this_distortion;
+      *skippable = s;
+      memcpy(ctx->blk_skip, x->blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+    }
+  }
+
+  if (try_palette) {
+    rd_pick_palette_intra_sby(
+        cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map,
+        &best_rd, &best_model_rd, rate, rate_tokenonly, distortion, skippable,
+        &beat_best_rd, ctx, ctx->blk_skip, ctx->tx_type_map);
+  }
+
+  if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
+    if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
+                                 skippable, bsize, bmode_costs[DC_PRED],
+                                 &best_rd, &best_model_rd, ctx)) {
+      best_mbmi = *mbmi;
+    }
+  }
+  // If no mode beats the best_rd passed into this function, winner mode
+  // processing is unnecessary; return INT64_MAX to signal that no best mode
+  // was identified.
+  if (!beat_best_rd) return INT64_MAX;
+
+  // In multi-winner mode processing, perform the tx search for the few best
+  // modes identified during mode evaluation. Winner mode processing uses the
+  // best tx configuration for the tx search (a toy sketch of the winner-list
+  // bookkeeping follows).
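// A toy version of keeping the N best (mode, rd) pairs sorted by rd; the
// struct and helper are hypothetical, standing in for x->winner_mode_stats
// and store_winner_mode_stats() rather than reproducing them:
typedef struct {
  int mode;
  long long rd;
} ToyWinner;

// Insert `cand` into `list` (kept sorted by ascending rd, capacity max_n)
// and return the new element count.
static int toy_store_winner(ToyWinner *list, int count, int max_n,
                            ToyWinner cand) {
  if (count == max_n && cand.rd >= list[max_n - 1].rd) return count;
  int i = (count < max_n) ? count : max_n - 1;
  while (i > 0 && list[i - 1].rd > cand.rd) {
    list[i] = list[i - 1];
    --i;
  }
  list[i] = cand;
  return (count < max_n) ? count + 1 : count;
}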
+ if (cpi->sf.winner_mode_sf.enable_multiwinner_mode_process) { + int best_mode_idx = 0; + int block_width, block_height; + uint8_t *color_map_dst = xd->plane[PLANE_TYPE_Y].color_index_map; + av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width, + &block_height, NULL, NULL); + + for (int mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++) { + *mbmi = x->winner_mode_stats[mode_idx].mbmi; + if (is_winner_mode_processing_enabled(cpi, mbmi, mbmi->mode)) { + // Restore color_map of palette mode before winner mode processing + if (mbmi->palette_mode_info.palette_size[0] > 0) { + uint8_t *color_map_src = + x->winner_mode_stats[mode_idx].color_index_map; + memcpy(color_map_dst, color_map_src, + block_width * block_height * sizeof(*color_map_src)); + } + // Set params for winner mode evaluation + set_mode_eval_params(cpi, x, WINNER_MODE_EVAL); + + // Winner mode processing + // If previous searches use only the default tx type/no R-D optimization + // of quantized coeffs, do an extra search for the best tx type/better + // R-D optimization of quantized coeffs + if (intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate, + rate_tokenonly, distortion, skippable, &best_mbmi, + ctx)) + best_mode_idx = mode_idx; + } + } + // Copy color_map of palette mode for final winner mode + if (best_mbmi.palette_mode_info.palette_size[0] > 0) { + uint8_t *color_map_src = + x->winner_mode_stats[best_mode_idx].color_index_map; + memcpy(color_map_dst, color_map_src, + block_width * block_height * sizeof(*color_map_src)); + } + } else { + // If previous searches use only the default tx type/no R-D optimization of + // quantized coeffs, do an extra search for the best tx type/better R-D + // optimization of quantized coeffs + if (is_winner_mode_processing_enabled(cpi, mbmi, best_mbmi.mode)) { + // Set params for winner mode evaluation + set_mode_eval_params(cpi, x, WINNER_MODE_EVAL); + *mbmi = best_mbmi; + intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate, + rate_tokenonly, distortion, skippable, &best_mbmi, ctx); + } + } + *mbmi = best_mbmi; + av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk); + return best_rd; +} diff --git a/libs/libaom/src/av1/encoder/intra_mode_search.h b/libs/libaom/src/av1/encoder/intra_mode_search.h new file mode 100644 index 000000000..4b5d31c3e --- /dev/null +++ b/libs/libaom/src/av1/encoder/intra_mode_search.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
+#define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct IntraModeSearchState {
+  int skip_intra_modes;
+  PREDICTION_MODE best_intra_mode;
+  int angle_stats_ready;
+  uint8_t directional_mode_skip_mask[INTRA_MODES];
+  int rate_uv_intra;
+  int rate_uv_tokenonly;
+  int64_t dist_uvs;
+  int skip_uvs;
+  UV_PREDICTION_MODE mode_uv;
+  PALETTE_MODE_INFO pmi_uv;
+  int8_t uv_angle_delta;
+  int64_t best_pred_rd[REFERENCE_MODES];
+} IntraModeSearchState;
+
+void av1_restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x);
+int av1_search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
+                            RD_STATS *this_rd_cost, PICK_MODE_CONTEXT *ctx,
+                            BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi,
+                            PALETTE_MODE_INFO *const pmi,
+                            unsigned int *ref_costs_single,
+                            IntraModeSearchState *intra_search_state,
+                            int64_t best_rd);
+
+int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    int *rate, int *rate_tokenonly,
+                                    int64_t *distortion, int *skippable,
+                                    BLOCK_SIZE bsize, TX_SIZE max_tx_size);
+
+int64_t av1_handle_intra_mode(IntraModeSearchState *intra_search_state,
+                              const AV1_COMP *cpi, MACROBLOCK *x,
+                              BLOCK_SIZE bsize, int ref_frame_cost,
+                              const PICK_MODE_CONTEXT *ctx, int disable_skip,
+                              RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+                              RD_STATS *rd_stats_uv, int64_t best_rd,
+                              int64_t *best_intra_rd, int8_t best_mbmode_skip);
+
+int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                   int *rate, int *rate_tokenonly,
+                                   int64_t *distortion, int *skippable,
+                                   BLOCK_SIZE bsize, int64_t best_rd,
+                                   PICK_MODE_CONTEXT *ctx);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
diff --git a/libs/libaom/src/av1/encoder/k_means_template.h b/libs/libaom/src/av1/encoder/k_means_template.h
new file mode 100644
index 000000000..9e526b88b
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/k_means_template.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+
+#ifndef AV1_K_MEANS_DIM
+#error "This template requires AV1_K_MEANS_DIM to be defined"
+#endif
+
+#define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y)
+#define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM)
+
+static int RENAME(calc_dist)(const int *p1, const int *p2) {
+  int dist = 0;
+  for (int i = 0; i < AV1_K_MEANS_DIM; ++i) {
+    const int diff = p1[i] - p2[i];
+    dist += diff * diff;
+  }
+  return dist;
+}
+
+void RENAME(av1_calc_indices)(const int *data, const int *centroids,
+                              uint8_t *indices, int n, int k) {
+  for (int i = 0; i < n; ++i) {
+    int min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids);
+    indices[i] = 0;
+    for (int j = 1; j < k; ++j) {
+      const int this_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM,
+                                              centroids + j * AV1_K_MEANS_DIM);
+      if (this_dist < min_dist) {
+        min_dist = this_dist;
+        indices[i] = j;
+      }
+    }
+  }
+}
+
+static void RENAME(calc_centroids)(const int *data, int *centroids,
+                                   const uint8_t *indices, int n, int k) {
+  int i, j;
+  int count[PALETTE_MAX_SIZE] = { 0 };
+  unsigned int rand_state = (unsigned int)data[0];
+  assert(n <= 32768);
+  memset(centroids, 0, sizeof(centroids[0]) * k * AV1_K_MEANS_DIM);
+
+  for (i = 0; i < n; ++i) {
+    const int index = indices[i];
+    assert(index < k);
+    ++count[index];
+    for (j = 0; j < AV1_K_MEANS_DIM; ++j) {
+      centroids[index * AV1_K_MEANS_DIM + j] += data[i * AV1_K_MEANS_DIM + j];
+    }
+  }
+
+  for (i = 0; i < k; ++i) {
+    if (count[i] == 0) {
+      memcpy(centroids + i * AV1_K_MEANS_DIM,
+             data + (lcg_rand16(&rand_state) % n) * AV1_K_MEANS_DIM,
+             sizeof(centroids[0]) * AV1_K_MEANS_DIM);
+    } else {
+      for (j = 0; j < AV1_K_MEANS_DIM; ++j) {
+        centroids[i * AV1_K_MEANS_DIM + j] =
+            DIVIDE_AND_ROUND(centroids[i * AV1_K_MEANS_DIM + j], count[i]);
+      }
+    }
+  }
+}
+
+static int64_t RENAME(calc_total_dist)(const int *data, const int *centroids,
+                                       const uint8_t *indices, int n, int k) {
+  int64_t dist = 0;
+  (void)k;
+  for (int i = 0; i < n; ++i) {
+    dist += RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM,
+                              centroids + indices[i] * AV1_K_MEANS_DIM);
+  }
+  return dist;
+}
+
+void RENAME(av1_k_means)(const int *data, int *centroids, uint8_t *indices,
+                         int n, int k, int max_itr) {
+  int pre_centroids[2 * PALETTE_MAX_SIZE];
+  uint8_t pre_indices[MAX_SB_SQUARE];
+
+  RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+  int64_t this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
+
+  for (int i = 0; i < max_itr; ++i) {
+    const int64_t pre_dist = this_dist;
+    memcpy(pre_centroids, centroids,
+           sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM);
+    memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
+
+    RENAME(calc_centroids)(data, centroids, indices, n, k);
+    RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+    this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
+
+    if (this_dist > pre_dist) {
+      memcpy(centroids, pre_centroids,
+             sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM);
+      memcpy(indices, pre_indices, sizeof(pre_indices[0]) * n);
+      break;
+    }
+    if (!memcmp(centroids, pre_centroids,
+                sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM))
+      break;
+  }
+}
+#undef RENAME_
+#undef RENAME
diff --git a/libs/libaom/src/av1/encoder/level.c b/libs/libaom/src/av1/encoder/level.c
new file mode 100644
index 000000000..3403a3a84
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/level.c
@@ -0,0 +1,1184 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_ports/system_state.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/level.h" + +#define UNDEFINED_LEVEL \ + { \ + .level = SEQ_LEVEL_MAX, .max_picture_size = 0, .max_h_size = 0, \ + .max_v_size = 0, .max_display_rate = 0, .max_decode_rate = 0, \ + .max_header_rate = 0, .main_mbps = 0, .high_mbps = 0, .main_cr = 0, \ + .high_cr = 0, .max_tiles = 0, .max_tile_cols = 0 \ + } + +static const AV1LevelSpec av1_level_defs[SEQ_LEVELS] = { + { .level = SEQ_LEVEL_2_0, + .max_picture_size = 147456, + .max_h_size = 2048, + .max_v_size = 1152, + .max_display_rate = 4423680L, + .max_decode_rate = 5529600L, + .max_header_rate = 150, + .main_mbps = 1.5, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 8, + .max_tile_cols = 4 }, + { .level = SEQ_LEVEL_2_1, + .max_picture_size = 278784, + .max_h_size = 2816, + .max_v_size = 1584, + .max_display_rate = 8363520L, + .max_decode_rate = 10454400L, + .max_header_rate = 150, + .main_mbps = 3.0, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 8, + .max_tile_cols = 4 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + { .level = SEQ_LEVEL_3_0, + .max_picture_size = 665856, + .max_h_size = 4352, + .max_v_size = 2448, + .max_display_rate = 19975680L, + .max_decode_rate = 24969600L, + .max_header_rate = 150, + .main_mbps = 6.0, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 16, + .max_tile_cols = 6 }, + { .level = SEQ_LEVEL_3_1, + .max_picture_size = 1065024, + .max_h_size = 5504, + .max_v_size = 3096, + .max_display_rate = 31950720L, + .max_decode_rate = 39938400L, + .max_header_rate = 150, + .main_mbps = 10.0, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 16, + .max_tile_cols = 6 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + { .level = SEQ_LEVEL_4_0, + .max_picture_size = 2359296, + .max_h_size = 6144, + .max_v_size = 3456, + .max_display_rate = 70778880L, + .max_decode_rate = 77856768L, + .max_header_rate = 300, + .main_mbps = 12.0, + .high_mbps = 30.0, + .main_cr = 4.0, + .high_cr = 4.0, + .max_tiles = 32, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_4_1, + .max_picture_size = 2359296, + .max_h_size = 6144, + .max_v_size = 3456, + .max_display_rate = 141557760L, + .max_decode_rate = 155713536L, + .max_header_rate = 300, + .main_mbps = 20.0, + .high_mbps = 50.0, + .main_cr = 4.0, + .high_cr = 4.0, + .max_tiles = 32, + .max_tile_cols = 8 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + { .level = SEQ_LEVEL_5_0, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 267386880L, + .max_decode_rate = 273715200L, + .max_header_rate = 300, + .main_mbps = 30.0, + .high_mbps = 100.0, + .main_cr = 6.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_5_1, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 534773760L, + .max_decode_rate = 547430400L, + .max_header_rate = 300, + .main_mbps = 40.0, + .high_mbps = 160.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols 
= 8 }, + { .level = SEQ_LEVEL_5_2, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 1069547520L, + .max_decode_rate = 1094860800L, + .max_header_rate = 300, + .main_mbps = 60.0, + .high_mbps = 240.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_5_3, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 1069547520L, + .max_decode_rate = 1176502272L, + .max_header_rate = 300, + .main_mbps = 60.0, + .high_mbps = 240.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_6_0, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 1069547520L, + .max_decode_rate = 1176502272L, + .max_header_rate = 300, + .main_mbps = 60.0, + .high_mbps = 240.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + { .level = SEQ_LEVEL_6_1, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 2139095040L, + .max_decode_rate = 2189721600L, + .max_header_rate = 300, + .main_mbps = 100.0, + .high_mbps = 480.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + { .level = SEQ_LEVEL_6_2, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 4278190080L, + .max_decode_rate = 4379443200L, + .max_header_rate = 300, + .main_mbps = 160.0, + .high_mbps = 800.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + { .level = SEQ_LEVEL_6_3, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 4278190080L, + .max_decode_rate = 4706009088L, + .max_header_rate = 300, + .main_mbps = 160.0, + .high_mbps = 800.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, +}; + +typedef enum { + LUMA_PIC_SIZE_TOO_LARGE, + LUMA_PIC_H_SIZE_TOO_LARGE, + LUMA_PIC_V_SIZE_TOO_LARGE, + LUMA_PIC_H_SIZE_TOO_SMALL, + LUMA_PIC_V_SIZE_TOO_SMALL, + TOO_MANY_TILE_COLUMNS, + TOO_MANY_TILES, + TILE_RATE_TOO_HIGH, + TILE_TOO_LARGE, + SUPERRES_TILE_WIDTH_TOO_LARGE, + CROPPED_TILE_WIDTH_TOO_SMALL, + CROPPED_TILE_HEIGHT_TOO_SMALL, + TILE_WIDTH_INVALID, + FRAME_HEADER_RATE_TOO_HIGH, + DISPLAY_RATE_TOO_HIGH, + DECODE_RATE_TOO_HIGH, + CR_TOO_SMALL, + TILE_SIZE_HEADER_RATE_TOO_HIGH, + BITRATE_TOO_HIGH, + DECODER_MODEL_FAIL, + + TARGET_LEVEL_FAIL_IDS, + TARGET_LEVEL_OK, +} TARGET_LEVEL_FAIL_ID; + +static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { + "The picture size is too large.", + "The picture width is too large.", + "The picture height is too large.", + "The picture width is too small.", + "The picture height is too small.", + "Too many tile columns are used.", + "Too many tiles are used.", + "The tile rate is too high.", + "The tile size is too large.", + "The superres tile width is too large.", + "The cropped tile width is less than 8.", + "The cropped tile height is less than 8.", + "The tile width is invalid.", + "The frame header rate is too high.", + "The display luma sample rate is too high.", + "The decoded luma sample rate is too high.", + "The compression ratio is too small.", + "The product of max tile size and header rate is too high.", + "The bitrate is too high.", + "The decoder model fails.", +}; + +static double get_max_bitrate(const AV1LevelSpec *const 
+static double get_max_bitrate(const AV1LevelSpec *const level_spec, int tier,
+                              BITSTREAM_PROFILE profile) {
+  if (level_spec->level < SEQ_LEVEL_4_0) tier = 0;
+  const double bitrate_basis =
+      (tier ? level_spec->high_mbps : level_spec->main_mbps) * 1e6;
+  const double bitrate_profile_factor =
+      profile == PROFILE_0 ? 1.0 : (profile == PROFILE_1 ? 2.0 : 3.0);
+  return bitrate_basis * bitrate_profile_factor;
+}
+
+double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier,
+                                     BITSTREAM_PROFILE profile) {
+  assert(is_valid_seq_level_idx(level_index));
+  return get_max_bitrate(&av1_level_defs[level_index], tier, profile);
+}
+
+void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles,
+                                 int *const max_tile_cols) {
+  assert(is_valid_seq_level_idx(level_index));
+  const AV1LevelSpec *const level_spec = &av1_level_defs[level_index];
+  *max_tiles = level_spec->max_tiles;
+  *max_tile_cols = level_spec->max_tile_cols;
+}
+
+// We assume time t to be valid if and only if t >= 0.0.
+// So INVALID_TIME can be defined as anything less than 0.
+#define INVALID_TIME (-1.0)
+
+// This corresponds to "free_buffer" in the spec.
+static void release_buffer(DECODER_MODEL *const decoder_model, int idx) {
+  assert(idx >= 0 && idx < BUFFER_POOL_MAX_SIZE);
+  FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[idx];
+  this_buffer->decoder_ref_count = 0;
+  this_buffer->player_ref_count = 0;
+  this_buffer->display_index = -1;
+  this_buffer->presentation_time = INVALID_TIME;
+}
+
+static void initialize_buffer_pool(DECODER_MODEL *const decoder_model) {
+  for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+    release_buffer(decoder_model, i);
+  }
+  for (int i = 0; i < REF_FRAMES; ++i) {
+    decoder_model->vbi[i] = -1;
+  }
+}
+
+static int get_free_buffer(DECODER_MODEL *const decoder_model) {
+  for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+    const FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[i];
+    if (this_buffer->decoder_ref_count == 0 &&
+        this_buffer->player_ref_count == 0)
+      return i;
+  }
+  return -1;
+}
+
+static void update_ref_buffers(DECODER_MODEL *const decoder_model, int idx,
+                               int refresh_frame_flags) {
+  FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[idx];
+  for (int i = 0; i < REF_FRAMES; ++i) {
+    if (refresh_frame_flags & (1 << i)) {
+      const int pre_idx = decoder_model->vbi[i];
+      if (pre_idx != -1) {
+        --decoder_model->frame_buffer_pool[pre_idx].decoder_ref_count;
+      }
+      decoder_model->vbi[i] = idx;
+      ++this_buffer->decoder_ref_count;
+    }
+  }
+}
+
+// The time (in seconds) required to decode a frame.
+static double time_to_decode_frame(const AV1_COMMON *const cm,
+                                   int64_t max_decode_rate) {
+  if (cm->show_existing_frame) return 0.0;
+
+  const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+  int luma_samples = 0;
+  if (frame_type == KEY_FRAME || frame_type == INTRA_ONLY_FRAME) {
+    luma_samples = cm->superres_upscaled_width * cm->height;
+  } else {
+    const int spatial_layer_dimensions_present_flag = 0;
+    if (spatial_layer_dimensions_present_flag) {
+      assert(0 && "Spatial layer dimensions not supported yet.");
+    } else {
+      const SequenceHeader *const seq_params = &cm->seq_params;
+      const int max_frame_width = seq_params->max_frame_width;
+      const int max_frame_height = seq_params->max_frame_height;
+      luma_samples = max_frame_width * max_frame_height;
+    }
+  }
+
+  return luma_samples / (double)max_decode_rate;
+}
+
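In update_ref_buffers() above, refresh_frame_flags is a bitmask over the REF_FRAMES (8) reference slots: bit i set means slot i is repointed at the newly decoded buffer. A self-contained sketch of that slot-update rule, with hypothetical names (illustrative, not part of the patch):

#include <stdio.h>

#define EXAMPLE_REF_FRAMES 8

/* Point every slot whose bit is set in the mask at buffer `idx`. */
static void example_update_slots(int vbi[EXAMPLE_REF_FRAMES], int idx,
                                 int refresh_frame_flags) {
  for (int i = 0; i < EXAMPLE_REF_FRAMES; ++i) {
    if (refresh_frame_flags & (1 << i)) vbi[i] = idx;
  }
}

int main(void) {
  int vbi[EXAMPLE_REF_FRAMES] = { -1, -1, -1, -1, -1, -1, -1, -1 };
  example_update_slots(vbi, 3, 0xFF); /* 0xFF refreshes all 8 slots. */
  example_update_slots(vbi, 5, 0x01); /* 0x01 refreshes slot 0 only. */
  printf("%d %d\n", vbi[0], vbi[1]);  /* Prints: 5 3 */
  return 0;
}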
+// Release frame buffers that are no longer needed for decode or display.
+// It corresponds to "start_decode_at_removal_time" in the spec.
+static void release_processed_frames(DECODER_MODEL *const decoder_model,
+                                     double removal_time) {
+  for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+    FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[i];
+    if (this_buffer->player_ref_count > 0) {
+      if (this_buffer->presentation_time >= 0.0 &&
+          this_buffer->presentation_time <= removal_time) {
+        this_buffer->player_ref_count = 0;
+        if (this_buffer->decoder_ref_count == 0) {
+          release_buffer(decoder_model, i);
+        }
+      }
+    }
+  }
+}
+
+static int frames_in_buffer_pool(const DECODER_MODEL *const decoder_model) {
+  int frames_in_pool = 0;
+  for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+    const FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[i];
+    if (this_buffer->decoder_ref_count > 0 ||
+        this_buffer->player_ref_count > 0) {
+      ++frames_in_pool;
+    }
+  }
+  return frames_in_pool;
+}
+
+static double get_presentation_time(const DECODER_MODEL *const decoder_model,
+                                    int display_index) {
+  if (decoder_model->mode == SCHEDULE_MODE) {
+    assert(0 && "SCHEDULE_MODE NOT SUPPORTED");
+    return INVALID_TIME;
+  } else {
+    const double initial_presentation_delay =
+        decoder_model->initial_presentation_delay;
+    // Can't decide presentation time until the initial presentation delay is
+    // known.
+    if (initial_presentation_delay < 0.0) return INVALID_TIME;
+
+    return initial_presentation_delay +
+           display_index * decoder_model->num_ticks_per_picture *
+               decoder_model->display_clock_tick;
+  }
+}
+
+#define MAX_TIME 1e16
+double time_next_buffer_is_free(const DECODER_MODEL *const decoder_model) {
+  if (decoder_model->num_decoded_frame == 0) {
+    return (double)decoder_model->decoder_buffer_delay / 90000.0;
+  }
+
+  double buf_free_time = MAX_TIME;
+  for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+    const FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[i];
+    if (this_buffer->decoder_ref_count == 0) {
+      if (this_buffer->player_ref_count == 0) {
+        return decoder_model->current_time;
+      }
+      const double presentation_time = this_buffer->presentation_time;
+      if (presentation_time >= 0.0 && presentation_time < buf_free_time) {
+        buf_free_time = presentation_time;
+      }
+    }
+  }
+  return buf_free_time < MAX_TIME ? buf_free_time : INVALID_TIME;
+}
+#undef MAX_TIME
+
+static double get_removal_time(const DECODER_MODEL *const decoder_model) {
+  if (decoder_model->mode == SCHEDULE_MODE) {
+    assert(0 && "SCHEDULE_MODE IS NOT SUPPORTED YET");
+    return INVALID_TIME;
+  } else {
+    return time_next_buffer_is_free(decoder_model);
+  }
+}
+
+void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model) {
+  printf(
+      "\n status %d, num_frame %3d, num_decoded_frame %3d, "
+      "num_shown_frame %3d, current time %6.2f, frames in buffer %2d, "
+      "presentation delay %6.2f, total interval %6.2f\n",
+      decoder_model->status, decoder_model->num_frame,
+      decoder_model->num_decoded_frame, decoder_model->num_shown_frame,
+      decoder_model->current_time, frames_in_buffer_pool(decoder_model),
+      decoder_model->initial_presentation_delay,
+      decoder_model->dfg_interval_queue.total_interval);
+  for (int i = 0; i < 10; ++i) {
+    const FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[i];
+    printf("buffer %d, decode count %d, display count %d, present time %6.4f\n",
+           i, this_buffer->decoder_ref_count, this_buffer->player_ref_count,
+           this_buffer->presentation_time);
+  }
+}
+
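Buffer delays in this model are expressed in ticks of a 90 kHz clock, which is why time_next_buffer_is_free() above divides by 90000.0. A quick sketch of the conversion, using the constants that av1_decoder_model_init() below hard-codes (names here are hypothetical):

#include <stdio.h>

int main(void) {
  /* decoder_buffer_delay = 70000 ticks of 1/90000 s, as set below. */
  const int decoder_buffer_delay = 70000;
  const double first_removal_time = (double)decoder_buffer_delay / 90000.0;
  printf("first removal at %.4f s\n", first_removal_time); /* ~0.7778 s */
  return 0;
}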
+// op_index is the operating point index.
+void av1_decoder_model_init(const AV1_COMP *const cpi, AV1_LEVEL level,
+                            int op_index, DECODER_MODEL *const decoder_model) {
+  aom_clear_system_state();
+
+  decoder_model->status = DECODER_MODEL_OK;
+  decoder_model->level = level;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  decoder_model->bit_rate = get_max_bitrate(
+      av1_level_defs + level, seq_params->tier[op_index], seq_params->profile);
+
+  // TODO(huisu or anyone): implement SCHEDULE_MODE.
+  decoder_model->mode = RESOURCE_MODE;
+  decoder_model->encoder_buffer_delay = 20000;
+  decoder_model->decoder_buffer_delay = 70000;
+  decoder_model->is_low_delay_mode = false;
+
+  decoder_model->first_bit_arrival_time = 0.0;
+  decoder_model->last_bit_arrival_time = 0.0;
+  decoder_model->coded_bits = 0;
+
+  decoder_model->removal_time = INVALID_TIME;
+  decoder_model->presentation_time = INVALID_TIME;
+  decoder_model->decode_samples = 0;
+  decoder_model->display_samples = 0;
+  decoder_model->max_decode_rate = 0.0;
+  decoder_model->max_display_rate = 0.0;
+
+  decoder_model->num_frame = -1;
+  decoder_model->num_decoded_frame = -1;
+  decoder_model->num_shown_frame = -1;
+  decoder_model->current_time = 0.0;
+
+  initialize_buffer_pool(decoder_model);
+
+  DFG_INTERVAL_QUEUE *const dfg_interval_queue =
+      &decoder_model->dfg_interval_queue;
+  dfg_interval_queue->total_interval = 0.0;
+  dfg_interval_queue->head = 0;
+  dfg_interval_queue->size = 0;
+
+  if (seq_params->timing_info_present) {
+    decoder_model->num_ticks_per_picture =
+        seq_params->timing_info.num_ticks_per_picture;
+    decoder_model->display_clock_tick =
+        seq_params->timing_info.num_units_in_display_tick /
+        seq_params->timing_info.time_scale;
+  } else {
+    decoder_model->num_ticks_per_picture = 1;
+    decoder_model->display_clock_tick = 1.0 / cpi->framerate;
+  }
+
+  decoder_model->initial_display_delay =
+      seq_params->op_params[op_index].initial_display_delay;
+  decoder_model->initial_presentation_delay = INVALID_TIME;
+  decoder_model->decode_rate = av1_level_defs[level].max_decode_rate;
+}
+
+void av1_decoder_model_process_frame(const AV1_COMP *const cpi,
+                                     size_t coded_bits,
+                                     DECODER_MODEL *const decoder_model) {
+  if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) return;
+
+  aom_clear_system_state();
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const int luma_pic_size = cm->superres_upscaled_width * cm->height;
+  const int show_existing_frame = cm->show_existing_frame;
+  const int show_frame = cm->show_frame || show_existing_frame;
+  ++decoder_model->num_frame;
+  if (!show_existing_frame) ++decoder_model->num_decoded_frame;
+  if (show_frame) ++decoder_model->num_shown_frame;
+  decoder_model->coded_bits += coded_bits;
+
+  int display_idx = -1;
+  if (show_existing_frame) {
+    display_idx = decoder_model->vbi[cpi->existing_fb_idx_to_show];
+    if (display_idx < 0) {
+      decoder_model->status = DECODE_EXISTING_FRAME_BUF_EMPTY;
+      return;
+    }
+    if (decoder_model->frame_buffer_pool[display_idx].frame_type == KEY_FRAME) {
+      update_ref_buffers(decoder_model, display_idx, 0xFF);
+    }
+  } else {
+    const double removal_time = get_removal_time(decoder_model);
+    if (removal_time < 0.0) {
+      decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE;
+      return;
+    }
+
+    const int previous_decode_samples = decoder_model->decode_samples;
+    const double previous_removal_time = decoder_model->removal_time;
+    assert(previous_removal_time < removal_time);
+    decoder_model->removal_time = removal_time;
+    decoder_model->decode_samples = luma_pic_size;
+    const double this_decode_rate =
+        previous_decode_samples / (removal_time - previous_removal_time);
+    decoder_model->max_decode_rate =
+        AOMMAX(decoder_model->max_decode_rate, this_decode_rate);
+
+    // A frame with show_existing_frame being false indicates the end of a DFG.
+    // Update the bits arrival time of this DFG.
+    const double buffer_delay = (decoder_model->encoder_buffer_delay +
+                                 decoder_model->decoder_buffer_delay) /
+                                90000.0;
+    const double latest_arrival_time = removal_time - buffer_delay;
+    decoder_model->first_bit_arrival_time =
+        AOMMAX(decoder_model->last_bit_arrival_time, latest_arrival_time);
+    decoder_model->last_bit_arrival_time =
+        decoder_model->first_bit_arrival_time +
+        (double)decoder_model->coded_bits / decoder_model->bit_rate;
+    // Smoothing buffer underflows if the last bit arrives after the removal
+    // time.
+    if (decoder_model->last_bit_arrival_time > removal_time &&
+        !decoder_model->is_low_delay_mode) {
+      decoder_model->status = SMOOTHING_BUFFER_UNDERFLOW;
+      return;
+    }
+    // Reset the coded bits for the next DFG.
+    decoder_model->coded_bits = 0;
+
+    // Check if the smoothing buffer overflows.
+    DFG_INTERVAL_QUEUE *const queue = &decoder_model->dfg_interval_queue;
+    if (queue->size >= DFG_INTERVAL_QUEUE_SIZE) {
+      assert(0);
+    }
+    const double first_bit_arrival_time =
+        decoder_model->first_bit_arrival_time;
+    const double last_bit_arrival_time = decoder_model->last_bit_arrival_time;
+    // Remove the DFGs with removal time earlier than last_bit_arrival_time.
+    while (queue->buf[queue->head].removal_time <= last_bit_arrival_time &&
+           queue->size > 0) {
+      if (queue->buf[queue->head].removal_time - first_bit_arrival_time +
+              queue->total_interval >
+          1.0) {
+        decoder_model->status = SMOOTHING_BUFFER_OVERFLOW;
+        return;
+      }
+      queue->total_interval -= queue->buf[queue->head].last_bit_arrival_time -
+                               queue->buf[queue->head].first_bit_arrival_time;
+      queue->head = (queue->head + 1) % DFG_INTERVAL_QUEUE_SIZE;
+      --queue->size;
+    }
+    // Push current DFG into the queue.
+    const int queue_index =
+        (queue->head + queue->size++) % DFG_INTERVAL_QUEUE_SIZE;
+    queue->buf[queue_index].first_bit_arrival_time = first_bit_arrival_time;
+    queue->buf[queue_index].last_bit_arrival_time = last_bit_arrival_time;
+    queue->buf[queue_index].removal_time = removal_time;
+    queue->total_interval += last_bit_arrival_time - first_bit_arrival_time;
+    // The smoothing buffer can hold at most "bit_rate" bits, which is
+    // equivalent to 1 second of total interval.
+    if (queue->total_interval > 1.0) {
+      decoder_model->status = SMOOTHING_BUFFER_OVERFLOW;
+      return;
+    }
+
+    release_processed_frames(decoder_model, removal_time);
+    decoder_model->current_time =
+        removal_time + time_to_decode_frame(cm, decoder_model->decode_rate);
+
+    const int cfbi = get_free_buffer(decoder_model);
+    if (cfbi < 0) {
+      decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE;
+      return;
+    }
+    const CurrentFrame *const current_frame = &cm->current_frame;
+    decoder_model->frame_buffer_pool[cfbi].frame_type =
+        cm->current_frame.frame_type;
+    display_idx = cfbi;
+    update_ref_buffers(decoder_model, cfbi, current_frame->refresh_frame_flags);
+
+    if (decoder_model->initial_presentation_delay < 0.0) {
+      // Display can begin after required number of frames have been buffered.
+      if (frames_in_buffer_pool(decoder_model) >=
+          decoder_model->initial_display_delay) {
+        decoder_model->initial_presentation_delay =
+            decoder_model->current_time;
+        // Update presentation time for each shown frame in the frame buffer.
+        for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+          FRAME_BUFFER *const this_buffer =
+              &decoder_model->frame_buffer_pool[i];
+          if (this_buffer->player_ref_count == 0) continue;
+          assert(this_buffer->display_index >= 0);
+          this_buffer->presentation_time =
+              get_presentation_time(decoder_model, this_buffer->display_index);
+        }
+      }
+    }
+  }
+
+  // Display.
+  if (show_frame) {
+    assert(display_idx >= 0 && display_idx < BUFFER_POOL_MAX_SIZE);
+    FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[display_idx];
+    ++this_buffer->player_ref_count;
+    this_buffer->display_index = decoder_model->num_shown_frame;
+    const double presentation_time =
+        get_presentation_time(decoder_model, this_buffer->display_index);
+    this_buffer->presentation_time = presentation_time;
+    if (presentation_time >= 0.0 &&
+        decoder_model->current_time > presentation_time) {
+      decoder_model->status = DISPLAY_FRAME_LATE;
+      return;
+    }
+
+    const int previous_display_samples = decoder_model->display_samples;
+    const double previous_presentation_time = decoder_model->presentation_time;
+    decoder_model->display_samples = luma_pic_size;
+    decoder_model->presentation_time = presentation_time;
+    if (presentation_time >= 0.0 && previous_presentation_time >= 0.0) {
+      assert(previous_presentation_time < presentation_time);
+      const double this_display_rate =
+          previous_display_samples /
+          (presentation_time - previous_presentation_time);
+      decoder_model->max_display_rate =
+          AOMMAX(decoder_model->max_display_rate, this_display_rate);
+    }
+  }
+}
+
+void av1_init_level_info(AV1_COMP *cpi) {
+  for (int op_index = 0; op_index < MAX_NUM_OPERATING_POINTS; ++op_index) {
+    AV1LevelInfo *const this_level_info =
+        cpi->level_params.level_info[op_index];
+    if (!this_level_info) continue;
+    memset(this_level_info, 0, sizeof(*this_level_info));
+    AV1LevelSpec *const level_spec = &this_level_info->level_spec;
+    level_spec->level = SEQ_LEVEL_MAX;
+    AV1LevelStats *const level_stats = &this_level_info->level_stats;
+    level_stats->min_cropped_tile_width = INT_MAX;
+    level_stats->min_cropped_tile_height = INT_MAX;
+    level_stats->min_frame_width = INT_MAX;
+    level_stats->min_frame_height = INT_MAX;
+    level_stats->tile_width_is_valid = 1;
+    level_stats->min_cr = 1e8;
+
+    FrameWindowBuffer *const frame_window_buffer =
+        &this_level_info->frame_window_buffer;
+    frame_window_buffer->num = 0;
+    frame_window_buffer->start = 0;
+
+    const AV1_COMMON *const cm = &cpi->common;
+    const int upscaled_width = cm->superres_upscaled_width;
+    const int height = cm->height;
+    const int pic_size = upscaled_width * height;
+    for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) {
+      DECODER_MODEL *const this_model =
+          &this_level_info->decoder_models[level];
+      const AV1LevelSpec *const spec = &av1_level_defs[level];
+      if (upscaled_width > spec->max_h_size || height > spec->max_v_size ||
+          pic_size > spec->max_picture_size) {
+        // Turn off decoder model for this level as the frame size already
+        // exceeds level constraints.
+        this_model->status = DECODER_MODEL_DISABLED;
+      } else {
+        av1_decoder_model_init(cpi, level, op_index, this_model);
+      }
+    }
+  }
+}
+
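get_min_cr() below scales the tier's base compression ratio by how fast samples are decoded relative to the level's maximum display rate, with a floor of 0.8. A worked sketch with hypothetical names (illustrative only; figures taken from the SEQ_LEVEL_5_1 entry above):

#include <stdio.h>

static double example_min_cr(double cr_basis, double decoded_sample_rate,
                             double max_display_rate) {
  const double speed_adj = decoded_sample_rate / max_display_rate;
  const double min_cr = cr_basis * speed_adj;
  return min_cr > 0.8 ? min_cr : 0.8; /* 0.8 floor, as in get_min_cr() */
}

int main(void) {
  /* SEQ_LEVEL_5_1 Main tier: main_cr = 8.0, max_display_rate = 534773760,
     max_decode_rate = 547430400; result is about 8.189. */
  printf("%.3f\n", example_min_cr(8.0, 547430400.0, 534773760.0));
  return 0;
}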
+static double get_min_cr(const AV1LevelSpec *const level_spec, int tier,
+                         int is_still_picture, int64_t decoded_sample_rate) {
+  if (is_still_picture) return 0.8;
+  if (level_spec->level < SEQ_LEVEL_4_0) tier = 0;
+  const double min_cr_basis = tier ? level_spec->high_cr : level_spec->main_cr;
+  const double speed_adj =
+      (double)decoded_sample_rate / level_spec->max_display_rate;
+  return AOMMAX(min_cr_basis * speed_adj, 0.8);
+}
+
+double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier,
+                                int is_still_picture) {
+  assert(is_valid_seq_level_idx(level_index));
+  const AV1LevelSpec *const level_spec = &av1_level_defs[level_index];
+  return get_min_cr(level_spec, tier, is_still_picture,
+                    level_spec->max_decode_rate);
+}
+
+static void get_temporal_parallel_params(int scalability_mode_idc,
+                                         int *temporal_parallel_num,
+                                         int *temporal_parallel_denom) {
+  if (scalability_mode_idc < 0) {
+    *temporal_parallel_num = 1;
+    *temporal_parallel_denom = 1;
+    return;
+  }
+
+  // TODO(huisu@): handle scalability cases.
+  if (scalability_mode_idc == SCALABILITY_SS) {
+    (void)scalability_mode_idc;
+  } else {
+    (void)scalability_mode_idc;
+  }
+}
+
+#define MAX_TILE_SIZE (4096 * 2304)
+#define MIN_CROPPED_TILE_WIDTH 8
+#define MIN_CROPPED_TILE_HEIGHT 8
+#define MIN_FRAME_WIDTH 16
+#define MIN_FRAME_HEIGHT 16
+#define MAX_TILE_SIZE_HEADER_RATE_PRODUCT 588251136
+
+static TARGET_LEVEL_FAIL_ID check_level_constraints(
+    const AV1LevelInfo *const level_info, AV1_LEVEL level, int tier,
+    int is_still_picture, BITSTREAM_PROFILE profile, int check_bitrate) {
+  const DECODER_MODEL *const decoder_model =
+      &level_info->decoder_models[level];
+  const DECODER_MODEL_STATUS decoder_model_status = decoder_model->status;
+  if (decoder_model_status != DECODER_MODEL_OK &&
+      decoder_model_status != DECODER_MODEL_DISABLED) {
+    return DECODER_MODEL_FAIL;
+  }
+
+  const AV1LevelSpec *const level_spec = &level_info->level_spec;
+  const AV1LevelSpec *const target_level_spec = &av1_level_defs[level];
+  const AV1LevelStats *const level_stats = &level_info->level_stats;
+  TARGET_LEVEL_FAIL_ID fail_id = TARGET_LEVEL_OK;
+  do {
+    if (level_spec->max_picture_size > target_level_spec->max_picture_size) {
+      fail_id = LUMA_PIC_SIZE_TOO_LARGE;
+      break;
+    }
+
+    if (level_spec->max_h_size > target_level_spec->max_h_size) {
+      fail_id = LUMA_PIC_H_SIZE_TOO_LARGE;
+      break;
+    }
+
+    if (level_spec->max_v_size > target_level_spec->max_v_size) {
+      fail_id = LUMA_PIC_V_SIZE_TOO_LARGE;
+      break;
+    }
+
+    if (level_spec->max_tile_cols > target_level_spec->max_tile_cols) {
+      fail_id = TOO_MANY_TILE_COLUMNS;
+      break;
+    }
+
+    if (level_spec->max_tiles > target_level_spec->max_tiles) {
+      fail_id = TOO_MANY_TILES;
+      break;
+    }
+
+    if (level_spec->max_header_rate > target_level_spec->max_header_rate) {
+      fail_id = FRAME_HEADER_RATE_TOO_HIGH;
+      break;
+    }
+
+    if (decoder_model->max_display_rate >
+        (double)target_level_spec->max_display_rate) {
+      fail_id = DISPLAY_RATE_TOO_HIGH;
+      break;
+    }
+
+    // TODO(huisu): we are not using max decode rate calculated by the decoder
+    // model because the model in resource availability mode always returns
+    // MaxDecodeRate (as in the level definitions) as the max decode rate.
+    if (level_spec->max_decode_rate > target_level_spec->max_decode_rate) {
+      fail_id = DECODE_RATE_TOO_HIGH;
+      break;
+    }
+
+    if (level_spec->max_tile_rate > target_level_spec->max_tiles * 120) {
+      fail_id = TILE_RATE_TOO_HIGH;
+      break;
+    }
+
+    if (level_stats->max_tile_size > MAX_TILE_SIZE) {
+      fail_id = TILE_TOO_LARGE;
+      break;
+    }
+
+    if (level_stats->max_superres_tile_width > MAX_TILE_WIDTH) {
+      fail_id = SUPERRES_TILE_WIDTH_TOO_LARGE;
+      break;
+    }
+
+    if (level_stats->min_cropped_tile_width < MIN_CROPPED_TILE_WIDTH) {
+      fail_id = CROPPED_TILE_WIDTH_TOO_SMALL;
+      break;
+    }
+
+    if (level_stats->min_cropped_tile_height < MIN_CROPPED_TILE_HEIGHT) {
+      fail_id = CROPPED_TILE_HEIGHT_TOO_SMALL;
+      break;
+    }
+
+    if (level_stats->min_frame_width < MIN_FRAME_WIDTH) {
+      fail_id = LUMA_PIC_H_SIZE_TOO_SMALL;
+      break;
+    }
+
+    if (level_stats->min_frame_height < MIN_FRAME_HEIGHT) {
+      fail_id = LUMA_PIC_V_SIZE_TOO_SMALL;
+      break;
+    }
+
+    if (!level_stats->tile_width_is_valid) {
+      fail_id = TILE_WIDTH_INVALID;
+      break;
+    }
+
+    const double min_cr = get_min_cr(target_level_spec, tier, is_still_picture,
+                                     level_spec->max_decode_rate);
+    if (level_stats->min_cr < min_cr) {
+      fail_id = CR_TOO_SMALL;
+      break;
+    }
+
+    if (check_bitrate) {
+      // Check average bitrate instead of max_bitrate.
+      const double bitrate_limit =
+          get_max_bitrate(target_level_spec, tier, profile);
+      const double avg_bitrate = level_stats->total_compressed_size * 8.0 /
+                                 level_stats->total_time_encoded;
+      if (avg_bitrate > bitrate_limit) {
+        fail_id = BITRATE_TOO_HIGH;
+        break;
+      }
+    }
+
+    if (target_level_spec->level > SEQ_LEVEL_5_1) {
+      int temporal_parallel_num;
+      int temporal_parallel_denom;
+      const int scalability_mode_idc = -1;
+      get_temporal_parallel_params(scalability_mode_idc,
+                                   &temporal_parallel_num,
+                                   &temporal_parallel_denom);
+      const int val =
+          level_stats->max_tile_size * level_spec->max_header_rate *
+          temporal_parallel_denom / temporal_parallel_num;
+      if (val > MAX_TILE_SIZE_HEADER_RATE_PRODUCT) {
+        fail_id = TILE_SIZE_HEADER_RATE_TOO_HIGH;
+        break;
+      }
+    }
+  } while (0);
+
+  return fail_id;
+}
+
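get_tile_stats() below rescales each tile's coded width back to its upscaled (superres) width as tile_width * denominator / SCALE_NUMERATOR, where SCALE_NUMERATOR is 8 in libaom. A quick sketch of that arithmetic (hypothetical names, illustrative only):

#include <stdio.h>

#define EXAMPLE_SCALE_NUMERATOR 8 /* SCALE_NUMERATOR in libaom */

int main(void) {
  const int coded_tile_width = 1920;
  const int superres_denominator = 16; /* 2x horizontal downscale in coding */
  const int superres_tile_width =
      coded_tile_width * superres_denominator / EXAMPLE_SCALE_NUMERATOR;
  printf("%d\n", superres_tile_width); /* 3840, the upscaled width */
  return 0;
}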
+static void get_tile_stats(const AV1_COMMON *const cm,
+                           const TileDataEnc *const tile_data,
+                           int *max_tile_size, int *max_superres_tile_width,
+                           int *min_cropped_tile_width,
+                           int *min_cropped_tile_height,
+                           int *tile_width_valid) {
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  const int superres_scale_denominator = cm->superres_scale_denominator;
+
+  *max_tile_size = 0;
+  *max_superres_tile_width = 0;
+  *min_cropped_tile_width = INT_MAX;
+  *min_cropped_tile_height = INT_MAX;
+  *tile_width_valid = 1;
+
+  for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      const TileInfo *const tile_info =
+          &tile_data[tile_row * cm->tiles.cols + tile_col].tile_info;
+      const int tile_width =
+          (tile_info->mi_col_end - tile_info->mi_col_start) * MI_SIZE;
+      const int tile_height =
+          (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE;
+      const int tile_size = tile_width * tile_height;
+      *max_tile_size = AOMMAX(*max_tile_size, tile_size);
+
+      const int superres_tile_width =
+          tile_width * superres_scale_denominator / SCALE_NUMERATOR;
+      *max_superres_tile_width =
+          AOMMAX(*max_superres_tile_width, superres_tile_width);
+
+      const int cropped_tile_width =
+          cm->width - tile_info->mi_col_start * MI_SIZE;
+      const int cropped_tile_height =
+          cm->height - tile_info->mi_row_start * MI_SIZE;
+      *min_cropped_tile_width =
+          AOMMIN(*min_cropped_tile_width, cropped_tile_width);
+      *min_cropped_tile_height =
+          AOMMIN(*min_cropped_tile_height, cropped_tile_height);
+
+      const int is_right_most_tile =
+          tile_info->mi_col_end == cm->mi_params.mi_cols;
+      if (!is_right_most_tile) {
+        if (av1_superres_scaled(cm))
+          *tile_width_valid &= tile_width >= 128;
+        else
+          *tile_width_valid &= tile_width >= 64;
+      }
+    }
+  }
+}
+
+static int store_frame_record(int64_t ts_start, int64_t ts_end,
+                              size_t encoded_size, int pic_size,
+                              int frame_header_count, int tiles,
+                              int show_frame, int show_existing_frame,
+                              FrameWindowBuffer *const buffer) {
+  if (buffer->num < FRAME_WINDOW_SIZE) {
+    ++buffer->num;
+  } else {
+    buffer->start = (buffer->start + 1) % FRAME_WINDOW_SIZE;
+  }
+  const int new_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE;
+  FrameRecord *const record = &buffer->buf[new_idx];
+  record->ts_start = ts_start;
+  record->ts_end = ts_end;
+  record->encoded_size_in_bytes = encoded_size;
+  record->pic_size = pic_size;
+  record->frame_header_count = frame_header_count;
+  record->tiles = tiles;
+  record->show_frame = show_frame;
+  record->show_existing_frame = show_existing_frame;
+
+  return new_idx;
+}
+
+// Count the number of frames encoded in the last "duration" ticks, in display
+// time.
+static int count_frames(const FrameWindowBuffer *const buffer,
+                        int64_t duration) {
+  const int current_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE;
+  // Assume current frame is shown frame.
+  assert(buffer->buf[current_idx].show_frame);
+
+  const int64_t current_time = buffer->buf[current_idx].ts_end;
+  const int64_t time_limit = AOMMAX(current_time - duration, 0);
+  int num_frames = 1;
+  int index = current_idx - 1;
+  for (int i = buffer->num - 2; i >= 0; --i, --index, ++num_frames) {
+    if (index < 0) index = FRAME_WINDOW_SIZE - 1;
+    const FrameRecord *const record = &buffer->buf[index];
+    if (!record->show_frame) continue;
+    const int64_t ts_start = record->ts_start;
+    if (ts_start < time_limit) break;
+  }
+
+  return num_frames;
+}
+
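store_frame_record() and count_frames() above treat FrameWindowBuffer as a ring: the newest record lives at (start + num - 1) % FRAME_WINDOW_SIZE, and once the window is full the oldest record is dropped by advancing start. A compact, self-contained sketch of that indexing with a shrunken window size (hypothetical names, illustrative only):

#include <stdio.h>

#define EXAMPLE_WINDOW 4 /* stand-in for FRAME_WINDOW_SIZE (256) */

int main(void) {
  int start = 0, num = 0;
  for (int frame = 0; frame < 6; ++frame) {
    if (num < EXAMPLE_WINDOW) ++num;
    else start = (start + 1) % EXAMPLE_WINDOW;
    const int newest = (start + num - 1) % EXAMPLE_WINDOW;
    printf("frame %d -> slot %d (start %d)\n", frame, newest, start);
  }
  /* Frames 4 and 5 wrap around to slots 0 and 1 as `start` advances. */
  return 0;
}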
+// Scan previously encoded frames and update level metrics accordingly.
+static void scan_past_frames(const FrameWindowBuffer *const buffer,
+                             int num_frames_to_scan,
+                             AV1LevelSpec *const level_spec,
+                             AV1LevelStats *const level_stats) {
+  const int num_frames_in_buffer = buffer->num;
+  int index = (buffer->start + num_frames_in_buffer - 1) % FRAME_WINDOW_SIZE;
+  int frame_headers = 0;
+  int tiles = 0;
+  int64_t display_samples = 0;
+  int64_t decoded_samples = 0;
+  size_t encoded_size_in_bytes = 0;
+  for (int i = 0; i < AOMMIN(num_frames_in_buffer, num_frames_to_scan); ++i) {
+    const FrameRecord *const record = &buffer->buf[index];
+    if (!record->show_existing_frame) {
+      frame_headers += record->frame_header_count;
+      decoded_samples += record->pic_size;
+    }
+    if (record->show_frame) {
+      display_samples += record->pic_size;
+    }
+    tiles += record->tiles;
+    encoded_size_in_bytes += record->encoded_size_in_bytes;
+    --index;
+    if (index < 0) index = FRAME_WINDOW_SIZE - 1;
+  }
+  level_spec->max_header_rate =
+      AOMMAX(level_spec->max_header_rate, frame_headers);
+  // TODO(huisu): we can now compute max display rate with the decoder model,
+  // so these couple of lines can be removed. Keep them here for a while for
+  // debugging purposes.
+  level_spec->max_display_rate =
+      AOMMAX(level_spec->max_display_rate, display_samples);
+  level_spec->max_decode_rate =
+      AOMMAX(level_spec->max_decode_rate, decoded_samples);
+  level_spec->max_tile_rate = AOMMAX(level_spec->max_tile_rate, tiles);
+  level_stats->max_bitrate =
+      AOMMAX(level_stats->max_bitrate, (int)encoded_size_in_bytes * 8);
+}
+
+void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start,
+                           int64_t ts_end) {
+  AV1_COMMON *const cm = &cpi->common;
+  const AV1LevelParams *const level_params = &cpi->level_params;
+
+  const int upscaled_width = cm->superres_upscaled_width;
+  const int width = cm->width;
+  const int height = cm->height;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  const int tiles = tile_cols * tile_rows;
+  const int luma_pic_size = upscaled_width * height;
+  const int frame_header_count = level_params->frame_header_count;
+  const int show_frame = cm->show_frame;
+  const int show_existing_frame = cm->show_existing_frame;
+
+  int max_tile_size;
+  int min_cropped_tile_width;
+  int min_cropped_tile_height;
+  int max_superres_tile_width;
+  int tile_width_is_valid;
+  get_tile_stats(cm, cpi->tile_data, &max_tile_size, &max_superres_tile_width,
+                 &min_cropped_tile_width, &min_cropped_tile_height,
+                 &tile_width_is_valid);
+
+  aom_clear_system_state();
+  const double compression_ratio = av1_get_compression_ratio(cm, size);
+  const double total_time_encoded =
+      (cpi->time_stamps.prev_end_seen - cpi->time_stamps.first_ever) /
+      (double)TICKS_PER_SEC;
+
+  const int temporal_layer_id = cm->temporal_layer_id;
+  const int spatial_layer_id = cm->spatial_layer_id;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  const BITSTREAM_PROFILE profile = seq_params->profile;
+  const int is_still_picture = seq_params->still_picture;
+  // update level_stats
+  // TODO(kyslov@) fix the implementation according to buffer model
+  for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; ++i) {
+    if (!is_in_operating_point(seq_params->operating_point_idc[i],
+                               temporal_layer_id, spatial_layer_id) ||
+        !((level_params->keep_level_stats >> i) & 1)) {
+      continue;
+    }
+
+    AV1LevelInfo *const level_info = level_params->level_info[i];
+    assert(level_info != NULL);
+    AV1LevelStats *const level_stats = &level_info->level_stats;
+
+    level_stats->max_tile_size =
+        AOMMAX(level_stats->max_tile_size, max_tile_size);
+    level_stats->max_superres_tile_width =
+        AOMMAX(level_stats->max_superres_tile_width, max_superres_tile_width);
+    level_stats->min_cropped_tile_width =
+        AOMMIN(level_stats->min_cropped_tile_width, min_cropped_tile_width);
+    level_stats->min_cropped_tile_height =
+        AOMMIN(level_stats->min_cropped_tile_height, min_cropped_tile_height);
+    level_stats->tile_width_is_valid &= tile_width_is_valid;
+    level_stats->min_frame_width = AOMMIN(level_stats->min_frame_width, width);
+    level_stats->min_frame_height =
+        AOMMIN(level_stats->min_frame_height, height);
+    level_stats->min_cr = AOMMIN(level_stats->min_cr, compression_ratio);
+    level_stats->total_compressed_size += (double)size;
+
+    // update level_spec
+    // TODO(kyslov@) update all spec fields
+    AV1LevelSpec *const level_spec = &level_info->level_spec;
+    level_spec->max_picture_size =
+        AOMMAX(level_spec->max_picture_size, luma_pic_size);
+    level_spec->max_h_size =
+        AOMMAX(level_spec->max_h_size, cm->superres_upscaled_width);
+    level_spec->max_v_size = AOMMAX(level_spec->max_v_size, height);
+    level_spec->max_tile_cols = AOMMAX(level_spec->max_tile_cols, tile_cols);
+    level_spec->max_tiles = AOMMAX(level_spec->max_tiles, tiles);
+
+    // Store info of the current frame into the FrameWindowBuffer.
+    FrameWindowBuffer *const buffer = &level_info->frame_window_buffer;
+    store_frame_record(ts_start, ts_end, size, luma_pic_size,
+                       frame_header_count, tiles, show_frame,
+                       show_existing_frame, buffer);
+    if (show_frame) {
+      // Count the number of frames encoded in the past 1 second.
+      const int encoded_frames_in_last_second =
+          count_frames(buffer, TICKS_PER_SEC);
+      scan_past_frames(buffer, encoded_frames_in_last_second, level_spec,
+                       level_stats);
+      level_stats->total_time_encoded = total_time_encoded;
+    }
+
+    DECODER_MODEL *const decoder_models = level_info->decoder_models;
+    for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) {
+      av1_decoder_model_process_frame(cpi, size << 3, &decoder_models[level]);
+    }
+
+    // Check whether target level is met.
+    const AV1_LEVEL target_level = level_params->target_seq_level_idx[i];
+    if (target_level < SEQ_LEVELS) {
+      assert(is_valid_seq_level_idx(target_level));
+      const int tier = seq_params->tier[i];
+      const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints(
+          level_info, target_level, tier, is_still_picture, profile, 0);
+      if (fail_id != TARGET_LEVEL_OK) {
+        const int target_level_major = 2 + (target_level >> 2);
+        const int target_level_minor = target_level & 3;
+        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+                           "Failed to encode to the target level %d_%d. %s",
+                           target_level_major, target_level_minor,
+                           level_fail_messages[fail_id]);
+      }
+    }
+  }
+}
+
+aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params,
+                                      const AV1LevelParams *level_params,
+                                      int *seq_level_idx) {
+  const int is_still_picture = seq_params->still_picture;
+  const BITSTREAM_PROFILE profile = seq_params->profile;
+  for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) {
+    seq_level_idx[op] = (int)SEQ_LEVEL_MAX;
+    if (!((level_params->keep_level_stats >> op) & 1)) continue;
+    const int tier = seq_params->tier[op];
+    const AV1LevelInfo *const level_info = level_params->level_info[op];
+    assert(level_info != NULL);
+    for (int level = 0; level < SEQ_LEVELS; ++level) {
+      if (!is_valid_seq_level_idx(level)) continue;
+      const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints(
+          level_info, level, tier, is_still_picture, profile, 1);
+      if (fail_id == TARGET_LEVEL_OK) {
+        seq_level_idx[op] = level;
+        break;
+      }
+    }
+  }
+
+  return AOM_CODEC_OK;
+}
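The sequence level index encodes major.minor as shown in the error path above: major = 2 + (idx >> 2), minor = idx & 3. A tiny sketch of the mapping (illustrative only):

#include <stdio.h>

int main(void) {
  /* Index 0 -> 2.0, 5 -> 3.1, 10 -> 4.2, 15 -> 5.3. */
  for (int idx = 0; idx < 16; idx += 5) {
    printf("seq_level_idx %2d -> level %d.%d\n", idx, 2 + (idx >> 2), idx & 3);
  }
  return 0;
}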
diff --git a/libs/libaom/src/av1/encoder/level.h b/libs/libaom/src/av1/encoder/level.h
new file mode 100644
index 000000000..5e0cce200
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/level.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_LEVEL_H_
+#define AOM_AV1_ENCODER_LEVEL_H_
+
+#include "av1/common/enums.h"
+
+struct AV1_COMP;
+
+// AV1 Level Specifications
+typedef struct {
+  AV1_LEVEL level;
+  int max_picture_size;
+  int max_h_size;
+  int max_v_size;
+  int max_header_rate;
+  int max_tile_rate;
+  int max_tiles;
+  int max_tile_cols;
+  int64_t max_display_rate;
+  int64_t max_decode_rate;
+  double main_mbps;
+  double high_mbps;
+  double main_cr;
+  double high_cr;
+} AV1LevelSpec;
+
+typedef struct {
+  int64_t ts_start;
+  int64_t ts_end;
+  size_t encoded_size_in_bytes;
+  int pic_size;
+  int frame_header_count;
+  int tiles;
+  int show_frame;
+  int show_existing_frame;
+} FrameRecord;
+
+// Record frame info in a rolling window.
+#define FRAME_WINDOW_SIZE 256
+typedef struct {
+  FrameRecord buf[FRAME_WINDOW_SIZE];
+  int num;    // Number of FrameRecords stored in the buffer.
+  int start;  // Buffer index of the first FrameRecord.
+} FrameWindowBuffer;
+
+typedef struct {
+  int max_bitrate;  // Max bitrate in any 1-second window, in bps.
+  int max_tile_size;
+  int max_superres_tile_width;
+  int min_cropped_tile_width;
+  int min_cropped_tile_height;
+  int tile_width_is_valid;
+  int min_frame_width;
+  int min_frame_height;
+  double total_compressed_size;  // In bytes.
+  double total_time_encoded;     // In seconds.
+  double min_cr;
+} AV1LevelStats;
+
+// The following data structures are for the decoder model.
+typedef struct {
+  int decoder_ref_count;
+  int player_ref_count;
+  int display_index;
+  FRAME_TYPE frame_type;
+  double presentation_time;
+} FRAME_BUFFER;
+
+// Interval of bits transmission for a DFG (Decodable Frame Group).
+typedef struct {
+  double first_bit_arrival_time;  // Time when the first bit arrives.
+  double last_bit_arrival_time;   // Time when the last bit arrives.
+  // Removal time means the time when the bits to be decoded are removed from
+  // the smoothing buffer. Removal time is essentially the time when the
+  // decoding of the frame starts.
+  double removal_time;
+} DFG_INTERVAL;
+
+#define DFG_INTERVAL_QUEUE_SIZE 64
+typedef struct {
+  int head;
+  int size;
+  double total_interval;
+  DFG_INTERVAL buf[DFG_INTERVAL_QUEUE_SIZE];
+} DFG_INTERVAL_QUEUE;
+
+enum {
+  RESOURCE_MODE = 0,  // Resource availability mode.
+  SCHEDULE_MODE       // Decoding schedule mode.
+} UENUM1BYTE(DECODER_MODEL_MODE);
+
+enum {
+  DECODER_MODEL_OK = 0,
+  DECODE_BUFFER_AVAILABLE_LATE,
+  DECODE_FRAME_BUF_UNAVAILABLE,
+  DECODE_EXISTING_FRAME_BUF_EMPTY,
+  DISPLAY_FRAME_LATE,
+  SMOOTHING_BUFFER_UNDERFLOW,
+  SMOOTHING_BUFFER_OVERFLOW,
+  DECODER_MODEL_DISABLED
+} UENUM1BYTE(DECODER_MODEL_STATUS);
+
+#define BUFFER_POOL_MAX_SIZE 10
+typedef struct {
+  DECODER_MODEL_STATUS status;
+  DECODER_MODEL_MODE mode;
+  bool is_low_delay_mode;
+  AV1_LEVEL level;
+  int encoder_buffer_delay;  // In units of 1/90000 seconds.
+  int decoder_buffer_delay;  // In units of 1/90000 seconds.
+  int num_ticks_per_picture;
+  int initial_display_delay;  // In units of frames.
+  int64_t decode_rate;
+  double display_clock_tick;          // In units of seconds.
+  double current_time;                // In units of seconds.
+  double initial_presentation_delay;  // In units of seconds.
+  double bit_rate;                    // Bits per second.
+
+  int num_frame;
+  int num_decoded_frame;
+  int num_shown_frame;
+  int vbi[REF_FRAMES];  // Virtual buffer index.
+  FRAME_BUFFER frame_buffer_pool[BUFFER_POOL_MAX_SIZE];
+  DFG_INTERVAL_QUEUE dfg_interval_queue;
+
+  // Information for the DFG (Decodable Frame Group) being processed.
+  double first_bit_arrival_time;
+  double last_bit_arrival_time;
+  size_t coded_bits;
+
+  // Information for the frame being processed.
+  double removal_time;
+  double presentation_time;
+  int decode_samples;
+  int display_samples;
+
+  double max_display_rate;
+  double max_decode_rate;
+} DECODER_MODEL;
+
+typedef struct {
+  AV1LevelStats level_stats;
+  AV1LevelSpec level_spec;
+  FrameWindowBuffer frame_window_buffer;
+  DECODER_MODEL decoder_models[SEQ_LEVELS];
+} AV1LevelInfo;
+
+typedef struct AV1LevelParams {
+  // Specifies the level that the coded video sequence conforms to for each
+  // operating point.
+  AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+  // Bit mask to indicate whether to keep level stats for corresponding
+  // operating points.
+  uint32_t keep_level_stats;
+  // Level information for each operating point.
+  AV1LevelInfo *level_info[MAX_NUM_OPERATING_POINTS];
+  // Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation.
+  int frame_header_count;
+} AV1LevelParams;
+
+static INLINE int is_in_operating_point(int operating_point,
+                                        int temporal_layer_id,
+                                        int spatial_layer_id) {
+  if (!operating_point) return 1;
+
+  return ((operating_point >> temporal_layer_id) & 1) &&
+         ((operating_point >> (spatial_layer_id + 8)) & 1);
+}
+
+void av1_init_level_info(struct AV1_COMP *cpi);
+
+void av1_update_level_info(struct AV1_COMP *cpi, size_t size, int64_t ts_start,
+                           int64_t ts_end);
+
+// Return sequence level indices in seq_level_idx[MAX_NUM_OPERATING_POINTS].
+aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params,
+                                      const AV1LevelParams *level_params,
+                                      int *seq_level_idx);
+
+// Print the status of the decoder model (for debugging).
+void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model);
+
+void av1_decoder_model_init(const struct AV1_COMP *const cpi, AV1_LEVEL level,
+                            int op_index, DECODER_MODEL *const decoder_model);
+
+void av1_decoder_model_process_frame(const struct AV1_COMP *const cpi,
+                                     size_t coded_bits,
+                                     DECODER_MODEL *const decoder_model);
+
+// Return max bitrate (bps) for given level.
+double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier,
+                                     BITSTREAM_PROFILE profile);
+
+// Get max number of tiles and tile columns for given level.
+void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles,
+                                 int *const max_tile_cols);
+
+// Return minimum compression ratio for given level.
+double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier,
+                                int is_still_picture);
+#endif  // AOM_AV1_ENCODER_LEVEL_H_
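is_in_operating_point() above decodes the operating_point_idc layout: temporal-layer bits occupy the low byte and spatial-layer bits start at bit 8, with zero meaning "applies to the whole stream". A self-contained sketch of the same test (hypothetical names, illustrative only):

#include <stdio.h>

static int example_in_op(int idc, int temporal_id, int spatial_id) {
  if (!idc) return 1; /* 0 means the point covers the whole stream */
  return ((idc >> temporal_id) & 1) && ((idc >> (spatial_id + 8)) & 1);
}

int main(void) {
  const int idc = 0x0103; /* temporal layers 0-1, spatial layer 0 */
  printf("%d %d\n", example_in_op(idc, 1, 0), example_in_op(idc, 2, 0));
  /* Prints: 1 0 (temporal layer 2 is outside this operating point) */
  return 0;
}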
diff --git a/libs/libaom/src/av1/encoder/lookahead.c b/libs/libaom/src/av1/encoder/lookahead.c
new file mode 100644
index 000000000..0f7c81989
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/lookahead.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+#include "av1/common/common.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/lookahead.h"
+
+/* Return the buffer at the given absolute index and increment the index */
+static struct lookahead_entry *pop(struct lookahead_ctx *ctx, int *idx) {
+  int index = *idx;
+  struct lookahead_entry *buf = ctx->buf + index;
+
+  assert(index < ctx->max_sz);
+  if (++index >= ctx->max_sz) index -= ctx->max_sz;
+  *idx = index;
+  return buf;
+}
+
+void av1_lookahead_destroy(struct lookahead_ctx *ctx) {
+  if (ctx) {
+    if (ctx->buf) {
+      int i;
+
+      for (i = 0; i < ctx->max_sz; i++) aom_free_frame_buffer(&ctx->buf[i].img);
+      free(ctx->buf);
+    }
+    free(ctx);
+  }
+}
+
+struct lookahead_ctx *av1_lookahead_init(
+    unsigned int width, unsigned int height, unsigned int subsampling_x,
+    unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
+    const int border_in_pixels, int byte_alignment, int num_lap_buffers) {
+  struct lookahead_ctx *ctx = NULL;
+  int lag_in_frames = AOMMAX(1, depth);
+
+  // Add the lags to depth and clamp
+  depth += num_lap_buffers;
+  depth = clamp(depth, 1, MAX_TOTAL_BUFFERS);
+
+  // Allocate memory to keep previous source frames available.
+  depth += MAX_PRE_FRAMES;
+
+  // Allocate the lookahead structures
+  ctx = calloc(1, sizeof(*ctx));
+  if (ctx) {
+    unsigned int i;
+    ctx->max_sz = depth;
+    ctx->read_ctxs[ENCODE_STAGE].pop_sz = ctx->max_sz - MAX_PRE_FRAMES;
+    ctx->read_ctxs[ENCODE_STAGE].valid = 1;
+    if (num_lap_buffers) {
+      ctx->read_ctxs[LAP_STAGE].pop_sz = lag_in_frames;
+      ctx->read_ctxs[LAP_STAGE].valid = 1;
+    }
+    ctx->buf = calloc(depth, sizeof(*ctx->buf));
+    if (!ctx->buf) goto fail;
+    for (i = 0; i < depth; i++) {
+      aom_free_frame_buffer(&ctx->buf[i].img);
+      if (aom_realloc_frame_buffer(&ctx->buf[i].img, width, height,
+                                   subsampling_x, subsampling_y,
+                                   use_highbitdepth, border_in_pixels,
+                                   byte_alignment, NULL, NULL, NULL))
+        goto fail;
+    }
+  }
+  return ctx;
+fail:
+  av1_lookahead_destroy(ctx);
+  return NULL;
+}
+
+int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+                       int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+                       aom_enc_frame_flags_t flags) {
+  struct lookahead_entry *buf;
+  int width = src->y_crop_width;
+  int height = src->y_crop_height;
+  int uv_width = src->uv_crop_width;
+  int uv_height = src->uv_crop_height;
+  int subsampling_x = src->subsampling_x;
+  int subsampling_y = src->subsampling_y;
+  int larger_dimensions, new_dimensions;
+
+  assert(ctx->read_ctxs[ENCODE_STAGE].valid == 1);
+  if (ctx->read_ctxs[ENCODE_STAGE].sz + 1 + MAX_PRE_FRAMES > ctx->max_sz)
+    return 1;
+  ctx->read_ctxs[ENCODE_STAGE].sz++;
+  if (ctx->read_ctxs[LAP_STAGE].valid) {
+    ctx->read_ctxs[LAP_STAGE].sz++;
+  }
+  buf = pop(ctx, &ctx->write_idx);
+
+  new_dimensions = width != buf->img.y_crop_width ||
+                   height != buf->img.y_crop_height ||
+                   uv_width != buf->img.uv_crop_width ||
+                   uv_height != buf->img.uv_crop_height;
+  larger_dimensions = width > buf->img.y_width || height > buf->img.y_height ||
+                      uv_width > buf->img.uv_width ||
+                      uv_height > buf->img.uv_height;
+  assert(!larger_dimensions || new_dimensions);
+
+  if (larger_dimensions) {
+    YV12_BUFFER_CONFIG new_img;
+    memset(&new_img, 0, sizeof(new_img));
+    if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x,
+                               subsampling_y, use_highbitdepth,
+                               AOM_BORDER_IN_PIXELS, 0))
+      return 1;
+    aom_free_frame_buffer(&buf->img);
+    buf->img = new_img;
+  } else if (new_dimensions) {
+    buf->img.y_crop_width = src->y_crop_width;
+    buf->img.y_crop_height = src->y_crop_height;
+    buf->img.uv_crop_width = src->uv_crop_width;
+    buf->img.uv_crop_height = src->uv_crop_height;
+    buf->img.subsampling_x = src->subsampling_x;
+    buf->img.subsampling_y = src->subsampling_y;
+  }
+  // Partial copy not implemented yet
+  av1_copy_and_extend_frame(src, &buf->img);
+
+  buf->ts_start = ts_start;
+  buf->ts_end = ts_end;
+  buf->flags = flags;
+  aom_remove_metadata_from_frame_buffer(&buf->img);
+  aom_copy_metadata_to_frame_buffer(&buf->img, src->metadata);
+  return 0;
+}
+
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain,
+                                          COMPRESSOR_STAGE stage) {
+  struct lookahead_entry *buf = NULL;
+  if (ctx) {
+    struct read_ctx *read_ctx = &ctx->read_ctxs[stage];
+    assert(read_ctx->valid == 1);
+    if (read_ctx->sz && (drain || read_ctx->sz == read_ctx->pop_sz)) {
+      buf = pop(ctx, &read_ctx->read_idx);
+      read_ctx->sz--;
+    }
+  }
+  return buf;
+}
+
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
+                                           COMPRESSOR_STAGE stage) {
+  struct lookahead_entry *buf = NULL;
+  struct read_ctx *read_ctx = NULL;
+  if (ctx == NULL) {
+    return buf;
+  }
+
+  read_ctx = &ctx->read_ctxs[stage];
+  assert(read_ctx->valid == 1);
+  if (index >= 0) {
+    // Forward peek
+    if (index < read_ctx->sz) {
+      index += read_ctx->read_idx;
+      if (index >= ctx->max_sz) index -= ctx->max_sz;
+      buf = ctx->buf + index;
+    }
+  } else if (index < 0) {
+    // Backward peek
+    if (-index <= MAX_PRE_FRAMES) {
+      index += (int)(read_ctx->read_idx);
+      if (index < 0) index += (int)(ctx->max_sz);
+      buf = ctx->buf + index;
+    }
+  }
+
+  return buf;
+}
+
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx,
+                                 COMPRESSOR_STAGE stage) {
+  struct read_ctx *read_ctx = NULL;
+  assert(ctx != NULL);
+
+  read_ctx = &ctx->read_ctxs[stage];
+  assert(read_ctx->valid == 1);
+  return read_ctx->sz;
+}
+
+int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage) {
+  struct read_ctx *read_ctx = NULL;
+  assert(ctx != NULL);
+
+  read_ctx = &ctx->read_ctxs[stage];
+  assert(read_ctx->valid == 1);
+  return read_ctx->pop_sz;
+}
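A hedged sketch of the intended call pattern for the queue above. It uses only functions declared in this patch, but assumes it is compiled and linked inside the encoder tree; the parameter values are placeholders and error handling is minimal:

#include "av1/encoder/lookahead.h"

/* Illustrative only: push one source frame and drain it back out. */
static int example_feed_one_frame(struct lookahead_ctx *ctx,
                                  YV12_BUFFER_CONFIG *src, int64_t pts) {
  /* One timestamp tick per frame, no flags, 8-bit path. */
  if (av1_lookahead_push(ctx, src, pts, pts + 1, /*use_highbitdepth=*/0,
                         /*flags=*/0))
    return -1;
  /* drain=1 returns a frame even before the queue reaches pop_sz. */
  struct lookahead_entry *ent =
      av1_lookahead_pop(ctx, /*drain=*/1, ENCODE_STAGE);
  return ent != NULL ? 0 : -1;
}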
diff --git a/libs/libaom/src/av1/encoder/lookahead.h b/libs/libaom/src/av1/encoder/lookahead.h
new file mode 100644
index 000000000..03693d383
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/lookahead.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_
+#define AOM_AV1_ENCODER_LOOKAHEAD_H_
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LAG_BUFFERS 35
+#define MAX_LAP_BUFFERS 35
+#define MAX_TOTAL_BUFFERS (MAX_LAG_BUFFERS + MAX_LAP_BUFFERS)
+#define LAP_LAG_IN_FRAMES 17
+
+struct lookahead_entry {
+  YV12_BUFFER_CONFIG img;
+  int64_t ts_start;
+  int64_t ts_end;
+  aom_enc_frame_flags_t flags;
+};
+
+// The max of past frames we want to keep in the queue.
+#define MAX_PRE_FRAMES 1
+
+enum { ENCODE_STAGE, LAP_STAGE, MAX_STAGES } UENUM1BYTE(COMPRESSOR_STAGE);
+
+struct read_ctx {
+  int sz;       /* Number of buffers currently in the queue */
+  int read_idx; /* Read index */
+  int pop_sz;   /* Size to check for pop condition */
+  int valid;    /* Is this ctx valid? */
+};
+
+struct lookahead_ctx {
+  int max_sz;                            /* Absolute size of the queue */
+  int write_idx;                         /* Write index */
+  struct read_ctx read_ctxs[MAX_STAGES]; /* Read context */
+  struct lookahead_entry *buf;           /* Buffer list */
+};
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ */
+struct lookahead_ctx *av1_lookahead_init(
+    unsigned int width, unsigned int height, unsigned int subsampling_x,
+    unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
+    const int border_in_pixels, int byte_alignment, int num_lap_buffers);
+
+/**\brief Destroys the lookahead stage
+ */
+void av1_lookahead_destroy(struct lookahead_ctx *ctx);
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * \param[in] ctx               Pointer to the lookahead context
+ * \param[in] src               Pointer to the image to enqueue
+ * \param[in] ts_start          Timestamp for the start of this frame
+ * \param[in] ts_end            Timestamp for the end of this frame
+ * \param[in] use_highbitdepth  Whether the source is high bit depth
+ * \param[in] flags             Flags set on this frame
+ */
+int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+                       int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+                       aom_enc_frame_flags_t flags);
+
+/**\brief Get the next source buffer to encode
+ *
+ *
+ * \param[in] ctx    Pointer to the lookahead context
+ * \param[in] drain  Flag indicating the buffer should be drained
+ *                   (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ */
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain,
+                                          COMPRESSOR_STAGE stage);
+
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx    Pointer to the lookahead context
+ * \param[in] index  Index of the frame to be returned, 0 == next frame
+ *
+ * \retval NULL, if no buffer exists at the specified index
+ */
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
+                                           COMPRESSOR_STAGE stage);
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx  Pointer to the lookahead context
+ */
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx,
+                                 COMPRESSOR_STAGE stage);
+
+int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage);
+
+#ifdef __cplusplus
}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_LOOKAHEAD_H_
diff --git a/libs/libaom/src/av1/encoder/mathutils.h b/libs/libaom/src/av1/encoder/mathutils.h
new file mode 100644
index 000000000..64f936176
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/mathutils.h
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MATHUTILS_H_
+#define AOM_AV1_ENCODER_MATHUTILS_H_
+
+#include <assert.h>
+#include <math.h>
+#include <memory.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+static const double TINY_NEAR_ZERO = 1.0E-16;
+
+// Solves Ax = b, where x and b are column vectors of size nx1 and A is nxn
+static INLINE int linsolve(int n, double *A, int stride, double *b,
+                           double *x) {
+  int i, j, k;
+  double c;
+  // Forward elimination
+  for (k = 0; k < n - 1; k++) {
+    // Bring the largest magnitude to the diagonal position
+    for (i = n - 1; i > k; i--) {
+      if (fabs(A[(i - 1) * stride + k]) < fabs(A[i * stride + k])) {
+        for (j = 0; j < n; j++) {
+          c = A[i * stride + j];
+          A[i * stride + j] = A[(i - 1) * stride + j];
+          A[(i - 1) * stride + j] = c;
+        }
+        c = b[i];
+        b[i] = b[i - 1];
+        b[i - 1] = c;
+      }
+    }
+    for (i = k; i < n - 1; i++) {
+      if (fabs(A[k * stride + k]) < TINY_NEAR_ZERO) return 0;
+      c = A[(i + 1) * stride + k] / A[k * stride + k];
+      for (j = 0; j < n; j++) A[(i + 1) * stride + j] -= c * A[k * stride + j];
+      b[i + 1] -= c * b[k];
+    }
+  }
+  // Backward substitution
+  for (i = n - 1; i >= 0; i--) {
+    if (fabs(A[i * stride + i]) < TINY_NEAR_ZERO) return 0;
+    c = 0;
+    for (j = i + 1; j <= n - 1; j++) c += A[i * stride + j] * x[j];
+    x[i] = (b[i] - c) / A[i * stride + i];
+  }
+
+  return 1;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Least-squares
+// Solves for n-dim x in a least squares sense to minimize |Ax - b|^2
+// The solution is simply x = (A'A)^-1 A'b or simply the solution for
+// the system: A'A x = A'b
+static INLINE int least_squares(int n, double *A, int rows, int stride,
+                                double *b, double *scratch, double *x) {
+  int i, j, k;
+  double *scratch_ = NULL;
+  double *AtA, *Atb;
+  if (!scratch) {
+    scratch_ = (double *)aom_malloc(sizeof(*scratch) * n * (n + 1));
+    scratch = scratch_;
+  }
+  AtA = scratch;
+  Atb = scratch + n * n;
+
+  for (i = 0; i < n; ++i) {
+    for (j = i; j < n; ++j) {
+      AtA[i * n + j] = 0.0;
+      for (k = 0; k < rows; ++k)
+        AtA[i * n + j] += A[k * stride + i] * A[k * stride + j];
+      AtA[j * n + i] = AtA[i * n + j];
+    }
+    Atb[i] = 0;
+    for (k = 0; k < rows; ++k) Atb[i] += A[k * stride + i] * b[k];
+  }
+  int ret = linsolve(n, AtA, n, Atb, x);
+  if (scratch_) aom_free(scratch_);
+  return ret;
+}
+
+// Matrix multiply
+static INLINE void multiply_mat(const double *m1, const double *m2,
+                                double *res, const int m1_rows,
+                                const int inner_dim, const int m2_cols) {
+  double sum;
+
+  int row, col, inner;
+  for (row = 0; row < m1_rows; ++row) {
+    for (col = 0; col < m2_cols; ++col) {
+      sum = 0;
+      for (inner = 0; inner < inner_dim; ++inner)
+        sum += m1[row * inner_dim + inner] * m2[inner * m2_cols + col];
+      *(res++) = sum;
+    }
+  }
+}
+
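linsolve() above performs Gaussian elimination with partial pivoting, modifying A and b in place. A self-contained check on a 2x2 system (illustrative only; assumes <stdio.h> is available, as included above):

/* Solve [2 1; 1 3] x = [5; 10]: expected solution x = (1, 3). */
static void example_linsolve_check(void) {
  double A[4] = { 2, 1, 1, 3 };
  double b[2] = { 5, 10 };
  double x[2];
  if (linsolve(2, A, /*stride=*/2, b, x)) {
    printf("x = (%.1f, %.1f)\n", x[0], x[1]); /* Prints: x = (1.0, 3.0) */
  }
}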
+//
+// The functions below are needed only for homography computation.
+// Remove if the homography models are not used.
+//
+///////////////////////////////////////////////////////////////////////////////
+// svdcmp
+// Adapted from Numerical Recipes in C
+
+static INLINE double sign(double a, double b) {
+  return ((b) >= 0 ? fabs(a) : -fabs(a));
+}
+
+static INLINE double pythag(double a, double b) {
+  double ct;
+  const double absa = fabs(a);
+  const double absb = fabs(b);
+
+  if (absa > absb) {
+    ct = absb / absa;
+    return absa * sqrt(1.0 + ct * ct);
+  } else {
+    ct = absa / absb;
+    return (absb == 0) ? 0 : absb * sqrt(1.0 + ct * ct);
+  }
+}
+
+static INLINE int svdcmp(double **u, int m, int n, double w[], double **v) {
+  const int max_its = 30;
+  int flag, i, its, j, jj, k, l, nm;
+  double anorm, c, f, g, h, s, scale, x, y, z;
+  double *rv1 = (double *)aom_malloc(sizeof(*rv1) * (n + 1));
+  g = scale = anorm = 0.0;
+  for (i = 0; i < n; i++) {
+    l = i + 1;
+    rv1[i] = scale * g;
+    g = s = scale = 0.0;
+    if (i < m) {
+      for (k = i; k < m; k++) scale += fabs(u[k][i]);
+      if (scale != 0.) {
+        for (k = i; k < m; k++) {
+          u[k][i] /= scale;
+          s += u[k][i] * u[k][i];
+        }
+        f = u[i][i];
+        g = -sign(sqrt(s), f);
+        h = f * g - s;
+        u[i][i] = f - g;
+        for (j = l; j < n; j++) {
+          for (s = 0.0, k = i; k < m; k++) s += u[k][i] * u[k][j];
+          f = s / h;
+          for (k = i; k < m; k++) u[k][j] += f * u[k][i];
+        }
+        for (k = i; k < m; k++) u[k][i] *= scale;
+      }
+    }
+    w[i] = scale * g;
+    g = s = scale = 0.0;
+    if (i < m && i != n - 1) {
+      for (k = l; k < n; k++) scale += fabs(u[i][k]);
+      if (scale != 0.) {
+        for (k = l; k < n; k++) {
+          u[i][k] /= scale;
+          s += u[i][k] * u[i][k];
+        }
+        f = u[i][l];
+        g = -sign(sqrt(s), f);
+        h = f * g - s;
+        u[i][l] = f - g;
+        for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
+        for (j = l; j < m; j++) {
+          for (s = 0.0, k = l; k < n; k++) s += u[j][k] * u[i][k];
+          for (k = l; k < n; k++) u[j][k] += s * rv1[k];
+        }
+        for (k = l; k < n; k++) u[i][k] *= scale;
+      }
+    }
+    anorm = fmax(anorm, (fabs(w[i]) + fabs(rv1[i])));
+  }
+
+  for (i = n - 1; i >= 0; i--) {
+    if (i < n - 1) {
+      if (g != 0.) {
+        for (j = l; j < n; j++) v[j][i] = (u[i][j] / u[i][l]) / g;
+        for (j = l; j < n; j++) {
+          for (s = 0.0, k = l; k < n; k++) s += u[i][k] * v[k][j];
+          for (k = l; k < n; k++) v[k][j] += s * v[k][i];
+        }
+      }
+      for (j = l; j < n; j++) v[i][j] = v[j][i] = 0.0;
+    }
+    v[i][i] = 1.0;
+    g = rv1[i];
+    l = i;
+  }
+  for (i = AOMMIN(m, n) - 1; i >= 0; i--) {
+    l = i + 1;
+    g = w[i];
+    for (j = l; j < n; j++) u[i][j] = 0.0;
+    if (g != 0.) {
{ + g = 1.0 / g; + for (j = l; j < n; j++) { + for (s = 0.0, k = l; k < m; k++) s += u[k][i] * u[k][j]; + f = (s / u[i][i]) * g; + for (k = i; k < m; k++) u[k][j] += f * u[k][i]; + } + for (j = i; j < m; j++) u[j][i] *= g; + } else { + for (j = i; j < m; j++) u[j][i] = 0.0; + } + ++u[i][i]; + } + for (k = n - 1; k >= 0; k--) { + for (its = 0; its < max_its; its++) { + flag = 1; + for (l = k; l >= 0; l--) { + nm = l - 1; + if ((double)(fabs(rv1[l]) + anorm) == anorm || nm < 0) { + flag = 0; + break; + } + if ((double)(fabs(w[nm]) + anorm) == anorm) break; + } + if (flag) { + c = 0.0; + s = 1.0; + for (i = l; i <= k; i++) { + f = s * rv1[i]; + rv1[i] = c * rv1[i]; + if ((double)(fabs(f) + anorm) == anorm) break; + g = w[i]; + h = pythag(f, g); + w[i] = h; + h = 1.0 / h; + c = g * h; + s = -f * h; + for (j = 0; j < m; j++) { + y = u[j][nm]; + z = u[j][i]; + u[j][nm] = y * c + z * s; + u[j][i] = z * c - y * s; + } + } + } + z = w[k]; + if (l == k) { + if (z < 0.0) { + w[k] = -z; + for (j = 0; j < n; j++) v[j][k] = -v[j][k]; + } + break; + } + if (its == max_its - 1) { + aom_free(rv1); + return 1; + } + assert(k > 0); + x = w[l]; + nm = k - 1; + y = w[nm]; + g = rv1[nm]; + h = rv1[k]; + f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y); + g = pythag(f, 1.0); + f = ((x - z) * (x + z) + h * ((y / (f + sign(g, f))) - h)) / x; + c = s = 1.0; + for (j = l; j <= nm; j++) { + i = j + 1; + g = rv1[i]; + y = w[i]; + h = s * g; + g = c * g; + z = pythag(f, h); + rv1[j] = z; + c = f / z; + s = h / z; + f = x * c + g * s; + g = g * c - x * s; + h = y * s; + y *= c; + for (jj = 0; jj < n; jj++) { + x = v[jj][j]; + z = v[jj][i]; + v[jj][j] = x * c + z * s; + v[jj][i] = z * c - x * s; + } + z = pythag(f, h); + w[j] = z; + if (z != 0.) { + z = 1.0 / z; + c = f * z; + s = h * z; + } + f = c * g + s * y; + x = c * y - s * g; + for (jj = 0; jj < m; jj++) { + y = u[jj][j]; + z = u[jj][i]; + u[jj][j] = y * c + z * s; + u[jj][i] = z * c - y * s; + } + } + rv1[l] = 0.0; + rv1[k] = f; + w[k] = x; + } + } + aom_free(rv1); + return 0; +} + +static INLINE int SVD(double *U, double *W, double *V, double *matx, int M, + int N) { + // Assumes allocation for U is MxN + double **nrU = (double **)aom_malloc((M) * sizeof(*nrU)); + double **nrV = (double **)aom_malloc((N) * sizeof(*nrV)); + int problem, i; + + problem = !(nrU && nrV); + if (!problem) { + for (i = 0; i < M; i++) { + nrU[i] = &U[i * N]; + } + for (i = 0; i < N; i++) { + nrV[i] = &V[i * N]; + } + } else { + if (nrU) aom_free(nrU); + if (nrV) aom_free(nrV); + return 1; + } + + /* copy from given matx into nrU */ + for (i = 0; i < M; i++) { + memcpy(&(nrU[i][0]), matx + N * i, N * sizeof(*matx)); + } + + /* HERE IT IS: do SVD */ + if (svdcmp(nrU, M, N, W, nrV)) { + aom_free(nrU); + aom_free(nrV); + return 1; + } + + /* aom_free Numerical Recipes arrays */ + aom_free(nrU); + aom_free(nrV); + + return 0; +} + +#endif // AOM_AV1_ENCODER_MATHUTILS_H_ diff --git a/libs/libaom/src/av1/encoder/mcomp.c b/libs/libaom/src/av1/encoder/mcomp.c new file mode 100644 index 000000000..43f7f5c6c --- /dev/null +++ b/libs/libaom/src/av1/encoder/mcomp.c @@ -0,0 +1,3391 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/common.h"
+#include "av1/common/filter.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+
+static INLINE void init_mv_cost_params(MV_COST_PARAMS *mv_cost_params,
+                                       const MACROBLOCK *x, const MV *ref_mv) {
+  mv_cost_params->ref_mv = ref_mv;
+  mv_cost_params->full_ref_mv = get_fullmv_from_mv(ref_mv);
+  mv_cost_params->error_per_bit = x->errorperbit;
+  mv_cost_params->sad_per_bit = x->sadperbit;
+  mv_cost_params->mvjcost = x->nmv_vec_cost;
+  mv_cost_params->mvcost[0] = x->mv_cost_stack[0];
+  mv_cost_params->mvcost[1] = x->mv_cost_stack[1];
+  mv_cost_params->mv_cost_type = x->mv_cost_type;
+}
+
+static INLINE void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) {
+  ms_buffers->ref = &x->e_mbd.plane[0].pre[0];
+  ms_buffers->src = &x->plane[0].src;
+
+  av1_set_ms_compound_refs(ms_buffers, NULL, NULL, 0, 0);
+
+  ms_buffers->wsrc = x->wsrc_buf;
+  ms_buffers->obmc_mask = x->mask_buf;
+}
+
+void av1_make_default_fullpel_ms_params(
+    FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
+    const MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
+    const search_site_config *search_sites) {
+  // High level params
+  ms_params->bsize = bsize;
+  ms_params->vfp = &cpi->fn_ptr[bsize];
+
+  init_ms_buffers(&ms_params->ms_buffers, x);
+
+  ms_params->search_method = cpi->sf.mv_sf.search_method;
+  ms_params->search_sites = search_sites;
+
+  ms_params->mesh_patterns[0] = cpi->sf.mv_sf.mesh_patterns;
+  ms_params->mesh_patterns[1] = cpi->sf.mv_sf.intrabc_mesh_patterns;
+  ms_params->force_mesh_thresh = cpi->sf.mv_sf.exhaustive_searches_thresh;
+  ms_params->prune_mesh_search = cpi->sf.mv_sf.prune_mesh_search;
+  ms_params->run_mesh_search = 0;
+
+  ms_params->is_intra_mode = 0;
+
+  ms_params->fast_obmc_search = cpi->sf.mv_sf.obmc_full_pixel_search_level;
+
+  ms_params->mv_limits = x->mv_limits;
+  av1_set_mv_search_range(&ms_params->mv_limits, ref_mv);
+
+  // Mvcost params
+  init_mv_cost_params(&ms_params->mv_cost_params, x, ref_mv);
+}
+
+void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                       const struct AV1_COMP *cpi,
+                                       const MACROBLOCK *x, BLOCK_SIZE bsize,
+                                       const MV *ref_mv, const int *cost_list) {
+  const AV1_COMMON *cm = &cpi->common;
+  // High level params
+  ms_params->allow_hp = cm->features.allow_high_precision_mv;
+  ms_params->forced_stop = cpi->sf.mv_sf.subpel_force_stop;
+  ms_params->iters_per_step = cpi->sf.mv_sf.subpel_iters_per_step;
+  ms_params->cost_list = cond_cost_list_const(cpi, cost_list);
+
+  av1_set_subpel_mv_search_range(&ms_params->mv_limits, &x->mv_limits, ref_mv);
+
+  // Mvcost params
+  init_mv_cost_params(&ms_params->mv_cost_params, x, ref_mv);
+
+  // Subpel variance params
+  ms_params->var_params.vfp = &cpi->fn_ptr[bsize];
+  ms_params->var_params.subpel_search_type =
+      cpi->sf.mv_sf.use_accurate_subpel_search;
+  ms_params->var_params.w = block_size_wide[bsize];
+  ms_params->var_params.h = block_size_high[bsize];
+
+  // Ref and src buffers
+  MSBuffers *ms_buffers = &ms_params->var_params.ms_buffers;
+  init_ms_buffers(ms_buffers, x);
+}
+
+static INLINE int get_offset_from_fullmv(const FULLPEL_MV *mv, int stride) {
+  return mv->row * stride + mv->col;
+}
+
+static INLINE const uint8_t *get_buf_from_fullmv(const struct buf_2d *buf,
+                                                 const FULLPEL_MV *mv) {
+  return &buf->buf[get_offset_from_fullmv(mv, buf->stride)];
+}
+
+void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv) {
+  int col_min =
+      GET_MV_RAWPEL(mv->col) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
+  int row_min =
+      GET_MV_RAWPEL(mv->row) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
+  int col_max = GET_MV_RAWPEL(mv->col) + MAX_FULL_PEL_VAL;
+  int row_max = GET_MV_RAWPEL(mv->row) + MAX_FULL_PEL_VAL;
+
+  col_min = AOMMAX(col_min, GET_MV_RAWPEL(MV_LOW) + 1);
+  row_min = AOMMAX(row_min, GET_MV_RAWPEL(MV_LOW) + 1);
+  col_max = AOMMIN(col_max, GET_MV_RAWPEL(MV_UPP) - 1);
+  row_max = AOMMIN(row_max, GET_MV_RAWPEL(MV_UPP) - 1);
+
+  // Get intersection of UMV window and valid MV window to reduce # of checks
+  // in diamond search.
+  if (mv_limits->col_min < col_min) mv_limits->col_min = col_min;
+  if (mv_limits->col_max > col_max) mv_limits->col_max = col_max;
+  if (mv_limits->row_min < row_min) mv_limits->row_min = row_min;
+  if (mv_limits->row_max > row_max) mv_limits->row_max = row_max;
+}
+
+int av1_init_search_range(int size) {
+  int sr = 0;
+  // Minimum search size no matter what the passed in value.
+  size = AOMMAX(16, size);
+
+  while ((size << sr) < MAX_FULL_PEL_VAL) sr++;
+
+  sr = AOMMIN(sr, MAX_MVSEARCH_STEPS - 2);
+  return sr;
+}
+
+// ============================================================================
+//  Cost of motion vectors
+// ============================================================================
+// TODO(any): Adaptively adjust the regularization strength based on image size
+// and motion activity instead of using hard-coded values. It seems like we
+// roughly halve the lambda for each increase in resolution.
+// These are multipliers used to perform regularization in motion compensation
+// when x->mv_cost_type is set to MV_COST_L1.
+// LOWRES
+#define SSE_LAMBDA_LOWRES 2   // Used by mv_cost_err_fn
+#define SAD_LAMBDA_LOWRES 32  // Used by mvsad_err_cost during full pixel search
+// MIDRES
+#define SSE_LAMBDA_MIDRES 0   // Used by mv_cost_err_fn
+#define SAD_LAMBDA_MIDRES 15  // Used by mvsad_err_cost during full pixel search
+// HDRES
+#define SSE_LAMBDA_HDRES 1  // Used by mv_cost_err_fn
+#define SAD_LAMBDA_HDRES 8  // Used by mvsad_err_cost during full pixel search
+
+// Returns the rate of encoding the current motion vector based on the
+// joint_cost and comp_cost. joint_cost covers the cost of transmitting
+// JOINT_MV, and comp_cost covers the cost of transmitting the actual motion
+// vector.
+static INLINE int mv_cost(const MV *mv, const int *joint_cost,
+                          const int *const comp_cost[2]) {
+  return joint_cost[av1_get_mv_joint(mv)] + comp_cost[0][mv->row] +
+         comp_cost[1][mv->col];
+}
+
+#define CONVERT_TO_CONST_MVCOST(ptr) ((const int *const *)(ptr))
+// Returns the cost of encoding the motion vector diff := *mv - *ref. The cost
+// is defined as the rate required to encode diff * weight, rounded to the
+// nearest 2 ** 7.
+// This is NOT used during motion compensation.
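
To make the weighting and rounding concrete: ROUND_POWER_OF_TWO(x, 7) computes
round(x / 128), so a weight of 128 returns the rate unchanged while other
weights rescale it in 1/128 steps. A standalone sketch (the rate value and the
weights here are illustrative, not taken from the encoder):

#include <stdio.h>

// Same shape as libaom's ROUND_POWER_OF_TWO(value, n): round(value / 2^n).
static int round_power_of_two(int value, int n) {
  return (value + (1 << (n - 1))) >> n;
}

int main(void) {
  const int rate = 600;  // hypothetical mv_cost() output for some diff
  printf("weight 128 -> %d\n", round_power_of_two(rate * 128, 7));  // 600
  printf("weight 108 -> %d\n", round_power_of_two(rate * 108, 7));  // 506
  return 0;
}
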
+int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost, + int *mvcost[2], int weight) { + const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col }; + return ROUND_POWER_OF_TWO( + mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * weight, 7); +} + +// Returns the cost of using the current mv during the motion search. This is +// used when var is used as the error metric. +#define PIXEL_TRANSFORM_ERROR_SCALE 4 +static INLINE int mv_err_cost(const MV *mv, const MV *ref_mv, + const int *mvjcost, const int *const mvcost[2], + int error_per_bit, MV_COST_TYPE mv_cost_type) { + const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col }; + const MV abs_diff = { abs(diff.row), abs(diff.col) }; + + switch (mv_cost_type) { + case MV_COST_ENTROPY: + if (mvcost) { + return (int)ROUND_POWER_OF_TWO_64( + (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit, + RDDIV_BITS + AV1_PROB_COST_SHIFT - RD_EPB_SHIFT + + PIXEL_TRANSFORM_ERROR_SCALE); + } + return 0; + case MV_COST_L1_LOWRES: + return (SSE_LAMBDA_LOWRES * (abs_diff.row + abs_diff.col)) >> 3; + case MV_COST_L1_MIDRES: + return (SSE_LAMBDA_MIDRES * (abs_diff.row + abs_diff.col)) >> 3; + case MV_COST_L1_HDRES: + return (SSE_LAMBDA_HDRES * (abs_diff.row + abs_diff.col)) >> 3; + case MV_COST_NONE: return 0; + default: assert(0 && "Invalid rd_cost_type"); return 0; + } +} + +static INLINE int mv_err_cost_(const MV *mv, + const MV_COST_PARAMS *mv_cost_params) { + return mv_err_cost(mv, mv_cost_params->ref_mv, mv_cost_params->mvjcost, + mv_cost_params->mvcost, mv_cost_params->error_per_bit, + mv_cost_params->mv_cost_type); +} + +// Returns the cost of using the current mv during the motion search. This is +// only used during full pixel motion search when sad is used as the error +// metric +static INLINE int mvsad_err_cost(const FULLPEL_MV *mv, const FULLPEL_MV *ref_mv, + const int *mvjcost, const int *const mvcost[2], + int sad_per_bit, MV_COST_TYPE mv_cost_type) { + const MV diff = { GET_MV_SUBPEL(mv->row - ref_mv->row), + GET_MV_SUBPEL(mv->col - ref_mv->col) }; + + switch (mv_cost_type) { + case MV_COST_ENTROPY: + return ROUND_POWER_OF_TWO( + (unsigned)mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * + sad_per_bit, + AV1_PROB_COST_SHIFT); + case MV_COST_L1_LOWRES: + return (SAD_LAMBDA_LOWRES * (abs(diff.row) + abs(diff.col))) >> 3; + case MV_COST_L1_MIDRES: + return (SAD_LAMBDA_MIDRES * (abs(diff.row) + abs(diff.col))) >> 3; + case MV_COST_L1_HDRES: + return (SAD_LAMBDA_HDRES * (abs(diff.row) + abs(diff.col))) >> 3; + case MV_COST_NONE: return 0; + default: assert(0 && "Invalid rd_cost_type"); return 0; + } +} + +static INLINE int mvsad_err_cost_(const FULLPEL_MV *mv, + const MV_COST_PARAMS *mv_cost_params) { + return mvsad_err_cost(mv, &mv_cost_params->full_ref_mv, + mv_cost_params->mvjcost, mv_cost_params->mvcost, + mv_cost_params->sad_per_bit, + mv_cost_params->mv_cost_type); +} + +// ============================================================================= +// Fullpixel Motion Search: Translational +// ============================================================================= +#define MAX_PATTERN_SCALES 11 +#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale +#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates + +void av1_init_dsmotion_compensation(search_site_config *cfg, int stride) { + int ss_count = 0; + int stage_index = MAX_MVSEARCH_STEPS - 1; + + cfg->ss[stage_index][0].mv.col = cfg->ss[stage_index][0].mv.row = 0; + cfg->ss[stage_index][0].offset = 
0; + cfg->stride = stride; + + for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) { + int num_search_pts = 8; + + const FULLPEL_MV ss_mvs[13] = { + { 0, 0 }, { -radius, 0 }, { radius, 0 }, + { 0, -radius }, { 0, radius }, { -radius, -radius }, + { radius, radius }, { -radius, radius }, { radius, -radius }, + }; + + int i; + for (i = 0; i <= num_search_pts; ++i) { + search_site *const ss = &cfg->ss[stage_index][i]; + ss->mv = ss_mvs[i]; + ss->offset = get_offset_from_fullmv(&ss->mv, stride); + } + cfg->searches_per_step[stage_index] = num_search_pts; + cfg->radius[stage_index] = radius; + --stage_index; + ++ss_count; + } + cfg->ss_count = ss_count; +} + +void av1_init_motion_fpf(search_site_config *cfg, int stride) { + int ss_count = 0; + int stage_index = MAX_MVSEARCH_STEPS - 1; + + cfg->ss[stage_index][0].mv.col = cfg->ss[stage_index][0].mv.row = 0; + cfg->ss[stage_index][0].offset = 0; + cfg->stride = stride; + + for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) { + // Generate offsets for 8 search sites per step. + int tan_radius = AOMMAX((int)(0.41 * radius), 1); + int num_search_pts = 12; + if (radius == 1) num_search_pts = 8; + + const FULLPEL_MV ss_mvs[13] = { + { 0, 0 }, + { -radius, 0 }, + { radius, 0 }, + { 0, -radius }, + { 0, radius }, + { -radius, -tan_radius }, + { radius, tan_radius }, + { -tan_radius, radius }, + { tan_radius, -radius }, + { -radius, tan_radius }, + { radius, -tan_radius }, + { tan_radius, radius }, + { -tan_radius, -radius }, + }; + + int i; + for (i = 0; i <= num_search_pts; ++i) { + search_site *const ss = &cfg->ss[stage_index][i]; + ss->mv = ss_mvs[i]; + ss->offset = get_offset_from_fullmv(&ss->mv, stride); + } + cfg->searches_per_step[stage_index] = num_search_pts; + cfg->radius[stage_index] = radius; + --stage_index; + ++ss_count; + } + cfg->ss_count = ss_count; +} + +void av1_init3smotion_compensation(search_site_config *cfg, int stride) { + int ss_count = 0; + int stage_index = 0; + cfg->stride = stride; + int radius = 1; + for (stage_index = 0; stage_index < 15; ++stage_index) { + int tan_radius = AOMMAX((int)(0.41 * radius), 1); + int num_search_pts = 12; + if (radius <= 5) { + tan_radius = radius; + num_search_pts = 8; + } + const FULLPEL_MV ss_mvs[13] = { + { 0, 0 }, + { -radius, 0 }, + { radius, 0 }, + { 0, -radius }, + { 0, radius }, + { -radius, -tan_radius }, + { radius, tan_radius }, + { -tan_radius, radius }, + { tan_radius, -radius }, + { -radius, tan_radius }, + { radius, -tan_radius }, + { tan_radius, radius }, + { -tan_radius, -radius }, + }; + + for (int i = 0; i <= num_search_pts; ++i) { + search_site *const ss = &cfg->ss[stage_index][i]; + ss->mv = ss_mvs[i]; + ss->offset = get_offset_from_fullmv(&ss->mv, stride); + } + cfg->searches_per_step[stage_index] = num_search_pts; + cfg->radius[stage_index] = radius; + ++ss_count; + if (stage_index < 12) + radius = (int)AOMMAX((radius * 1.5 + 0.5), radius + 1); + } + cfg->ss_count = ss_count; +} + +// Checks whether the mv is within range of the mv_limits +static INLINE int check_bounds(const FullMvLimits *mv_limits, int row, int col, + int range) { + return ((row - range) >= mv_limits->row_min) & + ((row + range) <= mv_limits->row_max) & + ((col - range) >= mv_limits->col_min) & + ((col + range) <= mv_limits->col_max); +} + +static INLINE int get_mvpred_var_cost( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const MV sub_this_mv = get_mv_from_fullmv(this_mv); + const struct buf_2d *const 
src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + const int ref_stride = ref->stride; + + unsigned unused; + int bestsme; + + bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv), + ref_stride, &unused); + + bestsme += mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params); + + return bestsme; +} + +static INLINE int get_mvpred_sad(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const struct buf_2d *const src, + const uint8_t *const ref_address, + const int ref_stride) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + + return vfp->sdf(src_buf, src_stride, ref_address, ref_stride); +} + +static INLINE int get_mvpred_compound_var_cost( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + const int ref_stride = ref->stride; + + const uint8_t *mask = ms_params->ms_buffers.mask; + const uint8_t *second_pred = ms_params->ms_buffers.second_pred; + const int mask_stride = ms_params->ms_buffers.mask_stride; + const int invert_mask = ms_params->ms_buffers.inv_mask; + unsigned unused; + int bestsme; + + if (mask) { + bestsme = vfp->msvf(src_buf, src_stride, 0, 0, + get_buf_from_fullmv(ref, this_mv), ref_stride, + second_pred, mask, mask_stride, invert_mask, &unused); + } else if (second_pred) { + bestsme = vfp->svaf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0, + src_buf, src_stride, &unused, second_pred); + } else { + bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv), + ref_stride, &unused); + } + + const MV sub_this_mv = get_mv_from_fullmv(this_mv); + bestsme += mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params); + + return bestsme; +} + +static INLINE int get_mvpred_compound_sad( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const struct buf_2d *const src, const uint8_t *const ref_address, + const int ref_stride) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + + const uint8_t *mask = ms_params->ms_buffers.mask; + const uint8_t *second_pred = ms_params->ms_buffers.second_pred; + const int mask_stride = ms_params->ms_buffers.mask_stride; + const int invert_mask = ms_params->ms_buffers.inv_mask; + + if (mask) { + return vfp->msdf(src_buf, src_stride, ref_address, ref_stride, second_pred, + mask, mask_stride, invert_mask); + } else if (second_pred) { + return vfp->sdaf(src_buf, src_stride, ref_address, ref_stride, second_pred); + } else { + return vfp->sdf(src_buf, src_stride, ref_address, ref_stride); + } +} + +// Calculates and returns a sad+mvcost list around an integer best pel during +// fullpixel motion search. The resulting list can be used to speed up subpel +// motion search later. +#define USE_SAD_COSTLIST 1 + +// calc_int_cost_list uses var to populate the costlist, which is more accurate +// than sad but slightly slower. 
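
Both cost-list builders below fill a five-entry cross around the best full-pel
MV, in the order the subpel search expects. A small standalone sketch of the
indexing (the neighbor table matches the neighbors[] arrays below):

#include <stdio.h>

int main(void) {
  // cost_list[0] is the centre; entries 1..4 are left, below, right, above.
  const struct { int row, col; } neighbors[4] = {
    { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
  };
  const char *names[4] = { "left", "below", "right", "above" };
  for (int i = 0; i < 4; i++)
    printf("cost_list[%d]: delta { %2d, %2d } (%s)\n", i + 1, neighbors[i].row,
           neighbors[i].col, names[i]);
  return 0;
}
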
+static AOM_FORCE_INLINE void calc_int_cost_list(
+    const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    int *cost_list) {
+  static const FULLPEL_MV neighbors[4] = {
+    { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
+  };
+  const int br = best_mv.row;
+  const int bc = best_mv.col;
+
+  cost_list[0] = get_mvpred_var_cost(ms_params, &best_mv);
+
+  if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
+    for (int i = 0; i < 4; i++) {
+      const FULLPEL_MV neighbor_mv = { br + neighbors[i].row,
+                                       bc + neighbors[i].col };
+      cost_list[i + 1] = get_mvpred_var_cost(ms_params, &neighbor_mv);
+    }
+  } else {
+    for (int i = 0; i < 4; i++) {
+      const FULLPEL_MV neighbor_mv = { br + neighbors[i].row,
+                                       bc + neighbors[i].col };
+      if (!av1_is_fullmv_in_range(&ms_params->mv_limits, neighbor_mv)) {
+        cost_list[i + 1] = INT_MAX;
+      } else {
+        cost_list[i + 1] = get_mvpred_var_cost(ms_params, &neighbor_mv);
+      }
+    }
+  }
+}
+
+// calc_int_sad_list uses sad to populate the costlist, which is less accurate
+// than var but faster.
+static AOM_FORCE_INLINE void calc_int_sad_list(
+    const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    int *cost_list, int costlist_has_sad) {
+  static const FULLPEL_MV neighbors[4] = {
+    { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
+  };
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const int ref_stride = ref->stride;
+  const int br = best_mv.row;
+  const int bc = best_mv.col;
+
+  assert(av1_is_fullmv_in_range(&ms_params->mv_limits, best_mv));
+
+  // Refresh the costlist if it does not contain valid sad
+  if (!costlist_has_sad) {
+    cost_list[0] = get_mvpred_sad(
+        ms_params, src, get_buf_from_fullmv(ref, &best_mv), ref_stride);
+
+    if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
+      for (int i = 0; i < 4; i++) {
+        const FULLPEL_MV this_mv = { br + neighbors[i].row,
+                                     bc + neighbors[i].col };
+        cost_list[i + 1] = get_mvpred_sad(
+            ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
+      }
+    } else {
+      for (int i = 0; i < 4; i++) {
+        const FULLPEL_MV this_mv = { br + neighbors[i].row,
+                                     bc + neighbors[i].col };
+        if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+          cost_list[i + 1] = INT_MAX;
+        } else {
+          cost_list[i + 1] = get_mvpred_sad(
+              ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
+        }
+      }
+    }
+  }
+
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  cost_list[0] += mvsad_err_cost_(&best_mv, mv_cost_params);
+
+  for (int idx = 0; idx < 4; idx++) {
+    if (cost_list[idx + 1] != INT_MAX) {
+      const FULLPEL_MV this_mv = { br + neighbors[idx].row,
+                                   bc + neighbors[idx].col };
+      cost_list[idx + 1] += mvsad_err_cost_(&this_mv, mv_cost_params);
+    }
+  }
+}
+
+#define CHECK_BETTER                                                      \
+  if (thissad < bestsad) {                                                \
+    int tmp_thissad = thissad;                                            \
+    if (use_mvcost) thissad += mvsad_err_cost_(&this_mv, mv_cost_params); \
+    if (thissad < bestsad) {                                              \
+      raw_bestsad = tmp_thissad;                                          \
+      bestsad = thissad;                                                  \
+      best_site = i;                                                      \
+    }                                                                     \
+  }
+
+// Generic pattern search function that searches over multiple scales.
+// Each scale can have a different number of candidates and shape of +// candidates as indicated in the num_candidates and candidates arrays +// passed into this function +static int pattern_search( + FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_param, const int do_init_search, + const int num_candidates[MAX_PATTERN_SCALES], + const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES], + int *cost_list, FULLPEL_MV *best_mv) { + static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = { + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + }; + int i, s, t; + + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const int ref_stride = ref->stride; + const int last_is_4 = num_candidates[0] == 4; + int br, bc; + int bestsad = INT_MAX, raw_bestsad = INT_MAX; + int thissad; + int k = -1; + const int use_mvcost = ms_params->mv_cost_params.mv_cost_type != MV_COST_NONE; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + assert(search_param < MAX_MVSEARCH_STEPS); + int best_init_s = search_param_to_steps[search_param]; + // adjust ref_mv to make sure it is within MV range + clamp_fullmv(&start_mv, &ms_params->mv_limits); + br = start_mv.row; + bc = start_mv.col; + if (cost_list != NULL) { + cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = + INT_MAX; + } + int costlist_has_sad = 0; + + // Work out the start point for the search + raw_bestsad = get_mvpred_sad(ms_params, src, + get_buf_from_fullmv(ref, &start_mv), ref_stride); + bestsad = raw_bestsad + mvsad_err_cost_(&start_mv, mv_cost_params); + + // Search all possible scales up to the search param around the center point + // pick the scale of the point that is best as the starting scale of + // further steps around it. + if (do_init_search) { + s = best_init_s; + best_init_s = -1; + for (t = 0; t <= s; ++t) { + int best_site = -1; + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << t)) { + for (i = 0; i < num_candidates[t]; i++) { + const FULLPEL_MV this_mv = { br + candidates[t][i].row, + bc + candidates[t][i].col }; + thissad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[t]; i++) { + const FULLPEL_MV this_mv = { br + candidates[t][i].row, + bc + candidates[t][i].col }; + if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) continue; + thissad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + CHECK_BETTER + } + } + if (best_site == -1) { + continue; + } else { + best_init_s = t; + k = best_site; + } + } + if (best_init_s != -1) { + br += candidates[best_init_s][k].row; + bc += candidates[best_init_s][k].col; + } + } + + // If the center point is still the best, just skip this and move to + // the refinement step. 
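
Throughout the scale and refinement loops here, the CHECK_BETTER macro keeps
the inner comparison cheap: the motion-vector rate is only computed and added
for candidates whose raw SAD already beats the incumbent total. A minimal
standalone sketch of that early-out (mv_rate() is a hypothetical stand-in for
mvsad_err_cost_()):

#include <stdlib.h>

static int mv_rate(int drow, int dcol) { return (abs(drow) + abs(dcol)) >> 1; }

// Mirrors CHECK_BETTER: the rate term is only paid when the cheap raw-SAD
// test passes, and the candidate is accepted only if sad + rate still wins.
static void check_better(int this_sad, int drow, int dcol, int *best_cost,
                         int *best_raw_sad) {
  if (this_sad < *best_cost) {
    const int total = this_sad + mv_rate(drow, dcol);
    if (total < *best_cost) {
      *best_raw_sad = this_sad;
      *best_cost = total;
    }
  }
}

int main(void) {
  int best_cost = 100, best_raw_sad = 100;
  check_better(90, 4, 2, &best_cost, &best_raw_sad);  // 90 + 3 = 93: accepted
  return best_cost == 93 ? 0 : 1;
}
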
+ if (best_init_s != -1) { + const int last_s = (last_is_4 && cost_list != NULL); + int best_site = -1; + s = best_init_s; + + for (; s >= last_s; s--) { + // No need to search all points the 1st time if initial search was used + if (!do_init_search || s != best_init_s) { + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { + for (i = 0; i < num_candidates[s]; i++) { + const FULLPEL_MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + thissad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[s]; i++) { + const FULLPEL_MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) + continue; + thissad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + CHECK_BETTER + } + } + + if (best_site == -1) { + continue; + } else { + br += candidates[s][best_site].row; + bc += candidates[s][best_site].col; + k = best_site; + } + } + + do { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1; + + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const FULLPEL_MV this_mv = { + br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col + }; + thissad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + CHECK_BETTER + } + } else { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const FULLPEL_MV this_mv = { + br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col + }; + if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) + continue; + thissad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + CHECK_BETTER + } + } + + if (best_site != -1) { + k = next_chkpts_indices[best_site]; + br += candidates[s][k].row; + bc += candidates[s][k].col; + } + } while (best_site != -1); + } + + // Note: If we enter the if below, then cost_list must be non-NULL. + if (s == 0) { + cost_list[0] = raw_bestsad; + costlist_has_sad = 1; + if (!do_init_search || s != best_init_s) { + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { + for (i = 0; i < num_candidates[s]; i++) { + const FULLPEL_MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + cost_list[i + 1] = thissad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[s]; i++) { + const FULLPEL_MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) + continue; + cost_list[i + 1] = thissad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + CHECK_BETTER + } + } + + if (best_site != -1) { + br += candidates[s][best_site].row; + bc += candidates[s][best_site].col; + k = best_site; + } + } + while (best_site != -1) { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 
0 : k + 1;
+        cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
+        cost_list[((k + 2) % 4) + 1] = cost_list[0];
+        cost_list[0] = raw_bestsad;
+
+        if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const FULLPEL_MV this_mv = {
+              br + candidates[s][next_chkpts_indices[i]].row,
+              bc + candidates[s][next_chkpts_indices[i]].col
+            };
+            cost_list[next_chkpts_indices[i] + 1] = thissad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const FULLPEL_MV this_mv = {
+              br + candidates[s][next_chkpts_indices[i]].row,
+              bc + candidates[s][next_chkpts_indices[i]].col
+            };
+            if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+              cost_list[next_chkpts_indices[i] + 1] = INT_MAX;
+              continue;
+            }
+            cost_list[next_chkpts_indices[i] + 1] = thissad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site != -1) {
+          k = next_chkpts_indices[best_site];
+          br += candidates[s][k].row;
+          bc += candidates[s][k].col;
+        }
+      }
+    }
+  }
+
+  best_mv->row = br;
+  best_mv->col = bc;
+
+  // Returns the one-away integer pel cost/sad around the best as follows:
+  // cost_list[0]: cost/sad at the best integer pel
+  // cost_list[1]: cost/sad at delta {0, -1} (left)   from the best integer pel
+  // cost_list[2]: cost/sad at delta { 1, 0} (bottom) from the best integer pel
+  // cost_list[3]: cost/sad at delta { 0, 1} (right)  from the best integer pel
+  // cost_list[4]: cost/sad at delta {-1, 0} (top)    from the best integer pel
+  if (cost_list) {
+    if (USE_SAD_COSTLIST) {
+      calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
+    } else {
+      calc_int_cost_list(*best_mv, ms_params, cost_list);
+    }
+  }
+  best_mv->row = br;
+  best_mv->col = bc;
+
+  const int var_cost = get_mvpred_var_cost(ms_params, best_mv);
+  return var_cost;
+}
+#undef CHECK_BETTER
+
+// For the following foo_search, the input arguments are:
+// x: The struct used to hold a bunch of random configs.
+// start_mv: where we are starting our motion search
+// search_param: how many steps to skip in our motion search. For example,
+// a value 3 suggests that 3 search steps have already taken place prior to
+// this function call, so we jump directly to step 4 of the search process
+// sad_per_bit: a multiplier used to convert rate to sad cost
+// do_init_search: if on, do an initial search of all possible scales around
+// the start_mv, and then pick the best scale.
+// cost_list: used to hold the cost around the best full mv so we can use it
+// to speed up subpel search later.
+// vfp: a function pointer to the simd function so we can compute the cost +// efficiently +// ref_mv: the reference mv used to compute the mv cost +static int hex_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_param, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv) { + // First scale has 8-closest points, the rest have 6 points in hex shape + // at increasing scales + static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6 }; + // Note that the largest candidate step at each scale is 2^scale + /* clang-format off */ + static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, { -1, 1 }, + { -1, 0 } }, + { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } }, + { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } }, + { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } }, + { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 }, { -8, 16 }, { -16, 0 } }, + { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 }, + { -32, 0 } }, + { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 }, + { -64, 0 } }, + { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 }, { -64, 128 }, + { -128, 0 } }, + { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 }, { -128, 256 }, + { -256, 0 } }, + { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 }, { -256, 512 }, + { -512, 0 } }, + { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 }, + { -512, 1024 }, { -1024, 0 } }, + }; + /* clang-format on */ + return pattern_search(start_mv, ms_params, search_param, do_init_search, + hex_num_candidates, hex_candidates, cost_list, best_mv); +} + +static int bigdia_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_param, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv) { + // First scale has 4-closest points, the rest have 8 points in diamond + // shape at increasing scales + static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = { + 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + }; + // Note that the largest candidate step at each scale is 2^scale + /* clang-format off */ + static const MV + bigdia_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } }, + { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 }, + { -1, 1 }, { -2, 0 } }, + { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 }, + { -2, 2 }, { -4, 0 } }, + { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 }, + { -4, 4 }, { -8, 0 } }, + { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 }, + { -8, 8 }, { -16, 0 } }, + { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 }, + { 0, 32 }, { -16, 16 }, { -32, 0 } }, + { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 }, + { 0, 64 }, { -32, 32 }, { -64, 0 } }, + { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 }, + { 0, 128 }, { -64, 64 }, { -128, 0 } }, + { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 }, { 128, 128 }, + { 0, 256 }, { -128, 128 }, { -256, 0 } }, + { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 }, { 256, 256 }, + { 0, 512 }, { -256, 256 }, { -512, 0 } }, + { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 }, + { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } }, + }; + /* clang-format on */ + return 
pattern_search(start_mv, ms_params, search_param, do_init_search, + bigdia_num_candidates, bigdia_candidates, cost_list, + best_mv); +} + +static int square_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_param, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv) { + // All scales have 8 closest points in square shape + static const int square_num_candidates[MAX_PATTERN_SCALES] = { + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + }; + // Note that the largest candidate step at each scale is 2^scale + /* clang-format off */ + static const MV + square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, + { -1, 1 }, { -1, 0 } }, + { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 }, + { -2, 2 }, { -2, 0 } }, + { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 }, + { -4, 4 }, { -4, 0 } }, + { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 }, + { -8, 8 }, { -8, 0 } }, + { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 }, + { 0, 16 }, { -16, 16 }, { -16, 0 } }, + { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 }, + { 0, 32 }, { -32, 32 }, { -32, 0 } }, + { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 }, + { 0, 64 }, { -64, 64 }, { -64, 0 } }, + { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 }, { 128, 128 }, + { 0, 128 }, { -128, 128 }, { -128, 0 } }, + { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 }, { 256, 256 }, + { 0, 256 }, { -256, 256 }, { -256, 0 } }, + { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 }, { 512, 512 }, + { 0, 512 }, { -512, 512 }, { -512, 0 } }, + { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 }, + { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } }, + }; + /* clang-format on */ + return pattern_search(start_mv, ms_params, search_param, do_init_search, + square_num_candidates, square_candidates, cost_list, + best_mv); +} + +static int fast_hex_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_param, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv) { + return hex_search(start_mv, ms_params, + AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param), + do_init_search, cost_list, best_mv); +} + +static int fast_dia_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_param, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv) { + return bigdia_search(start_mv, ms_params, + AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param), + do_init_search, cost_list, best_mv); +} + +static int diamond_search_sad(FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_param, int *num00, + FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) { + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + + const int ref_stride = ref->stride; + const uint8_t *best_address; + + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const uint8_t *mask = ms_params->ms_buffers.mask; + const uint8_t *second_pred = ms_params->ms_buffers.second_pred; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + + const search_site_config *cfg = ms_params->search_sites; + + unsigned int bestsad = INT_MAX; + int best_site = 0; + int is_off_center = 0; + + clamp_fullmv(&start_mv, &ms_params->mv_limits); + + // search_param determines the length 
of the initial step and hence the number + // of iterations. + const int tot_steps = cfg->ss_count - search_param; + + *num00 = 0; + *best_mv = start_mv; + + // Check the starting position + best_address = get_buf_from_fullmv(ref, &start_mv); + bestsad = get_mvpred_compound_sad(ms_params, src, best_address, ref_stride); + bestsad += mvsad_err_cost_(best_mv, &ms_params->mv_cost_params); + + int next_step_size = tot_steps > 2 ? cfg->radius[tot_steps - 2] : 1; + for (int step = tot_steps - 1; step >= 0; --step) { + const search_site *ss = cfg->ss[step]; + best_site = 0; + if (step > 0) next_step_size = cfg->radius[step - 1]; + + int all_in = 1, j; + // Trap illegal vectors + all_in &= best_mv->row + ss[1].mv.row >= ms_params->mv_limits.row_min; + all_in &= best_mv->row + ss[2].mv.row <= ms_params->mv_limits.row_max; + all_in &= best_mv->col + ss[3].mv.col >= ms_params->mv_limits.col_min; + all_in &= best_mv->col + ss[4].mv.col <= ms_params->mv_limits.col_max; + + // TODO(anyone): Implement 4 points search for msdf&sdaf + if (all_in && !mask && !second_pred) { + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + for (int idx = 1; idx <= cfg->searches_per_step[step]; idx += 4) { + unsigned char const *block_offset[4]; + unsigned int sads[4]; + + for (j = 0; j < 4; j++) + block_offset[j] = ss[idx + j].offset + best_address; + + vfp->sdx4df(src_buf, src_stride, block_offset, ref_stride, sads); + for (j = 0; j < 4; j++) { + if (sads[j] < bestsad) { + const FULLPEL_MV this_mv = { best_mv->row + ss[idx + j].mv.row, + best_mv->col + ss[idx + j].mv.col }; + unsigned int thissad = + sads[j] + mvsad_err_cost_(&this_mv, mv_cost_params); + if (thissad < bestsad) { + bestsad = thissad; + best_site = idx + j; + } + } + } + } + } else { + for (int idx = 1; idx <= cfg->searches_per_step[step]; idx++) { + const FULLPEL_MV this_mv = { best_mv->row + ss[idx].mv.row, + best_mv->col + ss[idx].mv.col }; + + if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) { + const uint8_t *const check_here = ss[idx].offset + best_address; + unsigned int thissad; + + thissad = + get_mvpred_compound_sad(ms_params, src, check_here, ref_stride); + + if (thissad < bestsad) { + thissad += mvsad_err_cost_(&this_mv, mv_cost_params); + if (thissad < bestsad) { + bestsad = thissad; + best_site = idx; + } + } + } + } + } + + if (best_site != 0) { + if (second_best_mv) { + *second_best_mv = *best_mv; + } + best_mv->row += ss[best_site].mv.row; + best_mv->col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + is_off_center = 1; + } + + if (is_off_center == 0) (*num00)++; + + if (best_site == 0) { + while (next_step_size == cfg->radius[step] && step > 2) { + ++(*num00); + --step; + next_step_size = cfg->radius[step - 1]; + } + } + } + + return bestsad; +} + +/* do_refine: If last step (1-away) of n-step search doesn't pick the center + point as the best match, we will do a final 1-away diamond + refining search */ +static int full_pixel_diamond(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int step_param, int *cost_list, + FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) { + const search_site_config *cfg = ms_params->search_sites; + int thissme, n, num00 = 0; + int bestsme = diamond_search_sad(start_mv, ms_params, step_param, &n, best_mv, + second_best_mv); + + if (bestsme < INT_MAX) { + bestsme = get_mvpred_compound_var_cost(ms_params, best_mv); + } + + // If there won't be more n-step search, check to see if refining search is + // needed. 
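
The interplay with num00 is easy to miss here: diamond_search_sad() reports,
via num00, how many consecutive step sizes left the centre as the best match,
and the loop below skips restarting the search at exactly those step sizes. A
toy illustration of the skip logic (the counts are made up):

#include <stdio.h>

int main(void) {
  const int further_steps = 5;  // stand-in for cfg->ss_count - 1 - step_param
  int n = 0;
  int num00 = 2;  // first pass reported two centre-best step sizes
  while (n < further_steps) {
    ++n;
    if (num00) {
      num00--;  // step size already proven centre-best; skip the restart
    } else {
      printf("restart diamond search at step_param + %d\n", n);
    }
  }
  return 0;
}
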
+ const int further_steps = cfg->ss_count - 1 - step_param; + while (n < further_steps) { + ++n; + + if (num00) { + num00--; + } else { + // TODO(chiyotsai@google.com): There is another bug here where the second + // best mv gets incorrectly overwritten. Fix it later. + FULLPEL_MV tmp_best_mv; + thissme = diamond_search_sad(start_mv, ms_params, step_param + n, &num00, + &tmp_best_mv, second_best_mv); + + if (thissme < INT_MAX) { + thissme = get_mvpred_compound_var_cost(ms_params, &tmp_best_mv); + } + + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = tmp_best_mv; + } + } + } + + // Return cost list. + if (cost_list) { + if (USE_SAD_COSTLIST) { + const int costlist_has_sad = 0; + calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad); + } else { + calc_int_cost_list(*best_mv, ms_params, cost_list); + } + } + return bestsme; +} + +// Exhaustive motion search around a given centre position with a given +// step size. +static int exhaustive_mesh_search(FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int range, const int step, + FULLPEL_MV *best_mv, + FULLPEL_MV *second_best_mv) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const int ref_stride = ref->stride; + unsigned int best_sad = INT_MAX; + int r, c, i; + int start_col, end_col, start_row, end_row; + int col_step = (step > 1) ? step : 4; + + assert(step >= 1); + + clamp_fullmv(&start_mv, &ms_params->mv_limits); + *best_mv = start_mv; + best_sad = get_mvpred_sad(ms_params, src, get_buf_from_fullmv(ref, &start_mv), + ref_stride); + best_sad += mvsad_err_cost_(&start_mv, mv_cost_params); + start_row = AOMMAX(-range, ms_params->mv_limits.row_min - start_mv.row); + start_col = AOMMAX(-range, ms_params->mv_limits.col_min - start_mv.col); + end_row = AOMMIN(range, ms_params->mv_limits.row_max - start_mv.row); + end_col = AOMMIN(range, ms_params->mv_limits.col_max - start_mv.col); + + for (r = start_row; r <= end_row; r += step) { + for (c = start_col; c <= end_col; c += col_step) { + // Step > 1 means we are not checking every location in this pass. 
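
Note col_step above: when step == 1 the column loop still advances four at a
time, because the else-branch below batches four adjacent columns per sdx4df
call. A standalone sketch of the traversal (range and step chosen arbitrarily):

#include <stdio.h>

int main(void) {
  const int range = 8, step = 4;
  const int col_step = (step > 1) ? step : 4;  // as in exhaustive_mesh_search
  int probes = 0;
  for (int r = -range; r <= range; r += step)
    for (int c = -range; c <= range; c += col_step) ++probes;
  printf("%d probes for range %d, step %d\n", probes, range, step);  // 25
  return 0;
}
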
+      if (step > 1) {
+        const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c };
+        unsigned int sad = get_mvpred_sad(
+            ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost_(&mv, mv_cost_params);
+          if (sad < best_sad) {
+            best_sad = sad;
+            if (second_best_mv) {
+              *second_best_mv = *best_mv;
+            }
+            *best_mv = mv;
+          }
+        }
+      } else {
+        // 4 sads in a single call if we are checking every location
+        if (c + 3 <= end_col) {
+          unsigned int sads[4];
+          const uint8_t *addrs[4];
+          for (i = 0; i < 4; ++i) {
+            const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+            addrs[i] = get_buf_from_fullmv(ref, &mv);
+          }
+          vfp->sdx4df(src->buf, src->stride, addrs, ref_stride, sads);
+
+          for (i = 0; i < 4; ++i) {
+            if (sads[i] < best_sad) {
+              const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+              const unsigned int sad =
+                  sads[i] + mvsad_err_cost_(&mv, mv_cost_params);
+              if (sad < best_sad) {
+                best_sad = sad;
+                if (second_best_mv) {
+                  *second_best_mv = *best_mv;
+                }
+                *best_mv = mv;
+              }
+            }
+          }
+        } else {
+          for (i = 0; i < end_col - c; ++i) {
+            const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+            unsigned int sad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
+            if (sad < best_sad) {
+              sad += mvsad_err_cost_(&mv, mv_cost_params);
+              if (sad < best_sad) {
+                best_sad = sad;
+                if (second_best_mv) {
+                  *second_best_mv = *best_mv;
+                }
+                *best_mv = mv;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return best_sad;
+}
+
+// Runs a limited-range exhaustive mesh search using a pattern set
+// according to the encode speed profile.
+static int full_pixel_exhaustive(const FULLPEL_MV start_mv,
+                                 const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                 const struct MESH_PATTERN *const mesh_patterns,
+                                 int *cost_list, FULLPEL_MV *best_mv,
+                                 FULLPEL_MV *second_best_mv) {
+  const int kMinRange = 7;
+  const int kMaxRange = 256;
+  const int kMinInterval = 1;
+
+  int bestsme;
+  int i;
+  int interval = mesh_patterns[0].interval;
+  int range = mesh_patterns[0].range;
+  int baseline_interval_divisor;
+
+  *best_mv = start_mv;
+
+  // Trap illegal values for interval and range for this function.
+  if ((range < kMinRange) || (range > kMaxRange) || (interval < kMinInterval) ||
+      (interval > range))
+    return INT_MAX;
+
+  baseline_interval_divisor = range / interval;
+
+  // Check size of proposed first range against magnitude of the centre
+  // value used as a starting point.
+  range = AOMMAX(range, (5 * AOMMAX(abs(best_mv->row), abs(best_mv->col))) / 4);
+  range = AOMMIN(range, kMaxRange);
+  interval = AOMMAX(interval, range / baseline_interval_divisor);
+
+  // initial search
+  bestsme = exhaustive_mesh_search(*best_mv, ms_params, range, interval,
+                                   best_mv, second_best_mv);
+
+  if ((interval > kMinInterval) && (range > kMinRange)) {
+    // Progressive searches with range and step size decreasing each time
+    // till we reach a step size of 1. Then break out.
+    for (i = 1; i < MAX_MESH_STEP; ++i) {
+      // First pass with coarser step and longer range
+      bestsme = exhaustive_mesh_search(
+          *best_mv, ms_params, mesh_patterns[i].range,
+          mesh_patterns[i].interval, best_mv, second_best_mv);
+
+      if (mesh_patterns[i].interval == 1) break;
+    }
+  }
+
+  if (bestsme < INT_MAX) {
+    bestsme = get_mvpred_var_cost(ms_params, best_mv);
+  }
+
+  // Return cost list.
+ if (cost_list) { + if (USE_SAD_COSTLIST) { + const int costlist_has_sad = 0; + calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad); + } else { + calc_int_cost_list(*best_mv, ms_params, cost_list); + } + } + return bestsme; +} + +// This function is called when we do joint motion search in comp_inter_inter +// mode, or when searching for one component of an ext-inter compound mode. +int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const FULLPEL_MV start_mv, FULLPEL_MV *best_mv) { + static const search_neighbors neighbors[8] = { + { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 }, + { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 }, + { { 0, 1 }, 0 * SEARCH_GRID_STRIDE_8P + 1 }, + { { 1, 0 }, 1 * SEARCH_GRID_STRIDE_8P + 0 }, + { { -1, -1 }, -1 * SEARCH_GRID_STRIDE_8P - 1 }, + { { 1, -1 }, 1 * SEARCH_GRID_STRIDE_8P - 1 }, + { { -1, 1 }, -1 * SEARCH_GRID_STRIDE_8P + 1 }, + { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 } + }; + + uint8_t do_refine_search_grid[SEARCH_GRID_STRIDE_8P * + SEARCH_GRID_STRIDE_8P] = { 0 }; + int grid_center = SEARCH_GRID_CENTER_8P; + int grid_coord = grid_center; + + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const FullMvLimits *mv_limits = &ms_params->mv_limits; + const MSBuffers *ms_buffers = &ms_params->ms_buffers; + const struct buf_2d *src = ms_buffers->src; + const struct buf_2d *ref = ms_buffers->ref; + const int ref_stride = ref->stride; + + *best_mv = start_mv; + clamp_fullmv(best_mv, mv_limits); + + unsigned int best_sad = get_mvpred_compound_sad( + ms_params, src, get_buf_from_fullmv(ref, best_mv), ref_stride); + best_sad += mvsad_err_cost_(best_mv, mv_cost_params); + + do_refine_search_grid[grid_coord] = 1; + + for (int i = 0; i < SEARCH_RANGE_8P; ++i) { + int best_site = -1; + + for (int j = 0; j < 8; ++j) { + grid_coord = grid_center + neighbors[j].coord_offset; + if (do_refine_search_grid[grid_coord] == 1) { + continue; + } + const FULLPEL_MV mv = { best_mv->row + neighbors[j].coord.row, + best_mv->col + neighbors[j].coord.col }; + + do_refine_search_grid[grid_coord] = 1; + if (av1_is_fullmv_in_range(mv_limits, mv)) { + unsigned int sad; + sad = get_mvpred_compound_sad( + ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride); + if (sad < best_sad) { + sad += mvsad_err_cost_(&mv, mv_cost_params); + + if (sad < best_sad) { + best_sad = sad; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + best_mv->row += neighbors[best_site].coord.row; + best_mv->col += neighbors[best_site].coord.col; + grid_center += neighbors[best_site].coord_offset; + } + } + return best_sad; +} + +int av1_full_pixel_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int step_param, int *cost_list, + FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) { + const BLOCK_SIZE bsize = ms_params->bsize; + const SEARCH_METHODS search_method = ms_params->search_method; + + const int is_intra_mode = ms_params->is_intra_mode; + int run_mesh_search = ms_params->run_mesh_search; + + int var = 0; + MARK_MV_INVALID(best_mv); + if (second_best_mv) { + MARK_MV_INVALID(second_best_mv); + } + + assert(ms_params->ms_buffers.second_pred == NULL && + ms_params->ms_buffers.mask == NULL && + "av1_full_pixel_search does not support compound pred"); + + if (cost_list) { + cost_list[0] = INT_MAX; + cost_list[1] = INT_MAX; + cost_list[2] = INT_MAX; + cost_list[3] = INT_MAX; + cost_list[4] = INT_MAX; + } + + switch (search_method) { + case FAST_DIAMOND: + var = 
fast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
+                            best_mv);
+      break;
+    case FAST_HEX:
+      var = fast_hex_search(start_mv, ms_params, step_param, 0, cost_list,
+                            best_mv);
+      break;
+    case HEX:
+      var = hex_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
+      break;
+    case SQUARE:
+      var =
+          square_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
+      break;
+    case BIGDIA:
+      var =
+          bigdia_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
+      break;
+    case NSTEP:
+    case DIAMOND:
+      var = full_pixel_diamond(start_mv, ms_params, step_param, cost_list,
+                               best_mv, second_best_mv);
+      break;
+    default: assert(0 && "Invalid search method.");
+  }
+
+  // Should we allow a follow-on exhaustive search?
+  if (!run_mesh_search && search_method == NSTEP) {
+    int exhaustive_thr = ms_params->force_mesh_thresh;
+    exhaustive_thr >>=
+        10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+    // Threshold variance for an exhaustive full search.
+    if (var > exhaustive_thr) run_mesh_search = 1;
+  }
+
+  // TODO(yunqing): the following is used to reduce mesh search in temporal
+  // filtering. Can extend it to intrabc.
+  if (!is_intra_mode && ms_params->prune_mesh_search) {
+    const int full_pel_mv_diff = AOMMAX(abs(start_mv.row - best_mv->row),
+                                        abs(start_mv.col - best_mv->col));
+    if (full_pel_mv_diff <= 4) {
+      run_mesh_search = 0;
+    }
+  }
+
+  if (run_mesh_search) {
+    int var_ex;
+    FULLPEL_MV tmp_mv_ex;
+    // Pick the mesh pattern for exhaustive search based on the toolset
+    // (intraBC or non-intraBC).
+    // TODO(chiyotsai@google.com): There is a bug here where the second best mv
+    // gets overwritten without actually comparing the rdcost.
+    const MESH_PATTERN *const mesh_patterns =
+        ms_params->mesh_patterns[is_intra_mode];
+    // TODO(chiyotsai@google.com): the second best mv is not set correctly by
+    // full_pixel_exhaustive, which can incorrectly override it.
+ var_ex = full_pixel_exhaustive(*best_mv, ms_params, mesh_patterns, + cost_list, &tmp_mv_ex, second_best_mv); + if (var_ex < var) { + var = var_ex; + *best_mv = tmp_mv_ex; + } + } + + return var; +} + +int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + IntraBCHashInfo *intrabc_hash_info, + FULLPEL_MV *best_mv) { + if (!av1_use_hash_me(cpi)) return INT_MAX; + + const BLOCK_SIZE bsize = ms_params->bsize; + const int block_width = block_size_wide[bsize]; + const int block_height = block_size_high[bsize]; + + if (block_width != block_height) return INT_MAX; + + const FullMvLimits *mv_limits = &ms_params->mv_limits; + const MSBuffers *ms_buffer = &ms_params->ms_buffers; + + const uint8_t *src = ms_buffer->src->buf; + const int src_stride = ms_buffer->src->stride; + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int x_pos = mi_col * MI_SIZE; + const int y_pos = mi_row * MI_SIZE; + + uint32_t hash_value1, hash_value2; + int best_hash_cost = INT_MAX; + + // for the hashMap + hash_table *ref_frame_hash = &intrabc_hash_info->intrabc_hash_table; + + av1_get_block_hash_value(intrabc_hash_info, src, src_stride, block_width, + &hash_value1, &hash_value2, is_cur_buf_hbd(xd)); + + const int count = av1_hash_table_count(ref_frame_hash, hash_value1); + if (count <= 1) { + return INT_MAX; + } + + Iterator iterator = av1_hash_get_first_iterator(ref_frame_hash, hash_value1); + for (int i = 0; i < count; i++, aom_iterator_increment(&iterator)) { + block_hash ref_block_hash = *(block_hash *)(aom_iterator_get(&iterator)); + if (hash_value2 == ref_block_hash.hash_value2) { + // Make sure the prediction is from valid area. + const MV dv = { GET_MV_SUBPEL(ref_block_hash.y - y_pos), + GET_MV_SUBPEL(ref_block_hash.x - x_pos) }; + if (!av1_is_dv_valid(dv, &cpi->common, xd, mi_row, mi_col, bsize, + cpi->common.seq_params.mib_size_log2)) + continue; + + FULLPEL_MV hash_mv; + hash_mv.col = ref_block_hash.x - x_pos; + hash_mv.row = ref_block_hash.y - y_pos; + if (!av1_is_fullmv_in_range(mv_limits, hash_mv)) continue; + const int refCost = get_mvpred_var_cost(ms_params, &hash_mv); + if (refCost < best_hash_cost) { + best_hash_cost = refCost; + *best_mv = hash_mv; + } + } + } + + return best_hash_cost; +} + +static int vector_match(int16_t *ref, int16_t *src, int bwl) { + int best_sad = INT_MAX; + int this_sad; + int d; + int center, offset = 0; + int bw = 4 << bwl; // redundant variable, to be changed in the experiments. 
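
vector_match() is a coarse-to-fine 1-D search: the stride-16 scan just below
picks a coarse offset, and the later loops refine it with probe pairs at +/-8,
+/-4, +/-2 and +/-1 around the running best. A compact standalone equivalent
of the refinement stages (toy_cost() is a hypothetical stand-in for
aom_vector_var() on the projected samples):

#include <stdio.h>
#include <stdlib.h>

static int toy_cost(int pos) { return abs(pos - 21); }  // minimum at pos 21

// Each stage probes +/-radius around the previous stage's best position.
static int refine_1d(int (*cost)(int), int center, int best, int bw) {
  static const int radii[] = { 8, 4, 2, 1 };
  for (int i = 0; i < 4; ++i) {
    const int base = center;
    for (int d = -radii[i]; d <= radii[i]; d += 2 * radii[i]) {
      const int pos = base + d;
      if (pos < 0 || pos > bw) continue;  // stay inside the projection
      const int c = cost(pos);
      if (c < best) {
        best = c;
        center = pos;
      }
    }
  }
  return center - (bw >> 1);  // offset relative to the block centre
}

int main(void) {
  // With bw = 32, suppose the coarse scan stopped at position 16.
  printf("offset = %d\n", refine_1d(toy_cost, 16, toy_cost(16), 32));  // 5
  return 0;
}
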
+ for (d = 0; d <= bw; d += 16) { + this_sad = aom_vector_var(&ref[d], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + offset = d; + } + } + center = offset; + + for (d = -8; d <= 8; d += 16) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -4; d <= 4; d += 8) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -2; d <= 2; d += 4) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -1; d <= 1; d += 2) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + + return (center - (bw >> 1)); +} + +// A special fast version of motion search used in rt mode +unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col, const MV *ref_mv) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mi = xd->mi[0]; + struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; + DECLARE_ALIGNED(16, int16_t, hbuf[256]); + DECLARE_ALIGNED(16, int16_t, vbuf[256]); + DECLARE_ALIGNED(16, int16_t, src_hbuf[128]); + DECLARE_ALIGNED(16, int16_t, src_vbuf[128]); + int idx; + const int bw = 4 << mi_size_wide_log2[bsize]; + const int bh = 4 << mi_size_high_log2[bsize]; + const int search_width = bw << 1; + const int search_height = bh << 1; + const int src_stride = x->plane[0].src.stride; + const int ref_stride = xd->plane[0].pre[0].stride; + uint8_t const *ref_buf, *src_buf; + int_mv *best_int_mv = &xd->mi[0]->mv[0]; + unsigned int best_sad, tmp_sad, this_sad[4]; + const int norm_factor = 3 + (bw >> 5); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]); + static const MV search_pos[4] = { + { -1, 0 }, + { 0, -1 }, + { 0, 1 }, + { 1, 0 }, + }; + + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. 
+ for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; + av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL, + MAX_MB_PLANE); + } + + if (xd->bd != 8) { + unsigned int sad; + best_int_mv->as_fullmv = kZeroFullMv; + sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, + xd->plane[0].pre[0].buf, ref_stride); + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + return sad; + } + + // Set up prediction 1-D reference set + ref_buf = xd->plane[0].pre[0].buf - (bw >> 1); + for (idx = 0; idx < search_width; idx += 16) { + aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh); + ref_buf += 16; + } + + ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride; + for (idx = 0; idx < search_height; ++idx) { + vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor; + ref_buf += ref_stride; + } + + // Set up src 1-D reference set + for (idx = 0; idx < bw; idx += 16) { + src_buf = x->plane[0].src.buf + idx; + aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh); + } + + src_buf = x->plane[0].src.buf; + for (idx = 0; idx < bh; ++idx) { + src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor; + src_buf += src_stride; + } + + // Find the best match per 1-D search + best_int_mv->as_fullmv.col = + vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize]); + best_int_mv->as_fullmv.row = + vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize]); + + FULLPEL_MV this_mv = best_int_mv->as_fullmv; + src_buf = x->plane[0].src.buf; + ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv); + best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + + { + const uint8_t *const pos[4] = { + ref_buf - ref_stride, + ref_buf - 1, + ref_buf + 1, + ref_buf + ref_stride, + }; + + cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad); + } + + for (idx = 0; idx < 4; ++idx) { + if (this_sad[idx] < best_sad) { + best_sad = this_sad[idx]; + best_int_mv->as_fullmv.row = search_pos[idx].row + this_mv.row; + best_int_mv->as_fullmv.col = search_pos[idx].col + this_mv.col; + } + } + + if (this_sad[0] < this_sad[3]) + this_mv.row -= 1; + else + this_mv.row += 1; + + if (this_sad[1] < this_sad[2]) + this_mv.col -= 1; + else + this_mv.col += 1; + + ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv); + + tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + if (best_sad > tmp_sad) { + best_int_mv->as_fullmv = this_mv; + best_sad = tmp_sad; + } + + convert_fullmv_to_mv(best_int_mv); + + SubpelMvLimits subpel_mv_limits; + av1_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); + clamp_mv(&best_int_mv->as_mv, &subpel_mv_limits); + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + + return best_sad; +} + +// ============================================================================= +// Fullpixel Motion Search: OBMC +// ============================================================================= +static INLINE int get_obmc_mvpred_var( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const MSBuffers *ms_buffers = &ms_params->ms_buffers; + const int32_t *wsrc = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const struct buf_2d *ref_buf = ms_buffers->ref; + + const MV mv = 
get_mv_from_fullmv(this_mv); + unsigned int unused; + + return vfp->ovf(get_buf_from_fullmv(ref_buf, this_mv), ref_buf->stride, wsrc, + mask, &unused) + + mv_err_cost_(&mv, mv_cost_params); +} + +static int obmc_refining_search_sad( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV *best_mv) { + const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const MSBuffers *ms_buffers = &ms_params->ms_buffers; + const int32_t *wsrc = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const struct buf_2d *ref_buf = ms_buffers->ref; + const FULLPEL_MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; + const int kSearchRange = 8; + + unsigned int best_sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, best_mv), + ref_buf->stride, wsrc, mask) + + mvsad_err_cost_(best_mv, mv_cost_params); + + for (int i = 0; i < kSearchRange; i++) { + int best_site = -1; + + for (int j = 0; j < 4; j++) { + const FULLPEL_MV mv = { best_mv->row + neighbors[j].row, + best_mv->col + neighbors[j].col }; + if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) { + unsigned int sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, &mv), + ref_buf->stride, wsrc, mask); + if (sad < best_sad) { + sad += mvsad_err_cost_(&mv, mv_cost_params); + + if (sad < best_sad) { + best_sad = sad; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + best_mv->row += neighbors[best_site].row; + best_mv->col += neighbors[best_site].col; + } + } + return best_sad; +} + +static int obmc_diamond_search_sad( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv, + FULLPEL_MV *best_mv, int search_param, int *num00) { + const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp; + const search_site_config *cfg = ms_params->search_sites; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const MSBuffers *ms_buffers = &ms_params->ms_buffers; + const int32_t *wsrc = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const struct buf_2d *const ref_buf = ms_buffers->ref; + // search_param determines the length of the initial step and hence the number + // of iterations + // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = + // (MAX_FIRST_STEP/4) pel... etc. 
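+  // num00 counts step levels that finish with the best mv still at the
+  // starting position; the caller uses it to skip re-running the search at
+  // those finer scales.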
+ + const int tot_steps = MAX_MVSEARCH_STEPS - 1 - search_param; + const uint8_t *best_address, *init_ref; + int best_sad = INT_MAX; + int best_site = 0; + int step; + + clamp_fullmv(&start_mv, &ms_params->mv_limits); + best_address = init_ref = get_buf_from_fullmv(ref_buf, &start_mv); + *num00 = 0; + *best_mv = start_mv; + + // Check the starting position + best_sad = fn_ptr->osdf(best_address, ref_buf->stride, wsrc, mask) + + mvsad_err_cost_(best_mv, mv_cost_params); + + for (step = tot_steps; step >= 0; --step) { + const search_site *const ss = cfg->ss[step]; + best_site = 0; + for (int idx = 1; idx <= cfg->searches_per_step[step]; ++idx) { + const FULLPEL_MV mv = { best_mv->row + ss[idx].mv.row, + best_mv->col + ss[idx].mv.col }; + if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) { + int sad = fn_ptr->osdf(best_address + ss[idx].offset, ref_buf->stride, + wsrc, mask); + if (sad < best_sad) { + sad += mvsad_err_cost_(&mv, mv_cost_params); + + if (sad < best_sad) { + best_sad = sad; + best_site = idx; + } + } + } + } + + if (best_site != 0) { + best_mv->row += ss[best_site].mv.row; + best_mv->col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + } else if (best_address == init_ref) { + (*num00)++; + } + } + return best_sad; +} + +static int obmc_full_pixel_diamond( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV start_mv, + int step_param, int do_refine, FULLPEL_MV *best_mv) { + const search_site_config *cfg = ms_params->search_sites; + FULLPEL_MV tmp_mv; + int thissme, n, num00 = 0; + int bestsme = + obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv, step_param, &n); + if (bestsme < INT_MAX) bestsme = get_obmc_mvpred_var(ms_params, &tmp_mv); + *best_mv = tmp_mv; + + // If there won't be more n-step search, check to see if refining search is + // needed. + const int further_steps = cfg->ss_count - 1 - step_param; + if (n > further_steps) do_refine = 0; + + while (n < further_steps) { + ++n; + + if (num00) { + num00--; + } else { + thissme = obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv, + step_param + n, &num00); + if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, &tmp_mv); + + // check to see if refining search is needed. 
+ if (num00 > further_steps - n) do_refine = 0; + + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = tmp_mv; + } + } + } + + // final 1-away diamond refining search + if (do_refine) { + tmp_mv = *best_mv; + thissme = obmc_refining_search_sad(ms_params, &tmp_mv); + if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, &tmp_mv); + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = tmp_mv; + } + } + return bestsme; +} + +int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int step_param, FULLPEL_MV *best_mv) { + if (!ms_params->fast_obmc_search) { + const int do_refine = 1; + const int bestsme = obmc_full_pixel_diamond(ms_params, start_mv, step_param, + do_refine, best_mv); + return bestsme; + } else { + *best_mv = start_mv; + clamp_fullmv(best_mv, &ms_params->mv_limits); + int thissme = obmc_refining_search_sad(ms_params, best_mv); + if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, best_mv); + return thissme; + } +} + +// ============================================================================= +// Subpixel Motion Search: Translational +// ============================================================================= +#define INIT_SUBPEL_STEP_SIZE (4) +/* + * To avoid the penalty for crossing cache-line read, preload the reference + * area in a small buffer, which is aligned to make sure there won't be crossing + * cache-line read while reading from this buffer. This reduced the cpu + * cycles spent on reading ref data in sub-pixel filter functions. + * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x + * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we + * could reduce the area. + */ + +// Returns the subpel offset used by various subpel variance functions [m]sv[a]f +static INLINE int get_subpel_part(int x) { return x & 7; } + +// Gets the address of the ref buffer at subpel location (r, c), rounded to the +// nearest fullpel precision toward - \infty + +static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, + const MV mv) { + const int offset = (mv.row >> 3) * buf->stride + (mv.col >> 3); + return &buf->buf[offset]; +} + +// Estimates the variance of prediction residue using bilinear filter for fast +// search. +static INLINE int estimated_pref_error( + const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, + unsigned int *sse) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const uint8_t *src = ms_buffers->src->buf; + const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv); + const int src_stride = ms_buffers->src->stride; + const int ref_stride = ms_buffers->ref->stride; + const uint8_t *second_pred = ms_buffers->second_pred; + const uint8_t *mask = ms_buffers->mask; + const int mask_stride = ms_buffers->mask_stride; + const int invert_mask = ms_buffers->inv_mask; + + const int subpel_x_q3 = get_subpel_part(this_mv->col); + const int subpel_y_q3 = get_subpel_part(this_mv->row); + + if (second_pred == NULL) { + return vfp->svf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride, + sse); + } else if (mask) { + return vfp->msvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride, + second_pred, mask, mask_stride, invert_mask, sse); + } else { + return vfp->svaf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride, + sse, second_pred); + } +} + +// Calculates the variance of prediction residue. 
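+// Unlike estimated_pref_error() above, this builds the exact sub-pixel
+// prediction (via the aom_*upsampled_pred helpers, honoring
+// subpel_search_type) before measuring the variance: slower, but accurate.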
+static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm, + const MV *this_mv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + unsigned int *sse) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type; + + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const uint8_t *src = ms_buffers->src->buf; + const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv); + const int src_stride = ms_buffers->src->stride; + const int ref_stride = ms_buffers->ref->stride; + const uint8_t *second_pred = ms_buffers->second_pred; + const uint8_t *mask = ms_buffers->mask; + const int mask_stride = ms_buffers->mask_stride; + const int invert_mask = ms_buffers->inv_mask; + const int w = var_params->w; + const int h = var_params->h; + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int subpel_x_q3 = get_subpel_part(this_mv->col); + const int subpel_y_q3 = get_subpel_part(this_mv->row); + + unsigned int besterr; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); + uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16); + if (second_pred != NULL) { + if (mask) { + aom_highbd_comp_mask_upsampled_pred( + xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride, + invert_mask, xd->bd, subpel_search_type); + } else { + aom_highbd_comp_avg_upsampled_pred( + xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd, + subpel_search_type); + } + } else { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + xd->bd, subpel_search_type); + } + besterr = vfp->vf(pred8, w, src, src_stride, sse); + } else { + DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); + if (second_pred != NULL) { + if (mask) { + aom_comp_mask_upsampled_pred( + xd, cm, mi_row, mi_col, this_mv, pred, second_pred, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride, + invert_mask, subpel_search_type); + } else { + aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, + second_pred, w, h, subpel_x_q3, subpel_y_q3, + ref, ref_stride, subpel_search_type); + } + } else { + aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search_type); + } + + besterr = vfp->vf(pred, w, src, src_stride, sse); + } +#else + DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); + if (second_pred != NULL) { + if (mask) { + aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, + second_pred, w, h, subpel_x_q3, subpel_y_q3, + ref, ref_stride, mask, mask_stride, + invert_mask, subpel_search_type); + } else { + aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, + second_pred, w, h, subpel_x_q3, subpel_y_q3, + ref, ref_stride, subpel_search_type); + } + } else { + aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3, + subpel_y_q3, ref, ref_stride, subpel_search_type); + } + + besterr = vfp->vf(pred, w, src, src_stride, sse); +#endif + return besterr; +} + +// Estimates whether this_mv is better than best_mv. This function incorporates +// both prediction error and residue into account. It is suffixed "fast" because +// it uses bilinear filter to estimate the prediction. 
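+// The total cost compared against *besterr is thismse plus the rate cost of
+// signaling the mv (mv_err_cost_); best_mv and the output stats are updated
+// only when that total improves.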
+static INLINE unsigned int check_better_fast( + const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int *has_better_mv) { + unsigned int cost; + if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) { + unsigned int sse; + int thismse = estimated_pref_error(this_mv, var_params, &sse); + cost = mv_err_cost_(this_mv, mv_cost_params); + cost += thismse; + + if (cost < *besterr) { + *besterr = cost; + *best_mv = *this_mv; + *distortion = thismse; + *sse1 = sse; + *has_better_mv |= 1; + } + } else { + cost = INT_MAX; + } + return cost; +} + +// Checks whether this_mv is better than best_mv. This function incorporates +// both prediction error and residue into account. +static AOM_FORCE_INLINE unsigned int check_better( + MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv, + const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int *is_better) { + unsigned int cost; + if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) { + unsigned int sse; + int thismse; + thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse); + cost = mv_err_cost_(this_mv, mv_cost_params); + cost += thismse; + if (cost < *besterr) { + *besterr = cost; + *best_mv = *this_mv; + *distortion = thismse; + *sse1 = sse; + *is_better |= 1; + } + } else { + cost = INT_MAX; + } + return cost; +} + +static INLINE MV get_best_diag_step(int step_size, unsigned int left_cost, + unsigned int right_cost, + unsigned int up_cost, + unsigned int down_cost) { + const MV diag_step = { up_cost <= down_cost ? -step_size : step_size, + left_cost <= right_cost ? -step_size : step_size }; + + return diag_step; +} + +// Searches the four cardinal direction for a better mv, then follows up with a +// search in the best quadrant. This uses bilinear filter to speed up the +// calculation. 
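+// The single diagonal probe combines the cheaper of left/right with the
+// cheaper of up/down, as picked by get_best_diag_step().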
+static AOM_FORCE_INLINE MV first_level_check_fast( + const MV this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + // Check the four cardinal directions + const MV left_mv = { this_mv.row, this_mv.col - hstep }; + int dummy = 0; + const unsigned int left = + check_better_fast(&left_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + const MV right_mv = { this_mv.row, this_mv.col + hstep }; + const unsigned int right = + check_better_fast(&right_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + const MV top_mv = { this_mv.row - hstep, this_mv.col }; + const unsigned int up = + check_better_fast(&top_mv, best_mv, mv_limits, var_params, mv_cost_params, + besterr, sse1, distortion, &dummy); + + const MV bottom_mv = { this_mv.row + hstep, this_mv.col }; + const unsigned int down = + check_better_fast(&bottom_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + const MV diag_step = get_best_diag_step(hstep, left, right, up, down); + const MV diag_mv = { this_mv.row + diag_step.row, + this_mv.col + diag_step.col }; + + // Check the diagonal direction with the best mv + check_better_fast(&diag_mv, best_mv, mv_limits, var_params, mv_cost_params, + besterr, sse1, distortion, &dummy); + + return diag_step; +} + +// Performs a following up search after first_level_check_fast is called. This +// performs two extra chess pattern searches in the best quadrant. +static AOM_FORCE_INLINE void second_level_check_fast( + const MV this_mv, const MV diag_step, MV *best_mv, int hstep, + const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + assert(diag_step.row == hstep || diag_step.row == -hstep); + assert(diag_step.col == hstep || diag_step.col == -hstep); + const int tr = this_mv.row; + const int tc = this_mv.col; + const int br = best_mv->row; + const int bc = best_mv->col; + int dummy = 0; + if (tr != br && tc != bc) { + assert(diag_step.col == bc - tc); + assert(diag_step.row == br - tr); + const MV chess_mv_1 = { br, bc + diag_step.col }; + const MV chess_mv_2 = { br + diag_step.row, bc }; + check_better_fast(&chess_mv_1, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + check_better_fast(&chess_mv_2, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + } else if (tr == br && tc != bc) { + assert(diag_step.col == bc - tc); + // Continue searching in the best direction + const MV bottom_long_mv = { br + hstep, bc + diag_step.col }; + const MV top_long_mv = { br - hstep, bc + diag_step.col }; + check_better_fast(&bottom_long_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + check_better_fast(&top_long_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + // Search in the direction opposite of the best quadrant + const MV rev_mv = { br - diag_step.row, bc }; + check_better_fast(&rev_mv, best_mv, mv_limits, var_params, mv_cost_params, + besterr, sse1, distortion, &dummy); + } else if (tr != br && tc == bc) { + assert(diag_step.row == br - tr); + // Continue searching in the best direction + const MV right_long_mv = { br + diag_step.row, bc + 
hstep }; + const MV left_long_mv = { br + diag_step.row, bc - hstep }; + check_better_fast(&right_long_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + check_better_fast(&left_long_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + // Search in the direction opposite of the best quadrant + const MV rev_mv = { br, bc - diag_step.col }; + check_better_fast(&rev_mv, best_mv, mv_limits, var_params, mv_cost_params, + besterr, sse1, distortion, &dummy); + } +} + +// Combines first level check and second level check when applicable. This first +// searches the four cardinal directions, and perform several +// diagonal/chess-pattern searches in the best quadrant. +static AOM_FORCE_INLINE void two_level_checks_fast( + const MV this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int iters) { + const MV diag_step = + first_level_check_fast(this_mv, best_mv, hstep, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion); + if (iters > 1) { + second_level_check_fast(this_mv, diag_step, best_mv, hstep, mv_limits, + var_params, mv_cost_params, besterr, sse1, + distortion); + } +} + +static AOM_FORCE_INLINE MV +first_level_check(MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, + MV *best_mv, const int hstep, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + int dummy = 0; + const MV left_mv = { this_mv.row, this_mv.col - hstep }; + const MV right_mv = { this_mv.row, this_mv.col + hstep }; + const MV top_mv = { this_mv.row - hstep, this_mv.col }; + const MV bottom_mv = { this_mv.row + hstep, this_mv.col }; + + const unsigned int left = + check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int right = + check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int up = + check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int down = + check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + const MV diag_step = get_best_diag_step(hstep, left, right, up, down); + const MV diag_mv = { this_mv.row + diag_step.row, + this_mv.col + diag_step.col }; + + // Check the diagonal direction with the best mv + check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params, mv_cost_params, + besterr, sse1, distortion, &dummy); + + return diag_step; +} + +// A newer version of second level check that gives better quality. 
+// TODO(chiyotsai@google.com): evaluate this on subpel_search_types different +// from av1_find_best_sub_pixel_tree +static AOM_FORCE_INLINE void second_level_check_v2( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step, + MV *best_mv, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + assert(best_mv->row == this_mv.row + diag_step.row || + best_mv->col == this_mv.col + diag_step.col); + if (CHECK_MV_EQUAL(this_mv, *best_mv)) { + return; + } else if (this_mv.row == best_mv->row) { + // Search away from diagonal step since diagonal search did not provide any + // improvement + diag_step.row *= -1; + } else if (this_mv.col == best_mv->col) { + diag_step.col *= -1; + } + + const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col }; + const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col }; + const MV diag_bias_mv = { best_mv->row + diag_step.row, + best_mv->col + diag_step.col }; + int has_better_mv = 0; + + if (var_params->subpel_search_type != USE_2_TAPS_ORIG) { + check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &has_better_mv); + check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &has_better_mv); + + // Do an additional search if the second iteration gives a better mv + if (has_better_mv) { + check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &has_better_mv); + } + } else { + check_better_fast(&row_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + check_better_fast(&col_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + + // Do an additional search if the second iteration gives a better mv + if (has_better_mv) { + check_better_fast(&diag_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + } + } +} + +// Gets the error at the beginning when the mv has fullpel precision +static unsigned int setup_center_error( + const MACROBLOCKD *xd, const MV *bestmv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + const int w = var_params->w; + const int h = var_params->h; + + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const uint8_t *src = ms_buffers->src->buf; + const uint8_t *y = get_buf_from_mv(ms_buffers->ref, *bestmv); + const int src_stride = ms_buffers->src->stride; + const int y_stride = ms_buffers->ref->stride; + const uint8_t *second_pred = ms_buffers->second_pred; + const uint8_t *mask = ms_buffers->mask; + const int mask_stride = ms_buffers->mask_stride; + const int invert_mask = ms_buffers->inv_mask; + + unsigned int besterr; + + if (second_pred != NULL) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]); + uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16); + if (mask) { + aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, + mask, mask_stride, invert_mask); + } else { + aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + } + besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); + } else { + DECLARE_ALIGNED(16, 
uint8_t, comp_pred[MAX_SB_SQUARE]); + if (mask) { + aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask, + mask_stride, invert_mask); + } else { + aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + } + besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); + } +#else + (void)xd; + DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); + if (mask) { + aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask, + mask_stride, invert_mask); + } else { + aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + } + besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); +#endif + } else { + besterr = vfp->vf(y, y_stride, src, src_stride, sse1); + } + *distortion = besterr; + besterr += mv_err_cost_(bestmv, mv_cost_params); + return besterr; +} + +// Gets the error at the beginning when the mv has fullpel precision +static unsigned int upsampled_setup_center_error( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *bestmv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) { + unsigned int besterr = upsampled_pref_error(xd, cm, bestmv, var_params, sse1); + *distortion = besterr; + besterr += mv_err_cost_(bestmv, mv_cost_params); + return besterr; +} + +static INLINE int divide_and_round(int n, int d) { + return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d); +} + +static INLINE int is_cost_list_wellbehaved(const int *cost_list) { + return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] && + cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4]; +} + +// Returns surface minima estimate at given precision in 1/2^n bits. +// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C +// For a given set of costs S0, S1, S2, S3, S4 at points +// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively, +// the solution for the location of the minima (x0, y0) is given by: +// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0), +// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0). +// The code below is an integerized version of that. +static AOM_INLINE void get_cost_surf_min(const int *cost_list, int *ir, int *ic, + int bits) { + *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)), + (cost_list[1] - 2 * cost_list[0] + cost_list[3])); + *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)), + (cost_list[4] - 2 * cost_list[0] + cost_list[2])); +} + +// Checks the list of mvs searched in the last iteration and see if we are +// repeating it. If so, return 1. Otherwise we update the last_mv_search_list +// with current_mv and return 0. 
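+// In the callers below, hitting a repeated mv aborts the search with INT_MAX,
+// which appears intended to avoid retracing a search already performed.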
+static INLINE int check_repeated_mv_and_update(int_mv *last_mv_search_list,
+                                               const MV current_mv, int iter) {
+  if (last_mv_search_list) {
+    if (CHECK_MV_EQUAL(last_mv_search_list[iter].as_mv, current_mv)) {
+      return 1;
+    }
+
+    last_mv_search_list[iter].as_mv = current_mv;
+  }
+  return 0;
+}
+
+int av1_find_best_sub_pixel_tree_pruned_evenmore(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
+    int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
+  (void)cm;
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int *cost_list = ms_params->cost_list;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are currently searching for. Iter 0 corresponds to
+  // fullpel mv, iter 1 to half pel, and so on
+  int iter = 0;
+  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
+  unsigned int besterr = INT_MAX;
+  *bestmv = start_mv;
+
+  besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+                               distortion);
+
+  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+    return INT_MAX;
+  }
+  iter++;
+
+  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+    int ir, ic;
+    int dummy = 0;
+    get_cost_surf_min(cost_list, &ir, &ic, 2);
+    if (ir != 0 || ic != 0) {
+      const MV this_mv = { start_mv.row + 2 * ir, start_mv.col + 2 * ic };
+      check_better_fast(&this_mv, bestmv, mv_limits, var_params, mv_cost_params,
+                        &besterr, sse1, distortion, &dummy);
+    }
+  } else {
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+
+    // Each subsequent iteration checks at least one point (two, if the
+    // diagonal was selected) in common with the last iteration before moving
+    // to the next, finer precision.
+    if (forced_stop != HALF_PEL) {
+      if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+        return INT_MAX;
+      }
+      iter++;
+
+      hstep >>= 1;
+      start_mv = *bestmv;
+      two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                            mv_cost_params, &besterr, sse1, distortion,
+                            iters_per_step);
+    }
+  }
+
+  if (allow_hp && forced_stop == EIGHTH_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  return besterr;
+}
+
+int av1_find_best_sub_pixel_tree_pruned_more(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
+    int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
+  (void)cm;
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int *cost_list = ms_params->cost_list;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are 
currently searching for. Iter 0 corresponds to
+  // fullpel mv, iter 1 to half pel, and so on
+  int iter = 0;
+  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
+  unsigned int besterr = INT_MAX;
+  *bestmv = start_mv;
+
+  besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+                               distortion);
+
+  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+    return INT_MAX;
+  }
+  iter++;
+
+  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+    int ir, ic;
+    get_cost_surf_min(cost_list, &ir, &ic, 1);
+    if (ir != 0 || ic != 0) {
+      const MV this_mv = { start_mv.row + ir * hstep,
+                           start_mv.col + ic * hstep };
+      int dummy = 0;
+      check_better_fast(&this_mv, bestmv, mv_limits, var_params, mv_cost_params,
+                        &besterr, sse1, distortion, &dummy);
+    }
+  } else {
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  // Each subsequent iteration checks at least one point (two, if the
+  // diagonal was selected) in common with the last iteration before moving
+  // to the next, finer precision.
+  if (forced_stop != HALF_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  if (allow_hp && forced_stop == EIGHTH_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  return besterr;
+}
+
+int av1_find_best_sub_pixel_tree_pruned(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
+    int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
+  (void)cm;
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int *cost_list = ms_params->cost_list;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are currently searching for. Iter 0 corresponds to
+  // fullpel mv, iter 1 to half pel, and so on
+  int iter = 0;
+  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
+  unsigned int besterr = INT_MAX;
+  *bestmv = start_mv;
+
+  besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+                               distortion);
+  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+    return INT_MAX;
+  }
+  iter++;
+
+  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX) {
+    const unsigned int whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
+                                  (cost_list[2] < cost_list[4] ? 
0 : 2);
+
+    const MV left_mv = { start_mv.row, start_mv.col - hstep };
+    const MV right_mv = { start_mv.row, start_mv.col + hstep };
+    const MV bottom_mv = { start_mv.row + hstep, start_mv.col };
+    const MV top_mv = { start_mv.row - hstep, start_mv.col };
+
+    const MV bottom_left_mv = { start_mv.row + hstep, start_mv.col - hstep };
+    const MV bottom_right_mv = { start_mv.row + hstep, start_mv.col + hstep };
+    const MV top_left_mv = { start_mv.row - hstep, start_mv.col - hstep };
+    const MV top_right_mv = { start_mv.row - hstep, start_mv.col + hstep };
+
+    int dummy = 0;
+
+    switch (whichdir) {
+      case 0:  // bottom left quadrant
+        check_better_fast(&left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&bottom_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&bottom_left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        break;
+      case 1:  // bottom right quadrant
+        check_better_fast(&right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&bottom_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&bottom_right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        break;
+      case 2:  // top left quadrant
+        check_better_fast(&left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&top_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&top_left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        break;
+      case 3:  // top right quadrant
+        check_better_fast(&right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&top_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&top_right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        break;
+    }
+  } else {
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  // Each subsequent iteration checks at least one point (two, if the
+  // diagonal was selected) in common with the last iteration before moving
+  // to the next, finer precision.
+  if (forced_stop != HALF_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  if (allow_hp && forced_stop == EIGHTH_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  return besterr;
+}
+
+int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                 const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                 MV start_mv, MV *bestmv, int *distortion,
+                                 unsigned int *sse1,
+                                 int_mv *last_mv_search_list) {
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const 
MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; + const SUBPEL_SEARCH_TYPE subpel_search_type = + ms_params->var_params.subpel_search_type; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + // How many steps to take. A round of 0 means fullpel search only, 1 means + // half-pel, and so on. + const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp); + int hstep = INIT_SUBPEL_STEP_SIZE; // Step size, initialized to 4/8=1/2 pel + + unsigned int besterr = INT_MAX; + + *bestmv = start_mv; + + if (subpel_search_type != USE_2_TAPS_ORIG) { + besterr = upsampled_setup_center_error(xd, cm, bestmv, var_params, + mv_cost_params, sse1, distortion); + } else { + besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1, + distortion); + } + + for (int iter = 0; iter < round; ++iter) { + MV iter_center_mv = *bestmv; + if (check_repeated_mv_and_update(last_mv_search_list, iter_center_mv, + iter)) { + return INT_MAX; + } + + MV diag_step; + if (subpel_search_type != USE_2_TAPS_ORIG) { + diag_step = first_level_check(xd, cm, iter_center_mv, bestmv, hstep, + mv_limits, var_params, mv_cost_params, + &besterr, sse1, distortion); + } else { + diag_step = first_level_check_fast(iter_center_mv, bestmv, hstep, + mv_limits, var_params, mv_cost_params, + &besterr, sse1, distortion); + } + + // Check diagonal sub-pixel position + if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) { + second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv, + mv_limits, var_params, mv_cost_params, &besterr, + sse1, distortion); + } + + hstep >>= 1; + } + + return besterr; +} + +// Note(yunqingwang): The following 2 functions are only used in the motion +// vector unit test, which return extreme motion vectors allowed by the MV +// limits. +// Returns the maximum MV. +int av1_return_max_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + MV start_mv, MV *bestmv, int *distortion, + unsigned int *sse1, + int_mv *last_mv_search_list) { + (void)xd; + (void)cm; + (void)start_mv; + (void)sse1; + (void)distortion; + (void)last_mv_search_list; + + const int allow_hp = ms_params->allow_hp; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + bestmv->row = mv_limits->row_max; + bestmv->col = mv_limits->col_max; + + unsigned int besterr = 0; + + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. + lower_mv_precision(bestmv, allow_hp, 0); + return besterr; +} + +// Returns the minimum MV. +int av1_return_min_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + MV start_mv, MV *bestmv, int *distortion, + unsigned int *sse1, + int_mv *last_mv_search_list) { + (void)xd; + (void)cm; + (void)start_mv; + (void)sse1; + (void)distortion; + (void)last_mv_search_list; + + const int allow_hp = ms_params->allow_hp; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + bestmv->row = mv_limits->row_min; + bestmv->col = mv_limits->col_min; + + unsigned int besterr = 0; + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. + lower_mv_precision(bestmv, allow_hp, 0); + return besterr; +} + +// Computes the cost of the current predictor by going through the whole +// av1_enc_build_inter_predictor pipeline. This is mainly used by warped mv +// during motion_mode_rd. 
We are going through the whole +// av1_enc_build_inter_predictor because we might have changed the interpolation +// filter, etc before motion_mode_rd is called. +static INLINE unsigned int compute_motion_cost( + MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, BLOCK_SIZE bsize, + const MV *this_mv) { + unsigned int mse; + unsigned int sse; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + + const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; + const MSBuffers *ms_buffers = &var_params->ms_buffers; + + const uint8_t *const src = ms_buffers->src->buf; + const int src_stride = ms_buffers->src->stride; + const uint8_t *const dst = xd->plane[0].dst.buf; + const int dst_stride = xd->plane[0].dst.stride; + const aom_variance_fn_ptr_t *vfp = ms_params->var_params.vfp; + + mse = vfp->vf(dst, dst_stride, src, src_stride, &sse); + mse += mv_err_cost_(this_mv, &ms_params->mv_cost_params); + return mse; +} + +// Refines MV in a small range +unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + BLOCK_SIZE bsize, const int *pts0, + const int *pts_inref0, int total_samples) { + MB_MODE_INFO *mbmi = xd->mi[0]; + static const MV neighbors[8] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, + { 0, -2 }, { 2, 0 }, { 0, 2 }, { -2, 0 } }; + MV *best_mv = &mbmi->mv[0].as_mv; + + WarpedMotionParams best_wm_params = mbmi->wm_params; + int best_num_proj_ref = mbmi->num_proj_ref; + unsigned int bestmse; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + const int start = ms_params->allow_hp ? 0 : 4; + + // Calculate the center position's error + assert(av1_is_subpelmv_in_range(mv_limits, *best_mv)); + bestmse = compute_motion_cost(xd, cm, ms_params, bsize, best_mv); + + // MV search + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + for (int ite = 0; ite < 2; ++ite) { + int best_idx = -1; + + for (int idx = start; idx < start + 4; ++idx) { + unsigned int thismse; + + MV this_mv = { best_mv->row + neighbors[idx].row, + best_mv->col + neighbors[idx].col }; + if (av1_is_subpelmv_in_range(mv_limits, this_mv)) { + memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); + memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); + if (total_samples > 1) + mbmi->num_proj_ref = + av1_selectSamples(&this_mv, pts, pts_inref, total_samples, bsize); + + if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, + this_mv.row, this_mv.col, &mbmi->wm_params, + mi_row, mi_col)) { + thismse = compute_motion_cost(xd, cm, ms_params, bsize, &this_mv); + + if (thismse < bestmse) { + best_idx = idx; + best_wm_params = mbmi->wm_params; + best_num_proj_ref = mbmi->num_proj_ref; + bestmse = thismse; + } + } + } + } + + if (best_idx == -1) break; + + if (best_idx >= 0) { + best_mv->row += neighbors[best_idx].row; + best_mv->col += neighbors[best_idx].col; + } + } + + mbmi->wm_params = best_wm_params; + mbmi->num_proj_ref = best_num_proj_ref; + return bestmse; +} +// ============================================================================= +// Subpixel Motion Search: OBMC +// ============================================================================= +// Estimates the variance of prediction residue +static INLINE int estimate_obmc_pref_error( + const MV *this_mv, const 
SUBPEL_SEARCH_VAR_PARAMS *var_params, + unsigned int *sse) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const int32_t *src = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv); + const int ref_stride = ms_buffers->ref->stride; + + const int subpel_x_q3 = get_subpel_part(this_mv->col); + const int subpel_y_q3 = get_subpel_part(this_mv->row); + + return vfp->osvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, mask, sse); +} + +// Calculates the variance of prediction residue +static int upsampled_obmc_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm, + const MV *this_mv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + unsigned int *sse) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type; + const int w = var_params->w; + const int h = var_params->h; + + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const int32_t *wsrc = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv); + const int ref_stride = ms_buffers->ref->stride; + + const int subpel_x_q3 = get_subpel_part(this_mv->col); + const int subpel_y_q3 = get_subpel_part(this_mv->row); + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + unsigned int besterr; + DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]); +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd, + subpel_search_type); + besterr = vfp->ovf(pred8, w, wsrc, mask, sse); + } else { + aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3, + subpel_y_q3, ref, ref_stride, subpel_search_type); + + besterr = vfp->ovf(pred, w, wsrc, mask, sse); + } +#else + aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3, + subpel_y_q3, ref, ref_stride, subpel_search_type); + + besterr = vfp->ovf(pred, w, wsrc, mask, sse); +#endif + return besterr; +} + +static unsigned int setup_obmc_center_error( + const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) { + // TODO(chiyotsai@google.com): There might be a bug here where we didn't use + // get_buf_from_mv(ref, *this_mv). + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const int32_t *wsrc = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const uint8_t *ref = ms_buffers->ref->buf; + const int ref_stride = ms_buffers->ref->stride; + unsigned int besterr = + var_params->vfp->ovf(ref, ref_stride, wsrc, mask, sse1); + *distortion = besterr; + besterr += mv_err_cost_(this_mv, mv_cost_params); + return besterr; +} + +static unsigned int upsampled_setup_obmc_center_error( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *this_mv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) { + unsigned int besterr = + upsampled_obmc_pref_error(xd, cm, this_mv, var_params, sse1); + *distortion = besterr; + besterr += mv_err_cost_(this_mv, mv_cost_params); + return besterr; +} + +// Estimates the variance of prediction residue +// TODO(chiyotsai@google.com): the cost does does not match the cost in +// mv_cost_. 
Investigate this later. +static INLINE int estimate_obmc_mvcost(const MV *this_mv, + const MV_COST_PARAMS *mv_cost_params) { + const MV *ref_mv = mv_cost_params->ref_mv; + const int *mvjcost = mv_cost_params->mvjcost; + const int *const *mvcost = mv_cost_params->mvcost; + const int error_per_bit = mv_cost_params->error_per_bit; + const MV_COST_TYPE mv_cost_type = mv_cost_params->mv_cost_type; + const MV diff_mv = { GET_MV_SUBPEL(this_mv->row - ref_mv->row), + GET_MV_SUBPEL(this_mv->col - ref_mv->col) }; + + switch (mv_cost_type) { + case MV_COST_ENTROPY: + return (unsigned)((mv_cost(&diff_mv, mvjcost, + CONVERT_TO_CONST_MVCOST(mvcost)) * + error_per_bit + + 4096) >> + 13); + case MV_COST_NONE: return 0; + default: + assert(0 && "L1 norm is not tuned for estimated obmc mvcost"); + return 0; + } +} + +// Estimates whether this_mv is better than best_mv. This function incorporates +// both prediction error and residue into account. +static INLINE unsigned int obmc_check_better_fast( + const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int *has_better_mv) { + unsigned int cost; + if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) { + unsigned int sse; + const int thismse = estimate_obmc_pref_error(this_mv, var_params, &sse); + + cost = estimate_obmc_mvcost(this_mv, mv_cost_params); + cost += thismse; + + if (cost < *besterr) { + *besterr = cost; + *best_mv = *this_mv; + *distortion = thismse; + *sse1 = sse; + *has_better_mv |= 1; + } + } else { + cost = INT_MAX; + } + return cost; +} + +// Estimates whether this_mv is better than best_mv. This function incorporates +// both prediction error and residue into account. 
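+// Unlike obmc_check_better_fast() above, this evaluates the upsampled
+// prediction error and uses the exact mv_err_cost_() rate term.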
+static INLINE unsigned int obmc_check_better( + MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv, + const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int *has_better_mv) { + unsigned int cost; + if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) { + unsigned int sse; + const int thismse = + upsampled_obmc_pref_error(xd, cm, this_mv, var_params, &sse); + cost = mv_err_cost_(this_mv, mv_cost_params); + + cost += thismse; + + if (cost < *besterr) { + *besterr = cost; + *best_mv = *this_mv; + *distortion = thismse; + *sse1 = sse; + *has_better_mv |= 1; + } + } else { + cost = INT_MAX; + } + return cost; +} + +static AOM_FORCE_INLINE MV obmc_first_level_check( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV *best_mv, + const int hstep, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + int dummy = 0; + const MV left_mv = { this_mv.row, this_mv.col - hstep }; + const MV right_mv = { this_mv.row, this_mv.col + hstep }; + const MV top_mv = { this_mv.row - hstep, this_mv.col }; + const MV bottom_mv = { this_mv.row + hstep, this_mv.col }; + + if (var_params->subpel_search_type != USE_2_TAPS_ORIG) { + const unsigned int left = + obmc_check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int right = + obmc_check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int up = + obmc_check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int down = + obmc_check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + const MV diag_step = get_best_diag_step(hstep, left, right, up, down); + const MV diag_mv = { this_mv.row + diag_step.row, + this_mv.col + diag_step.col }; + + // Check the diagonal direction with the best mv + obmc_check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + return diag_step; + } else { + const unsigned int left = obmc_check_better_fast( + &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, + distortion, &dummy); + const unsigned int right = obmc_check_better_fast( + &right_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, + sse1, distortion, &dummy); + + const unsigned int up = obmc_check_better_fast( + &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, + distortion, &dummy); + + const unsigned int down = obmc_check_better_fast( + &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, + sse1, distortion, &dummy); + + const MV diag_step = get_best_diag_step(hstep, left, right, up, down); + const MV diag_mv = { this_mv.row + diag_step.row, + this_mv.col + diag_step.col }; + + // Check the diagonal direction with the best mv + obmc_check_better_fast(&diag_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + return diag_step; + } +} + +// A newer version of second level check for obmc that gives better quality. 
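+// Mirrors second_level_check_v2() above, substituting the OBMC prediction
+// error and cost helpers.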
+static AOM_FORCE_INLINE void obmc_second_level_check_v2( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step, + MV *best_mv, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + assert(best_mv->row == this_mv.row + diag_step.row || + best_mv->col == this_mv.col + diag_step.col); + if (CHECK_MV_EQUAL(this_mv, *best_mv)) { + return; + } else if (this_mv.row == best_mv->row) { + // Search away from diagonal step since diagonal search did not provide any + // improvement + diag_step.row *= -1; + } else if (this_mv.col == best_mv->col) { + diag_step.col *= -1; + } + + const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col }; + const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col }; + const MV diag_bias_mv = { best_mv->row + diag_step.row, + best_mv->col + diag_step.col }; + int has_better_mv = 0; + + if (var_params->subpel_search_type != USE_2_TAPS_ORIG) { + obmc_check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + obmc_check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + + // Do an additional search if the second iteration gives a better mv + if (has_better_mv) { + obmc_check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + } + } else { + obmc_check_better_fast(&row_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + obmc_check_better_fast(&col_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + + // Do an additional search if the second iteration gives a better mv + if (has_better_mv) { + obmc_check_better_fast(&diag_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + } + } +} + +int av1_find_best_obmc_sub_pixel_tree_up( + MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv, + int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) { + (void)last_mv_search_list; + const int allow_hp = ms_params->allow_hp; + const int forced_stop = ms_params->forced_stop; + const int iters_per_step = ms_params->iters_per_step; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; + const SUBPEL_SEARCH_TYPE subpel_search_type = + ms_params->var_params.subpel_search_type; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + int hstep = INIT_SUBPEL_STEP_SIZE; + const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp); + + unsigned int besterr = INT_MAX; + *bestmv = start_mv; + + if (subpel_search_type != USE_2_TAPS_ORIG) + besterr = upsampled_setup_obmc_center_error( + xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion); + else + besterr = setup_obmc_center_error(bestmv, var_params, mv_cost_params, sse1, + distortion); + + for (int iter = 0; iter < round; ++iter) { + MV iter_center_mv = *bestmv; + MV diag_step = obmc_first_level_check(xd, cm, iter_center_mv, bestmv, hstep, + mv_limits, var_params, mv_cost_params, + &besterr, sse1, distortion); + + if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) { + obmc_second_level_check_v2(xd, 
cm, iter_center_mv, diag_step, bestmv, + mv_limits, var_params, mv_cost_params, + &besterr, sse1, distortion); + } + hstep >>= 1; + } + + return besterr; +} + +// ============================================================================= +// Public cost function: mv_cost + pred error +// ============================================================================= +int av1_get_mvpred_sse(const MACROBLOCK *x, const FULLPEL_MV *best_mv, + const MV *ref_mv, const aom_variance_fn_ptr_t *vfp) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const MV mv = get_mv_from_fullmv(best_mv); + const MV_COST_TYPE mv_cost_type = x->mv_cost_type; + unsigned int sse, var; + + var = vfp->vf(what->buf, what->stride, get_buf_from_fullmv(in_what, best_mv), + in_what->stride, &sse); + (void)var; + + return sse + mv_err_cost(&mv, ref_mv, x->nmv_vec_cost, + CONVERT_TO_CONST_MVCOST(x->mv_cost_stack), + x->errorperbit, mv_cost_type); +} + +static INLINE int get_mvpred_av_var(const MV_COST_PARAMS *mv_cost_params, + const FULLPEL_MV best_mv, + const uint8_t *second_pred, + const aom_variance_fn_ptr_t *vfp, + const struct buf_2d *src, + const struct buf_2d *pre) { + const struct buf_2d *const what = src; + const struct buf_2d *const in_what = pre; + const MV mv = get_mv_from_fullmv(&best_mv); + unsigned int unused; + + return vfp->svaf(get_buf_from_fullmv(in_what, &best_mv), in_what->stride, 0, + 0, what->buf, what->stride, &unused, second_pred) + + mv_err_cost_(&mv, mv_cost_params); +} + +static INLINE int get_mvpred_mask_var( + const MV_COST_PARAMS *mv_cost_params, const FULLPEL_MV best_mv, + const uint8_t *second_pred, const uint8_t *mask, int mask_stride, + int invert_mask, const aom_variance_fn_ptr_t *vfp, const struct buf_2d *src, + const struct buf_2d *pre) { + const struct buf_2d *const what = src; + const struct buf_2d *const in_what = pre; + const MV mv = get_mv_from_fullmv(&best_mv); + unsigned int unused; + + return vfp->msvf(what->buf, what->stride, 0, 0, + get_buf_from_fullmv(in_what, &best_mv), in_what->stride, + second_pred, mask, mask_stride, invert_mask, &unused) + + mv_err_cost_(&mv, mv_cost_params); +} + +int av1_get_mvpred_compound_var(const MV_COST_PARAMS *mv_cost_params, + const FULLPEL_MV best_mv, + const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, int invert_mask, + const aom_variance_fn_ptr_t *vfp, + const struct buf_2d *src, + const struct buf_2d *pre) { + if (mask) { + return get_mvpred_mask_var(mv_cost_params, best_mv, second_pred, mask, + mask_stride, invert_mask, vfp, src, pre); + } else { + return get_mvpred_av_var(mv_cost_params, best_mv, second_pred, vfp, src, + pre); + } +} diff --git a/libs/libaom/src/av1/encoder/mcomp.h b/libs/libaom/src/av1/encoder/mcomp.h new file mode 100644 index 000000000..73135d859 --- /dev/null +++ b/libs/libaom/src/av1/encoder/mcomp.h @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_MCOMP_H_ +#define AOM_AV1_ENCODER_MCOMP_H_ + +#include "av1/common/mv.h" +#include "av1/encoder/block.h" + +#include "aom_dsp/variance.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// The maximum number of steps in a step search given the largest +// allowed initial step +#define MAX_MVSEARCH_STEPS 11 +// Max full pel mv specified in the unit of full pixel +// Enable the use of motion vector in range [-1023, 1023]. +#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1) +// Maximum size of the first step in full pel units +#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1)) + +#define SEARCH_RANGE_8P 3 +#define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1) +#define SEARCH_GRID_CENTER_8P \ + (SEARCH_RANGE_8P * SEARCH_GRID_STRIDE_8P + SEARCH_RANGE_8P) + +// motion search site +typedef struct search_site { + FULLPEL_MV mv; + int offset; +} search_site; + +typedef struct search_site_config { + search_site ss[MAX_MVSEARCH_STEPS * 2][16 + 1]; + int ss_count; + int searches_per_step[MAX_MVSEARCH_STEPS * 2]; + int radius[MAX_MVSEARCH_STEPS * 2]; + int stride; +} search_site_config; + +typedef struct { + FULLPEL_MV coord; + int coord_offset; +} search_neighbors; + +struct AV1_COMP; +struct SPEED_FEATURES; + +// ============================================================================= +// Cost functions +// ============================================================================= +typedef struct { + const MV *ref_mv; + FULLPEL_MV full_ref_mv; + const int *mvjcost; + const int *mvcost[2]; + int error_per_bit; + int sad_per_bit; + MV_COST_TYPE mv_cost_type; +} MV_COST_PARAMS; + +int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost, + int *mvcost[2], int weight); + +int av1_get_mvpred_sse(const MACROBLOCK *x, const FULLPEL_MV *best_mv, + const MV *ref_mv, const aom_variance_fn_ptr_t *vfp); +int av1_get_mvpred_compound_var(const MV_COST_PARAMS *ms_params, + const FULLPEL_MV best_mv, + const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, int invert_mask, + const aom_variance_fn_ptr_t *vfp, + const struct buf_2d *src, + const struct buf_2d *pre); + +// ============================================================================= +// Motion Search +// ============================================================================= +typedef struct { + // The reference buffer + const struct buf_2d *ref; + + // The source and predictors/mask used by translational search + const struct buf_2d *src; + const uint8_t *second_pred; + const uint8_t *mask; + int mask_stride; + int inv_mask; + + // The weighted source and mask used by OBMC + const int32_t *wsrc; + const int32_t *obmc_mask; +} MSBuffers; + +static INLINE void av1_set_ms_compound_refs(MSBuffers *ms_buffers, + const uint8_t *second_pred, + const uint8_t *mask, + int mask_stride, int invert_mask) { + ms_buffers->second_pred = second_pred; + ms_buffers->mask = mask; + ms_buffers->mask_stride = mask_stride; + ms_buffers->inv_mask = invert_mask; +} + +// ============================================================================= +// Fullpixel Motion Search +// ============================================================================= +enum { + DIAMOND = 0, + NSTEP = 1, + HEX = 2, + BIGDIA = 3, + SQUARE = 4, + FAST_HEX = 5, + FAST_DIAMOND = 6 +} UENUM1BYTE(SEARCH_METHODS); + +// This struct holds fullpixel motion search parameters that should be constant +// during the search +typedef struct { + BLOCK_SIZE bsize; + const aom_variance_fn_ptr_t *vfp; + + MSBuffers 
ms_buffers;
+
+  SEARCH_METHODS search_method;
+  const search_site_config *search_sites;
+  FullMvLimits mv_limits;
+
+  int run_mesh_search;    // Sets mesh search unless it got pruned by
+                          // prune_mesh_search.
+  int prune_mesh_search;  // Disables mesh search if the best_mv after a normal
+                          // search is close to the start_mv.
+  int force_mesh_thresh;  // Forces mesh search if the residue variance is
+                          // higher than the threshold.
+  const struct MESH_PATTERN *mesh_patterns[2];
+
+  int is_intra_mode;
+
+  int fast_obmc_search;
+
+  // For calculating mv cost
+  MV_COST_PARAMS mv_cost_params;
+} FULLPEL_MOTION_SEARCH_PARAMS;
+
+void av1_make_default_fullpel_ms_params(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                        const struct AV1_COMP *cpi,
+                                        const MACROBLOCK *x, BLOCK_SIZE bsize,
+                                        const MV *ref_mv,
+                                        const search_site_config *search_sites);
+
+// Sets up configs for fullpixel diamond search
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride);
+// Sets up configs for firstpass motion search
+void av1_init_motion_fpf(search_site_config *cfg, int stride);
+// Sets up configs for all other types of motion search
+void av1_init3smotion_compensation(search_site_config *cfg, int stride);
+
+// Sets up limit values for MV components.
+// MVs beyond this range do not produce a new/different prediction block.
+static INLINE void av1_set_mv_row_limits(
+    const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+    int mi_row, int mi_height, int border) {
+  const int min1 = -(mi_row * MI_SIZE + border - 2 * AOM_INTERP_EXTEND);
+  const int min2 = -(((mi_row + mi_height) * MI_SIZE) + 2 * AOM_INTERP_EXTEND);
+  mv_limits->row_min = AOMMAX(min1, min2);
+  const int max1 = (mi_params->mi_rows - mi_row - mi_height) * MI_SIZE +
+                   border - 2 * AOM_INTERP_EXTEND;
+  const int max2 =
+      (mi_params->mi_rows - mi_row) * MI_SIZE + 2 * AOM_INTERP_EXTEND;
+  mv_limits->row_max = AOMMIN(max1, max2);
+}
+
+static INLINE void av1_set_mv_col_limits(
+    const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+    int mi_col, int mi_width, int border) {
+  const int min1 = -(mi_col * MI_SIZE + border - 2 * AOM_INTERP_EXTEND);
+  const int min2 = -(((mi_col + mi_width) * MI_SIZE) + 2 * AOM_INTERP_EXTEND);
+  mv_limits->col_min = AOMMAX(min1, min2);
+  const int max1 = (mi_params->mi_cols - mi_col - mi_width) * MI_SIZE + border -
+                   2 * AOM_INTERP_EXTEND;
+  const int max2 =
+      (mi_params->mi_cols - mi_col) * MI_SIZE + 2 * AOM_INTERP_EXTEND;
+  mv_limits->col_max = AOMMIN(max1, max2);
+}
+
+static INLINE void av1_set_mv_limits(
+    const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+    int mi_row, int mi_col, int mi_height, int mi_width, int border) {
+  av1_set_mv_row_limits(mi_params, mv_limits, mi_row, mi_height, border);
+  av1_set_mv_col_limits(mi_params, mv_limits, mi_col, mi_width, border);
+}
+
+void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv);
+
+int av1_init_search_range(int size);
+
+unsigned int av1_int_pro_motion_estimation(const struct AV1_COMP *cpi,
+                                           MACROBLOCK *x, BLOCK_SIZE bsize,
+                                           int mi_row, int mi_col,
+                                           const MV *ref_mv);
+
+int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                             const FULLPEL_MV start_mv, FULLPEL_MV *best_mv);
+
+int av1_full_pixel_search(const FULLPEL_MV start_mv,
+                          const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                          const int step_param, int *cost_list,
+                          FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv);
+
+int av1_intrabc_hash_search(const struct AV1_COMP *cpi, const MACROBLOCKD *xd,
+                            const
FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + IntraBCHashInfo *intrabc_hash_info, + FULLPEL_MV *best_mv); + +int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int step_param, FULLPEL_MV *best_mv); + +static INLINE int av1_is_fullmv_in_range(const FullMvLimits *mv_limits, + FULLPEL_MV mv) { + return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) && + (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max); +} +// ============================================================================= +// Subpixel Motion Search +// ============================================================================= +enum { + EIGHTH_PEL, + QUARTER_PEL, + HALF_PEL, + FULL_PEL +} UENUM1BYTE(SUBPEL_FORCE_STOP); + +typedef struct { + const aom_variance_fn_ptr_t *vfp; + SUBPEL_SEARCH_TYPE subpel_search_type; + // Source and reference buffers + MSBuffers ms_buffers; + int w, h; +} SUBPEL_SEARCH_VAR_PARAMS; + +// This struct holds subpixel motion search parameters that should be constant +// during the search +typedef struct { + // High level motion search settings + int allow_hp; + const int *cost_list; + SUBPEL_FORCE_STOP forced_stop; + int iters_per_step; + SubpelMvLimits mv_limits; + + // For calculating mv cost + MV_COST_PARAMS mv_cost_params; + + // Distortion calculation params + SUBPEL_SEARCH_VAR_PARAMS var_params; +} SUBPEL_MOTION_SEARCH_PARAMS; + +void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + const struct AV1_COMP *cpi, + const MACROBLOCK *x, BLOCK_SIZE bsize, + const MV *ref_mv, const int *cost_list); + +typedef int(fractional_mv_step_fp)(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + MV start_mv, MV *bestmv, int *distortion, + unsigned int *sse1, + int_mv *last_mv_search_list); + +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree; +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned; +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_more; +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_evenmore; +extern fractional_mv_step_fp av1_return_max_sub_pixel_mv; +extern fractional_mv_step_fp av1_return_min_sub_pixel_mv; +extern fractional_mv_step_fp av1_find_best_obmc_sub_pixel_tree_up; + +unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + BLOCK_SIZE bsize, const int *pts0, + const int *pts_inref0, int total_samples); + +static INLINE void av1_set_fractional_mv(int_mv *fractional_best_mv) { + for (int z = 0; z < 3; z++) { + fractional_best_mv[z].as_int = INVALID_MV; + } +} + +static INLINE void av1_set_subpel_mv_search_range(SubpelMvLimits *subpel_limits, + const FullMvLimits *mv_limits, + const MV *ref_mv) { + const int max_mv = GET_MV_SUBPEL(MAX_FULL_PEL_VAL); + const int minc = + AOMMAX(GET_MV_SUBPEL(mv_limits->col_min), ref_mv->col - max_mv); + const int maxc = + AOMMIN(GET_MV_SUBPEL(mv_limits->col_max), ref_mv->col + max_mv); + const int minr = + AOMMAX(GET_MV_SUBPEL(mv_limits->row_min), ref_mv->row - max_mv); + const int maxr = + AOMMIN(GET_MV_SUBPEL(mv_limits->row_max), ref_mv->row + max_mv); + + subpel_limits->col_min = AOMMAX(MV_LOW + 1, minc); + subpel_limits->col_max = AOMMIN(MV_UPP - 1, maxc); + subpel_limits->row_min = AOMMAX(MV_LOW + 1, minr); + subpel_limits->row_max = AOMMIN(MV_UPP - 1, maxr); +} + +static INLINE int av1_is_subpelmv_in_range(const SubpelMvLimits *mv_limits, + MV mv) { + return 
(mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) && + (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_MCOMP_H_ diff --git a/libs/libaom/src/av1/encoder/mips/msa/error_msa.c b/libs/libaom/src/av1/encoder/mips/msa/error_msa.c new file mode 100644 index 000000000..2e86dee43 --- /dev/null +++ b/libs/libaom/src/av1/encoder/mips/msa/error_msa.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" + +#include "aom_dsp/mips/macros_msa.h" + +#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \ + static int64_t block_error_##BSize##size_msa( \ + const int16_t *coeff_ptr, const int16_t *dq_coeff_ptr, int64_t *ssz) { \ + int64_t err = 0; \ + uint32_t loop_cnt; \ + v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h; \ + v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w; \ + v2i64 sq_coeff_r, sq_coeff_l; \ + v2i64 err0, err_dup0, err1, err_dup1; \ + \ + coeff = LD_SH(coeff_ptr); \ + dq_coeff = LD_SH(dq_coeff_ptr); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w, sq_coeff_r, \ + sq_coeff_l); \ + DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1); \ + \ + coeff = LD_SH(coeff_ptr + 8); \ + dq_coeff = LD_SH(dq_coeff_ptr + 8); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff_ptr += 16; \ + dq_coeff_ptr += 16; \ + \ + for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) { \ + coeff = LD_SH(coeff_ptr); \ + dq_coeff = LD_SH(dq_coeff_ptr); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff = LD_SH(coeff_ptr + 8); \ + dq_coeff = LD_SH(dq_coeff_ptr + 8); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff_ptr += 16; \ + dq_coeff_ptr += 16; \ + } \ + \ + err_dup0 = __msa_splati_d(sq_coeff_r, 1); \ + err_dup1 = __msa_splati_d(sq_coeff_l, 1); \ + sq_coeff_r += err_dup0; \ + sq_coeff_l += err_dup1; \ + *ssz = __msa_copy_s_d(sq_coeff_r, 0); \ + *ssz += __msa_copy_s_d(sq_coeff_l, 0); \ + \ + err_dup0 = __msa_splati_d(err0, 1); \ + err_dup1 = __msa_splati_d(err1, 1); \ + err0 += err_dup0; \ + err1 += err_dup1; \ + err = __msa_copy_s_d(err0, 0); \ + err += __msa_copy_s_d(err1, 0); \ + \ + return err; \ + } + +/* clang-format off */ +BLOCK_ERROR_BLOCKSIZE_MSA(16) 
+BLOCK_ERROR_BLOCKSIZE_MSA(64)
+BLOCK_ERROR_BLOCKSIZE_MSA(256)
+BLOCK_ERROR_BLOCKSIZE_MSA(1024)
+/* clang-format on */
+
+int64_t av1_block_error_msa(const tran_low_t *coeff_ptr,
+                            const tran_low_t *dq_coeff_ptr, intptr_t blk_size,
+                            int64_t *ssz) {
+  int64_t err;
+  const int16_t *coeff = (const int16_t *)coeff_ptr;
+  const int16_t *dq_coeff = (const int16_t *)dq_coeff_ptr;
+
+  switch (blk_size) {
+    case 16: err = block_error_16size_msa(coeff, dq_coeff, ssz); break;
+    case 64: err = block_error_64size_msa(coeff, dq_coeff, ssz); break;
+    case 256: err = block_error_256size_msa(coeff, dq_coeff, ssz); break;
+    case 1024: err = block_error_1024size_msa(coeff, dq_coeff, ssz); break;
+    default:
+      err = av1_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz);
+      break;
+  }
+
+  return err;
+}
diff --git a/libs/libaom/src/av1/encoder/mips/msa/fdct4x4_msa.c b/libs/libaom/src/av1/encoder/mips/msa/fdct4x4_msa.c
new file mode 100644
index 000000000..085c08bfb
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/mips/msa/fdct4x4_msa.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/enums.h"
+
+void av1_fwht4x4_msa(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  v8i16 in0, in1, in2, in3, in4;
+
+  LD_SH4(input, src_stride, in0, in1, in2, in3);
+
+  in0 += in1;
+  in3 -= in2;
+  in4 = (in0 - in3) >> 1;
+  SUB2(in4, in1, in4, in2, in1, in2);
+  in0 -= in2;
+  in3 += in1;
+
+  TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
+
+  in0 += in2;
+  in1 -= in3;
+  in4 = (in0 - in1) >> 1;
+  SUB2(in4, in2, in4, in3, in2, in3);
+  in0 -= in3;
+  in1 += in2;
+
+  SLLI_4V(in0, in1, in2, in3, 2);
+
+  TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2);
+
+  ST4x2_UB(in0, output, 4);
+  ST4x2_UB(in3, output + 4, 4);
+  ST4x2_UB(in1, output + 8, 4);
+  ST4x2_UB(in2, output + 12, 4);
+}
diff --git a/libs/libaom/src/av1/encoder/mips/msa/temporal_filter_msa.c b/libs/libaom/src/av1/encoder/mips/msa/temporal_filter_msa.c
new file mode 100644
index 000000000..effa75b83
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/mips/msa/temporal_filter_msa.c
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "config/av1_rtcd.h" + +#include "aom_dsp/mips/macros_msa.h" + +static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride, + uint8_t *frm2_ptr, int32_t filt_sth, + int32_t filt_wgt, uint32_t *acc, + uint16_t *cnt) { + uint32_t row; + uint64_t f0, f1, f2, f3; + v16i8 frm2, frm1 = { 0 }; + v16i8 frm4, frm3 = { 0 }; + v16u8 frm_r, frm_l; + v8i16 frm2_r, frm2_l; + v8i16 diff0, diff1, mod0_h, mod1_h; + v4i32 cnst3, cnst16, filt_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; + v4i32 acc0, acc1, acc2, acc3; + v8i16 cnt0, cnt1; + + filt_wt = __msa_fill_w(filt_wgt); + strength = __msa_fill_w(filt_sth); + cnst3 = __msa_ldi_w(3); + cnst16 = __msa_ldi_w(16); + + for (row = 2; row--;) { + LD4(frm1_ptr, stride, f0, f1, f2, f3); + frm1_ptr += (4 * stride); + + LD_SB2(frm2_ptr, 16, frm2, frm4); + frm2_ptr += 32; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + INSERT_D2_SB(f0, f1, frm1); + INSERT_D2_SB(f2, f3, frm3); + ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, + mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, + mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + UNPCK_UB_SH(frm2, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, + mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, + mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, + mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + 
mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+    UNPCK_UB_SH(frm4, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+         mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+  }
+}
+
+static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride,
+                                             uint8_t *frm2_ptr,
+                                             int32_t filt_sth, int32_t filt_wgt,
+                                             uint32_t *acc, uint16_t *cnt) {
+  uint32_t row;
+  v16i8 frm1, frm2, frm3, frm4;
+  v16u8 frm_r, frm_l;
+  v16i8 zero = { 0 };
+  v8u16 frm2_r, frm2_l;
+  v8i16 diff0, diff1, mod0_h, mod1_h;
+  v4i32 cnst3, cnst16, filt_wt, strength;
+  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+  v4i32 acc0, acc1, acc2, acc3;
+  v8i16 cnt0, cnt1;
+
+  filt_wt = __msa_fill_w(filt_wgt);
+  strength = __msa_fill_w(filt_sth);
+  cnst3 = __msa_ldi_w(3);
+  cnst16 = __msa_ldi_w(16);
+
+  for (row = 8; row--;) {
+    LD_SB2(frm1_ptr, stride, frm1, frm3);
+    frm1_ptr += stride;
+
+    LD_SB2(frm2_ptr, 16, frm2, frm4);
+    frm2_ptr += 16;
+
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
+         mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w,
mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, + mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, + mod2_w, mod3_w); + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + frm1_ptr += stride; + frm2_ptr += 16; + } +} + +// TODO(yunqing) The following optimization is not used since c code changes. +void av1_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride, + uint8_t *frame2_ptr, uint32_t blk_w, + uint32_t blk_h, int32_t strength, + int32_t filt_wgt, uint32_t *accu, + uint16_t *cnt) { + if (8 == (blk_w * blk_h)) { + temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength, + filt_wgt, accu, cnt); + } else if (16 == (blk_w * blk_h)) { + temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength, + filt_wgt, accu, cnt); + } else { + av1_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h, + strength, filt_wgt, accu, cnt); + } +} diff --git a/libs/libaom/src/av1/encoder/misc_model_weights.h b/libs/libaom/src/av1/encoder/misc_model_weights.h new file mode 100644 index 000000000..f00aeabcf --- /dev/null +++ b/libs/libaom/src/av1/encoder/misc_model_weights.h @@ -0,0 +1,696 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +#define MV_PREC_FEATURE_SIZE 18 + +#define NUM_DNN_LAYERS 1 +#define NUM_DNN_FEATURES MV_PREC_FEATURE_SIZE +#define MV_PREC_LAYER_SIZE_0 32 +#define NUM_LOGITS 1 + +const float av1_mv_prec_mean[MV_PREC_FEATURE_SIZE] = { 143.67358891063745f, + 141.6251917346238f, + 0.36313633945679064f, + 0.0028162791958822085f, + 0.000484820537626698f, + 0.002769969388939025f, + 0.0f, + 0.00031274626720947577f, + 0.00020578555375160075f, + 0.0007075246732697733f, + 0.000539641029909925f, + 0.0013939401375906984f, + 4.985394760423499f, + 4.985394760423499f, + 4.9992148717283085f, + 5.143739822380163f, + 5.518483124004564f, + 87.63597847427077f }; + +const float av1_mv_prec_std[MV_PREC_FEATURE_SIZE] = { 66.86256140247244f, + 68.04472572607503f, + 13.23247674430399f, + 0.0029123438396921955f, + 0.0015331406169374737f, + 0.0029149813096313775f, + 1.0f, + 0.00047501102871357813f, + 0.00030025962993117947f, + 0.0009861163580391207f, + 0.0012157593528004055f, + 0.002004954948490521f, + 6.539447500484038f, + 6.539447500484038f, + 6.396589058279465f, + 3.4870155874262516f, + 3.8911353973740535f, + 112.07985259573601f }; + +const float av1_mv_prec_nn_weights_layer_0[] = { -0.13008492159557145f, + -0.1483527373474774f, + 0.08112076098858864f, + -0.9582568679627453f, + -0.34794757171071206f, + 0.6465225723304947f, + 0.0f, + 0.06754171885839604f, + 0.27156803620541214f, + 0.10635231245664407f, + -0.031183926995968583f, + 0.048122572260291f, + -0.19498534230045128f, + -0.2614116319273316f, + -0.3223762845136331f, + -1.2063368350609205f, + -0.523333556911706f, + 1.075632260890728f, + 0.48989726814387946f, + -0.34816466111070477f, + 0.41668357610256473f, + -1.0973562848791671f, + 0.04183921854389494f, + -0.9123815389260476f, + 0.0f, + 0.859965047744027f, + 0.1962095804679813f, + 0.2606564339077058f, + 0.26695868715184895f, + 0.5319308568326692f, + -0.23717505799723165f, + -0.43127224481782567f, + -0.3214545776203726f, + 0.5850852241402176f, + -0.26705531612587813f, + -0.5786016766610093f, + 0.9360519909983003f, + 0.20771329289016555f, + -0.027614159544811823f, + -1.175022807046164f, + -0.07578967497693835f, + 0.6890172485324256f, + 0.0f, + -0.008008338164988263f, + -0.08064800010158935f, + -0.22606910981666667f, + 0.4541586669210879f, + 0.07731527661370792f, + -0.6744475941247964f, + -0.2625842448396184f, + 1.7018613444303785f, + -0.08622229073162656f, + 0.041858142814941275f, + -0.24575964090386415f, + -0.046626044730994964f, + 0.7608713064175202f, + -0.23330119070907146f, + -0.10115510984500826f, + 0.9722537349192069f, + 0.11718554254290829f, + 0.0f, + 0.2075123446014759f, + 0.09465167310768637f, + 0.7609896851963016f, + 0.4441038581385328f, + 0.26064144727430955f, + -0.14678625366485035f, + -0.03597014452200524f, + 0.3128680867196166f, + 1.102496797385966f, + 0.06642253233084111f, + -1.2665494483407629f, + 0.09049412632000911f, + -1.1160621999565095f, + 0.043420275255913035f, + -0.8811412259978966f, + 0.21076234632287777f, + 0.16571534463543866f, + 0.0f, + -0.7324075176473275f, + -0.3677622514459495f, + 0.3273532243056415f, + 0.22922161936797775f, + 0.8204766691058087f, + 0.02982161033720488f, + 0.5266419954188112f, + -1.0032154963302191f, + 0.7007602969763729f, + 0.37196355167990885f, + -0.7608579453228548f, + 0.08568111584781847f, + 0.07011061059123677f, + 0.3233263598082507f, + -0.08249928295410253f, + 0.08220165761319252f, + 
0.22148722752246794f, + 0.0f, + 0.6122392701743506f, + -0.26429838296378333f, + 0.31958081620005463f, + -0.006027177397853826f, + -0.3088310785887994f, + -0.5436192046707807f, + -0.011080356757423306f, + 0.12632650770008413f, + -0.45097913215234525f, + 1.8008072867127298f, + -0.7630029654575501f, + -0.4054774329826579f, + 0.40386074452544535f, + -0.18541426257453025f, + 0.2444879765079863f, + -0.6216724756115081f, + 0.27030299321302f, + 0.0f, + -0.6835848952967989f, + -0.7914184320964815f, + -0.6761595019582928f, + -1.009565565604081f, + -0.1904242439353305f, + 0.4463417126318631f, + 0.6025503823452971f, + 0.5149990860115566f, + 1.0242970663937634f, + 0.037947306826401385f, + 0.07039339786212848f, + 0.14273796789711987f, + 0.168103961425691f, + 1.6596066376811978f, + 0.19321092229384657f, + -0.3710750388148514f, + -0.01717015559410288f, + 0.0f, + 0.3005688477942597f, + 0.23877080653829577f, + 0.2718594552971173f, + 0.3885402571589898f, + 0.32999531945669247f, + -0.6134460954213243f, + -0.13972265462799183f, + -0.07180089575716991f, + -1.014572598188105f, + 0.0717207322809836f, + 0.34896157745155615f, + -0.27127687591403f, + -0.5058651212773623f, + -1.5442435628306925f, + -0.6399784724734707f, + 0.6274301429074947f, + -0.4645750072767051f, + 0.0f, + -0.2406726815244178f, + -0.06321214115916597f, + 0.312856714253404f, + 0.16459514124116134f, + 0.3993579604809623f, + -0.15232044351561913f, + -0.5613743948568469f, + 0.7219801372223262f, + 0.2936857469624009f, + 0.7823466656034087f, + -0.12416947814098349f, + -0.36413756654028345f, + -0.07992098796866462f, + -0.7395722879842416f, + 0.8639913543220514f, + -0.311931773757945f, + -1.7308240470400613f, + 0.0f, + 0.394499716712104f, + 0.6511462819539963f, + -0.0722425275974144f, + 0.13490818194661386f, + 0.055319135836378035f, + 0.15389577508097013f, + 0.28958598328870605f, + -0.14608429470539772f, + 0.09488817462478298f, + -0.17231294096622088f, + 0.6721115415911466f, + -0.05664621150536103f, + 0.03291799673669331f, + 0.02845382711057482f, + -0.9953563446999164f, + -0.17994298220605923f, + 0.6560824519337476f, + 0.0f, + -0.30990646375917935f, + 0.17215517202874f, + 0.2026816225170481f, + 0.22011958747715601f, + 0.3562520768889686f, + -0.18436559057189175f, + 0.1733377147302066f, + 0.02818276995640877f, + -0.29703005574859076f, + -0.3310652639215064f, + -1.6091173258529277f, + 0.45461585790028003f, + -0.5078643334592593f, + -0.338997374732338f, + 0.4688619590359733f, + 0.627099126828289f, + -0.5249801376494249f, + 0.0f, + 0.34465498218272883f, + 0.009891680630908135f, + -0.27244020967349f, + 0.05404589867626979f, + -0.06220329325739666f, + -0.13365376464759104f, + -0.13098573553512366f, + 0.11434198976289106f, + 0.6740951247574676f, + 1.3381727185724581f, + -1.4865773213251936f, + 0.05809898701966341f, + 0.25380780261023456f, + 1.2716367496512722f, + 0.1768290070780598f, + -0.07554828135356352f, + 0.8180570085344856f, + 0.0f, + 1.0788448980077463f, + 0.0651938742459459f, + 0.3807672030015587f, + 0.6144792680268445f, + 0.011660612214908059f, + -0.018306023765580288f, + 0.44140813809926516f, + -0.13411994195502386f, + 0.15920368955127778f, + -0.19382358417849888f, + -0.08802147969690055f, + -0.019731052733814477f, + 0.1104744229169665f, + -0.195834419735958f, + -0.5005295046454347f, + -0.17041241868229032f, + -0.471942117351489f, + 0.0f, + -0.3599073304761372f, + -0.2745532782968519f, + -0.8323064841106417f, + -0.88355885384943f, + -0.02826466859020679f, + 0.06977870308805256f, + 0.11926112095374196f, + 1.367382707959643f, + -0.06119843162964051f, + 
-0.5331395268889569f, + -1.2155531584240624f, + -0.01896651779524327f, + 0.10591845408571081f, + -0.010632842156504733f, + 0.6150787968629282f, + -0.4191690185896091f, + -0.9961718918346271f, + 0.0f, + 0.23370364516013867f, + 0.4156033072362998f, + 0.1261005546633433f, + 0.0812413884532226f, + -0.008894337353937203f, + 0.07984447025056046f, + -0.1258098052766725f, + -0.40245475467767916f, + 1.78188906675019f, + -1.1544387954232302f, + -0.41768781481273387f, + 0.6791211165341995f, + -0.4175127856183446f, + -0.07353219159767788f, + -0.2888813577574072f, + -0.7107767892597061f, + -1.0450031091195449f, + 0.0f, + -0.9221599545079143f, + -0.6747876356740621f, + 0.30241454354872105f, + 0.4924965303373908f, + -0.14042722740054084f, + 0.27744210409350445f, + -0.14788270997426836f, + -0.9081467469237995f, + -0.04513115674995093f, + -0.5254168669125793f, + -0.6999012037974789f, + 0.434661246306547f, + -0.7193303957246092f, + -0.9117952623409744f, + -1.5097267865916142f, + -0.20779888103770922f, + 0.4935562480901218f, + 0.0f, + 0.18303393908923593f, + 0.34753722677570037f, + 0.29291001533177663f, + 0.3832351878354224f, + 0.3295194956120599f, + -0.32398033003617527f, + -0.31570906736433746f, + 0.23657779050372962f, + 0.9510794465234161f, + -0.5122243902568278f, + 0.08652112725315658f, + 0.2246634353717998f, + -0.9032595595582497f, + -0.8936484034533545f, + 0.6012969720865752f, + -0.6454216646117924f, + -1.1753786049658332f, + 0.0f, + -0.4360545677728656f, + -0.6586237455328507f, + -0.34347301697886656f, + -0.8909724651992144f, + -0.24378721818350263f, + 0.6179733359297576f, + 0.0661661181742234f, + -0.14120142044993794f, + -0.07732699885498932f, + 1.0221355882357506f, + 0.44514798994115284f, + -0.7371569579959046f, + -0.7212499572378936f, + 0.7453626921081045f, + 0.5478757761345768f, + -0.39411232789985384f, + 0.7200542656743857f, + 0.0f, + -0.11790869453118827f, + -0.12317030713581928f, + -0.4207902738133338f, + 0.15895105878327986f, + 0.304261777102111f, + 0.11450744587017621f, + -0.11470709991317944f, + 0.5949222371739038f, + 0.6549518619412444f, + -0.24390606570422838f, + -0.4212796009440803f, + -0.6269666206320964f, + -0.5421193969807078f, + -0.12297772128652287f, + 0.021517257619930424f, + 0.25462855095544523f, + -0.22107798187348246f, + 0.0f, + 0.5204516300095662f, + 0.2837402841862462f, + 0.11310823283285916f, + 0.8944351685018025f, + 0.17487203235834015f, + -0.5271221928634433f, + -0.19516594503423199f, + 0.452456617580365f, + 1.2456272242706414f, + 0.24166615894862817f, + 0.09411429305204502f, + -0.2730072283327243f, + -0.8129383770918172f, + -0.24093254193486136f, + 0.5696499174142177f, + -0.11110805836073044f, + -0.3968204166235694f, + 0.0f, + -0.04388165369378549f, + -0.005631266017272595f, + -0.02574211858479705f, + 0.06230399626660669f, + 0.17677671232932785f, + 0.5172871274400965f, + 0.4919150085620063f, + -1.597656637582941f, + 0.02415185715719143f, + -0.17945446376668306f, + -0.39340600199798886f, + 0.25013205256886845f, + 0.05972330340308685f, + 0.1359911505596489f, + -0.02341033271820833f, + 0.15726074644063684f, + 0.47512625913020357f, + 0.0f, + 0.7327341664835779f, + -0.3689092312320013f, + 0.4571824787436036f, + 0.6215465537945456f, + 0.0944111296842023f, + -0.12571956176607574f, + -0.2507235674395462f, + -0.09579602654351593f, + 1.4463357293728496f, + 0.749153535856049f, + -0.5553955120807588f, + -0.09622771929369946f, + -0.2598697420394813f, + -0.964691815299676f, + -0.8289963178173902f, + 0.7112949291983329f, + -0.8667009730492162f, + 0.0f, + -0.48698304169042794f, + 
-0.18786095669893707f, + -0.11425249263203247f, + -0.3693391011684809f, + 0.09933145842585253f, + 0.2568559685298844f, + 0.7048512233651738f, + 0.6056238412407038f, + -0.4355558119826642f, + 0.17318931883915484f, + 0.6481333496429564f, + -0.45728823054344486f, + -0.006325004538589701f, + 0.45609864075494927f, + -0.6199385981116988f, + 0.035105808783046165f, + 0.1203147963894839f, + 0.0f, + 0.383402190836527f, + 0.048429009055370106f, + 0.5887186439275204f, + -0.20538767641607814f, + -0.031237879611002117f, + 0.3140759860883231f, + 0.24447070584999556f, + 0.7271263905705878f, + 0.8432799162434237f, + -0.11530577554199217f, + -0.7781023892314718f, + 0.05359488822710336f, + 0.5624870388700809f, + 0.5134656523208906f, + 0.18304041423438375f, + -0.04237421156328257f, + -0.20759809886942207f, + 0.0f, + -0.06249337454975615f, + 0.10081284533873777f, + 0.3894374350259183f, + 1.518217777528342f, + -0.9100037950171563f, + 0.17796906121831477f, + -0.2892167255357892f, + 0.6117902467884032f, + 0.13332120964959573f, + -0.3487155932849374f, + -0.32920583745734694f, + 0.08242631209809854f, + -0.24920225708110588f, + 0.8401757259392635f, + 0.11729108681358365f, + 0.11222925752499184f, + -0.027078490721459958f, + 0.0f, + 0.726132375517389f, + 0.72220359881096f, + 0.5721582611845177f, + 0.15139162075524315f, + 0.6676549461551197f, + -0.321449586554697f, + -0.10141104515219895f, + -0.09711123988777906f, + 0.9623356184776928f, + -0.7941822373167173f, + -0.9373923554119346f, + 0.4573241832354059f, + -0.42029139056126147f, + 0.2675223459380999f, + -0.5487300191551386f, + 0.2236621891916084f, + 0.11692039230044018f, + 0.0f, + 0.1758399202780961f, + 0.676447587678781f, + 0.5945412815881029f, + 0.5669863357359594f, + 0.8433565415303922f, + -0.30300550790708036f, + -0.43332881999693673f, + -0.4996522695731392f, + -0.2084930815451962f, + 0.27765278702463786f, + 1.0886848763946915f, + -0.0739433655813831f, + -0.4762801579229192f, + -0.2490825339320731f, + -1.8820479350439439f, + -0.4251592225775914f, + -0.3992922365484464f, + 0.0f, + 0.19598917760218867f, + 0.4860238022746914f, + 0.3364528828641281f, + 0.3350950865226741f, + 0.2773654548632006f, + -0.30547262140782566f, + 0.028649620490728344f, + -0.11763407628280315f, + 0.6237318502627169f, + -0.3958952632477945f, + 0.14797171297835243f, + 0.45821729624747465f, + -0.8687137170773626f, + 0.06989667196937126f, + -0.5752606929478727f, + 0.16986945686358412f, + 0.6925071596817824f, + 0.0f, + 0.4991250796183003f, + 0.03424654896322111f, + 0.6153698611882319f, + 0.5070872444849457f, + 0.43615747516328135f, + -0.7870352838659244f, + -0.6424101231965247f, + -0.7005774876651399f, + 0.79983115431488f, + 0.15720357955596242f, + -1.408372612176309f, + -0.039294695217213765f, + 0.6979415372962309f, + 0.27403316751965656f, + 1.2844596102619275f, + -0.2781534150257364f, + 0.3248437714908865f, + 0.0f, + 0.4364362371752831f, + -0.2548580911485434f, + -0.19578001373349452f, + -0.04597194387828005f, + -0.010035156855533233f, + 0.0415941475251266f, + 0.07929549739797387f, + -0.060629652912508866f, + 0.5977303008711333f, + -1.4404008068066554f, + 0.8555694790197376f, + -0.03693438534401856f, + 0.17761411164512408f, + -0.11858304304109235f, + -1.4241324353471327f, + 0.1533849765389186f, + 0.7650643783126995f, + 0.0f, + -0.0639949379280401f, + 0.4288617817939563f, + 0.4235508646885404f, + 0.3419843254383798f, + -0.015992360660098768f, + -0.773247697505441f, + -0.4908452922015917f, + 0.9868134897291486f, + -0.5078689994742608f, + 1.05632043744864f, + -0.38867419409275117f, + 
-0.0065547696858664194f,
+                                                 -0.3056003173415037f,
+                                                 -0.333762331930102f,
+                                                 0.4459671174011671f,
+                                                 0.08219092584580244f,
+                                                 -0.08099158579518179f,
+                                                 0.0f,
+                                                 -0.1568180656346373f,
+                                                 -0.061962372393910135f,
+                                                 0.14065868174859464f,
+                                                 -0.055925712798972765f,
+                                                 0.05136117465820622f,
+                                                 0.0907831030477633f,
+                                                 0.19518110495319604f,
+                                                 -0.7470794578145956f,
+                                                 1.5945999734733545f,
+                                                 -0.4351697502345834f,
+                                                 -0.33253649399571805f };
+
+const float av1_mv_prec_nn_bias_layer_0[] = {
+  -0.651213833993862f, -1.1243309933417809f, -0.2123880023097051f,
+  0.23095477452877616f, -0.6668057665893545f, 0.3082268148379634f,
+  -0.3344916753975844f, -0.20920185606857844f, 0.6057933917964854f,
+  0.5031857662559803f, -1.5380096313468152f, -0.4457245344804041f,
+  1.82368055812373f, 0.7973912064077963f, 0.25706500555622913f,
+  0.1394695119825382f, 0.4508811973450553f, -0.5408959545111782f,
+  1.064829233697863f, 0.3733268644246235f, 1.1173169029905483f,
+  -0.2012817466400134f, -0.16628447748302294f, 1.3086000088940826f,
+  0.7267092979664235f, -0.9097857006590555f, -0.7564259343863077f,
+  -0.49844128036716173f, -0.4675729246975423f, -0.03626154526362181f,
+  -0.41957330902404616f, -0.9658160514319954f
+};
+
+const float av1_mv_prec_nn_weights_layer_1[] = {
+  1.5017296484510276f, 1.044216918060133f, -1.066541411740906f,
+  -0.7762965171172661f, -0.9814396609661653f, 0.9334065847340715f,
+  0.7117244268817873f, -0.7695942296628597f, 0.7892157680137047f,
+  -0.5786309358654476f, -2.4444494892027264f, 1.1666759262637185f,
+  -0.9699580532370483f, 0.5849682956422552f, -1.0372272986941953f,
+  -0.5005014627824439f, 1.1816204711740521f, -1.2204867615892114f,
+  0.4510263977504913f, 0.35567865078585165f, -0.7811389330738839f,
+  -0.6643977800301099f, -0.6283287371705794f, 0.790873821018048f,
+  0.8861643352684585f, 0.6438840651522237f, 0.6677191546466089f,
+  0.9703715021995785f, 1.250893534236489f, 0.7733742028067933f,
+  -1.249673977776904f, -1.2890127265725608f
+};
+
+const float av1_mv_prec_nn_bias_layer_1[] = { -0.341771735378258f };
+
+static const NN_CONFIG av1_mv_prec_dnn_config = {
+  NUM_DNN_FEATURES,
+  NUM_LOGITS,
+  NUM_DNN_LAYERS,
+  { MV_PREC_LAYER_SIZE_0 },
+  {
+    av1_mv_prec_nn_weights_layer_0,
+    av1_mv_prec_nn_weights_layer_1,
+  },
+  {
+    av1_mv_prec_nn_bias_layer_0,
+    av1_mv_prec_nn_bias_layer_1,
+  },
+};
+#undef NUM_DNN_LAYERS
+#undef NUM_DNN_FEATURES
+#undef MV_PREC_LAYER_SIZE_0
+#undef NUM_LOGITS
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_
diff --git a/libs/libaom/src/av1/encoder/ml.c b/libs/libaom/src/av1/encoder/ml.c
new file mode 100644
index 000000000..57228ec91
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/ml.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/encoder/ml.h"
+
+void av1_nn_output_prec_reduce(float *const output, int num_output) {
+  const int prec_bits = 11;
+  const int prec = 1 << prec_bits;
+  const float inv_prec = (float)(1.0 / prec);
+  for (int i = 0; i < num_output; i++) {
+    output[i] = ((int)(output[i] * prec + 0.5)) * inv_prec;
+  }
+}
+
+// Calculate prediction based on the given input features and neural net
+// config. Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each
+// hidden layer.
+void av1_nn_predict_c(const float *input_nodes,
+                      const NN_CONFIG *const nn_config, int reduce_prec,
+                      float *const output) {
+  int num_input_nodes = nn_config->num_inputs;
+  int buf_index = 0;
+  float buf[2][NN_MAX_NODES_PER_LAYER];
+
+  // Propagate hidden layers.
+  const int num_layers = nn_config->num_hidden_layers;
+  assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+  for (int layer = 0; layer < num_layers; ++layer) {
+    const float *layer_weights = nn_config->weights[layer];
+    const float *layer_bias = nn_config->bias[layer];
+    float *output_nodes = buf[buf_index];
+    const int num_output_nodes = nn_config->num_hidden_nodes[layer];
+    assert(num_output_nodes < NN_MAX_NODES_PER_LAYER);
+    for (int node = 0; node < num_output_nodes; ++node) {
+      float val = layer_bias[node];
+      for (int i = 0; i < num_input_nodes; ++i)
+        val += layer_weights[node * num_input_nodes + i] * input_nodes[i];
+      // ReLU as activation function.
+      val = val > 0.0f ? val : 0.0f;  // Could use AOMMAX().
+      output_nodes[node] = val;
+    }
+    num_input_nodes = num_output_nodes;
+    input_nodes = output_nodes;
+    buf_index = 1 - buf_index;
+  }
+
+  // Final output layer.
+  const float *layer_weights = nn_config->weights[num_layers];
+  const float *layer_bias = nn_config->bias[num_layers];
+  for (int node = 0; node < nn_config->num_outputs; ++node) {
+    float val = layer_bias[node];
+    for (int i = 0; i < num_input_nodes; ++i)
+      val += layer_weights[node * num_input_nodes + i] * input_nodes[i];
+    output[node] = val;
+  }
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
+
+#if CONFIG_NN_V2
+// Applies the ReLU activation to one fc layer:
+// output[i] = Max(input[i], 0.0f)
+static float *nn_relu(const float *input, FC_LAYER *layer) {
+  for (int i = 0; i < layer->num_outputs; ++i) {
+    layer->output[i] = AOMMAX(input[i], 0.0f);
+  }
+
+  return layer->output;
+}
+
+// Applies the sigmoid activation to one fc layer:
+// output[i] = 1/(1+exp(-input[i]))
+static float *nn_sigmoid(const float *input, FC_LAYER *layer) {
+  for (int i = 0; i < layer->num_outputs; ++i) {
+    const float tmp = AOMMIN(AOMMAX(input[i], -10.0f), 10.0f);
+    layer->output[i] = 1.0f / (1.0f + expf(-tmp));
+  }
+
+  return layer->output;
+}
+
+// Forward prediction in one fc layer, used in function av1_nn_predict_v2
+static float *nn_fc_forward(const float *input, FC_LAYER *layer) {
+  const float *weights = layer->weights;
+  const float *bias = layer->bias;
+  assert(layer->num_outputs < NN_MAX_NODES_PER_LAYER);
+  // fc
+  for (int node = 0; node < layer->num_outputs; ++node) {
+    float val = bias[node];
+    for (int i = 0; i < layer->num_inputs; ++i) val += weights[i] * input[i];
+    layer->output[node] = val;
+    weights += layer->num_inputs;
+  }
+
+  // activation
+  switch (layer->activation) {
+    case NONE: return layer->output;
+    case RELU: return nn_relu(layer->output, layer);
+    case SIGMOID: return nn_sigmoid(layer->output, layer);
+    case SOFTSIGN:
+      assert(0 && "Softsign is not supported in NN.");  // TODO
+      return NULL;
+    default:
+      assert(0 && "Unknown activation");
+      return NULL;
+  }
+}
+
+void av1_nn_predict_v2(const float *feature, NN_CONFIG_V2 *nn_config,
+                       int reduce_prec, float *output) {
+  const float *input_nodes = feature;
+
+  // Propagate the layers.
+  const int num_layers = nn_config->num_hidden_layers;
+  assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+  for (int i = 0; i < num_layers; ++i) {
+    input_nodes = nn_fc_forward(input_nodes, nn_config->layer + i);
+    assert(nn_config->layer[i + 1].num_inputs ==
+           nn_config->layer[i].num_outputs);
+  }
+
+  // Final layer
+  input_nodes = nn_fc_forward(input_nodes, nn_config->layer + num_layers);
+  assert(nn_config->layer[num_layers].num_outputs == nn_config->num_logits);
+  // Copy the final layer output
+  memcpy(output, input_nodes, sizeof(*input_nodes) * nn_config->num_logits);
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_logits);
+}
+#endif  // CONFIG_NN_V2
+
+void av1_nn_softmax(const float *input, float *output, int n) {
+  // Softmax function is invariant to adding the same constant
+  // to all input values, so we subtract the maximum input to avoid
+  // possible overflow.
+  float max_inp = input[0];
+  for (int i = 1; i < n; i++) max_inp = AOMMAX(max_inp, input[i]);
+  float sum_out = 0.0f;
+  for (int i = 0; i < n; i++) {
+    // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors.
+    const float normalized_input = AOMMAX(input[i] - max_inp, -10.0f);
+    output[i] = (float)exp(normalized_input);
+    sum_out += output[i];
+  }
+  for (int i = 0; i < n; i++) output[i] /= sum_out;
+}
diff --git a/libs/libaom/src/av1/encoder/ml.h b/libs/libaom/src/av1/encoder/ml.h
new file mode 100644
index 000000000..62d543d6b
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/ml.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ML_H_
+#define AOM_AV1_ENCODER_ML_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/av1_rtcd.h"
+
+#define NN_MAX_HIDDEN_LAYERS 10
+#define NN_MAX_NODES_PER_LAYER 128
+
+struct NN_CONFIG {
+  int num_inputs;         // Number of input nodes, i.e. features.
+  int num_outputs;        // Number of output nodes.
+  int num_hidden_layers;  // Number of hidden layers, maximum 10.
+  // Number of nodes for each hidden layer.
+  int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS];
+  // Weight parameters, indexed by layer.
+  const float *weights[NN_MAX_HIDDEN_LAYERS + 1];
+  // Bias parameters, indexed by layer.
+  const float *bias[NN_MAX_HIDDEN_LAYERS + 1];
+};
+// Typedef from struct NN_CONFIG to NN_CONFIG is in rtcd_defs
+
+#if CONFIG_NN_V2
+// Fully-connected layer configuration
+struct FC_LAYER {
+  const int num_inputs;   // Number of input nodes, i.e. features.
+  const int num_outputs;  // Number of output nodes.
+
+  float *weights;               // Weight parameters.
+  float *bias;                  // Bias parameters.
+  const ACTIVATION activation;  // Activation function.
+
+  float *output;  // The output array.
+  float *dY;      // Gradient of outputs.
+  float *dW;      // Gradient of weights.
+  float *db;      // Gradient of bias.
+};
+
+// NN configuration structure, version 2
+struct NN_CONFIG_V2 {
+  const int num_hidden_layers;  // Number of hidden layers, max = 10.
+  FC_LAYER layer[NN_MAX_HIDDEN_LAYERS + 1];  // The layer array
+  const int num_logits;  // Number of output nodes.
+  float *logits;         // Raw prediction (same as output of final layer)
+  const LOSS loss;       // Loss function
+};
+
+// Calculate prediction based on the given input features and neural net
+// config. Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each
+// hidden layer.
+void av1_nn_predict_v2(const float *features, NN_CONFIG_V2 *nn_config,
+                       int reduce_prec, float *output);
+#endif  // CONFIG_NN_V2
+
+// Applies the softmax normalization function to the input
+// to get a valid probability distribution in the output:
+// output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k]))
+void av1_nn_softmax(const float *input, float *output, int n);
+
+// Applies a precision reduction to the output of av1_nn_predict to prevent
+// mismatches between C and SIMD implementations.
+void av1_nn_output_prec_reduce(float *const output, int num_output);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ML_H_
diff --git a/libs/libaom/src/av1/encoder/mode_prune_model_weights.h b/libs/libaom/src/av1/encoder/mode_prune_model_weights.h
new file mode 100644
index 000000000..98ec36808
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/mode_prune_model_weights.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define NUM_HIDDEN_LAYERS_12 1 +#define NUM_FEATURES_12 6 +#define NUM_LAYER_0_UNITS_12 24 +#define NUM_LOGITS_12 2 + +static const float av1_intrap_hiddenlayer_0_kernel_12[] = { + 7.28372f, -1.3333898f, -1.3180022f, -0.007156151f, -0.40799126f, + -0.57538104f, -31.81647f, 6.7057495f, 6.351472f, -0.029544508f, + 0.026801195f, 1.12863f, -0.70769817f, -0.24183524f, 0.0649113f, + -0.7189517f, 0.21791299f, 0.12840256f, -0.56424767f, 0.16924907f, + 0.4605501f, -0.170895f, -0.60358995f, -0.15383226f, -4.0523643f, + 0.6961917f, 1.3100256f, -0.4189354f, 0.37264112f, -0.14555685f, + 10.628014f, 8.184437f, 8.941916f, -0.011731001f, -0.45127156f, + 0.42704004f, 36.84277f, 8.988796f, 8.844238f, 0.00030091056f, + -0.022038324f, 1.3566176f, -8.863219f, -0.84811693f, -1.0908632f, + 0.00023130262f, -1.0698471f, -6.755927f, 7.1711984f, 4.7216063f, + 3.5099216f, -0.6650184f, 0.5935173f, -0.6696286f, 11.8595295f, + 0.3001874f, 0.29822728f, 0.04319222f, -1.203178f, 1.1210147f, + 0.035045594f, -0.20559944f, -0.015388541f, -0.7857941f, -0.94100875f, + -0.1278549f, -19.22603f, 7.9466896f, 6.5048656f, -0.22195444f, + 0.19061874f, 1.3927288f, -8.896529f, -0.48146892f, -1.6098932f, + -0.0030235797f, -0.6533787f, -2.1333003f, -22.256454f, -4.934058f, + -4.4707212f, -0.015831878f, -0.4243649f, -2.776269f, -0.23762038f, + 0.1820098f, -0.51865315f, -1.1893421f, 0.34969202f, 0.10636194f, + 14.545696f, 1.3849198f, 2.6815193f, -0.5145498f, 0.45948258f, + -0.8842355f, -0.9111363f, -0.39652422f, 0.077266276f, -0.68084997f, + 0.4593515f, -0.28872707f, -6.936231f, 1.12253f, 1.7616503f, + -0.014069137f, -0.0052156276f, -4.5095444f, 6.2076726f, -0.058755957f, + -0.4675936f, -0.13039507f, 0.12094394f, -0.07285393f, 68.26125f, + 7.4893136f, 8.770954f, 0.020274093f, -0.027877754f, 1.6579602f, + -0.1825479f, 0.34832543f, 0.07472531f, -0.44812247f, -1.0941806f, + -0.16749863f, 1.1394324f, 0.47983396f, -0.99983627f, -0.00064249727f, + -1.3345739f, -0.057157427f, -18.14875f, 16.506035f, 15.539248f, + 0.013191509f, -0.021674965f, -25.006235f, 0.51220596f, 0.7334426f, + 0.81836903f, -1.0443225f, 0.4459505f, -1.2045046f +}; + +static const float av1_intrap_hiddenlayer_0_bias_12[] = { + -4.154915f, 14.33833f, 0.0f, 0.0f, 2.0440118f, 12.40922f, + -16.77514f, 0.5879813f, 3.2305415f, 0.8303539f, 0.0f, 14.488708f, + 2.94393f, 1.874383f, 0.0f, -0.53140444f, 0.0f, 1.8456234f, + -0.55427986f, -19.856262f, 0.0f, 0.17281002f, 48.31631f, 0.0f +}; + +static const float av1_intrap_logits_kernel_12[] = { + 0.26843873f, -0.09576241f, 0.34427166f, 0.09914787f, -0.10275399f, + 0.02999484f, -0.1467772f, 0.11594324f, 0.29200763f, 0.0067976206f, + 0.050393578f, -0.018694371f, 0.3333476f, 0.2127221f, 0.35128218f, + 0.19968672f, 0.08099991f, 0.084850654f, -0.16045967f, 0.30286232f, + 0.6164765f, -0.27140254f, 0.08210814f, 0.34852806f, 0.25028184f, + -0.12188078f, 0.16310331f, 0.31253803f, -0.10792341f, 0.065858394f, + -0.1349708f, 0.08948815f, 0.31905392f, 0.03680656f, -0.05040944f, + -0.051539157f, 0.3211852f, 0.2137136f, 0.45037416f, 0.22748767f, + -0.10978614f, 0.06475646f, -0.16954158f, 0.32831904f, 0.16479677f, + -0.30020145f, 0.066221856f, 0.37213042f +}; + +static const float av1_intrap_logits_bias_12[] = { 0.95783f, -0.95823103f }; + +static const NN_CONFIG av1_intrap_nn_config = { + NUM_FEATURES_12, + NUM_LOGITS_12, + NUM_HIDDEN_LAYERS_12, + { + NUM_LAYER_0_UNITS_12, + }, + { + 
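+      // Layer order matches NN_CONFIG::weights[]: index 0 holds the
+      // hidden-layer kernel, index 1 the output (logits) kernel; the bias
+      // arrays below follow the same layout.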
av1_intrap_hiddenlayer_0_kernel_12, + av1_intrap_logits_kernel_12, + }, + { + av1_intrap_hiddenlayer_0_bias_12, + av1_intrap_logits_bias_12, + }, +}; + +#undef NUM_HIDDEN_LAYERS_12 +#undef NUM_FEATURES_12 +#undef NUM_LAYER_0_UNITS_12 +#undef NUM_LOGITS_12 + +#define NUM_HIDDEN_LAYERS_15 1 +#define NUM_FEATURES_15 6 +#define NUM_LAYER_0_UNITS_15 24 +#define NUM_LOGITS_15 2 + +static const float av1_intraph_hiddenlayer_0_kernel_15[] = { + -0.77480125f, 0.3219551f, -0.015702145f, -0.5310235f, 0.5254026f, + -1.1522819f, 2.682016f, 0.08001052f, -0.2539285f, 0.04711023f, + -0.81296307f, 0.2675382f, 0.1952474f, -0.0664705f, 1.2989824f, + -0.3150117f, -0.8022715f, 0.045423955f, -27.584324f, -2.5608704f, + -3.2280366f, 0.05272543f, -0.47141576f, -0.07644298f, -53.77942f, + -22.393923f, -23.027853f, -0.00015186476f, -0.010696465f, 2.7064638f, + -22.776028f, 11.514891f, 11.138167f, -0.001243723f, -0.4802433f, + -8.758646f, 0.26398206f, -0.23485385f, 0.27586034f, -0.004954741f, + -0.4935232f, -0.017607696f, 69.56049f, -1.1756641f, -0.052366666f, + -0.38052833f, 0.32474658f, 0.04634263f, 0.8583235f, -0.528438f, + -0.7868907f, -0.4757781f, 0.4620985f, -0.70621157f, 231.40195f, + 6.805205f, 9.420295f, 0.02585775f, -0.03480937f, 1.3577378f, + 0.1758226f, 15.056758f, 14.437874f, -0.1305005f, 0.115103304f, + 0.21297209f, 55.821743f, -6.611156f, -6.8552365f, -0.011928095f, + -0.2042175f, 1.2557873f, -1.0722278f, -0.2683614f, 0.48318478f, + -0.73739994f, 0.54055226f, -0.03224738f, -0.06767959f, -0.21015017f, + 0.29171246f, -0.6937296f, -1.2342545f, -0.41278538f, -37.9365f, + 17.68424f, 16.263042f, -0.074828684f, 0.06607806f, -0.16763286f, + 13.594707f, 0.6152676f, -0.4371223f, -0.8365592f, 0.8273623f, + -1.2126317f, 0.1216157f, -1.3002136f, -0.18856938f, -0.2589358f, + -0.76897144f, 0.21777137f, -122.25033f, -0.23490006f, -3.1238277f, + -0.13916978f, 0.08576391f, -1.7391548f, -116.24812f, 14.906071f, + 13.468357f, 0.02332889f, -0.034617376f, -18.506111f, 0.7500542f, + -1.1882535f, 0.40848416f, -0.28434393f, -0.71471655f, -0.29188696f, + -0.46588746f, -0.17324813f, -0.62460244f, -1.1801276f, 0.28993344f, + -0.22072886f, 129.2688f, -0.33782578f, -0.34836572f, -0.034112718f, + -0.023666814f, -0.5865087f, -33.484146f, 1.1431375f, 0.56056374f, + -0.0049730353f, -0.24347587f, -1.3003352f, 0.88973033f, 0.8499571f, + -0.5678484f, -0.39009875f, -0.062105156f, -0.13965102f +}; + +static const float av1_intraph_hiddenlayer_0_bias_15[] = { + 0.0f, -0.2926711f, 0.0f, -1.0303509f, -27.459345f, 12.412848f, + 0.0f, -2.5971522f, -0.02733541f, -19.881912f, 14.391992f, -8.249469f, + 0.0f, 0.0f, 13.676118f, -0.6472994f, -0.07189449f, 1.1986839f, + 52.479107f, 0.0f, 0.0f, -3.0187025f, 1.4435643f, 0.0f +}; + +static const float av1_intraph_logits_kernel_15[] = { + 0.05390722f, -0.06859513f, 0.036842898f, 0.190772f, 0.13623567f, + 0.09321194f, 0.2314745f, -0.13958375f, -0.3058229f, -0.0104543045f, + 0.11336068f, -0.276115f, 0.00470723f, -0.49123898f, -0.15988174f, + 0.087681435f, 0.022517204f, 0.073877744f, 0.2968856f, -0.1401399f, + -0.38788354f, -0.26005393f, -0.39564916f, -0.16195515f, 0.2680102f, + -0.032179773f, -0.35758728f, 0.25819537f, 0.11468631f, 0.13573235f, + -0.2672175f, 0.016490124f, 0.048118807f, 0.020319486f, 0.07892215f, + -0.21821865f, 0.08434734f, 0.3129456f, -0.18215221f, 0.08884877f, + -0.35621428f, 0.11405768f, 0.27370325f, 0.14956686f, 0.01604587f, + -0.18334487f, -0.42385718f, -0.08033409f +}; + +static const float av1_intraph_logits_bias_15[] = { 0.83619016f, -0.8340626f }; + +static const NN_CONFIG 
av1_intrap_hd_nn_config = { + NUM_FEATURES_15, + NUM_LOGITS_15, + NUM_HIDDEN_LAYERS_15, + { + NUM_LAYER_0_UNITS_15, + }, + { + av1_intraph_hiddenlayer_0_kernel_15, + av1_intraph_logits_kernel_15, + }, + { + av1_intraph_hiddenlayer_0_bias_15, + av1_intraph_logits_bias_15, + }, +}; + +#undef NUM_HIDDEN_LAYERS_15 +#undef NUM_FEATURES_15 +#undef NUM_LAYER_0_UNITS_15 +#undef NUM_LOGITS_15 + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_ diff --git a/libs/libaom/src/av1/encoder/model_rd.h b/libs/libaom/src/av1/encoder/model_rd.h new file mode 100644 index 000000000..c353c8f85 --- /dev/null +++ b/libs/libaom/src/av1/encoder/model_rd.h @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_MODEL_RD_H_ +#define AOM_AV1_ENCODER_MODEL_RD_H_ + +#include "aom/aom_integer.h" +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/pustats.h" +#include "av1/encoder/rdopt_utils.h" +#include "aom_ports/system_state.h" +#include "config/aom_dsp_rtcd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// 0: Legacy model +// 1: Curve fit model +// 2: Surface fit model +// 3: DNN regression model +// 4: Full rd model +#define MODELRD_TYPE_INTERP_FILTER 1 +#define MODELRD_TYPE_TX_SEARCH_PRUNE 1 +#define MODELRD_TYPE_MASKED_COMPOUND 1 +#define MODELRD_TYPE_INTERINTRA 1 +#define MODELRD_TYPE_INTRA 1 +#define MODELRD_TYPE_MOTION_MODE_RD 1 + +typedef void (*model_rd_for_sb_type)(const AV1_COMP *const cpi, + BLOCK_SIZE bsize, MACROBLOCK *x, + MACROBLOCKD *xd, int plane_from, + int plane_to, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, + int64_t *skip_sse_sb, int *plane_rate, + int64_t *plane_sse, int64_t *plane_dist); +typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist); + +static int64_t calculate_sse(MACROBLOCKD *const xd, + const struct macroblock_plane *p, + struct macroblockd_plane *pd, const int bw, + const int bh) { + int64_t sse = 0; + const int shift = xd->bd - 8; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + bw, bh); + } else { + sse = + aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); + } +#else + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); +#endif + sse = ROUND_POWER_OF_TWO(sse, shift * 2); + return sse; +} + +static AOM_INLINE int64_t compute_sse_plane(MACROBLOCK *x, MACROBLOCKD *xd, + int plane, const BLOCK_SIZE bsize) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int bw, bh; + const struct macroblock_plane *const p = &x->plane[plane]; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, + &bh); + + int64_t sse = calculate_sse(xd, p, pd, bw, bh); + + 
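+  // calculate_sse() already rescaled the result back to 8-bit precision via
+  // ROUND_POWER_OF_TWO(sse, shift * 2).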
+  return sse;
+}
+
+static AOM_INLINE void model_rd_from_sse(const AV1_COMP *const cpi,
+                                         const MACROBLOCK *const x,
+                                         BLOCK_SIZE plane_bsize, int plane,
+                                         int64_t sse, int num_samples,
+                                         int *rate, int64_t *dist) {
+  (void)num_samples;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+
+  // Fast approximation of the modeling function.
+  if (cpi->sf.rd_sf.simple_model_rd_from_var) {
+    const int64_t square_error = sse;
+    int quantizer = p->dequant_QTX[1] >> dequant_shift;
+    if (quantizer < 120)
+      *rate = (int)AOMMIN(
+          (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT),
+          INT_MAX);
+    else
+      *rate = 0;
+    assert(*rate >= 0);
+    *dist = (square_error * quantizer) >> 8;
+  } else {
+    av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize],
+                                 p->dequant_QTX[1] >> dequant_shift, rate,
+                                 dist);
+  }
+  *dist <<= 4;
+}
+
+// Fits curves for rate and distortion using log2(sse_norm / qstep^2) as the
+// feature.
+static AOM_INLINE void model_rd_with_curvfit(const AV1_COMP *const cpi,
+                                             const MACROBLOCK *const x,
+                                             BLOCK_SIZE plane_bsize, int plane,
+                                             int64_t sse, int num_samples,
+                                             int *rate, int64_t *dist) {
+  (void)cpi;
+  (void)plane_bsize;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int qstep = AOMMAX(p->dequant_QTX[1] >> dequant_shift, 1);
+
+  if (sse == 0) {
+    if (rate) *rate = 0;
+    if (dist) *dist = 0;
+    return;
+  }
+  aom_clear_system_state();
+  const double sse_norm = (double)sse / num_samples;
+  const double qstepsqr = (double)qstep * qstep;
+  const double xqr = log2(sse_norm / qstepsqr);
+  double rate_f, dist_by_sse_norm_f;
+  av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, &rate_f,
+                       &dist_by_sse_norm_f);
+
+  const double dist_f = dist_by_sse_norm_f * sse_norm;
+  int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+  int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+  aom_clear_system_state();
+
+  // Check if skip is better
+  if (rate_i == 0) {
+    dist_i = sse << 4;
+  } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+             RDCOST(x->rdmult, 0, sse << 4)) {
+    rate_i = 0;
+    dist_i = sse << 4;
+  }
+
+  if (rate) *rate = rate_i;
+  if (dist) *dist = dist_i;
+}
+
+static AOM_INLINE void model_rd_for_sb(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum,
+    int *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
+    int64_t *plane_sse, int64_t *plane_dist) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
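+  // That division is folded into model_rd_from_sse() via
+  // p->dequant_QTX[1] >> dequant_shift, where dequant_shift is 3 (i.e. /8)
+  // at 8-bit depth.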
+ int plane; + const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + assert(bsize < BLOCK_SIZES_ALL); + + for (plane = plane_from; plane <= plane_to; ++plane) { + if (plane && !xd->is_chroma_ref) break; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + int64_t sse; + int rate; + int64_t dist; + + sse = calculate_sse(xd, p, pd, bw, bh); + + model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + assert(rate_sum >= 0); + } + + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + rate_sum = AOMMIN(rate_sum, INT_MAX); + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +static AOM_INLINE void model_rd_for_sb_with_curvfit( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum, + int *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate, + int64_t *plane_sse, int64_t *plane_dist) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
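+  // As in model_rd_for_sb(), the divide-by-8 is folded into the dequant
+  // shift that model_rd_with_curvfit() applies when it derives qstep.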
+ const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + for (int plane = plane_from; plane <= plane_to; ++plane) { + if (plane && !xd->is_chroma_ref) break; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int64_t dist, sse; + int rate; + int bw, bh; + const struct macroblock_plane *const p = &x->plane[plane]; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, + &bw, &bh); + + sse = calculate_sse(xd, p, pd, bw, bh); + model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, + &dist); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + } + + if (skip_txfm_sb) *skip_txfm_sb = rate_sum == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +enum { MODELRD_LEGACY, MODELRD_CURVFIT, MODELRD_TYPES } UENUM1BYTE(ModelRdType); + +static const model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = { + model_rd_for_sb, model_rd_for_sb_with_curvfit +}; + +static const model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = { + model_rd_from_sse, model_rd_with_curvfit +}; + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AV1_ENCODER_MODEL_RD_H_ diff --git a/libs/libaom/src/av1/encoder/motion_search_facade.c b/libs/libaom/src/av1/encoder/motion_search_facade.c new file mode 100644 index 000000000..8db1423e7 --- /dev/null +++ b/libs/libaom/src/av1/encoder/motion_search_facade.c @@ -0,0 +1,861 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_ports/system_state.h" + +#include "av1/common/reconinter.h" + +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/partition_strategy.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/tpl_model.h" + +#define RIGHT_SHIFT_MV(x) (((x) + 3 + ((x) >= 0)) >> 3) + +typedef struct { + FULLPEL_MV fmv; + int weight; +} cand_mv_t; + +static int compare_weight(const void *a, const void *b) { + const int diff = ((cand_mv_t *)a)->weight - ((cand_mv_t *)b)->weight; + if (diff < 0) + return 1; + else if (diff > 0) + return -1; + return 0; +} + +void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int ref_idx, int *rate_mv, + int search_range, inter_mode_info *mode_info, + int_mv *best_mv) { + MACROBLOCKD *xd = &x->e_mbd; + const AV1_COMMON *cm = &cpi->common; + const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params; + const int num_planes = av1_num_planes(cm); + MB_MODE_INFO *mbmi = xd->mi[0]; + struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; + int bestsme = INT_MAX; + const int ref = mbmi->ref_frame[ref_idx]; + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + if (scaled_ref_frame) { + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // full-pixel motion search code to be used without additional + // modifications. + for (int i = 0; i < num_planes; i++) { + backup_yv12[i] = xd->plane[i].pre[ref_idx]; + } + av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL, + num_planes); + } + + // Work out the size of the first step in the mv step search. + // 0 here is maximum length first step. 1 is AOMMAX >> 1 etc. + int step_param; + if (cpi->sf.mv_sf.auto_mv_step_size && cm->show_frame) { + // Take the weighted average of the step_params based on the last frame's + // max mv magnitude and that based on the best ref mvs of the current + // block for the given reference. + step_param = (av1_init_search_range(x->max_mv_context[ref]) + + mv_search_params->mv_step_param) / + 2; + } else { + step_param = mv_search_params->mv_step_param; + } + + if (cpi->sf.mv_sf.adaptive_motion_search && bsize < cm->seq_params.sb_size) { + int boffset = + 2 * (mi_size_wide_log2[cm->seq_params.sb_size] - + AOMMIN(mi_size_high_log2[bsize], mi_size_wide_log2[bsize])); + step_param = AOMMAX(step_param, boffset); + } + + if (cpi->sf.mv_sf.adaptive_motion_search) { + int bwl = mi_size_wide_log2[bsize]; + int bhl = mi_size_high_log2[bsize]; + int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4); + + if (tlevel < 5) { + step_param += 2; + step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 1); + } + + // prev_mv_sad is not setup for dynamically scaled frames. + if (cpi->oxcf.resize_mode != RESIZE_RANDOM) { + int i; + for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) { + if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) { + x->pred_mv[ref].row = 0; + x->pred_mv[ref].col = 0; + best_mv->as_int = INVALID_MV; + + if (scaled_ref_frame) { + // Swap back the original buffers before returning. 
+          for (int j = 0; j < num_planes; ++j)
+            xd->plane[j].pre[ref_idx] = backup_yv12[j];
+        }
+        return;
+      }
+    }
+  }
+}
+
+  const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;
+  FULLPEL_MV start_mv;
+  if (mbmi->motion_mode != SIMPLE_TRANSLATION)
+    start_mv = get_fullmv_from_mv(&mbmi->mv[0].as_mv);
+  else
+    start_mv = get_fullmv_from_mv(&ref_mv);
+
+  // cand stores start_mv and all possible MVs in a SB.
+  cand_mv_t cand[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB + 1] = {
+    { { 0, 0 }, 0 }
+  };
+  cand[0].fmv = start_mv;
+  int cnt = 1;
+  int total_weight = 0;
+
+  if (!cpi->sf.mv_sf.full_pixel_search_level &&
+      mbmi->motion_mode == SIMPLE_TRANSLATION) {
+    if (x->valid_cost_b) {
+      const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+      const int tplw = mi_size_wide[tpl_bsize];
+      const int tplh = mi_size_high[tpl_bsize];
+      const int nw = mi_size_wide[bsize] / tplw;
+      const int nh = mi_size_high[bsize] / tplh;
+
+      if (nw >= 1 && nh >= 1) {
+        const int of_h = mi_row % mi_size_high[cm->seq_params.sb_size];
+        const int of_w = mi_col % mi_size_wide[cm->seq_params.sb_size];
+        const int start = of_h / tplh * x->cost_stride + of_w / tplw;
+        int valid = 1;
+
+        // Assign large weight to start_mv, so it is always tested.
+        cand[0].weight = nw * nh;
+
+        for (int k = 0; k < nh; k++) {
+          for (int l = 0; l < nw; l++) {
+            const int_mv mv =
+                x->mv_b[start + k * x->cost_stride + l][ref - LAST_FRAME];
+            if (mv.as_int == INVALID_MV) {
+              valid = 0;
+              break;
+            }
+
+            const FULLPEL_MV fmv = { GET_MV_RAWPEL(mv.as_mv.row),
+                                     GET_MV_RAWPEL(mv.as_mv.col) };
+            int unique = 1;
+            for (int m = 0; m < cnt; m++) {
+              if (RIGHT_SHIFT_MV(fmv.row) == RIGHT_SHIFT_MV(cand[m].fmv.row) &&
+                  RIGHT_SHIFT_MV(fmv.col) == RIGHT_SHIFT_MV(cand[m].fmv.col)) {
+                unique = 0;
+                cand[m].weight++;
+                break;
+              }
+            }
+
+            if (unique) {
+              cand[cnt].fmv = fmv;
+              cand[cnt].weight = 1;
+              cnt++;
+            }
+          }
+          if (!valid) break;
+        }
+
+        if (valid) {
+          total_weight = 2 * nh * nw;
+          if (cnt > 2) qsort(cand, cnt, sizeof(cand[0]), &compare_weight);
+        }
+      }
+    }
+  }
+
+  // Further reduce the search range.
+  if (search_range < INT_MAX) {
+    const search_site_config *ss_cfg = &mv_search_params->ss_cfg[SS_CFG_SRC];
+    // Max step_param is ss_cfg->ss_count.
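+    // A larger step_param starts the search at a smaller radius, so raising
+    // it below shrinks the effective search range.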
+    if (search_range < 1) {
+      step_param = ss_cfg->ss_count;
+    } else {
+      while (ss_cfg->radius[ss_cfg->ss_count - step_param - 1] >
+                 (search_range << 1) &&
+             ss_cfg->ss_count - step_param - 1 > 0)
+        step_param++;
+    }
+  }
+
+  int cost_list[5];
+  int_mv second_best_mv;
+  best_mv->as_int = second_best_mv.as_int = INVALID_MV;
+
+  const search_site_config *src_search_sites =
+      &mv_search_params->ss_cfg[SS_CFG_SRC];
+  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
+                                     src_search_sites);
+
+  switch (mbmi->motion_mode) {
+    case SIMPLE_TRANSLATION: {
+      int sum_weight = 0;
+
+      for (int m = 0; m < cnt; m++) {
+        FULLPEL_MV smv = cand[m].fmv;
+        FULLPEL_MV this_best_mv, this_second_best_mv;
+
+        int thissme = av1_full_pixel_search(
+            smv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list),
+            &this_best_mv, &this_second_best_mv);
+
+        if (thissme < bestsme) {
+          bestsme = thissme;
+          best_mv->as_fullmv = this_best_mv;
+          second_best_mv.as_fullmv = this_second_best_mv;
+        }
+
+        sum_weight += cand[m].weight;
+        if (m >= 2 || 4 * sum_weight > 3 * total_weight) break;
+      }
+    } break;
+    case OBMC_CAUSAL:
+      bestsme = av1_obmc_full_pixel_search(start_mv, &full_ms_params,
+                                           step_param, &best_mv->as_fullmv);
+      break;
+    default: assert(0 && "Invalid motion mode!\n");
+  }
+
+  if (scaled_ref_frame) {
+    // Swap back the original buffers for subpel motion search.
+    for (int i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[ref_idx] = backup_yv12[i];
+    }
+  }
+
+  // Terminate search with the current ref_idx if we have already encountered
+  // another ref_mv in the drl such that:
+  //  1. The other drl has the same fullpel_mv during the SIMPLE_TRANSLATION
+  //     search process as the current fullpel_mv.
+  //  2. The rate needed to encode the current fullpel_mv is larger than that
+  //     for the other ref_mv.
+  if (cpi->sf.inter_sf.skip_repeated_full_newmv &&
+      mbmi->motion_mode == SIMPLE_TRANSLATION &&
+      best_mv->as_int != INVALID_MV) {
+    int_mv this_mv;
+    this_mv.as_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+    const int ref_mv_idx = mbmi->ref_mv_idx;
+    const int this_mv_rate =
+        av1_mv_bit_cost(&this_mv.as_mv, &ref_mv, x->nmv_vec_cost,
+                        x->mv_cost_stack, MV_COST_WEIGHT);
+    mode_info[ref_mv_idx].full_search_mv.as_int = this_mv.as_int;
+    mode_info[ref_mv_idx].full_mv_rate = this_mv_rate;
+
+    for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) {
+      // Check if the motion search result is the same as a previous result.
+      if (this_mv.as_int == mode_info[prev_ref_idx].full_search_mv.as_int) {
+        // Compare the rate cost
+        const int prev_rate_cost = mode_info[prev_ref_idx].full_mv_rate +
+                                   mode_info[prev_ref_idx].drl_cost;
+        const int this_rate_cost =
+            this_mv_rate + mode_info[ref_mv_idx].drl_cost;
+
+        if (prev_rate_cost <= this_rate_cost) {
+          // If the current rate_cost is no better than the previous
+          // rate_cost, terminate the search. Since av1_single_motion_search
+          // is only called by handle_new_mv in SIMPLE_TRANSLATION mode, we
+          // set the best_mv to INVALID mv to signal that we wish to terminate
+          // search for the current mode.
+ best_mv->as_int = INVALID_MV; + return; + } + } + } + } + + if (cpi->common.features.cur_frame_force_integer_mv) { + convert_fullmv_to_mv(best_mv); + } + + const int use_fractional_mv = + bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0; + if (use_fractional_mv) { + int_mv fractional_ms_list[3]; + av1_set_fractional_mv(fractional_ms_list); + int dis; /* TODO: use dis in distortion calculation later. */ + + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, + cost_list); + MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv); + + switch (mbmi->motion_mode) { + case SIMPLE_TRANSLATION: + if (cpi->sf.mv_sf.use_accurate_subpel_search) { + const int try_second = second_best_mv.as_int != INVALID_MV && + second_best_mv.as_int != best_mv->as_int; + const int best_mv_var = mv_search_params->find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &dis, + &x->pred_sse[ref], fractional_ms_list); + + if (try_second) { + MV this_best_mv; + subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv); + if (av1_is_subpelmv_in_range(&ms_params.mv_limits, + subpel_start_mv)) { + const int this_var = mv_search_params->find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &this_best_mv, &dis, + &x->pred_sse[ref], fractional_ms_list); + if (this_var < best_mv_var) best_mv->as_mv = this_best_mv; + } + } + } else { + mv_search_params->find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &dis, + &x->pred_sse[ref], NULL); + } + break; + case OBMC_CAUSAL: + av1_find_best_obmc_sub_pixel_tree_up(xd, cm, &ms_params, + subpel_start_mv, &best_mv->as_mv, + &dis, &x->pred_sse[ref], NULL); + break; + default: assert(0 && "Invalid motion mode!\n"); + } + } + *rate_mv = av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, x->nmv_vec_cost, + x->mv_cost_stack, MV_COST_WEIGHT); + + if (cpi->sf.mv_sf.adaptive_motion_search && + mbmi->motion_mode == SIMPLE_TRANSLATION) + x->pred_mv[ref] = best_mv->as_mv; +} + +void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int_mv *cur_mv, + const uint8_t *mask, int mask_stride, + int *rate_mv) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const int pw = block_size_wide[bsize]; + const int ph = block_size_high[bsize]; + const int plane = 0; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + // This function should only ever be called for compound modes + assert(has_second_ref(mbmi)); + const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] }; + const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] }; + int_mv ref_mv[2]; + int ite, ref; + + // Get the prediction block from the 'other' reference frame. + const int_interpfilters interp_filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + InterPredParams inter_pred_params; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // Do joint motion search in compound mode to get more accurate mv. + struct buf_2d backup_yv12[2][MAX_MB_PLANE]; + int last_besterr[2] = { INT_MAX, INT_MAX }; + const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = { + av1_get_scaled_ref_frame(cpi, refs[0]), + av1_get_scaled_ref_frame(cpi, refs[1]) + }; + + // Prediction buffer from second frame. 
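+  // Sized in bytes for the worst case (16-bit samples); get_buf_by_bd()
+  // returns a correctly typed view for the current bit depth.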
+ DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]); + uint8_t *second_pred = get_buf_by_bd(xd, second_pred16); + int_mv best_mv; + + // Allow joint search multiple times iteratively for each reference frame + // and break out of the search loop if it couldn't find a better mv. + for (ite = 0; ite < 4; ite++) { + struct buf_2d ref_yv12[2]; + int bestsme = INT_MAX; + int id = ite % 2; // Even iterations search in the first reference frame, + // odd iterations search in the second. The predictor + // found for the 'other' reference frame is factored in. + if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) { + if (cur_mv[id].as_int == init_mv[id].as_int) { + break; + } else { + int_mv cur_int_mv, init_int_mv; + cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3; + cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3; + init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3; + init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3; + if (cur_int_mv.as_int == init_int_mv.as_int) { + break; + } + } + } + for (ref = 0; ref < 2; ++ref) { + ref_mv[ref] = av1_get_ref_mv(x, ref); + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + if (scaled_ref_frame[ref]) { + int i; + for (i = 0; i < num_planes; i++) + backup_yv12[ref][i] = xd->plane[i].pre[ref]; + av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, + NULL, num_planes); + } + } + + assert(IMPLIES(scaled_ref_frame[0] != NULL, + cm->width == scaled_ref_frame[0]->y_crop_width && + cm->height == scaled_ref_frame[0]->y_crop_height)); + assert(IMPLIES(scaled_ref_frame[1] != NULL, + cm->width == scaled_ref_frame[1]->y_crop_width && + cm->height == scaled_ref_frame[1]->y_crop_height)); + + // Initialize based on (possibly scaled) prediction buffers. + ref_yv12[0] = xd->plane[plane].pre[0]; + ref_yv12[1] = xd->plane[plane].pre[1]; + + av1_init_inter_params(&inter_pred_params, pw, ph, mi_row * MI_SIZE, + mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0, + &cm->sf_identity, &ref_yv12[!id], interp_filters); + inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd); + + // Since we have scaled the reference frames to match the size of the + // current frame we must use a unit scaling factor during mode selection. + av1_enc_build_one_inter_predictor(second_pred, pw, &cur_mv[!id].as_mv, + &inter_pred_params); + + const int order_idx = id != 0; + av1_dist_wtd_comp_weight_assign( + cm, mbmi, order_idx, &xd->jcp_param.fwd_offset, + &xd->jcp_param.bck_offset, &xd->jcp_param.use_dist_wtd_comp_avg, 1); + + // Do full-pixel compound motion search on the current reference frame. + if (id) xd->plane[plane].pre[0] = ref_yv12[id]; + + // Make motion search params + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, + &ref_mv[id].as_mv, NULL); + av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask, + mask_stride, id); + + // Use the mv result from the single mode as mv predictor. + const FULLPEL_MV start_fullmv = get_fullmv_from_mv(&cur_mv[id].as_mv); + + // Small-range full-pixel motion search. 
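+    // The start MV comes from the single-prediction search above, so a
+    // small refinement neighborhood is normally sufficient here.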
+ bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv, + &best_mv.as_fullmv); + + if (bestsme < INT_MAX) { + bestsme = av1_get_mvpred_compound_var( + &full_ms_params.mv_cost_params, best_mv.as_fullmv, second_pred, mask, + mask_stride, id, &cpi->fn_ptr[bsize], &x->plane[0].src, + &ref_yv12[id]); + } + + // Restore the pointer to the first (possibly scaled) prediction buffer. + if (id) xd->plane[plane].pre[0] = ref_yv12[0]; + + for (ref = 0; ref < 2; ++ref) { + if (scaled_ref_frame[ref]) { + // Swap back the original buffers for subpel motion search. + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[ref] = backup_yv12[ref][i]; + } + // Re-initialize based on unscaled prediction buffers. + ref_yv12[ref] = xd->plane[plane].pre[ref]; + } + } + + // Do sub-pixel compound motion search on the current reference frame. + if (id) xd->plane[plane].pre[0] = ref_yv12[id]; + + if (cpi->common.features.cur_frame_force_integer_mv) { + convert_fullmv_to_mv(&best_mv); + } + if (bestsme < INT_MAX && + cpi->common.features.cur_frame_force_integer_mv == 0) { + int dis; /* TODO: use dis in distortion calculation later. */ + unsigned int sse; + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, + &ref_mv[id].as_mv, NULL); + av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred, + mask, mask_stride, id); + ms_params.forced_stop = EIGHTH_PEL; + MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + bestsme = cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis, &sse, NULL); + } + + // Restore the pointer to the first prediction buffer. + if (id) xd->plane[plane].pre[0] = ref_yv12[0]; + if (bestsme < last_besterr[id]) { + cur_mv[id] = best_mv; + last_besterr[id] = bestsme; + } else { + break; + } + } + + *rate_mv = 0; + + for (ref = 0; ref < 2; ++ref) { + const int_mv curr_ref_mv = av1_get_ref_mv(x, ref); + *rate_mv += + av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv, x->nmv_vec_cost, + x->mv_cost_stack, MV_COST_WEIGHT); + } +} + +// Search for the best mv for one component of a compound, +// given that the other component is fixed. +void av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *this_mv, + const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + int *rate_mv, int ref_idx) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int ref = mbmi->ref_frame[ref_idx]; + const int_mv ref_mv = av1_get_ref_mv(x, ref_idx); + struct macroblockd_plane *const pd = &xd->plane[0]; + + struct buf_2d backup_yv12[MAX_MB_PLANE]; + const YV12_BUFFER_CONFIG *const scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); + + // Check that this is either an interinter or an interintra block + assert(has_second_ref(mbmi) || (ref_idx == 0 && is_interintra_mode(mbmi))); + + // Store the first prediction buffer. + struct buf_2d orig_yv12; + struct buf_2d ref_yv12 = pd->pre[ref_idx]; + if (ref_idx) { + orig_yv12 = pd->pre[0]; + pd->pre[0] = pd->pre[ref_idx]; + } + + if (scaled_ref_frame) { + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // full-pixel motion search code to be used without additional + // modifications. 
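+    // Save the unscaled plane buffers so they can be restored after the
+    // search.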
+ for (int i = 0; i < num_planes; i++) { + backup_yv12[i] = xd->plane[i].pre[ref_idx]; + } + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL, + num_planes); + } + + int bestsme = INT_MAX; + int_mv best_mv; + + // Make motion search params + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, + &ref_mv.as_mv, NULL); + av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask, + mask_stride, ref_idx); + + // Use the mv result from the single mode as mv predictor. + const FULLPEL_MV start_fullmv = get_fullmv_from_mv(this_mv); + + // Small-range full-pixel motion search. + bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv, + &best_mv.as_fullmv); + + if (bestsme < INT_MAX) { + bestsme = av1_get_mvpred_compound_var( + &full_ms_params.mv_cost_params, best_mv.as_fullmv, second_pred, mask, + mask_stride, ref_idx, &cpi->fn_ptr[bsize], &x->plane[0].src, &ref_yv12); + } + + if (scaled_ref_frame) { + // Swap back the original buffers for subpel motion search. + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[ref_idx] = backup_yv12[i]; + } + } + + if (cpi->common.features.cur_frame_force_integer_mv) { + convert_fullmv_to_mv(&best_mv); + } + const int use_fractional_mv = + bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0; + if (use_fractional_mv) { + int dis; /* TODO: use dis in distortion calculation later. */ + unsigned int sse; + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv.as_mv, + NULL); + av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred, + mask, mask_stride, ref_idx); + ms_params.forced_stop = EIGHTH_PEL; + MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + bestsme = cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis, &sse, NULL); + } + + // Restore the pointer to the first unscaled prediction buffer. 
+ if (ref_idx) pd->pre[0] = orig_yv12; + + if (bestsme < INT_MAX) *this_mv = best_mv.as_mv; + + *rate_mv = 0; + + *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, x->nmv_vec_cost, + x->mv_cost_stack, MV_COST_WEIGHT); +} + +static AOM_INLINE void build_second_inter_pred(const AV1_COMP *cpi, + MACROBLOCK *x, BLOCK_SIZE bsize, + const MV *other_mv, int ref_idx, + uint8_t *second_pred) { + const AV1_COMMON *const cm = &cpi->common; + const int pw = block_size_wide[bsize]; + const int ph = block_size_high[bsize]; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x); + const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y); + + // This function should only ever be called for compound modes + assert(has_second_ref(mbmi)); + + const int plane = 0; + struct buf_2d ref_yv12 = xd->plane[plane].pre[!ref_idx]; + + struct scale_factors sf; + av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height, + cm->width, cm->height); + + InterPredParams inter_pred_params; + + av1_init_inter_params(&inter_pred_params, pw, ph, p_row, p_col, + pd->subsampling_x, pd->subsampling_y, xd->bd, + is_cur_buf_hbd(xd), 0, &sf, &ref_yv12, + mbmi->interp_filters); + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + + // Get the prediction block from the 'other' reference frame. + av1_enc_build_one_inter_predictor(second_pred, pw, other_mv, + &inter_pred_params); + + av1_dist_wtd_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset, + &xd->jcp_param.bck_offset, + &xd->jcp_param.use_dist_wtd_comp_avg, 1); +} + +// Wrapper for av1_compound_single_motion_search, for the common case +// where the second prediction is also an inter mode. +void av1_compound_single_motion_search_interinter( + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, + const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx) { + MACROBLOCKD *xd = &x->e_mbd; + // This function should only ever be called for compound modes + assert(has_second_ref(xd->mi[0])); + + // Prediction buffer from second frame. 
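+  // Allocated as uint16_t for the high-bitdepth case; CONVERT_TO_BYTEPTR()
+  // below exposes it as the byte pointer the prediction code expects.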
+ DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]); + uint8_t *second_pred; + if (is_cur_buf_hbd(xd)) + second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); + else + second_pred = (uint8_t *)second_pred_alloc_16; + + MV *this_mv = &cur_mv[ref_idx].as_mv; + const MV *other_mv = &cur_mv[!ref_idx].as_mv; + build_second_inter_pred(cpi, x, bsize, other_mv, ref_idx, second_pred); + av1_compound_single_motion_search(cpi, x, bsize, this_mv, second_pred, mask, + mask_stride, rate_mv, ref_idx); +} + +static AOM_INLINE void do_masked_motion_search_indexed( + const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize, + int_mv *tmp_mv, int *rate_mv, int which) { + // NOTE: which values: 0 - 0 only, 1 - 1 only, 2 - both + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + BLOCK_SIZE sb_type = mbmi->sb_type; + const uint8_t *mask; + const int mask_stride = block_size_wide[bsize]; + + mask = av1_get_compound_type_mask(comp_data, sb_type); + + tmp_mv[0].as_int = cur_mv[0].as_int; + tmp_mv[1].as_int = cur_mv[1].as_int; + if (which == 0 || which == 1) { + av1_compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mask, + mask_stride, rate_mv, which); + } else if (which == 2) { + av1_joint_motion_search(cpi, x, bsize, tmp_mv, mask, mask_stride, rate_mv); + } +} + +int av1_interinter_compound_motion_search(const AV1_COMP *const cpi, + MACROBLOCK *x, + const int_mv *const cur_mv, + const BLOCK_SIZE bsize, + const PREDICTION_MODE this_mode) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int_mv tmp_mv[2]; + int tmp_rate_mv = 0; + mbmi->interinter_comp.seg_mask = xd->seg_mask; + const INTERINTER_COMPOUND_DATA *compound_data = &mbmi->interinter_comp; + + if (this_mode == NEW_NEWMV) { + do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize, + tmp_mv, &tmp_rate_mv, 2); + mbmi->mv[0].as_int = tmp_mv[0].as_int; + mbmi->mv[1].as_int = tmp_mv[1].as_int; + } else if (this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV) { + // which = 1 if this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV + // which = 0 if this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV + int which = (NEWMV == compound_ref1_mode(this_mode)); + do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize, + tmp_mv, &tmp_rate_mv, which); + mbmi->mv[which].as_int = tmp_mv[which].as_int; + } + return tmp_rate_mv; +} + +int_mv av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, + int mi_col, BLOCK_SIZE bsize, int ref, + FULLPEL_MV start_mv, int num_planes, + int use_subpixel) { + assert(num_planes == 1 && + "Currently simple_motion_search only supports luma plane"); + assert(!frame_is_intra_only(&cpi->common) && + "Simple motion search only enabled for non-key frames"); + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + + set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize); + + MB_MODE_INFO *mbmi = xd->mi[0]; + mbmi->sb_type = bsize; + mbmi->ref_frame[0] = ref; + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); + struct buf_2d backup_yv12; + // ref_mv is used to calculate the cost of the motion vector + const MV ref_mv = kZeroMv; + const int step_param = 
cpi->mv_search_params.mv_step_param; + const search_site_config *src_search_sites = + &cpi->mv_search_params.ss_cfg[SS_CFG_SRC]; + int cost_list[5]; + const int ref_idx = 0; + int var; + int_mv best_mv; + + av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col, + get_ref_scale_factors(cm, ref), num_planes); + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + if (scaled_ref_frame) { + backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx]; + av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL, + num_planes); + } + + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv, + src_search_sites); + + var = av1_full_pixel_search(start_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), + &best_mv.as_fullmv, NULL); + + const int use_subpel_search = + var < INT_MAX && !cpi->common.features.cur_frame_force_integer_mv && + use_subpixel; + if (scaled_ref_frame) { + xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12; + } + if (use_subpel_search) { + int not_used = 0; + + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, + cost_list); + // TODO(yunqing): integrate this into av1_make_default_subpel_ms_params(). + ms_params.forced_stop = cpi->sf.mv_sf.simple_motion_subpel_force_stop; + + MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + + cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &best_mv.as_mv, ¬_used, + &x->pred_sse[ref], NULL); + } else { + // Manually convert from units of pixel to 1/8-pixels if we are not doing + // subpel search + convert_fullmv_to_mv(&best_mv); + } + + mbmi->mv[0] = best_mv; + + // Get a copy of the prediction output + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + + aom_clear_system_state(); + + if (scaled_ref_frame) { + xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12; + } + + return best_mv; +} + +int_mv av1_simple_motion_sse_var(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, + int mi_col, BLOCK_SIZE bsize, + const FULLPEL_MV start_mv, int use_subpixel, + unsigned int *sse, unsigned int *var) { + MACROBLOCKD *xd = &x->e_mbd; + const MV_REFERENCE_FRAME ref = + cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME; + + int_mv best_mv = av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref, + start_mv, 1, use_subpixel); + + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const uint8_t *dst = xd->plane[0].dst.buf; + const int dst_stride = xd->plane[0].dst.stride; + + *var = cpi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse); + + return best_mv; +} diff --git a/libs/libaom/src/av1/encoder/motion_search_facade.h b/libs/libaom/src/av1/encoder/motion_search_facade.h new file mode 100644 index 000000000..3b86e9376 --- /dev/null +++ b/libs/libaom/src/av1/encoder/motion_search_facade.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_MOTION_SEARCH_H_ +#define AOM_AV1_ENCODER_MOTION_SEARCH_H_ + +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + int64_t rd; + int drl_cost; + + int rate_mv; + int_mv mv; + + int_mv full_search_mv; + int full_mv_rate; +} inter_mode_info; + +void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int ref_idx, int *rate_mv, + int search_range, inter_mode_info *mode_info, + int_mv *best_mv); + +void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int_mv *cur_mv, + const uint8_t *mask, int mask_stride, + int *rate_mv); + +int av1_interinter_compound_motion_search(const AV1_COMP *const cpi, + MACROBLOCK *x, + const int_mv *const cur_mv, + const BLOCK_SIZE bsize, + const PREDICTION_MODE this_mode); + +void av1_compound_single_motion_search_interinter( + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, + const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx); + +void av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *this_mv, + const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + int *rate_mv, int ref_idx); + +// Performs a motion search in SIMPLE_TRANSLATION mode using reference frame +// ref. Note that this sets the offset of mbmi, so we will need to reset it +// after calling this function. +int_mv av1_simple_motion_search(struct AV1_COMP *const cpi, MACROBLOCK *x, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int ref, FULLPEL_MV start_mv, int num_planes, + int use_subpixel); + +// Performs a simple motion search to calculate the sse and var of the residue +int_mv av1_simple_motion_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x, + int mi_row, int mi_col, BLOCK_SIZE bsize, + const FULLPEL_MV start_mv, int use_subpixel, + unsigned int *sse, unsigned int *var); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_MOTION_SEARCH_H_ diff --git a/libs/libaom/src/av1/encoder/mv_prec.c b/libs/libaom/src/av1/encoder/mv_prec.c new file mode 100644 index 000000000..8fcbde98e --- /dev/null +++ b/libs/libaom/src/av1/encoder/mv_prec.c @@ -0,0 +1,430 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_config.h" + +#include "aom_ports/system_state.h" + +#include "av1/encoder/encodemv.h" +#if !CONFIG_REALTIME_ONLY +#include "av1/encoder/misc_model_weights.h" +#endif // !CONFIG_REALTIME_ONLY +#include "av1/encoder/mv_prec.h" + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE int_mv get_ref_mv_for_mv_stats( + const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, + int ref_idx) { + int ref_mv_idx = mbmi->ref_mv_idx; + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { + assert(has_second_ref(mbmi)); + ref_mv_idx += 1; + } + + const MV_REFERENCE_FRAME *ref_frames = mbmi->ref_frame; + const int8_t ref_frame_type = av1_ref_frame_type(ref_frames); + const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext_frame->ref_mv_stack; + + if (ref_frames[1] > INTRA_FRAME) { + assert(ref_idx == 0 || ref_idx == 1); + return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv + : curr_ref_mv_stack[ref_mv_idx].this_mv; + } + + assert(ref_idx == 0); + return ref_mv_idx < mbmi_ext_frame->ref_mv_count + ? curr_ref_mv_stack[ref_mv_idx].this_mv + : mbmi_ext_frame->global_mvs[ref_frame_type]; +} + +static AOM_INLINE int get_symbol_cost(const aom_cdf_prob *cdf, int symbol) { + const aom_cdf_prob cur_cdf = AOM_ICDF(cdf[symbol]); + const aom_cdf_prob prev_cdf = symbol ? AOM_ICDF(cdf[symbol - 1]) : 0; + const aom_cdf_prob p15 = AOMMAX(cur_cdf - prev_cdf, EC_MIN_PROB); + + return av1_cost_symbol(p15); +} + +static AOM_INLINE int keep_one_comp_stat(MV_STATS *mv_stats, int comp_val, + int comp_idx, const AV1_COMP *cpi, + int *rates) { + assert(comp_val != 0 && "mv component should not have zero value!"); + const int sign = comp_val < 0; + const int mag = sign ? -comp_val : comp_val; + const int mag_minus_1 = mag - 1; + int offset; + const int mv_class = av1_get_mv_class(mag_minus_1, &offset); + const int int_part = offset >> 3; // int mv data + const int frac_part = (offset >> 1) & 3; // fractional mv data + const int high_part = offset & 1; // high precision mv data + const int use_hp = cpi->common.features.allow_high_precision_mv; + int r_idx = 0; + + const MACROBLOCK *const x = &cpi->td.mb; + const MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + nmv_context *nmvc = &ec_ctx->nmvc; + nmv_component *mvcomp_ctx = nmvc->comps; + nmv_component *cur_mvcomp_ctx = &mvcomp_ctx[comp_idx]; + aom_cdf_prob *sign_cdf = cur_mvcomp_ctx->sign_cdf; + aom_cdf_prob *class_cdf = cur_mvcomp_ctx->classes_cdf; + aom_cdf_prob *class0_cdf = cur_mvcomp_ctx->class0_cdf; + aom_cdf_prob(*bits_cdf)[3] = cur_mvcomp_ctx->bits_cdf; + aom_cdf_prob *frac_part_cdf = mv_class + ? (cur_mvcomp_ctx->fp_cdf) + : (cur_mvcomp_ctx->class0_fp_cdf[int_part]); + aom_cdf_prob *high_part_cdf = + mv_class ? 
(cur_mvcomp_ctx->hp_cdf) : (cur_mvcomp_ctx->class0_hp_cdf); + + const int sign_rate = get_symbol_cost(sign_cdf, sign); + rates[r_idx++] = sign_rate; + update_cdf(sign_cdf, sign, 2); + + const int class_rate = get_symbol_cost(class_cdf, mv_class); + rates[r_idx++] = class_rate; + update_cdf(class_cdf, mv_class, MV_CLASSES); + + int int_bit_rate = 0; + if (mv_class == MV_CLASS_0) { + int_bit_rate = get_symbol_cost(class0_cdf, int_part); + update_cdf(class0_cdf, int_part, CLASS0_SIZE); + } else { + const int n = mv_class + CLASS0_BITS - 1; // number of bits + for (int i = 0; i < n; ++i) { + int_bit_rate += get_symbol_cost(bits_cdf[i], (int_part >> i) & 1); + update_cdf(bits_cdf[i], (int_part >> i) & 1, 2); + } + } + rates[r_idx++] = int_bit_rate; + const int frac_part_rate = get_symbol_cost(frac_part_cdf, frac_part); + rates[r_idx++] = frac_part_rate; + update_cdf(frac_part_cdf, frac_part, MV_FP_SIZE); + const int high_part_rate = + use_hp ? get_symbol_cost(high_part_cdf, high_part) : 0; + if (use_hp) { + update_cdf(high_part_cdf, high_part, 2); + } + rates[r_idx++] = high_part_rate; + + mv_stats->last_bit_zero += !high_part; + mv_stats->last_bit_nonzero += high_part; + const int total_rate = + (sign_rate + class_rate + int_bit_rate + frac_part_rate + high_part_rate); + return total_rate; +} + +static AOM_INLINE void keep_one_mv_stat(MV_STATS *mv_stats, const MV *ref_mv, + const MV *cur_mv, const AV1_COMP *cpi) { + const MACROBLOCK *const x = &cpi->td.mb; + const MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + nmv_context *nmvc = &ec_ctx->nmvc; + aom_cdf_prob *joint_cdf = nmvc->joints_cdf; + const int use_hp = cpi->common.features.allow_high_precision_mv; + + const MV diff = { cur_mv->row - ref_mv->row, cur_mv->col - ref_mv->col }; + const int mv_joint = av1_get_mv_joint(&diff); + // TODO(chiyotsai@google.com): Estimate hp_diff when we are using lp + const MV hp_diff = diff; + const int hp_mv_joint = av1_get_mv_joint(&hp_diff); + const MV truncated_diff = { (diff.row / 2) * 2, (diff.col / 2) * 2 }; + const MV lp_diff = use_hp ? truncated_diff : diff; + const int lp_mv_joint = av1_get_mv_joint(&lp_diff); + + aom_clear_system_state(); + const int mv_joint_rate = get_symbol_cost(joint_cdf, mv_joint); + const int hp_mv_joint_rate = get_symbol_cost(joint_cdf, hp_mv_joint); + const int lp_mv_joint_rate = get_symbol_cost(joint_cdf, lp_mv_joint); + + update_cdf(joint_cdf, mv_joint, MV_JOINTS); + + mv_stats->total_mv_rate += mv_joint_rate; + mv_stats->hp_total_mv_rate += hp_mv_joint_rate; + mv_stats->lp_total_mv_rate += lp_mv_joint_rate; + mv_stats->mv_joint_count[mv_joint]++; + + for (int comp_idx = 0; comp_idx < 2; comp_idx++) { + const int comp_val = comp_idx ? diff.col : diff.row; + const int hp_comp_val = comp_idx ? hp_diff.col : hp_diff.row; + const int lp_comp_val = comp_idx ? lp_diff.col : lp_diff.row; + int rates[5]; + av1_zero_array(rates, 5); + + const int comp_rate = + comp_val ? keep_one_comp_stat(mv_stats, comp_val, comp_idx, cpi, rates) + : 0; + // TODO(chiyotsai@google.com): Properly get hp rate when use_hp is false + const int hp_rate = + hp_comp_val ? rates[0] + rates[1] + rates[2] + rates[3] + rates[4] : 0; + const int lp_rate = + lp_comp_val ? 
rates[0] + rates[1] + rates[2] + rates[3] : 0; + + mv_stats->total_mv_rate += comp_rate; + mv_stats->hp_total_mv_rate += hp_rate; + mv_stats->lp_total_mv_rate += lp_rate; + } +} + +static AOM_INLINE void collect_mv_stats_b(MV_STATS *mv_stats, + const AV1_COMP *cpi, int mi_row, + int mi_col) { + const AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) { + return; + } + + const MB_MODE_INFO *mbmi = + mi_params->mi_grid_base[mi_row * mi_params->mi_stride + mi_col]; + const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame = + cpi->mbmi_ext_info.frame_base + + get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize, + cpi->mbmi_ext_info.stride); + + if (!is_inter_block(mbmi)) { + mv_stats->intra_count++; + return; + } + mv_stats->inter_count++; + + const PREDICTION_MODE mode = mbmi->mode; + const int is_compound = has_second_ref(mbmi); + + if (mode == NEWMV || mode == NEW_NEWMV) { + // All mvs are new + for (int ref_idx = 0; ref_idx < 1 + is_compound; ++ref_idx) { + const MV ref_mv = + get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx).as_mv; + const MV cur_mv = mbmi->mv[ref_idx].as_mv; + keep_one_mv_stat(mv_stats, &ref_mv, &cur_mv, cpi); + } + } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV || + mode == NEW_NEARESTMV || mode == NEW_NEARMV) { + // has exactly one new_mv + mv_stats->default_mvs += 1; + + const int ref_idx = (mode == NEAREST_NEWMV || mode == NEAR_NEWMV); + const MV ref_mv = + get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx).as_mv; + const MV cur_mv = mbmi->mv[ref_idx].as_mv; + + keep_one_mv_stat(mv_stats, &ref_mv, &cur_mv, cpi); + } else { + // No new_mv + mv_stats->default_mvs += 1 + is_compound; + } + + // Add texture information + const BLOCK_SIZE bsize = mbmi->sb_type; + const int num_rows = block_size_high[bsize]; + const int num_cols = block_size_wide[bsize]; + const int y_stride = cpi->source->y_stride; + const int px_row = 4 * mi_row, px_col = 4 * mi_col; + const int buf_is_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH; + const int bd = cm->seq_params.bit_depth; + if (buf_is_hbd) { + uint16_t *source_buf = + CONVERT_TO_SHORTPTR(cpi->source->y_buffer) + px_row * y_stride + px_col; + for (int row = 0; row < num_rows - 1; row++) { + for (int col = 0; col < num_cols - 1; col++) { + const int offset = row * y_stride + col; + const int horz_diff = + abs(source_buf[offset + 1] - source_buf[offset]) >> (bd - 8); + const int vert_diff = + abs(source_buf[offset + y_stride] - source_buf[offset]) >> (bd - 8); + mv_stats->horz_text += horz_diff; + mv_stats->vert_text += vert_diff; + mv_stats->diag_text += horz_diff * vert_diff; + } + } + } else { + uint8_t *source_buf = cpi->source->y_buffer + px_row * y_stride + px_col; + for (int row = 0; row < num_rows - 1; row++) { + for (int col = 0; col < num_cols - 1; col++) { + const int offset = row * y_stride + col; + const int horz_diff = abs(source_buf[offset + 1] - source_buf[offset]); + const int vert_diff = + abs(source_buf[offset + y_stride] - source_buf[offset]); + mv_stats->horz_text += horz_diff; + mv_stats->vert_text += vert_diff; + mv_stats->diag_text += horz_diff * vert_diff; + } + } + } +} + +// Split block +static AOM_INLINE void collect_mv_stats_sb(MV_STATS *mv_stats, + const AV1_COMP *cpi, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + const AV1_COMMON *cm = &cpi->common; + + if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) + return; + + 
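+  // Walk the partition tree: look up how this block was split and visit each
+  // leaf block via collect_mv_stats_b() (hbs/qbs below are the half and
+  // quarter block sizes in mi units).
+ 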
const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + + const int hbs = mi_size_wide[bsize] / 2; + const int qbs = mi_size_wide[bsize] / 4; + switch (partition) { + case PARTITION_NONE: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + break; + case PARTITION_HORZ: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); + break; + case PARTITION_VERT: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); + break; + case PARTITION_SPLIT: + collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, subsize); + collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col + hbs, subsize); + collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col, subsize); + collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col + hbs, subsize); + break; + case PARTITION_HORZ_A: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); + break; + case PARTITION_HORZ_B: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_VERT_A: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); + break; + case PARTITION_VERT_B: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_HORZ_4: + for (int i = 0; i < 4; ++i) { + const int this_mi_row = mi_row + i * qbs; + collect_mv_stats_b(mv_stats, cpi, this_mi_row, mi_col); + } + break; + case PARTITION_VERT_4: + for (int i = 0; i < 4; ++i) { + const int this_mi_col = mi_col + i * qbs; + collect_mv_stats_b(mv_stats, cpi, mi_row, this_mi_col); + } + break; + default: assert(0); + } +} + +static AOM_INLINE void collect_mv_stats_tile(MV_STATS *mv_stats, + const AV1_COMP *cpi, + const TileInfo *tile_info) { + const AV1_COMMON *cm = &cpi->common; + const int mi_row_start = tile_info->mi_row_start; + const int mi_row_end = tile_info->mi_row_end; + const int mi_col_start = tile_info->mi_col_start; + const int mi_col_end = tile_info->mi_col_end; + const int sb_size_mi = cm->seq_params.mib_size; + BLOCK_SIZE sb_size = cm->seq_params.sb_size; + for (int mi_row = mi_row_start; mi_row < mi_row_end; mi_row += sb_size_mi) { + for (int mi_col = mi_col_start; mi_col < mi_col_end; mi_col += sb_size_mi) { + collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, sb_size); + } + } +} + +void av1_collect_mv_stats(AV1_COMP *cpi, int current_q) { + MV_STATS *mv_stats = &cpi->mv_stats; + const AV1_COMMON *cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + + for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, tile_row); + for (int tile_col = 0; tile_col < tile_cols; tile_col++) { + const int tile_idx = tile_row * tile_cols + tile_col; + av1_tile_set_col(&tile_info, cm, tile_col); + cpi->tile_data[tile_idx].tctx = *cm->fc; + cpi->td.mb.e_mbd.tile_ctx = &cpi->tile_data[tile_idx].tctx; + collect_mv_stats_tile(mv_stats, cpi, &tile_info); + } + } + + mv_stats->q = current_q; + 
mv_stats->order = cpi->common.current_frame.order_hint; + mv_stats->valid = 1; +} + +static AOM_INLINE int get_smart_mv_prec(AV1_COMP *cpi, const MV_STATS *mv_stats, + int current_q) { + const AV1_COMMON *cm = &cpi->common; + const int order_hint = cpi->common.current_frame.order_hint; + const int order_diff = order_hint - mv_stats->order; + aom_clear_system_state(); + const float area = (float)(cm->width * cm->height); + float features[MV_PREC_FEATURE_SIZE] = { + (float)current_q, + (float)mv_stats->q, + (float)order_diff, + mv_stats->inter_count / area, + mv_stats->intra_count / area, + mv_stats->default_mvs / area, + mv_stats->mv_joint_count[0] / area, + mv_stats->mv_joint_count[1] / area, + mv_stats->mv_joint_count[2] / area, + mv_stats->mv_joint_count[3] / area, + mv_stats->last_bit_zero / area, + mv_stats->last_bit_nonzero / area, + mv_stats->total_mv_rate / area, + mv_stats->hp_total_mv_rate / area, + mv_stats->lp_total_mv_rate / area, + mv_stats->horz_text / area, + mv_stats->vert_text / area, + mv_stats->diag_text / area, + }; + + for (int f_idx = 0; f_idx < MV_PREC_FEATURE_SIZE; f_idx++) { + features[f_idx] = + (features[f_idx] - av1_mv_prec_mean[f_idx]) / av1_mv_prec_std[f_idx]; + } + float score = 0.0f; + + av1_nn_predict(features, &av1_mv_prec_dnn_config, 1, &score); + + const int use_high_hp = score >= 0.0f; + return use_high_hp; +} +#endif // !CONFIG_REALTIME_ONLY + +void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex) { + int use_hp = qindex < HIGH_PRECISION_MV_QTHRESH; + + if (cpi->sf.hl_sf.high_precision_mv_usage == QTR_ONLY) { + use_hp = 0; + } +#if !CONFIG_REALTIME_ONLY + else if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA && + av1_frame_allows_smart_mv(cpi) && cpi->mv_stats.valid) { + use_hp = get_smart_mv_prec(cpi, &cpi->mv_stats, qindex); + } +#endif // !CONFIG_REALTIME_ONLY + + av1_set_high_precision_mv(cpi, use_hp, + cpi->common.features.cur_frame_force_integer_mv); +} diff --git a/libs/libaom/src/av1/encoder/mv_prec.h b/libs/libaom/src/av1/encoder/mv_prec.h new file mode 100644 index 000000000..8df8b96dc --- /dev/null +++ b/libs/libaom/src/av1/encoder/mv_prec.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_MV_PREC_H_ +#define AOM_AV1_ENCODER_MV_PREC_H_ + +#include "av1/encoder/encoder.h" +#include "av1/encoder/speed_features.h" + +// Q threshold for high precision mv. 
+#define HIGH_PRECISION_MV_QTHRESH 128 +#if !CONFIG_REALTIME_ONLY +void av1_collect_mv_stats(AV1_COMP *cpi, int current_q); + +static AOM_INLINE int av1_frame_allows_smart_mv(const AV1_COMP *cpi) { + const int gf_group_index = cpi->gf_group.index; + const int gf_update_type = cpi->gf_group.update_type[gf_group_index]; + return !frame_is_intra_only(&cpi->common) && + !(gf_update_type == INTNL_OVERLAY_UPDATE || + gf_update_type == OVERLAY_UPDATE); +} +#endif // !CONFIG_REALTIME_ONLY + +static AOM_INLINE void av1_set_high_precision_mv( + AV1_COMP *cpi, int allow_high_precision_mv, + int cur_frame_force_integer_mv) { + MACROBLOCK *const x = &cpi->td.mb; + const int copy_hp = cpi->common.features.allow_high_precision_mv = + allow_high_precision_mv && !cur_frame_force_integer_mv; + x->nmvcost[0] = &x->nmv_costs[0][MV_MAX]; + x->nmvcost[1] = &x->nmv_costs[1][MV_MAX]; + x->nmvcost_hp[0] = &x->nmv_costs_hp[0][MV_MAX]; + x->nmvcost_hp[1] = &x->nmv_costs_hp[1][MV_MAX]; + int *(*src)[2] = copy_hp ? &x->nmvcost_hp : &x->nmvcost; + x->mv_cost_stack = *src; +} + +void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex); + +#endif // AOM_AV1_ENCODER_MV_PREC_H_ diff --git a/libs/libaom/src/av1/encoder/nonrd_pickmode.c b/libs/libaom/src/av1/encoder/nonrd_pickmode.c new file mode 100644 index 000000000..a1180015c --- /dev/null +++ b/libs/libaom/src/av1/encoder/nonrd_pickmode.c @@ -0,0 +1,2182 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/encoder/model_rd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+
+extern int g_pick_inter_mode_cnt;
+typedef struct {
+  uint8_t *data;
+  int stride;
+  int in_use;
+} PRED_BUFFER;
+
+typedef struct {
+  PRED_BUFFER *best_pred;
+  PREDICTION_MODE best_mode;
+  TX_SIZE best_tx_size;
+  TX_SIZE best_intra_tx_size;
+  MV_REFERENCE_FRAME best_ref_frame;
+  MV_REFERENCE_FRAME best_second_ref_frame;
+  uint8_t best_mode_skip_txfm;
+  int_interpfilters best_pred_filter;
+} BEST_PICKMODE;
+
+typedef struct {
+  MV_REFERENCE_FRAME ref_frame;
+  PREDICTION_MODE pred_mode;
+} REF_MODE;
+
+static const int pos_shift_16x16[4][4] = {
+  { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 }
+};
+
+#define RT_INTER_MODES 9
+static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
+  { LAST_FRAME, NEARESTMV },   { LAST_FRAME, NEARMV },
+  { LAST_FRAME, NEWMV },       { GOLDEN_FRAME, NEARESTMV },
+  { GOLDEN_FRAME, NEARMV },    { GOLDEN_FRAME, NEWMV },
+  { ALTREF_FRAME, NEARESTMV }, { ALTREF_FRAME, NEARMV },
+  { ALTREF_FRAME, NEWMV }
+};
+
+static const THR_MODES mode_idx[REF_FRAMES][4] = {
+  { THR_DC, THR_V_PRED, THR_H_PRED, THR_SMOOTH },
+  { THR_NEARESTMV, THR_NEARMV, THR_GLOBALMV, THR_NEWMV },
+  { THR_NEARESTL2, THR_NEARL2, THR_GLOBALL2, THR_NEWL2 },
+  { THR_NEARESTL3, THR_NEARL3, THR_GLOBALL3, THR_NEWL3 },
+  { THR_NEARESTG, THR_NEARG, THR_GLOBALMV, THR_NEWG },
+};
+
+static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED,
+                                                   SMOOTH_PRED };
+
+static INLINE int mode_offset(const PREDICTION_MODE mode) {
+  if (mode >= NEARESTMV) {
+    return INTER_OFFSET(mode);
+  } else {
+    switch (mode) {
+      case DC_PRED: return 0;
+      case V_PRED: return 1;
+      case H_PRED: return 2;
+      case SMOOTH_PRED: return 3;
+      default: assert(0); return -1;
+    }
+  }
+}
+
+enum {
+  //  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
+  INTER_NEAREST = (1 << NEARESTMV),
+  INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV),
+  INTER_NEAREST_NEAR = (1 << NEARESTMV) | (1 << NEARMV),
+  INTER_NEAR_NEW = (1 << NEARMV) | (1 << NEWMV),
+};
+
+static INLINE void init_best_pickmode(BEST_PICKMODE *bp) {
+  bp->best_mode = NEARESTMV;
+  bp->best_ref_frame = LAST_FRAME;
+  bp->best_tx_size = TX_8X8;
+  bp->best_intra_tx_size = TX_8X8;
+  bp->best_pred_filter = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+  bp->best_mode_skip_txfm = 0;
+  bp->best_second_ref_frame = NONE_FRAME;
+  bp->best_pred = NULL;
+}
+
+static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                  int_mv *tmp_mv, int *rate_mv,
+                                  int64_t best_rd_sofar, int use_base_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MB_MODE_INFO *mi = xd->mi[0];
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+  int step_param = cpi->mv_search_params.mv_step_param;
+  FULLPEL_MV start_mv;
+  const int ref = mi->ref_frame[0];
+  const MV ref_mv = av1_get_ref_mv(x, mi->ref_mv_idx).as_mv;
+  MV center_mv;
+  int dis;
+  int
rv = 0; + int cost_list[5]; + int search_subpel = 1; + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); + + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; + av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL, + num_planes); + } + + start_mv = get_fullmv_from_mv(&ref_mv); + + if (!use_base_mv) + center_mv = ref_mv; + else + center_mv = tmp_mv->as_mv; + + const search_site_config *src_search_sites = + &cpi->mv_search_params.ss_cfg[SS_CFG_SRC]; + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, ¢er_mv, + src_search_sites); + + av1_full_pixel_search(start_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), &tmp_mv->as_fullmv, + NULL); + + // calculate the bit cost on motion vector + MV mvp_full = get_mv_from_fullmv(&tmp_mv->as_fullmv); + + *rate_mv = av1_mv_bit_cost(&mvp_full, &ref_mv, x->nmv_vec_cost, + x->mv_cost_stack, MV_COST_WEIGHT); + + // TODO(kyslov) Account for Rate Mode! + rv = !(RDCOST(x->rdmult, (*rate_mv), 0) > best_rd_sofar); + + if (rv && search_subpel) { + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, + cost_list); + MV subpel_start_mv = get_mv_from_fullmv(&tmp_mv->as_fullmv); + cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &tmp_mv->as_mv, &dis, + &x->pred_sse[ref], NULL); + + *rate_mv = av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmv_vec_cost, + x->mv_cost_stack, MV_COST_WEIGHT); + } + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + return rv; +} + +static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x, + int_mv frame_mv[][REF_FRAMES], + MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int best_pred_sad, int *rate_mv, RD_STATS *best_rdc) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mi = xd->mi[0]; + AV1_COMMON *cm = &cpi->common; + if (ref_frame > LAST_FRAME && gf_temporal_ref && + cpi->oxcf.rc_mode == AOM_CBR) { + int tmp_sad; + int dis; + int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + + if (bsize < BLOCK_16X16) return -1; + + tmp_sad = av1_int_pro_motion_estimation( + cpi, x, bsize, mi_row, mi_col, + &x->mbmi_ext->ref_mv_stack[ref_frame][0].this_mv.as_mv); + + if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1; + if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) return -1; + + frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int; + int_mv best_mv = mi->mv[0]; + best_mv.as_mv.row >>= 3; + best_mv.as_mv.col >>= 3; + MV ref_mv = av1_get_ref_mv(x, 0).as_mv; + + *rate_mv = + av1_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, &ref_mv, + x->nmv_vec_cost, x->mv_cost_stack, MV_COST_WEIGHT); + frame_mv[NEWMV][ref_frame].as_mv.row >>= 3; + frame_mv[NEWMV][ref_frame].as_mv.col >>= 3; + + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, + cost_list); + MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis, + &x->pred_sse[ref_frame], NULL); + 
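+    // Keep the sub-pel refined MV as this reference's NEWMV candidate.
+ 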
frame_mv[NEWMV][ref_frame].as_int = best_mv.as_int; + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + + return 0; +} + +static INLINE void find_predictors( + AV1_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, + int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], int *ref_frame_skip_mask, + const int flag_list[4], TileDataEnc *tile_data, + struct buf_2d yv12_mb[8][MAX_MB_PLANE], BLOCK_SIZE bsize, + int force_skip_low_temp_var) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame); + const int num_planes = av1_num_planes(cm); + (void)tile_data; + + x->pred_mv_sad[ref_frame] = INT_MAX; + frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; + // TODO(kyslov) this needs various further optimizations. to be continued.. + if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { + const struct scale_factors *const sf = + get_ref_scale_factors_const(cm, ref_frame); + av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes); + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. + av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); + av1_find_best_ref_mvs_from_stack( + cm->features.allow_high_precision_mv, mbmi_ext, ref_frame, + &frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame], 0); + // Early exit for non-LAST frame if force_skip_low_temp_var is set. 
+ if (!av1_is_scaled(sf) && bsize >= BLOCK_8X8 && + !(force_skip_low_temp_var && ref_frame != LAST_FRAME)) { + av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame, + bsize); + } + } else { + *ref_frame_skip_mask |= (1 << ref_frame); + } + av1_count_overlappable_neighbors(cm, xd); + mbmi->num_proj_ref = 1; +} + +static void estimate_single_ref_frame_costs(const AV1_COMMON *cm, + const MACROBLOCKD *xd, + const MACROBLOCK *x, int segment_id, + unsigned int *ref_costs_single) { + int seg_ref_active = + segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + if (seg_ref_active) { + memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single)); + } else { + int intra_inter_ctx = av1_get_intra_inter_context(xd); + ref_costs_single[INTRA_FRAME] = x->intra_inter_cost[intra_inter_ctx][0]; + unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1]; + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) + ref_costs_single[i] = base_cost; + + const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd); + const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd); + const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd); + const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd); + const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd); + const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd); + + // Determine cost of a single ref frame, where frame types are represented + // by a tree: + // Level 0: add cost whether this ref is a forward or backward ref + ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p1][0][1]; + ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p1][0][1]; + ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p1][0][1]; + + // Level 1: if this ref is forward ref, + // add cost whether it is last/last2 or last3/golden + ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p3][2][0]; + ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p3][2][0]; + ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p3][2][1]; + ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p3][2][1]; + + // Level 1: if this ref is backward ref + // then add cost whether this ref is altref or backward ref + ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p2][1][0]; + ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p2][1][0]; + ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p2][1][1]; + + // Level 2: further add cost whether this ref is last or last2 + ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p4][3][0]; + ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p4][3][1]; + + // Level 2: last3 or golden + ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p5][4][0]; + ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p5][4][1]; + + // Level 2: bwdref or altref2 + ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p6][5][0]; + ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p6][5][1]; + } +} + +static void estimate_comp_ref_frame_costs( + const AV1_COMMON *cm, const MACROBLOCKD *xd, const MACROBLOCK *x, + int segment_id, unsigned int (*ref_costs_comp)[REF_FRAMES]) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + for (int ref_frame = 0; ref_frame < REF_FRAMES; 
++ref_frame) + memset(ref_costs_comp[ref_frame], 0, + REF_FRAMES * sizeof((*ref_costs_comp)[0])); + } else { + int intra_inter_ctx = av1_get_intra_inter_context(xd); + unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1]; + + if (cm->current_frame.reference_mode != SINGLE_REFERENCE) { + // Similar to single ref, determine cost of compound ref frames. + // cost_compound_refs = cost_first_ref + cost_second_ref + const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd); + const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd); + const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd); + const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd); + const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd); + + const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd); + unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 }; + + ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] = + ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][1]; + ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0; + ref_bicomp_costs[ALTREF_FRAME] = 0; + + // cost of first ref frame + ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0]; + ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0]; + ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1]; + ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1]; + + ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][0]; + ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][1]; + + ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][0]; + ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][1]; + + // cost of second ref frame + ref_bicomp_costs[BWDREF_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0]; + ref_bicomp_costs[ALTREF2_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0]; + ref_bicomp_costs[ALTREF_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p][0][1]; + + ref_bicomp_costs[BWDREF_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0]; + ref_bicomp_costs[ALTREF2_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1]; + + // cost: if one ref frame is forward ref, the other ref is backward ref + for (int ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { + for (int ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) { + ref_costs_comp[ref0][ref1] = + ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1]; + } + } + + // cost: if both ref frames are the same side. 
+ const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd); + const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd); + const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd); + ref_costs_comp[LAST_FRAME][LAST2_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0]; + ref_costs_comp[LAST_FRAME][LAST3_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0]; + ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1]; + ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1]; + } else { + for (int ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { + for (int ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) + ref_costs_comp[ref0][ref1] = 512; + } + ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512; + ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512; + ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512; + ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512; + } + } +} + +static TX_SIZE calculate_tx_size(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCK *const x, unsigned int var, + unsigned int sse) { + MACROBLOCKD *const xd = &x->e_mbd; + TX_SIZE tx_size; + if (x->tx_mode_search_type == TX_MODE_SELECT) { + if (sse > (var << 2)) + tx_size = AOMMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[x->tx_mode_search_type]); + else + tx_size = TX_8X8; + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id)) + tx_size = TX_8X8; + else if (tx_size > TX_16X16) + tx_size = TX_16X16; + } else { + tx_size = AOMMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[x->tx_mode_search_type]); + } + + if (x->tx_mode_search_type != ONLY_4X4 && bsize > BLOCK_32X32) + tx_size = TX_16X16; + + return AOMMIN(tx_size, TX_16X16); +} + +static const uint8_t b_width_log2_lookup[BLOCK_SIZES] = { 0, 0, 1, 1, 1, 2, + 2, 2, 3, 3, 3, 4, + 4, 4, 5, 5 }; +static const uint8_t b_height_log2_lookup[BLOCK_SIZES] = { 0, 1, 0, 1, 2, 1, + 2, 3, 2, 3, 4, 3, + 4, 5, 4, 5 }; + +static void block_variance(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int w, int h, + unsigned int *sse, int *sum, int block_size, + uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) { + int i, j, k = 0; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + aom_get8x8var(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse8x8[k], + &sum8x8[k]); + *sse += sse8x8[k]; + *sum += sum8x8[k]; + var8x8[k] = sse8x8[k] - (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6); + k++; + } + } +} + +static void calculate_variance(int bw, int bh, TX_SIZE tx_size, + unsigned int *sse_i, int *sum_i, + unsigned int *var_o, unsigned int *sse_o, + int *sum_o) { + const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size]; + const int nw = 1 << (bw - b_width_log2_lookup[unit_size]); + const int nh = 1 << (bh - b_height_log2_lookup[unit_size]); + int i, j, k = 0; + + for (i = 0; i < nh; i += 2) { + for 
(j = 0; j < nw; j += 2) {
+      sse_o[k] = sse_i[i * nw + j] + sse_i[i * nw + j + 1] +
+                 sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1];
+      sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] +
+                 sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1];
+      var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >>
+                                       (b_width_log2_lookup[unit_size] +
+                                        b_height_log2_lookup[unit_size] + 6));
+      k++;
+    }
+  }
+}
+
+// Adjust the ac_thr according to speed, width, height and normalized sum
+static int ac_thr_factor(const int speed, const int width, const int height,
+                         const int norm_sum) {
+  if (speed >= 8 && norm_sum < 5) {
+    if (width <= 640 && height <= 480)
+      return 4;
+    else
+      return 2;
+  }
+  return 1;
+}
+
+static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
+                                      int mi_row, int mi_col, MACROBLOCK *x,
+                                      MACROBLOCKD *xd, int *out_rate,
+                                      int64_t *out_dist, unsigned int *var_y,
+                                      unsigned int *sse_y, int *early_term,
+                                      int calculate_rd) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  unsigned int sse;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const uint32_t dc_quant = p->dequant_QTX[0];
+  const uint32_t ac_quant = p->dequant_QTX[1];
+  const int64_t dc_thr = dc_quant * dc_quant >> 6;
+  int64_t ac_thr = ac_quant * ac_quant >> 6;
+  unsigned int var;
+  int sum;
+
+  const int bw = b_width_log2_lookup[bsize];
+  const int bh = b_height_log2_lookup[bsize];
+  const int num8x8 = 1 << (bw + bh - 2);
+  unsigned int sse8x8[256] = { 0 };
+  int sum8x8[256] = { 0 };
+  unsigned int var8x8[256] = { 0 };
+  TX_SIZE tx_size;
+  int k;
+  // Calculate variance for whole partition, and also save 8x8 blocks' variance
+  // to be used in following transform skipping test.
+  block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
+  var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
+
+  *var_y = var;
+  *sse_y = sse;
+
+  ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width,
+                          cpi->common.height, abs(sum) >> (bw + bh));
+
+  tx_size = calculate_tx_size(cpi, bsize, x, var, sse);
+  // The code below for setting skip flag assumes transform size of at least 8x8,
+  // so force this lower limit on transform.
+  if (tx_size < TX_8X8) tx_size = TX_8X8;
+  xd->mi[0]->tx_size = tx_size;
+
+  // Evaluate if the partition block is a skippable block in Y plane.
+  {
+    unsigned int sse16x16[64] = { 0 };
+    int sum16x16[64] = { 0 };
+    unsigned int var16x16[64] = { 0 };
+    const int num16x16 = num8x8 >> 2;
+
+    unsigned int sse32x32[16] = { 0 };
+    int sum32x32[16] = { 0 };
+    unsigned int var32x32[16] = { 0 };
+    const int num32x32 = num8x8 >> 4;
+
+    int ac_test = 1;
+    int dc_test = 1;
+    const int num = (tx_size == TX_8X8)
+                        ? num8x8
+                        : ((tx_size == TX_16X16) ? num16x16 : num32x32);
+    const unsigned int *sse_tx =
+        (tx_size == TX_8X8) ? sse8x8
+                            : ((tx_size == TX_16X16) ? sse16x16 : sse32x32);
+    const unsigned int *var_tx =
+        (tx_size == TX_8X8) ? var8x8
+                            : ((tx_size == TX_16X16) ?
var16x16 : var32x32); + + // Calculate variance if tx_size > TX_8X8 + if (tx_size >= TX_16X16) + calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16, + sum16x16); + if (tx_size == TX_32X32) + calculate_variance(bw, bh, TX_16X16, sse16x16, sum16x16, var32x32, + sse32x32, sum32x32); + + // Skipping test + *early_term = 0; + for (k = 0; k < num; k++) + // Check if all ac coefficients can be quantized to zero. + if (!(var_tx[k] < ac_thr || var == 0)) { + ac_test = 0; + break; + } + + for (k = 0; k < num; k++) + // Check if dc coefficient can be quantized to zero. + if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) { + dc_test = 0; + break; + } + + if (ac_test && dc_test) { + int skip_uv[2] = { 0 }; + unsigned int var_uv[2]; + unsigned int sse_uv[2]; + AV1_COMMON *const cm = &cpi->common; + // Transform skipping test in UV planes. + for (int i = 1; i <= 2; i++) { + int j = i - 1; + skip_uv[j] = 1; + if (x->color_sensitivity[j]) { + skip_uv[j] = 0; + struct macroblock_plane *const puv = &x->plane[i]; + struct macroblockd_plane *const puvd = &xd->plane[i]; + const BLOCK_SIZE uv_bsize = get_plane_block_size( + bsize, puvd->subsampling_x, puvd->subsampling_y); + // Adjust these thresholds for UV. + const int64_t uv_dc_thr = + (puv->dequant_QTX[0] * puv->dequant_QTX[0]) >> 3; + const int64_t uv_ac_thr = + (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> 3; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, i, + i); + var_uv[j] = cpi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride, + puvd->dst.buf, puvd->dst.stride, + &sse_uv[j]); + if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && + (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) + skip_uv[j] = 1; + else + break; + } + } + if (skip_uv[0] & skip_uv[1]) { + *early_term = 1; + } + } + } + if (calculate_rd && out_dist != NULL && out_rate != NULL) { + if (!*early_term) { + const int bwide = block_size_wide[bsize]; + const int bhigh = block_size_high[bsize]; + + model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh, + out_rate, out_dist); + } + + if (*early_term) { + *out_rate = 0; + *out_dist = sse << 4; + } + } +} + +static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, + int64_t *skip_sse_sb, unsigned int *var_y, + unsigned int *sse_y, int calculate_rd) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
+ const int ref = xd->mi[0]->ref_frame[0]; + + assert(bsize < BLOCK_SIZES_ALL); + + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + unsigned int sse; + int rate; + int64_t dist; + + unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, &sse); + xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse); + + if (calculate_rd) { + const int bwide = block_size_wide[bsize]; + const int bhigh = block_size_high[bsize]; + model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh, &rate, + &dist); + } else { + rate = INT_MAX; // this will be overwritten later with block_yrd + dist = INT_MAX; + } + *var_y = var; + *sse_y = sse; + x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + assert(rate >= 0); + + if (skip_txfm_sb) *skip_txfm_sb = rate == 0; + if (skip_sse_sb) *skip_sse_sb = sse << 4; + rate = AOMMIN(rate, INT_MAX); + *out_rate_sum = (int)rate; + *out_dist_sum = dist; +} + +static void block_yrd(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, + RD_STATS *this_rdc, int *skippable, int64_t *sse, + BLOCK_SIZE bsize, TX_SIZE tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + const struct macroblockd_plane *pd = &xd->plane[0]; + struct macroblock_plane *const p = &x->plane[0]; + const int num_4x4_w = mi_size_wide[bsize]; + const int num_4x4_h = mi_size_high[bsize]; + const int step = 1 << (tx_size << 1); + const int block_step = (1 << tx_size); + int block = 0; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5); + int eob_cost = 0; + const int bw = 4 * num_4x4_w; + const int bh = 4 * num_4x4_h; + + (void)mi_row; + (void)mi_col; + (void)cpi; + +#if CONFIG_AV1_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + aom_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, + p->src.stride, pd->dst.buf, pd->dst.stride, + x->e_mbd.bd); + } else { + aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); + } +#else + aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); +#endif + + *skippable = 1; + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. 
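+  // First pass: transform (Hadamard or 4x4 txfm) and quantize each tx block;
+  // the block is skippable only if every eob is zero. Columns at or beyond
+  // max_blocks_wide lie in the border and are skipped.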
+ for (int r = 0; r < max_blocks_high; r += block_step) { + for (int c = 0; c < num_4x4_w; c += block_step) { + if (c < max_blocks_wide) { + const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size]; + const int block_offset = BLOCK_OFFSET(block); +#if CONFIG_AV1_HIGHBITDEPTH + tran_low_t *const coeff = p->coeff + block_offset; + tran_low_t *const qcoeff = p->qcoeff + block_offset; + tran_low_t *const dqcoeff = pd->dqcoeff + block_offset; +#else + int16_t *const low_coeff = (int16_t *)p->coeff + block_offset; + int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset; + int16_t *const low_dqcoeff = (int16_t *)pd->dqcoeff + block_offset; +#endif + uint16_t *const eob = &p->eobs[block]; + const int diff_stride = bw; + const int16_t *src_diff; + src_diff = &p->src_diff[(r * diff_stride + c) << 2]; + + switch (tx_size) { + case TX_64X64: + assert(0); // Not implemented + break; + case TX_32X32: + assert(0); // Not used + break; +#if CONFIG_AV1_HIGHBITDEPTH + case TX_16X16: + aom_hadamard_16x16(src_diff, diff_stride, coeff); + av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, + dqcoeff, p->dequant_QTX, eob, scan_order->scan, + scan_order->iscan); + break; + case TX_8X8: + aom_hadamard_8x8(src_diff, diff_stride, coeff); + av1_quantize_fp(coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, + dqcoeff, p->dequant_QTX, eob, scan_order->scan, + scan_order->iscan); + break; +#else + case TX_16X16: + aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff); + av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX, + p->quant_fp_QTX, low_qcoeff, low_dqcoeff, + p->dequant_QTX, eob, scan_order->scan); + break; + case TX_8X8: + aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff); + av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX, + low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, + scan_order->scan); + break; + default: + assert(tx_size == TX_4X4); + x->fwd_txfm4x4(src_diff, low_coeff, diff_stride); + av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX, + low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, + scan_order->scan); + break; +#endif + } + *skippable &= (*eob == 0); + eob_cost += 1; + } + block += step; + } + } + this_rdc->skip = *skippable; + this_rdc->rate = 0; + if (*sse < INT64_MAX) { + *sse = (*sse << 6) >> 2; + if (*skippable) { + this_rdc->dist = *sse; + return; + } + } + + block = 0; + this_rdc->dist = 0; + for (int r = 0; r < max_blocks_high; r += block_step) { + for (int c = 0; c < num_4x4_w; c += block_step) { + if (c < max_blocks_wide) { + const int block_offset = BLOCK_OFFSET(block); + uint16_t *const eob = &p->eobs[block]; +#if CONFIG_AV1_HIGHBITDEPTH + int64_t dummy; + tran_low_t *const coeff = p->coeff + block_offset; + tran_low_t *const qcoeff = p->qcoeff + block_offset; + tran_low_t *const dqcoeff = pd->dqcoeff + block_offset; + + if (*eob == 1) + this_rdc->rate += (int)abs(qcoeff[0]); + else if (*eob > 1) + this_rdc->rate += aom_satd(qcoeff, step << 4); + + this_rdc->dist += + av1_block_error(coeff, dqcoeff, step << 4, &dummy) >> 2; +#else + int16_t *const low_coeff = (int16_t *)p->coeff + block_offset; + int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset; + int16_t *const low_dqcoeff = (int16_t *)pd->dqcoeff + block_offset; + + if (*eob == 1) + this_rdc->rate += (int)abs(low_qcoeff[0]); + else if (*eob > 1) + this_rdc->rate += aom_satd_lp(low_qcoeff, step << 4); + + this_rdc->dist += + av1_block_error_lp(low_coeff, 
low_dqcoeff, step << 4) >> 2; +#endif + } + block += step; + } + } + + // If skippable is set, rate gets clobbered later. + this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT); + this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT); +} + +static INLINE void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE pred_mode, + MV_REFERENCE_FRAME ref_frame0, + MV_REFERENCE_FRAME ref_frame1, + const AV1_COMMON *cm) { + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + mbmi->ref_mv_idx = 0; + mbmi->mode = pred_mode; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = ref_frame0; + mbmi->ref_frame[1] = ref_frame1; + pmi->palette_size[0] = 0; + pmi->palette_size[1] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->num_proj_ref = 1; + mbmi->interintra_mode = 0; + set_default_interp_filters(mbmi, cm->features.interp_filter); +} + +#if CONFIG_INTERNAL_STATS +static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, + int mode_index) { +#else +static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { +#endif // CONFIG_INTERNAL_STATS + MACROBLOCKD *const xd = &x->e_mbd; + + // Take a snapshot of the coding context so it can be + // restored if we decide to encode this way + ctx->rd_stats.skip = x->force_skip; + memset(ctx->blk_skip, 0, sizeof(ctx->blk_skip[0]) * ctx->num_4x4_blk); + memset(ctx->tx_type_map, DCT_DCT, + sizeof(ctx->tx_type_map[0]) * ctx->num_4x4_blk); + ctx->skippable = x->force_skip; +#if CONFIG_INTERNAL_STATS + ctx->best_mode_index = mode_index; +#endif // CONFIG_INTERNAL_STATS + ctx->mic = *xd->mi[0]; + ctx->skippable = x->force_skip; + av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, x->mbmi_ext, + av1_ref_frame_type(xd->mi[0]->ref_frame)); + ctx->comp_pred_diff = 0; + ctx->hybrid_pred_diff = 0; + ctx->single_pred_diff = 0; +} + +static int get_pred_buffer(PRED_BUFFER *p, int len) { + for (int i = 0; i < len; i++) { + if (!p[i].in_use) { + p[i].in_use = 1; + return i; + } + } + return -1; +} + +static void free_pred_buffer(PRED_BUFFER *p) { + if (p != NULL) p->in_use = 0; +} + +static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode, + int16_t mode_context) { + if (is_inter_compound_mode(mode)) { + return x + ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)]; + } + + int mode_cost = 0; + int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; + + assert(is_inter_mode(mode)); + + if (mode == NEWMV) { + mode_cost = x->newmv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost = x->newmv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + + if (mode == GLOBALMV) { + mode_cost += x->zeromv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost += x->zeromv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; + mode_cost += x->refmv_mode_cost[mode_ctx][mode != NEARESTMV]; + return mode_cost; + } + } +} + +static void newmv_diff_bias(MACROBLOCKD *xd, PREDICTION_MODE this_mode, + RD_STATS *this_rdc, BLOCK_SIZE bsize, int mv_row, + int mv_col, int speed, uint32_t spatial_variance) { + // Bias against MVs associated with NEWMV mode that are very different from + // top/left neighbors. 
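+  // (The MV is compared against the rounded average of the valid above/left
+  // neighbor MVs; a large deviation inflates this_rdc->rdcost.)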
+ if (this_mode == NEWMV) { + int al_mv_average_row; + int al_mv_average_col; + int left_row, left_col; + int row_diff, col_diff; + int above_mv_valid = 0; + int left_mv_valid = 0; + int above_row = 0; + int above_col = 0; + + if (xd->above_mbmi) { + above_mv_valid = xd->above_mbmi->mv[0].as_int != INVALID_MV; + above_row = xd->above_mbmi->mv[0].as_mv.row; + above_col = xd->above_mbmi->mv[0].as_mv.col; + } + if (xd->left_mbmi) { + left_mv_valid = xd->left_mbmi->mv[0].as_int != INVALID_MV; + left_row = xd->left_mbmi->mv[0].as_mv.row; + left_col = xd->left_mbmi->mv[0].as_mv.col; + } + if (above_mv_valid && left_mv_valid) { + al_mv_average_row = (above_row + left_row + 1) >> 1; + al_mv_average_col = (above_col + left_col + 1) >> 1; + } else if (above_mv_valid) { + al_mv_average_row = above_row; + al_mv_average_col = above_col; + } else if (left_mv_valid) { + al_mv_average_row = left_row; + al_mv_average_col = left_col; + } else { + al_mv_average_row = al_mv_average_col = 0; + } + row_diff = al_mv_average_row - mv_row; + col_diff = al_mv_average_col - mv_col; + if (row_diff > 80 || row_diff < -80 || col_diff > 80 || col_diff < -80) { + if (bsize >= BLOCK_32X32) + this_rdc->rdcost = this_rdc->rdcost << 1; + else + this_rdc->rdcost = 5 * this_rdc->rdcost >> 2; + } + } else { + // Bias for speed >= 8 for low spatial variance. + if (speed >= 8 && spatial_variance < 150 && + (mv_row > 64 || mv_row < -64 || mv_col > 64 || mv_col < -64)) + this_rdc->rdcost = 5 * this_rdc->rdcost >> 2; + } +} + +static void model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + RD_STATS *this_rdc, unsigned int *var_y, + unsigned int *sse_y, int start_plane, + int stop_plane) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
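+  // (Hence the dc_quant >> 3 and ac_quant >> 3 passed to the model below.)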
+ unsigned int sse; + int rate; + int64_t dist; + int i; + uint32_t tot_var = *var_y; + uint32_t tot_sse = *sse_y; + + this_rdc->rate = 0; + this_rdc->dist = 0; + this_rdc->skip = 0; + + for (i = start_plane; i <= stop_plane; ++i) { + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &xd->plane[i]; + const uint32_t dc_quant = p->dequant_QTX[0]; + const uint32_t ac_quant = p->dequant_QTX[1]; + const BLOCK_SIZE bs = plane_bsize; + unsigned int var; + if (!x->color_sensitivity[i - 1]) continue; + + var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, &sse); + assert(sse >= var); + tot_var += var; + tot_sse += sse; + + av1_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs], + dc_quant >> 3, &rate, &dist); + + this_rdc->rate += rate >> 1; + this_rdc->dist += dist << 3; + + av1_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], ac_quant >> 3, + &rate, &dist); + + this_rdc->rate += rate; + this_rdc->dist += dist << 4; + } + + if (this_rdc->rate == 0) { + this_rdc->skip = 1; + } + + if (RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist) >= + RDCOST(x->rdmult, 0, ((int64_t)tot_sse) << 4)) { + this_rdc->rate = 0; + this_rdc->dist = tot_sse << 4; + this_rdc->skip = 1; + } + + *var_y = tot_var; + *sse_y = tot_sse; +} + +struct estimate_block_intra_args { + AV1_COMP *cpi; + MACROBLOCK *x; + PREDICTION_MODE mode; + int skippable; + RD_STATS *rdc; +}; + +static void estimate_block_intra(int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + struct estimate_block_intra_args *const args = arg; + AV1_COMP *const cpi = args->cpi; + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size]; + uint8_t *const src_buf_base = p->src.buf; + uint8_t *const dst_buf_base = pd->dst.buf; + const int64_t src_stride = p->src.stride; + const int64_t dst_stride = pd->dst.stride; + RD_STATS this_rdc; + + (void)block; + + p->src.buf = &src_buf_base[4 * (row * src_stride + col)]; + pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)]; + + av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size); + + if (plane == 0) { + int64_t this_sse = INT64_MAX; + block_yrd(cpi, x, 0, 0, &this_rdc, &args->skippable, &this_sse, bsize_tx, + AOMMIN(tx_size, TX_16X16)); + } else { + unsigned int var = 0; + unsigned int sse = 0; + model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &this_rdc, &var, &sse, plane, + plane); + } + + p->src.buf = src_buf_base; + pd->dst.buf = dst_buf_base; + args->rdc->rate += this_rdc.rate; + args->rdc->dist += this_rdc.dist; +} + +static INLINE void update_thresh_freq_fact(AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, + MV_REFERENCE_FRAME ref_frame, + THR_MODES best_mode_idx, + PREDICTION_MODE mode) { + THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)]; + int *freq_fact = &x->thresh_freq_fact[bsize][thr_mode_idx]; + if (thr_mode_idx == best_mode_idx) { + *freq_fact -= (*freq_fact >> 4); + } else { + *freq_fact = + AOMMIN(*freq_fact + RD_THRESH_INC, + cpi->sf.inter_sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); + } +} + +static INLINE int get_force_skip_low_temp_var_small_sb(uint8_t *variance_low, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + // Relative indices of MB inside the superblock. 
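+  // (A small superblock is 64x64 samples, i.e. 16x16 mi units, hence & 0xF.)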
+ const int mi_x = mi_row & 0xF; + const int mi_y = mi_col & 0xF; + // Relative indices of 16x16 block inside the superblock. + const int i = mi_x >> 2; + const int j = mi_y >> 2; + int force_skip_low_temp_var = 0; + // Set force_skip_low_temp_var based on the block size and block offset. + switch (bsize) { + case BLOCK_64X64: force_skip_low_temp_var = variance_low[0]; break; + case BLOCK_64X32: + if (!mi_y && !mi_x) { + force_skip_low_temp_var = variance_low[1]; + } else if (!mi_y && mi_x) { + force_skip_low_temp_var = variance_low[2]; + } + break; + case BLOCK_32X64: + if (!mi_y && !mi_x) { + force_skip_low_temp_var = variance_low[3]; + } else if (mi_y && !mi_x) { + force_skip_low_temp_var = variance_low[4]; + } + break; + case BLOCK_32X32: + if (!mi_y && !mi_x) { + force_skip_low_temp_var = variance_low[5]; + } else if (mi_y && !mi_x) { + force_skip_low_temp_var = variance_low[6]; + } else if (!mi_y && mi_x) { + force_skip_low_temp_var = variance_low[7]; + } else if (mi_y && mi_x) { + force_skip_low_temp_var = variance_low[8]; + } + break; + case BLOCK_32X16: + case BLOCK_16X32: + case BLOCK_16X16: + force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]]; + break; + default: break; + } + + return force_skip_low_temp_var; +} + +static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + int force_skip_low_temp_var = 0; + int x, y; + x = (mi_col & 0x1F) >> 4; + // y = (mi_row & 0x1F) >> 4; + // const int idx64 = (y << 1) + x; + y = (mi_row & 0x17) >> 3; + const int idx64 = y + x; + + x = (mi_col & 0xF) >> 3; + // y = (mi_row & 0xF) >> 3; + // const int idx32 = (y << 1) + x; + y = (mi_row & 0xB) >> 2; + const int idx32 = y + x; + + x = (mi_col & 0x7) >> 2; + // y = (mi_row & 0x7) >> 2; + // const int idx16 = (y << 1) + x; + y = (mi_row & 0x5) >> 1; + const int idx16 = y + x; + // Set force_skip_low_temp_var based on the block size and block offset. + switch (bsize) { + case BLOCK_128X128: force_skip_low_temp_var = variance_low[0]; break; + case BLOCK_128X64: + assert((mi_col & 0x1F) == 0); + force_skip_low_temp_var = variance_low[1 + ((mi_row & 0x1F) != 0)]; + break; + case BLOCK_64X128: + assert((mi_row & 0x1F) == 0); + force_skip_low_temp_var = variance_low[3 + ((mi_col & 0x1F) != 0)]; + break; + case BLOCK_64X64: + // Location of this 64x64 block inside the 128x128 superblock + force_skip_low_temp_var = variance_low[5 + idx64]; + break; + case BLOCK_64X32: + x = (mi_col & 0x1F) >> 4; + y = (mi_row & 0x1F) >> 3; + /* + .---------------.---------------. 
+      | x=0,y=0,idx=0 | x=1,y=0,idx=2 |
+      :---------------+---------------:
+      | x=0,y=1,idx=1 | x=1,y=1,idx=3 |
+      :---------------+---------------:
+      | x=0,y=2,idx=4 | x=1,y=2,idx=6 |
+      :---------------+---------------:
+      | x=0,y=3,idx=5 | x=1,y=3,idx=7 |
+      '---------------'---------------'
+      */
+      const int idx64x32 = (x << 1) + (y % 2) + ((y >> 1) << 2);
+      force_skip_low_temp_var = variance_low[9 + idx64x32];
+      break;
+    case BLOCK_32X64:
+      x = (mi_col & 0x1F) >> 3;
+      y = (mi_row & 0x1F) >> 4;
+      const int idx32x64 = (y << 2) + x;
+      force_skip_low_temp_var = variance_low[17 + idx32x64];
+      break;
+    case BLOCK_32X32:
+      force_skip_low_temp_var = variance_low[25 + (idx64 << 2) + idx32];
+      break;
+    case BLOCK_32X16:
+    case BLOCK_16X32:
+    case BLOCK_16X16:
+      force_skip_low_temp_var =
+          variance_low[41 + (idx64 << 4) + (idx32 << 2) + idx16];
+      break;
+    default: break;
+  }
+  return force_skip_low_temp_var;
+}
+
+#define FILTER_SEARCH_SIZE 2
+static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
+                              int mi_row, int mi_col, PRED_BUFFER *tmp,
+                              BLOCK_SIZE bsize, int reuse_inter_pred,
+                              PRED_BUFFER **this_mode_pred, unsigned int *var_y,
+                              unsigned int *sse_y, int *this_early_term,
+                              int use_model_yrd_large, int64_t *sse_block_yrd) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  MB_MODE_INFO *const mi = xd->mi[0];
+  const int bw = block_size_wide[bsize];
+  int pf_rate[FILTER_SEARCH_SIZE] = { 0 };
+  int64_t pf_dist[FILTER_SEARCH_SIZE] = { 0 };
+  unsigned int pf_var[FILTER_SEARCH_SIZE] = { 0 };
+  unsigned int pf_sse[FILTER_SEARCH_SIZE] = { 0 };
+  int64_t pf_sse_block_yrd[FILTER_SEARCH_SIZE] = { 0 };
+  TX_SIZE pf_tx_size[FILTER_SEARCH_SIZE] = { 0 };
+  PRED_BUFFER *current_pred = *this_mode_pred;
+  int skip_txfm[FILTER_SEARCH_SIZE] = { 0 };
+  int best_skip = 0;
+  int best_early_term = 0;
+  int64_t best_cost = INT64_MAX;
+  int best_filter_index = -1;
+  InterpFilter filters[FILTER_SEARCH_SIZE] = { EIGHTTAP_REGULAR,
+                                               EIGHTTAP_SMOOTH };
+  int i;
+  for (i = 0; i < FILTER_SEARCH_SIZE; ++i) {
+    int64_t cost;
+    InterpFilter filter = filters[i];
+    mi->interp_filters = av1_broadcast_interp_filter(filter);
+    av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+    if (use_model_yrd_large)
+      model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, &pf_rate[i],
+                                &pf_dist[i], &pf_var[i], &pf_sse[i],
+                                this_early_term, 1);
+    else
+      model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[i], &pf_dist[i],
+                        &skip_txfm[i], NULL, &pf_var[i], &pf_sse[i], 1);
+    pf_rate[i] += av1_get_switchable_rate(x, xd, cm->features.interp_filter);
+    cost = RDCOST(x->rdmult, pf_rate[i], pf_dist[i]);
+    pf_tx_size[i] = mi->tx_size;
+    if (cost < best_cost) {
+      best_filter_index = i;
+      best_cost = cost;
+      best_skip = skip_txfm[i];
+      best_early_term = *this_early_term;
+      if (reuse_inter_pred) {
+        if (*this_mode_pred != current_pred) {
+          free_pred_buffer(*this_mode_pred);
+          *this_mode_pred = current_pred;
+        }
+        current_pred = &tmp[get_pred_buffer(tmp, 3)];
+        pd->dst.buf = current_pred->data;
+        pd->dst.stride = bw;
+      }
+    }
+  }
+  assert(best_filter_index >= 0 && best_filter_index < FILTER_SEARCH_SIZE);
+  if (reuse_inter_pred && *this_mode_pred != current_pred)
+    free_pred_buffer(current_pred);
+
+  mi->interp_filters = av1_broadcast_interp_filter(filters[best_filter_index]);
+  mi->tx_size = pf_tx_size[best_filter_index];
+  this_rdc->rate = pf_rate[best_filter_index];
+  this_rdc->dist = pf_dist[best_filter_index];
+  *var_y = pf_var[best_filter_index];
+ 
*sse_y = pf_sse[best_filter_index]; + *sse_block_yrd = pf_sse_block_yrd[best_filter_index]; + this_rdc->skip = (best_skip || best_early_term); + *this_early_term = best_early_term; + if (reuse_inter_pred) { + pd->dst.buf = (*this_mode_pred)->data; + pd->dst.stride = (*this_mode_pred)->stride; + } else if (best_filter_index < FILTER_SEARCH_SIZE - 1) { + av1_enc_build_inter_predictor_y(xd, mi_row, mi_col); + } +} + +#define COLLECT_PICK_MODE_STAT 0 + +#if COLLECT_PICK_MODE_STAT +typedef struct _mode_search_stat { + int32_t num_blocks[BLOCK_SIZES]; + int64_t avg_block_times[BLOCK_SIZES]; + int32_t num_searches[BLOCK_SIZES][MB_MODE_COUNT]; + int32_t num_nonskipped_searches[BLOCK_SIZES][MB_MODE_COUNT]; + int64_t search_times[BLOCK_SIZES][MB_MODE_COUNT]; + int64_t nonskipped_search_times[BLOCK_SIZES][MB_MODE_COUNT]; + struct aom_usec_timer timer1; + struct aom_usec_timer timer2; +} mode_search_stat; +#endif // COLLECT_PICK_MODE_STAT + +static void compute_intra_yprediction(const AV1_COMMON *cm, + PREDICTION_MODE mode, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd) { + struct macroblockd_plane *const pd = &xd->plane[0]; + struct macroblock_plane *const p = &x->plane[0]; + uint8_t *const src_buf_base = p->src.buf; + uint8_t *const dst_buf_base = pd->dst.buf; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + int plane = 0; + int row, col; + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + // transform size varies per plane, look it up in a common way. + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + for (row = 0; row < max_blocks_high; row += (1 << tx_size)) { + // Skip visiting the sub blocks that are wholly within the UMV. 
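+    // Note: row and col are in units of 4x4 blocks and advance one transform
+    // block at a time (1 << tx_size 4x4 units), which is why the source and
+    // destination offsets below are scaled by 4 to get pixel offsets.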
+    for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) {
+      p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)];
+      pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)];
+      av1_predict_intra_block(cm, xd, block_size_wide[bsize],
+                              block_size_high[bsize], tx_size, mode, 0, 0,
+                              FILTER_INTRA_MODES, pd->dst.buf, dst_stride,
+                              pd->dst.buf, dst_stride, 0, 0, plane);
+    }
+  }
+  p->src.buf = src_buf_base;
+  pd->dst.buf = dst_buf_base;
+}
+
+void av1_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  RD_STATS this_rdc, best_rdc;
+  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
+  const TX_SIZE intra_tx_size =
+      AOMMIN(max_txsize_lookup[bsize],
+             tx_mode_to_biggest_tx_size[x->tx_mode_search_type]);
+  int *bmode_costs;
+  const MB_MODE_INFO *above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *left_mi = xd->left_mbmi;
+  const PREDICTION_MODE A = av1_above_block_mode(above_mi);
+  const PREDICTION_MODE L = av1_left_block_mode(left_mi);
+  bmode_costs = x->y_mode_costs[A][L];
+
+  av1_invalid_rd_stats(&best_rdc);
+  av1_invalid_rd_stats(&this_rdc);
+
+  init_mbmi(mi, DC_PRED, INTRA_FRAME, NONE_FRAME, cm);
+  mi->mv[0].as_int = mi->mv[1].as_int = INVALID_MV;
+
+  // Change the limit of this loop to add other intra prediction
+  // mode tests.
+  for (int i = 0; i < 4; ++i) {
+    PREDICTION_MODE this_mode = intra_mode_list[i];
+    this_rdc.dist = this_rdc.rate = 0;
+    args.mode = this_mode;
+    args.skippable = 1;
+    args.rdc = &this_rdc;
+    mi->tx_size = intra_tx_size;
+    av1_foreach_transformed_block_in_plane(xd, bsize, 0, estimate_block_intra,
+                                           &args);
+    if (args.skippable) {
+      this_rdc.rate = av1_cost_symbol(av1_get_skip_cdf(xd)[1]);
+    } else {
+      this_rdc.rate += av1_cost_symbol(av1_get_skip_cdf(xd)[0]);
+    }
+    this_rdc.rate += bmode_costs[this_mode];
+    this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+
+    if (this_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = this_rdc;
+      mi->mode = this_mode;
+    }
+  }
+
+  *rd_cost = best_rdc;
+
+#if CONFIG_INTERNAL_STATS
+  store_coding_context(x, ctx, mi->mode);
+#else
+  store_coding_context(x, ctx);
+#endif  // CONFIG_INTERNAL_STATS
+}
+
+void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
+                                  MACROBLOCK *x, RD_STATS *rd_cost,
+                                  BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                                  int64_t best_rd_so_far) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+
+  BEST_PICKMODE best_pickmode;
+  int inter_mode_mask[BLOCK_SIZES];
+#if COLLECT_PICK_MODE_STAT
+  static mode_search_stat ms_stat;
+#endif
+  MV_REFERENCE_FRAME ref_frame;
+  MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame;
+  int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];
+  uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES];
+  struct buf_2d yv12_mb[8][MAX_MB_PLANE];
+  static const int flag_list[8] = { 0, AOM_LAST_FLAG, 0, 0, AOM_GOLD_FLAG, 0,
+                                    0, AOM_ALT_FLAG };
+  RD_STATS this_rdc, best_rdc;
+  // var_y and sse_y are saved to be used later in skip checking.
+  unsigned int sse_y = UINT_MAX;
+  unsigned int var_y = UINT_MAX;
+  const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize];
+  const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
+  InterpFilter filter_ref;
+  int ref_frame_skip_mask = 0;
+  int best_pred_sad = INT_MAX;
+  int best_early_term = 0;
+  unsigned int ref_costs_single[REF_FRAMES],
+      ref_costs_comp[REF_FRAMES][REF_FRAMES];
+  int force_skip_low_temp_var = 0;
+  int skip_ref_find_pred[8] = { 0 };
+  unsigned int sse_zeromv_norm = UINT_MAX;
+  const unsigned int thresh_skip_golden = 500;
+  int gf_temporal_ref = 0;
+  const struct segmentation *const seg = &cm->seg;
+  int num_inter_modes = RT_INTER_MODES;
+  unsigned char segment_id = mi->segment_id;
+  PRED_BUFFER tmp[4];
+  DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 128 * 128]);
+  PRED_BUFFER *this_mode_pred = NULL;
+  const int reuse_inter_pred =
+      cpi->sf.rt_sf.reuse_inter_pred_nonrd && cm->seq_params.bit_depth == 8;
+  const int bh = block_size_high[bsize];
+  const int bw = block_size_wide[bsize];
+  const int pixels_in_block = bh * bw;
+  struct buf_2d orig_dst = pd->dst;
+  const CommonQuantParams *quant_params = &cm->quant_params;
+#if COLLECT_PICK_MODE_STAT
+  aom_usec_timer_start(&ms_stat.timer2);
+#endif
+  int intra_cost_penalty = av1_get_intra_cost_penalty(
+      quant_params->base_qindex, quant_params->y_dc_delta_q,
+      cm->seq_params.bit_depth);
+  int64_t inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
+  const int perform_intra_pred = cpi->sf.rt_sf.check_intra_pred_nonrd;
+  int use_modeled_non_rd_cost = 0;
+  int enable_filter_search = 0;
+  InterpFilter default_interp_filter = EIGHTTAP_REGULAR;
+  int64_t thresh_sad_pred = INT64_MAX;
+
+  (void)best_rd_so_far;
+
+  init_best_pickmode(&best_pickmode);
+
+  for (int i = 0; i < BLOCK_SIZES; ++i) inter_mode_mask[i] = INTER_ALL;
+
+  // TODO(kyslov) Move this to Speed Features
+  inter_mode_mask[BLOCK_128X128] = INTER_NEAREST_NEAR;
+
+  struct scale_factors *const sf_last = get_ref_scale_factors(cm, LAST_FRAME);
+  struct scale_factors *const sf_golden =
+      get_ref_scale_factors(cm, GOLDEN_FRAME);
+  gf_temporal_ref = 1;
+  // For temporal long term prediction, check that the golden reference is at
+  // the same scale as the last reference; otherwise disable it.
+  if ((sf_last->x_scale_fp != sf_golden->x_scale_fp) ||
+      (sf_last->y_scale_fp != sf_golden->y_scale_fp)) {
+    gf_temporal_ref = 0;
+  }
+
+  av1_collect_neighbors_ref_counts(xd);
+
+  estimate_single_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single);
+  if (cpi->sf.rt_sf.use_comp_ref_nonrd)
+    estimate_comp_ref_frame_costs(cm, xd, x, segment_id, ref_costs_comp);
+
+  memset(&mode_checked[0][0], 0, MB_MODE_COUNT * REF_FRAMES);
+  if (reuse_inter_pred) {
+    for (int i = 0; i < 3; i++) {
+      tmp[i].data = &pred_buf[pixels_in_block * i];
+      tmp[i].stride = bw;
+      tmp[i].in_use = 0;
+    }
+    tmp[3].data = pd->dst.buf;
+    tmp[3].stride = pd->dst.stride;
+    tmp[3].in_use = 0;
+  }
+
+  x->force_skip = 0;
+
+  // Instead of using av1_get_pred_context_switchable_interp(xd) to assign
+  // filter_ref, we use a less strict condition on assigning filter_ref.
+  // This is to reduce the probability of entering the flow of not assigning
+  // filter_ref and then skipping the filter search.
+  filter_ref = cm->features.interp_filter;
+
+  // Initialize mode decisions.
+  av1_invalid_rd_stats(&best_rdc);
+  av1_invalid_rd_stats(&this_rdc);
+  av1_invalid_rd_stats(rd_cost);
+  mi->sb_type = bsize;
+  mi->ref_frame[0] = NONE_FRAME;
+  mi->ref_frame[1] = NONE_FRAME;
+
+  usable_ref_frame =
+      cpi->sf.rt_sf.use_nonrd_altref_frame ? ALTREF_FRAME : GOLDEN_FRAME;
+
+  if (cpi->rc.frames_since_golden == 0 && gf_temporal_ref) {
+    skip_ref_find_pred[GOLDEN_FRAME] = 1;
+    if (!cpi->sf.rt_sf.use_nonrd_altref_frame) usable_ref_frame = LAST_FRAME;
+  }
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+  if (cpi->sf.rt_sf.short_circuit_low_temp_var &&
+      x->nonrd_prune_ref_frame_search) {
+    if (is_small_sb)
+      force_skip_low_temp_var = get_force_skip_low_temp_var_small_sb(
+          &x->variance_low[0], mi_row, mi_col, bsize);
+    else
+      force_skip_low_temp_var = get_force_skip_low_temp_var(
+          &x->variance_low[0], mi_row, mi_col, bsize);
+    // If force_skip_low_temp_var is set, skip the golden reference.
+    if (force_skip_low_temp_var) {
+      usable_ref_frame = LAST_FRAME;
+    }
+  }
+
+  // If the segment reference frame feature is enabled and it's set to GOLDEN
+  // reference, then make sure we don't skip checking GOLDEN; this is to
+  // prevent the possibility of not picking any mode.
+  if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+      get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
+    usable_ref_frame = GOLDEN_FRAME;
+    skip_ref_find_pred[GOLDEN_FRAME] = 0;
+  }
+
+  for (MV_REFERENCE_FRAME ref_frame_iter = LAST_FRAME;
+       ref_frame_iter <= usable_ref_frame; ++ref_frame_iter) {
+    // Skip find_predictors if the reference frame is not in the
+    // ref_frame_flags (i.e., not used as a reference for this frame).
+    skip_ref_find_pred[ref_frame_iter] =
+        !(cpi->ref_frame_flags & flag_list[ref_frame_iter]);
+    if (!skip_ref_find_pred[ref_frame_iter]) {
+      find_predictors(cpi, x, ref_frame_iter, frame_mv, &ref_frame_skip_mask,
+                      flag_list, tile_data, yv12_mb, bsize,
+                      force_skip_low_temp_var);
+    }
+  }
+
+  thresh_sad_pred = ((int64_t)x->pred_mv_sad[LAST_FRAME]) << 1;
+  // Increase the threshold for less aggressive pruning.
+  if (cpi->sf.rt_sf.nonrd_prune_ref_frame_search == 1)
+    thresh_sad_pred += (x->pred_mv_sad[LAST_FRAME] >> 2);
+
+  const int large_block = bsize >= BLOCK_32X32;
+  const int use_model_yrd_large =
+      cpi->oxcf.rc_mode == AOM_CBR && large_block &&
+      !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
+      quant_params->base_qindex && cm->seq_params.bit_depth == 8;
+
+#if COLLECT_PICK_MODE_STAT
+  ms_stat.num_blocks[bsize]++;
+#endif
+  init_mbmi(mi, DC_PRED, NONE_FRAME, NONE_FRAME, cm);
+  mi->tx_size =
+      AOMMIN(AOMMIN(max_txsize_lookup[bsize],
+                    tx_mode_to_biggest_tx_size[x->tx_mode_search_type]),
+             TX_16X16);
+
+  // TODO(marpan): Look into reducing these conditions. For now constrain
+  // it to avoid significant bdrate loss.
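+  // The check below enables the modeled (non-RD) cost only for mid-to-high
+  // QP, sufficiently textured sources, and small blocks, and only when the
+  // superblock content state is neither low-variance-high-sumdiff nor
+  // high-SAD.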
+ if (cpi->sf.rt_sf.use_modeled_non_rd_cost && + quant_params->base_qindex > 120 && x->source_variance > 100 && + bsize <= BLOCK_16X16 && x->content_state_sb != kLowVarHighSumdiff && + x->content_state_sb != kHighSad) + use_modeled_non_rd_cost = 1; + + if (cpi->sf.rt_sf.use_nonrd_filter_search) { + enable_filter_search = 1; + if (cpi->sf.interp_sf.cb_pred_filter_search) { + const int bsl = mi_size_wide_log2[bsize]; + enable_filter_search = + (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_frame.frame_number)) & + 0x1; + } + if (x->source_variance <= + cpi->sf.interp_sf.disable_filter_search_var_thresh) + enable_filter_search = 0; + } + + for (int idx = 0; idx < num_inter_modes; ++idx) { + int rate_mv = 0; + int mode_rd_thresh; + int mode_index; + int64_t this_sse; + int is_skippable; + int this_early_term = 0; + int skip_this_mv = 0; + int comp_pred = 0; + int force_mv_inter_layer = 0; + PREDICTION_MODE this_mode; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + second_ref_frame = NONE_FRAME; + + this_mode = ref_mode_set[idx].pred_mode; + ref_frame = ref_mode_set[idx].ref_frame; + +#if COLLECT_PICK_MODE_STAT + aom_usec_timer_start(&ms_stat.timer1); + ms_stat.num_searches[bsize][this_mode]++; +#endif + mi->mode = this_mode; + mi->ref_frame[0] = ref_frame; + + if (ref_frame > usable_ref_frame) continue; + if (skip_ref_find_pred[ref_frame]) continue; + + // Skip non-zero motion for SVC if skip_nonzeromv_ref is set. + if (cpi->use_svc && frame_mv[this_mode][ref_frame].as_int != 0) { + if (ref_frame == LAST_FRAME && cpi->svc.skip_nonzeromv_last) + continue; + else if (ref_frame == GOLDEN_FRAME && cpi->svc.skip_nonzeromv_gf) + continue; + } + + // If the segment reference frame feature is enabled then do nothing if the + // current ref frame is not allowed. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) + continue; + + if (ref_frame != LAST_FRAME && cpi->oxcf.rc_mode == AOM_CBR && + sse_zeromv_norm < thresh_skip_golden && this_mode == NEWMV) + continue; + + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; + + if (!(inter_mode_mask[bsize] & (1 << this_mode))) continue; + + // Skip testing non-LAST if this flag is set. + if (x->nonrd_prune_ref_frame_search) { + if (x->nonrd_prune_ref_frame_search > 1 && ref_frame != LAST_FRAME && + (bsize > BLOCK_64X64 || (bsize > BLOCK_16X16 && this_mode == NEWMV))) + continue; + + if (ref_frame != LAST_FRAME && this_mode == NEARMV) continue; + } + + // Skip non-zeromv mode search for non-LAST frame if force_skip_low_temp_var + // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped + // later. + if (!force_mv_inter_layer && force_skip_low_temp_var && + ref_frame != LAST_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) { + continue; + } + +#if 0 + if (x->content_state_sb != kVeryHighSad && + (cpi->sf.short_circuit_low_temp_var >= 2 || + (cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64)) + && force_skip_low_temp_var && ref_frame == LAST_FRAME && this_mode == + NEWMV) { + continue; + } +#endif + + // Disable this drop out case if the ref frame segment level feature is + // enabled for this segment. This is to prevent the possibility that we + // end up unable to pick any mode. + if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) { + // Check for skipping GOLDEN and ALTREF based pred_mv_sad. 
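+      // A non-LAST reference is pruned here when its predicted-MV SAD
+      // exceeds thresh_sad_pred, which was derived above as roughly twice
+      // the LAST-frame pred_mv_sad.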
+ if (cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0 && + x->pred_mv_sad[ref_frame] != INT_MAX && ref_frame != LAST_FRAME) { + if ((int64_t)(x->pred_mv_sad[ref_frame]) > thresh_sad_pred) + ref_frame_skip_mask |= (1 << ref_frame); + } + if (ref_frame_skip_mask & (1 << ref_frame)) continue; + } + + // Select prediction reference frames. + for (int i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + } + + mi->ref_frame[0] = ref_frame; + mi->ref_frame[1] = second_ref_frame; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + + mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)]; + mode_rd_thresh = best_pickmode.best_mode_skip_txfm + ? rd_threshes[mode_index] << 1 + : rd_threshes[mode_index]; + + // Increase mode_rd_thresh value for non-LAST for improved encoding + // speed + if (ref_frame != LAST_FRAME) { + mode_rd_thresh = mode_rd_thresh << 1; + if (ref_frame == GOLDEN_FRAME && cpi->rc.frames_since_golden > 4) + mode_rd_thresh = mode_rd_thresh << 1; + } + + if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, + rd_thresh_freq_fact[mode_index])) + if (frame_mv[this_mode][ref_frame].as_int != 0) continue; + + if (this_mode == NEWMV && !force_mv_inter_layer) { + if (search_new_mv(cpi, x, frame_mv, ref_frame, gf_temporal_ref, bsize, + mi_row, mi_col, best_pred_sad, &rate_mv, &best_rdc)) + continue; + } + + for (PREDICTION_MODE inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV; + inter_mv_mode++) { + if (inter_mv_mode == this_mode || comp_pred) continue; + if (mode_checked[inter_mv_mode][ref_frame] && + frame_mv[this_mode][ref_frame].as_int == + frame_mv[inter_mv_mode][ref_frame].as_int) { + skip_this_mv = 1; + break; + } + } + + if (skip_this_mv) continue; + + mi->mode = this_mode; + mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int; + mi->mv[1].as_int = 0; + if (reuse_inter_pred) { + if (!this_mode_pred) { + this_mode_pred = &tmp[3]; + } else { + this_mode_pred = &tmp[get_pred_buffer(tmp, 3)]; + pd->dst.buf = this_mode_pred->data; + pd->dst.stride = bw; + } + } +#if COLLECT_PICK_MODE_STAT + ms_stat.num_nonskipped_searches[bsize][this_mode]++; +#endif + if (enable_filter_search && + ((mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07)) && + (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search)) { + search_filter_ref(cpi, x, &this_rdc, mi_row, mi_col, tmp, bsize, + reuse_inter_pred, &this_mode_pred, &var_y, &sse_y, + &this_early_term, use_model_yrd_large, &this_sse); + } else { + mi->interp_filters = + (filter_ref == SWITCHABLE) + ? 
av1_broadcast_interp_filter(default_interp_filter) + : av1_broadcast_interp_filter(filter_ref); + av1_enc_build_inter_predictor_y(xd, mi_row, mi_col); + if (use_model_yrd_large) { + model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, NULL, NULL, + &var_y, &sse_y, &this_early_term, + use_modeled_non_rd_cost); + } else { + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, + &this_rdc.skip, NULL, &var_y, &sse_y, + use_modeled_non_rd_cost); + } + } + + if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0) { + sse_zeromv_norm = + sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + } + + const int skip_ctx = av1_get_skip_context(xd); + const int skip_cost = x->skip_cost[skip_ctx][1]; + const int no_skip_cost = x->skip_cost[skip_ctx][0]; + if (!this_early_term) { + if (use_modeled_non_rd_cost) { + if (this_rdc.skip) { + this_rdc.rate = skip_cost; + } else { + this_rdc.rate += no_skip_cost; + } + } else { + this_sse = (int64_t)sse_y; + block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &is_skippable, &this_sse, + bsize, mi->tx_size); + if (this_rdc.skip) { + this_rdc.rate = skip_cost; + } else { + if (RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist) >= + RDCOST(x->rdmult, 0, + this_sse)) { // this_sse already multiplied by 16 in + // block_yrd + this_rdc.skip = 1; + this_rdc.rate = skip_cost; + this_rdc.dist = this_sse; + } else { + this_rdc.rate += no_skip_cost; + } + } + } + } else { + this_rdc.skip = 1; + this_rdc.rate = skip_cost; + this_rdc.dist = sse_y << 4; + } + + if (!this_early_term && + (x->color_sensitivity[0] || x->color_sensitivity[1])) { + RD_STATS rdc_uv; + const BLOCK_SIZE uv_bsize = get_plane_block_size( + bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y); + if (x->color_sensitivity[0]) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_U, AOM_PLANE_U); + } + if (x->color_sensitivity[1]) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_V, AOM_PLANE_V); + } + model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2); + this_rdc.rate += rdc_uv.rate; + this_rdc.dist += rdc_uv.dist; + this_rdc.skip = this_rdc.skip && rdc_uv.skip; + } + + // TODO(kyslov) account for UV prediction cost + this_rdc.rate += rate_mv; + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame); + this_rdc.rate += cost_mv_ref(x, this_mode, mode_ctx); + + this_rdc.rate += ref_costs_single[ref_frame]; + + this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); + if (cpi->oxcf.rc_mode == AOM_CBR) { + newmv_diff_bias(xd, this_mode, &this_rdc, bsize, + frame_mv[this_mode][ref_frame].as_mv.row, + frame_mv[this_mode][ref_frame].as_mv.col, cpi->speed, + x->source_variance); + } + + mode_checked[this_mode][ref_frame] = 1; +#if COLLECT_PICK_MODE_STAT + aom_usec_timer_mark(&ms_stat.timer1); + ms_stat.nonskipped_search_times[bsize][this_mode] += + aom_usec_timer_elapsed(&ms_stat.timer1); +#endif + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + best_early_term = this_early_term; + best_pickmode.best_mode = this_mode; + best_pickmode.best_pred_filter = mi->interp_filters; + best_pickmode.best_tx_size = mi->tx_size; + best_pickmode.best_ref_frame = ref_frame; + best_pickmode.best_mode_skip_txfm = this_rdc.skip; + best_pickmode.best_second_ref_frame = second_ref_frame; + if (reuse_inter_pred) { + free_pred_buffer(best_pickmode.best_pred); + best_pickmode.best_pred = this_mode_pred; + } + } else { + if 
(reuse_inter_pred) free_pred_buffer(this_mode_pred);
+    }
+    if (best_early_term && idx > 0) {
+      x->force_skip = 1;
+      break;
+    }
+  }
+
+  mi->mode = best_pickmode.best_mode;
+  mi->interp_filters = best_pickmode.best_pred_filter;
+  mi->tx_size = best_pickmode.best_tx_size;
+  memset(mi->inter_tx_size, mi->tx_size, sizeof(mi->inter_tx_size));
+  mi->ref_frame[0] = best_pickmode.best_ref_frame;
+  mi->mv[0].as_int =
+      frame_mv[best_pickmode.best_mode][best_pickmode.best_ref_frame].as_int;
+  mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
+  x->force_skip = best_rdc.skip;
+
+  // Perform intra prediction search if the best SAD is above a certain
+  // threshold.
+  mi->angle_delta[PLANE_TYPE_Y] = 0;
+  mi->angle_delta[PLANE_TYPE_UV] = 0;
+  mi->filter_intra_mode_info.use_filter_intra = 0;
+
+  uint32_t spatial_var_thresh = 50;
+  int motion_thresh = 32;
+  // Adjust thresholds so that intra mode is more likely to be tested if the
+  // other references (golden, alt) are skipped/not checked.
+  if (cpi->sf.rt_sf.use_nonrd_altref_frame == 0 &&
+      cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0) {
+    spatial_var_thresh = 150;
+    motion_thresh = 0;
+  }
+  int do_early_exit_rdthresh = 1;
+  // Some adjustments to checking intra mode based on source variance.
+  if (x->source_variance < spatial_var_thresh) {
+    // If the best inter mode has large motion or a non-LAST reference, reduce
+    // the intra cost penalty so that intra mode is more likely to be tested.
+    if (best_pickmode.best_ref_frame != LAST_FRAME ||
+        abs(mi->mv[0].as_mv.row) >= motion_thresh ||
+        abs(mi->mv[0].as_mv.col) >= motion_thresh) {
+      intra_cost_penalty = intra_cost_penalty >> 2;
+      inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
+      do_early_exit_rdthresh = 0;
+    }
+    // For big blocks, intra is worth checking (since only DC will be
+    // checked), even if best_early_term is set.
+    if (bsize >= BLOCK_32X32) best_early_term = 0;
+  }
+
+  if (best_rdc.rdcost == INT64_MAX ||
+      (perform_intra_pred && !best_early_term &&
+       best_rdc.rdcost > inter_mode_thresh &&
+       bsize <= cpi->sf.part_sf.max_intra_bsize)) {
+    int64_t this_sse = INT64_MAX;
+    struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
+    PRED_BUFFER *const best_pred = best_pickmode.best_pred;
+    TX_SIZE intra_tx_size =
+        AOMMIN(AOMMIN(max_txsize_lookup[bsize],
+                      tx_mode_to_biggest_tx_size[x->tx_mode_search_type]),
+               TX_16X16);
+
+    if (reuse_inter_pred && best_pred != NULL) {
+      if (best_pred->data == orig_dst.buf) {
+        this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+        aom_convolve_copy(best_pred->data, best_pred->stride,
+                          this_mode_pred->data, this_mode_pred->stride, 0, 0, 0,
+                          0, bw, bh);
+        best_pickmode.best_pred = this_mode_pred;
+      }
+    }
+    pd->dst = orig_dst;
+
+    for (int i = 0; i < 4; ++i) {
+      const PREDICTION_MODE this_mode = intra_mode_list[i];
+      const THR_MODES mode_index =
+          mode_idx[INTRA_FRAME][mode_offset(this_mode)];
+      const int mode_rd_thresh = rd_threshes[mode_index];
+
+      // Only check DC for blocks >= 32X32.
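+      // DC_PRED is mode 0 in the PREDICTION_MODE enum, so any nonzero mode
+      // here is a non-DC intra mode and is skipped for the large blocks.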
+ if (this_mode > 0 && bsize >= BLOCK_32X32) continue; + + if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, + rd_thresh_freq_fact[mode_index]) && + (do_early_exit_rdthresh || this_mode == SMOOTH_PRED)) { + continue; + } + const BLOCK_SIZE uv_bsize = get_plane_block_size( + bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y); + + mi->mode = this_mode; + mi->ref_frame[0] = INTRA_FRAME; + mi->ref_frame[1] = NONE_FRAME; + + this_rdc.dist = this_rdc.rate = 0; + args.mode = this_mode; + args.skippable = 1; + args.rdc = &this_rdc; + mi->tx_size = intra_tx_size; + compute_intra_yprediction(cm, this_mode, bsize, x, xd); + // Look into selecting tx_size here, based on prediction residual. + if (use_modeled_non_rd_cost) + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, + &this_rdc.skip, NULL, &var_y, &sse_y, 1); + else + block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &args.skippable, &this_sse, + bsize, mi->tx_size); + // TODO(kyslov@) Need to account for skippable + if (x->color_sensitivity[0]) { + av1_foreach_transformed_block_in_plane(xd, uv_bsize, 1, + estimate_block_intra, &args); + } + if (x->color_sensitivity[1]) { + av1_foreach_transformed_block_in_plane(xd, uv_bsize, 2, + estimate_block_intra, &args); + } + + int mode_cost = 0; + if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) { + mode_cost += x->angle_delta_cost[this_mode - V_PRED] + [MAX_ANGLE_DELTA + + mi->angle_delta[PLANE_TYPE_Y]]; + } + if (this_mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { + mode_cost += x->filter_intra_cost[bsize][0]; + } + this_rdc.rate += ref_costs_single[INTRA_FRAME]; + this_rdc.rate += intra_cost_penalty; + this_rdc.rate += mode_cost; + this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); + + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + best_pickmode.best_mode = this_mode; + best_pickmode.best_intra_tx_size = mi->tx_size; + best_pickmode.best_ref_frame = INTRA_FRAME; + best_pickmode.best_second_ref_frame = NONE_FRAME; + mi->uv_mode = this_mode; + mi->mv[0].as_int = INVALID_MV; + mi->mv[1].as_int = INVALID_MV; + } + } + + // Reset mb_mode_info to the best inter mode. + if (best_pickmode.best_ref_frame != INTRA_FRAME) { + mi->tx_size = best_pickmode.best_tx_size; + } else { + mi->tx_size = best_pickmode.best_intra_tx_size; + } + } + + pd->dst = orig_dst; + mi->mode = best_pickmode.best_mode; + mi->ref_frame[0] = best_pickmode.best_ref_frame; + mi->ref_frame[1] = best_pickmode.best_second_ref_frame; + + if (!is_inter_block(mi)) { + mi->interp_filters = av1_broadcast_interp_filter(SWITCHABLE_FILTERS); + } + + if (reuse_inter_pred && best_pickmode.best_pred != NULL) { + PRED_BUFFER *const best_pred = best_pickmode.best_pred; + if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) { + aom_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, + pd->dst.stride, 0, 0, 0, 0, bw, bh); + } + } + if (cpi->sf.inter_sf.adaptive_rd_thresh) { + THR_MODES best_mode_idx = + mode_idx[best_pickmode.best_ref_frame][mode_offset(mi->mode)]; + if (best_pickmode.best_ref_frame == INTRA_FRAME) { + // Only consider the modes that are included in the intra_mode_list. 
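+      // The count is derived from the array length so the update loop stays
+      // in sync with intra_mode_list if the list is ever extended.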
+      int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE);
+      for (int i = 0; i < intra_modes; i++) {
+        update_thresh_freq_fact(cpi, x, bsize, INTRA_FRAME, best_mode_idx,
+                                intra_mode_list[i]);
+      }
+    } else {
+      for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
+        PREDICTION_MODE this_mode;
+        if (best_pickmode.best_ref_frame != ref_frame) continue;
+        for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+          update_thresh_freq_fact(cpi, x, bsize, ref_frame, best_mode_idx,
+                                  this_mode);
+        }
+      }
+    }
+  }
+
+#if CONFIG_INTERNAL_STATS
+  store_coding_context(x, ctx, mi->mode);
+#else
+  store_coding_context(x, ctx);
+#endif  // CONFIG_INTERNAL_STATS
+#if COLLECT_PICK_MODE_STAT
+  aom_usec_timer_mark(&ms_stat.timer2);
+  ms_stat.avg_block_times[bsize] += aom_usec_timer_elapsed(&ms_stat.timer2);
+  //
+  if ((mi_row + mi_size_high[bsize] >= (cpi->common.mi_params.mi_rows)) &&
+      (mi_col + mi_size_wide[bsize] >= (cpi->common.mi_params.mi_cols))) {
+    int i, j;
+    PREDICTION_MODE used_modes[3] = { NEARESTMV, NEARMV, NEWMV };
+    BLOCK_SIZE bss[5] = { BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64,
+                          BLOCK_128X128 };
+    int64_t total_time = 0l;
+    int32_t total_blocks = 0;
+
+    printf("\n");
+    for (i = 0; i < 5; i++) {
+      printf("BS(%d) Num %d, Avg_time %f: ", bss[i], ms_stat.num_blocks[bss[i]],
+             ms_stat.num_blocks[bss[i]] > 0
+                 ? (float)ms_stat.avg_block_times[bss[i]] /
+                       ms_stat.num_blocks[bss[i]]
+                 : 0);
+      total_time += ms_stat.avg_block_times[bss[i]];
+      total_blocks += ms_stat.num_blocks[bss[i]];
+      for (j = 0; j < 3; j++) {
+        printf("Mode %d, %d/%d tps %f ", used_modes[j],
+               ms_stat.num_nonskipped_searches[bss[i]][used_modes[j]],
+               ms_stat.num_searches[bss[i]][used_modes[j]],
+               ms_stat.num_nonskipped_searches[bss[i]][used_modes[j]] > 0
+                   ? (float)ms_stat
+                             .nonskipped_search_times[bss[i]][used_modes[j]] /
+                         ms_stat.num_nonskipped_searches[bss[i]][used_modes[j]]
+                   : 0l);
+      }
+      printf("\n");
+    }
+    printf("Total time = %ld. Total blocks = %d\n", total_time, total_blocks);
+  }
+  //
+#endif  // COLLECT_PICK_MODE_STAT
+  *rd_cost = best_rdc;
+}
diff --git a/libs/libaom/src/av1/encoder/palette.c b/libs/libaom/src/av1/encoder/palette.c
new file mode 100644
index 000000000..e61cd02ce
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/palette.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+
+#define AV1_K_MEANS_DIM 1
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+#define AV1_K_MEANS_DIM 2
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+
+static int int_comparer(const void *a, const void *b) {
+  return (*(int *)a - *(int *)b);
+}
+
+int av1_remove_duplicates(int *centroids, int num_centroids) {
+  int num_unique;  // number of unique centroids
+  int i;
+  qsort(centroids, num_centroids, sizeof(*centroids), int_comparer);
+  // Remove duplicates.
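+  // A single pass over the sorted array suffices: keep centroids[0]
+  // implicitly and copy forward each value that differs from its predecessor.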
+ num_unique = 1; + for (i = 1; i < num_centroids; ++i) { + if (centroids[i] != centroids[i - 1]) { // found a new unique centroid + centroids[num_unique++] = centroids[i]; + } + } + return num_unique; +} + +static int delta_encode_cost(const int *colors, int num, int bit_depth, + int min_val) { + if (num <= 0) return 0; + int bits_cost = bit_depth; + if (num == 1) return bits_cost; + bits_cost += 2; + int max_delta = 0; + int deltas[PALETTE_MAX_SIZE]; + const int min_bits = bit_depth - 3; + for (int i = 1; i < num; ++i) { + const int delta = colors[i] - colors[i - 1]; + deltas[i - 1] = delta; + assert(delta >= min_val); + if (delta > max_delta) max_delta = delta; + } + int bits_per_delta = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits); + assert(bits_per_delta <= bit_depth); + int range = (1 << bit_depth) - colors[0] - min_val; + for (int i = 0; i < num - 1; ++i) { + bits_cost += bits_per_delta; + range -= deltas[i]; + bits_per_delta = AOMMIN(bits_per_delta, av1_ceil_log2(range)); + } + return bits_cost; +} + +int av1_index_color_cache(const uint16_t *color_cache, int n_cache, + const uint16_t *colors, int n_colors, + uint8_t *cache_color_found, int *out_cache_colors) { + if (n_cache <= 0) { + for (int i = 0; i < n_colors; ++i) out_cache_colors[i] = colors[i]; + return n_colors; + } + memset(cache_color_found, 0, n_cache * sizeof(*cache_color_found)); + int n_in_cache = 0; + int in_cache_flags[PALETTE_MAX_SIZE]; + memset(in_cache_flags, 0, sizeof(in_cache_flags)); + for (int i = 0; i < n_cache && n_in_cache < n_colors; ++i) { + for (int j = 0; j < n_colors; ++j) { + if (colors[j] == color_cache[i]) { + in_cache_flags[j] = 1; + cache_color_found[i] = 1; + ++n_in_cache; + break; + } + } + } + int j = 0; + for (int i = 0; i < n_colors; ++i) + if (!in_cache_flags[i]) out_cache_colors[j++] = colors[i]; + assert(j == n_colors - n_in_cache); + return j; +} + +int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi, + int bit_depth, int *zero_count, + int *min_bits) { + const int n = pmi->palette_size[1]; + const int max_val = 1 << bit_depth; + int max_d = 0; + *min_bits = bit_depth - 4; + *zero_count = 0; + for (int i = 1; i < n; ++i) { + const int delta = pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] - + pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1]; + const int v = abs(delta); + const int d = AOMMIN(v, max_val - v); + if (d > max_d) max_d = d; + if (d == 0) ++(*zero_count); + } + return AOMMAX(av1_ceil_log2(max_d + 1), *min_bits); +} + +int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, + uint16_t *color_cache, int n_cache, + int bit_depth) { + const int n = pmi->palette_size[0]; + int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = + av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n, + cache_color_found, out_cache_colors); + const int total_bits = + n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 1); + return av1_cost_literal(total_bits); +} + +int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, + uint16_t *color_cache, int n_cache, + int bit_depth) { + const int n = pmi->palette_size[1]; + int total_bits = 0; + // U channel palette color cost. 
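+  // The U palette colors start at offset PALETTE_MAX_SIZE in palette_colors;
+  // they are matched against the cache and delta-coded the same way as the
+  // luma palette, but with a minimum delta of 0.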
+ int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = av1_index_color_cache( + color_cache, n_cache, pmi->palette_colors + PALETTE_MAX_SIZE, n, + cache_color_found, out_cache_colors); + total_bits += + n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 0); + + // V channel palette color cost. + int zero_count = 0, min_bits_v = 0; + const int bits_v = + av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v); + const int bits_using_delta = + 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count; + const int bits_using_raw = bit_depth * n; + total_bits += 1 + AOMMIN(bits_using_delta, bits_using_raw); + return av1_cost_literal(total_bits); +} diff --git a/libs/libaom/src/av1/encoder/palette.h b/libs/libaom/src/av1/encoder/palette.h new file mode 100644 index 000000000..8b88c4755 --- /dev/null +++ b/libs/libaom/src/av1/encoder/palette.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PALETTE_H_ +#define AOM_AV1_ENCODER_PALETTE_H_ + +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim + +void AV1_K_MEANS_RENAME(av1_calc_indices, 1)(const int *data, + const int *centroids, + uint8_t *indices, int n, int k); +void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const int *data, + const int *centroids, + uint8_t *indices, int n, int k); +void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int *data, int *centroids, + uint8_t *indices, int n, int k, + int max_itr); +void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int *data, int *centroids, + uint8_t *indices, int n, int k, + int max_itr); + +// Given 'n' 'data' points and 'k' 'centroids' each of dimension 'dim', +// calculate the centroid 'indices' for the data points. +static INLINE void av1_calc_indices(const int *data, const int *centroids, + uint8_t *indices, int n, int k, int dim) { + if (dim == 1) { + AV1_K_MEANS_RENAME(av1_calc_indices, 1)(data, centroids, indices, n, k); + } else if (dim == 2) { + AV1_K_MEANS_RENAME(av1_calc_indices, 2)(data, centroids, indices, n, k); + } else { + assert(0 && "Untemplated k means dimension"); + } +} + +// Given 'n' 'data' points and an initial guess of 'k' 'centroids' each of +// dimension 'dim', runs up to 'max_itr' iterations of k-means algorithm to get +// updated 'centroids' and the centroid 'indices' for elements in 'data'. +// Note: the output centroids are rounded off to nearest integers. 
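+// Usage sketch (hypothetical values): for dim == 2, 'data' holds n
+// interleaved two-component points (e.g. n (U, V) pairs for a chroma
+// palette) and 'centroids' holds k such pairs; on return, indices[i] is the
+// id of the centroid assigned to data point i.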
+static INLINE void av1_k_means(const int *data, int *centroids,
+                               uint8_t *indices, int n, int k, int dim,
+                               int max_itr) {
+  if (dim == 1) {
+    AV1_K_MEANS_RENAME(av1_k_means, 1)(data, centroids, indices, n, k,
+                                       max_itr);
+  } else if (dim == 2) {
+    AV1_K_MEANS_RENAME(av1_k_means, 2)(data, centroids, indices, n, k,
+                                       max_itr);
+  } else {
+    assert(0 && "Untemplated k means dimension");
+  }
+}
+
+// Given a list of centroids, returns the number of unique centroids 'k', and
+// puts these unique centroids in the first 'k' indices of the 'centroids'
+// array. Ideally, the centroids should be rounded to integers before calling
+// this method.
+int av1_remove_duplicates(int *centroids, int num_centroids);
+
+// Given a color cache and a set of base colors, find if each cache color is
+// present in the base colors, record the binary results in
+// "cache_color_found". Record the colors that are not in the color cache in
+// "out_cache_colors".
+int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
+                          const uint16_t *colors, int n_colors,
+                          uint8_t *cache_color_found, int *out_cache_colors);
+
+// Return the number of bits used to transmit each v palette color delta;
+// set "zero_count" to the number of zero deltas.
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+                                 int bit_depth, int *zero_count,
+                                 int *min_bits);
+
+// Return the rate cost for transmitting luma palette color values.
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
+                             uint16_t *color_cache, int n_cache,
+                             int bit_depth);
+
+// Return the rate cost for transmitting chroma palette color values.
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+                              uint16_t *color_cache, int n_cache,
+                              int bit_depth);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_PALETTE_H_
diff --git a/libs/libaom/src/av1/encoder/partition_cnn_weights.h b/libs/libaom/src/av1/encoder/partition_cnn_weights.h
new file mode 100644
index 000000000..504038c63
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/partition_cnn_weights.h
@@ -0,0 +1,2139 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_ +#define AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/cnn.h" +#include "av1/encoder/ml.h" + +#define CNN_BRANCH_0_OUT_CH 20 +#define CNN_BRANCH_1_OUT_CH 4 +#define CNN_BRANCH_2_OUT_CH 20 +#define CNN_BRANCH_3_OUT_CH 20 +#define CNN_TOT_OUT_CH \ + (((CNN_BRANCH_0_OUT_CH) + (CNN_BRANCH_1_OUT_CH) + (CNN_BRANCH_2_OUT_CH) + \ + (CNN_BRANCH_3_OUT_CH))) +#define CNN_BRANCH_0_OUT_SIZE (CNN_BRANCH_0_OUT_CH) +#define CNN_BRANCH_1_OUT_SIZE ((CNN_BRANCH_1_OUT_CH)*2 * 2) +#define CNN_BRANCH_2_OUT_SIZE ((CNN_BRANCH_2_OUT_CH)*4 * 4) +#define CNN_BRANCH_3_OUT_SIZE ((CNN_BRANCH_3_OUT_CH)*8 * 8) +#define CNN_OUT_BUF_SIZE \ + (((CNN_BRANCH_0_OUT_SIZE) + (CNN_BRANCH_1_OUT_SIZE) + \ + (CNN_BRANCH_2_OUT_SIZE) + (CNN_BRANCH_3_OUT_SIZE))) + +#define NUM_DNN_BRANCHES 4 +#define NUM_CNN_LAYERS 5 +#define BRANCH_0_NUM_DNN_LAYERS 2 +#define BRANCH_1_NUM_DNN_LAYERS 2 +#define BRANCH_2_NUM_DNN_LAYERS 2 +#define BRANCH_3_NUM_DNN_LAYERS 2 +#define CNN_LAYER_0_HEIGHT 5 +#define CNN_LAYER_0_WIDTH 5 +#define CNN_LAYER_0_IN_CH 1 +#define CNN_LAYER_0_OUT_CH 20 +#define CNN_LAYER_0_HORZ_STRIDE 4 +#define CNN_LAYER_0_VERT_STRIDE 4 +#define CNN_LAYER_1_HEIGHT 2 +#define CNN_LAYER_1_WIDTH 2 +#define CNN_LAYER_1_IN_CH 20 +#define CNN_LAYER_1_OUT_CH 20 +#define CNN_LAYER_1_HORZ_STRIDE 2 +#define CNN_LAYER_1_VERT_STRIDE 2 +#define CNN_LAYER_2_HEIGHT 2 +#define CNN_LAYER_2_WIDTH 2 +#define CNN_LAYER_2_IN_CH 20 +#define CNN_LAYER_2_OUT_CH 20 +#define CNN_LAYER_2_HORZ_STRIDE 2 +#define CNN_LAYER_2_VERT_STRIDE 2 +#define CNN_LAYER_3_HEIGHT 2 +#define CNN_LAYER_3_WIDTH 2 +#define CNN_LAYER_3_IN_CH 20 +#define CNN_LAYER_3_OUT_CH 4 +#define CNN_LAYER_3_HORZ_STRIDE 2 +#define CNN_LAYER_3_VERT_STRIDE 2 +#define CNN_LAYER_4_HEIGHT 2 +#define CNN_LAYER_4_WIDTH 2 +#define CNN_LAYER_4_IN_CH 4 +#define CNN_LAYER_4_OUT_CH 20 +#define CNN_LAYER_4_HORZ_STRIDE 2 +#define CNN_LAYER_4_VERT_STRIDE 2 +#define BRANCH_0_NUM_DNN_FEATURES 37 +#define BRANCH_0_NUM_DNN_LAYER_0_UNITS 16 +#define BRANCH_0_NUM_DNN_LAYER_1_UNITS 24 +#define BRANCH_0_NUM_LOGITS 1 +#define BRANCH_1_NUM_DNN_FEATURES 25 +#define BRANCH_1_NUM_DNN_LAYER_0_UNITS 16 +#define BRANCH_1_NUM_DNN_LAYER_1_UNITS 24 +#define BRANCH_1_NUM_LOGITS 1 +#define BRANCH_2_NUM_DNN_FEATURES 25 +#define BRANCH_2_NUM_DNN_LAYER_0_UNITS 16 +#define BRANCH_2_NUM_DNN_LAYER_1_UNITS 24 +#define BRANCH_2_NUM_LOGITS 1 +#define BRANCH_3_NUM_DNN_FEATURES 41 +#define BRANCH_3_NUM_DNN_LAYER_0_UNITS 16 +#define BRANCH_3_NUM_DNN_LAYER_1_UNITS 24 +#define BRANCH_3_NUM_LOGITS 1 + +static const float av1_intra_mode_cnn_partition_cnn_layer_0_kernel[] = { + 0.131894f, -0.593536f, -0.212935f, -0.00220011f, -0.396949f, + 0.287753f, -0.91875f, -0.0095057f, 0.804197f, -0.395239f, + 0.516604f, 1.16439f, 0.445784f, -0.163349f, 0.746488f, + -0.33891f, -0.562652f, 0.481403f, 0.755378f, -0.200753f, + 0.0784307f, 0.105657f, 0.0205673f, -0.524089f, -0.476146f, + -0.161206f, -0.65079f, 0.137474f, 0.28584f, 0.508768f, + -0.643386f, 0.227068f, -0.899507f, -0.413382f, 0.631466f, + 0.398203f, -0.544392f, 0.825155f, 0.671847f, -0.249779f, + 0.323121f, 0.125357f, -0.719564f, -0.0714854f, -0.168472f, + -0.213246f, -0.674525f, 0.330148f, -0.138414f, 0.20462f, + -0.518571f, -0.15091f, -0.605116f, -0.448732f, -0.475599f, + 0.738f, -0.328526f, 0.755035f, 0.969414f, -0.321039f, + -0.23068f, 0.408567f, -0.377813f, -0.273974f, 1.0684f, + 0.373968f, -0.450305f, 0.439258f, -0.381846f, -0.267331f, + 0.30613f, -0.39369f, 0.622438f, 
-0.52877f, -0.334991f, + 0.263193f, -0.402121f, 0.64142f, 0.793048f, -0.0231174f, + -0.68474f, -0.293338f, -0.737511f, -0.462654f, 0.474629f, + 0.141397f, -0.152529f, 0.345879f, -0.499991f, 0.00174024f, + 0.337387f, -0.131151f, 0.427385f, -0.457449f, -0.879614f, + -0.425908f, -0.263172f, 0.0344974f, 1.07861f, -0.00416662f, + 0.0208952f, 0.233905f, 0.765965f, 0.0423685f, -0.117554f, + -0.248237f, 0.49848f, -0.845131f, 0.223648f, -0.838709f, + 0.5834f, 0.309956f, -0.0625093f, -0.619619f, 0.918957f, + 0.358271f, -0.668459f, 0.518783f, -0.418963f, -0.206788f, + 0.364983f, -0.0396087f, 0.624309f, -0.138679f, -0.142453f, + 0.28309f, 0.895092f, -0.215713f, 0.439025f, 0.659333f, + -0.366025f, -0.413518f, 0.66657f, -0.265919f, 0.473471f, + -1.0729f, -0.526702f, 0.2838f, 0.367648f, -0.61242f, + 0.121656f, 0.547727f, -0.0636793f, -0.33006f, -0.306604f, + -0.00897731f, 0.688242f, 0.0944626f, 0.321508f, 0.0437392f, + -0.560035f, -0.768334f, 0.0571051f, -0.0427601f, -0.0437806f, + -0.816209f, -0.395829f, 0.293733f, 0.217645f, -0.646428f, + 0.132448f, -0.435806f, -0.0556814f, 0.0218857f, 0.348525f, + -0.17296f, 0.669057f, 0.638604f, -0.0995596f, -0.024099f, + -0.262332f, -0.548975f, 0.357894f, 0.43873f, -0.688234f, + -0.425519f, 0.190986f, -0.074778f, 0.294232f, -0.548969f, + -0.731198f, 0.03616f, -0.475969f, -0.306075f, -0.111929f, + -0.234146f, 0.612669f, 0.882254f, -0.622893f, 0.262431f, + 0.465242f, 0.245384f, -0.811016f, 0.501798f, -0.925875f, + 0.264373f, 0.307766f, -0.26872f, 0.113027f, -0.158875f, + 0.0711483f, 0.220275f, -0.0699022f, -0.0111303f, -0.435384f, + -0.720014f, 0.593484f, -0.964082f, 0.750925f, 0.252433f, + 0.964332f, -0.256904f, -0.421715f, -0.403851f, -0.188081f, + 0.694014f, -1.00183f, 0.798921f, 0.0603123f, 0.213814f, + 0.739642f, -0.0203375f, 0.72569f, -0.260224f, 0.0199516f, + -0.322451f, 0.318204f, -0.38392f, 0.740994f, -0.265215f, + -0.54541f, -0.51479f, -0.458397f, 0.519564f, 0.0509182f, + 0.0363331f, -0.293051f, 0.317714f, -0.327488f, -0.0840401f, + 0.318437f, -0.619403f, 0.641094f, -0.288435f, -0.260185f, + 0.181083f, -0.169294f, 0.292645f, 0.140405f, 0.0572885f, + -0.637428f, -0.102616f, 0.288955f, 0.817314f, 0.116855f, + 0.635532f, 0.283334f, -0.236391f, -0.305035f, -0.217365f, + -0.033021f, -0.455858f, 0.439922f, -0.104039f, 0.373376f, + 0.310659f, 0.388789f, 0.266341f, 0.0746306f, -0.428192f, + -0.202695f, -0.347625f, 0.00585741f, 0.366203f, 0.221413f, + 0.518856f, 0.57245f, -0.375071f, -0.2436f, -0.511895f, + -1.03708f, 0.681455f, -0.111544f, -0.183563f, 0.109729f, + -0.422646f, -0.529777f, 0.747473f, -0.270223f, -0.11435f, + 0.378931f, 0.420456f, 0.236331f, 0.49261f, -0.0666801f, + 0.0475846f, 0.906095f, -0.4146f, -0.020588f, -0.653285f, + 0.135335f, 0.543846f, -0.309061f, 0.11899f, -0.639168f, + -0.719994f, -0.219706f, -0.645631f, -0.829049f, -0.0114746f, + 0.834604f, 0.0378035f, 0.107957f, 0.546929f, -0.674395f, + -0.854817f, -1.1443f, 0.223413f, -0.326324f, 0.440971f, + 0.383582f, -0.495084f, 0.280091f, -0.53116f, 0.0333923f, + -0.354339f, -0.0449156f, -0.538896f, -0.753355f, 0.463995f, + 0.000969967f, -0.2832f, 0.587276f, 0.853094f, -0.481985f, + -0.138202f, 0.180989f, -0.349044f, -0.417534f, 0.455591f, + 0.287332f, 0.251496f, 0.381416f, 0.339632f, -0.0825727f, + 0.352739f, 0.161697f, -0.319764f, -0.258015f, 0.668833f, + -0.553303f, -0.578815f, -0.3758f, 0.289f, 0.247368f, + 0.00681103f, 0.421092f, -0.191033f, -0.425868f, -0.1239f, + 0.0540422f, -0.0856856f, 0.481168f, -0.0283741f, -0.196018f, + 0.230923f, -0.145288f, 0.52188f, 0.00628462f, -0.604556f, + 
-0.562879f, 0.319282f, 0.323799f, 0.453941f, 0.271129f, + -0.0520196f, 0.684571f, -0.391779f, -0.404614f, 0.134097f, + -0.825482f, 0.0913949f, 0.483543f, 0.159084f, 0.301637f, + 0.427013f, 0.196153f, 0.460091f, -0.730573f, -0.12278f, + 0.221665f, 0.674622f, -0.623363f, -0.0761517f, 0.637979f, + -0.468498f, 0.527276f, -0.596894f, -0.34675f, -0.251241f, + 0.418533f, -0.476696f, -0.901267f, -0.0088241f, -0.12421f, + -0.660316f, -0.0222117f, -0.470898f, -1.10739f, -0.441645f, + 0.39516f, -0.0117906f, 0.254122f, 0.00722599f, -1.00697f, + 0.48908f, -0.122287f, -0.378608f, -0.339145f, 0.682463f, + 0.305606f, 0.453628f, -0.49923f, -0.791388f, -0.202515f, + 0.23214f, -0.434209f, -0.778283f, -0.538015f, 0.145769f, + 0.446281f, -0.339329f, -0.198478f, -0.183717f, -0.855441f, + -0.105778f, 0.575067f, -0.18592f, -0.348094f, 0.740614f, + 0.041549f, -0.109663f, 0.0434492f, 0.245242f, -1.22192f, + 0.685896f, -0.208115f, -0.0616216f, -1.00552f, 0.31045f, + -0.184394f, 0.466705f, -0.0984364f, -0.506252f, 0.144874f, + 0.357038f, 0.675221f, -0.822171f, -0.52729f, 0.991212f, + 0.432422f, 0.383493f, -0.372395f, 0.35651f, -0.25369f, + 0.660208f, -0.117745f, -0.142433f, -0.724115f, -1.0035f, + -0.59178f, 0.563444f, -0.282531f, -0.599989f, 0.507424f, + -0.782875f, 0.755029f, -0.754962f, -0.617825f, 0.565984f, + -0.826878f, -0.456563f, 0.0212161f, 0.469867f, -0.144864f, + 0.225748f, -0.279029f, 0.21052f, -0.440183f, 0.936069f, + 0.170595f, 0.40966f, 0.452453f, -0.576006f, 1.50696f, + 0.649049f, 0.094957f, -0.167706f, -0.258342f, 0.59269f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_0_bias[] = { + 0.00475215f, -0.00362332f, -0.00317542f, 0.190083f, 0.0488147f, + -0.0268093f, -0.00432231f, 0.0112229f, 0.0626653f, -0.0025698f, + 0.0018675f, -0.00368139f, -0.00159125f, -0.00034354f, 0.311437f, + 0.000136436f, 0.0667295f, 0.0251274f, 0.00226553f, -0.000638344f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_1_kernel[] = { + 0.228403f, 0.241933f, 0.181079f, 0.101728f, 0.278455f, + -0.222078f, 0.387578f, 0.0847356f, -0.0737012f, 0.26518f, + -1.0817f, 0.0404161f, -0.805199f, 0.336576f, -0.541494f, + 0.246264f, 0.116597f, -0.756804f, -0.914136f, 0.410265f, + 0.413294f, 0.07873f, 0.450017f, -0.264346f, 0.549095f, + 1.03755f, -0.203542f, 1.61018f, 0.374131f, 0.402515f, + -2.36115f, 0.116427f, -0.172157f, -0.231482f, -0.905736f, + -0.0183059f, -0.575746f, 0.110348f, -0.268018f, 0.140399f, + 0.427196f, 0.0718528f, 0.247936f, -0.326661f, 0.150404f, + -0.659979f, -0.157148f, 0.00826241f, -0.679275f, -0.131564f, + -1.04822f, 1.06039f, -0.207898f, 0.510167f, 0.484233f, + 0.138972f, -0.0801639f, -0.184416f, 0.0741107f, -0.0299281f, + 0.112263f, 0.380071f, -0.0185269f, -0.0821188f, 0.918796f, + -0.576106f, 0.593007f, 0.479446f, 0.0440703f, 0.322379f, + 0.176783f, -0.147111f, 0.0953247f, -0.636377f, 0.0702104f, + 0.130979f, 0.293892f, -0.0112124f, -0.040347f, -0.16034f, + 0.3252f, -0.586802f, 0.601786f, -0.487148f, -0.458777f, + 0.463835f, 0.144942f, 0.00339965f, -0.779966f, 0.0585298f, + -1.20758f, -0.275614f, 0.292346f, -0.132781f, 0.337892f, + -0.357677f, 1.48511f, 0.172907f, -0.148668f, 0.243184f, + -0.503392f, -0.0791543f, 0.0265389f, -0.102267f, 0.213294f, + 0.0657801f, 0.156996f, 0.0891168f, 0.120805f, 0.261285f, + -0.343025f, -0.0792235f, -0.106415f, 0.133878f, -0.112981f, + -0.00151126f, -0.0643829f, 0.0458938f, -0.0452731f, -0.00147422f, + 0.1871f, -0.0208793f, 0.0752037f, 0.0794674f, 0.167666f, + 0.198028f, -0.361015f, -0.0661721f, -0.10672f, -0.0773641f, + -1.15856f, -0.516443f, 
-0.322702f, 0.15668f, 0.0075841f, + -0.157731f, 0.270926f, -0.241551f, 0.0169097f, -0.0263953f, + -0.303556f, -0.239237f, 0.117792f, -0.137871f, 0.122054f, + -0.587381f, 0.112938f, 0.0867262f, -0.27909f, -0.203622f, + -0.622195f, 0.42623f, 0.670704f, 0.190826f, -0.304979f, + -0.570075f, -0.240699f, 0.43744f, 0.632896f, -0.563846f, + -0.0160434f, -0.0709745f, 0.816662f, 0.269999f, -0.358734f, + 0.193644f, 1.19339f, -0.118223f, -0.363291f, -0.723616f, + -1.58825f, 0.0222856f, 0.769852f, 0.322713f, 0.0857619f, + -0.669756f, -1.08414f, 1.18593f, 0.486166f, -0.520646f, + 0.0861854f, -0.134197f, 0.258337f, 0.223345f, 0.697639f, + -0.57261f, 0.54031f, 0.892644f, 0.497572f, -0.287076f, + -1.95928f, -0.0568128f, -0.253335f, 0.00233392f, -0.192787f, + -0.115203f, -0.0975649f, 0.277954f, 0.000704534f, -0.315884f, + 0.309583f, 0.357458f, 0.0939298f, -0.072701f, 0.433045f, + -0.536938f, 0.534523f, 0.184585f, -0.0415175f, -0.120909f, + -1.2622f, 0.412449f, -0.114741f, 0.290453f, -0.441671f, + -0.0242497f, -0.20746f, 0.139019f, -0.422668f, -0.146732f, + -0.688828f, -0.00339426f, 0.04166f, 0.41755f, 0.405675f, + 0.562564f, 0.0216812f, 0.0271391f, 0.215227f, 0.328183f, + -1.6442f, -0.827838f, 0.115491f, 0.0951442f, -0.133779f, + -0.0482928f, 0.203177f, 0.322953f, -0.513259f, 0.0676788f, + -0.0877928f, 0.224448f, 0.451957f, 0.314243f, 0.307403f, + 0.35653f, 0.0286278f, 2.27554f, 0.569313f, -0.0488753f, + -2.48809f, 0.274555f, -0.248375f, -0.635634f, -0.187663f, + 0.1827f, -0.409634f, -0.0280568f, -0.207119f, -0.208192f, + -0.410268f, -0.017669f, 0.134856f, 0.434551f, 0.165201f, + 0.584608f, -0.389997f, -0.088713f, 0.118087f, 0.00210905f, + -1.07698f, -0.520967f, -0.198742f, 0.190255f, -0.162639f, + 0.0122759f, 0.460774f, -0.684633f, -0.149512f, 0.167556f, + -0.295034f, -0.0650964f, 0.0868653f, -0.691352f, 0.089795f, + 0.0620608f, 0.0531289f, 0.0124286f, 0.151921f, 1.51067f, + -0.10586f, -0.0311871f, 0.114706f, 0.0565205f, -0.159634f, + -0.423987f, -0.226896f, 0.0605352f, -0.36324f, -0.142205f, + -0.252249f, 0.0666312f, 0.316655f, 0.00687196f, 0.131079f, + -0.128281f, -0.293468f, 1.3327f, 0.542277f, -0.060088f, + -1.73475f, 0.0542297f, -0.227522f, -0.376004f, -0.147028f, + 0.0228252f, 0.0569538f, -0.0796497f, 0.0937596f, -0.0660153f, + -0.979219f, -0.377322f, 0.0523787f, 0.467299f, 0.0824278f, + 0.437147f, 0.263637f, 0.0325681f, 0.303581f, 0.353479f, + -0.142369f, -0.394797f, 0.597185f, 0.116482f, -0.0782593f, + 0.364539f, -0.30396f, 0.119016f, -0.0022429f, -0.044292f, + -0.0110531f, 0.233571f, 0.000975879f, 0.447332f, -0.0320396f, + 0.541609f, 0.14232f, 0.163905f, 0.848609f, 0.19954f, + -0.186591f, -0.44465f, -0.431672f, 0.159037f, -0.129977f, + -0.141778f, 0.246818f, -0.197539f, -0.70115f, 0.185449f, + 0.400274f, -0.0350744f, 0.239727f, -0.290504f, 0.0698443f, + -0.180374f, -0.759591f, -0.0569088f, -0.50246f, -0.0986616f, + -0.892114f, 0.306737f, -0.133937f, 0.285625f, 0.495471f, + -0.686222f, -0.168647f, -0.0926158f, 0.351772f, -0.0215394f, + 0.361223f, 0.0657142f, 0.268229f, -0.616299f, 0.0564718f, + -0.294013f, -0.588019f, 0.0234195f, -0.426863f, -0.511253f, + -0.72177f, 0.420903f, 0.0987506f, 0.309368f, 0.523532f, + 1.06073f, -0.33028f, 0.0818142f, 0.0130354f, 0.0180882f, + 0.0316898f, -0.416614f, -0.566344f, -0.163083f, 0.285085f, + -0.0534352f, 0.385496f, 0.151068f, -0.208295f, -0.175648f, + 0.0476705f, 0.190428f, -0.643391f, 0.484004f, -0.421836f, + -0.19829f, -0.227574f, -0.0869152f, 1.09881f, 0.345129f, + -0.236732f, -0.381935f, -1.46271f, 0.465914f, 0.610375f, + 0.689968f, -0.688546f, 1.95033f, 
0.420946f, 0.0282428f, + 0.147823f, 0.669393f, 0.429085f, -0.328385f, -0.150439f, + -0.419097f, -0.828102f, 0.248743f, 0.24644f, 0.0186131f, + -0.384319f, -0.126294f, -0.417067f, 0.271483f, -0.0128456f, + -0.881351f, 0.152581f, 0.185584f, -0.745827f, 0.0551359f, + 0.127083f, 0.936983f, -0.0225341f, 0.575861f, 0.767417f, + -0.140867f, -0.762518f, 0.422446f, -0.0611973f, 0.0515641f, + -0.144168f, -0.298882f, 0.308461f, 0.0208704f, 0.213872f, + -0.258708f, 1.13186f, 0.314083f, -0.347536f, -0.137768f, + 0.653953f, -0.217883f, -0.56112f, -0.864661f, 0.488836f, + 0.268133f, -0.548664f, -0.765226f, 0.117082f, 0.326798f, + -0.678246f, 0.477785f, -1.27584f, 0.198912f, -0.710395f, + 1.39096f, -0.411577f, -0.55119f, 0.51092f, -0.295023f, + 0.245983f, -0.0957192f, -0.312001f, 0.0175991f, 0.524423f, + -0.126379f, 0.124687f, -1.53945f, -0.342856f, 0.514072f, + 0.400884f, -0.00581101f, -0.219327f, 0.0977873f, 0.337551f, + -0.058603f, 0.20034f, 0.0429945f, 0.676803f, -0.273585f, + -0.173435f, -0.581596f, 0.226263f, -0.0946223f, -0.060088f, + -0.0100809f, -0.022242f, -0.22218f, -0.030463f, -0.141389f, + -0.190757f, -0.00526518f, -0.77519f, -0.0825695f, 0.308403f, + 0.262792f, -0.601842f, 0.0783697f, 0.197527f, 0.0714048f, + 0.0392629f, -0.388628f, 0.172541f, -0.0222009f, 0.252096f, + 0.0728652f, 0.173632f, 0.192914f, -0.00969965f, 0.0530136f, + -0.00765759f, 0.440234f, -0.0943323f, 0.112319f, 0.0878737f, + -0.739021f, 0.385305f, 0.133334f, -0.396697f, 0.177818f, + -0.0712558f, 0.516923f, 0.102174f, 0.17158f, -0.211068f, + 0.295795f, -0.36198f, 0.179087f, -0.845744f, -0.242514f, + -1.49073f, 0.272702f, 0.59011f, -0.408184f, -0.0731313f, + 0.234643f, 0.589642f, -0.100778f, 0.516921f, -0.700154f, + 0.316432f, 0.36117f, 0.0380282f, 0.480101f, -0.0975487f, + 0.941452f, 0.231705f, -0.151182f, -1.20305f, 0.28255f, + -0.0427662f, -0.00717175f, -0.842085f, -0.357376f, 0.545581f, + -0.290714f, 0.741498f, 1.00377f, 0.483864f, 0.150405f, + 0.0834512f, -0.10031f, 0.424054f, -0.0223491f, -0.0696701f, + -0.134479f, -0.747227f, 0.422208f, 0.123858f, -0.392624f, + -0.0299847f, -0.0376142f, -0.392536f, -0.0343114f, 0.298224f, + -0.375899f, 0.693119f, 0.27909f, -0.53463f, 0.105459f, + -0.0267383f, 0.5094f, -0.411557f, 0.451749f, -0.348479f, + -0.0497316f, -0.353913f, -0.14858f, 0.241838f, 0.331039f, + 0.756607f, -0.0701661f, -0.827264f, -0.367772f, 0.447201f, + 0.834616f, -0.00497265f, -0.0557285f, 0.055088f, -0.300115f, + -0.143833f, -1.07838f, -0.106896f, 0.16945f, 0.0170324f, + 0.108754f, 0.335893f, -0.0923708f, 0.450209f, -0.0713308f, + -0.0233037f, -0.0129902f, -1.40664f, -0.0996218f, 0.711236f, + 0.400716f, 0.227871f, 2.01499f, 0.572926f, 0.135673f, + -0.0340458f, -0.316736f, 0.24257f, -0.700768f, -0.194985f, + 0.312011f, -0.179599f, 0.128114f, 0.0725977f, -0.193816f, + 0.352143f, 0.070641f, -0.467808f, -0.399047f, 0.10136f, + 0.671574f, -0.553965f, 0.105729f, 0.210383f, 0.065048f, + 0.248198f, -0.731674f, 0.588725f, -0.308237f, 0.24511f, + 0.00608906f, 0.170906f, 0.246175f, 0.149521f, 0.106071f, + 0.160246f, 0.118487f, -0.104102f, 0.872823f, 0.227478f, + 0.0182631f, -0.115083f, 0.0142445f, 0.307947f, -0.884925f, + 0.0767105f, 0.0414042f, -0.448021f, -0.0400193f, -0.0765448f, + -0.411931f, -0.199624f, 0.333371f, 0.17267f, -0.0431816f, + 0.190826f, -0.0758961f, -1.02831f, -0.0414525f, 0.605374f, + -0.0188181f, -0.2207f, 1.30004f, -0.207005f, -0.0333617f, + 0.227145f, 0.105059f, -0.0473393f, -0.448752f, -0.0342152f, + -0.0244812f, 0.220329f, 0.0313591f, -0.0902074f, -0.0731945f, + 0.88488f, 0.306306f, -0.275613f, 
-0.476372f, 0.00678104f, + 0.442029f, 0.122049f, 0.118042f, 0.270527f, -0.462538f, + 0.0665021f, -0.260255f, 0.209182f, 0.162321f, 0.0629934f, + -0.244896f, -0.078863f, 0.655585f, -0.0506617f, -0.487128f, + 0.118765f, -0.34408f, 0.0930615f, -0.365632f, -0.0670776f, + 0.44428f, 0.286734f, 0.146608f, 0.686757f, -0.0738428f, + -0.10034f, -0.928438f, -0.172601f, -0.0959575f, -0.010532f, + 0.277549f, 0.28773f, -0.318883f, 0.71254f, 0.273593f, + -0.382845f, -0.0104587f, -0.647769f, 0.25541f, 0.194625f, + 0.265197f, -0.750938f, -0.0650515f, -0.567092f, 0.070613f, + 0.209531f, 0.429699f, 0.130676f, 0.514914f, 0.615778f, + 0.594535f, -0.0878778f, 0.40593f, -0.303383f, 0.0907863f, + -0.320068f, 0.0137162f, -0.303424f, 0.594207f, -0.236524f, + -0.692627f, -0.990063f, -0.0262934f, 0.222375f, 0.503412f, + 0.220224f, 0.676871f, -0.150996f, 0.379777f, 0.841339f, + -1.05981f, 0.259943f, -0.781745f, 0.0346478f, 0.115791f, + -0.25171f, -0.00872158f, 0.395561f, -0.0849893f, -1.20134f, + -0.313938f, 0.789542f, 0.159606f, -0.782095f, -0.229754f, + 0.266687f, -0.0354282f, -0.3041f, 0.0338618f, -0.390001f, + -0.28362f, -0.436144f, 0.777351f, 0.855321f, 0.653338f, + -0.0382912f, -0.204577f, 1.13828f, 0.220395f, -4.60853f, + 0.575694f, 0.0453189f, 1.76567f, 0.466151f, -0.366109f, + 0.594717f, 0.278891f, -0.750676f, -0.332739f, -0.942304f, + 0.280363f, 0.284561f, 0.209326f, 0.238347f, -0.0124311f, + -0.439463f, -0.036186f, 0.165997f, 0.374717f, -0.481148f, + -0.626417f, 0.0223598f, 0.039337f, -0.379918f, 0.211046f, + 0.0795812f, 0.863355f, -0.341448f, 0.421494f, 0.410477f, + -0.117025f, -0.511108f, 0.565193f, -0.063582f, -0.031349f, + -0.0750174f, 0.387941f, 0.541266f, 0.0919753f, 1.05041f, + 0.263004f, 0.289006f, 0.0439694f, -1.22439f, -0.247832f, + 0.260967f, 0.355794f, 0.599694f, -0.69418f, 0.372805f, + -0.161731f, 0.0720574f, 0.0394657f, 0.122772f, -0.458067f, + -0.370826f, -1.34495e-05f, -0.373404f, 0.0245539f, -2.3472f, + -2.61448f, 0.264794f, 0.0601582f, -0.968597f, -0.196022f, + -0.727067f, 0.167346f, 0.517478f, 0.0035377f, 0.777219f, + 0.553128f, 0.727211f, 0.606202f, -0.495604f, 2.41445f, + 0.465214f, -0.0443004f, 0.142972f, 0.141459f, -0.17771f, + 0.0156117f, 0.169264f, 0.0428022f, -0.164827f, -0.240632f, + 0.215289f, -0.213134f, -0.184163f, 0.0161321f, -0.20025f, + -0.0311616f, 0.00292108f, -0.0131921f, 0.0437664f, -0.104817f, + -0.131906f, 0.0822771f, 0.237307f, -0.347567f, -1.2485f, + 0.253616f, -0.442217f, 0.0514077f, 0.337561f, -0.0147658f, + -0.132888f, -0.643821f, 0.445573f, -0.0146213f, 0.235511f, + 0.53583f, -0.640644f, 0.0280044f, 0.00628834f, 0.143885f, + 0.380077f, -0.542342f, 0.363101f, 0.0647334f, -0.476556f, + -0.822676f, 0.482454f, -0.0467326f, -0.253083f, 0.116726f, + 0.317333f, 0.548131f, -0.234667f, 0.579923f, -0.420683f, + 0.595613f, -0.279864f, -0.753204f, -0.516844f, -0.436574f, + -0.120682f, -0.278939f, 0.752202f, -0.183443f, -0.14632f, + -0.0344068f, 0.127638f, -0.225245f, 0.489391f, 0.145082f, + -0.73672f, 0.980065f, -0.0367412f, 0.40632f, -0.802509f, + 0.356897f, 0.366172f, 1.23858f, -0.978381f, -0.684924f, + -0.0870693f, -0.353628f, 0.695788f, -0.244593f, -1.8897f, + -0.257803f, 0.686937f, 0.405155f, -0.125696f, 0.258075f, + 0.570584f, -0.439481f, -0.59798f, 0.0745711f, -0.235162f, + 0.133048f, -0.243033f, 0.0415527f, -0.00118735f, 0.00980514f, + -0.297429f, -0.144983f, 0.463093f, 0.0965441f, -0.338508f, + -0.651077f, 0.817577f, -0.0364773f, -0.388465f, 0.113288f, + 0.231198f, 0.316208f, -0.592201f, 0.530376f, -0.431434f, + 0.0200985f, 0.104303f, -0.130705f, 0.4374f, 0.362342f, 
+ 0.70641f, 0.20037f, 0.309128f, -0.484535f, -1.18469f, + 0.513893f, 0.201236f, -0.022396f, 0.179638f, -0.361289f, + -0.0794946f, -1.04704f, -0.0281103f, 0.0494822f, 0.00196415f, + 0.0625478f, -0.229033f, 0.12018f, 0.542629f, -0.222423f, + -0.0123321f, -0.0988525f, 0.773192f, -0.192218f, -3.19156f, + 0.300606f, 0.462751f, 2.2968f, 0.137182f, 0.132539f, + 0.165884f, 0.128818f, -0.155856f, -0.558538f, -0.231742f, + -0.244377f, -0.442397f, 0.250947f, 0.0850658f, -0.00820139f, + 0.391284f, 0.17453f, 0.306003f, -0.531499f, -0.624451f, + 0.564584f, -0.343953f, -0.0278713f, 0.212664f, -0.135969f, + -0.0179867f, -0.687887f, 0.371065f, -0.0537029f, 0.0499509f, + 0.0980684f, -0.0438569f, 0.186731f, 0.182105f, 0.172254f, + -0.149446f, -0.0247637f, 0.148098f, 1.20772f, -0.136664f, + 0.00983112f, 0.0181381f, -0.0147549f, -0.0846561f, -0.827022f, + 0.00207177f, 0.0478215f, 0.0652549f, 0.0898219f, -0.0224959f, + -0.0274246f, 0.0166498f, -0.0211715f, -0.502932f, 0.0961452f, + 0.251206f, -0.0623632f, 0.741566f, 0.0078449f, -2.99162f, + -0.187244f, 0.0743479f, 1.46425f, 0.0737923f, 0.0133544f, + 0.20922f, -0.178671f, -0.0528492f, -0.526717f, 0.0282125f, + -0.0363201f, 0.37406f, -0.303658f, -0.066803f, 0.132237f, + 0.962057f, -0.399733f, 0.191765f, -0.452606f, -0.348732f, + 0.444939f, 0.153025f, 0.0796317f, 0.265985f, -0.319638f, + 0.0278161f, -0.333734f, 0.226108f, 0.147895f, -0.124066f, + -0.37306f, 0.19541f, 0.200175f, -0.0593244f, 0.0333887f, + -0.0284278f, 0.462491f, 0.0686487f, -0.332435f, -0.437166f, + 0.302795f, 0.100542f, 0.0265019f, 0.767212f, -0.140621f, + 0.11558f, -0.70584f, -0.00017415f, 0.00793092f, -0.0490901f, + 0.0598338f, 0.484876f, -0.13025f, 0.660349f, 0.147503f, + -0.462766f, 0.0843824f, 0.218493f, 0.310921f, -0.162284f, + 0.210404f, -0.788799f, 0.0698512f, -0.484799f, 0.0311505f, + -0.308243f, 0.417298f, 0.0593723f, 0.208908f, 0.451437f, + 0.354546f, -0.0700888f, -0.281678f, -0.311177f, 0.00914652f, + -0.372084f, 0.135036f, 0.185393f, 0.461347f, -0.114241f, + -0.402347f, -0.692327f, 0.0376155f, -0.200267f, 0.565963f, + -0.0627442f, 0.429677f, 0.170514f, 0.350565f, 0.699528f, + -0.948126f, -0.364205f, 0.348878f, -0.137832f, -0.0791649f, + -0.0462295f, -0.255078f, -0.398509f, 0.136783f, -0.0164628f, + -0.555472f, 0.690396f, 0.147715f, 0.000523095f, 0.14874f, + 0.524804f, 0.162974f, 0.797599f, 0.277473f, -0.500696f, + 0.189917f, -0.333309f, 0.00613646f, -1.07817f, 0.0470502f, + 0.210766f, 0.159768f, -0.447774f, -0.252968f, -1.72739f, + 0.0658259f, -0.448747f, 2.26511f, 0.349651f, 0.157232f, + 0.956842f, 0.856676f, 0.149227f, -0.626957f, -0.566771f, + -0.0980846f, 0.351668f, -0.362741f, -0.0272282f, -0.113632f, + 0.366015f, -0.00790003f, -0.458632f, -0.31157f, -0.182257f, + -0.953975f, 0.0583582f, 0.164721f, -0.900107f, -0.115542f, + 0.0654192f, 0.99056f, -0.247976f, 0.48254f, 0.670196f, + 0.098585f, -0.212855f, 0.310072f, 0.0894616f, 0.151944f, + 0.119629f, -0.26735f, 0.162257f, -0.0305818f, 0.681526f, + -0.229847f, 1.01556f, 0.29132f, 0.740113f, 0.0703937f, + 0.537892f, -0.18653f, -0.0252359f, -0.420014f, 0.197631f, + -0.176629f, 0.00674754f, 0.301288f, -0.162816f, 0.636235f, + -0.341362f, 0.197296f, -0.589747f, -0.749363f, -0.277197f, + -1.27291f, -0.0857908f, -0.147591f, -0.0956297f, -0.109097f, + 0.0717554f, 0.359078f, 0.301457f, 0.486934f, -0.260955f, + -0.126821f, 1.55756f, 0.477469f, -1.45363f, 1.42198f, + -0.360847f, -0.0211924f, -0.0184957f, -0.110706f, -0.152136f, + 0.104703f, 0.267615f, 0.127392f, 0.172996f, 0.258326f, + 0.268578f, -0.431123f, -0.114419f, 0.0101172f, 
-0.195671f, + 0.0792025f, -0.151505f, -0.064077f, 0.0479777f, -0.141882f, + 0.121492f, -0.139132f, -0.348252f, 0.341043f, -0.565367f, + -0.0791259f, -0.781086f, 0.0140045f, 0.571094f, -0.00875077f, + 0.217132f, -0.202345f, 0.157213f, 0.228445f, 0.366612f, + -0.529989f, 0.42241f, -0.540538f, -0.0425556f, -0.207774f, + -0.0663941f, 0.37836f, -0.0650245f, -0.0828694f, -0.0835478f, + -0.795512f, 0.470268f, 0.1551f, -0.69017f, -0.116735f, + 0.157614f, 0.555973f, -0.293311f, 0.245428f, -0.0853701f, + -0.449278f, -0.0551647f, -0.00137429f, 0.709439f, -0.456796f, + 0.132062f, -0.0449484f, -0.308599f, 0.180608f, -2.24196f, + 0.421478f, -0.640946f, -0.460397f, -0.920628f, -0.184949f, + -0.0416982f, 0.6484f, -0.22806f, 0.412229f, -0.468079f, + -0.72372f, -0.347698f, -1.3899f, 0.631876f, 0.0611046f, + 0.0294258f, -0.128091f, -0.205615f, 0.355348f, -0.267725f, + -0.644835f, 0.435879f, 0.517477f, -0.338123f, -0.157764f, + 0.32762f, -0.166454f, 0.221007f, -0.0438278f, -0.0777725f, + 0.10986f, 0.941545f, -0.542284f, -0.172312f, -0.256597f, + -0.0181391f, 0.220623f, -0.432456f, 0.0164074f, 0.250226f, + -0.522576f, 0.783109f, 0.198703f, -0.784554f, -0.0929628f, + 0.326861f, 0.470293f, 0.442684f, 0.271879f, -0.108256f, + 0.0483558f, -0.403151f, 0.36183f, -0.268186f, 0.270851f, + -0.696826f, -0.166037f, -0.354658f, 0.405977f, -0.473447f, + 0.649689f, -0.0863114f, -0.147319f, 0.0869966f, 0.319792f, + 0.493026f, -1.07456f, 0.354751f, 0.114605f, -0.120647f, + -0.238315f, 0.0290955f, -0.355299f, -0.45381f, 0.0812865f, + -0.0180434f, 0.00861318f, -0.892943f, -0.0127801f, -1.66398f, + 0.290505f, 0.126832f, 2.08173f, -0.0454847f, -0.162481f, + 1.07426f, 0.228566f, 0.280528f, -0.537625f, -0.175288f, + -0.118012f, 0.649114f, -0.349926f, -0.0189864f, -0.30934f, + -0.363178f, -0.119822f, -0.22656f, 0.484513f, -0.173269f, + 0.41987f, -0.448517f, -0.0950466f, 0.482443f, 0.061558f, + 0.4219f, -0.536388f, 0.0781972f, 0.212489f, 0.104229f, + -0.0792804f, 0.402066f, -0.676313f, -0.2272f, -0.16379f, + 0.260145f, -0.0504658f, -0.0826579f, -1.37749f, 0.00790747f, + 0.0841031f, -0.0671308f, -0.00301736f, -0.386206f, 0.190311f, + 0.0702639f, 0.0643968f, 0.133741f, -0.0141555f, -0.0365324f, + 0.87028f, 0.207894f, -0.421266f, 0.689256f, 0.145037f, + -0.270796f, 0.212604f, -0.345326f, 0.0074631f, -1.72379f, + 0.0672097f, -0.273153f, 1.30503f, -1.01324f, 0.00284696f, + 0.851459f, 0.176847f, 0.30948f, -0.57144f, -0.0596695f, + -0.111189f, 0.130361f, -0.298286f, 0.0567591f, -0.0885215f, + -0.847601f, 0.238624f, -0.162391f, 0.452357f, -0.0192713f, + 0.226661f, 0.0762922f, -0.0894055f, 0.332702f, 0.424484f, + 0.0443207f, -0.162345f, -0.601036f, 0.280527f, -0.137362f, + 0.266345f, 0.729438f, -0.887182f, 0.152943f, -0.573548f, + -0.0201383f, -0.56521f, 0.033582f, 0.300284f, -0.144472f, + 0.633026f, 0.30866f, 0.0653073f, 0.316901f, 0.0721326f, + 0.192252f, -0.833162f, 0.194292f, -0.08663f, -0.189401f, + -0.178242f, 0.111488f, 0.522487f, -0.65497f, 0.457049f, + 0.390654f, 0.0522936f, -0.39712f, -0.293717f, -0.374656f, + -0.118916f, -0.853076f, -0.0829578f, -0.17335f, -0.0218694f, + 0.367968f, 0.478469f, 0.0913813f, 0.519251f, 0.803526f, + -0.272516f, -0.341329f, 0.0897285f, 0.247653f, 0.000898686f, + 0.313196f, 0.000587979f, -0.314189f, -0.449439f, -0.0291611f, + -0.356287f, -0.722904f, -0.0480958f, -0.523758f, -0.576146f, + 0.133754f, 0.616921f, -0.085494f, 0.487487f, 0.745129f, + 0.993267f, 0.256555f, 0.0822743f, 0.0411971f, 0.139388f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_1_bias[] = { + 0.00447951f, 0.0202534f, 
0.00970833f, -0.00460874f, 0.0942288f, + -0.0534704f, 0.00829869f, -0.0255174f, -0.0809143f, 0.00169117f, + 0.0177427f, 0.0259387f, 0.0291077f, -0.0267599f, 0.100275f, + -0.00389366f, 0.0315499f, 0.0265846f, -0.000206604f, 0.0302221f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_2_kernel[] = { + 0.153048f, 0.0725422f, 0.068901f, -0.475608f, 0.0736706f, + -0.134076f, 0.229289f, 0.0217921f, 0.0449205f, -1.00002f, + 0.149133f, 0.0497258f, 0.118988f, 0.0741764f, 0.0385486f, + 0.225181f, 0.012966f, 0.155593f, -3.07175f, -0.0641051f, + 0.09161f, 0.0259005f, -0.209998f, -0.420298f, 0.0587126f, + 0.00352744f, 0.0451313f, -0.049384f, 0.11516f, 0.083135f, + 0.103675f, -0.0185604f, 0.0623248f, -0.0993726f, 0.0448522f, + 0.0134017f, -0.294776f, -0.251924f, 0.0712635f, -0.0764298f, + -0.463766f, -0.0295011f, -0.579168f, 0.573853f, -0.00596607f, + 0.0237762f, -0.0500104f, -0.0969275f, 0.155573f, 0.0515382f, + -0.178454f, -0.154008f, -0.278299f, -0.166421f, 0.0149533f, + -0.0700236f, 0.239287f, -1.19545f, -0.0744625f, 0.143037f, + 0.141874f, 0.086302f, 0.0838633f, -0.454179f, 0.120308f, + -0.0896718f, 0.254909f, 0.0714462f, 0.00471098f, -0.869494f, + 0.209407f, 0.138285f, 0.0816641f, 0.0666266f, 0.0848555f, + 0.173313f, 0.0695633f, 0.285667f, -3.15384f, 0.00140275f, + -0.969824f, -0.0318689f, -0.00487396f, 0.412541f, 0.0263593f, + -0.249824f, 0.0897776f, 0.0208836f, -0.0982745f, -0.16049f, + -0.12719f, -0.186166f, 0.102338f, 0.273931f, -0.0886306f, + -0.19513f, -0.0135712f, -0.194127f, -0.0834291f, 0.426623f, + -0.0705446f, 0.0327476f, 0.0800862f, 0.478757f, -0.00849111f, + -0.554911f, -0.0489312f, -0.184029f, -0.227428f, 0.159989f, + -0.0677731f, -0.0901436f, 0.00308696f, -0.352243f, 0.278715f, + 0.306374f, -0.0772054f, -0.0122733f, -0.0693457f, 0.074365f, + -0.267458f, -0.123612f, -0.495954f, 0.552604f, -0.103951f, + -0.121771f, 0.179966f, -0.377947f, -1.35472f, 0.153294f, + -0.445284f, -0.089813f, -0.00529807f, 0.254047f, -0.0378426f, + 0.114597f, -0.143052f, 0.0815258f, -0.10528f, 0.00833533f, + -0.117508f, 0.129052f, 0.0706719f, -1.39506f, 0.0124731f, + 0.109831f, -0.0744156f, 0.181612f, 0.0787894f, 0.0293352f, + 0.494929f, 0.00997207f, -0.585882f, -0.0844138f, -0.00864134f, + -0.109943f, 0.0713114f, 0.14883f, 0.0610554f, 0.204145f, + -0.00390313f, 0.0184763f, -0.111387f, 0.175442f, -0.0840215f, + -0.178785f, -0.0693612f, -0.254507f, -0.191549f, 0.501561f, + -0.0858995f, -0.164921f, 0.0250706f, -0.0916282f, 0.247085f, + 0.13877f, -0.419487f, -0.295065f, -0.213812f, -0.10362f, + 0.138243f, 0.086985f, 0.113633f, -0.459273f, 0.12388f, + -0.139296f, 0.253792f, 0.0421624f, 0.0665065f, -0.977282f, + 0.199927f, 0.115194f, 0.099045f, 0.0534806f, 0.089283f, + 0.0815367f, 0.150901f, 0.253458f, -3.24825f, -0.0118163f, + -0.544565f, 0.0201825f, -0.0682201f, 0.759028f, 0.00479696f, + -0.00625607f, 0.058007f, -0.0811189f, -0.114617f, -0.0998578f, + 0.133312f, 0.0246256f, -0.0167416f, 0.196118f, 0.109823f, + 0.109489f, 0.474682f, -0.763475f, 0.0818745f, 0.0798777f, + -0.0994905f, -0.00138143f, -0.108563f, 0.697289f, -0.103702f, + -0.306085f, -0.0996705f, -0.142618f, -0.130989f, 0.0813303f, + -0.0909275f, -0.10786f, -0.0280431f, 0.206877f, -1.70798f, + 0.525568f, 0.559891f, -0.166132f, -0.227574f, -0.150955f, + 0.0849226f, 0.00497342f, -0.168667f, -0.282575f, 0.00537805f, + -0.0185572f, 0.0607167f, -0.0534948f, -0.0215776f, -0.14825f, + -0.0164577f, -0.0611978f, 0.0347562f, 0.286917f, 0.226598f, + 0.149497f, -0.478101f, -0.246006f, 0.0663239f, -0.121728f, + 0.267087f, 0.0802681f, -0.184741f, 
-0.558267f, 0.0437066f, + 0.13816f, -0.0710939f, 0.0725697f, 0.339857f, 0.161069f, + 0.304871f, 0.108138f, 0.193396f, 0.0891607f, -0.0701939f, + -0.182038f, -0.451873f, -0.233883f, 0.0444747f, 0.0436545f, + -0.245894f, -0.0721136f, 0.309013f, 0.278996f, 0.0259377f, + 0.0278116f, 0.0686773f, -0.271237f, 0.235082f, -0.0778285f, + -0.456541f, -0.109303f, -0.074565f, -0.407301f, -0.162191f, + -0.801819f, 0.372435f, -0.559083f, -0.039189f, 0.0477762f, + 0.0875363f, 0.0699926f, 0.116552f, -0.308217f, 0.0341607f, + -0.14202f, 0.135517f, 0.0316971f, 0.153297f, -0.759722f, + 0.12849f, 0.114229f, 0.0814893f, 0.275402f, 0.0403976f, + 0.0357503f, 0.212295f, 0.0673998f, -2.59822f, -0.0475021f, + -0.0594725f, 0.0659163f, 0.0469717f, -0.0370461f, -0.12863f, + -0.381743f, -0.0445055f, -0.106843f, -0.0880648f, 0.00591106f, + 0.235514f, -0.165162f, -0.0696645f, 0.115374f, 0.245558f, + 0.192049f, -0.388628f, -0.48291f, 0.154313f, -0.160207f, + 0.125928f, 0.122039f, 0.0713794f, -0.161244f, 0.128082f, + -0.234659f, 0.0680219f, 0.0597933f, 0.208421f, -0.163623f, + 0.196873f, 0.156603f, 0.184179f, -0.278331f, -0.0481286f, + 0.0828152f, 0.247004f, 0.0915582f, -0.0906229f, -0.20376f, + 0.136593f, 0.0740336f, -0.0134935f, -0.355048f, 0.0898485f, + -0.0962068f, 0.185804f, -0.0145596f, 0.0966589f, -0.515784f, + 0.121602f, 0.0320428f, 0.11093f, -0.0559421f, 0.0355484f, + 0.192128f, 0.0500888f, 0.133641f, -1.73282f, -0.0624599f, + 0.122524f, 0.0757292f, -0.0974648f, -0.193649f, 0.0561096f, + 0.0159959f, 0.0334472f, -0.0168832f, -0.12386f, -0.112419f, + 0.19552f, 0.0308502f, 0.0537643f, -0.0181012f, 0.0392183f, + 0.0461833f, -0.52623f, -0.238252f, 0.0821762f, -0.212384f, + 0.112901f, 0.096063f, 0.0540225f, 0.0773583f, 0.143045f, + -0.101551f, 0.282418f, 0.0176749f, -0.00244542f, -0.780154f, + -0.254428f, -5.82215f, 0.106638f, 0.11746f, 0.0486823f, + 0.164562f, 0.0303006f, 0.229614f, -2.41845f, -0.117122f, + 0.0451654f, 0.0237383f, -0.208731f, 0.0721137f, 0.0761163f, + -0.0569416f, -0.00830511f, -0.045256f, 0.14535f, -0.0189222f, + -0.283363f, -3.15502f, 0.0971161f, -0.035913f, 0.00813281f, + 0.0187974f, -0.361573f, -0.302067f, 0.118014f, -0.0956148f, + -0.596567f, 0.0105443f, -0.49019f, -0.0801959f, 0.0322344f, + -0.0280032f, 0.0555038f, -0.111495f, -0.0994456f, 0.0178021f, + 0.0358362f, 1.07063f, -0.0833138f, 0.0621246f, 0.0637157f, + 0.0999207f, 0.191975f, -1.2811f, 0.0341681f, 0.14818f, + 0.0957259f, 0.109909f, 0.0566115f, 0.0585633f, 0.179939f, + -0.104372f, 0.309091f, 0.0172941f, 0.0243182f, -0.935252f, + -0.296257f, -5.83634f, 0.0899249f, 0.455347f, 0.129505f, + 0.220212f, 0.0214801f, 0.284802f, -2.94585f, -0.0805413f, + -1.01819f, 0.00534034f, -0.057203f, 0.0869331f, 0.0207575f, + -0.124479f, -0.0465806f, 0.0894252f, 0.32203f, 0.0858497f, + 0.25178f, 0.0932205f, 0.0888455f, 0.233153f, -0.446398f, + -0.00791233f, 0.0909603f, -0.0904397f, 0.131835f, 0.475597f, + -0.1236f, 0.0231622f, 0.138602f, -0.097731f, -0.0282484f, + -0.549095f, -0.0457428f, -0.0895407f, -0.293965f, 0.166872f, + 0.46719f, 0.236254f, 0.0615991f, 0.499236f, 0.540366f, + 0.402035f, 0.0606324f, -0.0499928f, -0.0155198f, 0.0994403f, + -0.14773f, -0.183433f, -0.612093f, -0.334201f, -0.110877f, + -0.143441f, 0.05815f, -0.318586f, -0.344235f, 0.199593f, + 0.51109f, -0.252281f, -0.028834f, 0.0615421f, 0.0623699f, + 0.210745f, -0.236448f, 0.166279f, 0.127516f, -0.0971157f, + -0.204389f, 0.208112f, 0.0377023f, 0.271837f, -0.00859528f, + 0.0797081f, -0.00582115f, 0.140018f, -0.384865f, -0.0853243f, + -0.586727f, -0.0664489f, -0.631436f, -0.245828f, 
-0.0647894f, + -0.171912f, -0.0801706f, 0.0731614f, -0.11725f, 0.281478f, + -0.03047f, 0.0363488f, -0.0481651f, -0.326329f, -0.0155898f, + -0.428316f, -0.0989367f, -0.271902f, -0.00263837f, 0.366168f, + 0.325989f, 0.165463f, 0.0668512f, -0.142202f, 0.419992f, + 0.164971f, -0.515479f, -0.187585f, -0.151783f, -0.0682468f, + 0.0910191f, 0.117086f, 0.106579f, 0.0961825f, 0.162148f, + -0.129645f, 0.301039f, 0.000320343f, -0.0558097f, -0.844295f, + -0.218919f, -5.7571f, 0.0982612f, 0.238955f, 0.0703565f, + 0.0969388f, 0.107202f, 0.321585f, -3.00594f, -0.058755f, + -0.620004f, 0.052114f, 0.128423f, -0.177673f, -0.00341509f, + -0.146756f, -0.0414309f, -0.0893262f, -0.0584779f, -0.129552f, + 0.127629f, 0.13275f, -0.0973342f, -0.215617f, 0.0724309f, + 0.0102229f, 0.178137f, -0.943374f, -0.171465f, 0.304949f, + -0.0963836f, -0.0346437f, -0.138667f, -0.234184f, 0.0344159f, + -0.319592f, -0.0990766f, -0.16065f, 0.369432f, 0.194911f, + 0.363348f, -0.356009f, -0.00736217f, 0.241788f, -2.21311f, + 0.704816f, 0.697019f, 0.129186f, -0.132799f, -0.11861f, + 0.0383451f, 0.0247782f, -0.12687f, 0.0256552f, 0.048413f, + 0.00660549f, 0.0457962f, -0.012819f, 0.115991f, -0.1117f, + -0.291045f, -0.646138f, 0.0813613f, 0.112063f, 0.191675f, + 0.120835f, -0.444267f, -0.340385f, 0.0391936f, -0.151132f, + 0.184419f, 0.124998f, -0.14089f, 0.214087f, 0.00108535f, + 0.119611f, 0.0236965f, 0.0715074f, -0.225997f, -0.0126552f, + -0.459214f, -0.490444f, 0.173716f, 0.355811f, -0.13607f, + -0.191091f, -0.530085f, -0.400666f, 0.011221f, 0.10527f, + -0.11498f, -0.011864f, 0.364376f, 0.0319587f, -0.0528563f, + 0.0353899f, 0.0393453f, -0.289211f, -0.347785f, -0.0417157f, + 0.545848f, 0.741785f, -0.0732565f, -1.29687f, -0.0433128f, + -1.44162f, 0.318894f, -0.377784f, 0.123751f, -0.00444347f, + 0.0957118f, 0.0893616f, 0.0911595f, 0.092917f, 0.127681f, + -0.159929f, 0.190417f, -0.0297948f, -0.00132599f, -0.742756f, + -0.0364169f, -4.00108f, 0.0784767f, 0.223048f, 0.0430138f, + 0.0180493f, 0.212842f, 0.122987f, -2.83267f, -0.0641464f, + -0.173247f, 0.100946f, 0.0804885f, 0.0172631f, 0.0877408f, + -0.353222f, 0.0108262f, -0.0452121f, -0.116127f, 0.268154f, + -0.132587f, -0.27481f, -0.0316914f, 0.0610525f, 0.439691f, + 0.00966415f, -0.78962f, -0.424823f, -0.0214365f, -0.113846f, + 0.100793f, 0.126482f, 0.0415354f, 0.0427995f, 0.14273f, + -0.315674f, 0.110095f, 0.0061568f, 0.0320474f, -0.3596f, + -0.12533f, -1.28837f, 0.174673f, -0.235912f, 0.00495439f, + 0.0695473f, 0.266489f, 0.049248f, 0.0868526f, -0.0685969f, + 0.102984f, 0.0924639f, -0.027535f, 0.0709277f, 0.155776f, + -0.190944f, 0.188273f, -0.00897471f, 0.0964232f, -0.475822f, + -0.209374f, -5.00252f, 0.103495f, 0.110698f, 0.00682092f, + 0.208586f, 0.0489575f, 0.0966254f, -1.42973f, -0.0645128f, + 0.0515961f, 0.0571281f, -0.0992321f, 0.00791648f, 0.0087609f, + 0.0607367f, 0.0315705f, 0.0183317f, 0.0756087f, -0.0292847f, + -0.212932f, -0.782259f, 0.0899944f, 0.102677f, 0.0681135f, + 0.0447764f, -0.481969f, -0.221459f, 0.0794475f, -0.229157f, + 0.136781f, 0.0832359f, 0.0297807f, -0.00287225f, -5.97897f, + -0.0960581f, 0.250945f, -0.00133314f, -0.112396f, -0.856922f, + 0.115776f, 0.124536f, 0.0914194f, -0.160775f, 0.128684f, + 0.106718f, 0.100665f, 0.139579f, -0.86141f, -0.190323f, + 0.0884896f, 0.0363845f, -0.19831f, 0.121601f, 0.0264453f, + -0.00557822f, 0.0720238f, -0.0140132f, -0.166814f, -0.266214f, + 0.00500545f, 0.0146905f, 0.126035f, 0.0812372f, 0.0615973f, + 0.0766063f, -0.420156f, -0.126157f, -0.0284299f, -0.112513f, + -0.567008f, -0.0100263f, -0.607567f, 0.193053f, 
0.0067527f, + -0.0753897f, 0.00134269f, -0.0512249f, -0.161661f, 0.0667741f, + -0.113702f, -0.071606f, -0.300563f, 0.276479f, -0.155318f, + -0.0512306f, 0.0896443f, -0.987911f, 0.0440889f, 0.430958f, + 0.175427f, 0.101385f, 0.0303662f, 0.0672653f, -6.62463f, + -0.10475f, 0.228249f, -0.00482173f, -0.0608713f, -0.895836f, + 0.187976f, 0.162173f, 0.0747544f, 0.219953f, 0.0682489f, + 0.142665f, 0.100287f, 0.301887f, -1.97736f, -0.295001f, + -1.0733f, -0.0562668f, -0.0604295f, 0.0304073f, 0.194274f, + -0.243593f, 0.0727137f, 0.0610967f, -0.0692415f, -0.02967f, + 0.055633f, 0.0192402f, 0.105841f, 0.102236f, -0.0757102f, + -0.0067639f, 0.0102317f, -0.257959f, -0.0638652f, 0.45521f, + -0.114967f, 0.0921177f, 0.223796f, 0.277072f, -0.0613282f, + -0.564693f, -0.151333f, -0.158035f, 0.228491f, 0.12997f, + -0.192625f, -0.125344f, 0.0983258f, -0.931206f, 0.618715f, + 0.273759f, -0.145527f, -0.099431f, -0.119551f, 0.0663484f, + -0.161419f, -0.202377f, -0.545393f, 0.0917645f, 0.042263f, + -0.17117f, -0.178622f, -0.336977f, 0.866715f, 0.0376922f, + -0.319728f, -0.127406f, 0.0599384f, 0.268804f, -0.0331844f, + 0.355326f, -0.103902f, 0.0425935f, 0.00525512f, -0.133687f, + -0.122695f, 0.145582f, 0.139013f, -0.0053352f, 0.0313566f, + 0.327295f, -0.0117993f, 0.233524f, 0.162388f, -0.0793262f, + 0.454543f, 0.0442224f, -0.742673f, -0.144882f, 0.0874983f, + -0.0707259f, 0.0219869f, 0.201728f, 0.0204537f, 0.0788857f, + -0.0374329f, 0.0724169f, 0.0743593f, -0.0193526f, -0.313546f, + -0.418882f, -0.0815754f, -0.197144f, 0.305053f, 0.330196f, + -0.131006f, -0.00113249f, 0.0750458f, -0.541764f, 0.299935f, + 0.308516f, -0.20547f, -0.333066f, 0.0285833f, 0.191147f, + 0.160372f, 0.0724649f, 0.0426326f, 0.153046f, -6.59656f, + -0.081237f, 0.219163f, 0.0147081f, -0.0109837f, -1.01487f, + 0.170055f, 0.163386f, 0.106413f, 0.150188f, 0.0688875f, + 0.0541359f, 0.156307f, 0.178844f, -1.51054f, -0.149477f, + -0.504503f, 0.017878f, -0.181821f, -0.0999659f, 0.0484548f, + -0.32211f, 0.0406744f, 0.0017627f, 0.0220593f, 0.0900512f, + -0.561625f, 0.107279f, -0.0861521f, -0.0862376f, 0.0816765f, + 0.168072f, 0.150063f, -0.816825f, -0.13569f, 0.557555f, + -0.155265f, 0.025135f, -0.109304f, -0.0487062f, -0.00347487f, + -0.454803f, -0.0394371f, -0.214597f, -0.248898f, 0.286501f, + -0.249246f, -0.138935f, 0.00391409f, -0.122544f, -2.14993f, + 0.588942f, 0.541231f, 0.0154047f, -0.359742f, 0.0520729f, + 0.0667058f, 0.0418163f, -0.132533f, -0.184759f, 0.0546118f, + -0.131198f, 0.109664f, -0.0714679f, -0.114163f, -0.243081f, + -0.0405089f, 0.0342795f, 0.0801825f, -0.268408f, 0.192207f, + 0.0800494f, -0.586539f, -0.118155f, -0.0508569f, -0.193987f, + 0.261478f, 0.105719f, -0.125361f, -0.0956201f, 0.0233802f, + 0.271098f, 0.0113352f, 0.0910447f, 0.00628244f, -0.071722f, + 0.21439f, 0.0747191f, 0.207765f, -0.0782454f, -0.0151716f, + -0.196505f, -0.44798f, -0.228597f, 0.0549039f, -0.120715f, + -0.19388f, -0.0768461f, 0.361102f, 0.122936f, -0.0334211f, + -0.202503f, -0.0450776f, -0.272345f, 0.662321f, 0.109247f, + -0.218026f, -0.0669386f, -0.0864701f, -0.633421f, -0.158007f, + -1.10778f, 0.351211f, -0.541458f, -0.0171707f, 0.149606f, + 0.106105f, 0.0880349f, 0.0968455f, 0.113269f, -5.01949f, + -0.106404f, 0.175578f, -0.030045f, -0.0267249f, -0.563713f, + 0.173885f, 0.130772f, 0.0334519f, 0.0770157f, 0.0394389f, + -0.0290326f, 0.220003f, 0.180901f, -1.62203f, -0.151858f, + -0.202386f, -0.0067836f, 0.0287665f, -0.194183f, -0.239834f, + -0.484159f, 0.00671722f, -0.122459f, 0.0808959f, -0.263769f, + -0.015066f, -0.0429868f, -0.111255f, -0.231872f, 
0.219659f, + -0.0437412f, -0.536618f, -0.477831f, 0.0421895f, -0.0815851f, + 0.119638f, 0.0786293f, -0.000668378f, 0.0305567f, -0.0868189f, + -0.178327f, 0.0799657f, 0.0280923f, -0.211395f, -0.464577f, + 0.216912f, 0.0761976f, 0.160288f, -0.416372f, -0.10286f, + -0.0733786f, 0.261033f, 0.0493698f, 0.143137f, -0.179979f, + 0.15655f, 0.0897976f, -0.0258041f, -0.152852f, -6.15512f, + -0.118917f, 0.227283f, -0.0514043f, -0.0786432f, -0.523485f, + 0.1644f, 0.0869001f, 0.0984082f, -0.428288f, 0.0791992f, + 0.141904f, 0.0652073f, 0.104429f, -0.775125f, -0.121479f, + 0.0841637f, 0.0135705f, -0.208863f, -0.0629523f, 0.0455794f, + 0.0513898f, -0.0147657f, 0.0401145f, 0.0660079f, 0.0210609f, + -0.0151801f, 0.0562111f, 0.140308f, -0.0196394f, 0.0230753f, + -0.0336115f, -0.422411f, -0.196974f, -0.0405748f, -0.283428f, + 0.15458f, 0.0876296f, 0.0314038f, 0.16389f, -7.01385f, + -0.117146f, 0.197273f, -0.0400688f, 0.0143951f, -0.964007f, + -0.0618919f, 0.0406891f, 0.07992f, -0.144132f, 0.116416f, + 0.0326838f, 0.103641f, 0.171805f, -1.05158f, -0.182589f, + 0.116991f, 0.0530774f, -0.212454f, -0.016727f, -0.0565992f, + 0.0712873f, 0.0445466f, -0.000107032f, -0.121449f, -0.15148f, + 0.0220338f, 0.0762024f, 0.12253f, 0.0622466f, 0.0835822f, + 0.0465119f, -0.388743f, -0.34665f, -0.0720734f, -0.101581f, + -0.630565f, -0.0512685f, -0.520541f, 0.0530119f, -0.0245276f, + -0.19116f, -0.0144446f, -0.0604486f, 0.187251f, -0.021341f, + -0.217823f, 0.0510256f, -0.197946f, 0.060955f, -0.0617316f, + 0.0741673f, 0.117591f, -1.47844f, -0.0911093f, 0.359225f, + 0.145027f, 0.127513f, 0.0617905f, 0.141154f, -7.63868f, + -0.0808127f, 0.274843f, 0.00693195f, -0.0283113f, -0.853871f, + -0.15737f, 0.0858904f, 0.0746279f, 0.109912f, 0.193775f, + 0.0698094f, 0.174159f, 0.259556f, -1.49885f, -0.156706f, + -1.04113f, -0.0329546f, -0.0491449f, -0.0304125f, 0.0514892f, + -0.244284f, 0.126814f, -0.0387081f, -0.153173f, -0.0566748f, + 0.294111f, -0.0170534f, 0.102381f, 0.447606f, -0.0613267f, + -0.0636869f, -0.0347599f, -0.259572f, -0.0657846f, 0.454352f, + -0.169453f, -0.00177987f, 0.133279f, -0.0863932f, -0.134423f, + -0.475107f, -0.00448962f, -0.214607f, 0.111413f, 0.194377f, + -0.0710837f, 0.0562353f, 0.0401193f, 0.248595f, 0.538374f, + 0.449469f, -0.39111f, 0.0125057f, 0.0448811f, -0.00707751f, + -0.164894f, -0.317516f, -0.56231f, -0.270262f, 0.127016f, + -0.12092f, -0.0881587f, -0.323908f, 0.872344f, 0.103391f, + 0.267971f, -0.155088f, -0.0136683f, 0.309517f, 0.119901f, + 0.271307f, -0.188463f, 0.185121f, -0.142777f, -0.110535f, + -0.163107f, 0.175502f, 0.0801924f, 0.240499f, 0.0874759f, + 0.308907f, -0.00222504f, 0.193366f, 0.109018f, -0.0772158f, + -0.520675f, 0.0259432f, -0.736666f, -0.296579f, 0.043486f, + -0.128932f, 0.0417669f, 0.125747f, 0.157879f, 0.112857f, + -0.0595681f, 0.0611936f, -0.042125f, -0.270338f, 0.120072f, + -0.36675f, -0.0347962f, -0.119539f, 0.0873369f, 0.296432f, + -0.069501f, -0.0383859f, 0.0913597f, -0.40747f, 0.234276f, + 0.332536f, -0.732132f, -0.312291f, 0.137759f, 0.227593f, + 0.14165f, 0.129068f, 0.102734f, 0.135818f, -7.35883f, + -0.101533f, 0.256027f, -0.0142278f, -0.0561601f, -1.09899f, + -0.106538f, 0.0612256f, 0.099487f, -0.0605983f, 0.134311f, + 0.052226f, 0.143672f, 0.219944f, -1.47539f, -0.101828f, + -0.429979f, 0.010478f, -0.0132605f, 0.103363f, 0.0267373f, + -0.338865f, 0.0090188f, 0.0810085f, -0.124368f, -0.0133776f, + 0.595666f, -0.00162201f, -0.212444f, -0.26342f, 0.0913656f, + -0.106279f, 0.414515f, -0.709901f, -0.00198859f, 0.305288f, + -0.188536f, -0.0377482f, -0.131909f, -0.116099f, 
-0.236827f, + -0.36356f, 0.0179455f, -0.202143f, -0.00395508f, 0.177363f, + 0.0630679f, -0.145173f, -0.0558639f, -0.44879f, -1.55687f, + 0.473398f, 0.50531f, -0.0656231f, -0.137197f, 0.064707f, + 0.122083f, 0.0321111f, -0.167096f, 0.0406581f, -0.0793592f, + -0.0777081f, 0.0321379f, -0.0108834f, -0.0652323f, -0.102918f, + 0.0178664f, 0.0781873f, 0.0613189f, -0.04177f, 0.159566f, + 0.15134f, -0.445996f, -0.384905f, 0.0951659f, -0.175046f, + 0.255746f, 0.177047f, -0.150632f, 0.200522f, 0.00778549f, + 0.232168f, -0.0304652f, 0.083155f, -0.125395f, -0.0203289f, + -0.23874f, 0.0349836f, 0.231701f, -0.14849f, -0.204272f, + -0.198309f, -0.364955f, -0.228428f, 0.0614142f, -0.040976f, + -0.227785f, -0.0898404f, 0.271566f, -0.209196f, 0.0226431f, + -0.0911715f, 0.0840369f, -0.299411f, -0.529182f, 0.0622292f, + 0.202475f, 0.0155583f, -0.083114f, 0.124253f, -0.22721f, + -1.02565f, 0.193961f, -0.54287f, -0.00849364f, 0.11124f, + 0.0993531f, 0.120621f, 0.0959537f, 0.136274f, -5.23358f, + -0.107433f, 0.155286f, -0.0136043f, -0.0246768f, -0.631187f, + -0.0493852f, 0.0446751f, 0.0588353f, 0.160766f, -0.0354385f, + -0.0672548f, 0.243743f, 0.186004f, -1.20199f, -0.151872f, + -0.0760096f, -0.00775123f, -0.0122227f, 0.0891327f, -0.377876f, + -0.469926f, -0.134715f, -0.0969362f, 0.212542f, 0.0871489f, + 0.164638f, -0.0485785f, -0.167754f, -0.515052f, 0.13821f, + 0.0515572f, -0.430691f, -0.394719f, 0.143947f, -0.00670816f, + 0.129623f, 0.140299f, 0.0336978f, 0.153545f, -0.350927f, + -0.213485f, 0.0344809f, 0.0405889f, 0.0749967f, -0.369352f, + -0.109398f, 0.0350649f, 0.190893f, -0.284106f, -0.185376f, + 0.0105842f, 0.263692f, 0.160429f, 0.0998209f, -0.127779f, + 0.140558f, 0.108968f, -0.0122672f, 0.102875f, -5.72172f, + -0.161288f, 0.135935f, -0.0143087f, 0.106556f, -0.649813f, + -0.123049f, -0.0108861f, 0.102918f, -0.298137f, 0.0329013f, + 0.100763f, 0.12018f, 0.100782f, -0.648036f, -0.111122f, + 0.12363f, 0.0211952f, -0.225201f, 0.0506021f, 0.0167621f, + 0.0608759f, -0.0245646f, 0.0503477f, -0.0972749f, -0.0415155f, + -0.00578366f, -0.0977591f, 0.124867f, 0.0134788f, -0.0375816f, + -0.00581233f, -0.272292f, -0.250393f, 0.024511f, -0.184891f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_2_bias[] = { + 0.182474f, 0.0223202f, 0.204111f, 0.0573683f, 0.111143f, + 0.0800926f, -0.0364215f, 0.192371f, 0.00498262f, 0.302543f, + 0.0133081f, 0.119719f, 0.237522f, -0.266705f, 0.129427f, + 0.0695857f, 0.22068f, 0.231667f, 0.405829f, -0.0972567f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_3_kernel[] = { + -0.0393876f, -0.269924f, -0.0703231f, -0.0236484f, 0.170478f, + 0.245566f, 0.175963f, 0.104194f, -0.0490501f, -0.157605f, + -0.0275165f, -0.0169499f, -0.250725f, 0.215203f, -0.00733655f, + 0.0111298f, 0.205606f, 0.928046f, 0.15139f, 0.0955483f, + -0.015115f, -0.126643f, 0.0957605f, -0.140178f, -0.0246866f, + 0.097097f, 0.116287f, 0.177746f, 0.0570021f, -0.0518686f, + -0.0446482f, -0.0125318f, 0.0116092f, 0.102431f, 0.0898519f, + 0.0870372f, -0.843274f, 0.383311f, -0.102761f, -0.0246494f, + 0.0312555f, 0.19472f, 0.111573f, 0.0920392f, -0.0555618f, + 0.326461f, 0.219357f, -0.133727f, -0.118399f, -0.0611432f, + -0.169931f, 0.123733f, -0.204607f, 0.082592f, 0.0323181f, + 0.201618f, -0.00388867f, -0.053583f, 0.0266333f, -0.0951787f, + -0.0358283f, -0.0649549f, 0.0119263f, -0.11812f, 0.209851f, + -0.036616f, -0.014911f, -0.138096f, -0.139664f, -0.207395f, + 0.0128848f, -0.201816f, 0.0899419f, 0.343308f, -0.0096243f, + -0.212605f, -0.0905284f, -0.0597114f, -0.055261f, -0.0653405f, + 
0.0330484f, -0.27681f, -0.0994095f, -0.0468272f, 0.145713f, + 0.267216f, 0.185335f, 0.1798f, -0.0437882f, -0.200401f, + -0.0398117f, -0.0736501f, -0.166349f, 0.203316f, 0.0710647f, + 0.061825f, 0.281131f, 0.733323f, 0.215488f, 0.00145659f, + -0.138995f, -0.0833713f, 0.107809f, -0.105343f, -0.0672139f, + 0.101852f, 0.135455f, 0.132903f, 0.0312017f, -0.0643586f, + -0.0274546f, -0.0687466f, -0.020233f, 0.109444f, 0.0774587f, + 0.139497f, -0.800587f, 0.325783f, -0.0546695f, -0.092003f, + -0.0773301f, 0.189672f, 0.0604666f, 0.0939425f, 0.679495f, + 0.114789f, -0.161153f, 0.12843f, -0.0345385f, -0.134641f, + -0.153995f, 0.0823055f, -0.0349296f, 0.0299183f, -0.0606872f, + 0.137588f, 0.0449805f, -0.0555399f, -0.00553351f, -0.120719f, + -0.204701f, -0.0739813f, 0.0584115f, -0.104833f, -0.110989f, + 0.00845446f, 0.0630702f, -0.147861f, 0.0268545f, -0.216419f, + 0.00531986f, -0.206641f, 0.253082f, 0.413215f, -0.05909f, + -0.0939983f, -0.116818f, -0.0450892f, -0.0551134f, -0.00696931f, + -0.113003f, -0.289192f, -0.00884866f, -0.0365724f, 0.0401887f, + 0.238622f, 0.149151f, 0.175751f, -0.157425f, -0.138924f, + -0.0277598f, -0.0285915f, 0.10165f, 0.209532f, 0.0862249f, + 0.0256428f, 0.623204f, -0.0941196f, 0.20345f, -0.132869f, + 0.00947298f, -0.14753f, 0.103918f, -0.161799f, 0.125566f, + 0.10916f, 0.115446f, 0.135627f, -0.0181667f, -0.0734694f, + -0.0154729f, -0.085849f, -0.000427605f, 0.113614f, 0.0776308f, + 0.111899f, -0.214917f, 0.393234f, -0.132223f, 0.020783f, + -0.074902f, 0.217477f, 0.107883f, 0.109466f, 0.146609f, + 0.317061f, 0.074379f, -0.0505457f, -0.0503772f, -0.0678954f, + -0.220003f, 0.114878f, 0.176014f, -0.00657996f, -0.0875497f, + 0.065582f, 0.00238612f, -0.063395f, 0.0295323f, -0.127126f, + 0.099813f, -0.115452f, 0.0106309f, -0.179632f, -0.0436553f, + 0.0120295f, 0.0652713f, -0.131512f, -0.081714f, -0.205363f, + -0.0374944f, -0.196707f, 0.680568f, -0.00991824f, -0.0212223f, + -0.186258f, -0.432361f, -0.0291303f, -0.0475983f, -0.071383f, + -0.0116416f, -0.28257f, -0.0635272f, -0.0576546f, -0.280129f, + 0.286528f, 0.199997f, 0.192851f, 0.323829f, -0.185006f, + -0.04791f, -0.0882187f, -0.0496895f, 0.293135f, 0.125539f, + 0.0341828f, 0.993452f, 0.0369177f, 0.0453796f, 0.0329807f, + 0.157673f, -0.153195f, 0.122383f, -0.161983f, -0.317619f, + 0.105129f, 0.155673f, 0.152489f, 0.0685417f, -0.0595907f, + -0.026657f, -0.0954336f, -0.0359557f, 0.105617f, 0.0825066f, + 0.100189f, -0.22125f, 0.382508f, -0.0247677f, -0.115807f, + -0.0639787f, 0.177786f, 0.0566206f, 0.0496389f, 1.31533f, + 0.0482907f, -0.118743f, 0.190632f, 0.172867f, -0.108446f, + -0.200186f, 0.122572f, 0.0897468f, 0.0155328f, -0.0380217f, + 0.125161f, -0.141723f, -0.023157f, 0.0270805f, -0.101961f, + 0.12358f, -0.0866255f, 0.00306761f, -0.131764f, -0.461118f, + -0.00803936f, 0.0895496f, -0.153905f, 0.207623f, -0.249099f, + -0.0198487f, -0.160013f, 0.81136f, -0.109978f, -0.0880332f, + -0.0761368f, -0.0755881f, -0.0384827f, -0.0554777f, -0.0750048f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_3_bias[] = { + 0.0106809f, 0.136699f, 0.285316f, 0.395746f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_4_kernel[] = { + -0.0161019f, -0.088871f, 0.0463358f, -0.198037f, 0.038122f, + 0.0135483f, -0.196641f, -0.433531f, 0.527972f, -0.143716f, + 0.558627f, 0.459889f, 0.322864f, -0.491514f, -0.190915f, + -0.0765601f, 0.210329f, 0.689389f, -0.100415f, -1.8788f, + 0.2228f, 0.292781f, -0.954838f, -0.0788763f, -0.131402f, + -0.17154f, 0.049934f, -0.0541183f, -0.530529f, -0.666165f, + 0.195492f, 0.218548f, 
-0.314895f, 0.0749444f, -0.191344f, + 0.349469f, 0.00811248f, -0.760157f, 0.0707434f, -0.0719285f, + -0.264495f, -0.432009f, -0.432686f, 0.155738f, -0.020197f, + 0.19278f, -0.658335f, -0.273143f, -0.286079f, 0.243402f, + 0.497701f, 0.0121003f, -0.666308f, 0.028172f, -0.547901f, + -0.11755f, 0.322028f, 0.0878274f, -0.0328334f, 0.311816f, + 0.0951026f, -1.11429f, -0.0417486f, 0.123467f, -0.0910681f, + -0.0154255f, 0.311201f, -0.0156158f, -0.600437f, 0.0274156f, + -0.174907f, -1.29313f, -0.178656f, 0.596556f, -0.421725f, + -0.289137f, 0.529297f, 0.114833f, -0.0155887f, -0.308232f, + -0.0228361f, 0.184017f, 0.138232f, 0.146347f, -0.117867f, + 0.248351f, -0.282846f, -0.18058f, 0.348355f, -0.415754f, + 0.0657168f, 0.431728f, -0.231043f, -0.186745f, 0.137401f, + -0.282329f, -0.159678f, 0.754262f, 0.037824f, -1.68521f, + -0.290175f, 0.289588f, -0.18683f, -0.300385f, 0.285449f, + -0.00386456f, 0.0563485f, -0.376541f, 0.159899f, -0.697312f, + 0.0284389f, 0.437307f, 0.3968f, -0.372082f, -0.232535f, + 0.394629f, 0.00315248f, -0.38374f, 0.0311291f, -0.624353f, + 0.498083f, -0.342663f, -0.125978f, 0.186797f, 0.187723f, + 0.149335f, -0.82727f, -0.0740974f, -0.659039f, 0.42671f, + -0.448835f, 0.150677f, 0.830742f, -0.233148f, -0.65308f, + -0.0878935f, -0.407797f, -0.511826f, -0.0739023f, 0.506305f, + -0.187451f, 0.0284968f, -0.822238f, 0.362523f, -0.270865f, + 0.032335f, 0.560413f, -0.00388247f, -0.446333f, 0.163147f, + -0.409633f, -0.372575f, 0.306993f, 0.55953f, -0.24362f, + -0.0929369f, -0.520298f, -0.444022f, 0.186077f, -0.0942208f, + 0.624049f, -0.429625f, -0.869528f, 0.405257f, -0.120445f, + 0.537685f, -0.3911f, 0.142142f, 0.0913808f, -0.00375967f, + 0.382781f, 0.60505f, -0.271608f, -0.0630436f, -0.150625f, + -0.0124598f, 0.0132878f, 0.138475f, -0.106264f, -0.416581f, + -0.518415f, 0.185127f, -0.464622f, -0.0102925f, 0.0389567f, + 0.406439f, -0.0414264f, -0.366185f, -0.511867f, -0.650255f, + 0.278252f, 0.0270234f, 0.262788f, -0.0294793f, 0.12651f, + 0.421537f, 0.0300837f, 0.0742187f, 0.281954f, -0.122069f, + -0.450145f, -0.312206f, -0.402633f, -0.0868137f, 0.190433f, + -0.149602f, -0.175029f, 0.00900023f, -0.266596f, 0.21721f, + -0.245079f, -1.09798f, 0.319409f, -0.337938f, 0.358514f, + 0.0771549f, 0.447087f, -0.305507f, -0.285492f, 0.383896f, + 0.145933f, -0.264944f, -0.118486f, 0.068805f, -0.194231f, + -1.79133f, 0.363408f, -0.17434f, -0.229629f, 0.132188f, + 0.207548f, -0.876264f, 0.265634f, 0.139332f, 0.236206f, + -0.0145184f, 0.562865f, 0.526612f, -0.0333508f, -0.421885f, + 0.273485f, -0.110882f, 0.425557f, 0.513303f, -0.422322f, + 0.0563155f, -0.0409693f, 0.194768f, -0.419828f, -0.107195f, + -1.19224f, 0.48552f, 0.132782f, -0.00932096f, -0.225484f, + -0.428484f, -0.0392684f, 0.750697f, 0.337615f, 0.158476f, + 0.413484f, 0.326017f, -0.757107f, -0.183962f, 0.00884361f, + 0.126507f, -0.0751588f, -0.308782f, -0.104237f, -0.703877f, + -0.491806f, -0.204251f, -0.317212f, 0.0815479f, 0.296323f, + 0.219632f, -0.039859f, 0.556257f, 0.176144f, -0.0750654f, + -0.106419f, 0.00400385f, -0.172266f, 0.000178763f, 0.146532f, + 0.255202f, -0.427235f, -0.182198f, -0.256557f, 0.260255f, + -0.0143364f, 0.0868664f, -0.564373f, -0.0876947f, 0.726289f, + 0.0160001f, -0.381562f, -0.638214f, -0.803803f, 0.25945f, + -0.371542f, -0.419611f, 0.238617f, 0.371834f, -0.226777f, + -0.894602f, 0.37458f, -0.354866f, 0.0249312f, 0.142374f, + 0.433813f, -0.0218183f, -0.33248f, 0.107223f, 0.390823f, + -0.0271108f, -0.616878f, -0.604984f, 0.517269f, -0.293573f +}; + +static const float 
av1_intra_mode_cnn_partition_cnn_layer_4_bias[] = { + -0.290371f, -0.0560272f, -0.118144f, -0.270583f, 0.401388f, + -0.308677f, 0.150729f, -0.0324442f, -0.135937f, 0.0875581f, + 0.0206493f, -0.212682f, -0.0266535f, -0.326656f, 0.0185105f, + -1.01429f, -0.00315052f, -0.0273938f, -0.0263379f, -0.171702f +}; + +static const CNN_CONFIG av1_intra_mode_cnn_partition_cnn_config = { + NUM_CNN_LAYERS, // num_layers + 0, // is_residue + 0, // ext_width + 0, // ext_height + 0, // strict_bounds + { + { + CNN_LAYER_0_IN_CH, // in_channels + CNN_LAYER_0_WIDTH, // filter_width + CNN_LAYER_0_WIDTH, // filter_height + CNN_LAYER_0_OUT_CH, // out_channels + CNN_LAYER_0_HORZ_STRIDE, // skip_width + CNN_LAYER_0_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_0_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_0_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + -1, // output_num + }, + { + CNN_LAYER_1_IN_CH, // in_channels + CNN_LAYER_1_WIDTH, // filter_width + CNN_LAYER_1_WIDTH, // filter_height + CNN_LAYER_1_OUT_CH, // out_channels + CNN_LAYER_1_HORZ_STRIDE, // skip_width + CNN_LAYER_1_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_1_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_1_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + 3, // output_num + }, + { + CNN_LAYER_2_IN_CH, // in_channels + CNN_LAYER_2_WIDTH, // filter_width + CNN_LAYER_2_WIDTH, // filter_height + CNN_LAYER_2_OUT_CH, // out_channels + CNN_LAYER_2_HORZ_STRIDE, // skip_width + CNN_LAYER_2_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_2_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_2_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + 2, // output_num + }, + { + CNN_LAYER_3_IN_CH, // in_channels + CNN_LAYER_3_WIDTH, // filter_width + CNN_LAYER_3_WIDTH, // filter_height + CNN_LAYER_3_OUT_CH, // out_channels + CNN_LAYER_3_HORZ_STRIDE, // skip_width + CNN_LAYER_3_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_3_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_3_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + 1, // output_num + }, + { + CNN_LAYER_4_IN_CH, // in_channels + CNN_LAYER_4_WIDTH, // filter_width + CNN_LAYER_4_WIDTH, // filter_height + CNN_LAYER_4_OUT_CH, // out_channels + CNN_LAYER_4_HORZ_STRIDE, // skip_width + CNN_LAYER_4_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_4_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_4_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + 0, 
// output_num + }, + }, +}; + +static const float + av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel[] = { + 0.604356f, -0.236007f, 0.342172f, 0.531397f, -0.635698f, + -0.591573f, 0.833872f, 0.492814f, -0.100308f, 0.186385f, + 0.202779f, 0.263578f, 0.330001f, -0.15531f, 0.879584f, + -0.0048796f, 0.490796f, 0.242254f, -0.292211f, -0.696912f, + 0.746664f, 0.129371f, -0.0122443f, 0.196234f, -0.251605f, + -0.385617f, 0.157707f, 0.699963f, 0.0432536f, -0.11141f, + -0.0353473f, -0.0364045f, -0.113556f, -0.520842f, 0.231248f, + 0.230638f, -0.323852f, -1.08633f, -0.0469168f, -0.481821f, + 0.366838f, 0.189627f, -0.0637262f, -0.484917f, -0.109874f, + 0.292237f, 0.368702f, -0.183896f, -0.109038f, -1.22613f, + -0.880355f, -1.63768f, 0.337426f, -0.940994f, 0.413097f, + -0.37879f, -0.480525f, -0.594819f, -0.0172653f, -0.499436f, + -0.298395f, -0.840181f, -0.0758645f, -0.772089f, -0.232727f, + -0.815968f, 0.160785f, -0.0767165f, 0.0064244f, -0.540491f, + 0.417776f, -0.384337f, -0.497377f, 0.68414f, 0.00797514f, + 0.262626f, 0.203732f, 0.702047f, 0.0617544f, 0.0878249f, + -0.315032f, -0.0169776f, 0.403986f, 0.815872f, 0.135388f, + 0.0858594f, 0.169172f, -0.638227f, -1.65268f, -0.0476042f, + -0.982685f, 0.45707f, -0.0577537f, 0.367329f, 0.176513f, + -0.356454f, 0.0979095f, -0.277476f, 0.257271f, -0.333451f, + 0.0241497f, 0.0671127f, 0.221216f, 0.106065f, 0.537151f, + 0.0257329f, 0.265559f, -0.348353f, 0.285569f, -0.0610511f, + -1.59334f, -1.63826f, -0.164898f, -0.36605f, -0.489304f, + 0.729241f, 0.0197627f, 0.200291f, -0.231506f, -0.255715f, + -0.0932264f, -0.728793f, 0.468297f, -1.09592f, -0.079791f, + -1.76531f, -0.182904f, -2.05897f, -0.371894f, 0.207124f, + 0.255029f, 0.186501f, -0.005805f, 0.00160733f, -0.178206f, + -0.352757f, -0.164741f, -0.557583f, -0.559692f, -0.00731467f, + 0.149326f, 0.409735f, 0.22083f, -0.332572f, -0.1741f, + -0.0519008f, -0.266402f, 0.294031f, -2.4453f, 0.339851f, + -0.573747f, -5.97783f, -0.084142f, 0.20286f, -0.576038f, + -0.111081f, 0.101238f, -5.83427f, -1.98537f, 0.322796f, + -0.60171f, 0.212412f, 0.247176f, 0.603694f, -0.54357f, + -0.693439f, 0.250725f, -4.31988f, 0.0935924f, 0.43669f, + -0.139706f, -0.158391f, 0.244309f, 0.619213f, -0.309154f, + -0.135341f, 0.475815f, -0.290804f, -0.109038f, -0.0937104f, + 0.0385907f, -0.29105f, -0.0597651f, -0.451187f, -1.51821f, + 0.141772f, 0.822204f, -0.729661f, -0.109908f, 0.178217f, + -0.750278f, 0.113762f, -0.0959985f, 0.066579f, -0.104209f, + -0.951378f, 1.4087f, -1.13175f, -1.09103f, -1.50416f, + -0.182273f, -1.80129f, -0.152135f, 0.356931f, 0.205591f, + 0.183148f, -0.498671f, -0.183034f, -0.176428f, 0.395706f, + -0.589908f, -0.318276f, -0.421162f, 0.658766f, -0.186752f, + 0.0656253f, 0.248002f, 0.289618f, -0.458111f, -0.130789f, + -0.542988f, 0.405804f, -0.35364f, -0.311927f, 0.218339f, + 0.309215f, -0.130347f, -0.0257543f, 0.0413234f, -0.190205f, + -0.242382f, 0.819886f, -0.255157f, -0.181219f, -0.290903f, + -0.301995f, -0.0469988f, 0.702936f, 0.209122f, 0.0234243f, + 0.598637f, 0.0305196f, 0.0423457f, -0.618799f, 0.0190867f, + 0.420584f, -0.224752f, -0.410077f, 0.127854f, 0.395261f, + -0.393685f, -0.282822f, 0.0289504f, 0.0406515f, -0.511531f, + -0.497611f, 0.0252715f, 0.0812549f, 0.80205f, 1.29084f, + 0.764972f, 0.561258f, -0.23499f, 0.217594f, -0.690935f, + -0.26607f, 0.357955f, 0.391608f, 0.448352f, 0.458586f, + -0.790071f, 0.719959f, -0.468052f, 1.24579f, 0.220705f, + 0.284044f, 0.141346f, 0.246687f, 0.147826f, -0.403557f, + -0.00648195f, 0.398034f, -0.100464f, -0.77107f, -0.188274f, + -0.219245f, -0.0330375f, 
0.367585f, -0.220391f, 0.308736f, + 0.221399f, 0.340292f, 0.037597f, 0.606083f, 0.665634f, + -0.755529f, -0.95989f, -0.243673f, 0.233709f, -0.454628f, + -0.110952f, 0.776062f, 0.731136f, -0.140422f, 0.19261f, + 0.355086f, 0.975026f, 0.190936f, 0.776205f, 0.982781f, + 0.555569f, 0.42382f, -0.409721f, 0.25053f, -0.271328f, + 0.859941f, -0.0210901f, 0.0176916f, -0.562895f, -0.0787431f, + -0.861032f, -0.34022f, -0.571995f, 0.205436f, 0.346968f, + 0.377033f, -1.08484f, 0.297007f, -1.01693f, 0.189463f, + -0.483242f, 0.147058f, 0.0159503f, 0.0908779f, -0.46962f, + 0.174024f, -0.490704f, -0.383501f, -0.0507626f, 0.00902188f, + -0.202495f, 0.205047f, 0.0562261f, -0.143371f, 0.219524f, + -0.317294f, -0.0575756f, -0.0595825f, -0.000625279f, -0.278864f, + -0.0516874f, -0.225259f, 0.429046f, -0.0952421f, 0.0799135f, + -0.122883f, -0.262308f, -0.481006f, -0.0466122f, -0.402822f, + 0.150595f, -0.0919558f, -0.356765f, -0.199222f, 0.219389f, + -0.214452f, -0.196361f, -0.095758f, -0.115891f, -0.143777f, + 0.549843f, -0.113036f, 0.764895f, -0.0114812f, -0.0684054f, + -0.98045f, -0.0170634f, 0.247719f, -0.18718f, -0.381566f, + 0.150758f, -0.526257f, 1.00851f, 0.776634f, 1.69728f, + -0.303058f, 0.228967f, -0.414134f, 0.0858226f, -0.285472f, + 0.431459f, 0.315318f, 0.587835f, 0.335737f, -0.0222039f, + 0.18945f, 0.274008f, 0.609263f, 0.320232f, -0.214137f, + -0.0297668f, 0.0439046f, -0.52821f, -0.0127375f, 0.431885f, + 0.508846f, -0.329189f, -0.166778f, -0.94338f, -0.358807f, + 0.208641f, -0.517986f, -0.128278f, 0.693464f, -0.24408f, + -0.0669412f, -0.410287f, 0.0444145f, -0.264179f, 0.143884f, + 0.276842f, 0.498934f, -0.682557f, -0.217198f, -0.8249f, + -0.40446f, -0.115376f, 0.417934f, 0.65605f, -0.00570035f, + -0.365742f, -0.367625f, 0.526824f, -0.0164913f, -0.255998f, + 0.247292f, 0.0846536f, 0.109302f, -0.302996f, 0.160564f, + 0.0228132f, 0.035211f, -0.236951f, 0.493801f, 1.37315f, + -0.182348f, 0.234437f, -0.256906f, 0.12523f, 0.667113f, + -0.437981f, -0.0721831f, 0.303976f, -0.041336f, -0.145894f, + -0.733741f, 0.436056f, 0.368542f, -0.149072f, -0.290281f, + 0.0946743f, -0.0579292f, 0.264539f, 0.170048f, 0.262411f, + 0.049679f, 0.371369f, 0.760675f, 0.482157f, -0.0196783f, + 0.260888f, 0.948856f, 0.170228f, -0.134432f, -0.942235f, + -1.23226f, -0.373963f, -0.0381773f, -0.17947f, 0.00947998f, + 0.01086f, 0.389578f, -0.380389f, -0.0865851f, -0.220328f, + -0.171901f, -0.384325f, -0.0787615f, 0.392678f, 0.123392f, + -0.0895824f, 0.00480886f, -0.162918f, 0.214336f, -0.00147339f, + 0.203899f, -0.00292344f, -0.148594f, 0.0425697f, -0.306896f, + -0.342225f, -0.45088f, -0.184454f, -0.00923638f, -0.521993f, + -0.334464f, 0.156497f, -0.0856832f, -0.277661f, -0.0721105f, + -0.488781f, -0.509543f, -0.012664f, 0.0940558f, -0.29869f, + 0.0434843f, -0.0178945f, -0.0525666f, -0.303178f, 0.713507f, + -0.137413f, -0.170289f, -0.142942f, -0.316002f, 0.229125f, + -0.277585f, 0.0125026f, 0.508316f, -1.20614f, -0.915129f, + -1.63389f, -0.454604f, -0.893951f, -0.447403f, -0.751423f, + 1.3886f, 0.617818f, 0.611458f, -0.884173f, -0.7779f, + -0.608639f, -0.164759f, -0.631846f, -0.176894f, -0.459361f, + -0.187119f, 0.173283f, -0.477191f, -0.156736f, 0.182675f, + 0.598854f, -0.489941f, -0.420493f, -0.162002f, 0.344418f, + 0.33832f, -0.187463f, -0.388721f, -0.0733151f, -0.138835f, + 0.313699f, 0.0625967f, -0.291488f, 0.114088f, -0.356843f, + 0.197506f, 0.0320749f, 1.16745f, -0.36081f, 1.63416f, + 0.198392f, 1.13928f, -0.317971f, 0.531019f, 0.526518f, + 0.185814f, 0.0923607f, 0.192858f, -0.234378f, 0.18091f, + -0.228837f, 0.397216f, 
0.581501f, 0.284376f, -0.130434f, + 0.20076f, 0.242662f, -0.0480872f, 0.131746f, 0.362712f, + 0.0146821f, 0.475679f + }; + +static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias[] = { + 0.477356f, 0.385222f, 0.389122f, 0.539506f, -0.0272558f, 0.581605f, + -0.800961f, 0.142229f, 0.117549f, -0.0724944f, 0.102095f, -0.71319f, + -0.0162434f, -0.132858f, 0.543411f, -0.626599f +}; + +static const float + av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel[] = { + 0.195436f, -0.623354f, 1.27907f, 0.270071f, -0.677612f, + 0.0266141f, 0.272991f, -0.425446f, 0.891889f, -0.299836f, + -0.611825f, -0.0322273f, 0.185276f, 0.238639f, -0.150954f, + 0.083495f, -0.472106f, 0.573506f, 1.16465f, -0.154947f, + 0.640631f, -1.59467f, -9.8166f, -0.22889f, -0.189912f, + 0.227052f, -0.540787f, 0.0840873f, -3.04293f, -0.0209975f, + -6.10979f, -5.92801f, 0.288467f, -0.169476f, 0.0527948f, + -1.21202f, -0.280915f, 0.290863f, -0.601877f, 0.0598784f, + -0.592136f, -0.535588f, -0.0434018f, -0.653223f, 0.00339129f, + -0.133273f, 0.279463f, 0.483879f, 0.463664f, -0.14174f, + -1.56354f, 0.560043f, -1.44639f, 0.673528f, -0.108418f, + -0.707313f, 0.49633f, -0.0321971f, 0.411475f, -0.382184f, + -0.965501f, -0.0507655f, 0.540415f, -0.977297f, 0.370382f, + -0.375683f, 0.0844529f, -2.0002f, -0.346289f, 0.621251f, + -0.489855f, 0.191252f, -0.576629f, -0.35773f, 0.023167f, + 0.180793f, -0.417864f, 0.0587254f, 0.167824f, 0.0612058f, + -0.712108f, 0.155614f, 0.900036f, -0.480124f, 0.146117f, + 0.467011f, 0.412525f, 0.312724f, 0.551826f, -0.179601f, + 0.706261f, 0.00674965f, -0.495221f, 0.140829f, -0.0619195f, + -0.0697912f, 0.511967f, -0.0318237f, -0.285946f, -0.28608f, + 0.0894142f, 0.234351f, -0.272328f, -0.350369f, -0.392605f, + 0.287318f, 0.310426f, 0.293524f, 0.357681f, -0.157868f, + 0.149652f, -0.259363f, 0.192941f, -0.850096f, 0.456507f, + 0.387857f, -0.491187f, -0.0541993f, -0.28118f, 0.193991f, + -0.0956664f, 0.0679829f, 0.0341118f, 0.141826f, 0.271538f, + -0.285295f, -0.68666f, 0.306414f, 0.600678f, 0.494801f, + -1.11907f, 0.524849f, 0.151169f, 0.474068f, -0.43441f, + -0.229138f, 0.0345483f, 0.682888f, -0.471534f, -0.0457066f, + -2.36721f, 0.446407f, 0.20396f, -1.17868f, 0.815363f, + -1.13897f, 0.397217f, -0.593796f, -6.95512f, 0.650695f, + 0.771657f, 0.15227f, -0.824519f, 0.617854f, -0.295353f, + -0.101207f, 0.600989f, -0.550653f, -0.722371f, 0.292006f, + -0.451891f, 0.54544f, 0.354278f, 0.0136258f, 0.192003f, + 0.258275f, -0.0443647f, 0.0928186f, 0.667775f, 0.239558f, + 0.0523887f, 0.71586f, 0.292563f, 0.362479f, 0.373453f, + 0.250638f, -0.423037f, -0.486574f, -0.619397f, 0.343888f, + 0.974971f, 0.574218f, 0.273989f, -0.209956f, -0.274333f, + 0.0553766f, 0.263918f, 0.733824f, 0.038713f, -0.0788992f, + 0.292014f, 0.111808f, -0.197507f, 0.593668f, -0.0245337f, + 0.0873662f, 0.530997f, 0.620717f, 0.310697f, -1.54861f, + 1.12915f, 0.0991346f, -0.59214f, 0.422325f, -0.0157936f, + 0.380975f, 0.626403f, 0.268064f, -0.615231f, -1.43172f, + 0.0928048f, 0.0949026f, -0.470912f, -0.0867527f, -0.0381206f, + 0.178393f, -1.13737f, 0.12798f, 0.258214f, -0.803364f, + 0.177506f, 0.542718f, 0.660656f, 0.145091f, 0.183056f, + -0.47338f, 0.469287f, 0.10832f, 0.0994899f, -0.402719f, + 0.157287f, 0.523071f, -0.324493f, 0.343599f, 0.664839f, + -0.0375519f, -0.279238f, -0.0722333f, 0.395344f, -0.289316f, + 0.0259298f, -0.843245f, -0.160021f, 0.741429f, -1.38726f, + -0.2969f, -0.240443f, 0.247731f, -1.04088f, -0.280454f, + -0.237054f, -0.759227f, 0.0456369f, -0.647453f, -1.02372f, + -0.200395f, -0.546839f, -0.104226f, 
-0.152727f, -0.56685f, + -0.0559663f, -0.425494f, -0.610679f, -0.987096f, -0.575138f, + -0.0887979f, 0.463646f, -1.041f, -0.49412f, -0.175298f, + -0.463296f, -0.955177f, 0.17852f, -1.10694f, 0.181991f, + -0.18998f, 0.227818f, 0.688237f, -1.10444f, 0.549108f, + -0.171849f, -0.245614f, 0.120624f, 1.29571f, 0.607116f, + 0.00809927f, 0.1041f, -1.22918f, -0.212948f, 0.430239f, + -1.57341f, 0.482054f, 0.275905f, 0.939785f, -1.0209f, + -0.355534f, 0.397337f, -0.0593077f, -0.239603f, 0.475483f, + -0.999101f, -0.140578f, 1.04787f, -0.591981f, -0.306989f, + -0.879012f, -0.994715f, 0.0343158f, 0.218509f, 0.34704f, + 0.0672934f, -0.178941f, 0.20509f, -0.360031f, 0.161241f, + -0.324775f, -0.359531f, -0.0657085f, -0.864422f, -0.444865f, + 0.597095f, -0.948691f, 0.240001f, -0.783159f, -0.569422f, + 0.974205f, -1.04539f, 0.345915f, -0.681558f, -0.246047f, + 0.256174f, 0.493667f, 0.681324f, 0.155613f, 0.773309f, + -0.647027f, -0.214744f, -0.474202f, -0.661092f, -1.02316f, + 0.0572593f, -0.437082f, -0.119874f, -0.464877f, -0.58067f, + -0.218029f, 0.319516f, -0.378983f, -0.0698695f, 0.554693f, + -0.537875f, 0.126429f, -0.145113f, -0.594312f, -0.218021f, + -0.703569f, 0.0720548f, 0.261054f, -0.81438f, 0.249921f, + 0.165296f, -0.079028f, -0.322647f, 0.134458f, 0.0975046f, + 0.538594f, -0.250126f, 0.142309f, 0.526486f, 0.0532615f, + -0.383332f, -0.38143f, -0.101611f, 0.519776f, -0.278364f, + -0.23287f, -0.29139f, 0.22353f, 0.472085f, 0.366264f, + 0.741187f, 0.42019f, 0.0676459f, -0.230008f + }; + +static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias[] = { + -0.48603f, -0.578556f, 0.257639f, 0.459915f, 0.178156f, -1.16663f, + 0.828891f, 0.620291f, 0.413257f, -1.00508f, -0.574179f, -1.20623f, + -0.377837f, -0.0360333f, 0.681536f, 0.137189f, -0.458718f, 0.387131f, + 0.0233112f, 0.126045f, 0.361304f, 0.655317f, 0.413134f, 0.769947f +}; + +static const float av1_intra_mode_cnn_partition_branch_0_logits_kernel[] = { + 0.67244f, -2.59179f, 0.50425f, -1.86481f, 1.15891f, -1.26447f, + 0.761081f, 0.645117f, -1.78594f, -0.872703f, -0.192054f, -1.82359f, + -0.560935f, 0.838959f, 0.502264f, -1.28958f, -0.205551f, 0.635671f, + -1.12619f, -1.68277f, 0.83361f, 1.57235f, 1.15839f, 0.35345f +}; + +static const float av1_intra_mode_cnn_partition_branch_0_logits_bias[] = { + 1.14463f +}; + +static const float + av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel[] = { + 0.364612f, 0.237868f, -0.192821f, 0.12364f, 0.522205f, + -0.205785f, -0.503288f, -0.426503f, -0.083073f, 0.0164429f, + 0.184278f, -0.426055f, 0.0717997f, -0.261968f, 0.176412f, + -0.101226f, 0.0400285f, -0.332051f, 0.344385f, 0.189565f, + 0.441162f, 0.330462f, -0.719857f, -1.14209f, 0.557831f, + 0.104756f, 0.0562001f, -0.465923f, -0.344592f, -0.191554f, + -0.0656866f, -0.640162f, 0.419388f, 0.409308f, -1.68632f, + -1.10829f, 0.105485f, -0.14561f, -0.944738f, 0.104629f, + -0.146837f, 0.538823f, -0.153157f, 0.321081f, -1.77714f, + -0.0559296f, 0.324136f, -0.497023f, -1.15793f, -0.740144f, + -0.0888472f, 0.010059f, -0.18394f, -0.234405f, -0.10586f, + 0.130958f, -0.101944f, -0.186483f, -0.447049f, -0.900026f, + 0.128444f, 0.401696f, 0.128509f, 0.123778f, 0.062168f, + -0.321755f, -0.0691584f, 0.254468f, -0.115212f, -0.848885f, + 0.817005f, 0.0615853f, 0.153363f, 0.513855f, 0.789225f, + 0.356168f, 0.371613f, 0.269541f, 0.268173f, 0.220481f, + -0.109063f, -0.00620798f, -0.0334622f, 0.236267f, -0.0235294f, + -0.0800253f, 0.0294184f, 0.047131f, -0.224047f, 0.0890737f, + -0.356293f, 0.0989534f, 0.16799f, 0.498266f, 0.612581f, + -0.372897f, 
-0.75125f, 0.77698f, 1.1032f, -0.0764679f, + 0.0266299f, 0.309532f, 0.461305f, 0.0193521f, -0.0939161f, + -0.276156f, -0.102714f, -0.0828328f, 0.40003f, 0.122542f, + 0.0867203f, -0.170738f, 0.0850642f, -0.130762f, 0.082324f, + -0.115218f, -0.0244491f, 0.0434331f, 0.216453f, 0.443733f, + -0.173679f, -0.161617f, 0.316209f, -0.689656f, -1.52007f, + -0.421018f, 0.430833f, -0.00734122f, 0.284499f, -0.0207885f, + 0.0572024f, -0.878942f, 0.388264f, 0.0191589f, -0.123415f, + -0.0461196f, -0.0444461f, -0.00383171f, 0.0945655f, -0.0597219f, + -0.374918f, 0.0182124f, 0.523083f, 0.00519547f, 0.80513f, + -0.221433f, -1.30591f, -0.416917f, -0.718173f, 0.622999f, + 0.941798f, 0.0477536f, 0.0303772f, 0.268078f, 0.414778f, + 0.394325f, 0.299733f, -0.583208f, 0.309379f, 0.416581f, + 0.0299948f, -0.409145f, -0.161557f, -0.214082f, -0.0098119f, + 0.221912f, 0.107135f, 0.0692518f, 0.00490957f, 0.107613f, + -0.368404f, -0.548006f, 0.208274f, 0.550475f, 0.643678f, + -1.65859f, 0.095938f, -0.0434245f, -0.0792685f, 0.838109f, + -0.0138653f, -0.527573f, -0.123472f, -0.235618f, -0.677401f, + -0.125877f, -0.175604f, -0.203196f, 0.113478f, -0.228323f, + -0.53539f, 0.134458f, 0.0534899f, -0.213006f, -0.138679f, + -2.15023f, 0.186303f, 0.48566f, -1.22301f, -0.240982f, + -0.486836f, -0.121181f, -0.131382f, -0.0320283f, 0.278828f, + 0.342581f, -0.182257f, -0.365193f, -0.226351f, 0.108928f, + -0.100159f, 0.448355f, -0.0768947f, 0.0633719f, -0.104786f, + 0.0456653f, 0.0965752f, 0.156403f, -0.157337f, 0.212259f, + 0.317939f, 0.124193f, -0.329475f, 0.206868f, -2.15986f, + -0.108385f, -0.396769f, -0.0317231f, -0.271524f, -0.184697f, + 0.662615f, 0.412926f, -0.0217462f, -0.0285475f, -0.118826f, + 0.0252706f, -0.137091f, 0.198973f, 0.329509f, -0.0831966f, + -0.621237f, 0.0896179f, 0.805261f, -0.019675f, 0.962452f, + 0.307433f, 0.892168f, -0.537587f, -2.46145f, 0.125606f, + 0.920491f, 0.219462f, 0.292765f, -0.748238f, -0.0537239f, + -0.224326f, 0.505492f, 0.176426f, 0.0343168f, 0.16708f, + -0.581393f, 0.951726f, -1.1777f, -0.561914f, -1.53288f, + 0.864567f, -1.19648f, -1.24141f, -0.334688f, -0.622026f, + 0.666876f, -0.197005f, -0.600507f, -0.851924f, 0.492299f, + 0.31078f, -0.0736115f, 0.030999f, -6.02463e-05f, -0.0604341f, + -0.0254238f, 0.139222f, 0.333235f, 0.366534f, -0.191982f, + -0.0156092f, 0.44234f, -0.0193213f, 0.0938745f, -0.015709f, + -0.12043f, 0.00895591f, 0.0464401f, 0.0530699f, -0.623018f, + -1.23372f, -0.538647f, -1.12389f, 0.26742f, 0.548694f, + 0.00540655f, -0.219703f, 0.314894f, -0.573463f, -0.241555f, + 0.441851f, 0.422491f, 0.253785f, -0.384683f, 0.0370165f, + 0.226669f, 0.245587f, 0.215265f, -0.122272f, 0.0492235f, + 0.000658591f, -0.312877f, 0.436487f, -0.229199f, -0.174373f, + 0.904268f, -0.855845f, -0.877293f, -0.65409f, 0.313795f, + 0.461748f, -0.737766f, -0.228523f, 0.182181f, 0.334522f, + 0.0629676f, -0.151087f, 0.178798f, -0.325809f, -0.331672f, + 0.0865837f, -0.0684225f, 0.0252008f, -0.0820631f, 0.0481863f, + 0.209473f, -0.0242151f, -0.0898919f, -0.163828f, -0.164282f, + 0.581888f, 0.816896f, 0.0607674f, 0.364855f, -0.346512f, + -0.764174f, 0.595561f, 0.302872f, 0.206361f, 0.106917f, + -0.972338f, 0.176948f, 0.6415f, -0.131897f, -0.155802f, + 0.216337f, -0.342511f, 0.123743f, -0.123014f, 0.0205439f, + 0.15173f, -0.23801f, -1.00387f, 0.651328f, 0.237439f, + -0.542952f, 1.066f, -0.161107f, -0.593545f, 0.219343f, + -0.178094f, 0.0789992f, 0.428332f, 0.23827f, -0.327421f, + 0.416144f, 0.00394653f, 0.052046f, -0.238289f, 0.405942f, + 0.00141984f, 0.161017f, 0.077111f, 0.0823985f, 0.0981208f, + 0.109949f, 
-0.0428502f, 0.343629f, -0.722978f, -0.375269f, + -0.111634f, -0.271523f, 0.712093f, 0.684904f, -0.572331f + }; + +static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias[] = { + 0.583367f, -0.202004f, -0.207626f, 0.412451f, -0.258311f, 0.0304954f, + -0.102458f, 0.450087f, -0.376851f, -0.338702f, 0.335226f, 0.889072f, + 0.502411f, 0.649282f, 0.15345f, -0.0109896f +}; + +static const float + av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel[] = { + 0.0214882f, -0.934339f, -0.173335f, 0.8362f, -0.764234f, + 0.525163f, 0.409749f, 0.821539f, -0.784157f, -0.455593f, + 0.446099f, 0.406756f, 0.479242f, -0.814038f, -0.419332f, + 0.328869f, -0.340707f, 0.133219f, 0.0320347f, 0.25089f, + -0.324917f, -0.0684265f, 0.0377777f, -0.262556f, 0.673458f, + -0.0291454f, -0.417957f, -1.0075f, -0.481537f, 0.922105f, + -0.000516239f, -0.40034f, 0.242067f, -0.43178f, 0.32001f, + 0.143599f, -0.345172f, 0.126093f, 0.148518f, -1.12151f, + -1.03435f, 0.551691f, -0.310001f, -0.323194f, -0.595128f, + -0.395689f, 0.737268f, -0.729227f, 0.590804f, -0.590022f, + -1.01427f, -0.521159f, -0.617579f, 1.07292f, -0.613047f, + -0.619093f, 0.335268f, 0.473753f, -0.795027f, 1.24635f, + -0.556193f, 0.241046f, -0.0354181f, -0.354215f, 0.716752f, + -0.00200745f, -1.25171f, -0.440731f, -0.763918f, -0.588614f, + -0.183901f, -0.396056f, 0.226903f, 0.921471f, 1.10465f, + 0.207053f, 0.57681f, -0.555699f, 0.235469f, -0.92149f, + 0.625808f, 0.29653f, -0.81775f, -0.307889f, -1.41384f, + -0.136205f, -0.365314f, -0.516741f, 0.748052f, 0.617947f, + 0.0973239f, 0.839607f, 0.530668f, -0.227032f, -0.449044f, + -1.04725f, -0.244363f, -0.396888f, -0.146161f, 0.359789f, + 0.0436599f, 1.21645f, -0.336069f, 0.0534646f, -0.00200328f, + 0.658551f, -0.156142f, -1.0728f, 0.0951015f, 0.234837f, + -0.380525f, 0.041783f, -0.269273f, 0.0386013f, -0.455589f, + -0.174338f, 0.0345251f, 0.17116f, -0.507642f, 0.210453f, + 0.739987f, -0.0438776f, 0.570145f, -0.118811f, 0.0548662f, + 0.153458f, -0.89887f, 0.493704f, 0.283351f, 0.785441f, + -0.586002f, -0.0616167f, -0.714328f, -0.145941f, -0.449656f, + 0.850117f, 0.279997f, 0.204143f, -0.31356f, 0.947057f, + -0.135787f, 0.747071f, 0.0145968f, -0.81414f, 0.431009f, + -0.275824f, -0.342928f, -0.0528272f, -0.592183f, 0.433915f, + -0.251752f, -0.311815f, -1.47533f, -1.43677f, 0.0698436f, + 1.01341f, 0.305063f, -0.252003f, -0.428915f, -0.00104153f, + -0.368267f, -0.354523f, -0.27956f, -0.771664f, 0.232092f, + -0.428495f, 0.424952f, -0.343229f, 0.196899f, -0.761084f, + -0.0110293f, -0.335361f, 0.571637f, -0.423489f, -0.52773f, + 0.0108043f, -0.504715f, -1.1419f, -0.402904f, -0.160747f, + -0.329184f, 0.375374f, -1.02604f, -0.601371f, 0.631652f, + 0.0742486f, -0.464765f, 0.467445f, 0.240562f, -0.38211f, + -0.459004f, 0.704196f, 0.021357f, 0.860785f, -1.16731f, + -0.479029f, -0.139644f, -0.444087f, 0.322326f, -0.25455f, + 0.874399f, 0.477696f, 0.0464487f, 1.20658f, 0.0993356f, + 0.00682712f, -0.10163f, -0.371765f, -0.629513f, -0.679196f, + -0.193935f, 0.47405f, -0.18238f, 0.254918f, -0.35306f, + -0.375611f, 0.119771f, -0.257282f, -0.565124f, 0.162667f, + -0.356128f, 0.870351f, 0.241847f, -0.264712f, -0.384322f, + 0.31807f, 0.211621f, -0.180767f, 0.764944f, 0.368646f, + 0.186111f, 1.02458f, -0.494252f, -0.483375f, -0.699664f, + 0.00415657f, -0.189376f, -0.677103f, -0.030319f, 0.667087f, + 0.810951f, -0.488237f, -0.387355f, -0.726579f, -0.304763f, + 1.10392f, -0.775977f, -0.247731f, 0.532396f, 1.24089f, + 0.206621f, -0.670568f, -1.08142f, -0.342503f, 0.189854f, + -0.200846f, 0.784204f, 0.641112f, 
-0.509346f, 0.0805264f, + -1.40006f, 0.322084f, -0.823739f, -1.12965f, -0.215668f, + 0.099673f, 0.425966f, 0.771697f, 0.338834f, 0.345364f, + -0.297826f, -0.176746f, -0.297299f, -1.80029f, -0.178348f, + 0.421194f, -0.19155f, 0.417653f, 0.374441f, -0.135654f, + -0.895843f, 0.220647f, 0.368264f, 0.369233f, 0.382707f, + 0.0800511f, 0.542053f, 0.318896f, -0.385539f, 0.313305f, + -1.01166f, -0.222379f, -1.53708f, 1.32407f, -0.665444f, + -0.102348f, 0.0410504f, -0.616825f, 1.3108f, 0.405902f, + 1.27777f, 0.0630558f, -0.172696f, 0.16224f, -1.10111f, + -3.31326f, -0.242566f, 0.831422f, 0.917397f, 0.311749f, + -0.238613f, 0.438007f, -0.407089f, -0.0202555f, -1.82502f, + -0.907965f, -0.300031f, -0.616669f, -0.767921f, 0.285919f, + -0.112019f, 0.252677f, 0.350892f, 0.000214244f, 0.315915f, + 0.260344f, 0.327362f, -0.0211213f, -0.41241f, 0.0418355f, + 0.103328f, -0.0158439f, -0.230505f, -0.0215114f, 0.266739f, + -0.234376f, -0.352583f, 0.0709437f, -0.90649f, -0.535843f, + 1.21322f, -1.05144f, -0.983682f, -0.189956f, 1.14208f, + -0.0188492f, -0.254821f, -0.463214f, -0.708714f, 0.0447348f, + -0.220831f, 0.476299f, 0.102544f, 1.1173f, -0.36981f, + -0.814102f, 0.103604f, -0.247871f, 0.0610701f, -0.356616f, + -0.144093f, 1.66496f, 0.180206f, -1.04384f, -0.65883f, + 0.0290771f, -0.622728f, 0.761523f, -0.909091f, -0.0340348f, + 0.666895f, -0.0232575f, 0.962643f, -2.50103f, -1.69745f, + -0.0482305f, 0.771811f, -1.32233f, -0.778722f, -0.203309f, + 0.395875f, -0.171812f, 0.253794f, 0.432799f + }; + +static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias[] = { + -0.152159f, 0.552347f, -0.806068f, 0.227901f, 0.335896f, 0.180785f, + 0.75277f, 0.982208f, 0.409823f, -0.17755f, -0.125365f, 0.738114f, + 0.202331f, 0.751737f, -0.360511f, 0.149254f, 0.085073f, -0.214542f, + 0.529727f, -0.0348777f, -2.13162f, -0.893332f, -0.136952f, -0.71258f +}; + +static const float av1_intra_mode_cnn_partition_branch_1_logits_kernel[] = { + -0.632145f, 0.738727f, -0.750737f, -0.931571f, -1.79763f, -2.31153f, + 0.912733f, 0.879995f, -1.00602f, -1.02467f, 0.0536835f, 1.76011f, + -0.898546f, 1.06959f, 1.60471f, -1.7312f, -0.877168f, -0.681185f, + -1.57286f, -1.16038f, -4.11303f, -3.06351f, -3.02536f, -2.92186f +}; + +static const float av1_intra_mode_cnn_partition_branch_1_logits_bias[] = { + 1.33207f +}; + +static const float + av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel[] = { + 0.0419551f, 0.0924078f, -0.153084f, 0.191642f, 0.069586f, + -0.530661f, 0.431968f, 0.000453838f, 0.793047f, 0.0161817f, + -0.476075f, -0.156638f, -0.219066f, 0.372716f, -0.0642299f, + 0.156813f, -0.105819f, -0.0519422f, 0.149935f, 0.295544f, + 0.192037f, -0.0450383f, 0.828794f, -0.0510661f, -1.22549f, + -0.100293f, -0.178274f, 0.0304427f, -0.0664097f, -0.0438936f, + 0.948248f, 0.425486f, -0.238206f, 1.3744f, 0.336897f, + 0.0760769f, -0.583508f, 0.0735519f, -0.117024f, 0.0501598f, + 0.332212f, 0.199531f, 0.424764f, 0.206712f, 0.342868f, + 0.592673f, -0.0961148f, -0.190113f, -0.155027f, 0.00789871f, + -0.0514839f, -0.416154f, -0.290309f, 0.407541f, 0.48534f, + 0.126564f, 0.0709566f, -0.0469664f, 0.735403f, -0.365963f, + 0.150295f, -0.50147f, 0.021383f, 0.76514f, 0.0085721f, + -0.416384f, 1.22268f, 0.0832438f, 0.367813f, -0.12012f, + 0.823183f, -0.0525972f, -0.325526f, -0.0983032f, 0.370128f, + 0.368778f, 0.138971f, -0.0397997f, 0.411058f, -0.0400404f, + 0.588437f, -0.29963f, -0.107992f, -1.75238f, -0.274387f, + 0.430418f, 0.495152f, 0.283172f, -0.441166f, 0.195339f, + -0.436182f, -0.252613f, 0.176204f, -0.126541f, -0.474833f, + 
-0.0721603f, -0.496599f, -0.0608464f, 0.0333451f, -0.0621485f, + 0.0843859f, 0.0637854f, -0.145291f, 0.14876f, 0.181665f, + -0.675805f, 0.294903f, 0.301118f, -0.225957f, 0.0105897f, + -0.136427f, -0.555925f, -0.158853f, -0.216779f, 0.0612481f, + -0.107158f, 0.352451f, 0.140536f, -0.0148237f, 0.189371f, + -0.091046f, -0.0476226f, 0.366054f, -0.0723413f, 0.389883f, + -0.0213411f, 0.0279539f, 0.194827f, -0.271502f, -0.166474f, + 0.0690549f, 0.0584665f, 0.0198415f, -0.442348f, 0.1571f, + -0.113463f, -0.16822f, -0.0580659f, -0.13441f, -0.0022386f, + 0.251521f, -0.160494f, -0.0753547f, 0.0897289f, 0.137917f, + 0.129836f, 0.0816833f, -0.626288f, 0.0643293f, -1.20001f, + 0.085631f, -0.195602f, 0.251244f, 0.0321744f, 0.0493178f, + -0.220616f, 0.724075f, -0.00831514f, 2.00319f, 0.407932f, + 0.0710799f, -0.166128f, 0.0126611f, -0.229644f, -0.0984299f, + 0.632041f, -0.0946141f, 0.295315f, 0.100934f, 0.184883f, + -0.236173f, 0.158081f, 0.195775f, 0.413542f, 0.789801f, + 0.767741f, 0.166275f, -0.348271f, -0.384074f, -0.291648f, + -0.119899f, 0.0368354f, 0.0751987f, 1.04217f, -0.159002f, + -2.71592f, -0.788502f, -1.06268f, 0.536057f, 0.0575876f, + 1.06811f, 0.12033f, 0.198578f, -0.0419196f, 0.0631388f, + 0.623138f, -0.142226f, 1.33129f, 0.0868059f, -0.0287825f, + 0.139378f, -0.143037f, 0.307452f, 0.0363987f, -0.0976368f, + 0.040544f, 0.0269327f, -0.0845524f, 0.0674699f, 0.104501f, + -0.0351155f, 0.167071f, 0.00986971f, 0.10284f, 0.0300016f, + 0.192601f, 0.0397177f, 0.0251346f, -0.00912908f, -0.0452825f, + 0.0164356f, -0.0275149f, 0.194846f, 0.0943608f, 1.61674f, + 0.0124345f, 0.523787f, 0.0397258f, -0.17208f, -0.147808f, + -1.23583f, 0.676385f, 0.551994f, 0.0233041f, 0.0116391f, + -0.466706f, 0.154725f, -0.207371f, 0.606662f, 0.247286f, + 0.31216f, 0.173765f, -0.268033f, 0.224422f, 0.314649f, + 0.481922f, -0.190604f, -0.0129162f, 0.270552f, 0.135195f, + 0.0927735f, -0.226099f, 0.53897f, 0.103309f, -0.0257271f, + -0.0246776f, 0.442013f, -0.179246f, -1.02581f, 0.206176f, + -0.326365f, 0.391623f, -0.103549f, 0.115645f, 0.0269328f, + -0.584517f, -0.237502f, 0.157996f, 0.0447407f, -0.161f, + -0.126072f, -0.148967f, -0.416347f, 0.0236496f, -1.12612f, + 0.0120709f, -0.00979376f, 0.0507126f, -0.172262f, 0.0697059f, + -0.212334f, 0.335731f, -0.0301362f, -0.839583f, -0.238539f, + 0.0636752f, -0.0467217f, -0.0372118f, -0.144615f, -0.161773f, + -0.648242f, 0.158197f, -0.051471f, -0.0615805f, -0.0426936f, + -0.0745554f, 0.358975f, 0.358297f, 0.0568553f, -1.14383f, + -0.103955f, 0.728194f, -0.224945f, -0.31659f, -0.204458f, + 0.171763f, -0.465666f, 0.899234f, -0.37042f, -0.0894774f, + 0.11478f, -0.334957f, 0.0896514f, 0.413251f, 0.359471f, + 1.41597f, 0.558082f, 0.153486f, 0.0270558f, -0.0178797f, + 0.124983f, -0.12273f, -1.04516f, -0.125375f, 0.370336f, + -0.209423f, -0.36816f, -0.66077f, -0.0180773f, -0.628921f, + -0.178542f, 0.0346841f, 0.0319309f, -0.470138f, 0.172763f, + 0.0798846f, -0.259737f, -0.652461f, -0.386283f, -0.474447f, + -0.924054f, -0.0154613f, -0.613712f, -0.138068f, -0.337842f, + 0.217921f, -0.0711405f, 0.000404091f, -0.703766f, 0.0364683f, + 0.150173f, 0.0126249f, 0.170594f, 0.0371879f, -0.0862515f, + -0.23454f, -0.0144143f, 0.164947f, 0.45591f, 0.115703f, + 0.069752f, -0.011993f, 0.0402097f, 0.00697581f, 0.0811613f, + 0.384752f, 0.341977f, 0.06087f, 0.0590107f, 0.00812679f, + 0.121211f, -0.0612108f, 0.167851f, 0.195781f, -1.62162f, + 0.336292f, -0.0772523f, -0.310786f, 0.188257f, -0.0325804f, + -0.240098f, 0.158748f, -0.265264f, 3.19593f, -0.449251f, + -1.33102f, -0.482856f, -0.435731f, 0.300808f, 
0.346503f, + 2.67378f, -0.152379f, 0.219322f, -0.146119f, -0.0584806f, + -0.0276895f, -0.21955f, -0.479179f, -0.689545f, 0.152799f + }; + +static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias[] = { + -0.296575f, 0.101072f, -0.208429f, 0.111585f, 0.699552f, -0.379484f, + 0.313244f, -0.746369f, 0.867757f, 0.457318f, -0.0190943f, -0.290745f, + 0.45592f, -0.160465f, -0.634243f, 0.0829737f +}; + +static const float + av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel[] = { + 0.27511f, -2.14172f, 1.25755f, -0.554772f, 0.589508f, + 0.228307f, 0.0754914f, 1.07061f, 0.293323f, 0.65162f, + -0.272016f, -1.33519f, -0.606759f, -0.57827f, 0.368807f, + -1.48668f, 0.162439f, 0.0821667f, 0.225535f, -0.795996f, + 0.0328293f, 0.975476f, -0.187514f, 2.47069f, -1.5638f, + -0.461524f, 0.00310062f, 1.1556f, -0.286206f, 0.00426021f, + 0.585836f, 0.900007f, 0.384055f, 0.189435f, -0.157291f, + -0.0710573f, -0.0663986f, -0.710772f, -0.669136f, -0.379493f, + -1.2634f, -0.377524f, 0.824094f, 0.312308f, 0.125368f, + -0.382737f, 0.637109f, 0.61907f, -0.741184f, 0.00257198f, + -0.0151343f, -0.669826f, -0.439855f, 0.564852f, -0.0588036f, + -1.38123f, -1.1126f, 0.701831f, 0.198686f, 0.266866f, + 0.270172f, -0.692401f, 0.272533f, -1.70914f, 0.66064f, + 0.0886659f, -0.132233f, 0.270531f, -0.479581f, 0.704338f, + -0.307039f, -0.111792f, -2.05753f, -0.231749f, 0.300528f, + 0.383266f, -0.130857f, -0.373944f, 1.21025f, 0.704655f, + -0.589422f, 0.267185f, -0.109065f, -0.195991f, 0.20209f, + -0.0676526f, -0.183926f, 0.164894f, 0.0877923f, 0.565943f, + -0.0610466f, -0.86354f, -0.80853f, -0.176111f, -1.45016f, + -2.29078f, -0.124524f, -0.139305f, -0.187858f, -0.0250151f, + -0.572544f, 0.185336f, -0.69275f, -0.430354f, -0.30861f, + -0.754258f, -0.468221f, -0.160487f, -0.766692f, -0.636418f, + -0.71016f, 0.576125f, -0.240476f, -0.954556f, -0.104693f, + 0.155557f, -0.840224f, -0.685457f, -0.0346927f, -0.644882f, + -1.92475f, -0.314544f, 0.463569f, 0.323569f, -0.990124f, + -0.213658f, 0.407183f, 1.19797f, -4.77004f, -0.0613379f, + -2.40345f, -0.0591791f, -0.477622f, -0.303556f, 0.104077f, + -0.974128f, -0.035172f, 1.47064f, 0.233727f, -0.0754056f, + 0.158553f, 0.0614361f, -1.38865f, 0.690729f, 0.568455f, + 0.205866f, -0.0236852f, -0.0921077f, -0.538954f, 0.336613f, + -0.427115f, 0.791754f, -1.819f, -0.404432f, 0.670242f, + -0.0343869f, -0.37191f, 0.0271262f, 0.988161f, -0.547343f, + 0.925304f, 0.548079f, -0.430343f, -0.214109f, 0.242013f, + 1.39027f, 0.37648f, -1.63524f, -0.158864f, -0.572779f, + -0.766801f, -2.62032f, 0.47799f, -1.12025f, -0.115283f, + 1.22349f, -0.262132f, -0.151274f, 0.390483f, -0.496482f, + 1.06166f, -0.183052f, 0.54647f, 0.847486f, 0.0229506f, + 0.653309f, -0.020736f, -1.27453f, 0.48386f, -0.366625f, + -0.515725f, -1.31196f, 0.140701f, -0.183636f, 0.000413912f, + 0.300993f, -0.849529f, -0.59764f, -0.212992f, -0.933365f, + -1.4054f, -0.091982f, 0.41695f, 0.264004f, -0.26379f, + -0.0738219f, 0.434052f, 1.16617f, -0.639624f, -0.146465f, + 0.0409936f, -0.900182f, 0.73517f, 0.805746f, -0.208088f, + 1.74459f, -0.0592751f, 0.624865f, -0.62325f, -0.446315f, + 0.150526f, 0.0526697f, 0.374254f, -0.658043f, 1.02623f, + -0.941758f, 0.381217f, -0.359448f, 0.160051f, 0.556455f, + 0.239382f, 0.75851f, 0.437583f, -0.122221f, 0.746136f, + 0.218286f, -0.426729f, 0.0353903f, -0.830513f, -0.877586f, + 0.488077f, -0.132354f, -0.180756f, 0.736163f, -0.202934f, + -0.882534f, 0.166305f, 0.183122f, 0.0599858f, 0.442687f, + 0.0522908f, -1.17755f, -1.03733f, 0.392363f, 0.672718f, + -1.44704f, 0.360623f, 0.390298f, 
-0.213968f, 0.169783f, + -0.717536f, -0.830984f, -0.445049f, 0.196772f, -0.730634f, + -1.09497f, 0.344012f, -0.292802f, -0.67966f, 0.138515f, + -0.361803f, 0.936778f, -0.189802f, 0.197777f, -0.367507f, + -0.293653f, 0.447759f, -0.409245f, -0.687568f, -0.431301f, + -0.271234f, -0.585413f, -0.936414f, -0.396049f, -0.29388f, + -0.0930843f, 0.0179339f, 0.262463f, -0.166598f, 0.0171466f, + -0.329641f, 0.39343f, 0.657445f, -0.579052f, -0.312444f, + -0.0915881f, -0.432622f, -0.247645f, 0.485749f, -0.602508f, + -0.347936f, 0.287353f, 0.288705f, 0.168397f, 0.568228f, + -0.493586f, 1.04155f, -0.097956f, 0.658928f, -0.561007f, + 0.0457783f, 2.12744f, 0.182683f, -0.690282f, 0.183302f, + 0.0309499f, -0.722251f, 0.0660448f, -0.333277f, 0.198929f, + -0.724102f, -0.405597f, 0.614868f, -0.292862f, 0.886513f, + 0.142353f, -1.48934f, -0.97273f, 0.199683f, 0.522121f, + 0.0877478f, -0.172593f, -1.58858f, 0.113191f, -0.436178f, + 0.640895f, -0.504676f, 0.0658654f, -0.361301f, 0.604323f, + 0.315196f, -0.423021f, -0.323484f, -0.563163f, 0.118989f, + -0.404508f, -0.0550995f, -0.0359236f, -0.126574f, -0.357288f, + -0.0494502f, 1.04959f, -0.31646f, -0.0376684f, -0.300744f, + -0.135016f, 0.102696f, -0.392333f, -1.17502f, 0.505227f, + 0.337608f, -0.348831f, -0.420815f, 0.202791f, -0.154264f, + -0.563686f, 0.0942187f, 0.353862f, 0.0303509f, -0.132794f, + 0.420746f, 0.143529f, 0.455822f, -1.28348f, -1.35662f, + -0.850688f, -1.76361f, -0.717546f, 0.443111f, 0.227155f, + -0.863307f, -0.452033f, -0.278151f, 1.86233f + }; + +static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias[] = { + -0.103218f, -0.359587f, 0.619666f, -0.473497f, -0.649803f, 0.86992f, + -0.115561f, 0.335114f, -0.285044f, -0.59295f, 0.24497f, 0.611583f, + 0.38568f, 0.137913f, -0.281191f, -0.0107777f, 0.487236f, -0.262363f, + 0.696962f, 0.121565f, 0.312511f, 0.430916f, 0.694134f, 0.393632f +}; + +static const float av1_intra_mode_cnn_partition_branch_2_logits_kernel[] = { + -2.42496f, -1.239f, 0.832673f, 1.56923f, -2.6175f, -1.42492f, + -0.311387f, -1.94237f, 0.54071f, -2.50391f, 0.352205f, -0.96572f, + 1.47144f, -2.04702f, -1.12372f, -0.709186f, 0.812238f, 0.310389f, + 0.789163f, -0.65236f, 1.77018f, 0.273867f, 1.19506f, 1.07022f +}; + +static const float av1_intra_mode_cnn_partition_branch_2_logits_bias[] = { + 0.953424f +}; + +static const float + av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel[] = { + 0.0485154f, 0.0496279f, 0.0268229f, -0.0584843f, -0.166928f, + 0.0316731f, -0.0895094f, -0.0433243f, -0.00893639f, -0.0886265f, + -0.0345622f, -0.235395f, -0.213754f, -0.00212398f, 0.0218857f, + -0.0054983f, -0.0248236f, 0.081822f, -0.0355708f, -0.0795593f, + -0.106995f, -0.0596378f, 0.0350686f, -0.133863f, -0.00582928f, + 0.114963f, 0.193906f, -0.00419085f, 0.0430529f, -0.128318f, + 0.0614715f, -0.000952935f, -0.0345722f, -0.109459f, 0.074204f, + -0.0865131f, 0.0649158f, -0.0942417f, -0.10122f, -0.047551f, + -1.27825f, -0.0125456f, -0.019722f, -0.152058f, 0.280306f, + -0.121231f, -0.0565484f, 0.0959188f, 0.0603919f, 0.0457468f, + 0.967589f, 0.105892f, -0.118326f, 0.198933f, 0.163437f, + -0.056824f, -0.0302956f, -0.07366f, -0.681407f, -0.0781575f, + 0.255732f, -0.0712105f, 0.177882f, 0.709206f, -0.232457f, + 1.33809f, -0.0328557f, 0.0572231f, -1.01361f, 0.130676f, + -0.205159f, 0.975398f, 0.356293f, 0.0766364f, -0.297397f, + -0.0261066f, -0.0933549f, 0.0568851f, -0.0123034f, -0.0433538f, + 0.131003f, 0.890705f, 0.0084565f, 0.00547395f, 0.00157634f, + 0.0047937f, -0.0511092f, 0.0300034f, -0.00604993f, -0.0133502f, + 
-0.000274302f, 0.129728f, -0.00532916f, 0.0855351f, 0.136885f, + 0.0175562f, -0.0123633f, -0.000512229f, -0.019924f, -0.0316328f, + 0.422972f, 0.0460336f, 0.0170841f, -0.00086795f, -0.0655137f, + 0.0287308f, -0.0375644f, -0.0329215f, -0.0273072f, 0.0241426f, + -0.0429052f, 0.0221593f, -0.063881f, -0.0347391f, -6.44339e-07f, + 0.0476934f, -0.0150068f, 0.0146403f, -0.0653099f, 0.0107635f, + 0.012407f, 0.0048935f, 1.50975f, 0.322256f, 0.17881f, + 0.0943775f, -0.100583f, -0.367022f, -0.156525f, -0.0397161f, + 0.0752784f, -0.00219022f, -0.887456f, 0.0153415f, -0.0148185f, + -0.56435f, 0.163996f, -0.0221024f, -0.0115872f, -0.0529284f, + 0.156838f, -1.13813f, -0.207863f, -0.00484959f, 0.135719f, + 0.131004f, 0.0417939f, 0.31453f, 0.121719f, -0.101515f, + 0.267951f, 0.219727f, 0.0398821f, 0.0713504f, 3.65918e-06f, + -0.00659998f, 0.477343f, -0.128426f, 0.0648877f, 0.111884f, + 0.224552f, 0.0617426f, 0.117742f, 0.031377f, 0.0586865f, + -0.459293f, 0.100211f, -0.14127f, 0.624412f, 0.014659f, + -1.41807f, -0.382452f, -0.695931f, -0.103153f, 0.145808f, + 0.333526f, -0.256367f, 0.096842f, 0.102458f, -0.181224f, + 0.729272f, 0.151177f, 1.46729f, 0.111044f, -4.28813f, + 0.0178379f, 0.47641f, -6.57533f, 0.0633335f, 0.496934f, + -0.154657f, -9.07298e-05f, 0.848937f, -5.40143f, 0.375685f, + 0.23586f, -0.166591f, -0.0191648f, -0.039862f, -3.25093f, + 0.168472f, -0.260317f, -5.51548f, 0.0575334f, 0.328979f, + 0.112644f, 0.231339f, -0.122641f, 0.0567331f, 1.19541f, + -0.038735f, 0.0630576f, 0.176668f, 0.0757184f, -0.833104f, + 0.133669f, 0.982669f, 0.0311783f, 0.0908558f, -0.10065f, + -0.0386599f, -0.231587f, -0.83876f, -0.347148f, 0.225529f, + -1.29625f, 0.0806834f, 0.369648f, -1.63367f, 0.118057f, + -0.311948f, 0.95022f, -0.354807f, -0.648657f, -1.72048f, + 0.260397f, 0.915555f, 0.057737f, -0.162019f, -0.453543f, + -1.70388f, -0.311632f, -0.731593f, -0.678089f, 0.10438f, + -0.293911f, 0.144864f, 0.039212f, 0.0289241f, -0.0685266f, + 0.634592f, -0.0798614f, -0.119197f, -0.00517433f, -0.04653f, + -0.127568f, -0.0582645f, 0.0735302f, -0.0946823f, 0.00865585f, + 0.0115748f, 0.0194847f, 0.0455664f, 0.181006f, -0.0824601f, + 0.0869093f, 0.264767f, -0.0750432f, 0.135136f, 0.316511f, + 0.399015f, 0.0994808f, -0.166944f, -0.102126f, 0.457858f, + 0.300488f, 0.467582f, 0.830244f, -0.0511439f, -0.522892f, + -0.183049f, 0.2626f, 0.118382f, 0.241674f, 0.250399f, + -0.0963507f, -0.83231f, -0.227699f, -0.133314f, 0.231718f, + -0.0700274f, 0.891311f, 0.224742f, -0.572836f, 0.402798f, + -0.191576f, 0.740922f, -0.00374073f, 0.658178f, -0.209364f, + -0.416259f, 0.166297f, 0.0095577f, -0.0876076f, 0.424954f, + 0.265226f, -0.129343f, -0.203146f, -0.194637f, -0.818142f, + -0.164152f, -0.368962f, 0.273373f, 0.599927f, -0.19859f, + 0.0939651f, -0.12458f, -0.751816f, -0.302997f, -0.139176f, + -0.372737f, 0.332704f, -0.206045f, -0.00593763f, -0.452363f, + -0.2704f, -0.198846f, 0.0976308f, -0.216124f, 0.110122f, + -0.220342f, 0.00763426f, -0.0272775f, -0.190395f, -0.0359411f, + -0.0395759f, 0.000941162f, -1.49959f, 0.0914233f, 0.448346f, + -0.420435f, -0.0102102f, -0.0757978f, -0.0177687f, -0.0231492f, + -0.142125f, 1.31774f, 0.0269368f, 0.134566f, 0.152079f, + -0.139933f, 0.139226f, -0.214467f, -0.194446f, -0.555893f, + 0.271197f, -0.111047f, 0.0888069f, -0.198121f, 0.0871713f, + 0.100612f, 0.429782f, -0.3787f, 0.123147f, -0.12538f, + 0.235678f, 0.139237f, 0.223326f, 0.85806f, -0.00554756f, + 0.285095f, 0.0954683f, 0.0464989f, 0.100806f, -0.0211297f, + 0.121672f, 0.242473f, 0.0810475f, -0.834356f, 0.119629f, + 0.111338f, -0.227126f, 
0.159296f, -0.0584685f, -0.108265f, + -0.0909221f, -0.21749f, 0.0929309f, -0.176815f, 0.178067f, + -0.0025905f, 0.317883f, 0.313045f, 0.26774f, -0.589329f, + -1.19882f, -0.285513f, -0.109478f, 0.309441f, -0.0604479f, + 0.947461f, -0.142342f, -0.9086f, -0.814788f, 0.184588f, + -0.0736317f, 0.276237f, 0.13132f, -0.3931f, -0.381744f, + -0.0122719f, 0.0246101f, -0.0920412f, 0.11331f, -0.110355f, + 0.00848064f, 0.0931248f, -0.0638655f, -4.30869e-05f, -0.300367f, + 0.0489508f, 0.464441f, -0.0466243f, -0.0137732f, 0.0099241f, + -0.223972f, 0.188966f, -0.653173f, -0.354322f, 0.189237f, + -0.624276f, -1.46218f, -0.075161f, -0.516172f, 0.40993f, + 0.291178f, -1.95088f, -0.0352157f, 0.196354f, -0.335897f, + 0.0857039f, 0.605319f, -1.12923f, -0.638387f, 1.41868f, + 0.0955757f, -0.00913477f, 0.315935f, -0.671223f, -0.851436f, + -0.157464f, -0.296763f, 0.182277f, -0.139309f, 0.232789f, + 0.869562f, 0.248894f, 0.242709f, 0.195479f, 0.106153f, + 0.358881f, 0.167443f, 0.982987f, 0.104767f, -0.033925f, + -0.0263185f, 0.0045304f, 0.0722479f, -0.111307f, 0.00128896f, + 0.406128f, -0.00944947f, 0.121592f, 0.546284f, -0.00175696f, + 0.776588f, 0.238846f, 0.064469f, 0.27082f, 0.269187f, + 0.0294455f, 0.62364f, -0.27872f, -0.0488013f, 0.229024f, + 0.154457f, 0.0445898f, 0.349943f, 0.0710998f, 0.0820674f, + 0.0279449f, 0.172826f, -0.122156f, -0.164688f, 0.0292124f, + 0.0496112f, -0.741762f, 0.0673926f, 0.108159f, -0.0942327f, + -0.0562883f, 0.558231f, 0.0552399f, 0.211393f, 0.0376817f, + -0.275788f, 0.0548436f, 0.212732f, 0.163603f, 0.0663363f, + -0.0252315f, 0.164533f, 0.0826088f, 0.0301389f, 0.345705f, + -0.0378046f, -0.139581f, 1.30162f, 1.23551f, -0.446693f, + 0.682534f, -0.0831157f, -0.0121595f, 1.50505f, 0.0839017f, + -0.953413f, 0.0820985f, -0.125556f, 0.699796f, -0.140453f, + 0.168438f, -0.110966f, 0.173806f, 0.114683f, 0.132502f, + -0.0453539f, -0.133096f, 0.511947f, -0.180657f, -0.0298605f, + 0.291437f, -0.0275017f, -0.229703f, -0.0504205f, 0.559622f, + 0.384601f, 0.111024f, -0.0773559f, -0.0591752f, -0.0866182f, + -0.189437f, -0.262345f, -0.0372182f, 0.149925f, 0.154644f, + -0.188298f, 0.236949f, -0.199328f, -0.378909f, -0.680128f, + 0.277184f, -0.172784f, 0.184717f, -0.23899f, 0.0712069f, + 0.0235425f, 0.4225f, -0.441487f, 0.177434f, -0.298303f, + 0.295696f, 0.17346f, 0.220542f, -0.680116f, 0.00266223f, + -0.0408459f, -0.15486f, 0.24335f, 0.237258f, -0.0283245f, + 0.19703f, -0.100027f, 0.0554843f, -1.03081f, 0.151745f, + 0.538582f, 0.370368f, 0.196683f, 0.0222123f, -0.0831401f, + -0.0832803f, -0.286743f, -0.686003f, 0.0995004f, 0.148901f, + -0.0436037f, -0.316508f, 0.00391835f, -0.228452f, 0.940058f, + 0.520047f, -0.334211f, 0.652142f, -0.0755971f, 0.0965123f, + -0.98191f, 0.394096f, -0.420466f, 0.327284f, -0.134651f, + 0.849297f, -0.523372f, 0.010327f, 0.133636f, 0.298119f, + -0.257389f, 0.0376153f, -0.198298f, 0.0736235f, 0.608809f, + 0.0291836f, -0.290005f, -0.141316f, 0.0184599f, 0.0554437f, + 0.0621519f, 0.485276f, 0.617062f, -0.0924811f, -0.0120834f, + 0.0817611f, 0.100421f, -0.0153553f, -0.135958f, -0.0185322f, + -0.395803f, -0.204862f, 0.547916f, -0.438117f, 0.0229788f, + 0.406981f, 0.795584f, -2.02756f, -0.8355f, -0.386789f, + 0.00968368f, 1.2147f, -0.740869f, -1.18415f, -0.954918f, + -0.541142f, 0.0596003f, 0.107189f, -0.411708f, -0.964593f, + 0.511906f + }; + +static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias[] = { + -0.485545f, 0.131552f, 0.796833f, -0.157582f, -0.0948124f, 0.00818613f, + -0.485562f, 0.3826f, -0.0839326f, 0.170998f, 0.279545f, -0.287143f, + 
0.184986f, -0.0719864f, 0.19748f, 0.404145f +}; + +static const float + av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel[] = { + 1.30172f, 0.720189f, 0.261675f, -0.466201f, 1.21773f, + 0.495525f, 0.62398f, 0.44567f, -0.330993f, -0.269798f, + 0.835161f, -0.294874f, 0.186981f, 0.0162467f, 0.367654f, + 0.658468f, 1.08325f, 1.01558f, 0.12783f, -0.280581f, + 2.2204f, 0.0337286f, -0.403649f, -0.230908f, -0.35188f, + 0.437712f, -0.103634f, -0.645929f, 1.17407f, 0.157385f, + 0.212438f, 1.41874f, 0.284242f, -0.493105f, 1.0703f, + 0.00632116f, 1.18222f, -0.26003f, 0.276795f, -0.823156f, + 0.29577f, -0.157467f, -0.18092f, 0.0237336f, 0.205715f, + -0.295679f, 0.165443f, -0.628279f, 1.00804f, 0.361232f, + 0.646155f, -0.028651f, 1.64317f, 0.334251f, -1.50713f, + -1.51685f, -0.488522f, 0.169694f, -0.593176f, -0.372682f, + -1.50223f, 0.35076f, -0.24641f, -0.237189f, 0.190502f, + -0.948191f, -0.303346f, 0.45108f, -0.794368f, -2.3116f, + 0.404008f, -2.67269f, -0.941992f, -0.45336f, 0.0655987f, + -0.288432f, 0.106068f, 0.286978f, 0.121403f, 0.462739f, + 0.0130292f, 0.240597f, -2.30983f, -0.453309f, -0.149335f, + 0.856424f, -0.186576f, 0.769961f, -0.0657097f, -0.976188f, + 0.972971f, -0.532728f, -0.699334f, -0.168803f, 0.361945f, + 0.950769f, 1.5368f, -0.223899f, 1.17547f, -0.281483f, + 0.533619f, 0.315344f, 0.0854543f, 0.464701f, 0.346828f, + 0.271794f, -0.0185388f, 0.109517f, 0.371662f, -0.10852f, + 0.244092f, 0.491959f, -0.750281f, 1.41865f, -3.51221f, + 0.298194f, -0.0790832f, -0.134158f, -0.424084f, 0.189593f, + -0.238361f, -0.407872f, -0.366222f, -0.606813f, -0.230498f, + 0.387248f, -0.102734f, -0.190544f, -1.43649f, 0.141338f, + -0.0438917f, 0.204628f, 1.57033f, 0.0366937f, -0.14733f, + 0.048198f, -0.122631f, 0.183354f, 0.0658753f, -0.243381f, + 0.0246889f, -0.768798f, -0.0644054f, 0.775073f, 1.63419f, + 0.491624f, 0.21898f, -0.358944f, 3.31304f, 0.0195916f, + 0.236174f, 0.530704f, 0.140124f, 0.0736778f, -0.27361f, + -0.598836f, -1.01659f, 0.361765f, 0.00455986f, -0.345222f, + 1.68731f, 0.764082f, 0.193555f, 0.322782f, 1.19801f, + 0.538935f, -0.0393231f, -0.0248292f, -0.151168f, 0.479879f, + -0.208582f, 0.22798f, 0.335473f, -0.00295455f, 0.139539f, + 0.400814f, 0.478307f, -0.189376f, 0.540084f, 0.466072f, + 0.920231f, 0.398774f, -0.472403f, -0.0431972f, -0.581665f, + -0.990058f, 0.258995f, -0.0148889f, 0.27105f, 0.340334f, + 0.223576f, -0.0405193f, -1.23888f, -1.45229f, -1.44543f, + -0.376146f, 0.132601f, -0.4064f, -0.583611f, -0.374588f, + 0.0659428f, 0.325652f, -0.338456f, 0.253767f, -0.0181164f, + 0.681732f, 0.222041f, 0.837496f, 1.09735f, 0.156328f, + 0.177236f, -0.702702f, 0.473689f, 0.322118f, 0.43343f, + 0.315441f, -0.40798f, 0.0811291f, 0.631431f, 0.361929f, + 0.0723276f, 0.0164498f, 0.0293847f, 0.156406f, -1.10453f, + 0.837977f, -1.03449f, -0.348408f, 1.71953f, -0.401765f, + 0.64272f, -0.182438f, -0.233954f, 0.364597f, 0.269177f, + -0.578512f, 0.397216f, 0.0425122f, -0.258728f, 1.41621f, + -0.688768f, 0.0944726f, 0.253163f, -0.989037f, 1.72726f, + 1.15976f, -0.0460612f, 0.534186f, -0.136814f, 0.49327f, + 0.115744f, -0.633052f, -0.433855f, -1.01874f, -0.324035f, + 0.489487f, 1.08696f, 0.836376f, -0.423477f, -0.421309f, + 1.07348f, 0.323266f, 0.717604f, 0.366422f, 0.32983f, + 0.336583f, 0.749292f, -0.210666f, 0.387101f, -0.583376f, + 0.0391101f, -1.07537f, 0.914591f, -0.51303f, 1.15023f, + -0.0378782f, 0.262889f, -0.841128f, 0.41619f, -0.669704f, + -0.109995f, 1.01825f, -0.194853f, 0.120739f, 0.627889f, + -0.00269221f, 0.751152f, -0.529865f, -1.50238f, 0.184521f, + 0.795464f, 0.106099f, 
1.83117f, 0.0883305f, 0.306844f, + -0.0671504f, -0.169306f, -0.214575f, -0.121606f, -0.234965f, + 0.109752f, -0.35831f, -0.07894f, 0.497203f, -2.63013f, + 0.815608f, -0.193593f, -0.62292f, 0.338941f, 0.0970922f, + -0.531178f, 0.723346f, 0.35063f, 0.182647f, -0.257013f, + 0.784924f, -0.217915f, -0.0797363f, -0.399706f, -0.485602f, + 1.23155f, 0.345998f, 0.322949f, -0.168196f, -0.173313f, + 0.282205f, 0.45117f, 0.918706f, -0.046172f, -0.0873883f, + 0.56103f, -0.485768f, 0.546199f, 0.254997f, 0.394296f, + 0.607178f, 0.667532f, -0.343883f, 0.374402f, -0.531439f, + 2.27782f, -1.13255f, 0.505867f, -0.514742f, 0.998571f, + -1.60984f, -0.172873f, -0.0604094f, 0.719791f, -0.733982f, + 0.348905f, 1.39008f, -0.895343f, -0.677064f, -1.84221f, + 0.0434018f, -0.534794f, 0.0434753f, -0.266576f, 0.268099f, + -0.242935f, 0.00166289f, 0.0263789f, -0.224794f, -0.113493f, + -0.236397f, 0.0879936f, 0.510895f, -0.511789f, -1.48962f, + -2.78268f, -0.0495784f, -0.0343907f, 0.440459f, -0.364209f, + 0.833223f, -0.0589337f, 0.00181418f, 0.455499f, 0.101762f, + -1.16424f, 0.270405f, 0.219033f, -4.91105f + }; + +static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias[] = { + -0.40114f, -0.372342f, -0.216186f, -0.240014f, -0.341773f, -0.344489f, + -0.113037f, 0.198479f, 0.482958f, -0.630072f, -0.728704f, -0.171963f, + 0.519883f, 0.253003f, -0.121618f, -0.0569875f, -0.485568f, -0.147577f, + 0.533305f, -0.587251f, -0.120837f, -0.483953f, 0.445641f, -0.125136f +}; + +static const float av1_intra_mode_cnn_partition_branch_3_logits_kernel[] = { + -1.57431f, -1.09069f, 1.67996f, -0.669702f, 0.499807f, -3.03145f, + -0.878135f, 0.637818f, -1.58419f, -3.79756f, 0.62755f, -0.446646f, + 0.653269f, -0.667854f, -2.19774f, -3.53349f, 2.6107f, -0.685892f, + -1.2603f, -0.89707f, -0.715551f, 0.382202f, 2.09574f, 0.469386f +}; + +static const float av1_intra_mode_cnn_partition_branch_3_logits_bias[] = { + -0.022787f +}; + +static const NN_CONFIG av1_intra_mode_cnn_partition_branch_0_dnn_config = { + BRANCH_0_NUM_DNN_FEATURES, + BRANCH_0_NUM_LOGITS, + BRANCH_0_NUM_DNN_LAYERS, + { + BRANCH_0_NUM_DNN_LAYER_0_UNITS, + BRANCH_0_NUM_DNN_LAYER_1_UNITS, + }, + { + av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel, + av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel, + av1_intra_mode_cnn_partition_branch_0_logits_kernel, + }, + { + av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias, + av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias, + av1_intra_mode_cnn_partition_branch_0_logits_bias, + }, +}; +static const NN_CONFIG av1_intra_mode_cnn_partition_branch_1_dnn_config = { + BRANCH_1_NUM_DNN_FEATURES, + BRANCH_1_NUM_LOGITS, + BRANCH_1_NUM_DNN_LAYERS, + { + BRANCH_1_NUM_DNN_LAYER_0_UNITS, + BRANCH_1_NUM_DNN_LAYER_1_UNITS, + }, + { + av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel, + av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel, + av1_intra_mode_cnn_partition_branch_1_logits_kernel, + }, + { + av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias, + av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias, + av1_intra_mode_cnn_partition_branch_1_logits_bias, + }, +}; +static const NN_CONFIG av1_intra_mode_cnn_partition_branch_2_dnn_config = { + BRANCH_2_NUM_DNN_FEATURES, + BRANCH_2_NUM_LOGITS, + BRANCH_2_NUM_DNN_LAYERS, + { + BRANCH_2_NUM_DNN_LAYER_0_UNITS, + BRANCH_2_NUM_DNN_LAYER_1_UNITS, + }, + { + av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel, + av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel, + av1_intra_mode_cnn_partition_branch_2_logits_kernel, + }, + { + 
av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias, + av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias, + av1_intra_mode_cnn_partition_branch_2_logits_bias, + }, +}; +static const NN_CONFIG av1_intra_mode_cnn_partition_branch_3_dnn_config = { + BRANCH_3_NUM_DNN_FEATURES, + BRANCH_3_NUM_LOGITS, + BRANCH_3_NUM_DNN_LAYERS, + { + BRANCH_3_NUM_DNN_LAYER_0_UNITS, + BRANCH_3_NUM_DNN_LAYER_1_UNITS, + }, + { + av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel, + av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel, + av1_intra_mode_cnn_partition_branch_3_logits_kernel, + }, + { + av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias, + av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias, + av1_intra_mode_cnn_partition_branch_3_logits_bias, + }, +}; + +#undef NUM_DNN_BRANCHES +#undef NUM_CNN_LAYERS +#undef BRANCH_0_NUM_DNN_LAYERS +#undef BRANCH_1_NUM_DNN_LAYERS +#undef BRANCH_2_NUM_DNN_LAYERS +#undef BRANCH_3_NUM_DNN_LAYERS +#undef CNN_LAYER_0_HEIGHT +#undef CNN_LAYER_0_WIDTH +#undef CNN_LAYER_0_IN_CH +#undef CNN_LAYER_0_OUT_CH +#undef CNN_LAYER_0_HORZ_STRIDE +#undef CNN_LAYER_0_VERT_STRIDE +#undef CNN_LAYER_1_HEIGHT +#undef CNN_LAYER_1_WIDTH +#undef CNN_LAYER_1_IN_CH +#undef CNN_LAYER_1_OUT_CH +#undef CNN_LAYER_1_HORZ_STRIDE +#undef CNN_LAYER_1_VERT_STRIDE +#undef CNN_LAYER_2_HEIGHT +#undef CNN_LAYER_2_WIDTH +#undef CNN_LAYER_2_IN_CH +#undef CNN_LAYER_2_OUT_CH +#undef CNN_LAYER_2_HORZ_STRIDE +#undef CNN_LAYER_2_VERT_STRIDE +#undef CNN_LAYER_3_HEIGHT +#undef CNN_LAYER_3_WIDTH +#undef CNN_LAYER_3_IN_CH +#undef CNN_LAYER_3_OUT_CH +#undef CNN_LAYER_3_HORZ_STRIDE +#undef CNN_LAYER_3_VERT_STRIDE +#undef CNN_LAYER_4_HEIGHT +#undef CNN_LAYER_4_WIDTH +#undef CNN_LAYER_4_IN_CH +#undef CNN_LAYER_4_OUT_CH +#undef CNN_LAYER_4_HORZ_STRIDE +#undef CNN_LAYER_4_VERT_STRIDE +#undef BRANCH_0_NUM_DNN_FEATURES +#undef BRANCH_0_NUM_DNN_LAYER_0_UNITS +#undef BRANCH_0_NUM_DNN_LAYER_1_UNITS +#undef BRANCH_0_NUM_LOGITS +#undef BRANCH_1_NUM_DNN_FEATURES +#undef BRANCH_1_NUM_DNN_LAYER_0_UNITS +#undef BRANCH_1_NUM_DNN_LAYER_1_UNITS +#undef BRANCH_1_NUM_LOGITS +#undef BRANCH_2_NUM_DNN_FEATURES +#undef BRANCH_2_NUM_DNN_LAYER_0_UNITS +#undef BRANCH_2_NUM_DNN_LAYER_1_UNITS +#undef BRANCH_2_NUM_LOGITS +#undef BRANCH_3_NUM_DNN_FEATURES +#undef BRANCH_3_NUM_DNN_LAYER_0_UNITS +#undef BRANCH_3_NUM_DNN_LAYER_1_UNITS +#undef BRANCH_3_NUM_LOGITS + +static const float av1_intra_mode_cnn_partition_split_thresh_hdres[5] = { + 100.000000f, 4.750139f, 1.655964f, 3.711212f, 0.963839f, +}; + +static const float av1_intra_mode_cnn_partition_no_split_thresh_hdres[5] = { + -100.000000f, -2.404842f, -3.858223f, -2.041206f, -1.573735f, +}; + +static const float av1_intra_mode_cnn_partition_split_thresh_midres[5] = { + 100.000000f, 3.218737f, 2.657764f, 0.868458f, 2.454447f, +}; + +static const float av1_intra_mode_cnn_partition_no_split_thresh_midres[5] = { + -100.000000f, -3.842426f, -4.005076f, -3.642994f, -2.467197f, +}; + +static const float av1_intra_mode_cnn_partition_split_thresh_lowres[5] = { + 100.000000f, 1.890757f, 2.658417f, 1.450626f, 1.833180f, +}; + +static const float av1_intra_mode_cnn_partition_no_split_thresh_lowres[5] = { + -100.000000f, -4.100921f, -4.564202f, -5.695176f, -1.483546f, +}; + +static const float av1_intra_mode_cnn_partition_mean[1] = { + 1.191922f, +}; + +static const float av1_intra_mode_cnn_partition_std[1] = { + 1.730044f, +}; + +static const int quad_to_linear_0[1] = { 0 }; +static const int quad_to_linear_1[4] = { 0, 1, 2, 3 }; +static const int quad_to_linear_2[16] = { 0, 1, 4, 5, 2, 
3, 6, 7, + 8, 9, 12, 13, 10, 11, 14, 15 }; +static const int quad_to_linear_3[64] = { + 0, 1, 8, 9, 2, 3, 10, 11, 16, 17, 24, 25, 18, 19, 26, 27, + 4, 5, 12, 13, 6, 7, 14, 15, 20, 21, 28, 29, 22, 23, 30, 31, + 32, 33, 40, 41, 34, 35, 42, 43, 48, 49, 56, 57, 50, 51, 58, 59, + 36, 37, 44, 45, 38, 39, 46, 47, 52, 53, 60, 61, 54, 55, 62, 63 +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_ diff --git a/libs/libaom/src/av1/encoder/partition_model_weights.h b/libs/libaom/src/av1/encoder/partition_model_weights.h new file mode 100644 index 000000000..71c1ace78 --- /dev/null +++ b/libs/libaom/src/av1/encoder/partition_model_weights.h @@ -0,0 +1,5646 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +// TODO(chiyotsai@google.com): The performance of these models is getting worse +// due to the changes in the encoder. We should retrain the models here to get +// better performance once we have the time. + +#define FEATURE_SIZE 10 +#define LABEL_SIZE 16 +// nn model for ab partition pruning, 128x128. +static const float av1_ab_partition_nn_weights_128_layer0[FEATURE_SIZE * 64] = { + -0.715251f, -0.015767f, -0.667353f, -0.345255f, 0.177887f, -0.469759f, + 0.426152f, 0.489798f, 0.469865f, 0.773821f, 0.088517f, 0.074585f, + 0.838754f, 0.048449f, -0.007584f, 0.638968f, 0.233305f, -0.319236f, + -0.257124f, -0.170869f, 0.137180f, 0.114852f, -0.721241f, -0.947962f, + -0.411298f, 0.494306f, -0.060435f, -0.648421f, -0.126624f, 0.072686f, + -0.143904f, -0.115839f, -0.175527f, -0.117728f, 0.040686f, -0.189925f, + 0.134361f, -0.258070f, -0.177558f, 0.158049f, 0.168668f, -0.062919f, + 0.341986f, 0.038100f, -0.435577f, -0.321255f, 0.203213f, 0.213061f, + 0.533304f, 0.359296f, -0.079558f, 0.004637f, 0.663904f, 0.043779f, + 0.383018f, 1.136559f, -0.084155f, 0.333057f, -0.199011f, 0.152059f, + -0.078419f, -0.167752f, -0.093651f, 0.083171f, -0.190143f, 0.086195f, + -0.280632f, -0.160663f, -0.017298f, 0.122628f, -0.138116f, 0.062927f, + 0.222462f, 0.626979f, 0.426928f, 0.117170f, -0.240457f, 0.053750f, + 0.038017f, 0.007359f, -0.017595f, 0.101407f, 0.332891f, 0.074933f, + 0.306498f, 0.219380f, -0.151638f, -0.247976f, 0.343405f, 0.121256f, + 0.049173f, 0.171474f, -0.139608f, -1.016599f, -0.345553f, -0.901138f, + 0.243401f, 0.059928f, -0.089396f, -0.195565f, 0.364705f, -0.020400f, + -1.383672f, 0.413018f, 0.536950f, -0.020904f, -1.335306f, -0.732290f, + 0.102885f, 0.315290f, -0.208521f, -0.081811f, 0.182300f, 0.125712f, + -0.593833f, -0.220639f, -0.314155f, 0.188327f, 0.118503f, 0.524427f, + -1.083859f, -1.130640f, 0.390352f, -0.045591f, 0.113160f, -0.009149f, + -0.096183f, 0.115829f, 0.377752f, 0.318396f, -0.591983f, 0.004797f, + -0.497377f, -0.342248f, 0.079546f, -0.025249f, -0.295972f, 0.615501f, + -0.464372f, 0.418315f, -0.173556f, 0.105217f, 0.298073f, 0.082478f, + 0.033223f, 0.977341f, -0.372982f,
-0.052337f, 0.154124f, 0.396787f, + 0.536654f, -0.139061f, -0.223702f, 0.229666f, -0.846766f, 0.107723f, + 0.563839f, -0.483141f, 0.304813f, -0.765283f, 0.070964f, 0.151101f, + 0.275188f, 0.490303f, 1.175892f, 0.085377f, -0.191200f, 0.544532f, + -0.365075f, 0.167546f, 0.052183f, -0.220529f, -0.212227f, -0.144988f, + -0.273356f, -0.062023f, 0.103993f, -0.238493f, -0.161204f, -0.054611f, + -0.166672f, 0.128327f, 0.461751f, -0.545822f, 0.739798f, 0.594386f, + -0.163192f, -0.332501f, 0.363834f, -0.065043f, 0.474812f, -0.138811f, + 0.170924f, -0.778142f, -0.316474f, -0.508065f, -0.039986f, -0.478001f, + 0.340591f, 0.041783f, 0.055419f, 0.015155f, -0.981830f, -1.355237f, + 0.347516f, 1.155327f, 0.081319f, 0.274163f, -0.327230f, -0.113478f, + 0.556552f, -0.055986f, 0.217318f, -0.445351f, 0.325759f, 0.526547f, + -0.657434f, -0.572214f, -0.037087f, 0.081384f, 0.064518f, 0.014892f, + 0.215279f, 1.834504f, -0.242107f, 0.079810f, 0.129558f, 0.079588f, + -0.035189f, -0.221745f, -0.163414f, 0.043978f, -1.028662f, -0.623609f, + 1.130336f, 0.664661f, -0.063975f, -0.415863f, 0.018581f, 0.157758f, + 0.200570f, 0.063420f, 0.901039f, -0.746286f, 0.196230f, -0.290592f, + 0.042373f, -0.502500f, 0.183638f, 0.103394f, -0.298858f, 0.145436f, + 0.196916f, 0.108319f, -0.448572f, -0.881385f, 0.302497f, 0.121679f, + -0.021327f, 0.025150f, 0.481306f, -0.359634f, 0.350257f, -0.228647f, + -0.669860f, 0.260025f, -0.034182f, 0.619247f, -0.158826f, -0.405864f, + 0.674112f, -0.027885f, -0.325274f, -0.241492f, 0.036024f, -0.437685f, + -0.091458f, -0.109295f, -0.350676f, 0.044706f, 0.297059f, 0.016290f, + 1.121203f, 1.289062f, -1.299476f, -1.129221f, 0.103752f, 0.131302f, + -0.263265f, 0.222155f, -0.229908f, 0.013922f, -0.226001f, -0.248383f, + -0.004415f, -0.020958f, 0.055634f, 0.086200f, 0.114556f, -0.184061f, + -0.096210f, -0.146466f, -0.249618f, -0.195998f, 0.088758f, 0.023781f, + -0.264460f, 0.157026f, -0.235228f, -0.102564f, 0.043463f, -0.187823f, + -0.257500f, -0.199049f, -0.242210f, 0.030448f, 0.221604f, 0.151804f, + -0.100404f, -0.073931f, 0.144749f, -0.001572f, -1.438079f, -0.233716f, + 0.733422f, 1.727080f, -0.036397f, 0.027551f, 0.425321f, 0.085703f, + 0.031186f, 0.032333f, -0.675130f, 1.437733f, -0.202392f, -0.525003f, + 0.087048f, 0.328194f, -0.079989f, -0.391088f, -0.238732f, -0.120660f, + -0.139600f, 0.154665f, 0.026202f, -0.233501f, -0.009046f, -0.149187f, + -0.199646f, 0.115375f, 0.209762f, -0.014875f, 0.124038f, -0.119985f, + 1.079625f, -0.461513f, 0.614114f, 0.021003f, 0.439449f, -0.824834f, + -0.299701f, 0.193817f, -0.870551f, -1.262313f, -0.079517f, 0.341570f, + 0.305310f, -0.089721f, -0.317314f, -0.075631f, 0.127172f, -0.208635f, + 1.191922f, 0.163141f, 0.564285f, 0.286352f, 0.480865f, 0.173094f, + -0.094034f, -0.071339f, -0.328992f, -0.006382f, 0.314705f, 0.090258f, + -0.016099f, 0.193230f, 0.188061f, 0.398144f, 0.722781f, 0.769949f, + 0.025442f, -0.162016f, 0.070192f, -0.056946f, -0.100957f, -0.219934f, + -0.203492f, -0.015454f, -0.013272f, -0.098008f, 0.051707f, -0.017493f, + 0.527446f, 0.083605f, 0.588318f, 0.878215f, 0.028747f, -0.146479f, + -0.345170f, -0.136059f, -0.152005f, -0.203634f, 0.232702f, -0.101340f, + -0.027733f, -0.282611f, 0.265366f, 0.082362f, -0.265420f, -0.131124f, + 0.166303f, 0.040194f, -0.100710f, 0.579151f, -0.530136f, 0.163422f, + -0.998821f, -1.565311f, -1.774785f, -2.493372f, 0.116970f, -0.090302f, + 1.723272f, 0.552370f, -0.295954f, -0.439095f, -0.266730f, 0.027936f, + 0.539616f, -0.234902f, -0.167601f, -0.149877f, -0.242983f, 0.122353f, + -0.121620f, -0.205517f, -0.180144f, 
-0.264208f, 0.151500f, -0.159378f, + 0.029145f, -0.050892f, -0.223407f, -0.246239f, 0.043152f, -0.018460f, + 0.169972f, -0.187769f, -0.034670f, -0.238330f, 0.288070f, -0.093243f, + -0.437105f, -0.573376f, 0.660073f, 0.285727f, 0.408470f, 0.158475f, + 0.032699f, 0.056280f, -0.237176f, -0.083003f, 0.105598f, -0.169522f, + -0.260420f, -0.121100f, -0.173983f, -0.195693f, -0.232028f, 0.224940f, + 0.029124f, 0.009580f, -0.252034f, 0.103087f, 1.156561f, 0.603848f, + -0.562805f, -1.652742f, -0.568288f, -1.829395f, 0.046169f, 0.076095f, + 1.490819f, 0.415893f, -0.277788f, -0.115787f, 0.093750f, 0.270726f, + -0.395983f, -0.353742f, 0.034605f, 0.005342f, 0.184537f, 0.086445f, + 0.156417f, 1.476367f, 0.122587f, 0.002145f, 0.431057f, -0.381184f, + -1.646457f, -0.014009f, -0.671224f, 0.193726f, -0.019247f, -0.031267f, + -0.046208f, 0.298733f, 0.064734f, 0.616984f, 0.039381f, 0.182722f, + -0.116670f, 0.233093f, -1.214374f, -0.817970f, -0.064394f, -0.584783f, + 0.077697f, -0.266720f, 0.130875f, -0.235295f, -0.265754f, -0.159999f, + -0.250114f, -0.183017f, 0.194403f, -0.105808f, -0.169215f, -0.240866f, + -0.026662f, -0.045123f, -0.036175f, -0.167471f, -0.192908f, -0.232602f, + -0.267036f, -0.112500f, -0.257944f, -0.111909f, -0.802226f, -0.008800f, + 0.881460f, -0.678603f, 0.008666f, -0.252053f, -0.341035f, -0.175290f, + 0.183012f, 0.385991f, 0.079888f, -0.014039f, -0.148653f, 0.671778f, + -0.130219f, 1.086467f, 0.129267f, -0.040400f, -0.201221f, -0.077005f, + 0.015890f, 0.000781f, 0.137764f, 1.389546f, 0.172152f, 0.047279f, + -0.042783f, 0.127740f, 0.141467f, -0.335738f, -1.396392f, 0.031496f, + 0.357385f, 0.343602f, -0.714553f, 0.311014f, 0.132845f, 0.061149f, + 0.006796f, 0.568106f, -0.255949f, 0.104134f, -0.993447f, 0.298135f, + -0.406590f, -0.049228f, -0.578570f, -0.188561f, -0.107046f, 0.374095f, + 0.068481f, 0.036240f, -0.495801f, 0.180574f, -0.766129f, 0.886967f, + -0.568868f, -0.936062f, -0.418886f, -0.058735f, -0.511964f, -0.438596f, + 0.019016f, -0.015837f, 0.600197f, 0.429773f, 0.315026f, 0.319667f, + 0.214617f, -0.017316f, 0.270257f, -0.040524f, 0.695803f, -0.015223f, + -1.554965f, 0.356997f, -1.472428f, 0.024637f, -0.562958f, 0.870351f, + 0.193635f, 0.036063f, 0.328638f, 0.200274f, -1.634707f, 0.110534f, + 0.420104f, -0.072042f, -0.006404f, 0.171680f, +}; + +static const float av1_ab_partition_nn_bias_128_layer0[64] = { + 0.643147f, -1.348826f, 0.431627f, 0.000000f, 0.102717f, -0.772628f, + -0.034351f, -0.761977f, -0.638397f, 0.541969f, -0.391311f, 0.563076f, + 0.148553f, 0.267217f, -0.788092f, 0.544573f, -0.546280f, 0.000000f, + -0.446945f, 0.127732f, 0.270624f, -0.219435f, -1.220203f, 0.324584f, + 0.110885f, 0.276547f, 0.179726f, -0.375160f, 0.026401f, -0.032595f, + 0.000000f, -0.047932f, -0.648602f, -0.512637f, -0.031661f, -0.236761f, + 0.476453f, -0.028021f, -0.013673f, -0.015578f, -0.920077f, 0.000000f, + 0.915351f, -0.209962f, 0.000000f, -0.025731f, 0.218288f, 0.000000f, + 0.047726f, -0.813077f, -1.263281f, 0.239087f, 0.278614f, -0.030753f, + 0.000000f, 0.346744f, -0.948543f, -1.174211f, 0.216377f, 0.498913f, + 0.853918f, 0.002504f, -0.190403f, 0.452050f, +}; + +static const float av1_ab_partition_nn_weights_128_layer1[64 * LABEL_SIZE] = { + 0.179769f, 1.499417f, -0.445135f, -0.142278f, -0.337661f, 0.682064f, + -0.203213f, 0.302171f, 0.226877f, -0.422169f, 1.687586f, 0.783773f, + 0.220995f, 0.253482f, 0.370435f, -1.342775f, 0.337229f, -0.271473f, + 0.291796f, 1.362227f, -1.751397f, -0.086178f, 0.725496f, -0.118597f, + 0.227963f, -0.501577f, 0.223849f, -0.122421f, -0.123437f, -0.051045f, 
+ -0.020115f, 0.212711f, 0.246025f, 0.088120f, -0.168995f, 1.740190f, + -0.195098f, 0.680339f, -0.589572f, -0.075244f, 0.878766f, 0.064092f, + -3.548527f, 0.001660f, 0.107926f, -0.169501f, -0.455212f, 0.123045f, + -1.836998f, 0.330365f, 1.301475f, 0.454761f, -0.576552f, -0.190761f, + 0.208459f, 0.618483f, 1.383364f, 0.970718f, 0.390174f, 0.406252f, + -0.564519f, -0.312062f, 1.345712f, -0.151873f, 0.109290f, 0.408847f, + 0.391243f, 0.152024f, 0.181764f, -0.036263f, -0.160466f, 0.153595f, + 0.049163f, -0.753012f, -1.804062f, 0.347475f, -2.746580f, 0.575618f, + 0.261799f, 0.210505f, -0.302054f, -0.109872f, 0.199506f, -1.182971f, + 0.723668f, 0.177758f, -0.338202f, 0.254396f, -0.220023f, 0.043504f, + 0.669866f, -0.040816f, -0.402730f, 0.017990f, 0.215523f, -0.216816f, + 0.454826f, -0.726067f, -0.018750f, -0.928679f, 0.154315f, -0.465641f, + 0.144566f, -0.030064f, -0.054667f, -0.154055f, 0.625384f, 1.323795f, + -0.159496f, 0.097072f, -0.463197f, -0.057938f, 0.750290f, -0.233061f, + 0.412631f, -0.535223f, -0.151423f, -0.154583f, 0.024721f, -0.494448f, + 0.230594f, -0.980138f, -0.653968f, 0.126079f, 0.051814f, -0.053219f, + -0.421708f, -0.228853f, 0.237885f, 0.888157f, 0.059655f, 0.241295f, + 0.210443f, 0.228238f, 0.119127f, -0.051989f, -0.355408f, 0.182215f, + 0.244277f, -0.104577f, -0.558035f, -0.023270f, 0.054571f, 0.700646f, + -0.223006f, 0.115523f, 0.023391f, 0.437264f, 0.709477f, -0.531212f, + -0.094731f, 0.328161f, -0.105418f, -0.133511f, 0.497168f, -0.030948f, + -0.407132f, -0.043943f, 0.155505f, 0.251945f, 0.205010f, 0.167160f, + 0.083654f, -0.636810f, 0.401315f, -0.398414f, 0.290046f, 0.206846f, + 0.042218f, 0.168150f, 0.843181f, -0.671242f, -0.202392f, -0.073301f, + 0.142895f, 0.237466f, 0.212145f, -0.091828f, 0.187038f, -0.720841f, + -0.616069f, -0.238021f, 0.065365f, 0.434119f, 0.179023f, -0.040107f, + -0.430734f, -0.297368f, 0.575954f, 0.382619f, -0.709787f, -0.320810f, + 0.242342f, -0.047614f, 0.705216f, 0.098077f, 0.357179f, 0.046017f, + 0.115074f, -0.412305f, -0.272304f, 0.048096f, -0.803811f, 0.275000f, + 0.642198f, 0.180286f, -0.087178f, -0.112707f, -0.394443f, 0.201989f, + 0.241759f, -1.038870f, 0.728124f, 0.800559f, -1.296268f, 0.198612f, + -0.053478f, 0.414344f, -0.510529f, 0.124179f, -2.219115f, -0.074583f, + -0.143055f, 0.001697f, 0.810811f, -0.657140f, 0.186818f, -0.936414f, + 0.539578f, -0.308244f, -0.126624f, -0.204767f, 0.091145f, -0.049340f, + 0.252014f, 0.394582f, 0.018764f, -0.060377f, -0.019133f, 0.064083f, + 0.069211f, -0.526693f, 0.209850f, -0.481466f, -0.468302f, -0.100407f, + 0.241018f, -1.037781f, 0.038539f, -2.113840f, -0.974895f, 0.163187f, + 0.425132f, -0.772546f, -1.261254f, -0.217488f, -0.971748f, -0.805640f, + -0.745175f, -0.177077f, 0.217658f, 0.381431f, -0.052338f, 0.087176f, + -0.165972f, 0.085937f, 0.472564f, -0.796627f, -2.453307f, 0.569664f, + -0.233010f, -0.192134f, 0.064339f, -0.111411f, -0.262469f, -0.410022f, + 0.519993f, -0.684620f, 0.393460f, -0.277753f, -0.153624f, 0.528984f, + -0.415558f, -0.445863f, 0.588512f, -0.142439f, -0.132127f, 0.199776f, + -0.579284f, 0.119488f, -0.033590f, -0.503846f, -0.674979f, 0.335125f, + 0.020519f, 0.233973f, -0.297998f, -0.051511f, 0.518626f, -0.412782f, + -0.074045f, 0.130523f, 0.465751f, -0.117795f, 2.535813f, 0.352108f, + -0.499228f, 0.379784f, 0.056699f, 0.173142f, -0.076519f, -0.026666f, + 0.017834f, 0.492333f, 0.093364f, 0.037867f, -0.165420f, -0.356429f, + -0.562334f, 0.057656f, -0.307544f, 0.085857f, -0.559851f, 0.107230f, + -0.398633f, 0.152618f, -0.216835f, -0.024539f, 0.026044f, -0.249519f, + 
-0.563594f, -0.746025f, 0.025265f, -0.298888f, -0.185243f, 0.058794f, + 0.233696f, -0.115223f, 0.144617f, -0.864390f, 0.619944f, -0.023980f, + 0.019481f, 0.225252f, 0.416552f, -0.115993f, 0.935387f, 0.744386f, + 0.053353f, -0.052582f, -0.065650f, 0.228488f, -0.032042f, -0.371252f, + -0.003638f, -0.736984f, -0.203776f, 0.030922f, -0.065577f, -0.031643f, + -0.049253f, -0.054640f, 0.787134f, 0.545414f, -0.140297f, -0.124274f, + -0.110011f, -0.029552f, 0.657005f, 0.214973f, -0.374300f, 0.251642f, + 0.276591f, 0.030566f, -0.145470f, 0.350579f, -0.356436f, -0.052694f, + -0.063966f, -0.751008f, -1.042392f, 0.328892f, -0.425058f, -0.421571f, + -0.571889f, -1.141472f, -0.125216f, 0.212713f, -0.485170f, -0.088791f, + 0.124589f, 0.023237f, 0.077635f, 0.020901f, -0.271402f, -0.321424f, + -0.513946f, -0.867872f, -0.284593f, 0.106276f, 0.220192f, -0.143532f, + -0.014648f, 0.073402f, 0.327256f, -0.139803f, 0.168763f, 0.048199f, + -0.122526f, 0.111713f, -0.134257f, 0.810364f, -0.085222f, -0.259221f, + -0.239349f, 0.044448f, 0.205031f, 0.413113f, -0.107720f, -0.018816f, + -0.247741f, -0.004963f, 0.041170f, -0.158019f, 0.134839f, 0.129502f, + 0.800488f, -1.041584f, -0.129336f, 0.170834f, 0.566586f, -0.230443f, + 0.437937f, -0.149922f, -0.046665f, -0.094646f, 0.200070f, 0.072943f, + -0.076943f, -0.084971f, -0.515843f, -0.146720f, 0.472869f, -0.444731f, + -0.100877f, 0.545196f, -1.786626f, -0.482946f, 0.500509f, -0.843257f, + 0.200374f, 0.045103f, -0.575718f, -0.164335f, -0.232522f, -0.021825f, + -0.139490f, 0.356058f, -0.352075f, 0.061751f, -0.200616f, -1.180921f, + -0.181355f, -0.137459f, 0.247574f, 0.181541f, 0.184314f, -0.961482f, + 0.493615f, 0.910261f, -2.279238f, 0.648631f, -0.055526f, -0.037137f, + 0.038643f, 0.136609f, -0.819373f, -0.040840f, -0.265989f, 0.006877f, + 0.454651f, -0.595323f, -0.099500f, -0.263717f, 0.150456f, 0.245077f, + -0.268666f, 0.162232f, -0.516451f, -0.024501f, 0.188046f, -0.002262f, + 0.261319f, 0.004173f, 0.746982f, 0.174761f, 0.470447f, -0.159558f, + -0.385240f, 0.023084f, -0.133520f, -0.220607f, -0.018731f, -0.373558f, + -0.707763f, -1.850150f, -0.807404f, -0.168063f, -0.071435f, -0.160740f, + -0.478789f, -1.070674f, -0.489740f, -0.255796f, 0.100486f, -0.153361f, + 0.334394f, -0.569472f, -0.198118f, 0.255922f, 0.104717f, -0.065179f, + 0.111879f, -0.447237f, 1.373623f, -0.190191f, -0.063311f, 0.337529f, + -0.138800f, 0.057009f, -0.137006f, 0.641378f, 0.883147f, -0.679655f, + 0.267717f, -0.351602f, -0.135225f, 0.229398f, -0.513225f, -1.120345f, + 0.528786f, -0.051081f, 0.086653f, 0.140141f, -0.563969f, 0.333402f, + -0.174745f, 0.321093f, -0.438641f, -0.005131f, 0.247415f, 0.110120f, + -0.076308f, -0.083244f, 0.838944f, -0.113043f, -0.013258f, -0.175028f, + -0.179941f, 0.272676f, -0.047946f, -0.088076f, -0.450031f, 0.053929f, + -0.083549f, -0.089952f, -0.186253f, 0.257483f, 0.011019f, 0.586435f, + 0.060580f, -0.052078f, 0.090277f, -0.780869f, 0.969811f, -0.025349f, + -0.281917f, 0.014857f, 0.231863f, -0.228601f, -0.003861f, 0.226550f, + 0.141825f, -0.102171f, -0.010387f, 0.220378f, -2.561975f, -0.497071f, + -0.315117f, 0.371981f, 0.138247f, 0.625031f, -0.308133f, -0.217876f, + 0.005615f, -0.860179f, 0.747491f, 0.006356f, -0.057024f, -0.483189f, + 0.055592f, -0.316834f, 0.069858f, 0.218788f, -0.200044f, 0.227588f, + 0.215496f, -0.055324f, -0.393147f, -0.394062f, -0.253264f, -0.075619f, + -0.152512f, -0.332995f, 0.129053f, 0.178668f, -0.302694f, 0.030678f, + 0.925896f, 0.964375f, 0.169021f, -0.218657f, -0.627204f, 0.206437f, + -0.521336f, 0.176206f, 0.142733f, 0.139248f, 
0.411682f, 0.181544f, + 0.224850f, -0.935547f, -0.558208f, 0.348096f, 0.342129f, -0.389340f, + -0.236308f, -0.132099f, 0.073642f, 0.089391f, -0.306901f, -0.397842f, + 0.444282f, 0.074623f, -0.051075f, -0.106617f, -0.184037f, -0.239046f, + -0.138761f, 0.120794f, -0.647577f, -0.336471f, 0.527899f, -0.164234f, + -0.028354f, 1.083678f, -0.251534f, -0.145903f, -0.182783f, 0.070976f, + -0.199590f, -0.400306f, -0.029763f, -0.548042f, -0.266270f, -0.118084f, + -1.152632f, 0.383685f, -0.105895f, -0.096829f, 0.118382f, 0.047447f, + -0.019051f, 0.310180f, -0.162793f, -0.029574f, 0.058054f, -0.636017f, + 0.490639f, 0.158347f, -0.385701f, -0.147057f, 1.285825f, -1.276083f, + -0.021795f, -0.101600f, 0.163254f, 0.267160f, -2.317864f, -0.098598f, + -0.296337f, -0.309017f, 0.164127f, -0.270012f, -0.071187f, -0.262270f, + 0.075415f, -0.368328f, 0.186728f, -0.158031f, 0.481663f, 0.515950f, + -0.162551f, 0.497981f, 0.262196f, 0.168479f, 0.726066f, -0.243856f, + -0.058998f, 0.140168f, 0.053242f, -0.624623f, -0.249480f, 0.055197f, + -1.376804f, 0.417571f, 0.203784f, 0.174370f, -0.155531f, -0.029400f, + -0.491473f, 0.079811f, -0.080123f, 1.345900f, 0.637077f, 0.434862f, + -1.787438f, 0.005756f, -0.362706f, 0.179458f, -0.288263f, 0.516788f, + -0.921248f, 0.043794f, -0.137729f, -0.196171f, -0.046295f, -0.793781f, + -0.156532f, -0.132566f, 0.517989f, -0.154321f, -0.054174f, -0.077900f, + -0.373316f, -0.117718f, 0.188986f, -0.476188f, -0.245312f, 0.181439f, + -0.161024f, -0.229059f, -3.079907f, -0.225452f, -0.594355f, -0.558027f, + -0.135429f, 0.125766f, -0.081314f, -0.350894f, -0.163165f, -1.936507f, + -0.205966f, 0.031472f, 0.744446f, -0.006680f, -0.837551f, 0.605862f, + -0.854929f, -1.543750f, -0.307704f, -0.240517f, 0.178240f, -0.183586f, + -0.010307f, 0.099373f, -0.228278f, 0.175236f, -0.000133f, 0.104491f, + -1.540545f, -0.570971f, -0.252885f, 0.483036f, 0.052531f, 0.260214f, + -0.515016f, -0.602081f, -0.485690f, -0.730710f, 0.163719f, -1.775975f, + -0.298634f, 0.323626f, -0.373579f, -0.872977f, 0.619574f, 0.026862f, + -0.122531f, -0.084698f, -2.436297f, 0.483996f, -0.203640f, -0.302157f, + -0.150666f, -0.238320f, 0.089250f, 0.236485f, -0.668654f, -0.122863f, + 0.491152f, -0.226444f, -0.181248f, 0.120158f, 0.294027f, 0.250056f, + 0.307601f, 0.357875f, -1.746455f, -0.175670f, 0.385447f, -0.108808f, + -0.090235f, -0.642504f, -0.486004f, -0.055160f, -0.068692f, 0.009736f, + 0.607555f, -0.489426f, 0.150624f, 0.598114f, -0.128816f, -0.445793f, + -0.066524f, -0.254380f, 0.227106f, -0.406495f, -0.121632f, -0.275960f, + -0.136494f, 0.339457f, -1.318132f, -0.417572f, -2.614077f, 0.324603f, + -0.001211f, 0.375192f, -0.473448f, -0.162510f, 0.099329f, -0.277965f, + 0.101221f, -0.060263f, 0.121867f, -1.042140f, 0.440851f, 0.078898f, + -0.209007f, -0.243699f, 0.715197f, -0.093997f, 0.086022f, -0.178203f, + -2.275496f, -0.098413f, 0.199352f, -0.526791f, -0.162086f, -0.197806f, + -0.231657f, -0.269202f, -0.794294f, -0.223461f, 0.503584f, 0.416236f, + 0.064082f, 0.197655f, 0.340871f, -0.186645f, -0.291498f, 0.433938f, + -1.110063f, 0.003751f, 0.392738f, 0.069360f, 0.102088f, -0.302128f, + -1.518457f, 0.106939f, 0.404527f, -0.306868f, -0.286928f, 0.729276f, + -0.531710f, 0.745048f, -0.168837f, -1.953886f, -0.258828f, -0.190252f, + 0.241877f, -0.916744f, -0.030326f, -0.070541f, -0.271037f, 0.211303f, + -0.489957f, 0.100850f, 0.323999f, -0.802837f, -0.462408f, -0.079350f, + -0.029374f, 0.131213f, -0.825032f, 0.040202f, 0.351821f, 0.002869f, + -0.132516f, -0.471264f, -0.297002f, 0.263913f, 0.033478f, 0.146161f, + 0.533229f, 
-0.228608f, -0.200639f, -0.170955f, -0.915037f, 0.724491f, + 0.005151f, 0.018584f, -0.029771f, -0.396038f, -0.159236f, 0.038691f, + -1.197056f, 0.146302f, 0.226840f, -0.852126f, 0.031214f, 0.108880f, + 0.562000f, -0.134633f, -0.713343f, -0.342252f, -1.764521f, -0.114653f, + 0.515073f, -0.080515f, -0.121155f, -0.865139f, -0.833694f, -0.368553f, + 0.347673f, 0.623379f, 0.722067f, -0.492458f, -0.513263f, 0.585167f, + 0.721518f, -0.693499f, 0.343725f, -0.273861f, -0.040230f, -0.785664f, + -0.157500f, -0.308445f, 0.054062f, 0.600131f, -0.860887f, 0.434470f, + -0.191382f, -0.306150f, -0.243965f, 0.705444f, 0.007789f, -0.146154f, + -0.054499f, -0.073500f, -1.067364f, 0.404936f, -2.864590f, 0.182323f, + 0.326126f, 0.102405f, -0.135800f, 1.128095f, -0.012267f, -0.023996f, + -0.264834f, -0.108967f, -1.176746f, -0.926666f, 0.082999f, -0.498361f, + 0.083560f, -0.210074f, 0.019225f, -0.201614f, -0.904760f, 0.181421f, + 0.586384f, -0.177706f, 0.065471f, 0.168552f, 0.054705f, 0.045241f, + 0.048057f, -0.410957f, -2.188854f, -0.169812f, 0.015521f, 0.176856f, + -0.179331f, -0.352640f, -0.491735f, -1.743206f, 0.044227f, 0.010454f, + 0.823643f, -0.119781f, -0.098359f, 0.093119f, +}; + +static const float av1_ab_partition_nn_bias_128_layer1[LABEL_SIZE] = { + -0.433195f, -0.120488f, -0.116721f, 0.112134f, 0.118170f, -0.259769f, + -0.077530f, 0.394044f, 0.279167f, -0.317988f, 0.189538f, 0.314776f, + 0.325655f, -0.107123f, 0.591049f, 0.358744f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_128 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_128_layer0, + av1_ab_partition_nn_weights_128_layer1, + }, + { + av1_ab_partition_nn_bias_128_layer0, + av1_ab_partition_nn_bias_128_layer1, + }, +}; + +// nn model for ab partition pruning, 64x64. 
+static const float av1_ab_partition_nn_weights_64_layer0[FEATURE_SIZE * 64] = { + -0.495347f, -0.049498f, -0.026804f, 0.030474f, -0.289308f, -0.264193f, + -0.141121f, -0.072562f, -0.391665f, -0.051491f, -0.234761f, 0.027155f, + -0.038217f, 0.014872f, -0.289728f, -0.233577f, -0.415875f, -0.343615f, + -0.442543f, -0.482492f, 0.073510f, 0.007503f, 2.162329f, -0.362849f, + 2.145915f, -0.883135f, 0.185636f, -0.062859f, -0.465574f, -0.486205f, + -0.056710f, -0.330642f, -0.321860f, 0.042321f, -0.348965f, 0.003542f, + -0.291365f, -0.078164f, -0.345093f, -0.220272f, -0.471270f, -0.763853f, + 0.246622f, 0.199651f, -0.663420f, -0.154152f, -1.220383f, 0.047138f, + 0.816811f, 0.083247f, -0.218839f, 0.038143f, -0.063436f, 0.015517f, + -0.307320f, -0.166956f, -0.169499f, -0.399005f, -0.234638f, -0.162266f, + 0.050425f, -0.221723f, -0.256942f, -0.287285f, 0.144011f, -0.033245f, + 0.083649f, 0.119428f, -0.056706f, -0.117805f, 0.021866f, -0.257300f, + -0.201378f, -0.217484f, -0.413780f, -0.145793f, 0.082792f, -0.347247f, + 0.042539f, -0.302697f, 1.652316f, 0.000701f, -0.482843f, -0.160332f, + -0.450099f, 0.212399f, -4.715360f, -5.336774f, -5.375758f, -6.048339f, + 0.085956f, -0.037767f, 1.052409f, -0.931924f, -2.221907f, 0.268946f, + 0.015512f, 1.237094f, -1.092185f, 0.418247f, -0.082143f, -0.076914f, + -0.060749f, -0.325440f, -0.296960f, -0.066815f, -0.158477f, -0.373945f, + -0.122322f, -0.113495f, -0.097978f, -0.192816f, -0.270418f, 0.035840f, + -0.015458f, -0.121071f, -0.279582f, -0.067683f, 0.097855f, 0.019839f, + 0.451127f, 0.004376f, 1.410392f, 3.255835f, -0.344815f, 0.145202f, + 0.204132f, 0.171948f, -0.527736f, -0.110353f, 0.901448f, 0.003238f, + -3.822090f, 0.235462f, 1.024823f, -0.821244f, 0.876056f, 2.553762f, + -3.478597f, -2.076582f, -0.265515f, -0.055923f, -0.156980f, -0.164097f, + -0.246040f, 0.039430f, -0.071769f, -0.118847f, -0.304053f, -0.281541f, + -0.226021f, -0.263091f, -0.127359f, -0.249410f, -0.051023f, 0.083911f, + 0.084721f, 0.168089f, -0.272169f, -0.204998f, -0.008303f, -0.173998f, + 0.079376f, -0.197426f, -0.199052f, -0.118794f, -0.063753f, -0.094769f, + 0.066176f, -0.175832f, -0.238752f, -0.287960f, -0.134307f, -0.185953f, + -0.385845f, 0.119769f, -0.006567f, -0.382126f, -0.214221f, 0.038449f, + -0.253484f, -0.282766f, -0.020249f, -0.193929f, 0.016281f, -0.114423f, + -0.145940f, -0.281621f, -0.007588f, -0.131470f, -0.189012f, -0.185699f, + -0.279011f, -0.008132f, 0.208463f, 0.020569f, -0.206803f, -0.213408f, + -0.206131f, -0.290245f, 0.069701f, -0.000371f, -0.307572f, -0.451785f, + -0.300838f, -0.453186f, -0.301691f, 0.046327f, -0.312668f, 0.058272f, + -0.303131f, -0.376252f, 0.108384f, -0.086623f, -0.100630f, -0.027330f, + -0.003969f, 0.089502f, -0.200722f, -0.107889f, 0.061843f, -0.008478f, + -0.265057f, -0.271132f, -0.073562f, 0.129337f, -0.283698f, -0.353414f, + 0.076420f, -0.244280f, -0.119537f, -0.105366f, -0.184692f, -0.038817f, + -0.478507f, -0.118808f, -0.472979f, -0.305884f, -0.462813f, -0.189581f, + -0.011932f, -0.585700f, 0.253212f, -1.061900f, -0.205116f, -0.336407f, + -0.762199f, 0.577737f, 0.230832f, 0.434440f, -0.096713f, 0.038552f, + -0.147800f, -0.213553f, 0.041740f, -0.281907f, -0.026154f, -0.082356f, + -0.331871f, -0.408247f, -0.129022f, -0.037550f, -0.310233f, -0.320883f, + -0.391963f, -0.467392f, 0.027453f, -0.394761f, -0.045544f, 0.076052f, + 0.483985f, 0.067093f, 0.141361f, 0.576772f, 0.859718f, 2.566515f, + -0.025476f, 0.769738f, -0.680235f, -1.683309f, -2.394131f, -0.000714f, + -0.615021f, -0.195856f, -0.434035f, -0.295010f, -0.668659f, -0.245959f, + 
0.551148f, 1.777227f, -0.461630f, 0.043093f, 0.012293f, -0.255841f, + -0.097070f, -0.371156f, -0.146323f, -0.015508f, -0.103873f, -0.087476f, + -0.297266f, -0.128699f, -0.149555f, 0.016534f, -0.375498f, -0.346759f, + -0.455156f, -0.147509f, -0.427076f, -0.354431f, -0.158025f, -0.164604f, + -0.237038f, -0.010314f, -0.092884f, -0.397084f, -0.217980f, -0.127184f, + -0.048421f, -0.144133f, 0.889073f, 0.012606f, 3.007608f, -0.602584f, + -1.849480f, -0.373159f, -1.890695f, -3.609938f, 0.811923f, -1.867208f, + -0.244326f, -0.018012f, -0.211192f, -0.220196f, 0.169363f, 0.119141f, + -0.230715f, 0.083247f, 0.020367f, -0.128629f, -0.217455f, -0.159640f, + 1.815952f, -0.369238f, -1.186447f, -0.658753f, -0.511026f, -0.096934f, + 0.662971f, 0.486475f, 0.159746f, -0.018932f, 3.692397f, 1.384353f, + -0.401984f, -0.248380f, -0.140861f, 0.215248f, -0.023711f, 0.059679f, + -0.072260f, 0.004271f, 0.039545f, -0.347971f, -0.081851f, -0.474896f, + -0.181572f, 0.066736f, -0.157822f, -0.163760f, -0.171113f, -0.089935f, + -0.338281f, -0.421444f, -0.306687f, -0.085283f, -0.377953f, -0.138750f, + -0.102701f, -0.312336f, 0.149831f, 0.007229f, -0.155700f, -0.173611f, + 4.074261f, 1.342306f, -1.272712f, 1.570899f, -0.545093f, -0.317605f, + -0.189440f, -0.133910f, -0.273190f, -0.108020f, -0.166107f, 0.021413f, + -0.239130f, -0.067211f, 0.041957f, -0.039234f, -1.003587f, -0.094412f, + 0.532512f, -0.870538f, -1.118023f, -1.160983f, -0.736307f, -0.418752f, + 0.419466f, 0.492122f, -0.004368f, -0.022096f, -1.115132f, 0.150886f, + 2.396852f, 2.660000f, -0.376537f, 0.468628f, 0.149413f, -0.074898f, + -0.067154f, 0.021245f, 0.127857f, 0.294189f, 0.508056f, 0.390232f, + -3.899177f, -3.414681f, -3.929195f, -4.160545f, -0.274323f, -0.052583f, + -0.003545f, -0.433084f, -0.404891f, -0.145051f, -0.312367f, 0.004579f, + -0.398724f, -0.372068f, -0.234279f, 0.017799f, -0.424760f, -0.646717f, + -0.047568f, 2.924664f, -0.644165f, 0.359349f, -0.294800f, 0.591746f, + -0.404710f, -0.092358f, -0.250729f, 0.030829f, -0.147149f, -0.476023f, + -0.071803f, -0.482516f, -0.293117f, -0.215923f, -0.373122f, -0.085315f, + -0.377052f, -0.449899f, -0.056452f, 0.138081f, -0.085350f, -0.308391f, + 0.106661f, 0.176234f, 0.258869f, -0.230172f, -0.233029f, -0.241208f, + -0.067509f, -0.223172f, -0.118353f, -0.302478f, -0.579632f, -0.561326f, + -0.158114f, -0.223167f, -0.026689f, 0.051863f, 0.212834f, -0.304714f, + -0.169071f, -0.193695f, -0.075682f, -0.170860f, -0.241008f, -0.044648f, + 0.280815f, -0.002585f, -0.283552f, -0.037701f, -0.681169f, -0.274535f, + -0.380595f, 0.109504f, -0.111141f, -0.437685f, -0.094459f, 0.144206f, + -0.106139f, -0.211832f, -0.054742f, -0.172813f, -0.295905f, -0.071907f, + -0.418429f, -0.183240f, 0.031319f, -0.095785f, -0.315447f, 0.069404f, + -0.422910f, -0.029867f, -0.357321f, -0.199976f, -0.337707f, -0.070188f, + -0.178198f, 0.177208f, 0.134688f, -0.081933f, -0.229452f, -0.208872f, + 0.026287f, -0.364040f, -0.063696f, -0.227443f, -0.234401f, -0.205699f, + -0.267238f, -0.494125f, -0.056255f, 0.053715f, -0.487754f, 0.014818f, + 0.087383f, -0.077556f, -0.168085f, -0.436851f, -0.276286f, -0.137845f, + -0.107606f, -0.103653f, -0.233766f, -0.419083f, 0.169185f, 0.010186f, + -0.001587f, 0.086735f, -2.465718f, 1.482185f, 1.621193f, -2.081680f, + 1.386553f, -3.204335f, -0.267111f, -0.004508f, 0.164712f, 0.274147f, + 1.724306f, -2.273659f, 0.749574f, -0.891905f, 0.105965f, -0.030428f, + -0.416018f, -0.300762f, 0.122911f, -0.316908f, -0.292504f, 0.138666f, + -0.161327f, -0.042143f, -0.249128f, 0.149210f, -0.088987f, -0.654101f, + 
-1.501843f, 0.216777f, 0.955914f, 0.524158f, -1.642561f, -1.643626f, + 0.864797f, -0.425451f, -2.115764f, -0.012502f, 0.065172f, 1.297270f, + 0.018845f, 1.167276f, -0.470970f, -0.244995f, 0.374782f, -1.811056f, + -0.055430f, -0.024102f, -0.376519f, -0.339640f, -0.119177f, -0.277995f, + -0.290095f, -0.081362f, -0.144139f, -0.118037f, -0.180357f, -0.217559f, + -0.370683f, 0.172816f, -0.265069f, 0.194321f, -0.273478f, 0.037442f, + -0.235552f, -0.078625f, -0.447541f, 0.016836f, -0.271123f, -0.171481f, + -0.321477f, -0.184826f, -0.442981f, -0.227273f, -0.370666f, -0.237232f, + -0.257493f, -0.225714f, -0.153716f, -0.283487f, -0.155399f, 0.067697f, + 0.230343f, -0.034318f, -0.022687f, -0.047090f, +}; + +static const float av1_ab_partition_nn_bias_64_layer0[64] = { + -0.212182f, -0.233725f, -0.758846f, -0.158162f, 0.614743f, -0.150944f, + -0.075727f, -0.208414f, 1.054996f, 0.713758f, -0.300051f, -0.151482f, + -2.443570f, 0.430590f, -0.129001f, -0.160733f, -0.230547f, -0.143228f, + -0.140577f, -0.086812f, -0.212298f, -0.159557f, -0.055647f, -0.211423f, + 0.578161f, -0.220318f, -0.210107f, -3.111584f, 0.604419f, -0.232622f, + -0.209924f, -0.130794f, -0.084097f, -0.036005f, 0.294594f, -2.535531f, + -0.209783f, -0.211189f, -2.766337f, 0.000000f, 0.450177f, -1.754884f, + 3.262664f, -0.209691f, -0.614886f, -0.211257f, -0.109096f, -0.190492f, + -0.109007f, -0.026910f, -0.136035f, -0.212321f, -0.139320f, -0.212233f, + -0.305430f, 0.739171f, 0.991277f, -0.088150f, 0.086313f, -0.023379f, + -0.125366f, -0.063576f, -0.212169f, -0.047463f, +}; + +static const float av1_ab_partition_nn_weights_64_layer1[64 * LABEL_SIZE] = { + -0.036800f, 0.528721f, 0.490767f, 0.144409f, 1.103640f, 0.361910f, + -0.180069f, 0.068033f, -14.868382f, 0.359013f, 0.322567f, -0.199212f, + 0.906164f, -0.488254f, 0.149653f, -0.216394f, -0.099347f, 0.004936f, + -0.111391f, 0.074848f, -0.041709f, 0.147627f, -0.018905f, 0.096116f, + 0.184817f, -0.016241f, 0.115739f, 2.376754f, 0.637097f, 0.052954f, + 0.136428f, 0.225267f, -0.181873f, -0.142876f, 0.684048f, 0.658791f, + 0.105795f, 0.241705f, 1.381114f, -0.209379f, 1.145949f, 0.795293f, + -9.361877f, 0.198302f, 0.539600f, 0.092317f, -0.081695f, 0.200777f, + 0.102334f, 0.081583f, 0.060948f, -0.025110f, 0.160951f, -0.020170f, + 0.234006f, -0.029369f, 0.375036f, 0.270209f, -0.556529f, 1.402949f, + 0.101777f, -0.027331f, 0.004502f, -0.153166f, -0.116651f, 0.151573f, + -0.022187f, 0.144044f, -0.108719f, -0.129942f, -0.270321f, 0.227363f, + 1.892330f, -0.661052f, -0.219398f, -0.229417f, -0.856438f, -1.196988f, + -0.081774f, 0.078847f, -0.207057f, -0.048947f, 0.152073f, -0.243056f, + -0.233329f, -0.288689f, -0.158333f, -0.141177f, -0.715436f, 0.016947f, + -0.093752f, 0.204984f, -1.209782f, 0.155683f, 0.092239f, 0.146495f, + 0.813146f, -0.027757f, 0.330982f, 2.173948f, -0.028867f, -0.141815f, + 0.292708f, -0.204794f, 0.014496f, 1.032799f, 1.312155f, 0.107020f, + 0.824752f, -0.013945f, 0.184829f, -0.041633f, 0.215300f, -0.476088f, + -0.053213f, 0.126862f, -0.020777f, 0.082893f, -0.223727f, -0.923063f, + 0.466529f, 0.082140f, -0.845758f, -1.140791f, -0.262033f, 0.138491f, + 0.151717f, -0.182479f, -0.131128f, 0.055411f, 0.106771f, 0.125552f, + 0.297184f, -0.257403f, -0.059884f, -0.274903f, 2.694357f, -0.108244f, + 0.025377f, 0.043092f, -0.558317f, 3.517159f, -0.270833f, -0.240676f, + 0.205100f, -0.057068f, -0.140445f, -0.193449f, -0.030061f, -0.286762f, + -0.467523f, -0.012647f, 0.190564f, 0.022394f, -0.101479f, 0.339684f, + -0.902743f, -0.169578f, -0.178029f, -0.041836f, -3.952108f, -0.028298f, + 
-0.221137f, -0.733895f, -0.223895f, 0.039012f, 0.687867f, 0.021423f, + 0.113063f, 0.676087f, -0.961000f, -0.064847f, 0.712856f, -0.192765f, + -0.001132f, 0.016689f, -0.236020f, -0.766186f, -0.175729f, 0.012879f, + -0.251064f, -0.105523f, -0.039212f, -0.347584f, 0.304352f, -0.034174f, + -0.364258f, -0.685252f, -0.266115f, -0.247345f, -0.155905f, 0.152283f, + -0.156315f, 0.174082f, -0.757654f, 0.102303f, -2.192316f, -0.245815f, + 0.119882f, -0.086542f, 1.987246f, -1.353163f, -0.374813f, -0.233504f, + -1.980895f, 0.692093f, -0.168351f, 0.172700f, -0.009052f, -0.015734f, + 0.106679f, -0.060472f, -0.256813f, -0.074874f, -0.207488f, -0.329515f, + -0.418268f, -0.017940f, -0.036081f, 0.064719f, -1.488016f, 0.020591f, + -0.176325f, -0.141074f, 0.944494f, 0.150237f, -0.249805f, -0.277280f, + 0.012686f, 0.132483f, 0.116123f, 0.013737f, -0.116091f, 0.750340f, + 3.251343f, -0.188864f, 1.096992f, 0.058467f, -0.041433f, -0.037937f, + -0.133294f, -0.137908f, -0.171132f, 0.106362f, 0.069383f, -0.052662f, + -0.177883f, -0.408049f, 0.680221f, -0.117035f, -0.904240f, -1.395228f, + 0.154527f, 0.134427f, 0.022767f, -0.158886f, -0.230316f, 0.161096f, + 0.362213f, -0.235060f, -0.941620f, 0.055912f, -0.049458f, -0.166632f, + 0.481418f, 0.930146f, 0.041108f, 0.033674f, 1.372066f, -1.847709f, + 0.003324f, 0.259534f, 0.177014f, -0.202761f, -0.262017f, -0.190852f, + -0.102839f, 0.028338f, 0.187193f, -0.041684f, 0.123973f, -0.198576f, + -0.110369f, -1.431400f, 0.208369f, -0.302370f, -0.248549f, 0.062985f, + 0.673409f, 0.036662f, -0.711340f, -0.120584f, -0.189789f, 0.098812f, + 2.947819f, 0.216567f, -0.414472f, -0.181742f, 1.873779f, -0.222726f, + -0.782870f, 0.007889f, 0.015062f, -0.554328f, 0.182928f, -0.191430f, + 0.123636f, -0.215460f, -0.225245f, 0.251516f, -0.013025f, -1.359595f, + -0.750602f, 0.342667f, -0.141899f, -0.687493f, -0.072639f, 0.048018f, + -0.242107f, -0.031917f, -0.287472f, -0.046088f, 0.832197f, -0.016576f, + -1.553349f, -0.216341f, 0.023077f, -0.410867f, 4.243743f, -0.514878f, + -0.066007f, -0.160696f, -0.262678f, -0.648790f, -0.430586f, 0.199940f, + -0.202496f, -0.222241f, -0.016406f, -0.121473f, 0.000828f, -0.081584f, + -0.152641f, -0.190166f, 0.644400f, 0.040196f, -0.302104f, -1.143654f, + -0.160327f, -0.320780f, -0.187006f, 0.037311f, 0.440618f, -0.070733f, + -0.117785f, 1.527539f, -0.419310f, 0.001300f, 1.389956f, -0.036366f, + -0.269203f, 0.612265f, 2.721897f, -0.086836f, -0.446999f, 0.012525f, + -0.078317f, -0.287052f, -0.111188f, -0.085181f, -0.164667f, -0.010466f, + -0.569722f, -0.018888f, -0.101663f, -1.147130f, -0.465204f, 0.114524f, + -2.192402f, -0.221325f, 0.375748f, 0.206284f, -0.261548f, -0.246257f, + -0.143004f, -0.069981f, -0.057306f, -0.116481f, -0.435903f, -0.314970f, + 0.013210f, -0.010175f, 4.630571f, -0.473226f, -0.197199f, -0.028204f, + 0.122907f, 2.475548f, 0.025011f, -0.092603f, -0.127561f, -0.151330f, + -0.077295f, 0.245016f, -0.045005f, 0.183396f, -0.330556f, -0.384887f, + 0.356374f, -0.016618f, -0.463353f, -1.291546f, -0.071986f, -0.311599f, + 0.072385f, -0.430786f, -2.094788f, 0.202733f, -0.910109f, -1.336543f, + -0.086800f, -0.096413f, 1.544383f, 0.031860f, -0.796211f, 0.762786f, + 3.250022f, -0.441798f, -0.698537f, 0.062839f, 0.033525f, -0.362996f, + 0.027022f, -1.131264f, -0.228926f, 0.053885f, -0.338628f, 0.155037f, + -0.046844f, -0.888172f, -0.241767f, 0.084965f, -0.617743f, -0.049896f, + -0.036894f, -0.304783f, -0.002639f, 0.137957f, 0.052121f, -0.131161f, + -0.117200f, -0.253380f, -0.205561f, -0.302450f, -0.047397f, -0.330518f, + 3.613420f, -1.525951f, 
-0.026738f, 0.209150f, -2.103534f, 2.019689f, + -0.366199f, -0.095260f, 0.027417f, -0.242512f, 0.162579f, 0.052113f, + -0.293851f, -0.068138f, -0.005799f, -0.344696f, -0.114824f, -0.431107f, + -0.120058f, -1.139926f, -1.048379f, 0.036446f, -0.323020f, -0.432945f, + 0.454151f, -0.140058f, 0.050649f, -0.094900f, -0.017278f, -0.238719f, + 1.193153f, 0.120447f, -0.496061f, 0.917431f, 2.936126f, -0.115521f, + -0.347397f, -0.435325f, -0.004383f, -0.211864f, 0.162383f, -1.040726f, + 0.089537f, -0.128579f, -0.133505f, 0.107129f, -0.435657f, -0.180388f, + 0.043650f, 0.018709f, -0.773242f, -0.687192f, -0.120633f, -0.063626f, + 0.029912f, 0.113972f, -0.403502f, -0.127640f, -0.269625f, 0.129794f, + -0.188539f, 0.041641f, 0.029769f, -0.198374f, 1.401407f, 0.353887f, + -0.219925f, 0.260515f, 1.157034f, -2.992044f, -0.097618f, -0.064417f, + -0.203626f, -0.008217f, -0.112339f, -0.227407f, -0.155118f, 0.247705f, + -0.012304f, -0.248447f, -0.913463f, -0.064788f, -0.214619f, -0.251761f, + -0.386861f, -0.040574f, -0.163219f, -0.100700f, 1.488274f, -0.071684f, + -0.033626f, -0.006497f, -0.246945f, -0.145221f, -3.747390f, 0.149609f, + -0.263326f, -0.297385f, -1.039896f, -0.083174f, -0.025473f, -0.235586f, + -0.001087f, 0.254286f, 0.265106f, 0.007325f, 0.199239f, 0.134103f, + -0.578211f, -0.259801f, -0.062373f, 2.368348f, 0.560556f, -0.252260f, + 0.889997f, -0.447872f, -0.059218f, -0.095315f, -0.061667f, 0.183580f, + -0.157479f, 0.055387f, -0.831734f, 0.007606f, -1.104906f, 0.301180f, + -0.117115f, 0.212959f, 4.727223f, -0.243833f, -0.397495f, -0.025021f, + -0.367587f, -2.082058f, -0.217699f, 0.148111f, 0.252430f, 0.111088f, + -0.260692f, 0.095124f, -0.407774f, -0.322169f, 0.002927f, 0.126169f, + -1.272325f, -0.279772f, -0.373680f, -0.485177f, -0.605458f, 0.021225f, + -0.092031f, -0.226585f, 1.895162f, 0.037866f, -0.275475f, 1.614360f, + -0.014972f, -0.277679f, -3.449082f, -0.092060f, -0.747873f, 0.020716f, + 2.776178f, -0.049963f, 0.183999f, -0.295259f, -0.028868f, 0.221895f, + 0.001265f, 0.336823f, 0.219372f, 0.112824f, 0.408132f, -0.017940f, + -0.311666f, 1.489606f, -0.058093f, -0.305659f, -0.491933f, -0.143847f, + 0.166115f, 0.042867f, -0.123447f, -0.087099f, -0.305395f, -0.365079f, + -0.755801f, -0.160649f, 0.736260f, -0.008611f, 0.095836f, -0.017345f, + 5.697515f, -0.498971f, -0.125280f, 0.199907f, 0.300053f, 0.605026f, + -0.228225f, -0.259523f, 0.016384f, 0.146973f, 0.210258f, 0.226766f, + -0.075178f, -0.050924f, 0.188496f, -0.415266f, -0.484880f, -0.236384f, + 0.071931f, -0.331863f, -0.601243f, -0.232479f, -0.285272f, 0.123789f, + -1.341333f, 0.037082f, -0.315202f, -1.587215f, -0.271576f, 0.003216f, + -4.437186f, -0.256205f, -0.576589f, -0.114147f, 2.153916f, -0.369618f, + 0.271415f, 0.145036f, -0.158731f, -0.240938f, -0.187369f, 0.036325f, + 0.254771f, 0.211488f, -0.240297f, 0.098417f, -0.415011f, 2.334793f, + -0.127252f, 0.020069f, -0.168755f, -0.448922f, -0.219207f, 0.016232f, + -0.221935f, -0.269500f, -0.100636f, 0.102545f, -0.809376f, -0.054979f, + 0.360713f, -0.326541f, 0.112933f, 0.138073f, 4.229404f, -0.763801f, + -0.305429f, 0.199955f, -1.787713f, 0.272866f, 0.109895f, 0.138466f, + -0.250259f, -0.167162f, -0.212588f, -0.217589f, -0.067125f, -0.077490f, + -0.208970f, -0.006863f, -0.671146f, -0.298320f, -0.165509f, 0.044597f, + -1.408624f, -0.213957f, -0.220947f, 0.129718f, 1.316777f, -0.098928f, + -0.008121f, -0.558293f, -0.297290f, -0.218873f, -4.346638f, -0.228174f, + -0.204710f, -0.388864f, 2.697919f, 0.025260f, 0.857020f, 0.009921f, + 0.036915f, -0.320275f, -0.087937f, 0.022636f, 0.236667f, 
0.135496f, + -0.059616f, -0.192955f, 0.009470f, 2.139589f, -0.200449f, 0.129818f, + 1.017444f, -0.608299f, 0.257914f, -0.134306f, -0.033327f, 0.002855f, + -0.338598f, 0.015559f, 0.117362f, -0.166760f, 0.086903f, -0.167666f, + 0.193523f, 0.033852f, -1.147686f, 0.489468f, -0.006969f, 0.125630f, + 1.557907f, -1.604449f, -0.071114f, 0.096178f, 0.007065f, 0.200013f, + 0.213393f, 0.168466f, -0.100568f, -0.117861f, -0.161542f, -0.072561f, + -1.069871f, -0.470138f, -0.352578f, -1.503513f, -0.001394f, -0.380109f, + 0.065089f, -0.281668f, 0.988953f, -0.002778f, -0.659026f, -0.470692f, + -0.407292f, 0.011710f, -1.362085f, 0.184738f, -0.135786f, -1.374241f, + 4.487930f, -0.067274f, -0.956404f, -0.233995f, 0.224527f, -0.454556f, + 0.037900f, -0.281658f, 0.208224f, -0.254753f, 0.045740f, 0.051444f, + -0.388281f, 0.257112f, -0.485030f, -0.082659f, 0.148103f, -1.007456f, + -0.022295f, 0.036984f, -0.369401f, -0.076943f, -0.007636f, -0.293022f, + 0.470466f, 0.199012f, -2.158182f, 0.036577f, -0.014725f, -0.229516f, + 2.236929f, 0.030945f, -0.400045f, 0.109348f, 0.214691f, -0.891516f, + -0.251379f, -0.217358f, 0.013733f, 0.205573f, -0.151725f, -0.191782f, + -0.339630f, -0.163905f, -0.119191f, -0.032516f, 0.503015f, 0.025772f, + 0.029094f, -1.146153f, 0.216723f, -0.330023f, 0.064695f, -0.262521f, + 0.425612f, -0.093080f, -0.489648f, 1.051293f, -0.092332f, 0.095557f, + -0.874132f, 0.218483f, -0.127648f, -1.605802f, 2.763617f, -0.186734f, + -1.243166f, -0.193514f, -0.173748f, 0.337822f, 0.183873f, -0.251594f, + -0.211582f, 0.144081f, 0.029620f, -0.024853f, -0.385140f, 0.467341f, + -0.928316f, -0.195442f, 0.917783f, 0.357084f, 0.174445f, -0.073659f, + -0.012811f, -0.115420f, -0.181147f, -0.364449f, -0.567395f, -0.012969f, + -1.680714f, 0.065323f, 0.198063f, -0.244201f, 1.428545f, -0.432539f, + -0.208931f, -0.091205f, 0.957125f, 0.813519f, -0.262677f, 0.246852f, + 0.015536f, 0.055026f, 0.067054f, 0.262103f, -0.358115f, -0.095206f, + -0.267522f, -0.402710f, -0.680397f, -0.123627f, -0.385590f, -1.504680f, + -0.169513f, -0.215338f, 0.043633f, -0.079052f, -0.464410f, 0.122894f, + -0.278231f, -2.456445f, -0.159917f, -0.015597f, -0.735449f, -0.078854f, + -0.400290f, -1.153870f, 3.657228f, -0.287093f, -1.174355f, -0.102001f, + -0.288281f, 0.185209f, -0.145228f, -0.200449f, -0.099914f, -0.138354f, + 0.254428f, -0.161751f, -0.118206f, 0.296043f, -0.482613f, 0.080932f, + 1.097605f, -0.010190f, 0.232439f, 0.447617f, -0.133508f, 0.115763f, + -0.388589f, 0.174695f, -0.236014f, 0.006284f, -1.374129f, 0.092015f, + -0.241419f, -0.231667f, 2.763950f, -0.922932f, -0.061605f, 0.208740f, + -1.597190f, 1.353325f, -0.198528f, 0.250498f, -0.013950f, -0.203861f, + -0.254563f, 0.081931f, -0.413369f, 0.011844f, 0.080961f, -0.231161f, + -1.234909f, -0.440843f, -0.174980f, -0.315283f, -0.337474f, -0.123243f, + -0.310001f, -0.271028f, 0.364179f, 0.022845f, -0.535517f, -0.772936f, + -0.188435f, 0.039667f, -0.807463f, 0.266550f, -0.288857f, -1.630789f, + 1.280155f, 0.065712f, -0.279960f, -0.300056f, 0.258440f, -0.073781f, + 0.213878f, 0.042196f, 0.021360f, 0.211698f, -0.003751f, -0.192673f, + -0.137008f, 0.247878f, -0.470604f, 0.073164f, 1.523241f, 0.734755f, + -0.114126f, -0.193834f, -0.025759f, 0.263183f, +}; + +static const float av1_ab_partition_nn_bias_64_layer1[LABEL_SIZE] = { + -0.343508f, -0.706936f, -0.160676f, -0.877101f, -0.517567f, -0.253254f, + -0.148074f, 0.923430f, -0.364770f, 0.203550f, 0.401216f, 0.938246f, + -0.872737f, 0.718723f, 0.703398f, 2.560015f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_64 = { + 
FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_64_layer0, + av1_ab_partition_nn_weights_64_layer1, + }, + { + av1_ab_partition_nn_bias_64_layer0, + av1_ab_partition_nn_bias_64_layer1, + }, +}; + +// nn model for ab partition pruning, 32x32. +static const float av1_ab_partition_nn_weights_32_layer0[FEATURE_SIZE * 64] = { + -0.323723f, -0.214013f, -0.007772f, -0.458851f, -0.125542f, -0.123860f, + -0.410973f, -0.209389f, -0.087580f, -0.272881f, -0.168500f, -1.130845f, + 0.344916f, -0.475017f, -0.362262f, -0.195662f, -0.566124f, 0.782163f, + 0.411575f, -0.013378f, -0.318650f, -0.124678f, -0.612909f, -0.315788f, + -0.263990f, -0.508783f, -0.048938f, -0.416407f, -0.402648f, -0.156644f, + 0.225887f, -0.000493f, 2.682241f, 0.871204f, 0.059014f, 0.803542f, + -1.407028f, -1.154669f, 1.388148f, -0.293348f, -0.003669f, -0.009607f, + 1.330030f, -0.337841f, 2.118617f, 1.033059f, -0.084788f, 0.212904f, + 0.082405f, -0.070579f, -0.494005f, -0.173392f, 0.039546f, -0.463865f, + 0.077163f, -0.434066f, 0.030835f, -0.427139f, -0.560520f, -0.031606f, + -0.368541f, -0.027458f, 0.370574f, 0.461418f, 1.087682f, -0.572137f, + -1.509596f, -0.765697f, -0.499383f, -0.277998f, -0.106492f, -0.129564f, + -0.169133f, -0.269834f, -0.114270f, -0.275431f, 0.016339f, -0.156744f, + -0.267922f, 0.171216f, 0.110556f, 0.002954f, -0.200327f, -0.187663f, + 3.691601f, 1.234152f, 0.186315f, -0.125370f, -0.211235f, -0.554432f, + -0.131072f, -0.124982f, -0.130339f, -0.235350f, 0.018903f, 0.012896f, + -0.159372f, -0.269571f, -0.025709f, -0.221251f, 0.061919f, 0.016307f, + 0.384673f, -0.134525f, -1.599126f, -0.416459f, -0.743052f, 0.670249f, + -0.169709f, 0.421681f, -0.033360f, -0.072817f, 0.003647f, -0.110632f, + -0.158651f, -0.095136f, 0.223759f, 0.165767f, -0.269129f, -0.196075f, + -0.023183f, -0.293420f, 0.014875f, 0.018688f, -0.153407f, -0.172009f, + -0.259947f, -0.124015f, 0.173653f, -0.089103f, -0.021001f, -0.334230f, + 0.027177f, 0.103371f, -0.183860f, -0.204051f, -0.023721f, -0.192297f, + -0.143771f, -0.247106f, 0.218116f, -0.013240f, 2.831783f, 1.483928f, + -0.877025f, -0.313462f, -0.411320f, -0.447825f, 0.605977f, 0.234684f, + -0.119150f, -0.075182f, -0.330463f, 0.071503f, -0.254924f, -0.360071f, + -0.037022f, 0.063261f, -0.148759f, -0.238254f, -0.462018f, -0.027166f, + 0.065318f, -0.235743f, -0.257194f, -0.094784f, 0.022423f, 0.055925f, + 0.086672f, -0.021010f, 0.009965f, -0.001648f, -0.104917f, -0.387443f, + -0.102673f, -0.281706f, 0.145923f, -0.233391f, -0.378365f, -0.145584f, + -0.077751f, -0.121166f, 1.134565f, -0.097500f, -0.749202f, -0.544566f, + -1.361374f, -0.102494f, 1.089275f, 0.375299f, -0.105091f, 0.037641f, + -0.054248f, -0.282691f, -0.377797f, -0.066427f, -0.253815f, -0.329677f, + -0.339326f, -0.128217f, -0.282905f, 0.014937f, 1.067185f, -0.171764f, + 0.484458f, 0.396706f, -0.557055f, -0.891596f, -0.257839f, -0.720879f, + -0.218449f, -0.004755f, 1.572857f, 0.006229f, 1.962895f, -0.029746f, + -4.137691f, -2.185991f, -2.763477f, -0.520437f, -0.208708f, 0.006444f, + -1.263078f, -0.304560f, 1.072374f, 2.556429f, 0.312850f, 0.257488f, + -0.634264f, 0.156769f, -0.188943f, 0.040295f, -0.389915f, 0.085250f, + -0.248525f, 0.045667f, -0.776115f, -0.274680f, -0.448145f, -0.566161f, + -1.285316f, 0.079060f, 0.389124f, -0.510401f, -0.015299f, -0.664661f, + 0.099901f, -0.470694f, -0.051593f, -1.076381f, -0.442104f, -0.197867f, + -0.330011f, -0.448523f, -0.301018f, -0.442093f, -0.491953f, -0.582091f, + -0.064569f, 
-0.156516f, 0.543522f, -0.005924f, 0.161432f, 0.974793f, + 0.273712f, 1.104850f, -0.290312f, 0.313417f, -0.125370f, 0.136234f, + -0.191227f, -0.165054f, 0.011872f, -0.298871f, 0.095740f, 0.142760f, + -0.215771f, -0.031437f, 0.101041f, -0.085620f, 0.435387f, 0.002786f, + 1.971375f, 0.018392f, -1.771940f, -0.401433f, 0.808263f, -3.350013f, + 2.296952f, -1.024403f, -0.041645f, -0.034799f, -0.024078f, -0.347301f, + -0.276088f, -0.455907f, 0.266021f, 0.087348f, -0.146566f, 0.040492f, + -0.539866f, -0.206851f, -0.387874f, -0.125508f, -0.496676f, -0.373845f, + -0.472356f, -0.357082f, -0.081254f, -0.456466f, 0.554713f, 0.002185f, + -4.225019f, 0.344025f, 0.728796f, -0.262936f, 1.383924f, 1.577300f, + -2.653320f, -2.516156f, -0.301604f, -0.204105f, -0.138252f, -0.587536f, + -0.097889f, -0.352414f, -0.288276f, -0.184340f, -0.122741f, -0.243376f, + 0.031970f, -0.373402f, -0.396079f, 0.045566f, 0.072595f, -0.222681f, + -0.243802f, -0.340129f, -0.258494f, -0.192041f, -0.386112f, -0.240940f, + -0.047268f, -0.555802f, -0.032514f, -0.241341f, -0.167463f, -0.478308f, + -0.205936f, -0.316275f, 0.103729f, -0.197893f, -0.128029f, -0.218796f, + -0.167362f, -0.111814f, -0.126062f, -0.394260f, -0.025357f, -0.402697f, + -0.587395f, -0.400385f, -0.259664f, -0.415588f, -0.338503f, -0.399166f, + -0.270504f, 0.234505f, 0.272144f, 0.266938f, -0.392395f, -0.011717f, + -0.384221f, -0.473446f, -0.038420f, -0.241101f, -0.234402f, -0.275567f, + -0.410454f, -0.377599f, -0.179099f, -0.138432f, -0.248083f, -0.543026f, + -0.428043f, -0.239895f, -0.333193f, -0.103346f, -0.039038f, -0.171109f, + -0.119432f, -0.222351f, 0.000450f, 0.208724f, -0.510526f, -0.144656f, + -0.316721f, -0.344846f, -0.244794f, -0.129134f, -0.045634f, -0.400183f, + 0.043714f, -0.235414f, 0.115594f, -0.195616f, -0.106693f, -0.124242f, + 0.083990f, 0.049110f, -0.196130f, -0.059860f, -0.464235f, -0.516443f, + -0.101521f, -0.422379f, -0.413955f, -0.042991f, -0.345263f, -0.129264f, + -0.106911f, -0.140156f, -0.457841f, -0.199848f, -0.218954f, -0.329850f, + -0.364097f, -0.335262f, -0.312254f, -0.299331f, -0.052710f, -0.251019f, + -0.023459f, -0.222538f, 0.028849f, -0.088038f, -0.301550f, -0.273566f, + 0.067295f, -0.174608f, -0.445784f, -0.158366f, -0.567275f, -0.557652f, + -0.353503f, -0.302092f, -0.302049f, -0.551793f, -0.034535f, -0.225190f, + -0.210733f, -0.219377f, -0.057197f, -0.430933f, -0.025185f, -0.388150f, + -0.086147f, -0.430088f, 0.058466f, -0.152129f, -0.058411f, -0.236392f, + -0.547669f, -0.613849f, -0.893774f, -0.351715f, -0.399227f, -0.454909f, + -0.324501f, 0.000490f, -0.282167f, -0.073163f, -0.281452f, 0.047932f, + -0.175500f, 0.165220f, -0.276212f, 0.062153f, -0.217054f, -0.255487f, + -0.146416f, -0.097718f, -0.173809f, -0.559328f, -0.055695f, -0.391193f, + -0.132020f, -0.561184f, -0.308666f, -0.474053f, -0.219149f, -0.246558f, + -0.158325f, 0.151907f, -0.266835f, -0.144697f, -0.193960f, -0.046587f, + -0.220028f, -0.247355f, 0.135584f, 0.016511f, 0.367705f, -1.855877f, + 0.435622f, 0.444710f, -3.372301f, -3.030489f, 1.013267f, 0.380951f, + -0.170011f, -0.111415f, -0.456146f, -0.107254f, -0.095220f, -0.053078f, + -0.135864f, -0.591949f, -0.252810f, -0.324799f, -0.094796f, -0.260969f, + -0.391981f, -0.063170f, -0.336130f, -0.470127f, -0.405168f, -0.433219f, + -0.309563f, -0.295462f, -0.552270f, -0.012300f, -0.057793f, -0.034494f, + -0.446843f, -0.640160f, -1.188681f, -0.791361f, 0.543271f, 1.189112f, + 1.458468f, -0.005876f, -0.927475f, 0.062038f, -1.170818f, 0.338227f, + -3.007096f, -4.559296f, -4.045457f, -5.953635f, -0.228386f, -0.266890f, + 
-0.092595f, -0.377440f, -0.044534f, -0.053565f, -0.349268f, -0.415030f, + -0.310094f, 0.062721f, 0.251422f, -0.014350f, -1.282910f, 1.619560f, + 1.180566f, -0.032163f, -1.322951f, -0.603601f, 1.443710f, 0.654650f, + -0.393227f, 0.003536f, 0.029725f, -0.108925f, -0.053911f, 0.133977f, + -0.036145f, -0.168438f, 0.046989f, -0.331463f, -0.176983f, -0.311922f, + -0.272389f, -0.379592f, -0.399993f, -0.297873f, -0.193425f, -0.177524f, + -0.258309f, -0.567312f, -0.260217f, -0.241869f, 0.024010f, -0.032867f, + -0.039424f, -0.063670f, 0.193808f, -0.303514f, -0.013376f, -0.057761f, + 0.187922f, 0.006938f, 0.031810f, 0.180594f, -1.198427f, 2.820662f, + 0.154986f, -0.375518f, 0.116925f, -0.795782f, -0.085139f, -0.079365f, + -0.197936f, -0.321468f, -0.205271f, -0.558203f, -0.296235f, -0.151193f, + -0.158282f, -0.245402f, -0.208504f, -0.042335f, -0.087426f, -0.557129f, + -0.381427f, -0.441551f, -0.541011f, -0.060567f, -0.469305f, -0.032326f, + -2.453587f, -0.045568f, -0.296932f, 0.613061f, -0.320284f, 0.191620f, + -0.827145f, -0.225277f, 0.275800f, 1.696635f, +}; + +static const float av1_ab_partition_nn_bias_32_layer0[64] = { + -0.176206f, 0.660189f, -0.186156f, -2.481963f, -1.564218f, -0.280424f, + 0.732684f, -0.135581f, -2.193132f, -0.172771f, 0.605001f, -0.060392f, + -0.067190f, -0.132969f, -1.410812f, -0.298701f, -0.105963f, -0.086173f, + 0.632779f, 0.005585f, 1.310169f, 1.392136f, -0.563860f, -0.051053f, + 0.660998f, -0.214726f, -1.894342f, -0.128288f, -0.330721f, -0.053988f, + -0.177726f, 1.200859f, -0.178902f, -0.172620f, -0.184476f, -0.175559f, + 0.538503f, -0.322158f, -0.219080f, -0.058208f, -0.171347f, -0.216060f, + -0.174950f, -0.295740f, -0.184820f, -0.213896f, 1.317728f, -0.020116f, + -0.208096f, 0.000000f, 1.246166f, -0.225421f, -0.181555f, 0.861761f, + 1.172429f, -0.172892f, -0.737092f, -0.189904f, -0.179385f, -0.114618f, + -1.384604f, -0.201713f, -0.271948f, 0.372351f, +}; + +static const float av1_ab_partition_nn_weights_32_layer1[64 * 16] = { + -0.037828f, 1.529029f, 0.004927f, 1.475763f, 0.627172f, 0.325872f, + -0.990757f, 0.129476f, 0.889958f, -0.082031f, 0.332133f, 0.074422f, + -0.176212f, -0.074355f, 0.774378f, 0.110987f, -0.155469f, 0.253310f, + 0.882538f, 0.253605f, 0.332436f, -5.389474f, 0.278470f, 0.168644f, + 0.914611f, 0.154165f, 0.809262f, -0.174734f, 0.923673f, 0.064716f, + -0.070228f, -0.228735f, 0.002312f, 0.112222f, -0.045502f, -0.046004f, + 0.514101f, 0.306480f, 0.021232f, -0.015955f, -0.288260f, 0.189177f, + -0.104158f, 0.103273f, 0.096910f, -0.086328f, 1.327289f, -0.154247f, + 0.056676f, -0.243327f, -0.646676f, 0.177221f, -0.086761f, 0.729729f, + -14.710893f, -0.044881f, 0.339003f, -0.134737f, 0.073621f, -0.162913f, + 1.215237f, 0.140723f, 0.138630f, 1.241719f, 0.204092f, -0.463080f, + -0.176086f, 1.125868f, 1.034814f, 0.225455f, -0.203421f, -0.078787f, + -0.527498f, 0.012491f, -0.563307f, -0.170792f, 0.002679f, 0.116153f, + 0.211348f, -0.191900f, -0.212505f, 0.263445f, -0.074679f, -0.081441f, + -0.815405f, 2.448215f, 0.781299f, 0.149542f, -1.045162f, 0.043014f, + 0.217381f, -0.094500f, -0.090427f, 0.025784f, -0.228906f, -2.741798f, + 0.230475f, -0.256112f, -0.103297f, 0.159121f, -0.229793f, -0.014883f, + -0.104131f, -0.123816f, 0.164148f, -0.052279f, -0.071845f, -0.041197f, + 0.208527f, -0.234197f, -0.542336f, 0.020053f, 0.088870f, 0.014346f, + 2.502164f, -0.010244f, -0.267792f, 0.844394f, 2.711486f, -0.015262f, + -0.868053f, -0.295704f, 0.222289f, -0.000286f, -0.352098f, -0.079000f, + 0.021267f, -0.721739f, -0.240558f, -0.384775f, 0.065974f, -2.161058f, + 0.195889f, 
0.268966f, -0.009329f, 0.014949f, 0.314943f, 0.235885f, + 0.072591f, -0.127120f, 0.150784f, 0.105697f, -1.297403f, -0.207509f, + -0.217688f, -0.076752f, 0.170952f, -0.294235f, 0.449973f, -1.712690f, + 0.860989f, 0.054757f, -0.812627f, -0.105316f, -0.736230f, -0.133192f, + -3.741608f, 0.495660f, -0.288936f, 4.654852f, -0.021305f, -0.308916f, + 0.049205f, -0.259996f, 0.114248f, -0.252647f, -0.253180f, -0.449314f, + 0.022979f, 0.063281f, -0.196154f, 0.078295f, -0.322317f, -0.145142f, + 0.300573f, 0.048385f, -0.254787f, 0.123939f, -1.263088f, -0.228565f, + -0.389061f, 0.391084f, 2.322438f, 0.075009f, 0.225743f, -0.198808f, + -0.280538f, -0.173939f, -0.120543f, -0.070792f, -0.417187f, -0.781056f, + -0.102756f, -1.760965f, 0.019149f, -0.867342f, 0.347141f, 0.031588f, + 0.302572f, -0.203573f, -0.357320f, -0.096078f, -0.527528f, 0.046699f, + -0.108561f, -0.167077f, -2.851509f, -0.307116f, 0.202720f, -0.160280f, + -0.215525f, 0.064355f, -0.427220f, 1.516230f, 0.634453f, 0.099400f, + -1.013887f, -0.029740f, -0.093426f, -0.044272f, -1.297636f, -0.237614f, + -0.160953f, 0.399036f, -0.030685f, -0.113619f, -0.184704f, 0.040519f, + -0.588252f, -0.210235f, -0.067623f, -0.031841f, -0.107261f, -0.192582f, + -0.253959f, -0.430821f, -0.103184f, -0.280185f, -0.357723f, 0.197761f, + -0.175087f, -0.055171f, 1.642014f, -0.192559f, -0.288147f, 0.610311f, + 4.688195f, -0.128728f, -0.914869f, -0.108286f, 0.013789f, 0.092125f, + 0.019770f, -0.178386f, 0.074164f, -1.152658f, -0.216738f, -0.277286f, + 0.012381f, 0.418259f, -0.680727f, -0.221917f, -0.485946f, 0.101672f, + 2.009457f, 0.054302f, 1.019838f, -0.116170f, 0.165134f, -0.112567f, + 0.852632f, -0.385796f, -0.108666f, 0.053181f, -0.311797f, -0.372875f, + -0.675717f, 2.409268f, -0.514720f, -0.214245f, -0.646596f, 0.009756f, + 0.203993f, 0.093617f, -0.301290f, 0.253551f, -0.128909f, -1.448442f, + -0.186823f, -0.278001f, -0.294993f, -0.176928f, -0.473605f, 0.062049f, + -0.212084f, -0.137326f, 0.012505f, 0.087850f, -0.200413f, -0.394119f, + -0.132224f, 0.146917f, 0.155746f, 0.198725f, -0.322541f, 0.196391f, + -0.945500f, 0.036736f, -0.155646f, -0.677341f, 1.130545f, -0.339554f, + 0.411628f, -0.355813f, -0.249843f, 0.213694f, -2.035607f, 0.055694f, + -0.111669f, 0.408696f, -0.067043f, -0.048182f, 0.398110f, -0.067542f, + 1.459801f, 0.236833f, -0.178806f, 0.168758f, 0.492387f, 0.099691f, + -0.776680f, -0.172865f, 0.204225f, 0.193982f, 0.575685f, -0.062248f, + 0.011486f, 0.058571f, -0.493391f, 0.026893f, -0.900467f, 3.793129f, + -0.634613f, -0.064660f, -0.048262f, 0.361905f, 0.033641f, 0.245171f, + -0.064671f, 0.034954f, 0.204358f, -0.904023f, -0.052714f, -0.250134f, + 0.136700f, 0.000734f, -0.371720f, 0.226483f, 0.217958f, 0.060559f, + 0.180111f, 0.000970f, 0.079556f, -0.096775f, 0.093855f, -0.026224f, + -0.243664f, 0.004290f, 0.123281f, -0.239476f, 1.230374f, -0.107826f, + -0.101982f, -0.153917f, 5.464427f, 0.304375f, -0.809957f, 0.090564f, + -0.278416f, -0.245555f, -2.078421f, 0.243093f, -0.127666f, 0.052451f, + -0.126662f, -0.783505f, 0.025149f, -1.422675f, -0.207769f, -0.362547f, + 0.115310f, 0.133390f, 1.264754f, -0.027055f, -0.485312f, -0.240717f, + -0.239722f, 0.146818f, -1.265043f, -0.235553f, 0.267104f, -0.021357f, + -0.435949f, -0.309371f, 0.049920f, 1.302721f, -0.233978f, -0.097551f, + -0.240631f, -0.287821f, -0.378380f, -0.273131f, -3.075169f, 0.226404f, + -0.029361f, 2.703590f, -0.430659f, 0.067927f, -0.387520f, -0.370630f, + -0.229236f, 0.085653f, -0.370956f, -0.065556f, -0.187859f, 0.068309f, + -0.109299f, -0.259898f, -0.103644f, -0.271199f, -0.209350f, 
0.140993f, + -0.196713f, -0.135508f, -1.423209f, -0.406385f, -0.019956f, -0.864694f, + 5.963707f, -0.201157f, 0.726377f, -0.011076f, 0.010553f, -0.102918f, + -2.230088f, -0.258098f, -0.039547f, -0.029262f, -0.082324f, -0.860222f, + -0.094735f, -1.381839f, 0.587298f, -0.173048f, 0.721360f, 0.241900f, + 0.764302f, -0.023609f, -1.173755f, 0.103912f, -0.185363f, 0.078435f, + -2.245062f, -0.127269f, 0.202234f, 0.158975f, -0.260909f, 0.098608f, + -0.348247f, 1.732502f, -0.412298f, -0.269602f, -0.425771f, -0.146243f, + -0.530730f, 0.125716f, -1.004419f, 0.145109f, -0.059289f, 1.096304f, + 0.012891f, 0.045033f, -0.306875f, 0.003514f, -0.176110f, 0.037544f, + -0.441537f, -0.518921f, -0.262149f, -0.060407f, -0.379419f, -0.141245f, + -0.128894f, -0.176537f, -1.161318f, -0.249100f, -0.118330f, 0.042816f, + 1.173404f, 0.088312f, -0.393568f, -0.175134f, 6.529819f, -0.326652f, + -0.631917f, -0.393476f, 0.057781f, -0.217748f, -1.781139f, -0.012614f, + -0.212621f, -0.720322f, -0.218498f, -0.388556f, -0.254796f, -0.248399f, + -0.608744f, -0.265146f, 0.238517f, 0.066882f, -2.916806f, 0.054642f, + 0.282590f, 0.075248f, 0.010188f, -0.133486f, 0.985945f, -0.045849f, + -0.347564f, 0.057320f, -0.417920f, 0.063664f, 0.387062f, -2.692059f, + -0.535549f, 0.263736f, 0.327889f, -0.070273f, -0.775254f, 0.147250f, + 3.309425f, -0.212191f, -0.067204f, -2.912663f, -0.061496f, 0.084233f, + 0.022907f, 0.138421f, -0.112159f, -0.288447f, -0.010799f, 0.056049f, + -0.036527f, 0.021525f, 0.106649f, -0.291883f, 0.088424f, -0.057773f, + -0.086031f, 0.015277f, -0.318505f, -0.269049f, -1.008913f, -0.224785f, + -0.025820f, -0.649037f, 0.706381f, 0.096410f, 0.643776f, -0.046743f, + -0.009654f, -0.024246f, 1.469255f, -0.183536f, -0.370046f, -0.048442f, + -0.376527f, -0.431264f, -0.245109f, -0.093951f, 0.203683f, -0.099872f, + 0.087210f, 0.160692f, -3.527694f, -0.068891f, -0.228994f, -0.231817f, + -0.241949f, 0.193613f, 0.979597f, -0.091259f, 0.414424f, -0.047341f, + -0.209582f, -0.295134f, -0.016824f, 0.460327f, -0.072671f, 0.246234f, + 0.235896f, 0.127238f, -1.068683f, 0.035648f, 2.254888f, 0.180105f, + -0.260098f, -2.322120f, -0.184249f, -0.314801f, -0.099969f, -0.272117f, + -0.237916f, 0.031103f, -0.274063f, -0.049384f, -0.044917f, 0.102477f, + -0.342148f, -0.257558f, -0.346300f, 0.115333f, -0.115456f, 0.208354f, + -0.359301f, -0.167395f, 1.146514f, -0.177861f, -0.098658f, -0.444570f, + 6.759993f, -0.369772f, -0.831118f, 0.001866f, -0.073298f, -0.072095f, + 0.811902f, -0.431997f, -0.286587f, -0.269500f, 0.111492f, -0.525364f, + -0.351785f, -2.463474f, -1.852659f, 0.135325f, 0.138267f, 0.100643f, + -2.373278f, -0.285514f, -0.395388f, -0.185016f, -0.030249f, -0.005767f, + -0.716424f, -0.031674f, 0.011147f, 0.057405f, -0.215873f, -0.094401f, + 0.573528f, -1.223820f, 0.414852f, -0.059053f, -0.076488f, -0.287168f, + -0.842640f, 0.174084f, -0.567186f, 0.336629f, -0.062514f, 2.075448f, + -0.061680f, -0.131529f, -0.098994f, -0.204111f, -0.347865f, 0.108516f, + -0.049616f, -0.069212f, -0.273935f, -0.096545f, -0.210784f, -0.284698f, + 0.141501f, -0.176924f, -0.361341f, -0.251197f, -0.286694f, 0.245569f, + -1.521661f, -0.122639f, -0.015760f, -0.718912f, 5.877828f, 0.146916f, + 0.151767f, 0.220785f, -0.032298f, 0.230902f, 0.663943f, -0.252613f, + 0.057718f, -0.436038f, -0.323994f, -1.139787f, -0.042489f, -1.326298f, + -1.031206f, -0.104136f, 0.389897f, 0.127602f, -2.667789f, -0.212366f, + -0.506262f, -0.009115f, -0.213202f, 0.076167f, -1.629405f, 0.055129f, + 0.375393f, -0.150272f, -0.241515f, -0.326497f, 0.100069f, 0.410703f, + 0.340622f, 
0.042437f, -0.349945f, 0.041176f, -1.178950f, 0.030992f, + 0.933908f, -0.035844f, -0.098660f, 1.030584f, -0.092043f, -0.355739f, + -0.305562f, 0.036161f, -0.049558f, -0.033225f, -0.403856f, -0.088276f, + 0.215493f, -0.149105f, -0.013363f, 0.025886f, -0.101306f, -0.205781f, + -1.072487f, -0.076019f, 0.077555f, 0.131003f, 1.267763f, -0.008954f, + -0.327617f, -0.246539f, 6.664081f, -0.404403f, -1.442489f, 0.191301f, + -0.336361f, 0.181156f, 0.833108f, 0.007879f, -0.194464f, -1.029408f, + -0.036268f, -0.927110f, -0.379190f, -0.293443f, -1.848579f, -0.242548f, + -0.065990f, 0.203160f, -0.291788f, 0.000680f, 0.587011f, -0.241289f, + 0.037034f, 0.000552f, 1.072308f, -0.387230f, -0.230050f, 0.292322f, + -0.720001f, 0.034109f, -0.467260f, 2.211644f, -1.839191f, -0.048797f, + -0.083469f, -0.334686f, -0.269056f, 0.051295f, 1.319904f, -0.035603f, + -0.018457f, -0.824915f, -0.212285f, -0.230516f, -0.035093f, -0.400843f, + -0.305469f, -0.099011f, 0.014225f, -0.452772f, 0.170331f, -0.389312f, + -0.115084f, -0.014770f, -0.429387f, -0.155961f, -0.568200f, -0.037853f, + -0.125137f, 0.067228f, -1.329271f, -0.117874f, -0.132499f, -0.218376f, + -0.588325f, -0.320024f, 0.085695f, -0.235047f, -0.217790f, 0.103015f, + -0.698644f, 0.017766f, -0.058299f, 0.199411f, -0.122485f, -0.563949f, + -0.349011f, -0.557045f, -0.131165f, 0.002281f, 0.118559f, -0.210302f, + -1.153815f, 0.116738f, -0.236007f, -0.003487f, -0.006885f, -0.244816f, + 0.953222f, 0.093748f, 0.266869f, 0.241869f, -0.860832f, -0.387012f, + -0.338986f, 2.097515f, -1.942512f, -0.298021f, 0.543911f, -0.043214f, + 0.082125f, -0.120242f, 0.712231f, 0.213327f, -0.301687f, -0.544011f, + -0.392131f, 0.004302f, 0.004825f, -0.317440f, -0.107518f, -0.293407f, + -0.159111f, -0.080367f, 0.132663f, -0.017726f, -0.237521f, -0.190297f, + -0.361633f, 0.200518f, -0.538296f, -0.027975f, -0.381704f, -0.016963f, + 0.630105f, -0.190997f, -0.287840f, -0.603488f, 3.605598f, -0.276614f, + -1.346383f, 0.186912f, -0.047575f, -0.189232f, -1.519072f, 0.097816f, + -0.223722f, 0.304924f, -0.213022f, -1.052433f, -0.322283f, -1.706734f, + -2.458027f, 0.237976f, 0.171050f, -0.103139f, -0.278689f, 0.329824f, + -0.262448f, -0.122916f, -0.236398f, -0.013848f, -0.969160f, -0.374907f, + 0.091018f, -0.386471f, -0.723940f, 0.064956f, -0.057652f, 1.321024f, + -1.397418f, -0.143136f, 0.272468f, -0.030749f, 0.037324f, 0.069316f, + -0.904925f, -0.333693f, -0.117709f, 2.279598f, -0.428065f, -0.131157f, + -0.014288f, -0.402862f, -0.666090f, 0.017070f, -0.028333f, 0.002481f, + 0.197156f, -0.038120f, -0.271062f, -0.188275f, -0.021370f, -0.070849f, + -0.905007f, -0.095886f, -0.093055f, -0.121821f, -1.239812f, -0.411799f, + -0.089948f, -0.936827f, 1.437569f, -0.388908f, 0.126170f, 0.186162f, + -0.018819f, -0.138364f, -1.066412f, -0.138222f, -0.022186f, 0.107331f, + -0.230436f, -1.352605f, -0.161323f, -1.081810f, -0.933825f, -0.136675f, + 0.378157f, 0.113377f, -0.850610f, 0.080245f, -0.087305f, -0.002852f, + 0.044408f, -0.188172f, -1.891998f, 0.092189f, 0.125325f, -0.105090f, + -0.848510f, -0.396308f, -0.384130f, 2.007509f, -1.480787f, -0.126946f, + 0.314767f, 0.000195f, -0.285628f, -0.110442f, -0.293948f, 0.258559f, + -0.417603f, 1.570705f, 0.092459f, -0.340974f, -0.284754f, -0.007801f, + -0.324610f, -0.004734f, -0.207716f, -0.057175f, 0.055467f, -0.210830f, + -0.113005f, -0.299177f, 0.068074f, 0.017929f, -2.897598f, -0.260074f, + -0.014422f, -0.206467f, 1.246997f, -0.372863f, -0.214160f, -0.114035f, + 5.805862f, 0.003611f, -1.340990f, -0.021085f, -0.260431f, -0.002720f, + -1.251640f, -0.353531f, 
-0.304009f, -0.153376f, +}; + +static const float av1_ab_partition_nn_bias_32_layer1[LABEL_SIZE] = { + -0.521497f, -1.061572f, -0.078756f, -0.660662f, -0.403741f, -0.960163f, + 0.001427f, 0.523607f, 0.225068f, -0.055273f, 1.019519f, 1.181880f, + -0.010198f, 0.130597f, 1.276752f, 2.028188f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_32_layer0, + av1_ab_partition_nn_weights_32_layer1, + }, + { + av1_ab_partition_nn_bias_32_layer0, + av1_ab_partition_nn_bias_32_layer1, + }, +}; + +// nn model for ab partition pruning, 16x16. +static const float av1_ab_partition_nn_weights_16_layer0[FEATURE_SIZE * 64] = { + 0.151902f, 0.007947f, -1.788454f, 0.431869f, -2.971387f, 0.923566f, + 1.632542f, -1.665136f, -0.338632f, -5.075884f, 0.398267f, 0.030467f, + 2.263534f, -0.045532f, -1.066128f, 0.915139f, -0.560500f, -3.293125f, + 2.072793f, -1.011414f, 0.122716f, -0.060169f, -0.388860f, 0.031019f, + -0.381861f, 0.001551f, -0.328472f, 0.038296f, -0.060398f, -0.375556f, + 0.209226f, 0.014764f, -1.443469f, -0.345486f, 2.409269f, 1.524846f, + -0.640666f, 1.322139f, -2.074771f, -0.580944f, -0.203960f, -0.072893f, + 0.329701f, 0.115339f, -1.339542f, 0.249024f, -0.421545f, -0.409151f, + -0.258293f, 0.836288f, -0.073685f, -0.009624f, 0.895712f, 0.320639f, + 0.451002f, -1.544558f, 0.193709f, -1.389012f, 1.305451f, 0.089795f, + 0.050338f, -0.017433f, -0.304667f, 0.500729f, 0.504346f, 0.073757f, + 0.582649f, -0.993623f, 1.766766f, -3.067265f, -0.415774f, -0.006036f, + -1.245281f, 0.253205f, -0.591245f, -0.626238f, 0.551852f, 0.593755f, + 0.491023f, 1.099384f, -0.348448f, 0.054564f, -0.451422f, -0.375781f, + -0.248390f, -0.052548f, -0.380069f, -0.165391f, -0.297968f, -0.052142f, + -0.316381f, -0.045246f, -0.243905f, -0.034169f, -0.247523f, -0.180773f, + 0.068066f, -0.374920f, 0.057536f, -0.189748f, 0.058375f, -0.267749f, + -0.147286f, -0.246153f, 0.006183f, -0.202029f, -0.059128f, 0.116852f, + 0.134719f, -0.126900f, -0.064646f, -0.196458f, -0.182331f, 0.108029f, + -0.264499f, 0.155816f, -0.107255f, -0.056983f, -0.209771f, -0.099070f, + 0.007313f, -0.254124f, -0.231964f, -0.275972f, 0.032098f, -0.264564f, + -0.208743f, 0.155599f, -0.121511f, -0.156145f, -0.162315f, -0.059788f, + -0.257073f, -0.076654f, -0.110616f, -0.321675f, -0.051952f, 0.006301f, + -0.154114f, 0.017032f, -0.017364f, -0.233247f, 0.009918f, -0.179289f, + -0.190722f, 0.147106f, -0.063910f, -0.396872f, -0.263123f, -0.003850f, + -0.040718f, -0.324699f, 0.118660f, -0.170727f, -0.316788f, 0.100886f, + -0.202842f, 0.045371f, 0.150561f, -0.057054f, -0.308150f, 0.028346f, + -0.381473f, -0.195365f, 0.026221f, -0.281795f, 0.087204f, 0.047689f, + -0.027643f, -0.104724f, -0.089030f, -0.117661f, -0.349160f, 0.056982f, + -0.340273f, 0.048086f, 0.046103f, -0.121527f, 0.021697f, 0.054109f, + -0.002768f, -0.008461f, -2.297240f, 0.124651f, 3.621661f, -0.057120f, + -1.151656f, 2.296894f, -3.678720f, -0.290240f, 0.087683f, -0.186389f, + 0.007656f, -0.090236f, -0.245217f, 0.110389f, -0.251719f, -0.029084f, + -0.128203f, -0.100005f, -0.032779f, 0.007281f, -0.366596f, -0.267870f, + -0.215620f, 0.047687f, 0.010303f, 0.097980f, -0.191569f, -0.341162f, + 0.119249f, 0.026279f, -2.161546f, 0.459591f, 1.290566f, 1.791797f, + -0.409835f, 0.127081f, -1.156367f, 0.198286f, 0.099561f, -0.067445f, + -0.034352f, 0.017966f, -0.277380f, -0.057220f, -0.174198f, -0.014164f, + 0.146090f, -0.357530f, 0.097644f, -0.000932f, 
0.446603f, -0.066793f, + 2.448620f, 0.937617f, -1.232922f, 0.313183f, 0.816827f, -0.275115f, + -0.245205f, -0.126895f, 0.156668f, -0.186977f, -0.273505f, 0.013315f, + 0.168629f, -0.089084f, 0.006166f, -0.116107f, -0.199316f, -0.024010f, + -0.242303f, 0.011612f, -0.218485f, -0.229661f, -0.123922f, 0.136699f, + 0.006732f, -0.148718f, -0.164225f, 0.116063f, 1.587898f, 0.690519f, + 0.360566f, 0.009739f, -0.678702f, -0.046003f, 0.126984f, 0.605212f, + 1.240663f, -0.000228f, -1.119369f, -0.415589f, -0.721003f, 0.097936f, + -1.410586f, -2.358833f, -2.773129f, -3.983361f, -0.087144f, -0.050029f, + -0.242255f, 0.137424f, -0.307490f, -0.084637f, -0.023812f, -0.196582f, + -0.078695f, 0.038257f, -0.012110f, -0.263521f, 0.009839f, -0.109125f, + -0.226036f, 0.060712f, 0.093671f, 0.153143f, 0.039116f, -0.290891f, + 0.227057f, -0.204633f, -0.207539f, -0.148242f, 0.046204f, -0.231268f, + -0.209315f, -0.307579f, -0.436556f, 0.023475f, 0.131793f, -0.038301f, + 1.650584f, 0.392570f, 1.446576f, 1.254380f, -0.516867f, -0.057116f, + 0.149320f, 0.414424f, -0.246309f, 0.003877f, -0.480238f, -1.037035f, + -0.830779f, -1.122244f, -0.408267f, -0.253956f, 0.382005f, 0.940609f, + -1.113370f, -0.018554f, 0.141064f, -0.182504f, 1.270707f, 0.414904f, + -0.216036f, 0.203831f, 0.450716f, -0.452909f, 0.139358f, -0.027143f, + 1.956892f, 1.643732f, -0.867839f, -0.620520f, -0.334607f, -0.519982f, + 0.205023f, 0.661159f, -0.000809f, 0.049033f, -0.348579f, -0.200338f, + -0.362144f, -0.346590f, -0.230096f, 0.180746f, -0.149954f, -0.253429f, + -0.378170f, -0.040724f, -0.041597f, 0.243659f, -0.472181f, 0.015401f, + -0.180376f, 0.153139f, -0.247738f, -0.010485f, -0.157158f, 0.016825f, + -0.238925f, -0.265798f, -0.318374f, 0.142352f, -0.210520f, 0.051928f, + -0.352190f, -0.179052f, -0.185498f, 0.025540f, -0.111667f, -0.235187f, + -0.215454f, 0.010931f, -0.238372f, -0.126659f, 0.075691f, -0.091167f, + -2.462379f, -0.007950f, -0.637990f, 0.285554f, -0.051275f, 0.282279f, + -0.744083f, -0.570646f, 0.592198f, 1.421332f, -0.256027f, -0.140315f, + 0.160247f, -0.063185f, -0.055895f, -0.199864f, -0.287353f, -0.074561f, + -0.071228f, 0.055864f, -1.084764f, -0.263409f, 0.779266f, 0.228187f, + 0.375013f, 0.121204f, -0.656948f, 0.533561f, 0.272671f, -0.015423f, + -0.124180f, -0.009127f, 2.934838f, -0.150998f, 1.163152f, 0.081997f, + -4.715939f, -3.676595f, -1.524886f, -0.167593f, 0.281186f, 0.024046f, + -1.451709f, 0.332558f, 0.990504f, 0.376290f, -1.466773f, -0.448439f, + -2.929108f, -4.255188f, 0.065238f, 0.019950f, 1.372393f, 0.444052f, + -2.538772f, 1.579767f, -0.464911f, -1.866114f, 1.053958f, 0.434467f, + -0.125964f, 0.034671f, 0.077116f, -0.138466f, -0.413395f, -0.223453f, + -0.172127f, -0.251265f, -0.048239f, -0.395519f, 0.023141f, 0.037459f, + -0.249593f, -0.062215f, -0.047209f, -0.435189f, -0.164155f, -0.077590f, + -0.241164f, -0.126128f, -0.038243f, -0.180888f, 0.198840f, -0.328036f, + -0.169790f, 0.036506f, 0.052572f, -0.183570f, -0.073617f, -0.244959f, + 0.266498f, 0.032846f, -1.902106f, 0.486078f, 2.414993f, 0.975182f, + -0.382875f, 1.647810f, -2.197017f, -0.890107f, 0.221287f, 0.010889f, + 3.817042f, 0.572728f, 0.092466f, 0.473337f, -1.634659f, -1.069455f, + 1.486776f, -1.023850f, 0.088184f, 0.008842f, 0.518202f, 0.270259f, + 1.757191f, -0.121839f, -2.912229f, -1.250866f, -2.381808f, 0.335309f, + -0.120079f, -0.061294f, -0.058725f, -0.315169f, -0.262443f, 0.072434f, + -0.267836f, -0.319354f, -0.274975f, 0.068970f, -0.406467f, 0.044074f, + -0.152311f, -0.333656f, -0.228355f, -0.185613f, 0.017346f, -0.177674f, + -0.090675f, 
-0.102047f, -0.011768f, -0.025280f, -0.271661f, 0.098099f, + -0.312272f, -0.222217f, -0.100548f, 0.106260f, -0.034655f, 0.135109f, + -0.021276f, 0.018177f, -0.353097f, -0.011128f, 0.061136f, -0.511662f, + -0.223236f, -0.308841f, 0.118789f, -0.154628f, -0.053178f, -0.055973f, + 0.013175f, -0.368337f, -0.090863f, -0.116920f, 0.178990f, -0.025278f, + -0.190553f, -0.238092f, 0.303943f, -0.024944f, 0.719373f, 0.384332f, + -0.378480f, -0.423316f, 0.709922f, 0.758514f, -1.559023f, -2.503173f, + 0.068652f, -0.234741f, -0.182932f, 0.037878f, 0.020684f, -0.174142f, + -0.182300f, -0.052796f, -0.219145f, 0.113028f, -1.041826f, 0.035317f, + 0.919904f, -0.676011f, 0.652297f, 1.456447f, -0.166904f, -0.861823f, + 0.895827f, 0.429821f, -0.180376f, -0.076587f, -0.273945f, -0.288990f, + -0.206692f, -0.080745f, -0.085444f, 0.186953f, -0.050135f, 0.044243f, + -0.391706f, -0.160498f, -0.292268f, 0.164060f, 0.412649f, 0.211611f, + -0.327294f, -0.919399f, 0.320297f, 0.385284f, -0.088848f, -0.072556f, + -0.384813f, -0.176267f, -0.065918f, 0.134724f, -0.231104f, -0.337707f, + -0.195442f, -0.263569f, 0.098090f, -0.341411f, -0.189211f, -0.439276f, + -0.404046f, 0.262491f, -0.311093f, -0.086454f, -0.013400f, -0.061447f, + -0.026945f, -0.112036f, -0.322985f, 0.078500f, -0.230205f, -0.344535f, + -0.021087f, 0.110220f, -0.128671f, 0.044219f, +}; + +static const float av1_ab_partition_nn_bias_16_layer0[64] = { + 2.936406f, -0.396539f, -0.110456f, -1.254954f, 0.785350f, 0.516290f, + -0.172341f, 0.254386f, -0.192465f, -0.106751f, -0.055518f, -0.094994f, + 0.000000f, -0.065018f, -0.004908f, -0.130483f, -0.119580f, -0.142072f, + 0.457446f, -0.125051f, -0.107712f, 0.714607f, -0.140809f, -1.788650f, + -0.087199f, 0.000000f, -1.290050f, 0.443930f, -0.110634f, -0.109380f, + -0.188213f, -1.414179f, 1.193579f, 0.388775f, -0.873193f, -0.110050f, + -0.072565f, -0.117050f, -0.119132f, 0.456959f, -0.132069f, 0.131974f, + 1.160474f, 1.746465f, 0.442628f, -0.188849f, -0.207794f, -0.108364f, + -0.856655f, -2.141620f, 0.335476f, -0.105508f, -0.212162f, -0.109319f, + -0.237213f, -0.109980f, -0.291044f, -0.137877f, 0.470191f, -0.023908f, + 0.123809f, -0.109797f, 0.200510f, -0.147542f, +}; + +static const float av1_ab_partition_nn_weights_16_layer1[64 * LABEL_SIZE] = { + -6.823716f, 1.406568f, -0.144009f, 2.228765f, 0.838336f, 0.738107f, + -0.319014f, -0.148756f, 0.240862f, -0.111089f, -0.004241f, 0.025758f, + -0.193820f, -0.246362f, -0.181363f, -0.201556f, 0.024268f, 0.252994f, + -0.289443f, 0.194932f, 0.057467f, 0.724735f, 0.014063f, 1.361352f, + 0.025191f, 0.024274f, 0.231462f, -7.227959f, -0.094515f, 0.039946f, + 0.412719f, 0.812318f, 3.038903f, -0.286289f, 0.647482f, -0.115114f, + 0.053590f, 0.066069f, 0.153134f, 0.996250f, -0.125700f, 0.951365f, + -6.243494f, -4.827697f, 0.566320f, 0.239515f, -0.099702f, 0.054546f, + 1.847330f, 3.680076f, -3.049829f, -0.127709f, 0.068469f, -0.017794f, + 0.223864f, -0.106778f, -0.020425f, -0.040226f, -0.251890f, -0.168673f, + -0.552073f, 0.043311f, 0.218668f, 0.033209f, -3.199210f, 0.193079f, + 0.321406f, 0.718307f, -0.181418f, -0.459612f, -1.981170f, 0.968496f, + -0.029757f, -0.130065f, 0.043782f, 0.072394f, -0.088686f, 0.025322f, + 0.129882f, 0.101324f, 0.335707f, 0.072714f, -2.079774f, 0.203997f, + 0.239321f, -0.301757f, 0.257845f, 1.288382f, -0.031275f, -0.234194f, + 0.310722f, 2.045469f, 0.034716f, 0.135638f, -0.251388f, 0.320071f, + -1.065301f, -0.322731f, -0.545028f, 0.226276f, 0.090799f, 0.019289f, + 0.048950f, -1.079300f, 0.231938f, 0.083683f, 4.762127f, 0.145037f, + -0.145549f, 0.075592f, 
0.172336f, 0.108175f, 0.333751f, 1.090501f, + 1.056114f, 0.047073f, 0.182052f, -0.081587f, 0.089900f, 0.339286f, + 2.049988f, 0.073585f, 0.537355f, -0.243322f, -0.010179f, -0.052601f, + -0.174915f, 0.117793f, 2.222990f, -2.520837f, -0.092699f, 1.199887f, + 0.138720f, 0.679918f, -0.463155f, -0.659496f, -0.109913f, -0.003398f, + 0.114633f, -0.128377f, 0.092970f, -0.107489f, -0.191078f, 0.185182f, + 0.216980f, -0.019343f, 3.443133f, 0.287953f, 0.099314f, 0.985958f, + 0.157268f, -0.606516f, 0.049418f, -0.221809f, -0.453081f, -0.344796f, + -0.003735f, -0.107269f, -0.128541f, -0.259543f, -0.934806f, -0.542456f, + -1.011192f, 0.022795f, 0.186363f, -0.076356f, -0.050932f, -0.165098f, + 0.168177f, -0.101596f, -5.270886f, 2.553943f, -0.440870f, -0.017494f, + 0.215208f, -0.017032f, 1.495915f, -4.304677f, 0.762211f, 0.182937f, + 0.254406f, -0.029433f, -0.088364f, -0.110160f, -0.108257f, -0.036538f, + 0.737697f, -0.234989f, 0.168095f, 0.245118f, -0.077262f, 0.195718f, + 0.753302f, -1.637869f, 0.126227f, 0.982129f, -0.121444f, -0.295570f, + -1.215799f, 0.147867f, -0.068496f, 0.132726f, -0.005772f, -0.181774f, + 0.126513f, 0.204723f, -0.366123f, 0.103906f, -0.148053f, -0.075272f, + 0.243884f, -0.104828f, 0.198988f, 0.501034f, -0.112671f, 0.111421f, + 0.167508f, -0.117803f, -0.738624f, 2.046292f, 0.124011f, 0.057983f, + -0.359154f, -0.648883f, -0.259462f, -0.459041f, -2.501223f, -0.065138f, + 0.122417f, 0.060291f, -0.129033f, -0.843086f, 0.268241f, -0.399927f, + 1.585888f, 1.816393f, -0.631427f, 0.127826f, 0.088105f, 0.073488f, + 0.717694f, -1.497362f, 2.608528f, 0.066896f, -0.079230f, 0.223436f, + -0.010530f, 0.175310f, 1.120365f, 0.034391f, 0.835312f, 0.071652f, + -0.080615f, 0.111395f, 0.162742f, 0.079927f, -3.859582f, -0.638431f, + -0.167880f, -0.992659f, -0.885355f, -1.276197f, 1.334344f, 0.931940f, + -0.078244f, -0.149030f, -0.070974f, -0.133566f, 0.200034f, 0.102793f, + -0.048546f, 0.063545f, 0.023864f, -0.190863f, 1.934257f, -0.136286f, + -0.107916f, -0.637468f, 0.066449f, 1.089693f, -0.214047f, -0.265780f, + 0.899660f, -0.130333f, 0.288311f, -0.049024f, 0.090202f, 0.487969f, + 0.339704f, 0.858479f, 0.841253f, -0.184100f, -0.637070f, -0.125071f, + -0.077650f, -0.087877f, 0.202268f, -0.027300f, 2.842862f, -0.100698f, + -0.259080f, 0.260556f, 0.157912f, -0.070364f, 0.467190f, 1.200037f, + 1.419317f, -0.033588f, -0.227824f, 0.292617f, 0.228574f, 0.213839f, + -1.091099f, -0.022258f, -1.294681f, 0.136118f, 0.081652f, -0.185359f, + -0.039706f, 0.191407f, -2.053219f, -0.261934f, 0.047812f, -0.029536f, + -0.823869f, -1.090534f, -0.755890f, 0.441035f, -0.167945f, 0.231441f, + -0.135013f, -0.260762f, 0.256872f, 0.130339f, -0.243751f, 0.189760f, + -0.288454f, 0.145363f, 0.338490f, 0.403898f, -0.022814f, -1.263598f, + -0.101315f, 0.860135f, 0.136511f, 0.028942f, 0.574047f, 2.656370f, + 0.037587f, -0.188690f, -0.125312f, 1.100435f, -1.080402f, 0.380905f, + 0.004635f, 0.097144f, -0.214309f, 0.085552f, -0.285066f, -0.705134f, + -0.054704f, -0.319951f, 5.486626f, 0.958158f, -1.380585f, 0.223340f, + -0.169167f, -0.170697f, -0.216748f, 0.324232f, 2.684204f, -0.008490f, + -0.211052f, -0.201190f, 0.123466f, -0.000234f, 0.579907f, 0.096938f, + -0.042745f, 0.201855f, 0.157195f, -0.261440f, 0.029699f, -0.046599f, + 1.618216f, -2.596280f, -0.377420f, -0.526725f, -0.493592f, -0.579615f, + 0.579699f, -0.100392f, 0.150694f, 0.061794f, 0.200425f, -0.062515f, + -0.179122f, 0.250112f, -0.344675f, -0.118359f, -0.095670f, 0.152311f, + 3.662276f, -0.154921f, -0.312991f, 0.972008f, -0.308596f, -0.190426f, + 0.133889f, -0.238673f, 
-0.094726f, 1.683835f, -0.215629f, -0.198890f, + -0.035278f, -0.367973f, -0.822435f, 0.240848f, -0.194656f, 0.034655f, + -0.079424f, 0.146670f, 0.026646f, -0.034507f, 0.059467f, -0.153109f, + -0.431033f, 2.552991f, -1.894091f, -0.180462f, -0.306839f, -0.025648f, + 1.026326f, -3.096230f, 1.346935f, 0.033633f, -0.181827f, 0.094376f, + 0.001696f, -0.379264f, -1.069503f, -0.140972f, -0.208769f, -0.195239f, + 0.281795f, -0.127251f, 0.180776f, 0.067763f, 0.697124f, -1.040779f, + 0.111280f, 0.188351f, -0.340234f, -0.207790f, -0.720075f, -0.137409f, + -0.070310f, -0.032918f, -0.060787f, 0.131484f, -0.077845f, -0.258652f, + 0.056911f, -0.062034f, 0.007663f, -0.185100f, 1.340361f, 0.014096f, + -0.124602f, 0.194241f, 0.128383f, 0.360465f, 0.082979f, -0.050475f, + -0.519294f, 3.323262f, 0.067014f, 0.221203f, -0.085082f, -0.228606f, + -0.916668f, -0.022643f, -1.386737f, -0.131902f, -0.349952f, -0.032874f, + -0.189190f, -0.898790f, -0.102394f, -1.017387f, 2.214050f, 1.790253f, + -1.913561f, -0.043716f, -0.214924f, -0.194598f, -0.064723f, -1.671793f, + 2.251166f, -0.146007f, 0.138527f, -0.003134f, 0.103665f, 0.006928f, + -0.240253f, -0.227464f, 0.578437f, -0.214724f, 0.503085f, 0.158093f, + 0.033091f, 0.008061f, 4.815371f, 2.132264f, 0.281850f, -2.288560f, + -0.145012f, 1.296832f, -0.362401f, -0.403252f, 0.109873f, 0.185746f, + 0.244764f, 0.172367f, -0.185588f, 0.139801f, -0.178254f, 0.068629f, + 0.358488f, -0.153969f, -6.433524f, 0.225983f, -0.138123f, -0.095971f, + -0.036089f, -1.400083f, 0.265908f, 0.257787f, 0.181144f, -1.647228f, + -0.136289f, -0.074206f, 0.122988f, -0.088895f, -1.266717f, 0.006010f, + 0.536681f, 0.263061f, -0.032207f, -0.155136f, 0.086431f, 0.441950f, + -0.060755f, -0.280683f, -0.783475f, -2.567033f, 1.093221f, 0.117667f, + -0.000408f, 0.225719f, -2.199698f, 0.141447f, -1.459051f, 0.051315f, + 0.203228f, 0.354432f, -0.005775f, -0.028073f, -0.965817f, 0.231083f, + -0.666884f, 0.026283f, -0.317486f, 0.210754f, 0.123897f, 0.223827f, + 4.214405f, 1.457334f, -0.253945f, -1.306733f, -0.391235f, 0.451154f, + -1.553888f, -0.353429f, 0.069533f, 0.159278f, -0.173836f, -0.004952f, + -0.137033f, 0.127012f, 0.143600f, 0.051587f, -0.070549f, 0.066509f, + -5.776547f, 0.180021f, -0.189183f, -1.288504f, -0.233575f, -1.473873f, + 0.140940f, 0.144451f, -0.104534f, 2.089873f, -0.168168f, 0.110726f, + 0.132134f, -0.215223f, -1.682754f, 0.157757f, -0.146163f, 0.064882f, + 0.117313f, -0.038780f, -0.124720f, -0.501697f, 0.092047f, -0.233992f, + 3.324976f, 0.516601f, 1.294202f, 0.119989f, 0.061055f, 0.043420f, + -2.750727f, -0.382812f, -0.648496f, -0.115353f, -0.334205f, 0.024354f, + -0.282998f, -0.282705f, 0.073798f, 0.169851f, 0.135651f, 0.182677f, + -0.040220f, 0.132462f, -0.303120f, -0.230113f, 6.165739f, -0.258596f, + 0.024127f, -1.388283f, -0.006042f, 0.572600f, 0.348411f, -0.387376f, + -0.075845f, 0.122319f, -0.029616f, 0.077873f, 0.154763f, 0.049073f, + 0.018597f, 0.102688f, -0.204165f, 0.020734f, -1.389133f, -0.032854f, + -0.147561f, 0.853944f, 0.132100f, -3.259659f, 0.243745f, 0.181529f, + -0.738414f, 1.509994f, 0.023470f, -0.005329f, 0.066115f, -1.345081f, + -1.455402f, -0.172023f, -0.194625f, 0.071885f, -0.201742f, -0.262402f, + 0.077601f, -0.048938f, 0.257993f, -0.504029f, -2.032415f, 1.158880f, + 0.448647f, -0.025633f, 0.117586f, -0.072275f, -0.673744f, -3.854342f, + -0.983843f, 0.047766f, -0.017193f, -0.215775f, -0.158743f, -0.232042f, + -0.509112f, 0.148812f, 0.130122f, 0.006486f, -0.099016f, 0.022514f, + -0.486850f, -0.059623f, 4.012731f, 0.025454f, 0.029059f, -0.783546f, + -0.295260f, 
0.322521f, -0.473201f, -0.172100f, -0.100087f, -0.076516f, + -0.258367f, -0.112897f, 0.269364f, -0.065912f, 0.169022f, -0.178783f, + -0.095114f, 0.122089f, -2.790099f, -0.100431f, -0.087963f, -0.009431f, + -0.087819f, -2.774399f, -0.100757f, 0.013005f, -0.964533f, 3.236665f, + -0.354903f, -0.144169f, -0.166869f, -1.396513f, -0.931271f, -0.046261f, + -1.799262f, -0.365269f, 0.108611f, 0.037994f, 0.024747f, -1.073639f, + -0.203158f, -0.935006f, 1.880891f, 1.578385f, 0.726272f, -0.024546f, + -0.011626f, -0.151363f, -1.121716f, -1.787484f, 0.232806f, 0.075451f, + 0.182899f, 0.092215f, -0.207347f, -0.030111f, 0.054316f, 0.192481f, + 0.594639f, -0.247694f, 0.547471f, -0.032094f, -0.065000f, 0.007198f, + 1.605377f, -0.155945f, -0.066200f, -2.343716f, -1.016283f, -0.079321f, + 0.919365f, 0.599980f, 0.125545f, 0.265813f, 0.246884f, 0.095385f, + -0.260374f, -0.202916f, -0.042770f, 0.234967f, -0.233139f, -0.326994f, + -1.375256f, 0.121766f, 0.077433f, -1.103569f, 0.019497f, -1.029185f, + 0.253905f, 0.206569f, 0.187334f, -0.237089f, -0.294351f, 0.164137f, + 0.149696f, -0.749787f, -0.413433f, 0.976587f, 1.027976f, -0.285264f, + 0.209273f, -0.124762f, 0.050884f, 0.250764f, -0.082031f, -0.646520f, + 4.116680f, 0.437336f, 0.671684f, 0.129509f, -0.078462f, 0.014072f, + -0.678232f, 0.094831f, 1.125624f, 0.207070f, -0.154750f, -0.025780f, + -0.103030f, 0.118019f, -0.908186f, -0.263546f, -1.555324f, -0.236887f, + -0.217854f, -0.051790f, 0.017915f, 0.171001f, 1.355562f, 0.094603f, + -0.233929f, -1.282169f, -0.773183f, -0.161682f, -0.834565f, -0.286776f, + -0.298901f, 0.038162f, 0.251899f, 0.039612f, -0.022935f, -0.232308f, + -0.043855f, -0.192892f, -0.279009f, -0.182234f, -1.272808f, -0.070344f, + -0.092432f, -1.915946f, -0.134373f, -1.405496f, -0.067071f, -0.131922f, + 0.185269f, 1.465082f, 0.040240f, 0.112665f, 0.144329f, -0.286112f, + -0.617649f, 0.916177f, 0.221044f, -0.079867f, 0.170251f, -0.093638f, + -0.212620f, -0.305945f, -0.234356f, -0.482501f, 3.928472f, 1.241179f, + 0.355922f, -0.170848f, -0.189168f, 0.080225f, -1.357793f, 0.190890f, + 0.976800f, -0.068070f, -0.016295f, -0.088623f, -0.129560f, -0.212267f, + -0.071537f, -0.219501f, -0.655198f, -0.225188f, -0.116024f, 0.224174f, + -0.049715f, -0.178005f, 3.029985f, -1.141546f, 0.080066f, -1.932316f, + -0.641137f, -0.189564f, 0.935080f, 0.136119f, 0.015558f, -0.179331f, + 0.204571f, 0.020350f, 0.009362f, 0.108478f, 0.037076f, -0.049009f, + 0.081090f, -0.180202f, 1.455561f, -0.081559f, 0.059361f, 0.484971f, + 0.160923f, -2.170744f, -0.013204f, 0.126561f, -0.407122f, 1.223661f, + 0.044262f, 0.118044f, 0.058274f, -1.747100f, -0.171318f, 0.971374f, + 0.306995f, -0.103268f, -0.319443f, -0.333176f, -0.038608f, 0.119674f, + -0.106479f, -0.907933f, 1.121231f, 1.673840f, -0.421458f, -0.021146f, + -0.254838f, 0.097632f, 0.235109f, -2.901782f, 0.289518f, -0.355459f, + -0.068264f, -0.179121f, 0.068560f, -0.047570f, -0.522523f, -0.228963f, + -1.037158f, -0.163723f, 0.280563f, -0.000868f, -0.197220f, -0.239329f, + 1.985274f, -0.256181f, -0.064341f, -0.822417f, -0.465140f, -0.010942f, + -0.792024f, -0.114290f, 0.060969f, 0.104106f, -0.252123f, -0.150400f, + -0.133277f, 0.267147f, 0.274413f, 0.223744f, -0.180223f, -0.345415f, + -0.104883f, 0.119210f, -0.095041f, -0.301635f, 0.013175f, -2.128121f, + -0.147208f, -0.151509f, -0.692013f, 3.418555f, -0.016541f, 0.171511f, + 0.107159f, -1.516672f, 0.127408f, 0.687035f, -0.906486f, -0.145463f, + -0.169382f, -0.143906f, 0.125091f, -0.960645f, -0.180869f, -0.716908f, + 2.840951f, 1.904919f, -0.416268f, -0.425181f, -0.194697f, 
-0.075932f, + -0.950604f, -1.599800f, 0.943671f, -0.022744f, -0.270492f, 0.080843f, + -0.372916f, 0.047838f, -0.100300f, -0.026600f, 0.011733f, -0.226051f, + 0.172790f, -0.172982f, 0.041258f, -0.299379f, +}; + +static const float av1_ab_partition_nn_bias_16_layer1[LABEL_SIZE] = { + -0.053805f, -1.248639f, 0.520965f, -0.904962f, -0.126425f, -0.118798f, + 0.748430f, 0.203096f, 0.059317f, 0.418219f, 0.841294f, 0.402693f, + -0.658522f, 0.723479f, 0.544264f, 1.035225f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_16_layer0, + av1_ab_partition_nn_weights_16_layer1, + }, + { + av1_ab_partition_nn_bias_16_layer0, + av1_ab_partition_nn_bias_16_layer1, + }, +}; + +#undef FEATURE_SIZE +#undef LABEL_SIZE + +#define FEATURE_SIZE 18 +#define LABEL_SIZE 4 + +static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 24] = { + -2.032866f, 0.056691f, 0.495960f, 0.778785f, 0.548153f, -0.806942f, + 0.481155f, 0.282298f, 0.584980f, 0.504688f, 0.209648f, 0.234616f, + 0.213484f, 0.221969f, 0.205862f, 0.235054f, 0.317863f, 0.257139f, + 0.529478f, 0.098122f, -0.657532f, 0.036296f, 0.327728f, 1.323180f, + -0.813082f, 0.160216f, -0.702030f, 0.722733f, -0.270576f, -0.347416f, + -0.264700f, -0.254248f, 0.159820f, 0.087995f, -0.184163f, 0.117357f, + 0.074194f, -0.667369f, 0.498246f, 0.420506f, 0.072409f, -0.121581f, + 0.315788f, 0.000525f, 0.414986f, 0.678166f, -0.011230f, 0.188131f, + -0.227749f, 0.009564f, 0.108672f, 0.106923f, -0.080695f, -0.279382f, + -0.061339f, -0.297835f, -0.134707f, 0.145865f, -0.009655f, -0.000842f, + -0.047436f, -0.159149f, -0.320353f, -0.089646f, -0.344765f, 0.313416f, + -0.143413f, 0.279668f, 0.000885f, -0.022380f, -0.140194f, -0.310473f, + 0.252699f, 0.066204f, 0.477568f, 0.994609f, -0.276000f, 1.213182f, + 0.277028f, -0.411570f, -0.211559f, 0.377815f, 0.121488f, -0.100559f, + -0.317082f, -0.251039f, -0.335181f, -0.154114f, -0.052726f, -0.332558f, + -0.143196f, -0.334035f, 0.162305f, 0.142279f, -0.001210f, -0.135252f, + -0.033562f, 0.204307f, -0.039757f, -0.394174f, 0.126617f, -0.128648f, + -0.410979f, 0.107641f, -0.117573f, -0.326512f, 0.235166f, 0.084959f, + 0.290063f, -0.005838f, 0.459894f, 1.023709f, -0.196145f, 1.100137f, + -0.319815f, -0.308526f, -0.443389f, -0.272769f, -0.035259f, -0.026932f, + -0.029743f, 0.125113f, -0.131024f, -0.321458f, -0.143996f, 0.008714f, + -0.101234f, 0.079706f, -1.128615f, -0.467381f, 0.220563f, -0.409900f, + -0.435353f, 0.759499f, -0.465799f, -0.394309f, 0.176282f, -0.086275f, + -0.161225f, -0.354814f, 0.562871f, 0.418253f, 0.414361f, 0.445480f, + -0.995903f, -0.086632f, -0.230645f, 0.354656f, -0.317576f, 0.079926f, + 0.424369f, 0.997232f, -0.304388f, 1.071667f, -0.023540f, 0.029677f, + 0.108564f, 0.183581f, -0.201395f, -0.054854f, -0.193039f, -0.049899f, + -0.271949f, -0.358483f, 0.304930f, 0.023823f, -0.009319f, -0.214247f, + 0.100712f, -0.050162f, 0.327103f, -0.212999f, -0.030496f, 0.316380f, + -0.439589f, -0.249959f, 0.229777f, -0.353664f, -0.384559f, 0.114236f, + 0.023119f, 0.007927f, 0.618368f, 0.957759f, -0.019780f, -1.002389f, + 0.564277f, -0.839531f, 1.040445f, 0.054340f, 0.031908f, -0.032893f, + -0.019170f, -0.042011f, 0.568928f, 0.362567f, -0.559999f, -0.605344f, + -0.586146f, -0.290778f, 0.195943f, -0.109580f, -0.088898f, -0.113054f, + 0.293282f, 0.429019f, 0.306136f, 0.863025f, 0.021234f, 0.125770f, + -0.097108f, -0.072659f, -0.137053f, -0.191631f, 0.106281f, 
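+/* All of the NN_CONFIG tables in this file describe small fully connected
+ * nets with a single hidden layer. A minimal sketch of how such a config is
+ * evaluated (ReLU on the hidden layer, linear outputs, layer-0 weights laid
+ * out row-major per hidden node), in the spirit of av1_nn_predict();
+ * nn_forward_sketch is a hypothetical name for illustration, not the
+ * library routine:
+ *
+ *   static void nn_forward_sketch(const float *in, const NN_CONFIG *cfg,
+ *                                 float *out) {
+ *     float hidden[64];  // large enough for every config in this file
+ *     const int nh = cfg->num_hidden_nodes[0];
+ *     for (int n = 0; n < nh; ++n) {
+ *       float v = cfg->bias[0][n];
+ *       for (int i = 0; i < cfg->num_inputs; ++i)
+ *         v += cfg->weights[0][n * cfg->num_inputs + i] * in[i];
+ *       hidden[n] = v > 0.0f ? v : 0.0f;  // ReLU
+ *     }
+ *     for (int n = 0; n < cfg->num_outputs; ++n) {
+ *       float v = cfg->bias[1][n];
+ *       for (int i = 0; i < nh; ++i)
+ *         v += cfg->weights[1][n * nh + i] * hidden[i];
+ *       out[n] = v;  // raw scores; callers apply sigmoid/softmax as needed
+ *     }
+ *   }
+ */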
0.064151f, + 0.029883f, 0.076287f, 0.757543f, 0.276713f, -2.529775f, -0.351727f, + -1.832316f, 0.544780f, -0.944529f, 0.509705f, -0.010236f, -0.016181f, + 0.021520f, 0.086417f, 0.041312f, 0.296853f, -0.372378f, 0.354446f, + -1.366762f, 0.048875f, 0.464918f, -0.007450f, 0.750013f, -0.360261f, + 0.518532f, 0.753776f, 0.641448f, 0.710746f, 0.250866f, 0.257063f, + 0.283421f, 0.253585f, 0.170303f, 0.210426f, 0.208842f, 0.158000f, + -0.033144f, 0.130748f, 0.907147f, 0.409248f, -0.854301f, -0.981307f, + 0.294427f, -0.507137f, 1.079967f, 0.203203f, 0.383890f, 0.368278f, + 0.305122f, 0.449288f, -0.044507f, -0.547263f, -0.298245f, -0.497834f, + 0.007016f, -0.101982f, -0.073488f, -0.096111f, -0.479418f, -0.045497f, + 0.033502f, -0.018578f, -0.231531f, 0.177949f, 0.099564f, -0.010233f, + -0.333055f, -0.078586f, -0.417867f, 0.171271f, 0.013662f, -0.143599f, + -0.117296f, 0.135382f, 0.048321f, 0.000924f, -0.055024f, -0.405595f, + -0.068260f, -0.271011f, -0.436425f, 0.206751f, -0.899890f, 0.605510f, + 0.535649f, -0.238919f, -0.037619f, -0.213734f, -0.391360f, -0.132344f, + 0.004660f, 0.176644f, -1.008475f, -0.038895f, 0.155429f, -0.095229f, + -0.680124f, -0.258063f, -0.261901f, 0.110380f, -0.337649f, -0.505870f, + -1.428536f, 0.610629f, 0.254905f, 0.045098f, 0.044109f, 0.172329f, + 0.060001f, -0.234009f, -0.184855f, -0.153028f, -0.140897f, -0.152006f, + -0.312134f, 0.081261f, 0.160166f, 0.112690f, 0.266081f, 0.030175f, + -0.242746f, 0.000754f, -0.341811f, -0.149774f, -0.017484f, -0.301342f, + -0.121466f, 0.067300f, 0.342176f, 0.474538f, 0.085441f, -0.263935f, + 0.479235f, -0.003713f, -0.784840f, 0.119480f, 0.456632f, -0.640082f, + -0.080575f, -0.744403f, 0.259970f, 0.034667f, -0.274641f, -0.257594f, + -1.121124f, -0.003745f, -0.420693f, 0.300441f, -0.100976f, -1.049016f, + 0.201960f, 0.113054f, 0.187010f, 1.237427f, 0.054803f, -0.028673f, + 0.003596f, -0.034724f, 0.117246f, 0.190977f, 0.278915f, 0.224307f, + 0.017852f, -0.336233f, -0.372311f, -0.182284f, -0.143510f, 0.331466f, + 0.045698f, -0.301095f, 0.184447f, 0.348240f, -0.017021f, -0.145064f, + -0.000221f, -0.382256f, -0.302683f, -0.083927f, -0.008070f, 0.217907f, + 0.647597f, -0.050490f, -0.572736f, -0.985748f, -0.289943f, 0.041391f, + -0.795464f, -0.186680f, -0.354062f, -0.617400f, -0.282783f, -0.170450f, + -0.197197f, -0.146496f, -0.173692f, -0.106277f, -0.071004f, -0.124405f, + -0.971412f, 0.038542f, 0.705204f, 0.887113f, 0.150430f, -0.243676f, + 0.638410f, 0.320953f, 0.776676f, 0.527584f, 0.070389f, 0.051554f, + 0.177519f, 0.140451f, 0.128892f, 0.087771f, 0.197660f, 0.194764f, +}; + +static const float av1_4_partition_nn_bias_16_layer0[24] = { + 0.614063f, -0.384872f, 0.084884f, -0.023980f, -0.378765f, -0.082312f, + -0.458271f, 0.189578f, -0.046169f, -0.073308f, -0.372322f, 0.162793f, + 0.148803f, 0.829214f, -0.221162f, -0.111157f, -0.017484f, -0.280596f, + -0.031905f, -0.143459f, 0.078823f, -0.021940f, 0.026834f, 0.257472f, +}; + +static const float av1_4_partition_nn_weights_16_layer1[24 * LABEL_SIZE] = { + -0.985391f, 0.587616f, 0.740683f, 0.192066f, 0.447080f, -0.016585f, + 0.680449f, 0.028983f, 0.643111f, 0.234338f, 0.107148f, 0.328456f, + -0.216394f, 1.106838f, -0.179062f, -0.129108f, -0.121655f, -0.151340f, + -0.306017f, -0.350989f, 0.859284f, -0.372831f, -0.954419f, 0.250495f, + 1.046732f, 0.287923f, -0.421088f, 0.326613f, -0.314396f, -0.084757f, + -0.474228f, 0.687999f, 0.052334f, 0.441708f, -0.630698f, -0.350348f, + -0.602067f, -0.434161f, -0.489824f, -0.313193f, 0.315568f, 0.603119f, + 0.120245f, 0.182920f, -1.117797f, -0.239594f, 
-0.296296f, -0.718093f, + 0.489497f, -0.527019f, 0.102453f, 0.426731f, 0.034606f, 0.311461f, + -0.012723f, -0.229877f, -0.284290f, 0.383227f, 0.065696f, -0.222400f, + 1.279248f, -0.862190f, 0.629766f, -0.250011f, -0.325060f, -0.360115f, + -0.159540f, -0.291856f, -0.038348f, 0.224639f, 0.600934f, 0.030205f, + 1.337615f, -0.286409f, -0.473710f, -0.418995f, -1.035249f, 0.004359f, + -0.481860f, 0.563625f, -0.154709f, -0.101198f, -0.758796f, -0.507616f, + -0.095253f, -0.711135f, 0.207759f, 0.076313f, -0.056087f, -0.162719f, + -0.232918f, -0.128402f, -0.444620f, -0.447344f, 1.126012f, -1.504446f, +}; + +static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = { + -0.462133f, + 0.465060f, + 0.062211f, + 0.401786f, +}; + +static const NN_CONFIG av1_4_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 24, // num_hidden_nodes + }, + { + av1_4_partition_nn_weights_16_layer0, + av1_4_partition_nn_weights_16_layer1, + }, + { + av1_4_partition_nn_bias_16_layer0, + av1_4_partition_nn_bias_16_layer1, + }, +}; + +static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = { + -0.219494f, -0.428273f, 0.471006f, 0.448210f, -0.152935f, 0.440435f, + 0.922857f, -0.074436f, 1.002195f, 0.414176f, -0.327202f, -0.380066f, + -0.212346f, 0.061868f, -0.056620f, 0.594134f, 0.617995f, 0.308358f, + 0.232484f, 0.129849f, 1.483593f, -0.071460f, 1.984515f, 1.116422f, + -1.141762f, -0.306220f, 0.089075f, -0.271845f, 0.187524f, 0.050396f, + -0.061025f, 0.030809f, 0.172799f, -0.458151f, -0.318357f, 0.122052f, + -0.414329f, 0.089366f, 0.118898f, -0.376213f, -0.206151f, -0.519946f, + -0.463252f, -0.206694f, -0.254383f, -0.379487f, 0.093059f, -0.245280f, + -0.205044f, -0.280060f, -0.171229f, -0.045389f, -0.179481f, -0.306245f, + -0.500856f, 0.003388f, -0.527397f, -0.449330f, -0.174272f, 0.123769f, + 0.023005f, 0.157273f, 0.073400f, 0.019099f, -0.113848f, -0.098601f, + -0.290946f, -0.046770f, -0.314592f, -0.179914f, -0.391411f, -0.235631f, + -1.282604f, 0.048505f, -0.746382f, 0.093740f, -0.706583f, -0.085729f, + 0.947382f, -0.002961f, 1.175362f, 1.007309f, 0.141638f, -0.037608f, + -0.118807f, -0.021474f, -0.146763f, 0.069363f, -0.074372f, -0.215713f, + -0.004134f, -0.114110f, -0.330438f, -0.031136f, 0.111821f, -0.534598f, + -0.357759f, -0.455950f, 0.139469f, 0.036582f, -0.384743f, -0.168828f, + -0.239250f, 0.003520f, -0.049003f, 0.075702f, -0.025809f, -0.225972f, + -0.228905f, -0.412489f, 0.060570f, -0.328819f, -0.206446f, -0.080231f, + -0.372008f, -0.218118f, -0.011954f, 0.024155f, 0.156014f, 0.020679f, + 0.194398f, -0.283491f, -0.024463f, -0.275099f, 0.028031f, 0.026340f, + -0.254668f, 0.103637f, 2.178693f, 0.552284f, 0.109366f, -0.474806f, + -0.379286f, -0.026315f, 2.487924f, -0.089466f, 0.206428f, 0.114578f, + 0.152248f, 0.184050f, -0.631948f, -0.014793f, -0.283782f, -0.830353f, + 0.009343f, -0.021029f, -0.060534f, -0.025164f, 1.841311f, 1.842748f, + -1.979708f, 0.450985f, -1.606357f, -0.785454f, -0.212679f, -0.344342f, + 0.198991f, -0.258070f, 0.055974f, 0.224069f, 0.453051f, 0.408053f, + 0.027873f, -0.180538f, 0.056609f, 0.207654f, 0.104086f, -0.194426f, + -0.359789f, -0.381143f, -0.331212f, -0.203973f, -0.324313f, -0.160825f, + -0.160439f, -0.044856f, -0.346647f, 0.044859f, 0.231398f, -0.023643f, + -0.140316f, -0.260177f, 0.206965f, -0.425386f, -0.420268f, -0.409748f, + 0.006971f, 0.066186f, -0.034950f, -0.345518f, 0.018633f, -0.122489f, + -0.038506f, -0.330942f, 0.161236f, -0.314119f, -0.050202f, -0.179597f, + 0.731897f, 
-0.184481f, 0.153598f, -0.539501f, -0.301493f, -0.184967f, + -0.883754f, -0.586959f, -0.136292f, -1.772065f, -0.196276f, -0.053272f, + -0.101083f, -0.064142f, 0.161190f, 0.430826f, 0.355647f, 0.138266f, + 0.051114f, -0.028893f, -0.477673f, -0.238663f, -0.354117f, -0.056747f, + -0.334273f, -0.497688f, -0.486004f, -0.092033f, -0.241304f, -0.373250f, + 0.120193f, 0.011360f, -0.010475f, -0.092739f, -0.159650f, -0.033129f, + -0.259893f, -0.073217f, 0.200128f, 0.103407f, -0.229233f, 0.128831f, + -0.063450f, -0.241732f, -0.408428f, -0.342239f, -0.264326f, -0.105403f, + -0.442879f, -0.310456f, -0.112881f, 0.263696f, -0.205014f, -0.497936f, + -0.261734f, -0.382312f, -0.426807f, -0.021995f, -0.152794f, -0.301494f, + 0.117232f, -0.577809f, 0.154596f, -0.409522f, -0.413113f, -0.359199f, + 0.307294f, -0.008746f, -0.310522f, 0.347620f, -0.384845f, -0.451398f, + -0.226199f, 0.054154f, -0.167608f, 0.046836f, -0.013285f, -0.408119f, + -0.177973f, -0.248293f, -0.465830f, 0.035827f, -0.222208f, -0.221717f, + 0.066392f, -0.349769f, -0.428029f, -0.516692f, 0.022398f, -0.251682f, + 0.134746f, 0.011167f, -2.078787f, 0.173592f, -1.948348f, 0.330060f, + 1.993785f, -0.052859f, -0.004795f, -3.703177f, 0.013450f, -0.011687f, + 0.073079f, 0.034803f, 0.025515f, 0.005994f, 0.101731f, 0.074303f, + -0.109962f, -0.270825f, -0.068273f, -0.163268f, -0.252826f, 0.137190f, + 0.007667f, -0.358453f, 0.027412f, 0.033492f, 0.021197f, -0.049991f, + 0.104468f, -0.012157f, -0.056252f, -0.380756f, -0.338483f, 0.233235f, + -0.048631f, -0.441209f, -0.158482f, -0.148108f, -0.263453f, 0.138847f, + -0.304073f, -0.336312f, -0.017941f, -0.135563f, 0.075137f, -0.246475f, + -0.229144f, -0.087744f, -0.346909f, 0.172611f, 0.004377f, -0.009386f, + -0.023104f, 0.008000f, -0.029390f, -0.317842f, 0.549674f, -0.195337f, + -0.863979f, 0.160889f, -0.269014f, -0.442104f, -1.799191f, 1.396533f, + -0.112837f, 0.881303f, 0.000764f, -0.035415f, -0.141877f, 0.184831f, + -0.363566f, -0.178569f, 0.254134f, -0.326893f, 0.127325f, 0.310620f, + -0.384621f, 0.146058f, -0.287682f, -0.373447f, 0.026930f, 0.251650f, + 0.053817f, 0.227509f, 0.121396f, 0.396514f, -0.278381f, -0.038969f, + -1.538756f, -0.002856f, -0.892900f, 0.363426f, -1.257922f, 0.743795f, + 0.941177f, 0.219345f, 0.684189f, 1.396858f, 0.026299f, -0.093433f, + -0.066182f, 0.057868f, -0.089278f, -0.159680f, -0.262035f, -0.236656f, + 0.005349f, -0.031314f, 0.027917f, -0.182113f, -0.212086f, -0.160774f, + 0.051468f, 0.036787f, 0.183881f, -0.288205f, -0.349691f, 0.162511f, + 0.117878f, -0.294534f, -0.365037f, -0.246313f, 0.073977f, -0.072378f, + -0.173579f, -0.584560f, 0.547194f, 0.259853f, -0.405287f, -0.421146f, + 0.165788f, -0.146964f, 0.257415f, 0.772394f, -0.475302f, -0.310906f, + 0.058723f, 0.276833f, 0.586842f, 0.248998f, -0.061135f, 0.255779f, + 0.152158f, -0.024781f, 2.821834f, 1.365141f, 0.914744f, 0.165752f, + -1.048304f, -0.333891f, 1.804087f, -0.437028f, -0.120211f, -0.020443f, + 0.040077f, 0.258600f, -0.598893f, -0.494579f, -0.281054f, -0.517041f, + 0.005258f, 0.053986f, 0.322755f, 0.429495f, -1.992364f, -0.717192f, + -1.774802f, 2.047362f, -0.016194f, 0.312606f, 0.019331f, 0.060950f, + 0.116428f, 0.168458f, -0.307001f, -0.420734f, 0.475843f, 0.425346f, + -0.107119f, 0.049892f, -1.168619f, 0.010878f, 0.354872f, 0.902717f, + -0.391407f, 0.332772f, -1.335037f, -0.447100f, 0.481719f, -0.101069f, + -1.806565f, 0.925280f, 0.346999f, 0.093809f, 0.006275f, 0.270814f, + -0.691123f, 0.230748f, 0.137033f, 0.068228f, 1.555975f, -0.271637f, + -0.370403f, 0.236131f, 0.367464f, -0.136562f, 0.428838f, 
0.181750f, + 0.338762f, 0.292449f, -0.748204f, -0.922731f, -0.959445f, -0.806418f, + -0.140501f, 0.070525f, 1.248748f, 0.637990f, -1.307246f, -0.514055f, + 0.393858f, -1.858727f, 0.713591f, -0.141044f, 0.080723f, 0.120220f, + -0.031175f, 0.224488f, 0.753818f, -0.833351f, -1.099132f, 0.651100f, + -0.135061f, -0.043820f, 0.026983f, -0.059259f, 0.001345f, -0.281775f, + 0.006958f, 0.046103f, -0.246539f, 0.057630f, -0.360778f, -0.160681f, + -0.414870f, -0.301979f, 0.000683f, 0.132957f, -0.477609f, 0.106110f, + -0.637769f, -0.078374f, -0.229494f, 0.583108f, -0.822973f, -0.107540f, + 1.063426f, -0.268346f, 1.105787f, 2.587550f, -0.020314f, -0.002161f, + -0.063836f, -0.099990f, -0.103975f, -0.114078f, -0.094199f, -0.065181f, + -0.019870f, -0.018920f, -0.219732f, 0.035608f, -1.789450f, 0.483032f, + -0.464729f, 1.563277f, -1.054195f, 0.359991f, 0.065204f, 0.135623f, + 0.158380f, -0.103815f, -1.398726f, -1.436666f, -0.356311f, 0.507752f, +}; + +static const float av1_4_partition_nn_bias_32_layer0[32] = { + 0.421645f, -0.620548f, -0.187819f, -0.189414f, -0.204975f, -0.189600f, + -0.174917f, -0.651928f, -0.799655f, -0.086105f, -0.163449f, -0.089212f, + -0.214495f, -0.108500f, -0.065777f, -0.127704f, 1.544948f, -0.032831f, + -0.165621f, 0.145844f, -0.032104f, -0.453246f, -0.113444f, 0.321589f, + -0.862375f, -0.108826f, -0.486259f, 0.685325f, 0.072569f, -0.187961f, + 0.109579f, -0.082685f, +}; + +static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = { + 0.255012f, 0.658860f, 0.216907f, 0.165947f, 0.241182f, 0.340854f, + 0.409445f, 0.165220f, 0.553373f, -0.242385f, -0.209571f, 0.255515f, + 0.222500f, 0.037032f, 0.238590f, 0.061624f, -2.038693f, 0.264167f, + -0.230144f, 0.129952f, -0.027979f, 0.847761f, 0.438922f, 0.462323f, + 0.555345f, 0.030689f, 0.336357f, -0.357326f, -0.113137f, 0.272631f, + 0.421022f, 0.367776f, -0.197094f, 0.157117f, -0.015008f, -0.056123f, + -0.283913f, 0.186417f, 0.178561f, -0.763041f, 0.602038f, 0.341092f, + 0.320453f, -0.312776f, -0.371240f, -0.356279f, 0.220117f, -0.131871f, + 1.517429f, 0.162223f, -0.255069f, 0.451861f, 0.045071f, -0.223257f, + 0.003257f, 0.015734f, -0.630447f, -0.672588f, 0.670164f, 0.571031f, + -0.657948f, 0.034506f, -0.249076f, 0.790293f, 0.066491f, -0.131245f, + 0.355173f, 0.564622f, 0.374048f, 0.033974f, 0.253970f, 0.495498f, + -0.556321f, -0.104651f, 0.276947f, 0.057148f, -0.039126f, -0.170050f, + -0.141542f, 0.158541f, 0.582763f, -0.100992f, 0.096705f, -0.209029f, + 0.008449f, 0.255865f, 0.103565f, 0.317719f, 0.479499f, 0.599126f, + -0.065613f, -0.268614f, 0.508736f, 0.180813f, -0.815868f, 0.051238f, + 0.001223f, -0.305423f, -0.270079f, 0.036180f, 0.304342f, 0.202634f, + 0.218348f, -0.304304f, -0.438297f, 0.241123f, 0.200230f, 0.151804f, + 0.051944f, 0.160422f, -0.262981f, -0.417412f, 1.845729f, -0.086183f, + 0.403517f, 0.059667f, 0.564543f, -0.081752f, 0.114907f, -0.284489f, + -0.673943f, 0.056965f, 0.362221f, 0.403224f, -0.000233f, -0.209552f, + -0.800926f, -0.134132f, +}; + +static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = { + -0.019518f, + 0.198546f, + 0.339015f, + -0.261961f, +}; + +static const NN_CONFIG av1_4_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 32, // num_hidden_nodes + }, + { + av1_4_partition_nn_weights_32_layer0, + av1_4_partition_nn_weights_32_layer1, + }, + { + av1_4_partition_nn_bias_32_layer0, + av1_4_partition_nn_bias_32_layer1, + }, +}; + +static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 24] = { + 
-0.152649f, 0.074509f, 1.000136f, 0.601661f, -1.416694f, -1.932396f, + -1.163850f, 0.640931f, -0.888625f, -0.345711f, 0.161799f, 0.103165f, + 0.147513f, 0.089956f, 0.204329f, 0.196922f, 0.014927f, 0.283714f, + -0.110422f, 0.062005f, -0.531870f, -0.075287f, -0.448349f, -0.218881f, + -0.005592f, -0.130490f, -0.015779f, 0.093521f, -0.158487f, 0.072241f, + 0.066879f, -0.418566f, -0.206281f, 0.025634f, 0.048334f, -0.534750f, + 0.302081f, 0.028707f, -1.543248f, 0.103799f, -1.214052f, 0.395870f, + 0.394754f, -0.272170f, -0.702953f, -4.057464f, -0.033497f, -0.042142f, + 0.014742f, 0.065263f, 0.000879f, -0.019768f, 0.101275f, 0.163059f, + -0.371392f, -0.283484f, 0.241915f, 0.012684f, -0.210101f, -0.166534f, + -0.024894f, 0.274696f, 0.098993f, 0.104086f, 0.055044f, -0.289378f, + 0.146571f, -0.147441f, 0.004056f, 0.112244f, -0.416162f, -0.033176f, + -0.214836f, -0.213787f, 0.023197f, -0.339043f, 0.301109f, -0.408551f, + 0.284922f, -0.344418f, -0.039255f, 0.158748f, -0.344169f, 0.078286f, + -0.043957f, -0.302162f, -0.310826f, 0.063425f, 0.198166f, -0.285324f, + -0.108252f, 0.038992f, -1.053110f, -1.663290f, -0.417185f, 1.504443f, + 0.643206f, -0.850240f, 0.889641f, -0.733214f, 0.147302f, 0.060291f, + -0.052954f, 0.167453f, 0.111870f, 0.085471f, 0.035107f, 0.064361f, + 0.176053f, 0.184373f, 0.676576f, 0.066164f, 1.455569f, 0.925111f, + -0.640845f, 0.803795f, -0.653782f, -0.201038f, 0.060033f, 0.016964f, + -0.047590f, 0.045908f, 0.354162f, 0.014812f, 0.156978f, 0.058792f, + -0.238119f, 0.002450f, -0.094388f, -0.155229f, 0.194858f, -0.355429f, + -0.187098f, -0.119264f, -0.088694f, -0.102845f, 0.184905f, -0.425339f, + -0.157808f, -0.104599f, -0.393248f, -0.379842f, 0.027741f, -0.185816f, + -0.317294f, 0.002453f, -0.498241f, -0.204302f, -0.079093f, 0.020646f, + -0.412850f, -0.426039f, -0.177050f, -0.419304f, -0.064478f, -0.191802f, + -0.146812f, 0.171111f, 0.090261f, -0.367033f, -0.299051f, -0.322132f, + 0.428192f, -0.252613f, 0.488498f, -0.559682f, 0.486720f, -0.511084f, + 0.992506f, 0.346765f, -0.118697f, -0.065127f, -0.376612f, -0.345137f, + -0.426517f, -0.516836f, 0.307083f, 0.609362f, 0.369555f, 0.093775f, + -0.375664f, -0.221595f, -0.025465f, 0.134374f, -0.387031f, 0.096236f, + 0.337465f, -0.124029f, -0.157340f, -0.368790f, -0.104490f, -0.279507f, + -0.247705f, 0.146559f, -0.236206f, -0.036073f, 0.064206f, -0.330919f, + 0.516591f, -0.013492f, 1.269568f, 1.182530f, -0.455390f, -1.328091f, + -0.200950f, -0.380513f, -0.195532f, -0.341479f, 0.016064f, 0.021176f, + 0.169119f, 0.103707f, -0.174504f, -0.462719f, -0.079445f, -0.247128f, + 0.459111f, 0.036129f, 0.769570f, -0.080405f, 1.667107f, 0.355567f, + -2.433896f, 0.627572f, -0.600090f, -0.651872f, -0.059769f, -0.041945f, + -0.009933f, 0.014864f, -0.049378f, -0.041561f, 0.075180f, 0.138307f, + 0.122366f, -0.160756f, 0.215327f, 0.013572f, 0.198194f, -0.762650f, + 0.054466f, 1.110332f, 1.692853f, 0.658654f, -0.409549f, 0.506085f, + 0.330962f, -0.223008f, 0.007448f, -0.289062f, -0.476231f, -0.228359f, + 0.013977f, -0.000609f, -0.673604f, 0.275996f, 0.405291f, 1.693561f, + -1.079768f, 1.122516f, -0.203227f, 0.099265f, -0.165207f, -0.323899f, + -0.269973f, -0.080122f, 0.127700f, 0.190201f, 0.219527f, 0.306194f, + 0.026049f, -0.003779f, 1.107357f, 1.720315f, 1.017908f, 0.078664f, + -1.599813f, -0.482636f, -0.117450f, 0.122249f, 0.030220f, 0.039794f, + 0.176350f, 0.129715f, -0.305755f, -0.274044f, -0.299640f, -0.187335f, + -0.073616f, -0.564507f, -0.127758f, 0.044855f, -0.191090f, 0.039095f, + 0.115378f, 0.969352f, -0.088360f, 0.301443f, 0.065726f, -0.019740f, 
+ -0.102350f, -0.084913f, -0.194615f, 0.118582f, 0.920789f, -0.171615f, + -1.436553f, -0.026419f, -0.730864f, 0.615697f, -0.795079f, 0.119701f, + 0.601782f, 0.792902f, 0.184920f, 1.635090f, -0.085860f, -0.033187f, + -0.166883f, 0.008487f, -0.128300f, -0.089923f, -0.108781f, -0.133719f, + -0.011988f, -0.239816f, -0.092563f, -0.238471f, -0.339722f, 0.177432f, + -0.063101f, -0.121002f, 0.058072f, -0.031166f, 0.086413f, -0.016203f, + -0.305075f, -0.005420f, -0.168796f, 0.148745f, -0.116737f, -0.050222f, + -0.287952f, -0.290982f, -0.090449f, 0.076098f, -0.345632f, -0.061309f, + 0.142218f, 0.035692f, 0.304517f, -0.228031f, 0.119608f, -0.120350f, + 0.163404f, -0.105605f, -0.305462f, -0.176657f, 0.210070f, -0.227600f, + -0.081965f, -0.464027f, -0.053782f, -0.018367f, 0.119159f, 0.017162f, + -0.069792f, 0.305768f, -0.421095f, 0.187740f, -0.032059f, 0.575115f, + -0.064283f, -0.091828f, 0.772648f, -0.393189f, -0.297098f, 0.141420f, + 0.826389f, -0.071586f, -0.893968f, -0.346793f, -1.151655f, 0.039393f, + 1.546000f, -0.094029f, -0.005786f, -0.195764f, -0.169724f, -0.133167f, + -0.129312f, -0.418860f, -0.026553f, -0.053667f, -0.091976f, -0.106275f, + -0.492625f, 0.025350f, -0.332075f, -0.475638f, -0.076667f, -0.065779f, + 0.108957f, 0.246298f, -0.289007f, -0.442552f, -0.206692f, -0.257453f, + 0.073806f, -0.458606f, -0.410390f, -0.312674f, -0.144813f, 0.170128f, + 0.018810f, -0.098241f, 1.027369f, 0.479328f, 1.129707f, 0.484813f, + -0.085207f, 0.621873f, -0.520981f, 0.236175f, 0.273487f, 0.061426f, + 0.306085f, 0.161487f, 0.220991f, 0.223783f, -0.091826f, 0.391031f, +}; + +static const float av1_4_partition_nn_bias_64_layer0[24] = { + 0.580225f, -0.191304f, 1.091767f, -0.134522f, -0.089361f, 0.398750f, + -0.882708f, -0.213102f, -0.119981f, 0.378296f, -0.075719f, 0.426598f, + -2.015505f, 0.202534f, -1.044792f, -0.841519f, 0.266421f, -0.047115f, + -0.131147f, -0.075066f, -0.009441f, 0.853007f, -0.175606f, -0.868306f, +}; + +static const float av1_4_partition_nn_weights_64_layer1[24 * LABEL_SIZE] = { + -0.851937f, -0.211148f, -2.289513f, -0.275071f, 0.251340f, -0.340847f, + 0.498032f, 0.308652f, -0.051574f, 0.323146f, -0.097547f, -0.040269f, + 1.909655f, 0.098348f, 0.588136f, 0.568112f, 0.313297f, 0.920848f, + -0.014486f, 0.386014f, 0.029199f, -0.537330f, -0.021502f, 0.349073f, + -0.524715f, -0.351848f, 1.565454f, -0.297148f, 0.020177f, 0.648369f, + 0.027321f, -0.096052f, -0.363163f, -0.132642f, 0.024292f, -0.734176f, + -0.782700f, 0.408299f, 0.476945f, -0.489512f, -0.728318f, -0.632042f, + 0.405417f, 0.184086f, -0.400730f, 0.359032f, 0.019710f, -0.217409f, + 0.519159f, -0.136316f, 0.993592f, -0.147128f, 0.097495f, 0.426189f, + -0.295233f, 0.278799f, 0.080667f, -0.025052f, -0.307757f, 0.418716f, + -0.853388f, -0.374878f, -0.322725f, 0.696335f, -0.380649f, -0.160356f, + -0.140060f, 0.502455f, 0.656728f, -0.095023f, -0.184198f, -0.347069f, + 0.456372f, -0.029754f, 0.907923f, 0.265710f, -0.065505f, 0.226763f, + -0.277798f, 0.413292f, -0.593899f, -0.060740f, -0.313358f, -0.249944f, + -0.627329f, -0.327151f, -0.853788f, -1.163807f, -0.388944f, -0.228788f, + -0.057382f, 0.334741f, -0.283083f, 0.368280f, -0.407197f, -0.441849f, +}; + +static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = { + -0.478735f, + 0.292948f, + 0.293172f, + 0.040013f, +}; + +static const NN_CONFIG av1_4_partition_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 24, // num_hidden_nodes + }, + { + av1_4_partition_nn_weights_64_layer0, + 
av1_4_partition_nn_weights_64_layer1, + }, + { + av1_4_partition_nn_bias_64_layer0, + av1_4_partition_nn_bias_64_layer1, + }, +}; + +#undef FEATURE_SIZE +#undef LABEL_SIZE + +#define FEATURE_SIZE 4 +static const float + av1_partition_breakout_nn_weights_128_layer0[FEATURE_SIZE * 32] = { + -0.331785f, 0.068675f, -0.323814f, 0.033714f, -0.237835f, 0.166316f, + -0.498766f, -0.545634f, -0.266173f, -0.476957f, -0.120409f, -0.021042f, + 0.124056f, -0.278750f, -0.110120f, -0.372812f, 4.547939f, 0.097618f, + -0.002710f, -0.064169f, -1.841173f, -0.403833f, 0.005536f, 0.067188f, + -0.434935f, -0.227421f, -0.000011f, -0.139961f, -0.174056f, -0.652384f, + -0.000015f, -0.262847f, -3.319706f, -0.947693f, 0.002981f, 0.016717f, + -10.408850f, -0.014568f, -0.000018f, 0.019084f, 1.523383f, 0.074525f, + -0.002076f, -0.020734f, 4.881495f, 0.002799f, 0.000342f, -0.019623f, + 1.786154f, 0.037462f, -0.019037f, 0.052833f, 11.408153f, -0.044602f, + 0.026155f, -0.518627f, -0.474499f, -0.427430f, -0.442733f, -0.011116f, + -22.379410f, -0.000549f, -0.001418f, 0.008090f, -0.295090f, -0.230268f, + -0.337278f, -0.001127f, -0.644282f, -0.598783f, -0.539417f, -0.003303f, + 9.189824f, 0.038066f, -0.004097f, -0.460045f, -0.308858f, -0.242691f, + -0.230835f, -0.273057f, 0.152226f, 0.179239f, -0.146382f, -0.004655f, + -0.242940f, -0.718862f, -0.001685f, -0.214736f, 3.263186f, 0.079463f, + -0.003854f, -0.187461f, -0.599144f, -0.419808f, -0.000597f, -0.136980f, + 0.184813f, -0.319525f, -0.007246f, 0.079709f, -0.883229f, -0.343748f, + -0.000077f, -0.172214f, -0.548759f, -0.194674f, -0.144786f, 0.043896f, + -0.176364f, -0.248394f, -0.090215f, -0.294743f, -0.280980f, -0.181436f, + -0.115681f, -0.071915f, -13.035494f, -0.075623f, 0.017052f, -0.171152f, + 5.910803f, 0.128344f, 0.010256f, -1.073301f, 2.387826f, 0.166183f, + -0.007193f, -0.257836f, + }; + +static const float av1_partition_breakout_nn_bias_128_layer0[32] = { + 0.115591f, -0.100178f, -0.165523f, -0.122997f, 11.045759f, 1.034761f, + -0.323672f, -0.189087f, 2.850950f, 7.010029f, -21.447067f, 1.877031f, + 0.437442f, 5.929414f, -0.117274f, 4.462253f, -0.135198f, -0.145927f, + 8.727211f, 0.000000f, -3.532987f, -0.405898f, 11.364439f, -0.141728f, + -5.994947f, -0.362574f, 1.857687f, -0.100400f, -0.130312f, 0.006080f, + 0.429660f, -8.439470f, +}; + +static const float av1_partition_breakout_nn_weights_128_layer1[32] = { + -0.013738f, 0.022052f, -0.074437f, -0.211377f, -0.080433f, 0.015543f, + 0.002091f, 0.014252f, 0.134834f, 0.190263f, 0.244175f, -0.031747f, + 0.020068f, -0.068326f, 0.185471f, 0.660268f, -0.134898f, -0.010376f, + -0.276023f, -0.282921f, -0.022769f, 0.007070f, -0.186235f, 0.024407f, + -0.024837f, 0.005764f, 0.016599f, -0.040077f, 0.020990f, 0.095054f, + -0.039662f, 0.131499f, +}; + +static const float av1_partition_breakout_nn_bias_128_layer1[1] = { + 0.86678213f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_128 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_128_layer0, + av1_partition_breakout_nn_weights_128_layer1, + }, + { + av1_partition_breakout_nn_bias_128_layer0, + av1_partition_breakout_nn_bias_128_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_64_layer0[FEATURE_SIZE * 16] = { + 0.872892f, -0.235539f, -0.412159f, -0.142533f, -2.251479f, -0.057073f, + -0.001373f, 0.112147f, 5.281734f, 0.060704f, 0.000838f, -0.961554f, + 0.244995f, 0.154515f, -0.292654f, -0.167177f, -3.759112f, -0.486347f, + 0.003208f, 
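+/* The av1_partition_breakout_* nets above and below differ from the
+ * partition-type nets: num_outputs is 1, so each emits a single raw score
+ * for "stop splitting at this block size". A hedged usage sketch --
+ * nn_forward_sketch is the hypothetical helper from the earlier comment,
+ * and the logistic mapping is an assumption (thresholding the raw score
+ * directly is equivalent):
+ *
+ *   float score;
+ *   nn_forward_sketch(features, &av1_partition_breakout_nnconfig_64,
+ *                     &score);
+ *   const float p_break = 1.0f / (1.0f + expf(-score));  // needs <math.h>
+ */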
-0.418226f, 2.618152f, 0.026832f, 0.003988f, -0.404406f, + -0.405434f, 0.102791f, -0.033406f, -0.029820f, -4.492342f, -0.154291f, + 0.012947f, -0.195075f, 0.009311f, -0.411410f, -0.010986f, -0.554822f, + 0.160576f, 0.020796f, -0.457230f, -0.191111f, -7.759542f, -0.065039f, + -0.001322f, 0.055691f, 0.291924f, -0.053076f, -0.148379f, -0.298383f, + 1.022023f, -0.033668f, -0.000804f, -0.825778f, -3.902254f, -0.085812f, + -0.052520f, -0.035012f, -0.465468f, -0.319231f, -0.497529f, -0.183068f, + -2.407131f, -0.062304f, 0.000874f, 0.108786f, + }; + +static const float av1_partition_breakout_nn_bias_64_layer0[16] = { + 0.081425f, -14.404084f, 11.511393f, -0.930053f, 1.841889f, 15.020920f, + -1.872288f, 5.392535f, -0.329335f, -0.005358f, 12.600776f, 0.000000f, + -0.337413f, 4.492778f, 0.000000f, 17.043072f, +}; + +static const float av1_partition_breakout_nn_weights_64_layer1[16] = { + -0.465338f, -0.103023f, -0.174808f, -0.005156f, -0.016366f, -0.172494f, + 0.014185f, 0.067030f, -0.001939f, -0.175049f, 0.245992f, -0.181660f, + -0.038572f, 0.307899f, -0.294283f, 0.118323f, +}; + +static const float av1_partition_breakout_nn_bias_64_layer1[1] = { + -1.33438122f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_64_layer0, + av1_partition_breakout_nn_weights_64_layer1, + }, + { + av1_partition_breakout_nn_bias_64_layer0, + av1_partition_breakout_nn_bias_64_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_32_layer0[FEATURE_SIZE * 16] = { + -4.825528f, -0.145737f, 0.001907f, 0.145415f, -1.858153f, -0.080744f, + 0.000601f, 0.211991f, 0.384265f, -0.043945f, -0.521332f, -0.170622f, + -0.046866f, -0.600506f, -0.001216f, -0.332760f, -0.447677f, -0.605844f, + -0.121008f, -0.119936f, -0.215739f, -0.269665f, -0.668587f, 0.071318f, + -1.202551f, -0.729727f, -0.370084f, 0.088215f, -1.926800f, -0.086519f, + 0.000359f, 0.215120f, 0.718749f, 0.022942f, 0.003840f, -0.176518f, + 1.213451f, 0.080786f, 0.001557f, -1.053430f, 0.202698f, -0.583919f, + -0.535512f, -0.239927f, -0.110151f, -0.128832f, -0.441087f, -0.145575f, + -0.178518f, -0.585784f, 0.000029f, -0.833014f, -0.331358f, -0.520297f, + -0.088676f, -0.178487f, -1.430755f, 0.022981f, -0.106931f, 0.015573f, + -0.520814f, -0.045386f, -0.443123f, -0.484209f, + }; + +static const float av1_partition_breakout_nn_bias_32_layer0[16] = { + 11.747026f, -9.337718f, 0.341648f, -0.155847f, -0.104005f, 4.666283f, + 6.669584f, 16.625504f, 9.885626f, 15.439183f, -0.346080f, 0.000000f, + -0.423808f, 0.000000f, 6.352258f, -0.155787f, +}; + +static const float av1_partition_breakout_nn_weights_32_layer1[16] = { + 0.168561f, -0.122519f, 0.524667f, 0.032474f, 0.059097f, 0.011900f, + 0.166445f, 0.127256f, -0.034838f, -0.212586f, -0.317973f, 0.348419f, + -0.004171f, 0.157694f, 0.117845f, 0.272115f, +}; + +static const float av1_partition_breakout_nn_bias_32_layer1[1] = { + 0.09049262f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_32_layer0, + av1_partition_breakout_nn_weights_32_layer1, + }, + { + av1_partition_breakout_nn_bias_32_layer0, + av1_partition_breakout_nn_bias_32_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_16_layer0[FEATURE_SIZE * 16] = { + 0.209371f, 0.028758f, 0.005764f, 
-0.384401f, -0.625777f, -0.005647f, + -0.316867f, 0.042985f, 0.127344f, 0.025461f, 0.011465f, -0.071043f, + -0.295977f, -0.076093f, -0.209681f, -0.311653f, -0.147538f, 0.009910f, + -0.130997f, -0.012326f, 0.024124f, -0.323578f, -0.005790f, -0.085664f, + -1.575066f, -0.119221f, 0.015018f, 0.187204f, 0.238117f, 0.084924f, + -0.004444f, -1.271538f, -0.709860f, -0.006226f, -0.903111f, 0.090573f, + -0.278642f, -0.011114f, 0.021162f, 0.081290f, -0.467486f, -0.040771f, + -0.224069f, -0.714390f, -0.281905f, -0.001336f, -0.761212f, -0.060385f, + -0.814479f, -0.050450f, -0.003666f, 0.085668f, -0.272589f, 0.057330f, + -0.206540f, -0.303418f, 0.075335f, -0.180468f, -0.064872f, -0.755948f, + -0.509287f, -0.048877f, -0.001512f, 0.077086f, + }; + +static const float av1_partition_breakout_nn_bias_16_layer0[16] = { + 16.421495f, 4.012273f, -1.828571f, 0.000000f, -0.263564f, -0.201972f, + 6.564987f, 14.651000f, -3.227779f, 2.241833f, -0.137116f, 0.762876f, + 5.625762f, 0.615822f, 0.040057f, 16.668884f, +}; + +static const float av1_partition_breakout_nn_weights_16_layer1[16] = { + -0.096440f, 0.184316f, -0.021148f, 0.424974f, 0.003743f, 0.006310f, + 0.046266f, -0.219224f, -0.087004f, 0.024623f, -0.275798f, 0.120164f, + 0.269773f, -0.021105f, -0.146698f, 0.188764f, +}; + +static const float av1_partition_breakout_nn_bias_16_layer1[1] = { + 1.60751927f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_16_layer0, + av1_partition_breakout_nn_weights_16_layer1, + }, + { + av1_partition_breakout_nn_bias_16_layer0, + av1_partition_breakout_nn_bias_16_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_8_layer0[FEATURE_SIZE * 16] = { + -0.255885f, 0.109548f, -0.111054f, -0.476119f, -1.083031f, -0.342003f, + 0.048241f, -0.356013f, -0.085054f, 0.124908f, 0.000084f, -0.149906f, + -0.729829f, 0.133535f, -0.002125f, 0.207516f, -0.210163f, -0.567365f, + -0.590103f, 0.045308f, -0.539406f, 0.130550f, -0.663879f, -0.170549f, + 0.017587f, -0.054187f, 0.000550f, 0.038297f, -0.112891f, -0.012751f, + -0.048067f, 0.095564f, 0.079892f, 0.077285f, -0.749708f, -0.286312f, + -0.054334f, 0.132242f, -0.004152f, -0.209758f, -0.073407f, 0.082306f, + -0.001034f, -0.090990f, 0.122823f, -0.109794f, -0.230066f, -0.391155f, + -0.262245f, -0.004744f, -0.232246f, 0.099290f, -0.637484f, 0.111937f, + -0.548556f, -0.598344f, 0.123265f, -0.281395f, -0.399711f, -0.525671f, + -0.596269f, 0.098494f, -0.005765f, 0.173652f, + }; + +static const float av1_partition_breakout_nn_bias_8_layer0[16] = { + 0.194141f, -0.111223f, 2.503733f, -7.155602f, -0.695068f, 0.114874f, + 2.056990f, 5.284306f, 0.639643f, -2.792049f, -2.232339f, -0.232209f, + 2.336705f, -0.278834f, 0.231905f, 7.954366f, +}; + +static const float av1_partition_breakout_nn_weights_8_layer1[16] = { + -0.014439f, 0.010171f, 0.048116f, -0.090659f, -0.081235f, -0.021840f, + -0.017360f, 0.031063f, -0.031737f, -0.023439f, -0.037725f, 0.021954f, + 0.055858f, 0.230970f, -0.056466f, 0.119780f, +}; + +static const float av1_partition_breakout_nn_bias_8_layer1[1] = { + 1.27784479f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_8 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_8_layer0, + av1_partition_breakout_nn_weights_8_layer1, + }, + { + av1_partition_breakout_nn_bias_8_layer0, + 
av1_partition_breakout_nn_bias_8_layer1, + }, +}; +#undef FEATURE_SIZE + +#define FEATURE_SIZE 9 // Input layer size +#define NUM_NODES 32 // Hidden layer size +#define LABEL_SIZE 3 // Output layer size + +static const float av1_rect_partition_nn_weights_8_layer0[FEATURE_SIZE * + NUM_NODES] = { + 0.22151f, 0.99424f, 0.23415f, -1.13841f, -0.11277f, 0.09530f, 0.14769f, + -1.18895f, -0.96640f, -0.21421f, -0.13974f, 0.03236f, 0.15777f, -0.03176f, + 0.02729f, -0.37344f, -0.01727f, -0.05469f, 0.19402f, -3.45508f, 0.90106f, + -2.91557f, 0.19379f, 0.14356f, -0.13291f, 0.05734f, -0.03032f, -0.13060f, + 0.35744f, 1.31630f, -1.54493f, -0.20749f, -0.24413f, -0.04524f, -0.12400f, + 1.08305f, -0.21596f, 0.76244f, 1.10616f, -1.71706f, 0.05768f, 0.10966f, + 0.00949f, -0.12680f, 0.00699f, -0.11522f, -0.38566f, 0.34283f, -0.35266f, + -0.40643f, -0.22462f, 0.32300f, -0.39737f, -0.20587f, -0.16096f, 1.07543f, + 0.30314f, -1.35659f, -0.38212f, 0.45857f, 0.76615f, 0.16819f, -1.24459f, + 0.39677f, 0.87436f, -2.33757f, 1.27471f, 0.27488f, 0.01019f, -0.01221f, + -0.07461f, -0.14577f, -0.01231f, -0.64426f, -1.02733f, -1.96242f, 0.95143f, + -0.06777f, -1.13868f, 0.01354f, -0.75590f, -0.78222f, -0.07453f, 0.61788f, + 0.56899f, 1.17144f, 0.70899f, 0.48568f, 0.11266f, 0.81579f, -0.03929f, + 0.01088f, 0.33599f, -0.22401f, -0.49654f, -0.02598f, 0.04509f, -0.08217f, + -0.30687f, 0.19851f, -2.96860f, -2.30698f, 0.01848f, 0.11801f, 0.06614f, + 0.01673f, -0.11002f, -0.08168f, 0.09204f, -0.06379f, 0.27972f, -0.31716f, + -0.00566f, -0.13651f, -0.37276f, 0.01511f, -0.23697f, 0.21696f, -0.19480f, + 0.60758f, -0.43506f, -0.02247f, -1.45073f, 0.84442f, -0.94018f, 0.32550f, + 0.03985f, -0.06581f, 0.21665f, 0.79472f, -2.41080f, 0.04788f, -0.09492f, + -0.10677f, 0.07250f, 0.14329f, -0.37319f, 0.53043f, -0.49108f, 0.25792f, + -0.36569f, -0.28669f, -0.18416f, -0.52385f, -1.17081f, -1.32153f, -1.13403f, + -0.26196f, 0.93379f, 0.72115f, 0.54464f, 0.27642f, 0.04757f, 2.01629f, + 1.55787f, -0.11665f, 1.00722f, -0.24352f, 0.53308f, 0.57719f, 0.39344f, + 0.19174f, 0.06339f, -0.02530f, 0.07724f, -0.32416f, -0.26992f, -0.35887f, + -0.35285f, -0.33379f, -0.37475f, -0.77335f, 1.70027f, -1.52153f, -0.26503f, + 0.97552f, -2.96705f, -0.91220f, -0.11827f, 0.00406f, -0.14514f, 0.18417f, + -0.20874f, 0.27293f, -0.34072f, -0.34838f, -0.19054f, -0.29806f, -0.27960f, + -0.19293f, -0.18275f, -0.05902f, 0.58625f, -0.05470f, -0.48814f, -0.45382f, + -0.05959f, 2.01250f, -0.30014f, 0.69546f, -1.24180f, 1.34923f, 0.20337f, + 0.16850f, 0.07187f, 0.72630f, -0.15380f, -2.40973f, -2.73561f, -1.71375f, + -1.61695f, 0.50052f, 0.09730f, 0.00579f, 0.06133f, -0.06512f, -0.61439f, + -1.16173f, -0.58716f, 1.60438f, 0.23242f, 0.91847f, 0.49041f, -0.16277f, + -0.02574f, -0.64593f, 1.17028f, 0.46852f, 0.14926f, 0.73853f, -0.78521f, + 0.05959f, -0.35590f, 0.02039f, 0.10812f, -0.28650f, 1.34038f, -0.72188f, + 0.62385f, -0.35271f, -0.39599f, 0.41543f, 0.53124f, -0.23510f, -0.15480f, + -0.05066f, -0.33529f, 0.05238f, -0.35311f, -0.26983f, -0.39764f, 0.01085f, + 0.26593f, -0.18411f, -0.29945f, 0.50090f, -0.03397f, 0.78562f, -0.33068f, + 1.21308f, -2.23273f, -0.33366f, -0.15164f, -1.13270f, 0.17394f, 0.65567f, + 0.76496f, 0.44325f, 0.01368f, -0.33619f, -0.64256f, 0.64478f, 0.84553f, + 1.74183f, 0.22563f, -0.14550f, -0.16258f, 0.03010f, 0.49922f, 0.64575f, + -0.29187f, -0.10348f, -1.43619f, -0.56540f, -0.14779f, 0.04616f, 0.87411f, + -1.08228f, +}; + +static const float av1_rect_partition_nn_bias_8_layer0[NUM_NODES] = { + 0.33919f, -0.03003f, 0.79073f, -0.18508f, 0.00668f, 
-0.12017f, 0.35362f, + -0.51642f, 0.06536f, 0.41668f, -0.06509f, 0.94606f, -0.15385f, 0.14936f, + 1.46274f, -0.06961f, 2.82537f, -1.95576f, -0.09457f, 0.02042f, -0.07480f, + -0.55083f, 0.26170f, 4.39883f, 0.33999f, -0.10502f, 0.70884f, -0.06992f, + -0.22638f, 1.40940f, -0.09309f, 0.05828f, +}; + +static const float av1_rect_partition_nn_weights_8_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.09209f, 0.26236f, 0.62136f, 0.76324f, -1.14678f, 0.42289f, -0.08895f, + -0.97267f, 2.05958f, 0.00843f, 0.35335f, 1.12096f, -0.11679f, 0.07350f, + -1.23231f, -0.61990f, 1.51379f, -1.99450f, 0.22441f, 2.41974f, -0.30488f, + -0.37869f, 0.47168f, -3.70132f, 0.00061f, 0.19432f, 0.11512f, 0.26200f, + -0.35285f, 0.37985f, 0.90571f, 0.27344f, 0.74840f, -0.17965f, -2.51433f, + 0.59235f, 1.16670f, -0.53446f, 0.67897f, 0.04505f, -0.86874f, 0.45361f, + -0.35033f, 1.21283f, 0.31426f, -0.20841f, 0.56757f, 0.45909f, -1.23683f, + 0.09835f, -0.17214f, -0.96323f, 0.01138f, -0.50233f, 0.30104f, 2.01814f, + 1.15821f, -0.11947f, 0.74574f, -0.30714f, -0.39646f, -1.30086f, -0.88541f, + -0.12259f, -0.54977f, 0.30069f, 1.84299f, -0.95141f, -0.65887f, -0.25888f, + -0.63265f, 1.29531f, -0.56672f, 0.10837f, -0.21297f, -2.19131f, 0.01156f, + 0.51912f, 0.46704f, 0.42810f, -0.59271f, 0.98469f, -0.17914f, -1.91163f, + -0.32807f, 0.48199f, -0.99525f, 1.67108f, -0.87631f, -0.60258f, -0.78731f, + -0.32877f, 0.44237f, 0.01087f, 0.07489f, -0.28224f, +}; + +static const float av1_rect_partition_nn_bias_8_layer1[LABEL_SIZE] = { + 1.70665f, + -0.77954f, + -0.92709f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_8 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_8_layer0, + av1_rect_partition_nn_weights_8_layer1 }, + { av1_rect_partition_nn_bias_8_layer0, av1_rect_partition_nn_bias_8_layer1 } +}; + +static const float av1_rect_partition_nn_weights_16_layer0[FEATURE_SIZE * + NUM_NODES] = { + -0.18480f, -0.05410f, -0.18957f, 0.15451f, -0.38649f, -0.26162f, -0.22727f, + -0.38555f, -0.36738f, 0.74384f, -1.85999f, 0.98491f, -0.72119f, 1.77321f, + 0.39983f, 0.96314f, 0.23695f, 0.30200f, 0.30629f, -0.47617f, -1.43320f, + -1.81730f, 0.36554f, -0.07142f, -1.27242f, -1.27697f, 0.00110f, -0.32179f, + 0.27460f, 0.45428f, 0.15308f, -0.73906f, -0.28577f, -0.01238f, -0.16958f, + -0.85390f, 1.05484f, -1.62812f, 0.77632f, -0.27327f, -0.32527f, 0.32726f, + 1.73255f, 0.53763f, 0.59121f, -0.39068f, -0.32451f, -0.31869f, 0.17777f, + 0.07519f, -0.18066f, -0.11250f, -0.14616f, -0.16882f, -0.04099f, -0.67959f, + 0.39674f, -0.08596f, 0.18587f, -2.04097f, -1.73993f, 1.57212f, 1.42410f, + -1.36762f, -0.41485f, -1.12103f, 0.56959f, 0.11500f, 0.48945f, -0.13585f, + 1.22125f, 0.67071f, -1.11812f, -0.20660f, -0.52856f, 0.70663f, 0.74382f, + 0.61114f, -0.11454f, 1.14687f, 0.80322f, -0.45965f, -0.44466f, -0.05830f, + 0.13206f, -0.53750f, -0.11324f, -0.37971f, -0.13491f, -0.21268f, 1.93407f, + 1.34433f, 2.49427f, 2.91955f, 1.71730f, 0.03295f, 0.03587f, -0.14550f, + 0.08189f, -0.38655f, -0.35432f, -0.62706f, -0.01849f, -0.57882f, -0.60438f, + -1.01334f, -0.57302f, 0.22592f, 0.05916f, -0.05305f, -0.89824f, -0.52969f, + -0.24542f, 0.27029f, -0.40924f, -0.82452f, -0.60665f, -5.03025f, 0.83302f, + 1.83695f, 2.19716f, 2.31001f, 0.03657f, 0.00063f, -0.04379f, 0.05835f, + -0.08623f, 0.20557f, -0.17791f, 0.07874f, -0.25456f, -0.19513f, -0.27753f, + -0.31982f, 0.00245f, -0.33183f, 0.26059f, -0.22165f, 0.37582f, -0.30411f, + -0.22639f, -0.14739f, -0.20201f, 
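+/* The av1_rect_partition_* nets emit LABEL_SIZE == 3 scores; the label
+ * semantics live in the calling code, not in these tables. Turning the
+ * three raw scores into probabilities is typically done with a softmax
+ * (libaom provides av1_nn_softmax for this). A minimal, numerically
+ * stable sketch; softmax3_sketch is an illustrative name:
+ *
+ *   static void softmax3_sketch(const float score[3], float prob[3]) {
+ *     float m = score[0], sum = 0.0f;
+ *     for (int i = 1; i < 3; ++i) m = score[i] > m ? score[i] : m;
+ *     for (int i = 0; i < 3; ++i) sum += prob[i] = expf(score[i] - m);
+ *     for (int i = 0; i < 3; ++i) prob[i] /= sum;  // sums to 1
+ *   }
+ */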
-0.37507f, -1.30653f, 0.49570f, 1.03673f, + 0.66139f, 0.44941f, -0.44461f, -0.50376f, -0.49664f, 0.18608f, -0.26175f, + 0.14844f, 0.78715f, -0.70344f, -0.87624f, -0.98535f, -0.35346f, 0.37094f, + -0.43135f, -0.22571f, 3.46263f, 3.13580f, -1.33203f, -0.15247f, -0.15866f, + -0.11214f, 0.12211f, 0.03964f, -1.87597f, -4.81597f, -4.80195f, -4.98096f, + -5.62336f, -0.05337f, -0.00943f, 0.00792f, 0.02742f, 1.05679f, 2.41455f, + 0.85382f, 1.42504f, 0.58096f, 0.21443f, 1.02694f, 1.06746f, 1.20242f, + 0.60767f, 1.98667f, -0.80879f, -0.63495f, 1.95508f, 0.23952f, -0.15019f, + -0.16097f, 0.30155f, -3.42407f, -1.34998f, 9.07689f, -2.22559f, 2.22562f, + -0.03348f, -0.05229f, 0.05931f, 0.03042f, -0.18068f, -0.05732f, -0.33010f, + -0.32279f, -0.26607f, -0.02723f, -0.04067f, 0.08700f, -0.16366f, -0.24935f, + -0.69124f, 0.58508f, 0.50654f, 0.04492f, 1.38340f, -1.51487f, 1.72889f, + -1.95618f, -3.65013f, -1.38525f, -3.05516f, -2.40448f, 2.47467f, 0.03784f, + 0.08052f, -0.01971f, -0.08918f, -0.84997f, -0.55302f, -1.07861f, -0.62626f, + 0.61751f, -0.11012f, -0.24185f, -0.39201f, -1.85390f, -0.31261f, -0.11927f, + 0.15671f, -0.23450f, -0.14916f, -0.31715f, -0.19350f, 0.01795f, -0.11533f, + -0.05799f, -0.03142f, 0.20218f, -0.39499f, -0.33859f, -0.13201f, -0.19527f, + -0.28459f, -0.20346f, 0.89457f, -2.22103f, -2.37455f, -2.00221f, 2.44553f, + 0.33915f, 0.50047f, -0.34625f, -0.19667f, -0.56333f, -0.84328f, 1.25767f, + -1.70297f, 1.00482f, -0.00103f, -1.40813f, 0.21311f, 0.39230f, -0.07302f, + -3.49100f, 1.60675f, -2.90692f, 0.11022f, 0.13507f, -0.13308f, 0.15201f, + -0.05573f, +}; + +static const float av1_rect_partition_nn_bias_16_layer0[NUM_NODES] = { + -0.16783f, -0.16023f, 0.52215f, -0.04109f, 2.00122f, -0.11633f, 0.25535f, + 1.80638f, 1.69273f, -0.25998f, -6.83550f, -0.79682f, -1.03466f, 1.42721f, + 0.00000f, -0.00000f, -0.11665f, -0.12047f, -1.01497f, 7.27181f, -0.78548f, + -1.39335f, -5.42248f, -0.10388f, 0.07634f, 2.81012f, -0.57429f, -0.15629f, + -0.12044f, 1.65478f, -0.75153f, 1.18441f, +}; + +static const float av1_rect_partition_nn_weights_16_layer1[NUM_NODES * + LABEL_SIZE] = { + -0.26407f, 0.06322f, 0.87932f, 0.17772f, 0.71686f, -0.12283f, 0.08454f, + 0.20098f, -0.31763f, -0.33178f, -4.59535f, -0.04367f, 0.17099f, 3.80486f, + 0.16750f, 0.29218f, 0.57234f, -0.96550f, -0.10599f, -4.91130f, -0.14658f, + 0.95803f, -4.13925f, 0.24567f, 0.25708f, 1.60547f, -1.03251f, -0.31053f, + -0.05659f, -0.94121f, -0.68926f, -0.24738f, -0.38019f, 0.98950f, 0.13689f, + 0.24504f, 0.49623f, 0.19980f, 0.38349f, 0.37481f, 0.54540f, -0.02198f, + 3.43385f, 1.02543f, -0.40921f, -3.07235f, 0.02996f, 0.00323f, -0.35414f, + 0.71099f, 1.39334f, 2.43741f, -1.11007f, -0.22739f, -4.21757f, 0.11905f, + 0.00353f, -1.69637f, 0.45944f, -0.19884f, 0.03624f, 0.25729f, 0.23659f, + -2.08405f, 0.08573f, -0.53393f, -1.28103f, -0.53970f, -0.65465f, 0.31821f, + -0.09884f, -0.69026f, -0.37284f, 0.04622f, 1.32973f, -0.15414f, 0.19138f, + -0.67927f, -0.17658f, 0.36008f, -0.51832f, 0.09887f, -1.94414f, 2.95227f, + 1.76937f, -0.26687f, 8.50976f, 0.26247f, 0.60262f, -0.27910f, 0.30061f, + -0.05117f, 0.16018f, 0.71195f, 0.57871f, 1.57794f, +}; + +static const float av1_rect_partition_nn_bias_16_layer1[3] = { + 2.68750f, + -1.31894f, + -1.36768f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_16_layer0, + av1_rect_partition_nn_weights_16_layer1 }, + { 
av1_rect_partition_nn_bias_16_layer0, av1_rect_partition_nn_bias_16_layer1 } +}; + +static const float av1_rect_partition_nn_weights_32_layer0[FEATURE_SIZE * + NUM_NODES] = { + -0.54654f, -0.43537f, -0.10620f, -0.48051f, -0.43543f, -0.22737f, -0.15429f, + -0.09858f, -0.09438f, 0.37306f, 0.23934f, -1.86375f, -1.18307f, -0.32995f, + -0.09745f, 0.05431f, -0.13799f, 0.14734f, -0.33219f, 0.18057f, -0.23792f, + -0.28126f, 0.02977f, -0.07431f, 0.07860f, 0.00067f, -0.01927f, 1.01841f, + -0.57739f, 0.08412f, -1.33843f, -1.05563f, -0.28693f, -0.39425f, -0.69572f, + -0.16703f, 0.02808f, 0.11994f, -0.26267f, 0.19706f, -0.29707f, -0.25305f, + -0.07050f, -0.02704f, -0.31528f, -0.42301f, 0.22496f, -0.37001f, -0.23319f, + -0.11139f, -0.30513f, 0.04213f, -0.12550f, 0.02504f, 0.33245f, 0.01102f, + -0.35950f, -0.05949f, -0.19590f, -0.27457f, -0.28339f, -0.15676f, -0.21538f, + 0.65066f, 0.28443f, -1.24943f, -3.00246f, -1.01897f, 0.09304f, 0.70052f, + -0.12877f, 0.21120f, -0.37476f, 0.23261f, -0.28401f, 0.09837f, 0.00020f, + -0.12106f, -0.32354f, -0.02472f, -0.19772f, 1.01886f, 0.16596f, -0.06532f, + 1.72938f, 1.57754f, 0.55963f, 0.33246f, -0.20023f, 0.30715f, 0.08629f, + 0.18945f, -0.45988f, -1.22610f, -0.05152f, -0.48859f, -1.02104f, -0.27315f, + -0.57698f, 0.04157f, -0.92428f, -1.31268f, 1.78210f, 0.10291f, 1.55042f, + -1.26793f, 1.39042f, -1.43729f, 0.25600f, 5.21263f, 5.31955f, 5.19316f, + 5.43430f, 0.00294f, -0.00970f, -0.02333f, 0.00250f, 1.17672f, 6.27544f, + 4.95973f, 3.54009f, 4.51269f, 0.30750f, 0.78780f, -0.44741f, -0.76442f, + 0.75050f, 0.58799f, 0.03400f, -2.09859f, 1.67313f, 0.12503f, 0.28609f, + 1.15809f, 2.46530f, -0.04898f, 0.23072f, -0.12635f, -0.82097f, -0.63827f, + 2.16779f, 1.77132f, 0.15434f, -1.06427f, 0.06206f, -0.87732f, -0.61897f, + -0.44593f, -0.77131f, -0.15979f, -0.02282f, -0.74381f, 0.66052f, -0.22992f, + 1.74638f, 1.29199f, -0.55464f, 0.98316f, 0.06665f, 0.50254f, -0.66292f, + 0.17113f, -0.32633f, -1.85803f, -0.92759f, 4.44965f, 1.33057f, 0.02135f, + -0.27446f, -0.26018f, -0.12613f, -0.14470f, -0.23355f, -0.09717f, -0.24123f, + -0.05535f, -0.19146f, -0.36222f, -0.30458f, -0.40323f, 0.21779f, 0.14248f, + -0.48630f, 0.18840f, 0.11040f, 0.17287f, -0.51880f, 1.12466f, -0.38888f, + -0.16421f, -0.31784f, -0.36112f, -0.25386f, -0.01636f, 0.10029f, -0.26881f, + -0.17051f, -0.30903f, -0.08573f, -0.28774f, -0.01173f, -0.09706f, -0.23089f, + -0.12922f, -0.17463f, -0.12433f, -0.23074f, 0.15220f, 1.29826f, 0.23788f, + 0.04189f, 2.66416f, 0.48815f, -0.06803f, 0.96742f, 1.27165f, -0.70348f, + -0.09941f, -0.42948f, -0.20243f, -0.02364f, -0.26689f, -0.40629f, -0.68217f, + -0.48073f, 2.43657f, -2.60191f, -1.82837f, 0.50440f, 0.71829f, 0.76491f, + 0.28293f, 0.20568f, 0.92642f, -0.02496f, 1.43637f, -0.24474f, -1.21030f, + 0.54084f, 1.05130f, 1.29572f, 0.03750f, -0.36894f, 0.74548f, -1.33857f, + -0.84858f, 1.35230f, 0.80175f, 0.66136f, 1.06473f, 0.18701f, 1.42413f, + 0.04661f, -0.07820f, 0.64990f, -0.43595f, 1.18304f, -0.11437f, -0.06365f, + 0.03558f, 0.78260f, -1.74890f, 1.56217f, -1.23424f, 4.59193f, -3.35072f, + 0.01180f, -0.18296f, -0.20870f, 0.04510f, 1.52595f, -1.37402f, -0.33123f, + -0.85957f, 0.80598f, 0.03743f, 0.02354f, 0.37707f, 1.62095f, -0.29627f, + -0.31778f, -0.45789f, -0.14906f, 0.25315f, -0.10817f, -0.32610f, -0.40890f, + 0.33984f, +}; + +static const float av1_rect_partition_nn_bias_32_layer0[NUM_NODES] = { + -0.17482f, 0.39042f, 0.00000f, 1.69677f, 0.08792f, -0.09301f, 0.13809f, + 4.84061f, 0.00000f, 0.40515f, 0.46246f, 0.20644f, -5.77478f, -1.54510f, + 0.05660f, -0.32013f, 
0.23649f, 0.03778f, -2.53710f, -0.27869f, 0.45623f, + -0.04155f, -0.18445f, -0.73405f, -0.50243f, 2.23191f, 1.93272f, -1.07032f, + -0.27602f, -1.98063f, 0.20816f, -0.01315f, +}; + +static const float av1_rect_partition_nn_weights_32_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.02827f, 1.02560f, -0.07137f, -0.31911f, 0.11365f, 0.13684f, -0.07816f, + -5.23036f, -0.34340f, 0.84526f, -1.51845f, 0.07017f, -8.12570f, 6.24061f, + 0.35739f, -0.09937f, -0.30978f, 0.22032f, 0.74968f, -0.34557f, 0.45547f, + -0.16512f, 0.07118f, 1.66415f, 0.41320f, -1.81533f, -1.96004f, 1.04666f, + 0.84049f, 4.31009f, 0.68850f, 0.26322f, -0.24634f, -1.25889f, 0.31952f, + 0.63632f, 0.05801f, -0.10664f, -0.21992f, 2.44386f, 0.19526f, -0.09838f, + 1.53049f, -0.26630f, 3.54126f, -3.40574f, 0.72730f, 0.04557f, 0.92652f, + 0.15522f, 2.35895f, -0.13347f, 0.56907f, 0.15352f, 0.01823f, -0.73939f, + 0.43104f, 1.90321f, 0.31267f, -0.51972f, 0.50094f, -3.98372f, -3.41518f, + -0.48183f, 0.26661f, 0.64146f, 0.14500f, -0.01695f, 0.16653f, -0.37846f, + 0.08412f, 2.69714f, -0.20258f, -0.75786f, 0.11201f, 0.61878f, 4.22231f, + -3.55330f, -1.14137f, -0.37722f, -0.28000f, -0.72581f, -2.62827f, -0.19448f, + -0.59398f, -0.30136f, -0.17725f, -0.69630f, -0.41132f, 0.12208f, 2.11441f, + -1.08794f, -1.41694f, 0.02620f, 2.18792f, 0.04271f, +}; + +static const float av1_rect_partition_nn_bias_32_layer1[3] = { + 2.47332f, + -1.65756f, + -0.81573f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_32_layer0, + av1_rect_partition_nn_weights_32_layer1 }, + { av1_rect_partition_nn_bias_32_layer0, av1_rect_partition_nn_bias_32_layer1 } +}; + +static const float av1_rect_partition_nn_weights_64_layer0[FEATURE_SIZE * + NUM_NODES] = { + 0.08972f, 4.09095f, -0.31398f, -2.43631f, -0.74767f, 1.42471f, 1.60926f, + 1.44721f, 1.88259f, 2.35375f, 1.88299f, 2.01109f, 0.98679f, 2.24131f, + 0.06279f, -0.08315f, 0.32107f, 0.91334f, -0.36569f, 5.55049f, 5.44943f, + 5.20471f, 5.39099f, -0.01943f, -0.00284f, 0.02203f, -0.01309f, 1.41917f, + 6.68460f, -6.15986f, 6.41341f, -3.20630f, -0.00567f, -0.00038f, 0.05960f, + 0.04308f, 0.95366f, 3.48535f, 2.98266f, 4.11784f, 3.44255f, 0.61630f, + 0.71405f, 0.63945f, -0.00713f, 0.39193f, 1.91621f, 3.32755f, 0.71674f, + -0.11647f, 2.07090f, 2.64191f, 0.07949f, -0.05023f, 0.99935f, 0.83145f, + 0.75898f, -0.98764f, -0.58731f, 1.21734f, -0.08076f, -3.26780f, 1.66278f, + 0.04189f, -0.33177f, -1.58648f, 1.00883f, -0.56132f, -2.34877f, 0.67056f, + -2.32297f, -0.91641f, -1.02909f, 4.19781f, 3.87484f, 4.32778f, -1.97171f, + -0.24734f, 0.00822f, 0.05892f, 0.12697f, -3.62915f, -2.93127f, 7.94856f, + -3.29311f, 3.26001f, -0.02231f, 0.02741f, 0.05919f, 0.08190f, -1.49344f, + -0.64475f, -0.24627f, 4.03324f, -1.14799f, -0.18465f, -0.17829f, 0.10394f, + 0.08580f, -5.74721f, 4.42467f, 3.63964f, 3.00258f, -1.22744f, -0.29408f, + 0.00767f, 0.12305f, 0.05249f, -0.17166f, -0.20120f, -0.32941f, -0.31901f, + 0.04628f, -0.35249f, -0.18272f, 0.03956f, -0.19329f, -0.33564f, 0.09856f, + -0.00173f, -0.31751f, -0.05702f, -0.20558f, -0.31464f, -0.02488f, -0.00729f, + -0.35854f, -0.14762f, -0.34897f, -0.12746f, 0.04011f, -0.24918f, -0.53516f, + -0.28440f, -0.36789f, -1.34889f, -9.10044f, -9.19238f, 4.48042f, 6.54429f, + -0.00226f, 0.00430f, 0.00321f, 0.00442f, 0.87551f, -0.16224f, -0.22832f, + -0.60640f, -0.28738f, 0.18062f, 0.22008f, -0.47406f, 0.80302f, 0.12149f, + 1.49530f, 1.05069f, 
-2.02985f, -0.92833f, 0.25616f, 0.12852f, 3.51840f, + 0.25226f, -2.63283f, -4.04386f, 8.46300f, -2.93408f, 0.44069f, 0.08276f, + 0.34482f, -0.22615f, 0.28666f, 3.02962f, -1.20055f, -1.04832f, -0.97632f, + -0.99530f, 1.44196f, 1.68550f, 0.49360f, 1.08155f, -0.26059f, -0.02876f, + -0.27492f, -0.06205f, -0.09496f, -0.12314f, -0.30228f, -0.07453f, -0.38857f, + 1.17443f, 2.41497f, 1.90537f, 2.37716f, 2.91495f, -0.44455f, -0.51176f, + 0.48195f, 0.53032f, 0.23696f, -1.06211f, 1.47459f, -0.89029f, 0.29521f, + 0.66291f, -0.42653f, 1.82308f, -1.30372f, -0.36192f, -3.40388f, -1.61476f, + -2.29745f, -0.66886f, -2.08252f, -0.54552f, -4.06849f, 0.02948f, 0.27297f, + -4.81472f, 4.60404f, -0.11053f, 0.14765f, 0.02826f, -0.14688f, -0.07066f, + -0.01224f, 1.20377f, 7.02725f, -6.02627f, 6.87255f, -3.14257f, 0.01074f, + 0.02397f, -0.02359f, 0.01901f, 0.14956f, -1.67671f, 2.26714f, 2.57043f, + -0.45888f, -1.60265f, -2.11475f, -2.74029f, -2.74658f, -0.35630f, -2.63013f, + -2.14814f, -0.67266f, -1.56850f, 0.57137f, -1.14428f, -0.34265f, -0.12521f, + 0.01220f, -0.74906f, -0.19270f, 0.68110f, -0.24737f, -0.70568f, -1.64826f, + -0.35847f, -0.15984f, -1.17932f, -8.72306f, -8.72834f, 3.93701f, 6.17812f, + -0.03191f, -0.00104f, 0.01402f, -0.00046f, -0.94517f, 1.51266f, -0.56318f, + 0.72260f, -0.09253f, -0.09069f, -2.16695f, -0.23653f, 0.24418f, 2.21148f, + -1.47954f, -1.01439f, 0.31536f, 0.77238f, -0.85083f, -0.15758f, -0.50886f, + 0.09101f, +}; + +static const float av1_rect_partition_nn_bias_64_layer0[NUM_NODES] = { + 0.91706f, -1.31328f, -5.16196f, 1.13191f, -0.98044f, -1.61122f, 1.03039f, + -0.98537f, -4.45568f, -4.34802f, -0.92116f, 0.66836f, -0.10752f, -0.13065f, + -0.35567f, -0.35693f, 1.74941f, 1.17379f, -3.45555f, 5.66321f, -0.24917f, + -1.11940f, -0.73656f, -0.19299f, -0.04181f, 1.11010f, -2.97859f, -0.16774f, + 0.59835f, -0.31269f, -0.30585f, -1.66212f, +}; + +static const float av1_rect_partition_nn_weights_64_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.58963f, 4.20320f, -8.62465f, -6.54014f, 5.41108f, 2.33581f, -0.10354f, + -1.17753f, -3.45909f, -2.24722f, 2.20881f, 3.21971f, -0.09087f, -0.21624f, + 0.16529f, -8.40985f, -1.60205f, -1.41538f, 4.41826f, -4.63069f, -0.27742f, + 4.08710f, 0.26439f, -1.46028f, 0.51234f, 6.25212f, -3.35650f, -1.21348f, + 1.37201f, 8.89151f, 0.28859f, -0.97328f, -0.36196f, -2.71701f, 4.54196f, + -0.62476f, -2.43814f, -1.34209f, 0.12850f, 1.73859f, 3.09809f, -4.42434f, + -1.82552f, -3.66420f, -0.31535f, 0.00968f, -0.02019f, 9.66824f, 0.58835f, + 1.50425f, 2.84487f, 2.55522f, 0.01409f, -2.27594f, -0.31800f, 0.91076f, + -0.66808f, 0.33120f, -0.12460f, 0.64457f, -0.36416f, -10.30843f, 1.51013f, + 2.06861f, -0.20989f, -0.87119f, 3.68642f, 7.33662f, -2.88037f, -0.52414f, + -0.35036f, -0.45947f, -0.07406f, 6.46346f, -0.16031f, 0.27071f, 0.38845f, + -0.21940f, 0.08583f, -1.39526f, 0.50554f, 0.45279f, -6.61856f, 1.84069f, + -0.19149f, -1.77235f, 0.75136f, 1.11797f, 0.32677f, -7.10427f, 3.82908f, + 1.04238f, -0.91435f, 1.93317f, -1.84946f, -0.48909f, +}; + +static const float av1_rect_partition_nn_bias_64_layer1[3] = { + 0.32215f, + -0.57522f, + 0.25314f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_64_layer0, + av1_rect_partition_nn_weights_64_layer1 }, + { av1_rect_partition_nn_bias_64_layer0, av1_rect_partition_nn_bias_64_layer1 } +}; + +static const float av1_rect_partition_nn_weights_128_layer0[FEATURE_SIZE * 
+ NUM_NODES] = { + -0.70901f, -3.03481f, 3.30604f, -1.28803f, -0.08610f, -0.33320f, -0.30716f, + 0.25100f, 0.14323f, -0.98422f, -0.89084f, -0.24508f, -1.10785f, -0.82524f, + 0.11766f, -0.42777f, 1.08965f, 4.35125f, -1.19388f, 4.22042f, 4.96306f, + 6.32406f, 3.29899f, -0.90768f, 0.05203f, 0.38467f, 1.74257f, -0.19918f, + -0.11335f, 0.00140f, -0.42303f, -0.04419f, 0.03583f, -0.05441f, -0.19586f, + 0.01484f, -1.19964f, 0.25497f, 3.04502f, 0.05446f, -0.23253f, 0.00266f, + 0.07117f, -2.78986f, -4.62953f, 1.45331f, 0.43923f, 0.92298f, -0.47736f, + 1.49165f, 0.45942f, -1.99787f, 3.33510f, 0.17234f, 0.04024f, -1.42780f, + 0.23566f, -0.90970f, 1.18041f, -1.45865f, 2.30878f, -1.28507f, 1.87290f, + 1.91186f, 4.74826f, -3.70735f, 4.49808f, -4.72275f, -0.02696f, -0.02642f, + -0.06093f, -0.01121f, -0.70683f, 2.69737f, -1.88563f, 2.48637f, 1.10922f, + 0.74624f, 0.40308f, 2.06396f, 1.39289f, 0.00909f, -2.05271f, -1.53539f, + -1.38323f, 0.83303f, -0.32250f, 0.51172f, 3.91249f, 1.66373f, 1.13184f, + -2.22874f, -1.13448f, -0.11185f, 0.19387f, 0.36770f, -0.58933f, 0.22789f, + 1.17307f, 0.77461f, 0.20817f, 0.33417f, 0.54037f, 0.32961f, -0.18456f, + -9.78171f, -0.17216f, -3.44703f, -2.42158f, 0.51946f, 4.35949f, -0.73335f, + -1.61515f, -0.29622f, -0.37617f, -0.42316f, 0.74922f, 1.44386f, 3.92704f, + -3.76274f, 4.19775f, -3.86958f, 0.00074f, -0.02418f, -0.12944f, 0.05857f, + -0.85507f, 5.42546f, 5.40338f, 5.54347f, 5.59791f, -0.01611f, 0.01618f, + -0.01654f, -0.00270f, -0.39608f, -0.40410f, -0.24551f, 0.09124f, -0.34413f, + -0.11504f, 0.12793f, -0.31523f, 0.09148f, -0.08567f, -0.05140f, -0.13310f, + -0.81200f, 0.06882f, -0.52537f, -12.74048f, -0.45395f, -4.04775f, -1.84887f, + -1.02573f, 0.32788f, 1.06828f, -1.25503f, -0.42693f, 2.01413f, -2.29103f, + 0.62271f, 1.11764f, -1.83113f, -1.32325f, -1.65651f, -2.87826f, 1.46910f, + 0.60885f, 0.16079f, 0.00171f, -0.25658f, -0.25465f, -0.14149f, 0.19497f, + -0.07866f, -0.37080f, -0.05778f, -0.08870f, -0.20491f, 0.84521f, -0.18214f, + -1.38441f, -1.08932f, -1.76627f, 0.73172f, 0.05967f, 1.28057f, 3.42722f, + 1.69287f, 0.77169f, 0.44528f, 1.85513f, 0.07840f, 1.31252f, 2.89948f, + 1.49489f, 0.15281f, 0.54708f, -1.14185f, -2.51063f, 0.36618f, -0.55322f, + 0.96671f, 1.59470f, 1.38252f, 1.99697f, 0.03266f, -0.23200f, -0.01127f, + -0.18918f, -0.37598f, -0.03119f, -0.36039f, -0.21192f, -0.11565f, -4.22635f, + 1.41252f, 0.56608f, -0.08867f, 3.11924f, -0.54597f, -0.12504f, -0.05289f, + -0.28665f, -0.58297f, -1.18362f, -0.76201f, -1.22011f, -0.58756f, 0.14740f, + 1.43971f, 0.98381f, -0.02998f, -0.40678f, -0.23047f, -0.12979f, 0.04003f, + -0.22081f, -0.09294f, -0.15955f, -0.10379f, -0.10192f, -1.51316f, 2.39482f, + -1.69975f, 3.58976f, -0.91032f, -0.03498f, 0.48982f, -0.13418f, 0.76256f, + 1.61003f, -2.01676f, -1.24430f, -3.25763f, 1.12314f, 2.00740f, 0.04613f, + -0.14746f, -0.57374f, 3.44511f, -0.56767f, -4.08432f, -2.04894f, 2.35951f, + -0.00458f, 0.18512f, 0.09916f, -0.04084f, -1.56207f, 1.38034f, 4.17302f, + -1.47326f, -2.03530f, -0.00210f, 0.27469f, -0.17423f, 0.86860f, 2.76195f, + 2.43269f, -3.57331f, 2.08715f, -1.44171f, -0.17389f, 2.26157f, -0.07852f, + 2.02519f, +}; + +static const float av1_rect_partition_nn_bias_128_layer0[NUM_NODES] = { + 2.53427f, 1.66678f, -0.84914f, -0.15070f, -1.74769f, 0.45218f, -0.26067f, + 2.05916f, 0.08978f, 5.30984f, 2.66243f, -1.62740f, 0.70018f, 1.96403f, + -4.97152f, -0.05425f, -3.84474f, -1.28006f, 3.47490f, -0.08373f, 0.00225f, + -1.40692f, -0.27569f, -0.30253f, 0.77377f, -0.67636f, -0.26379f, 1.82348f, + 0.66120f, 0.61119f, -1.42293f, 
0.32676f,
+};
+
+static const float av1_rect_partition_nn_weights_128_layer1[NUM_NODES *
+                                                            LABEL_SIZE] = {
+  1.53453f, -0.23707f, 7.88368f, 0.33340f, 0.97523f, 1.38538f, -0.16746f,
+  4.42070f, 3.18678f, -5.03545f, -2.27029f, -3.75719f, -0.26850f, -4.93432f,
+  -8.75673f, 0.27398f, -5.77882f, -0.91616f, -2.62725f, -0.23961f, 0.31249f,
+  3.32134f, 0.25375f, -0.00394f, 2.30213f, -0.14183f, 0.14544f, -1.42830f,
+  1.31101f, 3.99389f, -0.00017f, -2.90184f, -2.11444f, 2.16734f, -3.05133f,
+  0.39206f, 4.61489f, -2.88181f, -0.47745f, 2.86649f, -1.20621f, 3.70550f,
+  1.58029f, -4.58731f, -2.29350f, -0.76930f, 5.19135f, -0.22521f, -5.08782f,
+  2.17316f, 1.30563f, 0.16777f, -2.17767f, -2.09904f, 1.37001f, 0.25091f,
+  -1.76743f, 1.57940f, 0.30544f, -2.39895f, -0.08532f, -1.77122f, 1.84010f,
+  -0.88449f, 0.79299f, -1.35368f, -4.54110f, 0.02244f, -5.11580f, 1.60883f,
+  0.29352f, -6.47042f, -1.81426f, 1.24013f, 0.90980f, 7.93977f, 2.12555f,
+  5.24720f, 4.19508f, 0.21499f, 11.06045f, -0.74752f, 0.89396f, 0.26422f,
+  1.72332f, -1.25113f, -1.71136f, 0.13676f, -0.07867f, -0.96929f, 0.19911f,
+  3.58233f, -0.76470f, -2.24162f, -2.87465f, 3.18736f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer1[3] = {
+  1.09014f,
+  -0.53317f,
+  -0.55668f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_128 = {
+  FEATURE_SIZE,  // num_inputs
+  LABEL_SIZE,    // num_outputs
+  1,             // num_hidden_layers
+  {
+    NUM_NODES,
+  },  // num_hidden_nodes
+  { av1_rect_partition_nn_weights_128_layer0,
+    av1_rect_partition_nn_weights_128_layer1 },
+  { av1_rect_partition_nn_bias_128_layer0,
+    av1_rect_partition_nn_bias_128_layer1 }
+};
+#undef FEATURE_SIZE
+#undef NUM_NODES
+#undef LABEL_SIZE
+
+// Below are the models used for simple_motion_search_based_split
+// Thresholds
+// The first index level is aggressiveness, the second is frame resolution,
+// and the third is bsize (see the illustrative note after these tables)
+static const float av1_simple_motion_search_split_thresh[4][3][5] = {
+  // Aggressiveness = 0
+  {
+    // lowres
+    {
+      1.40402595879f,  // p = 0.8028197
+      4.72845183649f,  // p = 0.99123732
+      1.86517797783f,  // p = 0.86589934
+      1.58715223005f,  // p = 0.83021506
+      7.22695596987f,  // p = 0.9992738
+    },
+    // midres
+    {
+      5.839480f,  // p = 0.997098
+      1.877167f,  // p = 0.867285
+      3.073499f,  // p = 0.955783
+      1.405601f,  // p = 0.803071
+      2.555636f,  // p = 0.927951
+    },
+    // hdres
+    {
+      5.839480f,  // p = 0.997098
+      1.877167f,  // p = 0.867285
+      3.073499f,  // p = 0.955783
+      1.405601f,  // p = 0.803071
+      2.555636f,  // p = 0.927951
+    },
+  },
+  // Aggressiveness = 1
+  {
+    // Lowres
+    {
+      100.0000f,  // p = 1.000000
+      4.952535f,  // p = 0.992984
+      1.720880f,  // p = 0.848242
+      1.426233f,  // p = 0.806314
+      1.491905f,  // p = 0.816364
+    },
+    // Midres
+    {
+      100.0000f,  // p = 1.000000
+      3.137263f,  // p = 0.958404
+      2.703262f,  // p = 0.937219
+      1.877166f,  // p = 0.867285
+      2.221149f,  // p = 0.902133
+    },
+    // Hdres
+    {
+      4.417680f,  // p = 0.988082
+      3.086898f,  // p = 0.956349
+      3.966704f,  // p = 0.981416
+      1.532565f,  // p = 0.822381
+      3.449975f,  // p = 0.969230
+    },
+  },
+  // Aggressiveness = 2
+  {
+    // lowres
+    {
+      100.000000f,  // p = 0.998048
+      1.484020f,    // p = 0.815179
+      1.866781f,    // p = 0.866085
+      1.706711f,    // p = 0.846409
+      2.080369f,    // p = 0.888980
+    },
+    // midres
+    {
+      100.000000f,  // p = 0.0
+      3.265763f,    // p = 0.963235428881
+      2.024598f,    // p = 0.883355591569
+      1.846446f,    // p = 0.863709256976
+      2.240962f,    // p = 0.903868036126
+    },
+    // hdres
+    {
+      3.133026f,  // p = 0.958234684141
+      2.940954f,  // p = 0.949834204693
+      2.484544f,  // p = 0.923051170045
+      1.702972f,  //
p = 0.845922460525 + 1.655562f, // p = 0.839641385729 + }, + }, + // Aggressiveness = 3 + { + // lowres + { 100.000000f, 1.41409519484f, 0.606066095487f, 0.0993410805635f, + 0.762099214988f }, + // midres + { 100.000000f, 0.702207995397f, 0.503550081119f, 0.0403228785199f, + 0.557298794638f }, + // hdres + { 1.21895384144f, 1.26798450469f, 0.872537808115f, 0.975869438148f, + 1.86572095242f }, + }, +}; + +static const float av1_simple_motion_search_no_split_thresh[4][3][5] = { + // Aggressiveness = 0 + { + // lowres + { + -100.0f, // p = 0.0 + -100.0f, // p = 0.0 + -100.0f, // p = 0.0 + -100.0f, // p = 0.0 + -100.0f, // p = 0.0 + }, + // midres + { + -3.38168078f, // p = 0.032872917 + -4.08610739f, // p = 0.016526795 + -1.78302370f, // p = 0.15270848 + -100.000000f, // p = 0.0 + -100.000000f, // p = 0.0 + }, + // hdres + { + -100.000000f, // p = 0.0 + -100.000000f, // p = 0.0 + -2.98718897f, // p = 0.048008 + -100.000000f, // p = 0.0 + -3.33229488f, // p = 0.03447975 + }, + }, + // Aggressiveness = 1 + { + // Lowres + { + -100.0000f, // p = 0.0 + -4.893793f, // p = 0.007437 + -3.387766f, // p = 0.032680 + -2.982806f, // p = 0.048209 + -2.330372f, // p = 0.088639 + }, + // Midres + { + -100.0000f, // p = 0.000000 + -6.131853f, // p = 0.002168 + -2.346579f, // p = 0.087338 + -2.712849f, // p = 0.062219 + -3.195430f, // p = 0.039338 + }, + // Hdres + { + -3.491416f, // p = 0.029557 + -2.192853f, // p = 0.100394 + -3.620180f, // p = 0.026079 + -2.030855f, // p = 0.116001 + -2.797586f, // p = 0.057455 + }, + }, + // Aggressiveness = 2 + { + // lowres + { + -100.0000f, // p = 0.0 + -3.617350f, // p = 0.026151 + -5.902503f, // p = 0.002725 + -4.677840f, // p = 0.009213 + -2.168378f, // p = 0.102626 + }, + // midres + { + -100.0000f, // p = 0.0 + -3.204195f, // p = 0.0390081679555 + -2.354128f, // p = 0.0867382128969 + -2.523326f, // p = 0.0742390077132 + -3.112328f, // p = 0.0426016085803 + }, + // hdres + { + -5.047760f, // p = 0.00638270448225 + -3.414994f, // p = 0.0318301469487 + -5.628090f, // p = 0.00358255438917 + -2.122691f, // p = 0.10691083145 + -1.972387f, // p = 0.122132728355 + }, + }, + // Aggressiveness = 3 + { + // lowres + { -100.000000f, -2.04766486133f, -1.00442099188f, -1.15077982642f, + -1.0830321897f }, + // midres + { -100.000000f, -0.985686808303f, -0.757739584866f, -0.890120107569f, + -0.228236297886f }, + // hdres + { -1.03535679263f, -1.57431743203f, -0.564851540156f, -0.35442301663f, + -1.36741555171f }, + }, +}; + +static const float av1_simple_motion_search_split_mean_128[17] = { + 14.119120f, 14.087010f, 12.016185f, 11.966075f, 12.042454f, 11.994805f, + 12.152105f, 12.100394f, 12.178377f, 12.128937f, 4.779944f, 0.714786f, + 3.535450f, 3.566207f, 0.835913f, 3.315452f, 3.302908f, +}; + +static const float av1_simple_motion_search_split_std_128[17] = { + 1.832420f, 1.835338f, 2.019207f, 2.020793f, 2.008731f, 2.008403f, + 1.900999f, 1.907081f, 1.908915f, 1.913122f, 2.109345f, 0.451517f, + 1.407097f, 1.372501f, 0.370355f, 1.321495f, 1.319665f, +}; + +static const float av1_simple_motion_search_split_mean_64[17] = { + 12.363721f, 12.314348f, 10.404341f, 10.333541f, 10.405775f, 10.336996f, + 10.402246f, 10.330084f, 10.405584f, 10.334330f, 4.554232f, 0.896393f, + 2.819613f, 2.855845f, 0.926296f, 2.808782f, 2.798229f, +}; + +static const float av1_simple_motion_search_split_std_64[17] = { + 1.878920f, 1.882255f, 1.950167f, 1.953289f, 1.913869f, 1.914781f, + 1.920096f, 1.924454f, 1.880200f, 1.882499f, 2.050922f, 0.304750f, + 1.144391f, 1.125088f, 0.261289f, 1.145059f, 
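+// Illustrative note: the "p =" annotations in the
+// av1_simple_motion_search_split_thresh and
+// av1_simple_motion_search_no_split_thresh tables above are consistent with
+// reading each threshold as a logit, p = 1 / (1 + exp(-thresh)); e.g.
+// 1 / (1 + exp(-1.40402595879)) ~= 0.8028197. A minimal usage sketch, under
+// the assumption that the split model emits a raw logit score (the helper
+// below is hypothetical, not part of the library):
+//
+//   #include <math.h>
+//   // Returns 1 when the model clears the tuned confidence cutoff.
+//   static int smss_should_split(float logit, int agg, int res, int bs) {
+//     return logit > av1_simple_motion_search_split_thresh[agg][res][bs];
+//   }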
1.131215f, +}; + +static const float av1_simple_motion_search_split_mean_32[17] = { + 10.750278f, 10.679627f, 8.745625f, 8.644149f, 8.757436f, 8.656657f, + 8.759780f, 8.656299f, 8.772563f, 8.669839f, 4.208026f, 0.958573f, + 2.308769f, 2.347375f, 0.961685f, 2.323464f, 2.296322f, +}; + +static const float av1_simple_motion_search_split_std_32[17] = { + 1.879269f, 1.883531f, 1.935828f, 1.935677f, 1.915823f, 1.914773f, + 1.909733f, 1.910315f, 1.890451f, 1.890032f, 1.913318f, 0.199276f, + 0.988825f, 0.972115f, 0.191956f, 0.977131f, 0.951418f, +}; + +static const float av1_simple_motion_search_split_mean_16[17] = { + 9.076768f, 8.974986f, 7.078364f, 6.926072f, 7.088739f, 6.936111f, + 7.096697f, 6.942841f, 7.114978f, 6.961046f, 3.865480f, 0.982632f, + 1.886023f, 1.912892f, 0.981492f, 1.926059f, 1.891233f, +}; + +static const float av1_simple_motion_search_split_std_16[17] = { + 1.922965f, 1.925609f, 1.851980f, 1.847558f, 1.848410f, 1.843990f, + 1.843931f, 1.839582f, 1.840304f, 1.836144f, 1.760042f, 0.130639f, + 0.841086f, 0.833523f, 0.134780f, 0.840790f, 0.831309f, +}; + +static const float av1_simple_motion_search_split_mean_8[17] = { + 7.120238f, 6.957731f, 5.176309f, 4.889594f, 5.178396f, 4.886607f, + 5.195322f, 4.905566f, 5.198845f, 4.904745f, 3.648933f, 0.993198f, + 1.496831f, 1.520804f, 0.991864f, 1.489763f, 1.460761f, +}; + +static const float av1_simple_motion_search_split_std_8[17] = { + 1.698498f, 1.696000f, 1.629605f, 1.614641f, 1.632476f, 1.618831f, + 1.618352f, 1.603742f, 1.623089f, 1.609674f, 1.668587f, 0.082193f, + 0.759407f, 0.759684f, 0.089830f, 0.742797f, 0.730632f, +}; + +static const float *const av1_simple_motion_search_split_mean[5] = { + av1_simple_motion_search_split_mean_128, + av1_simple_motion_search_split_mean_64, + av1_simple_motion_search_split_mean_32, + av1_simple_motion_search_split_mean_16, + av1_simple_motion_search_split_mean_8, +}; + +static const float *const av1_simple_motion_search_split_std[5] = { + av1_simple_motion_search_split_std_128, av1_simple_motion_search_split_std_64, + av1_simple_motion_search_split_std_32, av1_simple_motion_search_split_std_16, + av1_simple_motion_search_split_std_8, +}; + +#define NUM_HIDDEN_LAYERS_128 1 +#define NUM_FEATURES_128 17 +#define NUM_LAYER_0_UNITS_128 20 +#define NUM_LOGITS_128 1 + +static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_128[] = { + 0.24095f, -0.397761f, -0.388619f, -0.0629548f, -0.44577f, 0.688212f, + -0.20889f, -1.08227f, -0.0313894f, -0.615505f, -0.401839f, 0.40233f, + -0.171305f, 0.439803f, 1.58527f, -0.968535f, -1.29255f, 1.14846f, + 0.885777f, 0.116412f, -0.225704f, 0.316506f, 0.793951f, -0.63591f, + 0.097789f, -0.327027f, -0.778396f, -0.231667f, -0.9622f, 1.0044f, + 0.32594f, 0.179768f, -0.115529f, -0.499395f, -1.14727f, -1.26111f, + 0.269818f, -0.0882028f, -0.349107f, 0.100901f, 0.0249506f, 0.528929f, + 0.113961f, 0.929794f, 0.242494f, -0.122828f, -0.0477379f, 0.170659f, + 0.0500187f, 0.28859f, 0.78783f, 0.482412f, 0.795298f, 0.179517f, + 0.453911f, -0.298029f, -0.903332f, 0.510615f, 0.691994f, 0.433383f, + -0.140802f, -1.11635f, -0.547326f, 1.11318f, 0.71905f, 0.978538f, + 0.097444f, -0.0386012f, 0.713599f, 0.465164f, 0.391278f, -0.472864f, + 0.230224f, -0.279508f, 0.558192f, -0.468625f, 0.55995f, -0.57507f, + -1.39947f, -0.755819f, -1.04512f, -0.411552f, -0.830444f, -0.106571f, + -0.0972184f, 0.251842f, 0.269955f, 0.230492f, -0.290581f, -0.484799f, + 0.0151041f, 0.171047f, 0.829999f, -0.384581f, 0.220301f, -0.121687f, + 1.88848f, -0.482809f, -0.48185f, 1.34482f, -0.716438f, 
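+// Illustrative note: the *_split_mean/*_split_std tables above presumably
+// z-score the 17 input features before inference (an assumption from the
+// naming; the normalization step itself is not in this header). A minimal
+// sketch with a hypothetical helper name:
+//
+//   // bsize_idx follows the pointer arrays above: {128, 64, 32, 16, 8}.
+//   static void smss_normalize(const float *raw, int bsize_idx, float *out) {
+//     const float *mu = av1_simple_motion_search_split_mean[bsize_idx];
+//     const float *sd = av1_simple_motion_search_split_std[bsize_idx];
+//     for (int i = 0; i < 17; ++i) out[i] = (raw[i] - mu[i]) / sd[i];
+//   }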
-0.284482f, + -1.78592f, -1.29333f, 0.886867f, 0.80106f, 0.456415f, 0.649095f, + 0.231093f, 0.361562f, 0.290018f, 0.128009f, -0.196343f, 0.0607802f, + 0.576761f, -0.0413836f, 0.0300984f, -0.318998f, 0.204434f, -0.712524f, + 0.833394f, -0.81168f, 0.765488f, -0.720973f, 1.12866f, -0.838694f, + 1.295f, -0.159127f, 1.05404f, 0.736519f, 0.248662f, 0.229233f, + 0.0434302f, 0.0551856f, 0.197862f, 0.354823f, -0.32429f, -0.227353f, + -0.132198f, -0.438118f, -0.210401f, -0.81046f, 0.653555f, 0.826737f, + 0.154235f, 0.228945f, 0.123089f, 0.614964f, -0.0940471f, -0.00676807f, + 0.24996f, 0.949233f, 0.746526f, -0.044474f, 0.386414f, 0.503221f, + 0.155133f, -0.698848f, -0.735356f, -0.255091f, 0.413235f, -0.335295f, + -0.145757f, 0.326299f, -0.602629f, -0.844474f, -0.346722f, -0.42598f, + -0.491016f, -0.447732f, -0.965366f, -0.0242841f, 0.836606f, -0.104877f, + 1.23236f, 0.683986f, 0.787005f, -0.0253437f, 1.2145f, 1.29554f, + -1.24302f, -0.229495f, 0.439415f, 0.885087f, -0.408704f, -0.119299f, + -0.0960972f, 0.60148f, 0.683271f, -0.057129f, -0.180295f, -0.264815f, + -0.363184f, 0.638271f, 0.631083f, -0.252899f, -0.164364f, -1.31274f, + 0.354408f, 0.0429172f, 0.371154f, -1.0978f, 0.0433642f, -0.467394f, + -0.706572f, 1.57198f, -0.0701271f, 1.93149f, -0.446267f, 1.4519f, + -1.29567f, 0.309978f, -0.878062f, 0.891494f, 0.364005f, -0.209611f, + -0.125927f, 0.184097f, 0.0629695f, -0.43375f, -0.0980562f, 1.08547f, + 0.578312f, 0.16566f, -0.198852f, -0.241854f, -0.523934f, -0.206037f, + -0.867721f, 1.00041f, 1.09848f, -2.12562f, -0.19992f, -0.186128f, + -0.03507f, 0.0484884f, 0.160856f, 0.10802f, -0.805141f, -1.06902f, + 0.290363f, 0.0222096f, -0.849266f, 0.112932f, 0.148682f, -0.0457585f, + 1.139f, 1.79141f, 0.194122f, -0.342508f, -0.403572f, 0.133678f, + 0.217553f, -0.263759f, 0.18441f, 0.254529f, 0.0471115f, 0.733178f, + -0.416205f, 0.441447f, -0.443335f, 0.725005f, -0.78946f, 0.71301f, + -0.644969f, 1.5445f, 0.365277f, -0.455775f, -0.365066f, 0.4742f, + -0.381714f, -0.545794f, -0.0464861f, -0.222768f, -0.0106466f, -0.069743f, + 0.0335566f, 0.378348f, -0.249663f, 0.922286f, 0.125711f, -0.894619f, + 0.444682f, 0.447893f, -1.98936f, -1.41978f, 0.0406667f, -0.199928f, + -0.199786f, 0.463481f, 0.334931f, -0.396222f, -0.0732259f, 0.796684f, + -0.140817f, -0.26878f, 0.194642f, 0.895784f, -0.369976f, -2.26981f, + -0.0791776f, -0.0492268f, 0.6715f, 0.281805f, 0.0156664f, -0.779785f, + 0.17743f, 0.188786f, -0.588077f, -0.359153f, 0.258319f, 0.881688f, + 0.846894f, 1.00292f, 0.838134f, 0.680632f, 0.273098f, -0.329261f, + 0.217757f, -0.506726f, -0.336523f, -0.695875f, -0.252006f, 0.751216f, + 0.334409f, -0.0151467f, 0.0885474f, 0.0973114f, -0.248754f, -0.263716f, + 0.369906f, -0.213749f, -0.0355395f, -0.137799f, 2.43233f, -0.944233f, + -0.745167f, 0.318558f, 0.316608f, 0.568678f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_128[] = { + 0.821344f, 1.11542f, -1.24172f, 1.03642f, 1.13511f, + 1.16414f, -0.278655f, -1.35558f, -1.26788f, -1.63189f, + -0.323271f, 1.21319f, -0.888415f, 0.987145f, -1.16767f, + 0.255833f, -0.1392f, 1.43265f, -1.54952f, 1.65159f +}; + +static const float av1_simple_motion_search_split_logits_kernel_128[] = { + 0.3565753f, 0.5490161f, -1.015597f, 0.565366f, 0.751604f, + 0.922747f, -1.931846f, 1.759353f, -0.7362949f, 0.5707034f, + -1.092127f, 0.936767f, 2.034499f, 2.08148f, 0.9509507f, + -1.342504f, -0.834566f, 0.618184f, 0.844113f, 1.182693f +}; + +static const float av1_simple_motion_search_split_logits_bias_128[] = { + 1.819351f +}; + +static const NN_CONFIG 
av1_simple_motion_search_split_nn_config_128 = { + NUM_FEATURES_128, + NUM_LOGITS_128, + NUM_HIDDEN_LAYERS_128, + { + NUM_LAYER_0_UNITS_128, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_128, + av1_simple_motion_search_split_logits_kernel_128, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_128, + av1_simple_motion_search_split_logits_bias_128, + }, +}; + +#undef NUM_HIDDEN_LAYERS_128 +#undef NUM_FEATURES_128 +#undef NUM_LAYER_0_UNITS_128 +#undef NUM_LOGITS_128 + +#define NUM_HIDDEN_LAYERS_64 1 +#define NUM_FEATURES_64 17 +#define NUM_LAYER_0_UNITS_64 24 +#define NUM_LOGITS_64 1 + +static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_64[] = { + -1.40663f, -0.851503f, -0.0613111f, 0.741591f, 0.302754f, + 0.184001f, 0.0474853f, 0.371096f, 0.0541624f, 0.381508f, + 0.355427f, 0.0428822f, 0.154916f, -0.00490099f, 0.025484f, + 0.0208921f, 0.140596f, -0.292525f, -0.459067f, -0.081393f, + 0.109824f, -0.290183f, 0.720236f, 0.385835f, -0.150643f, + -0.078518f, 0.0979819f, -0.102135f, 0.137152f, -0.0786457f, + 0.0171441f, 0.991338f, -0.546583f, -1.0714f, -0.0842851f, + 0.244072f, 0.427379f, 0.146775f, -0.921613f, -0.912093f, + 0.393566f, -0.232375f, 0.19963f, 0.312355f, 0.55659f, + -0.104714f, -0.137563f, 0.0985237f, 0.0788307f, -0.225514f, + 0.0228832f, -0.288733f, -0.00737685f, -0.711657f, -0.256796f, + 0.0869605f, 0.583977f, 0.384306f, 1.46692f, -0.741126f, + -0.21105f, -0.276604f, -0.0151463f, -0.0227997f, -0.0403232f, + 0.044122f, 0.0185784f, -0.0451951f, 0.00489513f, -0.387131f, + 0.0966724f, -0.599174f, -0.00243351f, -0.21439f, 0.302043f, + 0.130334f, -0.191251f, 0.863261f, -1.50112f, 0.00901057f, + 0.000324294f, -0.0572545f, 0.0117685f, -0.0734682f, -0.0570435f, + -0.126253f, 1.2313f, -0.328267f, 0.211788f, -0.175438f, + -0.0419298f, 0.166447f, -0.178739f, -0.326221f, -0.0439188f, + 1.01182f, -0.390678f, -0.426343f, 0.0944665f, -0.225042f, + -0.183344f, 0.0500763f, -0.377393f, -0.673401f, -0.436907f, + -0.00366876f, -0.363412f, 0.195194f, 0.250248f, -0.397193f, + -0.0917222f, -0.0221579f, 1.7693f, -0.0694484f, -0.0410764f, + -0.134571f, -0.159992f, -0.170359f, -0.249333f, -0.128056f, + -0.617054f, -0.808701f, -0.540642f, 0.396391f, 0.147787f, + 0.346916f, 0.709852f, 0.116064f, 0.0509731f, 0.073713f, + -0.365082f, -1.09287f, -0.618214f, 0.20545f, 0.126161f, + -0.140012f, 0.62592f, 0.316326f, -0.392765f, -0.15934f, + 0.337617f, -0.41669f, -0.295225f, 0.0602025f, -0.0150657f, + -0.319629f, 0.783729f, -0.0661199f, -0.362657f, 0.390042f, + -0.043614f, -0.0414596f, 0.121155f, -0.309775f, -0.284761f, + -0.243932f, 0.279855f, -0.266823f, 0.734824f, -0.164028f, + 0.261776f, -0.105585f, 0.10733f, -0.180469f, 1.18875f, + -1.12836f, -0.173008f, 0.150221f, 0.111598f, 0.148306f, + -1.2833f, -1.06346f, 0.233546f, 0.16432f, 0.00142378f, + 0.340574f, -0.0140885f, 0.634761f, -0.122096f, 0.821487f, + 0.421424f, -0.0256687f, -0.035503f, -0.0453547f, -0.0215179f, + -0.0671277f, -0.0486862f, -0.962761f, -0.208383f, 0.109573f, + -0.210668f, -0.176485f, 0.421279f, 0.41605f, 0.342084f, + 0.619364f, 0.103718f, -0.00341643f, 0.00266677f, 0.249089f, + -0.22848f, -0.0368968f, 1.12092f, -0.64912f, -0.456579f, + 0.477823f, 0.418345f, 1.41515f, 0.0936279f, 0.886155f, + -0.785656f, -0.217109f, -0.561829f, -0.286435f, -0.884068f, + -0.148839f, -0.282848f, 0.0683745f, 0.0962815f, -0.111975f, + 0.0509158f, -0.211274f, 0.744909f, -0.8982f, 0.315232f, + -0.78624f, 0.598387f, -0.530952f, 0.677357f, 0.0371339f, + 0.99209f, -0.681899f, -0.291416f, -0.224822f, -0.26049f, + -0.0436525f, 
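+// Illustrative note: each NN_CONFIG here describes a fully connected net
+// with a single hidden layer. A minimal forward-pass sketch for the
+// one-logit split models, assuming a ReLU hidden activation, node-major
+// kernel layout, and the member names weights/bias (all assumptions; this
+// is a reading of the struct, not the library's inference routine):
+//
+//   static float nn_forward_1hl(const NN_CONFIG *cfg, const float *in) {
+//     float hidden[32];  // large enough for every model in this file
+//     for (int j = 0; j < cfg->num_hidden_nodes[0]; ++j) {
+//       float s = cfg->bias[0][j];
+//       for (int i = 0; i < cfg->num_inputs; ++i)
+//         s += cfg->weights[0][j * cfg->num_inputs + i] * in[i];
+//       hidden[j] = s > 0.0f ? s : 0.0f;  // ReLU (assumed)
+//     }
+//     float logit = cfg->bias[1][0];
+//     for (int j = 0; j < cfg->num_hidden_nodes[0]; ++j)
+//       logit += cfg->weights[1][j] * hidden[j];
+//     return logit;  // compared against the split thresholds above
+//   }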
-0.380004f, -0.27187f, 0.534779f, 0.717939f, + 0.418197f, -0.152539f, -0.0684039f, -0.186308f, -0.0653121f, + 0.194145f, -0.196367f, 0.256997f, -0.726269f, -0.307672f, + -0.153362f, 0.450827f, 0.708842f, -0.0667079f, 0.555564f, + 0.0486892f, 0.0715072f, -0.7211f, -0.849797f, 0.0650271f, + 1.2747f, -0.646738f, -0.53042f, 0.182197f, 0.928203f, + 0.180621f, -0.00640791f, -0.171416f, 0.092688f, -0.391275f, + -0.0650657f, 0.0843773f, 0.170824f, 0.378085f, 0.0596657f, + 0.844398f, -1.3083f, -1.27828f, -0.199179f, 0.557855f, + 0.241479f, 0.385804f, 0.169533f, -0.0028072f, 0.0538041f, + 0.00136234f, 0.0130481f, 0.0349449f, -0.0366494f, -0.000474055f, + 0.437956f, 0.286724f, -0.298187f, 0.461967f, 0.43065f, + -0.0877194f, -0.19133f, 0.379121f, -0.687751f, -1.64077f, + -0.375191f, -0.336836f, -0.323904f, -0.101859f, 0.0126672f, + -0.346332f, 0.112303f, -0.863336f, 0.155538f, 0.366509f, + -0.0976829f, 0.635278f, -0.681967f, -0.527729f, 0.591839f, + 0.366678f, 0.189981f, 0.0208007f, -0.565809f, 0.70183f, + -0.282844f, -0.327485f, 0.347243f, -1.13014f, -0.373378f, + -0.514978f, 0.662994f, -0.144931f, 0.1402f, -0.820049f, + 0.711498f, 0.681156f, 1.06515f, -0.423409f, -0.0392664f, + 0.0675396f, -0.0508602f, 0.0431443f, 0.0212639f, -0.0279887f, + -0.62611f, -0.202064f, 0.701934f, 1.28452f, -0.00858481f, + -0.517249f, 0.0615832f, -0.260215f, 0.0949119f, -0.28423f, + -0.39573f, -0.0574246f, -0.318658f, 0.0601775f, -0.0629386f, + -0.134208f, 0.111686f, -0.23355f, 0.078667f, 0.741023f, + 0.828523f, -0.345067f, -0.315135f, -0.0957154f, 0.522825f, + -0.190057f, -0.473789f, -0.390489f, 0.200677f, -0.0271802f, + 0.110336f, 0.493302f, 0.663126f, 0.570148f, -0.380042f, + -0.437349f, -0.660884f, 0.301908f, 0.0644179f, 0.172494f, + 0.461917f, 0.330938f, -0.140041f, -0.0430205f, -1.51003f, + -0.410984f, -0.182161f, 0.0235313f, -0.364849f, 0.154183f, + -0.592465f, 0.272701f, 0.192389f, -0.0497777f, -0.924467f, + -0.179513f, -0.592217f, 0.436363f, -0.0716164f, 0.189094f, + -0.574697f, -0.304303f, 0.326441f, -0.0865553f, 0.735948f, + 0.266912f, 0.435824f, -0.123322f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_64[] = { + -1.19333f, 1.01834f, -1.10844f, 0.0454873f, -1.45506f, 0.580864f, + -0.040979f, -0.505681f, -1.15072f, 0.692697f, -0.520812f, -0.479384f, + 0.529652f, 0.507252f, -1.08619f, 0.0586375f, 0.0929614f, -0.46753f, + -0.701857f, -0.362933f, -0.291983f, -0.133933f, -0.0131351f, -0.267582f +}; + +static const float av1_simple_motion_search_split_logits_kernel_64[] = { + -3.32501f, 0.43082f, -1.060692f, 1.328908f, 0.8892894f, 0.6488833f, + -1.096516f, -0.664786f, -1.301339f, 0.508805f, -2.128406f, -0.757304f, + 0.383839f, 0.694763f, -0.591725f, 0.770385f, 1.021594f, 0.589181f, + -0.76238f, 1.488826f, 0.709135f, -0.575738f, 0.26421759f, -0.2484219f +}; + +static const float av1_simple_motion_search_split_logits_bias_64[] = { + 0.699037f +}; + +static const NN_CONFIG av1_simple_motion_search_split_nn_config_64 = { + NUM_FEATURES_64, + NUM_LOGITS_64, + NUM_HIDDEN_LAYERS_64, + { + NUM_LAYER_0_UNITS_64, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_64, + av1_simple_motion_search_split_logits_kernel_64, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_64, + av1_simple_motion_search_split_logits_bias_64, + }, +}; + +#undef NUM_HIDDEN_LAYERS_64 +#undef NUM_FEATURES_64 +#undef NUM_LAYER_0_UNITS_64 +#undef NUM_LOGITS_64 + +#define NUM_HIDDEN_LAYERS_32 1 +#define NUM_FEATURES_32 17 +#define NUM_LAYER_0_UNITS_32 20 +#define NUM_LOGITS_32 1 + +static const float 
av1_simple_motion_search_split_hiddenlayer_0_kernel_32[] = { + -0.980626f, -0.946611f, 0.103761f, 0.408899f, 0.498149f, + 0.0490161f, 0.253279f, 0.332029f, 0.00367441f, 0.364401f, + -0.236433f, 0.0592119f, -0.0978848f, 0.159733f, -0.018052f, + -1.10726f, 1.16167f, -0.244982f, -0.147819f, -0.147095f, + 0.111404f, -0.349502f, 0.441178f, 0.0984191f, -0.135537f, + -0.0423312f, 0.0123079f, 0.358012f, -0.266796f, 0.0125811f, + 0.196563f, 0.337093f, -1.07266f, -1.25134f, 0.57337f, + -0.521717f, 0.259824f, 0.537383f, -0.463688f, -0.336128f, + 0.373385f, 0.483443f, -0.229293f, -0.33373f, -0.656021f, + 0.768647f, 0.179279f, 0.315415f, 0.187749f, 1.07839f, + 0.0626629f, -0.230299f, 0.662606f, -0.414154f, 0.459334f, + -0.6312f, 0.427704f, -0.249849f, 0.701056f, -0.707969f, + 0.057401f, 0.620434f, 0.665748f, -0.501356f, -0.230685f, + 0.0722371f, -0.0988625f, -0.114035f, -0.653799f, 0.571353f, + 0.268276f, 1.13251f, -1.0695f, -0.225607f, -0.984355f, + -0.42213f, 0.300422f, 1.21492f, -0.139931f, -0.000726004f, + 0.045964f, -0.0817352f, -0.0278813f, -0.0102341f, -0.0144087f, + -0.475882f, 1.20682f, -0.359919f, 0.277189f, -0.166401f, + 0.599211f, -0.129872f, 0.574211f, -0.247573f, 0.824405f, + -1.53329f, -0.202151f, -0.328698f, -0.516322f, -0.281416f, + -0.383651f, -0.252862f, -0.43185f, 0.456802f, -0.430055f, + -0.55245f, -0.6884f, -0.541456f, -0.281376f, 1.10425f, + -0.140706f, 1.59816f, -0.0343895f, -0.00920039f, -0.0307667f, + 0.0560132f, -0.0340302f, -0.10848f, 0.0593314f, -0.951795f, + 0.876831f, -1.00548f, -0.566244f, 0.430061f, 1.10109f, + -0.634212f, -0.0755369f, -0.108953f, 1.03191f, 0.109036f, + -0.0415309f, 0.0681162f, -0.0611775f, -0.0231938f, 0.0973158f, + -0.0558169f, -0.823484f, -0.918509f, 0.16756f, 0.27087f, + 0.286074f, 0.174069f, 0.1304f, 0.386074f, 0.433953f, + 0.0291467f, -1.74087f, 0.0296094f, -0.00793714f, -0.13041f, + 0.00990992f, -0.0137848f, -0.0742606f, -0.251029f, -0.645316f, + 0.640029f, 0.550607f, 0.470097f, 0.549451f, -0.285723f, + -0.164759f, -0.128166f, -0.391496f, -0.80287f, 0.0769472f, + 1.34391f, 0.0215005f, 0.0669497f, 0.131919f, 0.291674f, + 0.0952889f, -0.677953f, -0.364054f, 0.144823f, 0.246198f, + -0.12393f, 0.363661f, 0.215091f, -0.239658f, 0.18491f, + 0.118703f, 0.0064156f, 1.38619f, -1.3845f, 0.0567323f, + 1.20812f, -0.720374f, -1.92158f, -1.48657f, 0.335601f, + 0.409379f, 0.373618f, 0.231274f, 0.292194f, 0.368619f, + 0.2398f, 0.473579f, 0.83402f, -0.0133751f, -0.00344358f, + 2.20688e-05f, 0.00836757f, 0.00405377f, 0.0110539f, -0.260154f, + 0.192112f, -0.666986f, 0.302875f, -0.113302f, 0.17882f, + -0.221493f, 0.146161f, -0.448697f, 0.584187f, 0.122109f, + 0.989981f, -1.14706f, -0.734042f, 0.0638213f, 0.213357f, + 0.068543f, -0.808558f, 0.404741f, 0.808313f, 1.57523f, + -0.113448f, 0.254102f, -0.350065f, -0.615f, 0.0753549f, + -0.540936f, -0.0250732f, -0.225681f, -0.161384f, 0.0128342f, + -0.0933368f, -0.286904f, 0.130133f, -0.874747f, 0.392585f, + -0.493135f, 0.169708f, 0.0909804f, 1.89921f, -0.469954f, + 0.65165f, -0.953401f, -0.21595f, -0.37479f, 0.0451146f, + 0.0234621f, -0.0596903f, -0.0682308f, -0.0830426f, 0.130011f, + -0.409141f, 0.0627038f, -0.581148f, -0.513922f, 0.631676f, + 0.0637034f, 0.0539081f, 0.0638872f, 0.515863f, -0.0123463f, + 0.177238f, 0.279506f, -0.930345f, 1.23726f, 0.202851f, + 0.708792f, -0.445086f, -0.0267075f, -0.913822f, -0.0714978f, + -0.281107f, -0.0770565f, -0.23086f, -0.165893f, -0.319683f, + 0.216235f, -0.490999f, 2.04841f, -0.0524071f, -0.239043f, + -0.0526375f, 0.023002f, -0.132685f, -0.155354f, -0.186503f, + -0.904296f, 
0.166478f, 0.063268f, -0.302842f, -0.27179f, + -0.428299f, 0.50193f, 0.480717f, -0.864275f, 0.317096f, + 0.40698f, 0.0286107f, 0.189432f, -0.0374374f, 0.0671728f, + 0.203681f, -0.457959f, -0.155776f, 0.340948f, 0.542841f, + 0.342675f, -0.000952399f, 0.470957f, 0.744418f, -1.11763f, + -0.658812f, -0.044832f, 0.0688237f, -0.357766f, 0.428662f, + -0.087152f, -0.291903f, 0.373244f, -0.587853f, 0.415895f, + -0.535694f, 0.621785f, -0.143648f, 0.0451373f, 0.00068827f, + 1.84432f, -1.26239f, -0.432087f, -0.152307f, 0.0293551f, + 0.184744f, -0.0173156f, -0.00572154f, -0.0305062f, -0.0900071f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_32[] = { + 0.160011f, 0.903856f, -0.13738f, 0.358221f, -0.0906044f, + -0.606558f, -0.0215651f, -0.03377f, -1.67017f, -0.144554f, + -0.201482f, -0.87719f, 0.639815f, -0.51976f, -0.309922f, + -1.33421f, 0.721328f, -0.889354f, -1.7158f, -0.285963f +}; + +static const float av1_simple_motion_search_split_logits_kernel_32[] = { + -0.2745374f, 0.333548f, -0.2437388f, 0.288009f, 0.55635f, + 0.4560176f, 0.2970518f, 0.391192f, 1.311854f, -0.231219f, + -0.2968651f, -1.819984f, 0.2775824f, 0.28929857f, 0.419126f, + -0.32868411f, -0.916399f, -0.1921077f, -0.617489f, 0.637953f +}; + +static const float av1_simple_motion_search_split_logits_bias_32[] = { + 0.208473f +}; + +static const NN_CONFIG av1_simple_motion_search_split_nn_config_32 = { + NUM_FEATURES_32, + NUM_LOGITS_32, + NUM_HIDDEN_LAYERS_32, + { + NUM_LAYER_0_UNITS_32, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_32, + av1_simple_motion_search_split_logits_kernel_32, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_32, + av1_simple_motion_search_split_logits_bias_32, + }, +}; + +#undef NUM_HIDDEN_LAYERS_32 +#undef NUM_FEATURES_32 +#undef NUM_LAYER_0_UNITS_32 +#undef NUM_LOGITS_32 + +#define NUM_HIDDEN_LAYERS_16 1 +#define NUM_FEATURES_16 17 +#define NUM_LAYER_0_UNITS_16 20 +#define NUM_LOGITS_16 1 + +static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_16[] = { + 0.0136957f, 0.182135f, -0.583394f, 0.0556956f, 0.211152f, + 0.168234f, -0.694203f, -0.678216f, 0.289943f, 1.00014f, + -0.0427784f, -0.0427538f, -0.0276009f, -0.00133608f, 0.0901944f, + 0.0674892f, 0.104068f, -0.308582f, -0.43596f, 0.855997f, + -0.223414f, 0.0390026f, 0.366492f, 0.216065f, -0.386863f, + -0.148823f, -0.297022f, 0.0529546f, -0.202885f, 1.26471f, + -0.861163f, -0.0949431f, 0.573627f, -0.00277083f, -0.616063f, + -0.626927f, 0.371583f, -0.411743f, 0.173387f, -0.209734f, + 0.293697f, -0.260714f, 0.442728f, -0.594486f, 1.38987f, + 0.208025f, -0.0433776f, 0.01173f, 0.921766f, -0.168379f, + 0.000697326f, 0.209967f, -0.304577f, 0.149551f, -0.196658f, + 0.389251f, -0.449106f, -0.456329f, 0.669073f, -0.163806f, + 0.083348f, -0.0783998f, 0.0678355f, 0.0510435f, 0.103964f, + 0.104537f, -0.778093f, -1.0641f, -0.626102f, -2.02131f, + 0.159591f, 0.254161f, -0.000362642f, 0.289859f, 0.192713f, + 0.139801f, -0.0251327f, 0.164002f, 1.22892f, -0.0852193f, + 0.0769487f, 0.0296408f, -0.0418688f, 0.0936023f, 0.0448523f, + 0.674015f, -0.0732944f, 0.313575f, -0.593432f, 0.642067f, + -1.06063f, 0.468223f, -0.769085f, -0.173798f, -0.175663f, + 0.692808f, 0.00753295f, -0.123327f, -0.0234937f, -0.0923153f, + 0.0216917f, -0.0690157f, -0.397488f, 0.426628f, 0.264475f, + 0.342074f, -0.139817f, 0.215915f, 0.422544f, -0.321102f, + 0.0355587f, 0.460193f, 0.0315326f, 0.080556f, -0.0256533f, + -0.0857874f, -0.488283f, -0.299653f, -0.245987f, 0.104383f, + 0.203731f, 0.328734f, 0.668104f, -0.586909f, 
-0.501335f, + -0.661292f, -0.359811f, 0.00951363f, 0.816315f, -0.0124104f, + 0.0545827f, 0.089863f, 0.0125486f, 0.043609f, -0.0259544f, + 0.0123911f, 0.12557f, -0.539875f, -0.0556721f, 0.16532f, + 0.265834f, -0.384171f, 0.646496f, 0.366147f, -0.111272f, + 0.262096f, -0.0845724f, 0.382724f, 0.165783f, 0.1025f, + 0.392988f, 0.290525f, 0.038659f, 0.540269f, -0.485586f, + -0.273065f, -0.154052f, -0.0896895f, -0.35394f, 0.193214f, + -0.423728f, 0.654576f, -0.373321f, 0.814914f, 0.026278f, + -0.0328304f, -0.220913f, -0.0442121f, 0.487545f, -0.509537f, + -0.777581f, -1.23886f, 0.223482f, 0.206009f, 0.20391f, + 0.194628f, 0.226762f, 0.171609f, -0.219037f, 0.557892f, + -0.312011f, 1.27709f, 0.064013f, 0.105384f, 0.0493933f, + 0.074059f, -0.0100078f, -0.0176888f, -0.440005f, 0.302922f, + -0.197456f, 0.296128f, -0.326647f, 0.305323f, -0.30696f, + 0.201951f, -0.15874f, -0.793042f, 0.0197254f, 0.0569867f, + -0.0295468f, -0.0215012f, 0.025855f, -0.0196102f, 0.215558f, + -0.253069f, 0.298469f, 0.261269f, 0.435305f, 0.0120354f, + -0.384789f, -0.2772f, 0.0366613f, -0.494994f, 0.149072f, + 1.32981f, -0.427717f, 0.43938f, -0.16375f, -0.444342f, + 0.548214f, 0.127955f, -1.24387f, 0.0863676f, 0.175071f, + 0.172673f, -0.0906204f, 0.444454f, -0.546669f, 0.215857f, + -0.100621f, 0.200699f, -0.0985915f, 0.134706f, -0.256396f, + 0.393427f, 0.119606f, -0.214278f, -0.0183637f, 0.194266f, + -0.238025f, 0.182203f, 0.599718f, 0.846933f, 0.0607852f, + -0.183434f, -0.723743f, -0.72414f, -0.124701f, 0.0227527f, + -0.0664636f, -0.0385867f, -0.0257377f, -0.149054f, 0.12077f, + 0.678029f, -0.624456f, 0.189644f, -0.518604f, 0.134397f, + -0.189777f, -0.309376f, -0.00377086f, 0.701132f, -0.170915f, + 0.00736111f, -0.121906f, 0.329136f, 0.165514f, 0.0328356f, + 0.171275f, 0.248619f, 0.247704f, -0.449933f, 0.0841684f, + 0.136982f, 0.122703f, -0.0169439f, -0.0726496f, 0.302648f, + -0.128556f, 0.0667425f, -0.289717f, -0.207532f, -1.20269f, + -0.68892f, 0.045259f, 0.0973945f, 0.0988314f, -0.944748f, + -0.180401f, 0.134331f, 0.033834f, 0.109023f, 0.265723f, + 0.38063f, -0.106518f, -0.0686953f, 0.3744f, -1.0957f, + 0.0302782f, 0.0515164f, 0.00188222f, 0.0014413f, -0.0404425f, + 0.0124618f, -0.0828645f, 0.506166f, -0.776352f, -0.405138f, + -0.123887f, 0.0732116f, 0.379928f, 0.604524f, -0.492317f, + 0.439191f, 0.0744193f, 0.389101f, 0.0604518f, 0.0943165f, + 0.0339942f, 0.0917975f, 0.0161988f, 0.512227f, 0.538021f, + -0.411495f, 0.307281f, 0.33746f, -0.218639f, 0.265742f, + 0.39738f, -0.12442f, 0.125236f, -0.0845223f, -0.150396f, + 0.0334878f, -0.00391915f, 0.0406864f, -0.0487059f, 0.0377073f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_16[] = { + 0.0535976f, -0.0130279f, 0.150146f, -0.511132f, -0.357698f, + 0.6719f, -1.27877f, -0.0208048f, 0.0961914f, 0.263603f, + 0.704574f, -1.48998f, 0.728063f, 0.941829f, -0.199981f, + 0.797802f, -0.29816f, -0.60894f, -0.116624f, -1.16723f +}; + +static const float av1_simple_motion_search_split_logits_kernel_16[] = { + 0.343153f, -0.2110482f, -0.487199f, 0.3274144f, -2.1975f, + -0.6051438f, 0.1901127f, 0.4741924f, -0.24029f, -0.185018f, + -0.652635f, 2.57714f, -0.31033031f, -0.307222f, 0.329035f, + -0.430181f, 0.3429f, 0.742292f, 0.3269808f, 0.4142165f +}; + +static const float av1_simple_motion_search_split_logits_bias_16[] = { + -0.783658f +}; + +static const NN_CONFIG av1_simple_motion_search_split_nn_config_16 = { + NUM_FEATURES_16, + NUM_LOGITS_16, + NUM_HIDDEN_LAYERS_16, + { + NUM_LAYER_0_UNITS_16, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_16, 
+ av1_simple_motion_search_split_logits_kernel_16, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_16, + av1_simple_motion_search_split_logits_bias_16, + }, +}; + +#undef NUM_HIDDEN_LAYERS_16 +#undef NUM_FEATURES_16 +#undef NUM_LAYER_0_UNITS_16 +#undef NUM_LOGITS_16 + +#define NUM_HIDDEN_LAYERS_8 1 +#define NUM_FEATURES_8 17 +#define NUM_LAYER_0_UNITS_8 20 +#define NUM_LOGITS_8 1 + +static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_8[] = { + 0.079443f, -1.04068f, 0.336819f, -0.20901f, 0.796251f, + 0.181066f, 0.0118876f, -0.207145f, 0.250671f, -0.402119f, + -0.0847227f, 1.88683f, 0.303469f, 0.0718458f, 0.0338589f, + 0.158896f, 0.0540238f, -0.385426f, 0.955925f, 0.424506f, + 0.492584f, -0.795058f, -0.248667f, -0.905349f, -0.316989f, + 0.545471f, 0.63762f, -0.232613f, -0.238947f, -0.395338f, + -0.322673f, -0.0761563f, -0.125357f, 0.0694415f, -0.371599f, + 0.358387f, -0.486841f, 0.403863f, -0.0295666f, 0.283074f, + -0.424396f, 0.156318f, -0.685355f, 0.6663f, 0.337949f, + 0.273198f, 0.517448f, 0.458911f, 0.157252f, 0.692096f, + 0.64965f, -0.23987f, -1.08431f, -0.252475f, -0.332614f, + -0.712291f, -0.380973f, 0.460545f, 0.48936f, 0.337601f, + 0.489223f, 1.65336f, -0.223585f, 0.17367f, -0.235057f, + -0.456773f, 0.327877f, -0.221192f, -0.940151f, -1.06616f, + 0.687084f, -0.109973f, 0.106636f, 0.445895f, 0.163432f, + 0.378306f, 0.201902f, 0.176811f, 0.693082f, 1.62156f, + -0.178346f, 0.455175f, 1.61943f, 0.231376f, 0.0890932f, + -0.889693f, -1.03298f, 0.778196f, -0.0289539f, 0.137848f, + 0.18707f, 0.171889f, 0.119157f, 0.24893f, -0.313628f, + 0.00250735f, -0.0758209f, 0.272974f, -0.229825f, 2.47926f, + -0.0354665f, 0.175366f, 0.0411555f, -1.52149f, -0.0258663f, + 0.253027f, -0.0520839f, -0.0189782f, 0.362387f, -0.371154f, + 0.622929f, 0.0447056f, 0.242529f, -0.168391f, 0.308935f, + -0.117294f, 2.16307f, 0.0673638f, 0.080771f, -0.460779f, + -0.940176f, 0.473266f, -0.0125302f, 0.475145f, -0.218187f, + 0.43258f, -0.0380196f, 0.413607f, -0.110856f, -1.52076f, + 0.0896812f, 0.246636f, -0.0612008f, 0.189583f, 0.0106902f, + -0.158403f, -0.629377f, -0.0634279f, -0.0864584f, -0.226568f, + -0.286234f, -0.0721132f, -0.43702f, 0.113702f, 0.433372f, + 0.743396f, 0.14312f, 0.29914f, 0.801188f, 0.7609f, + 0.385046f, 0.480314f, 0.171119f, -1.59058f, -1.18853f, + 0.150676f, 0.408123f, -0.00677924f, 0.398145f, 0.0914611f, + 0.176945f, 0.0677457f, 0.316478f, 0.998219f, -0.22618f, + 0.0756793f, -0.0156674f, 0.105716f, 0.0496245f, -0.0827133f, + -0.423119f, -0.161033f, 0.212962f, -0.234453f, 0.743366f, + 1.04108f, 0.0597604f, -0.285993f, -0.114829f, -0.557364f, + -0.840051f, 0.326509f, -0.192508f, -0.141769f, 0.370626f, + -0.126353f, 0.00672923f, 0.493623f, -0.852076f, 0.466798f, + -0.226436f, 0.259268f, -0.452662f, 0.0721126f, 0.0198245f, + 0.2048f, 0.02506f, 0.316194f, 0.814651f, 1.01288f, + -0.569607f, -0.0838994f, 1.37146f, -0.613135f, 0.441761f, + -0.643901f, 0.364269f, -0.147177f, 0.338001f, -0.332376f, + 0.518875f, -0.628964f, -0.291889f, -0.050736f, 0.108047f, + 1.05673f, 0.0479492f, 0.466756f, -0.0867334f, -0.0355575f, + 0.57626f, -0.227583f, -0.146421f, 0.0990489f, 0.117351f, + -0.103858f, -0.0336936f, 0.0201903f, -0.0766383f, -0.010211f, + 0.0400779f, 0.0725462f, 0.137142f, 0.478261f, 0.287869f, + 0.0882359f, -0.739754f, -0.853521f, -0.43703f, 0.316856f, + 0.27593f, 0.312149f, 0.175575f, 0.441839f, 0.264325f, + 0.0148051f, -0.005559f, 0.373176f, 0.933701f, -0.0197615f, + 0.0219723f, -0.0559883f, -0.103456f, -0.0323009f, 0.0773202f, + -0.390838f, 0.855488f, -0.596525f, 
-0.249093f, 0.124262f, + 0.220172f, 0.0552478f, 1.04041f, -0.960992f, -0.495255f, + -0.211612f, 0.350007f, -0.238998f, -0.0265068f, 0.384686f, + -0.0815808f, -0.0570019f, 0.123903f, -0.485114f, -0.00282573f, + -0.0649603f, 0.163719f, -0.469479f, -0.439713f, 0.0602562f, + -0.527993f, -0.111458f, 2.48686f, -0.180723f, 0.0553895f, + 0.0560679f, -0.0978928f, -0.216063f, 0.089457f, -1.5602f, + -1.62332f, -0.147388f, 0.736155f, 0.440409f, 0.243519f, + 0.0622638f, 0.522932f, 0.109686f, 0.422849f, 0.510589f, + 1.01116f, 0.174019f, 0.0191171f, -0.0717751f, -0.0068308f, + 0.172932f, -0.834888f, -0.635788f, 0.32012f, 0.298656f, + 0.274309f, -0.155456f, 0.1755f, -0.175171f, 0.343498f, + -0.122832f, -0.107696f, 0.279924f, -0.797633f, -0.344658f, + 0.162669f, 0.389092f, 0.644479f, -0.635216f, -0.181868f, + 0.0579244f, -0.0568976f, 0.433003f, -0.591067f, 0.71013f, + -0.165515f, 0.225725f, -0.358156f, 0.0541944f, 1.95485f, + -0.315223f, 0.61537f, -0.0401568f, 0.22811f, 0.271147f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_8[] = { + 1.63441f, -0.616459f, -0.437775f, -0.71669f, 1.56616f, 2.28109f, 1.64054f, + -1.51476f, 0.0274108f, 0.935156f, -0.966329f, 0.906069f, 1.19954f, -1.25867f, + -1.7376f, -0.594211f, 0.322242f, 0.438631f, -1.01682f, 1.30032f +}; + +static const float av1_simple_motion_search_split_logits_kernel_8[] = { + -0.463187f, 0.2936127f, 0.16762f, -0.1663271f, -0.292418f, + -0.421457f, -0.378265f, 1.053049f, 0.32432879f, -0.49775575f, + 0.427357f, -0.239251f, -0.1631546f, 0.335468f, 0.255371f, + 0.276901f, -0.665683f, -0.7021493f, 0.381513f, -0.1339761f +}; + +static const float av1_simple_motion_search_split_logits_bias_8[] = { + -1.739754f +}; + +static const NN_CONFIG av1_simple_motion_search_split_nn_config_8 = { + NUM_FEATURES_8, + NUM_LOGITS_8, + NUM_HIDDEN_LAYERS_8, + { + NUM_LAYER_0_UNITS_8, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_8, + av1_simple_motion_search_split_logits_kernel_8, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_8, + av1_simple_motion_search_split_logits_bias_8, + }, +}; + +#undef NUM_HIDDEN_LAYERS_8 +#undef NUM_FEATURES_8 +#undef NUM_LAYER_0_UNITS_8 +#undef NUM_LOGITS_8 + +static const NN_CONFIG *const av1_simple_motion_search_split_nn_config[5] = { + &av1_simple_motion_search_split_nn_config_128, + &av1_simple_motion_search_split_nn_config_64, + &av1_simple_motion_search_split_nn_config_32, + &av1_simple_motion_search_split_nn_config_16, + &av1_simple_motion_search_split_nn_config_8, +}; + +// Model based on simple_motion_search for pruning rect +// Thresholds. 
The first index level is aggressiveness, second is frame resolution,
+// third is bsize
+static const float av1_simple_motion_search_prune_rect_thresh[4][3][5] = {
+  // Aggressiveness = 0
+  {
+    // Lowres
+    { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+      0.000961189195907f, 0.0f },
+    // Midres
+    { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+      0.000961189195907f, 0.0f },
+    // Hdres
+    { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+      0.000961189195907f, 0.0f },
+  },
+  // Aggressiveness = 1
+  {
+    // Lowres
+    {
+      0.000000f,
+      0.116076f,
+      0.049759f,
+      0.057747f,
+      0.006001f,
+    },
+    // Midres
+    {
+      0.000000f,
+      0.017380f,
+      0.026077f,
+      0.078111f,
+      0.064477f,
+    },
+    // Hdres
+    {
+      0.002994f,
+      0.103093f,
+      0.076408f,
+      0.010456f,
+      0.187211f,
+    },
+  },
+  // Aggressiveness = 2
+  {
+    // Lowres
+    {
+      0.000000f,
+      0.003111f,
+      0.144294f,
+      0.144884f,
+      0.069924f,
+    },
+    // Midres
+    {
+      0.000000f,
+      0.013696f,
+      0.055203f,
+      0.152271f,
+      0.078886f,
+    },
+    // Hdres
+    {
+      0.030577f,
+      0.082486f,
+      0.040690f,
+      0.140924f,
+      0.067608f,
+    },
+  },
+  // Aggressiveness = 3
+  {
+    // Lowres
+    { 0.0f, 0.352338114654f, 0.171190796972f, 0.322629318068f,
+      0.287219697095f },
+    // Midres
+    { 0.0f, 0.30938393361f, 0.271772875141f, 0.240627957104f,
+      0.178833795641f },
+    // Hdres
+    { 0.285731215187f, 0.37521798723f, 0.142380566244f, 0.338288917819f,
+      0.21329309279f },
+  },
+};
+
+// Mean and std
+static const float av1_simple_motion_search_prune_rect_mean_128[25] = {
+  13.292176f, 13.231236f, 11.098058f, 11.049944f, 10.481336f,
+  10.431587f, 10.789337f, 10.732787f, 10.233817f, 10.173738f,
+  12.214045f, 12.157505f, 11.863353f, 11.802220f, 12.204053f,
+  12.152315f, 11.517566f, 11.465651f, 5.383040f, 0.757934f,
+  4.012611f, 4.052191f, 0.853365f, 3.954503f, 3.944135f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_128[25] = {
+  2.589217f, 2.559396f, 2.268402f, 2.282274f, 3.341234f, 3.341994f, 3.033007f,
+  3.041550f, 3.786247f, 3.784053f, 2.523459f, 2.511275f, 3.349364f, 3.340481f,
+  2.390149f, 2.384226f, 3.599467f, 3.587460f, 2.319911f, 0.428335f, 1.241087f,
+  1.208679f, 0.353742f, 1.228122f, 1.211777f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_64[25] = {
+  11.439831f, 11.382639f, 9.647134f, 9.578121f, 9.146770f,
+  9.084122f, 8.559063f, 8.499496f, 8.095865f, 8.041795f,
+  10.547537f, 10.486240f, 9.362147f, 9.308391f, 10.548071f,
+  10.484358f, 10.002225f, 9.944480f, 4.964504f, 0.897164f,
+  3.306144f, 3.351039f, 0.928582f, 3.319739f, 3.287726f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_64[25] = {
+  2.033404f, 2.050657f, 2.064671f, 2.081519f, 2.916312f, 2.914649f, 3.628949f,
+  3.618760f, 4.011421f, 3.996068f, 2.087127f, 2.103106f, 3.885277f, 3.876166f,
+  2.035599f, 2.052976f, 3.052501f, 3.050985f, 2.232998f, 0.303745f, 1.111161f,
+  1.081292f, 0.257521f, 1.112510f, 1.089404f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_32[25] = {
+  9.862349f, 9.793658f, 8.043962f, 7.954083f, 8.058867f, 7.966165f, 8.046844f,
+  7.956817f, 8.061414f, 7.967906f, 8.966450f, 8.890165f, 8.968315f, 8.891513f,
+  8.953573f, 8.877070f, 8.974275f, 8.895363f, 4.387239f, 0.954143f, 2.701000f,
+  2.751266f, 0.963302f, 2.716584f, 2.709725f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_32[25] = {
+  1.971555f, 1.985517f, 1.935986f, 1.944743f, 1.924122f, 1.932169f, 1.943151f,
+  1.950612f, 1.931156f, 1.938242f, 1.987803f, 1.997670f, 2.000859f, 2.009913f,
+  1.938270f, 1.949277f, 1.922999f, 1.933145f, 1.991504f, 0.209175f,
0.973824f, + 0.952221f, 0.188018f, 0.985295f, 0.946228f, +}; + +static const float av1_simple_motion_search_prune_rect_mean_16[25] = { + 8.391692f, 8.303431f, 6.590342f, 6.459725f, 6.460719f, 6.333274f, 6.592615f, + 6.461661f, 6.464787f, 6.337191f, 7.499753f, 7.395166f, 7.503220f, 7.398344f, + 7.498312f, 7.395039f, 7.353743f, 7.253139f, 3.874267f, 0.979701f, 2.087404f, + 2.131698f, 0.981005f, 2.110868f, 2.106539f, +}; + +static const float av1_simple_motion_search_prune_rect_std_16[25] = { + 1.865867f, 1.870012f, 1.773885f, 1.770447f, 1.972922f, 1.961361f, 1.777224f, + 1.772864f, 1.974519f, 1.962281f, 1.831632f, 1.831837f, 1.837595f, 1.837008f, + 1.822791f, 1.822053f, 2.074991f, 2.067200f, 1.676261f, 0.141022f, 0.840297f, + 0.829935f, 0.136507f, 0.828972f, 0.808563f, +}; + +static const float av1_simple_motion_search_prune_rect_mean_8[25] = { + 6.997798f, 6.867032f, 5.134819f, 4.883330f, 5.134804f, 4.879707f, 5.140518f, + 4.886751f, 5.142186f, 4.885262f, 6.069946f, 5.896944f, 6.080442f, 5.906130f, + 6.077539f, 5.905929f, 6.083087f, 5.909298f, 3.552709f, 0.990654f, 1.497349f, + 1.531762f, 0.989606f, 1.496581f, 1.484139f, +}; + +static const float av1_simple_motion_search_prune_rect_std_8[25] = { + 1.727562f, 1.725050f, 1.633396f, 1.618773f, 1.633586f, 1.620657f, 1.620798f, + 1.604892f, 1.621570f, 1.607439f, 1.691024f, 1.684225f, 1.676065f, 1.668442f, + 1.680016f, 1.672452f, 1.677775f, 1.671586f, 1.451902f, 0.096223f, 0.751190f, + 0.754040f, 0.101419f, 0.738239f, 0.729455f, +}; + +static const float *const av1_simple_motion_search_prune_rect_mean[5] = { + av1_simple_motion_search_prune_rect_mean_128, + av1_simple_motion_search_prune_rect_mean_64, + av1_simple_motion_search_prune_rect_mean_32, + av1_simple_motion_search_prune_rect_mean_16, + av1_simple_motion_search_prune_rect_mean_8, +}; + +static const float *const av1_simple_motion_search_prune_rect_std[5] = { + av1_simple_motion_search_prune_rect_std_128, + av1_simple_motion_search_prune_rect_std_64, + av1_simple_motion_search_prune_rect_std_32, + av1_simple_motion_search_prune_rect_std_16, + av1_simple_motion_search_prune_rect_std_8, +}; + +#define NUM_HIDDEN_LAYERS_128 1 +#define NUM_FEATURES_128 25 +#define NUM_LAYER_0_UNITS_128 8 +#define NUM_LOGITS_128 4 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_128[] = { + -0.129103f, 0.457758f, -0.489986f, 0.65462f, -0.184312f, 3.81202f, + -0.444407f, -0.64198f, -0.575008f, 0.0311711f, 0.525243f, -20.892f, + 1.08811f, -65.0976f, -12.3973f, -1.38278f, -0.264233f, 0.241636f, + -10.6925f, -0.725414f, -18.8987f, -40.2284f, -16.08f, 0.995331f, + 1.47614f, -0.964864f, 0.405506f, 0.140449f, 0.459534f, -1.9093f, + 0.398452f, 0.696949f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_128[] = { + 1.22789f, -1.34527f, 0.759048f, 0.315086f, + 1.0834f, -1.58019f, -0.465158f, 1.20716f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_128[] = { + -0.668677f, 0.58694f, -0.417094f, 0.754735f, -0.7859f, + 0.377479f, -0.0415929f, -0.0140585f, -0.730001f, 0.747528f, + -0.135247f, 0.406505f, -0.234184f, 0.956362f, -0.637555f, + 0.791884f, 0.0303722f, 1.04424f, -0.727859f, -0.274321f, + -0.122986f, 0.066312f, -0.00559175f, -0.239643f, -0.0188767f, + -0.102787f, -0.262967f, 0.071882f, -0.283398f, 0.111607f, + -0.425826f, 0.02699f, 0.108873f, -0.180558f, -0.0794057f, + 0.29665f, -0.0252969f, -0.0266213f, -0.277462f, -0.361973f, + 0.512552f, 0.395011f, -0.225876f, 0.301924f, 0.136954f, + 0.507259f, 1.23425f, 0.0137135f, 0.662572f, 0.591583f, 
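+// Illustrative note: the prune-rect models take 25 features; the
+// *_prune_rect_mean/*_prune_rect_std tables above presumably z-score them
+// per block size, exactly as sketched for the 17-feature split models
+// earlier in this file.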
+ 0.101564f, 0.416805f, -0.645081f, -0.179086f, -0.36747f, + -0.332213f, 0.095177f, 0.220739f, -0.153256f, 0.706155f, + 0.161701f, 0.696815f, -1.21531f, -0.115059f, 0.486764f, + -0.396093f, 0.784883f, 0.535357f, -0.278021f, 0.143496f, + -0.44931f, -0.144543f, 0.319326f, 0.0190167f, -0.206295f, + 0.373995f, -0.247897f, -0.608095f, -0.41796f, -0.137129f, + -0.709562f, 0.678273f, 0.537607f, 0.557474f, 0.453308f, + 0.21405f, -0.0466495f, 0.519139f, -0.168832f, 0.902911f, + 0.681131f, -0.139876f, -0.2052f, -0.393271f, 0.262222f, + -0.246246f, -0.213993f, 0.646619f, 0.0496181f, -0.00354157f, + 0.822927f, 0.0939522f, 0.180738f, 0.118355f, 0.120456f, + -0.0472214f, -0.144958f, 0.173405f, -0.886644f, -0.0949769f, + -0.813518f, -0.3947f, -0.128021f, 0.356196f, 0.469169f, + -0.413702f, 1.04242f, 0.428853f, -0.387293f, 0.0850877f, + 0.279409f, -0.142276f, 0.0579376f, 0.211112f, 0.0703013f, + -1.9274f, -0.729147f, 0.534193f, 0.773586f, 0.922864f, + 0.642881f, 1.15127f, 0.621032f, 0.933942f, 1.01837f, + -0.660282f, -0.40059f, -1.11279f, -0.77088f, -0.43349f, + 0.202361f, -0.0840912f, 0.0935707f, 0.056333f, -0.0779369f, + 0.0173447f, -0.0104756f, 0.0115005f, -0.0195593f, 0.03592f, + -0.343454f, -0.618048f, 0.258172f, -0.412322f, -0.0463746f, + -0.0413654f, -0.0400194f, 0.615981f, -0.452094f, 0.644555f, + 0.0822476f, -0.359791f, -0.0904274f, 0.209427f, 0.0116338f, + -0.190978f, 0.890233f, 0.737769f, -1.66663f, -0.392605f, + 0.0785728f, -0.224553f, -0.128258f, -0.227227f, -0.0777773f, + 0.685976f, 0.347042f, -0.555325f, -0.249221f, 0.0919837f, + -0.0660016f, -0.272316f, 0.0390632f, -0.619624f, -0.0565801f, + 0.585026f, 0.597375f, 0.54114f, 0.593389f, 0.604391f, + 0.0820294f, -0.85339f, -1.40741f, -0.391675f, 0.0579205f, + -0.197626f, 0.130044f, -0.234488f, -0.0373991f, -0.0717973f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_128[] = { + 1.58571f, -4.6314f, -2.00273f, 0.543699f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_128 = { + NUM_FEATURES_128, + NUM_LOGITS_128, + NUM_HIDDEN_LAYERS_128, + { + NUM_LAYER_0_UNITS_128, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_128, + av1_simple_motion_search_prune_rect_logits_kernel_128, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_128, + av1_simple_motion_search_prune_rect_logits_bias_128, + }, +}; + +#undef NUM_HIDDEN_LAYERS_128 +#undef NUM_FEATURES_128 +#undef NUM_LAYER_0_UNITS_128 +#undef NUM_LOGITS_128 + +#define NUM_HIDDEN_LAYERS_64 1 +#define NUM_FEATURES_64 25 +#define NUM_LAYER_0_UNITS_64 32 +#define NUM_LOGITS_64 10 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_64[] = { + 0.10424f, -0.346025f, 0.534547f, -0.385925f, 2.58341f, -0.256414f, + -0.232498f, 0.329823f, -0.0777376f, -0.590939f, 0.062657f, -0.628252f, + 0.0934588f, 2.04029f, -0.224448f, 0.371168f, -0.385348f, -0.589883f, + -3.73627f, -0.943144f, 0.346409f, -0.211215f, -0.351008f, 0.418807f, + 0.943663f, 0.173267f, 1.16585f, -0.0840888f, 0.227464f, 0.374412f, + 0.0422597f, -0.338868f, 0.222576f, 0.431713f, 1.12366f, 0.00753411f, + 0.248412f, -0.0902425f, 0.542455f, -0.665629f, -0.311245f, -0.205639f, + -0.447149f, -0.0502733f, -0.290186f, -0.794384f, 0.0940881f, -0.0686117f, + -0.0199961f, -0.587965f, 0.777096f, -0.083381f, -1.21282f, 0.652959f, + -1.18238f, 0.539991f, 0.352497f, -0.540076f, -0.26222f, -0.568556f, + 0.409102f, -0.131146f, -0.407161f, -0.188287f, -0.478657f, 0.000401932f, + -0.689324f, 0.351064f, -1.43704f, -0.315185f, -0.868726f, 0.376341f, + -0.0566277f, 0.364831f, 
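+// Illustrative note: unlike the split models (one logit), the prune-rect
+// models emit several logits (4 for 128, 10 for 64 and 32 below). A
+// plausible sketch: softmax the logits into probabilities and prune any
+// rectangular partition candidate whose probability falls below the matching
+// entry of av1_simple_motion_search_prune_rect_thresh (helper name
+// hypothetical; the exact pruning rule is not shown in this header):
+//
+//   #include <math.h>
+//   static void smss_softmax(const float *logits, int n, float *prob) {
+//     float mx = logits[0], sum = 0.0f;
+//     for (int i = 1; i < n; ++i) mx = logits[i] > mx ? logits[i] : mx;
+//     for (int i = 0; i < n; ++i) {
+//       prob[i] = expf(logits[i] - mx);  // shift by max for stability
+//       sum += prob[i];
+//     }
+//     for (int i = 0; i < n; ++i) prob[i] /= sum;
+//   }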
0.611298f, -0.495253f, -0.0193132f, 0.617978f, + 0.189586f, -0.236758f, -0.608246f, -0.149017f, -1.78303f, 0.143023f, + 0.698386f, -0.994086f, -0.673327f, 0.233868f, 0.360425f, 0.0294123f, + -0.248683f, -0.148392f, 0.0861829f, -0.190843f, -0.414906f, 0.607378f, + -0.756715f, -0.511713f, -0.321556f, 1.0078f, -1.18141f, 0.519751f, + 0.834629f, -0.359343f, 0.612262f, -0.0730553f, 0.262935f, 0.488276f, + 0.387071f, -1.44123f, 1.08269f, 0.554402f, -0.069f, 0.14113f, + 0.323817f, 0.824314f, -0.431417f, -0.349448f, 0.950728f, -0.587836f, + -0.83914f, -0.10844f, 0.26602f, 0.831933f, -0.271315f, 0.231563f, + 0.417049f, 0.190627f, -0.0940667f, 0.255363f, -0.0741022f, -0.0987662f, + -0.847522f, 0.00287554f, 0.0615741f, -0.0832218f, 0.0847148f, -0.392843f, + -0.938068f, -0.10621f, -0.260859f, -0.825175f, -0.401039f, 0.315213f, + -0.108269f, 0.288036f, -8.66166f, -0.970752f, -0.66678f, -0.593405f, + -0.518294f, -0.138722f, -0.454698f, -0.22969f, -0.553006f, -0.440111f, + 0.462661f, -0.536854f, 0.0108295f, -0.522888f, 0.00111157f, 0.229999f, + 0.0267768f, 0.176266f, -1.57043f, 0.0318106f, 0.257534f, -0.198583f, + 0.175564f, -0.251465f, -0.262441f, -1.65283f, -0.319603f, -0.875282f, + -0.301303f, 0.0170948f, -0.227075f, 0.0299545f, -4.98346f, 0.470046f, + -1.28051f, -0.213809f, -0.486585f, -0.906463f, -0.169984f, -0.333153f, + -0.376733f, 0.108016f, 0.486744f, -0.186936f, -0.429259f, 0.056501f, + -0.266545f, 0.265447f, -0.137718f, -0.490687f, -0.935668f, -0.16229f, + -0.696932f, 0.173157f, 0.434959f, -0.140595f, 0.345845f, -1.08013f, + -0.0205929f, -0.815874f, -0.179812f, 0.02767f, -0.141727f, 0.471936f, + -7.29453f, -1.04362f, -0.745482f, -0.28725f, -0.214997f, -0.0850651f, + -0.748471f, 0.161325f, -1.04387f, -0.705305f, 0.489427f, -0.765373f, + -0.301576f, 0.0742467f, -0.331282f, 0.0372328f, -0.90298f, -0.0608646f, + -2.18756f, 0.170384f, -0.258357f, 0.106287f, -0.161684f, -0.103799f, + -0.127774f, -0.156313f, 0.0705286f, -0.977908f, -0.281191f, -0.056757f, + -0.309474f, 0.050476f, -9.78198f, -2.42795f, -0.289626f, -1.07579f, + -0.439256f, -1.09948f, -0.564671f, 0.0913182f, -0.417216f, -1.19909f, + 0.287063f, 0.402315f, -0.17646f, 0.540488f, 0.00840239f, 0.397492f, + 0.702393f, -0.10566f, 0.655296f, -0.0443876f, 0.154918f, -0.760479f, + -0.0523153f, -0.366199f, -1.08212f, -0.398556f, -0.415203f, -1.10488f, + 0.208349f, 0.27079f, 0.101546f, -0.205752f, -13.7923f, -0.218637f, + -1.10077f, 0.355735f, -0.306196f, 0.627434f, -0.473101f, -0.308027f, + -1.12724f, 0.301597f, 0.660785f, 0.0576217f, -0.155925f, -0.56107f, + -0.223537f, 0.114299f, -0.53803f, -0.252674f, -2.66103f, -0.185245f, + -0.314673f, 0.403337f, 0.679821f, -0.69231f, 0.506264f, -0.999705f, + -0.549097f, 0.353745f, 0.188249f, 0.414484f, -0.615853f, 0.525681f, + -5.23065f, -3.05174f, 1.02074f, -0.965499f, -0.158947f, 0.0436088f, + -0.485824f, 0.0375094f, -1.39985f, -0.481392f, 0.485785f, -0.24874f, + -0.359633f, 0.668108f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_64[] = { + 0.0735592f, -0.045064f, -0.0114103f, 1.39246f, -0.683467f, 0.155765f, + -0.667652f, -0.202425f, -0.585433f, -0.146752f, -0.0812931f, 0.580642f, + 0.578542f, -0.831916f, 0.610063f, 0.0101856f, -0.235863f, 0.538141f, + -2.91334f, -1.71887f, 0.126616f, 0.582497f, -0.438879f, 0.221833f, + 0.850773f, -0.280886f, 0.443233f, -0.0964873f, -0.216161f, 0.34413f, + 0.656818f, 0.0169274f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_64[] = { + -0.310947f, -0.232675f, 0.0171092f, 0.0834474f, 0.373977f, + 0.300429f, 
0.215072f, -0.454074f, 0.187565f, 0.282742f, + 0.562562f, -0.0419322f, 0.000978486f, -0.298267f, 0.216934f, + -0.388722f, -0.146866f, -0.275946f, 0.202361f, 0.225847f, + 1.42868f, 0.473127f, -0.145747f, -0.104986f, 0.153459f, + 0.69382f, 0.162266f, 0.0207715f, -0.45095f, -0.412071f, + -0.235109f, -0.130199f, 0.231741f, 0.460193f, 0.0378202f, + 0.429516f, 0.387691f, -0.272479f, 0.0723884f, -0.453914f, + -0.150618f, -0.10745f, -0.258615f, 0.0838312f, -0.00554958f, + 0.105377f, -0.0415479f, 0.13228f, 1.09044f, -0.73053f, + -0.422553f, -0.435842f, 0.211416f, 0.420332f, 0.0181353f, + -0.030891f, 0.522788f, 0.613526f, 0.374032f, 0.287986f, + -0.403118f, -0.287362f, -1.11523f, -0.577713f, -0.020228f, + 0.86465f, -0.0590579f, 0.341274f, -0.0115644f, -0.260236f, + 0.192123f, -0.0849825f, 0.0501709f, 0.444382f, 0.0762727f, + 0.0926596f, -0.101157f, -0.142787f, 0.40861f, 0.555805f, + -0.00614654f, -0.122846f, 0.203163f, 0.234266f, 0.409795f, + -0.0206245f, -0.224679f, 0.025081f, 0.518044f, -0.287186f, + 0.016494f, -0.0886331f, 0.236438f, -1.01032f, 0.118332f, + 0.364217f, 0.061438f, 0.0381303f, 0.128418f, 0.0257077f, + -0.975751f, -0.694894f, 0.00351914f, 0.278179f, 0.29363f, + 0.525576f, 0.0604849f, 0.531734f, 0.406643f, 0.812497f, + -0.403196f, -0.16664f, -0.620887f, -0.428194f, 0.275401f, + 0.432063f, -0.00378342f, 0.295758f, 0.105615f, -0.00683626f, + 0.00396146f, 0.00598654f, -0.0131701f, -0.0115787f, 0.00386643f, + -0.69686f, -0.139623f, -0.440817f, 0.0542873f, 0.217962f, + 0.527035f, -0.0201046f, 0.0471354f, 0.0271858f, -0.0775197f, + -0.309797f, 0.184879f, -0.232854f, -0.407081f, 0.706227f, + -0.0877534f, 0.306843f, 0.455075f, -0.333961f, 0.0759148f, + 0.0444791f, -0.0693626f, -0.0850289f, -0.513063f, -0.643971f, + -0.630279f, -0.153889f, 0.123315f, 0.00548238f, 0.170707f, + 0.734339f, -0.176988f, 0.322519f, 0.178365f, 0.183519f, + -0.698683f, -0.12043f, -0.349914f, -0.0696762f, -0.53986f, + -0.104738f, 1.05264f, 0.983568f, -0.109035f, 0.0113748f, + 0.0815189f, -0.0628812f, 0.0769389f, 0.010261f, 0.146573f, + -0.433194f, -0.211572f, -0.000397392f, 0.445325f, 0.145091f, + -0.0625902f, 0.29394f, 0.302315f, 0.0892226f, -0.209504f, + -0.0150374f, 0.242608f, 0.216223f, 0.366857f, 0.209829f, + -0.540035f, 0.117599f, -0.329315f, 0.0471133f, -0.0115449f, + -0.0638235f, 0.0527461f, 0.348149f, 0.360802f, 1.06624f, + -0.615991f, -0.341396f, 0.18972f, 0.0709888f, -0.0414466f, + -0.0193809f, 0.0938933f, 0.209058f, 0.575042f, 0.483608f, + -0.285875f, -0.115905f, -0.363637f, 0.375425f, 0.336217f, + 0.0336358f, -0.00265618f, -0.406854f, -0.792959f, -0.219354f, + 0.0331615f, 0.0298859f, -0.211446f, -0.00280773f, -0.194011f, + 0.262109f, 0.548076f, 0.120183f, -0.661603f, 0.241855f, + -0.501428f, 0.00102718f, -0.347331f, -0.58306f, 0.0977254f, + 0.117491f, 0.0840667f, 0.00693675f, 0.000600294f, 0.649569f, + -0.0553811f, -0.197198f, 0.397236f, -0.523737f, -0.564192f, + -0.374679f, -0.249344f, 0.00861428f, 0.00393439f, -0.0834608f, + 0.124389f, -0.0393049f, 0.0425391f, -0.153383f, -0.182346f, + 0.420953f, 0.464221f, 0.288984f, 0.570921f, -0.239965f, + 0.247239f, -0.083434f, 0.714418f, 0.986323f, -0.460244f, + -0.260993f, -0.947743f, -1.0789f, -0.0391231f, 0.612407f, + -0.0306767f, 0.281419f, 0.0072426f, -0.37623f, 0.188744f, + 0.221666f, -0.424914f, 0.29703f, 0.261715f, 0.277809f, + -0.0617616f, -0.000611999f, -0.0547053f, -0.0901018f, -0.347669f, + 0.856072f, 0.596675f, -0.467639f, -1.09324f, -0.184224f, + -0.56051f, -0.0144704f, 0.102894f, -0.122982f, -0.0020749f, + -0.0423487f, 0.0328702f, -0.0154263f, 
0.0349021f, -0.00315595f, + 0.0254802f, -0.729191f, 0.207296f, -0.0212349f, -0.207078f, + 0.20636f, -0.156883f, 0.429765f, -0.42672f, 0.138775f, + -0.0267343f, 0.631528f, 0.300646f, -0.4793f, -0.273833f, + -0.0135367f, -0.530819f, -0.534881f, 0.830896f, 0.0266992f, + 0.473744f, 0.210334f, 0.0234739f, 0.255394f, 0.123531f, + -0.489341f, -0.796627f, 0.372617f, 0.190136f, 0.275342f, + 0.739505f, 0.402354f, 0.782806f, 0.437374f, 1.04948f, + -0.55963f, 0.382704f, -0.698321f, 0.0817868f, -0.440108f, + -0.0635004f, -0.277851f, -0.524194f, 0.286157f, -0.01097f, + -0.0293145f, -0.0405071f, -0.035662f, -0.012871f, -0.0516409f, + -0.406671f, 0.709259f, -0.525177f, 0.521123f, -0.44813f, + 0.48412f, -0.0546513f, 0.305253f, -0.468328f, 0.316453f, + -0.36307f, 0.497515f, -0.0606276f, 0.315764f, -0.422066f, + 0.554025f, -0.679183f, 0.616914f, 0.00283324f, -0.000643824f, + 0.0639999f, 0.0488285f, -0.141031f, 0.068003f, -0.0792678f, + -0.425307f, -0.152235f, 0.269917f, -0.352327f, 0.44792f, + -0.116514f, -0.465868f, 0.154287f, 0.0161028f, -0.16848f, + -0.255487f, 0.189832f, 0.254883f, 0.0240822f, 0.432638f, + -0.136564f, 0.137036f, 0.0375734f, 0.989246f, -0.126287f, + 0.111416f, -0.0271002f, 0.718755f, -0.0412969f, 0.00645681f, + 0.253811f, -0.0186998f, 0.691971f, -0.282042f, -0.0783915f, + 0.274592f, -0.358449f, 0.34155f, -0.186374f, -0.136907f, + -0.192334f, -0.251168f, -0.100874f, -0.166578f, -0.336507f, + 0.402373f, 0.173695f, 0.108788f, 0.00885581f, -0.310063f, + 1.05545f, 0.0295867f, 0.180785f, -0.173469f, -0.469924f, + -0.224155f, 0.665862f, -0.126546f, 0.240691f, -0.0415301f, + -0.598534f, 0.0012723f, -0.122297f, -0.558947f, 0.268844f, + 0.241193f, 0.0524422f, -0.1683f, 0.575588f, -0.139012f, + 0.0636691f, -0.446709f, -0.094532f, 0.883809f, -0.112981f, + -0.224047f, 0.0811193f, -0.140571f, -0.09683f, -0.0796143f, + -0.102246f, -0.863392f, -0.0755124f, 0.23125f, -0.0301361f, + -0.153029f, -0.172238f, -0.0286382f, -0.338495f, -0.317216f, + -0.146629f, -0.242264f, -0.702306f, -0.285052f, 0.0623479f, + 0.265735f, 0.00674475f, 0.666196f, 0.883586f, 0.278416f, + -0.341692f, -0.509931f, -0.156263f, 0.635885f, -0.544143f, + -0.572632f, -0.213285f, 0.443396f, -0.268329f, 0.0638439f, + -0.185397f, 0.071126f, 0.386503f, -0.402212f, -0.140784f, + -0.411661f, 0.049398f, -0.0672907f, -0.267034f, -0.0560875f, + 0.0607937f, 0.0445484f, -0.547651f, 0.574718f, 0.417189f, + -0.0610166f, 0.0632293f, 0.391619f, -0.00671215f, -0.136883f, + -0.339346f, 0.0356183f, 0.511993f, 0.178676f, 0.286998f, + 0.136511f, -0.00796929f, 0.203985f, 0.0423532f, -0.175196f, + 0.378534f, 0.770417f, 0.593778f, 0.0256067f, -0.82394f, + -0.500691f, -0.425725f, -0.623708f, -0.0406241f, -0.00226464f, + 0.0207836f, 0.30732f, -0.00784268f, 0.0065445f, -0.0991039f, + -0.20871f, -0.206835f, 0.281219f, 0.119361f, 0.259346f, + -0.102713f, 0.186488f, -0.034455f, -0.00198392f, -0.279107f, + -0.638993f, -0.374404f, -0.48601f, -0.262345f, 0.624532f, + 0.620632f, -0.227014f, 0.433579f, -0.0455096f, 1.22123f, + -0.429156f, 0.12396f, 0.0815152f, -0.0837355f, 0.0282623f, + -0.407475f, 0.787321f, -0.434974f, 0.312904f, -0.230805f, + 0.213042f, -0.250929f, 0.302997f, -0.354709f, 0.0504905f, + -0.561706f, 0.595558f, 0.374951f, 0.802969f, -0.674902f, + 0.33136f, 0.156606f, 0.0218968f, -0.694188f, -0.0221949f, + -0.00639123f, 0.0146536f, 0.0104145f, 0.021635f, -0.0499428f, + -0.575116f, -0.239035f, -0.0588276f, 0.599722f, 0.541932f, + 0.437433f, 0.716268f, 0.193207f, 0.548351f, 0.326951f, + -0.197124f, 0.0355353f, -0.0952009f, -0.217265f, -0.389789f, + 
0.0528124f, -0.21334f, -0.190296f, -1.17367f, 0.108905f, + 0.109397f, -0.0192577f, 0.0343813f, 0.085004f, -0.0556737f, + -0.0411158f, -0.534989f, 0.0361896f, 0.124415f, 0.291603f, + -0.0311974f, -0.326726f, 0.343131f, 0.0276456f, -0.231827f, + -0.373894f, -0.208898f, -0.273011f, 0.061323f, -0.0910538f, + -0.30746f, -0.108644f, -0.190736f, 1.58048f, -0.0739711f, + -0.0623489f, -0.137967f, -0.0601359f, -0.133004f, -0.0857153f, + 0.00955987f, -0.365561f, -0.0329051f, 0.463463f, 0.14758f, + -0.512256f, -0.227463f, -0.26008f, -0.567777f, 0.0646234f, + 1.02161f, 0.66157f, -0.16733f, 0.264921f, -0.242036f, + 0.214622f, 0.0712054f, -0.260377f, 0.0849665f, 0.735094f, + 0.11001f, 0.297301f, -0.333342f, 0.066978f, -0.123625f, + 1.07596f, 0.401263f, 0.0800875f, -0.340862f, -0.115587f, + -0.32692f, -0.300842f, 0.0277397f, 0.0630788f, -0.261198f, + 0.428695f, -0.0544757f, -0.124511f, 0.036992f, 0.126322f, + 0.0317603f, 0.0820762f, 0.117277f, -1.14594f, -0.108076f, + -0.0258198f, -0.00337525f, -0.00512531f, 0.1274f, -0.0660535f, + -0.640733f, 0.197142f, 0.147278f, 0.489271f, 0.226507f, + -0.0668414f, 0.0946318f, 0.0994164f, -0.820516f, 0.512939f, + -0.305172f, -0.715187f, -0.195125f, 0.279346f, 0.462144f, + 0.913882f, -0.453879f, 0.0582033f, -0.462866f, 0.0538736f, + 0.0115737f, 0.00626993f, -0.0185185f, 0.0114601f, -0.0181164f, + 0.41588f, -0.0447331f, 0.611756f, 0.43385f, 0.834465f, + 0.122019f, -0.352983f, 0.340429f, -0.245425f, -0.365328f, + -0.521825f, 0.0371057f, 0.172188f, -0.387949f, 0.221054f, + 0.0126359f, 0.422958f, 0.584198f, -0.581498f, -0.019466f, + -0.0271737f, -0.0740885f, 0.00540879f, 0.186086f, -0.0324402f, + -0.563462f, -0.458759f, -0.425296f, -0.0118862f, -0.641508f, + 0.0132084f, 0.0581128f, 0.0231444f, 0.468587f, 0.258838f, + 0.0296665f, 0.0562801f, 0.630014f, 0.381816f, -0.269761f, + -0.135515f, 0.046186f, 1.07632f, -0.050616f, 0.104987f, + 0.29991f, 0.119316f, 0.117248f, 0.0795009f, 0.242573f, + 0.0416634f, -0.0577639f, -0.0974078f, 0.106255f, -0.13098f, + 0.0141486f, -0.00418257f, 0.144848f, -0.463934f, 0.0452591f, + 0.252617f, 0.205222f, -0.189843f, 0.0652245f, -0.135386f, + 0.0500646f, -0.200368f, -0.0142312f, -0.0286832f, -0.254355f, + -1.02752f, -0.73549f, 0.0364518f, 0.0416227f, -0.13185f, + -0.0886515f, -0.502314f, -0.102916f, 0.410911f, -0.355655f, + 0.400416f, -0.340217f, 0.208829f, 0.245972f, 0.149739f, + -0.49458f, 0.589482f, 0.550827f, 0.912709f, -0.351275f, + -0.128076f, -0.285172f, -0.672752f, 0.090583f, -0.245286f, + -0.737297f, -0.201515f, -0.025122f, -0.109854f, 0.36738f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_64[] = { + 0.346819f, 0.442965f, -0.0216032f, 0.0229235f, -0.402797f, + -0.666074f, -0.455388f, -0.00353411f, -0.595511f, -0.845667f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_64 = { + NUM_FEATURES_64, + NUM_LOGITS_64, + NUM_HIDDEN_LAYERS_64, + { + NUM_LAYER_0_UNITS_64, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_64, + av1_simple_motion_search_prune_rect_logits_kernel_64, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_64, + av1_simple_motion_search_prune_rect_logits_bias_64, + }, +}; + +#undef NUM_HIDDEN_LAYERS_64 +#undef NUM_FEATURES_64 +#undef NUM_LAYER_0_UNITS_64 +#undef NUM_LOGITS_64 + +#define NUM_HIDDEN_LAYERS_32 1 +#define NUM_FEATURES_32 25 +#define NUM_LAYER_0_UNITS_32 28 +#define NUM_LOGITS_32 10 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_32[] = { + 0.486581f, 0.340847f, -0.109226f, 0.467224f, -0.541561f, + 0.0943619f, 
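+    // The NN_CONFIG initializers in this file follow the field order declared
+    // in av1/encoder/ml.h: num_inputs, num_outputs, num_hidden_layers, the
+    // per-hidden-layer unit counts, then per-layer weight and bias pointers.
+    // Below is a minimal forward-pass sketch for these one-hidden-layer
+    // tables, assuming the node-major weight layout that av1_nn_predict() in
+    // av1/encoder/ml.c uses (hidden layer: dense + ReLU; output layer: dense,
+    // logits left unnormalized). nn_forward_sketch is a hypothetical helper
+    // for illustration, not a function in this tree:
+    //
+    //   static void nn_forward_sketch(const float *in, const NN_CONFIG *cfg,
+    //                                 float *logits) {
+    //     float hidden[128];  // large enough for every layer-0 size here
+    //     const int nh = cfg->num_hidden_nodes[0];
+    //     for (int j = 0; j < nh; ++j) {
+    //       float v = cfg->bias[0][j];
+    //       for (int i = 0; i < cfg->num_inputs; ++i)
+    //         v += cfg->weights[0][j * cfg->num_inputs + i] * in[i];
+    //       hidden[j] = v > 0.0f ? v : 0.0f;  // ReLU
+    //     }
+    //     for (int k = 0; k < cfg->num_outputs; ++k) {
+    //       float v = cfg->bias[1][k];
+    //       for (int i = 0; i < nh; ++i)
+    //         v += cfg->weights[1][k * nh + i] * hidden[i];
+    //       logits[k] = v;
+    //     }
+    //   }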
-0.429442f, -0.207442f, 0.959963f, 0.618666f, + -0.0636751f, 0.144508f, -0.0278289f, 0.332293f, -0.751493f, + 0.245438f, -0.917758f, 0.612128f, -0.32648f, 0.534618f, + -0.615239f, 2.71641f, 0.233759f, 0.820558f, -0.249758f, + -0.427783f, -0.359361f, 0.0375732f, 0.806973f, 0.352512f, + -0.0532192f, 0.0576861f, -0.464178f, -0.334877f, -0.697042f, + 0.0538218f, 0.0919659f, -0.00765812f, 0.0603847f, -0.460315f, + 0.37979f, -0.0867612f, -0.670683f, -0.188619f, -0.570586f, + 0.233418f, 0.153581f, 0.290905f, -0.624885f, -0.557842f, + -0.555567f, 0.463773f, -0.123909f, -0.277731f, 0.0374468f, + 0.409903f, 0.287638f, -0.593066f, -0.223434f, 0.154263f, + -0.250464f, -0.077696f, 0.229652f, -0.304174f, 0.308053f, + 0.33155f, -0.502825f, 0.361216f, -0.499294f, 0.00595444f, + -0.307201f, 0.5766f, -0.438384f, -0.093701f, -0.118586f, + 0.202337f, -0.486623f, 0.261552f, 0.139756f, -0.655642f, + -0.0627001f, -0.213053f, -0.243037f, 0.205918f, 0.0718368f, + 0.188041f, 0.141529f, -0.132239f, 0.425827f, -0.218353f, + 0.153114f, 0.33268f, 0.0226116f, 0.167394f, 0.269854f, + -0.457001f, 0.1973f, -0.526087f, 0.467528f, 0.290934f, + 1.16267f, 0.0823663f, -0.754389f, -0.83716f, 0.270157f, + -1.41229f, 0.148511f, -0.286832f, 0.664796f, 0.492254f, + 0.360567f, -0.533993f, 0.0435672f, -0.103001f, 0.220668f, + 0.594621f, -0.0213356f, -0.347638f, -0.694457f, 0.0759505f, + 0.161358f, -0.389384f, -0.0455192f, -0.61252f, -0.174173f, + -0.00788878f, -1.22487f, 0.332233f, -0.0457021f, -0.225918f, + -0.197657f, -0.115408f, -0.240589f, -2.05681f, 0.00914629f, + -1.92213f, 0.0268578f, -0.49076f, -0.0120123f, 0.291157f, + 0.267116f, -0.0775724f, 0.181115f, -0.392441f, -0.488114f, + -0.28842f, -0.115465f, 0.128974f, -0.0829899f, -0.14096f, + -0.140145f, -0.700281f, 0.0368945f, -0.437598f, 0.243485f, + -1.00301f, 0.332324f, 0.125014f, -0.0604481f, -0.0652028f, + -0.207295f, -1.0209f, -0.341525f, 0.191326f, -0.147578f, + 0.0878327f, 0.129827f, -0.0848319f, 0.187381f, -1.28663f, + 0.00537885f, -0.134277f, -0.0411126f, -0.3434f, -0.0456494f, + 0.37861f, 0.409095f, 0.237177f, -0.396855f, -0.205418f, + -1.31701f, -0.319032f, -0.123404f, -0.240005f, -0.305206f, + -0.0258176f, -0.26367f, -0.142396f, 0.191672f, -1.44061f, + 0.0554776f, -0.571839f, -0.284789f, -0.425677f, -0.0307376f, + 0.20275f, -0.223146f, 0.144612f, 0.0212636f, 0.0238303f, + -0.253802f, -0.188922f, -0.0637066f, -0.340836f, 0.124774f, + 0.130474f, -0.154099f, -0.0292733f, 0.158148f, -0.246989f, + -0.259059f, 0.220224f, 0.228449f, -0.41956f, -0.321848f, + -0.2396f, -0.316449f, -1.3363f, 0.0264099f, -1.46865f, + 0.113073f, 0.0722885f, -0.166986f, -0.164877f, 0.0360911f, + 0.534472f, -0.551152f, -0.328501f, 0.0781121f, -0.378112f, + -0.459502f, 0.28015f, -0.212302f, -0.521641f, 0.618993f, + -0.347709f, 0.266253f, -0.0280894f, 0.348511f, -0.0155031f, + -0.100693f, 0.0447673f, 0.277519f, -0.233998f, -0.0796738f, + -1.73644f, -0.160776f, 0.53092f, -0.180406f, 0.056447f, + 0.385356f, -0.262337f, -0.241479f, -0.271426f, -0.457354f, + -0.266788f, 0.367371f, -0.103065f, 0.47783f, -0.188327f, + -0.159636f, 0.00142907f, -0.409756f, 0.454889f, -0.24566f, + -0.0760084f, 0.286355f, 0.462102f, 0.0431695f, -0.127395f, + -0.200476f, -0.350557f, 0.217275f, -0.23975f, 0.255148f, + -0.280626f, 0.42476f, 0.157411f, 0.0358675f, -0.192591f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_32[] = { + 0.940498f, 0.15602f, -0.234831f, 0.0268585f, 0.144769f, 0.243081f, + 0.611406f, 0.366093f, 0.361868f, 0.39668f, 0.401479f, 0.369467f, + 0.0909503f, 0.710595f, 0.032786f, 
0.525891f, -1.0232f, 0.732557f, + -0.064425f, 0.865222f, -0.042917f, -0.237191f, -0.527006f, -0.0172101f, + 0.59681f, -0.472405f, 0.0969218f, -0.250624f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_32[] = { + 0.355607f, 0.126701f, -0.0825159f, 0.200675f, -0.011308f, + -0.280057f, 0.559816f, 0.142689f, 0.0422419f, -0.151692f, + -0.0275637f, -0.283101f, -0.20822f, -0.200394f, 0.465427f, + 0.344491f, -0.525319f, -0.358813f, -0.39767f, 0.0974486f, + 0.00559058f, -0.00546089f, 0.0506486f, 0.114475f, -0.0436463f, + -0.574152f, -0.376294f, 0.16563f, -0.0967032f, 0.00579838f, + 0.0639909f, -0.037129f, 0.407574f, -0.231428f, 0.489326f, + -0.221566f, -0.270382f, -0.784628f, -0.155502f, 0.481698f, + -0.0296057f, 0.431855f, 0.840807f, 0.112291f, 0.773874f, + -0.0610936f, -0.012892f, 0.365154f, 0.0267687f, -0.0751114f, + 0.25043f, 0.516472f, -0.186133f, -0.12762f, -0.168804f, + -0.146309f, 0.139314f, -0.367113f, -0.601079f, 0.0559856f, + 0.176081f, 0.22397f, 0.434113f, 0.0363256f, 0.313051f, + 0.0143976f, 0.190076f, 0.474607f, -0.681134f, -0.0709097f, + -0.253289f, -0.216277f, -0.0593789f, -0.107795f, -0.194842f, + 0.513945f, 0.239171f, -0.720561f, 0.0136723f, -0.391147f, + -0.272043f, -0.164766f, 0.124248f, 0.147178f, -0.35497f, + 0.397725f, -0.117603f, 0.262937f, -0.331964f, 0.182418f, + 0.315671f, -0.0385649f, 0.488769f, -0.334568f, 0.00596018f, + 0.0661557f, -0.0446985f, -0.0928255f, -0.0221032f, -0.019045f, + -0.20881f, 0.197907f, -0.381881f, 0.0598071f, -0.0434551f, + 0.159283f, -0.110631f, 0.266996f, -0.0265494f, 0.135199f, + -0.00833162f, 0.804482f, -0.114698f, -0.15066f, -0.479553f, + 0.448407f, -0.344069f, -0.0280952f, -0.208211f, -0.102269f, + -0.679066f, -0.37476f, -0.0228875f, 0.0535049f, 0.111015f, + -0.18125f, -0.167584f, 0.0110497f, 0.262723f, -0.413839f, + -0.0611238f, 0.358499f, 0.0807514f, 0.208254f, 0.214499f, + 0.11137f, -0.14262f, -0.0513973f, 0.243718f, -0.373716f, + -0.00413366f, 0.216501f, -0.164149f, -0.064935f, -0.0840282f, + 0.0566148f, 0.0377686f, 0.289835f, 0.769388f, 0.891198f, + -0.592739f, 0.40744f, -0.153095f, 0.657311f, 0.140737f, + 0.28209f, 0.158344f, 0.353546f, 0.0868246f, 0.116887f, + 0.402004f, 0.437184f, 0.589219f, 0.760594f, -0.575419f, + -0.754308f, -0.709219f, -0.297814f, -0.418609f, -0.0262104f, + 0.0411959f, 0.0597708f, -0.143728f, -0.136642f, 0.099614f, + -0.257601f, -0.2404f, 0.305893f, 0.254009f, -0.0301398f, + -0.0653091f, -0.459002f, -0.163404f, 0.123152f, -0.0284252f, + -0.457272f, 0.00788622f, -0.828399f, -0.0534199f, 0.586877f, + 0.982728f, 0.424581f, 0.0891856f, 0.383182f, -0.122053f, + 0.0808408f, -0.00384914f, -0.0560201f, -0.0524772f, -0.263444f, + -0.239287f, -0.882777f, 0.0180592f, -0.0948711f, -0.177946f, + 0.0296473f, 0.096082f, 0.0455604f, -0.108608f, 0.00777951f, + -0.140896f, 0.117187f, -0.342467f, -0.0691604f, 0.0761611f, + -0.0892053f, 0.111386f, -0.167456f, 1.40616f, -0.00478793f, + 0.00547665f, -0.0441829f, 0.0151323f, -0.0674099f, -0.0380578f, + 0.16072f, 0.31882f, 0.245486f, -0.424318f, 0.101845f, + -0.203343f, -0.197402f, -0.163025f, -0.0771961f, -0.264435f, + 0.319429f, 0.250076f, 0.782726f, 0.386003f, 0.00700673f, + -0.375715f, 0.151453f, -0.296265f, -0.560183f, -0.00767249f, + -0.109593f, -0.119419f, -0.0161516f, 0.0380283f, -0.156417f, + 0.131708f, 0.396268f, -0.221796f, 0.232099f, 0.128852f, + 0.0567268f, 0.297297f, 0.173269f, 0.213411f, 0.0384426f, + -0.290985f, -0.0426841f, -0.488292f, -0.087101f, -0.311582f, + 0.83009f, -0.153163f, 0.903335f, -1.15644f, -0.0378635f, + -0.0552129f, -0.126362f, 
-0.176945f, 0.0653115f, 0.0989368f, + -0.333543f, -0.330586f, 0.29775f, -0.103535f, 0.210824f, + -0.00300509f, 0.317105f, 0.216852f, 0.479718f, 0.0485808f, + -0.15662f, 0.718199f, 0.327513f, 0.115169f, -0.423598f, + -0.456633f, -0.575814f, -0.494454f, 0.304411f, 0.0493055f, + -0.381171f, 0.467251f, -0.122872f, -0.167441f, 0.017253f, + -0.0583646f, -0.1586f, 0.214046f, -0.0284424f, -0.217112f, + 0.606567f, -0.107533f, 0.36615f, -0.0709227f, 0.604761f, + -0.244657f, -0.296651f, -0.595611f, -0.156629f, -0.693468f, + -0.310603f, 0.499272f, 0.282941f, 0.295043f, -0.178704f, + 0.281186f, 0.014329f, -0.120819f, 0.154234f, 0.0131325f, + -0.472231f, -0.631281f, 0.422955f, 0.711432f, -0.118025f, + 0.0864996f, 0.343971f, -0.301477f, -0.246638f, 0.165068f, + 0.218044f, 0.224236f, -0.0848522f, 0.00671216f, 0.401141f, + -0.218857f, -0.0298495f, -0.135725f, -0.377618f, 0.022473f, + 0.106955f, -0.0582005f, 0.0468484f, -0.0217442f, 0.130911f, + -0.0926905f, 0.383007f, -0.159353f, -0.222711f, -0.0286419f, + 0.372315f, -0.469095f, 0.797571f, -0.301315f, 0.239327f, + -0.997507f, -0.363409f, 0.353717f, 0.676686f, -0.0500028f, + 0.0638539f, -0.431927f, 0.243852f, 0.000884826f, -0.00166585f, + 0.0613292f, -0.029558f, -0.0248432f, -0.0125607f, -0.0309674f, + -0.743308f, 0.0409806f, 0.0921015f, 0.167816f, 0.406849f, + 0.095677f, 0.0308913f, 0.139956f, -0.400472f, 0.396617f, + 0.936517f, 0.355057f, -0.423816f, -0.232472f, -0.220188f, + -0.399746f, -0.409623f, -0.158797f, 0.361153f, 0.0327019f, + 0.0690844f, -0.032197f, 0.0248558f, 0.00438518f, 0.0222724f, + -0.326832f, -0.314295f, 0.156563f, 0.0562703f, 0.332694f, + 0.299424f, 0.228206f, 0.322038f, 0.0136098f, 0.0060297f, + -0.165851f, -0.306512f, 0.0796508f, -0.37158f, 0.239395f, + -0.349442f, 0.198515f, -0.253854f, -1.13694f, 0.0202873f, + -0.0504009f, -0.130528f, -0.017126f, -0.0370001f, -0.087458f, + -0.119952f, -0.130404f, 0.0333733f, -0.184736f, 0.182162f, + 0.227776f, -0.166563f, -0.156162f, 0.118215f, -0.220183f, + 0.00474779f, -0.107792f, 0.260493f, 0.11884f, 0.156587f, + 0.303936f, -0.131788f, -0.314774f, 0.310606f, 0.0935523f, + 0.790767f, 0.26461f, 0.0236426f, 0.0629469f, 0.0344072f, + -0.151513f, 0.211498f, 0.0245435f, 0.0629973f, 0.052019f, + -0.03308f, 0.123487f, 0.0885027f, 0.159172f, -0.0510615f, + 0.0298033f, -0.130515f, -0.121799f, -0.104915f, 0.208822f, + -0.310496f, -0.314106f, 0.303307f, -0.0196736f, 0.0420045f, + 0.461777f, -0.433699f, 0.00345407f, 0.703139f, -0.655637f, + -0.210767f, -0.201278f, 0.163694f, -0.236534f, 0.300877f, + 0.0769982f, -0.282453f, 0.149721f, -0.0303466f, -0.191473f, + -0.406056f, -0.213472f, 0.1619f, -0.245953f, 0.00544399f, + -0.121434f, 0.193012f, -0.307165f, 1.45431f, -0.161468f, + -0.12444f, -0.146129f, -0.0528212f, -0.0925165f, -0.134528f, + -0.479475f, 0.315525f, 0.133845f, 0.382158f, -0.0799693f, + -0.151041f, 0.255772f, 0.409536f, -0.240663f, -0.323741f, + -0.205876f, 0.03699f, -0.217541f, 0.108511f, 0.640628f, + 0.705993f, -0.423899f, -0.78314f, -0.100733f, -0.00859087f, + 0.0251879f, 0.0458335f, 0.00210128f, -0.047576f, -0.0560518f, + -1.23869f, -0.829914f, 0.0346551f, 0.350505f, 0.193688f, + 0.459154f, 0.137898f, 0.503818f, 0.260867f, 0.649539f, + 0.0150802f, 0.0239274f, -0.276069f, -0.0621478f, -0.193106f, + -0.0375665f, -0.654529f, 0.189493f, 0.446625f, -0.0208265f, + 0.019838f, -0.0201955f, 0.00180428f, -0.0110678f, -0.0172414f, + 0.0276489f, -0.252882f, -0.0351807f, -0.0518874f, 0.279098f, + -0.245122f, 0.101287f, -0.114202f, -0.0812187f, 0.572429f, + -0.0821731f, 0.564183f, 0.0222552f, 0.190111f, 
-0.0417497f, + -0.00385925f, -0.182995f, -0.240482f, -0.291572f, -0.0450444f, + 0.0962974f, -0.165973f, -0.0954637f, -0.163841f, -0.833405f, + -1.31541f, -0.336473f, -0.0920702f, 0.816105f, 0.393377f, + 0.0340241f, -0.0844545f, 0.61729f, -0.17596f, 0.241149f, + -0.42825f, -0.59091f, -0.290702f, 0.0796465f, 0.0982819f, + 0.466934f, 0.261666f, 0.0373333f, 0.332509f, -0.0266694f, + -0.0476951f, -0.00642167f, -0.0132542f, -0.000320841f, 0.00475532f, + 0.000502778f, 0.296534f, -0.13297f, -0.113082f, -0.327923f, + 0.35901f, -0.302246f, 0.189799f, -0.37994f, 0.16107f, + -0.20414f, 0.548575f, -0.460821f, 0.591878f, -0.213113f, + -0.169373f, -0.07332f, 0.228841f, 0.682302f, -0.0665316f, + -0.142456f, -0.0873117f, 0.00607451f, 0.0376443f, 0.0536673f, + -0.0109536f, -0.400279f, 0.550058f, 0.820871f, -0.666373f, + -0.471962f, -0.315925f, -0.313142f, 0.952742f, 0.473928f, + -0.119006f, 0.153241f, -0.0383078f, 0.631869f, -0.343423f, + -0.233473f, -0.218195f, -0.077688f, -0.728291f, 0.0382408f, + -0.00662886f, -0.0419666f, 0.0309776f, -0.0281592f, 0.0154229f, + -0.198534f, 0.0206324f, 0.0152272f, -0.235067f, 0.0330486f, + 0.139198f, -0.0612118f, 0.133154f, -0.258675f, 0.0900275f, + -0.127771f, 0.157322f, -0.00767807f, -0.329258f, 0.327458f, + 0.0528581f, -0.181125f, 0.409995f, -0.162979f, -0.0193475f, + 0.186009f, 0.0519501f, 0.651877f, -0.37821f, -1.10341f, + -0.189776f, -0.0922788f, 0.460256f, 0.168011f, 0.440295f, + 0.478135f, 0.374573f, 0.384048f, 0.116953f, 0.68886f, + -0.427727f, -0.36676f, -0.500013f, -0.228685f, -0.218859f, + 0.208396f, -0.0173765f, -0.0680241f, -0.00538013f, -0.0674409f, + -0.092764f, 0.0295707f, -0.0462887f, -0.00636006f, 0.0334169f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_32[] = { + 0.176459f, 0.154405f, 0.281821f, 0.375264f, -0.882863f, + -0.240261f, -1.17075f, -0.280216f, -0.743836f, -0.317511f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_32 = { + NUM_FEATURES_32, + NUM_LOGITS_32, + NUM_HIDDEN_LAYERS_32, + { + NUM_LAYER_0_UNITS_32, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_32, + av1_simple_motion_search_prune_rect_logits_kernel_32, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_32, + av1_simple_motion_search_prune_rect_logits_bias_32, + }, +}; + +#undef NUM_HIDDEN_LAYERS_32 +#undef NUM_FEATURES_32 +#undef NUM_LAYER_0_UNITS_32 +#undef NUM_LOGITS_32 + +#define NUM_HIDDEN_LAYERS_16 1 +#define NUM_FEATURES_16 25 +#define NUM_LAYER_0_UNITS_16 32 +#define NUM_LOGITS_16 10 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_16[] = { + -0.520913f, 0.395611f, 0.0369091f, -0.318591f, -0.463252f, + 0.134992f, -0.43154f, -0.0739112f, -0.118817f, 0.476373f, + -0.281406f, 0.3413f, 0.456255f, 0.33307f, 0.2942f, + 0.1317f, 0.498113f, 1.95406f, -0.165726f, -0.219306f, + -0.302656f, -1.31157f, -0.433662f, 0.151716f, -0.214817f, + 0.504523f, -0.710049f, 0.359616f, -0.412695f, -0.103193f, + 0.341912f, 0.351378f, -0.181486f, 0.573862f, -0.0396254f, + -0.17855f, -0.276163f, 0.0367465f, -0.353905f, -0.204689f, + 0.309581f, -0.0439686f, -0.147855f, 0.152745f, 0.290871f, + 0.131049f, -0.27808f, -0.142997f, 0.207843f, -1.23074f, + -0.267714f, -0.336923f, 0.313781f, -0.61488f, -0.161984f, + 0.238059f, -0.0879942f, -0.085543f, -0.260156f, -0.13614f, + -0.242196f, 0.201216f, -0.248691f, 0.0936671f, -0.350522f, + -0.35002f, -0.156583f, -0.00579001f, 0.300578f, -0.341269f, + -0.290712f, 0.354802f, -0.31629f, 0.509107f, -0.236953f, + -0.0923519f, 0.544509f, -0.280991f, -0.017437f, 
-0.202721f, + -0.116388f, -0.7191f, 0.324586f, 0.254249f, 0.125505f, + 0.00658697f, -0.333322f, -0.126537f, -0.140004f, -0.0241202f, + -0.172466f, 0.210035f, -0.270833f, 0.0579044f, 0.0950352f, + -0.120382f, 0.063292f, -0.394925f, 0.482165f, 0.147753f, + 0.331465f, -0.187444f, 0.1083f, 0.414028f, 0.279238f, + -0.486889f, -0.674349f, -0.313656f, -0.131186f, -0.100662f, + 0.238191f, -1.19083f, -0.30667f, -2.4324f, 0.235311f, + 0.108605f, 1.67197f, 0.476157f, 0.30055f, 0.0839538f, + 0.408469f, -0.473517f, 0.560283f, -0.0188136f, 0.273824f, + -0.43707f, -0.0346978f, -0.438315f, -0.0196275f, -0.0567921f, + -0.220166f, 0.216175f, -0.0180461f, 0.0116429f, -0.0096949f, + -0.32613f, 0.176829f, -0.243563f, -0.240972f, -0.621819f, + -0.00619648f, -0.145525f, 0.124324f, -0.0306925f, 0.172208f, + -2.04631f, -0.200087f, -0.594135f, -0.352303f, -0.309826f, + 0.0922786f, -0.698371f, -0.0366823f, 0.0244036f, 0.338775f, + -0.115947f, 0.144971f, -0.0607037f, -0.762412f, 0.0125584f, + -0.262427f, -0.0830273f, -0.291252f, -0.176059f, -0.203983f, + 0.0871455f, -0.0894925f, 0.0426263f, -0.060001f, -0.542355f, + -0.407837f, -0.0419273f, 0.226608f, -0.114844f, 0.158733f, + -0.187237f, 0.113163f, -1.86337f, -0.367544f, -0.547048f, + -0.24192f, -0.226764f, 0.090912f, 0.819604f, 0.433766f, + -0.841657f, 0.446987f, -0.622761f, -0.0296385f, -0.130176f, + -0.0518136f, -0.640326f, -0.330107f, -0.137832f, -0.0119033f, + 0.39401f, 0.111331f, -0.141367f, -0.230289f, 0.171054f, + -0.924059f, -0.107317f, -0.347983f, 0.0261109f, 0.423002f, + -0.305817f, 0.247696f, 0.0436002f, 0.0305862f, -1.52448f, + -0.595587f, -0.155552f, -1.11949f, -0.513937f, 0.138347f, + -0.301487f, 0.352144f, -0.615801f, 0.0326701f, -0.215322f, + -0.0608176f, -0.416557f, -0.306073f, -0.441512f, -0.0569277f, + -0.709768f, -0.602527f, -0.311134f, 0.152471f, -0.255299f, + 0.354505f, 0.194464f, 0.0144251f, 0.110732f, -0.4452f, + -0.804814f, 0.205325f, -0.0957486f, 0.502684f, 0.09112f, + -0.533087f, -1.77979f, 0.556992f, -0.176157f, -0.642633f, + 0.11553f, -0.232561f, 0.161277f, -0.0631125f, -0.20759f, + 0.489253f, -0.067533f, 0.0231024f, -0.179831f, -0.272985f, + -0.390059f, 0.3089f, 0.185733f, -0.257065f, -0.508838f, + -0.550028f, 0.0665621f, -0.138288f, -0.413188f, 0.191193f, + -1.32969f, -0.431025f, 0.270242f, -0.340062f, 0.0817257f, + 0.0376051f, -0.18633f, 0.0828274f, 0.00670051f, -0.431295f, + -0.450316f, -0.173042f, -0.322248f, 0.370628f, 0.10019f, + 0.317293f, -0.266613f, 0.0752441f, -0.425656f, -0.112223f, + 0.557991f, -0.324368f, -0.195261f, -0.0526129f, -0.807472f, + -0.387466f, 0.192186f, 0.353213f, -0.120238f, 0.107686f, + 0.200678f, -0.75363f, 0.466857f, -0.282345f, -0.0849236f, + -0.0490695f, -0.00643182f, 0.123047f, -0.207805f, -0.130456f, + -1.09455f, 0.340973f, 0.334784f, 0.0706643f, -1.65681f, + -0.319952f, -0.198514f, -0.0787972f, 0.089524f, 0.0531034f, + -0.202705f, -0.0852339f, -0.62572f, -0.0734234f, -0.838088f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_16[] = { + -0.0616197f, 0.939947f, 0.521161f, 0.213886f, 0.130324f, -0.127443f, + -0.0538715f, 0.708746f, 0.445031f, 0.418781f, -0.114539f, 0.521941f, + 1.13719f, 0.606545f, -0.32193f, -0.150788f, 0.158487f, -0.224005f, + 0.654715f, 0.115729f, -0.286506f, -2.06223f, 0.0117697f, 0.503905f, + -0.102339f, 0.653256f, -0.813561f, 0.905235f, -0.417269f, -0.206265f, + 0.661496f, 0.95533f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_16[] = { + -0.203489f, 0.00686229f, -0.161414f, 0.0637276f, 0.27516f, + 0.512219f, 0.164205f, 
0.00326062f, -0.41914f, -0.400334f, + 0.554419f, 0.715772f, -0.295569f, -0.703503f, 0.0137744f, + -0.0934259f, 0.174234f, -0.148618f, -0.0360558f, -0.0986598f, + -0.138502f, -0.0770713f, 0.122922f, -0.00784415f, 0.0953234f, + -0.255754f, -0.310967f, 0.185306f, 0.464554f, 0.147338f, + -0.0612304f, 0.164783f, 0.301097f, 0.161364f, -0.12723f, + -0.0265984f, -0.471361f, 0.0578776f, -0.362865f, 0.425789f, + 0.402758f, -0.190235f, 0.00549738f, -0.570908f, 1.27206f, + 0.048868f, -0.0097675f, 0.0708324f, 0.0456103f, 0.0149062f, + -0.563032f, -0.420573f, 0.107278f, 0.0938258f, 0.142712f, + -0.00251036f, -0.250583f, 0.522272f, 0.0113175f, 0.126751f, + -0.433028f, -0.035542f, -0.536686f, -0.0668722f, 0.253094f, + 0.254007f, -0.435505f, 0.343001f, 0.0531542f, -0.361914f, + -0.102664f, 0.0404874f, 0.132686f, 0.0762298f, 0.0236971f, + -0.419454f, 0.230877f, -0.223714f, 0.037813f, 0.0818604f, + 0.383705f, -0.235028f, -0.0554801f, 0.429851f, 0.0845829f, + 0.166295f, 0.355111f, -0.421197f, 0.298949f, 0.0218224f, + 0.445705f, -0.392217f, -0.429578f, -0.076276f, -0.0963531f, + -0.631425f, -0.225977f, 8.06349e-06f, 0.0676679f, 0.0779651f, + 0.0706891f, 0.101377f, 0.517103f, 0.0945502f, -0.52522f, + -0.312022f, 0.0358089f, 0.616509f, -0.0507444f, -0.465814f, + -0.0326024f, 0.591298f, 0.188544f, -0.0633316f, -0.199987f, + 0.403118f, -0.511281f, -0.696263f, 0.112996f, 0.103875f, + 0.0495595f, -0.0107449f, 0.521539f, -0.0123823f, -0.0642751f, + 0.08548f, -0.0679207f, 0.526558f, 0.0651114f, -0.342643f, + -0.349934f, 0.307437f, 0.368763f, -0.194851f, -0.134117f, + 0.102448f, -0.0520666f, 0.0415824f, -0.175085f, 0.272685f, + 0.0675856f, 0.120627f, 0.391408f, -0.135249f, -0.357024f, + 0.019666f, -0.0622677f, 0.407427f, 0.22655f, -0.129432f, + -0.165327f, 0.004893f, 0.5479f, 0.0613981f, -0.479682f, + -0.144228f, -0.130106f, 0.206458f, -0.342086f, 0.12691f, + -0.113554f, 0.231164f, -0.051419f, 0.0401286f, -0.560429f, + -0.070609f, 0.420232f, 0.442465f, -0.237501f, -0.000293732f, + -1.017f, -0.210222f, 0.0157063f, 0.0488178f, 0.0734721f, + -0.52626f, -0.276441f, -0.521579f, 0.443532f, -0.0819051f, + -0.0732633f, -0.17999f, 0.258525f, -0.0374872f, 0.150115f, + 0.0510939f, 0.168116f, 0.473372f, 0.824489f, 0.302195f, + -0.348613f, 0.238569f, 0.176444f, -0.633945f, -0.0567195f, + -0.0305827f, -0.0551851f, 0.85822f, -0.0628099f, 0.0364294f, + -0.234823f, 0.179067f, 0.143208f, -0.0511014f, -0.404191f, + 0.428035f, 0.0235506f, 0.371991f, -0.312909f, 0.550933f, + -0.389265f, -0.271813f, -0.293461f, -0.583752f, 0.179991f, + 0.191698f, 0.659094f, 1.07941f, -0.509555f, -0.100638f, + 0.079988f, -0.0519107f, -0.112723f, -0.0663326f, 0.0353569f, + -0.795055f, -0.465999f, 0.283579f, 0.340913f, 0.152738f, + 0.294664f, 0.527839f, 0.187735f, 0.359461f, 0.164629f, + 0.107512f, 0.390402f, 0.236702f, 0.114674f, -0.525655f, + -0.555476f, -0.6589f, -0.266601f, -0.0946547f, 0.6306f, + 0.0248513f, 0.038497f, 0.432706f, -0.0715465f, 0.0410172f, + -0.115313f, -0.428684f, 0.136283f, 0.0913185f, 0.11277f, + 0.0968689f, -0.00437052f, 0.0888981f, 0.10304f, 0.02442f, + -0.211315f, 0.00981596f, -0.0974827f, 0.208611f, 0.140644f, + 0.0315567f, 0.350332f, -0.291049f, -0.0715449f, -0.352992f, + -0.858004f, 0.828658f, 0.439092f, 0.0151291f, 0.0503828f, + 0.0656112f, -0.710749f, -0.0951757f, 0.193908f, 0.00908018f, + 0.141486f, -0.0657711f, 0.099791f, 0.153729f, -0.419576f, + -0.892636f, -0.0449268f, -0.170786f, -0.156564f, 0.384511f, + 0.296565f, 0.0569815f, -0.103938f, 1.27479f, -0.0406475f, + 0.154083f, -0.186442f, 0.0282588f, 0.0312102f, -0.188994f, + 
0.284243f, -0.564693f, 0.425525f, -0.00924596f, 0.810003f, + 0.233812f, -0.0180273f, 0.121082f, -0.209096f, 0.151437f, + 0.286921f, -0.348095f, 0.174813f, -0.413798f, 0.108994f, + -0.34266f, -0.0337981f, -0.459f, -0.409812f, -0.0890104f, + 0.0834802f, -0.00259191f, -0.105914f, -0.164207f, 0.0697689f, + -0.312098f, -0.00650536f, -0.486758f, -0.248486f, 0.24314f, + -0.0857144f, 0.0884781f, -0.65615f, -0.121744f, 0.0709335f, + -0.0237193f, 0.10764f, -0.0409452f, -0.0824305f, 0.42329f, + 0.138258f, 0.502607f, 0.228545f, 0.0687789f, 0.0361586f, + 0.39074f, 0.0722654f, -0.0133148f, 0.283278f, 0.0743384f, + 0.310292f, -0.297675f, -0.359935f, 0.521021f, -0.10082f, + -0.272333f, 0.0120283f, 0.138118f, -0.123711f, -0.0711386f, + 0.0170747f, 0.831039f, 0.0509626f, 0.790608f, -0.0863406f, + -0.31962f, 0.0631013f, 0.0873453f, -0.472331f, -0.0826027f, + -0.241722f, 0.148835f, -0.131611f, 0.000195347f, -0.0615804f, + -0.838663f, -0.586979f, 0.247713f, 0.362254f, 0.492727f, + -0.132163f, 0.0516545f, 0.477838f, -0.0395182f, 0.0124993f, + -0.771514f, 0.0386912f, -0.118525f, -0.346172f, -0.265905f, + -0.175257f, -0.406287f, 0.393837f, 0.409096f, -0.408501f, + -0.0207146f, 0.0487809f, 0.0636982f, 0.0276368f, 0.0878249f, + 0.0425889f, 0.0868633f, 0.17423f, -0.128217f, -0.477068f, + -0.321294f, 0.0393771f, 0.00812823f, -0.350529f, -0.129012f, + 0.439953f, 0.396662f, 0.410475f, -0.123129f, -0.565966f, + 0.0298635f, -0.614611f, -0.477514f, 0.453651f, 0.0617068f, + 0.0530563f, 0.0479074f, 0.213551f, 0.039034f, 0.0449095f, + -1.06868f, -1.2654f, -0.175482f, 0.595068f, -0.230095f, + 0.719838f, -0.272148f, 0.696564f, 0.0485396f, 0.468584f, + 0.0695439f, -0.0842122f, -0.228978f, 0.161397f, -0.000441421f, + -0.0297514f, -0.250599f, 0.196656f, 0.608423f, -0.0112096f, + 0.0236881f, -0.00167311f, 0.0040709f, 0.015495f, 0.00757698f, + -0.165886f, 0.359767f, -0.0214696f, 0.377208f, 0.0303547f, + 0.0657094f, 0.140775f, 0.21867f, -0.203922f, 0.263878f, + -0.0529099f, 0.202438f, -0.243226f, 0.156659f, -0.627056f, + -0.845036f, -0.500873f, 0.172588f, 0.402972f, -0.147734f, + 0.151792f, -0.075579f, 0.443519f, 0.0311335f, -0.0328222f, + -0.0299781f, 0.435956f, -0.0987376f, 0.288402f, 0.135902f, + -0.173584f, -0.186255f, 0.224524f, -0.249645f, 0.123702f, + -0.0846244f, 0.491317f, 0.544846f, 0.338677f, -0.258885f, + -0.617434f, -0.629003f, -0.347233f, 0.181262f, -0.0606015f, + -0.537766f, 0.215089f, -0.334527f, 0.0488534f, 0.0577997f, + -1.12431f, -0.932292f, -0.11559f, 0.573715f, 0.151128f, + 0.693818f, -0.16956f, 0.802591f, -0.231531f, 1.04318f, + -0.476417f, 0.293452f, -0.610136f, 0.27506f, -0.384012f, + 0.305366f, -0.0540464f, -0.337583f, -0.174285f, 0.157248f, + 0.0477345f, -0.0229535f, 0.0475766f, -0.00603319f, 0.00856119f, + -0.702893f, -0.0579673f, 0.183024f, -0.166222f, 0.109763f, + -0.148019f, -0.258873f, -0.0820157f, -0.186716f, -0.449265f, + -0.0534138f, 0.15732f, 0.46357f, 0.00502591f, -0.0282085f, + 0.152277f, -0.855199f, -0.357115f, 0.0366159f, 0.0131101f, + -0.0407758f, 0.0462835f, 0.146309f, -0.00276278f, -0.0591814f, + -0.109437f, 0.506764f, -0.044421f, 0.465907f, 0.114444f, + -0.241053f, -0.362649f, -0.432615f, 0.199989f, -0.00635866f, + -0.521886f, 0.0958924f, -0.485725f, 0.0430527f, 0.069746f, + 0.681091f, -0.288144f, 0.505671f, 0.0489065f, -0.0373836f, + 0.266079f, 0.145173f, -0.011481f, -0.225074f, -0.754501f, + -0.122939f, -0.294213f, 0.334738f, 0.281561f, 0.558977f, + -0.21551f, -0.346507f, -0.0625635f, 0.0782034f, -0.236999f, + -0.803783f, -0.601117f, 0.091192f, 0.636122f, -0.250626f, + 0.0354961f, 
0.103915f, 0.508571f, 0.329911f, -0.0425999f, + -0.0867587f, -0.0385824f, 1.13914f, -0.0261992f, 0.00484478f, + 0.124603f, -0.012173f, -0.377358f, -0.243563f, 0.236094f, + 0.145663f, -0.132752f, 0.347497f, -0.529315f, 0.271632f, + -0.372805f, 0.0261836f, 0.126169f, 0.0941008f, 0.283773f, + 0.765701f, -0.226477f, -0.181549f, -0.306896f, 0.110165f, + -0.0784234f, -0.0827892f, -0.0374252f, -0.0950872f, -0.451015f, + -0.995793f, -0.452663f, 0.293338f, -0.380865f, 0.032683f, + 0.0178248f, 0.0699194f, -0.0811722f, -0.0866096f, 0.139289f, + 0.296604f, 0.192293f, -0.0589607f, -0.179878f, 0.00360266f, + -0.0905794f, 0.136744f, -0.191555f, 1.31877f, -0.0592033f, + -0.158766f, 0.0214746f, -0.190113f, -0.116671f, 0.0449292f, + -0.109533f, -0.709307f, 0.386424f, 0.40201f, 0.262211f, + -0.155244f, 0.233988f, -0.0166317f, 0.462665f, 0.0484462f, + 0.210902f, -0.352798f, 0.38698f, -0.228261f, -0.084309f, + -0.220751f, -0.170879f, -0.352617f, -1.24277f, 0.266004f, + -0.0125749f, -0.0380073f, 0.101838f, -0.0483024f, -0.0629178f, + -0.0695577f, -0.103439f, 0.242131f, -0.0796858f, 0.349718f, + -0.332045f, 0.0138352f, -0.380235f, -0.28717f, -0.176276f, + 0.865903f, 0.36593f, 0.243925f, -0.422289f, -0.117327f, + 0.21876f, 0.245393f, -0.426134f, -0.186077f, 0.0352515f, + -0.123742f, 0.249376f, 1.3281f, 0.0707771f, 0.071415f, + -0.286827f, -0.131691f, -0.270881f, -0.434378f, 0.376064f, + 0.35966f, 0.513374f, 0.439378f, -0.222716f, -0.5874f, + 0.487997f, -0.293271f, -0.184245f, -0.037256f, 0.17723f, + -0.438651f, 0.428184f, 0.112983f, -0.449287f, -0.0451963f, + 0.0854929f, 0.0735442f, -0.0148642f, -0.0586782f, -0.176455f, + -0.438979f, -0.127109f, 0.211478f, 0.388035f, -0.0372021f, + 0.220575f, 0.382144f, 0.302121f, 0.0857121f, 0.193445f, + -0.488858f, -0.195288f, -0.316184f, -0.314026f, -0.111956f, + 0.0744768f, 0.292709f, 0.30187f, -0.285506f, -0.105006f, + 0.0851402f, -0.082318f, 0.277518f, 0.725294f, -0.756304f, + 0.0155309f, -0.378542f, 0.293377f, -0.347252f, -0.338458f, + 0.221449f, -0.176443f, -0.131972f, 0.0129163f, -0.290649f, + 0.198596f, -0.0721333f, 0.620591f, 0.568736f, 0.174001f, + -0.205186f, -0.265606f, -0.249155f, 0.299163f, 1.11842f, + 0.17423f, 0.196417f, -0.014484f, 0.0735422f, 0.26329f, + 0.12284f, -0.750305f, -0.351337f, 0.121994f, -0.00542878f, + -0.295707f, -0.094124f, 0.300993f, 0.412408f, -0.170761f, + -0.0676329f, -0.106638f, -0.419785f, -0.43878f, 0.22421f, + 0.0339903f, 0.619851f, 0.0615381f, 0.514631f, 1.35424f, + -0.0679228f, -0.203457f, 0.131948f, -0.0041251f, -0.209054f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_16[] = { + 0.304025f, 0.131887f, 0.259279f, -0.561564f, -0.161729f, + -0.208036f, 0.102206f, -0.162937f, -1.42311f, -0.708305f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_16 = { + NUM_FEATURES_16, + NUM_LOGITS_16, + NUM_HIDDEN_LAYERS_16, + { + NUM_LAYER_0_UNITS_16, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_16, + av1_simple_motion_search_prune_rect_logits_kernel_16, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_16, + av1_simple_motion_search_prune_rect_logits_bias_16, + }, +}; + +#undef NUM_HIDDEN_LAYERS_16 +#undef NUM_FEATURES_16 +#undef NUM_LAYER_0_UNITS_16 +#undef NUM_LOGITS_16 + +#define NUM_HIDDEN_LAYERS_8 1 +#define NUM_FEATURES_8 25 +#define NUM_LAYER_0_UNITS_8 32 +#define NUM_LOGITS_8 4 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_8[] = { + -0.266303f, -0.387676f, 0.204501f, -0.120842f, -0.0752326f, 0.0337739f, + 0.0243477f, -0.356748f, 
0.0143051f, -0.16403f, -0.139013f, 0.175003f, + -0.206754f, 0.349059f, 0.181763f, 0.212768f, -0.313783f, 0.182829f, + 0.00205376f, -0.939525f, -0.0992424f, 0.306254f, 0.083329f, -0.133137f, + -0.179022f, -0.0237902f, 0.0601026f, -0.216698f, -0.551149f, 0.081711f, + -0.442191f, 0.0680832f, -0.0353678f, 0.237704f, 0.23155f, -0.36097f, + 0.123389f, -0.288927f, 0.178133f, -0.152222f, -0.235648f, -0.0495293f, + -0.316522f, 0.034207f, 0.0463139f, -0.817825f, 0.417443f, -0.110984f, + -0.402371f, 0.0341694f, -0.37383f, 0.414532f, 0.093993f, 0.0039505f, + 0.0803175f, -0.511859f, -0.0154802f, 0.0979595f, 0.0909049f, -0.120938f, + -0.577382f, -0.155041f, -0.404295f, 0.122223f, -0.084703f, 0.00415336f, + 0.149135f, 0.113219f, 0.124236f, -0.240905f, 0.163909f, -0.154202f, + -0.208917f, 0.00200158f, -0.71796f, 0.105984f, -0.131996f, -0.539603f, + 0.223768f, -0.0710733f, -0.346679f, -0.0745909f, 0.171032f, 0.215701f, + 0.218519f, 0.105981f, -0.096209f, -0.166453f, -0.468894f, -0.401578f, + -0.239222f, 0.111382f, 0.38747f, -0.164734f, -0.175955f, 0.336621f, + -0.0305501f, -0.0576765f, 0.0672671f, -0.183692f, 0.412082f, -0.262951f, + -0.153429f, -0.128589f, -0.530472f, 0.0936412f, -1.08296f, -0.45147f, + 0.0714904f, -3.96842f, 0.438125f, -0.313945f, 0.231104f, -0.00183851f, + -0.0192768f, -0.637531f, -0.109296f, 0.0531702f, 0.00262162f, -0.615951f, + -0.546241f, -0.635305f, -0.0762367f, 0.0122019f, 0.423693f, -0.129142f, + -0.112242f, 0.295184f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_8[] = { + -2.16023f, -3.12831f, -0.213206f, -2.97875f, -1.83791f, -2.84713f, + -0.909636f, -2.05893f, 0.00525274f, -1.51672f, -3.95017f, 1.82847f, + -0.853224f, -3.29503f, -0.537517f, 0.923106f, -3.18665f, -1.29905f, + 1.64506f, -1.99848f, -2.24315f, 0.408613f, 0.503671f, -3.83393f, + -2.88388f, -3.52337f, 1.46818f, -1.67169f, -3.83253f, 1.52644f, + -0.490783f, -0.415782f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_8[] = { + -0.702198f, -0.102148f, 0.0564545f, -0.0555548f, 0.16184f, + 0.0950792f, 0.136974f, -0.00824146f, 0.05746f, 0.0447542f, + 0.145978f, 0.0855769f, -0.041449f, 0.301347f, -0.0206691f, + -0.0662514f, -0.0525079f, -0.0998387f, -0.0891438f, 0.110545f, + -0.863098f, -1.83798f, 0.238818f, 0.127797f, 0.116872f, + -0.270655f, -0.21057f, 0.197013f, -0.123332f, 0.137104f, + -0.174766f, -0.00803025f, 0.0234369f, -0.0894175f, -0.0380927f, + 0.00827928f, -0.134148f, 0.110575f, -0.250173f, 0.116273f, + 0.0197749f, 0.270391f, 0.108437f, 0.173197f, -0.0650348f, + 0.0884626f, 0.262792f, 0.0649228f, 0.5573f, -2.81315f, + -0.479801f, -1.15825f, 0.0807932f, -0.19144f, 0.404016f, + -0.211521f, 0.233269f, -0.391414f, 0.160381f, -0.277233f, + 0.426354f, 0.156839f, 0.494315f, -0.214259f, -0.0132062f, + 0.148628f, -0.0899568f, 0.161845f, 0.467689f, 0.229474f, + 0.590634f, -0.705793f, -0.0486113f, -0.439088f, 0.994566f, + 0.679065f, 0.777869f, -0.225291f, -0.0303006f, -0.638782f, + -0.0824632f, -0.128561f, -0.327603f, 0.105624f, 0.567581f, + -0.396135f, -0.471028f, 0.181286f, 0.274604f, 0.180169f, + 0.0612144f, -0.865004f, 0.0306804f, 0.142985f, -0.0914358f, + -0.243284f, 0.358359f, -0.443847f, -0.371978f, 0.606933f, + -0.900408f, -0.52076f, 0.472118f, 0.0610973f, 0.152526f, + -0.550379f, 0.309331f, -0.141573f, 0.203046f, -0.231485f, + 0.505156f, 0.393224f, 0.435487f, -0.218681f, 0.123707f, + -0.270383f, -0.033565f, 0.210373f, -2.33967f, 0.367434f, + 0.0308118f, -0.205771f, 0.546141f, 0.19837f, 0.035648f, + -0.467007f, -1.50995f, -0.0314176f, 0.11762f, -0.15307f, + 
0.618257f, -0.139502f, 0.303386f, -0.00758681f, 0.228107f, + -0.594499f, -0.201984f, -0.239666f, 0.114878f, -0.922174f, + -0.530137f, -0.379366f, -0.319582f, 0.0889624f, -0.00544663f, + 0.316264f, -0.204262f, -0.0959358f, 0.23552f, 0.141369f, + -0.207129f, -1.04067f, -0.0780501f, 0.226768f, -0.246752f, + 0.0823105f, 0.114783f, 0.49315f, 0.0197732f, 0.705433f, + 0.158076f, -0.250584f, -0.157326f, -0.0439547f, -0.139047f, + 0.090531f, -0.38833f, 0.743143f, -1.47418f, -0.155009f, + 0.511466f, -0.726716f, -0.181075f, 0.450133f, -0.390204f, + 0.292725f, 0.00811462f, -0.347738f, 0.613381f, -0.237124f, + 0.750748f, -0.383123f, 0.410309f, -0.204166f, 0.667199f, + -0.313197f, 0.436059f, -0.607571f, 0.193681f, 0.409399f, + 0.631747f, -0.0454149f, 0.198232f, 0.345591f, -0.0137374f, + -0.307014f, -0.535515f, 0.764678f, -0.225686f, -0.451621f, + -2.75564f, -1.52877f, 0.0511933f, 0.905979f, 0.145029f, + 0.759615f, 0.130166f, 0.83827f, 0.0655081f, 1.07555f, + -0.529777f, 0.682967f, -0.412052f, 0.611947f, -0.83676f, + 0.940695f, -0.465681f, 0.51505f, -0.883659f, -0.105524f, + -0.0344173f, -0.0683618f, -0.00698688f, -0.139349f, 0.135741f, + -0.294455f, -0.377834f, -0.602084f, -1.00128f, 0.483291f, + 1.25327f, 0.178987f, 0.75068f, -0.520731f, -0.325517f, + 0.272032f, 0.144144f, -0.279453f, 0.564907f, 0.144036f, + 0.297448f, -0.504243f, -0.250508f, -1.26395f, 0.4816f, + 0.392771f, -0.389961f, -0.261585f, -0.127124f, -0.202945f, + -0.709716f, -0.174719f, 0.113613f, 0.477753f, -0.226659f, + 0.0697828f, -0.177994f, 0.300726f, -0.185504f, 0.339424f, + -0.316746f, 0.369693f, -0.339723f, -0.143886f, -0.0326589f, + -0.268761f, -0.241094f, 0.284876f, -0.0270867f, -0.207397f, + -1.42738f, 0.495612f, -0.0277732f, 0.199675f, 1.48638f, + -0.659257f, -1.28199f, 0.498702f, 0.140695f, 0.571152f, + 0.416368f, 0.14153f, 0.126876f, 0.521114f, -0.00150571f, + 0.375581f, 0.00537624f, 0.1286f, -0.332227f, 0.417663f, + -0.539023f, 0.217124f, -0.787111f, -0.0335266f, 1.56751f, + 0.0640563f, -0.158791f, 0.118195f, 0.000970493f, -0.0403852f, + -0.0572557f, -0.0201181f, -0.10255f, 0.63237f, 0.156662f, + 0.418696f, -0.274802f, -0.663923f, -0.375232f, -0.40846f, + 0.462092f, 1.2176f, -0.301532f, -0.779704f, -0.112876f, + 0.0806591f, -0.0141923f, 0.00960801f, -0.663557f, 0.0979948f, + -0.0575999f, -0.012847f, 0.0403853f, -0.133666f, -0.00330217f, + -0.931518f, -0.774599f, -0.21391f, 0.377601f, -0.183365f, + 0.299094f, 0.0238552f, 0.206716f, -0.18959f, 0.346013f, + -0.150991f, -0.192817f, -0.293962f, -0.0537604f, -0.0648171f, + -0.275941f, -0.144854f, -0.224092f, 2.43113f, 0.0422494f, + -0.047236f, -0.0262028f, 0.0282119f, -0.175553f, 0.0888502f, + 0.580682f, 0.951055f, -0.284441f, -0.120133f, -0.268058f, + -0.312083f, -0.411556f, 0.21431f, -0.28033f, 0.324851f, + -1.02787f, -0.936816f, -0.577628f, 0.544743f, 0.295807f, + 0.406157f, 0.447927f, 0.25369f, -0.811421f, -0.0424979f, + -0.189867f, 0.00778673f, -0.113587f, -0.116175f, -0.0542222f, + -1.80089f, -1.44175f, -0.35332f, 0.191314f, -0.236691f, + -0.0261926f, -0.502363f, 0.252278f, -0.485478f, 0.296495f, + 0.455612f, -0.0489631f, 0.227255f, 0.170975f, 0.473487f, + 0.257812f, 0.178048f, 0.2506f, 2.04637f, -0.173857f, + 0.0583379f, 0.00765589f, -0.025772f, -0.162666f, -0.016214f, + -0.607486f, -0.0808025f, 0.0551611f, -0.0772291f, 0.126421f, + 0.10869f, -0.0877463f, -0.111527f, -0.0775766f, 0.503886f, + -0.002757f, -0.0421354f, -0.247857f, 0.140827f, 0.383576f, + 0.228232f, -0.157877f, -0.0927911f, 0.344687f, 0.191181f, + 0.236533f, 0.00102869f, -0.0184502f, -1.4509f, -1.15945f, + 
-0.521978f, -0.643225f, 0.133139f, 0.0660321f, 0.0851957f, + 0.0303648f, 0.0296239f, 0.0455713f, 0.175647f, 0.080532f, + 0.0445691f, -0.257356f, -0.125602f, -0.138829f, -0.167057f, + -0.0992552f, -0.13944f, 0.507531f, 0.444997f, 0.221452f, + -0.308384f, -0.327554f, 0.13235f, 2.1487f, -1.15453f, + -0.280239f, -0.363582f, -0.00358745f, 0.012866f, 0.251088f, + 0.0676416f, 0.178492f, -0.136631f, 0.197938f, -0.078198f, + 0.812439f, 1.1173f, 0.712113f, 1.10124f, -0.836503f, + -1.22433f, -1.07894f, -1.29215f, 0.56057f, 2.23928f, + -0.419029f, 0.282178f, -0.0719266f, -0.172192f, 0.28034f, + -2.99124f, -2.01481f, 0.0688982f, 0.697466f, 0.00635555f, + 0.566069f, 0.047534f, 0.507755f, -0.00690707f, 0.712594f, + -0.191467f, 0.355733f, -0.480016f, 0.664669f, -0.390619f, + 0.351199f, -0.482342f, 0.325005f, 1.9089f, 0.155987f, + 0.17032f, 0.132729f, 0.0402649f, 0.146991f, 0.0314905f, + -0.775316f, -0.208892f, -0.105993f, 0.0181653f, -0.12735f, + 0.0897852f, 0.0470231f, 0.25807f, 0.127406f, -0.0893252f, + -0.279776f, 0.190844f, 0.110384f, -0.148833f, 0.025293f, + 0.239838f, 0.00932245f, 0.35103f, -0.128268f, -0.0536754f, + 0.506899f, -0.16793f, 0.0955582f, -2.01108f, 0.721433f, + -2.31413f, -2.08646f, 0.033315f, 0.689828f, -0.271213f, + 0.790425f, -0.114234f, 0.755325f, -0.211533f, 0.774544f, + -0.263268f, 0.795762f, -0.551455f, 0.953602f, -0.168454f, + 0.529055f, -0.768991f, 0.882371f, 0.29763f, -0.155017f, + 0.00464101f, 0.121093f, 0.948271f, 0.113138f, -0.110332f, + -2.0492f, -1.31322f, -0.129212f, 0.464778f, -0.181465f, + 0.618403f, 0.0627984f, 0.465228f, 0.165729f, 0.278277f, + -0.563276f, -0.358358f, -0.590638f, 0.0104993f, 0.731206f, + 0.752569f, 0.631615f, 0.811822f, 0.129804f, -0.0558327f, + 0.570081f, -0.417922f, -0.168275f, 0.0703671f, 0.269127f, + 0.240457f, -0.197159f, -0.00179261f, 0.220065f, 0.463511f, + 0.0714626f, -0.716477f, -0.441865f, -0.717028f, -0.149176f, + 0.452182f, 0.662699f, -0.906534f, -0.817133f, 0.237747f, + 0.26024f, -7.7441e-05f, 0.0934616f, 0.824641f, -0.0404494f, + -0.088297f, -0.157899f, 0.037408f, 0.132435f, -0.316155f, + -0.276785f, 0.0117868f, 0.185008f, 0.32369f, -0.465855f, + -0.302127f, 0.303289f, 0.338597f, -0.665408f, -0.507594f, + 0.526979f, 0.532091f, 0.234395f, 0.754063f, 0.116769f, + 0.0800309f, -0.939344f, -1.51269f, 1.4583f, 0.178444f, + 0.0106756f, -0.213468f, -0.00369439f, 0.071015f, -0.192798f, + -0.0933147f, -0.129901f, -0.368279f, -0.246564f, 0.126966f, + 0.478565f, -0.476246f, -0.762863f, 0.168883f, 0.536136f, + -0.272969f, 0.2573f, -0.161577f, 0.311428f, -0.777994f, + -1.29752f, 0.216046f, 0.329016f, 1.57265f, 0.168075f, + -0.192518f, 0.0829308f, -0.073533f, -0.0202034f, 0.114716f, + -0.34888f, -0.519215f, 0.190809f, 0.0138507f, 0.133635f, + 0.14194f, 0.410618f, -0.165106f, 0.214438f, 0.0438265f, + -0.8481f, -1.19182f, -1.07878f, -0.882217f, 0.45616f, + 0.977385f, 0.74929f, 0.918466f, 0.904704f, 0.041938f, + 0.0362776f, 0.0757255f, 1.14007f, 0.0516825f, -0.160068f, + 0.219535f, 0.638634f, -0.0284544f, -0.222849f, -0.0344915f, + -0.0350256f, -0.0504452f, -0.0458416f, 0.146099f, 0.0783083f, + 0.206579f, 0.241264f, 0.28401f, 0.0425312f, -0.802049f, + -0.746271f, -0.578969f, -0.078218f, 0.436176f, -0.281465f, + -2.5539f, 0.237868f, -0.121796f, 0.0715619f, 0.106992f, + -0.621862f, -0.167142f, 0.153716f, 0.0570912f, -0.06525f, + -0.923773f, 0.130759f, 0.0517066f, 0.0729862f, -0.873064f, + 0.0403328f, -0.186499f, -0.0831918f, -0.223723f, 0.144697f, + 0.212845f, 0.416876f, 0.361598f, 0.138229f, 0.0728777f, + -1.95419f, -0.00382816f, -0.0440387f, 0.433627f, 
0.44781f, + -1.05229f, -1.54506f, 0.564827f, -0.263456f, 0.296105f, + -0.158055f, 0.388274f, -0.366639f, 0.212006f, -0.245619f, + 0.593064f, 0.088727f, 0.410632f, -0.263462f, 0.507075f, + -0.0974155f, 0.275268f, -0.1293f, 0.136679f, 1.98276f, + 0.411766f, 0.391987f, 0.34283f, -0.114077f, 0.258462f, + -0.302443f, 0.301138f, -0.00726621f, 0.276441f, -0.291582f, + 0.66498f, -0.321451f, -0.332805f, 0.0943272f, 0.572253f, + -0.45818f, -0.0219593f, -0.151679f, 0.402033f, -1.15502f, + -0.882955f, 0.772904f, 0.88126f, -0.149555f, 0.709525f, + 0.350116f, -0.21531f, 0.797893f, 0.0230234f, 0.0203034f, + 0.2744f, 1.08273f, 0.039349f, 0.503909f, -0.45892f, + -0.579516f, -0.344058f, 0.390628f, -0.386941f, -0.430317f, + -0.0807066f, 0.435906f, 0.522996f, 0.724476f, -0.74371f, + -0.05376f, -0.340898f, -0.962646f, -0.0278005f, 0.0981149f, + -0.0811161f, 0.00237994f, 0.850042f, 0.0665473f, 0.134413f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_8[] = { + 1.63404f, -0.715866f, -1.0132f, -2.08745f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_8 = { + NUM_FEATURES_8, + NUM_LOGITS_8, + NUM_HIDDEN_LAYERS_8, + { + NUM_LAYER_0_UNITS_8, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_8, + av1_simple_motion_search_prune_rect_logits_kernel_8, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_8, + av1_simple_motion_search_prune_rect_logits_bias_8, + }, +}; + +#undef NUM_HIDDEN_LAYERS_8 +#undef NUM_FEATURES_8 +#undef NUM_LAYER_0_UNITS_8 +#undef NUM_LOGITS_8 + +static const NN_CONFIG + *const av1_simple_motion_search_prune_rect_nn_config[5] = { + &av1_simple_motion_search_prune_rect_nn_config_128, + &av1_simple_motion_search_prune_rect_nn_config_64, + &av1_simple_motion_search_prune_rect_nn_config_32, + &av1_simple_motion_search_prune_rect_nn_config_16, + &av1_simple_motion_search_prune_rect_nn_config_8, + }; + +// nn model for predicting max square partition level of a superblock +#define NUM_HIDDEN_LAYERS 1 +#define NUM_FEATURES 13 +#define NUM_LAYER_0_UNITS 48 +#define NUM_LOGITS 4 + +static const float av1_max_part_pred_logits_kernel[] = { + -0.304561f, 0.0885596f, -0.988539f, 1.08147f, 0.215213f, + 0.202965f, -0.828457f, -0.233945f, -0.0866977f, -0.115521f, + 0.02079f, 0.196491f, -0.0285075f, 0.05067f, -0.00872862f, + 0.00281844f, -0.238954f, 0.0253801f, 0.0257775f, 0.339269f, + 0.176174f, -0.152545f, -0.0588704f, -1.62275f, -0.189329f, + 0.0808033f, 0.233844f, -4.53798f, 0.674968f, -0.0361688f, + -0.0754075f, 1.16129f, -0.0188879f, 0.113255f, -3.04378f, + 0.814728f, -0.568517f, -0.00179383f, -3.61223f, -1.67535f, + -2.20417f, -0.197196f, 0.0507745f, -0.0909394f, -0.0507879f, + -1.27999f, -0.055623f, 0.0318497f, 0.192867f, 0.138726f, + 0.0443392f, -0.595075f, -0.166774f, 0.0882958f, -0.348161f, + 0.0214428f, -0.0599275f, -0.0995385f, -0.82358f, 0.141205f, + -0.053232f, 0.00508296f, -1.90872f, 1.15004f, -0.194219f, + 0.0229019f, -0.00354318f, 0.22016f, 0.154101f, -0.159231f, + -0.0446647f, -0.197503f, 0.0408453f, 0.197659f, 0.797858f, + -0.189722f, 0.343653f, 0.124666f, -1.03083f, 0.603059f, + 0.101565f, 0.0932993f, 0.462484f, 0.295984f, 1.11198f, + 0.143709f, -0.846232f, -0.464392f, -1.06058f, -0.124889f, + 0.0727475f, 1.18446f, -0.100302f, 0.0641918f, -0.101622f, + 0.10219f, 0.130189f, 0.0915623f, -0.166904f, -1.10606f, + -0.16726f, -0.146152f, 0.145443f, -0.177091f, -0.0215214f, + 0.0158506f, -0.553294f, 0.0784749f, -0.0416628f, -0.027785f, + 0.280027f, 0.484898f, -0.164225f, 0.0238317f, -0.0345254f, + 0.0410244f, 0.131529f, 
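+    // The av1_simple_motion_search_prune_rect_nn_config[] table above lists
+    // the per-block-size models in descending order (128, 64, 32, 16, 8), so
+    // callers select a model by block-size rank. For the max-partition
+    // predictor whose weights follow, a plausible usage sketch (hedged: the
+    // exact call sites and the av1_nn_predict() signature vary across libaom
+    // versions, and nn_forward_sketch is the hypothetical helper sketched
+    // earlier in this file) is to score the 13 features, normalize the 4
+    // logits with av1_nn_softmax() from av1/encoder/ml.c, and let the argmax
+    // cap the square partition sizes searched for the superblock:
+    //
+    //   float logits[4], probs[4];
+    //   nn_forward_sketch(features, &av1_max_part_pred_nn_config, logits);
+    //   av1_nn_softmax(logits, probs, /*n=*/4);
+    //   int best = 0;
+    //   for (int i = 1; i < 4; ++i)
+    //     if (probs[i] > probs[best]) best = i;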
0.0239622f, -0.0749436f, -0.0224914f, + 0.128926f, 0.224539f, 0.413297f, 0.0638572f, 0.103308f, + 0.0913242f, -0.119274f, 0.0163103f, 0.113828f, 0.119809f, + 0.297057f, -0.124889f, -0.533108f, -0.181408f, -0.129896f, + 0.0221064f, -0.0773281f, -0.0386467f, 0.0342961f, 0.126575f, + -0.24114f, 0.0735576f, 0.0524791f, 0.246896f, -0.130674f, + -0.03979f, 0.173639f, 1.95193f, -0.113029f, -0.0305852f, + -0.00671737f, 0.157159f, -0.00102858f, -0.543688f, 0.566772f, + 0.124124f, -0.0294064f, -0.0699021f, -0.0704103f, -0.766097f, + -0.0625802f, -0.0906173f, -0.0520414f, -0.0272724f, 0.283064f, + 0.236213f, -0.127319f, 0.019392f, 0.170042f, -0.0214542f, + 0.0740938f, 0.356578f, -0.236257f, 0.269021f, 0.114759f, + -0.641166f, 0.136308f, -0.0386959f, -0.112024f, -0.361209f, + 0.686095f, 0.183906f, 0.288656f, 0.182007f, 0.337458f, + 0.058974f, -0.305512f, -0.841708f, -0.243779f, -0.0614058f, + 0.208747f, 0.448697f +}; + +static const float av1_max_part_pred_layer_0_bias[] = { + -0.776544f, -2.0022f, -0.330294f, 2.47665f, 1.90206f, -1.61571f, + 0.536246f, 1.00455f, 5.24561f, 1.55111f, -0.816399f, -4.88703f, + -1.06417f, -1.15359f, -0.145289f, 1.91831f, 0.630915f, -1.94256f, + -3.35239f, -1.05007f, -1.05186f, 1.36824f, -5.2878f, 1.10482f, + -5.00077f, -0.0445198f, 3.41427f, 2.3439f, -0.413306f, -1.88152f, + -2.28638f, 8.24783f, -1.91961f, -1.49324f, 1.96599f, -6.32309f, + -0.332426f, -0.425506f, 4.06511f, 5.84386f, 4.15747f, 1.22402f, + 2.8512f, 2.53027f, 0.0170272f, -1.43966f, -0.997785f, 5.43064f +}; + +static const float av1_max_part_pred_logits_bias[] = { -4.25432f, 0.144758f, + 1.96217f, 0.728905f }; + +static const float av1_max_part_pred_layer_0_kernel[] = { + 0.992471f, 0.533006f, 0.143743f, -2.51788f, -0.468337f, + -0.201376f, -0.151834f, 0.479883f, 1.16061f, -0.278878f, + -0.814954f, -0.152405f, -0.0521608f, 0.797104f, -2.08912f, + 0.385839f, -2.22889f, -0.106858f, -0.239766f, -0.951128f, + -0.698753f, 0.0831051f, 1.1702f, 0.342834f, -0.0352795f, + -0.0847639f, -0.802086f, 0.258982f, 1.14174f, 0.645885f, + -1.19226f, -0.592888f, -0.343659f, 1.1912f, 1.45411f, + -1.22927f, 0.152858f, 0.00373585f, -1.60637f, 0.592611f, + 0.0857475f, -0.346147f, -0.150784f, -0.0817408f, -0.189918f, + -0.804952f, -1.33036f, -1.03307f, 0.0248769f, 0.16607f, + -2.896f, -2.1293f, 0.12293f, -0.173179f, -0.212128f, + -6.76221f, 0.033188f, 0.0231787f, 0.905957f, 0.0551327f, + -0.356276f, 0.0181795f, 0.0977523f, -0.0352873f, -0.0396386f, + 2.3241f, 0.0632874f, -0.11804f, -6.32521f, 0.0224659f, + -0.00188896f, 0.267992f, 0.272337f, 0.00936963f, 0.659969f, + -2.25707f, -0.0278229f, -0.0185089f, -1.14466f, 0.104827f, + 0.0435885f, 0.558586f, -0.00697004f, 0.0312611f, 0.540574f, + -0.568625f, 0.218608f, 0.378911f, -0.0289192f, -0.0734742f, + -1.08782f, -2.42069f, -0.0127239f, 0.0493651f, -1.15837f, + 0.261831f, 0.401824f, -1.04545f, 0.284173f, 0.784972f, + -0.511243f, -0.982599f, -0.106134f, -0.325964f, -1.44107f, + -1.42434f, -1.02402f, -1.52034f, 0.0737116f, 0.0462242f, + 0.628722f, -1.0405f, -0.113718f, 2.20573f, -4.33951f, + -0.0192695f, -0.0229314f, -1.89156f, 0.645942f, 0.375708f, + -1.97447f, -0.267014f, 0.0989443f, -0.450534f, -1.01737f, + -0.642416f, -0.0897288f, -2.08724f, -0.190965f, -0.279135f, + -0.830178f, 0.808754f, -0.139091f, 1.11004f, -0.454439f, + -0.479238f, -1.44001f, 0.0888059f, 0.885689f, -0.642505f, + -0.00773651f, -0.0265721f, -0.906346f, 1.68504f, 0.084257f, + -0.951101f, -8.06495f, 0.19231f, 0.16389f, -0.193678f, + 0.729837f, -1.98392f, -5.98513f, 3.32638f, -0.0658378f, + -0.0910426f, -0.666567f, 
-0.315339f, 0.123124f, -2.66375f, + -0.714852f, -0.136176f, -0.460166f, -0.567551f, -1.06193f, + -1.21389f, -0.83865f, 0.00280695f, -0.199519f, -0.534704f, + 0.419311f, -0.149008f, -3.68707f, 0.00285113f, -0.0718198f, + -1.41026f, -1.34155f, -0.538687f, -0.623666f, -2.56462f, + -0.0183333f, -0.323532f, -1.27141f, -0.0212039f, 0.198633f, + 0.459554f, -4.65103f, -1.01293f, -1.39512f, -0.289026f, + 0.208724f, -0.665226f, 1.13369f, -1.96734f, -1.45442f, + -3.46172f, 0.810681f, -0.603973f, 0.842764f, -3.90371f, + -0.394561f, -3.61363f, -2.88085f, 0.031645f, -0.23125f, + -2.63898f, -1.35314f, -0.46726f, 1.33145f, 1.20269f, + 1.38682f, -0.331637f, 0.069021f, 0.149523f, -1.24957f, + -0.878857f, -0.200368f, 0.465744f, 1.01365f, -0.0122221f, + -0.550586f, -1.12581f, -0.422132f, -0.0744868f, -2.4804f, + -1.07072f, -0.479006f, 0.101817f, -0.118947f, 0.341576f, + -1.0538f, -0.812346f, -1.13727f, -0.00939806f, 10.1571f, + -0.0441302f, 0.00280407f, -21.5044f, 0.0181152f, -0.0143246f, + 3.23462f, -1.38624f, -1.80416f, 4.89763f, -2.67364f, + 2.31771e-05f, 0.000393989f, 0.352204f, -0.193455f, 0.531455f, + 0.488757f, -0.442555f, -0.518528f, 0.431482f, -2.67727f, + -2.00626f, -0.39729f, -0.221494f, -0.0188888f, -0.0377649f, + -1.80169f, 0.0810332f, -0.0408335f, -1.28675f, -0.0353824f, + -0.666723f, -1.07281f, 0.252912f, -1.24547f, -1.7831f, + -1.14354f, -0.137662f, 0.00230182f, 0.736862f, 0.175872f, + -0.187556f, 0.43963f, -0.796524f, 0.056219f, -0.387874f, + 0.0710224f, -0.16548f, -0.100993f, 0.931481f, -3.20738f, + -0.0197576f, 0.266148f, -0.173909f, -0.337795f, -0.0682381f, + 0.176844f, 0.140286f, 1.12033f, 0.429064f, -2.24192f, + -1.54682f, 2.23646f, -0.0371138f, -0.0475339f, -3.21766f, + 0.0412858f, 0.387811f, 6.6711f, 0.140649f, 0.0559547f, + -0.802839f, 0.599977f, 0.64552f, -2.08103f, -0.503401f, + -0.0407036f, -0.0299199f, 0.0849445f, -0.111657f, -1.63462f, + 3.33762f, 0.0441394f, 0.0466889f, -0.951806f, 0.0723954f, + 0.00348661f, -1.36903f, 2.24625f, -0.0348915f, -0.0508893f, + -0.240891f, -0.120143f, -0.17991f, -2.09137f, 0.0150871f, + 0.0480333f, 1.72012f, 0.0309551f, -0.0370507f, -0.377075f, + 0.103916f, -0.0169255f, -0.0145395f, -4.02144f, 0.83193f, + -0.316502f, 6.3832f, -1.70038f, -1.97215f, -1.94501f, + 1.45479f, 0.711725f, -0.348496f, -0.279056f, -1.13396f, + -1.51744f, -0.853307f, 1.53131f, -0.0032358f, 1.41808f, + -1.32989f, -0.245221f, -0.161614f, -0.500845f, -0.449252f, + 0.0724151f, -0.116333f, -0.0946182f, -2.0945f, 0.0564572f, + 0.393261f, -1.06861f, -0.111458f, -0.839943f, -0.0880348f, + 0.0365742f, 0.415339f, -1.57494f, -0.713697f, 1.02349f, + -0.221371f, -0.0446281f, 1.89223f, -0.0811754f, -0.402773f, + -0.930987f, 0.0243194f, 0.0678332f, -0.0233014f, 0.165372f, + -0.44083f, -1.2404f, 0.35675f, -0.040916f, -0.0512548f, + -2.9071f, 0.861174f, -0.778133f, 2.14436f, -0.688427f, + -0.480371f, -1.69032f, 0.706687f, -0.281982f, -2.30451f, + 1.61541f, -0.0213638f, -0.740509f, -0.266677f, 0.0268434f, + -0.0116908f, -3.17595f, 0.0114825f, 0.0196997f, -0.144005f, + 0.0550181f, -0.851459f, -0.000285073f, -0.538441f, -0.0254868f, + -0.0104454f, -0.0661998f, -0.196469f, -0.346372f, -5.52892f, + -0.643683f, -0.622224f, -0.31463f, -0.555956f, -0.520132f, + -0.843166f, -2.59479f, -0.750195f, 0.00635995f, -0.338615f, + -0.216676f, -0.391544f, -1.62185f, -0.718471f, -0.475406f, + -0.782041f, -0.608824f, -1.09633f, -1.27308f, -0.560719f, + -0.207539f, -0.0196445f, -1.05519f, -0.575249f, -1.0642f, + 1.01615f, -0.873633f, -0.417953f, -0.428051f, 0.350259f, + -2.53833f, -2.72203f, 0.672846f, -0.503094f, 
-1.1374f, + 0.214291f, 0.013305f, 0.0112064f, 1.10532f, 0.030455f, + 0.0239614f, 0.628072f, 0.0539135f, -0.472441f, -0.688439f, + -0.32044f, -0.0234867f, -0.0158436f, -0.949314f, -0.0453161f, + -1.18306f, 0.626845f, -0.426925f, -0.688371f, 0.415062f, + 0.0640985f, -0.638387f, -2.01399f, -0.209744f, -0.762892f, + -0.0753296f, -0.879315f, -0.520433f, -0.111375f, 0.389742f, + -0.398862f, -0.643227f, -0.246396f, 0.0317051f, 1.06973f, + 0.413617f, 0.180506f, -0.0507897f, -0.00650435f, 0.620892f, + 0.046312f, 0.475032f, 0.906993f, -0.0388061f, -0.256271f, + -1.03323f, 0.0125266f, -0.31116f, -0.377611f, -0.0386407f, + -0.0232745f, -0.353644f, -2.27289f, 0.0571779f, -0.00865006f, + 1.65101f, 0.0175711f, 0.0184585f, 0.558458f, 0.2213f, + -0.285089f, 0.433445f, -0.427177f, -0.0103682f, -0.0101273f, + 0.214085f, -0.0459885f, 0.00761981f, 0.836381f, 0.0175293f, + 0.02508f, -1.51778f, 0.0143956f, -0.162589f, 0.595418f, + 0.21445f, -0.0335848f, -0.0136684f, -0.16686f, -0.14612f, + 0.0816238f, 0.499636f, 0.12458f, -2.41673f, -0.261721f, + -0.676805f, -1.88366f, 0.730462f, 0.69196f, -0.0288489f, + -2.38272f, 0.329876f, 0.014517f, -0.115145f, -3.48151f, + -0.00209072f, -0.0732377f, 0.820443f, -0.0118701f, 0.112145f, + 0.272315f, 0.137531f, -0.0200997f, -0.0397883f, -2.19458f, + 0.183554f, -0.639716f, 0.481605f, -0.621639f, -0.0980299f, + -0.710534f, -0.143105f, -6.77626f, -1.65139f, -2.37718f, + -0.533127f, -1.12574f, 3.34182f, -0.0758663f, 0.0334238f, + -9.48647f, 0.0674974f, 0.0507665f, 0.523007f, -0.0668f, + 0.5736f, -0.589761f, -1.1692f, -0.0236497f, -0.00828928f, + -0.265823f, 1.15284f, 0.307927f, -0.695308f, 0.13725f, + -0.20394f, -0.363965f, -0.331159f, -1.50927f, -1.20051f, + -0.0205825f, -0.0381859f, -0.0579876f, -1.6913f, -1.94626f, + 3.4214f, 3.3922f, -2.13798f, -0.679848f, -0.890735f, + 0.235017f, -0.253202f, -1.0571f, 1.40354f, 0.00719052f, + -1.54365f, -0.7289f, -1.05492f, 0.0238169f, -0.00543592f, + -0.0510353f, -0.175386f, -0.724207f, -0.788936f, 0.039976f, + 1.36966f, 0.869475f, -0.0302774f, -0.0537556f +}; + +static const NN_CONFIG av1_max_part_pred_nn_config = { + NUM_FEATURES, + NUM_LOGITS, + NUM_HIDDEN_LAYERS, + { + NUM_LAYER_0_UNITS, + }, + { + av1_max_part_pred_layer_0_kernel, + av1_max_part_pred_logits_kernel, + }, + { + av1_max_part_pred_layer_0_bias, + av1_max_part_pred_logits_bias, + }, +}; + +#undef NUM_HIDDEN_LAYERS +#undef NUM_FEATURES +#undef NUM_LAYER_0_UNITS +#undef NUM_LOGITS + +// Early termination in second pass +static const float av1_simple_motion_search_term_none_mean_128[28] = { + 12.661922f, 12.638062f, 10.896497f, 10.865719f, 10.978963f, 10.940105f, + 11.012235f, 10.972760f, 11.069924f, 11.018533f, 11.773865f, 11.747426f, + 11.891315f, 11.858107f, 11.793916f, 11.766356f, 11.874997f, 11.840164f, + 5.940535f, 0.770746f, 4.292692f, 4.309581f, 0.848423f, 4.292334f, + 4.298179f, 8.514713f, 14.911736f, 19.825352f, +}; + +static const float av1_simple_motion_search_term_none_std_128[28] = { + 1.796731f, 1.797056f, 1.898383f, 1.900753f, 1.846624f, 1.846953f, 1.906632f, + 1.908089f, 1.836533f, 1.835967f, 1.840262f, 1.840671f, 1.816836f, 1.817103f, + 1.879846f, 1.881333f, 1.803102f, 1.802654f, 2.263402f, 0.420354f, 1.117165f, + 1.083779f, 0.358611f, 1.101183f, 1.084938f, 2.462638f, 1.577009f, 1.574711f, +}; + +static const float av1_simple_motion_search_term_none_mean_64[28] = { + 10.904455f, 10.853546f, 9.247903f, 9.184479f, 9.251985f, 9.186686f, + 9.253490f, 9.190190f, 9.270079f, 9.204357f, 10.086511f, 10.031060f, + 10.100875f, 10.045429f, 10.069688f, 10.013173f, 
10.082980f, 10.024640f, + 4.888378f, 0.878113f, 3.598450f, 3.628491f, 0.925833f, 3.560971f, + 3.573322f, 8.807137f, 13.348477f, 18.269117f, +}; + +static const float av1_simple_motion_search_term_none_std_64[28] = { + 1.789300f, 1.787061f, 1.823519f, 1.820226f, 1.794643f, 1.788620f, 1.797194f, + 1.795135f, 1.777795f, 1.773634f, 1.794000f, 1.790377f, 1.772197f, 1.769692f, + 1.819050f, 1.817139f, 1.793577f, 1.789333f, 1.998251f, 0.327156f, 0.885748f, + 0.853767f, 0.262043f, 0.902435f, 0.860033f, 1.224865f, 1.603411f, 1.589296f, +}; + +static const float av1_simple_motion_search_term_none_mean_32[28] = { + 9.818970f, 9.751199f, 8.015079f, 7.927318f, 8.029113f, 7.938330f, 8.012570f, + 7.923719f, 8.033508f, 7.941911f, 8.933057f, 8.857422f, 8.935639f, 8.859187f, + 8.905495f, 8.829741f, 8.929428f, 8.851351f, 4.114069f, 0.954752f, 2.645082f, + 2.709703f, 0.964678f, 2.652077f, 2.673393f, 9.430499f, 11.922798f, 16.942251f, +}; + +static const float av1_simple_motion_search_term_none_std_32[28] = { + 1.737107f, 1.734327f, 1.727923f, 1.720244f, 1.721570f, 1.712775f, 1.718028f, + 1.710370f, 1.711612f, 1.702596f, 1.754856f, 1.748855f, 1.741871f, 1.736304f, + 1.722428f, 1.717380f, 1.713563f, 1.707582f, 1.761170f, 0.207847f, 0.900058f, + 0.862356f, 0.184593f, 0.903822f, 0.856120f, 1.529199f, 1.412085f, 1.453153f, +}; + +static const float av1_simple_motion_search_term_none_mean_16[28] = { + 8.998877f, 8.912468f, 7.085255f, 6.953476f, 7.086386f, 6.954091f, 7.088727f, + 6.955747f, 7.093955f, 6.960635f, 8.065050f, 7.961432f, 8.071631f, 7.967233f, + 8.041699f, 7.937715f, 8.046791f, 7.942183f, 3.833521f, 0.978421f, 1.901347f, + 1.950124f, 0.979418f, 1.928000f, 1.936727f, 9.773951f, 10.735227f, 15.949769f, +}; + +static const float av1_simple_motion_search_term_none_std_16[28] = { + 1.641193f, 1.640172f, 1.614794f, 1.608906f, 1.609571f, 1.603580f, 1.606928f, + 1.601246f, 1.599230f, 1.593529f, 1.633747f, 1.630219f, 1.625695f, 1.622547f, + 1.633827f, 1.630182f, 1.626607f, 1.622777f, 1.548838f, 0.145303f, 0.744550f, + 0.736552f, 0.141980f, 0.742979f, 0.736977f, 1.366255f, 1.258794f, 1.294309f, +}; + +static const float av1_simple_motion_search_term_none_model_128[] = { + -0.6106842357f, -1.0402954455f, 0.6054417656f, -0.2116623578f, + 0.2447714930f, 0.3782256209f, 0.5095592479f, -0.3275620904f, + 0.3886188013f, 0.2629499420f, -0.1979599415f, -0.5389565605f, + 0.1209207902f, -0.4913347466f, 0.3798542731f, -0.2812861709f, + -0.1049824167f, -0.1088672020f, 0.4059596517f, -0.1347896613f, + 0.2276868621f, 0.0506386970f, 0.0071088411f, 0.0467952100f, + 0.2091247458f, -0.7371964736f, 0.1368935545f, 0.3175247786f, + -0.5493146094f, +}; + +static const float av1_simple_motion_search_term_none_model_64[] = { + -0.4150046575f, -0.3954358561f, 0.1997997444f, 0.3395826831f, + 0.2827215753f, 0.3395683652f, 0.2483140395f, 0.2722216476f, + 0.2610308009f, 0.3724974359f, -0.0551479654f, -0.1721616359f, + -0.3459358629f, -0.0952524186f, -0.1428993840f, -0.0415654914f, + -0.3169539902f, -0.0269429900f, 0.9891530919f, -0.0125084982f, + 0.0972182377f, 0.0008889801f, 0.0205418050f, 0.0057237854f, + 0.1005222691f, -0.2851321920f, -1.5150336445f, 0.1893942436f, + -0.4337360901f, +}; + +static const float av1_simple_motion_search_term_none_model_32[] = { + -0.4667392852f, -0.3893302767f, 0.1603498635f, 0.2304974726f, + 0.1404975592f, 0.2505516225f, 0.1423053884f, 0.2189318406f, + 0.1379765409f, 0.2638241296f, -0.1342865463f, -0.0549054345f, + -0.1925223436f, -0.1142702769f, 0.0127811659f, 0.0868639997f, + -0.0643197251f, 0.0279496470f, 
0.9904395769f, -0.0095178685f, + 0.1179410649f, -0.0013411972f, 0.0095060660f, 0.0195730400f, + 0.0779717771f, -0.2498860763f, -0.8168817125f, -0.4798397348f, + -0.6609679881f, +}; + +static const float av1_simple_motion_search_term_none_model_16[] = { + -0.3021081992f, -0.4620153673f, 0.0448577479f, 0.1738455035f, + 0.0663209177f, 0.1629614573f, 0.0555168744f, 0.1631870212f, + 0.0425805150f, 0.1688564954f, 0.0434083772f, -0.0046603915f, + -0.0271580056f, -0.0183879127f, 0.1073730471f, 0.0314201476f, + 0.0576891756f, 0.0119723753f, 0.9084332022f, -0.0188429077f, + 0.0755089811f, -0.0172550234f, 0.0037663075f, 0.0022094472f, + 0.0500247894f, -0.2944572004f, -0.8908521199f, -0.2555515792f, + -0.5396254205f, +}; + +#define FEATURES 31 +#define HIDDEN_NODES 32 +static const float av1_early_term_after_split_nn_weights_64_layer0[] = { + -0.306296f, -0.691664f, 0.335148f, -0.298465f, -0.509241f, -0.632796f, + -0.527979f, -0.009904f, -0.503646f, -0.494002f, -0.575101f, 0.239911f, + -0.413312f, -0.622825f, -0.405448f, -0.419103f, -0.505903f, -0.392550f, + -0.240293f, 0.121749f, -0.489777f, -0.756647f, 0.001047f, -0.016528f, + 0.145714f, 0.172910f, 0.086197f, 0.162882f, -0.070588f, -0.077104f, + 0.502730f, -0.244954f, 0.265605f, -0.323994f, 0.223397f, -1.086453f, + 0.391886f, 0.200343f, 0.253878f, 0.018925f, 0.201819f, -0.205136f, + 0.427314f, 0.041155f, 0.070484f, 0.159925f, -0.057095f, -0.146544f, + -0.073792f, 0.152628f, 0.003986f, -0.515965f, -0.209754f, 0.037457f, + 0.070622f, -0.143571f, -0.059602f, 0.111734f, 0.319674f, 0.149894f, + -0.219883f, 0.206678f, 0.015809f, -0.210549f, 0.130156f, -0.189502f, + -0.850392f, -0.156363f, -0.060354f, 0.189044f, 0.266495f, 0.151305f, + -0.563677f, -0.354896f, 0.300637f, 0.257568f, -0.008359f, -0.535497f, + -0.003127f, 0.293054f, -0.020212f, -0.157278f, 0.229972f, -0.309799f, + -0.329927f, -0.077140f, 0.001177f, -0.024415f, 0.134044f, -0.181587f, + -0.135380f, 0.230989f, -0.281451f, 0.912282f, 0.511562f, -3.900779f, + -0.039917f, 1.956406f, -0.357589f, 0.292998f, -0.950158f, 0.422041f, + 0.526572f, 0.605746f, -0.147110f, 0.256576f, 0.090010f, 0.221641f, + 0.029763f, 0.351592f, 0.458324f, -0.005888f, 0.010521f, -0.389326f, + -0.094006f, -0.171489f, -0.013153f, 0.026333f, -0.454571f, -1.932891f, + -0.168211f, 0.051298f, -0.258061f, -0.028936f, -0.555937f, -0.475566f, + -0.304046f, -0.318113f, 0.099697f, -0.217145f, 0.139433f, -0.203986f, + -0.164012f, 0.051527f, 0.138603f, -0.085100f, -0.082887f, -0.242955f, + -0.663410f, -0.535772f, -0.181665f, -0.197883f, 0.071319f, 0.135086f, + 0.146200f, 0.184827f, -0.199041f, 0.162570f, -0.300167f, 0.017748f, + -0.140111f, 0.103553f, 0.206929f, 0.193446f, 0.123141f, -1.201898f, + -0.052254f, -0.750121f, 0.111741f, 0.204092f, -0.166266f, 0.124008f, + -0.455496f, 0.306035f, 0.275903f, 0.193599f, -0.730011f, 0.126808f, + 0.051059f, 0.103634f, -0.044334f, 0.048889f, 0.405228f, 0.574099f, + 0.061167f, 0.260576f, 0.070032f, -0.038040f, 0.229183f, -0.243269f, + -0.130116f, -0.538563f, -0.070199f, -0.129249f, -0.205153f, -0.268530f, + -0.290828f, -0.233006f, 0.068712f, 0.618085f, -0.407008f, 0.686868f, + 0.172247f, 0.826287f, -0.002672f, 0.239825f, -0.051548f, 0.420773f, + 0.218747f, 0.041057f, -0.071189f, 0.286987f, -0.113915f, 0.122561f, + 0.013979f, -0.049046f, 0.148175f, 0.031313f, -0.248601f, 0.209488f, + 0.069008f, 0.072763f, 0.332475f, 0.079986f, -0.151042f, -0.205110f, + -0.155550f, -0.510408f, 0.330429f, 0.577729f, 0.266524f, -0.378489f, + 0.228204f, 0.055318f, 0.117583f, -0.588557f, -0.778201f, 0.434622f, + -0.227820f, 
0.611642f, 0.170548f, 0.817761f, 0.006642f, -1.005794f, + -0.911490f, 1.633684f, -0.290664f, 0.308128f, 0.295986f, 0.243377f, + -0.001275f, -0.131156f, 0.275205f, -0.041865f, -0.201951f, -0.016380f, + 0.336604f, -0.258118f, 0.890810f, 0.441065f, -0.968006f, 0.135989f, + -1.447191f, 0.353426f, -0.343235f, 0.376837f, -0.071602f, -0.319639f, + -0.072347f, 0.547450f, -0.215380f, 0.182141f, -0.066186f, 0.033787f, + 0.257482f, 0.217428f, -0.130249f, 0.057525f, 0.263991f, 0.230664f, + -0.245113f, 0.048610f, -0.079955f, 0.251737f, -0.070368f, -0.017968f, + -0.151815f, 0.025945f, -0.257769f, 0.299735f, 0.077263f, -0.565526f, + 0.326263f, 0.096429f, 0.113414f, 0.092754f, -0.141908f, 0.172060f, + 0.393117f, -0.216755f, 0.331051f, -0.363369f, -0.113363f, -0.095164f, + -0.072784f, 0.214572f, 0.010993f, 0.209456f, 0.260381f, -0.314747f, + -0.422173f, -0.189963f, -0.225130f, 0.339448f, 0.153814f, 0.265616f, + -0.103575f, -0.123841f, -0.106236f, 0.155894f, -0.156264f, -1.361406f, + -0.040736f, -0.614998f, -0.468200f, -0.266505f, -0.342786f, -0.908088f, + 0.105758f, 0.040788f, -0.313589f, -1.359318f, 0.071329f, 0.176404f, + -0.476141f, 0.010108f, -0.201440f, -0.221167f, -0.197448f, -0.013927f, + -0.610270f, -0.607285f, 0.178070f, 0.174320f, 0.313115f, 0.026191f, + -0.112330f, 0.122338f, -0.367751f, 0.196794f, 0.153709f, -0.205454f, + -0.397471f, -1.879336f, -0.030129f, 0.143429f, -0.079832f, 0.435259f, + -1.729539f, 0.518301f, -0.141393f, 0.199399f, -1.914601f, 0.142865f, + -0.219899f, 0.508458f, 0.086365f, -0.220740f, -0.012507f, 1.263320f, + 0.042136f, 0.050922f, -0.329644f, -0.188198f, 0.251522f, 0.394731f, + -0.047866f, -0.260853f, -0.267207f, -0.248489f, 0.146474f, 0.359257f, + -0.427732f, -0.100652f, 0.192129f, 0.075572f, 0.916708f, 0.255747f, + 0.486384f, 0.127989f, -0.556449f, -0.484913f, 0.392298f, 0.045401f, + -0.839551f, -0.703619f, 0.069263f, -0.040720f, 0.542265f, 0.443739f, + 0.862552f, -0.021726f, 0.230858f, -0.261004f, -0.125697f, -0.106435f, + 0.002341f, 0.013904f, 0.011034f, 0.542296f, -0.284325f, 0.135736f, + 0.113882f, 0.040610f, -0.255485f, 0.224061f, -0.087140f, 0.127872f, + -0.002638f, 0.164889f, -0.335958f, -0.031166f, -0.393581f, 0.075455f, + 0.055995f, 0.087934f, -0.133859f, -0.342187f, 0.002492f, -0.340722f, + 0.058304f, 0.104165f, -0.142136f, -0.351111f, -0.158037f, -0.079924f, + -0.253209f, -0.092840f, -0.174646f, -0.202772f, -0.353438f, -0.031111f, + 0.076088f, -0.232091f, -0.070052f, 0.097595f, 0.063173f, -0.211195f, + 0.126478f, -0.178828f, 0.278723f, -0.070807f, -0.179783f, 0.034123f, + 0.035721f, -0.200431f, 0.170640f, 0.107933f, 0.226594f, -0.301499f, + -0.291096f, 0.228076f, -0.272951f, 0.002490f, -0.210707f, -0.128033f, + -0.194009f, -0.011347f, -0.256694f, -0.011841f, -0.005167f, -0.163203f, + -0.253796f, -0.198877f, -0.055827f, -0.882685f, -0.443471f, 0.349601f, + 0.749334f, -1.161845f, 0.505480f, 0.221733f, 0.210490f, -0.234984f, + 0.014183f, -0.510401f, 0.238692f, -0.134111f, 0.083844f, -0.478751f, + -0.088434f, 0.304063f, 0.150336f, -0.749682f, -0.081999f, 0.729739f, + 0.412508f, 0.132571f, 0.058306f, -0.047451f, -0.117435f, -0.445395f, + -0.005182f, -0.025757f, 0.175051f, -0.258194f, -0.150311f, -0.196533f, + -1.314316f, -0.428627f, 0.512451f, 0.045138f, -0.200925f, 0.081538f, + -0.346151f, -0.358197f, -0.422258f, -0.028542f, -0.383534f, -0.026163f, + -0.419858f, -0.154321f, 0.376970f, 0.094017f, 0.783520f, 0.110641f, + 0.077966f, -0.093064f, 0.160522f, -0.863041f, 0.086210f, 0.560764f, + 0.057032f, 0.159224f, 0.323068f, -0.173109f, 0.014042f, -0.126856f, + 
-0.128237f, -0.245273f, -0.317312f, -0.257597f, -0.181977f, 0.259485f, + -0.215834f, 0.062076f, -0.270596f, 0.271581f, -0.153486f, -0.247165f, + 0.079737f, -0.157049f, -0.027459f, -0.299397f, 0.136729f, -0.334192f, + -0.191722f, 0.145865f, -0.031324f, -0.307165f, -0.244923f, -0.228027f, + 0.063807f, 0.054965f, -0.005709f, -0.041977f, -0.276245f, 0.020003f, + 0.133323f, -0.145992f, -0.951030f, 0.414083f, -1.063323f, 0.137872f, + 0.104732f, -0.123728f, 0.542532f, 0.213654f, 0.542954f, 0.155619f, + 0.543072f, 0.399067f, 0.191402f, -0.102552f, -0.176734f, -0.136776f, + -0.012814f, -0.021298f, -0.802467f, -0.957481f, -0.238787f, -0.138482f, + 0.058331f, 0.126601f, 0.104420f, -0.148684f, 0.343218f, 0.093604f, + -0.055642f, -0.383918f, -0.045250f, -0.090480f, -0.155464f, 0.278299f, + 0.042791f, -0.029084f, -0.373861f, -0.073233f, -0.085172f, 0.186841f, + -0.070898f, -0.156415f, 0.112831f, -0.065931f, -0.353007f, 0.058453f, + -0.136982f, 0.233393f, 0.017240f, -0.018428f, 0.229104f, -0.371440f, + -0.262212f, 0.203075f, -0.263293f, 0.034413f, -0.299354f, 0.227269f, + 0.204977f, -0.118107f, -0.359832f, -0.068252f, 0.480105f, -0.214711f, + -0.614381f, 0.209048f, -0.456014f, -0.188819f, -0.220995f, -0.322104f, + -0.191457f, 0.420874f, -0.454919f, 0.023119f, 0.291700f, -0.532885f, + -0.032642f, 0.043271f, 0.133974f, 0.002399f, -0.179899f, -0.044158f, + -0.027078f, -0.350075f, 0.236766f, 0.346771f, -0.118534f, -0.421221f, + 0.019544f, 0.109349f, 0.141517f, 0.403561f, 0.409102f, 0.054555f, + -0.561751f, 0.577183f, -0.705156f, -0.231188f, -1.969772f, 0.172289f, + -0.048122f, 0.205671f, -0.667130f, -0.066870f, 0.202838f, -0.095538f, + -0.842651f, 0.254170f, 0.046256f, -0.271891f, -0.369254f, 0.492101f, + 0.001189f, -0.186525f, 0.188470f, -0.207072f, 0.030086f, -0.132904f, + 0.127001f, 0.116662f, -0.079246f, 0.227241f, -0.462178f, 0.446304f, + -1.660753f, 0.241832f, -0.288040f, 0.054663f, -0.435804f, 0.296782f, + -0.026421f, -0.115618f, 0.163416f, 0.834001f, 0.008019f, -0.014243f, + 0.524658f, 0.067894f, -0.253936f, -0.100657f, 1.285389f, -0.005952f, + 0.087134f, -0.088375f, -0.121866f, -0.171172f, 0.279463f, -0.598593f, + -0.727761f, 0.189831f, -0.822575f, -0.291141f, -0.012410f, -0.069999f, + 0.098842f, -0.218513f, 0.009494f, 0.100106f, -0.402884f, -0.299236f, + -0.345668f, -0.057739f, -0.213248f, -0.426661f, -0.360268f, -0.349860f, + -0.382177f, -0.357802f, -0.032030f, -0.110597f, -0.155442f, -0.418794f, + -0.012113f, -0.032962f, -0.450648f, 0.129060f, -0.135227f, -0.298593f, + 0.001435f, 0.278790f, -0.272945f, 0.162759f, -0.290208f, 0.058481f, + -0.490971f, 0.019630f, -0.210347f, 0.000520f, -0.340413f, 0.641562f, + 0.023104f, 0.194832f, -0.441894f, -0.253538f, -0.228332f, 0.423264f, + -1.094073f, -0.475657f, -0.238752f, 0.033910f, 0.440425f, 0.036320f, + 0.566989f, -0.065326f, -0.297939f, 0.406098f, 0.529561f, -0.113084f, + 0.141472f, -0.024462f, -0.179212f, 0.187801f, -0.235787f, -0.229624f, + 0.357791f, 0.061110f, -0.607788f, -1.713694f, -0.651041f, 1.734283f, + -0.334701f, 0.161687f, 0.010215f, 0.320708f, 0.169447f, 0.513558f, + 0.488340f, -0.619036f, -0.525441f, -1.144352f, -0.546154f, 0.669973f, + 0.327028f, -0.100539f, 0.012048f, -0.223013f, -0.239680f, 0.323035f, + 0.165950f, -0.155110f, 0.128664f, -0.157378f, -0.124490f, 0.291553f, + 0.055849f, -0.221664f, 0.077770f, -0.350658f, -0.181939f, 0.110230f, + -0.078219f, 0.007472f, -0.031620f, 0.007708f, -0.201794f, 0.017594f, + -0.027480f, 0.058884f, -0.369166f, -0.369770f, 0.181635f, -0.183318f, + -0.389184f, -0.256661f, 0.160107f, 0.037127f, 
-0.082573f, -0.095815f, + -0.322782f, 0.072528f, -0.348875f, 0.216247f, -0.161757f, -0.385502f, + -0.315738f, 0.020123f, -0.155609f, 0.114403f, -0.383232f, 0.629529f, + 0.066142f, 0.448392f, -0.389557f, -0.083315f, 0.829535f, -0.015531f, + -0.050728f, -0.325127f, 0.812992f, -0.196780f, 0.021060f, -0.952647f, + 0.006687f, -0.512715f, -0.066778f, 0.410067f, -0.116945f, -0.288283f, + 0.189334f, -0.083153f, 0.159980f, -0.068208f, 0.107358f, -0.154411f, + -0.068914f, 0.186816f, 0.032251f, 0.109242f, 0.134825f, 0.035101f, + -0.253175f, 0.157309f, -0.363597f, -0.138176f, -0.334141f, -0.172697f, + 0.045800f, -0.286057f, 0.173403f, -0.172444f, -0.117996f, -0.383848f, + -0.173303f, -0.258482f, -0.021404f, -0.017898f, -0.001970f, 0.003273f, + 0.056121f, 0.155046f, 0.044708f, -0.295609f, -0.211688f, -0.233229f, + -0.264980f, 0.145549f, 0.045323f, -0.027112f, 0.175638f, -0.207251f, + -0.055274f, 0.092706f, 0.086200f, -0.241340f, -0.147416f, 0.024510f, + -0.357194f, -0.181944f, -0.050104f, -0.079024f, -0.290473f, -0.169790f, + -0.277982f, -0.017781f, -0.004854f, -0.094132f, -0.348555f, 0.199291f, + -0.343989f, -0.319299f, -0.268935f, -0.021208f, 0.020938f, -0.090609f, + 0.006595f, -0.200790f, 0.171856f, -0.027766f, -0.032017f, -0.006745f, + 0.566426f, -0.096850f, 0.727633f, -0.408065f, -0.012436f, 0.005646f, + -0.305148f, -0.095075f, -0.391549f, -0.020378f, -0.236498f, -0.252773f, + -0.231385f, -0.203175f, 0.041903f, -0.373694f, 0.058239f, -0.101116f, + 0.183772f, 0.164523f, -0.099046f, -0.201272f, -0.394523f, -0.157517f, + 0.032079f, -0.381173f, -0.238496f, -0.037990f, -0.294553f, 0.141473f, + 0.100268f, -0.023806f, 0.004978f, 0.184916f, 0.142699f, -0.113240f, + -0.213364f, -0.160059f, -0.216263f, -0.406387f, -0.301140f, -0.406355f, + -0.113085f, -0.279699f, -0.267434f, 0.126263f, -0.260527f, -0.153904f, + -0.494653f, -0.355144f, 0.030549f, -0.216400f, -0.123363f, 0.189090f, + 0.219122f, 0.096677f, -0.202037f, -0.014489f, -0.137859f, -0.114184f, + -0.279423f, -0.270683f, +}; + +static const float av1_early_term_after_split_nn_bias_64_layer0[] = { + -0.491455f, 0.464538f, -0.005742f, -0.219951f, -0.073682f, 0.102027f, + 0.567071f, 0.441402f, 0.277521f, 0.314498f, -0.448199f, -0.065032f, + 0.488139f, -0.079632f, 0.000000f, 0.521555f, -0.151950f, -0.034616f, + 0.393438f, -0.072242f, -0.087343f, -0.571308f, 0.017372f, -0.126144f, + 0.372261f, -0.451537f, -0.140238f, -0.092377f, -0.074475f, -0.068879f, + -0.109614f, -0.164492f, +}; + +static const float av1_early_term_after_split_nn_weights_64_layer1[] = { + -0.373195f, -0.283141f, 0.416113f, 0.483659f, 0.230583f, 0.349197f, + -0.168582f, -0.813338f, -0.472369f, -0.173872f, 1.297845f, 0.339355f, + -0.828033f, 0.019617f, 0.118757f, -0.619360f, 0.282295f, -0.054116f, + -0.730596f, 0.068567f, -0.248707f, 0.461225f, 0.330224f, -0.287080f, + -0.458103f, 0.591852f, -0.008491f, 0.632119f, -0.007872f, 0.007869f, + -0.230698f, -0.011437f, +}; + +static const float av1_early_term_after_split_nn_bias_64_layer1[] = { + -0.55403697f, +}; + +static const NN_CONFIG av1_early_term_after_split_nnconfig_64 = { + FEATURES, + 1, + 1, + { + HIDDEN_NODES, + }, + { + av1_early_term_after_split_nn_weights_64_layer0, + av1_early_term_after_split_nn_weights_64_layer1, + }, + { + av1_early_term_after_split_nn_bias_64_layer0, + av1_early_term_after_split_nn_bias_64_layer1, + }, +}; + +static const float av1_early_term_after_split_nn_weights_32_layer0[] = { + 0.026050f, -0.226531f, 0.308107f, -0.083744f, 0.201785f, 0.098562f, + 0.147595f, -0.495771f, -0.245741f, 0.201616f, -0.272070f, 
-0.579545f, + -0.127261f, -0.229588f, 0.250831f, -0.176929f, -0.031689f, 0.284718f, + 0.085845f, -0.285027f, 0.012304f, 0.382402f, -0.204591f, 0.272514f, + -0.065854f, -0.054228f, -0.231174f, -0.174504f, 0.258287f, 0.195689f, + 0.242530f, 0.023528f, -0.294242f, -0.272132f, 0.460180f, -0.731281f, + -0.208103f, 0.208204f, 0.348250f, 0.016328f, 0.043707f, -0.169551f, + 0.108521f, 0.226895f, -0.020471f, 0.102443f, 0.429640f, -0.252555f, + -0.218434f, -0.163665f, 0.175531f, 0.101588f, -0.135798f, -0.158102f, + 0.142565f, 0.128277f, 0.174985f, -0.100073f, 0.113967f, 0.223682f, + -0.145576f, -0.008443f, 0.112748f, -0.037845f, 0.076954f, -0.287137f, + -0.518185f, -0.106833f, 0.175359f, 0.031408f, 0.219069f, -0.294440f, + 0.007766f, 0.067754f, -0.049168f, -0.212368f, -0.261708f, 0.309252f, + 0.220859f, -0.274852f, -0.653157f, 0.083438f, -0.265386f, 0.174429f, + -0.116931f, -0.091594f, -0.244897f, -0.089015f, 0.274453f, 0.212890f, + 0.272053f, -0.425315f, -0.107726f, 0.294444f, -0.354629f, 0.104402f, + -0.307663f, 0.558430f, 0.140334f, -0.054831f, -0.449456f, 0.058274f, + -0.033768f, -0.354117f, -0.331618f, -0.411772f, 0.232064f, -0.079297f, + -0.638571f, 0.181823f, -0.039611f, 0.206310f, -0.659157f, -0.102930f, + -0.067303f, -0.176881f, -0.001038f, 0.091835f, 0.079739f, -0.121923f, + 0.211070f, 0.362719f, -0.154915f, -0.151876f, -0.165460f, 0.023469f, + -0.251036f, 0.210014f, -0.537125f, 0.156832f, -0.216987f, 0.062975f, + -0.198462f, 0.329123f, 0.125870f, 0.225830f, 0.086377f, -0.128773f, + -0.179673f, -0.074612f, 0.456645f, 0.021905f, -0.243140f, 0.059145f, + -0.273942f, -0.277822f, 0.154556f, -0.025459f, 0.227614f, -0.313076f, + 0.044705f, -0.019017f, 0.108999f, -0.020243f, -0.016373f, 0.560270f, + -0.064818f, 0.050880f, -0.218458f, 0.825699f, -0.534056f, -0.258253f, + 0.222073f, 0.013295f, 0.477870f, -0.386727f, 0.388509f, 0.004128f, + 0.451388f, -0.175788f, 0.264093f, -0.109812f, 0.358132f, 0.500992f, + -0.446933f, -0.222397f, 0.345834f, 0.370943f, -0.233115f, -0.047005f, + -0.111335f, -0.111586f, 0.026975f, -0.052191f, -0.111800f, -0.129782f, + 0.225132f, 0.102524f, 0.544557f, -0.111674f, -0.857884f, 0.133258f, + 0.310001f, 0.043829f, 0.104143f, 0.256493f, 0.242520f, -0.342082f, + 0.421447f, 0.124227f, 0.061542f, -0.090206f, 0.316681f, 0.353452f, + -0.918408f, -0.001903f, -0.052303f, -0.004816f, -0.446393f, -0.053038f, + 0.255725f, -0.126346f, 0.034095f, -0.240276f, -0.135918f, 0.095682f, + -0.147457f, -0.338216f, -0.200426f, 0.010265f, -0.243915f, -0.231375f, + -0.323924f, -0.014353f, 0.150252f, -0.264346f, 0.205303f, -0.194610f, + -0.282527f, 0.180555f, -0.000087f, 0.027240f, -0.000903f, -0.345877f, + -0.353274f, -0.311829f, 0.172985f, -0.111748f, -0.309380f, 0.108110f, + -0.260914f, -0.164990f, 0.183625f, -0.319692f, -0.096988f, 0.094147f, + -0.047062f, -0.080978f, 0.227387f, -0.000450f, -0.220159f, -0.211448f, + -0.020885f, -0.139646f, -0.086721f, 0.067928f, -0.033084f, -0.251996f, + 0.090317f, 0.086313f, -0.228420f, -0.111356f, -0.314304f, -0.223664f, + 0.188176f, -0.002360f, -0.029491f, -0.006000f, -0.075343f, 0.173699f, + -0.272800f, -0.238507f, -0.272071f, -0.015000f, -0.215305f, -0.192943f, + -0.038595f, 0.119537f, 0.260477f, -0.168014f, -0.172751f, 0.532861f, + -0.753250f, -0.017485f, -0.115541f, -0.109291f, -1.098943f, 0.418559f, + -0.532110f, 0.359323f, -0.254786f, 0.471316f, -0.545024f, 0.291912f, + -0.836939f, 0.443427f, -0.441709f, 0.168866f, -0.140372f, 0.546607f, + -0.315465f, 0.023328f, 0.137709f, -0.083492f, -0.049986f, -0.071302f, + -0.293680f, -0.105049f, 0.315317f, 
0.279569f, 0.220762f, 0.088161f, + -0.756456f, -0.074512f, 0.958318f, -0.332924f, -0.004906f, -0.629271f, + 0.212050f, 0.279123f, 0.311523f, -0.599580f, 0.516150f, 0.456952f, + 0.020255f, 0.247290f, -0.182670f, -0.335554f, 0.021203f, 0.131081f, + -0.208584f, 0.112530f, -0.198980f, 0.211583f, -0.101271f, -0.206453f, + -0.502688f, -0.294976f, -0.187019f, -0.114473f, 0.282050f, -0.165483f, + 0.094953f, -0.182578f, 0.055068f, 0.135605f, -0.266941f, -0.297556f, + 0.199181f, 0.015979f, -0.158659f, -0.226841f, 0.171306f, 0.013438f, + -0.286309f, -0.071753f, -0.170300f, -0.238188f, 0.093572f, -0.026230f, + -0.254502f, -0.297786f, -0.063480f, -0.300799f, -0.065644f, 0.074710f, + 0.248576f, -0.144425f, -0.113948f, -0.247297f, 0.276682f, 0.010963f, + -0.737786f, 0.026347f, 0.007830f, 0.753543f, 0.371904f, 0.305614f, + 0.105028f, 0.073530f, -0.119137f, 0.102352f, -0.080523f, 0.176366f, + -0.159457f, -0.339948f, 0.360131f, -0.007051f, -0.388378f, -0.101695f, + 0.663041f, -0.234486f, -0.142536f, -0.099931f, 0.041478f, 0.230425f, + 0.005743f, 0.154060f, 0.056233f, -0.080668f, -0.009754f, -0.194356f, + 0.185474f, -0.296474f, 0.192700f, 0.257767f, 0.348529f, 0.458265f, + 0.060276f, -0.130473f, 0.139889f, 0.310073f, -0.306869f, -0.272922f, + -0.259862f, 0.409207f, 0.431991f, -0.100357f, -0.050415f, -0.071830f, + -0.239665f, 0.153399f, 0.177192f, -0.611644f, -0.176114f, -0.022694f, + -0.033701f, -0.345842f, 0.015660f, 0.158931f, -0.097586f, 0.222001f, + 0.257887f, -0.171307f, -0.222607f, -0.245508f, -0.145742f, -0.096461f, + -0.010895f, 0.052815f, -0.265306f, -0.081059f, 0.219162f, -0.256084f, + -0.372676f, 0.148977f, 0.174831f, 0.086980f, 0.108518f, 0.074011f, + 0.038032f, -0.070856f, -0.109407f, 0.126174f, 0.022341f, -0.249786f, + -0.356164f, -0.202841f, -0.087437f, -0.133740f, 0.090956f, -0.017953f, + -0.028353f, 0.233621f, 0.109426f, 0.232798f, -0.104950f, -0.241798f, + -0.018995f, -0.167954f, 0.002473f, 0.060418f, -0.232717f, -0.195980f, + -0.283971f, -0.371881f, 0.219728f, 0.018072f, -0.166694f, -0.083301f, + -0.000616f, -0.212641f, -0.173158f, 0.222739f, -0.235302f, 0.237624f, + 0.222232f, -0.041235f, -0.342411f, 0.121194f, 0.211291f, -0.032237f, + -0.249401f, -0.291668f, 0.206055f, -0.148200f, 0.011824f, -0.272728f, + -0.194854f, 0.367175f, -0.257243f, 0.103433f, -0.231077f, 0.236734f, + 0.135733f, -0.362845f, 0.197147f, 0.242782f, -0.135289f, 0.123311f, + 0.259420f, -0.116278f, 0.127287f, 0.236789f, -0.097438f, 0.118073f, + 0.112796f, -0.035949f, 0.184408f, 0.200948f, -0.008859f, 0.195989f, + 0.161970f, -0.295320f, -0.330389f, 0.141034f, 0.066081f, -0.707857f, + 0.357037f, 0.149633f, 0.679877f, 0.548674f, 0.469076f, 0.194123f, + -0.209872f, -0.071764f, -0.126960f, 0.199420f, 0.327116f, -0.169053f, + -0.429156f, 0.443429f, -0.225530f, -0.130738f, -0.028351f, 0.644393f, + 0.049606f, -0.243602f, -0.409920f, 0.117028f, -0.258557f, 0.073865f, + -0.200454f, -0.139957f, -0.031314f, 0.162325f, 0.247221f, 0.071909f, + -0.336276f, 0.079922f, 0.192780f, -0.148882f, 0.133192f, -0.143177f, + -0.121327f, 0.126221f, -0.089521f, -0.181826f, 0.149923f, -0.280682f, + 0.391572f, 0.108990f, -0.445494f, -0.170787f, 0.225182f, 0.223313f, + -0.234828f, -0.071072f, -0.072673f, -0.093686f, 0.223892f, -0.049377f, + 0.057976f, 0.033558f, 0.068733f, -0.283353f, 0.217877f, 0.158093f, + -0.276761f, -0.097049f, -0.351913f, -0.383604f, 0.002863f, -0.474510f, + -0.096738f, 0.256940f, 0.234203f, -0.226667f, -0.260576f, -0.183403f, + -0.035578f, 0.141570f, 0.078764f, -0.028086f, 0.155800f, -0.251115f, + -0.286703f, -0.014739f, 
-0.072621f, -0.311506f, -0.048639f, 0.081621f, + 0.043057f, 0.068136f, -0.179903f, 0.143699f, -0.002571f, 0.239012f, + 0.197456f, 0.035745f, -0.311927f, 0.220320f, 0.102687f, -0.294105f, + 0.426740f, 0.209050f, 0.211907f, 0.083453f, 0.006578f, -0.143338f, + 0.003157f, 0.040295f, 0.234497f, 0.035344f, -0.163909f, 0.411115f, + 0.289453f, -0.075357f, -0.008884f, 0.469798f, -0.033304f, -0.153293f, + -0.229322f, -0.004162f, 0.113363f, 0.395381f, 0.067414f, -0.188966f, + -0.117424f, -0.166423f, 0.066839f, 0.595641f, -0.204782f, -0.451727f, + 0.198509f, -0.921583f, -0.246765f, -0.153411f, 0.046491f, 0.365906f, + 0.376710f, -0.017355f, -0.035232f, 0.138785f, -0.163918f, -0.283449f, + -0.094340f, 0.192127f, 0.154815f, 0.035787f, -0.029087f, 0.115649f, + -0.220133f, -0.452741f, 0.311667f, 0.157666f, 0.091401f, 0.236040f, + -0.168523f, 0.122176f, -0.219016f, -0.214856f, 0.172824f, -0.091810f, + 0.031520f, -0.857420f, 0.643446f, -0.017471f, 0.206082f, -0.933517f, + -0.020070f, -0.065091f, -0.117680f, -1.271870f, -0.069177f, -0.149409f, + 0.289970f, -0.889775f, -0.044741f, 0.232647f, -0.319416f, 0.073030f, + 0.278549f, 0.238782f, -0.202206f, 0.272540f, 0.201412f, 0.175574f, + -0.127971f, -0.253164f, -0.086352f, -0.005381f, 0.114714f, 0.505169f, + -0.175049f, -1.534280f, -0.320666f, -2.119298f, -0.023075f, -0.021259f, + -0.161019f, 0.344837f, 0.361958f, -0.097050f, 0.014375f, 0.267110f, + 0.341442f, -0.016688f, 0.073393f, 0.131500f, 0.246331f, 0.011059f, + 0.033597f, 0.014779f, -0.269366f, -0.504788f, 0.048651f, 0.295682f, + 0.237363f, 0.227484f, -0.235814f, -0.160530f, 0.182682f, -0.172999f, + -0.126630f, 0.168357f, -0.078729f, 0.052805f, 0.377021f, -0.004727f, + 0.230415f, -0.876673f, 0.458457f, 0.099401f, -0.019616f, 0.611982f, + -0.231508f, -0.070894f, -0.056142f, 0.548969f, -0.376599f, -0.600428f, + 0.241930f, -0.592893f, 0.189371f, 0.488651f, -0.092446f, -0.272569f, + 0.251643f, 0.315945f, -0.301468f, 0.112961f, 0.052119f, -0.066076f, + -0.082249f, 0.252805f, -0.195539f, 0.150386f, -0.865534f, 0.673447f, + 0.030177f, -0.438528f, -1.006174f, 0.575176f, -0.271656f, 0.035835f, + -1.056916f, 0.495267f, -0.092428f, -0.109511f, -0.192359f, 0.166669f, + -0.624326f, -0.000354f, -0.089075f, 0.176279f, -0.289347f, 0.021346f, + 0.020375f, 0.255282f, -0.045588f, 0.173675f, 0.100957f, -0.294373f, + 0.049303f, -0.134132f, -0.255731f, -0.025559f, -0.307463f, -0.205100f, + 0.079024f, 0.101113f, 0.135742f, -0.348869f, -0.026759f, -0.134155f, + -0.179275f, -0.054297f, -0.054948f, 0.029351f, 0.190560f, 0.102476f, + -0.025785f, 0.169442f, -0.271303f, 0.200667f, 0.099063f, 0.074767f, + -0.326533f, 0.044426f, -0.290251f, -0.082443f, -0.164482f, -0.349412f, + 0.045109f, -0.157330f, 0.165935f, 0.012672f, -0.059818f, 0.399140f, + -0.316620f, 0.386638f, -0.285399f, -0.296777f, -0.200473f, -0.144232f, + 0.251851f, -0.203768f, 0.001071f, -0.179063f, 0.248952f, -0.143029f, + 0.010423f, -0.030293f, -0.046786f, -0.196195f, -0.016845f, 0.295023f, + 0.322825f, 0.133683f, 0.017388f, 0.142467f, 0.221320f, 0.004059f, + -0.115770f, 0.143363f, 0.137972f, -0.272584f, 0.489366f, -0.091828f, + -0.014703f, 0.082332f, -0.476226f, -0.202859f, 0.356094f, -0.283049f, + 0.218086f, 0.202015f, 0.201724f, 0.012617f, 0.050720f, 0.255695f, + 0.244653f, 0.111296f, -0.151450f, -0.056210f, -0.757348f, 0.441724f, + -0.022455f, -0.244662f, 0.296205f, -0.421883f, -0.217386f, -0.254301f, + 0.409105f, -0.031309f, 0.050147f, -0.337170f, -0.106620f, -0.606455f, + 0.308024f, 0.298144f, 0.363993f, 0.704870f, -0.047292f, 0.166901f, + 0.105991f, -0.536757f, 
-0.424031f, -0.226034f, 0.213635f, -0.526754f, + 0.310990f, -0.116038f, 0.007775f, 0.538330f, -0.177912f, 0.445357f, + -0.290365f, 0.451169f, 0.030931f, 0.033388f, 0.209905f, -0.244492f, + -0.097792f, -0.246042f, 0.132047f, 0.032576f, 0.115516f, 0.022890f, + 0.093508f, -0.071840f, 0.362948f, -0.135245f, 0.659911f, -0.321413f, + 0.193118f, -0.795001f, -0.218311f, 0.024862f, 0.206172f, -0.832878f, + -0.255670f, 0.343402f, -0.275211f, -0.898363f, -0.025172f, 0.158565f, + 0.171347f, -0.127518f, -0.215156f, -0.159198f, 0.250355f, -0.132452f, + 0.061254f, -0.097544f, -0.223246f, 0.013183f, 0.239468f, 0.259017f, + -0.217739f, -0.032263f, 0.123755f, -0.701777f, 0.150049f, -0.555293f, + 0.062430f, -0.260304f, 0.494894f, -0.168702f, -0.134829f, -0.113989f, + 0.150092f, -0.060248f, 0.115711f, -0.277202f, 0.499811f, 0.417116f, + 0.191081f, -0.376432f, -0.321092f, 0.033992f, 0.057193f, 0.127077f, + -0.009042f, 0.014443f, 0.142808f, -0.124349f, 0.213087f, -0.381686f, + 0.129726f, -0.038396f, +}; + +static const float av1_early_term_after_split_nn_bias_32_layer0[] = { + -0.107171f, 0.060848f, -0.069480f, -0.121982f, 0.037637f, -0.291839f, + 0.102257f, -0.065889f, -0.032452f, 0.034171f, -0.073984f, -0.005236f, + 0.218820f, 0.132123f, -0.089621f, -0.067679f, 0.049368f, 0.329444f, + -0.184729f, 0.031702f, 0.009735f, -0.039964f, -0.018024f, -0.073031f, + -0.030166f, -0.191037f, -0.074862f, -0.076548f, 0.076537f, 0.216609f, + -0.078358f, -0.007740f, +}; + +static const float av1_early_term_after_split_nn_weights_32_layer1[] = { + 0.047869f, -0.231773f, -0.185663f, 0.460676f, -0.208182f, 0.590555f, + -0.622627f, 0.279377f, 0.351681f, 0.633504f, 1.069884f, 0.332449f, + -0.457703f, -0.435817f, -0.028853f, 0.327490f, -0.282469f, -0.975792f, + -0.062975f, -0.147187f, 0.348340f, -1.207116f, 0.516159f, -1.509626f, + -0.805072f, 0.522999f, 0.143671f, 0.304246f, -0.360720f, -0.612472f, + 0.260045f, -0.223243f, +}; + +static const float av1_early_term_after_split_nn_bias_32_layer1[] = { + -0.07571174f, +}; + +static const NN_CONFIG av1_early_term_after_split_nnconfig_32 = { + FEATURES, + 1, + 1, + { + HIDDEN_NODES, + }, + { + av1_early_term_after_split_nn_weights_32_layer0, + av1_early_term_after_split_nn_weights_32_layer1, + }, + { + av1_early_term_after_split_nn_bias_32_layer0, + av1_early_term_after_split_nn_bias_32_layer1, + }, +}; + +static const float av1_early_term_after_split_nn_weights_16_layer0[] = { + -0.113798f, 0.053357f, -0.037947f, -0.477171f, 0.276517f, -0.349252f, + -0.177284f, 0.189597f, 0.141744f, 0.230207f, -0.328104f, 0.074328f, + 0.247717f, 0.233533f, 0.145167f, 0.018029f, -0.398725f, -0.226199f, + -0.309724f, 0.125279f, 0.194759f, 0.025531f, 0.349714f, -0.273944f, + 0.186871f, 0.181735f, -0.520614f, -0.264076f, 0.308207f, 0.157438f, + -0.137791f, -0.054582f, 0.125879f, 0.796218f, -0.897562f, 0.885439f, + 0.381640f, 0.106625f, -2.027456f, 0.000874f, 0.179581f, 0.013287f, + -2.329439f, -0.163169f, -0.136191f, 0.320108f, -2.318779f, -0.196722f, + -0.295721f, 0.203658f, -0.182275f, 0.615941f, 0.015762f, 0.257181f, + -0.115297f, 0.295774f, -0.026144f, -0.022686f, -0.219423f, -0.042861f, + 0.207647f, -0.057791f, 0.201671f, -0.169569f, 0.291492f, -0.994991f, + 0.137473f, 0.230948f, 0.505626f, -1.065860f, 0.275225f, -0.250861f, + 0.519466f, -1.217242f, -0.087384f, 0.053441f, 0.030729f, -1.702304f, + -0.034635f, 0.010177f, -0.035422f, -0.749979f, 0.355499f, 0.408166f, + -0.086883f, 0.017203f, 0.195706f, -0.218056f, -0.029153f, 0.367335f, + -0.061732f, -0.241068f, 0.078496f, -0.370346f, -0.124223f, 
-0.172708f, + 0.037971f, 0.038875f, -0.282489f, -0.266323f, -0.210864f, 0.214714f, + 0.234695f, -0.045625f, 0.015357f, -0.007464f, -0.362003f, -0.113465f, + 0.145141f, 0.238470f, -0.202664f, -0.286587f, -0.347112f, 0.054501f, + -0.190290f, -0.283256f, 0.062179f, 0.041165f, -0.006935f, -0.220351f, + -0.088800f, 0.220924f, -0.200982f, 0.058493f, -0.225175f, 0.057175f, + -0.618187f, 0.761023f, -0.743774f, -0.500599f, -0.584999f, 1.545211f, + 0.123055f, -0.106848f, -0.353057f, 1.552187f, 0.174104f, 0.068060f, + -0.449859f, 1.254299f, -0.161716f, -0.060630f, -0.230721f, 0.165976f, + -0.101582f, -0.422415f, 0.110384f, -0.130098f, 0.104428f, 0.083518f, + 0.031626f, 0.083048f, 0.158877f, 0.173340f, 0.063962f, 0.427845f, + 0.663268f, 0.376996f, 0.146435f, -0.091329f, 0.443447f, 0.518432f, + -0.182777f, -0.091313f, 0.331229f, 0.532604f, -0.187001f, 0.054774f, + 0.298068f, 0.502295f, -0.362378f, 0.054283f, 0.292806f, 0.168901f, + -0.214787f, 0.025637f, 0.458009f, -0.322714f, -0.264059f, 0.140313f, + -0.102696f, -0.431208f, -0.134450f, -0.545415f, 0.253851f, -0.009061f, + -0.050681f, 0.108681f, 0.043272f, -1.073133f, 0.206410f, 0.469576f, + 0.291494f, -2.021244f, -0.001183f, -0.067542f, 0.364907f, -2.470543f, + 0.049147f, -0.018868f, 0.658500f, -2.531048f, 0.275433f, -0.034224f, + -0.171386f, 0.096369f, 0.728069f, 0.272332f, 0.222255f, -0.030426f, + 0.026994f, 0.208928f, -0.173943f, -0.227581f, -0.214798f, 0.079341f, + 0.032344f, -0.253575f, -0.044353f, -0.239265f, -0.055852f, -0.162582f, + -0.086592f, 0.066487f, 0.337353f, -0.168704f, 0.015702f, 0.022607f, + 0.286647f, 0.218106f, 0.193319f, -0.358714f, 0.030796f, 0.007646f, + -0.045617f, 0.165007f, -0.284641f, -0.291812f, 0.207544f, 0.082823f, + -0.141907f, -0.331336f, -0.052908f, 0.120716f, 0.202521f, 0.232782f, + -0.348141f, -0.017332f, 1.191126f, -0.391987f, -0.154537f, -0.206551f, + -2.378690f, 0.057918f, -0.328183f, 2.151556f, 0.238803f, 0.164880f, + -0.480039f, 1.616200f, 0.260243f, 0.083704f, -0.174461f, 1.804634f, + 0.194810f, 0.223837f, 0.550107f, -0.068171f, -0.293435f, -0.186770f, + -0.364846f, 0.127181f, 0.105556f, -0.016202f, 0.278403f, -0.344995f, + -0.009761f, -0.082555f, 0.046731f, -0.301452f, 0.604259f, 0.055895f, + 0.049862f, 0.314249f, -0.305811f, -0.112937f, 0.658787f, -0.549288f, + -0.307567f, -0.460650f, -0.840643f, 0.082576f, 0.373711f, 0.138318f, + 0.336901f, 0.284984f, -0.281400f, 0.408210f, -0.449858f, 0.461054f, + 0.227629f, -0.131705f, 0.301769f, -0.278540f, 0.189290f, -0.269041f, + 0.111350f, -0.300257f, 0.436858f, -0.265920f, -0.211938f, 0.272631f, + 0.206291f, 0.253273f, -0.229776f, -0.031112f, -0.171183f, -0.109676f, + -0.202390f, -0.068857f, 0.182125f, -0.140523f, -0.308742f, -0.045840f, + 0.256545f, -0.262405f, 0.225951f, -0.287463f, -0.189203f, -0.055552f, + -0.052448f, -0.242839f, -0.278877f, 0.140920f, -0.175755f, 0.215402f, + -0.248841f, -0.264080f, -0.178303f, 0.147777f, 0.049460f, -0.279877f, + -0.539725f, -0.004622f, 0.182874f, 0.338814f, 0.265974f, 0.249851f, + -0.141154f, 0.157228f, -0.090972f, 0.179444f, 0.305255f, 0.127788f, + 0.123270f, 0.355320f, 0.076797f, 0.263495f, 0.235965f, -0.133816f, + 0.243624f, 0.227062f, -0.213629f, 0.002075f, 0.061203f, -0.077820f, + -0.008807f, -0.247324f, -0.051464f, -0.191894f, -0.238713f, -0.389526f, + -0.274248f, 0.053950f, -0.225750f, -0.367097f, -0.122391f, 0.181212f, + -0.411824f, -0.084241f, -0.302288f, 0.077860f, -0.187443f, -0.300262f, + 0.083156f, -0.392461f, -0.332320f, -0.346474f, 0.140658f, -0.283656f, + 0.120714f, -0.056577f, -0.280968f, 0.017795f, -0.024686f, 
0.073113f, + -0.346637f, 0.082567f, -0.036556f, -0.369730f, 0.081225f, -0.005211f, + 0.144886f, -0.003544f, 0.178307f, -0.366035f, -0.063887f, -0.191767f, + 0.105835f, -0.273978f, -0.266532f, -0.023984f, 0.039166f, 0.065848f, + -0.026802f, -0.268923f, 0.189659f, 0.086300f, 0.030718f, 0.216565f, + -0.130025f, -0.215687f, 0.146341f, -0.286438f, -0.394226f, -0.181509f, + -0.005612f, 0.186040f, 0.133491f, 0.032096f, -0.261609f, 0.074007f, + -0.042929f, -0.234479f, 0.189704f, 0.088395f, -0.003671f, -0.125055f, + -0.252418f, -0.086387f, 0.111197f, -0.297071f, -0.018793f, -0.031902f, + -0.333191f, -0.186279f, 0.039868f, 0.091419f, -0.264438f, -0.216150f, + -0.212550f, 0.203412f, -0.113028f, -0.197169f, -0.346771f, 0.086066f, + 0.091443f, -0.128507f, -0.007281f, -0.118389f, 0.003370f, -0.338661f, + 0.026739f, -0.063571f, -0.281567f, -0.166824f, 0.167455f, 0.216173f, + 0.199163f, 0.256314f, -0.222679f, 0.040282f, -0.154808f, -0.133943f, + -0.270163f, -0.357398f, 0.260373f, 0.176950f, -0.125162f, -0.085050f, + 0.226376f, -0.124585f, -0.324804f, 0.035536f, -0.133600f, 0.173450f, + 0.068107f, -0.337442f, 0.169629f, 0.047223f, 0.057878f, 0.055555f, + -0.317449f, -0.103768f, 0.080899f, -0.194759f, -1.137593f, 0.508999f, + 0.045372f, 1.746454f, 1.250347f, -0.342930f, -0.127821f, -0.220175f, + -0.417649f, -0.480595f, 0.071902f, 0.050231f, -0.562554f, -0.677866f, + -0.121416f, -0.247558f, -0.483876f, -0.504157f, 1.731953f, 0.572936f, + 0.047325f, 0.050619f, 0.112611f, -0.035393f, 0.052585f, -0.071076f, + -0.015798f, -0.050228f, -0.142875f, 0.189329f, 0.048833f, 0.503633f, + 0.249588f, 0.175492f, -0.137664f, -0.018533f, 0.288453f, -0.025644f, + 0.079131f, 0.195096f, -0.154039f, -0.104220f, -0.224072f, 0.095946f, + -0.208424f, 0.214745f, 0.056468f, 0.182603f, 0.341784f, -0.134664f, + -0.194050f, 0.058532f, -0.107336f, -0.087783f, -0.238795f, -0.387212f, + 0.049055f, -0.127417f, -0.299919f, -0.094371f, -0.011735f, -0.264753f, + 0.407375f, -0.462654f, -0.609488f, 0.027742f, -0.985512f, -0.109154f, + -0.423276f, 2.347960f, 0.129240f, 0.187610f, -0.057081f, 2.424892f, + 0.087666f, 0.106716f, -0.039379f, 2.764866f, 0.113309f, 0.028196f, + -0.582789f, 0.335385f, -0.538029f, -0.477337f, -0.114207f, 0.178829f, + 0.006276f, 0.123179f, 0.095101f, 0.139898f, -0.372074f, -0.111010f, + 0.136330f, 0.272900f, 0.126737f, -0.097808f, -0.363697f, 0.108665f, + -0.227749f, -0.083421f, 1.714677f, 0.451943f, 0.107931f, -0.392281f, + 1.615846f, 0.022307f, -0.247011f, 0.257703f, 1.039134f, 0.537789f, + 0.022177f, -0.271532f, 0.351350f, -0.399205f, -0.240534f, -0.315399f, + 0.026928f, -0.005618f, 0.053179f, -0.010277f, 0.000501f, 0.040896f, + -0.109160f, 0.018282f, 0.003887f, 0.199599f, 0.095349f, -0.337284f, + 0.169929f, -0.109409f, -0.166983f, 0.059908f, -0.226574f, -0.120114f, + 0.077329f, -0.333133f, -0.220936f, 0.114309f, -0.233965f, -0.281551f, + 0.042948f, 0.100940f, 0.116037f, -0.313122f, 0.215149f, -0.309057f, + -0.341052f, -0.294417f, -0.179722f, 0.010795f, 0.192053f, -0.275261f, + -0.033077f, 0.117348f, 0.090206f, 0.781573f, 0.602456f, -0.220296f, + 0.172159f, 0.758513f, 0.157910f, -0.217897f, -0.372659f, 0.031935f, + 0.791463f, 0.267195f, 0.931593f, -0.057349f, 0.405512f, -0.058512f, + -0.641663f, -0.076592f, 0.550227f, -0.024094f, 0.048218f, -0.289971f, + 0.180940f, 0.167533f, 0.052711f, -0.360726f, 0.019210f, -0.488879f, + 0.380498f, 0.151608f, -0.276895f, -0.596554f, 0.106076f, -0.245833f, + -0.048783f, 0.073823f, 0.098780f, 0.000211f, 0.113958f, -0.068964f, + -0.265533f, -0.185457f, 0.175586f, -0.163621f, -0.204919f, 
0.145802f, + -0.163421f, 0.129576f, -0.153486f, -0.105573f, 0.067289f, -0.213120f, + -0.286103f, 0.249543f, -0.044970f, -0.170464f, -0.105501f, -0.094765f, + -0.050734f, -0.369468f, 0.180020f, -0.363328f, -0.151654f, -0.262550f, + -0.424503f, 0.829032f, -0.559452f, 0.506837f, 0.143823f, 0.276660f, + -1.808608f, -0.259517f, -0.053945f, 0.035676f, -1.842195f, -0.065960f, + -0.069285f, 0.462022f, -2.319453f, -0.370299f, 0.183329f, -0.146412f, + -0.563875f, 0.305068f, 0.480904f, 0.044319f, -0.016098f, 0.168516f, + 0.114874f, -0.097621f, -0.030373f, 0.177700f, 0.181591f, -0.146003f, + -0.330853f, -0.259200f, 0.779319f, -1.517524f, 0.178781f, 0.135451f, + 0.088784f, -2.076089f, 0.628717f, -0.048685f, 0.281327f, -2.341596f, + 0.422171f, 0.006135f, 0.367096f, -1.663118f, 0.365253f, -0.072884f, + -0.197620f, -0.688634f, 0.477354f, 0.395841f, -0.098505f, 0.208709f, + -0.027523f, 0.127119f, 0.106274f, 0.114424f, -0.122877f, -0.087245f, + 0.086923f, -0.527398f, -0.342062f, -0.764662f, 0.713094f, -0.626453f, + -0.081454f, -0.087683f, 0.885047f, 0.323440f, -0.018579f, -0.217166f, + 1.617984f, -0.159038f, 0.265991f, -0.390313f, 1.933182f, -0.032431f, + -0.057513f, -0.300841f, 0.461248f, -0.072147f, -0.287052f, -0.078056f, + 0.011734f, 0.044013f, 0.177174f, 0.093400f, 0.028819f, 0.193686f, + -0.224853f, 0.268321f, -0.075059f, 0.074526f, -0.015618f, 0.165615f, + -0.276780f, -0.063908f, -0.369264f, -0.171497f, -0.173624f, -0.130743f, + -0.224625f, -0.124980f, -0.104482f, 0.076864f, -0.009631f, -0.164682f, + 0.150480f, -0.111880f, -0.260425f, 0.086234f, -0.176936f, -0.136771f, + -0.168867f, -0.405626f, -0.288716f, -0.128950f, -0.207327f, 0.015581f, + -0.109061f, -0.098970f, 0.090792f, -0.109623f, 0.349851f, 0.266341f, + -0.088602f, -0.108071f, 0.082519f, 0.472650f, -1.838758f, 0.456694f, + 0.119927f, 0.461077f, -2.860022f, 0.231495f, 0.235771f, 0.256424f, + -1.938516f, -0.188202f, -0.000832f, -0.518206f, 0.194644f, 0.505510f, + 0.615657f, 0.193760f, 0.224600f, 0.265732f, -0.121553f, -0.354597f, + -0.242414f, -0.276639f, -0.057591f, 0.026369f, -0.261148f, -0.356155f, + -0.149178f, -0.353566f, -0.340835f, -0.141776f, 0.076535f, 0.221299f, + -0.108857f, -0.156514f, 0.050901f, 0.058541f, -0.077141f, 0.071515f, + -0.333283f, -0.181489f, -0.212900f, -0.224698f, -0.174693f, -0.178665f, + -0.143374f, -0.091811f, 0.165161f, 0.060156f, -0.086103f, -0.039031f, + -0.377759f, -0.370533f, 0.074431f, 0.064192f, 0.186576f, 0.447858f, + -0.082260f, -0.020268f, -0.123089f, -0.402017f, 0.080500f, 0.176286f, + 2.850013f, 0.019385f, -0.225361f, -0.235315f, 1.654694f, -0.073978f, + -0.341412f, -1.187575f, 2.815900f, -0.228063f, -0.174547f, 0.623825f, + -0.010676f, 0.157189f, 0.111879f, -0.198965f, 0.051851f, 0.158396f, + 0.045194f, 0.293531f, -0.246714f, -0.351493f, 0.026954f, 0.076233f, + 0.420367f, 0.168154f, -0.131450f, 0.134487f, -0.288851f, -0.134553f, + 0.014902f, 0.756381f, 0.277713f, 0.190080f, -0.020869f, 1.446672f, + 0.029792f, -0.025927f, 0.060640f, 0.559864f, 0.422229f, 0.198459f, + 0.036167f, 0.029432f, 0.001882f, 0.038480f, -0.160528f, -0.288855f, + -0.310886f, 0.291296f, 0.190558f, -0.182816f, -0.002252f, 0.073101f, + -0.172245f, -0.305980f, 0.112492f, -0.422839f, -0.295999f, -0.078160f, + -0.173405f, -0.032819f, 0.373774f, -0.715223f, 0.018911f, 0.131753f, + -0.237364f, -0.128499f, -0.228406f, 0.341619f, 0.343552f, -0.521581f, + -0.263790f, 0.362502f, -0.018450f, 0.054233f, 0.183068f, 0.382772f, + 0.188811f, -0.627287f, 0.040399f, -0.487338f, -0.192591f, 0.247426f, + 0.154372f, -0.483994f, +}; + +static const 
float av1_early_term_after_split_nn_bias_16_layer0[] = { + -0.173976f, 0.305495f, 0.250981f, -0.067127f, -0.313100f, 0.242464f, + 0.315196f, -0.056052f, -0.241227f, -0.253308f, -0.002697f, 0.003687f, + -0.124421f, -0.090383f, -0.070366f, -0.064074f, -0.056115f, 0.123313f, + -0.239698f, -0.182082f, -0.065296f, 0.021503f, -0.036787f, 0.311861f, + 0.118135f, -0.320456f, -0.110719f, 0.220692f, -0.071727f, -0.088226f, + -0.110874f, -0.111671f, +}; + +static const float av1_early_term_after_split_nn_weights_16_layer1[] = { + -0.338573f, 0.398159f, 0.314774f, -0.037448f, -0.271950f, -0.774991f, + 0.950901f, -0.225380f, -1.841906f, -0.350379f, -0.079350f, 0.383148f, + -0.183676f, -0.313132f, -0.340820f, -0.309401f, -1.050540f, -0.432267f, + -0.657195f, 0.927632f, -0.040150f, 0.578920f, 0.212301f, 0.292495f, + 0.563590f, -0.205735f, 0.195877f, 0.582122f, -0.217860f, 1.613379f, + 0.313278f, -0.555802f, +}; + +static const float av1_early_term_after_split_nn_bias_16_layer1[] = { + 0.16553f, +}; + +static const NN_CONFIG av1_early_term_after_split_nnconfig_16 = { + FEATURES, + 1, + 1, + { + HIDDEN_NODES, + }, + { + av1_early_term_after_split_nn_weights_16_layer0, + av1_early_term_after_split_nn_weights_16_layer1, + }, + { + av1_early_term_after_split_nn_bias_16_layer0, + av1_early_term_after_split_nn_bias_16_layer1, + }, +}; + +static const float av1_early_term_after_split_nn_weights_8_layer0[] = { + -0.719472f, 0.305806f, 0.855829f, 0.100094f, 0.412517f, 1.254673f, + 1.552105f, -5.890773f, -0.089957f, -0.016736f, 1.418074f, -5.393506f, + -0.028214f, 0.117758f, 1.479209f, -5.299794f, 0.171585f, -0.084182f, + -0.162105f, 0.388577f, -0.044319f, -0.025861f, 0.251782f, -0.181462f, + -0.101545f, -0.079999f, -0.033014f, -0.191627f, -0.032802f, -0.053404f, + 0.038038f, -0.119492f, 0.049104f, -0.344384f, -0.354513f, 0.036977f, + 0.017513f, -0.004025f, -0.163212f, -0.261999f, 0.146575f, 0.207541f, + 0.130365f, -0.252127f, 0.097419f, -0.231057f, -0.309421f, 0.347866f, + -0.064670f, -0.283171f, -0.244193f, -0.193323f, -0.226954f, -0.276194f, + -0.233553f, 0.156354f, -0.184009f, 0.344289f, -0.308058f, -0.205202f, + -0.325068f, 0.183820f, -0.361667f, -0.069559f, -0.121834f, -0.038357f, + -0.210043f, -0.266129f, 0.003188f, 0.074902f, -0.328843f, 0.293679f, + -0.234698f, -0.428268f, -0.308772f, -0.136538f, -0.008384f, -0.078227f, + 0.166074f, -0.262899f, 0.102114f, -0.323420f, 0.057064f, -0.203318f, + -0.397413f, -0.317324f, -0.307093f, 0.020574f, -0.188627f, 0.132529f, + 0.118992f, -0.487387f, -0.282975f, 0.573231f, -0.266071f, 0.125140f, + -0.970034f, 1.424008f, -0.487366f, -0.196415f, 3.680273f, -0.008407f, + 0.081109f, -0.187479f, 3.876021f, 0.159168f, 0.111721f, -0.337423f, + 3.901760f, 0.261268f, -0.245555f, -0.187632f, -0.324298f, 0.167234f, + 0.170986f, -0.473055f, 0.087016f, -0.003469f, 0.051035f, 0.251794f, + 0.153549f, 0.217609f, -0.326870f, -0.175511f, 0.637341f, -0.694837f, + -0.873487f, -0.186614f, -1.089884f, -0.607316f, -0.523519f, 5.256331f, + 0.071414f, 0.215265f, -0.835999f, 5.735746f, 0.300101f, 0.089626f, + -0.450261f, 5.608051f, 0.190491f, 0.110220f, -0.595360f, -0.446324f, + 0.311380f, 0.268812f, -0.339656f, -0.008708f, 0.011111f, -0.027557f, + 0.171534f, 0.000676f, 0.227232f, 0.033993f, 0.146684f, 0.094817f, + -0.175381f, -0.211927f, -0.362471f, 0.168834f, 0.264149f, -0.350538f, + -0.463249f, -0.288105f, 0.347155f, 0.183231f, -0.229732f, -0.252202f, + -0.218074f, -0.008769f, -0.156103f, 0.181233f, -0.354736f, 0.263270f, + -0.106636f, 0.081057f, 0.060634f, -0.046887f, 0.050468f, 0.071259f, + 
0.221287f, 0.199071f, -0.180185f, -0.406902f, -0.239351f, -0.034957f, + 0.369140f, 0.864600f, 0.233798f, 0.423612f, -0.468918f, 0.976987f, + 0.691198f, -1.597908f, 0.102926f, 0.305546f, 0.391196f, -3.909059f, + 0.333635f, 0.311561f, 0.738886f, -4.002001f, 0.236394f, -0.233141f, + 0.263342f, 0.679898f, 0.136233f, 0.254743f, -0.367571f, 0.066412f, + 0.001606f, -0.059542f, 0.051726f, -0.347145f, -0.045501f, -0.313847f, + -0.021952f, 1.386316f, -0.579139f, -1.275844f, -0.003493f, -1.716577f, + 0.250209f, 0.192086f, 4.177055f, 0.351835f, 0.338177f, 0.140163f, + 4.099592f, 0.321866f, -0.128153f, -0.360414f, 4.350767f, 0.025943f, + -0.116740f, -0.664107f, -0.064558f, -0.039553f, -0.208186f, -0.678774f, + 0.149441f, -0.019823f, 0.012759f, 0.404442f, -0.108881f, 0.067974f, + -0.188278f, 0.136327f, 0.109927f, -0.179270f, -0.272342f, 0.018064f, + -0.304216f, -0.469470f, 0.109310f, -0.326214f, 0.061909f, -0.278997f, + -0.352329f, -0.333770f, -0.186522f, -0.328567f, -0.206211f, -0.008804f, + 0.042441f, -0.126699f, -0.420399f, -0.033842f, 0.016773f, -0.273789f, + 0.081928f, -0.191552f, -0.179533f, -0.263070f, -0.471807f, 0.062601f, + -0.232576f, 0.082955f, -0.490080f, 0.073820f, -0.090384f, 0.035781f, + -0.158880f, -0.506793f, -0.069132f, 0.047602f, -0.349640f, -0.058389f, + -0.017387f, -0.194636f, -0.457227f, -0.143105f, 0.222045f, -0.548909f, + -0.131561f, 0.247196f, -0.207923f, 0.133056f, -0.509854f, -0.193685f, + -0.181327f, -0.242442f, 0.091821f, 0.114430f, -0.375233f, -0.015254f, + -0.336632f, -0.060279f, -0.169169f, -0.429914f, -0.036563f, -0.400560f, + -0.076332f, -0.186232f, -0.268491f, 0.075561f, -0.389082f, -0.077435f, + 0.352562f, -0.020086f, -0.338181f, -0.404629f, 0.254983f, 0.150477f, + -0.265903f, 0.003341f, 0.099969f, -0.211964f, -0.129372f, -0.166366f, + 0.327712f, -0.276234f, 0.140675f, -0.433677f, -0.163050f, -0.143578f, + -0.397840f, -0.422130f, -0.293835f, -0.075362f, -0.468375f, 1.021238f, + 1.394155f, -0.922486f, -1.350222f, 2.030201f, 0.057717f, 0.227650f, + -0.193179f, 0.037224f, 0.065555f, 0.020558f, -0.059205f, -0.023690f, + -0.008718f, 0.095976f, -0.549587f, -0.321164f, -0.243728f, 1.344381f, + -1.254107f, 0.294244f, -0.154737f, -0.152597f, 0.342419f, 0.301883f, + 0.069866f, -0.327766f, 0.209323f, -0.364913f, -0.005530f, -0.558972f, + 0.057684f, -0.309357f, -0.283325f, -0.278445f, -0.420115f, -0.418457f, + -0.391481f, -0.418460f, -0.003897f, -0.023744f, -0.312330f, -0.366213f, + 0.269628f, -0.274877f, -0.189988f, -0.419555f, -0.034033f, 0.192874f, + -0.135487f, -0.326108f, -0.039019f, 0.185029f, -0.264883f, -0.563447f, + -0.163532f, -0.447652f, -0.141851f, 0.001714f, -0.193184f, 0.032609f, + -0.112883f, 0.074599f, 0.490665f, 0.434764f, 0.021652f, -0.219618f, + 0.743267f, 0.147195f, -0.303479f, -0.097674f, 0.195813f, 0.704007f, + -1.290851f, 0.119701f, 0.224065f, 0.260246f, -0.580657f, -0.096201f, + -0.333214f, -0.586689f, 0.567178f, 0.157340f, -0.043184f, 0.194358f, + -0.026506f, -0.339894f, -0.571803f, -0.234828f, 0.147054f, -0.564178f, + -0.156933f, -0.366055f, -0.691687f, -0.187501f, 0.215834f, -0.346106f, + -0.256892f, 0.110915f, -0.337464f, -0.341474f, -0.216113f, 0.249445f, + -0.070175f, -0.412141f, 0.153458f, -0.081280f, 0.164669f, -0.356396f, + -0.294971f, -0.165121f, -0.133585f, -0.071467f, 0.295147f, -0.253233f, + -0.213833f, -0.343416f, -0.474344f, -0.304000f, -0.341379f, -0.331456f, + -0.393952f, -0.508004f, -0.569518f, -0.509864f, 0.121961f, 0.011957f, + 0.000498f, -0.201969f, -0.407195f, -0.414375f, -0.295846f, 0.247492f, + 0.124249f, -0.550804f, -0.420397f, 
-0.123462f, 0.333292f, -0.240230f, + -0.025604f, 0.337536f, -0.295006f, -0.272614f, -0.496850f, -0.278521f, + 0.234591f, -0.052775f, -0.014052f, -0.260078f, -0.279128f, -0.036385f, + 0.008714f, -0.064018f, -0.124873f, -0.334014f, +}; + +static const float av1_early_term_after_split_nn_bias_8_layer0[] = { + 1.202379f, -0.117005f, -0.135527f, -0.262255f, -0.443658f, -0.078981f, + 0.615653f, -0.124482f, -0.227768f, -0.227014f, -0.135898f, 0.143216f, + -0.225995f, 0.370877f, -0.214821f, -0.227752f, +}; + +static const float av1_early_term_after_split_nn_weights_8_layer1[] = { + 0.376594f, 0.266703f, -0.039847f, 1.680142f, -0.879939f, 0.286806f, + -0.378223f, -0.405295f, -0.021107f, 0.039188f, 0.259308f, 0.193091f, + 0.077994f, -0.269141f, 0.011180f, -0.019262f, +}; + +static const float av1_early_term_after_split_nn_bias_8_layer1[] = { + -1.29585564f, +}; + +static const NN_CONFIG av1_early_term_after_split_nnconfig_8 = { + FEATURES, + 1, + 1, + { + 16, + }, + { + av1_early_term_after_split_nn_weights_8_layer0, + av1_early_term_after_split_nn_weights_8_layer1, + }, + { + av1_early_term_after_split_nn_bias_8_layer0, + av1_early_term_after_split_nn_bias_8_layer1, + }, +}; +#undef FEATURES +#undef HIDDEN_NODES + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ diff --git a/libs/libaom/src/av1/encoder/partition_strategy.c b/libs/libaom/src/av1/encoder/partition_strategy.c new file mode 100644 index 000000000..cc820ba24 --- /dev/null +++ b/libs/libaom/src/av1/encoder/partition_strategy.c @@ -0,0 +1,1288 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/system_state.h" + +#include "av1/common/enums.h" +#include "av1/common/reconinter.h" + +#if !CONFIG_REALTIME_ONLY +#include "av1/encoder/cnn.h" +#include "av1/encoder/partition_model_weights.h" +#include "av1/encoder/partition_cnn_weights.h" +#endif +#include "av1/encoder/encoder.h" + +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/partition_strategy.h" +#include "av1/encoder/rdopt.h" + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE void simple_motion_search_prune_part_features( + AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row, + int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get); +#endif + +static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) { + switch (bsize) { + case BLOCK_128X128: return 0; + case BLOCK_64X64: return 1; + case BLOCK_32X32: return 2; + case BLOCK_16X16: return 3; + case BLOCK_8X8: return 4; + default: assert(0 && "Invalid bsize"); return -1; + } +} + +#if !CONFIG_REALTIME_ONLY +// TODO(chiyotsai@google.com): This is very much a work in progress. 
We still
+// need to do the following:
+// -- add support for hdres
+// -- add support for pruning rectangular partitions
+// -- use reconstructed pixels instead of source pixels for padding
+// -- use chroma pixels in addition to luma pixels
+void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                  int bsize, int quad_tree_idx,
+                                  int *partition_none_allowed,
+                                  int *partition_horz_allowed,
+                                  int *partition_vert_allowed,
+                                  int *do_rectangular_split,
+                                  int *do_square_split) {
+  assert(cm->seq_params.sb_size >= BLOCK_64X64 &&
+         "Invalid sb_size for intra_cnn!");
+  const int bsize_idx = convert_bsize_to_idx(bsize);
+
+  if (bsize == BLOCK_128X128) {
+    return;
+  }
+
+  // Precompute the CNN part and cache the result in MACROBLOCK
+  if (bsize == BLOCK_64X64 && !x->cnn_output_valid) {
+    aom_clear_system_state();
+    const CNN_CONFIG *cnn_config = &av1_intra_mode_cnn_partition_cnn_config;
+
+    // Prepare the output
+    const CNN_THREAD_DATA thread_data = { .num_workers = 1, .workers = NULL };
+    const int num_outputs = 4;
+    const int output_dims[4] = { 1, 2, 4, 8 };
+    const int out_chs[4] = { CNN_BRANCH_0_OUT_CH, CNN_BRANCH_1_OUT_CH,
+                             CNN_BRANCH_2_OUT_CH, CNN_BRANCH_3_OUT_CH };
+    float *output_buffer[CNN_TOT_OUT_CH];
+
+    float **cur_output_buf = output_buffer;
+    float *curr_buf_ptr = x->cnn_buffer;
+    for (int output_idx = 0; output_idx < num_outputs; output_idx++) {
+      const int num_chs = out_chs[output_idx];
+      const int ch_size = output_dims[output_idx] * output_dims[output_idx];
+      for (int ch = 0; ch < num_chs; ch++) {
+        cur_output_buf[ch] = curr_buf_ptr;
+        curr_buf_ptr += ch_size;
+      }
+      cur_output_buf += num_chs;
+    }
+
+    CNN_MULTI_OUT output = {
+      .num_outputs = 4,
+      .output_channels = out_chs,
+      .output_strides = output_dims,
+      .output_buffer = output_buffer,
+    };
+
+    // Prepare the input
+    const MACROBLOCKD *xd = &x->e_mbd;
+    const int bit_depth = xd->bd;
+    const int dc_q =
+        av1_dc_quant_QTX(x->qindex, 0, bit_depth) >> (bit_depth - 8);
+    x->log_q = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
+    x->log_q = (x->log_q - av1_intra_mode_cnn_partition_mean[0]) /
+               av1_intra_mode_cnn_partition_std[0];
+
+    const int width = 65, height = 65,
+              stride = x->plane[AOM_PLANE_Y].src.stride;
+
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      uint16_t *image[1] = {
+        CONVERT_TO_SHORTPTR(x->plane[AOM_PLANE_Y].src.buf) - stride - 1
+      };
+
+      av1_cnn_predict_img_multi_out_highbd(image, width, height, stride,
+                                           cnn_config, &thread_data, bit_depth,
+                                           &output);
+    } else {
+      uint8_t *image[1] = { x->plane[AOM_PLANE_Y].src.buf - stride - 1 };
+
+      av1_cnn_predict_img_multi_out(image, width, height, stride, cnn_config,
+                                    &thread_data, &output);
+    }
+
+    x->cnn_output_valid = 1;
+  }
+
+  if (!x->cnn_output_valid) {
+    return;
+  }
+
+  const NN_CONFIG *dnn_configs[5] = {
+    NULL,
+    &av1_intra_mode_cnn_partition_branch_0_dnn_config,
+    &av1_intra_mode_cnn_partition_branch_1_dnn_config,
+    &av1_intra_mode_cnn_partition_branch_2_dnn_config,
+    &av1_intra_mode_cnn_partition_branch_3_dnn_config,
+  };
+
+  const NN_CONFIG *dnn_config = dnn_configs[bsize_idx];
+
+  aom_clear_system_state();
+  float dnn_features[100];
+  float logits[4] = { 0.0f };
+
+  const float *branch_0 = x->cnn_buffer;
+  const float *branch_1 = branch_0 + CNN_BRANCH_0_OUT_SIZE;
+  const float *branch_2 = branch_1 + CNN_BRANCH_1_OUT_SIZE;
+  const float *branch_3 = branch_2 + CNN_BRANCH_2_OUT_SIZE;
+
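+  // Layout note (this mirrors the pointer fill loop in the precompute block
+  // above): the four branch outputs sit back-to-back in x->cnn_buffer, with
+  // branch k holding its out_chs[k] feature maps contiguously, each map
+  // spanning output_dims[k] * output_dims[k] floats. The per-bsize packing
+  // below therefore indexes branch_k[spatial_idx + ch_idx * spatial_stride].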
+  if (bsize == BLOCK_64X64) {
+    int f_idx = 0;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_0_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_0[ch_idx];
+    }
+
+    const int spa_stride = 2 * 2;
+    for (int lin_idx = 0; lin_idx < spa_stride; lin_idx++) {
+      for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+        dnn_features[f_idx++] = branch_1[lin_idx + ch_idx * spa_stride];
+      }
+    }
+    dnn_features[f_idx++] = x->log_q;
+  } else if (bsize == BLOCK_32X32) {
+    int f_idx = 0;
+    for (int idx = 0; idx < CNN_BRANCH_0_OUT_CH; idx++) {
+      dnn_features[f_idx++] = branch_0[idx];
+    }
+
+    const int curr_lin_idx = quad_to_linear_1[quad_tree_idx - 1];
+    const int spa_stride = 2 * 2;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_1[curr_lin_idx + ch_idx * spa_stride];
+    }
+    dnn_features[f_idx++] = x->log_q;
+  } else if (bsize == BLOCK_16X16) {
+    int f_idx = 0;
+    const int prev_quad_idx = (quad_tree_idx - 1) / 4;
+    const int prev_lin_idx = quad_to_linear_1[prev_quad_idx - 1];
+    const int prev_spa_stride = 2 * 2;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_1[prev_lin_idx + ch_idx * prev_spa_stride];
+    }
+
+    const int curr_lin_idx = quad_to_linear_2[quad_tree_idx - 5];
+    const int spa_stride = 4 * 4;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_2[curr_lin_idx + ch_idx * spa_stride];
+    }
+    dnn_features[f_idx++] = x->log_q;
+  } else if (bsize == BLOCK_8X8) {
+    int f_idx = 0;
+    const int prev_quad_idx = (quad_tree_idx - 1) / 4;
+    const int prev_lin_idx = quad_to_linear_2[prev_quad_idx - 5];
+    const int prev_spa_stride = 4 * 4;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_2[prev_lin_idx + ch_idx * prev_spa_stride];
+    }
+
+    const int curr_lin_idx = quad_to_linear_3[quad_tree_idx - 21];
+    const int spa_stride = 8 * 8;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_3_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_3[curr_lin_idx + ch_idx * spa_stride];
+    }
+    dnn_features[f_idx++] = x->log_q;
+  } else {
+    assert(0 && "Invalid bsize in intra_cnn partition");
+  }
+
+  // Make decision
+  av1_nn_predict(dnn_features, dnn_config, 1, logits);
+  aom_clear_system_state();
+
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  float split_only_thresh = 100.0f, no_split_thresh = -100.0f;
+  if (is_720p_or_larger) {
+    split_only_thresh =
+        av1_intra_mode_cnn_partition_split_thresh_hdres[bsize_idx];
+    no_split_thresh =
+        av1_intra_mode_cnn_partition_no_split_thresh_hdres[bsize_idx];
+  } else if (is_480p_or_larger) {
+    split_only_thresh =
+        av1_intra_mode_cnn_partition_split_thresh_midres[bsize_idx];
+    no_split_thresh =
+        av1_intra_mode_cnn_partition_no_split_thresh_midres[bsize_idx];
+  } else {
+    split_only_thresh =
+        av1_intra_mode_cnn_partition_split_thresh_lowres[bsize_idx];
+    no_split_thresh =
+        av1_intra_mode_cnn_partition_no_split_thresh_lowres[bsize_idx];
+  }
+
+  if (logits[0] > split_only_thresh) {
+    *partition_none_allowed = 0;
+    *partition_horz_allowed = 0;
+    *partition_vert_allowed = 0;
+    *do_rectangular_split = 0;
+  }
+
+  if (logits[0] < no_split_thresh) {
+    *do_square_split = 0;
+  }
+}
+
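+// For reference, a minimal sketch of the fully connected evaluation that
+// av1_nn_predict is assumed to perform on the NN_CONFIG tables defined in
+// partition_model_weights.h: each hidden layer is an affine transform
+// followed by a ReLU, and the final layer emits the raw logits that the
+// callers compare against their thresholds. Field names follow the NN_CONFIG
+// initializers above; the weight layout (row-major, one row per output node)
+// is an assumption. Illustrative only, kept out of the build.
+#if 0
+static void nn_forward_sketch(const float *input, const NN_CONFIG *cfg,
+                              float *output) {
+  float buf[2][128];  // scratch; assumes every layer has <= 128 nodes
+  const float *layer_in = input;
+  int num_in = cfg->num_inputs;
+  for (int layer = 0; layer <= cfg->num_hidden_layers; ++layer) {
+    const int is_output = (layer == cfg->num_hidden_layers);
+    const int num_out =
+        is_output ? cfg->num_outputs : cfg->num_hidden_nodes[layer];
+    const float *w = cfg->weights[layer];  // row-major: [num_out][num_in]
+    const float *b = cfg->bias[layer];
+    float *layer_out = is_output ? output : buf[layer & 1];
+    for (int node = 0; node < num_out; ++node) {
+      float val = b[node];
+      for (int i = 0; i < num_in; ++i) {
+        val += w[node * num_in + i] * layer_in[i];
+      }
+      // ReLU on hidden layers; the logits stay linear.
+      layer_out[node] = is_output ? val : (val > 0.0f ? val : 0.0f);
+    }
+    layer_in = layer_out;
+    num_in = num_out;
+  }
+}
+#endif
+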
convert_bsize_to_idx(bsize);
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  // res_idx is 0 for res < 480p, 1 for 480p, 2 for 720p+
+  const int res_idx = is_480p_or_larger + is_720p_or_larger;
+
+  assert(bsize_idx >= 0 && bsize_idx <= 4 &&
+         "Invalid bsize in simple_motion_search_based_split");
+
+  const float *ml_mean = av1_simple_motion_search_split_mean[bsize_idx];
+  const float *ml_std = av1_simple_motion_search_split_std[bsize_idx];
+  const NN_CONFIG *nn_config =
+      av1_simple_motion_search_split_nn_config[bsize_idx];
+  const int agg = cpi->sf.part_sf.simple_motion_search_prune_agg;
+
+  const float split_only_thresh =
+      av1_simple_motion_search_split_thresh[agg][res_idx][bsize_idx];
+  const float no_split_thresh =
+      av1_simple_motion_search_no_split_thresh[agg][res_idx][bsize_idx];
+
+  float features[FEATURE_SIZE_SMS_SPLIT] = { 0.0f };
+  simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+                                           bsize, features,
+                                           FEATURE_SMS_SPLIT_MODEL_FLAG);
+  for (int idx = 0; idx < FEATURE_SIZE_SMS_SPLIT; idx++) {
+    features[idx] = (features[idx] - ml_mean[idx]) / ml_std[idx];
+  }
+
+  float score = 0.0f;
+
+  av1_nn_predict(features, nn_config, 1, &score);
+  aom_clear_system_state();
+
+  if (score > split_only_thresh) {
+    *partition_none_allowed = 0;
+    *partition_horz_allowed = 0;
+    *partition_vert_allowed = 0;
+    *do_rectangular_split = 0;
+  }
+
+  if (cpi->sf.part_sf.simple_motion_search_split >= 2 &&
+      score < no_split_thresh) {
+    *do_square_split = 0;
+  }
+}
+
+// Given a list of ref frames in refs, performs simple_motion_search on each
+// of the refs and returns the ref with the smallest sse. Returns -1 if none
+// of the refs in the list is available. Also stores the best sse and var in
+// best_sse and best_var, respectively. If save_mv is 0, don't update
+// start_mvs in pc_tree. If save_mv is 1, update start_mvs under pc_tree and
+// its subtrees.
+static int simple_motion_search_get_best_ref(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+    int mi_col, BLOCK_SIZE bsize, const int *const refs, int num_refs,
+    int use_subpixel, int save_mv, unsigned int *best_sse,
+    unsigned int *best_var) {
+  const AV1_COMMON *const cm = &cpi->common;
+  int best_ref = -1;
+
+  if (mi_col >= cm->mi_params.mi_cols || mi_row >= cm->mi_params.mi_rows) {
+    // If the whole block is outside of the image, set the var and sse to 0.
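Stepping back to av1_simple_motion_search_based_split above: the single NN score is tested against two thresholds, which carves the score axis into three bands. A compact restatement of that gating (threshold values come from the av1_simple_motion_search_*_thresh tables; this is only a summary, not patch content):

  //   score > split_only_thresh : commit to PARTITION_SPLIT
  //                               (none/horz/vert/rect searches disabled)
  //   score < no_split_thresh   : rule out PARTITION_SPLIT
  //                               (only when simple_motion_search_split >= 2)
  //   otherwise                 : keep the full partition search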
+    *best_var = 0;
+    *best_sse = 0;
+
+    return best_ref;
+  }
+
+  // Otherwise, loop through the reference frames and find the one with the
+  // minimum SSE.
+  const MACROBLOCKD *xd = &x->e_mbd;
+
+  const int num_planes = 1;
+
+  *best_sse = INT_MAX;
+
+  for (int ref_idx = 0; ref_idx < num_refs; ref_idx++) {
+    const int ref = refs[ref_idx];
+
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref]) {
+      const FULLPEL_MV *start_mvs = pc_tree->start_mvs;
+      unsigned int curr_sse = 0, curr_var = 0;
+      int_mv best_mv =
+          av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref,
+                                   start_mvs[ref], num_planes, use_subpixel);
+      curr_var = cpi->fn_ptr[bsize].vf(
+          x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf,
+          xd->plane[0].dst.stride, &curr_sse);
+      if (curr_sse < *best_sse) {
+        *best_sse = curr_sse;
+        *best_var = curr_var;
+        best_ref = ref;
+      }
+
+      if (save_mv) {
+        pc_tree->start_mvs[ref].row = best_mv.as_mv.row / 8;
+        pc_tree->start_mvs[ref].col = best_mv.as_mv.col / 8;
+
+        if (bsize >= BLOCK_8X8) {
+          for (int r_idx = 0; r_idx < 4; r_idx++) {
+            // Propagate the new motion vectors to a lower level
+            PC_TREE *sub_tree = pc_tree->split[r_idx];
+            sub_tree->start_mvs[ref] = pc_tree->start_mvs[ref];
+          }
+        }
+      }
+    }
+  }
+
+  return best_ref;
+}
+
+// Collects features using simple_motion_search and stores them in features.
+// The features are also cached in PC_TREE. By default, the features collected
+// are the sse and var from the subblocks flagged by features_to_get.
+// Furthermore, if features is not NULL, then 7 more features are appended to
+// the end of features:
+// - log(1.0 + dc_q ** 2)
+// - whether an above macroblock exists
+// - width of above macroblock
+// - height of above macroblock
+// - whether a left macroblock exists
+// - width of left macroblock
+// - height of left macroblock
+static AOM_INLINE void simple_motion_search_prune_part_features(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+    int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get) {
+  const int w_mi = mi_size_wide[bsize];
+  const int h_mi = mi_size_high[bsize];
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+  assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] ||
+         cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
+
+  // Setting up motion search
+  const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ?
ALTREF_FRAME + : LAST_FRAME }; + const int num_refs = 1; + const int use_subpixel = 1; + + // Doing whole block first to update the mv + if (!pc_tree->sms_none_valid && features_to_get & FEATURE_SMS_NONE_FLAG) { + simple_motion_search_get_best_ref(cpi, x, pc_tree, mi_row, mi_col, bsize, + ref_list, num_refs, use_subpixel, 1, + &pc_tree->sms_none_feat[0], + &pc_tree->sms_none_feat[1]); + pc_tree->sms_none_valid = 1; + } + + // Split subblocks + if (features_to_get & FEATURE_SMS_SPLIT_FLAG) { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + for (int r_idx = 0; r_idx < 4; r_idx++) { + const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2; + const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2; + PC_TREE *sub_tree = pc_tree->split[r_idx]; + + if (!sub_tree->sms_none_valid) { + simple_motion_search_get_best_ref( + cpi, x, sub_tree, sub_mi_row, sub_mi_col, subsize, ref_list, + num_refs, use_subpixel, 1, &sub_tree->sms_none_feat[0], + &sub_tree->sms_none_feat[1]); + sub_tree->sms_none_valid = 1; + } + } + } + + // Rectangular subblocks + if (!pc_tree->sms_rect_valid && features_to_get & FEATURE_SMS_RECT_FLAG) { + // Horz subblock + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ); + for (int r_idx = 0; r_idx < 2; r_idx++) { + const int sub_mi_col = mi_col + 0; + const int sub_mi_row = mi_row + r_idx * h_mi / 2; + + simple_motion_search_get_best_ref( + cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs, + use_subpixel, 0, &pc_tree->sms_rect_feat[2 * r_idx], + &pc_tree->sms_rect_feat[2 * r_idx + 1]); + } + + // Vert subblock + subsize = get_partition_subsize(bsize, PARTITION_VERT); + for (int r_idx = 0; r_idx < 2; r_idx++) { + const int sub_mi_col = mi_col + r_idx * w_mi / 2; + const int sub_mi_row = mi_row + 0; + + simple_motion_search_get_best_ref( + cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs, + use_subpixel, 0, &pc_tree->sms_rect_feat[4 + 2 * r_idx], + &pc_tree->sms_rect_feat[4 + 2 * r_idx + 1]); + } + pc_tree->sms_rect_valid = 1; + } + + if (!features) return; + + aom_clear_system_state(); + int f_idx = 0; + if (features_to_get & FEATURE_SMS_NONE_FLAG) { + for (int sub_idx = 0; sub_idx < 2; sub_idx++) { + features[f_idx++] = logf(1.0f + pc_tree->sms_none_feat[sub_idx]); + } + } + + if (features_to_get & FEATURE_SMS_SPLIT_FLAG) { + for (int sub_idx = 0; sub_idx < 4; sub_idx++) { + PC_TREE *sub_tree = pc_tree->split[sub_idx]; + features[f_idx++] = logf(1.0f + sub_tree->sms_none_feat[0]); + features[f_idx++] = logf(1.0f + sub_tree->sms_none_feat[1]); + } + } + + if (features_to_get & FEATURE_SMS_RECT_FLAG) { + for (int sub_idx = 0; sub_idx < 8; sub_idx++) { + features[f_idx++] = logf(1.0f + pc_tree->sms_rect_feat[sub_idx]); + } + } + + const MACROBLOCKD *xd = &x->e_mbd; + set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize); + + // Q_INDEX + const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); + features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f); + + // Neighbor stuff + const int has_above = !!xd->above_mbmi; + const int has_left = !!xd->left_mbmi; + const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->sb_type : bsize; + const BLOCK_SIZE left_bsize = has_left ? 
xd->left_mbmi->sb_type : bsize;
+  features[f_idx++] = (float)has_above;
+  features[f_idx++] = (float)mi_size_wide_log2[above_bsize];
+  features[f_idx++] = (float)mi_size_high_log2[above_bsize];
+  features[f_idx++] = (float)has_left;
+  features[f_idx++] = (float)mi_size_wide_log2[left_bsize];
+  features[f_idx++] = (float)mi_size_high_log2[left_bsize];
+}
+
+void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
+                                         PC_TREE *pc_tree, int mi_row,
+                                         int mi_col, BLOCK_SIZE bsize,
+                                         int *partition_horz_allowed,
+                                         int *partition_vert_allowed,
+                                         int *prune_horz, int *prune_vert) {
+  aom_clear_system_state();
+  const AV1_COMMON *const cm = &cpi->common;
+  const int bsize_idx = convert_bsize_to_idx(bsize);
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  // res_idx is 0 for lowres, 1 for 480p, 2 for 720p+
+  const int res_idx = is_480p_or_larger + is_720p_or_larger;
+
+  // Get model parameters
+  const NN_CONFIG *nn_config =
+      av1_simple_motion_search_prune_rect_nn_config[bsize_idx];
+  const float *ml_mean = av1_simple_motion_search_prune_rect_mean[bsize_idx],
+              *ml_std = av1_simple_motion_search_prune_rect_std[bsize_idx];
+
+  const int agg = cpi->sf.part_sf.simple_motion_search_prune_agg;
+  const float prune_thresh =
+      av1_simple_motion_search_prune_rect_thresh[agg][res_idx][bsize_idx];
+
+  // If there is no valid threshold, return immediately.
+  if (!nn_config || prune_thresh == 0.0f) {
+    return;
+  }
+
+  // Get features
+  float features[FEATURE_SIZE_SMS_PRUNE_PART] = { 0.0f };
+  simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+                                           bsize, features,
+                                           FEATURE_SMS_PRUNE_PART_FLAG);
+  for (int f_idx = 0; f_idx < FEATURE_SIZE_SMS_PRUNE_PART; f_idx++) {
+    features[f_idx] = (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
+  }
+
+  // Get probabilities
+  float scores[EXT_PARTITION_TYPES] = { 0.0f },
+        probs[EXT_PARTITION_TYPES] = { 0.0f };
+  const int num_classes = (bsize == BLOCK_128X128 || bsize == BLOCK_8X8)
+                              ? PARTITION_TYPES
+                              : EXT_PARTITION_TYPES;
+
+  av1_nn_predict(features, nn_config, 1, scores);
+  aom_clear_system_state();
+
+  av1_nn_softmax(scores, probs, num_classes);
+
+  // Determine if we should prune rectangular partitions.
+  if (cpi->sf.part_sf.simple_motion_search_prune_rect &&
+      !frame_is_intra_only(cm) &&
+      (*partition_horz_allowed || *partition_vert_allowed) &&
+      bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) {
+    *prune_horz = probs[PARTITION_HORZ] <= prune_thresh;
+    *prune_vert = probs[PARTITION_VERT] <= prune_thresh;
+  }
+}
+
+// Early terminates PARTITION_NONE using simple_motion_search features and the
+// rate, distortion, and rdcost of PARTITION_NONE. This is only called when:
+// - The frame is a show frame
+// - The frame is not intra only
+// - The current bsize is > BLOCK_8X8
+// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
+void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
+                                              MACROBLOCK *x, PC_TREE *pc_tree,
+                                              int mi_row, int mi_col,
+                                              BLOCK_SIZE bsize,
+                                              const RD_STATS *none_rdc,
+                                              int *early_terminate) {
+  // TODO(chiyotsai@google.com): There are other features we can extract from
+  // PARTITION_NONE. Play with this later.
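The body that follows gathers SMS and RD features and then applies what amounts to a single linear model over z-scored features, with the bias stored as the final model coefficient. A minimal standalone sketch of that scoring rule (the function name, sizes, and weights here are hypothetical, not part of the patch):

  static int linear_term_decision(const float *feat, const float *mean,
                                  const float *std, const float *model, int n) {
    float score = 0.0f;
    // z-score each feature and accumulate the weighted sum.
    for (int i = 0; i < n; i++)
      score += model[i] * (feat[i] - mean[i]) / std[i];
    score += model[n];     // the bias is stored after the n weights
    return score >= 0.0f;  // a non-negative score means terminate early
  }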
+ float features[FEATURE_SIZE_SMS_TERM_NONE] = { 0.0f }; + simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col, + bsize, features, + FEATURE_SMS_PRUNE_PART_FLAG); + int f_idx = FEATURE_SIZE_SMS_PRUNE_PART; + + features[f_idx++] = logf(1.0f + (float)none_rdc->rate); + features[f_idx++] = logf(1.0f + (float)none_rdc->dist); + features[f_idx++] = logf(1.0f + (float)none_rdc->rdcost); + + assert(f_idx == FEATURE_SIZE_SMS_TERM_NONE); + + const float *ml_mean = NULL; + const float *ml_std = NULL; + const float *ml_model = NULL; + + if (bsize == BLOCK_128X128) { + ml_mean = av1_simple_motion_search_term_none_mean_128; + ml_std = av1_simple_motion_search_term_none_std_128; + ml_model = av1_simple_motion_search_term_none_model_128; + } else if (bsize == BLOCK_64X64) { + ml_mean = av1_simple_motion_search_term_none_mean_64; + ml_std = av1_simple_motion_search_term_none_std_64; + ml_model = av1_simple_motion_search_term_none_model_64; + } else if (bsize == BLOCK_32X32) { + ml_mean = av1_simple_motion_search_term_none_mean_32; + ml_std = av1_simple_motion_search_term_none_std_32; + ml_model = av1_simple_motion_search_term_none_model_32; + } else if (bsize == BLOCK_16X16) { + ml_mean = av1_simple_motion_search_term_none_mean_16; + ml_std = av1_simple_motion_search_term_none_std_16; + ml_model = av1_simple_motion_search_term_none_model_16; + } else { + assert(0 && "Unexpected block size in simple_motion_term_none"); + } + + if (ml_model) { + float score = 0.0f; + for (f_idx = 0; f_idx < FEATURE_SIZE_SMS_TERM_NONE; f_idx++) { + score += + ml_model[f_idx] * (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx]; + } + score += ml_model[FEATURE_SIZE_SMS_TERM_NONE]; + + if (score >= 0.0f) { + *early_terminate = 1; + } + } +} + +void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x, + int mi_row, int mi_col, + float *features) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + + assert(sb_size == BLOCK_128X128); + + int f_idx = 0; + + const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); + aom_clear_system_state(); + const float log_q_sq = logf(1.0f + (float)(dc_q * dc_q) / 256.0f); + + // Perform full-pixel single motion search in Y plane of 16x16 mbs in the sb + float sum_mv_row_sq = 0; + float sum_mv_row = 0; + float min_abs_mv_row = FLT_MAX; + float max_abs_mv_row = 0; + + float sum_mv_col_sq = 0; + float sum_mv_col = 0; + float min_abs_mv_col = FLT_MAX; + float max_abs_mv_col = 0; + + float sum_log_sse_sq = 0; + float sum_log_sse = 0; + float min_log_sse = FLT_MAX; + float max_log_sse = 0; + + const BLOCK_SIZE mb_size = BLOCK_16X16; + const int mb_rows = block_size_high[sb_size] / block_size_high[mb_size]; + const int mb_cols = block_size_wide[sb_size] / block_size_wide[mb_size]; + const int mb_in_mi_size_high_log2 = mi_size_high_log2[mb_size]; + const int mb_in_mi_size_wide_log2 = mi_size_wide_log2[mb_size]; + + for (int mb_row = 0; mb_row < mb_rows; mb_row++) + for (int mb_col = 0; mb_col < mb_cols; mb_col++) { + const int this_mi_row = mi_row + (mb_row << mb_in_mi_size_high_log2); + const int this_mi_col = mi_col + (mb_col << mb_in_mi_size_wide_log2); + unsigned int sse = 0; + unsigned int var = 0; + const FULLPEL_MV start_mv = kZeroFullMv; + int_mv best_mv = av1_simple_motion_sse_var( + cpi, x, this_mi_row, this_mi_col, mb_size, start_mv, 0, &sse, &var); + + aom_clear_system_state(); + const float mv_row = (float)(best_mv.as_mv.row / 8); + const float mv_col = 
(float)(best_mv.as_mv.col / 8); + const float log_sse = logf(1.0f + (float)sse); + const float abs_mv_row = fabsf(mv_row); + const float abs_mv_col = fabsf(mv_col); + + sum_mv_row_sq += mv_row * mv_row; + sum_mv_row += mv_row; + sum_mv_col_sq += mv_col * mv_col; + sum_mv_col += mv_col; + + if (abs_mv_row < min_abs_mv_row) min_abs_mv_row = abs_mv_row; + if (abs_mv_row > max_abs_mv_row) max_abs_mv_row = abs_mv_row; + if (abs_mv_col < min_abs_mv_col) min_abs_mv_col = abs_mv_col; + if (abs_mv_col > max_abs_mv_col) max_abs_mv_col = abs_mv_col; + + sum_log_sse_sq += log_sse * log_sse; + sum_log_sse += log_sse; + if (log_sse < min_log_sse) min_log_sse = log_sse; + if (log_sse > max_log_sse) max_log_sse = log_sse; + } + aom_clear_system_state(); + const float avg_mv_row = sum_mv_row / 64.0f; + const float var_mv_row = sum_mv_row_sq / 64.0f - avg_mv_row * avg_mv_row; + + const float avg_mv_col = sum_mv_col / 64.0f; + const float var_mv_col = sum_mv_col_sq / 64.0f - avg_mv_col * avg_mv_col; + + const float avg_log_sse = sum_log_sse / 64.0f; + const float var_log_sse = sum_log_sse_sq / 64.0f - avg_log_sse * avg_log_sse; + + features[f_idx++] = avg_log_sse; + features[f_idx++] = avg_mv_col; + features[f_idx++] = avg_mv_row; + features[f_idx++] = log_q_sq; + features[f_idx++] = max_abs_mv_col; + features[f_idx++] = max_abs_mv_row; + features[f_idx++] = max_log_sse; + features[f_idx++] = min_abs_mv_col; + features[f_idx++] = min_abs_mv_row; + features[f_idx++] = min_log_sse; + features[f_idx++] = var_log_sse; + features[f_idx++] = var_mv_col; + features[f_idx++] = var_mv_row; + + assert(f_idx == FEATURE_SIZE_MAX_MIN_PART_PRED); +} + +BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x, + const float *features) { + float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }, + probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }; + const NN_CONFIG *nn_config = &av1_max_part_pred_nn_config; + + assert(cpi->sf.part_sf.auto_max_partition_based_on_simple_motion != + NOT_IN_USE); + + aom_clear_system_state(); + av1_nn_predict(features, nn_config, 1, scores); + av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED); + + int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; + if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == + DIRECT_PRED) { + result = 0; + float max_prob = probs[0]; + for (int i = 1; i < MAX_NUM_CLASSES_MAX_MIN_PART_PRED; ++i) { + if (probs[i] > max_prob) { + max_prob = probs[i]; + result = i; + } + } + } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == + RELAXED_PRED) { + for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0; + --result) { + if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) { + probs[result] += probs[result + 1]; + } + if (probs[result] > 0.2) break; + } + } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == + ADAPT_PRED) { + const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size; + MACROBLOCKD *const xd = &x->e_mbd; + // TODO(debargha): x->source_variance is unavailable at this point, + // so compute. The redundant recomputation later can be removed. + const unsigned int source_variance = + is_cur_buf_hbd(xd) + ? av1_high_get_sby_perpixel_variance(cpi, &x->plane[0].src, sb_size, + xd->bd) + : av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, sb_size); + if (source_variance > 16) { + const double thresh = source_variance < 128 ? 
0.05 : 0.1; + for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0; + --result) { + if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) { + probs[result] += probs[result + 1]; + } + if (probs[result] > thresh) break; + } + } + } + + return (BLOCK_SIZE)((result + 2) * 3); +} + +// Get the minimum partition block width and height(in log scale) under a +// PC_TREE. +static AOM_INLINE void get_min_bsize(const PC_TREE *pc_tree, int *min_bw, + int *min_bh) { + if (!pc_tree) return; + + const BLOCK_SIZE bsize = pc_tree->block_size; + if (bsize == BLOCK_4X4) { + *min_bw = 0; + *min_bh = 0; + return; + } + + PARTITION_TYPE part_type = pc_tree->partitioning; + if (part_type == PARTITION_INVALID) return; + + if (part_type == PARTITION_SPLIT) { + for (int i = 0; i < 4; ++i) { + get_min_bsize(pc_tree->split[i], min_bw, min_bh); + } + } else { + if (part_type == PARTITION_HORZ_A || part_type == PARTITION_HORZ_B || + part_type == PARTITION_VERT_A || part_type == PARTITION_VERT_B) + part_type = PARTITION_SPLIT; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, part_type); + if (subsize != BLOCK_INVALID) { + *min_bw = AOMMIN(*min_bw, mi_size_wide_log2[subsize]); + *min_bh = AOMMIN(*min_bh, mi_size_high_log2[subsize]); + } + } +} + +static INLINE void add_rd_feature(int64_t rd, int64_t best_rd, float *features, + int *feature_idx) { + const int rd_valid = rd > 0 && rd < INT64_MAX; + const float rd_ratio = rd_valid ? (float)rd / best_rd : 1.0f; + features[(*feature_idx)++] = (float)rd_valid; + features[(*feature_idx)++] = rd_ratio; +} + +#define FEATURES 31 +void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x, + PC_TREE *const pc_tree, BLOCK_SIZE bsize, + int64_t best_rd, int64_t part_none_rd, + int64_t part_split_rd, + int64_t *split_block_rd, int mi_row, + int mi_col, + int *const terminate_partition_search) { + if (best_rd <= 0 || best_rd == INT64_MAX || *terminate_partition_search) + return; + + const AV1_COMMON *const cm = &cpi->common; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + const NN_CONFIG *nn_config = NULL; + float thresh = -1e6; + switch (bsize) { + case BLOCK_128X128: break; + case BLOCK_64X64: + nn_config = &av1_early_term_after_split_nnconfig_64; + thresh = is_480p_or_larger ? -2.0f : -1.2f; + break; + case BLOCK_32X32: + nn_config = &av1_early_term_after_split_nnconfig_32; + thresh = is_480p_or_larger ? -2.6f : -2.3f; + break; + case BLOCK_16X16: + nn_config = &av1_early_term_after_split_nnconfig_16; + thresh = is_480p_or_larger ? -2.0f : -2.4f; + break; + case BLOCK_8X8: + nn_config = &av1_early_term_after_split_nnconfig_8; + thresh = is_480p_or_larger ? -1.0f : -1.4f; + break; + case BLOCK_4X4: break; + default: + assert(0 && "Invalid block size in av1_ml_early_term_after_split()."); + break; + } + if (!nn_config) return; + + // Use more conservative threshold for level 1. 
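To make the gating concrete: the threshold is looked up per block size and resolution, then shifted for the more conservative speed level. For example, taking values from the switch above, BLOCK_32X32 at 480p or larger uses thresh = -2.6f; below speed-feature level 2 the adjustment that follows lowers it to -2.9f, so the NN score must express even less confidence before the partition search is terminated:

  //   BLOCK_32X32, >= 480p : thresh = -2.6  ->  level 1: thresh = -2.9
  //   terminate the search when score < thresh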
+ if (cpi->sf.part_sf.ml_early_term_after_part_split_level < 2) thresh -= 0.3f; + + const MACROBLOCKD *const xd = &x->e_mbd; + const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); + const int bs = block_size_wide[bsize]; + int f_idx = 0; + float features[FEATURES] = { 0.0f }; + + aom_clear_system_state(); + + features[f_idx++] = logf(1.0f + (float)dc_q / 4.0f); + features[f_idx++] = logf(1.0f + (float)best_rd / bs / bs / 1024.0f); + + add_rd_feature(part_none_rd, best_rd, features, &f_idx); + add_rd_feature(part_split_rd, best_rd, features, &f_idx); + + for (int i = 0; i < 4; ++i) { + add_rd_feature(split_block_rd[i], best_rd, features, &f_idx); + int min_bw = MAX_SB_SIZE_LOG2; + int min_bh = MAX_SB_SIZE_LOG2; + get_min_bsize(pc_tree->split[i], &min_bw, &min_bh); + features[f_idx++] = (float)min_bw; + features[f_idx++] = (float)min_bh; + } + + simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col, + bsize, NULL, + FEATURE_SMS_PRUNE_PART_FLAG); + + features[f_idx++] = logf(1.0f + (float)pc_tree->sms_none_feat[1]); + + features[f_idx++] = logf(1.0f + (float)pc_tree->split[0]->sms_none_feat[1]); + features[f_idx++] = logf(1.0f + (float)pc_tree->split[1]->sms_none_feat[1]); + features[f_idx++] = logf(1.0f + (float)pc_tree->split[2]->sms_none_feat[1]); + features[f_idx++] = logf(1.0f + (float)pc_tree->split[3]->sms_none_feat[1]); + + features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[1]); + features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[3]); + features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[5]); + features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[7]); + + assert(f_idx == FEATURES); + + float score = 0.0f; + av1_nn_predict(features, nn_config, 1, &score); + // Score is indicator of confidence that we should NOT terminate. + if (score < thresh) *terminate_partition_search = 1; +} +#undef FEATURES + +void av1_ml_prune_rect_partition(const AV1_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + int64_t best_rd, int64_t none_rd, + int64_t *split_rd, int *const dst_prune_horz, + int *const dst_prune_vert) { + if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return; + best_rd = AOMMAX(best_rd, 1); + const NN_CONFIG *nn_config = NULL; + const float prob_thresholds[5] = { 0.01f, 0.01f, 0.004f, 0.002f, 0.002f }; + float cur_thresh = 0.0f; + switch (bsize) { + case BLOCK_8X8: + nn_config = &av1_rect_partition_nnconfig_8; + cur_thresh = prob_thresholds[0]; + break; + case BLOCK_16X16: + nn_config = &av1_rect_partition_nnconfig_16; + cur_thresh = prob_thresholds[1]; + break; + case BLOCK_32X32: + nn_config = &av1_rect_partition_nnconfig_32; + cur_thresh = prob_thresholds[2]; + break; + case BLOCK_64X64: + nn_config = &av1_rect_partition_nnconfig_64; + cur_thresh = prob_thresholds[3]; + break; + case BLOCK_128X128: + nn_config = &av1_rect_partition_nnconfig_128; + cur_thresh = prob_thresholds[4]; + break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config) return; + aom_clear_system_state(); + + // 1. 
Compute input features + float features[9]; + + // RD cost ratios + for (int i = 0; i < 5; i++) features[i] = 1.0f; + if (none_rd > 0 && none_rd < 1000000000) + features[0] = (float)none_rd / (float)best_rd; + for (int i = 0; i < 4; i++) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + features[1 + i] = (float)split_rd[i] / (float)best_rd; + } + + // Variance ratios + const MACROBLOCKD *const xd = &x->e_mbd; + int whole_block_variance; + if (is_cur_buf_hbd(xd)) { + whole_block_variance = av1_high_get_sby_perpixel_variance( + cpi, &x->plane[0].src, bsize, xd->bd); + } else { + whole_block_variance = + av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + } + whole_block_variance = AOMMAX(whole_block_variance, 1); + + int split_variance[4]; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + struct buf_2d buf; + buf.stride = x->plane[0].src.stride; + const int bw = block_size_wide[bsize]; + for (int i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bw / 2; + const int y_idx = (i >> 1) * bw / 2; + buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride; + if (is_cur_buf_hbd(xd)) { + split_variance[i] = + av1_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd); + } else { + split_variance[i] = av1_get_sby_perpixel_variance(cpi, &buf, subsize); + } + } + + for (int i = 0; i < 4; i++) + features[5 + i] = (float)split_variance[i] / (float)whole_block_variance; + + // 2. Do the prediction and prune 0-2 partitions based on their probabilities + float raw_scores[3] = { 0.0f }; + av1_nn_predict(features, nn_config, 1, raw_scores); + aom_clear_system_state(); + float probs[3] = { 0.0f }; + av1_nn_softmax(raw_scores, probs, 3); + + // probs[0] is the probability of the fact that both rectangular partitions + // are worse than current best_rd + if (probs[1] <= cur_thresh) (*dst_prune_horz) = 1; + if (probs[2] <= cur_thresh) (*dst_prune_vert) = 1; +} + +// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be +// considered. +void av1_ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx, + int64_t best_rd, int64_t horz_rd[2], + int64_t vert_rd[2], int64_t split_rd[4], + int *const horza_partition_allowed, + int *const horzb_partition_allowed, + int *const verta_partition_allowed, + int *const vertb_partition_allowed) { + if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return; + const NN_CONFIG *nn_config = NULL; + switch (bsize) { + case BLOCK_8X8: nn_config = NULL; break; + case BLOCK_16X16: nn_config = &av1_ab_partition_nnconfig_16; break; + case BLOCK_32X32: nn_config = &av1_ab_partition_nnconfig_32; break; + case BLOCK_64X64: nn_config = &av1_ab_partition_nnconfig_64; break; + case BLOCK_128X128: nn_config = &av1_ab_partition_nnconfig_128; break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config) return; + + aom_clear_system_state(); + + // Generate features. 
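A preview of where the features assembled below end up: the AB model emits 16 scores, one per joint label, and each label index is itself a 4-bit allow-mask (bit 0 = HORZ_A, bit 1 = HORZ_B, bit 2 = VERT_A, bit 3 = VERT_B). Every label scoring within a block-size-dependent margin of the best label turns its partitions on. A sketch of that decoding (ab_allow_mask and the margin are hypothetical illustrations, not code from the patch):

  static unsigned ab_allow_mask(const int score[16], int margin) {
    int best = score[0];
    for (int i = 1; i < 16; i++) best = score[i] > best ? score[i] : best;
    unsigned mask = 0;
    for (int i = 0; i < 16; i++) {
      // The label index doubles as the 4-bit partition mask, so OR-ing the
      // surviving indices accumulates every allowed AB partition.
      if (score[i] >= best - margin) mask |= (unsigned)i;
    }
    return mask;
  }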
+ float features[10]; + int feature_index = 0; + features[feature_index++] = (float)part_ctx; + features[feature_index++] = (float)var_ctx; + const int rdcost = (int)AOMMIN(INT_MAX, best_rd); + int sub_block_rdcost[8] = { 0 }; + int rd_index = 0; + for (int i = 0; i < 2; ++i) { + if (horz_rd[i] > 0 && horz_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)horz_rd[i]; + ++rd_index; + } + for (int i = 0; i < 2; ++i) { + if (vert_rd[i] > 0 && vert_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)vert_rd[i]; + ++rd_index; + } + for (int i = 0; i < 4; ++i) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)split_rd[i]; + ++rd_index; + } + for (int i = 0; i < 8; ++i) { + // Ratio between the sub-block RD and the whole-block RD. + float rd_ratio = 1.0f; + if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost) + rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost; + features[feature_index++] = rd_ratio; + } + assert(feature_index == 10); + + // Calculate scores using the NN model. + float score[16] = { 0.0f }; + av1_nn_predict(features, nn_config, 1, score); + aom_clear_system_state(); + int int_score[16]; + int max_score = -1000; + for (int i = 0; i < 16; ++i) { + int_score[i] = (int)(100 * score[i]); + max_score = AOMMAX(int_score[i], max_score); + } + + // Make decisions based on the model scores. + int thresh = max_score; + switch (bsize) { + case BLOCK_16X16: thresh -= 150; break; + case BLOCK_32X32: thresh -= 100; break; + default: break; + } + *horza_partition_allowed = 0; + *horzb_partition_allowed = 0; + *verta_partition_allowed = 0; + *vertb_partition_allowed = 0; + for (int i = 0; i < 16; ++i) { + if (int_score[i] >= thresh) { + if ((i >> 0) & 1) *horza_partition_allowed = 1; + if ((i >> 1) & 1) *horzb_partition_allowed = 1; + if ((i >> 2) & 1) *verta_partition_allowed = 1; + if ((i >> 3) & 1) *vertb_partition_allowed = 1; + } + } +} + +#define FEATURES 18 +#define LABELS 4 +// Use a ML model to predict if horz4 and vert4 should be considered. +void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int part_ctx, int64_t best_rd, + int64_t horz_rd[2], int64_t vert_rd[2], + int64_t split_rd[4], + int *const partition_horz4_allowed, + int *const partition_vert4_allowed, + unsigned int pb_source_variance, int mi_row, + int mi_col) { + if (best_rd >= 1000000000) return; + const NN_CONFIG *nn_config = NULL; + switch (bsize) { + case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break; + case BLOCK_32X32: nn_config = &av1_4_partition_nnconfig_32; break; + case BLOCK_64X64: nn_config = &av1_4_partition_nnconfig_64; break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config) return; + + aom_clear_system_state(); + + // Generate features. 
+ float features[FEATURES]; + int feature_index = 0; + features[feature_index++] = (float)part_ctx; + features[feature_index++] = (float)get_unsigned_bits(pb_source_variance); + + const int rdcost = (int)AOMMIN(INT_MAX, best_rd); + int sub_block_rdcost[8] = { 0 }; + int rd_index = 0; + for (int i = 0; i < 2; ++i) { + if (horz_rd[i] > 0 && horz_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)horz_rd[i]; + ++rd_index; + } + for (int i = 0; i < 2; ++i) { + if (vert_rd[i] > 0 && vert_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)vert_rd[i]; + ++rd_index; + } + for (int i = 0; i < 4; ++i) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)split_rd[i]; + ++rd_index; + } + for (int i = 0; i < 8; ++i) { + // Ratio between the sub-block RD and the whole-block RD. + float rd_ratio = 1.0f; + if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost) + rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost; + features[feature_index++] = rd_ratio; + } + + // Get variance of the 1:4 and 4:1 sub-blocks. + unsigned int horz_4_source_var[4] = { 0 }; + unsigned int vert_4_source_var[4] = { 0 }; + { + BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4); + BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4); + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, + av1_num_planes(&cpi->common), bsize); + const int src_stride = x->plane[0].src.stride; + uint8_t *src = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + + struct buf_2d horz_4_src, vert_4_src; + horz_4_src.stride = src_stride; + vert_4_src.stride = src_stride; + + for (int i = 0; i < 4; ++i) { + horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride; + vert_4_src.buf = src + i * block_size_wide[vert_4_bs]; + + if (is_cur_buf_hbd(xd)) { + horz_4_source_var[i] = av1_high_get_sby_perpixel_variance( + cpi, &horz_4_src, horz_4_bs, xd->bd); + vert_4_source_var[i] = av1_high_get_sby_perpixel_variance( + cpi, &vert_4_src, vert_4_bs, xd->bd); + } else { + horz_4_source_var[i] = + av1_get_sby_perpixel_variance(cpi, &horz_4_src, horz_4_bs); + vert_4_source_var[i] = + av1_get_sby_perpixel_variance(cpi, &vert_4_src, vert_4_bs); + } + } + } + + const float denom = (float)(pb_source_variance + 1); + const float low_b = 0.1f; + const float high_b = 10.0f; + for (int i = 0; i < 4; ++i) { + // Ratio between the 4:1 sub-block variance and the whole-block variance. + float var_ratio = (float)(horz_4_source_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features[feature_index++] = var_ratio; + } + for (int i = 0; i < 4; ++i) { + // Ratio between the 1:4 sub-block RD and the whole-block RD. + float var_ratio = (float)(vert_4_source_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features[feature_index++] = var_ratio; + } + assert(feature_index == FEATURES); + + // Calculate scores using the NN model. + float score[LABELS] = { 0.0f }; + av1_nn_predict(features, nn_config, 1, score); + aom_clear_system_state(); + int int_score[LABELS]; + int max_score = -1000; + for (int i = 0; i < LABELS; ++i) { + int_score[i] = (int)(100 * score[i]); + max_score = AOMMAX(int_score[i], max_score); + } + + // Make decisions based on the model scores. 
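The decision stage that follows mirrors the AB-partition case but with only four labels, where bit 0 of the label enables HORZ_4 and bit 1 enables VERT_4. A worked instance with hypothetical scaled scores and the BLOCK_64X64 margin of 200:

  //   scores = {120, -240, 90, -100}, thresh = 120 - 200 = -80
  //   survivors: label 0 (binary 00), label 2 (binary 10)
  //   -> partition_vert4_allowed = 1 (bit 1 of label 2),
  //      partition_horz4_allowed = 0 (no surviving label has bit 0 set)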
+ int thresh = max_score; + switch (bsize) { + case BLOCK_16X16: thresh -= 500; break; + case BLOCK_32X32: thresh -= 500; break; + case BLOCK_64X64: thresh -= 200; break; + default: break; + } + *partition_horz4_allowed = 0; + *partition_vert4_allowed = 0; + for (int i = 0; i < LABELS; ++i) { + if (int_score[i] >= thresh) { + if ((i >> 0) & 1) *partition_horz4_allowed = 1; + if ((i >> 1) & 1) *partition_vert4_allowed = 1; + } + } +} +#undef FEATURES +#undef LABELS + +#define FEATURES 4 +int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + const MACROBLOCK *const x, + const RD_STATS *const rd_stats, + unsigned int pb_source_variance) { + const NN_CONFIG *nn_config = NULL; + int thresh = 0; + switch (bsize) { + case BLOCK_8X8: + nn_config = &av1_partition_breakout_nnconfig_8; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[0]; + break; + case BLOCK_16X16: + nn_config = &av1_partition_breakout_nnconfig_16; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[1]; + break; + case BLOCK_32X32: + nn_config = &av1_partition_breakout_nnconfig_32; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[2]; + break; + case BLOCK_64X64: + nn_config = &av1_partition_breakout_nnconfig_64; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[3]; + break; + case BLOCK_128X128: + nn_config = &av1_partition_breakout_nnconfig_128; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[4]; + break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config || thresh < 0) return 0; + + // Generate feature values. + float features[FEATURES]; + int feature_index = 0; + aom_clear_system_state(); + + const int num_pels_log2 = num_pels_log2_lookup[bsize]; + float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX); + rate_f = ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) * + rate_f; + features[feature_index++] = rate_f; + + const float dist_f = + (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2); + features[feature_index++] = dist_f; + + features[feature_index++] = (float)pb_source_variance; + + const int dc_q = (int)x->plane[0].dequant_QTX[0]; + features[feature_index++] = (float)(dc_q * dc_q) / 256.0f; + assert(feature_index == FEATURES); + + // Calculate score using the NN model. + float score = 0.0f; + av1_nn_predict(features, nn_config, 1, &score); + aom_clear_system_state(); + + // Make decision. + return (int)(score * 100) >= thresh; +} +#undef FEATURES +#endif // !CONFIG_REALTIME_ONLY diff --git a/libs/libaom/src/av1/encoder/partition_strategy.h b/libs/libaom/src/av1/encoder/partition_strategy.h new file mode 100644 index 000000000..f9b4d8bfd --- /dev/null +++ b/libs/libaom/src/av1/encoder/partition_strategy.h @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
+#define AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encoder.h"
+
+#define FEATURE_SIZE_SMS_SPLIT_FAST 6
+#define FEATURE_SIZE_SMS_SPLIT 17
+#define FEATURE_SIZE_SMS_PRUNE_PART 25
+#define FEATURE_SIZE_SMS_TERM_NONE 28
+#define FEATURE_SIZE_FP_SMS_TERM_NONE 20
+#define FEATURE_SIZE_MAX_MIN_PART_PRED 13
+#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4
+
+#define FEATURE_SMS_NONE_FLAG 1
+#define FEATURE_SMS_SPLIT_FLAG (1 << 1)
+#define FEATURE_SMS_RECT_FLAG (1 << 2)
+
+#define FEATURE_SMS_PRUNE_PART_FLAG \
+  (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG)
+#define FEATURE_SMS_SPLIT_MODEL_FLAG \
+  (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG)
+
+void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                  int bsize, int label_idx,
+                                  int *partition_none_allowed,
+                                  int *partition_horz_allowed,
+                                  int *partition_vert_allowed,
+                                  int *do_rectangular_split,
+                                  int *do_square_split);
+
+// Performs a simple_motion_search with a single reference frame and extracts
+// the variance of residues. Then uses the features to determine whether we
+// want to go straight to splitting without trying PARTITION_NONE.
+void av1_simple_motion_search_based_split(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+    int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
+    int *partition_horz_allowed, int *partition_vert_allowed,
+    int *do_rectangular_split, int *do_square_split);
+
+// Performs a simple_motion_search with two reference frames and extracts
+// the variance of residues. Then uses the features to determine whether we
+// want to prune some partitions.
+void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
+                                         PC_TREE *pc_tree, int mi_row,
+                                         int mi_col, BLOCK_SIZE bsize,
+                                         int *partition_horz_allowed,
+                                         int *partition_vert_allowed,
+                                         int *prune_horz, int *prune_vert);
+
+#if !CONFIG_REALTIME_ONLY
+// Early terminates PARTITION_NONE using simple_motion_search features and the
+// rate, distortion, and rdcost of PARTITION_NONE. This is only called when:
+// - The frame is a show frame
+// - The frame is not intra only
+// - The current bsize is > BLOCK_8X8
+// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
+void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
+                                              MACROBLOCK *x, PC_TREE *pc_tree,
+                                              int mi_row, int mi_col,
+                                              BLOCK_SIZE bsize,
+                                              const RD_STATS *none_rdc,
+                                              int *early_terminate);
+
+// Get the features for selecting the max and min partition size. Currently
+// this performs simple_motion_search on 16X16 subblocks of the current
+// superblock, and then extracts the statistics of sse and motion vectors as
+// features.
+void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
+                                        int mi_row, int mi_col,
+                                        float *features);
+
+// Predict the maximum BLOCK_SIZE to be used to encode the current superblock.
+BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+                                     const float *features);
+
+// Attempts an early termination after PARTITION_SPLIT.
+void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
+                                   PC_TREE *const pc_tree, BLOCK_SIZE bsize,
+                                   int64_t best_rd, int64_t part_none_rd,
+                                   int64_t part_split_rd,
+                                   int64_t *split_block_rd, int mi_row,
+                                   int mi_col,
+                                   int *const terminate_partition_search);
+
+// Use the rdcost ratio and source var ratio to prune PARTITION_HORZ and
+// PARTITION_VERT.
+// TODO(chiyotsai@google.com): Currently this model does not use q value and has
+// no information about rectangular partitions. Preliminary experiments suggest
+// that we can get better performance by adding in q_index and rectangular
+// sse/var from SMS. We should retrain and tune this model later.
+void av1_ml_prune_rect_partition(const AV1_COMP *const cpi,
+                                 const MACROBLOCK *const x, BLOCK_SIZE bsize,
+                                 int64_t best_rd, int64_t none_rd,
+                                 int64_t *split_rd, int *const dst_prune_horz,
+                                 int *const dst_prune_vert);
+
+// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
+// considered.
+void av1_ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
+                               int64_t best_rd, int64_t horz_rd[2],
+                               int64_t vert_rd[2], int64_t split_rd[4],
+                               int *const horza_partition_allowed,
+                               int *const horzb_partition_allowed,
+                               int *const verta_partition_allowed,
+                               int *const vertb_partition_allowed);
+
+// Use a ML model to predict if horz4 and vert4 should be considered.
+void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                              BLOCK_SIZE bsize, int part_ctx, int64_t best_rd,
+                              int64_t horz_rd[2], int64_t vert_rd[2],
+                              int64_t split_rd[4],
+                              int *const partition_horz4_allowed,
+                              int *const partition_vert4_allowed,
+                              unsigned int pb_source_variance, int mi_row,
+                              int mi_col);
+
+// ML-based partition search breakout after PARTITION_NONE
+int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                            const MACROBLOCK *const x,
+                            const RD_STATS *const rd_stats,
+                            unsigned int pb_source_variance);
+#endif  // !CONFIG_REALTIME_ONLY
+
+// A simplified version of set_offsets meant to be used for
+// simple_motion_search.
+static INLINE void set_offsets_for_motion_search(const AV1_COMP *const cpi,
+                                                 MACROBLOCK *const x,
+                                                 int mi_row, int mi_col,
+                                                 BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+
+  set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+                        mi_row, mi_col);
+
+  // Set up destination pointers.
+  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
+                       num_planes);
+
+  // Set up limit values for MV components.
+  // MVs beyond the range do not produce a new/different prediction block.
+  av1_set_mv_limits(mi_params, &x->mv_limits, mi_row, mi_col, mi_height,
+                    mi_width, cpi->oxcf.border_in_pixels);
+
+  set_plane_n4(xd, mi_width, mi_height, num_planes);
+
+  xd->mi_row = mi_row;
+  xd->mi_col = mi_col;
+
+  // Set up distance of MB to edge of frame in 1/8th pel units.
+  assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+  xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+  xd->mb_to_bottom_edge =
+      GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
+  xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
+  xd->mb_to_right_edge =
+      GET_MV_SUBPEL((mi_params->mi_cols - mi_width - mi_col) * MI_SIZE);
+
+  // Set up source buffers.
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+}
+
+static INLINE void init_simple_motion_search_mvs(PC_TREE *pc_tree) {
+  av1_zero(pc_tree->start_mvs);
+
+  av1_zero(pc_tree->sms_none_feat);
+  av1_zero(pc_tree->sms_rect_feat);
+  av1_zero(pc_tree->sms_none_valid);
+  av1_zero(pc_tree->sms_rect_valid);
+
+  if (pc_tree->block_size >= BLOCK_8X8) {
+    init_simple_motion_search_mvs(pc_tree->split[0]);
+    init_simple_motion_search_mvs(pc_tree->split[1]);
+    init_simple_motion_search_mvs(pc_tree->split[2]);
+    init_simple_motion_search_mvs(pc_tree->split[3]);
+  }
+}
+
+static INLINE int is_full_sb(const CommonModeInfoParams *const mi_params,
+                             int mi_row, int mi_col, BLOCK_SIZE sb_size) {
+  const int sb_mi_wide = mi_size_wide[sb_size];
+  const int sb_mi_high = mi_size_high[sb_size];
+
+  return (mi_row + sb_mi_high) <= mi_params->mi_rows &&
+         (mi_col + sb_mi_wide) <= mi_params->mi_cols;
+}
+
+// Do not use this criterion for screen content videos, since screen content
+// videos could often find good predictors and the largest block size is
+// likely to be used.
+static INLINE int use_auto_max_partition(AV1_COMP *const cpi,
+                                         BLOCK_SIZE sb_size, int mi_row,
+                                         int mi_col) {
+  assert(IMPLIES(cpi->gf_group.size > 0,
+                 cpi->gf_group.index < cpi->gf_group.size));
+  AV1_COMMON *const cm = &cpi->common;
+  return !frame_is_intra_only(cm) && !cpi->is_screen_content_type &&
+         cpi->sf.part_sf.auto_max_partition_based_on_simple_motion !=
+             NOT_IN_USE &&
+         sb_size == BLOCK_128X128 &&
+         is_full_sb(&cm->mi_params, mi_row, mi_col, sb_size) &&
+         cpi->gf_group.update_type[cpi->gf_group.index] != OVERLAY_UPDATE &&
+         cpi->gf_group.update_type[cpi->gf_group.index] != INTNL_OVERLAY_UPDATE;
+}
+
+#endif  // AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
diff --git a/libs/libaom/src/av1/encoder/pass2_strategy.c b/libs/libaom/src/av1/encoder/pass2_strategy.c
new file mode 100644
index 000000000..6adc1fbf9
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/pass2_strategy.c
@@ -0,0 +1,2895 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "aom_ports/system_state.h"
+
+#include "av1/common/av1_common_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/use_flat_gop_model_params.h"
+#include "av1/encoder/encode_strategy.h"
+
+#define DEFAULT_KF_BOOST 2300
+#define DEFAULT_GF_BOOST 2000
+#define GROUP_ADAPTIVE_MAXQ 1
+static void init_gf_stats(GF_GROUP_STATS *gf_stats);
+
+// Calculate an active area of the image that discounts formatting
+// bars and partially discounts other 0 energy areas.
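As a worked instance of the formula implemented just below (the input numbers are hypothetical): a letterboxed frame where 20% of MBs are intra-skip and 8 of 68 MB rows are inactive gives

  //   active_pct = 1.0 - (0.20 / 2 + (8 * 2) / 68.0)
  //              = 1.0 - (0.10 + 0.235) ~= 0.665

which already lies inside the [MIN_ACTIVE_AREA, MAX_ACTIVE_AREA] clamp.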
+#define MIN_ACTIVE_AREA 0.5 +#define MAX_ACTIVE_AREA 1.0 +static double calculate_active_area(const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *this_frame) { + const double active_pct = + 1.0 - + ((this_frame->intra_skip_pct / 2) + + ((this_frame->inactive_zone_rows * 2) / (double)frame_info->mb_rows)); + return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA); +} + +// Calculate a modified Error used in distributing bits between easier and +// harder frames. +#define ACT_AREA_CORRECTION 0.5 +static double calculate_modified_err(const FRAME_INFO *frame_info, + const TWO_PASS *twopass, + const AV1EncoderConfig *oxcf, + const FIRSTPASS_STATS *this_frame) { + const FIRSTPASS_STATS *const stats = twopass->stats_buf_ctx->total_stats; + if (stats == NULL) { + return 0; + } + const double av_weight = stats->weight / stats->count; + const double av_err = (stats->coded_error * av_weight) / stats->count; + double modified_error = + av_err * pow(this_frame->coded_error * this_frame->weight / + DOUBLE_DIVIDE_CHECK(av_err), + oxcf->two_pass_vbrbias / 100.0); + + // Correction for active area. Frames with a reduced active area + // (eg due to formatting bars) have a higher error per mb for the + // remaining active MBs. The correction here assumes that coding + // 0.5N blocks of complexity 2X is a little easier than coding N + // blocks of complexity X. + modified_error *= + pow(calculate_active_area(frame_info, this_frame), ACT_AREA_CORRECTION); + + return fclamp(modified_error, twopass->modified_error_min, + twopass->modified_error_max); +} + +// Resets the first pass file to the given position using a relative seek from +// the current position. +static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) { + p->stats_in = position; +} + +static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) { + if (p->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF; + + *fps = *p->stats_in; + ++p->stats_in; + return 1; +} + +static int input_stats_lap(TWO_PASS *p, FIRSTPASS_STATS *fps) { + if (p->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF; + + *fps = *p->stats_in; + /* Move old stats[0] out to accommodate for next frame stats */ + memmove(p->frame_stats_arr[0], p->frame_stats_arr[1], + (p->stats_buf_ctx->stats_in_end - p->stats_in - 1) * + sizeof(FIRSTPASS_STATS)); + p->stats_buf_ctx->stats_in_end--; + return 1; +} + +// Read frame stats at an offset from the current position. 
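The helper below returns NULL whenever the offset runs off either end of the stats buffer, so callers can probe lookahead frames without any bounds bookkeeping; detect_flash() later in this file is a typical caller. A usage sketch (the values and surrounding context are hypothetical):

  // Peek one frame ahead of the current first-pass position.
  const FIRSTPASS_STATS *next = read_frame_stats(twopass, 1);
  if (next != NULL && next->pcnt_second_ref > next->pcnt_inter) {
    // Likely recovering from a flash frame.
  }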
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) { + if ((offset >= 0 && p->stats_in + offset >= p->stats_buf_ctx->stats_in_end) || + (offset < 0 && p->stats_in + offset < p->stats_buf_ctx->stats_in_start)) { + return NULL; + } + + return &p->stats_in[offset]; +} + +static void subtract_stats(FIRSTPASS_STATS *section, + const FIRSTPASS_STATS *frame) { + section->frame -= frame->frame; + section->weight -= frame->weight; + section->intra_error -= frame->intra_error; + section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy; + section->coded_error -= frame->coded_error; + section->sr_coded_error -= frame->sr_coded_error; + section->pcnt_inter -= frame->pcnt_inter; + section->pcnt_motion -= frame->pcnt_motion; + section->pcnt_second_ref -= frame->pcnt_second_ref; + section->pcnt_neutral -= frame->pcnt_neutral; + section->intra_skip_pct -= frame->intra_skip_pct; + section->inactive_zone_rows -= frame->inactive_zone_rows; + section->inactive_zone_cols -= frame->inactive_zone_cols; + section->MVr -= frame->MVr; + section->mvr_abs -= frame->mvr_abs; + section->MVc -= frame->MVc; + section->mvc_abs -= frame->mvc_abs; + section->MVrv -= frame->MVrv; + section->MVcv -= frame->MVcv; + section->mv_in_out_count -= frame->mv_in_out_count; + section->new_mv_count -= frame->new_mv_count; + section->count -= frame->count; + section->duration -= frame->duration; +} + +// This function returns the maximum target rate per frame. +static int frame_max_bits(const RATE_CONTROL *rc, + const AV1EncoderConfig *oxcf) { + int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth * + (int64_t)oxcf->two_pass_vbrmax_section) / + 100; + if (max_bits < 0) + max_bits = 0; + else if (max_bits > rc->max_frame_bandwidth) + max_bits = rc->max_frame_bandwidth; + + return (int)max_bits; +} + +static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { 0.65, 0.70, 0.75, + 0.80, 0.85, 0.90, + 0.95, 0.95, 0.95 }; +#define ERR_DIVISOR 96.0 +static double calc_correction_factor(double err_per_mb, int q) { + const double error_term = err_per_mb / ERR_DIVISOR; + const int index = q >> 5; + // Adjustment to power term based on qindex + const double power_term = + q_pow_term[index] + + (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0); + assert(error_term >= 0.0); + return fclamp(pow(error_term, power_term), 0.05, 5.0); +} + +static void twopass_update_bpm_factor(TWO_PASS *twopass) { + // Based on recent history adjust expectations of bits per macroblock. + double last_group_rate_err = + (double)twopass->rolling_arf_group_actual_bits / + DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits); + last_group_rate_err = AOMMAX(0.25, AOMMIN(4.0, last_group_rate_err)); + twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0; + twopass->bpm_factor = AOMMAX(0.25, AOMMIN(4.0, twopass->bpm_factor)); +} + +static int qbpm_enumerator(int rate_err_tol) { + return 1350000 + ((300000 * AOMMIN(75, AOMMAX(rate_err_tol - 25, 0))) / 75); +} + +// Similar to find_qindex_by_rate() function in ratectrl.c, but includes +// calculation of a correction_factor. 
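The search below relies on bits-per-mb being monotonically non-increasing in qindex, so it is a standard lower-bound bisection. Stripped of the correction factor, the shape is as follows (bits_per_mb_at() is a hypothetical stand-in for the rate model):

  // Find the lowest q in [best_q, worst_q] whose bits/mb meets the target.
  int low = best_q, high = worst_q;
  while (low < high) {
    const int mid = (low + high) >> 1;
    if (bits_per_mb_at(mid) > desired_bits_per_mb)
      low = mid + 1;  // still spending too many bits; move toward higher q
    else
      high = mid;
  }
  // On exit, low == high is the first qindex satisfying the rate target.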
+static int find_qindex_by_rate_with_correction( + int desired_bits_per_mb, aom_bit_depth_t bit_depth, double error_per_mb, + double group_weight_factor, int rate_err_tol, int best_qindex, + int worst_qindex) { + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + + while (low < high) { + const int mid = (low + high) >> 1; + const double mid_factor = calc_correction_factor(error_per_mb, mid); + const double q = av1_convert_qindex_to_q(mid, bit_depth); + const int enumerator = qbpm_enumerator(rate_err_tol); + const int mid_bits_per_mb = + (int)((enumerator * mid_factor * group_weight_factor) / q); + + if (mid_bits_per_mb > desired_bits_per_mb) { + low = mid + 1; + } else { + high = mid; + } + } + return low; +} + +static int get_twopass_worst_quality(AV1_COMP *cpi, const double section_err, + double inactive_zone, + int section_target_bandwidth, + double group_weight_factor) { + const RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + + inactive_zone = fclamp(inactive_zone, 0.0, 1.0); + + if (section_target_bandwidth <= 0) { + return rc->worst_quality; // Highest value allowed + } else { + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) + ? cpi->initial_mbs + : cpi->common.mi_params.MBs; + const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); + const double av_err_per_mb = section_err / active_mbs; + const int target_norm_bits_per_mb = + (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) / + active_mbs; + int rate_err_tol = + AOMMIN(cpi->oxcf.under_shoot_pct, cpi->oxcf.over_shoot_pct); + + twopass_update_bpm_factor(&cpi->twopass); + // Try and pick a max Q that will be high enough to encode the + // content at the given rate. + int q = find_qindex_by_rate_with_correction( + target_norm_bits_per_mb, cpi->common.seq_params.bit_depth, + av_err_per_mb, group_weight_factor, rate_err_tol, rc->best_quality, + rc->worst_quality); + + // Restriction on active max q for constrained quality mode. + if (cpi->oxcf.rc_mode == AOM_CQ) q = AOMMAX(q, oxcf->cq_level); + return q; + } +} + +#define SR_DIFF_PART 0.0015 +#define MOTION_AMP_PART 0.003 +#define INTRA_PART 0.005 +#define DEFAULT_DECAY_LIMIT 0.75 +#define LOW_SR_DIFF_TRHESH 0.1 +#define SR_DIFF_MAX 128.0 +#define NCOUNT_FRAME_II_THRESH 5.0 + +static double get_sr_decay_rate(const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *frame) { + const int num_mbs = frame_info->num_mbs; + double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs; + double sr_decay = 1.0; + double modified_pct_inter; + double modified_pcnt_intra; + const double motion_amplitude_factor = + frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2); + + modified_pct_inter = frame->pcnt_inter; + if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < + (double)NCOUNT_FRAME_II_THRESH) { + modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral; + } + modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); + + if ((sr_diff > LOW_SR_DIFF_TRHESH)) { + sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX); + sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - + (MOTION_AMP_PART * motion_amplitude_factor) - + (INTRA_PART * modified_pcnt_intra); + } + return AOMMAX(sr_decay, AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter)); +} + +// This function gives an estimate of how badly we believe the prediction +// quality is decaying from frame to frame. 
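To make the combination in get_prediction_decay_rate below concrete (all inputs hypothetical): with pcnt_inter = 0.9, pcnt_motion = 0.4, and an sr_decay_rate of 0.8,

  //   zero_motion_factor = 0.95 * (0.9 - 0.4)^0.75       ~= 0.565
  //   decay = max(0.565, 0.8 + (1.0 - 0.8) * 0.565)      ~= 0.913

so a mostly static frame pair is estimated to retain roughly 91% of its prediction quality from one frame to the next.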
+static double get_zero_motion_factor(const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *frame) { + const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion; + double sr_decay = get_sr_decay_rate(frame_info, frame); + return AOMMIN(sr_decay, zero_motion_pct); +} + +#define ZM_POWER_FACTOR 0.75 + +static double get_prediction_decay_rate(const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *next_frame) { + const double sr_decay_rate = get_sr_decay_rate(frame_info, next_frame); + const double zero_motion_factor = + (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion), + ZM_POWER_FACTOR)); + + return AOMMAX(zero_motion_factor, + (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); +} + +// Function to test for a condition where a complex transition is followed +// by a static section. For example in slide shows where there is a fade +// between slides. This is to help with more optimal kf and gf positioning. +static int detect_transition_to_still(TWO_PASS *const twopass, + const int min_gf_interval, + const int frame_interval, + const int still_interval, + const double loop_decay_rate, + const double last_decay_rate) { + // Break clause to detect very still sections after motion + // For example a static image after a fade or other transition + // instead of a clean scene cut. + if (frame_interval > min_gf_interval && loop_decay_rate >= 0.999 && + last_decay_rate < 0.9) { + int j; + // Look ahead a few frames to see if static condition persists... + for (j = 0; j < still_interval; ++j) { + const FIRSTPASS_STATS *stats = &twopass->stats_in[j]; + if (stats >= twopass->stats_buf_ctx->stats_in_end) break; + + if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break; + } + // Only if it does do we signal a transition to still. + return j == still_interval; + } + return 0; +} + +// This function detects a flash through the high relative pcnt_second_ref +// score in the frame following a flash frame. The offset passed in should +// reflect this. +static int detect_flash(const TWO_PASS *twopass, const int offset) { + const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset); + + // What we are looking for here is a situation where there is a + // brief break in prediction (such as a flash) but subsequent frames + // are reasonably well predicted by an earlier (pre flash) frame. + // The recovery after a flash is indicated by a high pcnt_second_ref + // compared to pcnt_inter. + return next_frame != NULL && + next_frame->pcnt_second_ref > next_frame->pcnt_inter && + next_frame->pcnt_second_ref >= 0.5; +} + +// Update the motion related elements to the GF arf boost calculation. +static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, + GF_GROUP_STATS *gf_stats) { + const double pct = stats->pcnt_motion; + + // Accumulate Motion In/Out of frame stats. + gf_stats->this_frame_mv_in_out = stats->mv_in_out_count * pct; + gf_stats->mv_in_out_accumulator += gf_stats->this_frame_mv_in_out; + gf_stats->abs_mv_in_out_accumulator += fabs(gf_stats->this_frame_mv_in_out); + + // Accumulate a measure of how uniform (or conversely how random) the motion + // field is (a ratio of abs(mv) / mv). + if (pct > 0.05) { + const double mvr_ratio = + fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr)); + const double mvc_ratio = + fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc)); + + gf_stats->mv_ratio_accumulator += + pct * (mvr_ratio < stats->mvr_abs ? 
mvr_ratio : stats->mvr_abs); + gf_stats->mv_ratio_accumulator += + pct * (mvc_ratio < stats->mvc_abs ? mvc_ratio : stats->mvc_abs); + } +} + +static void accumulate_this_frame_stats(const FIRSTPASS_STATS *stats, + const double mod_frame_err, + GF_GROUP_STATS *gf_stats) { + gf_stats->gf_group_err += mod_frame_err; +#if GROUP_ADAPTIVE_MAXQ + gf_stats->gf_group_raw_error += stats->coded_error; +#endif + gf_stats->gf_group_skip_pct += stats->intra_skip_pct; + gf_stats->gf_group_inactive_zone_rows += stats->inactive_zone_rows; +} + +static void accumulate_next_frame_stats( + const FIRSTPASS_STATS *stats, const FRAME_INFO *frame_info, + TWO_PASS *const twopass, const int flash_detected, + const int frames_since_key, const int cur_idx, const int can_disable_arf, + const int min_gf_interval, GF_GROUP_STATS *gf_stats) { + accumulate_frame_motion_stats(stats, gf_stats); + // sum up the metric values of current gf group + gf_stats->avg_sr_coded_error += stats->sr_coded_error; + gf_stats->avg_tr_coded_error += stats->tr_coded_error; + gf_stats->avg_pcnt_second_ref += stats->pcnt_second_ref; + gf_stats->avg_pcnt_third_ref += stats->pcnt_third_ref; + gf_stats->avg_new_mv_count += stats->new_mv_count; + gf_stats->avg_wavelet_energy += stats->frame_avg_wavelet_energy; + if (fabs(stats->raw_error_stdev) > 0.000001) { + gf_stats->non_zero_stdev_count++; + gf_stats->avg_raw_err_stdev += stats->raw_error_stdev; + } + + // Accumulate the effect of prediction quality decay + if (!flash_detected) { + gf_stats->last_loop_decay_rate = gf_stats->loop_decay_rate; + gf_stats->loop_decay_rate = get_prediction_decay_rate(frame_info, stats); + + gf_stats->decay_accumulator = + gf_stats->decay_accumulator * gf_stats->loop_decay_rate; + + // Monitor for static sections. + if ((frames_since_key + cur_idx - 1) > 1) { + gf_stats->zero_motion_accumulator = + AOMMIN(gf_stats->zero_motion_accumulator, + get_zero_motion_factor(frame_info, stats)); + } + + // Break clause to detect very still sections after motion. For example, + // a static image after a fade or other transition. 
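+    // detect_transition_to_still() only fires after min_gf_interval, once the
+    // loop decay rate has jumped to >= 0.999 from below 0.9, and then checks
+    // that the next still_interval (here 5) frames remain effectively static.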
+ if (can_disable_arf && + detect_transition_to_still(twopass, min_gf_interval, cur_idx, 5, + gf_stats->loop_decay_rate, + gf_stats->last_loop_decay_rate)) { + gf_stats->allow_alt_ref = 0; + } + } +} + +static void average_gf_stats(const int total_frame, + const FIRSTPASS_STATS *last_stat, + GF_GROUP_STATS *gf_stats) { + if (total_frame) { + gf_stats->avg_sr_coded_error /= total_frame; + gf_stats->avg_tr_coded_error /= total_frame; + gf_stats->avg_pcnt_second_ref /= total_frame; + if (total_frame - 1) { + gf_stats->avg_pcnt_third_ref_nolast = + (gf_stats->avg_pcnt_third_ref - last_stat->pcnt_third_ref) / + (total_frame - 1); + } else { + gf_stats->avg_pcnt_third_ref_nolast = + gf_stats->avg_pcnt_third_ref / total_frame; + } + gf_stats->avg_pcnt_third_ref /= total_frame; + gf_stats->avg_new_mv_count /= total_frame; + gf_stats->avg_wavelet_energy /= total_frame; + } + + if (gf_stats->non_zero_stdev_count) + gf_stats->avg_raw_err_stdev /= gf_stats->non_zero_stdev_count; +} + +static void get_features_from_gf_stats(const GF_GROUP_STATS *gf_stats, + const GF_FRAME_STATS *first_frame, + const GF_FRAME_STATS *last_frame, + const int num_mbs, + const int constrained_gf_group, + const int kf_zeromotion_pct, + const int num_frames, float *features) { + *features++ = (float)gf_stats->abs_mv_in_out_accumulator; + *features++ = (float)(gf_stats->avg_new_mv_count / num_mbs); + *features++ = (float)gf_stats->avg_pcnt_second_ref; + *features++ = (float)gf_stats->avg_pcnt_third_ref; + *features++ = (float)gf_stats->avg_pcnt_third_ref_nolast; + *features++ = (float)(gf_stats->avg_sr_coded_error / num_mbs); + *features++ = (float)(gf_stats->avg_tr_coded_error / num_mbs); + *features++ = (float)(gf_stats->avg_wavelet_energy / num_mbs); + *features++ = (float)(constrained_gf_group); + *features++ = (float)gf_stats->decay_accumulator; + *features++ = (float)(first_frame->frame_coded_error / num_mbs); + *features++ = (float)(first_frame->frame_sr_coded_error / num_mbs); + *features++ = (float)(first_frame->frame_tr_coded_error / num_mbs); + *features++ = (float)(first_frame->frame_err / num_mbs); + *features++ = (float)(kf_zeromotion_pct); + *features++ = (float)(last_frame->frame_coded_error / num_mbs); + *features++ = (float)(last_frame->frame_sr_coded_error / num_mbs); + *features++ = (float)(last_frame->frame_tr_coded_error / num_mbs); + *features++ = (float)num_frames; + *features++ = (float)gf_stats->mv_ratio_accumulator; + *features++ = (float)gf_stats->non_zero_stdev_count; +} + +#define BOOST_FACTOR 12.5 +static double baseline_err_per_mb(const FRAME_INFO *frame_info) { + unsigned int screen_area = frame_info->frame_height * frame_info->frame_width; + + // Use a different error per mb factor for calculating boost for + // different formats. + if (screen_area <= 640 * 360) { + return 500.0; + } else { + return 1000.0; + } +} + +static double calc_frame_boost(const RATE_CONTROL *rc, + const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *this_frame, + double this_frame_mv_in_out, double max_boost) { + double frame_boost; + const double lq = av1_convert_qindex_to_q(rc->avg_frame_qindex[INTER_FRAME], + frame_info->bit_depth); + const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5); + const double active_area = calculate_active_area(frame_info, this_frame); + int num_mbs = frame_info->num_mbs; + + // Correct for any inactive region in the image + num_mbs = (int)AOMMAX(1, num_mbs * active_area); + + // Underlying boost factor is based on inter error ratio. 
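+  // For example (hypothetical numbers): with a fully active frame whose
+  // intra_error / coded_error ratio is 4.0, the base boost below is roughly
+  // 4.0 * BOOST_FACTOR = 50.0, before the q correction and the max_boost cap.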
+  frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * num_mbs,
+                       this_frame->intra_error * active_area) /
+                DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+  frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
+
+  // Increase boost for frames where new data is coming into the frame
+  // (e.g. zoom out). Slightly reduce boost if there is a net balance of
+  // motion out of the frame (zoom in). The range for this_frame_mv_in_out is
+  // -1.0 to +1.0.
+  if (this_frame_mv_in_out > 0.0)
+    frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+  // In the extreme case the boost is halved.
+  else
+    frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
+
+  return AOMMIN(frame_boost, max_boost * boost_q_correction);
+}
+
+static double calc_kf_frame_boost(const RATE_CONTROL *rc,
+                                  const FRAME_INFO *frame_info,
+                                  const FIRSTPASS_STATS *this_frame,
+                                  double *sr_accumulator, double max_boost) {
+  double frame_boost;
+  const double lq = av1_convert_qindex_to_q(rc->avg_frame_qindex[INTER_FRAME],
+                                            frame_info->bit_depth);
+  const double boost_q_correction = AOMMIN((0.50 + (lq * 0.015)), 2.00);
+  const double active_area = calculate_active_area(frame_info, this_frame);
+  int num_mbs = frame_info->num_mbs;
+
+  // Correct for any inactive region in the image
+  num_mbs = (int)AOMMAX(1, num_mbs * active_area);
+
+  // Underlying boost factor is based on inter error ratio.
+  frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * num_mbs,
+                       this_frame->intra_error * active_area) /
+                DOUBLE_DIVIDE_CHECK(
+                    (this_frame->coded_error + *sr_accumulator) * active_area);
+
+  // Update the accumulator for second ref error difference.
+  // This is intended to give an indication of how much the coded error is
+  // increasing over time.
+  *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error);
+  *sr_accumulator = AOMMAX(0.0, *sr_accumulator);
+
+  // Q correction and scaling
+  // The 40.0 value here is an experimentally derived baseline minimum.
+  // This value is in line with the minimum per frame boost in the alt_ref
+  // boost calculation.
+  frame_boost = ((frame_boost + 40.0) * boost_q_correction);
+
+  return AOMMIN(frame_boost, max_boost * boost_q_correction);
+}
+
+static int get_projected_gfu_boost(const RATE_CONTROL *rc, int gfu_boost,
+                                   int frames_to_project,
+                                   int num_stats_used_for_gfu_boost) {
+  /*
+   * If frames_to_project is equal to num_stats_used_for_gfu_boost,
+   * it means that gfu_boost was calculated over frames_to_project to
+   * begin with (i.e., all stats required were available), hence return
+   * the original boost.
+   */
+  if (num_stats_used_for_gfu_boost >= frames_to_project) return gfu_boost;
+
+  double min_boost_factor = sqrt(rc->baseline_gf_interval);
+  // Get the current tpl factor (number of frames = frames_to_project).
+  double tpl_factor = av1_get_gfu_boost_projection_factor(
+      min_boost_factor, MAX_GFUBOOST_FACTOR, frames_to_project);
+  // Get the tpl factor when number of frames = num_stats_used_for_gfu_boost.
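+  // The projected boost below is the original boost rescaled by the ratio of
+  // the two factors:
+  //   projected = gfu_boost * f(frames_to_project) / f(num_stats_used),
+  // where f() is av1_get_gfu_boost_projection_factor().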
+  double tpl_factor_num_stats = av1_get_gfu_boost_projection_factor(
+      min_boost_factor, MAX_GFUBOOST_FACTOR, num_stats_used_for_gfu_boost);
+  int projected_gfu_boost =
+      (int)rint((tpl_factor * gfu_boost) / tpl_factor_num_stats);
+  return projected_gfu_boost;
+}
+
+#define GF_MAX_BOOST 90.0
+#define MIN_DECAY_FACTOR 0.01
+int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc,
+                       FRAME_INFO *frame_info, int offset, int f_frames,
+                       int b_frames, int *num_fpstats_used,
+                       int *num_fpstats_required) {
+  int i;
+  GF_GROUP_STATS gf_stats;
+  init_gf_stats(&gf_stats);
+  double boost_score = (double)NORMAL_BOOST;
+  int arf_boost;
+  int flash_detected = 0;
+  if (num_fpstats_used) *num_fpstats_used = 0;
+
+  // Search forward from the proposed arf/next gf position.
+  for (i = 0; i < f_frames; ++i) {
+    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+    if (this_frame == NULL) break;
+
+    // Update the motion related elements to the boost calculation.
+    accumulate_frame_motion_stats(this_frame, &gf_stats);
+
+    // We want to discount the flash frame itself and the recovery
+    // frame that follows as both will have poor scores.
+    flash_detected = detect_flash(twopass, i + offset) ||
+                     detect_flash(twopass, i + offset + 1);
+
+    // Accumulate the effect of prediction quality decay.
+    if (!flash_detected) {
+      gf_stats.decay_accumulator *=
+          get_prediction_decay_rate(frame_info, this_frame);
+      gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
+                                       ? MIN_DECAY_FACTOR
+                                       : gf_stats.decay_accumulator;
+    }
+
+    boost_score +=
+        gf_stats.decay_accumulator *
+        calc_frame_boost(rc, frame_info, this_frame,
+                         gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
+    if (num_fpstats_used) (*num_fpstats_used)++;
+  }
+
+  arf_boost = (int)boost_score;
+
+  // Reset for backward looking loop.
+  boost_score = 0.0;
+  init_gf_stats(&gf_stats);
+  // Search backward towards last gf position.
+  for (i = -1; i >= -b_frames; --i) {
+    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+    if (this_frame == NULL) break;
+
+    // Update the motion related elements to the boost calculation.
+    accumulate_frame_motion_stats(this_frame, &gf_stats);
+
+    // We want to discount the flash frame itself and the recovery
+    // frame that follows as both will have poor scores.
+    flash_detected = detect_flash(twopass, i + offset) ||
+                     detect_flash(twopass, i + offset + 1);
+
+    // Cumulative effect of prediction quality decay.
+    if (!flash_detected) {
+      gf_stats.decay_accumulator *=
+          get_prediction_decay_rate(frame_info, this_frame);
+      gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
+                                       ? MIN_DECAY_FACTOR
+                                       : gf_stats.decay_accumulator;
+    }
+
+    boost_score +=
+        gf_stats.decay_accumulator *
+        calc_frame_boost(rc, frame_info, this_frame,
+                         gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
+    if (num_fpstats_used) (*num_fpstats_used)++;
+  }
+  arf_boost += (int)boost_score;
+
+  if (num_fpstats_required) {
+    *num_fpstats_required = f_frames + b_frames;
+    if (num_fpstats_used) {
+      arf_boost = get_projected_gfu_boost(rc, arf_boost, *num_fpstats_required,
+                                          *num_fpstats_used);
+    }
+  }
+
+  if (arf_boost < ((b_frames + f_frames) * 50))
+    arf_boost = ((b_frames + f_frames) * 50);
+
+  return arf_boost;
+}
+
+// Calculate a section intra ratio used in setting max loop filter.
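+// The ratio returned is sum(intra_error) / sum(coded_error) over up to
+// section_length frames, truncated to an integer; larger values indicate
+// that inter prediction is working well relative to intra for the section.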
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin, + const FIRSTPASS_STATS *end, + int section_length) { + const FIRSTPASS_STATS *s = begin; + double intra_error = 0.0; + double coded_error = 0.0; + int i = 0; + + while (s < end && i < section_length) { + intra_error += s->intra_error; + coded_error += s->coded_error; + ++s; + ++i; + } + + return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error)); +} + +// Calculate the total bits to allocate in this GF/ARF group. +static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi, + double gf_group_err) { + const RATE_CONTROL *const rc = &cpi->rc; + const TWO_PASS *const twopass = &cpi->twopass; + const int max_bits = frame_max_bits(rc, &cpi->oxcf); + int64_t total_group_bits; + + // Calculate the bits to be allocated to the group as a whole. + if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) { + total_group_bits = (int64_t)(twopass->kf_group_bits * + (gf_group_err / twopass->kf_group_error_left)); + } else { + total_group_bits = 0; + } + + // Clamp odd edge cases. + total_group_bits = (total_group_bits < 0) + ? 0 + : (total_group_bits > twopass->kf_group_bits) + ? twopass->kf_group_bits + : total_group_bits; + + // Clip based on user supplied data rate variability limit. + if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) + total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; + + return total_group_bits; +} + +// Calculate the number of bits to assign to boosted frames in a group. +static int calculate_boost_bits(int frame_count, int boost, + int64_t total_group_bits) { + int allocation_chunks; + + // return 0 for invalid inputs (could arise e.g. through rounding errors) + if (!boost || (total_group_bits <= 0)) return 0; + + if (frame_count <= 0) return (int)(AOMMIN(total_group_bits, INT_MAX)); + + allocation_chunks = (frame_count * 100) + boost; + + // Prevent overflow. + if (boost > 1023) { + int divisor = boost >> 10; + boost /= divisor; + allocation_chunks /= divisor; + } + + // Calculate the number of extra bits for use in the boosted frame or frames. + return AOMMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), + 0); +} + +// Calculate the boost factor based on the number of bits assigned, i.e. the +// inverse of calculate_boost_bits(). +static int calculate_boost_factor(int frame_count, int bits, + int64_t total_group_bits) { + aom_clear_system_state(); + return (int)(100.0 * frame_count * bits / (total_group_bits - bits)); +} + +// Reduce the number of bits assigned to keyframe or arf if necessary, to +// prevent bitrate spikes that may break level constraints. +// frame_type: 0: keyframe; 1: arf. 
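+// For each operating point with a target level, a per-frame bit budget is
+// derived from the level's maximum bitrate; keyframes are capped at 8x and
+// ARFs at 4x that budget, and when a cap binds the corresponding boost is
+// recomputed from the capped allocation via calculate_boost_factor().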
+static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi, + RATE_CONTROL *const rc, + int bits_assigned, + int64_t group_bits, + int frame_type) { + const AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + const int temporal_layer_id = cm->temporal_layer_id; + const int spatial_layer_id = cm->spatial_layer_id; + for (int index = 0; index < seq_params->operating_points_cnt_minus_1 + 1; + ++index) { + if (!is_in_operating_point(seq_params->operating_point_idc[index], + temporal_layer_id, spatial_layer_id)) { + continue; + } + + const AV1_LEVEL target_level = + cpi->level_params.target_seq_level_idx[index]; + if (target_level >= SEQ_LEVELS) continue; + + assert(is_valid_seq_level_idx(target_level)); + + const double level_bitrate_limit = av1_get_max_bitrate_for_level( + target_level, seq_params->tier[0], seq_params->profile); + const int target_bits_per_frame = + (int)(level_bitrate_limit / cpi->framerate); + if (frame_type == 0) { + // Maximum bits for keyframe is 8 times the target_bits_per_frame. + const int level_enforced_max_kf_bits = target_bits_per_frame * 8; + if (bits_assigned > level_enforced_max_kf_bits) { + const int frames = rc->frames_to_key - 1; + rc->kf_boost = calculate_boost_factor( + frames, level_enforced_max_kf_bits, group_bits); + bits_assigned = calculate_boost_bits(frames, rc->kf_boost, group_bits); + } + } else if (frame_type == 1) { + // Maximum bits for arf is 4 times the target_bits_per_frame. + const int level_enforced_max_arf_bits = target_bits_per_frame * 4; + if (bits_assigned > level_enforced_max_arf_bits) { + rc->gfu_boost = calculate_boost_factor( + rc->baseline_gf_interval, level_enforced_max_arf_bits, group_bits); + bits_assigned = calculate_boost_bits(rc->baseline_gf_interval, + rc->gfu_boost, group_bits); + } + } else { + assert(0); + } + } + + return bits_assigned; +} + +// Compile time switch on alternate algorithm to allocate bits in ARF groups +// #define ALT_ARF_ALLOCATION +#ifdef ALT_ARF_ALLOCATION +double layer_fraction[MAX_ARF_LAYERS + 1] = { 1.0, 0.70, 0.55, 0.60, + 0.60, 1.0, 1.0 }; +static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc, + int64_t gf_group_bits, int gf_arf_bits, + int key_frame, int use_arf) { + int64_t total_group_bits = gf_group_bits; + int base_frame_bits; + const int gf_group_size = gf_group->size; + int layer_frames[MAX_ARF_LAYERS + 1] = { 0 }; + + // Subtract the extra bits set aside for ARF frames from the Group Total + if (use_arf || !key_frame) total_group_bits -= gf_arf_bits; + + if (rc->baseline_gf_interval) + base_frame_bits = (int)(total_group_bits / rc->baseline_gf_interval); + else + base_frame_bits = (int)1; + + // For key frames the frame target rate is already set and it + // is also the golden frame. + // === [frame_index == 0] === + int frame_index = 0; + if (!key_frame) { + if (rc->source_alt_ref_active) + gf_group->bit_allocation[frame_index] = 0; + else + gf_group->bit_allocation[frame_index] = + base_frame_bits + (int)(gf_arf_bits * layer_fraction[1]); + } + frame_index++; + + // Check the number of frames in each layer in case we have a + // non standard group length. 
+  int max_arf_layer = gf_group->max_layer_depth - 1;
+  for (int idx = frame_index; idx < gf_group_size; ++idx) {
+    if ((gf_group->update_type[idx] == ARF_UPDATE) ||
+        (gf_group->update_type[idx] == INTNL_ARF_UPDATE)) {
+      // max_arf_layer = AOMMAX(max_arf_layer, gf_group->layer_depth[idx]);
+      layer_frames[gf_group->layer_depth[idx]]++;
+    }
+  }
+
+  // Allocate extra bits to each ARF layer
+  int i;
+  int layer_extra_bits[MAX_ARF_LAYERS + 1] = { 0 };
+  for (i = 1; i <= max_arf_layer; ++i) {
+    double fraction = (i == max_arf_layer) ? 1.0 : layer_fraction[i];
+    layer_extra_bits[i] =
+        (int)((gf_arf_bits * fraction) / AOMMAX(1, layer_frames[i]));
+    gf_arf_bits -= (int)(gf_arf_bits * fraction);
+  }
+
+  // Now combine ARF layer and baseline bits to give total bits for each frame.
+  int arf_extra_bits;
+  for (int idx = frame_index; idx < gf_group_size; ++idx) {
+    switch (gf_group->update_type[idx]) {
+      case ARF_UPDATE:
+      case INTNL_ARF_UPDATE:
+        arf_extra_bits = layer_extra_bits[gf_group->layer_depth[idx]];
+        gf_group->bit_allocation[idx] = base_frame_bits + arf_extra_bits;
+        break;
+      case INTNL_OVERLAY_UPDATE:
+      case OVERLAY_UPDATE: gf_group->bit_allocation[idx] = 0; break;
+      default: gf_group->bit_allocation[idx] = base_frame_bits; break;
+    }
+  }
+
+  // Set the frame following the current GOP to 0 bit allocation. For ARF
+  // groups, this next frame will be the overlay frame, which is the first
+  // frame in the next GOP. For a GF group, the next GOP will overwrite the
+  // rate allocation. Setting this frame to use 0 bits (out of the current
+  // GOP budget) will simplify the logic in reference frame management.
+  gf_group->bit_allocation[gf_group_size] = 0;
+}
+#else
+static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc,
+                                   int64_t gf_group_bits, int gf_arf_bits,
+                                   int key_frame, int use_arf) {
+  int64_t total_group_bits = gf_group_bits;
+
+  // For key frames the frame target rate is already set and it
+  // is also the golden frame.
+  // === [frame_index == 0] ===
+  int frame_index = 0;
+  if (!key_frame) {
+    if (rc->source_alt_ref_active)
+      gf_group->bit_allocation[frame_index] = 0;
+    else
+      gf_group->bit_allocation[frame_index] = gf_arf_bits;
+  }
+
+  // Deduct the boost bits for arf (or gf if it is not a key frame)
+  // from the group total.
+  if (use_arf || !key_frame) total_group_bits -= gf_arf_bits;
+
+  frame_index++;
+
+  // Store the bits to spend on the ARF if there is one.
+  // === [frame_index == 1] ===
+  if (use_arf) {
+    gf_group->bit_allocation[frame_index] = gf_arf_bits;
+    ++frame_index;
+  }
+
+  const int gf_group_size = gf_group->size;
+  int arf_depth_bits[MAX_ARF_LAYERS + 1] = { 0 };
+  int arf_depth_count[MAX_ARF_LAYERS + 1] = { 0 };
+  int arf_depth_boost[MAX_ARF_LAYERS + 1] = { 0 };
+  int total_arfs = 0;
+  int total_overlays = rc->source_alt_ref_active;
+
+  for (int idx = 0; idx < gf_group_size; ++idx) {
+    if (gf_group->update_type[idx] == ARF_UPDATE ||
+        gf_group->update_type[idx] == INTNL_ARF_UPDATE ||
+        gf_group->update_type[idx] == LF_UPDATE) {
+      arf_depth_boost[gf_group->layer_depth[idx]] += gf_group->arf_boost[idx];
+      ++arf_depth_count[gf_group->layer_depth[idx]];
+    }
+  }
+
+  for (int idx = 2; idx <= MAX_ARF_LAYERS; ++idx) {
+    arf_depth_bits[idx] =
+        calculate_boost_bits(rc->baseline_gf_interval - total_arfs -
+                                 total_overlays - arf_depth_count[idx],
+                             arf_depth_boost[idx], total_group_bits);
+    total_group_bits -= arf_depth_bits[idx];
+    total_arfs += arf_depth_count[idx];
+  }
+
+  for (int idx = frame_index; idx < gf_group_size; ++idx) {
+    switch (gf_group->update_type[idx]) {
+      case ARF_UPDATE:
+      case INTNL_ARF_UPDATE:
+      case LF_UPDATE:
+        gf_group->bit_allocation[idx] =
+            (int)(((int64_t)arf_depth_bits[gf_group->layer_depth[idx]] *
+                   gf_group->arf_boost[idx]) /
+                  arf_depth_boost[gf_group->layer_depth[idx]]);
+        break;
+      case INTNL_OVERLAY_UPDATE:
+      case OVERLAY_UPDATE:
+      default: gf_group->bit_allocation[idx] = 0; break;
+    }
+  }
+
+  // Set the frame following the current GOP to 0 bit allocation. For ARF
+  // groups, this next frame will be the overlay frame, which is the first
+  // frame in the next GOP. For a GF group, the next GOP will overwrite the
+  // rate allocation. Setting this frame to use 0 bits (out of the current
+  // GOP budget) will simplify the logic in reference frame management.
+  gf_group->bit_allocation[gf_group_size] = 0;
+}
+#endif
+
+// Returns true if KF group and GF group both are almost completely static.
+static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion) {
+  return (gf_zero_motion >= 0.995) &&
+         (kf_zero_motion >= STATIC_KF_GROUP_THRESH);
+}
+
+#define ARF_ABS_ZOOM_THRESH 4.4
+static INLINE int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start,
+                                int flash_detected, int active_max_gf_interval,
+                                int active_min_gf_interval,
+                                GF_GROUP_STATS *gf_stats) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  // Motion breakout threshold for loop below depends on image size.
+  const double mv_ratio_accumulator_thresh =
+      (cpi->initial_height + cpi->initial_width) / 4.0;
+
+  if (!flash_detected) {
+    // Break clause to detect very still sections after motion. For example,
+    // a static image after a fade or other transition.
+    if (detect_transition_to_still(
+            twopass, rc->min_gf_interval, frame_index - cur_start, 5,
+            gf_stats->loop_decay_rate, gf_stats->last_loop_decay_rate)) {
+      return 1;
+    }
+  }
+
+  // Some conditions to break out after min interval.
+  if (frame_index - cur_start >= active_min_gf_interval &&
+      // If possible don't break very close to a kf
+      (rc->frames_to_key - frame_index >= rc->min_gf_interval) &&
+      ((frame_index - cur_start) & 0x01) && !flash_detected &&
+      (gf_stats->mv_ratio_accumulator > mv_ratio_accumulator_thresh ||
+       gf_stats->abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH)) {
+    return 1;
+  }
+
+  // If almost totally static, we will not use the max GF length later,
+  // so we can continue for more frames.
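+  // Otherwise the group is cut as soon as its length exceeds
+  // active_max_gf_interval.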
+  if (((frame_index - cur_start) >= active_max_gf_interval + 1) &&
+      !is_almost_static(gf_stats->zero_motion_accumulator,
+                        twopass->kf_zeromotion_pct)) {
+    return 1;
+  }
+  return 0;
+}
+
+#define MAX_PAD_GF_CHECK 6  // padding length to check for gf length
+#define AVG_SI_THRES 0.6    // threshold for average silhouette
+#define GF_SHRINK_OUTPUT 0  // print output for gf length decision
+int determine_high_err_gf(double *errs, int *is_high, double *si, int len,
+                          double *ratio, int gf_start, int gf_end,
+                          int before_pad) {
+  (void)gf_start;
+  (void)gf_end;
+  (void)before_pad;
+  // alpha and beta control the threshold placement
+  // e.g. a smaller alpha makes the lower group more rigid
+  const double alpha = 0.5;
+  const double beta = 1 - alpha;
+  double mean = 0;
+  double mean_low = 0;
+  double mean_high = 0;
+  double prev_mean_low = 0;
+  double prev_mean_high = 0;
+  int count_low = 0;
+  int count_high = 0;
+  // calculate mean of errs
+  for (int i = 0; i < len; i++) {
+    mean += errs[i];
+  }
+  mean /= len;
+  // separate into two initial groups with greater / lower than mean
+  for (int i = 0; i < len; i++) {
+    if (errs[i] <= mean) {
+      is_high[i] = 0;
+      count_low++;
+      prev_mean_low += errs[i];
+    } else {
+      is_high[i] = 1;
+      count_high++;
+      prev_mean_high += errs[i];
+    }
+  }
+  prev_mean_low /= count_low;
+  prev_mean_high /= count_high;
+  // k-means to refine
+  int count = 0;
+  while (count < 10) {
+    // re-group
+    mean_low = 0;
+    mean_high = 0;
+    count_low = 0;
+    count_high = 0;
+    double thres = prev_mean_low * alpha + prev_mean_high * beta;
+    for (int i = 0; i < len; i++) {
+      if (errs[i] <= thres) {
+        is_high[i] = 0;
+        count_low++;
+        mean_low += errs[i];
+      } else {
+        is_high[i] = 1;
+        count_high++;
+        mean_high += errs[i];
+      }
+    }
+    mean_low /= count_low;
+    mean_high /= count_high;
+
+    // break if not changed much
+    if (fabs((mean_low - prev_mean_low) / (prev_mean_low + 0.00001)) <
+            0.00001 &&
+        fabs((mean_high - prev_mean_high) / (prev_mean_high + 0.00001)) <
+            0.00001)
+      break;
+
+    // update means
+    prev_mean_high = mean_high;
+    prev_mean_low = mean_low;
+
+    count++;
+  }
+
+  // count how many group-change jumps occur across the sequence
+  int num_change = 0;
+  for (int i = 0; i < len - 1; i++) {
+    if (is_high[i] != is_high[i + 1]) num_change++;
+  }
+
+  // get silhouette as a measure of the classification quality
+  double avg_si = 0;
+  // ai: avg dist of its own class, bi: avg dist to the other class
+  double ai, bi;
+  if (count_low > 1 && count_high > 1) {
+    for (int i = 0; i < len; i++) {
+      ai = 0;
+      bi = 0;
+      // calculate average distance to everyone in the same group
+      // and in the other group
+      for (int j = 0; j < len; j++) {
+        if (i == j) continue;
+        if (is_high[i] == is_high[j]) {
+          ai += fabs(errs[i] - errs[j]);
+        } else {
+          bi += fabs(errs[i] - errs[j]);
+        }
+      }
+      if (is_high[i] == 0) {
+        ai = ai / (count_low - 1);
+        bi = bi / count_high;
+      } else {
+        ai = ai / (count_high - 1);
+        bi = bi / count_low;
+      }
+      if (ai <= bi) {
+        si[i] = 1 - ai / (bi + 0.00001);
+      } else {
+        si[i] = bi / (ai + 0.00001) - 1;
+      }
+      avg_si += si[i];
+    }
+    avg_si /= len;
+  }
+
+  int reset = 0;
+  *ratio = mean_high / (mean_low + 0.00001);
+  // If the two groups are too similar, there are too many group changes, or
+  // the silhouette is too small, the classification is not confident; reset
+  // everything to 0 later so we fall back to the original decision.
+  if (*ratio < 1.3 || num_change > AOMMAX(len / 3, 6) ||
+      avg_si < AVG_SI_THRES) {
+    reset = 1;
+  }
+
+#if GF_SHRINK_OUTPUT
+  printf("\n");
+  for (int i = 0; i < len; i++) {
+    printf("%d: err %.1f, ishigh %d, si %.2f, (i=%d)\n",
+           gf_start + i - before_pad, errs[i], is_high[i], si[i], gf_end);
+  }
+  printf(
+      "count: %d, mean_high: %.1f, mean_low: %.1f, avg_si: %.2f, num_change: "
+      "%d, ratio %.2f, reset: %d\n",
+      count, mean_high, mean_low, avg_si, num_change,
+      mean_high / (mean_low + 0.000001), reset);
+#endif
+
+  if (reset) {
+    memset(is_high, 0, sizeof(is_high[0]) * len);
+    memset(si, 0, sizeof(si[0]) * len);
+  }
+  return reset;
+}
+
+#if GROUP_ADAPTIVE_MAXQ
+#define RC_FACTOR_MIN 0.75
+#define RC_FACTOR_MAX 1.25
+#endif  // GROUP_ADAPTIVE_MAXQ
+#define MIN_FWD_KF_INTERVAL 8
+#define MIN_SHRINK_LEN 6  // the minimum length of gf if we are shrinking
+#define SI_HIGH AVG_SI_THRES  // high quality classification
+#define SI_LOW 0.3            // very unsure classification
+// This function finds a low-error frame prior to the current last frame in
+// the gf group, and sets the last frame to it.
+// The resulting last frame is returned via *cur_last_ptr.
+// *cur_start_ptr and cut_pos[n] could also change due to shrinking
+// previous gf groups.
+void set_last_prev_low_err(int *cur_start_ptr, int *cur_last_ptr, int *cut_pos,
+                           int count_cuts, int before_pad, double ratio,
+                           int *is_high, double *si, int prev_lows) {
+  int n;
+  int cur_start = *cur_start_ptr;
+  int cur_last = *cur_last_ptr;
+  for (n = cur_last; n >= cur_start + MIN_SHRINK_LEN; n--) {
+    // try to find a point that is very likely to be good
+    if (is_high[n - cur_start + before_pad] == 0 &&
+        si[n - cur_start + before_pad] > SI_HIGH) {
+      *cur_last_ptr = n;
+      return;
+    }
+  }
+  // Could not find a low-err point, so try to find an "unsure"
+  // point at least.
+  for (n = cur_last; n >= cur_start + MIN_SHRINK_LEN; n--) {
+    if ((is_high[n - cur_start + before_pad] == 0) ||
+        (is_high[n - cur_start + before_pad] &&
+         si[n - cur_start + before_pad] < SI_LOW)) {
+      *cur_last_ptr = n;
+      return;
+    }
+  }
+  if (prev_lows) {
+    // try with shrinking previous all_zero interval
+    for (n = cur_start + MIN_SHRINK_LEN - 1; n > cur_start; n--) {
+      if (is_high[n - cur_start + before_pad] == 0 &&
+          si[n - cur_start + before_pad] > SI_HIGH) {
+        int tentative_start = n - MIN_SHRINK_LEN;
+        // check if the previous interval can shrink this much
+        int available =
+            tentative_start - cut_pos[count_cuts - 2] > MIN_SHRINK_LEN &&
+            cur_start - tentative_start < prev_lows;
+        // shrinking too aggressively may worsen performance
+        // set stricter thres for shorter length
+        double ratio_thres =
+            1.0 * (cur_start - tentative_start) / (double)(MIN_SHRINK_LEN) +
+            1.0;
+
+        if (available && (ratio > ratio_thres)) {
+          cut_pos[count_cuts - 1] = tentative_start;
+          *cur_start_ptr = tentative_start;
+          *cur_last_ptr = n;
+          return;
+        }
+      }
+    }
+  }
+  if (prev_lows) {
+    // try with shrinking previous all_zero interval with unsure points
+    for (n = cur_start + MIN_SHRINK_LEN - 1; n > cur_start; n--) {
+      if ((is_high[n - cur_start + before_pad] == 0) ||
+          (is_high[n - cur_start + before_pad] &&
+           si[n - cur_start + before_pad] < SI_LOW)) {
+        int tentative_start = n - MIN_SHRINK_LEN;
+        // check if the previous interval can shrink this much
+        int available =
+            tentative_start - cut_pos[count_cuts - 2] > MIN_SHRINK_LEN &&
+            cur_start - tentative_start < prev_lows;
+        // shrinking too aggressively may worsen performance
+        double ratio_thres =
+            1.0 * (cur_start - tentative_start) / (double)(MIN_SHRINK_LEN) +
+            1.0;
+
+        if (available && (ratio > ratio_thres)) {
+          cut_pos[count_cuts - 1] = tentative_start;
+          *cur_start_ptr = tentative_start;
+          *cur_last_ptr = n;
+          return;
+        }
+      }
+    }
+  }  // 
prev_lows + return; +} + +// This function decides the gf group length of future frames in batch +// rc->gf_intervals is modified to store the group lengths +static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, + int max_intervals) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + FIRSTPASS_STATS next_frame; + const FIRSTPASS_STATS *const start_pos = twopass->stats_in; + FRAME_INFO *frame_info = &cpi->frame_info; + int i; + + int flash_detected; + + aom_clear_system_state(); + av1_zero(next_frame); + + if (has_no_stats_stage(cpi)) { + for (i = 0; i < MAX_NUM_GF_INTERVALS; i++) { + rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length); + } + rc->cur_gf_index = 0; + rc->intervals_till_gf_calculate_due = MAX_NUM_GF_INTERVALS; + return; + } + + // TODO(urvang): Try logic to vary min and max interval based on q. + const int active_min_gf_interval = rc->min_gf_interval; + const int active_max_gf_interval = + AOMMIN(rc->max_gf_interval, max_gop_length); + + i = 0; + max_intervals = cpi->lap_enabled ? 1 : max_intervals; + int cut_pos[MAX_NUM_GF_INTERVALS + 1] = { 0 }; + int count_cuts = 1; + int cur_start = 0, cur_last; + int cut_here; + int prev_lows = 0; + GF_GROUP_STATS gf_stats; + init_gf_stats(&gf_stats); + while (count_cuts < max_intervals + 1) { + ++i; + + // reaches next key frame, break here + if (i >= rc->frames_to_key) { + cut_pos[count_cuts] = i - 1; + count_cuts++; + break; + } + + // reached maximum len, but nothing special yet (almost static) + // let's look at the next interval + if (i - cur_start >= rc->static_scene_max_gf_interval) { + cut_here = 1; + } else { + // reaches last frame, break + if (EOF == input_stats(twopass, &next_frame)) { + cut_pos[count_cuts] = i - 1; + count_cuts++; + break; + } + // Test for the case where there is a brief flash but the prediction + // quality back to an earlier frame is then restored. 
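+      // (detect_flash() keys off the next frame recovering via its second
+      // reference: pcnt_second_ref both above pcnt_inter and at least 0.5
+      // in the frame after the flash.)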
+      flash_detected = detect_flash(twopass, 0);
+      // TODO(bohanli): remove redundant accumulations here, or unify
+      // this and the ones in define_gf_group
+      accumulate_next_frame_stats(&next_frame, frame_info, twopass,
+                                  flash_detected, rc->frames_since_key, i, 0,
+                                  rc->min_gf_interval, &gf_stats);
+
+      cut_here = detect_gf_cut(cpi, i, cur_start, flash_detected,
+                               active_max_gf_interval, active_min_gf_interval,
+                               &gf_stats);
+    }
+    if (cut_here) {
+      cur_last = i - 1;  // the current last frame in the gf group
+      // only try shrinking if the interval is smaller than
+      // active_max_gf_interval
+      if (cur_last - cur_start <= active_max_gf_interval) {
+        // determine in the current decided gop the higher and lower errs
+        int n;
+        double ratio;
+
+        // load neighboring coded errs
+        int is_high[MAX_GF_INTERVAL + 1 + MAX_PAD_GF_CHECK * 2] = { 0 };
+        double errs[MAX_GF_INTERVAL + 1 + MAX_PAD_GF_CHECK * 2] = { 0 };
+        double si[MAX_GF_INTERVAL + 1 + MAX_PAD_GF_CHECK * 2] = { 0 };
+        int before_pad =
+            AOMMIN(MAX_PAD_GF_CHECK, rc->frames_since_key - 1 + cur_start);
+        int after_pad =
+            AOMMIN(MAX_PAD_GF_CHECK, rc->frames_to_key - cur_last - 1);
+        for (n = cur_start - before_pad; n <= cur_last + after_pad; n++) {
+          if (start_pos + n - 1 > twopass->stats_buf_ctx->stats_in_end) {
+            after_pad = n - cur_last - 1;
+            assert(after_pad >= 0);
+            break;
+          } else if (start_pos + n - 1 <
+                     twopass->stats_buf_ctx->stats_in_start) {
+            before_pad = cur_start - n - 1;
+            continue;
+          }
+          errs[n + before_pad - cur_start] = (start_pos + n - 1)->coded_error;
+        }
+        const int len = before_pad + after_pad + cur_last - cur_start + 1;
+        const int reset = determine_high_err_gf(
+            errs, is_high, si, len, &ratio, cur_start, cur_last, before_pad);
+
+        // if the current frame may have high error, try shrinking
+        if (is_high[cur_last - cur_start + before_pad] == 1 ||
+            (!reset && si[cur_last - cur_start + before_pad] < SI_LOW)) {
+          // try not to cut in high err area
+          set_last_prev_low_err(&cur_start, &cur_last, cut_pos, count_cuts,
+                                before_pad, ratio, is_high, si, prev_lows);
+        }  // if current frame high error
+        // count how many trailing lower error frames we have in this decided
+        // gf group
+        prev_lows = 0;
+        for (n = cur_last - 1; n > cur_start + MIN_SHRINK_LEN; n--) {
+          if (is_high[n - cur_start + before_pad] == 0 &&
+              (si[n - cur_start + before_pad] > SI_HIGH || reset)) {
+            prev_lows++;
+          } else {
+            break;
+          }
+        }
+      }
+      cut_pos[count_cuts] = cur_last;
+      count_cuts++;
+
+      // reset pointers to the shrunken location
+      twopass->stats_in = start_pos + cur_last;
+      cur_start = cur_last;
+      i = cur_last;
+
+      // reset accumulators
+      init_gf_stats(&gf_stats);
+    }
+  }
+
+  // save intervals
+  rc->intervals_till_gf_calculate_due = count_cuts - 1;
+  for (int n = 1; n < count_cuts; n++) {
+    rc->gf_intervals[n - 1] = cut_pos[n] + 1 - cut_pos[n - 1];
+  }
+  rc->cur_gf_index = 0;
+  twopass->stats_in = start_pos;
+
+#if GF_SHRINK_OUTPUT
+  printf("\nf_to_key: %d, count_cut: %d.
", rc->frames_to_key, count_cuts); + for (int n = 0; n < count_cuts; n++) { + printf("%d ", cut_pos[n]); + } + printf("\n"); + + for (int n = 0; n < rc->intervals_till_gf_calculate_due; n++) { + printf("%d ", rc->gf_intervals[n]); + } + printf("\n\n"); +#endif +} + +static void correct_frames_to_key(AV1_COMP *cpi) { + int lookahead_size = + (int)av1_lookahead_depth(cpi->lookahead, cpi->compressor_stage) + 1; + if (lookahead_size < + av1_lookahead_pop_sz(cpi->lookahead, cpi->compressor_stage)) { + cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, lookahead_size); + } +} + +static void define_gf_group_pass0(AV1_COMP *cpi, + const EncodeFrameParams *const frame_params) { + RATE_CONTROL *const rc = &cpi->rc; + GF_GROUP *const gf_group = &cpi->gf_group; + int target; + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + av1_cyclic_refresh_set_golden_update(cpi); + } else { + rc->baseline_gf_interval = rc->gf_intervals[rc->cur_gf_index]; + rc->intervals_till_gf_calculate_due--; + rc->cur_gf_index++; + } + + // correct frames_to_key when lookahead queue is flushing + correct_frames_to_key(cpi); + + if (rc->baseline_gf_interval > rc->frames_to_key) + rc->baseline_gf_interval = rc->frames_to_key; + + rc->gfu_boost = DEFAULT_GF_BOOST; + rc->constrained_gf_group = + (rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0; + + gf_group->max_layer_depth_allowed = cpi->oxcf.gf_max_pyr_height; + + // Rare case when the look-ahead is less than the target GOP length, can't + // generate ARF frame. + if (rc->baseline_gf_interval > cpi->oxcf.lag_in_frames || + !is_altref_enabled(cpi) || rc->baseline_gf_interval < rc->min_gf_interval) + gf_group->max_layer_depth_allowed = 0; + + // Set up the structure of this Group-Of-Pictures (same as GF_GROUP) + av1_gop_setup_structure(cpi, frame_params); + + // Allocate bits to each of the frames in the GF group. + // TODO(sarahparker) Extend this to work with pyramid structure. + for (int cur_index = 0; cur_index < gf_group->size; ++cur_index) { + const FRAME_UPDATE_TYPE cur_update_type = gf_group->update_type[cur_index]; + if (cpi->oxcf.rc_mode == AOM_CBR) { + if (cur_update_type == KEY_FRAME) { + target = av1_calc_iframe_target_size_one_pass_cbr(cpi); + } else { + target = av1_calc_pframe_target_size_one_pass_cbr(cpi, cur_update_type); + } + } else { + if (cur_update_type == KEY_FRAME) { + target = av1_calc_iframe_target_size_one_pass_vbr(cpi); + } else { + target = av1_calc_pframe_target_size_one_pass_vbr(cpi, cur_update_type); + } + } + gf_group->bit_allocation[cur_index] = target; + } +} + +static INLINE void set_baseline_gf_interval(AV1_COMP *cpi, int arf_position, + int active_max_gf_interval, + int use_alt_ref, + int is_final_pass) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + // Set the interval until the next gf. + // If forward keyframes are enabled, ensure the final gf group obeys the + // MIN_FWD_KF_INTERVAL. 
+ if (cpi->oxcf.fwd_kf_enabled && use_alt_ref && + ((twopass->stats_in - arf_position + rc->frames_to_key) < + twopass->stats_buf_ctx->stats_in_end) && + cpi->rc.next_is_fwd_key) { + if (arf_position == rc->frames_to_key) { + rc->baseline_gf_interval = arf_position; + // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL + } else if ((rc->frames_to_key - arf_position < + AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) && + (rc->frames_to_key != arf_position)) { + // if possible, merge the last two gf groups + if (rc->frames_to_key <= active_max_gf_interval) { + rc->baseline_gf_interval = rc->frames_to_key; + if (is_final_pass) rc->intervals_till_gf_calculate_due = 0; + // if merging the last two gf groups creates a group that is too long, + // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL + } else { + rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL; + if (is_final_pass) rc->intervals_till_gf_calculate_due = 0; + } + } else { + rc->baseline_gf_interval = arf_position - rc->source_alt_ref_pending; + } + } else { + rc->baseline_gf_interval = arf_position - rc->source_alt_ref_pending; + } +} + +// initialize GF_GROUP_STATS +static void init_gf_stats(GF_GROUP_STATS *gf_stats) { + gf_stats->gf_group_err = 0.0; + gf_stats->gf_group_raw_error = 0.0; + gf_stats->gf_group_skip_pct = 0.0; + gf_stats->gf_group_inactive_zone_rows = 0.0; + + gf_stats->mv_ratio_accumulator = 0.0; + gf_stats->decay_accumulator = 1.0; + gf_stats->zero_motion_accumulator = 1.0; + gf_stats->loop_decay_rate = 1.0; + gf_stats->last_loop_decay_rate = 1.0; + gf_stats->this_frame_mv_in_out = 0.0; + gf_stats->mv_in_out_accumulator = 0.0; + gf_stats->abs_mv_in_out_accumulator = 0.0; + + gf_stats->avg_sr_coded_error = 0.0; + gf_stats->avg_tr_coded_error = 0.0; + gf_stats->avg_pcnt_second_ref = 0.0; + gf_stats->avg_pcnt_third_ref = 0.0; + gf_stats->avg_pcnt_third_ref_nolast = 0.0; + gf_stats->avg_new_mv_count = 0.0; + gf_stats->avg_wavelet_energy = 0.0; + gf_stats->avg_raw_err_stdev = 0.0; + gf_stats->non_zero_stdev_count = 0; + + gf_stats->allow_alt_ref = 0; +} + +// Analyse and define a gf/arf group. +#define MAX_GF_BOOST 5400 +static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, + const EncodeFrameParams *const frame_params, + int max_gop_length, int is_final_pass) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + AV1EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + FIRSTPASS_STATS next_frame; + const FIRSTPASS_STATS *const start_pos = twopass->stats_in; + GF_GROUP *gf_group = &cpi->gf_group; + FRAME_INFO *frame_info = &cpi->frame_info; + int i; + + int flash_detected; + int64_t gf_group_bits; + const int is_intra_only = frame_params->frame_type == KEY_FRAME || + frame_params->frame_type == INTRA_ONLY_FRAME; + const int arf_active_or_kf = is_intra_only || rc->source_alt_ref_active; + + cpi->internal_altref_allowed = (oxcf->gf_max_pyr_height > 1); + + // Reset the GF group data structures unless this is a key + // frame in which case it will already have been done. 
+ if (!is_intra_only) { + av1_zero(cpi->gf_group); + } + + aom_clear_system_state(); + av1_zero(next_frame); + + if (has_no_stats_stage(cpi)) { + define_gf_group_pass0(cpi, frame_params); + return; + } + + // correct frames_to_key when lookahead queue is emptying + if (cpi->lap_enabled) { + correct_frames_to_key(cpi); + } + + GF_GROUP_STATS gf_stats; + init_gf_stats(&gf_stats); + GF_FRAME_STATS first_frame_stats, last_frame_stats; + + gf_stats.allow_alt_ref = is_altref_enabled(cpi); + const int can_disable_arf = (oxcf->gf_min_pyr_height == MIN_PYRAMID_LVL); + + // Load stats for the current frame. + double mod_frame_err = + calculate_modified_err(frame_info, twopass, oxcf, this_frame); + + // Note the error of the frame at the start of the group. This will be + // the GF frame error if we code a normal gf. + first_frame_stats.frame_err = mod_frame_err; + first_frame_stats.frame_coded_error = this_frame->coded_error; + first_frame_stats.frame_sr_coded_error = this_frame->sr_coded_error; + first_frame_stats.frame_tr_coded_error = this_frame->tr_coded_error; + + // If this is a key frame or the overlay from a previous arf then + // the error score / cost of this frame has already been accounted for. + if (arf_active_or_kf) { + gf_stats.gf_group_err -= first_frame_stats.frame_err; +#if GROUP_ADAPTIVE_MAXQ + gf_stats.gf_group_raw_error -= this_frame->coded_error; +#endif + gf_stats.gf_group_skip_pct -= this_frame->intra_skip_pct; + gf_stats.gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows; + } + + // TODO(urvang): Try logic to vary min and max interval based on q. + const int active_min_gf_interval = rc->min_gf_interval; + const int active_max_gf_interval = + AOMMIN(rc->max_gf_interval, max_gop_length); + + i = 0; + // get the determined gf group length from rc->gf_intervals + while (i < rc->gf_intervals[rc->cur_gf_index]) { + ++i; + // Accumulate error score of frames in this gf group. + mod_frame_err = + calculate_modified_err(frame_info, twopass, oxcf, this_frame); + // accumulate stats for this frame + accumulate_this_frame_stats(this_frame, mod_frame_err, &gf_stats); + + // read in the next frame + if (EOF == input_stats(twopass, &next_frame)) break; + + // Test for the case where there is a brief flash but the prediction + // quality back to an earlier frame is then restored. + flash_detected = detect_flash(twopass, 0); + + // accumulate stats for next frame + accumulate_next_frame_stats( + &next_frame, frame_info, twopass, flash_detected, rc->frames_since_key, + i, can_disable_arf, rc->min_gf_interval, &gf_stats); + + *this_frame = next_frame; + } + // save the errs for the last frame + last_frame_stats.frame_coded_error = next_frame.coded_error; + last_frame_stats.frame_sr_coded_error = next_frame.sr_coded_error; + last_frame_stats.frame_tr_coded_error = next_frame.tr_coded_error; + + if (is_final_pass) { + rc->intervals_till_gf_calculate_due--; + rc->cur_gf_index++; + } + + // Was the group length constrained by the requirement for a new KF? + rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; + + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) + ? cpi->initial_mbs + : cm->mi_params.MBs; + assert(num_mbs > 0); + + average_gf_stats(i, &next_frame, &gf_stats); + + // Disable internal ARFs for "still" gf groups. + // zero_motion_accumulator: minimum percentage of (0,0) motion; + // avg_sr_coded_error: average of the SSE per pixel of each frame; + // avg_raw_err_stdev: average of the standard deviation of (0,0) + // motion error per block of each frame. 
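+  // All three conditions below must hold to disable internal ARFs: near-total
+  // zero motion, low average per-mb second-ref error, and low variance of the
+  // zero-motion error, i.e. the group is effectively a still section.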
+  const int can_disable_internal_arfs =
+      (oxcf->gf_min_pyr_height <= MIN_PYRAMID_LVL + 1);
+  if (can_disable_internal_arfs &&
+      gf_stats.zero_motion_accumulator > MIN_ZERO_MOTION &&
+      gf_stats.avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
+      gf_stats.avg_raw_err_stdev < MAX_RAW_ERR_VAR) {
+    cpi->internal_altref_allowed = 0;
+  }
+
+  int use_alt_ref;
+  if (can_disable_arf) {
+    use_alt_ref = !is_almost_static(gf_stats.zero_motion_accumulator,
+                                    twopass->kf_zeromotion_pct) &&
+                  gf_stats.allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
+                  (i >= MIN_GF_INTERVAL) &&
+                  (cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL);
+
+    // TODO(urvang): Improve and use model for VBR, CQ etc as well.
+    if (use_alt_ref && cpi->oxcf.rc_mode == AOM_Q &&
+        cpi->oxcf.cq_level <= 200) {
+      aom_clear_system_state();
+      float features[21];
+      get_features_from_gf_stats(
+          &gf_stats, &first_frame_stats, &last_frame_stats, num_mbs,
+          rc->constrained_gf_group, twopass->kf_zeromotion_pct, i, features);
+      // Infer using ML model.
+      float score;
+      av1_nn_predict(features, &av1_use_flat_gop_nn_config, 1, &score);
+      use_alt_ref = (score <= 0.0);
+    }
+  } else {
+    assert(cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL);
+    use_alt_ref =
+        gf_stats.allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && (i > 2);
+  }
+
+#define REDUCE_GF_LENGTH_THRESH 4
+#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
+#define REDUCE_GF_LENGTH_BY 1
+  int alt_offset = 0;
+  // The length reduction strategy is tweaked for certain cases, and doesn't
+  // work well for certain other cases.
+  const int allow_gf_length_reduction =
+      ((cpi->oxcf.rc_mode == AOM_Q && cpi->oxcf.cq_level <= 128) ||
+       !cpi->internal_altref_allowed) &&
+      !is_lossless_requested(&cpi->oxcf);
+
+  if (allow_gf_length_reduction && use_alt_ref) {
+    // Adjust the length of this gf group if one of the following conditions
+    // is met:
+    // 1: only one overlay frame left and this gf is too long
+    // 2: next gf group is too short to have an arf compared to the current gf
+
+    // maximum length of next gf group
+    const int next_gf_len = rc->frames_to_key - i;
+    const int single_overlay_left =
+        next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH;
+    // the next gf is probably going to have an ARF but it will be shorter
+    // than this gf
+    const int unbalanced_gf =
+        i > REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+        next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+        next_gf_len + 1 >= rc->min_gf_interval;
+
+    if (single_overlay_left || unbalanced_gf) {
+      const int roll_back = REDUCE_GF_LENGTH_BY;
+      // Reduce length only if active_min_gf_interval will be respected later.
+      if (i - roll_back >= active_min_gf_interval + 1) {
+        alt_offset = -roll_back;
+        i -= roll_back;
+        if (is_final_pass) rc->intervals_till_gf_calculate_due = 0;
+      }
+    }
+  }
+
+  // Should we use the alternate reference frame?
+  if (use_alt_ref) {
+    rc->source_alt_ref_pending = 1;
+    gf_group->max_layer_depth_allowed = cpi->oxcf.gf_max_pyr_height;
+    set_baseline_gf_interval(cpi, i, active_max_gf_interval, use_alt_ref,
+                             is_final_pass);
+
+    const int forward_frames = (rc->frames_to_key - i >= i - 1)
+                                   ? i - 1
+                                   : AOMMAX(0, rc->frames_to_key - i);
+
+    // Calculate the boost for alt ref.
+    rc->gfu_boost = av1_calc_arf_boost(
+        twopass, rc, frame_info, alt_offset, forward_frames, (i - 1),
+        cpi->lap_enabled ? &rc->num_stats_used_for_gfu_boost : NULL,
+        cpi->lap_enabled ?
&rc->num_stats_required_for_gfu_boost : NULL);
+  } else {
+    reset_fpf_position(twopass, start_pos);
+    rc->source_alt_ref_pending = 0;
+    gf_group->max_layer_depth_allowed = 0;
+    set_baseline_gf_interval(cpi, i, active_max_gf_interval, use_alt_ref,
+                             is_final_pass);
+
+    rc->gfu_boost = AOMMIN(
+        MAX_GF_BOOST,
+        av1_calc_arf_boost(
+            twopass, rc, frame_info, alt_offset, (i - 1), 0,
+            cpi->lap_enabled ? &rc->num_stats_used_for_gfu_boost : NULL,
+            cpi->lap_enabled ? &rc->num_stats_required_for_gfu_boost : NULL));
+  }
+
+  // rc->gf_intervals assumes the usage of alt_ref, and therefore adds one
+  // overlay frame to the next gf. If no alt_ref is used, 1 frame should be
+  // subtracted from the next gf group.
+  // TODO(bohanli): should incorporate the usage of alt_ref into
+  // calculate_gf_length
+  if (is_final_pass && rc->source_alt_ref_pending == 0 &&
+      rc->intervals_till_gf_calculate_due > 0) {
+    rc->gf_intervals[rc->cur_gf_index]--;
+  }
+
+#define LAST_ALR_BOOST_FACTOR 0.2f
+  rc->arf_boost_factor = 1.0;
+  if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) {
+    // Reduce the boost of altref in the last gf group
+    if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY ||
+        rc->frames_to_key - i == 0) {
+      rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
+    }
+  }
+
+  rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+  // Reset the file position.
+  reset_fpf_position(twopass, start_pos);
+
+  // Calculate the bits to be allocated to the gf/arf group as a whole
+  gf_group_bits = calculate_total_gf_group_bits(cpi, gf_stats.gf_group_err);
+  rc->gf_group_bits = gf_group_bits;
+
+#if GROUP_ADAPTIVE_MAXQ
+  // Calculate an estimate of the maxq needed for the group.
+  // We are more aggressive about correcting for sections
+  // where there could be significant overshoot than for easier
+  // sections where we do not wish to risk creating an overshoot
+  // of the allocated bit budget.
+  if ((cpi->oxcf.rc_mode != AOM_Q) && (rc->baseline_gf_interval > 1)) {
+    const int vbr_group_bits_per_frame =
+        (int)(gf_group_bits / rc->baseline_gf_interval);
+    const double group_av_err =
+        gf_stats.gf_group_raw_error / rc->baseline_gf_interval;
+    const double group_av_skip_pct =
+        gf_stats.gf_group_skip_pct / rc->baseline_gf_interval;
+    const double group_av_inactive_zone =
+        ((gf_stats.gf_group_inactive_zone_rows * 2) /
+         (rc->baseline_gf_interval * (double)cm->mi_params.mb_rows));
+
+    int tmp_q;
+    // rc factor is a weight factor that corrects for local rate control drift.
+    double rc_factor = 1.0;
+    int64_t bits = cpi->oxcf.target_bandwidth;
+
+    if (bits > 0) {
+      int rate_error;
+
+      rate_error = (int)((rc->vbr_bits_off_target * 100) / bits);
+      rate_error = clamp(rate_error, -100, 100);
+      if (rate_error > 0) {
+        rc_factor = AOMMAX(RC_FACTOR_MIN, (double)(100 - rate_error) / 100.0);
+      } else {
+        rc_factor = AOMMIN(RC_FACTOR_MAX, (double)(100 - rate_error) / 100.0);
+      }
+    }
+
+    tmp_q = get_twopass_worst_quality(
+        cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
+        vbr_group_bits_per_frame, rc_factor);
+    rc->active_worst_quality = AOMMAX(tmp_q, rc->active_worst_quality >> 1);
+  }
+#endif
+
+  // Adjust KF group bits and error remaining.
+  if (is_final_pass)
+    twopass->kf_group_error_left -= (int64_t)gf_stats.gf_group_err;
+
+  // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+  av1_gop_setup_structure(cpi, frame_params);
+
+  // Reset the file position.
+  reset_fpf_position(twopass, start_pos);
+
+  // Calculate a section intra ratio used in setting max loop filter.
+  if (frame_params->frame_type != KEY_FRAME) {
+    twopass->section_intra_rating = calculate_section_intra_ratio(
+        start_pos, twopass->stats_buf_ctx->stats_in_end,
+        rc->baseline_gf_interval);
+  }
+
+  // Reset rolling actual and target bits counters for ARF groups.
+  twopass->rolling_arf_group_target_bits = 1;
+  twopass->rolling_arf_group_actual_bits = 1;
+
+  av1_gop_bit_allocation(cpi, rc, gf_group,
+                         frame_params->frame_type == KEY_FRAME, use_alt_ref,
+                         gf_group_bits);
+}
+
+// #define FIXED_ARF_BITS
+#ifdef FIXED_ARF_BITS
+#define ARF_BITS_FRACTION 0.75
+#endif
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+                            GF_GROUP *gf_group, int is_key_frame, int use_arf,
+                            int64_t gf_group_bits) {
+  // Calculate the extra bits to be used for boosted frame(s)
+#ifdef FIXED_ARF_BITS
+  int gf_arf_bits = (int)(ARF_BITS_FRACTION * gf_group_bits);
+#else
+  int gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
+                                         rc->gfu_boost, gf_group_bits);
+#endif
+
+  gf_arf_bits = adjust_boost_bits_for_target_level(cpi, rc, gf_arf_bits,
+                                                   gf_group_bits, 1);
+
+  // Allocate bits to each of the frames in the GF group.
+  allocate_gf_group_bits(gf_group, rc, gf_group_bits, gf_arf_bits, is_key_frame,
+                         use_arf);
+}
+
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks (discounting neutral blocks in this
+// way helps catch scene cuts in clips with very flat areas or letter box
+// format clips with image padding).
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+// In such a case, even if the frame is not a scene cut, coding a key frame
+// may be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 2.5
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+// For real scene cuts we expect an improvement in the intra inter error
+// ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
+#define KF_II_MAX 128.0
+
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+// We adapt the threshold based on the number of frames in this key-frame
+// group so far.
+static double get_second_ref_usage_thresh(int frame_count_so_far) {
+  const int adapt_upto = 32;
+  const double min_second_ref_usage_thresh = 0.085;
+  const double second_ref_usage_thresh_max_delta = 0.035;
+  if (frame_count_so_far >= adapt_upto) {
+    return min_second_ref_usage_thresh + second_ref_usage_thresh_max_delta;
+  }
+  return min_second_ref_usage_thresh +
+         ((double)frame_count_so_far / (adapt_upto - 1)) *
+             second_ref_usage_thresh_max_delta;
+}
+
+static int test_candidate_kf(TWO_PASS *twopass,
+                             const FIRSTPASS_STATS *last_frame,
+                             const FIRSTPASS_STATS *this_frame,
+                             const FIRSTPASS_STATS *next_frame,
+                             int frame_count_so_far, enum aom_rc_mode rc_mode) {
+  int is_viable_kf = 0;
+  double pcnt_intra = 1.0 - this_frame->pcnt_inter;
+  double modified_pcnt_inter =
+      this_frame->pcnt_inter - this_frame->pcnt_neutral;
+  const double second_ref_usage_thresh =
+      get_second_ref_usage_thresh(frame_count_so_far);
+
+  // Does the frame satisfy the primary criteria of a key frame?
+  // See above for an explanation of the test criteria.
+  // If so, then examine how well it predicts subsequent frames.
+  if (IMPLIES(rc_mode == AOM_Q, frame_count_so_far >= 3) &&
+      (this_frame->pcnt_second_ref < second_ref_usage_thresh) &&
+      (next_frame->pcnt_second_ref < second_ref_usage_thresh) &&
+      ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+       ((pcnt_intra > MIN_INTRA_LEVEL) &&
+        (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
+        ((this_frame->intra_error /
+          DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
+         KF_II_ERR_THRESHOLD) &&
+        ((fabs(last_frame->coded_error - this_frame->coded_error) /
+              DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+          ERR_CHANGE_THRESHOLD) ||
+         (fabs(last_frame->intra_error - this_frame->intra_error) /
+              DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+          ERR_CHANGE_THRESHOLD) ||
+         ((next_frame->intra_error /
+           DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
+          II_IMPROVEMENT_THRESHOLD))))) {
+    int i;
+    const FIRSTPASS_STATS *start_pos = twopass->stats_in;
+    FIRSTPASS_STATS local_next_frame = *next_frame;
+    double boost_score = 0.0;
+    double old_boost_score = 0.0;
+    double decay_accumulator = 1.0;
+
+    // Examine how well the key frame predicts subsequent frames.
+    for (i = 0; i < SCENE_CUT_KEY_TEST_INTERVAL; ++i) {
+      double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
+                             DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+
+      if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX;
+
+      // Cumulative effect of decay in prediction quality.
+      if (local_next_frame.pcnt_inter > 0.85)
+        decay_accumulator *= local_next_frame.pcnt_inter;
+      else
+        decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
+
+      // Keep a running total.
+      boost_score += (decay_accumulator * next_iiratio);
+
+      // Test various breakout clauses.
+      if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
+          (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) <
+            0.20) &&
+           (next_iiratio < 3.0)) ||
+          ((boost_score - old_boost_score) < 3.0) ||
+          (local_next_frame.intra_error < 200)) {
+        break;
+      }
+
+      old_boost_score = boost_score;
+
+      // Get the next frame details
+      if (EOF == input_stats(twopass, &local_next_frame)) break;
+    }
+
+    // If there is tolerable prediction for at least the next 3 frames, then
+    // break out; else discard this potential key frame and move on.
+    if (boost_score > 30.0 && (i > 3)) {
+      is_viable_kf = 1;
+    } else {
+      // Reset the file position
+      reset_fpf_position(twopass, start_pos);
+
+      is_viable_kf = 0;
+    }
+  }
+
+  return is_viable_kf;
+}
+
+#define FRAMES_TO_CHECK_DECAY 8
+#define KF_MIN_FRAME_BOOST 80.0
+#define KF_MAX_FRAME_BOOST 128.0
+#define MIN_KF_BOOST 600  // Minimum boost for non-static KF interval
+#define MAX_KF_BOOST 3200
+#define MIN_STATIC_KF_BOOST 5400  // Minimum boost for static KF interval
+
+static int detect_app_forced_key(AV1_COMP *cpi) {
+  if (cpi->oxcf.fwd_kf_enabled) cpi->rc.next_is_fwd_key = 1;
+  int num_frames_to_app_forced_key = is_forced_keyframe_pending(
+      cpi->lookahead, cpi->lookahead->max_sz, cpi->compressor_stage);
+  if (num_frames_to_app_forced_key != -1) cpi->rc.next_is_fwd_key = 0;
+  return num_frames_to_app_forced_key;
+}
+
+static int get_projected_kf_boost(AV1_COMP *cpi) {
+  /*
+   * If num_stats_used_for_kf_boost >= frames_to_key, then
+   * all stats needed for prior boost calculation are available.
+   * Hence projecting the prior boost is not needed in this case.
+ */ + if (cpi->rc.num_stats_used_for_kf_boost >= cpi->rc.frames_to_key) + return cpi->rc.kf_boost; + + // Get the current tpl factor (number of frames = frames_to_key). + double tpl_factor = av1_get_kf_boost_projection_factor(cpi->rc.frames_to_key); + // Get the tpl factor when number of frames = num_stats_used_for_kf_boost. + double tpl_factor_num_stats = + av1_get_kf_boost_projection_factor(cpi->rc.num_stats_used_for_kf_boost); + int projected_kf_boost = + (int)rint((tpl_factor * cpi->rc.kf_boost) / tpl_factor_num_stats); + return projected_kf_boost; +} + +static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, + double *kf_group_err, + int num_frames_to_detect_scenecut) { + TWO_PASS *const twopass = &cpi->twopass; + RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; + FIRSTPASS_STATS last_frame; + double decay_accumulator = 1.0; + int i = 0, j; + int frames_to_key = 1; + int frames_since_key = rc->frames_since_key + 1; + FRAME_INFO *const frame_info = &cpi->frame_info; + int num_stats_used_for_kf_boost = 1; + int scenecut_detected = 0; + + int num_frames_to_next_key = detect_app_forced_key(cpi); + + if (num_frames_to_detect_scenecut == 0) { + if (num_frames_to_next_key != -1) + return num_frames_to_next_key; + else + return rc->frames_to_key; + } + + if (num_frames_to_next_key != -1) + num_frames_to_detect_scenecut = + AOMMIN(num_frames_to_detect_scenecut, num_frames_to_next_key); + + // Initialize the decay rates for the recent frames to check + for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0; + + i = 0; + while (twopass->stats_in < twopass->stats_buf_ctx->stats_in_end && + frames_to_key < num_frames_to_detect_scenecut) { + // Accumulate total number of stats available till next key frame + num_stats_used_for_kf_boost++; + + // Accumulate kf group error. + if (kf_group_err != NULL) + *kf_group_err += + calculate_modified_err(frame_info, twopass, oxcf, this_frame); + + // Load the next frame's stats. + last_frame = *this_frame; + input_stats(twopass, this_frame); + + // Provided that we are not at the end of the file... + if (cpi->rc.enable_scenecut_detection && cpi->oxcf.auto_key && + twopass->stats_in < twopass->stats_buf_ctx->stats_in_end) { + double loop_decay_rate; + + // Check for a scene cut. + if (test_candidate_kf(twopass, &last_frame, this_frame, twopass->stats_in, + frames_since_key, oxcf->rc_mode)) { + scenecut_detected = 1; + break; + } + + // How fast is the prediction quality decaying? + loop_decay_rate = + get_prediction_decay_rate(frame_info, twopass->stats_in); + + // We want to know something about the recent past... rather than + // as used elsewhere where we are concerned with decay in prediction + // quality since the last GF or KF. + recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate; + decay_accumulator = 1.0; + for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) + decay_accumulator *= recent_loop_decay[j]; + + // Special check for transition or high motion followed by a + // static scene. + if (detect_transition_to_still(twopass, rc->min_gf_interval, i, + cpi->oxcf.key_freq - i, loop_decay_rate, + decay_accumulator)) { + scenecut_detected = 1; + break; + } + + // Step on to the next frame. + ++frames_to_key; + ++frames_since_key; + + // If we don't have a real key frame within the next two + // key_freq intervals then break out of the loop. 
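+      // (The 2x cap below also bounds the rescan in find_next_key_frame(),
+      // which notes that it centers the extra key frame when the natural
+      // interval falls between 1x and 2x key_freq.)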
+      if (frames_to_key >= 2 * cpi->oxcf.key_freq) break;
+    } else {
+      ++frames_to_key;
+      ++frames_since_key;
+    }
+    ++i;
+  }
+
+  if (kf_group_err != NULL)
+    rc->num_stats_used_for_kf_boost = num_stats_used_for_kf_boost;
+
+  if (cpi->lap_enabled && !scenecut_detected)
+    frames_to_key = num_frames_to_next_key;
+
+  return frames_to_key;
+}
+
+static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &cpi->gf_group;
+  FRAME_INFO *const frame_info = &cpi->frame_info;
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const FIRSTPASS_STATS first_frame = *this_frame;
+  FIRSTPASS_STATS next_frame;
+  av1_zero(next_frame);
+
+  rc->frames_since_key = 0;
+
+  // Reset the GF group data structures.
+  av1_zero(*gf_group);
+
+  // Clear the alt ref active flag and last group multi arf flags as they
+  // can never be set for a key frame.
+  rc->source_alt_ref_active = 0;
+
+  // KF is always a GF so clear frames till next gf counter.
+  rc->frames_till_gf_update_due = 0;
+
+  rc->frames_to_key = 1;
+
+  if (has_no_stats_stage(cpi)) {
+    int num_frames_to_app_forced_key = detect_app_forced_key(cpi);
+    rc->this_key_frame_forced =
+        current_frame->frame_number != 0 && rc->frames_to_key == 0;
+    if (num_frames_to_app_forced_key != -1)
+      rc->frames_to_key = num_frames_to_app_forced_key;
+    else
+      rc->frames_to_key = AOMMAX(1, cpi->oxcf.key_freq);
+    correct_frames_to_key(cpi);
+    rc->kf_boost = DEFAULT_KF_BOOST;
+    rc->source_alt_ref_active = 0;
+    gf_group->update_type[0] = KF_UPDATE;
+    return;
+  }
+  int i;
+  const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+  int kf_bits = 0;
+  double zero_motion_accumulator = 1.0;
+  double boost_score = 0.0;
+  double kf_raw_err = 0.0;
+  double kf_mod_err = 0.0;
+  double kf_group_err = 0.0;
+  double sr_accumulator = 0.0;
+  int frames_to_key;
+  // Is this a forced key frame by interval?
+  rc->this_key_frame_forced = rc->next_key_frame_forced;
+
+  twopass->kf_group_bits = 0;        // Total bits available to kf group
+  twopass->kf_group_error_left = 0;  // Group modified error score.
+
+  kf_raw_err = this_frame->intra_error;
+  kf_mod_err = calculate_modified_err(frame_info, twopass, oxcf, this_frame);
+
+  frames_to_key =
+      define_kf_interval(cpi, this_frame, &kf_group_err, oxcf->key_freq);
+
+  if (frames_to_key != -1)
+    rc->frames_to_key = AOMMIN(oxcf->key_freq, frames_to_key);
+  else
+    rc->frames_to_key = oxcf->key_freq;
+
+  if (cpi->lap_enabled) correct_frames_to_key(cpi);
+
+  // If there is a max kf interval set by the user we must obey it.
+  // We already break out of the loop above at 2x max.
+  // This code centers the extra kf if the actual natural interval
+  // is between 1x and 2x.
+  if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) {
+    FIRSTPASS_STATS tmp_frame = first_frame;
+
+    rc->frames_to_key /= 2;
+
+    // Reset to the start of the group.
+    reset_fpf_position(twopass, start_position);
+
+    kf_group_err = 0.0;
+
+    // Rescan to get the correct error data for the forced kf group.
+ for (i = 0; i < rc->frames_to_key; ++i) { + kf_group_err += + calculate_modified_err(frame_info, twopass, oxcf, &tmp_frame); + if (EOF == input_stats(twopass, &tmp_frame)) break; + } + rc->next_key_frame_forced = 1; + } else if ((twopass->stats_in == twopass->stats_buf_ctx->stats_in_end && + is_stat_consumption_stage_twopass(cpi)) || + rc->frames_to_key >= cpi->oxcf.key_freq) { + rc->next_key_frame_forced = 1; + } else { + rc->next_key_frame_forced = 0; + } + + // Special case for the last key frame of the file. + if (twopass->stats_in >= twopass->stats_buf_ctx->stats_in_end) { + // Accumulate kf group error. + kf_group_err += + calculate_modified_err(frame_info, twopass, oxcf, this_frame); + } + + // Calculate the number of bits that should be assigned to the kf group. + if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) { + // Maximum number of bits for a single normal frame (not key frame). + const int max_bits = frame_max_bits(rc, &cpi->oxcf); + + // Maximum number of bits allocated to the key frame group. + int64_t max_grp_bits; + + // Default allocation based on bits left and relative + // complexity of the section. + twopass->kf_group_bits = (int64_t)( + twopass->bits_left * (kf_group_err / twopass->modified_error_left)); + + // Clip based on maximum per frame rate defined by the user. + max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key; + if (twopass->kf_group_bits > max_grp_bits) + twopass->kf_group_bits = max_grp_bits; + } else { + twopass->kf_group_bits = 0; + } + twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits); + + // Reset the first pass file position. + reset_fpf_position(twopass, start_position); + + // Scan through the kf group collating various stats used to determine + // how many bits to spend on it. + boost_score = 0.0; + const double kf_max_boost = + cpi->oxcf.rc_mode == AOM_Q + ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST), + KF_MAX_FRAME_BOOST) + : KF_MAX_FRAME_BOOST; + for (i = 0; i < (rc->frames_to_key - 1); ++i) { + if (EOF == input_stats(twopass, &next_frame)) break; + + // Monitor for static sections. + // For the first frame in kf group, the second ref indicator is invalid. + if (i > 0) { + zero_motion_accumulator = + AOMMIN(zero_motion_accumulator, + get_zero_motion_factor(frame_info, &next_frame)); + } else { + zero_motion_accumulator = next_frame.pcnt_inter - next_frame.pcnt_motion; + } + + // Not all frames in the group are necessarily used in calculating boost. + if ((sr_accumulator < (kf_raw_err * 1.50)) && + (i <= rc->max_gf_interval * 2)) { + double frame_boost; + double zm_factor; + + // Factor 0.75-1.25 based on how much of frame is static. + zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); + + if (i < 2) sr_accumulator = 0.0; + frame_boost = calc_kf_frame_boost(rc, frame_info, &next_frame, + &sr_accumulator, kf_max_boost); + boost_score += frame_boost * zm_factor; + } + } + + reset_fpf_position(twopass, start_position); + + // Store the zero motion percentage + twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); + + // Calculate a section intra ratio used in setting max loop filter. + twopass->section_intra_rating = calculate_section_intra_ratio( + start_position, twopass->stats_buf_ctx->stats_in_end, rc->frames_to_key); + + rc->kf_boost = (int)boost_score; + + if (cpi->lap_enabled) { + rc->kf_boost = get_projected_kf_boost(cpi); + } + + // Special case for static / slide show content but don't apply + // if the kf group is very short. 
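+  // (zero_motion_accumulator holds the minimum of (pcnt_inter - pcnt_motion)
+  // seen across the group, so a value near 1.0 means nearly every block of
+  // every frame was coded as a zero-motion inter block.)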
+  if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) &&
+      (rc->frames_to_key > 8)) {
+    rc->kf_boost = AOMMAX(rc->kf_boost, MIN_STATIC_KF_BOOST);
+  } else {
+    // Apply various clamps for min and max boost.
+    rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3));
+    rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST);
+#ifdef STRICT_RC
+    rc->kf_boost = AOMMIN(rc->kf_boost, MAX_KF_BOOST);
+#endif
+  }
+
+  // Work out how many bits to allocate for the key frame itself.
+  kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
+                                 twopass->kf_group_bits);
+  // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost,
+  //        kf_bits, twopass->kf_zeromotion_pct);
+  kf_bits = adjust_boost_bits_for_target_level(cpi, rc, kf_bits,
+                                               twopass->kf_group_bits, 0);
+
+  twopass->kf_group_bits -= kf_bits;
+
+  // Save the bits to spend on the key frame.
+  gf_group->bit_allocation[0] = kf_bits;
+  gf_group->update_type[0] = KF_UPDATE;
+
+  // Note the total error score of the kf group minus the key frame itself.
+  twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+
+  // Adjust the count of total modified error left.
+  // The count of bits left is adjusted elsewhere based on real coded frame
+  // sizes.
+  twopass->modified_error_left -= kf_group_err;
+}
+
+static int is_skippable_frame(const AV1_COMP *cpi) {
+  if (has_no_stats_stage(cpi)) return 0;
+  // If no non-zero motion vector was detected for the current frame in the
+  // first pass, and none was detected for its previous and forward frames,
+  // then this frame can be skipped for the partition check, and the
+  // partition size is assigned according to the variance.
+  const TWO_PASS *const twopass = &cpi->twopass;
+
+  return (!frame_is_intra_only(&cpi->common) &&
+          twopass->stats_in - 2 > twopass->stats_buf_ctx->stats_in_start &&
+          twopass->stats_in < twopass->stats_buf_ctx->stats_in_end &&
+          (twopass->stats_in - 1)->pcnt_inter -
+                  (twopass->stats_in - 1)->pcnt_motion ==
+              1 &&
+          (twopass->stats_in - 2)->pcnt_inter -
+                  (twopass->stats_in - 2)->pcnt_motion ==
+              1 &&
+          twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
+}
+
+#define ARF_STATS_OUTPUT 0
+#if ARF_STATS_OUTPUT
+unsigned int arf_count = 0;
+#endif
+#define DEFAULT_GRP_WEIGHT 1.0
+
+static void process_first_pass_stats(AV1_COMP *cpi,
+                                     FIRSTPASS_STATS *this_frame) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+
+  if (cpi->oxcf.rc_mode != AOM_Q && current_frame->frame_number == 0 &&
+      cpi->twopass.stats_buf_ctx->total_stats &&
+      cpi->twopass.stats_buf_ctx->total_left_stats) {
+    if (cpi->lap_enabled) {
+      /*
+       * Accumulate total_stats using the available limited number of stats,
+       * and assign it to total_left_stats.
+       */
+      *cpi->twopass.stats_buf_ctx->total_left_stats =
+          *cpi->twopass.stats_buf_ctx->total_stats;
+    }
+    const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count -
+                                  current_frame->frame_number);
+
+    // Special case code for first frame.
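+    // The per-frame section averages computed below feed
+    // get_twopass_worst_quality() to seed an active worst quality for the
+    // clip; e.g. section_target_bandwidth is simply bits_left / frames_left.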
+    const int section_target_bandwidth =
+        (int)(twopass->bits_left / frames_left);
+    const double section_length =
+        twopass->stats_buf_ctx->total_left_stats->count;
+    const double section_error =
+        twopass->stats_buf_ctx->total_left_stats->coded_error / section_length;
+    const double section_intra_skip =
+        twopass->stats_buf_ctx->total_left_stats->intra_skip_pct /
+        section_length;
+    const double section_inactive_zone =
+        (twopass->stats_buf_ctx->total_left_stats->inactive_zone_rows * 2) /
+        ((double)cm->mi_params.mb_rows * section_length);
+    const int tmp_q = get_twopass_worst_quality(
+        cpi, section_error, section_intra_skip + section_inactive_zone,
+        section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+
+    rc->active_worst_quality = tmp_q;
+    rc->ni_av_qi = tmp_q;
+    rc->last_q[INTER_FRAME] = tmp_q;
+    rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth);
+    rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+    rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
+    rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
+  }
+
+  int err = 0;
+  if (cpi->lap_enabled) {
+    err = input_stats_lap(twopass, this_frame);
+  } else {
+    err = input_stats(twopass, this_frame);
+  }
+  if (err == EOF) return;
+
+  {
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                            ? cpi->initial_mbs
+                            : cm->mi_params.MBs;
+    // The multiplication by 256 reverses a scaling factor of (>> 8)
+    // applied when combining MB error values for the frame.
+    twopass->mb_av_energy = log((this_frame->intra_error / num_mbs) + 1.0);
+    twopass->frame_avg_haar_energy =
+        log((this_frame->frame_avg_wavelet_energy / num_mbs) + 1.0);
+  }
+
+  // Update the total stats remaining structure.
+  if (twopass->stats_buf_ctx->total_left_stats)
+    subtract_stats(twopass->stats_buf_ctx->total_left_stats, this_frame);
+
+  // Set the frame content type flag.
+  if (this_frame->intra_skip_pct >= FC_ANIMATION_THRESH)
+    twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
+  else
+    twopass->fr_content_type = FC_NORMAL;
+}
+
+static void setup_target_rate(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  GF_GROUP *const gf_group = &cpi->gf_group;
+
+  int target_rate = gf_group->bit_allocation[gf_group->index];
+
+  if (has_no_stats_stage(cpi)) {
+    av1_rc_set_frame_target(cpi, target_rate, cpi->common.width,
+                            cpi->common.height);
+  }
+
+  rc->base_frame_target = target_rate;
+}
+
+void av1_get_second_pass_params(AV1_COMP *cpi,
+                                EncodeFrameParams *const frame_params,
+                                const EncodeFrameInput *const frame_input,
+                                unsigned int frame_flags) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &cpi->gf_group;
+  AV1_COMMON *cm = &cpi->common;
+
+  if (frame_is_intra_only(cm)) {
+    FeatureFlags *const features = &cm->features;
+    av1_set_screen_content_options(cpi, features);
+    cpi->is_screen_content_type = features->allow_screen_content_tools;
+  }
+
+  if (is_stat_consumption_stage(cpi) && !twopass->stats_in) return;
+
+  if (rc->frames_till_gf_update_due > 0 && !(frame_flags & FRAMEFLAGS_KEY)) {
+    assert(gf_group->index < gf_group->size);
+    const int update_type = gf_group->update_type[gf_group->index];
+
+    setup_target_rate(cpi);
+
+    // If this is an arf frame then we don't want to read the stats file or
+    // advance the input pointer as we already have what we need.
+    if (update_type == ARF_UPDATE || update_type == INTNL_ARF_UPDATE) {
+      if (cpi->no_show_kf) {
+        assert(update_type == ARF_UPDATE);
+        frame_params->frame_type = KEY_FRAME;
+      } else {
+        frame_params->frame_type = INTER_FRAME;
+      }
+
+      // Do the firstpass stats indicate that this frame is skippable for the
+      // partition search?
+      if (cpi->sf.part_sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+        cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+      }
+
+      return;
+    }
+  }
+
+  aom_clear_system_state();
+
+  if (cpi->oxcf.rc_mode == AOM_Q) rc->active_worst_quality = cpi->oxcf.cq_level;
+  FIRSTPASS_STATS this_frame;
+  av1_zero(this_frame);
+  // Read in and process the first pass stats for this frame (see
+  // process_first_pass_stats() above).
+  if (is_stat_consumption_stage(cpi)) {
+    process_first_pass_stats(cpi, &this_frame);
+  } else {
+    rc->active_worst_quality = cpi->oxcf.cq_level;
+  }
+
+  // Keyframe and section processing.
+  if (rc->frames_to_key == 0 || (frame_flags & FRAMEFLAGS_KEY)) {
+    FIRSTPASS_STATS this_frame_copy;
+    this_frame_copy = this_frame;
+    frame_params->frame_type = KEY_FRAME;
+    // Define next KF group and assign bits to it.
+    find_next_key_frame(cpi, &this_frame);
+    this_frame = this_frame_copy;
+  } else {
+    frame_params->frame_type = INTER_FRAME;
+    const int altref_enabled = is_altref_enabled(cpi);
+    const int sframe_dist = cpi->oxcf.sframe_dist;
+    const int sframe_mode = cpi->oxcf.sframe_mode;
+    const int sframe_enabled = cpi->oxcf.sframe_enabled;
+    const int update_type = gf_group->update_type[gf_group->index];
+    CurrentFrame *const current_frame = &cpi->common.current_frame;
+    if (sframe_enabled) {
+      if (altref_enabled) {
+        if (sframe_mode == 1) {
+          // sframe_mode == 1: insert the sframe if it matches an altref frame.
+          if (current_frame->frame_number % sframe_dist == 0 &&
+              current_frame->frame_number != 0 && update_type == ARF_UPDATE) {
+            frame_params->frame_type = S_FRAME;
+          }
+        } else {
+          // sframe_mode != 1: the sframe will be inserted at the next
+          // available altref frame.
+          if (current_frame->frame_number % sframe_dist == 0 &&
+              current_frame->frame_number != 0) {
+            rc->sframe_due = 1;
+          }
+          if (rc->sframe_due && update_type == ARF_UPDATE) {
+            frame_params->frame_type = S_FRAME;
+            rc->sframe_due = 0;
+          }
+        }
+      } else {
+        if (current_frame->frame_number % sframe_dist == 0 &&
+            current_frame->frame_number != 0) {
+          frame_params->frame_type = S_FRAME;
+        }
+      }
+    }
+  }
+
+  // Define a new GF/ARF group. (Should always enter here for key frames.)
+  if (rc->frames_till_gf_update_due == 0) {
+    assert(cpi->common.current_frame.frame_number == 0 ||
+           gf_group->index == gf_group->size);
+    const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+    int num_frames_to_detect_scenecut, frames_to_key;
+    if (cpi->lap_enabled && cpi->rc.enable_scenecut_detection)
+      num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1;
+    else
+      num_frames_to_detect_scenecut = 0;
+    frames_to_key = define_kf_interval(cpi, &this_frame, NULL,
+                                       num_frames_to_detect_scenecut);
+    reset_fpf_position(twopass, start_position);
+    if (frames_to_key != -1)
+      rc->frames_to_key = AOMMIN(rc->frames_to_key, frames_to_key);
+
+    int max_gop_length = (cpi->oxcf.lag_in_frames >= 32 &&
+                          is_stat_consumption_stage_twopass(cpi))
+                             ? MAX_GF_INTERVAL
+                             : MAX_GF_LENGTH_LAP;
+    if (rc->intervals_till_gf_calculate_due == 0) {
+      calculate_gf_length(cpi, max_gop_length, MAX_NUM_GF_INTERVALS);
+    }
+
+    if (max_gop_length > 16) {
+      if (rc->gf_intervals[rc->cur_gf_index] - 1 > 16) {
+        // calculate_gf_length() was previously called with
+        // max_gop_length = 32 to get the look-ahead gf intervals.
+        define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 0);
+        if (!av1_tpl_setup_stats(cpi, 1, frame_params, frame_input)) {
+          // Tpl decides that a shorter gf interval is better.
+          // TODO(jingning): Remove redundant computations here.
+          max_gop_length = 16;
+          calculate_gf_length(cpi, max_gop_length, 1);
+        }
+      } else {
+        // Even based on 32 we still decide to use a short gf interval,
+        // so it is better to re-decide based on 16.
+        max_gop_length = 16;
+        calculate_gf_length(cpi, max_gop_length, 1);
+      }
+    }
+    define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 1);
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+    cpi->num_gf_group_show_frames = 0;
+    assert(gf_group->index == 0);
+
+#if ARF_STATS_OUTPUT
+    {
+      FILE *fpfile;
+      fpfile = fopen("arf.stt", "a");
+      ++arf_count;
+      fprintf(fpfile, "%10d %10d %10d %10d %10d\n",
+              cpi->common.current_frame.frame_number,
+              rc->frames_till_gf_update_due, rc->kf_boost, arf_count,
+              rc->gfu_boost);
+
+      fclose(fpfile);
+    }
+#endif
+  }
+  assert(gf_group->index < gf_group->size);
+
+  // Do the firstpass stats indicate that this frame is skippable for the
+  // partition search?
+  if (cpi->sf.part_sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+    cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+  }
+
+  setup_target_rate(cpi);
+}
+
+void av1_init_second_pass(AV1_COMP *cpi) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+  FRAME_INFO *const frame_info = &cpi->frame_info;
+  double frame_rate;
+  FIRSTPASS_STATS *stats;
+
+  if (!twopass->stats_buf_ctx->stats_in_end) return;
+
+  stats = twopass->stats_buf_ctx->total_stats;
+
+  *stats = *twopass->stats_buf_ctx->stats_in_end;
+  *twopass->stats_buf_ctx->total_left_stats = *stats;
+
+  frame_rate = 10000000.0 * stats->count / stats->duration;
+  // Each frame can have a different duration, as the frame rate in the source
+  // isn't guaranteed to be constant. The frame rate prior to the first frame
+  // encoded in the second pass is a guess. However, the sum duration is not.
+  // It is calculated based on the actual durations of all frames from the
+  // first pass.
+  av1_new_framerate(cpi, frame_rate);
+  twopass->bits_left =
+      (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
+
+  // This variable monitors how far behind the second ref update is lagging.
+  twopass->sr_update_lag = 1;
+
+  // Scan the first pass file and calculate a modified total error based upon
+  // the bias/power function used to allocate bits.
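+  // A sketch of the weighting (assuming calculate_modified_err(), defined
+  // earlier in this file, takes the usual libaom two-pass form):
+  //   modified_err ~= clamp(av_err * pow(err / av_err, vbrbias / 100.0),
+  //                         modified_error_min, modified_error_max)
+  // so the bias exponent shifts bits between easy and hard sections.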
+ { + const double avg_error = + stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count); + const FIRSTPASS_STATS *s = twopass->stats_in; + double modified_error_total = 0.0; + twopass->modified_error_min = + (avg_error * oxcf->two_pass_vbrmin_section) / 100; + twopass->modified_error_max = + (avg_error * oxcf->two_pass_vbrmax_section) / 100; + while (s < twopass->stats_buf_ctx->stats_in_end) { + modified_error_total += + calculate_modified_err(frame_info, twopass, oxcf, s); + ++s; + } + twopass->modified_error_left = modified_error_total; + } + + // Reset the vbr bits off target counters + cpi->rc.vbr_bits_off_target = 0; + cpi->rc.vbr_bits_off_target_fast = 0; + + cpi->rc.rate_error_estimate = 0; + + // Static sequence monitor variables. + twopass->kf_zeromotion_pct = 100; + twopass->last_kfgroup_zeromotion_pct = 100; + + // Initialize bits per macro_block estimate correction factor. + twopass->bpm_factor = 1.0; + // Initialize actual and target bits counters for ARF groups so that + // at the start we have a neutral bpm adjustment. + twopass->rolling_arf_group_target_bits = 1; + twopass->rolling_arf_group_actual_bits = 1; +} + +void av1_init_single_pass_lap(AV1_COMP *cpi) { + TWO_PASS *const twopass = &cpi->twopass; + + if (!twopass->stats_buf_ctx->stats_in_end) return; + + // This variable monitors how far behind the second ref update is lagging. + twopass->sr_update_lag = 1; + + twopass->bits_left = 0; + twopass->modified_error_min = 0.0; + twopass->modified_error_max = 0.0; + twopass->modified_error_left = 0.0; + + // Reset the vbr bits off target counters + cpi->rc.vbr_bits_off_target = 0; + cpi->rc.vbr_bits_off_target_fast = 0; + + cpi->rc.rate_error_estimate = 0; + + // Static sequence monitor variables. + twopass->kf_zeromotion_pct = 100; + twopass->last_kfgroup_zeromotion_pct = 100; + + // Initialize bits per macro_block estimate correction factor. + twopass->bpm_factor = 1.0; + // Initialize actual and target bits counters for ARF groups so that + // at the start we have a neutral bpm adjustment. + twopass->rolling_arf_group_target_bits = 1; + twopass->rolling_arf_group_actual_bits = 1; +} + +#define MINQ_ADJ_LIMIT 48 +#define MINQ_ADJ_LIMIT_CQ 20 +#define HIGH_UNDERSHOOT_RATIO 2 +void av1_twopass_postencode_update(AV1_COMP *cpi) { + TWO_PASS *const twopass = &cpi->twopass; + RATE_CONTROL *const rc = &cpi->rc; + const int bits_used = rc->base_frame_target; + + // VBR correction is done through rc->vbr_bits_off_target. Based on the + // sign of this value, a limited % adjustment is made to the target rate + // of subsequent frames, to try and push it back towards 0. This method + // is designed to prevent extreme behaviour at the end of a clip + // or group of frames. + rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size; + twopass->bits_left = AOMMAX(twopass->bits_left - bits_used, 0); + + // Target vs actual bits for this arf group. + twopass->rolling_arf_group_target_bits += rc->this_frame_target; + twopass->rolling_arf_group_actual_bits += rc->projected_frame_size; + + // Calculate the pct rc error. + if (rc->total_actual_bits) { + rc->rate_error_estimate = + (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits); + rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100); + } else { + rc->rate_error_estimate = 0; + } + + // Update the active best quality pyramid. 
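+  // This frame's base_qindex is recorded for its own pyramid depth and every
+  // deeper layer; later frames at those depths can consult it when choosing
+  // their own active best quality.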
+  if (!rc->is_src_frame_alt_ref) {
+    const int pyramid_level = cpi->gf_group.layer_depth[cpi->gf_group.index];
+    int i;
+    for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i) {
+      rc->active_best_quality[i] = cpi->common.quant_params.base_qindex;
+      // if (pyramid_level >= 2) {
+      //   rc->active_best_quality[pyramid_level] =
+      //     AOMMAX(rc->active_best_quality[pyramid_level],
+      //            cpi->common.base_qindex);
+      // }
+    }
+  }
+
+#if 0
+  {
+    AV1_COMMON *cm = &cpi->common;
+    FILE *fpfile;
+    fpfile = fopen("details.stt", "a");
+    fprintf(fpfile,
+            "%10d %10d %10d %10" PRId64 " %10" PRId64
+            " %10d %10d %10d %10.4lf %10.4lf %10.4lf %10.4lf\n",
+            cm->current_frame.frame_number, rc->base_frame_target,
+            rc->projected_frame_size, rc->total_actual_bits,
+            rc->vbr_bits_off_target, rc->rate_error_estimate,
+            twopass->rolling_arf_group_target_bits,
+            twopass->rolling_arf_group_actual_bits,
+            (double)twopass->rolling_arf_group_actual_bits /
+                (double)twopass->rolling_arf_group_target_bits,
+            twopass->bpm_factor,
+            av1_convert_qindex_to_q(quant_params->base_qindex,
+                                    cm->seq_params.bit_depth),
+            av1_convert_qindex_to_q(rc->active_worst_quality,
+                                    cm->seq_params.bit_depth));
+    fclose(fpfile);
+  }
+#endif
+
+  if (cpi->common.current_frame.frame_type != KEY_FRAME) {
+    twopass->kf_group_bits -= bits_used;
+    twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
+  }
+  twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
+
+  // If the rate control is drifting, consider adjustment to min or maxq.
+  if ((cpi->oxcf.rc_mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref) {
+    const int maxq_adj_limit = rc->worst_quality - rc->active_worst_quality;
+    const int minq_adj_limit =
+        (cpi->oxcf.rc_mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+
+    // Undershoot.
+    if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
+      --twopass->extend_maxq;
+      if (rc->rolling_target_bits >= rc->rolling_actual_bits)
+        ++twopass->extend_minq;
+      // Overshoot.
+    } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
+      --twopass->extend_minq;
+      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+        ++twopass->extend_maxq;
+    } else {
+      // Adjustment for extreme local overshoot.
+      if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+          rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+        ++twopass->extend_maxq;
+
+      // Unwind undershoot or overshoot adjustment.
+      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+        --twopass->extend_minq;
+      else if (rc->rolling_target_bits > rc->rolling_actual_bits)
+        --twopass->extend_maxq;
+    }
+
+    twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
+    twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+
+    // If there is a big and unexpected undershoot then feed the extra
+    // bits back in quickly. One situation where this may happen is if a
+    // frame is unexpectedly almost perfectly predicted by the ARF or GF
+    // but not very well predicted by the previous frame.
+    if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
+      int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
+      if (rc->projected_frame_size < fast_extra_thresh) {
+        rc->vbr_bits_off_target_fast +=
+            fast_extra_thresh - rc->projected_frame_size;
+        rc->vbr_bits_off_target_fast =
+            AOMMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
+
+        // Fast adaptation of minQ if necessary to use up the extra bits.
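+        // (Arithmetic of the line below: a surplus equal to one average
+        // frame's bits maps to 8 qindex steps, and since the surplus is
+        // capped above at 4 * avg_frame_bandwidth, extend_minq_fast tops
+        // out at 32 before the clamp against minq_adj_limit.)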
+ if (rc->avg_frame_bandwidth) { + twopass->extend_minq_fast = + (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth); + } + twopass->extend_minq_fast = AOMMIN( + twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq); + } else if (rc->vbr_bits_off_target_fast) { + twopass->extend_minq_fast = AOMMIN( + twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq); + } else { + twopass->extend_minq_fast = 0; + } + } + } +} diff --git a/libs/libaom/src/av1/encoder/pass2_strategy.h b/libs/libaom/src/av1/encoder/pass2_strategy.h new file mode 100644 index 000000000..437fb8f79 --- /dev/null +++ b/libs/libaom/src/av1/encoder/pass2_strategy.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PASS2_STRATEGY_H_ +#define AOM_AV1_ENCODER_PASS2_STRATEGY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct EncodeFrameParams; +// structure of accumulated stats and features in a gf group +typedef struct { + double gf_group_err; + double gf_group_raw_error; + double gf_group_skip_pct; + double gf_group_inactive_zone_rows; + + double mv_ratio_accumulator; + double decay_accumulator; + double zero_motion_accumulator; + double loop_decay_rate; + double last_loop_decay_rate; + double this_frame_mv_in_out; + double mv_in_out_accumulator; + double abs_mv_in_out_accumulator; + + double avg_sr_coded_error; + double avg_tr_coded_error; + double avg_pcnt_second_ref; + double avg_pcnt_third_ref; + double avg_pcnt_third_ref_nolast; + double avg_new_mv_count; + double avg_wavelet_energy; + double avg_raw_err_stdev; + int non_zero_stdev_count; + + unsigned int allow_alt_ref; +} GF_GROUP_STATS; + +typedef struct { + double frame_err; + double frame_coded_error; + double frame_sr_coded_error; + double frame_tr_coded_error; +} GF_FRAME_STATS; + +void av1_init_second_pass(struct AV1_COMP *cpi); + +void av1_init_single_pass_lap(AV1_COMP *cpi); + +void av1_get_second_pass_params(struct AV1_COMP *cpi, + struct EncodeFrameParams *const frame_params, + const EncodeFrameInput *const frame_input, + unsigned int frame_flags); + +void av1_twopass_postencode_update(struct AV1_COMP *cpi); + +void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc, + GF_GROUP *gf_group, int is_key_frame, int use_arf, + int64_t gf_group_bits); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PASS2_STRATEGY_H_ diff --git a/libs/libaom/src/av1/encoder/pickcdef.c b/libs/libaom/src/av1/encoder/pickcdef.c new file mode 100644 index 000000000..a1092fd59 --- /dev/null +++ b/libs/libaom/src/av1/encoder/pickcdef.c @@ -0,0 +1,587 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/system_state.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/cdef.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/encoder.h"
+
+#define REDUCED_PRI_STRENGTHS_LVL1 8
+#define REDUCED_PRI_STRENGTHS_LVL2 5
+
+#define REDUCED_TOTAL_STRENGTHS_LVL1 \
+  (REDUCED_PRI_STRENGTHS_LVL1 * CDEF_SEC_STRENGTHS)
+#define REDUCED_TOTAL_STRENGTHS_LVL2 \
+  (REDUCED_PRI_STRENGTHS_LVL2 * CDEF_SEC_STRENGTHS)
+#define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
+
+static const int priconv_lvl1[REDUCED_TOTAL_STRENGTHS_LVL1] = { 0, 1, 2, 3,
+                                                                5, 7, 10, 13 };
+static const int priconv_lvl2[REDUCED_TOTAL_STRENGTHS_LVL2] = { 0, 2, 4, 8,
+                                                                14 };
+static const int nb_cdef_strengths[CDEF_PICK_METHODS] = {
+  TOTAL_STRENGTHS, REDUCED_TOTAL_STRENGTHS_LVL1, REDUCED_TOTAL_STRENGTHS_LVL2,
+  TOTAL_STRENGTHS
+};
+
+// Get the primary strength value for the given index and search method.
+static INLINE int get_pri_strength(CDEF_PICK_METHOD pick_method, int pri_idx) {
+  switch (pick_method) {
+    case CDEF_FAST_SEARCH_LVL1: return priconv_lvl1[pri_idx];
+    case CDEF_FAST_SEARCH_LVL2: return priconv_lvl2[pri_idx];
+    default: assert(0 && "Invalid CDEF primary index"); return -1;
+  }
+}
+
+/* Search for the best strength to add as an option, knowing we
+   already selected nb_strengths options. */
+static uint64_t search_one(int *lev, int nb_strengths,
+                           uint64_t mse[][TOTAL_STRENGTHS], int sb_count,
+                           CDEF_PICK_METHOD pick_method) {
+  uint64_t tot_mse[TOTAL_STRENGTHS];
+  const int total_strengths = nb_cdef_strengths[pick_method];
+  int i, j;
+  uint64_t best_tot_mse = (uint64_t)1 << 63;
+  int best_id = 0;
+  memset(tot_mse, 0, sizeof(tot_mse));
+  for (i = 0; i < sb_count; i++) {
+    int gi;
+    uint64_t best_mse = (uint64_t)1 << 63;
+    /* Find best mse among already selected options. */
+    for (gi = 0; gi < nb_strengths; gi++) {
+      if (mse[i][lev[gi]] < best_mse) {
+        best_mse = mse[i][lev[gi]];
+      }
+    }
+    /* Find best mse when adding each possible new option. */
+    for (j = 0; j < total_strengths; j++) {
+      uint64_t best = best_mse;
+      if (mse[i][j] < best) best = mse[i][j];
+      tot_mse[j] += best;
+    }
+  }
+  for (j = 0; j < total_strengths; j++) {
+    if (tot_mse[j] < best_tot_mse) {
+      best_tot_mse = tot_mse[j];
+      best_id = j;
+    }
+  }
+  lev[nb_strengths] = best_id;
+  return best_tot_mse;
+}
+
+/* Search for the best luma+chroma strength to add as an option, knowing we
+   already selected nb_strengths options. */
+static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths,
+                                uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count,
+                                CDEF_PICK_METHOD pick_method) {
+  uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS];
+  int i, j;
+  uint64_t best_tot_mse = (uint64_t)1 << 63;
+  int best_id0 = 0;
+  int best_id1 = 0;
+  const int total_strengths = nb_cdef_strengths[pick_method];
+  memset(tot_mse, 0, sizeof(tot_mse));
+  for (i = 0; i < sb_count; i++) {
+    int gi;
+    uint64_t best_mse = (uint64_t)1 << 63;
+    /* Find best mse among already selected options. */
+    for (gi = 0; gi < nb_strengths; gi++) {
+      uint64_t curr = mse[0][i][lev0[gi]];
+      curr += mse[1][i][lev1[gi]];
+      if (curr < best_mse) {
+        best_mse = curr;
+      }
+    }
+    /* Find best mse when adding each possible new option. */
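+    /* (Illustrative cost note, ours, not from the source: the dual
+       luma+chroma search below evaluates every (j, k) pair, i.e.
+       total_strengths * total_strengths candidates per superblock, versus
+       total_strengths for the single-plane search_one() above.) */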
+    for (j = 0; j < total_strengths; j++) {
+      int k;
+      for (k = 0; k < total_strengths; k++) {
+        uint64_t best = best_mse;
+        uint64_t curr = mse[0][i][j];
+        curr += mse[1][i][k];
+        if (curr < best) best = curr;
+        tot_mse[j][k] += best;
+      }
+    }
+  }
+  for (j = 0; j < total_strengths; j++) {
+    int k;
+    for (k = 0; k < total_strengths; k++) {
+      if (tot_mse[j][k] < best_tot_mse) {
+        best_tot_mse = tot_mse[j][k];
+        best_id0 = j;
+        best_id1 = k;
+      }
+    }
+  }
+  lev0[nb_strengths] = best_id0;
+  lev1[nb_strengths] = best_id1;
+  return best_tot_mse;
+}
+
+/* Search for the set of strengths that minimizes mse. */
+static uint64_t joint_strength_search(int *best_lev, int nb_strengths,
+                                      uint64_t mse[][TOTAL_STRENGTHS],
+                                      int sb_count,
+                                      CDEF_PICK_METHOD pick_method) {
+  uint64_t best_tot_mse;
+  int fast = (pick_method == CDEF_FAST_SEARCH_LVL1 ||
+              pick_method == CDEF_FAST_SEARCH_LVL2);
+  int i;
+  best_tot_mse = (uint64_t)1 << 63;
+  /* Greedy search: add one strength option at a time. */
+  for (i = 0; i < nb_strengths; i++) {
+    best_tot_mse = search_one(best_lev, i, mse, sb_count, pick_method);
+  }
+  /* Trying to refine the greedy search by reconsidering each
+     already-selected option. */
+  if (!fast) {
+    for (i = 0; i < 4 * nb_strengths; i++) {
+      int j;
+      for (j = 0; j < nb_strengths - 1; j++) best_lev[j] = best_lev[j + 1];
+      best_tot_mse =
+          search_one(best_lev, nb_strengths - 1, mse, sb_count, pick_method);
+    }
+  }
+  return best_tot_mse;
+}
+
+/* Search for the set of luma+chroma strengths that minimizes mse. */
+static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1,
+                                           int nb_strengths,
+                                           uint64_t (**mse)[TOTAL_STRENGTHS],
+                                           int sb_count,
+                                           CDEF_PICK_METHOD pick_method) {
+  uint64_t best_tot_mse;
+  int i;
+  best_tot_mse = (uint64_t)1 << 63;
+  /* Greedy search: add one strength option at a time. */
+  for (i = 0; i < nb_strengths; i++) {
+    best_tot_mse =
+        search_one_dual(best_lev0, best_lev1, i, mse, sb_count, pick_method);
+  }
+  /* Trying to refine the greedy search by reconsidering each
+     already-selected option. */
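+  /* (Each refinement pass below drops the oldest selected pair, shifts the
+     rest down one slot, and re-runs search_one_dual() to re-pick the freed
+     slot; 4 * nb_strengths passes give every slot several chances to be
+     reconsidered.) */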
+  for (i = 0; i < 4 * nb_strengths; i++) {
+    int j;
+    for (j = 0; j < nb_strengths - 1; j++) {
+      best_lev0[j] = best_lev0[j + 1];
+      best_lev1[j] = best_lev1[j + 1];
+    }
+    best_tot_mse = search_one_dual(best_lev0, best_lev1, nb_strengths - 1, mse,
+                                   sb_count, pick_method);
+  }
+  return best_tot_mse;
+}
+
+typedef void (*copy_fn_t)(uint16_t *dst, int dstride, const void *src,
+                          int src_voffset, int src_hoffset, int sstride,
+                          int vsize, int hsize);
+typedef uint64_t (*compute_cdef_dist_t)(void *dst, int dstride, uint16_t *src,
+                                        cdef_list *dlist, int cdef_count,
+                                        BLOCK_SIZE bsize, int coeff_shift,
+                                        int row, int col);
+
+static void copy_sb16_16_highbd(uint16_t *dst, int dstride, const void *src,
+                                int src_voffset, int src_hoffset, int sstride,
+                                int vsize, int hsize) {
+  int r;
+  const uint16_t *src16 = CONVERT_TO_SHORTPTR((uint8_t *)src);
+  const uint16_t *base = &src16[src_voffset * sstride + src_hoffset];
+  for (r = 0; r < vsize; r++)
+    memcpy(dst + r * dstride, base + r * sstride, hsize * sizeof(*base));
+}
+
+static void copy_sb16_16(uint16_t *dst, int dstride, const void *src,
+                         int src_voffset, int src_hoffset, int sstride,
+                         int vsize, int hsize) {
+  int r, c;
+  const uint8_t *src8 = (uint8_t *)src;
+  const uint8_t *base = &src8[src_voffset * sstride + src_hoffset];
+  for (r = 0; r < vsize; r++)
+    for (c = 0; c < hsize; c++)
+      dst[r * dstride + c] = (uint16_t)base[r * sstride + c];
+}
+
+static INLINE uint64_t mse_wxh_16bit_highbd(uint16_t *dst, int dstride,
+                                            uint16_t *src, int sstride, int w,
+                                            int h) {
+  uint64_t sum = 0;
+  int i, j;
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      int e = dst[i * dstride + j] - src[i * sstride + j];
+      sum += e * e;
+    }
+  }
+  return sum;
+}
+
+static INLINE uint64_t mse_wxh_16bit(uint8_t *dst, int dstride, uint16_t *src,
+                                     int sstride, int w, int h) {
+  uint64_t sum = 0;
+  int i, j;
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
+      sum += e * e;
+    }
+  }
+  return sum;
+}
+
+static INLINE void init_src_params(int *src_stride, int *width, int *height,
+                                   int *width_log2, int *height_log2,
+                                   BLOCK_SIZE bsize) {
+  *src_stride = block_size_wide[bsize];
+  *width = block_size_wide[bsize];
+  *height = block_size_high[bsize];
+  *width_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize];
+  *height_log2 = MI_SIZE_LOG2 + mi_size_high_log2[bsize];
+}
+
+/* Compute MSE only on the blocks we filtered.
*/ +static uint64_t compute_cdef_dist_highbd(void *dst, int dstride, uint16_t *src, + cdef_list *dlist, int cdef_count, + BLOCK_SIZE bsize, int coeff_shift, + int row, int col) { + assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || + bsize == BLOCK_8X8); + uint64_t sum = 0; + int bi, bx, by; + uint16_t *dst16 = CONVERT_TO_SHORTPTR((uint8_t *)dst); + uint16_t *dst_buff = &dst16[row * dstride + col]; + int src_stride, width, height, width_log2, height_log2; + init_src_params(&src_stride, &width, &height, &width_log2, &height_log2, + bsize); + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + sum += mse_wxh_16bit_highbd( + &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride, + &src[bi << (height_log2 + width_log2)], src_stride, width, height); + } + return sum >> 2 * coeff_shift; +} + +static uint64_t compute_cdef_dist(void *dst, int dstride, uint16_t *src, + cdef_list *dlist, int cdef_count, + BLOCK_SIZE bsize, int coeff_shift, int row, + int col) { + assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || + bsize == BLOCK_8X8); + uint64_t sum = 0; + int bi, bx, by; + uint8_t *dst8 = (uint8_t *)dst; + uint8_t *dst_buff = &dst8[row * dstride + col]; + int src_stride, width, height, width_log2, height_log2; + init_src_params(&src_stride, &width, &height, &width_log2, &height_log2, + bsize); + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + sum += mse_wxh_16bit( + &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride, + &src[bi << (height_log2 + width_log2)], src_stride, width, height); + } + return sum >> 2 * coeff_shift; +} + +static int sb_all_skip(const CommonModeInfoParams *const mi_params, int mi_row, + int mi_col) { + const int maxr = AOMMIN(mi_params->mi_rows - mi_row, MI_SIZE_64X64); + const int maxc = AOMMIN(mi_params->mi_cols - mi_col, MI_SIZE_64X64); + const int stride = mi_params->mi_stride; + MB_MODE_INFO **mbmi = mi_params->mi_grid_base + mi_row * stride + mi_col; + for (int r = 0; r < maxr; ++r, mbmi += stride) { + for (int c = 0; c < maxc; ++c) { + if (!mbmi[c]->skip) return 0; + } + } + return 1; +} + +static void pick_cdef_from_qp(AV1_COMMON *const cm) { + const int bd = cm->seq_params.bit_depth; + const int q = + av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, bd) >> (bd - 8); + CdefInfo *const cdef_info = &cm->cdef_info; + cdef_info->cdef_bits = 0; + cdef_info->nb_cdef_strengths = 1; + cdef_info->cdef_damping = 3 + (cm->quant_params.base_qindex >> 6); + + int predicted_y_f1 = 0; + int predicted_y_f2 = 0; + int predicted_uv_f1 = 0; + int predicted_uv_f2 = 0; + aom_clear_system_state(); + if (!frame_is_intra_only(cm)) { + predicted_y_f1 = clamp((int)roundf(q * q * -0.0000023593946f + + q * 0.0068615186f + 0.02709886f), + 0, 15); + predicted_y_f2 = clamp((int)roundf(q * q * -0.00000057629734f + + q * 0.0013993345f + 0.03831067f), + 0, 3); + predicted_uv_f1 = clamp((int)roundf(q * q * -0.0000007095069f + + q * 0.0034628846f + 0.00887099f), + 0, 15); + predicted_uv_f2 = clamp((int)roundf(q * q * 0.00000023874085f + + q * 0.00028223585f + 0.05576307f), + 0, 3); + } else { + predicted_y_f1 = clamp( + (int)roundf(q * q * 0.0000033731974f + q * 0.008070594f + 0.0187634f), + 0, 15); + predicted_y_f2 = clamp( + (int)roundf(q * q * 0.0000029167343f + q * 0.0027798624f + 0.0079405f), + 0, 3); + predicted_uv_f1 = clamp( + (int)roundf(q * q * -0.0000130790995f + q * 0.012892405f - 0.00748388f), + 0, 15); + predicted_uv_f2 = clamp((int)roundf(q * q 
* 0.0000032651783f + + q * 0.00035520183f + 0.00228092f), + 0, 3); + } + cdef_info->cdef_strengths[0] = + predicted_y_f1 * CDEF_SEC_STRENGTHS + predicted_y_f2; + cdef_info->cdef_uv_strengths[0] = + predicted_uv_f1 * CDEF_SEC_STRENGTHS + predicted_uv_f2; + + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + MB_MODE_INFO **mbmi = mi_params->mi_grid_base; + for (int r = 0; r < nvfb; ++r) { + for (int c = 0; c < nhfb; ++c) { + mbmi[MI_SIZE_64X64 * c]->cdef_strength = 0; + } + mbmi += MI_SIZE_64X64 * mi_params->mi_stride; + } +} + +void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, + AV1_COMMON *cm, MACROBLOCKD *xd, int pick_method, + int rdmult) { + if (pick_method == CDEF_PICK_FROM_Q) { + pick_cdef_from_qp(cm); + return; + } + + cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128]; + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } }; + int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } }; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + int *sb_index = aom_malloc(nvfb * nhfb * sizeof(*sb_index)); + const int damping = 3 + (cm->quant_params.base_qindex >> 6); + const int fast = (pick_method == CDEF_FAST_SEARCH_LVL1 || + pick_method == CDEF_FAST_SEARCH_LVL2); + const int total_strengths = nb_cdef_strengths[pick_method]; + DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]); + const int num_planes = av1_num_planes(cm); + av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, + num_planes); + uint64_t(*mse[2])[TOTAL_STRENGTHS]; + mse[0] = aom_malloc(sizeof(**mse) * nvfb * nhfb); + mse[1] = aom_malloc(sizeof(**mse) * nvfb * nhfb); + + int bsize[3]; + int mi_wide_l2[3]; + int mi_high_l2[3]; + int xdec[3]; + int ydec[3]; + uint8_t *ref_buffer[3] = { ref->y_buffer, ref->u_buffer, ref->v_buffer }; + int ref_stride[3] = { ref->y_stride, ref->uv_stride, ref->uv_stride }; + + for (int pli = 0; pli < num_planes; pli++) { + xdec[pli] = xd->plane[pli].subsampling_x; + ydec[pli] = xd->plane[pli].subsampling_y; + bsize[pli] = ydec[pli] ? (xdec[pli] ? BLOCK_4X4 : BLOCK_8X4) + : (xdec[pli] ? 
BLOCK_4X8 : BLOCK_8X8); + mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x; + mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y; + } + + copy_fn_t copy_fn; + compute_cdef_dist_t compute_cdef_dist_fn; + + if (cm->seq_params.use_highbitdepth) { + copy_fn = copy_sb16_16_highbd; + compute_cdef_dist_fn = compute_cdef_dist_highbd; + } else { + copy_fn = copy_sb16_16; + compute_cdef_dist_fn = compute_cdef_dist; + } + + DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]); + uint16_t *const in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER; + const int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0); + int sb_count = 0; + for (int fbr = 0; fbr < nvfb; ++fbr) { + for (int fbc = 0; fbc < nhfb; ++fbc) { + // No filtering if the entire filter block is skipped + if (sb_all_skip(mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64)) + continue; + + const MB_MODE_INFO *const mbmi = + mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc]; + if (((fbc & 1) && + (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64)) || + ((fbr & 1) && + (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_64X128))) + continue; + + int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc); + int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr); + int hb_step = 1; + int vb_step = 1; + BLOCK_SIZE bs; + if (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64 || + mbmi->sb_type == BLOCK_64X128) { + bs = mbmi->sb_type; + if (bs == BLOCK_128X128 || bs == BLOCK_128X64) { + nhb = + AOMMIN(MI_SIZE_128X128, mi_params->mi_cols - MI_SIZE_64X64 * fbc); + hb_step = 2; + } + if (bs == BLOCK_128X128 || bs == BLOCK_64X128) { + nvb = + AOMMIN(MI_SIZE_128X128, mi_params->mi_rows - MI_SIZE_64X64 * fbr); + vb_step = 2; + } + } else { + bs = BLOCK_64X64; + } + + const int cdef_count = av1_cdef_compute_sb_list( + mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, bs); + + const int yoff = CDEF_VBORDER * (fbr != 0); + const int xoff = CDEF_HBORDER * (fbc != 0); + int dirinit = 0; + for (int pli = 0; pli < num_planes; pli++) { + for (int i = 0; i < CDEF_INBUF_SIZE; i++) inbuf[i] = CDEF_VERY_LARGE; + /* We avoid filtering the pixels for which some of the pixels to + average are outside the frame. We could change the filter instead, + but it would add special cases for any future vectorization. 
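+         (Illustrative note, ours, not from the source: for an interior
+         64x64 luma block the copy below spans 64 + 2 * CDEF_VBORDER rows
+         and 64 + 2 * CDEF_HBORDER columns; at a frame edge the
+         corresponding border term is dropped.)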
*/ + const int ysize = (nvb << mi_high_l2[pli]) + + CDEF_VBORDER * (fbr + vb_step < nvfb) + yoff; + const int xsize = (nhb << mi_wide_l2[pli]) + + CDEF_HBORDER * (fbc + hb_step < nhfb) + xoff; + const int row = fbr * MI_SIZE_64X64 << mi_high_l2[pli]; + const int col = fbc * MI_SIZE_64X64 << mi_wide_l2[pli]; + for (int gi = 0; gi < total_strengths; gi++) { + int pri_strength = gi / CDEF_SEC_STRENGTHS; + if (fast) pri_strength = get_pri_strength(pick_method, pri_strength); + const int sec_strength = gi % CDEF_SEC_STRENGTHS; + copy_fn(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE, + xd->plane[pli].dst.buf, row - yoff, col - xoff, + xd->plane[pli].dst.stride, ysize, xsize); + av1_cdef_filter_fb( + NULL, tmp_dst, CDEF_BSTRIDE, in, xdec[pli], ydec[pli], dir, + &dirinit, var, pli, dlist, cdef_count, pri_strength, + sec_strength + (sec_strength == 3), damping, coeff_shift); + const uint64_t curr_mse = compute_cdef_dist_fn( + ref_buffer[pli], ref_stride[pli], tmp_dst, dlist, cdef_count, + bsize[pli], coeff_shift, row, col); + if (pli < 2) + mse[pli][sb_count][gi] = curr_mse; + else + mse[1][sb_count][gi] += curr_mse; + } + } + sb_index[sb_count++] = + MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc; + } + } + + /* Search for different number of signalling bits. */ + int nb_strength_bits = 0; + uint64_t best_rd = UINT64_MAX; + CdefInfo *const cdef_info = &cm->cdef_info; + for (int i = 0; i <= 3; i++) { + int best_lev0[CDEF_MAX_STRENGTHS]; + int best_lev1[CDEF_MAX_STRENGTHS] = { 0 }; + const int nb_strengths = 1 << i; + uint64_t tot_mse; + if (num_planes > 1) { + tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths, + mse, sb_count, pick_method); + } else { + tot_mse = joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count, + pick_method); + } + + const int total_bits = sb_count * i + nb_strengths * CDEF_STRENGTH_BITS * + (num_planes > 1 ? 
2 : 1);
+    const int rate_cost = av1_cost_literal(total_bits);
+    const uint64_t dist = tot_mse * 16;
+    const uint64_t rd = RDCOST(rdmult, rate_cost, dist);
+    if (rd < best_rd) {
+      best_rd = rd;
+      nb_strength_bits = i;
+      memcpy(cdef_info->cdef_strengths, best_lev0,
+             nb_strengths * sizeof(best_lev0[0]));
+      if (num_planes > 1) {
+        memcpy(cdef_info->cdef_uv_strengths, best_lev1,
+               nb_strengths * sizeof(best_lev1[0]));
+      }
+    }
+  }
+
+  cdef_info->cdef_bits = nb_strength_bits;
+  cdef_info->nb_cdef_strengths = 1 << nb_strength_bits;
+  for (int i = 0; i < sb_count; i++) {
+    uint64_t best_mse = UINT64_MAX;
+    int best_gi = 0;
+    for (int gi = 0; gi < cdef_info->nb_cdef_strengths; gi++) {
+      uint64_t curr = mse[0][i][cdef_info->cdef_strengths[gi]];
+      if (num_planes > 1) curr += mse[1][i][cdef_info->cdef_uv_strengths[gi]];
+      if (curr < best_mse) {
+        best_gi = gi;
+        best_mse = curr;
+      }
+    }
+    mi_params->mi_grid_base[sb_index[i]]->cdef_strength = best_gi;
+  }
+
+  if (fast) {
+    for (int j = 0; j < cdef_info->nb_cdef_strengths; j++) {
+      const int luma_strength = cdef_info->cdef_strengths[j];
+      const int chroma_strength = cdef_info->cdef_uv_strengths[j];
+      int pri_strength;
+      pri_strength =
+          get_pri_strength(pick_method, luma_strength / CDEF_SEC_STRENGTHS);
+      cdef_info->cdef_strengths[j] = pri_strength * CDEF_SEC_STRENGTHS +
+                                     (luma_strength % CDEF_SEC_STRENGTHS);
+      pri_strength =
+          get_pri_strength(pick_method, chroma_strength / CDEF_SEC_STRENGTHS);
+      cdef_info->cdef_uv_strengths[j] = pri_strength * CDEF_SEC_STRENGTHS +
+                                        (chroma_strength % CDEF_SEC_STRENGTHS);
+    }
+  }
+
+  cdef_info->cdef_damping = damping;
+
+  aom_free(mse[0]);
+  aom_free(mse[1]);
+  aom_free(sb_index);
+}
diff --git a/libs/libaom/src/av1/encoder/picklpf.c b/libs/libaom/src/av1/encoder/picklpf.c
new file mode 100644
index 000000000..17c996551
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/picklpf.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/quant_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/picklpf.h"
+
+static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc,
+                            YV12_BUFFER_CONFIG *dst_bc, int plane) {
+  switch (plane) {
+    case 0: aom_yv12_copy_y(src_bc, dst_bc); break;
+    case 1: aom_yv12_copy_u(src_bc, dst_bc); break;
+    case 2: aom_yv12_copy_v(src_bc, dst_bc); break;
+    default: assert(plane >= 0 && plane <= 2); break;
+  }
+}
+
+int av1_get_max_filter_level(const AV1_COMP *cpi) {
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    return cpi->twopass.section_intra_rating > 8 ?
MAX_LOOP_FILTER * 3 / 4 + : MAX_LOOP_FILTER; + } else { + return MAX_LOOP_FILTER; + } +} + +static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, + AV1_COMP *const cpi, int filt_level, + int partial_frame, int plane, int dir) { + AV1_COMMON *const cm = &cpi->common; + int64_t filt_err; + + assert(plane >= 0 && plane <= 2); + int filter_level[2] = { filt_level, filt_level }; + if (plane == 0 && dir == 0) filter_level[1] = cm->lf.filter_level[1]; + if (plane == 0 && dir == 1) filter_level[0] = cm->lf.filter_level[0]; + + // set base filters for use of av1_get_filter_level when in DELTA_LF mode + switch (plane) { + case 0: + cm->lf.filter_level[0] = filter_level[0]; + cm->lf.filter_level[1] = filter_level[1]; + break; + case 1: cm->lf.filter_level_u = filter_level[0]; break; + case 2: cm->lf.filter_level_v = filter_level[0]; break; + } + + // TODO(any): please enable multi-thread and remove the flag when loop + // filter mask is compatible with multi-thread. + if (cpi->num_workers > 1) + av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane, + plane + 1, partial_frame, +#if CONFIG_LPF_MASK + 0, +#endif + cpi->workers, cpi->num_workers, &cpi->lf_row_sync); + else + av1_loop_filter_frame(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, +#if CONFIG_LPF_MASK + 0, +#endif + plane, plane + 1, partial_frame); + + filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane, + cm->seq_params.use_highbitdepth); + + // Re-instate the unfiltered frame + yv12_copy_plane(&cpi->last_frame_uf, &cm->cur_frame->buf, plane); + + return filt_err; +} + +static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, + int partial_frame, + const int *last_frame_filter_level, + double *best_cost_ret, int plane, int dir) { + const AV1_COMMON *const cm = &cpi->common; + const int min_filter_level = 0; + const int max_filter_level = av1_get_max_filter_level(cpi); + int filt_direction = 0; + int64_t best_err; + int filt_best; + MACROBLOCK *x = &cpi->td.mb; + + // Start the search at the previous frame filter level unless it is now out of + // range. + int lvl; + switch (plane) { + case 0: + switch (dir) { + case 2: + lvl = (last_frame_filter_level[0] + last_frame_filter_level[1] + 1) >> + 1; + break; + case 0: + case 1: lvl = last_frame_filter_level[dir]; break; + default: assert(dir >= 0 && dir <= 2); return 0; + } + break; + case 1: lvl = last_frame_filter_level[2]; break; + case 2: lvl = last_frame_filter_level[3]; break; + default: assert(plane >= 0 && plane <= 2); return 0; + } + int filt_mid = clamp(lvl, min_filter_level, max_filter_level); + int filter_step = filt_mid < 16 ? 4 : filt_mid / 4; + // Sum squared error at each filter level + int64_t ss_err[MAX_LOOP_FILTER + 1]; + + // Set each entry to -1 + memset(ss_err, 0xFF, sizeof(ss_err)); + yv12_copy_plane(&cm->cur_frame->buf, &cpi->last_frame_uf, plane); + best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir); + filt_best = filt_mid; + ss_err[filt_mid] = best_err; + + while (filter_step > 0) { + const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level); + const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level); + + // Bias against raising loop filter in favor of lowering it. 
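+    // (Worked example for the line below: at filt_mid = 32 the shift is
+    // 15 - 32 / 8 = 11, so each unit of filter_step adds best_err / 2048 of
+    // bias; higher filter levels shrink the shift and so raise the bias.)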
+ int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; + + if ((is_stat_consumption_stage_twopass(cpi)) && + (cpi->twopass.section_intra_rating < 20)) + bias = (bias * cpi->twopass.section_intra_rating) / 20; + + // Bias less for large block sizes. + if (cm->features.tx_mode != ONLY_4X4) bias >>= 1; + + if (filt_direction <= 0 && filt_low != filt_mid) { + // Get Low filter error score + if (ss_err[filt_low] < 0) { + ss_err[filt_low] = + try_filter_frame(sd, cpi, filt_low, partial_frame, plane, dir); + } + // If value is close to the best so far then bias towards a lower loop + // filter value. + if (ss_err[filt_low] < (best_err + bias)) { + // Was it actually better than the previous best? + if (ss_err[filt_low] < best_err) { + best_err = ss_err[filt_low]; + } + filt_best = filt_low; + } + } + + // Now look at filt_high + if (filt_direction >= 0 && filt_high != filt_mid) { + if (ss_err[filt_high] < 0) { + ss_err[filt_high] = + try_filter_frame(sd, cpi, filt_high, partial_frame, plane, dir); + } + // Raise the filter value only if it is significantly better than the + // previous best; the bias counts against raising it. + if (ss_err[filt_high] < (best_err - bias)) { + best_err = ss_err[filt_high]; + filt_best = filt_high; + } + } + + // Halve the step distance if the best filter value was the same as last time + if (filt_best == filt_mid) { + filter_step /= 2; + filt_direction = 0; + } else { + filt_direction = (filt_best < filt_mid) ? -1 : 1; + filt_mid = filt_best; + } + } + + // Update best error + best_err = ss_err[filt_best]; + + if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err); + return filt_best; +} + +void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, + LPF_PICK_METHOD method) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + struct loopfilter *const lf = &cm->lf; + (void)sd; + + lf->sharpness_level = 0; + cpi->td.mb.rdmult = cpi->rd.RDMULT; + + if (method == LPF_PICK_MINIMAL_LPF) { + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; + } else if (method >= LPF_PICK_FROM_Q) { + const int min_filter_level = 0; + const int max_filter_level = av1_get_max_filter_level(cpi); + const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, + cm->seq_params.bit_depth); + // Based on test results for the rtc test set: + // 0.04590 boosted or 0.02295 non-boosted in 18-bit fixed point. + const int strength_boost_q_treshold = 700; + const int inter_frame_multiplier = + q > strength_boost_q_treshold ? 12034 : 6017; + // These values were determined by linear fitting the result of the + // searched level for 8 bit depth: + // Keyframes: filt_guess = q * 0.06699 - 1.60817 + // Other frames: filt_guess = q * inter_frame_multiplier + 2.48225 + // + // And high bit depth separately: + // filt_guess = q * 0.316206 + 3.87252 + int filt_guess; + switch (cm->seq_params.bit_depth) { + case AOM_BITS_8: + filt_guess = + (cm->current_frame.frame_type == KEY_FRAME) + ?
ROUND_POWER_OF_TWO(q * 17563 - 421574, 18) + : ROUND_POWER_OF_TWO(q * inter_frame_multiplier + 650707, 18); + break; + case AOM_BITS_10: + filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20); + break; + case AOM_BITS_12: + filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22); + break; + default: + assert(0 && + "bit_depth should be AOM_BITS_8, AOM_BITS_10 " + "or AOM_BITS_12"); + return; + } + if (cm->seq_params.bit_depth != AOM_BITS_8 && + cm->current_frame.frame_type == KEY_FRAME) + filt_guess -= 4; + // TODO(chengchen): retrain the model for Y, U, V filter levels + lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level); + lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level); + lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level); + lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level); + } else { + const int last_frame_filter_level[4] = { lf->filter_level[0], + lf->filter_level[1], + lf->filter_level_u, + lf->filter_level_v }; + + lf->filter_level[0] = lf->filter_level[1] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 0, 2); + if (method != LPF_PICK_FROM_FULL_IMAGE_NON_DUAL) { + lf->filter_level[0] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 0, 0); + lf->filter_level[1] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 0, 1); + } + + if (num_planes > 1) { + lf->filter_level_u = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 1, 0); + lf->filter_level_v = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 2, 0); + } + } +} diff --git a/libs/libaom/src/av1/encoder/picklpf.h b/libs/libaom/src/av1/encoder/picklpf.h new file mode 100644 index 000000000..357097ae1 --- /dev/null +++ b/libs/libaom/src/av1/encoder/picklpf.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PICKLPF_H_ +#define AOM_AV1_ENCODER_PICKLPF_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/encoder.h" + +struct yv12_buffer_config; +struct AV1_COMP; +int av1_get_max_filter_level(const AV1_COMP *cpi); +void av1_pick_filter_level(const struct yv12_buffer_config *sd, + struct AV1_COMP *cpi, LPF_PICK_METHOD method); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PICKLPF_H_ diff --git a/libs/libaom/src/av1/encoder/pickrst.c b/libs/libaom/src/av1/encoder/pickrst.c new file mode 100644 index 000000000..ccbe1cc3e --- /dev/null +++ b/libs/libaom/src/av1/encoder/pickrst.c @@ -0,0 +1,1768 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <float.h> +#include <limits.h> +#include <math.h> + +#include "config/aom_scale_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/binary_codes_writer.h" +#include "aom_dsp/psnr.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/quant_common.h" +#include "av1/common/restoration.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/mathutils.h" +#include "av1/encoder/picklpf.h" +#include "av1/encoder/pickrst.h" + +// When set to RESTORE_WIENER or RESTORE_SGRPROJ only those are allowed. +// When set to RESTORE_TYPES we allow switchable. +static const RestorationType force_restore_type = RESTORE_TYPES; + +// Number of Wiener iterations +#define NUM_WIENER_ITERS 5 + +// Penalty factor for use of dual sgr +#define DUAL_SGR_PENALTY_MULT 0.01 + +// Working precision for Wiener filter coefficients +#define WIENER_TAP_SCALE_FACTOR ((int64_t)1 << 16) + +#define SGRPROJ_EP_GRP1_START_IDX 0 +#define SGRPROJ_EP_GRP1_END_IDX 9 +#define SGRPROJ_EP_GRP1_SEARCH_COUNT 4 +#define SGRPROJ_EP_GRP2_3_SEARCH_COUNT 2 +static const int sgproj_ep_grp1_seed[SGRPROJ_EP_GRP1_SEARCH_COUNT] = { 0, 3, 6, + 9 }; +static const int sgproj_ep_grp2_3[SGRPROJ_EP_GRP2_3_SEARCH_COUNT][14] = { + { 10, 10, 11, 11, 12, 12, 13, 13, 13, 13, -1, -1, -1, -1 }, + { 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15 } +}; + +typedef int64_t (*sse_extractor_type)(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +typedef int64_t (*sse_part_extractor_type)(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, + int hstart, int width, int vstart, + int height); +typedef uint64_t (*var_part_extractor_type)(const YV12_BUFFER_CONFIG *a, + int hstart, int width, int vstart, + int height); + +#if CONFIG_AV1_HIGHBITDEPTH +#define NUM_EXTRACTORS (3 * (1 + 1)) +#else +#define NUM_EXTRACTORS 3 +#endif +static const sse_part_extractor_type sse_part_extractors[NUM_EXTRACTORS] = { + aom_get_y_sse_part, aom_get_u_sse_part, + aom_get_v_sse_part, +#if CONFIG_AV1_HIGHBITDEPTH + aom_highbd_get_y_sse_part, aom_highbd_get_u_sse_part, + aom_highbd_get_v_sse_part, +#endif +}; +static const var_part_extractor_type var_part_extractors[NUM_EXTRACTORS] = { + aom_get_y_var, aom_get_u_var, aom_get_v_var, +#if CONFIG_AV1_HIGHBITDEPTH + aom_highbd_get_y_var, aom_highbd_get_u_var, aom_highbd_get_v_var, +#endif +}; + +static int64_t sse_restoration_unit(const RestorationTileLimits *limits, + const YV12_BUFFER_CONFIG *src, + const YV12_BUFFER_CONFIG *dst, int plane, + int highbd) { + return sse_part_extractors[3 * highbd + plane]( + src, dst, limits->h_start, limits->h_end - limits->h_start, + limits->v_start, limits->v_end - limits->v_start); +} + +static uint64_t var_restoration_unit(const RestorationTileLimits *limits, + const YV12_BUFFER_CONFIG *src, int plane, + int highbd) { + return var_part_extractors[3 * highbd + plane]( + src, limits->h_start, limits->h_end - limits->h_start, limits->v_start, + limits->v_end - limits->v_start); +} + +typedef struct { + // The best coefficients for Wiener or Sgrproj restoration + WienerInfo
wiener; + SgrprojInfo sgrproj; + + // The sum of squared errors for this rtype. + int64_t sse[RESTORE_SWITCHABLE_TYPES]; + + // The rtype to use for this unit given a frame rtype as + // index. Indices: WIENER, SGRPROJ, SWITCHABLE. + RestorationType best_rtype[RESTORE_TYPES - 1]; + + // This flag will be set based on the speed feature + // 'prune_sgr_based_on_wiener'. 0 implies no pruning and 1 implies pruning. + uint8_t skip_sgr_eval; +} RestUnitSearchInfo; + +typedef struct { + const YV12_BUFFER_CONFIG *src; + YV12_BUFFER_CONFIG *dst; + + const AV1_COMMON *cm; + const MACROBLOCK *x; + int plane; + int plane_width; + int plane_height; + RestUnitSearchInfo *rusi; + + // Speed features + const SPEED_FEATURES *sf; + + uint8_t *dgd_buffer; + int dgd_stride; + const uint8_t *src_buffer; + int src_stride; + + // sse and bits are initialised by reset_rsc in search_rest_type + int64_t sse; + int64_t bits; + int tile_y0, tile_stripe0; + + // sgrproj and wiener are initialised by rsc_on_tile when starting the first + // tile in the frame. + SgrprojInfo sgrproj; + WienerInfo wiener; + AV1PixelRect tile_rect; +} RestSearchCtxt; + +static AOM_INLINE void rsc_on_tile(void *priv) { + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + set_default_sgrproj(&rsc->sgrproj); + set_default_wiener(&rsc->wiener); + rsc->tile_stripe0 = 0; +} + +static AOM_INLINE void reset_rsc(RestSearchCtxt *rsc) { + rsc->sse = 0; + rsc->bits = 0; +} + +static AOM_INLINE void init_rsc(const YV12_BUFFER_CONFIG *src, + const AV1_COMMON *cm, const MACROBLOCK *x, + const SPEED_FEATURES *sf, int plane, + RestUnitSearchInfo *rusi, + YV12_BUFFER_CONFIG *dst, RestSearchCtxt *rsc) { + rsc->src = src; + rsc->dst = dst; + rsc->cm = cm; + rsc->x = x; + rsc->plane = plane; + rsc->rusi = rusi; + rsc->sf = sf; + + const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf; + const int is_uv = plane != AOM_PLANE_Y; + rsc->plane_width = src->crop_widths[is_uv]; + rsc->plane_height = src->crop_heights[is_uv]; + rsc->src_buffer = src->buffers[plane]; + rsc->src_stride = src->strides[is_uv]; + rsc->dgd_buffer = dgd->buffers[plane]; + rsc->dgd_stride = dgd->strides[is_uv]; + rsc->tile_rect = av1_whole_frame_rect(cm, is_uv); + assert(src->crop_widths[is_uv] == dgd->crop_widths[is_uv]); + assert(src->crop_heights[is_uv] == dgd->crop_heights[is_uv]); +} + +static int64_t try_restoration_unit(const RestSearchCtxt *rsc, + const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, + const RestorationUnitInfo *rui) { + const AV1_COMMON *const cm = rsc->cm; + const int plane = rsc->plane; + const int is_uv = plane > 0; + const RestorationInfo *rsi = &cm->rst_info[plane]; + RestorationLineBuffers rlbs; + const int bit_depth = cm->seq_params.bit_depth; + const int highbd = cm->seq_params.use_highbitdepth; + + const YV12_BUFFER_CONFIG *fts = &cm->cur_frame->buf; + // TODO(yunqing): For now, only use optimized LR filter in decoder. Can be + // also used in encoder. 
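+ // Note on the flag below (editorial, inferred from the TODO above):
+ // av1_loop_restoration_filter_unit() takes optimized_lr as its final
+ // argument, so passing 0 keeps the encoder's trial filtering on the
+ // plain (non-optimized) loop-restoration path described in the TODO.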
+ const int optimized_lr = 0; + + av1_loop_restoration_filter_unit( + limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0, + is_uv && cm->seq_params.subsampling_x, + is_uv && cm->seq_params.subsampling_y, highbd, bit_depth, + fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane], + rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr); + + return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd); +} + +int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, int32_t *flt1, + int flt1_stride, int xq[2], + const sgr_params_type *params) { + int i, j; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15)); + assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } + } else if (params->r[0] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[0] * (flt0[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + } + } else if (params->r[1] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[1] * (flt1[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt1 += flt1_stride; + } + } else { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t e = (int32_t)(dat[j]) - src[j]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + } + + return err; +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], + const sgr_params_type *params) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + int i, j; + int64_t err = 0; + const int32_t half = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1); + if (params->r[0] > 0 && params->r[1] > 0) { + int xq0 = xq[0]; + int xq1 = xq[1]; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS); + int32_t v0 = flt0[j] - u; + int32_t v1 = flt1[j] - u; + int32_t v = half; + v += xq0 * v0; + v += xq1 * v1; + const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s; + err += ((int64_t)e * e); + } + dat += dat_stride; + flt0 += flt0_stride; + flt1 
+= flt1_stride; + src += src_stride; + } + } else if (params->r[0] > 0 || params->r[1] > 0) { + int exq; + int32_t *flt; + int flt_stride; + if (params->r[0] > 0) { + exq = xq[0]; + flt = flt0; + flt_stride = flt0_stride; + } else { + exq = xq[1]; + flt = flt1; + flt_stride = flt1_stride; + } + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS); + int32_t v = half; + v += exq * (flt[j] - u); + const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s; + err += ((int64_t)e * e); + } + dat += dat_stride; + flt += flt_stride; + src += src_stride; + } + } else { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t e = d - s; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + } + return err; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int use_highbitdepth, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int *xqd, + const sgr_params_type *params) { + int xq[2]; + av1_decode_xq(xqd, xq, params); + +#if CONFIG_AV1_HIGHBITDEPTH + if (use_highbitdepth) { + return av1_highbd_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, xq, params); + + } else { + return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, xq, params); + } +#else + (void)use_highbitdepth; + return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, xq, params); +#endif +} + +#define USE_SGRPROJ_REFINEMENT_SEARCH 1 +static int64_t finer_search_pixel_proj_error( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt0, + int flt0_stride, int32_t *flt1, int flt1_stride, int start_step, int *xqd, + const sgr_params_type *params) { + int64_t err = get_pixel_proj_error( + src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); + (void)start_step; +#if USE_SGRPROJ_REFINEMENT_SEARCH + int64_t err2; + int tap_min[] = { SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MIN1 }; + int tap_max[] = { SGRPROJ_PRJ_MAX0, SGRPROJ_PRJ_MAX1 }; + for (int s = start_step; s >= 1; s >>= 1) { + for (int p = 0; p < 2; ++p) { + if ((params->r[0] == 0 && p == 0) || (params->r[1] == 0 && p == 1)) { + continue; + } + int skip = 0; + do { + if (xqd[p] - s >= tap_min[p]) { + xqd[p] -= s; + err2 = + get_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); + if (err2 > err) { + xqd[p] += s; + } else { + err = err2; + skip = 1; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + if (skip) break; + do { + if (xqd[p] + s <= tap_max[p]) { + xqd[p] += s; + err2 = + get_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); + if (err2 > err) { + xqd[p] -= s; + } else { + err = err2; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } 
while (1); + } + } +#endif // USE_SGRPROJ_REFINEMENT_SEARCH + return err; +} + +static int64_t signed_rounded_divide(int64_t dividend, int64_t divisor) { + if (dividend < 0) + return (dividend - divisor / 2) / divisor; + else + return (dividend + divisor / 2) / divisor; +} + +static AOM_INLINE void calc_proj_params_r0_r1_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; + const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; + H[0][0] += (int64_t)f1 * f1; + H[1][1] += (int64_t)f2 * f2; + H[0][1] += (int64_t)f1 * f2; + C[0] += (int64_t)f1 * s; + C[1] += (int64_t)f2 * s; + } + } + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +static AOM_INLINE void calc_proj_params_r0_r1_high_bd_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; + const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; + H[0][0] += (int64_t)f1 * f1; + H[1][1] += (int64_t)f2 * f2; + H[0][1] += (int64_t)f1 * f2; + C[0] += (int64_t)f1 * s; + C[1] += (int64_t)f2 * s; + } + } + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +static AOM_INLINE void calc_proj_params_r0_c(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, int64_t H[2][2], + int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; + H[0][0] += (int64_t)f1 * f1; + C[0] += (int64_t)f1 * s; + } + } + H[0][0] /= size; + C[0] /= size; +} + +static AOM_INLINE void calc_proj_params_r0_high_bd_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) 
- u; + const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; + H[0][0] += (int64_t)f1 * f1; + C[0] += (int64_t)f1 * s; + } + } + H[0][0] /= size; + C[0] /= size; +} + +static AOM_INLINE void calc_proj_params_r1_c(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt1, + int flt1_stride, int64_t H[2][2], + int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; + H[1][1] += (int64_t)f2 * f2; + C[1] += (int64_t)f2 * s; + } + } + H[1][1] /= size; + C[1] /= size; +} + +static AOM_INLINE void calc_proj_params_r1_high_bd_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; + H[1][1] += (int64_t)f2 * f2; + C[1] += (int64_t)f2 * s; + } + } + H[1][1] /= size; + C[1] /= size; +} + +// The function calls 3 subfunctions for the following cases : +// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements +// of C and H need to be computed. +// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are +// non-zero and need to be computed. +// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are +// non-zero and need to be computed. 
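+// Editorial sketch of the least-squares setup (stated only in terms of the
+// definitions used in this file): with u = dat << SGRPROJ_RST_BITS,
+// s = (src << SGRPROJ_RST_BITS) - u and f_k = flt_k - u, the helpers above
+// accumulate the normalized normal equations
+//   H[k][l] = sum(f_k * f_l) / size,   C[k] = sum(f_k * s) / size,
+// and get_proj_subspace() below solves H * xq = C (scaled by
+// 1 << SGRPROJ_PRJ_BITS) for the projection coefficients xq.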
+void av1_calc_proj_params_c(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, int32_t *flt1, + int flt1_stride, int64_t H[2][2], int64_t C[2], + const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_c(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, flt1, flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_c(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_c(src8, width, height, src_stride, dat8, dat_stride, + flt1, flt1_stride, H, C); + } +} + +static AOM_INLINE void av1_calc_proj_params_high_bd_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], + const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_high_bd_c(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_high_bd_c(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_high_bd_c(src8, width, height, src_stride, dat8, + dat_stride, flt1, flt1_stride, H, C); + } +} + +static AOM_INLINE void get_proj_subspace(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int use_highbitdepth, int32_t *flt0, + int flt0_stride, int32_t *flt1, + int flt1_stride, int *xq, + const sgr_params_type *params) { + int64_t H[2][2] = { { 0, 0 }, { 0, 0 } }; + int64_t C[2] = { 0, 0 }; + + // Default values to be returned if the problem becomes ill-posed + xq[0] = 0; + xq[1] = 0; + + if (!use_highbitdepth) { + if ((width & 0x7) == 0) { + av1_calc_proj_params(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, flt1, flt1_stride, H, C, params); + } else { + av1_calc_proj_params_c(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, flt1, flt1_stride, H, C, + params); + } + } else { + av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C, params); + } + + if (params->r[0] == 0) { + // H matrix is now only the scalar H[1][1] + // C vector is now only the scalar C[1] + const int64_t Det = H[1][1]; + if (Det == 0) return; // ill-posed, return default values + xq[0] = 0; + xq[1] = (int)signed_rounded_divide(C[1] * (1 << SGRPROJ_PRJ_BITS), Det); + } else if (params->r[1] == 0) { + // H matrix is now only the scalar H[0][0] + // C vector is now only the scalar C[0] + const int64_t Det = H[0][0]; + if (Det == 0) return; // ill-posed, return default values + xq[0] = (int)signed_rounded_divide(C[0] * (1 << SGRPROJ_PRJ_BITS), Det); + xq[1] = 0; + } else { + const int64_t Det = H[0][0] * H[1][1] - H[0][1] * H[1][0]; + if (Det == 0) return; // ill-posed, return default values + + // If scaling up dividend would overflow, instead scale down the divisor + const int64_t div1 = H[1][1] * C[0] - H[0][1] * C[1]; + if ((div1 > 0 && INT64_MAX / (1 << SGRPROJ_PRJ_BITS) < div1) || + (div1 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div1)) + xq[0] = (int)signed_rounded_divide(div1, Det / (1 << SGRPROJ_PRJ_BITS)); + else + xq[0] = (int)signed_rounded_divide(div1 * (1 << 
SGRPROJ_PRJ_BITS), Det); + + const int64_t div2 = H[0][0] * C[1] - H[1][0] * C[0]; + if ((div2 > 0 && INT64_MAX / (1 << SGRPROJ_PRJ_BITS) < div2) || + (div2 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div2)) + xq[1] = (int)signed_rounded_divide(div2, Det / (1 << SGRPROJ_PRJ_BITS)); + else + xq[1] = (int)signed_rounded_divide(div2 * (1 << SGRPROJ_PRJ_BITS), Det); + } +} + +static AOM_INLINE void encode_xq(int *xq, int *xqd, + const sgr_params_type *params) { + if (params->r[0] == 0) { + xqd[0] = 0; + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xq[1], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } else if (params->r[1] == 0) { + xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } else { + xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } +} + +// Apply the self-guided filter across an entire restoration unit. +static AOM_INLINE void apply_sgr(int sgr_params_idx, const uint8_t *dat8, + int width, int height, int dat_stride, + int use_highbd, int bit_depth, int pu_width, + int pu_height, int32_t *flt0, int32_t *flt1, + int flt_stride) { + for (int i = 0; i < height; i += pu_height) { + const int h = AOMMIN(pu_height, height - i); + int32_t *flt0_row = flt0 + i * flt_stride; + int32_t *flt1_row = flt1 + i * flt_stride; + const uint8_t *dat8_row = dat8 + i * dat_stride; + + // Iterate over the stripe in blocks of width pu_width + for (int j = 0; j < width; j += pu_width) { + const int w = AOMMIN(pu_width, width - j); + const int ret = av1_selfguided_restoration( + dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j, + flt_stride, sgr_params_idx, bit_depth, use_highbd); + (void)ret; + assert(!ret); + } + } +} + +static AOM_INLINE void compute_sgrproj_err( + const uint8_t *dat8, const int width, const int height, + const int dat_stride, const uint8_t *src8, const int src_stride, + const int use_highbitdepth, const int bit_depth, const int pu_width, + const int pu_height, const int ep, int32_t *flt0, int32_t *flt1, + const int flt_stride, int *exqd, int64_t *err) { + int exq[2]; + apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth, + pu_width, pu_height, flt0, flt1, flt_stride); + aom_clear_system_state(); + const sgr_params_type *const params = &av1_sgr_params[ep]; + get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride, + use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq, + params); + aom_clear_system_state(); + encode_xq(exq, exqd, params); + *err = finer_search_pixel_proj_error( + src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0, + flt_stride, flt1, flt_stride, 2, exqd, params); +} + +static AOM_INLINE void get_best_error(int64_t *besterr, const int64_t err, + const int *exqd, int *bestxqd, + int *bestep, const int ep) { + if (*besterr == -1 || err < *besterr) { + *bestep = ep; + *besterr = err; + bestxqd[0] = exqd[0]; + bestxqd[1] = exqd[1]; + } +} + +static SgrprojInfo search_selfguided_restoration( + const uint8_t *dat8, int width, int height, int dat_stride, + const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth, + int pu_width, int pu_height, int32_t *rstbuf, int enable_sgr_ep_pruning) { + int32_t *flt0 = rstbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + int ep, idx, bestep = 0; + int64_t besterr = -1; + int exqd[2], bestxqd[2] = { 0, 0 }; + int flt_stride = ((width + 7) & ~7) + 
8; + assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) || + pu_width == RESTORATION_PROC_UNIT_SIZE); + assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) || + pu_height == RESTORATION_PROC_UNIT_SIZE); + if (!enable_sgr_ep_pruning) { + for (ep = 0; ep < SGRPROJ_PARAMS; ep++) { + int64_t err; + compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride, + use_highbitdepth, bit_depth, pu_width, pu_height, ep, + flt0, flt1, flt_stride, exqd, &err); + get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep); + } + } else { + // Evaluate the first four seed ep values in the first group + for (idx = 0; idx < SGRPROJ_EP_GRP1_SEARCH_COUNT; idx++) { + ep = sgproj_ep_grp1_seed[idx]; + int64_t err; + compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride, + use_highbitdepth, bit_depth, pu_width, pu_height, ep, + flt0, flt1, flt_stride, exqd, &err); + get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep); + } + // Evaluate the ep values to the left and right of the seed winner + int bestep_ref = bestep; + for (ep = bestep_ref - 1; ep < bestep_ref + 2; ep += 2) { + if (ep < SGRPROJ_EP_GRP1_START_IDX || ep > SGRPROJ_EP_GRP1_END_IDX) + continue; + int64_t err; + compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride, + use_highbitdepth, bit_depth, pu_width, pu_height, ep, + flt0, flt1, flt_stride, exqd, &err); + get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep); + } + // Evaluate the last two groups + for (idx = 0; idx < SGRPROJ_EP_GRP2_3_SEARCH_COUNT; idx++) { + ep = sgproj_ep_grp2_3[idx][bestep]; + int64_t err; + compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride, + use_highbitdepth, bit_depth, pu_width, pu_height, ep, + flt0, flt1, flt_stride, exqd, &err); + get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep); + } + } + + SgrprojInfo ret; + ret.ep = bestep; + ret.xqd[0] = bestxqd[0]; + ret.xqd[1] = bestxqd[1]; + return ret; +} + +static int count_sgrproj_bits(SgrprojInfo *sgrproj_info, + SgrprojInfo *ref_sgrproj_info) { + int bits = SGRPROJ_PARAMS_BITS; + const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep]; + if (params->r[0] > 0) + bits += aom_count_primitive_refsubexpfin( + SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + if (params->r[1] > 0) + bits += aom_count_primitive_refsubexpfin( + SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + return bits; +} + +static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits, + const AV1PixelRect *tile, + int rest_unit_idx, void *priv, + int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + (void)rlbs; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; + + const MACROBLOCK *const x = rsc->x; + const AV1_COMMON *const cm = rsc->cm; + const int highbd = cm->seq_params.use_highbitdepth; + const int bit_depth = cm->seq_params.bit_depth; + + const int64_t bits_none = x->sgrproj_restore_cost[0]; + // Prune evaluation of RESTORE_SGRPROJ if 'skip_sgr_eval' is set + if (rusi->skip_sgr_eval) { + rsc->bits += bits_none; + rsc->sse += rusi->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_SGRPROJ - 1] = RESTORE_NONE; + rusi->sse[RESTORE_SGRPROJ] = INT64_MAX; + return; + } + + uint8_t *dgd_start = + rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start; + const uint8_t *src_start = + rsc->src_buffer + limits->v_start *
rsc->src_stride + limits->h_start; + + const int is_uv = rsc->plane > 0; + const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int ss_y = is_uv && cm->seq_params.subsampling_y; + const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x; + const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; + + rusi->sgrproj = search_selfguided_restoration( + dgd_start, limits->h_end - limits->h_start, + limits->v_end - limits->v_start, rsc->dgd_stride, src_start, + rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height, + tmpbuf, rsc->sf->lpf_sf.enable_sgr_ep_pruning); + + RestorationUnitInfo rui; + rui.restoration_type = RESTORE_SGRPROJ; + rui.sgrproj_info = rusi->sgrproj; + + rusi->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, tile, &rui); + + const int64_t bits_sgr = x->sgrproj_restore_cost[1] + + (count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj) + << AV1_PROB_COST_SHIFT); + + double cost_none = + RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]); + double cost_sgr = + RDCOST_DBL(x->rdmult, bits_sgr >> 4, rusi->sse[RESTORE_SGRPROJ]); + if (rusi->sgrproj.ep < 10) + cost_sgr *= + (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->lpf_sf.dual_sgr_penalty_level); + + RestorationType rtype = + (cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE; + rusi->best_rtype[RESTORE_SGRPROJ - 1] = rtype; + + rsc->sse += rusi->sse[rtype]; + rsc->bits += (cost_sgr < cost_none) ? bits_sgr : bits_none; + if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj; +} + +void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src, + int h_start, int h_end, int v_start, int v_end, + int dgd_stride, int src_stride, int64_t *M, + int64_t *H) { + int i, j, k, l; + int16_t Y[WIENER_WIN2]; + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + memset(M, 0, sizeof(*M) * wiener_win2); + memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); + for (i = v_start; i < v_end; i++) { + for (j = h_start; j < h_end; j++) { + const int16_t X = (int16_t)src[i * src_stride + j] - (int16_t)avg; + int idx = 0; + for (k = -wiener_halfwin; k <= wiener_halfwin; k++) { + for (l = -wiener_halfwin; l <= wiener_halfwin; l++) { + Y[idx] = (int16_t)dgd[(i + l) * dgd_stride + (j + k)] - (int16_t)avg; + idx++; + } + } + assert(idx == wiener_win2); + for (k = 0; k < wiener_win2; ++k) { + M[k] += (int32_t)Y[k] * X; + for (l = k; l < wiener_win2; ++l) { + // H is a symmetric matrix, so we only need to fill out the upper + // triangle here. We can copy it down to the lower triangle outside + // the (i, j) loops. 
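+ // Size note (illustrative): for the luma window wiener_win = 7,
+ // wiener_win2 = 49, so H is a 49 x 49 matrix and this inner loop
+ // accumulates only the 49 * 50 / 2 = 1225 upper-triangle entries per
+ // pixel instead of all 2401.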
+ H[k * wiener_win2 + l] += (int32_t)Y[k] * Y[l]; + } + } + } + } + for (k = 0; k < wiener_win2; ++k) { + for (l = k + 1; l < wiener_win2; ++l) { + H[l * wiener_win2 + k] = H[k * wiener_win2 + l]; + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8, + const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { + int i, j, k, l; + int32_t Y[WIENER_WIN2]; + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + memset(M, 0, sizeof(*M) * wiener_win2); + memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); + for (i = v_start; i < v_end; i++) { + for (j = h_start; j < h_end; j++) { + const int32_t X = (int32_t)src[i * src_stride + j] - (int32_t)avg; + int idx = 0; + for (k = -wiener_halfwin; k <= wiener_halfwin; k++) { + for (l = -wiener_halfwin; l <= wiener_halfwin; l++) { + Y[idx] = (int32_t)dgd[(i + l) * dgd_stride + (j + k)] - (int32_t)avg; + idx++; + } + } + assert(idx == wiener_win2); + for (k = 0; k < wiener_win2; ++k) { + M[k] += (int64_t)Y[k] * X; + for (l = k; l < wiener_win2; ++l) { + // H is a symmetric matrix, so we only need to fill out the upper + // triangle here. We can copy it down to the lower triangle outside + // the (i, j) loops. + H[k * wiener_win2 + l] += (int64_t)Y[k] * Y[l]; + } + } + } + } + for (k = 0; k < wiener_win2; ++k) { + M[k] /= bit_depth_divider; + H[k * wiener_win2 + k] /= bit_depth_divider; + for (l = k + 1; l < wiener_win2; ++l) { + H[k * wiener_win2 + l] /= bit_depth_divider; + H[l * wiener_win2 + k] = H[k * wiener_win2 + l]; + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE int wrap_index(int i, int wiener_win) { + const int wiener_halfwin1 = (wiener_win >> 1) + 1; + return (i >= wiener_halfwin1 ? 
wiener_win - 1 - i : i); +} + +// Solve linear equations to find Wiener filter tap values +// Taps are output scaled by WIENER_FILT_STEP +static int linsolve_wiener(int n, int64_t *A, int stride, int64_t *b, + int32_t *x) { + for (int k = 0; k < n - 1; k++) { + // Partial pivoting: bring the row with the largest pivot to the top + for (int i = n - 1; i > k; i--) { + // If row i has a better (bigger) pivot than row (i-1), swap them + if (llabs(A[(i - 1) * stride + k]) < llabs(A[i * stride + k])) { + for (int j = 0; j < n; j++) { + const int64_t c = A[i * stride + j]; + A[i * stride + j] = A[(i - 1) * stride + j]; + A[(i - 1) * stride + j] = c; + } + const int64_t c = b[i]; + b[i] = b[i - 1]; + b[i - 1] = c; + } + } + // Forward elimination (convert A to row-echelon form) + for (int i = k; i < n - 1; i++) { + if (A[k * stride + k] == 0) return 0; + const int64_t c = A[(i + 1) * stride + k]; + const int64_t cd = A[k * stride + k]; + for (int j = 0; j < n; j++) { + A[(i + 1) * stride + j] -= c / 256 * A[k * stride + j] / cd * 256; + } + b[i + 1] -= c * b[k] / cd; + } + } + // Back-substitution + for (int i = n - 1; i >= 0; i--) { + if (A[i * stride + i] == 0) return 0; + int64_t c = 0; + for (int j = i + 1; j <= n - 1; j++) { + c += A[i * stride + j] * x[j] / WIENER_TAP_SCALE_FACTOR; + } + // Store filter taps x in scaled form. + x[i] = (int32_t)(WIENER_TAP_SCALE_FACTOR * (b[i] - c) / A[i * stride + i]); + } + + return 1; +} + +// Fix vector b, update vector a +static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc, + int64_t **Hc, int32_t *a, int32_t *b) { + int i, j; + int32_t S[WIENER_WIN]; + int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin1 = (wiener_win >> 1) + 1; + memset(A, 0, sizeof(A)); + memset(B, 0, sizeof(B)); + for (i = 0; i < wiener_win; i++) { + for (j = 0; j < wiener_win; ++j) { + const int jj = wrap_index(j, wiener_win); + A[jj] += Mc[i][j] * b[i] / WIENER_TAP_SCALE_FACTOR; + } + } + for (i = 0; i < wiener_win; i++) { + for (j = 0; j < wiener_win; j++) { + int k, l; + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + const int kk = wrap_index(k, wiener_win); + const int ll = wrap_index(l, wiener_win); + B[ll * wiener_halfwin1 + kk] += + Hc[j * wiener_win + i][k * wiener_win2 + l] * b[i] / + WIENER_TAP_SCALE_FACTOR * b[j] / WIENER_TAP_SCALE_FACTOR; + } + } + } + } + // Normalization enforcement in the system of equations itself + for (i = 0; i < wiener_halfwin1 - 1; ++i) { + A[i] -= + A[wiener_halfwin1 - 1] * 2 + + B[i * wiener_halfwin1 + wiener_halfwin1 - 1] - + 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)]; + } + for (i = 0; i < wiener_halfwin1 - 1; ++i) { + for (j = 0; j < wiener_halfwin1 - 1; ++j) { + B[i * wiener_halfwin1 + j] -= + 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] + + B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] - + 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + + (wiener_halfwin1 - 1)]); + } + } + if (linsolve_wiener(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) { + S[wiener_halfwin1 - 1] = WIENER_TAP_SCALE_FACTOR; + for (i = wiener_halfwin1; i < wiener_win; ++i) { + S[i] = S[wiener_win - 1 - i]; + S[wiener_halfwin1 - 1] -= 2 * S[i]; + } + memcpy(a, S, wiener_win * sizeof(*a)); + } +} + +// Fix vector a, update vector b +static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc, + int64_t **Hc, int32_t *a, int32_t *b) { + int i, j; + int32_t S[WIENER_WIN]; + int64_t 
A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin1 = (wiener_win >> 1) + 1; + memset(A, 0, sizeof(A)); + memset(B, 0, sizeof(B)); + for (i = 0; i < wiener_win; i++) { + const int ii = wrap_index(i, wiener_win); + for (j = 0; j < wiener_win; j++) { + A[ii] += Mc[i][j] * a[j] / WIENER_TAP_SCALE_FACTOR; + } + } + + for (i = 0; i < wiener_win; i++) { + for (j = 0; j < wiener_win; j++) { + const int ii = wrap_index(i, wiener_win); + const int jj = wrap_index(j, wiener_win); + int k, l; + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + B[jj * wiener_halfwin1 + ii] += + Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] / + WIENER_TAP_SCALE_FACTOR * a[l] / WIENER_TAP_SCALE_FACTOR; + } + } + } + } + // Normalization enforcement in the system of equations itself + for (i = 0; i < wiener_halfwin1 - 1; ++i) { + A[i] -= + A[wiener_halfwin1 - 1] * 2 + + B[i * wiener_halfwin1 + wiener_halfwin1 - 1] - + 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)]; + } + for (i = 0; i < wiener_halfwin1 - 1; ++i) { + for (j = 0; j < wiener_halfwin1 - 1; ++j) { + B[i * wiener_halfwin1 + j] -= + 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] + + B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] - + 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + + (wiener_halfwin1 - 1)]); + } + } + if (linsolve_wiener(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) { + S[wiener_halfwin1 - 1] = WIENER_TAP_SCALE_FACTOR; + for (i = wiener_halfwin1; i < wiener_win; ++i) { + S[i] = S[wiener_win - 1 - i]; + S[wiener_halfwin1 - 1] -= 2 * S[i]; + } + memcpy(b, S, wiener_win * sizeof(*b)); + } +} + +static int wiener_decompose_sep_sym(int wiener_win, int64_t *M, int64_t *H, + int32_t *a, int32_t *b) { + static const int32_t init_filt[WIENER_WIN] = { + WIENER_FILT_TAP0_MIDV, WIENER_FILT_TAP1_MIDV, WIENER_FILT_TAP2_MIDV, + WIENER_FILT_TAP3_MIDV, WIENER_FILT_TAP2_MIDV, WIENER_FILT_TAP1_MIDV, + WIENER_FILT_TAP0_MIDV, + }; + int64_t *Hc[WIENER_WIN2]; + int64_t *Mc[WIENER_WIN]; + int i, j, iter; + const int plane_off = (WIENER_WIN - wiener_win) >> 1; + const int wiener_win2 = wiener_win * wiener_win; + for (i = 0; i < wiener_win; i++) { + a[i] = b[i] = + WIENER_TAP_SCALE_FACTOR / WIENER_FILT_STEP * init_filt[i + plane_off]; + } + for (i = 0; i < wiener_win; i++) { + Mc[i] = M + i * wiener_win; + for (j = 0; j < wiener_win; j++) { + Hc[i * wiener_win + j] = + H + i * wiener_win * wiener_win2 + j * wiener_win; + } + } + + iter = 1; + while (iter < NUM_WIENER_ITERS) { + update_a_sep_sym(wiener_win, Mc, Hc, a, b); + update_b_sep_sym(wiener_win, Mc, Hc, a, b); + iter++; + } + return 1; +} + +// Computes the function x'*H*x - x'*M for the learned 2D filter x, and compares +// against identity filters; Final score is defined as the difference between +// the function values +static int64_t compute_score(int wiener_win, int64_t *M, int64_t *H, + InterpKernel vfilt, InterpKernel hfilt) { + int32_t ab[WIENER_WIN * WIENER_WIN]; + int16_t a[WIENER_WIN], b[WIENER_WIN]; + int64_t P = 0, Q = 0; + int64_t iP = 0, iQ = 0; + int64_t Score, iScore; + int i, k, l; + const int plane_off = (WIENER_WIN - wiener_win) >> 1; + const int wiener_win2 = wiener_win * wiener_win; + + aom_clear_system_state(); + + a[WIENER_HALFWIN] = b[WIENER_HALFWIN] = WIENER_FILT_STEP; + for (i = 0; i < WIENER_HALFWIN; ++i) { + a[i] = a[WIENER_WIN - i - 1] = vfilt[i]; + b[i] = b[WIENER_WIN - i - 1] = hfilt[i]; + a[WIENER_HALFWIN] -= 2 * a[i]; + b[WIENER_HALFWIN] 
-= 2 * b[i]; + } + memset(ab, 0, sizeof(ab)); + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) + ab[k * wiener_win + l] = a[l + plane_off] * b[k + plane_off]; + } + for (k = 0; k < wiener_win2; ++k) { + P += ab[k] * M[k] / WIENER_FILT_STEP / WIENER_FILT_STEP; + for (l = 0; l < wiener_win2; ++l) { + Q += ab[k] * H[k * wiener_win2 + l] * ab[l] / WIENER_FILT_STEP / + WIENER_FILT_STEP / WIENER_FILT_STEP / WIENER_FILT_STEP; + } + } + Score = Q - 2 * P; + + iP = M[wiener_win2 >> 1]; + iQ = H[(wiener_win2 >> 1) * wiener_win2 + (wiener_win2 >> 1)]; + iScore = iQ - 2 * iP; + + return Score - iScore; +} + +static AOM_INLINE void finalize_sym_filter(int wiener_win, int32_t *f, + InterpKernel fi) { + int i; + const int wiener_halfwin = (wiener_win >> 1); + + for (i = 0; i < wiener_halfwin; ++i) { + const int64_t dividend = f[i] * WIENER_FILT_STEP; + const int64_t divisor = WIENER_TAP_SCALE_FACTOR; + // Perform this division with proper rounding rather than truncation + if (dividend < 0) { + fi[i] = (int16_t)((dividend - (divisor / 2)) / divisor); + } else { + fi[i] = (int16_t)((dividend + (divisor / 2)) / divisor); + } + } + // Specialize for 7-tap filter + if (wiener_win == WIENER_WIN) { + fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV); + fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV); + fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV); + } else { + fi[2] = CLIP(fi[1], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV); + fi[1] = CLIP(fi[0], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV); + fi[0] = 0; + } + // Satisfy filter constraints + fi[WIENER_WIN - 1] = fi[0]; + fi[WIENER_WIN - 2] = fi[1]; + fi[WIENER_WIN - 3] = fi[2]; + // The central element has an implicit +WIENER_FILT_STEP + fi[3] = -2 * (fi[0] + fi[1] + fi[2]); +} + +static int count_wiener_bits(int wiener_win, WienerInfo *wiener_info, + WienerInfo *ref_wiener_info) { + int bits = 0; + if (wiener_win == WIENER_WIN) + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV); + if (wiener_win == WIENER_WIN) + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV); + return bits; +} + +#define USE_WIENER_REFINEMENT_SEARCH 1 +static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc, + const 
RestorationTileLimits *limits, + const AV1PixelRect *tile, + RestorationUnitInfo *rui, + int wiener_win) { + const int plane_off = (WIENER_WIN - wiener_win) >> 1; + int64_t err = try_restoration_unit(rsc, limits, tile, rui); +#if USE_WIENER_REFINEMENT_SEARCH + int64_t err2; + int tap_min[] = { WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV, + WIENER_FILT_TAP2_MINV }; + int tap_max[] = { WIENER_FILT_TAP0_MAXV, WIENER_FILT_TAP1_MAXV, + WIENER_FILT_TAP2_MAXV }; + + WienerInfo *plane_wiener = &rui->wiener_info; + + // printf("err pre = %"PRId64"\n", err); + const int start_step = 4; + for (int s = start_step; s >= 1; s >>= 1) { + for (int p = plane_off; p < WIENER_HALFWIN; ++p) { + int skip = 0; + do { + if (plane_wiener->hfilter[p] - s >= tap_min[p]) { + plane_wiener->hfilter[p] -= s; + plane_wiener->hfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s; + err2 = try_restoration_unit(rsc, limits, tile, rui); + if (err2 > err) { + plane_wiener->hfilter[p] += s; + plane_wiener->hfilter[WIENER_WIN - p - 1] += s; + plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s; + } else { + err = err2; + skip = 1; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + if (skip) break; + do { + if (plane_wiener->hfilter[p] + s <= tap_max[p]) { + plane_wiener->hfilter[p] += s; + plane_wiener->hfilter[WIENER_WIN - p - 1] += s; + plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s; + err2 = try_restoration_unit(rsc, limits, tile, rui); + if (err2 > err) { + plane_wiener->hfilter[p] -= s; + plane_wiener->hfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s; + } else { + err = err2; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + } + for (int p = plane_off; p < WIENER_HALFWIN; ++p) { + int skip = 0; + do { + if (plane_wiener->vfilter[p] - s >= tap_min[p]) { + plane_wiener->vfilter[p] -= s; + plane_wiener->vfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s; + err2 = try_restoration_unit(rsc, limits, tile, rui); + if (err2 > err) { + plane_wiener->vfilter[p] += s; + plane_wiener->vfilter[WIENER_WIN - p - 1] += s; + plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s; + } else { + err = err2; + skip = 1; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + if (skip) break; + do { + if (plane_wiener->vfilter[p] + s <= tap_max[p]) { + plane_wiener->vfilter[p] += s; + plane_wiener->vfilter[WIENER_WIN - p - 1] += s; + plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s; + err2 = try_restoration_unit(rsc, limits, tile, rui); + if (err2 > err) { + plane_wiener->vfilter[p] -= s; + plane_wiener->vfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s; + } else { + err = err2; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + } + } + // printf("err post = %"PRId64"\n", err); +#endif // USE_WIENER_REFINEMENT_SEARCH + return err; +} + +static AOM_INLINE void search_wiener(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, + int rest_unit_idx, void *priv, + int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + (void)tmpbuf; + (void)rlbs; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; + + const MACROBLOCK *const x = rsc->x; + const 
int64_t bits_none = x->wiener_restore_cost[0]; + + // Skip Wiener search for low-variance content. + if (rsc->sf->lpf_sf.prune_wiener_based_on_src_var) { + const int scale[3] = { 0, 1, 2 }; + // Obtain the normalized Qscale + const int qs = av1_dc_quant_QTX(rsc->cm->quant_params.base_qindex, 0, + rsc->cm->seq_params.bit_depth) >> + 3; + // Derive the threshold as sqr(normalized Qscale) * scale / 16. + const uint64_t thresh = + (qs * qs * scale[rsc->sf->lpf_sf.prune_wiener_based_on_src_var]) >> 4; + const int highbd = rsc->cm->seq_params.use_highbitdepth; + const uint64_t src_var = + var_restoration_unit(limits, rsc->src, rsc->plane, highbd); + // Do not perform Wiener search if source variance is lower than threshold + // or if the reconstruction error is zero + int prune_wiener = (src_var < thresh) || (rusi->sse[RESTORE_NONE] == 0); + if (prune_wiener) { + rsc->bits += bits_none; + rsc->sse += rusi->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE; + rusi->sse[RESTORE_WIENER] = INT64_MAX; + if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) + rusi->skip_sgr_eval = 1; + return; + } + } + + const int wiener_win = + (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA; + + int reduced_wiener_win = wiener_win; + if (rsc->sf->lpf_sf.reduce_wiener_window_size) { + reduced_wiener_win = + (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN_REDUCED : WIENER_WIN_CHROMA; + } + + int64_t M[WIENER_WIN2]; + int64_t H[WIENER_WIN2 * WIENER_WIN2]; + int32_t vfilter[WIENER_WIN], hfilter[WIENER_WIN]; + +#if CONFIG_AV1_HIGHBITDEPTH + const AV1_COMMON *const cm = rsc->cm; + if (cm->seq_params.use_highbitdepth) { + av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer, + rsc->src_buffer, limits->h_start, limits->h_end, + limits->v_start, limits->v_end, rsc->dgd_stride, + rsc->src_stride, M, H, cm->seq_params.bit_depth); + } else { + av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, + limits->h_start, limits->h_end, limits->v_start, + limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H); + } +#else + av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, + limits->h_start, limits->h_end, limits->v_start, + limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H); +#endif + + if (!wiener_decompose_sep_sym(reduced_wiener_win, M, H, vfilter, hfilter)) { + rsc->bits += bits_none; + rsc->sse += rusi->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE; + rusi->sse[RESTORE_WIENER] = INT64_MAX; + if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1; + return; + } + + RestorationUnitInfo rui; + memset(&rui, 0, sizeof(rui)); + rui.restoration_type = RESTORE_WIENER; + finalize_sym_filter(reduced_wiener_win, vfilter, rui.wiener_info.vfilter); + finalize_sym_filter(reduced_wiener_win, hfilter, rui.wiener_info.hfilter); + + // Filter score computes the value of the function x'*A*x - x'*b for the + // learned filter and compares it against the identity filter.
If there is no + // reduction in the function, the filter is reverted back to identity + if (compute_score(reduced_wiener_win, M, H, rui.wiener_info.vfilter, + rui.wiener_info.hfilter) > 0) { + rsc->bits += bits_none; + rsc->sse += rusi->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE; + rusi->sse[RESTORE_WIENER] = INT64_MAX; + if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1; + return; + } + + aom_clear_system_state(); + + rusi->sse[RESTORE_WIENER] = finer_tile_search_wiener( + rsc, limits, tile_rect, &rui, reduced_wiener_win); + rusi->wiener = rui.wiener_info; + + if (reduced_wiener_win != WIENER_WIN) { + assert(rui.wiener_info.vfilter[0] == 0 && + rui.wiener_info.vfilter[WIENER_WIN - 1] == 0); + assert(rui.wiener_info.hfilter[0] == 0 && + rui.wiener_info.hfilter[WIENER_WIN - 1] == 0); + } + + const int64_t bits_wiener = + x->wiener_restore_cost[1] + + (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener) + << AV1_PROB_COST_SHIFT); + + double cost_none = + RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]); + double cost_wiener = + RDCOST_DBL(x->rdmult, bits_wiener >> 4, rusi->sse[RESTORE_WIENER]); + + RestorationType rtype = + (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE; + rusi->best_rtype[RESTORE_WIENER - 1] = rtype; + + // Set 'skip_sgr_eval' based on rdcost ratio of RESTORE_WIENER and + // RESTORE_NONE or based on best_rtype + if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 1) { + rusi->skip_sgr_eval = cost_wiener > (1.01 * cost_none); + } else if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) { + rusi->skip_sgr_eval = rusi->best_rtype[RESTORE_WIENER - 1] == RESTORE_NONE; + } + + rsc->sse += rusi->sse[rtype]; + rsc->bits += (cost_wiener < cost_none) ? bits_wiener : bits_none; + if (cost_wiener < cost_none) rsc->wiener = rusi->wiener; +} + +static AOM_INLINE void search_norestore(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, + int rest_unit_idx, void *priv, + int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + (void)tile_rect; + (void)tmpbuf; + (void)rlbs; + + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; + + const int highbd = rsc->cm->seq_params.use_highbitdepth; + rusi->sse[RESTORE_NONE] = sse_restoration_unit( + limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd); + + rsc->sse += rusi->sse[RESTORE_NONE]; +} + +static AOM_INLINE void search_switchable(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, + int rest_unit_idx, void *priv, + int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + (void)limits; + (void)tile_rect; + (void)tmpbuf; + (void)rlbs; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; + + const MACROBLOCK *const x = rsc->x; + + const int wiener_win = + (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA; + + double best_cost = 0; + int64_t best_bits = 0; + RestorationType best_rtype = RESTORE_NONE; + + for (RestorationType r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) { + // Check for the condition that wiener or sgrproj search could not + // find a solution or the solution was worse than RESTORE_NONE. + // In either case the best_rtype will be set as RESTORE_NONE. These + // should be skipped from the test below. 
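+ // (Note: rusi->best_rtype[r - 1] == RESTORE_NONE means the per-type search + // already preferred RESTORE_NONE for this unit, so that candidate is fully + // represented by the r == RESTORE_NONE iteration of this loop.)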
+ if (r > RESTORE_NONE) { + if (rusi->best_rtype[r - 1] == RESTORE_NONE) continue; + } + + const int64_t sse = rusi->sse[r]; + int64_t coeff_pcost = 0; + switch (r) { + case RESTORE_NONE: coeff_pcost = 0; break; + case RESTORE_WIENER: + coeff_pcost = + count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener); + break; + case RESTORE_SGRPROJ: + coeff_pcost = count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj); + break; + default: assert(0); break; + } + const int64_t coeff_bits = coeff_pcost << AV1_PROB_COST_SHIFT; + const int64_t bits = x->switchable_restore_cost[r] + coeff_bits; + double cost = RDCOST_DBL(x->rdmult, bits >> 4, sse); + if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10) + cost *= + (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->lpf_sf.dual_sgr_penalty_level); + if (r == 0 || cost < best_cost) { + best_cost = cost; + best_bits = bits; + best_rtype = r; + } + } + + rusi->best_rtype[RESTORE_SWITCHABLE - 1] = best_rtype; + + rsc->sse += rusi->sse[best_rtype]; + rsc->bits += best_bits; + if (best_rtype == RESTORE_WIENER) rsc->wiener = rusi->wiener; + if (best_rtype == RESTORE_SGRPROJ) rsc->sgrproj = rusi->sgrproj; +} + +static AOM_INLINE void copy_unit_info(RestorationType frame_rtype, + const RestUnitSearchInfo *rusi, + RestorationUnitInfo *rui) { + assert(frame_rtype > 0); + rui->restoration_type = rusi->best_rtype[frame_rtype - 1]; + if (rui->restoration_type == RESTORE_WIENER) + rui->wiener_info = rusi->wiener; + else + rui->sgrproj_info = rusi->sgrproj; +} + +static double search_rest_type(RestSearchCtxt *rsc, RestorationType rtype) { + static const rest_unit_visitor_t funs[RESTORE_TYPES] = { + search_norestore, search_wiener, search_sgrproj, search_switchable + }; + + reset_rsc(rsc); + rsc_on_tile(rsc); + + av1_foreach_rest_unit_in_plane(rsc->cm, rsc->plane, funs[rtype], rsc, + &rsc->tile_rect, rsc->cm->rst_tmpbuf, NULL); + return RDCOST_DBL(rsc->x->rdmult, rsc->bits >> 4, rsc->sse); +} + +static int rest_tiles_in_plane(const AV1_COMMON *cm, int plane) { + const RestorationInfo *rsi = &cm->rst_info[plane]; + return rsi->units_per_tile; +} + +void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + assert(!cm->features.all_lossless); + + int ntiles[2]; + for (int is_uv = 0; is_uv < 2; ++is_uv) + ntiles[is_uv] = rest_tiles_in_plane(cm, is_uv); + + assert(ntiles[1] <= ntiles[0]); + RestUnitSearchInfo *rusi = + (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * ntiles[0]); + + // If the restoration unit dimensions are not multiples of + // rsi->restoration_unit_size then some elements of the rusi array may be + // left uninitialised when we reach copy_unit_info(...). This is not a + // problem, as these elements are ignored later, but in order to quiet + // Valgrind's warnings we initialise the array below. + memset(rusi, 0, sizeof(*rusi) * ntiles[0]); + cpi->td.mb.rdmult = cpi->rd.RDMULT; + + RestSearchCtxt rsc; + const int plane_start = AOM_PLANE_Y; + const int plane_end = num_planes > 1 ? AOM_PLANE_V : AOM_PLANE_Y; + for (int plane = plane_start; plane <= plane_end; ++plane) { + init_rsc(src, &cpi->common, &cpi->td.mb, &cpi->sf, plane, rusi, + &cpi->trial_frame_rst, &rsc); + + const int plane_ntiles = ntiles[plane > 0]; + const RestorationType num_rtypes = + (plane_ntiles > 1) ? 
RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES; + + double best_cost = 0; + RestorationType best_rtype = RESTORE_NONE; + + const int highbd = rsc.cm->seq_params.use_highbitdepth; + if (!cpi->sf.lpf_sf.disable_loop_restoration_chroma || !plane) { + av1_extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height, + rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER, + highbd); + + for (RestorationType r = 0; r < num_rtypes; ++r) { + if ((force_restore_type != RESTORE_TYPES) && (r != RESTORE_NONE) && + (r != force_restore_type)) + continue; + + double cost = search_rest_type(&rsc, r); + + if (r == 0 || cost < best_cost) { + best_cost = cost; + best_rtype = r; + } + } + } + + cm->rst_info[plane].frame_restoration_type = best_rtype; + if (force_restore_type != RESTORE_TYPES) + assert(best_rtype == force_restore_type || best_rtype == RESTORE_NONE); + + if (best_rtype != RESTORE_NONE) { + for (int u = 0; u < plane_ntiles; ++u) { + copy_unit_info(best_rtype, &rusi[u], &cm->rst_info[plane].unit_info[u]); + } + } + } + + aom_free(rusi); +} diff --git a/libs/libaom/src/av1/encoder/pickrst.h b/libs/libaom/src/av1/encoder/pickrst.h new file mode 100644 index 000000000..eee30553d --- /dev/null +++ b/libs/libaom/src/av1/encoder/pickrst.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_ENCODER_PICKRST_H_ +#define AOM_AV1_ENCODER_PICKRST_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/encoder.h" +#include "aom_ports/system_state.h" + +struct yv12_buffer_config; +struct AV1_COMP; + +static const uint8_t g_shuffle_stats_data[16] = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, +}; + +static const uint8_t g_shuffle_stats_highbd_data[32] = { + 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, + 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, +}; + +static INLINE uint8_t find_average(const uint8_t *src, int h_start, int h_end, + int v_start, int v_end, int stride) { + uint64_t sum = 0; + for (int i = v_start; i < v_end; i++) { + for (int j = h_start; j < h_end; j++) { + sum += src[i * stride + j]; + } + } + uint64_t avg = sum / ((v_end - v_start) * (h_end - h_start)); + return (uint8_t)avg; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE uint16_t find_average_highbd(const uint16_t *src, int h_start, + int h_end, int v_start, int v_end, + int stride) { + uint64_t sum = 0; + for (int i = v_start; i < v_end; i++) { + for (int j = h_start; j < h_end; j++) { + sum += src[i * stride + j]; + } + } + uint64_t avg = sum / ((v_end - v_start) * (h_end - h_start)); + return (uint16_t)avg; +} +#endif + +void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PICKRST_H_ diff --git a/libs/libaom/src/av1/encoder/pustats.h b/libs/libaom/src/av1/encoder/pustats.h new file mode 100644 index 000000000..2e8710108 --- /dev/null +++ b/libs/libaom/src/av1/encoder/pustats.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PUSTATS_H_ +#define AOM_AV1_ENCODER_PUSTATS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +#define NUM_FEATURES_PUSTATS 8 +#define NUM_HIDDEN_LAYERS 2 +#define HIDDEN_LAYERS_0_NODES 12 +#define HIDDEN_LAYERS_1_NODES 10 +#define LOGITS_NODES 1 + +static const float + av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS * + HIDDEN_LAYERS_0_NODES] = { + -0.1758f, -0.0499f, -10.0069f, -2.2838f, -0.3359f, 0.3459f, -0.3285f, + -0.0515f, -0.5417f, 0.2357f, -0.0575f, -69.0782f, 0.5348f, 1.4068f, + 0.2213f, -1.0490f, -0.0636f, 0.1654f, 1.1002f, 33.4924f, 0.4358f, + 1.2499f, 0.1143f, 0.0592f, -1.6335f, -0.0092f, 1.2207f, -28.4543f, + -0.4973f, 0.4368f, 0.2341f, -0.1623f, -3.8986f, 0.1311f, -1.8789f, + -3.9079f, -0.8158f, -0.8420f, 1.4295f, -2.3629f, -1.4825f, 0.6498f, + -5.3669f, 6.4434f, 1.8393f, -35.0678f, 3.7459f, -2.8504f, 2.0502f, + -0.1812f, -3.9011f, -1.0155f, 1.8375f, -1.4517f, 1.3917f, 3.8664f, + 0.8345f, -0.3472f, 5.7740f, -1.1196f, -0.3264f, -1.2481f, -0.9284f, + -4.9657f, 2.2831f, 0.7337f, 2.3176f, 0.6416f, 0.8804f, 1.9988f, + -1.3426f, 1.2728f, 1.2249f, -0.1551f, 5.6045f, 0.2046f, -2.1464f, + -2.4922f, -0.5334f, 12.1055f, 7.2467f, -0.0070f, 0.0234f, 0.0021f, + 0.0215f, -0.0098f, -0.0682f, -6.1494f, -0.3176f, -1.6069f, -0.2119f, + -1.0533f, -0.3566f, 0.5294f, -0.4335f, 0.1626f, + }; + +static const float + av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = { + 10.5266f, 5.3268f, -1.0678f, 7.7411f, 8.7164f, -0.3235f, + 7.3028f, 9.0874f, -6.4594f, -1.0102f, -1.1146f, 10.8419f, + }; + +static const float + av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * + HIDDEN_LAYERS_1_NODES] = { + 10.5932f, 2.5192f, -0.0015f, 5.9479f, 5.2426f, -0.4091f, 5.3220f, + 6.0469f, 0.7200f, 3.3241f, 5.5006f, 12.8290f, -1.6396f, 0.5743f, + -0.8370f, 1.9956f, -4.9270f, -1.5295f, 2.1350f, -9.4415f, -0.7094f, + 5.1822f, 19.7287f, -3.0444f, -0.3320f, 0.0031f, -0.2709f, -0.5249f, + 0.3281f, -0.2240f, 0.2225f, -0.2386f, -0.4370f, -0.2438f, -0.4928f, + -0.2842f, -2.1772f, 9.2570f, -17.6655f, 3.5448f, -2.8394f, -1.0167f, + -0.5115f, -1.9260f, -0.2111f, -0.7528f, -1.2387f, -0.0401f, 5.0716f, + -3.3763f, -0.2898f, -0.4956f, -7.9993f, 0.1526f, -0.0242f, 0.7354f, + 6.0432f, 4.8043f, 7.4790f, -0.6295f, 1.7565f, 3.7197f, -2.3963f, + 6.8945f, 2.9717f, -3.1623f, 3.4241f, 4.4676f, -1.8154f, -2.9401f, + -8.5657f, -3.0240f, -1.4661f, 8.1145f, -12.7858f, 3.3624f, -1.0819f, + -4.2856f, 1.1801f, -0.5587f, -1.6062f, -1.1813f, -3.5882f, -0.2490f, + -24.9566f, -0.4140f, -0.1113f, 3.5537f, 4.4112f, 0.1367f, -1.5876f, + 1.6605f, 1.3903f, -0.0253f, -2.1419f, -2.2197f, -0.7659f, -0.4249f, + -0.0424f, 0.1486f, 0.4643f, -0.9068f, -0.3619f, -0.7624f, -0.9132f, + -0.4947f, -0.3527f, -0.5445f, -0.4768f, -1.7761f, -1.0686f, 0.5462f, + 1.3371f, 4.3116f, 0.0777f, -2.7216f, -1.8908f, 3.4989f, 7.7269f, + -2.7566f, + }; + +static const float + av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = { + 13.2435f, -8.5477f, -0.0998f, -1.5131f, -12.0187f, + 6.1715f, 0.5094f, 7.6433f, 
-0.3992f, -1.3555f, + }; + +static const float + av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { + 4.3078f, -17.3497f, 0.0195f, 34.6032f, -5.0127f, + 5.3079f, 10.0077f, -13.129f, 0.0087f, -8.4009f, + }; + +static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = { + 4.5103f, +}; + +static const NN_CONFIG av1_pustats_rate_nnconfig = { + NUM_FEATURES_PUSTATS, // num_inputs + LOGITS_NODES, // num_outputs + NUM_HIDDEN_LAYERS, // num_hidden_layers + { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes + { + av1_pustats_rate_hiddenlayer_0_kernel, + av1_pustats_rate_hiddenlayer_1_kernel, + av1_pustats_rate_logits_kernel, + }, + { + av1_pustats_rate_hiddenlayer_0_bias, + av1_pustats_rate_hiddenlayer_1_bias, + av1_pustats_rate_logits_bias, + }, +}; + +static const float + av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS * + HIDDEN_LAYERS_0_NODES] = { + -0.2560f, 0.1105f, -0.8434f, -0.0132f, -8.9371f, -1.1176f, -0.3655f, + 0.4885f, 1.7518f, 0.4985f, 0.5582f, -0.3739f, 0.9403f, 0.3874f, + 0.3265f, 1.7383f, 3.1747f, 0.0285f, 3.3942f, -0.0123f, 0.5057f, + 0.1584f, 0.2697f, 4.6151f, 3.6251f, -0.0121f, -1.0047f, -0.0037f, + 0.0127f, 0.1935f, -0.5277f, -2.7144f, 0.0729f, -0.1457f, -0.0816f, + -0.5462f, 0.4738f, 0.3599f, -0.0564f, 0.0910f, 0.0126f, -0.0310f, + -2.1311f, -0.4666f, -0.0074f, -0.0765f, 0.0287f, -0.2662f, -0.0999f, + -0.2983f, -0.4899f, -0.2314f, 0.2873f, -0.3614f, 0.1783f, -0.1210f, + 0.3569f, 0.5436f, -8.0536f, -0.0044f, -1.5255f, -0.8247f, -0.4556f, + 1.9045f, 0.5463f, 0.1102f, -0.9293f, -0.0185f, -0.8302f, -0.4378f, + -0.3531f, -1.3095f, 0.6099f, 0.7977f, 4.1950f, -0.0067f, -0.2762f, + -0.1574f, -0.2149f, 0.6104f, -1.7053f, 0.1904f, 4.2402f, -0.2671f, + 0.8940f, 0.6820f, 0.2241f, -0.9459f, 1.4571f, 0.5255f, 2.3352f, + -0.0806f, 0.5231f, 0.3928f, 0.4146f, 2.0956f, + }; + +static const float + av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = { + 1.1597f, 0.0836f, -0.7471f, -0.2439f, -0.0438f, 2.4626f, + 0.f, 1.1485f, 2.7085f, -4.7897f, 1.4093f, -1.657f, + }; + +static const float + av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * + HIDDEN_LAYERS_1_NODES] = { + -0.5203f, -1.3468f, 0.3865f, -0.6859f, 0.0058f, 4.0682f, 0.4807f, + -0.1380f, 0.6050f, 0.8958f, 0.7748f, -0.1311f, 1.7317f, 1.1265f, + 0.0827f, 0.1407f, -0.3605f, 0.5429f, 0.1880f, -0.1439f, 0.2837f, + 1.6477f, 0.0832f, 0.0593f, -1.8464f, -0.7241f, -1.0672f, -0.3546f, + -0.3842f, -2.3637f, 0.2514f, 0.8263f, -0.1872f, 0.5774f, -0.3610f, + -0.0205f, 1.3977f, -0.1083f, 0.6923f, 1.3039f, -0.2870f, 1.0622f, + -0.0566f, 0.2697f, -0.5429f, -0.6193f, 1.7559f, 0.3246f, 1.9159f, + 0.3744f, 0.0686f, 1.0191f, -0.4212f, 1.9591f, -0.0691f, -0.1085f, + -1.2034f, 0.0606f, 1.0116f, 0.5565f, -0.1874f, -0.7898f, 0.4796f, + 0.2290f, 0.4334f, -0.5817f, -0.2949f, 0.1367f, -0.2932f, -1.1265f, + 0.0133f, -0.5309f, -3.3191f, 0.0939f, 0.3895f, -2.5812f, -0.0066f, + -3.0063f, -0.2982f, 0.7309f, -0.2422f, -0.2770f, -0.7152f, 0.1700f, + 1.9630f, 0.1988f, 0.4194f, 0.8762f, 0.3402f, 0.1051f, -0.1598f, + 0.2405f, 0.0392f, 1.1256f, 1.5245f, 0.0950f, 0.2160f, -0.5023f, + 0.2584f, 0.2074f, 0.2218f, 0.3966f, -0.0921f, -0.2435f, -0.4560f, + -1.1923f, -0.3716f, -0.3286f, -1.3225f, 0.1896f, -0.3342f, -0.7888f, + -0.4488f, -1.7168f, 0.3341f, 0.1146f, 0.5226f, 0.2610f, -0.4574f, + -0.4164f, + }; + +static const float + av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = { + -2.3014f, -2.4292f, 1.3317f, -3.2361f, -1.918f, + 2.7149f, -2.5649f, 2.7765f, 2.9617f, 2.7684f, + }; + +static 
const float + av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { + -0.6868f, -0.6715f, 0.449f, -1.293f, 0.6214f, + 0.9894f, -0.4342f, 0.7002f, 1.4363f, 0.6951f, + }; + +static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = { + 2.3371f, +}; + +static const NN_CONFIG av1_pustats_dist_nnconfig = { + NUM_FEATURES_PUSTATS, // num_inputs + LOGITS_NODES, // num_outputs + NUM_HIDDEN_LAYERS, // num_hidden_layers + { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes + { + av1_pustats_dist_hiddenlayer_0_kernel, + av1_pustats_dist_hiddenlayer_1_kernel, + av1_pustats_dist_logits_kernel, + }, + { + av1_pustats_dist_hiddenlayer_0_bias, + av1_pustats_dist_hiddenlayer_1_bias, + av1_pustats_dist_logits_bias, + }, +}; + +#undef NUM_HIDDEN_LAYERS +#undef HIDDEN_LAYERS_0_NODES +#undef HIDDEN_LAYERS_1_NODES +#undef LOGITS_NODES + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PUSTATS_H_ diff --git a/libs/libaom/src/av1/encoder/random.h b/libs/libaom/src/av1/encoder/random.h new file mode 100644 index 000000000..0bca39102 --- /dev/null +++ b/libs/libaom/src/av1/encoder/random.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RANDOM_H_ +#define AOM_AV1_ENCODER_RANDOM_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// Generate a random number in the range [0, 32768). +static INLINE unsigned int lcg_rand16(unsigned int *state) { + *state = (unsigned int)(*state * 1103515245ULL + 12345); + return *state / 65536 % 32768; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RANDOM_H_ diff --git a/libs/libaom/src/av1/encoder/ransac.c b/libs/libaom/src/av1/encoder/ransac.c new file mode 100644 index 000000000..07e1a5f5f --- /dev/null +++ b/libs/libaom/src/av1/encoder/ransac.c @@ -0,0 +1,820 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include <memory.h> +#include <math.h> +#include <time.h> +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> + +#include "av1/encoder/ransac.h" +#include "av1/encoder/mathutils.h" +#include "av1/encoder/random.h" + +#define MAX_MINPTS 4 +#define MAX_DEGENERATE_ITER 10 +#define MINPTS_MULTIPLIER 5 + +#define INLIER_THRESHOLD 1.25 +#define MIN_TRIALS 20 + +//////////////////////////////////////////////////////////////////////////////// +// ransac +typedef int (*IsDegenerateFunc)(double *p); +typedef void (*NormalizeFunc)(double *p, int np, double *T); +typedef void (*DenormalizeFunc)(double *params, double *T1, double *T2); +typedef int (*FindTransformationFunc)(int points, double *points1, + double *points2, double *params); +typedef void (*ProjectPointsDoubleFunc)(double *mat, double *points, + double *proj, int n, int stride_points, + int stride_proj); + +static void project_points_double_translation(double *mat, double *points, + double *proj, int n, + int stride_points, + int stride_proj) { + int i; + for (i = 0; i < n; ++i) { + const double x = *(points++), y = *(points++); + *(proj++) = x + mat[0]; + *(proj++) = y + mat[1]; + points += stride_points - 2; + proj += stride_proj - 2; + } +} + +static void project_points_double_rotzoom(double *mat, double *points, + double *proj, int n, + int stride_points, int stride_proj) { + int i; + for (i = 0; i < n; ++i) { + const double x = *(points++), y = *(points++); + *(proj++) = mat[2] * x + mat[3] * y + mat[0]; + *(proj++) = -mat[3] * x + mat[2] * y + mat[1]; + points += stride_points - 2; + proj += stride_proj - 2; + } +} + +static void project_points_double_affine(double *mat, double *points, + double *proj, int n, int stride_points, + int stride_proj) { + int i; + for (i = 0; i < n; ++i) { + const double x = *(points++), y = *(points++); + *(proj++) = mat[2] * x + mat[3] * y + mat[0]; + *(proj++) = mat[4] * x + mat[5] * y + mat[1]; + points += stride_points - 2; + proj += stride_proj - 2; + } +} + +static void normalize_homography(double *pts, int n, double *T) { + double *p = pts; + double mean[2] = { 0, 0 }; + double msqe = 0; + double scale; + int i; + + assert(n > 0); + for (i = 0; i < n; ++i, p += 2) { + mean[0] += p[0]; + mean[1] += p[1]; + } + mean[0] /= n; + mean[1] /= n; + for (p = pts, i = 0; i < n; ++i, p += 2) { + p[0] -= mean[0]; + p[1] -= mean[1]; + msqe += sqrt(p[0] * p[0] + p[1] * p[1]); + } + msqe /= n; + scale = (msqe == 0 ?
1.0 : sqrt(2) / msqe); + T[0] = scale; + T[1] = 0; + T[2] = -scale * mean[0]; + T[3] = 0; + T[4] = scale; + T[5] = -scale * mean[1]; + T[6] = 0; + T[7] = 0; + T[8] = 1; + for (p = pts, i = 0; i < n; ++i, p += 2) { + p[0] *= scale; + p[1] *= scale; + } +} + +static void invnormalize_mat(double *T, double *iT) { + double is = 1.0 / T[0]; + double m0 = -T[2] * is; + double m1 = -T[5] * is; + iT[0] = is; + iT[1] = 0; + iT[2] = m0; + iT[3] = 0; + iT[4] = is; + iT[5] = m1; + iT[6] = 0; + iT[7] = 0; + iT[8] = 1; +} + +static void denormalize_homography(double *params, double *T1, double *T2) { + double iT2[9]; + double params2[9]; + invnormalize_mat(T2, iT2); + multiply_mat(params, T1, params2, 3, 3, 3); + multiply_mat(iT2, params2, params, 3, 3, 3); +} + +static void denormalize_affine_reorder(double *params, double *T1, double *T2) { + double params_denorm[MAX_PARAMDIM]; + params_denorm[0] = params[0]; + params_denorm[1] = params[1]; + params_denorm[2] = params[4]; + params_denorm[3] = params[2]; + params_denorm[4] = params[3]; + params_denorm[5] = params[5]; + params_denorm[6] = params_denorm[7] = 0; + params_denorm[8] = 1; + denormalize_homography(params_denorm, T1, T2); + params[0] = params_denorm[2]; + params[1] = params_denorm[5]; + params[2] = params_denorm[0]; + params[3] = params_denorm[1]; + params[4] = params_denorm[3]; + params[5] = params_denorm[4]; + params[6] = params[7] = 0; +} + +static void denormalize_rotzoom_reorder(double *params, double *T1, + double *T2) { + double params_denorm[MAX_PARAMDIM]; + params_denorm[0] = params[0]; + params_denorm[1] = params[1]; + params_denorm[2] = params[2]; + params_denorm[3] = -params[1]; + params_denorm[4] = params[0]; + params_denorm[5] = params[3]; + params_denorm[6] = params_denorm[7] = 0; + params_denorm[8] = 1; + denormalize_homography(params_denorm, T1, T2); + params[0] = params_denorm[2]; + params[1] = params_denorm[5]; + params[2] = params_denorm[0]; + params[3] = params_denorm[1]; + params[4] = -params[3]; + params[5] = params[2]; + params[6] = params[7] = 0; +} + +static void denormalize_translation_reorder(double *params, double *T1, + double *T2) { + double params_denorm[MAX_PARAMDIM]; + params_denorm[0] = 1; + params_denorm[1] = 0; + params_denorm[2] = params[0]; + params_denorm[3] = 0; + params_denorm[4] = 1; + params_denorm[5] = params[1]; + params_denorm[6] = params_denorm[7] = 0; + params_denorm[8] = 1; + denormalize_homography(params_denorm, T1, T2); + params[0] = params_denorm[2]; + params[1] = params_denorm[5]; + params[2] = params[5] = 1; + params[3] = params[4] = 0; + params[6] = params[7] = 0; +} + +static int find_translation(int np, double *pts1, double *pts2, double *mat) { + int i; + double sx, sy, dx, dy; + double sumx, sumy; + + double T1[9], T2[9]; + normalize_homography(pts1, np, T1); + normalize_homography(pts2, np, T2); + + sumx = 0; + sumy = 0; + for (i = 0; i < np; ++i) { + dx = *(pts2++); + dy = *(pts2++); + sx = *(pts1++); + sy = *(pts1++); + + sumx += dx - sx; + sumy += dy - sy; + } + mat[0] = sumx / np; + mat[1] = sumy / np; + denormalize_translation_reorder(mat, T1, T2); + return 0; +} + +static int find_rotzoom(int np, double *pts1, double *pts2, double *mat) { + const int np2 = np * 2; + double *a = (double *)aom_malloc(sizeof(*a) * (np2 * 5 + 20)); + double *b = a + np2 * 4; + double *temp = b + np2; + int i; + double sx, sy, dx, dy; + + double T1[9], T2[9]; + normalize_homography(pts1, np, T1); + normalize_homography(pts2, np, T2); + + for (i = 0; i < np; ++i) { + dx = *(pts2++); + dy = *(pts2++); 
+ sx = *(pts1++); + sy = *(pts1++); + + a[i * 2 * 4 + 0] = sx; + a[i * 2 * 4 + 1] = sy; + a[i * 2 * 4 + 2] = 1; + a[i * 2 * 4 + 3] = 0; + a[(i * 2 + 1) * 4 + 0] = sy; + a[(i * 2 + 1) * 4 + 1] = -sx; + a[(i * 2 + 1) * 4 + 2] = 0; + a[(i * 2 + 1) * 4 + 3] = 1; + + b[2 * i] = dx; + b[2 * i + 1] = dy; + } + if (!least_squares(4, a, np2, 4, b, temp, mat)) { + aom_free(a); + return 1; + } + denormalize_rotzoom_reorder(mat, T1, T2); + aom_free(a); + return 0; +} + +static int find_affine(int np, double *pts1, double *pts2, double *mat) { + assert(np > 0); + const int np2 = np * 2; + double *a = (double *)aom_malloc(sizeof(*a) * (np2 * 7 + 42)); + if (a == NULL) return 1; + double *b = a + np2 * 6; + double *temp = b + np2; + int i; + double sx, sy, dx, dy; + + double T1[9], T2[9]; + normalize_homography(pts1, np, T1); + normalize_homography(pts2, np, T2); + + for (i = 0; i < np; ++i) { + dx = *(pts2++); + dy = *(pts2++); + sx = *(pts1++); + sy = *(pts1++); + + a[i * 2 * 6 + 0] = sx; + a[i * 2 * 6 + 1] = sy; + a[i * 2 * 6 + 2] = 0; + a[i * 2 * 6 + 3] = 0; + a[i * 2 * 6 + 4] = 1; + a[i * 2 * 6 + 5] = 0; + a[(i * 2 + 1) * 6 + 0] = 0; + a[(i * 2 + 1) * 6 + 1] = 0; + a[(i * 2 + 1) * 6 + 2] = sx; + a[(i * 2 + 1) * 6 + 3] = sy; + a[(i * 2 + 1) * 6 + 4] = 0; + a[(i * 2 + 1) * 6 + 5] = 1; + + b[2 * i] = dx; + b[2 * i + 1] = dy; + } + if (!least_squares(6, a, np2, 6, b, temp, mat)) { + aom_free(a); + return 1; + } + denormalize_affine_reorder(mat, T1, T2); + aom_free(a); + return 0; +} + +static int get_rand_indices(int npoints, int minpts, int *indices, + unsigned int *seed) { + int i, j; + int ptr = lcg_rand16(seed) % npoints; + if (minpts > npoints) return 0; + indices[0] = ptr; + ptr = (ptr == npoints - 1 ? 0 : ptr + 1); + i = 1; + while (i < minpts) { + int index = lcg_rand16(seed) % npoints; + while (index) { + ptr = (ptr == npoints - 1 ? 0 : ptr + 1); + for (j = 0; j < i; ++j) { + if (indices[j] == ptr) break; + } + if (j == i) index--; + } + indices[i++] = ptr; + } + return 1; +} + +typedef struct { + int num_inliers; + double variance; + int *inlier_indices; +} RANSAC_MOTION; + +// Return -1 if 'a' is a better motion, 1 if 'b' is better, 0 otherwise. 
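+// (Ordering note: a motion with more inliers always ranks first; ties are +// broken by the lower variance. The comparator is written for qsort(), which +// is used further below to sort the kept motions best-first.)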
+static int compare_motions(const void *arg_a, const void *arg_b) { + const RANSAC_MOTION *motion_a = (RANSAC_MOTION *)arg_a; + const RANSAC_MOTION *motion_b = (RANSAC_MOTION *)arg_b; + + if (motion_a->num_inliers > motion_b->num_inliers) return -1; + if (motion_a->num_inliers < motion_b->num_inliers) return 1; + if (motion_a->variance < motion_b->variance) return -1; + if (motion_a->variance > motion_b->variance) return 1; + return 0; +} + +static int is_better_motion(const RANSAC_MOTION *motion_a, + const RANSAC_MOTION *motion_b) { + return compare_motions(motion_a, motion_b) < 0; +} + +static void copy_points_at_indices(double *dest, const double *src, + const int *indices, int num_points) { + for (int i = 0; i < num_points; ++i) { + const int index = indices[i]; + dest[i * 2] = src[index * 2]; + dest[i * 2 + 1] = src[index * 2 + 1]; + } +} + +static const double kInfiniteVariance = 1e12; + +static void clear_motion(RANSAC_MOTION *motion, int num_points) { + motion->num_inliers = 0; + motion->variance = kInfiniteVariance; + memset(motion->inlier_indices, 0, + sizeof(*motion->inlier_indices) * num_points); +} + +static int ransac(const int *matched_points, int npoints, + int *num_inliers_by_motion, MotionModel *params_by_motion, + int num_desired_motions, int minpts, + IsDegenerateFunc is_degenerate, + FindTransformationFunc find_transformation, + ProjectPointsDoubleFunc projectpoints) { + int trial_count = 0; + int i = 0; + int ret_val = 0; + + unsigned int seed = (unsigned int)npoints; + + int indices[MAX_MINPTS] = { 0 }; + + double *points1, *points2; + double *corners1, *corners2; + double *image1_coord; + + // Store information for the num_desired_motions best transformations found + // and the worst motion among them, as well as the motion currently under + // consideration. + RANSAC_MOTION *motions, *worst_kept_motion = NULL; + RANSAC_MOTION current_motion; + + // Store the parameters and the indices of the inlier points for the motion + // currently under consideration. 
+ double params_this_motion[MAX_PARAMDIM]; + + double *cnp1, *cnp2; + + for (i = 0; i < num_desired_motions; ++i) { + num_inliers_by_motion[i] = 0; + } + if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) { + return 1; + } + + points1 = (double *)aom_malloc(sizeof(*points1) * npoints * 2); + points2 = (double *)aom_malloc(sizeof(*points2) * npoints * 2); + corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2); + corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2); + image1_coord = (double *)aom_malloc(sizeof(*image1_coord) * npoints * 2); + + motions = + (RANSAC_MOTION *)aom_malloc(sizeof(RANSAC_MOTION) * num_desired_motions); + for (i = 0; i < num_desired_motions; ++i) { + motions[i].inlier_indices = + (int *)aom_malloc(sizeof(*motions->inlier_indices) * npoints); + clear_motion(motions + i, npoints); + } + current_motion.inlier_indices = + (int *)aom_malloc(sizeof(*current_motion.inlier_indices) * npoints); + clear_motion(&current_motion, npoints); + + worst_kept_motion = motions; + + if (!(points1 && points2 && corners1 && corners2 && image1_coord && motions && + current_motion.inlier_indices)) { + ret_val = 1; + goto finish_ransac; + } + + cnp1 = corners1; + cnp2 = corners2; + for (i = 0; i < npoints; ++i) { + *(cnp1++) = *(matched_points++); + *(cnp1++) = *(matched_points++); + *(cnp2++) = *(matched_points++); + *(cnp2++) = *(matched_points++); + } + + while (MIN_TRIALS > trial_count) { + double sum_distance = 0.0; + double sum_distance_squared = 0.0; + + clear_motion(&current_motion, npoints); + + int degenerate = 1; + int num_degenerate_iter = 0; + + while (degenerate) { + num_degenerate_iter++; + if (!get_rand_indices(npoints, minpts, indices, &seed)) { + ret_val = 1; + goto finish_ransac; + } + + copy_points_at_indices(points1, corners1, indices, minpts); + copy_points_at_indices(points2, corners2, indices, minpts); + + degenerate = is_degenerate(points1); + if (num_degenerate_iter > MAX_DEGENERATE_ITER) { + ret_val = 1; + goto finish_ransac; + } + } + + if (find_transformation(minpts, points1, points2, params_this_motion)) { + trial_count++; + continue; + } + + projectpoints(params_this_motion, corners1, image1_coord, npoints, 2, 2); + + for (i = 0; i < npoints; ++i) { + double dx = image1_coord[i * 2] - corners2[i * 2]; + double dy = image1_coord[i * 2 + 1] - corners2[i * 2 + 1]; + double distance = sqrt(dx * dx + dy * dy); + + if (distance < INLIER_THRESHOLD) { + current_motion.inlier_indices[current_motion.num_inliers++] = i; + sum_distance += distance; + sum_distance_squared += distance * distance; + } + } + + if (current_motion.num_inliers >= worst_kept_motion->num_inliers && + current_motion.num_inliers > 1) { + double mean_distance; + mean_distance = sum_distance / ((double)current_motion.num_inliers); + current_motion.variance = + sum_distance_squared / ((double)current_motion.num_inliers - 1.0) - + mean_distance * mean_distance * ((double)current_motion.num_inliers) / + ((double)current_motion.num_inliers - 1.0); + if (is_better_motion(&current_motion, worst_kept_motion)) { + // This motion is better than the worst currently kept motion. Remember + // the inlier points and variance. The parameters for each kept motion + // will be recomputed later using only the inliers.
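+ // (The variance just computed is the unbiased sample estimate, + // sum_distance_squared / (n - 1) - mean^2 * n / (n - 1) with + // n = current_motion.num_inliers, built from the running sums + // accumulated over the inliers above.)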
+ worst_kept_motion->num_inliers = current_motion.num_inliers; + worst_kept_motion->variance = current_motion.variance; + memcpy(worst_kept_motion->inlier_indices, current_motion.inlier_indices, + sizeof(*current_motion.inlier_indices) * npoints); + assert(npoints > 0); + // Determine the new worst kept motion and its num_inliers and variance. + for (i = 0; i < num_desired_motions; ++i) { + if (is_better_motion(worst_kept_motion, &motions[i])) { + worst_kept_motion = &motions[i]; + } + } + } + } + trial_count++; + } + + // Sort the motions, best first. + qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions); + + // Recompute the motions using only the inliers. + for (i = 0; i < num_desired_motions; ++i) { + if (motions[i].num_inliers >= minpts) { + copy_points_at_indices(points1, corners1, motions[i].inlier_indices, + motions[i].num_inliers); + copy_points_at_indices(points2, corners2, motions[i].inlier_indices, + motions[i].num_inliers); + + find_transformation(motions[i].num_inliers, points1, points2, + params_by_motion[i].params); + + params_by_motion[i].num_inliers = motions[i].num_inliers; + memcpy(params_by_motion[i].inliers, motions[i].inlier_indices, + sizeof(*motions[i].inlier_indices) * npoints); + num_inliers_by_motion[i] = motions[i].num_inliers; + } + } + +finish_ransac: + aom_free(points1); + aom_free(points2); + aom_free(corners1); + aom_free(corners2); + aom_free(image1_coord); + aom_free(current_motion.inlier_indices); + for (i = 0; i < num_desired_motions; ++i) { + aom_free(motions[i].inlier_indices); + } + aom_free(motions); + + return ret_val; +} + +static int ransac_double_prec(const double *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, + int num_desired_motions, int minpts, + IsDegenerateFunc is_degenerate, + FindTransformationFunc find_transformation, + ProjectPointsDoubleFunc projectpoints) { + int trial_count = 0; + int i = 0; + int ret_val = 0; + + unsigned int seed = (unsigned int)npoints; + + int indices[MAX_MINPTS] = { 0 }; + + double *points1, *points2; + double *corners1, *corners2; + double *image1_coord; + + // Store information for the num_desired_motions best transformations found + // and the worst motion among them, as well as the motion currently under + // consideration. + RANSAC_MOTION *motions, *worst_kept_motion = NULL; + RANSAC_MOTION current_motion; + + // Store the parameters and the indices of the inlier points for the motion + // currently under consideration. 
+ double params_this_motion[MAX_PARAMDIM]; + + double *cnp1, *cnp2; + + for (i = 0; i < num_desired_motions; ++i) { + num_inliers_by_motion[i] = 0; + } + if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) { + return 1; + } + + points1 = (double *)aom_malloc(sizeof(*points1) * npoints * 2); + points2 = (double *)aom_malloc(sizeof(*points2) * npoints * 2); + corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2); + corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2); + image1_coord = (double *)aom_malloc(sizeof(*image1_coord) * npoints * 2); + + motions = + (RANSAC_MOTION *)aom_malloc(sizeof(RANSAC_MOTION) * num_desired_motions); + for (i = 0; i < num_desired_motions; ++i) { + motions[i].inlier_indices = + (int *)aom_malloc(sizeof(*motions->inlier_indices) * npoints); + clear_motion(motions + i, npoints); + } + current_motion.inlier_indices = + (int *)aom_malloc(sizeof(*current_motion.inlier_indices) * npoints); + clear_motion(&current_motion, npoints); + + worst_kept_motion = motions; + + if (!(points1 && points2 && corners1 && corners2 && image1_coord && motions && + current_motion.inlier_indices)) { + ret_val = 1; + goto finish_ransac; + } + + cnp1 = corners1; + cnp2 = corners2; + for (i = 0; i < npoints; ++i) { + *(cnp1++) = *(matched_points++); + *(cnp1++) = *(matched_points++); + *(cnp2++) = *(matched_points++); + *(cnp2++) = *(matched_points++); + } + + while (MIN_TRIALS > trial_count) { + double sum_distance = 0.0; + double sum_distance_squared = 0.0; + + clear_motion(&current_motion, npoints); + + int degenerate = 1; + int num_degenerate_iter = 0; + + while (degenerate) { + num_degenerate_iter++; + if (!get_rand_indices(npoints, minpts, indices, &seed)) { + ret_val = 1; + goto finish_ransac; + } + + copy_points_at_indices(points1, corners1, indices, minpts); + copy_points_at_indices(points2, corners2, indices, minpts); + + degenerate = is_degenerate(points1); + if (num_degenerate_iter > MAX_DEGENERATE_ITER) { + ret_val = 1; + goto finish_ransac; + } + } + + if (find_transformation(minpts, points1, points2, params_this_motion)) { + trial_count++; + continue; + } + + projectpoints(params_this_motion, corners1, image1_coord, npoints, 2, 2); + + for (i = 0; i < npoints; ++i) { + double dx = image1_coord[i * 2] - corners2[i * 2]; + double dy = image1_coord[i * 2 + 1] - corners2[i * 2 + 1]; + double distance = sqrt(dx * dx + dy * dy); + + if (distance < INLIER_THRESHOLD) { + current_motion.inlier_indices[current_motion.num_inliers++] = i; + sum_distance += distance; + sum_distance_squared += distance * distance; + } + } + + if (current_motion.num_inliers >= worst_kept_motion->num_inliers && + current_motion.num_inliers > 1) { + double mean_distance; + mean_distance = sum_distance / ((double)current_motion.num_inliers); + current_motion.variance = + sum_distance_squared / ((double)current_motion.num_inliers - 1.0) - + mean_distance * mean_distance * ((double)current_motion.num_inliers) / + ((double)current_motion.num_inliers - 1.0); + if (is_better_motion(&current_motion, worst_kept_motion)) { + // This motion is better than the worst currently kept motion. Remember + // the inlier points and variance. The parameters for each kept motion + // will be recomputed later using only the inliers.
+ worst_kept_motion->num_inliers = current_motion.num_inliers; + worst_kept_motion->variance = current_motion.variance; + memcpy(worst_kept_motion->inlier_indices, current_motion.inlier_indices, + sizeof(*current_motion.inlier_indices) * npoints); + assert(npoints > 0); + // Determine the new worst kept motion and its num_inliers and variance. + for (i = 0; i < num_desired_motions; ++i) { + if (is_better_motion(worst_kept_motion, &motions[i])) { + worst_kept_motion = &motions[i]; + } + } + } + } + trial_count++; + } + + // Sort the motions, best first. + qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions); + + // Recompute the motions using only the inliers. + for (i = 0; i < num_desired_motions; ++i) { + if (motions[i].num_inliers >= minpts) { + copy_points_at_indices(points1, corners1, motions[i].inlier_indices, + motions[i].num_inliers); + copy_points_at_indices(points2, corners2, motions[i].inlier_indices, + motions[i].num_inliers); + + find_transformation(motions[i].num_inliers, points1, points2, + params_by_motion[i].params); + memcpy(params_by_motion[i].inliers, motions[i].inlier_indices, + sizeof(*motions[i].inlier_indices) * npoints); + } + num_inliers_by_motion[i] = motions[i].num_inliers; + } + +finish_ransac: + aom_free(points1); + aom_free(points2); + aom_free(corners1); + aom_free(corners2); + aom_free(image1_coord); + aom_free(current_motion.inlier_indices); + for (i = 0; i < num_desired_motions; ++i) { + aom_free(motions[i].inlier_indices); + } + aom_free(motions); + + return ret_val; +} + +static int is_collinear3(double *p1, double *p2, double *p3) { + static const double collinear_eps = 1e-3; + const double v = + (p2[0] - p1[0]) * (p3[1] - p1[1]) - (p2[1] - p1[1]) * (p3[0] - p1[0]); + return fabs(v) < collinear_eps; +} + +static int is_degenerate_translation(double *p) { + return (p[0] - p[2]) * (p[0] - p[2]) + (p[1] - p[3]) * (p[1] - p[3]) <= 2; +} + +static int is_degenerate_affine(double *p) { + return is_collinear3(p, p + 2, p + 4); +} + +static int ransac_translation(int *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, + int num_desired_motions) { + return ransac(matched_points, npoints, num_inliers_by_motion, + params_by_motion, num_desired_motions, 3, + is_degenerate_translation, find_translation, + project_points_double_translation); +} + +static int ransac_rotzoom(int *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, + int num_desired_motions) { + return ransac(matched_points, npoints, num_inliers_by_motion, + params_by_motion, num_desired_motions, 3, is_degenerate_affine, + find_rotzoom, project_points_double_rotzoom); +} + +static int ransac_affine(int *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, + int num_desired_motions) { + return ransac(matched_points, npoints, num_inliers_by_motion, + params_by_motion, num_desired_motions, 3, is_degenerate_affine, + find_affine, project_points_double_affine); +} + +RansacFunc av1_get_ransac_type(TransformationType type) { + switch (type) { + case AFFINE: return ransac_affine; + case ROTZOOM: return ransac_rotzoom; + case TRANSLATION: return ransac_translation; + default: assert(0); return NULL; + } +} + +static int ransac_translation_double_prec(double *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, + int num_desired_motions) { + return ransac_double_prec(matched_points, npoints, num_inliers_by_motion, + params_by_motion, 
num_desired_motions, 3, + is_degenerate_translation, find_translation, + project_points_double_translation); +} + +static int ransac_rotzoom_double_prec(double *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, + int num_desired_motions) { + return ransac_double_prec(matched_points, npoints, num_inliers_by_motion, + params_by_motion, num_desired_motions, 3, + is_degenerate_affine, find_rotzoom, + project_points_double_rotzoom); +} + +static int ransac_affine_double_prec(double *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, + int num_desired_motions) { + return ransac_double_prec(matched_points, npoints, num_inliers_by_motion, + params_by_motion, num_desired_motions, 3, + is_degenerate_affine, find_affine, + project_points_double_affine); +} + +RansacFuncDouble av1_get_ransac_double_prec_type(TransformationType type) { + switch (type) { + case AFFINE: return ransac_affine_double_prec; + case ROTZOOM: return ransac_rotzoom_double_prec; + case TRANSLATION: return ransac_translation_double_prec; + default: assert(0); return NULL; + } +} diff --git a/libs/libaom/src/av1/encoder/ransac.h b/libs/libaom/src/av1/encoder/ransac.h new file mode 100644 index 000000000..583d97152 --- /dev/null +++ b/libs/libaom/src/av1/encoder/ransac.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RANSAC_H_ +#define AOM_AV1_ENCODER_RANSAC_H_ + +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <assert.h> + +#include "av1/common/warped_motion.h" +#include "av1/encoder/global_motion.h" + +typedef int (*RansacFunc)(int *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, int num_motions); +typedef int (*RansacFuncDouble)(double *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, int num_motions); +RansacFunc av1_get_ransac_type(TransformationType type); +RansacFuncDouble av1_get_ransac_double_prec_type(TransformationType type); +#endif  // AOM_AV1_ENCODER_RANSAC_H_ diff --git a/libs/libaom/src/av1/encoder/ratectrl.c b/libs/libaom/src/av1/encoder/ratectrl.c new file mode 100644 index 000000000..433163f2e --- /dev/null +++ b/libs/libaom/src/av1/encoder/ratectrl.c @@ -0,0 +1,2117 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <assert.h> +#include <limits.h> +#include <math.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" + +#include "av1/common/alloccommon.h" +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/common/common.h" +#include "av1/common/entropymode.h" +#include "av1/common/quant_common.h" +#include "av1/common/seg_common.h" + +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encode_strategy.h" +#include "av1/encoder/gop_structure.h" +#include "av1/encoder/random.h" +#include "av1/encoder/ratectrl.h" + +#define USE_UNRESTRICTED_Q_IN_CQ_MODE 0 + +// Max rate target for 1080P and below encodes under normal circumstances +// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB +#define MAX_MB_RATE 250 +#define MAXRATE_1080P 2025000 + +#define MIN_BPB_FACTOR 0.005 +#define MAX_BPB_FACTOR 50 + +#define SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO 0 +#define SUPERRES_QADJ_PER_DENOM_KEYFRAME 2 +#define SUPERRES_QADJ_PER_DENOM_ARFFRAME 0 + +#define FRAME_OVERHEAD_BITS 200 +#define ASSIGN_MINQ_TABLE(bit_depth, name) \ + do { \ + switch (bit_depth) { \ + case AOM_BITS_8: name = name##_8; break; \ + case AOM_BITS_10: name = name##_10; break; \ + case AOM_BITS_12: name = name##_12; break; \ + default: \ + assert(0 && \ + "bit_depth should be AOM_BITS_8, AOM_BITS_10" \ + " or AOM_BITS_12"); \ + name = NULL; \ + } \ + } while (0) + +// Tables relating active max Q to active min Q +static int kf_low_motion_minq_8[QINDEX_RANGE]; +static int kf_high_motion_minq_8[QINDEX_RANGE]; +static int arfgf_low_motion_minq_8[QINDEX_RANGE]; +static int arfgf_high_motion_minq_8[QINDEX_RANGE]; +static int inter_minq_8[QINDEX_RANGE]; +static int rtc_minq_8[QINDEX_RANGE]; + +static int kf_low_motion_minq_10[QINDEX_RANGE]; +static int kf_high_motion_minq_10[QINDEX_RANGE]; +static int arfgf_low_motion_minq_10[QINDEX_RANGE]; +static int arfgf_high_motion_minq_10[QINDEX_RANGE]; +static int inter_minq_10[QINDEX_RANGE]; +static int rtc_minq_10[QINDEX_RANGE]; +static int kf_low_motion_minq_12[QINDEX_RANGE]; +static int kf_high_motion_minq_12[QINDEX_RANGE]; +static int arfgf_low_motion_minq_12[QINDEX_RANGE]; +static int arfgf_high_motion_minq_12[QINDEX_RANGE]; +static int inter_minq_12[QINDEX_RANGE]; +static int rtc_minq_12[QINDEX_RANGE]; + +static int gf_high = 2400; +static int gf_low = 300; +#ifdef STRICT_RC +static int kf_high = 3200; +#else +static int kf_high = 5000; +#endif +static int kf_low = 400; + +// How many times fewer pixels there are to encode given the current scaling. +// Temporary replacement for rcf_mult and rate_thresh_mult. +static double resize_rate_factor(const AV1_COMP *cpi, int width, int height) { + return (double)(cpi->oxcf.width * cpi->oxcf.height) / (width * height); +} + +// Functions to compute the active minq lookup table entries based on a +// formulaic approach to facilitate easier adjustment of the Q tables. +// The formulae were derived from computing a 3rd order polynomial best +// fit to the original data (after plotting real maxq vs minq (not q index)) +static int get_minq_index(double maxq, double x3, double x2, double x1, + aom_bit_depth_t bit_depth) { + const double minqtarget = AOMMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq); + + // Special case handling to deal with the step from q2.0 + // down to lossless mode represented by q 1.0.
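+ // (Worked example: with the kf_low_m coefficients used below, + // x3 = 0.000001, x2 = -0.0004, x1 = 0.150, and maxq = 200.0, + // minqtarget = ((0.000001 * 200 - 0.0004) * 200 + 0.150) * 200 = 22.0, + // which is above 2.0 and therefore falls through to av1_find_qindex().)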
+ if (minqtarget <= 2.0) return 0; + + return av1_find_qindex(minqtarget, bit_depth, 0, QINDEX_RANGE - 1); +} + +static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low, + int *arfgf_high, int *inter, int *rtc, + aom_bit_depth_t bit_depth) { + int i; + for (i = 0; i < QINDEX_RANGE; i++) { + const double maxq = av1_convert_qindex_to_q(i, bit_depth); + kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth); + kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth); + arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth); + arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); + inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth); + rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth); + } +} + +void av1_rc_init_minq_luts(void) { + init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8, + arfgf_low_motion_minq_8, arfgf_high_motion_minq_8, + inter_minq_8, rtc_minq_8, AOM_BITS_8); + init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10, + arfgf_low_motion_minq_10, arfgf_high_motion_minq_10, + inter_minq_10, rtc_minq_10, AOM_BITS_10); + init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12, + arfgf_low_motion_minq_12, arfgf_high_motion_minq_12, + inter_minq_12, rtc_minq_12, AOM_BITS_12); +} + +// These functions use formulaic calculations to make playing with the +// quantizer tables easier. If necessary they can be replaced by lookup +// tables if and when things settle down in the experimental bitstream +double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) { + // Convert the index to a real Q value (scaled down to match old Q values) + switch (bit_depth) { + case AOM_BITS_8: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 4.0; + case AOM_BITS_10: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 16.0; + case AOM_BITS_12: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 64.0; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1.0; + } +} + +int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, + double correction_factor, aom_bit_depth_t bit_depth) { + const double q = av1_convert_qindex_to_q(qindex, bit_depth); + int enumerator = frame_type == KEY_FRAME ? 2000000 : 1500000; + + assert(correction_factor <= MAX_BPB_FACTOR && + correction_factor >= MIN_BPB_FACTOR); + + // q based adjustment to baseline enumerator + return (int)(enumerator * correction_factor / q); +} + +int av1_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, + double correction_factor, + aom_bit_depth_t bit_depth) { + const int bpm = + (int)(av1_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth)); + return AOMMAX(FRAME_OVERHEAD_BITS, + (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS); +} + +int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target, + FRAME_UPDATE_TYPE frame_update_type) { + const RATE_CONTROL *rc = &cpi->rc; + const AV1EncoderConfig *oxcf = &cpi->oxcf; + const int min_frame_target = + AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); + // Clip the frame target to the minimum setup value. + if (frame_update_type == OVERLAY_UPDATE || + frame_update_type == INTNL_OVERLAY_UPDATE) { + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a constructed arf. + // The active maximum quantizer ensures that an appropriate + // number of bits will be spent if needed for constructed ARFs.
+ target = min_frame_target; + } else if (target < min_frame_target) { + target = min_frame_target; + } + + // Clip the frame target to the maximum allowed value. + if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + if (oxcf->rc_max_inter_bitrate_pct) { + const int max_rate = + rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; + target = AOMMIN(target, max_rate); + } + + return target; +} + +int av1_rc_clamp_iframe_target_size(const AV1_COMP *const cpi, int target) { + const RATE_CONTROL *rc = &cpi->rc; + const AV1EncoderConfig *oxcf = &cpi->oxcf; + if (oxcf->rc_max_intra_bitrate_pct) { + const int max_rate = + rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100; + target = AOMMIN(target, max_rate); + } + if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + return target; +} + +// Update the buffer level for higher temporal layers, given the encoded current +// temporal layer. +static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { + const int current_temporal_layer = svc->temporal_layer_id; + for (int i = current_temporal_layer + 1; i < svc->number_temporal_layers; + ++i) { + const int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->bits_off_target += + (int)(lc->target_bandwidth / lc->framerate) - encoded_frame_size; + // Clip buffer level to maximum buffer size for the layer. + lrc->bits_off_target = + AOMMIN(lrc->bits_off_target, lrc->maximum_buffer_size); + lrc->buffer_level = lrc->bits_off_target; + } +} +// Update the buffer level: leaky bucket model. +static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) { + const AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + + // Non-viewable frames are a special case and are treated as pure overhead. + if (!cm->show_frame) + rc->bits_off_target -= encoded_frame_size; + else + rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size; + + // Clip the buffer level to the maximum specified buffer size. 
+ rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size); + rc->buffer_level = rc->bits_off_target; + + if (cpi->use_svc) update_layer_buffer_level(&cpi->svc, encoded_frame_size); +} + +int av1_rc_get_default_min_gf_interval(int width, int height, + double framerate) { + // Assume we do not need any constraint lower than 4K 20 fps + static const double factor_safe = 3840 * 2160 * 20.0; + const double factor = width * height * framerate; + const int default_interval = + clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL); + + if (factor <= factor_safe) + return default_interval; + else + return AOMMAX(default_interval, + (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5)); + // Note this logic makes: + // 4K24: 5 + // 4K30: 6 + // 4K60: 12 +} + +int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) { + int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75)); + interval += (interval & 0x01); // Round to even value + interval = AOMMAX(MAX_GF_INTERVAL, interval); + return AOMMAX(interval, min_gf_interval); +} + +void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { + int i; + + if (pass == 0 && oxcf->rc_mode == AOM_CBR) { + rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q; + rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q; + } else { + rc->avg_frame_qindex[KEY_FRAME] = + (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2; + rc->avg_frame_qindex[INTER_FRAME] = + (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2; + } + + rc->last_q[KEY_FRAME] = oxcf->best_allowed_q; + rc->last_q[INTER_FRAME] = oxcf->worst_allowed_q; + + rc->buffer_level = rc->starting_buffer_level; + rc->bits_off_target = rc->starting_buffer_level; + + rc->rolling_target_bits = rc->avg_frame_bandwidth; + rc->rolling_actual_bits = rc->avg_frame_bandwidth; + rc->long_rolling_target_bits = rc->avg_frame_bandwidth; + rc->long_rolling_actual_bits = rc->avg_frame_bandwidth; + + rc->total_actual_bits = 0; + rc->total_target_bits = 0; + rc->total_target_vs_actual = 0; + + rc->frames_since_key = 8; // Sensible default for first frame. + rc->this_key_frame_forced = 0; + rc->next_key_frame_forced = 0; + rc->source_alt_ref_pending = 0; + rc->source_alt_ref_active = 0; + + rc->frames_till_gf_update_due = 0; + rc->ni_av_qi = oxcf->worst_allowed_q; + rc->ni_tot_qi = 0; + rc->ni_frames = 0; + + rc->tot_q = 0.0; + rc->avg_q = av1_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth); + + for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { + rc->rate_correction_factors[i] = 0.7; + } + rc->rate_correction_factors[KF_STD] = 1.0; + rc->min_gf_interval = oxcf->min_gf_interval; + rc->max_gf_interval = oxcf->max_gf_interval; + if (rc->min_gf_interval == 0) + rc->min_gf_interval = av1_rc_get_default_min_gf_interval( + oxcf->width, oxcf->height, oxcf->init_framerate); + if (rc->max_gf_interval == 0) + rc->max_gf_interval = av1_rc_get_default_max_gf_interval( + oxcf->init_framerate, rc->min_gf_interval); + rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2; +} + +int av1_rc_drop_frame(AV1_COMP *cpi) { + const AV1EncoderConfig *oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; + + if (!oxcf->drop_frames_water_mark) { + return 0; + } else { + if (rc->buffer_level < 0) { + // Always drop if buffer is below 0. + return 1; + } else { + // If buffer is below drop_mark, for now just drop every other frame + // (starting with the next frame) until it increases back over drop_mark. 
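+ // (Illustrative trace: once decimation_factor settles at 1, + // decimation_count alternates between 1 and 0 on successive calls, so + // every other frame returns 1 and is dropped until the buffer recovers.)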
+ int drop_mark = + (int)(oxcf->drop_frames_water_mark * rc->optimal_buffer_level / 100); + if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) { + --rc->decimation_factor; + } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) { + rc->decimation_factor = 1; + } + if (rc->decimation_factor > 0) { + if (rc->decimation_count > 0) { + --rc->decimation_count; + return 1; + } else { + rc->decimation_count = rc->decimation_factor; + return 0; + } + } else { + rc->decimation_count = 0; + return 0; + } + } + } +} + +static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) { + const RATE_CONTROL *const rc = &cpi->rc; + const AV1_COMMON *const cm = &cpi->common; + const int max_delta = 16; + const int change_avg_frame_bandwidth = + abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) > + 0.1 * (rc->avg_frame_bandwidth); + // If resolution changes or avg_frame_bandwidth significantly changed, + // then set this flag to indicate change in target bits per macroblock. + const int change_target_bits_mb = + cm->prev_frame && + (cm->width != cm->prev_frame->width || + cm->height != cm->prev_frame->height || change_avg_frame_bandwidth); + // Apply some control/clamp to QP under certain conditions. + if (cm->current_frame.frame_type != KEY_FRAME && !cpi->use_svc && + rc->frames_since_key > 1 && !change_target_bits_mb && + (!cpi->oxcf.gf_cbr_boost_pct || + !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame))) { + // Make sure q is between oscillating Qs to prevent resonance. + if (rc->rc_1_frame * rc->rc_2_frame == -1 && + rc->q_1_frame != rc->q_2_frame) { + q = clamp(q, AOMMIN(rc->q_1_frame, rc->q_2_frame), + AOMMAX(rc->q_1_frame, rc->q_2_frame)); + } + // Limit the decrease in Q from previous frame. + if (rc->q_1_frame - q > max_delta) q = rc->q_1_frame - max_delta; + } + // For single spatial layer: if resolution has increased push q closer + // to the active_worst to avoid excess overshoot. 
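+ // E.g. if q would be 80 and active_worst_quality is 160 after such a
+ // resolution jump, the midpoint (80 + 160) >> 1 == 120 is used instead,
+ // trading quality for a lower risk of overshoot on the larger frame.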
+ if (cpi->svc.number_spatial_layers <= 1 && cm->prev_frame && + (cm->width * cm->height > + 1.5 * cm->prev_frame->width * cm->prev_frame->height)) + q = (q + active_worst_quality) >> 1; + return AOMMAX(AOMMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality); +} + +static const RATE_FACTOR_LEVEL rate_factor_levels[FRAME_UPDATE_TYPES] = { + KF_STD, // KF_UPDATE + INTER_NORMAL, // LF_UPDATE + GF_ARF_STD, // GF_UPDATE + GF_ARF_STD, // ARF_UPDATE + INTER_NORMAL, // OVERLAY_UPDATE + INTER_NORMAL, // INTNL_OVERLAY_UPDATE + GF_ARF_LOW, // INTNL_ARF_UPDATE +}; + +static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group) { + const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; + assert(update_type < FRAME_UPDATE_TYPES); + return rate_factor_levels[update_type]; +} + +static double get_rate_correction_factor(const AV1_COMP *cpi, int width, + int height) { + const RATE_CONTROL *const rc = &cpi->rc; + double rcf; + + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + rcf = rc->rate_correction_factors[KF_STD]; + } else if (is_stat_consumption_stage(cpi)) { + const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group); + rcf = rc->rate_correction_factors[rf_lvl]; + } else { + if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && + !rc->is_src_frame_alt_ref && !cpi->use_svc && + (cpi->oxcf.rc_mode != AOM_CBR || cpi->oxcf.gf_cbr_boost_pct > 20)) + rcf = rc->rate_correction_factors[GF_ARF_STD]; + else + rcf = rc->rate_correction_factors[INTER_NORMAL]; + } + rcf *= resize_rate_factor(cpi, width, height); + return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR); +} + +static void set_rate_correction_factor(AV1_COMP *cpi, double factor, int width, + int height) { + RATE_CONTROL *const rc = &cpi->rc; + + // Normalize RCF to account for the size-dependent scaling factor. + factor /= resize_rate_factor(cpi, width, height); + + factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR); + + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + rc->rate_correction_factors[KF_STD] = factor; + } else if (is_stat_consumption_stage(cpi)) { + const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group); + rc->rate_correction_factors[rf_lvl] = factor; + } else { + if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && + !rc->is_src_frame_alt_ref && !cpi->use_svc && + (cpi->oxcf.rc_mode != AOM_CBR || cpi->oxcf.gf_cbr_boost_pct > 20)) + rc->rate_correction_factors[GF_ARF_STD] = factor; + else + rc->rate_correction_factors[INTER_NORMAL] = factor; + } +} + +void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int width, + int height) { + const AV1_COMMON *const cm = &cpi->common; + int correction_factor = 100; + double rate_correction_factor = + get_rate_correction_factor(cpi, width, height); + double adjustment_limit; + const int MBs = av1_get_MBs(width, height); + + int projected_size_based_on_q = 0; + + // Do not update the rate factors for arf overlay frames. + if (cpi->rc.is_src_frame_alt_ref) return; + + // Clear down mmx registers to allow floating point in what follows + aom_clear_system_state(); + + // Work out how big we would have expected the frame to be at this Q given + // the current correction factor. 
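+ // Roughly, the projection below is MBs * bits-per-mb at this Q scaled
+ // by the current correction factor; comparing it with the actual
+ // projected frame size yields the percentage correction applied
+ // further down.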
+ // Stay in double to avoid int overflow when values are large + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) { + projected_size_based_on_q = + av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor); + } else { + projected_size_based_on_q = av1_estimate_bits_at_q( + cm->current_frame.frame_type, cm->quant_params.base_qindex, MBs, + rate_correction_factor, cm->seq_params.bit_depth); + } + // Work out a size correction factor. + if (projected_size_based_on_q > FRAME_OVERHEAD_BITS) + correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) / + projected_size_based_on_q); + + // More heavily damped adjustment used if we have been oscillating either side + // of target. + if (correction_factor > 0) { + adjustment_limit = + 0.25 + 0.5 * AOMMIN(1, fabs(log10(0.01 * correction_factor))); + } else { + adjustment_limit = 0.75; + } + + cpi->rc.q_2_frame = cpi->rc.q_1_frame; + cpi->rc.q_1_frame = cm->quant_params.base_qindex; + cpi->rc.rc_2_frame = cpi->rc.rc_1_frame; + if (correction_factor > 110) + cpi->rc.rc_1_frame = -1; + else if (correction_factor < 90) + cpi->rc.rc_1_frame = 1; + else + cpi->rc.rc_1_frame = 0; + + if (correction_factor > 102) { + // We are not already at the worst allowable quality + correction_factor = + (int)(100 + ((correction_factor - 100) * adjustment_limit)); + rate_correction_factor = (rate_correction_factor * correction_factor) / 100; + // Keep rate_correction_factor within limits + if (rate_correction_factor > MAX_BPB_FACTOR) + rate_correction_factor = MAX_BPB_FACTOR; + } else if (correction_factor < 99) { + // We are not already at the best allowable quality + correction_factor = + (int)(100 - ((100 - correction_factor) * adjustment_limit)); + rate_correction_factor = (rate_correction_factor * correction_factor) / 100; + + // Keep rate_correction_factor within limits + if (rate_correction_factor < MIN_BPB_FACTOR) + rate_correction_factor = MIN_BPB_FACTOR; + } + + set_rate_correction_factor(cpi, rate_correction_factor, width, height); +} + +// Calculate rate for the given 'q'. +static int get_bits_per_mb(const AV1_COMP *cpi, int use_cyclic_refresh, + double correction_factor, int q) { + const AV1_COMMON *const cm = &cpi->common; + return use_cyclic_refresh + ? av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor) + : av1_rc_bits_per_mb(cm->current_frame.frame_type, q, + correction_factor, cm->seq_params.bit_depth); +} + +// Similar to find_qindex_by_rate() function in ratectrl.c, but returns the q +// index with rate just above or below the desired rate, depending on which of +// the two rates is closer to the desired rate. +// Also, respects the selected aq_mode when computing the rate. +static int find_closest_qindex_by_rate(int desired_bits_per_mb, + const AV1_COMP *cpi, + double correction_factor, + int best_qindex, int worst_qindex) { + const int use_cyclic_refresh = cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi->cyclic_refresh->apply_cyclic_refresh; + + // Find 'qindex' based on 'desired_bits_per_mb'. + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + while (low < high) { + const int mid = (low + high) >> 1; + const int mid_bits_per_mb = + get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, mid); + if (mid_bits_per_mb > desired_bits_per_mb) { + low = mid + 1; + } else { + high = mid; + } + } + assert(low == high); + + // Calculate rate difference of this q index from the desired rate. 
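+ // The search above returns the smallest qindex whose bits per mb is at
+ // or below the target; qindex - 1 (rate just above the target) is
+ // probed below, and whichever is closer in bits per mb wins.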
+ const int curr_q = low; + const int curr_bits_per_mb = + get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, curr_q); + const int curr_bit_diff = (curr_bits_per_mb <= desired_bits_per_mb) + ? desired_bits_per_mb - curr_bits_per_mb + : INT_MAX; + assert((curr_bit_diff != INT_MAX && curr_bit_diff >= 0) || + curr_q == worst_qindex); + + // Calculate rate difference for previous q index too. + const int prev_q = curr_q - 1; + int prev_bit_diff; + if (curr_bit_diff == INT_MAX || curr_q == best_qindex) { + prev_bit_diff = INT_MAX; + } else { + const int prev_bits_per_mb = + get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, prev_q); + assert(prev_bits_per_mb > desired_bits_per_mb); + prev_bit_diff = prev_bits_per_mb - desired_bits_per_mb; + } + + // Pick one of the two q indices, depending on which one has rate closer to + // the desired rate. + return (curr_bit_diff <= prev_bit_diff) ? curr_q : prev_q; +} + +int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame, + int active_best_quality, int active_worst_quality, + int width, int height) { + const int MBs = av1_get_MBs(width, height); + const double correction_factor = + get_rate_correction_factor(cpi, width, height); + const int target_bits_per_mb = + (int)(((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / MBs); + + int q = + find_closest_qindex_by_rate(target_bits_per_mb, cpi, correction_factor, + active_best_quality, active_worst_quality); + if (cpi->oxcf.rc_mode == AOM_CBR && has_no_stats_stage(cpi)) + return adjust_q_cbr(cpi, q, active_worst_quality); + + return q; +} + +static int get_active_quality(int q, int gfu_boost, int low, int high, + int *low_motion_minq, int *high_motion_minq) { + if (gfu_boost > high) { + return low_motion_minq[q]; + } else if (gfu_boost < low) { + return high_motion_minq[q]; + } else { + const int gap = high - low; + const int offset = high - gfu_boost; + const int qdiff = high_motion_minq[q] - low_motion_minq[q]; + const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap; + return low_motion_minq[q] + adjustment; + } +} + +static int get_kf_active_quality(const RATE_CONTROL *const rc, int q, + aom_bit_depth_t bit_depth) { + int *kf_low_motion_minq; + int *kf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq); + ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq); + return get_active_quality(q, rc->kf_boost, kf_low, kf_high, + kf_low_motion_minq, kf_high_motion_minq); +} + +static int get_gf_active_quality(const RATE_CONTROL *const rc, int q, + aom_bit_depth_t bit_depth) { + int *arfgf_low_motion_minq; + int *arfgf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq); + ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); + return get_active_quality(q, rc->gfu_boost, gf_low, gf_high, + arfgf_low_motion_minq, arfgf_high_motion_minq); +} + +static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) { + int *arfgf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); + return arfgf_high_motion_minq[q]; +} + +static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + const unsigned int curr_frame = cpi->common.current_frame.frame_number; + int active_worst_quality; + + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + active_worst_quality = + curr_frame == 0 ? 
rc->worst_quality : rc->last_q[KEY_FRAME] * 2; + } else { + if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_bwd_ref_frame || + cpi->refresh_alt_ref_frame)) { + active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4 + : rc->last_q[INTER_FRAME]; + } else { + active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 2 + : rc->last_q[INTER_FRAME] * 2; + } + } + return AOMMIN(active_worst_quality, rc->worst_quality); +} + +// Adjust active_worst_quality level based on buffer level. +static int calc_active_worst_quality_one_pass_cbr(const AV1_COMP *cpi) { + // Adjust active_worst_quality: If buffer is above the optimal/target level, + // bring active_worst_quality down depending on fullness of buffer. + // If buffer is below the optimal level, let the active_worst_quality go from + // ambient Q (at buffer = optimal level) to worst_quality level + // (at buffer = critical level). + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *rc = &cpi->rc; + // Buffer level below which we push active_worst to worst_quality. + int64_t critical_level = rc->optimal_buffer_level >> 3; + int64_t buff_lvl_step = 0; + int adjustment = 0; + int active_worst_quality; + int ambient_qp; + if (cm->current_frame.frame_type == KEY_FRAME) return rc->worst_quality; + // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME] + // for the first few frames following key frame. These are both initialized + // to worst_quality and updated with (3/4, 1/4) average in postencode_update. + // So for first few frames following key, the qp of that key frame is weighted + // into the active_worst_quality setting. + ambient_qp = (cm->current_frame.frame_number < 5) + ? AOMMIN(rc->avg_frame_qindex[INTER_FRAME], + rc->avg_frame_qindex[KEY_FRAME]) + : rc->avg_frame_qindex[INTER_FRAME]; + active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4); + if (rc->buffer_level > rc->optimal_buffer_level) { + // Adjust down. + // Maximum limit for down adjustment, ~30%. + int max_adjustment_down = active_worst_quality / 3; + if (max_adjustment_down) { + buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) / + max_adjustment_down); + if (buff_lvl_step) + adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) / + buff_lvl_step); + active_worst_quality -= adjustment; + } + } else if (rc->buffer_level > critical_level) { + // Adjust up from ambient Q. + if (critical_level) { + buff_lvl_step = (rc->optimal_buffer_level - critical_level); + if (buff_lvl_step) { + adjustment = (int)((rc->worst_quality - ambient_qp) * + (rc->optimal_buffer_level - rc->buffer_level) / + buff_lvl_step); + } + active_worst_quality = ambient_qp + adjustment; + } + } else { + // Set to worst_quality if buffer is below critical level. 
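+ // Zone summary: buffer above optimal -> adjust down from the ambient
+ // cap (by up to ~30%); between critical (optimal >> 3) and optimal ->
+ // interpolate up from ambient toward worst_quality; at or below
+ // critical -> pin to worst_quality here.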
+ active_worst_quality = rc->worst_quality; + } + return active_worst_quality; +} + +static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width, + int height, int *bottom_index, + int *top_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const CurrentFrame *const current_frame = &cm->current_frame; + int active_best_quality; + int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi); + int q; + int *rtc_minq; + const int bit_depth = cm->seq_params.bit_depth; + ASSIGN_MINQ_TABLE(bit_depth, rtc_minq); + + if (frame_is_intra_only(cm)) { + active_best_quality = rc->best_quality; + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. + if (rc->this_key_frame_forced) { + int qindex = rc->last_boosted_qindex; + double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); + int delta_qindex = av1_compute_qdelta(rc, last_boosted_q, + (last_boosted_q * 0.75), bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else if (current_frame->frame_number > 0) { + // not first frame of one pass and kf_boost is set + double q_adj_factor = 1.0; + double q_val; + + active_best_quality = + get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth); + + // Allow somewhat lower kf minq with small image formats. + if ((width * height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth); + active_best_quality += + av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); + } + } else if (!rc->is_src_frame_alt_ref && !cpi->use_svc && + cpi->oxcf.gf_cbr_boost_pct && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + if (rc->frames_since_key > 1 && + rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = rc->avg_frame_qindex[INTER_FRAME]; + } else { + q = active_worst_quality; + } + active_best_quality = get_gf_active_quality(rc, q, bit_depth); + } else { + // Use the lower of active_worst_quality and recent/average Q. + if (current_frame->frame_number > 1) { + if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) + active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]]; + else + active_best_quality = rtc_minq[active_worst_quality]; + } else { + if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality) + active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]]; + else + active_best_quality = rtc_minq[active_worst_quality]; + } + } + + // Clip the active best and worst quality values to limits + active_best_quality = + clamp(active_best_quality, rc->best_quality, rc->worst_quality); + active_worst_quality = + clamp(active_worst_quality, active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + // Limit Q range for the adaptive loop. 
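+ // For key frames the qdelta below moves top_index to the qindex that
+ // spends roughly 2x the bits per mb of active_worst_quality, i.e. a
+ // lower (better quality) cap than for inter frames.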
+ if (current_frame->frame_type == KEY_FRAME && !rc->this_key_frame_forced && + !(current_frame->frame_number == 0)) { + int qdelta = 0; + aom_clear_system_state(); + qdelta = av1_compute_qdelta_by_rate(&cpi->rc, current_frame->frame_type, + active_worst_quality, 2.0, bit_depth); + *top_index = active_worst_quality + qdelta; + *top_index = AOMMAX(*top_index, *bottom_index); + } + + // Special case code to try and match quality with forced key frames + if (current_frame->frame_type == KEY_FRAME && rc->this_key_frame_forced) { + q = rc->last_boosted_qindex; + } else { + q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, + active_worst_quality, width, height); + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (rc->this_frame_target >= rc->max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + } + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +static int gf_group_pyramid_level(const GF_GROUP *gf_group, int gf_index) { + return gf_group->layer_depth[gf_index]; +} + +static int get_active_cq_level(const RATE_CONTROL *rc, + const AV1EncoderConfig *const oxcf, + int intra_only, SUPERRES_MODE superres_mode, + int superres_denom) { + static const double cq_adjust_threshold = 0.1; + int active_cq_level = oxcf->cq_level; + (void)intra_only; + if (oxcf->rc_mode == AOM_CQ || oxcf->rc_mode == AOM_Q) { + // printf("Superres %d %d %d = %d\n", superres_denom, intra_only, + // rc->frames_to_key, !(intra_only && rc->frames_to_key <= 1)); + if ((superres_mode == SUPERRES_QTHRESH || superres_mode == SUPERRES_AUTO) && + superres_denom != SCALE_NUMERATOR) { + int mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO; + if (intra_only && rc->frames_to_key <= 1) { + mult = 0; + } else if (intra_only) { + mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME; + } else { + mult = SUPERRES_QADJ_PER_DENOM_ARFFRAME; + } + active_cq_level = AOMMAX( + active_cq_level - ((superres_denom - SCALE_NUMERATOR) * mult), 0); + } + } + if (oxcf->rc_mode == AOM_CQ && rc->total_target_bits > 0) { + const double x = (double)rc->total_actual_bits / rc->total_target_bits; + if (x < cq_adjust_threshold) { + active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold); + } + } + return active_cq_level; +} + +static int get_q_using_fixed_offsets(const AV1EncoderConfig *const oxcf, + const RATE_CONTROL *const rc, + const GF_GROUP *const gf_group, + int gf_index, int cq_level, + int bit_depth) { + assert(oxcf->use_fixed_qp_offsets); + assert(oxcf->rc_mode == AOM_Q); + const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_index]; + + int offset_idx = -1; + if (update_type == KF_UPDATE) { + if (rc->frames_to_key == 1) { + // Image / intra-only coding: ignore offsets. + return cq_level; + } + offset_idx = 0; + } else if (update_type == ARF_UPDATE || update_type == GF_UPDATE) { + offset_idx = 1; + } else if (update_type == INTNL_ARF_UPDATE) { + offset_idx = + AOMMIN(gf_group->layer_depth[gf_index], FIXED_QP_OFFSET_COUNT - 1); + } else { // Leaf level / overlay frame. + assert(update_type == LF_UPDATE || update_type == OVERLAY_UPDATE || + update_type == INTNL_OVERLAY_UPDATE); + return cq_level; // Directly Return worst quality allowed. 
+ }
+ assert(offset_idx >= 0 && offset_idx < FIXED_QP_OFFSET_COUNT);
+ assert(oxcf->fixed_qp_offsets[offset_idx] >= 0);
+
+ // Get qindex offset, by first converting to 'q' and then back.
+ const double q_val_orig = av1_convert_qindex_to_q(cq_level, bit_depth);
+ const double q_val_target =
+ AOMMAX(q_val_orig - oxcf->fixed_qp_offsets[offset_idx], 0.0);
+ const int delta_qindex =
+ av1_compute_qdelta(rc, q_val_orig, q_val_target, bit_depth);
+ return AOMMAX(cq_level + delta_qindex, 0);
+}
+
+static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int cq_level =
+ get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
+ cm->superres_scale_denominator);
+ const int bit_depth = cm->seq_params.bit_depth;
+
+ if (oxcf->use_fixed_qp_offsets) {
+ return get_q_using_fixed_offsets(oxcf, rc, &cpi->gf_group,
+ cpi->gf_group.index, cq_level, bit_depth);
+ }
+
+ int active_best_quality;
+ int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
+ int q;
+ int *inter_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+
+ if (frame_is_intra_only(cm)) {
+ if (oxcf->rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex =
+ av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (rc->this_key_frame_forced) {
+ const int qindex = rc->last_boosted_qindex;
+ const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex = av1_compute_qdelta(
+ rc, last_boosted_q, last_boosted_q * 0.75, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else { // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+
+ active_best_quality =
+ get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Convert the adjustment factor to a qindex delta on active_best_quality.
+ {
+ const double q_val =
+ av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ }
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ q = (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+ ? rc->avg_frame_qindex[INTER_FRAME]
+ : rc->avg_frame_qindex[KEY_FRAME];
+ // For constrained quality don't allow Q less than the cq level
+ if (oxcf->rc_mode == AOM_CQ) {
+ if (q < cq_level) q = cq_level;
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+ // Constrained quality uses a slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16;
+ } else if (oxcf->rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex =
+ (cpi->refresh_alt_ref_frame)
+ ?
av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth) + : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else { + active_best_quality = get_gf_active_quality(rc, q, bit_depth); + } + } else { + if (oxcf->rc_mode == AOM_Q) { + const int qindex = cq_level; + const double q_val = av1_convert_qindex_to_q(qindex, bit_depth); + const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0, + 0.70, 1.0, 0.85, 1.0 }; + const int delta_qindex = av1_compute_qdelta( + rc, q_val, + q_val * delta_rate[current_frame->frame_number % FIXED_GF_INTERVAL], + bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else { + // Use the lower of active_worst_quality and recent/average Q. + active_best_quality = (current_frame->frame_number > 1) + ? inter_minq[rc->avg_frame_qindex[INTER_FRAME]] + : inter_minq[rc->avg_frame_qindex[KEY_FRAME]]; + // For the constrained quality mode we don't want + // q to fall below the cq level. + if ((oxcf->rc_mode == AOM_CQ) && (active_best_quality < cq_level)) { + active_best_quality = cq_level; + } + } + } + + // Clip the active best and worst quality values to limits + active_best_quality = + clamp(active_best_quality, rc->best_quality, rc->worst_quality); + active_worst_quality = + clamp(active_worst_quality, active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + // Limit Q range for the adaptive loop. + { + int qdelta = 0; + aom_clear_system_state(); + if (current_frame->frame_type == KEY_FRAME && !rc->this_key_frame_forced && + !(current_frame->frame_number == 0)) { + qdelta = av1_compute_qdelta_by_rate(&cpi->rc, current_frame->frame_type, + active_worst_quality, 2.0, bit_depth); + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + qdelta = + av1_compute_qdelta_by_rate(&cpi->rc, current_frame->frame_type, + active_worst_quality, 1.75, bit_depth); + } + *top_index = active_worst_quality + qdelta; + *top_index = AOMMAX(*top_index, *bottom_index); + } + + if (oxcf->rc_mode == AOM_Q) { + q = active_best_quality; + // Special case code to try and match quality with forced key frames + } else if ((current_frame->frame_type == KEY_FRAME) && + rc->this_key_frame_forced) { + q = rc->last_boosted_qindex; + } else { + q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, + active_worst_quality, width, height); + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (rc->this_frame_target >= rc->max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + } + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = { + 1.00, // INTER_NORMAL + 1.50, // GF_ARF_LOW + 2.00, // GF_ARF_STD + 2.00, // KF_STD +}; + +int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) { + const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group); + const FRAME_TYPE frame_type = (rf_lvl == KF_STD) ? 
KEY_FRAME : INTER_FRAME;
+ double rate_factor;
+
+ rate_factor = rate_factor_deltas[rf_lvl];
+ if (rf_lvl == GF_ARF_LOW) {
+ rate_factor -= (cpi->gf_group.layer_depth[cpi->gf_group.index] - 2) * 0.1;
+ rate_factor = AOMMAX(rate_factor, 1.0);
+ }
+ return av1_compute_qdelta_by_rate(&cpi->rc, frame_type, q, rate_factor,
+ cpi->common.seq_params.bit_depth);
+}
+
+// This unrestricted Q selection in CQ mode is useful when testing new features,
+// but may lead to Q being out of range under current RC restrictions.
+#if USE_UNRESTRICTED_Q_IN_CQ_MODE
+static int rc_pick_q_and_bounds_one_pass_cq(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int cq_level =
+ get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
+ cm->superres_scale_denominator);
+ const int bit_depth = cm->seq_params.bit_depth;
+ const int q = (int)av1_convert_qindex_to_q(cq_level, bit_depth);
+ (void)width;
+ (void)height;
+ *top_index = q;
+ *bottom_index = q;
+
+ return q;
+}
+#endif // USE_UNRESTRICTED_Q_IN_CQ_MODE
+
+#define STATIC_MOTION_THRESH 95
+static void get_intra_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
+ int height, int *active_best,
+ int *active_worst, int cq_level,
+ int is_fwd_kf) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ int active_best_quality;
+ int active_worst_quality = *active_worst;
+ const int bit_depth = cm->seq_params.bit_depth;
+
+ if (rc->frames_to_key == 1 && oxcf->rc_mode == AOM_Q) {
+ // If the next frame is also a key frame or the current frame is the
+ // only frame in the sequence in AOM_Q mode, just use the cq_level
+ // as q.
+ active_best_quality = cq_level;
+ active_worst_quality = cq_level;
+ } else if (is_fwd_kf) {
+ // Handle the special case for forward reference key frames.
+ // Increase the boost because this keyframe is used as a forward and
+ // backward reference.
+ const int qindex = rc->last_boosted_qindex;
+ const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex = av1_compute_qdelta(
+ rc, last_boosted_q, last_boosted_q * 0.25, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (rc->this_key_frame_forced) {
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ double last_boosted_q;
+ int delta_qindex;
+ int qindex;
+
+ if (is_stat_consumption_stage_twopass(cpi) &&
+ cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ qindex = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+ active_best_quality = qindex;
+ last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 1.25, bit_depth);
+ active_worst_quality =
+ AOMMIN(qindex + delta_qindex, active_worst_quality);
+ } else {
+ qindex = rc->last_boosted_qindex;
+ last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 0.50, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ }
+ } else {
+ // Not forced keyframe.
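+ // Sketch of what follows: start from a kf-boost based baseline, cut it
+ // for near-static kf groups, then apply a q_adj_factor that accounts
+ // for small image formats and the kf zero-motion measure.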
+ double q_adj_factor = 1.0;
+ double q_val;
+
+ // Baseline value derived from cpi->active_worst_quality and kf boost.
+ active_best_quality =
+ get_kf_active_quality(rc, active_worst_quality, bit_depth);
+
+ if (is_stat_consumption_stage_twopass(cpi) &&
+ cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
+ active_best_quality /= 3;
+ }
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Make a further adjustment based on the kf zero motion measure.
+ if (is_stat_consumption_stage_twopass(cpi))
+ q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+
+ // Tweak active_best_quality for AOM_Q mode when superres is on, as this
+ // will be used directly as 'q' later.
+ if (oxcf->rc_mode == AOM_Q &&
+ (cpi->superres_mode == SUPERRES_QTHRESH ||
+ cpi->superres_mode == SUPERRES_AUTO) &&
+ cm->superres_scale_denominator != SCALE_NUMERATOR) {
+ active_best_quality =
+ AOMMAX(active_best_quality -
+ ((cm->superres_scale_denominator - SCALE_NUMERATOR) *
+ SUPERRES_QADJ_PER_DENOM_KEYFRAME),
+ 0);
+ }
+ }
+ *active_best = active_best_quality;
+ *active_worst = active_worst_quality;
+}
+
+static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi,
+ const int is_intrl_arf_boost,
+ int *active_worst,
+ int *active_best) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const int bit_depth = cpi->common.seq_params.bit_depth;
+ int active_best_quality = *active_best;
+ int active_worst_quality = *active_worst;
+ // Extension to max or min Q if undershoot or overshoot is outside
+ // the permitted range.
+ if (cpi->oxcf.rc_mode != AOM_Q) {
+ if (frame_is_intra_only(cm) ||
+ (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || is_intrl_arf_boost ||
+ cpi->refresh_alt_ref_frame))) {
+ active_best_quality -=
+ (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
+ active_worst_quality += (cpi->twopass.extend_maxq / 2);
+ } else {
+ active_best_quality -=
+ (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2;
+ active_worst_quality += cpi->twopass.extend_maxq;
+ }
+ }
+
+ aom_clear_system_state();
+#ifndef STRICT_RC
+ // Static forced key frames Q restrictions dealt with elsewhere.
+ if (!(frame_is_intra_only(cm)) || !rc->this_key_frame_forced ||
+ (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
+ const int qdelta = av1_frame_type_qdelta(cpi, active_worst_quality);
+ active_worst_quality =
+ AOMMAX(active_worst_quality + qdelta, active_best_quality);
+ }
+#endif
+
+ // Modify active_best_quality for downscaled normal frames.
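+ // A rate ratio of 2.0 is used below, so the floor moves to the qindex
+ // that spends roughly twice the bits per mb, clamped at best_quality.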
+ if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) {
+ int qdelta = av1_compute_qdelta_by_rate(
+ rc, cm->current_frame.frame_type, active_best_quality, 2.0, bit_depth);
+ active_best_quality =
+ AOMMAX(active_best_quality + qdelta, rc->best_quality);
+ }
+
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *active_best = active_best_quality;
+ *active_worst = active_worst_quality;
+}
+
+static int get_q(const AV1_COMP *cpi, const int width, const int height,
+ const int active_worst_quality,
+ const int active_best_quality) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int q;
+
+ if (cpi->oxcf.rc_mode == AOM_Q ||
+ (frame_is_intra_only(cm) && !rc->this_key_frame_forced &&
+ cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH &&
+ rc->frames_to_key > 1)) {
+ q = active_best_quality;
+ // Special case code to try and match quality with forced key frames.
+ } else if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
+ // If static since last kf use better of last boosted and last kf q.
+ if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ q = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+ } else {
+ q = AOMMIN(rc->last_boosted_qindex,
+ (active_best_quality + active_worst_quality) / 2);
+ }
+ q = clamp(q, active_best_quality, active_worst_quality);
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
+ if (q > active_worst_quality) {
+ // Special case when we are targeting the max allowed rate.
+ if (rc->this_frame_target < rc->max_frame_bandwidth) {
+ q = active_worst_quality;
+ }
+ }
+ q = AOMMAX(q, active_best_quality);
+ }
+ return q;
+}
+
+// Returns |active_best_quality| for an inter frame.
+// The |active_best_quality| depends on different rate control modes:
+// VBR, Q, CQ, CBR.
+// The returned active_best_quality may be further adjusted in
+// adjust_active_best_and_worst_quality().
+static int get_active_best_quality(const AV1_COMP *const cpi,
+ const int active_worst_quality,
+ const int cq_level, const int gf_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bit_depth = cm->seq_params.bit_depth;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const GF_GROUP *gf_group = &cpi->gf_group;
+ const int rc_mode = oxcf->rc_mode;
+ int *inter_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+ int active_best_quality = 0;
+ const int is_intrl_arf_boost =
+ gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
+ const int is_leaf_frame = !(cpi->refresh_golden_frame ||
+ cpi->refresh_alt_ref_frame || is_intrl_arf_boost);
+ const int is_overlay_frame = rc->is_src_frame_alt_ref;
+
+ if (is_leaf_frame || is_overlay_frame) {
+ if (rc_mode == AOM_Q) return cq_level;
+
+ active_best_quality = inter_minq[active_worst_quality];
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+ active_best_quality = cq_level;
+ }
+ return active_best_quality;
+ }
+
+ // TODO(chengchen): can we remove this condition?
+ if (rc_mode == AOM_Q && !cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) {
+ return cq_level;
+ }
+
+ // Determine active_best_quality for frames that are not leaf or overlay.
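+ // Outline: base q is the lower of active_worst_quality and the recent
+ // inter average, gf boost then maps it through the minq tables, and
+ // arf_boost_factor interpolates toward the high-motion floor; internal
+ // ARFs are further averaged toward active_worst_quality per pyramid
+ // level.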
+ int q = active_worst_quality;
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ if (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = rc->avg_frame_qindex[INTER_FRAME];
+ }
+ if (rc_mode == AOM_CQ && q < cq_level) q = cq_level;
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+ // Constrained quality uses a slightly lower active best.
+ if (rc_mode == AOM_CQ) active_best_quality = active_best_quality * 15 / 16;
+ const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+ const int boost = min_boost - active_best_quality;
+ active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+ if (!is_intrl_arf_boost) return active_best_quality;
+
+ if (rc_mode == AOM_Q || rc_mode == AOM_CQ) active_best_quality = rc->arf_q;
+ int this_height = gf_group_pyramid_level(gf_group, gf_index);
+ while (this_height > 1) {
+ active_best_quality = (active_best_quality + active_worst_quality + 1) / 2;
+ --this_height;
+ }
+ return active_best_quality;
+}
+
+static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
+ int height, int gf_index,
+ int *bottom_index, int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const GF_GROUP *gf_group = &cpi->gf_group;
+ const int cq_level =
+ get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
+ cm->superres_scale_denominator);
+ const int bit_depth = cm->seq_params.bit_depth;
+
+ if (oxcf->use_fixed_qp_offsets) {
+ return get_q_using_fixed_offsets(oxcf, rc, gf_group, gf_group->index,
+ cq_level, bit_depth);
+ }
+
+ int active_best_quality = 0;
+ int active_worst_quality = rc->active_worst_quality;
+ int q;
+
+ const int is_intrl_arf_boost =
+ gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
+
+ if (frame_is_intra_only(cm)) {
+ const int is_fwd_kf =
+ cm->current_frame.frame_type == KEY_FRAME && cm->show_frame == 0;
+ get_intra_q_and_bounds_two_pass(cpi, width, height, &active_best_quality,
+ &active_worst_quality, cq_level, is_fwd_kf);
+#ifdef STRICT_RC
+ active_best_quality = 0;
+#endif
+ } else {
+#ifdef STRICT_RC
+ // Active best quality limited by previous layer.
+ const int pyramid_level = gf_group_pyramid_level(gf_group, gf_index);
+ active_best_quality =
+ rc->active_best_quality[pyramid_level - 1] +
+ AOMMAX((rc->active_best_quality[pyramid_level - 1] / 10), 5);
+#else
+ active_best_quality =
+ get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index);
+#endif
+
+ // For alt_ref and GF frames (including internal arf frames) adjust the
+ // worst allowed quality as well. This ensures that even on hard
+ // sections we don't clamp the Q at the same value for arf frames and
+ // leaf (non arf) frames. This is important to the TPL model which assumes
+ // Q drops with each arf level.
+ if (!(rc->is_src_frame_alt_ref) &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame ||
+ is_intrl_arf_boost)) {
+ active_worst_quality =
+ (active_best_quality + (3 * active_worst_quality) + 2) / 4;
+ }
+ }
+
+ adjust_active_best_and_worst_quality(
+ cpi, is_intrl_arf_boost, &active_worst_quality, &active_best_quality);
+ q = get_q(cpi, width, height, active_worst_quality, active_best_quality);
+
+ // Special case when we are targeting the max allowed rate.
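+ // That is, when this_frame_target has been clipped to
+ // max_frame_bandwidth, the chosen q may stand above the normal worst
+ // bound rather than being clamped back down.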
+ if (rc->this_frame_target >= rc->max_frame_bandwidth &&
+ q > active_worst_quality) {
+ active_worst_quality = q;
+ }
+
+#ifdef STRICT_RC
+ *top_index = rc->worst_quality;
+#else
+ *top_index = active_worst_quality;
+#endif
+ *bottom_index = active_best_quality;
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+
+ return q;
+}
+
+int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, RATE_CONTROL *rc, int width,
+ int height, int gf_index, int *bottom_index,
+ int *top_index) {
+ int q;
+ // TODO(sarahparker) merge onepass vbr and altref q computation
+ // with two pass
+ const GF_GROUP *gf_group = &cpi->gf_group;
+ if ((cpi->oxcf.rc_mode != AOM_Q ||
+ gf_group->update_type[gf_index] == ARF_UPDATE) &&
+ has_no_stats_stage(cpi)) {
+ if (cpi->oxcf.rc_mode == AOM_CBR)
+ q = rc_pick_q_and_bounds_one_pass_cbr(cpi, width, height, bottom_index,
+ top_index);
+#if USE_UNRESTRICTED_Q_IN_CQ_MODE
+ else if (cpi->oxcf.rc_mode == AOM_CQ)
+ q = rc_pick_q_and_bounds_one_pass_cq(cpi, width, height, bottom_index,
+ top_index);
+#endif // USE_UNRESTRICTED_Q_IN_CQ_MODE
+ else
+ q = rc_pick_q_and_bounds_one_pass_vbr(cpi, width, height, bottom_index,
+ top_index);
+ } else {
+ q = rc_pick_q_and_bounds_two_pass(cpi, width, height, gf_index,
+ bottom_index, top_index);
+ }
+ if (gf_group->update_type[gf_index] == ARF_UPDATE) rc->arf_q = q;
+
+ return q;
+}
+
+void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit) {
+ if (cpi->oxcf.rc_mode == AOM_Q) {
+ *frame_under_shoot_limit = 0;
+ *frame_over_shoot_limit = INT_MAX;
+ } else {
+ // For very small rate targets where the fractional adjustment
+ // may be tiny make sure there is at least a minimum range.
+ const int tolerance =
+ AOMMAX(100, (cpi->sf.hl_sf.recode_tolerance * frame_target) / 100);
+ *frame_under_shoot_limit = AOMMAX(frame_target - tolerance, 0);
+ *frame_over_shoot_limit =
+ AOMMIN(frame_target + tolerance, cpi->rc.max_frame_bandwidth);
+ }
+}
+
+void av1_rc_set_frame_target(AV1_COMP *cpi, int target, int width, int height) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ rc->this_frame_target = target;
+
+ // Modify frame size target when down-scaled.
+ if (av1_frame_scaled(cm))
+ rc->this_frame_target =
+ (int)(rc->this_frame_target * resize_rate_factor(cpi, width, height));
+
+ // Target rate per SB64 (including partial SB64s).
+ rc->sb64_target_rate =
+ (int)(((int64_t)rc->this_frame_target << 12) / (width * height));
+}
+
+static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
+ // This frame refreshes, meaning the next frames don't unless specified
+ // by the user.
+ RATE_CONTROL *const rc = &cpi->rc;
+ rc->frames_since_golden = 0;
+
+ // Mark the alt ref as done (setting to 0 means no further alt refs pending).
+ rc->source_alt_ref_pending = 0;
+
+ // Set the alternate reference frame active flag.
+ rc->source_alt_ref_active = 1;
+}
+
+static void update_golden_frame_stats(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ const GF_GROUP *const gf_group = &cpi->gf_group;
+
+ // Update the Golden frame usage counts.
+ if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) {
+ rc->frames_since_golden = 0;
+
+ // If we are not using alt ref in the upcoming group, clear the arf
+ // active flag. In the multi arf group case, if the index is not 0 then
+ // we are overlaying a mid group arf, so we should not reset the flag.
+ if (!rc->source_alt_ref_pending && (gf_group->index == 0))
+ rc->source_alt_ref_active = 0;
+ } else if (cpi->common.show_frame) {
+ rc->frames_since_golden++;
+ }
+}
+
+void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const GF_GROUP *const gf_group = &cpi->gf_group;
+
+ const int is_intrnl_arf =
+ gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE;
+
+ const int qindex = cm->quant_params.base_qindex;
+
+ // Update rate control heuristics
+ rc->projected_frame_size = (int)(bytes_used << 3);
+
+ // Post encode loop adjustment of Q prediction.
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+
+ // Keep a record of last Q and ambient average Q.
+ if (current_frame->frame_type == KEY_FRAME) {
+ rc->last_q[KEY_FRAME] = qindex;
+ rc->avg_frame_qindex[KEY_FRAME] =
+ ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
+ } else {
+ if ((cpi->use_svc && cpi->oxcf.rc_mode == AOM_CBR) ||
+ (!rc->is_src_frame_alt_ref &&
+ !(cpi->refresh_golden_frame || is_intrnl_arf ||
+ cpi->refresh_alt_ref_frame))) {
+ rc->last_q[INTER_FRAME] = qindex;
+ rc->avg_frame_qindex[INTER_FRAME] =
+ ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
+ rc->ni_frames++;
+ rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params.bit_depth);
+ rc->avg_q = rc->tot_q / rc->ni_frames;
+ // Calculate the average Q for normal inter frames (not key or GFU
+ // frames).
+ rc->ni_tot_qi += qindex;
+ rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+ }
+ }
+
+ // Keep a record of the last boosted (KF/GF/ARF) Q value.
+ // If the current frame is coded at a lower Q then we also update it.
+ // If all mbs in this group are skipped only update if the Q value is
+ // better than that already stored.
+ // This is used to help set quality in forced key frames to reduce popping.
+ if ((qindex < rc->last_boosted_qindex) ||
+ (current_frame->frame_type == KEY_FRAME) ||
+ (!rc->constrained_gf_group &&
+ (cpi->refresh_alt_ref_frame || is_intrnl_arf ||
+ (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
+ rc->last_boosted_qindex = qindex;
+ }
+ if (current_frame->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex;
+
+ update_buffer_level(cpi, rc->projected_frame_size);
+ rc->prev_avg_frame_bandwidth = rc->avg_frame_bandwidth;
+
+ // Rolling monitors of whether we are over- or under-spending, used to help
+ // regulate min and max Q in two pass.
+ if (av1_frame_scaled(cm))
+ rc->this_frame_target =
+ (int)(rc->this_frame_target /
+ resize_rate_factor(cpi, cm->width, cm->height));
+ if (current_frame->frame_type != KEY_FRAME) {
+ rc->rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
+ rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+ rc->rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
+ rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+ rc->long_rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
+ rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
+ rc->long_rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
+ rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5);
+ }
+
+ // Actual bits spent
+ rc->total_actual_bits += rc->projected_frame_size;
+ rc->total_target_bits += cm->show_frame ?
rc->avg_frame_bandwidth : 0; + + rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits; + + if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame && + (current_frame->frame_type != KEY_FRAME)) + // Update the alternate reference frame stats as appropriate. + update_alt_ref_frame_stats(cpi); + else + // Update the Golden frame stats as appropriate. + update_golden_frame_stats(cpi); + + if (current_frame->frame_type == KEY_FRAME) rc->frames_since_key = 0; + // if (current_frame->frame_number == 1 && cm->show_frame) + /* + rc->this_frame_target = + (int)(rc->this_frame_target / resize_rate_factor(cpi, cm->width, + cm->height)); + */ +} + +void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) { + // Update buffer level with zero size, update frame counters, and return. + update_buffer_level(cpi, 0); + cpi->rc.frames_since_key++; + cpi->rc.frames_to_key--; + cpi->rc.rc_2_frame = 0; + cpi->rc.rc_1_frame = 0; +} + +int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth, + int best_qindex, int worst_qindex) { + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + while (low < high) { + const int mid = (low + high) >> 1; + const double mid_q = av1_convert_qindex_to_q(mid, bit_depth); + if (mid_q < desired_q) { + low = mid + 1; + } else { + high = mid; + } + } + assert(low == high); + assert(av1_convert_qindex_to_q(low, bit_depth) >= desired_q || + low == worst_qindex); + return low; +} + +int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, + aom_bit_depth_t bit_depth) { + const int start_index = + av1_find_qindex(qstart, bit_depth, rc->best_quality, rc->worst_quality); + const int target_index = + av1_find_qindex(qtarget, bit_depth, rc->best_quality, rc->worst_quality); + return target_index - start_index; +} + +// Find q_index for the desired_bits_per_mb, within [best_qindex, worst_qindex], +// assuming 'correction_factor' is 1.0. +// To be precise, 'q_index' is the smallest integer, for which the corresponding +// bits per mb <= desired_bits_per_mb. +// If no such q index is found, returns 'worst_qindex'. +static int find_qindex_by_rate(int desired_bits_per_mb, + aom_bit_depth_t bit_depth, FRAME_TYPE frame_type, + int best_qindex, int worst_qindex) { + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + while (low < high) { + const int mid = (low + high) >> 1; + const int mid_bits_per_mb = + av1_rc_bits_per_mb(frame_type, mid, 1.0, bit_depth); + if (mid_bits_per_mb > desired_bits_per_mb) { + low = mid + 1; + } else { + high = mid; + } + } + assert(low == high); + assert(av1_rc_bits_per_mb(frame_type, low, 1.0, bit_depth) <= + desired_bits_per_mb || + low == worst_qindex); + return low; +} + +int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, + int qindex, double rate_target_ratio, + aom_bit_depth_t bit_depth) { + // Look up the current projected bits per block for the base index + const int base_bits_per_mb = + av1_rc_bits_per_mb(frame_type, qindex, 1.0, bit_depth); + + // Find the target bits per mb based on the base value and given ratio. 
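+ // Worked example (hypothetical numbers): base_bits_per_mb == 400 and
+ // rate_target_ratio == 2.0 give target_bits_per_mb == 800; the search
+ // below then reports how far qindex must move to reach that rate.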
+ const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
+
+ const int target_index =
+ find_qindex_by_rate(target_bits_per_mb, bit_depth, frame_type,
+ rc->best_quality, rc->worst_quality);
+ return target_index - qindex;
+}
+
+void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
+ RATE_CONTROL *const rc) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ // Special case code for 1 pass fixed Q mode tests
+ if ((has_no_stats_stage(cpi)) && (oxcf->rc_mode == AOM_Q)) {
+ rc->max_gf_interval = FIXED_GF_INTERVAL;
+ rc->min_gf_interval = FIXED_GF_INTERVAL;
+ rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
+ } else {
+ // Set maximum gf/arf interval.
+ rc->max_gf_interval = oxcf->max_gf_interval;
+ rc->min_gf_interval = oxcf->min_gf_interval;
+ if (rc->min_gf_interval == 0)
+ rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+ oxcf->width, oxcf->height, cpi->framerate);
+ if (rc->max_gf_interval == 0)
+ rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+ cpi->framerate, rc->min_gf_interval);
+ /*
+ * Extended max interval for genuinely static scenes like slide shows.
+ * The number of stats available in the case of LAP is limited,
+ * hence setting to max_gf_interval.
+ */
+ if (cpi->lap_enabled)
+ rc->static_scene_max_gf_interval = rc->max_gf_interval + 1;
+ else
+ rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
+
+ if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
+ rc->max_gf_interval = rc->static_scene_max_gf_interval;
+
+ // Clamp min to max
+ rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval);
+ }
+}
+
+void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int vbr_max_bits;
+ const int MBs = av1_get_MBs(width, height);
+
+ rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
+ rc->min_frame_bandwidth =
+ (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);
+
+ rc->min_frame_bandwidth =
+ AOMMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+
+ // A maximum bitrate for a frame is defined.
+ // The baseline for this aligns with HW implementations that
+ // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
+ // per 16x16 MB (averaged over a frame). However, this limit is extended if
+ // a very high rate is given on the command line or if the rate cannot
+ // be achieved because of a user-specified max q (e.g. when the user
+ // specifies a lossless encode).
+ vbr_max_bits =
+ (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) /
+ 100);
+ rc->max_frame_bandwidth =
+ AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+
+ av1_rc_set_gf_interval_range(cpi, rc);
+}
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
+// For VBR: adjustment to the frame target based on error from previous frames.
+static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int64_t vbr_bits_off_target = rc->vbr_bits_off_target;
+ const int stats_count =
+ cpi->twopass.stats_buf_ctx->total_stats != NULL
+ ?
(int)cpi->twopass.stats_buf_ctx->total_stats->count
+ : 0;
+ const int frame_window = AOMMIN(
+ 16, (int)(stats_count - (int)cpi->common.current_frame.frame_number));
+
+ if (frame_window > 0) {
+ const int max_delta =
+ AOMMIN(abs((int)(vbr_bits_off_target / frame_window)),
+ (*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100);
+
+ // vbr_bits_off_target > 0 means we have extra bits to spend;
+ // vbr_bits_off_target < 0 means we are currently overshooting.
+ *this_frame_target += (vbr_bits_off_target >= 0) ? max_delta : -max_delta;
+ }
+
+ // Fast redistribution of bits arising from massive local undershoot.
+ // Don't do it for kf, arf, gf or overlay frames.
+ if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
+ rc->vbr_bits_off_target_fast) {
+ int one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, *this_frame_target);
+ int fast_extra_bits;
+ fast_extra_bits = (int)AOMMIN(rc->vbr_bits_off_target_fast, one_frame_bits);
+ fast_extra_bits = (int)AOMMIN(
+ fast_extra_bits,
+ AOMMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
+ *this_frame_target += (int)fast_extra_bits;
+ rc->vbr_bits_off_target_fast -= fast_extra_bits;
+ }
+}
+
+void av1_set_target_rate(AV1_COMP *cpi, int width, int height) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target_rate = rc->base_frame_target;
+
+ // Correction to rate target based on prior over- or under-shoot.
+ if (cpi->oxcf.rc_mode == AOM_VBR || cpi->oxcf.rc_mode == AOM_CQ)
+ vbr_rate_correction(cpi, &target_rate);
+ av1_rc_set_frame_target(cpi, target_rate, width, height);
+}
+
+int av1_calc_pframe_target_size_one_pass_vbr(
+ const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) {
+ static const int af_ratio = 10;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int64_t target;
+#if USE_ALTREF_FOR_ONE_PASS
+ if (frame_update_type == KF_UPDATE || frame_update_type == GF_UPDATE ||
+ frame_update_type == ARF_UPDATE) {
+ target = ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval *
+ af_ratio) /
+ (rc->baseline_gf_interval + af_ratio - 1);
+ } else {
+ target = ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
+ (rc->baseline_gf_interval + af_ratio - 1);
+ }
+ if (target > INT_MAX) target = INT_MAX;
+#else
+ target = rc->avg_frame_bandwidth;
+#endif
+ return av1_rc_clamp_pframe_target_size(cpi, (int)target, frame_update_type);
+}
+
+int av1_calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
+ static const int kf_ratio = 25;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const int target = rc->avg_frame_bandwidth * kf_ratio;
+ return av1_rc_clamp_iframe_target_size(cpi, target);
+}
+
+int av1_calc_pframe_target_size_one_pass_cbr(
+ const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) {
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
+ const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
+ int min_frame_target =
+ AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
+ int target;
+
+ if (oxcf->gf_cbr_boost_pct) {
+ const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
+ if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) {
+ target =
+ (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) /
+ (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+ } else {
+ target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
+ (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+ }
+ } else {
+ target = rc->avg_frame_bandwidth;
+ }
+ if
+  if (cpi->use_svc) {
+    // Note that for layers, avg_frame_bandwidth is the cumulative
+    // per-frame-bandwidth. For the target size of this frame, use the
+    // layer average frame size (i.e., non-cumulative per-frame-bw).
+    int layer =
+        LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id,
+                         cpi->svc.number_temporal_layers);
+    const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+    target = lc->avg_frame_size;
+    min_frame_target = AOMMAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS);
+  }
+  if (diff > 0) {
+    // Lower the target bandwidth for this frame.
+    const int pct_low = (int)AOMMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+    target -= (target * pct_low) / 200;
+  } else if (diff < 0) {
+    // Increase the target bandwidth for this frame.
+    const int pct_high =
+        (int)AOMMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+    target += (target * pct_high) / 200;
+  }
+  if (oxcf->rc_max_inter_bitrate_pct) {
+    const int max_rate =
+        rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+    target = AOMMIN(target, max_rate);
+  }
+  return AOMMAX(min_frame_target, target);
+}
+
+int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  int target;
+  if (cpi->common.current_frame.frame_number == 0) {
+    target = ((rc->starting_buffer_level / 2) > INT_MAX)
+                 ? INT_MAX
+                 : (int)(rc->starting_buffer_level / 2);
+  } else {
+    int kf_boost = 32;
+    double framerate = cpi->framerate;
+
+    kf_boost = AOMMAX(kf_boost, (int)(2 * framerate - 16));
+    if (rc->frames_since_key < framerate / 2) {
+      kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2));
+    }
+    target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
+  }
+  return av1_rc_clamp_iframe_target_size(cpi, target);
+}
+
+static void set_reference_structure_one_pass_rt(AV1_COMP *cpi, int gf_update) {
+  AV1_COMMON *const cm = &cpi->common;
+  ExternalFlags *const ext_flags = &cpi->ext_flags;
+  SVC *const svc = &cpi->svc;
+  // Specify the reference prediction structure, for 1 layer nonrd mode.
+  // Current structure is to use 3 references (LAST, GOLDEN, ALTREF),
+  // where ALT_REF is always behind current by lag_alt frames, and GOLDEN is
+  // either updated on LAST with period baseline_gf_interval (fixed slot)
+  // or always behind current by lag_gld (gld_fixed_slot = 0, lag_gld <= 7).
+  const int gld_fixed_slot = 1;
+  const unsigned int lag_alt = 4;
+  int last_idx = 0;
+  int last_idx_refresh = 0;
+  int gld_idx = 0;
+  int alt_ref_idx = 0;
+  ext_flags->refresh_frame_flags_pending = 1;
+  svc->external_ref_frame_config = 1;
+  ext_flags->ref_frame_flags = 0;
+  ext_flags->refresh_last_frame = 1;
+  ext_flags->refresh_golden_frame = 0;
+  ext_flags->refresh_alt_ref_frame = 0;
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) svc->ref_idx[i] = 7;
+  for (int i = 0; i < REF_FRAMES; ++i) svc->refresh[i] = 0;
+  // Always reference LAST, GOLDEN, ALTREF
+  ext_flags->ref_frame_flags ^= AOM_LAST_FLAG;
+  ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
+  ext_flags->ref_frame_flags ^= AOM_ALT_FLAG;
+  const int sh = 7 - gld_fixed_slot;
+  // Moving index slot for last: 0 - (sh - 1).
+  if (cm->current_frame.frame_number > 1)
+    last_idx = ((cm->current_frame.frame_number - 1) % sh);
+  // Moving index for refresh of last: one ahead for next frame.
+  last_idx_refresh = (cm->current_frame.frame_number % sh);
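With gld_fixed_slot = 1 we get sh = 6, so slots 0..5 rotate between LAST and its refresh target, GOLDEN stays parked in slot 6, and ALT_REF trails the current frame by lag_alt. A throwaway sketch that prints the rotation for the first few frames, using the same modulo arithmetic as the code; illustrative only, not part of the encoder:

#include <stdio.h>

int main(void) {
  const int sh = 6;               // 7 - gld_fixed_slot
  const unsigned int lag_alt = 4;
  for (unsigned int frame = 1; frame <= 8; ++frame) {
    int last_idx = (frame > 1) ? (int)((frame - 1) % sh) : 0;
    int last_idx_refresh = (int)(frame % sh);
    int alt_ref_idx = (frame > lag_alt) ? (int)((frame - lag_alt) % sh) : 0;
    printf("frame %u: LAST=%d refresh->%d ALT_REF=%d GOLDEN=6\n", frame,
           last_idx, last_idx_refresh, alt_ref_idx);
  }
  return 0;
}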
+  gld_idx = 6;
+  if (!gld_fixed_slot) {
+    gld_idx = 7;
+    const unsigned int lag_gld = 7;  // Must be <= 7.
+    // Moving index for gld_ref, lag behind current by gld_interval frames.
+    if (cm->current_frame.frame_number > lag_gld)
+      gld_idx = ((cm->current_frame.frame_number - lag_gld) % sh);
+  }
+  // Moving index for alt_ref, lag behind LAST by lag_alt frames.
+  if (cm->current_frame.frame_number > lag_alt)
+    alt_ref_idx = ((cm->current_frame.frame_number - lag_alt) % sh);
+  svc->ref_idx[0] = last_idx;          // LAST
+  svc->ref_idx[1] = last_idx_refresh;  // LAST2 (for refresh of last).
+  svc->ref_idx[3] = gld_idx;           // GOLDEN
+  svc->ref_idx[6] = alt_ref_idx;       // ALT_REF
+  // Refresh this slot, which will become LAST on next frame.
+  svc->refresh[last_idx_refresh] = 1;
+  // Update GOLDEN on period for fixed slot case.
+  if (gld_fixed_slot && gf_update) {
+    ext_flags->refresh_golden_frame = 1;
+    svc->refresh[gld_idx] = 1;
+  }
+}
+
+#define DEFAULT_KF_BOOST_RT 2300
+#define DEFAULT_GF_BOOST_RT 2000
+
+void av1_get_one_pass_rt_params(AV1_COMP *cpi,
+                                EncodeFrameParams *const frame_params,
+                                unsigned int frame_flags) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  AV1_COMMON *const cm = &cpi->common;
+  GF_GROUP *const gf_group = &cpi->gf_group;
+  ResizePendingParams *const resize_pending_params =
+      &cpi->resize_pending_params;
+  int gf_update = 0;
+  int target;
+  const int resize_pending =
+      (resize_pending_params->width && resize_pending_params->height &&
+       (cm->width != resize_pending_params->width ||
+        cm->height != resize_pending_params->height));
+  // Turn this on to explicitly set the reference structure rather than
+  // relying on internal/default structure.
+  const int set_reference_structure = 1;
+  if (cpi->use_svc) {
+    av1_update_temporal_layer_framerate(cpi);
+    av1_restore_layer_context(cpi);
+  }
+  if ((!cpi->use_svc && rc->frames_to_key == 0) ||
+      (cpi->use_svc && cpi->svc.spatial_layer_id == 0 &&
+       cpi->svc.current_superframe % cpi->oxcf.key_freq == 0) ||
+      (frame_flags & FRAMEFLAGS_KEY)) {
+    frame_params->frame_type = KEY_FRAME;
+    rc->this_key_frame_forced =
+        cm->current_frame.frame_number != 0 && rc->frames_to_key == 0;
+    rc->frames_to_key = cpi->oxcf.key_freq;
+    rc->kf_boost = DEFAULT_KF_BOOST_RT;
+    rc->source_alt_ref_active = 0;
+    gf_group->update_type[gf_group->index] = KF_UPDATE;
+    if (cpi->use_svc && cm->current_frame.frame_number > 0)
+      av1_svc_reset_temporal_layers(cpi, 1);
+  } else {
+    frame_params->frame_type = INTER_FRAME;
+    gf_group->update_type[gf_group->index] = LF_UPDATE;
+  }
+  // GF update based on frames_till_gf_update_due, also
+  // force update on resize pending frame.
+  if ((resize_pending || rc->frames_till_gf_update_due == 0) &&
+      cpi->svc.temporal_layer_id == 0 && cpi->svc.spatial_layer_id == 0) {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      av1_cyclic_refresh_set_golden_update(cpi);
+    else
+      rc->baseline_gf_interval = MAX_GF_INTERVAL;
+    if (rc->baseline_gf_interval > rc->frames_to_key)
+      rc->baseline_gf_interval = rc->frames_to_key;
+    rc->gfu_boost = DEFAULT_GF_BOOST_RT;
+    rc->constrained_gf_group =
+        (rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0;
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+    gf_group->index = 0;
+    // SVC does not use GF as periodic boost.
+    // TODO(marpan): Find better way to disable this for SVC.
+ if (cpi->use_svc) { + SVC *const svc = &cpi->svc; + rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1; + rc->gfu_boost = 1; + rc->constrained_gf_group = 0; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + for (int layer = 0; + layer < svc->number_spatial_layers * svc->number_temporal_layers; + ++layer) { + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + lc->rc.baseline_gf_interval = rc->baseline_gf_interval; + lc->rc.gfu_boost = rc->gfu_boost; + lc->rc.constrained_gf_group = rc->constrained_gf_group; + lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due; + lc->group_index = 0; + } + } + gf_group->size = rc->baseline_gf_interval; + gf_group->update_type[0] = + (frame_params->frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE; + gf_update = 1; + } + if (cpi->oxcf.rc_mode == AOM_CBR) { + if (frame_params->frame_type == KEY_FRAME) { + target = av1_calc_iframe_target_size_one_pass_cbr(cpi); + } else { + target = av1_calc_pframe_target_size_one_pass_cbr( + cpi, gf_group->update_type[gf_group->index]); + } + } else { + if (frame_params->frame_type == KEY_FRAME) { + target = av1_calc_iframe_target_size_one_pass_vbr(cpi); + } else { + target = av1_calc_pframe_target_size_one_pass_vbr( + cpi, gf_group->update_type[gf_group->index]); + } + } + av1_rc_set_frame_target(cpi, target, cm->width, cm->height); + rc->base_frame_target = target; + if (set_reference_structure && cpi->oxcf.speed >= 6 && + cm->number_spatial_layers == 1 && cm->number_temporal_layers == 1) + set_reference_structure_one_pass_rt(cpi, gf_update); + cm->current_frame.frame_type = frame_params->frame_type; +} diff --git a/libs/libaom/src/av1/encoder/ratectrl.h b/libs/libaom/src/av1/encoder/ratectrl.h new file mode 100644 index 000000000..c46378663 --- /dev/null +++ b/libs/libaom/src/av1/encoder/ratectrl.h @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RATECTRL_H_ +#define AOM_AV1_ENCODER_RATECTRL_H_ + +#include "aom/aom_codec.h" +#include "aom/aom_integer.h" + +#include "aom_ports/mem.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Bits Per MB at different Q (Multiplied by 512) +#define BPER_MB_NORMBITS 9 + +// Use this macro to turn on/off use of alt-refs in one-pass mode. +#define USE_ALTREF_FOR_ONE_PASS 1 + +// Threshold used to define if a KF group is static (e.g. a slide show). +// Essentially, this means that no frame in the group has more than 1% of MBs +// that are not marked as coded with 0,0 motion in the first pass. +#define STATIC_KF_GROUP_THRESH 99 +#define STATIC_KF_GROUP_FLOAT_THRESH 0.99 + +// The maximum duration of a GF group that is static (e.g. a slide show). +#define MAX_STATIC_GF_GROUP_LENGTH 250 + +// Minimum and maximum height for the new pyramid structure. +// (Old structure supports height = 1, but does NOT support height = 4). 
+#define MIN_PYRAMID_LVL 0
+#define MAX_PYRAMID_LVL 4
+
+#define MIN_GF_INTERVAL 4
+#define MAX_GF_INTERVAL 32
+#define FIXED_GF_INTERVAL 8  // Used in some testing modes only
+#define MAX_GF_LENGTH_LAP 16
+
+#define MAX_NUM_GF_INTERVALS 15
+
+#define MAX_ARF_LAYERS 6
+// #define STRICT_RC
+
+typedef struct {
+  int resize_width;
+  int resize_height;
+  uint8_t superres_denom;
+} size_params_type;
+
+enum {
+  INTER_NORMAL,
+  GF_ARF_LOW,
+  GF_ARF_STD,
+  KF_STD,
+  RATE_FACTOR_LEVELS
+} UENUM1BYTE(RATE_FACTOR_LEVEL);
+
+enum {
+  KF_UPDATE,
+  LF_UPDATE,
+  GF_UPDATE,
+  ARF_UPDATE,
+  OVERLAY_UPDATE,
+  INTNL_OVERLAY_UPDATE,  // Internal Overlay Frame
+  INTNL_ARF_UPDATE,      // Internal Altref Frame
+  FRAME_UPDATE_TYPES
+} UENUM1BYTE(FRAME_UPDATE_TYPE);
+
+typedef struct {
+  // Rate targeting variables
+  int base_frame_target;  // A baseline frame target before adjustment
+                          // for previous under or over shoot.
+  int this_frame_target;  // Actual frame target after rc adjustment.
+
+  // gop bit budget
+  int64_t gf_group_bits;
+
+  int projected_frame_size;
+  int sb64_target_rate;
+  int last_q[FRAME_TYPES];  // Separate values for Intra/Inter
+  int last_boosted_qindex;  // Last boosted GF/KF/ARF q
+  int last_kf_qindex;       // Q index of the last key frame coded.
+
+  int gfu_boost;
+  int kf_boost;
+
+  double rate_correction_factors[RATE_FACTOR_LEVELS];
+
+  int frames_since_golden;
+  int frames_till_gf_update_due;
+
+  // number of determined gf group length left
+  int intervals_till_gf_calculate_due;
+  // stores gf group length intervals
+  int gf_intervals[MAX_NUM_GF_INTERVALS];
+  // the current index in gf_intervals
+  int cur_gf_index;
+
+  int min_gf_interval;
+  int max_gf_interval;
+  int static_scene_max_gf_interval;
+  int baseline_gf_interval;
+  int constrained_gf_group;
+  int frames_to_key;
+  int frames_since_key;
+  int this_key_frame_forced;
+  int next_key_frame_forced;
+  int source_alt_ref_pending;
+  int source_alt_ref_active;
+  int is_src_frame_alt_ref;
+  int sframe_due;
+
+  int avg_frame_bandwidth;  // Average frame size target for clip
+  int min_frame_bandwidth;  // Minimum allocation used for any frame
+  int max_frame_bandwidth;  // Maximum burst rate allowed for a frame.
+  int prev_avg_frame_bandwidth;
+
+  int ni_av_qi;
+  int ni_tot_qi;
+  int ni_frames;
+  int avg_frame_qindex[FRAME_TYPES];
+  double tot_q;
+  double avg_q;
+
+  int64_t buffer_level;
+  int64_t bits_off_target;
+  int64_t vbr_bits_off_target;
+  int64_t vbr_bits_off_target_fast;
+
+  int decimation_factor;
+  int decimation_count;
+
+  int rolling_target_bits;
+  int rolling_actual_bits;
+
+  int long_rolling_target_bits;
+  int long_rolling_actual_bits;
+
+  int rate_error_estimate;
+
+  int64_t total_actual_bits;
+  int64_t total_target_bits;
+  int64_t total_target_vs_actual;
+
+  int worst_quality;
+  int best_quality;
+
+  int64_t starting_buffer_level;
+  int64_t optimal_buffer_level;
+  int64_t maximum_buffer_size;
+
+  // rate control history for last frame(1) and the frame before(2).
+  // -1: undershot
+  //  1: overshoot
+  //  0: not initialized.
+  int rc_1_frame;
+  int rc_2_frame;
+  int q_1_frame;
+  int q_2_frame;
+
+  float_t arf_boost_factor;
+  // Q index used for ALT frame
+  int arf_q;
+  int active_worst_quality;
+  int active_best_quality[MAX_ARF_LAYERS + 1];
+  int base_layer_qp;
+
+  // Total number of stats used only for kf_boost calculation.
+  int num_stats_used_for_kf_boost;
+  // Total number of stats used only for gfu_boost calculation.
+  int num_stats_used_for_gfu_boost;
+  // Total number of stats required by gfu_boost calculation.
+  int num_stats_required_for_gfu_boost;
+  int next_is_fwd_key;
+  int enable_scenecut_detection;
+} RATE_CONTROL;
+
+struct AV1_COMP;
+struct AV1EncoderConfig;
+
+void av1_rc_init(const struct AV1EncoderConfig *oxcf, int pass,
+                 RATE_CONTROL *rc);
+
+int av1_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
+                           double correction_factor, aom_bit_depth_t bit_depth);
+
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth);
+
+void av1_rc_init_minq_luts(void);
+
+int av1_rc_get_default_min_gf_interval(int width, int height,
+                                       double framerate);
+// Note av1_rc_get_default_max_gf_interval() requires the min_gf_interval to
+// be passed in to ensure that the max_gf_interval returned is at least as big
+// as that.
+int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval);
+
+// Generally at the high level, the following flow is expected
+// to be enforced for rate control:
+// First call per frame, one of:
+//   av1_rc_get_first_pass_params()
+//   av1_rc_get_second_pass_params()
+// depending on the usage to set the rate control encode parameters desired.
+//
+// Then, call encode_frame_to_data_rate() to perform the
+// actual encode. This function will in turn call encode_frame()
+// one or more times, followed by one of:
+//   av1_rc_postencode_update()
+//   av1_rc_postencode_update_drop_frame()
+//
+// The majority of rate control parameters are only expected
+// to be set in the av1_rc_get_..._params() functions and
+// updated during the av1_rc_postencode_update...() functions.
+// The only exceptions are av1_rc_drop_frame() and
+// av1_rc_update_rate_correction_factors() functions.
+
+// Functions to set parameters for encoding before the actual
+// encode_frame_to_data_rate() function.
+struct EncodeFrameParams;
+
+// Post encode update of the rate control parameters based
+// on bytes used
+void av1_rc_postencode_update(struct AV1_COMP *cpi, uint64_t bytes_used);
+// Post encode update of the rate control parameters for dropped frames
+void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi);
+
+// Updates rate correction factors
+// Changes only the rate correction factors in the rate control structure.
+void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi, int width,
+                                           int height);
+
+// Decide if we should drop this frame: For 1-pass CBR.
+// Changes only the decimation count in the rate control structure
+int av1_rc_drop_frame(struct AV1_COMP *cpi);
+
+// Computes frame size bounds.
+void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi,
+                                      int this_frame_target,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit);
+
+// Picks q and q bounds given the target for bits
+int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, RATE_CONTROL *rc,
+                             int width, int height, int gf_index,
+                             int *bottom_index, int *top_index);
+
+// Estimates q to achieve a target bits per frame
+int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame,
+                      int active_best_quality, int active_worst_quality,
+                      int width, int height);
+
+// Estimates bits per mb for a given qindex and correction factor.
+int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+                       double correction_factor, aom_bit_depth_t bit_depth);
+
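The call-ordering contract described in the comment above is easiest to see as a per-frame driver loop. A minimal sketch, assuming this header is included and using a hypothetical encode_one_frame() as a stand-in for the real encode path; only the av1_rc_* calls are actual declarations from this header:

// Hypothetical glue, not part of libaom: shows the documented call order.
// (Assumes "av1/encoder/ratectrl.h" has been included.)
uint64_t encode_one_frame(struct AV1_COMP *cpi);  // stand-in for the encoder

void rate_control_loop_sketch(struct AV1_COMP *cpi, int num_frames) {
  for (int f = 0; f < num_frames; ++f) {
    // 1) Set per-frame rate control parameters first, via one of the
    //    av1_rc_get_..._params() entry points depending on the pass.
    // 2) In 1-pass CBR, the controller may decide to drop this frame.
    if (av1_rc_drop_frame(cpi)) {
      av1_rc_postencode_update_drop_frame(cpi);
      continue;
    }
    // 3) Encode, then feed the actual size back into the controller.
    av1_rc_postencode_update(cpi, encode_one_frame(cpi));
  }
}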
+// Clamping utilities for bitrate targets for iframes and pframes.
+int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi,
+                                    int target);
+int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi,
+                                    int target, uint8_t frame_update_type);
+
+// Find q_index corresponding to desired_q, within [best_qindex, worst_qindex].
+// To be precise, 'q_index' is the smallest integer for which the corresponding
+// q >= desired_q.
+// If no such q index is found, returns 'worst_qindex'.
+int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
+                    int best_qindex, int worst_qindex);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value
+int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+                       aom_bit_depth_t bit_depth);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a value that should equate to the given rate ratio.
+int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+                               int qindex, double rate_target_ratio,
+                               aom_bit_depth_t bit_depth);
+
+int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int q);
+
+void av1_rc_update_framerate(struct AV1_COMP *cpi, int width, int height);
+
+void av1_rc_set_gf_interval_range(const struct AV1_COMP *const cpi,
+                                  RATE_CONTROL *const rc);
+
+void av1_set_target_rate(struct AV1_COMP *cpi, int width, int height);
+
+int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
+
+void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target, int width,
+                             int height);
+
+int av1_calc_pframe_target_size_one_pass_vbr(
+    const struct AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type);
+
+int av1_calc_iframe_target_size_one_pass_vbr(const struct AV1_COMP *const cpi);
+
+int av1_calc_pframe_target_size_one_pass_cbr(
+    const struct AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type);
+
+int av1_calc_iframe_target_size_one_pass_cbr(const struct AV1_COMP *cpi);
+
+void av1_get_one_pass_rt_params(struct AV1_COMP *cpi,
+                                struct EncodeFrameParams *const frame_params,
+                                unsigned int frame_flags);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_RATECTRL_H_
diff --git a/libs/libaom/src/av1/encoder/rd.c b/libs/libaom/src/av1/encoder/rd.c
new file mode 100644
index 000000000..e48c77119
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/rd.c
@@ -0,0 +1,1332 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/tokenize.h"
+
+#define RD_THRESH_POW 1.25
+
+// The baseline rd thresholds for breaking out of the rd loop for
+// certain modes are assumed to be based on 8x8 blocks.
+// This table is used to correct for block size.
+// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
+static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES_ALL] = {
+  2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, 48, 48, 64, 4, 4, 8, 8, 16, 16
+};
+
+static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA]
+                                            [EXT_TX_SIZES] = {
+                                              { 1, 1, 1, 1 },  // unused
+                                              { 1, 1, 0, 0 },
+                                              { 0, 0, 1, 0 },
+                                            };
+
+static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER]
+                                            [EXT_TX_SIZES] = {
+                                              { 1, 1, 1, 1 },  // unused
+                                              { 1, 1, 0, 0 },
+                                              { 0, 0, 1, 0 },
+                                              { 0, 1, 1, 1 },
+                                            };
+
+static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA,
+                                                      EXT_TX_SETS_INTER)] = {
+  {
+      // Intra
+      EXT_TX_SET_DCTONLY,
+      EXT_TX_SET_DTT4_IDTX_1DDCT,
+      EXT_TX_SET_DTT4_IDTX,
+  },
+  {
+      // Inter
+      EXT_TX_SET_DCTONLY,
+      EXT_TX_SET_ALL16,
+      EXT_TX_SET_DTT9_IDTX_1DDCT,
+      EXT_TX_SET_DCT_IDTX,
+  },
+};
+
+void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
+                         FRAME_CONTEXT *fc) {
+  int i, j;
+
+  for (i = 0; i < PARTITION_CONTEXTS; ++i)
+    av1_cost_tokens_from_cdf(x->partition_cost[i], fc->partition_cdf[i], NULL);
+
+  if (cm->current_frame.skip_mode_info.skip_mode_flag) {
+    for (i = 0; i < SKIP_CONTEXTS; ++i) {
+      av1_cost_tokens_from_cdf(x->skip_mode_cost[i], fc->skip_mode_cdfs[i],
+                               NULL);
+    }
+  }
+
+  for (i = 0; i < SKIP_CONTEXTS; ++i) {
+    av1_cost_tokens_from_cdf(x->skip_cost[i], fc->skip_cdfs[i], NULL);
+  }
+
+  for (i = 0; i < KF_MODE_CONTEXTS; ++i)
+    for (j = 0; j < KF_MODE_CONTEXTS; ++j)
+      av1_cost_tokens_from_cdf(x->y_mode_costs[i][j], fc->kf_y_cdf[i][j], NULL);
+
+  for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+    av1_cost_tokens_from_cdf(x->mbmode_cost[i], fc->y_mode_cdf[i], NULL);
+  for (i = 0; i < CFL_ALLOWED_TYPES; ++i)
+    for (j = 0; j < INTRA_MODES; ++j)
+      av1_cost_tokens_from_cdf(x->intra_uv_mode_cost[i][j],
+                               fc->uv_mode_cdf[i][j], NULL);
+
+  av1_cost_tokens_from_cdf(x->filter_intra_mode_cost, fc->filter_intra_mode_cdf,
+                           NULL);
+  for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+    if (av1_filter_intra_allowed_bsize(cm, i))
+      av1_cost_tokens_from_cdf(x->filter_intra_cost[i],
+                               fc->filter_intra_cdfs[i], NULL);
+  }
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    av1_cost_tokens_from_cdf(x->switchable_interp_costs[i],
+                             fc->switchable_interp_cdf[i], NULL);
+
+  for (i = 0; i < PALATTE_BSIZE_CTXS; ++i) {
+    av1_cost_tokens_from_cdf(x->palette_y_size_cost[i],
+                             fc->palette_y_size_cdf[i], NULL);
+    av1_cost_tokens_from_cdf(x->palette_uv_size_cost[i],
+
fc->palette_uv_size_cdf[i], NULL); + for (j = 0; j < PALETTE_Y_MODE_CONTEXTS; ++j) { + av1_cost_tokens_from_cdf(x->palette_y_mode_cost[i][j], + fc->palette_y_mode_cdf[i][j], NULL); + } + } + + for (i = 0; i < PALETTE_UV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->palette_uv_mode_cost[i], + fc->palette_uv_mode_cdf[i], NULL); + } + + for (i = 0; i < PALETTE_SIZES; ++i) { + for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) { + av1_cost_tokens_from_cdf(x->palette_y_color_cost[i][j], + fc->palette_y_color_index_cdf[i][j], NULL); + av1_cost_tokens_from_cdf(x->palette_uv_color_cost[i][j], + fc->palette_uv_color_index_cdf[i][j], NULL); + } + } + + int sign_cost[CFL_JOINT_SIGNS]; + av1_cost_tokens_from_cdf(sign_cost, fc->cfl_sign_cdf, NULL); + for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) { + int *cost_u = x->cfl_cost[joint_sign][CFL_PRED_U]; + int *cost_v = x->cfl_cost[joint_sign][CFL_PRED_V]; + if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) { + memset(cost_u, 0, CFL_ALPHABET_SIZE * sizeof(*cost_u)); + } else { + const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; + av1_cost_tokens_from_cdf(cost_u, cdf_u, NULL); + } + if (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO) { + memset(cost_v, 0, CFL_ALPHABET_SIZE * sizeof(*cost_v)); + } else { + const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; + av1_cost_tokens_from_cdf(cost_v, cdf_v, NULL); + } + for (int u = 0; u < CFL_ALPHABET_SIZE; u++) + cost_u[u] += sign_cost[joint_sign]; + } + + for (i = 0; i < MAX_TX_CATS; ++i) + for (j = 0; j < TX_SIZE_CONTEXTS; ++j) + av1_cost_tokens_from_cdf(x->tx_size_cost[i][j], fc->tx_size_cdf[i][j], + NULL); + + for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->txfm_partition_cost[i], + fc->txfm_partition_cdf[i], NULL); + } + + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + int s; + for (s = 1; s < EXT_TX_SETS_INTER; ++s) { + if (use_inter_ext_tx_for_txsize[s][i]) { + av1_cost_tokens_from_cdf( + x->inter_tx_type_costs[s][i], fc->inter_ext_tx_cdf[s][i], + av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[1][s]]); + } + } + for (s = 1; s < EXT_TX_SETS_INTRA; ++s) { + if (use_intra_ext_tx_for_txsize[s][i]) { + for (j = 0; j < INTRA_MODES; ++j) { + av1_cost_tokens_from_cdf( + x->intra_tx_type_costs[s][i][j], fc->intra_ext_tx_cdf[s][i][j], + av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[0][s]]); + } + } + } + } + for (i = 0; i < DIRECTIONAL_MODES; ++i) { + av1_cost_tokens_from_cdf(x->angle_delta_cost[i], fc->angle_delta_cdf[i], + NULL); + } + av1_cost_tokens_from_cdf(x->switchable_restore_cost, + fc->switchable_restore_cdf, NULL); + av1_cost_tokens_from_cdf(x->wiener_restore_cost, fc->wiener_restore_cdf, + NULL); + av1_cost_tokens_from_cdf(x->sgrproj_restore_cost, fc->sgrproj_restore_cdf, + NULL); + av1_cost_tokens_from_cdf(x->intrabc_cost, fc->intrabc_cdf, NULL); + + if (!frame_is_intra_only(cm)) { + for (i = 0; i < COMP_INTER_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->comp_inter_cost[i], fc->comp_inter_cdf[i], + NULL); + } + + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < SINGLE_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(x->single_ref_cost[i][j], + fc->single_ref_cdf[i][j], NULL); + } + } + + for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->comp_ref_type_cost[i], + fc->comp_ref_type_cdf[i], NULL); + } + + for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) { + for (j = 0; j < UNIDIR_COMP_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(x->uni_comp_ref_cost[i][j], + fc->uni_comp_ref_cdf[i][j], NULL); + } + } + + 
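Every loop in av1_fill_mode_rates() performs the same conversion: turning a 15-bit CDF into per-symbol bit costs via av1_cost_tokens_from_cdf(). A rough standalone sketch of that conversion, assuming costs in 1/512-bit units (AV1_PROB_COST_SHIFT is 9 in the encoder) and a floating-point log2 in place of the integer tables the real code uses:

#include <math.h>
#include <stdio.h>

#define CDF_TOP 32768  // 1 << 15, the CDF normalization used by AV1

// Illustrative only: cost of symbol i is roughly -log2(p_i) in 1/512-bit
// units; the encoder computes this with integer tables, not log2().
static void cost_tokens_from_cdf_sketch(int *costs, const int *cdf, int n) {
  int prev = 0;
  for (int i = 0; i < n; ++i) {
    const double p = (double)(cdf[i] - prev) / CDF_TOP;
    costs[i] = (int)lround(-log2(p) * 512);
    prev = cdf[i];
  }
}

int main(void) {
  const int cdf[3] = { 16384, 24576, 32768 };  // p = 1/2, 1/4, 1/4
  int costs[3];
  cost_tokens_from_cdf_sketch(costs, cdf, 3);
  for (int i = 0; i < 3; ++i) printf("cost[%d] = %d\n", i, costs[i]);
  // Expected: 512, 1024, 1024 (i.e. 1, 2 and 2 bits).
  return 0;
}

+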
for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < FWD_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(x->comp_ref_cost[i][j], fc->comp_ref_cdf[i][j], + NULL); + } + } + + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < BWD_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(x->comp_bwdref_cost[i][j], + fc->comp_bwdref_cdf[i][j], NULL); + } + } + + for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->intra_inter_cost[i], fc->intra_inter_cdf[i], + NULL); + } + + for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->newmv_mode_cost[i], fc->newmv_cdf[i], NULL); + } + + for (i = 0; i < GLOBALMV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->zeromv_mode_cost[i], fc->zeromv_cdf[i], NULL); + } + + for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->refmv_mode_cost[i], fc->refmv_cdf[i], NULL); + } + + for (i = 0; i < DRL_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->drl_mode_cost0[i], fc->drl_cdf[i], NULL); + } + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) + av1_cost_tokens_from_cdf(x->inter_compound_mode_cost[i], + fc->inter_compound_mode_cdf[i], NULL); + for (i = 0; i < BLOCK_SIZES_ALL; ++i) + av1_cost_tokens_from_cdf(x->compound_type_cost[i], + fc->compound_type_cdf[i], NULL); + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + if (av1_is_wedge_used(i)) { + av1_cost_tokens_from_cdf(x->wedge_idx_cost[i], fc->wedge_idx_cdf[i], + NULL); + } + } + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { + av1_cost_tokens_from_cdf(x->interintra_cost[i], fc->interintra_cdf[i], + NULL); + av1_cost_tokens_from_cdf(x->interintra_mode_cost[i], + fc->interintra_mode_cdf[i], NULL); + } + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + av1_cost_tokens_from_cdf(x->wedge_interintra_cost[i], + fc->wedge_interintra_cdf[i], NULL); + } + for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) { + av1_cost_tokens_from_cdf(x->motion_mode_cost[i], fc->motion_mode_cdf[i], + NULL); + } + for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) { + av1_cost_tokens_from_cdf(x->motion_mode_cost1[i], fc->obmc_cdf[i], NULL); + } + for (i = 0; i < COMP_INDEX_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->comp_idx_cost[i], fc->compound_index_cdf[i], + NULL); + } + for (i = 0; i < COMP_GROUP_IDX_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->comp_group_idx_cost[i], + fc->comp_group_idx_cdf[i], NULL); + } + } +} + +// Values are now correlated to quantizer. +static int sad_per_bit_lut_8[QINDEX_RANGE]; +static int sad_per_bit_lut_10[QINDEX_RANGE]; +static int sad_per_bit_lut_12[QINDEX_RANGE]; + +static void init_me_luts_bd(int *bit16lut, int range, + aom_bit_depth_t bit_depth) { + int i; + // Initialize the sad lut tables using a formulaic calculation for now. + // This is to make it easier to resolve the impact of experimental changes + // to the quantizer tables. 
+ for (i = 0; i < range; i++) { + const double q = av1_convert_qindex_to_q(i, bit_depth); + bit16lut[i] = (int)(0.0418 * q + 2.4107); + } +} + +void av1_init_me_luts(void) { + init_me_luts_bd(sad_per_bit_lut_8, QINDEX_RANGE, AOM_BITS_8); + init_me_luts_bd(sad_per_bit_lut_10, QINDEX_RANGE, AOM_BITS_10); + init_me_luts_bd(sad_per_bit_lut_12, QINDEX_RANGE, AOM_BITS_12); +} + +static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, + 8, 8, 4, 4, 2, 2, 1, 0 }; +static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, + 128, 144, 144, + 128 }; + +int av1_compute_rd_mult_based_on_qindex(const AV1_COMP *cpi, int qindex) { + const int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth); + int rdmult = q * q; + rdmult = rdmult * 3 + (rdmult * 2 / 3); + switch (cpi->common.seq_params.bit_depth) { + case AOM_BITS_8: break; + case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break; + case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } + return rdmult > 0 ? rdmult : 1; +} + +int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) { + int64_t rdmult = av1_compute_rd_mult_based_on_qindex(cpi, qindex); + if (is_stat_consumption_stage(cpi) && + (cpi->common.current_frame.frame_type != KEY_FRAME)) { + const GF_GROUP *const gf_group = &cpi->gf_group; + const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; + const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100)); + + rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7; + rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7); + } + return (int)rdmult; +} + +int av1_get_deltaq_offset(const AV1_COMP *cpi, int qindex, double beta) { + assert(beta > 0.0); + int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth); + int newq = (int)rint(q / sqrt(beta)); + int orig_qindex = qindex; + if (newq < q) { + do { + qindex--; + q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth); + } while (newq < q && qindex > 0); + } else { + do { + qindex++; + q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth); + } while (newq > q && qindex < MAXQ); + } + return qindex - orig_qindex; +} + +int av1_get_adaptive_rdmult(const AV1_COMP *cpi, double beta) { + assert(beta > 0.0); + const AV1_COMMON *cm = &cpi->common; + int64_t q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0, + cm->seq_params.bit_depth); + int64_t rdmult = 0; + + switch (cm->seq_params.bit_depth) { + case AOM_BITS_8: rdmult = (int)((88 * q * q / beta) / 24); break; + case AOM_BITS_10: + rdmult = ROUND_POWER_OF_TWO((int)((88 * q * q / beta) / 24), 4); + break; + default: + assert(cm->seq_params.bit_depth == AOM_BITS_12); + rdmult = ROUND_POWER_OF_TWO((int)((88 * q * q / beta) / 24), 8); + break; + } + + if (is_stat_consumption_stage(cpi) && + (cm->current_frame.frame_type != KEY_FRAME)) { + const GF_GROUP *const gf_group = &cpi->gf_group; + const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; + const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100)); + + rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7; + rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7); + } + if (rdmult < 1) rdmult = 1; + return (int)rdmult; +} + +static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) { + double q; + switch (bit_depth) { + case AOM_BITS_8: q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_8) / 4.0; break; + case AOM_BITS_10: + 
q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_10) / 16.0; + break; + case AOM_BITS_12: + q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_12) / 64.0; + break; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } + // TODO(debargha): Adjust the function below. + return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8); +} + +void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) { + switch (cpi->common.seq_params.bit_depth) { + case AOM_BITS_8: x->sadperbit = sad_per_bit_lut_8[qindex]; break; + case AOM_BITS_10: x->sadperbit = sad_per_bit_lut_10[qindex]; break; + case AOM_BITS_12: x->sadperbit = sad_per_bit_lut_12[qindex]; break; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + } +} + +static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) { + int i, bsize, segment_id; + + for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) { + const int qindex = clamp( + av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex) + + cm->quant_params.y_dc_delta_q, + 0, MAXQ); + const int q = compute_rd_thresh_factor(qindex, cm->seq_params.bit_depth); + + for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + // Threshold here seems unnecessarily harsh but fine given actual + // range of values used for cpi->sf.thresh_mult[]. + const int t = q * rd_thresh_block_size_factor[bsize]; + const int thresh_max = INT_MAX / t; + + for (i = 0; i < MAX_MODES; ++i) + rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max + ? rd->thresh_mult[i] * t / 4 + : INT_MAX; + } + } +} + +void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc, + const int num_planes) { + const int nplanes = AOMMIN(num_planes, PLANE_TYPES); + for (int eob_multi_size = 0; eob_multi_size < 7; ++eob_multi_size) { + for (int plane = 0; plane < nplanes; ++plane) { + LV_MAP_EOB_COST *pcost = &x->eob_costs[eob_multi_size][plane]; + + for (int ctx = 0; ctx < 2; ++ctx) { + aom_cdf_prob *pcdf; + switch (eob_multi_size) { + case 0: pcdf = fc->eob_flag_cdf16[plane][ctx]; break; + case 1: pcdf = fc->eob_flag_cdf32[plane][ctx]; break; + case 2: pcdf = fc->eob_flag_cdf64[plane][ctx]; break; + case 3: pcdf = fc->eob_flag_cdf128[plane][ctx]; break; + case 4: pcdf = fc->eob_flag_cdf256[plane][ctx]; break; + case 5: pcdf = fc->eob_flag_cdf512[plane][ctx]; break; + case 6: + default: pcdf = fc->eob_flag_cdf1024[plane][ctx]; break; + } + av1_cost_tokens_from_cdf(pcost->eob_cost[ctx], pcdf, NULL); + } + } + } + for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) { + for (int plane = 0; plane < nplanes; ++plane) { + LV_MAP_COEFF_COST *pcost = &x->coeff_costs[tx_size][plane]; + + for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) + av1_cost_tokens_from_cdf(pcost->txb_skip_cost[ctx], + fc->txb_skip_cdf[tx_size][ctx], NULL); + + for (int ctx = 0; ctx < SIG_COEF_CONTEXTS_EOB; ++ctx) + av1_cost_tokens_from_cdf(pcost->base_eob_cost[ctx], + fc->coeff_base_eob_cdf[tx_size][plane][ctx], + NULL); + for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) + av1_cost_tokens_from_cdf(pcost->base_cost[ctx], + fc->coeff_base_cdf[tx_size][plane][ctx], NULL); + + for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) { + pcost->base_cost[ctx][4] = 0; + pcost->base_cost[ctx][5] = pcost->base_cost[ctx][1] + + av1_cost_literal(1) - + pcost->base_cost[ctx][0]; + pcost->base_cost[ctx][6] = + pcost->base_cost[ctx][2] - pcost->base_cost[ctx][1]; + pcost->base_cost[ctx][7] = + pcost->base_cost[ctx][3] - pcost->base_cost[ctx][2]; + } + + for (int ctx = 0; ctx < 
EOB_COEF_CONTEXTS; ++ctx) + av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx], + fc->eob_extra_cdf[tx_size][plane][ctx], NULL); + + for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx) + av1_cost_tokens_from_cdf(pcost->dc_sign_cost[ctx], + fc->dc_sign_cdf[plane][ctx], NULL); + + for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { + int br_rate[BR_CDF_SIZE]; + int prev_cost = 0; + int i, j; + av1_cost_tokens_from_cdf( + br_rate, fc->coeff_br_cdf[AOMMIN(tx_size, TX_32X32)][plane][ctx], + NULL); + // printf("br_rate: "); + // for(j = 0; j < BR_CDF_SIZE; j++) + // printf("%4d ", br_rate[j]); + // printf("\n"); + for (i = 0; i < COEFF_BASE_RANGE; i += BR_CDF_SIZE - 1) { + for (j = 0; j < BR_CDF_SIZE - 1; j++) { + pcost->lps_cost[ctx][i + j] = prev_cost + br_rate[j]; + } + prev_cost += br_rate[j]; + } + pcost->lps_cost[ctx][i] = prev_cost; + // printf("lps_cost: %d %d %2d : ", tx_size, plane, ctx); + // for (i = 0; i <= COEFF_BASE_RANGE; i++) + // printf("%5d ", pcost->lps_cost[ctx][i]); + // printf("\n"); + } + for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { + pcost->lps_cost[ctx][0 + COEFF_BASE_RANGE + 1] = + pcost->lps_cost[ctx][0]; + for (int i = 1; i <= COEFF_BASE_RANGE; ++i) { + pcost->lps_cost[ctx][i + COEFF_BASE_RANGE + 1] = + pcost->lps_cost[ctx][i] - pcost->lps_cost[ctx][i - 1]; + } + } + } + } +} + +void av1_fill_mv_costs(const FRAME_CONTEXT *fc, int integer_mv, int usehp, + MACROBLOCK *x) { + x->nmvcost[0] = &x->nmv_costs[0][MV_MAX]; + x->nmvcost[1] = &x->nmv_costs[1][MV_MAX]; + x->nmvcost_hp[0] = &x->nmv_costs_hp[0][MV_MAX]; + x->nmvcost_hp[1] = &x->nmv_costs_hp[1][MV_MAX]; + if (integer_mv) { + av1_build_nmv_cost_table(x->nmv_vec_cost, x->nmvcost, &fc->nmvc, + MV_SUBPEL_NONE); + x->mv_cost_stack = (int **)&x->nmvcost; + } else { + int *(*src)[2] = usehp ? &x->nmvcost_hp : &x->nmvcost; + x->mv_cost_stack = *src; + av1_build_nmv_cost_table( + x->nmv_vec_cost, usehp ? x->nmvcost_hp : x->nmvcost, &fc->nmvc, usehp); + } +} + +void av1_initialize_rd_consts(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + RD_OPT *const rd = &cpi->rd; + + aom_clear_system_state(); + + rd->RDMULT = av1_compute_rd_mult( + cpi, cm->quant_params.base_qindex + cm->quant_params.y_dc_delta_q); + + set_error_per_bit(x, rd->RDMULT); + + set_block_thresholds(cm, rd); + + if ((!cpi->sf.rt_sf.use_nonrd_pick_mode && + cpi->oxcf.mv_cost_upd_freq != COST_UPD_OFF) || + frame_is_intra_only(cm) || (cm->current_frame.frame_number & 0x07) == 1) + av1_fill_mv_costs(cm->fc, cm->features.cur_frame_force_integer_mv, + cm->features.allow_high_precision_mv, x); + + if (!cpi->sf.rt_sf.use_nonrd_pick_mode && frame_is_intra_only(cm) && + cm->features.allow_screen_content_tools && + !is_stat_generation_stage(cpi)) { + IntraBCMVCosts *const dv_costs = &cpi->dv_costs; + int *dvcost[2] = { &dv_costs->mv_component[0][MV_MAX], + &dv_costs->mv_component[1][MV_MAX] }; + av1_build_nmv_cost_table(dv_costs->joint_mv, dvcost, &cm->fc->ndvc, + MV_SUBPEL_NONE); + } + + if (!is_stat_generation_stage(cpi)) { + for (int i = 0; i < TRANS_TYPES; ++i) + // IDENTITY: 1 bit + // TRANSLATION: 3 bits + // ROTZOOM: 2 bits + // AFFINE: 3 bits + cpi->gm_info.type_cost[i] = (1 + (i > 0 ? (i == ROTZOOM ? 1 : 2) : 0)) + << AV1_PROB_COST_SHIFT; + } +} + +static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { + // NOTE: The tables below must be of the same size. + + // The functions described below are sampled at the four most significant + // bits of x^2 + 8 / 256. 
+ + // Normalized rate: + // This table models the rate for a Laplacian source with given variance + // when quantized with a uniform quantizer with given stepsize. The + // closed form expression is: + // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], + // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), + // and H(x) is the binary entropy function. + static const int rate_tab_q10[] = { + 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, + 4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, + 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, + 2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, + 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963, + 911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424, + 395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87, + 73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6, + 5, 3, 2, 1, 1, 1, 0, 0, + }; + // Normalized distortion: + // This table models the normalized distortion for a Laplacian source + // with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expression is: + // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) + // where x = qpstep / sqrt(variance). + // Note the actual distortion is Dn * variance. + static const int dist_tab_q10[] = { + 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, + 5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17, + 18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54, + 59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142, + 151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351, + 375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659, + 680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936, + 949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017, + 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024, + }; + static const int xsq_iq_q10[] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, + 40, 48, 56, 64, 72, 80, 88, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 256, 288, + 320, 352, 384, 416, 448, 480, 544, 608, 672, + 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504, + 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296, + 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136, + 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328, + 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736, + 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696, + 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808, + 180192, 196576, 212960, 229344, 245728, + }; + const int tmp = (xsq_q10 >> 2) + 8; + const int k = get_msb(tmp) - 3; + const int xq = (k << 3) + ((tmp >> k) & 0x7); + const int one_q10 = 1 << 10; + const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k); + const int b_q10 = one_q10 - a_q10; + *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; + *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; +} + +void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2, + unsigned int qstep, int *rate, + int64_t *dist) { + // This function models the rate and distortion for a Laplacian + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expressions are in: + // Hang and Chen, "Source Model for transform video coder and its + // application - Part I: Fundamental Theory", IEEE Trans. Circ. + // Sys. for Video Tech., April 1997. 
+ if (var == 0) { + *rate = 0; + *dist = 0; + } else { + int d_q10, r_q10; + static const uint32_t MAX_XSQ_Q10 = 245727; + const uint64_t xsq_q10_64 = + (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var; + const int xsq_q10 = (int)AOMMIN(xsq_q10_64, MAX_XSQ_Q10); + model_rd_norm(xsq_q10, &r_q10, &d_q10); + *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - AV1_PROB_COST_SHIFT); + *dist = (var * (int64_t)d_q10 + 512) >> 10; + } +} + +static double interp_cubic(const double *p, double x) { + return p[1] + 0.5 * x * + (p[2] - p[0] + + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + + x * (3.0 * (p[1] - p[2]) + p[3] - p[0]))); +} + +/* +static double interp_bicubic(const double *p, int p_stride, double x, + double y) { + double q[4]; + q[0] = interp_cubic(p, x); + q[1] = interp_cubic(p + p_stride, x); + q[2] = interp_cubic(p + 2 * p_stride, x); + q[3] = interp_cubic(p + 3 * p_stride, x); + return interp_cubic(q, y); +} +*/ + +static const uint8_t bsize_curvfit_model_cat_lookup[BLOCK_SIZES_ALL] = { + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 1, 1, 2, 2, 3, 3 +}; + +static int sse_norm_curvfit_model_cat_lookup(double sse_norm) { + return (sse_norm > 16.0); +} + +// Models distortion by sse using a logistic function on +// l = log2(sse / q^2) as: +// dbysse = 16 / (1 + k exp(l + c)) +static double get_dbysse_logistic(double l, double c, double k) { + const double A = 16.0; + const double dbysse = A / (1 + k * exp(l + c)); + return dbysse; +} + +// Models rate using a clamped linear function on +// l = log2(sse / q^2) as: +// rate = max(0, a + b * l) +static double get_rate_clamplinear(double l, double a, double b) { + const double rate = a + b * l; + return (rate < 0 ? 0 : rate); +} + +static const uint8_t bsize_surffit_model_cat_lookup[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 0, 0, 2, 2, 4, 4 +}; + +static const double surffit_rate_params[9][4] = { + { + 638.390212, + 2.253108, + 166.585650, + -3.939401, + }, + { + 5.256905, + 81.997240, + -1.321771, + 17.694216, + }, + { + -74.193045, + 72.431868, + -19.033152, + 15.407276, + }, + { + 416.770113, + 14.794188, + 167.686830, + -6.997756, + }, + { + 378.511276, + 9.558376, + 154.658843, + -6.635663, + }, + { + 277.818787, + 4.413180, + 150.317637, + -9.893038, + }, + { + 142.212132, + 11.542038, + 94.393964, + -5.518517, + }, + { + 219.100256, + 4.007421, + 108.932852, + -6.981310, + }, + { + 222.261971, + 3.251049, + 95.972916, + -5.609789, + }, +}; + +static const double surffit_dist_params[7] = { 1.475844, 4.328362, -5.680233, + -0.500994, 0.554585, 4.839478, + -0.695837 }; + +static void rate_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm, + double *rpar) { + const int cat = bsize_surffit_model_cat_lookup[bsize]; + rpar[0] = surffit_rate_params[cat][0] + surffit_rate_params[cat][1] * xm; + rpar[1] = surffit_rate_params[cat][2] + surffit_rate_params[cat][3] * xm; +} + +static void dist_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm, + double *dpar) { + (void)bsize; + const double *params = surffit_dist_params; + dpar[0] = params[0] + params[1] / (1 + exp((xm + params[2]) * params[3])); + dpar[1] = params[4] + params[5] * exp(params[6] * xm); +} + +void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm, + double yl, double *rate_f, double *distbysse_f) { + (void)sse_norm; + double rpar[2], dpar[2]; + rate_surffit_model_params_lookup(bsize, xm, rpar); + dist_surffit_model_params_lookup(bsize, xm, dpar); + + *rate_f = get_rate_clamplinear(yl, rpar[0], rpar[1]); + 
*distbysse_f = get_dbysse_logistic(yl, dpar[0], dpar[1]); +} + +static const double interp_rgrid_curv[4][65] = { + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 118.257702, 120.210658, 121.434853, 122.100487, + 122.377758, 122.436865, 72.290102, 96.974289, 101.652727, + 126.830141, 140.417377, 157.644879, 184.315291, 215.823873, + 262.300169, 335.919859, 420.624173, 519.185032, 619.854243, + 726.053595, 827.663369, 933.127475, 1037.988755, 1138.839609, + 1233.342933, 1333.508064, 1428.760126, 1533.396364, 1616.952052, + 1744.539319, 1803.413586, 1951.466618, 1994.227838, 2086.031680, + 2148.635443, 2239.068450, 2222.590637, 2338.859809, 2402.929011, + 2418.727875, 2435.342670, 2471.159469, 2523.187446, 2591.183827, + 2674.905840, 2774.110714, 2888.555675, 3017.997952, 3162.194773, + 3320.903365, 3493.880956, 3680.884773, 3881.672045, 4096.000000, + }, + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 13.087244, 15.919735, 25.930313, 24.412411, + 28.567417, 29.924194, 30.857010, 32.742979, 36.382570, + 39.210386, 42.265690, 47.378572, 57.014850, 82.740067, + 137.346562, 219.968084, 316.781856, 415.643773, 516.706538, + 614.914364, 714.303763, 815.512135, 911.210485, 1008.501528, + 1109.787854, 1213.772279, 1322.922561, 1414.752579, 1510.505641, + 1615.741888, 1697.989032, 1780.123933, 1847.453790, 1913.742309, + 1960.828122, 2047.500168, 2085.454095, 2129.230668, 2158.171824, + 2182.231724, 2217.684864, 2269.589211, 2337.264824, 2420.618694, + 2519.557814, 2633.989178, 2763.819779, 2908.956609, 3069.306660, + 3244.776927, 3435.274401, 3640.706076, 3860.978945, 4096.000000, + }, + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 4.656893, 5.123633, 5.594132, 6.162376, + 6.918433, 7.768444, 8.739415, 10.105862, 11.477328, + 13.236604, 15.421030, 19.093623, 25.801871, 46.724612, + 98.841054, 181.113466, 272.586364, 359.499769, 445.546343, + 525.944439, 605.188743, 681.793483, 756.668359, 838.486885, + 926.950356, 1015.482542, 1113.353926, 1204.897193, 1288.871992, + 1373.464145, 1455.746628, 1527.796460, 1588.475066, 1658.144771, + 1710.302500, 1807.563351, 1863.197608, 1927.281616, 1964.450872, + 2022.719898, 2100.041145, 2185.205712, 2280.993936, 2387.616216, + 2505.282950, 2634.204540, 2774.591385, 2926.653884, 3090.602436, + 3266.647443, 3454.999303, 3655.868416, 3869.465182, 4096.000000, + }, + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.337370, 0.391916, 0.468839, 0.566334, + 0.762564, 1.069225, 1.384361, 1.787581, 2.293948, + 3.251909, 4.412991, 8.050068, 11.606073, 27.668092, + 65.227758, 128.463938, 202.097653, 262.715851, 312.464873, + 355.601398, 400.609054, 447.201352, 495.761568, 552.871938, + 619.067625, 691.984883, 773.753288, 860.628503, 946.262808, + 1019.805896, 1106.061360, 1178.422145, 1244.852258, 1302.173987, + 1399.650266, 1548.092912, 1545.928652, 1670.817500, 1694.523823, + 1779.195362, 1882.155494, 1990.662097, 2108.325181, 2235.456119, + 2372.366287, 2519.367059, 2676.769812, 2844.885918, 3024.026754, + 3214.503695, 3416.628115, 3630.711389, 3857.064892, 4096.000000, + }, +}; + +static const double interp_dgrid_curv[3][65] = { + { + 16.000000, 15.962891, 15.925174, 15.886888, 15.848074, 15.808770, + 15.769015, 15.728850, 15.688313, 15.647445, 15.606284, 15.564870, + 15.525918, 
15.483820, 15.373330, 15.126844, 14.637442, 14.184387, + 13.560070, 12.880717, 12.165995, 11.378144, 10.438769, 9.130790, + 7.487633, 5.688649, 4.267515, 3.196300, 2.434201, 1.834064, + 1.369920, 1.035921, 0.775279, 0.574895, 0.427232, 0.314123, + 0.233236, 0.171440, 0.128188, 0.092762, 0.067569, 0.049324, + 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733, + 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848, + 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550, + 0.000348, 0.000193, 0.000085, 0.000021, 0.000000, + }, + { + 16.000000, 15.996116, 15.984769, 15.966413, 15.941505, 15.910501, + 15.873856, 15.832026, 15.785466, 15.734633, 15.679981, 15.621967, + 15.560961, 15.460157, 15.288367, 15.052462, 14.466922, 13.921212, + 13.073692, 12.222005, 11.237799, 9.985848, 8.898823, 7.423519, + 5.995325, 4.773152, 3.744032, 2.938217, 2.294526, 1.762412, + 1.327145, 1.020728, 0.765535, 0.570548, 0.425833, 0.313825, + 0.232959, 0.171324, 0.128174, 0.092750, 0.067558, 0.049319, + 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733, + 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848, + 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550, + 0.000348, 0.000193, 0.000085, 0.000021, -0.000000, + }, +}; + +void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr, + double *rate_f, double *distbysse_f) { + const double x_start = -15.5; + const double x_end = 16.5; + const double x_step = 0.5; + const double epsilon = 1e-6; + const int rcat = bsize_curvfit_model_cat_lookup[bsize]; + const int dcat = sse_norm_curvfit_model_cat_lookup(sse_norm); + (void)x_end; + + xqr = AOMMAX(xqr, x_start + x_step + epsilon); + xqr = AOMMIN(xqr, x_end - x_step - epsilon); + const double x = (xqr - x_start) / x_step; + const int xi = (int)floor(x); + const double xo = x - xi; + + assert(xi > 0); + + const double *prate = &interp_rgrid_curv[rcat][(xi - 1)]; + *rate_f = interp_cubic(prate, xo); + const double *pdist = &interp_dgrid_curv[dcat][(xi - 1)]; + *distbysse_f = interp_cubic(pdist, xo); +} + +static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) { + const int num_4x4_w = mi_size_wide[plane_bsize]; + const int num_4x4_h = mi_size_high[plane_bsize]; + const ENTROPY_CONTEXT *const above = pd->above_entropy_context; + const ENTROPY_CONTEXT *const left = pd->left_entropy_context; + + memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); + memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); +} + +void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) { + assert(plane_bsize < BLOCK_SIZES_ALL); + get_entropy_contexts_plane(plane_bsize, pd, t_above, t_left); +} + +void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, + int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) { + const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME }; + const int_mv ref_mv = + av1_get_ref_mv_from_stack(0, ref_frames, 0, x->mbmi_ext); + const int_mv ref_mv1 = + av1_get_ref_mv_from_stack(0, ref_frames, 1, x->mbmi_ext); + MV pred_mv[MAX_MV_REF_CANDIDATES + 1]; + int num_mv_refs = 0; + pred_mv[num_mv_refs++] = ref_mv.as_mv; + if (ref_mv.as_int != ref_mv1.as_int) { + pred_mv[num_mv_refs++] = ref_mv1.as_mv; + } + if (cpi->sf.mv_sf.adaptive_motion_search && + block_size < x->max_partition_size) { + 
pred_mv[num_mv_refs++] = x->pred_mv[ref_frame]; + } + + assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0]))); + + const uint8_t *const src_y_ptr = x->plane[0].src.buf; + int zero_seen = 0; + int best_sad = INT_MAX; + int max_mv = 0; + // Get the sad for each candidate reference mv. + for (int i = 0; i < num_mv_refs; ++i) { + const MV *this_mv = &pred_mv[i]; + const int fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3; + const int fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3; + max_mv = AOMMAX(max_mv, AOMMAX(abs(this_mv->row), abs(this_mv->col)) >> 3); + + if (fp_row == 0 && fp_col == 0 && zero_seen) continue; + zero_seen |= (fp_row == 0 && fp_col == 0); + + const uint8_t *const ref_y_ptr = + &ref_y_buffer[ref_y_stride * fp_row + fp_col]; + // Find sad for current vector. + const int this_sad = cpi->fn_ptr[block_size].sdf( + src_y_ptr, x->plane[0].src.stride, ref_y_ptr, ref_y_stride); + // Note if it is the best so far. + if (this_sad < best_sad) { + best_sad = this_sad; + } + } + + // Note the index of the mv that worked best in the reference list. + x->max_mv_context[ref_frame] = max_mv; + x->pred_mv_sad[ref_frame] = best_sad; +} + +void av1_setup_pred_block(const MACROBLOCKD *xd, + struct buf_2d dst[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + const struct scale_factors *scale, + const struct scale_factors *scale_uv, + const int num_planes) { + dst[0].buf = src->y_buffer; + dst[0].stride = src->y_stride; + dst[1].buf = src->u_buffer; + dst[2].buf = src->v_buffer; + dst[1].stride = dst[2].stride = src->uv_stride; + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + for (int i = 0; i < num_planes; ++i) { + setup_pred_plane(dst + i, xd->mi[0]->sb_type, dst[i].buf, + i ? src->uv_crop_width : src->y_crop_width, + i ? src->uv_crop_height : src->y_crop_height, + dst[i].stride, mi_row, mi_col, i ? scale_uv : scale, + xd->plane[i].subsampling_x, xd->plane[i].subsampling_y); + } +} + +YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi, + int ref_frame) { + assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); + RefCntBuffer *const scaled_buf = cpi->scaled_ref_buf[ref_frame - 1]; + const RefCntBuffer *const ref_buf = + get_ref_frame_buf(&cpi->common, ref_frame); + return (scaled_buf != ref_buf && scaled_buf != NULL) ? &scaled_buf->buf + : NULL; +} + +int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd, + InterpFilter interp_filter) { + if (interp_filter == SWITCHABLE) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + int inter_filter_cost = 0; + int dir; + + for (dir = 0; dir < 2; ++dir) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + const InterpFilter filter = + av1_extract_interp_filter(mbmi->interp_filters, dir); + inter_filter_cost += x->switchable_interp_costs[ctx][filter]; + } + return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost; + } else { + return 0; + } +} + +void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { + RD_OPT *const rd = &cpi->rd; + + // Set baseline threshold values. 
+ av1_zero(rd->thresh_mult); + + rd->thresh_mult[THR_NEARESTMV] = 300; + rd->thresh_mult[THR_NEARESTL2] = 300; + rd->thresh_mult[THR_NEARESTL3] = 300; + rd->thresh_mult[THR_NEARESTB] = 300; + rd->thresh_mult[THR_NEARESTA2] = 300; + rd->thresh_mult[THR_NEARESTA] = 300; + rd->thresh_mult[THR_NEARESTG] = 300; + + rd->thresh_mult[THR_NEWMV] = 1000; + rd->thresh_mult[THR_NEWL2] = 1000; + rd->thresh_mult[THR_NEWL3] = 1000; + rd->thresh_mult[THR_NEWB] = 1000; + rd->thresh_mult[THR_NEWA2] = 1100; + rd->thresh_mult[THR_NEWA] = 1000; + rd->thresh_mult[THR_NEWG] = 1000; + + rd->thresh_mult[THR_NEARMV] = 1000; + rd->thresh_mult[THR_NEARL2] = 1000; + rd->thresh_mult[THR_NEARL3] = 1000; + rd->thresh_mult[THR_NEARB] = 1000; + rd->thresh_mult[THR_NEARA2] = 1000; + rd->thresh_mult[THR_NEARA] = 1000; + rd->thresh_mult[THR_NEARG] = 1000; + + rd->thresh_mult[THR_GLOBALMV] = 2200; + rd->thresh_mult[THR_GLOBALL2] = 2000; + rd->thresh_mult[THR_GLOBALL3] = 2000; + rd->thresh_mult[THR_GLOBALB] = 2400; + rd->thresh_mult[THR_GLOBALA2] = 2000; + rd->thresh_mult[THR_GLOBALG] = 2000; + rd->thresh_mult[THR_GLOBALA] = 2400; + + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] = 1100; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] = 800; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] = 900; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA2] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A2] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A2] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA2] = 1000; + + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] = 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] = 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] = 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] = 2000; + + rd->thresh_mult[THR_COMP_NEAR_NEARLA] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLA] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTLA] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWLA] = 1530; + rd->thresh_mult[THR_COMP_NEW_NEARLA] = 1870; + rd->thresh_mult[THR_COMP_NEW_NEWLA] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] = 2750; + + rd->thresh_mult[THR_COMP_NEAR_NEARL2A] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL2A] = 1870; + rd->thresh_mult[THR_COMP_NEW_NEARL2A] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL2A] = 1800; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARL3A] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL3A] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL3A] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL3A] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] = 3000; + + rd->thresh_mult[THR_COMP_NEAR_NEARGA] = 1320; + rd->thresh_mult[THR_COMP_NEAREST_NEWGA] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTGA] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWGA] = 2040; + rd->thresh_mult[THR_COMP_NEW_NEARGA] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWGA] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] = 2250; + + rd->thresh_mult[THR_COMP_NEAR_NEARLB] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLB] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTLB] = 1500; + 
rd->thresh_mult[THR_COMP_NEAR_NEWLB] = 1360; + rd->thresh_mult[THR_COMP_NEW_NEARLB] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWLB] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] = 2250; + + rd->thresh_mult[THR_COMP_NEAR_NEARL2B] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL2B] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL2B] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL2B] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2B] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARL3B] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL3B] = 1870; + rd->thresh_mult[THR_COMP_NEW_NEARL3B] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL3B] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARGB] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWGB] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTGB] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWGB] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARGB] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWGB] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARLA2] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] = 1800; + rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWLA2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARLA2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWLA2] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA2] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARL2A2] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL2A2] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL2A2] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL2A2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL2A2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL2A2] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] = 1440; + rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL3A2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL3A2] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A2] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARGA2] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWGA2] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTGA2] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWGA2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARGA2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWGA2] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] = 2750; + + rd->thresh_mult[THR_COMP_NEAR_NEARLL2] = 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] = 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] = 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWLL2] = 2640; + rd->thresh_mult[THR_COMP_NEW_NEARLL2] = 2200; + rd->thresh_mult[THR_COMP_NEW_NEWLL2] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] = 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARLL3] = 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] = 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] = 1800; + rd->thresh_mult[THR_COMP_NEAR_NEWLL3] = 2200; + rd->thresh_mult[THR_COMP_NEW_NEARLL3] = 2200; + rd->thresh_mult[THR_COMP_NEW_NEWLL3] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] = 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARLG] = 1760; + rd->thresh_mult[THR_COMP_NEAREST_NEWLG] = 2400; + rd->thresh_mult[THR_COMP_NEW_NEARESTLG] = 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWLG] = 1760; + 
rd->thresh_mult[THR_COMP_NEW_NEARLG] = 2640; + rd->thresh_mult[THR_COMP_NEW_NEWLG] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] = 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARBA] = 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWBA] = 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTBA] = 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWBA] = 2200; + rd->thresh_mult[THR_COMP_NEW_NEARBA] = 1980; + rd->thresh_mult[THR_COMP_NEW_NEWBA] = 2640; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] = 3200; + + rd->thresh_mult[THR_DC] = 1000; + rd->thresh_mult[THR_PAETH] = 1000; + rd->thresh_mult[THR_SMOOTH] = 2200; + rd->thresh_mult[THR_SMOOTH_V] = 2000; + rd->thresh_mult[THR_SMOOTH_H] = 2000; + rd->thresh_mult[THR_H_PRED] = 2000; + rd->thresh_mult[THR_V_PRED] = 1800; + rd->thresh_mult[THR_D135_PRED] = 2500; + rd->thresh_mult[THR_D203_PRED] = 2000; + rd->thresh_mult[THR_D157_PRED] = 2500; + rd->thresh_mult[THR_D67_PRED] = 2000; + rd->thresh_mult[THR_D113_PRED] = 2500; + rd->thresh_mult[THR_D45_PRED] = 2500; +} + +void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, + int (*factor_buf)[MAX_MODES], + int use_adaptive_rd_thresh, BLOCK_SIZE bsize, + THR_MODES best_mode_index) { + assert(use_adaptive_rd_thresh > 0); + const THR_MODES top_mode = MAX_MODES; + const int max_rd_thresh_factor = use_adaptive_rd_thresh * RD_THRESH_MAX_FACT; + + const int bsize_is_1_to_4 = bsize > cm->seq_params.sb_size; + BLOCK_SIZE min_size, max_size; + if (bsize_is_1_to_4) { + // This part handles block sizes with 1:4 and 4:1 aspect ratios + // TODO(any): Experiment with threshold update for parent/child blocks + min_size = bsize; + max_size = bsize; + } else { + min_size = AOMMAX(bsize - 2, BLOCK_4X4); + max_size = AOMMIN(bsize + 2, (int)cm->seq_params.sb_size); + } + + for (THR_MODES mode = 0; mode < top_mode; ++mode) { + for (BLOCK_SIZE bs = min_size; bs <= max_size; ++bs) { + int *const fact = &factor_buf[bs][mode]; + if (mode == best_mode_index) { + *fact -= (*fact >> RD_THRESH_LOG_DEC_FACTOR); + } else { + *fact = AOMMIN(*fact + RD_THRESH_INC, max_rd_thresh_factor); + } + } + } +} + +int av1_get_intra_cost_penalty(int qindex, int qdelta, + aom_bit_depth_t bit_depth) { + const int q = av1_dc_quant_QTX(qindex, qdelta, bit_depth); + switch (bit_depth) { + case AOM_BITS_8: return 20 * q; + case AOM_BITS_10: return 5 * q; + case AOM_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2); + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } +} diff --git a/libs/libaom/src/av1/encoder/rd.h b/libs/libaom/src/av1/encoder/rd.h new file mode 100644 index 000000000..1addbaeb9 --- /dev/null +++ b/libs/libaom/src/av1/encoder/rd.h @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_ENCODER_RD_H_
+#define AOM_AV1_ENCODER_RD_H_
+
+#include <limits.h>
+
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/cost.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RDDIV_BITS 7
+#define RD_EPB_SHIFT 6
+
+#define RDCOST(RM, R, D)                                            \
+  (ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \
+   ((D) * (1 << RDDIV_BITS)))
+
+#define RDCOST_NEG_R(RM, R, D)                                     \
+  (((D) * (1 << RDDIV_BITS)) -                                     \
+   ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT))
+
+#define RDCOST_DBL(RM, R, D)                                       \
+  (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \
+   ((double)(D) * (1 << RDDIV_BITS)))
+
+#define QIDX_SKIP_THRESH 115
+
+#define MV_COST_WEIGHT 108
+#define MV_COST_WEIGHT_SUB 120
+
+// The fractional part of rd_thresh factor is stored with 5 bits. The maximum
+// factor that we allow is two, which is stored as 2 ** (5+1) = 64
+#define RD_THRESH_FAC_FRAC_BITS (5)
+#define RD_THRESH_FAC_FRAC_VAL (1 << (RD_THRESH_FAC_FRAC_BITS))
+#define RD_THRESH_MAX_FACT ((RD_THRESH_FAC_FRAC_VAL) << 1)
+#define RD_THRESH_LOG_DEC_FACTOR (4)
+#define RD_THRESH_INC (1)
+
+// Factor to weigh the rate for switchable interp filters.
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
+enum {
+  // Default initialization when we are not using winner mode framework. e.g.
+  // intrabc
+  DEFAULT_EVAL = 0,
+  // Initialization for selecting winner mode
+  MODE_EVAL,
+  // Initialization for winner mode evaluation
+  WINNER_MODE_EVAL,
+  // All mode evaluation types
+  MODE_EVAL_TYPES,
+} UENUM1BYTE(MODE_EVAL_TYPE);
+
+typedef struct RD_OPT {
+  // Thresh_mult is used to set a threshold for the rd score. A higher value
+  // means that we will accept the best mode so far more often. This number
+  // is used in combination with the current block size, and thresh_freq_fact
+  // to pick a threshold.
+  int thresh_mult[MAX_MODES];
+
+  int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES];
+
+  int RDMULT;
+
+  double r0, arf_r0;
+  double mc_saved_base, mc_count_base;
+} RD_OPT;
+
+typedef struct {
+  // Cost of transmitting the actual motion vector.
+  // mv_component[0][i] is the cost of motion vector with vertical component
+  // (mv_row) equal to i - MV_MAX.
+  // mv_component[1][i] is the cost of motion vector with horizontal component
+  // (mv_col) equal to i - MV_MAX.
+  int mv_component[2][MV_VALS];
+
+  // joint_mv[i] is the cost of transmitting joint mv(MV_JOINT_TYPE) of
+  // type i.
+  // TODO(huisu@google.com): we can update dv_joint_cost per SB.
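+  // MV_JOINT_TYPE classifies which components of the MV are nonzero
+  // (neither, horizontal only, vertical only, or both).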
+ int joint_mv[MV_JOINTS]; +} IntraBCMVCosts; + +static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { +#if CONFIG_RD_DEBUG + int plane; +#endif + rd_stats->rate = 0; + rd_stats->dist = 0; + rd_stats->rdcost = 0; + rd_stats->sse = 0; + rd_stats->skip = 1; + rd_stats->zero_rate = 0; +#if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + rd_stats->txb_coeff_cost[plane] = 0; + { + int r, c; + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) + rd_stats->txb_coeff_cost_map[plane][r][c] = 0; + } + } +#endif +} + +static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) { +#if CONFIG_RD_DEBUG + int plane; +#endif + rd_stats->rate = INT_MAX; + rd_stats->dist = INT64_MAX; + rd_stats->rdcost = INT64_MAX; + rd_stats->sse = INT64_MAX; + rd_stats->skip = 0; + rd_stats->zero_rate = 0; +#if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + rd_stats->txb_coeff_cost[plane] = INT_MAX; + { + int r, c; + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) + rd_stats->txb_coeff_cost_map[plane][r][c] = INT16_MAX; + } + } +#endif +} + +static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, + const RD_STATS *rd_stats_src) { + assert(rd_stats_dst->rate != INT_MAX && rd_stats_src->rate != INT_MAX); + rd_stats_dst->rate = (int)AOMMIN( + ((int64_t)rd_stats_dst->rate + (int64_t)rd_stats_src->rate), INT_MAX); + if (!rd_stats_dst->zero_rate) + rd_stats_dst->zero_rate = rd_stats_src->zero_rate; + rd_stats_dst->dist += rd_stats_src->dist; + rd_stats_dst->sse += rd_stats_src->sse; + rd_stats_dst->skip &= rd_stats_src->skip; +#if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane + for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { + rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane]; + { + // TODO(angiebird): optimize this part + int r, c; + int ref_txb_coeff_cost = 0; + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { + rd_stats_dst->txb_coeff_cost_map[plane][r][c] += + rd_stats_src->txb_coeff_cost_map[plane][r][c]; + ref_txb_coeff_cost += rd_stats_dst->txb_coeff_cost_map[plane][r][c]; + } + assert(ref_txb_coeff_cost == rd_stats_dst->txb_coeff_cost[plane]); + } + } +#endif +} + +static INLINE void av1_accumulate_rd_stats(RD_STATS *rd_stats, int64_t dist, + int rate, int skip, int64_t sse, + int zero_rate) { + assert(rd_stats->rate != INT_MAX && rate != INT_MAX); + rd_stats->rate += rate; + if (!rd_stats->zero_rate) rd_stats->zero_rate = zero_rate; + rd_stats->dist += dist; + rd_stats->skip &= skip; + rd_stats->sse += sse; +} + +static INLINE int64_t av1_calculate_rd_cost(int mult, int rate, int64_t dist) { + assert(mult >= 0); + if (rate >= 0) { + return RDCOST(mult, rate, dist); + } + return RDCOST_NEG_R(mult, -rate, dist); +} + +static INLINE void av1_rd_cost_update(int mult, RD_STATS *rd_cost) { + if (rd_cost->rate < INT_MAX && rd_cost->dist < INT64_MAX && + rd_cost->rdcost < INT64_MAX) { + rd_cost->rdcost = av1_calculate_rd_cost(mult, rd_cost->rate, rd_cost->dist); + } else { + av1_invalid_rd_stats(rd_cost); + } +} + +static INLINE void av1_rd_stats_subtraction(int mult, + const RD_STATS *const left, + const RD_STATS *const right, + RD_STATS 
*result) { + if (left->rate == INT_MAX || right->rate == INT_MAX || + left->dist == INT64_MAX || right->dist == INT64_MAX || + left->rdcost == INT64_MAX || right->rdcost == INT64_MAX) { + av1_invalid_rd_stats(result); + } else { + result->rate = left->rate - right->rate; + result->dist = left->dist - right->dist; + result->rdcost = av1_calculate_rd_cost(mult, result->rate, result->dist); + } +} + +struct TileInfo; +struct TileDataEnc; +struct AV1_COMP; +struct macroblock; + +int av1_compute_rd_mult_based_on_qindex(const struct AV1_COMP *cpi, int qindex); + +int av1_compute_rd_mult(const struct AV1_COMP *cpi, int qindex); + +void av1_initialize_rd_consts(struct AV1_COMP *cpi); + +void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x, + int qindex); + +void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n, + unsigned int qstep, int *rate, int64_t *dist); + +void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr, + double *rate_f, double *distbysse_f); +void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm, + double yl, double *rate_f, double *distbysse_f); + +int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd, + InterpFilter interp_filter); + +YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi, + int ref_frame); + +void av1_init_me_luts(void); + +void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx); + +void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]); + +void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi); + +void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, + int (*fact)[MAX_MODES], int rd_thresh, + BLOCK_SIZE bsize, THR_MODES best_mode_index); + +static INLINE void reset_thresh_freq_fact(MACROBLOCK *const x) { + for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { + for (int j = 0; j < MAX_MODES; ++j) { + x->thresh_freq_fact[i][j] = RD_THRESH_FAC_FRAC_VAL; + } + } +} + +static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, + int thresh_fact) { + return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX; +} + +void av1_mv_pred(const struct AV1_COMP *cpi, MACROBLOCK *x, + uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame, + BLOCK_SIZE block_size); + +static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) { + x->errorperbit = rdmult >> RD_EPB_SHIFT; + x->errorperbit += (x->errorperbit == 0); +} + +// Get the threshold for R-D optimization of coefficients depending upon mode +// decision/winner mode processing +static INLINE uint32_t get_rd_opt_coeff_thresh( + const uint32_t *const coeff_opt_dist_threshold, + int enable_winner_mode_for_coeff_opt, int is_winner_mode) { + // Default initialization of threshold + uint32_t coeff_opt_thresh = coeff_opt_dist_threshold[DEFAULT_EVAL]; + // TODO(any): Experiment with coeff_opt_dist_threshold values when + // enable_winner_mode_for_coeff_opt is ON + // TODO(any): Skip the winner mode processing for blocks with lower residual + // energy as R-D optimization of coefficients would have been enabled during + // mode decision + if (enable_winner_mode_for_coeff_opt) { + // Use conservative threshold during mode decision and perform R-D + // optimization of coeffs always for winner modes + if (is_winner_mode) + coeff_opt_thresh = coeff_opt_dist_threshold[WINNER_MODE_EVAL]; + else + coeff_opt_thresh = coeff_opt_dist_threshold[MODE_EVAL]; + } + return coeff_opt_thresh; +} + +// Used to 
reset the state of tx/mb rd hash information +static INLINE void reset_hash_records(MACROBLOCK *const x, + int use_inter_txb_hash) { + int32_t record_idx; + + // Reset the state for use_inter_txb_hash + if (use_inter_txb_hash) { + for (record_idx = 0; + record_idx < ((MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)); record_idx++) + x->txb_rd_record_8X8[record_idx].num = + x->txb_rd_record_8X8[record_idx].index_start = 0; + for (record_idx = 0; + record_idx < ((MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)); record_idx++) + x->txb_rd_record_16X16[record_idx].num = + x->txb_rd_record_16X16[record_idx].index_start = 0; + for (record_idx = 0; + record_idx < ((MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)); record_idx++) + x->txb_rd_record_32X32[record_idx].num = + x->txb_rd_record_32X32[record_idx].index_start = 0; + for (record_idx = 0; + record_idx < ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)); record_idx++) + x->txb_rd_record_64X64[record_idx].num = + x->txb_rd_record_64X64[record_idx].index_start = 0; + } + + // Reset the state for use_intra_txb_hash + x->txb_rd_record_intra.num = x->txb_rd_record_intra.index_start = 0; + + // Reset the state for use_mb_rd_hash + x->mb_rd_record.num = x->mb_rd_record.index_start = 0; +} + +void av1_setup_pred_block(const MACROBLOCKD *xd, + struct buf_2d dst[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + const struct scale_factors *scale, + const struct scale_factors *scale_uv, + const int num_planes); + +int av1_get_intra_cost_penalty(int qindex, int qdelta, + aom_bit_depth_t bit_depth); + +void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x, + FRAME_CONTEXT *fc); + +void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc, + const int num_planes); + +void av1_fill_mv_costs(const FRAME_CONTEXT *fc, int integer_mv, int usehp, + MACROBLOCK *x); + +int av1_get_adaptive_rdmult(const struct AV1_COMP *cpi, double beta); + +int av1_get_deltaq_offset(const struct AV1_COMP *cpi, int qindex, double beta); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RD_H_ diff --git a/libs/libaom/src/av1/encoder/rdopt.c b/libs/libaom/src/av1/encoder/rdopt.c new file mode 100644 index 000000000..02afcd1ff --- /dev/null +++ b/libs/libaom/src/av1/encoder/rdopt.c @@ -0,0 +1,5505 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/cfl.h"
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/txb_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/compound_type.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/interp_search.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/mode_prune_model_weights.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/pustats.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/tx_search.h"
+
+#define LAST_NEW_MV_INDEX 6
+
+// Mode_threshold multiplication factor table for prune_inter_modes_if_skippable
+// The values are kept in Q12 format and the equation used to derive them is
+// (2.5 - ((float)x->qindex / MAXQ) * 1.5)
+#define MODE_THRESH_QBITS 12
+static const int mode_threshold_mul_factor[QINDEX_RANGE] = {
+  10240, 10216, 10192, 10168, 10144, 10120, 10095, 10071, 10047, 10023, 9999,
+  9975,  9951,  9927,  9903,  9879,  9854,  9830,  9806,  9782,  9758,  9734,
+  9710,  9686,  9662,  9638,  9614,  9589,  9565,  9541,  9517,  9493,  9469,
+  9445,  9421,  9397,  9373,  9349,  9324,  9300,  9276,  9252,  9228,  9204,
+  9180,  9156,  9132,  9108,  9083,  9059,  9035,  9011,  8987,  8963,  8939,
+  8915,  8891,  8867,  8843,  8818,  8794,  8770,  8746,  8722,  8698,  8674,
+  8650,  8626,  8602,  8578,  8553,  8529,  8505,  8481,  8457,  8433,  8409,
+  8385,  8361,  8337,  8312,  8288,  8264,  8240,  8216,  8192,  8168,  8144,
+  8120,  8096,  8072,  8047,  8023,  7999,  7975,  7951,  7927,  7903,  7879,
+  7855,  7831,  7806,  7782,  7758,  7734,  7710,  7686,  7662,  7638,  7614,
+  7590,  7566,  7541,  7517,  7493,  7469,  7445,  7421,  7397,  7373,  7349,
+  7325,  7301,  7276,  7252,  7228,  7204,  7180,  7156,  7132,  7108,  7084,
+  7060,  7035,  7011,  6987,  6963,  6939,  6915,  6891,  6867,  6843,  6819,
+  6795,  6770,  6746,  6722,  6698,  6674,  6650,  6626,  6602,  6578,  6554,
+  6530,  6505,  6481,  6457,  6433,  6409,  6385,  6361,  6337,  6313,  6289,
+  6264,  6240,  6216,  6192,  6168,  6144,  6120,  6096,  6072,  6048,  6024,
+  5999,  5975,  5951,  5927,  5903,  5879,  5855,  5831,  5807,  5783,  5758,
+  5734,  5710,  5686,  5662,  5638,  5614,  5590,  5566,  5542,  5518,  5493,
+  5469,  5445,  5421,  5397,  5373,  5349,  5325,  5301,  5277,  5253,  5228,
+  5204,  5180,  5156,  5132,  5108,  5084,  5060,  5036,
5012, 4987, 4963, + 4939, 4915, 4891, 4867, 4843, 4819, 4795, 4771, 4747, 4722, 4698, + 4674, 4650, 4626, 4602, 4578, 4554, 4530, 4506, 4482, 4457, 4433, + 4409, 4385, 4361, 4337, 4313, 4289, 4265, 4241, 4216, 4192, 4168, + 4144, 4120, 4096 +}; + +static const THR_MODES av1_default_mode_order[MAX_MODES] = { + THR_NEARESTMV, + THR_NEARESTL2, + THR_NEARESTL3, + THR_NEARESTB, + THR_NEARESTA2, + THR_NEARESTA, + THR_NEARESTG, + + THR_NEWMV, + THR_NEWL2, + THR_NEWL3, + THR_NEWB, + THR_NEWA2, + THR_NEWA, + THR_NEWG, + + THR_NEARMV, + THR_NEARL2, + THR_NEARL3, + THR_NEARB, + THR_NEARA2, + THR_NEARA, + THR_NEARG, + + THR_GLOBALMV, + THR_GLOBALL2, + THR_GLOBALL3, + THR_GLOBALB, + THR_GLOBALA2, + THR_GLOBALA, + THR_GLOBALG, + + THR_COMP_NEAREST_NEARESTLA, + THR_COMP_NEAREST_NEARESTL2A, + THR_COMP_NEAREST_NEARESTL3A, + THR_COMP_NEAREST_NEARESTGA, + THR_COMP_NEAREST_NEARESTLB, + THR_COMP_NEAREST_NEARESTL2B, + THR_COMP_NEAREST_NEARESTL3B, + THR_COMP_NEAREST_NEARESTGB, + THR_COMP_NEAREST_NEARESTLA2, + THR_COMP_NEAREST_NEARESTL2A2, + THR_COMP_NEAREST_NEARESTL3A2, + THR_COMP_NEAREST_NEARESTGA2, + THR_COMP_NEAREST_NEARESTLL2, + THR_COMP_NEAREST_NEARESTLL3, + THR_COMP_NEAREST_NEARESTLG, + THR_COMP_NEAREST_NEARESTBA, + + THR_COMP_NEAR_NEARLA, + THR_COMP_NEW_NEARESTLA, + THR_COMP_NEAREST_NEWLA, + THR_COMP_NEW_NEARLA, + THR_COMP_NEAR_NEWLA, + THR_COMP_NEW_NEWLA, + THR_COMP_GLOBAL_GLOBALLA, + + THR_COMP_NEAR_NEARL2A, + THR_COMP_NEW_NEARESTL2A, + THR_COMP_NEAREST_NEWL2A, + THR_COMP_NEW_NEARL2A, + THR_COMP_NEAR_NEWL2A, + THR_COMP_NEW_NEWL2A, + THR_COMP_GLOBAL_GLOBALL2A, + + THR_COMP_NEAR_NEARL3A, + THR_COMP_NEW_NEARESTL3A, + THR_COMP_NEAREST_NEWL3A, + THR_COMP_NEW_NEARL3A, + THR_COMP_NEAR_NEWL3A, + THR_COMP_NEW_NEWL3A, + THR_COMP_GLOBAL_GLOBALL3A, + + THR_COMP_NEAR_NEARGA, + THR_COMP_NEW_NEARESTGA, + THR_COMP_NEAREST_NEWGA, + THR_COMP_NEW_NEARGA, + THR_COMP_NEAR_NEWGA, + THR_COMP_NEW_NEWGA, + THR_COMP_GLOBAL_GLOBALGA, + + THR_COMP_NEAR_NEARLB, + THR_COMP_NEW_NEARESTLB, + THR_COMP_NEAREST_NEWLB, + THR_COMP_NEW_NEARLB, + THR_COMP_NEAR_NEWLB, + THR_COMP_NEW_NEWLB, + THR_COMP_GLOBAL_GLOBALLB, + + THR_COMP_NEAR_NEARL2B, + THR_COMP_NEW_NEARESTL2B, + THR_COMP_NEAREST_NEWL2B, + THR_COMP_NEW_NEARL2B, + THR_COMP_NEAR_NEWL2B, + THR_COMP_NEW_NEWL2B, + THR_COMP_GLOBAL_GLOBALL2B, + + THR_COMP_NEAR_NEARL3B, + THR_COMP_NEW_NEARESTL3B, + THR_COMP_NEAREST_NEWL3B, + THR_COMP_NEW_NEARL3B, + THR_COMP_NEAR_NEWL3B, + THR_COMP_NEW_NEWL3B, + THR_COMP_GLOBAL_GLOBALL3B, + + THR_COMP_NEAR_NEARGB, + THR_COMP_NEW_NEARESTGB, + THR_COMP_NEAREST_NEWGB, + THR_COMP_NEW_NEARGB, + THR_COMP_NEAR_NEWGB, + THR_COMP_NEW_NEWGB, + THR_COMP_GLOBAL_GLOBALGB, + + THR_COMP_NEAR_NEARLA2, + THR_COMP_NEW_NEARESTLA2, + THR_COMP_NEAREST_NEWLA2, + THR_COMP_NEW_NEARLA2, + THR_COMP_NEAR_NEWLA2, + THR_COMP_NEW_NEWLA2, + THR_COMP_GLOBAL_GLOBALLA2, + + THR_COMP_NEAR_NEARL2A2, + THR_COMP_NEW_NEARESTL2A2, + THR_COMP_NEAREST_NEWL2A2, + THR_COMP_NEW_NEARL2A2, + THR_COMP_NEAR_NEWL2A2, + THR_COMP_NEW_NEWL2A2, + THR_COMP_GLOBAL_GLOBALL2A2, + + THR_COMP_NEAR_NEARL3A2, + THR_COMP_NEW_NEARESTL3A2, + THR_COMP_NEAREST_NEWL3A2, + THR_COMP_NEW_NEARL3A2, + THR_COMP_NEAR_NEWL3A2, + THR_COMP_NEW_NEWL3A2, + THR_COMP_GLOBAL_GLOBALL3A2, + + THR_COMP_NEAR_NEARGA2, + THR_COMP_NEW_NEARESTGA2, + THR_COMP_NEAREST_NEWGA2, + THR_COMP_NEW_NEARGA2, + THR_COMP_NEAR_NEWGA2, + THR_COMP_NEW_NEWGA2, + THR_COMP_GLOBAL_GLOBALGA2, + + THR_COMP_NEAR_NEARLL2, + THR_COMP_NEW_NEARESTLL2, + THR_COMP_NEAREST_NEWLL2, + THR_COMP_NEW_NEARLL2, + THR_COMP_NEAR_NEWLL2, + THR_COMP_NEW_NEWLL2, + 
THR_COMP_GLOBAL_GLOBALLL2, + + THR_COMP_NEAR_NEARLL3, + THR_COMP_NEW_NEARESTLL3, + THR_COMP_NEAREST_NEWLL3, + THR_COMP_NEW_NEARLL3, + THR_COMP_NEAR_NEWLL3, + THR_COMP_NEW_NEWLL3, + THR_COMP_GLOBAL_GLOBALLL3, + + THR_COMP_NEAR_NEARLG, + THR_COMP_NEW_NEARESTLG, + THR_COMP_NEAREST_NEWLG, + THR_COMP_NEW_NEARLG, + THR_COMP_NEAR_NEWLG, + THR_COMP_NEW_NEWLG, + THR_COMP_GLOBAL_GLOBALLG, + + THR_COMP_NEAR_NEARBA, + THR_COMP_NEW_NEARESTBA, + THR_COMP_NEAREST_NEWBA, + THR_COMP_NEW_NEARBA, + THR_COMP_NEAR_NEWBA, + THR_COMP_NEW_NEWBA, + THR_COMP_GLOBAL_GLOBALBA, + + THR_DC, + THR_PAETH, + THR_SMOOTH, + THR_SMOOTH_V, + THR_SMOOTH_H, + THR_H_PRED, + THR_V_PRED, + THR_D135_PRED, + THR_D203_PRED, + THR_D157_PRED, + THR_D67_PRED, + THR_D113_PRED, + THR_D45_PRED, +}; + +static int find_last_single_ref_mode_idx(const THR_MODES *mode_order) { + uint8_t mode_found[NUM_SINGLE_REF_MODES]; + av1_zero(mode_found); + int num_single_ref_modes_left = NUM_SINGLE_REF_MODES; + + for (int idx = 0; idx < MAX_MODES; idx++) { + const THR_MODES curr_mode = mode_order[idx]; + if (curr_mode < SINGLE_REF_MODE_END) { + num_single_ref_modes_left--; + } + if (!num_single_ref_modes_left) { + return idx; + } + } + return -1; +} + +typedef struct SingleInterModeState { + int64_t rd; + MV_REFERENCE_FRAME ref_frame; + int valid; +} SingleInterModeState; + +typedef struct InterModeSearchState { + int64_t best_rd; + int64_t best_skip_rd[2]; + MB_MODE_INFO best_mbmode; + int best_rate_y; + int best_rate_uv; + int best_mode_skippable; + int best_skip2; + THR_MODES best_mode_index; + int num_available_refs; + int64_t dist_refs[REF_FRAMES]; + int dist_order_refs[REF_FRAMES]; + int64_t mode_threshold[MAX_MODES]; + int64_t best_intra_rd; + unsigned int best_pred_sse; + int64_t best_pred_diff[REFERENCE_MODES]; + // Save a set of single_newmv for each checked ref_mv. + int_mv single_newmv[MAX_REF_MV_SEARCH][REF_FRAMES]; + int single_newmv_rate[MAX_REF_MV_SEARCH][REF_FRAMES]; + int single_newmv_valid[MAX_REF_MV_SEARCH][REF_FRAMES]; + int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES]; + // The rd of simple translation in single inter modes + int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES]; + + // Single search results by [directions][modes][reference frames] + SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; + int single_state_cnt[2][SINGLE_INTER_MODE_NUM]; + SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM] + [FWD_REFS]; + int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM]; + MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; + IntraModeSearchState intra_search_state; +} InterModeSearchState; + +void av1_inter_mode_data_init(TileDataEnc *tile_data) { + for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { + InterModeRdModel *md = &tile_data->inter_mode_rd_models[i]; + md->ready = 0; + md->num = 0; + md->dist_sum = 0; + md->ld_sum = 0; + md->sse_sum = 0; + md->sse_sse_sum = 0; + md->sse_ld_sum = 0; + } +} + +static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize, + int64_t sse, int *est_residue_cost, + int64_t *est_dist) { + aom_clear_system_state(); + const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + if (md->ready) { + if (sse < md->dist_mean) { + *est_residue_cost = 0; + *est_dist = sse; + } else { + *est_dist = (int64_t)round(md->dist_mean); + const double est_ld = md->a * sse + md->b; + // Clamp estimated rate cost by INT_MAX / 2. + // TODO(angiebird@google.com): find better solution than clamping. 
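+      // The fitted linear model est_ld = a * sse + b predicts the
+      // distortion-per-rate slope, so the residue rate is estimated as
+      // (sse - dist_mean) / est_ld; guard against a near-zero slope below.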
+ if (fabs(est_ld) < 1e-2) { + *est_residue_cost = INT_MAX / 2; + } else { + double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld); + if (est_residue_cost_dbl < 0) { + *est_residue_cost = 0; + } else { + *est_residue_cost = + (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2); + } + } + if (*est_residue_cost <= 0) { + *est_residue_cost = 0; + *est_dist = sse; + } + } + return 1; + } + return 0; +} + +void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) { + aom_clear_system_state(); + for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + const int block_idx = inter_mode_data_block_idx(bsize); + InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + if (block_idx == -1) continue; + if ((md->ready == 0 && md->num < 200) || (md->ready == 1 && md->num < 64)) { + continue; + } else { + if (md->ready == 0) { + md->dist_mean = md->dist_sum / md->num; + md->ld_mean = md->ld_sum / md->num; + md->sse_mean = md->sse_sum / md->num; + md->sse_sse_mean = md->sse_sse_sum / md->num; + md->sse_ld_mean = md->sse_ld_sum / md->num; + } else { + const double factor = 3; + md->dist_mean = + (md->dist_mean * factor + (md->dist_sum / md->num)) / (factor + 1); + md->ld_mean = + (md->ld_mean * factor + (md->ld_sum / md->num)) / (factor + 1); + md->sse_mean = + (md->sse_mean * factor + (md->sse_sum / md->num)) / (factor + 1); + md->sse_sse_mean = + (md->sse_sse_mean * factor + (md->sse_sse_sum / md->num)) / + (factor + 1); + md->sse_ld_mean = + (md->sse_ld_mean * factor + (md->sse_ld_sum / md->num)) / + (factor + 1); + } + + const double my = md->ld_mean; + const double mx = md->sse_mean; + const double dx = sqrt(md->sse_sse_mean); + const double dxy = md->sse_ld_mean; + + md->a = (dxy - mx * my) / (dx * dx - mx * mx); + md->b = my - md->a * mx; + md->ready = 1; + + md->num = 0; + md->dist_sum = 0; + md->ld_sum = 0; + md->sse_sum = 0; + md->sse_sse_sum = 0; + md->sse_ld_sum = 0; + } + (void)rdmult; + } +} + +static AOM_INLINE void inter_mode_data_push(TileDataEnc *tile_data, + BLOCK_SIZE bsize, int64_t sse, + int64_t dist, int residue_cost) { + if (residue_cost == 0 || sse == dist) return; + const int block_idx = inter_mode_data_block_idx(bsize); + if (block_idx == -1) return; + InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize]; + if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) { + aom_clear_system_state(); + const double ld = (sse - dist) * 1. 
/ residue_cost; + ++rd_model->num; + rd_model->dist_sum += dist; + rd_model->ld_sum += ld; + rd_model->sse_sum += sse; + rd_model->sse_sse_sum += (double)sse * (double)sse; + rd_model->sse_ld_sum += sse * ld; + } +} + +static AOM_INLINE void inter_modes_info_push(InterModesInfo *inter_modes_info, + int mode_rate, int64_t sse, + int64_t rd, RD_STATS *rd_cost, + RD_STATS *rd_cost_y, + RD_STATS *rd_cost_uv, + const MB_MODE_INFO *mbmi) { + const int num = inter_modes_info->num; + assert(num < MAX_INTER_MODES); + inter_modes_info->mbmi_arr[num] = *mbmi; + inter_modes_info->mode_rate_arr[num] = mode_rate; + inter_modes_info->sse_arr[num] = sse; + inter_modes_info->est_rd_arr[num] = rd; + inter_modes_info->rd_cost_arr[num] = *rd_cost; + inter_modes_info->rd_cost_y_arr[num] = *rd_cost_y; + inter_modes_info->rd_cost_uv_arr[num] = *rd_cost_uv; + ++inter_modes_info->num; +} + +static int compare_rd_idx_pair(const void *a, const void *b) { + if (((RdIdxPair *)a)->rd == ((RdIdxPair *)b)->rd) { + return 0; + } else if (((const RdIdxPair *)a)->rd > ((const RdIdxPair *)b)->rd) { + return 1; + } else { + return -1; + } +} + +static AOM_INLINE void inter_modes_info_sort( + const InterModesInfo *inter_modes_info, RdIdxPair *rd_idx_pair_arr) { + if (inter_modes_info->num == 0) { + return; + } + for (int i = 0; i < inter_modes_info->num; ++i) { + rd_idx_pair_arr[i].idx = i; + rd_idx_pair_arr[i].rd = inter_modes_info->est_rd_arr[i]; + } + qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]), + compare_rd_idx_pair); +} + +// Similar to get_horver_correlation, but also takes into account first +// row/column, when computing horizontal/vertical correlation. +void av1_get_horver_correlation_full_c(const int16_t *diff, int stride, + int width, int height, float *hcorr, + float *vcorr) { + // The following notation is used: + // x - current pixel + // y - left neighbor pixel + // z - top neighbor pixel + int64_t x_sum = 0, x2_sum = 0, xy_sum = 0, xz_sum = 0; + int64_t x_firstrow = 0, x_finalrow = 0, x_firstcol = 0, x_finalcol = 0; + int64_t x2_firstrow = 0, x2_finalrow = 0, x2_firstcol = 0, x2_finalcol = 0; + + // First, process horizontal correlation on just the first row + x_sum += diff[0]; + x2_sum += diff[0] * diff[0]; + x_firstrow += diff[0]; + x2_firstrow += diff[0] * diff[0]; + for (int j = 1; j < width; ++j) { + const int16_t x = diff[j]; + const int16_t y = diff[j - 1]; + x_sum += x; + x_firstrow += x; + x2_sum += x * x; + x2_firstrow += x * x; + xy_sum += x * y; + } + + // Process vertical correlation in the first column + x_firstcol += diff[0]; + x2_firstcol += diff[0] * diff[0]; + for (int i = 1; i < height; ++i) { + const int16_t x = diff[i * stride]; + const int16_t z = diff[(i - 1) * stride]; + x_sum += x; + x_firstcol += x; + x2_sum += x * x; + x2_firstcol += x * x; + xz_sum += x * z; + } + + // Now process horiz and vert correlation through the rest unit + for (int i = 1; i < height; ++i) { + for (int j = 1; j < width; ++j) { + const int16_t x = diff[i * stride + j]; + const int16_t y = diff[i * stride + j - 1]; + const int16_t z = diff[(i - 1) * stride + j]; + x_sum += x; + x2_sum += x * x; + xy_sum += x * y; + xz_sum += x * z; + } + } + + for (int j = 0; j < width; ++j) { + x_finalrow += diff[(height - 1) * stride + j]; + x2_finalrow += + diff[(height - 1) * stride + j] * diff[(height - 1) * stride + j]; + } + for (int i = 0; i < height; ++i) { + x_finalcol += diff[i * stride + width - 1]; + x2_finalcol += diff[i * stride + width - 1] * diff[i * stride + width - 1]; + } + + 
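// Form the per-population sums: each direction drops the boundary
+  // column/row without a neighbor, so every population has
+  // height * (width - 1) (or (height - 1) * width) samples.
+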
int64_t xhor_sum = x_sum - x_finalcol; + int64_t xver_sum = x_sum - x_finalrow; + int64_t y_sum = x_sum - x_firstcol; + int64_t z_sum = x_sum - x_firstrow; + int64_t x2hor_sum = x2_sum - x2_finalcol; + int64_t x2ver_sum = x2_sum - x2_finalrow; + int64_t y2_sum = x2_sum - x2_firstcol; + int64_t z2_sum = x2_sum - x2_firstrow; + + const float num_hor = (float)(height * (width - 1)); + const float num_ver = (float)((height - 1) * width); + + const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; + const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; + + const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; + const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; + + const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; + const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; + + if (xhor_var_n > 0 && y_var_n > 0) { + *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); + *hcorr = *hcorr < 0 ? 0 : *hcorr; + } else { + *hcorr = 1.0; + } + if (xver_var_n > 0 && z_var_n > 0) { + *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); + *vcorr = *vcorr < 0 ? 0 : *vcorr; + } else { + *vcorr = 1.0; + } +} + +static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x, + int64_t *sse_y) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int64_t total_sse = 0; + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y); + unsigned int sse; + + cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + &sse); + total_sse += sse; + if (!plane && sse_y) *sse_y = sse; + } + total_sse <<= 4; + return total_sse; +} + +int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + int i; + int64_t error = 0, sqcoeff = 0; + + for (i = 0; i < block_size; i++) { + const int diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + sqcoeff += coeff[i] * coeff[i]; + } + + *ssz = sqcoeff; + return error; +} + +int64_t av1_block_error_lp_c(const int16_t *coeff, const int16_t *dqcoeff, + intptr_t block_size) { + int64_t error = 0; + + for (int i = 0; i < block_size; i++) { + const int diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + } + + return error; +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t av1_highbd_block_error_c(const tran_low_t *coeff, + const tran_low_t *dqcoeff, intptr_t block_size, + int64_t *ssz, int bd) { + int i; + int64_t error = 0, sqcoeff = 0; + int shift = 2 * (bd - 8); + int rounding = shift > 0 ? 
1 << (shift - 1) : 0; + + for (i = 0; i < block_size; i++) { + const int64_t diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i]; + } + assert(error >= 0 && sqcoeff >= 0); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + + *ssz = sqcoeff; + return error; +} +#endif + +static int conditional_skipintra(PREDICTION_MODE mode, + PREDICTION_MODE best_intra_mode) { + if (mode == D113_PRED && best_intra_mode != V_PRED && + best_intra_mode != D135_PRED) + return 1; + if (mode == D67_PRED && best_intra_mode != V_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D203_PRED && best_intra_mode != H_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D157_PRED && best_intra_mode != H_PRED && + best_intra_mode != D135_PRED) + return 1; + return 0; +} + +static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode, + int16_t mode_context) { + if (is_inter_compound_mode(mode)) { + return x + ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)]; + } + + int mode_cost = 0; + int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; + + assert(is_inter_mode(mode)); + + if (mode == NEWMV) { + mode_cost = x->newmv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost = x->newmv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + + if (mode == GLOBALMV) { + mode_cost += x->zeromv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost += x->zeromv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; + mode_cost += x->refmv_mode_cost[mode_ctx][mode != NEARESTMV]; + return mode_cost; + } + } +} + +static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode, + int ref_idx) { + return ref_idx ? 
compound_ref1_mode(this_mode) + : compound_ref0_mode(this_mode); +} + +static AOM_INLINE void estimate_ref_frame_costs( + const AV1_COMMON *cm, const MACROBLOCKD *xd, const MACROBLOCK *x, + int segment_id, unsigned int *ref_costs_single, + unsigned int (*ref_costs_comp)[REF_FRAMES]) { + int seg_ref_active = + segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + if (seg_ref_active) { + memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single)); + int ref_frame; + for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) + memset(ref_costs_comp[ref_frame], 0, + REF_FRAMES * sizeof((*ref_costs_comp)[0])); + } else { + int intra_inter_ctx = av1_get_intra_inter_context(xd); + ref_costs_single[INTRA_FRAME] = x->intra_inter_cost[intra_inter_ctx][0]; + unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1]; + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) + ref_costs_single[i] = base_cost; + + const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd); + const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd); + const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd); + const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd); + const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd); + const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd); + + // Determine cost of a single ref frame, where frame types are represented + // by a tree: + // Level 0: add cost whether this ref is a forward or backward ref + ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p1][0][1]; + ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p1][0][1]; + ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p1][0][1]; + + // Level 1: if this ref is forward ref, + // add cost whether it is last/last2 or last3/golden + ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p3][2][0]; + ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p3][2][0]; + ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p3][2][1]; + ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p3][2][1]; + + // Level 1: if this ref is backward ref + // then add cost whether this ref is altref or backward ref + ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p2][1][0]; + ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p2][1][0]; + ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p2][1][1]; + + // Level 2: further add cost whether this ref is last or last2 + ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p4][3][0]; + ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p4][3][1]; + + // Level 2: last3 or golden + ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p5][4][0]; + ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p5][4][1]; + + // Level 2: bwdref or altref2 + ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p6][5][0]; + ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p6][5][1]; + + if (cm->current_frame.reference_mode != SINGLE_REFERENCE) { + // Similar to single ref, determine cost of compound ref frames. 
+ // cost_compound_refs = cost_first_ref + cost_second_ref + const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd); + const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd); + const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd); + const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd); + const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd); + + const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd); + unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 }; + + ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] = + ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][1]; + ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0; + ref_bicomp_costs[ALTREF_FRAME] = 0; + + // cost of first ref frame + ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0]; + ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0]; + ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1]; + ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1]; + + ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][0]; + ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][1]; + + ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][0]; + ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][1]; + + // cost of second ref frame + ref_bicomp_costs[BWDREF_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0]; + ref_bicomp_costs[ALTREF2_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0]; + ref_bicomp_costs[ALTREF_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p][0][1]; + + ref_bicomp_costs[BWDREF_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0]; + ref_bicomp_costs[ALTREF2_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1]; + + // cost: if one ref frame is forward ref, the other ref is backward ref + int ref0, ref1; + for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { + for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) { + ref_costs_comp[ref0][ref1] = + ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1]; + } + } + + // cost: if both ref frames are the same side. 
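+      // (unidirectional compound prediction: only the LAST/LAST2, LAST/LAST3,
+      // LAST/GOLDEN and BWDREF/ALTREF pairs are assigned costs below)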
+ const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd); + const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd); + const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd); + ref_costs_comp[LAST_FRAME][LAST2_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0]; + ref_costs_comp[LAST_FRAME][LAST3_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0]; + ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1]; + ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1]; + } else { + int ref0, ref1; + for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { + for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) + ref_costs_comp[ref0][ref1] = 512; + } + ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512; + ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512; + ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512; + ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512; + } + } +} + +static AOM_INLINE void store_coding_context( +#if CONFIG_INTERNAL_STATS + MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index, +#else + MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, +#endif // CONFIG_INTERNAL_STATS + int64_t comp_pred_diff[REFERENCE_MODES], int skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + + // Take a snapshot of the coding context so it can be + // restored if we decide to encode this way + ctx->rd_stats.skip = x->force_skip; + ctx->skippable = skippable; +#if CONFIG_INTERNAL_STATS + ctx->best_mode_index = mode_index; +#endif // CONFIG_INTERNAL_STATS + ctx->mic = *xd->mi[0]; + av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, x->mbmi_ext, + av1_ref_frame_type(xd->mi[0]->ref_frame)); + ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE]; + ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE]; + ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT]; +} + +static AOM_INLINE void setup_buffer_ref_mvs_inter( + const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, + BLOCK_SIZE block_size, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref_frame); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const struct scale_factors *const sf = + get_ref_scale_factors_const(cm, ref_frame); + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame); + assert(yv12 != NULL); + + if (scaled_ref_frame) { + // Setup pred block based on scaled reference, because av1_mv_pred() doesn't + // support scaling. 
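+    // Passing NULL scale factors here appears to treat the (already scaled)
+    // buffer as if it matched the coded frame size, so no re-scaling is done.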
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], scaled_ref_frame, NULL, NULL, + num_planes); + } else { + av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes); + } + + // Gets an initial list of candidate vectors from neighbours and orders them + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. + av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); + // Further refinement that is encode side only to test the top few candidates + // in full and choose the best as the center point for subsequent searches. + // The current implementation doesn't support scaling. + av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12_mb[ref_frame][0].stride, + ref_frame, block_size); + + // Go back to unscaled reference. + if (scaled_ref_frame) { + // We had temporarily setup pred block based on scaled reference above. Go + // back to unscaled reference now, for subsequent use. + av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes); + } +} + +#define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3) +#define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3) + +// TODO(jingning): this mv clamping function should be block size dependent. +static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { + const SubpelMvLimits mv_limits = { xd->mb_to_left_edge - LEFT_TOP_MARGIN, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN, + xd->mb_to_bottom_edge + + RIGHT_BOTTOM_MARGIN }; + clamp_mv(mv, &mv_limits); +} + +/* If the current mode shares the same mv with other modes with higher cost, + * skip this mode. 
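+ * For example, NEARMV can duplicate NEARESTMV when the reference MV stack
+ * is empty, or GLOBALMV when the stack has a single entry and the global
+ * motion model is no more complex than a translation.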
*/ +static int skip_repeated_mv(const AV1_COMMON *const cm, + const MACROBLOCK *const x, + PREDICTION_MODE this_mode, + const MV_REFERENCE_FRAME ref_frames[2], + InterModeSearchState *search_state) { + const int is_comp_pred = ref_frames[1] > INTRA_FRAME; + const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames); + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + PREDICTION_MODE compare_mode = MB_MODE_COUNT; + if (!is_comp_pred) { + if (this_mode == NEARMV) { + if (ref_mv_count == 0) { + // NEARMV has the same motion vector as NEARESTMV + compare_mode = NEARESTMV; + } + if (ref_mv_count == 1 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { + // NEARMV has the same motion vector as GLOBALMV + compare_mode = GLOBALMV; + } + } + if (this_mode == GLOBALMV) { + if (ref_mv_count == 0 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { + // GLOBALMV has the same motion vector as NEARESTMV + compare_mode = NEARESTMV; + } + if (ref_mv_count == 1) { + // GLOBALMV has the same motion vector as NEARMV + compare_mode = NEARMV; + } + } + + if (compare_mode != MB_MODE_COUNT) { + // Use modelled_rd to check whether compare mode was searched + if (search_state->modelled_rd[compare_mode][0][ref_frames[0]] != + INT64_MAX) { + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames); + const int compare_cost = cost_mv_ref(x, compare_mode, mode_ctx); + const int this_cost = cost_mv_ref(x, this_mode, mode_ctx); + + // Only skip if the mode cost is larger than compare mode cost + if (this_cost > compare_cost) { + search_state->modelled_rd[this_mode][0][ref_frames[0]] = + search_state->modelled_rd[compare_mode][0][ref_frames[0]]; + return 1; + } + } + } + } + return 0; +} + +static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv, + const AV1_COMMON *cm, + const MACROBLOCK *x) { + const MACROBLOCKD *const xd = &x->e_mbd; + *out_mv = in_mv; + lower_mv_precision(&out_mv->as_mv, cm->features.allow_high_precision_mv, + cm->features.cur_frame_force_integer_mv); + clamp_mv2(&out_mv->as_mv, xd); + return av1_is_fullmv_in_range(&x->mv_limits, + get_fullmv_from_mv(&out_mv->as_mv)); +} + +// To use single newmv directly for compound modes, need to clamp the mv to the +// valid mv range. Without this, encoder would generate out of range mv, and +// this is seen in 8k encoding. +static INLINE void clamp_mv_in_range(MACROBLOCK *const x, int_mv *mv, + int ref_idx) { + const int_mv ref_mv = av1_get_ref_mv(x, ref_idx); + SubpelMvLimits mv_limits; + + av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits, &ref_mv.as_mv); + clamp_mv(&mv->as_mv, &mv_limits); +} + +static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, int_mv *cur_mv, + int *const rate_mv, HandleInterModeArgs *const args, + inter_mode_info *mode_info) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_comp_pred = has_second_ref(mbmi); + const PREDICTION_MODE this_mode = mbmi->mode; + const int refs[2] = { mbmi->ref_frame[0], + mbmi->ref_frame[1] < 0 ? 
0 : mbmi->ref_frame[1] }; + const int ref_mv_idx = mbmi->ref_mv_idx; + + if (is_comp_pred) { + const int valid_mv0 = args->single_newmv_valid[ref_mv_idx][refs[0]]; + const int valid_mv1 = args->single_newmv_valid[ref_mv_idx][refs[1]]; + + if (this_mode == NEW_NEWMV) { + if (valid_mv0) { + cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int; + clamp_mv_in_range(x, &cur_mv[0], 0); + } + if (valid_mv1) { + cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int; + clamp_mv_in_range(x, &cur_mv[1], 1); + } + + // aomenc1 + if (cpi->sf.inter_sf.comp_inter_joint_search_thresh <= bsize || + !valid_mv0 || !valid_mv1) { + av1_joint_motion_search(cpi, x, bsize, cur_mv, NULL, 0, rate_mv); + } else { + *rate_mv = 0; + for (int i = 0; i < 2; ++i) { + const int_mv ref_mv = av1_get_ref_mv(x, i); + *rate_mv += + av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv, x->nmv_vec_cost, + x->mv_cost_stack, MV_COST_WEIGHT); + } + } + } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { + if (valid_mv1) { + cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int; + clamp_mv_in_range(x, &cur_mv[1], 1); + } + + // aomenc2 + if (cpi->sf.inter_sf.comp_inter_joint_search_thresh <= bsize || + !valid_mv1) { + av1_compound_single_motion_search_interinter(cpi, x, bsize, cur_mv, + NULL, 0, rate_mv, 1); + } else { + const int_mv ref_mv = av1_get_ref_mv(x, 1); + *rate_mv = + av1_mv_bit_cost(&cur_mv[1].as_mv, &ref_mv.as_mv, x->nmv_vec_cost, + x->mv_cost_stack, MV_COST_WEIGHT); + } + } else { + assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV); + if (valid_mv0) { + cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int; + clamp_mv_in_range(x, &cur_mv[0], 0); + } + + // aomenc3 + if (cpi->sf.inter_sf.comp_inter_joint_search_thresh <= bsize || + !valid_mv0) { + av1_compound_single_motion_search_interinter(cpi, x, bsize, cur_mv, + NULL, 0, rate_mv, 0); + } else { + const int_mv ref_mv = av1_get_ref_mv(x, 0); + *rate_mv = + av1_mv_bit_cost(&cur_mv[0].as_mv, &ref_mv.as_mv, x->nmv_vec_cost, + x->mv_cost_stack, MV_COST_WEIGHT); + } + } + } else { + // Single ref case. + const int ref_idx = 0; + int search_range = INT_MAX; + + if (cpi->sf.mv_sf.reduce_search_range && mbmi->ref_mv_idx > 0) { + const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv; + int min_mv_diff = INT_MAX; + int best_match = -1; + MV prev_ref_mv[2] = { { 0 } }; + for (int idx = 0; idx < mbmi->ref_mv_idx; ++idx) { + prev_ref_mv[idx] = av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, + idx, x->mbmi_ext) + .as_mv; + const int ref_mv_diff = AOMMAX(abs(ref_mv.row - prev_ref_mv[idx].row), + abs(ref_mv.col - prev_ref_mv[idx].col)); + + if (min_mv_diff > ref_mv_diff) { + min_mv_diff = ref_mv_diff; + best_match = idx; + } + } + + if (min_mv_diff < (16 << 3)) { + if (args->single_newmv_valid[best_match][refs[0]]) { + search_range = min_mv_diff; + search_range += + AOMMAX(abs(args->single_newmv[best_match][refs[0]].as_mv.row - + prev_ref_mv[best_match].row), + abs(args->single_newmv[best_match][refs[0]].as_mv.col - + prev_ref_mv[best_match].col)); + // Get full pixel search range. 
+          search_range = (search_range + 4) >> 3;
+        }
+      }
+    }
+
+    int_mv best_mv;
+    av1_single_motion_search(cpi, x, bsize, ref_idx, rate_mv, search_range,
+                             mode_info, &best_mv);
+    if (best_mv.as_int == INVALID_MV) return INT64_MAX;
+
+    args->single_newmv[ref_mv_idx][refs[0]] = best_mv;
+    args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv;
+    args->single_newmv_valid[ref_mv_idx][refs[0]] = 1;
+    cur_mv[0].as_int = best_mv.as_int;
+  }
+
+  return 0;
+}
+
+// If the number of valid neighbours is 1:
+// 1) ROTZOOM parameters can be obtained reliably (2 parameters from
+//    one neighbouring MV).
+// 2) For IDENTITY/TRANSLATION cases, warp can perform better due to
+//    a different interpolation filter being used; however, the quality
+//    gains from this may be small.
+// Warp evaluation is skipped in both of the above cases.
+
+static int check_if_optimal_warp(const AV1_COMP *cpi,
+                                 WarpedMotionParams *wm_params,
+                                 int num_proj_ref) {
+  int is_valid_warp = 1;
+  if (cpi->sf.inter_sf.prune_warp_using_wmtype) {
+    TransformationType wmtype = get_wmtype(wm_params);
+    if (num_proj_ref == 1) {
+      if (wmtype != ROTZOOM) is_valid_warp = 0;
+    } else {
+      if (wmtype < ROTZOOM) is_valid_warp = 0;
+    }
+  }
+  return is_valid_warp;
+}
+
+static INLINE void update_mode_start_end_index(const AV1_COMP *const cpi,
+                                               int *mode_index_start,
+                                               int *mode_index_end,
+                                               int last_motion_mode_allowed,
+                                               int interintra_allowed,
+                                               int eval_motion_mode) {
+  *mode_index_start = (int)SIMPLE_TRANSLATION;
+  *mode_index_end = (int)last_motion_mode_allowed + interintra_allowed;
+  if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) {
+    if (!eval_motion_mode) {
+      *mode_index_end = (int)SIMPLE_TRANSLATION;
+    } else {
+      // Set the start index appropriately to process motion modes other than
+      // simple translation
+      *mode_index_start = 1;
+    }
+  }
+}
+
+// TODO(afergs): Refactor the MBMI references in here - there are four
+// TODO(afergs): Refactor optional args - add them to a struct or remove
+static int64_t motion_mode_rd(
+    const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x,
+    BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+    RD_STATS *rd_stats_uv, int *disable_skip, HandleInterModeArgs *const args,
+    int64_t ref_best_rd, int64_t *ref_skip_rd, int *rate_mv,
+    const BUFFER_SET *orig_dst, int64_t *best_est_rd, int do_tx_search,
+    InterModesInfo *inter_modes_info, int eval_motion_mode) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const FeatureFlags *const features = &cm->features;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int is_comp_pred = has_second_ref(mbmi);
+  const PREDICTION_MODE this_mode = mbmi->mode;
+  const int rate2_nocoeff = rd_stats->rate;
+  int best_xskip = 0, best_disable_skip = 0;
+  RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  const int rate_mv0 = *rate_mv;
+  const int interintra_allowed = cm->seq_params.enable_interintra_compound &&
+                                 is_interintra_allowed(mbmi) &&
+                                 mbmi->compound_idx;
+  int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE];
+
+  assert(mbmi->ref_frame[1] != INTRA_FRAME);
+  const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
+  (void)tile_data;
+  av1_invalid_rd_stats(&best_rd_stats);
+  aom_clear_system_state();
+  mbmi->num_proj_ref = 1;  // assume num_proj_ref >= 1
+  MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+  if (features->switchable_motion_mode) {
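+    // motion_mode_allowed() returns the most complex motion mode this block
+    // may use (SIMPLE_TRANSLATION < OBMC_CAUSAL < WARPED_CAUSAL); the mode
+    // loop below then walks every mode up to that bound (plus inter-intra,
+    // when allowed).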
last_motion_mode_allowed = motion_mode_allowed( + xd->global_motion, xd, mbmi, features->allow_warped_motion); + } + + if (last_motion_mode_allowed == WARPED_CAUSAL) { + mbmi->num_proj_ref = av1_findSamples(cm, xd, pts0, pts_inref0); + } + const int total_samples = mbmi->num_proj_ref; + if (total_samples == 0) { + last_motion_mode_allowed = OBMC_CAUSAL; + } + + const MB_MODE_INFO base_mbmi = *mbmi; + MB_MODE_INFO best_mbmi; + SimpleRDState *const simple_states = &args->simple_rd_state[mbmi->ref_mv_idx]; + const int interp_filter = features->interp_filter; + const int switchable_rate = + av1_is_interp_needed(xd) ? av1_get_switchable_rate(x, xd, interp_filter) + : 0; + int64_t best_rd = INT64_MAX; + int best_rate_mv = rate_mv0; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int mode_index_start, mode_index_end; + update_mode_start_end_index(cpi, &mode_index_start, &mode_index_end, + last_motion_mode_allowed, interintra_allowed, + eval_motion_mode); + for (int mode_index = mode_index_start; mode_index <= mode_index_end; + mode_index++) { + if (args->skip_motion_mode && mode_index) continue; + if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans && + args->single_ref_first_pass && mode_index) + break; + int tmp_rate2 = rate2_nocoeff; + const int is_interintra_mode = mode_index > (int)last_motion_mode_allowed; + int tmp_rate_mv = rate_mv0; + + *mbmi = base_mbmi; + if (is_interintra_mode) { + mbmi->motion_mode = SIMPLE_TRANSLATION; + } else { + mbmi->motion_mode = (MOTION_MODE)mode_index; + assert(mbmi->ref_frame[1] != INTRA_FRAME); + } + + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const int prune_obmc = cpi->frame_probs.obmc_probs[update_type][bsize] < + cpi->sf.inter_sf.prune_obmc_prob_thresh; + if ((cpi->oxcf.enable_obmc == 0 || cpi->sf.inter_sf.disable_obmc || + cpi->sf.rt_sf.use_nonrd_pick_mode || prune_obmc) && + mbmi->motion_mode == OBMC_CAUSAL) + continue; + + if (mbmi->motion_mode == SIMPLE_TRANSLATION && !is_interintra_mode) { + // SIMPLE_TRANSLATION mode: no need to recalculate. 
+ // The prediction is calculated before motion_mode_rd() is called in + // handle_inter_mode() + if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans && + !is_comp_pred) { + if (args->single_ref_first_pass == 0) { + if (simple_states->early_skipped) { + assert(simple_states->rd_stats.rdcost == INT64_MAX); + return INT64_MAX; + } + if (simple_states->rd_stats.rdcost != INT64_MAX) { + best_rd = simple_states->rd_stats.rdcost; + best_rd_stats = simple_states->rd_stats; + best_rd_stats_y = simple_states->rd_stats_y; + best_rd_stats_uv = simple_states->rd_stats_uv; + memcpy(best_blk_skip, simple_states->blk_skip, + sizeof(x->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(best_tx_type_map, simple_states->tx_type_map, + xd->height * xd->width); + best_xskip = simple_states->skip; + best_disable_skip = simple_states->disable_skip; + best_mbmi = *mbmi; + } + continue; + } + simple_states->early_skipped = 0; + } + } else if (mbmi->motion_mode == OBMC_CAUSAL) { + const uint32_t cur_mv = mbmi->mv[0].as_int; + assert(!is_comp_pred); + if (have_newmv_in_inter_mode(this_mode)) { + av1_single_motion_search(cpi, x, bsize, 0, &tmp_rate_mv, INT_MAX, NULL, + &mbmi->mv[0]); + tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; + } + if ((mbmi->mv[0].as_int != cur_mv) || eval_motion_mode) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + 0, av1_num_planes(cm) - 1); + } + av1_build_obmc_inter_prediction( + cm, xd, args->above_pred_buf, args->above_pred_stride, + args->left_pred_buf, args->left_pred_stride); + } else if (mbmi->motion_mode == WARPED_CAUSAL) { + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + mbmi->motion_mode = WARPED_CAUSAL; + mbmi->wm_params.wmtype = DEFAULT_WMTYPE; + mbmi->interp_filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); + + memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); + memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); + // Select the samples according to motion vector difference + if (mbmi->num_proj_ref > 1) { + mbmi->num_proj_ref = av1_selectSamples( + &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref, bsize); + } + + if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, + mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, + &mbmi->wm_params, mi_row, mi_col)) { + // Refine MV for NEWMV mode + assert(!is_comp_pred); + if (have_newmv_in_inter_mode(this_mode)) { + const int_mv mv0 = mbmi->mv[0]; + const WarpedMotionParams wm_params0 = mbmi->wm_params; + const int num_proj_ref0 = mbmi->num_proj_ref; + + if (cpi->sf.inter_sf.prune_warp_using_wmtype) { + TransformationType wmtype = get_wmtype(&mbmi->wm_params); + if (wmtype < ROTZOOM) continue; + } + + const int_mv ref_mv = av1_get_ref_mv(x, 0); + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, + &ref_mv.as_mv, NULL); + + // Refine MV in a small range. + av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0, + total_samples); + + // Keep the refined MV and WM parameters. + if (mv0.as_int != mbmi->mv[0].as_int) { + tmp_rate_mv = av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv.as_mv, + x->nmv_vec_cost, x->mv_cost_stack, + MV_COST_WEIGHT); + if (cpi->sf.mv_sf.adaptive_motion_search) { + x->pred_mv[mbmi->ref_frame[0]] = mbmi->mv[0].as_mv; + } + tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; + } else { + // Restore the old MV and WM parameters. 
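+          // (av1_refine_warped_mv() ended on the starting MV, so the warp
+          // model it recomputed along the way is discarded in favour of the
+          // saved copies.)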
+ mbmi->mv[0] = mv0; + mbmi->wm_params = wm_params0; + mbmi->num_proj_ref = num_proj_ref0; + } + } else { + if (!check_if_optimal_warp(cpi, &mbmi->wm_params, mbmi->num_proj_ref)) + continue; + } + + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); + } else { + continue; + } + } else if (is_interintra_mode) { + const int ret = + av1_handle_inter_intra_mode(cpi, x, bsize, mbmi, args, ref_best_rd, + &tmp_rate_mv, &tmp_rate2, orig_dst); + if (ret < 0) continue; + } + + // If we are searching newmv and the mv is the same as refmv, skip the + // current mode + if (this_mode == NEW_NEWMV) { + const int_mv ref_mv_0 = av1_get_ref_mv(x, 0); + const int_mv ref_mv_1 = av1_get_ref_mv(x, 1); + if (mbmi->mv[0].as_int == ref_mv_0.as_int || + mbmi->mv[1].as_int == ref_mv_1.as_int) { + continue; + } + } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { + const int_mv ref_mv_1 = av1_get_ref_mv(x, 1); + if (mbmi->mv[1].as_int == ref_mv_1.as_int) { + continue; + } + } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) { + const int_mv ref_mv_0 = av1_get_ref_mv(x, 0); + if (mbmi->mv[0].as_int == ref_mv_0.as_int) { + continue; + } + } else if (this_mode == NEWMV) { + const int_mv ref_mv_0 = av1_get_ref_mv(x, 0); + if (mbmi->mv[0].as_int == ref_mv_0.as_int) { + continue; + } + } + + x->force_skip = 0; + rd_stats->dist = 0; + rd_stats->sse = 0; + rd_stats->skip = 1; + rd_stats->rate = tmp_rate2; + if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate; + if (interintra_allowed) { + rd_stats->rate += x->interintra_cost[size_group_lookup[bsize]] + [mbmi->ref_frame[1] == INTRA_FRAME]; + } + if ((last_motion_mode_allowed > SIMPLE_TRANSLATION) && + (mbmi->ref_frame[1] != INTRA_FRAME)) { + if (last_motion_mode_allowed == WARPED_CAUSAL) { + rd_stats->rate += x->motion_mode_cost[bsize][mbmi->motion_mode]; + } else { + rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode]; + } + } + + if (!do_tx_search) { + int64_t curr_sse = -1; + int64_t sse_y = -1; + int est_residue_cost = 0; + int64_t est_dist = 0; + int64_t est_rd = 0; + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + curr_sse = get_sse(cpi, x, &sse_y); + // Scale luma SSE as per bit depth so as to be consistent with + // model_rd_sb_fn and compound type rd + sse_y = ROUND_POWER_OF_TWO(sse_y, (xd->bd - 8) * 2); + const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse, + &est_residue_cost, &est_dist); + (void)has_est_rd; + assert(has_est_rd); + } else if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 2 || + cpi->sf.rt_sf.use_nonrd_pick_mode) { + model_rd_sb_fn[MODELRD_TYPE_MOTION_MODE_RD]( + cpi, bsize, x, xd, 0, num_planes - 1, &est_residue_cost, &est_dist, + NULL, &curr_sse, NULL, NULL, NULL); + sse_y = x->pred_sse[xd->mi[0]->ref_frame[0]]; + } + est_rd = RDCOST(x->rdmult, rd_stats->rate + est_residue_cost, est_dist); + if (est_rd * 0.80 > *best_est_rd) { + mbmi->ref_frame[1] = ref_frame_1; + continue; + } + const int mode_rate = rd_stats->rate; + rd_stats->rate += est_residue_cost; + rd_stats->dist = est_dist; + rd_stats->rdcost = est_rd; + if (rd_stats->rdcost < *best_est_rd) { + *best_est_rd = rd_stats->rdcost; + assert(sse_y >= 0); + ref_skip_rd[1] = cpi->sf.inter_sf.txfm_rd_gate_level + ? 
RDCOST(x->rdmult, mode_rate, (sse_y << 4)) + : INT64_MAX; + } + if (cm->current_frame.reference_mode == SINGLE_REFERENCE) { + if (!is_comp_pred) { + assert(curr_sse >= 0); + inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, + rd_stats->rdcost, rd_stats, rd_stats_y, + rd_stats_uv, mbmi); + } + } else { + assert(curr_sse >= 0); + inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, + rd_stats->rdcost, rd_stats, rd_stats_y, + rd_stats_uv, mbmi); + } + mbmi->skip = 0; + } else { + int64_t skip_rd = INT64_MAX; + int64_t skip_rdy = INT64_MAX; + if (cpi->sf.inter_sf.txfm_rd_gate_level) { + // Check if the mode is good enough based on skip RD + int64_t sse_y = INT64_MAX; + int64_t curr_sse = get_sse(cpi, x, &sse_y); + // Scale luma SSE as per bit depth so as to be consistent with + // model_rd_sb_fn and compound type rd + sse_y = ROUND_POWER_OF_TWO(sse_y, (xd->bd - 8) * 2); + skip_rd = RDCOST(x->rdmult, rd_stats->rate, curr_sse); + skip_rdy = RDCOST(x->rdmult, rd_stats->rate, (sse_y << 4)); + int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd[0], skip_rd, + cpi->sf.inter_sf.txfm_rd_gate_level, 0); + if (!eval_txfm) continue; + } + + if (!av1_txfm_search(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, + rd_stats->rate, ref_best_rd)) { + if (rd_stats_y->rate == INT_MAX && mode_index == 0) { + if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans && + !is_comp_pred) { + simple_states->early_skipped = 1; + } + return INT64_MAX; + } + continue; + } + + const int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (curr_rd < ref_best_rd) { + ref_best_rd = curr_rd; + ref_skip_rd[0] = skip_rd; + ref_skip_rd[1] = skip_rdy; + } + *disable_skip = 0; + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + const int skip_ctx = av1_get_skip_context(xd); + inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse, + rd_stats->dist, + rd_stats_y->rate + rd_stats_uv->rate + + x->skip_cost[skip_ctx][mbmi->skip]); + } + } + + if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) { + if (is_nontrans_global_motion(xd, xd->mi[0])) { + mbmi->interp_filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); + } + } + + const int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (mode_index == 0) { + args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd; + if (!is_comp_pred) { + simple_states->rd_stats = *rd_stats; + simple_states->rd_stats.rdcost = tmp_rd; + simple_states->rd_stats_y = *rd_stats_y; + simple_states->rd_stats_uv = *rd_stats_uv; + memcpy(simple_states->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(simple_states->tx_type_map, xd->tx_type_map, + xd->height * xd->width); + simple_states->skip = mbmi->skip; + simple_states->disable_skip = *disable_skip; + } + } + if (mode_index == 0 || tmp_rd < best_rd) { + best_mbmi = *mbmi; + best_rd = tmp_rd; + best_rd_stats = *rd_stats; + best_rd_stats_y = *rd_stats_y; + best_rate_mv = tmp_rate_mv; + if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv; + memcpy(best_blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width); + best_xskip = mbmi->skip; + best_disable_skip = *disable_skip; + // TODO(anyone): evaluate the quality and speed trade-off of the early + // termination logic below. 
+ // if (best_xskip) break; + } + } + mbmi->ref_frame[1] = ref_frame_1; + *rate_mv = best_rate_mv; + if (best_rd == INT64_MAX) { + av1_invalid_rd_stats(rd_stats); + restore_dst_buf(xd, *orig_dst, num_planes); + return INT64_MAX; + } + *mbmi = best_mbmi; + *rd_stats = best_rd_stats; + *rd_stats_y = best_rd_stats_y; + if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv; + memcpy(x->blk_skip, best_blk_skip, + sizeof(x->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width); + x->force_skip = best_xskip; + *disable_skip = best_disable_skip; + + restore_dst_buf(xd, *orig_dst, num_planes); + return 0; +} + +static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi, + MACROBLOCK *const x, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst) { + assert(bsize < BLOCK_SIZES_ALL); + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, 0, + av1_num_planes(cm) - 1); + + int64_t total_sse = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + av1_subtract_plane(x, plane_bsize, plane); + int64_t sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh) << 4; + total_sse += sse; + } + const int skip_mode_ctx = av1_get_skip_mode_context(xd); + rd_stats->dist = rd_stats->sse = total_sse; + rd_stats->rate = x->skip_mode_cost[skip_mode_ctx][1]; + rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + + restore_dst_buf(xd, *orig_dst, num_planes); + return 0; +} + +// Check NEARESTMV, NEARMV, GLOBALMV ref mvs for duplicate and skip the relevant +// mode +static INLINE int check_repeat_ref_mv(const MB_MODE_INFO_EXT *mbmi_ext, + int ref_idx, + const MV_REFERENCE_FRAME *ref_frame, + PREDICTION_MODE single_mode) { + const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + assert(single_mode != NEWMV); + if (single_mode == NEARESTMV) { + return 0; + } else if (single_mode == NEARMV) { + // when ref_mv_count = 0, NEARESTMV and NEARMV are same as GLOBALMV + // when ref_mv_count = 1, NEARMV is same as GLOBALMV + if (ref_mv_count < 2) return 1; + } else if (single_mode == GLOBALMV) { + // when ref_mv_count == 0, GLOBALMV is same as NEARESTMV + if (ref_mv_count == 0) return 1; + // when ref_mv_count == 1, NEARMV is same as GLOBALMV + else if (ref_mv_count == 1) + return 0; + + int stack_size = AOMMIN(USABLE_REF_MV_STACK_SIZE, ref_mv_count); + // Check GLOBALMV is matching with any mv in ref_mv_stack + for (int ref_mv_idx = 0; ref_mv_idx < stack_size; ref_mv_idx++) { + int_mv this_mv; + + if (ref_idx == 0) + this_mv = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; + else + this_mv = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; + + if (this_mv.as_int == mbmi_ext->global_mvs[ref_frame[ref_idx]].as_int) + return 1; + } + } + return 0; +} + +static INLINE int get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode, + int ref_idx, int ref_mv_idx, + int skip_repeated_ref_mv, + const MV_REFERENCE_FRAME *ref_frame, + 
                              const MB_MODE_INFO_EXT *mbmi_ext) {
+  const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx);
+  assert(is_inter_singleref_mode(single_mode));
+  if (single_mode == NEWMV) {
+    this_mv->as_int = INVALID_MV;
+  } else if (single_mode == GLOBALMV) {
+    if (skip_repeated_ref_mv &&
+        check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode))
+      return 0;
+    *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+  } else {
+    assert(single_mode == NEARMV || single_mode == NEARESTMV);
+    const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+    const int ref_mv_offset = single_mode == NEARESTMV ? 0 : ref_mv_idx + 1;
+    if (ref_mv_offset < mbmi_ext->ref_mv_count[ref_frame_type]) {
+      assert(ref_mv_offset >= 0);
+      if (ref_idx == 0) {
+        *this_mv =
+            mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].this_mv;
+      } else {
+        *this_mv =
+            mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].comp_mv;
+      }
+    } else {
+      if (skip_repeated_ref_mv &&
+          check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode))
+        return 0;
+      *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+    }
+  }
+  return 1;
+}
+
+// This function updates the non-new MVs for the current prediction mode.
+static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode,
+                               const AV1_COMMON *cm, const MACROBLOCK *x,
+                               int skip_repeated_ref_mv) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const int is_comp_pred = has_second_ref(mbmi);
+
+  int ret = 1;
+  for (int i = 0; i < is_comp_pred + 1; ++i) {
+    int_mv this_mv;
+    this_mv.as_int = INVALID_MV;
+    ret = get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx,
+                      skip_repeated_ref_mv, mbmi->ref_frame, x->mbmi_ext);
+    if (!ret) return 0;
+    const PREDICTION_MODE single_mode = get_single_mode(this_mode, i);
+    if (single_mode == NEWMV) {
+      const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+      cur_mv[i] =
+          (i == 0) ?
x->mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] + .this_mv + : x->mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] + .comp_mv; + } else { + ret &= clamp_and_check_mv(cur_mv + i, this_mv, cm, x); + } + } + return ret; +} + +static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi, + const MB_MODE_INFO_EXT *mbmi_ext, + const int (*const drl_mode_cost0)[2], + int8_t ref_frame_type) { + int cost = 0; + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { + for (int idx = 0; idx < 2; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != idx]; + if (mbmi->ref_mv_idx == idx) return cost; + } + } + return cost; + } + + if (have_nearmv_in_inter_mode(mbmi->mode)) { + for (int idx = 1; idx < 3; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != (idx - 1)]; + if (mbmi->ref_mv_idx == (idx - 1)) return cost; + } + } + return cost; + } + return cost; +} + +static INLINE int is_single_newmv_valid(const HandleInterModeArgs *const args, + const MB_MODE_INFO *const mbmi, + PREDICTION_MODE this_mode) { + for (int ref_idx = 0; ref_idx < 2; ++ref_idx) { + const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx); + const MV_REFERENCE_FRAME ref = mbmi->ref_frame[ref_idx]; + if (single_mode == NEWMV && + args->single_newmv_valid[mbmi->ref_mv_idx][ref] == 0) { + return 0; + } + } + return 1; +} + +static int get_drl_refmv_count(const MACROBLOCK *const x, + const MV_REFERENCE_FRAME *ref_frame, + PREDICTION_MODE mode) { + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const int has_nearmv = have_nearmv_in_inter_mode(mode) ? 1 : 0; + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + const int only_newmv = (mode == NEWMV || mode == NEW_NEWMV); + const int has_drl = + (has_nearmv && ref_mv_count > 2) || (only_newmv && ref_mv_count > 1); + const int ref_set = + has_drl ? AOMMIN(MAX_REF_MV_SEARCH, ref_mv_count - has_nearmv) : 1; + + return ref_set; +} + +// Whether this reference motion vector can be skipped, based on initial +// heuristics. +static bool ref_mv_idx_early_breakout(const AV1_COMP *const cpi, MACROBLOCK *x, + const HandleInterModeArgs *const args, + int64_t ref_best_rd, int ref_mv_idx) { + const SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + const int is_comp_pred = has_second_ref(mbmi); + if (sf->inter_sf.reduce_inter_modes && ref_mv_idx > 0) { + if (mbmi->ref_frame[0] == LAST2_FRAME || + mbmi->ref_frame[0] == LAST3_FRAME || + mbmi->ref_frame[1] == LAST2_FRAME || + mbmi->ref_frame[1] == LAST3_FRAME) { + const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; + if (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] < + REF_CAT_LEVEL) { + return true; + } + } + // TODO(any): Experiment with reduce_inter_modes for compound prediction + if (sf->inter_sf.reduce_inter_modes >= 2 && !is_comp_pred && + have_newmv_in_inter_mode(mbmi->mode)) { + if (mbmi->ref_frame[0] != cpi->nearest_past_ref && + mbmi->ref_frame[0] != cpi->nearest_future_ref) { + const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 
1 : 0; + if (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] < + REF_CAT_LEVEL) { + return true; + } + } + } + } + if (sf->inter_sf.prune_single_motion_modes_by_simple_trans && !is_comp_pred && + args->single_ref_first_pass == 0) { + if (args->simple_rd_state[ref_mv_idx].early_skipped) { + return true; + } + } + mbmi->ref_mv_idx = ref_mv_idx; + if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, mbmi->mode))) { + return true; + } + size_t est_rd_rate = args->ref_frame_cost + args->single_comp_cost; + const int drl_cost = + get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type); + est_rd_rate += drl_cost; + if (RDCOST(x->rdmult, est_rd_rate, 0) > ref_best_rd && + mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { + return true; + } + return false; +} + +// Compute the estimated RD cost for the motion vector with simple translation. +static int64_t simple_translation_pred_rd( + AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, + HandleInterModeArgs *args, int ref_mv_idx, inter_mode_info *mode_info, + int64_t ref_best_rd, BLOCK_SIZE bsize) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + const AV1_COMMON *cm = &cpi->common; + const int is_comp_pred = has_second_ref(mbmi); + + struct macroblockd_plane *p = xd->plane; + const BUFFER_SET orig_dst = { + { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, + { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, + }; + av1_init_rd_stats(rd_stats); + + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + if (mbmi->ref_frame[1] == INTRA_FRAME) { + mbmi->ref_frame[1] = NONE_FRAME; + } + int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); + + mbmi->num_proj_ref = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->ref_mv_idx = ref_mv_idx; + + rd_stats->rate += args->ref_frame_cost + args->single_comp_cost; + const int drl_cost = + get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type); + rd_stats->rate += drl_cost; + mode_info[ref_mv_idx].drl_cost = drl_cost; + + int_mv cur_mv[2]; + if (!build_cur_mv(cur_mv, mbmi->mode, cm, x, 0)) { + return INT64_MAX; + } + assert(have_nearmv_in_inter_mode(mbmi->mode)); + for (int i = 0; i < is_comp_pred + 1; ++i) { + mbmi->mv[i].as_int = cur_mv[i].as_int; + } + const int ref_mv_cost = cost_mv_ref(x, mbmi->mode, mode_ctx); + rd_stats->rate += ref_mv_cost; + + if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd) { + return INT64_MAX; + } + + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->num_proj_ref = 0; + if (is_comp_pred) { + // Only compound_average + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + } + set_default_interp_filters(mbmi, cm->features.interp_filter); + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + int est_rate; + int64_t est_dist; + model_rd_sb_fn[MODELRD_CURVFIT](cpi, bsize, x, xd, 0, 0, &est_rate, &est_dist, + NULL, NULL, NULL, NULL, NULL); + return RDCOST(x->rdmult, rd_stats->rate + est_rate, est_dist); +} + +// Represents a set of integers, from 0 to sizeof(int) * 8, as bits in +// an integer. 0 for the i-th bit means that integer is excluded, 1 means +// it is included. 
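+// For example, starting from mask = 0, calling mask_set_bit(&mask, 0) and
+// then mask_set_bit(&mask, 2) leaves mask == 0x5, after which
+// mask_check_bit(mask, 2) is true and mask_check_bit(mask, 1) is false.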
+static INLINE void mask_set_bit(int *mask, int index) { *mask |= (1 << index); } + +static INLINE bool mask_check_bit(int mask, int index) { + return (mask >> index) & 0x1; +} + +// Before performing the full MV search in handle_inter_mode, do a simple +// translation search and see if we can eliminate any motion vectors. +// Returns an integer where, if the i-th bit is set, it means that the i-th +// motion vector should be searched. This is only set for NEAR_MV. +static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, + HandleInterModeArgs *const args, + int64_t ref_best_rd, inter_mode_info *mode_info, + BLOCK_SIZE bsize, const int ref_set) { + AV1_COMMON *const cm = &cpi->common; + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + + // Only search indices if they have some chance of being good. + int good_indices = 0; + for (int i = 0; i < ref_set; ++i) { + if (ref_mv_idx_early_breakout(cpi, x, args, ref_best_rd, i)) { + continue; + } + mask_set_bit(&good_indices, i); + } + + // Only prune in NEARMV mode, if the speed feature is set, and the block size + // is large enough. If these conditions are not met, return all good indices + // found so far. + if (!cpi->sf.inter_sf.prune_mode_search_simple_translation) + return good_indices; + if (!have_nearmv_in_inter_mode(this_mode)) return good_indices; + if (num_pels_log2_lookup[bsize] <= 6) return good_indices; + // Do not prune when there is internal resizing. TODO(elliottk) fix this + // so b/2384 can be resolved. + if (av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[0])) || + (mbmi->ref_frame[1] > 0 && + av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[1])))) { + return good_indices; + } + + // Calculate the RD cost for the motion vectors using simple translation. + int64_t idx_rdcost[] = { INT64_MAX, INT64_MAX, INT64_MAX }; + for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) { + // If this index is bad, ignore it. + if (!mask_check_bit(good_indices, ref_mv_idx)) { + continue; + } + idx_rdcost[ref_mv_idx] = simple_translation_pred_rd( + cpi, x, rd_stats, args, ref_mv_idx, mode_info, ref_best_rd, bsize); + } + // Find the index with the best RD cost. + int best_idx = 0; + for (int i = 1; i < MAX_REF_MV_SEARCH; ++i) { + if (idx_rdcost[i] < idx_rdcost[best_idx]) { + best_idx = i; + } + } + // Only include indices that are good and within a % of the best. + const double dth = has_second_ref(mbmi) ? 1.05 : 1.001; + // If the simple translation cost is not within this multiple of the + // best RD, skip it. Note that the cutoff is derived experimentally. 
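+  // As a concrete reading of the two cutoffs: with a single reference
+  // (dth == 1.001) an index survives only if its simple-translation RD is
+  // within 0.1% of the best index's, while for compound prediction
+  // (dth == 1.05) anything within 5% is kept; independently, an index is
+  // dropped when its estimate exceeds ref_dth times ref_best_rd.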
+ const double ref_dth = 5; + int result = 0; + for (int i = 0; i < ref_set; ++i) { + if (mask_check_bit(good_indices, i) && + (1.0 * idx_rdcost[i]) / idx_rdcost[best_idx] < dth && + (1.0 * idx_rdcost[i]) / ref_best_rd < ref_dth) { + mask_set_bit(&result, i); + } + } + return result; +} + +typedef struct motion_mode_candidate { + MB_MODE_INFO mbmi; + int rate_mv; + int rate2_nocoeff; + int skip_motion_mode; + int64_t rd_cost; +} motion_mode_candidate; + +typedef struct motion_mode_best_st_candidate { + motion_mode_candidate motion_mode_cand[MAX_WINNER_MOTION_MODES]; + int num_motion_mode_cand; +} motion_mode_best_st_candidate; + +// Checks if the current reference frame matches with neighbouring block's +// (top/left) reference frames +static AOM_INLINE int ref_match_found_in_nb_blocks(MB_MODE_INFO *cur_mbmi, + MB_MODE_INFO *nb_mbmi) { + MV_REFERENCE_FRAME nb_ref_frames[2] = { nb_mbmi->ref_frame[0], + nb_mbmi->ref_frame[1] }; + MV_REFERENCE_FRAME cur_ref_frames[2] = { cur_mbmi->ref_frame[0], + cur_mbmi->ref_frame[1] }; + const int is_cur_comp_pred = has_second_ref(cur_mbmi); + int match_found = 0; + + for (int i = 0; i < (is_cur_comp_pred + 1); i++) { + if ((cur_ref_frames[i] == nb_ref_frames[0]) || + (cur_ref_frames[i] == nb_ref_frames[1])) + match_found = 1; + } + return match_found; +} + +static AOM_INLINE int find_ref_match_in_above_nbs(const int total_mi_cols, + MACROBLOCKD *xd) { + if (!xd->up_available) return 0; + const int mi_col = xd->mi_col; + MB_MODE_INFO **cur_mbmi = xd->mi; + // prev_row_mi points into the mi array, starting at the beginning of the + // previous row. + MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride; + const int end_col = AOMMIN(mi_col + xd->width, total_mi_cols); + uint8_t mi_step; + for (int above_mi_col = mi_col; above_mi_col < end_col; + above_mi_col += mi_step) { + MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col; + mi_step = mi_size_wide[above_mi[0]->sb_type]; + int match_found = 0; + if (is_inter_block(*above_mi)) + match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *above_mi); + if (match_found) return 1; + } + return 0; +} + +static AOM_INLINE int find_ref_match_in_left_nbs(const int total_mi_rows, + MACROBLOCKD *xd) { + if (!xd->left_available) return 0; + const int mi_row = xd->mi_row; + MB_MODE_INFO **cur_mbmi = xd->mi; + // prev_col_mi points into the mi array, starting at the top of the + // previous column + MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride; + const int end_row = AOMMIN(mi_row + xd->height, total_mi_rows); + uint8_t mi_step; + for (int left_mi_row = mi_row; left_mi_row < end_row; + left_mi_row += mi_step) { + MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride; + mi_step = mi_size_high[left_mi[0]->sb_type]; + int match_found = 0; + if (is_inter_block(*left_mi)) + match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *left_mi); + if (match_found) return 1; + } + return 0; +} + +typedef struct { + int64_t best_inter_cost; + int64_t ref_inter_cost[INTER_REFS_PER_FRAME]; +} PruneInfoFromTpl; + +#if !CONFIG_REALTIME_ONLY +// TODO(Remya): Check if get_tpl_stats_b() can be reused +static AOM_INLINE void get_block_level_tpl_stats( + AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int *valid_refs, + PruneInfoFromTpl *inter_cost_info_from_tpl) { + const GF_GROUP *const gf_group = &cpi->gf_group; + AV1_COMMON *const cm = &cpi->common; + + assert(IMPLIES(gf_group->size > 0, gf_group->index < gf_group->size)); + const int tpl_idx = gf_group->index; + TplParams *const tpl_data = 
&cpi->tpl_data; + const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + if (tpl_idx >= MAX_LAG_BUFFERS || !tpl_frame->is_valid) { + return; + } + + const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + const int tpl_stride = tpl_frame->stride; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + for (int row = mi_row; row < AOMMIN(mi_row + mi_high, cm->mi_params.mi_rows); + row += step) { + for (int col = mi_col_sr; col < AOMMIN(mi_col_end_sr, mi_cols_sr); + col += step) { + const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + + // Sums up the inter cost of corresponding ref frames + for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) { + inter_cost_info_from_tpl->ref_inter_cost[ref_idx] += + this_stats->pred_error[ref_idx]; + } + } + } + + // Computes the best inter cost (minimum inter_cost) + int64_t best_inter_cost = INT64_MAX; + for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) { + const int64_t cur_inter_cost = + inter_cost_info_from_tpl->ref_inter_cost[ref_idx]; + // For invalid ref frames, cur_inter_cost = 0 and has to be handled while + // calculating the minimum inter_cost + if (cur_inter_cost != 0 && (cur_inter_cost < best_inter_cost) && + valid_refs[ref_idx]) + best_inter_cost = cur_inter_cost; + } + inter_cost_info_from_tpl->best_inter_cost = best_inter_cost; +} +#endif + +static AOM_INLINE int prune_modes_based_on_tpl_stats( + PruneInfoFromTpl *inter_cost_info_from_tpl, const int *refs, int ref_mv_idx, + const PREDICTION_MODE this_mode, int prune_mode_level) { + const int have_newmv = have_newmv_in_inter_mode(this_mode); + if ((prune_mode_level < 3) && have_newmv) return 0; + + static const int prune_level_idx[3] = { 0, 1, 1 }; + const int prune_level = prune_level_idx[prune_mode_level - 1]; + int64_t cur_inter_cost; + + const int is_globalmv = + (this_mode == GLOBALMV) || (this_mode == GLOBAL_GLOBALMV); + const int prune_index = is_globalmv ? MAX_REF_MV_SEARCH : ref_mv_idx; + + // Thresholds used for pruning: + // Lower value indicates aggressive pruning and higher value indicates + // conservative pruning which is set based on ref_mv_idx and speed feature. + // 'prune_index' 0, 1, 2 corresponds to ref_mv indices 0, 1 and 2. 
prune_index + // 3 corresponds to GLOBALMV/GLOBAL_GLOBALMV + static const int tpl_inter_mode_prune_mul_factor[2][MAX_REF_MV_SEARCH + 1] = { + { 3, 3, 3, 2 }, { 3, 2, 2, 2 } + }; + + const int is_comp_pred = (refs[1] > INTRA_FRAME); + if (!is_comp_pred) { + cur_inter_cost = inter_cost_info_from_tpl->ref_inter_cost[refs[0] - 1]; + } else { + const int64_t inter_cost_ref0 = + inter_cost_info_from_tpl->ref_inter_cost[refs[0] - 1]; + const int64_t inter_cost_ref1 = + inter_cost_info_from_tpl->ref_inter_cost[refs[1] - 1]; + // Choose maximum inter_cost among inter_cost_ref0 and inter_cost_ref1 for + // more aggressive pruning + cur_inter_cost = AOMMAX(inter_cost_ref0, inter_cost_ref1); + } + + // Prune the mode if cur_inter_cost is greater than threshold times + // best_inter_cost + const int64_t best_inter_cost = inter_cost_info_from_tpl->best_inter_cost; + if (cur_inter_cost > + ((tpl_inter_mode_prune_mul_factor[prune_level][prune_index] * + best_inter_cost) >> + 1)) + return 1; + return 0; +} + +static int64_t handle_inter_mode( + AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *x, + BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, int *disable_skip, HandleInterModeArgs *args, + int64_t ref_best_rd, uint8_t *const tmp_buf, + const CompoundTypeRdBuffers *rd_buffers, int64_t *best_est_rd, + const int do_tx_search, InterModesInfo *inter_modes_info, + motion_mode_candidate *motion_mode_cand, int64_t *skip_rd, + PruneInfoFromTpl *inter_cost_info_from_tpl) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int is_comp_pred = has_second_ref(mbmi); + const PREDICTION_MODE this_mode = mbmi->mode; + + const GF_GROUP *const gf_group = &cpi->gf_group; + const int tpl_idx = gf_group->index; + TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx]; + const int prune_modes_based_on_tpl = + cpi->sf.inter_sf.prune_inter_modes_based_on_tpl && + tpl_idx >= MAX_LAG_BUFFERS && tpl_frame->is_valid; + int i; + const int refs[2] = { mbmi->ref_frame[0], + (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; + int rate_mv = 0; + int64_t rd = INT64_MAX; + // do first prediction into the destination buffer. Do the next + // prediction into a temporary buffer. Then keep track of which one + // of these currently holds the best predictor, and use the other + // one for future predictions. In the end, copy from tmp_buf to + // dst if necessary. 
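+  // In sketch form, the strategy described above amounts to:
+  //   best = &orig_dst, scratch = &tmp_dst;
+  //   for each candidate: predict into *scratch; if better, swap the two;
+  //   at the end, copy from the tmp buffer into dst if best != &orig_dst.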
+ struct macroblockd_plane *p = xd->plane; + const BUFFER_SET orig_dst = { + { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, + { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, + }; + const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE, + tmp_buf + 2 * MAX_SB_SQUARE }, + { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } }; + + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params.enable_masked_compound; + int64_t ret_val = INT64_MAX; + const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; + int64_t best_rd = INT64_MAX; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + MB_MODE_INFO best_mbmi = *mbmi; + int best_disable_skip = 0; + int best_xskip = 0; + int64_t newmv_ret_val = INT64_MAX; + inter_mode_info mode_info[MAX_REF_MV_SEARCH]; + + int mode_search_mask = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) | + (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD); + + // Do not prune the mode based on inter cost from tpl if the current ref frame + // is the winner ref in neighbouring blocks. + int ref_match_found_in_above_nb = 0; + int ref_match_found_in_left_nb = 0; + if (prune_modes_based_on_tpl) { + ref_match_found_in_above_nb = + find_ref_match_in_above_nbs(cm->mi_params.mi_cols, xd); + ref_match_found_in_left_nb = + find_ref_match_in_left_nbs(cm->mi_params.mi_rows, xd); + } + + // First, perform a simple translation search for each of the indices. If + // an index performs well, it will be fully searched here. + const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode); + // Save MV results from first 2 ref_mv_idx. + int_mv save_mv[MAX_REF_MV_SEARCH - 1][2] = { { { 0 } } }; + int best_ref_mv_idx = -1; + const int idx_mask = ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd, + mode_info, bsize, ref_set); + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); + const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx); + const int base_rate = + args->ref_frame_cost + args->single_comp_cost + ref_mv_cost; + for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) { + mode_info[ref_mv_idx].full_search_mv.as_int = INVALID_MV; + mode_info[ref_mv_idx].mv.as_int = INVALID_MV; + mode_info[ref_mv_idx].rd = INT64_MAX; + + if (!mask_check_bit(idx_mask, ref_mv_idx)) { + // MV did not perform well in simple translation search. Skip it. + continue; + } + if (prune_modes_based_on_tpl && !ref_match_found_in_above_nb && + !ref_match_found_in_left_nb && (ref_best_rd != INT64_MAX)) { + if (prune_modes_based_on_tpl_stats( + inter_cost_info_from_tpl, refs, ref_mv_idx, this_mode, + cpi->sf.inter_sf.prune_inter_modes_based_on_tpl)) + continue; + } + av1_init_rd_stats(rd_stats); + + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME; + + mbmi->num_proj_ref = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->ref_mv_idx = ref_mv_idx; + + rd_stats->rate = base_rate; + const int drl_cost = + get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type); + rd_stats->rate += drl_cost; + mode_info[ref_mv_idx].drl_cost = drl_cost; + + int rs = 0; + int compmode_interinter_cost = 0; + + int_mv cur_mv[2]; + + // TODO(Cherma): Extend this speed feature to support compound mode + int skip_repeated_ref_mv = + is_comp_pred ? 
        0 : cpi->sf.inter_sf.skip_repeated_ref_mv;
+    if (!build_cur_mv(cur_mv, this_mode, cm, x, skip_repeated_ref_mv)) {
+      continue;
+    }
+
+    if (have_newmv_in_inter_mode(this_mode)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, handle_newmv_time);
+#endif
+      if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans &&
+          args->single_ref_first_pass == 0 && !is_comp_pred) {
+        const int ref0 = mbmi->ref_frame[0];
+        newmv_ret_val = args->single_newmv_valid[ref_mv_idx][ref0] ? 0 : 1;
+        cur_mv[0] = args->single_newmv[ref_mv_idx][ref0];
+        rate_mv = args->single_newmv_rate[ref_mv_idx][ref0];
+      } else {
+        newmv_ret_val =
+            handle_newmv(cpi, x, bsize, cur_mv, &rate_mv, args, mode_info);
+      }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, handle_newmv_time);
+#endif
+
+      if (newmv_ret_val != 0) continue;
+
+      rd_stats->rate += rate_mv;
+
+      if (cpi->sf.inter_sf.skip_repeated_newmv) {
+        if (!is_comp_pred && this_mode == NEWMV && ref_mv_idx > 0) {
+          int skip = 0;
+          int this_rate_mv = 0;
+          for (i = 0; i < ref_mv_idx; ++i) {
+            // Check if the motion search result is the same as a previous
+            // result
+            if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int &&
+                args->single_newmv_valid[i][refs[0]]) {
+              // If the compared mode has no valid rd, it is unlikely this
+              // mode will be the best mode
+              if (mode_info[i].rd == INT64_MAX) {
+                skip = 1;
+                break;
+              }
+              // Compare the cost difference, including the drl cost and the
+              // mv cost
+              if (mode_info[i].mv.as_int != INVALID_MV) {
+                const int compare_cost =
+                    mode_info[i].rate_mv + mode_info[i].drl_cost;
+                const int_mv ref_mv = av1_get_ref_mv(x, 0);
+                this_rate_mv = av1_mv_bit_cost(
+                    &mode_info[i].mv.as_mv, &ref_mv.as_mv, x->nmv_vec_cost,
+                    x->mv_cost_stack, MV_COST_WEIGHT);
+                const int this_cost = this_rate_mv + drl_cost;
+
+                if (compare_cost <= this_cost) {
+                  skip = 1;
+                  break;
+                } else {
+                  // If the cost is less than the current best result, make
+                  // this the best and update the corresponding variables,
+                  // unless the best_mv is the same as ref_mv.
In this case we skip and + // rely on NEAR(EST)MV instead + if (best_mbmi.ref_mv_idx == i && + mode_info[i].mv.as_int != ref_mv.as_int) { + assert(best_rd != INT64_MAX); + best_mbmi.ref_mv_idx = ref_mv_idx; + motion_mode_cand->rate_mv = this_rate_mv; + best_rd_stats.rate += this_cost - compare_cost; + best_rd = RDCOST(x->rdmult, best_rd_stats.rate, + best_rd_stats.dist); + if (best_rd < ref_best_rd) ref_best_rd = best_rd; + break; + } + } + } + } + } + if (skip) { + const THR_MODES mode_enum = get_prediction_mode_idx( + best_mbmi.mode, best_mbmi.ref_frame[0], best_mbmi.ref_frame[1]); + // Collect mode stats for multiwinner mode processing + store_winner_mode_stats( + &cpi->common, x, &best_mbmi, &best_rd_stats, &best_rd_stats_y, + &best_rd_stats_uv, mode_enum, NULL, bsize, best_rd, + cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, + do_tx_search); + args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = + args->modelled_rd[this_mode][i][refs[0]]; + args->simple_rd[this_mode][ref_mv_idx][refs[0]] = + args->simple_rd[this_mode][i][refs[0]]; + mode_info[ref_mv_idx].rd = mode_info[i].rd; + mode_info[ref_mv_idx].rate_mv = this_rate_mv; + mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int; + + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } + } + } + } + for (i = 0; i < is_comp_pred + 1; ++i) { + mbmi->mv[i].as_int = cur_mv[i].as_int; + } + + if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && + mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { + continue; + } + + if (cpi->sf.inter_sf.prune_ref_mv_idx_search && is_comp_pred) { + // TODO(yunqing): Move this part to a separate function when it is done. + // Store MV result. + if (ref_mv_idx < MAX_REF_MV_SEARCH - 1) { + for (i = 0; i < is_comp_pred + 1; ++i) + save_mv[ref_mv_idx][i].as_int = mbmi->mv[i].as_int; + } + // Skip the evaluation if an MV match is found. + if (ref_mv_idx > 0) { + int match = 0; + for (int idx = 0; idx < ref_mv_idx; ++idx) { + int mv_diff = 0; + for (i = 0; i < 1 + is_comp_pred; ++i) { + mv_diff += abs(save_mv[idx][i].as_mv.row - mbmi->mv[i].as_mv.row) + + abs(save_mv[idx][i].as_mv.col - mbmi->mv[i].as_mv.col); + } + + // If this mode is not the best one, and current MV is similar to + // previous stored MV, terminate this ref_mv_idx evaluation. 
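+          // Note: mv_diff accumulates |d_row| + |d_col| in 1/8-pel units over
+          // both MVs, so the mv_diff < 1 test below fires only when the
+          // current MVs are bit-identical to a previously stored pair, and
+          // only while no earlier index has improved ref_best_rd
+          // (best_ref_mv_idx == -1).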
+          if (best_ref_mv_idx == -1 && mv_diff < 1) {
+            match = 1;
+            break;
+          }
+        }
+        if (match == 1) continue;
+      }
+    }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, compound_type_rd_time);
+#endif
+    int skip_build_pred = 0;
+    const int mi_row = xd->mi_row;
+    const int mi_col = xd->mi_col;
+    if (is_comp_pred) {
+      // Find a matching interp filter or set to the default interp filter
+      const int need_search = av1_is_interp_needed(xd);
+      const InterpFilter assign_filter = cm->features.interp_filter;
+      int is_luma_interp_done = 0;
+      av1_find_interp_filter_match(mbmi, cpi, assign_filter, need_search,
+                                   args->interp_filter_stats,
+                                   args->interp_filter_stats_idx);
+
+      int64_t best_rd_compound;
+      int64_t rd_thresh;
+      const int comp_type_rd_shift = COMP_TYPE_RD_THRESH_SHIFT;
+      const int comp_type_rd_scale = COMP_TYPE_RD_THRESH_SCALE;
+      rd_thresh = get_rd_thresh_from_best_rd(
+          ref_best_rd, (1 << comp_type_rd_shift), comp_type_rd_scale);
+      compmode_interinter_cost = av1_compound_type_rd(
+          cpi, x, bsize, cur_mv, mode_search_mask, masked_compound_used,
+          &orig_dst, &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound,
+          rd_stats, ref_best_rd, skip_rd[1], &is_luma_interp_done, rd_thresh);
+      if (ref_best_rd < INT64_MAX &&
+          (best_rd_compound >> comp_type_rd_shift) * comp_type_rd_scale >
+              ref_best_rd) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        continue;
+      }
+      // No need to call av1_enc_build_inter_predictor for luma if
+      // COMPOUND_AVERAGE is selected: it is the first candidate in
+      // av1_compound_type_rd, and the subsequent compound-type search uses
+      // the tmp_dst buffer.
+
+      if (mbmi->interinter_comp.type == COMPOUND_AVERAGE &&
+          is_luma_interp_done) {
+        if (num_planes > 1) {
+          av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
+                                        bsize, AOM_PLANE_U, num_planes - 1);
+        }
+        skip_build_pred = 1;
+      }
+    }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, compound_type_rd_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, interpolation_filter_search_time);
+#endif
+    ret_val = av1_interpolation_filter_search(
+        x, cpi, tile_data, bsize, &tmp_dst, &orig_dst, &rd, &rs,
+        &skip_build_pred, args, ref_best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, interpolation_filter_search_time);
+#endif
+    if (args->modelled_rd != NULL && !is_comp_pred) {
+      args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
+    }
+    if (ret_val != 0) {
+      restore_dst_buf(xd, orig_dst, num_planes);
+      continue;
+    } else if (cpi->sf.inter_sf.model_based_post_interp_filter_breakout &&
+               ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
+      restore_dst_buf(xd, orig_dst, num_planes);
+      continue;
+    }
+
+    if (args->modelled_rd != NULL) {
+      if (is_comp_pred) {
+        const int mode0 = compound_ref0_mode(this_mode);
+        const int mode1 = compound_ref1_mode(this_mode);
+        const int64_t mrd =
+            AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+                   args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+        if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
+          restore_dst_buf(xd, orig_dst, num_planes);
+          continue;
+        }
+      }
+    }
+    rd_stats->rate += compmode_interinter_cost;
+    if (skip_build_pred != 1) {
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize,
+                                    0, av1_num_planes(cm) - 1);
+    }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, motion_mode_rd_time);
+#endif
+    int rate2_nocoeff = rd_stats->rate;
+    ret_val = motion_mode_rd(cpi, tile_data, x, bsize, rd_stats, rd_stats_y,
+                             rd_stats_uv, disable_skip, args, ref_best_rd,
+                             skip_rd, &rate_mv, &orig_dst,
best_est_rd, + do_tx_search, inter_modes_info, 0); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, motion_mode_rd_time); +#endif + + mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int; + mode_info[ref_mv_idx].rate_mv = rate_mv; + if (ret_val != INT64_MAX) { + int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + mode_info[ref_mv_idx].rd = tmp_rd; + const THR_MODES mode_enum = get_prediction_mode_idx( + mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + // Collect mode stats for multiwinner mode processing + store_winner_mode_stats( + &cpi->common, x, mbmi, rd_stats, rd_stats_y, rd_stats_uv, mode_enum, + NULL, bsize, tmp_rd, + cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, do_tx_search); + if (tmp_rd < best_rd) { + best_rd_stats = *rd_stats; + best_rd_stats_y = *rd_stats_y; + best_rd_stats_uv = *rd_stats_uv; + best_rd = tmp_rd; + best_mbmi = *mbmi; + best_disable_skip = *disable_skip; + best_xskip = x->force_skip; + memcpy(best_blk_skip, x->blk_skip, + sizeof(best_blk_skip[0]) * xd->height * xd->width); + av1_copy_array(best_tx_type_map, xd->tx_type_map, + xd->height * xd->width); + motion_mode_cand->rate_mv = rate_mv; + motion_mode_cand->rate2_nocoeff = rate2_nocoeff; + } + + if (tmp_rd < ref_best_rd) { + ref_best_rd = tmp_rd; + best_ref_mv_idx = ref_mv_idx; + } + } + restore_dst_buf(xd, orig_dst, num_planes); + } + + if (best_rd == INT64_MAX) return INT64_MAX; + + // re-instate status of the best choice + *rd_stats = best_rd_stats; + *rd_stats_y = best_rd_stats_y; + *rd_stats_uv = best_rd_stats_uv; + *mbmi = best_mbmi; + *disable_skip = best_disable_skip; + x->force_skip = best_xskip; + assert(IMPLIES(mbmi->comp_group_idx == 1, + mbmi->interinter_comp.type != COMPOUND_AVERAGE)); + memcpy(x->blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * xd->height * xd->width); + av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width); + + rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + + return rd_stats->rdcost; +} + +static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, + PICK_MODE_CONTEXT *ctx, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + if (!av1_allow_intrabc(cm) || !cpi->oxcf.enable_intrabc) return INT64_MAX; + const int num_planes = av1_num_planes(cm); + + MACROBLOCKD *const xd = &x->e_mbd; + const TileInfo *tile = &xd->tile; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int sb_row = mi_row >> cm->seq_params.mib_size_log2; + const int sb_col = mi_col >> cm->seq_params.mib_size_log2; + + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + MV_REFERENCE_FRAME ref_frame = INTRA_FRAME; + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. + av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); + int_mv nearestmv, nearmv; + av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv, + 0); + + if (nearestmv.as_int == INVALID_MV) { + nearestmv.as_int = 0; + } + if (nearmv.as_int == INVALID_MV) { + nearmv.as_int = 0; + } + + int_mv dv_ref = nearestmv.as_int == 0 ? 
nearmv : nearestmv; + if (dv_ref.as_int == 0) { + av1_find_ref_dv(&dv_ref, tile, cm->seq_params.mib_size, mi_row); + } + // Ref DV should not have sub-pel. + assert((dv_ref.as_mv.col & 7) == 0); + assert((dv_ref.as_mv.row & 7) == 0); + mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv = dv_ref; + + struct buf_2d yv12_mb[MAX_MB_PLANE]; + av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, NULL, NULL, num_planes); + for (int i = 0; i < num_planes; ++i) { + xd->plane[i].pre[0] = yv12_mb[i]; + } + + enum IntrabcMotionDirection { + IBC_MOTION_ABOVE, + IBC_MOTION_LEFT, + IBC_MOTION_DIRECTIONS + }; + + MB_MODE_INFO best_mbmi = *mbmi; + RD_STATS best_rdstats = *rd_stats; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + + FULLPEL_MOTION_SEARCH_PARAMS fullms_params; + const search_site_config *lookahead_search_sites = + &cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD]; + av1_make_default_fullpel_ms_params(&fullms_params, cpi, x, bsize, + &dv_ref.as_mv, lookahead_search_sites); + fullms_params.is_intra_mode = 1; + + for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE; + dir < IBC_MOTION_DIRECTIONS; ++dir) { + switch (dir) { + case IBC_MOTION_ABOVE: + fullms_params.mv_limits.col_min = + (tile->mi_col_start - mi_col) * MI_SIZE; + fullms_params.mv_limits.col_max = + (tile->mi_col_end - mi_col) * MI_SIZE - w; + fullms_params.mv_limits.row_min = + (tile->mi_row_start - mi_row) * MI_SIZE; + fullms_params.mv_limits.row_max = + (sb_row * cm->seq_params.mib_size - mi_row) * MI_SIZE - h; + break; + case IBC_MOTION_LEFT: + fullms_params.mv_limits.col_min = + (tile->mi_col_start - mi_col) * MI_SIZE; + fullms_params.mv_limits.col_max = + (sb_col * cm->seq_params.mib_size - mi_col) * MI_SIZE - w; + // TODO(aconverse@google.com): Minimize the overlap between above and + // left areas. 
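+        // In rough terms: the ABOVE pass searches the full tile width in
+        // rows strictly above the current superblock row, while the LEFT
+        // pass searches columns left of the current superblock down to the
+        // bottom of the current SB row, so the two regions overlap in the
+        // top-left area.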
+        fullms_params.mv_limits.row_min =
+            (tile->mi_row_start - mi_row) * MI_SIZE;
+        int bottom_coded_mi_edge =
+            AOMMIN((sb_row + 1) * cm->seq_params.mib_size, tile->mi_row_end);
+        fullms_params.mv_limits.row_max =
+            (bottom_coded_mi_edge - mi_row) * MI_SIZE - h;
+        break;
+      default: assert(0);
+    }
+    // Save the tile/SB-derived limits so we can verify that
+    // av1_set_mv_search_range() only ever tightens them.
+    const FullMvLimits tmp_mv_limits = fullms_params.mv_limits;
+    (void)tmp_mv_limits;  // Quiet unused warnings when asserts compile out.
+    av1_set_mv_search_range(&fullms_params.mv_limits, &dv_ref.as_mv);
+    assert(fullms_params.mv_limits.col_min >= tmp_mv_limits.col_min);
+    assert(fullms_params.mv_limits.col_max <= tmp_mv_limits.col_max);
+    assert(fullms_params.mv_limits.row_min >= tmp_mv_limits.row_min);
+    assert(fullms_params.mv_limits.row_max <= tmp_mv_limits.row_max);
+
+    if (fullms_params.mv_limits.col_max < fullms_params.mv_limits.col_min ||
+        fullms_params.mv_limits.row_max < fullms_params.mv_limits.row_min) {
+      continue;
+    }
+
+    const int step_param = cpi->mv_search_params.mv_step_param;
+    const FULLPEL_MV start_mv = get_fullmv_from_mv(&dv_ref.as_mv);
+    IntraBCHashInfo *intrabc_hash_info = &x->intrabc_hash_info;
+    int_mv best_mv, best_hash_mv;
+
+    int bestsme = av1_full_pixel_search(start_mv, &fullms_params, step_param,
+                                        NULL, &best_mv.as_fullmv, NULL);
+    const int hashsme = av1_intrabc_hash_search(
+        cpi, xd, &fullms_params, intrabc_hash_info, &best_hash_mv.as_fullmv);
+    if (hashsme < bestsme) {
+      best_mv = best_hash_mv;
+      bestsme = hashsme;
+    }
+
+    if (bestsme == INT_MAX) continue;
+    const MV dv = get_mv_from_fullmv(&best_mv.as_fullmv);
+    if (!av1_is_fullmv_in_range(&fullms_params.mv_limits,
+                                get_fullmv_from_mv(&dv)))
+      continue;
+    if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize,
+                         cm->seq_params.mib_size_log2))
+      continue;
+
+    // DV should not have sub-pel.
+    assert((dv.col & 7) == 0);
+    assert((dv.row & 7) == 0);
+    memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info));
+    mbmi->filter_intra_mode_info.use_filter_intra = 0;
+    mbmi->use_intrabc = 1;
+    mbmi->mode = DC_PRED;
+    mbmi->uv_mode = UV_DC_PRED;
+    mbmi->motion_mode = SIMPLE_TRANSLATION;
+    mbmi->mv[0].as_mv = dv;
+    mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+    mbmi->skip = 0;
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                  av1_num_planes(cm) - 1);
+
+    const IntraBCMVCosts *const dv_costs = &cpi->dv_costs;
+    int *dvcost[2] = { (int *)&dv_costs->mv_component[0][MV_MAX],
+                       (int *)&dv_costs->mv_component[1][MV_MAX] };
+    // TODO(aconverse@google.com): The full motion field defining discount
+    // in MV_COST_WEIGHT is too large. Explore other values.
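+    // A sketch of the rate term computed below, assuming av1_mv_bit_cost()
+    // keeps its usual joint-plus-component shape:
+    //   mv_cost = joint_cost[joint(dv - dv_ref)] +
+    //             dvcost[0][diff.row] + dvcost[1][diff.col];
+    //   rate_mv = ROUND_POWER_OF_TWO(mv_cost * MV_COST_WEIGHT_SUB, 7);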
+ const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, dv_costs->joint_mv, + dvcost, MV_COST_WEIGHT_SUB); + const int rate_mode = x->intrabc_cost[1]; + RD_STATS rd_stats_yuv, rd_stats_y, rd_stats_uv; + if (!av1_txfm_search(cpi, x, bsize, &rd_stats_yuv, &rd_stats_y, + &rd_stats_uv, rate_mode + rate_mv, INT64_MAX)) + continue; + rd_stats_yuv.rdcost = + RDCOST(x->rdmult, rd_stats_yuv.rate, rd_stats_yuv.dist); + if (rd_stats_yuv.rdcost < best_rd) { + best_rd = rd_stats_yuv.rdcost; + best_mbmi = *mbmi; + best_rdstats = rd_stats_yuv; + memcpy(best_blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width); + } + } + *mbmi = best_mbmi; + *rd_stats = best_rdstats; + memcpy(x->blk_skip, best_blk_skip, + sizeof(x->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); +#if CONFIG_RD_DEBUG + mbmi->rd_stats = *rd_stats; +#endif + return best_rd; +} + +void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int num_planes = av1_num_planes(cm); + int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; + int y_skip = 0, uv_skip = 0; + int64_t dist_y = 0, dist_uv = 0; + + ctx->rd_stats.skip = 0; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->use_intrabc = 0; + mbmi->mv[0].as_int = 0; + mbmi->skip_mode = 0; + + const int64_t intra_yrd = + av1_rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, + &y_skip, bsize, best_rd, ctx); + + // Initialize default mode evaluation params + set_mode_eval_params(cpi, x, DEFAULT_EVAL); + + if (intra_yrd < best_rd) { + // Only store reconstructed luma when there's chroma RDO. When there's no + // chroma RDO, the reconstructed luma will be stored in encode_superblock(). + xd->cfl.store_y = store_cfl_required_rdo(cm, x); + if (xd->cfl.store_y) { + // Restore reconstructed luma values. 
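+      // The winner's blk_skip and tx_type_map are copied back from ctx below
+      // so that the DRY_RUN_NORMAL luma encode reproduces exactly the
+      // reconstruction that the RD search scored; CfL then uses that luma
+      // for chroma RDO.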
+ memcpy(x->blk_skip, ctx->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk); + av1_encode_intra_block_plane(cpi, x, bsize, AOM_PLANE_Y, DRY_RUN_NORMAL, + cpi->optimize_seg_arr[mbmi->segment_id]); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + xd->cfl.store_y = 0; + } + if (num_planes > 1) { + init_sbuv_mode(mbmi); + if (xd->is_chroma_ref) { + const TX_SIZE max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); + av1_rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, + &dist_uv, &uv_skip, bsize, max_uv_tx_size); + } + } + + // Intra block is always coded as non-skip + rd_cost->rate = + rate_y + rate_uv + x->skip_cost[av1_get_skip_context(xd)][0]; + rd_cost->dist = dist_y + dist_uv; + rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); + rd_cost->skip = 0; + } else { + rd_cost->rate = INT_MAX; + } + + if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd) + best_rd = rd_cost->rdcost; + if (rd_pick_intrabc_mode_sb(cpi, x, ctx, rd_cost, bsize, best_rd) < best_rd) { + ctx->rd_stats.skip = mbmi->skip; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + assert(rd_cost->rate != INT_MAX); + } + if (rd_cost->rate == INT_MAX) return; + + ctx->mic = *xd->mi[0]; + av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, x->mbmi_ext, + av1_ref_frame_type(xd->mi[0]->ref_frame)); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); +} + +static AOM_INLINE void calc_target_weighted_pred( + const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd, + const uint8_t *above, int above_stride, const uint8_t *left, + int left_stride); + +static AOM_INLINE void rd_pick_skip_mode( + RD_STATS *rd_cost, InterModeSearchState *search_state, + const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { + const AV1_COMMON *const cm = &cpi->common; + const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + + x->compound_idx = 1; // COMPOUND_AVERAGE + RD_STATS skip_mode_rd_stats; + av1_invalid_rd_stats(&skip_mode_rd_stats); + + if (skip_mode_info->ref_frame_idx_0 == INVALID_IDX || + skip_mode_info->ref_frame_idx_1 == INVALID_IDX) { + return; + } + + const MV_REFERENCE_FRAME ref_frame = + LAST_FRAME + skip_mode_info->ref_frame_idx_0; + const MV_REFERENCE_FRAME second_ref_frame = + LAST_FRAME + skip_mode_info->ref_frame_idx_1; + const PREDICTION_MODE this_mode = NEAREST_NEARESTMV; + const THR_MODES mode_index = + get_prediction_mode_idx(this_mode, ref_frame, second_ref_frame); + + if (mode_index == THR_INVALID) { + return; + } + + if ((!cpi->oxcf.enable_onesided_comp || + cpi->sf.inter_sf.disable_onesided_comp) && + cpi->all_one_sided_refs) { + return; + } + + mbmi->mode = this_mode; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = ref_frame; + mbmi->ref_frame[1] = second_ref_frame; + const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + if (x->mbmi_ext->ref_mv_count[ref_frame_type] == UINT8_MAX) { + if (x->mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX || + x->mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) { + return; + } + MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext; + av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + 
mbmi_ext->mode_context);
+    // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+    // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+    av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_type);
+  }
+
+  assert(this_mode == NEAREST_NEARESTMV);
+  if (!build_cur_mv(mbmi->mv, this_mode, cm, x, 0)) {
+    return;
+  }
+
+  mbmi->filter_intra_mode_info.use_filter_intra = 0;
+  mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+  mbmi->comp_group_idx = 0;
+  mbmi->compound_idx = x->compound_idx;
+  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->ref_mv_idx = 0;
+  mbmi->skip_mode = mbmi->skip = 1;
+
+  set_default_interp_filters(mbmi, cm->features.interp_filter);
+
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+  for (int i = 0; i < num_planes; i++) {
+    xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+    xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+  }
+
+  BUFFER_SET orig_dst;
+  for (int i = 0; i < num_planes; i++) {
+    orig_dst.plane[i] = xd->plane[i].dst.buf;
+    orig_dst.stride[i] = xd->plane[i].dst.stride;
+  }
+
+  // Obtain the rdcost for skip_mode.
+  skip_mode_rd(&skip_mode_rd_stats, cpi, x, bsize, &orig_dst);
+
+  // Compare the use of skip_mode with the best intra/inter mode obtained.
+  const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+  int64_t best_intra_inter_mode_cost = INT64_MAX;
+  if (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX) {
+    best_intra_inter_mode_cost =
+        RDCOST(x->rdmult, rd_cost->rate + x->skip_mode_cost[skip_mode_ctx][0],
+               rd_cost->dist);
+    // Account for non-skip mode rate in total rd stats.
+    rd_cost->rate += x->skip_mode_cost[skip_mode_ctx][0];
+    av1_rd_cost_update(x->rdmult, rd_cost);
+  }
+
+  if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost &&
+      (!xd->lossless[mbmi->segment_id] || skip_mode_rd_stats.dist == 0)) {
+    assert(mode_index != THR_INVALID);
+    search_state->best_mbmode = *mbmi;
+
+    search_state->best_mbmode.skip_mode = search_state->best_mbmode.skip = 1;
+    search_state->best_mbmode.mode = NEAREST_NEARESTMV;
+    search_state->best_mbmode.ref_frame[0] = mbmi->ref_frame[0];
+    search_state->best_mbmode.ref_frame[1] = mbmi->ref_frame[1];
+    search_state->best_mbmode.mv[0].as_int = mbmi->mv[0].as_int;
+    search_state->best_mbmode.mv[1].as_int = mbmi->mv[1].as_int;
+    search_state->best_mbmode.ref_mv_idx = 0;
+
+    // Set up tx_size related variables for skip-specific loop filtering.
+    search_state->best_mbmode.tx_size =
+        block_signals_txsize(bsize)
+            ? tx_size_from_tx_mode(bsize, x->tx_mode_search_type)
+            : max_txsize_rect_lookup[bsize];
+    memset(search_state->best_mbmode.inter_tx_size,
+           search_state->best_mbmode.tx_size,
+           sizeof(search_state->best_mbmode.inter_tx_size));
+    set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->width, xd->height,
+                  search_state->best_mbmode.skip && is_inter_block(mbmi), xd);
+
+    // Set up color-related variables for skip mode.
+ search_state->best_mbmode.uv_mode = UV_DC_PRED; + search_state->best_mbmode.palette_mode_info.palette_size[0] = 0; + search_state->best_mbmode.palette_mode_info.palette_size[1] = 0; + + search_state->best_mbmode.comp_group_idx = 0; + search_state->best_mbmode.compound_idx = x->compound_idx; + search_state->best_mbmode.interinter_comp.type = COMPOUND_AVERAGE; + search_state->best_mbmode.motion_mode = SIMPLE_TRANSLATION; + + search_state->best_mbmode.interintra_mode = + (INTERINTRA_MODE)(II_DC_PRED - 1); + search_state->best_mbmode.filter_intra_mode_info.use_filter_intra = 0; + + set_default_interp_filters(&search_state->best_mbmode, + cm->features.interp_filter); + + search_state->best_mode_index = mode_index; + + // Update rd_cost + rd_cost->rate = skip_mode_rd_stats.rate; + rd_cost->dist = rd_cost->sse = skip_mode_rd_stats.dist; + rd_cost->rdcost = skip_mode_rd_stats.rdcost; + + search_state->best_rd = rd_cost->rdcost; + search_state->best_skip2 = 1; + search_state->best_mode_skippable = 1; + + x->force_skip = 1; + } +} + +// Get winner mode stats of given mode index +static AOM_INLINE MB_MODE_INFO *get_winner_mode_stats( + MACROBLOCK *x, MB_MODE_INFO *best_mbmode, RD_STATS *best_rd_cost, + int best_rate_y, int best_rate_uv, THR_MODES *best_mode_index, + RD_STATS **winner_rd_cost, int *winner_rate_y, int *winner_rate_uv, + THR_MODES *winner_mode_index, int enable_multiwinner_mode_process, + int mode_idx) { + MB_MODE_INFO *winner_mbmi; + if (enable_multiwinner_mode_process) { + assert(mode_idx >= 0 && mode_idx < x->winner_mode_count); + WinnerModeStats *winner_mode_stat = &x->winner_mode_stats[mode_idx]; + winner_mbmi = &winner_mode_stat->mbmi; + + *winner_rd_cost = &winner_mode_stat->rd_cost; + *winner_rate_y = winner_mode_stat->rate_y; + *winner_rate_uv = winner_mode_stat->rate_uv; + *winner_mode_index = winner_mode_stat->mode_index; + } else { + winner_mbmi = best_mbmode; + *winner_rd_cost = best_rd_cost; + *winner_rate_y = best_rate_y; + *winner_rate_uv = best_rate_uv; + *winner_mode_index = *best_mode_index; + } + return winner_mbmi; +} + +// speed feature: fast intra/inter transform type search +// Used for speed >= 2 +// When this speed feature is on, in rd mode search, only DCT is used. +// After the mode is determined, this function is called, to select +// transform types and get accurate rdcost. 
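+// With multi-winner mode processing enabled, up to winner_mode_count stored
+// candidates are re-evaluated below; otherwise only the single best mode is
+// refined (see get_winner_mode_stats()).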
+static AOM_INLINE void refine_winner_mode_tx( + const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, THR_MODES *best_mode_index, + MB_MODE_INFO *best_mbmode, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], + int best_rate_y, int best_rate_uv, int *best_skip2, int winner_mode_count) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int64_t best_rd; + const int num_planes = av1_num_planes(cm); + + if (!is_winner_mode_processing_enabled(cpi, best_mbmode, best_mbmode->mode)) + return; + + // Set params for winner mode evaluation + set_mode_eval_params(cpi, x, WINNER_MODE_EVAL); + + // No best mode identified so far + if (*best_mode_index == THR_INVALID) return; + + best_rd = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); + for (int mode_idx = 0; mode_idx < winner_mode_count; mode_idx++) { + RD_STATS *winner_rd_stats = NULL; + int winner_rate_y = 0, winner_rate_uv = 0; + THR_MODES winner_mode_index = 0; + + // TODO(any): Combine best mode and multi-winner mode processing paths + // Get winner mode stats for current mode index + MB_MODE_INFO *winner_mbmi = get_winner_mode_stats( + x, best_mbmode, rd_cost, best_rate_y, best_rate_uv, best_mode_index, + &winner_rd_stats, &winner_rate_y, &winner_rate_uv, &winner_mode_index, + cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, mode_idx); + + if (xd->lossless[winner_mbmi->segment_id] == 0 && + winner_mode_index != THR_INVALID && + is_winner_mode_processing_enabled(cpi, winner_mbmi, + winner_mbmi->mode)) { + RD_STATS rd_stats = *winner_rd_stats; + int skip_blk = 0; + RD_STATS rd_stats_y, rd_stats_uv; + const int skip_ctx = av1_get_skip_context(xd); + + *mbmi = *winner_mbmi; + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Select prediction reference frames. 
+ for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + if (has_second_ref(mbmi)) + xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + if (is_inter_mode(mbmi->mode)) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); + if (mbmi->motion_mode == OBMC_CAUSAL) + av1_build_obmc_inter_predictors_sb(cm, xd); + + av1_subtract_plane(x, bsize, 0); + if (x->tx_mode_search_type == TX_MODE_SELECT && + !xd->lossless[mbmi->segment_id]) { + av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, + INT64_MAX); + assert(rd_stats_y.rate != INT_MAX); + } else { + av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, + INT64_MAX); + memset(mbmi->inter_tx_size, mbmi->tx_size, + sizeof(mbmi->inter_tx_size)); + for (int i = 0; i < xd->height * xd->width; ++i) + set_blk_skip(x, 0, i, rd_stats_y.skip); + } + } else { + av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, + INT64_MAX); + } + + if (num_planes > 1) { + av1_txfm_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); + } else { + av1_init_rd_stats(&rd_stats_uv); + } + + if (is_inter_mode(mbmi->mode) && + RDCOST(x->rdmult, + x->skip_cost[skip_ctx][0] + rd_stats_y.rate + rd_stats_uv.rate, + (rd_stats_y.dist + rd_stats_uv.dist)) > + RDCOST(x->rdmult, x->skip_cost[skip_ctx][1], + (rd_stats_y.sse + rd_stats_uv.sse))) { + skip_blk = 1; + rd_stats_y.rate = x->skip_cost[skip_ctx][1]; + rd_stats_uv.rate = 0; + rd_stats_y.dist = rd_stats_y.sse; + rd_stats_uv.dist = rd_stats_uv.sse; + } else { + skip_blk = 0; + rd_stats_y.rate += x->skip_cost[skip_ctx][0]; + } + int this_rate = rd_stats.rate + rd_stats_y.rate + rd_stats_uv.rate - + winner_rate_y - winner_rate_uv; + int64_t this_rd = + RDCOST(x->rdmult, this_rate, (rd_stats_y.dist + rd_stats_uv.dist)); + if (best_rd > this_rd) { + *best_mbmode = *mbmi; + *best_mode_index = winner_mode_index; + av1_copy_array(ctx->blk_skip, x->blk_skip, ctx->num_4x4_blk); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + rd_cost->rate = this_rate; + rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist; + rd_cost->sse = rd_stats_y.sse + rd_stats_uv.sse; + rd_cost->rdcost = this_rd; + best_rd = this_rd; + *best_skip2 = skip_blk; + } + } + } +} + +typedef struct { + // Mask for each reference frame, specifying which prediction modes to NOT try + // during search. + uint32_t pred_modes[REF_FRAMES]; + // If ref_combo[i][j + 1] is true, do NOT try prediction using combination of + // reference frames (i, j). + // Note: indexing with 'j + 1' is due to the fact that 2nd reference can be -1 + // (NONE_FRAME). + bool ref_combo[REF_FRAMES][REF_FRAMES + 1]; +} mode_skip_mask_t; + +// Update 'ref_combo' mask to disable given 'ref' in single and compound modes. +static AOM_INLINE void disable_reference( + MV_REFERENCE_FRAME ref, bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) { + for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) { + ref_combo[ref][ref2 + 1] = true; + } +} + +// Update 'ref_combo' mask to disable all inter references except ALTREF. 
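+// (ALTREF_FRAME itself and INTRA_FRAME are the only references left enabled.)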
+static AOM_INLINE void disable_inter_references_except_altref( + bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) { + disable_reference(LAST_FRAME, ref_combo); + disable_reference(LAST2_FRAME, ref_combo); + disable_reference(LAST3_FRAME, ref_combo); + disable_reference(GOLDEN_FRAME, ref_combo); + disable_reference(BWDREF_FRAME, ref_combo); + disable_reference(ALTREF2_FRAME, ref_combo); +} + +static const MV_REFERENCE_FRAME reduced_ref_combos[][2] = { + { LAST_FRAME, NONE_FRAME }, { ALTREF_FRAME, NONE_FRAME }, + { LAST_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, NONE_FRAME }, + { INTRA_FRAME, NONE_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME }, + { LAST_FRAME, GOLDEN_FRAME }, { LAST_FRAME, INTRA_FRAME }, + { LAST_FRAME, BWDREF_FRAME }, { LAST_FRAME, LAST3_FRAME }, + { GOLDEN_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, INTRA_FRAME }, + { BWDREF_FRAME, NONE_FRAME }, { BWDREF_FRAME, ALTREF_FRAME }, + { ALTREF_FRAME, INTRA_FRAME }, { BWDREF_FRAME, INTRA_FRAME }, +}; + +static const MV_REFERENCE_FRAME real_time_ref_combos[][2] = { + { LAST_FRAME, NONE_FRAME }, + { ALTREF_FRAME, NONE_FRAME }, + { GOLDEN_FRAME, NONE_FRAME }, + { INTRA_FRAME, NONE_FRAME } +}; + +typedef enum { REF_SET_FULL, REF_SET_REDUCED, REF_SET_REALTIME } REF_SET; + +static AOM_INLINE void default_skip_mask(mode_skip_mask_t *mask, + REF_SET ref_set) { + if (ref_set == REF_SET_FULL) { + // Everything available by default. + memset(mask, 0, sizeof(*mask)); + } else { + // All modes available by default. + memset(mask->pred_modes, 0, sizeof(mask->pred_modes)); + // All references disabled first. + for (MV_REFERENCE_FRAME ref1 = INTRA_FRAME; ref1 < REF_FRAMES; ++ref1) { + for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) { + mask->ref_combo[ref1][ref2 + 1] = true; + } + } + const MV_REFERENCE_FRAME(*ref_set_combos)[2]; + int num_ref_combos; + + // Then enable reduced set of references explicitly. + switch (ref_set) { + case REF_SET_REDUCED: + ref_set_combos = reduced_ref_combos; + num_ref_combos = + (int)sizeof(reduced_ref_combos) / sizeof(reduced_ref_combos[0]); + break; + case REF_SET_REALTIME: + ref_set_combos = real_time_ref_combos; + num_ref_combos = + (int)sizeof(real_time_ref_combos) / sizeof(real_time_ref_combos[0]); + break; + default: assert(0); num_ref_combos = 0; + } + + for (int i = 0; i < num_ref_combos; ++i) { + const MV_REFERENCE_FRAME *const this_combo = ref_set_combos[i]; + mask->ref_combo[this_combo[0]][this_combo[1] + 1] = false; + } + } +} + +static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + unsigned char segment_id = mbmi->segment_id; + const SPEED_FEATURES *const sf = &cpi->sf; + REF_SET ref_set = REF_SET_FULL; + + if (sf->rt_sf.use_real_time_ref_set) + ref_set = REF_SET_REALTIME; + else if (cpi->oxcf.enable_reduced_reference_set) + ref_set = REF_SET_REDUCED; + + default_skip_mask(mask, ref_set); + + int min_pred_mv_sad = INT_MAX; + MV_REFERENCE_FRAME ref_frame; + if (ref_set == REF_SET_REALTIME) { + // For real-time encoding, we only look at a subset of ref frames. So the + // threshold for pruning should be computed from this subset as well. 
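+    // With real_time_ref_combos as defined above, only LAST_FRAME,
+    // GOLDEN_FRAME and ALTREF_FRAME contribute to min_pred_mv_sad; the
+    // INTRA_FRAME entry is skipped by the check below.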
+ const int num_rt_refs = + sizeof(real_time_ref_combos) / sizeof(*real_time_ref_combos); + for (int r_idx = 0; r_idx < num_rt_refs; r_idx++) { + const MV_REFERENCE_FRAME ref = real_time_ref_combos[r_idx][0]; + if (ref != INTRA_FRAME) { + min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref]); + } + } + } else { + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) + min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]); + } + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame])) { + // Skip checking missing reference in both single and compound reference + // modes. + disable_reference(ref_frame, mask->ref_combo); + } else { + // Skip fixed mv modes for poor references + if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) { + mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO; + } + } + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + // Reference not used for the segment. + disable_reference(ref_frame, mask->ref_combo); + } + } + // Note: We use the following drop-out only if the SEG_LVL_REF_FRAME feature + // is disabled for this segment. This is to prevent the possibility that we + // end up unable to pick any mode. + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative. We allow near/nearest as well + // because they may result in zero-zero MVs but be cheaper. + if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { + disable_inter_references_except_altref(mask->ref_combo); + + mask->pred_modes[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO; + const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME }; + int_mv near_mv, nearest_mv, global_mv; + get_this_mv(&nearest_mv, NEARESTMV, 0, 0, 0, tmp_ref_frames, x->mbmi_ext); + get_this_mv(&near_mv, NEARMV, 0, 0, 0, tmp_ref_frames, x->mbmi_ext); + get_this_mv(&global_mv, GLOBALMV, 0, 0, 0, tmp_ref_frames, x->mbmi_ext); + + if (near_mv.as_int != global_mv.as_int) + mask->pred_modes[ALTREF_FRAME] |= (1 << NEARMV); + if (nearest_mv.as_int != global_mv.as_int) + mask->pred_modes[ALTREF_FRAME] |= (1 << NEARESTMV); + } + } + + if (cpi->rc.is_src_frame_alt_ref) { + if (sf->inter_sf.alt_ref_search_fp) { + assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]); + mask->pred_modes[ALTREF_FRAME] = 0; + disable_inter_references_except_altref(mask->ref_combo); + disable_reference(INTRA_FRAME, mask->ref_combo); + } + } + + if (sf->inter_sf.alt_ref_search_fp) { + if (!cm->show_frame && x->best_pred_mv_sad < INT_MAX) { + int sad_thresh = x->best_pred_mv_sad + (x->best_pred_mv_sad >> 3); + // Conservatively skip the modes w.r.t. 
BWDREF, ALTREF2 and ALTREF, if + // those are past frames + for (ref_frame = BWDREF_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (cpi->ref_relative_dist[ref_frame - LAST_FRAME] < 0) + if (x->pred_mv_sad[ref_frame] > sad_thresh) + mask->pred_modes[ref_frame] |= INTER_ALL; + } + } + } + + if (sf->inter_sf.adaptive_mode_search) { + if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref && + cpi->rc.frames_since_golden >= 3) + if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME]) + mask->pred_modes[GOLDEN_FRAME] |= INTER_ALL; + } + + if (bsize > sf->part_sf.max_intra_bsize) { + disable_reference(INTRA_FRAME, mask->ref_combo); + } + + mask->pred_modes[INTRA_FRAME] |= + ~(sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]]); +} + +static AOM_INLINE void init_pred_buf(const MACROBLOCK *const x, + HandleInterModeArgs *const args) { + const MACROBLOCKD *const xd = &x->e_mbd; + if (is_cur_buf_hbd(xd)) { + const int len = sizeof(uint16_t); + args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf); + args->above_pred_buf[1] = + CONVERT_TO_BYTEPTR(x->above_pred_buf + (MAX_SB_SQUARE >> 1) * len); + args->above_pred_buf[2] = + CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len); + args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf); + args->left_pred_buf[1] = + CONVERT_TO_BYTEPTR(x->left_pred_buf + (MAX_SB_SQUARE >> 1) * len); + args->left_pred_buf[2] = + CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len); + } else { + args->above_pred_buf[0] = x->above_pred_buf; + args->above_pred_buf[1] = x->above_pred_buf + (MAX_SB_SQUARE >> 1); + args->above_pred_buf[2] = x->above_pred_buf + MAX_SB_SQUARE; + args->left_pred_buf[0] = x->left_pred_buf; + args->left_pred_buf[1] = x->left_pred_buf + (MAX_SB_SQUARE >> 1); + args->left_pred_buf[2] = x->left_pred_buf + MAX_SB_SQUARE; + } +} + +// Please add/modify parameter setting in this function, making it consistent +// and easy to read and maintain. 
+static AOM_INLINE void set_params_rd_pick_inter_mode( + const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args, + BLOCK_SIZE bsize, mode_skip_mask_t *mode_skip_mask, int skip_ref_frame_mask, + unsigned int *ref_costs_single, unsigned int (*ref_costs_comp)[REF_FRAMES], + struct buf_2d (*yv12_mb)[MAX_MB_PLANE]) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + unsigned char segment_id = mbmi->segment_id; + + init_pred_buf(x, args); + av1_collect_neighbors_ref_counts(xd); + estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single, + ref_costs_comp); + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + MV_REFERENCE_FRAME ref_frame; + x->best_pred_mv_sad = INT_MAX; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + x->pred_mv_sad[ref_frame] = INT_MAX; + x->mbmi_ext->mode_context[ref_frame] = 0; + mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { + if (mbmi->partition != PARTITION_NONE && + mbmi->partition != PARTITION_SPLIT) { + if (skip_ref_frame_mask & (1 << ref_frame)) { + int skip = 1; + for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) { + if (!(skip_ref_frame_mask & (1 << r))) { + const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES]; + if (rf[0] == ref_frame || rf[1] == ref_frame) { + skip = 0; + break; + } + } + } + if (skip) continue; + } + } + assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL); + setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, yv12_mb); + } + // Store the best pred_mv_sad across all past frames + if (cpi->sf.inter_sf.alt_ref_search_fp && + cpi->ref_relative_dist[ref_frame - LAST_FRAME] < 0) + x->best_pred_mv_sad = + AOMMIN(x->best_pred_mv_sad, x->pred_mv_sad[ref_frame]); + } + // ref_frame = ALTREF_FRAME + if (!cpi->sf.rt_sf.use_real_time_ref_set) { + // No second reference on RT ref set, so no need to initialize + for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) { + x->mbmi_ext->mode_context[ref_frame] = 0; + mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; + const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES]; + if (!((cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) && + (cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]]))) { + continue; + } + + if (mbmi->partition != PARTITION_NONE && + mbmi->partition != PARTITION_SPLIT) { + if (skip_ref_frame_mask & (1 << ref_frame)) { + continue; + } + } + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. 
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); + } + } + + av1_count_overlappable_neighbors(cm, xd); + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const int prune_obmc = cpi->frame_probs.obmc_probs[update_type][bsize] < + cpi->sf.inter_sf.prune_obmc_prob_thresh; + if (cpi->oxcf.enable_obmc && !cpi->sf.inter_sf.disable_obmc && !prune_obmc) { + if (check_num_overlappable_neighbors(mbmi) && + is_motion_variation_allowed_bsize(bsize)) { + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + av1_build_prediction_by_above_preds(cm, xd, args->above_pred_buf, + dst_width1, dst_height1, + args->above_pred_stride); + av1_build_prediction_by_left_preds(cm, xd, args->left_pred_buf, + dst_width2, dst_height2, + args->left_pred_stride); + const int num_planes = av1_num_planes(cm); + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, + mi_col, 0, num_planes); + calc_target_weighted_pred( + cm, x, xd, args->above_pred_buf[0], args->above_pred_stride[0], + args->left_pred_buf[0], args->left_pred_stride[0]); + } + } + + init_mode_skip_mask(mode_skip_mask, cpi, x, bsize); + + // Set params for mode evaluation + set_mode_eval_params(cpi, x, MODE_EVAL); + + x->comp_rd_stats_idx = 0; +} + +static AOM_INLINE void init_intra_mode_search_state( + IntraModeSearchState *intra_search_state) { + intra_search_state->skip_intra_modes = 0; + intra_search_state->best_intra_mode = DC_PRED; + intra_search_state->angle_stats_ready = 0; + av1_zero(intra_search_state->directional_mode_skip_mask); + intra_search_state->rate_uv_intra = INT_MAX; + av1_zero(intra_search_state->pmi_uv); + for (int i = 0; i < REFERENCE_MODES; ++i) + intra_search_state->best_pred_rd[i] = INT64_MAX; +} + +static AOM_INLINE void init_inter_mode_search_state( + InterModeSearchState *search_state, const AV1_COMP *cpi, + const MACROBLOCK *x, BLOCK_SIZE bsize, int64_t best_rd_so_far) { + init_intra_mode_search_state(&search_state->intra_search_state); + + search_state->best_rd = best_rd_so_far; + search_state->best_skip_rd[0] = INT64_MAX; + search_state->best_skip_rd[1] = INT64_MAX; + + av1_zero(search_state->best_mbmode); + + search_state->best_rate_y = INT_MAX; + + search_state->best_rate_uv = INT_MAX; + + search_state->best_mode_skippable = 0; + + search_state->best_skip2 = 0; + + search_state->best_mode_index = THR_INVALID; + + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const unsigned char segment_id = mbmi->segment_id; + + search_state->num_available_refs = 0; + memset(search_state->dist_refs, -1, sizeof(search_state->dist_refs)); + memset(search_state->dist_order_refs, -1, + sizeof(search_state->dist_order_refs)); + + for (int i = 0; i <= LAST_NEW_MV_INDEX; ++i) + search_state->mode_threshold[i] = 0; + const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize]; + for (int i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i) + search_state->mode_threshold[i] = + ((int64_t)rd_threshes[i] * x->thresh_freq_fact[bsize][i]) >> + RD_THRESH_FAC_FRAC_BITS; + + search_state->best_intra_rd = INT64_MAX; + + search_state->best_pred_sse = UINT_MAX; + + av1_zero(search_state->single_newmv); + av1_zero(search_state->single_newmv_rate); + 
av1_zero(search_state->single_newmv_valid); + for (int i = 0; i < MB_MODE_COUNT; ++i) { + for (int j = 0; j < MAX_REF_MV_SEARCH; ++j) { + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + search_state->modelled_rd[i][j][ref_frame] = INT64_MAX; + search_state->simple_rd[i][j][ref_frame] = INT64_MAX; + } + } + } + + for (int dir = 0; dir < 2; ++dir) { + for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) { + SingleInterModeState *state; + + state = &search_state->single_state[dir][mode][ref_frame]; + state->ref_frame = NONE_FRAME; + state->rd = INT64_MAX; + + state = &search_state->single_state_modelled[dir][mode][ref_frame]; + state->ref_frame = NONE_FRAME; + state->rd = INT64_MAX; + } + } + } + for (int dir = 0; dir < 2; ++dir) { + for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) { + search_state->single_rd_order[dir][mode][ref_frame] = NONE_FRAME; + } + } + } + av1_zero(search_state->single_state_cnt); + av1_zero(search_state->single_state_modelled_cnt); +} + +static bool mask_says_skip(const mode_skip_mask_t *mode_skip_mask, + const MV_REFERENCE_FRAME *ref_frame, + const PREDICTION_MODE this_mode) { + if (mode_skip_mask->pred_modes[ref_frame[0]] & (1 << this_mode)) { + return true; + } + + return mode_skip_mask->ref_combo[ref_frame[0]][ref_frame[1] + 1]; +} + +static int inter_mode_compatible_skip(const AV1_COMP *cpi, const MACROBLOCK *x, + BLOCK_SIZE bsize, + PREDICTION_MODE curr_mode, + const MV_REFERENCE_FRAME *ref_frames) { + const int comp_pred = ref_frames[1] > INTRA_FRAME; + if (comp_pred) { + if (!is_comp_ref_allowed(bsize)) return 1; + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frames[1]])) { + return 1; + } + + const AV1_COMMON *const cm = &cpi->common; + if (frame_is_intra_only(cm)) return 1; + + const CurrentFrame *const current_frame = &cm->current_frame; + if (current_frame->reference_mode == SINGLE_REFERENCE) return 1; + + const struct segmentation *const seg = &cm->seg; + const unsigned char segment_id = x->e_mbd.mi[0]->segment_id; + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. 
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1;
+  }
+
+  if (ref_frames[0] > INTRA_FRAME && ref_frames[1] == INTRA_FRAME) {
+    // Mode must be compatible with interintra prediction.
+    if (!is_interintra_allowed_bsize(bsize)) return 1;
+    if (!is_interintra_allowed_mode(curr_mode)) return 1;
+  }
+
+  return 0;
+}
+
+static int fetch_picked_ref_frames_mask(const MACROBLOCK *const x,
+                                        BLOCK_SIZE bsize, int mib_size) {
+  const int sb_size_mask = mib_size - 1;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int mi_row_in_sb = mi_row & sb_size_mask;
+  const int mi_col_in_sb = mi_col & sb_size_mask;
+  const int mi_w = mi_size_wide[bsize];
+  const int mi_h = mi_size_high[bsize];
+  int picked_ref_frames_mask = 0;
+  for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_h; ++i) {
+    for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_w; ++j) {
+      picked_ref_frames_mask |= x->picked_ref_frames_mask[i * 32 + j];
+    }
+  }
+  return picked_ref_frames_mask;
+}
+
+// Case 1: return 0, meaning do not skip this mode.
+// Case 2: return 1, meaning skip this mode completely.
+// Case 3: return 2, meaning skip compound only, but still try single motion
+// modes.
+static int inter_mode_search_order_independent_skip(
+    const AV1_COMP *cpi, const MACROBLOCK *x, mode_skip_mask_t *mode_skip_mask,
+    InterModeSearchState *search_state, int skip_ref_frame_mask,
+    PREDICTION_MODE mode, const MV_REFERENCE_FRAME *ref_frame) {
+  if (mask_says_skip(mode_skip_mask, ref_frame, mode)) {
+    return 1;
+  }
+
+  const int ref_type = av1_ref_frame_type(ref_frame);
+  if ((cpi->prune_ref_frame_mask >> ref_type) & 1) return 1;
+
+  // This is only used in motion vector unit test.
+  if (cpi->oxcf.motion_vector_unit_test && ref_frame[0] == INTRA_FRAME)
+    return 1;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  if (skip_repeated_mv(cm, x, mode, ref_frame, search_state)) {
+    return 1;
+  }
+
+  const int comp_pred = ref_frame[1] > INTRA_FRAME;
+  if ((!cpi->oxcf.enable_onesided_comp ||
+       cpi->sf.inter_sf.disable_onesided_comp) &&
+      comp_pred && cpi->all_one_sided_refs) {
+    return 1;
+  }
+
+  const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+  // If no valid mode has been found so far in PARTITION_NONE when finding a
+  // valid partition is required, do not skip mode.
+  if (search_state->best_rd == INT64_MAX &&
+      mbmi->partition == PARTITION_NONE && x->must_find_valid_partition)
+    return 0;
+
+  int skip_motion_mode = 0;
+  if (mbmi->partition != PARTITION_NONE &&
+      mbmi->partition != PARTITION_SPLIT) {
+    int skip_ref = skip_ref_frame_mask & (1 << ref_type);
+    if (ref_type <= ALTREF_FRAME && skip_ref) {
+      // Since the compound ref modes depend on the motion estimation results
+      // of two single ref modes (the best MV of a single ref mode is used as
+      // the start point), if the current single ref mode is marked skip, we
+      // need to check whether it will be used in compound ref modes.
+      for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+        if (skip_ref_frame_mask & (1 << r)) continue;
+        const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+        if (rf[0] == ref_type || rf[1] == ref_type) {
+          // Found a compound ref mode that is not skipped and contains the
+          // current single ref, so this single ref cannot be skipped
+          // completely. Just skip its motion mode search and still try its
+          // simple translation mode.
+ skip_motion_mode = 1; + skip_ref = 0; + break; + } + } + } + if (skip_ref) return 1; + } + + const SPEED_FEATURES *const sf = &cpi->sf; + if (ref_frame[0] == INTRA_FRAME) { + if (mode != DC_PRED) { + // Disable intra modes other than DC_PRED for blocks with low variance + // Threshold for intra skipping based on source variance + // TODO(debargha): Specialize the threshold for super block sizes + const unsigned int skip_intra_var_thresh = 64; + if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && + x->source_variance < skip_intra_var_thresh) + return 1; + } + } + + if (prune_ref_by_selective_ref_frame(cpi, x, ref_frame, + cm->cur_frame->ref_display_order_hint)) + return 1; + + if (skip_motion_mode) return 2; + + return 0; +} + +static INLINE void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE curr_mode, + const MV_REFERENCE_FRAME *ref_frames, + const AV1_COMMON *cm) { + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + mbmi->ref_mv_idx = 0; + mbmi->mode = curr_mode; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = ref_frames[0]; + mbmi->ref_frame[1] = ref_frames[1]; + pmi->palette_size[0] = 0; + pmi->palette_size[1] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); + set_default_interp_filters(mbmi, cm->features.interp_filter); +} + +static AOM_INLINE void collect_single_states(MACROBLOCK *x, + InterModeSearchState *search_state, + const MB_MODE_INFO *const mbmi) { + int i, j; + const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + const int dir = ref_frame <= GOLDEN_FRAME ? 0 : 1; + const int mode_offset = INTER_OFFSET(this_mode); + const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode); + + // Simple rd + int64_t simple_rd = search_state->simple_rd[this_mode][0][ref_frame]; + for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) { + const int64_t rd = + search_state->simple_rd[this_mode][ref_mv_idx][ref_frame]; + if (rd < simple_rd) simple_rd = rd; + } + + // Insertion sort of single_state + const SingleInterModeState this_state_s = { simple_rd, ref_frame, 1 }; + SingleInterModeState *state_s = search_state->single_state[dir][mode_offset]; + i = search_state->single_state_cnt[dir][mode_offset]; + for (j = i; j > 0 && state_s[j - 1].rd > this_state_s.rd; --j) + state_s[j] = state_s[j - 1]; + state_s[j] = this_state_s; + search_state->single_state_cnt[dir][mode_offset]++; + + // Modelled rd + int64_t modelled_rd = search_state->modelled_rd[this_mode][0][ref_frame]; + for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) { + const int64_t rd = + search_state->modelled_rd[this_mode][ref_mv_idx][ref_frame]; + if (rd < modelled_rd) modelled_rd = rd; + } + + // Insertion sort of single_state_modelled + const SingleInterModeState this_state_m = { modelled_rd, ref_frame, 1 }; + SingleInterModeState *state_m = + search_state->single_state_modelled[dir][mode_offset]; + i = search_state->single_state_modelled_cnt[dir][mode_offset]; + for (j = i; j > 0 && state_m[j - 1].rd > this_state_m.rd; --j) + state_m[j] = state_m[j - 1]; + state_m[j] = this_state_m; + search_state->single_state_modelled_cnt[dir][mode_offset]++; +} + +static AOM_INLINE void analyze_single_states( + const AV1_COMP *cpi, InterModeSearchState *search_state) { + const int prune_level = cpi->sf.inter_sf.prune_comp_search_by_single_result; + assert(prune_level >= 1); + 
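+  // A sketch of the pruning arithmetic used below: a state is invalidated
+  // when (rd >> 3) * prune_factor > best_rd, i.e. roughly when
+  // rd > best_rd * 8 / prune_factor. prune_level 1 uses factor 5 (~1.6x the
+  // best rd); prune_level >= 2 uses factor 6 (~1.33x).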
+  int i, j, dir, mode;
+
+  for (dir = 0; dir < 2; ++dir) {
+    int64_t best_rd;
+    SingleInterModeState(*state)[FWD_REFS];
+    const int prune_factor = prune_level >= 2 ? 6 : 5;
+
+    // Use the best rd of GLOBALMV or NEWMV to prune the unlikely
+    // reference frames for all the modes (NEARESTMV and NEARMV may not
+    // have the same motion vectors). Always keep the best of each mode
+    // because it might form the best possible combination with another mode.
+    state = search_state->single_state[dir];
+    best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+                     state[INTER_OFFSET(GLOBALMV)][0].rd);
+    for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+      for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) {
+        if (state[mode][i].rd != INT64_MAX &&
+            (state[mode][i].rd >> 3) * prune_factor > best_rd) {
+          state[mode][i].valid = 0;
+        }
+      }
+    }
+
+    state = search_state->single_state_modelled[dir];
+    best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+                     state[INTER_OFFSET(GLOBALMV)][0].rd);
+    for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+      for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode];
+           ++i) {
+        if (state[mode][i].rd != INT64_MAX &&
+            (state[mode][i].rd >> 3) * prune_factor > best_rd) {
+          state[mode][i].valid = 0;
+        }
+      }
+    }
+  }
+
+  // Order by simple rd first, then by modelled rd.
+  for (dir = 0; dir < 2; ++dir) {
+    for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+      const int state_cnt_s = search_state->single_state_cnt[dir][mode];
+      const int state_cnt_m =
+          search_state->single_state_modelled_cnt[dir][mode];
+      SingleInterModeState *state_s = search_state->single_state[dir][mode];
+      SingleInterModeState *state_m =
+          search_state->single_state_modelled[dir][mode];
+      int count = 0;
+      const int max_candidates = AOMMAX(state_cnt_s, state_cnt_m);
+      for (i = 0; i < state_cnt_s; ++i) {
+        if (state_s[i].rd == INT64_MAX) break;
+        if (state_s[i].valid) {
+          search_state->single_rd_order[dir][mode][count++] =
+              state_s[i].ref_frame;
+        }
+      }
+      if (count >= max_candidates) continue;
+
+      for (i = 0; i < state_cnt_m && count < max_candidates; ++i) {
+        if (state_m[i].rd == INT64_MAX) break;
+        if (!state_m[i].valid) continue;
+        const int ref_frame = state_m[i].ref_frame;
+        int match = 0;
+        // Check whether this ref_frame is already in the list.
+        for (j = 0; j < count; ++j) {
+          if (search_state->single_rd_order[dir][mode][j] == ref_frame) {
+            match = 1;
+            break;
+          }
+        }
+        if (match) continue;
+        // Check whether this ref_frame was invalidated in the simple rd pass.
+        int valid = 1;
+        for (j = 0; j < state_cnt_s; ++j) {
+          if (ref_frame == state_s[j].ref_frame) {
+            valid = state_s[j].valid;
+            break;
+          }
+        }
+        if (valid) {
+          search_state->single_rd_order[dir][mode][count++] = ref_frame;
+        }
+      }
+    }
+  }
+}
+
+static int compound_skip_get_candidates(
+    const AV1_COMP *cpi, const InterModeSearchState *search_state,
+    const int dir, const PREDICTION_MODE mode) {
+  const int mode_offset = INTER_OFFSET(mode);
+  const SingleInterModeState *state =
+      search_state->single_state[dir][mode_offset];
+  const SingleInterModeState *state_modelled =
+      search_state->single_state_modelled[dir][mode_offset];
+
+  int max_candidates = 0;
+  for (int i = 0; i < FWD_REFS; ++i) {
+    if (search_state->single_rd_order[dir][mode_offset][i] == NONE_FRAME)
+      break;
+    max_candidates++;
+  }
+
+  int candidates = max_candidates;
+  if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 2) {
+    candidates = AOMMIN(2, max_candidates);
+  }
+  if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 3) {
+    if (state[0].rd != INT64_MAX && state_modelled[0].rd != INT64_MAX &&
+        state[0].ref_frame == state_modelled[0].ref_frame)
+      candidates = 1;
+    if (mode == NEARMV || mode == GLOBALMV) candidates = 1;
+  }
+
+  if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 4) {
+    // Limit the number of candidates to 1 in each direction for compound
+    // prediction.
+    candidates = AOMMIN(1, candidates);
+  }
+  return candidates;
+}
+
+static int compound_skip_by_single_states(
+    const AV1_COMP *cpi, const InterModeSearchState *search_state,
+    const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame,
+    const MV_REFERENCE_FRAME second_ref_frame, const MACROBLOCK *x) {
+  const MV_REFERENCE_FRAME refs[2] = { ref_frame, second_ref_frame };
+  const int mode[2] = { compound_ref0_mode(this_mode),
+                        compound_ref1_mode(this_mode) };
+  const int mode_offset[2] = { INTER_OFFSET(mode[0]), INTER_OFFSET(mode[1]) };
+  const int mode_dir[2] = { refs[0] <= GOLDEN_FRAME ? 0 : 1,
+                            refs[1] <= GOLDEN_FRAME ? 0 : 1 };
+  int ref_searched[2] = { 0, 0 };
+  int ref_mv_match[2] = { 1, 1 };
+  int i, j;
+
+  for (i = 0; i < 2; ++i) {
+    const SingleInterModeState *state =
+        search_state->single_state[mode_dir[i]][mode_offset[i]];
+    const int state_cnt =
+        search_state->single_state_cnt[mode_dir[i]][mode_offset[i]];
+    for (j = 0; j < state_cnt; ++j) {
+      if (state[j].ref_frame == refs[i]) {
+        ref_searched[i] = 1;
+        break;
+      }
+    }
+  }
+
+  const int ref_set = get_drl_refmv_count(x, refs, this_mode);
+  for (i = 0; i < 2; ++i) {
+    if (!ref_searched[i] || (mode[i] != NEARESTMV && mode[i] != NEARMV)) {
+      continue;
+    }
+    const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME };
+    for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) {
+      int_mv single_mv;
+      int_mv comp_mv;
+      get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, 0, single_refs,
+                  x->mbmi_ext);
+      get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, 0, refs, x->mbmi_ext);
+      if (single_mv.as_int != comp_mv.as_int) {
+        ref_mv_match[i] = 0;
+        break;
+      }
+    }
+  }
+
+  for (i = 0; i < 2; ++i) {
+    if (!ref_searched[i] || !ref_mv_match[i]) continue;
+    const int candidates =
+        compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]);
+    const MV_REFERENCE_FRAME *ref_order =
+        search_state->single_rd_order[mode_dir[i]][mode_offset[i]];
+    int match = 0;
+    for (j = 0; j < candidates; ++j) {
+      if (refs[i] == ref_order[j]) {
+        match = 1;
+        break;
+      }
+    }
+    if (!match) return 1;
+  }
+
+  return 0;
+}
+
+// Check whether the ref frames of the current block match those of the given
+// block.
+static INLINE void match_ref_frame(const MB_MODE_INFO *const mbmi,
+                                   const MV_REFERENCE_FRAME *ref_frames,
+                                   int *const is_ref_match) {
+  if (is_inter_block(mbmi)) {
+    is_ref_match[0] |= ref_frames[0] == mbmi->ref_frame[0];
+    is_ref_match[1] |= ref_frames[1] == mbmi->ref_frame[0];
+    if (has_second_ref(mbmi)) {
+      is_ref_match[0] |= ref_frames[0] == mbmi->ref_frame[1];
+      is_ref_match[1] |= ref_frames[1] == mbmi->ref_frame[1];
+    }
+  }
+}
+
+// Prune compound mode using ref frames of neighbor blocks.
+static INLINE int compound_skip_using_neighbor_refs(
+    MACROBLOCKD *const xd, const PREDICTION_MODE this_mode,
+    const MV_REFERENCE_FRAME *ref_frames, int prune_compound_using_neighbors) {
+  // Exclude non-extended compound modes from pruning.
+  if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
+      this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV)
+    return 0;
+
+  int is_ref_match[2] = { 0 };  // 0 - match for forward refs
+                                // 1 - match for backward refs
+  // Check whether the ref frames of this block match the left neighbor's.
+  if (xd->left_available)
+    match_ref_frame(xd->left_mbmi, ref_frames, is_ref_match);
+
+  // Check whether the ref frames of this block match the above neighbor's.
+  if (xd->up_available)
+    match_ref_frame(xd->above_mbmi, ref_frames, is_ref_match);
+
+  // Combine ref frame match with neighbors in forward and backward refs.
+  const int track_ref_match = is_ref_match[0] + is_ref_match[1];
+
+  // Pruning based on ref frame match with neighbors.
+  if (track_ref_match >= prune_compound_using_neighbors) return 0;
+  return 1;
+}
+
+static int compare_int64(const void *a, const void *b) {
+  int64_t a64 = *((int64_t *)a);
+  int64_t b64 = *((int64_t *)b);
+  if (a64 < b64) {
+    return -1;
+  } else if (a64 == b64) {
+    return 0;
+  } else {
+    return 1;
+  }
+}
+
+static INLINE void update_search_state(
+    InterModeSearchState *search_state, RD_STATS *best_rd_stats_dst,
+    PICK_MODE_CONTEXT *ctx, const RD_STATS *new_best_rd_stats,
+    const RD_STATS *new_best_rd_stats_y, const RD_STATS *new_best_rd_stats_uv,
+    THR_MODES new_best_mode, const MACROBLOCK *x, int txfm_search_done) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const int skip_ctx = av1_get_skip_context(xd);
+  const int mode_is_intra =
+      (av1_mode_defs[new_best_mode].mode < INTRA_MODE_END);
+  const int skip = mbmi->skip && !mode_is_intra;
+
+  search_state->best_rd = new_best_rd_stats->rdcost;
+  search_state->best_mode_index = new_best_mode;
+  *best_rd_stats_dst = *new_best_rd_stats;
+  search_state->best_mbmode = *mbmi;
+  search_state->best_skip2 = skip;
+  search_state->best_mode_skippable = new_best_rd_stats->skip;
+  // When !txfm_search_done, new_best_rd_stats won't provide correct rate_y
+  // and rate_uv because the av1_txfm_search process is replaced by rd
+  // estimation. Therefore, we should avoid updating best_rate_y and
+  // best_rate_uv here. These two values will be updated when av1_txfm_search
+  // is called.
+  if (txfm_search_done) {
+    search_state->best_rate_y =
+        new_best_rd_stats_y->rate +
+        x->skip_cost[skip_ctx][new_best_rd_stats->skip || skip];
+    search_state->best_rate_uv = new_best_rd_stats_uv->rate;
+  }
+  memcpy(ctx->blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+  av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+}
+
+// Find the best RD for a reference frame (among single reference modes)
+// and store +10% of it in the 0-th element in ref_frame_rd.
+static AOM_INLINE void find_top_ref(int64_t ref_frame_rd[REF_FRAMES]) {
+  assert(ref_frame_rd[0] == INT64_MAX);
+  int64_t ref_copy[REF_FRAMES - 1];
+  memcpy(ref_copy, ref_frame_rd + 1,
+         sizeof(ref_frame_rd[0]) * (REF_FRAMES - 1));
+  qsort(ref_copy, REF_FRAMES - 1, sizeof(int64_t), compare_int64);
+
+  int64_t cutoff = ref_copy[0];
+  // The cut-off is within 10% of the best.
+  if (cutoff != INT64_MAX) {
+    assert(cutoff < INT64_MAX / 200);
+    cutoff = (110 * cutoff) / 100;
+  }
+  ref_frame_rd[0] = cutoff;
+}
+
+// Check if either frame is within the cutoff.
+static INLINE bool in_single_ref_cutoff(int64_t ref_frame_rd[REF_FRAMES], + MV_REFERENCE_FRAME frame1, + MV_REFERENCE_FRAME frame2) { + assert(frame2 > 0); + return ref_frame_rd[frame1] <= ref_frame_rd[0] || + ref_frame_rd[frame2] <= ref_frame_rd[0]; +} + +static AOM_INLINE void evaluate_motion_mode_for_winner_candidates( + const AV1_COMP *const cpi, MACROBLOCK *const x, RD_STATS *const rd_cost, + HandleInterModeArgs *const args, TileDataEnc *const tile_data, + PICK_MODE_CONTEXT *const ctx, + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], + const motion_mode_best_st_candidate *const best_motion_mode_cands, + int do_tx_search, const BLOCK_SIZE bsize, int64_t *const best_est_rd, + InterModeSearchState *const search_state) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + InterModesInfo *const inter_modes_info = x->inter_modes_info; + const int num_best_cand = best_motion_mode_cands->num_motion_mode_cand; + + for (int cand = 0; cand < num_best_cand; cand++) { + RD_STATS rd_stats; + RD_STATS rd_stats_y; + RD_STATS rd_stats_uv; + av1_init_rd_stats(&rd_stats); + av1_init_rd_stats(&rd_stats_y); + av1_init_rd_stats(&rd_stats_uv); + int disable_skip = 0, rate_mv; + + rate_mv = best_motion_mode_cands->motion_mode_cand[cand].rate_mv; + args->skip_motion_mode = + best_motion_mode_cands->motion_mode_cand[cand].skip_motion_mode; + *mbmi = best_motion_mode_cands->motion_mode_cand[cand].mbmi; + rd_stats.rate = + best_motion_mode_cands->motion_mode_cand[cand].rate2_nocoeff; + + // Continue if the best candidate is compound. + if (!is_inter_singleref_mode(mbmi->mode)) continue; + + x->force_skip = 0; + const int mode_index = get_prediction_mode_idx( + mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + struct macroblockd_plane *p = xd->plane; + const BUFFER_SET orig_dst = { + { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, + { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, + }; + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + args->simple_rd_state = x->simple_rd_state[mode_index]; + // Initialize motion mode to simple translation + // Calculation of switchable rate depends on it. 
+    mbmi->motion_mode = SIMPLE_TRANSLATION;
+    const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+    for (int i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+      if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+    }
+
+    int64_t skip_rd[2] = { search_state->best_skip_rd[0],
+                           search_state->best_skip_rd[1] };
+    int64_t ret_value = motion_mode_rd(
+        cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
+        &disable_skip, args, search_state->best_rd, skip_rd, &rate_mv,
+        &orig_dst, best_est_rd, do_tx_search, inter_modes_info, 1);
+
+    if (ret_value != INT64_MAX) {
+      rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+      const THR_MODES mode_enum = get_prediction_mode_idx(
+          mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+      // Collect mode stats for multiwinner mode processing.
+      store_winner_mode_stats(
+          &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv,
+          mode_enum, NULL, bsize, rd_stats.rdcost,
+          cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
+          do_tx_search);
+      if (rd_stats.rdcost < search_state->best_rd) {
+        update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
+                            &rd_stats_uv, mode_enum, x, do_tx_search);
+        if (do_tx_search) search_state->best_skip_rd[0] = skip_rd[0];
+      }
+    }
+  }
+}
+
+// Arguments for speed feature pruning of inter mode search
+typedef struct {
+  int *skip_motion_mode;
+  mode_skip_mask_t *mode_skip_mask;
+  InterModeSearchState *search_state;
+  int skip_ref_frame_mask;
+  int reach_first_comp_mode;
+  int mode_thresh_mul_fact;
+  int *intra_mode_idx_ls;
+  int *intra_mode_num;
+  int prune_cpd_using_sr_stats_ready;
+} InterModeSFArgs;
+
+static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x,
+                           const BLOCK_SIZE bsize, int64_t *ref_frame_rd,
+                           int midx, InterModeSFArgs *args) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  // Get the actual prediction mode we are trying in this iteration.
+  const THR_MODES mode_enum = av1_default_mode_order[midx];
+  const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
+  const PREDICTION_MODE this_mode = mode_def->mode;
+  const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame;
+  const MV_REFERENCE_FRAME ref_frame = ref_frames[0];
+  const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1];
+  const int comp_pred = second_ref_frame > INTRA_FRAME;
+  const int last_single_ref_mode_idx =
+      find_last_single_ref_mode_idx(av1_default_mode_order);
+
+  // After we are done with the single reference modes, find the best
+  // single-ref RD and widen it by 10%. Only search compound modes that have
+  // a reference frame within that cutoff.
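+  // find_top_ref() places min(single-ref rd) * 110 / 100 in ref_frame_rd[0],
+  // so a compound pair survives only if at least one of its references has a
+  // single-ref rd within ~10% of the best (see in_single_ref_cutoff()).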
+  if (sf->inter_sf.prune_compound_using_single_ref &&
+      midx == last_single_ref_mode_idx + 1) {
+    find_top_ref(ref_frame_rd);
+    args->prune_cpd_using_sr_stats_ready = 1;
+  }
+
+  // Check if this mode should be skipped because it is incompatible with the
+  // current frame.
+  if (inter_mode_compatible_skip(cpi, x, bsize, this_mode, ref_frames))
+    return 1;
+  const int ret = inter_mode_search_order_independent_skip(
+      cpi, x, args->mode_skip_mask, args->search_state,
+      args->skip_ref_frame_mask, this_mode, mode_def->ref_frame);
+  if (ret == 1) return 1;
+  *(args->skip_motion_mode) = (ret == 2);
+
+  // We've reached the first compound prediction mode; get stats from the
+  // single reference predictors to help with pruning.
+  if (sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred &&
+      args->reach_first_comp_mode == 0) {
+    analyze_single_states(cpi, args->search_state);
+    args->reach_first_comp_mode = 1;
+  }
+
+  // Prune aggressively when the best mode so far is skippable.
+  int mul_fact = args->search_state->best_mode_skippable
+                     ? args->mode_thresh_mul_fact
+                     : (1 << MODE_THRESH_QBITS);
+  int64_t mode_threshold =
+      (args->search_state->mode_threshold[mode_enum] * mul_fact) >>
+      MODE_THRESH_QBITS;
+
+  if (args->search_state->best_rd < mode_threshold) return 1;
+
+  // Skip this compound mode based on the RD results from the single
+  // prediction modes.
+  if (sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred) {
+    if (compound_skip_by_single_states(cpi, args->search_state, this_mode,
+                                       ref_frame, second_ref_frame, x))
+      return 1;
+  }
+
+  // Speed features to prune out INTRA frames
+  if (ref_frame == INTRA_FRAME) {
+    if ((!cpi->oxcf.enable_smooth_intra ||
+         sf->intra_sf.disable_smooth_intra) &&
+        (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+         mbmi->mode == SMOOTH_V_PRED))
+      return 1;
+    if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) return 1;
+    if (sf->inter_sf.adaptive_mode_search > 1)
+      if ((x->source_variance << num_pels_log2_lookup[bsize]) >
+          args->search_state->best_pred_sse)
+        return 1;
+
+    // Intra modes will be handled in another loop later.
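+    // (They are queued in intra_mode_idx_ls[] here and evaluated after the
+    // main mode loop in av1_rd_pick_inter_mode_sb().)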
+ assert(*args->intra_mode_num < INTRA_MODES); + args->intra_mode_idx_ls[(*args->intra_mode_num)++] = mode_enum; + return 1; + } + + if (sf->inter_sf.prune_compound_using_single_ref && + args->prune_cpd_using_sr_stats_ready && comp_pred && + !in_single_ref_cutoff(ref_frame_rd, ref_frame, second_ref_frame)) { + return 1; + } + + if (sf->inter_sf.prune_compound_using_neighbors && comp_pred) { + if (compound_skip_using_neighbor_refs( + xd, this_mode, ref_frames, + sf->inter_sf.prune_compound_using_neighbors)) + return 1; + } + + return 0; +} + +static void record_best_compound(REFERENCE_MODE reference_mode, + RD_STATS *rd_stats, int comp_pred, int rdmult, + InterModeSearchState *search_state, + int compmode_cost) { + int64_t single_rd, hybrid_rd, single_rate, hybrid_rate; + + if (reference_mode == REFERENCE_MODE_SELECT) { + single_rate = rd_stats->rate - compmode_cost; + hybrid_rate = rd_stats->rate; + } else { + single_rate = rd_stats->rate; + hybrid_rate = rd_stats->rate + compmode_cost; + } + + single_rd = RDCOST(rdmult, single_rate, rd_stats->dist); + hybrid_rd = RDCOST(rdmult, hybrid_rate, rd_stats->dist); + + if (!comp_pred) { + if (single_rd < + search_state->intra_search_state.best_pred_rd[SINGLE_REFERENCE]) + search_state->intra_search_state.best_pred_rd[SINGLE_REFERENCE] = + single_rd; + } else { + if (single_rd < + search_state->intra_search_state.best_pred_rd[COMPOUND_REFERENCE]) + search_state->intra_search_state.best_pred_rd[COMPOUND_REFERENCE] = + single_rd; + } + if (hybrid_rd < + search_state->intra_search_state.best_pred_rd[REFERENCE_MODE_SELECT]) + search_state->intra_search_state.best_pred_rd[REFERENCE_MODE_SELECT] = + hybrid_rd; +} + +// Indicates number of winner simple translation modes to be used +static const unsigned int num_winner_motion_modes[3] = { 0, 10, 3 }; + +// Adds a motion mode to the candidate list for motion_mode_for_winner_cand +// speed feature. This list consists of modes that have only searched +// SIMPLE_TRANSLATION. The final list will be used to search other motion +// modes after the initial RD search. 
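+// For example: with max_winner_motion_mode_cand = 3 and a full list of
+// rd_costs { 100, 200, 300 }, a new candidate with this_rd = 150 is inserted
+// at index 1; the memmove() below shifts the 200 entry to index 2,
+// overwriting the 300 entry and leaving { 100, 150, 200 }.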
+static void handle_winner_cand( + MB_MODE_INFO *const mbmi, + motion_mode_best_st_candidate *best_motion_mode_cands, + int max_winner_motion_mode_cand, int64_t this_rd, + motion_mode_candidate *motion_mode_cand, int skip_motion_mode) { + // Number of current motion mode candidates in list + const int num_motion_mode_cand = best_motion_mode_cands->num_motion_mode_cand; + int valid_motion_mode_cand_loc = num_motion_mode_cand; + + // find the best location to insert new motion mode candidate + for (int j = 0; j < num_motion_mode_cand; j++) { + if (this_rd < best_motion_mode_cands->motion_mode_cand[j].rd_cost) { + valid_motion_mode_cand_loc = j; + break; + } + } + + // Insert motion mode if location is found + if (valid_motion_mode_cand_loc < max_winner_motion_mode_cand) { + if (num_motion_mode_cand > 0 && + valid_motion_mode_cand_loc < max_winner_motion_mode_cand - 1) + memmove( + &best_motion_mode_cands + ->motion_mode_cand[valid_motion_mode_cand_loc + 1], + &best_motion_mode_cands->motion_mode_cand[valid_motion_mode_cand_loc], + (AOMMIN(num_motion_mode_cand, max_winner_motion_mode_cand - 1) - + valid_motion_mode_cand_loc) * + sizeof(best_motion_mode_cands->motion_mode_cand[0])); + motion_mode_cand->mbmi = *mbmi; + motion_mode_cand->rd_cost = this_rd; + motion_mode_cand->skip_motion_mode = skip_motion_mode; + best_motion_mode_cands->motion_mode_cand[valid_motion_mode_cand_loc] = + *motion_mode_cand; + best_motion_mode_cands->num_motion_mode_cand = + AOMMIN(max_winner_motion_mode_cand, + best_motion_mode_cands->num_motion_mode_cand + 1); + } +} + +void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, + MACROBLOCK *x, RD_STATS *rd_cost, + const BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { + AV1_COMMON *const cm = &cpi->common; + const FeatureFlags *const features = &cm->features; + const int num_planes = av1_num_planes(cm); + const SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int i; + const int *comp_inter_cost = + x->comp_inter_cost[av1_get_reference_mode_context(xd)]; + + InterModeSearchState search_state; + init_inter_mode_search_state(&search_state, cpi, x, bsize, best_rd_so_far); + INTERINTRA_MODE interintra_modes[REF_FRAMES] = { + INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, + INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES + }; + HandleInterModeArgs args = { { NULL }, + { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, + { NULL }, + { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }, + NULL, + NULL, + NULL, + search_state.modelled_rd, + INT_MAX, + INT_MAX, + search_state.simple_rd, + 0, + interintra_modes, + 1, + NULL, + { { { 0 }, { { 0 } }, { 0 }, 0, 0, 0, 0 } }, + 0 }; + // Indicates the appropriate number of simple translation winner modes for + // exhaustive motion mode evaluation + const int max_winner_motion_mode_cand = + num_winner_motion_modes[cpi->sf.winner_mode_sf + .motion_mode_for_winner_cand]; + assert(max_winner_motion_mode_cand <= MAX_WINNER_MOTION_MODES); + motion_mode_candidate motion_mode_cand; + motion_mode_best_st_candidate best_motion_mode_cands; + // Initializing the number of motion mode candidates to zero. 
+  best_motion_mode_cands.num_motion_mode_cand = 0;
+  for (i = 0; i < MAX_WINNER_MOTION_MODES; ++i)
+    best_motion_mode_cands.motion_mode_cand[i].rd_cost = INT64_MAX;
+
+  for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+
+  av1_invalid_rd_stats(rd_cost);
+
+  // Ref frames that are selected by square partition blocks.
+  int picked_ref_frames_mask = 0;
+  if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions &&
+      mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
+    // prune_ref_frame_for_rect_partitions = 1 implies pruning only extended
+    // partition blocks. prune_ref_frame_for_rect_partitions >= 2
+    // implies pruning for vert, horiz and extended partition blocks.
+    if ((mbmi->partition != PARTITION_VERT &&
+         mbmi->partition != PARTITION_HORZ) ||
+        cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions >= 2) {
+      picked_ref_frames_mask =
+          fetch_picked_ref_frames_mask(x, bsize, cm->seq_params.mib_size);
+    }
+  }
+
+  // Skip ref frames that were never selected by square blocks.
+  const int skip_ref_frame_mask =
+      picked_ref_frames_mask ? ~picked_ref_frames_mask : 0;
+  mode_skip_mask_t mode_skip_mask;
+  unsigned int ref_costs_single[REF_FRAMES];
+  unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+  struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+  // init params, set frame modes, speed features
+  set_params_rd_pick_inter_mode(cpi, x, &args, bsize, &mode_skip_mask,
+                                skip_ref_frame_mask, ref_costs_single,
+                                ref_costs_comp, yv12_mb);
+
+  int64_t best_est_rd = INT64_MAX;
+  const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+  // If do_tx_search is 0, only estimated RD should be computed.
+  // If do_tx_search is 1, all modes have TX search performed.
+  const int do_tx_search =
+      !((cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 && md->ready) ||
+        (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 2 &&
+         num_pels_log2_lookup[bsize] > 8) ||
+        cpi->sf.rt_sf.force_tx_search_off);
+  InterModesInfo *inter_modes_info = x->inter_modes_info;
+  inter_modes_info->num = 0;
+
+  int intra_mode_num = 0;
+  int intra_mode_idx_ls[INTRA_MODES];
+
+  // Temporary buffers used by handle_inter_mode().
+  uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_obmc_bufs[0]);
+
+  // The best RD found for each reference frame, among single reference modes.
+  // Note that the 0-th element will contain a cut-off that is later used
+  // to determine if we should skip a compound mode.
+  int64_t ref_frame_rd[REF_FRAMES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+                                       INT64_MAX, INT64_MAX, INT64_MAX,
+                                       INT64_MAX, INT64_MAX };
+  const int skip_ctx = av1_get_skip_context(xd);
+
+  // Prepared stats used later to check if we could skip intra mode eval.
+  int64_t inter_cost = -1;
+  int64_t intra_cost = -1;
+  // Need to tweak the threshold for hdres speed 0 & 1.
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  // Obtain the relevant tpl stats for pruning inter modes
+  PruneInfoFromTpl inter_cost_info_from_tpl;
+#if !CONFIG_REALTIME_ONLY
+  if (cpi->sf.inter_sf.prune_inter_modes_based_on_tpl) {
+    // x->search_ref_frame[id] = 1 => no pruning in
+    // prune_ref_by_selective_ref_frame()
+    // x->search_ref_frame[id] = 0 => ref frame can be pruned in
+    // prune_ref_by_selective_ref_frame()
+    // Populating valid_refs[idx] = 1 ensures that
+    // 'inter_cost_info_from_tpl.best_inter_cost' does not correspond to a
+    // pruned ref frame.
+ int valid_refs[INTER_REFS_PER_FRAME]; + for (MV_REFERENCE_FRAME frame = LAST_FRAME; frame < REF_FRAMES; frame++) { + const MV_REFERENCE_FRAME refs[2] = { frame, NONE_FRAME }; + valid_refs[frame - 1] = + x->search_ref_frame[frame] || + !prune_ref_by_selective_ref_frame( + cpi, x, refs, cm->cur_frame->ref_display_order_hint); + } + av1_zero(inter_cost_info_from_tpl); + get_block_level_tpl_stats(cpi, bsize, mi_row, mi_col, valid_refs, + &inter_cost_info_from_tpl); + } +#endif + const int do_pruning = + (AOMMIN(cm->width, cm->height) > 480 && cpi->speed <= 1) ? 0 : 1; + if (do_pruning && sf->intra_sf.skip_intra_in_interframe) { + // Only consider full SB. + int len = tpl_blocks_in_sb(cm->seq_params.sb_size); + if (len == x->valid_cost_b) { + const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D); + const int tplw = mi_size_wide[tpl_bsize]; + const int tplh = mi_size_high[tpl_bsize]; + const int nw = mi_size_wide[bsize] / tplw; + const int nh = mi_size_high[bsize] / tplh; + if (nw >= 1 && nh >= 1) { + const int of_h = mi_row % mi_size_high[cm->seq_params.sb_size]; + const int of_w = mi_col % mi_size_wide[cm->seq_params.sb_size]; + const int start = of_h / tplh * x->cost_stride + of_w / tplw; + + for (int k = 0; k < nh; k++) { + for (int l = 0; l < nw; l++) { + inter_cost += x->inter_cost_b[start + k * x->cost_stride + l]; + intra_cost += x->intra_cost_b[start + k * x->cost_stride + l]; + } + } + inter_cost /= nw * nh; + intra_cost /= nw * nh; + } + } + } + + // Initialize best mode stats for winner mode processing + av1_zero(x->winner_mode_stats); + x->winner_mode_count = 0; + store_winner_mode_stats( + &cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, NULL, bsize, + best_rd_so_far, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, + 0); + + int mode_thresh_mul_fact = (1 << MODE_THRESH_QBITS); + if (sf->inter_sf.prune_inter_modes_if_skippable) { + // Higher multiplication factor values for lower quantizers. + mode_thresh_mul_fact = mode_threshold_mul_factor[x->qindex]; + } + + // Initialize arguments for mode loop speed features + InterModeSFArgs sf_args = { &args.skip_motion_mode, + &mode_skip_mask, + &search_state, + skip_ref_frame_mask, + 0, + mode_thresh_mul_fact, + intra_mode_idx_ls, + &intra_mode_num, + 0 }; + + // Here midx is just an iterator index that should not be used by itself + // except to keep track of the number of modes searched. It should be used + // with av1_default_mode_order to get the enum that defines the mode, which + // can be used with av1_mode_defs to get the prediction mode and the ref + // frames. + for (THR_MODES midx = THR_MODE_START; midx < THR_MODE_END; ++midx) { + // Get the actual prediction mode we are trying in this iteration + const THR_MODES mode_enum = av1_default_mode_order[midx]; + const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum]; + const PREDICTION_MODE this_mode = mode_def->mode; + const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame; + + const MV_REFERENCE_FRAME ref_frame = ref_frames[0]; + const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1]; + const int is_single_pred = + ref_frame > INTRA_FRAME && second_ref_frame == NONE_FRAME; + const int comp_pred = second_ref_frame > INTRA_FRAME; + + init_mbmi(mbmi, this_mode, ref_frames, cm); + + x->force_skip = 0; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + + // Apply speed features to decide if this inter mode can be skipped + if (skip_inter_mode(cpi, x, bsize, ref_frame_rd, midx, &sf_args)) continue; + + // Select prediction reference frames. 
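+    // (yv12_mb[] was filled by set_params_rd_pick_inter_mode() above; pre[0]
+    // and pre[1] are pointed at the planes of the selected reference frames.)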
+ for (i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } + + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + mbmi->angle_delta[PLANE_TYPE_UV] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->ref_mv_idx = 0; + + const int64_t ref_best_rd = search_state.best_rd; + int disable_skip = 0; + RD_STATS rd_stats, rd_stats_y, rd_stats_uv; + av1_init_rd_stats(&rd_stats); + + const int ref_frame_cost = comp_pred + ? ref_costs_comp[ref_frame][second_ref_frame] + : ref_costs_single[ref_frame]; + const int compmode_cost = + is_comp_ref_allowed(mbmi->sb_type) ? comp_inter_cost[comp_pred] : 0; + const int real_compmode_cost = + cm->current_frame.reference_mode == REFERENCE_MODE_SELECT + ? compmode_cost + : 0; + // Point to variables that are maintained between loop iterations + args.single_newmv = search_state.single_newmv; + args.single_newmv_rate = search_state.single_newmv_rate; + args.single_newmv_valid = search_state.single_newmv_valid; + args.single_comp_cost = real_compmode_cost; + args.ref_frame_cost = ref_frame_cost; + if (is_single_pred) { + args.simple_rd_state = x->simple_rd_state[mode_enum]; + } + + int64_t skip_rd[2] = { search_state.best_skip_rd[0], + search_state.best_skip_rd[1] }; + int64_t this_rd = handle_inter_mode( + cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, + &disable_skip, &args, ref_best_rd, tmp_buf, &x->comp_rd_buffer, + &best_est_rd, do_tx_search, inter_modes_info, &motion_mode_cand, + skip_rd, &inter_cost_info_from_tpl); + + if (sf->inter_sf.prune_comp_search_by_single_result > 0 && + is_inter_singleref_mode(this_mode) && args.single_ref_first_pass) { + collect_single_states(x, &search_state, mbmi); + } + + if (this_rd == INT64_MAX) continue; + + if (mbmi->skip) { + rd_stats_y.rate = 0; + rd_stats_uv.rate = 0; + } + + if (sf->inter_sf.prune_compound_using_single_ref && is_single_pred && + this_rd < ref_frame_rd[ref_frame]) { + ref_frame_rd[ref_frame] = this_rd; + } + + // Did this mode help, i.e., is it the new best mode + if (this_rd < search_state.best_rd) { + assert(IMPLIES(comp_pred, + cm->current_frame.reference_mode != SINGLE_REFERENCE)); + search_state.best_pred_sse = x->pred_sse[ref_frame]; + update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y, + &rd_stats_uv, mode_enum, x, do_tx_search); + if (do_tx_search) search_state.best_skip_rd[0] = skip_rd[0]; + search_state.best_skip_rd[1] = skip_rd[1]; + } + if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) { + // Add this mode to motion mode candidate list for motion mode search + // if using motion_mode_for_winner_cand speed feature + handle_winner_cand(mbmi, &best_motion_mode_cands, + max_winner_motion_mode_cand, this_rd, + &motion_mode_cand, args.skip_motion_mode); + } + + /* keep record of best compound/single-only prediction */ + if (!disable_skip) { + record_best_compound(cm->current_frame.reference_mode, &rd_stats, + comp_pred, x->rdmult, &search_state, compmode_cost); + } + } + + if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) { + // For the single ref winner candidates, evaluate other motion modes (non + // simple translation). 
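+    // (So far these candidates have only been searched with
+    // SIMPLE_TRANSLATION; see the comment above handle_winner_cand().)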
+ evaluate_motion_mode_for_winner_candidates( + cpi, x, rd_cost, &args, tile_data, ctx, yv12_mb, + &best_motion_mode_cands, do_tx_search, bsize, &best_est_rd, + &search_state); + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, do_tx_search_time); +#endif + if (do_tx_search != 1) { + inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr); + search_state.best_rd = best_rd_so_far; + search_state.best_mode_index = THR_INVALID; + // Initialize best mode stats for winner mode processing + x->winner_mode_count = 0; + store_winner_mode_stats( + &cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, NULL, bsize, + best_rd_so_far, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, + do_tx_search); + inter_modes_info->num = + inter_modes_info->num < cpi->sf.rt_sf.num_inter_modes_for_tx_search + ? inter_modes_info->num + : cpi->sf.rt_sf.num_inter_modes_for_tx_search; + const int64_t top_est_rd = + inter_modes_info->num > 0 + ? inter_modes_info + ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx] + : INT64_MAX; + for (int j = 0; j < inter_modes_info->num; ++j) { + const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx; + *mbmi = inter_modes_info->mbmi_arr[data_idx]; + int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx]; + if (curr_est_rd * 0.80 > top_est_rd) break; + + x->force_skip = 0; + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Select prediction reference frames. + const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; + for (i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); + if (mbmi->motion_mode == OBMC_CAUSAL) { + av1_build_obmc_inter_predictors_sb(cm, xd); + } + + RD_STATS rd_stats; + RD_STATS rd_stats_y; + RD_STATS rd_stats_uv; + const int mode_rate = inter_modes_info->mode_rate_arr[data_idx]; + int64_t skip_rd = INT64_MAX; + if (cpi->sf.inter_sf.txfm_rd_gate_level) { + // Check if the mode is good enough based on skip RD + int64_t curr_sse = inter_modes_info->sse_arr[data_idx]; + skip_rd = RDCOST(x->rdmult, mode_rate, curr_sse); + int eval_txfm = + check_txfm_eval(x, bsize, search_state.best_skip_rd[0], skip_rd, + cpi->sf.inter_sf.txfm_rd_gate_level, 0); + if (!eval_txfm) continue; + } + + if (!av1_txfm_search(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, + mode_rate, search_state.best_rd)) { + continue; + } else if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse, + rd_stats.dist, + rd_stats_y.rate + rd_stats_uv.rate + + x->skip_cost[skip_ctx][mbmi->skip]); + } + rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist); + + const THR_MODES mode_enum = get_prediction_mode_idx( + mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Collect mode stats for multiwinner mode processing + const int txfm_search_done = 1; + store_winner_mode_stats( + &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv, + mode_enum, NULL, bsize, rd_stats.rdcost, + cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, + txfm_search_done); + + if (rd_stats.rdcost < search_state.best_rd) { + update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y, + &rd_stats_uv, mode_enum, x, txfm_search_done); + search_state.best_skip_rd[0] = skip_rd; + } + } + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, 
do_tx_search_time); +#endif + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_intra_mode_time); +#endif + + // Gate intra mode evaluation if best of inter is skip except when source + // variance is extremely low + if (sf->intra_sf.skip_intra_in_interframe && + (x->source_variance > sf->intra_sf.src_var_thresh_intra_skip)) { + if (inter_cost >= 0 && intra_cost >= 0) { + aom_clear_system_state(); + const NN_CONFIG *nn_config = (AOMMIN(cm->width, cm->height) <= 480) + ? &av1_intrap_nn_config + : &av1_intrap_hd_nn_config; + float nn_features[6]; + float scores[2] = { 0.0f }; + float probs[2] = { 0.0f }; + + nn_features[0] = (float)search_state.best_mbmode.skip; + nn_features[1] = (float)mi_size_wide_log2[bsize]; + nn_features[2] = (float)mi_size_high_log2[bsize]; + nn_features[3] = (float)intra_cost; + nn_features[4] = (float)inter_cost; + const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd); + const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd); + nn_features[5] = (float)(ac_q_max / ac_q); + + av1_nn_predict(nn_features, nn_config, 1, scores); + aom_clear_system_state(); + av1_nn_softmax(scores, probs, 2); + + if (probs[1] > 0.8) search_state.intra_search_state.skip_intra_modes = 1; + } else if ((search_state.best_mbmode.skip) && + (sf->intra_sf.skip_intra_in_interframe >= 2)) { + search_state.intra_search_state.skip_intra_modes = 1; + } + } + + const int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME]; + for (int j = 0; j < intra_mode_num; ++j) { + if (sf->intra_sf.skip_intra_in_interframe && + search_state.intra_search_state.skip_intra_modes) + break; + const THR_MODES mode_enum = intra_mode_idx_ls[j]; + const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum]; + const PREDICTION_MODE this_mode = mode_def->mode; + + assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME); + assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME); + init_mbmi(mbmi, this_mode, av1_mode_defs[mode_enum].ref_frame, cm); + x->force_skip = 0; + + if (this_mode != DC_PRED) { + // Only search the oblique modes if the best so far is + // one of the neighboring directional modes + if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && + (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) { + if (search_state.best_mode_index != THR_INVALID && + search_state.best_mbmode.ref_frame[0] > INTRA_FRAME) + continue; + } + if (sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra( + this_mode, search_state.intra_search_state.best_intra_mode)) + continue; + } + } + + RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv; + intra_rd_stats.rdcost = av1_handle_intra_mode( + &search_state.intra_search_state, cpi, x, bsize, intra_ref_frame_cost, + ctx, 0, &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv, + search_state.best_rd, &search_state.best_intra_rd, + search_state.best_mbmode.skip); + // Collect mode stats for multiwinner mode processing + const int txfm_search_done = 1; + store_winner_mode_stats( + &cpi->common, x, mbmi, &intra_rd_stats, &intra_rd_stats_y, + &intra_rd_stats_uv, mode_enum, NULL, bsize, intra_rd_stats.rdcost, + cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, + txfm_search_done); + if (intra_rd_stats.rdcost < search_state.best_rd) { + update_search_state(&search_state, rd_cost, ctx, &intra_rd_stats, + &intra_rd_stats_y, &intra_rd_stats_uv, mode_enum, x, + txfm_search_done); + } + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_intra_mode_time); +#endif + + int winner_mode_count = 
cpi->sf.winner_mode_sf.enable_multiwinner_mode_process + ? x->winner_mode_count + : 1; + // In effect only when fast tx search speed features are enabled. + refine_winner_mode_tx( + cpi, x, rd_cost, bsize, ctx, &search_state.best_mode_index, + &search_state.best_mbmode, yv12_mb, search_state.best_rate_y, + search_state.best_rate_uv, &search_state.best_skip2, winner_mode_count); + + // Initialize default mode evaluation params + set_mode_eval_params(cpi, x, DEFAULT_EVAL); + + // Only try palette mode when the best mode so far is an intra mode. + const int try_palette = + cpi->oxcf.enable_palette && + av1_allow_palette(features->allow_screen_content_tools, mbmi->sb_type) && + !is_inter_mode(search_state.best_mbmode.mode); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + RD_STATS this_rd_cost; + int this_skippable = 0; + if (try_palette) { + this_skippable = av1_search_palette_mode( + cpi, x, &this_rd_cost, ctx, bsize, mbmi, pmi, ref_costs_single, + &search_state.intra_search_state, search_state.best_rd); + if (this_rd_cost.rdcost < search_state.best_rd) { + search_state.best_mode_index = THR_DC; + mbmi->mv[0].as_int = 0; + rd_cost->rate = this_rd_cost.rate; + rd_cost->dist = this_rd_cost.dist; + rd_cost->rdcost = this_rd_cost.rdcost; + search_state.best_rd = rd_cost->rdcost; + search_state.best_mbmode = *mbmi; + search_state.best_skip2 = 0; + search_state.best_mode_skippable = this_skippable; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + } + } + + search_state.best_mbmode.skip_mode = 0; + if (cm->current_frame.skip_mode_info.skip_mode_flag && + is_comp_ref_allowed(bsize)) { + const struct segmentation *const seg = &cm->seg; + unsigned char segment_id = mbmi->segment_id; + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, yv12_mb); + } + } + + // Make sure that the ref_mv_idx is only nonzero when we're + // using a mode which can support ref_mv_idx + if (search_state.best_mbmode.ref_mv_idx != 0 && + !(search_state.best_mbmode.mode == NEWMV || + search_state.best_mbmode.mode == NEW_NEWMV || + have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) { + search_state.best_mbmode.ref_mv_idx = 0; + } + + if (search_state.best_mode_index == THR_INVALID || + search_state.best_rd >= best_rd_so_far) { + rd_cost->rate = INT_MAX; + rd_cost->rdcost = INT64_MAX; + return; + } + + const InterpFilter interp_filter = features->interp_filter; + assert((interp_filter == SWITCHABLE) || + (interp_filter == + search_state.best_mbmode.interp_filters.as_filters.y_filter) || + !is_inter_block(&search_state.best_mbmode)); + assert((interp_filter == SWITCHABLE) || + (interp_filter == + search_state.best_mbmode.interp_filters.as_filters.x_filter) || + !is_inter_block(&search_state.best_mbmode)); + + if (!cpi->rc.is_src_frame_alt_ref && cpi->sf.inter_sf.adaptive_rd_thresh) { + av1_update_rd_thresh_fact(cm, x->thresh_freq_fact, + sf->inter_sf.adaptive_rd_thresh, bsize, + search_state.best_mode_index); + } + + // macroblock modes + *mbmi = search_state.best_mbmode; + x->force_skip |= search_state.best_skip2; + + // Note: this section is needed since the mode may have been forced to + // GLOBALMV by the all-zero mode handling of ref-mv. 
+ if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) { + // Correct the interp filters for GLOBALMV + if (is_nontrans_global_motion(xd, xd->mi[0])) { + int_interpfilters filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); + assert(mbmi->interp_filters.as_int == filters.as_int); + (void)filters; + } + } + + for (i = 0; i < REFERENCE_MODES; ++i) { + if (search_state.intra_search_state.best_pred_rd[i] == INT64_MAX) { + search_state.best_pred_diff[i] = INT_MIN; + } else { + search_state.best_pred_diff[i] = + search_state.best_rd - + search_state.intra_search_state.best_pred_rd[i]; + } + } + + x->force_skip |= search_state.best_mode_skippable; + + assert(search_state.best_mode_index != THR_INVALID); + +#if CONFIG_INTERNAL_STATS + store_coding_context(x, ctx, search_state.best_mode_index, + search_state.best_pred_diff, + search_state.best_mode_skippable); +#else + store_coding_context(x, ctx, search_state.best_pred_diff, + search_state.best_mode_skippable); +#endif // CONFIG_INTERNAL_STATS + + if (pmi->palette_size[1] > 0) { + assert(try_palette); + av1_restore_uv_color_map(cpi, x); + } +} + +void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, + TileDataEnc *tile_data, MACROBLOCK *x, + int mi_row, int mi_col, + RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { + const AV1_COMMON *const cm = &cpi->common; + const FeatureFlags *const features = &cm->features; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + unsigned char segment_id = mbmi->segment_id; + const int comp_pred = 0; + int i; + int64_t best_pred_diff[REFERENCE_MODES]; + unsigned int ref_costs_single[REF_FRAMES]; + unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; + int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)]; + InterpFilter best_filter = SWITCHABLE; + int64_t this_rd = INT64_MAX; + int rate2 = 0; + const int64_t distortion2 = 0; + (void)mi_row; + (void)mi_col; + (void)tile_data; + + av1_collect_neighbors_ref_counts(xd); + + estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single, + ref_costs_comp); + + for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; + for (i = LAST_FRAME; i < REF_FRAMES; ++i) x->pred_mv_sad[i] = INT_MAX; + + rd_cost->rate = INT_MAX; + + assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)); + + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->mode = GLOBALMV; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->uv_mode = UV_DC_PRED; + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) + mbmi->ref_frame[0] = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + else + mbmi->ref_frame[0] = LAST_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->mv[0].as_int = + gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]], + features->allow_high_precision_mv, bsize, mi_col, + mi_row, features->cur_frame_force_integer_mv) + .as_int; + mbmi->tx_size = max_txsize_lookup[bsize]; + x->force_skip = 1; + + mbmi->ref_mv_idx = 0; + + mbmi->motion_mode = SIMPLE_TRANSLATION; + av1_count_overlappable_neighbors(cm, xd); + if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) { + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref); + // Select the samples according to motion vector difference + if (mbmi->num_proj_ref > 1) + mbmi->num_proj_ref = 
av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, + mbmi->num_proj_ref, bsize); + } + + const InterpFilter interp_filter = features->interp_filter; + set_default_interp_filters(mbmi, interp_filter); + + if (interp_filter != SWITCHABLE) { + best_filter = interp_filter; + } else { + best_filter = EIGHTTAP_REGULAR; + if (av1_is_interp_needed(xd) && + x->source_variance >= + cpi->sf.interp_sf.disable_filter_search_var_thresh) { + int rs; + int best_rs = INT_MAX; + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { + mbmi->interp_filters = av1_broadcast_interp_filter(i); + rs = av1_get_switchable_rate(x, xd, interp_filter); + if (rs < best_rs) { + best_rs = rs; + best_filter = mbmi->interp_filters.as_filters.y_filter; + } + } + } + } + // Set the appropriate filter + mbmi->interp_filters = av1_broadcast_interp_filter(best_filter); + rate2 += av1_get_switchable_rate(x, xd, interp_filter); + + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) + rate2 += comp_inter_cost[comp_pred]; + + // Estimate the reference frame signaling cost and add it + // to the rolling cost variable. + rate2 += ref_costs_single[LAST_FRAME]; + this_rd = RDCOST(x->rdmult, rate2, distortion2); + + rd_cost->rate = rate2; + rd_cost->dist = distortion2; + rd_cost->rdcost = this_rd; + + if (this_rd >= best_rd_so_far) { + rd_cost->rate = INT_MAX; + rd_cost->rdcost = INT64_MAX; + return; + } + + assert((interp_filter == SWITCHABLE) || + (interp_filter == mbmi->interp_filters.as_filters.y_filter)); + + if (cpi->sf.inter_sf.adaptive_rd_thresh) { + av1_update_rd_thresh_fact(cm, x->thresh_freq_fact, + cpi->sf.inter_sf.adaptive_rd_thresh, bsize, + THR_GLOBALMV); + } + + av1_zero(best_pred_diff); + +#if CONFIG_INTERNAL_STATS + store_coding_context(x, ctx, THR_GLOBALMV, best_pred_diff, 0); +#else + store_coding_context(x, ctx, best_pred_diff, 0); +#endif // CONFIG_INTERNAL_STATS +} + +struct calc_target_weighted_pred_ctxt { + const MACROBLOCK *x; + const uint8_t *tmp; + int tmp_stride; + int overlap; +}; + +static INLINE void calc_target_weighted_pred_above( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) { + (void)nb_mi; + (void)num_planes; + (void)rel_mi_row; + (void)dir; + + struct calc_target_weighted_pred_ctxt *ctxt = + (struct calc_target_weighted_pred_ctxt *)fun_ctxt; + + const int bw = xd->width << MI_SIZE_LOG2; + const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); + + int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE); + int32_t *mask = ctxt->x->mask_buf + (rel_mi_col * MI_SIZE); + const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE; + const int is_hbd = is_cur_buf_hbd(xd); + + if (!is_hbd) { + for (int row = 0; row < ctxt->overlap; ++row) { + const uint8_t m0 = mask1d[row]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + for (int col = 0; col < op_mi_size * MI_SIZE; ++col) { + wsrc[col] = m1 * tmp[col]; + mask[col] = m0; + } + wsrc += bw; + mask += bw; + tmp += ctxt->tmp_stride; + } + } else { + const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp); + + for (int row = 0; row < ctxt->overlap; ++row) { + const uint8_t m0 = mask1d[row]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + for (int col = 0; col < op_mi_size * MI_SIZE; ++col) { + wsrc[col] = m1 * tmp16[col]; + mask[col] = m0; + } + wsrc += bw; + mask += bw; + tmp16 += ctxt->tmp_stride; + } + } +} + +static INLINE void calc_target_weighted_pred_left( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO 
*nb_mi, void *fun_ctxt, const int num_planes) {
+  (void)nb_mi;
+  (void)num_planes;
+  (void)rel_mi_col;
+  (void)dir;
+
+  struct calc_target_weighted_pred_ctxt *ctxt =
+      (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
+
+  const int bw = xd->width << MI_SIZE_LOG2;
+  const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
+
+  int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
+  int32_t *mask = ctxt->x->mask_buf + (rel_mi_row * MI_SIZE * bw);
+  const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
+  const int is_hbd = is_cur_buf_hbd(xd);
+
+  if (!is_hbd) {
+    for (int row = 0; row < op_mi_size * MI_SIZE; ++row) {
+      for (int col = 0; col < ctxt->overlap; ++col) {
+        const uint8_t m0 = mask1d[col];
+        const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+        wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+                    (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+        mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+      }
+      wsrc += bw;
+      mask += bw;
+      tmp += ctxt->tmp_stride;
+    }
+  } else {
+    const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
+
+    for (int row = 0; row < op_mi_size * MI_SIZE; ++row) {
+      for (int col = 0; col < ctxt->overlap; ++col) {
+        const uint8_t m0 = mask1d[col];
+        const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+        wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+                    (tmp16[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+        mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+      }
+      wsrc += bw;
+      mask += bw;
+      tmp16 += ctxt->tmp_stride;
+    }
+  }
+}
+
+// This function has a structure similar to av1_build_obmc_inter_prediction().
+//
+// The OBMC predictor is computed as:
+//
+//  PObmc(x,y) =
+//    AOM_BLEND_A64(Mh(x),
+//                  AOM_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)),
+//                  PLeft(x, y))
+//
+// Scaling up by AOM_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate
+// rounding, this can be written as:
+//
+//  AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * PObmc(x,y) =
+//    Mh(x) * Mv(y) * P(x,y) +
+//      Mh(x) * Cv(y) * PAbove(x,y) +
+//      AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// Where:
+//
+//  Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y)
+//  Ch(x) = AOM_BLEND_A64_MAX_ALPHA - Mh(x)
+//
+// This function computes 'wsrc' and 'mask' as:
+//
+//  wsrc(x, y) =
+//    AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * src(x, y) -
+//      Mh(x) * Cv(y) * PAbove(x,y) -
+//      AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+//  mask(x, y) = Mh(x) * Mv(y)
+//
+// These can then be used to efficiently approximate the error for any
+// predictor P in the context of the provided neighbouring predictors by
+// computing:
+//
+//  error(x, y) =
+//    wsrc(x, y) - mask(x, y) * P(x, y) / (AOM_BLEND_A64_MAX_ALPHA ** 2)
+//
+static AOM_INLINE void calc_target_weighted_pred(
+    const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd,
+    const uint8_t *above, int above_stride, const uint8_t *left,
+    int left_stride) {
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const int bw = xd->width << MI_SIZE_LOG2;
+  const int bh = xd->height << MI_SIZE_LOG2;
+  int32_t *mask_buf = x->mask_buf;
+  int32_t *wsrc_buf = x->wsrc_buf;
+
+  const int is_hbd = is_cur_buf_hbd(xd);
+  const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
+
+  // plane 0 should not be sub-sampled
+  assert(xd->plane[0].subsampling_x == 0);
+  assert(xd->plane[0].subsampling_y == 0);
+
+  av1_zero_array(wsrc_buf, bw * bh);
+  for (int i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA;
+
+  // handle above row
+  if (xd->up_available) {
+    const int overlap =
+        AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
+    struct calc_target_weighted_pred_ctxt ctxt = { x, above, above_stride,
+                                                   overlap };
+    foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd,
+                                  max_neighbor_obmc[mi_size_wide_log2[bsize]],
+                                  calc_target_weighted_pred_above, &ctxt);
+  }
+
+  for (int i = 0; i < bw * bh; ++i) {
+    wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
+    mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
+  }
+
+  // handle left column
+  if (xd->left_available) {
+    const int overlap =
+        AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
+    struct calc_target_weighted_pred_ctxt ctxt = { x, left, left_stride,
+                                                   overlap };
+    foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd,
+                                 max_neighbor_obmc[mi_size_high_log2[bsize]],
+                                 calc_target_weighted_pred_left, &ctxt);
+  }
+
+  if (!is_hbd) {
+    const uint8_t *src = x->plane[0].src.buf;
+
+    for (int row = 0; row < bh; ++row) {
+      for (int col = 0; col < bw; ++col) {
+        wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+      }
+      wsrc_buf += bw;
+      src += x->plane[0].src.stride;
+    }
+  } else {
+    const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+    for (int row = 0; row < bh; ++row) {
+      for (int col = 0; col < bw; ++col) {
+        wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+      }
+      wsrc_buf += bw;
+      src += x->plane[0].src.stride;
+    }
+  }
+}
+
+/* Use standard 3x3 Sobel matrix. Macro so it can be used for either high or
+   low bit-depth arrays. */
+#define SOBEL_X(src, stride, i, j)                       \
+  ((src)[((i)-1) + (stride) * ((j)-1)] -                 \
+   (src)[((i) + 1) + (stride) * ((j)-1)] +  /* NOLINT */ \
+   2 * (src)[((i)-1) + (stride) * (j)] -    /* NOLINT */ \
+   2 * (src)[((i) + 1) + (stride) * (j)] +  /* NOLINT */ \
+   (src)[((i)-1) + (stride) * ((j) + 1)] -  /* NOLINT */ \
+   (src)[((i) + 1) + (stride) * ((j) + 1)]) /* NOLINT */
+#define SOBEL_Y(src, stride, i, j)                       \
+  ((src)[((i)-1) + (stride) * ((j)-1)] +                 \
+   2 * (src)[(i) + (stride) * ((j)-1)] +    /* NOLINT */ \
+   (src)[((i) + 1) + (stride) * ((j)-1)] -  /* NOLINT */ \
+   (src)[((i)-1) + (stride) * ((j) + 1)] -  /* NOLINT */ \
+   2 * (src)[(i) + (stride) * ((j) + 1)] -  /* NOLINT */ \
+   (src)[((i) + 1) + (stride) * ((j) + 1)]) /* NOLINT */
+
+sobel_xy av1_sobel(const uint8_t *input, int stride, int i, int j,
+                   bool high_bd) {
+  int16_t s_x;
+  int16_t s_y;
+  if (high_bd) {
+    const uint16_t *src = CONVERT_TO_SHORTPTR(input);
+    s_x = SOBEL_X(src, stride, i, j);
+    s_y = SOBEL_Y(src, stride, i, j);
+  } else {
+    s_x = SOBEL_X(input, stride, i, j);
+    s_y = SOBEL_Y(input, stride, i, j);
+  }
+  sobel_xy r = { .x = s_x, .y = s_y };
+  return r;
+}
+
+// 8-tap Gaussian convolution filter with sigma = 1.3; sums to 128, and
+// all coefficients must be even.
+DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 2, 12, 30, 40,
+                                                               30, 12, 2, 0 };
+
+void av1_gaussian_blur(const uint8_t *src, int src_stride, int w, int h,
+                       uint8_t *dst, bool high_bd, int bd) {
+  ConvolveParams conv_params = get_conv_params(0, 0, bd);
+  InterpFilterParams filter = { .filter_ptr = gauss_filter,
+                                .taps = 8,
+                                .subpel_shifts = 0,
+                                .interp_filter = EIGHTTAP_REGULAR };
+  // Requirements from the vector-optimized implementations.
+  assert(h % 4 == 0);
+  assert(w % 8 == 0);
+  // Because we use an eight-tap filter, the stride should be at least 7 + w.
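+  // (An 8-tap horizontal pass reads up to taps - 1 = 7 samples beyond the
+  // output width on each row, hence the w + 7 requirement below. Note the
+  // gauss_filter taps above sum to 2 + 12 + 30 + 40 + 30 + 12 + 2 = 128.)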
+ assert(src_stride >= w + 7); +#if CONFIG_AV1_HIGHBITDEPTH + if (high_bd) { + av1_highbd_convolve_2d_sr(CONVERT_TO_SHORTPTR(src), src_stride, + CONVERT_TO_SHORTPTR(dst), w, w, h, &filter, + &filter, 0, 0, &conv_params, bd); + } else { + av1_convolve_2d_sr(src, src_stride, dst, w, w, h, &filter, &filter, 0, 0, + &conv_params); + } +#else + (void)high_bd; + av1_convolve_2d_sr(src, src_stride, dst, w, w, h, &filter, &filter, 0, 0, + &conv_params); +#endif +} + +static EdgeInfo edge_probability(const uint8_t *input, int w, int h, + bool high_bd, int bd) { + // The probability of an edge in the whole image is the same as the highest + // probability of an edge for any individual pixel. Use Sobel as the metric + // for finding an edge. + uint16_t highest = 0; + uint16_t highest_x = 0; + uint16_t highest_y = 0; + // Ignore the 1 pixel border around the image for the computation. + for (int j = 1; j < h - 1; ++j) { + for (int i = 1; i < w - 1; ++i) { + sobel_xy g = av1_sobel(input, w, i, j, high_bd); + // Scale down to 8-bit to get same output regardless of bit depth. + int16_t g_x = g.x >> (bd - 8); + int16_t g_y = g.y >> (bd - 8); + uint16_t magnitude = (uint16_t)sqrt(g_x * g_x + g_y * g_y); + highest = AOMMAX(highest, magnitude); + highest_x = AOMMAX(highest_x, g_x); + highest_y = AOMMAX(highest_y, g_y); + } + } + EdgeInfo ei = { .magnitude = highest, .x = highest_x, .y = highest_y }; + return ei; +} + +/* Uses most of the Canny edge detection algorithm to find if there are any + * edges in the image. + */ +EdgeInfo av1_edge_exists(const uint8_t *src, int src_stride, int w, int h, + bool high_bd, int bd) { + if (w < 3 || h < 3) { + EdgeInfo n = { .magnitude = 0, .x = 0, .y = 0 }; + return n; + } + uint8_t *blurred; + if (high_bd) { + blurred = CONVERT_TO_BYTEPTR(aom_memalign(32, sizeof(uint16_t) * w * h)); + } else { + blurred = (uint8_t *)aom_memalign(32, sizeof(uint8_t) * w * h); + } + av1_gaussian_blur(src, src_stride, w, h, blurred, high_bd, bd); + // Skip the non-maximum suppression step in Canny edge detection. We just + // want a probability of an edge existing in the buffer, which is determined + // by the strongest edge in it -- we don't need to eliminate the weaker + // edges. Use Sobel for the edge detection. + EdgeInfo prob = edge_probability(blurred, w, h, high_bd, bd); + if (high_bd) { + aom_free(CONVERT_TO_SHORTPTR(blurred)); + } else { + aom_free(blurred); + } + return prob; +} diff --git a/libs/libaom/src/av1/encoder/rdopt.h b/libs/libaom/src/av1/encoder/rdopt.h new file mode 100644 index 000000000..c7c99ac4b --- /dev/null +++ b/libs/libaom/src/av1/encoder/rdopt.h @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_H_
+#define AOM_AV1_ENCODER_RDOPT_H_
+
+#include <stdbool.h>
+
+#include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/rdopt_utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define COMP_TYPE_RD_THRESH_SCALE 11
+#define COMP_TYPE_RD_THRESH_SHIFT 4
+#define MAX_WINNER_MOTION_MODES 10
+
+struct TileInfo;
+struct macroblock;
+struct RD_STATS;
+
+// Returns the number of colors in 'src'.
+int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+                     int *val_count);
+// Same as av1_count_colors(), but for high-bitdepth mode.
+int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+                            int cols, int bit_depth, int *val_count);
+
+static INLINE int av1_cost_skip_txb(MACROBLOCK *x, const TXB_CTX *const txb_ctx,
+                                    int plane, TX_SIZE tx_size) {
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const LV_MAP_COEFF_COST *const coeff_costs =
+      &x->coeff_costs[txs_ctx][plane_type];
+  return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+}
+
+void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
+                               struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                               PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+
+unsigned int av1_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
+                                           const struct buf_2d *ref,
+                                           BLOCK_SIZE bs);
+unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
+                                                const struct buf_2d *ref,
+                                                BLOCK_SIZE bs, int bd);
+
+void av1_rd_pick_inter_mode_sb(struct AV1_COMP *cpi,
+                               struct TileDataEnc *tile_data,
+                               struct macroblock *x, struct RD_STATS *rd_cost,
+                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                               int64_t best_rd_so_far);
+
+void av1_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
+
+void av1_nonrd_pick_inter_mode_sb(struct AV1_COMP *cpi,
+                                  struct TileDataEnc *tile_data,
+                                  struct macroblock *x,
+                                  struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                                  PICK_MODE_CONTEXT *ctx,
+                                  int64_t best_rd_so_far);
+
+void av1_rd_pick_inter_mode_sb_seg_skip(
+    const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+    struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost,
+    BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+// The best edge strength seen in the block, as well as the best x and y
+// components of edge strength seen.
+typedef struct {
+  uint16_t magnitude;
+  uint16_t x;
+  uint16_t y;
+} EdgeInfo;
+
+/** Returns an integer indicating the strength of the edge.
+ * 0 means no edge found, 556 is the strength of a solid black/white edge,
+ * and the number may range higher if the signal is even stronger (e.g., on a
+ * corner). high_bd is a bool indicating the source should be treated
+ * as a 16-bit array. bd is the bit depth.
+ */
+EdgeInfo av1_edge_exists(const uint8_t *src, int src_stride, int w, int h,
+                         bool high_bd, int bd);
+
+/** Applies a Gaussian blur with sigma = 1.3. Used by av1_edge_exists and
+ * tests.
+ */
+void av1_gaussian_blur(const uint8_t *src, int src_stride, int w, int h,
+                       uint8_t *dst, bool high_bd, int bd);
+
+/* Applies standard 3x3 Sobel matrix.
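+ *
+ * As implemented by the SOBEL_X / SOBEL_Y macros in rdopt.c, the kernels are:
+ *
+ *        [ 1  0 -1 ]        [  1   2   1 ]
+ *   Gx = [ 2  0 -2 ],  Gy = [  0   0   0 ]
+ *        [ 1  0 -1 ]        [ -1  -2  -1 ]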
+ */
+typedef struct {
+  int16_t x;
+  int16_t y;
+} sobel_xy;
+
+sobel_xy av1_sobel(const uint8_t *input, int stride, int i, int j,
+                   bool high_bd);
+
+void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE int coded_to_superres_mi(int mi_col, int denom) {
+  return (mi_col * denom + SCALE_NUMERATOR / 2) / SCALE_NUMERATOR;
+}
+#endif
+
+static INLINE int av1_encoder_get_relative_dist(const OrderHintInfo *oh, int a,
+                                                int b) {
+  if (!oh->enable_order_hint) return 0;
+
+  assert(a >= 0 && b >= 0);
+  return (a - b);
+}
+
+// This function will return the number of mi's in a superblock.
+static INLINE int av1_get_sb_mi_size(const AV1_COMMON *const cm) {
+  const int mi_alloc_size_1d = mi_size_wide[cm->mi_params.mi_alloc_bsize];
+  int sb_mi_rows =
+      (mi_size_wide[cm->seq_params.sb_size] + mi_alloc_size_1d - 1) /
+      mi_alloc_size_1d;
+  assert(mi_size_wide[cm->seq_params.sb_size] ==
+         mi_size_high[cm->seq_params.sb_size]);
+  int sb_mi_size = sb_mi_rows * sb_mi_rows;
+
+  return sb_mi_size;
+}
+
+// This function will copy usable ref_mv_stack[ref_frame][4] and
+// weight[ref_frame][4] information from ref_mv_stack[ref_frame][8] and
+// weight[ref_frame][8].
+static INLINE void av1_copy_usable_ref_mv_stack_and_weight(
+    const MACROBLOCKD *xd, MB_MODE_INFO_EXT *const mbmi_ext,
+    MV_REFERENCE_FRAME ref_frame) {
+  memcpy(mbmi_ext->weight[ref_frame], xd->weight[ref_frame],
+         USABLE_REF_MV_STACK_SIZE * sizeof(xd->weight[0][0]));
+  memcpy(mbmi_ext->ref_mv_stack[ref_frame], xd->ref_mv_stack[ref_frame],
+         USABLE_REF_MV_STACK_SIZE * sizeof(xd->ref_mv_stack[0][0]));
+}
+
+// This function prunes the mode if either of its reference frames falls in
+// the pruning list.
+static INLINE int prune_ref(const MV_REFERENCE_FRAME *const ref_frame,
+                            const OrderHintInfo *const order_hint_info,
+                            const unsigned int *const ref_display_order_hint,
+                            const unsigned int frame_display_order_hint,
+                            const int *ref_frame_list) {
+  for (int i = 0; i < 2; i++) {
+    if (ref_frame_list[i] == NONE_FRAME) continue;
+
+    if (ref_frame[0] == ref_frame_list[i] ||
+        ref_frame[1] == ref_frame_list[i]) {
+      if (av1_encoder_get_relative_dist(
+              order_hint_info,
+              ref_display_order_hint[ref_frame_list[i] - LAST_FRAME],
+              frame_display_order_hint) < 0)
+        return 1;
+    }
+  }
+  return 0;
+}
+
+static INLINE int prune_ref_by_selective_ref_frame(
+    const AV1_COMP *const cpi, const MACROBLOCK *const x,
+    const MV_REFERENCE_FRAME *const ref_frame,
+    const unsigned int *const ref_display_order_hint) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  if (!sf->inter_sf.selective_ref_frame) return 0;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
+  const int comp_pred = ref_frame[1] > INTRA_FRAME;
+
+  if (sf->inter_sf.selective_ref_frame >= 2 ||
+      (sf->inter_sf.selective_ref_frame == 1 && comp_pred)) {
+    int ref_frame_list[2] = { LAST3_FRAME, LAST2_FRAME };
+
+    if (x != NULL) {
+      if (x->search_ref_frame[LAST3_FRAME]) ref_frame_list[0] = NONE_FRAME;
+      if (x->search_ref_frame[LAST2_FRAME]) ref_frame_list[1] = NONE_FRAME;
+    }
+
+    if (prune_ref(ref_frame, order_hint_info, ref_display_order_hint,
+                  ref_display_order_hint[GOLDEN_FRAME - LAST_FRAME],
+                  ref_frame_list))
+      return 1;
+  }
+
+  if (sf->inter_sf.selective_ref_frame >= 3) {
+    int ref_frame_list[2] = { ALTREF2_FRAME, BWDREF_FRAME };
+
+    if (x != NULL) {
+      if (x->search_ref_frame[ALTREF2_FRAME]) ref_frame_list[0] =
NONE_FRAME; + if (x->search_ref_frame[BWDREF_FRAME]) ref_frame_list[1] = NONE_FRAME; + } + + if (prune_ref(ref_frame, order_hint_info, ref_display_order_hint, + ref_display_order_hint[LAST_FRAME - LAST_FRAME], + ref_frame_list)) + return 1; + } + + return 0; +} + +// This function will copy the best reference mode information from +// MB_MODE_INFO_EXT to MB_MODE_INFO_EXT_FRAME. +static INLINE void av1_copy_mbmi_ext_to_mbmi_ext_frame( + MB_MODE_INFO_EXT_FRAME *mbmi_ext_best, + const MB_MODE_INFO_EXT *const mbmi_ext, uint8_t ref_frame_type) { + memcpy(mbmi_ext_best->ref_mv_stack, mbmi_ext->ref_mv_stack[ref_frame_type], + sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE])); + memcpy(mbmi_ext_best->weight, mbmi_ext->weight[ref_frame_type], + sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE])); + mbmi_ext_best->mode_context = mbmi_ext->mode_context[ref_frame_type]; + mbmi_ext_best->ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + memcpy(mbmi_ext_best->global_mvs, mbmi_ext->global_mvs, + sizeof(mbmi_ext->global_mvs)); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RDOPT_H_ diff --git a/libs/libaom/src/av1/encoder/rdopt_data_defs.h b/libs/libaom/src/av1/encoder/rdopt_data_defs.h new file mode 100644 index 000000000..ca7ef810f --- /dev/null +++ b/libs/libaom/src/av1/encoder/rdopt_data_defs.h @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_ +#define AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +static const THR_MODES intra_to_mode_idx[INTRA_MODE_NUM] = { + THR_DC, // DC_PRED, + THR_V_PRED, // V_PRED, + THR_H_PRED, // H_PRED, + THR_D45_PRED, // D45_PRED, + THR_D135_PRED, // D135_PRED, + THR_D113_PRED, // D113_PRED, + THR_D157_PRED, // D157_PRED, + THR_D203_PRED, // D203_PRED, + THR_D67_PRED, // D67_PRED, + THR_SMOOTH, // SMOOTH_PRED, + THR_SMOOTH_V, // SMOOTH_V_PRED, + THR_SMOOTH_H, // SMOOTH_H_PRED, + THR_PAETH, // PAETH_PRED, +}; + +/* clang-format off */ +static const THR_MODES single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM] + [REF_FRAMES] = { + // NEARESTMV, + { THR_INVALID, THR_NEARESTMV, THR_NEARESTL2, THR_NEARESTL3, + THR_NEARESTG, THR_NEARESTB, THR_NEARESTA2, THR_NEARESTA, }, + // NEARMV, + { THR_INVALID, THR_NEARMV, THR_NEARL2, THR_NEARL3, + THR_NEARG, THR_NEARB, THR_NEARA2, THR_NEARA, }, + // GLOBALMV, + { THR_INVALID, THR_GLOBALMV, THR_GLOBALL2, THR_GLOBALL3, + THR_GLOBALG, THR_GLOBALB, THR_GLOBALA2, THR_GLOBALA, }, + // NEWMV, + { THR_INVALID, THR_NEWMV, THR_NEWL2, THR_NEWL3, + THR_NEWG, THR_NEWB, THR_NEWA2, THR_NEWA, }, +}; +/* clang-format on */ + +/* clang-format off */ +static const THR_MODES comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES] + [REF_FRAMES] = { + // NEAREST_NEARESTMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEAREST_NEARESTLL2, THR_COMP_NEAREST_NEARESTLL3, + THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTLB, + THR_COMP_NEAREST_NEARESTLA2, THR_COMP_NEAREST_NEARESTLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEARESTL2B, + THR_COMP_NEAREST_NEARESTL2A2, THR_COMP_NEAREST_NEARESTL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEARESTL3B, + THR_COMP_NEAREST_NEARESTL3A2, THR_COMP_NEAREST_NEARESTL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEARESTGB, + THR_COMP_NEAREST_NEARESTGA2, THR_COMP_NEAREST_NEARESTGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEARESTBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEAR_NEARMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEAR_NEARLL2, THR_COMP_NEAR_NEARLL3, + THR_COMP_NEAR_NEARLG, THR_COMP_NEAR_NEARLB, + THR_COMP_NEAR_NEARLA2, THR_COMP_NEAR_NEARLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEARL2B, + THR_COMP_NEAR_NEARL2A2, THR_COMP_NEAR_NEARL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEARL3B, + THR_COMP_NEAR_NEARL3A2, THR_COMP_NEAR_NEARL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEARGB, + THR_COMP_NEAR_NEARGA2, THR_COMP_NEAR_NEARGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEARBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { 
THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEAREST_NEWMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEAREST_NEWLL2, THR_COMP_NEAREST_NEWLL3, + THR_COMP_NEAREST_NEWLG, THR_COMP_NEAREST_NEWLB, + THR_COMP_NEAREST_NEWLA2, THR_COMP_NEAREST_NEWLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEWL2B, + THR_COMP_NEAREST_NEWL2A2, THR_COMP_NEAREST_NEWL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEWL3B, + THR_COMP_NEAREST_NEWL3A2, THR_COMP_NEAREST_NEWL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEWGB, + THR_COMP_NEAREST_NEWGA2, THR_COMP_NEAREST_NEWGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEWBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEW_NEARESTMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEW_NEARESTLL2, THR_COMP_NEW_NEARESTLL3, + THR_COMP_NEW_NEARESTLG, THR_COMP_NEW_NEARESTLB, + THR_COMP_NEW_NEARESTLA2, THR_COMP_NEW_NEARESTLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARESTL2B, + THR_COMP_NEW_NEARESTL2A2, THR_COMP_NEW_NEARESTL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARESTL3B, + THR_COMP_NEW_NEARESTL3A2, THR_COMP_NEW_NEARESTL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARESTGB, + THR_COMP_NEW_NEARESTGA2, THR_COMP_NEW_NEARESTGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARESTBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEAR_NEWMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEAR_NEWLL2, THR_COMP_NEAR_NEWLL3, + THR_COMP_NEAR_NEWLG, THR_COMP_NEAR_NEWLB, + THR_COMP_NEAR_NEWLA2, THR_COMP_NEAR_NEWLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEWL2B, + THR_COMP_NEAR_NEWL2A2, THR_COMP_NEAR_NEWL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEWL3B, + THR_COMP_NEAR_NEWL3A2, THR_COMP_NEAR_NEWL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEWGB, + THR_COMP_NEAR_NEWGA2, THR_COMP_NEAR_NEWGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEWBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEW_NEARMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, 
THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEW_NEARLL2, THR_COMP_NEW_NEARLL3, + THR_COMP_NEW_NEARLG, THR_COMP_NEW_NEARLB, + THR_COMP_NEW_NEARLA2, THR_COMP_NEW_NEARLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARL2B, + THR_COMP_NEW_NEARL2A2, THR_COMP_NEW_NEARL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARL3B, + THR_COMP_NEW_NEARL3A2, THR_COMP_NEW_NEARL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARGB, + THR_COMP_NEW_NEARGA2, THR_COMP_NEW_NEARGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // GLOBAL_GLOBALMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_GLOBAL_GLOBALLL2, THR_COMP_GLOBAL_GLOBALLL3, + THR_COMP_GLOBAL_GLOBALLG, THR_COMP_GLOBAL_GLOBALLB, + THR_COMP_GLOBAL_GLOBALLA2, THR_COMP_GLOBAL_GLOBALLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_GLOBAL_GLOBALL2B, + THR_COMP_GLOBAL_GLOBALL2A2, THR_COMP_GLOBAL_GLOBALL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_GLOBAL_GLOBALL3B, + THR_COMP_GLOBAL_GLOBALL3A2, THR_COMP_GLOBAL_GLOBALL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_GLOBAL_GLOBALGB, + THR_COMP_GLOBAL_GLOBALGA2, THR_COMP_GLOBAL_GLOBALGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_GLOBAL_GLOBALBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEW_NEWMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEW_NEWLL2, THR_COMP_NEW_NEWLL3, + THR_COMP_NEW_NEWLG, THR_COMP_NEW_NEWLB, + THR_COMP_NEW_NEWLA2, THR_COMP_NEW_NEWLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEWL2B, + THR_COMP_NEW_NEWL2A2, THR_COMP_NEW_NEWL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEWL3B, + THR_COMP_NEW_NEWL3A2, THR_COMP_NEW_NEWL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEWGB, + THR_COMP_NEW_NEWGA2, THR_COMP_NEW_NEWGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEWBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_ diff --git a/libs/libaom/src/av1/encoder/rdopt_utils.h b/libs/libaom/src/av1/encoder/rdopt_utils.h new file mode 100644 index 000000000..53b410a22 --- /dev/null +++ 
b/libs/libaom/src/av1/encoder/rdopt_utils.h
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_UTILS_H_
+#define AOM_AV1_ENCODER_RDOPT_UTILS_H_
+
+#include "aom/aom_integer.h"
+#include "av1/encoder/block.h"
+#include "av1/common/cfl.h"
+#include "av1/common/pred_common.h"
+#include "av1/encoder/rdopt_data_defs.h"
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+#define MAX_REF_MV_SEARCH 3
+#define INTER_INTRA_RD_THRESH_SCALE 9
+#define INTER_INTRA_RD_THRESH_SHIFT 4
+
+typedef struct {
+  PREDICTION_MODE mode;
+  MV_REFERENCE_FRAME ref_frame[2];
+} MODE_DEFINITION;
+
+// This array defines the mapping from the enums in THR_MODES to the actual
+// prediction modes and reference frames
+static const MODE_DEFINITION av1_mode_defs[MAX_MODES] = {
+  { NEARESTMV, { LAST_FRAME, NONE_FRAME } },
+  { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
+  { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
+  { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
+  { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  { NEWMV, { LAST_FRAME, NONE_FRAME } },
+  { NEWMV, { LAST2_FRAME, NONE_FRAME } },
+  { NEWMV, { LAST3_FRAME, NONE_FRAME } },
+  { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEWMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
+  { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  { NEARMV, { LAST_FRAME, NONE_FRAME } },
+  { NEARMV, { LAST2_FRAME, NONE_FRAME } },
+  { NEARMV, { LAST3_FRAME, NONE_FRAME } },
+  { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEARMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
+  { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  { GLOBALMV, { LAST_FRAME, NONE_FRAME } },
+  { GLOBALMV, { LAST2_FRAME, NONE_FRAME } },
+  { GLOBALMV, { LAST3_FRAME, NONE_FRAME } },
+  { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } },
+  { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } },
+  { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  // TODO(zoeliu): May need to reconsider the order of the modes to check
+
+  { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
+  { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
+  { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+  { NEAR_NEARMV, { LAST_FRAME, 
ALTREF_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } }, + + { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } }, + + { NEAR_NEARMV, { 
LAST3_FRAME, ALTREF2_FRAME } }, + { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } }, + + { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } }, + + { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } }, + + // intra modes + { DC_PRED, { INTRA_FRAME, NONE_FRAME } }, + { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D203_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D157_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D67_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D113_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, +}; + +static AOM_INLINE void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst, + const int num_planes) { + for (int i = 0; i < num_planes; i++) { + xd->plane[i].dst.buf = dst.plane[i]; + xd->plane[i].dst.stride = dst.stride[i]; + } +} + +/* clang-format on */ +// Calculate rd threshold based on ref best rd and relevant scaling factors +static AOM_INLINE int64_t get_rd_thresh_from_best_rd(int64_t ref_best_rd, + int mul_factor, + int div_factor) { + int64_t rd_thresh = ref_best_rd; + if (div_factor != 0) { + rd_thresh = ref_best_rd < (div_factor * (INT64_MAX / mul_factor)) + ? 
((ref_best_rd / div_factor) * mul_factor) + : INT64_MAX; + } + return rd_thresh; +} + +static AOM_INLINE THR_MODES +get_prediction_mode_idx(PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame, + MV_REFERENCE_FRAME second_ref_frame) { + if (this_mode < INTRA_MODE_END) { + assert(ref_frame == INTRA_FRAME); + assert(second_ref_frame == NONE_FRAME); + return intra_to_mode_idx[this_mode - INTRA_MODE_START]; + } + if (this_mode >= SINGLE_INTER_MODE_START && + this_mode < SINGLE_INTER_MODE_END) { + assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); + return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START] + [ref_frame]; + } + if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END) { + assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); + assert((second_ref_frame > INTRA_FRAME) && + (second_ref_frame <= ALTREF_FRAME)); + return comp_inter_to_mode_idx[this_mode - COMP_INTER_MODE_START][ref_frame] + [second_ref_frame]; + } + assert(0); + return THR_INVALID; +} + +static AOM_INLINE int inter_mode_data_block_idx(BLOCK_SIZE bsize) { + if (bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || + bsize == BLOCK_4X16 || bsize == BLOCK_16X4) { + return -1; + } + return 1; +} + +// Get transform block visible dimensions cropped to the MI units. +static AOM_INLINE void get_txb_dimensions(const MACROBLOCKD *xd, int plane, + BLOCK_SIZE plane_bsize, int blk_row, + int blk_col, BLOCK_SIZE tx_bsize, + int *width, int *height, + int *visible_width, + int *visible_height) { + assert(tx_bsize <= plane_bsize); + const int txb_height = block_size_high[tx_bsize]; + const int txb_width = block_size_wide[tx_bsize]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + + // TODO(aconverse@google.com): Investigate using crop_width/height here rather + // than the MI size + if (xd->mb_to_bottom_edge >= 0) { + *visible_height = txb_height; + } else { + const int block_height = block_size_high[plane_bsize]; + const int block_rows = + (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + block_height; + *visible_height = + clamp(block_rows - (blk_row << MI_SIZE_LOG2), 0, txb_height); + } + if (height) *height = txb_height; + + if (xd->mb_to_right_edge >= 0) { + *visible_width = txb_width; + } else { + const int block_width = block_size_wide[plane_bsize]; + const int block_cols = + (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + block_width; + *visible_width = + clamp(block_cols - (blk_col << MI_SIZE_LOG2), 0, txb_width); + } + if (width) *width = txb_width; +} + +static AOM_INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) { + int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * MI_SIZE_LOG2); + return num_blk; +} + +static INLINE int check_txfm_eval(MACROBLOCK *const x, BLOCK_SIZE bsize, + int64_t best_skip_rd, int64_t skip_rd, + int level, int is_luma_only) { + int eval_txfm = 1; + // Derive aggressiveness factor for gating the transform search + // Lower value indicates more aggressiveness. 
Be more conservative (high
+  // value) for (i) low quantizers and (ii) regions where prediction is poor.
+  const int scale[5] = { INT_MAX, 4, 3, 3, 2 };
+  const int qslope = 2 * (!is_luma_only);
+  int aggr_factor = 1;
+  if (!is_luma_only) {
+    aggr_factor = AOMMAX(
+        1, ((MAXQ - x->qindex) * qslope + QINDEX_RANGE / 2) >> QINDEX_BITS);
+  }
+  if (best_skip_rd >
+      (x->source_variance << (num_pels_log2_lookup[bsize] + RDDIV_BITS)))
+    aggr_factor *= scale[level];
+  // For level setting 1, be more conservative for luma only case even when
+  // prediction is good
+  else if ((level <= 1) && !is_luma_only)
+    aggr_factor *= 2;
+
+  // Be more conservative for luma-only cases (called from compound type rd):
+  // best_skip_rd is computed after the interpolation filter search, while
+  // skip_rd is computed before it (with 8-bit prediction signals blended for
+  // WEDGE/DIFFWTD rather than 16-bit).
+  const int luma_mul[5] = { INT_MAX, 32, 29, 20, 17 };
+  int mul_factor = is_luma_only ? luma_mul[level] : 16;
+  int64_t rd_thresh =
+      (best_skip_rd == INT64_MAX)
+          ? best_skip_rd
+          : (int64_t)(best_skip_rd * aggr_factor * mul_factor >> 4);
+  if (skip_rd > rd_thresh) eval_txfm = 0;
+  return eval_txfm;
+}
+
+static TX_MODE select_tx_mode(
+    const AV1_COMMON *cm, const TX_SIZE_SEARCH_METHOD tx_size_search_method) {
+  if (cm->features.coded_lossless) return ONLY_4X4;
+  if (tx_size_search_method == USE_LARGESTALL) {
+    return TX_MODE_LARGEST;
+  } else {
+    assert(tx_size_search_method == USE_FULL_RD ||
+           tx_size_search_method == USE_FAST_RD);
+    return TX_MODE_SELECT;
+  }
+}
+// Checks the conditions to enable winner mode processing
+static INLINE int is_winner_mode_processing_enabled(
+    const struct AV1_COMP *cpi, MB_MODE_INFO *const mbmi,
+    const PREDICTION_MODE best_mode) {
+  const SPEED_FEATURES *sf = &cpi->sf;
+
+  // TODO(any): Move block independent condition checks to frame level
+  if (is_inter_block(mbmi)) {
+    if (is_inter_mode(best_mode) &&
+        sf->tx_sf.tx_type_search.fast_inter_tx_type_search &&
+        !cpi->oxcf.use_inter_dct_only)
+      return 1;
+  } else {
+    if (sf->tx_sf.tx_type_search.fast_intra_tx_type_search &&
+        !cpi->oxcf.use_intra_default_tx_only && !cpi->oxcf.use_intra_dct_only)
+      return 1;
+  }
+
+  // Check speed feature related to winner mode processing
+  if (sf->winner_mode_sf.enable_winner_mode_for_coeff_opt &&
+      cpi->optimize_seg_arr[mbmi->segment_id] != NO_TRELLIS_OPT &&
+      cpi->optimize_seg_arr[mbmi->segment_id] != FINAL_PASS_TRELLIS_OPT)
+    return 1;
+  if (sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch) return 1;
+
+  return 0;
+}
+
+static INLINE void set_tx_size_search_method(
+    const AV1_COMMON *cm, const WinnerModeParams *winner_mode_params,
+    MACROBLOCK *x, int enable_winner_mode_for_tx_size_srch,
+    int is_winner_mode) {
+  // Populate transform size search method/transform mode appropriately
+  x->tx_size_search_method =
+      winner_mode_params->tx_size_search_methods[DEFAULT_EVAL];
+  if (enable_winner_mode_for_tx_size_srch) {
+    if (is_winner_mode)
+      x->tx_size_search_method =
+          winner_mode_params->tx_size_search_methods[WINNER_MODE_EVAL];
+    else
+      x->tx_size_search_method =
+          winner_mode_params->tx_size_search_methods[MODE_EVAL];
+  }
+  x->tx_mode_search_type = select_tx_mode(cm, x->tx_size_search_method);
+}
+
+static INLINE void set_tx_type_prune(const SPEED_FEATURES *sf, MACROBLOCK *x,
+                                     int enable_winner_mode_tx_type_pruning,
+                                     int is_winner_mode) {
+  // Populate prune transform mode appropriately
+  x->prune_mode = sf->tx_sf.tx_type_search.prune_mode;
+  if (enable_winner_mode_tx_type_pruning) {
+    if 
(is_winner_mode) + x->prune_mode = NO_PRUNE; + else + x->prune_mode = PRUNE_2D_AGGRESSIVE; + } +} + +static INLINE void set_tx_domain_dist_params( + const WinnerModeParams *winner_mode_params, MACROBLOCK *x, + int enable_winner_mode_for_tx_domain_dist, int is_winner_mode) { + if (!enable_winner_mode_for_tx_domain_dist) { + x->use_transform_domain_distortion = + winner_mode_params->use_transform_domain_distortion[DEFAULT_EVAL]; + x->tx_domain_dist_threshold = + winner_mode_params->tx_domain_dist_threshold[DEFAULT_EVAL]; + return; + } + + if (is_winner_mode) { + x->use_transform_domain_distortion = + winner_mode_params->use_transform_domain_distortion[WINNER_MODE_EVAL]; + x->tx_domain_dist_threshold = + winner_mode_params->tx_domain_dist_threshold[WINNER_MODE_EVAL]; + } else { + x->use_transform_domain_distortion = + winner_mode_params->use_transform_domain_distortion[MODE_EVAL]; + x->tx_domain_dist_threshold = + winner_mode_params->tx_domain_dist_threshold[MODE_EVAL]; + } +} + +// This function sets mode parameters for different mode evaluation stages +static INLINE void set_mode_eval_params(const struct AV1_COMP *cpi, + MACROBLOCK *x, + MODE_EVAL_TYPE mode_eval_type) { + const AV1_COMMON *cm = &cpi->common; + const SPEED_FEATURES *sf = &cpi->sf; + const WinnerModeParams *winner_mode_params = &cpi->winner_mode_params; + + switch (mode_eval_type) { + case DEFAULT_EVAL: + x->use_default_inter_tx_type = 0; + x->use_default_intra_tx_type = 0; + x->predict_skip_level = + winner_mode_params->predict_skip_level[DEFAULT_EVAL]; + // Set default transform domain distortion type + set_tx_domain_dist_params(winner_mode_params, x, 0, 0); + + // Get default threshold for R-D optimization of coefficients + x->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh( + winner_mode_params->coeff_opt_dist_threshold, 0, 0); + // Set default transform size search method + set_tx_size_search_method(cm, winner_mode_params, x, 0, 0); + // Set default transform type prune + set_tx_type_prune(sf, x, 0, 0); + break; + case MODE_EVAL: + x->use_default_intra_tx_type = + (cpi->sf.tx_sf.tx_type_search.fast_intra_tx_type_search || + cpi->oxcf.use_intra_default_tx_only); + x->use_default_inter_tx_type = + cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_search; + x->predict_skip_level = winner_mode_params->predict_skip_level[MODE_EVAL]; + + // Set transform domain distortion type for mode evaluation + set_tx_domain_dist_params( + winner_mode_params, x, + sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 0); + + // Get threshold for R-D optimization of coefficients during mode + // evaluation + x->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh( + winner_mode_params->coeff_opt_dist_threshold, + sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 0); + // Set the transform size search method for mode evaluation + set_tx_size_search_method( + cm, winner_mode_params, x, + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 0); + // Set transform type prune for mode evaluation + set_tx_type_prune( + sf, x, sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning, + 0); + break; + case WINNER_MODE_EVAL: + x->use_default_inter_tx_type = 0; + x->use_default_intra_tx_type = 0; + x->predict_skip_level = + winner_mode_params->predict_skip_level[WINNER_MODE_EVAL]; + + // Set transform domain distortion type for winner mode evaluation + set_tx_domain_dist_params( + winner_mode_params, x, + sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 1); + + // Get threshold for R-D optimization of coefficients for 
winner mode
+      // evaluation
+      x->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
+          winner_mode_params->coeff_opt_dist_threshold,
+          sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 1);
+      // Set the transform size search method for winner mode evaluation
+      set_tx_size_search_method(
+          cm, winner_mode_params, x,
+          sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
+      // Set default transform type prune mode for winner mode evaluation
+      set_tx_type_prune(
+          sf, x, sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning,
+          1);
+
+      // Reset hash state for winner mode processing. Winner mode and
+      // subsequent transform/mode evaluations (palette/IntraBC) can't reuse
+      // old data as the decisions would have been sub-optimal
+      // TODO(any): Move the evaluation of palette/IntraBC modes before winner
+      // mode is processed and clean up the code below
+      reset_hash_records(x, cpi->sf.tx_sf.use_inter_txb_hash);
+
+      break;
+    default: assert(0);
+  }
+}
+
+// Similar to store_cfl_required(), but for use during the RDO process,
+// where we haven't yet determined whether this block uses CfL.
+static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm,
+                                                      const MACROBLOCK *x) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+
+  if (cm->seq_params.monochrome || x->skip_chroma_rd) return CFL_DISALLOWED;
+
+  if (!xd->is_chroma_ref) {
+    // For non-chroma-reference blocks, we should always store the luma pixels,
+    // in case the corresponding chroma-reference block uses CfL.
+    // Note that this can only happen for block sizes which are <8 on
+    // their shortest side, as otherwise they would be chroma reference
+    // blocks.
+    return CFL_ALLOWED;
+  }
+
+  // For chroma reference blocks, we should store data in the encoder iff we're
+  // allowed to try out CfL.
+  return is_cfl_allowed(xd);
+}
+
+static AOM_INLINE void init_sbuv_mode(MB_MODE_INFO *const mbmi) {
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+}
+
+// Store best mode stats for winner mode processing
+static INLINE void store_winner_mode_stats(
+    const AV1_COMMON *const cm, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    RD_STATS *rd_cost, RD_STATS *rd_cost_y, RD_STATS *rd_cost_uv,
+    THR_MODES mode_index, uint8_t *color_map, BLOCK_SIZE bsize, int64_t this_rd,
+    int enable_multiwinner_mode_process, int txfm_search_done) {
+  WinnerModeStats *winner_mode_stats = x->winner_mode_stats;
+  int mode_idx = 0;
+  int is_palette_mode = mbmi->palette_mode_info.palette_size[PLANE_TYPE_Y] > 0;
+  // Mode stat is not required when multiwinner mode processing is disabled
+  if (!enable_multiwinner_mode_process) return;
+  // Ignore mode with maximum rd
+  if (this_rd == INT64_MAX) return;
+  // TODO(any): Winner mode processing is currently not applicable for palette
+  // mode in Inter frames. Clean up the following code once support is added
+  if (!frame_is_intra_only(cm) && is_palette_mode) return;
+
+  const int max_winner_mode_count = frame_is_intra_only(cm)
+                                        ? 
MAX_WINNER_MODE_COUNT_INTRA + : MAX_WINNER_MODE_COUNT_INTER; + assert(x->winner_mode_count >= 0 && + x->winner_mode_count <= max_winner_mode_count); + + if (x->winner_mode_count) { + // Find the mode which has higher rd cost than this_rd + for (mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++) + if (winner_mode_stats[mode_idx].rd > this_rd) break; + + if (mode_idx == max_winner_mode_count) { + // No mode has higher rd cost than this_rd + return; + } else if (mode_idx < max_winner_mode_count - 1) { + // Create a slot for current mode and move others to the next slot + memmove( + &winner_mode_stats[mode_idx + 1], &winner_mode_stats[mode_idx], + (max_winner_mode_count - mode_idx - 1) * sizeof(*winner_mode_stats)); + } + } + // Add a mode stat for winner mode processing + winner_mode_stats[mode_idx].mbmi = *mbmi; + winner_mode_stats[mode_idx].rd = this_rd; + winner_mode_stats[mode_idx].mode_index = mode_index; + + // Update rd stats required for inter frame + if (!frame_is_intra_only(cm) && rd_cost && rd_cost_y && rd_cost_uv) { + const MACROBLOCKD *xd = &x->e_mbd; + const int skip_ctx = av1_get_skip_context(xd); + const int is_intra_mode = av1_mode_defs[mode_index].mode < INTRA_MODE_END; + const int skip = mbmi->skip && !is_intra_mode; + + winner_mode_stats[mode_idx].rd_cost = *rd_cost; + if (txfm_search_done) { + winner_mode_stats[mode_idx].rate_y = + rd_cost_y->rate + x->skip_cost[skip_ctx][rd_cost->skip || skip]; + winner_mode_stats[mode_idx].rate_uv = rd_cost_uv->rate; + } + } + + if (color_map) { + // Store color_index_map for palette mode + const MACROBLOCKD *const xd = &x->e_mbd; + int block_width, block_height; + av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width, + &block_height, NULL, NULL); + memcpy(winner_mode_stats[mode_idx].color_index_map, color_map, + block_width * block_height * sizeof(color_map[0])); + } + + x->winner_mode_count = + AOMMIN(x->winner_mode_count + 1, max_winner_mode_count); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RDOPT_UTILS_H_ diff --git a/libs/libaom/src/av1/encoder/reconinter_enc.c b/libs/libaom/src/av1/encoder/reconinter_enc.c new file mode 100644 index 000000000..231b02091 --- /dev/null +++ b/libs/libaom/src/av1/encoder/reconinter_enc.c @@ -0,0 +1,407 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/reconinter_enc.h"
+
+static void enc_calc_subpel_params(const MV *const src_mv,
+                                   InterPredParams *const inter_pred_params,
+                                   MACROBLOCKD *xd, int mi_x, int mi_y, int ref,
+                                   uint8_t **pre, SubpelParams *subpel_params,
+                                   int *src_stride) {
+  // These are part of the function signature to use this function through a
+  // function pointer. See typedef of 'CalcSubpelParamsFunc'.
+  (void)xd;
+  (void)mi_x;
+  (void)mi_y;
+  (void)ref;
+
+  const struct scale_factors *sf = inter_pred_params->scale_factors;
+
+  struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf;
+  int ssx = inter_pred_params->subsampling_x;
+  int ssy = inter_pred_params->subsampling_y;
+  int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS;
+  orig_pos_y += src_mv->row * (1 << (1 - ssy));
+  int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
+  orig_pos_x += src_mv->col * (1 << (1 - ssx));
+  int pos_y = sf->scale_value_y(orig_pos_y, sf);
+  int pos_x = sf->scale_value_x(orig_pos_x, sf);
+  pos_x += SCALE_EXTRA_OFF;
+  pos_y += SCALE_EXTRA_OFF;
+
+  const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+  const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+  const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+  const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+  pos_y = clamp(pos_y, top, bottom);
+  pos_x = clamp(pos_x, left, right);
+
+  subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
+  subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
+  subpel_params->xs = sf->x_step_q4;
+  subpel_params->ys = sf->y_step_q4;
+  *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+         (pos_x >> SCALE_SUBPEL_BITS);
+  *src_stride = pre_buf->stride;
+}
+
+void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride,
+                                       const MV *src_mv,
+                                       InterPredParams *inter_pred_params) {
+  av1_build_one_inter_predictor(dst, dst_stride, src_mv, inter_pred_params,
+                                NULL /* xd */, 0 /* mi_x */, 0 /* mi_y */,
+                                0 /* ref */, enc_calc_subpel_params);
+}
+
+static void enc_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                       int plane, const MB_MODE_INFO *mi,
+                                       int bw, int bh, int mi_x, int mi_y) {
+  av1_build_inter_predictors(cm, xd, plane, mi, 0 /* build_for_obmc */, bw, bh,
+                             mi_x, mi_y, enc_calc_subpel_params);
+}
+
+void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col) {
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+  InterPredParams inter_pred_params;
+
+  struct buf_2d *const dst_buf = &pd->dst;
+  uint8_t *const dst = dst_buf->buf;
+  const MV mv = xd->mi[0]->mv[0].as_mv;
+  const struct scale_factors *const sf = xd->block_ref_scale_factors[0];
+
+  av1_init_inter_params(&inter_pred_params, pd->width, pd->height, mi_y, mi_x,
+                        pd->subsampling_x, pd->subsampling_y, xd->bd,
+                        is_cur_buf_hbd(xd), false, sf, pd->pre,
+                        xd->mi[0]->interp_filters);
+
+  inter_pred_params.conv_params = get_conv_params_no_round(
+      0, AOM_PLANE_Y, xd->tmp_conv_dst, MAX_SB_SIZE, false, xd->bd);
+
+  inter_pred_params.conv_params.use_dist_wtd_comp_avg = 
0; + av1_enc_build_one_inter_predictor(dst, dst_buf->stride, &mv, + &inter_pred_params); +} + +void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + const BUFFER_SET *ctx, BLOCK_SIZE bsize, + int plane_from, int plane_to) { + for (int plane = plane_from; plane <= plane_to; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + enc_build_inter_predictors(cm, xd, plane, xd->mi[0], xd->plane[plane].width, + xd->plane[plane].height, mi_x, mi_y); + + if (is_interintra_pred(xd->mi[0])) { + BUFFER_SET default_ctx = { + { xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf }, + { xd->plane[0].dst.stride, xd->plane[1].dst.stride, + xd->plane[2].dst.stride } + }; + if (!ctx) { + ctx = &default_ctx; + } + av1_build_interintra_predictor(cm, xd, xd->plane[plane].dst.buf, + xd->plane[plane].dst.stride, ctx, plane, + bsize); + } + } +} + +static INLINE void build_obmc_prediction(MACROBLOCKD *xd, int rel_mi_row, + int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *above_mbmi, + void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + av1_setup_address_for_obmc(xd, rel_mi_row, rel_mi_col, above_mbmi, ctxt, + num_planes); + + const int mi_x = (xd->mi_col + rel_mi_col) << MI_SIZE_LOG2; + const int mi_y = (xd->mi_row + rel_mi_row) << MI_SIZE_LOG2; + + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + + InterPredParams inter_pred_params; + + for (int j = 0; j < num_planes; ++j) { + const struct macroblockd_plane *pd = &xd->plane[j]; + int bw = 0, bh = 0; + + if (dir) { + // prepare left reference block size + bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4, + block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1)); + bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y; + } else { + // prepare above reference block size + bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x; + bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4, + block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1)); + } + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, dir)) continue; + + const struct buf_2d *const pre_buf = &pd->pre[0]; + const MV mv = above_mbmi->mv[0].as_mv; + + av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, + pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0, + xd->block_ref_scale_factors[0], pre_buf, + above_mbmi->interp_filters); + inter_pred_params.conv_params = get_conv_params(0, j, xd->bd); + + av1_enc_build_one_inter_predictor(pd->dst.buf, pd->dst.stride, &mv, + &inter_pred_params); + } +} + +void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]) { + if (!xd->up_available) return; + struct build_prediction_ctxt ctxt = { cm, tmp_buf, + tmp_width, tmp_height, + tmp_stride, xd->mb_to_right_edge }; + BLOCK_SIZE bsize = xd->mi[0]->sb_type; + foreach_overlappable_nb_above(cm, xd, + max_neighbor_obmc[mi_size_wide_log2[bsize]], + build_obmc_prediction, &ctxt); +} + +void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]) { + if (!xd->left_available) return; + struct build_prediction_ctxt ctxt = { cm, tmp_buf, + 
tmp_width, tmp_height, + tmp_stride, xd->mb_to_bottom_edge }; + BLOCK_SIZE bsize = xd->mi[0]->sb_type; + foreach_overlappable_nb_left(cm, xd, + max_neighbor_obmc[mi_size_high_log2[bsize]], + build_obmc_prediction, &ctxt); +} + +void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd) { + const int num_planes = av1_num_planes(cm); + uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; + int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + + if (is_cur_buf_hbd(xd)) { + int len = sizeof(uint16_t); + dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]); + dst_buf1[1] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len); + dst_buf1[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len); + dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]); + dst_buf2[1] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len); + dst_buf2[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len); + } else { + dst_buf1[0] = xd->tmp_obmc_bufs[0]; + dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE; + dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2; + dst_buf2[0] = xd->tmp_obmc_bufs[1]; + dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE; + dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2; + } + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_build_prediction_by_above_preds(cm, xd, dst_buf1, dst_width1, dst_height1, + dst_stride1); + av1_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2, + dst_stride2); + av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, &cm->cur_frame->buf, + mi_row, mi_col, 0, num_planes); + av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2, + dst_stride2); +} + +void av1_build_inter_predictors_for_planes_single_buf( + MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref, + uint8_t *ext_dst[3], int ext_dst_stride[3]) { + assert(bsize < BLOCK_SIZES_ALL); + const MB_MODE_INFO *mi = xd->mi[0]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + WarpTypesAllowed warp_types; + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; + warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype); + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + + for (int plane = plane_from; plane <= plane_to; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + InterPredParams inter_pred_params; + + av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, + pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0, + xd->block_ref_scale_factors[ref], &pd->pre[ref], + mi->interp_filters); + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, 
mi); + + uint8_t *const dst = get_buf_by_bd(xd, ext_dst[plane]); + const MV mv = mi->mv[ref].as_mv; + + av1_enc_build_one_inter_predictor(dst, ext_dst_stride[plane], &mv, + &inter_pred_params); + } +} + +static void build_masked_compound( + uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, + int w) { + // Derive subsampling from h and w passed in. May be refactored to + // pass in subsampling factors directly. + const int subh = (2 << mi_size_high_log2[sb_type]) == h; + const int subw = (2 << mi_size_wide_log2[sb_type]) == w; + const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); + aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, block_size_wide[sb_type], w, h, subw, subh); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void build_masked_compound_highbd( + uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride, + const uint8_t *src1_8, int src1_stride, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, + int w, int bd) { + // Derive subsampling from h and w passed in. May be refactored to + // pass in subsampling factors directly. + const int subh = (2 << mi_size_high_log2[sb_type]) == h; + const int subw = (2 << mi_size_wide_log2[sb_type]) == w; + const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); + // const uint8_t *mask = + // av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type); + aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, block_size_wide[sb_type], w, h, + subw, subh, bd); +} +#endif + +static void build_wedge_inter_predictor_from_buf( + MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0, + int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_compound = has_second_ref(mbmi); + MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; + mbmi->interinter_comp.seg_mask = xd->seg_mask; + const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp; + const int is_hbd = is_cur_buf_hbd(xd); + + if (is_compound && is_masked_compound_type(comp_data->type)) { + if (!plane && comp_data->type == COMPOUND_DIFFWTD) { + if (is_hbd) { + av1_build_compound_diffwtd_mask_highbd( + comp_data->seg_mask, comp_data->mask_type, + CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, + CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd); + } else { + av1_build_compound_diffwtd_mask( + comp_data->seg_mask, comp_data->mask_type, ext_dst0, + ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w); + } + } +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + build_masked_compound_highbd( + dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, + CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data, + mbmi->sb_type, h, w, xd->bd); + } else { + build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0, + ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type, + h, w); + } +#else + build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0, + ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type, + h, w); +#endif + } else { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, + dst, dst_buf->stride, NULL, 0, NULL, 0, w, h, + xd->bd); + } 
else { + aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL, + 0, NULL, 0, w, h); + } +#else + aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL, 0, + NULL, 0, w, h); +#endif + } +} + +void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane_from, int plane_to, + uint8_t *ext_dst0[3], + int ext_dst_stride0[3], + uint8_t *ext_dst1[3], + int ext_dst_stride1[3]) { + int plane; + assert(bsize < BLOCK_SIZES_ALL); + for (plane = plane_from; plane <= plane_to; ++plane) { + const BLOCK_SIZE plane_bsize = get_plane_block_size( + bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + build_wedge_inter_predictor_from_buf( + xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane], + ext_dst1[plane], ext_dst_stride1[plane]); + } +} diff --git a/libs/libaom/src/av1/encoder/reconinter_enc.h b/libs/libaom/src/av1/encoder/reconinter_enc.h new file mode 100644 index 000000000..fdc1f31c8 --- /dev/null +++ b/libs/libaom/src/av1/encoder/reconinter_enc.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RECONINTER_ENC_H_ +#define AOM_AV1_ENCODER_RECONINTER_ENC_H_ + +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/reconinter.h" +#include "av1/common/warped_motion.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Build single or compound reference inter predictors for all planes. +// Can build inter-intra predictors, masked predictors etc as well. +void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + const BUFFER_SET *ctx, BLOCK_SIZE bsize, + int plane_from, int plane_to); + +void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col); + +// Build one inter predictor. It is called for building predictor for single +// reference case, or just the 1st or 2nd reference in compound reference case. +// Can build both regular and masked predictors. 
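+// A minimal usage sketch (illustrative only; the local names `params`,
+// `pre_buf`, `sf`, etc. are hypothetical, with the arguments assumed to be
+// set up the same way the callers in reconinter_enc.c do it):
+//   InterPredParams params;
+//   av1_init_inter_params(&params, bw, bh, pos_row, pos_col, ssx, ssy,
+//                         bit_depth, use_hbd, 0, sf, pre_buf, interp_filters);
+//   params.conv_params = get_conv_params(0, plane, bit_depth);
+//   av1_enc_build_one_inter_predictor(dst, dst_stride, &mv, &params);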
+void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride, + const MV *src_mv, + InterPredParams *inter_pred_params); + +void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]); + +void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]); + +void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd); + +void av1_build_inter_predictors_for_planes_single_buf( + MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref, + uint8_t *ext_dst[3], int ext_dst_stride[3]); + +void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane_from, int plane_to, + uint8_t *ext_dst0[3], + int ext_dst_stride0[3], + uint8_t *ext_dst1[3], + int ext_dst_stride1[3]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RECONINTER_ENC_H_ diff --git a/libs/libaom/src/av1/encoder/segmentation.c b/libs/libaom/src/av1/encoder/segmentation.c new file mode 100644 index 000000000..0c029c0e6 --- /dev/null +++ b/libs/libaom/src/av1/encoder/segmentation.c @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <limits.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/pred_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/segmentation.h"
+
+void av1_enable_segmentation(struct segmentation *seg) {
+  seg->enabled = 1;
+  seg->update_map = 1;
+  seg->update_data = 1;
+  seg->temporal_update = 0;
+}
+
+void av1_disable_segmentation(struct segmentation *seg) {
+  seg->enabled = 0;
+  seg->update_map = 0;
+  seg->update_data = 0;
+  seg->temporal_update = 0;
+}
+
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+                            SEG_LVL_FEATURES feature_id) {
+  seg->feature_mask[segment_id] &= ~(1 << feature_id);
+}
+
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+                       SEG_LVL_FEATURES feature_id) {
+  seg->feature_data[segment_id][feature_id] = 0;
+}
+
+static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                       const TileInfo *tile, MB_MODE_INFO **mi,
+                       unsigned *no_pred_segcounts,
+                       unsigned (*temporal_predictor_count)[2],
+                       unsigned *t_unpred_seg_counts, int bw, int bh,
+                       int mi_row, int mi_col) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+  xd->mi = mi;
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
+                 mi_params->mi_cols);
+
+  // Count the number of hits on each segment with no prediction
+  const int segment_id = xd->mi[0]->segment_id;
+  no_pred_segcounts[segment_id]++;
+
+  // Temporal prediction not allowed on key frames
+  if (cm->current_frame.frame_type != KEY_FRAME) {
+    const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+    // Test to see if the segment id matches the predicted value.
+    const int pred_segment_id =
+        cm->last_frame_seg_map
+            ? get_segment_id(mi_params, cm->last_frame_seg_map, bsize, mi_row,
+                             mi_col)
+            : 0;
+    const int pred_flag = pred_segment_id == segment_id;
+    const int pred_context = av1_get_pred_context_seg_id(xd);
+
+    // Store the prediction status for this mb and update counts
+    // as appropriate
+    xd->mi[0]->seg_id_predicted = pred_flag;
+    temporal_predictor_count[pred_context][pred_flag]++;
+
+    // Update the "unpredicted" segment count
+    if (!pred_flag) t_unpred_seg_counts[segment_id]++;
+  }
+}
+
+static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                          const TileInfo *tile, MB_MODE_INFO **mi,
+                          unsigned *no_pred_segcounts,
+                          unsigned (*temporal_predictor_count)[2],
+                          unsigned *t_unpred_seg_counts, int mi_row, int mi_col,
+                          BLOCK_SIZE bsize) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int mis = mi_params->mi_stride;
+  const int bs = mi_size_wide[bsize], hbs = bs / 2;
+  PARTITION_TYPE partition;
+  const int qbs = bs / 4;
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+#define CSEGS(cs_bw, cs_bh, cs_rowoff, cs_coloff)                              \
+  count_segs(cm, xd, tile, mi + mis * (cs_rowoff) + (cs_coloff),               \
+             no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, \
+             (cs_bw), (cs_bh), mi_row + (cs_rowoff), mi_col + (cs_coloff));
+
+  if (bsize == BLOCK_8X8)
+    partition = PARTITION_NONE;
+  else
+    partition = get_partition(cm, mi_row, mi_col, bsize);
+  switch (partition) {
+    case PARTITION_NONE: CSEGS(bs, bs, 0, 0); break;
+    case PARTITION_HORZ:
+      CSEGS(bs, hbs, 0, 0);
+      CSEGS(bs, hbs, hbs, 0);
+      break;
+    case PARTITION_VERT:
+      CSEGS(hbs, bs, 0, 0);
+      CSEGS(hbs, bs, 0, hbs);
+      break;
+    case PARTITION_HORZ_A:
+      CSEGS(hbs, hbs, 0, 0);
+      CSEGS(hbs, hbs, 0, hbs);
+      CSEGS(bs, hbs, hbs, 0);
+      break;
+    case 
PARTITION_HORZ_B: + CSEGS(bs, hbs, 0, 0); + CSEGS(hbs, hbs, hbs, 0); + CSEGS(hbs, hbs, hbs, hbs); + break; + case PARTITION_VERT_A: + CSEGS(hbs, hbs, 0, 0); + CSEGS(hbs, hbs, hbs, 0); + CSEGS(hbs, bs, 0, hbs); + break; + case PARTITION_VERT_B: + CSEGS(hbs, bs, 0, 0); + CSEGS(hbs, hbs, 0, hbs); + CSEGS(hbs, hbs, hbs, hbs); + break; + case PARTITION_HORZ_4: + CSEGS(bs, qbs, 0, 0); + CSEGS(bs, qbs, qbs, 0); + CSEGS(bs, qbs, 2 * qbs, 0); + if (mi_row + 3 * qbs < mi_params->mi_rows) CSEGS(bs, qbs, 3 * qbs, 0); + break; + + case PARTITION_VERT_4: + CSEGS(qbs, bs, 0, 0); + CSEGS(qbs, bs, 0, qbs); + CSEGS(qbs, bs, 0, 2 * qbs); + if (mi_col + 3 * qbs < mi_params->mi_cols) CSEGS(qbs, bs, 0, 3 * qbs); + break; + + case PARTITION_SPLIT: { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + int n; + assert(subsize < BLOCK_SIZES_ALL); + + for (n = 0; n < 4; n++) { + const int mi_dc = hbs * (n & 1); + const int mi_dr = hbs * (n >> 1); + + count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, + mi_row + mi_dr, mi_col + mi_dc, subsize); + } + } break; + default: assert(0); + } + +#undef CSEGS +} + +void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) { + struct segmentation *seg = &cm->seg; + struct segmentation_probs *segp = &cm->fc->seg; + int no_pred_cost; + int t_pred_cost = INT_MAX; + int tile_col, tile_row, mi_row, mi_col; + unsigned temporal_predictor_count[SEG_TEMPORAL_PRED_CTXS][2] = { { 0 } }; + unsigned no_pred_segcounts[MAX_SEGMENTS] = { 0 }; + unsigned t_unpred_seg_counts[MAX_SEGMENTS] = { 0 }; + (void)xd; + int scale_up = cm->prev_frame && (cm->width > cm->prev_frame->width || + cm->height > cm->prev_frame->height); + // First of all generate stats regarding how well the last segment map + // predicts this one + if (!scale_up) { + for (tile_row = 0; tile_row < cm->tiles.rows; tile_row++) { + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, tile_row); + for (tile_col = 0; tile_col < cm->tiles.cols; tile_col++) { + MB_MODE_INFO **mi_ptr; + av1_tile_set_col(&tile_info, cm, tile_col); + mi_ptr = cm->mi_params.mi_grid_base + + tile_info.mi_row_start * cm->mi_params.mi_stride + + tile_info.mi_col_start; + for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; + mi_row += cm->seq_params.mib_size, + mi_ptr += cm->seq_params.mib_size * cm->mi_params.mi_stride) { + MB_MODE_INFO **mi = mi_ptr; + for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; + mi_col += cm->seq_params.mib_size, + mi += cm->seq_params.mib_size) { + count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, mi_row, + mi_col, cm->seq_params.sb_size); + } + } + } + } + } + + int seg_id_cost[MAX_SEGMENTS]; + av1_cost_tokens_from_cdf(seg_id_cost, segp->tree_cdf, NULL); + no_pred_cost = 0; + for (int i = 0; i < MAX_SEGMENTS; ++i) + no_pred_cost += no_pred_segcounts[i] * seg_id_cost[i]; + + // Frames without past dependency cannot use temporal prediction + if (cm->features.primary_ref_frame != PRIMARY_REF_NONE) { + int pred_flag_cost[SEG_TEMPORAL_PRED_CTXS][2]; + for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) + av1_cost_tokens_from_cdf(pred_flag_cost[i], segp->pred_cdf[i], NULL); + t_pred_cost = 0; + // Cost for signaling the prediction flag. 
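+    // Together with the unpredicted-segment-id term added below, this gives
+    // the estimated total bit cost of enabling temporal prediction
+    // (seg->temporal_update), which is compared against no_pred_cost at the
+    // end of this function.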
+ for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) { + for (int j = 0; j < 2; ++j) + t_pred_cost += temporal_predictor_count[i][j] * pred_flag_cost[i][j]; + } + // Cost for signaling the unpredicted segment id. + for (int i = 0; i < MAX_SEGMENTS; ++i) + t_pred_cost += t_unpred_seg_counts[i] * seg_id_cost[i]; + } + + // Now choose which coding method to use. + if (t_pred_cost < no_pred_cost) { + assert(!cm->features.error_resilient_mode); + seg->temporal_update = 1; + } else { + seg->temporal_update = 0; + } +} + +void av1_reset_segment_features(AV1_COMMON *cm) { + struct segmentation *seg = &cm->seg; + + // Set up default state for MB feature flags + seg->enabled = 0; + seg->update_map = 0; + seg->update_data = 0; + av1_clearall_segfeatures(seg); +} diff --git a/libs/libaom/src/av1/encoder/segmentation.h b/libs/libaom/src/av1/encoder/segmentation.h new file mode 100644 index 000000000..1ad13d66a --- /dev/null +++ b/libs/libaom/src/av1/encoder/segmentation.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_SEGMENTATION_H_ +#define AOM_AV1_ENCODER_SEGMENTATION_H_ + +#include "av1/common/blockd.h" +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_enable_segmentation(struct segmentation *seg); +void av1_disable_segmentation(struct segmentation *seg); + +void av1_disable_segfeature(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id); +void av1_clear_segdata(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id); + +void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd); + +void av1_reset_segment_features(AV1_COMMON *cm); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_SEGMENTATION_H_ diff --git a/libs/libaom/src/av1/encoder/speed_features.c b/libs/libaom/src/av1/encoder/speed_features.c new file mode 100644 index 000000000..e03faeccc --- /dev/null +++ b/libs/libaom/src/av1/encoder/speed_features.c @@ -0,0 +1,1322 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <limits.h> + +#include "av1/common/reconintra.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/speed_features.h" +#include "av1/encoder/rdopt.h" + +#include "aom_dsp/aom_dsp_common.h" + +#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method +// Max speed setting for tx domain evaluation +#define MAX_TX_DOMAIN_EVAL_SPEED 5 +static MESH_PATTERN + good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { + { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, + { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, + { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } }, + { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, + { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, + { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, + }; + +// TODO(huisu@google.com): These settings are pretty relaxed, tune them for +// each speed setting +static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { + { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } }, + { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } }, + { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } }, + { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } }, + { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } }, + { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } }, +}; + +// Threshold values to be used for pruning the txfm_domain_distortion +// based on block MSE +// Index 0: Default mode evaluation, Winner mode processing is not +// applicable (e.g., IntraBC). Index 1: Mode evaluation. +// Index 2: Winner mode evaluation. Index 1 and 2 are applicable when +// enable_winner_mode_for_use_tx_domain_dist speed feature is ON +// TODO(any): Experiment with the threshold logic based on a variance metric +static unsigned int tx_domain_dist_thresholds[3][MODE_EVAL_TYPES] = { + { UINT_MAX, UINT_MAX, UINT_MAX }, { 22026, 22026, 22026 }, { 0, 0, 0 } +}; + +// Transform domain distortion type to be used for default, mode and winner mode +// evaluation. Index 0: Default mode evaluation, Winner mode processing is not +// applicable (e.g., IntraBC). Index 1: Mode evaluation. Index 2: Winner mode +// evaluation. Index 1 and 2 are applicable when +// enable_winner_mode_for_use_tx_domain_dist speed feature is ON +static unsigned int tx_domain_dist_types[3][MODE_EVAL_TYPES] = { { 0, 2, 0 }, + { 1, 2, 0 }, + { 2, 2, 0 } }; + +// Threshold values to be used for disabling coeff RD-optimization +// based on block MSE / qstep^2. +// TODO(any): Experiment with the threshold logic based on a variance metric. +// For each row, the indices are as follows. +// Index 0: Default mode evaluation, Winner mode processing is not applicable +// (e.g., IntraBC) +// Index 1: Mode evaluation. +// Index 2: Winner mode evaluation. +// Index 1 and 2 are applicable when enable_winner_mode_for_coeff_opt speed +// feature is ON +// There are 6 levels with increasing speed, mapping to vertical indices. +static unsigned int coeff_opt_dist_thresholds[6][MODE_EVAL_TYPES] = { + { UINT_MAX, UINT_MAX, UINT_MAX }, + { 3200, 250, UINT_MAX }, + { 1728, 142, UINT_MAX }, + { 864, 142, UINT_MAX }, + { 432, 86, UINT_MAX }, + { 216, 86, UINT_MAX } +}; + +// Transform size to be used for default, mode and winner mode evaluation +// Index 0: Default mode evaluation, Winner mode processing is not applicable +// (e.g., IntraBC). Index 1: Mode evaluation. Index 2: Winner mode evaluation. 
+// Index 1 and 2 are applicable when enable_winner_mode_for_tx_size_srch speed +// feature is ON +static TX_SIZE_SEARCH_METHOD tx_size_search_methods[3][MODE_EVAL_TYPES] = { + { USE_FULL_RD, USE_LARGESTALL, USE_FULL_RD }, + { USE_FAST_RD, USE_LARGESTALL, USE_FULL_RD }, + { USE_LARGESTALL, USE_LARGESTALL, USE_FULL_RD } +}; + +// Predict transform skip levels to be used for default, mode and winner mode +// evaluation. Index 0: Default mode evaluation, Winner mode processing is not +// applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation +// Values indicate the aggressiveness of skip flag prediction. +// 0 : no early skip prediction +// 1 : conservative early skip prediction using DCT_DCT +// 2 : early skip prediction based on SSE +static unsigned int predict_skip_levels[3][MODE_EVAL_TYPES] = { { 0, 0, 0 }, + { 1, 1, 1 }, + { 1, 2, 1 } }; + +// Intra only frames, golden frames (except alt ref overlays) and +// alt ref frames tend to be coded at a higher than ambient quality +static int frame_is_boosted(const AV1_COMP *cpi) { + return frame_is_kf_gf_arf(cpi); +} + +static BLOCK_SIZE dim_to_size(int dim) { + switch (dim) { + case 4: return BLOCK_4X4; + case 8: return BLOCK_8X8; + case 16: return BLOCK_16X16; + case 32: return BLOCK_32X32; + case 64: return BLOCK_64X64; + case 128: return BLOCK_128X128; + default: assert(0); return 0; + } +} + +static void set_good_speed_feature_framesize_dependent( + const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { + const AV1_COMMON *const cm = &cpi->common; + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160; + + if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; + if (is_720p_or_larger) + sf->part_sf.auto_max_partition_based_on_simple_motion = ADAPT_PRED; + else + sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED; + } + + if (is_4k_or_larger) { + sf->part_sf.default_min_partition_size = BLOCK_8X8; + } + + // TODO(huisu@google.com): train models for 720P and above. 
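+  // The ml_partition_search_breakout_thresh[] entries below are indexed by
+  // square partition size (BLOCK_8X8 up to BLOCK_128X128); -1 leaves the ML
+  // breakout disabled for that size, matching the default in init_part_sf().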
+ if (!is_720p_or_larger) { + sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 + sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 + sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 + sf->part_sf.ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64 + sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + sf->part_sf.ml_early_term_after_part_split_level = 1; + } + + if (speed >= 1) { + if (is_720p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; + } else if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } + + if (!is_720p_or_larger) { + sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 + sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 + sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 + sf->part_sf.ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64 + sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + } + sf->part_sf.ml_early_term_after_part_split_level = 2; + } + + if (speed >= 2) { + if (is_720p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + } else if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } + + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 24); + sf->part_sf.partition_search_breakout_rate_thr = 120; + } else { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 22); + sf->part_sf.partition_search_breakout_rate_thr = 100; + } + + if (is_720p_or_larger) { + sf->inter_sf.prune_obmc_prob_thresh = 16; + } else { + sf->inter_sf.prune_obmc_prob_thresh = 8; + } + + if (is_480p_or_larger) { + sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1; + } + } + + if (speed >= 3) { + sf->part_sf.ml_early_term_after_part_split_level = 0; + + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 25); + sf->part_sf.partition_search_breakout_rate_thr = 200; + } else { + sf->part_sf.max_intra_bsize = BLOCK_32X32; + sf->part_sf.partition_search_breakout_dist_thr = (1 << 23); + sf->part_sf.partition_search_breakout_rate_thr = 120; + } + } + + if (speed >= 4) { + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 26); + } else { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 24); + } + + if (is_480p_or_larger) { + sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2; + } + + sf->inter_sf.prune_obmc_prob_thresh = 16; + } + + if (speed >= 5) { + if (is_720p_or_larger) { + sf->inter_sf.prune_warped_prob_thresh = 16; + } else if (is_480p_or_larger) { + sf->inter_sf.prune_warped_prob_thresh = 8; + } + } +} + +static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, + SPEED_FEATURES *const sf, + int speed) { + const AV1_COMMON *const cm = &cpi->common; + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360; + + (void)is_720p_or_larger; // Not used so far + + if (!is_360p_or_larger) { + if (speed >= 6) sf->rt_sf.force_tx_search_off = 1; + if (speed >= 8) { + sf->rt_sf.use_modeled_non_rd_cost = 0; + sf->rt_sf.use_nonrd_filter_search 
= 0; + } + } + if (is_360p_or_larger) { + if (speed >= 7) { + sf->interp_sf.disable_filter_search_var_thresh = 0; + } + } + if (!is_480p_or_larger) { + if (speed == 7) { + sf->rt_sf.nonrd_check_partition_merge_mode = 2; + } + if (speed >= 8) { + sf->mv_sf.subpel_search_method = SUBPEL_TREE; + + sf->rt_sf.estimate_motion_for_var_based_partition = 1; + } + } +} + +static void set_good_speed_features_framesize_independent( + const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { + const AV1_COMMON *const cm = &cpi->common; + const GF_GROUP *const gf_group = &cpi->gf_group; + const int boosted = frame_is_boosted(cpi); + const int is_boosted_arf2_bwd_type = + boosted || gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE; + const int allow_screen_content_tools = + cm->features.allow_screen_content_tools; + if (!cpi->oxcf.large_scale_tile) { + sf->hl_sf.high_precision_mv_usage = LAST_MV_DATA; + } + + // Speed 0 for all speed features that give neutral coding performance change. + sf->gm_sf.gm_disable_recode = 1; + sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3; + + sf->part_sf.less_rectangular_check_level = 1; + sf->part_sf.ml_prune_4_partition = 1; + sf->part_sf.ml_prune_ab_partition = 1; + sf->part_sf.ml_prune_rect_partition = 1; + sf->part_sf.prune_ext_partition_types_search_level = 1; + sf->part_sf.simple_motion_search_prune_rect = 1; + + sf->inter_sf.disable_wedge_search_edge_thresh = 0; + sf->inter_sf.disable_wedge_search_var_thresh = 0; + // TODO(debargha): Test, tweak and turn on either 1 or 2 + sf->inter_sf.inter_mode_rd_model_estimation = 1; + sf->inter_sf.model_based_post_interp_filter_breakout = 1; + sf->inter_sf.prune_compound_using_single_ref = 1; + sf->inter_sf.prune_mode_search_simple_translation = 1; + sf->inter_sf.prune_motion_mode_level = 1; + sf->inter_sf.prune_ref_frame_for_rect_partitions = + (boosted || (allow_screen_content_tools)) + ? 0 + : (is_boosted_arf2_bwd_type ? 1 : 2); + sf->inter_sf.prune_wedge_pred_diff_based = 1; + sf->inter_sf.reduce_inter_modes = 1; + sf->inter_sf.selective_ref_frame = 1; + sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH; + + sf->interp_sf.cb_pred_filter_search = 0; + sf->interp_sf.use_fast_interpolation_filter_search = 1; + + sf->intra_sf.intra_pruning_with_hog = 1; + sf->intra_sf.intra_pruning_with_hog_thresh = -1.2f; + + sf->tx_sf.adaptive_txb_search_level = 1; + sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.model_based_prune_tx_search_level = 1; + sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1; + + sf->rt_sf.use_nonrd_pick_mode = 0; + sf->rt_sf.use_real_time_ref_set = 0; + + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) + sf->mv_sf.exhaustive_searches_thresh = (1 << 24); + else + sf->mv_sf.exhaustive_searches_thresh = (1 << 25); + + sf->rd_sf.perform_coeff_opt = 1; + + if (speed >= 1) { + sf->gm_sf.disable_adaptive_warp_error_thresh = 0; + sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2; + sf->gm_sf.prune_ref_frame_for_gm_search = boosted ? 0 : 1; + + sf->part_sf.intra_cnn_split = 1; + sf->part_sf.simple_motion_search_early_term_none = 1; + // TODO(Venkat): Clean-up frame type dependency for + // simple_motion_search_split in partition search function and set the + // speed feature accordingly + sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 
1 : 2; + + sf->mv_sf.exhaustive_searches_thresh <<= 1; + sf->mv_sf.obmc_full_pixel_search_level = 1; + sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS; + + sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1; + sf->inter_sf.prune_comp_search_by_single_result = boosted ? 2 : 1; + sf->inter_sf.prune_comp_type_by_comp_avg = 1; + sf->inter_sf.prune_comp_type_by_model_rd = boosted ? 0 : 1; + sf->inter_sf.prune_motion_mode_level = 2; + sf->inter_sf.prune_ref_frame_for_rect_partitions = + (frame_is_intra_only(&cpi->common) || (allow_screen_content_tools)) + ? 0 + : (boosted ? 1 : 2); + sf->inter_sf.reduce_inter_modes = boosted ? 1 : 2; + sf->inter_sf.reuse_inter_intra_mode = 1; + sf->inter_sf.selective_ref_frame = 2; + sf->inter_sf.skip_repeated_newmv = 1; + + sf->interp_sf.cb_pred_filter_search = 0; + sf->interp_sf.use_interp_filter = 1; + sf->intra_sf.prune_palette_search_level = 1; + + sf->tx_sf.adaptive_txb_search_level = 2; + sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; + sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; + sf->tx_sf.model_based_prune_tx_search_level = 0; + sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000; + sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_FAST; + sf->tx_sf.tx_type_search.skip_tx_search = 1; + sf->tx_sf.use_intra_txb_hash = 1; + + sf->rd_sf.perform_coeff_opt = boosted ? 2 : 3; + sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2; + sf->rd_sf.tx_domain_dist_thres_level = 1; + + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1; + sf->lpf_sf.dual_sgr_penalty_level = 1; + sf->lpf_sf.enable_sgr_ep_pruning = 1; + + // TODO(any, yunqing): move this feature to speed 0. + sf->tpl_sf.skip_alike_starting_mv = 1; + } + + if (speed >= 2) { + sf->gm_sf.gm_erroradv_type = GM_ERRORADV_TR_2; + + sf->part_sf.allow_partition_search_skip = 1; + + sf->mv_sf.auto_mv_step_size = 1; + sf->mv_sf.subpel_iters_per_step = 1; + + // TODO(chiyotsai@google.com): We can get 10% speed up if we move + // adaptive_rd_thresh to speed 1. But currently it performs poorly on some + // clips (e.g. 5% loss on dinner_1080p). We need to examine the sequence a + // bit more closely to figure out why. + sf->inter_sf.adaptive_rd_thresh = 1; + sf->inter_sf.comp_inter_joint_search_thresh = BLOCK_SIZES_ALL; + sf->inter_sf.disable_interinter_wedge_newmv_search = 1; + sf->inter_sf.disable_wedge_search_edge_thresh = 0; + sf->inter_sf.disable_wedge_search_var_thresh = 100; + sf->inter_sf.fast_interintra_wedge_search = 1; + sf->inter_sf.fast_wedge_sign_estimate = 1; + sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 1; + sf->inter_sf.prune_compound_using_neighbors = 1; + sf->inter_sf.prune_comp_type_by_comp_avg = 2; + sf->inter_sf.prune_warp_using_wmtype = 1; + sf->inter_sf.selective_ref_frame = 3; + sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED; + + // TODO(Sachin): Enable/Enhance this speed feature for speed 2 & 3 + sf->interp_sf.adaptive_interp_filter_search = 1; + sf->interp_sf.disable_dual_filter = 1; + sf->interp_sf.disable_filter_search_var_thresh = 100; + + sf->intra_sf.disable_smooth_intra = + !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key != 1); + + sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 
3 : 4; + + sf->lpf_sf.prune_wiener_based_on_src_var = 1; + sf->lpf_sf.prune_sgr_based_on_wiener = !allow_screen_content_tools; + } + + if (speed >= 3) { + sf->hl_sf.high_precision_mv_usage = CURRENT_Q; + sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; + + sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH; + + sf->part_sf.less_rectangular_check_level = 2; + sf->part_sf.simple_motion_search_prune_agg = 1; + sf->part_sf.prune_4_partition_using_split_info = + !allow_screen_content_tools; + + // adaptive_motion_search breaks encoder multi-thread tests. + // The values in x->pred_mv[] differ for single and multi-thread cases. + // See aomedia:1778. + // sf->mv_sf.adaptive_motion_search = 1; + sf->mv_sf.full_pixel_search_level = 1; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED; + sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS; + sf->mv_sf.search_method = DIAMOND; + + sf->inter_sf.disable_sb_level_mv_cost_upd = 1; + // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine + // it with cpi->sf.disable_wedge_search_var_thresh. + sf->inter_sf.disable_wedge_interintra_search = 1; + // TODO(any): Experiment with the early exit mechanism for speeds 0, 1 and 2 + // and clean-up the speed feature + sf->inter_sf.perform_best_rd_based_gating_for_chroma = 1; + sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 1; + sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 2; + sf->inter_sf.prune_motion_mode_level = boosted ? 2 : 3; + sf->inter_sf.selective_ref_frame = 4; + sf->inter_sf.skip_repeated_ref_mv = 1; + sf->inter_sf.skip_repeated_full_newmv = 1; + if (cpi->oxcf.enable_smooth_interintra) + sf->inter_sf.disable_smooth_interintra = boosted ? 0 : 1; + sf->inter_sf.reuse_compound_type_decision = 1; + sf->inter_sf.txfm_rd_gate_level = (boosted || allow_screen_content_tools) + ? 0 + : (is_boosted_arf2_bwd_type ? 1 : 2); + + sf->intra_sf.prune_palette_search_level = 2; + + sf->tpl_sf.skip_alike_starting_mv = 2; + sf->tpl_sf.prune_intra_modes = 1; + sf->tpl_sf.reduce_first_step_size = 6; + + sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3; + sf->tx_sf.tx_type_search.use_skip_flag_prediction = + allow_screen_content_tools ? 1 : 2; + + // TODO(any): Refactor the code related to following winner mode speed + // features + sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1; + // TODO(any): Experiment with this speed feature by enabling for key frames + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = + frame_is_intra_only(&cpi->common) ? 0 : 1; + sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = + !allow_screen_content_tools; + sf->winner_mode_sf.motion_mode_for_winner_cand = + boosted + ? 0 + : gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE ? 1 + : 2; + + // TODO(any): evaluate if these lpf features can be moved to speed 2. + sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 0 : 2; + sf->lpf_sf.disable_loop_restoration_chroma = + (boosted || allow_screen_content_tools) ? 
0 : 1; + sf->lpf_sf.reduce_wiener_window_size = !boosted; + sf->lpf_sf.prune_wiener_based_on_src_var = 2; + + sf->hl_sf.second_alt_ref_filtering = 0; + } + + if (speed >= 4) { + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + + sf->part_sf.simple_motion_search_prune_agg = 2; + sf->part_sf.prune_ab_partition_using_split_info = + !allow_screen_content_tools; + + sf->inter_sf.adaptive_mode_search = 1; + sf->inter_sf.alt_ref_search_fp = 1; + sf->inter_sf.prune_ref_mv_idx_search = 1; + sf->inter_sf.txfm_rd_gate_level = + (boosted || allow_screen_content_tools) ? 0 : 3; + + sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 2; + sf->inter_sf.prune_compound_using_neighbors = 2; + sf->inter_sf.disable_smooth_interintra = 1; + + sf->interp_sf.cb_pred_filter_search = 1; + sf->interp_sf.skip_sharp_interp_filter_search = 1; + sf->interp_sf.use_interp_filter = 2; + sf->interp_sf.adaptive_interp_filter_search = 2; + + sf->intra_sf.intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL; + sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL; + sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL; + sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; + // TODO(any): Experiment with this speed feature set to 2 for higher quality + // presets as well + sf->intra_sf.skip_intra_in_interframe = 2; + + sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning = 1; + sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1; + sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_MORE; + sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1; + // TODO(any): Experiment with enabling of this speed feature as hash state + // is reset during winner mode processing + sf->tx_sf.use_intra_txb_hash = 0; + + sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 3 : 5; + sf->rd_sf.tx_domain_dist_thres_level = 2; + + // TODO(any): Extend multi-winner mode processing support for inter frames + sf->winner_mode_sf.enable_multiwinner_mode_process = + frame_is_intra_only(&cpi->common) ? 1 : 0; + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1; + + sf->lpf_sf.cdef_pick_method = allow_screen_content_tools + ? CDEF_FAST_SEARCH_LVL1 + : CDEF_FAST_SEARCH_LVL2; + + // TODO(any): The following features have no impact on quality and speed, + // and are disabled. + // sf->part_sf.partition_search_breakout_rate_thr = 300; + // sf->interp_sf.disable_filter_search_var_thresh = 200; + // sf->rd_sf.use_fast_coef_costing = 1; + + // TODO(any): The following features give really bad quality/speed trade + // off. Needs to be re-worked. + // sf->mv_sf.search_method = BIGDIA; + // sf->inter_sf.adaptive_rd_thresh = 4; + // sf->rd_sf.tx_domain_dist_level = 2; + // sf->rt_sf.mode_search_skip_flags = + // (cm->current_frame.frame_type == KEY_FRAME) + // ? 0 + // : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + // FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR | + // FLAG_EARLY_TERMINATE; + } + + if (speed >= 5) { + sf->part_sf.simple_motion_search_prune_agg = 3; + sf->part_sf.ext_partition_eval_thresh = + allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16; + + sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3; + sf->inter_sf.disable_interinter_wedge = 1; + sf->inter_sf.disable_obmc = 1; + sf->inter_sf.disable_onesided_comp = 1; + sf->inter_sf.txfm_rd_gate_level = + (boosted || allow_screen_content_tools) ? 
0 : 4; + sf->inter_sf.prune_inter_modes_if_skippable = 1; + + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL; + sf->lpf_sf.disable_lr_filter = 1; + + sf->mv_sf.simple_motion_subpel_force_stop = QUARTER_PEL; + sf->mv_sf.prune_mesh_search = 1; + sf->mv_sf.reduce_search_range = 1; + + sf->tpl_sf.subpel_force_stop = QUARTER_PEL; + } + + if (speed >= 6) { + } +} + +// TODO(kyslov): now this is very similar to +// set_good_speed_features_framesize_independent +// except it sets non-rd flag on speed8. This function will likely +// be modified in the future with RT-specific speed features +static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, + SPEED_FEATURES *sf, + int speed) { + AV1_COMMON *const cm = &cpi->common; + const int boosted = frame_is_boosted(cpi); + + // Speed 0 for all speed features that give neutral coding performance change. + sf->gm_sf.gm_disable_recode = 1; + sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3; + + sf->part_sf.less_rectangular_check_level = 1; + sf->part_sf.ml_prune_4_partition = 1; + sf->part_sf.ml_prune_ab_partition = 1; + sf->part_sf.ml_prune_rect_partition = 1; + sf->part_sf.prune_ext_partition_types_search_level = 1; + + // TODO(debargha): Test, tweak and turn on either 1 or 2 + sf->inter_sf.inter_mode_rd_model_estimation = 0; + sf->inter_sf.disable_wedge_search_edge_thresh = 0; + sf->inter_sf.disable_wedge_search_var_thresh = 0; + sf->inter_sf.model_based_post_interp_filter_breakout = 1; + sf->inter_sf.prune_compound_using_single_ref = 0; + sf->inter_sf.prune_mode_search_simple_translation = 1; + sf->inter_sf.prune_motion_mode_level = 1; + sf->inter_sf.prune_ref_frame_for_rect_partitions = !boosted; + sf->inter_sf.prune_wedge_pred_diff_based = 1; + sf->inter_sf.reduce_inter_modes = 1; + sf->inter_sf.selective_ref_frame = 1; + sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH; + + sf->interp_sf.cb_pred_filter_search = 0; + sf->interp_sf.use_fast_interpolation_filter_search = 1; + + sf->intra_sf.intra_pruning_with_hog = 1; + sf->intra_sf.intra_pruning_with_hog_thresh = -1.2f; + + sf->mv_sf.full_pixel_search_level = 1; + sf->mv_sf.exhaustive_searches_thresh = INT_MAX; + + sf->rt_sf.check_intra_pred_nonrd = 1; + sf->rt_sf.estimate_motion_for_var_based_partition = 1; + sf->rt_sf.hybrid_intra_pickmode = 0; + sf->rt_sf.nonrd_prune_ref_frame_search = 0; + sf->rt_sf.reuse_inter_pred_nonrd = 0; + sf->rt_sf.use_comp_ref_nonrd = 1; + sf->rt_sf.use_nonrd_filter_search = 1; + sf->rt_sf.use_nonrd_pick_mode = 0; + sf->rt_sf.use_real_time_ref_set = 0; + sf->tx_sf.adaptive_txb_search_level = 1; + sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.model_based_prune_tx_search_level = 1; + sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1; + + if (speed >= 1) { + sf->gm_sf.gm_erroradv_type = GM_ERRORADV_TR_1; + sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2; + + sf->part_sf.prune_ext_partition_types_search_level = 2; + sf->part_sf.simple_motion_search_prune_rect = 1; + + sf->mv_sf.obmc_full_pixel_search_level = 1; + sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS; + + sf->inter_sf.prune_comp_search_by_single_result = 1; + sf->inter_sf.reuse_inter_intra_mode = 1; + sf->inter_sf.selective_ref_frame = 2; + sf->inter_sf.skip_repeated_newmv = 1; + sf->inter_sf.disable_wedge_search_var_thresh = 0; + sf->inter_sf.disable_wedge_search_edge_thresh = 0; + sf->inter_sf.prune_comp_type_by_comp_avg = 1; + sf->inter_sf.prune_motion_mode_level = 2; + sf->inter_sf.prune_single_motion_modes_by_simple_trans = 
1; + + sf->interp_sf.cb_pred_filter_search = 1; + sf->interp_sf.use_interp_filter = 1; + + sf->tx_sf.adaptive_txb_search_level = 2; + sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; + sf->tx_sf.tx_size_search_lgr_block = 1; + sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000; + sf->tx_sf.tx_type_search.skip_tx_search = 1; + sf->tx_sf.use_intra_txb_hash = 1; + + sf->rd_sf.optimize_b_precheck = 1; + sf->rd_sf.tx_domain_dist_level = boosted ? 0 : 1; + sf->rd_sf.tx_domain_dist_thres_level = 1; + + sf->lpf_sf.dual_sgr_penalty_level = 1; + } + + if (speed >= 2) { + sf->gm_sf.gm_erroradv_type = GM_ERRORADV_TR_2; + + sf->part_sf.allow_partition_search_skip = 1; + sf->part_sf.partition_search_breakout_rate_thr = 80; + + sf->mv_sf.auto_mv_step_size = 1; + sf->mv_sf.subpel_iters_per_step = 1; + + sf->inter_sf.adaptive_rd_thresh = 1; + sf->inter_sf.comp_inter_joint_search_thresh = BLOCK_SIZES_ALL; + sf->inter_sf.disable_wedge_search_edge_thresh = 0; + sf->inter_sf.disable_wedge_search_var_thresh = 100; + sf->inter_sf.fast_wedge_sign_estimate = 1; + sf->inter_sf.prune_comp_type_by_comp_avg = 2; + sf->inter_sf.selective_ref_frame = 3; + sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED; + + sf->interp_sf.adaptive_interp_filter_search = 1; + sf->interp_sf.cb_pred_filter_search = 0; + sf->interp_sf.disable_dual_filter = 1; + sf->interp_sf.disable_filter_search_var_thresh = 100; + + sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; + sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.model_based_prune_tx_search_level = 0; + + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1; + } + + if (speed >= 3) { + sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; + + sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH; + + sf->part_sf.less_rectangular_check_level = 2; + + sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS; + // adaptive_motion_search breaks encoder multi-thread tests. + // The values in x->pred_mv[] differ for single and multi-thread cases. + // See aomedia:1778. + // sf->mv_sf.adaptive_motion_search = 1; + + sf->inter_sf.adaptive_rd_thresh = 2; + sf->inter_sf.disable_sb_level_mv_cost_upd = 1; + // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine + // it with cpi->sf.disable_wedge_search_var_thresh. + sf->inter_sf.disable_wedge_interintra_search = 1; + sf->inter_sf.prune_comp_search_by_single_result = 2; + sf->inter_sf.prune_motion_mode_level = boosted ? 2 : 3; + sf->inter_sf.prune_warp_using_wmtype = 1; + sf->inter_sf.selective_ref_frame = 4; + + sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_FAST; + + sf->rd_sf.tx_domain_dist_level = 1; + + sf->winner_mode_sf.tx_size_search_level = boosted ? 0 : 2; + } + + if (speed >= 4) { + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED; + + sf->inter_sf.adaptive_mode_search = 1; + sf->inter_sf.alt_ref_search_fp = 1; + + sf->interp_sf.skip_sharp_interp_filter_search = 1; + + sf->tx_sf.tx_type_search.fast_inter_tx_type_search = 1; + sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1; + sf->tx_sf.use_intra_txb_hash = 0; + + sf->rd_sf.use_mb_rd_hash = 0; + + sf->winner_mode_sf.tx_size_search_level = frame_is_intra_only(cm) ? 
0 : 2; + } + + if (speed >= 5) { + sf->hl_sf.recode_loop = ALLOW_RECODE_KFMAXBW; + + sf->inter_sf.adaptive_rd_thresh = 4; + sf->interp_sf.disable_filter_search_var_thresh = 200; + + sf->rd_sf.use_fast_coef_costing = 1; + sf->rd_sf.tx_domain_dist_level = 2; + sf->rd_sf.tx_domain_dist_thres_level = 2; + sf->winner_mode_sf.tx_size_search_level = 1; + + sf->rt_sf.mode_search_skip_flags = + (cm->current_frame.frame_type == KEY_FRAME) + ? 0 + : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR | + FLAG_EARLY_TERMINATE; + sf->hl_sf.frame_parameter_update = 0; + + sf->part_sf.default_max_partition_size = BLOCK_128X128; + sf->part_sf.default_min_partition_size = BLOCK_8X8; + sf->part_sf.max_intra_bsize = BLOCK_32X32; + sf->part_sf.partition_search_breakout_rate_thr = 500; + sf->part_sf.partition_search_type = VAR_BASED_PARTITION; + sf->part_sf.adjust_var_based_rd_partitioning = 2; + + sf->mv_sf.search_method = FAST_DIAMOND; + sf->mv_sf.subpel_force_stop = QUARTER_PEL; + sf->mv_sf.use_fullpel_costlist = 1; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + + sf->inter_sf.adaptive_mode_search = 2; + sf->inter_sf.inter_mode_rd_model_estimation = 2; + + for (int i = 0; i < TX_SIZES; ++i) { + sf->intra_sf.intra_y_mode_mask[i] = INTRA_DC; + sf->intra_sf.intra_uv_mode_mask[i] = UV_INTRA_DC_CFL; + } + + sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_MORE; + sf->tx_sf.use_inter_txb_hash = 0; + sf->tx_sf.refine_fast_tx_search_results = 0; + + sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT; + sf->rd_sf.simple_model_rd_from_var = 1; + + sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q; + + sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; + sf->rt_sf.num_inter_modes_for_tx_search = 5; + sf->rt_sf.skip_interp_filter_search = 1; + sf->rt_sf.use_comp_ref_nonrd = 0; + sf->rt_sf.use_real_time_ref_set = 1; + sf->rt_sf.use_simple_rd_model = 1; + } + + if (speed >= 6) { + sf->part_sf.adjust_var_based_rd_partitioning = 1; + } + + if (speed >= 7) { + sf->hl_sf.frame_parameter_update = 0; + + sf->part_sf.default_max_partition_size = BLOCK_128X128; + sf->part_sf.default_min_partition_size = BLOCK_8X8; + sf->part_sf.partition_search_type = VAR_BASED_PARTITION; + + sf->mv_sf.search_method = FAST_DIAMOND; + sf->mv_sf.subpel_force_stop = QUARTER_PEL; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED; + + sf->inter_sf.inter_mode_rd_model_estimation = 2; + + sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q; + + sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; + sf->rt_sf.nonrd_prune_ref_frame_search = 1; + sf->rt_sf.reuse_inter_pred_nonrd = 0; + sf->rt_sf.short_circuit_low_temp_var = 0; + sf->rt_sf.skip_interp_filter_search = 0; + sf->rt_sf.use_comp_ref_nonrd = 0; + sf->rt_sf.use_nonrd_altref_frame = 1; + sf->rt_sf.use_nonrd_pick_mode = 1; + sf->rt_sf.nonrd_check_partition_merge_mode = 1; + sf->rt_sf.nonrd_check_partition_split = 0; + sf->rt_sf.hybrid_intra_pickmode = 1; + } + + if (speed >= 8) { + sf->rt_sf.estimate_motion_for_var_based_partition = 0; + sf->rt_sf.short_circuit_low_temp_var = 1; + sf->rt_sf.reuse_inter_pred_nonrd = 1; + sf->rt_sf.use_nonrd_altref_frame = 0; + sf->rt_sf.nonrd_prune_ref_frame_search = 2; + sf->rt_sf.nonrd_check_partition_merge_mode = 0; + sf->rt_sf.nonrd_check_partition_split = 0; + sf->rt_sf.use_modeled_non_rd_cost = 1; + sf->rt_sf.source_metrics_sb_nonrd = 1; + sf->interp_sf.cb_pred_filter_search = 1; + } +} + 
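+// Each init_*_sf() helper below resets one group of speed features to its
+// best-quality defaults. av1_set_speed_features_framesize_independent()
+// applies these defaults first and then layers the per-speed GOOD or
+// REALTIME overrides from the setter functions above on top of them.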
+static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) { + // best quality defaults + hl_sf->frame_parameter_update = 1; + hl_sf->recode_loop = ALLOW_RECODE; + hl_sf->disable_overlay_frames = 0; + hl_sf->adaptive_overlay_encoding = 1; + // Recode loop tolerance %. + hl_sf->recode_tolerance = 25; + hl_sf->high_precision_mv_usage = CURRENT_Q; + hl_sf->second_alt_ref_filtering = 1; +} + +static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) { + tpl_sf->prune_intra_modes = 0; + tpl_sf->reduce_first_step_size = 0; + tpl_sf->skip_alike_starting_mv = 0; + tpl_sf->subpel_force_stop = EIGHTH_PEL; +} + +static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) { + gm_sf->gm_erroradv_type = GM_ERRORADV_TR_0; + gm_sf->disable_adaptive_warp_error_thresh = 1; + gm_sf->selective_ref_gm = 1; + gm_sf->gm_search_type = GM_FULL_SEARCH; + gm_sf->gm_disable_recode = 0; + gm_sf->prune_ref_frame_for_gm_search = 0; +} + +static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) { + part_sf->partition_search_type = SEARCH_PARTITION; + part_sf->less_rectangular_check_level = 0; + part_sf->use_square_partition_only_threshold = BLOCK_128X128; + part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE; + part_sf->auto_min_partition_based_on_simple_motion = 0; + part_sf->default_max_partition_size = BLOCK_LARGEST; + part_sf->default_min_partition_size = BLOCK_4X4; + part_sf->adjust_var_based_rd_partitioning = 0; + part_sf->allow_partition_search_skip = 0; + part_sf->max_intra_bsize = BLOCK_LARGEST; + // This setting only takes effect when partition_search_type is set + // to FIXED_PARTITION. + part_sf->always_this_block_size = BLOCK_16X16; + part_sf->partition_search_breakout_dist_thr = 0; + part_sf->partition_search_breakout_rate_thr = 0; + part_sf->prune_ext_partition_types_search_level = 0; + part_sf->ml_prune_rect_partition = 0; + part_sf->ml_prune_ab_partition = 0; + part_sf->ml_prune_4_partition = 0; + part_sf->ml_early_term_after_part_split_level = 0; + for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) { + part_sf->ml_partition_search_breakout_thresh[i] = + -1; // -1 means not enabled. 
+ } + part_sf->simple_motion_search_prune_agg = 0; + part_sf->simple_motion_search_split = 0; + part_sf->simple_motion_search_prune_rect = 0; + part_sf->simple_motion_search_early_term_none = 0; + part_sf->intra_cnn_split = 0; + part_sf->ext_partition_eval_thresh = BLOCK_8X8; + part_sf->prune_4_partition_using_split_info = 0; + part_sf->prune_ab_partition_using_split_info = 0; +} + +static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) { + mv_sf->full_pixel_search_level = 0; + mv_sf->adaptive_motion_search = 0; + mv_sf->auto_mv_step_size = 0; + mv_sf->exhaustive_searches_thresh = 0; + mv_sf->obmc_full_pixel_search_level = 0; + mv_sf->prune_mesh_search = 0; + mv_sf->reduce_search_range = 0; + mv_sf->search_method = NSTEP; + mv_sf->simple_motion_subpel_force_stop = EIGHTH_PEL; + mv_sf->subpel_force_stop = EIGHTH_PEL; + mv_sf->subpel_iters_per_step = 2; + mv_sf->subpel_search_method = SUBPEL_TREE; + mv_sf->use_accurate_subpel_search = USE_8_TAPS; + mv_sf->use_fullpel_costlist = 0; +} + +static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) { + inter_sf->comp_inter_joint_search_thresh = BLOCK_4X4; + inter_sf->adaptive_rd_thresh = 0; + inter_sf->model_based_post_interp_filter_breakout = 0; + inter_sf->reduce_inter_modes = 0; + inter_sf->adaptive_mode_search = 0; + inter_sf->alt_ref_search_fp = 0; + inter_sf->selective_ref_frame = 0; + inter_sf->prune_ref_frame_for_rect_partitions = 0; + inter_sf->disable_wedge_search_edge_thresh = 0; + inter_sf->disable_wedge_search_var_thresh = 0; + inter_sf->fast_wedge_sign_estimate = 0; + inter_sf->prune_wedge_pred_diff_based = 0; + inter_sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED; + inter_sf->reuse_inter_intra_mode = 0; + inter_sf->disable_sb_level_coeff_cost_upd = 0; + inter_sf->disable_sb_level_mv_cost_upd = 0; + inter_sf->prune_inter_modes_based_on_tpl = 0; + inter_sf->prune_comp_search_by_single_result = 0; + inter_sf->skip_repeated_ref_mv = 0; + inter_sf->skip_repeated_newmv = 0; + inter_sf->skip_repeated_full_newmv = 0; + inter_sf->prune_single_motion_modes_by_simple_trans = 0; + inter_sf->inter_mode_rd_model_estimation = 0; + inter_sf->prune_compound_using_single_ref = 0; + inter_sf->prune_compound_using_neighbors = 0; + inter_sf->disable_onesided_comp = 0; + inter_sf->prune_mode_search_simple_translation = 0; + inter_sf->prune_comp_type_by_comp_avg = 0; + inter_sf->disable_interinter_wedge_newmv_search = 0; + inter_sf->enable_interinter_diffwtd_newmv_search = 0; + inter_sf->disable_smooth_interintra = 0; + inter_sf->prune_motion_mode_level = 0; + inter_sf->prune_warp_using_wmtype = 0; + inter_sf->disable_wedge_interintra_search = 0; + inter_sf->fast_interintra_wedge_search = 0; + inter_sf->prune_comp_type_by_model_rd = 0; + inter_sf->perform_best_rd_based_gating_for_chroma = 0; + inter_sf->prune_obmc_prob_thresh = 0; + inter_sf->disable_obmc = 0; + inter_sf->disable_interinter_wedge = 0; + inter_sf->prune_ref_mv_idx_search = 0; + inter_sf->prune_warped_prob_thresh = 0; + inter_sf->reuse_compound_type_decision = 0; + inter_sf->txfm_rd_gate_level = 0; + inter_sf->prune_inter_modes_if_skippable = 0; +} + +static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) { + interp_sf->disable_filter_search_var_thresh = 0; + interp_sf->adaptive_interp_filter_search = 0; + interp_sf->use_fast_interpolation_filter_search = 0; + interp_sf->disable_dual_filter = 0; + interp_sf->use_interp_filter = 0; + interp_sf->skip_sharp_interp_filter_search = 0; +} + +static AOM_INLINE void 
init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) { + intra_sf->skip_intra_in_interframe = 1; + intra_sf->intra_pruning_with_hog = 0; + intra_sf->src_var_thresh_intra_skip = 1; + intra_sf->prune_palette_search_level = 0; + + for (int i = 0; i < TX_SIZES; i++) { + intra_sf->intra_y_mode_mask[i] = INTRA_ALL; + intra_sf->intra_uv_mode_mask[i] = UV_INTRA_ALL; + } + intra_sf->disable_smooth_intra = 0; +} + +static AOM_INLINE void init_tx_sf(TX_SPEED_FEATURES *tx_sf) { + tx_sf->inter_tx_size_search_init_depth_sqr = 0; + tx_sf->inter_tx_size_search_init_depth_rect = 0; + tx_sf->intra_tx_size_search_init_depth_rect = 0; + tx_sf->intra_tx_size_search_init_depth_sqr = 0; + tx_sf->tx_size_search_lgr_block = 0; + tx_sf->model_based_prune_tx_search_level = 0; + tx_sf->tx_type_search.prune_mode = PRUNE_2D_ACCURATE; + tx_sf->tx_type_search.ml_tx_split_thresh = 8500; + tx_sf->tx_type_search.use_skip_flag_prediction = 1; + tx_sf->tx_type_search.use_reduced_intra_txset = 0; + tx_sf->tx_type_search.fast_intra_tx_type_search = 0; + tx_sf->tx_type_search.fast_inter_tx_type_search = 0; + tx_sf->tx_type_search.skip_tx_search = 0; + tx_sf->tx_type_search.prune_tx_type_using_stats = 0; + tx_sf->tx_type_search.prune_tx_type_est_rd = 0; + tx_sf->tx_type_search.enable_winner_mode_tx_type_pruning = 0; + tx_sf->txb_split_cap = 1; + tx_sf->adaptive_txb_search_level = 0; + tx_sf->use_intra_txb_hash = 0; + tx_sf->use_inter_txb_hash = 1; + tx_sf->refine_fast_tx_search_results = 1; +} + +static AOM_INLINE void init_rd_sf(RD_CALC_SPEED_FEATURES *rd_sf, + const AV1_COMP *cpi) { + if (cpi->oxcf.disable_trellis_quant == 3) { + rd_sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf) + ? NO_ESTIMATE_YRD_TRELLIS_OPT + : NO_TRELLIS_OPT; + } else if (cpi->oxcf.disable_trellis_quant == 2) { + rd_sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf) + ? 
FINAL_PASS_TRELLIS_OPT + : NO_TRELLIS_OPT; + } else if (cpi->oxcf.disable_trellis_quant == 0) { + if (is_lossless_requested(&cpi->oxcf)) { + rd_sf->optimize_coefficients = NO_TRELLIS_OPT; + } else { + rd_sf->optimize_coefficients = FULL_TRELLIS_OPT; + } + } else if (cpi->oxcf.disable_trellis_quant == 1) { + rd_sf->optimize_coefficients = NO_TRELLIS_OPT; + } else { + assert(0 && "Invalid disable_trellis_quant value"); + } + // TODO(sarahparker) Pair this with a speed setting once experiments are done + rd_sf->trellis_eob_fast = 0; + rd_sf->use_mb_rd_hash = 1; + rd_sf->optimize_b_precheck = 0; + rd_sf->use_fast_coef_costing = 0; + rd_sf->simple_model_rd_from_var = 0; + rd_sf->tx_domain_dist_level = 0; + rd_sf->tx_domain_dist_thres_level = 0; + rd_sf->use_hash_based_trellis = 0; + rd_sf->perform_coeff_opt = 0; +} + +static AOM_INLINE void init_winner_mode_sf( + WINNER_MODE_SPEED_FEATURES *winner_mode_sf) { + winner_mode_sf->motion_mode_for_winner_cand = 0; + // Set this at the appropriate speed levels + winner_mode_sf->tx_size_search_level = USE_FULL_RD; + winner_mode_sf->enable_winner_mode_for_coeff_opt = 0; + winner_mode_sf->enable_winner_mode_for_tx_size_srch = 0; + winner_mode_sf->enable_winner_mode_for_use_tx_domain_dist = 0; + winner_mode_sf->enable_multiwinner_mode_process = 0; +} + +static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) { + lpf_sf->disable_loop_restoration_chroma = 0; + lpf_sf->prune_wiener_based_on_src_var = 0; + lpf_sf->prune_sgr_based_on_wiener = 0; + lpf_sf->enable_sgr_ep_pruning = 0; + lpf_sf->reduce_wiener_window_size = 0; + lpf_sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE; + lpf_sf->cdef_pick_method = CDEF_FULL_SEARCH; + // Set decoder side speed feature to use less dual sgr modes + lpf_sf->dual_sgr_penalty_level = 0; + lpf_sf->disable_lr_filter = 0; +} + +static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) { + rt_sf->mode_search_skip_flags = 0; + rt_sf->skip_interp_filter_search = 0; + rt_sf->force_tx_search_off = 0; + rt_sf->num_inter_modes_for_tx_search = INT_MAX; + rt_sf->use_simple_rd_model = 0; + rt_sf->nonrd_check_partition_merge_mode = 0; + rt_sf->nonrd_check_partition_split = 0; +} + +void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) { + SPEED_FEATURES *const sf = &cpi->sf; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + + if (oxcf->mode == GOOD) { + set_good_speed_feature_framesize_dependent(cpi, sf, speed); + } else if (oxcf->mode == REALTIME) { + set_rt_speed_feature_framesize_dependent(cpi, sf, speed); + } + + // This is only used in motion vector unit test. 
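+  // (Mode 1 forces the search to return the maximum legal subpel MV and
+  // mode 2 the minimum, letting the unit test exercise both extremes.)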
+ if (cpi->oxcf.motion_vector_unit_test == 1) + cpi->mv_search_params.find_fractional_mv_step = av1_return_max_sub_pixel_mv; + else if (cpi->oxcf.motion_vector_unit_test == 2) + cpi->mv_search_params.find_fractional_mv_step = av1_return_min_sub_pixel_mv; + + MACROBLOCK *const x = &cpi->td.mb; + AV1_COMMON *const cm = &cpi->common; + x->min_partition_size = AOMMAX(sf->part_sf.default_min_partition_size, + dim_to_size(cpi->oxcf.min_partition_size)); + x->max_partition_size = AOMMIN(sf->part_sf.default_max_partition_size, + dim_to_size(cpi->oxcf.max_partition_size)); + x->min_partition_size = AOMMIN(x->min_partition_size, cm->seq_params.sb_size); + x->max_partition_size = AOMMIN(x->max_partition_size, cm->seq_params.sb_size); +} + +void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) { + AV1_COMMON *const cm = &cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCK *const x = &cpi->td.mb; + WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + int i; + + init_hl_sf(&sf->hl_sf); + init_tpl_sf(&sf->tpl_sf); + init_gm_sf(&sf->gm_sf); + init_part_sf(&sf->part_sf); + init_mv_sf(&sf->mv_sf); + init_inter_sf(&sf->inter_sf); + init_interp_sf(&sf->interp_sf); + init_intra_sf(&sf->intra_sf); + init_tx_sf(&sf->tx_sf); + init_rd_sf(&sf->rd_sf, cpi); + init_winner_mode_sf(&sf->winner_mode_sf); + init_lpf_sf(&sf->lpf_sf); + init_rt_sf(&sf->rt_sf); + + if (oxcf->mode == GOOD) + set_good_speed_features_framesize_independent(cpi, sf, speed); + else if (oxcf->mode == REALTIME) + set_rt_speed_features_framesize_independent(cpi, sf, speed); + + if (!cpi->seq_params_locked) { + cpi->common.seq_params.enable_dual_filter &= + !sf->interp_sf.disable_dual_filter; + cpi->common.seq_params.enable_restoration &= !sf->lpf_sf.disable_lr_filter; + } + + // sf->part_sf.partition_search_breakout_dist_thr is set assuming max 64x64 + // blocks. Normalise this if the blocks are bigger. + if (MAX_SB_SIZE_LOG2 > 6) { + sf->part_sf.partition_search_breakout_dist_thr <<= + 2 * (MAX_SB_SIZE_LOG2 - 6); + } + + const int mesh_speed = AOMMIN(speed, MAX_MESH_SPEED); + for (i = 0; i < MAX_MESH_STEP; ++i) { + sf->mv_sf.mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_speed][i].range; + sf->mv_sf.mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_speed][i].interval; + } + + // Update the mesh pattern of exhaustive motion search for intraBC + // Though intraBC mesh pattern is populated for all frame types, it is used + // only for intra frames of screen contents + for (i = 0; i < MAX_MESH_STEP; ++i) { + sf->mv_sf.intrabc_mesh_patterns[i].range = + intrabc_mesh_patterns[mesh_speed][i].range; + sf->mv_sf.intrabc_mesh_patterns[i].interval = + intrabc_mesh_patterns[mesh_speed][i].interval; + } + + // Slow quant, dct and trellis not worthwhile for first pass + // so make sure they are always turned off. + if (is_stat_generation_stage(cpi)) + sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT; + + // No recode or trellis for 1 pass. 
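+  // (Trellis was already forced off above for the stats-generation stage;
+  // this additionally disables the recode loop for single-pass encodes.)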
+ if (oxcf->pass == 0) sf->hl_sf.recode_loop = DISALLOW_RECODE; + + MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params; + if (sf->mv_sf.subpel_search_method == SUBPEL_TREE) { + mv_search_params->find_fractional_mv_step = av1_find_best_sub_pixel_tree; + } else if (sf->mv_sf.subpel_search_method == SUBPEL_TREE_PRUNED) { + mv_search_params->find_fractional_mv_step = + av1_find_best_sub_pixel_tree_pruned; + } else if (sf->mv_sf.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) { + mv_search_params->find_fractional_mv_step = + av1_find_best_sub_pixel_tree_pruned_more; + } else if (sf->mv_sf.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) { + mv_search_params->find_fractional_mv_step = + av1_find_best_sub_pixel_tree_pruned_evenmore; + } + + x->min_partition_size = AOMMAX(sf->part_sf.default_min_partition_size, + dim_to_size(cpi->oxcf.min_partition_size)); + x->max_partition_size = AOMMIN(sf->part_sf.default_max_partition_size, + dim_to_size(cpi->oxcf.max_partition_size)); + x->min_partition_size = AOMMIN(x->min_partition_size, cm->seq_params.sb_size); + x->max_partition_size = AOMMIN(x->max_partition_size, cm->seq_params.sb_size); + + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test == 1) + mv_search_params->find_fractional_mv_step = av1_return_max_sub_pixel_mv; + else if (cpi->oxcf.motion_vector_unit_test == 2) + mv_search_params->find_fractional_mv_step = av1_return_min_sub_pixel_mv; + + // assert ensures that tx_domain_dist_level is accessed correctly + assert(cpi->sf.rd_sf.tx_domain_dist_thres_level >= 0 && + cpi->sf.rd_sf.tx_domain_dist_thres_level < 3); + memcpy(winner_mode_params->tx_domain_dist_threshold, + tx_domain_dist_thresholds[cpi->sf.rd_sf.tx_domain_dist_thres_level], + sizeof(winner_mode_params->tx_domain_dist_threshold)); + + assert(cpi->sf.rd_sf.tx_domain_dist_level >= 0 && + cpi->sf.rd_sf.tx_domain_dist_level < 3); + memcpy(winner_mode_params->use_transform_domain_distortion, + tx_domain_dist_types[cpi->sf.rd_sf.tx_domain_dist_level], + sizeof(winner_mode_params->use_transform_domain_distortion)); + + // assert ensures that coeff_opt_dist_thresholds is accessed correctly + assert(cpi->sf.rd_sf.perform_coeff_opt >= 0 && + cpi->sf.rd_sf.perform_coeff_opt < 6); + memcpy(winner_mode_params->coeff_opt_dist_threshold, + coeff_opt_dist_thresholds[cpi->sf.rd_sf.perform_coeff_opt], + sizeof(winner_mode_params->coeff_opt_dist_threshold)); + + // assert ensures that predict_skip_levels is accessed correctly + assert(cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction >= 0 && + cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction < 3); + memcpy(winner_mode_params->predict_skip_level, + predict_skip_levels[cpi->sf.tx_sf.tx_type_search + .use_skip_flag_prediction], + sizeof(winner_mode_params->predict_skip_level)); + + // assert ensures that tx_size_search_level is accessed correctly + assert(cpi->sf.winner_mode_sf.tx_size_search_level >= 0 && + cpi->sf.winner_mode_sf.tx_size_search_level < 3); + memcpy(winner_mode_params->tx_size_search_methods, + tx_size_search_methods[cpi->sf.winner_mode_sf.tx_size_search_level], + sizeof(winner_mode_params->tx_size_search_methods)); + + if (cpi->oxcf.row_mt == 1 && (cpi->oxcf.max_threads > 1)) { + if (sf->inter_sf.inter_mode_rd_model_estimation == 1) { + // Revert to type 2 + sf->inter_sf.inter_mode_rd_model_estimation = 2; + } + } +} + +// Override some speed features based on qindex +void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) { + AV1_COMMON *const cm = 
&cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; + WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params; + const int boosted = frame_is_boosted(cpi); + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + if (is_720p_or_larger && cpi->oxcf.mode == GOOD && speed == 0) { + if (cm->quant_params.base_qindex <= 80) { + sf->rd_sf.perform_coeff_opt = 2; + memcpy(winner_mode_params->coeff_opt_dist_threshold, + coeff_opt_dist_thresholds[sf->rd_sf.perform_coeff_opt], + sizeof(winner_mode_params->coeff_opt_dist_threshold)); + sf->part_sf.simple_motion_search_split = + cm->features.allow_screen_content_tools ? 1 : 2; + sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; + sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; + } + } + + if (cpi->oxcf.mode == GOOD && speed >= 3) { + // Disable extended partitions for lower quantizers + if (cm->quant_params.base_qindex <= 100 && + !cm->features.allow_screen_content_tools && !boosted) { + sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + } + } + + if (cpi->oxcf.mode == GOOD && speed >= 4) { + // Disable extended partitions for lower quantizers + const int qindex_thresh = boosted ? 80 : 120; + if (cm->quant_params.base_qindex <= qindex_thresh && + !cm->features.allow_screen_content_tools && + !frame_is_intra_only(&cpi->common)) { + sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + } + } +} diff --git a/libs/libaom/src/av1/encoder/speed_features.h b/libs/libaom/src/av1/encoder/speed_features.h new file mode 100644 index 000000000..d12c3c02e --- /dev/null +++ b/libs/libaom/src/av1/encoder/speed_features.h @@ -0,0 +1,1034 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_SPEED_FEATURES_H_ +#define AOM_AV1_ENCODER_SPEED_FEATURES_H_ + +#include "av1/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_MESH_STEP 4 + +typedef struct MESH_PATTERN { + int range; + int interval; +} MESH_PATTERN; + +enum { + GM_FULL_SEARCH, + GM_REDUCED_REF_SEARCH_SKIP_L2_L3, + GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2, + GM_DISABLE_SEARCH +} UENUM1BYTE(GM_SEARCH_TYPE); + +enum { + GM_ERRORADV_TR_0, + GM_ERRORADV_TR_1, + GM_ERRORADV_TR_2, + GM_ERRORADV_TR_TYPES, +} UENUM1BYTE(GM_ERRORADV_TYPE); + +enum { + FULL_TXFM_RD, + LOW_TXFM_RD, +} UENUM1BYTE(TXFM_RD_MODEL); + +enum { + DIST_WTD_COMP_ENABLED, + DIST_WTD_COMP_SKIP_MV_SEARCH, + DIST_WTD_COMP_DISABLED, +} UENUM1BYTE(DIST_WTD_COMP_FLAG); + +enum { + INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) | + (1 << D135_PRED) | (1 << D113_PRED) | (1 << D157_PRED) | + (1 << D203_PRED) | (1 << D67_PRED) | (1 << SMOOTH_PRED) | + (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) | (1 << PAETH_PRED), + UV_INTRA_ALL = + (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) | + (1 << UV_D45_PRED) | (1 << UV_D135_PRED) | (1 << UV_D113_PRED) | + (1 << UV_D157_PRED) | (1 << UV_D203_PRED) | (1 << UV_D67_PRED) | + (1 << UV_SMOOTH_PRED) | (1 << UV_SMOOTH_V_PRED) | + (1 << UV_SMOOTH_H_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC = (1 << UV_DC_PRED), + UV_INTRA_DC_CFL = (1 << UV_DC_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED), + UV_INTRA_DC_PAETH_CFL = + (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC_H_V = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED), + UV_INTRA_DC_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | + (1 << UV_H_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC_PAETH_H_V = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | + (1 << UV_V_PRED) | (1 << UV_H_PRED), + UV_INTRA_DC_PAETH_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | + (1 << UV_V_PRED) | (1 << UV_H_PRED) | + (1 << UV_CFL_PRED), + INTRA_DC = (1 << DC_PRED), + INTRA_DC_TM = (1 << DC_PRED) | (1 << PAETH_PRED), + INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED), + INTRA_DC_PAETH_H_V = + (1 << DC_PRED) | (1 << PAETH_PRED) | (1 << V_PRED) | (1 << H_PRED) +}; + +enum { + INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | + (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | + (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | + (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV), + INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | + (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | + (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) | + (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | + (1 << NEAR_NEARMV), +}; + +enum { + DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) | + (1 << THR_ALTR) | (1 << THR_GOLD) | (1 << THR_LAST), + + DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT, + + DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA), + + LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) | + (1 << THR_ALTR) | (1 << THR_GOLD) +}; + +enum { + TXFM_CODING_SF = 1, + INTER_PRED_SF = 2, + INTRA_PRED_SF = 4, + PARTITION_SF = 8, + LOOP_FILTER_SF = 16, + RD_SKIP_SF = 32, + RESERVE_2_SF = 64, + RESERVE_3_SF = 128, +} UENUM1BYTE(DEV_SPEED_FEATURES); + +enum { + // No recode. + DISALLOW_RECODE = 0, + // Allow recode for KF and exceeding maximum frame bandwidth. 
+ ALLOW_RECODE_KFMAXBW = 1, + // Allow recode only for KF/ARF/GF frames. + ALLOW_RECODE_KFARFGF = 2, + // Allow recode for all frames based on bitrate constraints. + ALLOW_RECODE = 3, +} UENUM1BYTE(RECODE_LOOP_TYPE); + +enum { + SUBPEL_TREE = 0, + SUBPEL_TREE_PRUNED = 1, // Prunes 1/2-pel searches + SUBPEL_TREE_PRUNED_MORE = 2, // Prunes 1/2-pel searches more aggressively + SUBPEL_TREE_PRUNED_EVENMORE = 3, // Prunes 1/2- and 1/4-pel searches + // Other methods to come +} UENUM1BYTE(SUBPEL_SEARCH_METHODS); + +enum { + USE_FULL_RD = 0, + USE_FAST_RD, + USE_LARGESTALL, +} UENUM1BYTE(TX_SIZE_SEARCH_METHOD); + +enum { + // Try the full image with different values. + LPF_PICK_FROM_FULL_IMAGE, + // Try the full image filter search with non-dual filter only. + LPF_PICK_FROM_FULL_IMAGE_NON_DUAL, + // Try a small portion of the image with different values. + LPF_PICK_FROM_SUBIMAGE, + // Estimate the level based on quantizer and frame type + LPF_PICK_FROM_Q, + // Pick 0 to disable LPF if LPF was enabled last frame + LPF_PICK_MINIMAL_LPF +} UENUM1BYTE(LPF_PICK_METHOD); + +enum { + CDEF_FULL_SEARCH, + CDEF_FAST_SEARCH_LVL1, // Search among a subset of all possible filters. + CDEF_FAST_SEARCH_LVL2, // Search reduced subset of filters than Level 1. + CDEF_PICK_FROM_Q, // Estimate filter strength based on quantizer. + CDEF_PICK_METHODS +} UENUM1BYTE(CDEF_PICK_METHOD); + +enum { + // Terminate search early based on distortion so far compared to + // qp step, distortion in the neighborhood of the frame, etc. + FLAG_EARLY_TERMINATE = 1 << 0, + + // Skips comp inter modes if the best so far is an intra mode. + FLAG_SKIP_COMP_BESTINTRA = 1 << 1, + + // Skips oblique intra modes if the best so far is an inter mode. + FLAG_SKIP_INTRA_BESTINTER = 1 << 3, + + // Skips oblique intra modes at angles 27, 63, 117, 153 if the best + // intra so far is not one of the neighboring directions. + FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4, + + // Skips intra modes other than DC_PRED if the source variance is small + FLAG_SKIP_INTRA_LOWVAR = 1 << 5, +} UENUM1BYTE(MODE_SEARCH_SKIP_LOGIC); + +enum { + NO_PRUNE = 0, + // adaptively prunes the least perspective tx types out of all 16 + // (tuned to provide negligible quality loss) + PRUNE_2D_ACCURATE = 1, + // similar, but applies much more aggressive pruning to get better speed-up + PRUNE_2D_FAST = 2, + PRUNE_2D_MORE = 3, + // More aggressive pruning based on tx type score and allowed tx count + PRUNE_2D_AGGRESSIVE = 4, +} UENUM1BYTE(TX_TYPE_PRUNE_MODE); + +typedef struct { + TX_TYPE_PRUNE_MODE prune_mode; + int fast_intra_tx_type_search; + int fast_inter_tx_type_search; + + // prune two least frequently chosen transforms for each intra mode + int use_reduced_intra_txset; + + // Use a skip flag prediction model to detect blocks with skip = 1 early + // and avoid doing full TX type search for such blocks. + int use_skip_flag_prediction; + + // Threshold used by the ML based method to predict TX block split decisions. + int ml_tx_split_thresh; + + // skip remaining transform type search when we found the rdcost of skip is + // better than applying transform + int skip_tx_search; + + // Prune tx type search using previous frame stats. + int prune_tx_type_using_stats; + // Prune tx type search using estimated RDcost + int prune_tx_type_est_rd; + + // Flag used to control the winner mode processing for tx type pruning for + // inter blocks. It enables further tx type mode pruning based on ML model for + // mode evaluation and disables tx type mode pruning for winner mode + // processing. 
+  int enable_winner_mode_tx_type_pruning;
+} TX_TYPE_SEARCH;
+
+enum {
+  // Search partitions using RD criterion
+  SEARCH_PARTITION,
+
+  // Always use a fixed size partition
+  FIXED_PARTITION,
+
+  REFERENCE_PARTITION,
+
+  VAR_BASED_PARTITION
+} UENUM1BYTE(PARTITION_SEARCH_TYPE);
+
+enum {
+  NOT_IN_USE,
+  DIRECT_PRED,
+  RELAXED_PRED,
+  ADAPT_PRED
+} UENUM1BYTE(MAX_PART_PRED_MODE);
+
+enum {
+  LAST_MV_DATA,
+  CURRENT_Q,
+  QTR_ONLY,
+} UENUM1BYTE(MV_PREC_LOGIC);
+
+typedef struct HIGH_LEVEL_SPEED_FEATURES {
+  // Frame level coding parameter update
+  int frame_parameter_update;
+
+  RECODE_LOOP_TYPE recode_loop;
+
+  // This feature controls the tolerance vs target used in deciding whether to
+  // recode a frame. It has no meaning if recode is disabled.
+  int recode_tolerance;
+
+  // Determine how motion vector precision is chosen. The possibilities are:
+  // LAST_MV_DATA: use the mv data from the last coded frame
+  // CURRENT_Q: use the current q as a threshold
+  // QTR_ONLY: use quarter pel precision only.
+  MV_PREC_LOGIC high_precision_mv_usage;
+
+  // Whether to disable overlay frames for filtered Altref frames,
+  // overriding oxcf->enable_overlay flag set as 1.
+  int disable_overlay_frames;
+
+  // Enable/disable adaptively deciding whether or not to encode ALTREF overlay
+  // frame.
+  int adaptive_overlay_encoding;
+
+  // Always set to 0. If on it enables 0 cost background transmission
+  // (except for the initial transmission of the segmentation). The feature is
+  // disabled because the addition of very large block sizes makes the
+  // backgrounds very cheap to encode, and the segmentation we have
+  // adds overhead.
+  int static_segmentation;
+
+  // Enable/disable second_alt_ref temporal filtering.
+  int second_alt_ref_filtering;
+} HIGH_LEVEL_SPEED_FEATURES;
+
+typedef struct TPL_SPEED_FEATURES {
+  // Prune the intra modes search by tpl.
+  // If set to 0, we will search all intra modes from DC_PRED to PAETH_PRED.
+  // If set to 1, we only search DC_PRED, V_PRED, and H_PRED.
+  int prune_intra_modes;
+  // This parameter controls which step in the n-step process we start at.
+  int reduce_first_step_size;
+  // Skip motion estimation based on the precision of center MVs and the
+  // difference between center MVs.
+  // If set to 0, motion estimation is skipped for duplicate center MVs
+  // (default). If set to 1, motion estimation is skipped for duplicate
+  // full-pixel center MVs. If set to 2, motion estimation is skipped if the
+  // difference between center MVs is less than the threshold.
+  int skip_alike_starting_mv;
+
+  // When to stop subpel search.
+  SUBPEL_FORCE_STOP subpel_force_stop;
+} TPL_SPEED_FEATURES;
+
+typedef struct GLOBAL_MOTION_SPEED_FEATURES {
+  // Global motion warp error threshold
+  GM_ERRORADV_TYPE gm_erroradv_type;
+
+  // Disable adaptive threshold for global motion warp error
+  int disable_adaptive_warp_error_thresh;
+
+  // Do not compute the global motion parameters for a LAST2_FRAME or
+  // LAST3_FRAME if the GOLDEN_FRAME is closer and it has a non-identity
+  // global model.
+  int selective_ref_gm;
+
+  GM_SEARCH_TYPE gm_search_type;
+
+  // Whether to disable the global motion recode loop.
+  int gm_disable_recode;
+
+  // During global motion estimation, prune remaining reference frames in a
+  // given direction (past/future), if the evaluated ref_frame in that
+  // direction yields gm_type as INVALID/TRANSLATION/IDENTITY.
+  int prune_ref_frame_for_gm_search;
+} GLOBAL_MOTION_SPEED_FEATURES;
+
+typedef struct PARTITION_SPEED_FEATURES {
+  PARTITION_SEARCH_TYPE partition_search_type;
+
+  // Used if partition_search_type = FIXED_PARTITION
+  BLOCK_SIZE always_this_block_size;
+
+  // Prune extended partition types search
+  // Can take values 0 - 2, 0 referring to no pruning, and 1 - 2 increasing
+  // aggressiveness of pruning in order.
+  int prune_ext_partition_types_search_level;
+
+  // Use an ML model to prune horz and vert partitions
+  int ml_prune_rect_partition;
+
+  // Use an ML model to prune horz_a, horz_b, vert_a and vert_b partitions.
+  int ml_prune_ab_partition;
+
+  // Use an ML model to prune horz4 and vert4 partitions.
+  int ml_prune_4_partition;
+
+  // Use an ML model to adaptively terminate partition search after trying
+  // PARTITION_SPLIT. Can take values 0 - 2, 0 meaning not being enabled, and
+  // 1 - 2 increasing aggressiveness in order.
+  int ml_early_term_after_part_split_level;
+
+  // Skip rectangular partition test when partition type none gives better
+  // rd than partition type split. Can take values 0 - 2, 0 referring to no
+  // skipping, and 1 - 2 increasing aggressiveness of skipping in order.
+  int less_rectangular_check_level;
+
+  // Use square partition only beyond this block size.
+  BLOCK_SIZE use_square_partition_only_threshold;
+
+  // Sets min and max square partition levels for this superblock based on
+  // motion vector and prediction error distribution produced from 16x16
+  // simple motion search
+  MAX_PART_PRED_MODE auto_max_partition_based_on_simple_motion;
+  int auto_min_partition_based_on_simple_motion;
+
+  // Min and max square partition size we enable (block_size) as per auto
+  // min max, but also used by adjust partitioning, and pick_partitioning.
+  BLOCK_SIZE default_min_partition_size;
+  BLOCK_SIZE default_max_partition_size;
+
+  // Sets level of adjustment of variance-based partitioning during
+  // rd_use_partition: 0 - no partition adjustment, 1 - try to merge partitions
+  // for small blocks and high QP, 2 - always try to merge leaf partitions, 3 -
+  // try to merge and split leaf partitions
+  int adjust_var_based_rd_partitioning;
+
+  // Partition search early breakout thresholds.
+  int64_t partition_search_breakout_dist_thr;
+  int partition_search_breakout_rate_thr;
+
+  // Thresholds for ML based partition search breakout.
+  int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES];
+
+  // Allow skipping partition search for still image frame
+  int allow_partition_search_skip;
+
+  // The aggressiveness of pruning with simple_motion_search.
+  // Currently 0 is the lowest, and 2 the highest.
+  int simple_motion_search_prune_agg;
+
+  // Perform simple_motion_search on each possible subblock and use it to prune
+  // PARTITION_HORZ and PARTITION_VERT.
+  int simple_motion_search_prune_rect;
+
+  // Perform simple motion search before none_partition to decide if we
+  // want to remove all partitions other than PARTITION_SPLIT. If set to 0, this
+  // model is disabled. If set to 1, the model attempts to perform
+  // PARTITION_SPLIT only. If set to 2, the model also attempts to prune
+  // PARTITION_SPLIT.
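+  // (For example, av1_set_speed_features_qindex_dependent() sets this to 1
+  // for screen content and to 2 otherwise on 720p-or-larger, speed-0 frames
+  // at low qindex.)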
+ int simple_motion_search_split; + + // Use features from simple_motion_search to terminate prediction block + // partition after PARTITION_NONE + int simple_motion_search_early_term_none; + + // This variable controls the maximum block size where intra blocks can be + // used in inter frames. + // TODO(aconverse): Fold this into one of the other many mode skips + BLOCK_SIZE max_intra_bsize; + + // Use CNN with luma pixels on source frame on each of the 64x64 subblock to + // perform split/no_split decision on intra-frames. + int intra_cnn_split; + + // Disable extended partition search for lower block sizes. + int ext_partition_eval_thresh; + + // Prune 1:4 partition search based on winner info from split partitions + int prune_4_partition_using_split_info; + + // Prune AB partition search using split and HORZ/VERT info + int prune_ab_partition_using_split_info; +} PARTITION_SPEED_FEATURES; + +typedef struct MV_SPEED_FEATURES { + // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). + SEARCH_METHODS search_method; + + // If this is set to 1, we limit the motion search range to 2 times the + // largest motion vector found in the last frame. + int auto_mv_step_size; + + // Subpel_search_method can only be subpel_tree which does a subpixel + // logarithmic search that keeps stepping at 1/2 pixel units until + // you stop getting a gain, and then goes on to 1/4 and repeats + // the same process. Along the way it skips many diagonals. + SUBPEL_SEARCH_METHODS subpel_search_method; + + // Maximum number of steps in logarithmic subpel search before giving up. + int subpel_iters_per_step; + + // When to stop subpel search. + SUBPEL_FORCE_STOP subpel_force_stop; + + // When to stop subpel search in simple motion search. + SUBPEL_FORCE_STOP simple_motion_subpel_force_stop; + + // If true, sub-pixel search uses the exact convolve function used for final + // encoding and decoding; otherwise, it uses bilinear interpolation. + SUBPEL_SEARCH_TYPE use_accurate_subpel_search; + + // TODO(jingning): combine the related motion search speed features + // This allows us to use motion search at other sizes as a starting + // point for this motion search and limits the search range around it. + int adaptive_motion_search; + + // Threshold for allowing exhaustive motion search. + int exhaustive_searches_thresh; + + // Pattern to be used for any exhaustive mesh searches (except intraBC ME). + MESH_PATTERN mesh_patterns[MAX_MESH_STEP]; + + // Pattern to be used for exhaustive mesh searches of intraBC ME. + MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_STEP]; + + // Reduce single motion search range based on MV result of prior ref_mv_idx. + int reduce_search_range; + + // Prune mesh search. + int prune_mesh_search; + + // Use the rd cost around the best FULLPEL_MV to speed up subpel search + int use_fullpel_costlist; + + // Set the full pixel search level of obmc + // 0: obmc_full_pixel_diamond + // 1: obmc_refining_search_sad (faster) + int obmc_full_pixel_search_level; + + // Accurate full pixel motion search based on TPL stats. + int full_pixel_search_level; +} MV_SPEED_FEATURES; + +typedef struct INTER_MODE_SPEED_FEATURES { + // 2-pass inter mode model estimation where the preliminary pass skips + // transform search and uses a model to estimate rd, while the final pass + // computes the full transform search. 
Two types of models are supported: + // 0: not used + // 1: used with online dynamic rd model + // 2: used with static rd model + int inter_mode_rd_model_estimation; + + // Bypass transform search based on skip rd + int txfm_rd_gate_level; + + // Limit the inter mode tested in the RD loop + int reduce_inter_modes; + + // Adaptive prediction mode search + int adaptive_mode_search; + + // This variable is used to cap the maximum number of times we skip testing a + // mode to be evaluated. A high value means we will be faster. + int adaptive_rd_thresh; + + // Aggressively prune inter modes when best mode is skippable. + int prune_inter_modes_if_skippable; + + // Drop less likely to be picked reference frames in the RD search. + // Has five levels for now: 0, 1, 2, 3 and 4, where higher levels prune more + // aggressively than lower ones. (0 means no pruning). + int selective_ref_frame; + + // Prune reference frames for rectangular partitions. + // 0 implies no pruning + // 1 implies prune for extended partition + // 2 implies prune horiz, vert and extended partition + int prune_ref_frame_for_rect_partitions; + + int alt_ref_search_fp; + + // flag to skip NEWMV mode in drl if the motion search result is the same + int skip_repeated_newmv; + + // Skip the current ref_mv in NEW_MV mode if we have already encountered + // another ref_mv in the drl such that: + // 1. The other drl has the same fullpel_mv during the SIMPLE_TRANSLATION + // search process as the current fullpel_mv. + // 2. The rate needed to encode the current fullpel_mv is larger than that + // for the other ref_mv. + int skip_repeated_full_newmv; + + // This speed feature checks duplicate ref MVs among NEARESTMV, NEARMV, + // GLOBALMV and skips NEARMV or GLOBALMV (in order) if a duplicate is found + // TODO(any): Instead of skipping repeated ref mv, use the recalculated + // rd-cost based on mode rate and skip the mode evaluation + int skip_repeated_ref_mv; + + // Flag used to control the ref_best_rd based gating for chroma + int perform_best_rd_based_gating_for_chroma; + + // Skip certain motion modes (OBMC, warped, interintra) for single reference + // motion search, using the results of single ref SIMPLE_TRANSLATION + int prune_single_motion_modes_by_simple_trans; + + // Reuse the inter_intra_mode search result from NEARESTMV mode to other + // single ref modes + int reuse_inter_intra_mode; + + // prune wedge and compound segment approximate rd evaluation based on + // compound average modeled rd + int prune_comp_type_by_model_rd; + + // prune wedge and compound segment approximate rd evaluation based on + // compound average rd/ref_best_rd + int prune_comp_type_by_comp_avg; + + // Skip some ref frames in compound motion search by single motion search + // result. Has three levels for now: 0 referring to no skipping, and 1 - 3 + // increasing aggressiveness of skipping in order. + // Note: The search order might affect the result. It assumes that the single + // reference modes are searched before compound modes. It is better to search + // same single inter mode as a group. + int prune_comp_search_by_single_result; + + // If 1 we iterate finding a best reference for 2 ref frames together - via + // a log search that iterates 4 times (check around mv for last for best + // error of combined predictor then check around mv for alt). If 0 we + // we just use the best motion vector found for each frame by itself. 
+ BLOCK_SIZE comp_inter_joint_search_thresh; + + // Instead of performing a full MV search, do a simple translation first + // and only perform a full MV search on the motion vectors that performed + // well. + int prune_mode_search_simple_translation; + + // Only search compound modes with at least one "good" reference frame. + // A reference frame is good if, after looking at its performance among + // the single reference modes, it is one of the two best performers. + int prune_compound_using_single_ref; + + // Skip extended compound mode using ref frames of above and left neighbor + // blocks. + // 0 : no pruning + // 1 : prune extended compound mode (less aggressiveness) + // 2 : prune extended compound mode (high aggressiveness) + int prune_compound_using_neighbors; + + // Based on previous ref_mv_idx search result, prune the following search. + int prune_ref_mv_idx_search; + + // Disable one sided compound modes. + int disable_onesided_comp; + + // Prune/gate motion mode evaluation based on token based rd + // during transform search for inter blocks + // Values are 0 (not used) , 1 - 3 with progressively increasing + // aggressiveness + int prune_motion_mode_level; + + // Prune obmc search using previous frame stats. + int prune_obmc_prob_thresh; + + // Disable obmc. + int disable_obmc; + + // Gate warp evaluation for motions of type IDENTITY, + // TRANSLATION and AFFINE(based on number of warp neighbors) + int prune_warp_using_wmtype; + + // Prune warped motion search using previous frame stats. + int prune_warped_prob_thresh; + + // Enable/disable interintra wedge search. + int disable_wedge_interintra_search; + + // De-couple wedge and mode search during interintra RDO. + int fast_interintra_wedge_search; + + // Only enable wedge search if the edge strength is greater than + // this threshold. A value of 0 signals that this check is disabled. + unsigned int disable_wedge_search_edge_thresh; + + // Only enable wedge search if the variance is above this threshold. + unsigned int disable_wedge_search_var_thresh; + + // Whether fast wedge sign estimate is used + int fast_wedge_sign_estimate; + + // Whether to prune wedge search based on predictor difference + int prune_wedge_pred_diff_based; + + // Enable/disable ME for interinter wedge search. + int disable_interinter_wedge_newmv_search; + + // Enable/disable ME for interinter diffwtd search. PSNR BD-rate gain of + // ~0.1 on the lowres test set, but ~15% slower computation. + int enable_interinter_diffwtd_newmv_search; + + // Enable/disable smooth inter-intra mode + int disable_smooth_interintra; + + // Disable interinter_wedge + int disable_interinter_wedge; + + // Decide when and how to use joint_comp. + DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag; + + // Whether to override and disable sb level coeff cost updates, if + // cpi->oxcf.coeff_cost_upd_freq = COST_UPD_SB (i.e. set at SB level) + int disable_sb_level_coeff_cost_upd; + + // Whether to override and disable sb level mv cost updates, if + // cpi->oxcf.coeff_cost_upd_freq = COST_UPD_SB (i.e. set at SB level) + int disable_sb_level_mv_cost_upd; + + // Prune inter modes based on tpl stats + // 0 : no pruning + // 1 - 3 indicate increasing aggressiveness in order. 
+ int prune_inter_modes_based_on_tpl; + + // Model based breakout after interpolation filter search + // 0: no breakout + // 1: use model based rd breakout + int model_based_post_interp_filter_breakout; + + // Reuse compound type rd decision when exact match is found + // 0: No reuse + // 1: Reuse the compound type decision + int reuse_compound_type_decision; +} INTER_MODE_SPEED_FEATURES; + +typedef struct INTERP_FILTER_SPEED_FEATURES { + // A source variance threshold below which filter search is disabled + // Choose a very large value (UINT_MAX) to use 8-tap always + unsigned int disable_filter_search_var_thresh; + + // Do limited interpolation filter search for dual filters, since best choice + // usually includes EIGHTTAP_REGULAR. + int use_fast_interpolation_filter_search; + + // Disable dual filter + int disable_dual_filter; + + // Save results of av1_interpolation_filter_search for a block + // Check mv and ref_frames before search, if they are very close with previous + // saved results, filter search can be skipped. + int use_interp_filter; + + // skip sharp_filter evaluation based on regular and smooth filter rd for + // dual_filter=0 case + int skip_sharp_interp_filter_search; + + int cb_pred_filter_search; + + // adaptive interp_filter search to allow skip of certain filter types. + int adaptive_interp_filter_search; +} INTERP_FILTER_SPEED_FEATURES; + +typedef struct INTRA_MODE_SPEED_FEATURES { + // These bit masks allow you to enable or disable intra modes for each + // transform size separately. + int intra_y_mode_mask[TX_SIZES]; + int intra_uv_mode_mask[TX_SIZES]; + + // flag to allow skipping intra mode for inter frame prediction + int skip_intra_in_interframe; + + // variance threshold for intra mode gating when inter turned out to be skip + // in inter frame prediction + unsigned int src_var_thresh_intra_skip; + + // Prune intra mode candidates based on source block histogram of gradient. + int intra_pruning_with_hog; + + // TODO(anyone): tune intra_pruning_with_hog_thresh for various speeds. + float intra_pruning_with_hog_thresh; + + // Enable/disable smooth intra modes. + int disable_smooth_intra; + + // prune palette search + // 0: No pruning + // 1: Perform coarse search to prune the palette colors. For winner colors, + // neighbors are also evaluated using a finer search. + // 2: Perform 2 way palette search from max colors to min colors (and min + // colors to remaining colors) and terminate the search if current number of + // palette colors is not the winner. + int prune_palette_search_level; +} INTRA_MODE_SPEED_FEATURES; + +typedef struct TX_SPEED_FEATURES { + // Init search depth for square and rectangular transform partitions. + // Values: + // 0 - search full tree, 1: search 1 level, 2: search the highest level only + int inter_tx_size_search_init_depth_sqr; + int inter_tx_size_search_init_depth_rect; + int intra_tx_size_search_init_depth_sqr; + int intra_tx_size_search_init_depth_rect; + + // If any dimension of a coding block size above 64, always search the + // largest transform only, since the largest transform block size is 64x64. + int tx_size_search_lgr_block; + + TX_TYPE_SEARCH tx_type_search; + + // Skip split transform block partition when the collocated bigger block + // is selected as all zero coefficients. + int txb_split_cap; + + // Shortcut the transform block partition and type search when the target + // rdcost is relatively lower. 
+ // Values are 0 (not used) , or 1 - 2 with progressively increasing + // aggressiveness + int adaptive_txb_search_level; + + // Prune level for tx_size_type search for inter based on rd model + // 0: no pruning + // 1-2: progressively increasing aggressiveness of pruning + int model_based_prune_tx_search_level; + + // Use hash table to store intra(keyframe only) txb transform search results + // to avoid repeated search on the same residue signal. + int use_intra_txb_hash; + + // Use hash table to store inter txb transform search results + // to avoid repeated search on the same residue signal. + int use_inter_txb_hash; + + // Refine TX type after fast TX search. + int refine_fast_tx_search_results; +} TX_SPEED_FEATURES; + +typedef struct RD_CALC_SPEED_FEATURES { + // This feature controls whether we do the expensive context update and + // calculation in the rd coefficient costing loop. + int use_fast_coef_costing; + + // Fast approximation of av1_model_rd_from_var_lapndz + int simple_model_rd_from_var; + + // Whether to compute distortion in the image domain (slower but + // more accurate), or in the transform domain (faster but less acurate). + // 0: use image domain + // 1: use transform domain in tx_type search, and use image domain for + // RD_STATS + // 2: use transform domain + int tx_domain_dist_level; + + // Transform domain distortion threshold level + int tx_domain_dist_thres_level; + + // Trellis (dynamic programming) optimization of quantized values + TRELLIS_OPT_TYPE optimize_coefficients; + + // Use a hash table to store previously computed optimized qcoeffs from + // expensive calls to optimize_txb. + int use_hash_based_trellis; + + // Use hash table to store macroblock RD search results + // to avoid repeated search on the same residue signal. + int use_mb_rd_hash; + + // Flag used to control the speed of the eob selection in trellis. + int trellis_eob_fast; + + // Calculate RD cost before doing optimize_b, and skip if the cost is large. + int optimize_b_precheck; + + // Flag used to control the extent of coeff R-D optimization + int perform_coeff_opt; +} RD_CALC_SPEED_FEATURES; + +typedef struct WINNER_MODE_SPEED_FEATURES { + // Flag used to control the winner mode processing for better R-D optimization + // of quantized coeffs + int enable_winner_mode_for_coeff_opt; + + // Flag used to control the winner mode processing for transform size + // search method + int enable_winner_mode_for_tx_size_srch; + + // Control transform size search level + // Eval type: Default Mode Winner + // Level 0 : FULL RD LARGEST ALL FULL RD + // Level 1 : FAST RD LARGEST ALL FULL RD + // Level 2 : LARGEST ALL LARGEST ALL FULL RD + int tx_size_search_level; + + // Flag used to control the winner mode processing for use transform + // domain distortion + int enable_winner_mode_for_use_tx_domain_dist; + + // Flag used to enable processing of multiple winner modes + int enable_multiwinner_mode_process; + + // Motion mode for winner candidates: + // 0: speed feature OFF + // 1 / 2 : Use configured number of winner candidates + int motion_mode_for_winner_cand; +} WINNER_MODE_SPEED_FEATURES; + +typedef struct LOOP_FILTER_SPEED_FEATURES { + // This feature controls how the loop filter level is determined. + LPF_PICK_METHOD lpf_pick; + + // Control how the CDEF strength is determined. + CDEF_PICK_METHOD cdef_pick_method; + + // Decoder side speed feature to add penalty for use of dual-sgr filters. 
+ // Takes values 0 - 10, 0 indicating no penalty and each additional level + // adding a penalty of 1% + int dual_sgr_penalty_level; + + // prune sgr ep using binary search like mechanism + int enable_sgr_ep_pruning; + + // Disable loop restoration for Chroma plane + int disable_loop_restoration_chroma; + + // Prune RESTORE_WIENER evaluation based on source variance + // 0 : no pruning + // 1 : conservative pruning + // 2 : aggressive pruning + int prune_wiener_based_on_src_var; + + // Prune self-guided loop restoration based on wiener search results + // 0 : no pruning + // 1 : pruning based on rdcost ratio of RESTORE_WIENER and RESTORE_NONE + // 2 : pruning based on winner restoration type among RESTORE_WIENER and + // RESTORE_NONE + int prune_sgr_based_on_wiener; + + // Reduce the wiener filter win size for luma + int reduce_wiener_window_size; + + // Disable loop restoration filter + int disable_lr_filter; +} LOOP_FILTER_SPEED_FEATURES; + +typedef struct REAL_TIME_SPEED_FEATURES { + // check intra prediction for non-RD mode. + int check_intra_pred_nonrd; + + // Perform coarse ME before calculating variance in variance-based partition + int estimate_motion_for_var_based_partition; + + // For nonrd_use_partition: mode of extra check of leaf partition + // 0 - don't check merge + // 1 - always check merge + // 2 - check merge and prune checking final split + int nonrd_check_partition_merge_mode; + + // For nonrd_use_partition: check of leaf partition extra split + int nonrd_check_partition_split; + + // Implements various heuristics to skip searching modes + // The heuristics selected are based on flags + // defined in the MODE_SEARCH_SKIP_HEURISTICS enum + unsigned int mode_search_skip_flags; + + // For nonrd: Reduces ref frame search. + // 0 - low level of search prune in non last frames + // 1 - pruned search in non last frames + // 2 - more pruned search in non last frames + int nonrd_prune_ref_frame_search; + + // This flag controls the use of non-RD mode decision. + int use_nonrd_pick_mode; + + // Use ALTREF frame in non-RD mode decision. + int use_nonrd_altref_frame; + + // Use compound reference for non-RD mode. + int use_comp_ref_nonrd; + + // use reduced ref set for real-time mode + int use_real_time_ref_set; + + // Skip a number of expensive mode evaluations for blocks with very low + // temporal variance. + int short_circuit_low_temp_var; + + // Use modeled (currently CurvFit model) RDCost for fast non-RD mode + int use_modeled_non_rd_cost; + + // Reuse inter prediction in fast non-rd mode. + int reuse_inter_pred_nonrd; + + // Number of best inter modes to search transform. INT_MAX - search all. + int num_inter_modes_for_tx_search; + + // Forces TX search off for RDCost calulation. + int force_tx_search_off; + + // Use interpolation filter search in non-RD mode decision. + int use_nonrd_filter_search; + + // Use simplified RD model for interpolation search and Intra + int use_simple_rd_model; + + // If set forces interpolation filter to EIGHTTAP_REGULAR + int skip_interp_filter_search; + + // Use hybrid (rd for bsize < 16x16, otherwise nonrd) intra search for intra + // only frames. + int hybrid_intra_pickmode; + + // Compute variance/sse on source difference, prior to encoding superblock. + int source_metrics_sb_nonrd; +} REAL_TIME_SPEED_FEATURES; + +typedef struct SPEED_FEATURES { + /* + * Sequence/frame level speed features: + */ + HIGH_LEVEL_SPEED_FEATURES hl_sf; + + /* + * Speed features related to how tpl's searches are done. 
+ */ + TPL_SPEED_FEATURES tpl_sf; + + /* + * Global motion speed features: + */ + GLOBAL_MOTION_SPEED_FEATURES gm_sf; + + /* + * Partition search speed features: + */ + PARTITION_SPEED_FEATURES part_sf; + + /* + * Motion search speed features: + */ + MV_SPEED_FEATURES mv_sf; + + /* + * Inter mode search speed features: + */ + INTER_MODE_SPEED_FEATURES inter_sf; + + /* + * Interpolation filter search speed features: + */ + INTERP_FILTER_SPEED_FEATURES interp_sf; + + /* + * Intra mode search speed features: + */ + INTRA_MODE_SPEED_FEATURES intra_sf; + + /* + * Transform size/type search speed features: + */ + TX_SPEED_FEATURES tx_sf; + + /* + * RD calculation speed features: + */ + RD_CALC_SPEED_FEATURES rd_sf; + + /* + * Two-pass mode evaluation features: + */ + WINNER_MODE_SPEED_FEATURES winner_mode_sf; + + /* + * In-loop filter speed features: + */ + LOOP_FILTER_SPEED_FEATURES lpf_sf; + + /* + * Real-time mode speed features: + */ + REAL_TIME_SPEED_FEATURES rt_sf; +} SPEED_FEATURES; + +struct AV1_COMP; + +void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi, + int speed); +void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi, + int speed); +void av1_set_speed_features_qindex_dependent(struct AV1_COMP *cpi, int speed); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_SPEED_FEATURES_H_ diff --git a/libs/libaom/src/av1/encoder/svc_layercontext.c b/libs/libaom/src/av1/encoder/svc_layercontext.c new file mode 100644 index 000000000..b72d8aa73 --- /dev/null +++ b/libs/libaom/src/av1/encoder/svc_layercontext.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <math.h>
+
+#include "av1/encoder/encoder.h"
+
+static void swap_ptr(void *a, void *b) {
+  void **a_p = (void **)a;
+  void **b_p = (void **)b;
+  void *c = *a_p;
+  *a_p = *b_p;
+  *b_p = c;
+}
+
+void av1_init_layer_context(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  SVC *const svc = &cpi->svc;
+  int mi_rows = cpi->common.mi_params.mi_rows;
+  int mi_cols = cpi->common.mi_params.mi_cols;
+  svc->base_framerate = 30.0;
+  svc->current_superframe = 0;
+
+  for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+      int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      RATE_CONTROL *const lrc = &lc->rc;
+      lrc->ni_av_qi = oxcf->worst_allowed_q;
+      lrc->total_actual_bits = 0;
+      lrc->total_target_vs_actual = 0;
+      lrc->ni_tot_qi = 0;
+      lrc->tot_q = 0.0;
+      lrc->avg_q = 0.0;
+      lrc->ni_frames = 0;
+      lrc->decimation_count = 0;
+      lrc->decimation_factor = 0;
+      lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q);
+      lrc->best_quality = av1_quantizer_to_qindex(lc->min_q);
+      for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+        lrc->rate_correction_factors[i] = 1.0;
+      }
+      lc->target_bandwidth = lc->layer_target_bitrate;
+      lrc->last_q[INTER_FRAME] = lrc->worst_quality;
+      lrc->avg_frame_qindex[INTER_FRAME] = lrc->worst_quality;
+      lrc->avg_frame_qindex[KEY_FRAME] = lrc->worst_quality;
+      lrc->buffer_level =
+          oxcf->starting_buffer_level_ms * lc->target_bandwidth / 1000;
+      lrc->bits_off_target = lrc->buffer_level;
+      // Initialize the cyclic refresh parameters. If spatial layers are used
+      // (i.e., ss_number_layers > 1), these need to be updated per spatial
+      // layer. Cyclic refresh is only applied on base temporal layer.
+      if (svc->number_spatial_layers > 1 && tl == 0) {
+        size_t last_coded_q_map_size;
+        lc->sb_index = 0;
+        lc->actual_num_seg1_blocks = 0;
+        lc->actual_num_seg2_blocks = 0;
+        lc->counter_encode_maxq_scene_change = 0;
+        CHECK_MEM_ERROR(cm, lc->map,
+                        aom_malloc(mi_rows * mi_cols * sizeof(*lc->map)));
+        memset(lc->map, 0, mi_rows * mi_cols);
+        last_coded_q_map_size =
+            mi_rows * mi_cols * sizeof(*lc->last_coded_q_map);
+        CHECK_MEM_ERROR(cm, lc->last_coded_q_map,
+                        aom_malloc(last_coded_q_map_size));
+        assert(MAXQ <= 255);
+        memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
+      }
+    }
+  }
+}
+
+// Update the layer context from a change_config() call.
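+// Each layer's rate-control limits are rescaled by that layer's share of the
+// total target bitrate (`bitrate_alloc` below). For example, with a total
+// target_bandwidth of 1,000,000 bps and a layer target of 250,000 bps,
+// bitrate_alloc is 0.25, so a stream-level starting_buffer_level of
+// 4,000,000 bits becomes 1,000,000 bits for that layer.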
+void av1_update_layer_context_change_config(AV1_COMP *const cpi, + const int64_t target_bandwidth) { + const RATE_CONTROL *const rc = &cpi->rc; + SVC *const svc = &cpi->svc; + int layer = 0; + int64_t spatial_layer_target = 0; + float bitrate_alloc = 1.0; + + for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + svc->layer_context[layer].target_bandwidth = lc->layer_target_bitrate; + } + spatial_layer_target = svc->layer_context[layer].target_bandwidth; + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + LAYER_CONTEXT *const lc = + &svc->layer_context[sl * svc->number_temporal_layers + tl]; + RATE_CONTROL *const lrc = &lc->rc; + lc->spatial_layer_target_bandwidth = spatial_layer_target; + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + lrc->starting_buffer_level = + (int64_t)(rc->starting_buffer_level * bitrate_alloc); + lrc->optimal_buffer_level = + (int64_t)(rc->optimal_buffer_level * bitrate_alloc); + lrc->maximum_buffer_size = + (int64_t)(rc->maximum_buffer_size * bitrate_alloc); + lrc->bits_off_target = + AOMMIN(lrc->bits_off_target, lrc->maximum_buffer_size); + lrc->buffer_level = AOMMIN(lrc->buffer_level, lrc->maximum_buffer_size); + lc->framerate = cpi->framerate / lc->framerate_factor; + lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->max_frame_bandwidth = rc->max_frame_bandwidth; + lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q); + lrc->best_quality = av1_quantizer_to_qindex(lc->min_q); + } + } +} + +static LAYER_CONTEXT *get_layer_context(AV1_COMP *const cpi) { + return &cpi->svc.layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id]; +} + +void av1_update_temporal_layer_framerate(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *const lc = get_layer_context(cpi); + RATE_CONTROL *const lrc = &lc->rc; + const int tl = svc->temporal_layer_id; + lc->framerate = cpi->framerate / lc->framerate_factor; + lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; + // Update the average layer frame size (non-cumulative per-frame-bw). + if (tl == 0) { + lc->avg_frame_size = lrc->avg_frame_bandwidth; + } else { + int prev_layer = svc->spatial_layer_id * svc->number_temporal_layers + + svc->temporal_layer_id - 1; + LAYER_CONTEXT *const lcprev = &svc->layer_context[prev_layer]; + const double prev_layer_framerate = + cpi->framerate / lcprev->framerate_factor; + const int64_t prev_layer_target_bandwidth = lcprev->layer_target_bitrate; + lc->avg_frame_size = + (int)((lc->target_bandwidth - prev_layer_target_bandwidth) / + (lc->framerate - prev_layer_framerate)); + } +} + +void av1_restore_layer_context(AV1_COMP *const cpi) { + GF_GROUP *const gf_group = &cpi->gf_group; + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *const lc = get_layer_context(cpi); + const int old_frame_since_key = cpi->rc.frames_since_key; + const int old_frame_to_key = cpi->rc.frames_to_key; + // Restore layer rate control. + cpi->rc = lc->rc; + cpi->oxcf.target_bandwidth = lc->target_bandwidth; + gf_group->index = lc->group_index; + // Reset the frames_since_key and frames_to_key counters to their values + // before the layer restore. Keep these defined for the stream (not layer). 
+ cpi->rc.frames_since_key = old_frame_since_key; + cpi->rc.frames_to_key = old_frame_to_key; + // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, + // for the base temporal layer. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + swap_ptr(&cr->map, &lc->map); + swap_ptr(&cr->last_coded_q_map, &lc->last_coded_q_map); + cr->sb_index = lc->sb_index; + cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks; + cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks; + } + svc->skip_nonzeromv_last = 0; + svc->skip_nonzeromv_gf = 0; + // For each reference (LAST/GOLDEN) set the skip_nonzero_last/gf frame flags. + // This is to skip testing nonzero-mv for that reference if it was last + // refreshed (i.e., buffer slot holding that reference was refreshed) on the + // previous spatial layer at the same time (current_superframe). + if (svc->external_ref_frame_config) { + int ref_frame_idx = svc->ref_idx[LAST_FRAME - 1]; + if (svc->buffer_time_index[ref_frame_idx] == svc->current_superframe && + svc->buffer_spatial_layer[ref_frame_idx] == svc->spatial_layer_id - 1) + svc->skip_nonzeromv_last = 1; + ref_frame_idx = svc->ref_idx[GOLDEN_FRAME - 1]; + if (svc->buffer_time_index[ref_frame_idx] == svc->current_superframe && + svc->buffer_spatial_layer[ref_frame_idx] == svc->spatial_layer_id - 1) + svc->skip_nonzeromv_gf = 1; + } +} + +void av1_save_layer_context(AV1_COMP *const cpi) { + GF_GROUP *const gf_group = &cpi->gf_group; + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *lc = get_layer_context(cpi); + lc->rc = cpi->rc; + lc->target_bandwidth = (int)cpi->oxcf.target_bandwidth; + lc->group_index = gf_group->index; + if (svc->spatial_layer_id == 0) svc->base_framerate = cpi->framerate; + // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, + // for the base temporal layer. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi->svc.number_spatial_layers > 1 && svc->temporal_layer_id == 0) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + signed char *temp = lc->map; + uint8_t *temp2 = lc->last_coded_q_map; + lc->map = cr->map; + cr->map = temp; + lc->last_coded_q_map = cr->last_coded_q_map; + cr->last_coded_q_map = temp2; + lc->sb_index = cr->sb_index; + lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks; + lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks; + } + // For any buffer slot that is refreshed, update it with + // the spatial_layer_id and the current_superframe. + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + // All slots are refreshed on KEY. 
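+    // Every slot then reports (current_superframe, spatial_layer_id); this
+    // is the bookkeeping that av1_restore_layer_context() reads to set the
+    // skip_nonzeromv_last/gf flags for the next spatial layer of the same
+    // superframe.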
+ for (unsigned int i = 0; i < REF_FRAMES; i++) { + svc->buffer_time_index[i] = svc->current_superframe; + svc->buffer_spatial_layer[i] = svc->spatial_layer_id; + } + } else if (cpi->svc.external_ref_frame_config) { + for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { + int ref_frame_map_idx = svc->ref_idx[i]; + if (cpi->svc.refresh[ref_frame_map_idx]) { + svc->buffer_time_index[ref_frame_map_idx] = svc->current_superframe; + svc->buffer_spatial_layer[ref_frame_map_idx] = svc->spatial_layer_id; + } + } + } + if (svc->spatial_layer_id == svc->number_spatial_layers - 1) + svc->current_superframe++; +} + +void av1_free_svc_cyclic_refresh(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + if (lc->map) aom_free(lc->map); + if (lc->last_coded_q_map) aom_free(lc->last_coded_q_map); + } + } +} + +// Reset on key frame: reset counters, references and buffer updates. +void av1_svc_reset_temporal_layers(AV1_COMP *const cpi, int is_key) { + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *lc = NULL; + for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl]; + if (is_key) lc->frames_from_key_frame = 0; + } + } + av1_update_temporal_layer_framerate(cpi); + av1_restore_layer_context(cpi); +} + +static void get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out) { + int w, h; + if (width_out == NULL || height_out == NULL || den == 0) return; + w = width_org * num / den; + h = height_org * num / den; + // Make height and width even. + w += w % 2; + h += h % 2; + *width_out = w; + *height_out = h; +} + +void av1_one_pass_cbr_svc_start_layer(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *lc = NULL; + int width = 0, height = 0; + lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + + svc->temporal_layer_id]; + get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height, + lc->scaling_factor_num, lc->scaling_factor_den, &width, + &height); + av1_set_size_literal(cpi, width, height); +} diff --git a/libs/libaom/src/av1/encoder/svc_layercontext.h b/libs/libaom/src/av1/encoder/svc_layercontext.h new file mode 100644 index 000000000..7cb85a3c9 --- /dev/null +++ b/libs/libaom/src/av1/encoder/svc_layercontext.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_ +#define AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_ + +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/ratectrl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + RATE_CONTROL rc; + int framerate_factor; + int64_t layer_target_bitrate; + int scaling_factor_num; + int scaling_factor_den; + int64_t target_bandwidth; + int64_t spatial_layer_target_bandwidth; + double framerate; + int avg_frame_size; + int max_q; + int min_q; + int frames_from_key_frame; + // Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame. + int sb_index; + int8_t *map; + uint8_t *last_coded_q_map; + int actual_num_seg1_blocks; + int actual_num_seg2_blocks; + int counter_encode_maxq_scene_change; + uint8_t speed; + unsigned char group_index; +} LAYER_CONTEXT; + +typedef struct SVC { + int spatial_layer_id; + int temporal_layer_id; + int number_spatial_layers; + int number_temporal_layers; + int external_ref_frame_config; + int non_reference_frame; + // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), + // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). + int reference[INTER_REFS_PER_FRAME]; + int ref_idx[INTER_REFS_PER_FRAME]; + int refresh[REF_FRAMES]; + double base_framerate; + unsigned int current_superframe; + unsigned int buffer_time_index[REF_FRAMES]; + unsigned char buffer_spatial_layer[REF_FRAMES]; + int skip_nonzeromv_last; + int skip_nonzeromv_gf; + // Layer context used for rate control in one pass temporal CBR mode or + // two pass spatial mode. + LAYER_CONTEXT layer_context[AOM_MAX_LAYERS]; +} SVC; + +struct AV1_COMP; + +// Initialize layer context data from init_config(). +void av1_init_layer_context(struct AV1_COMP *const cpi); + +// Update the layer context from a change_config() call. +void av1_update_layer_context_change_config(struct AV1_COMP *const cpi, + const int64_t target_bandwidth); + +// Prior to encoding the frame, update framerate-related quantities +// for the current temporal layer. +void av1_update_temporal_layer_framerate(struct AV1_COMP *const cpi); + +// Prior to encoding the frame, set the layer context, for the current layer +// to be encoded, to the cpi struct. +void av1_restore_layer_context(struct AV1_COMP *const cpi); + +// Save the layer context after encoding the frame. +void av1_save_layer_context(struct AV1_COMP *const cpi); + +void av1_free_svc_cyclic_refresh(struct AV1_COMP *const cpi); + +void av1_svc_reset_temporal_layers(struct AV1_COMP *const cpi, int is_key); + +void av1_one_pass_cbr_svc_start_layer(struct AV1_COMP *const cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_ diff --git a/libs/libaom/src/av1/encoder/temporal_filter.c b/libs/libaom/src/av1/encoder/temporal_filter.c new file mode 100644 index 000000000..a637df559 --- /dev/null +++ b/libs/libaom/src/av1/encoder/temporal_filter.c @@ -0,0 +1,1338 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <math.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "aom_scale/aom_scale.h"
+
+// NOTE: All `tf` in this file means `temporal filtering`.
+
+// Does motion search for blocks in temporal filtering. This is the first step
+// for temporal filtering. More specifically, given a frame to be filtered and
+// another frame as reference, this function searches the reference frame for
+// the block most similar to the one in the frame to be filtered. The found
+// block will be further used for weighted averaging.
+// NOTE: Besides doing motion search for the entire block, this function will
+// also do motion search for each 1/4 sub-block to get more precise prediction.
+// Inputs:
+//   cpi: Pointer to the composed information of input video.
+//   frame_to_filter: Pointer to the frame to be filtered.
+//   ref_frame: Pointer to the reference frame.
+//   block_size: Block size used for motion search.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   ref_mv: Reference motion vector, which is commonly inherited from the
+//           motion search result of the previous frame.
+//   subblock_mvs: Pointer to the result motion vectors for 4 sub-blocks.
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
+// Returns:
+//   Search error (MSE) of the entire block.
+static int tf_motion_search(AV1_COMP *cpi,
+                            const YV12_BUFFER_CONFIG *frame_to_filter,
+                            const YV12_BUFFER_CONFIG *ref_frame,
+                            const BLOCK_SIZE block_size, const int mb_row,
+                            const int mb_col, MV *ref_mv, MV *subblock_mvs,
+                            int *subblock_mses) {
+  // Frame information
+  const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height);
+
+  // Block information (ONLY Y-plane is used for motion search).
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int y_stride = frame_to_filter->y_stride;
+  assert(y_stride == ref_frame->y_stride);
+  const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+
+  // Save input state.
+  MACROBLOCK *const mb = &cpi->td.mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  const struct buf_2d ori_src_buf = mb->plane[0].src;
+  const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0];
+  const MV_COST_TYPE ori_mv_cost_type = mb->mv_cost_type;
+
+  // Parameters used for motion search.
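+  // The search below runs in two stages: a full-pel NSTEP search (with mesh
+  // refinement enabled) seeded from `ref_mv`, followed by fractional
+  // refinement down to eighth-pel unless the frame forces integer MVs. The
+  // MV cost model is an L1 cost tiered by resolution (low/mid/high at the
+  // 480 and 720 thresholds), and regularization is dropped for the subpel
+  // refinement stage.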
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + + const search_site_config ss_cfg = + cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD]; + const SEARCH_METHODS full_search_method = NSTEP; + const int step_param = av1_init_search_range( + AOMMAX(frame_to_filter->y_crop_width, frame_to_filter->y_crop_height)); + const SUBPEL_SEARCH_TYPE subpel_search_type = USE_8_TAPS; + const int force_integer_mv = cpi->common.features.cur_frame_force_integer_mv; + const MV_COST_TYPE mv_cost_type = + min_frame_size >= 720 + ? MV_COST_L1_HDRES + : (min_frame_size >= 480 ? MV_COST_L1_MIDRES : MV_COST_L1_LOWRES); + + // Starting position for motion search. + FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv); + // Baseline position for motion search (used for rate distortion comparison). + const MV baseline_mv = kZeroMv; + + // Setup. + mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset; + mb->plane[0].src.stride = y_stride; + mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset; + mbd->plane[0].pre[0].stride = y_stride; + // Unused intermediate results for motion search. + unsigned int sse, error; + int distortion; + int cost_list[5]; + + // Do motion search. + // NOTE: In `av1_full_pixel_search()` and `find_fractional_mv_step()`, the + // searched result will be stored in `mb->best_mv`. + int_mv best_mv; + int block_mse = INT_MAX; + mb->mv_cost_type = mv_cost_type; + + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size, + &baseline_mv, &ss_cfg); + full_ms_params.run_mesh_search = 1; + full_ms_params.search_method = full_search_method; + av1_full_pixel_search(start_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), &best_mv.as_fullmv, + NULL); + + // Since we are merely refining the result from full pixel search, we don't + // need regularization for subpel search + mb->mv_cost_type = MV_COST_NONE; + if (force_integer_mv == 1) { // Only do full search on the entire block. + const int mv_row = best_mv.as_mv.row; + const int mv_col = best_mv.as_mv.col; + best_mv.as_mv.row = GET_MV_SUBPEL(mv_row); + best_mv.as_mv.col = GET_MV_SUBPEL(mv_col); + const int mv_offset = mv_row * y_stride + mv_col; + error = cpi->fn_ptr[block_size].vf( + ref_frame->y_buffer + y_offset + mv_offset, y_stride, + frame_to_filter->y_buffer + y_offset, y_stride, &sse); + block_mse = DIVIDE_AND_ROUND(error, mb_pels); + mb->e_mbd.mi[0]->mv[0] = best_mv; + } else { // Do fractional search on the entire block and all sub-blocks. + av1_make_default_subpel_ms_params(&ms_params, cpi, mb, block_size, + &baseline_mv, cost_list); + ms_params.forced_stop = EIGHTH_PEL; + ms_params.var_params.subpel_search_type = subpel_search_type; + MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + error = cpi->mv_search_params.find_fractional_mv_step( + &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv.as_mv, + &distortion, &sse, NULL); + block_mse = DIVIDE_AND_ROUND(error, mb_pels); + mb->e_mbd.mi[0]->mv[0] = best_mv; + *ref_mv = best_mv.as_mv; + // On 4 sub-blocks. 
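+    // (`ss_size_lookup[block_size][1][1]` halves both dimensions, so e.g. a
+    // 32x32 block is refined as four 16x16 sub-blocks, visited row-major:
+    //   subblock_idx:  0 1
+    //                  2 3
+    // which is also the layout of subblock_mvs[] and subblock_mses[].)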
+ const BLOCK_SIZE subblock_size = ss_size_lookup[block_size][1][1]; + const int subblock_height = block_size_high[subblock_size]; + const int subblock_width = block_size_wide[subblock_size]; + const int subblock_pels = subblock_height * subblock_width; + start_mv = get_fullmv_from_mv(ref_mv); + + int subblock_idx = 0; + for (int i = 0; i < mb_height; i += subblock_height) { + for (int j = 0; j < mb_width; j += subblock_width) { + const int offset = i * y_stride + j; + mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset; + mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset; + mb->mv_cost_type = mv_cost_type; + + av1_make_default_fullpel_ms_params( + &full_ms_params, cpi, mb, subblock_size, &baseline_mv, &ss_cfg); + full_ms_params.run_mesh_search = 1; + full_ms_params.search_method = full_search_method; + av1_full_pixel_search(start_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), + &best_mv.as_fullmv, NULL); + + // Since we are merely refining the result from full pixel search, we + // don't need regularization for subpel search + mb->mv_cost_type = MV_COST_NONE; + av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size, + &baseline_mv, cost_list); + ms_params.forced_stop = EIGHTH_PEL; + ms_params.var_params.subpel_search_type = subpel_search_type; + subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + error = cpi->mv_search_params.find_fractional_mv_step( + &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, + &best_mv.as_mv, &distortion, &sse, NULL); + subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels); + subblock_mvs[subblock_idx] = best_mv.as_mv; + ++subblock_idx; + } + } + } + + // Restore input state. + mb->plane[0].src = ori_src_buf; + mbd->plane[0].pre[0] = ori_pre_buf; + mb->mv_cost_type = ori_mv_cost_type; + + return block_mse; +} + +// Helper function to get weight according to thresholds. +static INLINE int get_weight_by_thresh(const int value, const int low, + const int high) { + return value < low ? 2 : value < high ? 1 : 0; +} + +// Gets filter weight for blocks in temporal filtering. The weights will be +// assigned based on the motion search errors. +// NOTE: Besides assigning filter weight for the block, this function will also +// determine whether to split the entire block into 4 sub-blocks for further +// filtering. +// TODO(any): Many magic numbers are used in this function. They may be tuned +// to improve the performance. +// Inputs: +// block_mse: Motion search error (MSE) for the entire block. +// subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks. +// is_second_arf: Whether the to-filter frame is the second ARF. This field +// will affect the filter weight for the to-filter frame. +// subblock_filter_weights: Pointer to the assigned filter weight for each +// sub-block. If not using sub-blocks, the first +// element will be used for the entire block. +// Returns: Whether to use 4 sub-blocks to replace the original block. +static int tf_get_filter_weight(const int block_mse, const int *subblock_mses, + const int is_second_arf, + int *subblock_filter_weights) { + // `block_mse` is initialized as INT_MAX and will be overwritten after the + // motion search with reference frame, therefore INT_MAX can ONLY be accessed + // by to-filter frame. + if (block_mse == INT_MAX) { + const int weight = TF_ENABLE_PLANEWISE_STRATEGY + ? TF_PLANEWISE_FILTER_WEIGHT_SCALE + : is_second_arf ? 
+static INLINE int get_weight_by_thresh(const int value, const int low,
+                                       const int high) {
+  return value < low ? 2 : value < high ? 1 : 0;
+}
+
+// Gets filter weight for blocks in temporal filtering. The weights will be
+// assigned based on the motion search errors.
+// NOTE: Besides assigning filter weight for the block, this function will also
+// determine whether to split the entire block into 4 sub-blocks for further
+// filtering.
+// TODO(any): Many magic numbers are used in this function. They may be tuned
+// to improve the performance.
+// Inputs:
+//   block_mse: Motion search error (MSE) for the entire block.
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
+//   is_second_arf: Whether the to-filter frame is the second ARF. This field
+//                  will affect the filter weight for the to-filter frame.
+//   subblock_filter_weights: Pointer to the assigned filter weight for each
+//                            sub-block. If not using sub-blocks, the first
+//                            element will be used for the entire block.
+// Returns: Whether to use 4 sub-blocks to replace the original block.
+static int tf_get_filter_weight(const int block_mse, const int *subblock_mses,
+                                const int is_second_arf,
+                                int *subblock_filter_weights) {
+  // `block_mse` is initialized as INT_MAX and will be overwritten after the
+  // motion search with reference frame, therefore INT_MAX can ONLY be accessed
+  // by to-filter frame.
+  if (block_mse == INT_MAX) {
+    const int weight = TF_ENABLE_PLANEWISE_STRATEGY
+                           ? TF_PLANEWISE_FILTER_WEIGHT_SCALE
+                           : is_second_arf ? 64 : 32;
+    subblock_filter_weights[0] = subblock_filter_weights[1] =
+        subblock_filter_weights[2] = subblock_filter_weights[3] = weight;
+    return 0;
+  }
+
+  const int thresh_low = is_second_arf ? 20 : 40;
+  const int thresh_high = is_second_arf ? 40 : 80;
+
+  int min_subblock_mse = INT_MAX;
+  int max_subblock_mse = INT_MIN;
+  int sum_subblock_mse = 0;
+  for (int i = 0; i < 4; ++i) {
+    sum_subblock_mse += subblock_mses[i];
+    min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
+    max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
+    subblock_filter_weights[i] =
+        get_weight_by_thresh(subblock_mses[i], thresh_low, thresh_high);
+  }
+
+  if (((block_mse * 15 < sum_subblock_mse * 4) &&
+       max_subblock_mse - min_subblock_mse < 48) ||
+      ((block_mse * 14 < sum_subblock_mse * 4) &&
+       max_subblock_mse - min_subblock_mse < 24)) {  // No split.
+    const int weight = get_weight_by_thresh(block_mse, thresh_low, thresh_high);
+    subblock_filter_weights[0] = subblock_filter_weights[1] =
+        subblock_filter_weights[2] = subblock_filter_weights[3] = weight;
+    return 0;
+  } else {  // Do split.
+    return 1;
+  }
+}
+
+// Helper function to determine whether a frame is encoded with high bit-depth.
+static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
+  return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+}
+
+// Builds predictor for blocks in temporal filtering. This is the second step
+// for temporal filtering, which is to construct predictions from all reference
+// frames INCLUDING the frame to be filtered itself. These predictors are built
+// based on the motion search results (motion vector is set as 0 for the frame
+// to be filtered), and will be further used for weighted averaging.
+// Inputs:
+//   ref_frame: Pointer to the reference frame (or the frame to be filtered).
+//   mbd: Pointer to the block for filtering. Besides containing the
+//        subsampling information of all planes, this field also gives the
+//        searched motion vector for the entire block, i.e.,
+//        `mbd->mi[0]->mv[0]`. This vector should be 0 if the `ref_frame`
+//        itself is the frame to be filtered.
+//   block_size: Size of the block.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   num_planes: Number of planes in the frame.
+//   scale: Scaling factor.
+//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
+//   subblock_mvs: The motion vectors for each sub-block (row-major order).
+//   pred: Pointer to the predictor to build.
+// Returns:
+//   Nothing will be returned. But the content to which `pred` points will be
+//   modified.
+static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
+                               const MACROBLOCKD *mbd,
+                               const BLOCK_SIZE block_size, const int mb_row,
+                               const int mb_col, const int num_planes,
+                               const struct scale_factors *scale,
+                               const int use_subblock, const MV *subblock_mvs,
+                               uint8_t *pred) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Information of the entire block.
+  const int mb_height = block_size_high[block_size];  // Height.
+  const int mb_width = block_size_wide[block_size];   // Width.
+  const int mb_pels = mb_height * mb_width;           // Number of pixels.
+  const int mb_y = mb_height * mb_row;                // Y-coord (Top-left).
+  const int mb_x = mb_width * mb_col;                 // X-coord (Top-left).
+  const int bit_depth = mbd->bd;                      // Bit depth.
+  const int is_intrabc = 0;                           // Is intra-copied?
+  const int mb_mv_row = mbd->mi[0]->mv[0].as_mv.row;  // Motion vector (y).
+  const int mb_mv_col = mbd->mi[0]->mv[0].as_mv.col;  // Motion vector (x).
+  const MV mb_mv = { (int16_t)mb_mv_row, (int16_t)mb_mv_col };
+  const int is_high_bitdepth = is_frame_high_bitdepth(ref_frame);
+
+  // Information of each sub-block (actually in use).
+  const int num_blocks = use_subblock ? 2 : 1;  // Num of blocks on each side.
+  const int block_height = mb_height >> (num_blocks - 1);  // Height.
+  const int block_width = mb_width >> (num_blocks - 1);    // Width.
+
+  // Default interpolation filters.
+  const int_interpfilters interp_filters =
+      av1_broadcast_interp_filter(MULTITAP_SHARP);
+
+  // Handle Y-plane, U-plane and V-plane (if needed) in sequence.
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int subsampling_y = mbd->plane[plane].subsampling_y;
+    const int subsampling_x = mbd->plane[plane].subsampling_x;
+    // Information of each sub-block in current plane.
+    const int plane_h = mb_height >> subsampling_y;  // Plane height.
+    const int plane_w = mb_width >> subsampling_x;   // Plane width.
+    const int plane_y = mb_y >> subsampling_y;       // Y-coord (Top-left).
+    const int plane_x = mb_x >> subsampling_x;       // X-coord (Top-left).
+    const int h = block_height >> subsampling_y;     // Sub-block height.
+    const int w = block_width >> subsampling_x;      // Sub-block width.
+    const int is_y_plane = (plane == 0);             // Is Y-plane?
+
+    const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane],
+                                    ref_frame->widths[is_y_plane ? 0 : 1],
+                                    ref_frame->heights[is_y_plane ? 0 : 1],
+                                    ref_frame->strides[is_y_plane ? 0 : 1] };
+
+    // Handle entire block or sub-blocks if needed.
+    int subblock_idx = 0;
+    for (int i = 0; i < plane_h; i += h) {
+      for (int j = 0; j < plane_w; j += w) {
+        // Choose proper motion vector.
+        const MV mv = use_subblock ? subblock_mvs[subblock_idx] : mb_mv;
+        assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
+               mv.col >= INT16_MIN && mv.col <= INT16_MAX);
+
+        const int y = plane_y + i;
+        const int x = plane_x + j;
+
+        // Build predictor for each sub-block on current plane.
+        InterPredParams inter_pred_params;
+        av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
+                              subsampling_y, bit_depth, is_high_bitdepth,
+                              is_intrabc, scale, &ref_buf, interp_filters);
+        inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+        av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j],
+                                          plane_w, &mv, &inter_pred_params);
+
+        ++subblock_idx;
+      }
+    }
+    plane_offset += mb_pels;
+  }
+}
+
+// Computes temporal filter weights and accumulators for the frame to be
+// filtered. More concretely, the filter weights for all pixels are the same.
+// Inputs:
+//   mbd: Pointer to the block for filtering, which is ONLY used to get
+//        subsampling information of all planes as well as the bit-depth.
+//   block_size: Size of the block.
+//   num_planes: Number of planes in the frame.
+//   filter_weight: Weight used for filtering.
+//   pred: Pointer to the well-built predictors.
+//   accum: Pointer to the pixel-wise accumulator for filtering.
+//   count: Pointer to the pixel-wise counter for filtering.
+// Returns:
+//   Nothing will be returned. But the content to which `accum` and `pred`
+//   point will be modified.
+void av1_apply_temporal_filter_self(const MACROBLOCKD *mbd,
+                                    const BLOCK_SIZE block_size,
+                                    const int num_planes,
+                                    const int filter_weight,
+                                    const uint8_t *pred, uint32_t *accum,
+                                    uint16_t *count) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Block information.
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int is_high_bitdepth = is_cur_buf_hbd(mbd);
+  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
+
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int subsampling_y = mbd->plane[plane].subsampling_y;
+    const int subsampling_x = mbd->plane[plane].subsampling_x;
+    const int h = mb_height >> subsampling_y;  // Plane height.
+    const int w = mb_width >> subsampling_x;   // Plane width.
+
+    int pred_idx = 0;
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        const int idx = plane_offset + pred_idx;  // Index with plane shift.
+        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
+        accum[idx] += filter_weight * pred_value;
+        count[idx] += filter_weight;
+        ++pred_idx;
+      }
+    }
+    plane_offset += mb_pels;
+  }
+}
+
+// Function to compute pixel-wise squared difference between two buffers.
+// Inputs:
+//   ref: Pointer to reference buffer.
+//   ref_offset: Start position of reference buffer for computation.
+//   ref_stride: Stride for reference buffer.
+//   tgt: Pointer to target buffer.
+//   tgt_offset: Start position of target buffer for computation.
+//   tgt_stride: Stride for target buffer.
+//   height: Height of block for computation.
+//   width: Width of block for computation.
+//   is_high_bitdepth: Whether the two buffers point to high bit-depth frames.
+//   square_diff: Pointer to save the squared differences.
+// Returns:
+//   Nothing will be returned. But the content to which `square_diff` points
+//   will be modified.
+static INLINE void compute_square_diff(const uint8_t *ref, const int ref_offset,
+                                       const int ref_stride, const uint8_t *tgt,
+                                       const int tgt_offset,
+                                       const int tgt_stride, const int height,
+                                       const int width,
+                                       const int is_high_bitdepth,
+                                       uint32_t *square_diff) {
+  const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+  const uint16_t *tgt16 = CONVERT_TO_SHORTPTR(tgt);
+
+  int ref_idx = 0;
+  int tgt_idx = 0;
+  int idx = 0;
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      const uint16_t ref_value = is_high_bitdepth ? ref16[ref_offset + ref_idx]
+                                                  : ref[ref_offset + ref_idx];
+      const uint16_t tgt_value = is_high_bitdepth ? tgt16[tgt_offset + tgt_idx]
+                                                  : tgt[tgt_offset + tgt_idx];
+      const uint32_t diff = (ref_value > tgt_value) ? (ref_value - tgt_value)
+                                                    : (tgt_value - ref_value);
+      square_diff[idx] = diff * diff;
+
+      ++ref_idx;
+      ++tgt_idx;
+      ++idx;
+    }
+    ref_idx += (ref_stride - width);
+    tgt_idx += (tgt_stride - width);
+  }
+}
+
+// Function to adjust the filter weight when using the YUV strategy.
+// Inputs:
+//   filter_weight: Original filter weight.
+//   sum_square_diff: Sum of squared difference between input frame and
+//                    prediction. This field is computed pixel by pixel, and
+//                    is used as a reference for the filter weight adjustment.
+//   num_ref_pixels: Number of pixels used to compute the `sum_square_diff`.
+//                   This field should align with the above lookup tables
+//                   `filter_weight_adjustment_lookup_table_yuv` and
+//                   `highbd_filter_weight_adjustment_lookup_table_yuv`.
+//   strength: Strength for filter weight adjustment.
+// Returns:
+//   Adjusted filter weight which will finally be used for filtering.
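+// For example (hypothetical values, with TF_YUV_FILTER_WEIGHT_SCALE == 3):
+// sum_square_diff = 1024, num_ref_pixels = 16 and strength = 5 give
+// modifier = 1024 * 3 / 16 = 192, then (192 + 16) >> 5 = 6, so the function
+// returns (16 - 6) * filter_weight.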
+static INLINE int adjust_filter_weight_yuv(const int filter_weight,
+                                           const uint64_t sum_square_diff,
+                                           const int num_ref_pixels,
+                                           const int strength) {
+  int modifier =
+      (int)(AOMMIN(sum_square_diff * TF_YUV_FILTER_WEIGHT_SCALE, INT32_MAX)) /
+      num_ref_pixels;
+  const int rounding = (1 << strength) >> 1;
+  modifier = (modifier + rounding) >> strength;
+  return (modifier >= 16) ? 0 : (16 - modifier) * filter_weight;
+}
+
+// Applies temporal filter with YUV strategy.
+// Inputs:
+//   frame_to_filter: Pointer to the frame to be filtered, which is used as
+//                    reference to compute squared difference from the
+//                    predictor.
+//   mbd: Pointer to the block for filtering, which is ONLY used to get
+//        subsampling information of all YUV planes.
+//   block_size: Size of the block.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   num_planes: Number of planes in the frame.
+//   strength: Strength for filter weight adjustment.
+//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
+//   subblock_filter_weights: The filter weights for each sub-block (row-major
+//                            order). If `use_subblock` is set as 0, the first
+//                            weight will be applied to the entire block.
+//   pred: Pointer to the well-built predictors.
+//   accum: Pointer to the pixel-wise accumulator for filtering.
+//   count: Pointer to the pixel-wise counter for filtering.
+// Returns:
+//   Nothing will be returned. But the content to which `accum` and `pred`
+//   point will be modified.
+void av1_apply_temporal_filter_yuv_c(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const int strength, const int use_subblock,
+    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
+    uint16_t *count) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Block information.
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
+  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
+
+  // Allocate memory for pixel-wise squared differences for all planes. They,
+  // regardless of the subsampling, are assigned with memory of size `mb_pels`.
+  uint32_t *square_diff =
+      aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
+  memset(square_diff, 0, num_planes * mb_pels * sizeof(square_diff[0]));
+
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    // Locate pixel on reference frame.
+    const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const int frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+    const uint8_t *ref = frame_to_filter->buffers[plane];
+    compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset,
+                        plane_w, plane_h, plane_w, is_high_bitdepth,
+                        square_diff + plane_offset);
+    plane_offset += mb_pels;
+  }
+
+  // Get window size for pixel-wise filtering.
+  assert(TF_YUV_FILTER_WINDOW_LENGTH % 2 == 1);
+  const int half_window = TF_YUV_FILTER_WINDOW_LENGTH >> 1;
+
+  // Handle planes in sequence.
+  plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int subsampling_y = mbd->plane[plane].subsampling_y;
+    const int subsampling_x = mbd->plane[plane].subsampling_x;
+    const int h = mb_height >> subsampling_y;  // Plane height.
+    const int w = mb_width >> subsampling_x;   // Plane width.
+
+    // Perform filtering.
+    int pred_idx = 0;
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        // Non-local mean approach.
+        uint64_t sum_square_diff = 0;
+        int num_ref_pixels = 0;
+
+        for (int wi = -half_window; wi <= half_window; ++wi) {
+          for (int wj = -half_window; wj <= half_window; ++wj) {
+            const int y = i + wi;  // Y-coord on the current plane.
+            const int x = j + wj;  // X-coord on the current plane.
+            if (y >= 0 && y < h && x >= 0 && x < w) {
+              sum_square_diff += square_diff[plane_offset + y * w + x];
+              ++num_ref_pixels;
+            }
+          }
+        }
+
+        if (plane == 0) {  // Filter Y-plane using both U-plane and V-plane.
+          for (int p = 1; p < num_planes; ++p) {
+            const int ss_y_shift = mbd->plane[p].subsampling_y - subsampling_y;
+            const int ss_x_shift = mbd->plane[p].subsampling_x - subsampling_x;
+            const int yy = i >> ss_y_shift;  // Y-coord on UV-plane.
+            const int xx = j >> ss_x_shift;  // X-coord on UV-plane.
+            const int ww = w >> ss_x_shift;  // Width of UV-plane.
+            sum_square_diff += square_diff[p * mb_pels + yy * ww + xx];
+            ++num_ref_pixels;
+          }
+        } else {  // Filter U-plane and V-plane using Y-plane.
+          const int ss_y_shift = subsampling_y - mbd->plane[0].subsampling_y;
+          const int ss_x_shift = subsampling_x - mbd->plane[0].subsampling_x;
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              const int ww = w << ss_x_shift;         // Width of Y-plane.
+              sum_square_diff += square_diff[yy * ww + xx];
+              ++num_ref_pixels;
+            }
+          }
+        }
+
+        // Base filter weight estimated by motion search error.
+        const int subblock_idx =
+            use_subblock ? (i >= h / 2) * 2 + (j >= w / 2) : 0;
+        const int filter_weight = subblock_filter_weights[subblock_idx];
+
+        const int idx = plane_offset + pred_idx;  // Index with plane shift.
+        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
+        const int adjusted_weight = adjust_filter_weight_yuv(
+            filter_weight, sum_square_diff, num_ref_pixels, strength);
+        accum[idx] += adjusted_weight * pred_value;
+        count[idx] += adjusted_weight;
+
+        ++pred_idx;
+      }
+    }
+    plane_offset += mb_pels;
+  }
+
+  aom_free(square_diff);
+}
+
+// Applies temporal filter with plane-wise strategy.
+// The strategy of filter weight adjustment is different from the function
+// `av1_apply_temporal_filter_yuv_c()`.
+// Inputs:
+//   frame_to_filter: Pointer to the frame to be filtered, which is used as
+//                    reference to compute squared difference from the
+//                    predictor.
+//   mbd: Pointer to the block for filtering, which is ONLY used to get
+//        subsampling information of all planes.
+//   block_size: Size of the block.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   num_planes: Number of planes in the frame.
+//   noise_levels: Pointer to the noise levels of the to-filter frame,
+//                 estimated with each plane (in Y, U, V order).
+//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
+//   block_mse: Motion search error (MSE) for the entire block.
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
+//   q_factor: Quantization factor. This is actually the `q` defined in
+//             libaom, which is converted from `qindex`.
+//   pred: Pointer to the well-built predictors.
+//   accum: Pointer to the pixel-wise accumulator for filtering.
+//   count: Pointer to the pixel-wise counter for filtering.
+// Returns:
+//   Nothing will be returned. But the content to which `accum` and `pred`
+//   point will be modified.
+void av1_apply_temporal_filter_planewise_c(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const int use_subblock,
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Block information.
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
+  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
+
+  // Allocate memory for pixel-wise squared differences for all planes. They,
+  // regardless of the subsampling, are assigned with memory of size `mb_pels`.
+  uint32_t *square_diff =
+      aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
+  memset(square_diff, 0, num_planes * mb_pels * sizeof(square_diff[0]));
+
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    // Locate pixel on reference frame.
+    const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const int frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+    const uint8_t *ref = frame_to_filter->buffers[plane];
+    compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset,
+                        plane_w, plane_h, plane_w, is_high_bitdepth,
+                        square_diff + plane_offset);
+    plane_offset += mb_pels;
+  }
+
+  // Get window size for pixel-wise filtering.
+  assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH % 2 == 1);
+  const int half_window = TF_PLANEWISE_FILTER_WINDOW_LENGTH >> 1;
+
+  // Hyper-parameter for filter weight adjustment.
+  const int frame_height = frame_to_filter->heights[0]
+                           << mbd->plane[0].subsampling_y;
+  const int decay_control = frame_height >= 720 ? 4 : 3;
+
+  // Handle planes in sequence.
+  plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int subsampling_y = mbd->plane[plane].subsampling_y;
+    const int subsampling_x = mbd->plane[plane].subsampling_x;
+    const int h = mb_height >> subsampling_y;  // Plane height.
+    const int w = mb_width >> subsampling_x;   // Plane width.
+
+    // Perform filtering.
+    int pred_idx = 0;
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        // Non-local mean approach.
+        uint64_t sum_square_diff = 0;
+        int num_ref_pixels = 0;
+
+        for (int wi = -half_window; wi <= half_window; ++wi) {
+          for (int wj = -half_window; wj <= half_window; ++wj) {
+            const int y = CLIP(i + wi, 0, h - 1);  // Y-coord on current plane.
+            const int x = CLIP(j + wj, 0, w - 1);  // X-coord on current plane.
+            sum_square_diff += square_diff[plane_offset + y * w + x];
+            ++num_ref_pixels;
+          }
+        }
+
+        // Filter U-plane and V-plane using Y-plane. This is because motion
+        // search is only done on Y-plane, so the information from Y-plane will
+        // be more accurate.
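+        // For example, with 4:2:0 subsampling, ss_x_shift == ss_y_shift == 1
+        // for the chroma planes, so each chroma pixel below also accumulates
+        // the squared differences of its four (2x2) co-located luma pixels.
+        // (Illustration only; the shifts are derived from the actual
+        // subsampling of the input.)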
+        if (plane != 0) {
+          const int ss_y_shift = subsampling_y - mbd->plane[0].subsampling_y;
+          const int ss_x_shift = subsampling_x - mbd->plane[0].subsampling_x;
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              const int ww = w << ss_x_shift;         // Width of Y-plane.
+              sum_square_diff += square_diff[yy * ww + xx];
+              ++num_ref_pixels;
+            }
+          }
+        }
+
+        // Scale down the difference for high bit depth input.
+        if (mbd->bd > 8) sum_square_diff >>= (mbd->bd - 8) * (mbd->bd - 8);
+        const double window_error = (double)(sum_square_diff) / num_ref_pixels;
+        const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
+        const double block_error =
+            (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+
+        // Control factor for non-local mean approach.
+        const double r =
+            (double)decay_control * (0.7 + log(noise_levels[plane] + 1.0));
+        const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1);
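+        // Worked example (hypothetical values): with decay_control = 3 and a
+        // noise level of 0, r = 3 * (0.7 + log(1.0)) = 2.1; with q_factor =
+        // 16, q = min(256 / 256, 1) = 1.0. Then window_error = 20 and
+        // block_error = 40 give scaled_diff = -(20 + 4) / (2 * 2.1 * 2.1 *
+        // 1.0) ~= -2.72, i.e. an adjusted weight of about exp(-2.72) * 1000
+        // ~= 65.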
+        // Compute filter weight.
+        const double scaled_diff =
+            AOMMAX(-(window_error + block_error / 10) / (2 * r * r * q), -15.0);
+        const int adjusted_weight =
+            (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+
+        const int idx = plane_offset + pred_idx;  // Index with plane shift.
+        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
+        accum[idx] += adjusted_weight * pred_value;
+        count[idx] += adjusted_weight;
+
+        ++pred_idx;
+      }
+    }
+    plane_offset += mb_pels;
+  }
+
+  aom_free(square_diff);
+}
+
+// Computes temporal filter weights and accumulators from all reference frames
+// excluding the current frame to be filtered.
+// Inputs:
+//   frame_to_filter: Pointer to the frame to be filtered, which is used as
+//                    reference to compute squared difference from the
+//                    predictor.
+//   mbd: Pointer to the block for filtering, which is ONLY used to get
+//        subsampling information of all planes and the bit-depth.
+//   block_size: Size of the block.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   num_planes: Number of planes in the frame.
+//   strength: Strength for filter weight adjustment. (Used in YUV strategy)
+//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
+//                 (Used in YUV strategy)
+//   subblock_filter_weights: The filter weights for each sub-block (row-major
+//                            order). If `use_subblock` is set as 0, the first
+//                            weight will be applied to the entire block. (Used
+//                            in YUV strategy)
+//   noise_levels: Pointer to the noise levels of the to-filter frame,
+//                 estimated with each plane (in Y, U, V order). (Used in
+//                 plane-wise strategy)
+//   block_mse: Motion search error (MSE) for the entire block.
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
+//   q_factor: Quantization factor.
+//   pred: Pointer to the well-built predictors.
+//   accum: Pointer to the pixel-wise accumulator for filtering.
+//   count: Pointer to the pixel-wise counter for filtering.
+// Returns:
+//   Nothing will be returned. But the content to which `accum` and `pred`
+//   point will be modified.
+void av1_apply_temporal_filter_others(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const int strength, const int use_subblock,
+    const int *subblock_filter_weights, const double *noise_levels,
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  if (TF_ENABLE_PLANEWISE_STRATEGY) {
+    // TODO(any): avx2 and sse2 version should be changed to align with C
+    // function before using.
+    if (is_frame_high_bitdepth(frame_to_filter) || block_size != BLOCK_32X32) {
+      av1_apply_temporal_filter_planewise_c(
+          frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+          noise_levels, use_subblock, block_mse, subblock_mses, q_factor, pred,
+          accum, count);
+    } else {
+      av1_apply_temporal_filter_planewise(
+          frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+          noise_levels, use_subblock, block_mse, subblock_mses, q_factor, pred,
+          accum, count);
+    }
+  } else {  // Commonly used for low-resolution video.
+    if (subblock_filter_weights[0] == 0 && subblock_filter_weights[1] == 0 &&
+        subblock_filter_weights[2] == 0 && subblock_filter_weights[3] == 0) {
+      return;
+    }
+    const int adj_strength = strength + 2 * (mbd->bd - 8);
+    if (num_planes == 3 && TF_YUV_FILTER_WEIGHT_SCALE == 3 &&
+        block_size != BLOCK_32X32) {
+      av1_apply_temporal_filter_yuv(frame_to_filter, mbd, block_size, mb_row,
+                                    mb_col, num_planes, adj_strength,
+                                    use_subblock, subblock_filter_weights, pred,
+                                    accum, count);
+    } else {
+      // TODO(any): sse4 version should be changed to align with C function
+      // before using.
+      av1_apply_temporal_filter_yuv_c(frame_to_filter, mbd, block_size, mb_row,
+                                      mb_col, num_planes, adj_strength,
+                                      use_subblock, subblock_filter_weights,
+                                      pred, accum, count);
+    }
+  }
+}
+
+// Normalizes the accumulated filtering result to produce the filtered frame.
+// Inputs:
+//   mbd: Pointer to the block for filtering, which is ONLY used to get
+//        subsampling information of all planes.
+//   block_size: Size of the block.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   num_planes: Number of planes in the frame.
+//   accum: Pointer to the pre-computed accumulator.
+//   count: Pointer to the pre-computed count.
+//   result_buffer: Pointer to result buffer.
+// Returns:
+//   Nothing will be returned. But the content to which `result_buffer` points
+//   will be modified.
+static void tf_normalize_filtered_frame(
+    const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row,
+    const int mb_col, const int num_planes, const uint32_t *accum,
+    const uint16_t *count, YV12_BUFFER_CONFIG *result_buffer) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Block information.
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int is_high_bitdepth = is_frame_high_bitdepth(result_buffer);
+
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const int frame_stride = result_buffer->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+    uint8_t *const buf = result_buffer->buffers[plane];
+    uint16_t *const buf16 = CONVERT_TO_SHORTPTR(buf);
+
+    int plane_idx = 0;             // Pixel index on current plane (block-base).
+    int frame_idx = frame_offset;  // Pixel index on the entire frame.
+    for (int i = 0; i < plane_h; ++i) {
+      for (int j = 0; j < plane_w; ++j) {
+        const int idx = plane_idx + plane_offset;
+        const uint16_t rounding = count[idx] >> 1;
+        if (is_high_bitdepth) {
+          buf16[frame_idx] =
+              (uint16_t)OD_DIVU(accum[idx] + rounding, count[idx]);
+        } else {
+          buf[frame_idx] = (uint8_t)OD_DIVU(accum[idx] + rounding, count[idx]);
+        }
+        ++plane_idx;
+        ++frame_idx;
+      }
+      frame_idx += (frame_stride - plane_w);
+    }
+    plane_offset += mb_pels;
+  }
+}
+
+// Helper function to compute number of blocks on either side of the frame.
+static INLINE int get_num_blocks(const int frame_length, const int mb_length) {
+  return (frame_length + mb_length - 1) / mb_length;
+}
+
+typedef struct {
+  int64_t sum;
+  int64_t sse;
+} FRAME_DIFF;
+
+// Does temporal filter for a particular frame.
+// Inputs:
+//   cpi: Pointer to the composed information of input video.
+//   frames: Frame buffers used for temporal filtering.
+//   num_frames: Number of frames in the frame buffer.
+//   filter_frame_idx: Index of the frame to be filtered.
+//   is_key_frame: Whether the to-filter frame is a key frame.
+//   is_second_arf: Whether the to-filter frame is the second ARF. This field
+//                  is ONLY used for assigning filter weight.
+//   block_size: Block size used for temporal filtering.
+//   scale: Scaling factor.
+//   strength: Pre-estimated strength for filter weight adjustment.
+//   noise_levels: Pointer to the noise levels of the to-filter frame,
+//                 estimated with each plane (in Y, U, V order).
+// Returns:
+//   Difference between filtered frame and the original frame.
+static FRAME_DIFF tf_do_filtering(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG **frames, const int num_frames,
+    const int filter_frame_idx, const int is_key_frame,
+    const int is_second_arf, const BLOCK_SIZE block_size,
+    const struct scale_factors *scale, const int strength,
+    const double *noise_levels) {
+  // Basic information.
+  const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int mb_rows = get_num_blocks(frame_height, mb_height);
+  const int mb_cols = get_num_blocks(frame_width, mb_width);
+  const int num_planes = av1_num_planes(&cpi->common);
+  const int mi_h = mi_size_high_log2[block_size];
+  const int mi_w = mi_size_wide_log2[block_size];
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
+
+  // Save input state.
+  MACROBLOCK *const mb = &cpi->td.mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  uint8_t *input_buffer[MAX_MB_PLANE];
+  for (int i = 0; i < num_planes; i++) {
+    input_buffer[i] = mbd->plane[i].pre[0].buf;
+  }
+  MB_MODE_INFO **input_mb_mode_info = mbd->mi;
+
+  // Setup.
+  mbd->block_ref_scale_factors[0] = scale;
+  mbd->block_ref_scale_factors[1] = scale;
+  // A temporary block info used to store state in temporal filtering process.
+  MB_MODE_INFO *tmp_mb_mode_info = (MB_MODE_INFO *)malloc(sizeof(MB_MODE_INFO));
+  memset(tmp_mb_mode_info, 0, sizeof(MB_MODE_INFO));
+  mbd->mi = &tmp_mb_mode_info;
+  mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+  // Allocate memory for predictor, accumulator and count.
+  uint8_t *pred8 = aom_memalign(32, num_planes * mb_pels * sizeof(uint8_t));
+  uint16_t *pred16 = aom_memalign(32, num_planes * mb_pels * sizeof(uint16_t));
+  uint32_t *accum = aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
+  uint16_t *count = aom_memalign(16, num_planes * mb_pels * sizeof(uint16_t));
+  memset(pred8, 0, num_planes * mb_pels * sizeof(pred8[0]));
+  memset(pred16, 0, num_planes * mb_pels * sizeof(pred16[0]));
+  uint8_t *const pred = is_high_bitdepth ? CONVERT_TO_BYTEPTR(pred16) : pred8;
+
+  // Do filtering.
+  FRAME_DIFF diff = { 0, 0 };
+  // Perform temporal filtering block by block.
+  for (int mb_row = 0; mb_row < mb_rows; mb_row++) {
+    av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+                          (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+                          cpi->oxcf.border_in_pixels);
+    for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
+      av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+                            (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
+                            cpi->oxcf.border_in_pixels);
+      memset(accum, 0, num_planes * mb_pels * sizeof(accum[0]));
+      memset(count, 0, num_planes * mb_pels * sizeof(count[0]));
+      MV ref_mv = kZeroMv;  // Reference motion vector passed down along frames.
+      // Perform temporal filtering frame by frame.
+      for (int frame = 0; frame < num_frames; frame++) {
+        if (frames[frame] == NULL) continue;
+
+        // Motion search.
+        MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
+        int subblock_filter_weights[4] = { 0, 0, 0, 0 };
+        int block_mse = INT_MAX;
+        int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+
+        if (frame == filter_frame_idx) {  // Frame to be filtered.
+          // Set motion vector as 0 for the frame to be filtered.
+          mbd->mi[0]->mv[0].as_mv = kZeroMv;
+          // Change ref_mv sign for following frames.
+          ref_mv.row *= -1;
+          ref_mv.col *= -1;
+        } else {  // Other reference frames.
+          block_mse = tf_motion_search(cpi, frame_to_filter, frames[frame],
+                                       block_size, mb_row, mb_col, &ref_mv,
+                                       subblock_mvs, subblock_mses);
+          // Do not pass down the reference motion vector if error is too
+          // large.
+          const int thresh = AOMMIN(frame_height, frame_width) >= 720 ? 12 : 3;
+          if (block_mse > (thresh << (mbd->bd - 8))) {
+            ref_mv = kZeroMv;
+          }
+        }
+
+        // Build predictor.
+        int use_subblock = tf_get_filter_weight(
+            block_mse, subblock_mses, is_second_arf, subblock_filter_weights);
+        tf_build_predictor(frames[frame], mbd, block_size, mb_row, mb_col,
+                           num_planes, scale, use_subblock, subblock_mvs, pred);
+
+        // Perform weighted averaging.
+        if (frame == filter_frame_idx) {  // Frame to be filtered.
+          av1_apply_temporal_filter_self(mbd, block_size, num_planes,
+                                         subblock_filter_weights[0], pred,
+                                         accum, count);
+        } else {  // Other reference frames.
+          const FRAME_TYPE frame_type =
+              (cpi->common.current_frame.frame_number > 1) ? INTER_FRAME
+                                                           : KEY_FRAME;
+          const int q_factor =
+              (int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[frame_type],
+                                           cpi->common.seq_params.bit_depth);
+          av1_apply_temporal_filter_others(
+              frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+              strength, use_subblock, subblock_filter_weights, noise_levels,
+              block_mse, subblock_mses, q_factor, pred, accum, count);
+        }
+      }
+
+      tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes,
+                                  accum, count, &cpi->alt_ref_buffer);
+
+      if (!is_key_frame && cpi->sf.hl_sf.adaptive_overlay_encoding) {
+        const int y_height = mb_height >> mbd->plane[0].subsampling_y;
+        const int y_width = mb_width >> mbd->plane[0].subsampling_x;
+        const int source_y_stride = frame_to_filter->y_stride;
+        const int filter_y_stride = cpi->alt_ref_buffer.y_stride;
+        const int source_offset =
+            mb_row * y_height * source_y_stride + mb_col * y_width;
+        const int filter_offset =
+            mb_row * y_height * filter_y_stride + mb_col * y_width;
+        unsigned int sse = 0;
+        cpi->fn_ptr[block_size].vf(frame_to_filter->y_buffer + source_offset,
+                                   source_y_stride,
+                                   cpi->alt_ref_buffer.y_buffer + filter_offset,
+                                   filter_y_stride, &sse);
+        diff.sum += sse;
+        diff.sse += sse * sse;
+      }
+    }
+  }
+
+  // Restore input state
+  for (int i = 0; i < num_planes; i++) {
+    mbd->plane[i].pre[0].buf = input_buffer[i];
+  }
+  mbd->mi = input_mb_mode_info;
+
+  free(tmp_mb_mode_info);
+  aom_free(pred8);
+  aom_free(pred16);
+  aom_free(accum);
+  aom_free(count);
+
+  return diff;
+}
+
+// A constant number, sqrt(pi / 2), used for noise estimation.
+static const double SQRT_PI_BY_2 = 1.25331413732;
+
+double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
+                                            const int plane,
+                                            const int bit_depth) {
+  const int is_y_plane = (plane == 0);
+  const int height = frame->crop_heights[is_y_plane ? 0 : 1];
+  const int width = frame->crop_widths[is_y_plane ? 0 : 1];
+  const int stride = frame->strides[is_y_plane ? 0 : 1];
+  const uint8_t *src = frame->buffers[plane];
+  const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+  const int is_high_bitdepth = is_frame_high_bitdepth(frame);
+
+  int64_t accum = 0;
+  int count = 0;
+  for (int i = 1; i < height - 1; ++i) {
+    for (int j = 1; j < width - 1; ++j) {
+      // Set up a small 3x3 matrix.
+      const int center_idx = i * stride + j;
+      int mat[3][3];
+      for (int ii = -1; ii <= 1; ++ii) {
+        for (int jj = -1; jj <= 1; ++jj) {
+          const int idx = center_idx + ii * stride + jj;
+          mat[ii + 1][jj + 1] = is_high_bitdepth ? src16[idx] : src[idx];
+        }
+      }
+      // Compute Sobel gradients.
+      const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+                     2 * (mat[1][0] - mat[1][2]);
+      const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+                     2 * (mat[0][1] - mat[2][1]);
+      const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), bit_depth - 8);
+      // Accumulate Laplacian.
+      if (Ga < NOISE_ESTIMATION_EDGE_THRESHOLD) {  // Only count smooth pixels.
+        const int v = 4 * mat[1][1] -
+                      2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+                      (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+        accum += ROUND_POWER_OF_TWO(abs(v), bit_depth - 8);
+        ++count;
+      }
+    }
+  }
+
+  // Return -1.0 (unreliable estimation) if there are too few smooth pixels.
+  return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
+}
+
+// Estimates the strength for filter weight adjustment, which is used in YUV
+// strategy. This estimation is based on the pre-estimated noise level of the
+// to-filter frame.
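+// For example (hypothetical values): arnr_strength = 5 with a reliable noise
+// level of 1.0 (in [0.75, 1.75)) is first reduced to 4; an active quantizer
+// q = 10 then subtracts AOMMAX(0, (16 - 10) / 2) = 3, leaving 1 before the
+// final clip to [0, group_boost / 300].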
+// Inputs:
+//   cpi: Pointer to the composed information of input video.
+//   noise_level: Noise level of the to-filter frame, estimated with Y-plane.
+//   group_boost: Boost level for the current group of frames.
+// Returns:
+//   Estimated strength which will be used for filter weight adjustment.
+static int tf_estimate_strength(const AV1_COMP *cpi, const double noise_level,
+                                const int group_boost) {
+  int strength = cpi->oxcf.arnr_strength;
+
+  // Adjust the strength based on the estimated noise level.
+  if (noise_level > 0) {        // Adjust when the noise level is reliable.
+    if (noise_level < 0.75) {   // Noise level lies in range (0, 0.75).
+      strength = strength - 2;
+    } else if (noise_level < 1.75) {  // Noise level lies in range [0.75, 1.75).
+      strength = strength - 1;
+    } else if (noise_level < 4.0) {   // Noise level lies in range [1.75, 4.0).
+      strength = strength + 0;
+    } else {  // Noise level lies in range [4.0, +inf).
+      strength = strength + 1;
+    }
+  }
+
+  // Adjust the strength based on active max q.
+  const FRAME_TYPE frame_type =
+      (cpi->common.current_frame.frame_number > 1) ? INTER_FRAME : KEY_FRAME;
+  const int q = (int)av1_convert_qindex_to_q(
+      cpi->rc.avg_frame_qindex[frame_type], cpi->common.seq_params.bit_depth);
+  strength = strength - AOMMAX(0, (16 - q) / 2);
+
+  return CLIP(strength, 0, group_boost / 300);
+}
+
+// Sets up the frame buffer for temporal filtering. Basically, this function
+// determines how many frames will be used for temporal filtering and then
+// groups them into a buffer.
+// Inputs:
+//   cpi: Pointer to the composed information of input video.
+//   filter_frame_lookahead_idx: The index of the to-filter frame in the
+//                               lookahead buffer `cpi->lookahead`.
+//   is_second_arf: Whether the to-filter frame is the second ARF. This field
+//                  will affect the number of frames used for filtering.
+//   frames: Pointer to the frame buffer to setup.
+//   num_frames_for_filtering: Number of frames used for filtering.
+//   filter_frame_idx: Index of the to-filter frame in the setup frame buffer.
+// Returns:
+//   Nothing will be returned. But the frame buffer `frames`, number of frames
+//   in the buffer `num_frames_for_filtering`, and the index of the to-filter
+//   frame in the buffer `filter_frame_idx` will be updated in this function.
+static void tf_setup_filtering_buffer(const AV1_COMP *cpi,
+                                      const int filter_frame_lookahead_idx,
+                                      const int is_second_arf,
+                                      YV12_BUFFER_CONFIG **frames,
+                                      int *num_frames_for_filtering,
+                                      int *filter_frame_idx) {
+  int num_frames = 0;          // Number of frames used for filtering.
+  int num_frames_before = -1;  // Number of frames before the to-filter frame.
+  int filter_frame_offset;
+
+  if (filter_frame_lookahead_idx == -1) {  // Key frame.
+    num_frames = TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME;
+    num_frames_before = 0;
+    filter_frame_offset = filter_frame_lookahead_idx;
+  } else if (filter_frame_lookahead_idx < -1) {  // Key frame in one-pass mode.
+    num_frames = TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME;
+    num_frames_before = num_frames - 1;
+    filter_frame_offset = -filter_frame_lookahead_idx;
+  } else {
+    num_frames = cpi->oxcf.arnr_max_frames;
+    if (is_second_arf) {  // Only use 2 neighbours for the second ARF.
+      num_frames = AOMMIN(num_frames, 3);
+    }
+    if (num_frames > cpi->rc.gfu_boost / 150) {
+      num_frames = cpi->rc.gfu_boost / 150;
+      num_frames += !(num_frames & 1);
+    }
+    num_frames_before = AOMMIN(num_frames >> 1, filter_frame_lookahead_idx + 1);
+    const int lookahead_depth =
+        av1_lookahead_depth(cpi->lookahead, cpi->compressor_stage);
+    const int num_frames_after =
+        AOMMIN((num_frames - 1) >> 1,
+               lookahead_depth - filter_frame_lookahead_idx - 1);
+    num_frames = num_frames_before + 1 + num_frames_after;
+    filter_frame_offset = filter_frame_lookahead_idx;
+  }
+  *num_frames_for_filtering = num_frames;
+  *filter_frame_idx = num_frames_before;
+
+  // Setup the frame buffer.
+  for (int frame = 0; frame < num_frames; ++frame) {
+    const int lookahead_idx = frame - num_frames_before + filter_frame_offset;
+    struct lookahead_entry *buf = av1_lookahead_peek(
+        cpi->lookahead, lookahead_idx, cpi->compressor_stage);
+    frames[frame] = (buf == NULL) ? NULL : &buf->img;
+  }
+}
+
+int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
+                        int *show_existing_arf) {
+  // Basic information of the current frame.
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const uint8_t group_idx = gf_group->index;
+  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[group_idx];
+  // Filter one more ARF if the lookahead index is leq 7 (w.r.t. 9-th frame).
+  // This frame is ALWAYS a show existing frame.
+  const int is_second_arf = (update_type == INTNL_ARF_UPDATE) &&
+                            (filter_frame_lookahead_idx >= 7) &&
+                            cpi->sf.hl_sf.second_alt_ref_filtering;
+  // TODO(anyone): Currently, we enforce the filtering strength on internal
+  // ARFs except the second ARF to be zero. We should investigate in which case
+  // it is more beneficial to use non-zero strength filtering.
+  if (update_type == INTNL_ARF_UPDATE && !is_second_arf) {
+    return 0;
+  }
+
+  // TODO(yunqing): For INTNL_ARF_UPDATE type, the following me initialization
+  // is used somewhere unexpectedly. Should be resolved later.
+  // Initialize errorperbit, sadperbit16 and sadperbit4.
+  const int rdmult = av1_compute_rd_mult_based_on_qindex(cpi, TF_QINDEX);
+  set_error_per_bit(&cpi->td.mb, rdmult);
+  av1_initialize_me_consts(cpi, &cpi->td.mb, TF_QINDEX);
+  av1_fill_mv_costs(cpi->common.fc,
+                    cpi->common.features.cur_frame_force_integer_mv,
+                    cpi->common.features.allow_high_precision_mv, &cpi->td.mb);
+
+  // Setup frame buffer for filtering.
+  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
+  int num_frames_for_filtering = 0;
+  int filter_frame_idx = -1;
+  tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, is_second_arf,
+                            frames, &num_frames_for_filtering,
+                            &filter_frame_idx);
+
+  // Estimate noise and strength.
+  const int bit_depth = cpi->common.seq_params.bit_depth;
+  const int num_planes = av1_num_planes(&cpi->common);
+  double noise_levels[MAX_MB_PLANE] = { 0 };
+  for (int plane = 0; plane < num_planes; ++plane) {
+    noise_levels[plane] = av1_estimate_noise_from_single_plane(
+        frames[filter_frame_idx], plane, bit_depth);
+  }
+  const int strength =
+      tf_estimate_strength(cpi, noise_levels[0], cpi->rc.gfu_boost);
+  if (filter_frame_lookahead_idx >= 0) {
+    cpi->common.showable_frame =
+        (strength == 0 && num_frames_for_filtering == 1) || is_second_arf ||
+        (cpi->oxcf.enable_overlay == 0 || cpi->sf.hl_sf.disable_overlay_frames);
+  }
+
+  // Do filtering.
+  const int is_key_frame = (filter_frame_lookahead_idx < 0);
+  FRAME_DIFF diff = { 0, 0 };
+  if (num_frames_for_filtering > 0 && frames[0] != NULL) {
+    // Setup scaling factors. Scaling on each of the arnr frames is not
+    // supported.
+    // ARF is produced at the native frame size and resized when coded.
+    struct scale_factors sf;
+    av1_setup_scale_factors_for_frame(
+        &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+        frames[0]->y_crop_width, frames[0]->y_crop_height);
+    diff = tf_do_filtering(cpi, frames, num_frames_for_filtering,
+                           filter_frame_idx, is_key_frame, is_second_arf,
+                           TF_BLOCK_SIZE, &sf, strength, noise_levels);
+  }
+
+  if (is_key_frame) {  // Key frame should always be filtered.
+    return 1;
+  }
+
+  if ((show_existing_arf != NULL && cpi->sf.hl_sf.adaptive_overlay_encoding) ||
+      is_second_arf) {
+    const int frame_height = frames[filter_frame_idx]->y_crop_height;
+    const int frame_width = frames[filter_frame_idx]->y_crop_width;
+    const int block_height = block_size_high[TF_BLOCK_SIZE];
+    const int block_width = block_size_wide[TF_BLOCK_SIZE];
+    const int mb_rows = get_num_blocks(frame_height, block_height);
+    const int mb_cols = get_num_blocks(frame_width, block_width);
+    const int num_mbs = AOMMAX(1, mb_rows * mb_cols);
+    const float mean = (float)diff.sum / num_mbs;
+    const float std = (float)sqrt((float)diff.sse / num_mbs - mean * mean);
+
+    aom_clear_system_state();
+    // TODO(yunqing): This can be combined with TPL q calculation later.
+    cpi->rc.base_frame_target = gf_group->bit_allocation[group_idx];
+    av1_set_target_rate(cpi, cpi->common.width, cpi->common.height);
+    int top_index = 0;
+    int bottom_index = 0;
+    const int q = av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cpi->oxcf.width,
+                                           cpi->oxcf.height, group_idx,
+                                           &bottom_index, &top_index);
+    const int ac_q = av1_ac_quant_QTX(q, 0, bit_depth);
+    const float threshold = 0.7f * ac_q * ac_q;
+
+    if (!is_second_arf) {
+      *show_existing_arf = 0;
+      if (mean < threshold && std < mean * 1.2) {
+        *show_existing_arf = 1;
+      }
+      cpi->common.showable_frame |= *show_existing_arf;
+    } else {
+      // Use source frame if the filtered frame becomes very different.
+      if (!(mean < threshold && std < mean * 1.2)) {
+        return 0;
+      }
+    }
+  }
+
+  return 1;
+}
diff --git a/libs/libaom/src/av1/encoder/temporal_filter.h b/libs/libaom/src/av1/encoder/temporal_filter.h
new file mode 100644
index 000000000..5a6bde259
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/temporal_filter.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+#define AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TODO(any): These two variables are only used in avx2, sse2, sse4
+// implementations, where the block size is still hard coded. This should be
+// fixed to align with the c implementation.
+#define BH 32
+#define BW 32
+
+// Block size used in temporal filtering.
+#define TF_BLOCK_SIZE BLOCK_32X32
+
+// Window size for YUV temporal filtering.
+// This is particularly used for function `av1_apply_temporal_filter_yuv()`.
+#define TF_YUV_FILTER_WINDOW_LENGTH 3
+// A scale factor used in YUV temporal filtering for weight adjustment.
+#define TF_YUV_FILTER_WEIGHT_SCALE 3
+
+#define TF_ENABLE_PLANEWISE_STRATEGY 1
+// Window size for plane-wise temporal filtering.
+// This is particularly used for function
+// `av1_apply_temporal_filter_planewise()`.
+#define TF_PLANEWISE_FILTER_WINDOW_LENGTH 5
+// A scale factor used in plane-wise temporal filtering to raise the filter
+// weight from `double` with range [0, 1] to `int` with range [0, 1000].
+#define TF_PLANEWISE_FILTER_WEIGHT_SCALE 1000
+
+#define NOISE_ESTIMATION_EDGE_THRESHOLD 50
+// Estimates noise level from a given frame using a single plane (Y, U, or V).
+// This is an adaptation of the method in the following paper:
+// Shen-Chuan Tai, Shih-Ming Yang, "A fast method for image noise
+// estimation using Laplacian operator and adaptive edge detection",
+// Proc. 3rd International Symposium on Communications, Control and
+// Signal Processing, 2008, St Julians, Malta.
+// Inputs:
+//   frame: Pointer to the frame to estimate noise level from.
+//   plane: Index of the plane used for noise estimation. Commonly, 0 for
+//          Y-plane, 1 for U-plane, and 2 for V-plane.
+//   bit_depth: Actual bit-depth instead of the encoding bit-depth of the
+//              frame.
+// Returns:
+//   The estimated noise, or -1.0 if there are too few smooth pixels.
+double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
+                                            const int plane,
+                                            const int bit_depth);
+
+#define TF_QINDEX 128  // Q-index used in temporal filtering.
+#define TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME 7
+// Performs temporal filtering if needed.
+// NOTE: In this function, the lookahead index is different from the 0-based
+// real index. For example, if we want to filter the first frame in the
+// pre-fetched buffer `cpi->lookahead`, the lookahead index will be -1 instead
+// of 0. More concretely, 0 indicates the first LOOKAHEAD frame, which is the
+// second frame in the pre-fetched buffer. Another example: if we want to
+// filter the 17-th frame, which is an ARF, the lookahead index is 15 instead
+// of 16. Furthermore, a negative number is used for key frames in one-pass
+// mode, where a key frame is filtered with the frames before it instead of
+// after it. For example, -15 means to filter the 17-th frame, which is a key
+// frame in one-pass mode.
+// Inputs:
+//   cpi: Pointer to the composed information of input video.
+//   filter_frame_lookahead_idx: The index of the to-filter frame in the
+//                               lookahead buffer `cpi->lookahead`.
+//   show_existing_arf: Whether to show existing ARF. This field will be
+//                      updated in this function.
+// Returns:
+//   Whether temporal filtering is successfully done.
+int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
+                        int *show_existing_arf);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
diff --git a/libs/libaom/src/av1/encoder/tokenize.c b/libs/libaom/src/av1/encoder/tokenize.c
new file mode 100644
index 000000000..e67415349
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/tokenize.c
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/entropy.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
+
+static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t,
+                                 int plane, int calc_rate, int allow_update_cdf,
+                                 FRAME_COUNTS *counts, MapCdf map_pb_cdf) {
+  const uint8_t *const color_map = param->color_map;
+  MapCdf map_cdf = param->map_cdf;
+  ColorCost color_cost = param->color_cost;
+  const int plane_block_width = param->plane_width;
+  const int rows = param->rows;
+  const int cols = param->cols;
+  const int n = param->n_colors;
+  const int palette_size_idx = n - PALETTE_MIN_SIZE;
+  int this_rate = 0;
+  uint8_t color_order[PALETTE_MAX_SIZE];
+
+  (void)plane;
+  (void)counts;
+
+  for (int k = 1; k < rows + cols - 1; ++k) {
+    for (int j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) {
+      int i = k - j;
+      int color_new_idx;
+      const int color_ctx = av1_get_palette_color_index_context(
+          color_map, plane_block_width, i, j, n, color_order, &color_new_idx);
+      assert(color_new_idx >= 0 && color_new_idx < n);
+      if (calc_rate) {
+        this_rate += (*color_cost)[palette_size_idx][color_ctx][color_new_idx];
+      } else {
+        (*t)->token = color_new_idx;
+        (*t)->color_map_cdf = map_pb_cdf[palette_size_idx][color_ctx];
+        ++(*t);
+        if (allow_update_cdf)
+          update_cdf(map_cdf[palette_size_idx][color_ctx], color_new_idx, n);
+#if CONFIG_ENTROPY_STATS
+        if (plane) {
+          ++counts->palette_uv_color_index[palette_size_idx][color_ctx]
+                                          [color_new_idx];
+        } else {
+          ++counts->palette_y_color_index[palette_size_idx][color_ctx]
+                                         [color_new_idx];
+        }
+#endif
+      }
+    }
+  }
+  if (calc_rate) return this_rate;
+  return 0;
+}
+
+static void get_palette_params(const MACROBLOCK *const x, int plane,
+                               BLOCK_SIZE bsize, Av1ColorMapParam *params) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  params->color_map = xd->plane[plane].color_index_map;
+  params->map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf
+                          : xd->tile_ctx->palette_y_color_index_cdf;
+  params->color_cost =
+      plane ? &x->palette_uv_color_cost : &x->palette_y_color_cost;
+  params->n_colors = pmi->palette_size[plane];
+  av1_get_block_dimensions(bsize, plane, xd, &params->plane_width, NULL,
+                           &params->rows, &params->cols);
+}
+
+static void get_color_map_params(const MACROBLOCK *const x, int plane,
+                                 BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                 COLOR_MAP_TYPE type,
+                                 Av1ColorMapParam *params) {
+  (void)tx_size;
+  memset(params, 0, sizeof(*params));
+  switch (type) {
+    case PALETTE_MAP: get_palette_params(x, plane, bsize, params); break;
+    default: assert(0 && "Invalid color map type"); return;
+  }
+}
+
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
+                       TX_SIZE tx_size, COLOR_MAP_TYPE type) {
+  assert(plane == 0 || plane == 1);
+  Av1ColorMapParam color_map_params;
+  get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params);
+  MapCdf map_pb_cdf = plane ? x->tile_pb_ctx->palette_uv_color_index_cdf
+                            : x->tile_pb_ctx->palette_y_color_index_cdf;
+  return cost_and_tokenize_map(&color_map_params, NULL, plane, 1, 0, NULL,
+                               map_pb_cdf);
+}
+
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
+                            TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            COLOR_MAP_TYPE type, int allow_update_cdf,
+                            FRAME_COUNTS *counts) {
+  assert(plane == 0 || plane == 1);
+  Av1ColorMapParam color_map_params;
+  get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params);
+  // The first color index does not use context or entropy.
+  (*t)->token = color_map_params.color_map[0];
+  (*t)->color_map_cdf = NULL;
+  ++(*t);
+  MapCdf map_pb_cdf = plane ? x->tile_pb_ctx->palette_uv_color_index_cdf
+                            : x->tile_pb_ctx->palette_y_color_index_cdf;
+  cost_and_tokenize_map(&color_map_params, t, plane, 0, allow_update_cdf,
+                        counts, map_pb_cdf);
+}
+
+static void tokenize_vartx(ThreadData *td, TX_SIZE tx_size,
+                           BLOCK_SIZE plane_bsize, int blk_row, int blk_col,
+                           int block, int plane, void *arg) {
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+  const TX_SIZE plane_tx_size =
+      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+                                    pd->subsampling_y)
+            : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+                                                         blk_col)];
+
+  if (tx_size == plane_tx_size || plane) {
+    plane_bsize = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
+                                       pd->subsampling_y);
+    av1_update_and_record_txb_context(plane, block, blk_row, blk_col,
+                                      plane_bsize, tx_size, arg);
+
+  } else {
+    // Half the block size in transform block unit.
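+    // For example, sub_tx_size_map[TX_32X32] is TX_16X16, so a 32x32
+    // transform recurses into four 16x16 sub-transforms, each visited at the
+    // offsets computed below.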
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    const int step = bsw * bsh;
+
+    assert(bsw > 0 && bsh > 0);
+
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        const int offsetr = blk_row + row;
+        const int offsetc = blk_col + col;
+
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+        tokenize_vartx(td, sub_txs, plane_bsize, offsetr, offsetc, block, plane,
+                       arg);
+        block += step;
+      }
+    }
+  }
+}
+
+void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td,
+                           RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+                           uint8_t allow_update_cdf) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
+    return;
+
+  const int num_planes = av1_num_planes(cm);
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run };
+
+  if (mbmi->skip) {
+    av1_reset_entropy_context(xd, bsize, num_planes);
+    return;
+  }
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    if (plane && !xd->is_chroma_ref) break;
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const int ss_x = pd->subsampling_x;
+    const int ss_y = pd->subsampling_y;
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+    assert(plane_bsize < BLOCK_SIZES_ALL);
+    const int mi_width = mi_size_wide[plane_bsize];
+    const int mi_height = mi_size_high[plane_bsize];
+    const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+    const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+    const int bw = mi_size_wide[txb_size];
+    const int bh = mi_size_high[txb_size];
+    int block = 0;
+    const int step =
+        tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+
+    const BLOCK_SIZE max_unit_bsize =
+        get_plane_block_size(BLOCK_64X64, ss_x, ss_y);
+    int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+    int mu_blocks_high = mi_size_high[max_unit_bsize];
+
+    mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide);
+    mu_blocks_high = AOMMIN(mi_height, mu_blocks_high);
+
+    for (int idy = 0; idy < mi_height; idy += mu_blocks_high) {
+      for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) {
+        const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height);
+        const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width);
+        for (int blk_row = idy; blk_row < unit_height; blk_row += bh) {
+          for (int blk_col = idx; blk_col < unit_width; blk_col += bw) {
+            tokenize_vartx(td, max_tx_size, plane_bsize, blk_row, blk_col,
+                           block, plane, &arg);
+            block += step;
+          }
+        }
+      }
+    }
+  }
+  if (rate) *rate += arg.this_rate;
+}
diff --git a/libs/libaom/src/av1/encoder/tokenize.h b/libs/libaom/src/av1/encoder/tokenize.h
new file mode 100644
index 000000000..52caacbae
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/tokenize.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TOKENIZE_H_
+#define AOM_AV1_ENCODER_TOKENIZE_H_
+
+#include "av1/common/entropy.h"
+#include "av1/encoder/block.h"
+#include "aom_dsp/bitwriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  aom_cdf_prob *color_map_cdf;
+  uint8_t token;
+} TOKENEXTRA;
+
+struct AV1_COMP;
+struct ThreadData;
+struct FRAME_COUNTS;
+
+enum {
+  OUTPUT_ENABLED = 0,
+  DRY_RUN_NORMAL,
+  DRY_RUN_COSTCOEFFS,
+} UENUM1BYTE(RUN_TYPE);
+
+struct tokenize_b_args {
+  const struct AV1_COMP *cpi;
+  struct ThreadData *td;
+  int this_rate;
+  uint8_t allow_update_cdf;
+  RUN_TYPE dry_run;
+};
+
+// Note: in all the tokenize functions, rate (if non-NULL) is incremented
+// with the coefficient token cost only if dry_run == DRY_RUN_COSTCOEFFS;
+// otherwise rate is not incremented.
+void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td,
+                           RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+                           uint8_t allow_update_cdf);
+
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
+                       TX_SIZE tx_size, COLOR_MAP_TYPE type);
+
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
+                            TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            COLOR_MAP_TYPE type, int allow_update_cdf,
+                            struct FRAME_COUNTS *counts);
+
+static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id,
+                                 TX_SIZE tx_size) {
+  const int eob_max = av1_get_max_eob(tx_size);
+  return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_TOKENIZE_H_
diff --git a/libs/libaom/src/av1/encoder/tpl_model.c b/libs/libaom/src/av1/encoder/tpl_model.c
new file mode 100644
index 000000000..79b94f373
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/tpl_model.c
@@ -0,0 +1,1189 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
+ */
+
+#include <stdint.h>
+#include <float.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tpl_model.h"
+
+static AOM_INLINE void get_quantize_error(const MACROBLOCK *x, int plane,
+                                          const tran_low_t *coeff,
+                                          tran_low_t *qcoeff,
+                                          tran_low_t *dqcoeff, TX_SIZE tx_size,
+                                          uint16_t *eob, int64_t *recon_error,
+                                          int64_t *sse) {
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
+  int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+  const int shift = tx_size == TX_32X32 ? 0 : 2;
+
+  av1_quantize_fp(coeff, pix_num, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
+                  p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob,
+                  scan_order->scan, scan_order->iscan);
+
+  *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+  *recon_error = AOMMAX(*recon_error, 1);
+
+  *sse = (*sse) >> shift;
+  *sse = AOMMAX(*sse, 1);
+}
+
+static AOM_INLINE void tpl_fwd_txfm(const int16_t *src_diff, int bw,
+                                    tran_low_t *coeff, TX_SIZE tx_size,
+                                    int bit_depth, int is_hbd) {
+  TxfmParam txfm_param;
+  txfm_param.tx_type = DCT_DCT;
+  txfm_param.tx_size = tx_size;
+  txfm_param.lossless = 0;
+  txfm_param.tx_set_type = EXT_TX_SET_ALL16;
+
+  txfm_param.bd = bit_depth;
+  txfm_param.is_hbd = is_hbd;
+  av1_fwd_txfm(src_diff, coeff, bw, &txfm_param);
+}
+
+static AOM_INLINE int64_t tpl_get_satd_cost(const MACROBLOCK *x,
+                                            int16_t *src_diff, int diff_stride,
+                                            const uint8_t *src, int src_stride,
+                                            const uint8_t *dst, int dst_stride,
+                                            tran_low_t *coeff, int bw, int bh,
+                                            TX_SIZE tx_size) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const int pix_num = bw * bh;
+
+  av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst,
+                     dst_stride);
+  tpl_fwd_txfm(src_diff, bw, coeff, tx_size, xd->bd, is_cur_buf_hbd(xd));
+  return aom_satd(coeff, pix_num);
+}
+
+static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) {
+  const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
+
+  assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob);
+
+  int rate_cost = 1;
+
+  for (int idx = 0; idx < eob; ++idx) {
+    int abs_level = abs(qcoeff[scan_order->scan[idx]]);
+    rate_cost += (int)(log(abs_level + 1.0) / log(2.0)) + 1;
+  }
+
+  return (rate_cost << AV1_PROB_COST_SHIFT);
+}
+
+static AOM_INLINE void txfm_quant_rdcost(
+    const MACROBLOCK *x, int16_t *src_diff, int diff_stride, uint8_t *src,
+    int src_stride, uint8_t *dst, int dst_stride, tran_low_t *coeff,
+    tran_low_t *qcoeff, tran_low_t *dqcoeff, int bw, int bh, TX_SIZE tx_size,
+    int *rate_cost, int64_t *recon_error, int64_t *sse) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  uint16_t eob;
+  av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst,
+                     dst_stride);
+  tpl_fwd_txfm(src_diff, diff_stride, coeff, tx_size, xd->bd,
+               is_cur_buf_hbd(xd));
+
+  get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &eob, recon_error,
+                     sse);
+
+  *rate_cost = rate_estimator(qcoeff, eob, tx_size);
+
+  av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst,
+                              dst_stride, eob, 0);
+}
+
+static uint32_t motion_estimation(AV1_COMP *cpi, MACROBLOCK *x,
+                                  uint8_t *cur_frame_buf,
+                                  uint8_t *ref_frame_buf, int stride,
+                                  int stride_ref, BLOCK_SIZE bsize,
+                                  MV center_mv, int_mv *best_mv) {
+  AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf;
+  int step_param;
+  uint32_t bestsme = UINT_MAX;
+  int distortion;
+  uint32_t sse;
+  int cost_list[5];
+  FULLPEL_MV start_mv = get_fullmv_from_mv(&center_mv);
+
+  // Setup frame pointers
+  x->plane[0].src.buf = cur_frame_buf;
+  x->plane[0].src.stride = stride;
+  xd->plane[0].pre[0].buf = ref_frame_buf;
+  xd->plane[0].pre[0].stride = stride_ref;
+
+  step_param = tpl_sf->reduce_first_step_size;
+  step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+  search_site_config *ss_cfg = &cpi->mv_search_params.ss_cfg[SS_CFG_SRC];
+  if (ss_cfg->stride != stride_ref)
+    ss_cfg = &cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD];
+
+  assert(ss_cfg->stride == stride_ref);
+
+  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
+                                     &center_mv, ss_cfg);
+
+  av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+                        cond_cost_list(cpi, cost_list), &best_mv->as_fullmv,
+                        NULL);
+
+  SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+  av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &center_mv,
+                                    cost_list);
+  ms_params.forced_stop = tpl_sf->subpel_force_stop;
+  ms_params.var_params.subpel_search_type = USE_2_TAPS;
+  ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+  MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+  bestsme = cpi->mv_search_params.find_fractional_mv_step(
+      xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &distortion, &sse,
+      NULL);
+
+  return bestsme;
+}
+
+static int is_alike_mv(int_mv candidate_mv, int_mv *center_mvs,
+                       int center_mvs_count, int skip_alike_starting_mv) {
+  // MV difference threshold is in 1/8 precision.
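+  // E.g. skip_alike_starting_mv == 1 selects thr = (8 << 3) = 64, so a
+  // candidate within 8 full pels of an existing center MV on both axes is
+  // treated as a duplicate and rejected.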
+  const int mv_diff_thr[3] = { 1, (8 << 3), (16 << 3) };
+  int thr = mv_diff_thr[skip_alike_starting_mv];
+  int i;
+
+  for (i = 0; i < center_mvs_count; i++) {
+    if (abs(center_mvs[i].as_mv.col - candidate_mv.as_mv.col) < thr &&
+        abs(center_mvs[i].as_mv.row - candidate_mv.as_mv.row) < thr)
+      return 1;
+  }
+
+  return 0;
+}
+
+static AOM_INLINE void mode_estimation(
+    AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, struct scale_factors *sf,
+    int frame_idx, int mi_row, int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
+    const YV12_BUFFER_CONFIG *ref_frame[],
+    const YV12_BUFFER_CONFIG *src_ref_frame[], TplDepStats *tpl_stats) {
+  AV1_COMMON *cm = &cpi->common;
+  const GF_GROUP *gf_group = &cpi->gf_group;
+
+  (void)gf_group;
+
+  TplParams *tpl_data = &cpi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
+  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+
+  const int bw = 4 << mi_size_wide_log2[bsize];
+  const int bh = 4 << mi_size_high_log2[bsize];
+  const int_interpfilters kernel =
+      av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+  int64_t best_intra_cost = INT64_MAX;
+  int64_t intra_cost;
+  PREDICTION_MODE best_mode = DC_PRED;
+
+  int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+  uint8_t *src_mb_buffer = xd->cur_buf->y_buffer + mb_y_offset;
+  const int src_stride = xd->cur_buf->y_stride;
+
+  const int dst_mb_offset =
+      mi_row * MI_SIZE * tpl_frame->rec_picture->y_stride + mi_col * MI_SIZE;
+  uint8_t *dst_buffer = tpl_frame->rec_picture->y_buffer + dst_mb_offset;
+  const int dst_buffer_stride = tpl_frame->rec_picture->y_stride;
+
+  // Temporary buffers
+  DECLARE_ALIGNED(32, uint8_t, predictor8[MC_FLOW_NUM_PELS * 2]);
+  DECLARE_ALIGNED(32, int16_t, src_diff[MC_FLOW_NUM_PELS]);
+  DECLARE_ALIGNED(32, tran_low_t, coeff[MC_FLOW_NUM_PELS]);
+  DECLARE_ALIGNED(32, tran_low_t, qcoeff[MC_FLOW_NUM_PELS]);
+  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MC_FLOW_NUM_PELS]);
+  DECLARE_ALIGNED(32, tran_low_t, best_coeff[MC_FLOW_NUM_PELS]);
+  uint8_t *predictor =
+      is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
+  int64_t recon_error = 1, sse = 1;
+
+  memset(tpl_stats, 0, sizeof(*tpl_stats));
+
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+                        mi_row, mi_col);
+  set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width,
+                 cm->mi_params.mi_rows, cm->mi_params.mi_cols);
+  set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize],
+               av1_num_planes(cm));
+  xd->mi[0]->sb_type = bsize;
+  xd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+
+  // Intra prediction search
+  xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+
+  // Pre-load the bottom left line.
+  if (xd->left_available &&
+      mi_row + tx_size_high_unit[tx_size] < xd->tile.mi_row_end) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (is_cur_buf_hbd(xd)) {
+      uint16_t *dst = CONVERT_TO_SHORTPTR(dst_buffer);
+      for (int i = 0; i < bw; ++i)
+        dst[(bw + i) * dst_buffer_stride - 1] =
+            dst[(bw - 1) * dst_buffer_stride - 1];
+    } else {
+      for (int i = 0; i < bw; ++i)
+        dst_buffer[(bw + i) * dst_buffer_stride - 1] =
+            dst_buffer[(bw - 1) * dst_buffer_stride - 1];
+    }
+#else
+    for (int i = 0; i < bw; ++i)
+      dst_buffer[(bw + i) * dst_buffer_stride - 1] =
+          dst_buffer[(bw - 1) * dst_buffer_stride - 1];
+#endif
+  }
+
+  // If cpi->sf.tpl_sf.prune_intra_modes is on, then search only DC_PRED,
+  // H_PRED, and V_PRED.
+  const PREDICTION_MODE last_intra_mode =
+      cpi->sf.tpl_sf.prune_intra_modes ?
D45_PRED : INTRA_MODE_END; + for (PREDICTION_MODE mode = INTRA_MODE_START; mode < last_intra_mode; + ++mode) { + av1_predict_intra_block(cm, xd, block_size_wide[bsize], + block_size_high[bsize], tx_size, mode, 0, 0, + FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, + predictor, bw, 0, 0, 0); + + intra_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); + + if (intra_cost < best_intra_cost) { + best_intra_cost = intra_cost; + best_mode = mode; + } + } + + // Motion compensated prediction + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + + int best_rf_idx = -1; + int_mv best_mv; + int64_t inter_cost; + int64_t best_inter_cost = INT64_MAX; + int rf_idx; + + best_mv.as_int = INVALID_MV; + + for (rf_idx = 0; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx) { + if (ref_frame[rf_idx] == NULL || src_ref_frame[rf_idx] == NULL) { + tpl_stats->mv[rf_idx].as_int = INVALID_MV; + continue; + } + + const YV12_BUFFER_CONFIG *ref_frame_ptr = src_ref_frame[rf_idx]; + int ref_mb_offset = + mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE; + uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset; + int ref_stride = ref_frame_ptr->y_stride; + + int_mv best_rfidx_mv = { 0 }; + uint32_t bestsme = UINT32_MAX; + + int_mv center_mvs[4] = { { 0 } }; + int refmv_count = 1; + + if (xd->up_available) { + TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( + mi_row - mi_height, mi_col, tpl_frame->stride, block_mis_log2)]; + if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count, + cpi->sf.tpl_sf.skip_alike_starting_mv)) { + center_mvs[refmv_count].as_int = ref_tpl_stats->mv[rf_idx].as_int; + ++refmv_count; + } + } + + if (xd->left_available) { + TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( + mi_row, mi_col - mi_width, tpl_frame->stride, block_mis_log2)]; + if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count, + cpi->sf.tpl_sf.skip_alike_starting_mv)) { + center_mvs[refmv_count].as_int = ref_tpl_stats->mv[rf_idx].as_int; + ++refmv_count; + } + } + + if (xd->up_available && mi_col + mi_width < xd->tile.mi_col_end) { + TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( + mi_row - mi_height, mi_col + mi_width, tpl_frame->stride, + block_mis_log2)]; + if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count, + cpi->sf.tpl_sf.skip_alike_starting_mv)) { + center_mvs[refmv_count].as_int = ref_tpl_stats->mv[rf_idx].as_int; + ++refmv_count; + } + } + + for (int idx = 0; idx < refmv_count; ++idx) { + int_mv this_mv; + uint32_t thissme = + motion_estimation(cpi, x, src_mb_buffer, ref_mb, src_stride, + ref_stride, bsize, center_mvs[idx].as_mv, &this_mv); + + if (thissme < bestsme) { + bestsme = thissme; + best_rfidx_mv = this_mv; + } + } + + tpl_stats->mv[rf_idx].as_int = best_rfidx_mv.as_int; + + struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer, + ref_frame_ptr->y_width, ref_frame_ptr->y_height, + ref_frame_ptr->y_stride }; + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE, + mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0, + sf, &ref_buf, kernel); + inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd); + + av1_enc_build_one_inter_predictor(predictor, bw, &best_rfidx_mv.as_mv, + &inter_pred_params); + + inter_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); + // Store inter cost for each ref frame + 
tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost); + + if (inter_cost < best_inter_cost) { + memcpy(best_coeff, coeff, sizeof(best_coeff)); + best_rf_idx = rf_idx; + + best_inter_cost = inter_cost; + best_mv.as_int = best_rfidx_mv.as_int; + if (best_inter_cost < best_intra_cost) { + best_mode = NEWMV; + xd->mi[0]->ref_frame[0] = best_rf_idx + LAST_FRAME; + xd->mi[0]->mv[0].as_int = best_mv.as_int; + } + } + } + + if (best_inter_cost < INT64_MAX) { + uint16_t eob; + get_quantize_error(x, 0, best_coeff, qcoeff, dqcoeff, tx_size, &eob, + &recon_error, &sse); + + const int rate_cost = rate_estimator(qcoeff, eob, tx_size); + tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; + } + + best_intra_cost = AOMMAX(best_intra_cost, 1); + if (frame_idx == 0) { + best_inter_cost = 0; + } else { + best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost); + } + tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2; + + tpl_stats->srcrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2); + + // Final encode + if (is_inter_mode(best_mode)) { + const YV12_BUFFER_CONFIG *ref_frame_ptr = ref_frame[best_rf_idx]; + + InterPredParams inter_pred_params; + struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer, + ref_frame_ptr->y_width, ref_frame_ptr->y_height, + ref_frame_ptr->y_stride }; + av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE, + mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0, + sf, &ref_buf, kernel); + inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd); + + av1_enc_build_one_inter_predictor(dst_buffer, dst_buffer_stride, + &best_mv.as_mv, &inter_pred_params); + } else { + av1_predict_intra_block(cm, xd, block_size_wide[bsize], + block_size_high[bsize], tx_size, best_mode, 0, 0, + FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, + dst_buffer, dst_buffer_stride, 0, 0, 0); + } + + int rate_cost; + txfm_quant_rdcost(x, src_diff, bw, src_mb_buffer, src_stride, dst_buffer, + dst_buffer_stride, coeff, qcoeff, dqcoeff, bw, bh, tx_size, + &rate_cost, &recon_error, &sse); + + tpl_stats->recrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2); + tpl_stats->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; + if (!is_inter_mode(best_mode)) { + tpl_stats->srcrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2); + tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; + } + tpl_stats->recrf_dist = AOMMAX(tpl_stats->srcrf_dist, tpl_stats->recrf_dist); + tpl_stats->recrf_rate = AOMMAX(tpl_stats->srcrf_rate, tpl_stats->recrf_rate); + + if (best_rf_idx >= 0) { + tpl_stats->mv[best_rf_idx].as_int = best_mv.as_int; + tpl_stats->ref_frame_index = best_rf_idx; + } + + for (int idy = 0; idy < mi_height; ++idy) { + for (int idx = 0; idx < mi_width; ++idx) { + if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > idx && + (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > idy) { + xd->mi[idx + idy * cm->mi_params.mi_stride] = xd->mi[0]; + } + } + } +} + +static int round_floor(int ref_pos, int bsize_pix) { + int round; + if (ref_pos < 0) + round = -(1 + (-ref_pos - 1) / bsize_pix); + else + round = ref_pos / bsize_pix; + + return round; +} + +static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, + int ref_pos_col, int block, BLOCK_SIZE bsize) { + int width = 0, height = 0; + int bw = 4 << mi_size_wide_log2[bsize]; + int bh = 4 << mi_size_high_log2[bsize]; + + switch (block) { + case 0: + width = grid_pos_col + bw - ref_pos_col; + height = 
grid_pos_row + bh - ref_pos_row; + break; + case 1: + width = ref_pos_col + bw - grid_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 2: + width = grid_pos_col + bw - ref_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + case 3: + width = ref_pos_col + bw - grid_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + default: assert(0); + } + + return width * height; +} + +int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift) { + return (mi_row >> right_shift) * stride + (mi_col >> right_shift); +} + +static int64_t delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, + int64_t srcrf_dist, int pix_num) { + double beta = (double)srcrf_dist / recrf_dist; + int64_t rate_cost = delta_rate; + + if (srcrf_dist <= 128) return rate_cost; + + double dr = + (double)(delta_rate >> (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT)) / + pix_num; + + double log_den = log(beta) / log(2.0) + 2.0 * dr; + + if (log_den > log(10.0) / log(2.0)) { + rate_cost = (int64_t)((log(1.0 / beta) * pix_num) / log(2.0) / 2.0); + rate_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT); + return rate_cost; + } + + double num = pow(2.0, log_den); + double den = num * beta + (1 - beta) * beta; + + rate_cost = (int64_t)((pix_num * log(num / den)) / log(2.0) / 2.0); + + rate_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT); + + return rate_cost; +} + +static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row, + int mi_col, const BLOCK_SIZE bsize, + int frame_idx) { + TplDepFrame *tpl_frame_ptr = &tpl_data->tpl_frame[frame_idx]; + TplDepStats *tpl_ptr = tpl_frame_ptr->tpl_stats_ptr; + TplDepFrame *tpl_frame = tpl_data->tpl_frame; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + TplDepStats *tpl_stats_ptr = &tpl_ptr[av1_tpl_ptr_pos( + mi_row, mi_col, tpl_frame->stride, block_mis_log2)]; + + if (tpl_stats_ptr->ref_frame_index < 0) return; + const int ref_frame_index = tpl_stats_ptr->ref_frame_index; + TplDepFrame *ref_tpl_frame = + &tpl_frame[tpl_frame[frame_idx].ref_map_index[ref_frame_index]]; + TplDepStats *ref_stats_ptr = ref_tpl_frame->tpl_stats_ptr; + + if (tpl_frame[frame_idx].ref_map_index[ref_frame_index] < 0) return; + + const FULLPEL_MV full_mv = + get_fullmv_from_mv(&tpl_stats_ptr->mv[ref_frame_index].as_mv); + const int ref_pos_row = mi_row * MI_SIZE + full_mv.row; + const int ref_pos_col = mi_col * MI_SIZE + full_mv.col; + + const int bw = 4 << mi_size_wide_log2[bsize]; + const int bh = 4 << mi_size_high_log2[bsize]; + const int mi_height = mi_size_high[bsize]; + const int mi_width = mi_size_wide[bsize]; + const int pix_num = bw * bh; + + // top-left on grid block location in pixel + int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; + int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; + int block; + + int64_t cur_dep_dist = tpl_stats_ptr->recrf_dist - tpl_stats_ptr->srcrf_dist; + int64_t mc_dep_dist = (int64_t)( + tpl_stats_ptr->mc_dep_dist * + ((double)(tpl_stats_ptr->recrf_dist - tpl_stats_ptr->srcrf_dist) / + tpl_stats_ptr->recrf_dist)); + int64_t delta_rate = tpl_stats_ptr->recrf_rate - tpl_stats_ptr->srcrf_rate; + int64_t mc_dep_rate = + delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist, + tpl_stats_ptr->srcrf_dist, pix_num); + + for (block = 0; block < 4; ++block) { + int grid_pos_row = grid_pos_row_base + bh * (block >> 1); + int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); + + if (grid_pos_row >= 0 && grid_pos_row < 
ref_tpl_frame->mi_rows * MI_SIZE && + grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { + int overlap_area = get_overlap_area( + grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); + int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; + int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; + const int step = 1 << block_mis_log2; + + for (int idy = 0; idy < mi_height; idy += step) { + for (int idx = 0; idx < mi_width; idx += step) { + TplDepStats *des_stats = &ref_stats_ptr[av1_tpl_ptr_pos( + ref_mi_row + idy, ref_mi_col + idx, ref_tpl_frame->stride, + block_mis_log2)]; + des_stats->mc_dep_dist += + ((cur_dep_dist + mc_dep_dist) * overlap_area) / pix_num; + des_stats->mc_dep_rate += + ((delta_rate + mc_dep_rate) * overlap_area) / pix_num; + + assert(overlap_area >= 0); + } + } + } + } +} + +static AOM_INLINE void tpl_model_update(TplParams *const tpl_data, int mi_row, + int mi_col, const BLOCK_SIZE bsize, + int frame_idx) { + const int mi_height = mi_size_high[bsize]; + const int mi_width = mi_size_wide[bsize]; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + const BLOCK_SIZE tpl_block_size = + convert_length_to_bsize(MI_SIZE << tpl_data->tpl_stats_block_mis_log2); + + for (int idy = 0; idy < mi_height; idy += step) { + for (int idx = 0; idx < mi_width; idx += step) { + tpl_model_update_b(tpl_data, mi_row + idy, mi_col + idx, tpl_block_size, + frame_idx); + } + } +} + +static AOM_INLINE void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row, + int mi_col, BLOCK_SIZE bsize, int stride, + const TplDepStats *src_stats, + uint8_t block_mis_log2) { + const int mi_height = mi_size_high[bsize]; + const int mi_width = mi_size_wide[bsize]; + const int step = 1 << block_mis_log2; + + int64_t intra_cost = src_stats->intra_cost / (mi_height * mi_width); + int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width); + int64_t srcrf_dist = src_stats->srcrf_dist / (mi_height * mi_width); + int64_t recrf_dist = src_stats->recrf_dist / (mi_height * mi_width); + int64_t srcrf_rate = src_stats->srcrf_rate / (mi_height * mi_width); + int64_t recrf_rate = src_stats->recrf_rate / (mi_height * mi_width); + + intra_cost = AOMMAX(1, intra_cost); + inter_cost = AOMMAX(1, inter_cost); + srcrf_dist = AOMMAX(1, srcrf_dist); + recrf_dist = AOMMAX(1, recrf_dist); + srcrf_rate = AOMMAX(1, srcrf_rate); + recrf_rate = AOMMAX(1, recrf_rate); + + for (int idy = 0; idy < mi_height; idy += step) { + TplDepStats *tpl_ptr = &tpl_stats_ptr[av1_tpl_ptr_pos( + mi_row + idy, mi_col, stride, block_mis_log2)]; + for (int idx = 0; idx < mi_width; idx += step) { + tpl_ptr->intra_cost = intra_cost; + tpl_ptr->inter_cost = inter_cost; + tpl_ptr->srcrf_dist = srcrf_dist; + tpl_ptr->recrf_dist = recrf_dist; + tpl_ptr->srcrf_rate = srcrf_rate; + tpl_ptr->recrf_rate = recrf_rate; + memcpy(tpl_ptr->mv, src_stats->mv, sizeof(tpl_ptr->mv)); + memcpy(tpl_ptr->pred_error, src_stats->pred_error, + sizeof(tpl_ptr->pred_error)); + tpl_ptr->ref_frame_index = src_stats->ref_frame_index; + ++tpl_ptr; + } + } +} + +static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi, int frame_idx, + int pframe_qindex) { + const GF_GROUP *gf_group = &cpi->gf_group; + if (frame_idx == gf_group->size) return; + TplParams *const tpl_data = &cpi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; + const YV12_BUFFER_CONFIG *this_frame = tpl_frame->gf_picture; + const YV12_BUFFER_CONFIG *ref_frame[7] = { NULL, NULL, NULL, NULL, + NULL, NULL, NULL }; + const YV12_BUFFER_CONFIG 
*ref_frames_ordered[INTER_REFS_PER_FRAME]; + int ref_frame_flags; + const YV12_BUFFER_CONFIG *src_frame[7] = { NULL, NULL, NULL, NULL, + NULL, NULL, NULL }; + + AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + struct scale_factors sf; + int rdmult, idx; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + int mi_row, mi_col; + const BLOCK_SIZE bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D); + av1_tile_init(&xd->tile, cm, 0, 0); + + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int mi_height = mi_size_high[bsize]; + const int mi_width = mi_size_wide[bsize]; + + // Setup scaling factor + av1_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height); + + xd->cur_buf = this_frame; + + for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) { + ref_frame[idx] = + tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]].rec_picture; + src_frame[idx] = + tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]].gf_picture; + } + + // Store the reference frames based on priority order + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + ref_frames_ordered[i] = ref_frame[ref_frame_priority_order[i] - 1]; + } + + // Work out which reference frame slots may be used. + ref_frame_flags = get_ref_frame_flags(&cpi->sf, ref_frames_ordered, + cpi->ext_flags.ref_frame_flags); + + enforce_max_ref_frames(cpi, &ref_frame_flags); + + // Prune reference frames + for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) { + if ((ref_frame_flags & (1 << idx)) == 0) { + ref_frame[idx] = NULL; + } + } + + // Make a temporary mbmi for tpl model + MB_MODE_INFO mbmi; + memset(&mbmi, 0, sizeof(mbmi)); + MB_MODE_INFO *mbmi_ptr = &mbmi; + xd->mi = &mbmi_ptr; + + xd->block_ref_scale_factors[0] = &sf; + + const int base_qindex = pframe_qindex; + // Get rd multiplier set up. + rdmult = (int)av1_compute_rd_mult(cpi, base_qindex); + if (rdmult < 1) rdmult = 1; + set_error_per_bit(x, rdmult); + av1_initialize_me_consts(cpi, x, base_qindex); + + tpl_frame->is_valid = 1; + + cm->quant_params.base_qindex = base_qindex; + av1_frame_init_quantizer(cpi); + + tpl_frame->base_rdmult = + av1_compute_rd_mult_based_on_qindex(cpi, pframe_qindex) / 6; + + for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) { + // Motion estimation row boundary + av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height, + cpi->oxcf.border_in_pixels); + xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); + xd->mb_to_bottom_edge = + GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); + for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col += mi_width) { + TplDepStats tpl_stats; + + // Motion estimation column boundary + av1_set_mv_col_limits(mi_params, &x->mv_limits, mi_col, mi_width, + cpi->oxcf.border_in_pixels); + xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE); + xd->mb_to_right_edge = + GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col); + mode_estimation(cpi, x, xd, &sf, frame_idx, mi_row, mi_col, bsize, + tx_size, ref_frame, src_frame, &tpl_stats); + + // Motion flow dependency dispenser. 
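+      // tpl_model_store() below copies the block's stats (intra/inter cost,
+      // source/recon rate and distortion, MVs) into every
+      // (1 << tpl_stats_block_mis_log2)-mi unit the block covers, with the
+      // costs normalized to per-unit values.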
+      tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
+                      tpl_frame->stride, &tpl_stats,
+                      tpl_data->tpl_stats_block_mis_log2);
+    }
+  }
+}
+
+static void mc_flow_synthesizer(AV1_COMP *cpi, int frame_idx) {
+  AV1_COMMON *cm = &cpi->common;
+
+  const GF_GROUP *gf_group = &cpi->gf_group;
+  if (frame_idx == gf_group->size) return;
+
+  TplParams *const tpl_data = &cpi->tpl_data;
+
+  const BLOCK_SIZE bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+  const int mi_height = mi_size_high[bsize];
+  const int mi_width = mi_size_wide[bsize];
+
+  for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += mi_height) {
+    for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += mi_width) {
+      if (frame_idx) {
+        tpl_model_update(tpl_data, mi_row, mi_col, bsize, frame_idx);
+      }
+    }
+  }
+}
+
+static AOM_INLINE void init_gop_frames_for_tpl(
+    AV1_COMP *cpi, const EncodeFrameParams *const init_frame_params,
+    GF_GROUP *gf_group, int gop_eval, int *tpl_group_frames,
+    const EncodeFrameInput *const frame_input, int *pframe_qindex) {
+  AV1_COMMON *cm = &cpi->common;
+  int cur_frame_idx = gf_group->index;
+  *pframe_qindex = 0;
+
+  RefBufferStack ref_buffer_stack = cpi->ref_buffer_stack;
+  EncodeFrameParams frame_params = *init_frame_params;
+  TplParams *const tpl_data = &cpi->tpl_data;
+
+  int ref_picture_map[REF_FRAMES];
+
+  for (int i = 0; i < REF_FRAMES; ++i) {
+    if (frame_params.frame_type == KEY_FRAME || gop_eval) {
+      tpl_data->tpl_frame[-i - 1].gf_picture = NULL;
+      tpl_data->tpl_frame[-i - 1].rec_picture = NULL;
+      tpl_data->tpl_frame[-i - 1].frame_display_index = 0;
+    } else {
+      tpl_data->tpl_frame[-i - 1].gf_picture = &cm->ref_frame_map[i]->buf;
+      tpl_data->tpl_frame[-i - 1].rec_picture = &cm->ref_frame_map[i]->buf;
+      tpl_data->tpl_frame[-i - 1].frame_display_index =
+          cm->ref_frame_map[i]->display_order_hint;
+    }
+
+    ref_picture_map[i] = -i - 1;
+  }
+
+  *tpl_group_frames = cur_frame_idx;
+
+  int gf_index;
+  int use_arf = gf_group->update_type[1] == ARF_UPDATE;
+  int anc_frame_offset = gf_group->cur_frame_idx[cur_frame_idx] + 1;
+  int process_frame_count = 0;
+  const int gop_length =
+      AOMMIN(gf_group->size - 1 + use_arf, MAX_LENGTH_TPL_FRAME_STATS - 1);
+  for (gf_index = cur_frame_idx; gf_index <= gop_length; ++gf_index) {
+    TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index];
+    FRAME_UPDATE_TYPE frame_update_type = gf_group->update_type[gf_index];
+
+    frame_params.show_frame = frame_update_type != ARF_UPDATE &&
+                              frame_update_type != INTNL_ARF_UPDATE;
+    frame_params.show_existing_frame =
+        frame_update_type == INTNL_OVERLAY_UPDATE ||
+        frame_update_type == OVERLAY_UPDATE;
+    frame_params.frame_type =
+        frame_update_type == KF_UPDATE ? KEY_FRAME : INTER_FRAME;
+
+    if (frame_update_type == LF_UPDATE)
+      *pframe_qindex = gf_group->q_val[gf_index];
+
+    if (gf_index == cur_frame_idx) {
+      tpl_frame->gf_picture = frame_input->source;
+      // frame display index = frame offset within the gf group + start frame
+      // of the gf group
+      tpl_frame->frame_display_index =
+          gf_group->frame_disp_idx[gf_index] +
+          cpi->common.current_frame.display_order_hint;
+    } else {
+      int frame_display_index = gf_index == gf_group->size
+                                    ?
cpi->rc.baseline_gf_interval + : gf_group->frame_disp_idx[gf_index]; + struct lookahead_entry *buf = av1_lookahead_peek( + cpi->lookahead, frame_display_index - anc_frame_offset, + cpi->compressor_stage); + if (buf == NULL) break; + tpl_frame->gf_picture = &buf->img; + // frame display index = frame offset within the gf group + start frame of + // the gf group + tpl_frame->frame_display_index = + frame_display_index + cpi->common.current_frame.display_order_hint; + } + + if (frame_update_type != OVERLAY_UPDATE && + frame_update_type != INTNL_OVERLAY_UPDATE) { + tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count]; + tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count]; + ++process_frame_count; + } + + av1_get_ref_frames(cpi, &ref_buffer_stack); + int refresh_mask = av1_get_refresh_frame_flags( + cpi, &frame_params, frame_update_type, &ref_buffer_stack); + + int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask); + av1_update_ref_frame_map(cpi, frame_update_type, + frame_params.show_existing_frame, + refresh_frame_map_index, &ref_buffer_stack); + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) + tpl_frame->ref_map_index[i - LAST_FRAME] = + ref_picture_map[cm->remapped_ref_idx[i - LAST_FRAME]]; + + if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index; + + ++*tpl_group_frames; + } + + if (cur_frame_idx == 0) return; + + int extend_frame_count = 0; + int extend_frame_length = + AOMMIN(cpi->rc.baseline_gf_interval, + cpi->rc.frames_to_key - cpi->rc.baseline_gf_interval); + int frame_display_index = cpi->rc.baseline_gf_interval + 1; + + for (; gf_index < MAX_LENGTH_TPL_FRAME_STATS && + extend_frame_count < extend_frame_length; + ++gf_index) { + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index]; + FRAME_UPDATE_TYPE frame_update_type = LF_UPDATE; + frame_params.show_frame = frame_update_type != ARF_UPDATE && + frame_update_type != INTNL_ARF_UPDATE; + frame_params.show_existing_frame = + frame_update_type == INTNL_OVERLAY_UPDATE; + frame_params.frame_type = INTER_FRAME; + + struct lookahead_entry *buf = av1_lookahead_peek( + cpi->lookahead, frame_display_index - anc_frame_offset, + cpi->compressor_stage); + + if (buf == NULL) break; + + tpl_frame->gf_picture = &buf->img; + tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count]; + tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count]; + ++process_frame_count; + + // frame display index = frame offset within the gf group + start frame of + // the gf group + tpl_frame->frame_display_index = + frame_display_index + cpi->common.current_frame.display_order_hint; + + gf_group->update_type[gf_index] = LF_UPDATE; + gf_group->q_val[gf_index] = *pframe_qindex; + + av1_get_ref_frames(cpi, &ref_buffer_stack); + int refresh_mask = av1_get_refresh_frame_flags( + cpi, &frame_params, frame_update_type, &ref_buffer_stack); + int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask); + av1_update_ref_frame_map(cpi, frame_update_type, + frame_params.show_existing_frame, + refresh_frame_map_index, &ref_buffer_stack); + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) + tpl_frame->ref_map_index[i - LAST_FRAME] = + ref_picture_map[cm->remapped_ref_idx[i - LAST_FRAME]]; + + tpl_frame->ref_map_index[ALTREF_FRAME - LAST_FRAME] = -1; + tpl_frame->ref_map_index[LAST3_FRAME - LAST_FRAME] = -1; + tpl_frame->ref_map_index[BWDREF_FRAME - LAST_FRAME] = -1; + tpl_frame->ref_map_index[ALTREF2_FRAME - LAST_FRAME] = -1; + + if (refresh_mask) 
ref_picture_map[refresh_frame_map_index] = gf_index; + + ++*tpl_group_frames; + ++extend_frame_count; + ++frame_display_index; + } + + av1_get_ref_frames(cpi, &cpi->ref_buffer_stack); +} + +static AOM_INLINE void init_tpl_stats(TplParams *const tpl_data) { + for (int frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) { + TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx]; + memset(tpl_data->tpl_stats_pool[frame_idx], 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame->tpl_stats_ptr)); + tpl_frame->is_valid = 0; + } +} + +int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, + const EncodeFrameParams *const frame_params, + const EncodeFrameInput *const frame_input) { + AV1_COMMON *cm = &cpi->common; + GF_GROUP *gf_group = &cpi->gf_group; + int bottom_index, top_index; + EncodeFrameParams this_frame_params = *frame_params; + TplParams *const tpl_data = &cpi->tpl_data; + + if (cpi->superres_mode != SUPERRES_NONE) return 0; + + cm->current_frame.frame_type = frame_params->frame_type; + for (int gf_index = gf_group->index; gf_index < gf_group->size; ++gf_index) { + av1_configure_buffer_updates(cpi, &this_frame_params, + gf_group->update_type[gf_index], 0); + + cpi->refresh_golden_frame = this_frame_params.refresh_golden_frame; + cpi->refresh_bwd_ref_frame = this_frame_params.refresh_bwd_ref_frame; + cpi->refresh_alt_ref_frame = this_frame_params.refresh_alt_ref_frame; + + cm->show_frame = gf_group->update_type[gf_index] != ARF_UPDATE && + gf_group->update_type[gf_index] != INTNL_ARF_UPDATE; + + gf_group->q_val[gf_index] = + av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height, gf_index, + &bottom_index, &top_index); + + cm->current_frame.frame_type = INTER_FRAME; + } + + int pframe_qindex; + int tpl_gf_group_frames; + init_gop_frames_for_tpl(cpi, frame_params, gf_group, gop_eval, + &tpl_gf_group_frames, frame_input, &pframe_qindex); + + cpi->rc.base_layer_qp = pframe_qindex; + + init_tpl_stats(tpl_data); + + // Backward propagation from tpl_group_frames to 1. 
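+  // Two passes follow: the forward loop below runs mc_flow_dispenser() on
+  // each frame in the TPL group to collect per-block intra/inter costs, and
+  // the reverse loop after it runs mc_flow_synthesizer() to propagate each
+  // block's dependency cost back into the frames it references.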
+ for (int frame_idx = gf_group->index; frame_idx < tpl_gf_group_frames; + ++frame_idx) { + if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE || + gf_group->update_type[frame_idx] == OVERLAY_UPDATE) + continue; + + mc_flow_dispenser(cpi, frame_idx, pframe_qindex); + + aom_extend_frame_borders(tpl_data->tpl_frame[frame_idx].rec_picture, + av1_num_planes(cm)); + } + + for (int frame_idx = tpl_gf_group_frames - 1; frame_idx >= gf_group->index; + --frame_idx) { + if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE || + gf_group->update_type[frame_idx] == OVERLAY_UPDATE) + continue; + + mc_flow_synthesizer(cpi, frame_idx); + } + + av1_configure_buffer_updates(cpi, &this_frame_params, + gf_group->update_type[gf_group->index], 0); + cm->current_frame.frame_type = frame_params->frame_type; + cm->show_frame = frame_params->show_frame; + + if (cpi->common.tiles.large_scale) return 0; + if (gf_group->max_layer_depth_allowed == 0) return 1; + + double beta[2] = { 0.0 }; + for (int frame_idx = 1; frame_idx <= AOMMIN(tpl_gf_group_frames - 1, 2); + ++frame_idx) { + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t mc_dep_cost_base = 0; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + for (int row = 0; row < cm->mi_params.mi_rows; row += step) { + for (int col = 0; col < mi_cols_sr; col += step) { + TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + intra_cost_base += (this_stats->recrf_dist << RDDIV_BITS); + mc_dep_cost_base += + (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta; + } + } + beta[frame_idx - 1] = (double)mc_dep_cost_base / intra_cost_base; + } + + // Allow larger GOP size if the base layer ARF has higher dependency factor + // than the intermediate ARF and both ARFs have reasonably high dependency + // factors. + return (beta[0] >= beta[1] + 0.7) && beta[0] > 3.0; +} + +void av1_tpl_rdmult_setup(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + const GF_GROUP *const gf_group = &cpi->gf_group; + const int tpl_idx = gf_group->index; + + assert(IMPLIES(gf_group->size > 0, tpl_idx < gf_group->size)); + + TplParams *const tpl_data = &cpi->tpl_data; + const TplDepFrame *const tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + + if (!tpl_frame->is_valid) return; + if (cpi->superres_mode != SUPERRES_NONE) return; + + const TplDepStats *const tpl_stats = tpl_frame->tpl_stats_ptr; + const int tpl_stride = tpl_frame->stride; + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + const int block_size = BLOCK_16X16; + const int num_mi_w = mi_size_wide[block_size]; + const int num_mi_h = mi_size_high[block_size]; + const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const double c = 1.2; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + + aom_clear_system_state(); + + // Loop through each 'block_size' X 'block_size' block. + for (int row = 0; row < num_rows; row++) { + for (int col = 0; col < num_cols; col++) { + double intra_cost = 0.0, mc_dep_cost = 0.0; + // Loop through each mi block. 
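+      // The inner loops accumulate, for this 16x16 region, the recon
+      // distortion and the same distortion plus its propagated dependency
+      // cost; their ratio rk (below) is mapped to a per-block rdmult scale
+      // via rk / r0 + c.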
+ for (int mi_row = row * num_mi_h; mi_row < (row + 1) * num_mi_h; + mi_row += step) { + for (int mi_col = col * num_mi_w; mi_col < (col + 1) * num_mi_w; + mi_col += step) { + if (mi_row >= cm->mi_params.mi_rows || mi_col >= mi_cols_sr) continue; + const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + mi_row, mi_col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + intra_cost += (double)(this_stats->recrf_dist << RDDIV_BITS); + mc_dep_cost += + (double)(this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta; + } + } + const double rk = intra_cost / mc_dep_cost; + const int index = row * num_cols + col; + cpi->tpl_rdmult_scaling_factors[index] = rk / cpi->rd.r0 + c; + } + } + aom_clear_system_state(); +} + +void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x, + BLOCK_SIZE sb_size, int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + assert(IMPLIES(cpi->gf_group.size > 0, + cpi->gf_group.index < cpi->gf_group.size)); + const int tpl_idx = cpi->gf_group.index; + TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx]; + + if (tpl_frame->is_valid == 0) return; + if (!is_frame_tpl_eligible(cpi)) return; + if (tpl_idx >= MAX_LAG_BUFFERS) return; + if (cpi->superres_mode != SUPERRES_NONE) return; + if (cpi->oxcf.aq_mode != NO_AQ) return; + + const int bsize_base = BLOCK_16X16; + const int num_mi_w = mi_size_wide[bsize_base]; + const int num_mi_h = mi_size_high[bsize_base]; + const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (mi_size_wide[sb_size] + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[sb_size] + num_mi_h - 1) / num_mi_h; + int row, col; + + double base_block_count = 0.0; + double log_sum = 0.0; + + aom_clear_system_state(); + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col / num_mi_h; + col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + log_sum += log(cpi->tpl_rdmult_scaling_factors[index]); + base_block_count += 1.0; + } + } + + MACROBLOCKD *const xd = &x->e_mbd; + const CommonQuantParams *quant_params = &cm->quant_params; + const int orig_rdmult = av1_compute_rd_mult( + cpi, quant_params->base_qindex + quant_params->y_dc_delta_q); + const int new_rdmult = + av1_compute_rd_mult(cpi, quant_params->base_qindex + xd->delta_qindex + + quant_params->y_dc_delta_q); + const double scaling_factor = (double)new_rdmult / (double)orig_rdmult; + + double scale_adj = log(scaling_factor) - log_sum / base_block_count; + scale_adj = exp(scale_adj); + + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col / num_mi_h; + col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + cpi->tpl_sb_rdmult_scaling_factors[index] = + scale_adj * cpi->tpl_rdmult_scaling_factors[index]; + } + } + aom_clear_system_state(); +} diff --git a/libs/libaom/src/av1/encoder/tpl_model.h b/libs/libaom/src/av1/encoder/tpl_model.h new file mode 100644 index 000000000..11a61b649 --- /dev/null +++ b/libs/libaom/src/av1/encoder/tpl_model.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TPL_MODEL_H_ +#define AOM_AV1_ENCODER_TPL_MODEL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE BLOCK_SIZE convert_length_to_bsize(int length) { + switch (length) { + case 64: return BLOCK_64X64; + case 32: return BLOCK_32X32; + case 16: return BLOCK_16X16; + case 8: return BLOCK_8X8; + case 4: return BLOCK_4X4; + default: + assert(0 && "Invalid block size for tpl model"); + return BLOCK_16X16; + } +} + +int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, + const EncodeFrameParams *const frame_params, + const EncodeFrameInput *const frame_input); + +int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift); + +void av1_tpl_rdmult_setup(AV1_COMP *cpi); + +void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x, + BLOCK_SIZE sb_size, int mi_row, int mi_col); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_TPL_MODEL_H_ diff --git a/libs/libaom/src/av1/encoder/tune_vmaf.c b/libs/libaom/src/av1/encoder/tune_vmaf.c new file mode 100644 index 000000000..997f78e27 --- /dev/null +++ b/libs/libaom/src/av1/encoder/tune_vmaf.c @@ -0,0 +1,794 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/tune_vmaf.h" + +#include "aom_dsp/psnr.h" +#include "aom_dsp/vmaf.h" +#include "aom_ports/system_state.h" +#include "av1/encoder/extend.h" +#include "av1/encoder/rdopt.h" + +static const double kBaselineVmaf = 97.42773; + +// TODO(sdeng): Add the SIMD implementation. 
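+// The helpers below apply a plain unsharp mask,
+//   dst = clamp(src + amount * (src - blurred)),
+// i.e. the high-frequency residual (source minus its Gaussian blur) is
+// scaled by `amount` and added back to sharpen the frame.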
+static AOM_INLINE void highbd_unsharp_rect(const uint16_t *source,
+                                           int source_stride,
+                                           const uint16_t *blurred,
+                                           int blurred_stride, uint16_t *dst,
+                                           int dst_stride, int w, int h,
+                                           double amount, int bit_depth) {
+  const int max_value = (1 << bit_depth) - 1;
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; ++j) {
+      const double val =
+          (double)source[j] + amount * ((double)source[j] - (double)blurred[j]);
+      dst[j] = (uint16_t)clamp((int)(val + 0.5), 0, max_value);
+    }
+    source += source_stride;
+    blurred += blurred_stride;
+    dst += dst_stride;
+  }
+}
+
+static AOM_INLINE void unsharp_rect(const uint8_t *source, int source_stride,
+                                    const uint8_t *blurred, int blurred_stride,
+                                    uint8_t *dst, int dst_stride, int w, int h,
+                                    double amount) {
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; ++j) {
+      const double val =
+          (double)source[j] + amount * ((double)source[j] - (double)blurred[j]);
+      dst[j] = (uint8_t)clamp((int)(val + 0.5), 0, 255);
+    }
+    source += source_stride;
+    blurred += blurred_stride;
+    dst += dst_stride;
+  }
+}
+
+static AOM_INLINE void unsharp(const AV1_COMP *const cpi,
+                               const YV12_BUFFER_CONFIG *source,
+                               const YV12_BUFFER_CONFIG *blurred,
+                               const YV12_BUFFER_CONFIG *dst, double amount) {
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  if (bit_depth > 8) {
+    highbd_unsharp_rect(CONVERT_TO_SHORTPTR(source->y_buffer), source->y_stride,
+                        CONVERT_TO_SHORTPTR(blurred->y_buffer),
+                        blurred->y_stride, CONVERT_TO_SHORTPTR(dst->y_buffer),
+                        dst->y_stride, source->y_width, source->y_height,
+                        amount, bit_depth);
+  } else {
+    unsharp_rect(source->y_buffer, source->y_stride, blurred->y_buffer,
+                 blurred->y_stride, dst->y_buffer, dst->y_stride,
+                 source->y_width, source->y_height, amount);
+  }
+}
+
+// 8-tap Gaussian convolution filter with sigma = 1.0, sums to 128;
+// all coefficients must be even.
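+// (Check: 0 + 8 + 30 + 52 + 30 + 8 + 0 + 0 = 128, and every tap is even.)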
+DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 0, 8, 30, 52, + 30, 8, 0, 0 }; +static AOM_INLINE void gaussian_blur(const int bit_depth, + const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dst) { + const int block_size = BLOCK_128X128; + const int block_w = mi_size_wide[block_size] * 4; + const int block_h = mi_size_high[block_size] * 4; + const int num_cols = (source->y_width + block_w - 1) / block_w; + const int num_rows = (source->y_height + block_h - 1) / block_h; + int row, col; + + ConvolveParams conv_params = get_conv_params(0, 0, bit_depth); + InterpFilterParams filter = { .filter_ptr = gauss_filter, + .taps = 8, + .subpel_shifts = 0, + .interp_filter = EIGHTTAP_REGULAR }; + + for (row = 0; row < num_rows; ++row) { + for (col = 0; col < num_cols; ++col) { + const int row_offset_y = row * block_h; + const int col_offset_y = col * block_w; + + uint8_t *src_buf = + source->y_buffer + row_offset_y * source->y_stride + col_offset_y; + uint8_t *dst_buf = + dst->y_buffer + row_offset_y * dst->y_stride + col_offset_y; + + if (bit_depth > 8) { + av1_highbd_convolve_2d_sr( + CONVERT_TO_SHORTPTR(src_buf), source->y_stride, + CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride, block_w, block_h, + &filter, &filter, 0, 0, &conv_params, bit_depth); + } else { + av1_convolve_2d_sr(src_buf, source->y_stride, dst_buf, dst->y_stride, + block_w, block_h, &filter, &filter, 0, 0, + &conv_params); + } + } + } +} + +static double frame_average_variance(const AV1_COMP *const cpi, + const YV12_BUFFER_CONFIG *const frame) { + const uint8_t *const y_buffer = frame->y_buffer; + const int y_stride = frame->y_stride; + const BLOCK_SIZE block_size = BLOCK_64X64; + + const int block_w = mi_size_wide[block_size] * 4; + const int block_h = mi_size_high[block_size] * 4; + int row, col; + const int bit_depth = cpi->td.mb.e_mbd.bd; + double var = 0.0, var_count = 0.0; + + // Loop through each block. 
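+  // Only whole 64x64 blocks are measured; a partial block at the right or
+  // bottom edge is skipped by the truncating division in the loop bounds.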
+ for (row = 0; row < frame->y_height / block_h; ++row) { + for (col = 0; col < frame->y_width / block_w; ++col) { + struct buf_2d buf; + const int row_offset_y = row * block_h; + const int col_offset_y = col * block_w; + + buf.buf = (uint8_t *)y_buffer + row_offset_y * y_stride + col_offset_y; + buf.stride = y_stride; + + if (bit_depth > 8) { + var += av1_high_get_sby_perpixel_variance(cpi, &buf, block_size, + bit_depth); + } else { + var += av1_get_sby_perpixel_variance(cpi, &buf, block_size); + } + var_count += 1.0; + } + } + var /= var_count; + return var; +} + +static double cal_approx_vmaf(const AV1_COMP *const cpi, double source_variance, + YV12_BUFFER_CONFIG *const source, + YV12_BUFFER_CONFIG *const sharpened) { + const int bit_depth = cpi->td.mb.e_mbd.bd; + double new_vmaf; + aom_calc_vmaf(cpi->oxcf.vmaf_model_path, source, sharpened, bit_depth, + &new_vmaf); + const double sharpened_var = frame_average_variance(cpi, sharpened); + return source_variance / sharpened_var * (new_vmaf - kBaselineVmaf); +} + +static double find_best_frame_unsharp_amount_loop( + const AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source, + YV12_BUFFER_CONFIG *const blurred, YV12_BUFFER_CONFIG *const sharpened, + double best_vmaf, const double baseline_variance, + const double unsharp_amount_start, const double step_size, + const int max_loop_count, const double max_amount) { + const double min_amount = 0.0; + int loop_count = 0; + double approx_vmaf = best_vmaf; + double unsharp_amount = unsharp_amount_start; + do { + best_vmaf = approx_vmaf; + unsharp_amount += step_size; + if (unsharp_amount > max_amount || unsharp_amount < min_amount) break; + unsharp(cpi, source, blurred, sharpened, unsharp_amount); + approx_vmaf = cal_approx_vmaf(cpi, baseline_variance, source, sharpened); + + loop_count++; + } while (approx_vmaf > best_vmaf && loop_count < max_loop_count); + unsharp_amount = + approx_vmaf > best_vmaf ? 
unsharp_amount : unsharp_amount - step_size; + return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount)); +} + +static double find_best_frame_unsharp_amount(const AV1_COMP *const cpi, + YV12_BUFFER_CONFIG *const source, + YV12_BUFFER_CONFIG *const blurred, + const double unsharp_amount_start, + const double step_size, + const int max_loop_count, + const double max_filter_amount) { + const AV1_COMMON *const cm = &cpi->common; + const int width = source->y_width; + const int height = source->y_height; + + YV12_BUFFER_CONFIG sharpened; + memset(&sharpened, 0, sizeof(sharpened)); + aom_alloc_frame_buffer( + &sharpened, width, height, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + + const double baseline_variance = frame_average_variance(cpi, source); + double unsharp_amount; + if (unsharp_amount_start <= step_size) { + unsharp_amount = find_best_frame_unsharp_amount_loop( + cpi, source, blurred, &sharpened, 0.0, baseline_variance, 0.0, + step_size, max_loop_count, max_filter_amount); + } else { + double a0 = unsharp_amount_start - step_size, a1 = unsharp_amount_start; + double v0, v1; + unsharp(cpi, source, blurred, &sharpened, a0); + v0 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened); + unsharp(cpi, source, blurred, &sharpened, a1); + v1 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened); + if (fabs(v0 - v1) < 0.01) { + unsharp_amount = a0; + } else if (v0 > v1) { + unsharp_amount = find_best_frame_unsharp_amount_loop( + cpi, source, blurred, &sharpened, v0, baseline_variance, a0, + -step_size, max_loop_count, max_filter_amount); + } else { + unsharp_amount = find_best_frame_unsharp_amount_loop( + cpi, source, blurred, &sharpened, v1, baseline_variance, a1, + step_size, max_loop_count, max_filter_amount); + } + } + + aom_free_frame_buffer(&sharpened); + return unsharp_amount; +} + +void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi, + YV12_BUFFER_CONFIG *const source) { + aom_clear_system_state(); + const AV1_COMMON *const cm = &cpi->common; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const int width = source->y_width; + const int height = source->y_height; + + YV12_BUFFER_CONFIG source_extended, blurred; + memset(&source_extended, 0, sizeof(source_extended)); + memset(&blurred, 0, sizeof(blurred)); + aom_alloc_frame_buffer( + &source_extended, width, height, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + aom_alloc_frame_buffer( + &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + + av1_copy_and_extend_frame(source, &source_extended); + gaussian_blur(bit_depth, &source_extended, &blurred); + aom_free_frame_buffer(&source_extended); + + const double best_frame_unsharp_amount = find_best_frame_unsharp_amount( + cpi, source, &blurred, cpi->last_frame_unsharp_amount, 0.05, 20, 1.01); + cpi->last_frame_unsharp_amount = best_frame_unsharp_amount; + + unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount); + aom_free_frame_buffer(&blurred); + aom_clear_system_state(); +} + +void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, + YV12_BUFFER_CONFIG *const source) { + aom_clear_system_state(); + const AV1_COMMON *const cm = &cpi->common; + const int width = source->y_width; + const int height = source->y_height; + const int bit_depth = cpi->td.mb.e_mbd.bd; + + YV12_BUFFER_CONFIG source_extended, blurred; + memset(&blurred, 0, sizeof(blurred)); + memset(&source_extended, 0, 
sizeof(source_extended)); + aom_alloc_frame_buffer( + &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + aom_alloc_frame_buffer( + &source_extended, width, height, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + + av1_copy_and_extend_frame(source, &source_extended); + gaussian_blur(bit_depth, &source_extended, &blurred); + aom_free_frame_buffer(&source_extended); + + const double best_frame_unsharp_amount = find_best_frame_unsharp_amount( + cpi, source, &blurred, cpi->last_frame_unsharp_amount, 0.05, 20, 1.01); + cpi->last_frame_unsharp_amount = best_frame_unsharp_amount; + + const int block_size = BLOCK_64X64; + const int block_w = mi_size_wide[block_size] * 4; + const int block_h = mi_size_high[block_size] * 4; + const int num_cols = (source->y_width + block_w - 1) / block_w; + const int num_rows = (source->y_height + block_h - 1) / block_h; + double *best_unsharp_amounts = + aom_malloc(sizeof(*best_unsharp_amounts) * num_cols * num_rows); + memset(best_unsharp_amounts, 0, + sizeof(*best_unsharp_amounts) * num_cols * num_rows); + + YV12_BUFFER_CONFIG source_block, blurred_block; + memset(&source_block, 0, sizeof(source_block)); + memset(&blurred_block, 0, sizeof(blurred_block)); + aom_alloc_frame_buffer( + &source_block, block_w, block_h, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + aom_alloc_frame_buffer( + &blurred_block, block_w, block_h, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int row_offset_y = row * block_h; + const int col_offset_y = col * block_w; + const int block_width = AOMMIN(width - col_offset_y, block_w); + const int block_height = AOMMIN(height - row_offset_y, block_h); + const int index = col + row * num_cols; + + if (bit_depth > 8) { + uint16_t *frame_src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) + + row_offset_y * source->y_stride + + col_offset_y; + uint16_t *frame_blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) + + row_offset_y * blurred.y_stride + + col_offset_y; + uint16_t *blurred_dst = CONVERT_TO_SHORTPTR(blurred_block.y_buffer); + uint16_t *src_dst = CONVERT_TO_SHORTPTR(source_block.y_buffer); + + // Copy block from source frame. + for (int i = 0; i < block_h; ++i) { + for (int j = 0; j < block_w; ++j) { + if (i >= block_height || j >= block_width) { + src_dst[j] = 0; + blurred_dst[j] = 0; + } else { + src_dst[j] = frame_src_buf[j]; + blurred_dst[j] = frame_blurred_buf[j]; + } + } + frame_src_buf += source->y_stride; + frame_blurred_buf += blurred.y_stride; + src_dst += source_block.y_stride; + blurred_dst += blurred_block.y_stride; + } + } else { + uint8_t *frame_src_buf = + source->y_buffer + row_offset_y * source->y_stride + col_offset_y; + uint8_t *frame_blurred_buf = + blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; + uint8_t *blurred_dst = blurred_block.y_buffer; + uint8_t *src_dst = source_block.y_buffer; + + // Copy block from source frame. 
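+        // Positions past block_width x block_height (blocks clipped at the
+        // frame edge) are zero-filled so the temporary buffers always hold a
+        // full block_w x block_h block.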
+ for (int i = 0; i < block_h; ++i) { + for (int j = 0; j < block_w; ++j) { + if (i >= block_height || j >= block_width) { + src_dst[j] = 0; + blurred_dst[j] = 0; + } else { + src_dst[j] = frame_src_buf[j]; + blurred_dst[j] = frame_blurred_buf[j]; + } + } + frame_src_buf += source->y_stride; + frame_blurred_buf += blurred.y_stride; + src_dst += source_block.y_stride; + blurred_dst += blurred_block.y_stride; + } + } + + best_unsharp_amounts[index] = find_best_frame_unsharp_amount( + cpi, &source_block, &blurred_block, best_frame_unsharp_amount, 0.1, 3, + 1.5); + } + } + + // Apply the best unsharp amounts. + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int row_offset_y = row * block_h; + const int col_offset_y = col * block_w; + const int block_width = AOMMIN(source->y_width - col_offset_y, block_w); + const int block_height = AOMMIN(source->y_height - row_offset_y, block_h); + const int index = col + row * num_cols; + + if (bit_depth > 8) { + uint16_t *src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) + + row_offset_y * source->y_stride + col_offset_y; + uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) + + row_offset_y * blurred.y_stride + col_offset_y; + highbd_unsharp_rect(src_buf, source->y_stride, blurred_buf, + blurred.y_stride, src_buf, source->y_stride, + block_width, block_height, + best_unsharp_amounts[index], bit_depth); + } else { + uint8_t *src_buf = + source->y_buffer + row_offset_y * source->y_stride + col_offset_y; + uint8_t *blurred_buf = + blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; + unsharp_rect(src_buf, source->y_stride, blurred_buf, blurred.y_stride, + src_buf, source->y_stride, block_width, block_height, + best_unsharp_amounts[index]); + } + } + } + + aom_free_frame_buffer(&source_block); + aom_free_frame_buffer(&blurred_block); + aom_free_frame_buffer(&blurred); + aom_free(best_unsharp_amounts); + aom_clear_system_state(); +} + +typedef struct FrameData { + const YV12_BUFFER_CONFIG *source, *blurred; + int block_w, block_h, num_rows, num_cols, row, col, bit_depth; +} FrameData; + +// A callback function used to pass data to VMAF. +// Returns 0 after reading a frame. +// Returns 2 when there is no more frame to read.
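// Editor's note (illustrative sketch, not part of the patch): a driver such
// as aom_calc_vmaf_multi_frame() is expected to call this callback once per
// "frame" until it returns 2. Conceptually (hypothetical driver loop):
//
//   while (update_frame(ref_plane, main_plane, tmp_plane, stride_bytes,
//                       &frame_data) == 0) {
//     // Score one (reference, distorted) pair. Here every pair is the full
//     // source frame with exactly one block swapped for its blurred copy,
//     // so each call yields one per-block VMAF score in raster order.
//   }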
+static int update_frame(float *ref_data, float *main_data, float *temp_data, + int stride, void *user_data) { + FrameData *frames = (FrameData *)user_data; + const int width = frames->source->y_width; + const int height = frames->source->y_height; + const int row = frames->row; + const int col = frames->col; + const int num_rows = frames->num_rows; + const int num_cols = frames->num_cols; + const int block_w = frames->block_w; + const int block_h = frames->block_h; + const YV12_BUFFER_CONFIG *source = frames->source; + const YV12_BUFFER_CONFIG *blurred = frames->blurred; + const int bit_depth = frames->bit_depth; + const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8)); + (void)temp_data; + stride /= (int)sizeof(*ref_data); + + for (int i = 0; i < height; ++i) { + float *ref, *main; + ref = ref_data + i * stride; + main = main_data + i * stride; + if (bit_depth == 8) { + uint8_t *src; + src = source->y_buffer + i * source->y_stride; + for (int j = 0; j < width; ++j) { + ref[j] = main[j] = (float)src[j]; + } + } else { + uint16_t *src; + src = CONVERT_TO_SHORTPTR(source->y_buffer) + i * source->y_stride; + for (int j = 0; j < width; ++j) { + ref[j] = main[j] = scale_factor * (float)src[j]; + } + } + } + if (row < num_rows && col < num_cols) { + // Set current block + const int row_offset = row * block_h; + const int col_offset = col * block_w; + const int block_width = AOMMIN(width - col_offset, block_w); + const int block_height = AOMMIN(height - row_offset, block_h); + + float *main_buf = main_data + col_offset + row_offset * stride; + if (bit_depth == 8) { + uint8_t *blurred_buf = + blurred->y_buffer + row_offset * blurred->y_stride + col_offset; + for (int i = 0; i < block_height; ++i) { + for (int j = 0; j < block_width; ++j) { + main_buf[j] = (float)blurred_buf[j]; + } + main_buf += stride; + blurred_buf += blurred->y_stride; + } + } else { + uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred->y_buffer) + + row_offset * blurred->y_stride + col_offset; + for (int i = 0; i < block_height; ++i) { + for (int j = 0; j < block_width; ++j) { + main_buf[j] = scale_factor * (float)blurred_buf[j]; + } + main_buf += stride; + blurred_buf += blurred->y_stride; + } + } + + frames->col++; + if (frames->col >= num_cols) { + frames->col = 0; + frames->row++; + } + return 0; + } else { + return 2; + } +} + +void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const int y_width = cpi->source->y_width; + const int y_height = cpi->source->y_height; + const int resized_block_size = BLOCK_32X32; + const int resize_factor = 2; + const int bit_depth = cpi->td.mb.e_mbd.bd; + + aom_clear_system_state(); + YV12_BUFFER_CONFIG resized_source; + memset(&resized_source, 0, sizeof(resized_source)); + aom_alloc_frame_buffer( + &resized_source, y_width / resize_factor, y_height / resize_factor, 1, 1, + cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); + av1_resize_and_extend_frame(cpi->source, &resized_source, bit_depth, + av1_num_planes(cm)); + + const int resized_y_width = resized_source.y_width; + const int resized_y_height = resized_source.y_height; + const int resized_block_w = mi_size_wide[resized_block_size] * 4; + const int resized_block_h = mi_size_high[resized_block_size] * 4; + const int num_cols = + (resized_y_width + resized_block_w - 1) / resized_block_w; + const int num_rows = + (resized_y_height + resized_block_h - 1) / resized_block_h; + + YV12_BUFFER_CONFIG blurred; + memset(&blurred, 0, sizeof(blurred)); + 
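// Editor's note (illustrative aside, not part of the patch): the remainder
// of this routine (continued below) blurs the 2x-downscaled source, gathers
// one VMAF score per resized 32x32 block through update_frame(), and then
// maps each block's (MSE, VMAF drop) pair to an rdmult scaling factor. With
// dvmaf = kBaselineVmaf - vmaf, the fitted model below amounts to:
//
//   weight = 6.0 * (1.0 - exp(-0.05 * mse / dvmaf)) + 0.8;
//
// so a block whose distortion buys little VMAF (large mse/dvmaf) saturates
// toward 6.8 and is charged more per bit, while a VMAF-sensitive block stays
// near 0.8 and attracts more bits. av1_set_vmaf_rdmult() later folds the
// factors under a superblock into one multiplier through a geometric mean,
// effectively rdmult *= exp(mean(log(factor_i))).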
aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, 1, 1, + cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); + gaussian_blur(bit_depth, &resized_source, &blurred); + + double *scores = aom_malloc(sizeof(*scores) * (num_rows * num_cols)); + memset(scores, 0, sizeof(*scores) * (num_rows * num_cols)); + FrameData frame_data; + frame_data.source = &resized_source; + frame_data.blurred = &blurred; + frame_data.block_w = resized_block_w; + frame_data.block_h = resized_block_h; + frame_data.num_rows = num_rows; + frame_data.num_cols = num_cols; + frame_data.row = 0; + frame_data.col = 0; + frame_data.bit_depth = bit_depth; + aom_calc_vmaf_multi_frame(&frame_data, cpi->oxcf.vmaf_model_path, + update_frame, resized_y_width, resized_y_height, + bit_depth, scores); + + // Loop through each 'block_size' block. + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + const int row_offset_y = row * resized_block_h; + const int col_offset_y = col * resized_block_w; + + uint8_t *const orig_buf = resized_source.y_buffer + + row_offset_y * resized_source.y_stride + + col_offset_y; + uint8_t *const blurred_buf = + blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; + + const double vmaf = scores[index]; + const double dvmaf = kBaselineVmaf - vmaf; + unsigned int sse; + cpi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride, + blurred_buf, blurred.y_stride, &sse); + + const double mse = + (double)sse / (double)(resized_y_width * resized_y_height); + double weight; + const double eps = 0.01 / (num_rows * num_cols); + if (dvmaf < eps || mse < eps) { + weight = 1.0; + } else { + weight = mse / dvmaf; + } + + // Normalize it with a data fitted model. + weight = 6.0 * (1.0 - exp(-0.05 * weight)) + 0.8; + cpi->vmaf_rdmult_scaling_factors[index] = weight; + } + } + + aom_free_frame_buffer(&resized_source); + aom_free_frame_buffer(&blurred); + aom_free(scores); + aom_clear_system_state(); +} + +void av1_set_vmaf_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int *const rdmult) { + const AV1_COMMON *const cm = &cpi->common; + + const int bsize_base = BLOCK_64X64; + const int num_mi_w = mi_size_wide[bsize_base]; + const int num_mi_h = mi_size_high[bsize_base]; + const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; + int row, col; + double num_of_mi = 0.0; + double geom_mean_of_scale = 0.0; + + aom_clear_system_state(); + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col / num_mi_h; + col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + geom_mean_of_scale += log(cpi->vmaf_rdmult_scaling_factors[index]); + num_of_mi += 1.0; + } + } + geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi); + + *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5); + *rdmult = AOMMAX(*rdmult, 0); + set_error_per_bit(x, *rdmult); + aom_clear_system_state(); +} + +// TODO(sdeng): replace them with the SIMD versions. 
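// Editor's note (illustrative aside, not part of the patch): the two helpers
// below compute a plain per-pixel mean absolute difference,
//
//   sad(a, b) = (1.0 / (w * h)) * sum over i,j of |a[i][j] - b[i][j]|,
//
// which calc_vmaf_motion_score() evaluates on Gaussian-blurred planes as a
// cheap stand-in for VMAF's motion feature; the returned score is
// min(sad(cur, last), sad(cur, next)).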
+static AOM_INLINE double highbd_image_sad_c(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h) { + double accum = 0.0; + int i, j; + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + double img1px = src[i * src_stride + j]; + double img2px = ref[i * ref_stride + j]; + + accum += fabs(img1px - img2px); + } + } + + return accum / (double)(h * w); +} + +static AOM_INLINE double image_sad_c(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int w, + int h) { + double accum = 0.0; + int i, j; + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + double img1px = src[i * src_stride + j]; + double img2px = ref[i * ref_stride + j]; + + accum += fabs(img1px - img2px); + } + } + + return accum / (double)(h * w); +} + +static AOM_INLINE double calc_vmaf_motion_score( + const AV1_COMP *const cpi, const AV1_COMMON *const cm, + const YV12_BUFFER_CONFIG *const cur, const YV12_BUFFER_CONFIG *const last, + const YV12_BUFFER_CONFIG *const next) { + const int y_width = cur->y_width; + const int y_height = cur->y_height; + YV12_BUFFER_CONFIG blurred_cur, blurred_last, blurred_next; + const int bit_depth = cpi->td.mb.e_mbd.bd; + + memset(&blurred_cur, 0, sizeof(blurred_cur)); + memset(&blurred_last, 0, sizeof(blurred_last)); + memset(&blurred_next, 0, sizeof(blurred_next)); + + aom_alloc_frame_buffer( + &blurred_cur, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + aom_alloc_frame_buffer( + &blurred_last, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + aom_alloc_frame_buffer( + &blurred_next, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + + gaussian_blur(bit_depth, cur, &blurred_cur); + gaussian_blur(bit_depth, last, &blurred_last); + if (next) gaussian_blur(bit_depth, next, &blurred_next); + + double motion1, motion2 = 65536.0; + if (bit_depth > 8) { + const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8)); + motion1 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer), + blurred_cur.y_stride, + CONVERT_TO_SHORTPTR(blurred_last.y_buffer), + blurred_last.y_stride, y_width, y_height) * + scale_factor; + if (next) { + motion2 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer), + blurred_cur.y_stride, + CONVERT_TO_SHORTPTR(blurred_next.y_buffer), + blurred_next.y_stride, y_width, y_height) * + scale_factor; + } + } else { + motion1 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride, + blurred_last.y_buffer, blurred_last.y_stride, y_width, + y_height); + if (next) { + motion2 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride, + blurred_next.y_buffer, blurred_next.y_stride, + y_width, y_height); + } + } + + aom_free_frame_buffer(&blurred_cur); + aom_free_frame_buffer(&blurred_last); + aom_free_frame_buffer(&blurred_next); + + return AOMMIN(motion1, motion2); +} + +// Calculates the new qindex from the VMAF motion score. This is based on the +// observation: when the motion score becomes higher, the VMAF score of the +// same source and distorted frames would become higher. 
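// Editor's note (illustrative aside, not part of the patch): the routine
// below converts the motion score into a qindex offset through a data-fitted
// model. With approx_sse and approx_dvmaf carried over from the previously
// coded frame, the chain is roughly:
//
//   dvmaf = 26.11 * (1.0 - exp(-0.06 * motion));  // predicted VMAF drop
//   dsse  = dvmaf * approx_sse / approx_dvmaf;    // drop expressed as SSE
//   beta  = approx_sse / (dsse + approx_sse);     // rdmult-style ratio < 1
//   qindex = clamp(current_qindex +
//                  av1_get_deltaq_offset(cpi, current_qindex, beta),
//                  MINQ, MAXQ);
//
// High-motion frames predict a larger tolerable VMAF drop, giving a smaller
// beta and a higher qindex, i.e. fewer bits where VMAF is lenient.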
+int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) { + const AV1_COMMON *const cm = &cpi->common; + if (cm->current_frame.frame_number == 0 || cpi->oxcf.pass == 1) { + return current_qindex; + } + const int bit_depth = cpi->td.mb.e_mbd.bd; + const double approx_sse = + cpi->last_frame_ysse / + (double)((1 << (bit_depth - 8)) * (1 << (bit_depth - 8))); + const double approx_dvmaf = kBaselineVmaf - cpi->last_frame_vmaf; + const double sse_threshold = + 0.01 * cpi->source->y_width * cpi->source->y_height; + const double vmaf_threshold = 0.01; + if (approx_sse < sse_threshold || approx_dvmaf < vmaf_threshold) { + return current_qindex; + } + aom_clear_system_state(); + const GF_GROUP *gf_group = &cpi->gf_group; + YV12_BUFFER_CONFIG *cur_buf = cpi->source; + int src_index = 0; + if (cm->show_frame == 0) { + src_index = gf_group->arf_src_offset[gf_group->index]; + struct lookahead_entry *cur_entry = + av1_lookahead_peek(cpi->lookahead, src_index, cpi->compressor_stage); + cur_buf = &cur_entry->img; + } + assert(cur_buf); + + const struct lookahead_entry *last_entry = + av1_lookahead_peek(cpi->lookahead, src_index - 1, cpi->compressor_stage); + const struct lookahead_entry *next_entry = + av1_lookahead_peek(cpi->lookahead, src_index + 1, cpi->compressor_stage); + const YV12_BUFFER_CONFIG *next_buf = &next_entry->img; + const YV12_BUFFER_CONFIG *last_buf = + cm->show_frame ? cpi->last_source : &last_entry->img; + + assert(last_buf); + + const double motion = + calc_vmaf_motion_score(cpi, cm, cur_buf, last_buf, next_buf); + + // Get dVMAF through a data fitted model. + const double dvmaf = 26.11 * (1.0 - exp(-0.06 * motion)); + const double dsse = dvmaf * approx_sse / approx_dvmaf; + + const double beta = approx_sse / (dsse + approx_sse); + const int offset = av1_get_deltaq_offset(cpi, current_qindex, beta); + int qindex = current_qindex + offset; + + qindex = AOMMIN(qindex, MAXQ); + qindex = AOMMAX(qindex, MINQ); + + aom_clear_system_state(); + return qindex; +} + +void av1_update_vmaf_curve(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *recon) { + const int bit_depth = cpi->td.mb.e_mbd.bd; + aom_calc_vmaf(cpi->oxcf.vmaf_model_path, source, recon, bit_depth, + &cpi->last_frame_vmaf); + if (bit_depth > 8) { + cpi->last_frame_ysse = (double)aom_highbd_get_y_sse(source, recon); + } else { + cpi->last_frame_ysse = (double)aom_get_y_sse(source, recon); + } +} diff --git a/libs/libaom/src/av1/encoder/tune_vmaf.h b/libs/libaom/src/av1/encoder/tune_vmaf.h new file mode 100644 index 000000000..c4cf07224 --- /dev/null +++ b/libs/libaom/src/av1/encoder/tune_vmaf.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_TUNE_VMAF_H_ +#define AOM_AV1_ENCODER_TUNE_VMAF_H_ + +#include "aom_scale/yv12config.h" +#include "av1/encoder/encoder.h" + +void av1_vmaf_blk_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source); + +void av1_vmaf_frame_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source); + +void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi); + +void av1_set_vmaf_rdmult(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int mi_row, int mi_col, int *rdmult); + +int av1_get_vmaf_base_qindex(const AV1_COMP *cpi, int current_qindex); + +void av1_update_vmaf_curve(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *recon); + +#endif // AOM_AV1_ENCODER_TUNE_VMAF_H_ diff --git a/libs/libaom/src/av1/encoder/tx_prune_model_weights.h b/libs/libaom/src/av1/encoder/tx_prune_model_weights.h new file mode 100644 index 000000000..76efe9382 --- /dev/null +++ b/libs/libaom/src/av1/encoder/tx_prune_model_weights.h @@ -0,0 +1,3320 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +/***************************CONFIG_NN_V2 (New)********************************/ +#if CONFIG_NN_V2 +// Tx type model for 4x4 block. 
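// Editor's note (illustrative sketch, not part of the patch): each
// NN_CONFIG_V2 below parameterizes a tiny two-layer MLP that scores the
// candidate transform types; the weights are consumed by the helpers in
// av1/encoder/ml.h. For the 4x4 models, the forward pass is conceptually:
//
//   float h[8], logits[4];
//   // layer 0: 4 inputs -> 8 hidden units, RELU activation
//   for (int o = 0; o < 8; ++o) {
//     float s = layer0_bias[o];
//     for (int i = 0; i < 4; ++i) s += layer0_weights[o * 4 + i] * x[i];
//     h[o] = s > 0.0f ? s : 0.0f;
//   }
//   // layer 1: 8 hidden -> 4 logits, activation NONE
//   for (int o = 0; o < 4; ++o) {
//     float s = layer1_bias[o];
//     for (int i = 0; i < 8; ++i) s += layer1_weights[o * 8 + i] * h[i];
//     logits[o] = s;
//   }
//
// The weight-layout indexing shown here is an assumption for illustration;
// per the SOFTMAX_CROSS_ENTROPY field, the logits are softmax-normalized
// into per-tx-type probabilities used for pruning.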
+static float av1_tx_type_nn_4x4_hor_layer0_weights[32] = { + -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f, + 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f, + -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f, + 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f, + 1.35792f, 0.27733f, 0.88660f, -0.68304f, +}; + +static float av1_tx_type_nn_4x4_hor_layer0_bias[8] = { + 1.38742f, 0.59540f, -1.37622f, 1.92114f, + 0.00000f, -0.38998f, -0.32726f, -0.15650f, +}; + +static float av1_tx_type_nn_4x4_hor_layer1_weights[32] = { + 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f, + -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f, + -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f, + 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f, + -0.26782f, -0.65416f, -0.10648f, 0.05568f, +}; + +static float av1_tx_type_nn_4x4_hor_layer1_bias[4] = { + 4.07177f, + 3.26961f, + 0.58083f, + 1.21199f, +}; + +static float av1_tx_type_nn_4x4_hor_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_4x4_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_4x4_hor_layer0_weights, // weights + av1_tx_type_nn_4x4_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x4_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x4_hor_layer1_weights, + av1_tx_type_nn_4x4_hor_layer1_bias, + NONE, + av1_tx_type_nn_4x4_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x4_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_4x4_ver_layer0_weights[32] = { + -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f, + 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f, + 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f, + 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f, + -0.06589f, -0.28142f, -0.33118f, 1.72227f, +}; + +static float av1_tx_type_nn_4x4_ver_layer0_bias[8] = { + -0.33685f, 0.22025f, 0.28140f, 0.56138f, + 0.93489f, -1.77048f, 1.34989f, -0.93747f, +}; + +static float av1_tx_type_nn_4x4_ver_layer1_weights[32] = { + -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f, + 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f, + -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f, + -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f, + -0.86315f, -0.53336f, 0.30320f, -1.32331f, +}; + +static float av1_tx_type_nn_4x4_ver_layer1_bias[4] = { + -1.31519f, + -3.26321f, + 1.71794f, + -1.90778f, +}; + +static float av1_tx_type_nn_4x4_ver_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_4x4_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_4x4_ver_layer0_weights, // weights + av1_tx_type_nn_4x4_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x4_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 
4, + av1_tx_type_nn_4x4_ver_layer1_weights, + av1_tx_type_nn_4x4_ver_layer1_bias, + NONE, + av1_tx_type_nn_4x4_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x4_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 4x8 block. +static float av1_tx_type_nn_4x8_hor_layer0_weights[32] = { + 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f, + 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f, + -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f, + -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f, + -1.35896f, -1.17121f, 1.68866f, 0.10357f, +}; + +static float av1_tx_type_nn_4x8_hor_layer0_bias[8] = { + 2.93391f, 0.66831f, -0.21419f, 0.00000f, + -0.72878f, 0.15127f, -1.46755f, 0.16658f, +}; + +static float av1_tx_type_nn_4x8_hor_layer1_weights[32] = { + -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f, + -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f, + 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f, + 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f, + -0.50191f, 0.18219f, 1.83664f, -0.75276f, +}; + +static float av1_tx_type_nn_4x8_hor_layer1_bias[4] = { + -1.17455f, + -2.26089f, + -1.79863f, + -2.26333f, +}; + +static float av1_tx_type_nn_4x8_hor_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_4x8_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_4x8_hor_layer0_weights, // weights + av1_tx_type_nn_4x8_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x8_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x8_hor_layer1_weights, + av1_tx_type_nn_4x8_hor_layer1_bias, + NONE, + av1_tx_type_nn_4x8_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x8_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_4x8_ver_layer0_weights[128] = { + -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f, + -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f, + -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f, + 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f, + 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f, + 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f, + -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f, + -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f, + 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f, + -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f, + -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f, + -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f, + 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f, + 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f, + -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f, + -0.09273f, 1.04249f, 0.79235f, 
1.13229f, 0.99617f, 0.03851f, 0.56334f, + 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f, + -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f, + -0.21958f, 0.05970f, +}; + +static float av1_tx_type_nn_4x8_ver_layer0_bias[16] = { + 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f, + 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f, + 0.08288f, 0.18195f, -0.79890f, 0.10047f, +}; + +static float av1_tx_type_nn_4x8_ver_layer1_weights[64] = { + -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f, + -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f, + -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f, + -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f, + 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f, + 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f, + -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f, + -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f, + -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f, + -1.01848f, +}; + +static float av1_tx_type_nn_4x8_ver_layer1_bias[4] = { + -1.45955f, + -2.08949f, + -1.24813f, + -1.55368f, +}; + +static float av1_tx_type_nn_4x8_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_4x8_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_4x8_ver_layer0_weights, // weights + av1_tx_type_nn_4x8_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x8_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x8_ver_layer1_weights, + av1_tx_type_nn_4x8_ver_layer1_bias, + NONE, + av1_tx_type_nn_4x8_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x8_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +/******************************************************************************/ + +// Tx type model for 8x4 block. 
+static float av1_tx_type_nn_8x4_hor_layer0_weights[128] = { + -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f, + 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f, + -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f, + -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f, + -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f, + 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f, + 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f, + -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f, + -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f, + 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f, + 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f, + -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f, + -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f, + 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f, + 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f, + 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f, + -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f, + -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f, + -1.85523f, 0.92532f, +}; + +static float av1_tx_type_nn_8x4_hor_layer0_bias[16] = { + 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f, + -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f, + -0.28958f, -0.32869f, -0.01704f, 0.68171f, +}; + +static float av1_tx_type_nn_8x4_hor_layer1_weights[64] = { + -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f, + -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f, + 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f, + -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f, + 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f, + -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f, + -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f, + 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f, + 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f, + -1.10654f, +}; + +static float av1_tx_type_nn_8x4_hor_layer1_bias[4] = { + -0.92861f, + -1.45151f, + -1.33588f, + -4.33853f, +}; + +static float av1_tx_type_nn_8x4_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x4_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x4_hor_layer0_weights, // weights + av1_tx_type_nn_8x4_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x4_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x4_hor_layer1_weights, + av1_tx_type_nn_8x4_hor_layer1_bias, + NONE, + av1_tx_type_nn_8x4_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x4_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_8x4_ver_layer0_weights[32] = { + -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f, + -1.94208f, 
-2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f, + -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f, + -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f, + 1.66212f, 1.70826f, 1.55182f, 0.12230f, +}; + +static float av1_tx_type_nn_8x4_ver_layer0_bias[8] = { + 0.10943f, 2.09789f, 2.16578f, 0.15766f, + -0.42461f, 0.00000f, 1.22090f, -1.28717f, +}; + +static float av1_tx_type_nn_8x4_ver_layer1_weights[32] = { + 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f, + 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f, + 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f, + -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f, + -1.15005f, -0.39311f, 1.51236f, -1.68973f, +}; + +static float av1_tx_type_nn_8x4_ver_layer1_bias[4] = { + 1.81013f, + 1.10517f, + 2.90059f, + 0.95391f, +}; + +static float av1_tx_type_nn_8x4_ver_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_8x4_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_8x4_ver_layer0_weights, // weights + av1_tx_type_nn_8x4_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x4_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x4_ver_layer1_weights, + av1_tx_type_nn_8x4_ver_layer1_bias, + NONE, + av1_tx_type_nn_8x4_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x4_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 8x8 block. 
+static float av1_tx_type_nn_8x8_hor_layer0_weights[128] = { + -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f, + -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f, + 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f, + 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f, + -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f, + -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f, + -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f, + 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f, + 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f, + -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f, + 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f, + -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f, + 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f, + 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f, + 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f, + 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f, + 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f, + 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f, + -0.99892f, 1.09823f, +}; + +static float av1_tx_type_nn_8x8_hor_layer0_bias[16] = { + -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f, + -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f, + -0.26319f, 2.65579f, -1.30137f, -0.01487f, +}; + +static float av1_tx_type_nn_8x8_hor_layer1_weights[64] = { + -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f, + -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f, + 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f, + 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f, + 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f, + -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f, + 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f, + 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f, + 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f, + 0.06161f, +}; + +static float av1_tx_type_nn_8x8_hor_layer1_bias[4] = { + 1.70385f, + 1.82373f, + 1.78496f, + 1.80826f, +}; + +static float av1_tx_type_nn_8x8_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x8_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x8_hor_layer0_weights, // weights + av1_tx_type_nn_8x8_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x8_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x8_hor_layer1_weights, + av1_tx_type_nn_8x8_hor_layer1_bias, + NONE, + av1_tx_type_nn_8x8_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x8_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_8x8_ver_layer0_weights[128] = { + -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f, + 2.09681f, 
-0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f, + -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f, + -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f, + 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f, + 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f, + 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f, + -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f, + -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f, + 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f, + 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f, + -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f, + 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f, + 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f, + -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f, + 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f, + -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f, + -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f, + -1.29848f, 0.39308f, +}; + +static float av1_tx_type_nn_8x8_ver_layer0_bias[16] = { + -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f, + 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f, + 0.83015f, 0.06024f, 1.17180f, 0.65122f, +}; + +static float av1_tx_type_nn_8x8_ver_layer1_weights[64] = { + -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f, + 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f, + 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f, + 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f, + 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f, + 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f, + 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f, + 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f, + -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f, + -0.41305f, +}; + +static float av1_tx_type_nn_8x8_ver_layer1_bias[4] = { + 2.14067f, + 2.76699f, + 2.04233f, + 1.34803f, +}; + +static float av1_tx_type_nn_8x8_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x8_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x8_ver_layer0_weights, // weights + av1_tx_type_nn_8x8_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x8_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x8_ver_layer1_weights, + av1_tx_type_nn_8x8_ver_layer1_bias, + NONE, + av1_tx_type_nn_8x8_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x8_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 8x16 block. 
+static float av1_tx_type_nn_8x16_hor_layer0_weights[128] = { + -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f, + 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f, + -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f, + 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f, + -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f, + 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f, + -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f, + 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f, + -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f, + -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f, + 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f, + 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f, + -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f, + 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f, + -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f, + 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f, + 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f, + -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f, + -0.28136f, 0.42556f, +}; + +static float av1_tx_type_nn_8x16_hor_layer0_bias[16] = { + 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f, + -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f, + 1.81560f, -1.02643f, -0.81690f, 0.08302f, +}; + +static float av1_tx_type_nn_8x16_hor_layer1_weights[64] = { + 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f, + -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f, + 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f, + -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f, + 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f, + 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f, + 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f, + 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f, + 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f, + -1.31243f, +}; + +static float av1_tx_type_nn_8x16_hor_layer1_bias[4] = { + 0.83359f, + 1.06875f, + 1.77645f, + 1.49570f, +}; + +static float av1_tx_type_nn_8x16_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x16_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x16_hor_layer0_weights, // weights + av1_tx_type_nn_8x16_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x16_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x16_hor_layer1_weights, + av1_tx_type_nn_8x16_hor_layer1_bias, + NONE, + av1_tx_type_nn_8x16_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x16_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_8x16_ver_layer0_weights[128] = { + 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f, + -0.05725f, -0.05659f, 
0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f, + -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f, + 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f, + -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f, + 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f, + 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f, + 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f, + -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f, + -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f, + 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f, + 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f, + -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f, + -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f, + -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f, + -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f, + -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f, + 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f, + -0.12236f, 0.16075f, +}; + +static float av1_tx_type_nn_8x16_ver_layer0_bias[16] = { + -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f, + -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f, + 0.57598f, 0.99819f, 0.75175f, 0.17044f, +}; + +static float av1_tx_type_nn_8x16_ver_layer1_weights[64] = { + -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f, + 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f, + -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f, + 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f, + -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f, + -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f, + -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f, + 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f, + 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f, + 2.20547f, +}; + +static float av1_tx_type_nn_8x16_ver_layer1_bias[4] = { + -0.44080f, + -1.67455f, + -1.46332f, + -6.13206f, +}; + +static float av1_tx_type_nn_8x16_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x16_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x16_ver_layer0_weights, // weights + av1_tx_type_nn_8x16_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x16_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x16_ver_layer1_weights, + av1_tx_type_nn_8x16_ver_layer1_bias, + NONE, + av1_tx_type_nn_8x16_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x16_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 16x8 block. 
+static float av1_tx_type_nn_16x8_hor_layer0_weights[128] = { + 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f, + -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f, + -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f, + 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f, + 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f, + 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f, + 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f, + -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f, + -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f, + -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f, + 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f, + -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f, + -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f, + -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f, + 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f, + -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f, + -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f, + 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f, + -0.36570f, -0.50757f, +}; + +static float av1_tx_type_nn_16x8_hor_layer0_bias[16] = { + -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f, + 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f, + -0.12329f, 0.08986f, 1.08117f, -0.00220f, +}; + +static float av1_tx_type_nn_16x8_hor_layer1_weights[64] = { + 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f, + 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f, + -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f, + -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f, + -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f, + -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f, + 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f, + 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f, + 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f, + -0.23347f, +}; + +static float av1_tx_type_nn_16x8_hor_layer1_bias[4] = { + 3.57175f, + 2.42612f, + 3.31259f, + 2.08287f, +}; + +static float av1_tx_type_nn_16x8_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_16x8_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_16x8_hor_layer0_weights, // weights + av1_tx_type_nn_16x8_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x8_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x8_hor_layer1_weights, + av1_tx_type_nn_16x8_hor_layer1_bias, + NONE, + av1_tx_type_nn_16x8_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x8_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_16x8_ver_layer0_weights[128] = { + 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f, + 
0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f, + -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f, + 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f, + 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f, + -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f, + 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f, + -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f, + 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f, + 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f, + 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f, + -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f, + -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f, + -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f, + 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f, + 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f, + -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f, + -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f, + -0.81945f, -0.41647f, +}; + +static float av1_tx_type_nn_16x8_ver_layer0_bias[16] = { + 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f, + 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f, + -0.04510f, 0.48000f, -0.09354f, -0.42422f, +}; + +static float av1_tx_type_nn_16x8_ver_layer1_weights[64] = { + 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f, + -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f, + 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f, + -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f, + -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f, + 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f, + 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f, + -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f, + 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f, + -0.00873f, +}; + +static float av1_tx_type_nn_16x8_ver_layer1_bias[4] = { + 3.34981f, + 3.74710f, + 1.38339f, + 0.45176f, +}; + +static float av1_tx_type_nn_16x8_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_16x8_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_16x8_ver_layer0_weights, // weights + av1_tx_type_nn_16x8_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x8_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x8_ver_layer1_weights, + av1_tx_type_nn_16x8_ver_layer1_bias, + NONE, + av1_tx_type_nn_16x8_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x8_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 16x16 block. 
+static float av1_tx_type_nn_16x16_layer0_weights[128] = { + 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f, + 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f, + -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f, + -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f, + 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f, + 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f, + 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f, + 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f, + -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f, + 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f, + 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f, + 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f, + -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f, + 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f, + 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f, + -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f, + -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f, + 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f, + 0.50355f, 0.08592f, +}; + +static float av1_tx_type_nn_16x16_layer0_bias[16] = { + -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f, + -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f, + -0.14062f, -0.42120f, 0.94573f, -0.09287f, +}; + +static float av1_tx_type_nn_16x16_layer1_weights[64] = { + -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f, + 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f, + 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f, + 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f, + 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f, + 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f, + -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f, + 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f, + -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f, + 1.08829f, +}; + +static float av1_tx_type_nn_16x16_layer1_bias[4] = { + 0.81986f, + 1.26865f, + 0.11118f, + 2.48404f, +}; + +static float av1_tx_type_nn_16x16_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_16x16_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x16 = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_16x16_layer0_weights, // weights + av1_tx_type_nn_16x16_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x16_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x16_layer1_weights, + av1_tx_type_nn_16x16_layer1_bias, + NONE, + av1_tx_type_nn_16x16_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x16_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 4x16 block. 
+static float av1_tx_type_nn_4x16_hor_layer0_weights[32] = { + 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f, + 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f, + 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f, + 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f, + -1.74563f, -0.88830f, -1.77603f, 2.15935f, +}; + +static float av1_tx_type_nn_4x16_hor_layer0_bias[8] = { + -0.36435f, -2.22731f, -0.00837f, -1.34546f, + 0.62806f, -0.20675f, 4.91940f, -0.56079f, +}; + +static float av1_tx_type_nn_4x16_hor_layer1_weights[32] = { + -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f, + -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f, + 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f, + 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f, + 1.28413f, -0.30326f, 2.45329f, -0.83335f, +}; + +static float av1_tx_type_nn_4x16_hor_layer1_bias[4] = { + 2.33198f, + 3.36245f, + 1.62603f, + 2.91056f, +}; + +static float av1_tx_type_nn_4x16_hor_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_4x16_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_4x16_hor_layer0_weights, // weights + av1_tx_type_nn_4x16_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x16_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x16_hor_layer1_weights, + av1_tx_type_nn_4x16_hor_layer1_bias, + NONE, + av1_tx_type_nn_4x16_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x16_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_4x16_ver_layer0_weights[128] = { + 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f, + 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f, + -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f, + -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f, + -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f, + -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f, + 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f, + 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f, + 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f, + -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f, + -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f, + 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f, + 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f, + 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f, + 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f, + -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f, + 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f, + 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f, + -0.27975f, -0.01149f, +}; + +static float av1_tx_type_nn_4x16_ver_layer0_bias[16] = { + -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f, + -0.55822f, 1.04845f, -0.17662f, 
-1.25345f, -0.11927f, 0.49845f, + -0.32530f, 0.73483f, 0.08322f, -0.23890f, +}; + +static float av1_tx_type_nn_4x16_ver_layer1_weights[64] = { + 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f, + -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f, + 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f, + -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f, + 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f, + -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f, + 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f, + 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f, + -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f, + -0.56513f, +}; + +static float av1_tx_type_nn_4x16_ver_layer1_bias[4] = { + 4.60896f, + 4.53551f, + 4.53124f, + 4.27435f, +}; + +static float av1_tx_type_nn_4x16_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_4x16_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_4x16_ver_layer0_weights, // weights + av1_tx_type_nn_4x16_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x16_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x16_ver_layer1_weights, + av1_tx_type_nn_4x16_ver_layer1_bias, + NONE, + av1_tx_type_nn_4x16_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x16_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 16x4 block. 
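+// The V2 configs end in SOFTMAX_CROSS_ENTROPY, so the four logits left in a
+// *_layer1_out buffer become tx type probabilities through a softmax. A
+// hedged sketch (max-subtracted for numerical stability; expf() comes from
+// <math.h>; this is not the library routine):
+static void softmax4_sketch(const float *logits, float *prob) {
+  float max_val = logits[0];
+  for (int i = 1; i < 4; ++i)
+    if (logits[i] > max_val) max_val = logits[i];
+  float sum = 0.0f;
+  for (int i = 0; i < 4; ++i) {
+    prob[i] = expf(logits[i] - max_val);
+    sum += prob[i];
+  }
+  for (int i = 0; i < 4; ++i) prob[i] /= sum;
+}
+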
+static float av1_tx_type_nn_16x4_hor_layer0_weights[128] = { + 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f, + 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f, + -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f, + -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f, + -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f, + -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f, + 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f, + 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f, + 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f, + -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f, + 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f, + -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f, + 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f, + -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f, + -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f, + -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f, + 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f, + 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f, + 0.19055f, -1.56413f, +}; + +static float av1_tx_type_nn_16x4_hor_layer0_bias[16] = { + -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f, + 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f, + 1.14048f, 0.33308f, -1.10886f, 0.41184f, +}; + +static float av1_tx_type_nn_16x4_hor_layer1_weights[64] = { + -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f, + 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f, + -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f, + -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f, + 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f, + -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f, + -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f, + 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f, + 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f, + -0.43819f, +}; + +static float av1_tx_type_nn_16x4_hor_layer1_bias[4] = { + 2.32575f, + 2.75703f, + 1.12304f, + 2.15567f, +}; + +static float av1_tx_type_nn_16x4_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_16x4_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_16x4_hor_layer0_weights, // weights + av1_tx_type_nn_16x4_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x4_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x4_hor_layer1_weights, + av1_tx_type_nn_16x4_hor_layer1_bias, + NONE, + av1_tx_type_nn_16x4_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x4_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_16x4_ver_layer0_weights[32] = { + 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f, + 0.46375f, 1.47951f, 
1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f, + -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f, + -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f, + -0.17967f, -0.96622f, 0.42635f, -1.04784f, +}; + +static float av1_tx_type_nn_16x4_ver_layer0_bias[8] = { + -0.52088f, 0.52844f, -1.03655f, -0.30974f, + 2.59952f, -1.93604f, 0.00000f, 2.51787f, +}; + +static float av1_tx_type_nn_16x4_ver_layer1_weights[32] = { + 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f, + 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f, + 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f, + -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f, + 1.26814f, -1.93873f, -0.00768f, 1.58309f, +}; + +static float av1_tx_type_nn_16x4_ver_layer1_bias[4] = { + 2.34713f, + 1.68667f, + 1.25488f, + 1.69812f, +}; + +static float av1_tx_type_nn_16x4_ver_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_16x4_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_16x4_ver_layer0_weights, // weights + av1_tx_type_nn_16x4_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x4_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x4_ver_layer1_weights, + av1_tx_type_nn_16x4_ver_layer1_bias, + NONE, + av1_tx_type_nn_16x4_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x4_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Map tx_size to its corresponding neural net model for tx type prediction. 
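+// The two maps below are indexed directly by the TX_SIZE enum (the per-entry
+// comments follow that order), so a lookup is simply
+//   NN_CONFIG_V2 *model = av1_tx_type_nnconfig_map_hor[tx_size];
+// A NULL entry means no model was trained for that size, so NN-based tx type
+// pruning is not applied there.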
+static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_hor[] = { + &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform + &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform + &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform + &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform + &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform + &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; + +static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_ver[] = { + &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform + &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform + &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform + &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform + &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform + &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; +#else +/******************************CONFIG_NN***************************************/ +// Tx type model for 4x4 block. +static const float av1_tx_type_nn_weights_4x4_hor_layer0[32] = { + -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f, + 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f, + -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f, + 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f, + 1.35792f, 0.27733f, 0.88660f, -0.68304f, +}; + +static const float av1_tx_type_nn_bias_4x4_hor_layer0[8] = { + 1.38742f, 0.59540f, -1.37622f, 1.92114f, + 0.00000f, -0.38998f, -0.32726f, -0.15650f, +}; + +static const float av1_tx_type_nn_weights_4x4_hor_layer1[32] = { + 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f, + -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f, + -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f, + 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f, + -0.26782f, -0.65416f, -0.10648f, 0.05568f, +}; + +static const float av1_tx_type_nn_bias_4x4_hor_layer1[4] = { + 4.07177f, + 3.26961f, + 0.58083f, + 1.21199f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x4_hor = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x4_hor_layer0, + av1_tx_type_nn_weights_4x4_hor_layer1 }, + { av1_tx_type_nn_bias_4x4_hor_layer0, av1_tx_type_nn_bias_4x4_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_4x4_ver_layer0[32] = { + -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f, + 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f, + 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f, + 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f, + -0.06589f, -0.28142f, -0.33118f, 1.72227f, +}; + 
+static const float av1_tx_type_nn_bias_4x4_ver_layer0[8] = { + -0.33685f, 0.22025f, 0.28140f, 0.56138f, + 0.93489f, -1.77048f, 1.34989f, -0.93747f, +}; + +static const float av1_tx_type_nn_weights_4x4_ver_layer1[32] = { + -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f, + 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f, + -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f, + -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f, + -0.86315f, -0.53336f, 0.30320f, -1.32331f, +}; + +static const float av1_tx_type_nn_bias_4x4_ver_layer1[4] = { + -1.31519f, + -3.26321f, + 1.71794f, + -1.90778f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x4_ver = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x4_ver_layer0, + av1_tx_type_nn_weights_4x4_ver_layer1 }, + { av1_tx_type_nn_bias_4x4_ver_layer0, av1_tx_type_nn_bias_4x4_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 4x8 block. +static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = { + 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f, + 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f, + -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f, + -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f, + -1.35896f, -1.17121f, 1.68866f, 0.10357f, +}; + +static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = { + 2.93391f, 0.66831f, -0.21419f, 0.00000f, + -0.72878f, 0.15127f, -1.46755f, 0.16658f, +}; + +static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = { + -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f, + -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f, + 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f, + 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f, + -0.50191f, 0.18219f, 1.83664f, -0.75276f, +}; + +static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = { + -1.17455f, + -2.26089f, + -1.79863f, + -2.26333f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x8_hor_layer0, + av1_tx_type_nn_weights_4x8_hor_layer1 }, + { av1_tx_type_nn_bias_4x8_hor_layer0, av1_tx_type_nn_bias_4x8_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = { + -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f, + -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f, + -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f, + 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f, + 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f, + 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f, + -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f, + -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f, + 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f, + -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f, + -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f, + -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f, + 0.42090f, 
0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f, + 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f, + -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f, + -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f, + 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f, + -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f, + -0.21958f, 0.05970f, +}; + +static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = { + 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f, + 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f, + 0.08288f, 0.18195f, -0.79890f, 0.10047f, +}; + +static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = { + -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f, + -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f, + -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f, + -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f, + 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f, + 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f, + -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f, + -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f, + -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f, + -1.01848f, +}; + +static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = { + -1.45955f, + -2.08949f, + -1.24813f, + -1.55368f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x8_ver_layer0, + av1_tx_type_nn_weights_4x8_ver_layer1 }, + { av1_tx_type_nn_bias_4x8_ver_layer0, av1_tx_type_nn_bias_4x8_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 8x4 block. 
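+// In this CONFIG_NN branch the models use the flat NN_CONFIG layout: one
+// weights/bias pointer pair per layer plus num_hidden_nodes[]. The tables
+// are consumed by the encoder's generic NN predictor; the sketch below
+// mirrors that computation for the single-hidden-layer configs in this file,
+// under the assumption of a row-major [node][input] weight layout (RELU on
+// the hidden layer, linear output logits):
+static void nn_predict_2layer_sketch(const float *input, const NN_CONFIG *cfg,
+                                     float *output) {
+  float hidden[128];  // assumed upper bound on hidden nodes for these models
+  const int nh = cfg->num_hidden_nodes[0];
+  for (int i = 0; i < nh; ++i) {
+    float acc = cfg->bias[0][i];
+    for (int j = 0; j < cfg->num_inputs; ++j)
+      acc += cfg->weights[0][i * cfg->num_inputs + j] * input[j];
+    hidden[i] = acc > 0.0f ? acc : 0.0f;
+  }
+  for (int i = 0; i < cfg->num_outputs; ++i) {
+    float acc = cfg->bias[1][i];
+    for (int j = 0; j < nh; ++j)
+      acc += cfg->weights[1][i * nh + j] * hidden[j];
+    output[i] = acc;
+  }
+}
+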
+static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = { + -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f, + 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f, + -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f, + -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f, + -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f, + 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f, + 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f, + -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f, + -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f, + 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f, + 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f, + -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f, + -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f, + 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f, + 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f, + 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f, + -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f, + -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f, + -1.85523f, 0.92532f, +}; + +static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = { + 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f, + -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f, + -0.28958f, -0.32869f, -0.01704f, 0.68171f, +}; + +static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = { + -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f, + -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f, + 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f, + -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f, + 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f, + -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f, + -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f, + 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f, + 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f, + -1.10654f, +}; + +static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = { + -0.92861f, + -1.45151f, + -1.33588f, + -4.33853f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x4_hor_layer0, + av1_tx_type_nn_weights_8x4_hor_layer1 }, + { av1_tx_type_nn_bias_8x4_hor_layer0, av1_tx_type_nn_bias_8x4_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = { + -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f, + -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f, + -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f, + -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f, + 1.66212f, 1.70826f, 1.55182f, 0.12230f, +}; + +static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = { + 0.10943f, 2.09789f, 2.16578f, 0.15766f, + -0.42461f, 0.00000f, 1.22090f, -1.28717f, +}; + +static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = { + 
1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f, + 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f, + 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f, + -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f, + -1.15005f, -0.39311f, 1.51236f, -1.68973f, +}; + +static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = { + 1.81013f, + 1.10517f, + 2.90059f, + 0.95391f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x4_ver_layer0, + av1_tx_type_nn_weights_8x4_ver_layer1 }, + { av1_tx_type_nn_bias_8x4_ver_layer0, av1_tx_type_nn_bias_8x4_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 8x8 block. +static const float av1_tx_type_nn_weights_8x8_hor_layer0[128] = { + -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f, + -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f, + 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f, + 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f, + -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f, + -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f, + -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f, + 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f, + 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f, + -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f, + 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f, + -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f, + 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f, + 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f, + 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f, + 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f, + 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f, + 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f, + -0.99892f, 1.09823f, +}; + +static const float av1_tx_type_nn_bias_8x8_hor_layer0[16] = { + -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f, + -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f, + -0.26319f, 2.65579f, -1.30137f, -0.01487f, +}; + +static const float av1_tx_type_nn_weights_8x8_hor_layer1[64] = { + -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f, + -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f, + 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f, + 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f, + 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f, + -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f, + 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f, + 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f, + 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f, + 0.06161f, +}; + +static const float av1_tx_type_nn_bias_8x8_hor_layer1[4] = { + 1.70385f, + 1.82373f, + 1.78496f, + 1.80826f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x8_hor = { + 8, // 
num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x8_hor_layer0, + av1_tx_type_nn_weights_8x8_hor_layer1 }, + { av1_tx_type_nn_bias_8x8_hor_layer0, av1_tx_type_nn_bias_8x8_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_8x8_ver_layer0[128] = { + -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f, + 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f, + -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f, + -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f, + 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f, + 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f, + 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f, + -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f, + -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f, + 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f, + 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f, + -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f, + 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f, + 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f, + -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f, + 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f, + -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f, + -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f, + -1.29848f, 0.39308f, +}; + +static const float av1_tx_type_nn_bias_8x8_ver_layer0[16] = { + -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f, + 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f, + 0.83015f, 0.06024f, 1.17180f, 0.65122f, +}; + +static const float av1_tx_type_nn_weights_8x8_ver_layer1[64] = { + -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f, + 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f, + 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f, + 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f, + 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f, + 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f, + 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f, + 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f, + -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f, + -0.41305f, +}; + +static const float av1_tx_type_nn_bias_8x8_ver_layer1[4] = { + 2.14067f, + 2.76699f, + 2.04233f, + 1.34803f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x8_ver_layer0, + av1_tx_type_nn_weights_8x8_ver_layer1 }, + { av1_tx_type_nn_bias_8x8_ver_layer0, av1_tx_type_nn_bias_8x8_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 8x16 block. 
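+// Each hor/ver pair scores the four 1D transform kernels along its own
+// direction. Assuming the outputs correspond to the AV1 1D kernels (DCT,
+// ADST, FLIPADST, IDTX) and the two directions are treated as independent,
+// 2D tx type scores can be formed as a product of the per-direction
+// probabilities; a hedged sketch of that combination:
+static void combine_hv_probs_sketch(const float *prob_hor,
+                                    const float *prob_ver, float score[16]) {
+  for (int v = 0; v < 4; ++v)
+    for (int h = 0; h < 4; ++h)
+      score[v * 4 + h] = prob_ver[v] * prob_hor[h];
+}
+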
+static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = { + -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f, + 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f, + -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f, + 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f, + -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f, + 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f, + -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f, + 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f, + -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f, + -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f, + 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f, + 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f, + -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f, + 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f, + -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f, + 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f, + 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f, + -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f, + -0.28136f, 0.42556f, +}; + +static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = { + 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f, + -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f, + 1.81560f, -1.02643f, -0.81690f, 0.08302f, +}; + +static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = { + 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f, + -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f, + 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f, + -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f, + 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f, + 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f, + 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f, + 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f, + 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f, + -1.31243f, +}; + +static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = { + 0.83359f, + 1.06875f, + 1.77645f, + 1.49570f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x16_hor_layer0, + av1_tx_type_nn_weights_8x16_hor_layer1 }, + { av1_tx_type_nn_bias_8x16_hor_layer0, av1_tx_type_nn_bias_8x16_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = { + 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f, + -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f, + -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f, + 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f, + -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f, + 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f, + 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f, + 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 
0.17726f, + -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f, + -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f, + 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f, + 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f, + -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f, + -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f, + -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f, + -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f, + -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f, + 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f, + -0.12236f, 0.16075f, +}; + +static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = { + -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f, + -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f, + 0.57598f, 0.99819f, 0.75175f, 0.17044f, +}; + +static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = { + -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f, + 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f, + -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f, + 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f, + -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f, + -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f, + -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f, + 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f, + 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f, + 2.20547f, +}; + +static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = { + -0.44080f, + -1.67455f, + -1.46332f, + -6.13206f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x16_ver_layer0, + av1_tx_type_nn_weights_8x16_ver_layer1 }, + { av1_tx_type_nn_bias_8x16_ver_layer0, av1_tx_type_nn_bias_8x16_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 16x8 block. 
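+// Size bookkeeping for these tables: a layer0 weight array holds
+// num_inputs * num_hidden_nodes entries (8 * 16 = 128 for the larger models,
+// 4 * 8 = 32 for the 4-input ones) and a layer1 array holds
+// num_hidden_nodes * num_outputs entries (16 * 4 = 64 or 8 * 4 = 32).
+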
+static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = { + 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f, + -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f, + -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f, + 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f, + 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f, + 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f, + 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f, + -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f, + -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f, + -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f, + 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f, + -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f, + -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f, + -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f, + 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f, + -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f, + -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f, + 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f, + -0.36570f, -0.50757f, +}; + +static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = { + -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f, + 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f, + -0.12329f, 0.08986f, 1.08117f, -0.00220f, +}; + +static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = { + 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f, + 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f, + -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f, + -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f, + -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f, + -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f, + 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f, + 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f, + 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f, + -0.23347f, +}; + +static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = { + 3.57175f, + 2.42612f, + 3.31259f, + 2.08287f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_16x8_hor_layer0, + av1_tx_type_nn_weights_16x8_hor_layer1 }, + { av1_tx_type_nn_bias_16x8_hor_layer0, av1_tx_type_nn_bias_16x8_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = { + 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f, + 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f, + -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f, + 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f, + 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f, + -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f, + 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f, + -0.62756f, -0.22502f, -0.17215f, 0.01062f, 
0.27049f, -0.10748f, 0.30945f, + 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f, + 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f, + 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f, + -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f, + -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f, + -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f, + 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f, + 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f, + -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f, + -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f, + -0.81945f, -0.41647f, +}; + +static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = { + 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f, + 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f, + -0.04510f, 0.48000f, -0.09354f, -0.42422f, +}; + +static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = { + 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f, + -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f, + 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f, + -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f, + -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f, + 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f, + 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f, + -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f, + 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f, + -0.00873f, +}; + +static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = { + 3.34981f, + 3.74710f, + 1.38339f, + 0.45176f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_16x8_ver_layer0, + av1_tx_type_nn_weights_16x8_ver_layer1 }, + { av1_tx_type_nn_bias_16x8_ver_layer0, av1_tx_type_nn_bias_16x8_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 16x16 block. 
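+// The square 16x16 block gets a single model rather than hor/ver variants;
+// both direction maps reference this one config.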
+static const float av1_tx_type_nn_weights_16x16_layer0[128] = { + 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f, + 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f, + -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f, + -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f, + 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f, + 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f, + 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f, + 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f, + -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f, + 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f, + 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f, + 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f, + -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f, + 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f, + 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f, + -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f, + -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f, + 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f, + 0.50355f, 0.08592f, +}; + +static const float av1_tx_type_nn_bias_16x16_layer0[16] = { + -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f, + -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f, + -0.14062f, -0.42120f, 0.94573f, -0.09287f, +}; + +static const float av1_tx_type_nn_weights_16x16_layer1[64] = { + -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f, + 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f, + 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f, + 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f, + 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f, + 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f, + -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f, + 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f, + -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f, + 1.08829f, +}; + +static const float av1_tx_type_nn_bias_16x16_layer1[4] = { + 0.81986f, + 1.26865f, + 0.11118f, + 2.48404f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x16 = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_16x16_layer0, + av1_tx_type_nn_weights_16x16_layer1, + }, + { + av1_tx_type_nn_bias_16x16_layer0, + av1_tx_type_nn_bias_16x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 4x16 block. 
+static const float av1_tx_type_nn_weights_4x16_hor_layer0[32] = { + 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f, + 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f, + 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f, + 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f, + -1.74563f, -0.88830f, -1.77603f, 2.15935f, +}; + +static const float av1_tx_type_nn_bias_4x16_hor_layer0[8] = { + -0.36435f, -2.22731f, -0.00837f, -1.34546f, + 0.62806f, -0.20675f, 4.91940f, -0.56079f, +}; + +static const float av1_tx_type_nn_weights_4x16_hor_layer1[32] = { + -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f, + -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f, + 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f, + 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f, + 1.28413f, -0.30326f, 2.45329f, -0.83335f, +}; + +static const float av1_tx_type_nn_bias_4x16_hor_layer1[4] = { + 2.33198f, + 3.36245f, + 1.62603f, + 2.91056f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x16_hor = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x16_hor_layer0, + av1_tx_type_nn_weights_4x16_hor_layer1 }, + { av1_tx_type_nn_bias_4x16_hor_layer0, av1_tx_type_nn_bias_4x16_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_4x16_ver_layer0[128] = { + 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f, + 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f, + -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f, + -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f, + -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f, + -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f, + 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f, + 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f, + 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f, + -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f, + -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f, + 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f, + 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f, + 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f, + 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f, + -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f, + 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f, + 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f, + -0.27975f, -0.01149f, +}; + +static const float av1_tx_type_nn_bias_4x16_ver_layer0[16] = { + -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f, + -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f, + -0.32530f, 0.73483f, 0.08322f, -0.23890f, +}; + +static const float av1_tx_type_nn_weights_4x16_ver_layer1[64] = { + 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f, + -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f, + 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f, + -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f, + 0.39077f, -0.75448f, 0.31698f, 
-0.76187f, 0.97765f, 0.57052f, 0.55825f, + -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f, + 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f, + 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f, + -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f, + -0.56513f, +}; + +static const float av1_tx_type_nn_bias_4x16_ver_layer1[4] = { + 4.60896f, + 4.53551f, + 4.53124f, + 4.27435f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x16_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x16_ver_layer0, + av1_tx_type_nn_weights_4x16_ver_layer1 }, + { av1_tx_type_nn_bias_4x16_ver_layer0, av1_tx_type_nn_bias_4x16_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 16x4 block. +static const float av1_tx_type_nn_weights_16x4_hor_layer0[128] = { + 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f, + 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f, + -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f, + -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f, + -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f, + -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f, + 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f, + 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f, + 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f, + -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f, + 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f, + -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f, + 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f, + -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f, + -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f, + -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f, + 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f, + 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f, + 0.19055f, -1.56413f, +}; + +static const float av1_tx_type_nn_bias_16x4_hor_layer0[16] = { + -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f, + 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f, + 1.14048f, 0.33308f, -1.10886f, 0.41184f, +}; + +static const float av1_tx_type_nn_weights_16x4_hor_layer1[64] = { + -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f, + 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f, + -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f, + -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f, + 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f, + -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f, + -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f, + 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f, + 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f, + -0.43819f, +}; + +static const float av1_tx_type_nn_bias_16x4_hor_layer1[4] = { + 2.32575f, + 2.75703f, + 1.12304f, + 2.15567f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x4_hor = 
{ + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_16x4_hor_layer0, + av1_tx_type_nn_weights_16x4_hor_layer1 }, + { av1_tx_type_nn_bias_16x4_hor_layer0, av1_tx_type_nn_bias_16x4_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_16x4_ver_layer0[32] = { + 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f, + 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f, + -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f, + -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f, + -0.17967f, -0.96622f, 0.42635f, -1.04784f, +}; + +static const float av1_tx_type_nn_bias_16x4_ver_layer0[8] = { + -0.52088f, 0.52844f, -1.03655f, -0.30974f, + 2.59952f, -1.93604f, 0.00000f, 2.51787f, +}; + +static const float av1_tx_type_nn_weights_16x4_ver_layer1[32] = { + 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f, + 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f, + 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f, + -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f, + 1.26814f, -1.93873f, -0.00768f, 1.58309f, +}; + +static const float av1_tx_type_nn_bias_16x4_ver_layer1[4] = { + 2.34713f, + 1.68667f, + 1.25488f, + 1.69812f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x4_ver = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_16x4_ver_layer0, + av1_tx_type_nn_weights_16x4_ver_layer1 }, + { av1_tx_type_nn_bias_16x4_ver_layer0, av1_tx_type_nn_bias_16x4_ver_layer1 } +}; +/******************************************************************************/ + +// Map tx_size to its corresponding neural net model for tx type prediction. +static const NN_CONFIG *av1_tx_type_nnconfig_map_hor[] = { + &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform + &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform + &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform + &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform + &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform + &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; + +static const NN_CONFIG *av1_tx_type_nnconfig_map_ver[] = { + &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform + &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform + &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform + &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform + &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform + &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; +#endif // CONFIG_NN_V2 + +// Tx split model for 4x8 block. 
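+// Unlike the four-output tx type models above, each tx split model below
+// produces a single logit scoring whether the block should be split into
+// smaller transform units. A hedged decision sketch (thresholding the raw
+// logit at zero, which is equivalent to sigmoid(logit) > 0.5; the encoder's
+// actual thresholds may differ):
+static int tx_split_decision_sketch(float logit) {
+  return logit > 0.0f;  // 1: try splitting, 0: keep the current tx size
+}
+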
+static const float av1_tx_split_nn_weights_4x8_layer0[8 * 16] = { + 0.068650f, -0.732073f, -0.040361f, 0.322550f, -0.021123f, 0.212518f, + -0.350546f, 0.435987f, -0.111756f, -0.401568f, 0.069548f, -0.313000f, + 0.073918f, -0.373805f, -0.775810f, -0.124753f, 0.181094f, -0.602641f, + -0.026219f, -0.350112f, 0.020599f, -0.311752f, -0.476482f, -0.669465f, + -0.310921f, 0.348869f, -0.115984f, 0.154250f, 0.200485f, -0.016689f, + 0.020392f, 0.413810f, 0.634064f, -0.627530f, 0.399178f, -0.012284f, + 0.472030f, 0.091087f, -0.706100f, -0.447944f, -0.274226f, 0.445656f, + 0.309339f, 0.505522f, 0.038496f, -0.152809f, 0.408684f, -0.068151f, + 0.271612f, 0.353233f, -0.150365f, 0.075212f, -0.035096f, 0.346615f, + 0.124382f, 0.477072f, 0.216288f, 0.070548f, -0.106362f, 0.681613f, + -0.145502f, -0.218631f, -0.099248f, -0.001983f, -0.196819f, -0.969045f, + 0.063009f, -0.123053f, 0.104875f, -0.137581f, -0.282933f, -0.003624f, + -0.315659f, -0.333523f, -0.503000f, -0.100063f, -0.536711f, -0.059978f, + -0.670248f, -0.353762f, 0.181109f, 0.289715f, -0.071206f, 0.261141f, + 0.052796f, -0.114554f, -0.139214f, -0.261380f, 0.075984f, -0.647925f, + -0.099528f, -0.677814f, 0.015712f, -0.389385f, -0.095622f, -0.165117f, + -0.109454f, -0.175240f, -0.393914f, 0.212330f, 0.037822f, 0.248280f, + 0.180197f, 0.110493f, -0.525727f, -0.092329f, -0.524029f, -0.407364f, + -0.542373f, -0.435626f, -0.912194f, 0.062794f, 0.160433f, 0.741485f, + -0.103659f, -0.119327f, -0.055275f, 0.334358f, 0.014713f, 0.046327f, + 0.831114f, -0.576682f, 0.354369f, -0.082088f, 0.452331f, 0.039730f, + -0.792429f, -0.385862f, +}; + +static const float av1_tx_split_nn_bias_4x8_layer0[16] = { + 0.238621f, 2.186830f, 1.383035f, -0.867139f, 1.257119f, -0.351571f, + -0.240650f, -0.971692f, 2.744843f, 1.116991f, 0.139062f, -0.165332f, + 0.262171f, -1.598153f, -1.427340f, -1.602306f, +}; + +static const float av1_tx_split_nn_weights_4x8_layer1[16] = { + -0.367134f, 1.373058f, -0.897039f, -0.326819f, -0.734030f, -0.290413f, + -0.501249f, 0.505321f, -0.537692f, -0.767893f, 0.268697f, 0.278987f, + 0.085082f, 0.614986f, 0.847904f, 0.637578f, +}; + +static const float av1_tx_split_nn_bias_4x8_layer1[1] = { + 0.20586078f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_4x8 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_4x8_layer0, + av1_tx_split_nn_weights_4x8_layer1, + }, + { + av1_tx_split_nn_bias_4x8_layer0, + av1_tx_split_nn_bias_4x8_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x8 block. 
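+// Feature counts differ per split model: the 4x8 model above feeds 8 inputs
+// into 16 hidden nodes, while the 8x8 model below feeds 12 inputs into 12
+// hidden nodes (hence its 144 = 12 * 12 layer0 weights).
+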
+static const float av1_tx_split_nn_weights_8x8_layer0[144] = { + 0.177983f, -0.938386f, -0.074460f, -0.221843f, -0.073182f, -0.295155f, + -0.098202f, -0.279510f, 0.001054f, -0.119319f, -1.835282f, -0.581507f, + -1.222222f, -1.049006f, -0.807508f, -0.454252f, -0.774879f, -0.180607f, + -0.886976f, -0.231971f, -0.824677f, -0.351872f, -1.323819f, 0.235378f, + 0.015331f, -0.341818f, 0.145549f, -0.348362f, 0.147647f, -0.323400f, + 0.047558f, -0.553025f, -0.295485f, -0.330368f, -0.530605f, -0.407516f, + 0.447740f, 0.782381f, -0.179164f, -0.584675f, -0.052645f, 0.038656f, + -0.096783f, 0.038342f, -0.170762f, -0.405844f, -0.552665f, -0.509866f, + 0.757204f, -1.296465f, 0.631015f, 0.009265f, 0.646192f, 0.044523f, + 0.653161f, 0.033820f, 0.849639f, -0.068555f, -1.036085f, -0.511652f, + 0.104693f, -1.458690f, 0.286051f, -0.089800f, 0.381564f, -0.302640f, + 0.304465f, -0.268706f, 0.432603f, -0.117914f, -2.070031f, -0.565696f, + -0.073027f, -1.783570f, -0.318144f, -0.320990f, -0.343966f, -0.140996f, + -0.322977f, -0.232147f, -0.373210f, -0.158266f, -1.922305f, -0.634373f, + 0.101894f, -0.221847f, 0.018412f, -0.423887f, -0.266684f, -0.444930f, + -0.196237f, 0.106638f, -0.065834f, -0.538401f, -0.280772f, -0.620348f, + 1.089957f, -0.799928f, 0.504112f, -0.165763f, 0.578741f, -0.172653f, + 0.547316f, -0.143484f, 0.717220f, -0.297190f, -1.237854f, -0.074819f, + -0.977304f, -0.484092f, -0.646427f, -0.451443f, -0.612126f, -0.224475f, + -0.731608f, -0.257077f, -0.665857f, -0.346742f, -1.216372f, 0.227267f, + 0.231249f, -1.693073f, -0.035899f, 0.380845f, -0.058476f, 0.409405f, + -0.066679f, 0.406731f, -0.068501f, 0.396748f, 0.639462f, 0.150834f, + -0.418659f, -1.421931f, 0.101889f, 0.083573f, 0.129746f, 0.134460f, + 0.081185f, 0.127420f, 0.083664f, 0.051096f, 1.361688f, 0.386093f, +}; + +static const float av1_tx_split_nn_bias_8x8_layer0[12] = { + 4.280443f, 2.218902f, -0.256953f, 3.161431f, 2.082548f, 2.506052f, + 2.563224f, 1.421976f, -1.627813f, -1.436085f, 2.297265f, 1.500469f, +}; + +static const float av1_tx_split_nn_weights_8x8_layer1[12] = { + 1.178833f, -0.428527f, -0.078737f, 0.381434f, -0.466895f, -0.901745f, + -0.766968f, -0.356663f, 0.450146f, 0.509370f, -0.356604f, -0.443506f, +}; + +static const float av1_tx_split_nn_bias_8x8_layer1[1] = { + -0.156294f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x8 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 12, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x8_layer0, + av1_tx_split_nn_weights_8x8_layer1, + }, + { + av1_tx_split_nn_bias_8x8_layer0, + av1_tx_split_nn_bias_8x8_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x16 block. 
+static const float av1_tx_split_nn_weights_8x16_layer0[8 * 64] = { + 0.374660f, 0.218905f, -0.139779f, 0.212141f, 0.056517f, 0.051114f, + 0.042860f, -0.273258f, -0.340809f, 0.138983f, -0.216996f, -0.241519f, + -0.123244f, 0.078577f, -0.472273f, -0.194201f, 0.125056f, 0.239761f, + -0.332782f, 0.174782f, -0.211400f, -0.129795f, 0.062195f, 0.113176f, + -0.008869f, 0.140764f, 0.059833f, 0.163826f, 0.359293f, -0.109797f, + -0.022091f, -0.059536f, -0.188226f, 0.179709f, 0.031386f, 0.164790f, + 0.214364f, 0.198555f, 0.152262f, -0.242980f, 0.319367f, -0.136902f, + 0.046524f, -0.043591f, 0.342178f, -0.011757f, -0.014286f, 0.072871f, + -0.278314f, -0.345303f, -0.252103f, -0.107154f, -0.235101f, -0.106739f, + -0.120865f, -0.160042f, 0.240028f, 0.112902f, -0.141587f, -0.703012f, + -0.136591f, 0.318993f, -0.154417f, -0.054668f, 0.192870f, 0.176166f, + -0.029965f, 0.266942f, -0.178384f, 0.038680f, 0.134403f, -0.002426f, + 0.534825f, -0.070923f, 0.413281f, 0.418148f, 0.093729f, 0.016454f, + 0.305358f, -0.040512f, 0.069904f, -0.227588f, -0.362220f, -0.031604f, + -0.394901f, 0.071506f, -0.342833f, -0.142550f, -0.164005f, 0.182600f, + 0.213062f, 0.076805f, 0.278758f, 0.125613f, -0.035552f, 0.040971f, + 0.182785f, -0.227961f, -0.105413f, -0.074949f, -0.084629f, -0.254767f, + 0.114657f, 0.047121f, 0.195902f, 0.264759f, 0.017799f, 0.210230f, + 0.150749f, -0.142142f, 0.182494f, -0.142415f, -0.259782f, -0.114830f, + -0.198826f, 0.000061f, -0.375668f, -0.276656f, -0.373202f, 0.210298f, + 0.422680f, 0.066960f, 0.351106f, -0.209034f, 0.367195f, -0.110274f, + 0.115573f, -0.066642f, -0.389673f, -0.260447f, 0.056949f, -0.180425f, + 0.069922f, -0.153506f, -0.097053f, -0.111757f, 0.094069f, 0.144837f, + -0.052984f, -0.506681f, -0.034474f, 0.279057f, -0.105025f, 0.006656f, + -0.125017f, -0.114096f, 0.103153f, -0.117402f, -0.359472f, 0.072534f, + 0.110291f, 0.003088f, -0.456897f, 0.038331f, -0.322298f, 0.113942f, + -0.119916f, -0.194392f, 0.093167f, 0.193459f, 0.074671f, 0.033602f, + 0.004440f, -0.179578f, -0.036637f, -0.216172f, -0.296530f, -0.318992f, + 0.319160f, -0.066218f, 0.291246f, 0.181292f, 0.089914f, 0.025273f, + 0.303128f, 0.019063f, 0.078545f, -0.396919f, 0.014065f, -0.122121f, + 0.037107f, -0.151886f, -0.299392f, -0.172207f, -0.124571f, -0.232553f, + 0.102970f, -0.225040f, 0.061059f, -0.258188f, -0.469871f, -0.099607f, + -0.061524f, -0.213700f, 0.070237f, -0.289134f, -0.238225f, 0.256403f, + -0.119344f, 0.067782f, -0.398983f, -0.123975f, -0.200205f, -0.047038f, + 0.026569f, 0.031037f, 0.094302f, -0.101239f, 0.433307f, -0.303612f, + 0.088537f, -0.164436f, 0.202471f, -0.048592f, -0.251904f, 0.122577f, + -0.309874f, -0.263405f, -0.292503f, 0.216589f, 0.035378f, 0.136599f, + -0.145844f, -0.018211f, 0.174084f, -0.449941f, -0.001428f, 0.064134f, + 0.039652f, 0.111083f, -0.246076f, -0.204733f, 0.056559f, -0.000123f, + 0.104049f, 0.138512f, -0.128309f, 0.087855f, 0.232784f, 0.247138f, + 0.162766f, 0.154829f, 0.313605f, -0.164115f, -0.050844f, 0.156549f, + 0.185279f, -0.238962f, -0.308281f, -0.179592f, -0.193262f, 0.201670f, + -0.203399f, -0.096831f, -0.127867f, 0.310674f, -0.008181f, 0.004078f, + -0.211038f, -0.193480f, -0.185639f, -0.150202f, -0.204858f, -0.240758f, + 0.114268f, -0.032535f, -0.052403f, -0.234333f, -0.064072f, -0.208444f, + -0.352853f, -0.224001f, -0.156330f, 0.215436f, 0.171846f, 0.291849f, + 0.108832f, 0.046991f, -0.127801f, 0.032485f, 0.141493f, 0.123319f, + -0.057250f, 0.315346f, -0.061317f, -0.465086f, -0.130179f, -0.217841f, + -0.239089f, -0.073251f, -0.327718f, 0.054905f, -0.283169f, 
-0.028900f, + 0.071450f, 0.270072f, 0.248891f, 0.088052f, 0.253319f, 0.122808f, + 0.175490f, -0.147805f, 0.089169f, -0.045457f, -0.330788f, 0.099791f, + -0.137376f, -0.195977f, -0.350942f, -0.284930f, -0.559037f, 0.030504f, + 0.162554f, -0.199100f, -0.050453f, -0.131320f, -0.077863f, -0.066253f, + -0.379723f, -0.424047f, -0.081182f, -0.252261f, -0.102815f, 0.058240f, + -0.182036f, 0.176772f, -0.070823f, 0.216054f, -0.211533f, -0.232992f, + 0.279346f, 0.117984f, 0.236674f, 0.126625f, -0.046220f, 0.044919f, + 0.278492f, 0.083944f, 0.180512f, 0.217994f, 0.401170f, -0.064417f, + 0.011636f, -0.139597f, -0.050020f, -0.268438f, -0.032803f, 0.024908f, + -0.085713f, -0.012984f, -0.055192f, -0.338657f, 0.045826f, -0.312849f, + -0.023393f, -0.168800f, -0.030886f, -0.131816f, -0.253542f, -0.104812f, + -0.354389f, 0.169464f, 0.094151f, -0.217122f, -0.456397f, 0.211478f, + 0.219232f, -0.155519f, -0.353700f, -0.264759f, -0.034709f, 0.034409f, + -0.148639f, -0.132850f, -0.216791f, -0.118492f, 0.173721f, -0.144181f, + 0.335028f, 0.176439f, 0.105980f, 0.169390f, 0.155615f, -0.040618f, + -0.176029f, 0.155569f, -0.184833f, -0.171099f, -0.178663f, -0.032051f, + -0.434334f, 0.092238f, -0.263103f, 0.061804f, -0.172957f, 0.005962f, + -0.100176f, 0.125898f, 0.048092f, -0.088141f, 0.247196f, -0.221601f, + -0.114474f, -0.124410f, -0.156393f, -0.181782f, -0.083562f, 0.034937f, + 0.403401f, -0.046200f, 0.322259f, 0.219678f, 0.109850f, 0.051837f, + 0.196861f, -0.019118f, 0.248818f, -0.137567f, 0.127862f, 0.052293f, + 0.298726f, 0.275788f, 0.015344f, 0.058714f, 0.283691f, -0.053794f, + -0.123270f, -0.227761f, -0.141744f, -0.268515f, -0.007189f, -0.242117f, + -0.252396f, -0.069017f, 0.034803f, -0.003388f, -0.262577f, 0.062115f, + -0.298393f, 0.215415f, -0.153615f, 0.289902f, 0.085886f, -0.504290f, + 0.077178f, 0.150861f, -0.228848f, -0.261020f, 0.198204f, 0.162113f, + 0.346418f, -0.286950f, 0.354756f, -0.226419f, 0.024720f, 0.208037f, + 0.107286f, -0.110849f, 0.104415f, -0.207725f, 0.063932f, -0.037748f, + -0.167037f, -0.068282f, 0.320815f, -0.051884f, 0.099989f, -0.078388f, + 0.127071f, 0.046675f, -0.336571f, -0.273080f, 0.264694f, -0.007352f, + -0.093828f, 0.094773f, -0.144434f, 0.091795f, -0.031615f, 0.056914f, + 0.064673f, -0.136669f, 0.344734f, 0.225926f, 0.283451f, -0.068354f, + 0.030572f, 0.180784f, -0.378047f, -0.092962f, -0.083291f, 0.038970f, + 0.052094f, -0.017932f, 0.216302f, -0.184396f, 0.079888f, 0.210406f, + -0.020627f, 0.244744f, 0.336972f, -0.182914f, -0.220976f, -0.304225f, + -0.330974f, -0.370868f, -0.084935f, -0.136489f, -0.210082f, -0.188088f, + -0.408768f, 0.184693f, +}; + +static const float av1_tx_split_nn_bias_8x16_layer0[64] = { + -0.274107f, 0.445751f, 0.234359f, 0.291593f, 0.163298f, 0.183707f, + -0.548839f, -0.190779f, -0.163346f, -0.669028f, 0.399209f, -0.354974f, + 0.000000f, -0.254630f, 0.220149f, 0.371104f, 0.789759f, 0.270300f, + 0.195126f, -0.206958f, 0.917708f, -0.256232f, 1.131933f, 1.178944f, + 0.461270f, 0.246169f, -0.818614f, -0.111986f, 0.759355f, 0.154889f, + 0.470299f, -1.025250f, 0.678678f, 0.959346f, -0.164105f, 0.544079f, + -0.448733f, 0.649221f, -0.536672f, 0.962758f, -0.256427f, 0.808664f, + -0.118694f, 0.684873f, -0.015635f, -0.046469f, 0.075481f, 0.412647f, + 0.454456f, -0.107169f, 0.775235f, -0.261629f, -1.194849f, 0.010093f, + -0.231289f, 0.658286f, -0.769320f, 0.564545f, 0.482962f, -0.131378f, + -0.255844f, -0.078400f, 0.476752f, 0.643001f, +}; + +static const float av1_tx_split_nn_weights_8x16_layer1[64] = { + -0.145065f, -0.145101f, 0.174786f, 0.196692f, 0.102025f, 
-0.087735f, + 0.386353f, -0.660539f, -0.183940f, 0.490045f, -0.276404f, -0.145669f, + 0.209846f, -0.085574f, -0.156821f, -0.377450f, -0.950010f, 0.450709f, + -0.108545f, -0.261181f, 1.435606f, -0.176621f, -1.158548f, 2.035680f, + 0.218069f, -0.138629f, 0.305958f, -0.277194f, -0.602468f, 0.203873f, + 0.120720f, 0.216095f, -0.434502f, -0.579746f, -0.239450f, 0.755529f, + 0.545643f, 0.232091f, 0.330169f, 0.988136f, -0.070465f, -0.345584f, + -0.162455f, -0.617064f, 0.123881f, -0.201098f, 0.222756f, 0.112932f, + 0.048647f, -0.147890f, 0.394584f, -0.262148f, 0.280564f, -0.195432f, + -0.047515f, 1.133410f, 0.255415f, -0.299032f, -0.397807f, -0.153246f, + -0.256734f, 0.177370f, 0.213522f, -0.530158f, +}; + +static const float av1_tx_split_nn_bias_8x16_layer1[1] = { + 0.14910713f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x16 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 64, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x16_layer0, + av1_tx_split_nn_weights_8x16_layer1, + }, + { + av1_tx_split_nn_bias_8x16_layer0, + av1_tx_split_nn_bias_8x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 16x16 block. +static const float av1_tx_split_nn_weights_16x16_layer0[12 * 24] = { + -0.177215f, -0.297166f, 0.299924f, 0.207878f, 0.216871f, 0.173264f, + 0.295464f, 0.048395f, 0.154731f, 0.305880f, 0.056787f, -0.166617f, + 0.115653f, -0.529477f, -0.073995f, -0.211746f, -0.018169f, 0.000788f, + -0.024940f, -0.007055f, 0.001392f, 0.021678f, -1.594600f, -0.099593f, + 0.332930f, 0.103574f, 0.158249f, 0.182601f, 0.332665f, 0.226207f, + -0.139566f, 0.185531f, 0.099074f, -0.185654f, -0.203121f, -0.285678f, + -0.313453f, -0.294452f, -0.143707f, -0.031265f, -0.453030f, -0.061874f, + -0.066150f, -0.099058f, -0.458879f, 0.127544f, 0.338314f, -0.161350f, + 0.030091f, -0.075528f, 0.004320f, 0.353690f, -0.013480f, -0.420402f, + -0.004659f, -0.329401f, -0.001745f, 0.227384f, -0.055183f, 0.121405f, + 0.160340f, 0.143603f, -0.221813f, 0.079107f, -0.657639f, -0.084348f, + -0.303414f, 0.046774f, -0.367679f, 0.060005f, 0.168645f, 0.084421f, + -0.133625f, 0.301375f, 0.079412f, -0.419303f, 0.017235f, 0.068637f, + 0.018384f, -0.428325f, -0.019753f, 0.149444f, -0.474836f, -0.287162f, + 0.198083f, 0.028292f, -0.299092f, -0.005849f, -0.256245f, 0.233277f, + -0.217561f, -0.264003f, 0.269411f, 0.207032f, -0.339411f, -0.198431f, + -0.028521f, 0.158076f, 0.177116f, 0.345702f, -0.145132f, 0.064623f, + -0.090867f, 0.288816f, -0.263198f, -0.071028f, -0.044546f, 0.380017f, + -0.014100f, -0.271192f, -0.318559f, 0.129015f, -0.050314f, -0.093355f, + -0.578498f, 0.099090f, -0.133080f, -0.029975f, -0.059828f, -0.157765f, + -0.321153f, -0.343671f, -0.242959f, 0.128304f, 0.017170f, 0.072787f, + -0.475838f, -0.003806f, -0.068615f, 0.150556f, -0.159903f, -0.416513f, + 0.218794f, -0.290456f, -0.084569f, -0.170014f, -0.044414f, -0.153069f, + -0.077329f, -0.089747f, -0.096526f, 0.537952f, 0.134725f, -0.006469f, + -0.323335f, -0.168183f, -0.107163f, -0.139954f, 0.011286f, -0.021712f, + -0.513992f, 0.259135f, -0.319808f, 0.077811f, 0.104613f, 0.370571f, + 0.185244f, 0.065530f, -0.091098f, -0.573741f, 0.111934f, 0.437417f, + -0.123691f, 0.220641f, -0.024783f, -0.149460f, -0.354185f, -0.134127f, + 0.038015f, -0.380596f, 0.250980f, 0.142208f, 0.135170f, -0.131129f, + -0.357556f, -0.530945f, 0.159672f, -0.147025f, -0.377829f, -0.504508f, + -0.492870f, 0.020753f, 0.142818f, 0.025172f, 0.086140f, 0.091283f, + 0.087491f, -0.186415f, 0.177785f, 
-0.195121f, -1.191148f, -0.477102f, + 0.023371f, 0.227004f, -0.023502f, -0.242913f, -0.074398f, -0.153480f, + 0.162900f, 0.415509f, -0.162565f, -0.131709f, -0.258852f, -0.252027f, + -0.080845f, -0.330274f, 0.021874f, 0.232398f, 0.069277f, 0.220567f, + -0.024237f, -0.366771f, 0.081673f, -0.429906f, -0.302170f, 0.061045f, + 0.352777f, -0.230376f, 0.408153f, 0.064758f, 0.142051f, 0.007219f, + 0.622878f, 0.212577f, 0.036489f, 0.081150f, -0.284767f, 0.107763f, + -0.529786f, -0.072190f, -0.300421f, -0.287959f, -0.568900f, 0.011547f, + -0.131696f, -0.356854f, -0.587962f, -0.026598f, 0.405829f, 0.057565f, + 0.414265f, -0.159155f, 0.221456f, 0.146314f, 0.265776f, -0.006516f, + 0.473978f, -0.186431f, 0.288672f, -0.060437f, 0.083380f, -0.205641f, + 0.360016f, 0.222041f, 0.420011f, 0.024579f, 0.377546f, 0.250380f, + -0.069900f, 0.296743f, 0.073532f, -0.243225f, -0.374987f, -0.387288f, + -0.237255f, -0.287013f, 0.417831f, -0.252988f, -0.257652f, -0.066775f, + -0.253926f, 0.057841f, 0.346133f, -0.157797f, -0.406028f, -0.286893f, + 0.274507f, -0.452561f, 0.143381f, -0.097755f, 0.021242f, 0.034561f, + 0.044115f, 0.004065f, 0.066729f, 0.043558f, 0.102991f, -0.477574f, +}; + +static const float av1_tx_split_nn_bias_16x16_layer0[24] = { + -0.479033f, 1.467402f, -0.366291f, 0.372511f, 0.715322f, -0.605500f, + 0.176848f, 0.032318f, 0.237429f, -0.046047f, 0.452082f, 0.451805f, + -0.822845f, 0.636762f, -0.057350f, 1.163978f, 0.728287f, 0.603654f, + -0.245519f, -0.893569f, -1.428185f, 0.808870f, -0.076159f, 1.231976f, +}; + +static const float av1_tx_split_nn_weights_16x16_layer1[24] = { + -0.176161f, 1.670188f, -0.180755f, -0.321326f, 0.249728f, -0.170504f, + -0.538432f, 0.033893f, 0.149842f, 0.404140f, -0.377812f, 0.338838f, + -0.176091f, 0.249844f, -0.362533f, 1.412460f, 0.196862f, 0.278194f, + -0.140444f, 0.297746f, 0.172533f, 0.116470f, -0.151656f, -0.603250f, +}; + +static const float av1_tx_split_nn_bias_16x16_layer1[1] = { + 0.184803f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_16x16 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 24, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_16x16_layer0, + av1_tx_split_nn_weights_16x16_layer1, + }, + { + av1_tx_split_nn_bias_16x16_layer0, + av1_tx_split_nn_bias_16x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 32x32 block. 
+static const float av1_tx_split_nn_weights_32x32_layer0[12 * 32] = { + -0.439303f, 0.004813f, -0.365052f, -0.116868f, -0.356716f, -0.196537f, + -0.196770f, -0.076096f, 0.357004f, -0.044909f, -0.112910f, -0.129081f, + 0.156725f, -0.386346f, 0.038971f, 0.160696f, 0.204923f, -0.384333f, + -0.319546f, 0.028179f, -0.250524f, -0.289669f, -0.284138f, -0.258963f, + -0.180854f, -0.000807f, -0.029620f, -0.353134f, 0.212408f, 0.141414f, + 0.303016f, 0.098066f, 0.482455f, 0.036069f, -0.166279f, 0.210119f, + -0.086337f, -0.023550f, -0.250796f, -0.183945f, -0.393856f, 0.170608f, + -0.306403f, 0.026318f, -0.277296f, 0.092684f, -0.033584f, -0.018371f, + -0.025043f, -0.257659f, -0.139163f, -0.206949f, -0.190105f, 0.028053f, + 0.361851f, -0.364726f, -0.096771f, -0.184166f, -0.433228f, -0.182191f, + -0.097051f, 0.259172f, 0.016432f, 0.259358f, 0.145059f, 0.037196f, + 0.091581f, -0.219644f, 0.140384f, -0.446837f, -0.234531f, 0.149508f, + -0.083429f, 0.186189f, -0.099890f, -0.111277f, 0.495214f, 0.085053f, + -0.266613f, -0.051366f, 0.148593f, 0.111875f, 0.077787f, -0.371653f, + -0.146157f, -0.229235f, 0.076203f, 0.488975f, 0.096771f, -0.009483f, + 0.192985f, 0.246273f, -0.192671f, -0.557890f, -0.292650f, -0.088907f, + -0.106892f, -0.329659f, 0.012105f, -0.359326f, 0.170723f, -0.004357f, + 0.171593f, -0.478768f, -0.236016f, -0.035077f, 0.133731f, 0.137962f, + -0.397926f, -0.155164f, -0.276709f, -0.186602f, -0.258301f, 0.036965f, + -0.649359f, 0.127605f, 0.097930f, 0.182775f, -0.313324f, 0.053349f, + 0.204203f, -0.222948f, -0.059008f, -0.049759f, -0.056848f, 0.087497f, + -0.039987f, -0.055042f, -0.041623f, -0.078424f, -0.317291f, -0.191398f, + 0.632147f, 0.221825f, 0.268394f, -0.096357f, 0.442545f, -0.007117f, + -0.036125f, 0.000525f, 0.088092f, -0.203653f, 0.086925f, 0.439141f, + 0.329889f, -0.370050f, -0.194306f, -0.207430f, 0.132779f, -0.217614f, + -0.039444f, -0.053019f, -0.260725f, -0.116563f, -0.271048f, 0.283737f, + -0.007300f, 0.062257f, -0.347865f, -0.296767f, -0.359123f, 0.230459f, + -0.189117f, -0.087622f, -0.561091f, 0.184182f, -0.044980f, 0.012643f, + 0.241672f, 0.050272f, -0.204851f, -0.159285f, -0.064081f, -0.118666f, + -0.269471f, 0.231668f, 0.135749f, -0.131162f, 0.062760f, 0.100949f, + 0.074967f, -0.056918f, 0.251707f, 0.034098f, 0.341290f, -0.105027f, + 0.313246f, -0.092679f, -0.014632f, -0.390967f, 0.136881f, -0.241554f, + 0.097674f, 0.110832f, -0.390245f, 0.017654f, -0.506222f, 0.065252f, + 0.244834f, -0.171352f, -0.331702f, 0.111043f, 0.125217f, -0.058116f, + -0.382595f, -0.052545f, 0.114261f, -0.493617f, 0.243984f, -0.171053f, + 0.165009f, -0.063020f, 0.096502f, 0.341339f, -0.013443f, 0.056372f, + 0.339284f, 0.398376f, 0.389409f, 0.257252f, 0.517368f, 0.078856f, + 0.087716f, -0.171092f, 0.227461f, 0.125307f, -0.054423f, -0.143161f, + 0.224041f, -0.086477f, -0.092548f, 0.072392f, -0.061608f, 0.258347f, + 0.147033f, -0.478244f, -0.204869f, 0.038552f, -0.144563f, 0.224087f, + -0.296705f, 0.153889f, -0.064624f, 0.085265f, -0.103826f, 0.127971f, + 0.019965f, 0.111937f, -0.074187f, -0.029518f, -0.127305f, -0.012210f, + 0.042714f, 0.070052f, -0.202360f, 0.348144f, -0.132097f, -0.209585f, + -0.248286f, -0.065774f, -0.089482f, -0.133226f, 0.325430f, -0.013468f, + -0.406090f, -0.144936f, 0.208620f, 0.343445f, -0.059639f, 0.114857f, + -0.069431f, -0.218725f, 0.190575f, -0.368101f, 0.030030f, 0.062815f, + -0.239369f, -0.537852f, 0.022487f, 0.023038f, 0.190788f, 0.040123f, + -0.004304f, 0.060749f, -0.108929f, 0.136796f, -0.542875f, -0.227074f, + -0.182244f, 0.082559f, 0.019149f, 0.178854f, 0.120284f, 
0.009070f, + 0.068268f, -0.544822f, 0.120536f, 0.354028f, -0.119890f, -0.122055f, + -0.405335f, 0.122341f, -0.304412f, 0.062405f, -0.302568f, -0.276505f, + -0.120915f, -0.221841f, 0.282007f, -0.253971f, 0.059517f, -0.144976f, + 0.149391f, -0.047355f, -0.167742f, -0.392333f, -0.041132f, 0.342135f, + 0.017485f, 0.021038f, -0.023728f, -0.192181f, -0.103996f, 0.092873f, + -0.114365f, -0.397732f, -0.065421f, 0.053084f, 0.035201f, 0.053019f, + -0.105377f, -0.039500f, 0.131904f, -0.123911f, -0.390328f, -0.125198f, + -0.000126f, 0.014864f, -0.220187f, 0.084056f, -0.492155f, -0.164979f, + 0.133592f, 0.121519f, -0.240813f, 0.186680f, 0.118673f, 0.235006f, + -0.239894f, -0.185759f, -0.336992f, 0.209620f, -0.298845f, 0.127803f, + -0.083992f, 0.194340f, -0.245378f, 0.212308f, 0.142512f, -0.163324f, + 0.383495f, 0.291065f, 0.286620f, -0.239957f, 0.225127f, -0.174424f, + 0.297231f, -0.045434f, 0.156444f, -0.184273f, -0.204567f, 0.202551f, + 0.370019f, -0.073910f, 0.344897f, 0.063100f, 0.338547f, -0.099145f, + 0.391863f, -0.214244f, -0.241734f, -0.281851f, -0.035133f, -0.153157f, +}; + +static const float av1_tx_split_nn_bias_32x32_layer0[32] = { + 0.143343f, -0.021982f, -0.314939f, 0.170867f, -0.081248f, 0.125758f, + -0.355762f, 0.279798f, 1.027712f, -0.434660f, 1.072005f, 0.668893f, + -0.031216f, -0.528650f, 0.328349f, 0.543645f, -0.188810f, 0.221110f, + -1.638637f, 0.058045f, -1.731105f, -0.444284f, 0.513693f, 0.890025f, + 0.160288f, 0.393312f, 0.332856f, -0.080767f, 0.299822f, 0.235876f, + 0.254942f, -0.017796f, +}; + +static const float av1_tx_split_nn_weights_32x32_layer1[32] = { + -0.090326f, -0.267553f, -0.026071f, 0.100912f, 0.279137f, 0.079064f, + -0.074885f, 0.053804f, 0.736810f, -0.031693f, -0.970514f, 0.174069f, + 0.095940f, -0.065047f, 0.052911f, 0.176728f, -0.058274f, 0.148364f, + -0.162210f, 0.093875f, -0.367663f, 0.020876f, 0.137280f, -1.099116f, + 0.146854f, 0.075590f, 0.228534f, 0.141993f, 0.072143f, 0.101421f, + -0.068547f, -0.154148f, +}; + +static const float av1_tx_split_nn_bias_32x32_layer1[1] = { + 0.316622f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_32x32 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_32x32_layer0, + av1_tx_split_nn_weights_32x32_layer1, + }, + { + av1_tx_split_nn_bias_32x32_layer0, + av1_tx_split_nn_bias_32x32_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 64x64 block. 
+static const float av1_tx_split_nn_weights_64x64_layer0[12 * 32] = { + -0.006828f, 0.149944f, -0.017614f, -0.044599f, -0.024517f, 0.507698f, + 0.001039f, 0.037164f, 0.015091f, -0.306620f, -0.162047f, -0.369440f, + 0.396310f, 0.087121f, 0.208609f, -0.083068f, 0.493774f, 0.217682f, + 0.377393f, 0.172879f, 0.397422f, 0.078919f, 0.741350f, 0.064169f, + -0.099989f, -0.192983f, -0.278230f, -0.310048f, -0.439965f, -0.226698f, + -0.436596f, -0.007551f, -0.396721f, 0.153570f, -0.190838f, -0.071869f, + 0.048799f, -0.301301f, -0.005015f, 0.500480f, -0.030622f, -0.559095f, + -0.032634f, -0.054160f, -0.056979f, -0.456545f, 0.306536f, -0.411323f, + -0.005366f, -0.069496f, 0.019990f, 0.327931f, -0.002516f, 0.393190f, + 0.001759f, 0.035093f, -0.030302f, -0.528984f, 0.174781f, 0.241462f, + -0.415427f, -0.164502f, 0.143065f, -0.122595f, 0.082049f, -0.143346f, + 0.055642f, -0.124701f, 0.004050f, -0.216235f, -2.681730f, 0.101658f, + 0.381239f, 0.465936f, 0.331154f, 0.301708f, -0.360171f, 0.054886f, + -0.118658f, 0.287921f, 0.277859f, 0.203784f, 0.247809f, 0.656924f, + -0.354628f, 0.315081f, 0.105108f, -0.510179f, 0.059267f, 0.061386f, + 0.076423f, 0.347119f, 0.100134f, 0.028402f, -0.118621f, -0.238689f, + 0.080141f, -0.138863f, 0.009009f, -0.100526f, -0.138875f, 0.066992f, + 0.005949f, 0.564336f, 0.046994f, 0.004655f, 0.366047f, 0.014695f, + -0.146928f, -0.024665f, -0.440357f, -0.109395f, 0.527231f, -0.020925f, + -0.227236f, -0.068141f, 0.282009f, 0.040192f, -0.267100f, 0.229228f, + 0.133861f, 0.338706f, -0.030178f, -0.040919f, -0.026343f, -0.330338f, + -0.066931f, -0.110580f, -0.072056f, 0.599457f, -0.020738f, 0.169200f, + 0.836240f, -0.157548f, 0.386273f, 0.002404f, 0.329410f, -0.007020f, + 0.351705f, -0.041259f, 0.388861f, 0.003899f, 0.582627f, 0.023572f, + 0.409912f, -0.158472f, 0.536383f, 0.525093f, 0.604247f, 0.439159f, + 0.692832f, 0.046272f, 0.590367f, -0.082166f, 0.262357f, 0.478671f, + 0.031935f, 0.042675f, 0.120002f, 0.398616f, -0.078967f, 0.227986f, + -0.044679f, 0.151061f, -0.085564f, 0.220205f, -0.265606f, -0.203623f, + 0.204719f, -0.125922f, 0.038544f, -0.269379f, 0.025866f, 0.109967f, + 0.019064f, -0.237297f, -0.309746f, -0.329118f, -0.278368f, -0.063859f, + 0.278496f, 0.018620f, 0.209971f, 0.296250f, 0.142850f, 0.288689f, + 0.137084f, 0.130517f, 0.128171f, -0.155396f, -0.008449f, -0.099845f, + 0.173455f, -0.059909f, -0.147318f, 0.102851f, -0.251389f, -0.001448f, + 0.103907f, 0.297273f, -0.027846f, 0.028260f, -0.382601f, 0.346695f, + -0.601641f, 0.162366f, -0.477495f, -0.042731f, -0.387871f, -0.051791f, + -0.401498f, -0.048446f, -0.456270f, -0.062287f, 0.493919f, 0.003008f, + 0.099917f, -0.358525f, -0.094903f, -0.022811f, -0.062259f, 0.019455f, + -0.050644f, 0.020041f, -0.132912f, -0.061578f, -3.083691f, -0.014961f, + -0.129115f, -0.710559f, 0.157213f, -0.844037f, -0.121991f, -0.943386f, + -0.231269f, -0.003462f, 0.331478f, -0.132703f, -1.285993f, -0.120957f, + -0.373755f, -0.322609f, 0.309059f, -0.131523f, -0.118334f, -0.063805f, + -0.104251f, 0.012166f, -0.094699f, -0.283753f, 0.128168f, -0.526929f, + -0.050331f, 0.186153f, 0.005913f, -0.221236f, 0.036363f, 0.160909f, + -0.001342f, -0.382749f, 0.037820f, 0.281689f, -0.024275f, 0.028854f, + 0.318291f, 0.318526f, 0.035778f, 0.034031f, 0.189663f, -0.293367f, + 0.082022f, 0.127923f, 0.078866f, -0.081361f, -0.268117f, 0.246675f, + 0.248605f, -0.215479f, -0.073084f, 0.496140f, -0.067327f, 0.396237f, + -0.120739f, 0.033752f, -0.044120f, -0.218941f, -0.028078f, 0.195132f, + -0.040400f, 0.281604f, -0.100471f, 0.415207f, -0.258503f, -0.429749f, + 
0.150569f, -0.010859f, 0.136448f, 0.026589f, 0.148466f, 0.110764f, + 0.380967f, 0.009177f, 0.103075f, 0.116417f, 0.226273f, -0.327746f, + 0.169346f, 0.284553f, -0.094986f, 0.312745f, -0.147840f, 0.025062f, + -0.494482f, 0.112388f, -0.213962f, 0.107050f, -0.433371f, -0.096276f, + -0.244835f, -0.003518f, -0.459148f, -0.145080f, 0.017150f, 0.042846f, + -0.237479f, 0.104746f, 0.158677f, 0.358937f, 0.099921f, 0.277109f, + 0.012410f, -0.062897f, 0.116130f, 0.255309f, 0.341628f, 0.145002f, + -0.429344f, -0.016433f, -0.068985f, 0.285194f, -0.286719f, -0.018298f, + -0.179369f, -0.194655f, -0.165380f, 0.026071f, -0.428268f, -0.379929f, + -0.727543f, 0.179610f, -0.963979f, -0.042026f, -0.616202f, 0.133401f, + -0.784966f, 0.061205f, -0.713357f, 0.129795f, 0.120512f, -0.339545f, + 0.353557f, 0.114906f, -0.329813f, -0.209987f, 0.085410f, 0.214313f, + -0.122082f, 0.335770f, -0.020937f, 0.202456f, 0.289023f, -0.421186f, + 0.337905f, 0.407663f, 0.132771f, 0.071734f, 0.213914f, 0.128595f, + 0.302659f, -0.209501f, 0.217756f, 0.253079f, -0.089505f, -0.205614f, +}; + +static const float av1_tx_split_nn_bias_64x64_layer0[32] = { + 0.296914f, -1.826816f, 0.346130f, 0.969520f, -0.528154f, 1.175862f, + -0.075985f, -0.097323f, -0.233059f, 0.004846f, 0.401279f, -2.272435f, + 0.086257f, 0.414162f, -0.194786f, -0.233887f, -0.113215f, -2.453546f, + 0.861214f, 0.298361f, 0.267397f, -0.158557f, -0.119911f, -0.098134f, + -0.339263f, 0.385871f, -0.678123f, 0.263218f, 0.251611f, -1.155773f, + -0.365437f, 0.229255f, +}; + +static const float av1_tx_split_nn_weights_64x64_layer1[32] = { + 0.502104f, -0.708023f, 0.419648f, 1.583418f, 0.419355f, -1.462981f, + -0.439623f, 0.405691f, 0.823257f, 0.061654f, 0.750875f, 0.775031f, + -0.387909f, 0.447385f, 0.284690f, 0.353262f, -0.224347f, 0.832864f, + -1.708491f, -1.042447f, -0.272829f, 0.540640f, 0.310509f, 0.723745f, + 0.245592f, -0.218417f, -0.597987f, -0.362301f, 0.702217f, -0.692614f, + 0.207812f, 0.513560f, +}; + +static const float av1_tx_split_nn_bias_64x64_layer1[1] = { -0.2307045f }; + +static const NN_CONFIG av1_tx_split_nnconfig_64x64 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_64x64_layer0, + av1_tx_split_nn_weights_64x64_layer1, + }, + { + av1_tx_split_nn_bias_64x64_layer0, + av1_tx_split_nn_bias_64x64_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 4x16 block. 
+static const float av1_tx_split_nn_weights_4x16_layer0[8 * 16] = { + -1.344184f, -1.454625f, -0.703110f, -0.140570f, -0.841536f, -0.068131f, + -2.128968f, -0.655518f, 0.432180f, 0.879752f, -0.222211f, 0.061615f, + -0.230969f, 0.569496f, 1.424188f, 0.598063f, -0.436005f, -0.737606f, + -0.137875f, -0.085730f, -0.076512f, -0.583101f, -0.937377f, -0.203556f, + -0.215797f, -0.015361f, -0.124098f, -0.411917f, 0.340441f, -0.331752f, + -0.472607f, -0.097714f, -0.930572f, -1.354713f, -0.550724f, 0.176212f, + -0.636060f, 0.183271f, -0.610212f, 0.345895f, -1.100906f, -1.605713f, + 0.111888f, -0.140937f, 0.063013f, -0.013315f, -0.273472f, -0.255870f, + 1.200328f, 0.274002f, 1.005776f, 0.322392f, 1.222373f, 0.158227f, + 0.408810f, 0.145022f, 0.139842f, -1.249412f, 0.286672f, -0.635699f, + 0.312562f, -0.495606f, -1.117034f, -0.085107f, -0.097484f, -0.341521f, + -0.132199f, -0.863055f, 0.217579f, -1.161425f, -0.302087f, -1.357271f, + -0.520724f, -1.211069f, -1.048729f, -0.333087f, -1.171527f, -0.280824f, + -2.057684f, -0.228755f, 0.606278f, 0.101198f, -0.314847f, -1.303255f, + -0.294964f, 1.301923f, 0.041712f, 0.077593f, -1.152746f, 0.495315f, + -0.751566f, 0.230249f, -0.840661f, 0.100731f, 1.346269f, 0.649898f, + -1.432258f, -0.456710f, -1.018123f, -0.348559f, -1.225226f, -0.170717f, + -0.354072f, 0.068292f, -0.234168f, 0.277503f, 0.179134f, 0.907420f, + 0.354626f, -0.627210f, 0.905779f, 0.512612f, 0.161190f, -0.843177f, + 0.014953f, -0.354983f, 0.011116f, -0.429598f, -1.017138f, -0.211432f, + 0.941840f, -0.281747f, 0.957776f, -0.541914f, 1.041880f, -0.433580f, + -1.416451f, -0.166467f, +}; + +static const float av1_tx_split_nn_bias_4x16_layer0[16] = { + 3.086118f, -3.235095f, 4.830956f, -0.165706f, 0.955031f, 4.055783f, + -0.311489f, 4.660205f, -0.576277f, -0.248111f, -0.790519f, -1.686412f, + -1.191704f, -3.800073f, 4.121552f, -1.399397f, +}; + +static const float av1_tx_split_nn_weights_4x16_layer1[16] = { + -0.758677f, 0.388776f, 0.439906f, 0.011390f, -0.084319f, -0.667969f, + -0.467316f, -0.875491f, -0.160668f, 0.805292f, 0.114393f, -0.549682f, + 0.462109f, 0.343315f, 1.092593f, 0.483152f, +}; + +static const float av1_tx_split_nn_bias_4x16_layer1[1] = { + 0.8205083f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_4x16 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_4x16_layer0, + av1_tx_split_nn_weights_4x16_layer1, + }, + { + av1_tx_split_nn_bias_4x16_layer0, + av1_tx_split_nn_bias_4x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 16x32 block. 
+static const float av1_tx_split_nn_weights_16x32_layer0[8 * 32] = { + 0.180713f, 0.033211f, 0.607561f, 0.138642f, 0.637204f, -0.000940f, + 0.012630f, 0.358109f, 0.022238f, 0.190418f, 0.079088f, 0.065925f, + 0.038242f, 0.162380f, -0.122728f, 0.379382f, -0.303283f, -0.327550f, + 0.029120f, -0.284553f, 0.269588f, -0.309805f, -0.241036f, -0.161103f, + -0.304887f, 0.239843f, -0.149146f, 0.311234f, -0.073640f, -0.132718f, + 0.178901f, 0.474712f, 0.020280f, 0.063685f, -0.609170f, -0.013658f, + -0.338074f, 0.250429f, 0.082978f, -0.186315f, -0.788959f, 0.039859f, + -0.426461f, -0.001524f, -0.447211f, 0.378102f, 0.315617f, 0.017428f, + 0.745494f, -0.219024f, 0.512836f, 0.200522f, 0.680449f, 0.313686f, + -0.412569f, -0.132927f, 0.631120f, 0.042735f, 0.336153f, 0.044772f, + 0.432606f, 0.175681f, -0.634411f, -0.073509f, -0.040643f, -0.559260f, + -0.104034f, -0.570495f, -0.247365f, 0.063256f, -0.582021f, -0.492585f, + -0.194955f, -0.207934f, -0.506627f, 0.021743f, -0.416518f, 0.320876f, + 0.115889f, 0.149399f, -0.229376f, 0.095505f, 0.115191f, -0.471921f, + 0.113068f, 0.343684f, -0.036831f, 0.021240f, 0.295112f, 0.031166f, + 0.448201f, -0.132241f, 0.164032f, 0.355572f, 0.072154f, 0.017335f, + -0.046113f, 0.178719f, -0.026881f, -0.242590f, 0.055073f, -0.012958f, + 0.077904f, 0.351356f, 0.107655f, 0.260568f, -0.080052f, -0.197553f, + 0.085763f, 0.263416f, -0.327741f, 0.158855f, 0.056899f, -0.162121f, + 0.339518f, -0.571204f, 0.264966f, -0.252214f, -0.202560f, -0.134213f, + -0.330188f, 0.009470f, -0.468376f, -0.065240f, -0.307957f, 0.116479f, + -0.222238f, -0.458716f, 0.186493f, -0.391415f, 0.118649f, -0.104653f, + -0.259958f, -0.332081f, -0.403785f, -0.050147f, -0.573511f, 0.177117f, + -0.598358f, 0.164947f, -0.119694f, -0.058520f, 0.203829f, -0.267404f, + -0.048202f, -0.600006f, 0.181594f, -0.731805f, 0.146417f, -0.687148f, + -1.210525f, -0.450101f, -0.620635f, 0.208825f, -0.611357f, 0.112202f, + -0.309468f, -0.323545f, 0.357770f, 0.308061f, 0.553199f, 0.049012f, + 0.530093f, -0.208597f, 0.607882f, -0.058120f, -0.527634f, 0.018136f, + 0.060753f, 0.118894f, 0.175649f, 0.014731f, 0.428318f, -0.106465f, + -0.119077f, 0.080179f, 0.524997f, 0.368286f, 0.528286f, 0.213659f, + 0.639286f, 0.195079f, -0.049815f, -0.092008f, -0.302958f, 0.298149f, + -0.173870f, -0.145205f, -0.233589f, -0.303368f, 0.141275f, 0.325622f, + -0.115293f, 0.155188f, 0.047225f, 0.231050f, -0.167447f, 0.349754f, + 0.295544f, -0.319466f, 0.095144f, 0.174612f, -0.194652f, 0.305915f, + -0.239008f, -0.037453f, 0.280696f, 0.125850f, 0.749196f, -0.101919f, + 0.791808f, -0.236811f, 0.064157f, 0.032865f, -0.225911f, 0.350384f, + 0.723183f, -0.103992f, 0.483085f, -0.123992f, 0.602138f, 0.023895f, + -0.692601f, -0.118387f, 0.162527f, 0.145178f, -0.184702f, -0.017753f, + -0.159436f, 0.124105f, -0.131067f, 0.310275f, 0.151499f, 0.138924f, + 0.537459f, 0.263212f, 0.615896f, 0.281255f, 0.021293f, -0.473459f, + 0.210145f, -0.056682f, 0.063658f, 0.377254f, -0.314410f, -0.183487f, + 0.300384f, 0.328471f, 0.164694f, -0.159272f, -0.160942f, -0.502861f, + -0.129147f, 0.045916f, -0.606865f, -0.101378f, +}; + +static const float av1_tx_split_nn_bias_16x32_layer0[32] = { + 0.051664f, -0.212487f, -0.077596f, -0.818467f, 0.638475f, -0.759937f, + 0.157198f, 0.989640f, 1.586035f, 0.431144f, 0.041605f, 0.543085f, + 0.498379f, 0.320504f, 0.134233f, 0.670979f, -0.105562f, -1.574879f, + 1.261812f, -0.287530f, -1.610592f, 0.730899f, -0.894240f, -0.657790f, + 0.270806f, -0.181708f, 0.298578f, 0.817240f, -0.221508f, -0.201771f, + -0.294389f, 1.456413f, +}; + +static const 
float av1_tx_split_nn_weights_16x32_layer1[32] = { + 1.208914f, 0.324728f, 0.383352f, -0.874321f, 0.172565f, -0.580927f, + -0.432927f, 0.433698f, -0.801935f, 0.672028f, 0.563493f, 0.260077f, + -0.200557f, -0.121638f, 0.530735f, -0.525196f, 0.281799f, 0.624204f, + -0.662775f, -0.230887f, 0.980989f, 0.223437f, -0.790591f, 0.600724f, + -0.273445f, 0.427635f, -0.501641f, -0.878390f, 0.234731f, -0.172550f, + 0.418904f, 1.792187f, +}; + +static const float av1_tx_split_nn_bias_16x32_layer1[1] = { + -0.29233751f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_16x32 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_16x32_layer0, + av1_tx_split_nn_weights_16x32_layer1, + }, + { + av1_tx_split_nn_bias_16x32_layer0, + av1_tx_split_nn_bias_16x32_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 32x64 block. +static const float av1_tx_split_nn_weights_32x64_layer0[8 * 32] = { + 0.031614f, -0.110926f, 0.052418f, -0.702506f, 0.045708f, 0.238329f, + -0.021806f, -0.208128f, 0.509745f, -0.293891f, 0.277788f, 0.113937f, + 0.741576f, 0.062848f, 0.351878f, 0.212532f, 0.385842f, 0.081517f, + 0.398502f, -0.015156f, 0.242616f, 0.214619f, -0.182678f, -0.170546f, + 0.110605f, -0.236749f, -0.023831f, -0.285243f, 0.147156f, -0.257639f, + 0.341355f, -0.571641f, -0.721797f, 0.139588f, -0.518494f, -0.206526f, + -0.570560f, -0.184295f, 0.110271f, 0.210292f, -0.109132f, -0.001080f, + 0.129251f, -0.204230f, -0.396312f, -0.183024f, 0.421243f, -0.013154f, + 0.222627f, 0.169826f, 0.226037f, 0.218153f, -0.343528f, 0.274906f, + -0.156632f, 0.250261f, -0.484020f, 0.019909f, -0.349575f, -0.286643f, + -0.507396f, 0.202446f, -0.154110f, -0.292644f, 0.122666f, 0.306963f, + 0.424895f, 0.005579f, 0.494094f, -0.079551f, 0.473740f, 0.352414f, + -0.356917f, 0.264331f, -0.554487f, 0.119978f, 0.012291f, -0.141641f, + -0.254714f, -0.213723f, -0.116701f, -0.011267f, 0.190025f, -0.118501f, + 0.305151f, -0.316782f, -0.220801f, -0.308420f, -0.324285f, 0.421329f, + -0.177066f, -0.055114f, 0.229698f, -0.199523f, 0.054278f, 0.365020f, + -0.060586f, -0.300618f, 0.157563f, -0.064338f, -0.005711f, -0.176991f, + -0.424502f, -0.111914f, 0.092608f, 0.126621f, 0.078547f, 0.148008f, + 0.024221f, 0.124599f, 0.001343f, 0.059402f, 0.453753f, 0.047102f, + 0.242544f, 0.055735f, -0.067451f, -0.170061f, -0.170469f, -0.232173f, + 0.214908f, 0.248889f, 0.544348f, -0.084566f, 0.402478f, 0.298031f, + 0.099038f, -0.238019f, -0.475085f, -0.070042f, -0.754955f, -0.049095f, + -0.783801f, -0.099857f, -0.582008f, -0.055194f, -0.103655f, 0.143689f, + 0.100219f, 0.293934f, 0.099271f, -0.036320f, 0.356626f, -0.261445f, + 0.879544f, 0.000878f, 0.532920f, -0.093918f, 0.508867f, -0.040215f, + -0.789042f, -0.145380f, -0.090040f, -0.066636f, 0.015212f, 0.352989f, + -0.058831f, -0.164588f, 0.039890f, 0.122861f, 0.222508f, 0.061217f, + 0.466487f, 0.022666f, 0.423777f, -0.002200f, -0.656835f, -0.099760f, + -0.520606f, 0.303204f, -0.563620f, -0.160922f, -0.243203f, 0.313354f, + -0.336516f, -0.206764f, -0.236040f, 0.325899f, -0.418748f, 0.163205f, + -0.476242f, -0.121928f, 0.139178f, -0.157193f, -0.531766f, -0.180202f, + -0.485254f, 0.187703f, -0.440072f, 0.137854f, 0.029139f, 0.109530f, + -0.078475f, -0.360618f, -0.334672f, -0.350890f, -0.403976f, 0.180336f, + -0.304542f, 0.005123f, 0.413995f, 0.314639f, 0.342648f, -0.293264f, + 0.358135f, -0.180425f, -0.369530f, -0.048413f, 0.498366f, 0.121875f, + 0.270948f, -0.187966f, 
0.342503f, 0.174420f, -0.352105f, 0.088080f, + 0.008277f, 0.020275f, -0.002381f, 0.504389f, -0.018832f, -0.366047f, + -0.090947f, -0.168150f, 0.016184f, -0.328914f, 0.089579f, -0.017349f, + 0.005844f, -0.005010f, -1.857514f, -0.282426f, 0.010177f, -0.214727f, + -0.182529f, 0.156943f, -0.162032f, -0.472654f, 0.069432f, 0.016901f, + -0.767905f, 0.137129f, -0.411463f, 0.049056f, -0.431657f, -0.037641f, + 0.785500f, 0.046225f, 0.195831f, 0.245204f, 0.368614f, 0.212261f, + 0.440626f, -0.158048f, -0.461031f, -0.146280f, +}; + +static const float av1_tx_split_nn_bias_32x64_layer0[32] = { + 0.490777f, -1.894238f, 0.621333f, -0.076756f, 0.286298f, 0.286375f, + -0.126431f, -0.350034f, -1.017572f, 0.620125f, 0.408128f, 0.238756f, + -0.060728f, 0.210912f, 0.043124f, 0.445649f, 0.907025f, 0.360272f, + 1.083101f, -0.068952f, 1.062348f, 0.396354f, 0.280075f, 0.501732f, + 0.328422f, 0.066241f, 0.474697f, 0.126313f, 0.741206f, 0.314796f, + 0.552712f, 0.299410f, +}; + +static const float av1_tx_split_nn_weights_32x64_layer1[32] = { + 1.033823f, 0.603439f, 0.304591f, -0.279940f, -0.780909f, -0.132801f, + 0.154059f, 0.662014f, -0.718368f, 0.198733f, 0.039766f, -0.208516f, + -0.104909f, -0.394209f, 0.081617f, 0.365041f, -0.874960f, -0.063315f, + -1.189897f, 0.337225f, 0.410893f, 0.307519f, 0.221323f, 0.233895f, + 0.469536f, 0.438557f, 0.280144f, 0.422423f, -1.394513f, 0.781900f, + 0.352981f, 0.111265f, +}; + +static const float av1_tx_split_nn_bias_32x64_layer1[1] = { + -0.18160765f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_32x64 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_32x64_layer0, + av1_tx_split_nn_weights_32x64_layer1, + }, + { + av1_tx_split_nn_bias_32x64_layer0, + av1_tx_split_nn_bias_32x64_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x32 block. 
+static const float av1_tx_split_nn_weights_8x32_layer0[8 * 24] = { + -0.687846f, 0.121404f, -0.372905f, 0.126770f, -0.103298f, -0.101650f, + -0.148490f, -0.271740f, 0.682915f, -0.079765f, 0.634347f, -0.151503f, + 0.287692f, -0.079072f, -0.236948f, 0.065064f, 0.713383f, 0.397123f, + 0.553621f, 0.368529f, 0.767663f, -0.046601f, -0.392402f, -0.294822f, + -0.292325f, -0.010573f, -0.837945f, 0.050113f, -0.811360f, 0.199162f, + 0.150832f, 0.011602f, 0.369694f, -0.225876f, 0.234113f, -0.269808f, + 0.303805f, -0.190281f, -0.451136f, 0.209755f, -0.308894f, 0.326956f, + 0.313591f, 0.089923f, -0.095754f, 0.390981f, 0.467366f, 0.169670f, + 0.853322f, 0.054055f, 0.830319f, -0.121918f, 0.262019f, -0.093526f, + 0.385558f, 0.419174f, 0.040198f, -0.347030f, -0.450492f, -0.106764f, + 0.487502f, -0.204188f, 0.430374f, -0.116388f, 0.236407f, -0.157376f, + 0.732294f, -0.651387f, 0.347446f, 0.342575f, 0.048406f, 0.187657f, + 0.434899f, -0.447782f, 0.032728f, -0.071168f, -0.255327f, 0.104174f, + 0.095689f, -0.431743f, 0.725694f, 0.031797f, 0.523171f, 0.061801f, + 0.469804f, -0.071068f, -0.059024f, -0.211937f, 0.392134f, -0.321490f, + 0.366060f, -0.427798f, 0.166771f, 0.299652f, 0.044660f, 0.205142f, + 0.039133f, -0.051835f, -0.465475f, 0.216976f, -0.341156f, 0.095358f, + 0.230807f, 0.201674f, 0.279266f, -0.713534f, -0.091690f, -0.569708f, + -0.119001f, 0.252160f, -1.544578f, -0.284477f, 0.555348f, 0.226471f, + 0.347690f, 0.034365f, 0.770835f, -0.241859f, -0.130241f, 0.292936f, + 0.396622f, -0.417916f, 0.492224f, 0.125517f, 0.344824f, 0.232172f, + -0.432106f, -0.278745f, 0.035069f, -0.307247f, -0.120760f, 0.170950f, + 0.433601f, 0.044286f, 0.141463f, -0.041382f, 0.529346f, 0.010868f, + -0.323674f, 0.185205f, 0.623459f, 0.232842f, -0.406693f, -0.142944f, + 0.222988f, 0.343634f, 0.065401f, 0.002621f, 0.805335f, -0.426926f, + 0.279181f, 0.131364f, 0.192339f, -0.402391f, 0.544120f, -0.060618f, + 0.467780f, 0.165224f, -0.373131f, 0.002427f, 0.688064f, 0.322317f, + 0.259713f, 0.130583f, 0.185032f, -0.189111f, -0.067821f, 0.010875f, + 0.644724f, -0.179291f, 0.463222f, 0.155230f, 0.721384f, -0.046019f, + 0.438501f, 0.440027f, -0.462090f, -0.002039f, -0.468026f, -0.008890f, + -0.328530f, 0.370102f, 0.482531f, 0.043471f, -0.469732f, -0.532663f, + 0.122081f, -0.379659f, 0.037219f, -0.519913f, -0.128975f, -0.404365f, +}; + +static const float av1_tx_split_nn_bias_8x32_layer0[24] = { + -1.198965f, 0.395204f, -0.408627f, -0.021654f, -0.658355f, 0.154525f, + -0.288354f, 1.207574f, 0.411608f, 0.964678f, -1.176893f, 1.059006f, + -0.472969f, 2.087975f, 1.065536f, 0.595569f, 0.197907f, -0.349938f, + 1.013651f, -0.931093f, -0.973595f, -0.459094f, -1.253062f, 1.624782f, +}; + +static const float av1_tx_split_nn_weights_8x32_layer1[24] = { + 0.815787f, -0.393465f, -0.483427f, -0.565592f, 0.493494f, 0.430229f, + -0.507073f, -0.251379f, -0.353418f, -0.495445f, 0.820029f, 0.649146f, + -0.487383f, 1.844503f, 0.480324f, -0.982705f, -0.501446f, -0.220584f, + 0.334299f, 0.802238f, 0.805838f, -0.487848f, 0.300772f, -1.232857f, +}; + +static const float av1_tx_split_nn_bias_8x32_layer1[1] = { + 0.13435879f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x32 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 24, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x32_layer0, + av1_tx_split_nn_weights_8x32_layer1, + }, + { + av1_tx_split_nn_bias_8x32_layer0, + av1_tx_split_nn_bias_8x32_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 
16x64 block.
+static const float av1_tx_split_nn_weights_16x64_layer0[8 * 16] = {
+  -0.378223f, -0.124216f, -0.514089f, -0.110117f, -0.585801f, -0.094838f,
+  -0.455385f, -0.220254f, -0.504568f, -0.082351f, -0.476420f, -0.253993f,
+  -0.454709f, -0.059461f, 0.210313f, -0.155683f, 0.192968f, -0.127804f,
+  0.471996f, 0.253377f, 0.472625f, 0.485322f, 0.150560f, 0.164868f,
+  -0.475587f, 0.447559f, -0.455759f, -0.306665f, -0.194866f, -0.283716f,
+  -0.243897f, 0.293020f, -0.308298f, -0.191904f, -0.468568f, 0.014053f,
+  -0.618848f, 0.096273f, -0.444586f, 0.347750f, -0.280643f, -0.062872f,
+  0.118661f, 0.540099f, 0.104141f, -0.279300f, -0.098721f, -0.173427f,
+  -0.984558f, -0.424559f, -0.411928f, -0.120875f, -0.488999f, -0.050716f,
+  -0.523103f, 0.093620f, -0.930396f, -0.431997f, -1.163297f, 0.190384f,
+  -0.422581f, -0.005354f, 0.450552f, 0.369210f, 0.562484f, 0.679922f,
+  0.282099f, -0.039075f, 0.404196f, 0.006371f, 0.069679f, -0.196160f,
+  -0.213675f, 0.275187f, -0.104235f, -0.193090f, 0.003116f, -0.252454f,
+  -0.094591f, 0.210439f, -0.137070f, 0.145043f, 0.024558f, 0.121718f,
+  0.010138f, 0.301651f, -0.377990f, 0.444414f, 0.001845f, -0.095334f,
+  0.550259f, 0.087603f, 0.792492f, -0.044584f, 0.641706f, -0.328458f,
+  -0.447791f, 0.135376f, 0.356385f, 0.135748f, 0.310370f, 0.293757f,
+  -0.062000f, -0.056368f, 0.343930f, 0.312039f, 0.370763f, 0.452381f,
+  -0.023630f, -0.185909f, 0.422277f, -0.006306f, 0.045166f, 0.423359f,
+  -0.157735f, -0.084901f, 0.219527f, -0.209510f, 0.575057f, 0.249276f,
+  0.069267f, 0.233898f, -0.229392f, 0.117197f, -0.038551f, 0.293976f,
+  0.101996f, 0.120878f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer0[16] = {
+  1.036995f, 0.160249f, 0.100264f, 0.694881f, 0.694677f, 0.128379f,
+  -0.843405f, -0.405515f, 0.104139f, 0.182980f, -0.025472f, 0.901067f,
+  -0.299866f, -0.103079f, -0.190352f, -0.048121f,
+};
+
+static const float av1_tx_split_nn_weights_16x64_layer1[16] = {
+  -1.778868f, 0.174690f, 0.211991f, 0.712138f, 0.589352f, 0.466652f,
+  1.029146f, -0.490044f, 0.483015f, 0.600215f, -0.577776f, -0.755546f,
+  0.348337f, -0.205082f, 0.347129f, -0.322277f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer1[1] = {
+  0.04230947f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x64 = {
+  8,  // num_inputs
+  1,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      16,
+  },  // num_hidden_nodes
+  {
+      av1_tx_split_nn_weights_16x64_layer0,
+      av1_tx_split_nn_weights_16x64_layer1,
+  },
+  {
+      av1_tx_split_nn_bias_16x64_layer0,
+      av1_tx_split_nn_bias_16x64_layer1,
+  },
+};
+/******************************************************************************/
+
+// Map block size to its corresponding neural net model for tx split
+// prediction.
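+//
+// All of the models above share one shape: a single fully connected hidden
+// layer with a ReLU activation, followed by one linear output node. As a
+// minimal sketch of how such a config is evaluated (the encoder's actual
+// entry point is av1_nn_predict(); `in`, `hidden` and `out` are hypothetical
+// names used only for this illustration):
+//
+//   for (int i = 0; i < num_hidden_nodes; ++i) {
+//     float v = bias_layer0[i];
+//     for (int j = 0; j < num_inputs; ++j)
+//       v += weights_layer0[i * num_inputs + j] * in[j];
+//     hidden[i] = v > 0.0f ? v : 0.0f;  // ReLU on the hidden layer
+//   }
+//   float out = bias_layer1[0];
+//   for (int i = 0; i < num_hidden_nodes; ++i)
+//     out += weights_layer1[i] * hidden[i];
+//
+// The caller thresholds `out` to decide whether to split. Note that transpose
+// pairs (e.g. TX_4X8 and TX_8X4) share one model in the map below.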
+static const NN_CONFIG *av1_tx_split_nnconfig_map[TX_SIZES_ALL] = { + NULL, // TX_4X4, + &av1_tx_split_nnconfig_8x8, // TX_8X8, + &av1_tx_split_nnconfig_16x16, // TX_16X16, + &av1_tx_split_nnconfig_32x32, // TX_32X32, + &av1_tx_split_nnconfig_64x64, // TX_64X64, + &av1_tx_split_nnconfig_4x8, // TX_4X8, + &av1_tx_split_nnconfig_4x8, // TX_8X4, + &av1_tx_split_nnconfig_8x16, // TX_8X16, + &av1_tx_split_nnconfig_8x16, // TX_16X8, + &av1_tx_split_nnconfig_16x32, // TX_16X32, + &av1_tx_split_nnconfig_16x32, // TX_32X16, + &av1_tx_split_nnconfig_32x64, // TX_32X64, + &av1_tx_split_nnconfig_32x64, // TX_64X32, + &av1_tx_split_nnconfig_4x16, // TX_4X16, + &av1_tx_split_nnconfig_4x16, // TX_16X4, + &av1_tx_split_nnconfig_8x32, // TX_8X32, + &av1_tx_split_nnconfig_8x32, // TX_32X8, + &av1_tx_split_nnconfig_16x64, // TX_16X64, + &av1_tx_split_nnconfig_16x64, // TX_64X16, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ diff --git a/libs/libaom/src/av1/encoder/tx_search.c b/libs/libaom/src/av1/encoder/tx_search.c new file mode 100644 index 000000000..65b9a2472 --- /dev/null +++ b/libs/libaom/src/av1/encoder/tx_search.c @@ -0,0 +1,3602 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/cfl.h" +#include "av1/common/reconintra.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/common/idct.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/random.h" +#include "av1/encoder/rdopt_utils.h" +#include "av1/encoder/tx_prune_model_weights.h" +#include "av1/encoder/tx_search.h" + +struct rdcost_block_args { + const AV1_COMP *cpi; + MACROBLOCK *x; + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE]; + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]; + RD_STATS rd_stats; + int64_t current_rd; + int64_t best_rd; + int exit_early; + int incomplete_exit; + int use_fast_coef_costing; + FAST_TX_SEARCH_MODE ftxs_mode; + int skip_trellis; +}; + +typedef struct { + int64_t rd; + int txb_entropy_ctx; + TX_TYPE tx_type; +} TxCandidateInfo; + +typedef struct { + int leaf; + int8_t children[4]; +} RD_RECORD_IDX_NODE; + +// origin_threshold * 128 / 100 +static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = { + { + 64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68, + }, + { + 88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68, + }, + { + 90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74, + }, +}; + +// lookup table for predict_skip_flag +// int max_tx_size = max_txsize_rect_lookup[bsize]; +// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16) +// max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16); +static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = { + TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_8X16, TX_16X8, + TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, + TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16, TX_16X4, + TX_8X8, TX_8X8, TX_16X16, TX_16X16, +}; + +static int 
find_tx_size_rd_info(TXB_RD_RECORD *cur_record, + const uint32_t hash) { + // Linear search through the circular buffer to find matching hash. + for (int i = cur_record->index_start - 1; i >= 0; i--) { + if (cur_record->hash_vals[i] == hash) return i; + } + for (int i = cur_record->num - 1; i >= cur_record->index_start; i--) { + if (cur_record->hash_vals[i] == hash) return i; + } + int index; + // If not found - add new RD info into the buffer and return its index + if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) { + index = (cur_record->index_start + cur_record->num) % + TX_SIZE_RD_RECORD_BUFFER_LEN; + cur_record->num++; + } else { + index = cur_record->index_start; + cur_record->index_start = + (cur_record->index_start + 1) % TX_SIZE_RD_RECORD_BUFFER_LEN; + } + + cur_record->hash_vals[index] = hash; + av1_zero(cur_record->tx_rd_info[index]); + return index; +} + +static const RD_RECORD_IDX_NODE rd_record_tree_8x8[] = { + { 1, { 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_8x16[] = { + { 0, { 1, 2, -1, -1 } }, + { 1, { 0, 0, 0, 0 } }, + { 1, { 0, 0, 0, 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_16x8[] = { + { 0, { 1, 2, -1, -1 } }, + { 1, { 0 } }, + { 1, { 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_16x16[] = { + { 0, { 1, 2, 3, 4 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_1_2[] = { + { 0, { 1, 2, -1, -1 } }, + { 0, { 3, 4, 5, 6 } }, + { 0, { 7, 8, 9, 10 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_2_1[] = { + { 0, { 1, 2, -1, -1 } }, + { 0, { 3, 4, 7, 8 } }, + { 0, { 5, 6, 9, 10 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_sqr[] = { + { 0, { 1, 2, 3, 4 } }, { 0, { 5, 6, 9, 10 } }, { 0, { 7, 8, 11, 12 } }, + { 0, { 13, 14, 17, 18 } }, { 0, { 15, 16, 19, 20 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_64x128[] = { + { 0, { 2, 3, 4, 5 } }, { 0, { 6, 7, 8, 9 } }, + { 0, { 10, 11, 14, 15 } }, { 0, { 12, 13, 16, 17 } }, + { 0, { 18, 19, 22, 23 } }, { 0, { 20, 21, 24, 25 } }, + { 0, { 26, 27, 30, 31 } }, { 0, { 28, 29, 32, 33 } }, + { 0, { 34, 35, 38, 39 } }, { 0, { 36, 37, 40, 41 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_128x64[] = { + { 0, { 2, 3, 6, 7 } }, { 0, { 4, 5, 8, 9 } }, + { 0, { 10, 11, 18, 19 } }, { 0, { 12, 13, 20, 21 } }, + { 0, { 14, 15, 22, 23 } }, { 0, { 16, 17, 24, 25 } }, + { 0, { 26, 27, 34, 35 } }, { 0, { 28, 29, 36, 37 } }, + { 0, { 30, 31, 38, 39 } }, { 0, { 32, 33, 40, 41 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_128x128[] = { + { 0, { 4, 5, 8, 9 } }, { 0, { 6, 7, 10, 11 } }, + { 0, { 12, 13, 16, 17 } }, { 0, { 14, 15, 18, 19 } }, + { 0, { 20, 21, 28, 29 } }, { 0, { 22, 23, 30, 31 } }, + { 0, { 24, 25, 32, 33 } }, { 0, { 26, 27, 34, 35 } }, + { 0, { 36, 37, 44, 45 } }, { 0, { 38, 39, 46, 47 } }, + { 0, { 40, 41, 48, 49 } }, { 0, { 42, 43, 50, 51 } }, + { 0, { 52, 53, 60, 61 } }, { 0, { 54, 55, 62, 63 } }, + { 0, { 56, 57, 64, 65 } }, { 0, { 58, 59, 66, 67 } }, + { 0, { 68, 69, 76, 77 } }, { 0, { 70, 71, 78, 79 } }, + { 0, { 72, 73, 80, 81 } }, { 0, { 74, 75, 82, 83 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_1_4[] = { + { 0, { 1, -1, 2, -1 } }, + { 0, { 3, 4, -1, -1 } }, + { 0, { 5, 6, -1, -1 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_4_1[] = { + { 0, { 1, 2, -1, -1 } }, + { 0, { 3, 4, -1, -1 } }, + { 0, { 5, 6, -1, -1 } }, +}; + +static const RD_RECORD_IDX_NODE *rd_record_tree[BLOCK_SIZES_ALL] = { + NULL, // BLOCK_4X4 + NULL, // BLOCK_4X8 
+ NULL, // BLOCK_8X4 + rd_record_tree_8x8, // BLOCK_8X8 + rd_record_tree_8x16, // BLOCK_8X16 + rd_record_tree_16x8, // BLOCK_16X8 + rd_record_tree_16x16, // BLOCK_16X16 + rd_record_tree_1_2, // BLOCK_16X32 + rd_record_tree_2_1, // BLOCK_32X16 + rd_record_tree_sqr, // BLOCK_32X32 + rd_record_tree_1_2, // BLOCK_32X64 + rd_record_tree_2_1, // BLOCK_64X32 + rd_record_tree_sqr, // BLOCK_64X64 + rd_record_tree_64x128, // BLOCK_64X128 + rd_record_tree_128x64, // BLOCK_128X64 + rd_record_tree_128x128, // BLOCK_128X128 + NULL, // BLOCK_4X16 + NULL, // BLOCK_16X4 + rd_record_tree_1_4, // BLOCK_8X32 + rd_record_tree_4_1, // BLOCK_32X8 + rd_record_tree_1_4, // BLOCK_16X64 + rd_record_tree_4_1, // BLOCK_64X16 +}; + +static const int rd_record_tree_size[BLOCK_SIZES_ALL] = { + 0, // BLOCK_4X4 + 0, // BLOCK_4X8 + 0, // BLOCK_8X4 + sizeof(rd_record_tree_8x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X8 + sizeof(rd_record_tree_8x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X16 + sizeof(rd_record_tree_16x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X8 + sizeof(rd_record_tree_16x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X16 + sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X32 + sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X16 + sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X32 + sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X64 + sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X32 + sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X64 + sizeof(rd_record_tree_64x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X128 + sizeof(rd_record_tree_128x64) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X64 + sizeof(rd_record_tree_128x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X128 + 0, // BLOCK_4X16 + 0, // BLOCK_16X4 + sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X32 + sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X8 + sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X64 + sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X16 +}; + +static INLINE void init_rd_record_tree(TXB_RD_INFO_NODE *tree, + BLOCK_SIZE bsize) { + const RD_RECORD_IDX_NODE *rd_record = rd_record_tree[bsize]; + const int size = rd_record_tree_size[bsize]; + for (int i = 0; i < size; ++i) { + if (rd_record[i].leaf) { + av1_zero(tree[i].children); + } else { + for (int j = 0; j < 4; ++j) { + const int8_t idx = rd_record[i].children[j]; + tree[i].children[j] = idx > 0 ? &tree[idx] : NULL; + } + } + } +} + +// Go through all TX blocks that could be used in TX size search, compute +// residual hash values for them and find matching RD info that stores previous +// RD search results for these TX blocks. The idea is to prevent repeated +// rate/distortion computations that happen because of the combination of +// partition and TX size search. The resulting RD info records are returned in +// the form of a quadtree for easier access in actual TX size search. 
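+// For example, a 32x32 block is hashed once as a single 32x32 transform at
+// depth 0, then as four 16x16 transforms at depth 1 and sixteen 8x8
+// transforms at depth 2; rectangular sub-transforms get dummy (NULL) entries
+// because residue hashing is only done for square TX sizes.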
+static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, + TXB_RD_INFO_NODE *dst_rd_info) { + TXB_RD_RECORD *rd_records_table[4] = { x->txb_rd_record_8X8, + x->txb_rd_record_16X16, + x->txb_rd_record_32X32, + x->txb_rd_record_64X64 }; + const TX_SIZE max_square_tx_size = max_txsize_lookup[bsize]; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + + // Hashing is performed only for square TX sizes larger than TX_4X4 + if (max_square_tx_size < TX_8X8) return 0; + const int diff_stride = bw; + const struct macroblock_plane *const p = &x->plane[0]; + const int16_t *diff = &p->src_diff[0]; + init_rd_record_tree(dst_rd_info, bsize); + // Coordinates of the top-left corner of current block within the superblock + // measured in pixels: + const int mi_row = x->e_mbd.mi_row; + const int mi_col = x->e_mbd.mi_col; + const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2; + const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2; + int cur_rd_info_idx = 0; + int cur_tx_depth = 0; + TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize]; + while (cur_tx_depth <= MAX_VARTX_DEPTH) { + const int cur_tx_bw = tx_size_wide[cur_tx_size]; + const int cur_tx_bh = tx_size_high[cur_tx_size]; + if (cur_tx_bw < 8 || cur_tx_bh < 8) break; + const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size]; + const int tx_size_idx = cur_tx_size - TX_8X8; + for (int row = 0; row < bh; row += cur_tx_bh) { + for (int col = 0; col < bw; col += cur_tx_bw) { + if (cur_tx_bw != cur_tx_bh) { + // Use dummy nodes for all rectangular transforms within the + // TX size search tree. + dst_rd_info[cur_rd_info_idx].rd_info_array = NULL; + } else { + // Get spatial location of this TX block within the superblock + // (measured in cur_tx_bsize units). + const int row_in_sb = (mi_row_in_sb + row) / cur_tx_bh; + const int col_in_sb = (mi_col_in_sb + col) / cur_tx_bw; + + int16_t hash_data[MAX_SB_SQUARE]; + int16_t *cur_hash_row = hash_data; + const int16_t *cur_diff_row = diff + row * diff_stride + col; + for (int i = 0; i < cur_tx_bh; i++) { + memcpy(cur_hash_row, cur_diff_row, sizeof(*hash_data) * cur_tx_bw); + cur_hash_row += cur_tx_bw; + cur_diff_row += diff_stride; + } + const int hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator, + (uint8_t *)hash_data, + 2 * cur_tx_bw * cur_tx_bh); + // Find corresponding RD info based on the hash value. 
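+          // Illustrative arithmetic (assuming 128x128 superblocks, i.e.
+          // MAX_MIB_SIZE == 32): for TX_8X8, tx_size_idx is 0, so each row
+          // of the record table holds MAX_MIB_SIZE >> 1 == 16 entries, one
+          // per 8-pixel column of the superblock.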
+ const int record_idx = + row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb; + TXB_RD_RECORD *records = &rd_records_table[tx_size_idx][record_idx]; + int idx = find_tx_size_rd_info(records, hash); + dst_rd_info[cur_rd_info_idx].rd_info_array = + &records->tx_rd_info[idx]; + } + ++cur_rd_info_idx; + } + } + cur_tx_size = next_tx_size; + ++cur_tx_depth; + } + return 1; +} + +static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) { + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; + const int16_t *diff = x->plane[0].src_diff; + const uint32_t hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator, + (uint8_t *)diff, 2 * rows * cols); + return (hash << 5) + bsize; +} + +static INLINE int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record, + const int64_t ref_best_rd, + const uint32_t hash) { + int32_t match_index = -1; + if (ref_best_rd != INT64_MAX) { + for (int i = 0; i < mb_rd_record->num; ++i) { + const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN; + // If there is a match in the tx_rd_record, fetch the RD decision and + // terminate early. + if (mb_rd_record->tx_rd_info[index].hash_value == hash) { + match_index = index; + break; + } + } + } + return match_index; +} + +static AOM_INLINE void fetch_tx_rd_info(int n4, + const MB_RD_INFO *const tx_rd_info, + RD_STATS *const rd_stats, + MACROBLOCK *const x) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + mbmi->tx_size = tx_rd_info->tx_size; + memcpy(x->blk_skip, tx_rd_info->blk_skip, + sizeof(tx_rd_info->blk_skip[0]) * n4); + av1_copy(mbmi->inter_tx_size, tx_rd_info->inter_tx_size); + av1_copy_array(xd->tx_type_map, tx_rd_info->tx_type_map, n4); + *rd_stats = tx_rd_info->rd_stats; +} + +// Compute the pixel domain distortion from diff on all visible 4x4s in the +// transform block. +static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize, + unsigned int *block_mse_q8) { + int visible_rows, visible_cols; + const MACROBLOCKD *xd = &x->e_mbd; + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, + NULL, &visible_cols, &visible_rows); + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *diff = x->plane[plane].src_diff; + + diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2); + uint64_t sse = + aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows); + if (block_mse_q8 != NULL) { + if (visible_cols > 0 && visible_rows > 0) + *block_mse_q8 = + (unsigned int)((256 * sse) / (visible_cols * visible_rows)); + else + *block_mse_q8 = UINT_MAX; + } + return sse; +} + +// Uses simple features on top of DCT coefficients to quickly predict +// whether optimal RD decision is to skip encoding the residual. +// The sse value is stored in dist. +static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist, + int reduced_tx_set) { + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const MACROBLOCKD *xd = &x->e_mbd; + const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd); + + *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL); + + const int64_t mse = *dist / bw / bh; + // Normalized quantizer takes the transform upscaling factor (8 for tx size + // smaller than 32) into account. 
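+  // Worked example (illustrative values, not from the source): if dc_q were
+  // 1024, then normalized_dc_q = 1024 >> 3 = 128 and
+  // mse_thresh = 128 * 128 / 8 = 2048, i.e. a block whose mean squared error
+  // per pixel exceeds 2048 is predicted not to skip.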
+  const int16_t normalized_dc_q = dc_q >> 3;
+  const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8;
+  // For a faster early-skip decision, compare dist against the threshold so
+  // that the quality risk of the skip=1 decision is lower. Otherwise, use
+  // mse, since the fwd_txfm coefficient checks below will take care of
+  // quality.
+  // TODO(any): Use dist to return 0 when predict_skip_level is 1
+  int64_t pred_err = (x->predict_skip_level >= 2) ? *dist : mse;
+  // Predict not to skip when the error is larger than the threshold.
+  if (pred_err > mse_thresh) return 0;
+  // Otherwise, return skip for aggressive early termination.
+  else if (x->predict_skip_level >= 2)
+    return 1;
+
+  const int max_tx_size = max_predict_sf_tx_size[bsize];
+  const int tx_h = tx_size_high[max_tx_size];
+  const int tx_w = tx_size_wide[max_tx_size];
+  DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]);
+  TxfmParam param;
+  param.tx_type = DCT_DCT;
+  param.tx_size = max_tx_size;
+  param.bd = xd->bd;
+  param.is_hbd = is_cur_buf_hbd(xd);
+  param.lossless = 0;
+  param.tx_set_type = av1_get_ext_tx_set_type(
+      param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
+  const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2);
+  const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize];
+  const int16_t *src_diff = x->plane[0].src_diff;
+  const int n_coeff = tx_w * tx_h;
+  const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+  const uint32_t dc_thresh = max_qcoef_thresh * dc_q;
+  const uint32_t ac_thresh = max_qcoef_thresh * ac_q;
+  for (int row = 0; row < bh; row += tx_h) {
+    for (int col = 0; col < bw; col += tx_w) {
+      av1_fwd_txfm(src_diff + col, coefs, bw, &param);
+      // Operating on TX domain, not pixels; we want the QTX quantizers.
+      const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7);
+      if (dc_coef >= dc_thresh) return 0;
+      for (int i = 1; i < n_coeff; ++i) {
+        const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7);
+        if (ac_coef >= ac_thresh) return 0;
+      }
+    }
+    src_diff += tx_h * bw;
+  }
+  return 1;
+}
+
+// Used to set proper context for early termination with skip = 1.
+static AOM_INLINE void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats,
+                                     int bsize, int64_t dist) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int n4 = bsize_to_num_blk(bsize);
+  const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+  memset(xd->tx_type_map, DCT_DCT, sizeof(xd->tx_type_map[0]) * n4);
+  memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
+  mbmi->tx_size = tx_size;
+  for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1);
+  rd_stats->skip = 1;
+  if (is_cur_buf_hbd(xd)) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
+  rd_stats->dist = rd_stats->sse = (dist << 4);
+  // Though the decision here is to mark the block as skip based on luma
+  // stats, the block may still become non-skip after the chroma RD search.
+  // In addition, the intermediate non-skip costs calculated by the caller
+  // would be incorrect if the rate were set to zero (i.e., if zero_blk_rate
+  // were not accounted for). Hence an intermediate rate is populated here to
+  // code the luma tx blocks as skip; the caller then sets the final rate
+  // according to the final RD decision (i.e., skip vs. non-skip). The rate
+  // populated here corresponds to coding all the tx blocks in the current
+  // block with zero_blk_rate (based on the largest possible tx size). E.g.,
+  // for a 128x128 block, the rate would be 4 * zero_blk_rate, where
+  // zero_blk_rate corresponds to coding one 64x64 tx block as 'all zeros'.
+  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+  av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl);
+  ENTROPY_CONTEXT *ta = ctxa;
+  ENTROPY_CONTEXT *tl = ctxl;
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  TXB_CTX txb_ctx;
+  get_txb_ctx(bsize, tx_size, 0, ta, tl, &txb_ctx);
+  const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
+                                .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+  rd_stats->rate = zero_blk_rate *
+                   (block_size_wide[bsize] >> tx_size_wide_log2[tx_size]) *
+                   (block_size_high[bsize] >> tx_size_high_log2[tx_size]);
+}
+
+static AOM_INLINE void save_tx_rd_info(int n4, uint32_t hash,
+                                       const MACROBLOCK *const x,
+                                       const RD_STATS *const rd_stats,
+                                       MB_RD_RECORD *tx_rd_record) {
+  int index;
+  if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) {
+    index =
+        (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
+    ++tx_rd_record->num;
+  } else {
+    index = tx_rd_record->index_start;
+    tx_rd_record->index_start =
+        (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
+  }
+  MB_RD_INFO *const tx_rd_info = &tx_rd_record->tx_rd_info[index];
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  tx_rd_info->hash_value = hash;
+  tx_rd_info->tx_size = mbmi->tx_size;
+  memcpy(tx_rd_info->blk_skip, x->blk_skip,
+         sizeof(tx_rd_info->blk_skip[0]) * n4);
+  av1_copy(tx_rd_info->inter_tx_size, mbmi->inter_tx_size);
+  av1_copy_array(tx_rd_info->tx_type_map, xd->tx_type_map, n4);
+  tx_rd_info->rd_stats = *rd_stats;
+}
+
+static int get_search_init_depth(int mi_width, int mi_height, int is_inter,
+                                 const SPEED_FEATURES *sf,
+                                 int tx_size_search_method) {
+  if (tx_size_search_method == USE_LARGESTALL) return MAX_VARTX_DEPTH;
+
+  if (sf->tx_sf.tx_size_search_lgr_block) {
+    if (mi_width > mi_size_wide[BLOCK_64X64] ||
+        mi_height > mi_size_high[BLOCK_64X64])
+      return MAX_VARTX_DEPTH;
+  }
+
+  if (is_inter) {
+    return (mi_height != mi_width)
+               ? sf->tx_sf.inter_tx_size_search_init_depth_rect
+               : sf->tx_sf.inter_tx_size_search_init_depth_sqr;
+  } else {
+    return (mi_height != mi_width)
+               ?
sf->tx_sf.intra_tx_size_search_init_depth_rect + : sf->tx_sf.intra_tx_size_search_init_depth_sqr; + } +} + +static AOM_INLINE void select_tx_block( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd, + int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode, + TXB_RD_INFO_NODE *rd_info_node); + +// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values +// 0: Do not collect any RD stats +// 1: Collect RD stats for transform units +// 2: Collect RD stats for partition units +#if CONFIG_COLLECT_RD_STATS + +static AOM_INLINE void get_energy_distribution_fine( + const AV1_COMP *cpi, BLOCK_SIZE bsize, const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, int need_4th, double *hordist, + double *verdist) { + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) { + // Special cases: calculate 'esq' values manually, as we don't have 'vf' + // functions for the 16 (very small) sub-blocks of this block. + const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3; + const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 2 : 3; + assert(bw <= 32); + assert(bh <= 32); + assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15); + if (cpi->common.seq_params.use_highbitdepth) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (int i = 0; i < bh; ++i) + for (int j = 0; j < bw; ++j) { + const int index = (j >> w_shift) + ((i >> h_shift) << 2); + esq[index] += + (src16[j + i * src_stride] - dst16[j + i * dst_stride]) * + (src16[j + i * src_stride] - dst16[j + i * dst_stride]); + } + } else { + for (int i = 0; i < bh; ++i) + for (int j = 0; j < bw; ++j) { + const int index = (j >> w_shift) + ((i >> h_shift) << 2); + esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) * + (src[j + i * src_stride] - dst[j + i * dst_stride]); + } + } + } else { // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks. + const int f_index = + (bsize < BLOCK_SIZES) ? 
bsize - BLOCK_16X16 : bsize - BLOCK_8X16; + assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL); + const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index; + assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]); + assert(block_size_high[bsize] == 4 * block_size_high[subsize]); + cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]); + cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, + &esq[1]); + cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, + &esq[2]); + cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[3]); + src += bh / 4 * src_stride; + dst += bh / 4 * dst_stride; + + cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]); + cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, + &esq[5]); + cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, + &esq[6]); + cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[7]); + src += bh / 4 * src_stride; + dst += bh / 4 * dst_stride; + + cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]); + cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, + &esq[9]); + cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, + &esq[10]); + cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[11]); + src += bh / 4 * src_stride; + dst += bh / 4 * dst_stride; + + cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]); + cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, + &esq[13]); + cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, + &esq[14]); + cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[15]); + } + + double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] + + esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] + + esq[12] + esq[13] + esq[14] + esq[15]; + if (total > 0) { + const double e_recip = 1.0 / total; + hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip; + hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip; + hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip; + if (need_4th) { + hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip; + } + verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip; + verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip; + verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip; + if (need_4th) { + verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * e_recip; + } + } else { + hordist[0] = verdist[0] = 0.25; + hordist[1] = verdist[1] = 0.25; + hordist[2] = verdist[2] = 0.25; + if (need_4th) { + hordist[3] = verdist[3] = 0.25; + } + } +} + +static double get_sse_norm(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + const int err = diff[j * stride + i]; + sum += err * err; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static double get_sad_norm(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + sum += abs(diff[j * stride + i]); + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static AOM_INLINE void get_2x2_normalized_sses_and_sads( + const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const 
uint8_t *const src, + int src_stride, const uint8_t *const dst, int dst_stride, + const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr, + double *const sad_norm_arr) { + const BLOCK_SIZE tx_bsize_half = + get_partition_subsize(tx_bsize, PARTITION_SPLIT); + if (tx_bsize_half == BLOCK_INVALID) { // manually calculate stats + const int half_width = block_size_wide[tx_bsize] / 2; + const int half_height = block_size_high[tx_bsize] / 2; + for (int row = 0; row < 2; ++row) { + for (int col = 0; col < 2; ++col) { + const int16_t *const this_src_diff = + src_diff + row * half_height * diff_stride + col * half_width; + if (sse_norm_arr) { + sse_norm_arr[row * 2 + col] = + get_sse_norm(this_src_diff, diff_stride, half_width, half_height); + } + if (sad_norm_arr) { + sad_norm_arr[row * 2 + col] = + get_sad_norm(this_src_diff, diff_stride, half_width, half_height); + } + } + } + } else { // use function pointers to calculate stats + const int half_width = block_size_wide[tx_bsize_half]; + const int half_height = block_size_high[tx_bsize_half]; + const int num_samples_half = half_width * half_height; + for (int row = 0; row < 2; ++row) { + for (int col = 0; col < 2; ++col) { + const uint8_t *const this_src = + src + row * half_height * src_stride + col * half_width; + const uint8_t *const this_dst = + dst + row * half_height * dst_stride + col * half_width; + + if (sse_norm_arr) { + unsigned int this_sse; + cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst, + dst_stride, &this_sse); + sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half; + } + + if (sad_norm_arr) { + const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf( + this_src, src_stride, this_dst, dst_stride); + sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half; + } + } + } + } +} + +#if CONFIG_COLLECT_RD_STATS == 1 +static double get_mean(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + sum += diff[j * stride + i]; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} +static AOM_INLINE void PrintTransformUnitStats( + const AV1_COMP *const cpi, MACROBLOCK *x, const RD_STATS *const rd_stats, + int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + TX_TYPE tx_type, int64_t rd) { + if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; + + // Generate small sample to restrict output size. + static unsigned int seed = 21743; + if (lcg_rand16(&seed) % 256 > 0) return; + + const char output_file[] = "tu_stats.txt"; + FILE *fout = fopen(output_file, "a"); + if (!fout) return; + + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const MACROBLOCKD *const xd = &x->e_mbd; + const int plane = 0; + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int txw = tx_size_wide[tx_size]; + const int txh = tx_size_high[tx_size]; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? 
xd->bd - 5 : 3; + const int q_step = p->dequant_QTX[1] >> dequant_shift; + const int num_samples = txw * txh; + + const double rate_norm = (double)rd_stats->rate / num_samples; + const double dist_norm = (double)rd_stats->dist / num_samples; + + fprintf(fout, "%g %g", rate_norm, dist_norm); + + const int src_stride = p->src.stride; + const uint8_t *const src = + &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; + const int dst_stride = pd->dst.stride; + const uint8_t *const dst = + &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + unsigned int sse; + cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + const double sse_norm = (double)sse / num_samples; + + const unsigned int sad = + cpi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride); + const double sad_norm = (double)sad / num_samples; + + fprintf(fout, " %g %g", sse_norm, sad_norm); + + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *const src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2]; + + double sse_norm_arr[4], sad_norm_arr[4]; + get_2x2_normalized_sses_and_sads(cpi, tx_bsize, src, src_stride, dst, + dst_stride, src_diff, diff_stride, + sse_norm_arr, sad_norm_arr); + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sse_norm_arr[i]); + } + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sad_norm_arr[i]); + } + + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; + + fprintf(fout, " %d %d %d %d %d", q_step, tx_size_wide[tx_size], + tx_size_high[tx_size], tx_type_1d_row, tx_type_1d_col); + + int model_rate; + int64_t model_dist; + model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples, + &model_rate, &model_dist); + const double model_rate_norm = (double)model_rate / num_samples; + const double model_dist_norm = (double)model_dist / num_samples; + fprintf(fout, " %g %g", model_rate_norm, model_dist_norm); + + const double mean = get_mean(src_diff, diff_stride, txw, txh); + float hor_corr, vert_corr; + av1_get_horver_correlation_full(src_diff, diff_stride, txw, txh, &hor_corr, + &vert_corr); + fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr); + + double hdist[4] = { 0 }, vdist[4] = { 0 }; + get_energy_distribution_fine(cpi, tx_bsize, src, src_stride, dst, dst_stride, + 1, hdist, vdist); + fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2], + hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]); + + fprintf(fout, " %d %" PRId64, x->rdmult, rd); + + fprintf(fout, "\n"); + fclose(fout); +} +#endif // CONFIG_COLLECT_RD_STATS == 1 + +#if CONFIG_COLLECT_RD_STATS >= 2 +static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int64_t total_sse = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y); + unsigned int sse; + + if (x->skip_chroma_rd && plane) continue; + + cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + &sse); + total_sse += sse; + } + total_sse <<= 4; + return total_sse; +} + +static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize, + int64_t sse, int *est_residue_cost, + int64_t 
*est_dist) { + aom_clear_system_state(); + const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + if (md->ready) { + if (sse < md->dist_mean) { + *est_residue_cost = 0; + *est_dist = sse; + } else { + *est_dist = (int64_t)round(md->dist_mean); + const double est_ld = md->a * sse + md->b; + // Clamp estimated rate cost by INT_MAX / 2. + // TODO(angiebird@google.com): find better solution than clamping. + if (fabs(est_ld) < 1e-2) { + *est_residue_cost = INT_MAX / 2; + } else { + double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld); + if (est_residue_cost_dbl < 0) { + *est_residue_cost = 0; + } else { + *est_residue_cost = + (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2); + } + } + if (*est_residue_cost <= 0) { + *est_residue_cost = 0; + *est_dist = sse; + } + } + return 1; + } + return 0; +} + +static double get_highbd_diff_mean(const uint8_t *src8, int src_stride, + const uint8_t *dst8, int dst_stride, int w, + int h) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + const int diff = src[j * src_stride + i] - dst[j * dst_stride + i]; + sum += diff; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static double get_diff_mean(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + const int diff = src[j * src_stride + i] - dst[j * dst_stride + i]; + sum += diff; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static AOM_INLINE void PrintPredictionUnitStats(const AV1_COMP *const cpi, + const TileDataEnc *tile_data, + MACROBLOCK *x, + const RD_STATS *const rd_stats, + BLOCK_SIZE plane_bsize) { + if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; + + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 && + (tile_data == NULL || + !tile_data->inter_mode_rd_models[plane_bsize].ready)) + return; + (void)tile_data; + // Generate small sample to restrict output size. + static unsigned int seed = 95014; + + if ((lcg_rand16(&seed) % (1 << (14 - num_pels_log2_lookup[plane_bsize]))) != + 1) + return; + + const char output_file[] = "pu_stats.txt"; + FILE *fout = fopen(output_file, "a"); + if (!fout) return; + + MACROBLOCKD *const xd = &x->e_mbd; + const int plane = 0; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const int diff_stride = block_size_wide[plane_bsize]; + int bw, bh; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, + &bh); + const int num_samples = bw * bh; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? 
xd->bd - 5 : 3; + const int q_step = p->dequant_QTX[1] >> dequant_shift; + const int shift = (xd->bd - 8); + + const double rate_norm = (double)rd_stats->rate / num_samples; + const double dist_norm = (double)rd_stats->dist / num_samples; + const double rdcost_norm = + (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples; + + fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm); + + const int src_stride = p->src.stride; + const uint8_t *const src = p->src.buf; + const int dst_stride = pd->dst.stride; + const uint8_t *const dst = pd->dst.buf; + const int16_t *const src_diff = p->src_diff; + + int64_t sse = calculate_sse(xd, p, pd, bw, bh); + const double sse_norm = (double)sse / num_samples; + + const unsigned int sad = + cpi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride); + const double sad_norm = + (double)sad / (1 << num_pels_log2_lookup[plane_bsize]); + + fprintf(fout, " %g %g", sse_norm, sad_norm); + + double sse_norm_arr[4], sad_norm_arr[4]; + get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst, + dst_stride, src_diff, diff_stride, + sse_norm_arr, sad_norm_arr); + if (shift) { + for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift)); + for (int k = 0; k < 4; ++k) sad_norm_arr[k] /= (1 << shift); + } + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sse_norm_arr[i]); + } + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sad_norm_arr[i]); + } + + fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh); + + int model_rate; + int64_t model_dist; + model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples, + &model_rate, &model_dist); + const double model_rdcost_norm = + (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples; + const double model_rate_norm = (double)model_rate / num_samples; + const double model_dist_norm = (double)model_dist / num_samples; + fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm, + model_rdcost_norm); + + double mean; + if (is_cur_buf_hbd(xd)) { + mean = get_highbd_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + mean = get_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + bw, bh); + } + mean /= (1 << shift); + float hor_corr, vert_corr; + av1_get_horver_correlation_full(src_diff, diff_stride, bw, bh, &hor_corr, + &vert_corr); + fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr); + + double hdist[4] = { 0 }, vdist[4] = { 0 }; + get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst, + dst_stride, 1, hdist, vdist); + fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2], + hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]); + + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + assert(tile_data->inter_mode_rd_models[plane_bsize].ready); + const int64_t overall_sse = get_sse(cpi, x); + int est_residue_cost = 0; + int64_t est_dist = 0; + get_est_rate_dist(tile_data, plane_bsize, overall_sse, &est_residue_cost, + &est_dist); + const double est_residue_cost_norm = (double)est_residue_cost / num_samples; + const double est_dist_norm = (double)est_dist / num_samples; + const double est_rdcost_norm = + (double)RDCOST(x->rdmult, est_residue_cost, est_dist) / num_samples; + fprintf(fout, " %g %g %g", est_residue_cost_norm, est_dist_norm, + est_rdcost_norm); + } + + fprintf(fout, "\n"); + fclose(fout); +} +#endif // CONFIG_COLLECT_RD_STATS >= 2 +#endif // CONFIG_COLLECT_RD_STATS + +static AOM_INLINE void inverse_transform_block_facade(MACROBLOCKD *xd, + 
int plane, int block, + int blk_row, int blk_col, + int eob, + int reduced_tx_set) { + if (!eob) return; + + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block); + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, + tx_size, reduced_tx_set); + const int dst_stride = pd->dst.stride; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, + dst_stride, eob, reduced_tx_set); +} + +static INLINE void recon_intra(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, int skip_trellis, + TX_TYPE best_tx_type, int do_quant, + int *rate_cost, uint16_t best_eob) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + if (!is_inter && best_eob && + (blk_row + tx_size_high_unit[tx_size] < mi_size_high[plane_bsize] || + blk_col + tx_size_wide_unit[tx_size] < mi_size_wide[plane_bsize])) { + // if the quantized coefficients are stored in the dqcoeff buffer, we don't + // need to do transform and quantization again. + if (do_quant) { + TxfmParam txfm_param_intra; + QUANT_PARAM quant_param_intra; + av1_setup_xform(cm, x, tx_size, best_tx_type, &txfm_param_intra); + av1_setup_quant(tx_size, !skip_trellis, + skip_trellis + ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B + : AV1_XFORM_QUANT_FP) + : AV1_XFORM_QUANT_FP, + cpi->oxcf.quant_b_adapt, &quant_param_intra); + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, best_tx_type, + &quant_param_intra); + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, + &txfm_param_intra, &quant_param_intra); + if (quant_param_intra.use_optimize_b) { + av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx, + cpi->sf.rd_sf.trellis_eob_fast, rate_cost); + } + } + + inverse_transform_block_facade(xd, plane, block, blk_row, blk_col, + x->plane[plane].eobs[block], + cm->features.reduced_tx_set_used); + + // This may happen because of hash collision. The eob stored in the hash + // table is non-zero, but the real eob is zero. We need to make sure tx_type + // is DCT_DCT in this case. 
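+  // (Sketch of the collision case: two different residue blocks can map to
+  // the same CRC32C key, so the fetched eob belongs to another block; when
+  // the true eob is 0, the stored tx_type is reset to DCT_DCT below via
+  // update_txk_array() so the tx_type map stays consistent with an all-zero
+  // block.)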
+ if (plane == 0 && x->plane[plane].eobs[block] == 0 && + best_tx_type != DCT_DCT) { + update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); + } + } +} + +static unsigned pixel_dist_visible_only( + const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src, + const int src_stride, const uint8_t *dst, const int dst_stride, + const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows, + int visible_cols) { + unsigned sse; + + if (txb_rows == visible_rows && txb_cols == visible_cols) { + cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + return sse; + } + +#if CONFIG_AV1_HIGHBITDEPTH + const MACROBLOCKD *xd = &x->e_mbd; + if (is_cur_buf_hbd(xd)) { + uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride, + visible_cols, visible_rows); + return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2); + } +#else + (void)x; +#endif + sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols, + visible_rows); + return sse; +} + +// Compute the pixel domain distortion from src and dst on all visible 4x4s in +// the +// transform block. +static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, + int plane, const uint8_t *src, const int src_stride, + const uint8_t *dst, const int dst_stride, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { + int txb_rows, txb_cols, visible_rows, visible_cols; + const MACROBLOCKD *xd = &x->e_mbd; + + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, + &txb_cols, &txb_rows, &visible_cols, &visible_rows); + assert(visible_rows > 0); + assert(visible_cols > 0); + + unsigned sse = pixel_dist_visible_only(cpi, x, src, src_stride, dst, + dst_stride, tx_bsize, txb_rows, + txb_cols, visible_rows, visible_cols); + + return sse; +} + +static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x, + int plane, BLOCK_SIZE plane_bsize, + int block, int blk_row, int blk_col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const uint16_t eob = p->eobs[block]; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const int bsw = block_size_wide[tx_bsize]; + const int bsh = block_size_high[tx_bsize]; + const int src_stride = x->plane[plane].src.stride; + const int dst_stride = xd->plane[plane].dst.stride; + // Scale the transform block index to pixel unit. 
+ const int src_idx = (blk_row * src_stride + blk_col) << MI_SIZE_LOG2; + const int dst_idx = (blk_row * dst_stride + blk_col) << MI_SIZE_LOG2; + const uint8_t *src = &x->plane[plane].src.buf[src_idx]; + const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx]; + const tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block); + + assert(cpi != NULL); + assert(tx_size_wide_log2[0] == tx_size_high_log2[0]); + + uint8_t *recon; + DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]); + +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + recon = CONVERT_TO_BYTEPTR(recon16); + av1_highbd_convolve_2d_copy_sr(CONVERT_TO_SHORTPTR(dst), dst_stride, + CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw, + bsh, NULL, NULL, 0, 0, NULL, xd->bd); + } else { + recon = (uint8_t *)recon16; + av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL, + NULL, 0, 0, NULL); + } +#else + recon = (uint8_t *)recon16; + av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL, + NULL, 0, 0, NULL); +#endif + + const PLANE_TYPE plane_type = get_plane_type(plane); + TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cpi->common.features.reduced_tx_set_used); + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon, + MAX_TX_SIZE, eob, + cpi->common.features.reduced_tx_set_used); + + return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE, + blk_row, blk_col, plane_bsize, tx_bsize); +} + +static uint32_t get_intra_txb_hash(MACROBLOCK *x, int plane, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size) { + int16_t tmp_data[64 * 64]; + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *diff = x->plane[plane].src_diff; + const int16_t *cur_diff_row = diff + 4 * blk_row * diff_stride + 4 * blk_col; + const int txb_w = tx_size_wide[tx_size]; + const int txb_h = tx_size_high[tx_size]; + uint8_t *hash_data = (uint8_t *)cur_diff_row; + if (txb_w != diff_stride) { + int16_t *cur_hash_row = tmp_data; + for (int i = 0; i < txb_h; i++) { + memcpy(cur_hash_row, cur_diff_row, sizeof(*diff) * txb_w); + cur_hash_row += txb_w; + cur_diff_row += diff_stride; + } + hash_data = (uint8_t *)tmp_data; + } + CRC32C *crc = &x->mb_rd_record.crc_calculator; + const uint32_t hash = av1_get_crc32c_value(crc, hash_data, 2 * txb_w * txb_h); + return (hash << 5) + tx_size; +} + +// pruning thresholds for prune_txk_type and prune_txk_type_separ +static const int prune_factors[5] = { 200, 200, 120, 80, 40 }; // scale 1000 +static const int mul_factors[5] = { 80, 80, 70, 50, 30 }; // scale 100 + +static INLINE int is_intra_hash_match(const AV1_COMP *cpi, MACROBLOCK *x, + int plane, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, + TXB_RD_INFO **intra_txb_rd_info, + const int tx_type_map_idx, + uint16_t *cur_joint_ctx) { + MACROBLOCKD *xd = &x->e_mbd; + assert(cpi->sf.tx_sf.use_intra_txb_hash && + frame_is_intra_only(&cpi->common) && !is_inter_block(xd->mi[0]) && + plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size]); + const uint32_t intra_hash = + get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size); + const int intra_hash_idx = + find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash); + *intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx]; + *cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx; + if ((*intra_txb_rd_info)->entropy_context == *cur_joint_ctx && + 
x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) { + xd->tx_type_map[tx_type_map_idx] = (*intra_txb_rd_info)->tx_type; + const TX_TYPE ref_tx_type = + av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size, + cpi->common.features.reduced_tx_set_used); + return (ref_tx_type == (*intra_txb_rd_info)->tx_type); + } + return 0; +} + +// R-D costs are sorted in ascending order. +static INLINE void sort_rd(int64_t rds[], int txk[], int len) { + int i, j, k; + + for (i = 1; i <= len - 1; ++i) { + for (j = 0; j < i; ++j) { + if (rds[j] > rds[i]) { + int64_t temprd; + int tempi; + + temprd = rds[i]; + tempi = txk[i]; + + for (k = i; k > j; k--) { + rds[k] = rds[k - 1]; + txk[k] = txk[k - 1]; + } + + rds[j] = temprd; + txk[j] = tempi; + break; + } + } + } +} + +static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, int64_t *out_dist, + int64_t *out_sse) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + // Transform domain distortion computation is more efficient as it does + // not involve an inverse transform, but it is less accurate. + const int buffer_length = av1_get_max_eob(tx_size); + int64_t this_sse; + // TX-domain results need to shift down to Q2/D10 to match pixel + // domain distortion values which are in Q2^2 + int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *const coeff = p->coeff + block_offset; + tran_low_t *const dqcoeff = pd->dqcoeff + block_offset; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) + *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, + xd->bd); + else + *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse); +#else + *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse); +#endif + *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift); + *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift); +} + +uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, int *txk_map, + int16_t allowed_tx_mask, int prune_factor, + const TXB_CTX *const txb_ctx, + int reduced_tx_set_used, int64_t ref_best_rd, + int num_sel) { + const AV1_COMMON *cm = &cpi->common; + + int idx; + + int64_t rds_v[4]; + int64_t rds_h[4]; + int idx_v[4] = { 0, 1, 2, 3 }; + int idx_h[4] = { 0, 1, 2, 3 }; + int skip_v[4] = { 0 }; + int skip_h[4] = { 0 }; + const int idx_map[16] = { + DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT, + ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST, + FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST, + H_DCT, H_ADST, H_FLIPADST, IDTX + }; + + const int sel_pattern_v[16] = { + 0, 0, 1, 1, 0, 2, 1, 2, 2, 0, 3, 1, 3, 2, 3, 3 + }; + const int sel_pattern_h[16] = { + 0, 1, 0, 1, 2, 0, 2, 1, 2, 3, 0, 3, 1, 3, 2, 3 + }; + + QUANT_PARAM quant_param; + TxfmParam txfm_param; + av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); + av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.quant_b_adapt, + &quant_param); + int tx_type; + // to ensure we can try ones even outside of ext_tx_set of current block + // this function should only be called for size < 16 + assert(txsize_sqr_up_map[tx_size] <= TX_16X16); + txfm_param.tx_set_type = EXT_TX_SET_ALL16; + + int rate_cost = 0; + int64_t dist = 0, sse = 0; + // evaluate horizontal with vertical DCT + for (idx = 0; idx < 4; ++idx) { + tx_type = 
idx_map[idx]; + txfm_param.tx_type = tx_type; + + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + + dist_block_tx_domain(x, plane, block, tx_size, &dist, &sse); + + rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type, + txb_ctx, reduced_tx_set_used, 0); + + rds_h[idx] = RDCOST(x->rdmult, rate_cost, dist); + + if ((rds_h[idx] - (rds_h[idx] >> 2)) > ref_best_rd) { + skip_h[idx] = 1; + } + } + sort_rd(rds_h, idx_h, 4); + for (idx = 1; idx < 4; idx++) { + if (rds_h[idx] > rds_h[0] * 1.2) skip_h[idx_h[idx]] = 1; + } + + if (skip_h[idx_h[0]]) return (uint16_t)0xFFFF; + + // evaluate vertical with the best horizontal chosen + rds_v[0] = rds_h[0]; + int start_v = 1, end_v = 4; + const int *idx_map_v = idx_map + idx_h[0]; + + for (idx = start_v; idx < end_v; ++idx) { + tx_type = idx_map_v[idx_v[idx] * 4]; + txfm_param.tx_type = tx_type; + + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + + dist_block_tx_domain(x, plane, block, tx_size, &dist, &sse); + + rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type, + txb_ctx, reduced_tx_set_used, 0); + + rds_v[idx] = RDCOST(x->rdmult, rate_cost, dist); + + if ((rds_v[idx] - (rds_v[idx] >> 2)) > ref_best_rd) { + skip_v[idx] = 1; + } + } + sort_rd(rds_v, idx_v, 4); + for (idx = 1; idx < 4; idx++) { + if (rds_v[idx] > rds_v[0] * 1.2) skip_v[idx_v[idx]] = 1; + } + + // combine rd_h and rd_v to prune tx candidates + int i_v, i_h; + int64_t rds[16]; + int num_cand = 0, last = TX_TYPES - 1; + + for (int i = 0; i < 16; i++) { + i_v = sel_pattern_v[i]; + i_h = sel_pattern_h[i]; + tx_type = idx_map[idx_v[i_v] * 4 + idx_h[i_h]]; + if (!(allowed_tx_mask & (1 << tx_type)) || skip_h[idx_h[i_h]] || + skip_v[idx_v[i_v]]) { + txk_map[last] = tx_type; + last--; + } else { + txk_map[num_cand] = tx_type; + rds[num_cand] = rds_v[i_v] + rds_h[i_h]; + if (rds[num_cand] == 0) rds[num_cand] = 1; + num_cand++; + } + } + sort_rd(rds, txk_map, num_cand); + + uint16_t prune = (uint16_t)(~(1 << txk_map[0])); + num_sel = AOMMIN(num_sel, num_cand); + + for (int i = 1; i < num_sel; i++) { + int64_t factor = 1800 * (rds[i] - rds[0]) / (rds[0]); + if (factor < (int64_t)prune_factor) + prune &= ~(1 << txk_map[i]); + else + break; + } + return prune; +} + +uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, int *txk_map, + uint16_t allowed_tx_mask, int prune_factor, + const TXB_CTX *const txb_ctx, int reduced_tx_set_used) { + const AV1_COMMON *cm = &cpi->common; + int tx_type; + + int64_t rds[TX_TYPES]; + + int num_cand = 0; + int last = TX_TYPES - 1; + + TxfmParam txfm_param; + QUANT_PARAM quant_param; + av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); + av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.quant_b_adapt, + &quant_param); + + for (int idx = 0; idx < TX_TYPES; idx++) { + tx_type = idx; + int rate_cost = 0; + int64_t dist = 0, sse = 0; + if (!(allowed_tx_mask & (1 << tx_type))) { + txk_map[last] = tx_type; + last--; + continue; + } + txfm_param.tx_type = tx_type; + + // do txfm and quantization + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + // estimate rate cost + rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type, + txb_ctx, reduced_tx_set_used, 0); + // tx domain dist + dist_block_tx_domain(x, plane, block, tx_size, &dist, &sse); + + txk_map[num_cand] = 
tx_type;
+    rds[num_cand] = RDCOST(x->rdmult, rate_cost, dist);
+    if (rds[num_cand] == 0) rds[num_cand] = 1;
+    num_cand++;
+  }
+
+  if (num_cand == 0) return (uint16_t)0xFFFF;
+
+  sort_rd(rds, txk_map, num_cand);
+  uint16_t prune = (uint16_t)(~(1 << txk_map[0]));
+
+  // 0 < prune_factor <= 1000 controls aggressiveness
+  int64_t factor = 0;
+  for (int idx = 1; idx < num_cand; idx++) {
+    factor = 1000 * (rds[idx] - rds[0]) / rds[0];
+    if (factor < (int64_t)prune_factor)
+      prune &= ~(1 << txk_map[idx]);
+    else
+      break;
+  }
+  return prune;
+}
+
+// These thresholds were calibrated to provide a certain number of TX types
+// pruned by the model on average, i.e., selecting a threshold with index i
+// will lead to pruning i+1 TX types on average.
+static const float *prune_2D_adaptive_thresholds[] = {
+  // TX_4X4
+  (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f,
+             0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f,
+             0.09778f, 0.11780f },
+  // TX_8X8
+  (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f,
+             0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f,
+             0.10803f, 0.14124f },
+  // TX_16X16
+  (float[]){ 0.01404f, 0.02000f, 0.04211f, 0.05164f, 0.05798f, 0.06335f,
+             0.06897f, 0.07629f, 0.08875f, 0.11169f },
+  // TX_32X32
+  NULL,
+  // TX_64X64
+  NULL,
+  // TX_4X8
+  (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f,
+             0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f,
+             0.10168f, 0.12585f },
+  // TX_8X4
+  (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f,
+             0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f,
+             0.10583f, 0.13123f },
+  // TX_8X16
+  (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f,
+             0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f,
+             0.10730f, 0.14221f },
+  // TX_16X8
+  (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f,
+             0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f,
+             0.10339f, 0.13464f },
+  // TX_16X32
+  NULL,
+  // TX_32X16
+  NULL,
+  // TX_32X64
+  NULL,
+  // TX_64X32
+  NULL,
+  // TX_4X16
+  (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f,
+             0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f,
+             0.10242f, 0.12878f },
+  // TX_16X4
+  (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f,
+             0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f,
+             0.10217f, 0.12610f },
+  // TX_8X32
+  NULL,
+  // TX_32X8
+  NULL,
+  // TX_16X64
+  NULL,
+  // TX_64X16
+  NULL,
+};
+
+// Probabilities are sorted in descending order.
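+// (A simple insertion sort over at most TX_TYPES == 16 entries; e.g. prob
+// { 0.1f, 0.5f, 0.2f } with txk { 0, 1, 2 } comes out as { 0.5f, 0.2f, 0.1f }
+// and { 1, 2, 0 }.)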
+static INLINE void sort_probability(float prob[], int txk[], int len) { + int i, j, k; + + for (i = 1; i <= len - 1; ++i) { + for (j = 0; j < i; ++j) { + if (prob[j] < prob[i]) { + float temp; + int tempi; + + temp = prob[i]; + tempi = txk[i]; + + for (k = i; k > j; k--) { + prob[k] = prob[k - 1]; + txk[k] = txk[k - 1]; + } + + prob[j] = temp; + txk[j] = tempi; + break; + } + } + } +} + +static INLINE float get_adaptive_thresholds(TX_SIZE tx_size, + TxSetType tx_set_type, + TX_TYPE_PRUNE_MODE prune_mode) { + const int prune_aggr_table[4][2] = { { 4, 1 }, { 6, 3 }, { 9, 6 }, { 9, 6 } }; + int pruning_aggressiveness = 0; + if (tx_set_type == EXT_TX_SET_ALL16) + pruning_aggressiveness = + prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][0]; + else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) + pruning_aggressiveness = + prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][1]; + + return prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness]; +} + +static AOM_INLINE void get_energy_distribution_finer(const int16_t *diff, + int stride, int bw, int bh, + float *hordist, + float *verdist) { + // First compute downscaled block energy values (esq); downscale factors + // are defined by w_shift and h_shift. + unsigned int esq[256]; + const int w_shift = bw <= 8 ? 0 : 1; + const int h_shift = bh <= 8 ? 0 : 1; + const int esq_w = bw >> w_shift; + const int esq_h = bh >> h_shift; + const int esq_sz = esq_w * esq_h; + int i, j; + memset(esq, 0, esq_sz * sizeof(esq[0])); + if (w_shift) { + for (i = 0; i < bh; i++) { + unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w; + const int16_t *cur_diff_row = diff + i * stride; + for (j = 0; j < bw; j += 2) { + cur_esq_row[j >> 1] += (cur_diff_row[j] * cur_diff_row[j] + + cur_diff_row[j + 1] * cur_diff_row[j + 1]); + } + } + } else { + for (i = 0; i < bh; i++) { + unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w; + const int16_t *cur_diff_row = diff + i * stride; + for (j = 0; j < bw; j++) { + cur_esq_row[j] += cur_diff_row[j] * cur_diff_row[j]; + } + } + } + + uint64_t total = 0; + for (i = 0; i < esq_sz; i++) total += esq[i]; + + // Output hordist and verdist arrays are normalized 1D projections of esq + if (total == 0) { + float hor_val = 1.0f / esq_w; + for (j = 0; j < esq_w - 1; j++) hordist[j] = hor_val; + float ver_val = 1.0f / esq_h; + for (i = 0; i < esq_h - 1; i++) verdist[i] = ver_val; + return; + } + + const float e_recip = 1.0f / (float)total; + memset(hordist, 0, (esq_w - 1) * sizeof(hordist[0])); + memset(verdist, 0, (esq_h - 1) * sizeof(verdist[0])); + const unsigned int *cur_esq_row; + for (i = 0; i < esq_h - 1; i++) { + cur_esq_row = esq + i * esq_w; + for (j = 0; j < esq_w - 1; j++) { + hordist[j] += (float)cur_esq_row[j]; + verdist[i] += (float)cur_esq_row[j]; + } + verdist[i] += (float)cur_esq_row[j]; + } + cur_esq_row = esq + i * esq_w; + for (j = 0; j < esq_w - 1; j++) hordist[j] += (float)cur_esq_row[j]; + + for (j = 0; j < esq_w - 1; j++) hordist[j] *= e_recip; + for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip; +} + +static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, + int blk_row, int blk_col, TxSetType tx_set_type, + TX_TYPE_PRUNE_MODE prune_mode, int *txk_map, + uint16_t *allowed_tx_mask) { + int tx_type_table_2D[16] = { + DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT, + ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST, + FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST, + H_DCT, H_ADST, H_FLIPADST, IDTX + }; + if (tx_set_type != EXT_TX_SET_ALL16 && + tx_set_type != 
EXT_TX_SET_DTT9_IDTX_1DDCT) + return; +#if CONFIG_NN_V2 + NN_CONFIG_V2 *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size]; + NN_CONFIG_V2 *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size]; +#else + const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size]; + const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size]; +#endif + if (!nn_config_hor || !nn_config_ver) return; // Model not established yet. + + aom_clear_system_state(); + float hfeatures[16], vfeatures[16]; + float hscores[4], vscores[4]; + float scores_2D_raw[16]; + float scores_2D[16]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + const int hfeatures_num = bw <= 8 ? bw : bw / 2; + const int vfeatures_num = bh <= 8 ? bh : bh / 2; + assert(hfeatures_num <= 16); + assert(vfeatures_num <= 16); + + const struct macroblock_plane *const p = &x->plane[0]; + const int diff_stride = block_size_wide[bsize]; + const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col; + get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures, + vfeatures); + av1_get_horver_correlation_full(diff, diff_stride, bw, bh, + &hfeatures[hfeatures_num - 1], + &vfeatures[vfeatures_num - 1]); + aom_clear_system_state(); +#if CONFIG_NN_V2 + av1_nn_predict_v2(hfeatures, nn_config_hor, 0, hscores); + av1_nn_predict_v2(vfeatures, nn_config_ver, 0, vscores); +#else + av1_nn_predict(hfeatures, nn_config_hor, 1, hscores); + av1_nn_predict(vfeatures, nn_config_ver, 1, vscores); +#endif + aom_clear_system_state(); + + for (int i = 0; i < 4; i++) { + float *cur_scores_2D = scores_2D_raw + i * 4; + cur_scores_2D[0] = vscores[i] * hscores[0]; + cur_scores_2D[1] = vscores[i] * hscores[1]; + cur_scores_2D[2] = vscores[i] * hscores[2]; + cur_scores_2D[3] = vscores[i] * hscores[3]; + } + + av1_nn_softmax(scores_2D_raw, scores_2D, 16); + + const float score_thresh = + get_adaptive_thresholds(tx_size, tx_set_type, prune_mode); + + // Always keep the TX type with the highest score, prune all others with + // score below score_thresh. 
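+  // (Illustration with made-up numbers: given score_thresh == 0.02 and
+  // softmax scores { 0.60f, 0.25f, 0.015f, ... }, the third type is pruned
+  // unless it happens to carry the maximum score among the allowed types.)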
+ int max_score_i = 0; + float max_score = 0.0f; + uint16_t allow_bitmask = 0; + float sum_score = 0.0; + // Calculate sum of allowed tx type score and Populate allow bit mask based + // on score_thresh and allowed_tx_mask + for (int tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) { + int allow_tx_type = *allowed_tx_mask & (1 << tx_type_table_2D[tx_idx]); + if (scores_2D[tx_idx] > max_score && allow_tx_type) { + max_score = scores_2D[tx_idx]; + max_score_i = tx_idx; + } + if (scores_2D[tx_idx] >= score_thresh && allow_tx_type) { + // Set allow mask based on score_thresh + allow_bitmask |= (1 << tx_type_table_2D[tx_idx]); + + // Accumulate score of allowed tx type + sum_score += scores_2D[tx_idx]; + } + } + if (!((allow_bitmask >> max_score_i) & 0x01)) { + // Set allow mask based on tx type with max score + allow_bitmask |= (1 << tx_type_table_2D[max_score_i]); + sum_score += scores_2D[max_score_i]; + } + // Sort tx type probability of all types + sort_probability(scores_2D, tx_type_table_2D, TX_TYPES); + + // Enable more pruning based on tx type probability and number of allowed tx + // types + if (prune_mode == PRUNE_2D_AGGRESSIVE) { + float temp_score = 0.0; + float score_ratio = 0.0; + int tx_idx, tx_count = 0; + const float inv_sum_score = 100 / sum_score; + // Get allowed tx types based on sorted probability score and tx count + for (tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) { + // Skip the tx type which has more than 30% of cumulative + // probability and allowed tx type count is more than 2 + if (score_ratio > 30.0 && tx_count >= 2) break; + + // Calculate cumulative probability of allowed tx types + if (allow_bitmask & (1 << tx_type_table_2D[tx_idx])) { + // Calculate cumulative probability + temp_score += scores_2D[tx_idx]; + + // Calculate percentage of cumulative probability of allowed tx type + score_ratio = temp_score * inv_sum_score; + tx_count++; + } + } + // Set remaining tx types as pruned + for (; tx_idx < TX_TYPES; tx_idx++) + allow_bitmask &= ~(1 << tx_type_table_2D[tx_idx]); + } + memcpy(txk_map, tx_type_table_2D, sizeof(tx_type_table_2D)); + *allowed_tx_mask = allow_bitmask; +} + +static float get_dev(float mean, double x2_sum, int num) { + const float e_x2 = (float)(x2_sum / num); + const float diff = e_x2 - mean * mean; + const float dev = (diff > 0) ? sqrtf(diff) : 0; + return dev; +} + +// Feature used by the model to predict tx split: the mean and standard +// deviation values of the block and sub-blocks. +static AOM_INLINE void get_mean_dev_features(const int16_t *data, int stride, + int bw, int bh, float *feature) { + const int16_t *const data_ptr = &data[0]; + const int subh = (bh >= bw) ? (bh >> 1) : bh; + const int subw = (bw >= bh) ? (bw >> 1) : bw; + const int num = bw * bh; + const int sub_num = subw * subh; + int feature_idx = 2; + int total_x_sum = 0; + int64_t total_x2_sum = 0; + int blk_idx = 0; + double mean2_sum = 0.0f; + float dev_sum = 0.0f; + + for (int row = 0; row < bh; row += subh) { + for (int col = 0; col < bw; col += subw) { + int x_sum; + int64_t x2_sum; + // TODO(any): Write a SIMD version. Clear registers. 
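+      // aom_get_blk_sse_sum() computes the sub-block's residual sum (x_sum)
+      // and sum of squares (x2_sum); the mean and deviation features then
+      // follow as E[x] = x_sum / sub_num and sqrt(E[x^2] - E[x]^2) in
+      // get_dev().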
+ aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh, + &x_sum, &x2_sum); + total_x_sum += x_sum; + total_x2_sum += x2_sum; + + aom_clear_system_state(); + const float mean = (float)x_sum / sub_num; + const float dev = get_dev(mean, (double)x2_sum, sub_num); + feature[feature_idx++] = mean; + feature[feature_idx++] = dev; + mean2_sum += (double)(mean * mean); + dev_sum += dev; + blk_idx++; + } + } + + const float lvl0_mean = (float)total_x_sum / num; + feature[0] = lvl0_mean; + feature[1] = get_dev(lvl0_mean, (double)total_x2_sum, num); + + if (blk_idx > 1) { + // Deviation of means. + feature[feature_idx++] = get_dev(lvl0_mean, mean2_sum, blk_idx); + // Mean of deviations. + feature[feature_idx++] = dev_sum / blk_idx; + } +} + +static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row, + int blk_col, TX_SIZE tx_size) { + const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size]; + if (!nn_config) return -1; + + const int diff_stride = block_size_wide[bsize]; + const int16_t *diff = + x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + aom_clear_system_state(); + + float features[64] = { 0.0f }; + get_mean_dev_features(diff, diff_stride, bw, bh, features); + + float score = 0.0f; + av1_nn_predict(features, nn_config, 1, &score); + aom_clear_system_state(); + + int int_score = (int)(score * 10000); + return clamp(int_score, -80000, 80000); +} + +static INLINE uint16_t +get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, + int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, FAST_TX_SEARCH_MODE ftxs_mode, + int64_t ref_best_rd, TX_TYPE *allowed_txk_types, int *txk_map) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY; + // if txk_allowed = TX_TYPES, >1 tx types are allowed, else, if txk_allowed < + // TX_TYPES, only that specific tx type is allowed. + TX_TYPE txk_allowed = TX_TYPES; + + if ((!is_inter && x->use_default_intra_tx_type) || + (is_inter && x->use_default_inter_tx_type)) { + txk_allowed = + get_default_tx_type(0, xd, tx_size, cpi->is_screen_content_type); + } else if (x->rd_model == LOW_TXFM_RD) { + if (plane == 0) txk_allowed = DCT_DCT; + } + + const TxSetType tx_set_type = av1_get_ext_tx_set_type( + tx_size, is_inter, cm->features.reduced_tx_set_used); + + TX_TYPE uv_tx_type = DCT_DCT; + if (plane) { + // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y + uv_tx_type = txk_allowed = + av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + } + PREDICTION_MODE intra_dir = + mbmi->filter_intra_mode_info.use_filter_intra + ? fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode] + : mbmi->mode; + uint16_t ext_tx_used_flag = + cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset && + tx_set_type == EXT_TX_SET_DTT4_IDTX_1DDCT + ? 
av1_reduced_intra_tx_used_flag[intra_dir] + : av1_ext_tx_used_flag[tx_set_type]; + if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 || + ext_tx_used_flag == 0x0001 || + (is_inter && cpi->oxcf.use_inter_dct_only) || + (!is_inter && cpi->oxcf.use_intra_dct_only)) { + txk_allowed = DCT_DCT; + } + + if (cpi->oxcf.enable_flip_idtx == 0) ext_tx_used_flag &= DCT_ADST_TX_MASK; + + uint16_t allowed_tx_mask = 0; // 1: allow; 0: skip. + if (txk_allowed < TX_TYPES) { + allowed_tx_mask = 1 << txk_allowed; + allowed_tx_mask &= ext_tx_used_flag; + } else if (fast_tx_search) { + allowed_tx_mask = 0x0c01; // V_DCT, H_DCT, DCT_DCT + allowed_tx_mask &= ext_tx_used_flag; + } else { + assert(plane == 0); + allowed_tx_mask = ext_tx_used_flag; + int num_allowed = 0; + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const int *tx_type_probs = + cpi->frame_probs.tx_type_probs[update_type][tx_size]; + int i; + + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { + static const int thresh_arr[2][7] = { { 10, 15, 15, 10, 15, 15, 15 }, + { 10, 17, 17, 10, 17, 17, 17 } }; + const int thresh = + thresh_arr[cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats - 1] + [update_type]; + uint16_t prune = 0; + int max_prob = -1; + int max_idx = 0; + for (i = 0; i < TX_TYPES; i++) { + if (tx_type_probs[i] > max_prob && (allowed_tx_mask & (1 << i))) { + max_prob = tx_type_probs[i]; + max_idx = i; + } + if (tx_type_probs[i] < thresh) prune |= (1 << i); + } + if ((prune >> max_idx) & 0x01) prune &= ~(1 << max_idx); + allowed_tx_mask &= (~prune); + } + for (i = 0; i < TX_TYPES; i++) { + if (allowed_tx_mask & (1 << i)) num_allowed++; + } + assert(num_allowed > 0); + + if (num_allowed > 2 && cpi->sf.tx_sf.tx_type_search.prune_tx_type_est_rd) { + int pf = prune_factors[x->prune_mode]; + int mf = mul_factors[x->prune_mode]; + if (num_allowed <= 7) { + const uint16_t prune = + prune_txk_type(cpi, x, plane, block, tx_size, blk_row, blk_col, + plane_bsize, txk_map, allowed_tx_mask, pf, txb_ctx, + cm->features.reduced_tx_set_used); + allowed_tx_mask &= (~prune); + } else { + const int num_sel = (num_allowed * mf + 50) / 100; + const uint16_t prune = prune_txk_type_separ( + cpi, x, plane, block, tx_size, blk_row, blk_col, plane_bsize, + txk_map, allowed_tx_mask, pf, txb_ctx, + cm->features.reduced_tx_set_used, ref_best_rd, num_sel); + + allowed_tx_mask &= (~prune); + } + } else { + assert(num_allowed > 0); + int allowed_tx_count = (x->prune_mode == PRUNE_2D_AGGRESSIVE) ? 1 : 5; + // !fast_tx_search && txk_end != txk_start && plane == 0 + if (x->prune_mode >= PRUNE_2D_ACCURATE && is_inter && + num_allowed > allowed_tx_count) { + prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type, + x->prune_mode, txk_map, &allowed_tx_mask); + } + } + } + + // Need to have at least one transform type allowed. + if (allowed_tx_mask == 0) { + txk_allowed = (plane ? 
uv_tx_type : DCT_DCT); + allowed_tx_mask = (1 << txk_allowed); + } + + assert(IMPLIES(txk_allowed < TX_TYPES, allowed_tx_mask == 1 << txk_allowed)); + *allowed_txk_types = txk_allowed; + return allowed_tx_mask; +} + +#if CONFIG_RD_DEBUG +static INLINE void update_txb_coeff_cost(RD_STATS *rd_stats, int plane, + TX_SIZE tx_size, int blk_row, + int blk_col, int txb_coeff_cost) { + (void)blk_row; + (void)blk_col; + (void)tx_size; + rd_stats->txb_coeff_cost[plane] += txb_coeff_cost; + + { + const int txb_h = tx_size_high_unit[tx_size]; + const int txb_w = tx_size_wide_unit[tx_size]; + int idx, idy; + for (idy = 0; idy < txb_h; ++idy) + for (idx = 0; idx < txb_w; ++idx) + rd_stats->txb_coeff_cost_map[plane][blk_row + idy][blk_col + idx] = 0; + + rd_stats->txb_coeff_cost_map[plane][blk_row][blk_col] = txb_coeff_cost; + } + assert(blk_row < TXB_COEFF_COST_MAP_SIZE); + assert(blk_col < TXB_COEFF_COST_MAP_SIZE); +} +#endif + +static INLINE int cost_coeffs(MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, const TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, + int use_fast_coef_costing, + int reduced_tx_set_used) { +#if TXCOEFF_COST_TIMER + struct aom_usec_timer timer; + aom_usec_timer_start(&timer); +#endif + (void)use_fast_coef_costing; + const int cost = av1_cost_coeffs_txb(x, plane, block, tx_size, tx_type, + txb_ctx, reduced_tx_set_used); +#if TXCOEFF_COST_TIMER + AV1_COMMON *tmp_cm = (AV1_COMMON *)&cpi->common; + aom_usec_timer_mark(&timer); + const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); + tmp_cm->txcoeff_cost_timer += elapsed_time; + ++tmp_cm->txcoeff_cost_count; +#endif + return cost; +} + +// Search for the best transform type for a given transform block. +// This function can be used for both inter and intra, both luma and chroma. +static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, + FAST_TX_SEARCH_MODE ftxs_mode, + int use_fast_coef_costing, int skip_trellis, + int64_t ref_best_rd, RD_STATS *best_rd_stats) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + MB_MODE_INFO *mbmi = xd->mi[0]; + int64_t best_rd = INT64_MAX; + uint16_t best_eob = 0; + TX_TYPE best_tx_type = DCT_DCT; + int rate_cost = 0; + // The buffer used to swap dqcoeff in macroblockd_plane so we can keep dqcoeff + // of the best tx_type + DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]); + tran_low_t *orig_dqcoeff = pd->dqcoeff; + tran_low_t *best_dqcoeff = this_dqcoeff; + const int tx_type_map_idx = + plane ? 0 : blk_row * xd->tx_type_map_stride + blk_col; + av1_invalid_rd_stats(best_rd_stats); + + skip_trellis |= !is_trellis_used(cpi->optimize_seg_arr[xd->mi[0]->segment_id], + DRY_RUN_NORMAL); + + // Hashing based speed feature for intra block. If the hash of the residue + // is found in the hash table, use the previous RD search results stored in + // the table and terminate early. 
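+  // Roughly: when is_intra_hash_match() below finds a stored record, its
+  // rate/dist/sse/eob/tx_type are copied straight into best_rd_stats and the
+  // per-tx-type search loop further down is skipped entirely.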
+  TXB_RD_INFO *intra_txb_rd_info = NULL;
+  uint16_t cur_joint_ctx = 0;
+  const int is_inter = is_inter_block(mbmi);
+  const int use_intra_txb_hash =
+      cpi->sf.tx_sf.use_intra_txb_hash && frame_is_intra_only(cm) &&
+      !is_inter && plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size];
+  if (use_intra_txb_hash) {
+    const int mi_row = xd->mi_row;
+    const int mi_col = xd->mi_col;
+    const int within_border =
+        mi_row >= xd->tile.mi_row_start &&
+        (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
+        mi_col >= xd->tile.mi_col_start &&
+        (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
+    if (within_border &&
+        is_intra_hash_match(cpi, x, plane, blk_row, blk_col, plane_bsize,
+                            tx_size, txb_ctx, &intra_txb_rd_info,
+                            tx_type_map_idx, &cur_joint_ctx)) {
+      best_rd_stats->rate = intra_txb_rd_info->rate;
+      best_rd_stats->dist = intra_txb_rd_info->dist;
+      best_rd_stats->sse = intra_txb_rd_info->sse;
+      best_rd_stats->skip = intra_txb_rd_info->eob == 0;
+      x->plane[plane].eobs[block] = intra_txb_rd_info->eob;
+      x->plane[plane].txb_entropy_ctx[block] =
+          intra_txb_rd_info->txb_entropy_ctx;
+      best_eob = intra_txb_rd_info->eob;
+      best_tx_type = intra_txb_rd_info->tx_type;
+      skip_trellis |= !intra_txb_rd_info->perform_block_coeff_opt;
+      update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type);
+      recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                  txb_ctx, skip_trellis, best_tx_type, 1, &rate_cost, best_eob);
+      pd->dqcoeff = orig_dqcoeff;
+      return;
+    }
+  }
+
+  uint8_t best_txb_ctx = 0;
+  // txk_allowed == TX_TYPES: more than one tx type is allowed.
+  // txk_allowed < TX_TYPES: only that specific tx type is allowed.
+  TX_TYPE txk_allowed = TX_TYPES;
+  int txk_map[TX_TYPES] = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  };
+  // Bit mask to indicate which transform types are allowed in the RD search.
+  const uint16_t allowed_tx_mask =
+      get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                  txb_ctx, ftxs_mode, ref_best_rd, &txk_allowed, txk_map);
+
+  unsigned int block_mse_q8;
+  int64_t block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
+                                      txsize_to_bsize[tx_size], &block_mse_q8);
+  assert(block_mse_q8 != UINT_MAX);
+  if (is_cur_buf_hbd(xd)) {
+    block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
+    block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
+  }
+  block_sse *= 16;
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
+  // Use an mse / qstep^2 based threshold to decide whether to run R-D
+  // optimization of coeffs. For smaller residuals, coeff optimization
+  // tends to help; for larger residuals, it may not be effective.
+  // TODO(any): Experiment with variance and mean based thresholds
+  const int perform_block_coeff_opt =
+      ((uint64_t)block_mse_q8 <=
+       (uint64_t)x->coeff_opt_dist_threshold * qstep * qstep);
+  skip_trellis |= !perform_block_coeff_opt;
+
+  // Flag to indicate whether distortion should be calculated in the transform
+  // domain while iterating through the transform type candidates.
+  // Transform domain distortion is accurate for higher residuals.
+  // TODO(any): Experiment with variance and mean based thresholds
+  int use_transform_domain_distortion =
+      (x->use_transform_domain_distortion > 0) &&
+      (block_mse_q8 >= x->tx_domain_dist_threshold) &&
+      // Any 64-pt transform only preserves half the coefficients.
+      // Therefore transform domain distortion is not valid for these
+      // transform sizes.
+      txsize_sqr_up_map[tx_size] != TX_64X64;
+  // Flag to indicate whether an extra calculation of distortion in the pixel
+  // domain should be performed at the end, after the best transform type has
+  // been decided.
+  int calc_pixel_domain_distortion_final =
+      x->use_transform_domain_distortion == 1 &&
+      use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD;
+  if (calc_pixel_domain_distortion_final &&
+      (txk_allowed < TX_TYPES || allowed_tx_mask == 0x0001))
+    calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0;
+
+  const uint16_t *eobs_ptr = x->plane[plane].eobs;
+
+  TxfmParam txfm_param;
+  QUANT_PARAM quant_param;
+  av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+  av1_setup_quant(tx_size, !skip_trellis,
+                  skip_trellis ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B
+                                                         : AV1_XFORM_QUANT_FP)
+                               : AV1_XFORM_QUANT_FP,
+                  cpi->oxcf.quant_b_adapt, &quant_param);
+
+  // Iterate through all transform type candidates.
+  for (int idx = 0; idx < TX_TYPES; ++idx) {
+    const TX_TYPE tx_type = (TX_TYPE)txk_map[idx];
+    if (!(allowed_tx_mask & (1 << tx_type))) continue;
+    txfm_param.tx_type = tx_type;
+    if (av1_use_qmatrix(&cm->quant_params, xd, mbmi->segment_id)) {
+      av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                        &quant_param);
+    }
+    if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type;
+    RD_STATS this_rd_stats;
+    av1_invalid_rd_stats(&this_rd_stats);
+
+    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+                    &quant_param);
+
+    // Calculate rate cost of quantized coefficients.
+    if (quant_param.use_optimize_b) {
+      if (cpi->sf.rd_sf.optimize_b_precheck && best_rd < INT64_MAX &&
+          eobs_ptr[block] >= 4) {
+        // Calculate distortion quickly in transform domain.
+        dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+                             &this_rd_stats.sse);
+
+        const int64_t best_rd_ = AOMMIN(best_rd, ref_best_rd);
+        const int64_t dist_cost_estimate =
+            RDCOST(x->rdmult, 0, AOMMIN(this_rd_stats.dist, this_rd_stats.sse));
+        if (dist_cost_estimate - (dist_cost_estimate >> 3) > best_rd_) continue;
+      }
+      av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
+                     cpi->sf.rd_sf.trellis_eob_fast, &rate_cost);
+    } else {
+      rate_cost =
+          cost_coeffs(x, plane, block, tx_size, tx_type, txb_ctx,
+                      use_fast_coef_costing, cm->features.reduced_tx_set_used);
+    }
+
+    // If the rd cost based on coeff rate alone is already more than best_rd,
+    // terminate early.
+    if (RDCOST(x->rdmult, rate_cost, 0) > best_rd) continue;
+
+    // Calculate distortion.
+    if (eobs_ptr[block] == 0) {
+      // When eob is 0, pixel domain distortion is more efficient and accurate.
+      this_rd_stats.dist = this_rd_stats.sse = block_sse;
+    } else if (use_transform_domain_distortion) {
+      dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+                           &this_rd_stats.sse);
+    } else {
+      int64_t sse_diff = INT64_MAX;
+      // The high_energy threshold assumes that every pixel within a txfm block
+      // has a residue energy of at least 25% of the maximum, i.e. 128 * 128
+      // for 8 bit; the threshold is then scaled based on input bit depth.
+      const int64_t high_energy_thresh =
+          ((int64_t)128 * 128 * tx_size_2d[tx_size]) << ((xd->bd - 8) * 2);
+      const int is_high_energy = (block_sse >= high_energy_thresh);
+      if (tx_size == TX_64X64 || is_high_energy) {
+        // Because 3 out of 4 quadrants of transform coefficients are forced to
+        // zero, the inverse transform has a tendency to overflow. 
sse_diff + // is effectively the energy of those 3 quadrants, here we use it + // to decide if we should do pixel domain distortion. If the energy + // is mostly in first quadrant, then it is unlikely that we have + // overflow issue in inverse transform. + dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist, + &this_rd_stats.sse); + sse_diff = block_sse - this_rd_stats.sse; + } + if (tx_size != TX_64X64 || !is_high_energy || + (sse_diff * 2) < this_rd_stats.sse) { + const int64_t tx_domain_dist = this_rd_stats.dist; + this_rd_stats.dist = dist_block_px_domain( + cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); + // For high energy blocks, occasionally, the pixel domain distortion + // can be artificially low due to clamping at reconstruction stage + // even when inverse transform output is hugely different from the + // actual residue. + if (is_high_energy && this_rd_stats.dist < tx_domain_dist) + this_rd_stats.dist = tx_domain_dist; + } else { + assert(sse_diff < INT64_MAX); + this_rd_stats.dist += sse_diff; + } + this_rd_stats.sse = block_sse; + } + + this_rd_stats.rate = rate_cost; + + const int64_t rd = + RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + + if (rd < best_rd) { + best_rd = rd; + *best_rd_stats = this_rd_stats; + best_tx_type = tx_type; + best_txb_ctx = x->plane[plane].txb_entropy_ctx[block]; + best_eob = x->plane[plane].eobs[block]; + // Swap dqcoeff buffers + tran_low_t *const tmp_dqcoeff = best_dqcoeff; + best_dqcoeff = pd->dqcoeff; + pd->dqcoeff = tmp_dqcoeff; + } + +#if CONFIG_COLLECT_RD_STATS == 1 + if (plane == 0) { + PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col, + plane_bsize, tx_size, tx_type, rd); + } +#endif // CONFIG_COLLECT_RD_STATS == 1 + +#if COLLECT_TX_SIZE_DATA + // Generate small sample to restrict output size. + static unsigned int seed = 21743; + if (lcg_rand16(&seed) % 200 == 0) { + FILE *fp = NULL; + + if (within_border) { + fp = fopen(av1_tx_size_data_output_file, "a"); + } + + if (fp) { + // Transform info and RD + const int txb_w = tx_size_wide[tx_size]; + const int txb_h = tx_size_high[tx_size]; + + // Residue signal. + const int diff_stride = block_size_wide[plane_bsize]; + struct macroblock_plane *const p = &x->plane[plane]; + const int16_t *src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) * 4]; + + for (int r = 0; r < txb_h; ++r) { + for (int c = 0; c < txb_w; ++c) { + fprintf(fp, "%d,", src_diff[c]); + } + src_diff += diff_stride; + } + + fprintf(fp, "%d,%d,%d,%" PRId64, txb_w, txb_h, tx_type, rd); + fprintf(fp, "\n"); + fclose(fp); + } + } +#endif // COLLECT_TX_SIZE_DATA + + // If the current best RD cost is much worse than the reference RD cost, + // terminate early. + if (cpi->sf.tx_sf.adaptive_txb_search_level) { + if ((best_rd - (best_rd >> cpi->sf.tx_sf.adaptive_txb_search_level)) > + ref_best_rd) { + break; + } + } + + // Terminate transform type search if the block has been quantized to + // all zero. + if (cpi->sf.tx_sf.tx_type_search.skip_tx_search && !best_eob) break; + } + + assert(best_rd != INT64_MAX); + + best_rd_stats->skip = best_eob == 0; + if (plane == 0) update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type); + x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx; + x->plane[plane].eobs[block] = best_eob; + + // Point dqcoeff to the quantized coefficients corresponding to the best + // transform type, then we can skip transform and quantization, e.g. in the + // final pixel domain distortion calculation and recon_intra(). 
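+  // Each time a tx_type wins inside the loop above, pd->dqcoeff and
+  // best_dqcoeff are swapped, so best_dqcoeff always aliases the buffer that
+  // holds the winning coefficients. The assignment below makes pd->dqcoeff
+  // point at that buffer; orig_dqcoeff is restored before returning.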
+  pd->dqcoeff = best_dqcoeff;
+
+  if (calc_pixel_domain_distortion_final && best_eob) {
+    best_rd_stats->dist = dist_block_px_domain(
+        cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+    best_rd_stats->sse = block_sse;
+  }
+
+  if (intra_txb_rd_info != NULL) {
+    intra_txb_rd_info->valid = 1;
+    intra_txb_rd_info->entropy_context = cur_joint_ctx;
+    intra_txb_rd_info->rate = best_rd_stats->rate;
+    intra_txb_rd_info->dist = best_rd_stats->dist;
+    intra_txb_rd_info->sse = best_rd_stats->sse;
+    intra_txb_rd_info->eob = best_eob;
+    intra_txb_rd_info->txb_entropy_ctx = best_txb_ctx;
+    intra_txb_rd_info->perform_block_coeff_opt = perform_block_coeff_opt;
+    if (plane == 0) intra_txb_rd_info->tx_type = best_tx_type;
+  }
+
+  // Intra mode needs decoded pixels such that the next transform block
+  // can use them for prediction.
+  recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+              txb_ctx, skip_trellis, best_tx_type, 0, &rate_cost, best_eob);
+  pd->dqcoeff = orig_dqcoeff;
+}
+
+// Pick the transform type for a luma transform block of tx_size. Note that
+// this function is used only for inter-predicted blocks.
+static AOM_INLINE void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x,
+                                  TX_SIZE tx_size, int blk_row, int blk_col,
+                                  int block, int plane_bsize, TXB_CTX *txb_ctx,
+                                  RD_STATS *rd_stats,
+                                  FAST_TX_SEARCH_MODE ftxs_mode,
+                                  int64_t ref_rdcost,
+                                  TXB_RD_INFO *rd_info_array) {
+  const struct macroblock_plane *const p = &x->plane[0];
+  const uint16_t cur_joint_ctx =
+      (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
+  MACROBLOCKD *xd = &x->e_mbd;
+  assert(is_inter_block(xd->mi[0]));
+  const int tx_type_map_idx = blk_row * xd->tx_type_map_stride + blk_col;
+  // Look up the RD cost and terminate early if we have already processed
+  // exactly the same residue with exactly the same entropy context.
+  if (rd_info_array != NULL && rd_info_array->valid &&
+      rd_info_array->entropy_context == cur_joint_ctx) {
+    xd->tx_type_map[tx_type_map_idx] = rd_info_array->tx_type;
+    const TX_TYPE ref_tx_type =
+        av1_get_tx_type(&x->e_mbd, get_plane_type(0), blk_row, blk_col, tx_size,
+                        cpi->common.features.reduced_tx_set_used);
+    if (ref_tx_type == rd_info_array->tx_type) {
+      rd_stats->rate += rd_info_array->rate;
+      rd_stats->dist += rd_info_array->dist;
+      rd_stats->sse += rd_info_array->sse;
+      rd_stats->skip &= rd_info_array->eob == 0;
+      p->eobs[block] = rd_info_array->eob;
+      p->txb_entropy_ctx[block] = rd_info_array->txb_entropy_ctx;
+      return;
+    }
+  }
+
+  RD_STATS this_rd_stats;
+  const int skip_trellis = 0;
+  search_tx_type(cpi, x, 0, block, blk_row, blk_col, plane_bsize, tx_size,
+                 txb_ctx, ftxs_mode, 0, skip_trellis, ref_rdcost,
+                 &this_rd_stats);
+
+  av1_merge_rd_stats(rd_stats, &this_rd_stats);
+
+  // Save the RD results for possible reuse in the future.
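+  // A saved record is only reused later when cur_joint_ctx, i.e.
+  // (dc_sign_ctx << 8) + txb_skip_ctx, matches and the stored tx_type is
+  // still consistent with av1_get_tx_type(); see the lookup above.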
+ if (rd_info_array != NULL) { + rd_info_array->valid = 1; + rd_info_array->entropy_context = cur_joint_ctx; + rd_info_array->rate = this_rd_stats.rate; + rd_info_array->dist = this_rd_stats.dist; + rd_info_array->sse = this_rd_stats.sse; + rd_info_array->eob = p->eobs[block]; + rd_info_array->txb_entropy_ctx = p->txb_entropy_ctx[block]; + rd_info_array->tx_type = xd->tx_type_map[tx_type_map_idx]; + } +} + +static AOM_INLINE void try_tx_block_no_split( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, + const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl, + int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd, + FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node, + TxCandidateInfo *no_split) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblock_plane *const p = &x->plane[0]; + const int bw = mi_size_wide[plane_bsize]; + const ENTROPY_CONTEXT *const pta = ta + blk_col; + const ENTROPY_CONTEXT *const ptl = tl + blk_row; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx); + const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + rd_stats->zero_rate = zero_blk_rate; + const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col); + mbmi->inter_tx_size[index] = tx_size; + tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx, + rd_stats, ftxs_mode, ref_best_rd, + rd_info_node != NULL ? rd_info_node->rd_info_array : NULL); + assert(rd_stats->rate < INT_MAX); + + const int pick_skip = !xd->lossless[mbmi->segment_id] && + (rd_stats->skip == 1 || + RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse)); + if (pick_skip) { +#if CONFIG_RD_DEBUG + update_txb_coeff_cost(rd_stats, 0, tx_size, blk_row, blk_col, + zero_blk_rate - rd_stats->rate); +#endif // CONFIG_RD_DEBUG + rd_stats->rate = zero_blk_rate; + rd_stats->dist = rd_stats->sse; + p->eobs[block] = 0; + update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); + } + rd_stats->skip = pick_skip; + set_blk_skip(x, 0, blk_row * bw + blk_col, pick_skip); + + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) + rd_stats->rate += x->txfm_partition_cost[txfm_partition_ctx][0]; + + no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + no_split->txb_entropy_ctx = p->txb_entropy_ctx[block]; + no_split->tx_type = + xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; +} + +static AOM_INLINE void try_tx_block_split( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd, + FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node, + RD_STATS *split_rd_stats) { + assert(tx_size < TX_SIZES_ALL); + MACROBLOCKD *const xd = &x->e_mbd; + const int max_blocks_high = max_block_high(xd, plane_bsize, 0); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); + const int txb_width = tx_size_wide_unit[tx_size]; + const int txb_height = tx_size_high_unit[tx_size]; + // Transform size after splitting current block. 
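+  // E.g. splitting TX_16X16 gives sub_txs == TX_8X8 and four sub-blocks;
+  // rectangular sizes may split into two instead, hence nblks is computed
+  // from the width/height ratios below.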
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int sub_txb_width = tx_size_wide_unit[sub_txs]; + const int sub_txb_height = tx_size_high_unit[sub_txs]; + const int sub_step = sub_txb_width * sub_txb_height; + const int nblks = (txb_height / sub_txb_height) * (txb_width / sub_txb_width); + assert(nblks > 0); + av1_init_rd_stats(split_rd_stats); + split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1]; + + for (int r = 0, blk_idx = 0; r < txb_height; r += sub_txb_height) { + for (int c = 0; c < txb_width; c += sub_txb_width, ++blk_idx) { + assert(blk_idx < 4); + const int offsetr = blk_row + r; + const int offsetc = blk_col + c; + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + + RD_STATS this_rd_stats; + int this_cost_valid = 1; + select_tx_block( + cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, ta, + tl, tx_above, tx_left, &this_rd_stats, no_split_rd / nblks, + ref_best_rd - split_rd_stats->rdcost, &this_cost_valid, ftxs_mode, + (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL); + if (!this_cost_valid) { + split_rd_stats->rdcost = INT64_MAX; + return; + } + av1_merge_rd_stats(split_rd_stats, &this_rd_stats); + split_rd_stats->rdcost = + RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist); + if (split_rd_stats->rdcost > ref_best_rd) { + split_rd_stats->rdcost = INT64_MAX; + return; + } + block += sub_step; + } + } +} + +// Search for the best transform partition(recursive)/type for a given +// inter-predicted luma block. The obtained transform selection will be saved +// in xd->mi[0], the corresponding RD stats will be saved in rd_stats. +static AOM_INLINE void select_tx_block( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd, + int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode, + TXB_RD_INFO_NODE *rd_info_node) { + assert(tx_size < TX_SIZES_ALL); + av1_init_rd_stats(rd_stats); + if (ref_best_rd < 0) { + *is_cost_valid = 0; + return; + } + + MACROBLOCKD *const xd = &x->e_mbd; + assert(blk_row < max_block_high(xd, plane_bsize, 0) && + blk_col < max_block_wide(xd, plane_bsize, 0)); + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, + mbmi->sb_type, tx_size); + struct macroblock_plane *const p = &x->plane[0]; + + const int try_no_split = + cpi->oxcf.enable_tx64 || txsize_sqr_up_map[tx_size] != TX_64X64; + int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH; + TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES }; + + // Try using current block as a single transform block without split. + if (try_no_split) { + try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth, + plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd, + ftxs_mode, rd_info_node, &no_split); + + // Speed features for early termination. + const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level; + if (search_level) { + if ((no_split.rd - (no_split.rd >> (1 + search_level))) > ref_best_rd) { + *is_cost_valid = 0; + return; + } + if (no_split.rd - (no_split.rd >> (2 + search_level)) > prev_level_rd) { + try_split = 0; + } + } + if (cpi->sf.tx_sf.txb_split_cap) { + if (p->eobs[block] == 0) try_split = 0; + } + } + + // ML based speed feature to skip searching for split transform blocks. 
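+  // ml_predict_tx_split() returns a score clamped to [-80000, 80000]; a
+  // score below -ml_tx_split_thresh is taken as "split unlikely to help",
+  // and the recursive split search below is skipped.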
+ if (x->e_mbd.bd == 8 && try_split && + !(ref_best_rd == INT64_MAX && no_split.rd == INT64_MAX)) { + const int threshold = cpi->sf.tx_sf.tx_type_search.ml_tx_split_thresh; + if (threshold >= 0) { + const int split_score = + ml_predict_tx_split(x, plane_bsize, blk_row, blk_col, tx_size); + if (split_score < -threshold) try_split = 0; + } + } + + RD_STATS split_rd_stats; + split_rd_stats.rdcost = INT64_MAX; + // Try splitting current block into smaller transform blocks. + if (try_split) { + try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth, + plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd, + AOMMIN(no_split.rd, ref_best_rd), ftxs_mode, + rd_info_node, &split_rd_stats); + } + + if (no_split.rd < split_rd_stats.rdcost) { + ENTROPY_CONTEXT *pta = ta + blk_col; + ENTROPY_CONTEXT *ptl = tl + blk_row; + p->txb_entropy_ctx[block] = no_split.txb_entropy_ctx; + av1_set_txb_context(x, 0, block, tx_size, pta, ptl); + txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, + tx_size); + for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) { + for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) { + const int index = + av1_get_txb_size_index(plane_bsize, blk_row + idy, blk_col + idx); + mbmi->inter_tx_size[index] = tx_size; + } + } + mbmi->tx_size = tx_size; + update_txk_array(xd, blk_row, blk_col, tx_size, no_split.tx_type); + const int bw = mi_size_wide[plane_bsize]; + set_blk_skip(x, 0, blk_row * bw + blk_col, rd_stats->skip); + } else { + *rd_stats = split_rd_stats; + if (split_rd_stats.rdcost == INT64_MAX) *is_cost_valid = 0; + } +} + +static AOM_INLINE void choose_largest_tx_size(const AV1_COMP *const cpi, + MACROBLOCK *x, RD_STATS *rd_stats, + int64_t ref_best_rd, + BLOCK_SIZE bs) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + mbmi->tx_size = tx_size_from_tx_mode(bs, x->tx_mode_search_type); + + // If tx64 is not enabled, we need to go down to the next available size + if (!cpi->oxcf.enable_tx64) { + static const TX_SIZE tx_size_max_32[TX_SIZES_ALL] = { + TX_4X4, // 4x4 transform + TX_8X8, // 8x8 transform + TX_16X16, // 16x16 transform + TX_32X32, // 32x32 transform + TX_32X32, // 64x64 transform + TX_4X8, // 4x8 transform + TX_8X4, // 8x4 transform + TX_8X16, // 8x16 transform + TX_16X8, // 16x8 transform + TX_16X32, // 16x32 transform + TX_32X16, // 32x16 transform + TX_32X32, // 32x64 transform + TX_32X32, // 64x32 transform + TX_4X16, // 4x16 transform + TX_16X4, // 16x4 transform + TX_8X32, // 8x32 transform + TX_32X8, // 32x8 transform + TX_16X32, // 16x64 transform + TX_32X16, // 64x16 transform + }; + + mbmi->tx_size = tx_size_max_32[mbmi->tx_size]; + } + + const int skip_ctx = av1_get_skip_context(xd); + const int no_skip_flag_rate = x->skip_cost[skip_ctx][0]; + const int skip_flag_rate = x->skip_cost[skip_ctx][1]; + // Skip RDcost is used only for Inter blocks + const int64_t skip_rd = + is_inter_block(mbmi) ? 
RDCOST(x->rdmult, skip_flag_rate, 0) : INT64_MAX; + const int64_t no_skip_rd = RDCOST(x->rdmult, no_skip_flag_rate, 0); + const int skip_trellis = 0; + av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, + AOMMIN(no_skip_rd, skip_rd), AOM_PLANE_Y, bs, + mbmi->tx_size, cpi->sf.rd_sf.use_fast_coef_costing, + FTXS_NONE, skip_trellis); +} + +static AOM_INLINE void choose_smallest_tx_size(const AV1_COMP *const cpi, + MACROBLOCK *x, + RD_STATS *rd_stats, + int64_t ref_best_rd, + BLOCK_SIZE bs) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + + mbmi->tx_size = TX_4X4; + // TODO(any) : Pass this_rd based on skip/non-skip cost + const int skip_trellis = 0; + av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, 0, bs, mbmi->tx_size, + cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE, + skip_trellis); +} + +// Search for the best uniform transform size and type for current coding block. +static AOM_INLINE void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, + MACROBLOCK *x, + RD_STATS *rd_stats, + int64_t ref_best_rd, + BLOCK_SIZE bs) { + av1_invalid_rd_stats(rd_stats); + + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs]; + const int tx_select = x->tx_mode_search_type == TX_MODE_SELECT; + int start_tx; + // The split depth can be at most MAX_TX_DEPTH, so the init_depth controls + // how many times of splitting is allowed during the RD search. + int init_depth; + + if (tx_select) { + start_tx = max_rect_tx_size; + init_depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs], + is_inter_block(mbmi), &cpi->sf, + x->tx_size_search_method); + } else { + const TX_SIZE chosen_tx_size = + tx_size_from_tx_mode(bs, x->tx_mode_search_type); + start_tx = chosen_tx_size; + init_depth = MAX_TX_DEPTH; + } + + const int skip_trellis = 0; + uint8_t best_txk_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + TX_SIZE best_tx_size = max_rect_tx_size; + int64_t best_rd = INT64_MAX; + const int num_blks = bsize_to_num_blk(bs); + x->rd_model = FULL_TXFM_RD; + int64_t rd[MAX_TX_DEPTH + 1] = { INT64_MAX, INT64_MAX, INT64_MAX }; + for (int tx_size = start_tx, depth = init_depth; depth <= MAX_TX_DEPTH; + depth++, tx_size = sub_tx_size_map[tx_size]) { + if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[tx_size] == TX_64X64) { + continue; + } + + RD_STATS this_rd_stats; + rd[depth] = av1_uniform_txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, + tx_size, FTXS_NONE, skip_trellis); + if (rd[depth] < best_rd) { + av1_copy_array(best_blk_skip, x->blk_skip, num_blks); + av1_copy_array(best_txk_type_map, xd->tx_type_map, num_blks); + best_tx_size = tx_size; + best_rd = rd[depth]; + *rd_stats = this_rd_stats; + } + if (tx_size == TX_4X4) break; + // If we are searching three depths, prune the smallest size depending + // on rd results for the first two depths for low contrast blocks. + if (depth > init_depth && depth != MAX_TX_DEPTH && + x->source_variance < 256) { + if (rd[depth - 1] != INT64_MAX && rd[depth] > rd[depth - 1]) break; + } + } + + if (rd_stats->rate != INT_MAX) { + mbmi->tx_size = best_tx_size; + av1_copy_array(xd->tx_type_map, best_txk_type_map, num_blks); + av1_copy_array(x->blk_skip, best_blk_skip, num_blks); + } +} + +// Search for the best transform type for the given transform block in the +// given plane/channel, and calculate the corresponding RD cost. 
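+// This is the per-block callback passed to
+// av1_foreach_transformed_block_in_plane() by av1_txfm_rd_in_plane(); the
+// accumulated RD state travels in struct rdcost_block_args.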
+static AOM_INLINE void block_rd_txfm(int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct rdcost_block_args *args = arg; + if (args->exit_early) { + args->incomplete_exit = 1; + return; + } + + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + const int is_inter = is_inter_block(xd->mi[0]); + const AV1_COMP *cpi = args->cpi; + ENTROPY_CONTEXT *a = args->t_above + blk_col; + ENTROPY_CONTEXT *l = args->t_left + blk_row; + const AV1_COMMON *cm = &cpi->common; + RD_STATS this_rd_stats; + av1_init_rd_stats(&this_rd_stats); + + if (!is_inter) { + av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); + } + + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + search_tx_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + &txb_ctx, args->ftxs_mode, args->use_fast_coef_costing, + args->skip_trellis, args->best_rd - args->current_rd, + &this_rd_stats); + + if (plane == AOM_PLANE_Y && xd->cfl.store_y) { + assert(!is_inter || plane_bsize < BLOCK_8X8); + cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize); + } + +#if CONFIG_RD_DEBUG + update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col, + this_rd_stats.rate); +#endif // CONFIG_RD_DEBUG + av1_set_txb_context(x, plane, block, tx_size, a, l); + + const int blk_idx = + blk_row * (block_size_wide[plane_bsize] >> MI_SIZE_LOG2) + blk_col; + if (plane == 0) + set_blk_skip(x, plane, blk_idx, x->plane[plane].eobs[block] == 0); + else + set_blk_skip(x, plane, blk_idx, 0); + + int64_t rd; + if (is_inter) { + const int64_t no_skip_rd = + RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + const int64_t skip_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse); + rd = AOMMIN(no_skip_rd, skip_rd); + this_rd_stats.skip &= !x->plane[plane].eobs[block]; + } else { + // Signal non-skip for Intra blocks + rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + this_rd_stats.skip = 0; + } + + av1_merge_rd_stats(&args->rd_stats, &this_rd_stats); + + args->current_rd += rd; + if (args->current_rd > args->best_rd) args->exit_early = 1; +} + +// Search for the best transform type and return the transform coefficients RD +// cost of current luma coding block with the given uniform transform size. +int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int64_t ref_best_rd, + BLOCK_SIZE bs, TX_SIZE tx_size, + FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) { + assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs))); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + const int tx_select = x->tx_mode_search_type == TX_MODE_SELECT && + block_signals_txsize(mbmi->sb_type); + int tx_size_rate = 0; + if (tx_select) { + const int ctx = txfm_partition_context( + xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size); + tx_size_rate = is_inter ? x->txfm_partition_cost[ctx][0] + : tx_size_cost(x, bs, tx_size); + } + const int skip_ctx = av1_get_skip_context(xd); + const int no_skip_flag_rate = x->skip_cost[skip_ctx][0]; + const int skip_flag_rate = x->skip_cost[skip_ctx][1]; + const int64_t skip_rd = + is_inter ? 
RDCOST(x->rdmult, skip_flag_rate, 0) : INT64_MAX; + const int64_t no_this_rd = + RDCOST(x->rdmult, no_skip_flag_rate + tx_size_rate, 0); + + mbmi->tx_size = tx_size; + av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, + AOMMIN(no_this_rd, skip_rd), AOM_PLANE_Y, bs, tx_size, + cpi->sf.rd_sf.use_fast_coef_costing, ftxs_mode, + skip_trellis); + if (rd_stats->rate == INT_MAX) return INT64_MAX; + + int64_t rd; + // rdstats->rate should include all the rate except skip/non-skip cost as the + // same is accounted in the caller functions after rd evaluation of all + // planes. However the decisions should be done after considering the + // skip/non-skip header cost + if (rd_stats->skip && is_inter) { + rd = RDCOST(x->rdmult, skip_flag_rate, rd_stats->sse); + } else { + // Intra blocks are always signalled as non-skip + rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_rate + tx_size_rate, + rd_stats->dist); + rd_stats->rate += tx_size_rate; + } + // Check if forcing the block to skip transform leads to smaller RD cost. + if (is_inter && !rd_stats->skip && !xd->lossless[mbmi->segment_id]) { + int64_t temp_skip_rd = RDCOST(x->rdmult, skip_flag_rate, rd_stats->sse); + if (temp_skip_rd <= rd) { + rd = temp_skip_rd; + rd_stats->rate = 0; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + } + } + + return rd; +} + +// Search for the best transform type for a luma inter-predicted block, given +// the transform block partitions. +// This function is used only when some speed features are enabled. +static AOM_INLINE void tx_block_yrd( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int depth, + ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx, + TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, int64_t ref_best_rd, + RD_STATS *rd_stats, FAST_TX_SEARCH_MODE ftxs_mode) { + assert(tx_size < TX_SIZES_ALL); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(is_inter_block(mbmi)); + const int max_blocks_high = max_block_high(xd, plane_bsize, 0); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index( + plane_bsize, blk_row, blk_col)]; + const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, + mbmi->sb_type, tx_size); + + av1_init_rd_stats(rd_stats); + if (tx_size == plane_tx_size) { + ENTROPY_CONTEXT *ta = above_ctx + blk_col; + ENTROPY_CONTEXT *tl = left_ctx + blk_row; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx); + + const int zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(0)] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + rd_stats->zero_rate = zero_blk_rate; + tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx, + rd_stats, ftxs_mode, ref_best_rd, NULL); + const int mi_width = mi_size_wide[plane_bsize]; + if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || + rd_stats->skip == 1) { + rd_stats->rate = zero_blk_rate; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + set_blk_skip(x, 0, blk_row * mi_width + blk_col, 1); + x->plane[0].eobs[block] = 0; + x->plane[0].txb_entropy_ctx[block] = 0; + update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); + } else { + rd_stats->skip = 0; + set_blk_skip(x, 0, blk_row * mi_width + blk_col, 
0);
+    }
+    if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+      rd_stats->rate += x->txfm_partition_cost[ctx][0];
+    av1_set_txb_context(x, 0, block, tx_size, ta, tl);
+    txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
+                          tx_size);
+  } else {
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int txb_width = tx_size_wide_unit[sub_txs];
+    const int txb_height = tx_size_high_unit[sub_txs];
+    const int step = txb_height * txb_width;
+    RD_STATS pn_rd_stats;
+    int64_t this_rd = 0;
+    assert(txb_width > 0 && txb_height > 0);
+
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += txb_height) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += txb_width) {
+        const int offsetr = blk_row + row;
+        const int offsetc = blk_col + col;
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+        av1_init_rd_stats(&pn_rd_stats);
+        tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize,
+                     depth + 1, above_ctx, left_ctx, tx_above, tx_left,
+                     ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode);
+        if (pn_rd_stats.rate == INT_MAX) {
+          av1_invalid_rd_stats(rd_stats);
+          return;
+        }
+        av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+        this_rd += RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist);
+        block += step;
+      }
+    }
+
+    if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+      rd_stats->rate += x->txfm_partition_cost[ctx][1];
+  }
+}
+
+// Search for tx type with tx sizes already decided for an inter-predicted luma
+// partition block. It's used only when some speed features are enabled.
+// Return value 0: early termination triggered, no valid rd cost available;
+//              1: rd cost values are valid.
+static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                           RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                           int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) {
+  if (ref_best_rd < 0) {
+    av1_invalid_rd_stats(rd_stats);
+    return 0;
+  }
+
+  av1_init_rd_stats(rd_stats);
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[0];
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+  const int step = bw * bh;
+  const int init_depth = get_search_init_depth(mi_width, mi_height, 1, &cpi->sf,
+                                               x->tx_size_search_method);
+  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+  TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+  TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+  av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+  memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+  memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+
+  int64_t this_rd = 0;
+  for (int idy = 0, block = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
+      RD_STATS pn_rd_stats;
+      av1_init_rd_stats(&pn_rd_stats);
+      tx_block_yrd(cpi, x, idy, idx, block, max_tx_size, bsize, init_depth,
+                   ctxa, ctxl, tx_above, tx_left, ref_best_rd - this_rd,
+                   &pn_rd_stats, ftxs_mode);
+      if (pn_rd_stats.rate == INT_MAX) {
+        av1_invalid_rd_stats(rd_stats);
+        return 0;
+      }
+      av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+      this_rd +=
+          AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
+                 RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse));
+      block += step;
+    }
+  }
+
+  const int skip_ctx = av1_get_skip_context(xd);
+  const int no_skip_flag_rate = x->skip_cost[skip_ctx][0];
+  const int 
skip_flag_rate = x->skip_cost[skip_ctx][1]; + const int64_t skip_rd = RDCOST(x->rdmult, skip_flag_rate, rd_stats->sse); + this_rd = + RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_rate, rd_stats->dist); + if (skip_rd < this_rd) { + this_rd = skip_rd; + rd_stats->rate = 0; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + } + + const int is_cost_valid = this_rd > ref_best_rd; + if (!is_cost_valid) { + // reset cost value + av1_invalid_rd_stats(rd_stats); + } + return is_cost_valid; +} + +// Search for the best transform size and type for current inter-predicted +// luma block with recursive transform block partitioning. The obtained +// transform selection will be saved in xd->mi[0], the corresponding RD stats +// will be saved in rd_stats. The returned value is the corresponding RD cost. +static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd, + TXB_RD_INFO_NODE *rd_info_tree) { + MACROBLOCKD *const xd = &x->e_mbd; + assert(is_inter_block(xd->mi[0])); + assert(bsize < BLOCK_SIZES_ALL); + const int fast_tx_search = x->tx_size_search_method > USE_FULL_RD; + int64_t rd_thresh = ref_best_rd; + if (fast_tx_search && rd_thresh < INT64_MAX) { + if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3); + } + assert(rd_thresh > 0); + const FAST_TX_SEARCH_MODE ftxs_mode = + fast_tx_search ? FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE; + const struct macroblockd_plane *const pd = &xd->plane[0]; + assert(bsize < BLOCK_SIZES_ALL); + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_above[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_left[MAX_MIB_SIZE]; + av1_get_entropy_contexts(bsize, pd, ctxa, ctxl); + memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width); + memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height); + const int init_depth = get_search_init_depth(mi_width, mi_height, 1, &cpi->sf, + x->tx_size_search_method); + const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + const int step = bw * bh; + const int skip_ctx = av1_get_skip_context(xd); + const int no_skip_flag_cost = x->skip_cost[skip_ctx][0]; + const int skip_flag_cost = x->skip_cost[skip_ctx][1]; + int64_t skip_rd = RDCOST(x->rdmult, skip_flag_cost, 0); + int64_t no_skip_rd = RDCOST(x->rdmult, no_skip_flag_cost, 0); + int block = 0; + + av1_init_rd_stats(rd_stats); + for (int idy = 0; idy < max_block_high(xd, bsize, 0); idy += bh) { + for (int idx = 0; idx < max_block_wide(xd, bsize, 0); idx += bw) { + const int64_t best_rd_sofar = + (rd_thresh == INT64_MAX) + ? INT64_MAX + : (rd_thresh - (AOMMIN(skip_rd, no_skip_rd))); + int is_cost_valid = 1; + RD_STATS pn_rd_stats; + // Search for the best transform block size and type for the sub-block. 
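+      // best_rd_sofar computed above leaves headroom for the cheaper of the
+      // two skip-flag headers, i.e. rd_thresh - min(skip_rd, no_skip_rd).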
+ select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, bsize, + ctxa, ctxl, tx_above, tx_left, &pn_rd_stats, INT64_MAX, + best_rd_sofar, &is_cost_valid, ftxs_mode, rd_info_tree); + if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + return INT64_MAX; + } + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + skip_rd = RDCOST(x->rdmult, skip_flag_cost, rd_stats->sse); + no_skip_rd = + RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_cost, rd_stats->dist); + block += step; + if (rd_info_tree != NULL) rd_info_tree += 1; + } + } + + if (rd_stats->rate == INT_MAX) return INT64_MAX; + + rd_stats->skip = (skip_rd <= no_skip_rd); + + // If fast_tx_search is true, only DCT and 1D DCT were tested in + // select_inter_block_yrd() above. Do a better search for tx type with + // tx sizes already decided. + if (fast_tx_search && cpi->sf.tx_sf.refine_fast_tx_search_results) { + if (!inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, FTXS_NONE)) + return INT64_MAX; + } + + int64_t final_rd; + if (rd_stats->skip) { + final_rd = RDCOST(x->rdmult, skip_flag_cost, rd_stats->sse); + } else { + final_rd = + RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_cost, rd_stats->dist); + if (!xd->lossless[xd->mi[0]->segment_id]) { + final_rd = + AOMMIN(final_rd, RDCOST(x->rdmult, skip_flag_cost, rd_stats->sse)); + } + } + + return final_rd; +} + +// Return 1 to terminate transform search early. The decision is made based on +// the comparison with the reference RD cost and the model-estimated RD cost. +static AOM_INLINE int model_based_tx_search_prune(const AV1_COMP *cpi, + MACROBLOCK *x, + BLOCK_SIZE bsize, + int64_t ref_best_rd) { + const int level = cpi->sf.tx_sf.model_based_prune_tx_search_level; + assert(level >= 0 && level <= 2); + int model_rate; + int64_t model_dist; + int model_skip; + MACROBLOCKD *const xd = &x->e_mbd; + model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE]( + cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist, &model_skip, NULL, + NULL, NULL, NULL); + if (model_skip) return 0; + const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist); + // TODO(debargha, urvang): Improve the model and make the check below + // tighter. + static const int prune_factor_by8[] = { 3, 5 }; + const int factor = prune_factor_by8[level - 1]; + return ((model_rd * factor) >> 3) > ref_best_rd; +} + +// Search for best transform size and type for luma inter blocks. The transform +// block partitioning can be recursive resulting in non-uniform transform sizes. +// The best transform size and type, if found, will be saved in the MB_MODE_INFO +// structure, and the corresponding RD stats will be saved in rd_stats. +void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + assert(is_inter_block(xd->mi[0])); + + av1_invalid_rd_stats(rd_stats); + + // If modeled RD cost is a lot worse than the best so far, terminate early. + if (cpi->sf.tx_sf.model_based_prune_tx_search_level && + ref_best_rd != INT64_MAX) { + if (model_based_tx_search_prune(cpi, x, bsize, ref_best_rd)) return; + } + + // Hashing based speed feature. If the hash of the prediction residue block is + // found in the hash table, use previous search results and terminate early. 
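+  // Sketch of the reuse path, using the helpers called below:
+  //   hash = get_block_residue_hash(x, bsize);
+  //   match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
+  //   if (match_index != -1) fetch_tx_rd_info(...) and return early.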
+ uint32_t hash = 0; + MB_RD_RECORD *mb_rd_record = NULL; + const int mi_row = x->e_mbd.mi_row; + const int mi_col = x->e_mbd.mi_col; + const int within_border = + mi_row >= xd->tile.mi_row_start && + (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) && + mi_col >= xd->tile.mi_col_start && + (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end); + const int is_mb_rd_hash_enabled = + (within_border && cpi->sf.rd_sf.use_mb_rd_hash); + const int n4 = bsize_to_num_blk(bsize); + if (is_mb_rd_hash_enabled) { + hash = get_block_residue_hash(x, bsize); + mb_rd_record = &x->mb_rd_record; + const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash); + if (match_index != -1) { + MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index]; + fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x); + return; + } + } + + // If we predict that skip is the optimal RD decision - set the respective + // context and terminate early. + int64_t dist; + if (x->predict_skip_level && + predict_skip_flag(x, bsize, &dist, + cpi->common.features.reduced_tx_set_used)) { + set_skip_flag(x, rd_stats, bsize, dist); + // Save the RD search results into tx_rd_record. + if (is_mb_rd_hash_enabled) + save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record); + return; + } +#if CONFIG_SPEED_STATS + ++x->tx_search_count; +#endif // CONFIG_SPEED_STATS + + // Pre-compute residue hashes (transform block level) and find existing or + // add new RD records to store and reuse rate and distortion values to speed + // up TX size/type search. + TXB_RD_INFO_NODE matched_rd_info[4 + 16 + 64]; + int found_rd_info = 0; + if (ref_best_rd != INT64_MAX && within_border && + cpi->sf.tx_sf.use_inter_txb_hash) { + found_rd_info = find_tx_size_rd_records(x, bsize, matched_rd_info); + } + + const int64_t rd = + select_tx_size_and_type(cpi, x, rd_stats, bsize, ref_best_rd, + found_rd_info ? matched_rd_info : NULL); + + if (rd == INT64_MAX) { + // We should always find at least one candidate unless ref_best_rd is less + // than INT64_MAX (in which case, all the calls to select_tx_size_fix_type + // might have failed to find something better) + assert(ref_best_rd != INT64_MAX); + av1_invalid_rd_stats(rd_stats); + return; + } + + // Save the RD search results into tx_rd_record. + if (is_mb_rd_hash_enabled) { + assert(mb_rd_record != NULL); + save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record); + } +} + +// Search for the best transform size and type for current coding block, with +// the assumption that all the transform blocks have a uniform size (VP9 style). +// The selected transform size and type will be saved in the MB_MODE_INFO +// structure; the corresponding RD stats will be saved in rd_stats. +// This function may be used for both intra and inter predicted blocks. +void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bs, + int64_t ref_best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(bs == mbmi->sb_type); + const int is_inter = is_inter_block(mbmi); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + av1_init_rd_stats(rd_stats); + + // Hashing based speed feature for inter blocks. If the hash of the residue + // block is found in the table, use previously saved search results and + // terminate early. 
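+  // As in the recursive search above, reuse is attempted only for blocks
+  // lying fully inside the current tile (the within_border check below).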
+ uint32_t hash = 0; + MB_RD_RECORD *mb_rd_record = NULL; + const int num_blks = bsize_to_num_blk(bs); + if (is_inter && cpi->sf.rd_sf.use_mb_rd_hash) { + const int within_border = + mi_row >= xd->tile.mi_row_start && + (mi_row + mi_size_high[bs] < xd->tile.mi_row_end) && + mi_col >= xd->tile.mi_col_start && + (mi_col + mi_size_wide[bs] < xd->tile.mi_col_end); + if (within_border) { + hash = get_block_residue_hash(x, bs); + mb_rd_record = &x->mb_rd_record; + const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash); + if (match_index != -1) { + MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index]; + fetch_tx_rd_info(num_blks, tx_rd_info, rd_stats, x); + return; + } + } + } + + // If we predict that skip is the optimal RD decision - set the respective + // context and terminate early. + int64_t dist; + if (x->predict_skip_level && is_inter && !xd->lossless[mbmi->segment_id] && + predict_skip_flag(x, bs, &dist, + cpi->common.features.reduced_tx_set_used)) { + // Populate rdstats as per skip decision + set_skip_flag(x, rd_stats, bs, dist); + // Save the RD search results into tx_rd_record. + if (mb_rd_record) { + save_tx_rd_info(num_blks, hash, x, rd_stats, mb_rd_record); + } + return; + } + + if (xd->lossless[mbmi->segment_id]) { + // Lossless mode can only pick the smallest (4x4) transform size. + choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); + } else if (x->tx_size_search_method == USE_LARGESTALL) { + choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); + } else { + choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs); + } + + // Save the RD search results into tx_rd_record for possible reuse in future. + if (mb_rd_record) { + save_tx_rd_info(num_blks, hash, x, rd_stats, mb_rd_record); + } +} + +// Calculate the transform coefficient RD cost for the given chroma coding block +// Return value 0: early termination triggered, no valid rd cost available; +// 1: rd cost values are valid. 
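+// Both chroma planes (U and V) are evaluated with the same transform size,
+// av1_get_tx_size(AOM_PLANE_U, xd), and their costs are accumulated into
+// rd_stats.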
+int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, + BLOCK_SIZE bsize, int64_t ref_best_rd) { + av1_init_rd_stats(rd_stats); + if (ref_best_rd < 0) return 0; + if (!x->e_mbd.is_chroma_ref) return 1; + + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U]; + const int is_inter = is_inter_block(mbmi); + int64_t this_rd = 0, skip_rd = 0; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + + if (is_inter) { + for (int plane = 1; plane < MAX_MB_PLANE; ++plane) + av1_subtract_plane(x, plane_bsize, plane); + } + + const int skip_trellis = 0; + const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); + int is_cost_valid = 1; + for (int plane = 1; plane < MAX_MB_PLANE; ++plane) { + RD_STATS this_rd_stats; + int64_t chroma_ref_best_rd = ref_best_rd; + // For inter blocks, refined ref_best_rd is used for early exit + // For intra blocks, even though current rd crosses ref_best_rd, early + // exit is not recommended as current rd is used for gating subsequent + // modes as well (say, for angular modes) + // TODO(any): Extend the early exit mechanism for intra modes as well + if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && is_inter && + chroma_ref_best_rd != INT64_MAX) + chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_rd); + av1_txfm_rd_in_plane(x, cpi, &this_rd_stats, chroma_ref_best_rd, 0, plane, + plane_bsize, uv_tx_size, + cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE, + skip_trellis); + if (this_rd_stats.rate == INT_MAX) { + is_cost_valid = 0; + break; + } + av1_merge_rd_stats(rd_stats, &this_rd_stats); + this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse); + if (AOMMIN(this_rd, skip_rd) > ref_best_rd) { + is_cost_valid = 0; + break; + } + } + + if (!is_cost_valid) { + // reset cost value + av1_invalid_rd_stats(rd_stats); + } + + return is_cost_valid; +} + +// Search for the best transform type and calculate the transform coefficients +// RD cost of the current coding block with the specified (uniform) transform +// size and channel. The RD results will be saved in rd_stats. +void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, + RD_STATS *rd_stats, int64_t ref_best_rd, + int64_t current_rd, int plane, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, int use_fast_coef_costing, + FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) { + assert(IMPLIES(plane == 0, x->e_mbd.mi[0]->tx_size == tx_size)); + + if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[tx_size] == TX_64X64) { + av1_invalid_rd_stats(rd_stats); + return; + } + + if (current_rd > ref_best_rd) { + av1_invalid_rd_stats(rd_stats); + return; + } + + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + struct rdcost_block_args args; + av1_zero(args); + args.x = x; + args.cpi = cpi; + args.best_rd = ref_best_rd; + args.current_rd = current_rd; + args.use_fast_coef_costing = use_fast_coef_costing; + args.ftxs_mode = ftxs_mode; + args.skip_trellis = skip_trellis; + av1_init_rd_stats(&args.rd_stats); + + av1_get_entropy_contexts(plane_bsize, pd, args.t_above, args.t_left); + av1_foreach_transformed_block_in_plane(xd, plane_bsize, plane, block_rd_txfm, + &args); + + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + const int invalid_rd = is_inter ? 
args.incomplete_exit : args.exit_early; + + if (invalid_rd) { + av1_invalid_rd_stats(rd_stats); + } else { + *rd_stats = args.rd_stats; + } +} + +// This function combines y and uv planes' transform search processes together +// for inter-predicted blocks (including IntraBC), when the prediction is +// already generated. It first does subtraction to obtain the prediction error. +// Then it calls +// av1_pick_recursive_tx_size_type_yrd/av1_pick_uniform_tx_size_type_yrd and +// av1_txfm_uvrd sequentially and handles the early terminations +// happening in those functions. At the end, it computes the +// rd_stats/_y/_uv accordingly. +int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + const int skip_ctx = av1_get_skip_context(xd); + const int skip_flag_cost[2] = { x->skip_cost[skip_ctx][0], + x->skip_cost[skip_ctx][1] }; + const int64_t min_header_rate = + mode_rate + AOMMIN(skip_flag_cost[0], skip_flag_cost[1]); + // Account for minimum skip and non_skip rd. + // Eventually either one of them will be added to mode_rate + const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0); + if (min_header_rd_possible > ref_best_rd) { + av1_invalid_rd_stats(rd_stats_y); + return 0; + } + + const AV1_COMMON *cm = &cpi->common; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0); + const int64_t rd_thresh = + ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd; + av1_init_rd_stats(rd_stats); + av1_init_rd_stats(rd_stats_y); + rd_stats->rate = mode_rate; + + // cost and distortion + av1_subtract_plane(x, bsize, 0); + if (x->tx_mode_search_type == TX_MODE_SELECT && + !xd->lossless[mbmi->segment_id]) { + av1_pick_recursive_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh); +#if CONFIG_COLLECT_RD_STATS == 2 + PrintPredictionUnitStats(cpi, tile_data, x, rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 2 + } else { + av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh); + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + for (int i = 0; i < xd->height * xd->width; ++i) + set_blk_skip(x, 0, i, rd_stats_y->skip); + } + + if (rd_stats_y->rate == INT_MAX) return 0; + + av1_merge_rd_stats(rd_stats, rd_stats_y); + + const int64_t non_skip_rdcosty = + RDCOST(x->rdmult, rd_stats->rate + skip_flag_cost[0], rd_stats->dist); + const int64_t skip_rdcosty = + RDCOST(x->rdmult, mode_rate + skip_flag_cost[1], rd_stats->sse); + const int64_t min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty); + if (min_rdcosty > ref_best_rd) { + const int64_t tokenonly_rdy = + AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist), + RDCOST(x->rdmult, 0, rd_stats_y->sse)); + // Invalidate rd_stats_y to skip the rest of the motion modes search + if (tokenonly_rdy - + (tokenonly_rdy >> cpi->sf.inter_sf.prune_motion_mode_level) > + rd_thresh) { + av1_invalid_rd_stats(rd_stats_y); + } + return 0; + } + + av1_init_rd_stats(rd_stats_uv); + const int num_planes = av1_num_planes(cm); + if (num_planes > 1) { + int64_t ref_best_chroma_rd = ref_best_rd; + // Calculate best rd cost possible for chroma + if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && + (ref_best_chroma_rd != INT64_MAX)) { + ref_best_chroma_rd = + (ref_best_chroma_rd - AOMMIN(non_skip_rdcosty, skip_rdcosty)); + } + const int is_cost_valid_uv = + av1_txfm_uvrd(cpi, x, 
rd_stats_uv, bsize, ref_best_chroma_rd); + if (!is_cost_valid_uv) return 0; + av1_merge_rd_stats(rd_stats, rd_stats_uv); + } + + int choose_skip = rd_stats->skip; + if (!choose_skip && !xd->lossless[mbmi->segment_id]) { + const int64_t rdcost_no_skip = RDCOST( + x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + skip_flag_cost[0], + rd_stats->dist); + const int64_t rdcost_skip = + RDCOST(x->rdmult, skip_flag_cost[1], rd_stats->sse); + if (rdcost_no_skip >= rdcost_skip) choose_skip = 1; + } + if (choose_skip) { + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + rd_stats->rate = mode_rate + skip_flag_cost[1]; + rd_stats->dist = rd_stats->sse; + rd_stats_y->dist = rd_stats_y->sse; + rd_stats_uv->dist = rd_stats_uv->sse; + mbmi->skip = 1; + if (rd_stats->skip) { + const int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (tmprd > ref_best_rd) return 0; + } + } else { + rd_stats->rate += skip_flag_cost[0]; + mbmi->skip = 0; + } + + return 1; +} diff --git a/libs/libaom/src/av1/encoder/tx_search.h b/libs/libaom/src/av1/encoder/tx_search.h new file mode 100644 index 000000000..82d56719d --- /dev/null +++ b/libs/libaom/src/av1/encoder/tx_search.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_ +#define AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_ + +#include "av1/common/pred_common.h" +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Set this macro as 1 to collect data about tx size selection. 
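+// (When enabled, the collected statistics are written to the file named by
+// av1_tx_size_data_output_file below.)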
+#define COLLECT_TX_SIZE_DATA 0 + +#if COLLECT_TX_SIZE_DATA +static const char av1_tx_size_data_output_file[] = "tx_size_data.txt"; +#endif + +enum { + FTXS_NONE = 0, + FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0, + FTXS_DISABLE_TRELLIS_OPT = 1 << 1, + FTXS_USE_TRANSFORM_DOMAIN = 1 << 2 +} UENUM1BYTE(FAST_TX_SEARCH_MODE); + +static AOM_INLINE int tx_size_cost(const MACROBLOCK *const x, BLOCK_SIZE bsize, + TX_SIZE tx_size) { + assert(bsize == x->e_mbd.mi[0]->sb_type); + if (x->tx_mode_search_type != TX_MODE_SELECT || !block_signals_txsize(bsize)) + return 0; + + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int depth = tx_size_to_depth(tx_size, bsize); + const MACROBLOCKD *const xd = &x->e_mbd; + const int tx_size_ctx = get_tx_size_context(xd); + return x->tx_size_cost[tx_size_cat][tx_size_ctx][depth]; +} + +int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int64_t ref_best_rd, + BLOCK_SIZE bs, TX_SIZE tx_size, + FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis); + +void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd); + +void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bs, + int64_t ref_best_rd); + +int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, + BLOCK_SIZE bsize, int64_t ref_best_rd); + +void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, + RD_STATS *rd_stats, int64_t ref_best_rd, + int64_t this_rd, int plane, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, int use_fast_coef_costing, + FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis); + +int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_ diff --git a/libs/libaom/src/av1/encoder/use_flat_gop_model_params.h b/libs/libaom/src/av1/encoder/use_flat_gop_model_params.h new file mode 100644 index 000000000..cf0776644 --- /dev/null +++ b/libs/libaom/src/av1/encoder/use_flat_gop_model_params.h @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_USE_FLAT_GOP_MODEL_PARAMS_H_ +#define AOM_AV1_ENCODER_USE_FLAT_GOP_MODEL_PARAMS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +// A binary classifier that returns true (score > 0) if it is better to use a +// flat GOP structure, rather than a GOP structure that uses ALT-REFs and +// internal ARFs. 
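+//
+// For reference, a sketch of how the tables below yield a score, assuming the
+// standard fully-connected forward pass implemented by av1_nn_predict() in
+// av1/encoder/ml.c (weights stored as weights[node * num_inputs + input],
+// ReLU on the hidden layer, no activation on the output layer):
+//
+//   hidden[n] = max(0.f, biases_layer0[n] +
+//                   sum_f weights_layer0[n * NUM_FEATURES + f] * feature[f])
+//   score = biases_layer1[0] + sum_n weights_layer1[n] * hidden[n]
+//
+// A flat GOP structure is preferred when score > 0.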
+ +#define NUM_FEATURES 21 +#define NUM_HIDDEN_LAYERS 1 +#define NUM_HIDDEN_NODES_LAYER0 48 +#define NUM_LABELS 1 + +static const float + av1_use_flat_gop_nn_weights_layer0[NUM_FEATURES * + NUM_HIDDEN_NODES_LAYER0] = { + 0.3801f, -2.1832f, 1.7469f, 2.0130f, 2.1264f, -0.7293f, -0.2814f, + 0.0692f, -4.6589f, -1.4591f, 0.3023f, -0.4310f, -0.1911f, -0.8284f, + -1.3322f, -0.4621f, -0.1148f, -0.3531f, -0.0794f, -0.3114f, -0.1664f, + -0.1615f, 0.2913f, -0.0394f, -0.0620f, 0.1845f, 0.0204f, -0.2124f, + -0.1233f, -0.1685f, 0.1215f, -0.2372f, -0.2865f, -0.1976f, 0.2137f, + -0.1318f, -0.0324f, 0.0415f, -0.1172f, 0.1077f, -0.1135f, -0.2462f, + -0.0743f, -0.1584f, -0.3267f, -0.0566f, -0.1615f, -0.3931f, -0.5200f, + -0.1786f, -0.1811f, -0.2812f, -0.1986f, -0.4393f, -0.3941f, -0.2500f, + -0.2029f, -0.4605f, -0.4973f, -0.2238f, -0.2599f, -0.1951f, -0.2034f, + -0.3186f, -0.1368f, -0.5076f, -0.4718f, -0.1815f, -0.3338f, -0.0550f, + -0.3920f, -0.5328f, -0.1658f, -0.2194f, -0.2867f, -0.0916f, -0.1678f, + -0.1760f, -0.5055f, -0.2322f, -0.4668f, -0.0121f, -0.3903f, -0.2721f, + -0.1306f, 0.1199f, 0.2894f, 0.1098f, -0.0155f, -0.0844f, 0.0421f, + -0.2364f, -0.1073f, -0.0878f, -0.2146f, -0.1713f, -0.2283f, 0.0342f, + 0.0394f, -0.2808f, -0.0048f, 0.2640f, -0.1371f, 0.1709f, 0.0155f, + -0.3614f, -0.1843f, -0.3215f, -0.3121f, -0.2609f, -0.0254f, -0.2474f, + -0.4674f, -0.3674f, -0.2076f, 0.0149f, -0.3304f, -0.2678f, -0.0465f, + -0.1326f, -0.4504f, -0.5101f, -0.1280f, -0.0416f, -0.4296f, -0.4568f, + -0.6762f, -2.8105f, 0.7249f, 1.4288f, 1.3731f, 0.3034f, 0.1841f, + -0.0912f, -0.1508f, 1.2637f, -0.2009f, 0.3236f, -0.2500f, -0.0736f, + 0.8655f, -0.2599f, 0.1150f, -0.0368f, -0.1122f, -0.7650f, -0.2004f, + -0.0891f, -0.3832f, -0.2576f, -0.3532f, -0.1735f, -0.4018f, -0.0265f, + -0.2988f, 0.2555f, -0.1041f, -0.3391f, -0.5316f, -0.0171f, -0.3232f, + -0.0565f, -0.3359f, -0.1842f, -0.0582f, 0.0073f, -0.0278f, -0.5517f, + 0.0892f, -0.1354f, 0.0548f, -0.0401f, -0.1697f, 0.0432f, 0.0832f, + -0.3538f, 0.2602f, -0.0066f, -0.2130f, -0.3085f, 0.0025f, 0.2464f, + -0.0103f, -0.3082f, -0.1136f, -0.2359f, -0.3421f, 0.1335f, -0.3016f, + -1.0355f, -1.0572f, -0.3316f, -0.1235f, -0.3730f, -0.1751f, -0.1921f, + 0.0031f, -0.6297f, -0.5179f, 0.1082f, -0.3130f, -0.1120f, -0.5430f, + -0.1782f, 0.0534f, -0.1052f, 0.1471f, -0.7156f, -0.5453f, -0.5437f, + 1.8709f, 1.9696f, -1.0343f, -0.3150f, -0.8399f, -0.0052f, -0.1123f, + -0.1059f, 0.6755f, 1.2593f, -0.2512f, -0.2053f, 0.0835f, 0.3261f, + -0.0172f, 0.1230f, -0.3687f, 0.1993f, 0.9390f, -0.0165f, 0.6856f, + -0.4372f, -0.4041f, -0.2869f, -0.3871f, -0.3587f, -0.2418f, 0.0518f, + 0.0110f, -1.4713f, -0.1307f, -0.3246f, -0.5091f, -0.4652f, -0.4288f, + -0.0763f, -0.1755f, 0.0662f, -0.3026f, -0.4462f, -0.4123f, -0.2891f, + -0.2251f, -0.4925f, -0.3820f, -0.1840f, -0.2878f, -0.1973f, -0.1010f, + -0.1622f, -0.3108f, -0.5292f, -0.1017f, -0.0607f, -0.2426f, -0.6406f, + -0.3834f, -0.2313f, -0.2433f, -0.1773f, -0.1581f, -0.3295f, -0.3799f, + -0.4447f, -0.2389f, -0.4231f, -0.1498f, -0.0181f, -0.4429f, -0.3515f, + 0.0425f, -0.5280f, -0.3462f, -0.3659f, 0.0153f, -0.1002f, -0.5057f, + -0.2134f, -0.2859f, -0.1988f, -0.4758f, 0.0967f, -0.4784f, 0.1868f, + -0.4387f, -1.3376f, -0.4452f, 0.3837f, 0.1698f, -0.7076f, -0.4320f, + 0.0382f, -1.8053f, -0.6589f, 0.1406f, -0.4340f, 0.0641f, -0.2558f, + -0.4496f, -0.5003f, -0.6241f, -0.2217f, -0.8312f, -0.6793f, -0.3563f, + 0.5153f, -0.7851f, 1.0570f, 0.9702f, 0.5238f, -0.6932f, -0.4443f, + 0.0407f, -3.0961f, -0.8461f, 0.0562f, -0.0642f, 0.2471f, -0.5911f, + -0.7715f, -0.1574f, 
-0.0375f, -0.1951f, -0.3097f, -0.2040f, 0.0128f, + -0.0918f, -0.0698f, -0.0970f, -0.2946f, -0.1723f, -0.2569f, -0.4382f, + -0.5174f, -0.2058f, -0.2973f, -0.0858f, -0.2526f, -0.2648f, -0.2339f, + -0.3474f, 0.0607f, 0.0272f, -0.3142f, -0.1306f, -0.4938f, -0.1894f, + -0.0551f, -0.1061f, -0.1613f, -0.1942f, 0.0590f, -0.2009f, -0.1286f, + -0.2035f, -0.0393f, -0.0650f, -0.1110f, 0.0123f, -0.1122f, -0.0246f, + -0.2042f, 0.0411f, -0.2771f, -0.0189f, 0.0927f, 0.0286f, -0.1559f, + -0.3217f, -0.1039f, 0.1471f, 0.2489f, 0.2085f, -0.4199f, -0.2404f, + 0.0358f, -0.7567f, -0.2413f, -0.3437f, -0.2433f, -0.3687f, -0.1194f, + -0.4289f, -0.1138f, -0.0721f, -0.3461f, -0.0244f, -0.3530f, -0.2842f, + -0.3823f, -0.1238f, -0.5475f, -0.2688f, -0.0073f, 0.0491f, -0.4500f, + 0.0201f, 0.0303f, -0.2160f, -0.4219f, -0.4831f, -0.4593f, -0.2304f, + -0.2082f, -0.0367f, -0.5226f, -0.0082f, -0.1867f, -0.1812f, -0.2753f, + 2.6650f, 1.9698f, -2.9425f, 1.2119f, 1.5000f, 0.3356f, 0.3905f, + -0.2006f, -1.4038f, -1.0917f, 0.1423f, -0.3528f, 0.0888f, 0.5802f, + 1.0977f, 0.1083f, -0.0693f, -0.0784f, 0.4247f, 0.4108f, 0.4970f, + -0.7290f, -0.1659f, -0.0517f, 0.0776f, -0.0550f, -0.2374f, -0.4245f, + -0.0165f, -0.6804f, -0.3211f, -0.3101f, -0.1883f, -0.0786f, -0.3971f, + -0.4130f, -0.0606f, 0.1432f, -0.0518f, -0.4179f, -0.4949f, -0.3451f, + -0.7559f, -4.0792f, 1.5526f, 0.2824f, 0.6086f, -0.2148f, 0.0959f, + 0.0506f, -5.5176f, -3.9702f, 0.1597f, -0.1760f, -0.0627f, 0.1657f, + -1.2996f, -0.2899f, -0.0600f, -0.0531f, -1.5160f, -0.4837f, -1.6961f, + -0.1134f, -0.1838f, -0.3071f, -0.4215f, -0.4184f, 0.0192f, -0.2128f, + -0.3094f, -0.2607f, -0.4855f, -0.1881f, 0.0258f, -0.5085f, -0.3630f, + -0.4824f, -0.3762f, -0.3324f, -0.1134f, -0.3350f, 0.0217f, -0.2803f, + -0.5669f, -0.5674f, -0.5441f, -0.5965f, -0.3062f, -0.4666f, -0.4079f, + -0.0065f, -0.7566f, -0.3437f, -0.2474f, -0.2360f, -0.5683f, -0.3853f, + -0.6670f, -0.4158f, -0.2831f, -0.3327f, -0.7419f, -0.6481f, -0.4004f, + -0.4025f, -0.6405f, -0.4265f, -0.0167f, 0.3195f, -0.0822f, -0.4350f, + -0.0032f, -1.0448f, -0.4407f, 0.0488f, 0.0776f, -0.3828f, -0.3380f, + -0.2983f, -0.2220f, -0.4105f, -0.2312f, -0.4166f, -0.3258f, -0.1424f, + -0.6588f, -0.9433f, 0.3402f, 0.5800f, 0.6368f, -0.4298f, -0.5743f, + 0.0822f, -1.0843f, -0.1645f, -0.1990f, 0.0255f, -0.1039f, -0.3673f, + 0.4367f, -0.5491f, -0.0932f, -0.0323f, -0.2405f, -0.2922f, -0.4019f, + -0.4936f, -1.2338f, 0.4681f, 0.7454f, 0.8181f, -0.3680f, -0.1613f, + -0.0008f, -1.3326f, -0.0667f, 0.1569f, -0.0978f, -0.3229f, -0.4222f, + 0.0330f, 0.1064f, -0.1325f, 0.0121f, -0.3976f, -0.2254f, -0.3942f, + -0.4771f, -0.1887f, 0.1020f, 0.3331f, 0.3098f, -0.1256f, -0.4736f, + 0.0295f, -0.3919f, -0.0931f, -0.2484f, -0.4629f, -0.2800f, -0.2851f, + -0.2243f, -0.3958f, -0.3053f, -0.6585f, -0.1159f, -0.2330f, -0.1989f, + 0.2273f, 0.1963f, 0.0283f, 0.0198f, -0.1298f, -0.0627f, -0.2753f, + -0.1552f, 0.2734f, -0.0551f, -0.2927f, -0.3772f, -0.4522f, -0.0786f, + 0.0079f, 0.1664f, -0.0228f, -0.2908f, -0.1714f, 0.1223f, -0.0680f, + -0.5048f, -0.0852f, -0.4653f, -0.5142f, -0.1818f, -0.1659f, 0.0678f, + -0.1296f, 0.0295f, -0.3487f, -0.1224f, -0.2690f, -0.3217f, -0.1957f, + -0.3196f, -0.4530f, -0.1746f, -0.2307f, -0.0504f, -0.0131f, -0.4613f, + -0.1476f, -0.5596f, -0.3829f, -0.4302f, -0.2910f, -0.2182f, -0.0811f, + -0.3967f, -0.3912f, -0.0371f, -0.1109f, -0.0793f, -0.2063f, -0.0060f, + -0.0236f, -0.4098f, -0.0276f, -0.3352f, -0.1888f, -0.2439f, -0.3748f, + 0.0371f, 0.8460f, -0.5547f, -1.2680f, -1.1623f, -0.1740f, -0.4815f, + -0.0294f, 4.4764f, 0.3716f, -0.2826f, -0.0549f, 
-0.2937f, 0.0632f, + 0.0686f, -0.4681f, -0.2555f, -0.2427f, -0.2261f, -0.1567f, -0.5199f, + -0.4079f, -0.0801f, -0.2075f, -0.3956f, -0.0307f, -0.3150f, -0.3490f, + -0.0379f, 0.3060f, -0.1775f, -0.1651f, 0.0677f, -0.1947f, 0.0032f, + -0.2014f, -0.1575f, -0.1289f, -0.0250f, -0.0762f, -0.2324f, -0.2895f, + -0.4531f, -0.4601f, -0.1718f, -0.3139f, -0.4350f, 0.0346f, -0.0891f, + -0.1581f, 0.2123f, -0.1074f, 0.0221f, 0.0951f, 0.1161f, 0.0245f, + -0.0701f, -0.1677f, -0.4170f, -0.2214f, -0.3419f, -0.4873f, -0.0701f, + -0.0613f, -0.1031f, 0.0141f, -0.1299f, -0.3953f, -0.2182f, -0.2679f, + -0.0141f, 0.3392f, -0.0722f, -0.2390f, 0.1638f, -0.1596f, -0.1527f, + -0.3581f, -0.4037f, -0.0736f, 0.0397f, -0.1288f, -0.1362f, -0.0249f, + -0.5099f, -0.4040f, -0.1893f, -0.0298f, -0.1332f, -0.1693f, -0.3301f, + -0.1058f, -0.1414f, -0.5737f, -0.2342f, -0.2560f, -0.3834f, -0.0917f, + -0.1334f, -0.5077f, -0.3666f, -0.2515f, -0.4824f, -0.4714f, -0.5723f, + -0.1361f, -0.5244f, -0.2468f, 0.0237f, -0.1862f, -0.3124f, -0.0183f, + -0.4662f, -0.4444f, -0.5400f, -0.1730f, -0.0123f, -0.2134f, -0.1024f, + -0.0172f, -0.4430f, -0.1403f, -0.0751f, -0.2403f, -0.2100f, -0.0678f, + 2.4232f, 1.9825f, 0.1260f, 1.9972f, 2.8061f, 0.3916f, 0.1842f, + -0.2603f, -1.6092f, -1.6037f, 0.1475f, 0.0516f, -0.2593f, 0.0359f, + -0.1802f, 0.0159f, -0.0529f, -0.0983f, 0.7638f, 0.5529f, 0.9662f, + -0.4049f, -0.6372f, 0.4907f, 0.7360f, 0.9271f, -0.6879f, -0.1067f, + 0.0323f, -1.8447f, 0.2176f, -0.1047f, -0.0048f, -0.1031f, -0.7931f, + -0.3059f, -0.4595f, -0.1287f, -0.4031f, 0.1441f, -0.6651f, 0.2530f, + -0.4572f, -0.0614f, 0.0345f, -0.0008f, 0.0333f, -0.3431f, 0.0538f, + -0.2691f, 0.2930f, -0.0820f, -0.0979f, -0.0307f, 0.1713f, 0.0783f, + -0.4337f, -0.2702f, -0.1677f, -0.1719f, -0.4669f, -0.2847f, -0.4495f, + -0.3692f, -0.2641f, -0.2833f, -0.1168f, -0.0523f, -0.2368f, -0.4922f, + -0.3453f, -0.4452f, -0.5212f, 0.0412f, -0.3310f, -0.2656f, -0.4903f, + -0.3854f, -0.1009f, -0.1038f, -0.2350f, -0.4430f, -0.5097f, -0.1755f, + 0.0110f, -0.0712f, -0.0662f, -0.4493f, -0.2111f, -0.3402f, -0.3100f, + -0.2525f, -0.1856f, -0.2689f, -0.4288f, -0.3912f, -0.0754f, -0.5191f, + -0.0747f, -0.0626f, -0.4821f, -0.2014f, -0.3124f, -0.4858f, -0.1896f, + 1.0673f, -0.8529f, 13.7564f, 18.7299f, 19.0062f, -1.1047f, -0.8654f, + 0.1089f, -1.2958f, -0.7793f, 0.0780f, -0.1679f, 0.0054f, -1.2451f, + -0.1287f, 0.0082f, -0.2960f, -0.0442f, 2.3817f, 0.4716f, 1.3862f, + -0.0782f, -0.1871f, -0.2596f, 0.0093f, 0.1451f, -0.1124f, -0.2315f, + -0.2677f, -0.1086f, 0.2216f, 0.2928f, 0.0391f, 0.0372f, -0.2551f, + 0.0552f, -0.1876f, -0.2361f, -0.1889f, -0.0279f, 0.1204f, 0.2016f, + -0.5787f, -0.5830f, 0.0530f, -0.1452f, -0.4899f, -0.2937f, 0.1430f, + -0.2752f, -0.2320f, -0.1908f, -0.5538f, -0.0858f, -0.1378f, -0.1505f, + -0.3908f, -0.4732f, -0.3018f, 0.0244f, -0.2392f, -0.2833f, -0.3997f, + -0.4495f, -0.2570f, -0.3189f, -0.1534f, -0.1040f, -0.5497f, -0.3524f, + -0.2053f, 0.2415f, -0.5027f, 0.0288f, -0.1904f, -0.2183f, -0.1062f, + -0.3560f, 0.0165f, -0.4601f, -0.2144f, -0.0439f, -0.4913f, -0.3160f, + -0.1641f, 0.1010f, -0.1044f, -0.4064f, -0.3580f, -0.4015f, 0.1010f, + -0.1973f, 0.6392f, -0.5177f, -0.0472f, -0.1526f, 0.1533f, -0.0819f, + -0.0252f, -0.0783f, 0.1301f, 0.0158f, -0.2003f, -0.4700f, -0.2329f, + }; + +static const float + av1_use_flat_gop_nn_biases_layer0[NUM_HIDDEN_NODES_LAYER0] = { + -1.113218f, 0.f, -0.268537f, -0.268537f, 0.f, -0.268534f, + -0.40681f, -0.268537f, -0.061835f, -0.614956f, 0.984277f, -0.280228f, + -0.354716f, -0.202312f, -0.772829f, -0.464005f, -0.230795f, 0.f, + 
-0.124187f, -0.265949f, 0.325168f, -0.359008f, -2.455546f, -0.229222f, + -0.692233f, -0.29401f, -0.632682f, -0.479061f, -0.166094f, 0.077291f, + -0.235293f, -0.268537f, 0.167899f, -0.141991f, -0.210089f, -0.177294f, + -0.325401f, -0.268537f, 0.323627f, -0.156593f, -0.218451f, -0.230792f, + -0.268537f, 0.833177f, 0.f, -0.353177f, -0.260953f, -0.209537f, + }; + +static const float + av1_use_flat_gop_nn_weights_layer1[NUM_HIDDEN_NODES_LAYER0 * NUM_LABELS] = { + -0.024695f, 0.146668f, -0.02723f, 0.034577f, -0.255426f, 0.22402f, + -0.112595f, -0.131262f, 0.091164f, -0.045294f, 0.028304f, -0.051683f, + 0.310497f, -0.077786f, -0.047873f, -0.057205f, -0.065119f, 0.227417f, + -0.051126f, -0.137241f, 0.035742f, -0.058992f, -0.021466f, 0.107947f, + -0.077183f, -0.04144f, 0.003568f, -0.027656f, 0.038196f, 0.19684f, + -0.128401f, 0.149629f, 0.024526f, 0.037376f, 0.090752f, -0.061666f, + -0.15743f, 0.057773f, -0.010582f, 0.120997f, 0.060368f, 0.210028f, + -0.192244f, -0.064764f, -0.237655f, 0.1852f, -0.084281f, -0.010434f, + }; + +static const float av1_use_flat_gop_nn_biases_layer1[NUM_LABELS] = { + -0.672434f, +}; + +static const NN_CONFIG av1_use_flat_gop_nn_config = { + NUM_FEATURES, + NUM_LABELS, + NUM_HIDDEN_LAYERS, + { + NUM_HIDDEN_NODES_LAYER0, + }, + { + av1_use_flat_gop_nn_weights_layer0, + av1_use_flat_gop_nn_weights_layer1, + }, + { + av1_use_flat_gop_nn_biases_layer0, + av1_use_flat_gop_nn_biases_layer1, + }, +}; + +#undef NUM_FEATURES +#undef NUM_HIDDEN_LAYERS +#undef NUM_HIDDEN_NODES_LAYER0 +#undef NUM_LABELS + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_USE_FLAT_GOP_MODEL_PARAMS_H_ diff --git a/libs/libaom/src/av1/encoder/var_based_part.c b/libs/libaom/src/av1/encoder/var_based_part.c new file mode 100644 index 000000000..e3cb1fa8f --- /dev/null +++ b/libs/libaom/src/av1/encoder/var_based_part.c @@ -0,0 +1,1006 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/reconinter_enc.h"
+
+extern const uint8_t AV1_VAR_OFFS[];
+
+typedef struct {
+  VPVariance *part_variances;
+  VPartVar *split[4];
+} variance_node;
+
+static AOM_INLINE void tree_to_node(void *data, BLOCK_SIZE bsize,
+                                    variance_node *node) {
+  int i;
+  node->part_variances = NULL;
+  switch (bsize) {
+    case BLOCK_128X128: {
+      VP128x128 *vt = (VP128x128 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_64X64: {
+      VP64x64 *vt = (VP64x64 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_32X32: {
+      VP32x32 *vt = (VP32x32 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_16X16: {
+      VP16x16 *vt = (VP16x16 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_8X8: {
+      VP8x8 *vt = (VP8x8 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    default: {
+      VP4x4 *vt = (VP4x4 *)data;
+      assert(bsize == BLOCK_4X4);
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++) node->split[i] = &vt->split[i];
+      break;
+    }
+  }
+}
+
+// Set variance values given sum square error, sum error, count.
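+// For reference, get_variance() below turns these raw sums into
+//   variance = 256 * (sum_square_error - sum_error^2 / 2^log2_count)
+//              / 2^log2_count
+// i.e. the population variance of the 2^log2_count accumulated samples,
+// scaled by 256 to retain precision in integer arithmetic.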
+static AOM_INLINE void fill_variance(uint32_t s2, int32_t s, int c, + VPartVar *v) { + v->sum_square_error = s2; + v->sum_error = s; + v->log2_count = c; +} + +static AOM_INLINE void get_variance(VPartVar *v) { + v->variance = + (int)(256 * (v->sum_square_error - + (uint32_t)(((int64_t)v->sum_error * v->sum_error) >> + v->log2_count)) >> + v->log2_count); +} + +static AOM_INLINE void sum_2_variances(const VPartVar *a, const VPartVar *b, + VPartVar *r) { + assert(a->log2_count == b->log2_count); + fill_variance(a->sum_square_error + b->sum_square_error, + a->sum_error + b->sum_error, a->log2_count + 1, r); +} + +static AOM_INLINE void fill_variance_tree(void *data, BLOCK_SIZE bsize) { + variance_node node; + memset(&node, 0, sizeof(node)); + tree_to_node(data, bsize, &node); + sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]); + sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]); + sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]); + sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]); + sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1], + &node.part_variances->none); +} + +static AOM_INLINE void set_block_size(AV1_COMP *const cpi, MACROBLOCK *const x, + MACROBLOCKD *const xd, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + if (cpi->common.mi_params.mi_cols > mi_col && + cpi->common.mi_params.mi_rows > mi_row) { + set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd, + mi_row, mi_col); + xd->mi[0]->sb_type = bsize; + } +} + +static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x, + MACROBLOCKD *const xd, + const TileInfo *const tile, void *data, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int64_t threshold, BLOCK_SIZE bsize_min, + int force_split) { + AV1_COMMON *const cm = &cpi->common; + variance_node vt; + const int block_width = mi_size_wide[bsize]; + const int block_height = mi_size_high[bsize]; + + assert(block_height == block_width); + tree_to_node(data, bsize, &vt); + + if (force_split == 1) return 0; + + // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if + // variance is below threshold, otherwise split will be selected. + // No check for vert/horiz split as too few samples for variance. + if (bsize == bsize_min) { + // Variance already computed to set the force_split. + if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); + if (mi_col + block_width <= tile->mi_col_end && + mi_row + block_height <= tile->mi_row_end && + vt.part_variances->none.variance < threshold) { + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); + return 1; + } + return 0; + } else if (bsize > bsize_min) { + // Variance already computed to set the force_split. + if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); + // For key frame: take split for bsize above 32X32 or very high variance. + if (frame_is_intra_only(cm) && + (bsize > BLOCK_32X32 || + vt.part_variances->none.variance > (threshold << 4))) { + return 0; + } + // If variance is low, take the bsize (no split). + if (mi_col + block_width <= tile->mi_col_end && + mi_row + block_height <= tile->mi_row_end && + vt.part_variances->none.variance < threshold) { + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); + return 1; + } + // Check vertical split. 
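+    // Reached only when the no-split test above failed: PARTITION_VERT is
+    // taken if both halves fall below the variance threshold and the subsize
+    // maps to a valid chroma block size.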
+ if (mi_row + block_height <= tile->mi_row_end && + mi_col + block_width / 2 <= tile->mi_col_end) { + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT); + get_variance(&vt.part_variances->vert[0]); + get_variance(&vt.part_variances->vert[1]); + if (vt.part_variances->vert[0].variance < threshold && + vt.part_variances->vert[1].variance < threshold && + get_plane_block_size(subsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y) < BLOCK_INVALID) { + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize); + return 1; + } + } + // Check horizontal split. + if (mi_col + block_width <= tile->mi_col_end && + mi_row + block_height / 2 <= tile->mi_row_end) { + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ); + get_variance(&vt.part_variances->horz[0]); + get_variance(&vt.part_variances->horz[1]); + if (vt.part_variances->horz[0].variance < threshold && + vt.part_variances->horz[1].variance < threshold && + get_plane_block_size(subsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y) < BLOCK_INVALID) { + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize); + return 1; + } + } + return 0; + } + return 0; +} + +static AOM_INLINE void fill_variance_8x8avg(const uint8_t *s, int sp, + const uint8_t *d, int dp, + int x16_idx, int y16_idx, + VP16x16 *vst, +#if CONFIG_AV1_HIGHBITDEPTH + int highbd_flag, +#endif + int pixels_wide, int pixels_high, + int is_key_frame) { + int k; + for (k = 0; k < 4; k++) { + int x8_idx = x16_idx + ((k & 1) << 3); + int y8_idx = y16_idx + ((k >> 1) << 3); + unsigned int sse = 0; + int sum = 0; + if (x8_idx < pixels_wide && y8_idx < pixels_high) { + int s_avg; + int d_avg = 128; +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { + s_avg = aom_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp); + if (!is_key_frame) + d_avg = aom_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp); + } else { + s_avg = aom_avg_8x8(s + y8_idx * sp + x8_idx, sp); + if (!is_key_frame) d_avg = aom_avg_8x8(d + y8_idx * dp + x8_idx, dp); + } +#else + s_avg = aom_avg_8x8(s + y8_idx * sp + x8_idx, sp); + if (!is_key_frame) d_avg = aom_avg_8x8(d + y8_idx * dp + x8_idx, dp); +#endif + sum = s_avg - d_avg; + sse = sum * sum; + } + fill_variance(sse, sum, 0, &vst->split[k].part_variances.none); + } +} + +static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d, + int dp, int x16_idx, int y16_idx, +#if CONFIG_AV1_HIGHBITDEPTH + int highbd_flag, +#endif + int pixels_wide, int pixels_high) { + int k; + int minmax_max = 0; + int minmax_min = 255; + // Loop over the 4 8x8 subblocks. 
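+  // For each in-bounds subblock, track the dynamic range (max - min) of the
+  // source/prediction difference; the return value is the spread between the
+  // largest and smallest of these subblock ranges.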
+ for (k = 0; k < 4; k++) { + int x8_idx = x16_idx + ((k & 1) << 3); + int y8_idx = y16_idx + ((k >> 1) << 3); + int min = 0; + int max = 0; + if (x8_idx < pixels_wide && y8_idx < pixels_high) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { + aom_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp, + d + y8_idx * dp + x8_idx, dp, &min, &max); + } else { + aom_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, + dp, &min, &max); + } +#else + aom_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, dp, + &min, &max); +#endif + if ((max - min) > minmax_max) minmax_max = (max - min); + if ((max - min) < minmax_min) minmax_min = (max - min); + } + } + return (minmax_max - minmax_min); +} + +static AOM_INLINE void fill_variance_4x4avg(const uint8_t *s, int sp, + const uint8_t *d, int dp, + int x8_idx, int y8_idx, VP8x8 *vst, +#if CONFIG_AV1_HIGHBITDEPTH + int highbd_flag, +#endif + int pixels_wide, int pixels_high, + int is_key_frame) { + int k; + for (k = 0; k < 4; k++) { + int x4_idx = x8_idx + ((k & 1) << 2); + int y4_idx = y8_idx + ((k >> 1) << 2); + unsigned int sse = 0; + int sum = 0; + if (x4_idx < pixels_wide && y4_idx < pixels_high) { + int s_avg; + int d_avg = 128; +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { + s_avg = aom_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp); + if (!is_key_frame) + d_avg = aom_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp); + } else { + s_avg = aom_avg_4x4(s + y4_idx * sp + x4_idx, sp); + if (!is_key_frame) d_avg = aom_avg_4x4(d + y4_idx * dp + x4_idx, dp); + } +#else + s_avg = aom_avg_4x4(s + y4_idx * sp + x4_idx, sp); + if (!is_key_frame) d_avg = aom_avg_4x4(d + y4_idx * dp + x4_idx, dp); +#endif + + sum = s_avg - d_avg; + sse = sum * sum; + } + fill_variance(sse, sum, 0, &vst->split[k].part_variances.none); + } +} + +// TODO(kyslov) Bring back threshold adjustment based on content state +static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, + int width, int height, + int content_state) { + (void)width; + (void)height; + (void)content_state; + if (speed >= 8) { + return (5 * threshold_base) >> 2; + } + return threshold_base; +} + +// Set the variance split thresholds for following the block sizes: +// 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32, +// 3 - vbp_threshold_16x16. 4 - vbp_threshold_8x8 (to split to 4x4 partition) is +// currently only used on key frame. +static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], + int q, int content_state) { + AV1_COMMON *const cm = &cpi->common; + const int is_key_frame = frame_is_intra_only(cm); + const int threshold_multiplier = is_key_frame ? 40 : 1; + int64_t threshold_base = + (int64_t)(threshold_multiplier * + cpi->enc_quant_dequant_params.dequants.y_dequant_QTX[q][1]); + + if (is_key_frame) { + thresholds[0] = threshold_base; + thresholds[1] = threshold_base; + thresholds[2] = threshold_base >> 2; + thresholds[3] = threshold_base >> 2; + thresholds[4] = threshold_base << 2; + } else { + // Increase base variance threshold based on content_state/sum_diff level. 
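+    // (scale_part_thresh_sumdiff() above currently scales the base threshold
+    // by 5/4 at speed >= 8 and leaves it unchanged otherwise.)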
+ threshold_base = scale_part_thresh_sumdiff( + threshold_base, cpi->oxcf.speed, cm->width, cm->height, content_state); + + thresholds[0] = threshold_base >> 1; + thresholds[1] = threshold_base; + thresholds[3] = threshold_base << cpi->oxcf.speed; + if (cm->width >= 1280 && cm->height >= 720) + thresholds[3] = thresholds[3] << 1; + if (cm->width * cm->height <= 352 * 288) { + int last_qindex = cpi->rc.last_q[INTER_FRAME]; + if (last_qindex >= QINDEX_HIGH_THR) { + threshold_base = (5 * threshold_base) >> 1; + thresholds[1] = threshold_base >> 3; + thresholds[2] = threshold_base << 2; + thresholds[3] = threshold_base << 5; + } else if (last_qindex < QINDEX_LOW_THR) { + thresholds[1] = threshold_base >> 3; + thresholds[2] = threshold_base >> 1; + thresholds[3] = threshold_base << 3; + } else { + int64_t qi_diff_low = last_qindex - QINDEX_LOW_THR; + int64_t qi_diff_high = QINDEX_HIGH_THR - last_qindex; + int64_t threshold_diff = QINDEX_HIGH_THR - QINDEX_LOW_THR; + int64_t threshold_base_high = (5 * threshold_base) >> 1; + + threshold_diff = threshold_diff > 0 ? threshold_diff : 1; + threshold_base = (qi_diff_low * threshold_base_high + + qi_diff_high * threshold_base) / + threshold_diff; + thresholds[1] = threshold_base >> 3; + thresholds[2] = ((qi_diff_low * threshold_base) + + qi_diff_high * (threshold_base >> 1)) / + threshold_diff; + thresholds[3] = ((qi_diff_low * (threshold_base << 5)) + + qi_diff_high * (threshold_base << 3)) / + threshold_diff; + } + } else if (cm->width < 1280 && cm->height < 720) { + thresholds[2] = (5 * threshold_base) >> 2; + } else if (cm->width < 1920 && cm->height < 1080) { + thresholds[2] = threshold_base << 1; + } else { + thresholds[2] = (5 * threshold_base) >> 1; + } + } +} + +// Set temporal variance low flag for superblock 64x64. +// Only first 25 in the array are used in this case. +static AOM_INLINE void set_low_temp_var_flag_64x64( + CommonModeInfoParams *mi_params, MACROBLOCK *x, MACROBLOCKD *xd, + VP64x64 *vt, const int64_t thresholds[], int mi_col, int mi_row) { + if (xd->mi[0]->sb_type == BLOCK_64X64) { + if ((vt->part_variances).none.variance < (thresholds[0] >> 1)) + x->variance_low[0] = 1; + } else if (xd->mi[0]->sb_type == BLOCK_64X32) { + for (int i = 0; i < 2; i++) { + if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2)) + x->variance_low[i + 1] = 1; + } + } else if (xd->mi[0]->sb_type == BLOCK_32X64) { + for (int i = 0; i < 2; i++) { + if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2)) + x->variance_low[i + 3] = 1; + } + } else { + static const int idx[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } }; + for (int i = 0; i < 4; i++) { + const int idx_str = + mi_params->mi_stride * (mi_row + idx[i][0]) + mi_col + idx[i][1]; + MB_MODE_INFO **this_mi = mi_params->mi_grid_base + idx_str; + + if (mi_params->mi_cols <= mi_col + idx[i][1] || + mi_params->mi_rows <= mi_row + idx[i][0]) + continue; + + if (*this_mi == NULL) continue; + + if ((*this_mi)->sb_type == BLOCK_32X32) { + int64_t threshold_32x32 = (5 * thresholds[1]) >> 3; + if (vt->split[i].part_variances.none.variance < threshold_32x32) + x->variance_low[i + 5] = 1; + } else { + // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block + // inside. 
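+        // (One low-variance flag per 16x16 sub-block, gated on the much
+        // stricter cutoff thresholds[2] >> 8.)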
+ if ((*this_mi)->sb_type == BLOCK_16X16 || + (*this_mi)->sb_type == BLOCK_32X16 || + (*this_mi)->sb_type == BLOCK_16X32) { + for (int j = 0; j < 4; j++) { + if (vt->split[i].split[j].part_variances.none.variance < + (thresholds[2] >> 8)) + x->variance_low[(i << 2) + j + 9] = 1; + } + } + } + } + } +} + +static AOM_INLINE void set_low_temp_var_flag_128x128( + CommonModeInfoParams *mi_params, MACROBLOCK *x, MACROBLOCKD *xd, + VP128x128 *vt, const int64_t thresholds[], int mi_col, int mi_row) { + if (xd->mi[0]->sb_type == BLOCK_128X128) { + if (vt->part_variances.none.variance < (thresholds[0] >> 1)) + x->variance_low[0] = 1; + } else if (xd->mi[0]->sb_type == BLOCK_128X64) { + for (int i = 0; i < 2; i++) { + if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2)) + x->variance_low[i + 1] = 1; + } + } else if (xd->mi[0]->sb_type == BLOCK_64X128) { + for (int i = 0; i < 2; i++) { + if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2)) + x->variance_low[i + 3] = 1; + } + } else { + static const int idx64[4][2] = { + { 0, 0 }, { 0, 16 }, { 16, 0 }, { 16, 16 } + }; + static const int idx32[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } }; + for (int i = 0; i < 4; i++) { + const int idx_str = + mi_params->mi_stride * (mi_row + idx64[i][0]) + mi_col + idx64[i][1]; + MB_MODE_INFO **mi_64 = mi_params->mi_grid_base + idx_str; + if (*mi_64 == NULL) continue; + if (mi_params->mi_cols <= mi_col + idx64[i][1] || + mi_params->mi_rows <= mi_row + idx64[i][0]) + continue; + const int64_t threshold_64x64 = (5 * thresholds[1]) >> 3; + if ((*mi_64)->sb_type == BLOCK_64X64) { + if (vt->split[i].part_variances.none.variance < threshold_64x64) + x->variance_low[5 + i] = 1; + } else if ((*mi_64)->sb_type == BLOCK_64X32) { + for (int j = 0; j < 2; j++) + if (vt->split[i].part_variances.horz[j].variance < + (threshold_64x64 >> 1)) + x->variance_low[9 + (i << 1) + j] = 1; + } else if ((*mi_64)->sb_type == BLOCK_32X64) { + for (int j = 0; j < 2; j++) + if (vt->split[i].part_variances.vert[j].variance < + (threshold_64x64 >> 1)) + x->variance_low[17 + (i << 1) + j] = 1; + } else { + for (int k = 0; k < 4; k++) { + const int idx_str1 = mi_params->mi_stride * idx32[k][0] + idx32[k][1]; + MB_MODE_INFO **mi_32 = mi_params->mi_grid_base + idx_str + idx_str1; + if (*mi_32 == NULL) continue; + + if (mi_params->mi_cols <= mi_col + idx64[i][1] + idx32[k][1] || + mi_params->mi_rows <= mi_row + idx64[i][0] + idx32[k][0]) + continue; + const int64_t threshold_32x32 = (5 * thresholds[2]) >> 3; + if ((*mi_32)->sb_type == BLOCK_32X32) { + if (vt->split[i].split[k].part_variances.none.variance < + threshold_32x32) + x->variance_low[25 + (i << 2) + k] = 1; + } else { + // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block + // inside. + if ((*mi_32)->sb_type == BLOCK_16X16 || + (*mi_32)->sb_type == BLOCK_32X16 || + (*mi_32)->sb_type == BLOCK_16X32) { + for (int j = 0; j < 4; j++) { + if (vt->split[i] + .split[k] + .split[j] + .part_variances.none.variance < (thresholds[3] >> 8)) + x->variance_low[41 + (i << 4) + (k << 2) + j] = 1; + } + } + } + } + } + } + } +} + +static AOM_INLINE void set_low_temp_var_flag( + AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, VP128x128 *vt, + int64_t thresholds[], MV_REFERENCE_FRAME ref_frame_partition, int mi_col, + int mi_row) { + AV1_COMMON *const cm = &cpi->common; + const int mv_thr = cm->width > 640 ? 8 : 4; + // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected and + // int_pro mv is small. 
If the temporal variance is small set the flag + // variance_low for the block. The variance threshold can be adjusted, the + // higher the more aggressive. + if (ref_frame_partition == LAST_FRAME && + (cpi->sf.rt_sf.short_circuit_low_temp_var == 1 || + (cpi->sf.rt_sf.estimate_motion_for_var_based_partition && + xd->mi[0]->mv[0].as_mv.col < mv_thr && + xd->mi[0]->mv[0].as_mv.col > -mv_thr && + xd->mi[0]->mv[0].as_mv.row < mv_thr && + xd->mi[0]->mv[0].as_mv.row > -mv_thr))) { + const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64); + if (is_small_sb) + set_low_temp_var_flag_64x64(&cm->mi_params, x, xd, &(vt->split[0]), + thresholds, mi_col, mi_row); + else + set_low_temp_var_flag_128x128(&cm->mi_params, x, xd, vt, thresholds, + mi_col, mi_row); + } +} + +void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q, + int content_state) { + SPEED_FEATURES *const sf = &cpi->sf; + if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION) { + return; + } else { + set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, q, content_state); + // The threshold below is not changed locally. + cpi->vbp_info.threshold_minmax = 15 + (q >> 3); + } +} + +static AOM_INLINE void chroma_check(AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, unsigned int y_sad, + int is_key_frame) { + int i; + MACROBLOCKD *xd = &x->e_mbd; + + if (is_key_frame || cpi->oxcf.monochrome) return; + + for (i = 1; i <= 2; ++i) { + unsigned int uv_sad = UINT_MAX; + struct macroblock_plane *p = &x->plane[i]; + struct macroblockd_plane *pd = &xd->plane[i]; + const BLOCK_SIZE bs = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + + if (bs != BLOCK_INVALID) + uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride); + + x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2); + } +} + +// This function chooses partitioning based on the variance between source and +// reconstructed last, where variance is computed for down-sampled inputs. +// TODO(kyslov): lot of things. Bring back noise estimation, brush up partition +// selection and most of all - retune the thresholds +int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile, + ThreadData *td, MACROBLOCK *x, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const int64_t *const vbp_thresholds = cpi->vbp_info.thresholds; + + int i, j, k, m; + VP128x128 *vt; + VP16x16 *vt2 = NULL; + unsigned char force_split[85]; + int avg_32x32; + int max_var_32x32[4]; + int min_var_32x32[4]; + int var_32x32; + int var_64x64; + int min_var_64x64 = INT_MAX; + int max_var_64x64 = 0; + int avg_16x16[4][4]; + int maxvar_16x16[4][4]; + int minvar_16x16[4][4]; + int64_t threshold_4x4avg; + int content_state = 0; + uint8_t *s; + const uint8_t *d; + int sp; + int dp; + // TODO(kyslov) Bring back compute_minmax_variance with content type detection + int compute_minmax_variance = 0; + int is_key_frame = frame_is_intra_only(cm); + int pixels_wide = 128, pixels_high = 128; + assert(cm->seq_params.sb_size == BLOCK_64X64 || + cm->seq_params.sb_size == BLOCK_128X128); + const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64); + const int num_64x64_blocks = is_small_sb ? 1 : 4; + + unsigned int y_sad = UINT_MAX; + unsigned int y_sad_g = UINT_MAX; + BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128; + + // Ref frame used in partitioning. 
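+  // Defaults to LAST_FRAME; switched to GOLDEN_FRAME below when the golden
+  // reference yields a clearly lower source SAD.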
+ MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME; + + CHECK_MEM_ERROR(cm, vt, aom_malloc(sizeof(*vt))); + + vt->split = td->vt64x64; + + int64_t thresholds[5] = { vbp_thresholds[0], vbp_thresholds[1], + vbp_thresholds[2], vbp_thresholds[3], + vbp_thresholds[4] }; + + const int low_res = (cm->width <= 352 && cm->height <= 288); + int variance4x4downsample[64]; + int segment_id; + const int num_planes = av1_num_planes(cm); + + segment_id = xd->mi[0]->segment_id; + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + cyclic_refresh_segment_id_boosted(segment_id) && + cpi->sf.rt_sf.use_nonrd_pick_mode) { + int q = av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex); + set_vbp_thresholds(cpi, thresholds, q, content_state); + } else { + set_vbp_thresholds(cpi, thresholds, cm->quant_params.base_qindex, + content_state); + } + + if (is_small_sb) { + pixels_wide = 64; + pixels_high = 64; + } + + // For non keyframes, disable 4x4 average for low resolution when speed = 8 + threshold_4x4avg = INT64_MAX; + + if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); + if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3); + + s = x->plane[0].src.buf; + sp = x->plane[0].src.stride; + + // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks, + // 5-20 for the 16x16 blocks. + force_split[0] = 0; + memset(x->variance_low, 0, sizeof(x->variance_low)); + + if (!is_key_frame) { + // TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it + // is!! + MB_MODE_INFO *mi = xd->mi[0]; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME); + assert(yv12 != NULL); + const YV12_BUFFER_CONFIG *yv12_g = NULL; + + // For non-SVC GOLDEN is another temporal reference. Check if it should be + // used as reference for partitioning. + if (!cpi->use_svc && (cpi->ref_frame_flags & AOM_GOLD_FLAG) && + cpi->sf.rt_sf.use_nonrd_pick_mode) { + yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); + if (yv12_g && yv12_g != yv12) { + av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + get_ref_scale_factors(cm, GOLDEN_FRAME), + num_planes); + y_sad_g = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); + } + } + + av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + get_ref_scale_factors(cm, LAST_FRAME), num_planes); + mi->ref_frame[0] = LAST_FRAME; + mi->ref_frame[1] = NONE_FRAME; + mi->sb_type = cm->seq_params.sb_size; + mi->mv[0].as_int = 0; + mi->interp_filters = av1_broadcast_interp_filter(BILINEAR); + if (cpi->sf.rt_sf.estimate_motion_for_var_based_partition) { + if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) { + const MV dummy_mv = { 0, 0 }; + y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params.sb_size, + mi_row, mi_col, &dummy_mv); + } + } + if (y_sad == UINT_MAX) { + y_sad = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); + } + + // Pick the ref frame for partitioning, use golden frame only if its + // lower sad. 
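+    // (The 0.9 weighting below means GOLDEN's SAD must be at least ~10%
+    // lower than LAST's before it is picked.)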
+ if (y_sad_g < 0.9 * y_sad) { + av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes); + mi->ref_frame[0] = GOLDEN_FRAME; + mi->mv[0].as_int = 0; + y_sad = y_sad_g; + ref_frame_partition = GOLDEN_FRAME; + x->nonrd_prune_ref_frame_search = 0; + } else { + x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv; + ref_frame_partition = LAST_FRAME; + x->nonrd_prune_ref_frame_search = + cpi->sf.rt_sf.nonrd_prune_ref_frame_search; + } + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, + cm->seq_params.sb_size, AOM_PLANE_Y, + AOM_PLANE_Y); + + d = xd->plane[0].dst.buf; + dp = xd->plane[0].dst.stride; + } else { + d = AV1_VAR_OFFS; + dp = 0; + } + + if (low_res && threshold_4x4avg < INT64_MAX) + CHECK_MEM_ERROR(cm, vt2, aom_malloc(sizeof(*vt2))); + // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances + // for splits. + for (m = 0; m < num_64x64_blocks; m++) { + const int x64_idx = ((m & 1) << 6); + const int y64_idx = ((m >> 1) << 6); + const int m2 = m << 2; + force_split[m + 1] = 0; + max_var_32x32[m] = 0; + min_var_32x32[m] = INT_MAX; + for (i = 0; i < 4; i++) { + const int x32_idx = x64_idx + ((i & 1) << 5); + const int y32_idx = y64_idx + ((i >> 1) << 5); + const int i2 = (m2 + i) << 2; + force_split[5 + m2 + i] = 0; + avg_16x16[m][i] = 0; + maxvar_16x16[m][i] = 0; + minvar_16x16[m][i] = INT_MAX; + for (j = 0; j < 4; j++) { + const int x16_idx = x32_idx + ((j & 1) << 4); + const int y16_idx = y32_idx + ((j >> 1) << 4); + const int split_index = 21 + i2 + j; + VP16x16 *vst = &vt->split[m].split[i].split[j]; + force_split[split_index] = 0; + variance4x4downsample[i2 + j] = 0; + if (!is_key_frame) { + fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst, +#if CONFIG_AV1_HIGHBITDEPTH + xd->cur_buf->flags, +#endif + pixels_wide, pixels_high, is_key_frame); + fill_variance_tree(&vt->split[m].split[i].split[j], BLOCK_16X16); + get_variance(&vt->split[m].split[i].split[j].part_variances.none); + avg_16x16[m][i] += + vt->split[m].split[i].split[j].part_variances.none.variance; + if (vt->split[m].split[i].split[j].part_variances.none.variance < + minvar_16x16[m][i]) + minvar_16x16[m][i] = + vt->split[m].split[i].split[j].part_variances.none.variance; + if (vt->split[m].split[i].split[j].part_variances.none.variance > + maxvar_16x16[m][i]) + maxvar_16x16[m][i] = + vt->split[m].split[i].split[j].part_variances.none.variance; + if (vt->split[m].split[i].split[j].part_variances.none.variance > + thresholds[3]) { + // 16X16 variance is above threshold for split, so force split to + // 8x8 for this 16x16 block (this also forces splits for upper + // levels). + force_split[split_index] = 1; + force_split[5 + m2 + i] = 1; + force_split[m + 1] = 1; + force_split[0] = 1; + } else if (compute_minmax_variance && + vt->split[m] + .split[i] + .split[j] + .part_variances.none.variance > thresholds[2] && + !cyclic_refresh_segment_id_boosted(segment_id)) { + // We have some nominal amount of 16x16 variance (based on average), + // compute the minmax over the 8x8 sub-blocks, and if above + // threshold, force split to 8x8 block for this 16x16 block. 
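+          // (The spread is compared against threshold_minmax, set from the
+          // quantizer as 15 + (q >> 3) in
+          // av1_set_variance_partition_thresholds().)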
+ int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx, +#if CONFIG_AV1_HIGHBITDEPTH + xd->cur_buf->flags, +#endif + pixels_wide, pixels_high); + int thresh_minmax = (int)cpi->vbp_info.threshold_minmax; + if (minmax > thresh_minmax) { + force_split[split_index] = 1; + force_split[5 + m2 + i] = 1; + force_split[m + 1] = 1; + force_split[0] = 1; + } + } + } + if (is_key_frame) { + force_split[split_index] = 0; + // Go down to 4x4 down-sampling for variance. + variance4x4downsample[i2 + j] = 1; + for (k = 0; k < 4; k++) { + int x8_idx = x16_idx + ((k & 1) << 3); + int y8_idx = y16_idx + ((k >> 1) << 3); + VP8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k]; + fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2, +#if CONFIG_AV1_HIGHBITDEPTH + xd->cur_buf->flags, +#endif + pixels_wide, pixels_high, is_key_frame); + } + } + } + } + } + + // Fill the rest of the variance tree by summing split partition values. + for (m = 0; m < num_64x64_blocks; ++m) { + avg_32x32 = 0; + const int m2 = m << 2; + for (i = 0; i < 4; i++) { + const int i2 = (m2 + i) << 2; + for (j = 0; j < 4; j++) { + const int split_index = 21 + i2 + j; + if (variance4x4downsample[i2 + j] == 1) { + VP16x16 *vtemp = + (!is_key_frame) ? &vt2[i2 + j] : &vt->split[m].split[i].split[j]; + for (k = 0; k < 4; k++) + fill_variance_tree(&vtemp->split[k], BLOCK_8X8); + fill_variance_tree(vtemp, BLOCK_16X16); + // If variance of this 16x16 block is above the threshold, force block + // to split. This also forces a split on the upper levels. + get_variance(&vtemp->part_variances.none); + if (vtemp->part_variances.none.variance > thresholds[3]) { + force_split[split_index] = 1; + force_split[5 + m2 + i] = 1; + force_split[m + 1] = 1; + force_split[0] = 1; + } + } + } + fill_variance_tree(&vt->split[m].split[i], BLOCK_32X32); + // If variance of this 32x32 block is above the threshold, or if its above + // (some threshold of) the average variance over the sub-16x16 blocks, + // then force this block to split. This also forces a split on the upper + // (64x64) level. + if (!force_split[5 + m2 + i]) { + get_variance(&vt->split[m].split[i].part_variances.none); + var_32x32 = vt->split[m].split[i].part_variances.none.variance; + max_var_32x32[m] = AOMMAX(var_32x32, max_var_32x32[m]); + min_var_32x32[m] = AOMMIN(var_32x32, min_var_32x32[m]); + if (vt->split[m].split[i].part_variances.none.variance > + thresholds[2] || + (!is_key_frame && + vt->split[m].split[i].part_variances.none.variance > + (thresholds[2] >> 1) && + vt->split[m].split[i].part_variances.none.variance > + (avg_16x16[m][i] >> 1))) { + force_split[5 + m2 + i] = 1; + force_split[m + 1] = 1; + force_split[0] = 1; + } else if (!is_key_frame && cm->height <= 360 && + (maxvar_16x16[m][i] - minvar_16x16[m][i]) > + (thresholds[2] >> 1) && + maxvar_16x16[m][i] > thresholds[2]) { + force_split[5 + m2 + i] = 1; + force_split[m + 1] = 1; + force_split[0] = 1; + } + avg_32x32 += var_32x32; + } + } + if (!force_split[1 + m]) { + fill_variance_tree(&vt->split[m], BLOCK_64X64); + get_variance(&vt->split[m].part_variances.none); + var_64x64 = vt->split[m].part_variances.none.variance; + max_var_64x64 = AOMMAX(var_64x64, max_var_64x64); + min_var_64x64 = AOMMIN(var_64x64, min_var_64x64); + // If variance of this 64x64 block is above (some threshold of) the + // average variance over the sub-32x32 blocks, then force this block to + // split. Only checking this for noise level >= medium for now. 
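+      // (Concretely: split when the spread of the four 32x32 variances
+      // exceeds 3/8 of thresholds[1] and their maximum exceeds
+      // thresholds[1] / 2.)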
+ + if (!is_key_frame && + (max_var_32x32[m] - min_var_32x32[m]) > 3 * (thresholds[1] >> 3) && + max_var_32x32[m] > thresholds[1] >> 1) + force_split[1 + m] = 1; + } + if (is_small_sb) force_split[0] = 1; + } + + if (!force_split[0]) { + fill_variance_tree(vt, BLOCK_128X128); + get_variance(&vt->part_variances.none); + if (!is_key_frame && + (max_var_64x64 - min_var_64x64) > 3 * (thresholds[0] >> 3) && + max_var_64x64 > thresholds[0] >> 1) + force_split[0] = 1; + } + + if (mi_col + 32 > tile->mi_col_end || mi_row + 32 > tile->mi_row_end || + !set_vt_partitioning(cpi, x, xd, tile, vt, BLOCK_128X128, mi_row, mi_col, + thresholds[0], BLOCK_16X16, force_split[0])) { + for (m = 0; m < num_64x64_blocks; ++m) { + const int x64_idx = ((m & 1) << 4); + const int y64_idx = ((m >> 1) << 4); + const int m2 = m << 2; + + // Now go through the entire structure, splitting every block size until + // we get to one that's got a variance lower than our threshold. + if (!set_vt_partitioning(cpi, x, xd, tile, &vt->split[m], BLOCK_64X64, + mi_row + y64_idx, mi_col + x64_idx, + thresholds[1], BLOCK_16X16, + force_split[1 + m])) { + for (i = 0; i < 4; ++i) { + const int x32_idx = ((i & 1) << 3); + const int y32_idx = ((i >> 1) << 3); + const int i2 = (m2 + i) << 2; + if (!set_vt_partitioning(cpi, x, xd, tile, &vt->split[m].split[i], + BLOCK_32X32, (mi_row + y64_idx + y32_idx), + (mi_col + x64_idx + x32_idx), thresholds[2], + BLOCK_16X16, force_split[5 + m2 + i])) { + for (j = 0; j < 4; ++j) { + const int x16_idx = ((j & 1) << 2); + const int y16_idx = ((j >> 1) << 2); + const int split_index = 21 + i2 + j; + // For inter frames: if variance4x4downsample[] == 1 for this + // 16x16 block, then the variance is based on 4x4 down-sampling, + // so use vt2 in set_vt_partioning(), otherwise use vt. + VP16x16 *vtemp = + (!is_key_frame && variance4x4downsample[i2 + j] == 1) + ? &vt2[i2 + j] + : &vt->split[m].split[i].split[j]; + if (!set_vt_partitioning(cpi, x, xd, tile, vtemp, BLOCK_16X16, + mi_row + y64_idx + y32_idx + y16_idx, + mi_col + x64_idx + x32_idx + x16_idx, + thresholds[3], BLOCK_8X8, + force_split[split_index])) { + for (k = 0; k < 4; ++k) { + const int x8_idx = (k & 1) << 1; + const int y8_idx = (k >> 1) << 1; + set_block_size( + cpi, x, xd, + (mi_row + y64_idx + y32_idx + y16_idx + y8_idx), + (mi_col + x64_idx + x32_idx + x16_idx + x8_idx), + BLOCK_8X8); + } + } + } + } + } + } + } + } + + if (cpi->sf.rt_sf.short_circuit_low_temp_var) { + set_low_temp_var_flag(cpi, x, xd, vt, thresholds, ref_frame_partition, + mi_col, mi_row); + } + chroma_check(cpi, x, bsize, y_sad, is_key_frame); + + if (vt2) aom_free(vt2); + if (vt) aom_free(vt); + return 0; +} diff --git a/libs/libaom/src/av1/encoder/var_based_part.h b/libs/libaom/src/av1/encoder/var_based_part.h new file mode 100644 index 000000000..a80e25c32 --- /dev/null +++ b/libs/libaom/src/av1/encoder/var_based_part.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_ENCODER_VAR_BASED_PART_H_
+#define AOM_AV1_ENCODER_VAR_BASED_PART_H_
+
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define QINDEX_LOW_THR \
+  200  // Use low qindex variance partition thresholds when qindex is below this
+       // threshold
+#define QINDEX_HIGH_THR \
+  220  // Use high qindex variance partition thresholds when qindex is above
+       // this threshold
+
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
+                                           int content_state);
+
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                      ThreadData *td, MACROBLOCK *x, int mi_row,
+                                      int mi_col);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_VAR_BASED_PART_H_
diff --git a/libs/libaom/src/av1/encoder/wedge_utils.c b/libs/libaom/src/av1/encoder/wedge_utils.c
new file mode 100644
index 000000000..40670178d
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/wedge_utils.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+
+#include "aom_ports/mem.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * Computes SSE of a compound predictor constructed from 2 fundamental
+ * predictors p0 and p1 using blending with mask.
+ *
+ * r1:  Residuals of p1.
+ *      (source - p1)
+ * d:   Difference of p1 and p0.
+ *      (p1 - p0)
+ * m:   The blending mask
+ * N:   Number of pixels
+ *
+ * 'r1', 'd', and 'm' are contiguous.
+ *
+ * Computes:
+ *  Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
+ *  Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
+ *    where r0 is (source - p0), and r1 is (source - p1), which in turn
+ *    is equivalent to:
+ *  Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
+ *    which is the SSE of the residuals of the compound predictor scaled up by
+ *    MAX_MASK_VALUE**2.
+ *
+ * Note that we clamp the partial term in the loop to 16 bits signed. This is
+ * to facilitate equivalent SIMD implementation. It should have no effect if
+ * residuals are within 16 - WEDGE_WEIGHT_BITS (=10) signed, which always
+ * holds for 8 bit input, and on real input, it should hold practically always,
+ * as residuals are expected to be small.
+ */
+uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d,
+                                        const uint8_t *m, int N) {
+  uint64_t csse = 0;
+  int i;
+
+  for (i = 0; i < N; i++) {
+    int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
+    t = clamp(t, INT16_MIN, INT16_MAX);
+    csse += t * t;
+  }
+  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * Choose the mask sign for a compound predictor.
+ *
+ * ds:    Difference of the squares of the residuals.
+ *        r0**2 - r1**2
+ * m:     The blending mask
+ * N:     Number of pixels
+ * limit: Pre-computed threshold value.
+ * MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)) + * + * 'ds' and 'm' are contiguous. + * + * Returns true if the negated mask has lower SSE compared to the positive + * mask. Computation is based on: + * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2) + * > + * Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2) + * + * which can be simplified to: + * + * Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)) + * + * The right hand side does not depend on the mask, and needs to be passed as + * the 'limit' parameter. + * + * After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left + * hand side is simply a scalar product between an int16_t and uint8_t vector. + * + * Note that for efficiency, ds is stored on 16 bits. Real input residuals + * being small, this should not cause a noticeable issue. + */ +int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, + int N, int64_t limit) { + int64_t acc = 0; + + do { + acc += *ds++ * *m++; + } while (--N); + + return acc > limit; +} + +/** + * Compute the element-wise difference of the squares of 2 arrays. + * + * d: Difference of the squares of the inputs: a**2 - b**2 + * a: First input array + * b: Second input array + * N: Number of elements + * + * 'd', 'a', and 'b' are contiguous. + * + * The result is saturated to signed 16 bits. + */ +void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a, + const int16_t *b, int N) { + int i; + + for (i = 0; i < N; i++) + d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX); +} diff --git a/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm1d_sse4.c new file mode 100644 index 000000000..62eaa3074 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm1d_sse4.c @@ -0,0 +1,1417 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/encoder/x86/av1_txfm1d_sse4.h" + +void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit, + const int stride) { + __m128i buf0[32]; + __m128i buf1[32]; + const int32_t *cospi; + + int startidx = 0 * stride; + int endidx = 31 * stride; + // stage 0 + // stage 1 + buf1[0] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[31] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[1] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[30] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[2] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[29] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[3] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[28] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[4] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[27] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[5] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[26] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[6] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[25] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[7] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[24] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[8] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[23] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[9] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[22] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[10] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[21] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[11] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[20] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[12] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[19] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[13] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[18] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[14] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[17] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[15] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[16] = _mm_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm_add_epi32(buf1[0], buf1[15]); + buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]); + buf0[1] = _mm_add_epi32(buf1[1], buf1[14]); + buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]); + buf0[2] = _mm_add_epi32(buf1[2], buf1[13]); + buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]); + buf0[3] = _mm_add_epi32(buf1[3], buf1[12]); + buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]); + buf0[4] = _mm_add_epi32(buf1[4], buf1[11]); + buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]); + buf0[5] = _mm_add_epi32(buf1[5], buf1[10]); + buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]); + buf0[6] = _mm_add_epi32(buf1[6], buf1[9]); + buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]); + buf0[7] = 
_mm_add_epi32(buf1[7], buf1[8]); + buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + buf0[28] = buf1[28]; + buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + cospi = cospi_arr(cos_bit); + buf1[0] = _mm_add_epi32(buf0[0], buf0[7]); + buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[6]); + buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]); + buf1[2] = _mm_add_epi32(buf0[2], buf0[5]); + buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]); + buf1[3] = _mm_add_epi32(buf0[3], buf0[4]); + buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + buf1[16] = _mm_add_epi32(buf0[16], buf0[23]); + buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[22]); + buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]); + buf1[18] = _mm_add_epi32(buf0[18], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]); + buf1[19] = _mm_add_epi32(buf0[19], buf0[20]); + buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]); + buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[24]); + buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]); + buf1[30] = _mm_add_epi32(buf0[30], buf0[25]); + buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]); + buf1[29] = _mm_add_epi32(buf0[29], buf0[26]); + buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[27]); + + // stage 4 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm_add_epi32(buf1[0], buf1[3]); + buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]); + buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); + buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]); + buf0[4] = buf1[4]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[7] = buf1[7]; + buf0[8] = _mm_add_epi32(buf1[8], buf1[11]); + buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]); + buf0[9] = _mm_add_epi32(buf1[9], buf1[10]); + buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]); + buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]); + buf0[15] = _mm_add_epi32(buf1[15], buf1[12]); + buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]); + buf0[14] = _mm_add_epi32(buf1[14], buf1[13]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + buf0[22] = buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 5 + cospi = cospi_arr(cos_bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], + cos_bit); 
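+  // Note (assumption, see av1_txfm1d_sse4.h): btf_32_sse4_1_type0 and
+  // btf_32_sse4_1_type1 are the two weight layouts of the paired-cosine
+  // rotation butterfly; both round their results to cos_bit precision.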
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3], + cos_bit); + buf1[4] = _mm_add_epi32(buf0[4], buf0[5]); + buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]); + buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]); + buf1[7] = _mm_add_epi32(buf0[7], buf0[6]); + buf1[8] = buf0[8]; + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], + buf1[14], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + buf1[16] = _mm_add_epi32(buf0[16], buf0[19]); + buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[18]); + buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]); + buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]); + buf1[23] = _mm_add_epi32(buf0[23], buf0[20]); + buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]); + buf1[22] = _mm_add_epi32(buf0[22], buf0[21]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[27]); + buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]); + buf1[25] = _mm_add_epi32(buf0[25], buf0[26]); + buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]); + buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[28]); + buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]); + buf1[30] = _mm_add_epi32(buf0[30], buf0[29]); + + // stage 6 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], + cos_bit); + btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[8] = _mm_add_epi32(buf1[8], buf1[9]); + buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]); + buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]); + buf0[11] = _mm_add_epi32(buf1[11], buf1[10]); + buf0[12] = _mm_add_epi32(buf1[12], buf1[13]); + buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]); + buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]); + buf0[15] = _mm_add_epi32(buf1[15], buf1[14]); + buf0[16] = buf1[16]; + btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + cospi = cospi_arr(cos_bit); + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + buf1[7] = buf0[7]; + btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15], + cos_bit); + btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], + buf1[14], cos_bit); + btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[16] = _mm_add_epi32(buf0[16], buf0[17]); + buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]); + buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]); + buf1[19] = _mm_add_epi32(buf0[19], buf0[18]); + buf1[20] = _mm_add_epi32(buf0[20], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]); + buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]); + buf1[23] = _mm_add_epi32(buf0[23], 
buf0[22]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[25]); + buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]); + buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]); + buf1[27] = _mm_add_epi32(buf0[27], buf0[26]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[29]); + buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]); + buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[30]); + + // stage 8 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], + buf0[31], cos_bit); + btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + + startidx = 0 * stride; + endidx = 31 * stride; + // stage 9 + output[startidx] = buf0[0]; + output[endidx] = buf0[31]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[16]; + output[endidx] = buf0[15]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[8]; + output[endidx] = buf0[23]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[24]; + output[endidx] = buf0[7]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[4]; + output[endidx] = buf0[27]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[20]; + output[endidx] = buf0[11]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[12]; + output[endidx] = buf0[19]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[28]; + output[endidx] = buf0[3]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[2]; + output[endidx] = buf0[29]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[18]; + output[endidx] = buf0[13]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[10]; + output[endidx] = buf0[21]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[26]; + output[endidx] = buf0[5]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[6]; + output[endidx] = buf0[25]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[22]; + output[endidx] = buf0[9]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[14]; + output[endidx] = buf0[17]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[30]; + output[endidx] = buf0[1]; +} + +void av1_fadst4_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 4; + const int num_per_128 = 4; + const int32_t *cospi; + __m128i buf0[4]; + __m128i buf1[4]; + int col_num = txfm_size / num_per_128; + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + // 
stage 0; + int32_t stage_idx = 0; + int j; + for (j = 0; j < 4; ++j) { + buf0[j] = input[j * col_num + col]; + } + + // stage 1 + stage_idx++; + buf1[0] = buf0[3]; + buf1[1] = buf0[0]; + buf1[2] = buf0[1]; + buf1[3] = buf0[2]; + + // stage 2 + stage_idx++; + + cospi = cospi_arr(cos_bit); + btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1], + cos_bit); + btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], + buf0[3], cos_bit); + + // stage 3 + stage_idx++; + buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); + buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); + buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]); + + // stage 4 + stage_idx++; + + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], + buf0[3], cos_bit); + + // stage 5 + stage_idx++; + buf1[0] = buf0[0]; + buf1[1] = _mm_sub_epi32(_mm_setzero_si128(), buf0[2]); + buf1[2] = buf0[3]; + buf1[3] = _mm_sub_epi32(_mm_setzero_si128(), buf0[1]); + + for (j = 0; j < 4; ++j) { + output[j * col_num + col] = buf1[j]; + } + } +} + +void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit, + const int instride, const int outstride) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32 = _mm_set1_epi32(-cospi[32]); + __m128i cospi_p32 = _mm_set1_epi32(cospi[32]); + __m128i cospi_m16 = _mm_set1_epi32(-cospi[16]); + __m128i cospi_p48 = _mm_set1_epi32(cospi[48]); + __m128i cospi_m48 = _mm_set1_epi32(-cospi[48]); + __m128i cospi_p16 = _mm_set1_epi32(cospi[16]); + __m128i cospi_m08 = _mm_set1_epi32(-cospi[8]); + __m128i cospi_p56 = _mm_set1_epi32(cospi[56]); + __m128i cospi_m56 = _mm_set1_epi32(-cospi[56]); + __m128i cospi_m40 = _mm_set1_epi32(-cospi[40]); + __m128i cospi_p24 = _mm_set1_epi32(cospi[24]); + __m128i cospi_m24 = _mm_set1_epi32(-cospi[24]); + __m128i cospi_p08 = _mm_set1_epi32(cospi[8]); + __m128i cospi_p40 = _mm_set1_epi32(cospi[40]); + __m128i cospi_p60 = _mm_set1_epi32(cospi[60]); + __m128i cospi_p04 = _mm_set1_epi32(cospi[4]); + __m128i cospi_p28 = _mm_set1_epi32(cospi[28]); + __m128i cospi_p36 = _mm_set1_epi32(cospi[36]); + __m128i cospi_p44 = _mm_set1_epi32(cospi[44]); + __m128i cospi_p20 = _mm_set1_epi32(cospi[20]); + __m128i cospi_p12 = _mm_set1_epi32(cospi[12]); + __m128i cospi_p52 = _mm_set1_epi32(cospi[52]); + __m128i cospi_m04 = _mm_set1_epi32(-cospi[4]); + __m128i cospi_m60 = _mm_set1_epi32(-cospi[60]); + __m128i cospi_m36 = _mm_set1_epi32(-cospi[36]); + __m128i cospi_m28 = _mm_set1_epi32(-cospi[28]); + __m128i cospi_m20 = _mm_set1_epi32(-cospi[20]); + __m128i cospi_m44 = _mm_set1_epi32(-cospi[44]); + __m128i cospi_m52 = _mm_set1_epi32(-cospi[52]); + __m128i cospi_m12 = _mm_set1_epi32(-cospi[12]); + __m128i cospi_p62 = _mm_set1_epi32(cospi[62]); + __m128i cospi_p02 = _mm_set1_epi32(cospi[2]); + __m128i cospi_p30 = _mm_set1_epi32(cospi[30]); + __m128i cospi_p34 = _mm_set1_epi32(cospi[34]); + __m128i cospi_p46 = _mm_set1_epi32(cospi[46]); + __m128i cospi_p18 = _mm_set1_epi32(cospi[18]); + __m128i cospi_p14 = _mm_set1_epi32(cospi[14]); + __m128i cospi_p50 = _mm_set1_epi32(cospi[50]); + __m128i cospi_p54 = _mm_set1_epi32(cospi[54]); + __m128i cospi_p10 = _mm_set1_epi32(cospi[10]); + __m128i cospi_p22 = _mm_set1_epi32(cospi[22]); + __m128i cospi_p42 = _mm_set1_epi32(cospi[42]); + __m128i cospi_p38 = _mm_set1_epi32(cospi[38]); + __m128i cospi_p26 = _mm_set1_epi32(cospi[26]); + __m128i 
cospi_p06 = _mm_set1_epi32(cospi[6]); + __m128i cospi_p58 = _mm_set1_epi32(cospi[58]); + __m128i cospi_p63 = _mm_set1_epi32(cospi[63]); + __m128i cospi_p01 = _mm_set1_epi32(cospi[1]); + __m128i cospi_p31 = _mm_set1_epi32(cospi[31]); + __m128i cospi_p33 = _mm_set1_epi32(cospi[33]); + __m128i cospi_p47 = _mm_set1_epi32(cospi[47]); + __m128i cospi_p17 = _mm_set1_epi32(cospi[17]); + __m128i cospi_p15 = _mm_set1_epi32(cospi[15]); + __m128i cospi_p49 = _mm_set1_epi32(cospi[49]); + __m128i cospi_p55 = _mm_set1_epi32(cospi[55]); + __m128i cospi_p09 = _mm_set1_epi32(cospi[9]); + __m128i cospi_p23 = _mm_set1_epi32(cospi[23]); + __m128i cospi_p41 = _mm_set1_epi32(cospi[41]); + __m128i cospi_p39 = _mm_set1_epi32(cospi[39]); + __m128i cospi_p25 = _mm_set1_epi32(cospi[25]); + __m128i cospi_p07 = _mm_set1_epi32(cospi[7]); + __m128i cospi_p57 = _mm_set1_epi32(cospi[57]); + __m128i cospi_p59 = _mm_set1_epi32(cospi[59]); + __m128i cospi_p05 = _mm_set1_epi32(cospi[5]); + __m128i cospi_p27 = _mm_set1_epi32(cospi[27]); + __m128i cospi_p37 = _mm_set1_epi32(cospi[37]); + __m128i cospi_p43 = _mm_set1_epi32(cospi[43]); + __m128i cospi_p21 = _mm_set1_epi32(cospi[21]); + __m128i cospi_p11 = _mm_set1_epi32(cospi[11]); + __m128i cospi_p53 = _mm_set1_epi32(cospi[53]); + __m128i cospi_p51 = _mm_set1_epi32(cospi[51]); + __m128i cospi_p13 = _mm_set1_epi32(cospi[13]); + __m128i cospi_p19 = _mm_set1_epi32(cospi[19]); + __m128i cospi_p45 = _mm_set1_epi32(cospi[45]); + __m128i cospi_p35 = _mm_set1_epi32(cospi[35]); + __m128i cospi_p29 = _mm_set1_epi32(cospi[29]); + __m128i cospi_p03 = _mm_set1_epi32(cospi[3]); + __m128i cospi_p61 = _mm_set1_epi32(cospi[61]); + + int startidx = 0 * instride; + int endidx = 63 * instride; + // stage 1 + __m128i x1[64]; + x1[0] = _mm_add_epi32(input[startidx], input[endidx]); + x1[63] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[1] = _mm_add_epi32(input[startidx], input[endidx]); + x1[62] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[2] = _mm_add_epi32(input[startidx], input[endidx]); + x1[61] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[3] = _mm_add_epi32(input[startidx], input[endidx]); + x1[60] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[4] = _mm_add_epi32(input[startidx], input[endidx]); + x1[59] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[5] = _mm_add_epi32(input[startidx], input[endidx]); + x1[58] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[6] = _mm_add_epi32(input[startidx], input[endidx]); + x1[57] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[7] = _mm_add_epi32(input[startidx], input[endidx]); + x1[56] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[8] = _mm_add_epi32(input[startidx], input[endidx]); + x1[55] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[9] = _mm_add_epi32(input[startidx], input[endidx]); + x1[54] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[10] = _mm_add_epi32(input[startidx], input[endidx]); + x1[53] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[11] = 
_mm_add_epi32(input[startidx], input[endidx]); + x1[52] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[12] = _mm_add_epi32(input[startidx], input[endidx]); + x1[51] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[13] = _mm_add_epi32(input[startidx], input[endidx]); + x1[50] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[14] = _mm_add_epi32(input[startidx], input[endidx]); + x1[49] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[15] = _mm_add_epi32(input[startidx], input[endidx]); + x1[48] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[16] = _mm_add_epi32(input[startidx], input[endidx]); + x1[47] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[17] = _mm_add_epi32(input[startidx], input[endidx]); + x1[46] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[18] = _mm_add_epi32(input[startidx], input[endidx]); + x1[45] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[19] = _mm_add_epi32(input[startidx], input[endidx]); + x1[44] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[20] = _mm_add_epi32(input[startidx], input[endidx]); + x1[43] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[21] = _mm_add_epi32(input[startidx], input[endidx]); + x1[42] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[22] = _mm_add_epi32(input[startidx], input[endidx]); + x1[41] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[23] = _mm_add_epi32(input[startidx], input[endidx]); + x1[40] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[24] = _mm_add_epi32(input[startidx], input[endidx]); + x1[39] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[25] = _mm_add_epi32(input[startidx], input[endidx]); + x1[38] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[26] = _mm_add_epi32(input[startidx], input[endidx]); + x1[37] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[27] = _mm_add_epi32(input[startidx], input[endidx]); + x1[36] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[28] = _mm_add_epi32(input[startidx], input[endidx]); + x1[35] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[29] = _mm_add_epi32(input[startidx], input[endidx]); + x1[34] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[30] = _mm_add_epi32(input[startidx], input[endidx]); + x1[33] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[31] = _mm_add_epi32(input[startidx], input[endidx]); + x1[32] = _mm_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + __m128i x2[64]; + x2[0] = _mm_add_epi32(x1[0], x1[31]); + x2[31] = _mm_sub_epi32(x1[0], x1[31]); + x2[1] = _mm_add_epi32(x1[1], x1[30]); + x2[30] = _mm_sub_epi32(x1[1], x1[30]); + x2[2] = 
_mm_add_epi32(x1[2], x1[29]); + x2[29] = _mm_sub_epi32(x1[2], x1[29]); + x2[3] = _mm_add_epi32(x1[3], x1[28]); + x2[28] = _mm_sub_epi32(x1[3], x1[28]); + x2[4] = _mm_add_epi32(x1[4], x1[27]); + x2[27] = _mm_sub_epi32(x1[4], x1[27]); + x2[5] = _mm_add_epi32(x1[5], x1[26]); + x2[26] = _mm_sub_epi32(x1[5], x1[26]); + x2[6] = _mm_add_epi32(x1[6], x1[25]); + x2[25] = _mm_sub_epi32(x1[6], x1[25]); + x2[7] = _mm_add_epi32(x1[7], x1[24]); + x2[24] = _mm_sub_epi32(x1[7], x1[24]); + x2[8] = _mm_add_epi32(x1[8], x1[23]); + x2[23] = _mm_sub_epi32(x1[8], x1[23]); + x2[9] = _mm_add_epi32(x1[9], x1[22]); + x2[22] = _mm_sub_epi32(x1[9], x1[22]); + x2[10] = _mm_add_epi32(x1[10], x1[21]); + x2[21] = _mm_sub_epi32(x1[10], x1[21]); + x2[11] = _mm_add_epi32(x1[11], x1[20]); + x2[20] = _mm_sub_epi32(x1[11], x1[20]); + x2[12] = _mm_add_epi32(x1[12], x1[19]); + x2[19] = _mm_sub_epi32(x1[12], x1[19]); + x2[13] = _mm_add_epi32(x1[13], x1[18]); + x2[18] = _mm_sub_epi32(x1[13], x1[18]); + x2[14] = _mm_add_epi32(x1[14], x1[17]); + x2[17] = _mm_sub_epi32(x1[14], x1[17]); + x2[15] = _mm_add_epi32(x1[15], x1[16]); + x2[16] = _mm_sub_epi32(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48], + __rounding, cos_bit); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; + + // stage 3 + __m128i x3[64]; + x3[0] = _mm_add_epi32(x2[0], x2[15]); + x3[15] = _mm_sub_epi32(x2[0], x2[15]); + x3[1] = _mm_add_epi32(x2[1], x2[14]); + x3[14] = _mm_sub_epi32(x2[1], x2[14]); + x3[2] = _mm_add_epi32(x2[2], x2[13]); + x3[13] = _mm_sub_epi32(x2[2], x2[13]); + x3[3] = _mm_add_epi32(x2[3], x2[12]); + x3[12] = _mm_sub_epi32(x2[3], x2[12]); + x3[4] = _mm_add_epi32(x2[4], x2[11]); + x3[11] = _mm_sub_epi32(x2[4], x2[11]); + x3[5] = _mm_add_epi32(x2[5], x2[10]); + x3[10] = _mm_sub_epi32(x2[5], x2[10]); + x3[6] = _mm_add_epi32(x2[6], x2[9]); + x3[9] = _mm_sub_epi32(x2[6], x2[9]); + x3[7] = _mm_add_epi32(x2[7], x2[8]); + x3[8] = _mm_sub_epi32(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24], + __rounding, cos_bit); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm_add_epi32(x2[32], 
x2[47]); + x3[47] = _mm_sub_epi32(x2[32], x2[47]); + x3[33] = _mm_add_epi32(x2[33], x2[46]); + x3[46] = _mm_sub_epi32(x2[33], x2[46]); + x3[34] = _mm_add_epi32(x2[34], x2[45]); + x3[45] = _mm_sub_epi32(x2[34], x2[45]); + x3[35] = _mm_add_epi32(x2[35], x2[44]); + x3[44] = _mm_sub_epi32(x2[35], x2[44]); + x3[36] = _mm_add_epi32(x2[36], x2[43]); + x3[43] = _mm_sub_epi32(x2[36], x2[43]); + x3[37] = _mm_add_epi32(x2[37], x2[42]); + x3[42] = _mm_sub_epi32(x2[37], x2[42]); + x3[38] = _mm_add_epi32(x2[38], x2[41]); + x3[41] = _mm_sub_epi32(x2[38], x2[41]); + x3[39] = _mm_add_epi32(x2[39], x2[40]); + x3[40] = _mm_sub_epi32(x2[39], x2[40]); + x3[48] = _mm_sub_epi32(x2[63], x2[48]); + x3[63] = _mm_add_epi32(x2[63], x2[48]); + x3[49] = _mm_sub_epi32(x2[62], x2[49]); + x3[62] = _mm_add_epi32(x2[62], x2[49]); + x3[50] = _mm_sub_epi32(x2[61], x2[50]); + x3[61] = _mm_add_epi32(x2[61], x2[50]); + x3[51] = _mm_sub_epi32(x2[60], x2[51]); + x3[60] = _mm_add_epi32(x2[60], x2[51]); + x3[52] = _mm_sub_epi32(x2[59], x2[52]); + x3[59] = _mm_add_epi32(x2[59], x2[52]); + x3[53] = _mm_sub_epi32(x2[58], x2[53]); + x3[58] = _mm_add_epi32(x2[58], x2[53]); + x3[54] = _mm_sub_epi32(x2[57], x2[54]); + x3[57] = _mm_add_epi32(x2[57], x2[54]); + x3[55] = _mm_sub_epi32(x2[56], x2[55]); + x3[56] = _mm_add_epi32(x2[56], x2[55]); + + // stage 4 + __m128i x4[64]; + x4[0] = _mm_add_epi32(x3[0], x3[7]); + x4[7] = _mm_sub_epi32(x3[0], x3[7]); + x4[1] = _mm_add_epi32(x3[1], x3[6]); + x4[6] = _mm_sub_epi32(x3[1], x3[6]); + x4[2] = _mm_add_epi32(x3[2], x3[5]); + x4[5] = _mm_sub_epi32(x3[2], x3[5]); + x4[3] = _mm_add_epi32(x3[3], x3[4]); + x4[4] = _mm_sub_epi32(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12], + __rounding, cos_bit); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm_add_epi32(x3[16], x3[23]); + x4[23] = _mm_sub_epi32(x3[16], x3[23]); + x4[17] = _mm_add_epi32(x3[17], x3[22]); + x4[22] = _mm_sub_epi32(x3[17], x3[22]); + x4[18] = _mm_add_epi32(x3[18], x3[21]); + x4[21] = _mm_sub_epi32(x3[18], x3[21]); + x4[19] = _mm_add_epi32(x3[19], x3[20]); + x4[20] = _mm_sub_epi32(x3[19], x3[20]); + x4[24] = _mm_sub_epi32(x3[31], x3[24]); + x4[31] = _mm_add_epi32(x3[31], x3[24]); + x4[25] = _mm_sub_epi32(x3[30], x3[25]); + x4[30] = _mm_add_epi32(x3[30], x3[25]); + x4[26] = _mm_sub_epi32(x3[29], x3[26]); + x4[29] = _mm_add_epi32(x3[29], x3[26]); + x4[27] = _mm_sub_epi32(x3[28], x3[27]); + x4[28] = _mm_add_epi32(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52], + __rounding, cos_bit); + x4[44] = x3[44]; + x4[45] = x3[45]; 
+ x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; + + // stage 5 + __m128i x5[64]; + x5[0] = _mm_add_epi32(x4[0], x4[3]); + x5[3] = _mm_sub_epi32(x4[0], x4[3]); + x5[1] = _mm_add_epi32(x4[1], x4[2]); + x5[2] = _mm_sub_epi32(x4[1], x4[2]); + x5[4] = x4[4]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6], + __rounding, cos_bit); + x5[7] = x4[7]; + x5[8] = _mm_add_epi32(x4[8], x4[11]); + x5[11] = _mm_sub_epi32(x4[8], x4[11]); + x5[9] = _mm_add_epi32(x4[9], x4[10]); + x5[10] = _mm_sub_epi32(x4[9], x4[10]); + x5[12] = _mm_sub_epi32(x4[15], x4[12]); + x5[15] = _mm_add_epi32(x4[15], x4[12]); + x5[13] = _mm_sub_epi32(x4[14], x4[13]); + x5[14] = _mm_add_epi32(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26], + __rounding, cos_bit); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm_add_epi32(x4[32], x4[39]); + x5[39] = _mm_sub_epi32(x4[32], x4[39]); + x5[33] = _mm_add_epi32(x4[33], x4[38]); + x5[38] = _mm_sub_epi32(x4[33], x4[38]); + x5[34] = _mm_add_epi32(x4[34], x4[37]); + x5[37] = _mm_sub_epi32(x4[34], x4[37]); + x5[35] = _mm_add_epi32(x4[35], x4[36]); + x5[36] = _mm_sub_epi32(x4[35], x4[36]); + x5[40] = _mm_sub_epi32(x4[47], x4[40]); + x5[47] = _mm_add_epi32(x4[47], x4[40]); + x5[41] = _mm_sub_epi32(x4[46], x4[41]); + x5[46] = _mm_add_epi32(x4[46], x4[41]); + x5[42] = _mm_sub_epi32(x4[45], x4[42]); + x5[45] = _mm_add_epi32(x4[45], x4[42]); + x5[43] = _mm_sub_epi32(x4[44], x4[43]); + x5[44] = _mm_add_epi32(x4[44], x4[43]); + x5[48] = _mm_add_epi32(x4[48], x4[55]); + x5[55] = _mm_sub_epi32(x4[48], x4[55]); + x5[49] = _mm_add_epi32(x4[49], x4[54]); + x5[54] = _mm_sub_epi32(x4[49], x4[54]); + x5[50] = _mm_add_epi32(x4[50], x4[53]); + x5[53] = _mm_sub_epi32(x4[50], x4[53]); + x5[51] = _mm_add_epi32(x4[51], x4[52]); + x5[52] = _mm_sub_epi32(x4[51], x4[52]); + x5[56] = _mm_sub_epi32(x4[63], x4[56]); + x5[63] = _mm_add_epi32(x4[63], x4[56]); + x5[57] = _mm_sub_epi32(x4[62], x4[57]); + x5[62] = _mm_add_epi32(x4[62], x4[57]); + x5[58] = _mm_sub_epi32(x4[61], x4[58]); + x5[61] = _mm_add_epi32(x4[61], x4[58]); + x5[59] = _mm_sub_epi32(x4[60], x4[59]); + x5[60] = _mm_add_epi32(x4[60], x4[59]); + + // stage 6 + __m128i x6[64]; + btf_32_type0_sse4_1_new(cospi_p32, cospi_p32, x5[0], x5[1], x6[0], x6[1], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p48, cospi_p16, x5[2], x5[3], x6[2], x6[3], + __rounding, cos_bit); + x6[4] = _mm_add_epi32(x5[4], x5[5]); + x6[5] = _mm_sub_epi32(x5[4], x5[5]); + x6[6] = _mm_sub_epi32(x5[7], x5[6]); + x6[7] = _mm_add_epi32(x5[7], x5[6]); + x6[8] = x5[8]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13], + __rounding, cos_bit); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm_add_epi32(x5[16], x5[19]); + x6[19] = _mm_sub_epi32(x5[16], x5[19]); + x6[17] = _mm_add_epi32(x5[17], x5[18]); + 
x6[18] = _mm_sub_epi32(x5[17], x5[18]); + x6[20] = _mm_sub_epi32(x5[23], x5[20]); + x6[23] = _mm_add_epi32(x5[23], x5[20]); + x6[21] = _mm_sub_epi32(x5[22], x5[21]); + x6[22] = _mm_add_epi32(x5[22], x5[21]); + x6[24] = _mm_add_epi32(x5[24], x5[27]); + x6[27] = _mm_sub_epi32(x5[24], x5[27]); + x6[25] = _mm_add_epi32(x5[25], x5[26]); + x6[26] = _mm_sub_epi32(x5[25], x5[26]); + x6[28] = _mm_sub_epi32(x5[31], x5[28]); + x6[31] = _mm_add_epi32(x5[31], x5[28]); + x6[29] = _mm_sub_epi32(x5[30], x5[29]); + x6[30] = _mm_add_epi32(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58], + __rounding, cos_bit); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50], + __rounding, cos_bit); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; + + // stage 7 + __m128i x7[64]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_32_type1_sse4_1_new(cospi_p56, cospi_p08, x6[4], x6[7], x7[4], x7[7], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p24, cospi_p40, x6[5], x6[6], x7[5], x7[6], + __rounding, cos_bit); + x7[8] = _mm_add_epi32(x6[8], x6[9]); + x7[9] = _mm_sub_epi32(x6[8], x6[9]); + x7[10] = _mm_sub_epi32(x6[11], x6[10]); + x7[11] = _mm_add_epi32(x6[11], x6[10]); + x7[12] = _mm_add_epi32(x6[12], x6[13]); + x7[13] = _mm_sub_epi32(x6[12], x6[13]); + x7[14] = _mm_sub_epi32(x6[15], x6[14]); + x7[15] = _mm_add_epi32(x6[15], x6[14]); + x7[16] = x6[16]; + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29], + __rounding, cos_bit); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25], + __rounding, cos_bit); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm_add_epi32(x6[32], x6[35]); + x7[35] = _mm_sub_epi32(x6[32], x6[35]); + x7[33] = _mm_add_epi32(x6[33], x6[34]); + x7[34] = _mm_sub_epi32(x6[33], x6[34]); + x7[36] = _mm_sub_epi32(x6[39], x6[36]); + x7[39] = _mm_add_epi32(x6[39], x6[36]); + x7[37] = _mm_sub_epi32(x6[38], x6[37]); + x7[38] = _mm_add_epi32(x6[38], x6[37]); + x7[40] = _mm_add_epi32(x6[40], x6[43]); + x7[43] = _mm_sub_epi32(x6[40], x6[43]); + x7[41] = _mm_add_epi32(x6[41], x6[42]); + x7[42] = _mm_sub_epi32(x6[41], x6[42]); + x7[44] = _mm_sub_epi32(x6[47], x6[44]); + x7[47] = _mm_add_epi32(x6[47], x6[44]); + x7[45] = _mm_sub_epi32(x6[46], x6[45]); + x7[46] = _mm_add_epi32(x6[46], x6[45]); + 
x7[48] = _mm_add_epi32(x6[48], x6[51]); + x7[51] = _mm_sub_epi32(x6[48], x6[51]); + x7[49] = _mm_add_epi32(x6[49], x6[50]); + x7[50] = _mm_sub_epi32(x6[49], x6[50]); + x7[52] = _mm_sub_epi32(x6[55], x6[52]); + x7[55] = _mm_add_epi32(x6[55], x6[52]); + x7[53] = _mm_sub_epi32(x6[54], x6[53]); + x7[54] = _mm_add_epi32(x6[54], x6[53]); + x7[56] = _mm_add_epi32(x6[56], x6[59]); + x7[59] = _mm_sub_epi32(x6[56], x6[59]); + x7[57] = _mm_add_epi32(x6[57], x6[58]); + x7[58] = _mm_sub_epi32(x6[57], x6[58]); + x7[60] = _mm_sub_epi32(x6[63], x6[60]); + x7[63] = _mm_add_epi32(x6[63], x6[60]); + x7[61] = _mm_sub_epi32(x6[62], x6[61]); + x7[62] = _mm_add_epi32(x6[62], x6[61]); + + // stage 8 + __m128i x8[64]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + btf_32_type1_sse4_1_new(cospi_p60, cospi_p04, x7[8], x7[15], x8[8], x8[15], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p28, cospi_p36, x7[9], x7[14], x8[9], x8[14], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p44, cospi_p20, x7[10], x7[13], x8[10], x8[13], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p12, cospi_p52, x7[11], x7[12], x8[11], x8[12], + __rounding, cos_bit); + x8[16] = _mm_add_epi32(x7[16], x7[17]); + x8[17] = _mm_sub_epi32(x7[16], x7[17]); + x8[18] = _mm_sub_epi32(x7[19], x7[18]); + x8[19] = _mm_add_epi32(x7[19], x7[18]); + x8[20] = _mm_add_epi32(x7[20], x7[21]); + x8[21] = _mm_sub_epi32(x7[20], x7[21]); + x8[22] = _mm_sub_epi32(x7[23], x7[22]); + x8[23] = _mm_add_epi32(x7[23], x7[22]); + x8[24] = _mm_add_epi32(x7[24], x7[25]); + x8[25] = _mm_sub_epi32(x7[24], x7[25]); + x8[26] = _mm_sub_epi32(x7[27], x7[26]); + x8[27] = _mm_add_epi32(x7[27], x7[26]); + x8[28] = _mm_add_epi32(x7[28], x7[29]); + x8[29] = _mm_sub_epi32(x7[28], x7[29]); + x8[30] = _mm_sub_epi32(x7[31], x7[30]); + x8[31] = _mm_add_epi32(x7[31], x7[30]); + x8[32] = x7[32]; + btf_32_type0_sse4_1_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61], + __rounding, cos_bit); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_32_type0_sse4_1_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57], + __rounding, cos_bit); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_32_type0_sse4_1_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53], + __rounding, cos_bit); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_32_type0_sse4_1_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49], + __rounding, cos_bit); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; + + // stage 9 + __m128i x9[64]; + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_32_type1_sse4_1_new(cospi_p62, cospi_p02, x8[16], x8[31], x9[16], x9[31], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p30, cospi_p34, x8[17], x8[30], x9[17], 
x9[30], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p46, cospi_p18, x8[18], x8[29], x9[18], x9[29], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p14, cospi_p50, x8[19], x8[28], x9[19], x9[28], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p54, cospi_p10, x8[20], x8[27], x9[20], x9[27], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p22, cospi_p42, x8[21], x8[26], x9[21], x9[26], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p38, cospi_p26, x8[22], x8[25], x9[22], x9[25], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p06, cospi_p58, x8[23], x8[24], x9[23], x9[24], + __rounding, cos_bit); + x9[32] = _mm_add_epi32(x8[32], x8[33]); + x9[33] = _mm_sub_epi32(x8[32], x8[33]); + x9[34] = _mm_sub_epi32(x8[35], x8[34]); + x9[35] = _mm_add_epi32(x8[35], x8[34]); + x9[36] = _mm_add_epi32(x8[36], x8[37]); + x9[37] = _mm_sub_epi32(x8[36], x8[37]); + x9[38] = _mm_sub_epi32(x8[39], x8[38]); + x9[39] = _mm_add_epi32(x8[39], x8[38]); + x9[40] = _mm_add_epi32(x8[40], x8[41]); + x9[41] = _mm_sub_epi32(x8[40], x8[41]); + x9[42] = _mm_sub_epi32(x8[43], x8[42]); + x9[43] = _mm_add_epi32(x8[43], x8[42]); + x9[44] = _mm_add_epi32(x8[44], x8[45]); + x9[45] = _mm_sub_epi32(x8[44], x8[45]); + x9[46] = _mm_sub_epi32(x8[47], x8[46]); + x9[47] = _mm_add_epi32(x8[47], x8[46]); + x9[48] = _mm_add_epi32(x8[48], x8[49]); + x9[49] = _mm_sub_epi32(x8[48], x8[49]); + x9[50] = _mm_sub_epi32(x8[51], x8[50]); + x9[51] = _mm_add_epi32(x8[51], x8[50]); + x9[52] = _mm_add_epi32(x8[52], x8[53]); + x9[53] = _mm_sub_epi32(x8[52], x8[53]); + x9[54] = _mm_sub_epi32(x8[55], x8[54]); + x9[55] = _mm_add_epi32(x8[55], x8[54]); + x9[56] = _mm_add_epi32(x8[56], x8[57]); + x9[57] = _mm_sub_epi32(x8[56], x8[57]); + x9[58] = _mm_sub_epi32(x8[59], x8[58]); + x9[59] = _mm_add_epi32(x8[59], x8[58]); + x9[60] = _mm_add_epi32(x8[60], x8[61]); + x9[61] = _mm_sub_epi32(x8[60], x8[61]); + x9[62] = _mm_sub_epi32(x8[63], x8[62]); + x9[63] = _mm_add_epi32(x8[63], x8[62]); + + // stage 10 + __m128i x10[64]; + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_32_type1_sse4_1_new(cospi_p63, cospi_p01, x9[32], x9[63], x10[32], + x10[63], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p31, cospi_p33, x9[33], x9[62], x10[33], + x10[62], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p47, cospi_p17, x9[34], x9[61], x10[34], + x10[61], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p15, cospi_p49, x9[35], x9[60], x10[35], + x10[60], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p55, cospi_p09, x9[36], x9[59], x10[36], + x10[59], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p23, cospi_p41, x9[37], x9[58], x10[37], + x10[58], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p39, cospi_p25, x9[38], x9[57], x10[38], + x10[57], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p07, cospi_p57, x9[39], x9[56], x10[39], + x10[56], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p59, cospi_p05, 
x9[40], x9[55], x10[40], + x10[55], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p27, cospi_p37, x9[41], x9[54], x10[41], + x10[54], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p43, cospi_p21, x9[42], x9[53], x10[42], + x10[53], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p11, cospi_p53, x9[43], x9[52], x10[43], + x10[52], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p51, cospi_p13, x9[44], x9[51], x10[44], + x10[51], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p19, cospi_p45, x9[45], x9[50], x10[45], + x10[50], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p35, cospi_p29, x9[46], x9[49], x10[46], + x10[49], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47], + x10[48], __rounding, cos_bit); + + startidx = 0 * outstride; + endidx = 63 * outstride; + // stage 11 + output[startidx] = x10[0]; + output[endidx] = x10[63]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[32]; + output[endidx] = x10[31]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[16]; + output[endidx] = x10[47]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[48]; + output[endidx] = x10[15]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[8]; + output[endidx] = x10[55]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[40]; + output[endidx] = x10[23]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[24]; + output[endidx] = x10[39]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[56]; + output[endidx] = x10[7]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[4]; + output[endidx] = x10[59]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[36]; + output[endidx] = x10[27]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[20]; + output[endidx] = x10[43]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[52]; + output[endidx] = x10[11]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[12]; + output[endidx] = x10[51]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[44]; + output[endidx] = x10[19]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[28]; + output[endidx] = x10[35]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[60]; + output[endidx] = x10[3]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[2]; + output[endidx] = x10[61]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[34]; + output[endidx] = x10[29]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[18]; + output[endidx] = x10[45]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[50]; + output[endidx] = x10[13]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[10]; + output[endidx] = x10[53]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[42]; + output[endidx] = x10[21]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[26]; + output[endidx] = x10[37]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[58]; + output[endidx] = x10[5]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[6]; + output[endidx] = x10[57]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[38]; + 
output[endidx] = x10[25]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[22]; + output[endidx] = x10[41]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[54]; + output[endidx] = x10[9]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[14]; + output[endidx] = x10[49]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[46]; + output[endidx] = x10[17]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[30]; + output[endidx] = x10[33]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[62]; + output[endidx] = x10[1]; +} + +void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit, + const int col_num) { + (void)cos_bit; + for (int i = 0; i < 32; i++) { + output[i * col_num] = _mm_slli_epi32(input[i * col_num], 2); + } +} diff --git a/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_avx2.c new file mode 100644 index 000000000..634d50bb2 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_avx2.c @@ -0,0 +1,2814 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/x86/av1_fwd_txfm_avx2.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_txfm1d_sse4.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + + // stage 1 + __m256i x1[16]; + 
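+  // As the helper name suggests, each btf_16_adds_subs_out_avx2(&a, &b, u, v)
+  // call below performs one 16-lane butterfly: a = u + v and b = u - v, using
+  // saturating int16 adds/subs.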
btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + + // stage 4 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + + // stage 5 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + + // stage 6 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + + // stage 7 + output[0] = x1[0]; + output[1] = x1[8]; + output[2] = x1[4]; + output[3] = x1[12]; + output[4] = x1[2]; + output[5] = x1[10]; + output[6] = x1[6]; + output[7] = x1[14]; + output[8] = x1[1]; + output[9] = x1[9]; + output[10] = x1[5]; + output[11] = x1[13]; + output[12] = x1[3]; + output[13] = x1[11]; + output[14] = x1[7]; + output[15] = x1[15]; +} + +static INLINE void fdct16x32_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); 
+ __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]); + __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]); + __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]); + __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]); + __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]); + __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]); + __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]); + __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]); + __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]); + __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]); + __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]); + __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]); + __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]); + __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]); + __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]); + __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]); + + // stage 1 + __m256i x1[32]; + btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]); + btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]); + btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]); + btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]); + btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]); + btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]); + btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]); + btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]); + btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[15]); + btf_16_adds_subs_avx2(&x1[1], &x1[14]); + btf_16_adds_subs_avx2(&x1[2], &x1[13]); + btf_16_adds_subs_avx2(&x1[3], &x1[12]); + btf_16_adds_subs_avx2(&x1[4], &x1[11]); + btf_16_adds_subs_avx2(&x1[5], &x1[10]); + btf_16_adds_subs_avx2(&x1[6], &x1[9]); + btf_16_adds_subs_avx2(&x1[7], &x1[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, 
cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[23]); + btf_16_adds_subs_avx2(&x1[17], &x1[22]); + btf_16_adds_subs_avx2(&x1[18], &x1[21]); + btf_16_adds_subs_avx2(&x1[19], &x1[20]); + btf_16_adds_subs_avx2(&x1[31], &x1[24]); + btf_16_adds_subs_avx2(&x1[30], &x1[25]); + btf_16_adds_subs_avx2(&x1[29], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[27]); + + // stage 4 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit); + + // stage 5 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[19]); + btf_16_adds_subs_avx2(&x1[17], &x1[18]); + btf_16_adds_subs_avx2(&x1[23], &x1[20]); + btf_16_adds_subs_avx2(&x1[22], &x1[21]); + btf_16_adds_subs_avx2(&x1[24], &x1[27]); + btf_16_adds_subs_avx2(&x1[25], &x1[26]); + btf_16_adds_subs_avx2(&x1[31], &x1[28]); + btf_16_adds_subs_avx2(&x1[30], &x1[29]); + + // stage 6 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit); + + // stage 7 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[17]); + btf_16_adds_subs_avx2(&x1[19], &x1[18]); + btf_16_adds_subs_avx2(&x1[20], &x1[21]); + btf_16_adds_subs_avx2(&x1[23], &x1[22]); + btf_16_adds_subs_avx2(&x1[24], &x1[25]); + btf_16_adds_subs_avx2(&x1[27], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[29]); + btf_16_adds_subs_avx2(&x1[31], &x1[30]); + + // stage 8 + btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit); + 
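+ // These stage-8 rotations rely on btf_16_w16_avx2(w0, w1, &a, &b, _r, cos_bit); with
+ // w = pair_set_w16_epi16(c0, c1) it is assumed to compute, per 16-bit lane pair,
+ // a' = (c0*a + c1*b + _r) >> cos_bit (and likewise b' from w1), where
+ // _r = 1 << (cos_bit - 1) provides round-to-nearest before the arithmetic shift.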
btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit); + + // stage 9 + output[0] = x1[0]; + output[1] = x1[16]; + output[2] = x1[8]; + output[3] = x1[24]; + output[4] = x1[4]; + output[5] = x1[20]; + output[6] = x1[12]; + output[7] = x1[28]; + output[8] = x1[2]; + output[9] = x1[18]; + output[10] = x1[10]; + output[11] = x1[26]; + output[12] = x1[6]; + output[13] = x1[22]; + output[14] = x1[14]; + output[15] = x1[30]; + output[16] = x1[1]; + output[17] = x1[17]; + output[18] = x1[9]; + output[19] = x1[25]; + output[20] = x1[5]; + output[21] = x1[21]; + output[22] = x1[13]; + output[23] = x1[29]; + output[24] = x1[3]; + output[25] = x1[19]; + output[26] = x1[11]; + output[27] = x1[27]; + output[28] = x1[7]; + output[29] = x1[23]; + output[30] = x1[15]; + output[31] = x1[31]; +} + +static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]); + __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); + __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]); + __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); + __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]); + __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]); + __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]); + __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]); + __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]); + __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], 
cospi[46]); + __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]); + __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]); + __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]); + __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]); + __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]); + __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]); + __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]); + __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]); + __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]); + __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]); + __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]); + __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]); + __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]); + __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]); + __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]); + __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]); + __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]); + __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]); + __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]); + __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]); + __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]); + __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]); + __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]); + __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]); + __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]); + __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]); + __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]); + __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]); + __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]); + __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]); + __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]); + __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]); + __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]); + __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]); + __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]); + __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]); + __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]); + __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]); + __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]); + __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]); + __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]); + __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]); + + // stage 1 + __m256i x1[64]; + btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]); + btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]); + btf_16_adds_subs_out_avx2(&x1[9], 
&x1[54], input[9], input[54]); + btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]); + btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]); + btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]); + btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]); + btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]); + btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]); + btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]); + btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]); + btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]); + btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]); + btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]); + btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]); + btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]); + btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]); + btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]); + btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]); + btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]); + btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]); + btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]); + btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]); + btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]); + btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[31]); + btf_16_adds_subs_avx2(&x1[1], &x1[30]); + btf_16_adds_subs_avx2(&x1[2], &x1[29]); + btf_16_adds_subs_avx2(&x1[3], &x1[28]); + btf_16_adds_subs_avx2(&x1[4], &x1[27]); + btf_16_adds_subs_avx2(&x1[5], &x1[26]); + btf_16_adds_subs_avx2(&x1[6], &x1[25]); + btf_16_adds_subs_avx2(&x1[7], &x1[24]); + btf_16_adds_subs_avx2(&x1[8], &x1[23]); + btf_16_adds_subs_avx2(&x1[9], &x1[22]); + btf_16_adds_subs_avx2(&x1[10], &x1[21]); + btf_16_adds_subs_avx2(&x1[11], &x1[20]); + btf_16_adds_subs_avx2(&x1[12], &x1[19]); + btf_16_adds_subs_avx2(&x1[13], &x1[18]); + btf_16_adds_subs_avx2(&x1[14], &x1[17]); + btf_16_adds_subs_avx2(&x1[15], &x1[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[15]); + btf_16_adds_subs_avx2(&x1[1], &x1[14]); + btf_16_adds_subs_avx2(&x1[2], &x1[13]); + btf_16_adds_subs_avx2(&x1[3], &x1[12]); + btf_16_adds_subs_avx2(&x1[4], &x1[11]); + btf_16_adds_subs_avx2(&x1[5], &x1[10]); + btf_16_adds_subs_avx2(&x1[6], &x1[9]); + btf_16_adds_subs_avx2(&x1[7], &x1[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit); + 
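+ // btf_16_adds_subs_avx2(&a, &b) is assumed to be the in-place add/sub butterfly
+ // (a, b) <- (a + b, a - b) using saturating 16-bit adds/subs, so overflowing
+ // intermediates clamp rather than wrap.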
btf_16_adds_subs_avx2(&x1[32], &x1[47]); + btf_16_adds_subs_avx2(&x1[33], &x1[46]); + btf_16_adds_subs_avx2(&x1[34], &x1[45]); + btf_16_adds_subs_avx2(&x1[35], &x1[44]); + btf_16_adds_subs_avx2(&x1[36], &x1[43]); + btf_16_adds_subs_avx2(&x1[37], &x1[42]); + btf_16_adds_subs_avx2(&x1[38], &x1[41]); + btf_16_adds_subs_avx2(&x1[39], &x1[40]); + btf_16_adds_subs_avx2(&x1[63], &x1[48]); + btf_16_adds_subs_avx2(&x1[62], &x1[49]); + btf_16_adds_subs_avx2(&x1[61], &x1[50]); + btf_16_adds_subs_avx2(&x1[60], &x1[51]); + btf_16_adds_subs_avx2(&x1[59], &x1[52]); + btf_16_adds_subs_avx2(&x1[58], &x1[53]); + btf_16_adds_subs_avx2(&x1[57], &x1[54]); + btf_16_adds_subs_avx2(&x1[56], &x1[55]); + + // stage 4 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[23]); + btf_16_adds_subs_avx2(&x1[17], &x1[22]); + btf_16_adds_subs_avx2(&x1[18], &x1[21]); + btf_16_adds_subs_avx2(&x1[19], &x1[20]); + btf_16_adds_subs_avx2(&x1[31], &x1[24]); + btf_16_adds_subs_avx2(&x1[30], &x1[25]); + btf_16_adds_subs_avx2(&x1[29], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[27]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit); + + // stage 5 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[39]); + btf_16_adds_subs_avx2(&x1[33], &x1[38]); + btf_16_adds_subs_avx2(&x1[34], &x1[37]); + btf_16_adds_subs_avx2(&x1[35], &x1[36]); + btf_16_adds_subs_avx2(&x1[47], &x1[40]); + btf_16_adds_subs_avx2(&x1[46], &x1[41]); + btf_16_adds_subs_avx2(&x1[45], &x1[42]); + btf_16_adds_subs_avx2(&x1[44], &x1[43]); + btf_16_adds_subs_avx2(&x1[48], &x1[55]); + btf_16_adds_subs_avx2(&x1[49], &x1[54]); + btf_16_adds_subs_avx2(&x1[50], &x1[53]); + btf_16_adds_subs_avx2(&x1[51], &x1[52]); + btf_16_adds_subs_avx2(&x1[63], &x1[56]); + btf_16_adds_subs_avx2(&x1[62], &x1[57]); + btf_16_adds_subs_avx2(&x1[61], &x1[58]); + btf_16_adds_subs_avx2(&x1[60], &x1[59]); + + // stage 6 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + 
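+ // Decimation-in-frequency structure: the low half x1[0..15] follows the same ladder
+ // as the 16-point DCT above, while x1[16..63] carry the butterfly differences
+ // through their own rotation/add-sub stages before the final reorder.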
btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[19]); + btf_16_adds_subs_avx2(&x1[17], &x1[18]); + btf_16_adds_subs_avx2(&x1[23], &x1[20]); + btf_16_adds_subs_avx2(&x1[22], &x1[21]); + btf_16_adds_subs_avx2(&x1[24], &x1[27]); + btf_16_adds_subs_avx2(&x1[25], &x1[26]); + btf_16_adds_subs_avx2(&x1[31], &x1[28]); + btf_16_adds_subs_avx2(&x1[30], &x1[29]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit); + + // stage 7 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[35]); + btf_16_adds_subs_avx2(&x1[33], &x1[34]); + btf_16_adds_subs_avx2(&x1[39], &x1[36]); + btf_16_adds_subs_avx2(&x1[38], &x1[37]); + btf_16_adds_subs_avx2(&x1[40], &x1[43]); + btf_16_adds_subs_avx2(&x1[41], &x1[42]); + btf_16_adds_subs_avx2(&x1[47], &x1[44]); + btf_16_adds_subs_avx2(&x1[46], &x1[45]); + btf_16_adds_subs_avx2(&x1[48], &x1[51]); + btf_16_adds_subs_avx2(&x1[49], &x1[50]); + btf_16_adds_subs_avx2(&x1[55], &x1[52]); + btf_16_adds_subs_avx2(&x1[54], &x1[53]); + btf_16_adds_subs_avx2(&x1[56], &x1[59]); + btf_16_adds_subs_avx2(&x1[57], &x1[58]); + btf_16_adds_subs_avx2(&x1[63], &x1[60]); + btf_16_adds_subs_avx2(&x1[62], &x1[61]); + + // stage 8 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[17]); + btf_16_adds_subs_avx2(&x1[19], &x1[18]); + btf_16_adds_subs_avx2(&x1[20], &x1[21]); + btf_16_adds_subs_avx2(&x1[23], &x1[22]); + btf_16_adds_subs_avx2(&x1[24], &x1[25]); + btf_16_adds_subs_avx2(&x1[27], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[29]); + btf_16_adds_subs_avx2(&x1[31], &x1[30]); + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit); + btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], 
_r, cos_bit); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit); + + // stage 9 + btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit); + btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[33]); + btf_16_adds_subs_avx2(&x1[35], &x1[34]); + btf_16_adds_subs_avx2(&x1[36], &x1[37]); + btf_16_adds_subs_avx2(&x1[39], &x1[38]); + btf_16_adds_subs_avx2(&x1[40], &x1[41]); + btf_16_adds_subs_avx2(&x1[43], &x1[42]); + btf_16_adds_subs_avx2(&x1[44], &x1[45]); + btf_16_adds_subs_avx2(&x1[47], &x1[46]); + btf_16_adds_subs_avx2(&x1[48], &x1[49]); + btf_16_adds_subs_avx2(&x1[51], &x1[50]); + btf_16_adds_subs_avx2(&x1[52], &x1[53]); + btf_16_adds_subs_avx2(&x1[55], &x1[54]); + btf_16_adds_subs_avx2(&x1[56], &x1[57]); + btf_16_adds_subs_avx2(&x1[59], &x1[58]); + btf_16_adds_subs_avx2(&x1[60], &x1[61]); + btf_16_adds_subs_avx2(&x1[63], &x1[62]); + + // stage 10 + btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit); + btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit); + btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit); + btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit); + btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit); + btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit); + btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit); + + // stage 11 + output[0] = x1[0]; + output[1] = x1[32]; + output[2] = x1[16]; + output[3] = x1[48]; + output[4] = x1[8]; + output[5] = x1[40]; + output[6] = x1[24]; + output[7] = x1[56]; + output[8] = x1[4]; + output[9] = x1[36]; + output[10] = x1[20]; + output[11] = x1[52]; + output[12] = x1[12]; + output[13] = x1[44]; + output[14] = x1[28]; + output[15] = x1[60]; + output[16] = x1[2]; + output[17] = x1[34]; + output[18] = x1[18]; + output[19] = x1[50]; + output[20] = x1[10]; + output[21] = x1[42]; + output[22] = x1[26]; + output[23] = x1[58]; + output[24] = x1[6]; + 
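+ // (Store order is the 6-bit bit-reversal permutation, output[k] = x1[bitrev6(k)];
+ // e.g. output[25] = x1[38] below, since bitrev6(011001b) = 100110b = 38.)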
output[25] = x1[38]; + output[26] = x1[22]; + output[27] = x1[54]; + output[28] = x1[14]; + output[29] = x1[46]; + output[30] = x1[30]; + output[31] = x1[62]; + output[32] = x1[1]; + output[33] = x1[33]; + output[34] = x1[17]; + output[35] = x1[49]; + output[36] = x1[9]; + output[37] = x1[41]; + output[38] = x1[25]; + output[39] = x1[57]; + output[40] = x1[5]; + output[41] = x1[37]; + output[42] = x1[21]; + output[43] = x1[53]; + output[44] = x1[13]; + output[45] = x1[45]; + output[46] = x1[29]; + output[47] = x1[61]; + output[48] = x1[3]; + output[49] = x1[35]; + output[50] = x1[19]; + output[51] = x1[51]; + output[52] = x1[11]; + output[53] = x1[43]; + output[54] = x1[27]; + output[55] = x1[59]; + output[56] = x1[7]; + output[57] = x1[39]; + output[58] = x1[23]; + output[59] = x1[55]; + output[60] = x1[15]; + output[61] = x1[47]; + output[62] = x1[31]; + output[63] = x1[63]; +} + +static INLINE void fdct32_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + __m256i x1[32]; + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + // stage 0 + // stage 1 + btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]); + btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]); + btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]); + btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]); + btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]); + btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]); + btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]); + btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]); + btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]); + btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]); + btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]); + btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]); + btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]); + btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]); + btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]); + btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]); + + // stage 2 + btf_32_add_sub_avx2(&x1[0], &x1[15]); + btf_32_add_sub_avx2(&x1[1], &x1[14]); + btf_32_add_sub_avx2(&x1[2], &x1[13]); + btf_32_add_sub_avx2(&x1[3], &x1[12]); + btf_32_add_sub_avx2(&x1[4], &x1[11]); + btf_32_add_sub_avx2(&x1[5], &x1[10]); + btf_32_add_sub_avx2(&x1[6], &x1[9]); + btf_32_add_sub_avx2(&x1[7], &x1[8]); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit); + + // stage 3 + btf_32_add_sub_avx2(&x1[0], &x1[7]); + btf_32_add_sub_avx2(&x1[1], &x1[6]); + btf_32_add_sub_avx2(&x1[2], &x1[5]); + btf_32_add_sub_avx2(&x1[3], &x1[4]); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[23]); + btf_32_add_sub_avx2(&x1[17], &x1[22]); + btf_32_add_sub_avx2(&x1[18], &x1[21]); + btf_32_add_sub_avx2(&x1[19], &x1[20]); + btf_32_add_sub_avx2(&x1[31], &x1[24]); + btf_32_add_sub_avx2(&x1[30], &x1[25]); + btf_32_add_sub_avx2(&x1[29], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[27]); + + // stage 4 + btf_32_add_sub_avx2(&x1[0], &x1[3]); + btf_32_add_sub_avx2(&x1[1], 
&x1[2]); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[11]); + btf_32_add_sub_avx2(&x1[9], &x1[10]); + btf_32_add_sub_avx2(&x1[15], &x1[12]); + btf_32_add_sub_avx2(&x1[14], &x1[13]); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit); + + // stage 5 + btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit); + btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit); + btf_32_add_sub_avx2(&x1[4], &x1[5]); + btf_32_add_sub_avx2(&x1[7], &x1[6]); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[19]); + btf_32_add_sub_avx2(&x1[17], &x1[18]); + btf_32_add_sub_avx2(&x1[23], &x1[20]); + btf_32_add_sub_avx2(&x1[22], &x1[21]); + btf_32_add_sub_avx2(&x1[24], &x1[27]); + btf_32_add_sub_avx2(&x1[25], &x1[26]); + btf_32_add_sub_avx2(&x1[31], &x1[28]); + btf_32_add_sub_avx2(&x1[30], &x1[29]); + + // stage 6 + btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit); + btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[9]); + btf_32_add_sub_avx2(&x1[11], &x1[10]); + btf_32_add_sub_avx2(&x1[12], &x1[13]); + btf_32_add_sub_avx2(&x1[15], &x1[14]); + btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit); + + // stage 7 + btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit); + btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[17]); + btf_32_add_sub_avx2(&x1[19], &x1[18]); + btf_32_add_sub_avx2(&x1[20], &x1[21]); + btf_32_add_sub_avx2(&x1[23], &x1[22]); + btf_32_add_sub_avx2(&x1[24], &x1[25]); + btf_32_add_sub_avx2(&x1[27], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[29]); + btf_32_add_sub_avx2(&x1[31], &x1[30]); + + // stage 8 + btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit); + btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit); + + // stage 9 + output[0] = x1[0]; + output[1] = x1[16]; + output[2] = x1[8]; + output[3] = x1[24]; + output[4] = x1[4]; + output[5] = x1[20]; + output[6] = x1[12]; + output[7] = x1[28]; + output[8] = x1[2]; + output[9] = x1[18]; + output[10] = x1[10]; + output[11] = x1[26]; + output[12] = x1[6]; + output[13] = x1[22]; + output[14] = x1[14]; + output[15] = x1[30]; + output[16] = x1[1]; + output[17] = x1[17]; + output[18] = 
x1[9]; + output[19] = x1[25]; + output[20] = x1[5]; + output[21] = x1[21]; + output[22] = x1[13]; + output[23] = x1[29]; + output[24] = x1[3]; + output[25] = x1[19]; + output[26] = x1[11]; + output[27] = x1[27]; + output[28] = x1[7]; + output[29] = x1[23]; + output[30] = x1[15]; + output[31] = x1[31]; +} + +static INLINE void fdct64_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]); + __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]); + __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]); + __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]); + __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]); + __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]); + __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]); + __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]); + __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]); + __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]); + __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]); + __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]); + __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]); + __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]); + __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]); + __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]); + __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]); + __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]); + __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]); + __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]); + __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]); + __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]); + __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]); + __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]); + __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]); + __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]); + __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]); + __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]); + __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]); + __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]); + __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]); + __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]); + __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]); + __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]); + __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]); + __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]); + __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]); + __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]); + __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]); + __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]); + __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]); + __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]); + __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]); + __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]); + __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]); + __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]); + __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]); + __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]); + __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]); + __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]); + __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]); + __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]); + __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]); + __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]); + __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]); + __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]); + __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]); + __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]); + __m256i cospi_p39 = 
_mm256_set1_epi32(cospi[39]); + __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]); + __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]); + __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]); + __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]); + __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]); + __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]); + __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]); + __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]); + __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]); + __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]); + __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]); + __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]); + __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]); + __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]); + __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]); + __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]); + __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]); + __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]); + __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]); + + // stage 1 + __m256i x1[64]; + btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]); + btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]); + btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]); + btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]); + btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]); + btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]); + btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]); + btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]); + btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]); + btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]); + btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]); + btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]); + btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]); + btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]); + btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]); + btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]); + btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]); + btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]); + btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]); + btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]); + btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]); + btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]); + btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]); + btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]); + btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]); + btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]); + btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]); + btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]); + btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]); + btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]); + btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]); + btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]); + + // stage 2 + btf_32_add_sub_avx2(&x1[0], &x1[31]); + btf_32_add_sub_avx2(&x1[1], &x1[30]); + btf_32_add_sub_avx2(&x1[2], &x1[29]); + btf_32_add_sub_avx2(&x1[3], &x1[28]); + btf_32_add_sub_avx2(&x1[4], &x1[27]); + btf_32_add_sub_avx2(&x1[5], &x1[26]); + btf_32_add_sub_avx2(&x1[6], &x1[25]); + btf_32_add_sub_avx2(&x1[7], &x1[24]); + btf_32_add_sub_avx2(&x1[8], &x1[23]); + 
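+ // 32-bit variant of the 64-point ladder above: 8 lanes of int32 per register.
+ // btf_32_add_sub_avx2(&a, &b) is assumed to be the plain (non-saturating) epi32
+ // add/sub butterfly, matching this path's wider intermediate precision.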
btf_32_add_sub_avx2(&x1[9], &x1[22]); + btf_32_add_sub_avx2(&x1[10], &x1[21]); + btf_32_add_sub_avx2(&x1[11], &x1[20]); + btf_32_add_sub_avx2(&x1[12], &x1[19]); + btf_32_add_sub_avx2(&x1[13], &x1[18]); + btf_32_add_sub_avx2(&x1[14], &x1[17]); + btf_32_add_sub_avx2(&x1[15], &x1[16]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit); + + // stage 3 + btf_32_add_sub_avx2(&x1[0], &x1[15]); + btf_32_add_sub_avx2(&x1[1], &x1[14]); + btf_32_add_sub_avx2(&x1[2], &x1[13]); + btf_32_add_sub_avx2(&x1[3], &x1[12]); + btf_32_add_sub_avx2(&x1[4], &x1[11]); + btf_32_add_sub_avx2(&x1[5], &x1[10]); + btf_32_add_sub_avx2(&x1[6], &x1[9]); + btf_32_add_sub_avx2(&x1[7], &x1[8]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[47]); + btf_32_add_sub_avx2(&x1[33], &x1[46]); + btf_32_add_sub_avx2(&x1[34], &x1[45]); + btf_32_add_sub_avx2(&x1[35], &x1[44]); + btf_32_add_sub_avx2(&x1[36], &x1[43]); + btf_32_add_sub_avx2(&x1[37], &x1[42]); + btf_32_add_sub_avx2(&x1[38], &x1[41]); + btf_32_add_sub_avx2(&x1[39], &x1[40]); + btf_32_add_sub_avx2(&x1[63], &x1[48]); + btf_32_add_sub_avx2(&x1[62], &x1[49]); + btf_32_add_sub_avx2(&x1[61], &x1[50]); + btf_32_add_sub_avx2(&x1[60], &x1[51]); + btf_32_add_sub_avx2(&x1[59], &x1[52]); + btf_32_add_sub_avx2(&x1[58], &x1[53]); + btf_32_add_sub_avx2(&x1[57], &x1[54]); + btf_32_add_sub_avx2(&x1[56], &x1[55]); + + // stage 4 + btf_32_add_sub_avx2(&x1[0], &x1[7]); + btf_32_add_sub_avx2(&x1[1], &x1[6]); + btf_32_add_sub_avx2(&x1[2], &x1[5]); + btf_32_add_sub_avx2(&x1[3], &x1[4]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[23]); + btf_32_add_sub_avx2(&x1[17], &x1[22]); + btf_32_add_sub_avx2(&x1[18], &x1[21]); + btf_32_add_sub_avx2(&x1[19], &x1[20]); + btf_32_add_sub_avx2(&x1[31], &x1[24]); + btf_32_add_sub_avx2(&x1[30], &x1[25]); + btf_32_add_sub_avx2(&x1[29], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[27]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit); + + // stage 5 + btf_32_add_sub_avx2(&x1[0], &x1[3]); + 
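+ // btf_32_avx2_type0_new / btf_32_avx2_type1_new are assumed to be the two sign
+ // conventions of the 32-bit lane rotation (coefficients pre-broadcast with
+ // _mm256_set1_epi32), rounded by _r and shifted by cos_bit like the 16-bit helpers.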
btf_32_add_sub_avx2(&x1[1], &x1[2]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[11]); + btf_32_add_sub_avx2(&x1[9], &x1[10]); + btf_32_add_sub_avx2(&x1[15], &x1[12]); + btf_32_add_sub_avx2(&x1[14], &x1[13]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[39]); + btf_32_add_sub_avx2(&x1[33], &x1[38]); + btf_32_add_sub_avx2(&x1[34], &x1[37]); + btf_32_add_sub_avx2(&x1[35], &x1[36]); + btf_32_add_sub_avx2(&x1[47], &x1[40]); + btf_32_add_sub_avx2(&x1[46], &x1[41]); + btf_32_add_sub_avx2(&x1[45], &x1[42]); + btf_32_add_sub_avx2(&x1[44], &x1[43]); + btf_32_add_sub_avx2(&x1[48], &x1[55]); + btf_32_add_sub_avx2(&x1[49], &x1[54]); + btf_32_add_sub_avx2(&x1[50], &x1[53]); + btf_32_add_sub_avx2(&x1[51], &x1[52]); + btf_32_add_sub_avx2(&x1[63], &x1[56]); + btf_32_add_sub_avx2(&x1[62], &x1[57]); + btf_32_add_sub_avx2(&x1[61], &x1[58]); + btf_32_add_sub_avx2(&x1[60], &x1[59]); + + // stage 6 + btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit); + btf_32_add_sub_avx2(&x1[4], &x1[5]); + btf_32_add_sub_avx2(&x1[7], &x1[6]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[19]); + btf_32_add_sub_avx2(&x1[17], &x1[18]); + btf_32_add_sub_avx2(&x1[23], &x1[20]); + btf_32_add_sub_avx2(&x1[22], &x1[21]); + btf_32_add_sub_avx2(&x1[24], &x1[27]); + btf_32_add_sub_avx2(&x1[25], &x1[26]); + btf_32_add_sub_avx2(&x1[31], &x1[28]); + btf_32_add_sub_avx2(&x1[30], &x1[29]); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit); + + // stage 7 + btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[9]); + btf_32_add_sub_avx2(&x1[11], &x1[10]); + btf_32_add_sub_avx2(&x1[12], &x1[13]); + btf_32_add_sub_avx2(&x1[15], &x1[14]); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[35]); + btf_32_add_sub_avx2(&x1[33], &x1[34]); + btf_32_add_sub_avx2(&x1[39], &x1[36]); + btf_32_add_sub_avx2(&x1[38], &x1[37]); + btf_32_add_sub_avx2(&x1[40], &x1[43]); + btf_32_add_sub_avx2(&x1[41], &x1[42]); + btf_32_add_sub_avx2(&x1[47], &x1[44]); + btf_32_add_sub_avx2(&x1[46], 
&x1[45]); + btf_32_add_sub_avx2(&x1[48], &x1[51]); + btf_32_add_sub_avx2(&x1[49], &x1[50]); + btf_32_add_sub_avx2(&x1[55], &x1[52]); + btf_32_add_sub_avx2(&x1[54], &x1[53]); + btf_32_add_sub_avx2(&x1[56], &x1[59]); + btf_32_add_sub_avx2(&x1[57], &x1[58]); + btf_32_add_sub_avx2(&x1[63], &x1[60]); + btf_32_add_sub_avx2(&x1[62], &x1[61]); + + // stage 8 + btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[17]); + btf_32_add_sub_avx2(&x1[19], &x1[18]); + btf_32_add_sub_avx2(&x1[20], &x1[21]); + btf_32_add_sub_avx2(&x1[23], &x1[22]); + btf_32_add_sub_avx2(&x1[24], &x1[25]); + btf_32_add_sub_avx2(&x1[27], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[29]); + btf_32_add_sub_avx2(&x1[31], &x1[30]); + btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit); + + // stage 9 + btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[33]); + btf_32_add_sub_avx2(&x1[35], &x1[34]); + btf_32_add_sub_avx2(&x1[36], &x1[37]); + btf_32_add_sub_avx2(&x1[39], &x1[38]); + btf_32_add_sub_avx2(&x1[40], &x1[41]); + btf_32_add_sub_avx2(&x1[43], &x1[42]); + btf_32_add_sub_avx2(&x1[44], &x1[45]); + btf_32_add_sub_avx2(&x1[47], &x1[46]); + btf_32_add_sub_avx2(&x1[48], &x1[49]); + btf_32_add_sub_avx2(&x1[51], &x1[50]); + btf_32_add_sub_avx2(&x1[52], &x1[53]); + btf_32_add_sub_avx2(&x1[55], &x1[54]); + btf_32_add_sub_avx2(&x1[56], &x1[57]); + btf_32_add_sub_avx2(&x1[59], &x1[58]); + btf_32_add_sub_avx2(&x1[60], &x1[61]); + btf_32_add_sub_avx2(&x1[63], &x1[62]); + + // stage 10 + btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit); + 
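+ // Stage 10 finishes the sixteen odd rotations on x1[32..63]; after the
+ // bit-reversed stage-11 reorder these become the odd-indexed outputs 1, 3, ..., 63.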
btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit); + + // stage 11 + output[0] = x1[0]; + output[1] = x1[32]; + output[2] = x1[16]; + output[3] = x1[48]; + output[4] = x1[8]; + output[5] = x1[40]; + output[6] = x1[24]; + output[7] = x1[56]; + output[8] = x1[4]; + output[9] = x1[36]; + output[10] = x1[20]; + output[11] = x1[52]; + output[12] = x1[12]; + output[13] = x1[44]; + output[14] = x1[28]; + output[15] = x1[60]; + output[16] = x1[2]; + output[17] = x1[34]; + output[18] = x1[18]; + output[19] = x1[50]; + output[20] = x1[10]; + output[21] = x1[42]; + output[22] = x1[26]; + output[23] = x1[58]; + output[24] = x1[6]; + output[25] = x1[38]; + output[26] = x1[22]; + output[27] = x1[54]; + output[28] = x1[14]; + output[29] = x1[46]; + output[30] = x1[30]; + output[31] = x1[62]; + output[32] = x1[1]; + output[33] = x1[33]; + output[34] = x1[17]; + output[35] = x1[49]; + output[36] = x1[9]; + output[37] = x1[41]; + output[38] = x1[25]; + output[39] = x1[57]; + output[40] = x1[5]; + output[41] = x1[37]; + output[42] = x1[21]; + output[43] = x1[53]; + output[44] = x1[13]; + output[45] = x1[45]; + output[46] = x1[29]; + output[47] = x1[61]; + output[48] = x1[3]; + output[49] = x1[35]; + output[50] = x1[19]; + output[51] = x1[51]; + output[52] = x1[11]; + output[53] = x1[43]; + output[54] = x1[27]; + output[55] = x1[59]; + output[56] = x1[7]; + output[57] = x1[39]; + output[58] = x1[23]; + output[59] = x1[55]; + output[60] = x1[15]; + output[61] = x1[47]; + output[62] = x1[31]; + output[63] = x1[63]; +} + +static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __zero = _mm256_setzero_si256(); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); + __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); + __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); + __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); + __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); + __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); + __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); + __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); + __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); + __m256i cospi_p26_p38 = 
pair_set_w16_epi16(cospi[26], cospi[38]); + __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); + __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); + __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); + __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); + __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); + __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); + __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); + __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); + __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[0]; + x1[1] = _mm256_subs_epi16(__zero, input[15]); + x1[2] = _mm256_subs_epi16(__zero, input[7]); + x1[3] = input[8]; + x1[4] = _mm256_subs_epi16(__zero, input[3]); + x1[5] = input[12]; + x1[6] = input[4]; + x1[7] = _mm256_subs_epi16(__zero, input[11]); + x1[8] = _mm256_subs_epi16(__zero, input[1]); + x1[9] = input[14]; + x1[10] = input[6]; + x1[11] = _mm256_subs_epi16(__zero, input[9]); + x1[12] = input[2]; + x1[13] = _mm256_subs_epi16(__zero, input[13]); + x1[14] = _mm256_subs_epi16(__zero, input[5]); + x1[15] = input[10]; + + // stage 2 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[2]); + btf_16_adds_subs_avx2(&x1[1], &x1[3]); + btf_16_adds_subs_avx2(&x1[4], &x1[6]); + btf_16_adds_subs_avx2(&x1[5], &x1[7]); + btf_16_adds_subs_avx2(&x1[8], &x1[10]); + btf_16_adds_subs_avx2(&x1[9], &x1[11]); + btf_16_adds_subs_avx2(&x1[12], &x1[14]); + btf_16_adds_subs_avx2(&x1[13], &x1[15]); + + // stage 4 + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit); + + // stage 5 + btf_16_adds_subs_avx2(&x1[0], &x1[4]); + btf_16_adds_subs_avx2(&x1[1], &x1[5]); + btf_16_adds_subs_avx2(&x1[2], &x1[6]); + btf_16_adds_subs_avx2(&x1[3], &x1[7]); + btf_16_adds_subs_avx2(&x1[8], &x1[12]); + btf_16_adds_subs_avx2(&x1[9], &x1[13]); + btf_16_adds_subs_avx2(&x1[10], &x1[14]); + btf_16_adds_subs_avx2(&x1[11], &x1[15]); + + // stage 6 + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit); + + // stage 7 + btf_16_adds_subs_avx2(&x1[0], &x1[8]); + btf_16_adds_subs_avx2(&x1[1], &x1[9]); + btf_16_adds_subs_avx2(&x1[2], &x1[10]); + btf_16_adds_subs_avx2(&x1[3], &x1[11]); + btf_16_adds_subs_avx2(&x1[4], &x1[12]); + btf_16_adds_subs_avx2(&x1[5], &x1[13]); + btf_16_adds_subs_avx2(&x1[6], &x1[14]); + btf_16_adds_subs_avx2(&x1[7], &x1[15]); + + // stage 8 + btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit); + 
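+ // fadst16 lattice: the zero-minus-input loads in stage 1 implement the ADST's
+ // sign-flipped input permutation, and the stage-9 reorder below interleaves the
+ // results back into coefficient order.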
btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit); + + // stage 9 + output[0] = x1[1]; + output[1] = x1[14]; + output[2] = x1[3]; + output[3] = x1[12]; + output[4] = x1[5]; + output[5] = x1[10]; + output[6] = x1[7]; + output[7] = x1[8]; + output[8] = x1[9]; + output[9] = x1[6]; + output[10] = x1[11]; + output[11] = x1[4]; + output[12] = x1[13]; + output[13] = x1[2]; + output[14] = x1[15]; + output[15] = x1[0]; +} + +static INLINE void fidentity16x16_new_avx2(const __m256i *input, + __m256i *output, int8_t cos_bit) { + (void)cos_bit; + const __m256i one = _mm256_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m256i a_lo = _mm256_unpacklo_epi16(input[i], one); + const __m256i a_hi = _mm256_unpackhi_epi16(input[i], one); + const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2); + output[i] = _mm256_packs_epi32(b_lo, b_hi); + } +} + +static INLINE void fidentity16x32_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) { + output[i] = _mm256_slli_epi16(input[i], 2); + } +} + +static INLINE void transpose_32_8x8_avx2(int stride, const __m256i *inputA, + __m256i *output) { + __m256i temp0 = _mm256_unpacklo_epi32(inputA[0], inputA[2]); + __m256i temp1 = _mm256_unpackhi_epi32(inputA[0], inputA[2]); + __m256i temp2 = _mm256_unpacklo_epi32(inputA[1], inputA[3]); + __m256i temp3 = _mm256_unpackhi_epi32(inputA[1], inputA[3]); + __m256i temp4 = _mm256_unpacklo_epi32(inputA[4], inputA[6]); + __m256i temp5 = _mm256_unpackhi_epi32(inputA[4], inputA[6]); + __m256i temp6 = _mm256_unpacklo_epi32(inputA[5], inputA[7]); + __m256i temp7 = _mm256_unpackhi_epi32(inputA[5], inputA[7]); + + __m256i t0 = _mm256_unpacklo_epi32(temp0, temp2); + __m256i t1 = _mm256_unpackhi_epi32(temp0, temp2); + __m256i t2 = _mm256_unpacklo_epi32(temp1, temp3); + __m256i t3 = _mm256_unpackhi_epi32(temp1, temp3); + __m256i t4 = _mm256_unpacklo_epi32(temp4, temp6); + __m256i t5 = _mm256_unpackhi_epi32(temp4, temp6); + __m256i t6 = _mm256_unpacklo_epi32(temp5, temp7); + __m256i t7 = _mm256_unpackhi_epi32(temp5, temp7); + + output[0 * stride] = _mm256_permute2x128_si256(t0, t4, 0x20); + output[1 * stride] = _mm256_permute2x128_si256(t1, t5, 0x20); + output[2 * stride] = _mm256_permute2x128_si256(t2, t6, 0x20); + output[3 * stride] = _mm256_permute2x128_si256(t3, t7, 0x20); + output[4 * stride] = _mm256_permute2x128_si256(t0, t4, 0x31); + output[5 * stride] = _mm256_permute2x128_si256(t1, t5, 0x31); + output[6 * stride] = _mm256_permute2x128_si256(t2, t6, 0x31); + output[7 * stride] = _mm256_permute2x128_si256(t3, t7, 0x31); +} + +// Store 8 16 bit values. Sign extend the values. 
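+// (w16 variant: each of the out_size rows stores 16 values as two sign-extended
+// 8-lane halves; the store_rect_* helpers below additionally scale by NewSqrt2,
+// i.e. sqrt(2) in Q12, to renormalize rectangular transforms.)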
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, + int32_t *out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm256_store_si256((__m256i *)(out), + _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i]))); + _mm256_store_si256( + (__m256i *)(out + 8), + _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1))); + out += stride; + } +} + +static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a, + int32_t *const b) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_reoder = _mm256_permute4x64_epi64(a, 0xd8); + const __m256i a_lo = _mm256_unpacklo_epi16(a_reoder, one); + const __m256i a_hi = _mm256_unpackhi_epi16(a_reoder, one); + const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2); + _mm256_store_si256((__m256i *)b, b_lo); + _mm256_store_si256((__m256i *)(b + 8), b_hi); +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w16_avx2( + const __m256i *const in, int32_t *const out, const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_avx2(in[i], out + i * stride); + } +} + +typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, + int8_t cos_bit); + +static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = { + fdct16x32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity16x32_avx2, // IDTX + fdct16x32_avx2, // V_DCT + fidentity16x32_avx2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = { + fdct16x32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity16x32_avx2, // IDTX + fidentity16x32_avx2, // V_DCT + fdct16x32_avx2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = { + fdct16x16_new_avx2, // DCT_DCT + fadst16x16_new_avx2, // ADST_DCT + fdct16x16_new_avx2, // DCT_ADST + fadst16x16_new_avx2, // ADST_ADST + fadst16x16_new_avx2, // FLIPADST_DCT + fdct16x16_new_avx2, // DCT_FLIPADST + fadst16x16_new_avx2, // FLIPADST_FLIPADST + fadst16x16_new_avx2, // ADST_FLIPADST + fadst16x16_new_avx2, // FLIPADST_ADST + fidentity16x16_new_avx2, // IDTX + fdct16x16_new_avx2, // V_DCT + fidentity16x16_new_avx2, // H_DCT + fadst16x16_new_avx2, // V_ADST + fidentity16x16_new_avx2, // H_ADST + fadst16x16_new_avx2, // V_FLIPADST + fidentity16x16_new_avx2 // H_FLIPADST +}; + +static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = { + fdct16x16_new_avx2, // DCT_DCT + fdct16x16_new_avx2, // ADST_DCT + fadst16x16_new_avx2, // DCT_ADST + fadst16x16_new_avx2, // ADST_ADST + fdct16x16_new_avx2, // FLIPADST_DCT + fadst16x16_new_avx2, // DCT_FLIPADST + fadst16x16_new_avx2, // FLIPADST_FLIPADST + fadst16x16_new_avx2, // ADST_FLIPADST + fadst16x16_new_avx2, // FLIPADST_ADST + fidentity16x16_new_avx2, // IDTX + fidentity16x16_new_avx2, // V_DCT + fdct16x16_new_avx2, // H_DCT + fidentity16x16_new_avx2, // V_ADST + fadst16x16_new_avx2, // H_ADST + fidentity16x16_new_avx2, // V_FLIPADST + fadst16x16_new_avx2 // H_FLIPADST +}; + +static void 
lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_16X16; + __m256i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const int32_t i = 0; + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i); + + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + transpose_16bit_16x16_avx2(buf, buf); + store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, width, 16); +} + +static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_32X32; + __m256i buf0[32], buf1[128]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, + height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i); + transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i); + } + + for (int i = 0; i < 2; i++) { + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + transpose_16bit_16x16_avx2(buf, buf); + store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, width, + 16); + transpose_16bit_16x16_avx2(buf + 16, buf + 16); + store_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16 * width * i + 16, + width, 16); + } +} + +static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m256i buf0[64], 
buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(2, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[64]; + __m256i bufB[64]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + fdct64_new_avx2(bufA, bufA, cos_bit_row); + fdct64_new_avx2(bufB, bufB, cos_bit_row); + av1_round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]); + av1_round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 16 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m256i *out = (__m256i *)(output8 + 8 * j); + transpose_32_8x8_avx2(4, bufA + 8 * j, out); + transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4); + } + } +} + +static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_16X32; + __m256i buf0[32], buf1[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1); + transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16); + + for (int i = 0; i < 2; i++) { + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + transpose_16bit_16x16_avx2(buf, buf); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, + width, 16); + } +} + +static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m256i buf0[32], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16]; + const int txw_idx = get_txw_idx(TX_32X16); + const int txh_idx = get_txh_idx(TX_32X16); + const int cos_bit_col = 
av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 16; + const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, + height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i); + } + + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + transpose_16bit_16x16_avx2(buf, buf); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, width, 16); + + transpose_16bit_16x16_avx2(buf + 16, buf + 16); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16, width, 16); +} + +static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_64X32; + __m256i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + assert(tx_type == DCT_DCT); + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[64]; + __m256i bufB[64]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + fdct64_new_avx2(bufA, bufA, cos_bit_row); + fdct64_new_avx2(bufB, bufB, cos_bit_row); + av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2); + + int32_t *output8 = output + 16 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m256i *out = (__m256i *)(output8 + 8 * j); + transpose_32_8x8_avx2(4, bufA + 8 * j, out); + transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4); + } + } +} + +static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_32X64; + __m256i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = 
av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(2, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[32]; + __m256i bufB[32]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + fdct32_avx2(bufA, bufA, cos_bit_row); + fdct32_avx2(bufB, bufB, cos_bit_row); + av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2); + + int32_t *output8 = output + 16 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m256i *out = (__m256i *)(output8 + 8 * j); + transpose_32_8x8_avx2(4, bufA + 8 * j, out); + transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4); + } + } +} + +static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_16X64; + __m256i buf0[64], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const transform_1d_avx2 row_txfm = fdct16x16_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < height_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div16); i++) { + __m256i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + int32_t *output16 = output + 16 * width * i; + for (int j = 0; j < width_div16; ++j) { + __m256i *buf16 = buf + 16 * j; + transpose_16bit_16x16_avx2(buf16, buf16); + store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, width, 16); + } + } + // Zero out the bottom 16x32 area. 
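+ // (AV1 retains only the lowest 32 of the 64 column-frequency rows for + // 64-point transforms, so rows 32..63 of the 16-wide int32 output are cleared.)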
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); +} + +static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X16; + __m256i buf0[64], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x16_new_avx2; + const transform_1d_avx2 row_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < height_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < height_div16; i++) { + __m256i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + int32_t *output16 = output + 16 * 32 * i; + for (int j = 0; j < 2; ++j) { + __m256i *buf16 = buf + 16 * j; + transpose_16bit_16x16_avx2(buf16, buf16); + store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, 32, 16); + } + } +} + +static INLINE void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0, + __m256i *in1, __m128i *out0, __m128i *out1, + __m128i *out2, __m128i *out3, + const __m256i *__rounding, int8_t *cos_bit) { + __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1); + __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1); + __m256i u0 = _mm256_madd_epi16(t0, *w0); + __m256i u1 = _mm256_madd_epi16(t1, *w0); + __m256i v0 = _mm256_madd_epi16(t0, *w1); + __m256i v1 = _mm256_madd_epi16(t1, *w1); + + __m256i a0 = _mm256_add_epi32(u0, *__rounding); + __m256i a1 = _mm256_add_epi32(u1, *__rounding); + __m256i b0 = _mm256_add_epi32(v0, *__rounding); + __m256i b1 = _mm256_add_epi32(v1, *__rounding); + + __m256i c0 = _mm256_srai_epi32(a0, *cos_bit); + __m256i c1 = _mm256_srai_epi32(a1, *cos_bit); + __m256i d0 = _mm256_srai_epi32(b0, *cos_bit); + __m256i d1 = _mm256_srai_epi32(b1, *cos_bit); + + __m256i temp0 = _mm256_packs_epi32(c0, c1); + __m256i temp1 = _mm256_packs_epi32(d0, d1); + + *out0 = _mm256_castsi256_si128(temp0); + *out1 = _mm256_castsi256_si128(temp1); + *out2 = _mm256_extracti128_si256(temp0, 0x01); + *out3 = _mm256_extracti128_si256(temp1, 0x01); +} + +static INLINE void fdct8x8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], 
cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m256i x1[8]; + x1[0] = _mm256_adds_epi16(input[0], input[7]); + x1[7] = _mm256_subs_epi16(input[0], input[7]); + x1[1] = _mm256_adds_epi16(input[1], input[6]); + x1[6] = _mm256_subs_epi16(input[1], input[6]); + x1[2] = _mm256_adds_epi16(input[2], input[5]); + x1[5] = _mm256_subs_epi16(input[2], input[5]); + x1[3] = _mm256_adds_epi16(input[3], input[4]); + x1[4] = _mm256_subs_epi16(input[3], input[4]); + + // stage 2 + __m256i x2[8]; + x2[0] = _mm256_adds_epi16(x1[0], x1[3]); + x2[3] = _mm256_subs_epi16(x1[0], x1[3]); + x2[1] = _mm256_adds_epi16(x1[1], x1[2]); + x2[2] = _mm256_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding, + cos_bit); + x2[5] = x1[5]; + x2[6] = x1[6]; + x2[7] = x1[7]; + + // stage 3 + __m256i x3[8]; + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding, + cos_bit); + x3[0] = x2[0]; + x3[1] = x2[1]; + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding, + cos_bit); + x3[2] = x2[2]; + x3[3] = x2[3]; + x3[4] = _mm256_adds_epi16(x2[4], x2[5]); + x3[5] = _mm256_subs_epi16(x2[4], x2[5]); + x3[6] = _mm256_subs_epi16(x2[7], x2[6]); + x3[7] = _mm256_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m256i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding, + cos_bit); + x4[4] = x3[4]; + x4[7] = x3[7]; + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], __rounding, + cos_bit); + x4[5] = x3[5]; + x4[6] = x3[6]; + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __zero = _mm256_setzero_si256(); + const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); + __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); + __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); + __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); + __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); + __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); + __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); + __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); + __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m256i x1[8]; + x1[0] = input[0]; + x1[1] = _mm256_subs_epi16(__zero, input[7]); + x1[2] = _mm256_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm256_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm256_subs_epi16(__zero, input[5]); + + // stage 2 + __m256i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding, + cos_bit); + x2[2] = x1[2]; + x2[3] = x1[3]; + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_w16_avx2(cospi_p32_p32, 
cospi_p32_m32, &x1[6], &x1[7], __rounding, + cos_bit); + x2[6] = x1[6]; + x2[7] = x1[7]; + + // stage 3 + __m256i x3[8]; + x3[0] = _mm256_adds_epi16(x2[0], x2[2]); + x3[2] = _mm256_subs_epi16(x2[0], x2[2]); + x3[1] = _mm256_adds_epi16(x2[1], x2[3]); + x3[3] = _mm256_subs_epi16(x2[1], x2[3]); + x3[4] = _mm256_adds_epi16(x2[4], x2[6]); + x3[6] = _mm256_subs_epi16(x2[4], x2[6]); + x3[5] = _mm256_adds_epi16(x2[5], x2[7]); + x3[7] = _mm256_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m256i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding, + cos_bit); + x4[4] = x3[4]; + x4[5] = x3[5]; + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding, + cos_bit); + x4[6] = x3[6]; + x4[7] = x3[7]; + + // stage 5 + __m256i x5[8]; + x5[0] = _mm256_adds_epi16(x4[0], x4[4]); + x5[4] = _mm256_subs_epi16(x4[0], x4[4]); + x5[1] = _mm256_adds_epi16(x4[1], x4[5]); + x5[5] = _mm256_subs_epi16(x4[1], x4[5]); + x5[2] = _mm256_adds_epi16(x4[2], x4[6]); + x5[6] = _mm256_subs_epi16(x4[2], x4[6]); + x5[3] = _mm256_adds_epi16(x4[3], x4[7]); + x5[7] = _mm256_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m256i x6[8]; + btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding, + cos_bit); + x6[0] = x5[0]; + x6[1] = x5[1]; + btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding, + cos_bit); + x6[2] = x5[2]; + x6[3] = x5[3]; + btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding, + cos_bit); + x6[4] = x5[4]; + x6[5] = x5[5]; + btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding, + cos_bit); + x6[6] = x5[6]; + x6[7] = x5[7]; + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +static INLINE void fidentity8x8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + + output[0] = _mm256_adds_epi16(input[0], input[0]); + output[1] = _mm256_adds_epi16(input[1], input[1]); + output[2] = _mm256_adds_epi16(input[2], input[2]); + output[3] = _mm256_adds_epi16(input[3], input[3]); + output[4] = _mm256_adds_epi16(input[4], input[4]); + output[5] = _mm256_adds_epi16(input[5], input[5]); + output[6] = _mm256_adds_epi16(input[6], input[6]); + output[7] = _mm256_adds_epi16(input[7], input[7]); +} + +static INLINE void fdct8x16_new_avx2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1)); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + __m128i temp0, temp1, temp2, temp3; + __m256i in0, in1; + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + 
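// The cospi_arr[] entries built below stack two different 128-bit weight + // pairs in one 256-bit register, letting a single btf_16_avx2 call apply two + // distinct butterflies at once (one per 128-bit lane). +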
__m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + + __m256i cospi_arr[12]; + + cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32), + cospi_m32_p32, 0x1); + cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p32_p32, 0x1); + cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p48_p16, 0x1); + cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), + cospi_m16_p48, 0x1); + cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48), + cospi_m48_m16, 0x1); + cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16), + cospi_m16_p48, 0x1); + cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08), + cospi_p24_p40, 0x1); + cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56), + cospi_m40_p24, 0x1); + cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04), + cospi_p28_p36, 0x1); + cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60), + cospi_m36_p28, 0x1); + cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20), + cospi_p12_p52, 0x1); + cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44), + cospi_m52_p12, 0x1); + + __m256i x[8]; + x[0] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1); + x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14], + 0x1); + x[2] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1); + x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12], + 0x1); + x[4] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1); + x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11], + 0x1); + x[6] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1); + x[7] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1); + + // stage 1 + __m256i x1[8]; + x1[0] = _mm256_adds_epi16(x[0], x[1]); + x1[7] = _mm256_subs_epi16(x[0], x[1]); + x1[1] = _mm256_adds_epi16(x[2], x[3]); + x1[6] = _mm256_subs_epi16(x[2], x[3]); + x1[2] = _mm256_adds_epi16(x[4], x[5]); + x1[5] = _mm256_subs_epi16(x[4], x[5]); + x1[3] = _mm256_adds_epi16(x[6], x[7]); + x1[4] = _mm256_subs_epi16(x[6], x[7]); + + // stage 2 + __m256i x2[8]; + x2[0] = _mm256_adds_epi16(x1[0], x1[3]); + x2[7] = _mm256_subs_epi16(x1[0], x1[3]); + x2[1] = _mm256_adds_epi16(x1[1], x1[2]); + x2[6] = _mm256_subs_epi16(x1[1], x1[2]); + x2[2] = x1[4]; + x2[3] = x1[7]; + btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &x1[5], &x1[6], &temp0, &temp1, + &temp2, &temp3, &__rounding_256, &cos_bit); + x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1); + x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1); + + // stage 3 + __m256i x3[8]; + x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e); + x3[0] = _mm256_adds_epi16(x2[0], x2[1]); + x3[1] = _mm256_subs_epi16(x2[0], x2[1]); + x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]), + 
_mm256_extractf128_si256(x2[7], 0x01), temp0, temp1); + x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1); + x3[3] = _mm256_adds_epi16(x2[2], x2[4]); + x3[4] = _mm256_subs_epi16(x2[2], x2[4]); + x3[5] = _mm256_adds_epi16(x2[3], x2[5]); + x3[6] = _mm256_subs_epi16(x2[3], x2[5]); + + // stage 4 + __m256i x4[8]; + x4[0] = _mm256_blend_epi32(x3[0], x3[1], 0xf0); + x4[1] = _mm256_permute2f128_si256(x3[0], x3[1], 0x21); + btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &x4[0], &x4[1], &output[0], + &output[8], &output[4], &output[12], &__rounding_256, &cos_bit); + x4[2] = _mm256_adds_epi16(x3[2], x3[7]); + x4[3] = _mm256_subs_epi16(x3[2], x3[7]); + x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20); + x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20); + in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31); + in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31); + btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + + x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1); + x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1); + + // stage 5 + __m256i x5[4]; + in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31); + in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20); + btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &output[2], &output[14], + &output[10], &output[6], &__rounding_256, &cos_bit); + x5[0] = _mm256_adds_epi16(x4[4], x4[6]); + x5[1] = _mm256_subs_epi16(x4[4], x4[6]); + x5[2] = _mm256_adds_epi16(x4[5], x4[7]); + x5[3] = _mm256_subs_epi16(x4[5], x4[7]); + + // stage 6 + in0 = _mm256_permute2f128_si256(x5[0], x5[1], 0x20); + in1 = _mm256_permute2f128_si256(x5[2], x5[3], 0x31); + btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &output[1], &output[15], + &output[9], &output[7], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x5[1], x5[0], 0x31); + in1 = _mm256_permute2f128_si256(x5[3], x5[2], 0x20); + btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &output[5], + &output[11], &output[13], &output[3], &__rounding_256, &cos_bit); +} + +static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __zero = _mm256_setzero_si256(); + const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1)); + __m256i in0, in1; + __m128i temp0, temp1, temp2, temp3; + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + 
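// As in fdct8x16_new_avx2 above, pairs of 8-lane rows are packed into one + // 256-bit register (see the x[] loads below), halving the butterfly count. +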
__m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + + __m256i cospi_arr[20]; + + cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p32_p32, 0x1); + cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), + cospi_p32_m32, 0x1); + cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p32_p32, 0x1); + cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), + cospi_p32_m32, 0x1); + cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48), + cospi_m48_p16, 0x1); + cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16), + cospi_p16_p48, 0x1); + cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48), + cospi_m48_p16, 0x1); + cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16), + cospi_p16_p48, 0x1); + cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56), + cospi_p40_p24, 0x1); + cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08), + cospi_p24_m40, 0x1); + cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08), + cospi_m24_p40, 0x1); + cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56), + cospi_p40_p24, 0x1); + cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62), + cospi_p10_p54, 0x1); + cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02), + cospi_p54_m10, 0x1); + cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46), + cospi_p26_p38, 0x1); + cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18), + cospi_p38_m26, 0x1); + cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30), + cospi_p42_p22, 0x1); + cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34), + cospi_p22_m42, 0x1); + cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14), + cospi_p58_p06, 0x1); + cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50), + cospi_p06_m58, 0x1); + + __m256i x[8]; + x[0] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1); + x[1] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1); + x[2] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1); + x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14], + 0x1); + x[4] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1); + x[5] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1); + x[6] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1); + x[7] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1); + + // stage 1 + __m256i x1[8]; + x1[0] = x[0]; + x1[1] = _mm256_subs_epi16(__zero, x[7]); + x1[2] = x[2]; + 
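// (These _mm256_subs_epi16(__zero, v) ops are saturating negations: stage 1 + // applies the ADST input permutation together with its sign flips.) +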
x1[3] = _mm256_subs_epi16(__zero, x[5]); + x1[4] = _mm256_subs_epi16(__zero, x[4]); + x1[5] = x[3]; + x1[6] = _mm256_subs_epi16(__zero, x[6]); + x1[7] = x[1]; + + // stage 2 + __m256i x2[8]; + x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0); + x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0); + x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0); + x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0); + in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0); + in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0); + btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21); + in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21); + btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + + // stage 3 + __m256i x3[8]; + x3[0] = _mm256_adds_epi16(x2[0], x2[1]); + x3[1] = _mm256_subs_epi16(x2[0], x2[1]); + x3[2] = _mm256_adds_epi16(x2[3], x2[2]); + x3[3] = _mm256_subs_epi16(x2[3], x2[2]); + x3[4] = _mm256_adds_epi16(x2[4], x2[5]); + x3[5] = _mm256_subs_epi16(x2[4], x2[5]); + x3[6] = _mm256_adds_epi16(x2[7], x2[6]); + x3[7] = _mm256_subs_epi16(x2[7], x2[6]); + + // stage 4 + __m256i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[4] = x3[4]; + x4[5] = x3[5]; + in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20); + in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31); + btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20); + in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31); + btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + + // stage 5 + __m256i x5[8]; + x5[0] = _mm256_adds_epi16(x4[0], x4[2]); + x5[1] = _mm256_subs_epi16(x4[0], x4[2]); + x5[2] = _mm256_adds_epi16(x4[1], x4[3]); + x5[3] = _mm256_subs_epi16(x4[1], x4[3]); + x5[4] = _mm256_adds_epi16(x4[4], x4[6]); + x5[5] = _mm256_subs_epi16(x4[4], x4[6]); + x5[6] = _mm256_adds_epi16(x4[5], x4[7]); + x5[7] = _mm256_subs_epi16(x4[5], x4[7]); + + // stage 6 + __m256i x6[8]; + x6[0] = x5[0]; + x6[1] = x5[2]; + x6[2] = x5[1]; + x6[3] = x5[3]; + in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20); + in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31); + btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20); + in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31); + btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &temp0, &temp1, + &temp2, &temp3, &__rounding_256, &cos_bit); + x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 
0x1); + x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + + // stage 7 + __m256i x7[8]; + x7[0] = _mm256_adds_epi16(x6[0], x6[4]); + x7[1] = _mm256_subs_epi16(x6[0], x6[4]); + x7[2] = _mm256_adds_epi16(x6[1], x6[5]); + x7[3] = _mm256_subs_epi16(x6[1], x6[5]); + x7[4] = _mm256_adds_epi16(x6[2], x6[6]); + x7[5] = _mm256_subs_epi16(x6[2], x6[6]); + x7[6] = _mm256_adds_epi16(x6[3], x6[7]); + x7[7] = _mm256_subs_epi16(x6[3], x6[7]); + + // stage 8 + in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20); + in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31); + btf_16_avx2(&cospi_arr[12], &cospi_arr[13], &in0, &in1, &output[15], + &output[0], &output[13], &output[2], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20); + in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31); + btf_16_avx2(&cospi_arr[14], &cospi_arr[15], &in0, &in1, &output[11], + &output[4], &output[9], &output[6], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20); + in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31); + btf_16_avx2(&cospi_arr[16], &cospi_arr[17], &in0, &in1, &output[7], + &output[8], &output[5], &output[10], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20); + in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31); + btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3], + &output[12], &output[1], &output[14], &__rounding_256, &cos_bit); +} + +static INLINE void fidentity8x16_new_avx2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const __m256i one = _mm256_set1_epi16(1); + __m256i temp; + for (int i = 0; i < 16; i += 2) { + temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]), + input[i + 1], 0x1); + const __m256i a_lo = _mm256_unpacklo_epi16(temp, one); + const __m256i a_hi = _mm256_unpackhi_epi16(temp, one); + const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2); + temp = _mm256_packs_epi32(b_lo, b_hi); + output[i] = _mm256_castsi256_si128(temp); + output[i + 1] = _mm256_extractf128_si256(temp, 0x1); + } +} + +static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = { + fdct8x8_new_avx2, // DCT_DCT + fdct8x8_new_avx2, // ADST_DCT + fadst8x8_new_avx2, // DCT_ADST + fadst8x8_new_avx2, // ADST_ADST + fdct8x8_new_avx2, // FLIPADST_DCT + fadst8x8_new_avx2, // DCT_FLIPADST + fadst8x8_new_avx2, // FLIPADST_FLIPADST + fadst8x8_new_avx2, // ADST_FLIPADST + fadst8x8_new_avx2, // FLIPADST_ADST + fidentity8x8_new_avx2, // IDTX + fidentity8x8_new_avx2, // V_DCT + fdct8x8_new_avx2, // H_DCT + fidentity8x8_new_avx2, // V_ADST + fadst8x8_new_avx2, // H_ADST + fidentity8x8_new_avx2, // V_FLIPADST + fadst8x8_new_avx2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_avx2, // DCT_DCT + fadst8x16_new_avx2, // ADST_DCT + fdct8x16_new_avx2, // DCT_ADST + fadst8x16_new_avx2, // ADST_ADST + fadst8x16_new_avx2, // FLIPADST_DCT + fdct8x16_new_avx2, // DCT_FLIPADST + fadst8x16_new_avx2, // FLIPADST_FLIPADST + fadst8x16_new_avx2, // ADST_FLIPADST + fadst8x16_new_avx2, // FLIPADST_ADST + fidentity8x16_new_avx2, // IDTX + fdct8x16_new_avx2, // V_DCT + fidentity8x16_new_avx2, // H_DCT + fadst8x16_new_avx2, // V_ADST + fidentity8x16_new_avx2, // H_ADST + fadst8x16_new_avx2, // V_FLIPADST + fidentity8x16_new_avx2 // H_FLIPADST +}; + +static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = { + fdct8x8_new_avx2, // DCT_DCT + fadst8x8_new_avx2, // ADST_DCT + 
fdct8x8_new_avx2, // DCT_ADST + fadst8x8_new_avx2, // ADST_ADST + fadst8x8_new_avx2, // FLIPADST_DCT + fdct8x8_new_avx2, // DCT_FLIPADST + fadst8x8_new_avx2, // FLIPADST_FLIPADST + fadst8x8_new_avx2, // ADST_FLIPADST + fadst8x8_new_avx2, // FLIPADST_ADST + fidentity8x8_new_avx2, // IDTX + fdct8x8_new_avx2, // V_DCT + fidentity8x8_new_avx2, // H_DCT + fadst8x8_new_avx2, // V_ADST + fidentity8x8_new_avx2, // H_ADST + fadst8x8_new_avx2, // V_FLIPADST + fidentity8x8_new_avx2, // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = { + fdct8x16_new_avx2, // DCT_DCT + fdct8x16_new_avx2, // ADST_DCT + fadst8x16_new_avx2, // DCT_ADST + fadst8x16_new_avx2, // ADST_ADST + fdct8x16_new_avx2, // FLIPADST_DCT + fadst8x16_new_avx2, // DCT_FLIPADST + fadst8x16_new_avx2, // FLIPADST_FLIPADST + fadst8x16_new_avx2, // ADST_FLIPADST + fadst8x16_new_avx2, // FLIPADST_ADST + fidentity8x16_new_avx2, // IDTX + fidentity8x16_new_avx2, // V_DCT + fdct8x16_new_avx2, // H_DCT + fidentity8x16_new_avx2, // V_ADST + fadst8x16_new_avx2, // H_ADST + fidentity8x16_new_avx2, // V_FLIPADST + fadst8x16_new_avx2 // H_FLIPADST +}; + +static void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + __m256i buf2[8]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + + __m128i *bufl, *bufu; + if (lr_flip) { + bufl = buf0; + bufu = buf0 + 8; + flip_buf_sse2(buf1 + width * 0, bufl, width); + flip_buf_sse2(buf1 + width * 1, bufu, width); + } else { + bufl = buf1 + width * 0; + bufu = buf1 + width * 1; + } + pack_reg(bufl, bufu, buf2); + row_txfm(buf2, buf2, cos_bit_row); + round_shift_16bit_w16_avx2(buf2, width, shift[2]); + transpose_16bit_16x8_avx2(buf2, buf2); + store_rect_buffer_16bit_to_32bit_w8_avx2(buf2, output, width, 8); +} + +static void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + __m256i buf2[8]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 8; + const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height); + load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], 
height); + } else { + load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height); + load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height); + } + pack_reg(buf0, &buf0[8], buf2); + round_shift_16bit_w16_avx2(buf2, height, shift[0]); + col_txfm(buf2, buf2, cos_bit_col); + round_shift_16bit_w16_avx2(buf2, height, shift[1]); + transpose_16bit_16x8_avx2(buf2, buf2); + extract_reg(buf2, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height); +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + lowbd_fwd_txfm2d_16x16_avx2, // 16x16 transform + lowbd_fwd_txfm2d_32x32_avx2, // 32x32 transform + lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + lowbd_fwd_txfm2d_8x16_avx2, // 8x16 transform + lowbd_fwd_txfm2d_16x8_avx2, // 16x8 transform + lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform + lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform + lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform + lowbd_fwd_txfm2d_64x32_avx2, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + lowbd_fwd_txfm2d_16x64_avx2, // 16x64 transform + lowbd_fwd_txfm2d_64x16_avx2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + if ((fwd_txfm2d_func == NULL) || + (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) { + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + } else { + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); + } +} diff --git a/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_sse4.c new file mode 100644 index 000000000..0bc3fbc2d --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_sse4.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_txfm1d_sse4.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" + +static INLINE void int16_array_with_stride_to_int32_array_without_stride( + const int16_t *input, int stride, int32_t *output, int txfm1d_size) { + int r, c; + for (r = 0; r < txfm1d_size; r++) { + for (c = 0; c < txfm1d_size; c++) { + output[r * txfm1d_size + c] = (int32_t)input[r * stride + c]; + } + } +} + +typedef void (*TxfmFuncSSE2)(__m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); + +static void fdct32_sse4_1(__m128i *input, __m128i *output, const int8_t cos_bit, + const int8_t *stage_range) { + const int txfm_size = 32; + const int num_per_128 = 4; + int col_num = txfm_size / num_per_128; + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + av1_fdct32_sse4_1((input + col), (output + col), cos_bit, col_num); + } +} + +static void fdct64_new_sse4_1(__m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 64; + const int num_per_128 = 4; + int col_num = txfm_size / num_per_128; + (void)stage_range; + for (int col = 0; col < col_num; col++) { + av1_fdct64_sse4_1((input + col), (output + col), cos_bit, col_num, col_num); + } +} +static void idtx32x32_sse4_1(__m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + (void)stage_range; + + for (int i = 0; i < 8; i++) { + av1_idtx32_sse4_1(&input[i * 32], &output[i * 32], cos_bit, 1); + } +} + +static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT32: return fdct32_sse4_1; break; + case TXFM_TYPE_DCT64: return fdct64_new_sse4_1; break; + case TXFM_TYPE_IDENTITY32: return idtx32x32_sse4_1; break; + default: assert(0); + } + return NULL; +} + +static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, + const int stride, + const TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf) { + // TODO(sarahparker) This does not currently support rectangular transforms + // and will break without splitting txfm_size out into row and col size. + // Rectangular transforms use c code only, so it should be ok for now. + // It will be corrected when there are sse implementations for rectangular + // transforms. 
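+ // The assert below reflects this: TX_SIZES counts only the square sizes, + // the rectangular ones lying beyond it in TX_SIZES_ALL.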
+ assert(cfg->tx_size < TX_SIZES); + const int txfm_size = tx_size_wide[cfg->tx_size]; + const int8_t *shift = cfg->shift; + const int8_t *stage_range_col = cfg->stage_range_col; + const int8_t *stage_range_row = cfg->stage_range_row; + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); + + __m128i *buf_128 = (__m128i *)txfm_buf; + __m128i *out_128 = (__m128i *)output; + int num_per_128 = 4; + int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; + + int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf, + txfm_size); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]); + txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); + transpose_32(txfm_size, out_128, buf_128); + txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row); + av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]); + transpose_32(txfm_size, buf_128, out_128); +} + +static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input, + int32_t *output, const int stride, + const TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf) { + assert(cfg->tx_size < TX_SIZES); + const int txfm_size = tx_size_wide[cfg->tx_size]; + const int8_t *shift = cfg->shift; + const int8_t *stage_range_col = cfg->stage_range_col; + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + __m128i *buf_128 = (__m128i *)txfm_buf; + __m128i *out_128 = (__m128i *)output; + + const int num_per_128 = 4; + int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; + int col_num = txfm_size / num_per_128; + + int16_array_with_stride_to_int32_array_without_stride(input, stride, output, + txfm_size); + /*col wise transform*/ + txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); + transpose_32(txfm_size, out_128, buf_128); + + /*row wise transform*/ + for (int col = 0; col < (col_num >> 1); col++) { + av1_fdct64_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row, col_num, + (col_num >> 1)); + } + + txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1); + av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]); + transpose_8nx8n(buf_128, out_128, 32, 32); +} + +void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg); + (void)bd; + fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); +} + +void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); + (void)bd; + fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf); +} + +static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA, + const __m128i *inputB, __m128i *output) { + __m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]); + __m128i temp1 = _mm_unpackhi_epi32(inputA[0], inputA[2]); + __m128i temp2 = _mm_unpacklo_epi32(inputA[1], inputA[3]); + 
__m128i temp3 = _mm_unpackhi_epi32(inputA[1], inputA[3]); + + output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2); + output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2); + output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3); + output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3); + + temp0 = _mm_unpacklo_epi32(inputB[0], inputB[2]); + temp1 = _mm_unpackhi_epi32(inputB[0], inputB[2]); + temp2 = _mm_unpacklo_epi32(inputB[1], inputB[3]); + temp3 = _mm_unpackhi_epi32(inputB[1], inputB[3]); + + output[4 * stride] = _mm_unpacklo_epi32(temp0, temp2); + output[5 * stride] = _mm_unpackhi_epi32(temp0, temp2); + output[6 * stride] = _mm_unpacklo_epi32(temp1, temp3); + output[7 * stride] = _mm_unpackhi_epi32(temp1, temp3); +} + +static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m128i buf0[64], buf1[512]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[64]; + __m128i bufB[64]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1); + av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1); + av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]); + av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < width_div8; ++j) { + __m128i *out = (__m128i *)(output8 + 4 * j); + transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out); + } + } +} + +static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_64X32; + __m128i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + 
transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + assert(tx_type == DCT_DCT); + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[64]; + __m128i bufB[64]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1); + av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); + + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < width_div8; ++j) { + __m128i *out = (__m128i *)(output8 + 4 * j); + transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out); + } + } +} + +static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_32X64; + __m128i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[32]; + __m128i bufB[32]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct32_sse4_1(bufA, bufA, cos_bit_row, 1); + av1_fdct32_sse4_1(bufB, bufB, cos_bit_row, 1); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); + + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < (32 / 4); ++j) { + __m128i *out = (__m128i *)(output8 + 4 * j); + transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out); + } + } +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform + av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform + lowbd_fwd_txfm2d_64x64_sse4_1, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform + av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform + av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform + lowbd_fwd_txfm2d_32x64_sse4_1, // 32x64 transform + lowbd_fwd_txfm2d_64x32_sse4_1, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 
8x32 transform
+  av1_lowbd_fwd_txfm2d_32x8_sse2,   // 32x8 transform
+  av1_lowbd_fwd_txfm2d_16x64_sse2,  // 16x64 transform
+  av1_lowbd_fwd_txfm2d_64x16_sse2,  // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff,
+                               int diff_stride, TxfmParam *txfm_param) {
+  FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
+  // Fall back to C for lossless 4x4 (Walsh-Hadamard) and for any size
+  // without a SIMD kernel in the table above.
+  if ((fwd_txfm2d_func == NULL) ||
+      (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) {
+    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+  } else {
+    fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+                    txfm_param->bd);
+  }
+}
diff --git a/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_avx2.h b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_avx2.h
new file mode 100644
index 000000000..aaad76e5a
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_avx2.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#include <immintrin.h>
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+static INLINE void btf_32_avx2_type0(const int32_t w0, const int32_t w1,
+                                     __m256i *in0, __m256i *in1,
+                                     const __m256i _r, const int32_t cos_bit) {
+  __m256i _in0 = *in0;
+  __m256i _in1 = *in1;
+  const __m256i ww0 = _mm256_set1_epi32(w0);
+  const __m256i ww1 = _mm256_set1_epi32(w1);
+  const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+  const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+  __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+  temp0 = _mm256_add_epi32(temp0, _r);
+  *in0 = _mm256_srai_epi32(temp0, cos_bit);
+  const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+  const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+  __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0);
+  temp1 = _mm256_add_epi32(temp1, _r);
+  *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+static INLINE void btf_32_avx2_type1(const int32_t w0, const int32_t w1,
+                                     __m256i *in0, __m256i *in1,
+                                     const __m256i _r, const int32_t cos_bit) {
+  __m256i _in0 = *in0;
+  __m256i _in1 = *in1;
+  const __m256i ww0 = _mm256_set1_epi32(w0);
+  const __m256i ww1 = _mm256_set1_epi32(w1);
+  const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+  const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+  __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+  temp0 = _mm256_add_epi32(temp0, _r);
+  *in0 = _mm256_srai_epi32(temp0, cos_bit);
+  const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+  const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+  __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1);
+  temp1 = _mm256_add_epi32(temp1, _r);
+  *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+static INLINE void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1,
+                                         __m256i *in0, __m256i *in1,
+                                         const __m256i _r,
+                                         const int32_t cos_bit) {
+  __m256i _in0 = *in0;
+  __m256i _in1 = *in1;
+  const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+  const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
__m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); + temp0 = _mm256_add_epi32(temp0, _r); + *in0 = _mm256_srai_epi32(temp0, cos_bit); + const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1, + __m256i *in0, __m256i *in1, + const __m256i _r, + const int32_t cos_bit) { + __m256i _in0 = *in0; + __m256i _in1 = *in1; + const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); + const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); + __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); + temp0 = _mm256_add_epi32(temp0, _r); + *in0 = _mm256_srai_epi32(temp0, cos_bit); + const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_ diff --git a/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.c b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.c new file mode 100644 index 000000000..694e6131c --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.c @@ -0,0 +1,2891 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" + +// TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible). 
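+//
+// All of the 16-bit kernels below share one butterfly idiom: a pair of
+// cosine weights is packed into adjacent 16-bit lanes with pair_set_epi16(),
+// the two inputs are interleaved with _mm_unpacklo_epi16(), and
+// _mm_madd_epi16() then yields in0 * w0 + in1 * w1 in each 32-bit lane;
+// adding the 1 << (cos_bit - 1) rounding term and shifting right by cos_bit
+// finishes one rotation output, as fdct4x4_new_sse2() below shows.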
+ +static void fdct4x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + __m128i u[4], v[4]; + + u[0] = _mm_unpacklo_epi16(input[0], input[1]); + u[1] = _mm_unpacklo_epi16(input[3], input[2]); + + v[0] = _mm_add_epi16(u[0], u[1]); + v[1] = _mm_sub_epi16(u[0], u[1]); + + u[0] = _mm_madd_epi16(v[0], cospi_p32_p32); // 0 + u[1] = _mm_madd_epi16(v[0], cospi_p32_m32); // 2 + u[2] = _mm_madd_epi16(v[1], cospi_p16_p48); // 1 + u[3] = _mm_madd_epi16(v[1], cospi_p48_m16); // 3 + + v[0] = _mm_add_epi32(u[0], __rounding); + v[1] = _mm_add_epi32(u[1], __rounding); + v[2] = _mm_add_epi32(u[2], __rounding); + v[3] = _mm_add_epi32(u[3], __rounding); + u[0] = _mm_srai_epi32(v[0], cos_bit); + u[1] = _mm_srai_epi32(v[1], cos_bit); + u[2] = _mm_srai_epi32(v[2], cos_bit); + u[3] = _mm_srai_epi32(v[3], cos_bit); + + output[0] = _mm_packs_epi32(u[0], u[1]); + output[1] = _mm_packs_epi32(u[2], u[3]); + output[2] = _mm_srli_si128(output[0], 8); + output[3] = _mm_srli_si128(output[1], 8); +} + +static void fdct8x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + + // stage 1 + __m128i x1[4]; + x1[0] = _mm_adds_epi16(input[0], input[3]); + x1[3] = _mm_subs_epi16(input[0], input[3]); + x1[1] = _mm_adds_epi16(input[1], input[2]); + x1[2] = _mm_subs_epi16(input[1], input[2]); + + // stage 2 + __m128i x2[4]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x2[0], x2[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x1[2], x1[3], x2[2], x2[3]); + + // stage 3 + output[0] = x2[0]; + output[1] = x2[2]; + output[2] = x2[1]; + output[3] = x2[3]; +} + +static void fdct4x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m128i x1[8]; + x1[0] = _mm_adds_epi16(input[0], input[7]); + x1[7] = _mm_subs_epi16(input[0], input[7]); + x1[1] = _mm_adds_epi16(input[1], input[6]); + x1[6] = _mm_subs_epi16(input[1], input[6]); + x1[2] = _mm_adds_epi16(input[2], input[5]); + x1[5] = _mm_subs_epi16(input[2], input[5]); + x1[3] = _mm_adds_epi16(input[3], input[4]); + x1[4] = _mm_subs_epi16(input[3], input[4]); + + // stage 2 + 
__m128i x2[8]; + x2[0] = _mm_adds_epi16(x1[0], x1[3]); + x2[3] = _mm_subs_epi16(x1[0], x1[3]); + x2[1] = _mm_adds_epi16(x1[1], x1[2]); + x2[2] = _mm_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_w4_sse2(&cospi_m32_p32, &cospi_p32_p32, __rounding, cos_bit, &x1[5], + &x1[6], &x2[5], &x2[6]); + x2[7] = x1[7]; + + // stage 3 + __m128i x3[8]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x2[0], + &x2[1], &x3[0], &x3[1]); + btf_16_w4_sse2(&cospi_p48_p16, &cospi_m16_p48, __rounding, cos_bit, &x2[2], + &x2[3], &x3[2], &x3[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[5]); + x3[5] = _mm_subs_epi16(x2[4], x2[5]); + x3[6] = _mm_subs_epi16(x2[7], x2[6]); + x3[7] = _mm_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w4_sse2(&cospi_p56_p08, &cospi_m08_p56, __rounding, cos_bit, &x3[4], + &x3[7], &x4[4], &x4[7]); + btf_16_w4_sse2(&cospi_p24_p40, &cospi_m40_p24, __rounding, cos_bit, &x3[5], + &x3[6], &x4[5], &x4[6]); + + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +static void fdct8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m128i x1[8]; + x1[0] = _mm_adds_epi16(input[0], input[7]); + x1[7] = _mm_subs_epi16(input[0], input[7]); + x1[1] = _mm_adds_epi16(input[1], input[6]); + x1[6] = _mm_subs_epi16(input[1], input[6]); + x1[2] = _mm_adds_epi16(input[2], input[5]); + x1[5] = _mm_subs_epi16(input[2], input[5]); + x1[3] = _mm_adds_epi16(input[3], input[4]); + x1[4] = _mm_subs_epi16(input[3], input[4]); + + // stage 2 + __m128i x2[8]; + x2[0] = _mm_adds_epi16(x1[0], x1[3]); + x2[3] = _mm_subs_epi16(x1[0], x1[3]); + x2[1] = _mm_adds_epi16(x1[1], x1[2]); + x2[2] = _mm_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]); + x2[7] = x1[7]; + + // stage 3 + __m128i x3[8]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[5]); + x3[5] = _mm_subs_epi16(x2[4], x2[5]); + x3[6] = _mm_subs_epi16(x2[7], x2[6]); + x3[7] = _mm_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], x4[4], x4[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], x4[5], x4[6]); + + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +static void fdct8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { 
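+  // 16-point DCT: stages 1-6 alternate add/sub butterflies with cosine
+  // rotations (btf_16_sse2), and stage 7 gathers the bit-reversed
+  // intermediates (x6[0], x6[8], x6[4], x6[12], ...) back into natural
+  // frequency order.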
+ const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + + // stage 1 + __m128i x1[16]; + x1[0] = _mm_adds_epi16(input[0], input[15]); + x1[15] = _mm_subs_epi16(input[0], input[15]); + x1[1] = _mm_adds_epi16(input[1], input[14]); + x1[14] = _mm_subs_epi16(input[1], input[14]); + x1[2] = _mm_adds_epi16(input[2], input[13]); + x1[13] = _mm_subs_epi16(input[2], input[13]); + x1[3] = _mm_adds_epi16(input[3], input[12]); + x1[12] = _mm_subs_epi16(input[3], input[12]); + x1[4] = _mm_adds_epi16(input[4], input[11]); + x1[11] = _mm_subs_epi16(input[4], input[11]); + x1[5] = _mm_adds_epi16(input[5], input[10]); + x1[10] = _mm_subs_epi16(input[5], input[10]); + x1[6] = _mm_adds_epi16(input[6], input[9]); + x1[9] = _mm_subs_epi16(input[6], input[9]); + x1[7] = _mm_adds_epi16(input[7], input[8]); + x1[8] = _mm_subs_epi16(input[7], input[8]); + + // stage 2 + __m128i x2[16]; + x2[0] = _mm_adds_epi16(x1[0], x1[7]); + x2[7] = _mm_subs_epi16(x1[0], x1[7]); + x2[1] = _mm_adds_epi16(x1[1], x1[6]); + x2[6] = _mm_subs_epi16(x1[1], x1[6]); + x2[2] = _mm_adds_epi16(x1[2], x1[5]); + x2[5] = _mm_subs_epi16(x1[2], x1[5]); + x2[3] = _mm_adds_epi16(x1[3], x1[4]); + x2[4] = _mm_subs_epi16(x1[3], x1[4]); + x2[8] = x1[8]; + x2[9] = x1[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[10], x1[13], x2[10], x2[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[11], x1[12], x2[11], x2[12]); + x2[14] = x1[14]; + x2[15] = x1[15]; + + // stage 3 + __m128i x3[16]; + x3[0] = _mm_adds_epi16(x2[0], x2[3]); + x3[3] = _mm_subs_epi16(x2[0], x2[3]); + x3[1] = _mm_adds_epi16(x2[1], x2[2]); + x3[2] = _mm_subs_epi16(x2[1], x2[2]); + x3[4] = x2[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[5], x2[6], x3[5], x3[6]); + x3[7] = x2[7]; + x3[8] = _mm_adds_epi16(x2[8], x2[11]); + x3[11] = _mm_subs_epi16(x2[8], x2[11]); + x3[9] = _mm_adds_epi16(x2[9], x2[10]); + x3[10] = _mm_subs_epi16(x2[9], x2[10]); + x3[12] = _mm_subs_epi16(x2[15], x2[12]); + x3[15] = _mm_adds_epi16(x2[15], x2[12]); + x3[13] = _mm_subs_epi16(x2[14], x2[13]); + x3[14] = _mm_adds_epi16(x2[14], x2[13]); + + // stage 4 + __m128i x4[16]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x3[0], x3[1], x4[0], x4[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x3[2], x3[3], x4[2], x4[3]); + x4[4] = _mm_adds_epi16(x3[4], x3[5]); + x4[5] = _mm_subs_epi16(x3[4], x3[5]); + x4[6] = 
_mm_subs_epi16(x3[7], x3[6]); + x4[7] = _mm_adds_epi16(x3[7], x3[6]); + x4[8] = x3[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[9], x3[14], x4[9], x4[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[10], x3[13], x4[10], x4[13]); + x4[11] = x3[11]; + x4[12] = x3[12]; + x4[15] = x3[15]; + + // stage 5 + __m128i x5[16]; + x5[0] = x4[0]; + x5[1] = x4[1]; + x5[2] = x4[2]; + x5[3] = x4[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x4[4], x4[7], x5[4], x5[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x4[5], x4[6], x5[5], x5[6]); + x5[8] = _mm_adds_epi16(x4[8], x4[9]); + x5[9] = _mm_subs_epi16(x4[8], x4[9]); + x5[10] = _mm_subs_epi16(x4[11], x4[10]); + x5[11] = _mm_adds_epi16(x4[11], x4[10]); + x5[12] = _mm_adds_epi16(x4[12], x4[13]); + x5[13] = _mm_subs_epi16(x4[12], x4[13]); + x5[14] = _mm_subs_epi16(x4[15], x4[14]); + x5[15] = _mm_adds_epi16(x4[15], x4[14]); + + // stage 6 + __m128i x6[16]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + x6[4] = x5[4]; + x6[5] = x5[5]; + x6[6] = x5[6]; + x6[7] = x5[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x5[8], x5[15], x6[8], x6[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x5[9], x5[14], x6[9], x6[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x5[10], x5[13], x6[10], x6[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x5[11], x5[12], x6[11], x6[12]); + + // stage 7 + output[0] = x6[0]; + output[1] = x6[8]; + output[2] = x6[4]; + output[3] = x6[12]; + output[4] = x6[2]; + output[5] = x6[10]; + output[6] = x6[6]; + output[7] = x6[14]; + output[8] = x6[1]; + output[9] = x6[9]; + output[10] = x6[5]; + output[11] = x6[13]; + output[12] = x6[3]; + output[13] = x6[11]; + output[14] = x6[7]; + output[15] = x6[15]; +} + +void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]); + __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]); + __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]); + __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]); + __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]); + __m128i cospi_m18_p46 
= pair_set_epi16(-cospi[18], cospi[46]); + __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]); + __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]); + __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]); + __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]); + __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]); + __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]); + __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]); + __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]); + __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]); + __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]); + + // stage 1 + __m128i x1[32]; + x1[0] = _mm_adds_epi16(input[0], input[31]); + x1[31] = _mm_subs_epi16(input[0], input[31]); + x1[1] = _mm_adds_epi16(input[1], input[30]); + x1[30] = _mm_subs_epi16(input[1], input[30]); + x1[2] = _mm_adds_epi16(input[2], input[29]); + x1[29] = _mm_subs_epi16(input[2], input[29]); + x1[3] = _mm_adds_epi16(input[3], input[28]); + x1[28] = _mm_subs_epi16(input[3], input[28]); + x1[4] = _mm_adds_epi16(input[4], input[27]); + x1[27] = _mm_subs_epi16(input[4], input[27]); + x1[5] = _mm_adds_epi16(input[5], input[26]); + x1[26] = _mm_subs_epi16(input[5], input[26]); + x1[6] = _mm_adds_epi16(input[6], input[25]); + x1[25] = _mm_subs_epi16(input[6], input[25]); + x1[7] = _mm_adds_epi16(input[7], input[24]); + x1[24] = _mm_subs_epi16(input[7], input[24]); + x1[8] = _mm_adds_epi16(input[8], input[23]); + x1[23] = _mm_subs_epi16(input[8], input[23]); + x1[9] = _mm_adds_epi16(input[9], input[22]); + x1[22] = _mm_subs_epi16(input[9], input[22]); + x1[10] = _mm_adds_epi16(input[10], input[21]); + x1[21] = _mm_subs_epi16(input[10], input[21]); + x1[11] = _mm_adds_epi16(input[11], input[20]); + x1[20] = _mm_subs_epi16(input[11], input[20]); + x1[12] = _mm_adds_epi16(input[12], input[19]); + x1[19] = _mm_subs_epi16(input[12], input[19]); + x1[13] = _mm_adds_epi16(input[13], input[18]); + x1[18] = _mm_subs_epi16(input[13], input[18]); + x1[14] = _mm_adds_epi16(input[14], input[17]); + x1[17] = _mm_subs_epi16(input[14], input[17]); + x1[15] = _mm_adds_epi16(input[15], input[16]); + x1[16] = _mm_subs_epi16(input[15], input[16]); + + // stage 2 + __m128i x2[32]; + x2[0] = _mm_adds_epi16(x1[0], x1[15]); + x2[15] = _mm_subs_epi16(x1[0], x1[15]); + x2[1] = _mm_adds_epi16(x1[1], x1[14]); + x2[14] = _mm_subs_epi16(x1[1], x1[14]); + x2[2] = _mm_adds_epi16(x1[2], x1[13]); + x2[13] = _mm_subs_epi16(x1[2], x1[13]); + x2[3] = _mm_adds_epi16(x1[3], x1[12]); + x2[12] = _mm_subs_epi16(x1[3], x1[12]); + x2[4] = _mm_adds_epi16(x1[4], x1[11]); + x2[11] = _mm_subs_epi16(x1[4], x1[11]); + x2[5] = _mm_adds_epi16(x1[5], x1[10]); + x2[10] = _mm_subs_epi16(x1[5], x1[10]); + x2[6] = _mm_adds_epi16(x1[6], x1[9]); + x2[9] = _mm_subs_epi16(x1[6], x1[9]); + x2[7] = _mm_adds_epi16(x1[7], x1[8]); + x2[8] = _mm_subs_epi16(x1[7], x1[8]); + x2[16] = x1[16]; + x2[17] = x1[17]; + x2[18] = x1[18]; + x2[19] = x1[19]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[20], x1[27], x2[20], x2[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[21], x1[26], x2[21], x2[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[22], x1[25], x2[22], x2[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[23], x1[24], x2[23], x2[24]); + x2[28] = x1[28]; + x2[29] = x1[29]; + x2[30] = x1[30]; + x2[31] = x1[31]; + + // stage 3 + __m128i x3[32]; + x3[0] = _mm_adds_epi16(x2[0], x2[7]); + x3[7] = _mm_subs_epi16(x2[0], x2[7]); + x3[1] = _mm_adds_epi16(x2[1], 
x2[6]); + x3[6] = _mm_subs_epi16(x2[1], x2[6]); + x3[2] = _mm_adds_epi16(x2[2], x2[5]); + x3[5] = _mm_subs_epi16(x2[2], x2[5]); + x3[3] = _mm_adds_epi16(x2[3], x2[4]); + x3[4] = _mm_subs_epi16(x2[3], x2[4]); + x3[8] = x2[8]; + x3[9] = x2[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[10], x2[13], x3[10], x3[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[11], x2[12], x3[11], x3[12]); + x3[14] = x2[14]; + x3[15] = x2[15]; + x3[16] = _mm_adds_epi16(x2[16], x2[23]); + x3[23] = _mm_subs_epi16(x2[16], x2[23]); + x3[17] = _mm_adds_epi16(x2[17], x2[22]); + x3[22] = _mm_subs_epi16(x2[17], x2[22]); + x3[18] = _mm_adds_epi16(x2[18], x2[21]); + x3[21] = _mm_subs_epi16(x2[18], x2[21]); + x3[19] = _mm_adds_epi16(x2[19], x2[20]); + x3[20] = _mm_subs_epi16(x2[19], x2[20]); + x3[24] = _mm_subs_epi16(x2[31], x2[24]); + x3[31] = _mm_adds_epi16(x2[31], x2[24]); + x3[25] = _mm_subs_epi16(x2[30], x2[25]); + x3[30] = _mm_adds_epi16(x2[30], x2[25]); + x3[26] = _mm_subs_epi16(x2[29], x2[26]); + x3[29] = _mm_adds_epi16(x2[29], x2[26]); + x3[27] = _mm_subs_epi16(x2[28], x2[27]); + x3[28] = _mm_adds_epi16(x2[28], x2[27]); + + // stage 4 + __m128i x4[32]; + x4[0] = _mm_adds_epi16(x3[0], x3[3]); + x4[3] = _mm_subs_epi16(x3[0], x3[3]); + x4[1] = _mm_adds_epi16(x3[1], x3[2]); + x4[2] = _mm_subs_epi16(x3[1], x3[2]); + x4[4] = x3[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]); + x4[7] = x3[7]; + x4[8] = _mm_adds_epi16(x3[8], x3[11]); + x4[11] = _mm_subs_epi16(x3[8], x3[11]); + x4[9] = _mm_adds_epi16(x3[9], x3[10]); + x4[10] = _mm_subs_epi16(x3[9], x3[10]); + x4[12] = _mm_subs_epi16(x3[15], x3[12]); + x4[15] = _mm_adds_epi16(x3[15], x3[12]); + x4[13] = _mm_subs_epi16(x3[14], x3[13]); + x4[14] = _mm_adds_epi16(x3[14], x3[13]); + x4[16] = x3[16]; + x4[17] = x3[17]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[18], x3[29], x4[18], x4[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[19], x3[28], x4[19], x4[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[20], x3[27], x4[20], x4[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[21], x3[26], x4[21], x4[26]); + x4[22] = x3[22]; + x4[23] = x3[23]; + x4[24] = x3[24]; + x4[25] = x3[25]; + x4[30] = x3[30]; + x4[31] = x3[31]; + + // stage 5 + __m128i x5[32]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x4[0], x4[1], x5[0], x5[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x4[2], x4[3], x5[2], x5[3]); + x5[4] = _mm_adds_epi16(x4[4], x4[5]); + x5[5] = _mm_subs_epi16(x4[4], x4[5]); + x5[6] = _mm_subs_epi16(x4[7], x4[6]); + x5[7] = _mm_adds_epi16(x4[7], x4[6]); + x5[8] = x4[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[9], x4[14], x5[9], x5[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[10], x4[13], x5[10], x5[13]); + x5[11] = x4[11]; + x5[12] = x4[12]; + x5[15] = x4[15]; + x5[16] = _mm_adds_epi16(x4[16], x4[19]); + x5[19] = _mm_subs_epi16(x4[16], x4[19]); + x5[17] = _mm_adds_epi16(x4[17], x4[18]); + x5[18] = _mm_subs_epi16(x4[17], x4[18]); + x5[20] = _mm_subs_epi16(x4[23], x4[20]); + x5[23] = _mm_adds_epi16(x4[23], x4[20]); + x5[21] = _mm_subs_epi16(x4[22], x4[21]); + x5[22] = _mm_adds_epi16(x4[22], x4[21]); + x5[24] = _mm_adds_epi16(x4[24], x4[27]); + x5[27] = _mm_subs_epi16(x4[24], x4[27]); + x5[25] = _mm_adds_epi16(x4[25], x4[26]); + x5[26] = _mm_subs_epi16(x4[25], x4[26]); + x5[28] = _mm_subs_epi16(x4[31], x4[28]); + x5[31] = _mm_adds_epi16(x4[31], x4[28]); + x5[29] = _mm_subs_epi16(x4[30], x4[29]); + x5[30] = _mm_adds_epi16(x4[30], x4[29]); + + // stage 6 + __m128i x6[32]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = 
x5[2]; + x6[3] = x5[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x5[4], x5[7], x6[4], x6[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x5[5], x5[6], x6[5], x6[6]); + x6[8] = _mm_adds_epi16(x5[8], x5[9]); + x6[9] = _mm_subs_epi16(x5[8], x5[9]); + x6[10] = _mm_subs_epi16(x5[11], x5[10]); + x6[11] = _mm_adds_epi16(x5[11], x5[10]); + x6[12] = _mm_adds_epi16(x5[12], x5[13]); + x6[13] = _mm_subs_epi16(x5[12], x5[13]); + x6[14] = _mm_subs_epi16(x5[15], x5[14]); + x6[15] = _mm_adds_epi16(x5[15], x5[14]); + x6[16] = x5[16]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[17], x5[30], x6[17], x6[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[18], x5[29], x6[18], x6[29]); + x6[19] = x5[19]; + x6[20] = x5[20]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[21], x5[26], x6[21], x6[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[22], x5[25], x6[22], x6[25]); + x6[23] = x5[23]; + x6[24] = x5[24]; + x6[27] = x5[27]; + x6[28] = x5[28]; + x6[31] = x5[31]; + + // stage 7 + __m128i x7[32]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + x7[4] = x6[4]; + x7[5] = x6[5]; + x7[6] = x6[6]; + x7[7] = x6[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x6[8], x6[15], x7[8], x7[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x6[9], x6[14], x7[9], x7[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x6[10], x6[13], x7[10], x7[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x6[11], x6[12], x7[11], x7[12]); + x7[16] = _mm_adds_epi16(x6[16], x6[17]); + x7[17] = _mm_subs_epi16(x6[16], x6[17]); + x7[18] = _mm_subs_epi16(x6[19], x6[18]); + x7[19] = _mm_adds_epi16(x6[19], x6[18]); + x7[20] = _mm_adds_epi16(x6[20], x6[21]); + x7[21] = _mm_subs_epi16(x6[20], x6[21]); + x7[22] = _mm_subs_epi16(x6[23], x6[22]); + x7[23] = _mm_adds_epi16(x6[23], x6[22]); + x7[24] = _mm_adds_epi16(x6[24], x6[25]); + x7[25] = _mm_subs_epi16(x6[24], x6[25]); + x7[26] = _mm_subs_epi16(x6[27], x6[26]); + x7[27] = _mm_adds_epi16(x6[27], x6[26]); + x7[28] = _mm_adds_epi16(x6[28], x6[29]); + x7[29] = _mm_subs_epi16(x6[28], x6[29]); + x7[30] = _mm_subs_epi16(x6[31], x6[30]); + x7[31] = _mm_adds_epi16(x6[31], x6[30]); + + // stage 8 + __m128i x8[32]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + x8[8] = x7[8]; + x8[9] = x7[9]; + x8[10] = x7[10]; + x8[11] = x7[11]; + x8[12] = x7[12]; + x8[13] = x7[13]; + x8[14] = x7[14]; + x8[15] = x7[15]; + btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x7[16], x7[31], x8[16], x8[31]); + btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x7[17], x7[30], x8[17], x8[30]); + btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x7[18], x7[29], x8[18], x8[29]); + btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x7[19], x7[28], x8[19], x8[28]); + btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x7[20], x7[27], x8[20], x8[27]); + btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x7[21], x7[26], x8[21], x8[26]); + btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x7[22], x7[25], x8[22], x8[25]); + btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x7[23], x7[24], x8[23], x8[24]); + + // stage 9 + output[0] = x8[0]; + output[1] = x8[16]; + output[2] = x8[8]; + output[3] = x8[24]; + output[4] = x8[4]; + output[5] = x8[20]; + output[6] = x8[12]; + output[7] = x8[28]; + output[8] = x8[2]; + output[9] = x8[18]; + output[10] = x8[10]; + output[11] = x8[26]; + output[12] = x8[6]; + output[13] = x8[22]; + output[14] = x8[14]; + output[15] = x8[30]; + output[16] = x8[1]; + output[17] = x8[17]; + output[18] = x8[9]; + output[19] = x8[25]; + output[20] = x8[5]; + output[21] 
= x8[21]; + output[22] = x8[13]; + output[23] = x8[29]; + output[24] = x8[3]; + output[25] = x8[19]; + output[26] = x8[11]; + output[27] = x8[27]; + output[28] = x8[7]; + output[29] = x8[23]; + output[30] = x8[15]; + output[31] = x8[31]; +} + +void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]); + __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); + __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]); + __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); + __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]); + __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]); + __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]); + __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]); + __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]); + __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]); + __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]); + __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]); + __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]); + __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]); + __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]); + __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]); + __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]); + __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]); + __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]); + __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]); + __m128i cospi_p63_p01 = pair_set_epi16(cospi[63], cospi[1]); + __m128i cospi_m01_p63 = pair_set_epi16(-cospi[1], cospi[63]); + __m128i cospi_p31_p33 = pair_set_epi16(cospi[31], cospi[33]); + __m128i cospi_m33_p31 = pair_set_epi16(-cospi[33], cospi[31]); + __m128i cospi_p47_p17 = pair_set_epi16(cospi[47], cospi[17]); + __m128i cospi_m17_p47 = pair_set_epi16(-cospi[17], cospi[47]); + __m128i cospi_p15_p49 = pair_set_epi16(cospi[15], cospi[49]); + __m128i cospi_m49_p15 = pair_set_epi16(-cospi[49], cospi[15]); + __m128i 
cospi_p55_p09 = pair_set_epi16(cospi[55], cospi[9]); + __m128i cospi_m09_p55 = pair_set_epi16(-cospi[9], cospi[55]); + __m128i cospi_p23_p41 = pair_set_epi16(cospi[23], cospi[41]); + __m128i cospi_m41_p23 = pair_set_epi16(-cospi[41], cospi[23]); + __m128i cospi_p39_p25 = pair_set_epi16(cospi[39], cospi[25]); + __m128i cospi_m25_p39 = pair_set_epi16(-cospi[25], cospi[39]); + __m128i cospi_p07_p57 = pair_set_epi16(cospi[7], cospi[57]); + __m128i cospi_m57_p07 = pair_set_epi16(-cospi[57], cospi[7]); + __m128i cospi_p59_p05 = pair_set_epi16(cospi[59], cospi[5]); + __m128i cospi_m05_p59 = pair_set_epi16(-cospi[5], cospi[59]); + __m128i cospi_p27_p37 = pair_set_epi16(cospi[27], cospi[37]); + __m128i cospi_m37_p27 = pair_set_epi16(-cospi[37], cospi[27]); + __m128i cospi_p43_p21 = pair_set_epi16(cospi[43], cospi[21]); + __m128i cospi_m21_p43 = pair_set_epi16(-cospi[21], cospi[43]); + __m128i cospi_p11_p53 = pair_set_epi16(cospi[11], cospi[53]); + __m128i cospi_m53_p11 = pair_set_epi16(-cospi[53], cospi[11]); + __m128i cospi_p51_p13 = pair_set_epi16(cospi[51], cospi[13]); + __m128i cospi_m13_p51 = pair_set_epi16(-cospi[13], cospi[51]); + __m128i cospi_p19_p45 = pair_set_epi16(cospi[19], cospi[45]); + __m128i cospi_m45_p19 = pair_set_epi16(-cospi[45], cospi[19]); + __m128i cospi_p35_p29 = pair_set_epi16(cospi[35], cospi[29]); + __m128i cospi_m29_p35 = pair_set_epi16(-cospi[29], cospi[35]); + __m128i cospi_p03_p61 = pair_set_epi16(cospi[3], cospi[61]); + __m128i cospi_m61_p03 = pair_set_epi16(-cospi[61], cospi[3]); + + // stage 1 + __m128i x1[64]; + x1[0] = _mm_adds_epi16(input[0], input[63]); + x1[63] = _mm_subs_epi16(input[0], input[63]); + x1[1] = _mm_adds_epi16(input[1], input[62]); + x1[62] = _mm_subs_epi16(input[1], input[62]); + x1[2] = _mm_adds_epi16(input[2], input[61]); + x1[61] = _mm_subs_epi16(input[2], input[61]); + x1[3] = _mm_adds_epi16(input[3], input[60]); + x1[60] = _mm_subs_epi16(input[3], input[60]); + x1[4] = _mm_adds_epi16(input[4], input[59]); + x1[59] = _mm_subs_epi16(input[4], input[59]); + x1[5] = _mm_adds_epi16(input[5], input[58]); + x1[58] = _mm_subs_epi16(input[5], input[58]); + x1[6] = _mm_adds_epi16(input[6], input[57]); + x1[57] = _mm_subs_epi16(input[6], input[57]); + x1[7] = _mm_adds_epi16(input[7], input[56]); + x1[56] = _mm_subs_epi16(input[7], input[56]); + x1[8] = _mm_adds_epi16(input[8], input[55]); + x1[55] = _mm_subs_epi16(input[8], input[55]); + x1[9] = _mm_adds_epi16(input[9], input[54]); + x1[54] = _mm_subs_epi16(input[9], input[54]); + x1[10] = _mm_adds_epi16(input[10], input[53]); + x1[53] = _mm_subs_epi16(input[10], input[53]); + x1[11] = _mm_adds_epi16(input[11], input[52]); + x1[52] = _mm_subs_epi16(input[11], input[52]); + x1[12] = _mm_adds_epi16(input[12], input[51]); + x1[51] = _mm_subs_epi16(input[12], input[51]); + x1[13] = _mm_adds_epi16(input[13], input[50]); + x1[50] = _mm_subs_epi16(input[13], input[50]); + x1[14] = _mm_adds_epi16(input[14], input[49]); + x1[49] = _mm_subs_epi16(input[14], input[49]); + x1[15] = _mm_adds_epi16(input[15], input[48]); + x1[48] = _mm_subs_epi16(input[15], input[48]); + x1[16] = _mm_adds_epi16(input[16], input[47]); + x1[47] = _mm_subs_epi16(input[16], input[47]); + x1[17] = _mm_adds_epi16(input[17], input[46]); + x1[46] = _mm_subs_epi16(input[17], input[46]); + x1[18] = _mm_adds_epi16(input[18], input[45]); + x1[45] = _mm_subs_epi16(input[18], input[45]); + x1[19] = _mm_adds_epi16(input[19], input[44]); + x1[44] = _mm_subs_epi16(input[19], input[44]); + x1[20] = _mm_adds_epi16(input[20], input[43]); + x1[43] = 
_mm_subs_epi16(input[20], input[43]); + x1[21] = _mm_adds_epi16(input[21], input[42]); + x1[42] = _mm_subs_epi16(input[21], input[42]); + x1[22] = _mm_adds_epi16(input[22], input[41]); + x1[41] = _mm_subs_epi16(input[22], input[41]); + x1[23] = _mm_adds_epi16(input[23], input[40]); + x1[40] = _mm_subs_epi16(input[23], input[40]); + x1[24] = _mm_adds_epi16(input[24], input[39]); + x1[39] = _mm_subs_epi16(input[24], input[39]); + x1[25] = _mm_adds_epi16(input[25], input[38]); + x1[38] = _mm_subs_epi16(input[25], input[38]); + x1[26] = _mm_adds_epi16(input[26], input[37]); + x1[37] = _mm_subs_epi16(input[26], input[37]); + x1[27] = _mm_adds_epi16(input[27], input[36]); + x1[36] = _mm_subs_epi16(input[27], input[36]); + x1[28] = _mm_adds_epi16(input[28], input[35]); + x1[35] = _mm_subs_epi16(input[28], input[35]); + x1[29] = _mm_adds_epi16(input[29], input[34]); + x1[34] = _mm_subs_epi16(input[29], input[34]); + x1[30] = _mm_adds_epi16(input[30], input[33]); + x1[33] = _mm_subs_epi16(input[30], input[33]); + x1[31] = _mm_adds_epi16(input[31], input[32]); + x1[32] = _mm_subs_epi16(input[31], input[32]); + + // stage 2 + __m128i x2[64]; + x2[0] = _mm_adds_epi16(x1[0], x1[31]); + x2[31] = _mm_subs_epi16(x1[0], x1[31]); + x2[1] = _mm_adds_epi16(x1[1], x1[30]); + x2[30] = _mm_subs_epi16(x1[1], x1[30]); + x2[2] = _mm_adds_epi16(x1[2], x1[29]); + x2[29] = _mm_subs_epi16(x1[2], x1[29]); + x2[3] = _mm_adds_epi16(x1[3], x1[28]); + x2[28] = _mm_subs_epi16(x1[3], x1[28]); + x2[4] = _mm_adds_epi16(x1[4], x1[27]); + x2[27] = _mm_subs_epi16(x1[4], x1[27]); + x2[5] = _mm_adds_epi16(x1[5], x1[26]); + x2[26] = _mm_subs_epi16(x1[5], x1[26]); + x2[6] = _mm_adds_epi16(x1[6], x1[25]); + x2[25] = _mm_subs_epi16(x1[6], x1[25]); + x2[7] = _mm_adds_epi16(x1[7], x1[24]); + x2[24] = _mm_subs_epi16(x1[7], x1[24]); + x2[8] = _mm_adds_epi16(x1[8], x1[23]); + x2[23] = _mm_subs_epi16(x1[8], x1[23]); + x2[9] = _mm_adds_epi16(x1[9], x1[22]); + x2[22] = _mm_subs_epi16(x1[9], x1[22]); + x2[10] = _mm_adds_epi16(x1[10], x1[21]); + x2[21] = _mm_subs_epi16(x1[10], x1[21]); + x2[11] = _mm_adds_epi16(x1[11], x1[20]); + x2[20] = _mm_subs_epi16(x1[11], x1[20]); + x2[12] = _mm_adds_epi16(x1[12], x1[19]); + x2[19] = _mm_subs_epi16(x1[12], x1[19]); + x2[13] = _mm_adds_epi16(x1[13], x1[18]); + x2[18] = _mm_subs_epi16(x1[13], x1[18]); + x2[14] = _mm_adds_epi16(x1[14], x1[17]); + x2[17] = _mm_subs_epi16(x1[14], x1[17]); + x2[15] = _mm_adds_epi16(x1[15], x1[16]); + x2[16] = _mm_subs_epi16(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[40], x1[55], x2[40], x2[55]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[41], x1[54], x2[41], x2[54]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[42], x1[53], x2[42], x2[53]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[43], x1[52], x2[43], x2[52]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[44], x1[51], x2[44], x2[51]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[45], x1[50], x2[45], x2[50]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[46], x1[49], x2[46], x2[49]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[47], x1[48], x2[47], x2[48]); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; + + // stage 3 + __m128i x3[64]; + x3[0] = _mm_adds_epi16(x2[0], x2[15]); + x3[15] = _mm_subs_epi16(x2[0], x2[15]); + x3[1] = 
_mm_adds_epi16(x2[1], x2[14]); + x3[14] = _mm_subs_epi16(x2[1], x2[14]); + x3[2] = _mm_adds_epi16(x2[2], x2[13]); + x3[13] = _mm_subs_epi16(x2[2], x2[13]); + x3[3] = _mm_adds_epi16(x2[3], x2[12]); + x3[12] = _mm_subs_epi16(x2[3], x2[12]); + x3[4] = _mm_adds_epi16(x2[4], x2[11]); + x3[11] = _mm_subs_epi16(x2[4], x2[11]); + x3[5] = _mm_adds_epi16(x2[5], x2[10]); + x3[10] = _mm_subs_epi16(x2[5], x2[10]); + x3[6] = _mm_adds_epi16(x2[6], x2[9]); + x3[9] = _mm_subs_epi16(x2[6], x2[9]); + x3[7] = _mm_adds_epi16(x2[7], x2[8]); + x3[8] = _mm_subs_epi16(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[20], x2[27], x3[20], x3[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[21], x2[26], x3[21], x3[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[22], x2[25], x3[22], x3[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[23], x2[24], x3[23], x3[24]); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm_adds_epi16(x2[32], x2[47]); + x3[47] = _mm_subs_epi16(x2[32], x2[47]); + x3[33] = _mm_adds_epi16(x2[33], x2[46]); + x3[46] = _mm_subs_epi16(x2[33], x2[46]); + x3[34] = _mm_adds_epi16(x2[34], x2[45]); + x3[45] = _mm_subs_epi16(x2[34], x2[45]); + x3[35] = _mm_adds_epi16(x2[35], x2[44]); + x3[44] = _mm_subs_epi16(x2[35], x2[44]); + x3[36] = _mm_adds_epi16(x2[36], x2[43]); + x3[43] = _mm_subs_epi16(x2[36], x2[43]); + x3[37] = _mm_adds_epi16(x2[37], x2[42]); + x3[42] = _mm_subs_epi16(x2[37], x2[42]); + x3[38] = _mm_adds_epi16(x2[38], x2[41]); + x3[41] = _mm_subs_epi16(x2[38], x2[41]); + x3[39] = _mm_adds_epi16(x2[39], x2[40]); + x3[40] = _mm_subs_epi16(x2[39], x2[40]); + x3[48] = _mm_subs_epi16(x2[63], x2[48]); + x3[63] = _mm_adds_epi16(x2[63], x2[48]); + x3[49] = _mm_subs_epi16(x2[62], x2[49]); + x3[62] = _mm_adds_epi16(x2[62], x2[49]); + x3[50] = _mm_subs_epi16(x2[61], x2[50]); + x3[61] = _mm_adds_epi16(x2[61], x2[50]); + x3[51] = _mm_subs_epi16(x2[60], x2[51]); + x3[60] = _mm_adds_epi16(x2[60], x2[51]); + x3[52] = _mm_subs_epi16(x2[59], x2[52]); + x3[59] = _mm_adds_epi16(x2[59], x2[52]); + x3[53] = _mm_subs_epi16(x2[58], x2[53]); + x3[58] = _mm_adds_epi16(x2[58], x2[53]); + x3[54] = _mm_subs_epi16(x2[57], x2[54]); + x3[57] = _mm_adds_epi16(x2[57], x2[54]); + x3[55] = _mm_subs_epi16(x2[56], x2[55]); + x3[56] = _mm_adds_epi16(x2[56], x2[55]); + + // stage 4 + __m128i x4[64]; + x4[0] = _mm_adds_epi16(x3[0], x3[7]); + x4[7] = _mm_subs_epi16(x3[0], x3[7]); + x4[1] = _mm_adds_epi16(x3[1], x3[6]); + x4[6] = _mm_subs_epi16(x3[1], x3[6]); + x4[2] = _mm_adds_epi16(x3[2], x3[5]); + x4[5] = _mm_subs_epi16(x3[2], x3[5]); + x4[3] = _mm_adds_epi16(x3[3], x3[4]); + x4[4] = _mm_subs_epi16(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[10], x3[13], x4[10], x4[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[11], x3[12], x4[11], x4[12]); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm_adds_epi16(x3[16], x3[23]); + x4[23] = _mm_subs_epi16(x3[16], x3[23]); + x4[17] = _mm_adds_epi16(x3[17], x3[22]); + x4[22] = _mm_subs_epi16(x3[17], x3[22]); + x4[18] = _mm_adds_epi16(x3[18], x3[21]); + x4[21] = _mm_subs_epi16(x3[18], x3[21]); + x4[19] = _mm_adds_epi16(x3[19], x3[20]); + x4[20] = _mm_subs_epi16(x3[19], x3[20]); + x4[24] = _mm_subs_epi16(x3[31], x3[24]); + x4[31] = _mm_adds_epi16(x3[31], x3[24]); + x4[25] = _mm_subs_epi16(x3[30], x3[25]); + x4[30] = _mm_adds_epi16(x3[30], x3[25]); + x4[26] = _mm_subs_epi16(x3[29], x3[26]); + x4[29] = 
_mm_adds_epi16(x3[29], x3[26]); + x4[27] = _mm_subs_epi16(x3[28], x3[27]); + x4[28] = _mm_adds_epi16(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[36], x3[59], x4[36], x4[59]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[37], x3[58], x4[37], x4[58]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[38], x3[57], x4[38], x4[57]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[39], x3[56], x4[39], x4[56]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[40], x3[55], x4[40], x4[55]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[41], x3[54], x4[41], x4[54]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[42], x3[53], x4[42], x4[53]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[43], x3[52], x4[43], x4[52]); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; + + // stage 5 + __m128i x5[64]; + x5[0] = _mm_adds_epi16(x4[0], x4[3]); + x5[3] = _mm_subs_epi16(x4[0], x4[3]); + x5[1] = _mm_adds_epi16(x4[1], x4[2]); + x5[2] = _mm_subs_epi16(x4[1], x4[2]); + x5[4] = x4[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x4[5], x4[6], x5[5], x5[6]); + x5[7] = x4[7]; + x5[8] = _mm_adds_epi16(x4[8], x4[11]); + x5[11] = _mm_subs_epi16(x4[8], x4[11]); + x5[9] = _mm_adds_epi16(x4[9], x4[10]); + x5[10] = _mm_subs_epi16(x4[9], x4[10]); + x5[12] = _mm_subs_epi16(x4[15], x4[12]); + x5[15] = _mm_adds_epi16(x4[15], x4[12]); + x5[13] = _mm_subs_epi16(x4[14], x4[13]); + x5[14] = _mm_adds_epi16(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[18], x4[29], x5[18], x5[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[19], x4[28], x5[19], x5[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[20], x4[27], x5[20], x5[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[21], x4[26], x5[21], x5[26]); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm_adds_epi16(x4[32], x4[39]); + x5[39] = _mm_subs_epi16(x4[32], x4[39]); + x5[33] = _mm_adds_epi16(x4[33], x4[38]); + x5[38] = _mm_subs_epi16(x4[33], x4[38]); + x5[34] = _mm_adds_epi16(x4[34], x4[37]); + x5[37] = _mm_subs_epi16(x4[34], x4[37]); + x5[35] = _mm_adds_epi16(x4[35], x4[36]); + x5[36] = _mm_subs_epi16(x4[35], x4[36]); + x5[40] = _mm_subs_epi16(x4[47], x4[40]); + x5[47] = _mm_adds_epi16(x4[47], x4[40]); + x5[41] = _mm_subs_epi16(x4[46], x4[41]); + x5[46] = _mm_adds_epi16(x4[46], x4[41]); + x5[42] = _mm_subs_epi16(x4[45], x4[42]); + x5[45] = _mm_adds_epi16(x4[45], x4[42]); + x5[43] = _mm_subs_epi16(x4[44], x4[43]); + x5[44] = _mm_adds_epi16(x4[44], x4[43]); + x5[48] = _mm_adds_epi16(x4[48], x4[55]); + x5[55] = _mm_subs_epi16(x4[48], x4[55]); + x5[49] = _mm_adds_epi16(x4[49], x4[54]); + x5[54] = _mm_subs_epi16(x4[49], x4[54]); + x5[50] = _mm_adds_epi16(x4[50], x4[53]); + x5[53] = _mm_subs_epi16(x4[50], x4[53]); + x5[51] = _mm_adds_epi16(x4[51], x4[52]); + x5[52] = _mm_subs_epi16(x4[51], x4[52]); + x5[56] = _mm_subs_epi16(x4[63], x4[56]); + x5[63] = _mm_adds_epi16(x4[63], x4[56]); + x5[57] = _mm_subs_epi16(x4[62], x4[57]); + x5[62] = _mm_adds_epi16(x4[62], x4[57]); + x5[58] = _mm_subs_epi16(x4[61], x4[58]); + x5[61] = _mm_adds_epi16(x4[61], x4[58]); + x5[59] = _mm_subs_epi16(x4[60], x4[59]); + x5[60] = _mm_adds_epi16(x4[60], x4[59]); + + // stage 6 + __m128i x6[64]; + 
btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x5[0], x5[1], x6[0], x6[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x5[2], x5[3], x6[2], x6[3]); + x6[4] = _mm_adds_epi16(x5[4], x5[5]); + x6[5] = _mm_subs_epi16(x5[4], x5[5]); + x6[6] = _mm_subs_epi16(x5[7], x5[6]); + x6[7] = _mm_adds_epi16(x5[7], x5[6]); + x6[8] = x5[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x5[9], x5[14], x6[9], x6[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x5[10], x5[13], x6[10], x6[13]); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm_adds_epi16(x5[16], x5[19]); + x6[19] = _mm_subs_epi16(x5[16], x5[19]); + x6[17] = _mm_adds_epi16(x5[17], x5[18]); + x6[18] = _mm_subs_epi16(x5[17], x5[18]); + x6[20] = _mm_subs_epi16(x5[23], x5[20]); + x6[23] = _mm_adds_epi16(x5[23], x5[20]); + x6[21] = _mm_subs_epi16(x5[22], x5[21]); + x6[22] = _mm_adds_epi16(x5[22], x5[21]); + x6[24] = _mm_adds_epi16(x5[24], x5[27]); + x6[27] = _mm_subs_epi16(x5[24], x5[27]); + x6[25] = _mm_adds_epi16(x5[25], x5[26]); + x6[26] = _mm_subs_epi16(x5[25], x5[26]); + x6[28] = _mm_subs_epi16(x5[31], x5[28]); + x6[31] = _mm_adds_epi16(x5[31], x5[28]); + x6[29] = _mm_subs_epi16(x5[30], x5[29]); + x6[30] = _mm_adds_epi16(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[34], x5[61], x6[34], x6[61]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[35], x5[60], x6[35], x6[60]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[36], x5[59], x6[36], x6[59]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[37], x5[58], x6[37], x6[58]); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[42], x5[53], x6[42], x6[53]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[43], x5[52], x6[43], x6[52]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[44], x5[51], x6[44], x6[51]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[45], x5[50], x6[45], x6[50]); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; + + // stage 7 + __m128i x7[64]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x6[4], x6[7], x7[4], x7[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x6[5], x6[6], x7[5], x7[6]); + x7[8] = _mm_adds_epi16(x6[8], x6[9]); + x7[9] = _mm_subs_epi16(x6[8], x6[9]); + x7[10] = _mm_subs_epi16(x6[11], x6[10]); + x7[11] = _mm_adds_epi16(x6[11], x6[10]); + x7[12] = _mm_adds_epi16(x6[12], x6[13]); + x7[13] = _mm_subs_epi16(x6[12], x6[13]); + x7[14] = _mm_subs_epi16(x6[15], x6[14]); + x7[15] = _mm_adds_epi16(x6[15], x6[14]); + x7[16] = x6[16]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x6[17], x6[30], x7[17], x7[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x6[18], x6[29], x7[18], x7[29]); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x6[21], x6[26], x7[21], x7[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x6[22], x6[25], x7[22], x7[25]); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm_adds_epi16(x6[32], x6[35]); + x7[35] = _mm_subs_epi16(x6[32], x6[35]); + x7[33] = _mm_adds_epi16(x6[33], x6[34]); + x7[34] = _mm_subs_epi16(x6[33], x6[34]); + x7[36] = _mm_subs_epi16(x6[39], x6[36]); + x7[39] = _mm_adds_epi16(x6[39], x6[36]); + x7[37] = _mm_subs_epi16(x6[38], x6[37]); + x7[38] = _mm_adds_epi16(x6[38], x6[37]); + x7[40] = 
_mm_adds_epi16(x6[40], x6[43]); + x7[43] = _mm_subs_epi16(x6[40], x6[43]); + x7[41] = _mm_adds_epi16(x6[41], x6[42]); + x7[42] = _mm_subs_epi16(x6[41], x6[42]); + x7[44] = _mm_subs_epi16(x6[47], x6[44]); + x7[47] = _mm_adds_epi16(x6[47], x6[44]); + x7[45] = _mm_subs_epi16(x6[46], x6[45]); + x7[46] = _mm_adds_epi16(x6[46], x6[45]); + x7[48] = _mm_adds_epi16(x6[48], x6[51]); + x7[51] = _mm_subs_epi16(x6[48], x6[51]); + x7[49] = _mm_adds_epi16(x6[49], x6[50]); + x7[50] = _mm_subs_epi16(x6[49], x6[50]); + x7[52] = _mm_subs_epi16(x6[55], x6[52]); + x7[55] = _mm_adds_epi16(x6[55], x6[52]); + x7[53] = _mm_subs_epi16(x6[54], x6[53]); + x7[54] = _mm_adds_epi16(x6[54], x6[53]); + x7[56] = _mm_adds_epi16(x6[56], x6[59]); + x7[59] = _mm_subs_epi16(x6[56], x6[59]); + x7[57] = _mm_adds_epi16(x6[57], x6[58]); + x7[58] = _mm_subs_epi16(x6[57], x6[58]); + x7[60] = _mm_subs_epi16(x6[63], x6[60]); + x7[63] = _mm_adds_epi16(x6[63], x6[60]); + x7[61] = _mm_subs_epi16(x6[62], x6[61]); + x7[62] = _mm_adds_epi16(x6[62], x6[61]); + + // stage 8 + __m128i x8[64]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x7[8], x7[15], x8[8], x8[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x7[9], x7[14], x8[9], x8[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x7[10], x7[13], x8[10], x8[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x7[11], x7[12], x8[11], x8[12]); + x8[16] = _mm_adds_epi16(x7[16], x7[17]); + x8[17] = _mm_subs_epi16(x7[16], x7[17]); + x8[18] = _mm_subs_epi16(x7[19], x7[18]); + x8[19] = _mm_adds_epi16(x7[19], x7[18]); + x8[20] = _mm_adds_epi16(x7[20], x7[21]); + x8[21] = _mm_subs_epi16(x7[20], x7[21]); + x8[22] = _mm_subs_epi16(x7[23], x7[22]); + x8[23] = _mm_adds_epi16(x7[23], x7[22]); + x8[24] = _mm_adds_epi16(x7[24], x7[25]); + x8[25] = _mm_subs_epi16(x7[24], x7[25]); + x8[26] = _mm_subs_epi16(x7[27], x7[26]); + x8[27] = _mm_adds_epi16(x7[27], x7[26]); + x8[28] = _mm_adds_epi16(x7[28], x7[29]); + x8[29] = _mm_subs_epi16(x7[28], x7[29]); + x8[30] = _mm_subs_epi16(x7[31], x7[30]); + x8[31] = _mm_adds_epi16(x7[31], x7[30]); + x8[32] = x7[32]; + btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x7[33], x7[62], x8[33], x8[62]); + btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x7[34], x7[61], x8[34], x8[61]); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x7[37], x7[58], x8[37], x8[58]); + btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x7[38], x7[57], x8[38], x8[57]); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x7[41], x7[54], x8[41], x8[54]); + btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x7[42], x7[53], x8[42], x8[53]); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x7[45], x7[50], x8[45], x8[50]); + btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x7[46], x7[49], x8[46], x8[49]); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; + + // stage 9 + __m128i x9[64]; + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x8[16], x8[31], x9[16], x9[31]); + btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x8[17], 
x8[30], x9[17], x9[30]); + btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x8[18], x8[29], x9[18], x9[29]); + btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x8[19], x8[28], x9[19], x9[28]); + btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x8[20], x8[27], x9[20], x9[27]); + btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x8[21], x8[26], x9[21], x9[26]); + btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x8[22], x8[25], x9[22], x9[25]); + btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x8[23], x8[24], x9[23], x9[24]); + x9[32] = _mm_adds_epi16(x8[32], x8[33]); + x9[33] = _mm_subs_epi16(x8[32], x8[33]); + x9[34] = _mm_subs_epi16(x8[35], x8[34]); + x9[35] = _mm_adds_epi16(x8[35], x8[34]); + x9[36] = _mm_adds_epi16(x8[36], x8[37]); + x9[37] = _mm_subs_epi16(x8[36], x8[37]); + x9[38] = _mm_subs_epi16(x8[39], x8[38]); + x9[39] = _mm_adds_epi16(x8[39], x8[38]); + x9[40] = _mm_adds_epi16(x8[40], x8[41]); + x9[41] = _mm_subs_epi16(x8[40], x8[41]); + x9[42] = _mm_subs_epi16(x8[43], x8[42]); + x9[43] = _mm_adds_epi16(x8[43], x8[42]); + x9[44] = _mm_adds_epi16(x8[44], x8[45]); + x9[45] = _mm_subs_epi16(x8[44], x8[45]); + x9[46] = _mm_subs_epi16(x8[47], x8[46]); + x9[47] = _mm_adds_epi16(x8[47], x8[46]); + x9[48] = _mm_adds_epi16(x8[48], x8[49]); + x9[49] = _mm_subs_epi16(x8[48], x8[49]); + x9[50] = _mm_subs_epi16(x8[51], x8[50]); + x9[51] = _mm_adds_epi16(x8[51], x8[50]); + x9[52] = _mm_adds_epi16(x8[52], x8[53]); + x9[53] = _mm_subs_epi16(x8[52], x8[53]); + x9[54] = _mm_subs_epi16(x8[55], x8[54]); + x9[55] = _mm_adds_epi16(x8[55], x8[54]); + x9[56] = _mm_adds_epi16(x8[56], x8[57]); + x9[57] = _mm_subs_epi16(x8[56], x8[57]); + x9[58] = _mm_subs_epi16(x8[59], x8[58]); + x9[59] = _mm_adds_epi16(x8[59], x8[58]); + x9[60] = _mm_adds_epi16(x8[60], x8[61]); + x9[61] = _mm_subs_epi16(x8[60], x8[61]); + x9[62] = _mm_subs_epi16(x8[63], x8[62]); + x9[63] = _mm_adds_epi16(x8[63], x8[62]); + + // stage 10 + __m128i x10[64]; + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_16_sse2(cospi_p63_p01, cospi_m01_p63, x9[32], x9[63], x10[32], x10[63]); + btf_16_sse2(cospi_p31_p33, cospi_m33_p31, x9[33], x9[62], x10[33], x10[62]); + btf_16_sse2(cospi_p47_p17, cospi_m17_p47, x9[34], x9[61], x10[34], x10[61]); + btf_16_sse2(cospi_p15_p49, cospi_m49_p15, x9[35], x9[60], x10[35], x10[60]); + btf_16_sse2(cospi_p55_p09, cospi_m09_p55, x9[36], x9[59], x10[36], x10[59]); + btf_16_sse2(cospi_p23_p41, cospi_m41_p23, x9[37], x9[58], x10[37], x10[58]); + btf_16_sse2(cospi_p39_p25, cospi_m25_p39, x9[38], x9[57], x10[38], x10[57]); + btf_16_sse2(cospi_p07_p57, cospi_m57_p07, x9[39], x9[56], x10[39], x10[56]); + btf_16_sse2(cospi_p59_p05, cospi_m05_p59, x9[40], x9[55], x10[40], x10[55]); + btf_16_sse2(cospi_p27_p37, cospi_m37_p27, x9[41], x9[54], x10[41], x10[54]); + btf_16_sse2(cospi_p43_p21, cospi_m21_p43, x9[42], x9[53], x10[42], x10[53]); + btf_16_sse2(cospi_p11_p53, cospi_m53_p11, x9[43], x9[52], x10[43], x10[52]); + btf_16_sse2(cospi_p51_p13, cospi_m13_p51, x9[44], x9[51], x10[44], x10[51]); + 
btf_16_sse2(cospi_p19_p45, cospi_m45_p19, x9[45], x9[50], x10[45], x10[50]); + btf_16_sse2(cospi_p35_p29, cospi_m29_p35, x9[46], x9[49], x10[46], x10[49]); + btf_16_sse2(cospi_p03_p61, cospi_m61_p03, x9[47], x9[48], x10[47], x10[48]); + + // stage 11 + output[0] = x10[0]; + output[1] = x10[32]; + output[2] = x10[16]; + output[3] = x10[48]; + output[4] = x10[8]; + output[5] = x10[40]; + output[6] = x10[24]; + output[7] = x10[56]; + output[8] = x10[4]; + output[9] = x10[36]; + output[10] = x10[20]; + output[11] = x10[52]; + output[12] = x10[12]; + output[13] = x10[44]; + output[14] = x10[28]; + output[15] = x10[60]; + output[16] = x10[2]; + output[17] = x10[34]; + output[18] = x10[18]; + output[19] = x10[50]; + output[20] = x10[10]; + output[21] = x10[42]; + output[22] = x10[26]; + output[23] = x10[58]; + output[24] = x10[6]; + output[25] = x10[38]; + output[26] = x10[22]; + output[27] = x10[54]; + output[28] = x10[14]; + output[29] = x10[46]; + output[30] = x10[30]; + output[31] = x10[62]; + output[32] = x10[1]; + output[33] = x10[33]; + output[34] = x10[17]; + output[35] = x10[49]; + output[36] = x10[9]; + output[37] = x10[41]; + output[38] = x10[25]; + output[39] = x10[57]; + output[40] = x10[5]; + output[41] = x10[37]; + output[42] = x10[21]; + output[43] = x10[53]; + output[44] = x10[13]; + output[45] = x10[45]; + output[46] = x10[29]; + output[47] = x10[61]; + output[48] = x10[3]; + output[49] = x10[35]; + output[50] = x10[19]; + output[51] = x10[51]; + output[52] = x10[11]; + output[53] = x10[43]; + output[54] = x10[27]; + output[55] = x10[59]; + output[56] = x10[7]; + output[57] = x10[39]; + output[58] = x10[23]; + output[59] = x10[55]; + output[60] = x10[15]; + output[61] = x10[47]; + output[62] = x10[31]; + output[63] = x10[63]; +} + +static void fadst4x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *sinpi = sinpi_arr(cos_bit); + const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]); + const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]); + const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]); + const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]); + const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]); + const __m128i __zero = _mm_set1_epi16(0); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + const __m128i in7 = _mm_add_epi16(input[0], input[1]); + __m128i u[8], v[8]; + + u[0] = _mm_unpacklo_epi16(input[0], input[1]); + u[1] = _mm_unpacklo_epi16(input[2], input[3]); + u[2] = _mm_unpacklo_epi16(in7, __zero); + u[3] = _mm_unpacklo_epi16(input[2], __zero); + u[4] = _mm_unpacklo_epi16(input[3], __zero); + + v[0] = _mm_madd_epi16(u[0], sinpi_p01_p02); // s0 + s2 + v[1] = _mm_madd_epi16(u[1], sinpi_p03_p04); // s4 + s5 + v[2] = _mm_madd_epi16(u[2], sinpi_p03_p03); // x1 + v[3] = _mm_madd_epi16(u[0], sinpi_p04_m01); // s1 - s3 + v[4] = _mm_madd_epi16(u[1], sinpi_m03_p02); // -s4 + s6 + v[5] = _mm_madd_epi16(u[3], sinpi_p03_p03); // s4 + v[6] = _mm_madd_epi16(u[4], sinpi_p03_p03); + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_sub_epi32(v[2], v[6]); + u[2] = _mm_add_epi32(v[3], v[4]); + u[3] = _mm_sub_epi32(u[2], u[0]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_sub_epi32(u[4], v[5]); + u[6] = _mm_add_epi32(u[3], u[5]); + + v[0] = _mm_add_epi32(u[0], __rounding); + v[1] = _mm_add_epi32(u[1], __rounding); + v[2] = _mm_add_epi32(u[2], __rounding); + v[3] = _mm_add_epi32(u[6], __rounding); + + u[0] = _mm_srai_epi32(v[0], cos_bit); + u[1] = _mm_srai_epi32(v[1], 
cos_bit); + u[2] = _mm_srai_epi32(v[2], cos_bit); + u[3] = _mm_srai_epi32(v[3], cos_bit); + + output[0] = _mm_packs_epi32(u[0], u[2]); + output[1] = _mm_packs_epi32(u[1], u[3]); + output[2] = _mm_srli_si128(output[0], 8); + output[3] = _mm_srli_si128(output[1], 8); +} + +static void fadst4x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m128i x1[8]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[7]); + x1[2] = _mm_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm_subs_epi16(__zero, input[5]); + + // stage 2 + __m128i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[2], + &x1[3], &x2[2], &x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[6], + &x1[7], &x2[6], &x2[7]); + + // stage 3 + __m128i x3[8]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w4_sse2(&cospi_p16_p48, &cospi_p48_m16, __rounding, cos_bit, &x3[4], + &x3[5], &x4[4], &x4[5]); + btf_16_w4_sse2(&cospi_m48_p16, &cospi_p16_p48, __rounding, cos_bit, &x3[6], + &x3[7], &x4[6], &x4[7]); + + // stage 5 + __m128i x5[8]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m128i x6[8]; + btf_16_w4_sse2(&cospi_p04_p60, &cospi_p60_m04, __rounding, cos_bit, &x5[0], + &x5[1], &x6[0], &x6[1]); + btf_16_w4_sse2(&cospi_p20_p44, &cospi_p44_m20, __rounding, cos_bit, &x5[2], + &x5[3], &x6[2], &x6[3]); + btf_16_w4_sse2(&cospi_p36_p28, &cospi_p28_m36, __rounding, cos_bit, &x5[4], + &x5[5], &x6[4], &x6[5]); + btf_16_w4_sse2(&cospi_p52_p12, &cospi_p12_m52, __rounding, cos_bit, &x5[6], + &x5[7], &x6[6], &x6[7]); + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = 
x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +static void fadst8x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *sinpi = sinpi_arr(cos_bit); + const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]); + const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]); + const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]); + const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]); + const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]); + const __m128i __zero = _mm_set1_epi16(0); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + const __m128i in7 = _mm_add_epi16(input[0], input[1]); + __m128i u_lo[8], u_hi[8], v_lo[8], v_hi[8]; + + u_lo[0] = _mm_unpacklo_epi16(input[0], input[1]); + u_hi[0] = _mm_unpackhi_epi16(input[0], input[1]); + u_lo[1] = _mm_unpacklo_epi16(input[2], input[3]); + u_hi[1] = _mm_unpackhi_epi16(input[2], input[3]); + u_lo[2] = _mm_unpacklo_epi16(in7, __zero); + u_hi[2] = _mm_unpackhi_epi16(in7, __zero); + u_lo[3] = _mm_unpacklo_epi16(input[2], __zero); + u_hi[3] = _mm_unpackhi_epi16(input[2], __zero); + u_lo[4] = _mm_unpacklo_epi16(input[3], __zero); + u_hi[4] = _mm_unpackhi_epi16(input[3], __zero); + + v_lo[0] = _mm_madd_epi16(u_lo[0], sinpi_p01_p02); // s0 + s2 + v_hi[0] = _mm_madd_epi16(u_hi[0], sinpi_p01_p02); // s0 + s2 + v_lo[1] = _mm_madd_epi16(u_lo[1], sinpi_p03_p04); // s4 + s5 + v_hi[1] = _mm_madd_epi16(u_hi[1], sinpi_p03_p04); // s4 + s5 + v_lo[2] = _mm_madd_epi16(u_lo[2], sinpi_p03_p03); // x1 + v_hi[2] = _mm_madd_epi16(u_hi[2], sinpi_p03_p03); // x1 + v_lo[3] = _mm_madd_epi16(u_lo[0], sinpi_p04_m01); // s1 - s3 + v_hi[3] = _mm_madd_epi16(u_hi[0], sinpi_p04_m01); // s1 - s3 + v_lo[4] = _mm_madd_epi16(u_lo[1], sinpi_m03_p02); // -s4 + s6 + v_hi[4] = _mm_madd_epi16(u_hi[1], sinpi_m03_p02); // -s4 + s6 + v_lo[5] = _mm_madd_epi16(u_lo[3], sinpi_p03_p03); // s4 + v_hi[5] = _mm_madd_epi16(u_hi[3], sinpi_p03_p03); // s4 + v_lo[6] = _mm_madd_epi16(u_lo[4], sinpi_p03_p03); + v_hi[6] = _mm_madd_epi16(u_hi[4], sinpi_p03_p03); + + u_lo[0] = _mm_add_epi32(v_lo[0], v_lo[1]); + u_hi[0] = _mm_add_epi32(v_hi[0], v_hi[1]); + u_lo[1] = _mm_sub_epi32(v_lo[2], v_lo[6]); + u_hi[1] = _mm_sub_epi32(v_hi[2], v_hi[6]); + u_lo[2] = _mm_add_epi32(v_lo[3], v_lo[4]); + u_hi[2] = _mm_add_epi32(v_hi[3], v_hi[4]); + u_lo[3] = _mm_sub_epi32(u_lo[2], u_lo[0]); + u_hi[3] = _mm_sub_epi32(u_hi[2], u_hi[0]); + u_lo[4] = _mm_slli_epi32(v_lo[5], 2); + u_hi[4] = _mm_slli_epi32(v_hi[5], 2); + u_lo[5] = _mm_sub_epi32(u_lo[4], v_lo[5]); + u_hi[5] = _mm_sub_epi32(u_hi[4], v_hi[5]); + u_lo[6] = _mm_add_epi32(u_lo[3], u_lo[5]); + u_hi[6] = _mm_add_epi32(u_hi[3], u_hi[5]); + + v_lo[0] = _mm_add_epi32(u_lo[0], __rounding); + v_hi[0] = _mm_add_epi32(u_hi[0], __rounding); + v_lo[1] = _mm_add_epi32(u_lo[1], __rounding); + v_hi[1] = _mm_add_epi32(u_hi[1], __rounding); + v_lo[2] = _mm_add_epi32(u_lo[2], __rounding); + v_hi[2] = _mm_add_epi32(u_hi[2], __rounding); + v_lo[3] = _mm_add_epi32(u_lo[6], __rounding); + v_hi[3] = _mm_add_epi32(u_hi[6], __rounding); + + u_lo[0] = _mm_srai_epi32(v_lo[0], cos_bit); + u_hi[0] = _mm_srai_epi32(v_hi[0], cos_bit); + u_lo[1] = _mm_srai_epi32(v_lo[1], cos_bit); + u_hi[1] = _mm_srai_epi32(v_hi[1], cos_bit); + u_lo[2] = _mm_srai_epi32(v_lo[2], cos_bit); + u_hi[2] = _mm_srai_epi32(v_hi[2], cos_bit); + u_lo[3] = _mm_srai_epi32(v_lo[3], cos_bit); + u_hi[3] = _mm_srai_epi32(v_hi[3], cos_bit); + + output[0] = _mm_packs_epi32(u_lo[0], u_hi[0]); + output[1] = _mm_packs_epi32(u_lo[1], 
u_hi[1]); + output[2] = _mm_packs_epi32(u_lo[2], u_hi[2]); + output[3] = _mm_packs_epi32(u_lo[3], u_hi[3]); +} + +static void fadst8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m128i x1[8]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[7]); + x1[2] = _mm_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm_subs_epi16(__zero, input[5]); + + // stage 2 + __m128i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]); + + // stage 3 + __m128i x3[8]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); + + // stage 5 + __m128i x5[8]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m128i x6[8]; + btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x5[0], x5[1], x6[0], x6[1]); + btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x5[2], x5[3], x6[2], x6[3]); + btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x5[4], x5[5], x6[4], x6[5]); + btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x5[6], x5[7], x6[6], x6[7]); + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +static void fadst8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = 
pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m128i x1[16]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[15]); + x1[2] = _mm_subs_epi16(__zero, input[7]); + x1[3] = input[8]; + x1[4] = _mm_subs_epi16(__zero, input[3]); + x1[5] = input[12]; + x1[6] = input[4]; + x1[7] = _mm_subs_epi16(__zero, input[11]); + x1[8] = _mm_subs_epi16(__zero, input[1]); + x1[9] = input[14]; + x1[10] = input[6]; + x1[11] = _mm_subs_epi16(__zero, input[9]); + x1[12] = input[2]; + x1[13] = _mm_subs_epi16(__zero, input[13]); + x1[14] = _mm_subs_epi16(__zero, input[5]); + x1[15] = input[10]; + + // stage 2 + __m128i x2[16]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]); + x2[8] = x1[8]; + x2[9] = x1[9]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x2[10], x2[11]); + x2[12] = x1[12]; + x2[13] = x1[13]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x2[14], x2[15]); + + // stage 3 + __m128i x3[16]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + x3[8] = _mm_adds_epi16(x2[8], x2[10]); + x3[10] = _mm_subs_epi16(x2[8], x2[10]); + x3[9] = _mm_adds_epi16(x2[9], x2[11]); + x3[11] = _mm_subs_epi16(x2[9], x2[11]); + x3[12] = _mm_adds_epi16(x2[12], x2[14]); + x3[14] = _mm_subs_epi16(x2[12], x2[14]); + x3[13] = _mm_adds_epi16(x2[13], x2[15]); + x3[15] = _mm_subs_epi16(x2[13], x2[15]); + + // stage 4 + __m128i x4[16]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); + 
btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); + x4[8] = x3[8]; + x4[9] = x3[9]; + x4[10] = x3[10]; + x4[11] = x3[11]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[12], x3[13], x4[12], x4[13]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[14], x3[15], x4[14], x4[15]); + + // stage 5 + __m128i x5[16]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + x5[8] = _mm_adds_epi16(x4[8], x4[12]); + x5[12] = _mm_subs_epi16(x4[8], x4[12]); + x5[9] = _mm_adds_epi16(x4[9], x4[13]); + x5[13] = _mm_subs_epi16(x4[9], x4[13]); + x5[10] = _mm_adds_epi16(x4[10], x4[14]); + x5[14] = _mm_subs_epi16(x4[10], x4[14]); + x5[11] = _mm_adds_epi16(x4[11], x4[15]); + x5[15] = _mm_subs_epi16(x4[11], x4[15]); + + // stage 6 + __m128i x6[16]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + x6[4] = x5[4]; + x6[5] = x5[5]; + x6[6] = x5[6]; + x6[7] = x5[7]; + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x5[8], x5[9], x6[8], x6[9]); + btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x5[10], x5[11], x6[10], x6[11]); + btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x5[12], x5[13], x6[12], x6[13]); + btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x5[14], x5[15], x6[14], x6[15]); + + // stage 7 + __m128i x7[16]; + x7[0] = _mm_adds_epi16(x6[0], x6[8]); + x7[8] = _mm_subs_epi16(x6[0], x6[8]); + x7[1] = _mm_adds_epi16(x6[1], x6[9]); + x7[9] = _mm_subs_epi16(x6[1], x6[9]); + x7[2] = _mm_adds_epi16(x6[2], x6[10]); + x7[10] = _mm_subs_epi16(x6[2], x6[10]); + x7[3] = _mm_adds_epi16(x6[3], x6[11]); + x7[11] = _mm_subs_epi16(x6[3], x6[11]); + x7[4] = _mm_adds_epi16(x6[4], x6[12]); + x7[12] = _mm_subs_epi16(x6[4], x6[12]); + x7[5] = _mm_adds_epi16(x6[5], x6[13]); + x7[13] = _mm_subs_epi16(x6[5], x6[13]); + x7[6] = _mm_adds_epi16(x6[6], x6[14]); + x7[14] = _mm_subs_epi16(x6[6], x6[14]); + x7[7] = _mm_adds_epi16(x6[7], x6[15]); + x7[15] = _mm_subs_epi16(x6[7], x6[15]); + + // stage 8 + __m128i x8[16]; + btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x7[0], x7[1], x8[0], x8[1]); + btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x7[2], x7[3], x8[2], x8[3]); + btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x7[4], x7[5], x8[4], x8[5]); + btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x7[6], x7[7], x8[6], x8[7]); + btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x7[8], x7[9], x8[8], x8[9]); + btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x7[10], x7[11], x8[10], x8[11]); + btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x7[12], x7[13], x8[12], x8[13]); + btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x7[14], x7[15], x8[14], x8[15]); + + // stage 9 + output[0] = x8[1]; + output[1] = x8[14]; + output[2] = x8[3]; + output[3] = x8[12]; + output[4] = x8[5]; + output[5] = x8[10]; + output[6] = x8[7]; + output[7] = x8[8]; + output[8] = x8[9]; + output[9] = x8[6]; + output[10] = x8[11]; + output[11] = x8[4]; + output[12] = x8[13]; + output[13] = x8[2]; + output[14] = x8[15]; + output[15] = x8[0]; +} + +static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = { + fdct4x4_new_sse2, // DCT_DCT + fadst4x4_new_sse2, // ADST_DCT + fdct4x4_new_sse2, // DCT_ADST + fadst4x4_new_sse2, // ADST_ADST + fadst4x4_new_sse2, // FLIPADST_DCT + fdct4x4_new_sse2, // DCT_FLIPADST + fadst4x4_new_sse2, // FLIPADST_FLIPADST + fadst4x4_new_sse2, // ADST_FLIPADST + fadst4x4_new_sse2, // FLIPADST_ADST + 
fidentity4x4_new_sse2, // IDTX + fdct4x4_new_sse2, // V_DCT + fidentity4x4_new_sse2, // H_DCT + fadst4x4_new_sse2, // V_ADST + fidentity4x4_new_sse2, // H_ADST + fadst4x4_new_sse2, // V_FLIPADST + fidentity4x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm4x4_arr[TX_TYPES] = { + fdct4x4_new_sse2, // DCT_DCT + fdct4x4_new_sse2, // ADST_DCT + fadst4x4_new_sse2, // DCT_ADST + fadst4x4_new_sse2, // ADST_ADST + fdct4x4_new_sse2, // FLIPADST_DCT + fadst4x4_new_sse2, // DCT_FLIPADST + fadst4x4_new_sse2, // FLIPADST_FLIPADST + fadst4x4_new_sse2, // ADST_FLIPADST + fadst4x4_new_sse2, // FLIPADST_ADST + fidentity4x4_new_sse2, // IDTX + fidentity4x4_new_sse2, // V_DCT + fdct4x4_new_sse2, // H_DCT + fidentity4x4_new_sse2, // V_ADST + fadst4x4_new_sse2, // H_ADST + fidentity4x4_new_sse2, // V_FLIPADST + fadst4x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm4x8_arr[TX_TYPES] = { + fdct4x8_new_sse2, // DCT_DCT + fadst4x8_new_sse2, // ADST_DCT + fdct4x8_new_sse2, // DCT_ADST + fadst4x8_new_sse2, // ADST_ADST + fadst4x8_new_sse2, // FLIPADST_DCT + fdct4x8_new_sse2, // DCT_FLIPADST + fadst4x8_new_sse2, // FLIPADST_FLIPADST + fadst4x8_new_sse2, // ADST_FLIPADST + fadst4x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fdct4x8_new_sse2, // V_DCT + fidentity8x8_new_sse2, // H_DCT + fadst4x8_new_sse2, // V_ADST + fidentity8x8_new_sse2, // H_ADST + fadst4x8_new_sse2, // V_FLIPADST + fidentity8x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x4_arr[TX_TYPES] = { + fdct8x4_new_sse2, // DCT_DCT + fdct8x4_new_sse2, // ADST_DCT + fadst8x4_new_sse2, // DCT_ADST + fadst8x4_new_sse2, // ADST_ADST + fdct8x4_new_sse2, // FLIPADST_DCT + fadst8x4_new_sse2, // DCT_FLIPADST + fadst8x4_new_sse2, // FLIPADST_FLIPADST + fadst8x4_new_sse2, // ADST_FLIPADST + fadst8x4_new_sse2, // FLIPADST_ADST + fidentity8x4_new_sse2, // IDTX + fidentity8x4_new_sse2, // V_DCT + fdct8x4_new_sse2, // H_DCT + fidentity8x4_new_sse2, // V_ADST + fadst8x4_new_sse2, // H_ADST + fidentity8x4_new_sse2, // V_FLIPADST + fadst8x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x4_arr[TX_TYPES] = { + fdct8x4_new_sse2, // DCT_DCT + fadst8x4_new_sse2, // ADST_DCT + fdct8x4_new_sse2, // DCT_ADST + fadst8x4_new_sse2, // ADST_ADST + fadst8x4_new_sse2, // FLIPADST_DCT + fdct8x4_new_sse2, // DCT_FLIPADST + fadst8x4_new_sse2, // FLIPADST_FLIPADST + fadst8x4_new_sse2, // ADST_FLIPADST + fadst8x4_new_sse2, // FLIPADST_ADST + fidentity8x4_new_sse2, // IDTX + fdct8x4_new_sse2, // V_DCT + fidentity8x4_new_sse2, // H_DCT + fadst8x4_new_sse2, // V_ADST + fidentity8x4_new_sse2, // H_ADST + fadst8x4_new_sse2, // V_FLIPADST + fidentity8x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm4x8_arr[TX_TYPES] = { + fdct4x8_new_sse2, // DCT_DCT + fdct4x8_new_sse2, // ADST_DCT + fadst4x8_new_sse2, // DCT_ADST + fadst4x8_new_sse2, // ADST_ADST + fdct4x8_new_sse2, // FLIPADST_DCT + fadst4x8_new_sse2, // DCT_FLIPADST + fadst4x8_new_sse2, // FLIPADST_FLIPADST + fadst4x8_new_sse2, // ADST_FLIPADST + fadst4x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fidentity8x8_new_sse2, // V_DCT + fdct4x8_new_sse2, // H_DCT + fidentity8x8_new_sse2, // V_ADST + fadst4x8_new_sse2, // H_ADST + fidentity8x8_new_sse2, // V_FLIPADST + fadst4x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + fadst8x8_new_sse2, // ADST_DCT + fdct8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + 
fadst8x8_new_sse2, // FLIPADST_DCT + fdct8x8_new_sse2, // DCT_FLIPADST + fadst8x8_new_sse2, // FLIPADST_FLIPADST + fadst8x8_new_sse2, // ADST_FLIPADST + fadst8x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fdct8x8_new_sse2, // V_DCT + fidentity8x8_new_sse2, // H_DCT + fadst8x8_new_sse2, // V_ADST + fidentity8x8_new_sse2, // H_ADST + fadst8x8_new_sse2, // V_FLIPADST + fidentity8x8_new_sse2, // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + fdct8x8_new_sse2, // ADST_DCT + fadst8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + fdct8x8_new_sse2, // FLIPADST_DCT + fadst8x8_new_sse2, // DCT_FLIPADST + fadst8x8_new_sse2, // FLIPADST_FLIPADST + fadst8x8_new_sse2, // ADST_FLIPADST + fadst8x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fidentity8x8_new_sse2, // V_DCT + fdct8x8_new_sse2, // H_DCT + fidentity8x8_new_sse2, // V_ADST + fadst8x8_new_sse2, // H_ADST + fidentity8x8_new_sse2, // V_FLIPADST + fadst8x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_sse2, // DCT_DCT + fadst8x16_new_sse2, // ADST_DCT + fdct8x16_new_sse2, // DCT_ADST + fadst8x16_new_sse2, // ADST_ADST + fadst8x16_new_sse2, // FLIPADST_DCT + fdct8x16_new_sse2, // DCT_FLIPADST + fadst8x16_new_sse2, // FLIPADST_FLIPADST + fadst8x16_new_sse2, // ADST_FLIPADST + fadst8x16_new_sse2, // FLIPADST_ADST + fidentity8x16_new_sse2, // IDTX + fdct8x16_new_sse2, // V_DCT + fidentity8x16_new_sse2, // H_DCT + fadst8x16_new_sse2, // V_ADST + fidentity8x16_new_sse2, // H_ADST + fadst8x16_new_sse2, // V_FLIPADST + fidentity8x16_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_sse2, // DCT_DCT + fdct8x16_new_sse2, // ADST_DCT + fadst8x16_new_sse2, // DCT_ADST + fadst8x16_new_sse2, // ADST_ADST + fdct8x16_new_sse2, // FLIPADST_DCT + fadst8x16_new_sse2, // DCT_FLIPADST + fadst8x16_new_sse2, // FLIPADST_FLIPADST + fadst8x16_new_sse2, // ADST_FLIPADST + fadst8x16_new_sse2, // FLIPADST_ADST + fidentity8x16_new_sse2, // IDTX + fidentity8x16_new_sse2, // V_DCT + fdct8x16_new_sse2, // H_DCT + fidentity8x16_new_sse2, // V_ADST + fadst8x16_new_sse2, // H_ADST + fidentity8x16_new_sse2, // V_FLIPADST + fadst8x16_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = { + av1_fdct8x32_new_sse2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_new_sse2, // IDTX + fidentity8x32_new_sse2, // V_DCT + av1_fdct8x32_new_sse2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[4], buf1[4], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4]; + const int txw_idx = get_txw_idx(TX_4X4); + const int txh_idx = get_txh_idx(TX_4X4); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm4x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm4x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, 
stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x4(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_4x4(buf, buf); + store_buffer_16bit_to_32bit_w4(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)stride; + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8]; + const int txw_idx = get_txw_idx(TX_4X8); + const int txh_idx = get_txh_idx(TX_4X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm4x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x4(buf, buf); + store_rect_buffer_16bit_to_32bit_w4(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16]; + const int txw_idx = get_txw_idx(TX_4X16); + const int txh_idx = get_txh_idx(TX_4X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x8(buf0, buf1); + transpose_16bit_4x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + 8 * i, buf, width); + } else { + buf = buf1 + 8 * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x4(buf, buf); + store_buffer_16bit_to_32bit_w4(buf, output + 8 * width * i, width, 8); + } +} + +void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4]; + const int txw_idx = get_txw_idx(TX_8X4); + const int txh_idx = get_txh_idx(TX_8X4); + const int 
cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm4x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + else + load_buffer_16bit_to_16bit(input, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + else + load_buffer_16bit_to_16bit(input, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + 
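// The store_rect_* helpers fold in the extra sqrt(2) scaling (multiply by + // NewSqrt2, round-shift by NewSqrt2Bits) that AV1 applies to 2:1 + // rectangular transform sizes such as this 8x16. + 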
store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + } +} + +void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32]; + const int txw_idx = get_txw_idx(TX_8X32); + const int txh_idx = get_txh_idx(TX_8X32); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + transpose_16bit_8x8(buf0 + 16, buf1 + 16); + transpose_16bit_8x8(buf0 + 24, buf1 + 24); + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + } +} + +void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4]; + const int txw_idx = get_txw_idx(TX_16X4); + const int txh_idx = get_txh_idx(TX_16X4); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x4(buf0, buf1 + 8 * i); + } + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_4x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output, width, height); + transpose_16bit_4x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height); +} + +void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 8; + const transform_1d_sse2 col_txfm = 
col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 8 * i); + } + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height); +} + +void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16]; + const int txw_idx = get_txw_idx(TX_16X16); + const int txh_idx = get_txh_idx(TX_16X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i); + } + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width, + 8); + } +} + +void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32]; + const int txw_idx = get_txw_idx(TX_16X32); + const int txh_idx = get_txh_idx(TX_16X32); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + 
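// 32-point column kernels exist only for DCT and identity (see + // col_txfm8x32_arr); other tx_types take the av1_fwd_txfm2d_16x32_c + // fallback below. + 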
col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i); + transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i); + transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i); + } + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, + 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, + width, 8); + } + } else { + av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8]; + const int txw_idx = get_txw_idx(TX_32X8); + const int txh_idx = get_txh_idx(TX_32X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 4; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + } + + for (int i = 0; i < 1; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, + height); + transpose_16bit_8x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width, + height); + transpose_16bit_8x8(buf + 16, buf + 16); + store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16, + width, height); + transpose_16bit_8x8(buf + 24, buf + 24); + store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24, + width, height); + } + } else { + av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16]; + const int txw_idx = get_txw_idx(TX_32X16); + const int txh_idx = get_txh_idx(TX_32X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; 
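+ // get_flip_cfg() reports whether this tx_type needs the input loaded + // bottom-up (ud_flip) or the transposed rows mirrored via flip_buf_sse2 + // (lr_flip) to realize the FLIPADST variants.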
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 4; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i); + } + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, + 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, + width, 8); + transpose_16bit_8x8(buf + 16, buf + 16); + store_rect_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16, + width, 8); + transpose_16bit_8x8(buf + 24, buf + 24); + store_rect_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24, + width, 8); + } + } else { + av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[128]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X32]; + const int txw_idx = get_txw_idx(TX_32X32); + const int txh_idx = get_txh_idx(TX_32X32); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 4; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i); + transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i); + transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i); + } + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width, + 8); + transpose_16bit_8x8(buf + 16, buf + 16); + store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16, + width, 8); + transpose_16bit_8x8(buf + 24, buf + 24); + store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24, + width, 8); + } + } else { + av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output, + 
int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X16; + __m128i buf0[64], buf1[128]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = fdct8x16_new_sse2; + const transform_1d_sse2 row_txfm = av1_fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < height_div8; ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < height_div8; i++) { + __m128i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m128i *buf8 = buf + 8 * j; + transpose_16bit_8x8(buf8, buf8); + store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, 32, 8); + } + } +} + +void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_16X64; + __m128i buf0[64], buf1[128]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; + const transform_1d_sse2 row_txfm = fdct8x16_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < height_div8; ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + int32_t *output8 = output + 8 * width * i; + for (int j = 0; j < width_div8; ++j) { + __m128i *buf8 = buf + 8 * j; + transpose_16bit_8x8(buf8, buf8); + store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, width, 8); + } + } + // Zero out the bottom 16x32 area. 
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform + av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform + NULL, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform + av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform + av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform + av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + + if ((fwd_txfm2d_func == NULL) || + (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + else + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} diff --git a/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.h b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.h new file mode 100644 index 000000000..a0e32f538 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output,
+                           int8_t cos_bit);
+void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output,
+                           int8_t cos_bit);
+
+static INLINE void fidentity4x4_new_sse2(const __m128i *const input,
+                                         __m128i *const output,
+                                         const int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int i = 0; i < 4; ++i) {
+    const __m128i a = _mm_unpacklo_epi16(input[i], one);
+    const __m128i b = scale_round_sse2(a, NewSqrt2);
+    output[i] = _mm_packs_epi32(b, b);
+  }
+}
+
+static INLINE void fidentity8x4_new_sse2(const __m128i *const input,
+                                         __m128i *const output,
+                                         const int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int i = 0; i < 4; ++i) {
+    const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+    const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+    const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
+    const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
+    output[i] = _mm_packs_epi32(b_lo, b_hi);
+  }
+}
+
+static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output,
+                                         int8_t cos_bit) {
+  (void)cos_bit;
+
+  output[0] = _mm_adds_epi16(input[0], input[0]);
+  output[1] = _mm_adds_epi16(input[1], input[1]);
+  output[2] = _mm_adds_epi16(input[2], input[2]);
+  output[3] = _mm_adds_epi16(input[3], input[3]);
+  output[4] = _mm_adds_epi16(input[4], input[4]);
+  output[5] = _mm_adds_epi16(input[5], input[5]);
+  output[6] = _mm_adds_epi16(input[6], input[6]);
+  output[7] = _mm_adds_epi16(input[7], input[7]);
+}
+
+static INLINE void fidentity8x16_new_sse2(const __m128i *input,
+                                          __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int i = 0; i < 16; ++i) {
+    const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+    const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+    const __m128i b_lo = scale_round_sse2(a_lo, 2 * NewSqrt2);
+    const __m128i b_hi = scale_round_sse2(a_hi, 2 * NewSqrt2);
+    output[i] = _mm_packs_epi32(b_lo, b_hi);
+  }
+}
+
+static INLINE void fidentity8x32_new_sse2(const __m128i *input,
+                                          __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  for (int i = 0; i < 32; ++i) {
+    output[i] = _mm_slli_epi16(input[i], 2);
+  }
+}
+
+static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = {
+  av1_fdct8x32_new_sse2,   // DCT_DCT
+  NULL,                    // ADST_DCT
+  NULL,                    // DCT_ADST
+  NULL,                    // ADST_ADST
+  NULL,                    // FLIPADST_DCT
+  NULL,                    // DCT_FLIPADST
+  NULL,                    // FLIPADST_FLIPADST
+  NULL,                    // ADST_FLIPADST
+  NULL,                    // FLIPADST_ADST
+  fidentity8x32_new_sse2,  // IDTX
+  av1_fdct8x32_new_sse2,   // V_DCT
+  fidentity8x32_new_sse2,  // H_DCT
+  NULL,                    // V_ADST
+  NULL,                    // H_ADST
+  NULL,                    // V_FLIPADST
+  NULL                     // H_FLIPADST
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
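[Note: the fidentity*_new_sse2 kernels above implement the identity-"transform" stages, which only rescale so that the 2-D column/row pipeline keeps a uniform norm; the per-size gains are sqrt(N/2) — sqrt(2), 2, 2*sqrt(2), 4 for N = 4, 8, 16, 32. A scalar sketch of the 4-point case, assuming the Q12 fixed-point constants NewSqrt2 = 5793 and NewSqrt2Bits = 12 from av1/common/av1_txfm.h (the helper name below is illustrative, not part of the patch):

  static int16_t fidentity_scale_scalar(int16_t x) {
    // round(x * sqrt(2)) via the Q12 constant 5793 ~= sqrt(2) * 4096; the
    // SSE2 version performs the same multiply-add-and-shift, then saturates
    // to 16 bits through _mm_packs_epi32.
    return (int16_t)((x * 5793 + (1 << 11)) >> 12);
  }
]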
diff --git a/libs/libaom/src/av1/encoder/x86/av1_highbd_quantize_avx2.c b/libs/libaom/src/av1/encoder/x86/av1_highbd_quantize_avx2.c
new file mode 100644
index 000000000..b58911fcb
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc = _mm_unpacklo_epi16(*p, zero);
+  const __m128i ac = _mm_unpackhi_epi16(*p, zero);
+  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static INLINE void update_qp(__m256i *qp) {
+  qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+  qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+  qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+                           const int16_t *dequant_ptr, int log_scale,
+                           __m256i *qp) {
+  __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+  if (log_scale) {
+    const __m128i round_scale = _mm_set1_epi16(1 << (15 - log_scale));
+    round = _mm_mulhrs_epi16(round, round_scale);
+  }
+  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+  init_one_qp(&round, &qp[0]);
+  init_one_qp(&quant, &qp[1]);
+  init_one_qp(&dequant, &qp[2]);
+}
+
+static INLINE void quantize(const __m256i *qp, __m256i *c,
+                            const int16_t *iscan_ptr, int log_scale,
+                            tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                            __m256i *eob) {
+  const __m256i abs_coeff = _mm256_abs_epi32(*c);
+  __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);
+
+  __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
+  __m256i q_hi = _mm256_srli_epi64(q, 32);
+  const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
+  q_hi = _mm256_mul_epi32(q_hi, qp_hi);
+  q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
+  q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
+  q_hi = _mm256_slli_epi64(q_hi, 32);
+  q = _mm256_or_si256(q_lo, q_hi);
+  const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
+  const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
+  q = _mm256_andnot_si256(mask, q);
+
+  __m256i dq = _mm256_mullo_epi32(q, qp[2]);
+  dq = _mm256_srai_epi32(dq, log_scale);
+  q = _mm256_sign_epi32(q, *c);
+  dq = _mm256_sign_epi32(dq, *c);
+
+  _mm256_storeu_si256((__m256i *)qcoeff, q);
+  _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+  const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
+  const __m128i zr = _mm_setzero_si128();
+  const __m128i lo = _mm_unpacklo_epi16(isc, zr);
+  const __m128i hi = _mm_unpackhi_epi16(isc, zr);
+  const __m256i iscan =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
+  const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
+  __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
+  cur_eob = _mm256_and_si256(cur_eob, nz);
+  *eob = _mm256_max_epi32(cur_eob, *eob);
+}
+
+void av1_highbd_quantize_fp_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t
*dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, int log_scale) { + (void)scan; + (void)zbin_ptr; + (void)quant_shift_ptr; + const unsigned int step = 8; + __m256i qp[3], coeff; + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp); + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + + __m256i eob = _mm256_setzero_si256(); + quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + while (n_coeffs > 0) { + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), + _mm256_extractf128_si256(eob, 1)); + *eob_ptr = _mm_extract_epi16(final_eob, 0); + } +} diff --git a/libs/libaom/src/av1/encoder/x86/av1_highbd_quantize_sse4.c b/libs/libaom/src/av1/encoder/x86/av1_highbd_quantize_sse4.c new file mode 100644 index 000000000..40b3b460b --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_highbd_quantize_sse4.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <smmintrin.h>
+#include <stdint.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Coefficient quantization phase 1
+// param[0-3] : round/quant/dequant/dequant-threshold constants
+static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
+                                         const int shift, const int scale,
+                                         __m128i *qcoeff, __m128i *dquan,
+                                         __m128i *sign) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi32(1);
+
+  *sign = _mm_cmplt_epi32(*coeff, zero);
+  *sign = _mm_or_si128(*sign, one);
+  *coeff = _mm_abs_epi32(*coeff);
+
+  qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
+  qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
+  qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);
+
+  qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
+  qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
+  dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
+  dquan[0] = _mm_srli_epi64(dquan[0], scale);
+  const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale);
+  qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]);
+}
+
+// Coefficient quantization phase 2
+static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
+                                         const __m128i *sign,
+                                         const __m128i *param, const int shift,
+                                         const int scale, tran_low_t *qAddr,
+                                         tran_low_t *dqAddr) {
+  __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
+  __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
+
+  qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
+  qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
+  dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
+  dquan[1] = _mm_srli_epi64(dquan[1], scale);
+
+  // combine L&H
+  qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
+  qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
+
+  qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
+  qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
+
+  dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
+  dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
+
+  dquan[0] = _mm_and_si128(dquan[0], mask0H);
+  dquan[1] = _mm_and_si128(dquan[1], mask0L);
+
+  qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
+  dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
+
+  qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
+  dquan[0] = _mm_sign_epi32(dquan[0], *sign);
+  qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]);
+  dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]);
+  _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
+  _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
+}
+
+static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
+                            __m128i *eob) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i mask, iscanIdx;
+  const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
+  const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
+  __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero);
+  __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero);
+
+  nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero);
+  nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero);
+
+  mask = _mm_packs_epi32(nz_flag0, nz_flag1);
+  iscanIdx = _mm_loadu_si128((__m128i const *)iscan);
+  iscanIdx = _mm_sub_epi16(iscanIdx, mask);
+  iscanIdx = _mm_and_si128(iscanIdx, mask);
+  *eob = _mm_max_epi16(*eob, iscanIdx);
+}
+
+static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
+  __m128i eob_shuffled;
+  uint16_t eobValue;
+  eob_shuffled = _mm_shuffle_epi32(*eob, 0xe);
+  *eob = _mm_max_epi16(*eob, eob_shuffled);
+  eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe);
+  *eob = _mm_max_epi16(*eob, eob_shuffled);
+  eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1);
+  *eob = _mm_max_epi16(*eob, eob_shuffled);
+  eobValue = _mm_extract_epi16(*eob, 0);
+  return
eobValue; +} + +void av1_highbd_quantize_fp_sse4_1( + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, int log_scale) { + __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign; + __m128i eob = _mm_setzero_si128(); + const tran_low_t *src = coeff_ptr; + tran_low_t *quanAddr = qcoeff_ptr; + tran_low_t *dquanAddr = dqcoeff_ptr; + const int shift = 16 - log_scale; + const int coeff_stride = 4; + const int quan_stride = coeff_stride; + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)scan; + + memset(quanAddr, 0, count * sizeof(quanAddr[0])); + memset(dquanAddr, 0, count * sizeof(dquanAddr[0])); + + coeff[0] = _mm_loadu_si128((__m128i const *)src); + const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale); + const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale); + + qparam[0] = _mm_set_epi32(round1, round1, round1, round0); + qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]); + qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]); + qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1], + dequant_ptr[0]); + + // DC and first 3 AC + quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + + // update round/quan/dquan for AC + qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); + qparam[1] = xx_set1_64_from_32i(quant_ptr[1]); + qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]); + qparam[3] = _mm_set1_epi32(dequant_ptr[1]); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, + quanAddr, dquanAddr); + + // next 4 AC + coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); + quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, + quanAddr + quan_stride, dquanAddr + quan_stride); + + find_eob(quanAddr, iscan, &eob); + + count -= 8; + + // loop for the rest of AC + while (count > 0) { + src += coeff_stride << 1; + quanAddr += quan_stride << 1; + dquanAddr += quan_stride << 1; + iscan += quan_stride << 1; + + coeff[0] = _mm_loadu_si128((__m128i const *)src); + coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); + + quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, + log_scale, quanAddr, dquanAddr); + + quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, + log_scale, quanAddr + quan_stride, + dquanAddr + quan_stride); + + find_eob(quanAddr, iscan, &eob); + + count -= 8; + } + *eob_ptr = get_accumulated_eob(&eob); +} diff --git a/libs/libaom/src/av1/encoder/x86/av1_quantize_avx2.c b/libs/libaom/src/av1/encoder/x86/av1_quantize_avx2.c new file mode 100644 index 000000000..f5f7ee115 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_quantize_avx2.c @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void read_coeff(const tran_low_t *coeff, __m256i *c) {
+  if (sizeof(tran_low_t) == 4) {
+    const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff);
+    const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1);
+    *c = _mm256_packs_epi32(x0, x1);
+    *c = _mm256_permute4x64_epi64(*c, 0xD8);
+  } else {
+    *c = _mm256_loadu_si256((const __m256i *)coeff);
+  }
+}
+
+static INLINE void write_zero(tran_low_t *qcoeff) {
+  const __m256i zero = _mm256_setzero_si256();
+  if (sizeof(tran_low_t) == 4) {
+    _mm256_storeu_si256((__m256i *)qcoeff, zero);
+    _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
+  } else {
+    _mm256_storeu_si256((__m256i *)qcoeff, zero);
+  }
+}
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i ac = _mm_unpackhi_epi64(*p, *p);
+  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(*p), ac, 1);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+                           const int16_t *dequant_ptr, int log_scale,
+                           __m256i *thr, __m256i *qp) {
+  __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+  if (log_scale > 0) {
+    const __m128i rnd = _mm_set1_epi16((int16_t)1 << (log_scale - 1));
+    round = _mm_add_epi16(round, rnd);
+    round = _mm_srai_epi16(round, log_scale);
+  }
+
+  init_one_qp(&round, &qp[0]);
+  init_one_qp(&quant, &qp[1]);
+
+  if (log_scale == 1) {
+    qp[1] = _mm256_slli_epi16(qp[1], log_scale);
+  }
+
+  init_one_qp(&dequant, &qp[2]);
+  *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+}
+
+static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) {
+  qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+  qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+  qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+  *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+}
+
+#define store_quan(q, addr)                               \
+  do {                                                    \
+    __m256i sign_bits = _mm256_srai_epi16(q, 15);         \
+    __m256i y0 = _mm256_unpacklo_epi16(q, sign_bits);     \
+    __m256i y1 = _mm256_unpackhi_epi16(q, sign_bits);     \
+    __m256i x0 = _mm256_permute2x128_si256(y0, y1, 0x20); \
+    __m256i x1 = _mm256_permute2x128_si256(y0, y1, 0x31); \
+    _mm256_storeu_si256((__m256i *)addr, x0);             \
+    _mm256_storeu_si256((__m256i *)addr + 1, x1);         \
+  } while (0)
+
+#define store_two_quan(q, addr1, dq, addr2)      \
+  do {                                           \
+    if (sizeof(tran_low_t) == 4) {               \
+      store_quan(q, addr1);                      \
+      store_quan(dq, addr2);                     \
+    } else {                                     \
+      _mm256_storeu_si256((__m256i *)addr1, q);  \
+      _mm256_storeu_si256((__m256i *)addr2, dq); \
+    }                                            \
+  } while (0)
+
+static INLINE uint16_t quant_gather_eob(__m256i eob) {
+  const __m128i eob_lo = _mm256_castsi256_si128(eob);
+  const __m128i eob_hi = _mm256_extractf128_si256(eob, 1);
+  __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi);
+  eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s);
+  eob_s = _mm_minpos_epu16(eob_s);
+  return INT16_MAX - _mm_extract_epi16(eob_s, 0);
+}
+
+static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
+                            const int16_t
*iscan_ptr, tran_low_t *qcoeff, + tran_low_t *dqcoeff, __m256i *eob) { + const __m256i abs_coeff = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr)); + const int nzflag = _mm256_movemask_epi8(mask); + + if (nzflag) { + __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]); + q = _mm256_mulhi_epi16(q, qp[1]); + q = _mm256_sign_epi16(q, *c); + const __m256i dq = _mm256_mullo_epi16(q, qp[2]); + + store_two_quan(q, qcoeff, dq, dqcoeff); + const __m256i zero = _mm256_setzero_si256(); + const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr); + const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero); + const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero); + __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff); + cur_eob = _mm256_and_si256(cur_eob, nzero_coeff); + *eob = _mm256_max_epi16(*eob, cur_eob); + } else { + write_zero(qcoeff); + write_zero(dqcoeff); + } +} + +static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr, + __m256i *coeff256) { + const __m256i iscan = _mm256_loadu_si256(iscan_ptr); + const __m256i zero256 = _mm256_setzero_si256(); + const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256); + const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256); + // Add one to convert from indices to counts + const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0); + return _mm256_and_si256(iscan_plus_one, nzero_coeff0); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} + +static INLINE void store_zero_tran_low(int16_t *a) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)(a), zero); +} + +void av1_quantize_lp_avx2(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan) { + __m128i eob; + __m256i round256, quant256, dequant256; + __m256i eob256, thr256; + + coeff_ptr += n_coeffs; + scan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + { + __m256i coeff256; + + // Setup global values + { + const __m128i round = _mm_load_si128((const __m128i *)round_ptr); + const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr); + round256 = _mm256_castsi128_si256(round); + round256 = _mm256_permute4x64_epi64(round256, 0x54); + + quant256 = _mm256_castsi128_si256(quant); + quant256 = _mm256_permute4x64_epi64(quant256, 0x54); + + dequant256 = _mm256_castsi128_si256(dequant); + dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54); + } + + { + __m256i qcoeff256; + __m256i qtmp256; + coeff256 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + n_coeffs)); + qcoeff256 = _mm256_abs_epi16(coeff256); + qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); + qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); + qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); + _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff256); + coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); + _mm256_storeu_si256((__m256i 
*)(dqcoeff_ptr + n_coeffs), coeff256); + } + + eob256 = scan_eob_256((const __m256i *)(scan + n_coeffs), &coeff256); + n_coeffs += 8 * 2; + } + + // remove dc constants + dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31); + quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31); + round256 = _mm256_permute2x128_si256(round256, round256, 0x31); + + thr256 = _mm256_srai_epi16(dequant256, 1); + + // AC only loop + while (n_coeffs < 0) { + __m256i coeff256 = + _mm256_loadu_si256((const __m256i *)(coeff_ptr + n_coeffs)); + __m256i qcoeff256 = _mm256_abs_epi16(coeff256); + int32_t nzflag = + _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256)); + + if (nzflag) { + __m256i qtmp256; + qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); + qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); + qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); + _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff256); + coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), coeff256); + eob256 = _mm256_max_epi16( + eob256, scan_eob_256((const __m256i *)(scan + n_coeffs), &coeff256)); + } else { + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + } + n_coeffs += 8 * 2; + } + + eob = _mm_max_epi16(_mm256_castsi256_si128(eob256), + _mm256_extracti128_si256(eob256, 1)); + + *eob_ptr = accumulate_eob(eob); +} + +void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + const unsigned int step = 16; + + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 0; + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); + + __m256i eob = _mm256_setzero_si256(); + quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(log_scale, &thr, qp); + + while (n_coeffs > 0) { + read_coeff(coeff_ptr, &coeff); + quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + } + *eob_ptr = quant_gather_eob(eob); +} + +static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp, + __m256i *c, const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) { + const __m256i abs_coeff = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr)); + const int nzflag = _mm256_movemask_epi8(mask); + + if (nzflag) { + __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]); + q = _mm256_mulhi_epu16(q, qp[1]); + + __m256i dq = _mm256_mullo_epi16(q, qp[2]); + dq = _mm256_srli_epi16(dq, 1); + + q = _mm256_sign_epi16(q, *c); + dq = _mm256_sign_epi16(dq, *c); + + store_two_quan(q, qcoeff, dq, dqcoeff); + const __m256i zero = _mm256_setzero_si256(); + const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr); + const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero); + const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero); + __m256i cur_eob 
= _mm256_sub_epi16(iscan, nzero_coeff); + cur_eob = _mm256_and_si256(cur_eob, nzero_coeff); + *eob = _mm256_max_epi16(*eob, cur_eob); + } else { + write_zero(qcoeff); + write_zero(dqcoeff); + } +} + +void av1_quantize_fp_32x32_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + const unsigned int step = 16; + + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 1; + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); + + __m256i eob = _mm256_setzero_si256(); + quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(log_scale, &thr, qp); + + while (n_coeffs > 0) { + read_coeff(coeff_ptr, &coeff); + quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + } + *eob_ptr = quant_gather_eob(eob); +} + +static INLINE void quantize_64x64(const __m256i *thr, const __m256i *qp, + __m256i *c, const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) { + const __m256i abs_coeff = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr)); + const int nzflag = _mm256_movemask_epi8(mask); + + if (nzflag) { + __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]); + __m256i qh = _mm256_mulhi_epi16(q, qp[1]); + __m256i ql = _mm256_mullo_epi16(q, qp[1]); + qh = _mm256_slli_epi16(qh, 2); + ql = _mm256_srli_epi16(ql, 14); + q = _mm256_or_si256(qh, ql); + const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(q, qp[2]), 14); + const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(q, qp[2]), 2); + __m256i dq = _mm256_or_si256(dqh, dql); + + q = _mm256_sign_epi16(q, *c); + dq = _mm256_sign_epi16(dq, *c); + + store_two_quan(q, qcoeff, dq, dqcoeff); + const __m256i zero = _mm256_setzero_si256(); + const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr); + const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero); + const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero); + __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff); + cur_eob = _mm256_and_si256(cur_eob, nzero_coeff); + *eob = _mm256_max_epi16(*eob, cur_eob); + } else { + write_zero(qcoeff); + write_zero(dqcoeff); + } +} + +void av1_quantize_fp_64x64_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + const unsigned int step = 16; + + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 2; + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); + + __m256i eob = _mm256_setzero_si256(); + quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + 
qcoeff_ptr += step;
+  dqcoeff_ptr += step;
+  iscan_ptr += step;
+  n_coeffs -= step;
+
+  update_qp(log_scale, &thr, qp);
+
+  while (n_coeffs > 0) {
+    read_coeff(coeff_ptr, &coeff);
+    quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan_ptr += step;
+    n_coeffs -= step;
+  }
+  *eob_ptr = quant_gather_eob(eob);
+}
diff --git a/libs/libaom/src/av1/encoder/x86/av1_quantize_sse2.c b/libs/libaom/src/av1/encoder/x86/av1_quantize_sse2.c
new file mode 100644
index 000000000..5497c7eb7
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/av1_quantize_sse2.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+                              __m128i *c0, __m128i *c1) {
+  const tran_low_t *addr = coeff + offset;
+  if (sizeof(tran_low_t) == 4) {
+    const __m128i x0 = _mm_load_si128((const __m128i *)addr);
+    const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1);
+    const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2);
+    const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3);
+    *c0 = _mm_packs_epi32(x0, x1);
+    *c1 = _mm_packs_epi32(x2, x3);
+  } else {
+    *c0 = _mm_load_si128((const __m128i *)addr);
+    *c1 = _mm_load_si128((const __m128i *)addr + 1);
+  }
+}
+
+static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1,
+                                tran_low_t *qcoeff, intptr_t offset) {
+  tran_low_t *addr = qcoeff + offset;
+  if (sizeof(tran_low_t) == 4) {
+    const __m128i zero = _mm_setzero_si128();
+    __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero);
+    __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits);
+    __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits);
+    _mm_store_si128((__m128i *)addr, y0);
+    _mm_store_si128((__m128i *)addr + 1, y1);
+
+    sign_bits = _mm_cmplt_epi16(*qc1, zero);
+    y0 = _mm_unpacklo_epi16(*qc1, sign_bits);
+    y1 = _mm_unpackhi_epi16(*qc1, sign_bits);
+    _mm_store_si128((__m128i *)addr + 2, y0);
+    _mm_store_si128((__m128i *)addr + 3, y1);
+  } else {
+    _mm_store_si128((__m128i *)addr, *qc0);
+    _mm_store_si128((__m128i *)addr + 1, *qc1);
+  }
+}
+
+static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) {
+  const __m128i zero = _mm_setzero_si128();
+  tran_low_t *addr = qcoeff + offset;
+  if (sizeof(tran_low_t) == 4) {
+    _mm_store_si128((__m128i *)addr, zero);
+    _mm_store_si128((__m128i *)addr + 1, zero);
+    _mm_store_si128((__m128i *)addr + 2, zero);
+    _mm_store_si128((__m128i *)addr + 3, zero);
+  } else {
+    _mm_store_si128((__m128i *)addr, zero);
+    _mm_store_si128((__m128i *)addr + 1, zero);
+  }
+}
+
+static INLINE void quantize(const int16_t *iscan_ptr,
+                            const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const __m128i *round0, const __m128i *round1,
+                            const __m128i *quant0, const __m128i *quant1,
+                            const __m128i *dequant0, const __m128i *dequant1,
+                            const __m128i *thr0, const __m128i *thr1,
+                            __m128i *eob) {
+  __m128i
coeff0, coeff1; + // Do DC and first 15 AC + read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); + + // Poor man's sign extract + const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15); + const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15); + __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0), + _mm_cmpeq_epi16(qcoeff0, *thr0)); + const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1), + _mm_cmpeq_epi16(qcoeff1, *thr1)); + const int nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1); + + if (nzflag) { + qcoeff0 = _mm_adds_epi16(qcoeff0, *round0); + qcoeff1 = _mm_adds_epi16(qcoeff1, *round1); + const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0); + const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); + + coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0); + coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1); + + write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); + + const __m128i zero = _mm_setzero_si128(); + // Scan for eob + const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + const __m128i iscan0 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + const __m128i iscan1 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0); + const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1); + const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0); + const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1); + const __m128i eob2 = _mm_max_epi16(eob0, eob1); + *eob = _mm_max_epi16(*eob, eob2); + } else { + write_zero(qcoeff_ptr, n_coeffs); + write_zero(dqcoeff_ptr, n_coeffs); + } +} + +void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + + coeff_ptr += n_coeffs; + iscan_ptr += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr); + const __m128i round1 = _mm_unpackhi_epi64(round0, round0); + const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0); + const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr); + const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0); + const __m128i thr0 = _mm_srai_epi16(dequant0, 1); + const __m128i thr1 = _mm_srai_epi16(dequant1, 1); + __m128i eob = _mm_setzero_si128(); + + quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0, + &round1, &quant0, 
&quant1, &dequant0, &dequant1, &thr0, &thr1, &eob); + + n_coeffs += 8 * 2; + + // AC only loop + while (n_coeffs < 0) { + quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1, + &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1, + &eob); + n_coeffs += 8 * 2; + } + + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); + } +} diff --git a/libs/libaom/src/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm b/libs/libaom/src/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm new file mode 100644 index 000000000..ad4ae274e --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm @@ -0,0 +1,204 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%define private_prefix av1 + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 + +SECTION .text + +%macro QUANTIZE_FP 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, \ + eob, scan, iscan + cmp dword skipm, 0 + jne .blank + + ; actual quantize loop - setup pointers, rounders, etc. 
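+  ; (Setup below: m1 = round, m2 = quant, m3 = dequant, with the DC constant
+  ; in the low half of each register; punpckhqdq later broadcasts the AC
+  ; constants. For the fp_32x32 variant, round becomes (round + 1) >> 1 and
+  ; quant is doubled -- the same log_scale == 1 adjustment the C code makes.)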
+ movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, dequantmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant +%ifidn %1, fp_32x32 + pcmpeqw m5, m5 + psrlw m5, 15 + paddw m1, m5 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + mova m3, [r2q] ; m3 = dequant + mov r3, qcoeffmp + mov r4, dqcoeffmp + mov r5, iscanmp +%ifidn %1, fp_32x32 + psllw m2, 1 +%endif + pxor m5, m5 ; m5 = dedicated zero + + lea coeffq, [ coeffq+ncoeffq*2] + lea r5q, [ r5q+ncoeffq*2] + lea r3q, [ r3q+ncoeffq*2] + lea r4q, [r4q+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpeqw m7, m7 + + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + mova [r3q+ncoeffq*2+ 0], m8 + mova [r3q+ncoeffq*2+16], m13 +%ifidn %1, fp_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; r4[i] = r3[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; r4[i] = r3[i] * q +%ifidn %1, fp_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 + psrlw m0, m3, 2 +%else + psrlw m0, m3, 1 +%endif + mova [r4q+ncoeffq*2+ 0], m8 + mova [r4q+ncoeffq*2+16], m13 + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m7 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + jz .accumulate_eob + +.ac_only_loop: + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + + pcmpgtw m7, m6, m0 + pcmpgtw m12, m11, m0 + pmovmskb r6d, m7 + pmovmskb r2d, m12 + + or r6, r2 + jz .skip_iter + + pcmpeqw m7, m7 + + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + mova [r3q+ncoeffq*2+ 0], m14 + mova [r3q+ncoeffq*2+16], m13 +%ifidn %1, fp_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; r4[i] = r3[i] * q + pmullw m13, m3 ; r4[i] = r3[i] * q +%ifidn %1, fp_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif + mova [r4q+ncoeffq*2+ 0], m14 + mova [r4q+ncoeffq*2+16], m13 + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m7 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jl .ac_only_loop + + jmp .accumulate_eob +.skip_iter: + mova [r3q+ncoeffq*2+ 0], m5 + mova [r3q+ncoeffq*2+16], m5 + mova [r4q+ncoeffq*2+ 0], m5 + mova [r4q+ncoeffq*2+16], m5 + add ncoeffq, mmsize + jl .ac_only_loop + +.accumulate_eob: + ; horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw 
m8, m7 + pextrw r6, m8, 0 + mov [r2], r6 + RET + + ; skip-block, i.e. just write all zeroes +.blank: + mov r0, dqcoeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, qcoeffmp + mov r3, eobmp + + lea r0q, [r0q+ncoeffq*2] + lea r2q, [r2q+ncoeffq*2] + neg ncoeffq + pxor m7, m7 +.blank_loop: + mova [r0q+ncoeffq*2+ 0], m7 + mova [r0q+ncoeffq*2+16], m7 + mova [r2q+ncoeffq*2+ 0], m7 + mova [r2q+ncoeffq*2+16], m7 + add ncoeffq, mmsize + jl .blank_loop + mov word [r3q], 0 + RET +%endmacro + +INIT_XMM ssse3 +QUANTIZE_FP fp, 7 +QUANTIZE_FP fp_32x32, 7 diff --git a/libs/libaom/src/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/libs/libaom/src/av1/encoder/x86/av1_ssim_opt_x86_64.asm new file mode 100644 index 000000000..faa2a232a --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_ssim_opt_x86_64.asm @@ -0,0 +1,222 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr +%macro TABULATE_SSIM 0 + paddusw xmm15, xmm3 ; sum_s + paddusw xmm14, xmm4 ; sum_r + movdqa xmm1, xmm3 + pmaddwd xmm1, xmm1 + paddd xmm13, xmm1 ; sum_sq_s + movdqa xmm2, xmm4 + pmaddwd xmm2, xmm2 + paddd xmm12, xmm2 ; sum_sq_r + pmaddwd xmm3, xmm4 + paddd xmm11, xmm3 ; sum_sxr +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_Q 1 + movdqa xmm2,%1 + punpckldq %1,xmm0 + punpckhdq xmm2,xmm0 + paddq %1,xmm2 + movdqa xmm2,%1 + punpcklqdq %1,xmm0 + punpckhqdq xmm2,xmm0 + paddq %1,xmm2 +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_W 1 + movdqa xmm1, %1 + punpcklwd %1,xmm0 + punpckhwd xmm1,xmm0 + paddd %1, xmm1 + SUM_ACROSS_Q %1 +%endmacro + +SECTION .text + +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. 
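+; Scalar equivalent of the kernel below (an orientation sketch, not part of
+; the patch): for every pixel pair (s, r) in the window it accumulates
+;   *sum_s += s;          *sum_r += r;
+;   *sum_sq_s += s * s;   *sum_sq_r += r * r;
+;   *sum_sxr += s * r;
+; which is exactly what TABULATE_SSIM does for each batch of eight pixels.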
+global sym(av1_ssim_parms_16x16_sse2) PRIVATE +sym(av1_ssim_parms_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 16 ;row counter +.NextRow: + + ;grab source and reference pixels + movdqu xmm5, [rsi] + movdqu xmm6, [rdi] + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpckhbw xmm3, xmm0 ; high_s + punpckhbw xmm4, xmm0 ; high_r + + TABULATE_SSIM + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. +global sym(av1_ssim_parms_8x8_sse2) PRIVATE +sym(av1_ssim_parms_8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 8 ;row counter +.NextRow: + + ;grab source and reference pixels + movq xmm3, [rsi] + movq xmm4, [rdi] + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/libs/libaom/src/av1/encoder/x86/av1_txfm1d_sse4.h b/libs/libaom/src/av1/encoder/x86/av1_txfm1d_sse4.h new file mode 100644 index 000000000..7a0f32898 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_txfm1d_sse4.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+#define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+
+#include <smmintrin.h>
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_sse4_1(const __m128i *input, __m128i *output,
+                      const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct8_sse4_1(const __m128i *input, __m128i *output,
+                      const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct16_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+                       const int stride);
+void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
+                       const int instride, const int outstride);
+void av1_fadst4_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst8_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst16_sse4_1(const __m128i *input, __m128i *output,
+                        const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idct4_sse4_1(const __m128i *input, __m128i *output,
+                      const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct8_sse4_1(const __m128i *input, __m128i *output,
+                      const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct16_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct32_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct64_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst8_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst16_sse4_1(const __m128i *input, __m128i *output,
+                        const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+                       const int col_num);
+
+static INLINE void transpose_32_4x4(int stride, const __m128i *input,
+                                    __m128i *output) {
+  __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
+  __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
+  __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
+  __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
+
+  output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+  output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+  output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+  output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+// The entire input block can be represented as a grid of 4x4 blocks;
+// each 4x4 block can be represented by 4 vertical __m128i vectors.
+// We first transpose each 4x4 block internally,
+// then transpose the grid.
+static INLINE void transpose_32(int txfm_size, const __m128i *input,
+                                __m128i *output) {
+  const int num_per_128 = 4;
+  const int row_size = txfm_size;
+  const int col_size = txfm_size / num_per_128;
+  int r, c;
+
+  // transpose each 4x4 block internally
+  for (r = 0; r < row_size; r += 4) {
+    for (c = 0; c < col_size; c++) {
+      transpose_32_4x4(col_size, &input[r * col_size + c],
+                       &output[c * 4 * col_size + r / 4]);
+    }
+  }
+}
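+
+/* Usage sketch (illustrative, not part of the patch): a 32-bit 8x8 tile
+   occupies txfm_size * txfm_size / 4 = 16 __m128i registers, row-major with
+   col_size = 2 registers per row, and is transposed between two buffers:
+
+     __m128i in[16], out[16];
+     // ... fill `in` row-major, four lanes per register ...
+     transpose_32(8, in, out);  // `out` now holds the transposed tile
+*/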
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
+  do {                                                         \
+    const __m128i ww0 = _mm_set1_epi32(w0);                    \
+    const __m128i ww1 = _mm_set1_epi32(w1);                    \
+    const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0);          \
+    const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1);          \
+    out0 = _mm_add_epi32(in0_w0, in1_w1);                      \
+    out0 = av1_round_shift_32_sse4_1(out0, bit);               \
+    const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1);          \
+    const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0);          \
+    out1 = _mm_sub_epi32(in0_w1, in1_w0);                      \
+    out1 = av1_round_shift_32_sse4_1(out1, bit);               \
+  } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
+  do {                                                         \
+    btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit);    \
+  } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+  do {                                                                  \
+    const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0);                   \
+    const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1);                   \
+    out0 = _mm_add_epi32(in0_w0, in1_w1);                               \
+    out0 = _mm_add_epi32(out0, r);                                      \
+    out0 = _mm_srai_epi32(out0, bit);                                   \
+    const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1);                   \
+    const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0);                   \
+    out1 = _mm_sub_epi32(in0_w1, in1_w0);                               \
+    out1 = _mm_add_epi32(out1, r);                                      \
+    out1 = _mm_srai_epi32(out1, bit);                                   \
+  } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+  do {                                                                  \
+    btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit);    \
+  } while (0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
diff --git a/libs/libaom/src/av1/encoder/x86/corner_match_avx2.c b/libs/libaom/src/av1/encoder/x86/corner_match_avx2.c
new file mode 100644
index 000000000..8d7eb3f03
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/corner_match_avx2.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <immintrin.h>
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "av1/encoder/corner_match.h"
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255,
+                                   255, 255, 255, 255, 255, 0, 0, 0 };
+#if MATCH_SZ != 13
+#error "Need to change byte_mask in corner_match_avx2.c if MATCH_SZ != 13"
+#endif
+
+/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the
+correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
+of each image, centered at (x1, y1) and (x2, y2) respectively.
+*/
+double av1_compute_cross_correlation_avx2(unsigned char *im1, int stride1,
+                                          int x1, int y1, unsigned char *im2,
+                                          int stride2, int x2, int y2) {
+  int i, stride1_i = 0, stride2_i = 0;
+  __m256i temp1, sum_vec, sumsq2_vec, cross_vec, v, v1_1, v2_1;
+  const __m128i mask = _mm_load_si128((__m128i *)byte_mask);
+  const __m256i zero = _mm256_setzero_si256();
+  __m128i v1, v2;
+
+  sum_vec = zero;
+  sumsq2_vec = zero;
+  cross_vec = zero;
+
+  im1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2);
+  im2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2);
+
+  for (i = 0; i < MATCH_SZ; ++i) {
+    v1 = _mm_and_si128(_mm_loadu_si128((__m128i *)&im1[stride1_i]), mask);
+    v1_1 = _mm256_cvtepu8_epi16(v1);
+    v2 = _mm_and_si128(_mm_loadu_si128((__m128i *)&im2[stride2_i]), mask);
+    v2_1 = _mm256_cvtepu8_epi16(v2);
+
+    v = _mm256_insertf128_si256(_mm256_castsi128_si256(v1), v2, 1);
+    sumsq2_vec = _mm256_add_epi32(sumsq2_vec, _mm256_madd_epi16(v2_1, v2_1));
+
+    sum_vec = _mm256_add_epi16(sum_vec, _mm256_sad_epu8(v, zero));
+    cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1_1, v2_1));
+    stride1_i += stride1;
+    stride2_i += stride2;
+  }
+  __m256i sum_vec1 = _mm256_srli_si256(sum_vec, 8);
+  sum_vec = _mm256_add_epi32(sum_vec, sum_vec1);
+  int sum1_acc = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_vec));
+  int sum2_acc = _mm256_extract_epi32(sum_vec, 4);
+
+  __m256i unp_low = _mm256_unpacklo_epi64(sumsq2_vec, cross_vec);
+  __m256i unp_hig = _mm256_unpackhi_epi64(sumsq2_vec, cross_vec);
+  temp1 = _mm256_add_epi32(unp_low, unp_hig);
+
+  __m128i low_sumsq = _mm256_castsi256_si128(temp1);
+  low_sumsq = _mm_add_epi32(low_sumsq, _mm256_extractf128_si256(temp1, 1));
+  low_sumsq = _mm_add_epi32(low_sumsq, _mm_srli_epi64(low_sumsq, 32));
+  int sumsq2_acc = _mm_cvtsi128_si32(low_sumsq);
+  int cross_acc = _mm_extract_epi32(low_sumsq, 2);
+
+  int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc;
+  int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc;
+  aom_clear_system_state();
+  return cov / sqrt((double)var2);
+}
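[Note: both corner-match kernels compute the same statistic. A scalar sketch, with N = MATCH_SZ * MATCH_SZ (the function name below is hypothetical, not part of the patch):

  double compute_cross_correlation_scalar(const unsigned char *a, int as,
                                          const unsigned char *b, int bs) {
    int sum1 = 0, sum2 = 0, sumsq2 = 0, cross = 0;
    for (int i = 0; i < MATCH_SZ; ++i) {
      for (int j = 0; j < MATCH_SZ; ++j) {
        sum1 += a[i * as + j];
        sum2 += b[i * bs + j];
        sumsq2 += b[i * bs + j] * b[i * bs + j];
        cross += a[i * as + j] * b[i * bs + j];
      }
    }
    const int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;  // N^2 * Var(b)
    const int cov = cross * MATCH_SZ_SQ - sum1 * sum2;    // N^2 * Cov(a, b)
    return cov / sqrt((double)var2);
  }
]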
+double av1_compute_cross_correlation_avx2(unsigned char *im1, int stride1,
+                                          int x1, int y1, unsigned char *im2,
+                                          int stride2, int x2, int y2) {
+  int i, stride1_i = 0, stride2_i = 0;
+  __m256i temp1, sum_vec, sumsq2_vec, cross_vec, v, v1_1, v2_1;
+  const __m128i mask = _mm_load_si128((__m128i *)byte_mask);
+  const __m256i zero = _mm256_setzero_si256();
+  __m128i v1, v2;
+
+  sum_vec = zero;
+  sumsq2_vec = zero;
+  cross_vec = zero;
+
+  im1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2);
+  im2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2);
+
+  for (i = 0; i < MATCH_SZ; ++i) {
+    v1 = _mm_and_si128(_mm_loadu_si128((__m128i *)&im1[stride1_i]), mask);
+    v1_1 = _mm256_cvtepu8_epi16(v1);
+    v2 = _mm_and_si128(_mm_loadu_si128((__m128i *)&im2[stride2_i]), mask);
+    v2_1 = _mm256_cvtepu8_epi16(v2);
+
+    v = _mm256_insertf128_si256(_mm256_castsi128_si256(v1), v2, 1);
+    sumsq2_vec = _mm256_add_epi32(sumsq2_vec, _mm256_madd_epi16(v2_1, v2_1));
+
+    sum_vec = _mm256_add_epi16(sum_vec, _mm256_sad_epu8(v, zero));
+    cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1_1, v2_1));
+    stride1_i += stride1;
+    stride2_i += stride2;
+  }
+  __m256i sum_vec1 = _mm256_srli_si256(sum_vec, 8);
+  sum_vec = _mm256_add_epi32(sum_vec, sum_vec1);
+  int sum1_acc = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_vec));
+  int sum2_acc = _mm256_extract_epi32(sum_vec, 4);
+
+  __m256i unp_low = _mm256_unpacklo_epi64(sumsq2_vec, cross_vec);
+  __m256i unp_hig = _mm256_unpackhi_epi64(sumsq2_vec, cross_vec);
+  temp1 = _mm256_add_epi32(unp_low, unp_hig);
+
+  __m128i low_sumsq = _mm256_castsi256_si128(temp1);
+  low_sumsq = _mm_add_epi32(low_sumsq, _mm256_extractf128_si256(temp1, 1));
+  low_sumsq = _mm_add_epi32(low_sumsq, _mm_srli_epi64(low_sumsq, 32));
+  int sumsq2_acc = _mm_cvtsi128_si32(low_sumsq);
+  int cross_acc = _mm_extract_epi32(low_sumsq, 2);
+
+  int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc;
+  int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc;
+  aom_clear_system_state();
+  return cov / sqrt((double)var2);
+}
diff --git a/libs/libaom/src/av1/encoder/x86/corner_match_sse4.c b/libs/libaom/src/av1/encoder/x86/corner_match_sse4.c
new file mode 100644
index 000000000..5c9ca207e
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/corner_match_sse4.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "av1/encoder/corner_match.h"
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255,
+                                   255, 255, 255, 255, 255, 0, 0, 0 };
+#if MATCH_SZ != 13
+#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13"
+#endif
+
+/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the
+   correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
+   of each image, centered at (x1, y1) and (x2, y2) respectively.
+*/ +double av1_compute_cross_correlation_sse4_1(unsigned char *im1, int stride1, + int x1, int y1, unsigned char *im2, + int stride2, int x2, int y2) { + int i; + // 2 16-bit partial sums in lanes 0, 4 (== 2 32-bit partial sums in lanes 0, + // 2) + __m128i sum1_vec = _mm_setzero_si128(); + __m128i sum2_vec = _mm_setzero_si128(); + // 4 32-bit partial sums of squares + __m128i sumsq2_vec = _mm_setzero_si128(); + __m128i cross_vec = _mm_setzero_si128(); + + const __m128i mask = _mm_load_si128((__m128i *)byte_mask); + const __m128i zero = _mm_setzero_si128(); + + im1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); + im2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); + + for (i = 0; i < MATCH_SZ; ++i) { + const __m128i v1 = + _mm_and_si128(_mm_loadu_si128((__m128i *)&im1[i * stride1]), mask); + const __m128i v2 = + _mm_and_si128(_mm_loadu_si128((__m128i *)&im2[i * stride2]), mask); + + // Using the 'sad' intrinsic here is a bit faster than adding + // v1_l + v1_r and v2_l + v2_r, plus it avoids the need for a 16->32 bit + // conversion step later, for a net speedup of ~10% + sum1_vec = _mm_add_epi16(sum1_vec, _mm_sad_epu8(v1, zero)); + sum2_vec = _mm_add_epi16(sum2_vec, _mm_sad_epu8(v2, zero)); + + const __m128i v1_l = _mm_cvtepu8_epi16(v1); + const __m128i v1_r = _mm_cvtepu8_epi16(_mm_srli_si128(v1, 8)); + const __m128i v2_l = _mm_cvtepu8_epi16(v2); + const __m128i v2_r = _mm_cvtepu8_epi16(_mm_srli_si128(v2, 8)); + + sumsq2_vec = _mm_add_epi32( + sumsq2_vec, + _mm_add_epi32(_mm_madd_epi16(v2_l, v2_l), _mm_madd_epi16(v2_r, v2_r))); + cross_vec = _mm_add_epi32( + cross_vec, + _mm_add_epi32(_mm_madd_epi16(v1_l, v2_l), _mm_madd_epi16(v1_r, v2_r))); + } + + // Now we can treat the four registers (sum1_vec, sum2_vec, sumsq2_vec, + // cross_vec) + // as holding 4 32-bit elements each, which we want to sum horizontally. + // We do this by transposing and then summing vertically. + __m128i tmp_0 = _mm_unpacklo_epi32(sum1_vec, sum2_vec); + __m128i tmp_1 = _mm_unpackhi_epi32(sum1_vec, sum2_vec); + __m128i tmp_2 = _mm_unpacklo_epi32(sumsq2_vec, cross_vec); + __m128i tmp_3 = _mm_unpackhi_epi32(sumsq2_vec, cross_vec); + + __m128i tmp_4 = _mm_unpacklo_epi64(tmp_0, tmp_2); + __m128i tmp_5 = _mm_unpackhi_epi64(tmp_0, tmp_2); + __m128i tmp_6 = _mm_unpacklo_epi64(tmp_1, tmp_3); + __m128i tmp_7 = _mm_unpackhi_epi64(tmp_1, tmp_3); + + __m128i res = + _mm_add_epi32(_mm_add_epi32(tmp_4, tmp_5), _mm_add_epi32(tmp_6, tmp_7)); + + int sum1 = _mm_extract_epi32(res, 0); + int sum2 = _mm_extract_epi32(res, 1); + int sumsq2 = _mm_extract_epi32(res, 2); + int cross = _mm_extract_epi32(res, 3); + + int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; + int cov = cross * MATCH_SZ_SQ - sum1 * sum2; + aom_clear_system_state(); + return cov / sqrt((double)var2); +} diff --git a/libs/libaom/src/av1/encoder/x86/dct_sse2.asm b/libs/libaom/src/av1/encoder/x86/dct_sse2.asm new file mode 100644 index 000000000..b18554818 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/dct_sse2.asm @@ -0,0 +1,82 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +%define private_prefix av1 + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro TRANSFORM_COLS 0 + paddw m0, m1 + movq m4, m0 + psubw m3, m2 + psubw m4, m3 + psraw m4, 1 + movq m5, m4 + psubw m5, m1 ;b1 + psubw m4, m2 ;c1 + psubw m0, m4 + paddw m3, m5 + ; m0 a0 + SWAP 1, 4 ; m1 c1 + SWAP 2, 3 ; m2 d1 + SWAP 3, 5 ; m3 b1 +%endmacro + +%macro TRANSPOSE_4X4 0 + ; 00 01 02 03 + ; 10 11 12 13 + ; 20 21 22 23 + ; 30 31 32 33 + punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13 + punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33 + mova m1, m0 + punpckldq m0, m2 ; 00 10 20 30 01 11 21 31 + punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33 +%endmacro + +INIT_XMM sse2 +cglobal fwht4x4, 3, 4, 8, input, output, stride + lea r3q, [inputq + strideq*4] + movq m0, [inputq] ;a1 + movq m1, [inputq + strideq*2] ;b1 + movq m2, [r3q] ;c1 + movq m3, [r3q + strideq*2] ;d1 + + TRANSFORM_COLS + TRANSPOSE_4X4 + SWAP 1, 2 + psrldq m1, m0, 8 + psrldq m3, m2, 8 + TRANSFORM_COLS + TRANSPOSE_4X4 + + psllw m0, 2 + psllw m1, 2 + + ; sign extension + mova m2, m0 + mova m3, m1 + punpcklwd m0, m0 + punpcklwd m1, m1 + punpckhwd m2, m2 + punpckhwd m3, m3 + psrad m0, 16 + psrad m1, 16 + psrad m2, 16 + psrad m3, 16 + mova [outputq], m0 + mova [outputq + 16], m2 + mova [outputq + 32], m1 + mova [outputq + 48], m3 + + RET diff --git a/libs/libaom/src/av1/encoder/x86/encodetxb_avx2.c b/libs/libaom/src/av1/encoder/x86/encodetxb_avx2.c new file mode 100644 index 000000000..30a412909 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/encodetxb_avx2.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <smmintrin.h>  /* SSE4.1 */
+#include <immintrin.h>  /* AVX2 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
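+/* Editorial note, scalar sketch of the output layout produced below
+   (exposition only, details approximated from the kernel itself): each of
+   the `height` rows holds the saturated absolute coefficient values
+   followed by TX_PAD_HOR zero bytes, with TX_PAD_BOTTOM zeroed rows of
+   padding after the block:
+     for (int r = 0; r < height; r++) {
+       for (int c = 0; c < width; c++)
+         levels[r * stride + c] =
+             (uint8_t)(abs(coeff[r * width + c]) > 127
+                           ? 127
+                           : abs(coeff[r * width + c]));
+       memset(levels + r * stride + width, 0, TX_PAD_HOR);
+     }
+*/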
+void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,
+                              const int height, uint8_t *const levels) {
+  const int stride = width + TX_PAD_HOR;
+  const __m256i y_zeros = _mm256_setzero_si256();
+
+  const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+  uint8_t *bottom_buf_end = levels + (height + TX_PAD_BOTTOM) * stride;
+  uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31));
+
+  do {
+    yy_storeu_256(bottom_buf, y_zeros);
+    bottom_buf += 32;
+  } while (bottom_buf < bottom_buf_end);
+
+  int i = 0;
+  uint8_t *ls = levels;
+  const tran_low_t *cf = coeff;
+  if (width == 4) {
+    do {
+      const __m256i c0 = yy_loadu_256(cf);
+      const __m256i c1 = yy_loadu_256(cf + 8);
+      const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1));
+      const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros);
+      const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8);
+      const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8);
+      yy_storeu_256(ls, res);
+      ls += 32;
+      cf += 16;
+      i += 4;
+    } while (i < height);
+  } else if (width == 8) {
+    do {
+      const __m256i coeffA = yy_loadu_256(cf);
+      const __m256i coeffB = yy_loadu_256(cf + 8);
+      const __m256i coeffC = yy_loadu_256(cf + 16);
+      const __m256i coeffD = yy_loadu_256(cf + 24);
+      const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+      const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+      const __m256i absAB = _mm256_abs_epi16(coeffAB);
+      const __m256i absCD = _mm256_abs_epi16(coeffCD);
+      const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+      const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+      const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+      const __m128i res0 = _mm256_castsi256_si128(res);
+      const __m128i res1 = _mm256_extracti128_si256(res, 1);
+      xx_storel_64(ls, res0);
+      *(int32_t *)(ls + width) = 0;
+      xx_storel_64(ls + stride, _mm_srli_si128(res0, 8));
+      *(int32_t *)(ls + width + stride) = 0;
+      xx_storel_64(ls + stride * 2, res1);
+      *(int32_t *)(ls + width + stride * 2) = 0;
+      xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8));
+      *(int32_t *)(ls + width + stride * 3) = 0;
+      cf += 32;
+      ls += stride << 2;
+      i += 4;
+    } while (i < height);
+  } else if (width == 16) {
+    do {
+      const __m256i coeffA = yy_loadu_256(cf);
+      const __m256i coeffB = yy_loadu_256(cf + 8);
+      const __m256i coeffC = yy_loadu_256(cf + 16);
+      const __m256i coeffD = yy_loadu_256(cf + 24);
+      const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+      const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+      const __m256i absAB = _mm256_abs_epi16(coeffAB);
+      const __m256i absCD = _mm256_abs_epi16(coeffCD);
+      const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+      const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+      const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+      xx_storeu_128(ls, _mm256_castsi256_si128(res));
+      xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1));
+      cf += 32;
+      *(int32_t *)(ls + width) = 0;
+      *(int32_t *)(ls + stride + width) = 0;
+      ls += stride << 1;
+      i += 2;
+    } while (i < height);
+  } else {
+    do {
+      const __m256i coeffA = yy_loadu_256(cf);
+      const __m256i coeffB = yy_loadu_256(cf + 8);
+      const __m256i coeffC = yy_loadu_256(cf + 16);
+      const __m256i coeffD = yy_loadu_256(cf + 24);
+      const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+      const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+      const __m256i absAB = _mm256_abs_epi16(coeffAB);
+      const __m256i absCD = _mm256_abs_epi16(coeffCD);
+      const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+      const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+      const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+      yy_storeu_256(ls, res);
+      cf += 32;
+      *(int32_t *)(ls + width) = 0;
+      ls += stride;
+      i += 1;
+    } while (i < height);
+  }
+}
diff --git a/libs/libaom/src/av1/encoder/x86/encodetxb_sse2.c b/libs/libaom/src/av1/encoder/x86/encodetxb_sse2.c
new file mode 100644
index 000000000..394befb7b
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/encodetxb_sse2.c
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+
+static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src,
+                                          const int stride,
+                                          const ptrdiff_t *const offsets,
+                                          __m128i *const level) {
+  level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride);
+  level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride);
+  level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride);
+  level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride);
+  level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src,
+                                          const int stride,
+                                          const ptrdiff_t *const offsets,
+                                          __m128i *const level) {
+  level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride);
+  level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride);
+  level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride);
+  level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride);
+  level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src,
+                                           const int stride,
+                                           const ptrdiff_t *const offsets,
+                                           __m128i *const level) {
+  level[0] = _mm_loadu_si128((__m128i *)(src + 1));
+  level[1] = _mm_loadu_si128((__m128i *)(src + stride));
+  level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0]));
+  level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1]));
+  level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2]));
+}
+
+static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) {
+  const __m128i const_3 = _mm_set1_epi8(3);
+  const __m128i const_4 = _mm_set1_epi8(4);
+  __m128i count;
+
+  count = _mm_min_epu8(level[0], const_3);
+  level[1] = _mm_min_epu8(level[1], const_3);
+  level[2] = _mm_min_epu8(level[2], const_3);
+  level[3] = _mm_min_epu8(level[3], const_3);
+  level[4] = _mm_min_epu8(level[4], const_3);
+  count = _mm_add_epi8(count, level[1]);
+  count = _mm_add_epi8(count, level[2]);
+  count = _mm_add_epi8(count, level[3]);
+  count = _mm_add_epi8(count, level[4]);
+  count = _mm_avg_epu8(count,
_mm_setzero_si128()); + count = _mm_min_epu8(count, const_4); + return count; +} + +static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *const coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(21); + __m128i pos_to_offset = + (height == 4) + ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21) + : _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, + 21, 21); + __m128i count; + __m128i level[5]; + int8_t *cc = coeff_contexts; + int row = height; + + assert(!(height % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)cc, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + cc += 16; + row -= 4; + } while (row); + + coeff_contexts[0] = 0; +} + +static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(height % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 4 * stride; + coeff_contexts += 16; + row -= 4; + } while (row); +} + +static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(height % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + coeff_contexts += 16; + row -= 4; + } while (row); +} + +static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + int8_t *cc = coeff_contexts; + int row = height; + __m128i count; + __m128i level[5]; + __m128i pos_to_offset[3]; + + assert(!(height % 2)); + + if (height == 
8) { + pos_to_offset[0] = + _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, + 21, 21, 21, 21, 21); + } else if (height < 8) { + pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, + 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, + 21, 21, 21, 21, 21); + } else { + pos_to_offset[0] = _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11); + pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, + 21, 21, 21, 21, 21); + } + pos_to_offset[2] = _mm_set1_epi8(21); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)cc, count); + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += 2 * stride; + cc += 16; + row -= 2; + } while (row); + + coeff_contexts[0] = 0; +} + +static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + const __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + int row = height; + __m128i count; + __m128i level[5]; + + assert(!(height % 2)); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 2 * stride; + coeff_contexts += 16; + row -= 2; + } while (row); +} + +static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5); + int row = height; + __m128i count; + __m128i level[5]; + + assert(!(height % 2)); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 2 * stride; + coeff_contexts += 16; + row -= 2; + } while (row); +} + +static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels, + const int real_width, + const int real_height, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = width + TX_PAD_HOR; + int8_t *cc = 
coeff_contexts; + int row = height; + __m128i pos_to_offset[5]; + __m128i pos_to_offset_large[3]; + __m128i count; + __m128i level[5]; + + assert(!(width % 16)); + + pos_to_offset_large[2] = _mm_set1_epi8(21); + if (real_width == real_height) { + pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] = + pos_to_offset_large[2]; + } else if (real_width > real_height) { + pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8( + 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21); + pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2]; + } else { // real_width < real_height + pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8( + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11); + pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[4] = pos_to_offset_large[2]; + pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(11); + } + + do { + int w = width; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)cc, count); + levels += 16; + cc += 16; + w -= 16; + pos_to_offset[0] = pos_to_offset_large[0]; + } while (w); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + pos_to_offset[2] = pos_to_offset[3]; + pos_to_offset[3] = pos_to_offset[4]; + pos_to_offset_large[0] = pos_to_offset_large[1]; + pos_to_offset_large[1] = pos_to_offset_large[2]; + levels += TX_PAD_HOR; + } while (--row); + + coeff_contexts[0] = 0; +} + +static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = width + TX_PAD_HOR; + const __m128i pos_to_offset_large = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(width % 16)); + + do { + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, 
SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + int w = width; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 16; + coeff_contexts += 16; + w -= 16; + } while (w); + + levels += TX_PAD_HOR; + } while (--row); +} + +static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = width + TX_PAD_HOR; + __m128i pos_to_offset[3]; + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(width % 16)); + + pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0); + pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5); + pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + + do { + int w = width; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 16; + coeff_contexts += 16; + w -= 16; + } while (w); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += TX_PAD_HOR; + } while (--row); +} + +// Note: levels[] must be in the range [0, 127], inclusive. +void av1_get_nz_map_contexts_sse2(const uint8_t *const levels, + const int16_t *const scan, const uint16_t eob, + const TX_SIZE tx_size, + const TX_CLASS tx_class, + int8_t *const coeff_contexts) { + const int last_idx = eob - 1; + if (!last_idx) { + coeff_contexts[0] = 0; + return; + } + + const int real_width = tx_size_wide[tx_size]; + const int real_height = tx_size_high[tx_size]; + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const int stride = width + TX_PAD_HOR; + ptrdiff_t offsets[3]; + + /* coeff_contexts must be 16 byte aligned. 
*/ + assert(!((intptr_t)coeff_contexts & 0xf)); + + if (tx_class == TX_CLASS_2D) { + offsets[0] = 0 * stride + 2; + offsets[1] = 1 * stride + 1; + offsets[2] = 2 * stride + 0; + + if (width == 4) { + get_4_nz_map_contexts_2d(levels, height, offsets, coeff_contexts); + } else if (width == 8) { + get_8_coeff_contexts_2d(levels, height, offsets, coeff_contexts); + } else if (width == 16) { + get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, + offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, + offsets, coeff_contexts); + } + } else if (tx_class == TX_CLASS_HORIZ) { + offsets[0] = 2; + offsets[1] = 3; + offsets[2] = 4; + if (width == 4) { + get_4_nz_map_contexts_hor(levels, height, offsets, coeff_contexts); + } else if (width == 8) { + get_8_coeff_contexts_hor(levels, height, offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_hor(levels, width, height, offsets, + coeff_contexts); + } + } else { // TX_CLASS_VERT + offsets[0] = 2 * stride; + offsets[1] = 3 * stride; + offsets[2] = 4 * stride; + if (width == 4) { + get_4_nz_map_contexts_ver(levels, height, offsets, coeff_contexts); + } else if (width == 8) { + get_8_coeff_contexts_ver(levels, height, offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_ver(levels, width, height, offsets, + coeff_contexts); + } + } + + const int bwl = get_txb_bwl(tx_size); + const int pos = scan[last_idx]; + if (last_idx <= (height << bwl) / 8) + coeff_contexts[pos] = 1; + else if (last_idx <= (height << bwl) / 4) + coeff_contexts[pos] = 2; + else + coeff_contexts[pos] = 3; +} diff --git a/libs/libaom/src/av1/encoder/x86/encodetxb_sse4.c b/libs/libaom/src/av1/encoder/x86/encodetxb_sse4.c new file mode 100644 index 000000000..aeb57f2cd --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/encodetxb_sse4.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <smmintrin.h>  /* SSE4.1 */
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
+                                const int height, uint8_t *const levels) {
+  const int stride = width + TX_PAD_HOR;
+  const __m128i zeros = _mm_setzero_si128();
+
+  const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+  uint8_t *bottom_buf = levels + stride * height;
+  uint8_t *bottom_buf_end = bottom_buf + bottom_len;
+  do {
+    _mm_storeu_si128((__m128i *)(bottom_buf), zeros);
+    bottom_buf += 16;
+  } while (bottom_buf < bottom_buf_end);
+
+  int i = 0;
+  uint8_t *ls = levels;
+  const tran_low_t *cf = coeff;
+  if (width == 4) {
+    do {
+      const __m128i coeffA = xx_loadu_128(cf);
+      const __m128i coeffB = xx_loadu_128(cf + 4);
+      const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+      const __m128i absAB = _mm_abs_epi16(coeffAB);
+      const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+      const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
+      xx_storeu_128(ls, lsAB);
+      ls += (stride << 1);
+      cf += (width << 1);
+      i += 2;
+    } while (i < height);
+  } else if (width == 8) {
+    do {
+      const __m128i coeffA = xx_loadu_128(cf);
+      const __m128i coeffB = xx_loadu_128(cf + 4);
+      const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+      const __m128i absAB = _mm_abs_epi16(coeffAB);
+      const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+      xx_storeu_128(ls, absAB8);
+      ls += stride;
+      cf += width;
+      i += 1;
+    } while (i < height);
+  } else {
+    do {
+      int j = 0;
+      do {
+        const __m128i coeffA = xx_loadu_128(cf);
+        const __m128i coeffB = xx_loadu_128(cf + 4);
+        const __m128i coeffC = xx_loadu_128(cf + 8);
+        const __m128i coeffD = xx_loadu_128(cf + 12);
+        const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+        const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
+        const __m128i absAB = _mm_abs_epi16(coeffAB);
+        const __m128i absCD = _mm_abs_epi16(coeffCD);
+        const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
+        xx_storeu_128(ls + j, absABCD);
+        j += 16;
+        cf += 16;
+      } while (j < width);
+      *(int32_t *)(ls + width) = 0;
+      ls += stride;
+      i += 1;
+    } while (i < height);
+  }
+}
diff --git a/libs/libaom/src/av1/encoder/x86/error_intrin_avx2.c b/libs/libaom/src/av1/encoder/x86/error_intrin_avx2.c
new file mode 100644
index 000000000..12dda3ad0
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/error_intrin_avx2.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>  // AVX2
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+                              __m256i *c) {
+  const tran_low_t *addr = coeff + offset;
+
+  if (sizeof(tran_low_t) == 4) {
+    const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
+    const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
+    const __m256i y = _mm256_packs_epi32(x0, x1);
+    *c = _mm256_permute4x64_epi64(y, 0xD8);
+  } else {
+    *c = _mm256_loadu_si256((const __m256i *)addr);
+  }
+}
+
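+/* Editorial note, scalar view of the reduction vectorized below (exposition
+   only, not part of the upstream file):
+     int64_t sse = 0;
+     for (intptr_t i = 0; i < block_size; ++i) {
+       const int d = dqcoeff[i] - coeff[i];
+       sse += d * d;
+     }
+*/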
+int64_t av1_block_error_lp_avx2(const int16_t *coeff, const int16_t *dqcoeff,
+                                intptr_t block_size) {
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i sse_256 = zero;
+  __m256i sse_hi;
+  __m128i sse_128;
+  int64_t sse;
+
+  if (block_size == 16) {
+    // Load 16 elements for coeff and dqcoeff.
+    const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff);
+    const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff);
+    // dqcoeff - coeff
+    const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+    // madd (dqcoeff - coeff)
+    const __m256i error_lo = _mm256_madd_epi16(diff, diff);
+    // Save the higher 64 bit of each 128 bit lane.
+    const __m256i error_hi = _mm256_srli_si256(error_lo, 8);
+    // Add the higher 64 bit to the low 64 bit.
+    const __m256i error = _mm256_add_epi32(error_lo, error_hi);
+    // Expand each double word in the lower 64 bits to quad word.
+    sse_256 = _mm256_unpacklo_epi32(error, zero);
+  } else {
+    for (int i = 0; i < block_size; i += 16) {
+      // Load 16 elements for coeff and dqcoeff.
+      const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff);
+      const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff);
+      const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+      const __m256i error = _mm256_madd_epi16(diff, diff);
+      // Expand each double word of madd (dqcoeff - coeff) to quad word.
+      const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero);
+      const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero);
+      // Add each quad word of madd (dqcoeff - coeff).
+      sse_256 = _mm256_add_epi64(sse_256, exp_error_lo);
+      sse_256 = _mm256_add_epi64(sse_256, exp_error_hi);
+      coeff += 16;
+      dqcoeff += 16;
+    }
+  }
+  // Save the higher 64 bit of each 128 bit lane.
+  sse_hi = _mm256_srli_si256(sse_256, 8);
+  // Add the higher 64 bit to the low 64 bit.
+  sse_256 = _mm256_add_epi64(sse_256, sse_hi);
+
+  // Add each 64 bit from each of the 128 bit lane of the 256 bit.
+  sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
+                          _mm256_extractf128_si256(sse_256, 1));
+
+  // Store the results.
+ _mm_storel_epi64((__m128i *)&sse, sse_128); + return sse; +} + +int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg; + __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; + __m256i sse_reg_64hi, ssz_reg_64hi; + __m128i sse_reg128, ssz_reg128; + int64_t sse; + int i; + const __m256i zero_reg = _mm256_setzero_si256(); + + // init sse and ssz registerd to zero + sse_reg = _mm256_setzero_si256(); + ssz_reg = _mm256_setzero_si256(); + + for (i = 0; i < block_size; i += 16) { + // load 32 bytes from coeff and dqcoeff + read_coeff(coeff, i, &coeff_reg); + read_coeff(dqcoeff, i, &dqcoeff_reg); + // dqcoeff - coeff + dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg); + // madd (dqcoeff - coeff) + dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg); + // madd coeff + coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg); + // expand each double word of madd (dqcoeff - coeff) to quad word + exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg); + exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg); + // expand each double word of madd (coeff) to quad word + exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg); + exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg); + // add each quad word of madd (dqcoeff - coeff) and madd (coeff) + sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo); + ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo); + sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi); + ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi); + } + // save the higher 64 bit of each 128 bit lane + sse_reg_64hi = _mm256_srli_si256(sse_reg, 8); + ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8); + // add the higher 64 bit to the low 64 bit + sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi); + ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi); + + // add each 64 bit from each of the 128 bit lane of the 256 bit + sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg), + _mm256_extractf128_si256(sse_reg, 1)); + + ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg), + _mm256_extractf128_si256(ssz_reg, 1)); + + // store the results + _mm_storel_epi64((__m128i *)(&sse), sse_reg128); + + _mm_storel_epi64((__m128i *)(ssz), ssz_reg128); + _mm256_zeroupper(); + return sse; +} diff --git a/libs/libaom/src/av1/encoder/x86/error_sse2.asm b/libs/libaom/src/av1/encoder/x86/error_sse2.asm new file mode 100644 index 000000000..f4b496897 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/error_sse2.asm @@ -0,0 +1,88 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +; Increment %1 by sizeof() tran_low_t * %2. +%macro INCREMENT_ELEMENTS_TRAN_LOW 2 + lea %1, [%1 + %2 * 4] +%endmacro + +; Load %2 + %3 into m%1. +; %3 is the offset in elements, not bytes. +; If tran_low_t is 16 bits (low bit depth configuration) then load the value +; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack +; the values down to 16 bits. 
+%macro LOAD_TRAN_LOW 3 + mova m%1, [%2 + (%3) * 4] + packssdw m%1, [%2 + (%3) * 4 + 16] +%endmacro + +%define private_prefix av1 + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; int64_t av1_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, +; int64_t *ssz) + +INIT_XMM sse2 +cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz + pxor m4, m4 ; sse accumulator + pxor m6, m6 ; ssz accumulator + pxor m5, m5 ; dedicated zero register +.loop: + LOAD_TRAN_LOW 2, uqcq, 0 + LOAD_TRAN_LOW 0, dqcq, 0 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 + INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 + INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 + sub sizeq, 16 + psubw m0, m2 + psubw m1, m3 + ; individual errors are max. 15bit+sign, so squares are 30bit, and + ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 + paddd m2, m3 + ; accumulate in 64bit + punpckldq m7, m0, m5 + punpckhdq m0, m5 + paddq m4, m7 + punpckldq m7, m2, m5 + paddq m4, m0 + punpckhdq m2, m5 + paddq m6, m7 + paddq m6, m2 + jg .loop + + ; accumulate horizontally and store in return value + movhlps m5, m4 + movhlps m7, m6 + paddq m4, m5 + paddq m6, m7 +%if ARCH_X86_64 + movq rax, m4 + movq [sszq], m6 +%else + mov eax, sszm + pshufd m5, m4, 0x1 + movq [eax], m6 + movd eax, m4 + movd edx, m5 +%endif + RET diff --git a/libs/libaom/src/av1/encoder/x86/hash_sse42.c b/libs/libaom/src/av1/encoder/x86/hash_sse42.c new file mode 100644 index 000000000..65fa46311 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/hash_sse42.c @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <stdint.h>
+#include <smmintrin.h>
+
+// Byte-boundary alignment issues
+#define ALIGN_SIZE 8
+#define ALIGN_MASK (ALIGN_SIZE - 1)
+
+#define CALC_CRC(op, crc, type, buf, len) \
+  while ((len) >= sizeof(type)) {         \
+    (crc) = op((crc), *(type *)(buf));    \
+    (len) -= sizeof(type);                \
+    buf += sizeof(type);                  \
+  }
+
+/**
+ * Calculates 32-bit CRC for the input buffer
+ * polynomial is 0x11EDC6F41
+ * @return A 32-bit unsigned integer representing the CRC
+ */
+uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p,
+                                     size_t len) {
+  (void)crc_calculator;
+  const uint8_t *buf = p;
+  uint32_t crc = 0xFFFFFFFF;
+
+  // Align the input to the word boundary
+  for (; (len > 0) && ((intptr_t)buf & ALIGN_MASK); len--, buf++) {
+    crc = _mm_crc32_u8(crc, *buf);
+  }
+
+#ifdef __x86_64__
+  uint64_t crc64 = crc;
+  CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len);
+  crc = (uint32_t)crc64;
+#endif
+  CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len);
+  CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len);
+  CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len);
+  return (crc ^= 0xFFFFFFFF);
+}
diff --git a/libs/libaom/src/av1/encoder/x86/highbd_block_error_intrin_avx2.c b/libs/libaom/src/av1/encoder/x86/highbd_block_error_intrin_avx2.c
new file mode 100644
index 000000000..ee3714d8a
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/highbd_block_error_intrin_avx2.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include "aom/aom_integer.h"
+#include "av1/common/common.h"
+
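+/* Editorial note, scalar sketch of the high bit-depth error computed below
+   (exposition only); the scalar fallback in the SSE2 version of this file
+   computes the same sums:
+     for (i = 0; i < block_size; ++i) {
+       const int64_t diff = coeff[i] - dqcoeff[i];
+       error += diff * diff;
+       sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
+     }
+     error = (error + rounding) >> shift;  // shift == 2 * (bps - 8)
+     sqcoeff = (sqcoeff + rounding) >> shift;
+*/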
+int64_t av1_highbd_block_error_avx2(const tran_low_t *coeff,
+                                    const tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz,
+                                    int bps) {
+  int i;
+  int64_t temp1[8];
+  int64_t error = 0, sqcoeff = 0;
+  const int shift = 2 * (bps - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+  for (i = 0; i < block_size; i += 16) {
+    __m256i mm256_coeff = _mm256_loadu_si256((__m256i *)(coeff + i));
+    __m256i mm256_coeff2 = _mm256_loadu_si256((__m256i *)(coeff + i + 8));
+    __m256i mm256_dqcoeff = _mm256_loadu_si256((__m256i *)(dqcoeff + i));
+    __m256i mm256_dqcoeff2 = _mm256_loadu_si256((__m256i *)(dqcoeff + i + 8));
+
+    __m256i diff1 = _mm256_sub_epi32(mm256_coeff, mm256_dqcoeff);
+    __m256i diff2 = _mm256_sub_epi32(mm256_coeff2, mm256_dqcoeff2);
+    __m256i diff1h = _mm256_srli_epi64(diff1, 32);
+    __m256i diff2h = _mm256_srli_epi64(diff2, 32);
+    __m256i res = _mm256_mul_epi32(diff1, diff1);
+    __m256i res1 = _mm256_mul_epi32(diff1h, diff1h);
+    __m256i res2 = _mm256_mul_epi32(diff2, diff2);
+    __m256i res3 = _mm256_mul_epi32(diff2h, diff2h);
+    __m256i res_diff = _mm256_add_epi64(_mm256_add_epi64(res, res1),
+                                        _mm256_add_epi64(res2, res3));
+    __m256i mm256_coeffh = _mm256_srli_epi64(mm256_coeff, 32);
+    __m256i mm256_coeffh2 = _mm256_srli_epi64(mm256_coeff2, 32);
+    res = _mm256_mul_epi32(mm256_coeff, mm256_coeff);
+    res1 = _mm256_mul_epi32(mm256_coeffh, mm256_coeffh);
+    res2 = _mm256_mul_epi32(mm256_coeff2, mm256_coeff2);
+    res3 = _mm256_mul_epi32(mm256_coeffh2, mm256_coeffh2);
+    __m256i res_sqcoeff = _mm256_add_epi64(_mm256_add_epi64(res, res1),
+                                           _mm256_add_epi64(res2, res3));
+    _mm256_storeu_si256((__m256i *)temp1, res_diff);
+    _mm256_storeu_si256((__m256i *)temp1 + 1, res_sqcoeff);
+
+    error += temp1[0] + temp1[1] + temp1[2] + temp1[3];
+    sqcoeff += temp1[4] + temp1[5] + temp1[6] + temp1[7];
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+  error = (error + rounding) >> shift;
+  sqcoeff = (sqcoeff + rounding) >> shift;
+
+  *ssz = sqcoeff;
+  return error;
+}
diff --git a/libs/libaom/src/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/libs/libaom/src/av1/encoder/x86/highbd_block_error_intrin_sse2.c
new file mode 100644
index 000000000..4579e4e4a
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/highbd_block_error_intrin_sse2.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "av1/common/common.h"
+
+int64_t av1_highbd_block_error_sse2(const tran_low_t *coeff,
+                                    const tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz,
+                                    int bps) {
+  int i, j, test;
+  uint32_t temp[4];
+  __m128i max, min, cmp0, cmp1, cmp2, cmp3;
+  int64_t error = 0, sqcoeff = 0;
+  const int shift = 2 * (bps - 8);
+  const int rounding = shift > 0 ?
1 << (shift - 1) : 0; + + for (i = 0; i < block_size; i += 8) { + // Load the data into xmm registers + __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i)); + __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4)); + __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i)); + __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4)); + // Check if any values require more than 15 bit + max = _mm_set1_epi32(0x3fff); + min = _mm_set1_epi32(0xffffc000); + cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max), + _mm_cmplt_epi32(mm_coeff, min)); + cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max), + _mm_cmplt_epi32(mm_coeff2, min)); + cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max), + _mm_cmplt_epi32(mm_dqcoeff, min)); + cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max), + _mm_cmplt_epi32(mm_dqcoeff2, min)); + test = _mm_movemask_epi8( + _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3))); + + if (!test) { + __m128i mm_diff, error_sse2, sqcoeff_sse2; + mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2); + mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2); + mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff); + error_sse2 = _mm_madd_epi16(mm_diff, mm_diff); + sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff); + _mm_storeu_si128((__m128i *)temp, error_sse2); + error = error + temp[0] + temp[1] + temp[2] + temp[3]; + _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2); + sqcoeff += temp[0] + temp[1] + temp[2] + temp[3]; + } else { + for (j = 0; j < 8; j++) { + const int64_t diff = coeff[i + j] - dqcoeff[i + j]; + error += diff * diff; + sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j]; + } + } + } + assert(error >= 0 && sqcoeff >= 0); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + + *ssz = sqcoeff; + return error; +} diff --git a/libs/libaom/src/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/libs/libaom/src/av1/encoder/x86/highbd_fwd_txfm_avx2.c new file mode 100644 index 000000000..a81378cfe --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/highbd_fwd_txfm_avx2.c @@ -0,0 +1,3167 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#include <assert.h>
+#include <immintrin.h> /*AVX2*/
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m256i *out,
+                                        int stride, int flipud, int fliplr,
+                                        int shift) {
+  __m128i out1[8];
+  if (!flipud) {
+    out1[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+    out1[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    out1[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    out1[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+    out1[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    out1[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    out1[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    out1[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+  } else {
+    out1[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+    out1[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    out1[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    out1[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+    out1[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    out1[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    out1[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    out1[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  }
+  if (!fliplr) {
+    out[0] = _mm256_cvtepi16_epi32(out1[0]);
+    out[1] = _mm256_cvtepi16_epi32(out1[1]);
+    out[2] = _mm256_cvtepi16_epi32(out1[2]);
+    out[3] = _mm256_cvtepi16_epi32(out1[3]);
+    out[4] = _mm256_cvtepi16_epi32(out1[4]);
+    out[5] = _mm256_cvtepi16_epi32(out1[5]);
+    out[6] = _mm256_cvtepi16_epi32(out1[6]);
+    out[7] = _mm256_cvtepi16_epi32(out1[7]);
+
+  } else {
+    out[0] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[0]));
+    out[1] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[1]));
+    out[2] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[2]));
+    out[3] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[3]));
+    out[4] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[4]));
+    out[5] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[5]));
+    out[6] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[6]));
+    out[7] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[7]));
+  }
+  out[0] = _mm256_slli_epi32(out[0], shift);
+  out[1] = _mm256_slli_epi32(out[1], shift);
+  out[2] = _mm256_slli_epi32(out[2], shift);
+  out[3] = _mm256_slli_epi32(out[3], shift);
+  out[4] = _mm256_slli_epi32(out[4], shift);
+  out[5] = _mm256_slli_epi32(out[5], shift);
+  out[6] = _mm256_slli_epi32(out[6], shift);
+  out[7] = _mm256_slli_epi32(out[7], shift);
+}
+static INLINE void col_txfm_8x8_rounding(__m256i *in, int shift) {
+  const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+
+  in[0] = _mm256_add_epi32(in[0], rounding);
+  in[1] = _mm256_add_epi32(in[1], rounding);
+  in[2] = _mm256_add_epi32(in[2], rounding);
+  in[3] = _mm256_add_epi32(in[3], rounding);
+  in[4] = _mm256_add_epi32(in[4], rounding);
+  in[5] = _mm256_add_epi32(in[5], rounding);
+  in[6] = _mm256_add_epi32(in[6], rounding);
+  in[7] = _mm256_add_epi32(in[7], rounding);
+
+  in[0] = _mm256_srai_epi32(in[0], shift);
+  in[1] = _mm256_srai_epi32(in[1], shift);
+  in[2] = _mm256_srai_epi32(in[2], shift);
+  in[3] = _mm256_srai_epi32(in[3], shift);
+  in[4] = _mm256_srai_epi32(in[4], shift);
+  in[5] =
_mm256_srai_epi32(in[5], shift); + in[6] = _mm256_srai_epi32(in[6], shift); + in[7] = _mm256_srai_epi32(in[7], shift); +} +static INLINE void load_buffer_8x16_avx2(const int16_t *input, __m256i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 8 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + load_buffer_8x8_avx2(topL, out, stride, flipud, fliplr, shift); + load_buffer_8x8_avx2(botL, out + 8, stride, flipud, fliplr, shift); +} +static INLINE void load_buffer_16xn_avx2(const int16_t *input, __m256i *out, + int stride, int height, int outstride, + int flipud, int fliplr) { + __m256i out1[64]; + if (!flipud) { + for (int i = 0; i < height; i++) { + out1[i] = _mm256_loadu_si256((const __m256i *)(input + i * stride)); + } + } else { + for (int i = 0; i < height; i++) { + out1[(height - 1) - i] = + _mm256_loadu_si256((const __m256i *)(input + i * stride)); + } + } + if (!fliplr) { + for (int i = 0; i < height; i++) { + out[i * outstride] = + _mm256_cvtepi16_epi32(_mm256_castsi256_si128(out1[i])); + out[i * outstride + 1] = + _mm256_cvtepi16_epi32(_mm256_extractf128_si256(out1[i], 1)); + } + } else { + for (int i = 0; i < height; i++) { + out[i * outstride + 1] = _mm256_cvtepi16_epi32( + mm_reverse_epi16(_mm256_castsi256_si128(out1[i]))); + out[i * outstride + 0] = _mm256_cvtepi16_epi32( + mm_reverse_epi16(_mm256_extractf128_si256(out1[i], 1))); + } + } +} + +static void fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out, + const int instride, + const int outstride) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i x0, x1; + + u0 = _mm256_unpacklo_epi32(in[0 * instride], in[1 * instride]); + u1 = _mm256_unpackhi_epi32(in[0 * instride], in[1 * instride]); + + u2 = _mm256_unpacklo_epi32(in[2 * instride], in[3 * instride]); + u3 = _mm256_unpackhi_epi32(in[2 * instride], in[3 * instride]); + + u4 = _mm256_unpacklo_epi32(in[4 * instride], in[5 * instride]); + u5 = _mm256_unpackhi_epi32(in[4 * instride], in[5 * instride]); + + u6 = _mm256_unpacklo_epi32(in[6 * instride], in[7 * instride]); + u7 = _mm256_unpackhi_epi32(in[6 * instride], in[7 * instride]); + + x0 = _mm256_unpacklo_epi64(u0, u2); + x1 = _mm256_unpacklo_epi64(u4, u6); + out[0 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[4 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u0, u2); + x1 = _mm256_unpackhi_epi64(u4, u6); + out[1 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpacklo_epi64(u1, u3); + x1 = _mm256_unpacklo_epi64(u5, u7); + out[2 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u1, u3); + x1 = _mm256_unpackhi_epi64(u5, u7); + out[3 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); +} +static INLINE void round_shift_32_8xn_avx2(__m256i *in, int size, int bit, + int stride) { + if (bit < 0) { + bit = -bit; + __m256i round = _mm256_set1_epi32(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[stride * i] = _mm256_add_epi32(in[stride * i], round); + in[stride * i] = _mm256_srai_epi32(in[stride * i], bit); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[stride * i] = _mm256_slli_epi32(in[stride * i], bit); + } + } +} +static INLINE void store_buffer_avx2(const 
__m256i *const in, int32_t *out, + const int stride, const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm256_store_si256((__m256i *)(out), in[i]); + out += stride; + } +} +static INLINE void fwd_txfm_transpose_16x16_avx2(const __m256i *in, + __m256i *out) { + fwd_txfm_transpose_8x8_avx2(&in[0], &out[0], 2, 2); + fwd_txfm_transpose_8x8_avx2(&in[1], &out[16], 2, 2); + fwd_txfm_transpose_8x8_avx2(&in[16], &out[1], 2, 2); + fwd_txfm_transpose_8x8_avx2(&in[17], &out[17], 2, 2); +} + +static INLINE __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *w1, const __m256i *n1, + const __m256i *rounding, int bit) { + __m256i x, y; + + x = _mm256_mullo_epi32(*w0, *n0); + y = _mm256_mullo_epi32(*w1, *n1); + x = _mm256_add_epi32(x, y); + x = _mm256_add_epi32(x, *rounding); + x = _mm256_srai_epi32(x, bit); + return x; +} +#define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + const __m256i ww0 = _mm256_set1_epi32(w0); \ + const __m256i ww1 = _mm256_set1_epi32(w1); \ + const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \ + const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \ + out0 = _mm256_add_epi32(in0_w0, in1_w1); \ + round_shift_32_8xn_avx2(&out0, 1, -bit, 1); \ + const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \ + const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \ + out1 = _mm256_sub_epi32(in0_w1, in1_w0); \ + round_shift_32_8xn_avx2(&out1, 1, -bit, 1); \ + } while (0) + +#define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ + do { \ + const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \ + const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \ + out0 = _mm256_add_epi32(in0_w0, in1_w1); \ + out0 = _mm256_add_epi32(out0, r); \ + out0 = _mm256_srai_epi32(out0, bit); \ + const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \ + const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \ + out1 = _mm256_sub_epi32(in0_w1, in1_w0); \ + out1 = _mm256_add_epi32(out1, r); \ + out1 = _mm256_srai_epi32(out1, bit); \ + } while (0) + +typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, + const int8_t cos_bit, int instride, + int outstride); +static void fdct8_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int col_num, const int outstride) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + __m256i u[8], v[8]; + for (int col = 0; col < col_num; ++col) { + u[0] = _mm256_add_epi32(in[0 * col_num + col], in[7 * col_num + col]); + v[7] = _mm256_sub_epi32(in[0 * col_num + col], in[7 * col_num + col]); + u[1] = _mm256_add_epi32(in[1 * col_num + col], in[6 * col_num + col]); + u[6] = _mm256_sub_epi32(in[1 * col_num + col], in[6 * col_num + col]); + u[2] = _mm256_add_epi32(in[2 * col_num + col], in[5 * col_num + col]); + u[5] = _mm256_sub_epi32(in[2 * col_num + col], in[5 * col_num + col]); + u[3] = _mm256_add_epi32(in[3 * col_num + col], in[4 * col_num + col]); + v[4] = _mm256_sub_epi32(in[3 * col_num + col], in[4 * col_num + col]); + v[0] = _mm256_add_epi32(u[0], u[3]); + v[3] = _mm256_sub_epi32(u[0], u[3]); + v[1] = 
_mm256_add_epi32(u[1], u[2]);
+    v[2] = _mm256_sub_epi32(u[1], u[2]);
+
+    v[5] = _mm256_mullo_epi32(u[5], cospim32);
+    v[6] = _mm256_mullo_epi32(u[6], cospi32);
+    v[5] = _mm256_add_epi32(v[5], v[6]);
+    v[5] = _mm256_add_epi32(v[5], rnding);
+    v[5] = _mm256_srai_epi32(v[5], bit);
+
+    u[0] = _mm256_mullo_epi32(u[5], cospi32);
+    v[6] = _mm256_mullo_epi32(u[6], cospim32);
+    v[6] = _mm256_sub_epi32(u[0], v[6]);
+    v[6] = _mm256_add_epi32(v[6], rnding);
+    v[6] = _mm256_srai_epi32(v[6], bit);
+
+    // stage 3
+    // type 0
+    v[0] = _mm256_mullo_epi32(v[0], cospi32);
+    v[1] = _mm256_mullo_epi32(v[1], cospi32);
+    u[0] = _mm256_add_epi32(v[0], v[1]);
+    u[0] = _mm256_add_epi32(u[0], rnding);
+    u[0] = _mm256_srai_epi32(u[0], bit);
+
+    u[1] = _mm256_sub_epi32(v[0], v[1]);
+    u[1] = _mm256_add_epi32(u[1], rnding);
+    u[1] = _mm256_srai_epi32(u[1], bit);
+
+    // type 1
+    v[0] = _mm256_mullo_epi32(v[2], cospi48);
+    v[1] = _mm256_mullo_epi32(v[3], cospi16);
+    u[2] = _mm256_add_epi32(v[0], v[1]);
+    u[2] = _mm256_add_epi32(u[2], rnding);
+    u[2] = _mm256_srai_epi32(u[2], bit);
+
+    v[0] = _mm256_mullo_epi32(v[2], cospi16);
+    v[1] = _mm256_mullo_epi32(v[3], cospi48);
+    u[3] = _mm256_sub_epi32(v[1], v[0]);
+    u[3] = _mm256_add_epi32(u[3], rnding);
+    u[3] = _mm256_srai_epi32(u[3], bit);
+
+    u[4] = _mm256_add_epi32(v[4], v[5]);
+    u[5] = _mm256_sub_epi32(v[4], v[5]);
+    u[6] = _mm256_sub_epi32(v[7], v[6]);
+    u[7] = _mm256_add_epi32(v[7], v[6]);
+
+    // stage 4
+    // stage 5
+    v[0] = _mm256_mullo_epi32(u[4], cospi56);
+    v[1] = _mm256_mullo_epi32(u[7], cospi8);
+    v[0] = _mm256_add_epi32(v[0], v[1]);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    out[1 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[4]
+
+    v[0] = _mm256_mullo_epi32(u[4], cospi8);
+    v[1] = _mm256_mullo_epi32(u[7], cospi56);
+    v[0] = _mm256_sub_epi32(v[1], v[0]);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    out[7 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[7]
+
+    v[0] = _mm256_mullo_epi32(u[5], cospi24);
+    v[1] = _mm256_mullo_epi32(u[6], cospi40);
+    v[0] = _mm256_add_epi32(v[0], v[1]);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    out[5 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[5]
+
+    v[0] = _mm256_mullo_epi32(u[5], cospi40);
+    v[1] = _mm256_mullo_epi32(u[6], cospi24);
+    v[0] = _mm256_sub_epi32(v[1], v[0]);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    out[3 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[6]
+
+    out[0 * outstride + col] = u[0];  // buf0[0]
+    out[4 * outstride + col] = u[1];  // buf0[1]
+    out[2 * outstride + col] = u[2];  // buf0[2]
+    out[6 * outstride + col] = u[3];  // buf0[3]
+  }
+}
+static void fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                        const int col_num, const int outstride) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+  const __m256i cospim36 =
_mm256_set1_epi32(-cospi[36]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i zero = _mm256_setzero_si256(); + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i v0, v1, v2, v3, v4, v5, v6, v7; + __m256i x, y; + for (int col = 0; col < col_num; ++col) { + u0 = in[0 * col_num + col]; + u1 = _mm256_sub_epi32(zero, in[7 * col_num + col]); + u2 = _mm256_sub_epi32(zero, in[3 * col_num + col]); + u3 = in[4 * col_num + col]; + u4 = _mm256_sub_epi32(zero, in[1 * col_num + col]); + u5 = in[6 * col_num + col]; + u6 = in[2 * col_num + col]; + u7 = _mm256_sub_epi32(zero, in[5 * col_num + col]); + + // stage 2 + v0 = u0; + v1 = u1; + + x = _mm256_mullo_epi32(u2, cospi32); + y = _mm256_mullo_epi32(u3, cospi32); + v2 = _mm256_add_epi32(x, y); + v2 = _mm256_add_epi32(v2, rnding); + v2 = _mm256_srai_epi32(v2, bit); + + v3 = _mm256_sub_epi32(x, y); + v3 = _mm256_add_epi32(v3, rnding); + v3 = _mm256_srai_epi32(v3, bit); + + v4 = u4; + v5 = u5; + + x = _mm256_mullo_epi32(u6, cospi32); + y = _mm256_mullo_epi32(u7, cospi32); + v6 = _mm256_add_epi32(x, y); + v6 = _mm256_add_epi32(v6, rnding); + v6 = _mm256_srai_epi32(v6, bit); + + v7 = _mm256_sub_epi32(x, y); + v7 = _mm256_add_epi32(v7, rnding); + v7 = _mm256_srai_epi32(v7, bit); + + // stage 3 + u0 = _mm256_add_epi32(v0, v2); + u1 = _mm256_add_epi32(v1, v3); + u2 = _mm256_sub_epi32(v0, v2); + u3 = _mm256_sub_epi32(v1, v3); + u4 = _mm256_add_epi32(v4, v6); + u5 = _mm256_add_epi32(v5, v7); + u6 = _mm256_sub_epi32(v4, v6); + u7 = _mm256_sub_epi32(v5, v7); + + // stage 4 + v0 = u0; + v1 = u1; + v2 = u2; + v3 = u3; + + x = _mm256_mullo_epi32(u4, cospi16); + y = _mm256_mullo_epi32(u5, cospi48); + v4 = _mm256_add_epi32(x, y); + v4 = _mm256_add_epi32(v4, rnding); + v4 = _mm256_srai_epi32(v4, bit); + + x = _mm256_mullo_epi32(u4, cospi48); + y = _mm256_mullo_epi32(u5, cospim16); + v5 = _mm256_add_epi32(x, y); + v5 = _mm256_add_epi32(v5, rnding); + v5 = _mm256_srai_epi32(v5, bit); + + x = _mm256_mullo_epi32(u6, cospim48); + y = _mm256_mullo_epi32(u7, cospi16); + v6 = _mm256_add_epi32(x, y); + v6 = _mm256_add_epi32(v6, rnding); + v6 = _mm256_srai_epi32(v6, bit); + + x = _mm256_mullo_epi32(u6, cospi16); + y = _mm256_mullo_epi32(u7, cospi48); + v7 = _mm256_add_epi32(x, y); + v7 = _mm256_add_epi32(v7, rnding); + v7 = _mm256_srai_epi32(v7, bit); + + // stage 5 + u0 = _mm256_add_epi32(v0, v4); + u1 = _mm256_add_epi32(v1, v5); + u2 = _mm256_add_epi32(v2, v6); + u3 = _mm256_add_epi32(v3, v7); + u4 = _mm256_sub_epi32(v0, v4); + u5 = _mm256_sub_epi32(v1, v5); + u6 = _mm256_sub_epi32(v2, v6); + u7 = _mm256_sub_epi32(v3, v7); + + // stage 6 + x = _mm256_mullo_epi32(u0, cospi4); + y = _mm256_mullo_epi32(u1, cospi60); + v0 = _mm256_add_epi32(x, y); + v0 = _mm256_add_epi32(v0, rnding); + v0 = _mm256_srai_epi32(v0, bit); + + x = _mm256_mullo_epi32(u0, cospi60); + y = _mm256_mullo_epi32(u1, cospim4); + v1 = _mm256_add_epi32(x, y); + v1 = _mm256_add_epi32(v1, rnding); + v1 = _mm256_srai_epi32(v1, bit); + + x = _mm256_mullo_epi32(u2, cospi20); + y = _mm256_mullo_epi32(u3, cospi44); + v2 = _mm256_add_epi32(x, y); + v2 = _mm256_add_epi32(v2, rnding); + v2 = _mm256_srai_epi32(v2, bit); + + x = _mm256_mullo_epi32(u2, cospi44); + y = _mm256_mullo_epi32(u3, cospim20); + v3 = _mm256_add_epi32(x, y); + v3 = _mm256_add_epi32(v3, rnding); + v3 = _mm256_srai_epi32(v3, bit); + + x = _mm256_mullo_epi32(u4, 
cospi36);
+    y = _mm256_mullo_epi32(u5, cospi28);
+    v4 = _mm256_add_epi32(x, y);
+    v4 = _mm256_add_epi32(v4, rnding);
+    v4 = _mm256_srai_epi32(v4, bit);
+
+    x = _mm256_mullo_epi32(u4, cospi28);
+    y = _mm256_mullo_epi32(u5, cospim36);
+    v5 = _mm256_add_epi32(x, y);
+    v5 = _mm256_add_epi32(v5, rnding);
+    v5 = _mm256_srai_epi32(v5, bit);
+
+    x = _mm256_mullo_epi32(u6, cospi52);
+    y = _mm256_mullo_epi32(u7, cospi12);
+    v6 = _mm256_add_epi32(x, y);
+    v6 = _mm256_add_epi32(v6, rnding);
+    v6 = _mm256_srai_epi32(v6, bit);
+
+    x = _mm256_mullo_epi32(u6, cospi12);
+    y = _mm256_mullo_epi32(u7, cospim52);
+    v7 = _mm256_add_epi32(x, y);
+    v7 = _mm256_add_epi32(v7, rnding);
+    v7 = _mm256_srai_epi32(v7, bit);
+
+    // stage 7
+    out[0 * outstride + col] = v1;
+    out[1 * outstride + col] = v6;
+    out[2 * outstride + col] = v3;
+    out[3 * outstride + col] = v4;
+    out[4 * outstride + col] = v5;
+    out[5 * outstride + col] = v2;
+    out[6 * outstride + col] = v7;
+    out[7 * outstride + col] = v0;
+  }
+}
+static void idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit, int col_num,
+                       int outstride) {
+  (void)bit;
+  (void)outstride;
+  int num_iters = 8 * col_num;
+  for (int i = 0; i < num_iters; i += 8) {
+    out[i] = _mm256_add_epi32(in[i], in[i]);
+    out[i + 1] = _mm256_add_epi32(in[i + 1], in[i + 1]);
+    out[i + 2] = _mm256_add_epi32(in[i + 2], in[i + 2]);
+    out[i + 3] = _mm256_add_epi32(in[i + 3], in[i + 3]);
+    out[i + 4] = _mm256_add_epi32(in[i + 4], in[i + 4]);
+    out[i + 5] = _mm256_add_epi32(in[i + 5], in[i + 5]);
+    out[i + 6] = _mm256_add_epi32(in[i + 6], in[i + 6]);
+    out[i + 7] = _mm256_add_epi32(in[i + 7], in[i + 7]);
+  }
+}
+void av1_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *coeff, int stride,
+                             TX_TYPE tx_type, int bd) {
+  __m256i in[8], out[8];
+  const TX_SIZE tx_size = TX_8X8;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int width = tx_size_wide[tx_size];
+  const int width_div8 = (width >> 3);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                 width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case ADST_DCT:
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                 width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case DCT_ADST:
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case ADST_ADST:
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      fadst8_avx2(in, out,
av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case FLIPADST_DCT: + load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case DCT_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); + fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 1, 1, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case ADST_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case FLIPADST_ADST: + load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case IDTX: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + idtx8_avx2(out, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case V_DCT: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + idtx8_avx2(out, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case H_DCT: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 
width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case V_ADST: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + idtx8_avx2(out, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case H_ADST: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case V_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + idtx8_avx2(out, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case H_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + default: assert(0); + } + (void)bd; +} + +static void fdct16_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int col_num, const int outstride) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + __m256i u[16], v[16], x; + int col; + + // Calculate the column 0, 1, 2, 3 + for (col = 0; col < col_num; ++col) { + // stage 0 + // stage 1 + u[0] = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[1] = _mm256_add_epi32(in[1 * col_num + 
col], in[14 * col_num + col]); + u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[2] = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[3] = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[4] = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[11] = _mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[5] = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[6] = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[9] = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[7] = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]); + u[8] = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]); + + // stage 2 + v[0] = _mm256_add_epi32(u[0], u[7]); + v[7] = _mm256_sub_epi32(u[0], u[7]); + v[1] = _mm256_add_epi32(u[1], u[6]); + v[6] = _mm256_sub_epi32(u[1], u[6]); + v[2] = _mm256_add_epi32(u[2], u[5]); + v[5] = _mm256_sub_epi32(u[2], u[5]); + v[3] = _mm256_add_epi32(u[3], u[4]); + v[4] = _mm256_sub_epi32(u[3], u[4]); + v[8] = u[8]; + v[9] = u[9]; + + v[10] = _mm256_mullo_epi32(u[10], cospim32); + x = _mm256_mullo_epi32(u[13], cospi32); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_mullo_epi32(u[10], cospi32); + x = _mm256_mullo_epi32(u[13], cospim32); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[11] = _mm256_mullo_epi32(u[11], cospim32); + x = _mm256_mullo_epi32(u[12], cospi32); + v[11] = _mm256_add_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(u[11], cospi32); + x = _mm256_mullo_epi32(u[12], cospim32); + v[12] = _mm256_sub_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + v[14] = u[14]; + v[15] = u[15]; + + // stage 3 + u[0] = _mm256_add_epi32(v[0], v[3]); + u[3] = _mm256_sub_epi32(v[0], v[3]); + u[1] = _mm256_add_epi32(v[1], v[2]); + u[2] = _mm256_sub_epi32(v[1], v[2]); + u[4] = v[4]; + + u[5] = _mm256_mullo_epi32(v[5], cospim32); + x = _mm256_mullo_epi32(v[6], cospi32); + u[5] = _mm256_add_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(v[5], cospi32); + x = _mm256_mullo_epi32(v[6], cospim32); + u[6] = _mm256_sub_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = v[7]; + u[8] = _mm256_add_epi32(v[8], v[11]); + u[11] = _mm256_sub_epi32(v[8], v[11]); + u[9] = _mm256_add_epi32(v[9], v[10]); + u[10] = _mm256_sub_epi32(v[9], v[10]); + u[12] = _mm256_sub_epi32(v[15], v[12]); + u[15] = _mm256_add_epi32(v[15], v[12]); + u[13] = _mm256_sub_epi32(v[14], v[13]); + u[14] = _mm256_add_epi32(v[14], v[13]); + + // stage 4 + u[0] = _mm256_mullo_epi32(u[0], cospi32); + u[1] = _mm256_mullo_epi32(u[1], cospi32); + v[0] = _mm256_add_epi32(u[0], u[1]); + v[0] = _mm256_add_epi32(v[0], rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + v[1] = _mm256_sub_epi32(u[0], u[1]); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + v[2] = 
_mm256_mullo_epi32(u[2], cospi48); + x = _mm256_mullo_epi32(u[3], cospi16); + v[2] = _mm256_add_epi32(v[2], x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_mullo_epi32(u[2], cospi16); + x = _mm256_mullo_epi32(u[3], cospi48); + v[3] = _mm256_sub_epi32(x, v[3]); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = _mm256_add_epi32(u[4], u[5]); + v[5] = _mm256_sub_epi32(u[4], u[5]); + v[6] = _mm256_sub_epi32(u[7], u[6]); + v[7] = _mm256_add_epi32(u[7], u[6]); + v[8] = u[8]; + + v[9] = _mm256_mullo_epi32(u[9], cospim16); + x = _mm256_mullo_epi32(u[14], cospi48); + v[9] = _mm256_add_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[14] = _mm256_mullo_epi32(u[9], cospi48); + x = _mm256_mullo_epi32(u[14], cospim16); + v[14] = _mm256_sub_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[10] = _mm256_mullo_epi32(u[10], cospim48); + x = _mm256_mullo_epi32(u[13], cospim16); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_mullo_epi32(u[10], cospim16); + x = _mm256_mullo_epi32(u[13], cospim48); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[11] = u[11]; + v[12] = u[12]; + v[15] = u[15]; + + // stage 5 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm256_mullo_epi32(v[4], cospi56); + x = _mm256_mullo_epi32(v[7], cospi8); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + u[7] = _mm256_mullo_epi32(v[4], cospi8); + x = _mm256_mullo_epi32(v[7], cospi56); + u[7] = _mm256_sub_epi32(x, u[7]); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + u[5] = _mm256_mullo_epi32(v[5], cospi24); + x = _mm256_mullo_epi32(v[6], cospi40); + u[5] = _mm256_add_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(v[5], cospi40); + x = _mm256_mullo_epi32(v[6], cospi24); + u[6] = _mm256_sub_epi32(x, u[6]); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[8] = _mm256_add_epi32(v[8], v[9]); + u[9] = _mm256_sub_epi32(v[8], v[9]); + u[10] = _mm256_sub_epi32(v[11], v[10]); + u[11] = _mm256_add_epi32(v[11], v[10]); + u[12] = _mm256_add_epi32(v[12], v[13]); + u[13] = _mm256_sub_epi32(v[12], v[13]); + u[14] = _mm256_sub_epi32(v[15], v[14]); + u[15] = _mm256_add_epi32(v[15], v[14]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm256_mullo_epi32(u[8], cospi60); + x = _mm256_mullo_epi32(u[15], cospi4); + v[8] = _mm256_add_epi32(v[8], x); + v[8] = _mm256_add_epi32(v[8], rnding); + v[8] = _mm256_srai_epi32(v[8], bit); + + v[15] = _mm256_mullo_epi32(u[8], cospi4); + x = _mm256_mullo_epi32(u[15], cospi60); + v[15] = _mm256_sub_epi32(x, v[15]); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + v[9] = _mm256_mullo_epi32(u[9], cospi28); + x = _mm256_mullo_epi32(u[14], cospi36); + v[9] = _mm256_add_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[14] = _mm256_mullo_epi32(u[9], cospi36); + x = _mm256_mullo_epi32(u[14], cospi28); + v[14] = _mm256_sub_epi32(x, 
v[14]); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[10] = _mm256_mullo_epi32(u[10], cospi44); + x = _mm256_mullo_epi32(u[13], cospi20); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_mullo_epi32(u[10], cospi20); + x = _mm256_mullo_epi32(u[13], cospi44); + v[13] = _mm256_sub_epi32(x, v[13]); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[11] = _mm256_mullo_epi32(u[11], cospi12); + x = _mm256_mullo_epi32(u[12], cospi52); + v[11] = _mm256_add_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(u[11], cospi52); + x = _mm256_mullo_epi32(u[12], cospi12); + v[12] = _mm256_sub_epi32(x, v[12]); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + out[0 * outstride + col] = v[0]; + out[1 * outstride + col] = v[8]; + out[2 * outstride + col] = v[4]; + out[3 * outstride + col] = v[12]; + out[4 * outstride + col] = v[2]; + out[5 * outstride + col] = v[10]; + out[6 * outstride + col] = v[6]; + out[7 * outstride + col] = v[14]; + out[8 * outstride + col] = v[1]; + out[9 * outstride + col] = v[9]; + out[10 * outstride + col] = v[5]; + out[11 * outstride + col] = v[13]; + out[12 * outstride + col] = v[3]; + out[13 * outstride + col] = v[11]; + out[14 * outstride + col] = v[7]; + out[15 * outstride + col] = v[15]; + } +} +static void fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int num_cols, const int outstride) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const 
__m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i zero = _mm256_setzero_si256(); + + __m256i u[16], v[16], x, y; + int col; + + for (col = 0; col < num_cols; ++col) { + // stage 0 + // stage 1 + u[0] = in[0 * num_cols + col]; + u[1] = _mm256_sub_epi32(zero, in[15 * num_cols + col]); + u[2] = _mm256_sub_epi32(zero, in[7 * num_cols + col]); + u[3] = in[8 * num_cols + col]; + u[4] = _mm256_sub_epi32(zero, in[3 * num_cols + col]); + u[5] = in[12 * num_cols + col]; + u[6] = in[4 * num_cols + col]; + u[7] = _mm256_sub_epi32(zero, in[11 * num_cols + col]); + u[8] = _mm256_sub_epi32(zero, in[1 * num_cols + col]); + u[9] = in[14 * num_cols + col]; + u[10] = in[6 * num_cols + col]; + u[11] = _mm256_sub_epi32(zero, in[9 * num_cols + col]); + u[12] = in[2 * num_cols + col]; + u[13] = _mm256_sub_epi32(zero, in[13 * num_cols + col]); + u[14] = _mm256_sub_epi32(zero, in[5 * num_cols + col]); + u[15] = in[10 * num_cols + col]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + + x = _mm256_mullo_epi32(u[2], cospi32); + y = _mm256_mullo_epi32(u[3], cospi32); + v[2] = _mm256_add_epi32(x, y); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_sub_epi32(x, y); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + x = _mm256_mullo_epi32(u[6], cospi32); + y = _mm256_mullo_epi32(u[7], cospi32); + v[6] = _mm256_add_epi32(x, y); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_sub_epi32(x, y); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + x = _mm256_mullo_epi32(u[10], cospi32); + y = _mm256_mullo_epi32(u[11], cospi32); + v[10] = _mm256_add_epi32(x, y); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_sub_epi32(x, y); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + x = _mm256_mullo_epi32(u[14], cospi32); + y = _mm256_mullo_epi32(u[15], cospi32); + v[14] = _mm256_add_epi32(x, y); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_sub_epi32(x, y); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 3 + u[0] = _mm256_add_epi32(v[0], v[2]); + u[1] = _mm256_add_epi32(v[1], v[3]); + u[2] = _mm256_sub_epi32(v[0], v[2]); + u[3] = _mm256_sub_epi32(v[1], v[3]); + u[4] = _mm256_add_epi32(v[4], v[6]); + u[5] = _mm256_add_epi32(v[5], v[7]); + u[6] = _mm256_sub_epi32(v[4], v[6]); + u[7] = _mm256_sub_epi32(v[5], v[7]); + u[8] = _mm256_add_epi32(v[8], v[10]); + u[9] = _mm256_add_epi32(v[9], v[11]); + u[10] = _mm256_sub_epi32(v[8], v[10]); + u[11] = _mm256_sub_epi32(v[9], v[11]); + u[12] = _mm256_add_epi32(v[12], v[14]); + u[13] = _mm256_add_epi32(v[13], v[15]); + u[14] = _mm256_sub_epi32(v[12], v[14]); + u[15] = _mm256_sub_epi32(v[13], v[15]); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = av1_half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); + v[5] = av1_half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); + v[6] = av1_half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); + v[7] 
= av1_half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + v[12] = av1_half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); + v[13] = + av1_half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); + v[14] = + av1_half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); + v[15] = av1_half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); + + // stage 5 + u[0] = _mm256_add_epi32(v[0], v[4]); + u[1] = _mm256_add_epi32(v[1], v[5]); + u[2] = _mm256_add_epi32(v[2], v[6]); + u[3] = _mm256_add_epi32(v[3], v[7]); + u[4] = _mm256_sub_epi32(v[0], v[4]); + u[5] = _mm256_sub_epi32(v[1], v[5]); + u[6] = _mm256_sub_epi32(v[2], v[6]); + u[7] = _mm256_sub_epi32(v[3], v[7]); + u[8] = _mm256_add_epi32(v[8], v[12]); + u[9] = _mm256_add_epi32(v[9], v[13]); + u[10] = _mm256_add_epi32(v[10], v[14]); + u[11] = _mm256_add_epi32(v[11], v[15]); + u[12] = _mm256_sub_epi32(v[8], v[12]); + u[13] = _mm256_sub_epi32(v[9], v[13]); + u[14] = _mm256_sub_epi32(v[10], v[14]); + u[15] = _mm256_sub_epi32(v[11], v[15]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + v[8] = av1_half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); + v[9] = av1_half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); + v[10] = av1_half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); + v[11] = + av1_half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); + v[12] = av1_half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); + v[13] = av1_half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); + v[14] = + av1_half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); + v[15] = av1_half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); + + // stage 7 + u[0] = _mm256_add_epi32(v[0], v[8]); + u[1] = _mm256_add_epi32(v[1], v[9]); + u[2] = _mm256_add_epi32(v[2], v[10]); + u[3] = _mm256_add_epi32(v[3], v[11]); + u[4] = _mm256_add_epi32(v[4], v[12]); + u[5] = _mm256_add_epi32(v[5], v[13]); + u[6] = _mm256_add_epi32(v[6], v[14]); + u[7] = _mm256_add_epi32(v[7], v[15]); + u[8] = _mm256_sub_epi32(v[0], v[8]); + u[9] = _mm256_sub_epi32(v[1], v[9]); + u[10] = _mm256_sub_epi32(v[2], v[10]); + u[11] = _mm256_sub_epi32(v[3], v[11]); + u[12] = _mm256_sub_epi32(v[4], v[12]); + u[13] = _mm256_sub_epi32(v[5], v[13]); + u[14] = _mm256_sub_epi32(v[6], v[14]); + u[15] = _mm256_sub_epi32(v[7], v[15]); + + // stage 8 + v[0] = av1_half_btf_avx2(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); + v[1] = av1_half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); + v[2] = av1_half_btf_avx2(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); + v[3] = av1_half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); + v[4] = av1_half_btf_avx2(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); + v[5] = av1_half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); + v[6] = av1_half_btf_avx2(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); + v[7] = av1_half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); + v[8] = av1_half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); + v[9] = av1_half_btf_avx2(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); + v[10] = av1_half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); + v[11] = + av1_half_btf_avx2(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); + v[12] = av1_half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); + 
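// Note: av1_half_btf_avx2(&w0, &n0, &w1, &n1, &rnding, bit) evaluates
+    // (w0 * n0 + w1 * n1 + rnding) >> bit per 32-bit lane (see its
+    // definition above), so each pair of calls in a stage is one rounded
+    // butterfly rotation; cospi[k] from cospi_arr() is approximately
+    // 2^bit * cos(k * PI / 128).
+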
v[13] = + av1_half_btf_avx2(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); + v[14] = av1_half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); + v[15] = av1_half_btf_avx2(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); + + // stage 9 + out[0 * outstride + col] = v[1]; + out[1 * outstride + col] = v[14]; + out[2 * outstride + col] = v[3]; + out[3 * outstride + col] = v[12]; + out[4 * outstride + col] = v[5]; + out[5 * outstride + col] = v[10]; + out[6 * outstride + col] = v[7]; + out[7 * outstride + col] = v[8]; + out[8 * outstride + col] = v[9]; + out[9 * outstride + col] = v[6]; + out[10 * outstride + col] = v[11]; + out[11 * outstride + col] = v[4]; + out[12 * outstride + col] = v[13]; + out[13 * outstride + col] = v[2]; + out[14 * outstride + col] = v[15]; + out[15 * outstride + col] = v[0]; + } +} +static void idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit, + int col_num, const int outstride) { + (void)bit; + (void)outstride; + __m256i fact = _mm256_set1_epi32(2 * NewSqrt2); + __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m256i a_low; + + int num_iters = 16 * col_num; + for (int i = 0; i < num_iters; i++) { + a_low = _mm256_mullo_epi32(in[i], fact); + a_low = _mm256_add_epi32(a_low, offset); + out[i] = _mm256_srai_epi32(a_low, NewSqrt2Bits); + } +} +static const transform_1d_avx2 col_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16_avx2, // DCT_DCT + fadst16_avx2, // ADST_DCT + fdct16_avx2, // DCT_ADST + fadst16_avx2, // ADST_ADST + fadst16_avx2, // FLIPADST_DCT + fdct16_avx2, // DCT_FLIPADST + fadst16_avx2, // FLIPADST_FLIPADST + fadst16_avx2, // ADST_FLIPADST + fadst16_avx2, // FLIPADST_ADST + idtx16_avx2, // IDTX + fdct16_avx2, // V_DCT + idtx16_avx2, // H_DCT + fadst16_avx2, // V_ADST + idtx16_avx2, // H_ADST + fadst16_avx2, // V_FLIPADST + idtx16_avx2 // H_FLIPADST +}; +static const transform_1d_avx2 row_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8_avx2, // DCT_DCT + fdct8_avx2, // ADST_DCT + fadst8_avx2, // DCT_ADST + fadst8_avx2, // ADST_ADST + fdct8_avx2, // FLIPADST_DCT + fadst8_avx2, // DCT_FLIPADST + fadst8_avx2, // FLIPADST_FLIPADST + fadst8_avx2, // ADST_FLIPADST + fadst8_avx2, // FLIPADST_ADST + idtx8_avx2, // IDTX + idtx8_avx2, // V_DCT + fdct8_avx2, // H_DCT + idtx8_avx2, // V_ADST + fadst8_avx2, // H_ADST + idtx8_avx2, // V_FLIPADST + fadst8_avx2 // H_FLIPADST +}; +void av1_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[16], out[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const transform_1d_avx2 col_txfm = col_highbd_txfm8x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_highbd_txfm8x8_arr[tx_type]; + const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_8x16_avx2(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, out, bit, 1, 1); + col_txfm_8x8_rounding(out, -shift[1]); + col_txfm_8x8_rounding(&out[8], -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, 1, 2); + fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2); + row_txfm(in, out, bit, 2, 2); + fwd_txfm_transpose_8x8_avx2(out, in, 2, 1); + fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1); + av1_round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2); + store_buffer_avx2(in, coeff, 8, 16); + (void)bd; +} +static const transform_1d_avx2 col_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8_avx2, // 
DCT_DCT + fadst8_avx2, // ADST_DCT + fdct8_avx2, // DCT_ADST + fadst8_avx2, // ADST_ADST + fadst8_avx2, // FLIPADST_DCT + fdct8_avx2, // DCT_FLIPADST + fadst8_avx2, // FLIPADST_FLIPADST + fadst8_avx2, // ADST_FLIPADST + fadst8_avx2, // FLIPADST_ADST + idtx8_avx2, // IDTX + fdct8_avx2, // V_DCT + idtx8_avx2, // H_DCT + fadst8_avx2, // V_ADST + idtx8_avx2, // H_ADST + fadst8_avx2, // V_FLIPADST + idtx8_avx2 // H_FLIPADST +}; +static const transform_1d_avx2 row_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16_avx2, // DCT_DCT + fdct16_avx2, // ADST_DCT + fadst16_avx2, // DCT_ADST + fadst16_avx2, // ADST_ADST + fdct16_avx2, // FLIPADST_DCT + fadst16_avx2, // DCT_FLIPADST + fadst16_avx2, // FLIPADST_FLIPADST + fadst16_avx2, // ADST_FLIPADST + fadst16_avx2, // FLIPADST_ADST + idtx16_avx2, // IDTX + idtx16_avx2, // V_DCT + fdct16_avx2, // H_DCT + idtx16_avx2, // V_ADST + fadst16_avx2, // H_ADST + idtx16_avx2, // V_FLIPADST + fadst16_avx2 // H_FLIPADST +}; +void av1_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[16], out[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const transform_1d_avx2 col_txfm = col_highbd_txfm8x8_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_16xn_avx2(input, in, stride, 8, 2, ud_flip, lr_flip); + round_shift_32_8xn_avx2(in, 16, shift[0], 1); + col_txfm(in, out, bit, 2, 2); + round_shift_32_8xn_avx2(out, 16, shift[1], 1); + fwd_txfm_transpose_8x8_avx2(out, in, 2, 1); + fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1); + row_txfm(in, out, bit, 1, 1); + fwd_txfm_transpose_8x8_avx2(out, in, 1, 2); + fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2); + av1_round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2); + store_buffer_avx2(in, coeff, 8, 16); + (void)bd; +} +void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[32], out[32]; + const TX_SIZE tx_size = TX_16X16; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const int width_div8 = (width >> 3); + const int width_div16 = (width >> 4); + const int size = (height << 1); + switch (tx_type) { + case DCT_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case ADST_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + 
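// Every case in this switch runs the same pipeline: load (with optional
+      // flips), scale by shift[0], column transform at cos_bit_col, scale by
+      // shift[1], transpose, row transform at cos_bit_row, and the transpose
+      // below to restore natural order before the store; only the cases whose
+      // row pass is idtx16_avx2 (IDTX, V_DCT, V_ADST, V_FLIPADST) skip the
+      // two transposes.
+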
fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case DCT_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case ADST_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case FLIPADST_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case DCT_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case FLIPADST_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 1); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case ADST_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case FLIPADST_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + 
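// FLIPADST needs no dedicated kernel here: the FLIPADST_* / *_FLIPADST
+      // cases run the regular fadst16_avx2 and realize the flip entirely in
+      // load_buffer_16xn_avx2, whose flipud/fliplr flags reverse the row
+      // order and/or each row's 16 samples (mm_reverse_epi16 plus a 128-bit
+      // lane swap) before the input is widened to 32 bits.
+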
fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case IDTX: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + idtx16_avx2(out, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(in, coeff, 8, 32); + break; + case V_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + idtx16_avx2(out, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(in, coeff, 8, 32); + break; + case H_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case V_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + idtx16_avx2(out, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(in, coeff, 8, 32); + break; + case H_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case V_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + idtx16_avx2(out, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(in, coeff, 8, 32); + break; + case H_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + default: assert(0); + } + (void)bd; +} +static 
INLINE void fdct32_avx2(__m256i *input, __m256i *output, + const int8_t cos_bit, const int instride, + const int outstride) { + __m256i buf0[32]; + __m256i buf1[32]; + const int32_t *cospi; + int startidx = 0 * instride; + int endidx = 31 * instride; + // stage 0 + // stage 1 + buf1[0] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[31] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[1] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[30] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[2] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[29] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[3] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[28] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[4] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[27] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[5] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[26] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[6] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[25] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[7] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[24] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[8] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[23] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[9] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[22] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[10] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[21] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[11] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[20] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[12] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[19] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[13] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[18] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[14] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[17] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[15] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[16] = _mm256_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm256_add_epi32(buf1[0], buf1[15]); + buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]); + buf0[1] = _mm256_add_epi32(buf1[1], buf1[14]); + buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]); + buf0[2] = _mm256_add_epi32(buf1[2], buf1[13]); + buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]); + buf0[3] = _mm256_add_epi32(buf1[3], buf1[12]); + buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]); + buf0[4] = _mm256_add_epi32(buf1[4], buf1[11]); + buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]); + buf0[5] = _mm256_add_epi32(buf1[5], 
buf1[10]); + buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]); + buf0[6] = _mm256_add_epi32(buf1[6], buf1[9]); + buf0[9] = _mm256_sub_epi32(buf1[6], buf1[9]); + buf0[7] = _mm256_add_epi32(buf1[7], buf1[8]); + buf0[8] = _mm256_sub_epi32(buf1[7], buf1[8]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + buf0[28] = buf1[28]; + buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + cospi = cospi_arr(cos_bit); + buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]); + buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]); + buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]); + buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]); + buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]); + buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]); + buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]); + buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + btf_32_avx2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]); + buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]); + buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]); + buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]); + buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]); + buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]); + buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]); + buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]); + buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]); + buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]); + buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]); + buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]); + buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]); + buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]); + buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]); + buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]); + + // stage 4 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]); + buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]); + buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]); + buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]); + buf0[4] = buf1[4]; + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[7] = buf1[7]; + buf0[8] = _mm256_add_epi32(buf1[8], buf1[11]); + buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]); + buf0[9] = _mm256_add_epi32(buf1[9], buf1[10]); + buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]); + buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]); + buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]); + buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]); + buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + btf_32_avx2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_avx2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + buf0[22] = 
buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 5 + cospi = cospi_arr(cos_bit); + btf_32_avx2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], + cos_bit); + btf_32_avx2_type0(cospi[16], cospi[48], buf0[3], buf0[2], buf1[2], buf1[3], + cos_bit); + buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]); + buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]); + buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]); + buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]); + buf1[8] = buf0[8]; + btf_32_avx2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14], + cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]); + buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]); + buf1[17] = _mm256_add_epi32(buf0[17], buf0[18]); + buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]); + buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]); + buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]); + buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]); + buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]); + buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]); + buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]); + buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]); + buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]); + buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]); + buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]); + buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]); + buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]); + + // stage 6 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + btf_32_avx2_type0(cospi[8], cospi[56], buf1[7], buf1[4], buf0[4], buf0[7], + cos_bit); + btf_32_avx2_type0(cospi[40], cospi[24], buf1[6], buf1[5], buf0[5], buf0[6], + cos_bit); + buf0[8] = _mm256_add_epi32(buf1[8], buf1[9]); + buf0[9] = _mm256_sub_epi32(buf1[8], buf1[9]); + buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]); + buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]); + buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]); + buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]); + buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]); + buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]); + buf0[16] = buf1[16]; + btf_32_avx2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_avx2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + btf_32_avx2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_avx2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + cospi = cospi_arr(cos_bit); + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + buf1[7] = buf0[7]; + btf_32_avx2_type0(cospi[4], cospi[60], buf0[15], buf0[8], buf1[8], buf1[15], + cos_bit); + btf_32_avx2_type0(cospi[36], cospi[28], buf0[14], buf0[9], buf1[9], buf1[14], + cos_bit); + btf_32_avx2_type0(cospi[20], cospi[44], buf0[13], buf0[10], buf1[10], + buf1[13], cos_bit); + btf_32_avx2_type0(cospi[52], cospi[12], buf0[12], buf0[11], buf1[11], + buf1[12], cos_bit); + buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]); + 
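/*
 * For reference: each btf_32_avx2_type0(w0, w1, in0, in1, out0, out1,
 * cos_bit) call in these stages is a fixed-point butterfly rotation: every
 * output is a weighted sum or difference of the two inputs, rounded and
 * shifted back down by cos_bit. In the equal-weight form that
 * fdct4x4_sse4_1() later in this patch expands inline, the pair reduces to
 *
 *   const int32_t r = 1 << (cos_bit - 1);
 *   out0 = (int32_t)(((int64_t)cospi32 * (in0 + in1) + r) >> cos_bit);
 *   out1 = (int32_t)(((int64_t)cospi32 * (in0 - in1) + r) >> cos_bit);
 *
 * while unequal (w0, w1) pairs such as (-cospi[16], cospi[48]) realize the
 * corresponding cos/sin rotations of the DCT flow graph.
 */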
buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]); + buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]); + buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]); + buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]); + buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]); + buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]); + buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]); + buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]); + buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]); + buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]); + buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]); + buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]); + buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]); + buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]); + buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]); + + // stage 8 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + btf_32_avx2_type0(cospi[2], cospi[62], buf1[31], buf1[16], buf0[16], buf0[31], + cos_bit); + btf_32_avx2_type0(cospi[34], cospi[30], buf1[30], buf1[17], buf0[17], + buf0[30], cos_bit); + btf_32_avx2_type0(cospi[18], cospi[46], buf1[29], buf1[18], buf0[18], + buf0[29], cos_bit); + btf_32_avx2_type0(cospi[50], cospi[14], buf1[28], buf1[19], buf0[19], + buf0[28], cos_bit); + btf_32_avx2_type0(cospi[10], cospi[54], buf1[27], buf1[20], buf0[20], + buf0[27], cos_bit); + btf_32_avx2_type0(cospi[42], cospi[22], buf1[26], buf1[21], buf0[21], + buf0[26], cos_bit); + btf_32_avx2_type0(cospi[26], cospi[38], buf1[25], buf1[22], buf0[22], + buf0[25], cos_bit); + btf_32_avx2_type0(cospi[58], cospi[6], buf1[24], buf1[23], buf0[23], buf0[24], + cos_bit); + + startidx = 0 * outstride; + endidx = 31 * outstride; + // stage 9 + output[startidx] = buf0[0]; + output[endidx] = buf0[31]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[16]; + output[endidx] = buf0[15]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[8]; + output[endidx] = buf0[23]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[24]; + output[endidx] = buf0[7]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[4]; + output[endidx] = buf0[27]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[20]; + output[endidx] = buf0[11]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[12]; + output[endidx] = buf0[19]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[28]; + output[endidx] = buf0[3]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[2]; + output[endidx] = buf0[29]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[18]; + output[endidx] = buf0[13]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[10]; + output[endidx] = buf0[21]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[26]; + output[endidx] = buf0[5]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[6]; + output[endidx] = buf0[25]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[22]; + output[endidx] = buf0[9]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[14]; + output[endidx] = buf0[17]; + startidx += 
outstride; + endidx -= outstride; + output[startidx] = buf0[30]; + output[endidx] = buf0[1]; +} +static INLINE void idtx32x32_avx2(__m256i *input, __m256i *output, + const int8_t cos_bit, int instride, + int outstride) { + (void)cos_bit; + for (int i = 0; i < 32; i += 8) { + output[i * outstride] = _mm256_slli_epi32(input[i * instride], 2); + output[(i + 1) * outstride] = + _mm256_slli_epi32(input[(i + 1) * instride], 2); + output[(i + 2) * outstride] = + _mm256_slli_epi32(input[(i + 2) * instride], 2); + output[(i + 3) * outstride] = + _mm256_slli_epi32(input[(i + 3) * instride], 2); + output[(i + 4) * outstride] = + _mm256_slli_epi32(input[(i + 4) * instride], 2); + output[(i + 5) * outstride] = + _mm256_slli_epi32(input[(i + 5) * instride], 2); + output[(i + 6) * outstride] = + _mm256_slli_epi32(input[(i + 6) * instride], 2); + output[(i + 7) * outstride] = + _mm256_slli_epi32(input[(i + 7) * instride], 2); + } +} +static const transform_1d_avx2 col_txfm8x32_arr[TX_TYPES] = { + fdct32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx32x32_avx2, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; +static const transform_1d_avx2 row_txfm8x32_arr[TX_TYPES] = { + fdct32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx32x32_avx2, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; +void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m256i buf0[128], buf1[128]; + const int tx_size = TX_32X32; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm8x32_arr[tx_type]; + int r, c; + const int width_div16 = (width >> 4); + const int width_div8 = (width >> 3); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16xn_avx2(input + (i << 4), &buf0[(i << 1)], stride, height, + width_div8, 0, 0); + round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[0], width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8); + col_txfm(&buf0[(i << 1)], &buf0[(i << 1)], cos_bit_col, width_div8, + width_div8); + col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8, + width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[1], width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8); + } + + for (r = 0; r < height; r += 8) { + for (c = 0; c < width_div8; c++) { + fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c], + &buf1[c * 8 * width_div8 + (r >> 3)], + width_div8, width_div8); + } + } + + for (int i = 0; i < width_div16; i++) { + row_txfm(&buf1[(i << 1)], &buf1[(i << 1)], cos_bit_row, width_div8, + width_div8); + row_txfm(&buf1[(i << 1) + 1], &buf1[(i << 1) + 1], 
cos_bit_row, width_div8, + width_div8); + round_shift_32_8xn_avx2(&buf1[(i << 1)], height, shift[2], width_div8); + round_shift_32_8xn_avx2(&buf1[(i << 1) + 1], height, shift[2], width_div8); + } + + for (r = 0; r < height; r += 8) { + for (c = 0; c < width_div8; c++) { + fwd_txfm_transpose_8x8_avx2(&buf1[r * width_div8 + c], + &buf0[c * 8 * width_div8 + (r >> 3)], + width_div8, width_div8); + } + } + + store_buffer_avx2(buf0, output, 8, 128); +} +static INLINE void fdct64_stage2_avx2(__m256i *x1, __m256i *x2, + __m256i *cospi_m32, __m256i *cospi_p32, + const __m256i *__rounding, + int8_t cos_bit) { + x2[0] = _mm256_add_epi32(x1[0], x1[31]); + x2[31] = _mm256_sub_epi32(x1[0], x1[31]); + x2[1] = _mm256_add_epi32(x1[1], x1[30]); + x2[30] = _mm256_sub_epi32(x1[1], x1[30]); + x2[2] = _mm256_add_epi32(x1[2], x1[29]); + x2[29] = _mm256_sub_epi32(x1[2], x1[29]); + x2[3] = _mm256_add_epi32(x1[3], x1[28]); + x2[28] = _mm256_sub_epi32(x1[3], x1[28]); + x2[4] = _mm256_add_epi32(x1[4], x1[27]); + x2[27] = _mm256_sub_epi32(x1[4], x1[27]); + x2[5] = _mm256_add_epi32(x1[5], x1[26]); + x2[26] = _mm256_sub_epi32(x1[5], x1[26]); + x2[6] = _mm256_add_epi32(x1[6], x1[25]); + x2[25] = _mm256_sub_epi32(x1[6], x1[25]); + x2[7] = _mm256_add_epi32(x1[7], x1[24]); + x2[24] = _mm256_sub_epi32(x1[7], x1[24]); + x2[8] = _mm256_add_epi32(x1[8], x1[23]); + x2[23] = _mm256_sub_epi32(x1[8], x1[23]); + x2[9] = _mm256_add_epi32(x1[9], x1[22]); + x2[22] = _mm256_sub_epi32(x1[9], x1[22]); + x2[10] = _mm256_add_epi32(x1[10], x1[21]); + x2[21] = _mm256_sub_epi32(x1[10], x1[21]); + x2[11] = _mm256_add_epi32(x1[11], x1[20]); + x2[20] = _mm256_sub_epi32(x1[11], x1[20]); + x2[12] = _mm256_add_epi32(x1[12], x1[19]); + x2[19] = _mm256_sub_epi32(x1[12], x1[19]); + x2[13] = _mm256_add_epi32(x1[13], x1[18]); + x2[18] = _mm256_sub_epi32(x1[13], x1[18]); + x2[14] = _mm256_add_epi32(x1[14], x1[17]); + x2[17] = _mm256_sub_epi32(x1[14], x1[17]); + x2[15] = _mm256_add_epi32(x1[15], x1[16]); + x2[16] = _mm256_sub_epi32(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[40], x1[55], x2[40], x2[55], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[41], x1[54], x2[41], x2[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[42], x1[53], x2[42], x2[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[43], x1[52], x2[43], x2[52], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[44], x1[51], x2[44], x2[51], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[45], x1[50], x2[45], x2[50], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[46], x1[49], x2[46], x2[49], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[47], x1[48], x2[47], x2[48], + *__rounding, cos_bit); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; +} +static INLINE void fdct64_stage3_avx2(__m256i *x2, __m256i *x3, + __m256i *cospi_m32, __m256i *cospi_p32, + const __m256i *__rounding, + int8_t cos_bit) { + x3[0] = _mm256_add_epi32(x2[0], x2[15]); + x3[15] = _mm256_sub_epi32(x2[0], x2[15]); + x3[1] = _mm256_add_epi32(x2[1], x2[14]); + x3[14] = _mm256_sub_epi32(x2[1], x2[14]); + x3[2] = _mm256_add_epi32(x2[2], x2[13]); + x3[13] = 
_mm256_sub_epi32(x2[2], x2[13]); + x3[3] = _mm256_add_epi32(x2[3], x2[12]); + x3[12] = _mm256_sub_epi32(x2[3], x2[12]); + x3[4] = _mm256_add_epi32(x2[4], x2[11]); + x3[11] = _mm256_sub_epi32(x2[4], x2[11]); + x3[5] = _mm256_add_epi32(x2[5], x2[10]); + x3[10] = _mm256_sub_epi32(x2[5], x2[10]); + x3[6] = _mm256_add_epi32(x2[6], x2[9]); + x3[9] = _mm256_sub_epi32(x2[6], x2[9]); + x3[7] = _mm256_add_epi32(x2[7], x2[8]); + x3[8] = _mm256_sub_epi32(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[20], x2[27], x3[20], x3[27], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[21], x2[26], x3[21], x3[26], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[22], x2[25], x3[22], x3[25], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[23], x2[24], x3[23], x3[24], + *__rounding, cos_bit); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm256_add_epi32(x2[32], x2[47]); + x3[47] = _mm256_sub_epi32(x2[32], x2[47]); + x3[33] = _mm256_add_epi32(x2[33], x2[46]); + x3[46] = _mm256_sub_epi32(x2[33], x2[46]); + x3[34] = _mm256_add_epi32(x2[34], x2[45]); + x3[45] = _mm256_sub_epi32(x2[34], x2[45]); + x3[35] = _mm256_add_epi32(x2[35], x2[44]); + x3[44] = _mm256_sub_epi32(x2[35], x2[44]); + x3[36] = _mm256_add_epi32(x2[36], x2[43]); + x3[43] = _mm256_sub_epi32(x2[36], x2[43]); + x3[37] = _mm256_add_epi32(x2[37], x2[42]); + x3[42] = _mm256_sub_epi32(x2[37], x2[42]); + x3[38] = _mm256_add_epi32(x2[38], x2[41]); + x3[41] = _mm256_sub_epi32(x2[38], x2[41]); + x3[39] = _mm256_add_epi32(x2[39], x2[40]); + x3[40] = _mm256_sub_epi32(x2[39], x2[40]); + x3[48] = _mm256_sub_epi32(x2[63], x2[48]); + x3[63] = _mm256_add_epi32(x2[63], x2[48]); + x3[49] = _mm256_sub_epi32(x2[62], x2[49]); + x3[62] = _mm256_add_epi32(x2[62], x2[49]); + x3[50] = _mm256_sub_epi32(x2[61], x2[50]); + x3[61] = _mm256_add_epi32(x2[61], x2[50]); + x3[51] = _mm256_sub_epi32(x2[60], x2[51]); + x3[60] = _mm256_add_epi32(x2[60], x2[51]); + x3[52] = _mm256_sub_epi32(x2[59], x2[52]); + x3[59] = _mm256_add_epi32(x2[59], x2[52]); + x3[53] = _mm256_sub_epi32(x2[58], x2[53]); + x3[58] = _mm256_add_epi32(x2[58], x2[53]); + x3[54] = _mm256_sub_epi32(x2[57], x2[54]); + x3[57] = _mm256_add_epi32(x2[57], x2[54]); + x3[55] = _mm256_sub_epi32(x2[56], x2[55]); + x3[56] = _mm256_add_epi32(x2[56], x2[55]); +} +static INLINE void fdct64_stage4_avx2(__m256i *x3, __m256i *x4, + __m256i *cospi_m32, __m256i *cospi_p32, + __m256i *cospi_m16, __m256i *cospi_p48, + __m256i *cospi_m48, + const __m256i *__rounding, + int8_t cos_bit) { + x4[0] = _mm256_add_epi32(x3[0], x3[7]); + x4[7] = _mm256_sub_epi32(x3[0], x3[7]); + x4[1] = _mm256_add_epi32(x3[1], x3[6]); + x4[6] = _mm256_sub_epi32(x3[1], x3[6]); + x4[2] = _mm256_add_epi32(x3[2], x3[5]); + x4[5] = _mm256_sub_epi32(x3[2], x3[5]); + x4[3] = _mm256_add_epi32(x3[3], x3[4]); + x4[4] = _mm256_sub_epi32(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[10], x3[13], x4[10], x4[13], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[11], x3[12], x4[11], x4[12], + *__rounding, cos_bit); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm256_add_epi32(x3[16], x3[23]); + x4[23] = _mm256_sub_epi32(x3[16], x3[23]); + x4[17] = _mm256_add_epi32(x3[17], x3[22]); + x4[22] = _mm256_sub_epi32(x3[17], x3[22]); + x4[18] = _mm256_add_epi32(x3[18], x3[21]); + x4[21] = 
_mm256_sub_epi32(x3[18], x3[21]); + x4[19] = _mm256_add_epi32(x3[19], x3[20]); + x4[20] = _mm256_sub_epi32(x3[19], x3[20]); + x4[24] = _mm256_sub_epi32(x3[31], x3[24]); + x4[31] = _mm256_add_epi32(x3[31], x3[24]); + x4[25] = _mm256_sub_epi32(x3[30], x3[25]); + x4[30] = _mm256_add_epi32(x3[30], x3[25]); + x4[26] = _mm256_sub_epi32(x3[29], x3[26]); + x4[29] = _mm256_add_epi32(x3[29], x3[26]); + x4[27] = _mm256_sub_epi32(x3[28], x3[27]); + x4[28] = _mm256_add_epi32(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[36], x3[59], x4[36], x4[59], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[37], x3[58], x4[37], x4[58], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[38], x3[57], x4[38], x4[57], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[39], x3[56], x4[39], x4[56], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[40], x3[55], x4[40], x4[55], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[41], x3[54], x4[41], x4[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[42], x3[53], x4[42], x4[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[43], x3[52], x4[43], x4[52], + *__rounding, cos_bit); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; +} +static INLINE void fdct64_stage5_avx2(__m256i *x4, __m256i *x5, + __m256i *cospi_m32, __m256i *cospi_p32, + __m256i *cospi_m16, __m256i *cospi_p48, + __m256i *cospi_m48, + const __m256i *__rounding, + int8_t cos_bit) { + x5[0] = _mm256_add_epi32(x4[0], x4[3]); + x5[3] = _mm256_sub_epi32(x4[0], x4[3]); + x5[1] = _mm256_add_epi32(x4[1], x4[2]); + x5[2] = _mm256_sub_epi32(x4[1], x4[2]); + x5[4] = x4[4]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x4[5], x4[6], x5[5], x5[6], + *__rounding, cos_bit); + x5[7] = x4[7]; + x5[8] = _mm256_add_epi32(x4[8], x4[11]); + x5[11] = _mm256_sub_epi32(x4[8], x4[11]); + x5[9] = _mm256_add_epi32(x4[9], x4[10]); + x5[10] = _mm256_sub_epi32(x4[9], x4[10]); + x5[12] = _mm256_sub_epi32(x4[15], x4[12]); + x5[15] = _mm256_add_epi32(x4[15], x4[12]); + x5[13] = _mm256_sub_epi32(x4[14], x4[13]); + x5[14] = _mm256_add_epi32(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[18], x4[29], x5[18], x5[29], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[19], x4[28], x5[19], x5[28], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[20], x4[27], x5[20], x5[27], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[21], x4[26], x5[21], x5[26], + *__rounding, cos_bit); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm256_add_epi32(x4[32], x4[39]); + x5[39] = _mm256_sub_epi32(x4[32], x4[39]); + x5[33] = _mm256_add_epi32(x4[33], x4[38]); + x5[38] = _mm256_sub_epi32(x4[33], x4[38]); + x5[34] = _mm256_add_epi32(x4[34], x4[37]); + x5[37] = _mm256_sub_epi32(x4[34], x4[37]); + x5[35] = _mm256_add_epi32(x4[35], x4[36]); + x5[36] = _mm256_sub_epi32(x4[35], x4[36]); + x5[40] = _mm256_sub_epi32(x4[47], x4[40]); + x5[47] = _mm256_add_epi32(x4[47], x4[40]); + x5[41] = 
_mm256_sub_epi32(x4[46], x4[41]); + x5[46] = _mm256_add_epi32(x4[46], x4[41]); + x5[42] = _mm256_sub_epi32(x4[45], x4[42]); + x5[45] = _mm256_add_epi32(x4[45], x4[42]); + x5[43] = _mm256_sub_epi32(x4[44], x4[43]); + x5[44] = _mm256_add_epi32(x4[44], x4[43]); + x5[48] = _mm256_add_epi32(x4[48], x4[55]); + x5[55] = _mm256_sub_epi32(x4[48], x4[55]); + x5[49] = _mm256_add_epi32(x4[49], x4[54]); + x5[54] = _mm256_sub_epi32(x4[49], x4[54]); + x5[50] = _mm256_add_epi32(x4[50], x4[53]); + x5[53] = _mm256_sub_epi32(x4[50], x4[53]); + x5[51] = _mm256_add_epi32(x4[51], x4[52]); + x5[52] = _mm256_sub_epi32(x4[51], x4[52]); + x5[56] = _mm256_sub_epi32(x4[63], x4[56]); + x5[63] = _mm256_add_epi32(x4[63], x4[56]); + x5[57] = _mm256_sub_epi32(x4[62], x4[57]); + x5[62] = _mm256_add_epi32(x4[62], x4[57]); + x5[58] = _mm256_sub_epi32(x4[61], x4[58]); + x5[61] = _mm256_add_epi32(x4[61], x4[58]); + x5[59] = _mm256_sub_epi32(x4[60], x4[59]); + x5[60] = _mm256_add_epi32(x4[60], x4[59]); +} +static INLINE void fdct64_stage6_avx2( + __m256i *x5, __m256i *x6, __m256i *cospi_p16, __m256i *cospi_p32, + __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48, + __m256i *cospi_m08, __m256i *cospi_p56, __m256i *cospi_m56, + __m256i *cospi_m40, __m256i *cospi_p24, __m256i *cospi_m24, + const __m256i *__rounding, int8_t cos_bit) { + btf_32_type0_avx2_new(*cospi_p32, *cospi_p32, x5[0], x5[1], x6[0], x6[1], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_p16, *cospi_p48, x5[3], x5[2], x6[2], x6[3], + *__rounding, cos_bit); + x6[4] = _mm256_add_epi32(x5[4], x5[5]); + x6[5] = _mm256_sub_epi32(x5[4], x5[5]); + x6[6] = _mm256_sub_epi32(x5[7], x5[6]); + x6[7] = _mm256_add_epi32(x5[7], x5[6]); + x6[8] = x5[8]; + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x5[9], x5[14], x6[9], x6[14], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x5[10], x5[13], x6[10], x6[13], + *__rounding, cos_bit); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm256_add_epi32(x5[16], x5[19]); + x6[19] = _mm256_sub_epi32(x5[16], x5[19]); + x6[17] = _mm256_add_epi32(x5[17], x5[18]); + x6[18] = _mm256_sub_epi32(x5[17], x5[18]); + x6[20] = _mm256_sub_epi32(x5[23], x5[20]); + x6[23] = _mm256_add_epi32(x5[23], x5[20]); + x6[21] = _mm256_sub_epi32(x5[22], x5[21]); + x6[22] = _mm256_add_epi32(x5[22], x5[21]); + x6[24] = _mm256_add_epi32(x5[24], x5[27]); + x6[27] = _mm256_sub_epi32(x5[24], x5[27]); + x6[25] = _mm256_add_epi32(x5[25], x5[26]); + x6[26] = _mm256_sub_epi32(x5[25], x5[26]); + x6[28] = _mm256_sub_epi32(x5[31], x5[28]); + x6[31] = _mm256_add_epi32(x5[31], x5[28]); + x6[29] = _mm256_sub_epi32(x5[30], x5[29]); + x6[30] = _mm256_add_epi32(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[34], x5[61], x6[34], x6[61], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[35], x5[60], x6[35], x6[60], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[36], x5[59], x6[36], x6[59], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[37], x5[58], x6[37], x6[58], + *__rounding, cos_bit); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[42], x5[53], x6[42], x6[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[43], x5[52], x6[43], x6[52], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[44], x5[51], x6[44], x6[51], + *__rounding, cos_bit); + 
btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[45], x5[50], x6[45], x6[50], + *__rounding, cos_bit); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; +} +static INLINE void fdct64_stage7_avx2(__m256i *x6, __m256i *x7, + __m256i *cospi_p08, __m256i *cospi_p56, + __m256i *cospi_p40, __m256i *cospi_p24, + __m256i *cospi_m08, __m256i *cospi_m56, + __m256i *cospi_m40, __m256i *cospi_m24, + const __m256i *__rounding, + int8_t cos_bit) { + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_32_type0_avx2_new(*cospi_p08, *cospi_p56, x6[7], x6[4], x7[4], x7[7], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_p40, *cospi_p24, x6[6], x6[5], x7[5], x7[6], + *__rounding, cos_bit); + x7[8] = _mm256_add_epi32(x6[8], x6[9]); + x7[9] = _mm256_sub_epi32(x6[8], x6[9]); + x7[10] = _mm256_sub_epi32(x6[11], x6[10]); + x7[11] = _mm256_add_epi32(x6[11], x6[10]); + x7[12] = _mm256_add_epi32(x6[12], x6[13]); + x7[13] = _mm256_sub_epi32(x6[12], x6[13]); + x7[14] = _mm256_sub_epi32(x6[15], x6[14]); + x7[15] = _mm256_add_epi32(x6[15], x6[14]); + x7[16] = x6[16]; + btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x6[17], x6[30], x7[17], x7[30], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x6[18], x6[29], x7[18], x7[29], + *__rounding, cos_bit); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x6[21], x6[26], x7[21], x7[26], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x6[22], x6[25], x7[22], x7[25], + *__rounding, cos_bit); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm256_add_epi32(x6[32], x6[35]); + x7[35] = _mm256_sub_epi32(x6[32], x6[35]); + x7[33] = _mm256_add_epi32(x6[33], x6[34]); + x7[34] = _mm256_sub_epi32(x6[33], x6[34]); + x7[36] = _mm256_sub_epi32(x6[39], x6[36]); + x7[39] = _mm256_add_epi32(x6[39], x6[36]); + x7[37] = _mm256_sub_epi32(x6[38], x6[37]); + x7[38] = _mm256_add_epi32(x6[38], x6[37]); + x7[40] = _mm256_add_epi32(x6[40], x6[43]); + x7[43] = _mm256_sub_epi32(x6[40], x6[43]); + x7[41] = _mm256_add_epi32(x6[41], x6[42]); + x7[42] = _mm256_sub_epi32(x6[41], x6[42]); + x7[44] = _mm256_sub_epi32(x6[47], x6[44]); + x7[47] = _mm256_add_epi32(x6[47], x6[44]); + x7[45] = _mm256_sub_epi32(x6[46], x6[45]); + x7[46] = _mm256_add_epi32(x6[46], x6[45]); + x7[48] = _mm256_add_epi32(x6[48], x6[51]); + x7[51] = _mm256_sub_epi32(x6[48], x6[51]); + x7[49] = _mm256_add_epi32(x6[49], x6[50]); + x7[50] = _mm256_sub_epi32(x6[49], x6[50]); + x7[52] = _mm256_sub_epi32(x6[55], x6[52]); + x7[55] = _mm256_add_epi32(x6[55], x6[52]); + x7[53] = _mm256_sub_epi32(x6[54], x6[53]); + x7[54] = _mm256_add_epi32(x6[54], x6[53]); + x7[56] = _mm256_add_epi32(x6[56], x6[59]); + x7[59] = _mm256_sub_epi32(x6[56], x6[59]); + x7[57] = _mm256_add_epi32(x6[57], x6[58]); + x7[58] = _mm256_sub_epi32(x6[57], x6[58]); + x7[60] = _mm256_sub_epi32(x6[63], x6[60]); + x7[63] = _mm256_add_epi32(x6[63], x6[60]); + x7[61] = _mm256_sub_epi32(x6[62], x6[61]); + x7[62] = _mm256_add_epi32(x6[62], x6[61]); +} +static INLINE void fdct64_stage8_avx2(__m256i *x7, __m256i *x8, + const int32_t *cospi, + const __m256i *__rounding, + int8_t cos_bit) { + __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]); + __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]); + __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]); + __m256i cospi_p36 = 
_mm256_set1_epi32(cospi[36]); + __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]); + __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]); + __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]); + __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]); + __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]); + __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]); + __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]); + __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]); + __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]); + __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]); + __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]); + __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]); + + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + + btf_32_type0_avx2_new(cospi_p04, cospi_p60, x7[15], x7[8], x8[8], x8[15], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p36, cospi_p28, x7[14], x7[9], x8[9], x8[14], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p20, cospi_p44, x7[13], x7[10], x8[10], x8[13], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p52, cospi_p12, x7[12], x7[11], x8[11], x8[12], + *__rounding, cos_bit); + x8[16] = _mm256_add_epi32(x7[16], x7[17]); + x8[17] = _mm256_sub_epi32(x7[16], x7[17]); + x8[18] = _mm256_sub_epi32(x7[19], x7[18]); + x8[19] = _mm256_add_epi32(x7[19], x7[18]); + x8[20] = _mm256_add_epi32(x7[20], x7[21]); + x8[21] = _mm256_sub_epi32(x7[20], x7[21]); + x8[22] = _mm256_sub_epi32(x7[23], x7[22]); + x8[23] = _mm256_add_epi32(x7[23], x7[22]); + x8[24] = _mm256_add_epi32(x7[24], x7[25]); + x8[25] = _mm256_sub_epi32(x7[24], x7[25]); + x8[26] = _mm256_sub_epi32(x7[27], x7[26]); + x8[27] = _mm256_add_epi32(x7[27], x7[26]); + x8[28] = _mm256_add_epi32(x7[28], x7[29]); + x8[29] = _mm256_sub_epi32(x7[28], x7[29]); + x8[30] = _mm256_sub_epi32(x7[31], x7[30]); + x8[31] = _mm256_add_epi32(x7[31], x7[30]); + x8[32] = x7[32]; + btf_32_type0_avx2_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61], + *__rounding, cos_bit); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_32_type0_avx2_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57], + *__rounding, cos_bit); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_32_type0_avx2_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53], + *__rounding, cos_bit); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_32_type0_avx2_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49], + *__rounding, cos_bit); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; +} +static INLINE void fdct64_stage9_avx2(__m256i *x8, __m256i *x9, + const int32_t *cospi, + const __m256i *__rounding, + int8_t cos_bit) { + __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]); + __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]); + __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]); + __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]); + __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]); + __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]); + __m256i cospi_p14 = 
_mm256_set1_epi32(cospi[14]); + __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]); + __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]); + __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]); + __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]); + __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]); + __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]); + __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]); + __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]); + __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]); + + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_32_type0_avx2_new(cospi_p02, cospi_p62, x8[31], x8[16], x9[16], x9[31], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p34, cospi_p30, x8[30], x8[17], x9[17], x9[30], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p18, cospi_p46, x8[29], x8[18], x9[18], x9[29], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p50, cospi_p14, x8[28], x8[19], x9[19], x9[28], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p10, cospi_p54, x8[27], x8[20], x9[20], x9[27], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p42, cospi_p22, x8[26], x8[21], x9[21], x9[26], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p26, cospi_p38, x8[25], x8[22], x9[22], x9[25], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p58, cospi_p06, x8[24], x8[23], x9[23], x9[24], + *__rounding, cos_bit); + x9[32] = _mm256_add_epi32(x8[32], x8[33]); + x9[33] = _mm256_sub_epi32(x8[32], x8[33]); + x9[34] = _mm256_sub_epi32(x8[35], x8[34]); + x9[35] = _mm256_add_epi32(x8[35], x8[34]); + x9[36] = _mm256_add_epi32(x8[36], x8[37]); + x9[37] = _mm256_sub_epi32(x8[36], x8[37]); + x9[38] = _mm256_sub_epi32(x8[39], x8[38]); + x9[39] = _mm256_add_epi32(x8[39], x8[38]); + x9[40] = _mm256_add_epi32(x8[40], x8[41]); + x9[41] = _mm256_sub_epi32(x8[40], x8[41]); + x9[42] = _mm256_sub_epi32(x8[43], x8[42]); + x9[43] = _mm256_add_epi32(x8[43], x8[42]); + x9[44] = _mm256_add_epi32(x8[44], x8[45]); + x9[45] = _mm256_sub_epi32(x8[44], x8[45]); + x9[46] = _mm256_sub_epi32(x8[47], x8[46]); + x9[47] = _mm256_add_epi32(x8[47], x8[46]); + x9[48] = _mm256_add_epi32(x8[48], x8[49]); + x9[49] = _mm256_sub_epi32(x8[48], x8[49]); + x9[50] = _mm256_sub_epi32(x8[51], x8[50]); + x9[51] = _mm256_add_epi32(x8[51], x8[50]); + x9[52] = _mm256_add_epi32(x8[52], x8[53]); + x9[53] = _mm256_sub_epi32(x8[52], x8[53]); + x9[54] = _mm256_sub_epi32(x8[55], x8[54]); + x9[55] = _mm256_add_epi32(x8[55], x8[54]); + x9[56] = _mm256_add_epi32(x8[56], x8[57]); + x9[57] = _mm256_sub_epi32(x8[56], x8[57]); + x9[58] = _mm256_sub_epi32(x8[59], x8[58]); + x9[59] = _mm256_add_epi32(x8[59], x8[58]); + x9[60] = _mm256_add_epi32(x8[60], x8[61]); + x9[61] = _mm256_sub_epi32(x8[60], x8[61]); + x9[62] = _mm256_sub_epi32(x8[63], x8[62]); + x9[63] = _mm256_add_epi32(x8[63], x8[62]); +} +static INLINE void fdct64_stage10_avx2(__m256i *x9, __m256i *x10, + const int32_t *cospi, + const __m256i *__rounding, + int8_t cos_bit) { + __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]); + __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]); + __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]); + __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]); + __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]); + __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]); + __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]); + 
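/*
 * All of these broadcast constants come from cospi_arr(cos_bit), libaom's
 * fixed-point cosine table. Assuming the av1_txfm.h definition, each entry
 * is approximately
 *
 *   cospi[j] = (int32_t)round(cos(j * M_PI / 128) * (1 << cos_bit));
 *
 * e.g. cospi[32] is cos(PI/4) scaled, which is why the same value serves as
 * both weights of the DC butterfly in stage 6.
 */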
__m256i cospi_p49 = _mm256_set1_epi32(cospi[49]); + __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]); + __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]); + __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]); + __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]); + __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]); + __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]); + __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]); + __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]); + __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]); + __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]); + __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]); + __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]); + __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]); + __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]); + __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]); + __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]); + __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]); + __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]); + __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]); + __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]); + __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]); + __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]); + __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]); + __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]); + + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_32_type0_avx2_new(cospi_p01, cospi_p63, x9[63], x9[32], x10[32], x10[63], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p33, cospi_p31, x9[62], x9[33], x10[33], x10[62], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p17, cospi_p47, x9[61], x9[34], x10[34], x10[61], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p49, cospi_p15, x9[60], x9[35], x10[35], x10[60], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p09, cospi_p55, x9[59], x9[36], x10[36], x10[59], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p41, cospi_p23, x9[58], x9[37], x10[37], x10[58], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p25, cospi_p39, x9[57], x9[38], x10[38], x10[57], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p57, cospi_p07, x9[56], x9[39], x10[39], x10[56], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p05, cospi_p59, x9[55], x9[40], x10[40], x10[55], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p37, cospi_p27, x9[54], x9[41], x10[41], x10[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p21, cospi_p43, x9[53], x9[42], x10[42], x10[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p53, cospi_p11, x9[52], x9[43], x10[43], x10[52], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p13, cospi_p51, x9[51], x9[44], x10[44], x10[51], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p45, cospi_p19, x9[50], x9[45], x10[45], x10[50], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p29, cospi_p35, x9[49], x9[46], x10[46], x10[49], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p61, cospi_p03, x9[48], x9[47], 
x10[47], x10[48], + *__rounding, cos_bit); +} +static void fdct64_avx2(__m256i *input, __m256i *output, int8_t cos_bit, + const int instride, const int outstride) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); + __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]); + __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]); + __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]); + __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]); + __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]); + __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]); + __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]); + __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]); + __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]); + __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]); + __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]); + __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]); + __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]); + __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]); + + int startidx = 0 * instride; + int endidx = 63 * instride; + // stage 1 + __m256i x1[64]; + x1[0] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[63] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[1] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[62] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[2] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[61] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[3] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[60] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[4] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[59] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[5] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[58] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[6] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[57] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[7] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[56] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[8] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[55] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[9] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[54] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[10] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[53] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[11] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[52] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[12] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[51] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[13] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[50] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[14] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[49] = _mm256_sub_epi32(input[startidx], 
input[endidx]); + startidx += instride; + endidx -= instride; + x1[15] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[48] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[16] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[47] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[17] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[46] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[18] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[45] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[19] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[44] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[20] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[43] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[21] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[42] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[22] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[41] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[23] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[40] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[24] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[39] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[25] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[38] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[26] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[37] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[27] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[36] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[28] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[35] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[29] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[34] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[30] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[33] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[31] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[32] = _mm256_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + __m256i x2[64]; + fdct64_stage2_avx2(x1, x2, &cospi_m32, &cospi_p32, &__rounding, cos_bit); + // stage 3 + fdct64_stage3_avx2(x2, x1, &cospi_m32, &cospi_p32, &__rounding, cos_bit); + // stage 4 + fdct64_stage4_avx2(x1, x2, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48, + &cospi_m48, &__rounding, cos_bit); + // stage 5 + fdct64_stage5_avx2(x2, x1, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48, + &cospi_m48, &__rounding, cos_bit); + // stage 6 + fdct64_stage6_avx2(x1, x2, &cospi_p16, &cospi_p32, &cospi_m16, &cospi_p48, + &cospi_m48, &cospi_m08, &cospi_p56, &cospi_m56, &cospi_m40, + &cospi_p24, &cospi_m24, &__rounding, cos_bit); + // stage 7 + 
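/*
 * Note the ping-pong pattern in the stage calls around this point: x1 and
 * x2 alternate as source and destination on every stage, so the whole
 * 64-point column lives in two scratch arrays instead of one per stage.
 * Sketched with generic names:
 *
 *   __m256i a[64], b[64];
 *   stage2(a, b);
 *   stage3(b, a);
 *   stage4(a, b);  // ... each stage consumes the previous one's output
 */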
fdct64_stage7_avx2(x2, x1, &cospi_p08, &cospi_p56, &cospi_p40, &cospi_p24, + &cospi_m08, &cospi_m56, &cospi_m40, &cospi_m24, + &__rounding, cos_bit); + // stage 8 + fdct64_stage8_avx2(x1, x2, cospi, &__rounding, cos_bit); + // stage 9 + fdct64_stage9_avx2(x2, x1, cospi, &__rounding, cos_bit); + // stage 10 + fdct64_stage10_avx2(x1, x2, cospi, &__rounding, cos_bit); + + startidx = 0 * outstride; + endidx = 63 * outstride; + + // stage 11 + output[startidx] = x2[0]; + output[endidx] = x2[63]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[32]; + output[endidx] = x2[31]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[16]; + output[endidx] = x2[47]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[48]; + output[endidx] = x2[15]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[8]; + output[endidx] = x2[55]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[40]; + output[endidx] = x2[23]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[24]; + output[endidx] = x2[39]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[56]; + output[endidx] = x2[7]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[4]; + output[endidx] = x2[59]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[36]; + output[endidx] = x2[27]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[20]; + output[endidx] = x2[43]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[52]; + output[endidx] = x2[11]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[12]; + output[endidx] = x2[51]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[44]; + output[endidx] = x2[19]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[28]; + output[endidx] = x2[35]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[60]; + output[endidx] = x2[3]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[2]; + output[endidx] = x2[61]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[34]; + output[endidx] = x2[29]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[18]; + output[endidx] = x2[45]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[50]; + output[endidx] = x2[13]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[10]; + output[endidx] = x2[53]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[42]; + output[endidx] = x2[21]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[26]; + output[endidx] = x2[37]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[58]; + output[endidx] = x2[5]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[6]; + output[endidx] = x2[57]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[38]; + output[endidx] = x2[25]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[22]; + output[endidx] = x2[41]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[54]; + output[endidx] = x2[9]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[14]; + output[endidx] = x2[49]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[46]; + output[endidx] = x2[17]; + startidx += outstride; + endidx -= outstride; + 
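/*
 * The write-out order above (0, 32, 16, 48, 8, 40, 24, 56, ...) is the
 * 6-bit bit-reversal permutation: output row r receives x2[bitrev6(r)].
 * The fdct32_avx2() epilogue earlier does the same with 5 bits. A scalar
 * equivalent of this whole unrolled sequence:
 *
 *   for (int r = 0; r < 64; ++r) {
 *     int v = 0;  // reverse the low 6 bits of r
 *     for (int b = 0; b < 6; ++b) v |= ((r >> b) & 1) << (5 - b);
 *     output[r * outstride] = x2[v];
 *   }
 */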
output[startidx] = x2[30]; + output[endidx] = x2[33]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[62]; + output[endidx] = x2[1]; +} +void av1_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m256i buf0[512], buf1[512]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct64_avx2; + const transform_1d_avx2 row_txfm = fdct64_avx2; + const int width_div16 = (width >> 4); + const int width_div8 = (width >> 3); + int r, c; + for (int i = 0; i < width_div16; i++) { + load_buffer_16xn_avx2(input + (i << 4), &buf0[i << 1], stride, height, + width_div8, 0, 0); + round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[0], width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8); + col_txfm(&buf0[i << 1], &buf0[i << 1], cos_bit_col, width_div8, width_div8); + col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8, + width_div8); + round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[1], width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8); + } + + for (r = 0; r < height; r += 8) { + for (c = 0; c < width_div8; c++) { + fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c], + &buf1[c * 8 * width_div8 + (r >> 3)], + width_div8, width_div8); + } + } + + for (int i = 0; i < 2; i++) { + row_txfm(&buf1[i << 1], &buf0[i << 1], cos_bit_row, width_div8, + width_div16); + row_txfm(&buf1[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_row, width_div8, + width_div16); + round_shift_32_8xn_avx2(&buf0[i << 1], (height >> 1), shift[2], + width_div16); + round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], (height >> 1), shift[2], + width_div16); + } + + for (r = 0; r < (height >> 1); r += 8) { + for (c = 0; c < width_div16; c++) { + fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div16 + c], + &buf1[c * 8 * width_div16 + (r >> 3)], + width_div16, width_div16); + } + } + store_buffer_avx2(buf1, output, 8, 128); +} diff --git a/libs/libaom/src/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/libs/libaom/src/av1/encoder/x86/highbd_fwd_txfm_sse4.c new file mode 100644 index 000000000..73afc5d03 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/highbd_fwd_txfm_sse4.c @@ -0,0 +1,2604 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+                                   int stride, int flipud, int fliplr,
+                                   int shift) {
+  if (!flipud) {
+    in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+    in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+    in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+    in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+  } else {
+    in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+    in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+    in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+    in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  }
+
+  if (fliplr) {
+    in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+    in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+    in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+    in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+  }
+
+  in[0] = _mm_cvtepi16_epi32(in[0]);
+  in[1] = _mm_cvtepi16_epi32(in[1]);
+  in[2] = _mm_cvtepi16_epi32(in[2]);
+  in[3] = _mm_cvtepi16_epi32(in[3]);
+
+  in[0] = _mm_slli_epi32(in[0], shift);
+  in[1] = _mm_slli_epi32(in[1], shift);
+  in[2] = _mm_slli_epi32(in[2], shift);
+  in[3] = _mm_slli_epi32(in[3], shift);
+}
+
+// We only use stage-2 bit;
+// shift[0] is used in load_buffer_4x4()
+// shift[1] is used in txfm_func_col()
+// shift[2] is used in txfm_func_row()
+static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int bit,
+                           const int num_col) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  __m128i s0, s1, s2, s3;
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+
+  int endidx = 3 * num_col;
+  s0 = _mm_add_epi32(in[0], in[endidx]);
+  s3 = _mm_sub_epi32(in[0], in[endidx]);
+  endidx -= num_col;
+  s1 = _mm_add_epi32(in[num_col], in[endidx]);
+  s2 = _mm_sub_epi32(in[num_col], in[endidx]);
+
+  // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
+  u0 = _mm_mullo_epi32(s0, cospi32);
+  u1 = _mm_mullo_epi32(s1, cospi32);
+  u2 = _mm_add_epi32(u0, u1);
+  v0 = _mm_sub_epi32(u0, u1);
+
+  u3 = _mm_add_epi32(u2, rnding);
+  v1 = _mm_add_epi32(v0, rnding);
+
+  u0 = _mm_srai_epi32(u3, bit);
+  u2 = _mm_srai_epi32(v1, bit);
+
+  // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
+  v0 = _mm_mullo_epi32(s2, cospi48);
+  v1 = _mm_mullo_epi32(s3, cospi16);
+  v2 = _mm_add_epi32(v0, v1);
+
+  v3 = _mm_add_epi32(v2, rnding);
+  u1 = _mm_srai_epi32(v3, bit);
+
+  v0 = _mm_mullo_epi32(s2, cospi16);
+  v1 = _mm_mullo_epi32(s3, cospi48);
+  v2 = _mm_sub_epi32(v1, v0);
+
+  v3 = _mm_add_epi32(v2, rnding);
+  u3 = _mm_srai_epi32(v3, bit);
+
+  // Note: shift[1] and shift[2] are zeros
+
+  // Transpose 4x4 32-bit
+  v0 = _mm_unpacklo_epi32(u0, u1);
+  v1 = _mm_unpackhi_epi32(u0, u1);
+  v2 = _mm_unpacklo_epi32(u2, u3);
+  v3 = _mm_unpackhi_epi32(u2, u3);
+
+  out[0] = _mm_unpacklo_epi64(v0, v2);
+  out[1] = _mm_unpackhi_epi64(v0, v2);
+  out[2] = _mm_unpacklo_epi64(v1, v3);
+  out[3] = _mm_unpackhi_epi64(v1, v3);
+}
+
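The SSE4.1 kernel above processes four 4-sample columns at once; lane for lane it computes the following scalar 4-point DCT-II in the same fixed-point convention. The routine below is a reference sketch written for this annotation (fdct4_scalar_ref is not part of the patch):

static void fdct4_scalar_ref(const int32_t *in, int32_t *out, int bit) {
  // Reference sketch (not from libaom): one column of fdct4x4_sse4_1().
  const int32_t *cospi = cospi_arr(bit);
  const int32_t rnding = 1 << (bit - 1);
  // stage 1: sum/difference butterflies, mirroring s0..s3 above
  const int64_t s0 = in[0] + in[3];
  const int64_t s1 = in[1] + in[2];
  const int64_t s2 = in[1] - in[2];
  const int64_t s3 = in[0] - in[3];
  // stage 2: the same rotations the vector code expands inline
  out[0] = (int32_t)((cospi[32] * (s0 + s1) + rnding) >> bit);
  out[2] = (int32_t)((cospi[32] * (s0 - s1) + rnding) >> bit);
  out[1] = (int32_t)((cospi[48] * s2 + cospi[16] * s3 + rnding) >> bit);
  out[3] = (int32_t)((cospi[48] * s3 - cospi[16] * s2 + rnding) >> bit);
}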
+static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
+  _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
+  _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
+  _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
+  _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
+}
+
+static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int bit,
+                            const int num_col) {
+  const int32_t *sinpi = sinpi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
+  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
+  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
+  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
+  __m128i t;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i x0, x1, x2, x3;
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+
+  int idx = 0 * num_col;
+  s0 = _mm_mullo_epi32(in[idx], sinpi1);
+  s1 = _mm_mullo_epi32(in[idx], sinpi4);
+  t = _mm_add_epi32(in[idx], in[idx + num_col]);
+  idx += num_col;
+  s2 = _mm_mullo_epi32(in[idx], sinpi2);
+  s3 = _mm_mullo_epi32(in[idx], sinpi1);
+  idx += num_col;
+  s4 = _mm_mullo_epi32(in[idx], sinpi3);
+  idx += num_col;
+  s5 = _mm_mullo_epi32(in[idx], sinpi4);
+  s6 = _mm_mullo_epi32(in[idx], sinpi2);
+  s7 = _mm_sub_epi32(t, in[idx]);
+
+  t = _mm_add_epi32(s0, s2);
+  x0 = _mm_add_epi32(t, s5);
+  x1 = _mm_mullo_epi32(s7, sinpi3);
+  t = _mm_sub_epi32(s1, s3);
+  x2 = _mm_add_epi32(t, s6);
+  x3 = s4;
+
+  s0 = _mm_add_epi32(x0, x3);
+  s1 = x1;
+  s2 = _mm_sub_epi32(x2, x3);
+  t = _mm_sub_epi32(x2, x0);
+  s3 = _mm_add_epi32(t, x3);
+
+  u0 = _mm_add_epi32(s0, rnding);
+  u0 = _mm_srai_epi32(u0, bit);
+
+  u1 = _mm_add_epi32(s1, rnding);
+  u1 = _mm_srai_epi32(u1, bit);
+
+  u2 = _mm_add_epi32(s2, rnding);
+  u2 = _mm_srai_epi32(u2, bit);
+
+  u3 = _mm_add_epi32(s3, rnding);
+  u3 = _mm_srai_epi32(u3, bit);
+
+  v0 = _mm_unpacklo_epi32(u0, u1);
+  v1 = _mm_unpackhi_epi32(u0, u1);
+  v2 = _mm_unpacklo_epi32(u2, u3);
+  v3 = _mm_unpackhi_epi32(u2, u3);
+
+  out[0] = _mm_unpacklo_epi64(v0, v2);
+  out[1] = _mm_unpackhi_epi64(v0, v2);
+  out[2] = _mm_unpacklo_epi64(v1, v3);
+  out[3] = _mm_unpackhi_epi64(v1, v3);
+}
+static void idtx4x4_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+  (void)bit;
+  __m128i fact = _mm_set1_epi32(NewSqrt2);
+  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+  __m128i a_low;
+  __m128i v[4];
+
+  for (int i = 0; i < 4; i++) {
+    a_low = _mm_mullo_epi32(in[i * col_num], fact);
+    a_low = _mm_add_epi32(a_low, offset);
+    out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits);
+  }
+
+  // Transpose for 4x4
+  v[0] = _mm_unpacklo_epi32(out[0], out[1]);
+  v[1] = _mm_unpackhi_epi32(out[0], out[1]);
+  v[2] = _mm_unpacklo_epi32(out[2], out[3]);
+  v[3] = _mm_unpackhi_epi32(out[2], out[3]);
+
+  out[0] = _mm_unpacklo_epi64(v[0], v[2]);
+  out[1] = _mm_unpackhi_epi64(v[0], v[2]);
+  out[2] = _mm_unpacklo_epi64(v[1], v[3]);
+  out[3] = _mm_unpackhi_epi64(v[1], v[3]);
+}
+void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
+                               int input_stride, TX_TYPE tx_type, int bd) {
+  __m128i in[4];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
+  const int txw_idx = get_txw_idx(TX_4X4);
+  const int txh_idx = get_txh_idx(TX_4X4);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case ADST_DCT:
+      load_buffer_4x4(input, in, input_stride, 0, 0,
shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case DCT_ADST: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case ADST_ADST: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case FLIPADST_DCT: + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case DCT_FLIPADST: + load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case FLIPADST_FLIPADST: + load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case ADST_FLIPADST: + load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case FLIPADST_ADST: + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case IDTX: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case V_DCT: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case H_DCT: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case V_ADST: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case H_ADST: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case V_FLIPADST: + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case H_FLIPADST: + load_buffer_4x4(input, in, 
input_stride, 0, 1, shift[0]); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + default: assert(0); + } + (void)bd; +} + +static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, + int stride, int flipud, int fliplr, + int shift) { + __m128i u; + if (!flipud) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + } else { + in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + } + + if (fliplr) { + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + in[4] = mm_reverse_epi16(in[4]); + in[5] = mm_reverse_epi16(in[5]); + in[6] = mm_reverse_epi16(in[6]); + in[7] = mm_reverse_epi16(in[7]); + } + + u = _mm_unpackhi_epi64(in[4], in[4]); + in[8] = _mm_cvtepi16_epi32(in[4]); + in[9] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[5], in[5]); + in[10] = _mm_cvtepi16_epi32(in[5]); + in[11] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[6], in[6]); + in[12] = _mm_cvtepi16_epi32(in[6]); + in[13] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[7], in[7]); + in[14] = _mm_cvtepi16_epi32(in[7]); + in[15] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[3], in[3]); + in[6] = _mm_cvtepi16_epi32(in[3]); + in[7] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[2], in[2]); + in[4] = _mm_cvtepi16_epi32(in[2]); + in[5] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[1], in[1]); + in[2] = _mm_cvtepi16_epi32(in[1]); + in[3] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[0], in[0]); + in[0] = _mm_cvtepi16_epi32(in[0]); + in[1] = _mm_cvtepi16_epi32(u); + + in[0] = _mm_slli_epi32(in[0], shift); + in[1] = _mm_slli_epi32(in[1], shift); + in[2] = _mm_slli_epi32(in[2], shift); + in[3] = _mm_slli_epi32(in[3], shift); + in[4] = _mm_slli_epi32(in[4], shift); + in[5] = _mm_slli_epi32(in[5], shift); + in[6] = _mm_slli_epi32(in[6], shift); + in[7] = _mm_slli_epi32(in[7], shift); + + in[8] = _mm_slli_epi32(in[8], shift); + in[9] = _mm_slli_epi32(in[9], shift); + in[10] = _mm_slli_epi32(in[10], shift); + in[11] = _mm_slli_epi32(in[11], shift); + in[12] = _mm_slli_epi32(in[12], shift); + in[13] = _mm_slli_epi32(in[13], shift); + in[14] = _mm_slli_epi32(in[14], shift); + in[15] = _mm_slli_epi32(in[15], shift); +} + +static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) { + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + + in[0] = _mm_add_epi32(in[0], rounding); + in[1] = _mm_add_epi32(in[1], rounding); + in[2] = 
_mm_add_epi32(in[2], rounding); + in[3] = _mm_add_epi32(in[3], rounding); + in[4] = _mm_add_epi32(in[4], rounding); + in[5] = _mm_add_epi32(in[5], rounding); + in[6] = _mm_add_epi32(in[6], rounding); + in[7] = _mm_add_epi32(in[7], rounding); + in[8] = _mm_add_epi32(in[8], rounding); + in[9] = _mm_add_epi32(in[9], rounding); + in[10] = _mm_add_epi32(in[10], rounding); + in[11] = _mm_add_epi32(in[11], rounding); + in[12] = _mm_add_epi32(in[12], rounding); + in[13] = _mm_add_epi32(in[13], rounding); + in[14] = _mm_add_epi32(in[14], rounding); + in[15] = _mm_add_epi32(in[15], rounding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); + in[4] = _mm_srai_epi32(in[4], shift); + in[5] = _mm_srai_epi32(in[5], shift); + in[6] = _mm_srai_epi32(in[6], shift); + in[7] = _mm_srai_epi32(in[7], shift); + in[8] = _mm_srai_epi32(in[8], shift); + in[9] = _mm_srai_epi32(in[9], shift); + in[10] = _mm_srai_epi32(in[10], shift); + in[11] = _mm_srai_epi32(in[11], shift); + in[12] = _mm_srai_epi32(in[12], shift); + in[13] = _mm_srai_epi32(in[13], shift); + in[14] = _mm_srai_epi32(in[14], shift); + in[15] = _mm_srai_epi32(in[15], shift); +} + +static INLINE void col_txfm_4x8_rounding(__m128i *in, int shift) { + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + + in[0] = _mm_add_epi32(in[0], rounding); + in[1] = _mm_add_epi32(in[1], rounding); + in[2] = _mm_add_epi32(in[2], rounding); + in[3] = _mm_add_epi32(in[3], rounding); + in[4] = _mm_add_epi32(in[4], rounding); + in[5] = _mm_add_epi32(in[5], rounding); + in[6] = _mm_add_epi32(in[6], rounding); + in[7] = _mm_add_epi32(in[7], rounding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); + in[4] = _mm_srai_epi32(in[4], shift); + in[5] = _mm_srai_epi32(in[5], shift); + in[6] = _mm_srai_epi32(in[6], shift); + in[7] = _mm_srai_epi32(in[7], shift); +} + +static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) { + _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); + _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); + _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); + _mm_store_si128((__m128i *)(output + 3 * 4), res[3]); + + _mm_store_si128((__m128i *)(output + 4 * 4), res[4]); + _mm_store_si128((__m128i *)(output + 5 * 4), res[5]); + _mm_store_si128((__m128i *)(output + 6 * 4), res[6]); + _mm_store_si128((__m128i *)(output + 7 * 4), res[7]); + + _mm_store_si128((__m128i *)(output + 8 * 4), res[8]); + _mm_store_si128((__m128i *)(output + 9 * 4), res[9]); + _mm_store_si128((__m128i *)(output + 10 * 4), res[10]); + _mm_store_si128((__m128i *)(output + 11 * 4), res[11]); + + _mm_store_si128((__m128i *)(output + 12 * 4), res[12]); + _mm_store_si128((__m128i *)(output + 13 * 4), res[13]); + _mm_store_si128((__m128i *)(output + 14 * 4), res[14]); + _mm_store_si128((__m128i *)(output + 15 * 4), res[15]); +} + +static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output, + const int stride) { + _mm_storeu_si128((__m128i *)(output), res[0]); + _mm_storeu_si128((__m128i *)(output + 4), res[1]); + _mm_storeu_si128((__m128i *)(output + stride), res[2]); + _mm_storeu_si128((__m128i *)(output + stride + 4), res[3]); + + _mm_storeu_si128((__m128i *)(output + (stride * 2)), res[4]); + _mm_storeu_si128((__m128i *)(output + (stride * 2) + 4), res[5]); + _mm_storeu_si128((__m128i *)(output + (stride * 3)), res[6]); 
+ _mm_storeu_si128((__m128i *)(output + (stride * 3) + 4), res[7]); + + _mm_storeu_si128((__m128i *)(output + (stride * 4)), res[8]); + _mm_storeu_si128((__m128i *)(output + (stride * 4) + 4), res[9]); + _mm_storeu_si128((__m128i *)(output + (stride * 5)), res[10]); + _mm_storeu_si128((__m128i *)(output + (stride * 5) + 4), res[11]); + + _mm_storeu_si128((__m128i *)(output + (stride * 6)), res[12]); + _mm_storeu_si128((__m128i *)(output + (stride * 6) + 4), res[13]); + _mm_storeu_si128((__m128i *)(output + (stride * 7)), res[14]); + _mm_storeu_si128((__m128i *)(output + (stride * 7) + 4), res[15]); +} + +static void fdct4x8_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i u[8], v[8]; + + int startidx = 0 * col_num; + int endidx = 7 * col_num; + // Even 8 points 0, 2, ..., 14 + // stage 0 + // stage 1 + u[0] = _mm_add_epi32(in[startidx], in[endidx]); + v[7] = _mm_sub_epi32(in[startidx], in[endidx]); // v[7] + startidx += col_num; + endidx -= col_num; + u[1] = _mm_add_epi32(in[startidx], in[endidx]); + u[6] = _mm_sub_epi32(in[startidx], in[endidx]); + startidx += col_num; + endidx -= col_num; + u[2] = _mm_add_epi32(in[startidx], in[endidx]); + u[5] = _mm_sub_epi32(in[startidx], in[endidx]); + startidx += col_num; + endidx -= col_num; + u[3] = _mm_add_epi32(in[startidx], in[endidx]); + v[4] = _mm_sub_epi32(in[startidx], in[endidx]); // v[4] + + // stage 2 + v[0] = _mm_add_epi32(u[0], u[3]); + v[3] = _mm_sub_epi32(u[0], u[3]); + v[1] = _mm_add_epi32(u[1], u[2]); + v[2] = _mm_sub_epi32(u[1], u[2]); + + v[5] = _mm_mullo_epi32(u[5], cospim32); + v[6] = _mm_mullo_epi32(u[6], cospi32); + v[5] = _mm_add_epi32(v[5], v[6]); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + u[0] = _mm_mullo_epi32(u[5], cospi32); + v[6] = _mm_mullo_epi32(u[6], cospim32); + v[6] = _mm_sub_epi32(u[0], v[6]); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + // stage 3 + // type 0 + v[0] = _mm_mullo_epi32(v[0], cospi32); + v[1] = _mm_mullo_epi32(v[1], cospi32); + u[0] = _mm_add_epi32(v[0], v[1]); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_sub_epi32(v[0], v[1]); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // type 1 + v[0] = _mm_mullo_epi32(v[2], cospi48); + v[1] = _mm_mullo_epi32(v[3], cospi16); + u[2] = _mm_add_epi32(v[0], v[1]); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + v[0] = _mm_mullo_epi32(v[2], cospi16); + v[1] = _mm_mullo_epi32(v[3], cospi48); + u[3] = _mm_sub_epi32(v[1], v[0]); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + u[4] = _mm_add_epi32(v[4], v[5]); + u[5] = _mm_sub_epi32(v[4], v[5]); + u[6] = _mm_sub_epi32(v[7], v[6]); + u[7] = _mm_add_epi32(v[7], v[6]); + + // stage 4 + // stage 5 + v[0] = _mm_mullo_epi32(u[4], cospi56); + v[1] = _mm_mullo_epi32(u[7], cospi8); + v[0] = _mm_add_epi32(v[0], v[1]); + v[0] = _mm_add_epi32(v[0], rnding); + out[1 * col_num] = 
_mm_srai_epi32(v[0], bit); // buf0[4] + + v[0] = _mm_mullo_epi32(u[4], cospi8); + v[1] = _mm_mullo_epi32(u[7], cospi56); + v[0] = _mm_sub_epi32(v[1], v[0]); + v[0] = _mm_add_epi32(v[0], rnding); + out[7 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[7] + + v[0] = _mm_mullo_epi32(u[5], cospi24); + v[1] = _mm_mullo_epi32(u[6], cospi40); + v[0] = _mm_add_epi32(v[0], v[1]); + v[0] = _mm_add_epi32(v[0], rnding); + out[5 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[5] + + v[0] = _mm_mullo_epi32(u[5], cospi40); + v[1] = _mm_mullo_epi32(u[6], cospi24); + v[0] = _mm_sub_epi32(v[1], v[0]); + v[0] = _mm_add_epi32(v[0], rnding); + out[3 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[6] + + out[0 * col_num] = u[0]; // buf0[0] + out[4 * col_num] = u[1]; // buf0[1] + out[2 * col_num] = u[2]; // buf0[2] + out[6 * col_num] = u[3]; // buf0[3] +} + +static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + fdct4x8_sse4_1(in, out, bit, col_num); + fdct4x8_sse4_1(in + 1, out + 1, bit, col_num); +} + +static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + int col; + + // Note: + // Even column: 0, 2, ..., 14 + // Odd column: 1, 3, ..., 15 + // one even column plus one odd column constructs one row (8 coeffs) + // total we have 8 rows (8x8). 
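+ // Each 2x2 butterfly below is evaluated in fixed point as + // (x * c0 +/- y * c1 + (1 << (bit - 1))) >> bit: a 32-bit multiply + // against the cospi table, a rounding add, then an arithmetic shift.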
+ for (col = 0; col < col_num; ++col) { + // stage 0 + // stage 1 + u0 = in[col_num * 0 + col]; + u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]); + u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]); + u3 = in[col_num * 4 + col]; + u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]); + u5 = in[col_num * 6 + col]; + u6 = in[col_num * 2 + col]; + u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]); + + // stage 2 + v0 = u0; + v1 = u1; + + x = _mm_mullo_epi32(u2, cospi32); + y = _mm_mullo_epi32(u3, cospi32); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + v3 = _mm_sub_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + v4 = u4; + v5 = u5; + + x = _mm_mullo_epi32(u6, cospi32); + y = _mm_mullo_epi32(u7, cospi32); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + v7 = _mm_sub_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); + + // stage 3 + u0 = _mm_add_epi32(v0, v2); + u1 = _mm_add_epi32(v1, v3); + u2 = _mm_sub_epi32(v0, v2); + u3 = _mm_sub_epi32(v1, v3); + u4 = _mm_add_epi32(v4, v6); + u5 = _mm_add_epi32(v5, v7); + u6 = _mm_sub_epi32(v4, v6); + u7 = _mm_sub_epi32(v5, v7); + + // stage 4 + v0 = u0; + v1 = u1; + v2 = u2; + v3 = u3; + + x = _mm_mullo_epi32(u4, cospi16); + y = _mm_mullo_epi32(u5, cospi48); + v4 = _mm_add_epi32(x, y); + v4 = _mm_add_epi32(v4, rnding); + v4 = _mm_srai_epi32(v4, bit); + + x = _mm_mullo_epi32(u4, cospi48); + y = _mm_mullo_epi32(u5, cospim16); + v5 = _mm_add_epi32(x, y); + v5 = _mm_add_epi32(v5, rnding); + v5 = _mm_srai_epi32(v5, bit); + + x = _mm_mullo_epi32(u6, cospim48); + y = _mm_mullo_epi32(u7, cospi16); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + x = _mm_mullo_epi32(u6, cospi16); + y = _mm_mullo_epi32(u7, cospi48); + v7 = _mm_add_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); + + // stage 5 + u0 = _mm_add_epi32(v0, v4); + u1 = _mm_add_epi32(v1, v5); + u2 = _mm_add_epi32(v2, v6); + u3 = _mm_add_epi32(v3, v7); + u4 = _mm_sub_epi32(v0, v4); + u5 = _mm_sub_epi32(v1, v5); + u6 = _mm_sub_epi32(v2, v6); + u7 = _mm_sub_epi32(v3, v7); + + // stage 6 + x = _mm_mullo_epi32(u0, cospi4); + y = _mm_mullo_epi32(u1, cospi60); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + x = _mm_mullo_epi32(u0, cospi60); + y = _mm_mullo_epi32(u1, cospim4); + v1 = _mm_add_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi20); + y = _mm_mullo_epi32(u3, cospi44); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi44); + y = _mm_mullo_epi32(u3, cospim20); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + x = _mm_mullo_epi32(u4, cospi36); + y = _mm_mullo_epi32(u5, cospi28); + v4 = _mm_add_epi32(x, y); + v4 = _mm_add_epi32(v4, rnding); + v4 = _mm_srai_epi32(v4, bit); + + x = _mm_mullo_epi32(u4, cospi28); + y = _mm_mullo_epi32(u5, cospim36); + v5 = _mm_add_epi32(x, y); + v5 = _mm_add_epi32(v5, rnding); + v5 = _mm_srai_epi32(v5, bit); + + x = _mm_mullo_epi32(u6, cospi52); + y = _mm_mullo_epi32(u7, cospi12); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + x = _mm_mullo_epi32(u6, cospi12); + y = _mm_mullo_epi32(u7, cospim52); + v7 = _mm_add_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 
= _mm_srai_epi32(v7, bit); + + // stage 7 + out[col_num * 0 + col] = v1; + out[col_num * 1 + col] = v6; + out[col_num * 2 + col] = v3; + out[col_num * 3 + col] = v4; + out[col_num * 4 + col] = v5; + out[col_num * 5 + col] = v2; + out[col_num * 6 + col] = v7; + out[col_num * 7 + col] = v0; + } +} +static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { + (void)bit; + + for (int i = 0; i < col_num; i += 1) { + out[0 + 8 * i] = _mm_add_epi32(in[0 + 8 * i], in[0 + 8 * i]); + out[1 + 8 * i] = _mm_add_epi32(in[1 + 8 * i], in[1 + 8 * i]); + out[2 + 8 * i] = _mm_add_epi32(in[2 + 8 * i], in[2 + 8 * i]); + out[3 + 8 * i] = _mm_add_epi32(in[3 + 8 * i], in[3 + 8 * i]); + out[4 + 8 * i] = _mm_add_epi32(in[4 + 8 * i], in[4 + 8 * i]); + out[5 + 8 * i] = _mm_add_epi32(in[5 + 8 * i], in[5 + 8 * i]); + out[6 + 8 * i] = _mm_add_epi32(in[6 + 8 * i], in[6 + 8 * i]); + out[7 + 8 * i] = _mm_add_epi32(in[7 + 8 * i], in[7 + 8 * i]); + } +} +static void idtx32x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { + (void)bit; + (void)col_num; + for (int j = 0; j < 2; j++) { + out[j + 8 * 0] = _mm_add_epi32(in[j + 8 * 0], in[j + 8 * 0]); + out[j + 8 * 1] = _mm_add_epi32(in[j + 8 * 1], in[j + 8 * 1]); + out[j + 8 * 2] = _mm_add_epi32(in[j + 8 * 2], in[j + 8 * 2]); + out[j + 8 * 3] = _mm_add_epi32(in[j + 8 * 3], in[j + 8 * 3]); + out[j + 8 * 4] = _mm_add_epi32(in[j + 8 * 4], in[j + 8 * 4]); + out[j + 8 * 5] = _mm_add_epi32(in[j + 8 * 5], in[j + 8 * 5]); + out[j + 8 * 6] = _mm_add_epi32(in[j + 8 * 6], in[j + 8 * 6]); + out[j + 8 * 7] = _mm_add_epi32(in[j + 8 * 7], in[j + 8 * 7]); + } +} +void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m128i in[16], out[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case ADST_DCT: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case DCT_ADST: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case ADST_ADST: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case FLIPADST_DCT: + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, 
av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case DCT_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8(input, in, stride, 1, 1, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case ADST_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case FLIPADST_ADST: + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case IDTX: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case V_DCT: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case H_DCT: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case V_ADST: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case H_ADST: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case V_FLIPADST: + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + 
case H_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + default: assert(0); + } + (void)bd; +} + +// Hybrid Transform 16x16 + +static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) { + int row_index = 0; + int dst_index = 0; + int src_index = 0; + + // row 0, 1, .., 7 + do { + out[dst_index] = in[src_index]; + out[dst_index + 1] = in[src_index + 1]; + out[dst_index + 2] = in[src_index + 16]; + out[dst_index + 3] = in[src_index + 17]; + dst_index += 4; + src_index += 2; + row_index += 1; + } while (row_index < 8); + + // row 8, 9, ..., 15 + src_index += 16; + do { + out[dst_index] = in[src_index]; + out[dst_index + 1] = in[src_index + 1]; + out[dst_index + 2] = in[src_index + 16]; + out[dst_index + 3] = in[src_index + 17]; + dst_index += 4; + src_index += 2; + row_index += 1; + } while (row_index < 16); +} + +static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + __m128i in[64]; + // Load 4 8x8 blocks + const int16_t *topL = input; + const int16_t *topR = input + 8; + const int16_t *botL = input + 8 * stride; + const int16_t *botR = input + 8 * stride + 8; + + const int16_t *tmp; + + if (flipud) { + // Swap left columns + tmp = topL; + topL = botL; + botL = tmp; + // Swap right columns + tmp = topR; + topR = botR; + botR = tmp; + } + + if (fliplr) { + // Swap top rows + tmp = topL; + topL = topR; + topR = tmp; + // Swap bottom rows + tmp = botL; + botL = botR; + botR = tmp; + } + + // load first 8 columns + load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift); + load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift); + + // load second 8 columns + load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift); + load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift); + + convert_8x8_to_16x16(in, out); +} + +static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 8 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + + load_buffer_8x8(topL, out, stride, flipud, fliplr, shift); + load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift); +} + +static INLINE void load_buffer_8x4(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *topR = input + 4; + + const int16_t *tmp; + + if (fliplr) { + tmp = topL; + topL = topR; + topR = tmp; + } + + load_buffer_4x4(topL, out, stride, flipud, fliplr, shift); + load_buffer_4x4(topR, out + 4, stride, flipud, fliplr, shift); +} + +static INLINE void load_buffer_16x4(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *topR = input + 8; + + const int16_t *tmp; + + if (fliplr) { + tmp = topL; + topL = topR; + topR = tmp; + } + + load_buffer_8x4(topL, out, stride, flipud, fliplr, shift); + load_buffer_8x4(topR, out + 8, stride, flipud, fliplr, shift); +} + +static INLINE void load_buffer_4x8(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const 
int16_t *botL = input + 4 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + + load_buffer_4x4(topL, out, stride, flipud, fliplr, shift); + load_buffer_4x4(botL, out + 4, stride, flipud, fliplr, shift); +} + +static INLINE void load_buffer_4x16(const int16_t *input, __m128i *out, + const int stride, const int flipud, + const int fliplr, const int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 8 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + load_buffer_4x8(topL, out, stride, flipud, fliplr, shift); + load_buffer_4x8(botL, out + 8, stride, flipud, fliplr, shift); +} + +static INLINE void load_buffer_32x8n(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift, const int height) { + const int16_t *in = input; + __m128i *output = out; + for (int col = 0; col < height; col++) { + in = input + col * stride; + output = out + col * 8; + load_buffer_4x4(in, output, 4, flipud, fliplr, shift); + load_buffer_4x4((in + 16), (output + 4), 4, flipud, fliplr, shift); + } +} + +static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i u[16], v[16], x; + int col; + + // Calculate the column 0, 1, 2, 3 + for (col = 0; col < col_num; ++col) { + // stage 0 + // stage 1 + u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]); + u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + 
col]); + + // stage 2 + v[0] = _mm_add_epi32(u[0], u[7]); + v[7] = _mm_sub_epi32(u[0], u[7]); + v[1] = _mm_add_epi32(u[1], u[6]); + v[6] = _mm_sub_epi32(u[1], u[6]); + v[2] = _mm_add_epi32(u[2], u[5]); + v[5] = _mm_sub_epi32(u[2], u[5]); + v[3] = _mm_add_epi32(u[3], u[4]); + v[4] = _mm_sub_epi32(u[3], u[4]); + v[8] = u[8]; + v[9] = u[9]; + + v[10] = _mm_mullo_epi32(u[10], cospim32); + x = _mm_mullo_epi32(u[13], cospi32); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[13], cospim32); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[11] = _mm_mullo_epi32(u[11], cospim32); + x = _mm_mullo_epi32(u[12], cospi32); + v[11] = _mm_add_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[11], cospi32); + x = _mm_mullo_epi32(u[12], cospim32); + v[12] = _mm_sub_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + v[14] = u[14]; + v[15] = u[15]; + + // stage 3 + u[0] = _mm_add_epi32(v[0], v[3]); + u[3] = _mm_sub_epi32(v[0], v[3]); + u[1] = _mm_add_epi32(v[1], v[2]); + u[2] = _mm_sub_epi32(v[1], v[2]); + u[4] = v[4]; + + u[5] = _mm_mullo_epi32(v[5], cospim32); + x = _mm_mullo_epi32(v[6], cospi32); + u[5] = _mm_add_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[5], cospi32); + x = _mm_mullo_epi32(v[6], cospim32); + u[6] = _mm_sub_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = v[7]; + u[8] = _mm_add_epi32(v[8], v[11]); + u[11] = _mm_sub_epi32(v[8], v[11]); + u[9] = _mm_add_epi32(v[9], v[10]); + u[10] = _mm_sub_epi32(v[9], v[10]); + u[12] = _mm_sub_epi32(v[15], v[12]); + u[15] = _mm_add_epi32(v[15], v[12]); + u[13] = _mm_sub_epi32(v[14], v[13]); + u[14] = _mm_add_epi32(v[14], v[13]); + + // stage 4 + u[0] = _mm_mullo_epi32(u[0], cospi32); + u[1] = _mm_mullo_epi32(u[1], cospi32); + v[0] = _mm_add_epi32(u[0], u[1]); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_sub_epi32(u[0], u[1]); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = _mm_mullo_epi32(u[2], cospi48); + x = _mm_mullo_epi32(u[3], cospi16); + v[2] = _mm_add_epi32(v[2], x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_mullo_epi32(u[2], cospi16); + x = _mm_mullo_epi32(u[3], cospi48); + v[3] = _mm_sub_epi32(x, v[3]); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = _mm_add_epi32(u[4], u[5]); + v[5] = _mm_sub_epi32(u[4], u[5]); + v[6] = _mm_sub_epi32(u[7], u[6]); + v[7] = _mm_add_epi32(u[7], u[6]); + v[8] = u[8]; + + v[9] = _mm_mullo_epi32(u[9], cospim16); + x = _mm_mullo_epi32(u[14], cospi48); + v[9] = _mm_add_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[14] = _mm_mullo_epi32(u[9], cospi48); + x = _mm_mullo_epi32(u[14], cospim16); + v[14] = _mm_sub_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[10] = _mm_mullo_epi32(u[10], cospim48); + x = _mm_mullo_epi32(u[13], cospim16); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_mullo_epi32(u[10], cospim16); + x = 
_mm_mullo_epi32(u[13], cospim48); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[11] = u[11]; + v[12] = u[12]; + v[15] = u[15]; + + // stage 5 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi56); + x = _mm_mullo_epi32(v[7], cospi8); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[7] = _mm_mullo_epi32(v[4], cospi8); + x = _mm_mullo_epi32(v[7], cospi56); + u[7] = _mm_sub_epi32(x, u[7]); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + u[5] = _mm_mullo_epi32(v[5], cospi24); + x = _mm_mullo_epi32(v[6], cospi40); + u[5] = _mm_add_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[5], cospi40); + x = _mm_mullo_epi32(v[6], cospi24); + u[6] = _mm_sub_epi32(x, u[6]); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[8] = _mm_add_epi32(v[8], v[9]); + u[9] = _mm_sub_epi32(v[8], v[9]); + u[10] = _mm_sub_epi32(v[11], v[10]); + u[11] = _mm_add_epi32(v[11], v[10]); + u[12] = _mm_add_epi32(v[12], v[13]); + u[13] = _mm_sub_epi32(v[12], v[13]); + u[14] = _mm_sub_epi32(v[15], v[14]); + u[15] = _mm_add_epi32(v[15], v[14]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm_mullo_epi32(u[8], cospi60); + x = _mm_mullo_epi32(u[15], cospi4); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[15] = _mm_mullo_epi32(u[8], cospi4); + x = _mm_mullo_epi32(u[15], cospi60); + v[15] = _mm_sub_epi32(x, v[15]); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + v[9] = _mm_mullo_epi32(u[9], cospi28); + x = _mm_mullo_epi32(u[14], cospi36); + v[9] = _mm_add_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[14] = _mm_mullo_epi32(u[9], cospi36); + x = _mm_mullo_epi32(u[14], cospi28); + v[14] = _mm_sub_epi32(x, v[14]); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[10] = _mm_mullo_epi32(u[10], cospi44); + x = _mm_mullo_epi32(u[13], cospi20); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_mullo_epi32(u[10], cospi20); + x = _mm_mullo_epi32(u[13], cospi44); + v[13] = _mm_sub_epi32(x, v[13]); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[11] = _mm_mullo_epi32(u[11], cospi12); + x = _mm_mullo_epi32(u[12], cospi52); + v[11] = _mm_add_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[11], cospi52); + x = _mm_mullo_epi32(u[12], cospi12); + v[12] = _mm_sub_epi32(x, v[12]); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + out[0 * col_num + col] = v[0]; + out[1 * col_num + col] = v[8]; + out[2 * col_num + col] = v[4]; + out[3 * col_num + col] = v[12]; + out[4 * col_num + col] = v[2]; + out[5 * col_num + col] = v[10]; + out[6 * col_num + col] = v[6]; + out[7 * col_num + col] = v[14]; + out[8 * col_num + col] = v[1]; + out[9 * col_num + col] = v[9]; + out[10 * col_num + col] = v[5]; + out[11 * col_num + col] = v[13]; + out[12 * col_num + col] = v[3]; + out[13 * col_num + col] = v[11]; + out[14 * col_num + col] = v[7]; + out[15 * col_num + 
col] = v[15]; + } +} + +static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, + const int num_cols) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + + __m128i u[16], v[16], x, y; + int col; + + for (col = 0; col < num_cols; ++col) { + // stage 0 + // stage 1 + u[0] = in[0 * num_cols + col]; + u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]); + u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]); + u[3] = in[8 * num_cols + col]; + u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]); + u[5] = in[12 * num_cols + col]; + u[6] = in[4 * num_cols + col]; + u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]); + u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]); + u[9] = in[14 * num_cols + col]; + u[10] = in[6 * num_cols + col]; + u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]); + u[12] = in[2 * num_cols + col]; + u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]); + u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]); + u[15] = in[10 * num_cols + col]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + + x = _mm_mullo_epi32(u[2], cospi32); + y = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(x, y); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(x, y); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + x = _mm_mullo_epi32(u[6], cospi32); + y = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(x, y); + v[6] = 
_mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(x, y); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(x, y); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(x, y); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + x = _mm_mullo_epi32(u[14], cospi32); + y = _mm_mullo_epi32(u[15], cospi32); + v[14] = _mm_add_epi32(x, y); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(x, y); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 3 + u[0] = _mm_add_epi32(v[0], v[2]); + u[1] = _mm_add_epi32(v[1], v[3]); + u[2] = _mm_sub_epi32(v[0], v[2]); + u[3] = _mm_sub_epi32(v[1], v[3]); + u[4] = _mm_add_epi32(v[4], v[6]); + u[5] = _mm_add_epi32(v[5], v[7]); + u[6] = _mm_sub_epi32(v[4], v[6]); + u[7] = _mm_sub_epi32(v[5], v[7]); + u[8] = _mm_add_epi32(v[8], v[10]); + u[9] = _mm_add_epi32(v[9], v[11]); + u[10] = _mm_sub_epi32(v[8], v[10]); + u[11] = _mm_sub_epi32(v[9], v[11]); + u[12] = _mm_add_epi32(v[12], v[14]); + u[13] = _mm_add_epi32(v[13], v[15]); + u[14] = _mm_sub_epi32(v[12], v[14]); + u[15] = _mm_sub_epi32(v[13], v[15]); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); + + // stage 5 + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); + + 
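// half_btf_sse4_1(&c0, &x, &c1, &y, &rnding, bit), declared in + // highbd_txfm_utility_sse4.h, returns (c0 * x + c1 * y + rnding) >> bit; + // stages 4, 6 and 8 here are built entirely from that rounded butterfly. + 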
// stage 7 + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + // stage 8 + v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); + v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); + v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); + v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); + v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); + + // stage 9 + out[0 * num_cols + col] = v[1]; + out[1 * num_cols + col] = v[14]; + out[2 * num_cols + col] = v[3]; + out[3 * num_cols + col] = v[12]; + out[4 * num_cols + col] = v[5]; + out[5 * num_cols + col] = v[10]; + out[6 * num_cols + col] = v[7]; + out[7 * num_cols + col] = v[8]; + out[8 * num_cols + col] = v[9]; + out[9 * num_cols + col] = v[6]; + out[10 * num_cols + col] = v[11]; + out[11 * num_cols + col] = v[4]; + out[12 * num_cols + col] = v[13]; + out[13 * num_cols + col] = v[2]; + out[14 * num_cols + col] = v[15]; + out[15 * num_cols + col] = v[0]; + } +} + +static void col_txfm_16x16_rounding(__m128i *in, int shift) { + // Note: + // We split 16x16 rounding into 4 sections of 8x8 rounding, + // instead of 4 columns + col_txfm_8x8_rounding(&in[0], shift); + col_txfm_8x8_rounding(&in[16], shift); + col_txfm_8x8_rounding(&in[32], shift); + col_txfm_8x8_rounding(&in[48], shift); +} + +static void col_txfm_8x16_rounding(__m128i *in, int shift) { + col_txfm_8x8_rounding(&in[0], shift); + col_txfm_8x8_rounding(&in[16], shift); +} + +static void write_buffer_16x16(const __m128i *in, int32_t *output) { + const int size_8x8 = 16 * 4; + write_buffer_8x8(&in[0], output); + output += size_8x8; + write_buffer_8x8(&in[16], output); + output += size_8x8; + write_buffer_8x8(&in[32], output); + output += size_8x8; + write_buffer_8x8(&in[48], output); +} +static void idtx16x16_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { + (void)bit; + __m128i fact = _mm_set1_epi32(2 * NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a_low; + + int num_iters = 16 * col_num; + for (int i = 0; i < num_iters; i++) { + a_low = _mm_mullo_epi32(in[i], fact); + a_low = 
_mm_add_epi32(a_low, offset); + out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits); + } +} +void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[64], out[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16]; + const int txw_idx = get_txw_idx(TX_16X16); + const int txh_idx = get_txh_idx(TX_16X16); + const int col_num = 4; + switch (tx_type) { + case DCT_DCT: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case ADST_DCT: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case DCT_ADST: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case ADST_ADST: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case FLIPADST_DCT: + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case DCT_FLIPADST: + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case FLIPADST_FLIPADST: + load_buffer_16x16(input, in, stride, 1, 1, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case ADST_FLIPADST: + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case FLIPADST_ADST: + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, 
av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case IDTX: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case V_DCT: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case H_DCT: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case V_ADST: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case H_ADST: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case V_FLIPADST: + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case H_FLIPADST: + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + default: assert(0); + } + (void)bd; +} + +static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) { + for (int i = 0; i < size; i += 2) in[30 - i] = out[i]; + for (int i = 1; i < size; i += 2) in[size - i] = out[i]; +} + +static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8x8_sse4_1, // DCT_DCT + fadst8x8_sse4_1, // ADST_DCT + fdct8x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fadst8x8_sse4_1, // FLIPADST_DCT + fdct8x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + idtx8x8_sse4_1, // IDTX + fdct8x8_sse4_1, // V_DCT + idtx8x8_sse4_1, // H_DCT + 
fadst8x8_sse4_1, // V_ADST + idtx8x8_sse4_1, // H_ADST + fadst8x8_sse4_1, // V_FLIPADST + idtx8x8_sse4_1 // H_FLIPADST +}; +static const fwd_transform_1d_sse4_1 row_highbd_txfm32x8_arr[TX_TYPES] = { + fdct8x8_sse4_1, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx32x8_sse4_1, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL, // H_FLIPADST +}; +static const fwd_transform_1d_sse4_1 col_highbd_txfm4x8_arr[TX_TYPES] = { + fdct4x8_sse4_1, // DCT_DCT + fadst8x8_sse4_1, // ADST_DCT + fdct4x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fadst8x8_sse4_1, // FLIPADST_DCT + fdct4x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + idtx8x8_sse4_1, // IDTX + fdct4x8_sse4_1, // V_DCT + idtx8x8_sse4_1, // H_DCT + fadst8x8_sse4_1, // V_ADST + idtx8x8_sse4_1, // H_ADST + fadst8x8_sse4_1, // V_FLIPADST + idtx8x8_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16x16_sse4_1, // DCT_DCT + fdct16x16_sse4_1, // ADST_DCT + fadst16x16_sse4_1, // DCT_ADST + fadst16x16_sse4_1, // ADST_ADST + fdct16x16_sse4_1, // FLIPADST_DCT + fadst16x16_sse4_1, // DCT_FLIPADST + fadst16x16_sse4_1, // FLIPADST_FLIPADST + fadst16x16_sse4_1, // ADST_FLIPADST + fadst16x16_sse4_1, // FLIPADST_ADST + idtx16x16_sse4_1, // IDTX + idtx16x16_sse4_1, // V_DCT + fdct16x16_sse4_1, // H_DCT + idtx16x16_sse4_1, // V_ADST + fadst16x16_sse4_1, // H_ADST + idtx16x16_sse4_1, // V_FLIPADST + fadst16x16_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16x16_sse4_1, // DCT_DCT + fadst16x16_sse4_1, // ADST_DCT + fdct16x16_sse4_1, // DCT_ADST + fadst16x16_sse4_1, // ADST_ADST + fadst16x16_sse4_1, // FLIPADST_DCT + fdct16x16_sse4_1, // DCT_FLIPADST + fadst16x16_sse4_1, // FLIPADST_FLIPADST + fadst16x16_sse4_1, // ADST_FLIPADST + fadst16x16_sse4_1, // FLIPADST_ADST + idtx16x16_sse4_1, // IDTX + fdct16x16_sse4_1, // V_DCT + idtx16x16_sse4_1, // H_DCT + fadst16x16_sse4_1, // V_ADST + idtx16x16_sse4_1, // H_ADST + fadst16x16_sse4_1, // V_FLIPADST + idtx16x16_sse4_1 // H_FLIPADST +}; +static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8x8_sse4_1, // DCT_DCT + fdct8x8_sse4_1, // ADST_DCT + fadst8x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fdct8x8_sse4_1, // FLIPADST_DCT + fadst8x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + idtx8x8_sse4_1, // IDTX + idtx8x8_sse4_1, // V_DCT + fdct8x8_sse4_1, // H_DCT + idtx8x8_sse4_1, // V_ADST + fadst8x8_sse4_1, // H_ADST + idtx8x8_sse4_1, // V_FLIPADST + fadst8x8_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 row_highbd_txfm4x8_arr[TX_TYPES] = { + fdct4x8_sse4_1, // DCT_DCT + fdct4x8_sse4_1, // ADST_DCT + fadst8x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fdct4x8_sse4_1, // FLIPADST_DCT + fadst8x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + idtx8x8_sse4_1, // IDTX + idtx8x8_sse4_1, // V_DCT + fdct4x8_sse4_1, // H_DCT + idtx8x8_sse4_1, // V_ADST + fadst8x8_sse4_1, // H_ADST + idtx8x8_sse4_1, // V_FLIPADST + fadst8x8_sse4_1 // H_FLIPADST +}; + 
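+// These *_arr tables map each of the 16 TX_TYPE values to the 1-D kernel used for one pass of a 2-D transform: the col_* tables are indexed by the vertical stage, the row_* tables by the horizontal stage, and NULL marks combinations that have no SSE4.1 path for that block size. + // As a rough sketch (mirroring av1_fwd_txfm2d_16x8_sse4_1 below; flips, rounding shifts and transposes omitted): + // const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type]; + // const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + // col_txfm(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], num_col); + // row_txfm(out, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], num_col); + 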
+static const fwd_transform_1d_sse4_1 row_highbd_txfm4x4_arr[TX_TYPES] = { + fdct4x4_sse4_1, // DCT_DCT + fdct4x4_sse4_1, // ADST_DCT + fadst4x4_sse4_1, // DCT_ADST + fadst4x4_sse4_1, // ADST_ADST + fdct4x4_sse4_1, // FLIPADST_DCT + fadst4x4_sse4_1, // DCT_FLIPADST + fadst4x4_sse4_1, // FLIPADST_FLIPADST + fadst4x4_sse4_1, // ADST_FLIPADST + fadst4x4_sse4_1, // FLIPADST_ADST + idtx4x4_sse4_1, // IDTX + idtx4x4_sse4_1, // V_DCT + fdct4x4_sse4_1, // H_DCT + idtx4x4_sse4_1, // V_ADST + fadst4x4_sse4_1, // H_ADST + idtx4x4_sse4_1, // V_FLIPADST + fadst4x4_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 col_highbd_txfm4x4_arr[TX_TYPES] = { + fdct4x4_sse4_1, // DCT_DCT + fadst4x4_sse4_1, // ADST_DCT + fdct4x4_sse4_1, // DCT_ADST + fadst4x4_sse4_1, // ADST_ADST + fadst4x4_sse4_1, // FLIPADST_DCT + fdct4x4_sse4_1, // DCT_FLIPADST + fadst4x4_sse4_1, // FLIPADST_FLIPADST + fadst4x4_sse4_1, // ADST_FLIPADST + fadst4x4_sse4_1, // FLIPADST_ADST + idtx4x4_sse4_1, // IDTX + fdct4x4_sse4_1, // V_DCT + idtx4x4_sse4_1, // H_DCT + fadst4x4_sse4_1, // V_ADST + idtx4x4_sse4_1, // H_ADST + fadst4x4_sse4_1, // V_FLIPADST + idtx4x4_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 col_highbd_txfm8x32_arr[TX_TYPES] = { + av1_fdct32_sse4_1, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + av1_idtx32_sse4_1, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 row_highbd_txfm8x32_arr[TX_TYPES] = { + fdct16x16_sse4_1, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx16x16_sse4_1, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[32], out[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]); + col_txfm(in, in, bit, 2); + col_txfm_8x8_rounding(in, -shift[1]); + transpose_8x8(in, out + i * 16); + } + + if (lr_flip) { + flip_buf_sse4_1(in, out, 32); + row_txfm(in, out, bit, 2); + } else { + row_txfm(out, out, bit, 2); + } + + for (int i = 0; i < 2; i++) { + transpose_8x8(out + i * 16, in); + av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2); + write_buffer_16x8(in, coeff + i * 8, 16); + } + + (void)bd; +} + +void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[32], out[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type]; + const fwd_transform_1d_sse4_1 
row_txfm = row_highbd_txfm8x8_arr[tx_type]; + int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, in, bit, 2); + col_txfm_8x16_rounding(in, -shift[1]); + transpose_8x8(in, out); + transpose_8x8(in + 16, out + 16); + + for (int i = 0; i < 2; i++) { + row_txfm(out + i * 16, out, bit, 2); + transpose_8x8(out, in); + av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2); + write_buffer_8x8(in, coeff + i * 64); + } + + (void)bd; +} + +void av1_fwd_txfm2d_4x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[16]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16]; + const int txw_idx = get_txw_idx(TX_4X16); + const int txh_idx = get_txh_idx(TX_4X16); + const int txfm_size_col = tx_size_wide[TX_4X16]; + const int txfm_size_row = tx_size_high[TX_4X16]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // col transform + load_buffer_4x16(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, outcoeff128, bitcol, 1); + col_txfm_8x8_rounding(outcoeff128, -shift[1]); + transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < txfm_size_col; i++) { + row_txfm(in + i, outcoeff128 + i * txfm_size_col, bitrow, txfm_size_col); + } + (void)bd; +} + +void av1_fwd_txfm2d_16x4_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[16]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4]; + const int txw_idx = get_txw_idx(TX_16X4); + const int txh_idx = get_txh_idx(TX_16X4); + const int txfm_size_col = tx_size_wide[TX_16X4]; + const int txfm_size_row = tx_size_high[TX_16X4]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // col transform + load_buffer_16x4(input, in, stride, ud_flip, lr_flip, shift[0]); + + for (int i = 0; i < txfm_size_row; i++) { + col_txfm(in + i * txfm_size_row, outcoeff128 + i * txfm_size_row, bitcol, + 1); + } + col_txfm_8x8_rounding(outcoeff128, -shift[1]); + + // row transform + row_txfm(outcoeff128, in, bitrow, 1); + transpose_8nx8n(in, outcoeff128, txfm_size_row, txfm_size_col); + (void)bd; +} + +void av1_fwd_txfm2d_16x32_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[128]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32]; + const int txw_idx = get_txw_idx(TX_16X32); + const int txh_idx = get_txh_idx(TX_16X32); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x32_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + + // column transform + load_buffer_16x16(input, in, 
stride, 0, 0, shift[0]); + load_buffer_16x16(input + 16 * stride, in + 64, stride, 0, 0, shift[0]); + + for (int i = 0; i < 4; i++) { + col_txfm((in + i), (in + i), bitcol, 4); + } + col_txfm_16x16_rounding(&in[0], -shift[1]); + col_txfm_16x16_rounding(&in[64], -shift[1]); + transpose_8nx8n(in, outcoef128, 16, 32); + + // row transform + row_txfm(outcoef128, in, bitrow, 8); + transpose_8nx8n(in, outcoef128, 32, 16); + av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 128, -shift[2], + NewSqrt2); + (void)bd; +} + +void av1_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + (void)tx_type; + __m128i in[512]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64]; + const int txw_idx = get_txw_idx(TX_32X64); + const int txh_idx = get_txh_idx(TX_32X64); + const int txfm_size_col = tx_size_wide[TX_32X64]; + const int txfm_size_row = tx_size_high[TX_32X64]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int num_row = txfm_size_row >> 2; + const int num_col = txfm_size_col >> 2; + + // column transform + load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row); + for (int i = 0; i < num_col; i++) { + av1_fdct64_sse4_1((in + i), (in + i), bitcol, num_col, num_col); + } + for (int i = 0; i < num_col; i++) { + col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]); + } + transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < num_row; i++) { + av1_fdct32_sse4_1((outcoef128 + i), (in + i), bitrow, num_row); + } + transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col); + av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 512, -shift[2], + NewSqrt2); + (void)bd; +} + +void av1_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + (void)tx_type; + __m128i in[512]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32]; + const int txw_idx = get_txw_idx(TX_64X32); + const int txh_idx = get_txh_idx(TX_64X32); + const int txfm_size_col = tx_size_wide[TX_64X32]; + const int txfm_size_row = tx_size_high[TX_64X32]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int num_row = txfm_size_row >> 2; + const int num_col = txfm_size_col >> 2; + + // column transform + for (int i = 0; i < 32; i++) { + load_buffer_4x4(input + 0 + i * stride, in + 0 + i * 16, 4, 0, 0, shift[0]); + load_buffer_4x4(input + 16 + i * stride, in + 4 + i * 16, 4, 0, 0, + shift[0]); + load_buffer_4x4(input + 32 + i * stride, in + 8 + i * 16, 4, 0, 0, + shift[0]); + load_buffer_4x4(input + 48 + i * stride, in + 12 + i * 16, 4, 0, 0, + shift[0]); + } + + for (int i = 0; i < num_col; i++) { + av1_fdct32_sse4_1((in + i), (in + i), bitcol, num_col); + } + + for (int i = 0; i < num_row; i++) { + col_txfm_16x16_rounding((in + i * txfm_size_col), -shift[1]); + } + transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < num_row; i++) { + av1_fdct64_sse4_1((outcoef128 + i), (in + i), bitrow, num_row, num_row); + } + transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col >> 1); + av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 512 >> 1, + -shift[2], NewSqrt2); + (void)bd; +} + +void av1_fwd_txfm2d_32x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE 
tx_type, int bd) { + __m128i in[128]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16]; + const int txw_idx = get_txw_idx(TX_32X16); + const int txh_idx = get_txh_idx(TX_32X16); + const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm8x32_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + + // column transform + load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 16); + col_txfm(in, in, bitcol, 8); + col_txfm_16x16_rounding(&in[0], -shift[1]); + col_txfm_16x16_rounding(&in[64], -shift[1]); + transpose_8nx8n(in, outcoef128, 32, 16); + + // row transform + for (int i = 0; i < 4; i++) { + row_txfm((outcoef128 + i), (in + i), bitrow, 4); + } + transpose_8nx8n(in, outcoef128, 16, 32); + av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 128, -shift[2], + NewSqrt2); + (void)bd; +} + +void av1_fwd_txfm2d_8x32_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[64]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32]; + const int txw_idx = get_txw_idx(TX_8X32); + const int txh_idx = get_txh_idx(TX_8X32); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm32x8_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + + const int txfm_size_col = tx_size_wide[TX_8X32]; + const int txfm_size_row = tx_size_high[TX_8X32]; + const int num_col = txfm_size_col >> 2; + + // column transform + load_buffer_8x16(input, in, stride, 0, 0, shift[0]); + load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row, + stride, 0, 0, shift[0]); + + for (int i = 0; i < num_col; i++) { + col_txfm((in + i), (in + i), bitcol, num_col); + } + col_txfm_16x16_rounding(in, -shift[1]); + transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < txfm_size_col; i += 2) { + row_txfm((outcoef128 + i), (in + i), bitrow, txfm_size_col); + } + transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col); + (void)bd; +} + +void av1_fwd_txfm2d_32x8_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[64]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8]; + const int txw_idx = get_txw_idx(TX_32X8); + const int txh_idx = get_txh_idx(TX_32X8); + const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm32x8_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + + const int txfm_size_col = tx_size_wide[TX_32X8]; + const int txfm_size_row = tx_size_high[TX_32X8]; + const int num_col = txfm_size_row >> 2; + + // column transform + load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8); + for (int i = 0; i < txfm_size_row; i += 2) { + col_txfm((in + i), (in + i), bitcol, txfm_size_row); + } + + col_txfm_16x16_rounding(&in[0], -shift[1]); + transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < num_col; i++) { + row_txfm((outcoef128 + i), (in + i), bitrow, num_col); + } + transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col); + (void)bd; +} + 
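+// The 2-D kernels above and below share roughly the same outline: load_buffer_*() reads the residue block, applies the up/down and left/right flips required by the FLIPADST types and pre-shifts by shift[0]; the column pass runs and is rounded by -shift[1]; transpose_8x8()/transpose_8nx8n() reorient the data so the row pass can reuse the same column-oriented 1-D kernels; a final transpose writes the coefficients out. + // Block sizes whose width:height ratio is 2:1 or 1:2 carry an irrational sqrt(2) factor in their normalization, which is why those kernels end with av1_round_shift_rect_array_32_sse4_1(..., NewSqrt2): on top of the -shift[2] rounding it multiplies by NewSqrt2 / 2^NewSqrt2Bits (5793 / 4096, roughly 1.41). + 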
+void av1_fwd_txfm2d_4x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m128i in[8]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8]; + const int txw_idx = get_txw_idx(TX_4X8); + const int txh_idx = get_txh_idx(TX_4X8); + const int txfm_size_col = tx_size_wide[TX_4X8]; + const int txfm_size_row = tx_size_high[TX_4X8]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x8_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_4x8(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, in, bitcol, 1); + col_txfm_4x8_rounding(in, -shift[1]); + transpose_8nx8n(in, outcoeff128, txfm_size_col, txfm_size_row); + + for (int i = 0; i < 2; i++) { + row_txfm(outcoeff128 + i, in + i * txfm_size_col, bitrow, 2); + } + av1_round_shift_rect_array_32_sse4_1(in, outcoeff128, txfm_size_row, + -shift[2], NewSqrt2); + (void)bd; +} + +void av1_fwd_txfm2d_8x4_sse4_1(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m128i in[8]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4]; + const int txw_idx = get_txw_idx(TX_8X4); + const int txh_idx = get_txh_idx(TX_8X4); + const int txfm_size_col = tx_size_wide[TX_8X4]; + const int txfm_size_row = tx_size_high[TX_8X4]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x8_arr[tx_type]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // col transform + load_buffer_8x4(input, in, stride, ud_flip, lr_flip, shift[0]); + for (int i = 0; i < 2; i++) { + col_txfm(in + i * txfm_size_row, in + i * txfm_size_row, bitcol, 1); + } + col_txfm_4x8_rounding(in, -shift[1]); + + // row transform + row_txfm(in, outcoeff128, bitrow, 1); + av1_round_shift_rect_array_32_sse4_1(outcoeff128, in, txfm_size_col, + -shift[2], NewSqrt2); + transpose_8nx8n(in, outcoeff128, txfm_size_row, txfm_size_col); + (void)bd; +} + +void av1_fwd_txfm2d_16x64_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[256]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64]; + const int txw_idx = get_txw_idx(TX_16X64); + const int txh_idx = get_txh_idx(TX_16X64); + const int txfm_size_col = tx_size_wide[TX_16X64]; + const int txfm_size_row = tx_size_high[TX_16X64]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const int num_col = txfm_size_col >> 2; + // col transform + for (int i = 0; i < txfm_size_row; i += num_col) { + load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col, + ud_flip, lr_flip, shift[0]); + } + + for (int i = 0; i < 
num_col; i++) { + av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitcol, num_col, num_col); + } + + col_txfm_16x16_rounding(outcoeff128, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]); + + transpose_8nx8n(outcoeff128, in, txfm_size_col, 32); + fdct16x16_sse4_1(in, in, bitrow, 8); + transpose_8nx8n(in, outcoeff128, 32, txfm_size_col); + memset(coeff + txfm_size_col * 32, 0, txfm_size_col * 32 * sizeof(*coeff)); + (void)bd; +} + +void av1_fwd_txfm2d_64x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[256]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16]; + const int txw_idx = get_txw_idx(TX_64X16); + const int txh_idx = get_txh_idx(TX_64X16); + const int txfm_size_col = tx_size_wide[TX_64X16]; + const int txfm_size_row = tx_size_high[TX_64X16]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // col transform + for (int i = 0; i < txfm_size_row; i++) { + load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4, + ud_flip, lr_flip, shift[0]); + } + + fdct16x16_sse4_1(in, outcoeff128, bitcol, txfm_size_row); + col_txfm_16x16_rounding(outcoeff128, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]); + + transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row); + for (int i = 0; i < 4; i++) { + av1_fdct64_sse4_1(in + i, in + i, bitrow, 4, 4); + } + transpose_8nx8n(in, outcoeff128, txfm_size_row, 32); + (void)bd; +} diff --git a/libs/libaom/src/av1/encoder/x86/ml_sse3.c b/libs/libaom/src/av1/encoder/x86/ml_sse3.c new file mode 100644 index 000000000..89b1e6a05 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/ml_sse3.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdbool.h> +#include <assert.h> +#include <pmmintrin.h> + +#include "config/av1_rtcd.h" +#include "av1/encoder/ml.h" + +// In order to avoid the high latency of swapping between FPU and SIMD + // operations, we keep the result in a 128-bit register even though we only + // care about a single value. 
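+// For reference, each nn_propagate_<M>to<N>() helper below is a vectorized form of (roughly) this scalar accumulation, propagating M inputs into one of N output nodes (the caller seeds the output with the bias): + // float total = 0.0f; + // for (int in = 0; in < M; ++in) total += inputs[in] * weights[in]; // one row of the weight matrix + // *output += total; + // The _mm_hadd_ps() cascades fold the per-lane partial products down to that single sum without leaving the SSE registers. + 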
+static void nn_propagate_8to1(const float *const inputs, + const float *const weights, + __m128 *const output) { + const __m128 inputs_h = _mm_loadu_ps(&inputs[4]); + const __m128 inputs_l = _mm_loadu_ps(inputs); + + const __m128 weights_h = _mm_loadu_ps(&weights[4]); + const __m128 weights_l = _mm_loadu_ps(weights); + + const __m128 mul_h = _mm_mul_ps(inputs_h, weights_h); + const __m128 mul_l = _mm_mul_ps(inputs_l, weights_l); + // [7 6 5 4] [3 2 1 0] (weight and input indices) + + const __m128 vadd = _mm_add_ps(mul_l, mul_h); + // [7+3 6+2 5+1 4+0] + const __m128 hadd1 = _mm_hadd_ps(vadd, vadd); + // [7+6+3+2 5+4+1+0 7+6+3+2 5+4+1+0] + const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1); + // [7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0] + *output = _mm_add_ps(*output, hadd2); +} + +static void nn_propagate_4to1(const float *const inputs, + const float *const weights, + __m128 *const output) { + const __m128 inputs128 = _mm_loadu_ps(inputs); + + const __m128 weights128 = _mm_loadu_ps(weights); + + const __m128 mul = _mm_mul_ps(inputs128, weights128); + // [3 2 1 0] (weight and input indices) + + const __m128 hadd1 = _mm_hadd_ps(mul, mul); + // [3+2 1+0 3+2 1+0] + const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1); + // [3+2+1+0 3+2+1+0 3+2+1+0 3+2+1+0] + *output = _mm_add_ps(*output, hadd2); +} + +static void nn_propagate_4to4(const float *const inputs, + const float *const weights, __m128 *const outputs, + const int num_inputs) { + const __m128 inputs128 = _mm_loadu_ps(inputs); + + __m128 hadd[2]; + for (int i = 0; i < 2; i++) { // For each pair of outputs + const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]); + const __m128 mul0 = _mm_mul_ps(weight0, inputs128); + const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]); + const __m128 mul1 = _mm_mul_ps(weight1, inputs128); + hadd[i] = _mm_hadd_ps(mul0, mul1); + } + // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices) + // hadd[1] = [15+14 13+12 11+10 9+8] + + const __m128 hh = _mm_hadd_ps(hadd[0], hadd[1]); + // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0] + + *outputs = _mm_add_ps(*outputs, hh); +} + +static void nn_propagate_4to8(const float *const inputs, + const float *const weights, __m128 *const out_h, + __m128 *const out_l, const int num_inputs) { + const __m128 inputs128 = _mm_loadu_ps(inputs); + + __m128 hadd[4]; + for (int i = 0; i < 4; i++) { // For each pair of outputs + const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]); + const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]); + const __m128 mul0 = _mm_mul_ps(inputs128, weight0); + const __m128 mul1 = _mm_mul_ps(inputs128, weight1); + hadd[i] = _mm_hadd_ps(mul0, mul1); + } + // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices) + // hadd[1] = [15+14 13+12 11+10 9+8] + // hadd[2] = [23+22 21+20 19+18 17+16] + // hadd[3] = [31+30 29+28 27+26 25+24] + + const __m128 hh0 = _mm_hadd_ps(hadd[0], hadd[1]); + // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0] + const __m128 hh1 = _mm_hadd_ps(hadd[2], hadd[3]); + // [31+30+29+28 27+26+25+24 23+22+21+20 19+18+17+16] + + *out_h = _mm_add_ps(*out_h, hh1); + *out_l = _mm_add_ps(*out_l, hh0); +} + +static void nn_propagate_8to4(const float *const inputs, + const float *const weights, __m128 *const outputs, + const int num_inputs) { + const __m128 inputs_h = _mm_loadu_ps(inputs + 4); + const __m128 inputs_l = _mm_loadu_ps(inputs); + // [7 6 5 4] [3 2 1 0] (input indices) + + __m128 add[4]; + for (int i = 0; i < 4; i++) { // For each output: + const __m128 weight_h = 
_mm_loadu_ps(&weights[i * num_inputs + 4]); + const __m128 weight_l = _mm_loadu_ps(&weights[i * num_inputs]); + const __m128 mul_h = _mm_mul_ps(inputs_h, weight_h); + const __m128 mul_l = _mm_mul_ps(inputs_l, weight_l); + add[i] = _mm_add_ps(mul_l, mul_h); + } + // add[0] = [7+3 6+2 5+1 4+0] + // add[1] = [15+11 14+10 13+9 12+8] + // add[2] = [23+19 22+18 21+17 20+16] + // add[3] = [31+27 30+26 29+25 28+24] + + const __m128 hadd_h = _mm_hadd_ps(add[2], add[3]); + // [31+30+27+26 29+28+25+24 23+22+19+18 21+20+17+16] + const __m128 hadd_l = _mm_hadd_ps(add[0], add[1]); + // [15+14+11+10 13+12+9+8 7+6+3+2 5+4+1+0] + + const __m128 haddhadd = _mm_hadd_ps(hadd_l, hadd_h); + // [31+30+29+28+27+26+25+24 23+22+21+20+19+18+17+16 + // 15+14+13+12+11+10+9+8 7+6+5+4+3+2+1+0] + + *outputs = _mm_add_ps(*outputs, haddhadd); +} + +static void nn_activate8(__m128 *out_h, __m128 *out_l) { + const __m128 zero = _mm_setzero_ps(); + *out_h = _mm_max_ps(*out_h, zero); + *out_l = _mm_max_ps(*out_l, zero); +} + +static void nn_activate4(__m128 *x) { *x = _mm_max_ps(*x, _mm_setzero_ps()); } + +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. +void av1_nn_predict_sse3(const float *input_nodes, + const NN_CONFIG *const nn_config, int reduce_prec, + float *const output) { + float buf[2][NN_MAX_NODES_PER_LAYER]; + int buf_index = 0; + int num_inputs = nn_config->num_inputs; + + // Hidden layers, except the final iteration is the output layer. + for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) { + const float *layer_weights = nn_config->weights[layer]; + const float *layer_bias = nn_config->bias[layer]; + bool output_layer = (layer == nn_config->num_hidden_layers); + float *const output_nodes = output_layer ? output : &buf[buf_index][0]; + const int num_outputs = output_layer ? 
nn_config->num_outputs + : nn_config->num_hidden_nodes[layer]; + + if (num_inputs % 4 == 0 && num_outputs % 8 == 0) { + for (int out = 0; out < num_outputs; out += 8) { + __m128 out_h = _mm_loadu_ps(&layer_bias[out + 4]); + __m128 out_l = _mm_loadu_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 4) { + nn_propagate_4to8(&input_nodes[in], + &layer_weights[out * num_inputs + in], &out_h, + &out_l, num_inputs); + } + if (!output_layer) nn_activate8(&out_h, &out_l); + _mm_storeu_ps(&output_nodes[out + 4], out_h); + _mm_storeu_ps(&output_nodes[out], out_l); + } + } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) { + for (int out = 0; out < num_outputs; out += 4) { + __m128 outputs = _mm_loadu_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 8) { + nn_propagate_8to4(&input_nodes[in], + &layer_weights[out * num_inputs + in], &outputs, + num_inputs); + } + if (!output_layer) nn_activate4(&outputs); + _mm_storeu_ps(&output_nodes[out], outputs); + } + } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) { + for (int out = 0; out < num_outputs; out += 4) { + __m128 outputs = _mm_loadu_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 4) { + nn_propagate_4to4(&input_nodes[in], + &layer_weights[out * num_inputs + in], &outputs, + num_inputs); + } + if (!output_layer) nn_activate4(&outputs); + _mm_storeu_ps(&output_nodes[out], outputs); + } + } else if (num_inputs % 8 == 0) { + for (int out = 0; out < num_outputs; out++) { + __m128 total = _mm_load1_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 8) { + nn_propagate_8to1(&input_nodes[in], + &layer_weights[out * num_inputs + in], &total); + } + if (!output_layer) nn_activate4(&total); + output_nodes[out] = _mm_cvtss_f32(total); + } + } else if (num_inputs % 4 == 0) { + for (int out = 0; out < num_outputs; out++) { + __m128 total = _mm_load1_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 4) { + nn_propagate_4to1(&input_nodes[in], + &layer_weights[out * num_inputs + in], &total); + } + if (!output_layer) nn_activate4(&total); + output_nodes[out] = _mm_cvtss_f32(total); + } + } else { + // Use SSE instructions for scalar operations to avoid the latency of + // swapping between SIMD and FPU modes. + for (int out = 0; out < num_outputs; out++) { + __m128 total = _mm_load1_ps(&layer_bias[out]); + for (int in_node = 0; in_node < num_inputs; in_node++) { + __m128 input = _mm_load1_ps(&input_nodes[in_node]); + __m128 weight = + _mm_load1_ps(&layer_weights[num_inputs * out + in_node]); + total = _mm_add_ps(total, _mm_mul_ps(input, weight)); + } + if (!output_layer) nn_activate4(&total); + output_nodes[out] = _mm_cvtss_f32(total); + } + } + input_nodes = output_nodes; + num_inputs = num_outputs; + buf_index = 1 - buf_index; + } + if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs); +} diff --git a/libs/libaom/src/av1/encoder/x86/pickrst_avx2.c b/libs/libaom/src/av1/encoder/x86/pickrst_avx2.c new file mode 100644 index 000000000..f8703a23c --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/pickrst_avx2.c @@ -0,0 +1,1084 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> // AVX2 +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_dsp/x86/transpose_sse2.h" + +#include "config/av1_rtcd.h" +#include "av1/common/restoration.h" +#include "av1/encoder/pickrst.h" + +static INLINE void acc_stat_avx2(int32_t *dst, const uint8_t *src, + const __m128i *shuffle, const __m256i *kl) { + const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle); + const __m256i d0 = _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(s)); + const __m256i dst0 = yy_load_256(dst); + const __m256i r0 = _mm256_add_epi32(dst0, d0); + yy_store_256(dst, r0); +} + +static INLINE void acc_stat_win7_one_line_avx2( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN], + int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN; + for (j = h_start; j < h_end; j += 2) { + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint8_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m256i kl = + _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l)))); + acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win7_opt_avx2( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } }; + + DECLARE_ALIGNED(32, int32_t, + H_int32[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } }; + int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_win7_one_line_avx2( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32); + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += 
M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN2; ++k) { + for (l = 0; l < WIENER_WIN * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = + M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l])); + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + (int64_t)avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void acc_stat_highbd_avx2(int64_t *dst, const uint16_t *dgd, + const __m256i *shuffle, + const __m256i *dgd_ijkl) { + // Load two 128-bit chunks from dgd + const __m256i s0 = _mm256_inserti128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)dgd)), + _mm_loadu_si128((__m128i *)(dgd + 4)), 1); + // s0 = [11 10 9 8 7 6 5 4] [7 6 5 4 3 2 1 0] as u16 (values are dgd indices) + // The weird order is so the shuffle stays within 128-bit lanes + + // Shuffle 16x u16 values within lanes according to the mask: + // [0 1 1 2 2 3 3 4] [0 1 1 2 2 3 3 4] + // (Actually we shuffle u8 values as there's no 16-bit shuffle) + const __m256i s1 = _mm256_shuffle_epi8(s0, *shuffle); + // s1 = [8 7 7 6 6 5 5 4] [4 3 3 2 2 1 1 0] as u16 (values are dgd indices) + + // Multiply 16x 16-bit integers in dgd_ijkl and s1, resulting in 16x 32-bit + // integers then horizontally add pairs of these integers resulting in 8x + // 32-bit integers + const __m256i d0 = _mm256_madd_epi16(*dgd_ijkl, s1); + // d0 = [a b c d] [e f g h] as u32 + + // Take the lower-half of d0, extend to u64, add it on to dst (H) + const __m256i d0l = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 0)); + // d0l = [a b] [c d] as u64 + const __m256i dst0 = yy_load_256(dst); + yy_store_256(dst, _mm256_add_epi64(d0l, dst0)); + + // Take the upper-half of d0, extend to u64, add it on to dst (H) + const __m256i d0h = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 1)); + // d0h = [e f] [g h] as u64 + const __m256i dst1 = yy_load_256(dst + 4); + yy_store_256(dst + 4, _mm256_add_epi64(d0h, dst1)); +} + +static INLINE void acc_stat_highbd_win7_one_line_avx2( + const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, + int dgd_stride, const __m256i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN], + int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN; + for (j = h_start; j < h_end; j += 2) { + const uint16_t X1 = src[j]; + const uint16_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + const uint16_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + // Load two u16 values from dgd_ijkl combined as a u32, + // then broadcast to 8x u32 slots of a 256 + const __m256i dgd_ijkl = + _mm256_set1_epi32(*((uint32_t *)(dgd_ijk + l))); + // dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16 + + acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + 
acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } +} + +static INLINE void compute_stats_highbd_win7_opt_avx2( + const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } }; + DECLARE_ALIGNED(32, int64_t, H_int[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_highbd_win7_one_line_avx2( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int, H_int); + } + } + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = (M_int[k][l] + + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / + bit_depth_divider; + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = + (H_int_[n * 8 + m] + + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / + bit_depth_divider; + } + } + } + } +} + +static INLINE void acc_stat_highbd_win5_one_line_avx2( + const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, + int dgd_stride, const __m256i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN_CHROMA; + for (j = h_start; j < h_end; j += 2) { + const uint16_t X1 = src[j]; + const uint16_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + const uint16_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + // Load two u16 values from dgd_ijkl combined as a u32, + // then broadcast to 8x 
u32 slots of a 256 + const __m256i dgd_ijkl = + _mm256_set1_epi32(*((uint32_t *)(dgd_ijk + l))); + // dgd_ijkl = [x y x y x y x y] [x y x y x y x y] where each is a u16 + + acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } +} + +static INLINE void compute_stats_highbd_win5_opt_avx2( + const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + DECLARE_ALIGNED( + 32, int64_t, + H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_highbd_win5_one_line_avx2( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int64, H_int64); + } + } + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = (M_int64[k][l] + + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / + bit_depth_divider; + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = + (H_int_[n * 8 + m] + + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / + bit_depth_divider; + } + } + } + } +} + +void av1_compute_stats_highbd_avx2(int wiener_win, const uint8_t *dgd8, + const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { + if (wiener_win == WIENER_WIN) { + compute_stats_highbd_win7_opt_avx2(dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, + bit_depth); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_highbd_win5_opt_avx2(dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, + bit_depth); + } else { + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, bit_depth); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void acc_stat_win5_one_line_avx2( + const 
uint8_t *dgd, const uint8_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN_CHROMA; + for (j = h_start; j < h_end; j += 2) { + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint8_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m256i kl = + _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l)))); + acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win5_opt_avx2( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + DECLARE_ALIGNED( + 32, int32_t, + H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } }; + int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_win5_one_line_avx2( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32); + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN2_CHROMA; ++k) { + for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = + M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l])); + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + (int64_t)avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} + +void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int 
h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H) { + if (wiener_win == WIENER_WIN) { + compute_stats_win7_opt_avx2(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_win5_opt_avx2(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } else { + av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } +} + +static INLINE __m256i pair_set_epi16(int a, int b) { + return _mm256_set1_epi32( + (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); +} + +int64_t av1_lowbd_pixel_proj_error_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1)); + __m256i sum64 = _mm256_setzero_si256(); + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]); + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i flt0_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt0 + j), + yy_loadu_256(flt0 + j + 8)), + 0xd8); + const __m256i flt1_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt1 + j), + yy_loadu_256(flt1 + j + 8)), + 0xd8); + const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS); + const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0); + const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0); + const __m256i v0 = _mm256_madd_epi16( + xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m256i v1 = _mm256_madd_epi16( + xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m256i vr0 = + _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift); + const __m256i vr1 = + _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift); + const __m256i e0 = _mm256_sub_epi16( + _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0); + const __m256i err0 = _mm256_madd_epi16(e0, e0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum64_0); + sum64 = _mm256_add_epi64(sum64, sum64_1); + } + } else if (params->r[0] > 0 || params->r[1] > 0) { + const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; + const __m256i xq_coeff = + pair_set_epi16(xq_active, (-xq_active * (1 << SGRPROJ_RST_BITS))); + const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + const int flt_stride = (params->r[0] > 0) ? 
flt0_stride : flt1_stride; + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i flt_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt + j), + yy_loadu_256(flt + j + 8)), + 0xd8); + const __m256i v0 = + _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt_16b, d0)); + const __m256i v1 = + _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt_16b, d0)); + const __m256i vr0 = + _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift); + const __m256i vr1 = + _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift); + const __m256i e0 = _mm256_sub_epi16( + _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0); + const __m256i err0 = _mm256_madd_epi16(e0, e0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq_active * (flt[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt += flt_stride; + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum64_0); + sum64 = _mm256_add_epi64(sum64, sum64_1); + } + } else { + __m256i sum32 = _mm256_setzero_si256(); + for (i = 0; i < height; ++i) { + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i diff0 = _mm256_sub_epi16(d0, s0); + const __m256i err0 = _mm256_madd_epi16(diff0, diff0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t e = (int32_t)(dat[k]) - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64_0, sum64_1); + } + int64_t sum[4]; + yy_storeu_256(sum, sum64); + err += sum[0] + sum[1] + sum[2] + sum[3]; + return err; +} + +// When params->r[0] > 0 and params->r[1] > 0. In this case all elements of +// C and H need to be computed. 
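+// Equivalently, with d = dat << SGRPROJ_RST_BITS, f1 = flt0 - d, f2 = flt1 - d and s = (src << SGRPROJ_RST_BITS) - d, the loops below accumulate the normal equations of the least-squares fit s ~= x0 * f1 + x1 * f2 (a summary of the code that follows): + // H[0][0] = sum(f1 * f1) / n, H[0][1] = H[1][0] = sum(f1 * f2) / n, H[1][1] = sum(f2 * f2) / n, C[0] = sum(f1 * s) / n, C[1] = sum(f2 * s) / n. + // Because _mm256_mul_epi32() only multiplies the even 32-bit lanes, each product is formed twice (once as-is, once with both operands shifted right by 32 bits) and accumulated into 64-bit lanes. 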
+static AOM_INLINE void calc_proj_params_r0_r1_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m256i h00, h01, h11, c0, c1; + const __m256i zero = _mm256_setzero_si256(); + h01 = h11 = c0 = c1 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j)); + __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f1 = _mm256_sub_epi32(f1, d); + f2 = _mm256_sub_epi32(f2, d); + + const __m256i h00_even = _mm256_mul_epi32(f1, f1); + const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f1, 32)); + h00 = _mm256_add_epi64(h00, h00_even); + h00 = _mm256_add_epi64(h00, h00_odd); + + const __m256i h01_even = _mm256_mul_epi32(f1, f2); + const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f2, 32)); + h01 = _mm256_add_epi64(h01, h01_even); + h01 = _mm256_add_epi64(h01, h01_odd); + + const __m256i h11_even = _mm256_mul_epi32(f2, f2); + const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), + _mm256_srli_epi64(f2, 32)); + h11 = _mm256_add_epi64(h11, h11_even); + h11 = _mm256_add_epi64(h11, h11_odd); + + const __m256i c0_even = _mm256_mul_epi32(f1, s); + const __m256i c0_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32)); + c0 = _mm256_add_epi64(c0, c0_even); + c0 = _mm256_add_epi64(c0, c0_odd); + + const __m256i c1_even = _mm256_mul_epi32(f2, s); + const __m256i c1_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32)); + c1 = _mm256_add_epi64(c1, c1_even); + c1 = _mm256_add_epi64(c1, c1_odd); + } + } + + __m256i c_low = _mm256_unpacklo_epi64(c0, c1); + const __m256i c_high = _mm256_unpackhi_epi64(c0, c1); + c_low = _mm256_add_epi64(c_low, c_high); + const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1), + _mm256_castsi256_si128(c_low)); + + __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01); + const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01); + h0x_low = _mm256_add_epi64(h0x_low, h0x_high); + const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1), + _mm256_castsi256_si128(h0x_low)); + + // Using the symmetric properties of H, calculations of H[1][0] are not + // needed. + __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11); + const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11); + h1x_low = _mm256_add_epi64(h1x_low, h1x_high); + const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1), + _mm256_castsi256_si128(h1x_low)); + + xx_storeu_128(C, c_128bit); + xx_storeu_128(H[0], h0x_128bit); + xx_storeu_128(H[1], h1x_128bit); + + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + + // Since H is a symmetric matrix + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +// When only params->r[0] > 0. 
In this case only H[0][0] and C[0] are +// non-zero and need to be computed. +static AOM_INLINE void calc_proj_params_r0_avx2(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m256i h00, c0; + const __m256i zero = _mm256_setzero_si256(); + c0 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f1 = _mm256_sub_epi32(f1, d); + + const __m256i h00_even = _mm256_mul_epi32(f1, f1); + const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f1, 32)); + h00 = _mm256_add_epi64(h00, h00_even); + h00 = _mm256_add_epi64(h00, h00_odd); + + const __m256i c0_even = _mm256_mul_epi32(f1, s); + const __m256i c0_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32)); + c0 = _mm256_add_epi64(c0, c0_even); + c0 = _mm256_add_epi64(c0, c0_odd); + } + } + const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1), + _mm256_castsi256_si128(h00)); + const __m128i h00_val = + _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8)); + + const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1), + _mm256_castsi256_si128(c0)); + const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8)); + + const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero)); + const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero)); + + xx_storeu_128(C, c); + xx_storeu_128(H[0], h0x); + + H[0][0] /= size; + C[0] /= size; +} + +// When only params->r[1] > 0. In this case only H[1][1] and C[1] are +// non-zero and need to be computed. 
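+// Under the same scalar reading as the r0/r1 case above, this reduces to
+//   H[1][1] += (int64_t)f2 * f2;  C[1] += (int64_t)f2 * s;  // f2 = flt1[k] - u
+// per pixel; C[0] and H[1][0] are written as zeros via the unpack with a
+// zero register below.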
+static AOM_INLINE void calc_proj_params_r1_avx2(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt1, + int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m256i h11, c1; + const __m256i zero = _mm256_setzero_si256(); + c1 = h11 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f2 = _mm256_sub_epi32(f2, d); + + const __m256i h11_even = _mm256_mul_epi32(f2, f2); + const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), + _mm256_srli_epi64(f2, 32)); + h11 = _mm256_add_epi64(h11, h11_even); + h11 = _mm256_add_epi64(h11, h11_odd); + + const __m256i c1_even = _mm256_mul_epi32(f2, s); + const __m256i c1_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32)); + c1 = _mm256_add_epi64(c1, c1_even); + c1 = _mm256_add_epi64(c1, c1_odd); + } + } + + const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1), + _mm256_castsi256_si128(h11)); + const __m128i h11_val = + _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8)); + + const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1), + _mm256_castsi256_si128(c1)); + const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8)); + + const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val); + const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val); + + xx_storeu_128(C, c); + xx_storeu_128(H[1], h1x); + + H[1][1] /= size; + C[1] /= size; +} + +// AVX2 variant of av1_calc_proj_params_c. 
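+// Context, stated as an assumption rather than upstream documentation: H
+// plays the role of the Gram matrix of the two basis signals f1, f2 and C
+// their correlation with the target s, so the projection coefficients xq
+// used by av1_lowbd_pixel_proj_error_avx2 above come from solving the 2x2
+// normal equations H x = C downstream, essentially
+//   det = H[0][0] * H[1][1] - H[0][1] * H[1][0];
+//   x0  = (H[1][1] * C[0] - H[0][1] * C[1]) / det;
+//   x1  = (H[0][0] * C[1] - H[1][0] * C[0]) / det;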
+void av1_calc_proj_params_avx2(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], + int64_t C[2], const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_avx2(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_avx2(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_avx2(src8, width, height, src_stride, dat8, dat_stride, + flt1, flt1_stride, H, C); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t av1_highbd_pixel_proj_error_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1)); + __m256i sum64 = _mm256_setzero_si256(); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled + const __m256i xq0 = _mm256_set1_epi32(xq[0]); + const __m256i xq1 = _mm256_set1_epi32(xq[1]); + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { // Process 16 pixels at a time + // Load 16 pixels each from source image and corrupted image + const __m256i s0 = yy_loadu_256(src + j); + const __m256i d0 = yy_loadu_256(dat + j); + // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 (indices) + + // Shift-up each pixel to match filtered image scaling + const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS); + + // Split u0 into two halves and pad each from u16 to i32 + const __m256i u0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(u0)); + const __m256i u0h = + _mm256_cvtepu16_epi32(_mm256_extracti128_si256(u0, 1)); + // u0h, u0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32 + + // Load 16 pixels from each filtered image + const __m256i flt0l = yy_loadu_256(flt0 + j); + const __m256i flt0h = yy_loadu_256(flt0 + j + 8); + const __m256i flt1l = yy_loadu_256(flt1 + j); + const __m256i flt1h = yy_loadu_256(flt1 + j + 8); + // flt?l, flt?h = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32 + + // Subtract shifted corrupt image from each filtered image + const __m256i flt0l_subu = _mm256_sub_epi32(flt0l, u0l); + const __m256i flt0h_subu = _mm256_sub_epi32(flt0h, u0h); + const __m256i flt1l_subu = _mm256_sub_epi32(flt1l, u0l); + const __m256i flt1h_subu = _mm256_sub_epi32(flt1h, u0h); + + // Multiply basis vectors by appropriate coefficients + const __m256i v0l = _mm256_mullo_epi32(flt0l_subu, xq0); + const __m256i v0h = _mm256_mullo_epi32(flt0h_subu, xq0); + const __m256i v1l = _mm256_mullo_epi32(flt1l_subu, xq1); + const __m256i v1h = _mm256_mullo_epi32(flt1h_subu, xq1); + + // Add together the contributions from the two basis vectors + const __m256i vl = _mm256_add_epi32(v0l, v1l); + const __m256i vh = _mm256_add_epi32(v0h, v1h); + + // Right-shift v with appropriate rounding + const __m256i vrl = + _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift); + const __m256i vrh = + _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift); + // 
vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] + + // Saturate each i32 to an i16 then combine both halves + // The permute (control=[3 1 2 0]) fixes weird ordering from AVX lanes + const __m256i vr = + _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8); + // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0] + // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] + + // Add twin-subspace-sgr-filter to corrupt image then subtract source + const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0); + + // Calculate squared error and add adjacent values + const __m256i err0 = _mm256_madd_epi16(e0, e0); + + sum32 = _mm256_add_epi32(sum32, err0); + } + + const __m256i sum32l = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32)); + sum64 = _mm256_add_epi64(sum64, sum32l); + const __m256i sum32h = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum32h); + + // Process remaining pixels in this row (modulo 16) + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } + } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled + const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1]; + const __m256i xq_active = _mm256_set1_epi32(xq_on); + const __m256i xq_inactive = + _mm256_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS)); + const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride; + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + // Load 16 pixels from source image + const __m256i s0 = yy_loadu_256(src + j); + // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 + + // Load 16 pixels from corrupted image and pad each u16 to i32 + const __m256i d0 = yy_loadu_256(dat + j); + const __m256i d0h = + _mm256_cvtepu16_epi32(_mm256_extracti128_si256(d0, 1)); + const __m256i d0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(d0)); + // d0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 + // d0h, d0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32 + + // Load 16 pixels from the filtered image + const __m256i flth = yy_loadu_256(flt + j + 8); + const __m256i fltl = yy_loadu_256(flt + j); + // flth, fltl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32 + + const __m256i flth_xq = _mm256_mullo_epi32(flth, xq_active); + const __m256i fltl_xq = _mm256_mullo_epi32(fltl, xq_active); + const __m256i d0h_xq = _mm256_mullo_epi32(d0h, xq_inactive); + const __m256i d0l_xq = _mm256_mullo_epi32(d0l, xq_inactive); + + const __m256i vh = _mm256_add_epi32(flth_xq, d0h_xq); + const __m256i vl = _mm256_add_epi32(fltl_xq, d0l_xq); + + // Shift this down with appropriate rounding + const __m256i vrh = + _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift); + const __m256i vrl = + _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift); + // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32 + + // Saturate each i32 to an i16 then combine both halves + // The permute (control=[3 1 2 0]) fixes weird ordering from AVX lanes + const __m256i vr = + _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8); + // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0] as u16 + 
// vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 + + // Subtract twin-subspace-sgr filtered from source image to get error + const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0); + + // Calculate squared error and add adjacent values + const __m256i err0 = _mm256_madd_epi16(e0, e0); + + sum32 = _mm256_add_epi32(sum32, err0); + } + + const __m256i sum32l = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32)); + sum64 = _mm256_add_epi64(sum64, sum32l); + const __m256i sum32h = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum32h); + + // Process remaining pixels in this row (modulo 16) + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq_on * (flt[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt += flt_stride; + } + } else { // Neither filter is enabled + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 32; j += 32) { + // Load 2x16 u16 from source image + const __m256i s0l = yy_loadu_256(src + j); + const __m256i s0h = yy_loadu_256(src + j + 16); + + // Load 2x16 u16 from corrupted image + const __m256i d0l = yy_loadu_256(dat + j); + const __m256i d0h = yy_loadu_256(dat + j + 16); + + // Subtract corrupted image from source image + const __m256i diffl = _mm256_sub_epi16(d0l, s0l); + const __m256i diffh = _mm256_sub_epi16(d0h, s0h); + + // Square error and add adjacent values + const __m256i err0l = _mm256_madd_epi16(diffl, diffl); + const __m256i err0h = _mm256_madd_epi16(diffh, diffh); + + sum32 = _mm256_add_epi32(sum32, err0l); + sum32 = _mm256_add_epi32(sum32, err0h); + } + + const __m256i sum32l = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32)); + sum64 = _mm256_add_epi64(sum64, sum32l); + const __m256i sum32h = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum32h); + + // Process remaining pixels (modulo 16) + for (k = j; k < width; ++k) { + const int32_t e = (int32_t)(dat[k]) - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + } + + // Sum the four 64-bit lanes of sum64 into err + int64_t sum[4]; + yy_storeu_256(sum, sum64); + err += sum[0] + sum[1] + sum[2] + sum[3]; + return err; +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/av1/encoder/x86/pickrst_sse4.c b/libs/libaom/src/av1/encoder/x86/pickrst_sse4.c new file mode 100644 index 000000000..a2f65a50c --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/pickrst_sse4.c @@ -0,0 +1,833 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <assert.h> +#include <emmintrin.h> +#include "aom_dsp/x86/synonyms.h" + +#include "config/av1_rtcd.h" +#include "av1/common/restoration.h" +#include "av1/encoder/pickrst.h" + +static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src, + const __m128i *shuffle, const __m128i *kl) { + const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle); + const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s)); + const __m128i d1 = + _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(_mm_srli_si128(s, 8))); + const __m128i dst0 = xx_loadu_128(dst); + const __m128i dst1 = xx_loadu_128(dst + 4); + const __m128i r0 = _mm_add_epi32(dst0, d0); + const __m128i r1 = _mm_add_epi32(dst1, d1); + xx_storeu_128(dst, r0); + xx_storeu_128(dst + 4, r1); +} + +static INLINE void acc_stat_win7_one_line_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN], + int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { + const int wiener_win = 7; + int j, k, l; + for (j = h_start; j < h_end; j += 2) { + const uint8_t *dgd_ij = dgd + j; + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m128i kl = + _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l)))); + acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win7_opt_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint8_t avg = + find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_win7_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32); + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN2;
++k) { + for (l = 0; l < WIENER_WIN * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = + M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l])); + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + (int64_t)avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void acc_stat_highbd_sse41(int64_t *dst, const uint16_t *dgd, + const __m128i *shuffle, + const __m128i *dgd_ijkl) { + // Load 256 bits from dgd in two chunks + const __m128i s0l = xx_loadu_128(dgd); + const __m128i s0h = xx_loadu_128(dgd + 4); + // s0l = [7 6 5 4 3 2 1 0] as u16 values (dgd indices) + // s0h = [11 10 9 8 7 6 5 4] as u16 values (dgd indices) + // (Slightly strange order so we can apply the same shuffle to both halves) + + // Shuffle the u16 values in each half (actually using 8-bit shuffle mask) + const __m128i s1l = _mm_shuffle_epi8(s0l, *shuffle); + const __m128i s1h = _mm_shuffle_epi8(s0h, *shuffle); + // s1l = [4 3 3 2 2 1 1 0] as u16 values (dgd indices) + // s1h = [8 7 7 6 6 5 5 4] as u16 values (dgd indices) + + // Multiply s1 by dgd_ijkl resulting in 8x u32 values + // Horizontally add pairs of u32 resulting in 4x u32 + const __m128i dl = _mm_madd_epi16(*dgd_ijkl, s1l); + const __m128i dh = _mm_madd_epi16(*dgd_ijkl, s1h); + // dl = [d c b a] as u32 values + // dh = [h g f e] as u32 values + + // Add these 8x u32 results on to dst in four parts + const __m128i dll = _mm_cvtepu32_epi64(dl); + const __m128i dlh = _mm_cvtepu32_epi64(_mm_srli_si128(dl, 8)); + const __m128i dhl = _mm_cvtepu32_epi64(dh); + const __m128i dhh = _mm_cvtepu32_epi64(_mm_srli_si128(dh, 8)); + // dll = [b a] as u64 values, etc. 
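+  // (Added note, not upstream: the widening to u64 before accumulation
+  // matters because this high-bit-depth path adds straight into int64_t H;
+  // 12-bit sample products summed over a whole restoration unit could
+  // overflow a 32-bit accumulator, unlike the 8-bit path which stages sums
+  // in int32_t and flushes them to int64_t periodically.)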
+ + const __m128i rll = _mm_add_epi64(xx_loadu_128(dst), dll); + xx_storeu_128(dst, rll); + const __m128i rlh = _mm_add_epi64(xx_loadu_128(dst + 2), dlh); + xx_storeu_128(dst + 2, rlh); + const __m128i rhl = _mm_add_epi64(xx_loadu_128(dst + 4), dhl); + xx_storeu_128(dst + 4, rhl); + const __m128i rhh = _mm_add_epi64(xx_loadu_128(dst + 6), dhh); + xx_storeu_128(dst + 6, rhh); +} + +static INLINE void acc_stat_highbd_win7_one_line_sse4_1( + const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN], + int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN; + for (j = h_start; j < h_end; j += 2) { + const uint16_t X1 = src[j]; + const uint16_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + const uint16_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + // Load two u16 values from dgd as a single u32 + // Then broadcast to 4x u32 slots of a 128 + const __m128i dgd_ijkl = _mm_set1_epi32(*((uint32_t *)(dgd_ijk + l))); + // dgd_ijkl = [y x y x y x y x] as u16 + + acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } +} + +static INLINE void compute_stats_highbd_win7_opt_sse4_1( + const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int64_t H_int[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + // Load just half of the 256-bit shuffle control used for the AVX2 version + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_highbd_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_highbd_win7_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int, H_int); + } + } + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; 
+ + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = (M_int[k][l] + + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / + bit_depth_divider; + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = + (H_int_[n * 8 + m] + + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / + bit_depth_divider; + } + } + } + } +} + +static INLINE void acc_stat_highbd_win5_one_line_sse4_1( + const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN_CHROMA; + for (j = h_start; j < h_end; j += 2) { + const uint16_t X1 = src[j]; + const uint16_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + const uint16_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + // Load two u16 values from dgd as a single u32 + // then broadcast to 4x u32 slots of a 128 + const __m128i dgd_ijkl = _mm_set1_epi32(*((uint32_t *)(dgd_ijk + l))); + // dgd_ijkl = [y x y x y x y x] as u16 + + acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } +} + +static INLINE void compute_stats_highbd_win5_opt_sse4_1( + const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + // Load just half of the 256-bit shuffle control used for the AVX2 version + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_highbd_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_highbd_win5_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int, H_int); + } 
+ } + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = (M_int[k][l] + + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / + bit_depth_divider; + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = + (H_int_[n * 8 + m] + + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / + bit_depth_divider; + } + } + } + } +} + +void av1_compute_stats_highbd_sse4_1(int wiener_win, const uint8_t *dgd8, + const uint8_t *src8, int h_start, + int h_end, int v_start, int v_end, + int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + if (wiener_win == WIENER_WIN) { + compute_stats_highbd_win7_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, + bit_depth); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_highbd_win5_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, + bit_depth); + } else { + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, bit_depth); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void acc_stat_win5_one_line_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + const int wiener_win = WIENER_WIN_CHROMA; + int j, k, l; + for (j = h_start; j < h_end; j += 2) { + const uint8_t *dgd_ij = dgd + j; + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m128i kl = + _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l)))); + acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win5_opt_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint8_t avg = + find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t 
H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_win5_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32); + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) { + for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = + M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l])); + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + (int64_t)avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} +void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H) { + if (wiener_win == WIENER_WIN) { + compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } else { + av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } +} + +static INLINE __m128i pair_set_epi16(int a, int b) { + return _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); +} + +int64_t av1_lowbd_pixel_proj_error_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + __m128i sum64 = _mm_setzero_si128(); + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]); + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 8; j += 8) { + const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j)); + const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j)); + const __m128i flt0_16b = + _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4)); + const __m128i flt1_16b = + _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4)); + const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS); + const __m128i flt0_0_sub_u = _mm_sub_epi16(flt0_16b, u0); + const __m128i flt1_0_sub_u = _mm_sub_epi16(flt1_16b, u0); + const __m128i v0 = _mm_madd_epi16( + 
xq_coeff, _mm_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m128i v1 = _mm_madd_epi16( + xq_coeff, _mm_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift); + const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift); + const __m128i e0 = + _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0); + const __m128i err0 = _mm_madd_epi16(e0, e0); + sum32 = _mm_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); + const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum64_0); + sum64 = _mm_add_epi64(sum64, sum64_1); + } + } else if (params->r[0] > 0 || params->r[1] > 0) { + const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; + const __m128i xq_coeff = + pair_set_epi16(xq_active, -(xq_active << SGRPROJ_RST_BITS)); + const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride; + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 8; j += 8) { + const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j)); + const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j)); + const __m128i flt_16b = + _mm_packs_epi32(xx_loadu_128(flt + j), xx_loadu_128(flt + j + 4)); + const __m128i v0 = + _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt_16b, d0)); + const __m128i v1 = + _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt_16b, d0)); + const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift); + const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift); + const __m128i e0 = + _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0); + const __m128i err0 = _mm_madd_epi16(e0, e0); + sum32 = _mm_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq_active * (flt[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt += flt_stride; + const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); + const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum64_0); + sum64 = _mm_add_epi64(sum64, sum64_1); + } + } else { + __m128i sum32 = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j <= width - 16; j += 16) { + const __m128i d = xx_loadu_128(dat + j); + const __m128i s = xx_loadu_128(src + j); + const __m128i d0 = _mm_cvtepu8_epi16(d); + const __m128i d1 = _mm_cvtepu8_epi16(_mm_srli_si128(d, 8)); + const __m128i s0 = _mm_cvtepu8_epi16(s); + const __m128i s1 = _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)); + const __m128i diff0 = _mm_sub_epi16(d0, s0); + const __m128i diff1 = _mm_sub_epi16(d1, s1); + const __m128i err0 = _mm_madd_epi16(diff0, diff0); + const __m128i err1 = _mm_madd_epi16(diff1, diff1); + sum32 = _mm_add_epi32(sum32, err0); + sum32 = _mm_add_epi32(sum32, err1); + } + for (k = j; k < width; ++k) { + const int32_t e = (int32_t)(dat[k]) - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + const 
__m128i sum64_0 = _mm_cvtepi32_epi64(sum32); + const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64_0, sum64_1); + } + int64_t sum[2]; + xx_storeu_128(sum, sum64); + err += sum[0] + sum[1]; + return err; +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t av1_highbd_pixel_proj_error_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + __m128i sum64 = _mm_setzero_si128(); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled + const __m128i xq0 = _mm_set1_epi32(xq[0]); + const __m128i xq1 = _mm_set1_epi32(xq[1]); + + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 8; j += 8) { + // Load 8x pixels from source image + const __m128i s0 = xx_loadu_128(src + j); + // s0 = [7 6 5 4 3 2 1 0] as i16 (indices of src[]) + + // Load 8x pixels from corrupted image + const __m128i d0 = xx_loadu_128(dat + j); + // d0 = [7 6 5 4 3 2 1 0] as i16 (indices of dat[]) + + // Shift each pixel value up by SGRPROJ_RST_BITS + const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS); + + // Split u0 into two halves and pad each from u16 to i32 + const __m128i u0l = _mm_cvtepu16_epi32(u0); + const __m128i u0h = _mm_cvtepu16_epi32(_mm_srli_si128(u0, 8)); + // u0h = [7 6 5 4] as i32, u0l = [3 2 1 0] as i32, all dat[] indices + + // Load 8 pixels from first and second filtered images + const __m128i flt0l = xx_loadu_128(flt0 + j); + const __m128i flt0h = xx_loadu_128(flt0 + j + 4); + const __m128i flt1l = xx_loadu_128(flt1 + j); + const __m128i flt1h = xx_loadu_128(flt1 + j + 4); + // flt0 = [7 6 5 4] [3 2 1 0] as i32 (indices of flt0+j) + // flt1 = [7 6 5 4] [3 2 1 0] as i32 (indices of flt1+j) + + // Subtract shifted corrupt image from each filtered image + // This gives our two basis vectors for the projection + const __m128i flt0l_subu = _mm_sub_epi32(flt0l, u0l); + const __m128i flt0h_subu = _mm_sub_epi32(flt0h, u0h); + const __m128i flt1l_subu = _mm_sub_epi32(flt1l, u0l); + const __m128i flt1h_subu = _mm_sub_epi32(flt1h, u0h); + // flt?h_subu = [ f[7]-u[7] f[6]-u[6] f[5]-u[5] f[4]-u[4] ] as i32 + // flt?l_subu = [ f[3]-u[3] f[2]-u[2] f[1]-u[1] f[0]-u[0] ] as i32 + + // Multiply each basis vector by the corresponding coefficient + const __m128i v0l = _mm_mullo_epi32(flt0l_subu, xq0); + const __m128i v0h = _mm_mullo_epi32(flt0h_subu, xq0); + const __m128i v1l = _mm_mullo_epi32(flt1l_subu, xq1); + const __m128i v1h = _mm_mullo_epi32(flt1h_subu, xq1); + + // Add together the contribution from each scaled basis vector + const __m128i vl = _mm_add_epi32(v0l, v1l); + const __m128i vh = _mm_add_epi32(v0h, v1h); + + // Right-shift v with appropriate rounding + const __m128i vrl = _mm_srai_epi32(_mm_add_epi32(vl, rounding), shift); + const __m128i vrh = _mm_srai_epi32(_mm_add_epi32(vh, rounding), shift); + + // Saturate each i32 value to i16 and combine lower and upper halves + const __m128i vr = _mm_packs_epi32(vrl, vrh); + + // Add twin-subspace-sgr-filter to corrupt image then subtract source + const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(vr, d0), s0); + + // Calculate squared error and add 
adjacent values + const __m128i err0 = _mm_madd_epi16(e0, e0); + + sum32 = _mm_add_epi32(sum32, err0); + } + + const __m128i sum32l = _mm_cvtepu32_epi64(sum32); + sum64 = _mm_add_epi64(sum64, sum32l); + const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum32h); + + // Process remaining pixels in this row (modulo 8) + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } + } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled + const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1]; + const __m128i xq_active = _mm_set1_epi32(xq_on); + const __m128i xq_inactive = + _mm_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS)); + const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride; + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 8; j += 8) { + // Load 8x pixels from source image + const __m128i s0 = xx_loadu_128(src + j); + // s0 = [7 6 5 4 3 2 1 0] as u16 (indices of src[]) + + // Load 8x pixels from corrupted image and pad each u16 to i32 + const __m128i d0 = xx_loadu_128(dat + j); + const __m128i d0h = _mm_cvtepu16_epi32(_mm_srli_si128(d0, 8)); + const __m128i d0l = _mm_cvtepu16_epi32(d0); + // d0h, d0l = [7 6 5 4], [3 2 1 0] as u32 (indices of dat[]) + + // Load 8 pixels from the filtered image + const __m128i flth = xx_loadu_128(flt + j + 4); + const __m128i fltl = xx_loadu_128(flt + j); + // flth, fltl = [7 6 5 4], [3 2 1 0] as i32 (indices of flt+j) + + const __m128i flth_xq = _mm_mullo_epi32(flth, xq_active); + const __m128i fltl_xq = _mm_mullo_epi32(fltl, xq_active); + const __m128i d0h_xq = _mm_mullo_epi32(d0h, xq_inactive); + const __m128i d0l_xq = _mm_mullo_epi32(d0l, xq_inactive); + + const __m128i vh = _mm_add_epi32(flth_xq, d0h_xq); + const __m128i vl = _mm_add_epi32(fltl_xq, d0l_xq); + // vh = [ xq0(f[7]-d[7]) xq0(f[6]-d[6]) xq0(f[5]-d[5]) xq0(f[4]-d[4]) ] + // vl = [ xq0(f[3]-d[3]) xq0(f[2]-d[2]) xq0(f[1]-d[1]) xq0(f[0]-d[0]) ] + + // Shift this down with appropriate rounding + const __m128i vrh = _mm_srai_epi32(_mm_add_epi32(vh, rounding), shift); + const __m128i vrl = _mm_srai_epi32(_mm_add_epi32(vl, rounding), shift); + + // Saturate vr0 and vr1 from i32 to i16 then pack together + const __m128i vr = _mm_packs_epi32(vrl, vrh); + + // Subtract twin-subspace-sgr filtered from source image to get error + const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(vr, d0), s0); + + // Calculate squared error and add adjacent values + const __m128i err0 = _mm_madd_epi16(e0, e0); + + sum32 = _mm_add_epi32(sum32, err0); + } + + const __m128i sum32l = _mm_cvtepu32_epi64(sum32); + sum64 = _mm_add_epi64(sum64, sum32l); + const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum32h); + + // Process remaining pixels in this row (modulo 8) + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq_on * (flt[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt += flt_stride; + } + } else { // Neither filter is enabled + for (i = 0; i < 
height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 16; j += 16) { + // Load 2x8 u16 from source image + const __m128i s0 = xx_loadu_128(src + j); + const __m128i s1 = xx_loadu_128(src + j + 8); + // Load 2x8 u16 from corrupted image + const __m128i d0 = xx_loadu_128(dat + j); + const __m128i d1 = xx_loadu_128(dat + j + 8); + + // Subtract corrupted image from source image + const __m128i diff0 = _mm_sub_epi16(d0, s0); + const __m128i diff1 = _mm_sub_epi16(d1, s1); + + // Square error and add adjacent values + const __m128i err0 = _mm_madd_epi16(diff0, diff0); + const __m128i err1 = _mm_madd_epi16(diff1, diff1); + + sum32 = _mm_add_epi32(sum32, err0); + sum32 = _mm_add_epi32(sum32, err1); + } + + const __m128i sum32l = _mm_cvtepu32_epi64(sum32); + sum64 = _mm_add_epi64(sum64, sum32l); + const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum32h); + + // Process remaining pixels (modulo 8) + for (k = j; k < width; ++k) { + const int32_t e = (int32_t)(dat[k]) - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + } + + // Sum the two 64-bit lanes of sum64 into err + int64_t sum[2]; + xx_storeu_128(sum, sum64); + err += sum[0] + sum[1]; + return err; +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/av1/encoder/x86/rdopt_avx2.c b/libs/libaom/src/av1/encoder/x86/rdopt_avx2.c new file mode 100644 index 000000000..f588badc7 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/rdopt_avx2.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <immintrin.h> +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_ports/system_state.h" + +#include "config/av1_rtcd.h" +#include "av1/encoder/rdopt.h" + +// Process horizontal and vertical correlations in a 4x4 block of pixels. +// We actually use the 4x4 pixels to calculate correlations corresponding to +// the top-left 3x3 pixels, so this function must be called with 1x1 overlap, +// moving the window along/down by 3 pixels at a time.
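+// Added context (a sketch of the statistic, matching the float arithmetic at
+// the end of av1_get_horver_correlation_full_avx2 below): hcorr is the
+// Pearson correlation over horizontally adjacent residual pairs (x, y), and
+// vcorr the same over vertically adjacent pairs (x, z):
+//   hcorr = (xy_sum - x_sum * y_sum / n) /
+//           sqrt((x2_sum - x_sum^2 / n) * (y2_sum - y_sum^2 / n))
+// with n = height * (width - 1). The SIMD above only accelerates the raw
+// sums; the final normalization stays in scalar float code.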
+INLINE static void horver_correlation_4x4(const int16_t *diff, int stride, + __m256i *xy_sum_32, + __m256i *xz_sum_32, __m256i *x_sum_32, + __m256i *x2_sum_32) { + // Pixels in this 4x4 [ a b c d ] + // are referred to as: [ e f g h ] + // [ i j k l ] + // [ m n o p ] + + const __m256i pixels = _mm256_set_epi64x( + *(uint64_t *)&diff[0 * stride], *(uint64_t *)&diff[1 * stride], + *(uint64_t *)&diff[2 * stride], *(uint64_t *)&diff[3 * stride]); + // pixels = [d c b a h g f e] [l k j i p o n m] as i16 + + const __m256i slli = _mm256_slli_epi64(pixels, 16); + // slli = [c b a 0 g f e 0] [k j i 0 o n m 0] as i16 + + const __m256i madd_xy = _mm256_madd_epi16(pixels, slli); + // madd_xy = [bc+cd ab fg+gh ef] [jk+kl ij no+op mn] as i32 + *xy_sum_32 = _mm256_add_epi32(*xy_sum_32, madd_xy); + + // Permute control [3 2] [1 0] => [2 1] [0 0], 0b10010000 = 0x90 + const __m256i perm = _mm256_permute4x64_epi64(slli, 0x90); + // perm = [g f e 0 k j i 0] [o n m 0 o n m 0] as i16 + + const __m256i madd_xz = _mm256_madd_epi16(slli, perm); + // madd_xz = [cg+bf ae gk+fj ei] [ko+jn im oo+nn mm] as i32 + *xz_sum_32 = _mm256_add_epi32(*xz_sum_32, madd_xz); + + // Sum every element in slli (and then also their squares) + const __m256i madd1_slli = _mm256_madd_epi16(slli, _mm256_set1_epi16(1)); + // madd1_slli = [c+b a g+f e] [k+j i o+n m] as i32 + *x_sum_32 = _mm256_add_epi32(*x_sum_32, madd1_slli); + + const __m256i madd_slli = _mm256_madd_epi16(slli, slli); + // madd_slli = [cc+bb aa gg+ff ee] [kk+jj ii oo+nn mm] as i32 + *x2_sum_32 = _mm256_add_epi32(*x2_sum_32, madd_slli); +} + +void av1_get_horver_correlation_full_avx2(const int16_t *diff, int stride, + int width, int height, float *hcorr, + float *vcorr) { + // The following notation is used: + // x - current pixel + // y - right neighbour pixel + // z - below neighbour pixel + // w - down-right neighbour pixel + int64_t xy_sum = 0, xz_sum = 0; + int64_t x_sum = 0, x2_sum = 0; + + // Process horizontal and vertical correlations through the body in 4x4 + // blocks. 
This excludes the final row and column and possibly one extra + // column, depending on how 3 divides into width and height. + int32_t xy_xz_tmp[8] = { 0 }, x_x2_tmp[8] = { 0 }; + __m256i xy_sum_32 = _mm256_setzero_si256(); + __m256i xz_sum_32 = _mm256_setzero_si256(); + __m256i x_sum_32 = _mm256_setzero_si256(); + __m256i x2_sum_32 = _mm256_setzero_si256(); + for (int i = 0; i <= height - 4; i += 3) { + for (int j = 0; j <= width - 4; j += 3) { + horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32, + &xz_sum_32, &x_sum_32, &x2_sum_32); + } + const __m256i hadd_xy_xz = _mm256_hadd_epi32(xy_sum_32, xz_sum_32); + // hadd_xy_xz = [ae+bf+cg ei+fj+gk ab+bc+cd ef+fg+gh] + // [im+jn+ko mm+nn+oo ij+jk+kl mn+no+op] as i32 + yy_storeu_256(xy_xz_tmp, hadd_xy_xz); + xy_sum += (int64_t)xy_xz_tmp[5] + xy_xz_tmp[4] + xy_xz_tmp[1]; + xz_sum += (int64_t)xy_xz_tmp[7] + xy_xz_tmp[6] + xy_xz_tmp[3]; + + const __m256i hadd_x_x2 = _mm256_hadd_epi32(x_sum_32, x2_sum_32); + // hadd_x_x2 = [aa+bb+cc ee+ff+gg a+b+c e+f+g] + // [ii+jj+kk mm+nn+oo i+j+k m+n+o] as i32 + yy_storeu_256(x_x2_tmp, hadd_x_x2); + x_sum += (int64_t)x_x2_tmp[5] + x_x2_tmp[4] + x_x2_tmp[1]; + x2_sum += (int64_t)x_x2_tmp[7] + x_x2_tmp[6] + x_x2_tmp[3]; + + xy_sum_32 = _mm256_setzero_si256(); + xz_sum_32 = _mm256_setzero_si256(); + x_sum_32 = _mm256_setzero_si256(); + x2_sum_32 = _mm256_setzero_si256(); + } + + // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols + int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0; + + // Do we have 2 rows remaining or just the one? Note that width and height + // are powers of 2, so each modulo 3 must be 1 or 2. + if (height % 3 == 1) { // Just horiz corrs on the final row + const int16_t x0 = diff[(height - 1) * stride]; + x_sum += x0; + x_finalrow += x0; + x2_sum += x0 * x0; + x2_finalrow += x0 * x0; + for (int j = 0; j < width - 1; ++j) { + const int16_t x = diff[(height - 1) * stride + j]; + const int16_t y = diff[(height - 1) * stride + j + 1]; + xy_sum += x * y; + x_sum += y; + x2_sum += y * y; + x_finalrow += y; + x2_finalrow += y * y; + } + } else { // Two rows remaining to do + const int16_t x0 = diff[(height - 2) * stride]; + const int16_t z0 = diff[(height - 1) * stride]; + x_sum += x0 + z0; + x2_sum += x0 * x0 + z0 * z0; + x_finalrow += z0; + x2_finalrow += z0 * z0; + for (int j = 0; j < width - 1; ++j) { + const int16_t x = diff[(height - 2) * stride + j]; + const int16_t y = diff[(height - 2) * stride + j + 1]; + const int16_t z = diff[(height - 1) * stride + j]; + const int16_t w = diff[(height - 1) * stride + j + 1]; + + // Horizontal and vertical correlations for the penultimate row: + xy_sum += x * y; + xz_sum += x * z; + + // Now just horizontal correlations for the final row: + xy_sum += z * w; + + x_sum += y + w; + x2_sum += y * y + w * w; + x_finalrow += w; + x2_finalrow += w * w; + } + } + + // Do we have 2 columns remaining or just the one? + if (width % 3 == 1) { // Just vert corrs on the final col + const int16_t x0 = diff[width - 1]; + x_sum += x0; + x_finalcol += x0; + x2_sum += x0 * x0; + x2_finalcol += x0 * x0; + for (int i = 0; i < height - 1; ++i) { + const int16_t x = diff[i * stride + width - 1]; + const int16_t z = diff[(i + 1) * stride + width - 1]; + xz_sum += x * z; + x_finalcol += z; + x2_finalcol += z * z; + // So the bottom-right elements don't get counted twice: + if (i < height - (height % 3 == 1 ?
2 : 3)) { + x_sum += z; + x2_sum += z * z; + } + } + } else { // Two cols remaining + const int16_t x0 = diff[width - 2]; + const int16_t y0 = diff[width - 1]; + x_sum += x0 + y0; + x2_sum += x0 * x0 + y0 * y0; + x_finalcol += y0; + x2_finalcol += y0 * y0; + for (int i = 0; i < height - 1; ++i) { + const int16_t x = diff[i * stride + width - 2]; + const int16_t y = diff[i * stride + width - 1]; + const int16_t z = diff[(i + 1) * stride + width - 2]; + const int16_t w = diff[(i + 1) * stride + width - 1]; + + // Horizontal and vertical correlations for the penultimate col: + // Skip these on the last iteration of this loop if we also had two + // rows remaining, otherwise the final horizontal and vertical correlations + // get erroneously processed twice + if (i < height - 2 || height % 3 == 1) { + xy_sum += x * y; + xz_sum += x * z; + } + + x_finalcol += w; + x2_finalcol += w * w; + // So the bottom-right elements don't get counted twice: + if (i < height - (height % 3 == 1 ? 2 : 3)) { + x_sum += z + w; + x2_sum += z * z + w * w; + } + + // Now just vertical correlations for the final column: + xz_sum += y * w; + } + } + + // Calculate the simple sums and squared-sums + int64_t x_firstrow = 0, x_firstcol = 0; + int64_t x2_firstrow = 0, x2_firstcol = 0; + + for (int j = 0; j < width; ++j) { + x_firstrow += diff[j]; + x2_firstrow += diff[j] * diff[j]; + } + for (int i = 0; i < height; ++i) { + x_firstcol += diff[i * stride]; + x2_firstcol += diff[i * stride] * diff[i * stride]; + } + + int64_t xhor_sum = x_sum - x_finalcol; + int64_t xver_sum = x_sum - x_finalrow; + int64_t y_sum = x_sum - x_firstcol; + int64_t z_sum = x_sum - x_firstrow; + int64_t x2hor_sum = x2_sum - x2_finalcol; + int64_t x2ver_sum = x2_sum - x2_finalrow; + int64_t y2_sum = x2_sum - x2_firstcol; + int64_t z2_sum = x2_sum - x2_firstrow; + + aom_clear_system_state(); + + const float num_hor = (float)(height * (width - 1)); + const float num_ver = (float)((height - 1) * width); + + const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; + const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; + + const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; + const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; + + const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; + const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; + + if (xhor_var_n > 0 && y_var_n > 0) { + *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); + *hcorr = *hcorr < 0 ? 0 : *hcorr; + } else { + *hcorr = 1.0; + } + if (xver_var_n > 0 && z_var_n > 0) { + *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); + *vcorr = *vcorr < 0 ? 0 : *vcorr; + } else { + *vcorr = 1.0; + } +} diff --git a/libs/libaom/src/av1/encoder/x86/rdopt_sse4.c b/libs/libaom/src/av1/encoder/x86/rdopt_sse4.c new file mode 100644 index 000000000..67d94b4ca --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/rdopt_sse4.c @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <assert.h> +#include <emmintrin.h> +#include "aom_dsp/x86/synonyms.h" +#include "aom_ports/system_state.h" + +#include "config/av1_rtcd.h" +#include "av1/encoder/rdopt.h" + +// Process horizontal and vertical correlations in a 4x4 block of pixels. +// We actually use the 4x4 pixels to calculate correlations corresponding to +// the top-left 3x3 pixels, so this function must be called with 1x1 overlap, +// moving the window along/down by 3 pixels at a time. +INLINE static void horver_correlation_4x4(const int16_t *diff, int stride, + __m128i *xy_sum_32, + __m128i *xz_sum_32, __m128i *x_sum_32, + __m128i *x2_sum_32) { + // Pixels in this 4x4 [ a b c d ] + // are referred to as: [ e f g h ] + // [ i j k l ] + // [ m n o p ] + + const __m128i pixelsa = _mm_set_epi64x(*(uint64_t *)&diff[0 * stride], + *(uint64_t *)&diff[2 * stride]); + const __m128i pixelsb = _mm_set_epi64x(*(uint64_t *)&diff[1 * stride], + *(uint64_t *)&diff[3 * stride]); + // pixelsa = [d c b a l k j i] as i16 + // pixelsb = [h g f e p o n m] as i16 + + const __m128i slli_a = _mm_slli_epi64(pixelsa, 16); + const __m128i slli_b = _mm_slli_epi64(pixelsb, 16); + // slli_a = [c b a 0 k j i 0] as i16 + // slli_b = [g f e 0 o n m 0] as i16 + + const __m128i xy_madd_a = _mm_madd_epi16(pixelsa, slli_a); + const __m128i xy_madd_b = _mm_madd_epi16(pixelsb, slli_b); + // xy_madd_a = [bc+cd ab jk+kl ij] as i32 + // xy_madd_b = [fg+gh ef no+op mn] as i32 + + const __m128i xy32 = _mm_hadd_epi32(xy_madd_b, xy_madd_a); + // xy32 = [ab+bc+cd ij+jk+kl ef+fg+gh mn+no+op] as i32 + *xy_sum_32 = _mm_add_epi32(*xy_sum_32, xy32); + + const __m128i xz_madd_a = _mm_madd_epi16(slli_a, slli_b); + // xz_madd_a = [bf+cg ae jn+ko im] i32 + + const __m128i swap_b = _mm_srli_si128(slli_b, 8); + // swap_b = [0 0 0 0 g f e 0] as i16 + const __m128i xz_madd_b = _mm_madd_epi16(slli_a, swap_b); + // xz_madd_b = [0 0 gk+fj ei] i32 + + const __m128i xz32 = _mm_hadd_epi32(xz_madd_b, xz_madd_a); + // xz32 = [ae+bf+cg im+jn+ko 0 ei+fj+gk] i32 + *xz_sum_32 = _mm_add_epi32(*xz_sum_32, xz32); + + // Now calculate the straight sums, x_sum += a+b+c+e+f+g+i+j+k + // (sum up every element in slli_a and swap_b) + const __m128i sum_slli_a = _mm_hadd_epi16(slli_a, slli_a); + const __m128i sum_slli_a32 = _mm_cvtepi16_epi32(sum_slli_a); + // sum_slli_a32 = [c+b a k+j i] as i32 + const __m128i swap_b32 = _mm_cvtepi16_epi32(swap_b); + // swap_b32 = [g f e 0] as i32 + *x_sum_32 = _mm_add_epi32(*x_sum_32, sum_slli_a32); + *x_sum_32 = _mm_add_epi32(*x_sum_32, swap_b32); + // sum = [c+b+g a+f k+j+e i] as i32 + + // Also sum their squares + const __m128i slli_a_2 = _mm_madd_epi16(slli_a, slli_a); + const __m128i swap_b_2 = _mm_madd_epi16(swap_b, swap_b); + // slli_a_2 = [c2+b2 a2 k2+j2 i2] + // swap_b_2 = [0 0 g2+f2 e2] + const __m128i sum2 = _mm_hadd_epi32(slli_a_2, swap_b_2); + // sum2 = [0 g2+f2+e2 c2+b2+a2 k2+j2+i2] + *x2_sum_32 = _mm_add_epi32(*x2_sum_32, sum2); +} + +void av1_get_horver_correlation_full_sse4_1(const int16_t *diff, int stride, + int width, int height, float *hcorr, + float *vcorr) { + // The following notation is used: + // x - current pixel + // y - right neighbour pixel + // z - below neighbour pixel + // w - down-right neighbour pixel + int64_t xy_sum = 0, xz_sum = 0; + int64_t x_sum = 0, x2_sum = 0; + + // Process horizontal and vertical correlations through the body in 4x4 + // blocks.
This excludes the final row and column and possibly one extra + // column depending how 3 divides into width and height + int32_t xy_tmp[4] = { 0 }, xz_tmp[4] = { 0 }; + int32_t x_tmp[4] = { 0 }, x2_tmp[4] = { 0 }; + __m128i xy_sum_32 = _mm_setzero_si128(); + __m128i xz_sum_32 = _mm_setzero_si128(); + __m128i x_sum_32 = _mm_setzero_si128(); + __m128i x2_sum_32 = _mm_setzero_si128(); + for (int i = 0; i <= height - 4; i += 3) { + for (int j = 0; j <= width - 4; j += 3) { + horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32, + &xz_sum_32, &x_sum_32, &x2_sum_32); + } + xx_storeu_128(xy_tmp, xy_sum_32); + xx_storeu_128(xz_tmp, xz_sum_32); + xx_storeu_128(x_tmp, x_sum_32); + xx_storeu_128(x2_tmp, x2_sum_32); + xy_sum += (int64_t)xy_tmp[3] + xy_tmp[2] + xy_tmp[1]; + xz_sum += (int64_t)xz_tmp[3] + xz_tmp[2] + xz_tmp[0]; + x_sum += (int64_t)x_tmp[3] + x_tmp[2] + x_tmp[1] + x_tmp[0]; + x2_sum += (int64_t)x2_tmp[2] + x2_tmp[1] + x2_tmp[0]; + xy_sum_32 = _mm_setzero_si128(); + xz_sum_32 = _mm_setzero_si128(); + x_sum_32 = _mm_setzero_si128(); + x2_sum_32 = _mm_setzero_si128(); + } + + // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols + int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0; + + // Do we have 2 rows remaining or just the one? Note that width and height + // are powers of 2, so each modulo 3 must be 1 or 2. + if (height % 3 == 1) { // Just horiz corrs on the final row + const int16_t x0 = diff[(height - 1) * stride]; + x_sum += x0; + x_finalrow += x0; + x2_sum += x0 * x0; + x2_finalrow += x0 * x0; + for (int j = 0; j < width - 1; ++j) { + const int16_t x = diff[(height - 1) * stride + j]; + const int16_t y = diff[(height - 1) * stride + j + 1]; + xy_sum += x * y; + x_sum += y; + x2_sum += y * y; + x_finalrow += y; + x2_finalrow += y * y; + } + } else { // Two rows remaining to do + const int16_t x0 = diff[(height - 2) * stride]; + const int16_t z0 = diff[(height - 1) * stride]; + x_sum += x0 + z0; + x2_sum += x0 * x0 + z0 * z0; + x_finalrow += z0; + x2_finalrow += z0 * z0; + for (int j = 0; j < width - 1; ++j) { + const int16_t x = diff[(height - 2) * stride + j]; + const int16_t y = diff[(height - 2) * stride + j + 1]; + const int16_t z = diff[(height - 1) * stride + j]; + const int16_t w = diff[(height - 1) * stride + j + 1]; + + // Horizontal and vertical correlations for the penultimate row: + xy_sum += x * y; + xz_sum += x * z; + + // Now just horizontal correlations for the final row: + xy_sum += z * w; + + x_sum += y + w; + x2_sum += y * y + w * w; + x_finalrow += w; + x2_finalrow += w * w; + } + } + + // Do we have 2 columns remaining or just the one? + if (width % 3 == 1) { // Just vert corrs on the final col + const int16_t x0 = diff[width - 1]; + x_sum += x0; + x_finalcol += x0; + x2_sum += x0 * x0; + x2_finalcol += x0 * x0; + for (int i = 0; i < height - 1; ++i) { + const int16_t x = diff[i * stride + width - 1]; + const int16_t z = diff[(i + 1) * stride + width - 1]; + xz_sum += x * z; + x_finalcol += z; + x2_finalcol += z * z; + // So the bottom-right elements don't get counted twice: + if (i < height - (height % 3 == 1 ? 
2 : 3)) { + x_sum += z; + x2_sum += z * z; + } + } + } else { // Two cols remaining + const int16_t x0 = diff[width - 2]; + const int16_t y0 = diff[width - 1]; + x_sum += x0 + y0; + x2_sum += x0 * x0 + y0 * y0; + x_finalcol += y0; + x2_finalcol += y0 * y0; + for (int i = 0; i < height - 1; ++i) { + const int16_t x = diff[i * stride + width - 2]; + const int16_t y = diff[i * stride + width - 1]; + const int16_t z = diff[(i + 1) * stride + width - 2]; + const int16_t w = diff[(i + 1) * stride + width - 1]; + + // Horizontal and vertical correlations for the penultimate col: + // Skip these on the last iteration of this loop if we also had two + // rows remaining, otherwise the final horizontal and vertical correlation + // get erroneously processed twice + if (i < height - 2 || height % 3 == 1) { + xy_sum += x * y; + xz_sum += x * z; + } + + x_finalcol += w; + x2_finalcol += w * w; + // So the bottom-right elements don't get counted twice: + if (i < height - (height % 3 == 1 ? 2 : 3)) { + x_sum += z + w; + x2_sum += z * z + w * w; + } + + // Now just vertical correlations for the final column: + xz_sum += y * w; + } + } + + // Calculate the simple sums and squared-sums + int64_t x_firstrow = 0, x_firstcol = 0; + int64_t x2_firstrow = 0, x2_firstcol = 0; + + for (int j = 0; j < width; ++j) { + x_firstrow += diff[j]; + x2_firstrow += diff[j] * diff[j]; + } + for (int i = 0; i < height; ++i) { + x_firstcol += diff[i * stride]; + x2_firstcol += diff[i * stride] * diff[i * stride]; + } + + int64_t xhor_sum = x_sum - x_finalcol; + int64_t xver_sum = x_sum - x_finalrow; + int64_t y_sum = x_sum - x_firstcol; + int64_t z_sum = x_sum - x_firstrow; + int64_t x2hor_sum = x2_sum - x2_finalcol; + int64_t x2ver_sum = x2_sum - x2_finalrow; + int64_t y2_sum = x2_sum - x2_firstcol; + int64_t z2_sum = x2_sum - x2_firstrow; + + aom_clear_system_state(); + + const float num_hor = (float)(height * (width - 1)); + const float num_ver = (float)((height - 1) * width); + + const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; + const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; + + const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; + const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; + + const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; + const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; + + if (xhor_var_n > 0 && y_var_n > 0) { + *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); + *hcorr = *hcorr < 0 ? 0 : *hcorr; + } else { + *hcorr = 1.0; + } + if (xver_var_n > 0 && z_var_n > 0) { + *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); + *vcorr = *vcorr < 0 ? 0 : *vcorr; + } else { + *vcorr = 1.0; + } +} diff --git a/libs/libaom/src/av1/encoder/x86/temporal_filter_avx2.c b/libs/libaom/src/av1/encoder/x86/temporal_filter_avx2.c new file mode 100644 index 000000000..847f7283c --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/temporal_filter_avx2.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <immintrin.h> + +#include "config/av1_rtcd.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" + +#define SSE_STRIDE (BW + 2) + +DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = { + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 }, + { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 }, + { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 }, + { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } +}; + +DECLARE_ALIGNED(32, static const uint8_t, shufflemask_16b[2][16]) = { + { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 11, 10, 11 } +}; + +static AOM_FORCE_INLINE void get_squared_error_16x16_avx2( + const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + uint16_t *frame_sse, const unsigned int sse_stride) { + (void)block_width; + const uint8_t *src1 = frame1; + const uint8_t *src2 = frame2; + uint16_t *dst = frame_sse; + for (int i = 0; i < block_height; i++) { + __m128i vf1_128, vf2_128; + __m256i vf1, vf2, vdiff1, vsqdiff1; + + vf1_128 = _mm_loadu_si128((__m128i *)(src1)); + vf2_128 = _mm_loadu_si128((__m128i *)(src2)); + vf1 = _mm256_cvtepu8_epi16(vf1_128); + vf2 = _mm256_cvtepu8_epi16(vf2_128); + vdiff1 = _mm256_sub_epi16(vf1, vf2); + vsqdiff1 = _mm256_mullo_epi16(vdiff1, vdiff1); + + _mm256_storeu_si256((__m256i *)(dst), vsqdiff1); + // Zero the uninitialized memory to avoid uninitialized loads later + *(uint32_t *)(dst + 16) = _mm_cvtsi128_si32(_mm_setzero_si128()); + + src1 += stride, src2 += stride2; + dst += sse_stride; + } +} + +static AOM_FORCE_INLINE void get_squared_error_32x32_avx2( + const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + uint16_t *frame_sse, const unsigned int sse_stride) { + (void)block_width; + const uint8_t *src1 = frame1; + const uint8_t *src2 = frame2; + uint16_t *dst = frame_sse; + for (int i = 0; i < block_height; i++) { + __m256i vsrc1, vsrc2, vmin, vmax, vdiff, vdiff1, vdiff2, vres1, vres2; + + vsrc1 = _mm256_loadu_si256((__m256i *)src1); + vsrc2 = _mm256_loadu_si256((__m256i *)src2); + vmax = _mm256_max_epu8(vsrc1, vsrc2); + vmin = _mm256_min_epu8(vsrc1, vsrc2); + vdiff = _mm256_subs_epu8(vmax, vmin); + + __m128i vtmp1 = _mm256_castsi256_si128(vdiff); + __m128i vtmp2 = _mm256_extracti128_si256(vdiff, 1); + vdiff1 = _mm256_cvtepu8_epi16(vtmp1); + vdiff2 = _mm256_cvtepu8_epi16(vtmp2); + + vres1 = _mm256_mullo_epi16(vdiff1, vdiff1); + vres2 = _mm256_mullo_epi16(vdiff2, vdiff2); + _mm256_storeu_si256((__m256i *)(dst), vres1); + _mm256_storeu_si256((__m256i *)(dst + 16), vres2); + // Zero the uninitialized memory to avoid uninitialized loads later + *(uint32_t *)(dst + 32) = _mm_cvtsi128_si32(_mm_setzero_si128()); + + src1 += stride; + src2 += stride2; + dst += sse_stride; + } +} + +static AOM_FORCE_INLINE __m256i xx_load_and_pad(uint16_t *src, int col, + int block_width) { + __m128i v128tmp = _mm_loadu_si128((__m128i *)(src)); + if (col == 0) { + // For the first column, replicate the first element twice to the left + v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[0]); + } + if (col == block_width - 4) { + // For the last column, replicate the last element twice to the right + v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[1]); + } +
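+  // The eight u16 values are widened to u32 on return so that the 5x5
+  // window sums accumulated from these rows later cannot overflow 16 bits
+  // (25 * 255^2 exceeds UINT16_MAX).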
return _mm256_cvtepu16_epi32(v128tmp); +} + +static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) { + // Mask the required 5 values inside the vector + __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]); + __m128i v128a, v128b; + // Extract 256b as two 128b registers A and B + v128a = _mm256_castsi256_si128(vtmp); + v128b = _mm256_extracti128_si256(vtmp, 1); + // A = [A0+B0, A1+B1, A2+B2, A3+B3] + v128a = _mm_add_epi32(v128a, v128b); + // B = [A2+B2, A3+B3, 0, 0] + v128b = _mm_srli_si128(v128a, 8); + // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X] + v128a = _mm_add_epi32(v128a, v128b); + // B = [A1+B1+A3+B3, 0, 0, 0] + v128b = _mm_srli_si128(v128a, 4); + // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X] + v128a = _mm_add_epi32(v128a, v128b); + return _mm_extract_epi32(v128a, 0); +} + +static void apply_temporal_filter_planewise( + const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + const double sigma, const int decay_control, const int use_subblock, + const int block_mse, const int *subblock_mses, const int q_factor, + unsigned int *accumulator, uint16_t *count, uint16_t *luma_sq_error, + uint16_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift) { + assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5); + assert(((block_width == 16) || (block_width == 32)) && + ((block_height == 16) || (block_height == 32))); + if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL); + + uint32_t acc_5x5_sse[BH][BW]; + const double h = decay_control * (0.7 + log(sigma + 1.0)); + const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1); + uint16_t *frame_sse = + (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error; + + if (block_width == 32) { + get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width, + block_height, frame_sse, SSE_STRIDE); + } else { + get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width, + block_height, frame_sse, SSE_STRIDE); + } + + __m256i vsrc[5]; + + // Traverse 4 columns at a time + // First and last columns will require padding + for (int col = 0; col < block_width; col += 4) { + uint16_t *src = (col) ? frame_sse + col - 2 : frame_sse; + + // Load and pad(for first and last col) 3 rows from the top + for (int i = 2; i < 5; i++) { + vsrc[i] = xx_load_and_pad(src, col, block_width); + src += SSE_STRIDE; + } + + // Copy first row to first 2 vectors + vsrc[0] = vsrc[2]; + vsrc[1] = vsrc[2]; + + for (int row = 0; row < block_height; row++) { + __m256i vsum = _mm256_setzero_si256(); + + // Add 5 consecutive rows + for (int i = 0; i < 5; i++) { + vsum = _mm256_add_epi32(vsum, vsrc[i]); + } + + // Push all elements by one element to the top + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + // Load next row to the last element + if (row <= block_height - 4) { + vsrc[4] = xx_load_and_pad(src, col, block_width); + src += SSE_STRIDE; + } else { + vsrc[4] = vsrc[3]; + } + + // Accumulate the sum horizontally + for (int i = 0; i < 4; i++) { + acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum, i); + } + } + } + + for (int i = 0, k = 0; i < block_height; i++) { + for (int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame2[i * stride2 + j]; + + int diff_sse = acc_5x5_sse[i][j]; + int num_ref_pixels = + TF_PLANEWISE_FILTER_WINDOW_LENGTH * TF_PLANEWISE_FILTER_WINDOW_LENGTH; + + // Filter U-plane and V-plane using Y-plane. 
This is because motion + // search is only done on Y-plane, so the information from Y-plane will + // be more accurate. + if (plane != PLANE_TYPE_Y) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. + diff_sse += luma_sq_error[yy * SSE_STRIDE + xx]; + ++num_ref_pixels; + } + } + } + + const double window_error = (double)(diff_sse) / num_ref_pixels; + const int subblock_idx = + (i >= block_height / 2) * 2 + (j >= block_width / 2); + const double block_error = + (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse); + + const double scaled_diff = + AOMMAX(-(window_error + block_error / 10) / (2 * h * h * q), -15.0); + const int adjusted_weight = + (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE); + + count[k] += adjusted_weight; + accumulator[k] += adjusted_weight * pixel_value; + } + } +} + +void av1_apply_temporal_filter_planewise_avx2( + const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const int use_subblock, + const int block_mse, const int *subblock_mses, const int q_factor, + const uint8_t *pred, uint32_t *accum, uint16_t *count) { + const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH; + if (is_high_bitdepth) { + assert(0 && "Only support low bit-depth with avx2!"); + } + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + + const int frame_height = ref_frame->heights[0] << mbd->plane[0].subsampling_y; + const int decay_control = frame_height >= 720 ? 4 : 3; + + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int mb_pels = mb_height * mb_width; + uint16_t luma_sq_error[SSE_STRIDE * BH]; + uint16_t *chroma_sq_error = + (num_planes > 0) + ? (uint16_t *)aom_malloc(SSE_STRIDE * BH * sizeof(uint16_t)) + : NULL; + + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = ref_frame->strides[plane == 0 ? 0 : 1]; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint8_t *ref = ref_frame->buffers[plane] + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y; + + apply_temporal_filter_planewise( + ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h, + noise_levels[plane], decay_control, use_subblock, block_mse, + subblock_mses, q_factor, accum + mb_pels * plane, + count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane, + ss_x_shift, ss_y_shift); + } + if (chroma_sq_error != NULL) aom_free(chroma_sq_error); +} diff --git a/libs/libaom/src/av1/encoder/x86/temporal_filter_constants.h b/libs/libaom/src/av1/encoder/x86/temporal_filter_constants.h new file mode 100644 index 000000000..7cd61d75e --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/temporal_filter_constants.h @@ -0,0 +1,407 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ +#define AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ + +// Division using multiplication and shifting. The C implementation does: +// modifier *= 3; +// modifier /= index; +// where 'modifier' is a set of summed values and 'index' is the number of +// summed values. +// +// This equation works out to (m * 3) / i which reduces to: +// m * 3/4 +// m * 1/2 +// m * 1/3 +// +// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16): +// m * C / 65536 +// we can choose a constant C that replicates the division. +// +// m * 49152 / 65536 = m * 3/4 +// m * 32768 / 65536 = m * 1/2 +// m * 21846 / 65536 = m * 0.3333 +// +// These are loaded using an instruction expecting int16_t values but are used +// with _mm_mulhi_epu16(), which treats them as unsigned. +#define NEIGHBOR_CONSTANT_4 (int16_t)49152 +#define NEIGHBOR_CONSTANT_5 (int16_t)39322 +#define NEIGHBOR_CONSTANT_6 (int16_t)32768 +#define NEIGHBOR_CONSTANT_7 (int16_t)28087 +#define NEIGHBOR_CONSTANT_8 (int16_t)24576 +#define NEIGHBOR_CONSTANT_9 (int16_t)21846 +#define NEIGHBOR_CONSTANT_10 (int16_t)19661 +#define NEIGHBOR_CONSTANT_11 (int16_t)17874 +#define NEIGHBOR_CONSTANT_13 (int16_t)15124 + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_5, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_5 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, +
NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8 +}; + 
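// ---------------------------------------------------------------------------
// Illustrative only -- the following standalone sketch is not part of the
// upstream file. It demonstrates the multiply-and-shift division documented
// above: mulhi_u16() mirrors what _mm_mulhi_epu16() computes per 16-bit lane,
// and the constants are NEIGHBOR_CONSTANT_4/6/9 treated as unsigned. The
// names mulhi_u16 and m are hypothetical, used only for this sketch.
// ---------------------------------------------------------------------------
#include <stdint.h>
#include <stdio.h>

// Per-lane equivalent of _mm_mulhi_epu16: the high 16 bits of an unsigned
// 16x16-bit product, i.e. (m * c) >> 16.
static uint16_t mulhi_u16(uint16_t m, uint16_t c) {
  return (uint16_t)(((uint32_t)m * c) >> 16);
}

int main(void) {
  const uint16_t m = 1000;  // an arbitrary summed 'modifier' value
  // index 4: exact (m * 3) / 4 vs. m * NEIGHBOR_CONSTANT_4 / 65536
  printf("%u %u\n", (unsigned)(m * 3 / 4), (unsigned)mulhi_u16(m, 49152));
  // index 6: exact (m * 3) / 6 vs. m * NEIGHBOR_CONSTANT_6 / 65536
  printf("%u %u\n", (unsigned)(m * 3 / 6), (unsigned)mulhi_u16(m, 32768));
  // index 9: exact (m * 3) / 9 vs. m * NEIGHBOR_CONSTANT_9 / 65536
  printf("%u %u\n", (unsigned)(m * 3 / 9), (unsigned)mulhi_u16(m, 21846));
  return 0;  // prints: 750 750, 500 500, 333 333
}
// ---------------------------------------------------------------------------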
+DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10 +}; + +static const int16_t *const LUMA_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const LUMA_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_1, LEFT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_1, MIDDLE_CENTER_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_1, RIGHT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = { + TWO_CORNER_NEIGHBORS_PLUS_2, TWO_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_4, LEFT_EDGE_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_4, MIDDLE_CENTER_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_4, RIGHT_EDGE_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = { + TWO_CORNER_NEIGHBORS_PLUS_4, TWO_EDGE_NEIGHBORS_PLUS_4 +}; + +#define HIGHBD_NEIGHBOR_CONSTANT_4 (uint32_t)3221225472U +#define HIGHBD_NEIGHBOR_CONSTANT_5 (uint32_t)2576980378U +#define HIGHBD_NEIGHBOR_CONSTANT_6 (uint32_t)2147483648U +#define HIGHBD_NEIGHBOR_CONSTANT_7 (uint32_t)1840700270U +#define HIGHBD_NEIGHBOR_CONSTANT_8 (uint32_t)1610612736U +#define HIGHBD_NEIGHBOR_CONSTANT_9 (uint32_t)1431655766U +#define HIGHBD_NEIGHBOR_CONSTANT_10 (uint32_t)1288490189U +#define HIGHBD_NEIGHBOR_CONSTANT_11 (uint32_t)1171354718U +#define HIGHBD_NEIGHBOR_CONSTANT_13 (uint32_t)991146300U + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_5, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_5 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, 
HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_6, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13 +}; + +static const uint32_t *const HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const 
HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4 + }; + +#define DIST_STRIDE ((BW) + 2) +#endif // AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ diff --git a/libs/libaom/src/av1/encoder/x86/temporal_filter_sse2.c b/libs/libaom/src/av1/encoder/x86/temporal_filter_sse2.c new file mode 100644 index 000000000..1722fac86 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/temporal_filter_sse2.c @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <emmintrin.h> + +#include "config/av1_rtcd.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" + +// For the squared error buffer, keep a padding of 4 samples +#define SSE_STRIDE (BW + 4) + +DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = { + { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } }, + { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } }, + { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } }, + { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } } +}; + +static void get_squared_error(const uint8_t *frame1, const unsigned int stride, + const uint8_t *frame2, const unsigned int stride2, + const int block_width, const int block_height, + uint16_t *frame_sse, + const unsigned int dst_stride) { + const uint8_t *src1 = frame1; + const uint8_t *src2 = frame2; + uint16_t *dst = frame_sse; + + for (int i = 0; i < block_height; i++) { + for (int j = 0; j < block_width; j += 16) { + // Zero the uninitialized memory to avoid uninitialized loads later + *(uint32_t *)(dst) = _mm_cvtsi128_si32(_mm_setzero_si128()); + + __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j)); + __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j)); + + __m128i vmax = _mm_max_epu8(vsrc1, vsrc2); + __m128i vmin = _mm_min_epu8(vsrc1, vsrc2); + __m128i vdiff = _mm_subs_epu8(vmax, vmin); + + __m128i vzero = _mm_setzero_si128(); + __m128i vdiff1 = _mm_unpacklo_epi8(vdiff, vzero); + __m128i vdiff2 = _mm_unpackhi_epi8(vdiff, vzero); + + __m128i vres1 = _mm_mullo_epi16(vdiff1, vdiff1); + __m128i vres2 = _mm_mullo_epi16(vdiff2, vdiff2); + + _mm_storeu_si128((__m128i *)(dst + j + 2), vres1); + _mm_storeu_si128((__m128i *)(dst + j + 10), vres2); + } + + // Zero the uninitialized memory to avoid uninitialized loads later + *(uint32_t *)(dst + block_width + 2) = + _mm_cvtsi128_si32(_mm_setzero_si128()); + + src1 += stride; + src2 += stride2; + dst += dst_stride; + } +} + +static void xx_load_and_pad(uint16_t *src, __m128i *dstvec, int col, + int block_width) { + __m128i vtmp = _mm_loadu_si128((__m128i *)src); + __m128i vzero = _mm_setzero_si128(); + __m128i vtmp1 = _mm_unpacklo_epi16(vtmp, vzero); + __m128i vtmp2 = _mm_unpackhi_epi16(vtmp, vzero); + // For the first column, replicate the first element twice to the left + dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA); + // For the last column, replicate the last element twice to the right + dstvec[1] = (col < block_width - 4) ?
vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54); +} + +static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) { + __m128i veca, vecb; + // Mask and obtain the required 5 values inside the vector + veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]); + vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]); + // A = [A0+B0, A1+B1, A2+B2, A3+B3] + veca = _mm_add_epi32(veca, vecb); + // B = [A2+B2, A3+B3, 0, 0] + vecb = _mm_srli_si128(veca, 8); + // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X] + veca = _mm_add_epi32(veca, vecb); + // B = [A1+B1+A3+B3, 0, 0, 0] + vecb = _mm_srli_si128(veca, 4); + // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X] + veca = _mm_add_epi32(veca, vecb); + return _mm_cvtsi128_si32(veca); +} + +static void apply_temporal_filter_planewise( + const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + const double sigma, const int decay_control, const int use_subblock, + const int block_mse, const int *subblock_mses, const int q_factor, + unsigned int *accumulator, uint16_t *count, uint16_t *luma_sq_error, + uint16_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift) { + assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5); + assert(((block_width == 16) || (block_width == 32)) && + ((block_height == 16) || (block_height == 32))); + if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL); + + uint32_t acc_5x5_sse[BH][BW]; + const double h = decay_control * (0.7 + log(sigma + 1.0)); + const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1); + uint16_t *frame_sse = + (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error; + + get_squared_error(frame1, stride, frame2, stride2, block_width, block_height, + frame_sse, SSE_STRIDE); + + __m128i vsrc[5][2]; + + // Traverse 4 columns at a time + // First and last columns will require padding + for (int col = 0; col < block_width; col += 4) { + uint16_t *src = frame_sse + col; + + // Load and pad(for first and last col) 3 rows from the top + for (int i = 2; i < 5; i++) { + xx_load_and_pad(src, vsrc[i], col, block_width); + src += SSE_STRIDE; + } + + // Padding for top 2 rows + vsrc[0][0] = vsrc[2][0]; + vsrc[0][1] = vsrc[2][1]; + vsrc[1][0] = vsrc[2][0]; + vsrc[1][1] = vsrc[2][1]; + + for (int row = 0; row < block_height; row++) { + __m128i vsum1 = _mm_setzero_si128(); + __m128i vsum2 = _mm_setzero_si128(); + + // Add 5 consecutive rows + for (int i = 0; i < 5; i++) { + vsum1 = _mm_add_epi32(vsrc[i][0], vsum1); + vsum2 = _mm_add_epi32(vsrc[i][1], vsum2); + } + + // Push all elements by one element to the top + for (int i = 0; i < 4; i++) { + vsrc[i][0] = vsrc[i + 1][0]; + vsrc[i][1] = vsrc[i + 1][1]; + } + + if (row <= block_height - 4) { + // Load next row + xx_load_and_pad(src, vsrc[4], col, block_width); + src += SSE_STRIDE; + } else { + // Padding for bottom 2 rows + vsrc[4][0] = vsrc[3][0]; + vsrc[4][1] = vsrc[3][1]; + } + + // Accumulate the sum horizontally + for (int i = 0; i < 4; i++) { + acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum1, vsum2, i); + } + } + } + + for (int i = 0, k = 0; i < block_height; i++) { + for (int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame2[i * stride2 + j]; + + int diff_sse = acc_5x5_sse[i][j]; + int num_ref_pixels = + TF_PLANEWISE_FILTER_WINDOW_LENGTH * TF_PLANEWISE_FILTER_WINDOW_LENGTH; + + // Filter U-plane and V-plane using Y-plane. 
This is because motion + // search is only done on Y-plane, so the information from Y-plane will + // be more accurate. + if (plane != PLANE_TYPE_Y) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj + 2; // X-coord on Y-plane. + const int ww = SSE_STRIDE; // Stride of Y-plane. + diff_sse += luma_sq_error[yy * ww + xx]; + ++num_ref_pixels; + } + } + } + + const double window_error = (double)(diff_sse) / num_ref_pixels; + const int subblock_idx = + (i >= block_height / 2) * 2 + (j >= block_width / 2); + const double block_error = + (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse); + + const double scaled_diff = + AOMMAX(-(window_error + block_error / 10) / (2 * h * h * q), -15.0); + const int adjusted_weight = + (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE); + + count[k] += adjusted_weight; + accumulator[k] += adjusted_weight * pixel_value; + } + } +} + +void av1_apply_temporal_filter_planewise_sse2( + const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const int use_subblock, + const int block_mse, const int *subblock_mses, const int q_factor, + const uint8_t *pred, uint32_t *accum, uint16_t *count) { + const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH; + if (is_high_bitdepth) { + assert(0 && "Only support low bit-depth with sse2!"); + } + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + + const int frame_height = ref_frame->heights[0] << mbd->plane[0].subsampling_y; + const int decay_control = frame_height >= 720 ? 4 : 3; + + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int mb_pels = mb_height * mb_width; + uint16_t luma_sq_error[SSE_STRIDE * BH]; + uint16_t *chroma_sq_error = + (num_planes > 0) + ? (uint16_t *)aom_malloc(SSE_STRIDE * BH * sizeof(uint16_t)) + : NULL; + + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = ref_frame->strides[plane == 0 ? 0 : 1]; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint8_t *ref = ref_frame->buffers[plane] + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y; + + apply_temporal_filter_planewise( + ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h, + noise_levels[plane], decay_control, use_subblock, block_mse, + subblock_mses, q_factor, accum + mb_pels * plane, + count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane, + ss_x_shift, ss_y_shift); + } + if (chroma_sq_error != NULL) aom_free(chroma_sq_error); +} diff --git a/libs/libaom/src/av1/encoder/x86/temporal_filter_sse4.c b/libs/libaom/src/av1/encoder/x86/temporal_filter_sse4.c new file mode 100644 index 000000000..e3f9f5f27 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/temporal_filter_sse4.c @@ -0,0 +1,2044 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "config/av1_rtcd.h" +#include "aom/aom_integer.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" +#include "av1/encoder/x86/temporal_filter_constants.h" + +////////////////////////// +// Low bit-depth Begins // +////////////////////////// + +// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the +// difference squared, and store as unsigned 16-bit integer to dst. +static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a); + const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); + + __m128i dist_first; + + dist_first = _mm_sub_epi16(a_first, b_first); + dist_first = _mm_mullo_epi16(dist_first, dist_first); + + _mm_storeu_si128((__m128i *)dst, dist_first); +} + +static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i a_reg = _mm_loadu_si128((const __m128i *)a); + const __m128i b_reg = _mm_loadu_si128((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); + const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero); + + __m128i dist_first, dist_second; + + dist_first = _mm_sub_epi16(a_first, b_first); + dist_second = _mm_sub_epi16(a_second, b_second); + dist_first = _mm_mullo_epi16(dist_first, dist_first); + dist_second = _mm_mullo_epi16(dist_second, dist_second); + + _mm_storeu_si128((__m128i *)dst, dist_first); + _mm_storeu_si128((__m128i *)(dst + 8), dist_second); +} + +static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) { + *dist_reg = _mm_loadu_si128((const __m128i *)dist); +} + +static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first, + __m128i *reg_second) { + read_dist_8(dist, reg_first); + read_dist_8(dist + 8, reg_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static __m128i average_8(__m128i sum, const __m128i *mul_constants, + const int strength, const int rounding, + const int weight) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift.
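+  // (so 'strength' is placed in the low 32 bits of the count register below,
+  // and every 16-bit lane is shifted right by the same amount)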
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i sixteen = _mm_set1_epi16(16); + + // modifier * 3 / index; + sum = _mm_mulhi_epu16(sum, *mul_constants); + + sum = _mm_adds_epu16(sum, rounding_u16); + sum = _mm_srl_epi16(sum, strength_u128); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 + // So this needs to use the epu16 version which did not come until SSE4. + sum = _mm_min_epu16(sum, sixteen); + + sum = _mm_sub_epi16(sixteen, sum); + + return _mm_mullo_epi16(sum, weight_u16); +} + +static __m128i average_4_4(__m128i sum, const __m128i *mul_constants, + const int strength, const int rounding, + const int weight_0, const int weight_1) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift. + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = + _mm_setr_epi16(weight_0, weight_0, weight_0, weight_0, weight_1, weight_1, + weight_1, weight_1); + const __m128i sixteen = _mm_set1_epi16(16); + + // modifier * 3 / index; + sum = _mm_mulhi_epu16(sum, *mul_constants); + + sum = _mm_adds_epu16(sum, rounding_u16); + sum = _mm_srl_epi16(sum, strength_u128); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 + // So this needs to use the epu16 version which did not come until SSE4. + sum = _mm_min_epu16(sum, sixteen); + + sum = _mm_sub_epi16(sixteen, sum); + + return _mm_mullo_epi16(sum, weight_u16); +} + +static INLINE void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16, + const __m128i *mul_constants_0, + const __m128i *mul_constants_1, + const int strength, const int rounding, + const int weight) { + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i sixteen = _mm_set1_epi16(16); + __m128i input_0, input_1; + + input_0 = _mm_mulhi_epu16(*sum_0_u16, *mul_constants_0); + input_0 = _mm_adds_epu16(input_0, rounding_u16); + + input_1 = _mm_mulhi_epu16(*sum_1_u16, *mul_constants_1); + input_1 = _mm_adds_epu16(input_1, rounding_u16); + + input_0 = _mm_srl_epi16(input_0, strength_u128); + input_1 = _mm_srl_epi16(input_1, strength_u128); + + input_0 = _mm_min_epu16(input_0, sixteen); + input_1 = _mm_min_epu16(input_1, sixteen); + input_0 = _mm_sub_epi16(sixteen, input_0); + input_1 = _mm_sub_epi16(sixteen, input_1); + + *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16); + *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16); +} + +// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.' 
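+// Per lane, the operation is (illustrative scalar sketch, not upstream text):
+//   count[i]       = sat_add_u16(count[i], sum_u16[i]);  // _mm_adds_epu16
+//   accumulator[i] += (uint16_t)(sum_u16[i] * pred[i]);  // low 16 bits of the
+//                                                        // product, widened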
+static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred, + uint16_t *count, uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8); + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static INLINE void accumulate_and_store_16(const __m128i sum_0_u16, + const __m128i sum_1_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count), + count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8)); + __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8), + pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero); + __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32; + __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32; + + count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16); + _mm_storeu_si128((__m128i *)count, count_0_u16); + + count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16); + _mm_storeu_si128((__m128i *)(count + 8), count_1_u16); + + pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16); + pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero); + pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16); + pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8)); + accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32); + accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); + _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32); + _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32); +} + +// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int. +static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) { + __m128i dist_reg, dist_left, dist_right; + + dist_reg = _mm_loadu_si128((const __m128i *)y_dist); + dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1)); + dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1)); + + *sum = _mm_adds_epu16(dist_reg, dist_left); + *sum = _mm_adds_epu16(*sum, dist_right); +} + +// Read in 16 pixels from y_dist. 
For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and +// the rest in sum_second. +static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first, + __m128i *sum_second) { + get_sum_8(y_dist, sum_first); + get_sum_8(y_dist + 8, sum_second); +} + +// Read in a row of chroma values corresponds to a row of 16 luma values. +static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist, + const uint16_t *v_dist, + __m128i *u_first, __m128i *u_second, + __m128i *v_first, + __m128i *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 16 entries from chroma. + read_dist_16(u_dist, u_first, u_second); + read_dist_16(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + __m128i u_reg, v_reg; + + read_dist_8(u_dist, &u_reg); + + *u_first = _mm_unpacklo_epi16(u_reg, u_reg); + *u_second = _mm_unpackhi_epi16(u_reg, u_reg); + + read_dist_8(v_dist, &v_reg); + + *v_first = _mm_unpacklo_epi16(v_reg, v_reg); + *v_second = _mm_unpackhi_epi16(v_reg, v_reg); + } +} + +// Horizontal add unsigned 16-bit ints in src and store them as signed 32-bit +// int in dst. +static INLINE void hadd_epu16(__m128i *src, __m128i *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i shift_right = _mm_srli_si128(*src, 2); + + const __m128i odd = _mm_blend_epi16(shift_right, zero, 170); + const __m128i even = _mm_blend_epi16(*src, zero, 170); + + *dst = _mm_add_epi32(even, odd); +} + +// Add a row of luma distortion to 8 corresponding chroma mods. +static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, + int ss_x, int ss_y, + __m128i *u_mod, + __m128i *v_mod) { + __m128i y_reg; + if (!ss_x) { + read_dist_8(y_dist, &y_reg); + if (ss_y == 1) { + __m128i y_tmp; + read_dist_8(y_dist + DIST_STRIDE, &y_tmp); + + y_reg = _mm_adds_epu16(y_reg, y_tmp); + } + } else { + __m128i y_first, y_second; + read_dist_16(y_dist, &y_first, &y_second); + if (ss_y == 1) { + __m128i y_tmp_0, y_tmp_1; + read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1); + + y_first = _mm_adds_epu16(y_first, y_tmp_0); + y_second = _mm_adds_epu16(y_second, y_tmp_1); + } + + hadd_epu16(&y_first, &y_first); + hadd_epu16(&y_second, &y_second); + + y_reg = _mm_packus_epi32(y_first, y_second); + } + + *u_mod = _mm_adds_epu16(*u_mod, y_reg); + *v_mod = _mm_adds_epu16(*v_mod, y_reg); +} + +// Apply temporal filter to the luma components. This performs temporal +// filtering on a luma block of 16 X block_height. Use blk_fw as an array of +// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL, +// else use top_weight for top half, and bottom weight for bottom half. 
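+// Each output row sums a 3x3 window of luma distortions (3-tap row sums from
+// get_sum_16() rotated through sum_row_1/2/3), adds the co-located chroma
+// distortions, and then average_8()/average_16() turn that total into the
+// per-pixel weight which accumulate_and_store_16() applies.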
+static void apply_temporal_filter_luma_16( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum, + uint16_t *y_count, const uint16_t *y_dist, const uint16_t *u_dist, + const uint16_t *v_dist, const int16_t *const *neighbors_first, + const int16_t *const *neighbors_second, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_first, mul_second; + + __m128i sum_row_1_first, sum_row_1_second; + __m128i sum_row_2_first, sum_row_2_second; + __m128i sum_row_3_first, sum_row_3_second; + + __m128i u_first, u_second; + __m128i v_first, v_second; + + __m128i sum_row_first; + __m128i sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 0); + assert(strength <= 6); + + assert(block_width == 16); + + (void)block_width; + + // First row + mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]); + + // Add luma values + get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second); + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first); + sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second); + + // Add chroma values + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + if (blk_fw) { + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]); + sum_row_second = + average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]); + } else { + average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second, + strength, rounding, weight); + } + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first); + 
sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + } + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + if (blk_fw) { + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]); + sum_row_second = + average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]); + } else { + average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second, + strength, rounding, weight); + } + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + } + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + if (blk_fw) { + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]); + sum_row_second = + average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]); + } else { + average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second, + strength, rounding, weight); + } + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. 
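+//
+// For reference, a scalar model of the per-pixel update that the SSE4.1
+// routines in this file vectorize. `num_summed` stands in for the per-lane
+// counts baked into the *_NEIGHBORS tables and `filter_weight` for the
+// top/bottom/subblock weight in effect; this is an illustrative sketch, not
+// part of the tuned implementation.
+static INLINE void scalar_filter_pixel_sketch(uint32_t dist_sum,
+                                              int num_summed, int strength,
+                                              int filter_weight, uint8_t pred,
+                                              uint16_t *count,
+                                              uint32_t *accum) {
+  const int rounding = (1 << strength) >> 1;
+  // The neighbor tables fold the "modifier * 3 / num_summed" step into a
+  // single fixed-point multiply; here it is written out directly.
+  int mod = (int)(((uint64_t)dist_sum * 3) / num_summed);
+  mod = (mod + rounding) >> strength;
+  if (mod > 16) mod = 16;
+  mod = (16 - mod) * filter_weight;
+  // The SIMD code saturates the 16-bit count add; omitted here for brevity.
+  *count += (uint16_t)mod;
+  *accum += (uint32_t)mod * pred;
+}
+
+// The dispatcher below applies the same update 16 luma pixels at a time,
+// walking the block in left / middle / right column groups.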
+static void apply_temporal_filter_luma(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+    uint32_t *y_accum, uint16_t *y_count, const uint16_t *y_dist,
+    const uint16_t *u_dist, const uint16_t *v_dist) {
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
+  const unsigned int mid_width = block_width >> 1,
+                     last_width = block_width - blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const int16_t *const *neighbors_first;
+  const int16_t *const *neighbors_second;
+
+  if (block_width == 16) {
+    // Special Case: The block width is 16 and we are operating on a row of 16
+    // chroma pixels. In this case, we can't use the usual left-middle-right
+    // pattern. We also don't support splitting now.
+    neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+    neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+    if (use_whole_blk) {
+      apply_temporal_filter_luma_16(
+          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
+          block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+          y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+          v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+          bottom_weight, NULL);
+    } else {
+      apply_temporal_filter_luma_16(
+          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
+          block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+          y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+          v_dist + uv_blk_col, neighbors_first, neighbors_second, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left
+  neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+  neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  apply_temporal_filter_luma_16(
+      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+      v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
+      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+      neighbors_second, top_weight, bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  for (; blk_col < mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    apply_temporal_filter_luma_16(
+        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
+        ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+        bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; blk_col < last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    
apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height, + ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight, NULL); + } + + // Right + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void apply_temporal_filter_chroma_8( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int uv_block_width, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul; + + __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3; + __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3; + + __m128i u_sum_row, v_sum_row; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul = _mm_loadu_si128((const __m128i *)neighbors[0]); + + // Add chroma values + get_sum_8(u_dist, &u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + + u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3); + + get_sum_8(v_dist, &v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + if (blk_fw) { + u_sum_row = + average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); + v_sum_row = + average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); + } else { + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight); + } + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the 
last one + mul = _mm_loadu_si128((const __m128i *)neighbors[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + if (blk_fw) { + u_sum_row = average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], + blk_fw[1]); + v_sum_row = average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], + blk_fw[1]); + } else { + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight); + } + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul = _mm_loadu_si128((const __m128i *)neighbors[0]); + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + if (blk_fw) { + u_sum_row = + average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); + v_sum_row = + average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); + } else { + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight); + } + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); +} + +// Perform temporal filter for the chroma components. 
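+//
+// A note on the *_COLUMN_NEIGHBORS tables selected below (an informal
+// summary; the tables themselves are defined earlier in this file): each
+// output pixel's modifier sums a 3x3 window of distortions, so edge lanes sum
+// fewer values than interior lanes (4 in corners, 6 on edges, 9 inside, plus
+// the cross-plane terms). The (ss_x, ss_y) pair changes how many luma values
+// feed each chroma lane, hence the NO_SS / SINGLE_SS / DOUBLE_SS variants.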
+static void apply_temporal_filter_chroma( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } else { + apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + } + + apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + 
uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); +} + +static void apply_temporal_filter_yuv( + const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int strength, const int use_subblock, + const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int use_whole_blk = !use_subblock; + const int *blk_fw = subblock_filter_weights; + + // Block information (Y-plane). + const unsigned int block_height = block_size_high[block_size]; + const unsigned int block_width = block_size_wide[block_size]; + const int mb_pels = block_height * block_width; + const int y_src_stride = ref_frame->y_stride; + const int y_pre_stride = block_width; + const int mb_y_src_offset = + mb_row * block_height * ref_frame->y_stride + mb_col * block_width; + + // Block information (UV-plane). 
+  const int ss_y = mbd->plane[1].subsampling_y;
+  const int ss_x = mbd->plane[1].subsampling_x;
+  const unsigned int uv_height = block_height >> ss_y;
+  const unsigned int uv_width = block_width >> ss_x;
+  const int uv_src_stride = ref_frame->uv_stride;
+  const int uv_pre_stride = block_width >> ss_x;
+  const int mb_uv_src_offset =
+      mb_row * uv_height * ref_frame->uv_stride + mb_col * uv_width;
+
+  const uint8_t *y_src = ref_frame->y_buffer + mb_y_src_offset;
+  const uint8_t *u_src = ref_frame->u_buffer + mb_uv_src_offset;
+  const uint8_t *v_src = ref_frame->v_buffer + mb_uv_src_offset;
+  const uint8_t *y_pre = pred;
+  const uint8_t *u_pre = pred + mb_pels;
+  const uint8_t *v_pre = pred + mb_pels * 2;
+  uint32_t *y_accum = accum;
+  uint32_t *u_accum = accum + mb_pels;
+  uint32_t *v_accum = accum + mb_pels * 2;
+  uint16_t *y_count = count;
+  uint16_t *u_count = count + mb_pels;
+  uint16_t *v_count = count + mb_pels * 2;
+
+  const unsigned int chroma_height = block_height >> ss_y,
+                     chroma_width = block_width >> ss_x;
+
+  DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+  const int *blk_fw_ptr = blk_fw;
+
+  uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+           *v_dist_ptr = v_dist + 1;
+  const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+  const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+  // Loop variables
+  unsigned int row, blk_col;
+
+  assert(block_width <= BW && "block width too large");
+  assert(block_height <= BH && "block height too large");
+  assert(block_width % 16 == 0 && "block width must be multiple of 16");
+  assert(block_height % 2 == 0 && "block height must be even");
+  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+         "invalid chroma subsampling");
+  assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength");
+  assert(blk_fw[0] >= 0 && "filter weight must be non-negative");
+  assert(
+      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+      "subblock filter weight must be non-negative");
+  assert(blk_fw[0] <= 2 && "subblock filter weight must not exceed 2");
+  assert(
+      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+      "subblock filter weight must not exceed 2");
+
+  // Precompute the squared differences.
+  for (row = 0; row < block_height; row++) {
+    for (blk_col = 0; blk_col < block_width; blk_col += 16) {
+      store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+                    y_dist_ptr + blk_col);
+    }
+    y_src_ptr += y_src_stride;
+    y_pre_ptr += y_pre_stride;
+    y_dist_ptr += DIST_STRIDE;
+  }
+
+  for (row = 0; row < chroma_height; row++) {
+    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+      store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+                   u_dist_ptr + blk_col);
+      store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+                   v_dist_ptr + blk_col);
+    }
+
+    u_src_ptr += uv_src_stride;
+    u_pre_ptr += uv_pre_stride;
+    u_dist_ptr += DIST_STRIDE;
+    v_src_ptr += uv_src_stride;
+    v_pre_ptr += uv_pre_stride;
+    v_dist_ptr += DIST_STRIDE;
+  }
+
+  y_dist_ptr = y_dist + 1;
+  u_dist_ptr = u_dist + 1;
+  v_dist_ptr = v_dist + 1;
+
+  apply_temporal_filter_luma(y_src, y_src_stride, y_pre, y_pre_stride, u_src,
+                             v_src, uv_src_stride, u_pre, v_pre, uv_pre_stride,
+                             block_width, block_height, ss_x, ss_y, strength,
+                             blk_fw_ptr, use_whole_blk, y_accum, y_count,
+                             y_dist_ptr, u_dist_ptr, v_dist_ptr);
+
+  
apply_temporal_filter_chroma(
+      y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
+      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+      strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count,
+      y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
+
+////////////////////////
+// Low bit-depth Ends //
+////////////////////////
+
+///////////////////////////
+// High bit-depth Begins //
+///////////////////////////
+
+// Compute (a-b)**2 for 8 16-bit pixels.
+static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
+                                       uint32_t *dst) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
+  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
+
+  const __m128i a_first = _mm_cvtepu16_epi32(a_reg);
+  const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero);
+  const __m128i b_first = _mm_cvtepu16_epi32(b_reg);
+  const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero);
+
+  __m128i dist_first, dist_second;
+
+  dist_first = _mm_sub_epi32(a_first, b_first);
+  dist_second = _mm_sub_epi32(a_second, b_second);
+  dist_first = _mm_mullo_epi32(dist_first, dist_first);
+  dist_second = _mm_mullo_epi32(dist_second, dist_second);
+
+  _mm_storeu_si128((__m128i *)dst, dist_first);
+  _mm_storeu_si128((__m128i *)(dst + 4), dist_second);
+}
+
+// Sum up three neighboring distortions for the pixels
+static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) {
+  __m128i dist_reg, dist_left, dist_right;
+
+  dist_reg = _mm_loadu_si128((const __m128i *)dist);
+  dist_left = _mm_loadu_si128((const __m128i *)(dist - 1));
+  dist_right = _mm_loadu_si128((const __m128i *)(dist + 1));
+
+  *sum = _mm_add_epi32(dist_reg, dist_left);
+  *sum = _mm_add_epi32(*sum, dist_right);
+}
+
+static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first,
+                                    __m128i *sum_second) {
+  highbd_get_sum_4(dist, sum_first);
+  highbd_get_sum_4(dist + 4, sum_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values, plus
+// however many values from the y/uv planes are added in).
+//
+// Add in the rounding factor and shift, clamp to 16, subtract from 16, and
+// multiply by the weight.
+static INLINE void highbd_average_4(__m128i *output, const __m128i *sum,
+                                    const __m128i *mul_constants,
+                                    const int strength, const int rounding,
+                                    const int weight) {
+  // _mm_srl_epi32 uses the lower 64 bit value for the shift.
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u32 = _mm_set1_epi32(rounding); + const __m128i weight_u32 = _mm_set1_epi32(weight); + const __m128i sixteen = _mm_set1_epi32(16); + const __m128i zero = _mm_setzero_si128(); + + // modifier * 3 / index; + const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero); + const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero); + const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero); + const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero); + + const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo); + const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32); + const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi); + const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32); + + // Now we have + // mul_lo: 00 a1 00 a0 + // mul_hi: 00 a3 00 a2 + // Unpack as 64 bit words to get even and odd elements + // unpack_lo: 00 a2 00 a0 + // unpack_hi: 00 a3 00 a1 + // Then we can shift and OR the results to get everything in 32-bits + const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div); + const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div); + const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4); + const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift); + + // Round + *output = _mm_add_epi32(mul, rounding_u32); + *output = _mm_srl_epi32(*output, strength_u128); + + // Multiply with the weight + *output = _mm_min_epu32(*output, sixteen); + *output = _mm_sub_epi32(sixteen, *output); + *output = _mm_mullo_epi32(*output, weight_u32); +} + +static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1, + const __m128i *sum_0_u32, + const __m128i *sum_1_u32, + const __m128i *mul_constants_0, + const __m128i *mul_constants_1, + const int strength, const int rounding, + const int weight) { + highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding, + weight); + highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding, + weight); +} + +// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.' 
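+// A scalar model of this update for one pixel i (sketch only; the SIMD code
+// below uses a saturating 16-bit add for the count):
+//   count[i] += mod[i];
+//   accum[i] += (uint32_t)mod[i] * pred[i];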
+static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32, + const __m128i sum_second_u32, + const uint16_t *pred, + uint16_t *count, + uint32_t *accumulator) { + // Cast down to 16-bit ints + const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32); + const __m128i zero = _mm_setzero_si128(); + + __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) { + *dist_reg = _mm_loadu_si128((const __m128i *)dist); +} + +static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first, + __m128i *reg_second) { + highbd_read_dist_4(dist, reg_first); + highbd_read_dist_4(dist + 4, reg_second); +} + +static INLINE void highbd_read_chroma_dist_row_8( + int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first, + __m128i *u_second, __m128i *v_first, __m128i *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 8 entries from chroma. 
+    highbd_read_dist_8(u_dist, u_first, u_second);
+    highbd_read_dist_8(v_dist, v_first, v_second);
+  } else {  // ss_x == 1
+    // Otherwise, we only need to load 4 entries.
+    __m128i u_reg, v_reg;
+
+    highbd_read_dist_4(u_dist, &u_reg);
+
+    *u_first = _mm_unpacklo_epi32(u_reg, u_reg);
+    *u_second = _mm_unpackhi_epi32(u_reg, u_reg);
+
+    highbd_read_dist_4(v_dist, &v_reg);
+
+    *v_first = _mm_unpacklo_epi32(v_reg, v_reg);
+    *v_second = _mm_unpackhi_epi32(v_reg, v_reg);
+  }
+}
+
+static void highbd_apply_temporal_filter_luma_8(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
+    uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist,
+    const uint32_t *v_dist, const uint32_t *const *neighbors_first,
+    const uint32_t *const *neighbors_second, int top_weight,
+    int bottom_weight) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+
+  __m128i mul_first, mul_second;
+
+  __m128i sum_row_1_first, sum_row_1_second;
+  __m128i sum_row_2_first, sum_row_2_second;
+  __m128i sum_row_3_first, sum_row_3_second;
+
+  __m128i u_first, u_second;
+  __m128i v_first, v_second;
+
+  __m128i sum_row_first;
+  __m128i sum_row_second;
+
+  // Loop variables
+  unsigned int h;
+
+  assert(strength >= 0 && strength <= 14 &&
+         "invalid adjusted temporal filter strength");
+  assert(block_width == 8);
+
+  (void)block_width;
+
+  // First row
+  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
+  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
+
+  // Add luma values
+  highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
+  highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+  // We don't need to saturate here because the maximum value is
+  // UINT12_MAX ** 2 * 9 ~= 2**24 * 9 < 2**28 < INT32_MAX.
+  sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first);
+  sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second);
+
+  // Add chroma values
+  highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                &v_first, &v_second);
+
+  // Max value here is 2**24 * (9 + 2), so no saturation is needed.
+  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+
+  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+  // Get modifier and store result
+  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
+                   weight);
+
+  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                y_accum);
+
+  y_src += y_src_stride;
+  y_pre += y_pre_stride;
+  y_count += y_pre_stride;
+  y_accum += y_pre_stride;
+  y_dist += DIST_STRIDE;
+
+  u_src += uv_src_stride;
+  u_pre += uv_pre_stride;
+  u_dist += DIST_STRIDE;
+  v_src += uv_src_stride;
+  v_pre += uv_pre_stride;
+  v_dist += DIST_STRIDE;
+
+  // Then all the rows except the last one
+  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
+  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
+
+  for (h = 1; h < block_height - 1; ++h) {
+    // Move the weight to the bottom half
+    if (!use_whole_blk && h == block_height / 2) {
+      weight = bottom_weight;
+    }
+    
// Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second); + + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first); + sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + } + + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, + rounding, weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + } + + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. 
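+//
+// Note: unlike the low bit-depth path, which processes 16 luma pixels per
+// call, the high bit-depth path works 8 at a time: the distortions here are
+// 32-bit (squares of up to 12-bit samples), so a pair of __m128i registers
+// only covers 8 lanes.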
+static void highbd_apply_temporal_filter_luma( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_first; + const uint32_t *const *neighbors_second; + + // Left + neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight); + } + + // Right + neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; + highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); +} + +// Add a row of luma distortion that corresponds to 8 chroma mods. If we are +// subsampling in x direction, then we have 16 lumas, else we have 8. 
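+// With ss_x == 1 there are 16 luma sums; they are folded pairwise with
+// _mm_hadd_epi32 below so that each of the 8 chroma lanes receives the two
+// luma values it covers. With ss_y == 1 the row below is accumulated first.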
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod( + const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst, + __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) { + __m128i y_reg_fst, y_reg_snd; + if (!ss_x) { + highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst); + y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd); + } + } else { + // Temporary + __m128i y_fst, y_snd; + + // First 8 + highbd_read_dist_8(y_dist, &y_fst, &y_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = _mm_add_epi32(y_fst, y_tmp_fst); + y_snd = _mm_add_epi32(y_snd, y_tmp_snd); + } + + y_reg_fst = _mm_hadd_epi32(y_fst, y_snd); + + // Second 8 + highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = _mm_add_epi32(y_fst, y_tmp_fst); + y_snd = _mm_add_epi32(y_snd, y_tmp_snd); + } + + y_reg_snd = _mm_hadd_epi32(y_fst, y_snd); + } + + *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst); + *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd); + *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst); + *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void highbd_apply_temporal_filter_chroma_8( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int uv_block_width, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, + int top_weight, int bottom_weight, const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_fst, mul_snd; + + __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst; + __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst; + __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd; + __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd; + + __m128i u_sum_row_fst, v_sum_row_fst; + __m128i u_sum_row_snd, v_sum_row_snd; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]); + mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]); + + // Add chroma values + highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + + u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd); + + highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + + v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst); + 
v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[1]); + mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd); + + v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + 
highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]); + mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]); + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst); + v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd); + v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); +} + +// Perform temporal filter for the chroma components. 
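+//
+// This mirrors the low bit-depth chroma dispatcher above, except that each
+// 8-wide chroma row needs a pair of neighbor tables (first and second four
+// lanes) because the 32-bit sums span two registers.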
+static void highbd_apply_temporal_filter_chroma( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_fst; + const uint32_t *const *neighbors_snd; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } else { + highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd, + top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // 
Middle First + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd, + top_weight, bottom_weight, NULL); +} + +static void highbd_apply_temporal_filter_yuv( + const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int strength, const int use_subblock, + const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int use_whole_blk = !use_subblock; + const int *blk_fw = subblock_filter_weights; + + // Block information (Y-plane). + const unsigned int block_height = block_size_high[block_size]; + const unsigned int block_width = block_size_wide[block_size]; + const int mb_pels = block_height * block_width; + const int y_src_stride = ref_frame->y_stride; + const int y_pre_stride = block_width; + const int mb_y_src_offset = + mb_row * block_height * ref_frame->y_stride + mb_col * block_width; + + // Block information (UV-plane). 
+ const int ss_y = mbd->plane[1].subsampling_y; + const int ss_x = mbd->plane[1].subsampling_x; + const unsigned int uv_height = block_height >> ss_y; + const unsigned int uv_width = block_width >> ss_x; + const int uv_src_stride = ref_frame->uv_stride; + const int uv_pre_stride = block_width >> ss_x; + const int mb_uv_src_offset = + mb_row * uv_height * ref_frame->uv_stride + mb_col * uv_width; + + const uint8_t *y_src = ref_frame->y_buffer + mb_y_src_offset; + const uint8_t *u_src = ref_frame->u_buffer + mb_uv_src_offset; + const uint8_t *v_src = ref_frame->v_buffer + mb_uv_src_offset; + const uint8_t *y_pre = pred; + const uint8_t *u_pre = pred + mb_pels; + const uint8_t *v_pre = pred + mb_pels * 2; + uint32_t *y_accum = accum; + uint32_t *u_accum = accum + mb_pels; + uint32_t *v_accum = accum + mb_pels * 2; + uint16_t *y_count = count; + uint16_t *u_count = count + mb_pels; + uint16_t *v_count = count + mb_pels * 2; + + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + + uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint16_t *y_src_ptr = CONVERT_TO_SHORTPTR(y_src), + *u_src_ptr = CONVERT_TO_SHORTPTR(u_src), + *v_src_ptr = CONVERT_TO_SHORTPTR(v_src); + const uint16_t *y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre), + *u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre), + *v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre); + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 0 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 8) { + highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_src_ptr = CONVERT_TO_SHORTPTR(y_src), + u_src_ptr = CONVERT_TO_SHORTPTR(u_src), + v_src_ptr = CONVERT_TO_SHORTPTR(v_src); + y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre), + u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre), + v_pre_ptr =
CONVERT_TO_SHORTPTR(v_pre); + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + highbd_apply_temporal_filter_luma( + y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr, + uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, y_accum, + y_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); + + highbd_apply_temporal_filter_chroma( + y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr, + uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, u_accum, + u_count, v_accum, v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); +} + +///////////////////////// +// High bit-depth Ends // +///////////////////////// + +void av1_apply_temporal_filter_yuv_sse4_1( + const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const int strength, const int use_subblock, + const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH; + // TODO(any): Need to support when `num_planes != 3`, like C implementation. + assert(num_planes == 3); + (void)num_planes; + if (is_high_bitdepth) { + highbd_apply_temporal_filter_yuv( + ref_frame, mbd, block_size, mb_row, mb_col, strength, use_subblock, + subblock_filter_weights, pred, accum, count); + } else { + apply_temporal_filter_yuv(ref_frame, mbd, block_size, mb_row, mb_col, + strength, use_subblock, subblock_filter_weights, + pred, accum, count); + } +} diff --git a/libs/libaom/src/av1/encoder/x86/wedge_utils_avx2.c b/libs/libaom/src/av1/encoder/x86/wedge_utils_avx2.c new file mode 100644 index 000000000..c06bad8f7 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/wedge_utils_avx2.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <immintrin.h> +#include <stdint.h> + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom/aom_integer.h" + +#include "av1/common/reconinter.h" + +#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS) + +/** + * See av1_wedge_sse_from_residuals_c + */ +uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d, + const uint8_t *m, int N) { + int n = -N; + + uint64_t csse; + + const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE); + const __m256i v_zext_q = yy_set1_64_from_32i(0xffffffff); + + __m256i v_acc0_q = _mm256_setzero_si256(); + + assert(N % 64 == 0); + + r1 += N; + d += N; + m += N; + + do { + const __m256i v_r0_w = _mm256_lddqu_si256((__m256i *)(r1 + n)); + const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(d + n)); + const __m128i v_m01_b = _mm_lddqu_si128((__m128i *)(m + n)); + + const __m256i v_rd0l_w = _mm256_unpacklo_epi16(v_d0_w, v_r0_w); + const __m256i v_rd0h_w = _mm256_unpackhi_epi16(v_d0_w, v_r0_w); + const __m256i v_m0_w = _mm256_cvtepu8_epi16(v_m01_b); + + const __m256i v_m0l_w = _mm256_unpacklo_epi16(v_m0_w, v_mask_max_w); + const __m256i v_m0h_w = _mm256_unpackhi_epi16(v_m0_w, v_mask_max_w); + + const __m256i v_t0l_d = _mm256_madd_epi16(v_rd0l_w, v_m0l_w); + const __m256i v_t0h_d = _mm256_madd_epi16(v_rd0h_w, v_m0h_w); + + const __m256i v_t0_w = _mm256_packs_epi32(v_t0l_d, v_t0h_d); + + const __m256i v_sq0_d = _mm256_madd_epi16(v_t0_w, v_t0_w); + + const __m256i v_sum0_q = _mm256_add_epi64( + _mm256_and_si256(v_sq0_d, v_zext_q), _mm256_srli_epi64(v_sq0_d, 32)); + + v_acc0_q = _mm256_add_epi64(v_acc0_q, v_sum0_q); + + n += 16; + } while (n); + + v_acc0_q = _mm256_add_epi64(v_acc0_q, _mm256_srli_si256(v_acc0_q, 8)); + __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc0_q); + __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc0_q, 1); + v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1); +#if ARCH_X86_64 + csse = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0); +#else + xx_storel_64(&csse, v_acc_q_0); +#endif + + return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); +} + +/** + * See av1_wedge_sign_from_residuals_c + */ +int8_t av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m, + int N, int64_t limit) { + int64_t acc; + __m256i v_acc0_d = _mm256_setzero_si256(); + + // Input size limited to 8192 by the use of 32 bit accumulators and m + // being between [0, 64]. Overflow might happen at larger sizes, + // though it is practically impossible on real video input.
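+ // Concretely: |ds[i]| <= 2^15 and m[i] <= 64 = 2^6, so each product has + // magnitude at most 2^21. A madd lane sums two products and v_p0123_d + // folds four madd results, i.e. at most 8 * 2^21 = 2^24 per 32-bit lane + // per iteration; with N < 8192 the loop below runs at most 127 times, + // keeping every lane under 127 * 2^24 < 2^31.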
+ assert(N < 8192); + assert(N % 64 == 0); + + do { + const __m256i v_m01_b = _mm256_lddqu_si256((__m256i *)(m)); + const __m256i v_m23_b = _mm256_lddqu_si256((__m256i *)(m + 32)); + + const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(ds)); + const __m256i v_d1_w = _mm256_lddqu_si256((__m256i *)(ds + 16)); + const __m256i v_d2_w = _mm256_lddqu_si256((__m256i *)(ds + 32)); + const __m256i v_d3_w = _mm256_lddqu_si256((__m256i *)(ds + 48)); + + const __m256i v_m0_w = + _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m01_b)); + const __m256i v_m1_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m01_b, 1)); + const __m256i v_m2_w = + _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m23_b)); + const __m256i v_m3_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m23_b, 1)); + + const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w); + const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w); + const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w); + const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w); + + const __m256i v_p01_d = _mm256_add_epi32(v_p0_d, v_p1_d); + const __m256i v_p23_d = _mm256_add_epi32(v_p2_d, v_p3_d); + + const __m256i v_p0123_d = _mm256_add_epi32(v_p01_d, v_p23_d); + + v_acc0_d = _mm256_add_epi32(v_acc0_d, v_p0123_d); + + ds += 64; + m += 64; + + N -= 64; + } while (N); + + __m256i v_sign_d = _mm256_srai_epi32(v_acc0_d, 31); + v_acc0_d = _mm256_add_epi64(_mm256_unpacklo_epi32(v_acc0_d, v_sign_d), + _mm256_unpackhi_epi32(v_acc0_d, v_sign_d)); + + __m256i v_acc_q = _mm256_add_epi64(v_acc0_d, _mm256_srli_si256(v_acc0_d, 8)); + + __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc_q); + __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc_q, 1); + v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1); + +#if ARCH_X86_64 + acc = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0); +#else + xx_storel_64(&acc, v_acc_q_0); +#endif + + return acc > limit; +} + +/** + * av1_wedge_compute_delta_squares_c + */ +void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a, + const int16_t *b, int N) { + const __m256i v_neg_w = _mm256_set1_epi32(0xffff0001); + + assert(N % 64 == 0); + + do { + const __m256i v_a0_w = _mm256_lddqu_si256((__m256i *)(a)); + const __m256i v_b0_w = _mm256_lddqu_si256((__m256i *)(b)); + const __m256i v_a1_w = _mm256_lddqu_si256((__m256i *)(a + 16)); + const __m256i v_b1_w = _mm256_lddqu_si256((__m256i *)(b + 16)); + const __m256i v_a2_w = _mm256_lddqu_si256((__m256i *)(a + 32)); + const __m256i v_b2_w = _mm256_lddqu_si256((__m256i *)(b + 32)); + const __m256i v_a3_w = _mm256_lddqu_si256((__m256i *)(a + 48)); + const __m256i v_b3_w = _mm256_lddqu_si256((__m256i *)(b + 48)); + + const __m256i v_ab0l_w = _mm256_unpacklo_epi16(v_a0_w, v_b0_w); + const __m256i v_ab0h_w = _mm256_unpackhi_epi16(v_a0_w, v_b0_w); + const __m256i v_ab1l_w = _mm256_unpacklo_epi16(v_a1_w, v_b1_w); + const __m256i v_ab1h_w = _mm256_unpackhi_epi16(v_a1_w, v_b1_w); + const __m256i v_ab2l_w = _mm256_unpacklo_epi16(v_a2_w, v_b2_w); + const __m256i v_ab2h_w = _mm256_unpackhi_epi16(v_a2_w, v_b2_w); + const __m256i v_ab3l_w = _mm256_unpacklo_epi16(v_a3_w, v_b3_w); + const __m256i v_ab3h_w = _mm256_unpackhi_epi16(v_a3_w, v_b3_w); + + // Negate top word of pairs + const __m256i v_abl0n_w = _mm256_sign_epi16(v_ab0l_w, v_neg_w); + const __m256i v_abh0n_w = _mm256_sign_epi16(v_ab0h_w, v_neg_w); + const __m256i v_abl1n_w = _mm256_sign_epi16(v_ab1l_w, v_neg_w); + const __m256i v_abh1n_w = _mm256_sign_epi16(v_ab1h_w, v_neg_w); + const __m256i v_abl2n_w = _mm256_sign_epi16(v_ab2l_w, v_neg_w); + const 
__m256i v_abh2n_w = _mm256_sign_epi16(v_ab2h_w, v_neg_w); + const __m256i v_abl3n_w = _mm256_sign_epi16(v_ab3l_w, v_neg_w); + const __m256i v_abh3n_w = _mm256_sign_epi16(v_ab3h_w, v_neg_w); + + const __m256i v_r0l_w = _mm256_madd_epi16(v_ab0l_w, v_abl0n_w); + const __m256i v_r0h_w = _mm256_madd_epi16(v_ab0h_w, v_abh0n_w); + const __m256i v_r1l_w = _mm256_madd_epi16(v_ab1l_w, v_abl1n_w); + const __m256i v_r1h_w = _mm256_madd_epi16(v_ab1h_w, v_abh1n_w); + const __m256i v_r2l_w = _mm256_madd_epi16(v_ab2l_w, v_abl2n_w); + const __m256i v_r2h_w = _mm256_madd_epi16(v_ab2h_w, v_abh2n_w); + const __m256i v_r3l_w = _mm256_madd_epi16(v_ab3l_w, v_abl3n_w); + const __m256i v_r3h_w = _mm256_madd_epi16(v_ab3h_w, v_abh3n_w); + + const __m256i v_r0_w = _mm256_packs_epi32(v_r0l_w, v_r0h_w); + const __m256i v_r1_w = _mm256_packs_epi32(v_r1l_w, v_r1h_w); + const __m256i v_r2_w = _mm256_packs_epi32(v_r2l_w, v_r2h_w); + const __m256i v_r3_w = _mm256_packs_epi32(v_r3l_w, v_r3h_w); + + _mm256_store_si256((__m256i *)(d), v_r0_w); + _mm256_store_si256((__m256i *)(d + 16), v_r1_w); + _mm256_store_si256((__m256i *)(d + 32), v_r2_w); + _mm256_store_si256((__m256i *)(d + 48), v_r3_w); + + a += 64; + b += 64; + d += 64; + N -= 64; + } while (N); +} diff --git a/libs/libaom/src/av1/encoder/x86/wedge_utils_sse2.c b/libs/libaom/src/av1/encoder/x86/wedge_utils_sse2.c new file mode 100644 index 000000000..f3f4b8a75 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/wedge_utils_sse2.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <emmintrin.h> + +#include "aom_dsp/x86/synonyms.h" + +#include "aom/aom_integer.h" + +#include "av1/common/reconinter.h" + +#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS) + +/** + * See av1_wedge_sse_from_residuals_c + */ +uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, + const uint8_t *m, int N) { + int n = -N; + int n8 = n + 8; + + uint64_t csse; + + const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE); + const __m128i v_zext_q = xx_set1_64_from_32i(0xffffffff); + + __m128i v_acc0_q = _mm_setzero_si128(); + + assert(N % 64 == 0); + + r1 += N; + d += N; + m += N; + + do { + const __m128i v_r0_w = xx_load_128(r1 + n); + const __m128i v_r1_w = xx_load_128(r1 + n8); + const __m128i v_d0_w = xx_load_128(d + n); + const __m128i v_d1_w = xx_load_128(d + n8); + const __m128i v_m01_b = xx_load_128(m + n); + + const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w); + const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w); + const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w); + const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w); + const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128()); + const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128()); + + const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w); + const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w); + const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w); + const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w); + + const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w); + const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w); + const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w); + const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w); + + const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d); + const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d); + + const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w); + const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w); + + const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q), + _mm_srli_epi64(v_sq0_d, 32)); + const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q), + _mm_srli_epi64(v_sq1_d, 32)); + + v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q); + v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q); + + n8 += 16; + n += 16; + } while (n); + + v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); + +#if ARCH_X86_64 + csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q); +#else + xx_storel_64(&csse, v_acc0_q); +#endif + + return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); +} + +/** + * See av1_wedge_sign_from_residuals_c + */ +int8_t av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m, + int N, int64_t limit) { + int64_t acc; + + __m128i v_sign_d; + __m128i v_acc0_d = _mm_setzero_si128(); + __m128i v_acc1_d = _mm_setzero_si128(); + __m128i v_acc_q; + + // Input size limited to 8192 by the use of 32 bit accumulators and m + // being between [0, 64]. Overflow might happen at larger sizes, + // though it is practically impossible on real video input.
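+ // Scalar sketch of the loop below (reference only; same mask-weighted + // sum as the C version): + //   int64_t acc = 0; + //   for (int i = 0; i < N; ++i) acc += (int64_t)ds[i] * m[i]; + //   return acc > limit;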
+ assert(N < 8192); + assert(N % 64 == 0); + + do { + const __m128i v_m01_b = xx_load_128(m); + const __m128i v_m23_b = xx_load_128(m + 16); + const __m128i v_m45_b = xx_load_128(m + 32); + const __m128i v_m67_b = xx_load_128(m + 48); + + const __m128i v_d0_w = xx_load_128(ds); + const __m128i v_d1_w = xx_load_128(ds + 8); + const __m128i v_d2_w = xx_load_128(ds + 16); + const __m128i v_d3_w = xx_load_128(ds + 24); + const __m128i v_d4_w = xx_load_128(ds + 32); + const __m128i v_d5_w = xx_load_128(ds + 40); + const __m128i v_d6_w = xx_load_128(ds + 48); + const __m128i v_d7_w = xx_load_128(ds + 56); + + const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128()); + const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128()); + const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128()); + const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128()); + const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128()); + const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128()); + const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128()); + const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128()); + + const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w); + const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w); + const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w); + const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w); + const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w); + const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w); + const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w); + const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w); + + const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d); + const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d); + const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d); + const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d); + + const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d); + const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d); + + v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d); + v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d); + + ds += 64; + m += 64; + + N -= 64; + } while (N); + + v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128()); + v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d), + _mm_unpackhi_epi32(v_acc0_d, v_sign_d)); + + v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128()); + v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d), + _mm_unpackhi_epi32(v_acc1_d, v_sign_d)); + + v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + +#if ARCH_X86_64 + acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q); +#else + xx_storel_64(&acc, v_acc_q); +#endif + + return acc > limit; +} + +// Negate under mask +static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) { + return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w); +} + +/** + * av1_wedge_compute_delta_squares_c + */ +void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a, + const int16_t *b, int N) { + const __m128i v_neg_w = _mm_set_epi16((short)0xffff, 0, (short)0xffff, 0, + (short)0xffff, 0, (short)0xffff, 0); + + assert(N % 64 == 0); + + do { + const __m128i v_a0_w = xx_load_128(a); + const __m128i v_b0_w = xx_load_128(b); + const __m128i v_a1_w = xx_load_128(a + 8); + const __m128i v_b1_w = xx_load_128(b + 8); + const __m128i v_a2_w = xx_load_128(a + 16); + const __m128i v_b2_w = xx_load_128(b + 16); + const __m128i v_a3_w = xx_load_128(a + 24); + const __m128i 
v_b3_w = xx_load_128(b + 24); + + const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w); + const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w); + const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w); + const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w); + const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w); + const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w); + const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w); + const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w); + + // Negate top word of pairs + const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w); + const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w); + const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w); + const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w); + const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w); + const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w); + const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w); + const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w); + + const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w); + const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w); + const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w); + const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w); + const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w); + const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w); + const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w); + const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w); + + const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w); + const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w); + const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w); + const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w); + + xx_store_128(d, v_r0_w); + xx_store_128(d + 8, v_r1_w); + xx_store_128(d + 16, v_r2_w); + xx_store_128(d + 24, v_r3_w); + + a += 32; + b += 32; + d += 32; + N -= 32; + } while (N); +} diff --git a/libs/libaom/src/av1/exports_com b/libs/libaom/src/av1/exports_com new file mode 100644 index 000000000..5c8e0e09d --- /dev/null +++ b/libs/libaom/src/av1/exports_com @@ -0,0 +1,2 @@ +text aom_read_obu_header_and_size +text av1_resize_frame420 diff --git a/libs/libaom/src/av1/exports_dec b/libs/libaom/src/av1/exports_dec new file mode 100644 index 000000000..daabf6766 --- /dev/null +++ b/libs/libaom/src/av1/exports_dec @@ -0,0 +1,3 @@ +data aom_codec_av1_dx_algo +text aom_codec_av1_dx +text av1_add_film_grain diff --git a/libs/libaom/src/av1/exports_enc b/libs/libaom/src/av1/exports_enc new file mode 100644 index 000000000..dc4a9eae7 --- /dev/null +++ b/libs/libaom/src/av1/exports_enc @@ -0,0 +1,2 @@ +data aom_codec_av1_cx_algo +text aom_codec_av1_cx diff --git a/libs/libaom/src/av1/exports_ident b/libs/libaom/src/av1/exports_ident new file mode 100644 index 000000000..b523a679d --- /dev/null +++ b/libs/libaom/src/av1/exports_ident @@ -0,0 +1,2 @@ +text ifd_init +text ifd_inspect diff --git a/libs/libaom/src/av1/exports_test b/libs/libaom/src/av1/exports_test new file mode 100644 index 000000000..dab377575 --- /dev/null +++ b/libs/libaom/src/av1/exports_test @@ -0,0 +1,2 @@ +text av1_get_fwd_txfm_cfg +text av1_rtcd diff --git a/libs/libaom/src/build/.gitattributes b/libs/libaom/src/build/.gitattributes new file mode 100644 index 000000000..03db79bc0 --- /dev/null +++ b/libs/libaom/src/build/.gitattributes @@ -0,0 +1,2 @@ +*-vs8/*.rules -crlf +*-msvs/*.rules -crlf diff --git a/libs/libaom/src/build/cmake/aom_config.c.template 
b/libs/libaom/src/build/cmake/aom_config.c.template new file mode 100644 index 000000000..62f0a10ab --- /dev/null +++ b/libs/libaom/src/build/cmake/aom_config.c.template @@ -0,0 +1,13 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "aom/aom_codec.h" +static const char* const cfg = "${AOM_CMAKE_CONFIG}"; +const char *aom_codec_build_config(void) {return cfg;} diff --git a/libs/libaom/src/build/cmake/aom_config_defaults.cmake b/libs/libaom/src/build/cmake/aom_config_defaults.cmake new file mode 100644 index 000000000..f9e70eb24 --- /dev/null +++ b/libs/libaom/src/build/cmake/aom_config_defaults.cmake @@ -0,0 +1,193 @@ +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. + +include("${AOM_ROOT}/build/cmake/util.cmake") + +# This file sets default values for libaom configuration variables. All libaom +# config variables are added to the CMake variable cache via the macros provided +# in util.cmake. + +# +# The variables in this section of the file are detected at configuration time, +# but can be overridden via the use of CONFIG_* and ENABLE_* values also defined +# in this file. +# + +set_aom_detect_var(INLINE "" "Sets INLINE value for current target.") + +# CPUs. +set_aom_detect_var(ARCH_ARM 0 "Enables ARM architecture.") +set_aom_detect_var(ARCH_MIPS 0 "Enables MIPS architecture.") +set_aom_detect_var(ARCH_PPC 0 "Enables PPC architecture.") +set_aom_detect_var(ARCH_X86 0 "Enables X86 architecture.") +set_aom_detect_var(ARCH_X86_64 0 "Enables X86_64 architecture.") + +# ARM feature flags. +set_aom_detect_var(HAVE_NEON 0 "Enables NEON intrinsics optimizations.") + +# MIPS feature flags. +set_aom_detect_var(HAVE_DSPR2 0 "Enables DSPR2 optimizations.") +set_aom_detect_var(HAVE_MIPS32 0 "Enables MIPS32 optimizations.") +set_aom_detect_var(HAVE_MIPS64 0 "Enables MIPS64 optimizations. ") +set_aom_detect_var(HAVE_MSA 0 "Enables MSA optimizations.") + +# PPC feature flags. +set_aom_detect_var(HAVE_VSX 0 "Enables VSX optimizations.") + +# x86/x86_64 feature flags. +set_aom_detect_var(HAVE_AVX 0 "Enables AVX optimizations.") +set_aom_detect_var(HAVE_AVX2 0 "Enables AVX2 optimizations.") +set_aom_detect_var(HAVE_MMX 0 "Enables MMX optimizations. 
") +set_aom_detect_var(HAVE_SSE 0 "Enables SSE optimizations.") +set_aom_detect_var(HAVE_SSE2 0 "Enables SSE2 optimizations.") +set_aom_detect_var(HAVE_SSE3 0 "Enables SSE3 optimizations.") +set_aom_detect_var(HAVE_SSE4_1 0 "Enables SSE 4.1 optimizations.") +set_aom_detect_var(HAVE_SSE4_2 0 "Enables SSE 4.2 optimizations.") +set_aom_detect_var(HAVE_SSSE3 0 "Enables SSSE3 optimizations.") + +# Flags describing the build environment. +set_aom_detect_var(HAVE_FEXCEPT 0 + "Internal flag, GNU fenv.h present for target.") +set_aom_detect_var(HAVE_PTHREAD_H 0 "Internal flag, target pthread support.") +set_aom_detect_var(HAVE_UNISTD_H 0 + "Internal flag, unistd.h present for target.") +set_aom_detect_var(HAVE_WXWIDGETS 0 "WxWidgets present.") + +# +# Variables in this section can be set from the CMake command line or from +# within the CMake GUI. The variables control libaom features. +# + +# Build configuration flags. +set_aom_config_var(AOM_RTCD_FLAGS "" + "Arguments to pass to rtcd.pl. Separate with ';'") +set_aom_config_var(CONFIG_AV1_DECODER 1 "Enable AV1 decoder.") +set_aom_config_var(CONFIG_AV1_ENCODER 1 "Enable AV1 encoder.") +set_aom_config_var(CONFIG_BIG_ENDIAN 0 "Internal flag.") +set_aom_config_var(CONFIG_GCC 0 "Building with GCC (detect).") +set_aom_config_var(CONFIG_GCOV 0 "Enable gcov support.") +set_aom_config_var(CONFIG_GPROF 0 "Enable gprof support.") +set_aom_config_var(CONFIG_LIBYUV 1 "Enables libyuv scaling/conversion support.") + +set_aom_config_var(CONFIG_MULTITHREAD 1 "Multithread support.") +set_aom_config_var(CONFIG_OS_SUPPORT 0 "Internal flag.") +set_aom_config_var(CONFIG_PIC 0 "Build with PIC enabled.") +set_aom_config_var(CONFIG_RUNTIME_CPU_DETECT 1 "Runtime CPU detection support.") +set_aom_config_var(CONFIG_SHARED 0 "Build shared libs.") +set_aom_config_var(CONFIG_WEBM_IO 1 "Enables WebM support.") + +# Debugging flags. +set_aom_config_var(CONFIG_BITSTREAM_DEBUG 0 "Bitstream debugging flag.") +set_aom_config_var(CONFIG_DEBUG 0 "Debug build flag.") +set_aom_config_var(CONFIG_MISMATCH_DEBUG 0 "Mismatch debugging flag.") + +# AV1 feature flags. +set_aom_config_var(CONFIG_ACCOUNTING 0 "Enables bit accounting.") +set_aom_config_var(CONFIG_ANALYZER 0 "Enables bit stream analyzer.") +set_aom_config_var(CONFIG_COEFFICIENT_RANGE_CHECKING 0 + "Coefficient range check.") +set_aom_config_var(CONFIG_DENOISE 1 + "Denoise/noise modeling support in encoder.") +set_aom_config_var(CONFIG_INSPECTION 0 "Enables bitstream inspection.") +set_aom_config_var(CONFIG_INTERNAL_STATS 0 "Enables internal encoder stats.") +set_aom_config_var(FORCE_HIGHBITDEPTH_DECODING 0 + "Force high bitdepth decoding pipeline on 8-bit input.") +mark_as_advanced(FORCE_HIGHBITDEPTH_DECODING) +set_aom_config_var(CONFIG_MAX_DECODE_PROFILE 2 + "Max profile to support decoding.") +set_aom_config_var(CONFIG_NORMAL_TILE_MODE 0 "Only enables normal tile mode.") +set_aom_config_var(CONFIG_SIZE_LIMIT 0 "Limit max decode width/height.") +set_aom_config_var(CONFIG_SPATIAL_RESAMPLING 1 "Spatial resampling.") +set_aom_config_var(DECODE_HEIGHT_LIMIT 0 "Set limit for decode height.") +set_aom_config_var(DECODE_WIDTH_LIMIT 0 "Set limit for decode width.") +set_aom_config_var(CONFIG_TUNE_VMAF 0 "Enable encoding tuning for VMAF.") + +# AV1 experiment flags. 
+set_aom_config_var(CONFIG_SPEED_STATS 0 "AV1 experiment flag.") +set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 "AV1 experiment flag.") +set_aom_config_var(CONFIG_DIST_8X8 0 "AV1 experiment flag.") +set_aom_config_var(CONFIG_ENTROPY_STATS 0 "AV1 experiment flag.") +set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 "AV1 experiment flag.") +set_aom_config_var(CONFIG_RD_DEBUG 0 "AV1 experiment flag.") +set_aom_config_var(CONFIG_SHARP_SETTINGS 0 "AV1 experiment flag.") +set_aom_config_var(CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1 + "Disable full_pixel_motion_search_based_split on BLOCK_8X8.") +set_aom_config_var(CONFIG_COLLECT_PARTITION_STATS 0 + "Collect stats on partition decisions.") +set_aom_config_var(CONFIG_COLLECT_COMPONENT_TIMING 0 + "Collect encoding component timing information.") +set_aom_config_var(CONFIG_LPF_MASK 0 + "Enable the use of loop filter bitmasks for optimizations.") +set_aom_config_var(CONFIG_HTB_TRELLIS 0 + "Enable the use of hash table for trellis optimizations.") +set_aom_config_var(CONFIG_REALTIME_ONLY 0 + "Build for RTC-only to reduce binary size.") +set_aom_config_var(CONFIG_AV1_HIGHBITDEPTH 1 + "Build with high bitdepth support.") +set_aom_config_var(CONFIG_NN_V2 0 "Fully-connected neural nets ver.2.") +set_aom_config_var(CONFIG_SUPERRES_IN_RECODE 1 + "Enable encoding both full-res and superres in recode loop " + "when SUPERRES_AUTO mode is used.") +# +# Variables in this section control optional features of the build system. +# +set_aom_option_var(ENABLE_CCACHE "Enable ccache support." OFF) +set_aom_option_var(ENABLE_DECODE_PERF_TESTS "Enables decoder performance tests" + OFF) +set_aom_option_var(ENABLE_DISTCC "Enable distcc support." OFF) +set_aom_option_var(ENABLE_DOCS + "Enable documentation generation (doxygen required)." ON) +set_aom_option_var(ENABLE_ENCODE_PERF_TESTS "Enables encoder performance tests" + OFF) +set_aom_option_var(ENABLE_EXAMPLES "Enables build of example code." ON) +set_aom_option_var(ENABLE_GOMA "Enable goma support." OFF) +set_aom_option_var( + ENABLE_IDE_TEST_HOSTING + "Enables running tests within IDEs like Visual Studio and Xcode." OFF) +set_aom_option_var(ENABLE_NASM "Use nasm instead of yasm for x86 assembly." OFF) +set_aom_option_var(ENABLE_TESTDATA "Enables unit test data download targets." + ON) +set_aom_option_var(ENABLE_TESTS "Enables unit tests." ON) +set_aom_option_var(ENABLE_TOOLS "Enable applications in tools sub directory." + ON) +set_aom_option_var(ENABLE_WERROR "Converts warnings to errors at compile time." + OFF) + +# ARM assembly/intrinsics flags. +set_aom_option_var(ENABLE_NEON "Enables NEON optimizations on ARM targets." ON) + +# MIPS assembly/intrinsics flags. +set_aom_option_var(ENABLE_DSPR2 "Enables DSPR2 optimizations on MIPS targets." + OFF) +set_aom_option_var(ENABLE_MSA "Enables MSA optimizations on MIPS targets." OFF) + +# VSX intrinsics flags. +set_aom_option_var(ENABLE_VSX "Enables VSX optimizations on PowerPC targets." + ON) + +# x86/x86_64 assembly/intrinsics flags. +set_aom_option_var(ENABLE_MMX "Enables MMX optimizations on x86/x86_64 targets." + ON) +set_aom_option_var(ENABLE_SSE "Enables SSE optimizations on x86/x86_64 targets." + ON) +set_aom_option_var(ENABLE_SSE2 + "Enables SSE2 optimizations on x86/x86_64 targets." ON) +set_aom_option_var(ENABLE_SSE3 + "Enables SSE3 optimizations on x86/x86_64 targets." ON) +set_aom_option_var(ENABLE_SSSE3 + "Enables SSSE3 optimizations on x86/x86_64 targets." ON) +set_aom_option_var(ENABLE_SSE4_1 + "Enables SSE4_1 optimizations on x86/x86_64 targets."
ON) +set_aom_option_var(ENABLE_SSE4_2 + "Enables SSE4_2 optimizations on x86/x86_64 targets." ON) +set_aom_option_var(ENABLE_AVX "Enables AVX optimizations on x86/x86_64 targets." + ON) +set_aom_option_var(ENABLE_AVX2 + "Enables AVX2 optimizations on x86/x86_64 targets." ON) diff --git a/libs/libaom/src/build/cmake/aom_configure.cmake b/libs/libaom/src/build/cmake/aom_configure.cmake new file mode 100644 index 000000000..224a46587 --- /dev/null +++ b/libs/libaom/src/build/cmake/aom_configure.cmake @@ -0,0 +1,399 @@ +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_ +set(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_ 1) + +include(FindGit) +include(FindPerl) +include(FindThreads) + +include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake") +include("${AOM_ROOT}/build/cmake/aom_experiment_deps.cmake") +include("${AOM_ROOT}/build/cmake/aom_optimization.cmake") +include("${AOM_ROOT}/build/cmake/compiler_flags.cmake") +include("${AOM_ROOT}/build/cmake/compiler_tests.cmake") +include("${AOM_ROOT}/build/cmake/util.cmake") + +if(DEFINED CONFIG_LOWBITDEPTH) + message(WARNING "CONFIG_LOWBITDEPTH has been removed. \ + Use -DFORCE_HIGHBITDEPTH_DECODING=1 instead of -DCONFIG_LOWBITDEPTH=0 \ + and -DFORCE_HIGHBITDEPTH_DECODING=0 instead of -DCONFIG_LOWBITDEPTH=1.") + if(NOT CONFIG_LOWBITDEPTH) + set(FORCE_HIGHBITDEPTH_DECODING + 1 + CACHE STRING "${cmake_cmdline_helpstring}" FORCE) + endif() +endif() + +if(FORCE_HIGHBITDEPTH_DECODING AND NOT CONFIG_AV1_HIGHBITDEPTH) + change_config_and_warn(CONFIG_AV1_HIGHBITDEPTH 1 + "FORCE_HIGHBITDEPTH_DECODING") +endif() + +# Generate the user config settings. +list(APPEND aom_build_vars ${AOM_CONFIG_VARS} ${AOM_OPTION_VARS}) +foreach(cache_var ${aom_build_vars}) + get_property(cache_var_helpstring CACHE ${cache_var} PROPERTY HELPSTRING) + if("${cache_var_helpstring}" STREQUAL "${cmake_cmdline_helpstring}") + set(AOM_CMAKE_CONFIG "${AOM_CMAKE_CONFIG} -D${cache_var}=${${cache_var}}") + endif() +endforeach() +string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG) + +# Detect target CPU. 
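+# (For example, "aarch64" maps to "arm64", while "amd64"/"x86_64" map to +# "x86" or "x86_64" depending on pointer size; unrecognized processors fall +# back to the "generic" target.)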
+if(NOT AOM_TARGET_CPU) + string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" cpu_lowercase) + if("${cpu_lowercase}" STREQUAL "amd64" + OR "${cpu_lowercase}" STREQUAL "x86_64") + if(${CMAKE_SIZEOF_VOID_P} EQUAL 4) + set(AOM_TARGET_CPU "x86") + elseif(${CMAKE_SIZEOF_VOID_P} EQUAL 8) + set(AOM_TARGET_CPU "x86_64") + else() + message( + FATAL_ERROR "--- Unexpected pointer size (${CMAKE_SIZEOF_VOID_P}) for\n" + " CMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}\n" + " CMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}\n" + " CMAKE_GENERATOR=${CMAKE_GENERATOR}\n") + endif() + elseif("${cpu_lowercase}" STREQUAL "i386" + OR "${cpu_lowercase}" STREQUAL "x86") + set(AOM_TARGET_CPU "x86") + elseif("${cpu_lowercase}" MATCHES "^arm" + OR "${cpu_lowercase}" MATCHES "^mips") + set(AOM_TARGET_CPU "${cpu_lowercase}") + elseif("${cpu_lowercase}" MATCHES "aarch64") + set(AOM_TARGET_CPU "arm64") + elseif("${cpu_lowercase}" MATCHES "^ppc") + set(AOM_TARGET_CPU "ppc") + else() + message(WARNING "The architecture ${CMAKE_SYSTEM_PROCESSOR} is not " + "supported, falling back to the generic target") + set(AOM_TARGET_CPU "generic") + endif() +endif() + +if(CMAKE_TOOLCHAIN_FILE) # Add toolchain file to config string. + if(IS_ABSOLUTE "${CMAKE_TOOLCHAIN_FILE}") + file(RELATIVE_PATH toolchain_path "${AOM_CONFIG_DIR}" + "${CMAKE_TOOLCHAIN_FILE}") + else() + set(toolchain_path "${CMAKE_TOOLCHAIN_FILE}") + endif() + set(toolchain_string "-DCMAKE_TOOLCHAIN_FILE=\\\"${toolchain_path}\\\"") + set(AOM_CMAKE_CONFIG "${toolchain_string} ${AOM_CMAKE_CONFIG}") +else() + + # Add detected CPU to the config string. + set(AOM_CMAKE_CONFIG "-DAOM_TARGET_CPU=${AOM_TARGET_CPU} ${AOM_CMAKE_CONFIG}") +endif() +set(AOM_CMAKE_CONFIG "-G \\\"${CMAKE_GENERATOR}\\\" ${AOM_CMAKE_CONFIG}") +file(RELATIVE_PATH source_path "${AOM_CONFIG_DIR}" "${AOM_ROOT}") +set(AOM_CMAKE_CONFIG "cmake ${source_path} ${AOM_CMAKE_CONFIG}") +string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG) + +message("--- aom_configure: Detected CPU: ${AOM_TARGET_CPU}") +set(AOM_TARGET_SYSTEM ${CMAKE_SYSTEM_NAME}) + +if("${CMAKE_BUILD_TYPE}" MATCHES "Deb") + set(CONFIG_DEBUG 1) +endif() + +if(BUILD_SHARED_LIBS) + set(CONFIG_PIC 1) + set(CONFIG_SHARED 1) +endif() + +if(NOT MSVC) + if(CONFIG_PIC) + + # TODO(tomfinegan): clang needs -pie in CMAKE_EXE_LINKER_FLAGS for this to + # work. + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + if("${AOM_TARGET_SYSTEM}" STREQUAL "Linux" + AND "${AOM_TARGET_CPU}" MATCHES "^armv[78]") + set(AOM_AS_FLAGS ${AOM_AS_FLAGS} --defsym PIC=1) + else() + set(AOM_AS_FLAGS ${AOM_AS_FLAGS} -DPIC) + endif() + endif() +endif() + +if("${AOM_TARGET_CPU}" STREQUAL "x86" OR "${AOM_TARGET_CPU}" STREQUAL "x86_64") + find_program(AS_EXECUTABLE yasm $ENV{YASM_PATH}) + if(NOT AS_EXECUTABLE OR ENABLE_NASM) + unset(AS_EXECUTABLE CACHE) + find_program(AS_EXECUTABLE nasm $ENV{NASM_PATH}) + if(AS_EXECUTABLE) + test_nasm() + endif() + endif() + + if(NOT AS_EXECUTABLE) + message( + FATAL_ERROR + "Unable to find assembler. Install 'yasm' or 'nasm.' 
" + "To build without optimizations, add -DAOM_TARGET_CPU=generic to " + "your cmake command line.") + endif() + get_asm_obj_format("objformat") + set(AOM_AS_FLAGS -f ${objformat} ${AOM_AS_FLAGS}) + string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS) +elseif("${AOM_TARGET_CPU}" MATCHES "arm") + if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + set(AS_EXECUTABLE as) + set(AOM_AS_FLAGS -arch ${AOM_TARGET_CPU} -isysroot ${CMAKE_OSX_SYSROOT}) + elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Windows") + if(NOT AS_EXECUTABLE) + set(AS_EXECUTABLE ${CMAKE_C_COMPILER} -c -mimplicit-it=always) + endif() + else() + if(NOT AS_EXECUTABLE) + set(AS_EXECUTABLE as) + endif() + endif() + find_program(as_executable_found ${AS_EXECUTABLE}) + if(NOT as_executable_found) + message( + FATAL_ERROR + "Unable to find assembler and optimizations are enabled." + "Searched for ${AS_EXECUTABLE}. Install it, add it to your path, or " + "set the assembler directly by adding -DAS_EXECUTABLE= " + "to your CMake command line." + "To build without optimizations, add -DAOM_TARGET_CPU=generic to your " + "cmake command line.") + endif() + string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS) +endif() + +if(CONFIG_ANALYZER) + include(FindwxWidgets) + find_package(wxWidgets REQUIRED adv base core) + include(${wxWidgets_USE_FILE}) +endif() + +if(NOT MSVC AND CMAKE_C_COMPILER_ID MATCHES "GNU\|Clang") + set(CONFIG_GCC 1) +endif() + +if(CONFIG_GCOV) + message("--- Testing for CONFIG_GCOV support.") + require_linker_flag("-fprofile-arcs -ftest-coverage") + require_compiler_flag("-fprofile-arcs -ftest-coverage" YES) +endif() + +if(CONFIG_GPROF) + message("--- Testing for CONFIG_GPROF support.") + require_compiler_flag("-pg" YES) +endif() + +if("${AOM_TARGET_SYSTEM}" MATCHES "Darwin\|Linux\|Windows\|Android") + set(CONFIG_OS_SUPPORT 1) +endif() + +# The default _WIN32_WINNT value in MinGW is 0x0502 (Windows XP with SP2). Set +# it to 0x0601 (Windows 7). +if("${AOM_TARGET_SYSTEM}" STREQUAL "Windows") + add_compiler_flag_if_supported("-D_WIN32_WINNT=0x0601") +endif() + +# +# Fix CONFIG_* dependencies. This must be done before including cpu.cmake to +# ensure RTCD_CONFIG_* are properly set. +fix_experiment_configs() + +# Test compiler support. +aom_get_inline("INLINE") + +# Don't just check for pthread.h, but use the result of the full pthreads +# including a linking check in FindThreads above. 
+set(HAVE_PTHREAD_H ${CMAKE_USE_PTHREADS_INIT}) +aom_check_source_compiles("unistd_check" "#include <unistd.h>" HAVE_UNISTD_H) + +if(NOT MSVC) + aom_push_var(CMAKE_REQUIRED_LIBRARIES "m") + aom_check_c_compiles("fenv_check" "#define _GNU_SOURCE + #include <fenv.h> + void unused(void) { + (void)unused; + (void)feenableexcept(FE_DIVBYZERO | FE_INVALID); + }" HAVE_FEXCEPT) + aom_pop_var(CMAKE_REQUIRED_LIBRARIES) +endif() + +include("${AOM_ROOT}/build/cmake/cpu.cmake") + +if(ENABLE_CCACHE) + set_compiler_launcher(ENABLE_CCACHE ccache) +endif() + +if(ENABLE_DISTCC) + set_compiler_launcher(ENABLE_DISTCC distcc) +endif() + +if(ENABLE_GOMA) + set_compiler_launcher(ENABLE_GOMA gomacc) +endif() + +if(NOT CONFIG_AV1_DECODER AND NOT CONFIG_AV1_ENCODER) + message(FATAL_ERROR "Decoder and encoder disabled, nothing to build.") +endif() + +if(DECODE_HEIGHT_LIMIT OR DECODE_WIDTH_LIMIT) + change_config_and_warn(CONFIG_SIZE_LIMIT 1 + "DECODE_HEIGHT_LIMIT and DECODE_WIDTH_LIMIT") +endif() + +if(CONFIG_SIZE_LIMIT) + if(NOT DECODE_HEIGHT_LIMIT OR NOT DECODE_WIDTH_LIMIT) + message(FATAL_ERROR "When setting CONFIG_SIZE_LIMIT, DECODE_HEIGHT_LIMIT " + "and DECODE_WIDTH_LIMIT must be set.") + endif() +endif() + +# Test compiler flags. +if(MSVC) + add_compiler_flag_if_supported("/W3") + + # Disable MSVC warnings that suggest making code non-portable. + add_compiler_flag_if_supported("/wd4996") + if(ENABLE_WERROR) + add_compiler_flag_if_supported("/WX") + endif() +else() + require_c_flag("-std=c99" YES) + require_cxx_flag_nomsvc("-std=c++11" YES) + add_compiler_flag_if_supported("-Wall") + add_compiler_flag_if_supported("-Wdisabled-optimization") + add_compiler_flag_if_supported("-Wextra") + add_compiler_flag_if_supported("-Wfloat-conversion") + add_c_flag_if_supported("-Wimplicit-function-declaration") + add_compiler_flag_if_supported("-Wlogical-op") + add_compiler_flag_if_supported("-Wpointer-arith") + add_compiler_flag_if_supported("-Wshorten-64-to-32") + add_compiler_flag_if_supported("-Wsign-compare") + add_compiler_flag_if_supported("-Wstring-conversion") + add_compiler_flag_if_supported("-Wtype-limits") + add_compiler_flag_if_supported("-Wuninitialized") + add_compiler_flag_if_supported("-Wunused") + add_compiler_flag_if_supported("-Wvla") + + if(CMAKE_C_COMPILER_ID MATCHES "GNU" + AND "${SANITIZE}" MATCHES "address|undefined") + + # This combination has more stack overhead, so we account for it by + # providing higher stack limit than usual. + add_c_flag_if_supported("-Wstack-usage=170000") + add_cxx_flag_if_supported("-Wstack-usage=270000") + elseif(CONFIG_RD_DEBUG) # Another case where higher stack usage is expected. + add_c_flag_if_supported("-Wstack-usage=117000") + add_cxx_flag_if_supported("-Wstack-usage=240000") + else() + add_c_flag_if_supported("-Wstack-usage=100000") + add_cxx_flag_if_supported("-Wstack-usage=240000") + endif() + + # Add -Wshadow only for C files to avoid massive gtest warning spam. + add_c_flag_if_supported("-Wshadow") + + # Add -Wundef only for C files to avoid massive gtest warning spam.
+ add_c_flag_if_supported("-Wundef") + + # Quiet gcc 6 vs 7 abi warnings: + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728 + if("${AOM_TARGET_CPU}" MATCHES "arm") + add_cxx_flag_if_supported("-Wno-psabi") + endif() + + if(ENABLE_WERROR) + add_compiler_flag_if_supported("-Werror") + endif() + + if("${CMAKE_BUILD_TYPE}" MATCHES "Rel") + add_compiler_flag_if_supported("-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0") + endif() + add_compiler_flag_if_supported("-D_LARGEFILE_SOURCE") + add_compiler_flag_if_supported("-D_FILE_OFFSET_BITS=64") +endif() + +set(AOM_LIB_LINK_TYPE PUBLIC) +if(EMSCRIPTEN) + + # Avoid CMake generation time errors resulting from collisions with the form + # of target_link_libraries() used by Emscripten.cmake. + unset(AOM_LIB_LINK_TYPE) +endif() + +# Generate aom_config templates. +set(aom_config_asm_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake") +set(aom_config_h_template "${AOM_CONFIG_DIR}/config/aom_config.h.cmake") +execute_process( + COMMAND ${CMAKE_COMMAND} + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} -DAOM_ROOT=${AOM_ROOT} -P + "${AOM_ROOT}/build/cmake/generate_aom_config_templates.cmake") + +# Generate aom_config.{asm,h}. +configure_file("${aom_config_asm_template}" + "${AOM_CONFIG_DIR}/config/aom_config.asm") +configure_file("${aom_config_h_template}" + "${AOM_CONFIG_DIR}/config/aom_config.h") + +# Read the current git hash. +find_package(Git) +if(NOT GIT_FOUND) + message("--- Git missing, version will be read from CHANGELOG.") +endif() + +configure_file("${AOM_ROOT}/build/cmake/aom_config.c.template" + "${AOM_CONFIG_DIR}/config/aom_config.c") + +# Find Perl and generate the RTCD sources. +find_package(Perl) +if(NOT PERL_FOUND) + message(FATAL_ERROR "Perl is required to build libaom.") +endif() + +set(AOM_RTCD_CONFIG_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl" + "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl" + "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl") +set(AOM_RTCD_HEADER_FILE_LIST "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h" + "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h" + "${AOM_CONFIG_DIR}/config/av1_rtcd.h") +set(AOM_RTCD_SOURCE_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c" + "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c" + "${AOM_ROOT}/av1/common/av1_rtcd.c") +set(AOM_RTCD_SYMBOL_LIST aom_dsp_rtcd aom_scale_rtcd av1_rtcd) +list(LENGTH AOM_RTCD_SYMBOL_LIST AOM_RTCD_CUSTOM_COMMAND_COUNT) +math(EXPR AOM_RTCD_CUSTOM_COMMAND_COUNT "${AOM_RTCD_CUSTOM_COMMAND_COUNT} - 1") + +foreach(NUM RANGE ${AOM_RTCD_CUSTOM_COMMAND_COUNT}) + list(GET AOM_RTCD_CONFIG_FILE_LIST ${NUM} AOM_RTCD_CONFIG_FILE) + list(GET AOM_RTCD_HEADER_FILE_LIST ${NUM} AOM_RTCD_HEADER_FILE) + list(GET AOM_RTCD_SOURCE_FILE_LIST ${NUM} AOM_RTCD_SOURCE_FILE) + list(GET AOM_RTCD_SYMBOL_LIST ${NUM} AOM_RTCD_SYMBOL) + execute_process( + COMMAND + ${PERL_EXECUTABLE} "${AOM_ROOT}/build/cmake/rtcd.pl" + --arch=${AOM_TARGET_CPU} + --sym=${AOM_RTCD_SYMBOL} ${AOM_RTCD_FLAGS} + --config=${AOM_CONFIG_DIR}/config/aom_config.h ${AOM_RTCD_CONFIG_FILE} + OUTPUT_FILE ${AOM_RTCD_HEADER_FILE}) +endforeach() + +# Generate aom_version.h. 
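+# version.cmake prefers the git hash when GIT_EXECUTABLE is available and +# falls back to the version recorded in CHANGELOG otherwise (see the +# warning above).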
+execute_process(COMMAND ${CMAKE_COMMAND} + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} + -DAOM_ROOT=${AOM_ROOT} + -DGIT_EXECUTABLE=${GIT_EXECUTABLE} + -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P + "${AOM_ROOT}/build/cmake/version.cmake") diff --git a/libs/libaom/src/build/cmake/aom_experiment_deps.cmake b/libs/libaom/src/build/cmake/aom_experiment_deps.cmake new file mode 100644 index 000000000..2e3615791 --- /dev/null +++ b/libs/libaom/src/build/cmake/aom_experiment_deps.cmake @@ -0,0 +1,28 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_ +set(AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_ 1) + +# Adjusts CONFIG_* CMake variables to address conflicts between active AV1 +# experiments. +macro(fix_experiment_configs) + + if(CONFIG_ANALYZER) + change_config_and_warn(CONFIG_INSPECTION 1 CONFIG_ANALYZER) + endif() + + if(CONFIG_DIST_8X8 AND CONFIG_MULTITHREAD) + change_config_and_warn(CONFIG_DIST_8X8 0 CONFIG_MULTITHREAD) + endif() + +endmacro() diff --git a/libs/libaom/src/build/cmake/aom_install.cmake b/libs/libaom/src/build/cmake/aom_install.cmake new file mode 100644 index 000000000..cd40fe424 --- /dev/null +++ b/libs/libaom/src/build/cmake/aom_install.cmake @@ -0,0 +1,96 @@ +# +# Copyright (c) 2018, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aom.h" + "${AOM_ROOT}/aom/aom_codec.h" "${AOM_ROOT}/aom/aom_frame_buffer.h" + "${AOM_ROOT}/aom/aom_image.h" "${AOM_ROOT}/aom/aom_integer.h" + "${AOM_ROOT}/aom/aom.h") + +if(CONFIG_AV1_DECODER) + list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aom_decoder.h" + "${AOM_ROOT}/aom/aomdx.h") +endif() + +if(CONFIG_AV1_ENCODER) + list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aomcx.h" + "${AOM_ROOT}/aom/aom_encoder.h") +endif() + +# Generate aom.pc and setup dependencies to ensure it is created when necessary. +# Note: aom.pc generation uses GNUInstallDirs: +# https://cmake.org/cmake/help/latest/module/GNUInstallDirs.html +macro(setup_aom_install_targets) + if(NOT (MSVC OR XCODE)) + include("GNUInstallDirs") + set(AOM_PKG_CONFIG_FILE "${AOM_CONFIG_DIR}/aom.pc") + + # Create a dummy library target for creating aom.pc. + create_dummy_source_file(aom_pc c AOM_PKG_CONFIG_SOURCES) + add_library(aom_pc ${AOM_PKG_CONFIG_SOURCES}) + + # Setup a rule to generate aom.pc. 
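+ # The command re-runs CMake in script mode so pkg_config.cmake sees the + # final install directories and threading configuration at build time.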
+ add_custom_command( + OUTPUT "${AOM_PKG_CONFIG_FILE}" + COMMAND ${CMAKE_COMMAND} ARGS + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} -DAOM_ROOT=${AOM_ROOT} + -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} + -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR} + -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR} + -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR} + -DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME} + -DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD} + -DHAVE_PTHREAD_H=${HAVE_PTHREAD_H} -P + "${AOM_ROOT}/build/cmake/pkg_config.cmake" + COMMENT "Writing aom.pc" + VERBATIM) + + # Explicitly add a dependency on the pkg-config file to ensure it's built. + get_property(aom_pc_sources TARGET aom_pc PROPERTY SOURCES) + set_source_files_properties(${aom_pc_sources} OBJECT_DEPENDS + "${AOM_PKG_CONFIG_FILE}") + + # Our pkg-config file carries version information: add a dependency on the + # version rule. + add_dependencies(aom_pc aom_version) + + if(CONFIG_AV1_DECODER) + if(ENABLE_EXAMPLES) + list(APPEND AOM_INSTALL_BINS aomdec) + endif() + endif() + + if(CONFIG_AV1_ENCODER) + if(ENABLE_EXAMPLES) + list(APPEND AOM_INSTALL_BINS aomenc) + endif() + endif() + + if(BUILD_SHARED_LIBS) + set(AOM_INSTALL_LIBS aom aom_static) + else() + set(AOM_INSTALL_LIBS aom) + endif() + + # Setup the install rules. + install( + FILES ${AOM_INSTALL_INCS} + DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/aom") + install( + FILES "${AOM_PKG_CONFIG_FILE}" + DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/pkgconfig") + install(TARGETS ${AOM_INSTALL_LIBS} DESTINATION + "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") + + if(ENABLE_EXAMPLES) + install(TARGETS ${AOM_INSTALL_BINS} DESTINATION + "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}") + endif() + endif() +endmacro() diff --git a/libs/libaom/src/build/cmake/aom_optimization.cmake b/libs/libaom/src/build/cmake/aom_optimization.cmake new file mode 100644 index 000000000..d8b258f1e --- /dev/null +++ b/libs/libaom/src/build/cmake/aom_optimization.cmake @@ -0,0 +1,240 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_ +set(AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_ 1) + +include("${AOM_ROOT}/build/cmake/util.cmake") + +# Translate $flag to one which MSVC understands, and write the new flag to the +# variable named by $translated_flag (or unset it, when MSVC needs no flag). +function(get_msvc_intrinsic_flag flag translated_flag) + if("${flag}" STREQUAL "-mavx") + set(${translated_flag} "/arch:AVX" PARENT_SCOPE) + elseif("${flag}" STREQUAL "-mavx2") + set(${translated_flag} "/arch:AVX2" PARENT_SCOPE) + else() + + # MSVC does not need flags for intrinsics flavors other than AVX/AVX2. + unset(${translated_flag} PARENT_SCOPE) + endif() +endfunction() + +# Adds an object library target. Terminates generation if $flag is not supported +# by the current compiler. $flag is the intrinsics flag required by the current +# compiler, and is added to the compile flags for all sources in $sources. 
+# $opt_name is used to name the target. $target_to_update is made dependent upon +# the created target. +# +# Note: this function always updates the aom, and aom_static targets because +# OBJECT libraries have rules that disallow the direct addition of .o files to +# them as dependencies. Static and shared libraries do not have this limitation. +function(add_intrinsics_object_library flag opt_name target_to_update sources) + if("${${sources}}" STREQUAL "") + return() + endif() + set(target_name ${target_to_update}_${opt_name}_intrinsics) + add_library(${target_name} OBJECT ${${sources}}) + + if(MSVC) + get_msvc_intrinsic_flag(${flag} "flag") + endif() + + if("${flag}" STREQUAL "-mavx2") + unset(FLAG_SUPPORTED) + check_c_compiler_flag("-mno-avx256-split-unaligned-load" FLAG_SUPPORTED) + if(${FLAG_SUPPORTED}) + set(flag "${flag} -mno-avx256-split-unaligned-load") + endif() + + unset(FLAG_SUPPORTED) + check_c_compiler_flag("-mno-avx256-split-unaligned-store" FLAG_SUPPORTED) + if(${FLAG_SUPPORTED}) + set(flag "${flag} -mno-avx256-split-unaligned-store") + endif() + endif() + + if(flag) + separate_arguments(flag) + target_compile_options(${target_name} PUBLIC ${flag}) + endif() + + target_sources(aom PRIVATE $<TARGET_OBJECTS:${target_name}>) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $<TARGET_OBJECTS:${target_name}>) + endif() + + # Add the new lib target to the global list of aom library targets. + list(APPEND AOM_LIB_TARGETS ${target_name}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) +endfunction() + +# Adds sources in list named by $sources to $target and adds $flag to the +# compile flags for each source file. +function(add_intrinsics_source_to_target flag target sources) + target_sources(${target} PRIVATE ${${sources}}) + if(MSVC) + get_msvc_intrinsic_flag(${flag} "flag") + endif() + if(flag) + foreach(source ${${sources}}) + set_property(SOURCE ${source} APPEND PROPERTY COMPILE_FLAGS ${flag}) + endforeach() + endif() +endfunction() + +# Writes object format for the current target to the var named by $out_format, +# or terminates the build when the object format for the current target is +# unknown. +function(get_asm_obj_format out_format) + if("${AOM_TARGET_CPU}" STREQUAL "x86_64") + if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + set(objformat "macho64") + elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" + OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows") + set(objformat "win64") + else() + set(objformat "elf64") + endif() + elseif("${AOM_TARGET_CPU}" STREQUAL "x86") + if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + set(objformat "macho32") + elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" + OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows") + set(objformat "win32") + else() + set(objformat "elf32") + endif() + else() + message( + FATAL_ERROR "Unknown obj format: ${AOM_TARGET_CPU}-${AOM_TARGET_SYSTEM}") + endif() + + set(${out_format} ${objformat} PARENT_SCOPE) +endfunction() + +# Adds library target named $lib_name for ASM files in variable named by +# $asm_sources. Builds an output directory path from $lib_name. Links $lib_name +# into the aom library target(s). Generates a dummy C file with a dummy function +# to ensure that all cmake generators can determine the linker language, and +# that build tools don't complain that an object exposes no symbols.
+function(add_asm_library lib_name asm_sources) + if("${${asm_sources}}" STREQUAL "") + return() + endif() + set(asm_lib_obj_dir "${AOM_CONFIG_DIR}/asm_objects/${lib_name}") + if(NOT EXISTS "${asm_lib_obj_dir}") + file(MAKE_DIRECTORY "${asm_lib_obj_dir}") + endif() + + # TODO(tomfinegan): If cmake ever allows addition of .o files to OBJECT lib + # targets, make this OBJECT instead of STATIC to hide the target from + # consumers of the AOM cmake build. + add_library(${lib_name} STATIC ${${asm_sources}}) + + foreach(asm_source ${${asm_sources}}) + get_filename_component(asm_source_name "${asm_source}" NAME) + set(asm_object "${asm_lib_obj_dir}/${asm_source_name}.o") + add_custom_command(OUTPUT "${asm_object}" + COMMAND ${AS_EXECUTABLE} ARGS ${AOM_AS_FLAGS} + -I${AOM_ROOT}/ -I${AOM_CONFIG_DIR}/ -o + "${asm_object}" "${asm_source}" + DEPENDS "${asm_source}" + COMMENT "Building ASM object ${asm_object}" + WORKING_DIRECTORY "${AOM_CONFIG_DIR}" + VERBATIM) + target_sources(aom PRIVATE "${asm_object}") + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE "${asm_object}") + endif() + endforeach() + + # The above created a target containing only ASM sources. Cmake needs help + # here to determine the linker language. Add a dummy C file to force the + # linker language to C. We don't bother with setting the LINKER_LANGUAGE + # property on the library target because not all generators obey it (looking + # at you, xcode generator). + add_dummy_source_file_to_target("${lib_name}" "c") + + # Add the new lib target to the global list of aom library targets. + list(APPEND AOM_LIB_TARGETS ${lib_name}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) +endfunction() + +# Terminates generation if nasm found in PATH does not meet requirements. +# Currently checks only for presence of required object formats and support for +# the -Ox argument (multipass optimization). +function(test_nasm) + execute_process(COMMAND ${AS_EXECUTABLE} -hf OUTPUT_VARIABLE nasm_helptext) + + if(NOT "${nasm_helptext}" MATCHES "-Ox") + message( + FATAL_ERROR "Unsupported nasm: multipass optimization not supported.") + endif() + + if("${AOM_TARGET_CPU}" STREQUAL "x86") + if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + if(NOT "${nasm_helptext}" MATCHES "macho32") + message( + FATAL_ERROR "Unsupported nasm: macho32 object format not supported.") + endif() + elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" + OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows") + if(NOT "${nasm_helptext}" MATCHES "win32") + message( + FATAL_ERROR "Unsupported nasm: win32 object format not supported.") + endif() + else() + if(NOT "${nasm_helptext}" MATCHES "elf32") + message( + FATAL_ERROR "Unsupported nasm: elf32 object format not supported.") + endif() + endif() + else() + if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + if(NOT "${nasm_helptext}" MATCHES "macho64") + message( + FATAL_ERROR "Unsupported nasm: macho64 object format not supported.") + endif() + elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" + OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows") + if(NOT "${nasm_helptext}" MATCHES "win64") + message( + FATAL_ERROR "Unsupported nasm: win64 object format not supported.") + endif() + else() + if(NOT "${nasm_helptext}" MATCHES "elf64") + message( + FATAL_ERROR "Unsupported nasm: elf64 object format not supported.") + endif() + endif() + endif() +endfunction() + +# Adds build command for generation of rtcd C source files using +# build/cmake/rtcd.pl. 
$config is the input perl file, $output is the output C +# include file, $source is the C source file, and $symbol is used for the symbol +# argument passed to rtcd.pl. +function(add_rtcd_build_step config output source symbol) + add_custom_command( + OUTPUT ${output} + COMMAND ${PERL_EXECUTABLE} ARGS "${AOM_ROOT}/build/cmake/rtcd.pl" + --arch=${AOM_TARGET_CPU} + --sym=${symbol} ${AOM_RTCD_FLAGS} + --config=${AOM_CONFIG_DIR}/config/aom_config.h ${config} > ${output} + DEPENDS ${config} + COMMENT "Generating ${output}" + WORKING_DIRECTORY ${AOM_CONFIG_DIR} + VERBATIM) + set_property(SOURCE ${source} PROPERTY OBJECT_DEPENDS ${output}) + set_property(SOURCE ${output} PROPERTY GENERATED) +endfunction() diff --git a/libs/libaom/src/build/cmake/compiler_flags.cmake b/libs/libaom/src/build/cmake/compiler_flags.cmake new file mode 100644 index 000000000..24484bcad --- /dev/null +++ b/libs/libaom/src/build/cmake/compiler_flags.cmake @@ -0,0 +1,373 @@ +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_ +set(AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_ 1) + +include(CheckCCompilerFlag) +include(CheckCXXCompilerFlag) +include("${AOM_ROOT}/build/cmake/compiler_tests.cmake") + +# Strings used to cache flags. +set(AOM_C_FLAGS) +set(AOM_CXX_FLAGS) +set(AOM_EXE_LINKER_FLAGS) +set(AOM_FAILED_C_FLAGS) +set(AOM_FAILED_CXX_FLAGS) + +# Sets variable named by $out_is_present to YES in the caller's scope when $flag +# is found in the string variable named by $flag_cache. Sets the var to NO +# otherwise. +function(is_flag_present flag_cache flag out_is_present) + string(FIND "${${flag_cache}}" "${flag}" flag_pos) + if(${flag_pos} EQUAL -1) + set(${out_is_present} NO PARENT_SCOPE) + else() + set(${out_is_present} YES PARENT_SCOPE) + endif() +endfunction() + +# Appends $flag to $flags. Ignores scope via use of FORCE with set() call. +function(append_flag flags flag) + string(FIND "${${flags}}" "${flag}" found) + if(${found} EQUAL -1) + set(${flags} "${${flags}} ${flag}" CACHE STRING "" FORCE) + endif() +endfunction() + +# Checks C compiler for support of $c_flag. Adds $c_flag to all +# $CMAKE_C_FLAGS_s stored in AOM_C_CONFIGS when the compile test passes. +# Caches $c_flag in $AOM_C_FLAGS or $AOM_FAILED_C_FLAGS depending on test +# outcome. 
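+#
+# A hypothetical probe: add_c_flag_if_supported("-Wshadow"). On success the
+# flag is appended to every configuration named in AOM_C_CONFIGS; on failure
+# it is recorded in AOM_FAILED_C_FLAGS so the same flag is never retested.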
+function(add_c_flag_if_supported c_flag) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + is_flag_present(AOM_C_FLAGS "${c_flag}" flag_ok) + is_flag_present(AOM_FAILED_C_FLAGS "${c_flag}" flag_failed) + if(${flag_ok} OR ${flag_failed}) + return() + endif() + + unset(C_FLAG_SUPPORTED CACHE) + message("Checking C compiler flag support for: " ${c_flag}) + check_c_compiler_flag("${c_flag}" C_FLAG_SUPPORTED) + + if(${C_FLAG_SUPPORTED}) + append_flag(AOM_C_FLAGS "${c_flag}") + foreach(config ${AOM_C_CONFIGS}) + unset(C_FLAG_FOUND) + append_flag("${config}" "${c_flag}") + endforeach() + else() + append_flag(AOM_FAILED_C_FLAGS "${c_flag}") + endif() +endfunction() + +# Checks C++ compiler for support of $cxx_flag. Adds $cxx_flag to all +# $CMAKE_CXX_FLAGS_s stored in AOM_CXX_CONFIGS when the compile test +# passes. Caches $cxx_flag in $AOM_CXX_FLAGS or $AOM_FAILED_CXX_FLAGS depending +# on test outcome. +function(add_cxx_flag_if_supported cxx_flag) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + is_flag_present(AOM_CXX_FLAGS "${cxx_flag}" flag_ok) + is_flag_present(AOM_FAILED_CXX_FLAGS "${cxx_flag}" flag_failed) + if(${flag_ok} OR ${flag_failed}) + return() + endif() + + unset(CXX_FLAG_SUPPORTED CACHE) + message("Checking C++ compiler flag support for: " ${cxx_flag}) + check_cxx_compiler_flag("${cxx_flag}" CXX_FLAG_SUPPORTED) + + if(${CXX_FLAG_SUPPORTED}) + append_flag(AOM_CXX_FLAGS "${cxx_flag}") + foreach(config ${AOM_CXX_CONFIGS}) + unset(CXX_FLAG_FOUND) + append_flag("${config}" "${cxx_flag}") + endforeach() + else() + append_flag(AOM_FAILED_CXX_FLAGS "${cxx_flag}") + endif() +endfunction() + +# Convenience method for adding a flag to both the C and C++ compiler command +# lines. +function(add_compiler_flag_if_supported flag) + add_c_flag_if_supported(${flag}) + add_cxx_flag_if_supported(${flag}) +endfunction() + +# Checks C compiler for support of $c_flag and terminates generation when +# support is not present. +function(require_c_flag c_flag update_c_flags) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + is_flag_present(AOM_C_FLAGS "${c_flag}" flag_ok) + if(${flag_ok}) + return() + endif() + + if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") + aom_push_var(CMAKE_EXE_LINKER_FLAGS "${AOM_EXE_LINKER_FLAGS}") + endif() + + unset(HAVE_C_FLAG CACHE) + message("Checking C compiler flag support for: " ${c_flag}) + check_c_compiler_flag("${c_flag}" HAVE_C_FLAG) + if(NOT HAVE_C_FLAG) + message( + FATAL_ERROR "${PROJECT_NAME} requires support for C flag: ${c_flag}.") + endif() + + if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") + aom_pop_var(CMAKE_EXE_LINKER_FLAGS) + endif() + + append_flag(AOM_C_FLAGS "${c_flag}") + if(update_c_flags) + foreach(config ${AOM_C_CONFIGS}) + set(${config} "${${config}} ${c_flag}" CACHE STRING "" FORCE) + endforeach() + endif() +endfunction() + +# Checks CXX compiler for support of $cxx_flag and terminates generation when +# support is not present. 
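+#
+# Example (hypothetical): require_cxx_flag("-std=c++11" YES) stops generation
+# on compilers lacking the flag; passing YES also folds the flag into each
+# config named in AOM_CXX_CONFIGS.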
+function(require_cxx_flag cxx_flag update_cxx_flags) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + is_flag_present(AOM_CXX_FLAGS "${cxx_flag}" flag_ok) + if(${flag_ok}) + return() + endif() + + if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") + aom_push_var(CMAKE_EXE_LINKER_FLAGS "${AOM_EXE_LINKER_FLAGS}") + endif() + + unset(HAVE_CXX_FLAG CACHE) + message("Checking C compiler flag support for: " ${cxx_flag}) + check_cxx_compiler_flag("${cxx_flag}" HAVE_CXX_FLAG) + if(NOT HAVE_CXX_FLAG) + message( + FATAL_ERROR "${PROJECT_NAME} requires support for C flag: ${cxx_flag}.") + endif() + + if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") + aom_pop_var(CMAKE_EXE_LINKER_FLAGS) + endif() + + append_flag(AOM_CXX_FLAGS "${cxx_flag}") + if(update_cxx_flags) + foreach(config ${AOM_CXX_CONFIGS}) + set(${config} "${${config}} ${cxx_flag}" CACHE STRING "" FORCE) + endforeach() + endif() +endfunction() + +# Checks for support of $flag by both the C and CXX compilers. Terminates +# generation when support is not present in both compilers. +function(require_compiler_flag flag update_cmake_flags) + require_c_flag(${flag} ${update_cmake_flags}) + require_cxx_flag(${flag} ${update_cmake_flags}) +endfunction() + +# Checks only non-MSVC targets for support of $c_flag and terminates generation +# when support is not present. +function(require_c_flag_nomsvc c_flag update_c_flags) + if(NOT MSVC) + require_c_flag(${c_flag} ${update_c_flags}) + endif() +endfunction() + +# Checks only non-MSVC targets for support of $cxx_flag and terminates +# generation when support is not present. +function(require_cxx_flag_nomsvc cxx_flag update_cxx_flags) + if(NOT MSVC) + require_cxx_flag(${cxx_flag} ${update_cxx_flags}) + endif() +endfunction() + +# Checks only non-MSVC targets for support of $flag by both the C and CXX +# compilers. Terminates generation when support is not present in both +# compilers. +function(require_compiler_flag_nomsvc flag update_cmake_flags) + require_c_flag_nomsvc(${flag} ${update_cmake_flags}) + require_cxx_flag_nomsvc(${flag} ${update_cmake_flags}) +endfunction() + +# Adds $preproc_def to C compiler command line (as -D$preproc_def) if not +# already present. +function(add_c_preproc_definition preproc_def) + set(preproc_def "-D${preproc_def}") + is_flag_present(AOM_C_FLAGS "${preproc_def}" flag_cached) + if(${flag_cached}) + return() + endif() + + foreach(config ${AOM_C_CONFIGS}) + set(${config} "${${config}} ${preproc_def}" CACHE STRING "" FORCE) + endforeach() +endfunction() + +# Adds $preproc_def to CXX compiler command line (as -D$preproc_def) if not +# already present. +function(add_cxx_preproc_definition preproc_def) + set(preproc_def "-D${preproc_def}") + is_flag_present(AOM_CXX_FLAGS "${preproc_def}" flag_cached) + if(${flag_cached}) + return() + endif() + + foreach(config ${AOM_CXX_CONFIGS}) + set(${config} "${${config}} ${preproc_def}" CACHE STRING "" FORCE) + endforeach() +endfunction() + +# Adds $preproc_def to C and CXX compiler command line (as -D$preproc_def) if +# not already present. +function(add_preproc_definition preproc_def) + add_c_preproc_definition(${preproc_def}) + add_cxx_preproc_definition(${preproc_def}) +endfunction() + +# Adds $flag to assembler command line. +function(append_as_flag flag) + is_flag_present(AOM_AS_FLAGS "${flag}" flag_cached) + if(${flag_cached}) + return() + endif() + append_flag(AOM_AS_FLAGS "${flag}") +endfunction() + +# Adds $flag to the C compiler command line. 
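+#
+# No support probe is run here, unlike add_c_flag_if_supported(); e.g. a
+# hypothetical append_c_flag("-fno-strict-aliasing") is added to every
+# config named in AOM_C_CONFIGS unconditionally.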
+function(append_c_flag flag) + is_flag_present(AOM_C_FLAGS "${flag}" flag_cached) + if(${flag_cached}) + return() + endif() + + foreach(config ${AOM_C_CONFIGS}) + append_flag(${config} "${flag}") + endforeach() +endfunction() + +# Adds $flag to the CXX compiler command line. +function(append_cxx_flag flag) + is_flag_present(AOM_CXX_FLAGS "${flag}" flag_cached) + if(${flag_cached}) + return() + endif() + + foreach(config ${AOM_CXX_CONFIGS}) + append_flag(${config} "${flag}") + endforeach() +endfunction() + +# Adds $flag to the C and CXX compiler command lines. +function(append_compiler_flag flag) + append_c_flag(${flag}) + append_cxx_flag(${flag}) +endfunction() + +# Adds $flag to the executable linker command line when not present. +function(append_exe_linker_flag flag) + is_flag_present(AOM_EXE_LINKER_FLAGS "${flag}" flag_cached) + if(${flag_cached}) + return() + endif() + + append_flag(AOM_EXE_LINKER_FLAGS "${flag}") + foreach(config ${AOM_EXE_LINKER_CONFIGS}) + append_flag(${config} "${flag}") + endforeach() +endfunction() + +# Adds $flag to the link flags for $target. +function(append_link_flag_to_target target flag) + unset(target_link_flags) + get_target_property(target_link_flags ${target} LINK_FLAGS) + + if(target_link_flags) + is_flag_present(target_link_flags "${flag}" flag_found) + if(${flag_found}) + return() + endif() + set(target_link_flags "${target_link_flags} ${flag}") + else() + set(target_link_flags "${flag}") + endif() + + set_target_properties(${target} PROPERTIES LINK_FLAGS ${target_link_flags}) +endfunction() + +# Adds $flag to executable linker flags, and makes sure C/CXX builds still work. +function(require_linker_flag flag) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + append_exe_linker_flag(${flag}) + + unset(c_passed) + aom_check_c_compiles("LINKER_FLAG_C_TEST(${flag})" "" c_passed) + unset(cxx_passed) + aom_check_cxx_compiles("LINKER_FLAG_CXX_TEST(${flag})" "" cxx_passed) + + if(NOT c_passed OR NOT cxx_passed) + message(FATAL_ERROR "Linker flag test for ${flag} failed.") + endif() +endfunction() + +# Appends flags in $AOM_EXTRA__FLAGS variables to the flags used at build +# time. +function(set_user_flags) + + # Linker flags are handled first because some C/CXX flags require that a + # linker flag is present at link time. + if(AOM_EXTRA_EXE_LINKER_FLAGS) + is_flag_present(AOM_EXE_LINKER_FLAGS "${AOM_EXTRA_EXE_LINKER_FLAGS}" + extra_present) + if(NOT ${extra_present}) + require_linker_flag("${AOM_EXTRA_EXE_LINKER_FLAGS}") + endif() + endif() + if(AOM_EXTRA_AS_FLAGS) + + # TODO(tomfinegan): assembler flag testing would be a good thing to have. + is_flag_present(AOM_AS_FLAGS "${AOM_EXTRA_AS_FLAGS}" extra_present) + if(NOT ${extra_present}) + append_flag(AOM_AS_FLAGS "${AOM_EXTRA_AS_FLAGS}") + endif() + endif() + if(AOM_EXTRA_C_FLAGS) + is_flag_present(AOM_C_FLAGS "${AOM_EXTRA_C_FLAGS}" extra_present) + if(NOT ${extra_present}) + require_c_flag("${AOM_EXTRA_C_FLAGS}" YES) + endif() + endif() + if(AOM_EXTRA_CXX_FLAGS) + is_flag_present(AOM_CXX_FLAGS "${AOM_EXTRA_CXX_FLAGS}" extra_present) + if(NOT ${extra_present}) + require_cxx_flag("${AOM_EXTRA_CXX_FLAGS}" YES) + endif() + endif() +endfunction() diff --git a/libs/libaom/src/build/cmake/compiler_tests.cmake b/libs/libaom/src/build/cmake/compiler_tests.cmake new file mode 100644 index 000000000..040283225 --- /dev/null +++ b/libs/libaom/src/build/cmake/compiler_tests.cmake @@ -0,0 +1,179 @@ +# +# Copyright (c) 2016, Alliance for Open Media. 
All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_ +set(AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_ 1) + +include(CheckCSourceCompiles) +include(CheckCXXSourceCompiles) + +# CMake passes command line flags like this: +# +# * $compiler $lang_flags $lang_flags_config ... +# +# To ensure the flags tested here and elsewhere are obeyed a list of active +# build configuration types is built, and flags are applied to the flag strings +# for each configuration currently active for C and CXX builds as determined by +# reading $CMAKE_CONFIGURATION_TYPES and $CMAKE_BUILD_TYPE. When +# $CMAKE_CONFIGURATION_TYPES is non-empty a multi- configuration generator is in +# use: currently this includes MSVC and Xcode. For other generators +# $CMAKE_BUILD_TYPE is used. For both cases AOM__CONFIGS is populated with +# CMake string variable names that contain flags for the currently available +# configuration(s). +unset(AOM_C_CONFIGS) +unset(AOM_CXX_CONFIGS) +list(LENGTH CMAKE_CONFIGURATION_TYPES num_configs) +if(${num_configs} GREATER 0) + foreach(config ${CMAKE_CONFIGURATION_TYPES}) + string(TOUPPER ${config} config) + list(APPEND AOM_C_CONFIGS "CMAKE_C_FLAGS_${config}") + list(APPEND AOM_CXX_CONFIGS "CMAKE_CXX_FLAGS_${config}") + list(APPEND AOM_EXE_LINKER_CONFIGS "CMAKE_EXE_LINKER_FLAGS_${config}") + endforeach() +else() + string(TOUPPER ${CMAKE_BUILD_TYPE} config) + set(AOM_C_CONFIGS "CMAKE_C_FLAGS_${config}") + set(AOM_CXX_CONFIGS "CMAKE_CXX_FLAGS_${config}") + set(AOM_EXE_LINKER_CONFIGS "CMAKE_EXE_LINKER_FLAGS_${config}") +endif() + +# The basic main() function used in all compile tests. +set(AOM_C_MAIN "\nint main(void) { return 0; }") +set(AOM_CXX_MAIN "\nint main() { return 0; }") + +# Strings containing the names of passed and failed tests. +set(AOM_C_PASSED_TESTS) +set(AOM_C_FAILED_TESTS) +set(AOM_CXX_PASSED_TESTS) +set(AOM_CXX_FAILED_TESTS) + +function(aom_push_var var new_value) + set(SAVED_${var} ${${var}} PARENT_SCOPE) + set(${var} "${${var}} ${new_value}" PARENT_SCOPE) +endfunction() + +function(aom_pop_var var) + set(var ${SAVED_${var}} PARENT_SCOPE) + unset(SAVED_${var} PARENT_SCOPE) +endfunction() + +# Confirms $test_source compiles and stores $test_name in one of +# $AOM_C_PASSED_TESTS or $AOM_C_FAILED_TESTS depending on out come. When the +# test passes $result_var is set to 1. When it fails $result_var is unset. The +# test is not run if the test name is found in either of the passed or failed +# test variables. 
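+#
+# A minimal sketch of a call (test name, source, and result variable are
+# hypothetical):
+#   aom_check_c_compiles("int128_test" "__int128 x;" HAVE_INT128)
+# The function appends AOM_C_MAIN to the snippet before compiling it.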
+function(aom_check_c_compiles test_name test_source result_var) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + unset(C_TEST_PASSED CACHE) + unset(C_TEST_FAILED CACHE) + string(FIND "${AOM_C_PASSED_TESTS}" "${test_name}" C_TEST_PASSED) + string(FIND "${AOM_C_FAILED_TESTS}" "${test_name}" C_TEST_FAILED) + if(${C_TEST_PASSED} EQUAL -1 AND ${C_TEST_FAILED} EQUAL -1) + unset(C_TEST_COMPILED CACHE) + message("Running C compiler test: ${test_name}") + check_c_source_compiles("${test_source} ${AOM_C_MAIN}" C_TEST_COMPILED) + set(${result_var} ${C_TEST_COMPILED} PARENT_SCOPE) + + if(C_TEST_COMPILED) + set(AOM_C_PASSED_TESTS + "${AOM_C_PASSED_TESTS} ${test_name}" + CACHE STRING "" FORCE) + else() + set(AOM_C_FAILED_TESTS + "${AOM_C_FAILED_TESTS} ${test_name}" + CACHE STRING "" FORCE) + message("C Compiler test ${test_name} failed.") + endif() + elseif(NOT ${C_TEST_PASSED} EQUAL -1) + set(${result_var} 1 PARENT_SCOPE) + else() # ${C_TEST_FAILED} NOT EQUAL -1 + unset(${result_var} PARENT_SCOPE) + endif() +endfunction() + +# Confirms $test_source compiles and stores $test_name in one of +# $AOM_CXX_PASSED_TESTS or $AOM_CXX_FAILED_TESTS depending on out come. When the +# test passes $result_var is set to 1. When it fails $result_var is unset. The +# test is not run if the test name is found in either of the passed or failed +# test variables. +function(aom_check_cxx_compiles test_name test_source result_var) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + unset(CXX_TEST_PASSED CACHE) + unset(CXX_TEST_FAILED CACHE) + string(FIND "${AOM_CXX_PASSED_TESTS}" "${test_name}" CXX_TEST_PASSED) + string(FIND "${AOM_CXX_FAILED_TESTS}" "${test_name}" CXX_TEST_FAILED) + if(${CXX_TEST_PASSED} EQUAL -1 AND ${CXX_TEST_FAILED} EQUAL -1) + unset(CXX_TEST_COMPILED CACHE) + message("Running CXX compiler test: ${test_name}") + check_cxx_source_compiles("${test_source} ${AOM_CXX_MAIN}" + CXX_TEST_COMPILED) + set(${result_var} ${CXX_TEST_COMPILED} PARENT_SCOPE) + + if(CXX_TEST_COMPILED) + set(AOM_CXX_PASSED_TESTS + "${AOM_CXX_PASSED_TESTS} ${test_name}" + CACHE STRING "" FORCE) + else() + set(AOM_CXX_FAILED_TESTS + "${AOM_CXX_FAILED_TESTS} ${test_name}" + CACHE STRING "" FORCE) + message("CXX Compiler test ${test_name} failed.") + endif() + elseif(NOT ${CXX_TEST_PASSED} EQUAL -1) + set(${result_var} 1 PARENT_SCOPE) + else() # ${CXX_TEST_FAILED} NOT EQUAL -1 + unset(${result_var} PARENT_SCOPE) + endif() +endfunction() + +# Convenience function that confirms $test_source compiles as C and C++. +# $result_var is set to 1 when both tests are successful, and 0 when one or both +# tests fail. Note: This function is intended to be used to write to result +# variables that are expanded via configure_file(). $result_var is set to 1 or 0 +# to allow direct usage of the value in generated source files. +function(aom_check_source_compiles test_name test_source result_var) + unset(C_PASSED) + unset(CXX_PASSED) + aom_check_c_compiles(${test_name} ${test_source} C_PASSED) + aom_check_cxx_compiles(${test_name} ${test_source} CXX_PASSED) + if(C_PASSED AND CXX_PASSED) + set(${result_var} 1 PARENT_SCOPE) + else() + set(${result_var} 0 PARENT_SCOPE) + endif() +endfunction() + +# When inline support is detected for the current compiler the supported +# inlining keyword is written to $result in caller scope. 
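+#
+# A sketch of the expected use: aom_get_inline("INLINE") leaves INLINE set
+# to "inline" or "__inline" in the caller's scope, or unset when neither
+# keyword compiles.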
+function(aom_get_inline result) + aom_check_source_compiles("inline_check_1" + "static inline void function(void) {}" + HAVE_INLINE_1) + if(HAVE_INLINE_1 EQUAL 1) + set(${result} "inline" PARENT_SCOPE) + return() + endif() + + # Check __inline. + aom_check_source_compiles("inline_check_2" + "static __inline void function(void) {}" + HAVE_INLINE_2) + if(HAVE_INLINE_2 EQUAL 1) + set(${result} "__inline" PARENT_SCOPE) + endif() +endfunction() diff --git a/libs/libaom/src/build/cmake/cpu.cmake b/libs/libaom/src/build/cmake/cpu.cmake new file mode 100644 index 000000000..ef2d7552b --- /dev/null +++ b/libs/libaom/src/build/cmake/cpu.cmake @@ -0,0 +1,82 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# + +if("${AOM_TARGET_CPU}" MATCHES "^arm") + set(ARCH_ARM 1) + set(RTCD_ARCH_ARM "yes") + + if(ENABLE_NEON) + set(HAVE_NEON 1) + set(RTCD_HAVE_NEON "yes") + else() + set(HAVE_NEON 0) + set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-neon) + endif() +elseif("${AOM_TARGET_CPU}" MATCHES "^mips") + set(ARCH_MIPS 1) + set(RTCD_ARCH_MIPS "yes") + + if("${AOM_TARGET_CPU}" STREQUAL "mips32") + set(HAVE_MIPS32 1) + set(RTCD_HAVE_MIPS32 "yes") + elseif("${AOM_TARGET_CPU}" STREQUAL "mips64") + set(HAVE_MIPS64 1) + set(RTCD_HAVE_MIPS64 "yes") + endif() + + # HAVE_DSPR2 is set by mips toolchain files. + if(ENABLE_DSPR2 AND HAVE_DSPR2) + set(RTCD_HAVE_DSPR2 "yes") + else() + set(HAVE_DSPR2 0) + set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-dspr2) + endif() + + # HAVE_MSA is set by mips toolchain files. + if(ENABLE_MSA AND HAVE_MSA) + set(RTCD_HAVE_MSA "yes") + else() + set(HAVE_MSA 0) + set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-msa) + endif() +elseif("${AOM_TARGET_CPU}" MATCHES "ppc") + set(ARCH_PPC 1) + set(RTCD_ARCH_PPC "yes") + + if(ENABLE_VSX) + set(HAVE_VSX 1) + set(RTCD_HAVE_VSX "yes") + else() + set(HAVE_VSX 0) + set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-vsx) + endif() +elseif("${AOM_TARGET_CPU}" MATCHES "^x86") + if("${AOM_TARGET_CPU}" STREQUAL "x86") + set(ARCH_X86 1) + set(RTCD_ARCH_X86 "yes") + elseif("${AOM_TARGET_CPU}" STREQUAL "x86_64") + set(ARCH_X86_64 1) + set(RTCD_ARCH_X86_64 "yes") + endif() + + set(X86_FLAVORS "MMX;SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;AVX;AVX2") + foreach(flavor ${X86_FLAVORS}) + if(ENABLE_${flavor} AND NOT disable_remaining_flavors) + set(HAVE_${flavor} 1) + set(RTCD_HAVE_${flavor} "yes") + else() + set(disable_remaining_flavors 1) + set(HAVE_${flavor} 0) + string(TOLOWER ${flavor} flavor) + set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor}) + endif() + endforeach() +endif() diff --git a/libs/libaom/src/build/cmake/dist.cmake b/libs/libaom/src/build/cmake/dist.cmake new file mode 100644 index 000000000..5b9fc95d4 --- /dev/null +++ b/libs/libaom/src/build/cmake/dist.cmake @@ -0,0 +1,64 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +cmake_minimum_required(VERSION 3.5) + +# Converts spaces in $in_string to semicolons and writes the output to +# $out_string. In CMake's eyes this converts the input string to a list. +function(listify_string in_string out_string) + string(REPLACE " " ";" ${out_string} ${in_string}) + set(${out_string} "${${out_string}}" PARENT_SCOPE) +endfunction() + +set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_DIST_DIR" "AOM_DIST_INCLUDES" + "AOM_DIST_LIBS" "ENABLE_DOCS") + +foreach(arg ${REQUIRED_ARGS}) + if("${${arg}}" STREQUAL "") + message(FATAL_ERROR "${arg} must not be empty.") + endif() +endforeach() + +if(ENABLE_DOCS) + file(INSTALL "${AOM_CONFIG_DIR}/docs" DESTINATION "${AOM_DIST_DIR}") +endif() + +if(AOM_DIST_EXAMPLES) + listify_string("${AOM_DIST_EXAMPLES}" "AOM_DIST_EXAMPLES") + foreach(example ${AOM_DIST_EXAMPLES}) + if(NOT "${example}" MATCHES "aomdec\|aomenc") + file(INSTALL "${example}" DESTINATION "${AOM_DIST_DIR}/bin/examples") + endif() + endforeach() +endif() + +if(AOM_DIST_TOOLS) + listify_string("${AOM_DIST_TOOLS}" "AOM_DIST_TOOLS") + foreach(tool ${AOM_DIST_TOOLS}) + file(INSTALL "${tool}" DESTINATION "${AOM_DIST_DIR}/bin/tools") + endforeach() +endif() + +if(AOM_DIST_APPS) + listify_string("${AOM_DIST_APPS}" "AOM_DIST_APPS") + foreach(app ${AOM_DIST_APPS}) + file(INSTALL "${app}" DESTINATION "${AOM_DIST_DIR}/bin") + endforeach() +endif() + +listify_string("${AOM_DIST_INCLUDES}" "AOM_DIST_INCLUDES") +foreach(inc ${AOM_DIST_INCLUDES}) + file(INSTALL "${inc}" DESTINATION "${AOM_DIST_DIR}/include/aom") +endforeach() + +listify_string("${AOM_DIST_LIBS}" "AOM_DIST_LIBS") +foreach(lib ${AOM_DIST_LIBS}) + file(INSTALL "${lib}" DESTINATION "${AOM_DIST_DIR}/lib") +endforeach() diff --git a/libs/libaom/src/build/cmake/exports.cmake b/libs/libaom/src/build/cmake/exports.cmake new file mode 100644 index 000000000..fa7842c9d --- /dev/null +++ b/libs/libaom/src/build/cmake/exports.cmake @@ -0,0 +1,74 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_EXPORTS_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_EXPORTS_CMAKE_ +set(AOM_BUILD_CMAKE_EXPORTS_CMAKE_ 1) + +include("${AOM_ROOT}/build/cmake/exports_sources.cmake") + +# Creates the custom target which handles generation of the symbol export lists. 
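+# It must be called after the aom library target exists, because it wires the
+# target in via add_dependencies(aom generate_exports). A sketch of the
+# expected call site (the guard shown is hypothetical):
+#   if(BUILD_SHARED_LIBS)
+#     setup_exports_target()
+#   endif()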
+function(setup_exports_target)
+  if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+    set(symbol_file_ext "syms")
+  elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND MSVC)
+    set(symbol_file_ext "def")
+  else()
+    set(symbol_file_ext "ver")
+  endif()
+
+  set(aom_sym_file "${AOM_CONFIG_DIR}/libaom.${symbol_file_ext}")
+
+  add_custom_target(generate_exports
+                    COMMAND ${CMAKE_COMMAND}
+                            -DAOM_ROOT="${AOM_ROOT}"
+                            -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}"
+                            -DAOM_TARGET_SYSTEM=${AOM_TARGET_SYSTEM}
+                            -DAOM_SYM_FILE="${aom_sym_file}"
+                            -DAOM_MSVC=${MSVC}
+                            -DAOM_XCODE=${XCODE}
+                            -DCONFIG_NAME=$<CONFIG>
+                            -DCONFIG_AV1_DECODER=${CONFIG_AV1_DECODER}
+                            -DCONFIG_AV1_ENCODER=${CONFIG_AV1_ENCODER}
+                            -DCONFIG_INSPECTION=${CONFIG_INSPECTION}
+                            -DENABLE_TESTS=${ENABLE_TESTS}
+                            -P
+                            "${AOM_ROOT}/build/cmake/generate_exports.cmake"
+                    SOURCES ${AOM_EXPORTS_SOURCES}
+                    DEPENDS ${AOM_EXPORTS_SOURCES})
+
+  # Make libaom depend on the exports file, and set flags to pick it up when
+  # creating the dylib.
+  add_dependencies(aom generate_exports)
+
+  if(APPLE)
+    set_property(TARGET aom
+                 APPEND_STRING
+                 PROPERTY LINK_FLAGS "-exported_symbols_list ${aom_sym_file}")
+  elseif(WIN32)
+    if(NOT MSVC)
+      set_property(TARGET aom
+                   APPEND_STRING
+                   PROPERTY LINK_FLAGS "-Wl,--version-script ${aom_sym_file}")
+    else()
+      set_property(TARGET aom
+                   APPEND_STRING
+                   PROPERTY LINK_FLAGS "/DEF:${aom_sym_file}")
+    endif()
+
+    # TODO(tomfinegan): Sort out the import lib situation and flags for MSVC.
+
+  else()
+    set_property(TARGET aom
+                 APPEND_STRING
+                 PROPERTY LINK_FLAGS "-Wl,--version-script,${aom_sym_file}")
+  endif()
+endfunction()
diff --git a/libs/libaom/src/build/cmake/exports_sources.cmake b/libs/libaom/src/build/cmake/exports_sources.cmake
new file mode 100644
index 000000000..46bf001d8
--- /dev/null
+++ b/libs/libaom/src/build/cmake/exports_sources.cmake
@@ -0,0 +1,35 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_
+set(AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_ 1)
+
+list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_com"
+            "${AOM_ROOT}/av1/exports_com")
+
+if(CONFIG_AV1_DECODER)
+  list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_dec"
+              "${AOM_ROOT}/av1/exports_dec")
+  if(CONFIG_INSPECTION)
+    list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/av1/exports_ident")
+  endif()
+endif()
+
+if(CONFIG_AV1_ENCODER)
+  list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_enc"
+              "${AOM_ROOT}/av1/exports_enc")
+endif()
+
+if(ENABLE_TESTS)
+  list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_test"
+              "${AOM_ROOT}/av1/exports_test")
+endif()
diff --git a/libs/libaom/src/build/cmake/generate_aom_config_templates.cmake b/libs/libaom/src/build/cmake/generate_aom_config_templates.cmake
new file mode 100644
index 000000000..529daaf02
--- /dev/null
+++ b/libs/libaom/src/build/cmake/generate_aom_config_templates.cmake
@@ -0,0 +1,92 @@
+#
+# Copyright (c) 2017, Alliance for Open Media.
All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +cmake_minimum_required(VERSION 3.5) + +string(TIMESTAMP year "%Y") +set(asm_file_header_block "\; +\; Copyright (c) ${year}, Alliance for Open Media. All rights reserved +\; +\; This source code is subject to the terms of the BSD 2 Clause License and +\; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +\; was not distributed with this source code in the LICENSE file, you can +\; obtain it at www.aomedia.org/license/software. If the Alliance for Open +\; Media Patent License 1.0 was not distributed with this source code in the +\; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +\; +") +set(h_file_header_block "/* + * Copyright (c) ${year}, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +\#ifndef AOM_CONFIG_H_ +\#define AOM_CONFIG_H_ +") +set(cmake_file_header_block "## +## Copyright (c) ${year}, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +") + +# Terminates cmake execution when $var_name is an empty string, or the variable +# name it contains does not expand to an existing directory. 
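+#
+# For example, check_directory_var(AOM_ROOT), as invoked just below, aborts
+# with a fatal error when AOM_ROOT is empty or does not name an existing
+# path.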
+function(check_directory_var var_name) + if("${var_name}" STREQUAL "") + message(FATAL_ERROR "The CMake variable ${var_name} must be defined.") + endif() + + if(NOT EXISTS "${${var_name}}") + message(FATAL_ERROR "${${var_name}} (${var_name}) missing.") + endif() +endfunction() + +check_directory_var(AOM_CONFIG_DIR) +check_directory_var(AOM_ROOT) + +set(AOM_DEFAULTS "${AOM_ROOT}/build/cmake/aom_config_defaults.cmake") +if(NOT EXISTS "${AOM_DEFAULTS}") + message( + FATAL_ERROR "Configuration default values file (${AOM_DEFAULTS}) missing.") +endif() + +include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake") +list(APPEND aom_build_vars ${AOM_DETECT_VARS} ${AOM_CONFIG_VARS}) +list(SORT aom_build_vars) + +set(aom_config_h_template "${AOM_CONFIG_DIR}/config/aom_config.h.cmake") +file(WRITE "${aom_config_h_template}" ${h_file_header_block}) +foreach(aom_var ${aom_build_vars}) + if(NOT "${aom_var}" STREQUAL "AOM_RTCD_FLAGS") + file(APPEND "${aom_config_h_template}" + "\#define ${aom_var} \${${aom_var}}\n") + endif() +endforeach() +file(APPEND "${aom_config_h_template}" "\#endif // AOM_CONFIG_H_") + +set(aom_asm_config_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake") +file(WRITE "${aom_asm_config_template}" ${asm_file_header_block}) +foreach(aom_var ${aom_build_vars}) + if(NOT "${aom_var}" STREQUAL "INLINE" + AND NOT "${aom_var}" STREQUAL "AOM_RTCD_FLAGS") + file(APPEND "${aom_asm_config_template}" "${aom_var} equ \${${aom_var}}\n") + endif() +endforeach() diff --git a/libs/libaom/src/build/cmake/generate_exports.cmake b/libs/libaom/src/build/cmake/generate_exports.cmake new file mode 100644 index 000000000..f1d15a0fa --- /dev/null +++ b/libs/libaom/src/build/cmake/generate_exports.cmake @@ -0,0 +1,66 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. 
+# +cmake_minimum_required(VERSION 3.5) + +set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_TARGET_SYSTEM" "AOM_SYM_FILE" + "CONFIG_AV1_DECODER" "CONFIG_AV1_ENCODER") + +foreach(arg ${REQUIRED_ARGS}) + if("${${arg}}" STREQUAL "") + message(FATAL_ERROR "${arg} must not be empty.") + endif() +endforeach() + +include("${AOM_ROOT}/build/cmake/exports_sources.cmake") + +if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + set(symbol_prefix "_") +elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC) + file(WRITE "${AOM_SYM_FILE}" "LIBRARY aom\n" "EXPORTS\n") +else() + set(symbol_suffix ";") +endif() + +set(aom_sym_file "${AOM_SYM_FILE}") + +if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + file(REMOVE "${aom_sym_file}") +elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC) + file(WRITE "${aom_sym_file}" "LIBRARY aom\n" "EXPORTS\n") +else() + file(WRITE "${aom_sym_file}" "{\nglobal:\n") +endif() + +foreach(export_file ${AOM_EXPORTS_SOURCES}) + file(STRINGS "${export_file}" exported_file_data) + set(exported_symbols "${exported_symbols} ${exported_file_data};") + string(STRIP "${exported_symbols}" exported_symbols) +endforeach() + +foreach(exported_symbol ${exported_symbols}) + string(STRIP "${exported_symbol}" exported_symbol) + if("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC) + string(SUBSTRING ${exported_symbol} 0 4 export_type) + string(COMPARE EQUAL "${export_type}" "data" is_data) + if(is_data) + set(symbol_suffix " DATA") + else() + set(symbol_suffix "") + endif() + endif() + string(REGEX REPLACE "text \|data " "" "exported_symbol" "${exported_symbol}") + set(exported_symbol " ${symbol_prefix}${exported_symbol}${symbol_suffix}") + file(APPEND "${aom_sym_file}" "${exported_symbol}\n") +endforeach() + +if("${aom_sym_file}" MATCHES "ver$") + file(APPEND "${aom_sym_file}" " \nlocal:\n *;\n};") +endif() diff --git a/libs/libaom/src/build/cmake/ios-Info.plist b/libs/libaom/src/build/cmake/ios-Info.plist new file mode 100644 index 000000000..300e3e310 --- /dev/null +++ b/libs/libaom/src/build/cmake/ios-Info.plist @@ -0,0 +1,37 @@ + + + + + CFBundleDevelopmentRegion + en + CFBundleExecutable + AOM + CFBundleIdentifier + org.webmproject.AOM + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + AOM + CFBundlePackageType + FMWK + CFBundleShortVersionString + ${VERSION} + CFBundleSignature + ???? + CFBundleSupportedPlatforms + + iPhoneOS + + CFBundleVersion + ${VERSION} + MinimumOSVersion + ${IOS_VERSION_MIN} + UIDeviceFamily + + 1 + 2 + + AOMFullVersion + ${FULLVERSION} + + diff --git a/libs/libaom/src/build/cmake/iosbuild.sh b/libs/libaom/src/build/cmake/iosbuild.sh new file mode 100644 index 000000000..167ece200 --- /dev/null +++ b/libs/libaom/src/build/cmake/iosbuild.sh @@ -0,0 +1,384 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This script generates 'AOM.framework'. An iOS app can encode and decode AVx +## video by including 'AOM.framework'. +## +## Run iosbuild.sh to create 'AOM.framework' in the current directory. 
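+##
+## Typical invocations, e.g. from the root of a libaom checkout (both flags
+## are parsed near the bottom of this script):
+##   $ sh build/cmake/iosbuild.sh
+##   $ sh build/cmake/iosbuild.sh --enable-shared --verbose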
+## +set -e +devnull='> /dev/null 2>&1' + +BUILD_ROOT="_iosbuild" +CONFIGURE_ARGS="--disable-docs + --disable-examples + --disable-libyuv + --disable-unit-tests" +DIST_DIR="_dist" +FRAMEWORK_DIR="AOM.framework" +FRAMEWORK_LIB="AOM.framework/AOM" +HEADER_DIR="${FRAMEWORK_DIR}/Headers/aom" +SCRIPT_DIR=$(dirname "$0") +LIBAOM_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd) +LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo) +ORIG_PWD="$(pwd)" +ARM_TARGETS="arm64-darwin-gcc + armv7-darwin-gcc + armv7s-darwin-gcc" +SIM_TARGETS="x86-iphonesimulator-gcc + x86_64-iphonesimulator-gcc" +OSX_TARGETS="x86-darwin16-gcc + x86_64-darwin16-gcc" +TARGETS="${ARM_TARGETS} ${SIM_TARGETS}" + +# Configures for the target specified by $1, and invokes make with the dist +# target using $ as the distribution output directory. +build_target() { + local target="$1" + local old_pwd="$(pwd)" + local target_specific_flags="" + + vlog "***Building target: ${target}***" + + case "${target}" in + x86-*) + target_specific_flags="--enable-pic" + vlog "Enabled PIC for ${target}" + ;; + esac + + mkdir "${target}" + cd "${target}" + # TODO(tomfinegan@google.com): switch to cmake. + eval "${LIBAOM_SOURCE_DIR}/configure" --target="${target}" \ + ${CONFIGURE_ARGS} ${EXTRA_CONFIGURE_ARGS} ${target_specific_flags} \ + ${devnull} + export DIST_DIR + eval make dist ${devnull} + cd "${old_pwd}" + + vlog "***Done building target: ${target}***" +} + +# Returns the preprocessor symbol for the target specified by $1. +target_to_preproc_symbol() { + target="$1" + case "${target}" in + arm64-*) + echo "__aarch64__" + ;; + armv7-*) + echo "__ARM_ARCH_7A__" + ;; + armv7s-*) + echo "__ARM_ARCH_7S__" + ;; + x86-*) + echo "__i386__" + ;; + x86_64-*) + echo "__x86_64__" + ;; + *) + echo "#error ${target} unknown/unsupported" + return 1 + ;; + esac +} + +# Create a aom_config.h shim that, based on preprocessor settings for the +# current target CPU, includes the real aom_config.h for the current target. +# $1 is the list of targets. +create_aom_framework_config_shim() { + local targets="$1" + local config_file="${HEADER_DIR}/aom_config.h" + local preproc_symbol="" + local target="" + local include_guard="AOM_FRAMEWORK_HEADERS_AOM_AOM_CONFIG_H_" + + local file_header="/* + * Copyright (c) $(date +%Y), Alliance for Open Media. All rights reserved. + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* GENERATED FILE: DO NOT EDIT! */ + +#ifndef ${include_guard} +#define ${include_guard} + +#if defined" + + printf "%s" "${file_header}" > "${config_file}" + for target in ${targets}; do + preproc_symbol=$(target_to_preproc_symbol "${target}") + printf " ${preproc_symbol}\n" >> "${config_file}" + printf "#define AOM_FRAMEWORK_TARGET \"${target}\"\n" >> "${config_file}" + printf "#include \"AOM/aom/${target}/aom_config.h\"\n" >> "${config_file}" + printf "#elif defined" >> "${config_file}" + mkdir "${HEADER_DIR}/${target}" + cp -p "${BUILD_ROOT}/${target}/aom_config.h" "${HEADER_DIR}/${target}" + done + + # Consume the last line of output from the loop: We don't want it. 
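+  # Note: the empty string argument after -i is required by the BSD sed that
+  # ships with macOS; GNU sed would take -i without an argument instead.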
+ sed -i '' -e '$d' "${config_file}" + + printf "#endif\n\n" >> "${config_file}" + printf "#endif // ${include_guard}" >> "${config_file}" +} + +# Verifies that $FRAMEWORK_LIB fat library contains requested builds. +verify_framework_targets() { + local requested_cpus="" + local cpu="" + + # Extract CPU from full target name. + for target; do + cpu="${target%%-*}" + if [ "${cpu}" = "x86" ]; then + # lipo -info outputs i386 for libaom x86 targets. + cpu="i386" + fi + requested_cpus="${requested_cpus}${cpu} " + done + + # Get target CPUs present in framework library. + local targets_built=$(${LIPO} -info ${FRAMEWORK_LIB}) + + # $LIPO -info outputs a string like the following: + # Architectures in the fat file: $FRAMEWORK_LIB + # Capture only the architecture strings. + targets_built=${targets_built##*: } + + # Sort CPU strings to make the next step a simple string compare. + local actual=$(echo ${targets_built} | tr " " "\n" | sort | tr "\n" " ") + local requested=$(echo ${requested_cpus} | tr " " "\n" | sort | tr "\n" " ") + + vlog "Requested ${FRAMEWORK_LIB} CPUs: ${requested}" + vlog "Actual ${FRAMEWORK_LIB} CPUs: ${actual}" + + if [ "${requested}" != "${actual}" ]; then + elog "Actual ${FRAMEWORK_LIB} targets do not match requested target list." + elog " Requested target CPUs: ${requested}" + elog " Actual target CPUs: ${actual}" + return 1 + fi +} + +# Configures and builds each target specified by $1, and then builds +# AOM.framework. +build_framework() { + local lib_list="" + local targets="$1" + local target="" + local target_dist_dir="" + + # Clean up from previous build(s). + rm -rf "${BUILD_ROOT}" "${FRAMEWORK_DIR}" + + # Create output dirs. + mkdir -p "${BUILD_ROOT}" + mkdir -p "${HEADER_DIR}" + + cd "${BUILD_ROOT}" + + for target in ${targets}; do + build_target "${target}" + target_dist_dir="${BUILD_ROOT}/${target}/${DIST_DIR}" + if [ "${ENABLE_SHARED}" = "yes" ]; then + local suffix="dylib" + else + local suffix="a" + fi + lib_list="${lib_list} ${target_dist_dir}/lib/libaom.${suffix}" + done + + cd "${ORIG_PWD}" + + # The basic libaom API includes are all the same; just grab the most recent + # set. + cp -p "${target_dist_dir}"/include/aom/* "${HEADER_DIR}" + + # Build the fat library. + ${LIPO} -create ${lib_list} -output ${FRAMEWORK_DIR}/AOM + + # Create the aom_config.h shim that allows usage of aom_config.h from + # within AOM.framework. + create_aom_framework_config_shim "${targets}" + + # Copy in aom_version.h. + cp -p "${BUILD_ROOT}/${target}/aom_version.h" "${HEADER_DIR}" + + if [ "${ENABLE_SHARED}" = "yes" ]; then + # Adjust the dylib's name so dynamic linking in apps works as expected. + install_name_tool -id '@rpath/AOM.framework/AOM' ${FRAMEWORK_DIR}/AOM + + # Copy in Info.plist. + cat "${SCRIPT_DIR}/ios-Info.plist" \ + | sed "s/\${FULLVERSION}/${FULLVERSION}/g" \ + | sed "s/\${VERSION}/${VERSION}/g" \ + | sed "s/\${IOS_VERSION_MIN}/${IOS_VERSION_MIN}/g" \ + > "${FRAMEWORK_DIR}/Info.plist" + fi + + # Confirm AOM.framework/AOM contains the targets requested. + verify_framework_targets ${targets} + + vlog "Created fat library ${FRAMEWORK_LIB} containing:" + for lib in ${lib_list}; do + vlog " $(echo ${lib} | awk -F / '{print $2, $NF}')" + done +} + +# Trap function. Cleans up the subtree used to build all targets contained in +# $TARGETS. +cleanup() { + local res=$? 
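+  # BUILD_ROOT is a relative path, so return to the directory the build
+  # started from before removing it below.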
+ cd "${ORIG_PWD}" + + if [ $res -ne 0 ]; then + elog "build exited with error ($res)" + fi + + if [ "${PRESERVE_BUILD_OUTPUT}" != "yes" ]; then + rm -rf "${BUILD_ROOT}" + fi +} + +print_list() { + local indent="$1" + shift + local list="$@" + for entry in ${list}; do + echo "${indent}${entry}" + done +} + +iosbuild_usage() { +cat << EOF + Usage: ${0##*/} [arguments] + --help: Display this message and exit. + --enable-shared: Build a dynamic framework for use on iOS 8 or later. + --extra-configure-args : Extra args to pass when configuring libaom. + --macosx: Uses darwin16 targets instead of iphonesimulator targets for x86 + and x86_64. Allows linking to framework when builds target MacOSX + instead of iOS. + --preserve-build-output: Do not delete the build directory. + --show-build-output: Show output from each library build. + --targets : Override default target list. Defaults: +$(print_list " " ${TARGETS}) + --test-link: Confirms all targets can be linked. Functionally identical to + passing --enable-examples via --extra-configure-args. + --verbose: Output information about the environment and each stage of the + build. +EOF +} + +elog() { + echo "${0##*/} failed because: $@" 1>&2 +} + +vlog() { + if [ "${VERBOSE}" = "yes" ]; then + echo "$@" + fi +} + +trap cleanup EXIT + +# Parse the command line. +while [ -n "$1" ]; do + case "$1" in + --extra-configure-args) + EXTRA_CONFIGURE_ARGS="$2" + shift + ;; + --help) + iosbuild_usage + exit + ;; + --enable-shared) + ENABLE_SHARED=yes + ;; + --preserve-build-output) + PRESERVE_BUILD_OUTPUT=yes + ;; + --show-build-output) + devnull= + ;; + --test-link) + EXTRA_CONFIGURE_ARGS="${EXTRA_CONFIGURE_ARGS} --enable-examples" + ;; + --targets) + TARGETS="$2" + shift + ;; + --macosx) + TARGETS="${ARM_TARGETS} ${OSX_TARGETS}" + ;; + --verbose) + VERBOSE=yes + ;; + *) + iosbuild_usage + exit 1 + ;; + esac + shift +done + +if [ "${ENABLE_SHARED}" = "yes" ]; then + CONFIGURE_ARGS="--enable-shared ${CONFIGURE_ARGS}" +fi + +FULLVERSION=$("${SCRIPT_DIR}"/version.sh --bare "${LIBAOM_SOURCE_DIR}") +VERSION=$(echo "${FULLVERSION}" | sed -E 's/^v([0-9]+\.[0-9]+\.[0-9]+).*$/\1/') + +if [ "$ENABLE_SHARED" = "yes" ]; then + IOS_VERSION_OPTIONS="--enable-shared" + IOS_VERSION_MIN="8.0" +else + IOS_VERSION_OPTIONS="" + IOS_VERSION_MIN="6.0" +fi + +if [ "${VERBOSE}" = "yes" ]; then +cat << EOF + BUILD_ROOT=${BUILD_ROOT} + DIST_DIR=${DIST_DIR} + CONFIGURE_ARGS=${CONFIGURE_ARGS} + EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS} + FRAMEWORK_DIR=${FRAMEWORK_DIR} + FRAMEWORK_LIB=${FRAMEWORK_LIB} + HEADER_DIR=${HEADER_DIR} + LIBAOM_SOURCE_DIR=${LIBAOM_SOURCE_DIR} + LIPO=${LIPO} + MAKEFLAGS=${MAKEFLAGS} + ORIG_PWD=${ORIG_PWD} + PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT} + TARGETS="$(print_list "" ${TARGETS})" + ENABLE_SHARED=${ENABLE_SHARED} + OSX_TARGETS="${OSX_TARGETS}" + SIM_TARGETS="${SIM_TARGETS}" + SCRIPT_DIR="${SCRIPT_DIR}" + FULLVERSION="${FULLVERSION}" + VERSION="${VERSION}" + IOS_VERSION_MIN="${IOS_VERSION_MIN}" +EOF +fi + +build_framework "${TARGETS}" +echo "Successfully built '${FRAMEWORK_DIR}' for:" +print_list "" ${TARGETS} diff --git a/libs/libaom/src/build/cmake/msvc_runtime.cmake b/libs/libaom/src/build/cmake/msvc_runtime.cmake new file mode 100644 index 000000000..9e4cbea43 --- /dev/null +++ b/libs/libaom/src/build/cmake/msvc_runtime.cmake @@ -0,0 +1,37 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_ +set(AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_ 1) + +if(MSVC) + + # CMake defaults to producing code linked to the DLL MSVC runtime. That will + # not work with googletest, and isn't what we want anyway. + if(NOT "${MSVC_RUNTIME}" STREQUAL "dll") + foreach(flag_var + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL + CMAKE_C_FLAGS_RELWITHDEBINFO + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL + CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) + endif() +endif() diff --git a/libs/libaom/src/build/cmake/pkg_config.cmake b/libs/libaom/src/build/cmake/pkg_config.cmake new file mode 100644 index 000000000..c3914d79e --- /dev/null +++ b/libs/libaom/src/build/cmake/pkg_config.cmake @@ -0,0 +1,62 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +cmake_minimum_required(VERSION 3.5) + +set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "CMAKE_INSTALL_PREFIX" + "CMAKE_INSTALL_BINDIR" "CMAKE_INSTALL_INCLUDEDIR" + "CMAKE_INSTALL_LIBDIR" "CMAKE_PROJECT_NAME" + "CONFIG_MULTITHREAD" "HAVE_PTHREAD_H") + +foreach(arg ${REQUIRED_ARGS}) + if("${${arg}}" STREQUAL "") + message(FATAL_ERROR "${arg} must not be empty.") + endif() +endforeach() + +include("${AOM_ROOT}/build/cmake/util.cmake") + +extract_version_string("${AOM_CONFIG_DIR}/config/aom_version.h" aom_version) + +# Create a version string suitable for comparison using the RPM version compare +# algorithm: strip out everything after the number. +string(FIND "${aom_version}" "-" dash_pos) +if(${dash_pos} EQUAL -1) + set(package_version "${aom_version}") +else() + string(SUBSTRING "${aom_version}" 0 ${dash_pos} package_version) +endif() + +# Write pkg-config info. 
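+# For a default Unix configuration the file written below comes out roughly
+# like this (the version shown is only an illustration, and Libs.private
+# drops -lpthread when multithreading is off):
+#   prefix=/usr/local
+#   exec_prefix=${prefix}
+#   includedir=${prefix}/include
+#   libdir=${exec_prefix}/lib
+#
+#   Name: aom
+#   Description: Alliance for Open Media AV1 codec library v3.1.2.
+#   Version: 3.1.2
+#   Requires:
+#   Conflicts:
+#   Libs: -L${libdir} -laom
+#   Libs.private: -lm -lpthread
+#   Cflags: -I${includedir}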
+set(prefix "${CMAKE_INSTALL_PREFIX}")
+set(bindir "${CMAKE_INSTALL_BINDIR}")
+set(includedir "${CMAKE_INSTALL_INCLUDEDIR}")
+set(libdir "${CMAKE_INSTALL_LIBDIR}")
+set(pkgconfig_file "${AOM_CONFIG_DIR}/aom.pc")
+string(TOLOWER ${CMAKE_PROJECT_NAME} pkg_name)
+file(WRITE "${pkgconfig_file}" "# libaom pkg-config.\n")
+file(APPEND "${pkgconfig_file}" "prefix=${prefix}\n")
+file(APPEND "${pkgconfig_file}" "exec_prefix=\${prefix}\n")
+file(APPEND "${pkgconfig_file}" "includedir=\${prefix}/${includedir}\n")
+file(APPEND "${pkgconfig_file}" "libdir=\${exec_prefix}/${libdir}\n\n")
+file(APPEND "${pkgconfig_file}" "Name: ${pkg_name}\n")
+file(
+  APPEND "${pkgconfig_file}"
+  "Description: Alliance for Open Media AV1 codec library v${aom_version}.\n")
+file(APPEND "${pkgconfig_file}" "Version: ${package_version}\n")
+file(APPEND "${pkgconfig_file}" "Requires:\n")
+file(APPEND "${pkgconfig_file}" "Conflicts:\n")
+file(APPEND "${pkgconfig_file}" "Libs: -L\${libdir} -l${pkg_name}\n")
+if(CONFIG_MULTITHREAD AND HAVE_PTHREAD_H)
+  file(APPEND "${pkgconfig_file}" "Libs.private: -lm -lpthread\n")
+else()
+  file(APPEND "${pkgconfig_file}" "Libs.private: -lm\n")
+endif()
+file(APPEND "${pkgconfig_file}" "Cflags: -I\${includedir}\n")
diff --git a/libs/libaom/src/build/cmake/rtcd.pl b/libs/libaom/src/build/cmake/rtcd.pl
new file mode 100644
index 000000000..dafccdca9
--- /dev/null
+++ b/libs/libaom/src/build/cmake/rtcd.pl
@@ -0,0 +1,467 @@
+#!/usr/bin/env perl
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+no strict 'refs';
+use warnings;
+use Getopt::Long;
+Getopt::Long::Configure("auto_help") if $Getopt::Long::VERSION > 2.32;
+
+my %ALL_FUNCS = ();
+my @ALL_ARCHS;
+my @ALL_FORWARD_DECLS;
+my @REQUIRES;
+
+my %opts = ();
+my %disabled = ();
+my %required = ();
+
+my @argv;
+foreach (@ARGV) {
+  $disabled{$1} = 1, next if /--disable-(.*)/;
+  $required{$1} = 1, next if /--require-(.*)/;
+  push @argv, $_;
+}
+
+# NB: use GetOptions() instead of GetOptionsFromArray() for compatibility.
+@ARGV = @argv;
+GetOptions(
+  \%opts,
+  'arch=s',
+  'sym=s',
+  'config=s',
+);
+
+foreach my $opt (qw/arch config/) {
+  if (!defined($opts{$opt})) {
+    warn "--$opt is required!\n";
+    Getopt::Long::HelpMessage('-exit' => 1);
+  }
+}
+
+foreach my $defs_file (@ARGV) {
+  if (!-f $defs_file) {
+    warn "$defs_file: $!\n";
+    Getopt::Long::HelpMessage('-exit' => 1);
+  }
+}
+
+open CONFIG_FILE, $opts{config} or
+  die "Error opening config file '$opts{config}': $!\n";
+
+my %config = ();
+while (<CONFIG_FILE>) {
+  next if !/^#define\s+(?:CONFIG_|HAVE_)/;
+  chomp;
+  my @line_components = split /\s/;
+  scalar @line_components > 2 or
+    die "Invalid input passed to rtcd.pl via $opts{config}.";
+  # $line_components[0] = #define
+  # $line_components[1] = flag name (CONFIG_SOMETHING or HAVE_SOMETHING)
+  # $line_components[2] = flag value (0 or 1)
+  $config{$line_components[1]} = "$line_components[2]" eq "1" ? "yes" : "";
+}
+close CONFIG_FILE;
+
+#
+# Routines for the RTCD DSL to call
+#
+sub aom_config($) {
+  return (defined $config{$_[0]}) ?
+  return (defined $config{$_[0]}) ? $config{$_[0]} : "";
+}
+
+sub specialize {
+  if (@_ <= 1) {
+    die "'specialize' must be called with a function name and at least one ",
+        "architecture ('C' is implied): \n@_\n";
+  }
+  my $fn=$_[0];
+  shift;
+  foreach my $opt (@_) {
+    eval "\$${fn}_${opt}=${fn}_${opt}";
+  }
+}
+
+sub add_proto {
+  my $fn = splice(@_, -2, 1);
+  $ALL_FUNCS{$fn} = \@_;
+  specialize $fn, "c";
+}
+
+sub require {
+  foreach my $fn (keys %ALL_FUNCS) {
+    foreach my $opt (@_) {
+      my $ofn = eval "\$${fn}_${opt}";
+      next if !$ofn;
+
+      # if we already have a default, then we can disable it, as we know
+      # we can do better.
+      my $best = eval "\$${fn}_default";
+      if ($best) {
+        my $best_ofn = eval "\$${best}";
+        if ($best_ofn && "$best_ofn" ne "$ofn") {
+          eval "\$${best}_link = 'false'";
+        }
+      }
+      eval "\$${fn}_default=${fn}_${opt}";
+      eval "\$${fn}_${opt}_link='true'";
+    }
+  }
+}
+
+sub forward_decls {
+  push @ALL_FORWARD_DECLS, @_;
+}
+
+#
+# Include the user's directives
+#
+foreach my $f (@ARGV) {
+  open FILE, "<", $f or die "cannot open $f: $!\n";
+  my $contents = join('', <FILE>);
+  close FILE;
+  eval $contents or warn "eval failed: $@\n";
+}
+
+#
+# Process the directives according to the command line
+#
+sub process_forward_decls() {
+  foreach (@ALL_FORWARD_DECLS) {
+    $_->();
+  }
+}
+
+sub determine_indirection {
+  aom_config("CONFIG_RUNTIME_CPU_DETECT") eq "yes" or &require(@ALL_ARCHS);
+  foreach my $fn (keys %ALL_FUNCS) {
+    my $n = "";
+    my @val = @{$ALL_FUNCS{$fn}};
+    my $args = pop @val;
+    my $rtyp = "@val";
+    my $dfn = eval "\$${fn}_default";
+    $dfn = eval "\$${dfn}";
+    foreach my $opt (@_) {
+      my $ofn = eval "\$${fn}_${opt}";
+      next if !$ofn;
+      my $link = eval "\$${fn}_${opt}_link";
+      next if $link && $link eq "false";
+      $n .= "x";
+    }
+    if ($n eq "x") {
+      eval "\$${fn}_indirect = 'false'";
+    } else {
+      eval "\$${fn}_indirect = 'true'";
+    }
+  }
+}
+
+sub declare_function_pointers {
+  foreach my $fn (sort keys %ALL_FUNCS) {
+    my @val = @{$ALL_FUNCS{$fn}};
+    my $args = pop @val;
+    my $rtyp = "@val";
+    my $dfn = eval "\$${fn}_default";
+    $dfn = eval "\$${dfn}";
+    foreach my $opt (@_) {
+      my $ofn = eval "\$${fn}_${opt}";
+      next if !$ofn;
+      print "$rtyp ${ofn}($args);\n";
+    }
+    if (eval "\$${fn}_indirect" eq "false") {
+      print "#define ${fn} ${dfn}\n";
+    } else {
+      print "RTCD_EXTERN $rtyp (*${fn})($args);\n";
+    }
+    print "\n";
+  }
+}
+
+sub set_function_pointers {
+  foreach my $fn (sort keys %ALL_FUNCS) {
+    my @val = @{$ALL_FUNCS{$fn}};
+    my $args = pop @val;
+    my $rtyp = "@val";
+    my $dfn = eval "\$${fn}_default";
+    $dfn = eval "\$${dfn}";
+    if (eval "\$${fn}_indirect" eq "true") {
+      print "  $fn = $dfn;\n";
+      foreach my $opt (@_) {
+        my $ofn = eval "\$${fn}_${opt}";
+        next if !$ofn;
+        next if "$ofn" eq "$dfn";
+        my $link = eval "\$${fn}_${opt}_link";
+        next if $link && $link eq "false";
+        my $cond = eval "\$have_${opt}";
+        print "  if (${cond}) $fn = $ofn;\n"
+      }
+    }
+  }
+}
+
+sub filter {
+  my @filtered;
+  foreach (@_) { push @filtered, $_ unless $disabled{$_}; }
+  return @filtered;
+}
+
+#
+# Helper functions for generating the arch specific RTCD files
+#
+sub common_top() {
+  my $include_guard = uc($opts{sym})."_H_";
+  print < \$version_data,
+  'version_filename=s' => \$version_filename) or
+  die("Invalid arg(s): $!");
+
+if (!defined $version_data || length($version_data) == 0 ||
+    !defined $version_filename || length($version_filename) == 0) {
+  die("--version_data and --version_filename are required.");
+}
+
+# Determine if $version_data is a filename or a git tag/description.
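+# For example (illustrative values only): a bare tag "v2.0.0" yields
+# $version_string "2.0.0" with an empty VERSION_EXTRA, while a full git
+# description such as "v2.0.0-140-gf60644c" splits into version "2.0.0"
+# plus VERSION_EXTRA "140-gf60644c".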
+my $version_string; +chomp($version_data); +if (-r $version_data) { + # $version_data is the path to the CHANGELOG. Parse the most recent version. + my $changelog_filename = $version_data; + open(my $changelog_file, '<', $changelog_filename) or + die("Unable to open CHANGELOG @ $changelog_filename: $!."); + + while (my $line = <$changelog_file>) { + my @split_line = split(" ", $line, 3); + next if @split_line < 2; + $version_string = $split_line[1]; + last if substr($version_string, 0, 1) eq "v"; + } + close($changelog_file); +} else { + # $version_data is either a tag name or a full git description, one of: + # tagName OR tagName-commitsSinceTag-shortCommitHash + # In either case we want the first element of the array returned by split. + $version_string = (split("-", $version_data))[0]; + $git_desc = $version_data; +} + +if (substr($version_string, 0, 1) eq "v") { + $version_string = substr($version_string, 1); +} + +my @version_components = split('\.', $version_string, 4); +my $version_major = $version_components[0]; +my $version_minor = $version_components[1]; +my $version_patch = $version_components[2]; + +my $version_extra = ""; +if (length($git_desc) > 0) { + my @git_desc_components = split('-', $git_desc, 2); + $version_extra = $git_desc_components[1]; +} + +open(my $version_file, '>', $version_filename) or + die("Cannot open $version_filename: $!"); + +my $version_packed = "((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))"; +my $year = (localtime)[5] + 1900; +my $lic_block = << "EOF"; +/* + * Copyright (c) $year, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +EOF + +select $version_file; +if (length($git_desc)) { + print << "EOF"; +$lic_block +#define VERSION_MAJOR $version_major +#define VERSION_MINOR $version_minor +#define VERSION_PATCH $version_patch +#define VERSION_EXTRA \"$version_extra\" +#define VERSION_PACKED \\ + $version_packed +#define VERSION_STRING_NOSP \"$git_desc\" +#define VERSION_STRING \" $git_desc\" +EOF +} else { + print << "EOF"; +$lic_block +#define VERSION_MAJOR $version_major +#define VERSION_MINOR $version_minor +#define VERSION_PATCH $version_patch +#define VERSION_EXTRA \"$version_extra\" +#define VERSION_PACKED \\ + $version_packed +#define VERSION_STRING_NOSP \"v$version_string\" +#define VERSION_STRING \" v$version_string\" +EOF +} +close($version_file); diff --git a/libs/libaom/src/codereview.settings b/libs/libaom/src/codereview.settings new file mode 100644 index 000000000..185e9344c --- /dev/null +++ b/libs/libaom/src/codereview.settings @@ -0,0 +1,4 @@ +# This file is used by git cl to get repository specific information. +GERRIT_HOST: True +CODE_REVIEW_SERVER: aomedia-review.googlesource.com +GERRIT_SQUASH_UPLOADS: False diff --git a/libs/libaom/src/common/args.c b/libs/libaom/src/common/args.c new file mode 100644 index 000000000..ec2a86353 --- /dev/null +++ b/libs/libaom/src/common/args.c @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/args.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/msvc.h"
+#include "aom/aom_codec.h"
+
+#if defined(__GNUC__) && __GNUC__
+extern void die(const char *fmt, ...) __attribute__((noreturn));
+#else
+extern void die(const char *fmt, ...);
+#endif
+
+struct arg arg_init(char **argv) {
+  struct arg a;
+
+  a.argv = argv;
+  a.argv_step = 1;
+  a.name = NULL;
+  a.val = NULL;
+  a.def = NULL;
+  return a;
+}
+
+char *ignore_front_spaces(const char *str) {
+  while (str[0] == ' ' || str[0] == '\t') ++str;
+  return (char *)str;
+}
+
+void ignore_end_spaces(char *str) {
+  char *end = str + strlen(str);
+  while (end > str && (end[0] == ' ' || end[0] == '\t' || end[0] == '\n' ||
+                       end[0] == '\r' || end[0] == '\0'))
+    --end;
+  if (end >= str) end[1] = '\0';
+}
+
+static const char kSbSizeWarningString[] =
+    "super_block_size has to be 64 or 128.";
+static const char kMinpartWarningString[] =
+    "min_partition_size has to be smaller or equal to max_partition_size.";
+static const char kMaxpartWarningString[] =
+    "max_partition_size has to be smaller or equal to super_block_size.";
+
+int parse_cfg(const char *file, cfg_options_t *config) {
+  char line[1024 * 10];
+  FILE *f = fopen(file, "r");
+  if (!f) return 1;
+
+#define GET_PARAMS(field) \
+  if (strcmp(left, #field) == 0) { \
+    config->field = atoi(right); \
+    continue; \
+  }
+
+  while (fgets(line, sizeof(line) - 1, f)) {
+    char *actual_line = ignore_front_spaces(line);
+    char *left, *right, *comment;
+    size_t length = strlen(actual_line);
+
+    if (length == 0 || actual_line[0] == '#') continue;
+    right = strchr(actual_line, '=');
+    if (right == NULL) continue;
+    right[0] = '\0';
+
+    left = ignore_front_spaces(actual_line);
+    right = ignore_front_spaces(right + 1);
+
+    comment = strchr(right, '#');
+    if (comment != NULL) comment[0] = '\0';
+
+    ignore_end_spaces(left);
+    ignore_end_spaces(right);
+
+    GET_PARAMS(super_block_size);
+    GET_PARAMS(max_partition_size);
+    GET_PARAMS(min_partition_size);
+    GET_PARAMS(disable_ab_partition_type);
+    GET_PARAMS(disable_rect_partition_type);
+    GET_PARAMS(disable_1to4_partition_type);
+    GET_PARAMS(disable_flip_idtx);
+    GET_PARAMS(disable_cdef);
+    GET_PARAMS(disable_lr);
+    GET_PARAMS(disable_obmc);
+    GET_PARAMS(disable_warp_motion);
+    GET_PARAMS(disable_global_motion);
+    GET_PARAMS(disable_dist_wtd_comp);
+    GET_PARAMS(disable_diff_wtd_comp);
+    GET_PARAMS(disable_inter_intra_comp);
+    GET_PARAMS(disable_masked_comp);
+    GET_PARAMS(disable_one_sided_comp);
+    GET_PARAMS(disable_palette);
+    GET_PARAMS(disable_intrabc);
+    GET_PARAMS(disable_cfl);
+    GET_PARAMS(disable_smooth_intra);
+    GET_PARAMS(disable_filter_intra);
+    GET_PARAMS(disable_dual_filter);
+    GET_PARAMS(disable_intra_angle_delta);
+    GET_PARAMS(disable_intra_edge_filter);
+    GET_PARAMS(disable_tx_64x64);
+    GET_PARAMS(disable_smooth_inter_intra);
+    GET_PARAMS(disable_inter_inter_wedge);
+    GET_PARAMS(disable_inter_intra_wedge);
+    GET_PARAMS(disable_paeth_intra);
+    GET_PARAMS(disable_trellis_quant);
+    GET_PARAMS(disable_ref_frame_mv);
+
GET_PARAMS(reduced_reference_set); + GET_PARAMS(reduced_tx_type_set); + + fprintf(stderr, "\nInvalid parameter: %s", left); + exit(-1); + } + + if (config->super_block_size != 128 && config->super_block_size != 64) { + fprintf(stderr, "\n%s", kSbSizeWarningString); + exit(-1); + } + if (config->min_partition_size > config->max_partition_size) { + fprintf(stderr, "\n%s", kMinpartWarningString); + exit(-1); + } + if (config->max_partition_size > config->super_block_size) { + fprintf(stderr, "\n%s", kMaxpartWarningString); + exit(-1); + } + + fclose(f); + config->init_by_cfg_file = 1; + + return 0; +} + +int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) { + struct arg arg; + + if (!argv[0] || argv[0][0] != '-') return 0; + + arg = arg_init(argv); + + if (def->short_name && strlen(arg.argv[0]) == strlen(def->short_name) + 1 && + !strcmp(arg.argv[0] + 1, def->short_name)) { + arg.name = arg.argv[0] + 1; + arg.val = def->has_val ? arg.argv[1] : NULL; + arg.argv_step = def->has_val ? 2 : 1; + } else if (def->long_name) { + const size_t name_len = strlen(def->long_name); + + if (strlen(arg.argv[0]) >= name_len + 2 && arg.argv[0][1] == '-' && + !strncmp(arg.argv[0] + 2, def->long_name, name_len) && + (arg.argv[0][name_len + 2] == '=' || + arg.argv[0][name_len + 2] == '\0')) { + arg.name = arg.argv[0] + 2; + arg.val = arg.name[name_len] == '=' ? arg.name + name_len + 1 : NULL; + arg.argv_step = 1; + } + } + + if (arg.name && !arg.val && def->has_val) + die("Error: option %s requires argument.\n", arg.name); + + if (arg.name && arg.val && !def->has_val) + die("Error: option %s requires no argument.\n", arg.name); + + if (arg.name && (arg.val || !def->has_val)) { + arg.def = def; + *arg_ = arg; + return 1; + } + + return 0; +} + +const char *arg_next(struct arg *arg) { + if (arg->argv[0]) arg->argv += arg->argv_step; + + return *arg->argv; +} + +char **argv_dup(int argc, const char **argv) { + char **new_argv = malloc((argc + 1) * sizeof(*argv)); + + memcpy(new_argv, argv, argc * sizeof(*argv)); + new_argv[argc] = NULL; + return new_argv; +} + +void arg_show_usage(FILE *fp, const struct arg_def *const *defs) { + char option_text[40] = { 0 }; + + for (; *defs; defs++) { + const struct arg_def *def = *defs; + char *short_val = def->has_val ? " " : ""; + char *long_val = def->has_val ? "=" : ""; + + if (def->short_name && def->long_name) { + char *comma = def->has_val ? "," : ", "; + + snprintf(option_text, 37, "-%s%s%s --%s%6s", def->short_name, short_val, + comma, def->long_name, long_val); + } else if (def->short_name) + snprintf(option_text, 37, "-%s%s", def->short_name, short_val); + else if (def->long_name) + snprintf(option_text, 37, " --%s%s", def->long_name, long_val); + + fprintf(fp, " %-37s\t%s\n", option_text, def->desc); + + if (def->enums) { + const struct arg_enum_list *listptr; + + fprintf(fp, " %-37s\t ", ""); + + for (listptr = def->enums; listptr->name; listptr++) + fprintf(fp, "%s%s", listptr->name, listptr[1].name ? 
", " : "\n"); + } + } +} + +unsigned int arg_parse_uint(const struct arg *arg) { + char *endptr; + const unsigned long rawval = strtoul(arg->val, &endptr, 10); // NOLINT + + if (arg->val[0] != '\0' && endptr[0] == '\0') { + if (rawval <= UINT_MAX) return (unsigned int)rawval; + + die("Option %s: Value %lu out of range for unsigned int\n", arg->name, + rawval); + } + + die("Option %s: Invalid character '%c'\n", arg->name, *endptr); + return 0; +} + +int arg_parse_int(const struct arg *arg) { + char *endptr; + const long rawval = strtol(arg->val, &endptr, 10); // NOLINT + + if (arg->val[0] != '\0' && endptr[0] == '\0') { + if (rawval >= INT_MIN && rawval <= INT_MAX) return (int)rawval; + + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); + } + + die("Option %s: Invalid character '%c'\n", arg->name, *endptr); + return 0; +} + +struct aom_rational arg_parse_rational(const struct arg *arg) { + long int rawval; + char *endptr; + struct aom_rational rat; + + /* parse numerator */ + rawval = strtol(arg->val, &endptr, 10); + + if (arg->val[0] != '\0' && endptr[0] == '/') { + if (rawval >= INT_MIN && rawval <= INT_MAX) + rat.num = (int)rawval; + else + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); + } else + die("Option %s: Expected / at '%c'\n", arg->name, *endptr); + + /* parse denominator */ + rawval = strtol(endptr + 1, &endptr, 10); + + if (arg->val[0] != '\0' && endptr[0] == '\0') { + if (rawval >= INT_MIN && rawval <= INT_MAX) + rat.den = (int)rawval; + else + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); + } else + die("Option %s: Invalid character '%c'\n", arg->name, *endptr); + + return rat; +} + +int arg_parse_enum(const struct arg *arg) { + const struct arg_enum_list *listptr; + long int rawval; + char *endptr; + + /* First see if the value can be parsed as a raw value */ + rawval = strtol(arg->val, &endptr, 10); + if (arg->val[0] != '\0' && endptr[0] == '\0') { + /* Got a raw value, make sure it's valid */ + for (listptr = arg->def->enums; listptr->name; listptr++) + if (listptr->val == rawval) return (int)rawval; + } + + /* Next see if it can be parsed as a string */ + for (listptr = arg->def->enums; listptr->name; listptr++) + if (!strcmp(arg->val, listptr->name)) return listptr->val; + + die("Option %s: Invalid value '%s'\n", arg->name, arg->val); + return 0; +} + +int arg_parse_enum_or_int(const struct arg *arg) { + if (arg->def->enums) return arg_parse_enum(arg); + return arg_parse_int(arg); +} + +// parse a comma separated list of at most n integers +// return the number of elements in the list +int arg_parse_list(const struct arg *arg, int *list, int n) { + const char *ptr = arg->val; + char *endptr; + int i = 0; + + while (ptr[0] != '\0') { + int32_t rawval = (int32_t)strtol(ptr, &endptr, 10); + if (rawval < INT_MIN || rawval > INT_MAX) { + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); + } else if (i >= n) { + die("Option %s: List has more than %d entries\n", arg->name, n); + } else if (*endptr == ',') { + endptr++; + } else if (*endptr != '\0') { + die("Option %s: Bad list separator '%c'\n", arg->name, *endptr); + } + list[i++] = (int)rawval; + ptr = endptr; + } + return i; +} diff --git a/libs/libaom/src/common/args.h b/libs/libaom/src/common/args.h new file mode 100644 index 000000000..286f7dd1a --- /dev/null +++ b/libs/libaom/src/common/args.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_ARGS_H_
+#define AOM_COMMON_ARGS_H_
+#include <stdio.h>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct arg {
+  char **argv;
+  const char *name;
+  const char *val;
+  unsigned int argv_step;
+  const struct arg_def *def;
+};
+
+struct arg_enum_list {
+  const char *name;
+  int val;
+};
+#define ARG_ENUM_LIST_END \
+  { 0 }
+
+typedef struct arg_def {
+  const char *short_name;
+  const char *long_name;
+  int has_val;
+  const char *desc;
+  const struct arg_enum_list *enums;
+} arg_def_t;
+#define ARG_DEF(s, l, v, d) \
+  { s, l, v, d, NULL }
+#define ARG_DEF_ENUM(s, l, v, d, e) \
+  { s, l, v, d, e }
+#define ARG_DEF_LIST_END \
+  { 0 }
+
+struct arg arg_init(char **argv);
+int arg_match(struct arg *arg_, const struct arg_def *def, char **argv);
+char *ignore_front_spaces(const char *str);
+void ignore_end_spaces(char *str);
+int parse_cfg(const char *file, cfg_options_t *config);
+const char *arg_next(struct arg *arg);
+void arg_show_usage(FILE *fp, const struct arg_def *const *defs);
+char **argv_dup(int argc, const char **argv);
+
+unsigned int arg_parse_uint(const struct arg *arg);
+int arg_parse_int(const struct arg *arg);
+struct aom_rational arg_parse_rational(const struct arg *arg);
+int arg_parse_enum(const struct arg *arg);
+int arg_parse_enum_or_int(const struct arg *arg);
+int arg_parse_list(const struct arg *arg, int *list, int n);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_COMMON_ARGS_H_
diff --git a/libs/libaom/src/common/av1_config.c b/libs/libaom/src/common/av1_config.c
new file mode 100644
index 000000000..9f5b02015
--- /dev/null
+++ b/libs/libaom/src/common/av1_config.c
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdio.h>
+#include <string.h>
+
+#include "aom/aom_image.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "av1/common/obu_util.h"
+#include "common/av1_config.h"
+#include "config/aom_config.h"
+
+// Helper macros to reduce verbosity required to check for read errors.
+//
+// Note that when using these macros, even single line if statements should use
+// curly braces to avoid unexpected behavior because all but the
+// AV1C_POP_ERROR_HANDLER_DATA() macro consist of multiple statements.
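+//
+// An illustrative example of the hazard (not part of the original source):
+//
+//   if (condition) AV1C_READ_BIT_OR_RETURN_ERROR(flag);
+//
+// expands to a declaration plus a separate do/while statement, so only the
+// declaration would be guarded by the if; wrapping such uses in braces keeps
+// the whole expansion inside the conditional.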
+#define AV1C_READ_BIT_OR_RETURN_ERROR(field) \ + int field = 0; \ + do { \ + field = aom_rb_read_bit(reader); \ + if (result == -1) { \ + fprintf(stderr, \ + "av1c: Error reading bit for " #field ", value=%d result=%d.\n", \ + field, result); \ + return -1; \ + } \ + } while (0) + +#define AV1C_READ_BITS_OR_RETURN_ERROR(field, length) \ + int field = 0; \ + do { \ + field = aom_rb_read_literal(reader, (length)); \ + if (result == -1) { \ + fprintf(stderr, \ + "av1c: Could not read bits for " #field \ + ", value=%d result=%d.\n", \ + field, result); \ + return -1; \ + } \ + } while (0) + +// Helper macros for setting/restoring the error handler data in +// aom_read_bit_buffer. +#define AV1C_PUSH_ERROR_HANDLER_DATA(new_data) \ + void *original_error_handler_data = NULL; \ + do { \ + original_error_handler_data = reader->error_handler_data; \ + reader->error_handler_data = &new_data; \ + } while (0) + +#define AV1C_POP_ERROR_HANDLER_DATA() \ + do { \ + reader->error_handler_data = original_error_handler_data; \ + } while (0) + +static const size_t kAv1cSize = 4; + +static void bitreader_error_handler(void *data) { + int *error_val = (int *)data; + *error_val = -1; +} + +// Parse the AV1 timing_info() structure: +// timing_info( ) { +// num_units_in_display_tick f(32) +// time_scale f(32) +// equal_picture_interval f(1) +// if (equal_picture_interval) +// num_ticks_per_picture_minus_1 uvlc() +// } +static int parse_timing_info(struct aom_read_bit_buffer *reader) { + int result = 0; + AV1C_PUSH_ERROR_HANDLER_DATA(result); + + AV1C_READ_BITS_OR_RETURN_ERROR(num_units_in_display_tick, 32); + AV1C_READ_BITS_OR_RETURN_ERROR(time_scale, 32); + + AV1C_READ_BIT_OR_RETURN_ERROR(equal_picture_interval); + if (equal_picture_interval) { + uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(reader); + if (result == -1) { + fprintf(stderr, + "av1c: Could not read bits for " + "num_ticks_per_picture_minus_1, value=%u.\n", + num_ticks_per_picture_minus_1); + return result; + } + } + + AV1C_POP_ERROR_HANDLER_DATA(); + return result; +} + +// Parse the AV1 decoder_model_info() structure: +// decoder_model_info( ) { +// buffer_delay_length_minus_1 f(5) +// num_units_in_decoding_tick f(32) +// buffer_removal_time_length_minus_1 f(5) +// frame_presentation_time_length_minus_1 f(5) +// } +// +// Returns -1 upon failure, or the value of buffer_delay_length_minus_1 + 1. 
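+// Since buffer_delay_length_minus_1 is a 5-bit field (0..31), a successful
+// return value always lies in 1..32 and cannot collide with the -1 error
+// value.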
+static int parse_decoder_model_info(struct aom_read_bit_buffer *reader) {
+  int result = 0;
+  AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+  AV1C_READ_BITS_OR_RETURN_ERROR(buffer_delay_length_minus_1, 5);
+  AV1C_READ_BITS_OR_RETURN_ERROR(num_units_in_decoding_tick, 32);
+  AV1C_READ_BITS_OR_RETURN_ERROR(buffer_removal_time_length_minus_1, 5);
+  AV1C_READ_BITS_OR_RETURN_ERROR(frame_presentation_time_length_minus_1, 5);
+
+  AV1C_POP_ERROR_HANDLER_DATA();
+  return buffer_delay_length_minus_1 + 1;
+}
+
+// Parse the AV1 operating_parameters_info() structure:
+//   operating_parameters_info( op ) {
+//     n = buffer_delay_length_minus_1 + 1
+//     decoder_buffer_delay[ op ] f(n)
+//     encoder_buffer_delay[ op ] f(n)
+//     low_delay_mode_flag[ op ] f(1)
+//   }
+static int parse_operating_parameters_info(struct aom_read_bit_buffer *reader,
+                                           int buffer_delay_length_minus_1) {
+  int result = 0;
+  AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+  const int buffer_delay_length = buffer_delay_length_minus_1 + 1;
+  AV1C_READ_BITS_OR_RETURN_ERROR(decoder_buffer_delay, buffer_delay_length);
+  AV1C_READ_BITS_OR_RETURN_ERROR(encoder_buffer_delay, buffer_delay_length);
+  AV1C_READ_BIT_OR_RETURN_ERROR(low_delay_mode_flag);
+
+  AV1C_POP_ERROR_HANDLER_DATA();
+  return result;
+}
+
+// Parse the AV1 color_config() structure. See:
+// https://aomediacodec.github.io/av1-spec/av1-spec.pdf#page=44
+static int parse_color_config(struct aom_read_bit_buffer *reader,
+                              Av1Config *config) {
+  int result = 0;
+  AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+  AV1C_READ_BIT_OR_RETURN_ERROR(high_bitdepth);
+  config->high_bitdepth = high_bitdepth;
+
+  int bit_depth = 0;
+  if (config->seq_profile == 2 && config->high_bitdepth) {
+    AV1C_READ_BIT_OR_RETURN_ERROR(twelve_bit);
+    config->twelve_bit = twelve_bit;
+    bit_depth = config->twelve_bit ? 12 : 10;
+  } else {
+    bit_depth = config->high_bitdepth ?
10 : 8; + } + + if (config->seq_profile != 1) { + AV1C_READ_BIT_OR_RETURN_ERROR(mono_chrome); + config->monochrome = mono_chrome; + } + + int color_primaries = AOM_CICP_CP_UNSPECIFIED; + int transfer_characteristics = AOM_CICP_TC_UNSPECIFIED; + int matrix_coefficients = AOM_CICP_MC_UNSPECIFIED; + + AV1C_READ_BIT_OR_RETURN_ERROR(color_description_present_flag); + if (color_description_present_flag) { + AV1C_READ_BITS_OR_RETURN_ERROR(color_primaries_val, 8); + color_primaries = color_primaries_val; + AV1C_READ_BITS_OR_RETURN_ERROR(transfer_characteristics_val, 8); + transfer_characteristics = transfer_characteristics_val; + AV1C_READ_BITS_OR_RETURN_ERROR(matrix_coefficients_val, 8); + matrix_coefficients = matrix_coefficients_val; + } + + if (config->monochrome) { + AV1C_READ_BIT_OR_RETURN_ERROR(color_range); + config->chroma_subsampling_x = 1; + config->chroma_subsampling_y = 1; + } else if (color_primaries == AOM_CICP_CP_BT_709 && + transfer_characteristics == AOM_CICP_TC_SRGB && + matrix_coefficients == AOM_CICP_MC_IDENTITY) { + config->chroma_subsampling_x = 0; + config->chroma_subsampling_y = 0; + } else { + AV1C_READ_BIT_OR_RETURN_ERROR(color_range); + if (config->seq_profile == 0) { + config->chroma_subsampling_x = 1; + config->chroma_subsampling_y = 1; + } else if (config->seq_profile == 1) { + config->chroma_subsampling_x = 0; + config->chroma_subsampling_y = 0; + } else { + if (bit_depth == 12) { + AV1C_READ_BIT_OR_RETURN_ERROR(subsampling_x); + config->chroma_subsampling_x = subsampling_x; + if (subsampling_x) { + AV1C_READ_BIT_OR_RETURN_ERROR(subsampling_y); + config->chroma_subsampling_y = subsampling_y; + } else { + config->chroma_subsampling_y = 0; + } + } else { + config->chroma_subsampling_x = 1; + config->chroma_subsampling_y = 0; + } + } + + if (config->chroma_subsampling_x && config->chroma_subsampling_y) { + AV1C_READ_BITS_OR_RETURN_ERROR(chroma_sample_position, 2); + config->chroma_sample_position = chroma_sample_position; + } + } + + if (!config->monochrome) { + AV1C_READ_BIT_OR_RETURN_ERROR(separate_uv_delta_q); + } + + AV1C_POP_ERROR_HANDLER_DATA(); + return result; +} + +// Parse AV1 Sequence Header OBU. See: +// https://aomediacodec.github.io/av1-spec/av1-spec.pdf#page=41 +static int parse_sequence_header(const uint8_t *const buffer, size_t length, + Av1Config *config) { + int result = 0; + // The reader instance is local to this function, but a pointer to the + // reader instance is used within this function and throughout this file to + // allow use of the helper macros that reduce parse error checking verbosity. 
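+  // The initializer below fills, in order: the buffer start, the buffer end,
+  // the current bit offset, the error-handler data pointer (the local
+  // 'result' flag), and the error callback (field order per struct
+  // aom_read_bit_buffer in aom_dsp/bitreader_buffer.h).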
+ struct aom_read_bit_buffer reader_instance = { buffer, buffer + length, 0, + &result, + bitreader_error_handler }; + struct aom_read_bit_buffer *reader = &reader_instance; + + AV1C_READ_BITS_OR_RETURN_ERROR(seq_profile, 3); + config->seq_profile = seq_profile; + AV1C_READ_BIT_OR_RETURN_ERROR(still_picture); + AV1C_READ_BIT_OR_RETURN_ERROR(reduced_still_picture_header); + if (reduced_still_picture_header) { + config->initial_presentation_delay_present = 0; + AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx_0, 5); + config->seq_level_idx_0 = seq_level_idx_0; + config->seq_tier_0 = 0; + } else { + int has_decoder_model = 0; + int buffer_delay_length = 0; + + AV1C_READ_BIT_OR_RETURN_ERROR(timing_info_present_flag); + if (timing_info_present_flag) { + if (parse_timing_info(reader) != 0) return -1; + + AV1C_READ_BIT_OR_RETURN_ERROR(decoder_model_info_present_flag); + if (decoder_model_info_present_flag && + (buffer_delay_length = parse_decoder_model_info(reader)) == -1) { + return -1; + } + has_decoder_model = 1; + } + + AV1C_READ_BIT_OR_RETURN_ERROR(initial_presentation_delay_present); + config->initial_presentation_delay_present = + initial_presentation_delay_present; + + AV1C_READ_BITS_OR_RETURN_ERROR(operating_points_cnt_minus_1, 5); + const int num_operating_points = operating_points_cnt_minus_1 + 1; + + for (int op_index = 0; op_index < num_operating_points; ++op_index) { + AV1C_READ_BITS_OR_RETURN_ERROR(operating_point_idc, 12); + AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx, 5); + + int seq_tier = 0; + if (seq_level_idx > 7) { + AV1C_READ_BIT_OR_RETURN_ERROR(seq_tier_this_op); + seq_tier = seq_tier_this_op; + } + + if (has_decoder_model) { + AV1C_READ_BIT_OR_RETURN_ERROR(decoder_model_present_for_op); + if (decoder_model_present_for_op) { + if (parse_operating_parameters_info(reader, buffer_delay_length) == + -1) { + return -1; + } + } + } + + if (config->initial_presentation_delay_present) { + // Skip the initial presentation delay bits if present since this + // function has no access to the data required to properly set the + // field. + AV1C_READ_BIT_OR_RETURN_ERROR( + initial_presentation_delay_present_for_this_op); + if (initial_presentation_delay_present_for_this_op) { + AV1C_READ_BITS_OR_RETURN_ERROR(initial_presentation_delay_minus_1, 4); + } + } + + if (op_index == 0) { + // Av1Config needs only the values from the first operating point. 
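+      // For example, even when operating_points_cnt_minus_1 is nonzero, the
+      // level and tier stored below come from op_index 0 only; later
+      // operating points are parsed solely to keep the bit reader in sync.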
+ config->seq_level_idx_0 = seq_level_idx; + config->seq_tier_0 = seq_tier; + config->initial_presentation_delay_present = 0; + config->initial_presentation_delay_minus_one = 0; + } + } + } + + AV1C_READ_BITS_OR_RETURN_ERROR(frame_width_bits_minus_1, 4); + AV1C_READ_BITS_OR_RETURN_ERROR(frame_height_bits_minus_1, 4); + AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_width_minus_1, + frame_width_bits_minus_1 + 1); + AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_height_minus_1, + frame_height_bits_minus_1 + 1); + + uint8_t frame_id_numbers_present = 0; + if (!reduced_still_picture_header) { + AV1C_READ_BIT_OR_RETURN_ERROR(frame_id_numbers_present_flag); + frame_id_numbers_present = frame_id_numbers_present_flag; + } + + if (frame_id_numbers_present) { + AV1C_READ_BITS_OR_RETURN_ERROR(delta_frame_id_length_minus_2, 4); + AV1C_READ_BITS_OR_RETURN_ERROR(additional_frame_id_length_minus_1, 3); + } + + AV1C_READ_BIT_OR_RETURN_ERROR(use_128x128_superblock); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_filter_intra); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_intra_edge_filter); + + if (!reduced_still_picture_header) { + AV1C_READ_BIT_OR_RETURN_ERROR(enable_interintra_compound); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_masked_compound); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_warped_motion); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_dual_filter); + + AV1C_READ_BIT_OR_RETURN_ERROR(enable_order_hint); + if (enable_order_hint) { + AV1C_READ_BIT_OR_RETURN_ERROR(enable_dist_wtd_comp); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_ref_frame_mvs); + } + + const int SELECT_SCREEN_CONTENT_TOOLS = 2; + int seq_force_screen_content_tools = SELECT_SCREEN_CONTENT_TOOLS; + AV1C_READ_BIT_OR_RETURN_ERROR(seq_choose_screen_content_tools); + if (!seq_choose_screen_content_tools) { + AV1C_READ_BIT_OR_RETURN_ERROR(seq_force_screen_content_tools_val); + seq_force_screen_content_tools = seq_force_screen_content_tools_val; + } + + if (seq_force_screen_content_tools > 0) { + AV1C_READ_BIT_OR_RETURN_ERROR(seq_choose_integer_mv); + + if (!seq_choose_integer_mv) { + AV1C_READ_BIT_OR_RETURN_ERROR(seq_force_integer_mv); + } + } + + if (enable_order_hint) { + AV1C_READ_BITS_OR_RETURN_ERROR(order_hint_bits_minus_1, 3); + } + } + + AV1C_READ_BIT_OR_RETURN_ERROR(enable_superres); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_cdef); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_restoration); + + if (parse_color_config(reader, config) != 0) { + fprintf(stderr, "av1c: color_config() parse failed.\n"); + return -1; + } + + AV1C_READ_BIT_OR_RETURN_ERROR(film_grain_params_present); + return 0; +} + +int get_av1config_from_obu(const uint8_t *buffer, size_t length, int is_annexb, + Av1Config *config) { + if (!buffer || length == 0 || !config) { + return -1; + } + + ObuHeader obu_header; + memset(&obu_header, 0, sizeof(obu_header)); + + size_t sequence_header_length = 0; + size_t obu_header_length = 0; + if (aom_read_obu_header_and_size(buffer, length, is_annexb, &obu_header, + &sequence_header_length, + &obu_header_length) != AOM_CODEC_OK || + obu_header.type != OBU_SEQUENCE_HEADER || + sequence_header_length + obu_header_length > length) { + return -1; + } + + memset(config, 0, sizeof(*config)); + config->marker = 1; + config->version = 1; + return parse_sequence_header(buffer + obu_header_length, + sequence_header_length, config); +} + +int read_av1config(const uint8_t *buffer, size_t buffer_length, + size_t *bytes_read, Av1Config *config) { + if (!buffer || buffer_length < kAv1cSize || !bytes_read || !config) return -1; + + *bytes_read = 0; + + int result = 0; + struct 
aom_read_bit_buffer reader_instance = { buffer, buffer + buffer_length, + 0, &result, + bitreader_error_handler }; + struct aom_read_bit_buffer *reader = &reader_instance; + + memset(config, 0, sizeof(*config)); + + AV1C_READ_BIT_OR_RETURN_ERROR(marker); + config->marker = marker; + + AV1C_READ_BITS_OR_RETURN_ERROR(version, 7); + config->version = version; + + AV1C_READ_BITS_OR_RETURN_ERROR(seq_profile, 3); + config->seq_profile = seq_profile; + + AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx_0, 5); + config->seq_level_idx_0 = seq_level_idx_0; + + AV1C_READ_BIT_OR_RETURN_ERROR(seq_tier_0); + config->seq_tier_0 = seq_tier_0; + + AV1C_READ_BIT_OR_RETURN_ERROR(high_bitdepth); + config->high_bitdepth = high_bitdepth; + + AV1C_READ_BIT_OR_RETURN_ERROR(twelve_bit); + config->twelve_bit = twelve_bit; + + AV1C_READ_BIT_OR_RETURN_ERROR(monochrome); + config->monochrome = monochrome; + + AV1C_READ_BIT_OR_RETURN_ERROR(chroma_subsampling_x); + config->chroma_subsampling_x = chroma_subsampling_x; + + AV1C_READ_BIT_OR_RETURN_ERROR(chroma_subsampling_y); + config->chroma_subsampling_y = chroma_subsampling_y; + + AV1C_READ_BITS_OR_RETURN_ERROR(chroma_sample_position, 2); + config->chroma_sample_position = chroma_sample_position; + + AV1C_READ_BITS_OR_RETURN_ERROR(reserved, 3); + + AV1C_READ_BIT_OR_RETURN_ERROR(initial_presentation_delay_present); + config->initial_presentation_delay_present = + initial_presentation_delay_present; + + AV1C_READ_BITS_OR_RETURN_ERROR(initial_presentation_delay_minus_one, 4); + config->initial_presentation_delay_minus_one = + initial_presentation_delay_minus_one; + + *bytes_read = aom_rb_bytes_read(reader); + + return 0; +} + +int write_av1config(const Av1Config *config, size_t capacity, + size_t *bytes_written, uint8_t *buffer) { + if (!config || !buffer || capacity < kAv1cSize || !bytes_written) return -1; + + *bytes_written = 0; + memset(buffer, 0, kAv1cSize); + + struct aom_write_bit_buffer writer = { buffer, 0 }; + + aom_wb_write_bit(&writer, config->marker); + aom_wb_write_literal(&writer, config->version, 7); + aom_wb_write_literal(&writer, config->seq_profile, 3); + aom_wb_write_literal(&writer, config->seq_level_idx_0, 5); + aom_wb_write_bit(&writer, config->seq_tier_0); + aom_wb_write_bit(&writer, config->high_bitdepth); + aom_wb_write_bit(&writer, config->twelve_bit); + aom_wb_write_bit(&writer, config->monochrome); + aom_wb_write_bit(&writer, config->chroma_subsampling_x); + aom_wb_write_bit(&writer, config->chroma_subsampling_y); + aom_wb_write_literal(&writer, config->chroma_sample_position, 2); + aom_wb_write_literal(&writer, 0, 3); // reserved + aom_wb_write_bit(&writer, config->initial_presentation_delay_present); + + if (config->initial_presentation_delay_present) { + aom_wb_write_literal(&writer, config->initial_presentation_delay_minus_one, + 4); + } else { + aom_wb_write_literal(&writer, 0, 4); // reserved + } + + *bytes_written = aom_wb_bytes_written(&writer); + return 0; +} + +#undef AV1C_READ_BIT_OR_RETURN_ERROR +#undef AV1C_READ_BITS_OR_RETURN_ERROR +#undef AV1C_PUSH_ERROR_HANDLER_DATA +#undef AV1C_POP_ERROR_HANDLER_DATA diff --git a/libs/libaom/src/common/av1_config.h b/libs/libaom/src/common/av1_config.h new file mode 100644 index 000000000..a15bedb30 --- /dev/null +++ b/libs/libaom/src/common/av1_config.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_AV1_CONFIG_H_
+#define AOM_COMMON_AV1_CONFIG_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Struct representing ISOBMFF/Matroska AV1 config. See:
+// https://aomediacodec.github.io/av1-isobmff/#av1codecconfigurationbox-syntax
+//
+// The AV1 config has the following format:
+//
+//   unsigned int (1) marker = 1;
+//   unsigned int (7) version = 1;
+//   unsigned int (3) seq_profile;
+//   unsigned int (5) seq_level_idx_0;
+//   unsigned int (1) seq_tier_0;
+//   unsigned int (1) high_bitdepth;
+//   unsigned int (1) twelve_bit;
+//   unsigned int (1) monochrome;
+//   unsigned int (1) chroma_subsampling_x;
+//   unsigned int (1) chroma_subsampling_y;
+//   unsigned int (2) chroma_sample_position;
+//   unsigned int (3) reserved = 0;
+//
+//   unsigned int (1) initial_presentation_delay_present;
+//   if (initial_presentation_delay_present) {
+//     unsigned int (4) initial_presentation_delay_minus_one;
+//   } else {
+//     unsigned int (4) reserved = 0;
+//   }
+//
+//   unsigned int (8)[] configOBUs;
+//
+// Note: get_av1config_from_obu() does not currently store 'configOBUs' data, so
+// the field is omitted.
+typedef struct _Av1Config {
+  uint8_t marker;
+  uint8_t version;
+  uint8_t seq_profile;
+  uint8_t seq_level_idx_0;
+  uint8_t seq_tier_0;
+  uint8_t high_bitdepth;
+  uint8_t twelve_bit;
+  uint8_t monochrome;
+  uint8_t chroma_subsampling_x;
+  uint8_t chroma_subsampling_y;
+  uint8_t chroma_sample_position;
+  uint8_t initial_presentation_delay_present;
+  uint8_t initial_presentation_delay_minus_one;
+} Av1Config;
+
+// Attempts to parse a Sequence Header OBU and set the parameters of 'config'.
+// Returns 0 upon success, and -1 upon failure. 'buffer' can contain multiple
+// OBUs, but the Sequence Header OBU must be the first OBU within the buffer.
+int get_av1config_from_obu(const uint8_t *buffer, size_t length, int is_annexb,
+                           Av1Config *config);
+
+// Attempts to parse an AV1 config from 'buffer'. Returns 0 upon success.
+// Returns -1 when 'buffer_length' is less than 4, when passed NULL pointers, or
+// when parsing of 'buffer' fails.
+int read_av1config(const uint8_t *buffer, size_t buffer_length,
+                   size_t *bytes_read, Av1Config *config);
+
+// Writes 'config' to 'buffer'. Returns 0 upon successful write to 'buffer'.
+// Returns -1 when passed NULL pointers or when 'capacity' is insufficient.
+int write_av1config(const Av1Config *config, size_t capacity,
+                    size_t *bytes_written, uint8_t *buffer);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif  // AOM_COMMON_AV1_CONFIG_H_
diff --git a/libs/libaom/src/common/ivfdec.c b/libs/libaom/src/common/ivfdec.c
new file mode 100644
index 000000000..80d73b04c
--- /dev/null
+++ b/libs/libaom/src/common/ivfdec.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/ivfdec.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_ports/mem_ops.h"
+#include "aom_ports/sanitizer.h"
+
+static const char *IVF_SIGNATURE = "DKIF";
+
+static void fix_framerate(int *num, int *den) {
+  if (*den <= 0 || *den >= 1000000000 || *num <= 0 || *num >= 1000) {
+    // framerate seems to be invalid, just default to 30fps.
+    *num = 30;
+    *den = 1;
+  }
+}
+
+int file_is_ivf(struct AvxInputContext *input_ctx) {
+  char raw_hdr[32];
+  int is_ivf = 0;
+
+  if (fread(raw_hdr, 1, 32, input_ctx->file) == 32) {
+    if (memcmp(IVF_SIGNATURE, raw_hdr, 4) == 0) {
+      is_ivf = 1;
+
+      if (mem_get_le16(raw_hdr + 4) != 0) {
+        fprintf(stderr,
+                "Error: Unrecognized IVF version! This file may not"
+                " decode properly.");
+      }
+
+      input_ctx->fourcc = mem_get_le32(raw_hdr + 8);
+      input_ctx->width = mem_get_le16(raw_hdr + 12);
+      input_ctx->height = mem_get_le16(raw_hdr + 14);
+      input_ctx->framerate.numerator = mem_get_le32(raw_hdr + 16);
+      input_ctx->framerate.denominator = mem_get_le32(raw_hdr + 20);
+      fix_framerate(&input_ctx->framerate.numerator,
+                    &input_ctx->framerate.denominator);
+    }
+  }
+
+  if (!is_ivf) {
+    rewind(input_ctx->file);
+    input_ctx->detect.buf_read = 0;
+  } else {
+    input_ctx->detect.position = 4;
+  }
+  return is_ivf;
+}
+
+int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
+                   size_t *buffer_size, aom_codec_pts_t *pts) {
+  char raw_header[IVF_FRAME_HDR_SZ] = { 0 };
+  size_t frame_size = 0;
+
+  if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) {
+    if (!feof(infile)) warn("Failed to read frame size");
+  } else {
+    frame_size = mem_get_le32(raw_header);
+
+    if (frame_size > 256 * 1024 * 1024) {
+      warn("Read invalid frame size (%u)", (unsigned int)frame_size);
+      frame_size = 0;
+    }
+
+    if (frame_size > *buffer_size) {
+      uint8_t *new_buffer = (uint8_t *)realloc(*buffer, 2 * frame_size);
+
+      if (new_buffer) {
+        *buffer = new_buffer;
+        *buffer_size = 2 * frame_size;
+      } else {
+        warn("Failed to allocate compressed data buffer");
+        frame_size = 0;
+      }
+    }
+
+    if (pts) {
+      *pts = mem_get_le32(&raw_header[4]);
+      *pts += ((aom_codec_pts_t)mem_get_le32(&raw_header[8]) << 32);
+    }
+  }
+
+  if (!feof(infile)) {
+    ASAN_UNPOISON_MEMORY_REGION(*buffer, *buffer_size);
+    if (fread(*buffer, 1, frame_size, infile) != frame_size) {
+      warn("Failed to read full frame");
+      return 1;
+    }
+
+    ASAN_POISON_MEMORY_REGION(*buffer + frame_size, *buffer_size - frame_size);
+    *bytes_read = frame_size;
+    return 0;
+  }
+
+  return 1;
+}
diff --git a/libs/libaom/src/common/ivfdec.h b/libs/libaom/src/common/ivfdec.h
new file mode 100644
index 000000000..dbc77331f
--- /dev/null
+++ b/libs/libaom/src/common/ivfdec.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ +#ifndef AOM_COMMON_IVFDEC_H_ +#define AOM_COMMON_IVFDEC_H_ + +#include "aom/aom_codec.h" +#include "common/tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int file_is_ivf(struct AvxInputContext *input); +int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size, aom_codec_pts_t *pts); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // AOM_COMMON_IVFDEC_H_ diff --git a/libs/libaom/src/common/ivfenc.c b/libs/libaom/src/common/ivfenc.c new file mode 100644 index 000000000..64715f4d7 --- /dev/null +++ b/libs/libaom/src/common/ivfenc.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "common/ivfenc.h" + +#include "aom/aom_encoder.h" +#include "aom_ports/mem_ops.h" + +void ivf_write_file_header(FILE *outfile, const struct aom_codec_enc_cfg *cfg, + unsigned int fourcc, int frame_cnt) { + char header[32]; + + header[0] = 'D'; + header[1] = 'K'; + header[2] = 'I'; + header[3] = 'F'; + mem_put_le16(header + 4, 0); // version + mem_put_le16(header + 6, 32); // header size + mem_put_le32(header + 8, fourcc); // fourcc + mem_put_le16(header + 12, cfg->g_w); // width + mem_put_le16(header + 14, cfg->g_h); // height + mem_put_le32(header + 16, cfg->g_timebase.den); // rate + mem_put_le32(header + 20, cfg->g_timebase.num); // scale + mem_put_le32(header + 24, frame_cnt); // length + mem_put_le32(header + 28, 0); // unused + + fwrite(header, 1, 32, outfile); +} + +void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size) { + char header[12]; + + mem_put_le32(header, (int)frame_size); + mem_put_le32(header + 4, (int)(pts & 0xFFFFFFFF)); + mem_put_le32(header + 8, (int)(pts >> 32)); + fwrite(header, 1, 12, outfile); +} + +void ivf_write_frame_size(FILE *outfile, size_t frame_size) { + char header[4]; + + mem_put_le32(header, (int)frame_size); + fwrite(header, 1, 4, outfile); +} diff --git a/libs/libaom/src/common/ivfenc.h b/libs/libaom/src/common/ivfenc.h new file mode 100644 index 000000000..8f6d947d4 --- /dev/null +++ b/libs/libaom/src/common/ivfenc.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#ifndef AOM_COMMON_IVFENC_H_
+#define AOM_COMMON_IVFENC_H_
+
+#include "common/tools_common.h"
+
+struct aom_codec_enc_cfg;
+struct aom_codec_cx_pkt;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ivf_write_file_header(FILE *outfile, const struct aom_codec_enc_cfg *cfg,
+                           uint32_t fourcc, int frame_cnt);
+
+void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size);
+
+void ivf_write_frame_size(FILE *outfile, size_t frame_size);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif  // AOM_COMMON_IVFENC_H_
diff --git a/libs/libaom/src/common/md5_utils.c b/libs/libaom/src/common/md5_utils.c
new file mode 100644
index 000000000..b69e1cc72
--- /dev/null
+++ b/libs/libaom/src/common/md5_utils.c
@@ -0,0 +1,249 @@
+/*
+ * This code implements the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h' header
+ * definitions
+ * - Ian Jackson .
+ * Still in the public domain.
+ */
+
+#include <string.h> /* for memcpy() */
+
+#include "common/md5_utils.h"
+
+static void byteSwap(UWORD32 *buf, unsigned words) {
+  md5byte *p;
+
+  /* Only swap bytes for big endian machines */
+  int i = 1;
+
+  if (*(char *)&i == 1) return;
+
+  p = (md5byte *)buf;
+
+  do {
+    *buf++ = (UWORD32)((unsigned)p[3] << 8 | p[2]) << 16 |
+             ((unsigned)p[1] << 8 | p[0]);
+    p += 4;
+  } while (--words);
+}
+
+/*
+ * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious
+ * initialization constants.
+ */
+void MD5Init(struct MD5Context *ctx) {
+  ctx->buf[0] = 0x67452301;
+  ctx->buf[1] = 0xefcdab89;
+  ctx->buf[2] = 0x98badcfe;
+  ctx->buf[3] = 0x10325476;
+
+  ctx->bytes[0] = 0;
+  ctx->bytes[1] = 0;
+}
+
+/*
+ * Update context to reflect the concatenation of another buffer full
+ * of bytes.
+ */
+void MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) {
+  UWORD32 t;
+
+  /* Update byte count */
+
+  t = ctx->bytes[0];
+
+  if ((ctx->bytes[0] = t + len) < t)
+    ctx->bytes[1]++; /* Carry from low to high */
+
+  t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */
+
+  if (t > len) {
+    memcpy((md5byte *)ctx->in + 64 - t, buf, len);
+    return;
+  }
+
+  /* First chunk is an odd size */
+  memcpy((md5byte *)ctx->in + 64 - t, buf, t);
+  byteSwap(ctx->in, 16);
+  MD5Transform(ctx->buf, ctx->in);
+  buf += t;
+  len -= t;
+
+  /* Process data in 64-byte chunks */
+  while (len >= 64) {
+    memcpy(ctx->in, buf, 64);
+    byteSwap(ctx->in, 16);
+    MD5Transform(ctx->buf, ctx->in);
+    buf += 64;
+    len -= 64;
+  }
+
+  /* Handle any remaining bytes of data.
*/ + memcpy(ctx->in, buf, len); +} + +/* + * Final wrapup - pad to 64-byte boundary with the bit pattern + * 1 0* (64-bit count of bits processed, MSB-first) + */ +void MD5Final(md5byte digest[16], struct MD5Context *ctx) { + int count = ctx->bytes[0] & 0x3f; /* Number of bytes in ctx->in */ + md5byte *p = (md5byte *)ctx->in + count; + + /* Set the first char of padding to 0x80. There is always room. */ + *p++ = 0x80; + + /* Bytes of padding needed to make 56 bytes (-8..55) */ + count = 56 - 1 - count; + + if (count < 0) { /* Padding forces an extra block */ + memset(p, 0, count + 8); + byteSwap(ctx->in, 16); + MD5Transform(ctx->buf, ctx->in); + p = (md5byte *)ctx->in; + count = 56; + } + + memset(p, 0, count); + byteSwap(ctx->in, 14); + + /* Append length in bits and transform */ + ctx->in[14] = ctx->bytes[0] << 3; + ctx->in[15] = ctx->bytes[1] << 3 | ctx->bytes[0] >> 29; + MD5Transform(ctx->buf, ctx->in); + + byteSwap(ctx->buf, 4); + memcpy(digest, ctx->buf, 16); + memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */ +} + +#ifndef ASM_MD5 + +/* The four core functions - F1 is optimized somewhat */ + +/* #define F1(x, y, z) (x & y | ~x & z) */ +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +/* This is the central step in the MD5 algorithm. */ +#define MD5STEP(f, w, x, y, z, in, s) \ + (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x) + +#if defined(__clang__) && defined(__has_attribute) +#if __has_attribute(no_sanitize) +#define AOM_NO_UNSIGNED_OVERFLOW_CHECK \ + __attribute__((no_sanitize("unsigned-integer-overflow"))) +#endif +#endif + +#ifndef AOM_NO_UNSIGNED_OVERFLOW_CHECK +#define AOM_NO_UNSIGNED_OVERFLOW_CHECK +#endif + +/* + * The core of the MD5 algorithm, this alters an existing MD5 hash to + * reflect the addition of 16 longwords of new data. MD5Update blocks + * the data and converts bytes into longwords for this routine. 
+ */ +AOM_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4], + UWORD32 const in[16]) { + register UWORD32 a, b, c, d; + + a = buf[0]; + b = buf[1]; + c = buf[2]; + d = buf[3]; + + MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; +} + +#undef 
AOM_NO_UNSIGNED_OVERFLOW_CHECK
+
+#endif
diff --git a/libs/libaom/src/common/md5_utils.h b/libs/libaom/src/common/md5_utils.h
new file mode 100644
index 000000000..144fa3ad2
--- /dev/null
+++ b/libs/libaom/src/common/md5_utils.h
@@ -0,0 +1,49 @@
+/*
+ * This is the header file for the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h'
+ * header definitions
+ * - Ian Jackson .
+ * Still in the public domain.
+ */
+
+#ifndef AOM_COMMON_MD5_UTILS_H_
+#define AOM_COMMON_MD5_UTILS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define md5byte unsigned char
+#define UWORD32 unsigned int
+
+typedef struct MD5Context MD5Context;
+struct MD5Context {
+  UWORD32 buf[4];
+  UWORD32 bytes[2];
+  UWORD32 in[16];
+};
+
+void MD5Init(struct MD5Context *context);
+void MD5Update(struct MD5Context *context, md5byte const *buf, unsigned len);
+void MD5Final(unsigned char digest[16], struct MD5Context *context);
+void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_COMMON_MD5_UTILS_H_
diff --git a/libs/libaom/src/common/obudec.c b/libs/libaom/src/common/obudec.c
new file mode 100644
index 000000000..650f9973b
--- /dev/null
+++ b/libs/libaom/src/common/obudec.c
@@ -0,0 +1,486 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/obudec.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem_ops.h"
+#include "av1/common/common.h"
+#include "av1/common/obu_util.h"
+
+#define OBU_BUFFER_SIZE (500 * 1024)
+
+#define OBU_HEADER_SIZE 1
+#define OBU_EXTENSION_SIZE 1
+#define OBU_MAX_LENGTH_FIELD_SIZE 8
+
+#define OBU_MAX_HEADER_SIZE \
+  (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE + 2 * OBU_MAX_LENGTH_FIELD_SIZE)
+
+#define OBU_DETECTION_SIZE \
+  (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE + 4 * OBU_MAX_LENGTH_FIELD_SIZE)
+
+// Reads unsigned LEB128 integer and returns 0 upon successful read and decode.
+// Stores raw bytes in 'value_buffer', length of the number in 'value_length',
+// and decoded value in 'value'.
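+//
+// LEB128 is little-endian, 7 value bits per byte, with the top bit of each
+// byte set when another byte follows. As a worked example, the byte sequence
+// 0xE5 0x8E 0x26 decodes to 0x65 | (0x0E << 7) | (0x26 << 14) = 624485.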
+static int obudec_read_leb128(FILE *f, uint8_t *value_buffer, + size_t *value_length, uint64_t *value) { + if (!f || !value_buffer || !value_length || !value) return -1; + size_t len; + for (len = 0; len < OBU_MAX_LENGTH_FIELD_SIZE; ++len) { + const size_t num_read = fread(&value_buffer[len], 1, 1, f); + if (num_read == 0) { + if (len == 0 && feof(f)) { + *value_length = 0; + return 0; + } + // Ran out of data before completing read of value. + return -1; + } + if ((value_buffer[len] >> 7) == 0) { + ++len; + *value_length = len; + break; + } + } + + return aom_uleb_decode(value_buffer, len, value, NULL); +} + +// Reads OBU header from 'f'. The 'buffer_capacity' passed in must be large +// enough to store an OBU header with extension (2 bytes). Raw OBU data is +// written to 'obu_data', parsed OBU header values are written to 'obu_header', +// and total bytes read from file are written to 'bytes_read'. Returns 0 for +// success, and non-zero on failure. When end of file is reached, the return +// value is 0 and the 'bytes_read' value is set to 0. +static int obudec_read_obu_header(FILE *f, size_t buffer_capacity, + int is_annexb, uint8_t *obu_data, + ObuHeader *obu_header, size_t *bytes_read) { + if (!f || buffer_capacity < (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE) || + !obu_data || !obu_header || !bytes_read) { + return -1; + } + *bytes_read = fread(obu_data, 1, 1, f); + + if (feof(f) && *bytes_read == 0) { + return 0; + } else if (*bytes_read != 1) { + fprintf(stderr, "obudec: Failure reading OBU header.\n"); + return -1; + } + + const int has_extension = (obu_data[0] >> 2) & 0x1; + if (has_extension) { + if (fread(&obu_data[1], 1, 1, f) != 1) { + fprintf(stderr, "obudec: Failure reading OBU extension."); + return -1; + } + ++*bytes_read; + } + + size_t obu_bytes_parsed = 0; + const aom_codec_err_t parse_result = aom_read_obu_header( + obu_data, *bytes_read, &obu_bytes_parsed, obu_header, is_annexb); + if (parse_result != AOM_CODEC_OK || *bytes_read != obu_bytes_parsed) { + fprintf(stderr, "obudec: Error parsing OBU header.\n"); + return -1; + } + + return 0; +} + +// Reads OBU payload from 'f' and returns 0 for success when all payload bytes +// are read from the file. Payload data is written to 'obu_data', and actual +// bytes read added to 'bytes_read'. 
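+// (Note: unlike the header reader above, this helper adds to 'bytes_read'
+// rather than overwriting it, so a caller can keep one running total across
+// the header and payload reads.)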
+static int obudec_read_obu_payload(FILE *f, size_t payload_length, + uint8_t *obu_data, size_t *bytes_read) { + if (!f || payload_length == 0 || !obu_data || !bytes_read) return -1; + + if (fread(obu_data, 1, payload_length, f) != payload_length) { + fprintf(stderr, "obudec: Failure reading OBU payload.\n"); + return -1; + } + + *bytes_read += payload_length; + return 0; +} + +static int obudec_read_obu_header_and_size(FILE *f, size_t buffer_capacity, + int is_annexb, uint8_t *buffer, + size_t *bytes_read, + size_t *payload_length, + ObuHeader *obu_header) { + const size_t kMinimumBufferSize = OBU_MAX_HEADER_SIZE; + if (!f || !buffer || !bytes_read || !payload_length || !obu_header || + buffer_capacity < kMinimumBufferSize) { + return -1; + } + + size_t leb128_length_obu = 0; + size_t leb128_length_payload = 0; + uint64_t obu_size = 0; + if (is_annexb) { + if (obudec_read_leb128(f, &buffer[0], &leb128_length_obu, &obu_size) != 0) { + fprintf(stderr, "obudec: Failure reading OBU size length.\n"); + return -1; + } else if (leb128_length_obu == 0) { + *payload_length = 0; + return 0; + } + if (obu_size > UINT32_MAX) { + fprintf(stderr, "obudec: OBU payload length too large.\n"); + return -1; + } + } + + size_t header_size = 0; + if (obudec_read_obu_header(f, buffer_capacity - leb128_length_obu, is_annexb, + buffer + leb128_length_obu, obu_header, + &header_size) != 0) { + return -1; + } else if (header_size == 0) { + *payload_length = 0; + return 0; + } + + if (!obu_header->has_size_field) { + assert(is_annexb); + if (obu_size < header_size) { + fprintf(stderr, "obudec: OBU size is too small.\n"); + return -1; + } + *payload_length = (size_t)obu_size - header_size; + } else { + uint64_t u64_payload_length = 0; + if (obudec_read_leb128(f, &buffer[leb128_length_obu + header_size], + &leb128_length_payload, &u64_payload_length) != 0) { + fprintf(stderr, "obudec: Failure reading OBU payload length.\n"); + return -1; + } + if (u64_payload_length > UINT32_MAX) { + fprintf(stderr, "obudec: OBU payload length too large.\n"); + return -1; + } + + *payload_length = (size_t)u64_payload_length; + } + + *bytes_read = leb128_length_obu + header_size + leb128_length_payload; + return 0; +} + +static int obudec_grow_buffer(size_t growth_amount, uint8_t **obu_buffer, + size_t *obu_buffer_capacity) { + if (!*obu_buffer || !obu_buffer_capacity || growth_amount == 0) { + return -1; + } + + const size_t capacity = *obu_buffer_capacity; + if (SIZE_MAX - growth_amount < capacity) { + fprintf(stderr, "obudec: cannot grow buffer, capacity will roll over.\n"); + return -1; + } + + const size_t new_capacity = capacity + growth_amount; + +#if defined AOM_MAX_ALLOCABLE_MEMORY + if (new_capacity > AOM_MAX_ALLOCABLE_MEMORY) { + fprintf(stderr, "obudec: OBU size exceeds max alloc size.\n"); + return -1; + } +#endif + + uint8_t *new_buffer = (uint8_t *)realloc(*obu_buffer, new_capacity); + if (!new_buffer) { + fprintf(stderr, "obudec: Failed to allocate compressed data buffer.\n"); + return -1; + } + + *obu_buffer = new_buffer; + *obu_buffer_capacity = new_capacity; + return 0; +} + +static int obudec_read_one_obu(FILE *f, uint8_t **obu_buffer, + size_t obu_bytes_buffered, + size_t *obu_buffer_capacity, size_t *obu_length, + ObuHeader *obu_header, int is_annexb) { + if (!f || !(*obu_buffer) || !obu_buffer_capacity || !obu_length || + !obu_header) { + return -1; + } + + size_t bytes_read = 0; + size_t obu_payload_length = 0; + size_t available_buffer_capacity = *obu_buffer_capacity - obu_bytes_buffered; + + if 
(available_buffer_capacity < OBU_MAX_HEADER_SIZE) { + if (obudec_grow_buffer(AOMMAX(*obu_buffer_capacity, OBU_MAX_HEADER_SIZE), + obu_buffer, obu_buffer_capacity) != 0) { + *obu_length = bytes_read; + return -1; + } + available_buffer_capacity += + AOMMAX(*obu_buffer_capacity, OBU_MAX_HEADER_SIZE); + } + + const int status = obudec_read_obu_header_and_size( + f, available_buffer_capacity, is_annexb, *obu_buffer + obu_bytes_buffered, + &bytes_read, &obu_payload_length, obu_header); + if (status < 0) return status; + + if (obu_payload_length > SIZE_MAX - bytes_read) return -1; + + if (obu_payload_length > 256 * 1024 * 1024) { + fprintf(stderr, "obudec: Read invalid OBU size (%u)\n", + (unsigned int)obu_payload_length); + *obu_length = bytes_read + obu_payload_length; + return -1; + } + + if (bytes_read + obu_payload_length > available_buffer_capacity && + obudec_grow_buffer(AOMMAX(*obu_buffer_capacity, obu_payload_length), + obu_buffer, obu_buffer_capacity) != 0) { + *obu_length = bytes_read + obu_payload_length; + return -1; + } + + if (obu_payload_length > 0 && + obudec_read_obu_payload(f, obu_payload_length, + *obu_buffer + obu_bytes_buffered + bytes_read, + &bytes_read) != 0) { + return -1; + } + + *obu_length = bytes_read; + return 0; +} + +int file_is_obu(struct ObuDecInputContext *obu_ctx) { + if (!obu_ctx || !obu_ctx->avx_ctx) return 0; + + struct AvxInputContext *avx_ctx = obu_ctx->avx_ctx; + uint8_t detect_buf[OBU_DETECTION_SIZE] = { 0 }; + const int is_annexb = obu_ctx->is_annexb; + FILE *f = avx_ctx->file; + size_t payload_length = 0; + ObuHeader obu_header; + memset(&obu_header, 0, sizeof(obu_header)); + size_t length_of_unit_size = 0; + size_t annexb_header_length = 0; + uint64_t unit_size = 0; + + if (is_annexb) { + // read the size of first temporal unit + if (obudec_read_leb128(f, &detect_buf[0], &length_of_unit_size, + &unit_size) != 0) { + fprintf(stderr, "obudec: Failure reading temporal unit header\n"); + return 0; + } + + // read the size of first frame unit + if (obudec_read_leb128(f, &detect_buf[length_of_unit_size], + &annexb_header_length, &unit_size) != 0) { + fprintf(stderr, "obudec: Failure reading frame unit header\n"); + return 0; + } + annexb_header_length += length_of_unit_size; + } + + size_t bytes_read = 0; + if (obudec_read_obu_header_and_size( + f, OBU_DETECTION_SIZE - annexb_header_length, is_annexb, + &detect_buf[annexb_header_length], &bytes_read, &payload_length, + &obu_header) != 0) { + fprintf(stderr, "obudec: Failure reading first OBU.\n"); + rewind(f); + return 0; + } + + if (is_annexb) { + bytes_read += annexb_header_length; + } + + if (obu_header.type != OBU_TEMPORAL_DELIMITER && + obu_header.type != OBU_SEQUENCE_HEADER) { + return 0; + } + + if (obu_header.has_size_field) { + if (obu_header.type == OBU_TEMPORAL_DELIMITER && payload_length != 0) { + fprintf( + stderr, + "obudec: Invalid OBU_TEMPORAL_DELIMITER payload length (non-zero)."); + rewind(f); + return 0; + } + } else if (!is_annexb) { + fprintf(stderr, "obudec: OBU size fields required, cannot decode input.\n"); + rewind(f); + return 0; + } + + // Appears that input is valid Section 5 AV1 stream. + obu_ctx->buffer = (uint8_t *)malloc(OBU_BUFFER_SIZE); + if (!obu_ctx->buffer) { + fprintf(stderr, "Out of memory.\n"); + rewind(f); + return 0; + } + obu_ctx->buffer_capacity = OBU_BUFFER_SIZE; + + memcpy(obu_ctx->buffer, &detect_buf[0], bytes_read); + obu_ctx->bytes_buffered = bytes_read; + // If the first OBU is a SEQUENCE_HEADER, then it will have a payload. 
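+  // (A temporal delimiter, by contrast, was rejected above if it declared a
+  // non-empty payload.)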
+ // We need to read this in so that our buffer only contains complete OBUs. + if (payload_length > 0) { + if (payload_length > (obu_ctx->buffer_capacity - bytes_read)) { + fprintf(stderr, "obudec: First OBU's payload is too large\n"); + rewind(f); + return 0; + } + + size_t payload_bytes = 0; + const int status = obudec_read_obu_payload( + f, payload_length, &obu_ctx->buffer[bytes_read], &payload_bytes); + if (status < 0) { + rewind(f); + return 0; + } + obu_ctx->bytes_buffered += payload_bytes; + } + return 1; +} + +int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx, + uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size) { + FILE *f = obu_ctx->avx_ctx->file; + if (!f) return -1; + + *buffer_size = 0; + *bytes_read = 0; + + if (feof(f)) { + return 1; + } + + size_t tu_size; + size_t obu_size = 0; + size_t length_of_temporal_unit_size = 0; + uint8_t tuheader[OBU_MAX_LENGTH_FIELD_SIZE] = { 0 }; + + if (obu_ctx->is_annexb) { + uint64_t size = 0; + + if (obu_ctx->bytes_buffered == 0) { + if (obudec_read_leb128(f, &tuheader[0], &length_of_temporal_unit_size, + &size) != 0) { + fprintf(stderr, "obudec: Failure reading temporal unit header\n"); + return -1; + } + if (size == 0 && feof(f)) { + return 1; + } + } else { + // temporal unit size was already stored in buffer + if (aom_uleb_decode(obu_ctx->buffer, obu_ctx->bytes_buffered, &size, + &length_of_temporal_unit_size) != 0) { + fprintf(stderr, "obudec: Failure reading temporal unit header\n"); + return -1; + } + } + + if (size > UINT32_MAX || size + length_of_temporal_unit_size > UINT32_MAX) { + fprintf(stderr, "obudec: TU too large.\n"); + return -1; + } + + size += length_of_temporal_unit_size; + tu_size = (size_t)size; + } else { + while (1) { + ObuHeader obu_header; + memset(&obu_header, 0, sizeof(obu_header)); + + if (obudec_read_one_obu(f, &obu_ctx->buffer, obu_ctx->bytes_buffered, + &obu_ctx->buffer_capacity, &obu_size, &obu_header, + 0) != 0) { + fprintf(stderr, "obudec: read_one_obu failed in TU loop\n"); + return -1; + } + + if (obu_header.type == OBU_TEMPORAL_DELIMITER || obu_size == 0) { + tu_size = obu_ctx->bytes_buffered; + break; + } else { + obu_ctx->bytes_buffered += obu_size; + } + } + } + +#if defined AOM_MAX_ALLOCABLE_MEMORY + if (tu_size > AOM_MAX_ALLOCABLE_MEMORY) { + fprintf(stderr, "obudec: Temporal Unit size exceeds max alloc size.\n"); + return -1; + } +#endif + if (tu_size > 0) { + uint8_t *new_buffer = (uint8_t *)realloc(*buffer, tu_size); + if (!new_buffer) { + free(*buffer); + fprintf(stderr, "obudec: Out of memory.\n"); + return -1; + } + *buffer = new_buffer; + } + *bytes_read = tu_size; + *buffer_size = tu_size; + + if (!obu_ctx->is_annexb) { + memcpy(*buffer, obu_ctx->buffer, tu_size); + + // At this point, (obu_ctx->buffer + obu_ctx->bytes_buffered + obu_size) + // points to the end of the buffer. 
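+  // The trailing obu_size bytes are the first OBU of the next temporal unit
+  // (its temporal delimiter), already consumed from the file; move them to
+  // the front of the scratch buffer so the next call starts with them.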
+ memmove(obu_ctx->buffer, obu_ctx->buffer + obu_ctx->bytes_buffered, + obu_size); + obu_ctx->bytes_buffered = obu_size; + } else { + if (!feof(f)) { + size_t data_size; + size_t offset; + if (!obu_ctx->bytes_buffered) { + data_size = tu_size - length_of_temporal_unit_size; + memcpy(*buffer, &tuheader[0], length_of_temporal_unit_size); + offset = length_of_temporal_unit_size; + } else { + const size_t copy_size = AOMMIN(obu_ctx->bytes_buffered, tu_size); + memcpy(*buffer, obu_ctx->buffer, copy_size); + offset = copy_size; + data_size = tu_size - copy_size; + obu_ctx->bytes_buffered -= copy_size; + } + + if (fread(*buffer + offset, 1, data_size, f) != data_size) { + fprintf(stderr, "obudec: Failed to read full temporal unit\n"); + return -1; + } + } + } + return 0; +} + +void obudec_free(struct ObuDecInputContext *obu_ctx) { free(obu_ctx->buffer); } diff --git a/libs/libaom/src/common/obudec.h b/libs/libaom/src/common/obudec.h new file mode 100644 index 000000000..b2adb1e3d --- /dev/null +++ b/libs/libaom/src/common/obudec.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_COMMON_OBUDEC_H_ +#define AOM_COMMON_OBUDEC_H_ + +#include "common/tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ObuDecInputContext { + struct AvxInputContext *avx_ctx; + uint8_t *buffer; + size_t buffer_capacity; + size_t bytes_buffered; + int is_annexb; +}; + +// Returns 1 when file data starts (if Annex B stream, after reading the +// size of the OBU) with what appears to be a Temporal Delimiter +// OBU as defined by Section 5 of the AV1 bitstream specification. +int file_is_obu(struct ObuDecInputContext *obu_ctx); + +// Reads one Temporal Unit from the input file. Returns 0 when a TU is +// successfully read, 1 when end of file is reached, and less than 0 when an +// error occurs. Stores TU data in 'buffer'. Reallocs buffer to match TU size, +// returns buffer capacity via 'buffer_size', and returns size of buffered data +// via 'bytes_read'. +int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx, + uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size); + +void obudec_free(struct ObuDecInputContext *obu_ctx); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // AOM_COMMON_OBUDEC_H_ diff --git a/libs/libaom/src/common/rawenc.c b/libs/libaom/src/common/rawenc.c new file mode 100644 index 000000000..b72132c2e --- /dev/null +++ b/libs/libaom/src/common/rawenc.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <stdbool.h> +#include "common/rawenc.h" + +#define BATCH_SIZE 8 +// When writing greyscale color, batch 8 writes for low bit-depth, 4 writes +// for high bit-depth. +static const uint8_t batched[BATCH_SIZE] = { 128, 128, 128, 128, + 128, 128, 128, 128 }; +static const uint8_t batched_hbd[BATCH_SIZE] = { + 0, 128, 0, 128, 0, 128, 0, 128 +}; + +// Interface to writing to either a file or MD5Context. Takes a pointer to +// either the file or MD5Context, the buffer, the size of each element, and +// number of elements to write. Note that size and nmemb (last two args) must +// be unsigned int, as the interface to MD5Update requires that. +typedef void (*WRITER)(void *, const uint8_t *, unsigned int, unsigned int); + +static void write_file(void *fp, const uint8_t *buffer, unsigned int size, + unsigned int nmemb) { + fwrite(buffer, size, nmemb, (FILE *)fp); +} + +static void write_md5(void *md5, const uint8_t *buffer, unsigned int size, + unsigned int nmemb) { + MD5Update((MD5Context *)md5, buffer, size * nmemb); +} + +// Writes out n greyscale values. +static void write_greyscale(const bool high_bitdepth, int n, WRITER writer_func, + void *file_or_md5) { + const uint8_t *b = batched; + if (high_bitdepth) { + b = batched_hbd; + } + const int num_batched_writes = + high_bitdepth ? n / (BATCH_SIZE / 2) : n / BATCH_SIZE; + for (int i = 0; i < num_batched_writes; ++i) { + writer_func(file_or_md5, b, sizeof(uint8_t), BATCH_SIZE); + } + const int remaining = high_bitdepth ? n % (BATCH_SIZE / 2) : n % BATCH_SIZE; + for (int i = 0; i < remaining; ++i) { + if (high_bitdepth) { + writer_func(file_or_md5, batched_hbd, sizeof(uint8_t), 2); + } else { + writer_func(file_or_md5, batched, sizeof(uint8_t), 1); + } + } +} + +// Encapsulates the logic for writing raw data to either an image file or +// to an MD5 context. +static void raw_write_image_file_or_md5(const aom_image_t *img, + const int *planes, const int num_planes, + void *file_or_md5, WRITER writer_func) { + const bool high_bitdepth = img->fmt & AOM_IMG_FMT_HIGHBITDEPTH; + const int bytes_per_sample = high_bitdepth ? 2 : 1; + for (int i = 0; i < num_planes; ++i) { + const int plane = planes[i]; + const int w = aom_img_plane_width(img, plane); + const int h = aom_img_plane_height(img, plane); + // If we're on a color plane and the output is monochrome, write a greyscale + // value. Since there are only YUV planes, compare against Y. + if (img->monochrome && plane != AOM_PLANE_Y) { + write_greyscale(high_bitdepth, w * h, writer_func, file_or_md5); + continue; + } + const unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + for (int y = 0; y < h; ++y) { + writer_func(file_or_md5, buf, bytes_per_sample, w); + buf += stride; + } + } +} + +void raw_write_image_file(const aom_image_t *img, const int *planes, + const int num_planes, FILE *file) { + raw_write_image_file_or_md5(img, planes, num_planes, file, write_file); +} + +void raw_update_image_md5(const aom_image_t *img, const int *planes, + const int num_planes, MD5Context *md5) { + raw_write_image_file_or_md5(img, planes, num_planes, md5, write_md5); +} diff --git a/libs/libaom/src/common/rawenc.h b/libs/libaom/src/common/rawenc.h new file mode 100644 index 000000000..cf5e00e6f --- /dev/null +++ b/libs/libaom/src/common/rawenc.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_COMMON_RAWENC_H_ +#define AOM_COMMON_RAWENC_H_ + +#include "aom/aom_decoder.h" +#include "common/md5_utils.h" +#include "common/tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void raw_write_image_file(const aom_image_t *img, const int *planes, + const int num_planes, FILE *file); +void raw_update_image_md5(const aom_image_t *img, const int *planes, + const int num_planes, MD5Context *md5); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_RAWENC_H_ diff --git a/libs/libaom/src/common/tools_common.c b/libs/libaom/src/common/tools_common.c new file mode 100644 index 000000000..51c1c52a1 --- /dev/null +++ b/libs/libaom/src/common/tools_common.c @@ -0,0 +1,508 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "common/tools_common.h" + +#include <math.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#if CONFIG_AV1_ENCODER +#include "aom/aomcx.h" +#endif + +#if CONFIG_AV1_DECODER +#include "aom/aomdx.h" +#endif + +#if defined(_WIN32) || defined(__OS2__) +#include <io.h> +#include <fcntl.h> + +#ifdef __OS2__ +#define _setmode setmode +#define _fileno fileno +#define _O_BINARY O_BINARY +#endif +#endif + +#define LOG_ERROR(label) \ + do { \ + const char *l = label; \ + va_list ap; \ + va_start(ap, fmt); \ + if (l) fprintf(stderr, "%s: ", l); \ + vfprintf(stderr, fmt, ap); \ + fprintf(stderr, "\n"); \ + va_end(ap); \ + } while (0) + +FILE *set_binary_mode(FILE *stream) { + (void)stream; +#if defined(_WIN32) || defined(__OS2__) + _setmode(_fileno(stream), _O_BINARY); +#endif + return stream; +} + +void die(const char *fmt, ...) { + LOG_ERROR(NULL); + usage_exit(); +} + +void fatal(const char *fmt, ...) { + LOG_ERROR("Fatal"); + exit(EXIT_FAILURE); +} + +void warn(const char *fmt, ...) { LOG_ERROR("Warning"); } + +void die_codec(aom_codec_ctx_t *ctx, const char *s) { + const char *detail = aom_codec_error_detail(ctx); + + printf("%s: %s\n", s, aom_codec_error(ctx)); + if (detail) printf(" %s\n", detail); + exit(EXIT_FAILURE); +} + +int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame) { + FILE *f = input_ctx->file; + struct FileTypeDetectionBuffer *detect = &input_ctx->detect; + int plane = 0; + int shortread = 0; + const int bytespp = (yuv_frame->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; + + for (plane = 0; plane < 3; ++plane) { + uint8_t *ptr; + const int w = aom_img_plane_width(yuv_frame, plane); + const int h = aom_img_plane_height(yuv_frame, plane); + int r; + + /* Determine the correct plane based on the image format. The for-loop + * always counts in Y,U,V order, but this may not match the order of + * the data on disk. + */ + switch (plane) { + case 1: + ptr = + yuv_frame->planes[yuv_frame->fmt == AOM_IMG_FMT_YV12 ?
AOM_PLANE_V + : AOM_PLANE_U]; + break; + case 2: + ptr = + yuv_frame->planes[yuv_frame->fmt == AOM_IMG_FMT_YV12 ? AOM_PLANE_U + : AOM_PLANE_V]; + break; + default: ptr = yuv_frame->planes[plane]; + } + + for (r = 0; r < h; ++r) { + size_t needed = w * bytespp; + size_t buf_position = 0; + const size_t left = detect->buf_read - detect->position; + if (left > 0) { + const size_t more = (left < needed) ? left : needed; + memcpy(ptr, detect->buf + detect->position, more); + buf_position = more; + needed -= more; + detect->position += more; + } + if (needed > 0) { + shortread |= (fread(ptr + buf_position, 1, needed, f) < needed); + } + + ptr += yuv_frame->stride[plane]; + } + } + + return shortread; +} + +#if CONFIG_AV1_ENCODER +static const AvxInterface aom_encoders[] = { + { "av1", AV1_FOURCC, &aom_codec_av1_cx }, +}; + +int get_aom_encoder_count(void) { + return sizeof(aom_encoders) / sizeof(aom_encoders[0]); +} + +const AvxInterface *get_aom_encoder_by_index(int i) { return &aom_encoders[i]; } + +const AvxInterface *get_aom_encoder_by_name(const char *name) { + int i; + + for (i = 0; i < get_aom_encoder_count(); ++i) { + const AvxInterface *encoder = get_aom_encoder_by_index(i); + if (strcmp(encoder->name, name) == 0) return encoder; + } + + return NULL; +} + +// large scale tile encoding +static const AvxInterface aom_lst_encoder = { "av1", LST_FOURCC, + &aom_codec_av1_cx }; +const AvxInterface *get_aom_lst_encoder(void) { return &aom_lst_encoder; } +#endif // CONFIG_AV1_ENCODER + +#if CONFIG_AV1_DECODER +static const AvxInterface aom_decoders[] = { + { "av1", AV1_FOURCC, &aom_codec_av1_dx }, +}; + +int get_aom_decoder_count(void) { + return sizeof(aom_decoders) / sizeof(aom_decoders[0]); +} + +const AvxInterface *get_aom_decoder_by_index(int i) { return &aom_decoders[i]; } + +const AvxInterface *get_aom_decoder_by_name(const char *name) { + int i; + + for (i = 0; i < get_aom_decoder_count(); ++i) { + const AvxInterface *const decoder = get_aom_decoder_by_index(i); + if (strcmp(decoder->name, name) == 0) return decoder; + } + + return NULL; +} + +const AvxInterface *get_aom_decoder_by_fourcc(uint32_t fourcc) { + int i; + + for (i = 0; i < get_aom_decoder_count(); ++i) { + const AvxInterface *const decoder = get_aom_decoder_by_index(i); + if (decoder->fourcc == fourcc) return decoder; + } + + return NULL; +} +#endif // CONFIG_AV1_DECODER + +void aom_img_write(const aom_image_t *img, FILE *file) { + int plane; + + for (plane = 0; plane < 3; ++plane) { + const unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int w = aom_img_plane_width(img, plane) * + ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + const int h = aom_img_plane_height(img, plane); + int y; + + for (y = 0; y < h; ++y) { + fwrite(buf, 1, w, file); + buf += stride; + } + } +} + +int aom_img_read(aom_image_t *img, FILE *file) { + int plane; + + for (plane = 0; plane < 3; ++plane) { + unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int w = aom_img_plane_width(img, plane) * + ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1); + const int h = aom_img_plane_height(img, plane); + int y; + + for (y = 0; y < h; ++y) { + if (fread(buf, 1, w, file) != (size_t)w) return 0; + buf += stride; + } + } + + return 1; +} + +// TODO(dkovalev) change sse_to_psnr signature: double -> int64_t +double sse_to_psnr(double samples, double peak, double sse) { + static const double kMaxPSNR = 100.0; + + if (sse > 0.0) { + const double psnr = 10.0 * log10(samples * peak * peak / sse); + return psnr > kMaxPSNR ? kMaxPSNR : psnr; + } else { + return kMaxPSNR; + } +} + +// TODO(debargha): Consolidate the functions below into a separate file. +static void highbd_img_upshift(aom_image_t *dst, const aom_image_t *src, + int input_shift) { + // Note the offset is 1 less than half. + const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0; + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt || + input_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case AOM_IMG_FMT_I42016: + case AOM_IMG_FMT_I42216: + case AOM_IMG_FMT_I44416: break; + default: fatal("Unsupported image conversion"); break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + const uint16_t *p_src = + (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); + uint16_t *p_dst = + (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); + for (x = 0; x < w; x++) *p_dst++ = (*p_src++ << input_shift) + offset; + } + } +} + +static void lowbd_img_upshift(aom_image_t *dst, const aom_image_t *src, + int input_shift) { + // Note the offset is 1 less than half. + const int offset = input_shift > 0 ? 
(1 << (input_shift - 1)) - 1 : 0; + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || + dst->fmt != src->fmt + AOM_IMG_FMT_HIGHBITDEPTH || input_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case AOM_IMG_FMT_YV12: + case AOM_IMG_FMT_I420: + case AOM_IMG_FMT_I422: + case AOM_IMG_FMT_I444: break; + default: fatal("Unsupported image conversion"); break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + const uint8_t *p_src = src->planes[plane] + y * src->stride[plane]; + uint16_t *p_dst = + (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); + for (x = 0; x < w; x++) { + *p_dst++ = (*p_src++ << input_shift) + offset; + } + } + } +} + +void aom_img_upshift(aom_image_t *dst, const aom_image_t *src, + int input_shift) { + if (src->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + highbd_img_upshift(dst, src, input_shift); + } else { + lowbd_img_upshift(dst, src, input_shift); + } +} + +void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src) { + int plane; + if (dst->fmt + AOM_IMG_FMT_HIGHBITDEPTH != src->fmt || dst->d_w != src->d_w || + dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift) { + fatal("Unsupported image conversion"); + } + switch (dst->fmt) { + case AOM_IMG_FMT_I420: + case AOM_IMG_FMT_I422: + case AOM_IMG_FMT_I444: break; + default: fatal("Unsupported image conversion"); break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + const uint16_t *p_src = + (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); + uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane]; + for (x = 0; x < w; x++) { + *p_dst++ = (uint8_t)(*p_src++); + } + } + } +} + +static void highbd_img_downshift(aom_image_t *dst, const aom_image_t *src, + int down_shift) { + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt || + down_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case AOM_IMG_FMT_I42016: + case AOM_IMG_FMT_I42216: + case AOM_IMG_FMT_I44416: break; + default: fatal("Unsupported image conversion"); break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + const uint16_t *p_src = + (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); + uint16_t *p_dst = + (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); + for (x = 0; x < w; x++) *p_dst++ = *p_src++ >> down_shift; + } + } +} + +static void lowbd_img_downshift(aom_image_t *dst, const aom_image_t *src, + int down_shift) { + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || + src->fmt != dst->fmt + AOM_IMG_FMT_HIGHBITDEPTH || 
down_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (dst->fmt) { + case AOM_IMG_FMT_I420: + case AOM_IMG_FMT_I422: + case AOM_IMG_FMT_I444: break; + default: fatal("Unsupported image conversion"); break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + const uint16_t *p_src = + (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); + uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane]; + for (x = 0; x < w; x++) { + *p_dst++ = *p_src++ >> down_shift; + } + } + } +} + +void aom_img_downshift(aom_image_t *dst, const aom_image_t *src, + int down_shift) { + if (dst->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + highbd_img_downshift(dst, src, down_shift); + } else { + lowbd_img_downshift(dst, src, down_shift); + } +} + +static int img_shifted_realloc_required(const aom_image_t *img, + const aom_image_t *shifted, + aom_img_fmt_t required_fmt) { + return img->d_w != shifted->d_w || img->d_h != shifted->d_h || + required_fmt != shifted->fmt; +} + +void aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr, + aom_image_t **img_shifted_ptr) { + aom_image_t *img = *img_ptr; + aom_image_t *img_shifted = *img_shifted_ptr; + + const aom_img_fmt_t shifted_fmt = output_bit_depth == 8 + ? img->fmt & ~AOM_IMG_FMT_HIGHBITDEPTH + : img->fmt | AOM_IMG_FMT_HIGHBITDEPTH; + + if (shifted_fmt != img->fmt || output_bit_depth != img->bit_depth) { + if (img_shifted && + img_shifted_realloc_required(img, img_shifted, shifted_fmt)) { + aom_img_free(img_shifted); + img_shifted = NULL; + } + if (img_shifted) { + img_shifted->monochrome = img->monochrome; + } + if (!img_shifted) { + img_shifted = aom_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16); + img_shifted->bit_depth = output_bit_depth; + img_shifted->monochrome = img->monochrome; + img_shifted->csp = img->csp; + } + if (output_bit_depth > img->bit_depth) { + aom_img_upshift(img_shifted, img, output_bit_depth - img->bit_depth); + } else { + aom_img_downshift(img_shifted, img, img->bit_depth - output_bit_depth); + } + *img_shifted_ptr = img_shifted; + *img_ptr = img_shifted; + } +} + +// Related to I420, NV12 format has one luma "luminance" plane Y and one plane +// with U and V values interleaved. +void aom_img_write_nv12(const aom_image_t *img, FILE *file) { + // Y plane + const unsigned char *buf = img->planes[0]; + int stride = img->stride[0]; + int w = aom_img_plane_width(img, 0) * + ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + int h = aom_img_plane_height(img, 0); + int x, y; + + for (y = 0; y < h; ++y) { + fwrite(buf, 1, w, file); + buf += stride; + } + + // Interleaved U and V plane + const unsigned char *ubuf = img->planes[1]; + const unsigned char *vbuf = img->planes[2]; + const size_t size = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1; + stride = img->stride[1]; + w = aom_img_plane_width(img, 1); + h = aom_img_plane_height(img, 1); + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + fwrite(ubuf, size, 1, file); + fwrite(vbuf, size, 1, file); + ubuf += size; + vbuf += size; + } + ubuf += (stride - w * size); + vbuf += (stride - w * size); + } +} diff --git a/libs/libaom/src/common/tools_common.h b/libs/libaom/src/common/tools_common.h new file mode 100644 index 000000000..1ed004521 --- /dev/null +++ b/libs/libaom/src/common/tools_common.h @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_COMMON_TOOLS_COMMON_H_ +#define AOM_COMMON_TOOLS_COMMON_H_ + +#include <stdio.h> + +#include "config/aom_config.h" + +#include "aom/aom_codec.h" +#include "aom/aom_image.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_ports/msvc.h" + +#if CONFIG_AV1_ENCODER +#include "common/y4minput.h" +#endif + +#if defined(_MSC_VER) +/* MSVS uses _f{seek,tell}i64. */ +#define fseeko _fseeki64 +#define ftello _ftelli64 +typedef int64_t FileOffset; +#elif defined(_WIN32) +#include <sys/types.h> /* NOLINT*/ +/* MinGW uses f{seek,tell}o64 for large files. */ +#define fseeko fseeko64 +#define ftello ftello64 +typedef off64_t FileOffset; +#elif CONFIG_OS_SUPPORT +#include <sys/types.h> /* NOLINT*/ typedef off_t FileOffset; +/* Use 32-bit file operations in WebM file format when building ARM + * executables (.axf) with RVCT. */ +#else +#define fseeko fseek +#define ftello ftell +typedef long FileOffset; /* NOLINT */ +#endif /* CONFIG_OS_SUPPORT */ + +#if CONFIG_OS_SUPPORT +#if defined(_MSC_VER) +#include <io.h> /* NOLINT */ +#define isatty _isatty +#define fileno _fileno +#else +#include <unistd.h> /* NOLINT */ +#endif /* _MSC_VER */ +#endif /* CONFIG_OS_SUPPORT */ + +#define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo) + +#ifndef PATH_MAX +#define PATH_MAX 512 +#endif + +#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */ +#define IVF_FILE_HDR_SZ 32 + +#define RAW_FRAME_HDR_SZ sizeof(uint32_t) + +#define AV1_FOURCC 0x31305641 + +enum VideoFileType { + FILE_TYPE_OBU, + FILE_TYPE_RAW, + FILE_TYPE_IVF, + FILE_TYPE_Y4M, + FILE_TYPE_WEBM +}; + +// Used in lightfield example. +enum { + YUV1D, // 1D tile output for conformance test. + YUV, // Tile output in YUV format. + NV12, // Tile output in NV12 format. +} UENUM1BYTE(OUTPUT_FORMAT); + +// The fourcc for large_scale_tile encoding is "LSTC".
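+// (Fourccs are stored little-endian: the bytes 0x4c 0x53 0x54 0x43 of the
+// constant below spell "LSTC" in memory, just as AV1_FOURCC above spells
+// "AV01".)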
+#define LST_FOURCC 0x4354534c + +struct FileTypeDetectionBuffer { + char buf[4]; + size_t buf_read; + size_t position; +}; + +struct AvxRational { + int numerator; + int denominator; +}; + +struct AvxInputContext { + const char *filename; + FILE *file; + int64_t length; + struct FileTypeDetectionBuffer detect; + enum VideoFileType file_type; + uint32_t width; + uint32_t height; + struct AvxRational pixel_aspect_ratio; + aom_img_fmt_t fmt; + aom_bit_depth_t bit_depth; + int only_i420; + uint32_t fourcc; + struct AvxRational framerate; +#if CONFIG_AV1_ENCODER + y4m_input y4m; +#endif +}; + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__GNUC__) +#define AOM_NO_RETURN __attribute__((noreturn)) +#else +#define AOM_NO_RETURN +#endif + +/* Sets a stdio stream into binary mode */ +FILE *set_binary_mode(FILE *stream); + +void die(const char *fmt, ...) AOM_NO_RETURN; +void fatal(const char *fmt, ...) AOM_NO_RETURN; +void warn(const char *fmt, ...); + +void die_codec(aom_codec_ctx_t *ctx, const char *s) AOM_NO_RETURN; + +/* The tool including this file must define usage_exit() */ +void usage_exit(void) AOM_NO_RETURN; + +#undef AOM_NO_RETURN + +int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame); + +/////////////////////////////////////////////////////////////////////////////// +// A description of the interfaces used to access the AOM codecs +/////////////////////////////////////////////////////////////////////////////// +// +// There are three levels of interfaces used to access the AOM codec: the +// AvxInterface, the aom_codec_iface, and the aom_codec_ctx. Each of these +// is described in detail here. +// +// +// 1. AvxInterface +// (Related files: common/tools_common.c, common/tools_common.h) +// +// The high-level interface to the AVx encoders / decoders. Each AvxInterface +// contains the name of the codec (e.g., "av1"), the four character code +// associated with it, and a function pointer to the actual interface (see the +// documentation on aom_codec_iface_t for more info). This API +// is meant for lookup / iteration over all known codecs. +// +// For the encoder, call get_aom_encoder_by_name(...) if you know the name +// (e.g., "av1"); to iterate over all known encoders, use +// get_aom_encoder_count() and get_aom_encoder_by_index(i). To get the +// encoder specifically for large scale tile encoding, use +// get_aom_lst_encoder(). +// +// For the decoder, similar functions are available. There is also a +// get_aom_decoder_by_fourcc(fourcc) to get the decoder based on the four +// character codes. +// +// The main purpose of the AvxInterface is to get a reference to the +// aom_codec_iface_t, pointed to by its codec_interface variable. +// +// +// 2. aom_codec_iface_t +// (Related files: aom/aom_codec.h, aom/src/aom_codec.c, +// aom/internal/aom_codec_internal.h, av1/av1_cx_iface.c, +// av1/av1_dx_iface.c) +// +// Used to initialize the codec context, which contains the configuration +// for modifying the encoder/decoder during run-time. See the documentation of +// aom/aom_codec.h for more details. For the most part, users will call the +// helper functions listed there, such as aom_codec_iface_name, +// aom_codec_get_caps, etc., to interact with it. +// +// The main purpose of the aom_codec_iface_t is to provide a way to generate +// a default codec config, find out what capabilities the implementation has, +// and create an aom_codec_ctx_t (which is actually used to interact with the +// codec).
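+//
+// As a quick sketch (using the lookup helpers declared later in this
+// header), reaching a decoder interface looks like:
+//   const AvxInterface *decoder = get_aom_decoder_by_name("av1");
+//   aom_codec_iface_t *iface = decoder->codec_interface();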
+// +// Note that the implementations of the aom_codec_iface_t are located in +// av1/av1_cx_iface.c and av1/av1_dx_iface.c +// +// +// 3. aom_codec_ctx_t +// (Related files: aom/aom_codec.h, av1/av1_cx_iface.c, av1/av1_dx_iface.c, +// aom/aomcx.h, aom/aomdx.h, aom/src/aom_encoder.c, aom/src/aom_decoder.c) +// +// The actual interface between user code and the codec. It stores the name +// of the codec, a pointer back to the aom_codec_iface_t that initialized it, +// initialization flags, a config for either encoder or the decoder, and a +// pointer to internal data. +// +// The codec is configured / queried through calls to aom_codec_control, +// which takes a control code (listed in aomcx.h and aomdx.h) and a parameter. +// In the case of "getter" control codes, the parameter is modified to have +// the requested value; in the case of "setter" control codes, the codec's +// configuration is changed based on the parameter. Note that an aom_codec_err_t +// is returned, which indicates if the operation was successful or not. +// +// Note that for the encoder, the aom_codec_alg_priv_t points to +// the aom_codec_alg_priv structure in av1/av1_cx_iface.c, and for the decoder, +// the struct in av1/av1_dx_iface.c. Variables such as AV1_COMP cpi are stored +// here and also used in the core algorithm. +// +// At the end, aom_codec_destroy should be called for each initialized +// aom_codec_ctx_t. + +typedef struct AvxInterface { + const char *const name; + const uint32_t fourcc; + // Pointer to a function of zero arguments that returns an aom_codec_iface_t + // pointer. E.g.: + // aom_codec_iface_t *codec = interface->codec_interface(); + aom_codec_iface_t *(*const codec_interface)(); +} AvxInterface; + +int get_aom_encoder_count(void); +// Lookup the interface by index -- it must be the case that +// i < get_aom_encoder_count() +const AvxInterface *get_aom_encoder_by_index(int i); +// Lookup the interface by name -- returns NULL if no match. +const AvxInterface *get_aom_encoder_by_name(const char *name); +const AvxInterface *get_aom_lst_encoder(void); + +int get_aom_decoder_count(void); +const AvxInterface *get_aom_decoder_by_index(int i); +const AvxInterface *get_aom_decoder_by_name(const char *name); +// Lookup the interface by the fourcc -- returns NULL if no match. +const AvxInterface *get_aom_decoder_by_fourcc(uint32_t fourcc); + +void aom_img_write(const aom_image_t *img, FILE *file); +int aom_img_read(aom_image_t *img, FILE *file); + +double sse_to_psnr(double samples, double peak, double mse); +void aom_img_upshift(aom_image_t *dst, const aom_image_t *src, int input_shift); +void aom_img_downshift(aom_image_t *dst, const aom_image_t *src, + int down_shift); +void aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr, + aom_image_t **img_shifted_ptr); +void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src); + +// Output in NV12 format. +void aom_img_write_nv12(const aom_image_t *img, FILE *file); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // AOM_COMMON_TOOLS_COMMON_H_ diff --git a/libs/libaom/src/common/video_common.h b/libs/libaom/src/common/video_common.h new file mode 100644 index 000000000..bf95031be --- /dev/null +++ b/libs/libaom/src/common/video_common.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_COMMON_VIDEO_COMMON_H_ +#define AOM_COMMON_VIDEO_COMMON_H_ + +#include "common/tools_common.h" + +typedef struct { + uint32_t codec_fourcc; + int frame_width; + int frame_height; + struct AvxRational time_base; + unsigned int is_annexb; +} AvxVideoInfo; + +#endif // AOM_COMMON_VIDEO_COMMON_H_ diff --git a/libs/libaom/src/common/video_reader.c b/libs/libaom/src/common/video_reader.c new file mode 100644 index 000000000..7b021bc40 --- /dev/null +++ b/libs/libaom/src/common/video_reader.c @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +#include "aom_ports/mem_ops.h" +#include "common/ivfdec.h" +#include "common/obudec.h" +#include "common/tools_common.h" +#include "common/video_reader.h" +#include "common/webmdec.h" + +struct AvxVideoReaderStruct { + AvxVideoInfo info; + struct AvxInputContext input_ctx; + struct ObuDecInputContext obu_ctx; + struct WebmInputContext webm_ctx; + uint8_t *buffer; + size_t buffer_size; + size_t frame_size; + aom_codec_pts_t pts; +}; + +AvxVideoReader *aom_video_reader_open(const char *filename) { + AvxVideoReader *reader = NULL; + FILE *const file = fopen(filename, "rb"); + if (!file) return NULL; // Can't open file + + reader = (AvxVideoReader *)calloc(1, sizeof(*reader)); + if (!reader) { + fclose(file); + return NULL; // Can't allocate AvxVideoReader + } + + reader->input_ctx.filename = filename; + reader->input_ctx.file = file; + reader->obu_ctx.avx_ctx = &reader->input_ctx; + reader->obu_ctx.is_annexb = 1; + + if (file_is_ivf(&reader->input_ctx)) { + reader->input_ctx.file_type = FILE_TYPE_IVF; + reader->info.codec_fourcc = reader->input_ctx.fourcc; + reader->info.frame_width = reader->input_ctx.width; + reader->info.frame_height = reader->input_ctx.height; +#if CONFIG_WEBM_IO + } else if (file_is_webm(&reader->webm_ctx, &reader->input_ctx)) { + reader->input_ctx.file_type = FILE_TYPE_WEBM; + reader->info.codec_fourcc = reader->input_ctx.fourcc; + reader->info.frame_width = reader->input_ctx.width; + reader->info.frame_height = reader->input_ctx.height; +#endif + } else if (file_is_obu(&reader->obu_ctx)) { + reader->input_ctx.file_type = FILE_TYPE_OBU; + // assume AV1 + reader->info.codec_fourcc = AV1_FOURCC; + reader->info.is_annexb = reader->obu_ctx.is_annexb; + } else { + fclose(file); + free(reader); + return NULL; // Unknown file type + } + + return reader; +} + +void aom_video_reader_close(AvxVideoReader *reader) { + if (reader) { + fclose(reader->input_ctx.file); + if (reader->input_ctx.file_type == FILE_TYPE_OBU) { + obudec_free(&reader->obu_ctx); + } + free(reader->buffer); + free(reader); + } +} + +int aom_video_reader_read_frame(AvxVideoReader *reader) { + if
(reader->input_ctx.file_type == FILE_TYPE_IVF) { + return !ivf_read_frame(reader->input_ctx.file, &reader->buffer, + &reader->frame_size, &reader->buffer_size, + &reader->pts); + } else if (reader->input_ctx.file_type == FILE_TYPE_OBU) { + return !obudec_read_temporal_unit(&reader->obu_ctx, &reader->buffer, + &reader->frame_size, + &reader->buffer_size); +#if CONFIG_WEBM_IO + } else if (reader->input_ctx.file_type == FILE_TYPE_WEBM) { + return !webm_read_frame(&reader->webm_ctx, &reader->buffer, + &reader->frame_size, &reader->buffer_size); +#endif + } else { + assert(0); + return 0; + } +} + +const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader, + size_t *size) { + if (size) *size = reader->frame_size; + + return reader->buffer; +} + +int64_t aom_video_reader_get_frame_pts(AvxVideoReader *reader) { + return (int64_t)reader->pts; +} + +FILE *aom_video_reader_get_file(AvxVideoReader *reader) { + return reader->input_ctx.file; +} + +const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader) { + return &reader->info; +} + +void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc) { + reader->info.codec_fourcc = fourcc; +} diff --git a/libs/libaom/src/common/video_reader.h b/libs/libaom/src/common/video_reader.h new file mode 100644 index 000000000..9ab439e8a --- /dev/null +++ b/libs/libaom/src/common/video_reader.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_COMMON_VIDEO_READER_H_ +#define AOM_COMMON_VIDEO_READER_H_ + +#include "common/video_common.h" + +// The following code is work in progress. It is going to support transparent +// reading of input files. Right now only IVF format is supported for +// simplicity. The main goal the API is to be simple and easy to use in example +// code and in aomenc/aomdec later. All low-level details like memory +// buffer management are hidden from API users. +struct AvxVideoReaderStruct; +typedef struct AvxVideoReaderStruct AvxVideoReader; + +#ifdef __cplusplus +extern "C" { +#endif + +// Opens the input file for reading and inspects it to determine file type. +// Returns an opaque AvxVideoReader* upon success, or NULL upon failure. +// Right now only IVF format is supported. +AvxVideoReader *aom_video_reader_open(const char *filename); + +// Frees all resources associated with AvxVideoReader* returned from +// aom_video_reader_open() call. +void aom_video_reader_close(AvxVideoReader *reader); + +// Reads frame from the file and stores it in internal buffer. +int aom_video_reader_read_frame(AvxVideoReader *reader); + +// Returns the pointer to memory buffer with frame data read by last call to +// aom_video_reader_read_frame(). +const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader, size_t *size); + +// Returns the pts of the frame. +int64_t aom_video_reader_get_frame_pts(AvxVideoReader *reader); +// Return the reader file. +FILE *aom_video_reader_get_file(AvxVideoReader *reader); + +// Fills AvxVideoInfo with information from opened video file. 
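+// (The pointer returned below refers to storage owned by the reader and
+// remains valid until aom_video_reader_close().)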
+const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader); + +// Set fourcc. +void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_VIDEO_READER_H_ diff --git a/libs/libaom/src/common/video_writer.c b/libs/libaom/src/common/video_writer.c new file mode 100644 index 000000000..1d4328ae1 --- /dev/null +++ b/libs/libaom/src/common/video_writer.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "common/video_writer.h" + +#include <stdlib.h> + +#include "aom/aom_encoder.h" +#include "common/ivfenc.h" + +struct AvxVideoWriterStruct { + AvxVideoInfo info; + FILE *file; + int frame_count; +}; + +static void write_header(FILE *file, const AvxVideoInfo *info, + int frame_count) { + struct aom_codec_enc_cfg cfg; + cfg.g_w = info->frame_width; + cfg.g_h = info->frame_height; + cfg.g_timebase.num = info->time_base.numerator; + cfg.g_timebase.den = info->time_base.denominator; + + ivf_write_file_header(file, &cfg, info->codec_fourcc, frame_count); +} + +AvxVideoWriter *aom_video_writer_open(const char *filename, + AvxContainer container, + const AvxVideoInfo *info) { + if (container == kContainerIVF) { + AvxVideoWriter *writer = NULL; + FILE *const file = fopen(filename, "wb"); + if (!file) return NULL; + + writer = malloc(sizeof(*writer)); + if (!writer) { + fclose(file); + return NULL; + } + writer->frame_count = 0; + writer->info = *info; + writer->file = file; + + write_header(writer->file, info, 0); + + return writer; + } + + return NULL; +} + +void aom_video_writer_close(AvxVideoWriter *writer) { + if (writer) { + // Rewriting frame header with real frame count + rewind(writer->file); + write_header(writer->file, &writer->info, writer->frame_count); + + fclose(writer->file); + free(writer); + } +} + +int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer, + size_t size, int64_t pts) { + ivf_write_frame_header(writer->file, pts, size); + if (fwrite(buffer, 1, size, writer->file) != size) return 0; + + ++writer->frame_count; + + return 1; +} + +void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc) { + writer->info.codec_fourcc = fourcc; +} diff --git a/libs/libaom/src/common/video_writer.h b/libs/libaom/src/common/video_writer.h new file mode 100644 index 000000000..8712d47a5 --- /dev/null +++ b/libs/libaom/src/common/video_writer.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_COMMON_VIDEO_WRITER_H_ +#define AOM_COMMON_VIDEO_WRITER_H_ + +#include "common/video_common.h" + +enum { kContainerIVF } UENUM1BYTE(AvxContainer); + +struct AvxVideoWriterStruct; +typedef struct AvxVideoWriterStruct AvxVideoWriter; + +#ifdef __cplusplus +extern "C" { +#endif + +// Finds and opens writer for specified container format. +// Returns an opaque AvxVideoWriter* upon success, or NULL upon failure. +// Right now only IVF format is supported. +AvxVideoWriter *aom_video_writer_open(const char *filename, + AvxContainer container, + const AvxVideoInfo *info); + +// Frees all resources associated with AvxVideoWriter* returned from +// aom_video_writer_open() call. +void aom_video_writer_close(AvxVideoWriter *writer); + +// Writes frame bytes to the file. +int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer, + size_t size, int64_t pts); +// Set fourcc. +void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_VIDEO_WRITER_H_ diff --git a/libs/libaom/src/common/warnings.c b/libs/libaom/src/common/warnings.c new file mode 100644 index 000000000..2facee252 --- /dev/null +++ b/libs/libaom/src/common/warnings.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "common/warnings.h" + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "aom/aom_encoder.h" +#include "apps/aomenc.h" +#include "common/tools_common.h" + +static const char quantizer_warning_string[] = + "Bad quantizer values. Quantizer values should not be equal, and should " + "differ by at least 8."; + +struct WarningListNode { + const char *warning_string; + struct WarningListNode *next_warning; +}; + +struct WarningList { + struct WarningListNode *warning_node; +}; + +static void add_warning(const char *warning_string, + struct WarningList *warning_list) { + struct WarningListNode **node = &warning_list->warning_node; + + struct WarningListNode *new_node = malloc(sizeof(*new_node)); + if (new_node == NULL) { + fatal("Unable to allocate warning node."); + } + + new_node->warning_string = warning_string; + new_node->next_warning = NULL; + + while (*node != NULL) node = &(*node)->next_warning; + + *node = new_node; +} + +static void free_warning_list(struct WarningList *warning_list) { + while (warning_list->warning_node != NULL) { + struct WarningListNode *const node = warning_list->warning_node; + warning_list->warning_node = node->next_warning; + free(node); + } +} + +static int continue_prompt(int num_warnings) { + int c; + fprintf(stderr, + "%d encoder configuration warning(s). Continue?
(y to continue) ", + num_warnings); + c = getchar(); + return c == 'y'; +} + +static void check_quantizer(int min_q, int max_q, + struct WarningList *warning_list) { + const int lossless = min_q == 0 && max_q == 0; + if (!lossless && (min_q == max_q || abs(max_q - min_q) < 8)) + add_warning(quantizer_warning_string, warning_list); +} + +void check_encoder_config(int disable_prompt, + const struct AvxEncoderConfig *global_config, + const struct aom_codec_enc_cfg *stream_config) { + int num_warnings = 0; + struct WarningListNode *warning = NULL; + struct WarningList warning_list = { 0 }; + (void)global_config; + check_quantizer(stream_config->rc_min_quantizer, + stream_config->rc_max_quantizer, &warning_list); + /* Count and print warnings. */ + for (warning = warning_list.warning_node; warning != NULL; + warning = warning->next_warning, ++num_warnings) { + warn(warning->warning_string); + } + + free_warning_list(&warning_list); + + if (num_warnings) { + if (!disable_prompt && !continue_prompt(num_warnings)) exit(EXIT_FAILURE); + } +} diff --git a/libs/libaom/src/common/warnings.h b/libs/libaom/src/common/warnings.h new file mode 100644 index 000000000..36f1fe070 --- /dev/null +++ b/libs/libaom/src/common/warnings.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_COMMON_WARNINGS_H_ +#define AOM_COMMON_WARNINGS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct aom_codec_enc_cfg; +struct AvxEncoderConfig; + +/* + * Checks config for improperly used settings. Warns user upon encountering + * settings that will lead to poor output quality. Prompts user to continue + * when warnings are issued. + */ +void check_encoder_config(int disable_prompt, + const struct AvxEncoderConfig *global_config, + const struct aom_codec_enc_cfg *stream_config); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_WARNINGS_H_ diff --git a/libs/libaom/src/common/webmdec.cc b/libs/libaom/src/common/webmdec.cc new file mode 100644 index 000000000..33bda5902 --- /dev/null +++ b/libs/libaom/src/common/webmdec.cc @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include "common/webmdec.h"
+
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+
+#include "third_party/libwebm/mkvparser/mkvparser.h"
+#include "third_party/libwebm/mkvparser/mkvreader.h"
+
+namespace {
+
+void reset(struct WebmInputContext *const webm_ctx) {
+  if (webm_ctx->reader != NULL) {
+    mkvparser::MkvReader *const reader =
+        reinterpret_cast<mkvparser::MkvReader *>(webm_ctx->reader);
+    delete reader;
+  }
+  if (webm_ctx->segment != NULL) {
+    mkvparser::Segment *const segment =
+        reinterpret_cast<mkvparser::Segment *>(webm_ctx->segment);
+    delete segment;
+  }
+  if (webm_ctx->buffer != NULL) {
+    delete[] webm_ctx->buffer;
+  }
+  webm_ctx->reader = NULL;
+  webm_ctx->segment = NULL;
+  webm_ctx->buffer = NULL;
+  webm_ctx->cluster = NULL;
+  webm_ctx->block_entry = NULL;
+  webm_ctx->block = NULL;
+  webm_ctx->block_frame_index = 0;
+  webm_ctx->video_track_index = 0;
+  webm_ctx->timestamp_ns = 0;
+  webm_ctx->is_key_frame = false;
+}
+
+void get_first_cluster(struct WebmInputContext *const webm_ctx) {
+  mkvparser::Segment *const segment =
+      reinterpret_cast<mkvparser::Segment *>(webm_ctx->segment);
+  const mkvparser::Cluster *const cluster = segment->GetFirst();
+  webm_ctx->cluster = cluster;
+}
+
+void rewind_and_reset(struct WebmInputContext *const webm_ctx,
+                      struct AvxInputContext *const aom_ctx) {
+  rewind(aom_ctx->file);
+  reset(webm_ctx);
+}
+
+}  // namespace
+
+int file_is_webm(struct WebmInputContext *webm_ctx,
+                 struct AvxInputContext *aom_ctx) {
+  mkvparser::MkvReader *const reader = new mkvparser::MkvReader(aom_ctx->file);
+  webm_ctx->reader = reader;
+  webm_ctx->reached_eos = 0;
+
+  mkvparser::EBMLHeader header;
+  long long pos = 0;
+  if (header.Parse(reader, pos) < 0) {
+    rewind_and_reset(webm_ctx, aom_ctx);
+    return 0;
+  }
+
+  mkvparser::Segment *segment;
+  if (mkvparser::Segment::CreateInstance(reader, pos, segment)) {
+    rewind_and_reset(webm_ctx, aom_ctx);
+    return 0;
+  }
+  webm_ctx->segment = segment;
+  if (segment->Load() < 0) {
+    rewind_and_reset(webm_ctx, aom_ctx);
+    return 0;
+  }
+
+  const mkvparser::Tracks *const tracks = segment->GetTracks();
+  const mkvparser::VideoTrack *video_track = NULL;
+  for (unsigned long i = 0; i < tracks->GetTracksCount(); ++i) {
+    const mkvparser::Track *const track = tracks->GetTrackByIndex(i);
+    if (track->GetType() == mkvparser::Track::kVideo) {
+      video_track = static_cast<const mkvparser::VideoTrack *>(track);
+      webm_ctx->video_track_index = static_cast<int>(track->GetNumber());
+      break;
+    }
+  }
+
+  if (video_track == NULL || video_track->GetCodecId() == NULL) {
+    rewind_and_reset(webm_ctx, aom_ctx);
+    return 0;
+  }
+
+  if (!strncmp(video_track->GetCodecId(), "V_AV1", 5)) {
+    aom_ctx->fourcc = AV1_FOURCC;
+  } else {
+    rewind_and_reset(webm_ctx, aom_ctx);
+    return 0;
+  }
+
+  aom_ctx->framerate.denominator = 0;
+  aom_ctx->framerate.numerator = 0;
+  aom_ctx->width = static_cast<uint32_t>(video_track->GetWidth());
+  aom_ctx->height = static_cast<uint32_t>(video_track->GetHeight());
+
+  get_first_cluster(webm_ctx);
+
+  return 1;
+}
+
+int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
+                    size_t *bytes_read, size_t *buffer_size) {
+  assert(webm_ctx->buffer == *buffer);
+  // This check is needed for frame parallel decoding, in which case this
+  // function could be called even after it has reached end of input stream.
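+  // Returning 1 here reports end-of-stream again, matching the return-value
+  // contract documented in webmdec.h; it is not an error code.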
+  if (webm_ctx->reached_eos) {
+    return 1;
+  }
+  mkvparser::Segment *const segment =
+      reinterpret_cast<mkvparser::Segment *>(webm_ctx->segment);
+  const mkvparser::Cluster *cluster =
+      reinterpret_cast<const mkvparser::Cluster *>(webm_ctx->cluster);
+  const mkvparser::Block *block =
+      reinterpret_cast<const mkvparser::Block *>(webm_ctx->block);
+  const mkvparser::BlockEntry *block_entry =
+      reinterpret_cast<const mkvparser::BlockEntry *>(webm_ctx->block_entry);
+  bool block_entry_eos = false;
+  do {
+    long status = 0;
+    bool get_new_block = false;
+    if (block_entry == NULL && !block_entry_eos) {
+      status = cluster->GetFirst(block_entry);
+      get_new_block = true;
+    } else if (block_entry_eos || block_entry->EOS()) {
+      cluster = segment->GetNext(cluster);
+      if (cluster == NULL || cluster->EOS()) {
+        *bytes_read = 0;
+        webm_ctx->reached_eos = 1;
+        return 1;
+      }
+      status = cluster->GetFirst(block_entry);
+      block_entry_eos = false;
+      get_new_block = true;
+    } else if (block == NULL ||
+               webm_ctx->block_frame_index == block->GetFrameCount() ||
+               block->GetTrackNumber() != webm_ctx->video_track_index) {
+      status = cluster->GetNext(block_entry, block_entry);
+      if (block_entry == NULL || block_entry->EOS()) {
+        block_entry_eos = true;
+        continue;
+      }
+      get_new_block = true;
+    }
+    if (status || block_entry == NULL) {
+      return -1;
+    }
+    if (get_new_block) {
+      block = block_entry->GetBlock();
+      if (block == NULL) return -1;
+      webm_ctx->block_frame_index = 0;
+    }
+  } while (block_entry_eos ||
+           block->GetTrackNumber() != webm_ctx->video_track_index);
+
+  webm_ctx->cluster = cluster;
+  webm_ctx->block_entry = block_entry;
+  webm_ctx->block = block;
+
+  const mkvparser::Block::Frame &frame =
+      block->GetFrame(webm_ctx->block_frame_index);
+  ++webm_ctx->block_frame_index;
+  if (frame.len > static_cast<long>(*buffer_size)) {
+    delete[] * buffer;
+    *buffer = new uint8_t[frame.len];
+    webm_ctx->buffer = *buffer;
+    if (*buffer == NULL) {
+      return -1;
+    }
+    *buffer_size = frame.len;
+  }
+  *bytes_read = frame.len;
+  webm_ctx->timestamp_ns = block->GetTime(cluster);
+  webm_ctx->is_key_frame = block->IsKey();
+
+  mkvparser::MkvReader *const reader =
+      reinterpret_cast<mkvparser::MkvReader *>(webm_ctx->reader);
+  return frame.Read(reader, *buffer) ? -1 : 0;
+}
+
+// Calculate the greatest common divisor between two numbers.
+static int gcd(int a, int b) {
+  int remainder;
+  while (b > 0) {
+    remainder = a % b;
+    a = b;
+    b = remainder;
+  }
+  return a;
+}
+
+int webm_guess_framerate(struct WebmInputContext *webm_ctx,
+                         struct AvxInputContext *aom_ctx) {
+  uint32_t i = 0;
+  uint8_t *buffer = NULL;
+  size_t buffer_size = 0;
+  size_t bytes_read = 0;
+  assert(webm_ctx->buffer == NULL);
+  while (webm_ctx->timestamp_ns < 1000000000 && i < 50) {
+    if (webm_read_frame(webm_ctx, &buffer, &bytes_read, &buffer_size)) {
+      break;
+    }
+    ++i;
+  }
+  aom_ctx->framerate.numerator = (i - 1) * 1000000;
+  aom_ctx->framerate.denominator =
+      static_cast<int>(webm_ctx->timestamp_ns / 1000);
+  // Fraction might be represented in large numbers, like 49000000/980000
+  // for 50fps. Simplify as much as possible.
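+  // For instance, gcd(49000000, 980000) is 980000, so that example reduces
+  // to exactly 50/1.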
+ int g = gcd(aom_ctx->framerate.numerator, aom_ctx->framerate.denominator); + if (g != 0) { + aom_ctx->framerate.numerator /= g; + aom_ctx->framerate.denominator /= g; + } + + delete[] buffer; + webm_ctx->buffer = NULL; + + get_first_cluster(webm_ctx); + webm_ctx->block = NULL; + webm_ctx->block_entry = NULL; + webm_ctx->block_frame_index = 0; + webm_ctx->timestamp_ns = 0; + webm_ctx->reached_eos = 0; + + return 0; +} + +void webm_free(struct WebmInputContext *webm_ctx) { reset(webm_ctx); } diff --git a/libs/libaom/src/common/webmdec.h b/libs/libaom/src/common/webmdec.h new file mode 100644 index 000000000..5ac75cb30 --- /dev/null +++ b/libs/libaom/src/common/webmdec.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_COMMON_WEBMDEC_H_ +#define AOM_COMMON_WEBMDEC_H_ + +#include "common/tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AvxInputContext; + +struct WebmInputContext { + void *reader; + void *segment; + uint8_t *buffer; + const void *cluster; + const void *block_entry; + const void *block; + int block_frame_index; + int video_track_index; + uint64_t timestamp_ns; + int is_key_frame; + int reached_eos; +}; + +// Checks if the input is a WebM file. If so, initializes WebMInputContext so +// that webm_read_frame can be called to retrieve a video frame. +// Returns 1 on success and 0 on failure or input is not WebM file. +// TODO(vigneshv): Refactor this function into two smaller functions specific +// to their task. +int file_is_webm(struct WebmInputContext *webm_ctx, + struct AvxInputContext *aom_ctx); + +// Reads a WebM Video Frame. Memory for the buffer is created, owned and managed +// by this function. For the first call, |buffer| should be NULL and +// |*buffer_size| should be 0. Once all the frames are read and used, +// webm_free() should be called, otherwise there will be a leak. +// Parameters: +// webm_ctx - WebmInputContext object +// buffer - pointer where the frame data will be filled. +// bytes_read - pointer to bytes read. +// buffer_size - pointer to buffer size. +// Return values: +// 0 - Success +// 1 - End of Stream +// -1 - Error +int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, + size_t *bytes_read, size_t *buffer_size); + +// Guesses the frame rate of the input file based on the container timestamps. +int webm_guess_framerate(struct WebmInputContext *webm_ctx, + struct AvxInputContext *aom_ctx); + +// Resets the WebMInputContext. +void webm_free(struct WebmInputContext *webm_ctx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_WEBMDEC_H_ diff --git a/libs/libaom/src/common/webmenc.cc b/libs/libaom/src/common/webmenc.cc new file mode 100644 index 000000000..6ae7df646 --- /dev/null +++ b/libs/libaom/src/common/webmenc.cc @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/webmenc.h"
+
+#include <stdio.h>
+
+#include <string>
+
+#include "common/av1_config.h"
+#include "third_party/libwebm/mkvmuxer/mkvmuxer.h"
+#include "third_party/libwebm/mkvmuxer/mkvmuxerutil.h"
+#include "third_party/libwebm/mkvmuxer/mkvwriter.h"
+
+namespace {
+const uint64_t kDebugTrackUid = 0xDEADBEEF;
+const int kVideoTrackNumber = 1;
+}  // namespace
+
+int write_webm_file_header(struct WebmOutputContext *webm_ctx,
+                           aom_codec_ctx_t *encoder_ctx,
+                           const aom_codec_enc_cfg_t *cfg,
+                           stereo_format_t stereo_fmt, unsigned int fourcc,
+                           const struct AvxRational *par) {
+  mkvmuxer::MkvWriter *const writer = new mkvmuxer::MkvWriter(webm_ctx->stream);
+  mkvmuxer::Segment *const segment = new mkvmuxer::Segment();
+  if (!writer || !segment) {
+    fprintf(stderr, "webmenc> mkvmuxer objects alloc failed, out of memory?\n");
+    return -1;
+  }
+
+  bool ok = segment->Init(writer);
+  if (!ok) {
+    fprintf(stderr, "webmenc> mkvmuxer Init failed.\n");
+    return -1;
+  }
+
+  segment->set_mode(mkvmuxer::Segment::kFile);
+  segment->OutputCues(true);
+
+  mkvmuxer::SegmentInfo *const info = segment->GetSegmentInfo();
+  if (!info) {
+    fprintf(stderr, "webmenc> Cannot retrieve Segment Info.\n");
+    return -1;
+  }
+
+  const uint64_t kTimecodeScale = 1000000;
+  info->set_timecode_scale(kTimecodeScale);
+  std::string version = "aomenc";
+  if (!webm_ctx->debug) {
+    version.append(std::string(" ") + aom_codec_version_str());
+  }
+  info->set_writing_app(version.c_str());
+
+  const uint64_t video_track_id =
+      segment->AddVideoTrack(static_cast<int>(cfg->g_w),
+                             static_cast<int>(cfg->g_h), kVideoTrackNumber);
+  mkvmuxer::VideoTrack *const video_track = static_cast<mkvmuxer::VideoTrack *>(
+      segment->GetTrackByNumber(video_track_id));
+
+  if (!video_track) {
+    fprintf(stderr, "webmenc> Video track creation failed.\n");
+    return -1;
+  }
+
+  ok = false;
+  aom_fixed_buf_t *obu_sequence_header =
+      aom_codec_get_global_headers(encoder_ctx);
+  if (obu_sequence_header) {
+    Av1Config av1_config;
+    if (get_av1config_from_obu(
+            reinterpret_cast<const uint8_t *>(obu_sequence_header->buf),
+            obu_sequence_header->sz, false, &av1_config) == 0) {
+      uint8_t av1_config_buffer[4] = { 0 };
+      size_t bytes_written = 0;
+      if (write_av1config(&av1_config, sizeof(av1_config_buffer),
+                          &bytes_written, av1_config_buffer) == 0) {
+        ok = video_track->SetCodecPrivate(av1_config_buffer,
+                                          sizeof(av1_config_buffer));
+      }
+    }
+    free(obu_sequence_header->buf);
+    free(obu_sequence_header);
+  }
+  if (!ok) {
+    fprintf(stderr, "webmenc> Unable to set AV1 config.\n");
+    return -1;
+  }
+
+  ok = video_track->SetStereoMode(stereo_fmt);
+  if (!ok) {
+    fprintf(stderr, "webmenc> Unable to set stereo mode.\n");
+    return -1;
+  }
+
+  if (fourcc != AV1_FOURCC) {
+    fprintf(stderr, "webmenc> Unsupported codec (unknown 4 CC).\n");
+    return -1;
+  }
+  video_track->set_codec_id("V_AV1");
+
+  if (par->numerator > 1 || par->denominator > 1) {
+    // TODO(fgalligan): Add support of DisplayUnit, Display Aspect Ratio type
+    // to WebM format.
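+    // For example, a 1280x720 stream with a 4:3 pixel aspect ratio yields
+    // display_width = (1280 * 4) / 3 + 0.5, truncated to 1707.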
+    const uint64_t display_width = static_cast<uint64_t>(
+        ((cfg->g_w * par->numerator * 1.0) / par->denominator) + .5);
+    video_track->set_display_width(display_width);
+    video_track->set_display_height(cfg->g_h);
+  }
+
+  if (webm_ctx->debug) {
+    video_track->set_uid(kDebugTrackUid);
+  }
+
+  webm_ctx->writer = writer;
+  webm_ctx->segment = segment;
+
+  return 0;
+}
+
+int write_webm_block(struct WebmOutputContext *webm_ctx,
+                     const aom_codec_enc_cfg_t *cfg,
+                     const aom_codec_cx_pkt_t *pkt) {
+  if (!webm_ctx->segment) {
+    fprintf(stderr, "webmenc> segment is NULL.\n");
+    return -1;
+  }
+  mkvmuxer::Segment *const segment =
+      reinterpret_cast<mkvmuxer::Segment *>(webm_ctx->segment);
+  int64_t pts_ns = pkt->data.frame.pts * 1000000000ll * cfg->g_timebase.num /
+                   cfg->g_timebase.den;
+  if (pts_ns <= webm_ctx->last_pts_ns) pts_ns = webm_ctx->last_pts_ns + 1000000;
+  webm_ctx->last_pts_ns = pts_ns;
+
+  if (!segment->AddFrame(static_cast<const uint8_t *>(pkt->data.frame.buf),
+                         pkt->data.frame.sz, kVideoTrackNumber, pts_ns,
+                         pkt->data.frame.flags & AOM_FRAME_IS_KEY)) {
+    fprintf(stderr, "webmenc> AddFrame failed.\n");
+    return -1;
+  }
+  return 0;
+}
+
+int write_webm_file_footer(struct WebmOutputContext *webm_ctx) {
+  if (!webm_ctx->writer || !webm_ctx->segment) {
+    fprintf(stderr, "webmenc> segment or writer NULL.\n");
+    return -1;
+  }
+  mkvmuxer::MkvWriter *const writer =
+      reinterpret_cast<mkvmuxer::MkvWriter *>(webm_ctx->writer);
+  mkvmuxer::Segment *const segment =
+      reinterpret_cast<mkvmuxer::Segment *>(webm_ctx->segment);
+  const bool ok = segment->Finalize();
+  delete segment;
+  delete writer;
+  webm_ctx->writer = NULL;
+  webm_ctx->segment = NULL;
+
+  if (!ok) {
+    fprintf(stderr, "webmenc> Segment::Finalize failed.\n");
+    return -1;
+  }
+
+  return 0;
+}
diff --git a/libs/libaom/src/common/webmenc.h b/libs/libaom/src/common/webmenc.h
new file mode 100644
index 000000000..a4aa992b0
--- /dev/null
+++ b/libs/libaom/src/common/webmenc.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_WEBMENC_H_
+#define AOM_COMMON_WEBMENC_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "tools_common.h"
+#include "aom/aom_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct WebmOutputContext {
+  int debug;
+  FILE *stream;
+  int64_t last_pts_ns;
+  void *writer;
+  void *segment;
+};
+
+/* Stereo 3D packed frame format */
+enum {
+  STEREO_FORMAT_MONO = 0,
+  STEREO_FORMAT_LEFT_RIGHT = 1,
+  STEREO_FORMAT_BOTTOM_TOP = 2,
+  STEREO_FORMAT_TOP_BOTTOM = 3,
+  STEREO_FORMAT_RIGHT_LEFT = 11
+} UENUM1BYTE(stereo_format_t);
+
+// The following functions wrap libwebm's mkvmuxer. All functions return 0 upon
+// success, or -1 upon failure.
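+//
+// A rough (hypothetical) call sequence, with error handling elided; the
+// variable names and initial values below are illustrative, not part of
+// this header:
+//
+//   struct WebmOutputContext webm_ctx = { 0 };
+//   webm_ctx.stream = out_file;
+//   webm_ctx.last_pts_ns = -1;
+//   write_webm_file_header(&webm_ctx, &codec, &cfg, STEREO_FORMAT_MONO,
+//                          AV1_FOURCC, &par);
+//   // ...for each AOM_CODEC_CX_FRAME_PKT produced by the encoder:
+//   write_webm_block(&webm_ctx, &cfg, pkt);
+//   // ...after the last packet:
+//   write_webm_file_footer(&webm_ctx);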
+
+int write_webm_file_header(struct WebmOutputContext *webm_ctx,
+                           aom_codec_ctx_t *encoder_ctx,
+                           const aom_codec_enc_cfg_t *cfg,
+                           stereo_format_t stereo_fmt, unsigned int fourcc,
+                           const struct AvxRational *par);
+
+int write_webm_block(struct WebmOutputContext *webm_ctx,
+                     const aom_codec_enc_cfg_t *cfg,
+                     const aom_codec_cx_pkt_t *pkt);
+
+int write_webm_file_footer(struct WebmOutputContext *webm_ctx);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_COMMON_WEBMENC_H_
diff --git a/libs/libaom/src/common/y4menc.c b/libs/libaom/src/common/y4menc.c
new file mode 100644
index 000000000..e3f5d5b38
--- /dev/null
+++ b/libs/libaom/src/common/y4menc.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "common/rawenc.h"
+#include "common/y4menc.h"
+
+// Returns the Y4M name associated with the monochrome colorspace.
+static const char *monochrome_colorspace(unsigned int bit_depth) {
+  switch (bit_depth) {
+    case 8: return "Cmono";
+    case 9: return "Cmono9";
+    case 10: return "Cmono10";
+    case 12: return "Cmono12";
+    case 16: return "Cmono16";
+    default: assert(0); return NULL;
+  }
+}
+
+// Return the Y4M name of the 8-bit colorspace, given the chroma position and
+// image format.
+const char *colorspace8(aom_chroma_sample_position_t csp, aom_img_fmt_t fmt) {
+  switch (fmt) {
+    case AOM_IMG_FMT_I444: return "C444";
+    case AOM_IMG_FMT_I422: return "C422";
+    default:
+      if (csp == AOM_CSP_VERTICAL) {
+        return "C420mpeg2 XYSCSS=420MPEG2";
+      } else if (csp == AOM_CSP_COLOCATED) {
+        // Note that Y4M does not have a dedicated header for colocated chroma,
+        // and that FFMPEG interprets C420 as C420jpeg.
+        return "C420";
+      } else {
+        return "C420jpeg";
+      }
+  }
+}
+
+// Return the Y4M name of the colorspace, given the bit depth and image format.
+static const char *colorspace(unsigned int bit_depth,
+                              aom_chroma_sample_position_t csp,
+                              aom_img_fmt_t fmt) {
+  switch (bit_depth) {
+    case 8: return colorspace8(csp, fmt);
+    case 9:
+      return fmt == AOM_IMG_FMT_I44416
+                 ? "C444p9 XYSCSS=444P9"
+                 : fmt == AOM_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9"
+                                             : "C420p9 XYSCSS=420P9";
+    case 10:
+      return fmt == AOM_IMG_FMT_I44416
+                 ? "C444p10 XYSCSS=444P10"
+                 : fmt == AOM_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10"
+                                             : "C420p10 XYSCSS=420P10";
+    case 12:
+      return fmt == AOM_IMG_FMT_I44416
+                 ? "C444p12 XYSCSS=444P12"
+                 : fmt == AOM_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12"
+                                             : "C420p12 XYSCSS=420P12";
+    case 14:
+      return fmt == AOM_IMG_FMT_I44416
+                 ? "C444p14 XYSCSS=444P14"
+                 : fmt == AOM_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14"
+                                             : "C420p14 XYSCSS=420P14";
+    case 16:
+      return fmt == AOM_IMG_FMT_I44416
+                 ? "C444p16 XYSCSS=444P16"
+                 : fmt == AOM_IMG_FMT_I42216 ?
"C422p16 XYSCSS=422P16" + : "C420p16 XYSCSS=420P16"; + default: assert(0); return NULL; + } +} + +int y4m_write_file_header(char *buf, size_t len, int width, int height, + const struct AvxRational *framerate, int monochrome, + aom_chroma_sample_position_t csp, aom_img_fmt_t fmt, + unsigned int bit_depth) { + const char *color = monochrome ? monochrome_colorspace(bit_depth) + : colorspace(bit_depth, csp, fmt); + return snprintf(buf, len, "YUV4MPEG2 W%u H%u F%u:%u I%c %s\n", width, height, + framerate->numerator, framerate->denominator, 'p', color); +} + +int y4m_write_frame_header(char *buf, size_t len) { + return snprintf(buf, len, "FRAME\n"); +} + +void y4m_write_image_file(const aom_image_t *img, const int *planes, + FILE *file) { + int num_planes = img->monochrome ? 1 : 3; + raw_write_image_file(img, planes, num_planes, file); +} + +void y4m_update_image_md5(const aom_image_t *img, const int *planes, + MD5Context *md5) { + int num_planes = img->monochrome ? 1 : 3; + raw_update_image_md5(img, planes, num_planes, md5); +} diff --git a/libs/libaom/src/common/y4menc.h b/libs/libaom/src/common/y4menc.h new file mode 100644 index 000000000..f6d5fd86b --- /dev/null +++ b/libs/libaom/src/common/y4menc.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_COMMON_Y4MENC_H_ +#define AOM_COMMON_Y4MENC_H_ + +#include "aom/aom_decoder.h" +#include "common/md5_utils.h" +#include "common/tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define Y4M_BUFFER_SIZE 128 + +int y4m_write_file_header(char *buf, size_t len, int width, int height, + const struct AvxRational *framerate, int monochrome, + aom_chroma_sample_position_t csp, aom_img_fmt_t fmt, + unsigned int bit_depth); +int y4m_write_frame_header(char *buf, size_t len); +void y4m_write_image_file(const aom_image_t *img, const int *planes, + FILE *file); +void y4m_update_image_md5(const aom_image_t *img, const int *planes, + MD5Context *md5); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_Y4MENC_H_ diff --git a/libs/libaom/src/common/y4minput.c b/libs/libaom/src/common/y4minput.c new file mode 100644 index 000000000..f3dfaafc6 --- /dev/null +++ b/libs/libaom/src/common/y4minput.c @@ -0,0 +1,1153 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + * + * Based on code from the OggTheora software codec source code, + * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors. 
+ */
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/msvc.h"
+#include "y4minput.h"
+
+// Reads 'size' bytes from 'file' into 'buf' with some fault tolerance.
+// Returns true on success.
+static int file_read(void *buf, size_t size, FILE *file) {
+  const int kMaxRetries = 5;
+  int retry_count = 0;
+  int file_error;
+  size_t len = 0;
+  do {
+    const size_t n = fread((uint8_t *)buf + len, 1, size - len, file);
+    len += n;
+    file_error = ferror(file);
+    if (file_error) {
+      if (errno == EINTR || errno == EAGAIN) {
+        clearerr(file);
+        continue;
+      } else {
+        fprintf(stderr, "Error reading file: %u of %u bytes read, %d: %s\n",
                (uint32_t)len, (uint32_t)size, errno, strerror(errno));
+        return 0;
+      }
+    }
+  } while (!feof(file) && len < size && ++retry_count < kMaxRetries);
+
+  if (!feof(file) && len != size) {
+    fprintf(stderr,
+            "Error reading file: %u of %u bytes read,"
+            " error: %d, retries: %d, %d: %s\n",
+            (uint32_t)len, (uint32_t)size, file_error, retry_count, errno,
+            strerror(errno));
+  }
+  return len == size;
+}
+
+static int y4m_parse_tags(y4m_input *_y4m, char *_tags) {
+  int got_w;
+  int got_h;
+  int got_fps;
+  int got_interlace;
+  int got_par;
+  int got_chroma;
+  char *p;
+  char *q;
+  got_w = got_h = got_fps = got_interlace = got_par = got_chroma = 0;
+  for (p = _tags;; p = q) {
+    /*Skip any leading spaces.*/
+    while (*p == ' ') p++;
+    /*If that's all we have, stop.*/
+    if (p[0] == '\0') break;
+    /*Find the end of this tag.*/
+    for (q = p + 1; *q != '\0' && *q != ' '; q++) {
+    }
+    /*Process the tag.*/
+    switch (p[0]) {
+      case 'W': {
+        if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1) return -1;
+        got_w = 1;
+      } break;
+      case 'H': {
+        if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1) return -1;
+        got_h = 1;
+      } break;
+      case 'F': {
+        if (sscanf(p + 1, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) {
+          return -1;
+        }
+        got_fps = 1;
+      } break;
+      case 'I': {
+        _y4m->interlace = p[1];
+        got_interlace = 1;
+      } break;
+      case 'A': {
+        if (sscanf(p + 1, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) {
+          return -1;
+        }
+        got_par = 1;
+      } break;
+      case 'C': {
+        if (q - p > 16) return -1;
+        memcpy(_y4m->chroma_type, p + 1, q - p - 1);
+        _y4m->chroma_type[q - p - 1] = '\0';
+        got_chroma = 1;
+      } break;
+        /*Ignore unknown tags.*/
+    }
+  }
+  if (!got_w || !got_h || !got_fps) return -1;
+  if (!got_interlace) _y4m->interlace = '?';
+  if (!got_par) _y4m->par_n = _y4m->par_d = 0;
+  /*Chroma-type is not specified in older files, e.g., those generated by
+     mplayer.*/
+  if (!got_chroma)
+    snprintf(_y4m->chroma_type, sizeof(_y4m->chroma_type), "420");
+  return 0;
+}
+
+/*All anti-aliasing filters in the following conversion functions are based on
+  one of two window functions:
+  The 6-tap Lanczos window (for down-sampling and shifts):
+   sinc(\pi*t)*sinc(\pi*t/3), |t|<3  (sinc(t)==sin(t)/t)
+   0,                         |t|>=3
+  The 4-tap Mitchell window (for up-sampling):
+   7|t|^3-12|t|^2+16/3,             |t|<1
+   -(7/3)|t|^3+12|t|^2-20|t|+32/3,  |t|<2
+   0,                               |t|>=2
+  The number of taps is intentionally kept small to reduce computational
+   overhead and limit ringing.
+
+  The taps from these filters are scaled so that their sum is 1, and the
+   result is scaled by 128 and rounded to integers to create a filter whose
+   intermediate values fit inside 16 bits.
+  Coefficients are rounded in such a way as to ensure their sum is still 128,
+   which is usually equivalent to normal rounding.
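+  For example, the quarter-pel shift filter used below, [4 -17 114 35 -9 1],
+   sums to exactly 128, so the final add of 64 and right-shift by 7 divides
+   the scale back out with correct rounding.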
+ + Conversions which require both horizontal and vertical filtering could + have these steps pipelined, for less memory consumption and better cache + performance, but we do them separately for simplicity.*/ +#define OC_MINI(_a, _b) ((_a) > (_b) ? (_b) : (_a)) +#define OC_MAXI(_a, _b) ((_a) < (_b) ? (_b) : (_a)) +#define OC_CLAMPI(_a, _b, _c) (OC_MAXI(_a, OC_MINI(_b, _c))) + +/*420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 420mpeg2 chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + BR | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + BR | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + We use a resampling filter to shift the site locations one quarter pixel (at + the chroma plane's resolution) to the right. + The 4:2:2 modes look exactly the same, except there are twice as many chroma + lines, and they are vertically co-sited with the luma samples in both the + mpeg2 and jpeg cases (thus requiring no vertical resampling).*/ +static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst, + const unsigned char *_src, int _c_w, + int _c_h) { + int y; + int x; + for (y = 0; y < _c_h; y++) { + /*Filter: [4 -17 114 35 -9 1]/128, derived from a 6-tap Lanczos + window.*/ + for (x = 0; x < OC_MINI(_c_w, 2); x++) { + _dst[x] = (unsigned char)OC_CLAMPI( + 0, + (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[OC_MINI(x + 3, _c_w - 1)] + + 64) >> + 7, + 255); + } + for (; x < _c_w - 3; x++) { + _dst[x] = (unsigned char)OC_CLAMPI( + 0, + (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >> + 7, + 255); + } + for (; x < _c_w; x++) { + _dst[x] = (unsigned char)OC_CLAMPI( + 0, + (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >> + 7, + 255); + } + _dst += _c_w; + _src += _c_w; + } +} + +/*Handles both 422 and 420mpeg2 to 422jpeg and 420jpeg, respectively.*/ +static void y4m_convert_42xmpeg2_42xjpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + int c_w; + int c_h; + int c_sz; + int pli; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + for (pli = 1; pli < 3; pli++) { + y4m_42xmpeg2_42xjpeg_helper(_dst, _aux, c_w, c_h); + _dst += c_sz; + _aux += c_sz; + } +} + +/*This format is only used for interlaced content, but is included for + completeness. 
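+  (420paldv is the chroma siting produced by PAL DV sources.)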
+ + 420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 420paldv chroma samples are sited like: + YR------Y-------YR------Y------- + | | | | + | | | | + | | | | + YB------Y-------YB------Y------- + | | | | + | | | | + | | | | + YR------Y-------YR------Y------- + | | | | + | | | | + | | | | + YB------Y-------YB------Y------- + | | | | + | | | | + | | | | + + We use a resampling filter to shift the site locations one quarter pixel (at + the chroma plane's resolution) to the right. + Then we use another filter to move the C_r location down one quarter pixel, + and the C_b location up one quarter pixel.*/ +static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + unsigned char *tmp; + int c_w; + int c_h; + int c_sz; + int pli; + int y; + int x; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + 1) / 2; + c_h = (_y4m->pic_h + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + c_sz = c_w * c_h; + tmp = _aux + 2 * c_sz; + for (pli = 1; pli < 3; pli++) { + /*First do the horizontal re-sampling. + This is the same as the mpeg2 case, except that after the horizontal + case, we need to apply a second vertical filter.*/ + y4m_42xmpeg2_42xjpeg_helper(tmp, _aux, c_w, c_h); + _aux += c_sz; + switch (pli) { + case 1: { + /*Slide C_b up a quarter-pel. + This is the same filter used above, but in the other order.*/ + for (x = 0; x < c_w; x++) { + for (y = 0; y < OC_MINI(c_h, 3); y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] + + 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >> + 7, + 255); + } + for (; y < c_h - 2; y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >> + 7, + 255); + } + for (; y < c_h; y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[(c_h - 1) * c_w] + 64) >> + 7, + 255); + } + _dst++; + tmp++; + } + _dst += c_sz - c_w; + tmp -= c_w; + } break; + case 2: { + /*Slide C_r down a quarter-pel. 
+ This is the same as the horizontal filter.*/ + for (x = 0; x < c_w; x++) { + for (y = 0; y < OC_MINI(c_h, 2); y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (4 * tmp[0] - 17 * tmp[OC_MAXI(y - 1, 0) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - + 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + + tmp[OC_MINI(y + 3, c_h - 1) * c_w] + 64) >> + 7, + 255); + } + for (; y < c_h - 3; y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] - + 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >> + 7, + 255); + } + for (; y < c_h; y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - + 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + tmp[(c_h - 1) * c_w] + + 64) >> + 7, + 255); + } + _dst++; + tmp++; + } + } break; + } + /*For actual interlaced material, this would have to be done separately on + each field, and the shift amounts would be different. + C_r moves down 1/8, C_b up 3/8 in the top field, and C_r moves down 3/8, + C_b up 1/8 in the bottom field. + The corresponding filters would be: + Down 1/8 (reverse order for up): [3 -11 125 15 -4 0]/128 + Down 3/8 (reverse order for up): [4 -19 98 56 -13 2]/128*/ + } +} + +/*Perform vertical filtering to reduce a single plane from 4:2:2 to 4:2:0. + This is used as a helper by several conversion routines.*/ +static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst, + const unsigned char *_src, int _c_w, + int _c_h) { + int y; + int x; + /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ + for (x = 0; x < _c_w; x++) { + for (y = 0; y < OC_MINI(_c_h, 2); y += 2) { + _dst[(y >> 1) * _c_w] = + OC_CLAMPI(0, + (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] - + 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] + + 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >> + 7, + 255); + } + for (; y < _c_h - 3; y += 2) { + _dst[(y >> 1) * _c_w] = + OC_CLAMPI(0, + (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) - + 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) + + 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >> + 7, + 255); + } + for (; y < _c_h; y += 2) { + _dst[(y >> 1) * _c_w] = OC_CLAMPI( + 0, + (3 * (_src[(y - 2) * _c_w] + _src[(_c_h - 1) * _c_w]) - + 17 * (_src[(y - 1) * _c_w] + _src[OC_MINI(y + 2, _c_h - 1) * _c_w]) + + 78 * (_src[y * _c_w] + _src[OC_MINI(y + 1, _c_h - 1) * _c_w]) + + 64) >> + 7, + 255); + } + _src++; + _dst++; + } +} + +/*420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 422jpeg chroma samples are sited like: + Y---BR--Y-------Y---BR--Y------- + | | | | + | | | | + | | | | + Y---BR--Y-------Y---BR--Y------- + | | | | + | | | | + | | | | + Y---BR--Y-------Y---BR--Y------- + | | | | + | | | | + | | | | + Y---BR--Y-------Y---BR--Y------- + | | | | + | | | | + | | | | + + We use a resampling filter to decimate the chroma planes by two in the + vertical direction.*/ +static void y4m_convert_422jpeg_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + int c_w; + int c_h; + int c_sz; + int dst_c_w; + int dst_c_h; + int dst_c_sz; + int pli; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * 
_y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; + c_h = _y4m->pic_h; + dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + dst_c_sz = dst_c_w * dst_c_h; + for (pli = 1; pli < 3; pli++) { + y4m_422jpeg_420jpeg_helper(_dst, _aux, c_w, c_h); + _aux += c_sz; + _dst += dst_c_sz; + } +} + +/*420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 422 chroma samples are sited like: + YBR-----Y-------YBR-----Y------- + | | | | + | | | | + | | | | + YBR-----Y-------YBR-----Y------- + | | | | + | | | | + | | | | + YBR-----Y-------YBR-----Y------- + | | | | + | | | | + | | | | + YBR-----Y-------YBR-----Y------- + | | | | + | | | | + | | | | + + We use a resampling filter to shift the original site locations one quarter + pixel (at the original chroma resolution) to the right. + Then we use a second resampling filter to decimate the chroma planes by two + in the vertical direction.*/ +static void y4m_convert_422_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + unsigned char *tmp; + int c_w; + int c_h; + int c_sz; + int dst_c_h; + int dst_c_sz; + int pli; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; + c_h = _y4m->pic_h; + dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + dst_c_sz = c_w * dst_c_h; + tmp = _aux + 2 * c_sz; + for (pli = 1; pli < 3; pli++) { + /*In reality, the horizontal and vertical steps could be pipelined, for + less memory consumption and better cache performance, but we do them + separately for simplicity.*/ + /*First do horizontal filtering (convert to 422jpeg)*/ + y4m_42xmpeg2_42xjpeg_helper(tmp, _aux, c_w, c_h); + /*Now do the vertical filtering.*/ + y4m_422jpeg_420jpeg_helper(_dst, tmp, c_w, c_h); + _aux += c_sz; + _dst += dst_c_sz; + } +} + +/*420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 411 chroma samples are sited like: + YBR-----Y-------Y-------Y------- + | | | | + | | | | + | | | | + YBR-----Y-------Y-------Y------- + | | | | + | | | | + | | | | + YBR-----Y-------Y-------Y------- + | | | | + | | | | + | | | | + YBR-----Y-------Y-------Y------- + | | | | + | | | | + | | | | + + We use a filter to resample at site locations one eighth pixel (at the source + chroma plane's horizontal resolution) and five eighths of a pixel to the + right. 
+ Then we use another filter to decimate the planes by 2 in the vertical + direction.*/ +static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + unsigned char *tmp; + int c_w; + int c_h; + int c_sz; + int dst_c_w; + int dst_c_h; + int dst_c_sz; + int tmp_sz; + int pli; + int y; + int x; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; + c_h = _y4m->pic_h; + dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + dst_c_sz = dst_c_w * dst_c_h; + tmp_sz = dst_c_w * c_h; + tmp = _aux + 2 * c_sz; + for (pli = 1; pli < 3; pli++) { + /*In reality, the horizontal and vertical steps could be pipelined, for + less memory consumption and better cache performance, but we do them + separately for simplicity.*/ + /*First do horizontal filtering (convert to 422jpeg)*/ + for (y = 0; y < c_h; y++) { + /*Filters: [1 110 18 -1]/128 and [-3 50 86 -5]/128, both derived from a + 4-tap Mitchell window.*/ + for (x = 0; x < OC_MINI(c_w, 1); x++) { + tmp[x << 1] = (unsigned char)OC_CLAMPI( + 0, + (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] - + _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, + 255); + tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( + 0, + (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] - + 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, + 255); + } + for (; x < c_w - 2; x++) { + tmp[x << 1] = + (unsigned char)OC_CLAMPI(0, + (_aux[x - 1] + 110 * _aux[x] + + 18 * _aux[x + 1] - _aux[x + 2] + 64) >> + 7, + 255); + tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( + 0, + (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] - + 5 * _aux[x + 2] + 64) >> + 7, + 255); + } + for (; x < c_w; x++) { + tmp[x << 1] = (unsigned char)OC_CLAMPI( + 0, + (_aux[x - 1] + 110 * _aux[x] + 18 * _aux[OC_MINI(x + 1, c_w - 1)] - + _aux[c_w - 1] + 64) >> + 7, + 255); + if ((x << 1 | 1) < dst_c_w) { + tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( + 0, + (-3 * _aux[x - 1] + 50 * _aux[x] + + 86 * _aux[OC_MINI(x + 1, c_w - 1)] - 5 * _aux[c_w - 1] + 64) >> + 7, + 255); + } + } + tmp += dst_c_w; + _aux += c_w; + } + tmp -= tmp_sz; + /*Now do the vertical filtering.*/ + y4m_422jpeg_420jpeg_helper(_dst, tmp, dst_c_w, c_h); + _dst += dst_c_sz; + } +} + +/*Convert 444 to 420jpeg.*/ +static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + unsigned char *tmp; + int c_w; + int c_h; + int c_sz; + int dst_c_w; + int dst_c_h; + int dst_c_sz; + int tmp_sz; + int pli; + int y; + int x; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; + c_h = _y4m->pic_h; + dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + dst_c_sz = dst_c_w * dst_c_h; + tmp_sz = dst_c_w * c_h; + tmp = _aux + 2 * c_sz; + for (pli = 1; pli < 3; pli++) { + /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ + for (y = 0; y < c_h; y++) { + for (x = 0; x < OC_MINI(c_w, 2); x += 2) { + tmp[x >> 1] = OC_CLAMPI(0, + (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] - + 17 * _aux[OC_MINI(2, c_w - 1)] + + 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >> + 7, + 255); + } + for (; x < c_w - 3; x += 2) { + tmp[x >> 1] = OC_CLAMPI(0, + (3 * 
(_aux[x - 2] + _aux[x + 3]) - + 17 * (_aux[x - 1] + _aux[x + 2]) + + 78 * (_aux[x] + _aux[x + 1]) + 64) >> + 7, + 255); + } + for (; x < c_w; x += 2) { + tmp[x >> 1] = + OC_CLAMPI(0, + (3 * (_aux[x - 2] + _aux[c_w - 1]) - + 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) + + 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >> + 7, + 255); + } + tmp += dst_c_w; + _aux += c_w; + } + tmp -= tmp_sz; + /*Now do the vertical filtering.*/ + y4m_422jpeg_420jpeg_helper(_dst, tmp, dst_c_w, c_h); + _dst += dst_c_sz; + } +} + +/*The image is padded with empty chroma components at 4:2:0.*/ +static void y4m_convert_mono_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + int c_sz; + (void)_aux; + _dst += _y4m->pic_w * _y4m->pic_h; + c_sz = ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) * + ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v); + memset(_dst, 128, c_sz * 2); +} + +/*No conversion function needed.*/ +static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + (void)_y4m; + (void)_dst; + (void)_aux; +} + +int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, + aom_chroma_sample_position_t csp, int only_420) { + char buffer[80] = { 0 }; + int ret; + int i; + /*Read until newline, or 80 cols, whichever happens first.*/ + for (i = 0; i < 79; i++) { + if (_nskip > 0) { + buffer[i] = *_skip++; + _nskip--; + } else { + if (!file_read(buffer + i, 1, _fin)) return -1; + } + if (buffer[i] == '\n') break; + } + /*We skipped too much header data.*/ + if (_nskip > 0) return -1; + if (i == 79) { + fprintf(stderr, "Error parsing header; not a YUV2MPEG2 file?\n"); + return -1; + } + buffer[i] = '\0'; + if (memcmp(buffer, "YUV4MPEG", 8)) { + fprintf(stderr, "Incomplete magic for YUV4MPEG file.\n"); + return -1; + } + if (buffer[8] != '2') { + fprintf(stderr, "Incorrect YUV input file version; YUV4MPEG2 required.\n"); + } + ret = y4m_parse_tags(_y4m, buffer + 5); + if (ret < 0) { + fprintf(stderr, "Error parsing YUV4MPEG2 header.\n"); + return ret; + } + if (_y4m->interlace == '?') { + fprintf(stderr, + "Warning: Input video interlacing format unknown; " + "assuming progressive scan.\n"); + } else if (_y4m->interlace != 'p') { + fprintf(stderr, + "Input video is interlaced; " + "Only progressive scan handled.\n"); + return -1; + } + /* Only support vertical chroma sample position if the input format is + * already 420mpeg2. Colocated is not supported in Y4M. + */ + if (csp == AOM_CSP_VERTICAL && strcmp(_y4m->chroma_type, "420mpeg2") != 0) { + fprintf(stderr, + "Vertical chroma sample position only supported " + "for 420mpeg2 input\n"); + return -1; + } + if (csp == AOM_CSP_COLOCATED) { + fprintf(stderr, "Colocated chroma sample position not supported in Y4M\n"); + return -1; + } + _y4m->aom_fmt = AOM_IMG_FMT_I420; + _y4m->bps = 12; + _y4m->bit_depth = 8; + if (strcmp(_y4m->chroma_type, "420") == 0 || + strcmp(_y4m->chroma_type, "420jpeg") == 0) { + _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = + _y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + /* Natively supported: no conversion required. 
*/ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + } else if (strcmp(_y4m->chroma_type, "420p10") == 0) { + _y4m->src_c_dec_h = 2; + _y4m->dst_c_dec_h = 2; + _y4m->src_c_dec_v = 2; + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = + 2 * (_y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2)); + /* Natively supported: no conversion required. */ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + _y4m->bit_depth = 10; + _y4m->bps = 15; + _y4m->aom_fmt = AOM_IMG_FMT_I42016; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 420p10 to 420jpeg\n"); + return -1; + } + } else if (strcmp(_y4m->chroma_type, "420p12") == 0) { + _y4m->src_c_dec_h = 2; + _y4m->dst_c_dec_h = 2; + _y4m->src_c_dec_v = 2; + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = + 2 * (_y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2)); + /* Natively supported: no conversion required. */ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + _y4m->bit_depth = 12; + _y4m->bps = 18; + _y4m->aom_fmt = AOM_IMG_FMT_I42016; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n"); + return -1; + } + } else if (strcmp(_y4m->chroma_type, "420mpeg2") == 0) { + _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + /*Chroma filter required: read into the aux buf first.*/ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + _y4m->convert = y4m_convert_null; + if (csp != AOM_CSP_VERTICAL) { + _y4m->convert = y4m_convert_42xmpeg2_42xjpeg; + snprintf(_y4m->chroma_type, sizeof(_y4m->chroma_type), "420"); + } + } else if (strcmp(_y4m->chroma_type, "420paldv") == 0) { + _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + /*Chroma filter required: read into the aux buf first. + We need to make two filter passes, so we need some extra space in the + aux buffer.*/ + _y4m->aux_buf_sz = 3 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + _y4m->aux_buf_read_sz = + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + _y4m->convert = y4m_convert_42xpaldv_42xjpeg; + } else if (strcmp(_y4m->chroma_type, "422jpeg") == 0) { + _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2; + _y4m->src_c_dec_v = 1; + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + /*Chroma filter required: read into the aux buf first.*/ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->convert = y4m_convert_422jpeg_420jpeg; + } else if (strcmp(_y4m->chroma_type, "422") == 0) { + _y4m->src_c_dec_h = 2; + _y4m->src_c_dec_v = 1; + if (only_420) { + _y4m->dst_c_dec_h = 2; + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + /*Chroma filter required: read into the aux buf first. 
+ We need to make two filter passes, so we need some extra space in the + aux buffer.*/ + _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->aux_buf_sz = + _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->convert = y4m_convert_422_420jpeg; + } else { + _y4m->aom_fmt = AOM_IMG_FMT_I422; + _y4m->bps = 16; + _y4m->dst_c_dec_h = _y4m->src_c_dec_h; + _y4m->dst_c_dec_v = _y4m->src_c_dec_v; + _y4m->dst_buf_read_sz = + _y4m->pic_w * _y4m->pic_h + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + /*Natively supported: no conversion required.*/ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + } + } else if (strcmp(_y4m->chroma_type, "422p10") == 0) { + _y4m->src_c_dec_h = 2; + _y4m->src_c_dec_v = 1; + _y4m->aom_fmt = AOM_IMG_FMT_I42216; + _y4m->bps = 20; + _y4m->bit_depth = 10; + _y4m->dst_c_dec_h = _y4m->src_c_dec_h; + _y4m->dst_c_dec_v = _y4m->src_c_dec_v; + _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h); + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 422p10 to 420jpeg\n"); + return -1; + } + } else if (strcmp(_y4m->chroma_type, "422p12") == 0) { + _y4m->src_c_dec_h = 2; + _y4m->src_c_dec_v = 1; + _y4m->aom_fmt = AOM_IMG_FMT_I42216; + _y4m->bps = 24; + _y4m->bit_depth = 12; + _y4m->dst_c_dec_h = _y4m->src_c_dec_h; + _y4m->dst_c_dec_v = _y4m->src_c_dec_v; + _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h); + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 422p12 to 420jpeg\n"); + return -1; + } + } else if (strcmp(_y4m->chroma_type, "411") == 0) { + _y4m->src_c_dec_h = 4; + _y4m->dst_c_dec_h = 2; + _y4m->src_c_dec_v = 1; + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + /*Chroma filter required: read into the aux buf first. + We need to make two filter passes, so we need some extra space in the + aux buffer.*/ + _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 3) / 4) * _y4m->pic_h; + _y4m->aux_buf_sz = + _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->convert = y4m_convert_411_420jpeg; + } else if (strcmp(_y4m->chroma_type, "444") == 0) { + _y4m->src_c_dec_h = 1; + _y4m->src_c_dec_v = 1; + if (only_420) { + _y4m->dst_c_dec_h = 2; + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + /*Chroma filter required: read into the aux buf first. 
+ We need to make two filter passes, so we need some extra space in the + aux buffer.*/ + _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h; + _y4m->aux_buf_sz = + _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->convert = y4m_convert_444_420jpeg; + } else { + _y4m->aom_fmt = AOM_IMG_FMT_I444; + _y4m->bps = 24; + _y4m->dst_c_dec_h = _y4m->src_c_dec_h; + _y4m->dst_c_dec_v = _y4m->src_c_dec_v; + _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h; + /*Natively supported: no conversion required.*/ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + } + } else if (strcmp(_y4m->chroma_type, "444p10") == 0) { + _y4m->src_c_dec_h = 1; + _y4m->src_c_dec_v = 1; + _y4m->aom_fmt = AOM_IMG_FMT_I44416; + _y4m->bps = 30; + _y4m->bit_depth = 10; + _y4m->dst_c_dec_h = _y4m->src_c_dec_h; + _y4m->dst_c_dec_v = _y4m->src_c_dec_v; + _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h; + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 444p10 to 420jpeg\n"); + return -1; + } + } else if (strcmp(_y4m->chroma_type, "444p12") == 0) { + _y4m->src_c_dec_h = 1; + _y4m->src_c_dec_v = 1; + _y4m->aom_fmt = AOM_IMG_FMT_I44416; + _y4m->bps = 36; + _y4m->bit_depth = 12; + _y4m->dst_c_dec_h = _y4m->src_c_dec_h; + _y4m->dst_c_dec_v = _y4m->src_c_dec_v; + _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h; + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n"); + return -1; + } + } else if (strcmp(_y4m->chroma_type, "444alpha") == 0) { + _y4m->src_c_dec_h = 1; + _y4m->src_c_dec_v = 1; + if (only_420) { + _y4m->dst_c_dec_h = 2; + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + /*Chroma filter required: read into the aux buf first. + We need to make two filter passes, so we need some extra space in the + aux buffer. + The extra plane also gets read into the aux buf. 
+ It will be discarded.*/ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h; + _y4m->convert = y4m_convert_444_420jpeg; + } else { + fprintf(stderr, "Unsupported format: 444A\n"); + return -1; + } + } else if (strcmp(_y4m->chroma_type, "mono") == 0) { + _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0; + _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + /*No extra space required, but we need to clear the chroma planes.*/ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_mono_420jpeg; + } else { + fprintf(stderr, "Unknown chroma sampling type: %s\n", _y4m->chroma_type); + return -1; + } + /*The size of the final frame buffers is always computed from the + destination chroma decimation type.*/ + _y4m->dst_buf_sz = + _y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) * + ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v); + if (_y4m->bit_depth == 8) + _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz); + else + _y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz); + + if (_y4m->aux_buf_sz > 0) + _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz); + return 0; +} + +void y4m_input_close(y4m_input *_y4m) { + free(_y4m->dst_buf); + free(_y4m->aux_buf); +} + +int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, aom_image_t *_img) { + char frame[6]; + int pic_sz; + int c_w; + int c_h; + int c_sz; + int bytes_per_sample = _y4m->bit_depth > 8 ? 2 : 1; + /*Read and skip the frame header.*/ + if (!file_read(frame, 6, _fin)) return 0; + if (memcmp(frame, "FRAME", 5)) { + fprintf(stderr, "Loss of framing in Y4M input data\n"); + return -1; + } + if (frame[5] != '\n') { + char c; + int j; + for (j = 0; j < 79 && file_read(&c, 1, _fin) && c != '\n'; j++) { + } + if (j == 79) { + fprintf(stderr, "Error parsing Y4M frame header\n"); + return -1; + } + } + /*Read the frame data that needs no conversion.*/ + if (!file_read(_y4m->dst_buf, _y4m->dst_buf_read_sz, _fin)) { + fprintf(stderr, "Error reading Y4M frame data.\n"); + return -1; + } + /*Read the frame data that does need conversion.*/ + if (!file_read(_y4m->aux_buf, _y4m->aux_buf_read_sz, _fin)) { + fprintf(stderr, "Error reading Y4M frame data.\n"); + return -1; + } + /*Now convert the just read frame.*/ + (*_y4m->convert)(_y4m, _y4m->dst_buf, _y4m->aux_buf); + /*Fill in the frame buffer pointers. 
+    We don't use aom_img_wrap() because it forces padding for odd picture
+    sizes, which would require a separate fread call for every row.*/
+  memset(_img, 0, sizeof(*_img));
+  /*Y4M has the planes in Y'CbCr order, which libaom calls Y, U, and V.*/
+  _img->fmt = _y4m->aom_fmt;
+  _img->w = _img->d_w = _y4m->pic_w;
+  _img->h = _img->d_h = _y4m->pic_h;
+  _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
+  _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
+  _img->bps = _y4m->bps;
+
+  /*Set up the buffer pointers.*/
+  pic_sz = _y4m->pic_w * _y4m->pic_h * bytes_per_sample;
+  c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+  c_w *= bytes_per_sample;
+  c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+  c_sz = c_w * c_h;
+  _img->stride[AOM_PLANE_Y] = _y4m->pic_w * bytes_per_sample;
+  _img->stride[AOM_PLANE_U] = _img->stride[AOM_PLANE_V] = c_w;
+  _img->planes[AOM_PLANE_Y] = _y4m->dst_buf;
+  _img->planes[AOM_PLANE_U] = _y4m->dst_buf + pic_sz;
+  _img->planes[AOM_PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
+  return 1;
+}
diff --git a/libs/libaom/src/common/y4minput.h b/libs/libaom/src/common/y4minput.h
new file mode 100644
index 000000000..f6c5a3d3a
--- /dev/null
+++ b/libs/libaom/src/common/y4minput.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ * Based on code from the OggTheora software codec source code,
+ * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
+ */
+
+#ifndef AOM_COMMON_Y4MINPUT_H_
+#define AOM_COMMON_Y4MINPUT_H_
+
+#include <stdio.h>
+#include "aom/aom_image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct y4m_input y4m_input;
+
+/*The function used to perform chroma conversion.*/
+typedef void (*y4m_convert_func)(y4m_input *_y4m, unsigned char *_dst,
+                                 unsigned char *_src);
+
+struct y4m_input {
+  int pic_w;
+  int pic_h;
+  int fps_n;
+  int fps_d;
+  int par_n;
+  int par_d;
+  char interlace;
+  int src_c_dec_h;
+  int src_c_dec_v;
+  int dst_c_dec_h;
+  int dst_c_dec_v;
+  char chroma_type[16];
+  /*The size of each converted frame buffer.*/
+  size_t dst_buf_sz;
+  /*The amount to read directly into the converted frame buffer.*/
+  size_t dst_buf_read_sz;
+  /*The size of the auxiliary buffer.*/
+  size_t aux_buf_sz;
+  /*The amount to read into the auxiliary buffer.*/
+  size_t aux_buf_read_sz;
+  y4m_convert_func convert;
+  unsigned char *dst_buf;
+  unsigned char *aux_buf;
+  enum aom_img_fmt aom_fmt;
+  int bps;
+  unsigned int bit_depth;
+};
+
+/**
+ * Open the input file, treating it as Y4M. y4m_input is filled in after
+ * reading it. Note that chroma-sample-position should only be set for 420
+ * input, and the input chroma is shifted if necessary. The code does not
+ * support the conversion from co-located to vertical.
+ */ +int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, + aom_chroma_sample_position_t csp, int only_420); +void y4m_input_close(y4m_input *_y4m); +int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, aom_image_t *img); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_Y4MINPUT_H_ diff --git a/libs/libaom/src/docs.cmake b/libs/libaom/src/docs.cmake new file mode 100644 index 000000000..28ca5c026 --- /dev/null +++ b/libs/libaom/src/docs.cmake @@ -0,0 +1,257 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_DOCS_CMAKE_) + return() +endif() # AOM_DOCS_CMAKE_ +set(AOM_DOCS_CMAKE_ 1) + +cmake_minimum_required(VERSION 3.5) + +set(AOM_DOXYFILE "${AOM_CONFIG_DIR}/doxyfile") +set(AOM_DOXYGEN_CONFIG_TEMPLATE "libs.doxy_template") +set(AOM_DOXYGEN_OUTPUT_DIR "${AOM_CONFIG_DIR}/dox") +set(AOM_DOXYGEN_SECTIONS "av1") + +set(AOM_DOXYGEN_SOURCES "${AOM_ROOT}/aom/aom.h" "${AOM_ROOT}/aom/aom_codec.h" + "${AOM_ROOT}/aom/aom_decoder.h" + "${AOM_ROOT}/aom/aom_encoder.h" + "${AOM_ROOT}/aom/aom_frame_buffer.h" + "${AOM_ROOT}/aom/aom_image.h" + "${AOM_ROOT}/aom/aom_integer.h" + "${AOM_ROOT}/keywords.dox" "${AOM_ROOT}/mainpage.dox" + "${AOM_ROOT}/usage.dox") + +if(CONFIG_AV1_DECODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/apps/aomdec.c" + "${AOM_ROOT}/examples/decode_to_md5.c" + "${AOM_ROOT}/examples/decode_with_drops.c" + "${AOM_ROOT}/examples/simple_decoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Full featured decoder." + "Frame by frame MD5 checksum." + "Drops frames while decoding." + "Simplified decoder loop.") + + set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_decoder decoder") + + set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomdx.h" + "${AOM_ROOT}/usage_dx.dox") + + if(CONFIG_ANALYZER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/analyzer.cc") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Bitstream analyzer.") + endif() + + if(CONFIG_INSPECTION) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/inspect.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Bitstream inspector.") + endif() +endif() + +if(CONFIG_AV1_ENCODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/apps/aomenc.c" + "${AOM_ROOT}/examples/lossless_encoder.c" + "${AOM_ROOT}/examples/set_maps.c" + "${AOM_ROOT}/examples/simple_encoder.c" + "${AOM_ROOT}/examples/twopass_encoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Full featured encoder." + "Simplified lossless encoder." + "Set active and ROI maps." + "Simplified encoder loop." 
+ "Two-pass encoder loop.") + + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/scalable_encoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Scalable encoder loop.") + + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/svc_encoder_rtc.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Layered encoder for RTC.") + + set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_encoder encoder") + + set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomcx.h" + "${AOM_ROOT}/usage_cx.dox") +endif() + +if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/aom_cx_set_ref.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Set encoder reference frame.") +endif() + +if(CONFIG_AV1_ENCODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/lightfield_encoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Lightfield encoder example.") +endif() + +if(CONFIG_AV1_DECODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES + ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/lightfield_tile_list_decoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Lightfield tile list decoder example.") +endif() + +if(CONFIG_AV1_DECODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/lightfield_decoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Lightfield decoder example.") +endif() + +if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES + ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/lightfield_bitstream_parsing.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Lightfield bitstream parsing example.") +endif() + +# Iterates over list named by $list_name and appends each item to $AOM_DOXYFILE +# as values assigned to $var_name with no line breaks between list items. +# Appends a new line after the entire config variable is expanded. +function(write_cmake_list_to_doxygen_config_var var_name list_name) + unset(output_string) + foreach(list_item ${${list_name}}) + set(output_string "${output_string} ${list_item} ") + endforeach() + string(STRIP "${output_string}" output_string) + file(APPEND "${AOM_DOXYFILE}" "${var_name} += ${output_string}\n") +endfunction() + +function(get_name file_path name_var) + get_filename_component(file_basename ${file_path} NAME) + get_filename_component(${name_var} ${file_basename} NAME_WE) + set(${name_var} ${${name_var}} PARENT_SCOPE) +endfunction() + +function(setup_documentation_targets) + + # Sanity check: the lengths of these lists must match. + list(LENGTH AOM_DOXYGEN_EXAMPLE_SOURCES num_sources) + list(LENGTH AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS num_descs) + if(NOT ${num_sources} EQUAL ${num_descs}) + message(FATAL_ERROR "Unqeual example and description totals.") + endif() + + # Take the list of examples and produce example_basename.dox for each file in + # the list. 
+ file(MAKE_DIRECTORY "${AOM_DOXYGEN_OUTPUT_DIR}") + foreach(example_file ${AOM_DOXYGEN_EXAMPLE_SOURCES}) + unset(example_basename) + get_name("${example_file}" "example_name") + set(example_dox "${AOM_DOXYGEN_OUTPUT_DIR}/${example_name}.dox") + set(dox_string "/*!\\page example_${example_name} ${example_name}\n") + set(dox_string "${dox_string} \\includelineno ${example_file}\n*/\n") + file(WRITE "${example_dox}" ${dox_string}) + set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${example_dox}") + endforeach() + + # Generate samples.dox, an index page that refers to the example_basename.dox + # files that were just created. + set(samples_header " +/*!\\page samples Sample Code +This SDK includes a number of sample applications. Each sample documents a +feature of the SDK in both prose and the associated C code. The following +samples are included: +") + + set(utils_desc " +In addition, the SDK contains a number of utilities. Since these utilities are +built upon the concepts described in the sample code listed above, they are not +documented in pieces like the samples are. Their source is included here for +reference. The following utilities are included: +") + + # Write the description for the samples section. + set(samples_dox "${AOM_CONFIG_DIR}/samples.dox") + file(WRITE "${samples_dox}" "${samples_header}\n") + + # Iterate over $AOM_DOXYGEN_EXAMPLE_SOURCES and + # $AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS and massage example names as required by + # AV1's doxygen setup. + math(EXPR max_example_index "${num_sources} - 1") + foreach(NUM RANGE ${max_example_index}) + list(GET AOM_DOXYGEN_EXAMPLE_SOURCES ${NUM} ex_name) + get_name("${ex_name}" "ex_name") + + # AV1's doxygen lists aomdec and aomenc as utils apart from the examples. + # Save the indexes for another pass. + if("${ex_name}" MATCHES "aomdec\|aomenc") + set(util_indexes "${util_indexes}" "${NUM}") + continue() + endif() + list(GET AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${NUM} ex_desc) + file(APPEND "${samples_dox}" " - \\subpage example_${ex_name} ${ex_desc}\n") + endforeach() + + # Write the description and index for the utils. + file(APPEND "${samples_dox}" "${utils_desc}\n") + foreach(util_index ${util_indexes}) + list(GET AOM_DOXYGEN_EXAMPLE_SOURCES ${util_index} ex_name) + get_name("${ex_name}" "ex_name") + list(GET AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${util_index} ex_desc) + file(APPEND "${samples_dox}" " - \\subpage example_${ex_name} ${ex_desc}\n") + endforeach() + file(APPEND "${samples_dox}" "*/") + + # Add $samples_dox to the doxygen inputs. + get_filename_component(samples_dox ${samples_dox} NAME) + set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} ${samples_dox}) + + # Generate libaom's doxyfile. + file(WRITE "${AOM_DOXYFILE}" "##\n## GENERATED FILE. DO NOT EDIT\n##\n") + file(READ "${AOM_ROOT}/${AOM_DOXYGEN_CONFIG_TEMPLATE}" doxygen_template_data) + file(APPEND "${AOM_DOXYFILE}" ${doxygen_template_data}) + file(APPEND "${AOM_DOXYFILE}" + "EXAMPLE_PATH += ${AOM_ROOT} ${AOM_ROOT}/examples\n") + file(APPEND "${AOM_DOXYFILE}" + "INCLUDE_PATH += ${AOM_CONFIG_DIR} ${AOM_ROOT}\n") + file(APPEND "${AOM_DOXYFILE}" + "STRIP_FROM_PATH += ${AOM_ROOT} ${AOM_CONFIG_DIR}\n") + write_cmake_list_to_doxygen_config_var("INPUT" "AOM_DOXYGEN_SOURCES") + write_cmake_list_to_doxygen_config_var("ENABLED_SECTIONS" + "AOM_DOXYGEN_SECTIONS") + + # Add the doxygen generation rule. 
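+  # (Per its DEPENDS list, the "docs" target re-runs Doxygen whenever the
+  # generated doxyfile, the doxyfile template, or any listed source changes.)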
+  add_custom_target(docs ALL
+                    COMMAND "${DOXYGEN_EXECUTABLE}" "${AOM_DOXYFILE}"
+                    DEPENDS "${AOM_DOXYFILE}" ${AOM_DOXYGEN_SOURCES}
+                            ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+                            "${AOM_DOXYGEN_CONFIG_TEMPLATE}"
+                    SOURCES "${AOM_DOXYFILE}" ${AOM_DOXYGEN_SOURCES}
+                            ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+                            "${AOM_DOXYGEN_CONFIG_TEMPLATE}")
+endfunction()
diff --git a/libs/libaom/src/examples/analyzer.cc b/libs/libaom/src/examples/analyzer.cc
new file mode 100644
index 000000000..35988211e
--- /dev/null
+++ b/libs/libaom/src/examples/analyzer.cc
@@ -0,0 +1,723 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <wx/aboutdlg.h>
+#include <wx/cmdline.h>
+#include <wx/dcbuffer.h>
+#include <wx/wx.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/decoder/accounting.h"
+#include "av1/decoder/inspection.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+#define OD_SIGNMASK(a) (-((a) < 0))
+#define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b))
+#define OD_DIV_ROUND(x, y) (((x) + OD_FLIPSIGNI((y) >> 1, x)) / (y))
+
+enum {
+  OD_LUMA_MASK = 1 << 0,
+  OD_CB_MASK = 1 << 1,
+  OD_CR_MASK = 1 << 2,
+  OD_ALL_MASK = OD_LUMA_MASK | OD_CB_MASK | OD_CR_MASK
+};
+
+class AV1Decoder {
+ private:
+  FILE *input;
+  wxString path;
+
+  AvxVideoReader *reader;
+  const AvxVideoInfo *info;
+  const AvxInterface *decoder;
+
+  insp_frame_data frame_data;
+
+  aom_codec_ctx_t codec;
+  bool show_padding;
+
+ public:
+  aom_image_t *image;
+  int frame;
+
+  int plane_mask;
+
+  AV1Decoder();
+  ~AV1Decoder();
+
+  bool open(const wxString &path);
+  void close();
+  bool step();
+
+  int getWidthPadding() const;
+  int getHeightPadding() const;
+  void togglePadding();
+  int getWidth() const;
+  int getHeight() const;
+
+  bool getAccountingStruct(Accounting **acct);
+  bool setInspectionCallback();
+
+  static void inspect(void *decoder, void *data);
+};
+
+AV1Decoder::AV1Decoder()
+    : reader(NULL), info(NULL), decoder(NULL), show_padding(false), image(NULL),
+      frame(0) {}
+
+AV1Decoder::~AV1Decoder() {}
+
+void AV1Decoder::togglePadding() { show_padding = !show_padding; }
+
+bool AV1Decoder::open(const wxString &path) {
+  reader = aom_video_reader_open(path.mb_str());
+  if (!reader) {
+    fprintf(stderr, "Failed to open %s for reading.", path.mb_str().data());
+    return false;
+  }
+  this->path = path;
+  info = aom_video_reader_get_info(reader);
+  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  if (!decoder) {
+    fprintf(stderr, "Unknown input codec.");
+    return false;
+  }
+  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) {
+    fprintf(stderr, "Failed to initialize decoder.");
+    return false;
+  }
+  ifd_init(&frame_data, info->frame_width, info->frame_height);
+  setInspectionCallback();
+  return true;
+}
+
+void AV1Decoder::close() {}
+
+bool AV1Decoder::step() {
+  if (aom_video_reader_read_frame(reader)) {
+    size_t frame_size;
+    const unsigned char *frame_data;
+    frame_data = aom_video_reader_get_frame(reader, &frame_size);
+    if (aom_codec_decode(&codec, frame_data, frame_size, NULL)) {
+      fprintf(stderr, "Failed to decode frame.");
+      return false;
+    } else {
+      aom_codec_iter_t iter = NULL;
+      image = aom_codec_get_frame(&codec, &iter);
+      if (image != NULL) {
+        frame++;
+        return true;
+      }
+      return false;
+    }
+  }
+  return false;
+}
+
+int AV1Decoder::getWidth() const {
+  return info->frame_width + 2 * getWidthPadding();
+}
+
+int AV1Decoder::getWidthPadding() const {
+  return show_padding ? AOMMAX(info->frame_width + 16,
+                               ALIGN_POWER_OF_TWO(info->frame_width, 6)) -
+                            info->frame_width
+                      : 0;
+}
+
+int AV1Decoder::getHeight() const {
+  return info->frame_height + 2 * getHeightPadding();
+}
+
+int AV1Decoder::getHeightPadding() const {
+  return show_padding ? AOMMAX(info->frame_height + 16,
+                               ALIGN_POWER_OF_TWO(info->frame_height, 6)) -
+                            info->frame_height
+                      : 0;
+}
+
+bool AV1Decoder::getAccountingStruct(Accounting **accounting) {
+  return aom_codec_control(&codec, AV1_GET_ACCOUNTING, accounting) ==
+         AOM_CODEC_OK;
+}
+
+bool AV1Decoder::setInspectionCallback() {
+  aom_inspect_init ii;
+  ii.inspect_cb = AV1Decoder::inspect;
+  ii.inspect_ctx = (void *)this;
+  return aom_codec_control(&codec, AV1_SET_INSPECTION_CALLBACK, &ii) ==
+         AOM_CODEC_OK;
+}
+
+void AV1Decoder::inspect(void *pbi, void *data) {
+  AV1Decoder *decoder = (AV1Decoder *)data;
+  ifd_inspect(&decoder->frame_data, pbi, 0);
+}
+
+#define MIN_ZOOM (1)
+#define MAX_ZOOM (4)
+
+class AnalyzerPanel : public wxPanel {
+  DECLARE_EVENT_TABLE()
+
+ private:
+  AV1Decoder decoder;
+  const wxString path;
+
+  int zoom;
+  unsigned char *pixels;
+
+  const bool bit_accounting;
+  double *bpp_q3;
+
+  int plane_mask;
+
+  // The display size is the decode size, scaled by the zoom.
+  int getDisplayWidth() const;
+  int getDisplayHeight() const;
+
+  bool updateDisplaySize();
+
+  void computeBitsPerPixel();
+
+ public:
+  AnalyzerPanel(wxWindow *parent, const wxString &path,
+                const bool bit_accounting);
+  ~AnalyzerPanel();
+
+  bool open(const wxString &path);
+  void close();
+  void render();
+  void togglePadding();
+  bool nextFrame();
+  void refresh();
+
+  int getZoom() const;
+  bool setZoom(int zoom);
+
+  void setShowPlane(bool show_plane, int mask);
+
+  void onPaint(wxPaintEvent &event);  // NOLINT
+};
+
+BEGIN_EVENT_TABLE(AnalyzerPanel, wxPanel)
+EVT_PAINT(AnalyzerPanel::onPaint)
+END_EVENT_TABLE()
+
+AnalyzerPanel::AnalyzerPanel(wxWindow *parent, const wxString &path,
+                             const bool bit_accounting)
+    : wxPanel(parent), path(path), zoom(0), pixels(NULL),
+      bit_accounting(bit_accounting), bpp_q3(NULL), plane_mask(OD_ALL_MASK) {}
+
+AnalyzerPanel::~AnalyzerPanel() { close(); }
+
+void AnalyzerPanel::setShowPlane(bool show_plane, int mask) {
+  if (show_plane) {
+    plane_mask |= mask;
+  } else {
+    plane_mask &= ~mask;
+  }
+}
+
+void AnalyzerPanel::render() {
+  aom_image_t *img = decoder.image;
+  const int hbd = !!(img->fmt & AOM_IMG_FMT_HIGHBITDEPTH);
+  int y_stride = img->stride[0] >> hbd;
+  int cb_stride = img->stride[1] >> hbd;
+  int cr_stride = img->stride[2] >> hbd;
+  int p_stride = 3 * getDisplayWidth();
+  unsigned char *y_row = img->planes[0];
+  unsigned char *cb_row = img->planes[1];
+  unsigned char *cr_row = img->planes[2];
+  uint16_t *y_row16 = reinterpret_cast<uint16_t *>(y_row);
+  uint16_t *cb_row16 = reinterpret_cast<uint16_t *>(cb_row);
+  uint16_t *cr_row16 = reinterpret_cast<uint16_t *>(cr_row);
+  unsigned char *p_row = pixels;
+  int y_width_padding = decoder.getWidthPadding();
+  int cb_width_padding = y_width_padding >> 1;
+  int cr_width_padding = y_width_padding >> 1;
+  int y_height_padding =
decoder.getHeightPadding(); + int cb_height_padding = y_height_padding >> 1; + int cr_height_padding = y_height_padding >> 1; + for (int j = 0; j < decoder.getHeight(); j++) { + unsigned char *y = y_row - y_stride * y_height_padding; + unsigned char *cb = cb_row - cb_stride * cb_height_padding; + unsigned char *cr = cr_row - cr_stride * cr_height_padding; + uint16_t *y16 = y_row16 - y_stride * y_height_padding; + uint16_t *cb16 = cb_row16 - cb_stride * cb_height_padding; + uint16_t *cr16 = cr_row16 - cr_stride * cr_height_padding; + unsigned char *p = p_row; + for (int i = 0; i < decoder.getWidth(); i++) { + int64_t yval; + int64_t cbval; + int64_t crval; + int pmask; + unsigned rval; + unsigned gval; + unsigned bval; + if (hbd) { + yval = *(y16 - y_width_padding); + cbval = *(cb16 - cb_width_padding); + crval = *(cr16 - cr_width_padding); + } else { + yval = *(y - y_width_padding); + cbval = *(cb - cb_width_padding); + crval = *(cr - cr_width_padding); + } + pmask = plane_mask; + if (pmask & OD_LUMA_MASK) { + yval -= 16; + } else { + yval = 128; + } + cbval = ((pmask & OD_CB_MASK) >> 1) * (cbval - 128); + crval = ((pmask & OD_CR_MASK) >> 2) * (crval - 128); + /*This is intentionally slow and very accurate.*/ + rval = OD_CLAMPI( + 0, + (int32_t)OD_DIV_ROUND( + 2916394880000LL * yval + 4490222169144LL * crval, 9745792000LL), + 65535); + gval = OD_CLAMPI(0, + (int32_t)OD_DIV_ROUND(2916394880000LL * yval - + 534117096223LL * cbval - + 1334761232047LL * crval, + 9745792000LL), + 65535); + bval = OD_CLAMPI( + 0, + (int32_t)OD_DIV_ROUND( + 2916394880000LL * yval + 5290866304968LL * cbval, 9745792000LL), + 65535); + unsigned char *px_row = p; + for (int v = 0; v < zoom; v++) { + unsigned char *px = px_row; + for (int u = 0; u < zoom; u++) { + *(px + 0) = (unsigned char)(rval >> 8); + *(px + 1) = (unsigned char)(gval >> 8); + *(px + 2) = (unsigned char)(bval >> 8); + px += 3; + } + px_row += p_stride; + } + if (hbd) { + int dc = ((y16 - y_row16) & 1) | (1 - img->x_chroma_shift); + y16++; + cb16 += dc; + cr16 += dc; + } else { + int dc = ((y - y_row) & 1) | (1 - img->x_chroma_shift); + y++; + cb += dc; + cr += dc; + } + p += zoom * 3; + } + int dc = -((j & 1) | (1 - img->y_chroma_shift)); + if (hbd) { + y_row16 += y_stride; + cb_row16 += dc & cb_stride; + cr_row16 += dc & cr_stride; + } else { + y_row += y_stride; + cb_row += dc & cb_stride; + cr_row += dc & cr_stride; + } + p_row += zoom * p_stride; + } +} + +void AnalyzerPanel::computeBitsPerPixel() { + Accounting *acct; + double bpp_total; + int totals_q3[MAX_SYMBOL_TYPES] = { 0 }; + int sym_count[MAX_SYMBOL_TYPES] = { 0 }; + decoder.getAccountingStruct(&acct); + for (int j = 0; j < decoder.getHeight(); j++) { + for (int i = 0; i < decoder.getWidth(); i++) { + bpp_q3[j * decoder.getWidth() + i] = 0.0; + } + } + bpp_total = 0; + for (int i = 0; i < acct->syms.num_syms; i++) { + AccountingSymbol *s; + s = &acct->syms.syms[i]; + totals_q3[s->id] += s->bits; + sym_count[s->id] += s->samples; + } + printf("=== Frame: %-3i ===\n", decoder.frame - 1); + for (int i = 0; i < acct->syms.dictionary.num_strs; i++) { + if (totals_q3[i]) { + printf("%30s = %10.3f (%f bit/symbol)\n", acct->syms.dictionary.strs[i], + (float)totals_q3[i] / 8, (float)totals_q3[i] / 8 / sym_count[i]); + } + } + printf("\n"); +} + +void AnalyzerPanel::togglePadding() { + decoder.togglePadding(); + updateDisplaySize(); +} + +bool AnalyzerPanel::nextFrame() { + if (decoder.step()) { + refresh(); + return true; + } + return false; +} + +void AnalyzerPanel::refresh() { + if 
(bit_accounting) { + computeBitsPerPixel(); + } + render(); +} + +int AnalyzerPanel::getDisplayWidth() const { return zoom * decoder.getWidth(); } + +int AnalyzerPanel::getDisplayHeight() const { + return zoom * decoder.getHeight(); +} + +bool AnalyzerPanel::updateDisplaySize() { + unsigned char *p = (unsigned char *)malloc( + sizeof(*p) * 3 * getDisplayWidth() * getDisplayHeight()); + if (p == NULL) { + return false; + } + free(pixels); + pixels = p; + SetSize(getDisplayWidth(), getDisplayHeight()); + return true; +} + +bool AnalyzerPanel::open(const wxString &path) { + if (!decoder.open(path)) { + return false; + } + if (!setZoom(MIN_ZOOM)) { + return false; + } + if (bit_accounting) { + bpp_q3 = (double *)malloc(sizeof(*bpp_q3) * decoder.getWidth() * + decoder.getHeight()); + if (bpp_q3 == NULL) { + fprintf(stderr, "Could not allocate memory for bit accounting\n"); + close(); + return false; + } + } + if (!nextFrame()) { + close(); + return false; + } + SetFocus(); + return true; +} + +void AnalyzerPanel::close() { + decoder.close(); + free(pixels); + pixels = NULL; + free(bpp_q3); + bpp_q3 = NULL; +} + +int AnalyzerPanel::getZoom() const { return zoom; } + +bool AnalyzerPanel::setZoom(int z) { + if (z <= MAX_ZOOM && z >= MIN_ZOOM && zoom != z) { + int old_zoom = zoom; + zoom = z; + if (!updateDisplaySize()) { + zoom = old_zoom; + return false; + } + return true; + } + return false; +} + +void AnalyzerPanel::onPaint(wxPaintEvent &) { + wxBitmap bmp(wxImage(getDisplayWidth(), getDisplayHeight(), pixels, true)); + wxBufferedPaintDC dc(this, bmp); +} + +class AnalyzerFrame : public wxFrame { + DECLARE_EVENT_TABLE() + + private: + AnalyzerPanel *panel; + const bool bit_accounting; + + wxMenu *fileMenu; + wxMenu *viewMenu; + wxMenu *playbackMenu; + + public: + AnalyzerFrame(const bool bit_accounting); // NOLINT + + void onOpen(wxCommandEvent &event); // NOLINT + void onClose(wxCommandEvent &event); // NOLINT + void onQuit(wxCommandEvent &event); // NOLINT + + void onTogglePadding(wxCommandEvent &event); // NOLINT + void onZoomIn(wxCommandEvent &event); // NOLINT + void onZoomOut(wxCommandEvent &event); // NOLINT + void onActualSize(wxCommandEvent &event); // NOLINT + + void onToggleViewMenuCheckBox(wxCommandEvent &event); // NOLINT + void onResetAndToggleViewMenuCheckBox(wxCommandEvent &event); // NOLINT + + void onNextFrame(wxCommandEvent &event); // NOLINT + void onGotoFrame(wxCommandEvent &event); // NOLINT + void onRestart(wxCommandEvent &event); // NOLINT + + void onAbout(wxCommandEvent &event); // NOLINT + + bool open(const wxString &path); + bool setZoom(int zoom); + void updateViewMenu(); +}; + +enum { + wxID_NEXT_FRAME = 6000, + wxID_SHOW_Y, + wxID_SHOW_U, + wxID_SHOW_V, + wxID_GOTO_FRAME, + wxID_RESTART, + wxID_ACTUAL_SIZE, + wxID_PADDING +}; + +BEGIN_EVENT_TABLE(AnalyzerFrame, wxFrame) +EVT_MENU(wxID_OPEN, AnalyzerFrame::onOpen) +EVT_MENU(wxID_CLOSE, AnalyzerFrame::onClose) +EVT_MENU(wxID_EXIT, AnalyzerFrame::onQuit) +EVT_MENU(wxID_PADDING, AnalyzerFrame::onTogglePadding) +EVT_MENU(wxID_ZOOM_IN, AnalyzerFrame::onZoomIn) +EVT_MENU(wxID_ZOOM_OUT, AnalyzerFrame::onZoomOut) +EVT_MENU(wxID_ACTUAL_SIZE, AnalyzerFrame::onActualSize) +EVT_MENU(wxID_SHOW_Y, AnalyzerFrame::onResetAndToggleViewMenuCheckBox) +EVT_MENU(wxID_SHOW_U, AnalyzerFrame::onResetAndToggleViewMenuCheckBox) +EVT_MENU(wxID_SHOW_V, AnalyzerFrame::onResetAndToggleViewMenuCheckBox) +EVT_MENU(wxID_NEXT_FRAME, AnalyzerFrame::onNextFrame) +EVT_MENU(wxID_GOTO_FRAME, AnalyzerFrame::onGotoFrame) +EVT_MENU(wxID_RESTART, 
AnalyzerFrame::onRestart) +EVT_MENU(wxID_ABOUT, AnalyzerFrame::onAbout) +END_EVENT_TABLE() + +AnalyzerFrame::AnalyzerFrame(const bool bit_accounting) + : wxFrame(NULL, wxID_ANY, _("AV1 Stream Analyzer"), wxDefaultPosition, + wxDefaultSize, wxDEFAULT_FRAME_STYLE), + panel(NULL), bit_accounting(bit_accounting) { + wxMenuBar *mb = new wxMenuBar(); + + fileMenu = new wxMenu(); + fileMenu->Append(wxID_OPEN, _("&Open...\tCtrl-O"), _("Open AV1 file")); + fileMenu->Append(wxID_CLOSE, _("&Close\tCtrl-W"), _("Close AV1 file")); + fileMenu->Enable(wxID_CLOSE, false); + fileMenu->Append(wxID_EXIT, _("E&xit\tCtrl-Q"), _("Quit this program")); + mb->Append(fileMenu, _("&File")); + + wxAcceleratorEntry entries[2]; + entries[0].Set(wxACCEL_CTRL, (int)'=', wxID_ZOOM_IN); + entries[1].Set(wxACCEL_CTRL | wxACCEL_SHIFT, (int)'-', wxID_ZOOM_OUT); + wxAcceleratorTable accel(2, entries); + this->SetAcceleratorTable(accel); + + viewMenu = new wxMenu(); + +viewMenu->Append(wxID_PADDING, _("Toggle padding\tCtrl-p"), + _("Show padding")); + viewMenu->Append(wxID_ZOOM_IN, _("Zoom-In\tCtrl-+"), _("Double image size")); + viewMenu->Append(wxID_ZOOM_OUT, _("Zoom-Out\tCtrl--"), _("Half image size")); + viewMenu->Append(wxID_ACTUAL_SIZE, _("Actual size\tCtrl-0"), + _("Actual size of the frame")); + viewMenu->AppendSeparator(); + viewMenu->AppendCheckItem(wxID_SHOW_Y, _("&Y plane\tCtrl-Y"), + _("Show Y plane")); + viewMenu->AppendCheckItem(wxID_SHOW_U, _("&U plane\tCtrl-U"), + _("Show U plane")); + viewMenu->AppendCheckItem(wxID_SHOW_V, _("&V plane\tCtrl-V"), + _("Show V plane")); + mb->Append(viewMenu, _("&View")); + + playbackMenu = new wxMenu(); + playbackMenu->Append(wxID_NEXT_FRAME, _("Next frame\tCtrl-."), + _("Go to next frame")); + /*playbackMenu->Append(wxID_RESTART, _("&Restart\tCtrl-R"), + _("Set video to frame 0")); + playbackMenu->Append(wxID_GOTO_FRAME, _("Jump to Frame\tCtrl-J"), + _("Go to frame number"));*/ + mb->Append(playbackMenu, _("&Playback")); + + wxMenu *helpMenu = new wxMenu(); + helpMenu->Append(wxID_ABOUT, _("&About...\tF1"), _("Show about dialog")); + mb->Append(helpMenu, _("&Help")); + + SetMenuBar(mb); + + CreateStatusBar(1); +} + +void AnalyzerFrame::onOpen(wxCommandEvent &WXUNUSED(event)) { + wxFileDialog openFileDialog(this, _("Open file"), wxEmptyString, + wxEmptyString, _("AV1 files (*.ivf)|*.ivf"), + wxFD_OPEN | wxFD_FILE_MUST_EXIST); + if (openFileDialog.ShowModal() != wxID_CANCEL) { + open(openFileDialog.GetPath()); + } +} + +void AnalyzerFrame::onClose(wxCommandEvent &WXUNUSED(event)) {} + +void AnalyzerFrame::onQuit(wxCommandEvent &WXUNUSED(event)) { Close(true); } + +void AnalyzerFrame::onTogglePadding(wxCommandEvent &WXUNUSED(event)) { + panel->togglePadding(); + SetClientSize(panel->GetSize()); + panel->render(); + panel->Refresh(); +} + +void AnalyzerFrame::onZoomIn(wxCommandEvent &WXUNUSED(event)) { + setZoom(panel->getZoom() + 1); +} + +void AnalyzerFrame::onZoomOut(wxCommandEvent &WXUNUSED(event)) { + setZoom(panel->getZoom() - 1); +} + +void AnalyzerFrame::onActualSize(wxCommandEvent &WXUNUSED(event)) { + setZoom(MIN_ZOOM); +} + +void AnalyzerFrame::onToggleViewMenuCheckBox(wxCommandEvent &event) { // NOLINT + GetMenuBar()->Check(event.GetId(), event.IsChecked()); + updateViewMenu(); +} + +void AnalyzerFrame::onResetAndToggleViewMenuCheckBox( + wxCommandEvent &event) { // NOLINT + int id = event.GetId(); + if (id != wxID_SHOW_Y && id != wxID_SHOW_U && id != wxID_SHOW_V) { + GetMenuBar()->Check(wxID_SHOW_Y, true); + GetMenuBar()->Check(wxID_SHOW_U, true); + 
GetMenuBar()->Check(wxID_SHOW_V, true); + } + onToggleViewMenuCheckBox(event); +} + +void AnalyzerFrame::onNextFrame(wxCommandEvent &WXUNUSED(event)) { + panel->nextFrame(); + panel->Refresh(false); +} + +void AnalyzerFrame::onGotoFrame(wxCommandEvent &WXUNUSED(event)) {} + +void AnalyzerFrame::onRestart(wxCommandEvent &WXUNUSED(event)) {} + +void AnalyzerFrame::onAbout(wxCommandEvent &WXUNUSED(event)) { + wxAboutDialogInfo info; + info.SetName(_("AV1 Bitstream Analyzer")); + info.SetVersion(_("0.1-beta")); + info.SetDescription( + _("This program implements a bitstream analyzer for AV1")); + info.SetCopyright( + wxT("(C) 2017 Alliance for Open Media ")); + wxAboutBox(info); +} + +bool AnalyzerFrame::open(const wxString &path) { + panel = new AnalyzerPanel(this, path, bit_accounting); + if (panel->open(path)) { + SetClientSize(panel->GetSize()); + return true; + } else { + delete panel; + return false; + } +} + +bool AnalyzerFrame::setZoom(int zoom) { + if (panel->setZoom(zoom)) { + GetMenuBar()->Enable(wxID_ACTUAL_SIZE, zoom != MIN_ZOOM); + GetMenuBar()->Enable(wxID_ZOOM_IN, zoom != MAX_ZOOM); + GetMenuBar()->Enable(wxID_ZOOM_OUT, zoom != MIN_ZOOM); + SetClientSize(panel->GetSize()); + panel->render(); + panel->Refresh(); + return true; + } + return false; +} + +void AnalyzerFrame::updateViewMenu() { + panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_Y), OD_LUMA_MASK); + panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_U), OD_CB_MASK); + panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_V), OD_CR_MASK); + SetClientSize(panel->GetSize()); + panel->render(); + panel->Refresh(false); +} + +class Analyzer : public wxApp { + private: + AnalyzerFrame *frame; + + public: + void OnInitCmdLine(wxCmdLineParser &parser); // NOLINT + bool OnCmdLineParsed(wxCmdLineParser &parser); // NOLINT +}; + +static const wxCmdLineEntryDesc CMD_LINE_DESC[] = { + { wxCMD_LINE_SWITCH, _("h"), _("help"), _("Display this help and exit."), + wxCMD_LINE_VAL_NONE, wxCMD_LINE_OPTION_HELP }, + { wxCMD_LINE_SWITCH, _("a"), _("bit-accounting"), _("Enable bit accounting"), + wxCMD_LINE_VAL_NONE, wxCMD_LINE_PARAM_OPTIONAL }, + { wxCMD_LINE_PARAM, NULL, NULL, _("input.ivf"), wxCMD_LINE_VAL_STRING, + wxCMD_LINE_PARAM_OPTIONAL }, + { wxCMD_LINE_NONE } +}; + +void Analyzer::OnInitCmdLine(wxCmdLineParser &parser) { // NOLINT + parser.SetDesc(CMD_LINE_DESC); + parser.SetSwitchChars(_("-")); +} + +bool Analyzer::OnCmdLineParsed(wxCmdLineParser &parser) { // NOLINT + bool bit_accounting = parser.Found(_("a")); + if (bit_accounting && !CONFIG_ACCOUNTING) { + fprintf(stderr, + "Bit accounting support not found. " + "Recompile with:\n./cmake -DCONFIG_ACCOUNTING=1\n"); + return false; + } + frame = new AnalyzerFrame(parser.Found(_("a"))); + frame->Show(); + if (parser.GetParamCount() > 0) { + return frame->open(parser.GetParam(0)); + } + return true; +} + +void usage_exit(void) { + fprintf(stderr, "uhh\n"); + exit(EXIT_FAILURE); +} + +IMPLEMENT_APP(Analyzer) diff --git a/libs/libaom/src/examples/aom_cx_set_ref.c b/libs/libaom/src/examples/aom_cx_set_ref.c new file mode 100644 index 000000000..2f4f6586f --- /dev/null +++ b/libs/libaom/src/examples/aom_cx_set_ref.c @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// AV1 Set Reference Frame +// ============================ +// +// This is an example demonstrating how to overwrite the AV1 encoder's +// internal reference frame. In the sample we set the last frame to the +// current frame. This technique could be used to bounce between two cameras. +// +// The decoder would also have to set the reference frame to the same value +// on the same frame, or the video will become corrupt. The 'test_decode' +// variable is set to 1 in this example that tests if the encoder and decoder +// results are matching. +// +// Usage +// ----- +// This example encodes a raw video. And the last argument passed in specifies +// the frame number to update the reference frame on. For example, run +// examples/aom_cx_set_ref av1 352 288 in.yuv out.ivf 4 30 +// The parameter is parsed as follows: +// +// +// Extra Variables +// --------------- +// This example maintains the frame number passed on the command line +// in the `update_frame_num` variable. +// +// +// Configuration +// ------------- +// +// The reference frame is updated on the frame specified on the command +// line. +// +// Observing The Effects +// --------------------- +// The encoder and decoder results should be matching when the same reference +// frame setting operation is done in both encoder and decoder. Otherwise, +// the encoder/decoder mismatch would be seen. + +#include +#include +#include + +#include "aom/aom_decoder.h" +#include "aom/aom_encoder.h" +#include "aom/aomcx.h" +#include "aom_scale/yv12config.h" +#include "common/tools_common.h" +#include "common/video_writer.h" +#include "examples/encoder_util.h" + +static const char *exec_name; + +void usage_exit() { + fprintf(stderr, + "Usage: %s " + " \n", + exec_name); + exit(EXIT_FAILURE); +} + +static void testing_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder, + unsigned int frame_out, int *mismatch_seen) { + aom_image_t enc_img, dec_img; + + if (*mismatch_seen) return; + + /* Get the internal reference frame */ + if (aom_codec_control(encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img)) + die_codec(encoder, "Failed to get encoder reference frame"); + if (aom_codec_control(decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img)) + die_codec(decoder, "Failed to get decoder reference frame"); + + if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) != + (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) { + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t enc_hbd_img; + aom_img_alloc(&enc_hbd_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, + enc_img.d_w, enc_img.d_h, 16); + aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img); + enc_img = enc_hbd_img; + } + if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t dec_hbd_img; + aom_img_alloc(&dec_hbd_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, + dec_img.d_w, dec_img.d_h, 16); + aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img); + dec_img = dec_hbd_img; + } + } + + if (!aom_compare_img(&enc_img, &dec_img)) { + int y[4], u[4], v[4]; + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_find_mismatch_high(&enc_img, &dec_img, y, u, v); + } else { + aom_find_mismatch(&enc_img, &dec_img, y, u, v); + } + + printf( + "Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, 
%d] {%d/%d}," + " V[%d, %d] {%d/%d}", + frame_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1], + v[2], v[3]); + *mismatch_seen = 1; + } + + aom_img_free(&enc_img); + aom_img_free(&dec_img); +} + +static int encode_frame(aom_codec_ctx_t *ecodec, aom_image_t *img, + unsigned int frame_in, AvxVideoWriter *writer, + int test_decode, aom_codec_ctx_t *dcodec, + unsigned int *frame_out, int *mismatch_seen, + aom_image_t *ext_ref) { + int got_pkts = 0; + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt = NULL; + int got_data; + const aom_codec_err_t res = aom_codec_encode(ecodec, img, frame_in, 1, 0); + if (res != AOM_CODEC_OK) die_codec(ecodec, "Failed to encode frame"); + + got_data = 0; + + while ((pkt = aom_codec_get_cx_data(ecodec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; + + ++*frame_out; + + if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) { + die_codec(ecodec, "Failed to write compressed frame"); + } + printf(keyframe ? "K" : "."); + fflush(stdout); + got_data = 1; + + // Decode 1 frame. + if (test_decode) { + if (aom_codec_decode(dcodec, pkt->data.frame.buf, + (unsigned int)pkt->data.frame.sz, NULL)) + die_codec(dcodec, "Failed to decode frame."); + + // Copy out first decoded frame, and use it as reference later. + if (*frame_out == 1 && ext_ref != NULL) + if (aom_codec_control(dcodec, AV1_COPY_NEW_FRAME_IMAGE, ext_ref)) + die_codec(dcodec, "Failed to get decoder new frame"); + } + } + } + + // Mismatch checking + if (got_data && test_decode) { + testing_decode(ecodec, dcodec, *frame_out, mismatch_seen); + } + + return got_pkts; +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + // Encoder + aom_codec_ctx_t ecodec; + aom_codec_enc_cfg_t cfg; + unsigned int frame_in = 0; + aom_image_t raw; + aom_image_t raw_shift; + aom_image_t ext_ref; + aom_codec_err_t res; + AvxVideoInfo info; + AvxVideoWriter *writer = NULL; + const AvxInterface *encoder = NULL; + int flags = 0; + int allocated_raw_shift = 0; + aom_img_fmt_t raw_fmt = AOM_IMG_FMT_I420; + aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420; + + // Test encoder/decoder mismatch. + int test_decode = 1; + // Decoder + aom_codec_ctx_t dcodec; + unsigned int frame_out = 0; + + // The frame number to set reference frame on + unsigned int update_frame_num = 0; + int mismatch_seen = 0; + + const int fps = 30; + const int bitrate = 500; + + const char *codec_arg = NULL; + const char *width_arg = NULL; + const char *height_arg = NULL; + const char *infile_arg = NULL; + const char *outfile_arg = NULL; + const char *update_frame_num_arg = NULL; + unsigned int limit = 0; + exec_name = argv[0]; + + // Clear explicitly, as simply assigning "{ 0 }" generates + // "missing-field-initializers" warning in some compilers. 
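+  // For instance, with -Wmissing-field-initializers enabled a compiler may
+  // flag
+  //   aom_codec_enc_cfg_t cfg = { 0 };
+  // while the memset() calls below zero the structs without any warning.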
+  memset(&ecodec, 0, sizeof(ecodec));
+  memset(&cfg, 0, sizeof(cfg));
+  memset(&info, 0, sizeof(info));
+
+  if (argc < 7) die("Invalid number of arguments");
+
+  codec_arg = argv[1];
+  width_arg = argv[2];
+  height_arg = argv[3];
+  infile_arg = argv[4];
+  outfile_arg = argv[5];
+  update_frame_num_arg = argv[6];
+
+  encoder = get_aom_encoder_by_name(codec_arg);
+  if (!encoder) die("Unsupported codec.");
+
+  update_frame_num = (unsigned int)strtoul(update_frame_num_arg, NULL, 0);
+  // In AV1, the reference buffers (cm->buffer_pool->frame_bufs[i].buf) are
+  // allocated while calling aom_codec_encode(), thus, setting the reference
+  // for the 1st frame isn't supported.
+  if (update_frame_num <= 1) {
+    die("Couldn't parse frame number '%s'\n", update_frame_num_arg);
+  }
+
+  if (argc > 7) {
+    limit = (unsigned int)strtoul(argv[7], NULL, 0);
+    if (update_frame_num > limit)
+      die("Update frame number couldn't be larger than limit\n");
+  }
+
+  info.codec_fourcc = encoder->fourcc;
+  info.frame_width = (int)strtol(width_arg, NULL, 0);
+  info.frame_height = (int)strtol(height_arg, NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  if (info.frame_width <= 0 || info.frame_height <= 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  // In this test, the bit depth of input video is 8-bit, and the input format
+  // is AOM_IMG_FMT_I420.
+  if (!aom_img_alloc(&raw, raw_fmt, info.frame_width, info.frame_height, 32)) {
+    die("Failed to allocate image.");
+  }
+
+  if (FORCE_HIGHBITDEPTH_DECODING) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  // Allocate memory with the border so that it can be used as a reference.
+  if (!aom_img_alloc_with_border(&ext_ref, ref_fmt, info.frame_width,
+                                 info.frame_height, 32, 8,
+                                 AOM_BORDER_IN_PIXELS)) {
+    die("Failed to allocate image.");
+  }
+
+  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+
+  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res) die_codec(&ecodec, "Failed to get default codec config.");
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+  cfg.rc_target_bitrate = bitrate;
+  cfg.g_lag_in_frames = 3;
+  cfg.g_bit_depth = AOM_BITS_8;
+
+  flags |= (cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING)
+               ? AOM_CODEC_USE_HIGHBITDEPTH
+               : 0;
+
+  writer = aom_video_writer_open(outfile_arg, kContainerIVF, &info);
+  if (!writer) die("Failed to open %s for writing.", outfile_arg);
+
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading.", infile_arg);
+
+  if (aom_codec_enc_init(&ecodec, encoder->codec_interface(), &cfg, flags))
+    die_codec(&ecodec, "Failed to initialize encoder");
+
+  // Disable alt_ref.
+  if (aom_codec_control(&ecodec, AOME_SET_ENABLEAUTOALTREF, 0))
+    die_codec(&ecodec, "Failed to set enable auto alt ref");
+
+  if (test_decode) {
+    const AvxInterface *decoder = get_aom_decoder_by_name(codec_arg);
+    if (aom_codec_dec_init(&dcodec, decoder->codec_interface(), NULL, 0))
+      die_codec(&dcodec, "Failed to initialize decoder.");
+  }
+
+  // Encode frames.
+  while (aom_img_read(&raw, infile)) {
+    if (limit && frame_in >= limit) break;
+    aom_image_t *frame_to_encode;
+
+    if (FORCE_HIGHBITDEPTH_DECODING) {
+      // Need to allocate a larger buffer to use the hbd internal path.
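+      // aom_img_upshift() below copies the 8-bit samples into the 16-bit
+      // buffer; a shift of 0 preserves the sample values unscaled.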
+      int input_shift = 0;
+      if (!allocated_raw_shift) {
+        aom_img_alloc(&raw_shift, raw_fmt | AOM_IMG_FMT_HIGHBITDEPTH,
+                      info.frame_width, info.frame_height, 32);
+        allocated_raw_shift = 1;
+      }
+      aom_img_upshift(&raw_shift, &raw, input_shift);
+      frame_to_encode = &raw_shift;
+    } else {
+      frame_to_encode = &raw;
+    }
+
+    if (update_frame_num > 1 && frame_out + 1 == update_frame_num) {
+      av1_ref_frame_t ref;
+      ref.idx = 0;
+      ref.use_external_ref = 0;
+      ref.img = ext_ref;
+      // Set reference frame in encoder.
+      if (aom_codec_control(&ecodec, AV1_SET_REFERENCE, &ref))
+        die_codec(&ecodec, "Failed to set encoder reference frame");
+      printf(" <SET_REF>");
+
+      // If set_reference in decoder is commented out, the enc/dec mismatch
+      // would be seen.
+      if (test_decode) {
+        ref.use_external_ref = 1;
+        if (aom_codec_control(&dcodec, AV1_SET_REFERENCE, &ref))
+          die_codec(&dcodec, "Failed to set decoder reference frame");
+      }
+    }
+
+    encode_frame(&ecodec, frame_to_encode, frame_in, writer, test_decode,
+                 &dcodec, &frame_out, &mismatch_seen, &ext_ref);
+    frame_in++;
+    if (mismatch_seen) break;
+  }
+
+  // Flush encoder.
+  if (!mismatch_seen)
+    while (encode_frame(&ecodec, NULL, frame_in, writer, test_decode, &dcodec,
+                        &frame_out, &mismatch_seen, NULL)) {
+    }
+
+  printf("\n");
+  fclose(infile);
+  printf("Processed %d frames.\n", frame_out);
+
+  if (test_decode) {
+    if (!mismatch_seen)
+      printf("Encoder/decoder results are matching.\n");
+    else
+      printf("Encoder/decoder results are NOT matching.\n");
+  }
+
+  if (test_decode)
+    if (aom_codec_destroy(&dcodec))
+      die_codec(&dcodec, "Failed to destroy decoder");
+
+  if (allocated_raw_shift) aom_img_free(&raw_shift);
+  aom_img_free(&ext_ref);
+  aom_img_free(&raw);
+  if (aom_codec_destroy(&ecodec))
+    die_codec(&ecodec, "Failed to destroy encoder.");
+
+  aom_video_writer_close(writer);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libaom/src/examples/av1_dec_fuzzer.cc b/libs/libaom/src/examples/av1_dec_fuzzer.cc
new file mode 100644
index 000000000..1cddc8cc1
--- /dev/null
+++ b/libs/libaom/src/examples/av1_dec_fuzzer.cc
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*
+ * See build_av1_dec_fuzzer.sh for building instructions.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <memory>
+#include "config/aom_config.h"
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "aom_ports/mem_ops.h"
+
+#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */
+#define IVF_FILE_HDR_SZ 32
+
+extern "C" void usage_exit(void) { exit(EXIT_FAILURE); }
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  if (size <= IVF_FILE_HDR_SZ) {
+    return 0;
+  }
+
+  const aom_codec_iface_t *codec_interface = aom_codec_av1_dx();
+  aom_codec_ctx_t codec;
+  // Set thread count in the range [1, 64].
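+  // Masking with 0x3f keeps the byte's low six bits (0..63), so the fuzzed
+  // input below deterministically selects a thread count between 1 and 64.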
+ const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1; + aom_codec_dec_cfg_t cfg = { threads, 0, 0, !FORCE_HIGHBITDEPTH_DECODING }; + if (aom_codec_dec_init(&codec, codec_interface, &cfg, 0)) { + return 0; + } + + data += IVF_FILE_HDR_SZ; + size -= IVF_FILE_HDR_SZ; + + while (size > IVF_FRAME_HDR_SZ) { + size_t frame_size = mem_get_le32(data); + size -= IVF_FRAME_HDR_SZ; + data += IVF_FRAME_HDR_SZ; + frame_size = std::min(size, frame_size); + + const aom_codec_err_t err = + aom_codec_decode(&codec, data, frame_size, nullptr); + static_cast(err); + aom_codec_iter_t iter = nullptr; + aom_image_t *img = nullptr; + while ((img = aom_codec_get_frame(&codec, &iter)) != nullptr) { + } + data += frame_size; + size -= frame_size; + } + aom_codec_destroy(&codec); + return 0; +} diff --git a/libs/libaom/src/examples/build_av1_dec_fuzzer.sh b/libs/libaom/src/examples/build_av1_dec_fuzzer.sh new file mode 100644 index 000000000..0dcb254da --- /dev/null +++ b/libs/libaom/src/examples/build_av1_dec_fuzzer.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# +# Copyright (c) 2019, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and +# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +# was not distributed with this source code in the LICENSE file, you can +# obtain it at www.aomedia.org/license/software. If the Alliance for Open +# Media Patent License 1.0 was not distributed with this source code in the +# PATENTS file, you can obtain it at www.aomedia.org/license/patent. +# +############################################################################### +# Fuzzer for libaom decoder. +# ========================== +# Requirements +# --------------------- +# Clang6.0 or above (must support -fsanitize=fuzzer -fsanitize=fuzzer-no-link) +# +# References: +# --------------------- +# http://llvm.org/docs/LibFuzzer.html +# https://github.com/google/oss-fuzz +# +# Steps to build / run +# --------------------- + +set -eu + +# Have a copy of AOM and a build directory ready. +if [[ $# -ne 2 ]]; then + echo "Pass in the AOM source tree as first argument, and a build directory " + echo "as the second argument. The AOM source tree can be obtained via: " + echo " git clone https://aomedia.googlesource.com/aom" + exit 2 +fi +if [[ -z "$CC" ]]; then + echo "Set the CC environment variable to point to your C compiler." + exit 2 +fi +if [[ -z "$CXX" ]]; then + echo "Set the CXX environment variable to point to your C++ compiler." + exit 2 +fi + +AOM_DIR=$1 +BUILD_DIR=$2 +# Run CMake with address sanitizer enabled and build the codec. +# Enable DO_RANGE_CHECK_CLAMP to suppress the noise of integer overflows +# in the transform functions. Also set memory limits. +EXTRA_C_FLAGS='-DDO_RANGE_CHECK_CLAMP=1 -DAOM_MAX_ALLOCABLE_MEMORY=1073741824' +cd "${BUILD_DIR}" +cmake "${AOM_DIR}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCONFIG_PIC=1 \ + -DCONFIG_SCALABILITY=0 -DFORCE_HIGHBITDEPTH_DECODING=0 \ + -DCONFIG_AV1_ENCODER=0 -DENABLE_EXAMPLES=0 -DENABLE_DOCS=0 -DENABLE_TESTS=0 \ + -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=12288 -DDECODE_WIDTH_LIMIT=12288 \ + -DAOM_EXTRA_C_FLAGS="${EXTRA_C_FLAGS}" \ + -DAOM_EXTRA_CXX_FLAGS="${EXTRA_C_FLAGS}" -DSANITIZE=fuzzer-no-link,address + +# Build the codec. 
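+# (Note: -fsanitize=fuzzer-no-link instruments the library for libFuzzer
+# without linking the fuzzing engine's main(); only the final fuzzer binary
+# built below links it via -fsanitize=fuzzer.)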
+make -j$(nproc)
+
+# Build the av1 fuzzer
+$CXX -std=c++11 -DDECODER=av1 -I${AOM_DIR} -I${BUILD_DIR} \
+  -fsanitize=fuzzer,address -Wl,--start-group \
+  ${AOM_DIR}/examples/av1_dec_fuzzer.cc -o ${BUILD_DIR}/av1_dec_fuzzer \
+  ${BUILD_DIR}/libaom.a -Wl,--end-group
+
+echo "Fuzzer built at ${BUILD_DIR}/av1_dec_fuzzer."
+echo "Create a corpus directory, copy IVF files in there, and run:"
+echo "  av1_dec_fuzzer CORPUS_DIR"
diff --git a/libs/libaom/src/examples/decode_to_md5.c b/libs/libaom/src/examples/decode_to_md5.c
new file mode 100644
index 000000000..bc127b78d
--- /dev/null
+++ b/libs/libaom/src/examples/decode_to_md5.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Frame-by-frame MD5 Checksum
+// ===========================
+//
+// This example builds upon the simple decoder loop to show how checksums
+// of the decoded output can be generated. These are used for validating
+// decoder implementations against the reference implementation, for example.
+//
+// MD5 algorithm
+// -------------
+// The Message-Digest 5 (MD5) is a well known hash function. We have provided
+// an implementation derived from the RSA Data Security, Inc. MD5
+// Message-Digest Algorithm for your use. Our implementation only changes the
+// interface of this reference code. You must include the `md5_utils.h` header
+// for access to these functions.
+//
+// Processing The Decoded Data
+// ---------------------------
+// Each row of the image is passed to the MD5 accumulator. First the Y plane
+// is processed, then U, then V. It is important to honor the image's `stride`
+// values.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "common/md5_utils.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+static void get_image_md5(const aom_image_t *img, unsigned char digest[16]) {
+  int plane, y;
+  MD5Context md5;
+
+  MD5Init(&md5);
+
+  for (plane = 0; plane < 3; ++plane) {
+    const unsigned char *buf = img->planes[plane];
+    const int stride = img->stride[plane];
+    const int w = plane ? (img->d_w + 1) >> 1 : img->d_w;
+    const int h = plane ?
(img->d_h + 1) >> 1 : img->d_h; + + for (y = 0; y < h; ++y) { + MD5Update(&md5, buf, w); + buf += stride; + } + } + + MD5Final(digest, &md5); +} + +static void print_md5(FILE *stream, unsigned char digest[16]) { + int i; + + for (i = 0; i < 16; ++i) fprintf(stream, "%02x", digest[i]); +} + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", exec_name); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + int frame_cnt = 0; + FILE *outfile = NULL; + aom_codec_ctx_t codec; + AvxVideoReader *reader = NULL; + const AvxVideoInfo *info = NULL; + const AvxInterface *decoder = NULL; + + exec_name = argv[0]; + + if (argc != 3) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + info = aom_video_reader_get_info(reader); + + decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + + printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface())); + + if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder"); + + while (aom_video_reader_read_frame(reader)) { + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + size_t frame_size = 0; + const unsigned char *frame = + aom_video_reader_get_frame(reader, &frame_size); + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame"); + + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { + unsigned char digest[16]; + + get_image_md5(img, digest); + print_md5(outfile, digest); + fprintf(outfile, " img-%dx%d-%04d.i420\n", img->d_w, img->d_h, + ++frame_cnt); + } + } + + printf("Processed %d frames.\n", frame_cnt); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + aom_video_reader_close(reader); + + fclose(outfile); + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/decode_with_drops.c b/libs/libaom/src/examples/decode_with_drops.c new file mode 100644 index 000000000..214401958 --- /dev/null +++ b/libs/libaom/src/examples/decode_with_drops.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Decode With Drops Example +// ========================= +// +// This is an example utility which drops a series of frames, as specified +// on the command line. This is useful for observing the error recovery +// features of the codec. +// +// Usage +// ----- +// This example adds a single argument to the `simple_decoder` example, +// which specifies the range or pattern of frames to drop. The parameter is +// parsed as follows: +// +// Dropping A Range Of Frames +// -------------------------- +// To drop a range of frames, specify the starting frame and the ending +// frame to drop, separated by a dash. The following command will drop +// frames 5 through 10 (base 1). 
+// +// $ ./decode_with_drops in.ivf out.i420 5-10 +// +// +// Dropping A Pattern Of Frames +// ---------------------------- +// To drop a pattern of frames, specify the number of frames to drop and +// the number of frames after which to repeat the pattern, separated by +// a forward-slash. The following command will drop 3 of 7 frames. +// Specifically, it will decode 4 frames, then drop 3 frames, and then +// repeat. +// +// $ ./decode_with_drops in.ivf out.i420 3/7 +// +// +// Extra Variables +// --------------- +// This example maintains the pattern passed on the command line in the +// `n`, `m`, and `is_range` variables: +// +// +// Making The Drop Decision +// ------------------------ +// The example decides whether to drop the frame based on the current +// frame number, immediately before decoding the frame. + +#include +#include +#include + +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "common/tools_common.h" +#include "common/video_reader.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", exec_name); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + int frame_cnt = 0; + FILE *outfile = NULL; + aom_codec_ctx_t codec; + const AvxInterface *decoder = NULL; + AvxVideoReader *reader = NULL; + const AvxVideoInfo *info = NULL; + int n = 0; + int m = 0; + int is_range = 0; + char *nptr = NULL; + + exec_name = argv[0]; + + if (argc != 4) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + n = (int)strtol(argv[3], &nptr, 0); + m = (int)strtol(nptr + 1, NULL, 0); + is_range = (*nptr == '-'); + if (!n || !m || (*nptr != '-' && *nptr != '/')) + die("Couldn't parse pattern %s.\n", argv[3]); + + info = aom_video_reader_get_info(reader); + + decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + + printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface())); + + if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder."); + + while (aom_video_reader_read_frame(reader)) { + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + size_t frame_size = 0; + int skip; + const unsigned char *frame = + aom_video_reader_get_frame(reader, &frame_size); + ++frame_cnt; + + skip = (is_range && frame_cnt >= n && frame_cnt <= m) || + (!is_range && m - (frame_cnt - 1) % m <= n); + + if (!skip) { + putc('.', stdout); + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame."); + + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) + aom_img_write(img, outfile); + } else { + putc('X', stdout); + } + + fflush(stdout); + } + + printf("Processed %d frames.\n", frame_cnt); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n", + info->frame_width, info->frame_height, argv[2]); + + aom_video_reader_close(reader); + fclose(outfile); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/encoder_util.c b/libs/libaom/src/examples/encoder_util.c new file mode 100644 index 000000000..e43b37250 --- /dev/null +++ b/libs/libaom/src/examples/encoder_util.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Utility functions used by encoder binaries. + +#include "examples/encoder_util.h" + +#include +#include + +#include "aom/aom_integer.h" + +#define mmin(a, b) ((a) < (b) ? (a) : (b)) + +static void find_mismatch_plane(const aom_image_t *const img1, + const aom_image_t *const img2, int plane, + int use_highbitdepth, int loc[4]) { + const unsigned char *const p1 = img1->planes[plane]; + const int p1_stride = img1->stride[plane] >> use_highbitdepth; + const unsigned char *const p2 = img2->planes[plane]; + const int p2_stride = img2->stride[plane] >> use_highbitdepth; + const uint32_t bsize = 64; + const int is_y_plane = (plane == AOM_PLANE_Y); + const uint32_t bsizex = is_y_plane ? bsize : bsize >> img1->x_chroma_shift; + const uint32_t bsizey = is_y_plane ? bsize : bsize >> img1->y_chroma_shift; + const uint32_t c_w = + is_y_plane ? img1->d_w + : (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + const uint32_t c_h = + is_y_plane ? img1->d_h + : (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; + assert(img1->d_w == img2->d_w && img1->d_h == img2->d_h); + assert(img1->x_chroma_shift == img2->x_chroma_shift && + img1->y_chroma_shift == img2->y_chroma_shift); + loc[0] = loc[1] = loc[2] = loc[3] = -1; + if (img1->monochrome && img2->monochrome && plane) return; + int match = 1; + uint32_t i, j; + for (i = 0; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + const int si = + is_y_plane ? mmin(i + bsizey, c_h) - i : mmin(i + bsizey, c_h - i); + const int sj = + is_y_plane ? mmin(j + bsizex, c_w) - j : mmin(j + bsizex, c_w - j); + int k, l; + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + const int row = i + k; + const int col = j + l; + const int offset1 = row * p1_stride + col; + const int offset2 = row * p2_stride + col; + const int val1 = use_highbitdepth + ? p1[2 * offset1] | (p1[2 * offset1 + 1] << 8) + : p1[offset1]; + const int val2 = use_highbitdepth + ? 
p2[2 * offset2] | (p2[2 * offset2 + 1] << 8) + : p2[offset2]; + if (val1 != val2) { + loc[0] = row; + loc[1] = col; + loc[2] = val1; + loc[3] = val2; + match = 0; + break; + } + } + } + } + } +} + +static void find_mismatch_helper(const aom_image_t *const img1, + const aom_image_t *const img2, + int use_highbitdepth, int yloc[4], int uloc[4], + int vloc[4]) { + find_mismatch_plane(img1, img2, AOM_PLANE_Y, use_highbitdepth, yloc); + find_mismatch_plane(img1, img2, AOM_PLANE_U, use_highbitdepth, uloc); + find_mismatch_plane(img1, img2, AOM_PLANE_V, use_highbitdepth, vloc); +} + +void aom_find_mismatch_high(const aom_image_t *const img1, + const aom_image_t *const img2, int yloc[4], + int uloc[4], int vloc[4]) { + find_mismatch_helper(img1, img2, 1, yloc, uloc, vloc); +} + +void aom_find_mismatch(const aom_image_t *const img1, + const aom_image_t *const img2, int yloc[4], int uloc[4], + int vloc[4]) { + find_mismatch_helper(img1, img2, 0, yloc, uloc, vloc); +} + +int aom_compare_img(const aom_image_t *const img1, + const aom_image_t *const img2) { + assert(img1->cp == img2->cp); + assert(img1->tc == img2->tc); + assert(img1->mc == img2->mc); + assert(img1->monochrome == img2->monochrome); + + int num_planes = img1->monochrome ? 1 : 3; + + uint32_t l_w = img1->d_w; + uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + const uint32_t c_h = + (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; + int match = 1; + + match &= (img1->fmt == img2->fmt); + match &= (img1->d_w == img2->d_w); + match &= (img1->d_h == img2->d_h); + if (img1->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + l_w *= 2; + c_w *= 2; + } + + for (int plane = 0; plane < num_planes; ++plane) { + uint32_t height = plane ? c_h : img1->d_h; + uint32_t width = plane ? c_w : l_w; + + for (uint32_t i = 0; i < height; ++i) { + match &= + (memcmp(img1->planes[plane] + i * img1->stride[plane], + img2->planes[plane] + i * img2->stride[plane], width) == 0); + } + } + + return match; +} diff --git a/libs/libaom/src/examples/encoder_util.h b/libs/libaom/src/examples/encoder_util.h new file mode 100644 index 000000000..a6bb3fb48 --- /dev/null +++ b/libs/libaom/src/examples/encoder_util.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Utility functions used by encoder binaries. + +#ifndef AOM_EXAMPLES_ENCODER_UTIL_H_ +#define AOM_EXAMPLES_ENCODER_UTIL_H_ + +#include "aom/aom_image.h" + +// Returns mismatch location (?loc[0],?loc[1]) and the values at that location +// in img1 (?loc[2]) and img2 (?loc[3]). +void aom_find_mismatch_high(const aom_image_t *const img1, + const aom_image_t *const img2, int yloc[4], + int uloc[4], int vloc[4]); + +void aom_find_mismatch(const aom_image_t *const img1, + const aom_image_t *const img2, int yloc[4], int uloc[4], + int vloc[4]); + +// Returns 1 if the two images match. 
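As a usage sketch (illustrative only; check_frames is a hypothetical name, not part of this patch), the mismatch helpers above and the comparison routine declared just below might be combined in an encoder test like this:

#include <stdio.h>

#include "aom/aom_image.h"
#include "examples/encoder_util.h"

// Hypothetical test helper: returns 1 on a bit-exact match; otherwise
// reports the first differing Y-plane sample (yloc[0] stays -1 when the
// mismatch is confined to a chroma plane).
static int check_frames(const aom_image_t *enc, const aom_image_t *dec) {
  if (aom_compare_img(enc, dec)) return 1;
  int yloc[4], uloc[4], vloc[4];
  aom_find_mismatch(enc, dec, yloc, uloc, vloc);
  // Each loc[] holds { row, col, value in img1, value in img2 }.
  fprintf(stderr, "Y mismatch at (%d,%d): %d vs %d\n", yloc[0], yloc[1],
          yloc[2], yloc[3]);
  return 0;
}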
+int aom_compare_img(const aom_image_t *const img1, + const aom_image_t *const img2); + +#endif // AOM_EXAMPLES_ENCODER_UTIL_H_ diff --git a/libs/libaom/src/examples/inspect.c b/libs/libaom/src/examples/inspect.c new file mode 100644 index 000000000..526bdc16c --- /dev/null +++ b/libs/libaom/src/examples/inspect.c @@ -0,0 +1,958 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Inspect Decoder +// ================ +// +// This is a simple decoder loop that writes JSON stats to stdout. This tool +// can also be compiled with Emscripten and used as a library. + +#include +#include +#include + +#ifdef __EMSCRIPTEN__ +#include +#else +#define EMSCRIPTEN_KEEPALIVE +#endif + +#include "config/aom_config.h" + +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "av1/common/av1_common_int.h" + +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#endif + +#include "av1/decoder/inspection.h" +#include "common/args.h" +#include "common/tools_common.h" +#include "common/video_common.h" +#include "common/video_reader.h" + +// Max JSON buffer size. +const int MAX_BUFFER = 1024 * 1024 * 256; + +typedef enum { + ACCOUNTING_LAYER = 1, + BLOCK_SIZE_LAYER = 1 << 1, + TRANSFORM_SIZE_LAYER = 1 << 2, + TRANSFORM_TYPE_LAYER = 1 << 3, + MODE_LAYER = 1 << 4, + SKIP_LAYER = 1 << 5, + FILTER_LAYER = 1 << 6, + CDEF_LAYER = 1 << 7, + REFERENCE_FRAME_LAYER = 1 << 8, + MOTION_VECTORS_LAYER = 1 << 9, + UV_MODE_LAYER = 1 << 10, + CFL_LAYER = 1 << 11, + DUAL_FILTER_LAYER = 1 << 12, + Q_INDEX_LAYER = 1 << 13, + SEGMENT_ID_LAYER = 1 << 14, + MOTION_MODE_LAYER = 1 << 15, + COMPOUND_TYPE_LAYER = 1 << 16, + INTRABC_LAYER = 1 << 17, + PALETTE_LAYER = 1 << 18, + UV_PALETTE_LAYER = 1 << 19, + ALL_LAYERS = (1 << 20) - 1 +} LayerType; + +static LayerType layers = 0; + +static int stop_after = 0; +static int compress = 0; + +static const arg_def_t limit_arg = + ARG_DEF(NULL, "limit", 1, "Stop decoding after n frames"); +static const arg_def_t dump_all_arg = ARG_DEF("A", "all", 0, "Dump All"); +static const arg_def_t compress_arg = + ARG_DEF("x", "compress", 0, "Compress JSON using RLE"); +static const arg_def_t dump_accounting_arg = + ARG_DEF("a", "accounting", 0, "Dump Accounting"); +static const arg_def_t dump_block_size_arg = + ARG_DEF("bs", "blockSize", 0, "Dump Block Size"); +static const arg_def_t dump_motion_vectors_arg = + ARG_DEF("mv", "motionVectors", 0, "Dump Motion Vectors"); +static const arg_def_t dump_transform_size_arg = + ARG_DEF("ts", "transformSize", 0, "Dump Transform Size"); +static const arg_def_t dump_transform_type_arg = + ARG_DEF("tt", "transformType", 0, "Dump Transform Type"); +static const arg_def_t dump_mode_arg = ARG_DEF("m", "mode", 0, "Dump Mode"); +static const arg_def_t dump_motion_mode_arg = + ARG_DEF("mm", "motion_mode", 0, "Dump Motion Modes"); +static const arg_def_t dump_compound_type_arg = + ARG_DEF("ct", "compound_type", 0, "Dump Compound Types"); +static const arg_def_t dump_uv_mode_arg = + ARG_DEF("uvm", "uv_mode", 0, "Dump UV Intra Prediction Modes"); +static const arg_def_t 
dump_skip_arg = ARG_DEF("s", "skip", 0, "Dump Skip"); +static const arg_def_t dump_filter_arg = + ARG_DEF("f", "filter", 0, "Dump Filter"); +static const arg_def_t dump_cdef_arg = ARG_DEF("c", "cdef", 0, "Dump CDEF"); +static const arg_def_t dump_cfl_arg = + ARG_DEF("cfl", "chroma_from_luma", 0, "Dump Chroma from Luma Alphas"); +static const arg_def_t dump_dual_filter_type_arg = + ARG_DEF("df", "dualFilterType", 0, "Dump Dual Filter Type"); +static const arg_def_t dump_reference_frame_arg = + ARG_DEF("r", "referenceFrame", 0, "Dump Reference Frame"); +static const arg_def_t dump_delta_q_arg = + ARG_DEF("dq", "delta_q", 0, "Dump QIndex"); +static const arg_def_t dump_seg_id_arg = + ARG_DEF("si", "seg_id", 0, "Dump Segment ID"); +static const arg_def_t dump_intrabc_arg = + ARG_DEF("ibc", "intrabc", 0, "Dump If IntraBC Is Used"); +static const arg_def_t dump_palette_arg = + ARG_DEF("plt", "palette", 0, "Dump Palette Size"); +static const arg_def_t dump_uv_palette_arg = + ARG_DEF("uvp", "uv_palette", 0, "Dump UV Palette Size"); +static const arg_def_t usage_arg = ARG_DEF("h", "help", 0, "Help"); +static const arg_def_t skip_non_transform_arg = ARG_DEF( + "snt", "skip_non_transform", 1, "Skip is counted as a non transform."); +static const arg_def_t combined_arg = + ARG_DEF("comb", "combined", 1, "combinining parameters into one output."); + +int combined_parm_list[15]; +int combined_parm_count = 0; + +static const arg_def_t *main_args[] = { &limit_arg, + &dump_all_arg, + &compress_arg, +#if CONFIG_ACCOUNTING + &dump_accounting_arg, +#endif + &dump_block_size_arg, + &dump_transform_size_arg, + &dump_transform_type_arg, + &dump_mode_arg, + &dump_uv_mode_arg, + &dump_motion_mode_arg, + &dump_compound_type_arg, + &dump_skip_arg, + &dump_filter_arg, + &dump_cdef_arg, + &dump_dual_filter_type_arg, + &dump_cfl_arg, + &dump_reference_frame_arg, + &dump_motion_vectors_arg, + &dump_delta_q_arg, + &dump_seg_id_arg, + &dump_intrabc_arg, + &dump_palette_arg, + &dump_uv_palette_arg, + &usage_arg, + &skip_non_transform_arg, + &combined_arg, + NULL }; +#define ENUM(name) \ + { #name, name } +#define LAST_ENUM \ + { NULL, 0 } +typedef struct map_entry { + const char *name; + int value; +} map_entry; + +const map_entry refs_map[] = { + ENUM(INTRA_FRAME), ENUM(LAST_FRAME), ENUM(LAST2_FRAME), + ENUM(LAST3_FRAME), ENUM(GOLDEN_FRAME), ENUM(BWDREF_FRAME), + ENUM(ALTREF2_FRAME), ENUM(ALTREF_FRAME), LAST_ENUM +}; + +const map_entry block_size_map[] = { + ENUM(BLOCK_4X4), ENUM(BLOCK_4X8), ENUM(BLOCK_8X4), + ENUM(BLOCK_8X8), ENUM(BLOCK_8X16), ENUM(BLOCK_16X8), + ENUM(BLOCK_16X16), ENUM(BLOCK_16X32), ENUM(BLOCK_32X16), + ENUM(BLOCK_32X32), ENUM(BLOCK_32X64), ENUM(BLOCK_64X32), + ENUM(BLOCK_64X64), ENUM(BLOCK_64X128), ENUM(BLOCK_128X64), + ENUM(BLOCK_128X128), ENUM(BLOCK_4X16), ENUM(BLOCK_16X4), + ENUM(BLOCK_8X32), ENUM(BLOCK_32X8), ENUM(BLOCK_16X64), + ENUM(BLOCK_64X16), LAST_ENUM +}; + +#define TX_SKIP -1 + +const map_entry tx_size_map[] = { + ENUM(TX_4X4), ENUM(TX_8X8), ENUM(TX_16X16), ENUM(TX_32X32), + ENUM(TX_64X64), ENUM(TX_4X8), ENUM(TX_8X4), ENUM(TX_8X16), + ENUM(TX_16X8), ENUM(TX_16X32), ENUM(TX_32X16), ENUM(TX_32X64), + ENUM(TX_64X32), ENUM(TX_4X16), ENUM(TX_16X4), ENUM(TX_8X32), + ENUM(TX_32X8), ENUM(TX_16X64), ENUM(TX_64X16), LAST_ENUM +}; + +const map_entry tx_type_map[] = { ENUM(DCT_DCT), + ENUM(ADST_DCT), + ENUM(DCT_ADST), + ENUM(ADST_ADST), + ENUM(FLIPADST_DCT), + ENUM(DCT_FLIPADST), + ENUM(FLIPADST_FLIPADST), + ENUM(ADST_FLIPADST), + ENUM(FLIPADST_ADST), + ENUM(IDTX), + ENUM(V_DCT), + ENUM(H_DCT), + 
ENUM(V_ADST), + ENUM(H_ADST), + ENUM(V_FLIPADST), + ENUM(H_FLIPADST), + LAST_ENUM }; +const map_entry dual_filter_map[] = { ENUM(REG_REG), ENUM(REG_SMOOTH), + ENUM(REG_SHARP), ENUM(SMOOTH_REG), + ENUM(SMOOTH_SMOOTH), ENUM(SMOOTH_SHARP), + ENUM(SHARP_REG), ENUM(SHARP_SMOOTH), + ENUM(SHARP_SHARP), LAST_ENUM }; + +const map_entry prediction_mode_map[] = { + ENUM(DC_PRED), ENUM(V_PRED), ENUM(H_PRED), + ENUM(D45_PRED), ENUM(D135_PRED), ENUM(D113_PRED), + ENUM(D157_PRED), ENUM(D203_PRED), ENUM(D67_PRED), + ENUM(SMOOTH_PRED), ENUM(SMOOTH_V_PRED), ENUM(SMOOTH_H_PRED), + ENUM(PAETH_PRED), ENUM(NEARESTMV), ENUM(NEARMV), + ENUM(GLOBALMV), ENUM(NEWMV), ENUM(NEAREST_NEARESTMV), + ENUM(NEAR_NEARMV), ENUM(NEAREST_NEWMV), ENUM(NEW_NEARESTMV), + ENUM(NEAR_NEWMV), ENUM(NEW_NEARMV), ENUM(GLOBAL_GLOBALMV), + ENUM(NEW_NEWMV), ENUM(INTRA_INVALID), LAST_ENUM +}; + +const map_entry motion_mode_map[] = { ENUM(SIMPLE_TRANSLATION), + ENUM(OBMC_CAUSAL), // 2-sided OBMC + ENUM(WARPED_CAUSAL), // 2-sided WARPED + LAST_ENUM }; + +const map_entry compound_type_map[] = { ENUM(COMPOUND_AVERAGE), + ENUM(COMPOUND_WEDGE), + ENUM(COMPOUND_DIFFWTD), LAST_ENUM }; + +const map_entry uv_prediction_mode_map[] = { + ENUM(UV_DC_PRED), ENUM(UV_V_PRED), + ENUM(UV_H_PRED), ENUM(UV_D45_PRED), + ENUM(UV_D135_PRED), ENUM(UV_D113_PRED), + ENUM(UV_D157_PRED), ENUM(UV_D203_PRED), + ENUM(UV_D67_PRED), ENUM(UV_SMOOTH_PRED), + ENUM(UV_SMOOTH_V_PRED), ENUM(UV_SMOOTH_H_PRED), + ENUM(UV_PAETH_PRED), ENUM(UV_CFL_PRED), + ENUM(UV_MODE_INVALID), LAST_ENUM +}; +#define NO_SKIP 0 +#define SKIP 1 + +const map_entry skip_map[] = { ENUM(SKIP), ENUM(NO_SKIP), LAST_ENUM }; + +const map_entry intrabc_map[] = { { "INTRABC", 1 }, + { "NO_INTRABC", 0 }, + LAST_ENUM }; + +const map_entry palette_map[] = { + { "ZERO_COLORS", 0 }, { "TWO_COLORS", 2 }, { "THREE_COLORS", 3 }, + { "FOUR_COLORS", 4 }, { "FIVE_COLORS", 5 }, { "SIX_COLORS", 6 }, + { "SEVEN_COLORS", 7 }, { "EIGHT_COLORS", 8 }, LAST_ENUM +}; + +const map_entry config_map[] = { ENUM(MI_SIZE), LAST_ENUM }; + +static const char *exec_name; + +struct parm_offset { + char parm[60]; + char offset; +}; +struct parm_offset parm_offsets[] = { + { "blockSize", offsetof(insp_mi_data, sb_type) }, + { "transformSize", offsetof(insp_mi_data, tx_size) }, + { "transformType", offsetof(insp_mi_data, tx_type) }, + { "dualFilterType", offsetof(insp_mi_data, dual_filter_type) }, + { "mode", offsetof(insp_mi_data, mode) }, + { "uv_mode", offsetof(insp_mi_data, uv_mode) }, + { "motion_mode", offsetof(insp_mi_data, motion_mode) }, + { "compound_type", offsetof(insp_mi_data, compound_type) }, + { "referenceFrame", offsetof(insp_mi_data, ref_frame) }, + { "skip", offsetof(insp_mi_data, skip) }, +}; +int parm_count = sizeof(parm_offsets) / sizeof(parm_offsets[0]); + +int convert_to_indices(char *str, int *indices, int maxCount, int *count) { + *count = 0; + do { + char *comma = strchr(str, ','); + int length = (comma ? 
(int)(comma - str) : (int)strlen(str)); + int i; + for (i = 0; i < parm_count; ++i) { + if (!strncmp(str, parm_offsets[i].parm, length)) { + break; + } + } + if (i == parm_count) return 0; + indices[(*count)++] = i; + if (*count > maxCount) return 0; + str += length + 1; + } while (strlen(str) > 0); + return 1; +} + +insp_frame_data frame_data; +int frame_count = 0; +int decoded_frame_count = 0; +aom_codec_ctx_t codec; +AvxVideoReader *reader = NULL; +const AvxVideoInfo *info = NULL; +aom_image_t *img = NULL; + +void on_frame_decoded_dump(char *json) { +#ifdef __EMSCRIPTEN__ + EM_ASM_({ Module.on_frame_decoded_json($0); }, json); +#else + printf("%s", json); +#endif +} + +// Writing out the JSON buffer using snprintf is very slow, especially when +// compiled with emscripten, these functions speed things up quite a bit. +int put_str(char *buffer, const char *str) { + int i; + for (i = 0; str[i] != '\0'; i++) { + buffer[i] = str[i]; + } + return i; +} + +int put_str_with_escape(char *buffer, const char *str) { + int i; + int j = 0; + for (i = 0; str[i] != '\0'; i++) { + if (str[i] < ' ') { + continue; + } else if (str[i] == '"' || str[i] == '\\') { + buffer[j++] = '\\'; + } + buffer[j++] = str[i]; + } + return j; +} + +int put_num(char *buffer, char prefix, int num, char suffix) { + int i = 0; + char *buf = buffer; + int is_neg = 0; + if (prefix) { + buf[i++] = prefix; + } + if (num == 0) { + buf[i++] = '0'; + } else { + if (num < 0) { + num = -num; + is_neg = 1; + } + int s = i; + while (num != 0) { + buf[i++] = '0' + (num % 10); + num = num / 10; + } + if (is_neg) { + buf[i++] = '-'; + } + int e = i - 1; + while (s < e) { + int t = buf[s]; + buf[s] = buf[e]; + buf[e] = t; + s++; + e--; + } + } + if (suffix) { + buf[i++] = suffix; + } + return i; +} + +int put_map(char *buffer, const map_entry *map) { + char *buf = buffer; + const map_entry *entry = map; + while (entry->name != NULL) { + *(buf++) = '"'; + buf += put_str(buf, entry->name); + *(buf++) = '"'; + buf += put_num(buf, ':', entry->value, 0); + entry++; + if (entry->name != NULL) { + *(buf++) = ','; + } + } + return (int)(buf - buffer); +} + +int put_reference_frame(char *buffer) { + const int mi_rows = frame_data.mi_rows; + const int mi_cols = frame_data.mi_cols; + char *buf = buffer; + int r, c, t; + buf += put_str(buf, " \"referenceFrameMap\": {"); + buf += put_map(buf, refs_map); + buf += put_str(buf, "},\n"); + buf += put_str(buf, " \"referenceFrame\": ["); + for (r = 0; r < mi_rows; ++r) { + *(buf++) = '['; + for (c = 0; c < mi_cols; ++c) { + insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c]; + buf += put_num(buf, '[', mi->ref_frame[0], 0); + buf += put_num(buf, ',', mi->ref_frame[1], ']'); + if (compress) { // RLE + for (t = c + 1; t < mi_cols; ++t) { + insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t]; + if (mi->ref_frame[0] != next_mi->ref_frame[0] || + mi->ref_frame[1] != next_mi->ref_frame[1]) { + break; + } + } + if (t - c > 1) { + *(buf++) = ','; + buf += put_num(buf, '[', t - c - 1, ']'); + c = t - 1; + } + } + if (c < mi_cols - 1) *(buf++) = ','; + } + *(buf++) = ']'; + if (r < mi_rows - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + return (int)(buf - buffer); +} + +int put_motion_vectors(char *buffer) { + const int mi_rows = frame_data.mi_rows; + const int mi_cols = frame_data.mi_cols; + char *buf = buffer; + int r, c, t; + buf += put_str(buf, " \"motionVectors\": ["); + for (r = 0; r < mi_rows; ++r) { + *(buf++) = '['; + for (c = 0; c < mi_cols; ++c) { + insp_mi_data *mi = 
&frame_data.mi_grid[r * mi_cols + c]; + buf += put_num(buf, '[', mi->mv[0].col, 0); + buf += put_num(buf, ',', mi->mv[0].row, 0); + buf += put_num(buf, ',', mi->mv[1].col, 0); + buf += put_num(buf, ',', mi->mv[1].row, ']'); + if (compress) { // RLE + for (t = c + 1; t < mi_cols; ++t) { + insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t]; + if (mi->mv[0].col != next_mi->mv[0].col || + mi->mv[0].row != next_mi->mv[0].row || + mi->mv[1].col != next_mi->mv[1].col || + mi->mv[1].row != next_mi->mv[1].row) { + break; + } + } + if (t - c > 1) { + *(buf++) = ','; + buf += put_num(buf, '[', t - c - 1, ']'); + c = t - 1; + } + } + if (c < mi_cols - 1) *(buf++) = ','; + } + *(buf++) = ']'; + if (r < mi_rows - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + return (int)(buf - buffer); +} + +int put_combined(char *buffer) { + const int mi_rows = frame_data.mi_rows; + const int mi_cols = frame_data.mi_cols; + char *buf = buffer; + int r, c, p; + buf += put_str(buf, " \""); + for (p = 0; p < combined_parm_count; ++p) { + if (p) buf += put_str(buf, "&"); + buf += put_str(buf, parm_offsets[combined_parm_list[p]].parm); + } + buf += put_str(buf, "\": ["); + for (r = 0; r < mi_rows; ++r) { + *(buf++) = '['; + for (c = 0; c < mi_cols; ++c) { + insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c]; + *(buf++) = '['; + for (p = 0; p < combined_parm_count; ++p) { + if (p) *(buf++) = ','; + int16_t *v = (int16_t *)(((int8_t *)mi) + + parm_offsets[combined_parm_list[p]].offset); + buf += put_num(buf, 0, v[0], 0); + } + *(buf++) = ']'; + if (c < mi_cols - 1) *(buf++) = ','; + } + *(buf++) = ']'; + if (r < mi_rows - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + return (int)(buf - buffer); +} + +int put_block_info(char *buffer, const map_entry *map, const char *name, + size_t offset, int len) { + const int mi_rows = frame_data.mi_rows; + const int mi_cols = frame_data.mi_cols; + char *buf = buffer; + int r, c, t, i; + if (compress && len == 1) { + die("Can't encode scalars as arrays when RLE compression is enabled."); + return -1; + } + if (map) { + buf += snprintf(buf, MAX_BUFFER, " \"%sMap\": {", name); + buf += put_map(buf, map); + buf += put_str(buf, "},\n"); + } + buf += snprintf(buf, MAX_BUFFER, " \"%s\": [", name); + for (r = 0; r < mi_rows; ++r) { + *(buf++) = '['; + for (c = 0; c < mi_cols; ++c) { + insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c]; + int16_t *v = (int16_t *)(((int8_t *)mi) + offset); + if (len == 0) { + buf += put_num(buf, 0, v[0], 0); + } else { + buf += put_str(buf, "["); + for (i = 0; i < len; i++) { + buf += put_num(buf, 0, v[i], 0); + if (i < len - 1) { + buf += put_str(buf, ","); + } + } + buf += put_str(buf, "]"); + } + if (compress) { // RLE + for (t = c + 1; t < mi_cols; ++t) { + insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t]; + int16_t *nv = (int16_t *)(((int8_t *)next_mi) + offset); + int same = 0; + if (len == 0) { + same = v[0] == nv[0]; + } else { + for (i = 0; i < len; i++) { + same = v[i] == nv[i]; + if (!same) { + break; + } + } + } + if (!same) { + break; + } + } + if (t - c > 1) { + *(buf++) = ','; + buf += put_num(buf, '[', t - c - 1, ']'); + c = t - 1; + } + } + if (c < mi_cols - 1) *(buf++) = ','; + } + *(buf++) = ']'; + if (r < mi_rows - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + return (int)(buf - buffer); +} + +#if CONFIG_ACCOUNTING +int put_accounting(char *buffer) { + char *buf = buffer; + int i; + const Accounting *accounting = frame_data.accounting; + if (accounting == NULL) { + printf("XXX\n"); + 
return 0; + } + const int num_syms = accounting->syms.num_syms; + const int num_strs = accounting->syms.dictionary.num_strs; + buf += put_str(buf, " \"symbolsMap\": ["); + for (i = 0; i < num_strs; i++) { + buf += snprintf(buf, MAX_BUFFER, "\"%s\"", + accounting->syms.dictionary.strs[i]); + if (i < num_strs - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + buf += put_str(buf, " \"symbols\": [\n "); + AccountingSymbolContext context; + context.x = -2; + context.y = -2; + AccountingSymbol *sym; + for (i = 0; i < num_syms; i++) { + sym = &accounting->syms.syms[i]; + if (memcmp(&context, &sym->context, sizeof(AccountingSymbolContext)) != 0) { + buf += put_num(buf, '[', sym->context.x, 0); + buf += put_num(buf, ',', sym->context.y, ']'); + } else { + buf += put_num(buf, '[', sym->id, 0); + buf += put_num(buf, ',', sym->bits, 0); + buf += put_num(buf, ',', sym->samples, ']'); + } + context = sym->context; + if (i < num_syms - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + return (int)(buf - buffer); +} +#endif + +int skip_non_transform = 0; + +void inspect(void *pbi, void *data) { + /* Fetch frame data. */ + ifd_inspect(&frame_data, pbi, skip_non_transform); + + // Show existing frames just show a reference buffer we've already decoded. + // There's no information to show. + if (frame_data.show_existing_frame) return; + + (void)data; + // We allocate enough space and hope we don't write out of bounds. Totally + // unsafe but this speeds things up, especially when compiled to Javascript. + char *buffer = aom_malloc(MAX_BUFFER); + char *buf = buffer; + buf += put_str(buf, "{\n"); + if (layers & BLOCK_SIZE_LAYER) { + buf += put_block_info(buf, block_size_map, "blockSize", + offsetof(insp_mi_data, sb_type), 0); + } + if (layers & TRANSFORM_SIZE_LAYER) { + buf += put_block_info(buf, tx_size_map, "transformSize", + offsetof(insp_mi_data, tx_size), 0); + } + if (layers & TRANSFORM_TYPE_LAYER) { + buf += put_block_info(buf, tx_type_map, "transformType", + offsetof(insp_mi_data, tx_type), 0); + } + if (layers & DUAL_FILTER_LAYER) { + buf += put_block_info(buf, dual_filter_map, "dualFilterType", + offsetof(insp_mi_data, dual_filter_type), 0); + } + if (layers & MODE_LAYER) { + buf += put_block_info(buf, prediction_mode_map, "mode", + offsetof(insp_mi_data, mode), 0); + } + if (layers & UV_MODE_LAYER) { + buf += put_block_info(buf, uv_prediction_mode_map, "uv_mode", + offsetof(insp_mi_data, uv_mode), 0); + } + if (layers & MOTION_MODE_LAYER) { + buf += put_block_info(buf, motion_mode_map, "motion_mode", + offsetof(insp_mi_data, motion_mode), 0); + } + if (layers & COMPOUND_TYPE_LAYER) { + buf += put_block_info(buf, compound_type_map, "compound_type", + offsetof(insp_mi_data, compound_type), 0); + } + if (layers & SKIP_LAYER) { + buf += + put_block_info(buf, skip_map, "skip", offsetof(insp_mi_data, skip), 0); + } + if (layers & FILTER_LAYER) { + buf += + put_block_info(buf, NULL, "filter", offsetof(insp_mi_data, filter), 2); + } + if (layers & CDEF_LAYER) { + buf += put_block_info(buf, NULL, "cdef_level", + offsetof(insp_mi_data, cdef_level), 0); + buf += put_block_info(buf, NULL, "cdef_strength", + offsetof(insp_mi_data, cdef_strength), 0); + } + if (layers & CFL_LAYER) { + buf += put_block_info(buf, NULL, "cfl_alpha_idx", + offsetof(insp_mi_data, cfl_alpha_idx), 0); + buf += put_block_info(buf, NULL, "cfl_alpha_sign", + offsetof(insp_mi_data, cfl_alpha_sign), 0); + } + if (layers & Q_INDEX_LAYER) { + buf += put_block_info(buf, NULL, "delta_q", + offsetof(insp_mi_data, current_qindex), 0); 
+ } + if (layers & SEGMENT_ID_LAYER) { + buf += put_block_info(buf, NULL, "seg_id", + offsetof(insp_mi_data, segment_id), 0); + } + if (layers & MOTION_VECTORS_LAYER) { + buf += put_motion_vectors(buf); + } + if (layers & INTRABC_LAYER) { + buf += put_block_info(buf, intrabc_map, "intrabc", + offsetof(insp_mi_data, intrabc), 0); + } + if (layers & PALETTE_LAYER) { + buf += put_block_info(buf, palette_map, "palette", + offsetof(insp_mi_data, palette), 0); + } + if (layers & UV_PALETTE_LAYER) { + buf += put_block_info(buf, palette_map, "uv_palette", + offsetof(insp_mi_data, uv_palette), 0); + } + if (combined_parm_count > 0) buf += put_combined(buf); + if (layers & REFERENCE_FRAME_LAYER) { + buf += put_block_info(buf, refs_map, "referenceFrame", + offsetof(insp_mi_data, ref_frame), 2); + } +#if CONFIG_ACCOUNTING + if (layers & ACCOUNTING_LAYER) { + buf += put_accounting(buf); + } +#endif + buf += + snprintf(buf, MAX_BUFFER, " \"frame\": %d,\n", frame_data.frame_number); + buf += snprintf(buf, MAX_BUFFER, " \"showFrame\": %d,\n", + frame_data.show_frame); + buf += snprintf(buf, MAX_BUFFER, " \"frameType\": %d,\n", + frame_data.frame_type); + buf += snprintf(buf, MAX_BUFFER, " \"baseQIndex\": %d,\n", + frame_data.base_qindex); + buf += snprintf(buf, MAX_BUFFER, " \"tileCols\": %d,\n", + frame_data.tile_mi_cols); + buf += snprintf(buf, MAX_BUFFER, " \"tileRows\": %d,\n", + frame_data.tile_mi_rows); + buf += snprintf(buf, MAX_BUFFER, " \"deltaQPresentFlag\": %d,\n", + frame_data.delta_q_present_flag); + buf += snprintf(buf, MAX_BUFFER, " \"deltaQRes\": %d,\n", + frame_data.delta_q_res); + buf += put_str(buf, " \"config\": {"); + buf += put_map(buf, config_map); + buf += put_str(buf, "},\n"); + buf += put_str(buf, " \"configString\": \""); + buf += put_str_with_escape(buf, aom_codec_build_config()); + buf += put_str(buf, "\"\n"); + decoded_frame_count++; + buf += put_str(buf, "},\n"); + *(buf++) = 0; + on_frame_decoded_dump(buffer); + aom_free(buffer); +} + +void ifd_init_cb() { + aom_inspect_init ii; + ii.inspect_cb = inspect; + ii.inspect_ctx = NULL; + aom_codec_control(&codec, AV1_SET_INSPECTION_CALLBACK, &ii); +} + +EMSCRIPTEN_KEEPALIVE +int open_file(char *file) { + if (file == NULL) { + // The JS analyzer puts the .ivf file at this location. + file = "/tmp/input.ivf"; + } + reader = aom_video_reader_open(file); + if (!reader) die("Failed to open %s for reading.", file); + info = aom_video_reader_get_info(reader); + const AvxInterface *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + fprintf(stderr, "Using %s\n", + aom_codec_iface_name(decoder->codec_interface())); + if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder."); + ifd_init(&frame_data, info->frame_width, info->frame_height); + ifd_init_cb(); + return EXIT_SUCCESS; +} + +Av1DecodeReturn adr; +int have_frame = 0; +const unsigned char *frame; +const unsigned char *end_frame; +size_t frame_size = 0; + +EMSCRIPTEN_KEEPALIVE +int read_frame() { + img = NULL; + + // This loop skips over any frames that are show_existing_frames, as + // there is nothing to analyze. 
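The do/while loop that follows implements that skip. As a standalone sketch of the buffer-advance contract the loop relies on (illustrative only, assuming the AV1-specific convention used in this file: aom_codec_decode() fills an Av1DecodeReturn whose buf member points at the first byte it did not consume):

#include "aom/aom_decoder.h"
#include "aom/aomdx.h"

// Illustrative: decode every frame packed into one temporal-unit buffer.
static void drain_temporal_unit(aom_codec_ctx_t *ctx, const unsigned char *buf,
                                size_t sz) {
  const unsigned char *p = buf;
  const unsigned char *end = buf + sz;
  while (p < end) {
    Av1DecodeReturn ret;
    if (aom_codec_decode(ctx, p, (unsigned int)(end - p), &ret)) break;
    p = ret.buf;  // the decoder consumed the bytes in [p, ret.buf)
  }
}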
+ do { + if (!have_frame) { + if (!aom_video_reader_read_frame(reader)) return EXIT_FAILURE; + frame = aom_video_reader_get_frame(reader, &frame_size); + + have_frame = 1; + end_frame = frame + frame_size; + } + + if (aom_codec_decode(&codec, frame, (unsigned int)frame_size, &adr) != + AOM_CODEC_OK) { + die_codec(&codec, "Failed to decode frame."); + } + + frame = adr.buf; + if (frame == end_frame) have_frame = 0; + } while (adr.show_existing); + + int got_any_frames = 0; + aom_image_t *frame_img; + struct av1_ref_frame ref_dec; + ref_dec.idx = adr.idx; + + // ref_dec.idx is the index to the reference buffer idx to AV1_GET_REFERENCE + // if its -1 the decoder didn't update any reference buffer and the only + // way to see the frame is aom_codec_get_frame. + if (ref_dec.idx == -1) { + aom_codec_iter_t iter = NULL; + img = frame_img = aom_codec_get_frame(&codec, &iter); + ++frame_count; + got_any_frames = 1; + } else if (!aom_codec_control(&codec, AV1_GET_REFERENCE, &ref_dec)) { + img = frame_img = &ref_dec.img; + ++frame_count; + got_any_frames = 1; + } + if (!got_any_frames) { + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} + +EMSCRIPTEN_KEEPALIVE +const char *get_aom_codec_build_config() { return aom_codec_build_config(); } + +EMSCRIPTEN_KEEPALIVE +int get_bit_depth() { return img->bit_depth; } + +EMSCRIPTEN_KEEPALIVE +int get_bits_per_sample() { return img->bps; } + +EMSCRIPTEN_KEEPALIVE +int get_image_format() { return img->fmt; } + +EMSCRIPTEN_KEEPALIVE +unsigned char *get_plane(int plane) { return img->planes[plane]; } + +EMSCRIPTEN_KEEPALIVE +int get_plane_stride(int plane) { return img->stride[plane]; } + +EMSCRIPTEN_KEEPALIVE +int get_plane_width(int plane) { return aom_img_plane_width(img, plane); } + +EMSCRIPTEN_KEEPALIVE +int get_plane_height(int plane) { return aom_img_plane_height(img, plane); } + +EMSCRIPTEN_KEEPALIVE +int get_frame_width() { return info->frame_width; } + +EMSCRIPTEN_KEEPALIVE +int get_frame_height() { return info->frame_height; } + +static void parse_args(char **argv) { + char **argi, **argj; + struct arg arg; + (void)dump_accounting_arg; + (void)dump_cdef_arg; + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + arg.argv_step = 1; + if (arg_match(&arg, &dump_block_size_arg, argi)) layers |= BLOCK_SIZE_LAYER; +#if CONFIG_ACCOUNTING + else if (arg_match(&arg, &dump_accounting_arg, argi)) + layers |= ACCOUNTING_LAYER; +#endif + else if (arg_match(&arg, &dump_transform_size_arg, argi)) + layers |= TRANSFORM_SIZE_LAYER; + else if (arg_match(&arg, &dump_transform_type_arg, argi)) + layers |= TRANSFORM_TYPE_LAYER; + else if (arg_match(&arg, &dump_mode_arg, argi)) + layers |= MODE_LAYER; + else if (arg_match(&arg, &dump_uv_mode_arg, argi)) + layers |= UV_MODE_LAYER; + else if (arg_match(&arg, &dump_motion_mode_arg, argi)) + layers |= MOTION_MODE_LAYER; + else if (arg_match(&arg, &dump_compound_type_arg, argi)) + layers |= COMPOUND_TYPE_LAYER; + else if (arg_match(&arg, &dump_skip_arg, argi)) + layers |= SKIP_LAYER; + else if (arg_match(&arg, &dump_filter_arg, argi)) + layers |= FILTER_LAYER; + else if (arg_match(&arg, &dump_cdef_arg, argi)) + layers |= CDEF_LAYER; + else if (arg_match(&arg, &dump_cfl_arg, argi)) + layers |= CFL_LAYER; + else if (arg_match(&arg, &dump_reference_frame_arg, argi)) + layers |= REFERENCE_FRAME_LAYER; + else if (arg_match(&arg, &dump_motion_vectors_arg, argi)) + layers |= MOTION_VECTORS_LAYER; + else if (arg_match(&arg, &dump_dual_filter_type_arg, argi)) + layers |= DUAL_FILTER_LAYER; + else if (arg_match(&arg, 
&dump_delta_q_arg, argi)) + layers |= Q_INDEX_LAYER; + else if (arg_match(&arg, &dump_seg_id_arg, argi)) + layers |= SEGMENT_ID_LAYER; + else if (arg_match(&arg, &dump_intrabc_arg, argi)) + layers |= INTRABC_LAYER; + else if (arg_match(&arg, &dump_palette_arg, argi)) + layers |= PALETTE_LAYER; + else if (arg_match(&arg, &dump_uv_palette_arg, argi)) + layers |= UV_PALETTE_LAYER; + else if (arg_match(&arg, &dump_all_arg, argi)) + layers |= ALL_LAYERS; + else if (arg_match(&arg, &compress_arg, argi)) + compress = 1; + else if (arg_match(&arg, &usage_arg, argi)) + usage_exit(); + else if (arg_match(&arg, &limit_arg, argi)) + stop_after = arg_parse_uint(&arg); + else if (arg_match(&arg, &skip_non_transform_arg, argi)) + skip_non_transform = arg_parse_uint(&arg); + else if (arg_match(&arg, &combined_arg, argi)) + convert_to_indices( + (char *)arg.val, combined_parm_list, + sizeof(combined_parm_list) / sizeof(combined_parm_list[0]), + &combined_parm_count); + else + argj++; + } +} + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s src_filename \n", exec_name); + fprintf(stderr, "\nOptions:\n"); + arg_show_usage(stderr, main_args); + exit(EXIT_FAILURE); +} + +EMSCRIPTEN_KEEPALIVE +int main(int argc, char **argv) { + exec_name = argv[0]; + parse_args(argv); + if (argc >= 2) { + open_file(argv[1]); + printf("[\n"); + while (1) { + if (stop_after && (decoded_frame_count >= stop_after)) break; + if (read_frame()) break; + } + printf("null\n"); + printf("]"); + } else { + usage_exit(); + } +} + +EMSCRIPTEN_KEEPALIVE +void quit() { + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + aom_video_reader_close(reader); +} + +EMSCRIPTEN_KEEPALIVE +void set_layers(LayerType v) { layers = v; } + +EMSCRIPTEN_KEEPALIVE +void set_compress(int v) { compress = v; } diff --git a/libs/libaom/src/examples/lightfield_bitstream_parsing.c b/libs/libaom/src/examples/lightfield_bitstream_parsing.c new file mode 100644 index 000000000..ffcbcb9cb --- /dev/null +++ b/libs/libaom/src/examples/lightfield_bitstream_parsing.c @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Lightfield Bitstream Parsing +// ============================ +// +// This is a lightfield bitstream parsing example. It takes an input file +// containing the whole compressed lightfield bitstream(ivf file) and a text +// file containing a stream of tiles to decode and then constructs and outputs +// a new bitstream that can be decoded by an AV1 decoder. The output bitstream +// contains reference frames(i.e. anchor frames), camera frame header, and +// tile list OBUs. num_references is the number of anchor frames coded at the +// beginning of the light field file. After running the lightfield encoder, +// run lightfield bitstream parsing: +// examples/lightfield_bitstream_parsing vase10x10.ivf vase_tile_list.ivf 4 +// tile_list.txt +// +// The tile_list.txt is expected to be of the form: +// Frame +// +// +// ... 
+// Frame +#include +#include + +#include "aom/aom_decoder.h" +#include "aom/aom_encoder.h" +#include "aom/aom_integer.h" +#include "aom/aomdx.h" +#include "aom_dsp/bitwriter_buffer.h" +#include "common/tools_common.h" +#include "common/video_reader.h" +#include "common/video_writer.h" + +#define MAX_TILES 512 + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", + exec_name); + exit(EXIT_FAILURE); +} + +#define ALIGN_POWER_OF_TWO(value, n) \ + (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) + +const int output_frame_width = 512; +const int output_frame_height = 512; + +// Spec: +// typedef struct { +// uint8_t anchor_frame_idx; +// uint8_t tile_row; +// uint8_t tile_col; +// uint16_t coded_tile_data_size_minus_1; +// uint8_t *coded_tile_data; +// } TILE_LIST_ENTRY; + +// Tile list entry provided by the application +typedef struct { + int image_idx; + int reference_idx; + int tile_col; + int tile_row; +} TILE_LIST_INFO; + +static int get_image_bps(aom_img_fmt_t fmt) { + switch (fmt) { + case AOM_IMG_FMT_I420: return 12; + case AOM_IMG_FMT_I422: return 16; + case AOM_IMG_FMT_I444: return 24; + case AOM_IMG_FMT_I42016: return 24; + case AOM_IMG_FMT_I42216: return 32; + case AOM_IMG_FMT_I44416: return 48; + default: die("Invalid image format"); + } + return 0; +} + +void process_tile_list(const TILE_LIST_INFO *tiles, int num_tiles, + aom_codec_pts_t tl_pts, unsigned char **frames, + const size_t *frame_sizes, aom_codec_ctx_t *codec, + unsigned char *tl_buf, AvxVideoWriter *writer, + uint8_t output_frame_width_in_tiles_minus_1, + uint8_t output_frame_height_in_tiles_minus_1) { + unsigned char *tl = tl_buf; + struct aom_write_bit_buffer wb = { tl, 0 }; + unsigned char *saved_obu_size_loc = NULL; + uint32_t tile_list_obu_header_size = 0; + uint32_t tile_list_obu_size = 0; + int num_tiles_minus_1 = num_tiles - 1; + int i; + + // Write the tile list OBU header that is 1 byte long. + aom_wb_write_literal(&wb, 0, 1); // forbidden bit. + aom_wb_write_literal(&wb, 8, 4); // tile list OBU: "1000" + aom_wb_write_literal(&wb, 0, 1); // obu_extension = 0 + aom_wb_write_literal(&wb, 1, 1); // obu_has_size_field + aom_wb_write_literal(&wb, 0, 1); // reserved + tl++; + tile_list_obu_header_size++; + + // Write the OBU size using a fixed length_field_size of 4 bytes. + saved_obu_size_loc = tl; + // aom_wb_write_unsigned_literal(&wb, data, bits) requires that bits <= 32. + aom_wb_write_unsigned_literal(&wb, 0, 32); + tl += 4; + tile_list_obu_header_size += 4; + + // write_tile_list_obu() + aom_wb_write_literal(&wb, output_frame_width_in_tiles_minus_1, 8); + aom_wb_write_literal(&wb, output_frame_height_in_tiles_minus_1, 8); + aom_wb_write_literal(&wb, num_tiles_minus_1, 16); + tl += 4; + tile_list_obu_size += 4; + + // Write each tile's data + for (i = 0; i <= num_tiles_minus_1; i++) { + aom_tile_data tile_data = { 0, NULL, 0 }; + + int image_idx = tiles[i].image_idx; + int ref_idx = tiles[i].reference_idx; + int tc = tiles[i].tile_col; + int tr = tiles[i].tile_row; + + // Reset bit writer to the right location. 
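An aside on the header byte just assembled: the five fields are written most-significant-bit first, so the whole byte can be computed in one expression (make_obu_header_byte is an illustrative helper, not an aom API):

#include <stdint.h>

// forbidden(1) | obu_type(4) | extension(1) | has_size_field(1) | reserved(1)
static uint8_t make_obu_header_byte(int obu_type, int has_size_field) {
  return (uint8_t)(((obu_type & 0xF) << 3) | ((has_size_field & 1) << 1));
}
// make_obu_header_byte(8, 1) == 0x42, i.e. a tile list OBU with a size
// field, matching the five aom_wb_write_literal() calls above.

With that in mind, the bit writer is repositioned for each tile as the comment above says: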
+ wb.bit_buffer = tl; + wb.bit_offset = 0; + + size_t frame_size = frame_sizes[image_idx]; + const unsigned char *frame = frames[image_idx]; + + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_ROW, tr); + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_COL, tc); + + aom_codec_err_t aom_status = + aom_codec_decode(codec, frame, frame_size, NULL); + if (aom_status) die_codec(codec, "Failed to decode tile."); + + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_DATA, &tile_data); + + // Copy over tile info. + // uint8_t anchor_frame_idx; + // uint8_t tile_row; + // uint8_t tile_col; + // uint16_t coded_tile_data_size_minus_1; + // uint8_t *coded_tile_data; + uint32_t tile_info_bytes = 5; + aom_wb_write_literal(&wb, ref_idx, 8); + aom_wb_write_literal(&wb, tr, 8); + aom_wb_write_literal(&wb, tc, 8); + aom_wb_write_literal(&wb, (int)tile_data.coded_tile_data_size - 1, 16); + tl += tile_info_bytes; + + memcpy(tl, (uint8_t *)tile_data.coded_tile_data, + tile_data.coded_tile_data_size); + tl += tile_data.coded_tile_data_size; + + tile_list_obu_size += + tile_info_bytes + (uint32_t)tile_data.coded_tile_data_size; + } + + // Write tile list OBU size. + size_t bytes_written = 0; + if (aom_uleb_encode_fixed_size(tile_list_obu_size, 4, 4, saved_obu_size_loc, + &bytes_written)) + die_codec(codec, "Failed to encode the tile list obu size."); + + // Copy the tile list. + if (!aom_video_writer_write_frame( + writer, tl_buf, tile_list_obu_header_size + tile_list_obu_size, + tl_pts)) + die_codec(codec, "Failed to copy compressed tile list."); +} + +int main(int argc, char **argv) { + aom_codec_ctx_t codec; + AvxVideoReader *reader = NULL; + AvxVideoWriter *writer = NULL; + const AvxInterface *decoder = NULL; + const AvxVideoInfo *info = NULL; + int num_references; + int i; + aom_codec_pts_t pts; + const char *tile_list_file = NULL; + + exec_name = argv[0]; + if (argc != 5) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + num_references = (int)strtol(argv[3], NULL, 0); + info = aom_video_reader_get_info(reader); + + aom_video_reader_set_fourcc(reader, AV1_FOURCC); + + // The writer to write out ivf file in tile list OBU, which can be decoded by + // AV1 decoder. + writer = aom_video_writer_open(argv[2], kContainerIVF, info); + if (!writer) die("Failed to open %s for writing", argv[2]); + + tile_list_file = argv[4]; + + decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface())); + + if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder."); + + // Decode anchor frames. + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0); + + printf("Reading %d reference images.\n", num_references); + for (i = 0; i < num_references; ++i) { + aom_video_reader_read_frame(reader); + + size_t frame_size = 0; + const unsigned char *frame = + aom_video_reader_get_frame(reader, &frame_size); + pts = (aom_codec_pts_t)aom_video_reader_get_frame_pts(reader); + + // Copy references bitstream directly. + if (!aom_video_writer_write_frame(writer, frame, frame_size, pts)) + die_codec(&codec, "Failed to copy compressed anchor frame."); + + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame."); + } + + // Decode camera frames. 
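One more note on process_tile_list() before the camera frames are handled: the OBU payload length is unknown until every tile has been appended, so a fixed-width LEB128 slot is reserved at saved_obu_size_loc and overwritten at the end. A minimal sketch of that pattern (patch_obu_size is a hypothetical name):

#include <stddef.h>

#include "aom/aom_integer.h"

// Encode payload_size into exactly 4 LEB128 bytes at size_slot, padding
// with continuation bits; returns 0 on success, -1 if the value cannot fit.
static int patch_obu_size(uint8_t *size_slot, uint32_t payload_size) {
  size_t written = 0;
  if (aom_uleb_encode_fixed_size(payload_size, 4, 4, size_slot, &written))
    return -1;
  return 0;  // written == 4 here
}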
+ AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 1); + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_EXT_TILE_DEBUG, 1); + + FILE *infile = aom_video_reader_get_file(reader); + // Record the offset of the first camera image. + const FileOffset camera_frame_pos = ftello(infile); + + printf("Loading compressed frames into memory.\n"); + + // Count the frames in the lightfield. + int num_frames = 0; + while (aom_video_reader_read_frame(reader)) { + ++num_frames; + } + if (num_frames < 1) die("Input light field has no frames."); + + // Read all of the lightfield frames into memory. + unsigned char **frames = + (unsigned char **)malloc(num_frames * sizeof(unsigned char *)); + size_t *frame_sizes = (size_t *)malloc(num_frames * sizeof(size_t)); + // Seek to the first camera image. + fseeko(infile, camera_frame_pos, SEEK_SET); + for (int f = 0; f < num_frames; ++f) { + aom_video_reader_read_frame(reader); + size_t frame_size = 0; + const unsigned char *frame = + aom_video_reader_get_frame(reader, &frame_size); + frames[f] = (unsigned char *)malloc(frame_size * sizeof(unsigned char)); + memcpy(frames[f], frame, frame_size); + frame_sizes[f] = frame_size; + } + printf("Read %d frames.\n", num_frames); + + // Copy first camera frame for getting camera frame header. This is done + // only once. + { + size_t frame_size = frame_sizes[0]; + const unsigned char *frame = frames[0]; + pts = num_references; + aom_tile_data frame_header_info = { 0, NULL, 0 }; + + // Need to decode frame header to get camera frame header info. So, here + // decoding 1 tile is enough. + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_DECODE_TILE_ROW, 0); + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_DECODE_TILE_COL, 0); + + aom_codec_err_t aom_status = + aom_codec_decode(&codec, frame, frame_size, NULL); + if (aom_status) die_codec(&codec, "Failed to decode tile."); + + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_HEADER_INFO, + &frame_header_info); + + size_t obu_size_offset = + (uint8_t *)frame_header_info.coded_tile_data - frame; + size_t length_field_size = frame_header_info.coded_tile_data_size; + // Remove ext-tile tile info. + uint32_t frame_header_size = (uint32_t)frame_header_info.extra_size - 1; + size_t bytes_to_copy = + obu_size_offset + length_field_size + frame_header_size; + + unsigned char *frame_hdr_buf = (unsigned char *)malloc(bytes_to_copy); + if (frame_hdr_buf == NULL) + die_codec(&codec, "Failed to allocate frame header buffer."); + + memcpy(frame_hdr_buf, frame, bytes_to_copy); + + // Update frame header OBU size. + size_t bytes_written = 0; + if (aom_uleb_encode_fixed_size( + frame_header_size, length_field_size, length_field_size, + frame_hdr_buf + obu_size_offset, &bytes_written)) + die_codec(&codec, "Failed to encode the tile list obu size."); + + // Copy camera frame header bitstream. + if (!aom_video_writer_write_frame(writer, frame_hdr_buf, bytes_to_copy, + pts)) + die_codec(&codec, "Failed to copy compressed camera frame header."); + free(frame_hdr_buf); + } + + // Read out the image format. + aom_img_fmt_t ref_fmt = 0; + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt)) + die_codec(&codec, "Failed to get the image format"); + const int bps = get_image_bps(ref_fmt); + if (!bps) die_codec(&codec, "Invalid image format."); + // read out the tile size. 
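A note on the encoding before the read below: AV1D_GET_TILE_SIZE reports both tile dimensions packed into one 32-bit value, which the code splits exactly as in this sketch; ALIGN_POWER_OF_TWO(value, 5) then rounds each dimension up to a multiple of 32 when sizing the tile-list buffer (unpack_tile_size is an illustrative name):

// Width travels in the high 16 bits, height in the low 16 bits.
static void unpack_tile_size(unsigned int tile_size, unsigned int *w,
                             unsigned int *h) {
  *w = tile_size >> 16;
  *h = tile_size & 0xFFFF;
}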
+ unsigned int tile_size = 0; + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_TILE_SIZE, &tile_size)) + die_codec(&codec, "Failed to get the tile size"); + const unsigned int tile_width = tile_size >> 16; + const unsigned int tile_height = tile_size & 65535; + // Allocate a buffer to store tile list bitstream. + const size_t data_sz = MAX_TILES * ALIGN_POWER_OF_TWO(tile_width, 5) * + ALIGN_POWER_OF_TWO(tile_height, 5) * bps / 8; + + unsigned char *tl_buf = (unsigned char *)malloc(data_sz); + if (tl_buf == NULL) die_codec(&codec, "Failed to allocate tile list buffer."); + + aom_codec_pts_t tl_pts = num_references; + const uint8_t output_frame_width_in_tiles_minus_1 = + output_frame_width / tile_width - 1; + const uint8_t output_frame_height_in_tiles_minus_1 = + output_frame_height / tile_height - 1; + + printf("Reading tile list from file.\n"); + char line[1024]; + FILE *tile_list_fptr = fopen(tile_list_file, "r"); + if (!tile_list_fptr) die_codec(&codec, "Failed to open tile list file."); + int num_tiles = 0; + TILE_LIST_INFO tiles[MAX_TILES]; + while ((fgets(line, 1024, tile_list_fptr)) != NULL) { + if (line[0] == 'F' || num_tiles >= MAX_TILES) { + // Flush existing tile list and start another, either because we hit a + // new render frame or because we've hit our max number of tiles per list. + if (num_tiles > 0) { + process_tile_list(tiles, num_tiles, tl_pts, frames, frame_sizes, &codec, + tl_buf, writer, output_frame_width_in_tiles_minus_1, + output_frame_height_in_tiles_minus_1); + ++tl_pts; + } + num_tiles = 0; + } + if (line[0] == 'F') { + continue; + } + if (sscanf(line, "%d %d %d %d", &tiles[num_tiles].image_idx, + &tiles[num_tiles].reference_idx, &tiles[num_tiles].tile_col, + &tiles[num_tiles].tile_row) == 4) { + if (tiles[num_tiles].image_idx >= num_frames) { + die("Tile list image_idx out of bounds: %d >= %d.", + tiles[num_tiles].image_idx, num_frames); + } + if (tiles[num_tiles].reference_idx >= num_references) { + die("Tile list reference_idx out of bounds: %d >= %d.", + tiles[num_tiles].reference_idx, num_references); + } + ++num_tiles; + } + } + if (num_tiles > 0) { + // Flush out the last tile list. + process_tile_list(tiles, num_tiles, tl_pts, frames, frame_sizes, &codec, + tl_buf, writer, output_frame_width_in_tiles_minus_1, + output_frame_height_in_tiles_minus_1); + ++tl_pts; + } + + const int num_tile_lists = (int)(tl_pts - pts); + printf("Finished processing tile lists. Num tile lists: %d.\n", + num_tile_lists); + free(tl_buf); + for (int f = 0; f < num_frames; ++f) { + free(frames[f]); + } + free(frame_sizes); + free(frames); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + aom_video_writer_close(writer); + aom_video_reader_close(reader); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/lightfield_decoder.c b/libs/libaom/src/examples/lightfield_decoder.c new file mode 100644 index 000000000..a292e9c75 --- /dev/null +++ b/libs/libaom/src/examples/lightfield_decoder.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +// Lightfield Decoder +// ================== +// +// This is an example of a simple lightfield decoder. It builds upon the +// simple_decoder.c example. It takes an input file containing the compressed +// data (in ivf format), treating it as a lightfield instead of a video; and a +// text file with a list of tiles to decode. There is an optional parameter +// allowing to choose the output format, and the supported formats are +// YUV1D(default), YUV, and NV12. +// After running the lightfield encoder, run lightfield decoder to decode a +// batch of tiles: +// examples/lightfield_decoder vase10x10.ivf vase_reference.yuv 4 tile_list.txt +// 0(optional) +// The tile_list.txt is expected to be of the form: +// Frame +// +// +// ... +// Frame +#include +#include + +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "aom_scale/yv12config.h" +#include "av1/common/enums.h" +#include "common/tools_common.h" +#include "common/video_reader.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "Usage: %s \n", + exec_name); + exit(EXIT_FAILURE); +} + +// Output frame size +const int output_frame_width = 512; +const int output_frame_height = 512; + +static void aom_img_copy_tile(const aom_image_t *src, const aom_image_t *dst, + int dst_row_offset, int dst_col_offset) { + const int shift = (src->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0; + int plane; + + for (plane = 0; plane < 3; ++plane) { + const unsigned char *src_buf = src->planes[plane]; + const int src_stride = src->stride[plane]; + unsigned char *dst_buf = dst->planes[plane]; + const int dst_stride = dst->stride[plane]; + const int roffset = + (plane > 0) ? dst_row_offset >> dst->y_chroma_shift : dst_row_offset; + const int coffset = + (plane > 0) ? dst_col_offset >> dst->x_chroma_shift : dst_col_offset; + + // col offset needs to be adjusted for HBD. + dst_buf += roffset * dst_stride + (coffset << shift); + + const int w = (aom_img_plane_width(src, plane) << shift); + const int h = aom_img_plane_height(src, plane); + int y; + + for (y = 0; y < h; ++y) { + memcpy(dst_buf, src_buf, w); + src_buf += src_stride; + dst_buf += dst_stride; + } + } +} + +void decode_tile(aom_codec_ctx_t *codec, const unsigned char *frame, + size_t frame_size, int tr, int tc, int ref_idx, + aom_image_t *reference_images, aom_image_t *output, + int *tile_idx, unsigned int *output_bit_depth, + aom_image_t **img_ptr, int output_format) { + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_TILE_MODE, 1); + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_EXT_TILE_DEBUG, 1); + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_ROW, tr); + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_COL, tc); + + av1_ref_frame_t ref; + ref.idx = 0; + ref.use_external_ref = 1; + ref.img = reference_images[ref_idx]; + if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_REFERENCE, &ref)) { + die_codec(codec, "Failed to set reference frame."); + } + + aom_codec_err_t aom_status = aom_codec_decode(codec, frame, frame_size, NULL); + if (aom_status) die_codec(codec, "Failed to decode tile."); + + aom_codec_iter_t iter = NULL; + aom_image_t *img = aom_codec_get_frame(codec, &iter); + if (!img) die_codec(codec, "Failed to get frame."); + *img_ptr = img; + + // aom_img_alloc() sets bit_depth as follows: + // output->bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8; + // Use img->bit_depth(read from bitstream), so that aom_shift_img() + // works as expected. 
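A short illustration of the comment above (not part of the patch): allocation can only infer storage depth from the format flag, so a 10-bit stream still reports 16 there, and only img->bit_depth carries the value aom_shift_img() needs.

#include "aom/aom_image.h"

// What aom_img_alloc() alone can know: storage depth, not coded depth.
static unsigned int alloc_time_bit_depth(aom_img_fmt_t fmt) {
  return (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
}
// For a 10-bit stream: alloc_time_bit_depth() == 16, img->bit_depth == 10.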
+ output->bit_depth = img->bit_depth; + *output_bit_depth = img->bit_depth; + + if (output_format != YUV1D) { + // read out the tile size. + unsigned int tile_size = 0; + if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_SIZE, &tile_size)) + die_codec(codec, "Failed to get the tile size"); + const unsigned int tile_width = tile_size >> 16; + const unsigned int tile_height = tile_size & 65535; + const uint32_t output_frame_width_in_tiles = + output_frame_width / tile_width; + + // Copy the tile to the output frame. + const int row_offset = + (*tile_idx / output_frame_width_in_tiles) * tile_height; + const int col_offset = + (*tile_idx % output_frame_width_in_tiles) * tile_width; + + aom_img_copy_tile(img, output, row_offset, col_offset); + (*tile_idx)++; + } +} + +static void img_write_to_file(const aom_image_t *img, FILE *file, + int output_format) { + if (output_format == YUV) + aom_img_write(img, file); + else if (output_format == NV12) + aom_img_write_nv12(img, file); + else + die("Invalid output format"); +} + +int main(int argc, char **argv) { + FILE *outfile = NULL; + aom_codec_ctx_t codec; + AvxVideoReader *reader = NULL; + const AvxInterface *decoder = NULL; + const AvxVideoInfo *info = NULL; + int num_references; + aom_img_fmt_t ref_fmt = 0; + aom_image_t reference_images[MAX_EXTERNAL_REFERENCES]; + aom_image_t output; + aom_image_t *output_shifted = NULL; + size_t frame_size = 0; + const unsigned char *frame = NULL; + int i, j; + const char *tile_list_file = NULL; + int output_format = YUV1D; + exec_name = argv[0]; + + if (argc < 5) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + num_references = (int)strtol(argv[3], NULL, 0); + tile_list_file = argv[4]; + + if (argc > 5) output_format = (int)strtol(argv[5], NULL, 0); + if (output_format < YUV1D || output_format > NV12) + die("Output format out of range [0, 2]"); + + info = aom_video_reader_get_info(reader); + + if (info->codec_fourcc == LST_FOURCC) + decoder = get_aom_decoder_by_fourcc(AV1_FOURCC); + else + die("Unknown input codec."); + printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface())); + + if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder."); + + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB, + info->is_annexb)) { + die("Failed to set annex b status"); + } + + // Decode anchor frames. + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0); + for (i = 0; i < num_references; ++i) { + aom_video_reader_read_frame(reader); + frame = aom_video_reader_get_frame(reader, &frame_size); + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame."); + + if (i == 0) { + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt)) + die_codec(&codec, "Failed to get the image format"); + + int frame_res[2]; + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_SIZE, frame_res)) + die_codec(&codec, "Failed to get the image frame size"); + + // Allocate memory to store decoded references. Allocate memory with the + // border so that it can be used as a reference. 
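An aside on decode_tile() above: when compositing into a single output frame, the running tile index is mapped to a pixel origin in raster order. A standalone sketch of that arithmetic (tile_origin is an illustrative name):

// Map the n-th decoded tile to its top-left pixel in the output frame.
static void tile_origin(int tile_idx, int tile_w, int tile_h,
                        int frame_w_in_tiles, int *row_off, int *col_off) {
  *row_off = (tile_idx / frame_w_in_tiles) * tile_h;
  *col_off = (tile_idx % frame_w_in_tiles) * tile_w;
}

Back in the patch, the anchor references themselves are allocated with a border so the decoder can use them directly: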
+ for (j = 0; j < num_references; j++) { + unsigned int border = AOM_DEC_BORDER_IN_PIXELS; + if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt, + frame_res[0], frame_res[1], 32, 8, + border)) { + die("Failed to allocate references."); + } + } + } + + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_COPY_NEW_FRAME_IMAGE, + &reference_images[i])) + die_codec(&codec, "Failed to copy decoded reference frame"); + + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { + char name[1024]; + snprintf(name, sizeof(name), "ref_%d.yuv", i); + printf("writing ref image to %s, %d, %d\n", name, img->d_w, img->d_h); + FILE *ref_file = fopen(name, "wb"); + aom_img_write(img, ref_file); + fclose(ref_file); + } + } + + FILE *infile = aom_video_reader_get_file(reader); + // Record the offset of the first camera image. + const FileOffset camera_frame_pos = ftello(infile); + + printf("Loading compressed frames into memory.\n"); + + // Count the frames in the lightfield. + int num_frames = 0; + while (aom_video_reader_read_frame(reader)) { + ++num_frames; + } + if (num_frames < 1) die("Input light field has no frames."); + + // Read all of the lightfield frames into memory. + unsigned char **frames = + (unsigned char **)malloc(num_frames * sizeof(unsigned char *)); + size_t *frame_sizes = (size_t *)malloc(num_frames * sizeof(size_t)); + // Seek to the first camera image. + fseeko(infile, camera_frame_pos, SEEK_SET); + for (int f = 0; f < num_frames; ++f) { + aom_video_reader_read_frame(reader); + frame = aom_video_reader_get_frame(reader, &frame_size); + frames[f] = (unsigned char *)malloc(frame_size * sizeof(unsigned char)); + memcpy(frames[f], frame, frame_size); + frame_sizes[f] = frame_size; + } + printf("Read %d frames.\n", num_frames); + + if (output_format != YUV1D) { + // Allocate the output frame. + aom_img_fmt_t out_fmt = ref_fmt; + if (FORCE_HIGHBITDEPTH_DECODING) out_fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + if (!aom_img_alloc(&output, out_fmt, output_frame_width, + output_frame_height, 32)) + die("Failed to allocate output image."); + } + + printf("Decoding tile list from file.\n"); + char line[1024]; + FILE *tile_list_fptr = fopen(tile_list_file, "r"); + if (!tile_list_fptr) die_codec(&codec, "Failed to open tile list file."); + int tile_list_cnt = 0; + int tile_list_writes = 0; + int tile_idx = 0; + aom_image_t *out = NULL; + unsigned int output_bit_depth = 0; + + while ((fgets(line, 1024, tile_list_fptr)) != NULL) { + if (line[0] == 'F') { + if (output_format != YUV1D) { + // Write out the tile list. + if (tile_list_cnt) { + out = &output; + if (output_bit_depth != 0) + aom_shift_img(output_bit_depth, &out, &output_shifted); + img_write_to_file(out, outfile, output_format); + tile_list_writes++; + } + + tile_list_cnt++; + tile_idx = 0; + // Then memset the frame. 
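+        // Clearing the composite output keeps tiles from the previous tile
+        // list from leaking into the next one, since the same buffer is
+        // reused for every "Frame" entry in the tile list file.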
+ memset(output.img_data, 0, output.sz); + } + continue; + } + + int image_idx, ref_idx, tc, tr; + sscanf(line, "%d %d %d %d", &image_idx, &ref_idx, &tc, &tr); + if (image_idx >= num_frames) { + die("Tile list image_idx out of bounds: %d >= %d.", image_idx, + num_frames); + } + if (ref_idx >= num_references) { + die("Tile list ref_idx out of bounds: %d >= %d.", ref_idx, + num_references); + } + frame = frames[image_idx]; + frame_size = frame_sizes[image_idx]; + + aom_image_t *img = NULL; + decode_tile(&codec, frame, frame_size, tr, tc, ref_idx, reference_images, + &output, &tile_idx, &output_bit_depth, &img, output_format); + if (output_format == YUV1D) { + out = img; + if (output_bit_depth != 0) + aom_shift_img(output_bit_depth, &out, &output_shifted); + aom_img_write(out, outfile); + } + } + + if (output_format != YUV1D) { + // Write out the last tile list. + if (tile_list_writes < tile_list_cnt) { + out = &output; + if (output_bit_depth != 0) + aom_shift_img(output_bit_depth, &out, &output_shifted); + img_write_to_file(out, outfile, output_format); + } + } + + if (output_shifted) aom_img_free(output_shifted); + if (output_format != YUV1D) aom_img_free(&output); + for (i = 0; i < num_references; i++) aom_img_free(&reference_images[i]); + for (int f = 0; f < num_frames; ++f) { + free(frames[f]); + } + free(frame_sizes); + free(frames); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + aom_video_reader_close(reader); + fclose(outfile); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/lightfield_encoder.c b/libs/libaom/src/examples/lightfield_encoder.c new file mode 100644 index 000000000..e80fe24f6 --- /dev/null +++ b/libs/libaom/src/examples/lightfield_encoder.c @@ -0,0 +1,522 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Lightfield Encoder +// ================== +// +// This is an example of a simple lightfield encoder. It builds upon the +// twopass_encoder.c example. It takes an input file in YV12 format, +// treating it as a planar lightfield instead of a video. The img_width +// and img_height arguments are the dimensions of the lightfield images, +// while the lf_width and lf_height arguments are the number of +// lightfield images in each dimension. The lf_blocksize determines the +// number of reference images used for MCP. For example, 5 means that there +// is a reference image for every 5x5 lightfield image block. All images +// within a block will use the center image in that block as the reference +// image for MCP. +// Run "make test" to download lightfield test data: vase10x10.yuv. +// Run lightfield encoder to encode whole lightfield: +// examples/lightfield_encoder 1024 1024 vase10x10.yuv vase10x10.ivf 10 10 5 + +// Note: In bitstream.c and encoder.c, define EXT_TILE_DEBUG as 1 will print +// out the uncompressed header and the frame contexts, which can be used to +// test the bit exactness of the headers and the frame contexts for large scale +// tile coded frames. 
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr,
+          "Usage: %s <img_width> <img_height> <infile> <outfile> "
+          "<lf_width> <lf_height> <lf_blocksize>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static int img_size_bytes(aom_image_t *img) {
+  int image_size_bytes = 0;
+  int plane;
+  for (plane = 0; plane < 3; ++plane) {
+    const int w = aom_img_plane_width(img, plane) *
+                  ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+    const int h = aom_img_plane_height(img, plane);
+    image_size_bytes += w * h;
+  }
+  return image_size_bytes;
+}
+
+static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                           aom_codec_pts_t pts, unsigned int duration,
+                           aom_enc_frame_flags_t flags,
+                           aom_fixed_buf_t *stats) {
+  int got_pkts = 0;
+  aom_codec_iter_t iter = NULL;
+  const aom_codec_cx_pkt_t *pkt = NULL;
+  const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
+  if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to get frame stats.");
+
+  while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
+
+    if (pkt->kind == AOM_CODEC_STATS_PKT) {
+      const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
+      const size_t pkt_size = pkt->data.twopass_stats.sz;
+      stats->buf = realloc(stats->buf, stats->sz + pkt_size);
+      memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size);
+      stats->sz += pkt_size;
+    }
+  }
+
+  return got_pkts;
+}
+
+static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                        aom_codec_pts_t pts, unsigned int duration,
+                        aom_enc_frame_flags_t flags, AvxVideoWriter *writer) {
+  int got_pkts = 0;
+  aom_codec_iter_t iter = NULL;
+  const aom_codec_cx_pkt_t *pkt = NULL;
+  const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
+  if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to encode frame.");
+
+  while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
+    if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+      const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+
+      if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf,
+                                        pkt->data.frame.sz,
+                                        pkt->data.frame.pts))
+        die_codec(ctx, "Failed to write compressed frame.");
+      printf(keyframe ? "K" : ".");
+      fflush(stdout);
+    }
+  }
+
+  return got_pkts;
+}
+
+static void get_raw_image(aom_image_t **frame_to_encode, aom_image_t *raw,
+                          aom_image_t *raw_shift) {
+  if (FORCE_HIGHBITDEPTH_DECODING) {
+    // Need to allocate larger buffer to use hbd internal.
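+    // aom_img_upshift() copies the 8-bit samples from |raw| into the 16-bit
+    // buffer |raw_shift|; an input_shift of 0 widens the storage without
+    // changing the sample values.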
+    int input_shift = 0;
+    aom_img_upshift(raw_shift, raw, input_shift);
+    *frame_to_encode = raw_shift;
+  } else {
+    *frame_to_encode = raw;
+  }
+}
+
+static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
+                             const AvxInterface *encoder,
+                             const aom_codec_enc_cfg_t *cfg, int lf_width,
+                             int lf_height, int lf_blocksize, int flags,
+                             aom_image_t *raw_shift) {
+  aom_codec_ctx_t codec;
+  int frame_count = 0;
+  int image_size_bytes = img_size_bytes(raw);
+  int u_blocks, v_blocks;
+  int bu, bv;
+  aom_fixed_buf_t stats = { NULL, 0 };
+  aom_image_t *frame_to_encode;
+
+  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, flags))
+    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0))
+    die_codec(&codec, "Failed to turn off auto altref");
+  if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0))
+    die_codec(&codec, "Failed to set frame parallel decoding");
+
+  // How many reference images we need to encode.
+  u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize;
+  v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize;
+
+  printf("\n First pass: ");
+
+  for (bv = 0; bv < v_blocks; ++bv) {
+    for (bu = 0; bu < u_blocks; ++bu) {
+      const int block_u_min = bu * lf_blocksize;
+      const int block_v_min = bv * lf_blocksize;
+      int block_u_end = (bu + 1) * lf_blocksize;
+      int block_v_end = (bv + 1) * lf_blocksize;
+      int u_block_size, v_block_size;
+      int block_ref_u, block_ref_v;
+
+      block_u_end = block_u_end < lf_width ? block_u_end : lf_width;
+      block_v_end = block_v_end < lf_height ? block_v_end : lf_height;
+      u_block_size = block_u_end - block_u_min;
+      v_block_size = block_v_end - block_v_min;
+      block_ref_u = block_u_min + u_block_size / 2;
+      block_ref_v = block_v_min + v_block_size / 2;
+
+      printf("A%d, ", (block_ref_u + block_ref_v * lf_width));
+      fseek(infile, (block_ref_u + block_ref_v * lf_width) * image_size_bytes,
+            SEEK_SET);
+      aom_img_read(raw, infile);
+      get_raw_image(&frame_to_encode, raw, raw_shift);
+
+      // Reference frames can be encoded without tiles.
+      ++frame_count;
+      get_frame_stats(&codec, frame_to_encode, frame_count, 1,
+                      AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+                          AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+                          AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+                          AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+                          AOM_EFLAG_NO_UPD_ARF,
+                      &stats);
+    }
+  }
+
+  if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 1))
+    die_codec(&codec, "Failed to set frame parallel decoding");
+
+  for (bv = 0; bv < v_blocks; ++bv) {
+    for (bu = 0; bu < u_blocks; ++bu) {
+      const int block_u_min = bu * lf_blocksize;
+      const int block_v_min = bv * lf_blocksize;
+      int block_u_end = (bu + 1) * lf_blocksize;
+      int block_v_end = (bv + 1) * lf_blocksize;
+      int u, v;
+      block_u_end = block_u_end < lf_width ? block_u_end : lf_width;
+      block_v_end = block_v_end < lf_height ?
block_v_end : lf_height; + for (v = block_v_min; v < block_v_end; ++v) { + for (u = block_u_min; u < block_u_end; ++u) { + printf("C%d, ", (u + v * lf_width)); + fseek(infile, (u + v * lf_width) * image_size_bytes, SEEK_SET); + aom_img_read(raw, infile); + get_raw_image(&frame_to_encode, raw, raw_shift); + + ++frame_count; + get_frame_stats(&codec, frame_to_encode, frame_count, 1, + AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | + AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY, + &stats); + } + } + } + } + // Flush encoder. + // No ARF, this should not be needed. + while (get_frame_stats(&codec, NULL, frame_count, 1, 0, &stats)) { + } + + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + printf("\nFirst pass complete. Processed %d frames.\n", frame_count); + + return stats; +} + +static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name, + const AvxInterface *encoder, aom_codec_enc_cfg_t *cfg, + int lf_width, int lf_height, int lf_blocksize, int flags, + aom_image_t *raw_shift) { + AvxVideoInfo info = { encoder->fourcc, + cfg->g_w, + cfg->g_h, + { cfg->g_timebase.num, cfg->g_timebase.den }, + 0 }; + AvxVideoWriter *writer = NULL; + aom_codec_ctx_t codec; + int frame_count = 0; + int image_size_bytes = img_size_bytes(raw); + int bu, bv; + int u_blocks, v_blocks; + aom_image_t *frame_to_encode; + aom_image_t reference_images[MAX_EXTERNAL_REFERENCES]; + int reference_image_num = 0; + int i; + + writer = aom_video_writer_open(outfile_name, kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing", outfile_name); + + if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, flags)) + die_codec(&codec, "Failed to initialize encoder"); + if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0)) + die_codec(&codec, "Failed to turn off auto altref"); + if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0)) + die_codec(&codec, "Failed to set frame parallel decoding"); + if (aom_codec_control(&codec, AV1E_ENABLE_EXT_TILE_DEBUG, 1)) + die_codec(&codec, "Failed to enable encoder ext_tile debug"); + if (aom_codec_control(&codec, AOME_SET_CPUUSED, 1)) + die_codec(&codec, "Failed to set cpu-used"); + + // Note: The superblock is a sequence parameter and has to be the same for 1 + // sequence. In lightfield application, must choose the superblock size(either + // 64x64 or 128x128) before the encoding starts. Otherwise, the default is + // AOM_SUPERBLOCK_SIZE_DYNAMIC, and the superblock size will be set to 64x64 + // internally. + if (aom_codec_control(&codec, AV1E_SET_SUPERBLOCK_SIZE, + AOM_SUPERBLOCK_SIZE_64X64)) + die_codec(&codec, "Failed to set SB size"); + + u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize; + v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize; + + reference_image_num = u_blocks * v_blocks; + // Set the max gf group length so the references are guaranteed to be in + // a different gf group than any of the regular frames. This avoids using + // both vbr and constant quality mode in a single group. The number of + // references now cannot surpass 17 because of the enforced MAX_GF_INTERVAL of + // 16. If it is necessary to exceed this reference frame limit, one will have + // to do some additional handling to ensure references are in separate gf + // groups from the regular frames. 
+ if (aom_codec_control(&codec, AV1E_SET_MAX_GF_INTERVAL, + reference_image_num - 1)) + die_codec(&codec, "Failed to set max gf interval"); + aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420; + if (FORCE_HIGHBITDEPTH_DECODING) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + // Allocate memory with the border so that it can be used as a reference. + int border_in_pixels = + (codec.config.enc->rc_resize_mode || codec.config.enc->rc_superres_mode) + ? AOM_BORDER_IN_PIXELS + : AOM_ENC_NO_SCALE_BORDER; + for (i = 0; i < reference_image_num; i++) { + if (!aom_img_alloc_with_border(&reference_images[i], ref_fmt, cfg->g_w, + cfg->g_h, 32, 8, border_in_pixels)) { + die("Failed to allocate image."); + } + } + + printf("\n Second pass: "); + + // Encode reference images first. + printf("Encoding Reference Images\n"); + for (bv = 0; bv < v_blocks; ++bv) { + for (bu = 0; bu < u_blocks; ++bu) { + const int block_u_min = bu * lf_blocksize; + const int block_v_min = bv * lf_blocksize; + int block_u_end = (bu + 1) * lf_blocksize; + int block_v_end = (bv + 1) * lf_blocksize; + int u_block_size, v_block_size; + int block_ref_u, block_ref_v; + + block_u_end = block_u_end < lf_width ? block_u_end : lf_width; + block_v_end = block_v_end < lf_height ? block_v_end : lf_height; + u_block_size = block_u_end - block_u_min; + v_block_size = block_v_end - block_v_min; + block_ref_u = block_u_min + u_block_size / 2; + block_ref_v = block_v_min + v_block_size / 2; + + printf("A%d, ", (block_ref_u + block_ref_v * lf_width)); + fseek(infile, (block_ref_u + block_ref_v * lf_width) * image_size_bytes, + SEEK_SET); + aom_img_read(raw, infile); + + get_raw_image(&frame_to_encode, raw, raw_shift); + + // Reference frames may be encoded without tiles. + ++frame_count; + printf("Encoding reference image %d of %d\n", bv * u_blocks + bu, + u_blocks * v_blocks); + encode_frame(&codec, frame_to_encode, frame_count, 1, + AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | + AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY, + writer); + + if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE, + &reference_images[frame_count - 1])) + die_codec(&codec, "Failed to copy decoder reference frame"); + } + } + + cfg->large_scale_tile = 1; + // Fixed q encoding for camera frames. + cfg->rc_end_usage = AOM_Q; + if (aom_codec_enc_config_set(&codec, cfg)) + die_codec(&codec, "Failed to configure encoder"); + + // The fixed q value used in encoding. + if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 36)) + die_codec(&codec, "Failed to set cq level"); + if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 1)) + die_codec(&codec, "Failed to set frame parallel decoding"); + if (aom_codec_control(&codec, AV1E_SET_SINGLE_TILE_DECODING, 1)) + die_codec(&codec, "Failed to turn on single tile decoding"); + // Set tile_columns and tile_rows to MAX values, which guarantees the tile + // size of 64 x 64 pixels(i.e. 1 SB) for <= 4k resolution. 
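+  // Both controls take log2 values, so 6 requests 2^6 = 64 tile columns and
+  // 64 tile rows.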
+ if (aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 6)) + die_codec(&codec, "Failed to set tile width"); + if (aom_codec_control(&codec, AV1E_SET_TILE_ROWS, 6)) + die_codec(&codec, "Failed to set tile height"); + + for (bv = 0; bv < v_blocks; ++bv) { + for (bu = 0; bu < u_blocks; ++bu) { + const int block_u_min = bu * lf_blocksize; + const int block_v_min = bv * lf_blocksize; + int block_u_end = (bu + 1) * lf_blocksize; + int block_v_end = (bv + 1) * lf_blocksize; + int u, v; + block_u_end = block_u_end < lf_width ? block_u_end : lf_width; + block_v_end = block_v_end < lf_height ? block_v_end : lf_height; + for (v = block_v_min; v < block_v_end; ++v) { + for (u = block_u_min; u < block_u_end; ++u) { + av1_ref_frame_t ref; + ref.idx = 0; + ref.use_external_ref = 1; + ref.img = reference_images[bv * u_blocks + bu]; + if (aom_codec_control(&codec, AV1_SET_REFERENCE, &ref)) + die_codec(&codec, "Failed to set reference frame"); + + printf("C%d, ", (u + v * lf_width)); + fseek(infile, (u + v * lf_width) * image_size_bytes, SEEK_SET); + aom_img_read(raw, infile); + get_raw_image(&frame_to_encode, raw, raw_shift); + + ++frame_count; + printf("Encoding image %d of %d\n", + frame_count - (u_blocks * v_blocks), lf_width * lf_height); + encode_frame(&codec, frame_to_encode, frame_count, 1, + AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | + AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY, + writer); + } + } + } + } + + // Flush encoder. + // No ARF, this should not be needed. + while (encode_frame(&codec, NULL, -1, 1, 0, writer)) { + } + + for (i = 0; i < reference_image_num; i++) aom_img_free(&reference_images[i]); + + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + // Modify large_scale_file fourcc. + if (cfg->large_scale_tile == 1) + aom_video_writer_set_fourcc(writer, LST_FOURCC); + aom_video_writer_close(writer); + + printf("\nSecond pass complete. Processed %d frames.\n", frame_count); +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + int w, h; + // The number of lightfield images in the u and v dimensions. + int lf_width, lf_height; + // Defines how many images refer to the same reference image for MCP. + // lf_blocksize X lf_blocksize images will all use the reference image + // in the middle of the block of images. + int lf_blocksize; + aom_codec_ctx_t codec; + aom_codec_enc_cfg_t cfg; + aom_image_t raw; + aom_image_t raw_shift; + aom_codec_err_t res; + aom_fixed_buf_t stats; + int flags = 0; + + const AvxInterface *encoder = NULL; + const int fps = 30; + const int bitrate = 200; // kbit/s + const char *const width_arg = argv[1]; + const char *const height_arg = argv[2]; + const char *const infile_arg = argv[3]; + const char *const outfile_arg = argv[4]; + const char *const lf_width_arg = argv[5]; + const char *const lf_height_arg = argv[6]; + const char *lf_blocksize_arg = argv[7]; + exec_name = argv[0]; + + if (argc < 8) die("Invalid number of arguments"); + + encoder = get_aom_encoder_by_name("av1"); + if (!encoder) die("Unsupported codec."); + + w = (int)strtol(width_arg, NULL, 0); + h = (int)strtol(height_arg, NULL, 0); + lf_width = (int)strtol(lf_width_arg, NULL, 0); + lf_height = (int)strtol(lf_height_arg, NULL, 0); + lf_blocksize = (int)strtol(lf_blocksize_arg, NULL, 0); + lf_blocksize = lf_blocksize < lf_width ? lf_blocksize : lf_width; + lf_blocksize = lf_blocksize < lf_height ? 
                     lf_blocksize : lf_height;
+
+  if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0)
+    die("Invalid frame size: %dx%d", w, h);
+  if (lf_width <= 0 || lf_height <= 0)
+    die("Invalid lf_width and/or lf_height: %dx%d", lf_width, lf_height);
+  if (lf_blocksize <= 0) die("Invalid lf_blocksize: %d", lf_blocksize);
+
+  if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 32)) {
+    die("Failed to allocate image.");
+  }
+  if (FORCE_HIGHBITDEPTH_DECODING) {
+    // Need to allocate larger buffer to use hbd internal.
+    aom_img_alloc(&raw_shift, AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH, w,
+                  h, 32);
+  }
+
+  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+
+  // Configuration
+  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res) die_codec(&codec, "Failed to get default codec config.");
+
+  cfg.g_w = w;
+  cfg.g_h = h;
+  cfg.g_timebase.num = 1;
+  cfg.g_timebase.den = fps;
+  cfg.rc_target_bitrate = bitrate;
+  cfg.g_error_resilient = 0;  // This is required.
+  cfg.g_lag_in_frames = 0;    // need to set this since default is 19.
+  cfg.kf_mode = AOM_KF_DISABLED;
+  cfg.large_scale_tile = 0;  // Only set it to 1 for camera frame encoding.
+  cfg.g_bit_depth = AOM_BITS_8;
+  flags |= (cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING)
+               ? AOM_CODEC_USE_HIGHBITDEPTH
+               : 0;
+
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading", infile_arg);
+
+  // Pass 0
+  cfg.g_pass = AOM_RC_FIRST_PASS;
+  stats = pass0(&raw, infile, encoder, &cfg, lf_width, lf_height, lf_blocksize,
+                flags, &raw_shift);
+
+  // Pass 1
+  rewind(infile);
+  cfg.g_pass = AOM_RC_LAST_PASS;
+  cfg.rc_twopass_stats_in = stats;
+  pass1(&raw, infile, outfile_arg, encoder, &cfg, lf_width, lf_height,
+        lf_blocksize, flags, &raw_shift);
+  free(stats.buf);
+
+  if (FORCE_HIGHBITDEPTH_DECODING) aom_img_free(&raw_shift);
+  aom_img_free(&raw);
+  fclose(infile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libaom/src/examples/lightfield_tile_list_decoder.c b/libs/libaom/src/examples/lightfield_tile_list_decoder.c
new file mode 100644
index 000000000..3b928df2c
--- /dev/null
+++ b/libs/libaom/src/examples/lightfield_tile_list_decoder.c
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Lightfield Tile List Decoder
+// ============================
+//
+// This is a lightfield tile list decoder example. It takes an input file that
+// contains the anchor frames that are references of the coded tiles, the
+// camera frame header, and tile list OBUs that include the tile information
+// and the compressed tile data. This input file is reconstructed from the
+// encoded lightfield ivf file, and is decodable by an AV1 decoder.
+// num_references is the number of anchor frames coded at the beginning of the
+// light field file. num_tile_lists is the number of tile lists that need to
+// be decoded. There is an optional parameter allowing you to choose the
+// output format, and the supported formats are YUV1D(default), YUV, and NV12.
+// Run lightfield tile list decoder to decode an AV1 tile list file:
+// examples/lightfield_tile_list_decoder vase_tile_list.ivf vase_tile_list.yuv
+// 4 2 0(optional)
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr,
+          "Usage: %s <infile> <outfile> <num_references> <num_tile_lists> "
+          "<output format(optional)>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static void write_tile_yuv1d(aom_codec_ctx_t *codec, const aom_image_t *img,
+                             FILE *file) {
+  // read out the tile size.
+  unsigned int tile_size = 0;
+  if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_SIZE, &tile_size))
+    die_codec(codec, "Failed to get the tile size");
+  const unsigned int tile_width = tile_size >> 16;
+  const unsigned int tile_height = tile_size & 65535;
+  const uint32_t output_frame_width_in_tiles = img->d_w / tile_width;
+
+  unsigned int tile_count = 0;
+  if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_COUNT, &tile_count))
+    die_codec(codec, "Failed to get the tile count");
+
+  // Write tile to file.
+  const int shift = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
+  unsigned int tile_idx;
+
+  for (tile_idx = 0; tile_idx < tile_count; ++tile_idx) {
+    const int row_offset =
+        (tile_idx / output_frame_width_in_tiles) * tile_height;
+    const int col_offset =
+        (tile_idx % output_frame_width_in_tiles) * tile_width;
+    int plane;
+
+    for (plane = 0; plane < 3; ++plane) {
+      const unsigned char *buf = img->planes[plane];
+      const int stride = img->stride[plane];
+      const int roffset =
+          (plane > 0) ? row_offset >> img->y_chroma_shift : row_offset;
+      const int coffset =
+          (plane > 0) ? col_offset >> img->x_chroma_shift : col_offset;
+      const int w = (plane > 0)
+                        ? ((tile_width >> img->x_chroma_shift) << shift)
+                        : (tile_width << shift);
+      const int h =
+          (plane > 0) ? (tile_height >> img->y_chroma_shift) : tile_height;
+      int y;
+
+      // col offset needs to be adjusted for HBD.
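+      // Strides are in bytes while |coffset| counts samples, so for high
+      // bit depth (2 bytes per sample) the column offset is doubled via
+      // |shift|.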
+ buf += roffset * stride + (coffset << shift); + + for (y = 0; y < h; ++y) { + fwrite(buf, 1, w, file); + buf += stride; + } + } + } +} + +int main(int argc, char **argv) { + FILE *outfile = NULL; + aom_codec_ctx_t codec; + AvxVideoReader *reader = NULL; + const AvxInterface *decoder = NULL; + const AvxVideoInfo *info = NULL; + int num_references; + int num_tile_lists; + aom_image_t reference_images[MAX_EXTERNAL_REFERENCES]; + size_t frame_size = 0; + const unsigned char *frame = NULL; + int output_format = YUV1D; + int i, j, n; + + exec_name = argv[0]; + + if (argc < 5) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + num_references = (int)strtol(argv[3], NULL, 0); + num_tile_lists = (int)strtol(argv[4], NULL, 0); + + if (argc > 5) output_format = (int)strtol(argv[5], NULL, 0); + if (output_format < YUV1D || output_format > NV12) + die("Output format out of range [0, 2]"); + + info = aom_video_reader_get_info(reader); + + decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface())); + + if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder."); + + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB, + info->is_annexb)) { + die("Failed to set annex b status"); + } + + // Decode anchor frames. + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0); + for (i = 0; i < num_references; ++i) { + aom_video_reader_read_frame(reader); + frame = aom_video_reader_get_frame(reader, &frame_size); + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame."); + + if (i == 0) { + aom_img_fmt_t ref_fmt = 0; + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt)) + die_codec(&codec, "Failed to get the image format"); + + int frame_res[2]; + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_SIZE, frame_res)) + die_codec(&codec, "Failed to get the image frame size"); + + // Allocate memory to store decoded references. Allocate memory with the + // border so that it can be used as a reference. + for (j = 0; j < num_references; j++) { + unsigned int border = AOM_DEC_BORDER_IN_PIXELS; + if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt, + frame_res[0], frame_res[1], 32, 8, + border)) { + die("Failed to allocate references."); + } + } + } + + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_COPY_NEW_FRAME_IMAGE, + &reference_images[i])) + die_codec(&codec, "Failed to copy decoded reference frame"); + + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { + char name[1024]; + snprintf(name, sizeof(name), "ref_%d.yuv", i); + printf("writing ref image to %s, %d, %d\n", name, img->d_w, img->d_h); + FILE *ref_file = fopen(name, "wb"); + aom_img_write(img, ref_file); + fclose(ref_file); + } + } + + // Decode the lightfield. + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 1); + + // Set external references. + av1_ext_ref_frame_t set_ext_ref = { &reference_images[0], num_references }; + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_EXT_REF_PTR, &set_ext_ref); + // Must decode the camera frame header first. 
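+  // The camera frame header carries the frame and tile parameters that the
+  // following tile list OBUs rely on, so it is decoded once before any tile
+  // list.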
+  aom_video_reader_read_frame(reader);
+  frame = aom_video_reader_get_frame(reader, &frame_size);
+  if (aom_codec_decode(&codec, frame, frame_size, NULL))
+    die_codec(&codec, "Failed to decode the frame.");
+  // Decode tile lists one by one.
+  for (n = 0; n < num_tile_lists; n++) {
+    aom_video_reader_read_frame(reader);
+    frame = aom_video_reader_get_frame(reader, &frame_size);
+
+    if (aom_codec_decode(&codec, frame, frame_size, NULL))
+      die_codec(&codec, "Failed to decode the tile list.");
+    aom_codec_iter_t iter = NULL;
+    aom_image_t *img = aom_codec_get_frame(&codec, &iter);
+    if (!img) die_codec(&codec, "Failed to get frame.");
+
+    if (output_format == YUV1D)
+      // write the tile to the output file in 1D format.
+      write_tile_yuv1d(&codec, img, outfile);
+    else if (output_format == YUV)
+      aom_img_write(img, outfile);
+    else
+      // NV12 output format
+      aom_img_write_nv12(img, outfile);
+  }
+
+  for (i = 0; i < num_references; i++) aom_img_free(&reference_images[i]);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+  aom_video_reader_close(reader);
+  fclose(outfile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libaom/src/examples/lossless_encoder.c b/libs/libaom/src/examples/lossless_encoder.c
new file mode 100644
index 000000000..e0253d2b3
--- /dev/null
+++ b/libs/libaom/src/examples/lossless_encoder.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr,
+          "lossless_encoder: Example demonstrating lossless "
+          "encoding feature. Supports raw input only.\n");
+  fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
+                        int frame_index, int flags, AvxVideoWriter *writer) {
+  int got_pkts = 0;
+  aom_codec_iter_t iter = NULL;
+  const aom_codec_cx_pkt_t *pkt = NULL;
+  const aom_codec_err_t res =
+      aom_codec_encode(codec, img, frame_index, 1, flags);
+  if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame");
+
+  while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) {
+    got_pkts = 1;
+
+    if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+      const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+      if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf,
+                                        pkt->data.frame.sz,
+                                        pkt->data.frame.pts)) {
+        die_codec(codec, "Failed to write compressed frame");
+      }
+      printf(keyframe ?
"K" : "."); + fflush(stdout); + } + } + + return got_pkts; +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + aom_codec_ctx_t codec; + aom_codec_enc_cfg_t cfg; + int frame_count = 0; + aom_image_t raw; + aom_codec_err_t res; + AvxVideoInfo info; + AvxVideoWriter *writer = NULL; + const AvxInterface *encoder = NULL; + const int fps = 30; + + exec_name = argv[0]; + + // Clear explicitly, as simply assigning "{ 0 }" generates + // "missing-field-initializers" warning in some compilers. + memset(&info, 0, sizeof(info)); + + if (argc < 5) die("Invalid number of arguments"); + + encoder = get_aom_encoder_by_name("av1"); + if (!encoder) die("Unsupported codec."); + + info.codec_fourcc = encoder->fourcc; + info.frame_width = (int)strtol(argv[1], NULL, 0); + info.frame_height = (int)strtol(argv[2], NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image."); + } + + printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface())); + + res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + + writer = aom_video_writer_open(argv[4], kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing.", argv[4]); + + if (!(infile = fopen(argv[3], "rb"))) + die("Failed to open %s for reading.", argv[3]); + + if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) + die_codec(&codec, "Failed to initialize encoder"); + + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1E_SET_LOSSLESS, 1)) + die_codec(&codec, "Failed to use lossless mode"); + + // Encode frames. + while (aom_img_read(&raw, infile)) { + encode_frame(&codec, &raw, frame_count++, 0, writer); + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, 0, writer)) { + } + + printf("\n"); + fclose(infile); + printf("Processed %d frames.\n", frame_count); + + aom_img_free(&raw); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + aom_video_writer_close(writer); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/noise_model.c b/libs/libaom/src/examples/noise_model.c new file mode 100644 index 000000000..d07443f9d --- /dev/null +++ b/libs/libaom/src/examples/noise_model.c @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief This is an sample binary to create noise params from input video. 
+ *
+ * To allow for external denoising applications, this sample binary illustrates
+ * how to create a film grain table (film grain params as a function of time)
+ * from an input video and its corresponding denoised source.
+ *
+ * The --output-grain-table file can be passed as input to the encoder (in
+ * aomenc this is done through the "--film-grain-table" parameter).
+ *
+ * As an example, where the input source is an 854x480 yuv420p 8-bit video
+ * named "input.854_480.yuv" you would use steps similar to the following:
+ *
+ * # Run your denoiser (e.g, using hqdn3d filter):
+ * ffmpeg -vcodec rawvideo -video_size 854x480 -i input.854_480.yuv \
+ *    -vf hqdn3d=5:5:5:5 -vcodec rawvideo -an -f rawvideo \
+ *    denoised.854_480.yuv
+ *
+ * # Model the noise between the denoised version and original source:
+ * ./examples/noise_model --fps=25/1 --width=854 --height=480 --i420 \
+ *    --input-denoised=denoised.854_480.yuv --input=original.854_480.yuv \
+ *    --output-grain-table=film_grain.tbl
+ *
+ * # Encode with your favorite settings (including the grain table):
+ * aomenc --limit=100 --cpu-used=4 --input-bit-depth=8 \
+ *    --i420 -w 854 -h 480 --end-usage=q --cq-level=25 --lag-in-frames=25 \
+ *    --auto-alt-ref=2 --bit-depth=8 --film-grain-table=film_grain.tbl \
+ *    -o denoised_with_grain_params.ivf denoised.854_480.yuv
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#if CONFIG_AV1_DECODER
+#include "aom_dsp/grain_synthesis.h"
+#endif
+
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_mem/aom_mem.h"
+#include "common/args.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr,
          "Usage: %s --input=<input> --input-denoised=<denoised> "
+          "--output-grain-table=<outfile> "
+          "See comments in noise_model.c for more information.\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static const arg_def_t help =
+    ARG_DEF(NULL, "help", 0, "Show usage options and exit");
+static const arg_def_t width_arg =
+    ARG_DEF("w", "width", 1, "Input width (if rawvideo)");
+static const arg_def_t height_arg =
+    ARG_DEF("h", "height", 1, "Input height (if rawvideo)");
+static const arg_def_t skip_frames_arg =
+    ARG_DEF("s", "skip-frames", 1, "Number of frames to skip (default = 1)");
+static const arg_def_t fps_arg = ARG_DEF(NULL, "fps", 1, "Frame rate");
+static const arg_def_t input_arg = ARG_DEF("-i", "input", 1, "Input filename");
+static const arg_def_t output_grain_table_arg =
+    ARG_DEF("n", "output-grain-table", 1, "Output noise file");
+static const arg_def_t input_denoised_arg =
+    ARG_DEF("d", "input-denoised", 1, "Input denoised filename (YUV) only");
+static const arg_def_t flat_block_finder_arg =
+    ARG_DEF("b", "flat-block-finder", 1, "Run the flat block finder");
+static const arg_def_t block_size_arg =
+    ARG_DEF("b", "block-size", 1, "Block size");
+static const arg_def_t bit_depth_arg =
+    ARG_DEF(NULL, "bit-depth", 1, "Bit depth of input");
+static const arg_def_t use_i420 =
+    ARG_DEF(NULL, "i420", 0, "Input file (and denoised) is I420 (default)");
+static const arg_def_t use_i422 =
+    ARG_DEF(NULL, "i422", 0, "Input file (and denoised) is I422");
+static const arg_def_t use_i444 =
+    ARG_DEF(NULL, "i444", 0, "Input file (and denoised) is I444");
+static const arg_def_t debug_file_arg =
+    ARG_DEF(NULL, "debug-file", 1, "File to output debug info");
+
+typedef struct {
+  int width;
+  int height;
+  struct
aom_rational fps; + const char *input; + const char *input_denoised; + const char *output_grain_table; + int img_fmt; + int block_size; + int bit_depth; + int run_flat_block_finder; + int force_flat_psd; + int skip_frames; + const char *debug_file; +} noise_model_args_t; + +static void parse_args(noise_model_args_t *noise_args, int *argc, char **argv) { + struct arg arg; + static const arg_def_t *main_args[] = { &help, + &input_arg, + &fps_arg, + &width_arg, + &height_arg, + &block_size_arg, + &output_grain_table_arg, + &input_denoised_arg, + &use_i420, + &use_i422, + &use_i444, + &debug_file_arg, + NULL }; + for (int argi = *argc + 1; *argv; argi++, argv++) { + if (arg_match(&arg, &help, argv)) { + fprintf(stdout, "\nOptions:\n"); + arg_show_usage(stdout, main_args); + exit(0); + } else if (arg_match(&arg, &width_arg, argv)) { + noise_args->width = atoi(arg.val); + } else if (arg_match(&arg, &height_arg, argv)) { + noise_args->height = atoi(arg.val); + } else if (arg_match(&arg, &input_arg, argv)) { + noise_args->input = arg.val; + } else if (arg_match(&arg, &input_denoised_arg, argv)) { + noise_args->input_denoised = arg.val; + } else if (arg_match(&arg, &output_grain_table_arg, argv)) { + noise_args->output_grain_table = arg.val; + } else if (arg_match(&arg, &block_size_arg, argv)) { + noise_args->block_size = atoi(arg.val); + } else if (arg_match(&arg, &bit_depth_arg, argv)) { + noise_args->bit_depth = atoi(arg.val); + } else if (arg_match(&arg, &flat_block_finder_arg, argv)) { + noise_args->run_flat_block_finder = atoi(arg.val); + } else if (arg_match(&arg, &fps_arg, argv)) { + noise_args->fps = arg_parse_rational(&arg); + } else if (arg_match(&arg, &use_i420, argv)) { + noise_args->img_fmt = AOM_IMG_FMT_I420; + } else if (arg_match(&arg, &use_i422, argv)) { + noise_args->img_fmt = AOM_IMG_FMT_I422; + } else if (arg_match(&arg, &use_i444, argv)) { + noise_args->img_fmt = AOM_IMG_FMT_I444; + } else if (arg_match(&arg, &skip_frames_arg, argv)) { + noise_args->skip_frames = atoi(arg.val); + } else if (arg_match(&arg, &debug_file_arg, argv)) { + noise_args->debug_file = arg.val; + } else { + fprintf(stdout, "Unknown arg: %s\n\nUsage:\n", *argv); + arg_show_usage(stdout, main_args); + exit(0); + } + } + if (noise_args->bit_depth > 8) { + noise_args->img_fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + } +} + +#if CONFIG_AV1_DECODER +static void print_variance_y(FILE *debug_file, aom_image_t *raw, + aom_image_t *denoised, const uint8_t *flat_blocks, + int block_size, aom_film_grain_t *grain) { + aom_image_t renoised; + grain->apply_grain = 1; + grain->random_seed = 7391; + grain->bit_depth = raw->bit_depth; + aom_img_alloc(&renoised, raw->fmt, raw->w, raw->h, 1); + + if (av1_add_film_grain(grain, denoised, &renoised)) { + fprintf(stderr, "Internal failure in av1_add_film_grain().\n"); + aom_img_free(&renoised); + return; + } + + const int num_blocks_w = (raw->w + block_size - 1) / block_size; + const int num_blocks_h = (raw->h + block_size - 1) / block_size; + fprintf(debug_file, "x = ["); + for (int by = 0; by < num_blocks_h; by++) { + for (int bx = 0; bx < num_blocks_w; bx++) { + double block_mean = 0; + double noise_std = 0, noise_mean = 0; + double renoise_std = 0, renoise_mean = 0; + for (int yi = 0; yi < block_size; ++yi) { + const int y = by * block_size + yi; + for (int xi = 0; xi < block_size; ++xi) { + const int x = bx * block_size + xi; + const double noise_v = (raw->planes[0][y * raw->stride[0] + x] - + denoised->planes[0][y * raw->stride[0] + x]); + noise_mean += noise_v; + noise_std += 
noise_v * noise_v; + + block_mean += raw->planes[0][y * raw->stride[0] + x]; + + const double renoise_v = + (renoised.planes[0][y * raw->stride[0] + x] - + denoised->planes[0][y * raw->stride[0] + x]); + renoise_mean += renoise_v; + renoise_std += renoise_v * renoise_v; + } + } + int n = (block_size * block_size); + block_mean /= n; + noise_mean /= n; + renoise_mean /= n; + noise_std = sqrt(noise_std / n - noise_mean * noise_mean); + renoise_std = sqrt(renoise_std / n - renoise_mean * renoise_mean); + fprintf(debug_file, "%d %3.2lf %3.2lf %3.2lf ", + flat_blocks[by * num_blocks_w + bx], block_mean, noise_std, + renoise_std); + } + fprintf(debug_file, "\n"); + } + fprintf(debug_file, "];\n"); + + if (raw->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + fprintf(stderr, + "Detailed debug info not supported for high bit" + "depth formats\n"); + } else { + fprintf(debug_file, "figure(2); clf;\n"); + fprintf(debug_file, + "scatter(x(:, 2:4:end), x(:, 3:4:end), 'r'); hold on;\n"); + fprintf(debug_file, "scatter(x(:, 2:4:end), x(:, 4:4:end), 'b');\n"); + fprintf(debug_file, + "plot(linspace(0, 255, length(noise_strength_0)), " + "noise_strength_0, 'b');\n"); + fprintf(debug_file, + "title('Scatter plot of intensity vs noise strength');\n"); + fprintf(debug_file, + "legend('Actual', 'Estimated', 'Estimated strength');\n"); + fprintf(debug_file, "figure(3); clf;\n"); + fprintf(debug_file, "scatter(x(:, 3:4:end), x(:, 4:4:end), 'k');\n"); + fprintf(debug_file, "title('Actual vs Estimated');\n"); + fprintf(debug_file, "pause(3);\n"); + } + aom_img_free(&renoised); +} +#endif + +static void print_debug_info(FILE *debug_file, aom_image_t *raw, + aom_image_t *denoised, uint8_t *flat_blocks, + int block_size, aom_noise_model_t *noise_model) { + (void)raw; + (void)denoised; + (void)flat_blocks; + (void)block_size; + fprintf(debug_file, "figure(3); clf;\n"); + fprintf(debug_file, "figure(2); clf;\n"); + fprintf(debug_file, "figure(1); clf;\n"); + for (int c = 0; c < 3; ++c) { + fprintf(debug_file, "noise_strength_%d = [\n", c); + const aom_equation_system_t *eqns = + &noise_model->combined_state[c].strength_solver.eqns; + for (int k = 0; k < eqns->n; ++k) { + fprintf(debug_file, "%lf ", eqns->x[k]); + } + fprintf(debug_file, "];\n"); + fprintf(debug_file, "plot(noise_strength_%d); hold on;\n", c); + } + fprintf(debug_file, "legend('Y', 'cb', 'cr');\n"); + fprintf(debug_file, "title('Noise strength function');\n"); + +#if CONFIG_AV1_DECODER + aom_film_grain_t grain; + aom_noise_model_get_grain_parameters(noise_model, &grain); + print_variance_y(debug_file, raw, denoised, flat_blocks, block_size, &grain); +#endif + fflush(debug_file); +} + +int main(int argc, char *argv[]) { + noise_model_args_t args = { 0, 0, { 25, 1 }, 0, 0, 0, AOM_IMG_FMT_I420, + 32, 8, 1, 0, 1, NULL }; + aom_image_t raw, denoised; + FILE *infile = NULL; + AvxVideoInfo info; + + memset(&info, 0, sizeof(info)); + + exec_name = argv[0]; + parse_args(&args, &argc, argv + 1); + + info.frame_width = args.width; + info.frame_height = args.height; + info.time_base.numerator = args.fps.den; + info.time_base.denominator = args.fps.num; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + if (!aom_img_alloc(&raw, args.img_fmt, info.frame_width, info.frame_height, + 1)) { + die("Failed to allocate image."); + } + if (!aom_img_alloc(&denoised, args.img_fmt, info.frame_width, + info.frame_height, 1)) { + die("Failed 
to allocate image."); + } + infile = fopen(args.input, "rb"); + if (!infile) { + die("Failed to open input file:", args.input); + } + fprintf(stderr, "Bit depth: %d stride:%d\n", args.bit_depth, raw.stride[0]); + + const int high_bd = args.bit_depth > 8; + const int block_size = args.block_size; + aom_flat_block_finder_t block_finder; + aom_flat_block_finder_init(&block_finder, block_size, args.bit_depth, + high_bd); + + const int num_blocks_w = (info.frame_width + block_size - 1) / block_size; + const int num_blocks_h = (info.frame_height + block_size - 1) / block_size; + uint8_t *flat_blocks = (uint8_t *)aom_malloc(num_blocks_w * num_blocks_h); + // Sets the random seed on the first entry in the output table + int16_t random_seed = 7391; + aom_noise_model_t noise_model; + aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3, args.bit_depth, + high_bd }; + aom_noise_model_init(&noise_model, params); + + FILE *denoised_file = 0; + if (args.input_denoised) { + denoised_file = fopen(args.input_denoised, "rb"); + if (!denoised_file) + die("Unable to open input_denoised: %s", args.input_denoised); + } else { + die("--input-denoised file must be specified"); + } + FILE *debug_file = 0; + if (args.debug_file) { + debug_file = fopen(args.debug_file, "w"); + } + aom_film_grain_table_t grain_table = { 0, 0 }; + + int64_t prev_timestamp = 0; + int frame_count = 0; + while (aom_img_read(&raw, infile)) { + if (args.input_denoised) { + if (!aom_img_read(&denoised, denoised_file)) { + die("Unable to read input denoised file"); + } + } + if (frame_count % args.skip_frames == 0) { + int num_flat_blocks = num_blocks_w * num_blocks_h; + memset(flat_blocks, 1, num_flat_blocks); + if (args.run_flat_block_finder) { + memset(flat_blocks, 0, num_flat_blocks); + num_flat_blocks = aom_flat_block_finder_run( + &block_finder, raw.planes[0], info.frame_width, info.frame_height, + info.frame_width, flat_blocks); + fprintf(stdout, "Num flat blocks %d\n", num_flat_blocks); + } + + const uint8_t *planes[3] = { raw.planes[0], raw.planes[1], + raw.planes[2] }; + uint8_t *denoised_planes[3] = { denoised.planes[0], denoised.planes[1], + denoised.planes[2] }; + int strides[3] = { raw.stride[0] >> high_bd, raw.stride[1] >> high_bd, + raw.stride[2] >> high_bd }; + int chroma_sub[3] = { raw.x_chroma_shift, raw.y_chroma_shift, 0 }; + + fprintf(stdout, "Updating noise model...\n"); + aom_noise_status_t status = aom_noise_model_update( + &noise_model, (const uint8_t *const *)planes, + (const uint8_t *const *)denoised_planes, info.frame_width, + info.frame_height, strides, chroma_sub, flat_blocks, block_size); + + int64_t cur_timestamp = + frame_count * 10000000ULL * args.fps.den / args.fps.num; + if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) { + fprintf(stdout, + "Noise type is different, updating parameters for time " + "[ %" PRId64 ", %" PRId64 ")\n", + prev_timestamp, cur_timestamp); + aom_film_grain_t grain; + aom_noise_model_get_grain_parameters(&noise_model, &grain); + grain.random_seed = random_seed; + random_seed = 0; + aom_film_grain_table_append(&grain_table, prev_timestamp, cur_timestamp, + &grain); + aom_noise_model_save_latest(&noise_model); + prev_timestamp = cur_timestamp; + } + if (debug_file) { + print_debug_info(debug_file, &raw, &denoised, flat_blocks, block_size, + &noise_model); + } + fprintf(stdout, "Done noise model update, status = %d\n", status); + } + frame_count++; + } + + aom_film_grain_t grain; + aom_noise_model_get_grain_parameters(&noise_model, &grain); + grain.random_seed = 
      random_seed;
+  aom_film_grain_table_append(&grain_table, prev_timestamp, INT64_MAX, &grain);
+  if (args.output_grain_table) {
+    struct aom_internal_error_info error_info;
+    if (AOM_CODEC_OK != aom_film_grain_table_write(&grain_table,
+                                                   args.output_grain_table,
+                                                   &error_info)) {
+      die("Unable to write output film grain table");
+    }
+  }
+  aom_film_grain_table_free(&grain_table);
+
+  if (infile) fclose(infile);
+  if (denoised_file) fclose(denoised_file);
+  if (debug_file) fclose(debug_file);
+  aom_img_free(&raw);
+  aom_img_free(&denoised);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libaom/src/examples/resize_util.c b/libs/libaom/src/examples/resize_util.c
new file mode 100644
index 000000000..5692c2062
--- /dev/null
+++ b/libs/libaom/src/examples/resize_util.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "av1/common/resize.h"
+#include "common/tools_common.h"
+
+static const char *exec_name = NULL;
+
+static void usage() {
+  printf("Usage:\n");
+  printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ",
+         exec_name);
+  printf("<output_yuv> [<frames>]\n");
+}
+
+void usage_exit(void) {
+  usage();
+  exit(EXIT_FAILURE);
+}
+
+static int parse_dim(char *v, int *width, int *height) {
+  char *x = strchr(v, 'x');
+  if (x == NULL) x = strchr(v, 'X');
+  if (x == NULL) return 0;
+  *width = atoi(v);
+  *height = atoi(&x[1]);
+  if (*width <= 0 || *height <= 0)
+    return 0;
+  else
+    return 1;
+}
+
+int main(int argc, char *argv[]) {
+  char *fin, *fout;
+  FILE *fpin, *fpout;
+  uint8_t *inbuf, *outbuf;
+  uint8_t *inbuf_u, *outbuf_u;
+  uint8_t *inbuf_v, *outbuf_v;
+  int f, frames;
+  int width, height, target_width, target_height;
+
+  exec_name = argv[0];
+
+  if (argc < 5) {
+    printf("Incorrect parameters:\n");
+    usage();
+    return 1;
+  }
+
+  fin = argv[1];
+  fout = argv[4];
+  if (!parse_dim(argv[2], &width, &height)) {
+    printf("Incorrect parameters: %s\n", argv[2]);
+    usage();
+    return 1;
+  }
+  if (!parse_dim(argv[3], &target_width, &target_height)) {
+    printf("Incorrect parameters: %s\n", argv[3]);
+    usage();
+    return 1;
+  }
+
+  fpin = fopen(fin, "rb");
+  if (fpin == NULL) {
+    printf("Can't open file %s to read\n", fin);
+    usage();
+    return 1;
+  }
+  fpout = fopen(fout, "wb");
+  if (fpout == NULL) {
+    fclose(fpin);
+    printf("Can't open file %s to write\n", fout);
+    usage();
+    return 1;
+  }
+  if (argc >= 6)
+    frames = atoi(argv[5]);
+  else
+    frames = INT_MAX;
+
+  printf("Input size: %dx%d\n", width, height);
+  printf("Target size: %dx%d, Frames: ", target_width, target_height);
+  if (frames == INT_MAX)
+    printf("All\n");
+  else
+    printf("%d\n", frames);
+
+  inbuf = (uint8_t *)malloc(width * height * 3 / 2);
+  outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2);
+  inbuf_u = inbuf + width * height;
+  inbuf_v = inbuf_u + width * height / 4;
+  outbuf_u = outbuf + target_width * target_height;
+  outbuf_v = outbuf_u + target_width * target_height / 4;
+  f = 0;
+  while (f < frames) {
+    if (fread(inbuf, width * height * 3 / 2, 1, fpin) != 1) break;
+    av1_resize_frame420(inbuf, width,
inbuf_u, inbuf_v, width / 2, height, + width, outbuf, target_width, outbuf_u, outbuf_v, + target_width / 2, target_height, target_width); + fwrite(outbuf, target_width * target_height * 3 / 2, 1, fpout); + f++; + } + printf("%d frames processed\n", f); + fclose(fpin); + fclose(fpout); + + free(inbuf); + free(outbuf); + return 0; +} diff --git a/libs/libaom/src/examples/scalable_decoder.c b/libs/libaom/src/examples/scalable_decoder.c new file mode 100644 index 000000000..c22924223 --- /dev/null +++ b/libs/libaom/src/examples/scalable_decoder.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Scalable Decoder +// ============== +// +// This is an example of a scalable decoder loop. It takes a 2-spatial-layer +// input file +// containing the compressed data (in OBU format), passes it through the +// decoder, and writes the decompressed frames to disk. The base layer and +// enhancement layers are stored as separate files, out_lyr0.yuv and +// out_lyr1.yuv, respectively. +// +// Standard Includes +// ----------------- +// For decoders, you only have to include `aom_decoder.h` and then any +// header files for the specific codecs you use. In this case, we're using +// av1. +// +// Initializing The Codec +// ---------------------- +// The libaom decoder is initialized by the call to aom_codec_dec_init(). +// Determining the codec interface to use is handled by AvxVideoReader and the +// functions prefixed with aom_video_reader_. Discussion of those functions is +// beyond the scope of this example, but the main gist is to open the input file +// and parse just enough of it to determine if it's a AVx file and which AVx +// codec is contained within the file. +// Note the NULL pointer passed to aom_codec_dec_init(). We do that in this +// example because we want the algorithm to determine the stream configuration +// (width/height) and allocate memory automatically. +// +// Decoding A Frame +// ---------------- +// Once the frame has been read into memory, it is decoded using the +// `aom_codec_decode` function. The call takes a pointer to the data +// (`frame`) and the length of the data (`frame_size`). No application data +// is associated with the frame in this example, so the `user_priv` +// parameter is NULL. The `deadline` parameter is left at zero for this +// example. This parameter is generally only used when doing adaptive post +// processing. +// +// Codecs may produce a variable number of output frames for every call to +// `aom_codec_decode`. These frames are retrieved by the +// `aom_codec_get_frame` iterator function. The iterator variable `iter` is +// initialized to NULL each time `aom_codec_decode` is called. +// `aom_codec_get_frame` is called in a loop, returning a pointer to a +// decoded image or NULL to indicate the end of list. +// +// Processing The Decoded Data +// --------------------------- +// In this example, we simply write the encoded data to disk. It is +// important to honor the image's `stride` values. 
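+// Each row of a plane is `stride` bytes apart in memory, which may be wider
+// than the visible image, so a plane is written row by row, e.g.:
+//   for (y = 0; y < h; ++y)
+//     fwrite(buf + y * img->stride[plane], 1, w, file);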
+// +// Cleanup +// ------- +// The `aom_codec_destroy` call frees any memory allocated by the codec. +// +// Error Handling +// -------------- +// This example does not special case any error return codes. If there was +// an error, a descriptive message is printed and the program exits. With +// few exceptions, aom_codec functions return an enumerated error status, +// with the value `0` indicating success. + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "common/obudec.h" +#include "common/tools_common.h" +#include "common/video_reader.h" + +static const char *exec_name; + +#define MAX_LAYERS 5 + +void usage_exit(void) { + fprintf(stderr, "Usage: %s <infile>\n", exec_name); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + int frame_cnt = 0; + FILE *outfile[MAX_LAYERS]; + char filename[80]; + aom_codec_ctx_t codec; + const AvxInterface *decoder = NULL; + FILE *inputfile = NULL; + uint8_t *buf = NULL; + size_t bytes_in_buffer = 0; + size_t buffer_size = 0; + struct AvxInputContext aom_input_ctx; + struct ObuDecInputContext obu_ctx = { &aom_input_ctx, NULL, 0, 0, 0 }; + aom_codec_stream_info_t si; + uint8_t tmpbuf[32]; + unsigned int i; + + exec_name = argv[0]; + + if (argc != 2) die("Invalid number of arguments."); + + if (!(inputfile = fopen(argv[1], "rb"))) + die("Failed to open %s for read.", argv[1]); + obu_ctx.avx_ctx->file = inputfile; + obu_ctx.avx_ctx->filename = argv[1]; + + decoder = get_aom_decoder_by_index(0); + printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface())); + + if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder."); + + if (aom_codec_control(&codec, AV1D_SET_OUTPUT_ALL_LAYERS, 1)) { + die_codec(&codec, "Failed to set output_all_layers control."); + } + + // peek at the sequence header OBU to get the number of spatial layers + const size_t ret = fread(tmpbuf, 1, 32, inputfile); + if (ret != 32) die_codec(&codec, "Input is not a valid obu file"); + si.is_annexb = 0; + if (aom_codec_peek_stream_info(decoder->codec_interface(), tmpbuf, 32, &si)) { + die_codec(&codec, "Input is not a valid obu file"); + } + fseek(inputfile, -32, SEEK_CUR); + + if (!file_is_obu(&obu_ctx)) + die_codec(&codec, "Input is not a valid obu file"); + + // open base layer output yuv file + snprintf(filename, sizeof(filename), "out_lyr%d.yuv", 0); + if (!(outfile[0] = fopen(filename, "wb"))) + die("Failed to open output for writing."); + + // open any enhancement layer output yuv files + for (i = 1; i < si.number_spatial_layers; i++) { + snprintf(filename, sizeof(filename), "out_lyr%d.yuv", i); + if (!(outfile[i] = fopen(filename, "wb"))) + die("Failed to open output for writing."); + } + + while (!obudec_read_temporal_unit(&obu_ctx, &buf, &bytes_in_buffer, + &buffer_size)) { + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + if (aom_codec_decode(&codec, buf, bytes_in_buffer, NULL)) + die_codec(&codec, "Failed to decode frame."); + + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { + aom_image_t *img_shifted = + aom_img_alloc(NULL, AOM_IMG_FMT_I420, img->d_w, img->d_h, 16); + img_shifted->bit_depth = 8; + aom_img_downshift(img_shifted, img, + img->bit_depth - img_shifted->bit_depth); + if (img->spatial_id == 0) { + printf("Writing base layer 0 %d\n", frame_cnt); + aom_img_write(img_shifted, outfile[0]); + } else if (img->spatial_id <= (int)(si.number_spatial_layers - 1)) { + printf("Writing enhancement layer %d %d\n", img->spatial_id, frame_cnt); +
aom_img_write(img_shifted, outfile[img->spatial_id]); + } else { + die_codec(&codec, "Invalid bitstream. Layer id exceeds layer count"); + } + if (img->spatial_id == (int)(si.number_spatial_layers - 1)) ++frame_cnt; + aom_img_free(img_shifted); // free the per-frame downshift copy + } + } + + printf("Processed %d frames.\n", frame_cnt); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + + for (i = 0; i < si.number_spatial_layers; i++) fclose(outfile[i]); + + fclose(inputfile); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/scalable_encoder.c b/libs/libaom/src/examples/scalable_encoder.c new file mode 100644 index 000000000..7af03e29f --- /dev/null +++ b/libs/libaom/src/examples/scalable_encoder.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Scalable Encoder +// ============== +// +// This is an example of a scalable encoder loop. It takes two input files in +// YV12 format, passes them through the encoder, and writes the compressed +// frames to disk in OBU format. +// +// Getting The Default Configuration +// --------------------------------- +// Encoders have the notion of "usage profiles." For example, an encoder +// may want to publish default configurations for both a video +// conferencing application and a best quality offline encoder. These +// obviously have very different default settings. Consult the +// documentation for your codec to see if it provides any default +// configurations. All codecs provide a default configuration, number 0, +// which is valid for material in the vicinity of QCIF/QVGA. +// +// Updating The Configuration +// --------------------------------- +// Almost all applications will want to update the default configuration +// with settings specific to their usage. Here we set the width and height +// of the video file to that specified on the command line. We also scale +// the default bitrate based on the ratio between the default resolution +// and the resolution specified on the command line. +// +// Encoding A Frame +// ---------------- +// The frame is read as a continuous block (size = width * height * 3 / 2) +// from the input file. If a frame was read (the input file has not hit +// EOF) then the frame is passed to the encoder. Otherwise, a NULL +// is passed, indicating the End-Of-Stream condition to the encoder. The +// `frame_cnt` is reused as the presentation time stamp (PTS) and each +// frame is shown for one frame-time in duration. The flags parameter is +// unused in this example. + +// Forced Keyframes +// ---------------- +// Keyframes can be forced by setting the AOM_EFLAG_FORCE_KF bit of the +// flags passed to `aom_codec_encode()`. In this example, we force a +// keyframe every <keyframe-interval> frames. Note, the output stream can +// contain additional keyframes beyond those that have been forced using the +// AOM_EFLAG_FORCE_KF flag because of automatic keyframe placement by the +// encoder. +// +// Processing The Encoded Data +// --------------------------- +// Each packet of type `AOM_CODEC_CX_FRAME_PKT` contains the encoded data +// for this frame.
We simply write the raw compressed data to disk (this example emits OBUs rather than an IVF container). +// +// Cleanup +// ------- +// The `aom_codec_destroy` call frees any memory allocated by the codec. +// +// Error Handling +// -------------- +// This example does not special case any error return codes. If there was +// an error, a descriptive message is printed and the program exits. With +// few exceptions, aom_codec functions return an enumerated error status, +// with the value `0` indicating success. + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "aom/aom_encoder.h" +#include "aom/aomcx.h" +#include "av1/common/enums.h" +#include "common/tools_common.h" +#include "common/video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "Usage: %s <codec> <width> <height> <infile0> <infile1> " + "<outfile> <frames to encode>\n" + "See comments in scalable_encoder.c for more information.\n", + exec_name); + exit(EXIT_FAILURE); +} + +static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img, + int frame_index, int flags, FILE *outfile) { + int got_pkts = 0; + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt = NULL; + const aom_codec_err_t res = + aom_codec_encode(codec, img, frame_index, 1, flags); + if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame"); + + while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; + if (fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile) != + pkt->data.frame.sz) { + die_codec(codec, "Failed to write compressed frame"); + } + printf(keyframe ? "K" : "."); + printf(" %6d\n", (int)pkt->data.frame.sz); + fflush(stdout); + } + } + + return got_pkts; +} + +int main(int argc, char **argv) { + FILE *infile0 = NULL; + FILE *infile1 = NULL; + aom_codec_ctx_t codec; + aom_codec_enc_cfg_t cfg; + int frame_count = 0; + aom_image_t raw0, raw1; + aom_codec_err_t res; + AvxVideoInfo info; + const AvxInterface *encoder = NULL; + const int fps = 30; + const int bitrate = 200; + int keyframe_interval = 0; + int max_frames = 0; + int frames_encoded = 0; + const char *codec_arg = NULL; + const char *width_arg = NULL; + const char *height_arg = NULL; + const char *infile0_arg = NULL; + const char *infile1_arg = NULL; + const char *outfile_arg = NULL; + // const char *keyframe_interval_arg = NULL; + FILE *outfile = NULL; + + exec_name = argv[0]; + + // Clear explicitly, as simply assigning "{ 0 }" generates + // "missing-field-initializers" warning in some compilers.
+ memset(&info, 0, sizeof(info)); + + if (argc != 8) die("Invalid number of arguments"); + + codec_arg = argv[1]; + width_arg = argv[2]; + height_arg = argv[3]; + infile0_arg = argv[4]; + infile1_arg = argv[5]; + outfile_arg = argv[6]; + max_frames = (int)strtol(argv[7], NULL, 0); + + encoder = get_aom_encoder_by_name(codec_arg); + if (!encoder) die("Unsupported codec."); + + info.codec_fourcc = encoder->fourcc; + info.frame_width = (int)strtol(width_arg, NULL, 0); + info.frame_height = (int)strtol(height_arg, NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!aom_img_alloc(&raw0, AOM_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image for layer 0."); + } + if (!aom_img_alloc(&raw1, AOM_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image for layer 1."); + } + + // keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0); + keyframe_interval = 100; + if (keyframe_interval < 0) die("Invalid keyframe interval value."); + + printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface())); + + res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + cfg.rc_target_bitrate = bitrate; + cfg.g_error_resilient = 0; + cfg.g_lag_in_frames = 0; + cfg.rc_end_usage = AOM_Q; + cfg.save_as_annexb = 0; + + outfile = fopen(outfile_arg, "wb"); + if (!outfile) die("Failed to open %s for writing.", outfile_arg); + + if (!(infile0 = fopen(infile0_arg, "rb"))) + die("Failed to open %s for reading.", infile0_arg); + if (!(infile1 = fopen(infile1_arg, "rb"))) + die("Failed to open %s for reading.", infile1_arg); + + if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) + die_codec(&codec, "Failed to initialize encoder"); + if (aom_codec_control(&codec, AOME_SET_CPUUSED, 8)) + die_codec(&codec, "Failed to set cpu to 8"); + + if (aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 2)) + die_codec(&codec, "Failed to set tile columns to 2"); + if (aom_codec_control(&codec, AV1E_SET_NUM_TG, 3)) + die_codec(&codec, "Failed to set num of tile groups to 3"); + + if (aom_codec_control(&codec, AOME_SET_NUMBER_SPATIAL_LAYERS, 2)) + die_codec(&codec, "Failed to set number of spatial layers to 2"); + + // Encode frames.
+ while (aom_img_read(&raw0, infile0)) { + int flags = 0; + + // configure and encode base layer + + if (keyframe_interval > 0 && frames_encoded % keyframe_interval == 0) + flags |= AOM_EFLAG_FORCE_KF; + else + // use previous base layer (LAST) as sole reference + // save this frame as LAST to be used as reference by enhancement layer + // and next base layer + flags |= AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF | + AOM_EFLAG_NO_UPD_ENTROPY; + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + if (aom_codec_enc_config_set(&codec, &cfg)) + die_codec(&codec, "Failed to set enc cfg for layer 0"); + if (aom_codec_control(&codec, AOME_SET_SPATIAL_LAYER_ID, 0)) + die_codec(&codec, "Failed to set layer id to 0"); + if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 62)) + die_codec(&codec, "Failed to set cq level"); + encode_frame(&codec, &raw0, frame_count++, flags, outfile); + + // configure and encode enhancement layer + + // use LAST (base layer) as sole reference + flags = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | + AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_LAST | + AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF | + AOM_EFLAG_NO_UPD_ENTROPY; + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + aom_img_read(&raw1, infile1); + if (aom_codec_enc_config_set(&codec, &cfg)) + die_codec(&codec, "Failed to set enc cfg for layer 1"); + if (aom_codec_control(&codec, AOME_SET_SPATIAL_LAYER_ID, 1)) + die_codec(&codec, "Failed to set layer id to 1"); + if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 10)) + die_codec(&codec, "Failed to set cq level"); + encode_frame(&codec, &raw1, frame_count++, flags, outfile); + + frames_encoded++; + + if (max_frames > 0 && frames_encoded >= max_frames) break; + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, 0, outfile)) continue; + + printf("\n"); + fclose(infile0); + fclose(infile1); + printf("Processed %d frames.\n", frame_count / 2); + + aom_img_free(&raw0); + aom_img_free(&raw1); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + fclose(outfile); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/set_maps.c b/libs/libaom/src/examples/set_maps.c new file mode 100644 index 000000000..9aeb96e43 --- /dev/null +++ b/libs/libaom/src/examples/set_maps.c @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// AOM Set Active and ROI Maps +// =========================== +// +// This is an example demonstrating how to control the AOM encoder's +// ROI and Active maps. +// +// ROI (Region of Interest) maps are a way for the application to assign +// each macroblock in the image to a region, and then set quantizer and +// filtering parameters for that region.
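// Both kinds of map are indexed in raster order at macroblock (16x16)
// granularity, so a WxH frame needs ((W + 15) / 16) * ((H + 15) / 16)
// entries. A minimal sketch of that geometry, using the active-map control
// this example exercises below and a hypothetical helper that keeps only a
// centered rectangle active:
static void set_centered_active_rect(const aom_codec_enc_cfg_t *cfg,
                                     aom_codec_ctx_t *codec) {
  aom_active_map_t map = { 0, 0, 0 };
  map.rows = (cfg->g_h + 15) / 16;  // map height in macroblocks
  map.cols = (cfg->g_w + 15) / 16;  // map width in macroblocks
  map.active_map = (uint8_t *)calloc(map.rows * map.cols, 1);
  if (!map.active_map) die("Failed to allocate active map");
  // Mark the middle half of the frame active; everything else is skipped.
  for (unsigned int r = map.rows / 4; r < 3 * map.rows / 4; ++r)
    for (unsigned int c = map.cols / 4; c < 3 * map.cols / 4; ++c)
      map.active_map[r * map.cols + c] = 1;
  if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map))
    die_codec(codec, "Failed to set active map");
  free(map.active_map);
}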
+// +// Active maps are a way for the application to specify on a +// macroblock-by-macroblock basis whether there is any activity in that +// macroblock. +// +// +// Configuration +// ------------- +// This example only exercises the active map (no ROI map is actually set). +// An active map marking every other macroblock active is set on frame 5. If +// the width of the image in macroblocks is even, then the output will appear +// to have distinct columns, where one column will have motion and the next +// will not. +// +// The active map is cleared on frame 11. +// +// Observing The Effects +// --------------------- +// Use the `simple_decoder` example to decode this sample, and observe +// the change in the image at frames 5 and 11. + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "aom/aom_encoder.h" +#include "aom/aomcx.h" +#include "common/tools_common.h" +#include "common/video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n", + exec_name); + exit(EXIT_FAILURE); +} + +static void set_active_map(const aom_codec_enc_cfg_t *cfg, + aom_codec_ctx_t *codec) { + unsigned int i; + aom_active_map_t map = { 0, 0, 0 }; + + map.rows = (cfg->g_h + 15) / 16; + map.cols = (cfg->g_w + 15) / 16; + + map.active_map = (uint8_t *)malloc(map.rows * map.cols); + for (i = 0; i < map.rows * map.cols; ++i) map.active_map[i] = i % 2; + + if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map)) + die_codec(codec, "Failed to set active map"); + + free(map.active_map); +} + +static void unset_active_map(const aom_codec_enc_cfg_t *cfg, + aom_codec_ctx_t *codec) { + aom_active_map_t map = { 0, 0, 0 }; + + map.rows = (cfg->g_h + 15) / 16; + map.cols = (cfg->g_w + 15) / 16; + map.active_map = NULL; + + if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map)) + die_codec(codec, "Failed to set active map"); +} + +static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img, + int frame_index, AvxVideoWriter *writer) { + int got_pkts = 0; + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt = NULL; + const aom_codec_err_t res = aom_codec_encode(codec, img, frame_index, 1, 0); + if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame"); + + while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; + if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) { + die_codec(codec, "Failed to write compressed frame"); + } + + printf(keyframe ?
"K" : "."); + fflush(stdout); + } + } + + return got_pkts; +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + aom_codec_ctx_t codec; + aom_codec_enc_cfg_t cfg; + int frame_count = 0; + const int limit = 15; + aom_image_t raw; + aom_codec_err_t res; + AvxVideoInfo info; + AvxVideoWriter *writer = NULL; + const AvxInterface *encoder = NULL; + const int fps = 2; // TODO(dkovalev) add command line argument + const double bits_per_pixel_per_frame = 0.067; + + exec_name = argv[0]; + if (argc != 6) die("Invalid number of arguments"); + + memset(&info, 0, sizeof(info)); + + encoder = get_aom_encoder_by_name(argv[1]); + if (encoder == NULL) { + die("Unsupported codec."); + } + assert(encoder != NULL); + info.codec_fourcc = encoder->fourcc; + info.frame_width = (int)strtol(argv[2], NULL, 0); + info.frame_height = (int)strtol(argv[3], NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image."); + } + + printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface())); + + res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + cfg.rc_target_bitrate = + (unsigned int)(bits_per_pixel_per_frame * cfg.g_w * cfg.g_h * fps / 1000); + cfg.g_lag_in_frames = 0; + + writer = aom_video_writer_open(argv[5], kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing.", argv[5]); + + if (!(infile = fopen(argv[4], "rb"))) + die("Failed to open %s for reading.", argv[4]); + + if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) + die_codec(&codec, "Failed to initialize encoder"); + + // Encode frames. + while (aom_img_read(&raw, infile) && frame_count < limit) { + ++frame_count; + + if (frame_count == 5) { + set_active_map(&cfg, &codec); + } else if (frame_count == 11) { + unset_active_map(&cfg, &codec); + } + + encode_frame(&codec, &raw, frame_count, writer); + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, writer)) { + } + + printf("\n"); + fclose(infile); + printf("Processed %d frames.\n", frame_count); + + aom_img_free(&raw); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + aom_video_writer_close(writer); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/simple_decoder.c b/libs/libaom/src/examples/simple_decoder.c new file mode 100644 index 000000000..d098d1e0b --- /dev/null +++ b/libs/libaom/src/examples/simple_decoder.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +// Simple Decoder +// ============== +// +// This is an example of a simple decoder loop. It takes an input file +// containing the compressed data (in IVF format), passes it through the +// decoder, and writes the decompressed frames to disk. Other decoder +// examples build upon this one. +// +// The details of the IVF format have been elided from this example for +// simplicity of presentation, as IVF files will not generally be used by +// your application. In general, an IVF file consists of a file header, +// followed by a variable number of frames. Each frame consists of a frame +// header followed by a variable length payload. The length of the payload +// is specified in the first four bytes of the frame header. The payload is +// the raw compressed data. +// +// Standard Includes +// ----------------- +// For decoders, you only have to include `aom_decoder.h` and then any +// header files for the specific codecs you use. In this case, we're using +// aom. +// +// Initializing The Codec +// ---------------------- +// The libaom decoder is initialized by the call to aom_codec_dec_init(). +// Determining the codec interface to use is handled by AvxVideoReader and the +// functions prefixed with aom_video_reader_. Discussion of those functions is +// beyond the scope of this example, but the main gist is to open the input file +// and parse just enough of it to determine if it's an AVx file and which AVx +// codec is contained within the file. +// Note the NULL pointer passed to aom_codec_dec_init(). We do that in this +// example because we want the algorithm to determine the stream configuration +// (width/height) and allocate memory automatically. +// +// Decoding A Frame +// ---------------- +// Once the frame has been read into memory, it is decoded using the +// `aom_codec_decode` function. The call takes a pointer to the data +// (`frame`) and the length of the data (`frame_size`). No application data +// is associated with the frame in this example, so the `user_priv` +// parameter is NULL. +// +// Codecs may produce a variable number of output frames for every call to +// `aom_codec_decode`. These frames are retrieved by the +// `aom_codec_get_frame` iterator function. The iterator variable `iter` is +// initialized to NULL each time `aom_codec_decode` is called. +// `aom_codec_get_frame` is called in a loop, returning a pointer to a +// decoded image or NULL to indicate the end of the list. +// +// Processing The Decoded Data +// --------------------------- +// In this example, we simply write the decoded data to disk. It is +// important to honor the image's `stride` values. +// +// Cleanup +// ------- +// The `aom_codec_destroy` call frees any memory allocated by the codec. +// +// Error Handling +// -------------- +// This example does not special case any error return codes. If there was +// an error, a descriptive message is printed and the program exits. With +// few exceptions, aom_codec functions return an enumerated error status, +// with the value `0` indicating success.
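// The "descriptive message" mentioned above comes from the codec context
// itself. A minimal sketch of what a die_codec()-style reporter can look
// like, using only public aom/aom_codec.h calls (common/tools_common.c
// implements the real helper):
static void report_codec_error(aom_codec_ctx_t *ctx, const char *msg) {
  // aom_codec_error() returns a short string for the last failure;
  // aom_codec_error_detail() adds more context when the codec provides it.
  const char *detail = aom_codec_error_detail(ctx);
  fprintf(stderr, "%s: %s\n", msg, aom_codec_error(ctx));
  if (detail) fprintf(stderr, "    %s\n", detail);
}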
+ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "aom/aom_decoder.h" +#include "common/tools_common.h" +#include "common/video_reader.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + int frame_cnt = 0; + FILE *outfile = NULL; + aom_codec_ctx_t codec; + AvxVideoReader *reader = NULL; + const AvxInterface *decoder = NULL; + const AvxVideoInfo *info = NULL; + + exec_name = argv[0]; + + if (argc != 3) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + info = aom_video_reader_get_info(reader); + + decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + + printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface())); + + if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder."); + + while (aom_video_reader_read_frame(reader)) { + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + size_t frame_size = 0; + const unsigned char *frame = + aom_video_reader_get_frame(reader, &frame_size); + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame."); + + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { + aom_img_write(img, outfile); + ++frame_cnt; + } + } + + printf("Processed %d frames.\n", frame_cnt); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + + printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n", + info->frame_width, info->frame_height, argv[2]); + + aom_video_reader_close(reader); + + fclose(outfile); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/simple_encoder.c b/libs/libaom/src/examples/simple_encoder.c new file mode 100644 index 000000000..01a37cf0c --- /dev/null +++ b/libs/libaom/src/examples/simple_encoder.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Simple Encoder +// ============== +// +// This is an example of a simple encoder loop. It takes an input file in +// YV12 format, passes it through the encoder, and writes the compressed +// frames to disk in IVF format. Other encoder examples build upon this +// one. +// +// The details of the IVF format have been elided from this example for +// simplicity of presentation, as IVF files will not generally be used by +// your application. In general, an IVF file consists of a file header, +// followed by a variable number of frames. Each frame consists of a frame +// header followed by a variable length payload. The length of the payload +// is specified in the first four bytes of the frame header. The payload is +// the raw compressed data.
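// Concretely, the per-frame IVF header written by the AvxVideoWriter used
// below is 12 bytes: a 32-bit little-endian payload size followed by a
// 64-bit little-endian presentation timestamp. A minimal sketch of emitting
// one by hand (a hypothetical helper, not used by this example):
static void write_ivf_frame_header(FILE *file, size_t frame_size,
                                   int64_t pts) {
  unsigned char header[12];
  for (int i = 0; i < 4; ++i)  // 4-byte little-endian payload length
    header[i] = (unsigned char)(frame_size >> (8 * i));
  for (int i = 0; i < 8; ++i)  // 8-byte little-endian PTS
    header[4 + i] = (unsigned char)((uint64_t)pts >> (8 * i));
  fwrite(header, 1, 12, file);
}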
+// +// Standard Includes +// ----------------- +// For encoders, you only have to include `aom_encoder.h` and then any +// header files for the specific codecs you use. In this case, we're using +// aom. +// +// Getting The Default Configuration +// --------------------------------- +// Encoders have the notion of "usage profiles." For example, an encoder +// may want to publish default configurations for both a video +// conferencing application and a best quality offline encoder. These +// obviously have very different default settings. Consult the +// documentation for your codec to see if it provides any default +// configurations. All codecs provide a default configuration, number 0, +// which is valid for material in the vicinity of QCIF/QVGA. +// +// Updating The Configuration +// --------------------------------- +// Almost all applications will want to update the default configuration +// with settings specific to their usage. Here we set the width and height +// of the video file to that specified on the command line. We also scale +// the default bitrate based on the ratio between the default resolution +// and the resolution specified on the command line. +// +// Initializing The Codec +// ---------------------- +// The encoder is initialized by the following code. +// +// Encoding A Frame +// ---------------- +// The frame is read as a continuous block (size width * height * 3 / 2) +// from the input file. If a frame was read (the input file has not hit +// EOF) then the frame is passed to the encoder. Otherwise, a NULL +// is passed, indicating the End-Of-Stream condition to the encoder. The +// `frame_cnt` is reused as the presentation time stamp (PTS) and each +// frame is shown for one frame-time in duration. The flags parameter is +// unused in this example. + +// Forced Keyframes +// ---------------- +// Keyframes can be forced by setting the AOM_EFLAG_FORCE_KF bit of the +// flags passed to `aom_codec_encode()`. In this example, we force a +// keyframe every <keyframe-interval> frames. Note, the output stream can +// contain additional keyframes beyond those that have been forced using the +// AOM_EFLAG_FORCE_KF flag because of automatic keyframe placement by the +// encoder. +// +// Processing The Encoded Data +// --------------------------- +// Each packet of type `AOM_CODEC_CX_FRAME_PKT` contains the encoded data +// for this frame. We write an IVF frame header, followed by the raw data. +// +// Cleanup +// ------- +// The `aom_codec_destroy` call frees any memory allocated by the codec. +// +// Error Handling +// -------------- +// This example does not special case any error return codes. If there was +// an error, a descriptive message is printed and the program exits. With +// few exceptions, aom_codec functions return an enumerated error status, +// with the value `0` indicating success. +// +// Error Resiliency Features +// ------------------------- +// Error resiliency is controlled by the g_error_resilient member of the +// configuration structure. Use the `decode_with_drops` example to decode with +// frames 5-10 dropped. Compare the output for a file encoded with this example +// versus one encoded with the `simple_encoder` example.
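// The bitrate scaling described under "Updating The Configuration" is a
// small calculation once the defaults are in hand. A minimal sketch,
// assuming w and h hold the command-line dimensions (the code below uses a
// fixed 200 kbps target instead):
static void scale_default_bitrate(aom_codec_enc_cfg_t *cfg, unsigned int w,
                                  unsigned int h) {
  // Scale rc_target_bitrate by the pixel-count ratio before overwriting the
  // default g_w/g_h that the ratio is computed against.
  cfg->rc_target_bitrate = (unsigned int)(
      cfg->rc_target_bitrate * ((double)w * h) / ((double)cfg->g_w * cfg->g_h));
  cfg->g_w = w;
  cfg->g_h = h;
}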
+ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "aom/aom_encoder.h" +#include "common/tools_common.h" +#include "common/video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "Usage: %s <codec> <width> <height> <infile> <outfile> " + "<keyframe-interval> <error-resilient> <frames to encode>\n" + "See comments in simple_encoder.c for more information.\n", + exec_name); + exit(EXIT_FAILURE); +} + +static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img, + int frame_index, int flags, AvxVideoWriter *writer) { + int got_pkts = 0; + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt = NULL; + const aom_codec_err_t res = + aom_codec_encode(codec, img, frame_index, 1, flags); + if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame"); + + while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; + if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) { + die_codec(codec, "Failed to write compressed frame"); + } + printf(keyframe ? "K" : "."); + fflush(stdout); + } + } + + return got_pkts; +} + +// TODO(tomfinegan): Improve command line parsing and add args for bitrate/fps. +int main(int argc, char **argv) { + FILE *infile = NULL; + aom_codec_ctx_t codec; + aom_codec_enc_cfg_t cfg; + int frame_count = 0; + aom_image_t raw; + aom_codec_err_t res; + AvxVideoInfo info; + AvxVideoWriter *writer = NULL; + const AvxInterface *encoder = NULL; + const int fps = 30; + const int bitrate = 200; + int keyframe_interval = 0; + int max_frames = 0; + int frames_encoded = 0; + const char *codec_arg = NULL; + const char *width_arg = NULL; + const char *height_arg = NULL; + const char *infile_arg = NULL; + const char *outfile_arg = NULL; + const char *keyframe_interval_arg = NULL; + + exec_name = argv[0]; + + // Clear explicitly, as simply assigning "{ 0 }" generates + // "missing-field-initializers" warning in some compilers.
+ memset(&info, 0, sizeof(info)); + + if (argc != 9) die("Invalid number of arguments"); + + codec_arg = argv[1]; + width_arg = argv[2]; + height_arg = argv[3]; + infile_arg = argv[4]; + outfile_arg = argv[5]; + keyframe_interval_arg = argv[6]; + max_frames = (int)strtol(argv[8], NULL, 0); + + encoder = get_aom_encoder_by_name(codec_arg); + if (!encoder) die("Unsupported codec."); + + info.codec_fourcc = encoder->fourcc; + info.frame_width = (int)strtol(width_arg, NULL, 0); + info.frame_height = (int)strtol(height_arg, NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image."); + } + + keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0); + if (keyframe_interval < 0) die("Invalid keyframe interval value."); + + printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface())); + + res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + cfg.rc_target_bitrate = bitrate; + cfg.g_error_resilient = (aom_codec_er_flags_t)strtoul(argv[7], NULL, 0); + + writer = aom_video_writer_open(outfile_arg, kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing.", outfile_arg); + + if (!(infile = fopen(infile_arg, "rb"))) + die("Failed to open %s for reading.", infile_arg); + + if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) + die_codec(&codec, "Failed to initialize encoder"); + + // Encode frames. + while (aom_img_read(&raw, infile)) { + int flags = 0; + if (keyframe_interval > 0 && frame_count % keyframe_interval == 0) + flags |= AOM_EFLAG_FORCE_KF; + encode_frame(&codec, &raw, frame_count++, flags, writer); + frames_encoded++; + if (max_frames > 0 && frames_encoded >= max_frames) break; + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, 0, writer)) continue; + + printf("\n"); + fclose(infile); + printf("Processed %d frames.\n", frame_count); + + aom_img_free(&raw); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + aom_video_writer_close(writer); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/svc_encoder_rtc.c b/libs/libaom/src/examples/svc_encoder_rtc.c new file mode 100644 index 000000000..1316c6c1e --- /dev/null +++ b/libs/libaom/src/examples/svc_encoder_rtc.c @@ -0,0 +1,907 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This is an example demonstrating how to implement a multi-layer AOM +// encoding scheme for RTC video applications. 
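// A warm-up for set_layer_pattern() below: in the simplest 2-temporal-layer
// pattern, even superframes are base layer (TL0) and refresh LAST, odd
// superframes are the enhancement layer (TL1) and refresh nothing, so any
// TL1 frame can be dropped to halve the frame rate without breaking decode.
// A minimal sketch of just that decision, using the same control structs
// configured later in this file:
static void two_layer_pattern(int superframe_cnt, aom_svc_layer_id_t *layer_id,
                              aom_svc_ref_frame_config_t *ref_frame_config) {
  layer_id->temporal_layer_id = (superframe_cnt % 2) ? 1 : 0;
  // TL0 refreshes reference slot 0 (LAST); TL1 leaves every slot untouched.
  ref_frame_config->refresh[0] = (layer_id->temporal_layer_id == 0);
}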
+ +#include <assert.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "aom/aom_encoder.h" +#include "aom/aomcx.h" +#include "av1/common/enums.h" +#include "common/tools_common.h" +#include "common/video_writer.h" +#include "aom_ports/aom_timer.h" + +#define zero(Dest) memset(&(Dest), 0, sizeof(Dest)); + +static const char *exec_name; + +void usage_exit(void) { exit(EXIT_FAILURE); } + +static int mode_to_num_temporal_layers[10] = { 1, 2, 3, 3, 2, 1, 1, 3, 3, 3 }; +static int mode_to_num_spatial_layers[10] = { 1, 1, 1, 1, 1, 2, 3, 3, 3, 3 }; +static int mode_to_num_layers[10] = { 1, 2, 3, 3, 2, 2, 3, 9, 9, 9 }; + +// For rate control encoding stats. +struct RateControlMetrics { + // Number of input frames per layer. + int layer_input_frames[AOM_MAX_TS_LAYERS]; + // Number of encoded non-key frames per layer. + int layer_enc_frames[AOM_MAX_TS_LAYERS]; + // Framerate per layer (cumulative). + double layer_framerate[AOM_MAX_TS_LAYERS]; + // Target average frame size per layer (per-frame-bandwidth per layer). + double layer_pfb[AOM_MAX_LAYERS]; + // Actual average frame size per layer. + double layer_avg_frame_size[AOM_MAX_LAYERS]; + // Average rate mismatch per layer (|target - actual| / target). + double layer_avg_rate_mismatch[AOM_MAX_LAYERS]; + // Actual encoding bitrate per layer (cumulative across temporal layers). + double layer_encoding_bitrate[AOM_MAX_LAYERS]; + // Average of the short-time encoder actual bitrate. + // TODO(marpan): Should we add these short-time stats for each layer? + double avg_st_encoding_bitrate; + // Variance of the short-time encoder actual bitrate. + double variance_st_encoding_bitrate; + // Window (number of frames) for computing short-time encoding bitrate. + int window_size; + // Number of window measurements. + int window_count; + int layer_target_bitrate[AOM_MAX_LAYERS]; +}; + +static int read_frame(struct AvxInputContext *input_ctx, aom_image_t *img) { + FILE *f = input_ctx->file; + y4m_input *y4m = &input_ctx->y4m; + int shortread = 0; + + if (input_ctx->file_type == FILE_TYPE_Y4M) { + if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0; + } else { + shortread = read_yuv_frame(input_ctx, img); + } + + return !shortread; +} + +static int file_is_y4m(const char detect[4]) { + if (memcmp(detect, "YUV4", 4) == 0) { + return 1; + } + return 0; +} + +static int fourcc_is_ivf(const char detect[4]) { + if (memcmp(detect, "DKIF", 4) == 0) { + return 1; + } + return 0; +} + +static void close_input_file(struct AvxInputContext *input) { + fclose(input->file); + if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m); +} + +static void open_input_file(struct AvxInputContext *input, + aom_chroma_sample_position_t csp) { + /* Parse certain options from the input file, if possible */ + input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") + : set_binary_mode(stdin); + + if (!input->file) fatal("Failed to open input file"); + + if (!fseeko(input->file, 0, SEEK_END)) { + /* Input file is seekable. Figure out how long it is, so we can get + * progress info. + */ + input->length = ftello(input->file); + rewind(input->file); + } + + /* Default to 1:1 pixel aspect ratio. */ + input->pixel_aspect_ratio.numerator = 1; + input->pixel_aspect_ratio.denominator = 1; + + /* For RAW input sources, these bytes will be applied on the first frame + * in read_frame().
+ */ + input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file); + input->detect.position = 0; + + if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) { + if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, csp, + input->only_i420) >= 0) { + input->file_type = FILE_TYPE_Y4M; + input->width = input->y4m.pic_w; + input->height = input->y4m.pic_h; + input->pixel_aspect_ratio.numerator = input->y4m.par_n; + input->pixel_aspect_ratio.denominator = input->y4m.par_d; + input->framerate.numerator = input->y4m.fps_n; + input->framerate.denominator = input->y4m.fps_d; + input->fmt = input->y4m.aom_fmt; + input->bit_depth = input->y4m.bit_depth; + } else { + fatal("Unsupported Y4M stream."); + } + } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) { + fatal("IVF is not supported as input."); + } else { + input->file_type = FILE_TYPE_RAW; + } +} + +// Note: these rate control metrics assume only 1 key frame in the +// sequence (i.e., first frame only). So for temporal pattern# 7 +// (which has key frame for every frame on base layer), the metrics +// computation will be off/wrong. +// TODO(marpan): Update these metrics to account for multiple key frames +// in the stream. +static void set_rate_control_metrics(struct RateControlMetrics *rc, + double framerate, + unsigned int ss_number_layers, + unsigned int ts_number_layers) { + int ts_rate_decimator[AOM_MAX_TS_LAYERS] = { 1 }; + ts_rate_decimator[0] = 1; + if (ts_number_layers == 2) { + ts_rate_decimator[0] = 2; + ts_rate_decimator[1] = 1; + } + if (ts_number_layers == 3) { + ts_rate_decimator[0] = 4; + ts_rate_decimator[1] = 2; + ts_rate_decimator[2] = 1; + } + // Set the layer (cumulative) framerate and the target layer (non-cumulative) + // per-frame-bandwidth, for the rate control encoding stats below. + for (unsigned int sl = 0; sl < ss_number_layers; ++sl) { + unsigned int i = sl * ts_number_layers; + rc->layer_framerate[0] = framerate / ts_rate_decimator[0]; + rc->layer_pfb[i] = + 1000.0 * rc->layer_target_bitrate[i] / rc->layer_framerate[0]; + for (unsigned int tl = 0; tl < ts_number_layers; ++tl) { + i = sl * ts_number_layers + tl; + if (tl > 0) { + rc->layer_framerate[tl] = framerate / ts_rate_decimator[tl]; + rc->layer_pfb[i] = + 1000.0 * + (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) / + (rc->layer_framerate[tl] - rc->layer_framerate[tl - 1]); + } + rc->layer_input_frames[tl] = 0; + rc->layer_enc_frames[tl] = 0; + rc->layer_encoding_bitrate[i] = 0.0; + rc->layer_avg_frame_size[i] = 0.0; + rc->layer_avg_rate_mismatch[i] = 0.0; + } + } + rc->window_count = 0; + rc->window_size = 15; + rc->avg_st_encoding_bitrate = 0.0; + rc->variance_st_encoding_bitrate = 0.0; +} + +static void printout_rate_control_summary(struct RateControlMetrics *rc, + int frame_cnt, + unsigned int ss_number_layers, + unsigned int ts_number_layers) { + int tot_num_frames = 0; + double perc_fluctuation = 0.0; + printf("Total number of processed frames: %d\n\n", frame_cnt - 1); + printf("Rate control layer stats for %d layer(s):\n\n", ts_number_layers); + for (unsigned int sl = 0; sl < ss_number_layers; ++sl) { + tot_num_frames = 0; + for (unsigned int tl = 0; tl < ts_number_layers; ++tl) { + unsigned int i = sl * ts_number_layers + tl; + const int num_dropped = + tl > 0 ? 
rc->layer_input_frames[tl] - rc->layer_enc_frames[tl] + : rc->layer_input_frames[tl] - rc->layer_enc_frames[tl] - 1; + tot_num_frames += rc->layer_input_frames[tl]; + rc->layer_encoding_bitrate[i] = 0.001 * rc->layer_framerate[tl] * + rc->layer_encoding_bitrate[i] / + tot_num_frames; + rc->layer_avg_frame_size[i] = + rc->layer_avg_frame_size[i] / rc->layer_enc_frames[tl]; + rc->layer_avg_rate_mismatch[i] = + 100.0 * rc->layer_avg_rate_mismatch[i] / rc->layer_enc_frames[tl]; + printf("For layer#: %d %d \n", sl, tl); + printf("Bitrate (target vs actual): %d %f\n", rc->layer_target_bitrate[i], + rc->layer_encoding_bitrate[i]); + printf("Average frame size (target vs actual): %f %f\n", rc->layer_pfb[i], + rc->layer_avg_frame_size[i]); + printf("Average rate_mismatch: %f\n", rc->layer_avg_rate_mismatch[i]); + printf( + "Number of input frames, encoded (non-key) frames, " + "and perc dropped frames: %d %d %f\n", + rc->layer_input_frames[tl], rc->layer_enc_frames[tl], + 100.0 * num_dropped / rc->layer_input_frames[tl]); + printf("\n"); + } + } + rc->avg_st_encoding_bitrate = rc->avg_st_encoding_bitrate / rc->window_count; + rc->variance_st_encoding_bitrate = + rc->variance_st_encoding_bitrate / rc->window_count - + (rc->avg_st_encoding_bitrate * rc->avg_st_encoding_bitrate); + perc_fluctuation = 100.0 * sqrt(rc->variance_st_encoding_bitrate) / + rc->avg_st_encoding_bitrate; + printf("Short-time stats, for window of %d frames:\n", rc->window_size); + printf("Average, rms-variance, and percent-fluct: %f %f %f\n", + rc->avg_st_encoding_bitrate, sqrt(rc->variance_st_encoding_bitrate), + perc_fluctuation); + if (frame_cnt - 1 != tot_num_frames) + die("Error: Number of input frames not equal to output!\n"); +} + +// Layer pattern configuration. +static int set_layer_pattern(int layering_mode, int superframe_cnt, + aom_svc_layer_id_t *layer_id, + aom_svc_ref_frame_config_t *ref_frame_config, + int *use_svc_control, int spatial_layer_id, + int is_key_frame, int ksvc_mode) { + int i; + int shift = (layering_mode == 7) ? 2 : 0; + *use_svc_control = 1; + layer_id->spatial_layer_id = spatial_layer_id; + // Set the reference map buffer idx for the 7 references: + // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), + // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = i; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->reference[i] = 0; + for (i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0; + // Note: these layered patterns only use LAST and GF for prediction in + // non-rd mode (speed >= 7). + int layer_flags = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | + AOM_EFLAG_NO_REF_ARF2; + if (ksvc_mode) { + // Same pattern as case 8. + layering_mode = 8; + if (!is_key_frame) + // No inter-layer prediction on inter-frames. + layer_flags |= AOM_EFLAG_NO_REF_GF; + } + switch (layering_mode) { + case 0: + // 1-layer: update LAST on every frame, reference LAST and GF. + layer_id->temporal_layer_id = 0; + ref_frame_config->refresh[0] = 1; + break; + case 1: + // 2-temporal layer. + // 1 3 5 + // 0 2 4 + if (superframe_cnt % 2 == 0) { + layer_id->temporal_layer_id = 0; + // Update LAST on layer 0, reference LAST and GF. + ref_frame_config->refresh[0] = 1; + } else { + layer_id->temporal_layer_id = 1; + // No updates on layer 1, only reference LAST (TL0).
+ layer_flags |= AOM_EFLAG_NO_REF_GF; + } + break; + case 2: + // 3-temporal layer: + // 1 3 5 7 + // 2 6 + // 0 4 8 + if (superframe_cnt % 4 == 0) { + // Base layer. + layer_id->temporal_layer_id = 0; + // Update LAST on layer 0, reference LAST and GF. + ref_frame_config->refresh[0] = 1; + } else if ((superframe_cnt - 1) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // First top layer: no updates, only reference LAST (TL0). + layer_flags |= AOM_EFLAG_NO_REF_GF; + } else if ((superframe_cnt - 2) % 4 == 0) { + layer_id->temporal_layer_id = 1; + // Middle layer (TL1): update LAST2, only reference LAST (TL0). + ref_frame_config->refresh[1] = 1; + layer_flags |= AOM_EFLAG_NO_REF_GF; + } else if ((superframe_cnt - 3) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // Second top layer: no updates, only reference LAST. + // Set buffer idx for LAST to slot 1, since that was the slot + // updated in previous frame. So LAST is TL1 frame. + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->ref_idx[1] = 0; + layer_flags |= AOM_EFLAG_NO_REF_GF; + } + break; + case 3: + // 3-temporal layer: but middle layer updates GF, so 2nd TL2 will + // only reference GF (not LAST). Other frames only reference LAST. + // 1 3 5 7 + // 2 6 + // 0 4 8 + if (superframe_cnt % 4 == 0) { + // Base layer. + layer_id->temporal_layer_id = 0; + // Update LAST on layer 0, only reference LAST. + ref_frame_config->refresh[0] = 1; + layer_flags |= AOM_EFLAG_NO_REF_GF; + } else if ((superframe_cnt - 1) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // First top layer: no updates, only reference LAST (TL0). + layer_flags |= AOM_EFLAG_NO_REF_GF; + } else if ((superframe_cnt - 2) % 4 == 0) { + layer_id->temporal_layer_id = 1; + // Middle layer (TL1): update GF, only reference LAST (TL0). + ref_frame_config->refresh[3] = 1; + layer_flags |= AOM_EFLAG_NO_REF_GF; + } else if ((superframe_cnt - 3) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // Second top layer: no updates, only reference GF. + layer_flags |= AOM_EFLAG_NO_REF_LAST; + } + break; + case 4: + // 2-temporal layer with the old update flags, not with the new + // SVC control. + *use_svc_control = 0; + // 1 3 5 + // 0 2 4 + if (superframe_cnt % 2 == 0) { + layer_id->temporal_layer_id = 0; + // Update LAST on layer 0, reference LAST and GF. + layer_flags |= AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF; + } else { + layer_id->temporal_layer_id = 1; + // No updates on layer 1, only reference LAST (TL0). + layer_flags |= AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | + AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_REF_GF; + } + break; + case 5: + // 2 spatial layers, 1 temporal. + layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST, update LAST. + ref_frame_config->refresh[0] = 1; + layer_flags |= AOM_EFLAG_NO_REF_GF; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1 + // and GOLDEN to slot 0. Update slot 1 (LAST). + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->ref_idx[3] = 0; + ref_frame_config->refresh[1] = 1; + } + break; + case 6: + // 3 spatial layers, 1 temporal. + // Note for this case, we set the buffer idx for all references to be + // either LAST or GOLDEN, which are always valid references, since the + // decoder will check that each of the 7 references has a valid scale in + // valid_ref_frame_size(). + layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST, update LAST. Set all buffer_idx to 0.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->refresh[0] = 1; + layer_flags |= AOM_EFLAG_NO_REF_GF; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1 + // and GOLDEN (and all other refs) to slot 0. + // Update slot 1 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->refresh[1] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2 + // and GOLDEN (and all other refs) to slot 1. + // Update slot 2 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 1; + ref_frame_config->ref_idx[0] = 2; + ref_frame_config->refresh[2] = 1; + } + break; + case 7: + // 3 spatial and 3 temporal layers. + // Same as case 8 but with overlap in the buffer slot updates. + // (shift = 2). The slots 3 and 4 updated by first TL2 are + // reused for update in TL1 superframe. + // Note for this case, frame order hint must be disabled for + // lower resolutions (operating points > 0) to be decodable. + case 8: + // 3 spatial and 3 temporal layers. + // No overlap in buffer updates between TL2 and TL1. + // TL2 updates slot 3 and 4, TL1 updates 5, 6, 7. + // Set the references via the svc_ref_frame_config control. + layer_flags = 0; + // Always reference LAST. + ref_frame_config->reference[0] = 1; + if (superframe_cnt % 4 == 0) { + // Base temporal layer. + layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST, update LAST. + // Set all buffer_idx to 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->refresh[0] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 0. + // Update slot 1 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->refresh[1] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 1. + // Update slot 2 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 1; + ref_frame_config->ref_idx[0] = 2; + ref_frame_config->refresh[2] = 1; + } + } else if ((superframe_cnt - 1) % 4 == 0) { + // First top temporal enhancement layer. + layer_id->temporal_layer_id = 2; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST (slot 0). + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to slot 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[3] = 3; + ref_frame_config->refresh[3] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 3. + // Set LAST2 to slot 4 and update slot 4. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 3; + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->ref_idx[1] = 4; + ref_frame_config->refresh[4] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 4. + // No update.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 4; + ref_frame_config->ref_idx[0] = 2; + } + } else if ((superframe_cnt - 2) % 4 == 0) { + // Middle temporal enhancement layer. + layer_id->temporal_layer_id = 1; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST. + // Set all buffer_idx to 0. + // Set GOLDEN to slot 5 and update slot 5. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[3] = 5 - shift; + ref_frame_config->refresh[5 - shift] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 5. + // Set LAST3 to slot 6 and update slot 6. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 5 - shift; + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->ref_idx[2] = 6 - shift; + ref_frame_config->refresh[6 - shift] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 6. + // Set LAST3 to slot 7 and update slot 7. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 6 - shift; + ref_frame_config->ref_idx[0] = 2; + ref_frame_config->ref_idx[2] = 7 - shift; + ref_frame_config->refresh[7 - shift] = 1; + } + } else if ((superframe_cnt - 3) % 4 == 0) { + // Second top temporal enhancement layer. + layer_id->temporal_layer_id = 2; + if (layer_id->spatial_layer_id == 0) { + // Set LAST to slot 5 and reference LAST. + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 5 - shift; + ref_frame_config->ref_idx[3] = 3; + ref_frame_config->refresh[3] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6, + // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 6 - shift; + ref_frame_config->ref_idx[3] = 3; + ref_frame_config->ref_idx[1] = 4; + ref_frame_config->refresh[4] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7, + // GOLDEN to slot 4. No update. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 7 - shift; + ref_frame_config->ref_idx[3] = 4; + } + } + if (layer_id->spatial_layer_id > 0) + ref_frame_config->reference[3] = 1; // Reference GOLDEN. + break; + default: assert(0); die("Error: Unsupported temporal layering mode!\n"); + } + return layer_flags; +} + +int main(int argc, char **argv) { + AvxVideoWriter *outfile[AOM_MAX_LAYERS] = { NULL }; + aom_codec_ctx_t codec; + aom_codec_enc_cfg_t cfg; + int frame_cnt = 0; + aom_image_t raw; + aom_codec_err_t res; + unsigned int width; + unsigned int height; + uint32_t error_resilient = 0; + int speed; + int frame_avail; + int got_data = 0; + int flags = 0; + unsigned i; + int pts = 0; // PTS starts at 0. + int frame_duration = 1; // 1 timebase tick per frame.
+  int layering_mode = 0;
+  aom_svc_layer_id_t layer_id;
+  aom_svc_params_t svc_params;
+  aom_svc_ref_frame_config_t ref_frame_config;
+  const AvxInterface *encoder = NULL;
+  struct AvxInputContext input_ctx;
+  struct RateControlMetrics rc;
+  int64_t cx_time = 0;
+  const int min_args_base = 13;
+  const int min_args = min_args_base;
+  double sum_bitrate = 0.0;
+  double sum_bitrate2 = 0.0;
+  double framerate = 30.0;
+  int use_svc_control = 1;
+  zero(rc.layer_target_bitrate);
+  memset(&layer_id, 0, sizeof(aom_svc_layer_id_t));
+  memset(&input_ctx, 0, sizeof(input_ctx));
+  memset(&svc_params, 0, sizeof(svc_params));
+
+  // Flag to test dynamic scaling of source frames for single
+  // spatial stream, using the scaling_mode control.
+  const int test_dynamic_scaling_single_layer = 0;
+
+  /* Setup default input stream settings */
+  input_ctx.framerate.numerator = 30;
+  input_ctx.framerate.denominator = 1;
+  input_ctx.only_i420 = 1;
+  input_ctx.bit_depth = 0;
+  unsigned int ts_number_layers = 1;
+  unsigned int ss_number_layers = 1;
+  exec_name = argv[0];
+  // Check usage and arguments.
+  if (argc < min_args) {
+    die("Usage: %s <infile> <outfile> <codec_type(av1)> <width> <height> "
+        "<timebase_num> <timebase_den> <speed> <frame_drop_threshold> "
+        "<error_resilient> <threads> <mode> "
+        "<Rate_0> ... <Rate_nlayers-1>\n",
+        argv[0]);
+  }
+
+  encoder = get_aom_encoder_by_name(argv[3]);
+
+  width = (unsigned int)strtoul(argv[4], NULL, 0);
+  height = (unsigned int)strtoul(argv[5], NULL, 0);
+  if (width < 16 || width % 2 || height < 16 || height % 2) {
+    die("Invalid resolution: %d x %d", width, height);
+  }
+
+  layering_mode = (int)strtol(argv[12], NULL, 0);
+  if (layering_mode < 0 || layering_mode > 13) {
+    die("Invalid layering mode (0..13): %s", argv[12]);
+  }
+
+  if (argc != min_args + mode_to_num_layers[layering_mode]) {
+    die("Invalid number of arguments");
+  }
+
+  ts_number_layers = mode_to_num_temporal_layers[layering_mode];
+  ss_number_layers = mode_to_num_spatial_layers[layering_mode];
+
+  input_ctx.filename = argv[1];
+  open_input_file(&input_ctx, 0);
+
+  // Y4M reader has its own allocation.
+  if (input_ctx.file_type != FILE_TYPE_Y4M) {
+    if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, width, height, 32)) {
+      die("Failed to allocate image: %d x %d", width, height);
+    }
+  }
+
+  // Populate encoder configuration.
+  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res) {
+    printf("Failed to get config: %s\n", aom_codec_err_to_string(res));
+    return EXIT_FAILURE;
+  }
+
+  // Update the default configuration with our settings.
+  cfg.g_w = width;
+  cfg.g_h = height;
+
+  // Timebase format, e.g. for 30fps: numerator = 1, denominator = 30.
+  cfg.g_timebase.num = (int)strtol(argv[6], NULL, 0);
+  cfg.g_timebase.den = (int)strtol(argv[7], NULL, 0);
+
+  speed = (int)strtol(argv[8], NULL, 0);
+  if (speed < 0 || speed > 8) {
+    die("Invalid speed setting: must be between 0 and 8");
+  }
+
+  for (i = min_args_base;
+       (int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) {
+    rc.layer_target_bitrate[i - 13] = (int)strtol(argv[i], NULL, 0);
+    svc_params.layer_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13];
+  }
+
+  cfg.rc_target_bitrate =
+      svc_params.layer_target_bitrate[ss_number_layers * ts_number_layers - 1];
+
+  svc_params.framerate_factor[0] = 1;
+  if (ts_number_layers == 2) {
+    svc_params.framerate_factor[0] = 2;
+    svc_params.framerate_factor[1] = 1;
+  } else if (ts_number_layers == 3) {
+    svc_params.framerate_factor[0] = 4;
+    svc_params.framerate_factor[1] = 2;
+    svc_params.framerate_factor[2] = 1;
+  }
+
+  // Real time parameters.
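+  // Note: the block below configures low-latency CBR rate control. Per the
+  // aom_encoder.h documentation, the rc_buf_* sizes are expressed in
+  // milliseconds of buffering, the quantizer is constrained to [2, 52],
+  // and a nonzero rc_dropframe_thresh (ninth command line argument)
+  // allows the encoder to drop frames to stay on rate.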
+  cfg.g_usage = AOM_USAGE_REALTIME;
+
+  cfg.rc_dropframe_thresh = (unsigned int)strtoul(argv[9], NULL, 0);
+  cfg.rc_end_usage = AOM_CBR;
+  cfg.rc_min_quantizer = 2;
+  cfg.rc_max_quantizer = 52;
+  cfg.rc_undershoot_pct = 50;
+  cfg.rc_overshoot_pct = 50;
+  cfg.rc_buf_initial_sz = 600;
+  cfg.rc_buf_optimal_sz = 600;
+  cfg.rc_buf_sz = 1000;
+
+  // Number of threads (command line argument).
+  cfg.g_threads = (unsigned int)strtoul(argv[11], NULL, 0);
+
+  error_resilient = (uint32_t)strtoul(argv[10], NULL, 0);
+  if (error_resilient != 0 && error_resilient != 1) {
+    die("Invalid value for error resilient (0, 1): %d.", error_resilient);
+  }
+  // Set error resilient mode (command line argument).
+  cfg.g_error_resilient = error_resilient;
+  cfg.g_lag_in_frames = 0;
+  cfg.kf_mode = AOM_KF_AUTO;
+
+  // Effectively disable automatic keyframe placement by forcing a fixed,
+  // very long keyframe interval.
+  cfg.kf_min_dist = cfg.kf_max_dist = 3000;
+
+  framerate = cfg.g_timebase.den / cfg.g_timebase.num;
+  set_rate_control_metrics(&rc, framerate, ss_number_layers, ts_number_layers);
+
+  if (input_ctx.file_type == FILE_TYPE_Y4M) {
+    if (input_ctx.width != cfg.g_w || input_ctx.height != cfg.g_h) {
+      die("Incorrect width or height: %d x %d", cfg.g_w, cfg.g_h);
+    }
+    if (input_ctx.framerate.numerator != cfg.g_timebase.den ||
+        input_ctx.framerate.denominator != cfg.g_timebase.num) {
+      die("Incorrect framerate: numerator %d denominator %d",
+          cfg.g_timebase.num, cfg.g_timebase.den);
+    }
+  }
+
+  // Open an output file for each stream.
+  for (unsigned int sl = 0; sl < ss_number_layers; ++sl) {
+    for (unsigned tl = 0; tl < ts_number_layers; ++tl) {
+      i = sl * ts_number_layers + tl;
+      char file_name[PATH_MAX];
+      AvxVideoInfo info;
+      info.codec_fourcc = encoder->fourcc;
+      info.frame_width = cfg.g_w;
+      info.frame_height = cfg.g_h;
+      info.time_base.numerator = cfg.g_timebase.num;
+      info.time_base.denominator = cfg.g_timebase.den;
+
+      snprintf(file_name, sizeof(file_name), "%s_%d.av1", argv[2], i);
+      outfile[i] = aom_video_writer_open(file_name, kContainerIVF, &info);
+      if (!outfile[i]) die("Failed to open %s for writing", file_name);
+      assert(outfile[i] != NULL);
+    }
+  }
+
+  // Initialize codec.
+  if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  aom_codec_control(&codec, AOME_SET_CPUUSED, speed);
+  aom_codec_control(&codec, AV1E_SET_AQ_MODE, 3);
+  aom_codec_control(&codec, AV1E_SET_GF_CBR_BOOST_PCT, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_CDEF, 1);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_ORDER_HINT, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_TPL_MODEL, 0);
+  aom_codec_control(&codec, AV1E_SET_DELTAQ_MODE, 0);
+
+  svc_params.number_spatial_layers = ss_number_layers;
+  svc_params.number_temporal_layers = ts_number_layers;
+  for (i = 0; i < ss_number_layers * ts_number_layers; ++i) {
+    svc_params.max_quantizers[i] = cfg.rc_max_quantizer;
+    svc_params.min_quantizers[i] = cfg.rc_min_quantizer;
+  }
+  for (i = 0; i < ss_number_layers; ++i) {
+    svc_params.scaling_factor_num[i] = 1;
+    svc_params.scaling_factor_den[i] = 1;
+  }
+  if (ss_number_layers == 2) {
+    svc_params.scaling_factor_num[0] = 1;
+    svc_params.scaling_factor_den[0] = 2;
+  } else if (ss_number_layers == 3) {
+    svc_params.scaling_factor_num[0] = 1;
+    svc_params.scaling_factor_den[0] = 4;
+    svc_params.scaling_factor_num[1] = 1;
+    svc_params.scaling_factor_den[1] = 2;
+  }
+
+  aom_codec_control(&codec, AV1E_SET_SVC_PARAMS, &svc_params);
+
+  // This controls the maximum target size of the key frame.
+ // For generating smaller key frames, use a smaller max_intra_size_pct + // value, like 100 or 200. + { + const int max_intra_size_pct = 300; + aom_codec_control(&codec, AOME_SET_MAX_INTRA_BITRATE_PCT, + max_intra_size_pct); + } + + frame_avail = 1; + while (frame_avail || got_data) { + struct aom_usec_timer timer; + frame_avail = read_frame(&input_ctx, &raw); + int is_key_frame = (frame_cnt % cfg.kf_max_dist) == 0; + // Loop over spatial layers. + for (unsigned int slx = 0; slx < ss_number_layers; slx++) { + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt; + int layer = 0; + + // Set the reference/update flags, layer_id, and reference_map + // buffer index. + flags = set_layer_pattern(layering_mode, frame_cnt, &layer_id, + &ref_frame_config, &use_svc_control, slx, + is_key_frame, (layering_mode == 9)); + aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id); + if (use_svc_control) + aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG, + &ref_frame_config); + + layer = slx * ts_number_layers + layer_id.temporal_layer_id; + if (frame_avail && slx == 0) ++rc.layer_input_frames[layer]; + + if (test_dynamic_scaling_single_layer) { + if (frame_cnt >= 200 && frame_cnt <= 400) { + // Scale source down by 2x2. + struct aom_scaling_mode mode = { AOME_ONETWO, AOME_ONETWO }; + aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode); + } else { + // Source back up to original resolution (no scaling). + struct aom_scaling_mode mode = { AOME_NORMAL, AOME_NORMAL }; + aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode); + } + } + + // Do the layer encode. + aom_usec_timer_start(&timer); + if (aom_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags)) + die_codec(&codec, "Failed to encode frame"); + aom_usec_timer_mark(&timer); + cx_time += aom_usec_timer_elapsed(&timer); + + got_data = 0; + while ((pkt = aom_codec_get_cx_data(&codec, &iter))) { + got_data = 1; + switch (pkt->kind) { + case AOM_CODEC_CX_FRAME_PKT: + for (unsigned int sl = layer_id.spatial_layer_id; + sl < ss_number_layers; ++sl) { + for (unsigned tl = layer_id.temporal_layer_id; + tl < ts_number_layers; ++tl) { + unsigned int j = sl * ts_number_layers + tl; + aom_video_writer_write_frame(outfile[j], pkt->data.frame.buf, + pkt->data.frame.sz, pts); + if (sl == (unsigned int)layer_id.spatial_layer_id) + rc.layer_encoding_bitrate[j] += 8.0 * pkt->data.frame.sz; + // Keep count of rate control stats per layer (for non-key). + if (tl == (unsigned int)layer_id.temporal_layer_id && + sl == (unsigned int)layer_id.spatial_layer_id && + !(pkt->data.frame.flags & AOM_FRAME_IS_KEY)) { + rc.layer_avg_frame_size[j] += 8.0 * pkt->data.frame.sz; + rc.layer_avg_rate_mismatch[j] += + fabs(8.0 * pkt->data.frame.sz - rc.layer_pfb[j]) / + rc.layer_pfb[j]; + if (slx == 0) ++rc.layer_enc_frames[tl]; + } + } + } + + // Update for short-time encoding bitrate states, for moving window + // of size rc->window, shifted by rc->window / 2. + // Ignore first window segment, due to key frame. + // For spatial layers: only do this for top/highest SL. + if (frame_cnt > rc.window_size && slx == ss_number_layers - 1) { + sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate; + rc.window_size = (rc.window_size <= 0) ? 1 : rc.window_size; + if (frame_cnt % rc.window_size == 0) { + rc.window_count += 1; + rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size; + rc.variance_st_encoding_bitrate += + (sum_bitrate / rc.window_size) * + (sum_bitrate / rc.window_size); + sum_bitrate = 0.0; + } + } + // Second shifted window. 
+          if (frame_cnt > rc.window_size + rc.window_size / 2 &&
+              slx == ss_number_layers - 1) {
+            sum_bitrate2 += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
+            if (frame_cnt > 2 * rc.window_size &&
+                frame_cnt % rc.window_size == 0) {
+              rc.window_count += 1;
+              rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
+              rc.variance_st_encoding_bitrate +=
+                  (sum_bitrate2 / rc.window_size) *
+                  (sum_bitrate2 / rc.window_size);
+              sum_bitrate2 = 0.0;
+            }
+          }
+          break;
+        default: break;
+      }
+    }
+  }  // loop over spatial layers
+    ++frame_cnt;
+    pts += frame_duration;
+  }
+  close_input_file(&input_ctx);
+  printout_rate_control_summary(&rc, frame_cnt, ss_number_layers,
+                                ts_number_layers);
+  printf("\n");
+  printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f\n",
+         frame_cnt, 1000 * (float)cx_time / (double)(frame_cnt * 1000000),
+         1000000 * (double)frame_cnt / (double)cx_time);
+
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+
+  // Try to rewrite the output file headers with the actual frame count.
+  for (i = 0; i < ss_number_layers * ts_number_layers; ++i)
+    aom_video_writer_close(outfile[i]);
+
+  if (input_ctx.file_type != FILE_TYPE_Y4M) {
+    aom_img_free(&raw);
+  }
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libaom/src/examples/twopass_encoder.c b/libs/libaom/src/examples/twopass_encoder.c
new file mode 100644
index 000000000..a03bc6cc2
--- /dev/null
+++ b/libs/libaom/src/examples/twopass_encoder.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Two Pass Encoder
+// ================
+//
+// This is an example of a two pass encoder loop. It takes an input file in
+// YV12 format, passes it through the encoder twice, and writes the compressed
+// frames to disk in IVF format. It builds upon the simple_encoder example.
+//
+// Twopass Variables
+// -----------------
+// Twopass mode needs to track the current pass number and the buffer of
+// statistics packets.
+//
+// Updating The Configuration
+// --------------------------
+// In two pass mode, the configuration has to be updated on each pass. The
+// statistics buffer is passed on the last pass.
+//
+// Encoding A Frame
+// ----------------
+// Encoding a frame in two pass mode is identical to the simple encoder
+// example.
+//
+// Processing Statistics Packets
+// -----------------------------
+// Each packet of type `AOM_CODEC_STATS_PKT` contains the first pass
+// statistics for a frame and is appended to the stats buffer. On the last
+// pass, packets of type `AOM_CODEC_CX_FRAME_PKT` contain the encoded data
+// for a frame; we write an IVF frame header, followed by the raw data.
+//
+// Pass Progress Reporting
+// -----------------------
+// It's sometimes helpful to see when each pass completes.
+//
+// Clean-up
+// --------
+// Destruction of the encoder instance must be done on each pass. The
+// raw image should be destroyed at the end as usual.
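+//
+// Flow Summary
+// ------------
+// As a rough sketch (using only the calls that appear in main() below),
+// the overall two pass flow is:
+//
+//   cfg.g_pass = AOM_RC_FIRST_PASS;
+//   stats = pass0(&raw, infile, encoder, &cfg, limit);  // gather stats
+//   rewind(infile);
+//   cfg.g_pass = AOM_RC_LAST_PASS;
+//   cfg.rc_twopass_stats_in = stats;  // feed first pass stats back in
+//   pass1(&raw, infile, outfile_arg, encoder, &cfg, limit);
+//   free(stats.buf);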
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr,
+          "Usage: %s <codec> <width> <height> <infile> <outfile> "
+          "[limit]\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                           aom_codec_pts_t pts, unsigned int duration,
+                           aom_enc_frame_flags_t flags,
+                           aom_fixed_buf_t *stats) {
+  int got_pkts = 0;
+  aom_codec_iter_t iter = NULL;
+  const aom_codec_cx_pkt_t *pkt = NULL;
+  const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
+  if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to get frame stats.");
+
+  while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
+
+    if (pkt->kind == AOM_CODEC_STATS_PKT) {
+      const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
+      const size_t pkt_size = pkt->data.twopass_stats.sz;
+      stats->buf = realloc(stats->buf, stats->sz + pkt_size);
+      memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size);
+      stats->sz += pkt_size;
+    }
+  }
+
+  return got_pkts;
+}
+
+static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                        aom_codec_pts_t pts, unsigned int duration,
+                        aom_enc_frame_flags_t flags, AvxVideoWriter *writer) {
+  int got_pkts = 0;
+  aom_codec_iter_t iter = NULL;
+  const aom_codec_cx_pkt_t *pkt = NULL;
+  const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
+  if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to encode frame.");
+
+  while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
+    if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+      const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+
+      if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf,
+                                        pkt->data.frame.sz,
+                                        pkt->data.frame.pts))
+        die_codec(ctx, "Failed to write compressed frame.");
+      printf(keyframe ? "K" : ".");
+      fflush(stdout);
+    }
+  }
+
+  return got_pkts;
+}
+
+static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
+                             const AvxInterface *encoder,
+                             const aom_codec_enc_cfg_t *cfg, int limit) {
+  aom_codec_ctx_t codec;
+  int frame_count = 0;
+  aom_fixed_buf_t stats = { NULL, 0 };
+
+  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  // Calculate frame statistics.
+  while (aom_img_read(raw, infile) && frame_count < limit) {
+    ++frame_count;
+    get_frame_stats(&codec, raw, frame_count, 1, 0, &stats);
+  }
+
+  // Flush encoder.
+  while (get_frame_stats(&codec, NULL, frame_count, 1, 0, &stats)) {
+  }
+
+  printf("Pass 0 complete. Processed %d frames.\n", frame_count);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+  return stats;
+}
+
+static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
+                  const AvxInterface *encoder, const aom_codec_enc_cfg_t *cfg,
+                  int limit) {
+  AvxVideoInfo info = { encoder->fourcc,
+                        cfg->g_w,
+                        cfg->g_h,
+                        { cfg->g_timebase.num, cfg->g_timebase.den },
+                        0 };
+  AvxVideoWriter *writer = NULL;
+  aom_codec_ctx_t codec;
+  int frame_count = 0;
+
+  writer = aom_video_writer_open(outfile_name, kContainerIVF, &info);
+  if (!writer) die("Failed to open %s for writing", outfile_name);
+
+  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  // Encode frames.
+  while (aom_img_read(raw, infile) && frame_count < limit) {
+    ++frame_count;
+    encode_frame(&codec, raw, frame_count, 1, 0, writer);
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, 1, 0, writer)) {
+  }
+
+  printf("\n");
+
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+  aom_video_writer_close(writer);
+
+  printf("Pass 1 complete. Processed %d frames.\n", frame_count);
+}
+
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  int w, h;
+  aom_codec_ctx_t codec;
+  aom_codec_enc_cfg_t cfg;
+  aom_image_t raw;
+  aom_codec_err_t res;
+  aom_fixed_buf_t stats;
+
+  const AvxInterface *encoder = NULL;
+  const int fps = 30;       // TODO(dkovalev) add command line argument
+  const int bitrate = 200;  // kbit/s TODO(dkovalev) add command line argument
+  const char *const codec_arg = argv[1];
+  const char *const width_arg = argv[2];
+  const char *const height_arg = argv[3];
+  const char *const infile_arg = argv[4];
+  const char *const outfile_arg = argv[5];
+  int limit = 0;
+  exec_name = argv[0];
+
+  if (argc < 6) die("Invalid number of arguments");
+
+  if (argc > 6) limit = (int)strtol(argv[6], NULL, 0);
+
+  if (limit == 0) limit = 100;
+
+  encoder = get_aom_encoder_by_name(codec_arg);
+  if (!encoder) die("Unsupported codec.");
+
+  w = (int)strtol(width_arg, NULL, 0);
+  h = (int)strtol(height_arg, NULL, 0);
+
+  if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0)
+    die("Invalid frame size: %dx%d", w, h);
+
+  if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 1))
+    die("Failed to allocate image: %dx%d", w, h);
+
+  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+
+  // Configuration
+  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res) die_codec(&codec, "Failed to get default codec config.");
+
+  cfg.g_w = w;
+  cfg.g_h = h;
+  cfg.g_timebase.num = 1;
+  cfg.g_timebase.den = fps;
+  cfg.rc_target_bitrate = bitrate;
+
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading", infile_arg);
+
+  // Pass 0
+  cfg.g_pass = AOM_RC_FIRST_PASS;
+  stats = pass0(&raw, infile, encoder, &cfg, limit);
+
+  // Pass 1
+  rewind(infile);
+  cfg.g_pass = AOM_RC_LAST_PASS;
+  cfg.rc_twopass_stats_in = stats;
+  pass1(&raw, infile, outfile_arg, encoder, &cfg, limit);
+  free(stats.buf);
+
+  aom_img_free(&raw);
+  fclose(infile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libaom/src/keywords.dox b/libs/libaom/src/keywords.dox
new file mode 100644
index 000000000..56f536890
--- /dev/null
+++ b/libs/libaom/src/keywords.dox
@@ -0,0 +1,51 @@
+/*!\page rfc2119 RFC2119 Keywords
+
+  The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL
+  NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and
+  "OPTIONAL" in this document are to be interpreted as described in
+  RFC 2119.
+
+Specifically, the following definitions are used:
+
+\section MUST
+\anchor REQUIRED
+\anchor SHALL
+  This word, or the terms "REQUIRED" or "SHALL", mean that the
+  definition is an absolute requirement of the specification.
+
+\section MUSTNOT MUST NOT
+\anchor SHALLNOT
+  This phrase, or the phrase "SHALL NOT", mean that the
+  definition is an absolute prohibition of the specification.
+
+\section SHOULD
+\anchor RECOMMENDED
+  This word, or the adjective "RECOMMENDED", mean that there
+  may exist valid reasons in particular circumstances to ignore a
+  particular item, but the full implications must be understood and
+  carefully weighed before choosing a different course.
+ +\section SHOULDNOT SHOULD NOT +\anchor NOTRECOMMENDED + This phrase, or the phrase "NOT RECOMMENDED" mean that + there may exist valid reasons in particular circumstances when the + particular behavior is acceptable or even useful, but the full + implications should be understood and the case carefully weighed + before implementing any behavior described with this label. + +\section MAY +\anchor OPTIONAL + This word, or the adjective "OPTIONAL", mean that an item is + truly optional. One vendor may choose to include the item because a + particular marketplace requires it or because the vendor feels that + it enhances the product while another vendor may omit the same item. + An implementation which does not include a particular option \ref MUST be + prepared to interoperate with another implementation which does + include the option, though perhaps with reduced functionality. In the + same vein an implementation which does include a particular option + \ref MUST be prepared to interoperate with another implementation which + does not include the option (except, of course, for the feature the + option provides.) + + +*/ diff --git a/libs/libaom/src/libs.doxy_template b/libs/libaom/src/libs.doxy_template new file mode 100644 index 000000000..c522e21d3 --- /dev/null +++ b/libs/libaom/src/libs.doxy_template @@ -0,0 +1,1260 @@ +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## + +# Doxyfile 1.5.4 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project +# +# All text after a hash (#) is considered a comment and will be ignored +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" ") + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file that +# follow. The default is UTF-8 which is also the encoding used for all text before +# the first occurrence of this tag. Doxygen uses libiconv (or the iconv built into +# libc) for the transcoding. See http://www.gnu.org/software/libiconv for the list of +# possible encodings. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded +# by quotes) that should identify the project. + +PROJECT_NAME = "AOMedia Codec SDK" + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. 
+ +OUTPUT_DIRECTORY = docs + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Finnish, French, German, Greek, Hungarian, +# Italian, Japanese, Japanese-en (Japanese with English messages), Korean, +# Korean-en, Lithuanian, Norwegian, Polish, Portuguese, Romanian, Russian, +# Serbian, Slovak, Slovene, Spanish, Swedish, and Ukrainian. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to java_doc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. +# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = YES + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. 
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a java_doc-style
+# comment as the brief description. If set to NO, the java_doc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 4
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for Java.
+# For instance, namespaces will be presented as packages, qualified scopes
+# will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want to
+# include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string) vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct (or union) is
+# documented as struct with the name of the typedef. So
+# typedef struct type_s {} type_t, will appear in the documentation as a struct
+# with name type_t. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named type_s. This can typically
+# be useful for C code where the coding convention is that all structs are
+# typedef'ed and only the typedef is referenced, never the struct's name.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be extracted
+# and appear in the documentation as a namespace called 'anonymous_namespace{file}',
+# where file will be replaced with the base name of the file that contains the anonymous
+# namespace. By default anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS = NO
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or define consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and defines in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from the
+# version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = YES
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT =
+
+# This tag can be used to specify the character encoding of the source files that
+# doxygen parses. Internally doxygen uses the UTF-8 encoding, which is also the default
+# input encoding. Doxygen uses libiconv (or the iconv built into libc) for the transcoding.
+# See http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the output.
+# The symbol name can be a fully qualified name, a word, or if the wildcard * is used,
+# a substring. Examples: ANamespace, AClass, AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output. If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
+# is applied to all files.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO. If you have enabled CALL_GRAPH or CALLER_GRAPH
+# then you must also enable this option. If you don't then doxygen will produce
+# a warning and turn it on anyway
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES (the default)
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = YES
+
+# If the REFERENCES_RELATION tag is set to YES (the default)
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = YES
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code. Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+ +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# stylesheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compressed HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = NO + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. For this to work a browser that supports +# java_script and DHTML is required (for instance Mozilla 1.0+, Firefox +# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). + +HTML_DYNAMIC_SECTIONS = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. + +TOC_EXPAND = NO + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index at +# top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it. + +DISABLE_INDEX = NO + +# This tag can be used to set the number of enum values (range [1..20]) +# that doxygen will group on one line in the generated HTML documentation. + +ENUM_VALUES_PER_LINE = 4 + +# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be +# generated containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# java_script, DHTML, CSS and frames is required (for instance Mozilla 1.0+, +# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are +# probably better off using the HTML help feature. + +GENERATE_TREEVIEW = NO + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. 
+
+TREEVIEW_WIDTH = 250
+
+#---------------------------------------------------------------------------
+# configuration options related to the la_te_x output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX = YES
+
+# The LATEX_OUTPUT tag is used to specify where the la_te_x docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the la_te_x command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for la_te_x. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# la_te_x documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = YES
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, a4wide, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = letter
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of la_te_x
+# packages that should be included in the la_te_x output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal la_te_x header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the la_te_x that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode.
+# command to the generated la_te_x files. This will instruct la_te_x to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader. This is useful
+# if you want to understand what is going on. On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS = *.h
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all function-like macros that are alone
+# on a line, have an all uppercase name, and do not end with a semicolon. Such
+# function macros are typically used for boiler-plate code, and will confuse
+# the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#   TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#   TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option is superseded by the HAVE_DOT option below. This is only a
+# fallback. It is recommended to install and use dot, since it yields more
+# powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see http://www.mcternan.me.uk/mscgen/) to
+# produce the chart and insert it in the documentation. The MSCGEN_PATH tag allows you to
+# specify the directory where the mscgen tool resides. If left empty the tool is assumed to
+# be found in the default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT = NO
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH, SOURCE_BROWSER and HAVE_DOT tags are set to YES then doxygen will
+# generate a call dependency graph for every global function or class method.
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH, SOURCE_BROWSER and HAVE_DOT tags are set to YES then doxygen will
+# generate a caller dependency graph for every global function or class method.
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will show a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, or gif
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the number
+# of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
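+# (Illustration only: a larger cap such as DOT_GRAPH_MAX_NODES = 200 would
+# allow bigger graphs at the cost of readability and dot run time; the value
+# below is the one this configuration actually uses.)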
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, which results in a white background.
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+
+DOT_TRANSPARENT = YES
+
+# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to the search engine
+#---------------------------------------------------------------------------
+
+# The SEARCHENGINE tag specifies whether or not a search engine should be
+# used. If set to NO the values of all tags below this one will be ignored.
+
+SEARCHENGINE = NO
diff --git a/libs/libaom/src/mainpage.dox b/libs/libaom/src/mainpage.dox
new file mode 100644
index 000000000..03a299ae1
--- /dev/null
+++ b/libs/libaom/src/mainpage.dox
@@ -0,0 +1,52 @@
+/*!\mainpage AOMedia Codec SDK
+
+  \section main_contents Page Contents
+  - \ref main_intro
+  - \ref main_startpoints
+  - \ref main_support
+
+  \section main_intro Introduction
+  Welcome to the AOMedia Codec SDK. This SDK allows you to integrate your
+  applications with the AOM and AV1 video codecs.
+
+  This distribution of the AOMedia Codec SDK includes the following support:
+
+  \if aom_encoder
+  - \ref aom_encoder
+  \endif
+  \if aom_decoder
+  - \ref aom_decoder
+  \endif
+
+
+  \section main_startpoints Starting Points
+  - Consult the \ref changelog for a complete list of improvements in this
+    release.
+  - \ref readme contains instructions on compiling the sample applications.
+  - Read the \ref usage "usage" for a narrative on codec usage.
+  - Read the \ref samples "sample code" for examples of how to interact with the
+    codec.
+  - \ref codec reference
+  \if encoder
+  - \ref encoder reference
+  \endif
+  \if decoder
+  - \ref decoder reference
+  \endif
+
+  \section main_support Support Options & FAQ
+  The AOMedia project is an open source project supported by its community. For
+  questions about this SDK, please mail the apps-devel@webmproject.org list.
+  To contribute, see http://www.webmproject.org/code/contribute and mail
+  codec-devel@webmproject.org.
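+
+  \section main_example Illustrative Decode Sketch
+  As a quick orientation, the basic decode call sequence looks roughly like
+  the fragment below (error handling omitted; frame_data and frame_size are
+  placeholders, and the \ref decoder reference remains the authoritative
+  description of the API):
+  \code
+  aom_codec_ctx_t codec;
+  aom_codec_dec_init(&codec, aom_codec_av1_dx(), NULL, 0);
+  aom_codec_decode(&codec, frame_data, frame_size, NULL);
+  aom_image_t *img;
+  aom_codec_iter_t iter = NULL;
+  while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
+    /* consume the decoded frame in img */
+  }
+  aom_codec_destroy(&codec);
+  \endcode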
+*/
+
+/*!\page changelog CHANGELOG
+  \verbinclude CHANGELOG
+*/
+
+/*!\page readme README.md
+  \include README.md
+*/
+
+/*!\defgroup codecs Supported Codecs */
diff --git a/libs/libaom/src/stats/aomstats.c b/libs/libaom/src/stats/aomstats.c
new file mode 100644
index 000000000..4a15adf02
--- /dev/null
+++ b/libs/libaom/src/stats/aomstats.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "stats/aomstats.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/tools_common.h"
+
+int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
+  int res;
+  stats->pass = pass;
+
+  if (pass == 0) {
+    stats->file = fopen(fpf, "wb");
+    stats->buf.sz = 0;
+    stats->buf.buf = NULL;
+    res = (stats->file != NULL);
+  } else {
+    size_t nbytes;
+
+    stats->file = fopen(fpf, "rb");
+
+    if (stats->file == NULL) fatal("First-pass stats file does not exist!");
+
+    if (fseek(stats->file, 0, SEEK_END))
+      fatal("First-pass stats file must be seekable!");
+
+    stats->buf.sz = stats->buf_alloc_sz = ftell(stats->file);
+    rewind(stats->file);
+
+    stats->buf.buf = malloc(stats->buf_alloc_sz);
+
+    if (!stats->buf.buf)
+      fatal("Failed to allocate first-pass stats buffer (%lu bytes)",
+            (unsigned long)stats->buf_alloc_sz);
+
+    nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
+    res = (nbytes == stats->buf.sz);
+  }
+
+  return res;
+}
+
+int stats_open_mem(stats_io_t *stats, int pass) {
+  int res;
+  stats->pass = pass;
+
+  if (!pass) {
+    stats->buf.sz = 0;
+    stats->buf_alloc_sz = 64 * 1024;
+    stats->buf.buf = malloc(stats->buf_alloc_sz);
+  }
+
+  stats->buf_ptr = stats->buf.buf;
+  res = (stats->buf.buf != NULL);
+  return res;
+}
+
+void stats_close(stats_io_t *stats, int last_pass) {
+  if (stats->file) {
+    if (stats->pass == last_pass) {
+      free(stats->buf.buf);
+    }
+
+    fclose(stats->file);
+    stats->file = NULL;
+  } else {
+    if (stats->pass == last_pass) free(stats->buf.buf);
+  }
+}
+
+void stats_write(stats_io_t *stats, const void *pkt, size_t len) {
+  if (stats->file) {
+    (void)fwrite(pkt, 1, len, stats->file);
+  } else {
+    if (stats->buf.sz + len > stats->buf_alloc_sz) {
+      size_t new_sz = stats->buf_alloc_sz + 64 * 1024;
+      char *new_ptr = realloc(stats->buf.buf, new_sz);
+
+      if (new_ptr) {
+        stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf);
+        stats->buf.buf = new_ptr;
+        stats->buf_alloc_sz = new_sz;
+      } else {
+        fatal("Failed to realloc firstpass stats buffer.");
+      }
+    }
+
+    memcpy(stats->buf_ptr, pkt, len);
+    stats->buf.sz += len;
+    stats->buf_ptr += len;
+  }
+}
+
+aom_fixed_buf_t stats_get(stats_io_t *stats) { return stats->buf; }
diff --git a/libs/libaom/src/stats/aomstats.h b/libs/libaom/src/stats/aomstats.h
new file mode 100644
index 000000000..b9c71871a
--- /dev/null
+++ b/libs/libaom/src/stats/aomstats.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_STATS_AOMSTATS_H_
+#define AOM_STATS_AOMSTATS_H_
+
+#include <stdio.h>
+
+#include "aom/aom_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* This structure is used to abstract the different ways of handling
+ * first pass statistics
+ */
+typedef struct {
+  aom_fixed_buf_t buf;
+  int pass;
+  FILE *file;
+  char *buf_ptr;
+  size_t buf_alloc_sz;
+} stats_io_t;
+
+int stats_open_file(stats_io_t *stats, const char *fpf, int pass);
+int stats_open_mem(stats_io_t *stats, int pass);
+void stats_close(stats_io_t *stats, int last_pass);
+void stats_write(stats_io_t *stats, const void *pkt, size_t len);
+aom_fixed_buf_t stats_get(stats_io_t *stats);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_STATS_AOMSTATS_H_
diff --git a/libs/libaom/src/stats/rate_hist.c b/libs/libaom/src/stats/rate_hist.c
new file mode 100644
index 000000000..71eb78b72
--- /dev/null
+++ b/libs/libaom/src/stats/rate_hist.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "stats/rate_hist.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define RATE_BINS 100
+#define HIST_BAR_MAX 40
+
+struct hist_bucket {
+  int low;
+  int high;
+  int count;
+};
+
+struct rate_hist {
+  int64_t *pts;
+  int *sz;
+  int samples;
+  int frames;
+  struct hist_bucket bucket[RATE_BINS];
+  int total;
+};
+
+struct rate_hist *init_rate_histogram(const aom_codec_enc_cfg_t *cfg,
+                                      const aom_rational_t *fps) {
+  int i;
+  struct rate_hist *hist = malloc(sizeof(*hist));
+
+  // Determine the number of samples in the buffer.
Use the file's framerate + // to determine the number of frames in rc_buf_sz milliseconds, with an + // adjustment (5/4) to account for alt-refs + hist->samples = cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000; + + // prevent division by zero + if (hist->samples == 0) hist->samples = 1; + + hist->frames = 0; + hist->total = 0; + + hist->pts = calloc(hist->samples, sizeof(*hist->pts)); + hist->sz = calloc(hist->samples, sizeof(*hist->sz)); + for (i = 0; i < RATE_BINS; i++) { + hist->bucket[i].low = INT_MAX; + hist->bucket[i].high = 0; + hist->bucket[i].count = 0; + } + + return hist; +} + +void destroy_rate_histogram(struct rate_hist *hist) { + if (hist) { + free(hist->pts); + free(hist->sz); + free(hist); + } +} + +void update_rate_histogram(struct rate_hist *hist, + const aom_codec_enc_cfg_t *cfg, + const aom_codec_cx_pkt_t *pkt) { + int i; + int64_t then = 0; + int64_t avg_bitrate = 0; + int64_t sum_sz = 0; + const int64_t now = pkt->data.frame.pts * 1000 * + (uint64_t)cfg->g_timebase.num / + (uint64_t)cfg->g_timebase.den; + + int idx = hist->frames++ % hist->samples; + hist->pts[idx] = now; + hist->sz[idx] = (int)pkt->data.frame.sz; + + if (now < cfg->rc_buf_initial_sz) return; + + if (!cfg->rc_target_bitrate) return; + + then = now; + + /* Sum the size over the past rc_buf_sz ms */ + for (i = hist->frames; i > 0 && hist->frames - i < hist->samples; i--) { + const int i_idx = (i - 1) % hist->samples; + + then = hist->pts[i_idx]; + if (now - then > cfg->rc_buf_sz) break; + sum_sz += hist->sz[i_idx]; + } + + if (now == then) return; + + avg_bitrate = sum_sz * 8 * 1000 / (now - then); + idx = (int)(avg_bitrate * (RATE_BINS / 2) / (cfg->rc_target_bitrate * 1000)); + if (idx < 0) idx = 0; + if (idx > RATE_BINS - 1) idx = RATE_BINS - 1; + if (hist->bucket[idx].low > avg_bitrate) + hist->bucket[idx].low = (int)avg_bitrate; + if (hist->bucket[idx].high < avg_bitrate) + hist->bucket[idx].high = (int)avg_bitrate; + hist->bucket[idx].count++; + hist->total++; +} + +static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets, + int *num_buckets) { + int small_bucket = 0, merge_bucket = INT_MAX, big_bucket = 0; + int buckets = *num_buckets; + int i; + + /* Find the extrema for this list of buckets */ + big_bucket = small_bucket = 0; + for (i = 0; i < buckets; i++) { + if (bucket[i].count < bucket[small_bucket].count) small_bucket = i; + if (bucket[i].count > bucket[big_bucket].count) big_bucket = i; + } + + /* If we have too many buckets, merge the smallest with an adjacent + * bucket. + */ + while (buckets > max_buckets) { + int last_bucket = buckets - 1; + + /* merge the small bucket with an adjacent one. 
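Preferring the neighbor with the smaller count keeps the merged bucket as small as possible, so the shape of the histogram is distorted least.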
*/ + if (small_bucket == 0) + merge_bucket = 1; + else if (small_bucket == last_bucket) + merge_bucket = last_bucket - 1; + else if (bucket[small_bucket - 1].count < bucket[small_bucket + 1].count) + merge_bucket = small_bucket - 1; + else + merge_bucket = small_bucket + 1; + + assert(abs(merge_bucket - small_bucket) <= 1); + assert(small_bucket < buckets); + assert(big_bucket < buckets); + assert(merge_bucket < buckets); + + if (merge_bucket < small_bucket) { + bucket[merge_bucket].high = bucket[small_bucket].high; + bucket[merge_bucket].count += bucket[small_bucket].count; + } else { + bucket[small_bucket].high = bucket[merge_bucket].high; + bucket[small_bucket].count += bucket[merge_bucket].count; + merge_bucket = small_bucket; + } + + assert(bucket[merge_bucket].low != bucket[merge_bucket].high); + + buckets--; + + /* Remove the merge_bucket from the list, and find the new small + * and big buckets while we're at it + */ + big_bucket = small_bucket = 0; + for (i = 0; i < buckets; i++) { + if (i > merge_bucket) bucket[i] = bucket[i + 1]; + + if (bucket[i].count < bucket[small_bucket].count) small_bucket = i; + if (bucket[i].count > bucket[big_bucket].count) big_bucket = i; + } + } + + *num_buckets = buckets; + return bucket[big_bucket].count; +} + +static void show_histogram(const struct hist_bucket *bucket, int buckets, + int total, int scale) { + const char *pat1, *pat2; + int i; + + switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) { + case 1: + case 2: + pat1 = "%4d %2s: "; + pat2 = "%4d-%2d: "; + break; + case 3: + pat1 = "%5d %3s: "; + pat2 = "%5d-%3d: "; + break; + case 4: + pat1 = "%6d %4s: "; + pat2 = "%6d-%4d: "; + break; + case 5: + pat1 = "%7d %5s: "; + pat2 = "%7d-%5d: "; + break; + case 6: + pat1 = "%8d %6s: "; + pat2 = "%8d-%6d: "; + break; + case 7: + pat1 = "%9d %7s: "; + pat2 = "%9d-%7d: "; + break; + default: + pat1 = "%12d %10s: "; + pat2 = "%12d-%10d: "; + break; + } + + for (i = 0; i < buckets; i++) { + int len; + int j; + float pct; + + pct = (float)(100.0 * bucket[i].count / total); + len = HIST_BAR_MAX * bucket[i].count / scale; + if (len < 1) len = 1; + assert(len <= HIST_BAR_MAX); + + if (bucket[i].low == bucket[i].high) + fprintf(stderr, pat1, bucket[i].low, ""); + else + fprintf(stderr, pat2, bucket[i].low, bucket[i].high); + + for (j = 0; j < HIST_BAR_MAX; j++) fprintf(stderr, j < len ? 
"=" : " "); + fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct); + } +} + +void show_q_histogram(const int counts[64], int max_buckets) { + struct hist_bucket bucket[64]; + int buckets = 0; + int total = 0; + int scale; + int i; + + for (i = 0; i < 64; i++) { + if (counts[i]) { + bucket[buckets].low = bucket[buckets].high = i; + bucket[buckets].count = counts[i]; + buckets++; + total += counts[i]; + } + } + + fprintf(stderr, "\nQuantizer Selection:\n"); + scale = merge_hist_buckets(bucket, max_buckets, &buckets); + show_histogram(bucket, buckets, total, scale); +} + +void show_rate_histogram(struct rate_hist *hist, const aom_codec_enc_cfg_t *cfg, + int max_buckets) { + int i, scale; + int buckets = 0; + + for (i = 0; i < RATE_BINS; i++) { + if (hist->bucket[i].low == INT_MAX) continue; + hist->bucket[buckets++] = hist->bucket[i]; + } + + fprintf(stderr, "\nRate (over %dms window):\n", cfg->rc_buf_sz); + scale = merge_hist_buckets(hist->bucket, max_buckets, &buckets); + show_histogram(hist->bucket, buckets, hist->total, scale); +} diff --git a/libs/libaom/src/stats/rate_hist.h b/libs/libaom/src/stats/rate_hist.h new file mode 100644 index 000000000..55b8c5d43 --- /dev/null +++ b/libs/libaom/src/stats/rate_hist.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_STATS_RATE_HIST_H_ +#define AOM_STATS_RATE_HIST_H_ + +#include "aom/aom_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rate_hist; + +struct rate_hist *init_rate_histogram(const aom_codec_enc_cfg_t *cfg, + const aom_rational_t *fps); + +void destroy_rate_histogram(struct rate_hist *hist); + +void update_rate_histogram(struct rate_hist *hist, + const aom_codec_enc_cfg_t *cfg, + const aom_codec_cx_pkt_t *pkt); + +void show_q_histogram(const int counts[64], int max_buckets); + +void show_rate_histogram(struct rate_hist *hist, const aom_codec_enc_cfg_t *cfg, + int max_buckets); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_STATS_RATE_HIST_H_ diff --git a/libs/libaom/src/test/accounting_test.cc b/libs/libaom/src/test/accounting_test.cc new file mode 100644 index 000000000..8b5c8af13 --- /dev/null +++ b/libs/libaom/src/test/accounting_test.cc @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitwriter.h"
+
+using libaom_test::ACMRandom;
+
+TEST(AV1, TestAccounting) {
+  const int kBufferSize = 10000;
+  const int kSymbols = 1024;
+  aom_writer bw;
+  uint8_t bw_buffer[kBufferSize];
+  aom_start_encode(&bw, bw_buffer);
+  for (int i = 0; i < kSymbols; i++) {
+    aom_write(&bw, 0, 32);
+    aom_write(&bw, 0, 32);
+    aom_write(&bw, 0, 32);
+  }
+  aom_stop_encode(&bw);
+  aom_reader br;
+  aom_reader_init(&br, bw_buffer, bw.pos);
+
+  Accounting accounting;
+  aom_accounting_init(&accounting);
+  br.accounting = &accounting;
+  for (int i = 0; i < kSymbols; i++) {
+    aom_read(&br, 32, "A");
+  }
+  // Consecutive symbols that are the same are coalesced.
+  GTEST_ASSERT_EQ(accounting.syms.num_syms, 1);
+  GTEST_ASSERT_EQ(accounting.syms.syms[0].samples, (unsigned int)kSymbols);
+
+  aom_accounting_reset(&accounting);
+  GTEST_ASSERT_EQ(accounting.syms.num_syms, 0);
+
+  // Should record 2 * kSymbols accounting symbols.
+  aom_reader_init(&br, bw_buffer, bw.pos);
+  br.accounting = &accounting;
+  for (int i = 0; i < kSymbols; i++) {
+    aom_read(&br, 32, "A");
+    aom_read(&br, 32, "B");
+    aom_read(&br, 32, "B");
+  }
+  GTEST_ASSERT_EQ(accounting.syms.num_syms, kSymbols * 2);
+  uint32_t tell_frac = aom_reader_tell_frac(&br);
+  for (int i = 0; i < accounting.syms.num_syms; i++) {
+    tell_frac -= accounting.syms.syms[i].bits;
+  }
+  GTEST_ASSERT_EQ(tell_frac, 0U);
+
+  GTEST_ASSERT_EQ(aom_accounting_dictionary_lookup(&accounting, "A"),
+                  aom_accounting_dictionary_lookup(&accounting, "A"));
+
+  // Check for collisions. The current aom_accounting_hash function returns
+  // the same hash code for AB and BA.
+  GTEST_ASSERT_NE(aom_accounting_dictionary_lookup(&accounting, "AB"),
+                  aom_accounting_dictionary_lookup(&accounting, "BA"));
+}
diff --git a/libs/libaom/src/test/acm_random.h b/libs/libaom/src/test/acm_random.h
new file mode 100644
index 000000000..8b1d51aef
--- /dev/null
+++ b/libs/libaom/src/test/acm_random.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_ACM_RANDOM_H_
+#define AOM_TEST_ACM_RANDOM_H_
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom/aom_integer.h"
+
+namespace libaom_test {
+
+class ACMRandom {
+ public:
+  ACMRandom() : random_(DeterministicSeed()) {}
+
+  explicit ACMRandom(int seed) : random_(seed) {}
+
+  void Reset(int seed) { random_.Reseed(seed); }
+
+  // Generates a random 31-bit unsigned integer from [0, 2^31).
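+  // (testing::internal::Random::kMaxRange is 1u << 31, so the Generate()
+  // call below already spans the full 31-bit range.)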
+  uint32_t Rand31(void) {
+    return random_.Generate(testing::internal::Random::kMaxRange);
+  }
+
+  uint16_t Rand16(void) {
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    return (value >> 15) & 0xffff;
+  }
+
+  int16_t Rand15Signed(void) {
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    return (value >> 17) & 0xffff;
+  }
+
+  uint16_t Rand12(void) {
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    // There's a bit more entropy in the upper bits of this implementation.
+    return (value >> 19) & 0xfff;
+  }
+
+  int16_t Rand9Signed(void) {
+    // Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
+    const uint32_t value = random_.Generate(512);
+    return static_cast<int16_t>(value) - 256;
+  }
+
+  uint8_t Rand8(void) {
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    // There's a bit more entropy in the upper bits of this implementation.
+    return (value >> 23) & 0xff;
+  }
+
+  uint8_t Rand8Extremes(void) {
+    // Returns a random value near 0 or near 255, to better exercise
+    // saturation behavior.
+    const uint8_t r = Rand8();
+    return static_cast<uint8_t>((r < 128) ? r << 4 : r >> 4);
+  }
+
+  int PseudoUniform(int range) { return random_.Generate(range); }
+
+  int operator()(int n) { return PseudoUniform(n); }
+
+  static int DeterministicSeed(void) { return 0xbaba; }
+
+ private:
+  testing::internal::Random random_;
+};
+
+}  // namespace libaom_test
+
+#endif  // AOM_TEST_ACM_RANDOM_H_
diff --git a/libs/libaom/src/test/active_map_test.cc b/libs/libaom/src/test/active_map_test.cc
new file mode 100644
index 000000000..0f8a7329e
--- /dev/null
+++ b/libs/libaom/src/test/active_map_test.cc
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class ActiveMapTest
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  static const int kWidth = 208;
+  static const int kHeight = 144;
+
+  ActiveMapTest() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~ActiveMapTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    cpu_used_ = GET_PARAM(2);
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+    } else if (video->frame() == 3) {
+      aom_active_map_t map = aom_active_map_t();
+      /* clang-format off */
+      uint8_t active_map[9 * 13] = {
+        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+        0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
+        0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
+        0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
+        0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
+        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
+      };
+      /* clang-format on */
+      map.cols = (kWidth + 15) / 16;
+      map.rows = (kHeight + 15) / 16;
+      ASSERT_EQ(map.cols, 13u);
+      ASSERT_EQ(map.rows, 9u);
+      map.active_map = active_map;
+      encoder->Control(AOME_SET_ACTIVEMAP, &map);
+    } else if (video->frame() == 15) {
+      aom_active_map_t map = aom_active_map_t();
+      map.cols = (kWidth + 15) / 16;
+      map.rows = (kHeight + 15) / 16;
+      map.active_map = NULL;
+      encoder->Control(AOME_SET_ACTIVEMAP, &map);
+    }
+  }
+
+  void DoTest() {
+    // Validate that this non multiple of 64 wide clip encodes
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_target_bitrate = 400;
+    cfg_.rc_resize_mode = 0;
+    cfg_.g_pass = AOM_RC_ONE_PASS;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.kf_max_dist = 90000;
+    ::libaom_test::I420VideoSource video("hantro_odd.yuv", kWidth, kHeight, 30,
+                                         1, 0, 20);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  int cpu_used_;
+};
+
+TEST_P(ActiveMapTest, Test) { DoTest(); }
+
+class ActiveMapTestLarge : public ActiveMapTest {};
+
+TEST_P(ActiveMapTestLarge, Test) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(ActiveMapTestLarge,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(0, 5));
+
+AV1_INSTANTIATE_TEST_CASE(ActiveMapTest,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(5, 9));
+
+}  // namespace
diff --git a/libs/libaom/src/test/altref_test.cc b/libs/libaom/src/test/altref_test.cc
new file mode 100644
index 000000000..43df39fb6
--- /dev/null
+++ b/libs/libaom/src/test/altref_test.cc
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+namespace {
+
+class AltRefForcedKeyTestLarge
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  AltRefForcedKeyTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), forced_kf_frame_num_(1), frame_num_(0) {}
+  virtual ~AltRefForcedKeyTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.g_threads = 0;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+#if CONFIG_AV1_ENCODER
+      // override test default for tile columns if necessary.
+      if (GET_PARAM(0) == &libaom_test::kAV1) {
+        encoder->Control(AV1E_SET_TILE_COLUMNS, 6);
+      }
+#endif
+    }
+    frame_flags_ =
+        (video->frame() == forced_kf_frame_num_) ? AOM_EFLAG_FORCE_KF : 0;
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    if (frame_num_ == forced_kf_frame_num_) {
+      ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY,
+                static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_KEY))
+          << "Frame #" << frame_num_ << " isn't a keyframe!";
+    }
+    ++frame_num_;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  int cpu_used_;
+  unsigned int forced_kf_frame_num_;
+  unsigned int frame_num_;
+};
+
+TEST_P(AltRefForcedKeyTestLarge, Frame1IsKey) {
+  const aom_rational timebase = { 1, 30 };
+  const int lag_values[] = { 3, 15, 25, -1 };
+
+  forced_kf_frame_num_ = 1;
+  for (int i = 0; lag_values[i] != -1; ++i) {
+    frame_num_ = 0;
+    cfg_.g_lag_in_frames = lag_values[i];
+    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       timebase.den, timebase.num, 0, 30);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+}
+
+TEST_P(AltRefForcedKeyTestLarge, ForcedFrameIsKey) {
+  const aom_rational timebase = { 1, 30 };
+  const int lag_values[] = { 3, 15, 25, -1 };
+
+  for (int i = 0; lag_values[i] != -1; ++i) {
+    frame_num_ = 0;
+    forced_kf_frame_num_ = lag_values[i] - 1;
+    cfg_.g_lag_in_frames = lag_values[i];
+    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       timebase.den, timebase.num, 0, 30);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+}
+
+AV1_INSTANTIATE_TEST_CASE(AltRefForcedKeyTestLarge,
+                          ::testing::Values(::libaom_test::kOnePassGood),
+                          ::testing::Values(2, 5));
+
+}  // namespace
diff --git a/libs/libaom/src/test/aom_integer_test.cc b/libs/libaom/src/test/aom_integer_test.cc
new file mode 100644
index 000000000..d5dfad946
--- /dev/null
+++ b/libs/libaom/src/test/aom_integer_test.cc
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "aom/aom_integer.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { +const uint64_t kMaximumLeb128CodedSize = 8; +const uint8_t kLeb128PadByte = 0x80; // Binary: 10000000 +const uint64_t kMaximumLeb128Value = UINT32_MAX; +const uint32_t kSizeTestNumValues = 6; +const uint32_t kSizeTestExpectedSizes[kSizeTestNumValues] = { + 1, 1, 2, 3, 4, 5 +}; +const uint64_t kSizeTestInputs[kSizeTestNumValues] = { 0, 0x7f, + 0x3fff, 0x1fffff, + 0xffffff, 0x10000000 }; + +const uint8_t kOutOfRangeLeb128Value[5] = { 0x80, 0x80, 0x80, 0x80, + 0x10 }; // UINT32_MAX + 1 +} // namespace + +TEST(AomLeb128, DecodeTest) { + const size_t num_leb128_bytes = 3; + const uint8_t leb128_bytes[num_leb128_bytes] = { 0xE5, 0x8E, 0x26 }; + const uint64_t expected_value = 0x98765; // 624485 + const size_t expected_length = 3; + uint64_t value = ~0ULL; // make sure value is cleared by the function + size_t length; + ASSERT_EQ( + aom_uleb_decode(&leb128_bytes[0], num_leb128_bytes, &value, &length), 0); + ASSERT_EQ(expected_value, value); + ASSERT_EQ(expected_length, length); + + // Make sure the decoder stops on the last marked LEB128 byte. + aom_uleb_decode(&leb128_bytes[0], num_leb128_bytes + 1, &value, &length); + ASSERT_EQ(expected_value, value); + ASSERT_EQ(expected_length, length); +} + +TEST(AomLeb128, EncodeTest) { + const uint32_t test_value = 0x98765; // 624485 + const uint8_t expected_bytes[3] = { 0xE5, 0x8E, 0x26 }; + const size_t kWriteBufferSize = 4; + uint8_t write_buffer[kWriteBufferSize] = { 0 }; + size_t bytes_written = 0; + ASSERT_EQ(aom_uleb_encode(test_value, kWriteBufferSize, &write_buffer[0], + &bytes_written), + 0); + ASSERT_EQ(bytes_written, 3u); + for (size_t i = 0; i < bytes_written; ++i) { + ASSERT_EQ(write_buffer[i], expected_bytes[i]); + } +} + +TEST(AomLeb128, EncodeDecodeTest) { + const uint32_t value = 0x98765; // 624485 + const size_t kWriteBufferSize = 4; + uint8_t write_buffer[kWriteBufferSize] = { 0 }; + size_t bytes_written = 0; + ASSERT_EQ(aom_uleb_encode(value, kWriteBufferSize, &write_buffer[0], + &bytes_written), + 0); + ASSERT_EQ(bytes_written, 3u); + uint64_t decoded_value; + size_t decoded_length; + aom_uleb_decode(&write_buffer[0], bytes_written, &decoded_value, + &decoded_length); + ASSERT_EQ(value, decoded_value); + ASSERT_EQ(bytes_written, decoded_length); +} + +TEST(AomLeb128, FixedSizeEncodeTest) { + const uint32_t test_value = 0x123; + const uint8_t expected_bytes[4] = { 0xa3, 0x82, 0x80, 0x00 }; + const size_t kWriteBufferSize = 4; + uint8_t write_buffer[kWriteBufferSize] = { 0 }; + size_t bytes_written = 0; + ASSERT_EQ(0, aom_uleb_encode_fixed_size(test_value, kWriteBufferSize, + kWriteBufferSize, &write_buffer[0], + &bytes_written)); + ASSERT_EQ(kWriteBufferSize, bytes_written); + for (size_t i = 0; i < bytes_written; ++i) { + ASSERT_EQ(write_buffer[i], expected_bytes[i]); + } +} + +TEST(AomLeb128, FixedSizeEncodeDecodeTest) { + const uint32_t value = 0x1; + const size_t kWriteBufferSize = 4; + uint8_t write_buffer[kWriteBufferSize] = { 0 }; + size_t bytes_written = 0; + ASSERT_EQ( + aom_uleb_encode_fixed_size(value, kWriteBufferSize, kWriteBufferSize, + &write_buffer[0], &bytes_written), + 0); + ASSERT_EQ(bytes_written, 4u); + uint64_t decoded_value; + size_t decoded_length; + aom_uleb_decode(&write_buffer[0], bytes_written, &decoded_value, + &decoded_length); + ASSERT_EQ(value, decoded_value); + ASSERT_EQ(bytes_written, decoded_length); +} + +TEST(AomLeb128, SizeTest) { + for (size_t i = 0; i < kSizeTestNumValues; 
++i) { + ASSERT_EQ(kSizeTestExpectedSizes[i], + aom_uleb_size_in_bytes(kSizeTestInputs[i])); + } +} + +TEST(AomLeb128, DecodeFailTest) { + // Input buffer containing what would be a valid 9 byte LEB128 encoded + // unsigned integer. + const uint8_t kAllPadBytesBuffer[kMaximumLeb128CodedSize + 1] = { + kLeb128PadByte, kLeb128PadByte, kLeb128PadByte, + kLeb128PadByte, kLeb128PadByte, kLeb128PadByte, + kLeb128PadByte, kLeb128PadByte, 0 + }; + uint64_t decoded_value; + + // Test that decode fails when result would be valid 9 byte integer. + ASSERT_EQ(aom_uleb_decode(&kAllPadBytesBuffer[0], kMaximumLeb128CodedSize + 1, + &decoded_value, NULL), + -1); + + // Test that encoded value missing terminator byte within available buffer + // range causes decode error. + ASSERT_EQ(aom_uleb_decode(&kAllPadBytesBuffer[0], kMaximumLeb128CodedSize, + &decoded_value, NULL), + -1); + + // Test that LEB128 input that decodes to a value larger than 32-bits fails. + size_t value_size = 0; + ASSERT_EQ(aom_uleb_decode(&kOutOfRangeLeb128Value[0], + sizeof(kOutOfRangeLeb128Value), &decoded_value, + &value_size), + -1); +} + +TEST(AomLeb128, EncodeFailTest) { + const size_t kWriteBufferSize = 4; + const uint32_t kValidTestValue = 1; + uint8_t write_buffer[kWriteBufferSize] = { 0 }; + size_t coded_size = 0; + ASSERT_EQ( + aom_uleb_encode(kValidTestValue, kWriteBufferSize, NULL, &coded_size), + -1); + ASSERT_EQ(aom_uleb_encode(kValidTestValue, kWriteBufferSize, &write_buffer[0], + NULL), + -1); + + const uint32_t kValueOutOfRangeForBuffer = 0xFFFFFFFF; + ASSERT_EQ(aom_uleb_encode(kValueOutOfRangeForBuffer, kWriteBufferSize, + &write_buffer[0], &coded_size), + -1); + + const uint64_t kValueOutOfRange = kMaximumLeb128Value + 1; + ASSERT_EQ(aom_uleb_encode(kValueOutOfRange, kWriteBufferSize, + &write_buffer[0], &coded_size), + -1); + + const size_t kPadSizeOutOfRange = 5; + ASSERT_EQ(aom_uleb_encode_fixed_size(kValidTestValue, kWriteBufferSize, + kPadSizeOutOfRange, &write_buffer[0], + &coded_size), + -1); +} diff --git a/libs/libaom/src/test/aomcx_set_ref.sh b/libs/libaom/src/test/aomcx_set_ref.sh new file mode 100644 index 000000000..f51b73c58 --- /dev/null +++ b/libs/libaom/src/test/aomcx_set_ref.sh @@ -0,0 +1,58 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the libaom aom_cx_set_ref example. To add new tests to this +## file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to aom_cx_set_ref_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: $YUV_RAW_INPUT is required. +aom_cx_set_ref_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH." + return 1 + fi +} + +# Runs aom_cx_set_ref and updates the reference frame before encoding frame 90. +# $1 is the codec name, which aom_cx_set_ref does not support at present: It's +# currently used only to name the output file. +# TODO(tomfinegan): Pass the codec param once the example is updated to support +# AV1. 
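+# Illustrative call (the test below invokes it the same way):
+#   aom_set_ref av1
+# which encodes ${YUV_RAW_INPUT} and writes
+# ${AOM_TEST_OUTPUT_DIR}/aom_cx_set_ref_av1.ivf.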
+aom_set_ref() {
+  local encoder="${LIBAOM_BIN_PATH}/aom_cx_set_ref${AOM_TEST_EXE_SUFFIX}"
+  local codec="$1"
+  local output_file="${AOM_TEST_OUTPUT_DIR}/aom_cx_set_ref_${codec}.ivf"
+  local ref_frame_num=4
+  local limit=10
+  if [ ! -x "${encoder}" ]; then
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
+      "${ref_frame_num}" "${limit}" ${devnull}
+
+  [ -e "${output_file}" ] || return 1
+}
+
+aom_cx_set_ref_av1() {
+  if [ "$(av1_encode_available)" = "yes" ]; then
+    aom_set_ref av1 || return 1
+  fi
+}
+
+aom_cx_set_ref_tests="aom_cx_set_ref_av1"
+
+run_tests aom_cx_set_ref_verify_environment "${aom_cx_set_ref_tests}"
+
diff --git a/libs/libaom/src/test/aomdec.sh b/libs/libaom/src/test/aomdec.sh
new file mode 100644
index 000000000..927142287
--- /dev/null
+++ b/libs/libaom/src/test/aomdec.sh
@@ -0,0 +1,147 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests aomdec. To add new tests to this file, do the following:
+##   1. Write a shell function (this is your test).
+##   2. Add the function to aomdec_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: Make sure input is available.
+aomdec_verify_environment() {
+  if [ "$(av1_encode_available)" != "yes" ] ; then
+    if [ ! -e "${AV1_IVF_FILE}" ] || \
+       [ ! -e "${AV1_OBU_ANNEXB_FILE}" ] || \
+       [ ! -e "${AV1_OBU_SEC5_FILE}" ] || \
+       [ ! -e "${AV1_WEBM_FILE}" ]; then
+      elog "Libaom test data must exist before running this test script when " \
+           " encoding is disabled. "
+      return 1
+    fi
+  fi
+  if [ -z "$(aom_tool_path aomdec)" ]; then
+    elog "aomdec not found. It must exist in LIBAOM_BIN_PATH or its parent."
+    return 1
+  fi
+}
+
+# Wrapper function for running aomdec with pipe input. Requires that
+# LIBAOM_BIN_PATH points to the directory containing aomdec. $1 is used as the
+# input file path and shifted away. All remaining parameters are passed through
+# to aomdec.
+aomdec_pipe() {
+  local input="$1"
+  shift
+  if [ ! -e "${input}" ]; then
+    elog "Input file ($input) missing in aomdec_pipe()"
+    return 1
+  fi
+  cat "${input}" | aomdec - "$@" ${devnull}
+}
+
+
+# Wrapper function for running aomdec. Requires that LIBAOM_BIN_PATH points to
+# the directory containing aomdec. $1 is used as the input file path and
+# shifted away. All remaining parameters are passed through to aomdec.
+aomdec() {
+  local decoder="$(aom_tool_path aomdec)"
+  local input="$1"
+  shift
+  eval "${AOM_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull}
+}
+
+aomdec_can_decode_av1() {
+  if [ "$(av1_decode_available)" = "yes" ]; then
+    echo yes
+  fi
+}
+
+aomdec_av1_ivf() {
+  if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+    local file="${AV1_IVF_FILE}"
+    if [ !
-e "${file}" ]; then + encode_yuv_raw_input_av1 "${file}" --ivf + fi + aomdec "${AV1_IVF_FILE}" --summary --noblit + fi +} + +aomdec_av1_ivf_error_resilient() { + if [ "$(aomdec_can_decode_av1)" = "yes" ]; then + local file="av1.error-resilient.ivf" + if [ ! -e "${file}" ]; then + encode_yuv_raw_input_av1 "${file}" --ivf --error-resilient=1 + fi + aomdec "${file}" --summary --noblit + fi +} + +aomdec_av1_ivf_multithread() { + if [ "$(aomdec_can_decode_av1)" = "yes" ]; then + local file="${AV1_IVF_FILE}" + if [ ! -e "${file}" ]; then + encode_yuv_raw_input_av1 "${file}" --ivf + fi + for threads in 2 3 4 5 6 7 8; do + aomdec "${file}" --summary --noblit --threads=$threads + done + fi +} + +aomdec_aom_ivf_pipe_input() { + if [ "$(aomdec_can_decode_av1)" = "yes" ]; then + local file="${AV1_IVF_FILE}" + if [ ! -e "${file}" ]; then + encode_yuv_raw_input_av1 "${file}" --ivf + fi + aomdec_pipe "${AV1_IVF_FILE}" --summary --noblit + fi +} + +aomdec_av1_obu_annexb() { + if [ "$(aomdec_can_decode_av1)" = "yes" ]; then + local file="${AV1_OBU_ANNEXB_FILE}" + if [ ! -e "${file}" ]; then + encode_yuv_raw_input_av1 "${file}" --obu --annexb=1 + fi + aomdec "${file}" --summary --noblit --annexb + fi +} + +aomdec_av1_obu_section5() { + if [ "$(aomdec_can_decode_av1)" = "yes" ]; then + local file="${AV1_OBU_SEC5_FILE}" + if [ ! -e "${file}" ]; then + encode_yuv_raw_input_av1 "${file}" --obu + fi + aomdec "${file}" --summary --noblit + fi +} + +aomdec_av1_webm() { + if [ "$(aomdec_can_decode_av1)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local file="${AV1_WEBM_FILE}" + if [ ! -e "${file}" ]; then + encode_yuv_raw_input_av1 "${file}" + fi + aomdec "${AV1_WEBM_FILE}" --summary --noblit + fi +} + +aomdec_tests="aomdec_av1_ivf + aomdec_av1_ivf_error_resilient + aomdec_av1_ivf_multithread + aomdec_aom_ivf_pipe_input + aomdec_av1_obu_annexb + aomdec_av1_obu_section5 + aomdec_av1_webm" + +run_tests aomdec_verify_environment "${aomdec_tests}" diff --git a/libs/libaom/src/test/aomenc.sh b/libs/libaom/src/test/aomenc.sh new file mode 100644 index 000000000..b030397a3 --- /dev/null +++ b/libs/libaom/src/test/aomenc.sh @@ -0,0 +1,269 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests aomenc using hantro_collage_w352h288.yuv as input. To add +## new tests to this file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to aomenc_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: Make sure input is available. +aomenc_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + elog "The file ${YUV_RAW_INPUT##*/} must exist in LIBAOM_TEST_DATA_PATH." + return 1 + fi + if [ "$(aomenc_can_encode_av1)" = "yes" ]; then + if [ ! -e "${Y4M_NOSQ_PAR_INPUT}" ]; then + elog "The file ${Y4M_NOSQ_PAR_INPUT##*/} must exist in" + elog "LIBAOM_TEST_DATA_PATH." + return 1 + fi + fi + if [ -z "$(aom_tool_path aomenc)" ]; then + elog "aomenc not found. It must exist in LIBAOM_BIN_PATH or its parent." 
+ return 1 + fi +} + +aomenc_can_encode_av1() { + if [ "$(av1_encode_available)" = "yes" ]; then + echo yes + fi +} + +# Utilities that echo aomenc input file parameters. +y4m_input_non_square_par() { + echo "${Y4M_NOSQ_PAR_INPUT}" +} + +y4m_input_720p() { + echo "${Y4M_720P_INPUT}" +} + +# Wrapper function for running aomenc with pipe input. Requires that +# LIBAOM_BIN_PATH points to the directory containing aomenc. $1 is used as the +# input file path and shifted away. All remaining parameters are passed through +# to aomenc. +aomenc_pipe() { + local encoder="$(aom_tool_path aomenc)" + local input="$1" + shift + cat "${input}" | eval "${AOM_TEST_PREFIX}" "${encoder}" - \ + --test-decode=fatal \ + "$@" ${devnull} +} + +# Wrapper function for running aomenc. Requires that LIBAOM_BIN_PATH points to +# the directory containing aomenc. $1 is used as the input file path and +# shifted away. All remaining parameters are passed through to aomenc. +aomenc() { + local encoder="$(aom_tool_path aomenc)" + local input="$1" + shift + eval "${AOM_TEST_PREFIX}" "${encoder}" "${input}" \ + --test-decode=fatal \ + "$@" ${devnull} +} + +aomenc_av1_ivf() { + if [ "$(aomenc_can_encode_av1)" = "yes" ]; then + local output="${AV1_IVF_FILE}" + if [ -e "${AV1_IVF_FILE}" ]; then + output="${AOM_TEST_OUTPUT_DIR}/av1_test.ivf" + fi + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --ivf \ + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +aomenc_av1_obu_annexb() { + if [ "$(aomenc_can_encode_av1)" = "yes" ]; then + local output="${AV1_OBU_ANNEXB_FILE}" + if [ -e "${AV1_OBU_ANNEXB_FILE}" ]; then + output="${AOM_TEST_OUTPUT_DIR}/av1_test.annexb.obu" + fi + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --obu \ + --annexb=1 \ + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +aomenc_av1_obu_section5() { + if [ "$(aomenc_can_encode_av1)" = "yes" ]; then + local output="${AV1_OBU_SEC5_FILE}" + if [ -e "${AV1_OBU_SEC5_FILE}" ]; then + output="${AOM_TEST_OUTPUT_DIR}/av1_test.section5.obu" + fi + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --obu \ + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +aomenc_av1_webm() { + if [ "$(aomenc_can_encode_av1)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local output="${AV1_WEBM_FILE}" + if [ -e "${AV1_WEBM_FILE}" ]; then + output="${AOM_TEST_OUTPUT_DIR}/av1_test.webm" + fi + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +aomenc_av1_webm_1pass() { + if [ "$(aomenc_can_encode_av1)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local output="${AOM_TEST_OUTPUT_DIR}/av1_test.webm" + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --passes=1 \ + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +aomenc_av1_ivf_lossless() { + if [ "$(aomenc_can_encode_av1)" = "yes" ]; then + local output="${AOM_TEST_OUTPUT_DIR}/av1_lossless.ivf" + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --ivf \ + --output="${output}" \ + --lossless=1 + + if [ !
-e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +aomenc_av1_ivf_minq0_maxq0() { + if [ "$(aomenc_can_encode_av1)" = "yes" ]; then + local output="${AOM_TEST_OUTPUT_DIR}/av1_lossless_minq0_maxq0.ivf" + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --ivf \ + --output="${output}" \ + --min-q=0 \ + --max-q=0 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +aomenc_av1_webm_lag5_frames10() { + if [ "$(aomenc_can_encode_av1)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local lag_total_frames=10 + local lag_frames=5 + local output="${AOM_TEST_OUTPUT_DIR}/av1_lag5_frames10.webm" + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --limit=${lag_total_frames} \ + --lag-in-frames=${lag_frames} \ + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +# TODO(fgalligan): Test that DisplayWidth is different than video width. +aomenc_av1_webm_non_square_par() { + if [ "$(aomenc_can_encode_av1)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local output="${AOM_TEST_OUTPUT_DIR}/av1_non_square_par.webm" + aomenc $(y4m_input_non_square_par) \ + $(aomenc_encode_test_fast_params) \ + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +aomenc_av1_webm_cdf_update_mode() { + if [ "$(aomenc_can_encode_av1)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + for mode in 0 1 2; do + local output="${AOM_TEST_OUTPUT_DIR}/cdf_mode_${mode}.webm" + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --cdf-update-mode=${mode} \ + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + done + fi +} + +aomenc_tests="aomenc_av1_ivf + aomenc_av1_obu_annexb + aomenc_av1_obu_section5 + aomenc_av1_webm + aomenc_av1_webm_1pass + aomenc_av1_ivf_lossless + aomenc_av1_ivf_minq0_maxq0 + aomenc_av1_webm_lag5_frames10 + aomenc_av1_webm_non_square_par + aomenc_av1_webm_cdf_update_mode" + +run_tests aomenc_verify_environment "${aomenc_tests}" diff --git a/libs/libaom/src/test/aq_segment_test.cc b/libs/libaom/src/test/aq_segment_test.cc new file mode 100644 index 000000000..83bfdb670 --- /dev/null +++ b/libs/libaom/src/test/aq_segment_test.cc @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_config.h" + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +class AqSegmentTest + : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int, int>, + public ::libaom_test::EncoderTest { + protected: + AqSegmentTest() : EncoderTest(GET_PARAM(0)) {} + virtual ~AqSegmentTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(GET_PARAM(1)); + set_cpu_used_ = GET_PARAM(2); + aq_mode_ = 0; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); + encoder->Control(AV1E_SET_AQ_MODE, aq_mode_); + encoder->Control(AV1E_SET_DELTAQ_MODE, deltaq_mode_); + encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100); + } + } + + void DoTest(int aq_mode) { + aq_mode_ = aq_mode; + deltaq_mode_ = 0; + cfg_.kf_max_dist = 12; + cfg_.rc_min_quantizer = 8; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 6; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_target_bitrate = 300; + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 15); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + int set_cpu_used_; + int aq_mode_; + int deltaq_mode_; +}; + +// Validate that this AQ segmentation mode (1-variance_aq, 2-complexity_aq, +// 3-cyclic_refresh_aq) encodes and decodes without a mismatch. +TEST_P(AqSegmentTest, TestNoMisMatch) { DoTest(GET_PARAM(3)); } + +class AqSegmentTestLarge : public AqSegmentTest {}; + +TEST_P(AqSegmentTestLarge, TestNoMisMatch) { DoTest(GET_PARAM(3)); } + +// Validate that this delta q mode encodes and decodes without a mismatch. +TEST_P(AqSegmentTest, TestNoMisMatchExtDeltaQ) { + cfg_.rc_end_usage = AOM_CQ; + aq_mode_ = 0; + deltaq_mode_ = 2; + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 15); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +AV1_INSTANTIATE_TEST_CASE(AqSegmentTest, + ::testing::Values(::libaom_test::kRealTime, + ::libaom_test::kOnePassGood), + ::testing::Range(5, 9), ::testing::Range(0, 4)); +AV1_INSTANTIATE_TEST_CASE(AqSegmentTestLarge, + ::testing::Values(::libaom_test::kRealTime, + ::libaom_test::kOnePassGood), + ::testing::Range(3, 5), ::testing::Range(0, 4)); +} // namespace diff --git a/libs/libaom/src/test/arf_freq_test.cc b/libs/libaom/src/test/arf_freq_test.cc new file mode 100644 index 000000000..0780cd712 --- /dev/null +++ b/libs/libaom/src/test/arf_freq_test.cc @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <memory> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "test/yuv_video_source.h" +#include "av1/encoder/ratectrl.h" + +namespace { + +const unsigned int kFrames = 100; +const int kBitrate = 500; + +#define ARF_NOT_SEEN 1000001 +#define ARF_SEEN_ONCE 1000000 + +typedef struct { + const char *filename; + unsigned int width; + unsigned int height; + unsigned int framerate_num; + unsigned int framerate_den; + unsigned int input_bit_depth; + aom_img_fmt fmt; + aom_bit_depth_t bit_depth; + unsigned int profile; +} TestVideoParam; + +typedef struct { + libaom_test::TestMode mode; + int cpu_used; +} TestEncodeParam; + +const TestVideoParam kTestVectors[] = { + // artificially increase framerate to trigger default check + { "hantro_collage_w352h288.yuv", 352, 288, 5000, 1, 8, AOM_IMG_FMT_I420, + AOM_BITS_8, 0 }, + { "hantro_collage_w352h288.yuv", 352, 288, 30, 1, 8, AOM_IMG_FMT_I420, + AOM_BITS_8, 0 }, + { "rush_hour_444.y4m", 352, 288, 30, 1, 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 }, + // Add list of profile 2/3 test videos here ... +}; + +const TestEncodeParam kEncodeVectors[] = { + { ::libaom_test::kOnePassGood, 2 }, { ::libaom_test::kOnePassGood, 5 }, + { ::libaom_test::kTwoPassGood, 1 }, { ::libaom_test::kTwoPassGood, 2 }, + { ::libaom_test::kTwoPassGood, 5 }, { ::libaom_test::kRealTime, 5 }, +}; + +const int kMinArfVectors[] = { + // NOTE: 0 refers to the default built-in logic in: + // av1_rc_get_default_min_gf_interval(...) + 0, 4, 8, 12, 15 +}; + +int is_extension_y4m(const char *filename) { + const char *dot = strrchr(filename, '.'); + if (!dot || dot == filename) + return 0; + else + return !strcmp(dot, ".y4m"); +} + +class ArfFreqTestLarge + : public ::libaom_test::CodecTestWith3Params<TestVideoParam, TestEncodeParam, int>, + public ::libaom_test::EncoderTest { + protected: + ArfFreqTestLarge() + : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)), + test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {} + + virtual ~ArfFreqTestLarge() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(test_encode_param_.mode); + if (test_encode_param_.mode != ::libaom_test::kRealTime) { + cfg_.g_lag_in_frames = 25; + cfg_.rc_end_usage = AOM_VBR; + } else { + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = AOM_CBR; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + } + } + + virtual void BeginPassHook(unsigned int) { + min_run_ = ARF_NOT_SEEN; + run_of_visible_frames_ = 0; + } + + int GetNumFramesInPkt(const aom_codec_cx_pkt_t *pkt) { + const uint8_t *buffer = reinterpret_cast<const uint8_t *>(pkt->data.frame.buf); + const uint8_t marker = buffer[pkt->data.frame.sz - 1]; + const int mag = ((marker >> 3) & 3) + 1; + int frames = (marker & 0x7) + 1; + const unsigned int index_sz = 2 + mag * frames; + // Check for superframe or not. + // Assume superframe has only one visible frame, the rest being + // invisible. If superframe index is not found, then there is only + // one frame.
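+ // In a superframe index, the top three bits of the marker byte are + // 0b110, bits 4-3 hold mag - 1 (the byte count of each frame size field, + // as computed above), and bits 2-0 hold frames - 1. The index is + // bracketed by a copy of the marker byte at each end, which is why the + // check below looks for the same marker at both data.frame.sz - 1 and + // data.frame.sz - index_sz.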
+ if (!((marker & 0xe0) == 0xc0 && pkt->data.frame.sz >= index_sz && + buffer[pkt->data.frame.sz - index_sz] == marker)) { + frames = 1; + } + return frames; + } + + virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) { + if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) return; + const int frames = GetNumFramesInPkt(pkt); + if (frames == 1) { + run_of_visible_frames_++; + } else if (frames == 2) { + if (min_run_ == ARF_NOT_SEEN) { + min_run_ = ARF_SEEN_ONCE; + } else if (min_run_ == ARF_SEEN_ONCE || + run_of_visible_frames_ < min_run_) { + min_run_ = run_of_visible_frames_; + } + run_of_visible_frames_ = 1; + } else { + min_run_ = 0; + run_of_visible_frames_ = 1; + } + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(AV1E_SET_TILE_COLUMNS, 4); + encoder->Control(AOME_SET_CPUUSED, test_encode_param_.cpu_used); + encoder->Control(AV1E_SET_MIN_GF_INTERVAL, min_arf_requested_); + if (test_encode_param_.mode != ::libaom_test::kRealTime) { + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } + } + } + + int GetMinVisibleRun() const { return min_run_; } + + int GetMinArfDistanceRequested() const { + if (min_arf_requested_) + return min_arf_requested_; + else + return av1_rc_get_default_min_gf_interval( + test_video_param_.width, test_video_param_.height, + (double)test_video_param_.framerate_num / + test_video_param_.framerate_den); + } + + TestVideoParam test_video_param_; + TestEncodeParam test_encode_param_; + + private: + int min_arf_requested_; + int min_run_; + int run_of_visible_frames_; +}; + +TEST_P(ArfFreqTestLarge, MinArfFreqTest) { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 0; + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = test_video_param_.input_bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + init_flags_ = AOM_CODEC_USE_PSNR; + if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH; + + std::unique_ptr<libaom_test::VideoSource> video; + if (is_extension_y4m(test_video_param_.filename)) { + video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0, + kFrames)); + } else { + video.reset(new libaom_test::YUVVideoSource( + test_video_param_.filename, test_video_param_.fmt, + test_video_param_.width, test_video_param_.height, + test_video_param_.framerate_num, test_video_param_.framerate_den, 0, + kFrames)); + } + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + const int min_run = GetMinVisibleRun(); + const int min_arf_dist_requested = GetMinArfDistanceRequested(); + if (min_run != ARF_NOT_SEEN && min_run != ARF_SEEN_ONCE) { + const int min_arf_dist = min_run + 1; + EXPECT_GE(min_arf_dist, min_arf_dist_requested); + } +} + +#if CONFIG_AV1_ENCODER +// TODO(angiebird): 25-29 fail in high bitdepth mode. +// TODO(zoeliu): This ArfFreqTest does not work with BWDREF_FRAME, as +// BWDREF_FRAME is also a non-show frame, and the minimum run between two +// consecutive BWDREF_FRAME's may vary between 1 and any arbitrary positive +// number as long as it does not exceed the gf_group interval.
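+// Note: the DISABLED_ prefix on the instantiation below keeps these tests +// compiled but skipped by default; gtest runs them only when invoked with +// --gtest_also_run_disabled_tests.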
+INSTANTIATE_TEST_SUITE_P( + DISABLED_AV1, ArfFreqTestLarge, + ::testing::Combine( + ::testing::Values( + static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)), + ::testing::ValuesIn(kTestVectors), ::testing::ValuesIn(kEncodeVectors), + ::testing::ValuesIn(kMinArfVectors))); +#endif // CONFIG_AV1_ENCODER +} // namespace diff --git a/libs/libaom/src/test/av1_common_int_test.cc b/libs/libaom/src/test/av1_common_int_test.cc new file mode 100644 index 000000000..dde2542e3 --- /dev/null +++ b/libs/libaom/src/test/av1_common_int_test.cc @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "av1/common/av1_common_int.h" + +TEST(AV1CommonInt, TestGetTxSize) { + for (int t = TX_4X4; t < TX_SIZES_ALL; t++) { + TX_SIZE t2 = get_tx_size(tx_size_wide[t], tx_size_high[t]); + GTEST_ASSERT_EQ(tx_size_wide[t], tx_size_wide[t2]); + GTEST_ASSERT_EQ(tx_size_high[t], tx_size_high[t2]); + } +} diff --git a/libs/libaom/src/test/av1_config_test.cc b/libs/libaom/src/test/av1_config_test.cc new file mode 100644 index 000000000..fca980f06 --- /dev/null +++ b/libs/libaom/src/test/av1_config_test.cc @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <string.h> + +#include "common/av1_config.h" +#include "test/util.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +// +// Input buffers containing exactly one Sequence Header OBU. +// +// Each buffer is named according to the OBU storage format (Annex-B vs Low +// Overhead Bitstream Format) and the type of Sequence Header OBU ("Full" +// Sequence Header OBUs vs Sequence Header OBUs with the +// reduced_still_image_flag set). +// +const uint8_t kAnnexBFullSequenceHeaderObu[] = { 0x0c, 0x08, 0x00, 0x00, 0x00, + 0x04, 0x45, 0x7e, 0x3e, 0xff, + 0xfc, 0xc0, 0x20 }; +const uint8_t kAnnexBReducedStillImageSequenceHeaderObu[] = { + 0x08, 0x08, 0x18, 0x22, 0x2b, 0xf1, 0xfe, 0xc0, 0x20 +}; + +const uint8_t kLobfFullSequenceHeaderObu[] = { 0x0a, 0x0b, 0x00, 0x00, 0x00, + 0x04, 0x45, 0x7e, 0x3e, 0xff, + 0xfc, 0xc0, 0x20 }; + +const uint8_t kLobfReducedStillImageSequenceHeaderObu[] = { 0x0a, 0x07, 0x18, + 0x22, 0x2b, 0xf1, + 0xfe, 0xc0, 0x20 }; + +const uint8_t kAv1cAllZero[] = { 0, 0, 0, 0 }; + +// The size of AV1 config when no configOBUs are present at the end of the +// configuration structure.
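+// (For reference: those four bytes follow the av1C record layout from the +// AV1-ISOBMFF binding, roughly marker(1)/version(7), then +// seq_profile(3)/seq_level_idx_0(5), then the tier, bit-depth, monochrome and +// chroma subsampling flags, then the initial_presentation_delay fields -- the +// same fields VerifyAv1c() checks below.)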
+const size_t kAv1cNoConfigObusSize = 4; + +bool VerifyAv1c(const uint8_t *const obu_buffer, size_t obu_buffer_length, + bool is_annexb) { + Av1Config av1_config; + memset(&av1_config, 0, sizeof(av1_config)); + bool parse_ok = get_av1config_from_obu(obu_buffer, obu_buffer_length, + is_annexb, &av1_config) == 0; + if (parse_ok) { + EXPECT_EQ(1, av1_config.marker); + EXPECT_EQ(1, av1_config.version); + EXPECT_EQ(0, av1_config.seq_profile); + EXPECT_EQ(0, av1_config.seq_level_idx_0); + EXPECT_EQ(0, av1_config.seq_tier_0); + EXPECT_EQ(0, av1_config.high_bitdepth); + EXPECT_EQ(0, av1_config.twelve_bit); + EXPECT_EQ(0, av1_config.monochrome); + EXPECT_EQ(1, av1_config.chroma_subsampling_x); + EXPECT_EQ(1, av1_config.chroma_subsampling_y); + EXPECT_EQ(0, av1_config.chroma_sample_position); + EXPECT_EQ(0, av1_config.initial_presentation_delay_present); + EXPECT_EQ(0, av1_config.initial_presentation_delay_minus_one); + } + return parse_ok && ::testing::Test::HasFailure() == false; +} + +TEST(Av1Config, ObuInvalidInputs) { + Av1Config av1_config; + memset(&av1_config, 0, sizeof(av1_config)); + ASSERT_EQ(-1, get_av1config_from_obu(NULL, 0, 0, NULL)); + ASSERT_EQ(-1, + get_av1config_from_obu(&kLobfFullSequenceHeaderObu[0], 0, 0, NULL)); + ASSERT_EQ( + -1, get_av1config_from_obu(&kLobfFullSequenceHeaderObu[0], + sizeof(kLobfFullSequenceHeaderObu), 0, NULL)); + ASSERT_EQ(-1, get_av1config_from_obu(NULL, sizeof(kLobfFullSequenceHeaderObu), + 0, NULL)); + ASSERT_EQ(-1, get_av1config_from_obu(&kLobfFullSequenceHeaderObu[0], 0, 0, + &av1_config)); +} + +TEST(Av1Config, ReadInvalidInputs) { + Av1Config av1_config; + memset(&av1_config, 0, sizeof(av1_config)); + size_t bytes_read = 0; + ASSERT_EQ(-1, read_av1config(NULL, 0, NULL, NULL)); + ASSERT_EQ(-1, read_av1config(NULL, 4, NULL, NULL)); + ASSERT_EQ(-1, read_av1config(&kAv1cAllZero[0], 0, NULL, NULL)); + ASSERT_EQ(-1, read_av1config(&kAv1cAllZero[0], 4, &bytes_read, NULL)); + ASSERT_EQ(-1, read_av1config(NULL, 4, &bytes_read, &av1_config)); +} + +TEST(Av1Config, WriteInvalidInputs) { + Av1Config av1_config; + memset(&av1_config, 0, sizeof(av1_config)); + size_t bytes_written = 0; + uint8_t av1c_buffer[4] = { 0 }; + ASSERT_EQ(-1, write_av1config(NULL, 0, NULL, NULL)); + ASSERT_EQ(-1, write_av1config(&av1_config, 0, NULL, NULL)); + ASSERT_EQ(-1, write_av1config(&av1_config, 0, &bytes_written, NULL)); + + ASSERT_EQ(-1, + write_av1config(&av1_config, 0, &bytes_written, &av1c_buffer[0])); + ASSERT_EQ(-1, write_av1config(&av1_config, 4, &bytes_written, NULL)); +} + +TEST(Av1Config, GetAv1ConfigFromLobfObu) { + // Test parsing of a Sequence Header OBU with the reduced_still_picture_header + // unset-- aka a full Sequence Header OBU. + ASSERT_TRUE(VerifyAv1c(kLobfFullSequenceHeaderObu, + sizeof(kLobfFullSequenceHeaderObu), false)); + + // Test parsing of a reduced still image Sequence Header OBU. + ASSERT_TRUE(VerifyAv1c(kLobfReducedStillImageSequenceHeaderObu, + sizeof(kLobfReducedStillImageSequenceHeaderObu), + false)); +} + +TEST(Av1Config, GetAv1ConfigFromAnnexBObu) { + // Test parsing of a Sequence Header OBU with the reduced_still_picture_header + // unset-- aka a full Sequence Header OBU. + ASSERT_TRUE(VerifyAv1c(kAnnexBFullSequenceHeaderObu, + sizeof(kAnnexBFullSequenceHeaderObu), true)); + + // Test parsing of a reduced still image Sequence Header OBU. 
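+ // (Annex B storage wraps each OBU in explicit length fields instead of + // relying on per-OBU size flags, which is why is_annexb is true for these + // buffers.)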
+ ASSERT_TRUE(VerifyAv1c(kAnnexBReducedStillImageSequenceHeaderObu, + sizeof(kAnnexBReducedStillImageSequenceHeaderObu), + true)); +} + +TEST(Av1Config, ReadWriteConfig) { + Av1Config av1_config; + memset(&av1_config, 0, sizeof(av1_config)); + + // Test writing out the AV1 config. + size_t bytes_written = 0; + uint8_t av1c_buffer[4] = { 0 }; + ASSERT_EQ(0, write_av1config(&av1_config, sizeof(av1c_buffer), &bytes_written, + &av1c_buffer[0])); + ASSERT_EQ(kAv1cNoConfigObusSize, bytes_written); + for (size_t i = 0; i < kAv1cNoConfigObusSize; ++i) { + ASSERT_EQ(kAv1cAllZero[i], av1c_buffer[i]) + << "Mismatch in output Av1Config at offset=" << i; + } + + // Test reading the AV1 config. + size_t bytes_read = 0; + ASSERT_EQ(0, read_av1config(&kAv1cAllZero[0], sizeof(kAv1cAllZero), + &bytes_read, &av1_config)); + ASSERT_EQ(kAv1cNoConfigObusSize, bytes_read); + ASSERT_EQ(0, write_av1config(&av1_config, sizeof(av1c_buffer), &bytes_written, + &av1c_buffer[0])); + for (size_t i = 0; i < kAv1cNoConfigObusSize; ++i) { + ASSERT_EQ(kAv1cAllZero[i], av1c_buffer[i]) + << "Mismatch in output Av1Config at offset=" << i; + } +} + +} // namespace diff --git a/libs/libaom/src/test/av1_convolve_2d_test.cc b/libs/libaom/src/test/av1_convolve_2d_test.cc new file mode 100644 index 000000000..50a58f06d --- /dev/null +++ b/libs/libaom/src/test/av1_convolve_2d_test.cc @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tuple> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/av1_convolve_2d_test_util.h" + +using libaom_test::ACMRandom; +using libaom_test::AV1Convolve2D::AV1Convolve2DSrTest; +using libaom_test::AV1Convolve2D::AV1JntConvolve2DTest; +#if CONFIG_AV1_HIGHBITDEPTH +using libaom_test::AV1HighbdConvolve2D::AV1HighbdConvolve2DSrTest; +using libaom_test::AV1HighbdConvolve2D::AV1HighbdJntConvolve2DTest; +#endif +using std::make_tuple; +using std::tuple; + +namespace { + +TEST_P(AV1Convolve2DSrTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } + +TEST_P(AV1Convolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); } + +INSTANTIATE_TEST_SUITE_P( + C_COPY, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_copy_sr_c, 0, 0)); +INSTANTIATE_TEST_SUITE_P( + C_X, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_c, 1, 0)); +INSTANTIATE_TEST_SUITE_P( + C_Y, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_c, 0, 1)); +INSTANTIATE_TEST_SUITE_P( + C, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_c, 1, 1)); +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2_COPY, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_convolve_2d_copy_sr_sse2, 0, 0)); +INSTANTIATE_TEST_SUITE_P( + SSE2_X, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_sse2, 1, 0)); +INSTANTIATE_TEST_SUITE_P( + SSE2_Y, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_sse2, 0, 1)); +INSTANTIATE_TEST_SUITE_P( + SSE2, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_sse2, 1, 1)); +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_convolve_2d_copy_sr_avx2, 0, 0)); +INSTANTIATE_TEST_SUITE_P( + AVX2_X, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_avx2, 1, 0)); + +INSTANTIATE_TEST_SUITE_P( + AVX2_Y, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_avx2, 0, 1)); + +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_avx2, 1, 1)); +#endif // HAVE_AVX2 +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON_X, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_neon, 1, 0)); + +INSTANTIATE_TEST_SUITE_P( + NEON_Y, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_neon, 0, 1)); + +INSTANTIATE_TEST_SUITE_P( + NEON, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_neon, 1, 1)); + +INSTANTIATE_TEST_SUITE_P(NEON_COPY, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_convolve_2d_copy_sr_neon, 0, 0)); +#endif // HAVE_NEON + +TEST_P(AV1JntConvolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); } +TEST_P(AV1JntConvolve2DTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } + +INSTANTIATE_TEST_SUITE_P(C_COPY, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_copy_c, 0, 0)); + +INSTANTIATE_TEST_SUITE_P( + C_X, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams(av1_dist_wtd_convolve_x_c, 1, 0)); + +INSTANTIATE_TEST_SUITE_P( + C_Y, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams(av1_dist_wtd_convolve_y_c, 0, 1)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2_COPY,
AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_copy_sse2, 0, 0)); +INSTANTIATE_TEST_SUITE_P(SSE2, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_sse2, 1, 1)); + +INSTANTIATE_TEST_SUITE_P(SSE2_X, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_x_sse2, 1, 0)); + +INSTANTIATE_TEST_SUITE_P(SSE2_Y, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_y_sse2, 0, 1)); + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P(SSSE3, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_ssse3, 1, 1)); + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_copy_avx2, 0, 0)); +INSTANTIATE_TEST_SUITE_P(AVX2_X, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_x_avx2, 1, 0)); + +INSTANTIATE_TEST_SUITE_P(AVX2_Y, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_y_avx2, 0, 1)); + +INSTANTIATE_TEST_SUITE_P(AVX2, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_avx2, 1, 1)); +#endif // HAVE_AVX2 +#endif // HAVE_SSSE3 +#endif // HAVE_SSE2 +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON_COPY, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_copy_neon, 0, 0)); + +INSTANTIATE_TEST_SUITE_P(NEON, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_neon, 1, 1)); +INSTANTIATE_TEST_SUITE_P(NEON_X, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_x_neon, 1, 0)); + +INSTANTIATE_TEST_SUITE_P(NEON_Y, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_y_neon, 0, 1)); +#endif // HAVE_NEON + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(AV1HighbdConvolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); } +TEST_P(AV1HighbdConvolve2DSrTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(1)); +} + +INSTANTIATE_TEST_SUITE_P(C_X, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_x_sr_c, 1, 0)); + +INSTANTIATE_TEST_SUITE_P(C_Y, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_y_sr_c, 0, 1)); + +INSTANTIATE_TEST_SUITE_P(C_COPY, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_2d_copy_sr_c, 0, 0)); +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2_COPY, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_2d_copy_sr_sse2, 0, 0)); +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P(SSSE3, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_2d_sr_ssse3, 1, 1)); +INSTANTIATE_TEST_SUITE_P(SSSE3_X, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_x_sr_ssse3, 1, 0)); +INSTANTIATE_TEST_SUITE_P(SSSE3_Y, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_y_sr_ssse3, 0, 1)); +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_2d_sr_avx2, 1, 1)); +INSTANTIATE_TEST_SUITE_P(AVX2_X, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_x_sr_avx2, 1, 
0)); +INSTANTIATE_TEST_SUITE_P(AVX2_Y, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_y_sr_avx2, 0, 1)); +INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_2d_copy_sr_avx2, 0, 0)); +#endif // HAVE_AVX2 +#endif // HAVE_SSSE3 +#endif // HAVE_SSE2 +TEST_P(AV1HighbdJntConvolve2DTest, CheckOutput) { + RunCheckOutput(GET_PARAM(1)); +} + +TEST_P(AV1HighbdJntConvolve2DTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(1)); +} + +INSTANTIATE_TEST_SUITE_P(C_X, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_x_c, 1, 0)); + +INSTANTIATE_TEST_SUITE_P(C_Y, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_y_c, 0, 1)); + +INSTANTIATE_TEST_SUITE_P(C_COPY, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_2d_copy_c, 0, 0)); +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE4_1_COPY, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_2d_copy_sse4_1, 0, + 0)); +INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_2d_sse4_1, 1, 1)); +INSTANTIATE_TEST_SUITE_P(SSE4_1_X, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_x_sse4_1, 1, 0)); +INSTANTIATE_TEST_SUITE_P(SSE4_1_Y, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_y_sse4_1, 0, 1)); +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_2d_copy_avx2, 0, 0)); +INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_2d_avx2, 1, 1)); +INSTANTIATE_TEST_SUITE_P(AVX2_X, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_x_avx2, 1, 0)); +INSTANTIATE_TEST_SUITE_P(AVX2_Y, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_y_avx2, 0, 1)); +#endif // HAVE_AVX2 +#endif // HAVE_SSE4_1 +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace diff --git a/libs/libaom/src/test/av1_convolve_2d_test_util.cc b/libs/libaom/src/test/av1_convolve_2d_test_util.cc new file mode 100644 index 000000000..6f103d3f6 --- /dev/null +++ b/libs/libaom/src/test/av1_convolve_2d_test_util.cc @@ -0,0 +1,708 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "test/av1_convolve_2d_test_util.h" + +#include "aom_ports/aom_timer.h" +#include "av1/common/common_data.h" +#include "av1/common/convolve.h" + +using std::make_tuple; +using std::tuple; + +namespace libaom_test { + +const int kMaxSize = 128 + 32; // padding +namespace AV1Convolve2D { + +::testing::internal::ParamGenerator<Convolve2DParam> BuildParams( + convolve_2d_func filter, int has_subx, int has_suby) { + return ::testing::Combine(::testing::Values(filter), + ::testing::Values(has_subx), + ::testing::Values(has_suby), + ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); +} + +AV1Convolve2DSrTest::~AV1Convolve2DSrTest() {} +void AV1Convolve2DSrTest::SetUp() { + rnd_.Reset(ACMRandom::DeterministicSeed()); +} + +void AV1Convolve2DSrTest::TearDown() { libaom_test::ClearSystemState(); } + +void AV1Convolve2DSrTest::RunCheckOutput(convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int has_subx = GET_PARAM(1); + const int has_suby = GET_PARAM(2); + const int block_idx = GET_PARAM(3); + int hfilter, vfilter, subx, suby; + uint8_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, uint8_t, output[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, output2[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8(); + for (int i = 0; i < MAX_SB_SQUARE; ++i) + output[i] = output2[i] = static_cast<uint8_t>(rnd_.Rand31()); + + // Make sure that sizes 2xN and Nx2 are also tested for chroma. + const int num_sizes = + (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2 + : 1; + for (int shift = 0; shift < num_sizes; ++shift) { // luma and chroma + const int out_w = block_size_wide[block_idx] >> shift; + const int out_h = block_size_high[block_idx] >> shift; + for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) { + for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; + ++vfilter) { + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, + out_w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, + out_h); + for (int do_average = 0; do_average < 1; ++do_average) { + ConvolveParams conv_params1 = + get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8); + ConvolveParams conv_params2 = + get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8); + + const int subx_range = has_subx ? 16 : 1; + const int suby_range = has_suby ?
16 : 1; + for (subx = 0; subx < subx_range; ++subx) { + for (suby = 0; suby < suby_range; ++suby) { + // Choose random locations within the source block + const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); + av1_convolve_2d_sr_c(input + offset_r * w + offset_c, w, output, + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params1); + test_impl(input + offset_r * w + offset_c, w, output2, + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2); + + if (memcmp(output, output2, sizeof(output))) { + for (int i = 0; i < MAX_SB_SIZE; ++i) { + for (int j = 0; j < MAX_SB_SIZE; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output[idx], output2[idx]) + << out_w << "x" << out_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << suby << ", " << subx + << ")"; + } + } + } + } + } + } + } + } + } +} + +void AV1Convolve2DSrTest::RunSpeedTest(convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int has_subx = GET_PARAM(1); + const int has_suby = GET_PARAM(2); + const int block_idx = GET_PARAM(3); + + uint8_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, uint8_t, output[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8(); + + int hfilter = EIGHTTAP_REGULAR, vfilter = EIGHTTAP_REGULAR; + int subx = 0, suby = 0; + + const int do_average = 0; + ConvolveParams conv_params2 = + get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8); + + // Make sure that sizes 2xN and Nx2 are also tested for chroma. + const int num_sizes = + (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 
2 + : 1; + for (int shift = 0; shift < num_sizes; ++shift) { // luma and chroma + const int out_w = block_size_wide[block_idx] >> shift; + const int out_h = block_size_high[block_idx] >> shift; + const int num_loops = 1000000000 / (out_w + out_h); + + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, + out_w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, + out_h); + + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + test_impl(input, w, output, MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w, + out_h, 1000.0 * elapsed_time / num_loops); + } +} + +AV1JntConvolve2DTest::~AV1JntConvolve2DTest() {} +void AV1JntConvolve2DTest::SetUp() { + rnd_.Reset(ACMRandom::DeterministicSeed()); +} + +void AV1JntConvolve2DTest::TearDown() { libaom_test::ClearSystemState(); } + +void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int has_subx = GET_PARAM(1); + const int has_suby = GET_PARAM(2); + const int block_idx = GET_PARAM(3); + int hfilter, vfilter, subx, suby; + uint8_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, CONV_BUF_TYPE, output1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, CONV_BUF_TYPE, output2[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, output8_1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, output8_2[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8(); + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + output1[i] = output2[i] = rnd_.Rand16(); + output8_1[i] = output8_2[i] = rnd_.Rand8(); + } + + const int out_w = block_size_wide[block_idx]; + const int out_h = block_size_high[block_idx]; + for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) { + for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) { + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, + out_w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, + out_h); + for (int do_average = 0; do_average <= 1; ++do_average) { + ConvolveParams conv_params1 = + get_conv_params_no_round(do_average, 0, output1, MAX_SB_SIZE, 1, 8); + ConvolveParams conv_params2 = + get_conv_params_no_round(do_average, 0, output2, MAX_SB_SIZE, 1, 8); + + // Test special case where dist_wtd_comp_avg is not used + conv_params1.use_dist_wtd_comp_avg = 0; + conv_params2.use_dist_wtd_comp_avg = 0; + + const int subx_range = has_subx ? 16 : 1; + const int suby_range = has_suby ?
16 : 1; + for (subx = 0; subx < subx_range; ++subx) { + for (suby = 0; suby < suby_range; ++suby) { + // Choose random locations within the source block + const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); + av1_dist_wtd_convolve_2d_c(input + offset_r * w + offset_c, w, + output8_1, MAX_SB_SIZE, out_w, out_h, + filter_params_x, filter_params_y, subx, + suby, &conv_params1); + test_impl(input + offset_r * w + offset_c, w, output8_2, + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2); + + for (int i = 0; i < out_h; ++i) { + for (int j = 0; j < out_w; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output1[idx], output2[idx]) + << "Mismatch at unit tests for av1_dist_wtd_convolve_2d\n" + << out_w << "x" << out_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << suby << ", " << subx << ")"; + } + } + + if (memcmp(output8_1, output8_2, sizeof(output8_1))) { + for (int i = 0; i < MAX_SB_SIZE; ++i) { + for (int j = 0; j < MAX_SB_SIZE; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output8_1[idx], output8_2[idx]) + << out_w << "x" << out_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << suby << ", " << subx + << ")"; + } + } + } + } + } + + // Test different combination of fwd and bck offset weights + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 4; ++l) { + conv_params1.use_dist_wtd_comp_avg = 1; + conv_params2.use_dist_wtd_comp_avg = 1; + conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0]; + conv_params1.bck_offset = quant_dist_lookup_table[k][l][1]; + conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0]; + conv_params2.bck_offset = quant_dist_lookup_table[k][l][1]; + + for (subx = 0; subx < subx_range; ++subx) { + for (suby = 0; suby < suby_range; ++suby) { + // Choose random locations within the source block + const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); + av1_dist_wtd_convolve_2d_c(input + offset_r * w + offset_c, w, + output8_1, MAX_SB_SIZE, out_w, out_h, + filter_params_x, filter_params_y, + subx, suby, &conv_params1); + test_impl(input + offset_r * w + offset_c, w, output8_2, + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2); + + for (int i = 0; i < out_h; ++i) { + for (int j = 0; j < out_w; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output1[idx], output2[idx]) + << "Mismatch at unit tests for " + "av1_dist_wtd_convolve_2d\n" + << out_w << "x" << out_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << suby << ", " << subx + << ")"; + } + } + if (memcmp(output8_1, output8_2, sizeof(output8_1))) { + for (int i = 0; i < MAX_SB_SIZE; ++i) { + for (int j = 0; j < MAX_SB_SIZE; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output8_1[idx], output8_2[idx]) + << out_w << "x" << out_h + << " Pixel mismatch at index " << idx << " = (" << i + << ", " << j << "), sub pixel offset = (" << suby + << ", " << subx << ")"; + } + } + } + } + } + } + } + } + } + } +} + +void AV1JntConvolve2DTest::RunSpeedTest(convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int has_subx = GET_PARAM(1); + const int has_suby = GET_PARAM(2); + const int block_idx = GET_PARAM(3); + + int subx = 0, suby = 0; + uint8_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, 
CONV_BUF_TYPE, output[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, output8[MAX_SB_SQUARE]); + int hfilter = EIGHTTAP_REGULAR, vfilter = EIGHTTAP_REGULAR; + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8(); + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + output[i] = rnd_.Rand16(); + output8[i] = rnd_.Rand8(); + } + + const int out_w = block_size_wide[block_idx]; + const int out_h = block_size_high[block_idx]; + const int num_loops = 1000000000 / (out_w + out_h); + const int do_average = 0; + + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, + out_w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, + out_h); + + ConvolveParams conv_params = + get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, 8); + + conv_params.use_dist_wtd_comp_avg = 0; + + // Choose random locations within the source block + const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); + + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + test_impl(input + offset_r * w + offset_c, w, output8, MAX_SB_SIZE, out_w, + out_h, filter_params_x, filter_params_y, subx, suby, + &conv_params); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w, + out_h, 1000.0 * elapsed_time / num_loops); +} +} // namespace AV1Convolve2D + +#if CONFIG_AV1_HIGHBITDEPTH +namespace AV1HighbdConvolve2D { +::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams( + highbd_convolve_2d_func filter, int has_subx, int has_suby) { + return ::testing::Combine( + ::testing::Range(8, 13, 2), ::testing::Values(filter), + ::testing::Values(has_subx), ::testing::Values(has_suby), + ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); +} + +AV1HighbdConvolve2DSrTest::~AV1HighbdConvolve2DSrTest() {} +void AV1HighbdConvolve2DSrTest::SetUp() { + rnd_.Reset(ACMRandom::DeterministicSeed()); +} + +void AV1HighbdConvolve2DSrTest::TearDown() { libaom_test::ClearSystemState(); } + +void AV1HighbdConvolve2DSrTest::RunSpeedTest( + highbd_convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int bd = GET_PARAM(0); + const int has_subx = GET_PARAM(2); + const int has_suby = GET_PARAM(3); + const int block_idx = GET_PARAM(4); + int hfilter, vfilter, subx, suby; + uint16_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, uint16_t, output[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) + input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + + hfilter = EIGHTTAP_REGULAR; + vfilter = EIGHTTAP_REGULAR; + int do_average = 0; + + const int offset_r = 3; + const int offset_c = 3; + subx = 0; + suby = 0; + + ConvolveParams conv_params = + get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd); + + // Make sure that sizes 2xN and Nx2 are also tested for chroma. + const int num_sizes = + (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ?
2 + : 1; + + for (int shift = 0; shift < num_sizes; ++shift) { // luma and chroma + const int out_w = block_size_wide[block_idx] >> shift; + const int out_h = block_size_high[block_idx] >> shift; + const int num_loops = 1000000000 / (out_w + out_h); + + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, + out_w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, + out_h); + + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < num_loops; ++i) + test_impl(input + offset_r * w + offset_c, w, output, MAX_SB_SIZE, out_w, + out_h, filter_params_x, filter_params_y, subx, suby, + &conv_params, bd); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w, + out_h, 1000.0 * elapsed_time / num_loops); + } +} + +void AV1HighbdConvolve2DSrTest::RunCheckOutput( + highbd_convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int bd = GET_PARAM(0); + const int has_subx = GET_PARAM(2); + const int has_suby = GET_PARAM(3); + const int block_idx = GET_PARAM(4); + int hfilter, vfilter, subx, suby; + uint16_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, uint16_t, output[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, output2[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) + input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + for (int i = 0; i < MAX_SB_SQUARE; ++i) + output[i] = output2[i] = static_cast<uint16_t>(rnd_.Rand31()); + + // Make sure that sizes 2xN and Nx2 are also tested for chroma. + const int num_sizes = + (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2 + : 1; + for (int shift = 0; shift < num_sizes; ++shift) { // luma and chroma + const int out_w = block_size_wide[block_idx] >> shift; + const int out_h = block_size_high[block_idx] >> shift; + for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) { + for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; + ++vfilter) { + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, + out_w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, + out_h); + for (int do_average = 0; do_average < 1; ++do_average) { + ConvolveParams conv_params1 = + get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd); + ConvolveParams conv_params2 = + get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd); + + const int subx_range = has_subx ? 16 : 1; + const int suby_range = has_suby ?
16 : 1; + for (subx = 0; subx < subx_range; ++subx) { + for (suby = 0; suby < suby_range; ++suby) { + // Choose random locations within the source block + const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); + av1_highbd_convolve_2d_sr_c(input + offset_r * w + offset_c, w, + output, MAX_SB_SIZE, out_w, out_h, + filter_params_x, filter_params_y, + subx, suby, &conv_params1, bd); + test_impl(input + offset_r * w + offset_c, w, output2, + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2, bd); + + if (memcmp(output, output2, sizeof(output))) { + for (int i = 0; i < MAX_SB_SIZE; ++i) { + for (int j = 0; j < MAX_SB_SIZE; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output[idx], output2[idx]) + << out_w << "x" << out_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << suby << ", " << subx + << ")"; + } + } + } + } + } + } + } + } + } +} + +AV1HighbdJntConvolve2DTest::~AV1HighbdJntConvolve2DTest() {} +void AV1HighbdJntConvolve2DTest::SetUp() { + rnd_.Reset(ACMRandom::DeterministicSeed()); +} + +void AV1HighbdJntConvolve2DTest::TearDown() { libaom_test::ClearSystemState(); } + +void AV1HighbdJntConvolve2DTest::RunSpeedTest( + highbd_convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int bd = GET_PARAM(0); + const int block_idx = GET_PARAM(4); + int hfilter, vfilter, subx, suby; + uint16_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, CONV_BUF_TYPE, output[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, output16[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) + input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + for (int i = 0; i < MAX_SB_SQUARE; ++i) output[i] = rnd_.Rand16(); + hfilter = EIGHTTAP_REGULAR; + vfilter = EIGHTTAP_REGULAR; + int do_average = 0; + const int out_w = block_size_wide[block_idx]; + const int out_h = block_size_high[block_idx]; + + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, + out_w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, + out_h); + + ConvolveParams conv_params = + get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, bd); + + // Test special case where dist_wtd_comp_avg is not used + conv_params.use_dist_wtd_comp_avg = 0; + + subx = 0; + suby = 0; + // Use fixed offsets into the source block for the speed test. + const int offset_r = 3; + const int offset_c = 3; + + const int num_loops = 1000000000 / (out_w + out_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < num_loops; ++i) + test_impl(input + offset_r * w + offset_c, w, output16, MAX_SB_SIZE, out_w, + out_h, filter_params_x, filter_params_y, subx, suby, &conv_params, + bd); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("convolve %3dx%-3d: %7.2f us\n", out_w, out_h, + 1000.0 * elapsed_time / num_loops); +} + +void AV1HighbdJntConvolve2DTest::RunCheckOutput( + highbd_convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int bd = GET_PARAM(0); + const int has_subx = GET_PARAM(2); + const int has_suby = GET_PARAM(3); + const int block_idx = GET_PARAM(4); + int hfilter, vfilter, subx, suby; + uint16_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, CONV_BUF_TYPE, output1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, CONV_BUF_TYPE,
output2[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, output16_1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, output16_2[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) + input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + output1[i] = output2[i] = rnd_.Rand16(); + output16_1[i] = output16_2[i] = rnd_.Rand16(); + } + + const int out_w = block_size_wide[block_idx]; + const int out_h = block_size_high[block_idx]; + for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) { + for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) { + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, + out_w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, + out_h); + for (int do_average = 0; do_average <= 1; ++do_average) { + ConvolveParams conv_params1 = get_conv_params_no_round( + do_average, 0, output1, MAX_SB_SIZE, 1, bd); + ConvolveParams conv_params2 = get_conv_params_no_round( + do_average, 0, output2, MAX_SB_SIZE, 1, bd); + + // Test special case where dist_wtd_comp_avg is not used + conv_params1.use_dist_wtd_comp_avg = 0; + conv_params2.use_dist_wtd_comp_avg = 0; + + const int subx_range = has_subx ? 16 : 1; + const int suby_range = has_suby ? 16 : 1; + for (subx = 0; subx < subx_range; ++subx) { + for (suby = 0; suby < suby_range; ++suby) { + // Choose random locations within the source block + const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); + av1_highbd_dist_wtd_convolve_2d_c( + input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE, + out_w, out_h, filter_params_x, filter_params_y, subx, suby, + &conv_params1, bd); + test_impl(input + offset_r * w + offset_c, w, output16_2, + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2, bd); + + for (int i = 0; i < out_h; ++i) { + for (int j = 0; j < out_w; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output1[idx], output2[idx]) + << out_w << "x" << out_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << suby << ", " << subx << ")"; + } + } + + if (memcmp(output16_1, output16_2, sizeof(output16_1))) { + for (int i = 0; i < MAX_SB_SIZE; ++i) { + for (int j = 0; j < MAX_SB_SIZE; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output16_1[idx], output16_2[idx]) + << out_w << "x" << out_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << suby << ", " << subx + << ")"; + } + } + } + } + } + + // Test different combination of fwd and bck offset weights + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 4; ++l) { + conv_params1.use_dist_wtd_comp_avg = 1; + conv_params2.use_dist_wtd_comp_avg = 1; + conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0]; + conv_params1.bck_offset = quant_dist_lookup_table[k][l][1]; + conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0]; + conv_params2.bck_offset = quant_dist_lookup_table[k][l][1]; + + const int subx_range = has_subx ? 16 : 1; + const int suby_range = has_suby ? 
16 : 1; + for (subx = 0; subx < subx_range; ++subx) { + for (suby = 0; suby < suby_range; ++suby) { + // Choose random locations within the source block + const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); + av1_highbd_dist_wtd_convolve_2d_c( + input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE, + out_w, out_h, filter_params_x, filter_params_y, subx, suby, + &conv_params1, bd); + test_impl(input + offset_r * w + offset_c, w, output16_2, + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2, bd); + + for (int i = 0; i < out_h; ++i) { + for (int j = 0; j < out_w; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output1[idx], output2[idx]) + << out_w << "x" << out_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << suby << ", " << subx + << ")"; + } + } + + if (memcmp(output16_1, output16_2, sizeof(output16_1))) { + for (int i = 0; i < MAX_SB_SIZE; ++i) { + for (int j = 0; j < MAX_SB_SIZE; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output16_1[idx], output16_2[idx]) + << out_w << "x" << out_h + << " Pixel mismatch at index " << idx << " = (" << i + << ", " << j << "), sub pixel offset = (" << suby + << ", " << subx << ")"; + } + } + } + } + } + } + } + } + } + } +} +} // namespace AV1HighbdConvolve2D +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace libaom_test diff --git a/libs/libaom/src/test/av1_convolve_2d_test_util.h b/libs/libaom/src/test/av1_convolve_2d_test_util.h new file mode 100644 index 000000000..3c19cfed3 --- /dev/null +++ b/libs/libaom/src/test/av1_convolve_2d_test_util.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_ +#define AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_ + +#include <tuple> + +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/util.h" + +#include "test/clear_system_state.h" +#include "test/register_state_check.h" + +namespace libaom_test { + +namespace AV1Convolve2D { + +typedef void (*convolve_2d_func)(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params); + +typedef std::tuple<convolve_2d_func, int, int, BLOCK_SIZE> Convolve2DParam; + +::testing::internal::ParamGenerator<Convolve2DParam> BuildParams( + convolve_2d_func filter, int subx_exist, int suby_exist); + +class AV1Convolve2DSrTest : public ::testing::TestWithParam<Convolve2DParam> { + public: + virtual ~AV1Convolve2DSrTest(); + virtual void SetUp(); + + virtual void TearDown(); + + protected: + void RunCheckOutput(convolve_2d_func test_impl); + void RunSpeedTest(convolve_2d_func test_impl); + + libaom_test::ACMRandom rnd_; +}; + +class AV1JntConvolve2DTest : public ::testing::TestWithParam<Convolve2DParam> { + public: + virtual ~AV1JntConvolve2DTest(); + virtual void SetUp(); + + virtual void TearDown(); + + protected: + void RunCheckOutput(convolve_2d_func test_impl); + void RunSpeedTest(convolve_2d_func test_impl); + + libaom_test::ACMRandom rnd_; +}; +} // namespace AV1Convolve2D + +#if CONFIG_AV1_HIGHBITDEPTH +namespace AV1HighbdConvolve2D { +typedef void (*highbd_convolve_2d_func)( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd); + +typedef std::tuple<int, highbd_convolve_2d_func, int, int, BLOCK_SIZE> + HighbdConvolve2DParam; + +::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams( + highbd_convolve_2d_func filter, int subx_exist, int suby_exist); + +class AV1HighbdConvolve2DSrTest + : public ::testing::TestWithParam<HighbdConvolve2DParam> { + public: + virtual ~AV1HighbdConvolve2DSrTest(); + virtual void SetUp(); + + virtual void TearDown(); + + protected: + void RunCheckOutput(highbd_convolve_2d_func test_impl); + void RunSpeedTest(highbd_convolve_2d_func test_impl); + + libaom_test::ACMRandom rnd_; +}; + +class AV1HighbdJntConvolve2DTest + : public ::testing::TestWithParam<HighbdConvolve2DParam> { + public: + virtual ~AV1HighbdJntConvolve2DTest(); + virtual void SetUp(); + + virtual void TearDown(); + + protected: + void RunCheckOutput(highbd_convolve_2d_func test_impl); + void RunSpeedTest(highbd_convolve_2d_func test_impl); + + libaom_test::ACMRandom rnd_; +}; +} // namespace AV1HighbdConvolve2D +#endif // CONFIG_AV1_HIGHBITDEPTH + +} // namespace libaom_test + +#endif // AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_ diff --git a/libs/libaom/src/test/av1_convolve_scale_test.cc b/libs/libaom/src/test/av1_convolve_scale_test.cc new file mode 100644 index 000000000..ffd0bab33 --- /dev/null +++ b/libs/libaom/src/test/av1_convolve_scale_test.cc @@ -0,0 +1,532 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tuple> +#include <vector> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/av1_rtcd.h" + +#include "aom_ports/aom_timer.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +#include "av1/common/common_data.h" + +namespace { +const int kTestIters = 10; +const int kPerfIters = 1000; + +const int kVPad = 32; +const int kHPad = 32; +const int kXStepQn = 16; +const int kYStepQn = 20; + +using libaom_test::ACMRandom; +using std::make_tuple; +using std::tuple; + +enum NTaps { EIGHT_TAP, TEN_TAP, TWELVE_TAP }; +int NTapsToInt(NTaps ntaps) { return 8 + static_cast<int>(ntaps) * 2; } + +// A 16-bit filter with a configurable number of taps. +class TestFilter { + public: + void set(NTaps ntaps, bool backwards); + + InterpFilterParams params_; + + private: + std::vector<int16_t> coeffs_; +}; + +void TestFilter::set(NTaps ntaps, bool backwards) { + const int n = NTapsToInt(ntaps); + assert(n >= 8 && n <= 12); + + // The filter has n * SUBPEL_SHIFTS proper elements and an extra 8 bogus + // elements at the end so that convolutions can read off the end safely. + coeffs_.resize(n * SUBPEL_SHIFTS + 8); + + // The coefficients are pretty much arbitrary, but convolutions shouldn't + // over or underflow. For the first filter (subpels = 0), we use an + // increasing or decreasing ramp (depending on the backwards parameter). We + // don't want any zero coefficients, so we make it have an x-intercept at -1 + // or n. To ensure absence of under/overflow, we normalise the area under the + // ramp to be I = 1 << FILTER_BITS (so that convolving a constant function + // gives the identity). + // + // When increasing, the function has the form: + // + // f(x) = A * (x + 1) + // + // Summing f(x) over x = 0, ..., n - 1 gives A * n * (n + 1) / 2; setting + // that equal to I and rearranging for A gives A = 2 * I / (n * (n + 1)). If + // the filter is reversed, we have the same A but with formula + // + // g(x) = A * (n - x) + const int I = 1 << FILTER_BITS; + const float A = 2.f * I / (n * (n + 1.f)); + for (int i = 0; i < n; ++i) { + coeffs_[i] = static_cast<int16_t>(A * (backwards ? (n - i) : (i + 1))); + } + + // For the other filters, make them slightly different by swapping two + // columns. Filter k will have the columns (k % n) and (7 * k) % n swapped. + const size_t filter_size = sizeof(coeffs_[0]) * n; + int16_t *const filter0 = &coeffs_[0]; + for (int k = 1; k < SUBPEL_SHIFTS; ++k) { + int16_t *filterk = &coeffs_[k * n]; + memcpy(filterk, filter0, filter_size); + + const int idx0 = k % n; + const int idx1 = (7 * k) % n; + + const int16_t tmp = filterk[idx0]; + filterk[idx0] = filterk[idx1]; + filterk[idx1] = tmp; + } + + // Finally, write some rubbish at the end to make sure we don't use it. + for (int i = 0; i < 8; ++i) coeffs_[n * SUBPEL_SHIFTS + i] = 123 + i; + + // Fill in params + params_.filter_ptr = &coeffs_[0]; + params_.taps = n; + // These are ignored by the functions being tested. Set them to whatever. + params_.subpel_shifts = SUBPEL_SHIFTS; + params_.interp_filter = EIGHTTAP_REGULAR; +} + +template <typename SrcPixel> +class TestImage { + public: + TestImage(int w, int h, int bd) : w_(w), h_(h), bd_(bd) { + assert(bd < 16); + assert(bd <= 8 * static_cast<int>(sizeof(SrcPixel))); + + // Pad width by 2*kHPad and then round up to the next multiple of 16 + // to get src_stride_. 
Add another 16 for dst_stride_ (to make sure + // something goes wrong if we use the wrong one) + src_stride_ = (w_ + 2 * kHPad + 15) & ~15; + dst_stride_ = src_stride_ + 16; + + // Allocate image data + src_data_.resize(2 * src_block_size()); + dst_data_.resize(2 * dst_block_size()); + dst_16_data_.resize(2 * dst_block_size()); + } + + void Initialize(ACMRandom *rnd); + void Check() const; + + int src_stride() const { return src_stride_; } + int dst_stride() const { return dst_stride_; } + + int src_block_size() const { return (h_ + 2 * kVPad) * src_stride(); } + int dst_block_size() const { return (h_ + 2 * kVPad) * dst_stride(); } + + const SrcPixel *GetSrcData(bool ref, bool borders) const { + const SrcPixel *block = &src_data_[ref ? 0 : src_block_size()]; + return borders ? block : block + kHPad + src_stride_ * kVPad; + } + + SrcPixel *GetDstData(bool ref, bool borders) { + SrcPixel *block = &dst_data_[ref ? 0 : dst_block_size()]; + return borders ? block : block + kHPad + dst_stride_ * kVPad; + } + + CONV_BUF_TYPE *GetDst16Data(bool ref, bool borders) { + CONV_BUF_TYPE *block = &dst_16_data_[ref ? 0 : dst_block_size()]; + return borders ? block : block + kHPad + dst_stride_ * kVPad; + } + + private: + int w_, h_, bd_; + int src_stride_, dst_stride_; + + std::vector<SrcPixel> src_data_; + std::vector<SrcPixel> dst_data_; + std::vector<CONV_BUF_TYPE> dst_16_data_; +}; + +template <typename Pixel> +void FillEdge(ACMRandom *rnd, int num_pixels, int bd, bool trash, Pixel *data) { + if (!trash) { + memset(data, 0, sizeof(*data) * num_pixels); + return; + } + const Pixel mask = (1 << bd) - 1; + for (int i = 0; i < num_pixels; ++i) data[i] = rnd->Rand16() & mask; +} + +template <typename Pixel> +void PrepBuffers(ACMRandom *rnd, int w, int h, int stride, int bd, + bool trash_edges, Pixel *data) { + assert(rnd); + const Pixel mask = (1 << bd) - 1; + + // Fill in the first buffer with random data + // Top border + FillEdge(rnd, stride * kVPad, bd, trash_edges, data); + for (int r = 0; r < h; ++r) { + Pixel *row_data = data + (kVPad + r) * stride; + // Left border, contents, right border + FillEdge(rnd, kHPad, bd, trash_edges, row_data); + for (int c = 0; c < w; ++c) row_data[kHPad + c] = rnd->Rand16() & mask; + FillEdge(rnd, kHPad, bd, trash_edges, row_data + kHPad + w); + } + // Bottom border + FillEdge(rnd, stride * kVPad, bd, trash_edges, data + stride * (kVPad + h)); + + const int bpp = sizeof(*data); + const int block_elts = stride * (h + 2 * kVPad); + const int block_size = bpp * block_elts; + + // Now copy that to the second buffer + memcpy(data + block_elts, data, block_size); +} + +template <typename SrcPixel> +void TestImage<SrcPixel>::Initialize(ACMRandom *rnd) { + PrepBuffers(rnd, w_, h_, src_stride_, bd_, false, &src_data_[0]); + PrepBuffers(rnd, w_, h_, dst_stride_, bd_, true, &dst_data_[0]); + PrepBuffers(rnd, w_, h_, dst_stride_, bd_, true, &dst_16_data_[0]); +} + +template <typename SrcPixel> +void TestImage<SrcPixel>::Check() const { + // If memcmp returns 0, there's nothing to do. 
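+ // (Illustrative note, inferred from the buffer layout above rather than + // taken from the original source: Initialize() lays the reference and test + // copies of each buffer out back to back, so one memcmp over the whole + // block proves equality on the fast path, roughly: + // if (!memcmp(ref, tst, sizeof(*ref) * n)) return; // all pixels match + // Only on a mismatch do the per-pixel loops below run; they also walk the + // padded borders, so a stray out-of-bounds write by the SIMD path is + // reported with its row/col instead of being silently accepted.)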
+ const int num_pixels = dst_block_size(); + const SrcPixel *ref_dst = &dst_data_[0]; + const SrcPixel *tst_dst = &dst_data_[num_pixels]; + + const CONV_BUF_TYPE *ref_16_dst = &dst_16_data_[0]; + const CONV_BUF_TYPE *tst_16_dst = &dst_16_data_[num_pixels]; + + if (0 == memcmp(ref_dst, tst_dst, sizeof(*ref_dst) * num_pixels)) { + if (0 == memcmp(ref_16_dst, tst_16_dst, sizeof(*ref_16_dst) * num_pixels)) + return; + } + // Otherwise, iterate through the buffer looking for differences (including + // the edges) + const int stride = dst_stride_; + for (int r = 0; r < h_ + 2 * kVPad; ++r) { + for (int c = 0; c < w_ + 2 * kHPad; ++c) { + const int32_t ref_value = ref_dst[r * stride + c]; + const int32_t tst_value = tst_dst[r * stride + c]; + + EXPECT_EQ(tst_value, ref_value) + << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad); + } + } + + for (int r = 0; r < h_ + 2 * kVPad; ++r) { + for (int c = 0; c < w_ + 2 * kHPad; ++c) { + const int32_t ref_value = ref_16_dst[r * stride + c]; + const int32_t tst_value = tst_16_dst[r * stride + c]; + + EXPECT_EQ(tst_value, ref_value) + << "Error in 16 bit buffer " + << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad); + } + } +} + +typedef tuple<int, int> BlockDimension; + +struct BaseParams { + BaseParams(BlockDimension dims, NTaps ntaps_x, NTaps ntaps_y, bool avg) + : dims(dims), ntaps_x(ntaps_x), ntaps_y(ntaps_y), avg(avg) {} + + BlockDimension dims; + NTaps ntaps_x, ntaps_y; + bool avg; +}; + +template <typename SrcPixel> +class ConvolveScaleTestBase : public ::testing::Test { + public: + ConvolveScaleTestBase() : image_(NULL) {} + virtual ~ConvolveScaleTestBase() { delete image_; } + virtual void TearDown() { libaom_test::ClearSystemState(); } + + // Implemented by subclasses (SetUp depends on the parameters passed + // in and RunOne depends on the function to be tested. 
These can't + // be templated for low/high bit depths because they have different + // numbers of parameters) + virtual void SetUp() = 0; + virtual void RunOne(bool ref) = 0; + + protected: + void SetParams(const BaseParams ¶ms, int bd) { + width_ = std::get<0>(params.dims); + height_ = std::get<1>(params.dims); + ntaps_x_ = params.ntaps_x; + ntaps_y_ = params.ntaps_y; + bd_ = bd; + avg_ = params.avg; + + filter_x_.set(ntaps_x_, false); + filter_y_.set(ntaps_y_, true); + convolve_params_ = + get_conv_params_no_round(avg_ != false, 0, NULL, 0, 1, bd); + + delete image_; + image_ = new TestImage(width_, height_, bd_); + } + + void SetConvParamOffset(int i, int j, int is_compound, int do_average, + int use_dist_wtd_comp_avg) { + if (i == -1 && j == -1) { + convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg; + convolve_params_.is_compound = is_compound; + convolve_params_.do_average = do_average; + } else { + convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg; + convolve_params_.fwd_offset = quant_dist_lookup_table[i][j][0]; + convolve_params_.bck_offset = quant_dist_lookup_table[i][j][1]; + convolve_params_.is_compound = is_compound; + convolve_params_.do_average = do_average; + } + } + + void Run() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int i = 0; i < kTestIters; ++i) { + int is_compound = 0; + SetConvParamOffset(-1, -1, is_compound, 0, 0); + Prep(&rnd); + RunOne(true); + RunOne(false); + image_->Check(); + + is_compound = 1; + for (int do_average = 0; do_average < 2; do_average++) { + for (int use_dist_wtd_comp_avg = 0; use_dist_wtd_comp_avg < 2; + use_dist_wtd_comp_avg++) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 4; ++k) { + SetConvParamOffset(j, k, is_compound, do_average, + use_dist_wtd_comp_avg); + Prep(&rnd); + RunOne(true); + RunOne(false); + image_->Check(); + } + } + } + } + } + } + + void SpeedTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + Prep(&rnd); + + aom_usec_timer ref_timer; + aom_usec_timer_start(&ref_timer); + for (int i = 0; i < kPerfIters; ++i) RunOne(true); + aom_usec_timer_mark(&ref_timer); + const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer tst_timer; + aom_usec_timer_start(&tst_timer); + for (int i = 0; i < kPerfIters; ++i) RunOne(false); + aom_usec_timer_mark(&tst_timer); + const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); + + std::cout << "[ ] C time = " << ref_time / 1000 + << " ms, SIMD time = " << tst_time / 1000 << " ms\n"; + + EXPECT_GT(ref_time, tst_time) + << "Error: CDEFSpeedTest, SIMD slower than C.\n" + << "C time: " << ref_time << " us\n" + << "SIMD time: " << tst_time << " us\n"; + } + + static int RandomSubpel(ACMRandom *rnd) { + const uint8_t subpel_mode = rnd->Rand8(); + if ((subpel_mode & 7) == 0) { + return 0; + } else if ((subpel_mode & 7) == 1) { + return SCALE_SUBPEL_SHIFTS - 1; + } else { + return 1 + rnd->PseudoUniform(SCALE_SUBPEL_SHIFTS - 2); + } + } + + void Prep(ACMRandom *rnd) { + assert(rnd); + + // Choose subpel_x_ and subpel_y_. 
They should be less than + // SCALE_SUBPEL_SHIFTS; we also want to add extra weight to "interesting" + // values: 0 and SCALE_SUBPEL_SHIFTS - 1 + subpel_x_ = RandomSubpel(rnd); + subpel_y_ = RandomSubpel(rnd); + + image_->Initialize(rnd); + } + + int width_, height_, bd_; + NTaps ntaps_x_, ntaps_y_; + bool avg_; + int subpel_x_, subpel_y_; + TestFilter filter_x_, filter_y_; + TestImage<SrcPixel> *image_; + ConvolveParams convolve_params_; +}; + +typedef tuple<int, int> BlockDimension; + +typedef void (*LowbdConvolveFunc)(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params); + +// Test parameter list: +// <tst_fun, dims, ntaps_x, ntaps_y, avg> +typedef tuple<LowbdConvolveFunc, BlockDimension, NTaps, NTaps, bool> + LowBDParams; + +class LowBDConvolveScaleTest + : public ConvolveScaleTestBase<uint8_t>, + public ::testing::WithParamInterface<LowBDParams> { + public: + virtual ~LowBDConvolveScaleTest() {} + + void SetUp() { + tst_fun_ = GET_PARAM(0); + + const BlockDimension &block = GET_PARAM(1); + const NTaps ntaps_x = GET_PARAM(2); + const NTaps ntaps_y = GET_PARAM(3); + const int bd = 8; + const bool avg = GET_PARAM(4); + + SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd); + } + + void RunOne(bool ref) { + const uint8_t *src = image_->GetSrcData(ref, false); + uint8_t *dst = image_->GetDstData(ref, false); + convolve_params_.dst = image_->GetDst16Data(ref, false); + const int src_stride = image_->src_stride(); + const int dst_stride = image_->dst_stride(); + if (ref) { + av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, width_, height_, + &filter_x_.params_, &filter_y_.params_, subpel_x_, + kXStepQn, subpel_y_, kYStepQn, &convolve_params_); + } else { + tst_fun_(src, src_stride, dst, dst_stride, width_, height_, + &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn, + subpel_y_, kYStepQn, &convolve_params_); + } + } + + private: + LowbdConvolveFunc tst_fun_; +}; + +const BlockDimension kBlockDim[] = { + make_tuple(2, 2), make_tuple(2, 4), make_tuple(4, 4), + make_tuple(4, 8), make_tuple(8, 4), make_tuple(8, 8), + make_tuple(8, 16), make_tuple(16, 8), make_tuple(16, 16), + make_tuple(16, 32), make_tuple(32, 16), make_tuple(32, 32), + make_tuple(32, 64), make_tuple(64, 32), make_tuple(64, 64), + make_tuple(64, 128), make_tuple(128, 64), make_tuple(128, 128), +}; + +const NTaps kNTaps[] = { EIGHT_TAP }; + +TEST_P(LowBDConvolveScaleTest, Check) { Run(); } +TEST_P(LowBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); } + +INSTANTIATE_TEST_SUITE_P( + SSE4_1, LowBDConvolveScaleTest, + ::testing::Combine(::testing::Values(av1_convolve_2d_scale_sse4_1), + ::testing::ValuesIn(kBlockDim), + ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps), + ::testing::Bool())); + +#if CONFIG_AV1_HIGHBITDEPTH +typedef void (*HighbdConvolveFunc)(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params, int bd); + +// Test parameter list: +// <tst_fun, dims, ntaps_x, ntaps_y, avg, bd> +typedef tuple<HighbdConvolveFunc, BlockDimension, NTaps, NTaps, bool, int> + HighBDParams; + +class HighBDConvolveScaleTest + : public ConvolveScaleTestBase<uint16_t>, + public ::testing::WithParamInterface<HighBDParams> { + public: + virtual ~HighBDConvolveScaleTest() {} + + void SetUp() { + tst_fun_ = GET_PARAM(0); + + const BlockDimension &block = GET_PARAM(1); + const NTaps ntaps_x = 
GET_PARAM(2); + const NTaps ntaps_y = GET_PARAM(3); + const bool avg = GET_PARAM(4); + const int bd = GET_PARAM(5); + + SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd); + } + + void RunOne(bool ref) { + const uint16_t *src = image_->GetSrcData(ref, false); + uint16_t *dst = image_->GetDstData(ref, false); + convolve_params_.dst = image_->GetDst16Data(ref, false); + const int src_stride = image_->src_stride(); + const int dst_stride = image_->dst_stride(); + + if (ref) { + av1_highbd_convolve_2d_scale_c( + src, src_stride, dst, dst_stride, width_, height_, &filter_x_.params_, + &filter_y_.params_, subpel_x_, kXStepQn, subpel_y_, kYStepQn, + &convolve_params_, bd_); + } else { + tst_fun_(src, src_stride, dst, dst_stride, width_, height_, + &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn, + subpel_y_, kYStepQn, &convolve_params_, bd_); + } + } + + private: + HighbdConvolveFunc tst_fun_; +}; + +const int kBDs[] = { 8, 10, 12 }; + +TEST_P(HighBDConvolveScaleTest, Check) { Run(); } +TEST_P(HighBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); } + +INSTANTIATE_TEST_SUITE_P( + SSE4_1, HighBDConvolveScaleTest, + ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_sse4_1), + ::testing::ValuesIn(kBlockDim), + ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps), + ::testing::Bool(), ::testing::ValuesIn(kBDs))); +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace diff --git a/libs/libaom/src/test/av1_encoder_parms_get_to_decoder.cc b/libs/libaom/src/test/av1_encoder_parms_get_to_decoder.cc new file mode 100644 index 000000000..76b82f58f --- /dev/null +++ b/libs/libaom/src/test/av1_encoder_parms_get_to_decoder.cc @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" + +#include "aom/aom_decoder.h" +#include "av1/decoder/decoder.h" + +namespace { + +const int kMaxPsnr = 100; + +struct ParamPassingTestVideo { + const char *name; + uint32_t width; + uint32_t height; + uint32_t bitrate; + int frames; +}; + +const ParamPassingTestVideo kAV1ParamPassingTestVector = { + "niklas_1280_720_30.y4m", 1280, 720, 600, 3 +}; + +struct EncodeParameters { + int32_t lossless; + aom_color_primaries_t color_primaries; + aom_transfer_characteristics_t transfer_characteristics; + aom_matrix_coefficients_t matrix_coefficients; + aom_color_range_t color_range; + aom_chroma_sample_position_t chroma_sample_position; + int32_t render_size[2]; +}; + +const EncodeParameters kAV1EncodeParameterSet[] = { + { 1, + AOM_CICP_CP_BT_709, + AOM_CICP_TC_BT_709, + AOM_CICP_MC_BT_709, + AOM_CR_STUDIO_RANGE, + AOM_CSP_UNKNOWN, + { 0, 0 } }, + { 0, + AOM_CICP_CP_BT_470_M, + AOM_CICP_TC_BT_470_M, + AOM_CICP_MC_BT_470_B_G, + AOM_CR_FULL_RANGE, + AOM_CSP_VERTICAL, + { 0, 0 } }, + { 1, + AOM_CICP_CP_BT_601, + AOM_CICP_TC_BT_601, + AOM_CICP_MC_BT_601, + AOM_CR_STUDIO_RANGE, + AOM_CSP_COLOCATED, + { 0, 0 } }, + { 0, + AOM_CICP_CP_BT_2020, + AOM_CICP_TC_BT_2020_10_BIT, + AOM_CICP_MC_BT_2020_NCL, + AOM_CR_FULL_RANGE, + AOM_CSP_RESERVED, + { 640, 480 } }, +}; + +class AVxEncoderParmsGetToDecoder + : public ::libaom_test::EncoderTest, + public ::libaom_test::CodecTestWithParam { + protected: + AVxEncoderParmsGetToDecoder() + : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {} + + virtual ~AVxEncoderParmsGetToDecoder() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(::libaom_test::kTwoPassGood); + cfg_.g_lag_in_frames = 25; + test_video_ = kAV1ParamPassingTestVector; + cfg_.rc_target_bitrate = test_video_.bitrate; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AV1E_SET_COLOR_PRIMARIES, encode_parms.color_primaries); + encoder->Control(AV1E_SET_TRANSFER_CHARACTERISTICS, + encode_parms.transfer_characteristics); + encoder->Control(AV1E_SET_MATRIX_COEFFICIENTS, + encode_parms.matrix_coefficients); + encoder->Control(AV1E_SET_COLOR_RANGE, encode_parms.color_range); + encoder->Control(AV1E_SET_CHROMA_SAMPLE_POSITION, + encode_parms.chroma_sample_position); + encoder->Control(AV1E_SET_LOSSLESS, encode_parms.lossless); + if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0) { + encoder->Control(AV1E_SET_RENDER_SIZE, encode_parms.render_size); + } + } + } + + virtual void DecompressedFrameHook(const aom_image_t &img, + aom_codec_pts_t pts) { + (void)pts; + if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0) { + EXPECT_EQ(encode_parms.render_size[0], (int)img.r_w); + EXPECT_EQ(encode_parms.render_size[1], (int)img.r_h); + } + EXPECT_EQ(encode_parms.color_primaries, img.cp); + EXPECT_EQ(encode_parms.transfer_characteristics, img.tc); + EXPECT_EQ(encode_parms.matrix_coefficients, img.mc); + EXPECT_EQ(encode_parms.color_range, img.range); + EXPECT_EQ(encode_parms.chroma_sample_position, img.csp); + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + if (encode_parms.lossless) { + EXPECT_EQ(kMaxPsnr, pkt->data.psnr.psnr[0]); + } + } + + virtual bool HandleDecodeResult(const aom_codec_err_t res_dec, + 
libaom_test::Decoder *decoder) { + EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError(); + return AOM_CODEC_OK == res_dec; + } + + ParamPassingTestVideo test_video_; + + private: + EncodeParameters encode_parms; +}; + +TEST_P(AVxEncoderParmsGetToDecoder, BitstreamParms) { + init_flags_ = AOM_CODEC_USE_PSNR; + + std::unique_ptr<libaom_test::Y4mVideoSource> video( + new libaom_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames)); + ASSERT_TRUE(video.get() != NULL); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +AV1_INSTANTIATE_TEST_CASE(AVxEncoderParmsGetToDecoder, + ::testing::ValuesIn(kAV1EncodeParameterSet)); +} // namespace diff --git a/libs/libaom/src/test/av1_ext_tile_test.cc b/libs/libaom/src/test/av1_ext_tile_test.cc new file mode 100644 index 000000000..424d2f065 --- /dev/null +++ b/libs/libaom/src/test/av1_ext_tile_test.cc @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <string.h> +#include <string> +#include <vector> +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/md5_helper.h" +#include "test/util.h" + +namespace { +// The number of frames to be encoded/decoded +const int kLimit = 8; +// Skip 1 frame to check the frame decoding independence. +const int kSkip = 5; +const int kTileSize = 1; +const int kTIleSizeInPixels = (kTileSize << 6); +// Fake width and height so that they can be multiples of the tile size. +const int kImgWidth = 704; +const int kImgHeight = 576; + +// This test tests large scale tile coding case. Non-large-scale tile coding +// is tested by the tile_independence test. +class AV1ExtTileTest + : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>, + public ::libaom_test::EncoderTest { + protected: + AV1ExtTileTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + set_cpu_used_(GET_PARAM(2)) { + init_flags_ = AOM_CODEC_USE_PSNR; + aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t(); + cfg.w = kImgWidth; + cfg.h = kImgHeight; + cfg.allow_lowbitdepth = 1; + + decoder_ = codec_->CreateDecoder(cfg, 0); + decoder_->Control(AV1_SET_TILE_MODE, 1); + decoder_->Control(AV1D_EXT_TILE_DEBUG, 1); + decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1); + decoder_->Control(AV1_SET_DECODE_TILE_COL, -1); + + // Allocate buffer to store tile image. 
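+ // (Note, inferred from FramePktHook() and TestRoundTrip() below: this + // image is reassembled tile by tile from single-tile decodes of the last + // frame, and its MD5 is pushed into tile_md5_ for comparison against md5_, + // the digests of the whole-frame decodes; the final argument of + // aom_img_alloc() is the requested buffer alignment in bytes.)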
+ aom_img_alloc(&tile_img_, AOM_IMG_FMT_I420, kImgWidth, kImgHeight, 32); + + md5_.clear(); + tile_md5_.clear(); + } + + virtual ~AV1ExtTileTest() { + aom_img_free(&tile_img_); + delete decoder_; + } + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = AOM_VBR; + cfg_.g_error_resilient = 1; + + cfg_.rc_max_quantizer = 56; + cfg_.rc_min_quantizer = 0; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + // Encode setting + encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0); + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + + // TODO(yunqingwang): test single_tile_decoding = 0. + encoder->Control(AV1E_SET_SINGLE_TILE_DECODING, 1); + // Always use 64x64 max partition. + encoder->Control(AV1E_SET_SUPERBLOCK_SIZE, AOM_SUPERBLOCK_SIZE_64X64); + // Set tile_columns and tile_rows to MAX values, which guarantees the tile + // size of 64 x 64 pixels(i.e. 1 SB) for <= 4k resolution. + encoder->Control(AV1E_SET_TILE_COLUMNS, 6); + encoder->Control(AV1E_SET_TILE_ROWS, 6); + } + + if (video->frame() == 1) { + frame_flags_ = + AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF; + } + } + + virtual void DecompressedFrameHook(const aom_image_t &img, + aom_codec_pts_t pts) { + // Skip 1 already decoded frame to be consistent with the decoder in this + // test. + if (pts == (aom_codec_pts_t)kSkip) return; + + // Calculate MD5 as the reference. + ::libaom_test::MD5 md5_res; + md5_res.Add(&img); + md5_.push_back(md5_res.Get()); + } + + virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) { + // Skip decoding 1 frame. + if (pkt->data.frame.pts == (aom_codec_pts_t)kSkip) return; + + bool IsLastFrame = (pkt->data.frame.pts == (aom_codec_pts_t)(kLimit - 1)); + + // Decode the first (kLimit - 1) frames as whole frame, and decode the last + // frame in single tiles. + for (int r = 0; r < kImgHeight / kTIleSizeInPixels; ++r) { + for (int c = 0; c < kImgWidth / kTIleSizeInPixels; ++c) { + if (!IsLastFrame) { + decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1); + decoder_->Control(AV1_SET_DECODE_TILE_COL, -1); + } else { + decoder_->Control(AV1_SET_DECODE_TILE_ROW, r); + decoder_->Control(AV1_SET_DECODE_TILE_COL, c); + } + + const aom_codec_err_t res = decoder_->DecodeFrame( + reinterpret_cast(pkt->data.frame.buf), + pkt->data.frame.sz); + if (res != AOM_CODEC_OK) { + abort_ = true; + ASSERT_EQ(AOM_CODEC_OK, res); + } + const aom_image_t *img = decoder_->GetDxData().Next(); + + if (!IsLastFrame) { + if (img) { + ::libaom_test::MD5 md5_res; + md5_res.Add(img); + tile_md5_.push_back(md5_res.Get()); + } + break; + } + + const int kMaxMBPlane = 3; + for (int plane = 0; plane < kMaxMBPlane; ++plane) { + const int shift = (plane == 0) ? 
0 : 1; + int tile_height = kTIleSizeInPixels >> shift; + int tile_width = kTIleSizeInPixels >> shift; + + for (int tr = 0; tr < tile_height; ++tr) { + memcpy(tile_img_.planes[plane] + + tile_img_.stride[plane] * (r * tile_height + tr) + + c * tile_width, + img->planes[plane] + img->stride[plane] * tr, tile_width); + } + } + } + + if (!IsLastFrame) break; + } + + if (IsLastFrame) { + ::libaom_test::MD5 md5_res; + md5_res.Add(&tile_img_); + tile_md5_.push_back(md5_res.Get()); + } + } + + void TestRoundTrip() { + ::libaom_test::I420VideoSource video( + "hantro_collage_w352h288.yuv", kImgWidth, kImgHeight, 30, 1, 0, kLimit); + cfg_.rc_target_bitrate = 500; + cfg_.g_error_resilient = AOM_ERROR_RESILIENT_DEFAULT; + cfg_.large_scale_tile = 1; + cfg_.g_lag_in_frames = 0; + cfg_.g_threads = 1; + + // Tile encoding + init_flags_ = AOM_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Compare to check if two vectors are equal. + ASSERT_EQ(md5_, tile_md5_); + } + + ::libaom_test::TestMode encoding_mode_; + int set_cpu_used_; + ::libaom_test::Decoder *decoder_; + aom_image_t tile_img_; + std::vector md5_; + std::vector tile_md5_; +}; + +TEST_P(AV1ExtTileTest, DecoderResultTest) { TestRoundTrip(); } + +AV1_INSTANTIATE_TEST_CASE( + // Now only test 2-pass mode. + AV1ExtTileTest, ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::Range(1, 4)); + +class AV1ExtTileTestLarge : public AV1ExtTileTest {}; + +TEST_P(AV1ExtTileTestLarge, DecoderResultTest) { TestRoundTrip(); } + +AV1_INSTANTIATE_TEST_CASE( + // Now only test 2-pass mode. + AV1ExtTileTestLarge, ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::Range(0, 1)); +} // namespace diff --git a/libs/libaom/src/test/av1_fwd_txfm1d_test.cc b/libs/libaom/src/test/av1_fwd_txfm1d_test.cc new file mode 100644 index 000000000..abc46ed5a --- /dev/null +++ b/libs/libaom/src/test/av1_fwd_txfm1d_test.cc @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/encoder/av1_fwd_txfm1d.h" +#include "test/av1_txfm_test.h" + +using libaom_test::ACMRandom; +using libaom_test::input_base; +using libaom_test::reference_hybrid_1d; +using libaom_test::TYPE_ADST; +using libaom_test::TYPE_DCT; +using libaom_test::TYPE_IDTX; +using libaom_test::TYPE_TXFM; + +namespace { +const int txfm_type_num = 3; +const TYPE_TXFM txfm_type_ls[txfm_type_num] = { TYPE_DCT, TYPE_ADST, + TYPE_IDTX }; + +const int txfm_size_num = 5; + +const int txfm_size_ls[] = { 4, 8, 16, 32, 64 }; + +const TxfmFunc fwd_txfm_func_ls[][txfm_type_num] = { + { av1_fdct4, av1_fadst4, av1_fidentity4_c }, + { av1_fdct8, av1_fadst8, av1_fidentity8_c }, + { av1_fdct16, av1_fadst16, av1_fidentity16_c }, + { av1_fdct32, NULL, av1_fidentity32_c }, + { av1_fdct64, NULL, NULL }, +}; + +// the maximum stage number of fwd/inv 1d dct/adst txfm is 12 +const int8_t cos_bit = 14; +const int8_t range_bit[12] = { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 }; + +TEST(av1_fwd_txfm1d, round_shift) { + EXPECT_EQ(round_shift(7, 1), 4); + EXPECT_EQ(round_shift(-7, 1), -3); + + EXPECT_EQ(round_shift(7, 2), 2); + EXPECT_EQ(round_shift(-7, 2), -2); + + EXPECT_EQ(round_shift(8, 2), 2); + EXPECT_EQ(round_shift(-8, 2), -2); +} + +TEST(av1_fwd_txfm1d, av1_cospi_arr_data) { + for (int i = 0; i < 7; i++) { + for (int j = 0; j < 64; j++) { + EXPECT_EQ(av1_cospi_arr_data[i][j], + (int32_t)round(cos(PI * j / 128) * (1 << (cos_bit_min + i)))); + } + } +} + +TEST(av1_fwd_txfm1d, accuracy) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int si = 0; si < txfm_size_num; ++si) { + int txfm_size = txfm_size_ls[si]; + int32_t *input = new int32_t[txfm_size]; + int32_t *output = new int32_t[txfm_size]; + double *ref_input = new double[txfm_size]; + double *ref_output = new double[txfm_size]; + + for (int ti = 0; ti < txfm_type_num; ++ti) { + TYPE_TXFM txfm_type = txfm_type_ls[ti]; + TxfmFunc fwd_txfm_func = fwd_txfm_func_ls[si][ti]; + int max_error = 7; + + const int count_test_block = 5000; + if (fwd_txfm_func != NULL) { + for (int ti = 0; ti < count_test_block; ++ti) { + for (int ni = 0; ni < txfm_size; ++ni) { + input[ni] = rnd.Rand16() % input_base - rnd.Rand16() % input_base; + ref_input[ni] = static_cast(input[ni]); + } + + fwd_txfm_func(input, output, cos_bit, range_bit); + reference_hybrid_1d(ref_input, ref_output, txfm_size, txfm_type); + + for (int ni = 0; ni < txfm_size; ++ni) { + ASSERT_LE( + abs(output[ni] - static_cast(round(ref_output[ni]))), + max_error) + << "tx size = " << txfm_size << ", tx type = " << txfm_type; + } + } + } + } + + delete[] input; + delete[] output; + delete[] ref_input; + delete[] ref_output; + } +} +} // namespace diff --git a/libs/libaom/src/test/av1_fwd_txfm2d_test.cc b/libs/libaom/src/test/av1_fwd_txfm2d_test.cc new file mode 100644 index 000000000..dd6066576 --- /dev/null +++ b/libs/libaom/src/test/av1_fwd_txfm2d_test.cc @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <tuple> +#include <vector> + +#include "config/av1_rtcd.h" + +#include "test/acm_random.h" +#include "test/util.h" +#include "test/av1_txfm_test.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/hybrid_fwd_txfm.h" + +using libaom_test::ACMRandom; +using libaom_test::bd; +using libaom_test::compute_avg_abs_error; +using libaom_test::input_base; +using libaom_test::TYPE_TXFM; + +using std::vector; + +namespace { +// tx_type_, tx_size_, max_error_, max_avg_error_ +typedef std::tuple<TX_TYPE, TX_SIZE, double, double> AV1FwdTxfm2dParam; + +class AV1FwdTxfm2d : public ::testing::TestWithParam<AV1FwdTxfm2dParam> { + public: + virtual void SetUp() { + tx_type_ = GET_PARAM(0); + tx_size_ = GET_PARAM(1); + max_error_ = GET_PARAM(2); + max_avg_error_ = GET_PARAM(3); + count_ = 500; + TXFM_2D_FLIP_CFG fwd_txfm_flip_cfg; + av1_get_fwd_txfm_cfg(tx_type_, tx_size_, &fwd_txfm_flip_cfg); + amplify_factor_ = libaom_test::get_amplification_factor(tx_type_, tx_size_); + tx_width_ = tx_size_wide[fwd_txfm_flip_cfg.tx_size]; + tx_height_ = tx_size_high[fwd_txfm_flip_cfg.tx_size]; + ud_flip_ = fwd_txfm_flip_cfg.ud_flip; + lr_flip_ = fwd_txfm_flip_cfg.lr_flip; + + fwd_txfm_ = libaom_test::fwd_txfm_func_ls[tx_size_]; + txfm2d_size_ = tx_width_ * tx_height_; + input_ = reinterpret_cast<int16_t *>( + aom_memalign(16, sizeof(input_[0]) * txfm2d_size_)); + output_ = reinterpret_cast<int32_t *>( + aom_memalign(16, sizeof(output_[0]) * txfm2d_size_)); + ref_input_ = reinterpret_cast<double *>( + aom_memalign(16, sizeof(ref_input_[0]) * txfm2d_size_)); + ref_output_ = reinterpret_cast<double *>( + aom_memalign(16, sizeof(ref_output_[0]) * txfm2d_size_)); + } + + void RunFwdAccuracyCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + double avg_abs_error = 0; + for (int ci = 0; ci < count_; ci++) { + for (int ni = 0; ni < txfm2d_size_; ++ni) { + input_[ni] = rnd.Rand16() % input_base; + ref_input_[ni] = static_cast<double>(input_[ni]); + output_[ni] = 0; + ref_output_[ni] = 0; + } + + fwd_txfm_(input_, output_, tx_width_, tx_type_, bd); + + if (lr_flip_ && ud_flip_) { + libaom_test::fliplrud(ref_input_, tx_width_, tx_height_, tx_width_); + } else if (lr_flip_) { + libaom_test::fliplr(ref_input_, tx_width_, tx_height_, tx_width_); + } else if (ud_flip_) { + libaom_test::flipud(ref_input_, tx_width_, tx_height_, tx_width_); + } + + libaom_test::reference_hybrid_2d(ref_input_, ref_output_, tx_type_, + tx_size_); + + double actual_max_error = 0; + for (int ni = 0; ni < txfm2d_size_; ++ni) { + ref_output_[ni] = round(ref_output_[ni]); + const double this_error = + fabs(output_[ni] - ref_output_[ni]) / amplify_factor_; + actual_max_error = AOMMAX(actual_max_error, this_error); + } + EXPECT_GE(max_error_, actual_max_error) + << "tx_size = " << tx_size_ << ", tx_type = " << tx_type_; + if (actual_max_error > max_error_) { // exit early. 
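+ // (The EXPECT_GE above already records the failure; this break only stops + // accumulating avg_abs_error for a transform that has exceeded its + // per-coefficient error budget, so the remaining iterations are skipped.)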
+ break; + } + + avg_abs_error += compute_avg_abs_error( + output_, ref_output_, txfm2d_size_); + } + + avg_abs_error /= amplify_factor_; + avg_abs_error /= count_; + EXPECT_GE(max_avg_error_, avg_abs_error) + << "tx_size = " << tx_size_ << ", tx_type = " << tx_type_; + } + + virtual void TearDown() { + aom_free(input_); + aom_free(output_); + aom_free(ref_input_); + aom_free(ref_output_); + } + + private: + double max_error_; + double max_avg_error_; + int count_; + double amplify_factor_; + TX_TYPE tx_type_; + TX_SIZE tx_size_; + int tx_width_; + int tx_height_; + int txfm2d_size_; + FwdTxfm2dFunc fwd_txfm_; + int16_t *input_; + int32_t *output_; + double *ref_input_; + double *ref_output_; + int ud_flip_; // flip upside down + int lr_flip_; // flip left to right +}; + +static double avg_error_ls[TX_SIZES_ALL] = { + 0.5, // 4x4 transform + 0.5, // 8x8 transform + 1.2, // 16x16 transform + 6.1, // 32x32 transform + 3.4, // 64x64 transform + 0.57, // 4x8 transform + 0.68, // 8x4 transform + 0.92, // 8x16 transform + 1.1, // 16x8 transform + 4.1, // 16x32 transform + 6, // 32x16 transform + 3.5, // 32x64 transform + 5.7, // 64x32 transform + 0.6, // 4x16 transform + 0.9, // 16x4 transform + 1.2, // 8x32 transform + 1.7, // 32x8 transform + 2.0, // 16x64 transform + 4.7, // 64x16 transform +}; + +static double max_error_ls[TX_SIZES_ALL] = { + 3, // 4x4 transform + 5, // 8x8 transform + 11, // 16x16 transform + 70, // 32x32 transform + 64, // 64x64 transform + 3.9, // 4x8 transform + 4.3, // 8x4 transform + 12, // 8x16 transform + 12, // 16x8 transform + 32, // 16x32 transform + 46, // 32x16 transform + 136, // 32x64 transform + 136, // 64x32 transform + 5, // 4x16 transform + 6, // 16x4 transform + 21, // 8x32 transform + 13, // 32x8 transform + 30, // 16x64 transform + 36, // 64x16 transform +}; + +vector GetTxfm2dParamList() { + vector param_list; + for (int s = 0; s < TX_SIZES; ++s) { + const double max_error = max_error_ls[s]; + const double avg_error = avg_error_ls[s]; + for (int t = 0; t < TX_TYPES; ++t) { + const TX_TYPE tx_type = static_cast(t); + const TX_SIZE tx_size = static_cast(s); + if (libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) { + param_list.push_back( + AV1FwdTxfm2dParam(tx_type, tx_size, max_error, avg_error)); + } + } + } + return param_list; +} + +INSTANTIATE_TEST_SUITE_P(C, AV1FwdTxfm2d, + ::testing::ValuesIn(GetTxfm2dParamList())); + +TEST_P(AV1FwdTxfm2d, RunFwdAccuracyCheck) { RunFwdAccuracyCheck(); } + +TEST(AV1FwdTxfm2d, CfgTest) { + for (int bd_idx = 0; bd_idx < BD_NUM; ++bd_idx) { + int bd = libaom_test::bd_arr[bd_idx]; + int8_t low_range = libaom_test::low_range_arr[bd_idx]; + int8_t high_range = libaom_test::high_range_arr[bd_idx]; + for (int tx_size = 0; tx_size < TX_SIZES_ALL; ++tx_size) { + for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) { + if (libaom_test::IsTxSizeTypeValid(static_cast(tx_size), + static_cast(tx_type)) == + false) { + continue; + } + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(static_cast(tx_type), + static_cast(tx_size), &cfg); + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + av1_gen_fwd_stage_range(stage_range_col, stage_range_row, &cfg, bd); + libaom_test::txfm_stage_range_check(stage_range_col, cfg.stage_num_col, + cfg.cos_bit_col, low_range, + high_range); + libaom_test::txfm_stage_range_check(stage_range_row, cfg.stage_num_row, + cfg.cos_bit_row, low_range, + high_range); + } + } + } +} + +typedef void (*lowbd_fwd_txfm_func)(const int16_t *src_diff, tran_low_t *coeff, + int 
diff_stride, TxfmParam *txfm_param); + +void AV1FwdTxfm2dMatchTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) { + const int bd = 8; + TxfmParam param; + memset(¶m, 0, sizeof(param)); + const int rows = tx_size_high[tx_size]; + const int cols = tx_size_wide[tx_size]; + // printf("%d x %d\n", cols, rows); + for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) { + if (libaom_test::IsTxSizeTypeValid( + tx_size, static_cast(tx_type)) == false) { + continue; + } + + FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size]; + if (ref_func != NULL) { + DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 }; + DECLARE_ALIGNED(32, int32_t, output[64 * 64]); + DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]); + int input_stride = 64; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int cnt = 0; cnt < 500; ++cnt) { + if (cnt == 0) { + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + input[r * input_stride + c] = (1 << bd) - 1; + } + } + } else { + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + input[r * input_stride + c] = rnd.Rand16() % (1 << bd); + } + } + } + param.tx_type = (TX_TYPE)tx_type; + param.tx_size = (TX_SIZE)tx_size; + param.tx_set_type = EXT_TX_SET_ALL16; + param.bd = bd; + ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd); + target_func(input, output, input_stride, ¶m); + const int check_rows = AOMMIN(32, rows); + const int check_cols = AOMMIN(32, rows * cols / check_rows); + for (int r = 0; r < check_rows; ++r) { + for (int c = 0; c < check_cols; ++c) { + ASSERT_EQ(ref_output[r * check_cols + c], + output[r * check_cols + c]) + << "[" << r << "," << c << "] cnt:" << cnt + << " tx_size: " << tx_size << " tx_type: " << tx_type; + } + } + } + } + } +} + +void AV1FwdTxfm2dSpeedTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) { + TxfmParam param; + memset(¶m, 0, sizeof(param)); + const int rows = tx_size_high[tx_size]; + const int cols = tx_size_wide[tx_size]; + const int num_loops = 1000000 / (rows * cols); + + for (int i = 0; i < 2; ++i) { + const int bd = 8; + for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) { + if (libaom_test::IsTxSizeTypeValid( + tx_size, static_cast(tx_type)) == false) { + continue; + } + + FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size]; + if (ref_func != NULL) { + DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 }; + DECLARE_ALIGNED(32, int32_t, output[64 * 64]); + DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]); + int input_stride = 64; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + input[r * input_stride + c] = rnd.Rand16() % (1 << bd); + } + } + + param.tx_type = (TX_TYPE)tx_type; + param.tx_size = (TX_SIZE)tx_size; + param.tx_set_type = EXT_TX_SET_ALL16; + param.bd = bd; + + aom_usec_timer ref_timer, test_timer; + + aom_usec_timer_start(&ref_timer); + for (int i = 0; i < num_loops; ++i) { + ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int i = 0; i < num_loops; ++i) { + target_func(input, output, input_stride, ¶m); + } + aom_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast(aom_usec_timer_elapsed(&test_timer)); + + printf( + "txfm_size[%d] \t txfm_type[%d] \t c_time=%d \t simd_time=%d \t " + "gain=%d \n", + tx_size, tx_type, elapsed_time_c, elapsed_time_simd, + 
(elapsed_time_c / elapsed_time_simd)); + } + } + } +} + +typedef std::tuple LbdFwdTxfm2dParam; + +class AV1FwdTxfm2dTest : public ::testing::TestWithParam {}; + +TEST_P(AV1FwdTxfm2dTest, match) { + AV1FwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1)); +} +TEST_P(AV1FwdTxfm2dTest, DISABLED_Speed) { + AV1FwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1)); +} +using ::testing::Combine; +using ::testing::Values; +using ::testing::ValuesIn; + +#if HAVE_SSE2 +static TX_SIZE fwd_txfm_for_sse2[] = { + TX_4X4, + TX_8X8, + TX_16X16, + TX_32X32, + // TX_64X64, + TX_4X8, + TX_8X4, + TX_8X16, + TX_16X8, + TX_16X32, + TX_32X16, + // TX_32X64, + // TX_64X32, + TX_4X16, + TX_16X4, + TX_8X32, + TX_32X8, + TX_16X64, + TX_64X16, +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, AV1FwdTxfm2dTest, + Combine(ValuesIn(fwd_txfm_for_sse2), + Values(av1_lowbd_fwd_txfm_sse2))); +#endif // HAVE_SSE2 + +#if HAVE_SSE4_1 +static TX_SIZE fwd_txfm_for_sse41[] = { + TX_4X4, + TX_64X64, + TX_32X64, + TX_64X32, +}; + +INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1FwdTxfm2dTest, + Combine(ValuesIn(fwd_txfm_for_sse41), + Values(av1_lowbd_fwd_txfm_sse4_1))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +static TX_SIZE fwd_txfm_for_avx2[] = { + TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4, + TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16, + TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16, +}; + +INSTANTIATE_TEST_SUITE_P(AVX2, AV1FwdTxfm2dTest, + Combine(ValuesIn(fwd_txfm_for_avx2), + Values(av1_lowbd_fwd_txfm_avx2))); +#endif // HAVE_AVX2 + +typedef void (*Highbd_fwd_txfm_func)(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param); + +void AV1HighbdFwdTxfm2dMatchTest(TX_SIZE tx_size, + Highbd_fwd_txfm_func target_func) { + const int bd_ar[2] = { 10, 12 }; + TxfmParam param; + memset(¶m, 0, sizeof(param)); + const int rows = tx_size_high[tx_size]; + const int cols = tx_size_wide[tx_size]; + for (int i = 0; i < 2; ++i) { + const int bd = bd_ar[i]; + for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) { + if (libaom_test::IsTxSizeTypeValid( + tx_size, static_cast(tx_type)) == false) { + continue; + } + + FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size]; + if (ref_func != NULL) { + DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 }; + DECLARE_ALIGNED(32, int32_t, output[64 * 64]); + DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]); + int input_stride = 64; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int cnt = 0; cnt < 500; ++cnt) { + if (cnt == 0) { + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + input[r * input_stride + c] = (1 << bd) - 1; + } + } + } else { + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + input[r * input_stride + c] = rnd.Rand16() % (1 << bd); + } + } + } + param.tx_type = (TX_TYPE)tx_type; + param.tx_size = (TX_SIZE)tx_size; + param.tx_set_type = EXT_TX_SET_ALL16; + param.bd = bd; + + ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd); + target_func(input, output, input_stride, ¶m); + const int check_rows = AOMMIN(32, rows); + const int check_cols = AOMMIN(32, rows * cols / check_rows); + for (int r = 0; r < check_rows; ++r) { + for (int c = 0; c < check_cols; ++c) { + ASSERT_EQ(ref_output[r * check_cols + c], + output[r * check_cols + c]) + << "[" << r << "," << c << "] cnt:" << cnt + << " tx_size: " << tx_size << " tx_type: " << tx_type; + } + } + } + } + } + } +} + +void AV1HighbdFwdTxfm2dSpeedTest(TX_SIZE tx_size, + Highbd_fwd_txfm_func target_func) { + const int bd_ar[2] = { 10, 12 }; + 
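// (Illustrative note: num_loops below is scaled as 1000000 / (rows * cols), + // so every transform size performs roughly the same total pixel work and + // the printed C-vs-SIMD timings stay comparable across sizes; bd_ar holds + // the two high bit depths exercised, 10 and 12.) +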
TxfmParam param; + memset(¶m, 0, sizeof(param)); + const int rows = tx_size_high[tx_size]; + const int cols = tx_size_wide[tx_size]; + const int num_loops = 1000000 / (rows * cols); + + for (int i = 0; i < 2; ++i) { + const int bd = bd_ar[i]; + for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) { + if (libaom_test::IsTxSizeTypeValid( + tx_size, static_cast(tx_type)) == false) { + continue; + } + + FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size]; + if (ref_func != NULL) { + DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 }; + DECLARE_ALIGNED(32, int32_t, output[64 * 64]); + DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]); + int input_stride = 64; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + input[r * input_stride + c] = rnd.Rand16() % (1 << bd); + } + } + + param.tx_type = (TX_TYPE)tx_type; + param.tx_size = (TX_SIZE)tx_size; + param.tx_set_type = EXT_TX_SET_ALL16; + param.bd = bd; + + aom_usec_timer ref_timer, test_timer; + + aom_usec_timer_start(&ref_timer); + for (int i = 0; i < num_loops; ++i) { + ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int i = 0; i < num_loops; ++i) { + target_func(input, output, input_stride, ¶m); + } + aom_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast(aom_usec_timer_elapsed(&test_timer)); + + printf( + "txfm_size[%d] \t txfm_type[%d] \t c_time=%d \t simd_time=%d \t " + "gain=%d \n", + tx_size, tx_type, elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / elapsed_time_simd)); + } + } + } +} + +typedef std::tuple HighbdFwdTxfm2dParam; + +class AV1HighbdFwdTxfm2dTest + : public ::testing::TestWithParam {}; + +TEST_P(AV1HighbdFwdTxfm2dTest, match) { + AV1HighbdFwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1)); +} + +TEST_P(AV1HighbdFwdTxfm2dTest, DISABLED_Speed) { + AV1HighbdFwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1)); +} + +using ::testing::Combine; +using ::testing::Values; +using ::testing::ValuesIn; + +#if HAVE_SSE4_1 +static TX_SIZE Highbd_fwd_txfm_for_sse4_1[] = { + TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4, + TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16, + TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16, +}; + +INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdFwdTxfm2dTest, + Combine(ValuesIn(Highbd_fwd_txfm_for_sse4_1), + Values(av1_highbd_fwd_txfm))); +#endif // HAVE_SSE4_1 +#if HAVE_AVX2 +static TX_SIZE Highbd_fwd_txfm_for_avx2[] = { TX_8X8, TX_16X16, TX_32X32, + TX_64X64, TX_8X16, TX_16X8 }; + +INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdFwdTxfm2dTest, + Combine(ValuesIn(Highbd_fwd_txfm_for_avx2), + Values(av1_highbd_fwd_txfm))); +#endif // HAVE_AVX2 +} // namespace diff --git a/libs/libaom/src/test/av1_highbd_iht_test.cc b/libs/libaom/src/test/av1_highbd_iht_test.cc new file mode 100644 index 000000000..8fea500db --- /dev/null +++ b/libs/libaom/src/test/av1_highbd_iht_test.cc @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tuple> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/av1_rtcd.h" + +#include "test/acm_random.h" +#include "test/av1_txfm_test.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "av1/common/enums.h" +#include "av1/common/scan.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +namespace { + +using libaom_test::ACMRandom; +using std::tuple; + +typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd); + +typedef void (*IHbdHtFunc)(const int32_t *coeff, uint16_t *output, int stride, + TX_TYPE tx_type, int bd); +static const char *tx_type_name[] = { + "DCT_DCT", + "ADST_DCT", + "DCT_ADST", + "ADST_ADST", + "FLIPADST_DCT", + "DCT_FLIPADST", + "FLIPADST_FLIPADST", + "ADST_FLIPADST", + "FLIPADST_ADST", + "IDTX", + "V_DCT", + "H_DCT", + "V_ADST", + "H_ADST", + "V_FLIPADST", + "H_FLIPADST", +}; +// Test parameter argument list: +// <txfm_ref_, inv_txfm_, inv_txfm_ref_, num_coeffs_, tx_type_, bit_depth_> +typedef tuple<HbdHtFunc, IHbdHtFunc, IHbdHtFunc, int, TX_TYPE, int> IHbdHtParam; + +class AV1HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> { + public: + virtual ~AV1HighbdInvHTNxN() {} + + virtual void SetUp() { + txfm_ref_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + inv_txfm_ref_ = GET_PARAM(2); + num_coeffs_ = GET_PARAM(3); + tx_type_ = GET_PARAM(4); + bit_depth_ = GET_PARAM(5); + + input_ = reinterpret_cast<int16_t *>( + aom_memalign(16, sizeof(input_[0]) * num_coeffs_)); + + // Note: + // Inverse transform input buffer is 32-byte aligned + // Refer to /av1/encoder/context_tree.c, function, + // void alloc_mode_context(). + coeffs_ = reinterpret_cast<int32_t *>( + aom_memalign(32, sizeof(coeffs_[0]) * num_coeffs_)); + output_ = reinterpret_cast<uint16_t *>( + aom_memalign(32, sizeof(output_[0]) * num_coeffs_)); + output_ref_ = reinterpret_cast<uint16_t *>( + aom_memalign(32, sizeof(output_ref_[0]) * num_coeffs_)); + } + + virtual void TearDown() { + aom_free(input_); + aom_free(coeffs_); + aom_free(output_); + aom_free(output_ref_); + libaom_test::ClearSystemState(); + } + + protected: + void RunBitexactCheck(); + + private: + int GetStride() const { + if (16 == num_coeffs_) { + return 4; + } else if (64 == num_coeffs_) { + return 8; + } else if (256 == num_coeffs_) { + return 16; + } else if (1024 == num_coeffs_) { + return 32; + } else if (4096 == num_coeffs_) { + return 64; + } else { + return 0; + } + } + + HbdHtFunc txfm_ref_; + IHbdHtFunc inv_txfm_; + IHbdHtFunc inv_txfm_ref_; + int num_coeffs_; + TX_TYPE tx_type_; + int bit_depth_; + + int16_t *input_; + int32_t *coeffs_; + uint16_t *output_; + uint16_t *output_ref_; +}; + +void AV1HighbdInvHTNxN::RunBitexactCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int stride = GetStride(); + const int num_tests = 20000; + const uint16_t mask = (1 << bit_depth_) - 1; + + for (int i = 0; i < num_tests; ++i) { + for (int j = 0; j < num_coeffs_; ++j) { + input_[j] = (rnd.Rand16() & mask) - (rnd.Rand16() & mask); + output_ref_[j] = rnd.Rand16() & mask; + output_[j] = output_ref_[j]; + } + + txfm_ref_(input_, coeffs_, stride, tx_type_, bit_depth_); + inv_txfm_ref_(coeffs_, output_ref_, stride, tx_type_, bit_depth_); + ASM_REGISTER_STATE_CHECK( + inv_txfm_(coeffs_, output_, stride, tx_type_, bit_depth_)); + + for (int j = 0; j < num_coeffs_; ++j) { + EXPECT_EQ(output_ref_[j], output_[j]) + << "Not bit-exact result at index: " << j << " At test 
block: " << i; + } + } +} + +TEST_P(AV1HighbdInvHTNxN, InvTransResultCheck) { RunBitexactCheck(); } + +using std::make_tuple; + +#if HAVE_SSE4_1 +#define PARAM_LIST_4X4 \ + &av1_fwd_txfm2d_4x4_c, &av1_inv_txfm2d_add_4x4_sse4_1, \ + &av1_inv_txfm2d_add_4x4_c, 16 + +const IHbdHtParam kArrayIhtParam[] = { + // 4x4 + make_tuple(PARAM_LIST_4X4, DCT_DCT, 10), + make_tuple(PARAM_LIST_4X4, DCT_DCT, 12), + make_tuple(PARAM_LIST_4X4, ADST_DCT, 10), + make_tuple(PARAM_LIST_4X4, ADST_DCT, 12), + make_tuple(PARAM_LIST_4X4, DCT_ADST, 10), + make_tuple(PARAM_LIST_4X4, DCT_ADST, 12), + make_tuple(PARAM_LIST_4X4, ADST_ADST, 10), + make_tuple(PARAM_LIST_4X4, ADST_ADST, 12), + make_tuple(PARAM_LIST_4X4, FLIPADST_DCT, 10), + make_tuple(PARAM_LIST_4X4, FLIPADST_DCT, 12), + make_tuple(PARAM_LIST_4X4, DCT_FLIPADST, 10), + make_tuple(PARAM_LIST_4X4, DCT_FLIPADST, 12), + make_tuple(PARAM_LIST_4X4, FLIPADST_FLIPADST, 10), + make_tuple(PARAM_LIST_4X4, FLIPADST_FLIPADST, 12), + make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 10), + make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 12), + make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 10), + make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 12), +}; + +INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdInvHTNxN, + ::testing::ValuesIn(kArrayIhtParam)); +#endif // HAVE_SSE4_1 + +typedef void (*HighbdInvTxfm2dFunc)(const int32_t *input, uint8_t *output, + int stride, const TxfmParam *txfm_param); + +typedef std::tuple AV1HighbdInvTxfm2dParam; +class AV1HighbdInvTxfm2d + : public ::testing::TestWithParam { + public: + virtual void SetUp() { target_func_ = GET_PARAM(0); } + void RunAV1InvTxfm2dTest(TX_TYPE tx_type, TX_SIZE tx_size, int run_times, + int bit_depth, int gt_int16 = 0); + + private: + HighbdInvTxfm2dFunc target_func_; +}; + +void AV1HighbdInvTxfm2d::RunAV1InvTxfm2dTest(TX_TYPE tx_type_, TX_SIZE tx_size_, + int run_times, int bit_depth_, + int gt_int16) { + FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size_]; + TxfmParam txfm_param; + const int BLK_WIDTH = 64; + const int BLK_SIZE = BLK_WIDTH * BLK_WIDTH; + DECLARE_ALIGNED(16, int16_t, input[BLK_SIZE]) = { 0 }; + DECLARE_ALIGNED(32, int32_t, inv_input[BLK_SIZE]) = { 0 }; + DECLARE_ALIGNED(32, uint16_t, output[BLK_SIZE]) = { 0 }; + DECLARE_ALIGNED(32, uint16_t, ref_output[BLK_SIZE]) = { 0 }; + int stride = BLK_WIDTH; + int rows = tx_size_high[tx_size_]; + int cols = tx_size_wide[tx_size_]; + const int rows_nonezero = AOMMIN(32, rows); + const int cols_nonezero = AOMMIN(32, cols); + const uint16_t mask = (1 << bit_depth_) - 1; + run_times /= (rows * cols); + run_times = AOMMAX(1, run_times); + const SCAN_ORDER *scan_order = get_default_scan(tx_size_, tx_type_); + const int16_t *scan = scan_order->scan; + const int16_t eobmax = rows_nonezero * cols_nonezero; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + int randTimes = run_times == 1 ? 
+      run_times == 1 ? (eobmax) : 1;
+
+  txfm_param.tx_type = tx_type_;
+  txfm_param.tx_size = tx_size_;
+  txfm_param.lossless = 0;
+  txfm_param.bd = bit_depth_;
+  txfm_param.is_hbd = 1;
+  txfm_param.tx_set_type = EXT_TX_SET_ALL16;
+
+  for (int cnt = 0; cnt < randTimes; ++cnt) {
+    for (int r = 0; r < BLK_WIDTH; ++r) {
+      for (int c = 0; c < BLK_WIDTH; ++c) {
+        input[r * cols + c] = (rnd.Rand16() & mask) - (rnd.Rand16() & mask);
+        output[r * stride + c] = rnd.Rand16() & mask;
+
+        ref_output[r * stride + c] = output[r * stride + c];
+      }
+    }
+    fwd_func_(input, inv_input, stride, tx_type_, bit_depth_);
+
+    // produce eob input by setting high freq coeffs to zero
+    const int eob = AOMMIN(cnt + 1, eobmax);
+    for (int i = eob; i < eobmax; i++) {
+      inv_input[scan[i]] = 0;
+    }
+    txfm_param.eob = eob;
+    if (gt_int16) {
+      const uint16_t inv_input_mask =
+          static_cast<uint16_t>((1 << (bit_depth_ + 7)) - 1);
+      for (int i = 0; i < eob; i++) {
+        inv_input[scan[i]] = (rnd.Rand31() & inv_input_mask);
+      }
+    }
+
+    aom_usec_timer ref_timer, test_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_highbd_inv_txfm_add_c(inv_input, CONVERT_TO_BYTEPTR(ref_output),
+                                stride, &txfm_param);
+    }
+    aom_usec_timer_mark(&ref_timer);
+    const int elapsed_time_c =
+        static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+    aom_usec_timer_start(&test_timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(inv_input, CONVERT_TO_BYTEPTR(output), stride, &txfm_param);
+    }
+    aom_usec_timer_mark(&test_timer);
+    const int elapsed_time_simd =
+        static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+    if (run_times > 10) {
+      printf(
+          "txfm_size[%d] \t txfm_type[%d] \t c_time=%d \t simd_time=%d \t "
+          "gain=%d \n",
+          tx_size_, tx_type_, elapsed_time_c, elapsed_time_simd,
+          (elapsed_time_c / elapsed_time_simd));
+    } else {
+      for (int r = 0; r < rows; ++r) {
+        for (int c = 0; c < cols; ++c) {
+          ASSERT_EQ(ref_output[r * stride + c], output[r * stride + c])
+              << "[" << r << "," << c << "] " << cnt
+              << " tx_size: " << static_cast<int>(tx_size_)
+              << " bit_depth_: " << bit_depth_
+              << " tx_type: " << tx_type_name[tx_type_] << " eob " << eob;
+        }
+      }
+    }
+  }
+}
+
+TEST_P(AV1HighbdInvTxfm2d, match) {
+  int bitdepth_ar[3] = { 8, 10, 12 };
+  for (int k = 0; k < 3; ++k) {
+    int bd = bitdepth_ar[k];
+    for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+      for (int i = 0; i < (int)TX_TYPES; ++i) {
+        if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(j),
+                                           static_cast<TX_TYPE>(i))) {
+          RunAV1InvTxfm2dTest(static_cast<TX_TYPE>(i), static_cast<TX_SIZE>(j),
+                              1, bd);
+        }
+      }
+    }
+  }
+}
+
+TEST_P(AV1HighbdInvTxfm2d, gt_int16) {
+  int bitdepth_ar[3] = { 8, 10, 12 };
+  static const TX_TYPE types[] = {
+    DCT_DCT, ADST_DCT, FLIPADST_DCT, IDTX, V_DCT, H_DCT, H_ADST, H_FLIPADST
+  };
+  for (int k = 0; k < 3; ++k) {
+    int bd = bitdepth_ar[k];
+    for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+      const TX_SIZE sz = static_cast<TX_SIZE>(j);
+      for (uint8_t i = 0; i < sizeof(types) / sizeof(TX_TYPE); ++i) {
+        const TX_TYPE tp = types[i];
+        if (libaom_test::IsTxSizeTypeValid(sz, tp)) {
+          RunAV1InvTxfm2dTest(tp, sz, 1, bd, 1);
+        }
+      }
+    }
+  }
+}
+
+TEST_P(AV1HighbdInvTxfm2d, DISABLED_Speed) {
+  int bitdepth_ar[2] = { 10, 12 };
+  for (int k = 0; k < 2; ++k) {
+    int bd = bitdepth_ar[k];
+    for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+      for (int i = 0; i < (int)TX_TYPES; ++i) {
+        if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(j),
+                                           static_cast<TX_TYPE>(i))) {
+          RunAV1InvTxfm2dTest(static_cast<TX_TYPE>(i), static_cast<TX_SIZE>(j),
+                              1000000, bd);
+        }
+      }
+    }
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdInvTxfm2d,
+                         ::testing::Values(av1_highbd_inv_txfm_add_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdInvTxfm2d,
+                         ::testing::Values(av1_highbd_inv_txfm_add_avx2));
+#endif
+}  // namespace
diff --git a/libs/libaom/src/test/av1_horz_only_frame_superres_test.cc b/libs/libaom/src/test/av1_horz_only_frame_superres_test.cc
new file mode 100644
index 000000000..115fc84c0
--- /dev/null
+++ b/libs/libaom/src/test/av1_horz_only_frame_superres_test.cc
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+const int kTestIters = 10;
+const int kPerfIters = 1000;
+
+const int kVPad = 32;
+const int kHPad = 32;
+
+using libaom_test::ACMRandom;
+using std::make_tuple;
+using std::tuple;
+
+template <typename Pixel>
+class TestImage {
+ public:
+  TestImage(int w_src, int h, int superres_denom, int x0, int bd)
+      : w_src_(w_src), h_(h), superres_denom_(superres_denom), x0_(x0),
+        bd_(bd) {
+    assert(bd < 16);
+    assert(bd <= 8 * static_cast<int>(sizeof(Pixel)));
+    assert(9 <= superres_denom && superres_denom <= 16);
+    assert(SCALE_NUMERATOR == 8);
+    assert(0 <= x0_ && x0_ <= RS_SCALE_SUBPEL_MASK);
+
+    w_dst_ = w_src_;
+    av1_calculate_unscaled_superres_size(&w_dst_, NULL, superres_denom);
+
+    src_stride_ = ALIGN_POWER_OF_TWO(w_src_ + 2 * kHPad, 4);
+    dst_stride_ = ALIGN_POWER_OF_TWO(w_dst_ + 2 * kHPad, 4);
+
+    // Allocate image data
+    src_data_.resize(2 * src_block_size());
+    dst_data_.resize(2 * dst_block_size());
+  }
+
+  void Initialize(ACMRandom *rnd);
+  void Check() const;
+
+  int src_stride() const { return src_stride_; }
+  int dst_stride() const { return dst_stride_; }
+
+  int src_block_size() const { return (h_ + 2 * kVPad) * src_stride(); }
+  int dst_block_size() const { return (h_ + 2 * kVPad) * dst_stride(); }
+
+  int src_width() const { return w_src_; }
+  int dst_width() const { return w_dst_; }
+  int height() const { return h_; }
+  int x0() const { return x0_; }
+
+  const Pixel *GetSrcData(bool ref, bool borders) const {
+    const Pixel *block = &src_data_[ref ? 0 : src_block_size()];
+    return borders ? block : block + kHPad + src_stride_ * kVPad;
+  }
+
+  Pixel *GetDstData(bool ref, bool borders) {
+    Pixel *block = &dst_data_[ref ? 0 : dst_block_size()];
+    return borders ? block : block + kHPad + dst_stride_ * kVPad;
+  }
+
+ private:
+  int w_src_, w_dst_, h_, superres_denom_, x0_, bd_;
+  int src_stride_, dst_stride_;
+
+  std::vector<Pixel> src_data_;
+  std::vector<Pixel> dst_data_;
+};
+
+template <typename Pixel>
+void FillEdge(ACMRandom *rnd, int num_pixels, int bd, bool trash, Pixel *data) {
+  if (!trash) {
+    memset(data, 0, sizeof(*data) * num_pixels);
+    return;
+  }
+  const Pixel mask = (1 << bd) - 1;
+  for (int i = 0; i < num_pixels; ++i) data[i] = rnd->Rand16() & mask;
+}
+
+template <typename Pixel>
+void PrepBuffers(ACMRandom *rnd, int w, int h, int stride, int bd,
+                 bool trash_edges, Pixel *data) {
+  assert(rnd);
+  const Pixel mask = (1 << bd) - 1;
+
+  // Fill in the first buffer with random data
+  // Top border
+  FillEdge(rnd, stride * kVPad, bd, trash_edges, data);
+  for (int r = 0; r < h; ++r) {
+    Pixel *row_data = data + (kVPad + r) * stride;
+    // Left border, contents, right border
+    FillEdge(rnd, kHPad, bd, trash_edges, row_data);
+    for (int c = 0; c < w; ++c) row_data[kHPad + c] = rnd->Rand16() & mask;
+    FillEdge(rnd, kHPad, bd, trash_edges, row_data + kHPad + w);
+  }
+  // Bottom border
+  FillEdge(rnd, stride * kVPad, bd, trash_edges, data + stride * (kVPad + h));
+
+  const int bpp = sizeof(*data);
+  const int block_elts = stride * (h + 2 * kVPad);
+  const int block_size = bpp * block_elts;
+
+  // Now copy that to the second buffer
+  memcpy(data + block_elts, data, block_size);
+}
+
+template <typename Pixel>
+void TestImage<Pixel>::Initialize(ACMRandom *rnd) {
+  PrepBuffers(rnd, w_src_, h_, src_stride_, bd_, false, &src_data_[0]);
+  PrepBuffers(rnd, w_dst_, h_, dst_stride_, bd_, true, &dst_data_[0]);
+}
+
+template <typename Pixel>
+void TestImage<Pixel>::Check() const {
+  const int num_pixels = dst_block_size();
+  const Pixel *ref_dst = &dst_data_[0];
+  const Pixel *tst_dst = &dst_data_[num_pixels];
+
+  // If memcmp returns 0, there's nothing to do.
+  if (0 == memcmp(ref_dst, tst_dst, sizeof(*ref_dst) * num_pixels)) return;
+
+  // Otherwise, iterate through the buffer looking for differences, *ignoring
+  // the edges*
+  const int stride = dst_stride_;
+  for (int r = kVPad; r < h_ + kVPad; ++r) {
+    for (int c = kHPad; c < w_dst_ + kHPad; ++c) {
+      const int32_t ref_value = ref_dst[r * stride + c];
+      const int32_t tst_value = tst_dst[r * stride + c];
+
+      EXPECT_EQ(tst_value, ref_value)
+          << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad)
+          << ", superres_denom: " << superres_denom_ << ", height: " << h_
+          << ", src_width: " << w_src_ << ", dst_width: " << w_dst_
+          << ", x0: " << x0_;
+    }
+  }
+}
+
+template <typename Pixel>
+class ConvolveHorizRSTestBase : public ::testing::Test {
+ public:
+  ConvolveHorizRSTestBase() : image_(NULL) {}
+  virtual ~ConvolveHorizRSTestBase() {}
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+  // Implemented by subclasses (SetUp depends on the parameters passed
+  // in and RunOne depends on the function to be tested. These can't
+  // be templated for low/high bit depths because they have different
+  // numbers of parameters)
+  virtual void SetUp() = 0;
+  virtual void RunOne(bool ref) = 0;
+
+ protected:
+  void SetBitDepth(int bd) { bd_ = bd; }
+
+  void CorrectnessTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    for (int i = 0; i < kTestIters; ++i) {
+      for (int superres_denom = 9; superres_denom <= 16; superres_denom++) {
+        // Get a random height between 512 and 767
+        int height = rnd.Rand8() + 512;
+
+        // Get a random src width between 128 and 383
+        int width_src = rnd.Rand8() + 128;
+
+        // x0 is normally calculated by get_upscale_convolve_x0 in
+        // av1/common/resize.c.
+        // However, this test should work for any value of x0 between 0 and
+        // RS_SCALE_SUBPEL_MASK (inclusive), so we choose one at random.
+        int x0 = rnd.Rand16() % (RS_SCALE_SUBPEL_MASK + 1);
+
+        image_ =
+            new TestImage<Pixel>(width_src, height, superres_denom, x0, bd_);
+
+        Prep(&rnd);
+        RunOne(true);
+        RunOne(false);
+        image_->Check();
+
+        delete image_;
+      }
+    }
+  }
+
+  void SpeedTest() {
+    // Pick some specific parameters to test
+    int height = 767;
+    int width_src = 129;
+    int superres_denom = 13;
+    int x0 = RS_SCALE_SUBPEL_MASK >> 1;
+
+    image_ = new TestImage<Pixel>(width_src, height, superres_denom, x0, bd_);
+
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    Prep(&rnd);
+
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int i = 0; i < kPerfIters; ++i) RunOne(true);
+    aom_usec_timer_mark(&ref_timer);
+    const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+    aom_usec_timer tst_timer;
+    aom_usec_timer_start(&tst_timer);
+    for (int i = 0; i < kPerfIters; ++i) RunOne(false);
+    aom_usec_timer_mark(&tst_timer);
+    const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+    std::cout << "[          ] C time = " << ref_time / 1000
+              << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+    EXPECT_GT(ref_time, tst_time)
+        << "Error: ConvolveHorizRSTest (Speed Test), SIMD slower than C.\n"
+        << "C time: " << ref_time << " us\n"
+        << "SIMD time: " << tst_time << " us\n";
+  }
+
+  void Prep(ACMRandom *rnd) {
+    assert(rnd);
+    image_->Initialize(rnd);
+  }
+
+  int bd_;
+  TestImage<Pixel> *image_;
+};
+
+typedef void (*LowBDConvolveHorizRsFunc)(const uint8_t *src, int src_stride,
+                                         uint8_t *dst, int dst_stride, int w,
+                                         int h, const int16_t *x_filters,
+                                         const int x0_qn, const int x_step_qn);
+
+// Test parameter list:
+//   <tst_fun_>
+typedef tuple<LowBDConvolveHorizRsFunc> LowBDParams;
+
+class LowBDConvolveHorizRSTest
+    : public ConvolveHorizRSTestBase<uint8_t>,
+      public ::testing::WithParamInterface<LowBDParams> {
+ public:
+  virtual ~LowBDConvolveHorizRSTest() {}
+
+  void SetUp() {
+    tst_fun_ = GET_PARAM(0);
+    const int bd = 8;
+    SetBitDepth(bd);
+  }
+
+  void RunOne(bool ref) {
+    const uint8_t *src = image_->GetSrcData(ref, false);
+    uint8_t *dst = image_->GetDstData(ref, false);
+    const int src_stride = image_->src_stride();
+    const int dst_stride = image_->dst_stride();
+    const int width_src = image_->src_width();
+    const int width_dst = image_->dst_width();
+    const int height = image_->height();
+    const int x0_qn = image_->x0();
+
+    const int32_t x_step_qn =
+        av1_get_upscale_convolve_step(width_src, width_dst);
+
+    if (ref) {
+      av1_convolve_horiz_rs_c(src, src_stride, dst, dst_stride, width_dst,
+                              height, &av1_resize_filter_normative[0][0], x0_qn,
+                              x_step_qn);
+    } else {
+      tst_fun_(src, src_stride, dst, dst_stride, width_dst, height,
+               &av1_resize_filter_normative[0][0], x0_qn, x_step_qn);
+    }
+  }
+
+ private:
+  LowBDConvolveHorizRsFunc tst_fun_;
+};
+
+TEST_P(LowBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
+TEST_P(LowBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, LowBDConvolveHorizRSTest,
+                         ::testing::Values(av1_convolve_horiz_rs_sse4_1));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef void (*HighBDConvolveHorizRsFunc)(const uint16_t *src, int src_stride,
+                                          uint16_t *dst, int dst_stride, int w,
+                                          int h, const int16_t *x_filters,
+                                          const int x0_qn, const int x_step_qn,
+                                          int bd);
+
+// Test parameter list:
+//   <tst_fun_, bd>
+typedef tuple<HighBDConvolveHorizRsFunc, int> HighBDParams;
+
+class HighBDConvolveHorizRSTest
+    : public ConvolveHorizRSTestBase<uint16_t>,
+      public ::testing::WithParamInterface<HighBDParams> {
+ public:
+  virtual ~HighBDConvolveHorizRSTest() {}
+
+  void SetUp() {
+    tst_fun_ = GET_PARAM(0);
+    const int bd = GET_PARAM(1);
+    SetBitDepth(bd);
+  }
+
+  void RunOne(bool ref) {
+    const uint16_t *src = image_->GetSrcData(ref, false);
+    uint16_t *dst = image_->GetDstData(ref, false);
+    const int src_stride = image_->src_stride();
+    const int dst_stride = image_->dst_stride();
+    const int width_src = image_->src_width();
+    const int width_dst = image_->dst_width();
+    const int height = image_->height();
+    const int x0_qn = image_->x0();
+
+    const int32_t x_step_qn =
+        av1_get_upscale_convolve_step(width_src, width_dst);
+
+    if (ref) {
+      av1_highbd_convolve_horiz_rs_c(
+          src, src_stride, dst, dst_stride, width_dst, height,
+          &av1_resize_filter_normative[0][0], x0_qn, x_step_qn, bd_);
+    } else {
+      tst_fun_(src, src_stride, dst, dst_stride, width_dst, height,
+               &av1_resize_filter_normative[0][0], x0_qn, x_step_qn, bd_);
+    }
+  }
+
+ private:
+  HighBDConvolveHorizRsFunc tst_fun_;
+};
+
+const int kBDs[] = { 8, 10, 12 };
+
+TEST_P(HighBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
+TEST_P(HighBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, HighBDConvolveHorizRSTest,
+    ::testing::Combine(::testing::Values(av1_highbd_convolve_horiz_rs_sse4_1),
+                       ::testing::ValuesIn(kBDs)));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+}  // namespace
diff --git a/libs/libaom/src/test/av1_inv_txfm1d_test.cc b/libs/libaom/src/test/av1_inv_txfm1d_test.cc
new file mode 100644
index 000000000..01d4a4d7f
--- /dev/null
+++ b/libs/libaom/src/test/av1_inv_txfm1d_test.cc
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "test/av1_txfm_test.h"
+#include "test/util.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+
+typedef TX_SIZE TxSize;
+
+using libaom_test::ACMRandom;
+using libaom_test::input_base;
+
+namespace {
+const int txfm_type_num = 2;
+const int txfm_size_ls[] = { 4, 8, 16, 32, 64 };
+
+const TxfmFunc fwd_txfm_func_ls[][txfm_type_num] = {
+  { av1_fdct4, av1_fadst4 },   { av1_fdct8, av1_fadst8 },
+  { av1_fdct16, av1_fadst16 }, { av1_fdct32, NULL },
+  { av1_fdct64, NULL },
+};
+
+const TxfmFunc inv_txfm_func_ls[][txfm_type_num] = {
+  { av1_idct4, av1_iadst4 },   { av1_idct8, av1_iadst8 },
+  { av1_idct16, av1_iadst16 }, { av1_idct32, NULL },
+  { av1_idct64, NULL },
+};
+
+// the maximum stage number of fwd/inv 1d dct/adst txfm is 12
+const int8_t cos_bit = 13;
+const int8_t range_bit[12] = { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 };
+
+void reference_idct_1d_int(const int32_t *in, int32_t *out, int size) {
+  double input[64];
+  for (int i = 0; i < size; ++i) input[i] = in[i];
+
+  double output[64];
+  libaom_test::reference_idct_1d(input, output, size);
+
+  for (int i = 0; i < size; ++i) {
+    ASSERT_GE(output[i], INT32_MIN);
+    ASSERT_LE(output[i], INT32_MAX);
+    out[i] = static_cast<int32_t>(round(output[i]));
+  }
+}
+
+void random_matrix(int32_t *dst, int len, ACMRandom *rnd) {
+  const int bits = 16;
+  const int maxVal = (1 << (bits - 1)) - 1;
+  const int minVal = -(1 << (bits - 1));
+  for (int i = 0; i < len; ++i) {
+    if (rnd->Rand8() % 10)
+      dst[i] = minVal + rnd->Rand16() % (1 << bits);
+    else
+      dst[i] = rnd->Rand8() % 2 ? minVal : maxVal;
+  }
+}
+
+TEST(av1_inv_txfm1d, InvAccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 20000;
+  const int max_error[] = { 6, 10, 19, 31, 40 };
+  ASSERT_EQ(NELEMENTS(max_error), TX_SIZES);
+  ASSERT_EQ(NELEMENTS(inv_txfm_func_ls), TX_SIZES);
+  for (int k = 0; k < count_test_block; ++k) {
+    // choose a random transform to test
+    const TxSize tx_size = static_cast<TxSize>(rnd.Rand8() % TX_SIZES);
+    const int tx_size_pix = txfm_size_ls[tx_size];
+    const TxfmFunc inv_txfm_func = inv_txfm_func_ls[tx_size][0];
+
+    int32_t input[64];
+    random_matrix(input, tx_size_pix, &rnd);
+
+    // 64x64 transform assumes last 32 values are zero.
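+    // (The bitstream carries at most the lowest 32 frequency coefficients of
+    // a 64-point transform; the upper half is always zero, so the inverse
+    // transform is exercised here under the same assumption.)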
+    memset(input + 32, 0, 32 * sizeof(input[0]));
+
+    int32_t ref_output[64];
+    memset(ref_output, 0, sizeof(ref_output));
+    reference_idct_1d_int(input, ref_output, tx_size_pix);
+
+    int32_t output[64];
+    memset(output, 0, sizeof(output));
+    inv_txfm_func(input, output, cos_bit, range_bit);
+
+    for (int i = 0; i < tx_size_pix; ++i) {
+      EXPECT_LE(abs(output[i] - ref_output[i]), max_error[tx_size])
+          << "tx_size = " << tx_size << ", i = " << i
+          << ", output[i] = " << output[i]
+          << ", ref_output[i] = " << ref_output[i];
+    }
+  }
+}
+
+static INLINE int get_max_bit(int x) {
+  int max_bit = -1;
+  while (x) {
+    x = x >> 1;
+    max_bit++;
+  }
+  return max_bit;
+}
+
+TEST(av1_inv_txfm1d, get_max_bit) {
+  int max_bit = get_max_bit(8);
+  EXPECT_EQ(max_bit, 3);
+}
+
+TEST(av1_inv_txfm1d, round_trip) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  for (int si = 0; si < NELEMENTS(fwd_txfm_func_ls); ++si) {
+    int txfm_size = txfm_size_ls[si];
+
+    for (int ti = 0; ti < txfm_type_num; ++ti) {
+      TxfmFunc fwd_txfm_func = fwd_txfm_func_ls[si][ti];
+      TxfmFunc inv_txfm_func = inv_txfm_func_ls[si][ti];
+      int max_error = 2;
+
+      if (!fwd_txfm_func) continue;
+
+      const int count_test_block = 5000;
+      for (int ci = 0; ci < count_test_block; ++ci) {
+        int32_t input[64];
+        int32_t output[64];
+        int32_t round_trip_output[64];
+
+        ASSERT_LE(txfm_size, NELEMENTS(input));
+
+        for (int ni = 0; ni < txfm_size; ++ni) {
+          input[ni] = rnd.Rand16() % input_base - rnd.Rand16() % input_base;
+        }
+
+        fwd_txfm_func(input, output, cos_bit, range_bit);
+        inv_txfm_func(output, round_trip_output, cos_bit, range_bit);
+
+        for (int ni = 0; ni < txfm_size; ++ni) {
+          int node_err =
+              abs(input[ni] - round_shift(round_trip_output[ni],
+                                          get_max_bit(txfm_size) - 1));
+          EXPECT_LE(node_err, max_error);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace
diff --git a/libs/libaom/src/test/av1_inv_txfm2d_test.cc b/libs/libaom/src/test/av1_inv_txfm2d_test.cc
new file mode 100644
index 000000000..eacdf85d4
--- /dev/null
+++ b/libs/libaom/src/test/av1_inv_txfm2d_test.cc
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <tuple>
+#include <vector>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/scan.h"
+#include "test/acm_random.h"
+#include "test/av1_txfm_test.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::bd;
+using libaom_test::compute_avg_abs_error;
+using libaom_test::input_base;
+using libaom_test::InvTxfm2dFunc;
+using libaom_test::LbdInvTxfm2dFunc;
+
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::Values;
+
+using std::vector;
+
+typedef TX_TYPE TxType;
+typedef TX_SIZE TxSize;
+
+namespace {
+
+static const char *tx_type_name[] = {
+  "DCT_DCT",
+  "ADST_DCT",
+  "DCT_ADST",
+  "ADST_ADST",
+  "FLIPADST_DCT",
+  "DCT_FLIPADST",
+  "FLIPADST_FLIPADST",
+  "ADST_FLIPADST",
+  "FLIPADST_ADST",
+  "IDTX",
+  "V_DCT",
+  "H_DCT",
+  "V_ADST",
+  "H_ADST",
+  "V_FLIPADST",
+  "H_FLIPADST",
+};
+
+// AV1InvTxfm2dParam argument list:
+//   tx_type_, tx_size_, max_error_, max_avg_error_
+typedef std::tuple<TxType, TxSize, int, double> AV1InvTxfm2dParam;
+
+class AV1InvTxfm2d : public ::testing::TestWithParam<AV1InvTxfm2dParam> {
+ public:
+  virtual void SetUp() {
+    tx_type_ = GET_PARAM(0);
+    tx_size_ = GET_PARAM(1);
+    max_error_ = GET_PARAM(2);
+    max_avg_error_ = GET_PARAM(3);
+  }
+
+  void RunRoundtripCheck() {
+    int tx_w = tx_size_wide[tx_size_];
+    int tx_h = tx_size_high[tx_size_];
+    int txfm2d_size = tx_w * tx_h;
+    const FwdTxfm2dFunc fwd_txfm_func = libaom_test::fwd_txfm_func_ls[tx_size_];
+    const InvTxfm2dFunc inv_txfm_func = libaom_test::inv_txfm_func_ls[tx_size_];
+    double avg_abs_error = 0;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+    const int count = 500;
+
+    for (int ci = 0; ci < count; ci++) {
+      DECLARE_ALIGNED(16, int16_t, input[64 * 64]) = { 0 };
+      ASSERT_LE(txfm2d_size, NELEMENTS(input));
+
+      for (int ni = 0; ni < txfm2d_size; ++ni) {
+        if (ci == 0) {
+          int extreme_input = input_base - 1;
+          input[ni] = extreme_input;  // extreme case
+        } else {
+          input[ni] = rnd.Rand16() % input_base;
+        }
+      }
+
+      DECLARE_ALIGNED(16, uint16_t, expected[64 * 64]) = { 0 };
+      ASSERT_LE(txfm2d_size, NELEMENTS(expected));
+      if (TxfmUsesApproximation()) {
+        // Compare reference forward HT + inverse HT vs forward HT + inverse HT.
+        double ref_input[64 * 64];
+        ASSERT_LE(txfm2d_size, NELEMENTS(ref_input));
+        for (int ni = 0; ni < txfm2d_size; ++ni) {
+          ref_input[ni] = input[ni];
+        }
+        double ref_coeffs[64 * 64] = { 0 };
+        ASSERT_LE(txfm2d_size, NELEMENTS(ref_coeffs));
+        ASSERT_EQ(tx_type_, static_cast<TxType>(DCT_DCT));
+        libaom_test::reference_hybrid_2d(ref_input, ref_coeffs, tx_type_,
+                                         tx_size_);
+        DECLARE_ALIGNED(16, int32_t, ref_coeffs_int[64 * 64]) = { 0 };
+        ASSERT_LE(txfm2d_size, NELEMENTS(ref_coeffs_int));
+        for (int ni = 0; ni < txfm2d_size; ++ni) {
+          ref_coeffs_int[ni] = (int32_t)round(ref_coeffs[ni]);
+        }
+        inv_txfm_func(ref_coeffs_int, expected, tx_w, tx_type_, bd);
+      } else {
+        // Compare original input vs forward HT + inverse HT.
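+        // (Smaller transforms round-trip almost exactly, so the check is made
+        // directly against the original input; 64-point transforms discard
+        // half their coefficients and are handled by the approximation branch
+        // above.)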
+        for (int ni = 0; ni < txfm2d_size; ++ni) {
+          expected[ni] = input[ni];
+        }
+      }
+
+      DECLARE_ALIGNED(16, int32_t, coeffs[64 * 64]) = { 0 };
+      ASSERT_LE(txfm2d_size, NELEMENTS(coeffs));
+      fwd_txfm_func(input, coeffs, tx_w, tx_type_, bd);
+
+      DECLARE_ALIGNED(16, uint16_t, actual[64 * 64]) = { 0 };
+      ASSERT_LE(txfm2d_size, NELEMENTS(actual));
+      inv_txfm_func(coeffs, actual, tx_w, tx_type_, bd);
+
+      double actual_max_error = 0;
+      for (int ni = 0; ni < txfm2d_size; ++ni) {
+        const double this_error = abs(expected[ni] - actual[ni]);
+        actual_max_error = AOMMAX(actual_max_error, this_error);
+      }
+      EXPECT_GE(max_error_, actual_max_error)
+          << " tx_w: " << tx_w << " tx_h " << tx_h << " tx_type: " << tx_type_;
+      if (actual_max_error > max_error_) {  // exit early.
+        break;
+      }
+      avg_abs_error += compute_avg_abs_error<uint16_t, uint16_t>(
+          expected, actual, txfm2d_size);
+    }
+
+    avg_abs_error /= count;
+    EXPECT_GE(max_avg_error_, avg_abs_error)
+        << " tx_w: " << tx_w << " tx_h " << tx_h << " tx_type: " << tx_type_;
+  }
+
+ private:
+  bool TxfmUsesApproximation() {
+    if (tx_size_wide[tx_size_] == 64 || tx_size_high[tx_size_] == 64) {
+      return true;
+    }
+    return false;
+  }
+
+  int max_error_;
+  double max_avg_error_;
+  TxType tx_type_;
+  TxSize tx_size_;
+};
+
+static int max_error_ls[TX_SIZES_ALL] = {
+  2,  // 4x4 transform
+  2,  // 8x8 transform
+  2,  // 16x16 transform
+  4,  // 32x32 transform
+  3,  // 64x64 transform
+  2,  // 4x8 transform
+  2,  // 8x4 transform
+  2,  // 8x16 transform
+  2,  // 16x8 transform
+  3,  // 16x32 transform
+  3,  // 32x16 transform
+  5,  // 32x64 transform
+  5,  // 64x32 transform
+  2,  // 4x16 transform
+  2,  // 16x4 transform
+  2,  // 8x32 transform
+  2,  // 32x8 transform
+  3,  // 16x64 transform
+  3,  // 64x16 transform
+};
+
+static double avg_error_ls[TX_SIZES_ALL] = {
+  0.002,  // 4x4 transform
+  0.05,   // 8x8 transform
+  0.07,   // 16x16 transform
+  0.4,    // 32x32 transform
+  0.3,    // 64x64 transform
+  0.02,   // 4x8 transform
+  0.02,   // 8x4 transform
+  0.04,   // 8x16 transform
+  0.07,   // 16x8 transform
+  0.4,    // 16x32 transform
+  0.5,    // 32x16 transform
+  0.38,   // 32x64 transform
+  0.39,   // 64x32 transform
+  0.2,    // 4x16 transform
+  0.2,    // 16x4 transform
+  0.2,    // 8x32 transform
+  0.2,    // 32x8 transform
+  0.38,   // 16x64 transform
+  0.38,   // 64x16 transform
+};
+
+vector<AV1InvTxfm2dParam> GetInvTxfm2dParamList() {
+  vector<AV1InvTxfm2dParam> param_list;
+  for (int s = 0; s < TX_SIZES; ++s) {
+    const int max_error = max_error_ls[s];
+    const double avg_error = avg_error_ls[s];
+    for (int t = 0; t < TX_TYPES; ++t) {
+      const TxType tx_type = static_cast<TxType>(t);
+      const TxSize tx_size = static_cast<TxSize>(s);
+      if (libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) {
+        param_list.push_back(
+            AV1InvTxfm2dParam(tx_type, tx_size, max_error, avg_error));
+      }
+    }
+  }
+  return param_list;
+}
+
+INSTANTIATE_TEST_SUITE_P(C, AV1InvTxfm2d,
+                         ::testing::ValuesIn(GetInvTxfm2dParamList()));
+
+TEST_P(AV1InvTxfm2d, RunRoundtripCheck) { RunRoundtripCheck(); }
+
+TEST(AV1InvTxfm2d, CfgTest) {
+  for (int bd_idx = 0; bd_idx < BD_NUM; ++bd_idx) {
+    int bd = libaom_test::bd_arr[bd_idx];
+    int8_t low_range = libaom_test::low_range_arr[bd_idx];
+    int8_t high_range = libaom_test::high_range_arr[bd_idx];
+    for (int tx_size = 0; tx_size < TX_SIZES_ALL; ++tx_size) {
+      for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+        if (libaom_test::IsTxSizeTypeValid(static_cast<TxSize>(tx_size),
+                                           static_cast<TxType>(tx_type)) ==
+            false) {
+          continue;
+        }
+        TXFM_2D_FLIP_CFG cfg;
+        av1_get_inv_txfm_cfg(static_cast<TxType>(tx_type),
+                             static_cast<TxSize>(tx_size), &cfg);
+        int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+        int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+        av1_gen_inv_stage_range(stage_range_col, stage_range_row, &cfg,
+                                static_cast<TxSize>(tx_size), bd);
+        libaom_test::txfm_stage_range_check(stage_range_col, cfg.stage_num_col,
+                                            cfg.cos_bit_col, low_range,
+                                            high_range);
+        libaom_test::txfm_stage_range_check(stage_range_row, cfg.stage_num_row,
+                                            cfg.cos_bit_row, low_range,
+                                            high_range);
+      }
+    }
+  }
+}
+
+typedef std::tuple<LbdInvTxfm2dFunc> AV1LbdInvTxfm2dParam;
+class AV1LbdInvTxfm2d : public ::testing::TestWithParam<AV1LbdInvTxfm2dParam> {
+ public:
+  virtual void SetUp() { target_func_ = GET_PARAM(0); }
+  void RunAV1InvTxfm2dTest(TxType tx_type, TxSize tx_size, int run_times,
+                           int gt_int16 = 0);
+
+ private:
+  LbdInvTxfm2dFunc target_func_;
+};
+
+void AV1LbdInvTxfm2d::RunAV1InvTxfm2dTest(TxType tx_type, TxSize tx_size,
+                                          int run_times, int gt_int16) {
+  FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size];
+  InvTxfm2dFunc ref_func_ = libaom_test::inv_txfm_func_ls[tx_size];
+  if (fwd_func_ == NULL || ref_func_ == NULL || target_func_ == NULL) {
+    return;
+  }
+  const int bd = 8;
+  const int BLK_WIDTH = 64;
+  const int BLK_SIZE = BLK_WIDTH * BLK_WIDTH;
+  DECLARE_ALIGNED(16, int16_t, input[BLK_SIZE]) = { 0 };
+  DECLARE_ALIGNED(32, int32_t, inv_input[BLK_SIZE]) = { 0 };
+  DECLARE_ALIGNED(16, uint8_t, output[BLK_SIZE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, ref_output[BLK_SIZE]) = { 0 };
+  int stride = BLK_WIDTH;
+  int rows = tx_size_high[tx_size];
+  int cols = tx_size_wide[tx_size];
+  const int rows_nonezero = AOMMIN(32, rows);
+  const int cols_nonezero = AOMMIN(32, cols);
+  run_times /= (rows * cols);
+  run_times = AOMMAX(1, run_times);
+  const SCAN_ORDER *scan_order = get_default_scan(tx_size, tx_type);
+  const int16_t *scan = scan_order->scan;
+  const int16_t eobmax = rows_nonezero * cols_nonezero;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int randTimes = run_times == 1 ? (eobmax + 500) : 1;
+
+  for (int cnt = 0; cnt < randTimes; ++cnt) {
+    const int16_t max_in = (1 << (bd)) - 1;
+    for (int r = 0; r < BLK_WIDTH; ++r) {
+      for (int c = 0; c < BLK_WIDTH; ++c) {
+        input[r * cols + c] = (cnt == 0) ? max_in : rnd.Rand8Extremes();
+        output[r * stride + c] = (cnt == 0) ? 128 : rnd.Rand8();
+        ref_output[r * stride + c] = output[r * stride + c];
+      }
+    }
+    fwd_func_(input, inv_input, stride, tx_type, bd);
+
+    // produce eob input by setting high freq coeffs to zero
+    const int eob = AOMMIN(cnt + 1, eobmax);
+    for (int i = eob; i < eobmax; i++) {
+      inv_input[scan[i]] = 0;
+    }
+    if (gt_int16) {
+      inv_input[scan[eob - 1]] = ((int32_t)INT16_MAX * 100 / 141);
+    }
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      ref_func_(inv_input, ref_output, stride, tx_type, bd);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(inv_input, output, stride, tx_type, tx_size, eob);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 10) {
+      printf("txfm[%d] %3dx%-3d:%7.2f/%7.2fns", tx_type, cols, rows, time1,
+             time2);
+      printf("(%3.2f)\n", time1 / time2);
+    }
+    for (int r = 0; r < rows; ++r) {
+      for (int c = 0; c < cols; ++c) {
+        uint8_t ref_value = static_cast<uint8_t>(ref_output[r * stride + c]);
+        if (ref_value != output[r * stride + c]) {
+          printf(" ");
+        }
+        ASSERT_EQ(ref_value, output[r * stride + c])
+            << "[" << r << "," << c << "] " << cnt
+            << " tx_size: " << static_cast<int>(tx_size)
+            << " tx_type: " << tx_type_name[tx_type] << " eob " << eob;
+      }
+    }
+  }
+}
+
+TEST_P(AV1LbdInvTxfm2d, match) {
+  for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+    for (int i = 0; i < (int)TX_TYPES; ++i) {
+      if (libaom_test::IsTxSizeTypeValid(static_cast<TxSize>(j),
+                                         static_cast<TxType>(i))) {
+        RunAV1InvTxfm2dTest(static_cast<TxType>(i), static_cast<TxSize>(j), 1);
+      }
+    }
+  }
+}
+
+TEST_P(AV1LbdInvTxfm2d, gt_int16) {
+  static const TxType types[] = { DCT_DCT, ADST_DCT, FLIPADST_DCT, IDTX,
+                                  V_DCT,   H_DCT,    H_ADST,       H_FLIPADST };
+  for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+    const TxSize sz = static_cast<TxSize>(j);
+    for (uint8_t i = 0; i < sizeof(types) / sizeof(types[0]); ++i) {
+      const TxType tp = types[i];
+      if (libaom_test::IsTxSizeTypeValid(sz, tp)) {
+        RunAV1InvTxfm2dTest(tp, sz, 1, 1);
+      }
+    }
+  }
+}
+
+TEST_P(AV1LbdInvTxfm2d, DISABLED_Speed) {
+  for (int j = 1; j < (int)(TX_SIZES_ALL); ++j) {
+    for (int i = 0; i < (int)TX_TYPES; ++i) {
+      if (libaom_test::IsTxSizeTypeValid(static_cast<TxSize>(j),
+                                         static_cast<TxType>(i))) {
+        RunAV1InvTxfm2dTest(static_cast<TxType>(i), static_cast<TxSize>(j),
+                            10000000);
+      }
+    }
+  }
+}
+
+#if HAVE_SSSE3
+#if defined(_MSC_VER) || defined(__SSSE3__)
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1LbdInvTxfm2d,
+                         ::testing::Values(av1_lowbd_inv_txfm2d_add_ssse3));
+#endif  // _MSC_VER || __SSSE3__
+#endif  // HAVE_SSSE3
+
+#if HAVE_AVX2
+extern "C" void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input,
+                                              uint8_t *output, int stride,
+                                              TxType tx_type, TxSize tx_size,
+                                              int eob);
+
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1LbdInvTxfm2d,
+                         ::testing::Values(av1_lowbd_inv_txfm2d_add_avx2));
+#endif  // HAVE_AVX2
+
+// TODO(yunqing): Re-enable this unit test for NEON version after the functions
+// are fixed.
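+// (As with the AVX2 variant above, the NEON kernel is forward-declared here
+// rather than pulled in from a header; presumably these low-bitdepth helpers
+// are not exposed through a public header, hence the extern "C" prototypes.)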
+#if HAVE_NEON
+extern "C" void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input,
+                                              uint8_t *output, int stride,
+                                              TX_TYPE tx_type, TX_SIZE tx_size,
+                                              int eob);
+
+INSTANTIATE_TEST_SUITE_P(NEON, AV1LbdInvTxfm2d,
+                         ::testing::Values(av1_lowbd_inv_txfm2d_add_neon));
+#endif  // HAVE_NEON
+
+}  // namespace
diff --git a/libs/libaom/src/test/av1_nn_predict_test.cc b/libs/libaom/src/test/av1_nn_predict_test.cc
new file mode 100644
index 000000000..c03cba8c5
--- /dev/null
+++ b/libs/libaom/src/test/av1_nn_predict_test.cc
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/ml.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+#include "test/util.h"
+#include "test/register_state_check.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+
+namespace {
+typedef void (*NnPredict_Func)(const float *const input_nodes,
+                               const NN_CONFIG *const nn_config,
+                               int reduce_prec, float *const output);
+
+typedef std::tuple<NnPredict_Func> NnPredictTestParam;
+
+const float epsilon = 1e-3f;  // Error threshold for functional equivalence
+
+class NnPredictTest : public ::testing::TestWithParam<NnPredictTestParam> {
+ public:
+  virtual void SetUp() {
+    const int MAX_NODES2 = NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER;
+    // Allocate two massive buffers on the heap for edge weights and node bias
+    // Then set-up the double-dimension arrays pointing into the big buffers
+    weights_buf = (float *)aom_malloc(MAX_NODES2 * (NN_MAX_HIDDEN_LAYERS + 1) *
+                                      sizeof(*weights_buf));
+    bias_buf =
+        (float *)aom_malloc(NN_MAX_NODES_PER_LAYER *
+                            (NN_MAX_HIDDEN_LAYERS + 1) * sizeof(*bias_buf));
+    ASSERT_NE(weights_buf, nullptr);
+    ASSERT_NE(bias_buf, nullptr);
+    for (int i = 0; i < NN_MAX_HIDDEN_LAYERS + 1; i++) {
+      weights[i] = &weights_buf[i * MAX_NODES2];
+      bias[i] = &bias_buf[i * NN_MAX_NODES_PER_LAYER];
+    }
+    target_func_ = GET_PARAM(0);
+  }
+  virtual void TearDown() {
+    aom_free(weights_buf);
+    aom_free(bias_buf);
+  }
+  void RunNnPredictTest(const NN_CONFIG *const shape);
+  void RunNnPredictSpeedTest(const NN_CONFIG *const shape, const int run_times);
+  void RunNnPredictTest_all(const NN_CONFIG *const shapes,
+                            const int num_shapes);
+  void RunNnPredictSpeedTest_all(const NN_CONFIG *const shapes,
+                                 const int num_shapes, const int run_times);
+
+ private:
+  NnPredict_Func target_func_;
+  libaom_test::ACMRandom rng_;
+  float *weights[NN_MAX_HIDDEN_LAYERS + 1] = { 0 };
+  float *bias[NN_MAX_HIDDEN_LAYERS + 1] = { 0 };
+  float *weights_buf = nullptr, *bias_buf = nullptr;
+};
+
+void NnPredictTest::RunNnPredictTest(const NN_CONFIG *const shape) {
+  libaom_test::ClearSystemState();
+  float inputs[NN_MAX_NODES_PER_LAYER] = { 0 };
+  float outputs_test[NN_MAX_NODES_PER_LAYER] = { 0 };
+  float outputs_ref[NN_MAX_NODES_PER_LAYER] = { 0 };
+
+  NN_CONFIG nn_config;
+  memcpy(&nn_config, shape, sizeof(nn_config));
+
+  char shape_str[32] = { 0 };
+  snprintf(shape_str, sizeof(shape_str), "%d", shape->num_inputs);
+  for (int layer = 0; layer < shape->num_hidden_layers; layer++)
+    snprintf(&shape_str[strlen(shape_str)],
+             sizeof(shape_str) - strlen(shape_str), "x%d",
+             shape->num_hidden_nodes[layer]);
+  snprintf(&shape_str[strlen(shape_str)], sizeof(shape_str) - strlen(shape_str),
+           "x%d", shape->num_outputs);
+
+  for (int i = 0; i < NN_MAX_HIDDEN_LAYERS + 1; i++) {
+    nn_config.weights[i] = weights[i];
+    nn_config.bias[i] = bias[i];
+  }
+
+  for (int iter = 0; iter < 10000 && !HasFatalFailure(); ++iter) {
+    for (int node = 0; node < shape->num_inputs; node++) {
+      inputs[node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+    }
+    for (int layer = 0; layer < shape->num_hidden_layers; layer++) {
+      for (int node = 0; node < NN_MAX_NODES_PER_LAYER; node++) {
+        bias[layer][node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+      }
+      for (int node = 0; node < NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER;
+           node++) {
+        weights[layer][node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+      }
+    }
+    // Now the outputs:
+    int layer = shape->num_hidden_layers;
+    for (int node = 0; node < NN_MAX_NODES_PER_LAYER; node++) {
+      bias[layer][node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+    }
+    for (int node = 0; node < NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER;
+         node++) {
+      weights[layer][node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+    }
+
+    av1_nn_predict_c(inputs, &nn_config, 0, outputs_ref);
+    target_func_(inputs, &nn_config, 0, outputs_test);
+    libaom_test::ClearSystemState();
+
+    for (int node = 0; node < shape->num_outputs; node++) {
+      if (outputs_ref[node] < epsilon) {
+        ASSERT_LE(outputs_test[node], epsilon)
+            << "Reference output was near-zero, test output was not ("
+            << shape_str << ")";
+      } else {
+        const float error = outputs_ref[node] - outputs_test[node];
+        const float relative_error = fabsf(error / outputs_ref[node]);
+        ASSERT_LE(relative_error, epsilon)
+            << "Excessive relative error between reference and test ("
+            << shape_str << ")";
+      }
+    }
+  }
+}
+
+void NnPredictTest::RunNnPredictSpeedTest(const NN_CONFIG *const shape,
+                                          const int run_times) {
+  libaom_test::ClearSystemState();
+  float inputs[NN_MAX_NODES_PER_LAYER] = { 0 };
+  float outputs_test[NN_MAX_NODES_PER_LAYER] = { 0 };
+  float outputs_ref[NN_MAX_NODES_PER_LAYER] = { 0 };
+
+  NN_CONFIG nn_config;
+  memcpy(&nn_config, shape, sizeof(nn_config));
+
+  for (int i = 0; i < NN_MAX_HIDDEN_LAYERS; i++) {
+    nn_config.weights[i] = weights[i];
+    nn_config.bias[i] = bias[i];
+  }
+  // Don't bother actually changing the values for inputs/weights/bias: it
+  // shouldn't make any difference for a speed test.
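+  // (The evaluation performs the same multiply-accumulate work regardless of
+  // the buffer contents, so zero-filled inputs time the same code path.)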
+
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_times; ++i) {
+    av1_nn_predict_c(inputs, &nn_config, 0, outputs_ref);
+  }
+  aom_usec_timer_mark(&timer);
+  const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_times; ++i) {
+    target_func_(inputs, &nn_config, 0, outputs_test);
+  }
+  aom_usec_timer_mark(&timer);
+  libaom_test::ClearSystemState();
+  const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+  printf("%d", shape->num_inputs);
+  for (int layer = 0; layer < shape->num_hidden_layers; layer++)
+    printf("x%d", shape->num_hidden_nodes[layer]);
+  printf("x%d: ", shape->num_outputs);
+  printf("%7.2f/%7.2fns (%3.2f)\n", time1, time2, time1 / time2);
+}
+
+// This is all the neural network shapes observed executed in a few different
+// runs of the encoder. It also conveniently covers all the kernels
+// implemented.
+static const NN_CONFIG shapes[] = {
+  { 10, 16, 1, { 64 }, { 0 }, { 0 } }, { 12, 1, 1, { 12 }, { 0 }, { 0 } },
+  { 12, 1, 1, { 24 }, { 0 }, { 0 } },  { 12, 1, 1, { 32 }, { 0 }, { 0 } },
+  { 18, 4, 1, { 24 }, { 0 }, { 0 } },  { 18, 4, 1, { 32 }, { 0 }, { 0 } },
+  { 4, 1, 1, { 16 }, { 0 }, { 0 } },   { 8, 1, 1, { 16 }, { 0 }, { 0 } },
+  { 8, 4, 1, { 16 }, { 0 }, { 0 } },   { 8, 1, 1, { 24 }, { 0 }, { 0 } },
+  { 8, 1, 1, { 32 }, { 0 }, { 0 } },   { 8, 1, 1, { 64 }, { 0 }, { 0 } },
+  { 9, 3, 1, { 32 }, { 0 }, { 0 } },   { 4, 4, 1, { 8 }, { 0 }, { 0 } },
+};
+
+void NnPredictTest::RunNnPredictTest_all(const NN_CONFIG *const shapes,
+                                         const int num_shapes) {
+  for (int i = 0; i < num_shapes; i++) RunNnPredictTest(&shapes[i]);
+}
+
+void NnPredictTest::RunNnPredictSpeedTest_all(const NN_CONFIG *const shapes,
+                                              const int num_shapes,
+                                              const int run_times) {
+  for (int i = 0; i < num_shapes; i++)
+    NnPredictTest::RunNnPredictSpeedTest(&shapes[i], run_times);
+}
+
+TEST_P(NnPredictTest, RandomValues) {
+  RunNnPredictTest_all(shapes, sizeof(shapes) / sizeof(*shapes));
+}
+
+TEST_P(NnPredictTest, DISABLED_Speed) {
+  RunNnPredictSpeedTest_all(shapes, sizeof(shapes) / sizeof(*shapes), 10000000);
+}
+
+#if HAVE_SSE3
+INSTANTIATE_TEST_SUITE_P(SSE3, NnPredictTest,
+                         ::testing::Values(av1_nn_predict_sse3));
+#endif
+
+}  // namespace
diff --git a/libs/libaom/src/test/av1_quantize_test.cc b/libs/libaom/src/test/av1_quantize_test.cc
new file mode 100644
index 000000000..39a3c33d8
--- /dev/null
+++ b/libs/libaom/src/test/av1_quantize_test.cc
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdlib.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "av1/common/scan.h"
+
+namespace {
+
+typedef void (*QuantizeFpFunc)(
+    const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, int log_scale);
+
+struct QuantizeFuncParams {
+  QuantizeFuncParams(QuantizeFpFunc qF = NULL, QuantizeFpFunc qRefF = NULL,
+                     int count = 16)
+      : qFunc(qF), qFuncRef(qRefF), coeffCount(count) {}
+  QuantizeFpFunc qFunc;
+  QuantizeFpFunc qFuncRef;
+  int coeffCount;
+};
+
+using libaom_test::ACMRandom;
+
+const int numTests = 1000;
+const int maxSize = 1024;
+const int roundFactorRange = 127;
+const int dequantRange = 32768;
+const int coeffRange = (1 << 20) - 1;
+
+class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
+ public:
+  void RunQuantizeTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, round_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[8]);
+    DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]);
+    uint16_t eob;
+    uint16_t ref_eob;
+    int err_count_total = 0;
+    int first_failure = -1;
+    int count = params_.coeffCount;
+    const TX_SIZE txSize = getTxSize(count);
+    int log_scale = (txSize == TX_32X32);
+    QuantizeFpFunc quanFunc = params_.qFunc;
+    QuantizeFpFunc quanFuncRef = params_.qFuncRef;
+
+    const SCAN_ORDER scanOrder = av1_default_scan_orders[txSize];
+    for (int i = 0; i < numTests; i++) {
+      int err_count = 0;
+      ref_eob = eob = UINT16_MAX;
+      for (int j = 0; j < count; j++) {
+        coeff_ptr[j] = rnd(coeffRange);
+      }
+
+      for (int j = 0; j < 2; j++) {
+        zbin_ptr[j] = rnd.Rand16();
+        quant_shift_ptr[j] = rnd.Rand16();
+        // int16_t positive
+        dequant_ptr[j] = abs(rnd(dequantRange));
+        quant_ptr[j] = static_cast<int16_t>((1 << 16) / dequant_ptr[j]);
+        round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
+      }
+      for (int j = 2; j < 8; ++j) {
+        zbin_ptr[j] = zbin_ptr[1];
+        quant_shift_ptr[j] = quant_shift_ptr[1];
+        dequant_ptr[j] = dequant_ptr[1];
+        quant_ptr[j] = quant_ptr[1];
+        round_ptr[j] = round_ptr[1];
+      }
+      quanFuncRef(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+                  quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
+                  &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
+
+      ASM_REGISTER_STATE_CHECK(
+          quanFunc(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+                   quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, &eob,
+                   scanOrder.scan, scanOrder.iscan, log_scale));
+
+      for (int j = 0; j < count; ++j) {
+        err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
+                     (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+        ASSERT_EQ(ref_qcoeff_ptr[j], qcoeff_ptr[j])
+            << "qcoeff error: i = " << i << " j = " << j << "\n";
+        EXPECT_EQ(ref_dqcoeff_ptr[j], dqcoeff_ptr[j])
+            << "dqcoeff error: i = " << i << " j = " << j << "\n";
+      }
+      EXPECT_EQ(ref_eob, eob) << "eob error: "
+                              << "i = " << i << "\n";
+      err_count += (ref_eob != eob);
+      if (err_count && !err_count_total) {
+        first_failure = i;
+      }
+      err_count_total += err_count;
+    }
+    EXPECT_EQ(0, err_count_total)
+        << "Error: Quantization Test, C output doesn't match SIMD output. "
+        << "First failed at test case " << first_failure;
+  }
+
+  void RunEobTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, round_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[8]);
+    DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]);
+    uint16_t eob;
+    uint16_t ref_eob;
+    int count = params_.coeffCount;
+    const TX_SIZE txSize = getTxSize(count);
+    int log_scale = (txSize == TX_32X32);
+    QuantizeFpFunc quanFunc = params_.qFunc;
+    QuantizeFpFunc quanFuncRef = params_.qFuncRef;
+    const SCAN_ORDER scanOrder = av1_default_scan_orders[txSize];
+
+    for (int i = 0; i < numTests; i++) {
+      ref_eob = eob = UINT16_MAX;
+      for (int j = 0; j < count; j++) {
+        coeff_ptr[j] = 0;
+      }
+
+      coeff_ptr[rnd(count)] = rnd(coeffRange);
+      coeff_ptr[rnd(count)] = rnd(coeffRange);
+      coeff_ptr[rnd(count)] = rnd(coeffRange);
+
+      for (int j = 0; j < 2; j++) {
+        zbin_ptr[j] = rnd.Rand16();
+        quant_shift_ptr[j] = rnd.Rand16();
+        // int16_t positive
+        dequant_ptr[j] = abs(rnd(dequantRange));
+        quant_ptr[j] = (1 << 16) / dequant_ptr[j];
+        round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
+      }
+      for (int j = 2; j < 8; ++j) {
+        zbin_ptr[j] = zbin_ptr[1];
+        quant_shift_ptr[j] = quant_shift_ptr[1];
+        dequant_ptr[j] = dequant_ptr[1];
+        quant_ptr[j] = quant_ptr[1];
+        round_ptr[j] = round_ptr[1];
+      }
+
+      quanFuncRef(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+                  quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
+                  &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
+
+      ASM_REGISTER_STATE_CHECK(
+          quanFunc(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+                   quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, &eob,
+                   scanOrder.scan, scanOrder.iscan, log_scale));
+      EXPECT_EQ(ref_eob, eob) << "eob error: "
+                              << "i = " << i << "\n";
+    }
+  }
+
+  virtual void SetUp() { params_ = GetParam(); }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+  virtual ~AV1QuantizeTest() {}
+
+ private:
+  TX_SIZE getTxSize(int count) {
+    switch (count) {
+      case 16: return TX_4X4;
+      case 64: return TX_8X8;
+      case 256: return TX_16X16;
+      case 1024: return TX_32X32;
+      default: return TX_4X4;
+    }
+  }
+
+  QuantizeFuncParams params_;
+};
+
+TEST_P(AV1QuantizeTest, BitExactCheck) { RunQuantizeTest(); }
+TEST_P(AV1QuantizeTest, EobVerify) { RunEobTest(); }
+
+#if HAVE_SSE4_1
+const QuantizeFuncParams qfps[4] = {
+  QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+                     16),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+                     64),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+                     256),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+                     1024),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1QuantizeTest, ::testing::ValuesIn(qfps));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+const QuantizeFuncParams qfps_avx2[4] = {
+  QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+                     16),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+                     64),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+                     256),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+                     1024),
};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1QuantizeTest, ::testing::ValuesIn(qfps_avx2));
+#endif  // HAVE_AVX2
+
+}  // namespace
diff --git a/libs/libaom/src/test/av1_round_shift_array_test.cc b/libs/libaom/src/test/av1_round_shift_array_test.cc
new file mode 100644
index 000000000..993fa9f19
--- /dev/null
+++ b/libs/libaom/src/test/av1_round_shift_array_test.cc
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1CompRoundShift {
+
+typedef void (*comp_round_shift_array_func)(int32_t *arr, int size, int bit);
+
+#if HAVE_SSE4_1 || HAVE_NEON
+const int kValidBitCheck[] = {
+  -4, -3, -2, -1, 0, 1, 2, 3, 4,
+};
+#endif  // HAVE_SSE4_1 || HAVE_NEON
+
+typedef std::tuple<comp_round_shift_array_func, BLOCK_SIZE, int>
+    CompRoundShiftParam;
+
+class AV1CompRoundShiftTest
+    : public ::testing::TestWithParam<CompRoundShiftParam> {
+ public:
+  ~AV1CompRoundShiftTest();
+
+  void SetUp() { rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); }
+  void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RunCheckOutput(comp_round_shift_array_func test_impl, BLOCK_SIZE bsize,
+                      int bit);
+  void RunSpeedTest(comp_round_shift_array_func test_impl, BLOCK_SIZE bsize,
+                    int bit);
+
+  libaom_test::ACMRandom rnd_;
+};
+
+AV1CompRoundShiftTest::~AV1CompRoundShiftTest() { ; }
+
+void AV1CompRoundShiftTest::RunCheckOutput(
+    comp_round_shift_array_func test_impl, BLOCK_SIZE bsize, int bit) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int blk_wd = 64;
+  DECLARE_ALIGNED(32, int32_t, pred_[blk_wd]);
+  DECLARE_ALIGNED(32, int32_t, ref_buffer_[blk_wd]);
+  for (int i = 0; i < (blk_wd); ++i) {
+    ref_buffer_[i] = pred_[i] = rnd_.Rand31() / 16;
+  }
+  av1_round_shift_array_c(ref_buffer_, w, bit);
+  test_impl(pred_, w, bit);
+  for (int x = 0; x < w; ++x) {
+    ASSERT_EQ(ref_buffer_[x], pred_[x]) << w << "x" << h << "mismatch @"
+                                        << "(" << x << ")";
+  }
+}
+
+void AV1CompRoundShiftTest::RunSpeedTest(comp_round_shift_array_func test_impl,
+                                         BLOCK_SIZE bsize, int bit) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int blk_wd = 64;
+  DECLARE_ALIGNED(32, int32_t, ref_buffer_[blk_wd]);
+  for (int i = 0; i < (blk_wd); ++i) {
+    ref_buffer_[i] = rnd_.Rand31();
+  }
+
+  const int num_loops = 1000000000 / (w + h);
+  comp_round_shift_array_func funcs[2] = { av1_round_shift_array_c,
+                                           test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    comp_round_shift_array_func func = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(ref_buffer_, w, bit);
+    }
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  printf("av1_round_shift_array %3dx%-3d: bit : %d %7.2f/%7.2fns", w, h, bit,
+         elapsed_time[0], elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1CompRoundShiftTest, CheckOutput) {
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+TEST_P(AV1CompRoundShiftTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, AV1CompRoundShiftTest,
+    ::testing::Combine(::testing::Values(&av1_round_shift_array_sse4_1),
+                       ::testing::ValuesIn(txsize_to_bsize),
+                       ::testing::ValuesIn(kValidBitCheck)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AV1CompRoundShiftTest,
+    ::testing::Combine(::testing::Values(&av1_round_shift_array_neon),
+                       ::testing::ValuesIn(txsize_to_bsize),
+                       ::testing::ValuesIn(kValidBitCheck)));
+#endif
+
+};  // namespace AV1CompRoundShift
diff --git a/libs/libaom/src/test/av1_txfm_test.cc b/libs/libaom/src/test/av1_txfm_test.cc
new file mode 100644
index 000000000..aedd45d13
--- /dev/null
+++ b/libs/libaom/src/test/av1_txfm_test.cc
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include +#include "test/av1_txfm_test.h" + +namespace libaom_test { + +int get_txfm1d_size(TX_SIZE tx_size) { return tx_size_wide[tx_size]; } + +void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM *type0, TYPE_TXFM *type1) { + switch (txfm2d_type) { + case DCT_DCT: + *type0 = TYPE_DCT; + *type1 = TYPE_DCT; + break; + case ADST_DCT: + *type0 = TYPE_ADST; + *type1 = TYPE_DCT; + break; + case DCT_ADST: + *type0 = TYPE_DCT; + *type1 = TYPE_ADST; + break; + case ADST_ADST: + *type0 = TYPE_ADST; + *type1 = TYPE_ADST; + break; + case FLIPADST_DCT: + *type0 = TYPE_ADST; + *type1 = TYPE_DCT; + break; + case DCT_FLIPADST: + *type0 = TYPE_DCT; + *type1 = TYPE_ADST; + break; + case FLIPADST_FLIPADST: + *type0 = TYPE_ADST; + *type1 = TYPE_ADST; + break; + case ADST_FLIPADST: + *type0 = TYPE_ADST; + *type1 = TYPE_ADST; + break; + case FLIPADST_ADST: + *type0 = TYPE_ADST; + *type1 = TYPE_ADST; + break; + case IDTX: + *type0 = TYPE_IDTX; + *type1 = TYPE_IDTX; + break; + case H_DCT: + *type0 = TYPE_IDTX; + *type1 = TYPE_DCT; + break; + case V_DCT: + *type0 = TYPE_DCT; + *type1 = TYPE_IDTX; + break; + case H_ADST: + *type0 = TYPE_IDTX; + *type1 = TYPE_ADST; + break; + case V_ADST: + *type0 = TYPE_ADST; + *type1 = TYPE_IDTX; + break; + case H_FLIPADST: + *type0 = TYPE_IDTX; + *type1 = TYPE_ADST; + break; + case V_FLIPADST: + *type0 = TYPE_ADST; + *type1 = TYPE_IDTX; + break; + default: + *type0 = TYPE_DCT; + *type1 = TYPE_DCT; + assert(0); + break; + } +} + +double Sqrt2 = pow(2, 0.5); +double invSqrt2 = 1 / pow(2, 0.5); + +double dct_matrix(double n, double k, int size) { + return cos(PI * (2 * n + 1) * k / (2 * size)); +} + +void reference_dct_1d(const double *in, double *out, int size) { + for (int k = 0; k < size; ++k) { + out[k] = 0; + for (int n = 0; n < size; ++n) { + out[k] += in[n] * dct_matrix(n, k, size); + } + if (k == 0) out[k] = out[k] * invSqrt2; + } +} + +void reference_idct_1d(const double *in, double *out, int size) { + for (int k = 0; k < size; ++k) { + out[k] = 0; + for (int n = 0; n < size; ++n) { + if (n == 0) + out[k] += invSqrt2 * in[n] * dct_matrix(k, n, size); + else + out[k] += in[n] * dct_matrix(k, n, size); + } + } +} + +// TODO(any): Copied from the old 'fadst4' (same as the new 'av1_fadst4' +// function). Should be replaced by a proper reference function that takes +// 'double' input & output. +static void fadst4_new(const tran_low_t *input, tran_low_t *output) { + tran_high_t x0, x1, x2, x3; + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_4_9 * x0; + s2 = sinpi_2_9 * x1; + s3 = sinpi_1_9 * x1; + s4 = sinpi_3_9 * x2; + s5 = sinpi_4_9 * x3; + s6 = sinpi_2_9 * x3; + s7 = x0 + x1 - x3; + + x0 = s0 + s2 + s5; + x1 = sinpi_3_9 * s7; + x2 = s1 - s3 + s6; + x3 = s4; + + s0 = x0 + x3; + s1 = x1; + s2 = x2 - x3; + s3 = x2 - x0 + x3; + + // 1-D transform scaling factor is sqrt(2). + output[0] = (tran_low_t)fdct_round_shift(s0); + output[1] = (tran_low_t)fdct_round_shift(s1); + output[2] = (tran_low_t)fdct_round_shift(s2); + output[3] = (tran_low_t)fdct_round_shift(s3); +} + +void reference_adst_1d(const double *in, double *out, int size) { + if (size == 4) { // Special case. 
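+    // The AV1 4-point ADST is built from the sinpi(k/9) butterfly rather
+    // than the plain sine matrix computed by the general loop below, so this
+    // path rounds the inputs to integers and reuses fadst4_new() from above.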
+ tran_low_t int_input[4]; + for (int i = 0; i < 4; ++i) { + int_input[i] = static_cast(round(in[i])); + } + tran_low_t int_output[4]; + fadst4_new(int_input, int_output); + for (int i = 0; i < 4; ++i) { + out[i] = int_output[i]; + } + return; + } + + for (int k = 0; k < size; ++k) { + out[k] = 0; + for (int n = 0; n < size; ++n) { + out[k] += in[n] * sin(PI * (2 * n + 1) * (2 * k + 1) / (4 * size)); + } + } +} + +void reference_idtx_1d(const double *in, double *out, int size) { + double scale = 0; + if (size == 4) + scale = Sqrt2; + else if (size == 8) + scale = 2; + else if (size == 16) + scale = 2 * Sqrt2; + else if (size == 32) + scale = 4; + else if (size == 64) + scale = 4 * Sqrt2; + for (int k = 0; k < size; ++k) { + out[k] = in[k] * scale; + } +} + +void reference_hybrid_1d(double *in, double *out, int size, int type) { + if (type == TYPE_DCT) + reference_dct_1d(in, out, size); + else if (type == TYPE_ADST) + reference_adst_1d(in, out, size); + else + reference_idtx_1d(in, out, size); +} + +double get_amplification_factor(TX_TYPE tx_type, TX_SIZE tx_size) { + TXFM_2D_FLIP_CFG fwd_txfm_flip_cfg; + av1_get_fwd_txfm_cfg(tx_type, tx_size, &fwd_txfm_flip_cfg); + const int tx_width = tx_size_wide[fwd_txfm_flip_cfg.tx_size]; + const int tx_height = tx_size_high[fwd_txfm_flip_cfg.tx_size]; + const int8_t *shift = fwd_txfm_flip_cfg.shift; + const int amplify_bit = shift[0] + shift[1] + shift[2]; + double amplify_factor = + amplify_bit >= 0 ? (1 << amplify_bit) : (1.0 / (1 << -amplify_bit)); + + // For rectangular transforms, we need to multiply by an extra factor. + const int rect_type = get_rect_tx_log_ratio(tx_width, tx_height); + if (abs(rect_type) == 1) { + amplify_factor *= pow(2, 0.5); + } + return amplify_factor; +} + +void reference_hybrid_2d(double *in, double *out, TX_TYPE tx_type, + TX_SIZE tx_size) { + // Get transform type and size of each dimension. + TYPE_TXFM type0; + TYPE_TXFM type1; + get_txfm1d_type(tx_type, &type0, &type1); + const int tx_width = tx_size_wide[tx_size]; + const int tx_height = tx_size_high[tx_size]; + + double *const temp_in = new double[AOMMAX(tx_width, tx_height)]; + double *const temp_out = new double[AOMMAX(tx_width, tx_height)]; + double *const out_interm = new double[tx_width * tx_height]; + const int stride = tx_width; + + // Transform columns. + for (int c = 0; c < tx_width; ++c) { + for (int r = 0; r < tx_height; ++r) { + temp_in[r] = in[r * stride + c]; + } + reference_hybrid_1d(temp_in, temp_out, tx_height, type0); + for (int r = 0; r < tx_height; ++r) { + out_interm[r * stride + c] = temp_out[r]; + } + } + + // Transform rows. + for (int r = 0; r < tx_height; ++r) { + reference_hybrid_1d(out_interm + r * stride, out + r * stride, tx_width, + type1); + } + + delete[] temp_in; + delete[] temp_out; + delete[] out_interm; + + // These transforms use an approximate 2D DCT transform, by only keeping the + // top-left quarter of the coefficients, and repacking them in the first + // quarter indices. + // TODO(urvang): Refactor this code. + if (tx_width == 64 && tx_height == 64) { // tx_size == TX_64X64 + // Zero out top-right 32x32 area. + for (int row = 0; row < 32; ++row) { + memset(out + row * 64 + 32, 0, 32 * sizeof(*out)); + } + // Zero out the bottom 64x32 area. + memset(out + 32 * 64, 0, 32 * 64 * sizeof(*out)); + // Re-pack non-zero coeffs in the first 32x32 indices. 
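+    // (Row 0 is already in place: out + 0 * 32 aliases out + 0 * 64, which
+    // is why the copy loop below starts at row 1.)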
+    for (int row = 1; row < 32; ++row) {
+      memcpy(out + row * 32, out + row * 64, 32 * sizeof(*out));
+    }
+  } else if (tx_width == 32 && tx_height == 64) {  // tx_size == TX_32X64
+    // Zero out the bottom 32x32 area.
+    memset(out + 32 * 32, 0, 32 * 32 * sizeof(*out));
+    // Note: no repacking needed here.
+  } else if (tx_width == 64 && tx_height == 32) {  // tx_size == TX_64X32
+    // Zero out right 32x32 area.
+    for (int row = 0; row < 32; ++row) {
+      memset(out + row * 64 + 32, 0, 32 * sizeof(*out));
+    }
+    // Re-pack non-zero coeffs in the first 32x32 indices.
+    for (int row = 1; row < 32; ++row) {
+      memcpy(out + row * 32, out + row * 64, 32 * sizeof(*out));
+    }
+  } else if (tx_width == 16 && tx_height == 64) {  // tx_size == TX_16X64
+    // Zero out the bottom 16x32 area.
+    memset(out + 16 * 32, 0, 16 * 32 * sizeof(*out));
+    // Note: no repacking needed here.
+  } else if (tx_width == 64 && tx_height == 16) {  // tx_size == TX_64X16
+    // Zero out right 32x16 area.
+    for (int row = 0; row < 16; ++row) {
+      memset(out + row * 64 + 32, 0, 32 * sizeof(*out));
+    }
+    // Re-pack non-zero coeffs in the first 32x16 indices.
+    for (int row = 1; row < 16; ++row) {
+      memcpy(out + row * 32, out + row * 64, 32 * sizeof(*out));
+    }
+  }
+
+  // Apply appropriate scale.
+  const double amplify_factor = get_amplification_factor(tx_type, tx_size);
+  for (int c = 0; c < tx_width; ++c) {
+    for (int r = 0; r < tx_height; ++r) {
+      out[r * stride + c] *= amplify_factor;
+    }
+  }
+}
+
+template <typename Type>
+void fliplr(Type *dest, int width, int height, int stride) {
+  for (int r = 0; r < height; ++r) {
+    for (int c = 0; c < width / 2; ++c) {
+      const Type tmp = dest[r * stride + c];
+      dest[r * stride + c] = dest[r * stride + width - 1 - c];
+      dest[r * stride + width - 1 - c] = tmp;
+    }
+  }
+}
+
+template <typename Type>
+void flipud(Type *dest, int width, int height, int stride) {
+  for (int c = 0; c < width; ++c) {
+    for (int r = 0; r < height / 2; ++r) {
+      const Type tmp = dest[r * stride + c];
+      dest[r * stride + c] = dest[(height - 1 - r) * stride + c];
+      dest[(height - 1 - r) * stride + c] = tmp;
+    }
+  }
+}
+
+template <typename Type>
+void fliplrud(Type *dest, int width, int height, int stride) {
+  for (int r = 0; r < height / 2; ++r) {
+    for (int c = 0; c < width; ++c) {
+      const Type tmp = dest[r * stride + c];
+      dest[r * stride + c] = dest[(height - 1 - r) * stride + width - 1 - c];
+      dest[(height - 1 - r) * stride + width - 1 - c] = tmp;
+    }
+  }
+}
+
+template void fliplr<double>(double *dest, int width, int height, int stride);
+template void flipud<double>(double *dest, int width, int height, int stride);
+template void fliplrud<double>(double *dest, int width, int height,
+                               int stride);
+
+int bd_arr[BD_NUM] = { 8, 10, 12 };
+
+int8_t low_range_arr[BD_NUM] = { 18, 32, 32 };
+int8_t high_range_arr[BD_NUM] = { 32, 32, 32 };
+
+void txfm_stage_range_check(const int8_t *stage_range, int stage_num,
+                            int8_t cos_bit, int low_range, int high_range) {
+  for (int i = 0; i < stage_num; ++i) {
+    EXPECT_LE(stage_range[i], low_range);
+    ASSERT_LE(stage_range[i] + cos_bit, high_range) << "stage = " << i;
+  }
+  for (int i = 0; i < stage_num - 1; ++i) {
+    // make sure there is no overflow while doing half_btf()
+    ASSERT_LE(stage_range[i + 1] + cos_bit, high_range) << "stage = " << i;
+  }
+}
+}  // namespace libaom_test
diff --git a/libs/libaom/src/test/av1_txfm_test.h b/libs/libaom/src/test/av1_txfm_test.h
new file mode 100644
index 000000000..5a56d28f1
--- /dev/null
+++ b/libs/libaom/src/test/av1_txfm_test.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_AV1_TXFM_TEST_H_
+#define AOM_TEST_AV1_TXFM_TEST_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef _MSC_VER
+#define _USE_MATH_DEFINES
+#endif
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+namespace libaom_test {
+enum {
+  TYPE_DCT = 0,
+  TYPE_ADST,
+  TYPE_IDTX,
+  TYPE_IDCT,
+  TYPE_IADST,
+  TYPE_LAST
+} UENUM1BYTE(TYPE_TXFM);
+
+int get_txfm1d_size(TX_SIZE tx_size);
+
+void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM *type0, TYPE_TXFM *type1);
+
+void reference_dct_1d(const double *in, double *out, int size);
+void reference_idct_1d(const double *in, double *out, int size);
+
+void reference_adst_1d(const double *in, double *out, int size);
+
+void reference_hybrid_1d(double *in, double *out, int size, int type);
+
+double get_amplification_factor(TX_TYPE tx_type, TX_SIZE tx_size);
+
+void reference_hybrid_2d(double *in, double *out, TX_TYPE tx_type,
+                         TX_SIZE tx_size);
+template <typename Type1, typename Type2>
+static double compute_avg_abs_error(const Type1 *a, const Type2 *b,
+                                    const int size) {
+  double error = 0;
+  for (int i = 0; i < size; i++) {
+    error += fabs(static_cast<double>(a[i]) - static_cast<double>(b[i]));
+  }
+  error = error / size;
+  return error;
+}
+
+template <typename Type>
+void fliplr(Type *dest, int width, int height, int stride);
+
+template <typename Type>
+void flipud(Type *dest, int width, int height, int stride);
+
+template <typename Type>
+void fliplrud(Type *dest, int width, int height, int stride);
+
+typedef void (*TxfmFunc)(const int32_t *in, int32_t *out, const int8_t cos_bit,
+                         const int8_t *range_bit);
+
+typedef void (*InvTxfm2dFunc)(const int32_t *, uint16_t *, int, TX_TYPE, int);
+typedef void (*LbdInvTxfm2dFunc)(const int32_t *, uint8_t *, int, TX_TYPE,
+                                 TX_SIZE, int);
+
+static const int bd = 10;
+static const int input_base = (1 << bd);
+
+static INLINE bool IsTxSizeTypeValid(TX_SIZE tx_size, TX_TYPE tx_type) {
+  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
+  TxSetType tx_set_type;
+  if (tx_size_sqr_up > TX_32X32) {
+    tx_set_type = EXT_TX_SET_DCTONLY;
+  } else if (tx_size_sqr_up == TX_32X32) {
+    tx_set_type = EXT_TX_SET_DCT_IDTX;
+  } else {
+    tx_set_type = EXT_TX_SET_ALL16;
+  }
+  return av1_ext_tx_used[tx_set_type][tx_type] != 0;
+}
+
+#if CONFIG_AV1_ENCODER
+
+static const FwdTxfm2dFunc fwd_txfm_func_ls[TX_SIZES_ALL] = {
+  av1_fwd_txfm2d_4x4_c,   av1_fwd_txfm2d_8x8_c,   av1_fwd_txfm2d_16x16_c,
+  av1_fwd_txfm2d_32x32_c, av1_fwd_txfm2d_64x64_c, av1_fwd_txfm2d_4x8_c,
+  av1_fwd_txfm2d_8x4_c,   av1_fwd_txfm2d_8x16_c,  av1_fwd_txfm2d_16x8_c,
+  av1_fwd_txfm2d_16x32_c, av1_fwd_txfm2d_32x16_c, av1_fwd_txfm2d_32x64_c,
+  av1_fwd_txfm2d_64x32_c, av1_fwd_txfm2d_4x16_c,  av1_fwd_txfm2d_16x4_c,
+  av1_fwd_txfm2d_8x32_c,  av1_fwd_txfm2d_32x8_c,  av1_fwd_txfm2d_16x64_c,
+  av1_fwd_txfm2d_64x16_c,
+};
+#endif
+
+static const InvTxfm2dFunc inv_txfm_func_ls[TX_SIZES_ALL] = {
+  av1_inv_txfm2d_add_4x4_c,   av1_inv_txfm2d_add_8x8_c,
+  av1_inv_txfm2d_add_16x16_c,
av1_inv_txfm2d_add_32x32_c, + av1_inv_txfm2d_add_64x64_c, av1_inv_txfm2d_add_4x8_c, + av1_inv_txfm2d_add_8x4_c, av1_inv_txfm2d_add_8x16_c, + av1_inv_txfm2d_add_16x8_c, av1_inv_txfm2d_add_16x32_c, + av1_inv_txfm2d_add_32x16_c, av1_inv_txfm2d_add_32x64_c, + av1_inv_txfm2d_add_64x32_c, av1_inv_txfm2d_add_4x16_c, + av1_inv_txfm2d_add_16x4_c, av1_inv_txfm2d_add_8x32_c, + av1_inv_txfm2d_add_32x8_c, av1_inv_txfm2d_add_16x64_c, + av1_inv_txfm2d_add_64x16_c, +}; + +#define BD_NUM 3 + +extern int bd_arr[]; +extern int8_t low_range_arr[]; +extern int8_t high_range_arr[]; + +void txfm_stage_range_check(const int8_t *stage_range, int stage_num, + const int8_t cos_bit, int low_range, + int high_range); +} // namespace libaom_test +#endif // AOM_TEST_AV1_TXFM_TEST_H_ diff --git a/libs/libaom/src/test/av1_wedge_utils_test.cc b/libs/libaom/src/test/av1_wedge_utils_test.cc new file mode 100644 index 000000000..f9dc838ff --- /dev/null +++ b/libs/libaom/src/test/av1_wedge_utils_test.cc @@ -0,0 +1,391 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" + +#include "av1/common/enums.h" + +#include "test/acm_random.h" +#include "test/function_equivalence_test.h" +#include "test/register_state_check.h" + +#define WEDGE_WEIGHT_BITS 6 +#define MAX_MASK_VALUE (1 << (WEDGE_WEIGHT_BITS)) + +using libaom_test::ACMRandom; +using libaom_test::FunctionEquivalenceTest; + +namespace { + +static const int16_t kInt13Max = (1 << 12) - 1; + +////////////////////////////////////////////////////////////////////////////// +// av1_wedge_sse_from_residuals - functionality +////////////////////////////////////////////////////////////////////////////// + +class WedgeUtilsSSEFuncTest : public testing::Test { + protected: + WedgeUtilsSSEFuncTest() : rng_(ACMRandom::DeterministicSeed()) {} + + static const int kIterations = 1000; + + ACMRandom rng_; +}; + +static void equiv_blend_residuals(int16_t *r, const int16_t *r0, + const int16_t *r1, const uint8_t *m, int N) { + for (int i = 0; i < N; i++) { + const int32_t m0 = m[i]; + const int32_t m1 = MAX_MASK_VALUE - m0; + const int16_t R = m0 * r0[i] + m1 * r1[i]; + // Note that this rounding is designed to match the result + // you would get when actually blending the 2 predictors and computing + // the residuals. 
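+    // Concretely: with p = ROUND_POWER_OF_TWO(m0 * p0 + m1 * p1,
+    // WEDGE_WEIGHT_BITS) and r0 = s - p0, r1 = s - p1, the blended residual
+    // s - p reduces to ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS),
+    // hence the "- 1" below.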
+ r[i] = ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS); + } +} + +static uint64_t equiv_sse_from_residuals(const int16_t *r0, const int16_t *r1, + const uint8_t *m, int N) { + uint64_t acc = 0; + for (int i = 0; i < N; i++) { + const int32_t m0 = m[i]; + const int32_t m1 = MAX_MASK_VALUE - m0; + const int16_t R = m0 * r0[i] + m1 * r1[i]; + const int32_t r = ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS); + acc += r * r; + } + return acc; +} + +TEST_F(WedgeUtilsSSEFuncTest, ResidualBlendingEquiv) { + DECLARE_ALIGNED(32, uint8_t, s[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, p0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, p1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, p[MAX_SB_SQUARE]); + + DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, r_ref[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, r_tst[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + s[i] = rng_.Rand8(); + m[i] = rng_(MAX_MASK_VALUE + 1); + } + + const int w = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 3) + 3); + const int h = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 3) + 3); + const int N = w * h; + + for (int j = 0; j < N; j++) { + p0[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX); + p1[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX); + } + + aom_blend_a64_mask(p, w, p0, w, p1, w, m, w, w, h, 0, 0); + + aom_subtract_block(h, w, r0, w, s, w, p0, w); + aom_subtract_block(h, w, r1, w, s, w, p1, w); + + aom_subtract_block(h, w, r_ref, w, s, w, p, w); + equiv_blend_residuals(r_tst, r0, r1, m, N); + + for (int i = 0; i < N; ++i) ASSERT_EQ(r_ref[i], r_tst[i]); + + uint64_t ref_sse = aom_sum_squares_i16(r_ref, N); + uint64_t tst_sse = equiv_sse_from_residuals(r0, r1, m, N); + + ASSERT_EQ(ref_sse, tst_sse); + } +} + +static uint64_t sse_from_residuals(const int16_t *r0, const int16_t *r1, + const uint8_t *m, int N) { + uint64_t acc = 0; + for (int i = 0; i < N; i++) { + const int32_t m0 = m[i]; + const int32_t m1 = MAX_MASK_VALUE - m0; + const int32_t r = m0 * r0[i] + m1 * r1[i]; + acc += r * r; + } + return ROUND_POWER_OF_TWO(acc, 2 * WEDGE_WEIGHT_BITS); +} + +TEST_F(WedgeUtilsSSEFuncTest, ResidualBlendingMethod) { + DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + r1[i] = rng_(2 * INT8_MAX - 2 * INT8_MIN + 1) + 2 * INT8_MIN; + d[i] = rng_(2 * INT8_MAX - 2 * INT8_MIN + 1) + 2 * INT8_MIN; + m[i] = rng_(MAX_MASK_VALUE + 1); + } + + const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1); + + for (int i = 0; i < N; i++) r0[i] = r1[i] + d[i]; + + const uint64_t ref_res = sse_from_residuals(r0, r1, m, N); + const uint64_t tst_res = av1_wedge_sse_from_residuals(r1, d, m, N); + + ASSERT_EQ(ref_res, tst_res); + } +} + +////////////////////////////////////////////////////////////////////////////// +// av1_wedge_sse_from_residuals - optimizations +////////////////////////////////////////////////////////////////////////////// + +typedef uint64_t (*FSSE)(const int16_t *r1, const int16_t *d, const uint8_t *m, + int N); +typedef libaom_test::FuncParam TestFuncsFSSE; + +class WedgeUtilsSSEOptTest : public FunctionEquivalenceTest { + protected: + static const int kIterations = 
10000; +}; + +TEST_P(WedgeUtilsSSEOptTest, RandomValues) { + DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + r1[i] = rng_(2 * kInt13Max + 1) - kInt13Max; + d[i] = rng_(2 * kInt13Max + 1) - kInt13Max; + m[i] = rng_(MAX_MASK_VALUE + 1); + } + + const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1); + + const uint64_t ref_res = params_.ref_func(r1, d, m, N); + uint64_t tst_res; + ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N)); + + ASSERT_EQ(ref_res, tst_res); + } +} + +TEST_P(WedgeUtilsSSEOptTest, ExtremeValues) { + DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + if (rng_(2)) { + for (int i = 0; i < MAX_SB_SQUARE; ++i) r1[i] = kInt13Max; + } else { + for (int i = 0; i < MAX_SB_SQUARE; ++i) r1[i] = -kInt13Max; + } + + if (rng_(2)) { + for (int i = 0; i < MAX_SB_SQUARE; ++i) d[i] = kInt13Max; + } else { + for (int i = 0; i < MAX_SB_SQUARE; ++i) d[i] = -kInt13Max; + } + + for (int i = 0; i < MAX_SB_SQUARE; ++i) m[i] = MAX_MASK_VALUE; + + const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1); + + const uint64_t ref_res = params_.ref_func(r1, d, m, N); + uint64_t tst_res; + ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N)); + + ASSERT_EQ(ref_res, tst_res); + } +} + +////////////////////////////////////////////////////////////////////////////// +// av1_wedge_sign_from_residuals +////////////////////////////////////////////////////////////////////////////// + +typedef int8_t (*FSign)(const int16_t *ds, const uint8_t *m, int N, + int64_t limit); +typedef libaom_test::FuncParam TestFuncsFSign; + +class WedgeUtilsSignOptTest : public FunctionEquivalenceTest { + protected: + static const int kIterations = 10000; + static const int kMaxSize = 8196; // Size limited by SIMD implementation. 
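+  // The tests below always draw N as a multiple of 64 in [64, kMaxSize),
+  // the granularity at which the optimized wedge kernels are exercised.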
+}; + +TEST_P(WedgeUtilsSignOptTest, RandomValues) { + DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + r0[i] = rng_(2 * kInt13Max + 1) - kInt13Max; + r1[i] = rng_(2 * kInt13Max + 1) - kInt13Max; + m[i] = rng_(MAX_MASK_VALUE + 1); + } + + const int maxN = AOMMIN(kMaxSize, MAX_SB_SQUARE); + const int N = 64 * (rng_(maxN / 64 - 1) + 1); + + int64_t limit; + limit = (int64_t)aom_sum_squares_i16(r0, N); + limit -= (int64_t)aom_sum_squares_i16(r1, N); + limit *= (1 << WEDGE_WEIGHT_BITS) / 2; + + for (int i = 0; i < N; i++) + ds[i] = clamp(r0[i] * r0[i] - r1[i] * r1[i], INT16_MIN, INT16_MAX); + + const int ref_res = params_.ref_func(ds, m, N, limit); + int tst_res; + ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit)); + + ASSERT_EQ(ref_res, tst_res); + } +} + +TEST_P(WedgeUtilsSignOptTest, ExtremeValues) { + DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + switch (rng_(4)) { + case 0: + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + r0[i] = 0; + r1[i] = kInt13Max; + } + break; + case 1: + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + r0[i] = kInt13Max; + r1[i] = 0; + } + break; + case 2: + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + r0[i] = 0; + r1[i] = -kInt13Max; + } + break; + default: + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + r0[i] = -kInt13Max; + r1[i] = 0; + } + break; + } + + for (int i = 0; i < MAX_SB_SQUARE; ++i) m[i] = MAX_MASK_VALUE; + + const int maxN = AOMMIN(kMaxSize, MAX_SB_SQUARE); + const int N = 64 * (rng_(maxN / 64 - 1) + 1); + + int64_t limit; + limit = (int64_t)aom_sum_squares_i16(r0, N); + limit -= (int64_t)aom_sum_squares_i16(r1, N); + limit *= (1 << WEDGE_WEIGHT_BITS) / 2; + + for (int i = 0; i < N; i++) + ds[i] = clamp(r0[i] * r0[i] - r1[i] * r1[i], INT16_MIN, INT16_MAX); + + const int ref_res = params_.ref_func(ds, m, N, limit); + int tst_res; + ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit)); + + ASSERT_EQ(ref_res, tst_res); + } +} + +////////////////////////////////////////////////////////////////////////////// +// av1_wedge_compute_delta_squares +////////////////////////////////////////////////////////////////////////////// + +typedef void (*FDS)(int16_t *d, const int16_t *a, const int16_t *b, int N); +typedef libaom_test::FuncParam TestFuncsFDS; + +class WedgeUtilsDeltaSquaresOptTest : public FunctionEquivalenceTest { + protected: + static const int kIterations = 10000; +}; + +TEST_P(WedgeUtilsDeltaSquaresOptTest, RandomValues) { + DECLARE_ALIGNED(32, int16_t, a[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, b[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, d_ref[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, d_tst[MAX_SB_SQUARE]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + a[i] = rng_.Rand16(); + b[i] = rng_(2 * INT16_MAX + 1) - INT16_MAX; + } + + const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1); + + memset(&d_ref, INT16_MAX, sizeof(d_ref)); + memset(&d_tst, INT16_MAX, sizeof(d_tst)); + + params_.ref_func(d_ref, a, b, N); + 
ASM_REGISTER_STATE_CHECK(params_.tst_func(d_tst, a, b, N)); + + for (int i = 0; i < MAX_SB_SQUARE; ++i) ASSERT_EQ(d_ref[i], d_tst[i]); + } +} + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, WedgeUtilsSSEOptTest, + ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_c, + av1_wedge_sse_from_residuals_sse2))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, WedgeUtilsSignOptTest, + ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_c, + av1_wedge_sign_from_residuals_sse2))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, WedgeUtilsDeltaSquaresOptTest, + ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_c, + av1_wedge_compute_delta_squares_sse2))); +#endif // HAVE_SSE2 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, WedgeUtilsSSEOptTest, + ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_sse2, + av1_wedge_sse_from_residuals_avx2))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, WedgeUtilsSignOptTest, + ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_sse2, + av1_wedge_sign_from_residuals_avx2))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, WedgeUtilsDeltaSquaresOptTest, + ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_sse2, + av1_wedge_compute_delta_squares_avx2))); +#endif // HAVE_AVX2 + +} // namespace diff --git a/libs/libaom/src/test/avg_test.cc b/libs/libaom/src/test/avg_test.cc new file mode 100644 index 000000000..1742aec5f --- /dev/null +++ b/libs/libaom/src/test/avg_test.cc @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_dsp_rtcd.h" + +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +namespace { + +using libaom_test::ACMRandom; + +template +class AverageTestBase : public ::testing::Test { + public: + AverageTestBase(int width, int height) + : width_(width), height_(height), source_data_(NULL), source_stride_(0), + bit_depth_(8) {} + + virtual void TearDown() { + aom_free(source_data_); + source_data_ = NULL; + libaom_test::ClearSystemState(); + } + + protected: + // Handle blocks up to 4 blocks 64x64 with stride up to 128 + static const int kDataAlignment = 16; + static const int kDataBlockSize = 64 * 128; + + virtual void SetUp() { + source_data_ = static_cast( + aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0]))); + ASSERT_TRUE(source_data_ != NULL); + source_stride_ = (width_ + 31) & ~31; + bit_depth_ = 8; + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + // Sum Pixels + static unsigned int ReferenceAverage8x8(const Pixel *source, int pitch) { + unsigned int average = 0; + for (int h = 0; h < 8; ++h) { + for (int w = 0; w < 8; ++w) average += source[h * pitch + w]; + } + return (average + 32) >> 6; + } + + static unsigned int ReferenceAverage4x4(const Pixel *source, int pitch) { + unsigned int average = 0; + for (int h = 0; h < 4; ++h) { + for (int w = 0; w < 4; ++w) average += source[h * pitch + w]; + } + return (average + 8) >> 4; + } + + void FillConstant(Pixel fill_constant) { + for (int i = 0; i < width_ * height_; ++i) { + source_data_[i] = fill_constant; + } + } + + void FillRandom() { + for (int i = 0; i < width_ * height_; ++i) { + source_data_[i] = rnd_.Rand16() & ((1 << bit_depth_) - 1); + } + } + + int width_, height_; + Pixel *source_data_; + int source_stride_; + int bit_depth_; + + ACMRandom rnd_; +}; +typedef unsigned int (*AverageFunction)(const uint8_t *s, int pitch); + +// Arguments: width, height, pitch, block size, avg function. +typedef std::tuple AvgFunc; + +class AverageTest : public AverageTestBase, + public ::testing::WithParamInterface { + public: + AverageTest() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {} + + protected: + void CheckAverages() { + const int block_size = GET_PARAM(3); + unsigned int expected = 0; + if (block_size == 8) { + expected = + ReferenceAverage8x8(source_data_ + GET_PARAM(2), source_stride_); + } else if (block_size == 4) { + expected = + ReferenceAverage4x4(source_data_ + GET_PARAM(2), source_stride_); + } + + unsigned int actual; + ASM_REGISTER_STATE_CHECK( + actual = GET_PARAM(4)(source_data_ + GET_PARAM(2), source_stride_)); + + EXPECT_EQ(expected, actual); + } +}; + +TEST_P(AverageTest, MinValue) { + FillConstant(0); + CheckAverages(); +} + +TEST_P(AverageTest, MaxValue) { + FillConstant(255); + CheckAverages(); +} + +TEST_P(AverageTest, Random) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + for (int i = 0; i < 1000; i++) { + FillRandom(); + CheckAverages(); + } +} + +typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref, + const int ref_stride, const int height); + +// Params: height, asm function, c function. 
+typedef std::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+
+class IntProRowTest : public AverageTestBase<uint8_t>,
+                      public ::testing::WithParamInterface<IntProRowParam> {
+ public:
+  IntProRowTest()
+      : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(NULL), hbuf_c_(NULL) {
+    asm_func_ = GET_PARAM(1);
+    c_func_ = GET_PARAM(2);
+  }
+
+ protected:
+  virtual void SetUp() {
+    source_data_ = static_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
+    ASSERT_TRUE(source_data_ != NULL);
+
+    hbuf_asm_ = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
+    hbuf_c_ = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
+  }
+
+  virtual void TearDown() {
+    aom_free(source_data_);
+    source_data_ = NULL;
+    aom_free(hbuf_c_);
+    hbuf_c_ = NULL;
+    aom_free(hbuf_asm_);
+    hbuf_asm_ = NULL;
+  }
+
+  void RunComparison() {
+    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
+    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+        << "Output mismatch";
+  }
+
+ private:
+  IntProRowFunc asm_func_;
+  IntProRowFunc c_func_;
+  int16_t *hbuf_asm_;
+  int16_t *hbuf_c_;
+};
+
+typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
+
+// Params: width, asm function, c function.
+typedef std::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
+
+class IntProColTest : public AverageTestBase<uint8_t>,
+                      public ::testing::WithParamInterface<IntProColParam> {
+ public:
+  IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
+    asm_func_ = GET_PARAM(1);
+    c_func_ = GET_PARAM(2);
+  }
+
+ protected:
+  void RunComparison() {
+    ASM_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
+    ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
+    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
+  }
+
+ private:
+  IntProColFunc asm_func_;
+  IntProColFunc c_func_;
+  int16_t sum_asm_;
+  int16_t sum_c_;
+};
+
+TEST_P(IntProRowTest, MinValue) {
+  FillConstant(0);
+  RunComparison();
+}
+
+TEST_P(IntProRowTest, MaxValue) {
+  FillConstant(255);
+  RunComparison();
+}
+
+TEST_P(IntProRowTest, Random) {
+  FillRandom();
+  RunComparison();
+}
+
+TEST_P(IntProColTest, MinValue) {
+  FillConstant(0);
+  RunComparison();
+}
+
+TEST_P(IntProColTest, MaxValue) {
+  FillConstant(255);
+  RunComparison();
+}
+
+TEST_P(IntProColTest, Random) {
+  FillRandom();
+  RunComparison();
+}
+
+using std::make_tuple;
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AverageTest,
+    ::testing::Values(make_tuple(16, 16, 1, 8, &aom_avg_8x8_c),
+                      make_tuple(16, 16, 1, 4, &aom_avg_4x4_c)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AverageTest,
+    ::testing::Values(make_tuple(16, 16, 0, 8, &aom_avg_8x8_sse2),
+                      make_tuple(16, 16, 5, 8, &aom_avg_8x8_sse2),
+                      make_tuple(32, 32, 15, 8, &aom_avg_8x8_sse2),
+                      make_tuple(16, 16, 0, 4, &aom_avg_4x4_sse2),
+                      make_tuple(16, 16, 5, 4, &aom_avg_4x4_sse2),
+                      make_tuple(32, 32, 15, 4, &aom_avg_4x4_sse2)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, IntProRowTest,
+    ::testing::Values(make_tuple(16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+                      make_tuple(32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+                      make_tuple(64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+                      make_tuple(128, &aom_int_pro_row_sse2,
+                                 &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, IntProColTest,
+    ::testing::Values(make_tuple(16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+                      make_tuple(32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+                      make_tuple(64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+                      make_tuple(128, &aom_int_pro_col_sse2,
+
&aom_int_pro_col_c))); +#endif + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, AverageTest, + ::testing::Values(make_tuple(16, 16, 0, 8, &aom_avg_8x8_neon), + make_tuple(16, 16, 5, 8, &aom_avg_8x8_neon), + make_tuple(32, 32, 15, 8, &aom_avg_8x8_neon), + make_tuple(16, 16, 0, 4, &aom_avg_4x4_neon), + make_tuple(16, 16, 5, 4, &aom_avg_4x4_neon), + make_tuple(32, 32, 15, 4, &aom_avg_4x4_neon))); +#endif + +} // namespace diff --git a/libs/libaom/src/test/best_encode.sh b/libs/libaom/src/test/best_encode.sh new file mode 100644 index 000000000..fe31a01cb --- /dev/null +++ b/libs/libaom/src/test/best_encode.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and +# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +# was not distributed with this source code in the LICENSE file, you can +# obtain it at www.aomedia.org/license/software. If the Alliance for Open +# Media Patent License 1.0 was not distributed with this source code in the +# PATENTS file, you can obtain it at www.aomedia.org/license/patent. +# +# Author: jimbankoski@google.com (Jim Bankoski) + +if [[ $# -ne 2 ]]; then + echo "Encodes a file using best known settings (slow!)" + echo " Usage: be [FILE] [BITRATE]" + echo " Example: be akiyo_cif.y4m 200" + exit +fi + +f=$1 # file is first parameter +b=$2 # bitrate is second parameter + +if [[ -e $f.fpf ]]; then + # First-pass file found, do second pass only + aomenc \ + $f \ + -o $f-$b.av1.webm \ + -p 2 \ + --pass=2 \ + --fpf=$f.fpf \ + --best \ + --cpu-used=0 \ + --target-bitrate=$b \ + --auto-alt-ref=1 \ + -v \ + --minsection-pct=0 \ + --maxsection-pct=800 \ + --lag-in-frames=25 \ + --kf-min-dist=0 \ + --kf-max-dist=99999 \ + --static-thresh=0 \ + --min-q=0 \ + --max-q=63 \ + --drop-frame=0 \ + --bias-pct=50 \ + --minsection-pct=0 \ + --maxsection-pct=800 \ + --psnr \ + --arnr-maxframes=7 \ + --arnr-strength=3 \ + --arnr-type=3 +else + # No first-pass file found, do 2-pass encode + aomenc \ + $f \ + -o $f-$b.av1.webm \ + -p 2 \ + --pass=1 \ + --fpf=$f.fpf \ + --best \ + --cpu-used=0 \ + --target-bitrate=$b \ + --auto-alt-ref=1 \ + -v \ + --minsection-pct=0 \ + --maxsection-pct=800 \ + --lag-in-frames=25 \ + --kf-min-dist=0 \ + --kf-max-dist=99999 \ + --static-thresh=0 \ + --min-q=0 \ + --max-q=63 \ + --drop-frame=0 + + aomenc \ + $f \ + -o $f-$b.av1.webm \ + -p 2 \ + --pass=2 \ + --fpf=$f.fpf \ + --best \ + --cpu-used=0 \ + --target-bitrate=$b \ + --auto-alt-ref=1 \ + -v \ + --minsection-pct=0 \ + --maxsection-pct=800 \ + --lag-in-frames=25 \ + --kf-min-dist=0 \ + --kf-max-dist=99999 \ + --static-thresh=0 \ + --min-q=0 \ + --max-q=63 \ + --drop-frame=0 \ + --bias-pct=50 \ + --minsection-pct=0 \ + --maxsection-pct=800 \ + --psnr \ + --arnr-maxframes=7 \ + --arnr-strength=3 \ + --arnr-type=3 +fi diff --git a/libs/libaom/src/test/binary_codes_test.cc b/libs/libaom/src/test/binary_codes_test.cc new file mode 100644 index 000000000..45660cf85 --- /dev/null +++ b/libs/libaom/src/test/binary_codes_test.cc @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" + +#include "test/acm_random.h" +#include "aom/aom_integer.h" +#include "aom_dsp/bitreader.h" +#include "aom_dsp/bitwriter.h" +#include "aom_dsp/binary_codes_reader.h" +#include "aom_dsp/binary_codes_writer.h" + +#define ACCT_STR __func__ + +using libaom_test::ACMRandom; + +namespace { + +// Test for Finite subexponential code with reference +TEST(AV1, TestPrimitiveRefsubexpfin) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int kBufferSize = 65536; + aom_writer bw; + uint8_t bw_buffer[kBufferSize]; + const uint16_t kRanges = 8; + const uint16_t kSubexpParams = 6; + const uint16_t kReferences = 8; + const uint16_t kValues = 16; + uint16_t enc_values[kRanges][kSubexpParams][kReferences][kValues][4]; + const uint16_t range_vals[kRanges] = { 1, 13, 64, 120, 230, 420, 1100, 8000 }; + aom_start_encode(&bw, bw_buffer); + for (int n = 0; n < kRanges; ++n) { + const uint16_t range = range_vals[n]; + for (int k = 0; k < kSubexpParams; ++k) { + for (int r = 0; r < kReferences; ++r) { + const uint16_t ref = rnd(range); + for (int v = 0; v < kValues; ++v) { + const uint16_t value = rnd(range); + enc_values[n][k][r][v][0] = range; + enc_values[n][k][r][v][1] = k; + enc_values[n][k][r][v][2] = ref; + enc_values[n][k][r][v][3] = value; + aom_write_primitive_refsubexpfin(&bw, range, k, ref, value); + } + } + } + } + aom_stop_encode(&bw); + aom_reader br; + aom_reader_init(&br, bw_buffer, bw.pos); + GTEST_ASSERT_GE(aom_reader_tell(&br), 0u); + GTEST_ASSERT_LE(aom_reader_tell(&br), 1u); + for (int n = 0; n < kRanges; ++n) { + for (int k = 0; k < kSubexpParams; ++k) { + for (int r = 0; r < kReferences; ++r) { + for (int v = 0; v < kValues; ++v) { + const uint16_t range = enc_values[n][k][r][v][0]; + assert(k == enc_values[n][k][r][v][1]); + const uint16_t ref = enc_values[n][k][r][v][2]; + const uint16_t value = + aom_read_primitive_refsubexpfin(&br, range, k, ref, ACCT_STR); + GTEST_ASSERT_EQ(value, enc_values[n][k][r][v][3]); + } + } + } + } +} +// TODO(debargha): Adds tests for other primitives +} // namespace diff --git a/libs/libaom/src/test/blend_a64_mask_1d_test.cc b/libs/libaom/src/test/blend_a64_mask_1d_test.cc new file mode 100644 index 000000000..1b6350c79 --- /dev/null +++ b/libs/libaom/src/test/blend_a64_mask_1d_test.cc @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/register_state_check.h" +#include "test/function_equivalence_test.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" + +#include "av1/common/enums.h" + +#include "aom_dsp/blend.h" + +using libaom_test::FunctionEquivalenceTest; + +namespace { + +template +class BlendA64Mask1DTest : public FunctionEquivalenceTest { + public: + static const int kIterations = 10000; + static const int kMaxWidth = MAX_SB_SIZE * 5; // * 5 to cover longer strides + static const int kMaxHeight = MAX_SB_SIZE; + static const int kBufSize = kMaxWidth * kMaxHeight; + static const int kMaxMaskWidth = 2 * MAX_SB_SIZE; + static const int kMaxMaskSize = kMaxMaskWidth; + + virtual ~BlendA64Mask1DTest() {} + + virtual void Execute(const T *p_src0, const T *p_src1) = 0; + + void Common() { + w_ = 2 << this->rng_(MAX_SB_SIZE_LOG2); + h_ = 2 << this->rng_(MAX_SB_SIZE_LOG2); + + dst_offset_ = this->rng_(33); + dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_; + + src0_offset_ = this->rng_(33); + src0_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_; + + src1_offset_ = this->rng_(33); + src1_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_; + + T *p_src0; + T *p_src1; + + switch (this->rng_(3)) { + case 0: // Separate sources + p_src0 = src0_; + p_src1 = src1_; + break; + case 1: // src0 == dst + p_src0 = dst_tst_; + src0_stride_ = dst_stride_; + src0_offset_ = dst_offset_; + p_src1 = src1_; + break; + case 2: // src1 == dst + p_src0 = src0_; + p_src1 = dst_tst_; + src1_stride_ = dst_stride_; + src1_offset_ = dst_offset_; + break; + default: FAIL(); + } + + Execute(p_src0, p_src1); + + for (int r = 0; r < h_; ++r) { + for (int c = 0; c < w_; ++c) { + ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c], + dst_tst_[dst_offset_ + r * dst_stride_ + c]); + } + } + } + + T dst_ref_[kBufSize]; + T dst_tst_[kBufSize]; + uint32_t dst_stride_; + uint32_t dst_offset_; + + T src0_[kBufSize]; + uint32_t src0_stride_; + uint32_t src0_offset_; + + T src1_[kBufSize]; + uint32_t src1_stride_; + uint32_t src1_offset_; + + uint8_t mask_[kMaxMaskSize]; + + int w_; + int h_; +}; + +////////////////////////////////////////////////////////////////////////////// +// 8 bit version +////////////////////////////////////////////////////////////////////////////// + +typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, + uint32_t src1_stride, const uint8_t *mask, int w, int h); +typedef libaom_test::FuncParam TestFuncs; + +class BlendA64Mask1DTest8B : public BlendA64Mask1DTest { + protected: + void Execute(const uint8_t *p_src0, const uint8_t *p_src1) { + params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_, + src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, + w_, h_); + ASM_REGISTER_STATE_CHECK(params_.tst_func( + dst_tst_ + dst_offset_, dst_stride_, p_src0 + src0_offset_, + src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, w_, h_)); + } +}; + +TEST_P(BlendA64Mask1DTest8B, RandomValues) { + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_.Rand8(); + dst_tst_[i] = rng_.Rand8(); + + src0_[i] = rng_.Rand8(); + src1_[i] = rng_.Rand8(); + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + Common(); + } +} + 
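+// For reference, the operation under test is (per aom_dsp/blend.h):
+//   AOM_BLEND_A64(m, v0, v1) =
+//       ROUND_POWER_OF_TWO(m * v0 + (AOM_BLEND_A64_MAX_ALPHA - m) * v1,
+//                          AOM_BLEND_A64_ROUND_BITS)
+// with AOM_BLEND_A64_MAX_ALPHA == 64 and AOM_BLEND_A64_ROUND_BITS == 6;
+// RandomValues above draws masks over the full [0, 64] range, while
+// ExtremeValues below pins pixels and masks near their upper limits.
+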
+TEST_P(BlendA64Mask1DTest8B, ExtremeValues) { + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_(2) + 254; + dst_tst_[i] = rng_(2) + 254; + src0_[i] = rng_(2) + 254; + src1_[i] = rng_(2) + 254; + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1; + + Common(); + } +} + +static void blend_a64_hmask_ref(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize] + [BlendA64Mask1DTest8B::kMaxMaskSize]; + + for (int row = 0; row < h; ++row) + for (int col = 0; col < w; ++col) mask2d[row][col] = mask[col]; + + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize, w, h, + 0, 0); +} + +static void blend_a64_vmask_ref(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize] + [BlendA64Mask1DTest8B::kMaxMaskSize]; + + for (int row = 0; row < h; ++row) + for (int col = 0; col < w; ++col) mask2d[row][col] = mask[row]; + + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize, w, h, + 0, 0); +} + +INSTANTIATE_TEST_SUITE_P( + C, BlendA64Mask1DTest8B, + ::testing::Values(TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_c), + TestFuncs(blend_a64_vmask_ref, aom_blend_a64_vmask_c))); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, BlendA64Mask1DTest8B, + ::testing::Values( + TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_sse4_1), + TestFuncs(blend_a64_vmask_ref, aom_blend_a64_vmask_sse4_1))); +#endif // HAVE_SSE4_1 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, BlendA64Mask1DTest8B, + ::testing::Values(TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_neon), + TestFuncs(blend_a64_vmask_ref, + aom_blend_a64_vmask_neon))); +#endif // HAVE_NEON + +////////////////////////////////////////////////////////////////////////////// +// High bit-depth version +////////////////////////////////////////////////////////////////////////////// +#if CONFIG_AV1_HIGHBITDEPTH +typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, + uint32_t src1_stride, const uint8_t *mask, int w, int h, + int bd); +typedef libaom_test::FuncParam TestFuncsHBD; + +class BlendA64Mask1DTestHBD : public BlendA64Mask1DTest { + protected: + void Execute(const uint16_t *p_src0, const uint16_t *p_src1) { + params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_, + CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_, + CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, + mask_, w_, h_, bit_depth_); + ASM_REGISTER_STATE_CHECK(params_.tst_func( + CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_, + CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_, + CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, mask_, w_, h_, + bit_depth_)); + } + + int bit_depth_; +}; + +TEST_P(BlendA64Mask1DTestHBD, RandomValues) { + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + switch (rng_(3)) { + case 0: bit_depth_ = 8; break; + case 1: bit_depth_ = 10; break; + default: bit_depth_ = 12; break; + } + + const int hi = 1 << bit_depth_; + 
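+    // Draw every source and destination pixel uniformly from the full
+    // [0, 1 << bit_depth_) range for the chosen bit depth.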
+ for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_(hi); + dst_tst_[i] = rng_(hi); + src0_[i] = rng_(hi); + src1_[i] = rng_(hi); + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + Common(); + } +} + +TEST_P(BlendA64Mask1DTestHBD, ExtremeValues) { + for (int iter = 0; iter < 1000 && !HasFatalFailure(); ++iter) { + switch (rng_(3)) { + case 0: bit_depth_ = 8; break; + case 1: bit_depth_ = 10; break; + default: bit_depth_ = 12; break; + } + + const int hi = 1 << bit_depth_; + const int lo = hi - 2; + + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_(hi - lo) + lo; + dst_tst_[i] = rng_(hi - lo) + lo; + src0_[i] = rng_(hi - lo) + lo; + src1_[i] = rng_(hi - lo) + lo; + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1; + + Common(); + } +} + +static void highbd_blend_a64_hmask_ref( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h, int bd) { + uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize] + [BlendA64Mask1DTestHBD::kMaxMaskSize]; + + for (int row = 0; row < h; ++row) + for (int col = 0; col < w; ++col) mask2d[row][col] = mask[col]; + + aom_highbd_blend_a64_mask_c( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask2d[0][0], + BlendA64Mask1DTestHBD::kMaxMaskSize, w, h, 0, 0, bd); +} + +static void highbd_blend_a64_vmask_ref( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h, int bd) { + uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize] + [BlendA64Mask1DTestHBD::kMaxMaskSize]; + + for (int row = 0; row < h; ++row) + for (int col = 0; col < w; ++col) mask2d[row][col] = mask[row]; + + aom_highbd_blend_a64_mask_c( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask2d[0][0], + BlendA64Mask1DTestHBD::kMaxMaskSize, w, h, 0, 0, bd); +} + +INSTANTIATE_TEST_SUITE_P( + C, BlendA64Mask1DTestHBD, + ::testing::Values(TestFuncsHBD(highbd_blend_a64_hmask_ref, + aom_highbd_blend_a64_hmask_c), + TestFuncsHBD(highbd_blend_a64_vmask_ref, + aom_highbd_blend_a64_vmask_c))); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, BlendA64Mask1DTestHBD, + ::testing::Values(TestFuncsHBD(highbd_blend_a64_hmask_ref, + aom_highbd_blend_a64_hmask_sse4_1), + TestFuncsHBD(highbd_blend_a64_vmask_ref, + aom_highbd_blend_a64_vmask_sse4_1))); +#endif // HAVE_SSE4_1 +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace diff --git a/libs/libaom/src/test/blend_a64_mask_test.cc b/libs/libaom/src/test/blend_a64_mask_test.cc new file mode 100644 index 000000000..5c2c291fd --- /dev/null +++ b/libs/libaom/src/test/blend_a64_mask_test.cc @@ -0,0 +1,620 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/register_state_check.h" +#include "test/function_equivalence_test.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" + +#include "av1/common/enums.h" + +#include "aom_dsp/blend.h" + +using libaom_test::FunctionEquivalenceTest; + +namespace { + +template +class BlendA64MaskTest : public FunctionEquivalenceTest { + protected: + static const int kIterations = 10000; + static const int kMaxWidth = MAX_SB_SIZE * 5; // * 5 to cover longer strides + static const int kMaxHeight = MAX_SB_SIZE; + static const int kBufSize = kMaxWidth * kMaxHeight; + static const int kMaxMaskWidth = 2 * MAX_SB_SIZE; + static const int kMaxMaskSize = kMaxMaskWidth * kMaxMaskWidth; + + virtual ~BlendA64MaskTest() {} + + virtual void Execute(const SrcPixel *p_src0, const SrcPixel *p_src1, + int run_times) = 0; + + template + void GetSources(Pixel **src0, Pixel **src1, Pixel * /*dst*/, int run_times) { + if (run_times > 1) { + *src0 = src0_; + *src1 = src1_; + return; + } + switch (this->rng_(3)) { + case 0: // Separate sources + *src0 = src0_; + *src1 = src1_; + break; + case 1: // src0 == dst + *src0 = dst_tst_; + src0_stride_ = dst_stride_; + src0_offset_ = dst_offset_; + *src1 = src1_; + break; + case 2: // src1 == dst + *src0 = src0_; + *src1 = dst_tst_; + src1_stride_ = dst_stride_; + src1_offset_ = dst_offset_; + break; + default: FAIL(); + } + } + + void GetSources(uint16_t **src0, uint16_t **src1, uint8_t * /*dst*/, + int /*run_times*/) { + *src0 = src0_; + *src1 = src1_; + } + + uint8_t Rand1() { return this->rng_.Rand8() & 1; } + + void RunOneTest(int block_size, int subx, int suby, int run_times) { + w_ = block_size_wide[block_size]; + h_ = block_size_high[block_size]; + run_times = run_times > 1 ? run_times / w_ : 1; + ASSERT_GT(run_times, 0); + subx_ = subx; + suby_ = suby; + + dst_offset_ = this->rng_(33); + dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_; + + src0_offset_ = this->rng_(33); + src0_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_; + + src1_offset_ = this->rng_(33); + src1_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_; + + mask_stride_ = + this->rng_(kMaxWidth + 1 - w_ * (subx_ ? 2 : 1)) + w_ * (subx_ ? 
2 : 1); + + SrcPixel *p_src0; + SrcPixel *p_src1; + + p_src0 = src0_; + p_src1 = src1_; + + GetSources(&p_src0, &p_src1, &dst_ref_[0], run_times); + + Execute(p_src0, p_src1, run_times); + + for (int r = 0; r < h_; ++r) { + for (int c = 0; c < w_; ++c) { + ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c], + dst_tst_[dst_offset_ + r * dst_stride_ + c]) + << w_ << "x" << h_ << " subx " << subx_ << " suby " << suby_ + << " r: " << r << " c: " << c; + } + } + } + + void RunTest(int block_size, int run_times) { + subx_ = Rand1(); + suby_ = Rand1(); + RunOneTest(block_size, subx_, suby_, run_times); + } + + DstPixel dst_ref_[kBufSize]; + DstPixel dst_tst_[kBufSize]; + uint32_t dst_stride_; + uint32_t dst_offset_; + + SrcPixel src0_[kBufSize]; + uint32_t src0_stride_; + uint32_t src0_offset_; + + SrcPixel src1_[kBufSize]; + uint32_t src1_stride_; + uint32_t src1_offset_; + + uint8_t mask_[kMaxMaskSize]; + size_t mask_stride_; + + int w_; + int h_; + + int suby_; + int subx_; +}; + +////////////////////////////////////////////////////////////////////////////// +// 8 bit version +////////////////////////////////////////////////////////////////////////////// + +typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, + uint32_t src1_stride, const uint8_t *mask, + uint32_t mask_stride, int w, int h, int subx, int suby); +typedef libaom_test::FuncParam TestFuncs; + +class BlendA64MaskTest8B : public BlendA64MaskTest { + protected: + void Execute(const uint8_t *p_src0, const uint8_t *p_src1, int run_times) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.tst_func(dst_tst_ + dst_offset_, dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast(aom_usec_timer_elapsed(&timer)); + if (run_times > 1) { + printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_, + time1, time2); + printf("(%3.2f)\n", time1 / time2); + } + } +}; + +TEST_P(BlendA64MaskTest8B, RandomValues) { + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_.Rand8(); + dst_tst_[i] = rng_.Rand8(); + + src0_[i] = rng_.Rand8(); + src1_[i] = rng_.Rand8(); + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + RunTest(bsize, 1); + } +} + +TEST_P(BlendA64MaskTest8B, ExtremeValues) { + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_(2) + 254; + dst_tst_[i] = rng_(2) + 254; + src0_[i] = rng_(2) + 254; + src1_[i] = rng_(2) + 254; + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1; + + RunTest(bsize, 1); + } +} +TEST_P(BlendA64MaskTest8B, DISABLED_Speed) { + const int kRunTimes = 10000000; + for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + for (int i = 0; i < 
kBufSize; ++i) { + dst_ref_[i] = rng_.Rand8(); + dst_tst_[i] = rng_.Rand8(); + + src0_[i] = rng_.Rand8(); + src1_[i] = rng_.Rand8(); + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + RunOneTest(bsize, 1, 1, kRunTimes); + RunOneTest(bsize, 1, 0, kRunTimes); + RunOneTest(bsize, 0, 1, kRunTimes); + RunOneTest(bsize, 0, 0, kRunTimes); + } +} +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE4_1, BlendA64MaskTest8B, + ::testing::Values(TestFuncs( + aom_blend_a64_mask_c, aom_blend_a64_mask_sse4_1))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, BlendA64MaskTest8B, + ::testing::Values(TestFuncs(aom_blend_a64_mask_sse4_1, + aom_blend_a64_mask_avx2))); +#endif // HAVE_AVX2 + +////////////////////////////////////////////////////////////////////////////// +// 8 bit _d16 version +////////////////////////////////////////////////////////////////////////////// + +typedef void (*F8B_D16)(uint8_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, + uint32_t src1_stride, const uint8_t *mask, + uint32_t mask_stride, int w, int h, int subx, int suby, + ConvolveParams *conv_params); +typedef libaom_test::FuncParam TestFuncs_d16; + +class BlendA64MaskTest8B_d16 + : public BlendA64MaskTest { + protected: + // max number of bits used by the source + static const int kSrcMaxBitsMask = 0x3fff; + + void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) { + ConvolveParams conv_params; + conv_params.round_0 = ROUND0_BITS; + conv_params.round_1 = COMPOUND_ROUND1_BITS; + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.tst_func(dst_tst_ + dst_offset_, dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast(aom_usec_timer_elapsed(&timer)); + if (run_times > 1) { + printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_, + time1, time2); + printf("(%3.2f)\n", time1 / time2); + } + } +}; + +TEST_P(BlendA64MaskTest8B_d16, RandomValues) { + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_.Rand8(); + dst_tst_[i] = rng_.Rand8(); + + src0_[i] = rng_.Rand16() & kSrcMaxBitsMask; + src1_[i] = rng_.Rand16() & kSrcMaxBitsMask; + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + RunTest(bsize, 1); + } +} + +TEST_P(BlendA64MaskTest8B_d16, ExtremeValues) { + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = 255; + dst_tst_[i] = 255; + + src0_[i] = kSrcMaxBitsMask; + src1_[i] = kSrcMaxBitsMask; + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = AOM_BLEND_A64_MAX_ALPHA - 1; + + RunTest(bsize, 1); + } +} + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, BlendA64MaskTest8B_d16, + 
::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c, + aom_lowbd_blend_a64_d16_mask_sse4_1))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, BlendA64MaskTest8B_d16, + ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c, + aom_lowbd_blend_a64_d16_mask_avx2))); +#endif // HAVE_AVX2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, BlendA64MaskTest8B_d16, + ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c, + aom_lowbd_blend_a64_d16_mask_neon))); +#endif // HAVE_NEON + +////////////////////////////////////////////////////////////////////////////// +// High bit-depth version +////////////////////////////////////////////////////////////////////////////// +#if CONFIG_AV1_HIGHBITDEPTH +typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, + uint32_t src1_stride, const uint8_t *mask, + uint32_t mask_stride, int w, int h, int subx, int suby, + int bd); +typedef libaom_test::FuncParam TestFuncsHBD; + +class BlendA64MaskTestHBD : public BlendA64MaskTest { + protected: + void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_, + CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_, + CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, + mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.tst_func(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_, + CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_, + CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, + mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast(aom_usec_timer_elapsed(&timer)); + if (run_times > 1) { + printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_, + time1, time2); + printf("(%3.2f)\n", time1 / time2); + } + } + + int bit_depth_; +}; + +TEST_P(BlendA64MaskTestHBD, RandomValues) { + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; + switch (rng_(3)) { + case 0: bit_depth_ = 8; break; + case 1: bit_depth_ = 10; break; + default: bit_depth_ = 12; break; + } + + const int hi = 1 << bit_depth_; + + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_(hi); + dst_tst_[i] = rng_(hi); + src0_[i] = rng_(hi); + src1_[i] = rng_(hi); + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + RunTest(bsize, 1); + } +} + +TEST_P(BlendA64MaskTestHBD, ExtremeValues) { + for (int iter = 0; iter < 1000 && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; + switch (rng_(3)) { + case 0: bit_depth_ = 8; break; + case 1: bit_depth_ = 10; break; + default: bit_depth_ = 12; break; + } + + const int hi = 1 << bit_depth_; + const int lo = hi - 2; + + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_(hi - lo) + lo; + dst_tst_[i] = rng_(hi - lo) + lo; + src0_[i] = rng_(hi - lo) + lo; + src1_[i] = rng_(hi - lo) + lo; + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1; + + RunTest(bsize, 1); + } +} + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, 
BlendA64MaskTestHBD, + ::testing::Values(TestFuncsHBD(aom_highbd_blend_a64_mask_c, + aom_highbd_blend_a64_mask_sse4_1))); +#endif // HAVE_SSE4_1 + +////////////////////////////////////////////////////////////////////////////// +// HBD _d16 version +////////////////////////////////////////////////////////////////////////////// + +typedef void (*FHBD_D16)(uint8_t *dst, uint32_t dst_stride, + const CONV_BUF_TYPE *src0, uint32_t src0_stride, + const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subx, int suby, ConvolveParams *conv_params, + const int bd); +typedef libaom_test::FuncParam TestFuncsHBD_d16; + +class BlendA64MaskTestHBD_d16 + : public BlendA64MaskTest { + protected: + // max number of bits used by the source + static const int kSrcMaxBitsMask = (1 << 14) - 1; + static const int kSrcMaxBitsMaskHBD = (1 << 16) - 1; + + void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) { + ASSERT_GT(run_times, 0) << "Cannot run 0 iterations of the test."; + ConvolveParams conv_params; + conv_params.round_0 = (bit_depth_ == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; + conv_params.round_1 = COMPOUND_ROUND1_BITS; + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params, + bit_depth_); + } + if (params_.tst_func) { + aom_usec_timer_mark(&timer); + const double time1 = static_cast(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.tst_func(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), + dst_stride_, p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params, + bit_depth_); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast(aom_usec_timer_elapsed(&timer)); + if (run_times > 1) { + printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_, + time1, time2); + printf("(%3.2f)\n", time1 / time2); + } + } + } + + int bit_depth_; + int src_max_bits_mask_; +}; + +TEST_P(BlendA64MaskTestHBD_d16, RandomValues) { + if (params_.tst_func == NULL) return; + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; + switch (rng_(3)) { + case 0: bit_depth_ = 8; break; + case 1: bit_depth_ = 10; break; + default: bit_depth_ = 12; break; + } + src_max_bits_mask_ = + (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD; + + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_.Rand8(); + dst_tst_[i] = rng_.Rand8(); + + src0_[i] = rng_.Rand16() & src_max_bits_mask_; + src1_[i] = rng_.Rand16() & src_max_bits_mask_; + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + RunTest(bsize, 1); + } +} +// TODO (Scott LaVarnway), fix this test +TEST_P(BlendA64MaskTestHBD_d16, DISABLED_SaturatedValues) { + for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) { + src_max_bits_mask_ = + (bit_depth_ == 8) ? 
kSrcMaxBitsMask : kSrcMaxBitsMaskHBD; + + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = 0; + dst_tst_[i] = (1 << bit_depth_) - 1; + + src0_[i] = src_max_bits_mask_; + src1_[i] = src_max_bits_mask_; + } + + for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA; + + RunTest(bsize, 1); + } + } +} +TEST_P(BlendA64MaskTestHBD_d16, DISABLED_Speed) { + const int kRunTimes = 10000000; + for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) { + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_.Rand12() % (1 << bit_depth_); + dst_tst_[i] = rng_.Rand12() % (1 << bit_depth_); + + src0_[i] = rng_.Rand16(); + src1_[i] = rng_.Rand16(); + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + RunOneTest(bsize, 1, 1, kRunTimes); + RunOneTest(bsize, 0, 0, kRunTimes); + } + } +} + +INSTANTIATE_TEST_SUITE_P( + C, BlendA64MaskTestHBD_d16, + ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c, NULL))); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, BlendA64MaskTestHBD_d16, + ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c, + aom_highbd_blend_a64_d16_mask_sse4_1))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, BlendA64MaskTestHBD_d16, + ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c, + aom_highbd_blend_a64_d16_mask_avx2))); +#endif // HAVE_AVX2 + +// TODO(slavarnway): Enable the following in the avx2 commit. (56501) +#if 0 +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, BlendA64MaskTestHBD, + ::testing::Values(TestFuncsHBD(aom_highbd_blend_a64_mask_c, + aom_highbd_blend_a64_mask_avx2))); +#endif // HAVE_AVX2 +#endif +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace diff --git a/libs/libaom/src/test/blockd_test.cc b/libs/libaom/src/test/blockd_test.cc new file mode 100644 index 000000000..17e696863 --- /dev/null +++ b/libs/libaom/src/test/blockd_test.cc @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/blockd.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +// Verify the optimized implementation of get_partition_subsize() produces the +// same results as the Partition_Subsize lookup table in the spec. +TEST(BlockdTest, GetPartitionSubsize) { + // The Partition_Subsize table in the spec (Section 9.3. Conversion tables). 
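+  // [Editorial note, not part of the original libaom source] The rows of the
+  // table below follow the ten PARTITION_* enum values (PARTITION_NONE
+  // through PARTITION_VERT_4) and the columns follow BLOCK_SIZES_ALL, so a
+  // lookup reads as kPartitionSubsize[partition][bsize]; e.g. the
+  // PARTITION_SPLIT row maps BLOCK_64X64 to BLOCK_32X32, and an entry is
+  // BLOCK_INVALID wherever the spec defines no subsize.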
+ /* clang-format off */ + static const BLOCK_SIZE kPartitionSubsize[10][BLOCK_SIZES_ALL] = { + { + BLOCK_4X4, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X128, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + } + }; + /* clang-format on */ + + for 
(int partition = 0; partition < 10; partition++) {
+    for (int bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; bsize++) {
+      EXPECT_EQ(kPartitionSubsize[partition][bsize],
+                get_partition_subsize(static_cast<BLOCK_SIZE>(bsize),
+                                      static_cast<PARTITION_TYPE>(partition)));
+    }
+  }
+}
diff --git a/libs/libaom/src/test/boolcoder_test.cc b/libs/libaom/src/test/boolcoder_test.cc
new file mode 100644
index 000000000..680ec1877
--- /dev/null
+++ b/libs/libaom/src/test/boolcoder_test.cc
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include
+#include
+#include
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitwriter.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int num_tests = 10;
+}  // namespace
+
+TEST(AV1, TestBitIO) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  for (int n = 0; n < num_tests; ++n) {
+    for (int method = 0; method <= 7; ++method) {  // generate various
+                                                   // probability patterns
+      const int kBitsToTest = 1000;
+      uint8_t probas[kBitsToTest];
+
+      for (int i = 0; i < kBitsToTest; ++i) {
+        const int parity = i & 1;
+        /* clang-format off */
+        probas[i] =
+            (method == 0) ? 0 : (method == 1) ? 255 :
+            (method == 2) ? 128 :
+            (method == 3) ? rnd.Rand8() :
+            (method == 4) ? (parity ? 0 : 255) :
+            // alternate between low and high proba:
+            (method == 5) ? (parity ? rnd(128) : 255 - rnd(128)) :
+            (method == 6) ?
+                (parity ? rnd(64) : 255 - rnd(64)) :
+                (parity ? rnd(32) : 255 - rnd(32));
+        /* clang-format on */
+      }
+      for (int bit_method = 0; bit_method <= 3; ++bit_method) {
+        const int random_seed = 6432;
+        const int kBufferSize = 10000;
+        ACMRandom bit_rnd(random_seed);
+        aom_writer bw;
+        uint8_t bw_buffer[kBufferSize];
+        aom_start_encode(&bw, bw_buffer);
+
+        int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
+        for (int i = 0; i < kBitsToTest; ++i) {
+          if (bit_method == 2) {
+            bit = (i & 1);
+          } else if (bit_method == 3) {
+            bit = bit_rnd(2);
+          }
+          aom_write(&bw, bit, static_cast<int>(probas[i]));
+        }
+
+        aom_stop_encode(&bw);
+
+        aom_reader br;
+        aom_reader_init(&br, bw_buffer, bw.pos);
+        bit_rnd.Reset(random_seed);
+        for (int i = 0; i < kBitsToTest; ++i) {
+          if (bit_method == 2) {
+            bit = (i & 1);
+          } else if (bit_method == 3) {
+            bit = bit_rnd(2);
+          }
+          GTEST_ASSERT_EQ(aom_read(&br, probas[i], NULL), bit)
+              << "pos: " << i << " / " << kBitsToTest
+              << " bit_method: " << bit_method << " method: " << method;
+        }
+      }
+    }
+  }
+}
+
+#define FRAC_DIFF_TOTAL_ERROR 0.18
+
+TEST(AV1, TestTell) {
+  const int kBufferSize = 10000;
+  aom_writer bw;
+  uint8_t bw_buffer[kBufferSize];
+  const int kSymbols = 1024;
+  // Coders are noisier at low probabilities, so we start at p = 4.
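+  // [Editorial note, not part of the original libaom source] What the loop
+  // below verifies: aom_reader_tell() reports whole bits consumed, while
+  // aom_reader_tell_frac() reports 1/8-bit units, so
+  // tell == (tell_frac + 7) >> 3 must always hold. An ideal coder spends
+  // about -log2(p / 256) bits per zero symbol written with probability p;
+  // e.g. at p = 128 each symbol should advance tell_frac by roughly 8 units
+  // (one full bit).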
+  for (int p = 4; p < 256; p++) {
+    double probability = p / 256.;
+    aom_start_encode(&bw, bw_buffer);
+    for (int i = 0; i < kSymbols; i++) {
+      aom_write(&bw, 0, p);
+    }
+    aom_stop_encode(&bw);
+    aom_reader br;
+    aom_reader_init(&br, bw_buffer, bw.pos);
+    uint32_t last_tell = aom_reader_tell(&br);
+    uint32_t last_tell_frac = aom_reader_tell_frac(&br);
+    double frac_diff_total = 0;
+    GTEST_ASSERT_GE(aom_reader_tell(&br), 0u);
+    GTEST_ASSERT_LE(aom_reader_tell(&br), 1u);
+    ASSERT_FALSE(aom_reader_has_overflowed(&br));
+    for (int i = 0; i < kSymbols; i++) {
+      aom_read(&br, p, NULL);
+      uint32_t tell = aom_reader_tell(&br);
+      uint32_t tell_frac = aom_reader_tell_frac(&br);
+      GTEST_ASSERT_GE(tell, last_tell)
+          << "tell: " << tell << ", last_tell: " << last_tell;
+      GTEST_ASSERT_GE(tell_frac, last_tell_frac)
+          << "tell_frac: " << tell_frac
+          << ", last_tell_frac: " << last_tell_frac;
+      // Frac tell should round up to tell.
+      GTEST_ASSERT_EQ(tell, (tell_frac + 7) >> 3);
+      last_tell = tell;
+      frac_diff_total +=
+          fabs(((tell_frac - last_tell_frac) / 8.0) + log2(probability));
+      last_tell_frac = tell_frac;
+    }
+    const uint32_t expected = (uint32_t)(-kSymbols * log2(probability));
+    // Last tell should be close to the expected value.
+    GTEST_ASSERT_LE(last_tell, expected + 20) << " last_tell: " << last_tell;
+    // The average frac_diff error should be pretty small.
+    GTEST_ASSERT_LE(frac_diff_total / kSymbols, FRAC_DIFF_TOTAL_ERROR)
+        << " frac_diff_total: " << frac_diff_total;
+    ASSERT_FALSE(aom_reader_has_overflowed(&br));
+  }
+}
+
+TEST(AV1, TestHasOverflowed) {
+  const int kBufferSize = 10000;
+  aom_writer bw;
+  uint8_t bw_buffer[kBufferSize];
+  const int kSymbols = 1024;
+  // Coders are noisier at low probabilities, so we start at p = 4.
+  for (int p = 4; p < 256; p++) {
+    aom_start_encode(&bw, bw_buffer);
+    for (int i = 0; i < kSymbols; i++) {
+      aom_write(&bw, 1, p);
+    }
+    aom_stop_encode(&bw);
+    aom_reader br;
+    aom_reader_init(&br, bw_buffer, bw.pos);
+    ASSERT_FALSE(aom_reader_has_overflowed(&br));
+    for (int i = 0; i < kSymbols; i++) {
+      GTEST_ASSERT_EQ(aom_read(&br, p, NULL), 1);
+      ASSERT_FALSE(aom_reader_has_overflowed(&br));
+    }
+    // In the worst case, the encoder uses just a tiny fraction of the last
+    // byte in the buffer. So to guarantee that aom_reader_has_overflowed()
+    // returns true, we have to consume very nearly 8 additional bits of
+    // data. In the worst case, one of the bits in that byte will be 1, and
+    // the rest will be zero. Once we are past that 1 bit, when the
+    // probability of reading a zero symbol from aom_read() is high, each
+    // additional symbol read consumes very little additional data (in the
+    // case that p == 255, approximately -log_2(255/256) ~= 0.0056 bits). In
+    // that case it would take around 178 calls to consume more than 8 bits.
+    // That is only an upper bound; in practice we are not guaranteed to hit
+    // the worst case and can get away with 174 calls.
+    for (int i = 0; i < 174; i++) {
+      aom_read(&br, p, NULL);
+    }
+    ASSERT_TRUE(aom_reader_has_overflowed(&br));
+  }
+}
diff --git a/libs/libaom/src/test/borders_test.cc b/libs/libaom/src/test/borders_test.cc
new file mode 100644
index 000000000..31eacab12
--- /dev/null
+++ b/libs/libaom/src/test/borders_test.cc
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include
+#include
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class BordersTestLarge
+    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  BordersTestLarge() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~BordersTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 1);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    if (pkt->data.frame.flags & AOM_FRAME_IS_KEY) {
+    }
+  }
+};
+
+TEST_P(BordersTestLarge, TestEncodeHighBitrate) {
+  // Validate that this clip, whose width is not a multiple of 64, encodes
+  // and decodes without a mismatch when passing in a very low max q. This
+  // pushes the encoder to producing lots of big partitions which will likely
+  // extend into the border and test the border condition.
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.rc_max_quantizer = 10;
+
+  ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       10);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+TEST_P(BordersTestLarge, TestLowBitrate) {
+  // Validate that this clip encodes and decodes without a mismatch
+  // when passing in a very high min q. This pushes the encoder to producing
+  // lots of small partitions, which should test the other condition.
+
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.rc_min_quantizer = 40;
+
+  ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       10);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_CASE(BordersTestLarge,
+                          ::testing::Values(::libaom_test::kTwoPassGood));
+}  // namespace
diff --git a/libs/libaom/src/test/cdef_test.cc b/libs/libaom/src/test/cdef_test.cc
new file mode 100644
index 000000000..a2ec1e31e
--- /dev/null
+++ b/libs/libaom/src/test/cdef_test.cc
@@ -0,0 +1,426 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_ports/aom_timer.h" +#include "av1/common/cdef_block.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +using libaom_test::ACMRandom; + +namespace { + +typedef std::tuple + cdef_dir_param_t; + +class CDEFBlockTest : public ::testing::TestWithParam { + public: + virtual ~CDEFBlockTest() {} + virtual void SetUp() { + cdef = GET_PARAM(0); + ref_cdef = GET_PARAM(1); + bsize = GET_PARAM(2); + boundary = GET_PARAM(3); + depth = GET_PARAM(4); + } + + virtual void TearDown() { libaom_test::ClearSystemState(); } + + protected: + int bsize; + int boundary; + int depth; + cdef_filter_block_func cdef; + cdef_filter_block_func ref_cdef; +}; + +typedef CDEFBlockTest CDEFSpeedTest; + +void test_cdef(int bsize, int iterations, cdef_filter_block_func cdef, + cdef_filter_block_func ref_cdef, int boundary, int depth) { + const int size = 8; + const int ysize = size + 2 * CDEF_VBORDER; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, uint16_t, s[ysize * CDEF_BSTRIDE]); + DECLARE_ALIGNED(16, static uint16_t, d[size * size]); + DECLARE_ALIGNED(16, static uint16_t, ref_d[size * size]); + memset(ref_d, 0, sizeof(ref_d)); + memset(d, 0, sizeof(d)); + + int error = 0, pristrength = 0, secstrength, dir; + int pridamping, secdamping, bits, level, count, + errdepth = 0, errpristrength = 0, errsecstrength = 0, errboundary = 0, + errpridamping = 0, errsecdamping = 0; + unsigned int pos = 0; + + const unsigned int max_pos = size * size >> static_cast(depth == 8); + for (pridamping = 3 + depth - 8; pridamping < 7 - 3 * !!boundary + depth - 8; + pridamping++) { + for (secdamping = 3 + depth - 8; + secdamping < 7 - 3 * !!boundary + depth - 8; secdamping++) { + for (count = 0; count < iterations; count++) { + for (level = 0; level < (1 << depth) && !error; + level += (2 + 6 * !!boundary) << (depth - 8)) { + for (bits = 1; bits <= depth && !error; bits += 1 + 3 * !!boundary) { + for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++) + s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0, + (1 << depth) - 1); + if (boundary) { + if (boundary & 1) { // Left + for (int i = 0; i < ysize; i++) + for (int j = 0; j < CDEF_HBORDER; j++) + s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE; + } + if (boundary & 2) { // Right + for (int i = 0; i < ysize; i++) + for (int j = CDEF_HBORDER + size; j < CDEF_BSTRIDE; j++) + s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE; + } + if (boundary & 4) { // Above + for (int i = 0; i < CDEF_VBORDER; i++) + for (int j = 0; j < CDEF_BSTRIDE; j++) + s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE; + } + if (boundary & 8) { // Below + for (int i = CDEF_VBORDER + size; i < ysize; i++) + for (int j = 0; j < CDEF_BSTRIDE; j++) + s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE; + } + } + for (dir = 0; dir < 8; dir++) { + for (pristrength = 0; pristrength <= 19 << (depth - 8) && !error; + pristrength += (1 + 4 * !!boundary) << (depth - 8)) { + if (pristrength == 16) pristrength = 19; + for (secstrength = 0; secstrength <= 4 << (depth - 8) && !error; + secstrength += 1 << (depth - 8)) { + if (secstrength == 3 << (depth - 8)) continue; + ref_cdef(depth == 8 ? 
(uint8_t *)ref_d : 0, ref_d, size, + s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE, + pristrength, secstrength, dir, pridamping, + secdamping, bsize, depth - 8); + // If cdef and ref_cdef are the same, we're just testing + // speed + if (cdef != ref_cdef) + ASM_REGISTER_STATE_CHECK( + cdef(depth == 8 ? (uint8_t *)d : 0, d, size, + s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE, + pristrength, secstrength, dir, pridamping, + secdamping, bsize, depth - 8)); + if (ref_cdef != cdef) { + for (pos = 0; pos < max_pos && !error; pos++) { + error = ref_d[pos] != d[pos]; + errdepth = depth; + errpristrength = pristrength; + errsecstrength = secstrength; + errboundary = boundary; + errpridamping = pridamping; + errsecdamping = secdamping; + } + } + } + } + } + } + } + } + } + } + + pos--; + EXPECT_EQ(0, error) << "Error: CDEFBlockTest, SIMD and C mismatch." + << std::endl + << "First error at " << pos % size << "," << pos / size + << " (" << (int16_t)ref_d[pos] << " : " << (int16_t)d[pos] + << ") " << std::endl + << "pristrength: " << errpristrength << std::endl + << "pridamping: " << errpridamping << std::endl + << "secstrength: " << errsecstrength << std::endl + << "secdamping: " << errsecdamping << std::endl + << "depth: " << errdepth << std::endl + << "size: " << bsize << std::endl + << "boundary: " << errboundary << std::endl + << std::endl; +} + +void test_cdef_speed(int bsize, int iterations, cdef_filter_block_func cdef, + cdef_filter_block_func ref_cdef, int boundary, int depth) { + aom_usec_timer ref_timer; + aom_usec_timer timer; + + aom_usec_timer_start(&ref_timer); + test_cdef(bsize, iterations, ref_cdef, ref_cdef, boundary, depth); + aom_usec_timer_mark(&ref_timer); + int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer_start(&timer); + test_cdef(bsize, iterations, cdef, cdef, boundary, depth); + aom_usec_timer_mark(&timer); + int elapsed_time = (int)aom_usec_timer_elapsed(&timer); + + EXPECT_GT(ref_elapsed_time, elapsed_time) + << "Error: CDEFSpeedTest, SIMD slower than C." 
<< std::endl + << "C time: " << ref_elapsed_time << " us" << std::endl + << "SIMD time: " << elapsed_time << " us" << std::endl; +} + +typedef int (*find_dir_t)(const uint16_t *img, int stride, int32_t *var, + int coeff_shift); + +typedef std::tuple find_dir_param_t; + +class CDEFFindDirTest : public ::testing::TestWithParam { + public: + virtual ~CDEFFindDirTest() {} + virtual void SetUp() { + finddir = GET_PARAM(0); + ref_finddir = GET_PARAM(1); + } + + virtual void TearDown() { libaom_test::ClearSystemState(); } + + protected: + find_dir_t finddir; + find_dir_t ref_finddir; +}; + +typedef CDEFFindDirTest CDEFFindDirSpeedTest; + +void test_finddir(int (*finddir)(const uint16_t *img, int stride, int32_t *var, + int coeff_shift), + int (*ref_finddir)(const uint16_t *img, int stride, + int32_t *var, int coeff_shift)) { + const int size = 8; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, uint16_t, s[size * size]); + + int error = 0; + int depth, bits, level, count, errdepth = 0; + int ref_res = 0, res = 0; + int32_t ref_var = 0, var = 0; + + for (depth = 8; depth <= 12 && !error; depth += 2) { + for (count = 0; count < 512 && !error; count++) { + for (level = 0; level < (1 << depth) && !error; + level += 1 << (depth - 8)) { + for (bits = 1; bits <= depth && !error; bits++) { + for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++) + s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0, + (1 << depth) - 1); + for (int c = 0; c < 1 + 9 * (finddir == ref_finddir); c++) + ref_res = ref_finddir(s, size, &ref_var, depth - 8); + if (finddir != ref_finddir) + ASM_REGISTER_STATE_CHECK(res = finddir(s, size, &var, depth - 8)); + if (ref_finddir != finddir) { + if (res != ref_res || var != ref_var) error = 1; + errdepth = depth; + } + } + } + } + } + + EXPECT_EQ(0, error) << "Error: CDEFFindDirTest, SIMD and C mismatch." + << std::endl + << "return: " << res << " : " << ref_res << std::endl + << "var: " << var << " : " << ref_var << std::endl + << "depth: " << errdepth << std::endl + << std::endl; +} + +void test_finddir_speed(int (*finddir)(const uint16_t *img, int stride, + int32_t *var, int coeff_shift), + int (*ref_finddir)(const uint16_t *img, int stride, + int32_t *var, int coeff_shift)) { + aom_usec_timer ref_timer; + aom_usec_timer timer; + + aom_usec_timer_start(&ref_timer); + test_finddir(ref_finddir, ref_finddir); + aom_usec_timer_mark(&ref_timer); + int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer_start(&timer); + test_finddir(finddir, finddir); + aom_usec_timer_mark(&timer); + int elapsed_time = (int)aom_usec_timer_elapsed(&timer); + + EXPECT_GT(ref_elapsed_time, elapsed_time) + << "Error: CDEFFindDirSpeedTest, SIMD slower than C." << std::endl + << "C time: " << ref_elapsed_time << " us" << std::endl + << "SIMD time: " << elapsed_time << " us" << std::endl; +} + +TEST_P(CDEFBlockTest, TestSIMDNoMismatch) { + test_cdef(bsize, 1, cdef, ref_cdef, boundary, depth); +} + +TEST_P(CDEFSpeedTest, DISABLED_TestSpeed) { + test_cdef_speed(bsize, 4, cdef, ref_cdef, boundary, depth); +} + +TEST_P(CDEFFindDirTest, TestSIMDNoMismatch) { + test_finddir(finddir, ref_finddir); +} + +TEST_P(CDEFFindDirSpeedTest, DISABLED_TestSpeed) { + test_finddir_speed(finddir, ref_finddir); +} + +using std::make_tuple; + +// VS compiling for 32 bit targets does not support vector types in +// structs as arguments, which makes the v256 type of the intrinsics +// hard to support, so optimizations for this target are disabled. 
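+// [Editorial sketch, not part of the original libaom source; the helper name
+// is hypothetical] In the instantiations that follow,
+// ::testing::Range(0, 16) enumerates the 4-bit boundary mask that
+// test_cdef() pads with CDEF_VERY_LARGE, and ::testing::Range(8, 13, 2)
+// covers bit depths 8, 10 and 12. A minimal decoder for the mask, mirroring
+// the padding branches in test_cdef():
+inline void decode_cdef_boundary_sketch(int boundary, bool *left, bool *right,
+                                        bool *above, bool *below) {
+  *left = (boundary & 1) != 0;   // pad columns left of the block
+  *right = (boundary & 2) != 0;  // pad columns right of the block
+  *above = (boundary & 4) != 0;  // pad rows above the block
+  *below = (boundary & 8) != 0;  // pad rows below the block
+}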
+#if defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__) +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, CDEFBlockTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_sse2), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirTest, + ::testing::Values(make_tuple(&cdef_find_dir_sse2, + &cdef_find_dir_c))); +#endif +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, CDEFBlockTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_ssse3), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirTest, + ::testing::Values(make_tuple(&cdef_find_dir_ssse3, + &cdef_find_dir_c))); +#endif + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, CDEFBlockTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_sse4_1), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(SSE4_1, CDEFFindDirTest, + ::testing::Values(make_tuple(&cdef_find_dir_sse4_1, + &cdef_find_dir_c))); +#endif + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, CDEFBlockTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_avx2), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirTest, + ::testing::Values(make_tuple(&cdef_find_dir_avx2, + &cdef_find_dir_c))); +#endif + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, CDEFBlockTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_neon), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirTest, + ::testing::Values(make_tuple(&cdef_find_dir_neon, + &cdef_find_dir_c))); +#endif + +// Test speed for all supported architectures +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, CDEFSpeedTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_sse2), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirSpeedTest, + ::testing::Values(make_tuple(&cdef_find_dir_sse2, + &cdef_find_dir_c))); +#endif + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, CDEFSpeedTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_ssse3), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirSpeedTest, + ::testing::Values(make_tuple(&cdef_find_dir_ssse3, + &cdef_find_dir_c))); +#endif + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, CDEFSpeedTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_sse4_1), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(SSE4_1, CDEFFindDirSpeedTest, + 
::testing::Values(make_tuple(&cdef_find_dir_sse4_1, + &cdef_find_dir_c))); +#endif + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, CDEFSpeedTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_avx2), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirSpeedTest, + ::testing::Values(make_tuple(&cdef_find_dir_avx2, + &cdef_find_dir_c))); +#endif + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, CDEFSpeedTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_neon), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirSpeedTest, + ::testing::Values(make_tuple(&cdef_find_dir_neon, + &cdef_find_dir_c))); +#endif + +#endif // defined(_WIN64) || !defined(_MSC_VER) +} // namespace diff --git a/libs/libaom/src/test/cfl_test.cc b/libs/libaom/src/test/cfl_test.cc new file mode 100644 index 000000000..d2973159c --- /dev/null +++ b/libs/libaom/src/test/cfl_test.cc @@ -0,0 +1,585 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/av1_rtcd.h" + +#include "aom_ports/aom_timer.h" +#include "test/util.h" +#include "test/acm_random.h" + +using std::make_tuple; + +using libaom_test::ACMRandom; + +#define NUM_ITERATIONS (100) +#define NUM_ITERATIONS_SPEED (INT16_MAX) + +#define ALL_CFL_TX_SIZES(function) \ + make_tuple(static_cast(TX_4X4), &function), \ + make_tuple(static_cast(TX_4X8), &function), \ + make_tuple(static_cast(TX_4X16), &function), \ + make_tuple(static_cast(TX_8X4), &function), \ + make_tuple(static_cast(TX_8X8), &function), \ + make_tuple(static_cast(TX_8X16), &function), \ + make_tuple(static_cast(TX_8X32), &function), \ + make_tuple(static_cast(TX_16X4), &function), \ + make_tuple(static_cast(TX_16X8), &function), \ + make_tuple(static_cast(TX_16X16), &function), \ + make_tuple(static_cast(TX_16X32), &function), \ + make_tuple(static_cast(TX_32X8), &function), \ + make_tuple(static_cast(TX_32X16), &function), \ + make_tuple(static_cast(TX_32X32), &function) + +#define ALL_CFL_TX_SIZES_SUBSAMPLE(fun420, fun422, fun444) \ + make_tuple(static_cast(TX_4X4), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_4X8), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_4X16), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_8X4), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_8X8), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_8X16), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_8X32), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_16X4), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_16X8), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_16X16), &fun420, &fun422, &fun444), \ + 
make_tuple(static_cast(TX_16X32), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_32X8), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_32X16), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_32X32), &fun420, &fun422, &fun444) + +namespace { + +template +static void assert_eq(const A *a, const A *b, int width, int height) { + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + ASSERT_EQ(a[j * CFL_BUF_LINE + i], b[j * CFL_BUF_LINE + i]); + } + } +} + +static void assertFaster(int ref_elapsed_time, int elapsed_time) { + EXPECT_GT(ref_elapsed_time, elapsed_time) + << "Error: CFLSubtractSpeedTest, SIMD slower than C." << std::endl + << "C time: " << ref_elapsed_time << " us" << std::endl + << "SIMD time: " << elapsed_time << " us" << std::endl; +} + +static void printSpeed(int ref_elapsed_time, int elapsed_time, int width, + int height) { + std::cout.precision(2); + std::cout << "[ ] " << width << "x" << height + << ": C time = " << ref_elapsed_time + << " us, SIMD time = " << elapsed_time << " us" + << " (~" << ref_elapsed_time / (double)elapsed_time << "x) " + << std::endl; +} + +class CFLTest { + public: + virtual ~CFLTest() {} + void init(TX_SIZE tx) { + tx_size = tx; + width = tx_size_wide[tx_size]; + height = tx_size_high[tx_size]; + rnd.Reset(ACMRandom::DeterministicSeed()); + } + + protected: + TX_SIZE tx_size; + int width; + int height; + ACMRandom rnd; +}; + +template +class CFLTestWithData : public CFLTest { + public: + virtual ~CFLTestWithData() {} + + protected: + I data[CFL_BUF_SQUARE]; + I data_ref[CFL_BUF_SQUARE]; + void randData(I (ACMRandom::*random)()) { + for (int j = 0; j < this->height; j++) { + for (int i = 0; i < this->width; i++) { + const I d = (this->rnd.*random)(); + data[j * CFL_BUF_LINE + i] = d; + data_ref[j * CFL_BUF_LINE + i] = d; + } + } + } +}; + +template +class CFLTestWithAlignedData : public CFLTest { + public: + CFLTestWithAlignedData() { + chroma_pels_ref = + reinterpret_cast(aom_memalign(32, sizeof(I) * CFL_BUF_SQUARE)); + chroma_pels = + reinterpret_cast(aom_memalign(32, sizeof(I) * CFL_BUF_SQUARE)); + sub_luma_pels_ref = reinterpret_cast( + aom_memalign(32, sizeof(int16_t) * CFL_BUF_SQUARE)); + sub_luma_pels = reinterpret_cast( + aom_memalign(32, sizeof(int16_t) * CFL_BUF_SQUARE)); + memset(chroma_pels_ref, 0, sizeof(I) * CFL_BUF_SQUARE); + memset(chroma_pels, 0, sizeof(I) * CFL_BUF_SQUARE); + memset(sub_luma_pels_ref, 0, sizeof(int16_t) * CFL_BUF_SQUARE); + memset(sub_luma_pels, 0, sizeof(int16_t) * CFL_BUF_SQUARE); + } + ~CFLTestWithAlignedData() { + aom_free(chroma_pels_ref); + aom_free(sub_luma_pels_ref); + aom_free(chroma_pels); + aom_free(sub_luma_pels); + } + + protected: + I *chroma_pels_ref; + I *chroma_pels; + int16_t *sub_luma_pels_ref; + int16_t *sub_luma_pels; + int alpha_q3; + I dc; + void randData(int bd) { + alpha_q3 = this->rnd(33) - 16; + dc = this->rnd(1 << bd); + for (int j = 0; j < this->height; j++) { + for (int i = 0; i < this->width; i++) { + chroma_pels[j * CFL_BUF_LINE + i] = dc; + chroma_pels_ref[j * CFL_BUF_LINE + i] = dc; + sub_luma_pels_ref[j * CFL_BUF_LINE + i] = + sub_luma_pels[j * CFL_BUF_LINE + i] = this->rnd(1 << (bd + 3)); + } + } + } +}; + +typedef cfl_subtract_average_fn (*sub_avg_fn)(TX_SIZE tx_size); +typedef std::tuple sub_avg_param; +class CFLSubAvgTest : public ::testing::TestWithParam, + public CFLTestWithData { + public: + virtual void SetUp() { + CFLTest::init(std::get<0>(this->GetParam())); + sub_avg = std::get<1>(this->GetParam())(tx_size); + 
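+    // [Editorial note, not part of the original libaom source] The fetched
+    // kernel subtracts the block average from every Q3 luma sample,
+    // producing the mean-removed "AC" buffer that CfL prediction later
+    // scales by alpha; the _c variant fetched on the next line is the
+    // reference against which the SIMD output is compared.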
sub_avg_ref = cfl_get_subtract_average_fn_c(tx_size); + } + virtual ~CFLSubAvgTest() {} + + protected: + cfl_subtract_average_fn sub_avg; + cfl_subtract_average_fn sub_avg_ref; +}; + +TEST_P(CFLSubAvgTest, SubAvgTest) { + for (int it = 0; it < NUM_ITERATIONS; it++) { + randData(&ACMRandom::Rand15Signed); + sub_avg((uint16_t *)data, data); + sub_avg_ref((uint16_t *)data_ref, data_ref); + assert_eq(data, data_ref, width, height); + } +} + +TEST_P(CFLSubAvgTest, DISABLED_SubAvgSpeedTest) { + aom_usec_timer ref_timer; + aom_usec_timer timer; + randData(&ACMRandom::Rand15Signed); + aom_usec_timer_start(&ref_timer); + for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) { + sub_avg_ref((uint16_t *)data_ref, data_ref); + } + aom_usec_timer_mark(&ref_timer); + int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer); + aom_usec_timer_start(&timer); + for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) { + sub_avg((uint16_t *)data, data); + } + aom_usec_timer_mark(&timer); + int elapsed_time = (int)aom_usec_timer_elapsed(&timer); + printSpeed(ref_elapsed_time, elapsed_time, width, height); + assertFaster(ref_elapsed_time, elapsed_time); +} + +template +class CFLSubsampleTest : public ::testing::TestWithParam, + public CFLTestWithData { + public: + virtual void SetUp() { + CFLTest::init(std::get<0>(this->GetParam())); + fun_420 = std::get<1>(this->GetParam())(this->tx_size); + fun_422 = std::get<2>(this->GetParam())(this->tx_size); + fun_444 = std::get<3>(this->GetParam())(this->tx_size); + } + + protected: + T fun_420; + T fun_422; + T fun_444; + T fun_420_ref; + T fun_422_ref; + T fun_444_ref; + + void subsampleTest(T fun, T fun_ref, int sub_width, int sub_height, + I (ACMRandom::*random)()) { + uint16_t sub_luma_pels[CFL_BUF_SQUARE]; + uint16_t sub_luma_pels_ref[CFL_BUF_SQUARE]; + + for (int it = 0; it < NUM_ITERATIONS; it++) { + CFLTestWithData::randData(random); + fun(this->data, CFL_BUF_LINE, sub_luma_pels); + fun_ref(this->data_ref, CFL_BUF_LINE, sub_luma_pels_ref); + assert_eq(sub_luma_pels, sub_luma_pels_ref, sub_width, + sub_height); + } + } + + void subsampleSpeedTest(T fun, T fun_ref, I (ACMRandom::*random)()) { + uint16_t sub_luma_pels[CFL_BUF_SQUARE]; + uint16_t sub_luma_pels_ref[CFL_BUF_SQUARE]; + aom_usec_timer ref_timer; + aom_usec_timer timer; + + CFLTestWithData::randData(random); + aom_usec_timer_start(&ref_timer); + for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) { + fun_ref(this->data_ref, CFL_BUF_LINE, sub_luma_pels); + } + aom_usec_timer_mark(&ref_timer); + int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer); + aom_usec_timer_start(&timer); + for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) { + fun(this->data, CFL_BUF_LINE, sub_luma_pels_ref); + } + aom_usec_timer_mark(&timer); + int elapsed_time = (int)aom_usec_timer_elapsed(&timer); + printSpeed(ref_elapsed_time, elapsed_time, this->width, this->height); + assertFaster(ref_elapsed_time, elapsed_time); + } +}; + +typedef cfl_subsample_lbd_fn (*get_subsample_lbd_fn)(TX_SIZE tx_size); +typedef std::tuple + subsample_lbd_param; +class CFLSubsampleLBDTest + : public CFLSubsampleTest { + public: + virtual ~CFLSubsampleLBDTest() {} + virtual void SetUp() { + CFLSubsampleTest::SetUp(); + fun_420_ref = cfl_get_luma_subsampling_420_lbd_c(tx_size); + fun_422_ref = cfl_get_luma_subsampling_422_lbd_c(tx_size); + fun_444_ref = cfl_get_luma_subsampling_444_lbd_c(tx_size); + } +}; + +TEST_P(CFLSubsampleLBDTest, SubsampleLBD420Test) { + subsampleTest(fun_420, fun_420_ref, width >> 1, height >> 1, + &ACMRandom::Rand8); +} + 
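+// [Editorial sketch, not part of the original libaom source; names are
+// hypothetical] A minimal C model of one 4:2:0 output sample under the Q3
+// convention the subsampling kernels above are assumed to share: all three
+// subsamplers emit 8x the local luma average, so 420/422/444 use one common
+// scale.
+inline uint16_t cfl_420_q3_sketch(const uint8_t *luma, int stride, int i,
+                                  int j) {
+  const int top = luma[(j * 2) * stride + (i * 2)] +
+                  luma[(j * 2) * stride + (i * 2) + 1];
+  const int bot = luma[(j * 2 + 1) * stride + (i * 2)] +
+                  luma[(j * 2 + 1) * stride + (i * 2) + 1];
+  // Sum of 4 pels, doubled: 4 * average * 2 == average << 3 (Q3).
+  return (uint16_t)((top + bot) << 1);
+}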
+TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD420SpeedTest) { + subsampleSpeedTest(fun_420, fun_420_ref, &ACMRandom::Rand8); +} + +TEST_P(CFLSubsampleLBDTest, SubsampleLBD422Test) { + subsampleTest(fun_422, fun_422_ref, width >> 1, height, &ACMRandom::Rand8); +} + +TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD422SpeedTest) { + subsampleSpeedTest(fun_422, fun_422_ref, &ACMRandom::Rand8); +} + +TEST_P(CFLSubsampleLBDTest, SubsampleLBD444Test) { + subsampleTest(fun_444, fun_444_ref, width, height, &ACMRandom::Rand8); +} + +TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD444SpeedTest) { + subsampleSpeedTest(fun_444, fun_444_ref, &ACMRandom::Rand8); +} + +#if CONFIG_AV1_HIGHBITDEPTH +typedef cfl_subsample_hbd_fn (*get_subsample_hbd_fn)(TX_SIZE tx_size); +typedef std::tuple + subsample_hbd_param; +class CFLSubsampleHBDTest + : public CFLSubsampleTest { + public: + virtual ~CFLSubsampleHBDTest() {} + virtual void SetUp() { + CFLSubsampleTest::SetUp(); + fun_420_ref = cfl_get_luma_subsampling_420_hbd_c(tx_size); + fun_422_ref = cfl_get_luma_subsampling_422_hbd_c(tx_size); + fun_444_ref = cfl_get_luma_subsampling_444_hbd_c(tx_size); + } +}; + +TEST_P(CFLSubsampleHBDTest, SubsampleHBD420Test) { + subsampleTest(fun_420, fun_420_ref, width >> 1, height >> 1, + &ACMRandom::Rand12); +} + +TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD420SpeedTest) { + subsampleSpeedTest(fun_420, fun_420_ref, &ACMRandom::Rand12); +} + +TEST_P(CFLSubsampleHBDTest, SubsampleHBD422Test) { + subsampleTest(fun_422, fun_422_ref, width >> 1, height, &ACMRandom::Rand12); +} + +TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD422SpeedTest) { + subsampleSpeedTest(fun_422, fun_422_ref, &ACMRandom::Rand12); +} + +TEST_P(CFLSubsampleHBDTest, SubsampleHBD444Test) { + subsampleTest(fun_444, fun_444_ref, width, height, &ACMRandom::Rand12); +} + +TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD444SpeedTest) { + subsampleSpeedTest(fun_444, fun_444_ref, &ACMRandom::Rand12); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +typedef cfl_predict_lbd_fn (*get_predict_fn)(TX_SIZE tx_size); +typedef std::tuple predict_param; +class CFLPredictTest : public ::testing::TestWithParam, + public CFLTestWithAlignedData { + public: + virtual void SetUp() { + CFLTest::init(std::get<0>(this->GetParam())); + predict = std::get<1>(this->GetParam())(tx_size); + predict_ref = cfl_get_predict_lbd_fn_c(tx_size); + } + virtual ~CFLPredictTest() {} + + protected: + cfl_predict_lbd_fn predict; + cfl_predict_lbd_fn predict_ref; +}; + +TEST_P(CFLPredictTest, PredictTest) { + for (int it = 0; it < NUM_ITERATIONS; it++) { + randData(8); + predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3); + predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3); + assert_eq(chroma_pels, chroma_pels_ref, width, height); + } +} +TEST_P(CFLPredictTest, DISABLED_PredictSpeedTest) { + aom_usec_timer ref_timer; + aom_usec_timer timer; + randData(8); + aom_usec_timer_start(&ref_timer); + for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) { + predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3); + } + aom_usec_timer_mark(&ref_timer); + int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer_start(&timer); + for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) { + predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3); + } + aom_usec_timer_mark(&timer); + int elapsed_time = (int)aom_usec_timer_elapsed(&timer); + printSpeed(ref_elapsed_time, elapsed_time, width, height); + assertFaster(ref_elapsed_time, elapsed_time); +} + 
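+// [Editorial sketch, not part of the original libaom source; names are
+// hypothetical] Per-pixel model of the lowbd prediction verified above,
+// assuming the usual CfL fixed-point layout: alpha (Q3) times mean-removed
+// luma (Q3) gives a Q6 value, which is rounded back to Q0 and added to the
+// DC prediction, then clipped to the 8-bit range.
+inline uint8_t cfl_predict_pixel_sketch(int alpha_q3, int16_t luma_ac_q3,
+                                        uint8_t dc) {
+  const int scaled_q6 = alpha_q3 * luma_ac_q3;
+  // Round-to-nearest signed shift by 6 (Q6 -> Q0).
+  const int scaled_q0 =
+      (scaled_q6 >= 0) ? (scaled_q6 + 32) >> 6 : -((-scaled_q6 + 32) >> 6);
+  const int v = dc + scaled_q0;
+  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
+}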
+#if CONFIG_AV1_HIGHBITDEPTH +typedef cfl_predict_hbd_fn (*get_predict_fn_hbd)(TX_SIZE tx_size); +typedef std::tuple predict_param_hbd; +class CFLPredictHBDTest : public ::testing::TestWithParam, + public CFLTestWithAlignedData { + public: + virtual void SetUp() { + CFLTest::init(std::get<0>(this->GetParam())); + predict = std::get<1>(this->GetParam())(tx_size); + predict_ref = cfl_get_predict_hbd_fn_c(tx_size); + } + virtual ~CFLPredictHBDTest() {} + + protected: + cfl_predict_hbd_fn predict; + cfl_predict_hbd_fn predict_ref; +}; + +TEST_P(CFLPredictHBDTest, PredictHBDTest) { + int bd = 12; + for (int it = 0; it < NUM_ITERATIONS; it++) { + randData(bd); + predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3, bd); + predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3, bd); + assert_eq(chroma_pels, chroma_pels_ref, width, height); + } +} +TEST_P(CFLPredictHBDTest, DISABLED_PredictHBDSpeedTest) { + aom_usec_timer ref_timer; + aom_usec_timer timer; + const int bd = 12; + randData(bd); + aom_usec_timer_start(&ref_timer); + for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) { + predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3, bd); + } + aom_usec_timer_mark(&ref_timer); + int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer_start(&timer); + for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) { + predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3, bd); + } + aom_usec_timer_mark(&timer); + int elapsed_time = (int)aom_usec_timer_elapsed(&timer); + printSpeed(ref_elapsed_time, elapsed_time, width, height); + assertFaster(ref_elapsed_time, elapsed_time); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +#if HAVE_SSE2 +const sub_avg_param sub_avg_sizes_sse2[] = { ALL_CFL_TX_SIZES( + cfl_get_subtract_average_fn_sse2) }; + +INSTANTIATE_TEST_SUITE_P(SSE2, CFLSubAvgTest, + ::testing::ValuesIn(sub_avg_sizes_sse2)); + +#endif + +#if HAVE_SSSE3 +const subsample_lbd_param subsample_lbd_sizes_ssse3[] = { + ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_ssse3, + cfl_get_luma_subsampling_422_lbd_ssse3, + cfl_get_luma_subsampling_444_lbd_ssse3) +}; + +const predict_param predict_sizes_ssse3[] = { ALL_CFL_TX_SIZES( + cfl_get_predict_lbd_fn_ssse3) }; + +INSTANTIATE_TEST_SUITE_P(SSSE3, CFLSubsampleLBDTest, + ::testing::ValuesIn(subsample_lbd_sizes_ssse3)); + +INSTANTIATE_TEST_SUITE_P(SSSE3, CFLPredictTest, + ::testing::ValuesIn(predict_sizes_ssse3)); + +#if CONFIG_AV1_HIGHBITDEPTH +const subsample_hbd_param subsample_hbd_sizes_ssse3[] = { + ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_ssse3, + cfl_get_luma_subsampling_422_hbd_ssse3, + cfl_get_luma_subsampling_444_hbd_ssse3) +}; + +const predict_param_hbd predict_sizes_hbd_ssse3[] = { ALL_CFL_TX_SIZES( + cfl_get_predict_hbd_fn_ssse3) }; + +INSTANTIATE_TEST_SUITE_P(SSSE3, CFLSubsampleHBDTest, + ::testing::ValuesIn(subsample_hbd_sizes_ssse3)); + +INSTANTIATE_TEST_SUITE_P(SSSE3, CFLPredictHBDTest, + ::testing::ValuesIn(predict_sizes_hbd_ssse3)); +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // HAVE_SSSE3 + +#if HAVE_AVX2 +const sub_avg_param sub_avg_sizes_avx2[] = { ALL_CFL_TX_SIZES( + cfl_get_subtract_average_fn_avx2) }; + +const subsample_lbd_param subsample_lbd_sizes_avx2[] = { + ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_avx2, + cfl_get_luma_subsampling_422_lbd_avx2, + cfl_get_luma_subsampling_444_lbd_avx2) +}; + +const predict_param predict_sizes_avx2[] = { ALL_CFL_TX_SIZES( + cfl_get_predict_lbd_fn_avx2) }; + +INSTANTIATE_TEST_SUITE_P(AVX2, CFLSubAvgTest, 
+ ::testing::ValuesIn(sub_avg_sizes_avx2)); + +INSTANTIATE_TEST_SUITE_P(AVX2, CFLSubsampleLBDTest, + ::testing::ValuesIn(subsample_lbd_sizes_avx2)); + +INSTANTIATE_TEST_SUITE_P(AVX2, CFLPredictTest, + ::testing::ValuesIn(predict_sizes_avx2)); + +#if CONFIG_AV1_HIGHBITDEPTH +const subsample_hbd_param subsample_hbd_sizes_avx2[] = { + ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_avx2, + cfl_get_luma_subsampling_422_hbd_avx2, + cfl_get_luma_subsampling_444_hbd_avx2) +}; + +const predict_param_hbd predict_sizes_hbd_avx2[] = { ALL_CFL_TX_SIZES( + cfl_get_predict_hbd_fn_avx2) }; + +INSTANTIATE_TEST_SUITE_P(AVX2, CFLSubsampleHBDTest, + ::testing::ValuesIn(subsample_hbd_sizes_avx2)); + +INSTANTIATE_TEST_SUITE_P(AVX2, CFLPredictHBDTest, + ::testing::ValuesIn(predict_sizes_hbd_avx2)); +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // HAVE_AVX2 + +#if HAVE_NEON +const sub_avg_param sub_avg_sizes_neon[] = { ALL_CFL_TX_SIZES( + cfl_get_subtract_average_fn_neon) }; + +const predict_param predict_sizes_neon[] = { ALL_CFL_TX_SIZES( + cfl_get_predict_lbd_fn_neon) }; + +const subsample_lbd_param subsample_lbd_sizes_neon[] = { + ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_neon, + cfl_get_luma_subsampling_422_lbd_neon, + cfl_get_luma_subsampling_444_lbd_neon) +}; + +INSTANTIATE_TEST_SUITE_P(NEON, CFLSubAvgTest, + ::testing::ValuesIn(sub_avg_sizes_neon)); + +INSTANTIATE_TEST_SUITE_P(NEON, CFLSubsampleLBDTest, + ::testing::ValuesIn(subsample_lbd_sizes_neon)); + +INSTANTIATE_TEST_SUITE_P(NEON, CFLPredictTest, + ::testing::ValuesIn(predict_sizes_neon)); + +#if CONFIG_AV1_HIGHBITDEPTH +const subsample_hbd_param subsample_hbd_sizes_neon[] = { + ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_neon, + cfl_get_luma_subsampling_422_hbd_neon, + cfl_get_luma_subsampling_444_hbd_neon) +}; + +const predict_param_hbd predict_sizes_hbd_neon[] = { ALL_CFL_TX_SIZES( + cfl_get_predict_hbd_fn_neon) }; + +INSTANTIATE_TEST_SUITE_P(NEON, CFLSubsampleHBDTest, + ::testing::ValuesIn(subsample_hbd_sizes_neon)); + +INSTANTIATE_TEST_SUITE_P(NEON, CFLPredictHBDTest, + ::testing::ValuesIn(predict_sizes_hbd_neon)); +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // HAVE_NEON + +#if HAVE_VSX +const sub_avg_param sub_avg_sizes_vsx[] = { ALL_CFL_TX_SIZES( + cfl_get_subtract_average_fn_vsx) }; + +INSTANTIATE_TEST_SUITE_P(VSX, CFLSubAvgTest, + ::testing::ValuesIn(sub_avg_sizes_vsx)); +#endif +} // namespace diff --git a/libs/libaom/src/test/clear_system_state.h b/libs/libaom/src/test/clear_system_state.h new file mode 100644 index 000000000..d38ff5dd5 --- /dev/null +++ b/libs/libaom/src/test/clear_system_state.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_TEST_CLEAR_SYSTEM_STATE_H_ +#define AOM_TEST_CLEAR_SYSTEM_STATE_H_ + +#include "config/aom_config.h" + +#if ARCH_X86 || ARCH_X86_64 +#include "aom_ports/x86.h" +#endif + +namespace libaom_test { + +// Reset system to a known state. This function should be used for all non-API +// test cases. 
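+// A typical SIMD test fixture calls it from its TearDown() hook so that +// x87/MMX state cannot leak between test cases, e.g.: +// void TearDown() override { libaom_test::ClearSystemState(); }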
+inline void ClearSystemState() { +#if ARCH_X86 || ARCH_X86_64 + aom_reset_mmx_state(); +#endif +} + +} // namespace libaom_test +#endif // AOM_TEST_CLEAR_SYSTEM_STATE_H_ diff --git a/libs/libaom/src/test/cnn_test.cc b/libs/libaom/src/test/cnn_test.cc new file mode 100644 index 000000000..4410493d3 --- /dev/null +++ b/libs/libaom/src/test/cnn_test.cc @@ -0,0 +1,2496 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <math.h> +#include <stdio.h> +#include <string.h> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/av1_rtcd.h" + +#include "av1/encoder/cnn.h" + +#define SQR(x) ((x) * (x)) + +// Best possible pixelwise guaranteed precision given each float has at most +// 3 specified decimals. +#define PIXELWISE_FLOAT_TOL 1E-2 + +#define MSE_FLOAT_TOL 1E-6 +#define MSE_INT_TOL 0 + +namespace { + +class CNNTest : public ::testing::Test { + protected: + static void RunCNNTest(int image_width, int image_height, const float *input, + const float *expected, const CNN_CONFIG *cnn_config, + int in_stride, CNN_THREAD_DATA *thread_data, + double tolerance) { + int out_width, out_height, out_channels; + av1_find_cnn_output_size(image_width, image_height, cnn_config, &out_width, + &out_height, &out_channels); + + const int out_size = out_width * out_height; + const int out_stride = out_width; + + float *output_ = + (float *)aom_malloc(sizeof(*output_) * out_size * out_channels); + float *output[CNN_MAX_CHANNELS] = { nullptr }; + for (int channel = 0; channel < out_channels; ++channel) { + output[channel] = output_ + (channel * out_size); + } + const int num_outputs = 1; + const int output_chs[1] = { out_channels }; + const int output_strides[1] = { out_stride }; + CNN_MULTI_OUT output_struct = { num_outputs, output_chs, output_strides, + output }; + + RunMultiOutCNNTest(&input, image_width, image_height, in_stride, cnn_config, + thread_data, &output_struct, &expected, tolerance); + + aom_free(output_); + } + + static void RunMultiOutCNNTest(const float **input, int image_width, + int image_height, int in_stride, + const CNN_CONFIG *cnn_config, + CNN_THREAD_DATA *thread_data, + CNN_MULTI_OUT *output, const float **expected, + double tolerance) { + const int num_outputs = output->num_outputs; + const int *output_chs = output->output_channels; + + int *out_widths = (int *)aom_calloc(num_outputs, sizeof(*out_widths)); + int *out_heights = (int *)aom_calloc(num_outputs, sizeof(*out_heights)); + int *not_used = (int *)aom_calloc(num_outputs, sizeof(*not_used)); + + av1_find_cnn_output_size(image_width, image_height, cnn_config, out_widths, + out_heights, not_used); + av1_cnn_predict(input, image_width, image_height, in_stride, cnn_config, + thread_data, output); + + int channel_offset = 0; + for (int output_idx = 0; output_idx < num_outputs; output_idx++) { + const float *expected_out = expected[output_idx]; + const int curr_output_chs = output_chs[output_idx]; + const int out_size = out_widths[output_idx] * out_heights[output_idx]; + + double mse = 0; + int expected_ite = 0; + for (int channel = 0;
channel < curr_output_chs; ++channel) { + const float *buf_out = output->output_buffer[channel_offset]; + + for (int i = 0; i < out_size; ++i) { + EXPECT_NEAR(expected_out[expected_ite], buf_out[i], + PIXELWISE_FLOAT_TOL) + << " output " << output_idx << " channel " << channel << " pixel " + << expected_ite % out_size << ": " << expected_out[expected_ite] + << "/" << buf_out[i] << std::endl; + mse += SQR(expected_out[expected_ite] - buf_out[i]); + expected_ite++; + } + + channel_offset++; + } + mse /= (out_size * curr_output_chs); + EXPECT_LE(mse, tolerance) << " output " << output_idx << std::endl; + } + + aom_free(out_widths); + aom_free(out_heights); + aom_free(not_used); + } + + static void AssignLayerWeightsBiases(CNN_CONFIG *cnn_config, float *weights, + float *bias) { + size_t weight_offset = 0; + size_t bias_offset = 0; + for (int layer = 0; layer < cnn_config->num_layers; ++layer) { + CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer]; + layer_config->weights = weights + weight_offset; + layer_config->bias = bias + bias_offset; + weight_offset += layer_config->filter_width * + layer_config->filter_height * layer_config->in_channels * + layer_config->out_channels; + bias_offset += layer_config->out_channels; + + ASSERT_NE(layer_config->weights, nullptr); + ASSERT_NE(layer_config->bias, nullptr); + } + } +}; + +} // namespace + +TEST_F(CNNTest, TestMultilayerConvolution) { + int image_height = 16; + int image_width = 16; + int filter_height = 5; + int filter_width = 4; + + float input[] = { + -3, 1, -3, 2, -2, -2, 2, -2, 1, -2, -3, 1, 2, 2, 2, -2, 0, 1, -1, + -3, -1, -1, 1, 0, -3, 1, 0, -1, 1, 0, 0, -3, -3, -3, 0, 2, 1, -1, + 2, 0, 1, -3, -1, 2, 2, 1, -2, 0, -1, 0, -2, -2, -1, 1, 0, 0, 0, + -2, -2, -2, 1, 1, -2, 1, 1, -2, -2, 1, -2, -1, -2, -3, 2, -3, -1, 1, + 0, -2, -2, -2, 1, -2, -2, -1, -1, 2, 2, 2, -1, 1, -3, -3, 0, 2, 0, + 2, 1, -3, -3, 1, 2, 2, 1, -2, -3, 0, -3, 0, -3, -2, 0, 1, 1, 0, + -3, 2, -1, 2, 1, 0, 1, -2, 1, -1, -1, 2, 0, -2, -3, 1, 1, -2, -1, + -3, -3, -1, 0, -3, -2, 0, 0, 1, 0, -3, -2, -1, 1, 0, 2, 1, 0, -3, + -2, -3, -3, -1, 0, -2, 2, -1, -3, 0, -1, -1, 2, 0, -3, -2, -1, 0, 0, + 1, -2, 1, 2, 1, 2, 2, -3, 2, -1, 0, 0, -1, 0, 2, 2, -1, 2, -2, + 1, 1, -3, -3, 1, -1, -1, -2, 2, -2, -2, 2, -1, -3, 2, -3, 1, -1, -1, + -3, 1, -1, 1, 0, -3, -3, 1, -3, -3, 0, 2, 2, -2, -1, 2, 0, 2, 1, + -1, -3, 0, 0, -1, -1, 1, 0, 2, 0, -3, 2, 1, 0, 1, -3, 2, -3, -3, + -1, -3, -3, 2, 0, 2, -2, 1, -1, + }; + + float weights[] = { + -2, 2, -2, 2, -1, -3, 2, 2, 0, 0, -3, -1, -2, -3, 1, -1, 0, 0, 0, + 2, -2, 2, -2, -3, 1, 1, 1, -3, -1, 0, 1, 2, -2, 0, -1, -3, -1, -2, + 2, -3, -3, 1, -2, -3, 0, 2, 1, -3, -3, -1, -3, -2, -1, -3, -1, -3, -2, + -1, -3, -1, -2, -2, -3, 2, 0, -3, 0, -3, -3, 1, -3, -1, 0, -1, 1, 1, + -1, 1, -2, 0, 2, 0, -3, 1, -1, -1, 2, 0, 1, -3, -3, 1, 2, -3, -3, + 1, -3, 2, 0, -3, 1, 2, 2, -2, -1, -2, 1, 1, 0, -2, -2, 1, 2, -1, + -3, 1, -2, 2, -3, -2, -3, 2, 1, 0, -2, 0, 1, -3, 2, -2, -2, 0, 2, + -3, 2, 0, 0, 1, -2, 1, 1, -2, -1, -2, 1, -2, 0, -2, -2, 0, -1, -1, + -3, -3, -3, 1, -3, -2, 2, -1, 2, 0, 2, -2, 2, -2, 1, -3, -3, -1, 0, + 2, 2, 1, -1, -3, -1, -3, 2, 1, -2, 0, -3, -1, -3, -1, 2, 1, 0, 2, + -1, 1, 0, 1, 2, -1, -2, 2, 1, -3, -1, -3, 0, 1, -2, 0, -2, -3, 0, + -2, 2, 2, 0, 0, 2, -3, 2, -3, -2, 1, 2, -3, -3, -1, -3, 0, -3, -3, + -2, -2, -2, 0, 0, 1, 0, 0, -1, 0, 0, -3, 0, -3, -1, -2, 1, -2, -1, + 2, -2, 0, 0, 1, 0, -2, -1, 0, -3, 1, 0, -1, -3, 1, -1, 1, -1, -3, + 1, 0, 1, 1, -1, 2, 2, 0, 0, 1, -3, 2, -2, -2, -3, -2, -1, -2, 2, + 0, 2, -2, -3, -1, -3, 2, 2, -1, 2, 
2, -1, 0, -3, 1, + }; + + float bias[] = { + 1, -1, 0, 1, 1, 1, -2, + }; + + float expected_same[] = { + -1125, 2926, 6406, 631, -1244, 97, -1454, 2526, 1065, 3292, 3464, + 2553, -330, 532, 1038, 1182, -402, 3758, 3392, 9854, 4365, 1408, + 4736, 3134, 3838, 2409, 3221, 4350, 6750, 4045, 815, 1188, 2959, + 9802, 9590, 4572, 5740, 4253, 1701, 7974, 7012, 6854, 7093, 3907, + 4539, 3886, 4267, 3505, 465, 7824, 9219, 10026, 7968, 957, 2295, + 5594, 10811, 9641, 5950, 10043, 8783, 3132, 1421, 1110, 4108, 13929, + 10660, -84, -61, 3932, -180, 6811, 13393, 15147, 15640, 9337, 6961, + 3808, 1604, 1398, 1047, 6739, 10144, 6517, 4698, 2678, 7389, 2595, + 5248, 12075, 11272, 13951, 8820, 1090, 2199, 2206, 2788, 12116, 6683, + 2612, -291, 3183, 9414, 12316, 14524, 12333, 13208, 7832, 4664, 4657, + 3534, 1298, -666, 4250, 7707, 9103, 5760, 688, 9571, 15782, 14203, + 14878, 17339, 14684, 8690, 5671, 875, 1429, 1531, 6173, 2984, 5558, + 2996, 7928, 6733, 16117, 15262, 12757, 7980, 3923, 4795, 5973, 2051, + 455, -1922, 1816, 5906, 3321, 10908, 10910, 7377, 12204, 12809, 11195, + 7451, 6666, 74, -1645, -35, -391, 3813, 7324, 892, 1656, 6095, + 12193, 14648, 12156, 14663, 10251, 10325, 7821, 3925, 323, 697, 442, + 1324, 4669, 7002, 5485, 5171, 5086, 10582, 11053, 9709, 11353, 8543, + 5256, 2873, 235, -628, 1496, 1878, -867, 3420, 6865, 5937, 10182, + 13277, 10069, 10789, 5998, 624, -2082, 4417, 1258, -1080, -819, -1430, + 1033, 5220, 6335, 8471, 8980, 11908, 14430, 12584, 8404, 1576, -803, + 985, 1481, 1367, -193, 873, 3684, 2288, 6676, 9477, 11155, 9602, + 9707, 10507, 4739, 3174, -575, -178, 3002, 1710, 423, -477, 554, + 3088, 2029, 5113, 5000, 3771, 6090, 5365, 1185, 2855, 399, -312, + -1577, 176, 955, + }; + + float expected_replicate[] = { + 13768, 13528, 12999, 6906, 4618, 4043, 2611, 9955, 6685, 4776, 2753, + 1036, 3063, 4544, 5183, 7349, 12451, 12501, 9131, 12753, 8908, 4058, + 6299, 7542, 7115, 3307, 3360, 3543, 9754, 7808, 5991, 9019, 14320, + 14919, 12492, 6871, 7373, 3336, 2085, 10604, 9377, 6882, 5009, 3103, + 6220, 6278, 7588, 10196, 11045, 11563, 11842, 11911, 8279, 2030, 1858, + 6368, 12123, 9909, 6347, 10345, 9365, 4038, 1673, 3051, 16492, 16649, + 12276, 408, -301, 4122, -654, 7864, 14038, 15279, 15315, 9744, 8243, + 5298, 746, 380, 9824, 9124, 10895, 6640, 4712, 2669, 6980, 2759, + 5385, 12345, 11336, 13129, 8600, 2370, 3682, 5219, 12407, 13123, 6784, + 2612, -291, 3183, 9414, 12316, 14524, 12333, 13397, 7543, 3916, 4153, + 4477, 4314, 7983, 8418, 9163, 9103, 5760, 688, 9571, 15782, 14203, + 14878, 17718, 14570, 7940, 6642, 5094, 7133, 9964, 10219, 3224, 5558, + 2996, 7928, 6733, 16117, 15262, 12757, 7958, 4401, 5187, 5476, 5529, + 6055, 2206, 3909, 6015, 3321, 10908, 10910, 7377, 12204, 12809, 11195, + 6967, 6840, 481, -1600, 274, 1, 10373, 8514, 1123, 2117, 6758, + 12736, 16223, 13585, 15988, 11771, 10600, 7918, 4156, 2840, 3111, 3287, + 6359, 7652, 8813, 6530, 6967, 7789, 13671, 13990, 13247, 13241, 9836, + 5251, 3024, 2313, 1834, 4187, 2637, -1312, 2139, 7378, 7665, 11933, + 15591, 15314, 15678, 9531, 2820, -1516, 3400, 1314, 22, 363, -2896, + -898, 5906, 7308, 10650, 12975, 16978, 20370, 18817, 12381, 4118, -861, + -137, 236, 1802, 1632, -350, 2334, 3400, 8680, 14064, 18216, 18675, + 21765, 22871, 11491, 4937, -1555, -11, 1669, 2392, 3265, -5254, -217, + 5001, 8063, 13444, 18884, 19706, 22794, 21064, 9545, 6689, -7, 289, + -2021, 504, 2347, + }; + + float expected_valid[] = { + 2612, -291, 3183, 9414, 12316, 14524, 12333, 9103, 5760, 688, + 9571, 15782, 14203, 14878, 5558, 2996, 7928, 
6733, 16117, 15262, + 12757, 3321, 10908, 10910, 7377, 12204, 12809, 11195, + }; + + CNN_CONFIG cnn_config = { 3, + 0, + 0, + 0, + 0, + { + { + 1, + filter_width, + filter_height, + 3, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + 3, + filter_width, + filter_height, + 3, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + 3, + filter_width, + filter_height, + 1, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + }, + } }; + + // Weights and biases need to be specified separately because + // of the offset. + AssignLayerWeightsBiases(&cnn_config, weights, bias); + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected_same, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + for (int i = 0; i < cnn_config.num_layers; ++i) { + cnn_config.layer_config[i].pad = PADDING_SAME_REPLICATE; + } + + RunCNNTest(image_width, image_height, input, expected_replicate, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + for (int i = 0; i < cnn_config.num_layers; ++i) { + cnn_config.layer_config[i].pad = PADDING_VALID; + } + + RunCNNTest(image_width, image_height, input, expected_valid, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestRELUSingleLayer) { + int image_width = 8; + int image_height = 8; + int filter_height = 5; + int filter_width = 4; + float input[] = { + 0, -2, -3, 1, -1, 2, -2, 1, -3, -1, 0, 1, -2, -3, -2, -2, + 1, -3, 2, -3, -1, -1, 2, 0, -2, -3, 0, -2, -3, 1, -1, -1, + 2, -2, 0, -2, -3, -3, 1, 1, -1, 1, 0, 1, -3, 0, 2, 2, + 0, -3, 1, -3, 2, -2, 1, -1, -1, -2, -3, -2, -1, -3, -2, -1, + }; + float expected_same[] = { + 9, 0, 1, 1, 0, 3, 0, 19, 0, 12, 10, 0, 0, 0, 5, 0, + 0, 18, 21, 7, 19, 4, 3, 0, 0, 9, 16, 0, 11, 16, 0, 11, + 12, 2, 0, 11, 0, 16, 6, 0, 8, 22, 13, 10, 12, 0, 0, 0, + 0, 1, 2, 12, 29, 6, 10, 0, 13, 0, 0, 5, 8, 10, 0, 0, + }; + float expected_replicate[] = { + 18, 17, 12, 2, 0, 0, 5, 11, 0, 17, 22, 6, 0, 0, 17, 0, + 0, 18, 21, 7, 19, 4, 3, 5, 3, 9, 16, 0, 11, 16, 0, 3, + 3, 2, 0, 11, 0, 16, 6, 0, 17, 22, 13, 10, 12, 0, 0, 0, + 0, 4, 1, 10, 30, 7, 10, 0, 23, 8, 0, 13, 15, 19, 8, 10, + }; + float expected_valid[] = { + 18, 21, 7, 19, 4, 9, 16, 0, 11, 16, 2, 0, 11, 0, 16, 22, 13, 10, 12, 0, + }; + float weights[] = { + -2, -3, 1, 2, 2, -2, -3, 0, -3, 2, 2, -3, -3, -2, 0, 1, 2, 0, -1, -1, + }; + float bias[] = { -3 }; + + CNN_CONFIG cnn_config = { 1, + 0, + 0, + 0, + 0, + { { + 1, + filter_width, + filter_height, + 1, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + RELU, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + } } }; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected_same, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].pad = PADDING_SAME_REPLICATE; + + RunCNNTest(image_width, image_height, input, expected_replicate, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].pad = PADDING_VALID; + + RunCNNTest(image_width, image_height, input, expected_valid, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestVaryingStridesVaryingDimImages) { + float weights[] = { + 1, -5, -3, -4, -1, 1, 2, -3, 2, 2, -1, 1, -5, 1, 1, + -3, -5, 3, 1, 4, -2, -5, -2, -3, -5, 0, 
-1, -5, 2, -2, + -2, 1, -2, -4, 1, 3, -2, 2, 0, -3, 2, -3, -2, -3, + }; + float bias[] = { 2 }; + + CNN_CONFIG cnn_config = { 1, + 0, + 0, + 0, + 0, + { + { + 1, + 4, + 11, + 1, + 7, + 6, + 0, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + }, + } }; + + int image_height = 24; + int image_width = 17; + float input[] = { + -1, -3, 4, 4, -5, 4, 3, -5, -1, -3, 4, -4, 2, -3, 3, -5, 2, -1, -5, + 1, -1, 3, 1, -3, -3, 4, 0, 2, -3, -5, -5, -4, 0, -5, -2, -3, -1, -2, + 2, -5, 4, 4, 0, -4, -3, 1, -3, -5, -4, -4, 1, -2, -3, 3, -3, -3, -1, + -5, -5, -2, 3, 1, -1, -5, -5, 1, -4, -2, -1, -2, -4, -4, 2, -2, 2, 1, + -2, -4, -1, 1, -2, -5, 3, -2, -1, -1, -5, -3, 1, -2, -2, -3, -1, -2, -4, + -2, 1, -4, -1, 4, 3, -4, 0, 4, 2, 2, 4, -3, -5, 2, 2, 1, -1, -4, + -2, 1, 3, 2, 0, 4, -1, -3, 2, 1, -4, 2, 2, -4, -2, 0, -2, -1, 4, + 4, 2, 3, -4, 2, -4, -5, 4, -1, -3, -1, 0, -4, 1, 3, -1, -3, -5, 3, + -2, -4, 1, 2, -2, -3, -3, -5, 1, -3, -1, 0, -1, 3, -4, -1, -5, -5, 1, + 0, 0, -2, -2, 2, -2, 0, 0, 2, 0, -3, 0, -1, -4, -4, -1, 3, -4, -4, + -1, 0, -5, -3, -2, 4, -3, -4, -4, 0, -5, 1, -2, -3, -3, -4, 4, 3, 4, + 3, 3, -1, 3, 1, -3, -2, 3, 3, 0, 2, -4, -3, 2, 2, 0, -2, 4, -2, + 2, -2, -1, -4, -2, 2, -4, 3, -1, 4, 1, 1, 4, -1, -4, -4, 1, 1, -2, + 4, -1, 3, 2, -3, 4, 3, 1, 4, 0, -4, 2, 0, 2, 4, -2, -2, 4, 2, + -1, -2, 1, -3, 2, 3, -5, -3, 4, 4, 2, -5, -4, -5, -2, -4, 2, 0, 2, + -5, 4, -4, -2, -5, 2, 1, 0, 4, 1, -2, -3, -4, -3, -4, 3, 3, 2, 0, + -3, 1, -5, 4, 0, 4, -1, 3, -5, -5, -2, -1, -1, 4, 3, 3, 4, 3, -4, + 4, -3, -3, -1, -4, -1, -4, -1, -2, 4, -2, -4, 4, 4, -3, -4, -1, 1, 2, + -1, -2, -2, 3, 2, 2, -3, 0, -1, 0, 3, 2, -5, 0, -4, 0, 0, 2, -4, + -1, -1, 0, -2, 0, 1, 0, 0, 4, -5, -1, -5, 2, -1, 0, 2, -1, 1, 3, + -3, -5, -2, -3, 4, -2, -2, -1, -3, -4, -1, -2, -4, 1, 4, -3, -2, -1, 3, + -3, -2, 3, 2, 1, -4, -3, -5, 1, + }; + float expected_1[] = { + 41, -26, 5, 76, 13, 83, -21, 53, -54, -14, 21, 121, + }; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected_1, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].skip_width = 6; + cnn_config.layer_config[0].skip_height = 7; + + float expected_2[] = { + 21, -50, 41, 20, 72, 127, -21, 103, 62, -37, 83, -3, + }; + RunCNNTest(image_width, image_height, input, expected_2, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].skip_width = 3; + cnn_config.layer_config[0].skip_height = 10; + + float expected_3[] = { + -26, -21, -35, 69, 49, 4, -51, -43, -56, + -41, 15, -44, 40, -62, 63, 38, 27, 47, + }; + RunCNNTest(image_width, image_height, input, expected_3, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].skip_width = 10; + cnn_config.layer_config[0].skip_height = 3; + + float expected_4[] = { + 21, 49, 28, 87, 50, 40, 102, 81, 58, 85, 51, 66, 36, 19, -37, -45, + }; + + RunCNNTest(image_width, image_height, input, expected_4, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestMaxPool) { + int image_width = 8; + int image_height = 8; + int stride = 3; + float input[] = { + 1, -4, -4, 8, 0, 7, -5, -2, 8, 2, 2, 8, 5, -1, -1, 9, + -3, 0, -2, 0, 6, 3, -4, 8, 7, 8, 7, -1, 4, -1, 0, 2, + -5, -2, 8, 5, 5, 4, 2, 7, 4, 6, 2, 8, 8, -4, -3, -4, + -3, -1, 2, 3, 3, 6, -5, 8, 9, 5, 0, -2, -1, 6, 5, 7, + }; + + float expected[] = { + 49, 58, 70, 68, 68, 70, 48, 57, 88, + }; + + float weights[] = { + 3, 1, 3, 4, -1, 5, -2, 1, -4, + }; + + float bias[] 
= { + -3, + }; + + CNN_CONFIG cnn_config = { 1, + 0, + 0, + 0, + 0, + { { + 1, + 3, + 3, + 1, + stride, + stride, + 1, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + } } }; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestDeconvolveNonActivationSingleLayerSingleKernel) { + int image_width = 4; + int image_height = 7; + float input[] = { + 9, 6, 181, 9, 218, 30, 80, 108, 68, 216, 70, 128, 179, 228, + 33, 212, 34, 14, 48, 27, 230, 23, 202, 113, 80, 56, 122, 112, + }; + + float expected_1_same[] = { + 15, -30, 36, -525, 377, -193, 558, 531, 6, -24, -15, 124, + 166, -561, -356, -754, -3, -3, -3, -3, -3, -3, -3, -3, + 433, -311, 711, 381, 247, -317, 453, 129, 215, -627, -409, -885, + 17, -255, -55, -647, -3, -3, -3, -3, -3, -3, -3, -3, + 133, -719, 633, -225, 785, 191, 463, 79, 65, 9, 77, -853, + -365, -949, -15, -667, -3, -3, -3, -3, -3, -3, -3, -3, + 355, -866, 990, 207, 747, 12, 520, -116, 176, -312, -133, -1370, + -426, -802, 143, -771, -3, -3, -3, -3, -3, -3, -3, -3, + 65, -79, 127, -59, 135, -90, 195, 114, 31, -91, -57, -133, + 17, -176, -72, -276, -3, -3, -3, -3, -3, -3, -3, -3, + 457, -302, 733, 58, 470, -475, 829, 490, 227, -670, -440, -790, + 153, -588, -294, -1150, -3, -3, -3, -3, -3, -3, -3, -3, + 157, -251, 349, -185, 409, -293, 587, 251, 77, -187, -107, -369, + 7, -481, -135, -827, -3, -3, -3, -3, -3, -3, -3, -3, + }; + float expected_1_valid[] = { + -30, 15, -30, 36, -525, 377, -193, 558, 531, 24, 24, 6, + 6, -24, -15, 124, 166, -561, -356, -754, -21, -39, -3, -3, + -3, -3, -3, -3, -3, -3, -3, -3, -3, -657, 433, -311, + 711, 381, 247, -317, 453, 129, 321, 321, 215, 215, -627, -409, + -885, 17, -255, -55, -647, -219, -435, -3, -3, -3, -3, -3, + -3, -3, -3, -3, -3, -3, -207, 133, -719, 633, -225, 785, + 191, 463, 79, 381, 381, 65, 65, 9, 77, -853, -365, -949, + -15, -667, -259, -515, -3, -3, -3, -3, -3, -3, -3, -3, + -3, -3, -3, -540, 355, -866, 990, 207, 747, 12, 520, -116, + 633, 633, 176, 176, -312, -133, -1370, -426, -802, 143, -771, -427, + -851, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, + -105, 65, -79, 127, -59, 135, -90, 195, 114, 78, 78, 31, + 31, -91, -57, -133, 17, -176, -72, -276, -57, -111, -3, -3, + -3, -3, -3, -3, -3, -3, -3, -3, -3, -693, 457, -302, + 733, 58, 470, -475, 829, 490, 336, 336, 227, 227, -670, -440, + -790, 153, -588, -294, -1150, -229, -455, -3, -3, -3, -3, -3, + -3, -3, -3, -3, -3, -3, -243, 157, -251, 349, -185, 409, + -293, 587, 251, 333, 333, 77, 77, -187, -107, -369, 7, -481, + -135, -827, -227, -451, + }; + float weights_1[] = { -3, 2, -1, 3, 3, 1, 1, -3, -2, -4 }; + float bias_1[] = { -3 }; + + CNN_CONFIG cnn_config = { 1, + 0, + 0, + 0, + 0, + { { + 1, + 5, + 2, + 1, + 2, + 3, + 0, + weights_1, + bias_1, + PADDING_SAME_ZERO, + NONE, + 1, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + } } }; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected_1_same, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + // Change padding to valid + cnn_config.layer_config[0].pad = PADDING_VALID; + + RunCNNTest(image_width, image_height, input, expected_1_valid, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + float expected_12_same[] = { + 15, -12, 6, 36, -9, -528, 377, -184, 513, 558, -12, 24, + 6, -30, -15, -33, -21, 166, 154, -546, -356, -718, -30, -21, + 433, -221, 561, 711, -33, 
-153, 247, -83, -87, 453, -111, 321, + 215, -657, -409, -845, -93, 17, -43, -243, -55, -215, -327, -219, + 133, -71, -447, 633, -219, 435, 785, -73, -177, 463, -131, 381, + 65, -207, 77, -59, -651, -365, -797, -213, -15, -155, -387, -259, + 355, -182, -150, 990, -231, 582, 747, -36, -540, 520, -215, 633, + 176, -540, -133, -491, -687, -426, -882, -102, 143, 77, -639, -427, + 65, -37, 57, 127, -17, -105, 135, -51, 60, 195, -30, 78, + 31, -105, -57, -125, -45, 17, -11, -147, -72, -168, -84, -57, + 457, -233, 618, 733, -26, -540, 470, -205, 264, 829, -116, 336, + 227, -693, -440, -900, -72, 153, 107, -609, -294, -698, -342, -229, + 157, -83, 69, 349, -59, -201, 409, -125, 27, 587, -115, 333, + 77, -243, -107, -267, -171, 7, -105, -369, -135, -379, -339, -227, + }; + float expected_12_valid[] = { + -30, 15, -12, 6, 36, -9, -528, 377, -184, 513, 558, -12, + 24, 24, 6, 6, -30, -15, -33, -21, 166, 154, -546, -356, + -718, -30, -21, -39, -657, 433, -221, 561, 711, -33, -153, 247, + -83, -87, 453, -111, 321, 321, 215, 215, -657, -409, -845, -93, + 17, -43, -243, -55, -215, -327, -219, -435, -207, 133, -71, -447, + 633, -219, 435, 785, -73, -177, 463, -131, 381, 381, 65, 65, + -207, 77, -59, -651, -365, -797, -213, -15, -155, -387, -259, -515, + -540, 355, -182, -150, 990, -231, 582, 747, -36, -540, 520, -215, + 633, 633, 176, 176, -540, -133, -491, -687, -426, -882, -102, 143, + 77, -639, -427, -851, -105, 65, -37, 57, 127, -17, -105, 135, + -51, 60, 195, -30, 78, 78, 31, 31, -105, -57, -125, -45, + 17, -11, -147, -72, -168, -84, -57, -111, -693, 457, -233, 618, + 733, -26, -540, 470, -205, 264, 829, -116, 336, 336, 227, 227, + -693, -440, -900, -72, 153, 107, -609, -294, -698, -342, -229, -455, + -243, 157, -83, 69, 349, -59, -201, 409, -125, 27, 587, -115, + 333, 333, 77, 77, -243, -107, -267, -171, 7, -105, -369, -135, + -379, -339, -227, -451, + }; + + // Change skip_width, skip_height to {2, 3} + cnn_config.layer_config[0].skip_width = 3; + cnn_config.layer_config[0].skip_height = 2; + // Set padding to same + cnn_config.layer_config[0].pad = PADDING_SAME_ZERO; + + RunCNNTest(image_width, image_height, input, expected_12_same, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + // Change padding to valid + cnn_config.layer_config[0].pad = PADDING_VALID; + RunCNNTest(image_width, image_height, input, expected_12_valid, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].filter_width = 4; + cnn_config.layer_config[0].filter_height = 3; + float weights_2[] = { -1, -3, -1, -3, 0, 2, -2, 4, 3, 0, 1, 4 }; + float bias_2[] = { -4 }; + cnn_config.layer_config[0].weights = weights_2; + cnn_config.layer_config[0].bias = bias_2; + + cnn_config.layer_config[0].skip_width = 5; + cnn_config.layer_config[0].skip_height = 2; + float expected_2_same[] = { + -13, -31, -13, -31, -4, -10, -22, -10, -22, -4, -185, -547, + -185, -547, -4, -13, -31, -13, -31, -4, -4, 14, -22, 32, + -4, -4, 8, -16, 20, -4, -4, 358, -366, 720, -4, -4, + 14, -22, 32, -4, -195, -658, -213, -622, -4, -16, -94, -28, + -70, -4, 459, -244, 97, 480, -4, -85, -328, -103, -292, -4, + -4, 432, -440, 868, -4, -4, 56, -64, 116, -4, -4, 156, + -164, 316, -4, -4, 212, -220, 428, -4, 582, -208, 146, 664, + -4, -130, -652, -190, -532, -4, 166, -214, 6, 106, -4, 192, + -388, -24, 44, -4, -4, 132, -140, 268, -4, -4, 428, -436, + 860, -4, -4, 136, -144, 276, -4, -4, 252, -260, 508, -4, + 21, -541, -115, -269, -4, 416, -688, -16, 176, -4, 173, -103, + 33, 177, -4, 168, -640, -88, -128, -4, -4, 354, -362, 712, + 
-4, -4, 452, -460, 908, -4, -4, 62, -70, 128, -4, -4, + 420, -428, 844, -4, 499, -106, 141, 610, -4, 666, -46, 210, + 866, -4, 47, -148, -19, -16, -4, 605, -85, 181, 763, -4, + -4, 64, -72, 132, -4, -4, 24, -32, 52, -4, -4, 92, + -100, 188, -4, -4, 50, -58, 104, -4, -132, -694, -200, -558, + -4, 15, -73, -13, -17, -4, -62, -610, -158, -418, -4, -36, + -343, -90, -235, -4, -4, 456, -464, 916, -4, -4, 42, -50, + 88, -4, -4, 400, -408, 804, -4, -4, 222, -230, 448, -4, + 606, -244, 146, 676, -4, 9, -172, -37, -80, -4, 480, -370, + 76, 438, -4, 223, -340, -3, 112, -4, -4, 156, -164, 316, + -4, -4, 108, -116, 220, -4, -4, 240, -248, 484, -4, -4, + 220, -228, 444, -4, + }; + float expected_2_valid[] = { + -13, -31, -13, -31, -4, -10, -22, -10, -22, -4, -185, -547, + -185, -547, -4, -13, -31, -13, -31, -4, 14, -22, 32, -4, + -4, 8, -16, 20, -4, -4, 358, -366, 720, -4, -4, 14, + -22, 32, -195, -658, -213, -622, -4, -16, -94, -28, -70, -4, + 459, -244, 97, 480, -4, -85, -328, -103, -292, -4, 432, -440, + 868, -4, -4, 56, -64, 116, -4, -4, 156, -164, 316, -4, + -4, 212, -220, 428, 582, -208, 146, 664, -4, -130, -652, -190, + -532, -4, 166, -214, 6, 106, -4, 192, -388, -24, 44, -4, + 132, -140, 268, -4, -4, 428, -436, 860, -4, -4, 136, -144, + 276, -4, -4, 252, -260, 508, 21, -541, -115, -269, -4, 416, + -688, -16, 176, -4, 173, -103, 33, 177, -4, 168, -640, -88, + -128, -4, 354, -362, 712, -4, -4, 452, -460, 908, -4, -4, + 62, -70, 128, -4, -4, 420, -428, 844, 499, -106, 141, 610, + -4, 666, -46, 210, 866, -4, 47, -148, -19, -16, -4, 605, + -85, 181, 763, -4, 64, -72, 132, -4, -4, 24, -32, 52, + -4, -4, 92, -100, 188, -4, -4, 50, -58, 104, -132, -694, + -200, -558, -4, 15, -73, -13, -17, -4, -62, -610, -158, -418, + -4, -36, -343, -90, -235, -4, 456, -464, 916, -4, -4, 42, + -50, 88, -4, -4, 400, -408, 804, -4, -4, 222, -230, 448, + 606, -244, 146, 676, -4, 9, -172, -37, -80, -4, 480, -370, + 76, 438, -4, 223, -340, -3, 112, -4, 156, -164, 316, -4, + -4, 108, -116, 220, -4, -4, 240, -248, 484, -4, -4, 220, + -228, 444, 236, -4, 76, 316, -4, 164, -4, 52, 220, -4, + 362, -4, 118, 484, -4, 332, -4, 108, 444, + }; + // Set padding to same + cnn_config.layer_config[0].pad = PADDING_SAME_ZERO; + + RunCNNTest(image_width, image_height, input, expected_2_same, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].pad = PADDING_VALID; + + RunCNNTest(image_width, image_height, input, expected_2_valid, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].skip_width = 2; + cnn_config.layer_config[0].skip_height = 5; + float expected_21_same[] = { + -31, -19, -49, -191, -565, -194, -574, -13, 14, -22, 44, -16, + 382, -366, 738, -22, -4, 23, 32, 545, 20, 204, 720, 5, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -658, -252, -748, -114, -334, -192, -568, -112, + 432, -440, 928, -64, 276, -164, 532, -220, -4, 304, 868, 266, + 116, 400, 316, 104, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -208, -288, -856, -290, + -862, -202, -598, -132, 132, -140, 700, -436, 1000, -144, 532, -260, + -4, 712, 268, 422, 860, 450, 276, 124, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -541, -411, -1225, -265, -787, -249, -739, -216, 354, -362, 1168, -460, + 974, -70, 552, -428, -4, 859, 712, 323, 908, 665, 128, 208, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -106, -52, -148, -66, -190, -79, -229, -31, + 64, -72, 160, -32, 148, -100, 242, -58, -4, 72, 132, 154, + 52, 125, 188, 
23, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -694, -257, -763, -229, + -679, -319, -949, -117, 456, -464, 962, -50, 492, -408, 1030, -230, + -4, 295, 916, 625, 88, 537, 804, 109, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -244, -140, -412, -182, -538, -238, -706, -116, 156, -164, 428, -116, + 464, -248, 708, -228, -4, 244, 316, 418, 220, 454, 484, 108, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, + }; + float expected_21_valid[] = { + -13, -31, -19, -49, -191, -565, -194, -574, -13, -31, -4, 14, + -22, 44, -16, 382, -366, 738, -22, 32, 23, -4, 23, 32, + 545, 20, 204, 720, 5, 32, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -222, -658, -252, -748, -114, -334, -192, -568, -112, -328, + -4, 432, -440, 928, -64, 276, -164, 532, -220, 428, 650, -4, + 304, 868, 266, 116, 400, 316, 104, 428, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -72, -208, -288, -856, -290, -862, -202, -598, + -132, -388, -4, 132, -140, 700, -436, 1000, -144, 532, -260, 508, + 200, -4, 712, 268, 422, 860, 450, 276, 124, 508, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -183, -541, -411, -1225, -265, -787, + -249, -739, -216, -640, -4, 354, -362, 1168, -460, 974, -70, 552, + -428, 844, 533, -4, 859, 712, 323, 908, 665, 128, 208, 844, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -38, -106, -52, -148, + -66, -190, -79, -229, -31, -85, -4, 64, -72, 160, -32, 148, + -100, 242, -58, 104, 98, -4, 72, 132, 154, 52, 125, 188, + 23, 104, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -234, -694, + -257, -763, -229, -679, -319, -949, -117, -343, -4, 456, -464, 962, + -50, 492, -408, 1030, -230, 448, 686, -4, 295, 916, 625, 88, + 537, 804, 109, 448, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -84, -244, -140, -412, -182, -538, -238, -706, -116, -340, -4, 156, + -164, 428, -116, 464, -248, 708, -228, 444, 236, -4, 244, 316, + 418, 220, 454, 484, 108, 444, + }; + + cnn_config.layer_config[0].pad = PADDING_SAME_ZERO; + + RunCNNTest(image_width, image_height, input, expected_21_same, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].pad = PADDING_VALID; + + RunCNNTest(image_width, image_height, input, expected_21_valid, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestLargeKernelsAndStrides) { + float input_10x11[] = { + 4, 4, 2, 4, 2, -5, -2, 3, -1, 0, 0, 1, 2, 0, -5, -2, -5, 1, -3, + -1, 4, -3, 2, -2, 1, 0, 1, -3, -3, -4, -2, -2, 1, -4, -1, 4, 1, -4, + -4, -4, 3, 2, -5, 3, -5, 1, 2, -4, 1, -1, 3, 4, -2, 3, -3, 3, 0, + 2, -4, -5, -5, -2, -1, -2, 1, 1, 1, -2, 4, -5, 4, -1, -1, 2, 3, -4, + 2, 2, 3, 0, 0, 1, 0, 3, 2, 3, 1, -2, 3, -4, 3, 2, 4, -2, 0, + 4, -4, 1, -3, -3, -3, -5, 1, -3, -5, 0, 4, -1, -3, 2, + }; + + float weights_10x11[] = { + -3, 4, -4, -3, -5, 1, -2, 3, 1, -4, -4, 0, -1, 0, 3, 1, -3, -2, 0, + -1, 1, 3, -4, -4, -3, -3, -2, 4, 3, -5, 4, 2, -3, 4, -2, -1, 2, -1, + -5, 0, -3, 0, 3, -5, -5, 3, -4, -1, -5, 3, 4, 0, 4, -5, 2, -1, 2, + -1, -1, -1, -5, 0, -4, 3, -1, 1, 1, -1, 3, 2, -5, -4, 0, -4, 4, -5, + -3, 4, -5, 2, -5, -4, -4, -1, 3, 3, 0, 2, -4, 1, -2, 1, 1, 0, 3, + -2, 0, 1, 2, 4, -3, -1, -5, -5, 2, -4, 1, 1, 2, -4, -2, -2, 2, 1, + 3, 4, -5, 1, -1, -3, -3, -1, -2, -5, 1, -1, 0, 1, 4, 4, 0, 0, 4, + -3, -1, -5, -3, 0, 1, 1, 1, -5, 3, 4, 3, -5, 3, -2, -2, 0, -4, 
0, + 0, -2, 1, -4, -1, 0, -5, -2, -2, -5, -3, -3, 1, 1, -3, 2, 4, 2, 4, + -4, -3, 3, 1, 1, 3, -4, 4, -2, -3, -3, -3, -3, -4, -2, 3, -5, 2, 4, + -1, -4, -4, 4, -2, -1, 3, -3, -4, -4, -2, 4, 1, 0, 2, -1, 4, -3, 1, + 4, -3, 4, 4, 0, -4, 3, -2, -3, 2, 3, -1, -3, 2, 1, 4, -2, -3, 1, + 4, -2, 2, -2, -5, -2, 1, 4, -1, -4, 4, -5, 2, -5, -4, -1, -2, 3, 1, + 2, 1, -5, 1, -5, -4, -1, -2, 2, -2, -4, -3, -2, -2, 4, -1, 2, 2, -4, + 2, -2, 4, -4, -2, -2, 1, -1, 1, 1, 1, -4, -5, -2, 3, -4, -1, 3, -2, + 3, 2, -5, -4, 0, 3, -2, -4, -5, 3, -2, -4, 2, -2, 1, -4, 0, 2, -5, + 1, -4, -1, -1, 4, -5, -4, 0, -5, -4, -3, -5, -4, 0, 2, 0, -4, 2, -2, + 1, 1, -3, 2, 0, -4, 0, -4, 1, 0, -5, -1, -1, -1, -5, 4, 2, 2, -4, + 3, -2, -2, 2, -3, -2, -1, 2, -4, -5, 2, -2, -4, -5, -5, -1, 2, -1, 0, + -5, -2, -2, -5, 0, 1, -1, -5, 0, 3, 2, 3, 0, -3, -2, 0, -5, -1, -2, + 2, -4, -1, 2, 2, -5, 2, -4, 0, 3, -3, 1, 0, 0, 1, -5, -3, 1, -1, + 0, -4, -3, 2, -4, -4, 4, -1, 0, 1, 2, -4, -5, 4, -2, 1, -4, -4, -3, + -1, -1, 1, -1, -4, -1, -4, -3, 2, -1, -2, -4, 1, 1, 0, -2, 0, -4, 3, + -3, 0, -4, -1, -4, 2, -1, -2, -5, -1, -2, -3, 3, -1, 0, -3, 0, 1, -5, + 1, -5, 0, 1, + }; + + float bias_10x11[] = { 3 }; + + float expected_10x11[] = { + 118, + }; + + CNN_CONFIG cnn_config = { 1, + 0, + 0, + 0, + 0, + { { + 1, + 23, + 20, + 1, + 15, + 20, + 0, + weights_10x11, + bias_10x11, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + } } }; + + int image_height = 10; + int image_width = 11; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input_10x11, expected_10x11, + &cnn_config, image_width, &thread_data, MSE_INT_TOL); + + float input_11x10[] = { + -2, -2, 3, -5, -1, -3, 1, 3, 2, 1, 1, -5, 4, 1, 3, -5, 3, -3, -5, + 0, -1, -3, -3, 1, 1, -5, -1, -5, -5, -3, 0, 1, -3, -1, -3, -3, 0, 3, + 4, -4, -1, 3, -3, -1, -3, 1, -3, -2, -1, -4, -3, 2, -4, 1, -4, -1, -3, + -5, -1, 2, 3, 0, 2, 2, -5, 4, 1, 2, -1, -4, 4, -4, -4, 0, -1, 1, + -1, 1, -3, -3, -2, 1, 2, 4, 4, 4, -3, -3, 0, 1, 0, 1, 4, 1, 3, + 4, -3, -2, -4, 4, 2, 0, 3, 4, -1, 2, -2, 1, -3, -2, + }; + + float weights_11x10[] = { + 4, -1, 1, -1, 2, 4, 3, 3, -4, 3, -5, 1, -1, -1, -2, -2, 0, 2, -3, + -2, 3, -5, -1, 0, -1, -2, -2, -1, 2, 4, 3, 1, 0, 0, -3, 3, -4, -1, + -5, 4, -2, -2, 1, 2, -1, -3, 1, 2, -5, 1, -3, 3, 3, 0, -4, -4, -5, + -3, -4, -4, 4, -2, 4, 4, -2, 2, -5, -1, -2, -5, -1, 4, -3, 3, -2, 0, + -4, -3, 0, -1, -2, 4, 2, 0, -2, -5, -4, 1, 4, -4, -2, 2, -2, 1, 1, + -4, 1, -4, -4, -2, 4, 2, -1, -5, -5, 1, -3, -3, 3, -3, -5, -3, 4, -1, + -1, -3, 0, -4, 3, -1, 0, -2, 0, -5, -2, -5, 2, 0, -5, 2, 3, -2, 2, + 4, -1, 1, -3, 2, 3, 2, 0, -5, -4, -5, 2, 1, 1, -1, -2, 3, 4, 2, + -2, 4, -2, 3, 1, -4, -3, -1, 4, 4, -3, -5, -2, 2, 0, 3, -2, 3, -1, + -4, 0, -2, 0, 3, 4, -2, -3, -2, 0, 3, 4, 2, -4, 0, 1, 2, 2, -1, + -1, 4, 1, 4, -2, -1, -1, -5, 1, -3, 3, 3, -1, -4, 3, -5, 0, 0, -1, + -4, -1, -2, 4, -2, 3, 3, -3, 1, -1, 2, -1, 4, 4, -2, -2, 4, -2, 0, + 3, -3, -5, -1, -2, 4, -4, 2, -4, 0, -2, 3, -3, 2, 2, -2, -5, -1, 4, + 3, -2, -1, 3, 3, -1, 3, 0, -3, 0, 4, 2, 0, -1, 4, 1, 1, 2, 1, + 3, 1, 1, 1, -3, -5, -4, 4, -4, 2, 0, 0, -4, 1, 4, -5, 4, 4, 0, + 1, 0, -2, -4, -4, -3, 0, 1, -5, 4, 0, -3, -2, -4, 2, 4, 1, -5, 1, + -4, 1, 0, -3, -3, 0, 2, -5, 4, 3, -2, -5, 3, 1, -1, 0, 3, -2, -2, + 3, -2, -5, 4, 1, -2, 2, -1, 0, 4, 0, -5, 3, -2, 1, 2, 1, -5, -3, + -2, -5, 4, -4, 0, 3, 2, -1, -4, -1, 2, 1, -2, 3, -1, -4, 2, 0, -3, + 1, -1, 2, -5, -4, -1, -5, 1, 4, 3, 4, 2, -3, 1, -5, -1, 3, 0, -1, + -4, 3, 4, -5, 4, 4, -3, 2, -3, -1, -3, -5, -3, 2, -3, -2, 1, 
1, 0, + -5, 3, 2, 1, -5, 1, 1, 1, 3, 4, -4, -1, -2, 0, -5, -3, -5, -2, -4, + 3, 3, 3, 4, 0, -4, -1, -5, 0, -3, 1, 4, 4, -4, 4, -5, -5, -1, -2, + -5, 3, -4, 4, 3, 0, -3, 2, -2, 0, 0, 4, 4, 0, -2, 1, -1, -3, 2, + -1, 1, -3, -5, + }; + + float bias_11x10[] = { + -5, + }; + + float expected_11x10[] = { + 36, -84, 95, 45, 18, 46, 77, -54, -99, -149, 66, 49, 161, 11, + 39, 61, -66, 61, 4, -3, 34, -44, -23, 31, 64, 29, 47, 72, + -27, -27, 121, -3, 100, 1, 30, -78, -12, -89, -59, 8, -16, 112, + 91, -102, -26, -4, 30, 54, 4, -84, -24, -58, 27, -53, -33, 5, + 53, -26, 63, 50, -103, -130, -23, 6, -104, -207, 73, 23, 77, 132, + 38, 32, -130, -44, -60, 7, 27, 176, 45, -32, -2, 99, -97, 63, + 69, 126, 47, 63, 136, -57, 5, 16, -40, -157, 8, 38, -44, -10, + 91, 7, 122, 140, 30, -105, 4, -1, 113, 64, 180, 141, + }; + + cnn_config.layer_config[0].weights = weights_11x10; + cnn_config.layer_config[0].bias = bias_11x10; + cnn_config.layer_config[0].filter_width = 20; + cnn_config.layer_config[0].filter_height = 23; + cnn_config.layer_config[0].skip_width = 1; + cnn_config.layer_config[0].skip_height = 1; + image_height = 11; + image_width = 10; + + RunCNNTest(image_width, image_height, input_11x10, expected_11x10, + &cnn_config, image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestSoftsignSingleLayer) { + int image_width = 8; + int image_height = 8; + int filter_height = 5; + int filter_width = 4; + float input[] = { + -0.5220f, 0.8410f, -0.8990f, -0.0090f, 0.6710f, -0.9470f, -0.8240f, + -0.0870f, 0.5380f, 0.4750f, 0.570f, -0.3760f, -0.6960f, -0.5940f, + -0.3830f, 0.080f, -0.0980f, -0.4940f, -0.4030f, 0.9460f, -0.6020f, + 0.4220f, 0.6190f, 0.6640f, -0.9210f, -0.1470f, -0.2480f, -0.1120f, + -0.580f, -0.0650f, 0.3330f, 0.9860f, -0.7430f, 0.7610f, 0.4840f, + 0.1030f, 0.9570f, 0.6120f, -0.5240f, -0.1220f, -0.5850f, -0.270f, + 0.7840f, -0.9790f, 0.7290f, -0.30f, -0.6460f, 0.0780f, 0.4750f, + -0.0510f, 0.4550f, 0.3850f, -0.7230f, 0.4460f, -0.6260f, -0.810f, + 0.8720f, -0.2120f, -0.580f, -0.9510f, -0.8430f, -0.1340f, -0.0850f, + 0.9190f, + }; + float expected_same[] = { + 0.430f, 0.660f, 0.5510f, -0.610f, 0.450f, -0.1610f, 0.0520f, 0.3240f, + 0.6820f, 0.3820f, 0.6360f, 0.7480f, 0.3080f, 0.090f, 0.3910f, 0.1730f, + 0.340f, 0.6660f, -0.4990f, 0.4280f, 0.1540f, 0.120f, 0.4670f, 0.6150f, + -0.3880f, 0.7590f, 0.4190f, 0.7350f, 0.5310f, -0.5160f, -0.1760f, 0.6790f, + -0.6780f, 0.5470f, 0.5750f, -0.6420f, 0.7210f, -0.4620f, 0.5430f, 0.770f, + -0.1990f, 0.3950f, 0.7860f, -0.4380f, 0.7540f, 0.2640f, -0.6430f, 0.4510f, + -0.1260f, 0.1590f, -0.2110f, -0.0560f, 0.6570f, 0.680f, 0.5870f, 0.4720f, + 0.4040f, 0.3630f, 0.670f, 0.2360f, 0.410f, 0.6980f, -0.5350f, 0.3940f, + }; + float expected_replicate[] = { + 0.540f, 0.7230f, -0.3530f, -0.2130f, 0.7440f, -0.4470f, -0.6260f, + -0.2050f, 0.7230f, 0.4630f, 0.5920f, 0.7440f, 0.6080f, 0.3130f, + -0.5670f, -0.4720f, 0.5480f, 0.6660f, -0.4990f, 0.4280f, 0.1540f, + 0.120f, 0.3390f, 0.6090f, 0.4160f, 0.7590f, 0.4190f, 0.7350f, + 0.5310f, -0.5160f, -0.490f, 0.4450f, -0.610f, 0.5470f, 0.5750f, + -0.6420f, 0.7210f, -0.4620f, 0.3150f, 0.7370f, -0.5820f, 0.3950f, + 0.7860f, -0.4380f, 0.7540f, 0.2640f, -0.7430f, -0.5340f, -0.6270f, + 0.4430f, 0.4730f, 0.4570f, 0.7450f, 0.630f, 0.2620f, 0.3140f, + -0.1840f, 0.1810f, 0.7210f, 0.2760f, 0.6430f, 0.6720f, -0.4390f, + 0.2040f, + }; + float expected_valid[] = { + 0.6660f, -0.4990f, 0.4280f, 0.1540f, 0.120f, 0.7590f, 0.4190f, + 0.7350f, 0.5310f, -0.5160f, 0.5470f, 0.5750f, -0.6420f, 0.7210f, + -0.4620f, 0.3950f, 0.7860f, -0.4380f, 0.7540f, 
0.2640f, + }; + float weights[] = { + 0.6210f, 0.3710f, -0.2770f, -0.7230f, -0.2450f, 0.6770f, 0.3080f, + -0.9880f, -0.080f, 0.7190f, -0.6760f, -0.0170f, -0.8970f, 0.8260f, + 0.7390f, -0.4550f, -0.4260f, -0.6330f, 0.0880f, -0.9390f, + }; + float bias[] = { + 0.750f, + }; + + CNN_CONFIG cnn_config = { 1, + 0, + 0, + 0, + 0, + { { + 1, + filter_width, + filter_height, + 1, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + SOFTSIGN, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + } } }; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected_same, &cnn_config, + image_width, &thread_data, MSE_FLOAT_TOL); + + cnn_config.layer_config[0].pad = PADDING_SAME_REPLICATE; + + RunCNNTest(image_width, image_height, input, expected_replicate, &cnn_config, + image_width, &thread_data, MSE_FLOAT_TOL); + + cnn_config.layer_config[0].pad = PADDING_VALID; + + RunCNNTest(image_width, image_height, input, expected_valid, &cnn_config, + image_width, &thread_data, MSE_FLOAT_TOL); +} + +TEST_F(CNNTest, TestBranchTensorAdd) { + int filter_width = 2; + int filter_height = 3; + + int image_width = 4; + int image_height = 4; + + float input[] = { + -3, -2, -2, 0, -1, 3, 2, -2, 1, 3, 4, 0, 2, -5, -4, 0, + }; + + float weights[] = { + -3, -1, 4, -1, -3, 3, 3, 0, 2, 0, 3, 2, 4, 4, 4, -5, 1, -4, + 2, -4, 1, -3, 0, 4, -5, 4, 0, -4, -3, -1, 0, 0, -2, 0, 0, 2, + -5, -1, 1, -3, 3, 4, 3, 0, 1, -1, 1, 1, 2, 4, -2, -5, 2, -2, + 3, -2, 4, -1, 0, 2, 3, 2, -2, -1, -3, 1, 3, 4, -1, -3, 0, -4, + 4, 2, -3, -3, -1, 0, 1, 0, 3, 3, -3, 0, 3, 2, -5, -3, 4, -5, + 3, -1, -1, -3, 0, 1, -1, -4, 2, 4, -1, 4, -1, 1, 3, 4, 4, 4, + 0, -1, -3, -3, -3, -3, 2, -3, -2, 2, 3, -3, + }; + + float bias[] = { + 3, 4, -1, -1, 2, 1, -2, 1, 4, 1, 3, + }; + + float expected[] = { + -11502, -4101, -3424, 668, -17950, -5470, -5504, 626, + 4835, 446, 1779, -3483, 3679, -4214, 4578, -105, + }; + + int channels = 2; + + CNN_CONFIG cnn_config = { 6, + 0, + 0, + 0, + 0, + { { + 1, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_INPUT, + BRANCH_NOC, + { + 0x02, + 0, + 0x00, + }, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 1, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 1, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_ADD, + { + 0x00, + 0, + 0x02, + }, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + 1, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + } } }; + + // Weights and biases need to be specified separately because + // of the offset. 
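+ // AssignLayerWeightsBiases() slices the flat arrays above per layer: each + // layer consumes filter_width * filter_height * in_channels * out_channels + // weights and out_channels biases, starting where the previous layer ended.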
+ AssignLayerWeightsBiases(&cnn_config, weights, bias); + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestBranchTensorConcatenation) { + int filter_width = 2; + int filter_height = 3; + + int image_width = 4; + int image_height = 4; + + float input[] = { + -3, -2, -2, 0, -1, 3, 2, -2, 1, 3, 4, 0, 2, -5, -4, 0, + }; + + float weights[] = { + 3, 0, 2, 0, 2, 3, 1, -3, 1, -5, -3, 0, -4, 4, 0, -5, 0, -5, -1, + -2, -5, 0, -3, 2, -4, 2, 0, 2, -1, 0, -4, 3, 0, 0, -1, -5, 2, -1, + 4, -4, -2, -3, -3, 3, 4, -2, -1, -4, -1, 4, 4, -1, 4, 3, -4, 2, -2, + -4, -3, -2, 3, -3, -5, -1, 3, -2, 4, 1, -4, -3, -5, -5, -3, 4, -2, -2, + -1, -5, -5, 0, -1, -2, -3, 3, -4, -5, 2, -3, 1, 0, -5, 2, 2, -2, 0, + 2, 2, -2, 4, 2, 2, 0, 1, -5, -3, 0, 2, -2, 1, 2, -5, 2, 3, 3, + -1, 3, 0, -3, 3, -4, -4, 3, 3, -4, -2, 2, -2, 2, -2, -1, 3, 0, + }; + + float bias[] = { + -3, -5, 4, -4, -3, -2, 0, 3, -4, 4, -3, + }; + + float expected[] = { + -33533, -32087, -6741, -2124, 39979, 41453, 14034, 689, + -22611, -42203, -14882, -239, 15781, 15963, 9524, 837, + }; + + int channels = 2; + + CNN_CONFIG cnn_config = { 6, + 0, + 0, + 0, + 0, + { { + 1, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_INPUT, + BRANCH_NOC, + { + 0x02, + 0, + 0x00, + }, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 1, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 1, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_CAT, + { + 0x00, + 0, + 0x02, + }, + {}, + -1, + }, + { + channels + channels, + filter_width, + filter_height, + 1, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + } } }; + + // Weights and biases need to be specified separately because + // of the offset. + AssignLayerWeightsBiases(&cnn_config, weights, bias); + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +// TODO(logangw): Add test to test all combinations of branch_copy_type. 
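+ +// The three-element initializers such as { 0x06, 0, 0x00 } below fill in a +// layer's branch config. The first and last entries are bitmasks over branch +// indices (bit k selects branch k): 0x06 routes a copy of the tensor to +// branches 1 and 2, while a mask like 0x0C names branches 2 and 3 as the ones +// whose outputs are combined back into the current layer.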
+ +TEST_F(CNNTest, TestBranchCombinations) { + int filter_width = 2; + int filter_height = 3; + + int image_width = 4; + int image_height = 4; + + float input[] = { + 3, 2, -5, -4, 4, -2, -4, -3, 4, 2, -3, 2, -3, 1, -5, -1, + }; + + float weights[] = { + 2, 3, 0, 4, 4, 3, 1, 0, 1, -5, 4, -3, 3, 0, 4, -1, -1, -5, + 2, 1, -3, -5, 3, -1, -3, -2, 0, -2, 3, 0, -2, -4, -2, -2, 2, -5, + 4, -5, 0, 1, -5, -4, -3, -4, 2, -2, 1, 0, 3, -2, -4, 3, 4, -4, + -1, -1, -3, -2, -2, -1, 2, 0, 2, -1, 2, -4, -4, -1, 2, 0, 3, -2, + -2, 3, -3, 4, -2, 4, 3, 4, 1, 0, -2, -3, -5, 1, -3, 2, 0, -2, + -2, -1, -1, -5, -2, -3, -1, 3, 3, 4, 4, 0, 2, 1, 3, -3, 2, -5, + -5, 1, -5, -1, 3, 3, 2, -4, -1, 3, -4, -2, -5, -2, 1, 3, 2, 2, + -5, -2, -3, -1, -2, -4, -1, -2, 2, 1, -4, -4, 2, 0, 2, 0, 2, -3, + -2, -4, 4, 0, 1, -3, -5, 4, -1, 2, 3, -5, -1, 0, 4, -1, -1, 3, + -1, -3, 3, 1, 4, 3, 4, 3, -4, -5, -1, 3, 3, -4, 3, 1, 3, -5, + 3, 4, -5, 4, 2, -1, -5, 2, 1, 0, 4, 0, -3, 2, 0, 2, -2, 1, + -1, -2, -1, -5, 4, 3, 3, -2, 2, 4, -5, -5, -3, -2, 4, 0, -4, 1, + }; + + float bias[] = { + -1, 4, 0, 2, 2, -2, 0, -4, -5, -1, 1, -2, 3, 0, 4, -2, 1, 0, 0, + }; + + float expected[] = { + 149496, 15553, -24193, -20956, 134094, 86432, -68283, -6366, + -53031, 133739, 67407, -13539, -53205, -58635, -20033, 1979, + }; + + int channels = 2; + + CNN_CONFIG cnn_config = { 10, + 0, + 0, + 0, + 0, + { + { + 1, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_INPUT, + BRANCH_NOC, + { + 0x06, + 0, + 0x00, + }, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 2, + BRANCH_OUTPUT, + BRANCH_NOC, + { + 0x08, + 0, + 0x00, + }, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 3, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 2, + BRANCH_NO_COPY, + BRANCH_ADD, + { + 0x00, + 0, + 0x08, + }, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 2, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 1, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 1, + BRANCH_NO_COPY, + BRANCH_ADD, + { + 0x00, + 0, + 0x0C, + }, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_ADD, + { + 0x00, + 0, + 0x02, + }, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + 1, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + }, + } }; + + // Weights and biases need to be specified separately because + // of the offset. 
+ AssignLayerWeightsBiases(&cnn_config, weights, bias); + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestSplittingTensors) { + int filter_width = 2; + int filter_height = 3; + + int image_width = 4; + int image_height = 4; + + float input[] = { + -1, -1, 2, 1, 3, 2, 4, -3, -4, -2, 2, -3, 1, -3, 4, -2, + }; + + float weights[] = { + -4, 1, 0, 2, 3, 4, 4, -4, -5, -3, 2, 2, -4, -3, 3, 2, + 4, -4, -3, -4, -4, 1, -3, -5, -3, 4, 2, -2, 2, -1, -4, -1, + -2, -3, 1, 1, 0, -5, -1, 3, 3, -5, -3, 0, -3, 1, -3, -1, + 1, -3, -2, -2, 4, -2, 0, 1, 2, 2, -4, 2, 4, 0, -5, -2, + 4, 4, -5, 1, 0, 2, -2, -5, -5, -3, -5, -5, 4, -3, 0, 0, + -4, -4, 0, -5, -4, 0, 0, -3, -5, -3, -1, 2, -1, 4, -1, 2, + }; + + float bias[] = { + -4, -2, -3, -3, 3, 1, -2, + }; + + float expected[] = { + 530, -762, 1469, 777, 849, -771, -1698, 600, + -658, -1821, 98, -668, -1798, 30, 887, -971, + }; + + CNN_CONFIG cnn_config = { 3, + 0, + 0, + 0, + 0, + { + { + 1, + filter_width, + filter_height, + 4, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_OUTPUT, + BRANCH_NOC, + { + 0x02, + 2, + 0x00, + }, + {}, + -1, + }, + { + 4, + filter_width, + filter_height, + 2, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_CAT, + { + 0x00, + 0, + 0x02, + }, + {}, + -1, + }, + { + 4, + filter_width, + filter_height, + 1, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + }, + } }; + + // Weights and biases need to be specified separately because + // of the offset. + AssignLayerWeightsBiases(&cnn_config, weights, bias); + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestOutputChannelsCount) { + int filter_width = 1; + int filter_height = 1; + + int image_width = 2; + int image_height = 2; + + float input[] = { 0, 0, 0, 0 }; + + float weights[] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + + float bias[] = { 0, 0, 0, 0, 0, 0 }; + + float expected[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + + CNN_CONFIG cnn_config = { 3, + 0, + 0, + 0, + 0, + { + { + 1, + filter_width, + filter_height, + 2, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_INPUT, + BRANCH_NOC, + { + 0x06, + 0, + 0x00, + }, + {}, + -1, + }, + { + 1, + filter_width, + filter_height, + 2, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 2, + BRANCH_NO_COPY, + BRANCH_CAT, + { + 0x00, + 0, + 0x03, + }, + {}, + -1, + }, + { + 2, + filter_width, + filter_height, + 2, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_CAT, + { + 0x00, + 0, + 0x04, + }, + {}, + 0, + }, + } }; + + // Weights and biases need to be specified separately because + // of the offset. 
+ AssignLayerWeightsBiases(&cnn_config, weights, bias); + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_FLOAT_TOL); +} + +TEST_F(CNNTest, TestBatchNorm) { + int image_width = 28; + int image_height = 28; + int filter_height = 7; + int filter_width = 7; + float input[] = { + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0117647f, 0.0705882f, 0.0705882f, 0.0705882f, + 0.494118f, 0.533333f, 0.686275f, 0.101961f, 0.65098f, 1.0f, + 0.968627f, 0.498039f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.117647f, 0.141176f, 0.368627f, 0.603922f, + 0.666667f, 0.992157f, 0.992157f, 0.992157f, 0.992157f, 0.992157f, + 0.882353f, 0.67451f, 0.992157f, 0.94902f, 0.764706f, 0.25098f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.192157f, + 0.933333f, 0.992157f, 0.992157f, 0.992157f, 0.992157f, 0.992157f, + 0.992157f, 0.992157f, 0.992157f, 0.984314f, 0.364706f, 0.321569f, + 0.321569f, 0.219608f, 0.152941f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0705882f, 0.858824f, 0.992157f, + 0.992157f, 0.992157f, 0.992157f, 0.992157f, 0.776471f, 0.713725f, + 0.968627f, 0.945098f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.313725f, 0.611765f, 0.419608f, 0.992157f, + 0.992157f, 0.803922f, 0.0431373f, 0.0f, 0.168627f, 0.603922f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.054902f, 0.00392157f, 0.603922f, 0.992157f, 0.352941f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.545098f, 0.992157f, 0.745098f, 0.00784314f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0431373f, + 0.745098f, 0.992157f, 0.27451f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.137255f, 0.945098f, + 0.882353f, 0.627451f, 0.423529f, 0.00392157f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.317647f, 0.941176f, 0.992157f, + 0.992157f, 0.466667f, 0.0980392f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 
0.0f, 0.0f, + 0.0f, 0.0f, 0.176471f, 0.729412f, 0.992157f, 0.992157f, + 0.588235f, 0.105882f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0627451f, 0.364706f, 0.988235f, 0.992157f, 0.733333f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.976471f, 0.992157f, 0.976471f, 0.25098f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.180392f, 0.509804f, 0.717647f, 0.992157f, + 0.992157f, 0.811765f, 0.00784314f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.152941f, 0.580392f, + 0.898039f, 0.992157f, 0.992157f, 0.992157f, 0.980392f, 0.713725f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0941176f, 0.447059f, 0.866667f, 0.992157f, 0.992157f, 0.992157f, + 0.992157f, 0.788235f, 0.305882f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0901961f, 0.258824f, 0.835294f, 0.992157f, + 0.992157f, 0.992157f, 0.992157f, 0.776471f, 0.317647f, 0.00784314f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0705882f, 0.670588f, + 0.858824f, 0.992157f, 0.992157f, 0.992157f, 0.992157f, 0.764706f, + 0.313725f, 0.0352941f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.215686f, 0.67451f, 0.886275f, 0.992157f, 0.992157f, 0.992157f, + 0.992157f, 0.956863f, 0.521569f, 0.0431373f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.533333f, 0.992157f, + 0.992157f, 0.992157f, 0.831373f, 0.529412f, 0.517647f, 0.0627451f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f + }; + float expected[] = { + -0.836424f, -0.857365f, -1.62739f, -1.62739f, -0.836424f, 5.40742f, + 0.920853f, -0.692567f, -0.836424f, -0.534405f, -1.62739f, -0.836424f, + 1.32602f, 1.36312f, 0.112766f, -0.836424f, -0.192962f, 1.56975f, + 2.45777f, 0.944414f, -0.192962f, -1.5519f, -1.5519f, -0.554006f, + -0.192962f, 1.4231f, -1.5519f, -0.192962f, 1.3661f, -1.5519f, + -1.5519f, -0.192962f, -0.843708f, -0.359025f, -0.843708f, -0.843708f, + -0.843708f, 4.53065f, 0.0429584f, -0.796804f, -0.843708f, 0.3473f, + -0.843708f, -0.843708f, -0.114439f, 3.14817f, 0.0811934f, -0.843708f + }; + float kernel[] = { + 0.119643f, -0.237864f, 0.0462892f, 0.0502297f, -0.0134528f, + 0.146347f, 0.153133f, 0.0513307f, 0.0752369f, 0.0135557f, + -0.111434f, 0.0941854f, 0.0788362f, 0.0299412f, 0.111762f, + 0.144066f, 0.00431504f, -0.0177954f, 0.0738092f, -0.0344215f, + 0.0832582f, 0.053989f, -0.112691f, 0.0962145f, 0.0186525f, + 
-0.00660205f, -0.111962f, -0.126801f, -0.231625f, 0.17309f, + 0.0748875f, -0.179569f, -0.00513812f, -0.156579f, -0.147322f, + 0.184168f, 0.189308f, -0.200359f, -0.0156733f, 0.140649f, + 0.0858496f, -0.0263217f, -0.0740749f, -0.112563f, 0.107528f, + 0.0609729f, -0.221625f, 0.0769944f, -0.00900815f, -0.00136441f, + -0.0236521f, -0.0418025f, -0.00286299f, 0.12241f, 0.0964093f, + -0.0150897f, 0.0532171f, 0.0625916f, 0.116939f, 0.118024f, + 0.161918f, -0.00909767f, 0.100897f, -0.054563f, -0.175179f, + -0.0687892f, 0.00734235f, 0.109833f, -0.113776f, 0.0595405f, + -0.170255f, 0.0124815f, -0.0363301f, -0.0127038f, 0.0445554f, + -0.0729894f, 0.107428f, -0.0341417f, 0.132619f, 0.00984557f, + -0.00443654f, 0.202929f, 0.0945134f, 0.0148725f, 0.00998574f, + -0.0226449f, 0.0478197f, -0.0793442f, 0.0707599f, -0.084225f, + 0.0865795f, 0.071104f, -0.047894f, 0.0838322f, 0.0635493f, + -0.00370265f, -0.157247f, -0.0289622f, -0.0590963f, 0.13207f, + 0.00468011f, -0.0345372f, 0.217939f, 0.18861f, -0.0290393f, + -0.0440664f, 0.0126197f, -0.129132f, -0.124943f, 0.0968156f, + -0.0853643f, -0.182305f, 0.00461618f, -0.147095f, -0.230282f, + 0.00856019f, 0.0278893f, -0.0300229f, 0.0417871f, 0.0804717f, + -0.0768571f, -0.0397085f, -0.0601096f, 0.100901f, -0.0184926f, + 0.0350673f, 0.0971094f, -0.0171837f, -0.289644f, -0.0899041f, + 0.08998f, -0.160319f, -0.0195103f, 0.0392167f, -0.137864f, + -0.0136294f, 0.0330886f, -0.0409244f, -0.092533f, -0.0427934f, + -0.191144f, -0.0969461f, 0.112035f, 0.138611f, 0.128717f, + 0.191184f, 0.197462f + }; + float bias[] = { 0.186703f, 0.204358f, -0.0230452f }; + + float bn_gamma[] = { 1.32173f, 1.26171f, 1.21966f }; + float bn_beta[] = { -0.232595f, -0.222652f, -0.232209f }; + float bn_mean[] = { 0.329233f, 0.199894f, 0.12389f }; + float bn_std[] = { 0.311986f, 0.189737f, 0.247104f }; + + CNN_BATCHNORM_PARAMS bn_params = { + bn_gamma, + bn_beta, + bn_mean, + bn_std, + }; + + CNN_CONFIG cnn_config = { + 1, + 0, + 0, + 0, + 0, + { + { + 1, + filter_width, + filter_height, + 3, + 7, + 7, + 0, + kernel, + bias, + PADDING_VALID, + RELU, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + bn_params, + 0, + }, + }, + }; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_FLOAT_TOL); +} + +TEST_F(CNNTest, TestMultithreading) { + int image_height = 2; + int image_width = 2; + int filter_height = 3; + int filter_width = 3; + + float input[] = { + -2, + 4, + 1, + 0, + }; + + float weights[] = { + -4, 2, -2, 0, -4, 4, -3, -3, -3, -1, 1, 0, -5, -3, 0, -5, 0, 0, + -1, 0, 2, -5, 0, 1, 4, 2, 1, 0, -2, -1, -5, -3, 2, -2, 1, -5, + }; + + float bias[] = { + -4, + -3, + -2, + 3, + }; + + float expected[] = { + 2, 10, -8, -17, -24, 5, -15, 6, -5, -5, 7, -10, 4, 13, 9, -14, + }; + + CNN_CONFIG cnn_config = { + 1, + 0, + 0, + 0, + 0, + { + { + 1, + filter_width, + filter_height, + 4, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + }, + }, + }; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_FLOAT_TOL); + + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + AVxWorker workers[4]; + + for (int i = 0; i < 4; ++i) { + winterface->init(&workers[i]); + } + + thread_data = { 4, workers }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_FLOAT_TOL); + + for (int 
i = 0; i < 4; ++i) { + winterface->end(&workers[i]); + } +} + +TEST_F(CNNTest, TestMultiOutput) { + const int image_dim = 8; + const int image_ch = 3; + const int filter_dim = 2; + const int stride = 2; + const int num_filters = 2; + + const float input_[] = { + 1.7537929121f, 0.134331551012f, 0.123580039877f, 0.957731845246f, + 0.391006834217f, 1.00699352042f, -0.778177955829f, -0.814166433059f, + -0.656374394915f, 0.321967305228f, -2.19455719176f, 0.708035038966f, + 0.409148822266f, -0.318254408902f, 0.152450211189f, -0.250210793369f, + 0.826811563186f, 1.6804156584f, 0.273626975978f, 0.437936241887f, + -0.329935520167f, -0.288761611645f, 0.156937008304f, 0.271054157295f, + -0.0224828854332f, 1.70110336895f, -0.989066699309f, 1.30863131729f, + -0.165813705702f, 0.00380178619265f, -0.0837342367587f, 0.760954783156f, + -0.413610373524f, 1.17968204175f, 0.720295719536f, 0.308718974472f, + -1.10091337671f, 0.693160033687f, -0.0202862320697f, 1.0221927503f, + -1.24521801881f, -0.478501952308f, -1.71648619442f, -0.182571723636f, + 0.339292649504f, 2.0806519131f, 0.967974033444f, 0.175248672328f, + 0.0658124561472f, 0.795504169496f, 0.750592557361f, -1.46631013249f, + -1.79052846838f, -1.03672179515f, -0.841985521653f, 1.20995011489f, + 0.140859718215f, -0.651552622661f, 0.451065110806f, 1.1189443693f, + 0.100213260593f, -0.834076868118f, -1.28734321611f, 1.22064420095f, + -0.364143084361f, 0.750961509335f, -0.888689074553f, -0.8253547106f, + -1.21800999027f, -0.966670603566f, 1.37384014741f, 0.47281264834f, + -0.420416235531f, 0.520163906493f, 0.501296589423f, 1.53418976951f, + 0.715234751485f, 0.644551588907f, 0.0763504863375f, -0.0018541943723f, + 0.322853189656f, -0.795099723224f, -0.125177096675f, 1.4476577471f, + -0.585888410088f, -1.44391754955f, -0.610543221933f, -0.221859179799f, + 0.252060200774f, -0.86287169623f, -0.0350246229157f, 1.0932311997f, + 0.899464648842f, -0.468806951704f, -0.300861137168f, 1.15776414206f, + 1.03268544738f, -0.171579585622f, -0.179136557119f, -0.354091003368f, + -0.612298249394f, -1.20237379258f, 1.54604109659f, 0.130664370287f, + 0.885225111868f, 1.0362799581f, 0.980561720868f, -0.619379186999f, + -1.33818929924f, -0.237233737961f, -1.89335425073f, 0.567821011321f, + 0.862420368465f, -1.37380916821f, 0.352190056666f, 0.611261516274f, + 0.393237747152f, 0.894686247967f, 0.190405182149f, 0.264872662911f, + -0.0657009133797f, 0.0580512653493f, -0.401825294366f, 0.4106081318f, + 0.49484512188f, -0.0751103149442f, -1.43243736382f, 1.79855656009f, + -1.1075351975f, 0.000354882733011f, -0.950716438608f, 1.27129831688f, + 1.00495189838f, 0.110358656713f, 1.08315032822f, -0.972676676218f, + -0.0757668962831f, 1.88932045165f, -0.0672638136275f, 0.425913010161f, + -0.781540372017f, 0.976000248609f, 0.687218504122f, 1.31374513445f, + -0.932658930672f, -1.25339468479f, 0.422071294078f, -0.24189927912f, + 0.216906604642f, -1.88720997548f, 1.99252872889f, 0.353943735777f, + 0.737434784132f, -1.17848645017f, 1.70424254896f, 0.775297112968f, + -0.516392797501f, 0.398130609129f, 0.737248101457f, 0.166282500886f, + 1.24699015468f, 0.47116183125f, 1.19091180182f, -0.372695424578f, + 0.219773209389f, -0.829467838962f, -0.52533122724f, 1.98707754595f, + 0.553692606972f, -0.933228902369f, 1.55427751643f, -1.08813399144f, + -0.325686682094f, 0.205091443796f, -1.70381666435f, 0.466465327942f, + 1.73126863447f, -0.939133672634f, 1.48318077459f, -0.599414038168f, + -1.1583078687f, 0.518116190201f, 0.133571482458f, 0.84958342672f, + 1.02205000597f, -0.0772082009087f, -1.69567503859f, 
1.4697939436f, + 1.67813743122f, -0.627911582938f, 0.131380509137f, -1.35717850726f, + }; + const float *input[3] = { input_, &input_[image_dim * image_dim], + &input_[2 * image_dim * image_dim] }; + + const float bias[] = { 0.0f, 0.0f }; + + const float weights_1[] = { + -0.489547413618f, 0.141916424749f, -0.279286485585f, -0.115322211094f, + 0.299572786936f, 0.205289980785f, -0.536254480088f, -0.253626313744f, + -0.422883815849f, -0.169702966298f, -0.540104704793f, 0.495319646763f, + 0.298799079422f, -0.10054550901f, -0.306085047056f, 0.171061886165f, + -0.108058703878f, -0.410734629888f, -0.0640674673049f, -0.386524840979f, + -0.157203423678f, -0.362138920529f, -0.216206085209f, 0.147502517971f, + }; + + const float weights_2[] = { + 0.207580604357f, 0.480821146263f, -0.29111909562f, 0.47422567493f, + 0.206892553253f, -0.235067084092f, 0.354516800602f, -0.212399370252f, + -0.419071343731f, -0.050350731631f, -0.0516457320279f, -0.0359310500731f, + 0.567044864811f, -0.060341127522f, 0.0501464839637f, -0.437785677916f, + }; + + const float weights_3[] = { + -0.0690452401448f, -0.356657338763f, -0.219464031809f, 0.551288365843f, + 0.181372090853f, -0.00245268542109f, 0.409000696276f, -0.593209108763f, + 0.587352566749f, -0.243720660227f, 0.266232713887f, -0.00439285245097f, + 0.252883228305f, 0.152646192631f, 0.0918944932026f, 0.398853715057f, + }; + + const float weights_4[] = { + 0.207560791573f, 0.194201350401f, 0.227802322443f, 0.206533663345f, + 0.0557331066805f, 0.0224159800424f, -0.143939197467f, -0.27703361602f, + 0.130643888389f, -0.269456557461f, 0.186242862864f, -0.162879944774f, + -0.145503996718f, -0.0768822987581f, -0.203127976359f, -0.238119922873f, + -0.258806479994f, 0.0357957680385f, -0.1027606976f, -0.287920082345f, + 0.189047820993f, 0.250711538481f, -0.272815714175f, -0.0431449742024f, + 0.207261230996f, -0.0396472677451f, 0.131236557412f, 0.174291832499f, + -0.251515885765f, -0.107164007499f, 0.185824534748f, -0.00561585838161f, + 0.273393799578f, -0.139563699075f, -0.263922456031f, -0.118859844081f, + 0.109230982597f, -0.170170294794f, 0.0123025648515f, -0.0839368964355f, + -0.0774058234297f, 0.255847138286f, -0.208430879637f, 0.279170114319f, + -0.272890330712f, -0.217725903006f, -0.295923275459f, -0.17008723953f, + -0.284281803405f, 0.281406323629f, 0.266910044663f, -0.209963914338f, + 0.271980962964f, 0.142013581699f, -0.143896509026f, -0.290509242975f, + -0.305768180935f, 0.196902832117f, -0.090424189662f, -0.147460802346f, + 0.217722016651f, 0.12353848977f, -0.169177363577f, -0.0454230918512f, + }; + + const float expected_0[] = { + -2.04858441055f, -2.12883075791f, -0.045177363807f, 0.763949675768f, + -0.544361512821f, -1.58123168032f, 1.89319847039f, 0.16859080901f, + -1.16023321135f, -0.396988107751f, 1.76637090744f, -1.40434786514f, + 0.908227575669f, 0.817064817605f, 0.215631134908f, -0.848605613428f, + -0.106756747018f, 0.0193027166685f, 0.801345615113f, -0.395407237598f, + -1.79983795658f, -1.73054496242f, 0.0584392594454f, -0.388786095569f, + -0.237269619354f, 0.000843578271263f, -1.24043512104f, 0.487839445893f, + -0.394259726605f, 0.559632843424f, -0.527224052291f, -1.53792340282f, + }; + + const float expected_1[] = { + 0.0f, 0.0f, 0.0f, 0.0f, 0.4057888292f, 0.325309571755f, + 0.0f, 1.22013465602f, + }; + + const float expected_2[] = { + 0.156119444687f, + 0.517385299817f, + }; + + const float expected_3[] = { + 0.224177852984f, + 0.503384419034f, + 0.156119444687f, + 0.517385299817f, + }; + + const float *expected[] = { expected_0, expected_1, 
expected_2, expected_3 }; + + CNN_CONFIG cnn_config = { + 4, // num_layers + 0, // is_residue + 0, // ext_width + 0, // ext_height + 0, // strict_bounds + { + // layer_config + { + image_ch, // in_channels + filter_dim, // filter_width + filter_dim, // filter_height + num_filters, // out_channels + stride, // skip_width + stride, // skip_height + 0, // max_pool + weights_1, // weights + bias, // bias + PADDING_SAME_ZERO, // pad + NONE, // activation + 0, // deconvolve + 0, // branch + BRANCH_OUTPUT, // branch_copy_type + BRANCH_NOC, // branch_combine_type + { 2, 0, 0 }, // branch_config + {}, // bn_params + 0, // output_num + }, + { + num_filters, // in_channels + filter_dim, // filter_width + filter_dim, // filter_height + num_filters, // out_channels + stride, // skip_width + stride, // skip_height + 0, // max_pool + weights_2, // weights + bias, // bias + PADDING_SAME_ZERO, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + {}, // branch_config + {}, // bn_params + 1, // output_num + }, + { + num_filters, // in_channels + filter_dim, // filter_width + filter_dim, // filter_height + num_filters, // out_channels + stride, // skip_width + stride, // skip_height + 0, // max_pool + weights_3, // weights + bias, // bias + PADDING_SAME_ZERO, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + {}, // branch_config + {}, // bn_params + 2, // output_num + }, + { + num_filters, // in_channels + 2 * filter_dim, // filter_width + 2 * filter_dim, // filter_height + num_filters, // out_channels + 2 * stride, // skip_width + 2 * stride, // skip_height + 0, // max_pool + weights_4, // weights + bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 1, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_CAT, // branch_combine_type + { 0, 0, 1 }, // branch_config + {}, // bn_params + 3, // output_num + }, + }, + }; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + const int num_outputs = 4; + const int output_chs[4] = { filter_dim, filter_dim, filter_dim, + 2 * filter_dim }; + const int output_dims[4] = { 4, 2, 1, 1 }; + const int output_sizes[4] = { + output_chs[0] * output_dims[0] * output_dims[0], + output_chs[1] * output_dims[1] * output_dims[1], + output_chs[2] * output_dims[2] * output_dims[2], + output_chs[3] * output_dims[3] * output_dims[3], + }; + float *const output_ = (float *)aom_malloc( + sizeof(*output_) * + (output_sizes[0] + output_sizes[1] + output_sizes[2] + output_sizes[3])); + float *output[CNN_MAX_CHANNELS] = { nullptr }; + int ch_ite = 0; + float *output_ite = output_; + for (int output_idx = 0; output_idx < num_outputs; output_idx++) { + for (int channel = 0; channel < output_chs[output_idx]; ++channel) { + output[ch_ite++] = output_ite; + output_ite += output_dims[output_idx] * output_dims[output_idx]; + } + } + CNN_MULTI_OUT output_struct = { num_outputs, output_chs, output_dims, + output }; + + RunMultiOutCNNTest(input, image_dim, image_dim, image_dim, &cnn_config, + &thread_data, &output_struct, expected, MSE_FLOAT_TOL); + + aom_free(output_); +} diff --git a/libs/libaom/src/test/codec_factory.h b/libs/libaom/src/test/codec_factory.h new file mode 100644 index 000000000..801b8948f --- /dev/null +++ b/libs/libaom/src/test/codec_factory.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_CODEC_FACTORY_H_
+#define AOM_TEST_CODEC_FACTORY_H_
+
+#include <tuple>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_decoder.h"
+#include "aom/aom_encoder.h"
+#if CONFIG_AV1_ENCODER
+#include "aom/aomcx.h"
+#endif
+#if CONFIG_AV1_DECODER
+#include "aom/aomdx.h"
+#endif
+
+#include "test/decode_test_driver.h"
+#include "test/encode_test_driver.h"
+namespace libaom_test {
+
+const int kCodecFactoryParam = 0;
+
+class CodecFactory {
+ public:
+  CodecFactory() {}
+
+  virtual ~CodecFactory() {}
+
+  virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg) const = 0;
+
+  virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg,
+                                 const aom_codec_flags_t flags) const = 0;
+
+  virtual Encoder *CreateEncoder(aom_codec_enc_cfg_t cfg,
+                                 const aom_codec_flags_t init_flags,
+                                 TwopassStatsStore *stats) const = 0;
+
+  virtual aom_codec_err_t DefaultEncoderConfig(aom_codec_enc_cfg_t *cfg,
+                                               unsigned int usage) const = 0;
+};
+
+/* Provide CodecTestWithParams classes for a variable number of parameters
+ * to avoid having to include a pointer to the CodecFactory in every test
+ * definition.
+ */
+template <class T1>
+class CodecTestWithParam
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1> > {};
+
+template <class T1, class T2>
+class CodecTestWith2Params
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1, T2> > {};
+
+template <class T1, class T2, class T3>
+class CodecTestWith3Params
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1, T2, T3> > {};
+
+template <class T1, class T2, class T3, class T4>
+class CodecTestWith4Params
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1, T2, T3, T4> > {};
+
+template <class T1, class T2, class T3, class T4, class T5>
+class CodecTestWith5Params
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1, T2, T3, T4, T5> > {
+};
+
+/*
+ * AV1 Codec Definitions
+ */
+class AV1Decoder : public Decoder {
+ public:
+  explicit AV1Decoder(aom_codec_dec_cfg_t cfg) : Decoder(cfg) {}
+
+  AV1Decoder(aom_codec_dec_cfg_t cfg, const aom_codec_flags_t flag)
+      : Decoder(cfg, flag) {}
+
+ protected:
+  virtual aom_codec_iface_t *CodecInterface() const {
+#if CONFIG_AV1_DECODER
+    return aom_codec_av1_dx();
+#else
+    return NULL;
+#endif
+  }
+};
+
+class AV1Encoder : public Encoder {
+ public:
+  AV1Encoder(aom_codec_enc_cfg_t cfg, const aom_codec_flags_t init_flags,
+             TwopassStatsStore *stats)
+      : Encoder(cfg, init_flags, stats) {}
+
+ protected:
+  virtual aom_codec_iface_t *CodecInterface() const {
+#if CONFIG_AV1_ENCODER
+    return aom_codec_av1_cx();
+#else
+    return NULL;
+#endif
+  }
+};
+
+class AV1CodecFactory : public CodecFactory {
+ public:
+  AV1CodecFactory() : CodecFactory() {}
+
+  virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg) const {
+    return CreateDecoder(cfg, 0);
+  }
+
+  virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg,
+                                 const aom_codec_flags_t flags) const {
+#if CONFIG_AV1_DECODER
+    return new AV1Decoder(cfg, flags);
+#else
+    (void)cfg;
+    (void)flags;
+    return NULL;
+#endif
+  }
+
+  virtual Encoder *CreateEncoder(aom_codec_enc_cfg_t cfg,
+                                 const aom_codec_flags_t init_flags,
+                                 TwopassStatsStore *stats) const {
+#if CONFIG_AV1_ENCODER
+    return new AV1Encoder(cfg, init_flags, stats);
+#else
+    (void)cfg;
+    (void)init_flags;
+    (void)stats;
+    return NULL;
+#endif
+  }
+
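+  // Each override here degrades gracefully when the corresponding codec is
+  // compiled out: it returns NULL (or AOM_CODEC_INCAPABLE below) instead of
+  // referencing a missing codec interface, so the same tests build in
+  // encoder-only and decoder-only configurations.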
+  virtual aom_codec_err_t DefaultEncoderConfig(aom_codec_enc_cfg_t *cfg,
+                                               unsigned int usage) const {
+#if CONFIG_AV1_ENCODER
+    return aom_codec_enc_config_default(aom_codec_av1_cx(), cfg, usage);
+#else
+    (void)cfg;
+    (void)usage;
+    return AOM_CODEC_INCAPABLE;
+#endif
+  }
+};
+
+const libaom_test::AV1CodecFactory kAV1;
+
+#define AV1_INSTANTIATE_TEST_CASE(test, ...)                                \
+  INSTANTIATE_TEST_SUITE_P(                                                 \
+      AV1, test,                                                            \
+      ::testing::Combine(                                                   \
+          ::testing::Values(static_cast<const libaom_test::CodecFactory *>( \
+              &libaom_test::kAV1)),                                         \
+          __VA_ARGS__))
+
+}  // namespace libaom_test
+#endif  // AOM_TEST_CODEC_FACTORY_H_
diff --git a/libs/libaom/src/test/coding_path_sync.cc b/libs/libaom/src/test/coding_path_sync.cc
new file mode 100644
index 000000000..4c613dc03
--- /dev/null
+++ b/libs/libaom/src/test/coding_path_sync.cc
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+
+#include "config/aom_config.h"
+
+#include "aom/aomcx.h"
+#include "aom/aomdx.h"
+#include "aom/aom_encoder.h"
+#include "aom/aom_decoder.h"
+
+#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
+
+using libaom_test::ACMRandom;
+namespace {
+
+class CompressedSource {
+ public:
+  explicit CompressedSource(int seed) : rnd_(seed), frame_count_(0) {
+    aom_codec_iface_t *algo = aom_codec_av1_cx();
+
+    aom_codec_enc_cfg_t cfg;
+    aom_codec_enc_config_default(algo, &cfg, 0);
+
+    // force the quantizer, to reduce the sensitivity on encoding choices.
+    // e.g., we don't want this test to break when the rate control is
+    // modified.
+    {
+      const int max_q = cfg.rc_max_quantizer;
+      const int min_q = cfg.rc_min_quantizer;
+      const int q = rnd_.PseudoUniform(max_q - min_q + 1) + min_q;
+
+      cfg.rc_end_usage = AOM_Q;
+      cfg.rc_max_quantizer = q;
+      cfg.rc_min_quantizer = q;
+    }
+
+    // choose the picture size
+    {
+      width_ = rnd_.PseudoUniform(kWidth - 8) + 8;
+      height_ = rnd_.PseudoUniform(kHeight - 8) + 8;
+    }
+
+    // choose the chroma subsampling
+    {
+      const aom_img_fmt_t fmts[] = {
+        AOM_IMG_FMT_I420,
+        AOM_IMG_FMT_I422,
+        AOM_IMG_FMT_I444,
+      };
+
+      format_ = fmts[rnd_.PseudoUniform(NELEMENTS(fmts))];
+    }
+
+    cfg.g_w = width_;
+    cfg.g_h = height_;
+    cfg.g_lag_in_frames = 0;
+    if (format_ == AOM_IMG_FMT_I420)
+      cfg.g_profile = 0;
+    else if (format_ == AOM_IMG_FMT_I444)
+      cfg.g_profile = 1;
+    else if (format_ == AOM_IMG_FMT_I422)
+      cfg.g_profile = 2;
+
+    aom_codec_enc_init(&enc_, algo, &cfg, 0);
+  }
+
+  ~CompressedSource() { aom_codec_destroy(&enc_); }
+
+  const aom_codec_cx_pkt_t *ReadFrame() {
+    uint8_t buf[kWidth * kHeight * 3] = { 0 };
+
+    // render a regular pattern
+    const int period = rnd_.Rand8() % 32 + 1;
+    const int phase = rnd_.Rand8() % period;
+
+    const int val_a = rnd_.Rand8();
+    const int val_b = rnd_.Rand8();
+
+    for (int i = 0; i < (int)sizeof buf; ++i)
+      buf[i] = (i + phase) % period < period / 2 ?
val_a : val_b; + + aom_image_t img; + aom_img_wrap(&img, format_, width_, height_, 0, buf); + aom_codec_encode(&enc_, &img, frame_count_++, 1, 0); + + aom_codec_iter_t iter = NULL; + + const aom_codec_cx_pkt_t *pkt = NULL; + + do { + pkt = aom_codec_get_cx_data(&enc_, &iter); + } while (pkt && pkt->kind != AOM_CODEC_CX_FRAME_PKT); + + return pkt; + } + + private: + static const int kWidth = 128; + static const int kHeight = 128; + + ACMRandom rnd_; + aom_img_fmt_t format_; + aom_codec_ctx_t enc_; + int frame_count_; + int width_, height_; +}; + +// lowers an aom_image_t to a easily comparable/printable form +std::vector Serialize(const aom_image_t *img) { + std::vector bytes; + bytes.reserve(img->d_w * img->d_h * 3); + for (int plane = 0; plane < 3; ++plane) { + const int w = aom_img_plane_width(img, plane); + const int h = aom_img_plane_height(img, plane); + + for (int r = 0; r < h; ++r) { + for (int c = 0; c < w; ++c) { + unsigned char *row = img->planes[plane] + r * img->stride[plane]; + if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) + bytes.push_back(row[c * 2]); + else + bytes.push_back(row[c]); + } + } + } + + return bytes; +} + +class Decoder { + public: + explicit Decoder(int allowLowbitdepth) { + aom_codec_iface_t *algo = aom_codec_av1_dx(); + + aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t(); + cfg.allow_lowbitdepth = allowLowbitdepth; + + aom_codec_dec_init(&dec_, algo, &cfg, 0); + } + + ~Decoder() { aom_codec_destroy(&dec_); } + + std::vector decode(const aom_codec_cx_pkt_t *pkt) { + aom_codec_decode(&dec_, static_cast(pkt->data.frame.buf), + pkt->data.frame.sz, NULL); + + aom_codec_iter_t iter = NULL; + return Serialize(aom_codec_get_frame(&dec_, &iter)); + } + + private: + aom_codec_ctx_t dec_; +}; + +// Try to reveal a mismatch between LBD and HBD coding paths. +TEST(CodingPathSync, SearchForHbdLbdMismatch) { + const int count_tests = 10; + for (int i = 0; i < count_tests; ++i) { + Decoder dec_hbd(0); + Decoder dec_lbd(1); + + CompressedSource enc(i); + + for (int k = 0; k < 3; ++k) { + const aom_codec_cx_pkt_t *frame = enc.ReadFrame(); + + std::vector lbd_yuv = dec_lbd.decode(frame); + std::vector hbd_yuv = dec_hbd.decode(frame); + + ASSERT_EQ(lbd_yuv, hbd_yuv); + } + } +} + +TEST(CodingPathSyncLarge, SearchForHbdLbdMismatchLarge) { + const int count_tests = 100; + const int seed = 1234; + for (int i = 0; i < count_tests; ++i) { + Decoder dec_hbd(0); + Decoder dec_lbd(1); + + CompressedSource enc(seed + i); + + for (int k = 0; k < 5; ++k) { + const aom_codec_cx_pkt_t *frame = enc.ReadFrame(); + + std::vector lbd_yuv = dec_lbd.decode(frame); + std::vector hbd_yuv = dec_hbd.decode(frame); + + ASSERT_EQ(lbd_yuv, hbd_yuv); + } + } +} + +} // namespace diff --git a/libs/libaom/src/test/comp_avg_pred_test.cc b/libs/libaom/src/test/comp_avg_pred_test.cc new file mode 100644 index 000000000..ac625a79d --- /dev/null +++ b/libs/libaom/src/test/comp_avg_pred_test.cc @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "test/comp_avg_pred_test.h" + +using libaom_test::ACMRandom; +using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGTest; +using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGUPSAMPLEDTest; +#if CONFIG_AV1_HIGHBITDEPTH +using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGTest; +using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest; +#endif +using std::make_tuple; +using std::tuple; + +namespace { + +TEST_P(AV1DISTWTDCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } + +TEST_P(AV1DISTWTDCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); } + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P(SSSE3, AV1DISTWTDCOMPAVGTest, + libaom_test::AV1DISTWTDCOMPAVG::BuildParams( + aom_dist_wtd_comp_avg_pred_ssse3)); +#endif + +TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(0)); +} + +TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) { + RunCheckOutput(GET_PARAM(0)); +} + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P(SSSE3, AV1DISTWTDCOMPAVGUPSAMPLEDTest, + libaom_test::AV1DISTWTDCOMPAVG::BuildParams( + aom_dist_wtd_comp_avg_upsampled_pred_ssse3)); +#endif + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(AV1HighBDDISTWTDCOMPAVGTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(1)); +} + +TEST_P(AV1HighBDDISTWTDCOMPAVGTest, CheckOutput) { + RunCheckOutput(GET_PARAM(1)); +} + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, AV1HighBDDISTWTDCOMPAVGTest, + libaom_test::AV1DISTWTDCOMPAVG::BuildParams( + aom_highbd_dist_wtd_comp_avg_pred_sse2, 1)); +#endif + +TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(1)); +} + +TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) { + RunCheckOutput(GET_PARAM(1)); +} + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, + libaom_test::AV1DISTWTDCOMPAVG::BuildParams( + aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2)); +#endif +#endif // CONFIG_AV1_HIGHBITDEPTH + +} // namespace diff --git a/libs/libaom/src/test/comp_avg_pred_test.h b/libs/libaom/src/test/comp_avg_pred_test.h new file mode 100644 index 000000000..7f73312c4 --- /dev/null +++ b/libs/libaom/src/test/comp_avg_pred_test.h @@ -0,0 +1,569 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_TEST_COMP_AVG_PRED_TEST_H_ +#define AOM_TEST_COMP_AVG_PRED_TEST_H_ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/util.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "av1/common/common_data.h" +#include "aom_ports/aom_timer.h" + +namespace libaom_test { +const int kMaxSize = 128 + 32; // padding + +namespace AV1DISTWTDCOMPAVG { + +typedef void (*distwtdcompavg_func)(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param); + +typedef void (*distwtdcompavgupsampled_func)( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search); + +typedef std::tuple DISTWTDCOMPAVGParam; + +typedef std::tuple + DISTWTDCOMPAVGUPSAMPLEDParam; + +#if CONFIG_AV1_HIGHBITDEPTH +typedef void (*highbddistwtdcompavgupsampled_func)( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, + int subpel_search); + +typedef std::tuple + HighbdDISTWTDCOMPAVGUPSAMPLEDParam; + +typedef std::tuple + HighbdDISTWTDCOMPAVGParam; + +::testing::internal::ParamGenerator BuildParams( + distwtdcompavg_func filter, int is_hbd) { + (void)is_hbd; + return ::testing::Combine(::testing::Range(8, 13, 2), + ::testing::Values(filter), + ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); +} + +::testing::internal::ParamGenerator +BuildParams(highbddistwtdcompavgupsampled_func filter) { + return ::testing::Combine(::testing::Range(8, 13, 2), + ::testing::Values(filter), + ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +::testing::internal::ParamGenerator BuildParams( + distwtdcompavg_func filter) { + return ::testing::Combine(::testing::Values(filter), + ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); +} + +::testing::internal::ParamGenerator BuildParams( + distwtdcompavgupsampled_func filter) { + return ::testing::Combine(::testing::Values(filter), + ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); +} + +class AV1DISTWTDCOMPAVGTest + : public ::testing::TestWithParam { + public: + ~AV1DISTWTDCOMPAVGTest() {} + void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } + void TearDown() { libaom_test::ClearSystemState(); } + + protected: + void RunCheckOutput(distwtdcompavg_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(1); + + uint8_t pred8[kMaxSize * kMaxSize]; + uint8_t ref8[kMaxSize * kMaxSize]; + uint8_t output[kMaxSize * kMaxSize]; + uint8_t output2[kMaxSize * kMaxSize]; + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand8(); + ref8[i * w + j] = rnd_.Rand8(); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + for (int ii = 0; ii < 2; ii++) { + for (int jj = 0; jj < 4; jj++) { + dist_wtd_comp_params.fwd_offset = 
quant_dist_lookup_table[ii][jj][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + + const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); + aom_dist_wtd_comp_avg_pred_c(output, pred8 + offset_r * w + offset_c, + in_w, in_h, ref8 + offset_r * w + offset_c, + in_w, &dist_wtd_comp_params); + test_impl(output2, pred8 + offset_r * w + offset_c, in_w, in_h, + ref8 + offset_r * w + offset_c, in_w, &dist_wtd_comp_params); + + for (int i = 0; i < in_h; ++i) { + for (int j = 0; j < in_w; ++j) { + int idx = i * in_w + j; + ASSERT_EQ(output[idx], output2[idx]) + << "Mismatch at unit tests for AV1DISTWTDCOMPAVGTest\n" + << in_w << "x" << in_h << " Pixel mismatch at index " << idx + << " = (" << i << ", " << j << ")"; + } + } + } + } + } + void RunSpeedTest(distwtdcompavg_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(1); + + uint8_t pred8[kMaxSize * kMaxSize]; + uint8_t ref8[kMaxSize * kMaxSize]; + uint8_t output[kMaxSize * kMaxSize]; + uint8_t output2[kMaxSize * kMaxSize]; + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand8(); + ref8[i * w + j] = rnd_.Rand8(); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + + const int num_loops = 1000000000 / (in_w + in_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + aom_dist_wtd_comp_avg_pred_c(output, pred8, in_w, in_h, ref8, in_w, + &dist_wtd_comp_params); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast(aom_usec_timer_elapsed(&timer)); + printf("distwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time / num_loops); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + + for (int i = 0; i < num_loops; ++i) + test_impl(output2, pred8, in_w, in_h, ref8, in_w, &dist_wtd_comp_params); + + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast(aom_usec_timer_elapsed(&timer1)); + printf("distwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time1 / num_loops); + } + + libaom_test::ACMRandom rnd_; +}; // class AV1DISTWTDCOMPAVGTest + +class AV1DISTWTDCOMPAVGUPSAMPLEDTest + : public ::testing::TestWithParam { + public: + ~AV1DISTWTDCOMPAVGUPSAMPLEDTest() {} + void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } + void TearDown() { libaom_test::ClearSystemState(); } + + protected: + void RunCheckOutput(distwtdcompavgupsampled_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(1); + + uint8_t pred8[kMaxSize * kMaxSize]; + uint8_t ref8[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand8(); + ref8[i * w + j] = rnd_.Rand8(); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + int sub_x_q3, sub_y_q3; + int subpel_search; + for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; + ++subpel_search) 
{ + for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) { + for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) { + for (int ii = 0; ii < 2; ii++) { + for (int jj = 0; jj < 4; jj++) { + dist_wtd_comp_params.fwd_offset = + quant_dist_lookup_table[ii][jj][0]; + dist_wtd_comp_params.bck_offset = + quant_dist_lookup_table[ii][jj][1]; + + const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); + + aom_dist_wtd_comp_avg_upsampled_pred_c( + NULL, NULL, 0, 0, NULL, output, + pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3, + sub_y_q3, ref8 + offset_r * w + offset_c, in_w, + &dist_wtd_comp_params, subpel_search); + test_impl(NULL, NULL, 0, 0, NULL, output2, + pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3, + sub_y_q3, ref8 + offset_r * w + offset_c, in_w, + &dist_wtd_comp_params, subpel_search); + + for (int i = 0; i < in_h; ++i) { + for (int j = 0; j < in_w; ++j) { + int idx = i * in_w + j; + ASSERT_EQ(output[idx], output2[idx]) + << "Mismatch at unit tests for " + "AV1DISTWTDCOMPAVGUPSAMPLEDTest\n" + << in_w << "x" << in_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << sub_y_q3 << ", " + << sub_x_q3 << ")"; + } + } + } + } + } + } + } + } + void RunSpeedTest(distwtdcompavgupsampled_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(1); + + uint8_t pred8[kMaxSize * kMaxSize]; + uint8_t ref8[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand8(); + ref8[i * w + j] = rnd_.Rand8(); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + + int sub_x_q3 = 0; + int sub_y_q3 = 0; + + const int num_loops = 1000000000 / (in_w + in_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter. 
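+    // Benchmarking approach: the C reference and the SIMD candidate below
+    // each run the same iteration count (num_loops is scaled by block size,
+    // so total work stays roughly constant across sizes), and each
+    // aom_usec_timer total is divided by the loop count to report an average
+    // per-call cost.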
+ + for (int i = 0; i < num_loops; ++i) + aom_dist_wtd_comp_avg_upsampled_pred_c( + NULL, NULL, 0, 0, NULL, output, pred8, in_w, in_h, sub_x_q3, sub_y_q3, + ref8, in_w, &dist_wtd_comp_params, subpel_search); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast(aom_usec_timer_elapsed(&timer)); + printf("distwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time / num_loops); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + + for (int i = 0; i < num_loops; ++i) + test_impl(NULL, NULL, 0, 0, NULL, output2, pred8, in_w, in_h, sub_x_q3, + sub_y_q3, ref8, in_w, &dist_wtd_comp_params, subpel_search); + + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast(aom_usec_timer_elapsed(&timer1)); + printf("distwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time1 / num_loops); + } + + libaom_test::ACMRandom rnd_; +}; // class AV1DISTWTDCOMPAVGUPSAMPLEDTest + +#if CONFIG_AV1_HIGHBITDEPTH +class AV1HighBDDISTWTDCOMPAVGTest + : public ::testing::TestWithParam { + public: + ~AV1HighBDDISTWTDCOMPAVGTest() {} + void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } + + void TearDown() { libaom_test::ClearSystemState(); } + + protected: + void RunCheckOutput(distwtdcompavg_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(2); + const int bd = GET_PARAM(0); + uint16_t pred8[kMaxSize * kMaxSize]; + uint16_t ref8[kMaxSize * kMaxSize]; + uint16_t output[kMaxSize * kMaxSize]; + uint16_t output2[kMaxSize * kMaxSize]; + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + for (int ii = 0; ii < 2; ii++) { + for (int jj = 0; jj < 4; jj++) { + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + + const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); + aom_highbd_dist_wtd_comp_avg_pred_c( + CONVERT_TO_BYTEPTR(output), + CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, in_h, + CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, + &dist_wtd_comp_params); + test_impl(CONVERT_TO_BYTEPTR(output2), + CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, + in_h, CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, + in_w, &dist_wtd_comp_params); + + for (int i = 0; i < in_h; ++i) { + for (int j = 0; j < in_w; ++j) { + int idx = i * in_w + j; + ASSERT_EQ(output[idx], output2[idx]) + << "Mismatch at unit tests for AV1HighBDDISTWTDCOMPAVGTest\n" + << in_w << "x" << in_h << " Pixel mismatch at index " << idx + << " = (" << i << ", " << j << ")"; + } + } + } + } + } + void RunSpeedTest(distwtdcompavg_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(2); + const int bd = GET_PARAM(0); + uint16_t pred8[kMaxSize * kMaxSize]; + uint16_t ref8[kMaxSize * kMaxSize]; + uint16_t output[kMaxSize * kMaxSize]; + uint16_t output2[kMaxSize * kMaxSize]; + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + } + const int in_w = 
block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + + const int num_loops = 1000000000 / (in_w + in_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + aom_highbd_dist_wtd_comp_avg_pred_c( + CONVERT_TO_BYTEPTR(output), CONVERT_TO_BYTEPTR(pred8), in_w, in_h, + CONVERT_TO_BYTEPTR(ref8), in_w, &dist_wtd_comp_params); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast(aom_usec_timer_elapsed(&timer)); + printf("highbddistwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time / num_loops); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + + for (int i = 0; i < num_loops; ++i) + test_impl(CONVERT_TO_BYTEPTR(output2), CONVERT_TO_BYTEPTR(pred8), in_w, + in_h, CONVERT_TO_BYTEPTR(ref8), in_w, &dist_wtd_comp_params); + + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast(aom_usec_timer_elapsed(&timer1)); + printf("highbddistwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time1 / num_loops); + } + + libaom_test::ACMRandom rnd_; +}; // class AV1HighBDDISTWTDCOMPAVGTest + +class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest + : public ::testing::TestWithParam { + public: + ~AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest() {} + void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } + void TearDown() { libaom_test::ClearSystemState(); } + + protected: + void RunCheckOutput(highbddistwtdcompavgupsampled_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(2); + const int bd = GET_PARAM(0); + uint16_t pred8[kMaxSize * kMaxSize]; + uint16_t ref8[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(16, uint16_t, output[kMaxSize * kMaxSize]); + DECLARE_ALIGNED(16, uint16_t, output2[kMaxSize * kMaxSize]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + int sub_x_q3, sub_y_q3; + int subpel_search; + for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; + ++subpel_search) { + for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) { + for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) { + for (int ii = 0; ii < 2; ii++) { + for (int jj = 0; jj < 4; jj++) { + dist_wtd_comp_params.fwd_offset = + quant_dist_lookup_table[ii][jj][0]; + dist_wtd_comp_params.bck_offset = + quant_dist_lookup_table[ii][jj][1]; + + const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); + + aom_highbd_dist_wtd_comp_avg_upsampled_pred_c( + NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output), + CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, + in_h, sub_x_q3, sub_y_q3, + CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, bd, + &dist_wtd_comp_params, subpel_search); + test_impl(NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output2), + CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, + in_w, in_h, sub_x_q3, sub_y_q3, + CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, + in_w, bd, &dist_wtd_comp_params, subpel_search); + + for (int i = 0; 
i < in_h; ++i) { + for (int j = 0; j < in_w; ++j) { + int idx = i * in_w + j; + ASSERT_EQ(output[idx], output2[idx]) + << "Mismatch at unit tests for " + "AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest\n" + << in_w << "x" << in_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << sub_y_q3 << ", " + << sub_x_q3 << ")"; + } + } + } + } + } + } + } + } + void RunSpeedTest(highbddistwtdcompavgupsampled_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(2); + const int bd = GET_PARAM(0); + uint16_t pred8[kMaxSize * kMaxSize]; + uint16_t ref8[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(16, uint16_t, output[kMaxSize * kMaxSize]); + DECLARE_ALIGNED(16, uint16_t, output2[kMaxSize * kMaxSize]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + int sub_x_q3 = 0; + int sub_y_q3 = 0; + const int num_loops = 1000000000 / (in_w + in_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter. + for (int i = 0; i < num_loops; ++i) + aom_highbd_dist_wtd_comp_avg_upsampled_pred_c( + NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output), + CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3, + CONVERT_TO_BYTEPTR(ref8), in_w, bd, &dist_wtd_comp_params, + subpel_search); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast(aom_usec_timer_elapsed(&timer)); + printf("highbddistwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, + in_h, 1000.0 * elapsed_time / num_loops); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + + for (int i = 0; i < num_loops; ++i) + test_impl(NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output2), + CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3, + CONVERT_TO_BYTEPTR(ref8), in_w, bd, &dist_wtd_comp_params, + subpel_search); + + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast(aom_usec_timer_elapsed(&timer1)); + printf("highbddistwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, + in_h, 1000.0 * elapsed_time1 / num_loops); + } + + libaom_test::ACMRandom rnd_; +}; // class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest +#endif // CONFIG_AV1_HIGHBITDEPTH + +} // namespace AV1DISTWTDCOMPAVG +} // namespace libaom_test + +#endif // AOM_TEST_COMP_AVG_PRED_TEST_H_ diff --git a/libs/libaom/src/test/comp_mask_variance_test.cc b/libs/libaom/src/test/comp_mask_variance_test.cc new file mode 100644 index 000000000..b666306a3 --- /dev/null +++ b/libs/libaom/src/test/comp_mask_variance_test.cc @@ -0,0 +1,577 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <new>
+#include <tuple>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/variance.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/reconinter.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1CompMaskVariance {
+typedef void (*comp_mask_pred_func)(uint8_t *comp_pred, const uint8_t *pred,
+                                    int width, int height, const uint8_t *ref,
+                                    int ref_stride, const uint8_t *mask,
+                                    int mask_stride, int invert_mask);
+
+#if HAVE_SSSE3 || HAVE_SSE2 || HAVE_AVX2
+const BLOCK_SIZE kValidBlockSize[] = {
+  BLOCK_8X8,   BLOCK_8X16,  BLOCK_8X32,  BLOCK_16X8, BLOCK_16X16,
+  BLOCK_16X32, BLOCK_32X8,  BLOCK_32X16, BLOCK_32X32,
+};
+#endif
+typedef std::tuple<comp_mask_pred_func, BLOCK_SIZE> CompMaskPredParam;
+
+class AV1CompMaskVarianceTest
+    : public ::testing::TestWithParam<CompMaskPredParam> {
+ public:
+  ~AV1CompMaskVarianceTest();
+  void SetUp();
+
+  void TearDown();
+
+ protected:
+  void RunCheckOutput(comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv);
+  void RunSpeedTest(comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
+  bool CheckResult(int width, int height) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        const int idx = y * width + x;
+        if (comp_pred1_[idx] != comp_pred2_[idx]) {
+          printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, y, x);
+          printf("%d != %d ", comp_pred1_[idx], comp_pred2_[idx]);
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  libaom_test::ACMRandom rnd_;
+  uint8_t *comp_pred1_;
+  uint8_t *comp_pred2_;
+  uint8_t *pred_;
+  uint8_t *ref_buffer_;
+  uint8_t *ref_;
+};
+
+AV1CompMaskVarianceTest::~AV1CompMaskVarianceTest() { ; }
+
+void AV1CompMaskVarianceTest::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  av1_init_wedge_masks();
+  comp_pred1_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+  comp_pred2_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+  pred_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+  ref_buffer_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE + (8 * MAX_SB_SIZE));
+  ref_ = ref_buffer_ + (8 * MAX_SB_SIZE);
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    pred_[i] = rnd_.Rand8();
+  }
+  for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+    ref_buffer_[i] = rnd_.Rand8();
+  }
+}
+
+void AV1CompMaskVarianceTest::TearDown() {
+  aom_free(comp_pred1_);
+  aom_free(comp_pred2_);
+  aom_free(pred_);
+  aom_free(ref_buffer_);
+  libaom_test::ClearSystemState();
+}
+
+void AV1CompMaskVarianceTest::RunCheckOutput(comp_mask_pred_func test_impl,
+                                             BLOCK_SIZE bsize, int inv) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int wedge_types = get_wedge_types_lookup(bsize);
+  for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+    const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+    aom_comp_mask_pred_c(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w,
+                         inv);
+    test_impl(comp_pred2_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w, inv);
+
+    ASSERT_EQ(CheckResult(w, h), true)
+        << " wedge " << wedge_index << " inv " << inv;
+  }
+}
+
+void
AV1CompMaskVarianceTest::RunSpeedTest(comp_mask_pred_func test_impl, + BLOCK_SIZE bsize) { + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int wedge_types = get_wedge_types_lookup(bsize); + int wedge_index = wedge_types / 2; + const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + const int num_loops = 1000000000 / (w + h); + + comp_mask_pred_func funcs[2] = { aom_comp_mask_pred_c, test_impl }; + double elapsed_time[2] = { 0 }; + for (int i = 0; i < 2; ++i) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + comp_mask_pred_func func = funcs[i]; + for (int j = 0; j < num_loops; ++j) { + func(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w, 0); + } + aom_usec_timer_mark(&timer); + double time = static_cast(aom_usec_timer_elapsed(&timer)); + elapsed_time[i] = 1000.0 * time / num_loops; + } + printf("compMask %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0], + elapsed_time[1]); + printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]); +} + +TEST_P(AV1CompMaskVarianceTest, CheckOutput) { + // inv = 0, 1 + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0); + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1); +} + +TEST_P(AV1CompMaskVarianceTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(0), GET_PARAM(1)); +} + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, AV1CompMaskVarianceTest, + ::testing::Combine(::testing::Values(&aom_comp_mask_pred_ssse3), + ::testing::ValuesIn(kValidBlockSize))); +#endif + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1CompMaskVarianceTest, + ::testing::Combine(::testing::Values(&aom_comp_mask_pred_avx2), + ::testing::ValuesIn(kValidBlockSize))); +#endif + +#ifndef aom_comp_mask_pred +// can't run this test if aom_comp_mask_pred is defined to aom_comp_mask_pred_c +class AV1CompMaskUpVarianceTest : public AV1CompMaskVarianceTest { + public: + ~AV1CompMaskUpVarianceTest(); + + protected: + void RunCheckOutput(comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv); + void RunSpeedTest(comp_mask_pred_func test_impl, BLOCK_SIZE bsize, + int havSub); +}; + +AV1CompMaskUpVarianceTest::~AV1CompMaskUpVarianceTest() { ; } + +void AV1CompMaskUpVarianceTest::RunCheckOutput(comp_mask_pred_func test_impl, + BLOCK_SIZE bsize, int inv) { + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int wedge_types = get_wedge_types_lookup(bsize); + int subpel_search; + for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; + ++subpel_search) { + // loop through subx and suby + for (int sub = 0; sub < 8 * 8; ++sub) { + int subx = sub & 0x7; + int suby = (sub >> 3); + for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { + const uint8_t *mask = + av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + + // ref + aom_comp_mask_upsampled_pred_c( + NULL, NULL, 0, 0, NULL, comp_pred1_, pred_, w, h, subx, suby, ref_, + MAX_SB_SIZE, mask, w, inv, subpel_search); + + aom_comp_mask_pred = test_impl; // test + aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred2_, pred_, + w, h, subx, suby, ref_, MAX_SB_SIZE, mask, + w, inv, subpel_search); + ASSERT_EQ(CheckResult(w, h), true) + << " wedge " << wedge_index << " inv " << inv << "sub (" << subx + << "," << suby << ")"; + } + } + } +} + +void AV1CompMaskUpVarianceTest::RunSpeedTest(comp_mask_pred_func test_impl, + BLOCK_SIZE bsize, int havSub) { + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int subx = havSub ? 3 : 0; + const int suby = havSub ? 
4 : 0; + const int wedge_types = get_wedge_types_lookup(bsize); + int wedge_index = wedge_types / 2; + const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + + const int num_loops = 1000000000 / (w + h); + comp_mask_pred_func funcs[2] = { &aom_comp_mask_pred_c, test_impl }; + double elapsed_time[2] = { 0 }; + int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter. + for (int i = 0; i < 2; ++i) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + aom_comp_mask_pred = funcs[i]; + for (int j = 0; j < num_loops; ++j) { + aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred1_, pred_, + w, h, subx, suby, ref_, MAX_SB_SIZE, mask, w, + 0, subpel_search); + } + aom_usec_timer_mark(&timer); + double time = static_cast(aom_usec_timer_elapsed(&timer)); + elapsed_time[i] = 1000.0 * time / num_loops; + } + printf("CompMaskUp[%d] %3dx%-3d:%7.2f/%7.2fns", havSub, w, h, elapsed_time[0], + elapsed_time[1]); + printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]); +} + +TEST_P(AV1CompMaskUpVarianceTest, CheckOutput) { + // inv mask = 0, 1 + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0); + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1); +} + +TEST_P(AV1CompMaskUpVarianceTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 1); +} + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, AV1CompMaskUpVarianceTest, + ::testing::Combine(::testing::Values(&aom_comp_mask_pred_ssse3), + ::testing::ValuesIn(kValidBlockSize))); +#endif + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1CompMaskUpVarianceTest, + ::testing::Combine(::testing::Values(&aom_comp_mask_pred_avx2), + ::testing::ValuesIn(kValidBlockSize))); +#endif + +#endif // ifndef aom_comp_mask_pred + +#if CONFIG_AV1_HIGHBITDEPTH +typedef void (*highbd_comp_mask_pred_func)(uint8_t *comp_pred8, + const uint8_t *pred8, int width, + int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask); + +typedef std::tuple + HighbdCompMaskPredParam; + +class AV1HighbdCompMaskVarianceTest + : public ::testing::TestWithParam { + public: + ~AV1HighbdCompMaskVarianceTest(); + void SetUp(); + + void TearDown(); + + protected: + void RunCheckOutput(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, + int inv); + void RunSpeedTest(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize); + bool CheckResult(int width, int height) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + const int idx = y * width + x; + if (comp_pred1_[idx] != comp_pred2_[idx]) { + printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, y, x); + printf("%d != %d ", comp_pred1_[idx], comp_pred2_[idx]); + return false; + } + } + } + return true; + } + + libaom_test::ACMRandom rnd_; + uint16_t *comp_pred1_; + uint16_t *comp_pred2_; + uint16_t *pred_; + uint16_t *ref_buffer_; + uint16_t *ref_; +}; + +AV1HighbdCompMaskVarianceTest::~AV1HighbdCompMaskVarianceTest() { ; } + +void AV1HighbdCompMaskVarianceTest::SetUp() { + rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); + av1_init_wedge_masks(); + + comp_pred1_ = + (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred1_)); + comp_pred2_ = + (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred2_)); + pred_ = (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*pred_)); + ref_buffer_ = (uint16_t *)aom_memalign( + 16, (MAX_SB_SQUARE + (8 * MAX_SB_SIZE)) * sizeof(*ref_buffer_)); + ref_ = ref_buffer_ + (8 * MAX_SB_SIZE); +} + +void AV1HighbdCompMaskVarianceTest::TearDown() { + 
aom_free(comp_pred1_); + aom_free(comp_pred2_); + aom_free(pred_); + aom_free(ref_buffer_); + libaom_test::ClearSystemState(); +} + +void AV1HighbdCompMaskVarianceTest::RunCheckOutput( + highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) { + int bd_ = GET_PARAM(2); + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int wedge_types = get_wedge_types_lookup(bsize); + + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) { + ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + + for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { + const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + + aom_highbd_comp_mask_pred_c( + CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h, + CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv); + + test_impl(CONVERT_TO_BYTEPTR(comp_pred2_), CONVERT_TO_BYTEPTR(pred_), w, h, + CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv); + + ASSERT_EQ(CheckResult(w, h), true) + << " wedge " << wedge_index << " inv " << inv; + } +} + +void AV1HighbdCompMaskVarianceTest::RunSpeedTest( + highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize) { + int bd_ = GET_PARAM(2); + + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int wedge_types = get_wedge_types_lookup(bsize); + int wedge_index = wedge_types / 2; + + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) { + ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + + const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + const int num_loops = 1000000000 / (w + h); + + highbd_comp_mask_pred_func funcs[2] = { aom_highbd_comp_mask_pred_c, + test_impl }; + double elapsed_time[2] = { 0 }; + for (int i = 0; i < 2; ++i) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + highbd_comp_mask_pred_func func = funcs[i]; + for (int j = 0; j < num_loops; ++j) { + func(CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h, + CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, 0); + } + aom_usec_timer_mark(&timer); + double time = static_cast(aom_usec_timer_elapsed(&timer)); + elapsed_time[i] = 1000.0 * time / num_loops; + } + printf("compMask %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0], + elapsed_time[1]); + printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]); +} + +TEST_P(AV1HighbdCompMaskVarianceTest, CheckOutput) { + // inv = 0, 1 + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0); + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1); +} + +TEST_P(AV1HighbdCompMaskVarianceTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(0), GET_PARAM(1)); +} + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1HighbdCompMaskVarianceTest, + ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_avx2), + ::testing::ValuesIn(kValidBlockSize), + ::testing::Range(8, 13, 2))); +#endif + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, AV1HighbdCompMaskVarianceTest, + ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_sse2), + ::testing::ValuesIn(kValidBlockSize), + ::testing::Range(8, 13, 2))); +#endif + +#ifndef aom_highbd_comp_mask_pred +// can't run this test if aom_highbd_comp_mask_pred is defined to +// aom_highbd_comp_mask_pred_c +class AV1HighbdCompMaskUpVarianceTest : public AV1HighbdCompMaskVarianceTest { + public: + 
~AV1HighbdCompMaskUpVarianceTest(); + + protected: + void RunCheckOutput(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, + int inv); + void RunSpeedTest(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, + int havSub); +}; + +AV1HighbdCompMaskUpVarianceTest::~AV1HighbdCompMaskUpVarianceTest() { ; } + +void AV1HighbdCompMaskUpVarianceTest::RunCheckOutput( + highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) { + (void)test_impl; + int bd_ = GET_PARAM(2); + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int wedge_types = get_wedge_types_lookup(bsize); + + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) { + ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + + int subpel_search; + for (subpel_search = 1; subpel_search <= 2; ++subpel_search) { + // loop through subx and suby + for (int sub = 0; sub < 8 * 8; ++sub) { + int subx = sub & 0x7; + int suby = (sub >> 3); + for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { + const uint8_t *mask = + av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + + // ref + aom_highbd_upsampled_pred_c( + NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(comp_pred1_), w, h, subx, + suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, bd_, subpel_search); + + aom_highbd_comp_mask_pred_c( + CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h, + CONVERT_TO_BYTEPTR(comp_pred1_), w, mask, w, inv); + + // test + aom_highbd_upsampled_pred( + NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(comp_pred2_), w, h, subx, + suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, bd_, subpel_search); + + aom_highbd_comp_mask_pred( + CONVERT_TO_BYTEPTR(comp_pred2_), CONVERT_TO_BYTEPTR(pred_), w, h, + CONVERT_TO_BYTEPTR(comp_pred2_), w, mask, w, inv); + + ASSERT_EQ(CheckResult(w, h), true) + << " wedge " << wedge_index << " inv " << inv << "sub (" << subx + << "," << suby << ")"; + } + } + } +} + +void AV1HighbdCompMaskUpVarianceTest::RunSpeedTest( + highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int havSub) { + int bd_ = GET_PARAM(2); + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int subx = havSub ? 3 : 0; + const int suby = havSub ? 4 : 0; + const int wedge_types = get_wedge_types_lookup(bsize); + int wedge_index = wedge_types / 2; + const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) { + ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + + const int num_loops = 1000000000 / (w + h); + highbd_comp_mask_pred_func funcs[2] = { &aom_highbd_comp_mask_pred_c, + test_impl }; + double elapsed_time[2] = { 0 }; + for (int i = 0; i < 2; ++i) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + aom_highbd_comp_mask_pred = funcs[i]; + int subpel_search = 2; // set to 1 to test 4-tap filter. 
+ for (int j = 0; j < num_loops; ++j) { + aom_highbd_comp_mask_upsampled_pred( + NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(comp_pred1_), + CONVERT_TO_BYTEPTR(pred_), w, h, subx, suby, CONVERT_TO_BYTEPTR(ref_), + MAX_SB_SIZE, mask, w, 0, bd_, subpel_search); + } + aom_usec_timer_mark(&timer); + double time = static_cast(aom_usec_timer_elapsed(&timer)); + elapsed_time[i] = 1000.0 * time / num_loops; + } + printf("CompMaskUp[%d] %3dx%-3d:%7.2f/%7.2fns", havSub, w, h, elapsed_time[0], + elapsed_time[1]); + printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]); +} + +TEST_P(AV1HighbdCompMaskUpVarianceTest, CheckOutput) { + // inv mask = 0, 1 + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0); + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1); +} + +TEST_P(AV1HighbdCompMaskUpVarianceTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 1); +} + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1HighbdCompMaskUpVarianceTest, + ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_avx2), + ::testing::ValuesIn(kValidBlockSize), + ::testing::Range(8, 13, 2))); +#endif + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, AV1HighbdCompMaskUpVarianceTest, + ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_sse2), + ::testing::ValuesIn(kValidBlockSize), + ::testing::Range(8, 13, 2))); +#endif + +#endif // ifndef aom_highbd_comp_mask_pred +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace AV1CompMaskVariance diff --git a/libs/libaom/src/test/convolve_round_test.cc b/libs/libaom/src/test/convolve_round_test.cc new file mode 100644 index 000000000..4f17b5472 --- /dev/null +++ b/libs/libaom/src/test/convolve_round_test.cc @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+#define CONVOLVE_ROUNDING_PARAM                                            \
+  const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, \
+      int h, int bits
+
+typedef void (*ConvolveRoundFunc)(CONVOLVE_ROUNDING_PARAM);
+
+typedef void (*ConvolveRoundFuncHbd)(CONVOLVE_ROUNDING_PARAM, int bd);
+
+template <ConvolveRoundFuncHbd fn>
+void highbd_convolve_rounding_8(CONVOLVE_ROUNDING_PARAM) {
+  const int bd = 8;
+  fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
+}
+
+template <ConvolveRoundFuncHbd fn>
+void highbd_convolve_rounding_10(CONVOLVE_ROUNDING_PARAM) {
+  const int bd = 10;
+  fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
+}
+
+template <ConvolveRoundFuncHbd fn>
+void highbd_convolve_rounding_12(CONVOLVE_ROUNDING_PARAM) {
+  const int bd = 12;
+  fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
+}
+
+typedef enum { LOWBITDEPTH_TEST, HIGHBITDEPTH_TEST } DataPathType;
+
+using std::tuple;
+
+typedef tuple<ConvolveRoundFunc, ConvolveRoundFunc, DataPathType>
+    ConvolveRoundParam;
+
+const int kTestNum = 5000;
+
+class ConvolveRoundTest : public ::testing::TestWithParam<ConvolveRoundParam> {
+ protected:
+  ConvolveRoundTest()
+      : func_ref_(GET_PARAM(0)), func_(GET_PARAM(1)), data_path_(GET_PARAM(2)) {
+  }
+  virtual ~ConvolveRoundTest() {}
+
+  virtual void SetUp() {
+    const size_t block_size = 128 * 128;
+    src_ = reinterpret_cast<int32_t *>(
+        aom_memalign(16, block_size * sizeof(*src_)));
+    dst_ref_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, block_size * sizeof(*dst_ref_)));
+    dst_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, block_size * sizeof(*dst_)));
+  }
+
+  virtual void TearDown() {
+    aom_free(src_);
+    aom_free(dst_ref_);
+    aom_free(dst_);
+  }
+
+  void ConvolveRoundingRun() {
+    int test_num = 0;
+    const int src_stride = 128;
+    const int dst_stride = 128;
+    int bits = 13;
+    uint8_t *dst = 0;
+    uint8_t *dst_ref = 0;
+
+    if (data_path_ == LOWBITDEPTH_TEST) {
+      dst = reinterpret_cast<uint8_t *>(dst_);
+      dst_ref = reinterpret_cast<uint8_t *>(dst_ref_);
+    } else if (data_path_ == HIGHBITDEPTH_TEST) {
+      dst = CONVERT_TO_BYTEPTR(dst_);
+      dst_ref = CONVERT_TO_BYTEPTR(dst_ref_);
+    } else {
+      assert(0);
+    }
+
+    while (test_num < kTestNum) {
+      int block_size = test_num % BLOCK_SIZES_ALL;
+      int w = block_size_wide[block_size];
+      int h = block_size_high[block_size];
+
+      if (test_num % 2 == 0)
+        bits -= 1;
+      else
+        bits += 1;
+
+      GenerateBufferWithRandom(src_, src_stride, bits, w, h);
+
+      func_ref_(src_, src_stride, dst_ref, dst_stride, w, h, bits);
+      ASM_REGISTER_STATE_CHECK(
+          func_(src_, src_stride, dst, dst_stride, w, h, bits));
+
+      if (data_path_ == LOWBITDEPTH_TEST) {
+        for (int r = 0; r < h; ++r) {
+          for (int c = 0; c < w; ++c) {
+            ASSERT_EQ(dst_ref[r * dst_stride + c], dst[r * dst_stride + c])
+                << "Mismatch at r: " << r << " c: " << c << " w: " << w
+                << " h: " << h << " test: " << test_num;
+          }
+        }
+      } else {
+        for (int r = 0; r < h; ++r) {
+          for (int c = 0; c < w; ++c) {
+            ASSERT_EQ(dst_ref_[r * dst_stride + c], dst_[r * dst_stride + c])
+                << "Mismatch at r: " << r << " c: " << c << " w: " << w
+                << " h: " << h << " test: " << test_num;
+          }
+        }
+      }
+
+      test_num++;
+    }
+  }
+
+  void GenerateBufferWithRandom(int32_t *src, int src_stride, int bits, int w,
+                                int h) {
+    int32_t number;
+    for (int r = 0; r < h; ++r) {
+      for (int c = 0; c < w; ++c) {
+        number = static_cast<int32_t>(rand_.Rand31());
+        number %= 1 << (bits + 9);
+        src[r * src_stride + c] = number;
+      }
+    }
+  }
+
+  ACMRandom rand_;
+  int32_t *src_;
+  uint16_t *dst_ref_;
+  uint16_t *dst_;
+
+  ConvolveRoundFunc func_ref_;
+  ConvolveRoundFunc func_;
+  DataPathType data_path_;
+};
+
+TEST_P(ConvolveRoundTest, BitExactCheck) { ConvolveRoundingRun(); }
+
+using std::make_tuple;
+#if HAVE_AVX2
+const ConvolveRoundParam kConvRndParamArray[] = {
+  make_tuple(&av1_convolve_rounding_c, &av1_convolve_rounding_avx2,
+             LOWBITDEPTH_TEST),
+  make_tuple(&highbd_convolve_rounding_8<av1_highbd_convolve_rounding_c>,
+             &highbd_convolve_rounding_8<av1_highbd_convolve_rounding_avx2>,
+             HIGHBITDEPTH_TEST),
+  make_tuple(&highbd_convolve_rounding_10<av1_highbd_convolve_rounding_c>,
+             &highbd_convolve_rounding_10<av1_highbd_convolve_rounding_avx2>,
+             HIGHBITDEPTH_TEST),
+  make_tuple(&highbd_convolve_rounding_12<av1_highbd_convolve_rounding_c>,
+             &highbd_convolve_rounding_12<av1_highbd_convolve_rounding_avx2>,
+             HIGHBITDEPTH_TEST)
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveRoundTest,
+                         ::testing::ValuesIn(kConvRndParamArray));
+#endif  // HAVE_AVX2
+}  // namespace
diff --git a/libs/libaom/src/test/convolve_test.cc b/libs/libaom/src/test/convolve_test.cc
new file mode 100644
index 000000000..0b1eea16a
--- /dev/null
+++ b/libs/libaom/src/test/convolve_test.cc
@@ -0,0 +1,885 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/filter.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+
+static const unsigned int kMaxDimension = MAX_SB_SIZE;
+
+typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x, int filter_x_stride,
+                             const int16_t *filter_y, int filter_y_stride,
+                             int w, int h);
+
+struct ConvolveFunctions {
+  ConvolveFunctions(ConvolveFunc copy, ConvolveFunc h8, ConvolveFunc v8, int bd)
+      : copy_(copy), h8_(h8), v8_(v8), use_highbd_(bd) {}
+
+  ConvolveFunc copy_;
+  ConvolveFunc h8_;
+  ConvolveFunc v8_;
+  int use_highbd_;  // 0 if high bitdepth not used, else the actual bit depth.
+};
+
+typedef std::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
+
+#define ALL_SIZES_64(convolve_fn)                                         \
+  make_tuple(4, 4, &convolve_fn), make_tuple(8, 4, &convolve_fn),         \
+      make_tuple(4, 8, &convolve_fn), make_tuple(8, 8, &convolve_fn),     \
+      make_tuple(16, 8, &convolve_fn), make_tuple(8, 16, &convolve_fn),   \
+      make_tuple(16, 16, &convolve_fn), make_tuple(32, 16, &convolve_fn), \
+      make_tuple(16, 32, &convolve_fn), make_tuple(32, 32, &convolve_fn), \
+      make_tuple(64, 32, &convolve_fn), make_tuple(32, 64, &convolve_fn), \
+      make_tuple(64, 64, &convolve_fn)
+
+#define ALL_SIZES(convolve_fn)                                          \
+  make_tuple(128, 64, &convolve_fn), make_tuple(64, 128, &convolve_fn), \
+      make_tuple(128, 128, &convolve_fn), ALL_SIZES_64(convolve_fn)
+
+// Reference 8-tap subpixel filter, slightly modified to fit into this test.
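+// As a hedged sketch of the arithmetic this reference filter performs (the
+// names `acc` and `k` below are expository only, not part of the test), one
+// output pixel of an 8-tap pass is computed as:
+//
+//   int acc = AV1_FILTER_WEIGHT >> 1;                   // rounding term (64)
+//   for (int k = 0; k < 8; ++k) acc += src[k] * filter[k];
+//   uint8_t out = clip_pixel(acc >> AV1_FILTER_SHIFT);  // normalize by 128
+//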
+#define AV1_FILTER_WEIGHT 128
+#define AV1_FILTER_SHIFT 7
+uint8_t clip_pixel(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }
+
+void filter_block2d_8_c(const uint8_t *src_ptr, unsigned int src_stride,
+                        const int16_t *HFilter, const int16_t *VFilter,
+                        uint8_t *dst_ptr, unsigned int dst_stride,
+                        unsigned int output_width, unsigned int output_height) {
+  // Between passes, we use an intermediate buffer whose height is extended to
+  // have enough horizontally filtered values as input for the vertical pass.
+  // This buffer is allocated to be big enough for the largest block type we
+  // support.
+  const int kInterp_Extend = 4;
+  const unsigned int intermediate_height =
+      (kInterp_Extend - 1) + output_height + kInterp_Extend;
+  unsigned int i, j;
+
+  assert(intermediate_height > 7);
+
+  // Size of intermediate_buffer is max_intermediate_height * filter_max_width,
+  // where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
+  //                                 + kInterp_Extend
+  //                               = 3 + 16 + 4
+  //                               = 23
+  // and filter_max_width = 16
+  //
+  uint8_t intermediate_buffer[(kMaxDimension + 8) * kMaxDimension];
+  const int intermediate_next_stride =
+      1 - static_cast<int>(intermediate_height * output_width);
+
+  // Horizontal pass (src -> transposed intermediate).
+  uint8_t *output_ptr = intermediate_buffer;
+  const int src_next_row_stride = src_stride - output_width;
+  src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+  for (i = 0; i < intermediate_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      // Apply filter...
+      const int temp = (src_ptr[0] * HFilter[0]) + (src_ptr[1] * HFilter[1]) +
+                       (src_ptr[2] * HFilter[2]) + (src_ptr[3] * HFilter[3]) +
+                       (src_ptr[4] * HFilter[4]) + (src_ptr[5] * HFilter[5]) +
+                       (src_ptr[6] * HFilter[6]) + (src_ptr[7] * HFilter[7]) +
+                       (AV1_FILTER_WEIGHT >> 1);  // Rounding
+
+      // Normalize back to 0-255...
+      *output_ptr = clip_pixel(temp >> AV1_FILTER_SHIFT);
+      ++src_ptr;
+      output_ptr += intermediate_height;
+    }
+    src_ptr += src_next_row_stride;
+    output_ptr += intermediate_next_stride;
+  }
+
+  // Vertical pass (transposed intermediate -> dst).
+  src_ptr = intermediate_buffer;
+  const int dst_next_row_stride = dst_stride - output_width;
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      // Apply filter...
+      const int temp = (src_ptr[0] * VFilter[0]) + (src_ptr[1] * VFilter[1]) +
+                       (src_ptr[2] * VFilter[2]) + (src_ptr[3] * VFilter[3]) +
+                       (src_ptr[4] * VFilter[4]) + (src_ptr[5] * VFilter[5]) +
+                       (src_ptr[6] * VFilter[6]) + (src_ptr[7] * VFilter[7]) +
+                       (AV1_FILTER_WEIGHT >> 1);  // Rounding
+
+      // Normalize back to 0-255...
+      *dst_ptr++ = clip_pixel(temp >> AV1_FILTER_SHIFT);
+      src_ptr += intermediate_height;
+    }
+    src_ptr += intermediate_next_stride;
+    dst_ptr += dst_next_row_stride;
+  }
+}
+
+void block2d_average_c(uint8_t *src, unsigned int src_stride,
+                       uint8_t *output_ptr, unsigned int output_stride,
+                       unsigned int output_width, unsigned int output_height) {
+  unsigned int i, j;
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
+    }
+    output_ptr += output_stride;
+  }
+}
+
+void filter_average_block2d_8_c(const uint8_t *src_ptr,
+                                const unsigned int src_stride,
+                                const int16_t *HFilter, const int16_t *VFilter,
+                                uint8_t *dst_ptr, unsigned int dst_stride,
+                                unsigned int output_width,
+                                unsigned int output_height) {
+  uint8_t tmp[kMaxDimension * kMaxDimension];
+
+  assert(output_width <= kMaxDimension);
+  assert(output_height <= kMaxDimension);
+  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, kMaxDimension,
+                     output_width, output_height);
+  block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride, output_width,
+                    output_height);
+}
+
+void highbd_filter_block2d_8_c(const uint16_t *src_ptr,
+                               const unsigned int src_stride,
+                               const int16_t *HFilter, const int16_t *VFilter,
+                               uint16_t *dst_ptr, unsigned int dst_stride,
+                               unsigned int output_width,
+                               unsigned int output_height, int bd) {
+  // Between passes, we use an intermediate buffer whose height is extended to
+  // have enough horizontally filtered values as input for the vertical pass.
+  // This buffer is allocated to be big enough for the largest block type we
+  // support.
+  const int kInterp_Extend = 4;
+  const unsigned int intermediate_height =
+      (kInterp_Extend - 1) + output_height + kInterp_Extend;
+
+  /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,
+   * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
+   *                                 + kInterp_Extend
+   *                               = 3 + 16 + 4
+   *                               = 23
+   * and filter_max_width = 16
+   */
+  uint16_t intermediate_buffer[(kMaxDimension + 8) * kMaxDimension] = { 0 };
+  const int intermediate_next_stride =
+      1 - static_cast<int>(intermediate_height * output_width);
+
+  // Horizontal pass (src -> transposed intermediate).
+  {
+    uint16_t *output_ptr = intermediate_buffer;
+    const int src_next_row_stride = src_stride - output_width;
+    unsigned int i, j;
+    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+    for (i = 0; i < intermediate_height; ++i) {
+      for (j = 0; j < output_width; ++j) {
+        // Apply filter...
+        const int temp = (src_ptr[0] * HFilter[0]) + (src_ptr[1] * HFilter[1]) +
+                         (src_ptr[2] * HFilter[2]) + (src_ptr[3] * HFilter[3]) +
+                         (src_ptr[4] * HFilter[4]) + (src_ptr[5] * HFilter[5]) +
+                         (src_ptr[6] * HFilter[6]) + (src_ptr[7] * HFilter[7]) +
+                         (AV1_FILTER_WEIGHT >> 1);  // Rounding
+
+        // Normalize back to 0-255...
+        *output_ptr = clip_pixel_highbd(temp >> AV1_FILTER_SHIFT, bd);
+        ++src_ptr;
+        output_ptr += intermediate_height;
+      }
+      src_ptr += src_next_row_stride;
+      output_ptr += intermediate_next_stride;
+    }
+  }
+
+  // Vertical pass (transposed intermediate -> dst).
+  {
+    const uint16_t *interm_ptr = intermediate_buffer;
+    const int dst_next_row_stride = dst_stride - output_width;
+    unsigned int i, j;
+    for (i = 0; i < output_height; ++i) {
+      for (j = 0; j < output_width; ++j) {
+        // Apply filter...
+        const int temp =
+            (interm_ptr[0] * VFilter[0]) + (interm_ptr[1] * VFilter[1]) +
+            (interm_ptr[2] * VFilter[2]) + (interm_ptr[3] * VFilter[3]) +
+            (interm_ptr[4] * VFilter[4]) + (interm_ptr[5] * VFilter[5]) +
+            (interm_ptr[6] * VFilter[6]) + (interm_ptr[7] * VFilter[7]) +
+            (AV1_FILTER_WEIGHT >> 1);  // Rounding
+
+        // Normalize back to 0-255...
+        *dst_ptr++ = clip_pixel_highbd(temp >> AV1_FILTER_SHIFT, bd);
+        interm_ptr += intermediate_height;
+      }
+      interm_ptr += intermediate_next_stride;
+      dst_ptr += dst_next_row_stride;
+    }
+  }
+}
+
+void highbd_block2d_average_c(uint16_t *src, unsigned int src_stride,
+                              uint16_t *output_ptr, unsigned int output_stride,
+                              unsigned int output_width,
+                              unsigned int output_height) {
+  unsigned int i, j;
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
+    }
+    output_ptr += output_stride;
+  }
+}
+
+void highbd_filter_average_block2d_8_c(
+    const uint16_t *src_ptr, unsigned int src_stride, const int16_t *HFilter,
+    const int16_t *VFilter, uint16_t *dst_ptr, unsigned int dst_stride,
+    unsigned int output_width, unsigned int output_height, int bd) {
+  uint16_t tmp[kMaxDimension * kMaxDimension];
+
+  assert(output_width <= kMaxDimension);
+  assert(output_height <= kMaxDimension);
+  highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp,
+                            kMaxDimension, output_width, output_height, bd);
+  highbd_block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
+                           output_width, output_height);
+}
+
+class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
+ public:
+  static void SetUpTestCase() {
+    // Force input_ to be unaligned, output to be 16 byte aligned.
+    input_ = reinterpret_cast<uint8_t *>(
+                 aom_memalign(kDataAlignment, kInputBufferSize + 1)) +
+             1;
+    ref8_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kOutputStride * kMaxDimension));
+    output_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kOutputBufferSize));
+    output_ref_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kOutputBufferSize));
+    input16_ = reinterpret_cast<uint16_t *>(aom_memalign(
+                   kDataAlignment, (kInputBufferSize + 1) * sizeof(uint16_t))) +
+               1;
+    ref16_ = reinterpret_cast<uint16_t *>(aom_memalign(
+        kDataAlignment, kOutputStride * kMaxDimension * sizeof(uint16_t)));
+    output16_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+    output16_ref_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+  static void TearDownTestCase() {
+    aom_free(input_ - 1);
+    input_ = NULL;
+    aom_free(ref8_);
+    ref8_ = NULL;
+    aom_free(output_);
+    output_ = NULL;
+    aom_free(output_ref_);
+    output_ref_ = NULL;
+    aom_free(input16_ - 1);
+    input16_ = NULL;
+    aom_free(ref16_);
+    ref16_ = NULL;
+    aom_free(output16_);
+    output16_ = NULL;
+    aom_free(output16_ref_);
+    output16_ref_ = NULL;
+  }
+
+ protected:
+  static const int kDataAlignment = 16;
+  static const int kOuterBlockSize = 4 * kMaxDimension;
+  static const int kInputStride = kOuterBlockSize;
+  static const int kOutputStride = kOuterBlockSize;
+  static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
+  static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize;
+
+  int Width() const { return GET_PARAM(0); }
+  int Height() const { return GET_PARAM(1); }
+  int BorderLeft() const {
+    const int center = (kOuterBlockSize - Width()) / 2;
+    return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
+  }
+  int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
+
+  bool IsIndexInBorder(int i) {
+    return (i < BorderTop() * kOuterBlockSize ||
+            i >= (BorderTop() + Height()) * kOuterBlockSize ||
+            i % kOuterBlockSize < BorderLeft() ||
+            i % kOuterBlockSize >= (BorderLeft() + Width()));
+  }
+
+  virtual void SetUp() {
+    UUT_ = GET_PARAM(2);
+    if (UUT_->use_highbd_ != 0)
+      mask_ = (1 << UUT_->use_highbd_) - 1;
+    else
+      mask_ = 255;
+    /* Set up guard blocks for an inner block centered in the outer block */
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i)) {
+        output_[i] = 255;
+        output16_[i] = mask_;
+      } else {
+        output_[i] = 0;
+        output16_[i] = 0;
+      }
+    }
+
+    ::libaom_test::ACMRandom prng;
+    for (int i = 0; i < kInputBufferSize; ++i) {
+      if (i & 1) {
+        input_[i] = 255;
+        input16_[i] = mask_;
+      } else {
+        input_[i] = prng.Rand8Extremes();
+        input16_[i] = prng.Rand16() & mask_;
+      }
+    }
+  }
+
+  void SetConstantInput(int value) {
+    memset(input_, value, kInputBufferSize);
+    aom_memset16(input16_, value, kInputBufferSize);
+  }
+
+  void CopyOutputToRef() {
+    memcpy(output_ref_, output_, kOutputBufferSize);
+    // Copy 16-bit pixel values. The effective number of bytes is double.
+    memcpy(output16_ref_, output16_, sizeof(output16_[0]) * kOutputBufferSize);
+  }
+
+  void CheckGuardBlocks() {
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i)) {
+        EXPECT_EQ(255, output_[i]);
+      }
+    }
+  }
+
+  uint8_t *input() const {
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
+    if (UUT_->use_highbd_ == 0) {
+      return input_ + offset;
+    } else {
+      return CONVERT_TO_BYTEPTR(input16_) + offset;
+    }
+  }
+
+  uint8_t *output() const {
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
+    if (UUT_->use_highbd_ == 0) {
+      return output_ + offset;
+    } else {
+      return CONVERT_TO_BYTEPTR(output16_) + offset;
+    }
+  }
+
+  uint8_t *output_ref() const {
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
+    if (UUT_->use_highbd_ == 0) {
+      return output_ref_ + offset;
+    } else {
+      return CONVERT_TO_BYTEPTR(output16_ref_) + offset;
+    }
+  }
+
+  uint16_t lookup(uint8_t *list, int index) const {
+    if (UUT_->use_highbd_ == 0) {
+      return list[index];
+    } else {
+      return CONVERT_TO_SHORTPTR(list)[index];
+    }
+  }
+
+  void assign_val(uint8_t *list, int index, uint16_t val) const {
+    if (UUT_->use_highbd_ == 0) {
+      list[index] = (uint8_t)val;
+    } else {
+      CONVERT_TO_SHORTPTR(list)[index] = val;
+    }
+  }
+
+  void wrapper_filter_average_block2d_8_c(
+      const uint8_t *src_ptr, unsigned int src_stride, const int16_t *HFilter,
+      const int16_t *VFilter, uint8_t *dst_ptr, unsigned int dst_stride,
+      unsigned int output_width, unsigned int output_height) {
+    if (UUT_->use_highbd_ == 0) {
+      filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr,
+                                 dst_stride, output_width, output_height);
+    } else {
+      highbd_filter_average_block2d_8_c(
+          CONVERT_TO_SHORTPTR(src_ptr), src_stride, HFilter, VFilter,
+          CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, output_width, output_height,
+          UUT_->use_highbd_);
+    }
+  }
+
+  void wrapper_filter_block2d_8_c(
+      const uint8_t *src_ptr, unsigned int src_stride, const int16_t *HFilter,
+      const int16_t *VFilter, uint8_t *dst_ptr, unsigned int dst_stride,
+      unsigned int output_width, unsigned int output_height) {
+    if (UUT_->use_highbd_ == 0) {
+      filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr,
+                         dst_stride, output_width, output_height);
+    } else {
+      highbd_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
+                                HFilter, VFilter, CONVERT_TO_SHORTPTR(dst_ptr),
+                                dst_stride, output_width, output_height,
+                                UUT_->use_highbd_);
+    }
+  }
+
+  const ConvolveFunctions *UUT_;
+  static uint8_t *input_;
+  static uint8_t *ref8_;
+  static uint8_t *output_;
+  static uint8_t *output_ref_;
+  static uint16_t *input16_;
+  static uint16_t *ref16_;
+  static uint16_t *output16_;
+  static uint16_t *output16_ref_;
+  int mask_;
+};
+
+uint8_t *ConvolveTest::input_ = NULL;
+uint8_t *ConvolveTest::ref8_ = NULL;
+uint8_t *ConvolveTest::output_ = NULL;
+uint8_t *ConvolveTest::output_ref_ = NULL;
+uint16_t *ConvolveTest::input16_ = NULL;
+uint16_t *ConvolveTest::ref16_ = NULL;
+uint16_t *ConvolveTest::output16_ = NULL;
+uint16_t *ConvolveTest::output16_ref_ = NULL;
+
+TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); }
+
+TEST_P(ConvolveTest, Copy) {
+  uint8_t *const in = input();
+  uint8_t *const out = output();
+
+  ASM_REGISTER_STATE_CHECK(UUT_->copy_(in, kInputStride, out, kOutputStride,
+                                       NULL, 0, NULL, 0, Width(), Height()));
+
+  CheckGuardBlocks();
+
+  for (int y = 0; y < Height(); ++y)
+    for (int x = 0; x < Width(); ++x)
+      ASSERT_EQ(lookup(out, y * kOutputStride + x),
+                lookup(in, y * kInputStride + x))
+          << "(" << x << "," << y << ")";
+}
+
+const int kNumFilterBanks = SWITCHABLE_FILTERS;
+const int kNumFilters = 16;
+
+TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
+  int subpel_search;
+  for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
+       ++subpel_search) {
+    for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+      const InterpFilter filter = (InterpFilter)filter_bank;
+      const InterpKernel *filters =
+          (const InterpKernel *)av1_get_interp_filter_kernel(filter,
+                                                             subpel_search);
+      for (int i = 0; i < kNumFilters; i++) {
+        const int p0 = filters[i][0] + filters[i][1];
+        const int p1 = filters[i][2] + filters[i][3];
+        const int p2 = filters[i][4] + filters[i][5];
+        const int p3 = filters[i][6] + filters[i][7];
+        EXPECT_LE(p0, 128);
+        EXPECT_LE(p1, 128);
+        EXPECT_LE(p2, 128);
+        EXPECT_LE(p3, 128);
+        EXPECT_LE(p0 + p3, 128);
+        EXPECT_LE(p0 + p3 + p1, 128);
+        EXPECT_LE(p0 + p3 + p1 + p2, 128);
+        EXPECT_EQ(p0 + p1 + p2 + p3, 128);
+      }
+    }
+  }
+}
+
+const int16_t kInvalidFilter[8] = { 0 };
+
+TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
+  uint8_t *const in = input();
+  uint8_t *const out = output();
+  uint8_t *ref;
+  if (UUT_->use_highbd_ == 0) {
+    ref = ref8_;
+  } else {
+    ref = CONVERT_TO_BYTEPTR(ref16_);
+  }
+  int subpel_search;
+  for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
+       ++subpel_search) {
+    for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+      const InterpFilter filter = (InterpFilter)filter_bank;
+      const InterpKernel *filters =
+          (const InterpKernel *)av1_get_interp_filter_kernel(filter,
+                                                             subpel_search);
+      for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+        for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+          wrapper_filter_block2d_8_c(in, kInputStride, filters[filter_x],
+                                     filters[filter_y], ref, kOutputStride,
+                                     Width(), Height());
+
+          if (filter_x && filter_y)
+            continue;
+          else if (filter_y)
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->v8_(in, kInputStride, out, kOutputStride, kInvalidFilter,
+                          16, filters[filter_y], 16, Width(), Height()));
+          else if (filter_x)
+            ASM_REGISTER_STATE_CHECK(UUT_->h8_(
+                in, kInputStride, out, kOutputStride, filters[filter_x], 16,
+                kInvalidFilter, 16, Width(), Height()));
+          else
+            ASM_REGISTER_STATE_CHECK(UUT_->copy_(
+                in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
+                kInvalidFilter, 0, Width(), Height()));
+
+          CheckGuardBlocks();
+
+          for (int y = 0; y < Height(); ++y)
+            for (int x = 0; x < Width(); ++x)
+              ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                        lookup(out, y * kOutputStride + x))
+                  << "mismatch at (" << x << "," << y << "), "
+                  << "filters (" << filter_bank << "," << filter_x << ","
+                  << filter_y << ")";
+        }
+      }
+    }
+  }
+}
+
+TEST_P(ConvolveTest, FilterExtremes) {
+  uint8_t *const in = input();
+  uint8_t *const out = output();
+  uint8_t *ref;
+  if (UUT_->use_highbd_ == 0) {
+    ref = ref8_;
+  } else {
+    ref = CONVERT_TO_BYTEPTR(ref16_);
+  }
+
+  // Populate ref and out with some random data
+  ::libaom_test::ACMRandom prng;
+  for (int y = 0; y < Height(); ++y) {
+    for (int x = 0; x < Width(); ++x) {
+      uint16_t r;
+      if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
+        r = prng.Rand8Extremes();
+      } else {
+        r = prng.Rand16() & mask_;
+      }
+      assign_val(out, y * kOutputStride + x, r);
+      assign_val(ref, y * kOutputStride + x, r);
+    }
+  }
+
+  for (int axis = 0; axis < 2; axis++) {
+    int seed_val = 0;
+    while (seed_val < 256) {
+      for (int y = 0; y < 8; ++y) {
+        for (int x = 0; x < 8; ++x) {
+          assign_val(in, y * kOutputStride + x - SUBPEL_TAPS / 2 + 1,
+                     ((seed_val >> (axis ? y : x)) & 1) * mask_);
+          if (axis) seed_val++;
+        }
+        if (axis)
+          seed_val -= 8;
+        else
+          seed_val++;
+      }
+      if (axis) seed_val += 8;
+      int subpel_search;
+      for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
+           ++subpel_search) {
+        for (int filter_bank = 0; filter_bank < kNumFilterBanks;
+             ++filter_bank) {
+          const InterpFilter filter = (InterpFilter)filter_bank;
+          const InterpKernel *filters =
+              (const InterpKernel *)av1_get_interp_filter_kernel(filter,
+                                                                 subpel_search);
+          for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+            for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+              wrapper_filter_block2d_8_c(in, kInputStride, filters[filter_x],
+                                         filters[filter_y], ref, kOutputStride,
+                                         Width(), Height());
+              if (filter_x && filter_y)
+                continue;
+              else if (filter_y)
+                ASM_REGISTER_STATE_CHECK(UUT_->v8_(
+                    in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
+                    filters[filter_y], 16, Width(), Height()));
+              else if (filter_x)
+                ASM_REGISTER_STATE_CHECK(UUT_->h8_(
+                    in, kInputStride, out, kOutputStride, filters[filter_x], 16,
+                    kInvalidFilter, 16, Width(), Height()));
+              else
+                ASM_REGISTER_STATE_CHECK(UUT_->copy_(
+                    in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
+                    kInvalidFilter, 0, Width(), Height()));
+
+              for (int y = 0; y < Height(); ++y)
+                for (int x = 0; x < Width(); ++x)
+                  ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                            lookup(out, y * kOutputStride + x))
+                      << "mismatch at (" << x << "," << y << "), "
+                      << "filters (" << filter_bank << "," << filter_x << ","
+                      << filter_y << ")";
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  aom_usec_timer timer;
+
+  aom_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->copy_(in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0, width,
+                height);
+  }
+  aom_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+  printf("convolve_copy_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_Speed) {
+  uint8_t *const in = input();
+  uint8_t *const out = output();
+  uint8_t *ref;
+  if (UUT_->use_highbd_ == 0) {
+    ref = ref8_;
+  } else {
+    ref = CONVERT_TO_BYTEPTR(ref16_);
+  }
+
+  // Populate ref and out with some random data
+  ::libaom_test::ACMRandom prng;
+  for (int y = 0; y < Height(); ++y) {
+    for (int x = 0; x < Width(); ++x) {
+      uint16_t r;
+      if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
+        r = prng.Rand8Extremes();
+      } else {
+        r = prng.Rand16() & mask_;
+      }
+      assign_val(out, y * kOutputStride + x, r);
+      assign_val(ref, y * kOutputStride + x, r);
+    }
+  }
+
+  const InterpFilter filter = (InterpFilter)1;
+  const InterpKernel *filters =
+      (const InterpKernel *)av1_get_interp_filter_kernel(filter, USE_8_TAPS);
+  wrapper_filter_average_block2d_8_c(in, kInputStride, filters[1], filters[1],
+                                     out, kOutputStride, Width(), Height());
+
+  aom_usec_timer timer;
+  int tests_num = 1000;
+
+  aom_usec_timer_start(&timer);
+  while (tests_num > 0) {
+    for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+      const InterpFilter filter = (InterpFilter)filter_bank;
+      const InterpKernel *filters =
+          (const InterpKernel *)av1_get_interp_filter_kernel(filter,
+                                                             USE_8_TAPS);
+      for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+        for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+          if (filter_x && filter_y) continue;
+          if (filter_y)
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->v8_(in, kInputStride, out, kOutputStride, kInvalidFilter,
+                          16, filters[filter_y], 16, Width(), Height()));
+          else if (filter_x)
+            ASM_REGISTER_STATE_CHECK(UUT_->h8_(
+                in, kInputStride, out, kOutputStride, filters[filter_x], 16,
+                kInvalidFilter, 16, Width(), Height()));
+        }
+      }
+    }
+    tests_num--;
+  }
+  aom_usec_timer_mark(&timer);
+
+  const int elapsed_time =
+      static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
+  printf("%dx%d (bitdepth %d) time: %5d ms\n", Width(), Height(),
+         UUT_->use_highbd_, elapsed_time);
+}
+
+using std::make_tuple;
+
+// WRAP macro is only used for high bitdepth build.
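+// As an illustration (hand-expanded here for exposition; the preprocessor
+// emits the equivalent), WRAP(convolve_copy_c, 10) defines a function
+// wrap_convolve_copy_c_10() with the low-bitdepth ConvolveFunc signature that
+// simply forwards to aom_highbd_convolve_copy_c() with bd fixed at 10, so the
+// high-bitdepth kernels can be driven by the same ConvolveTest harness.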
+#if CONFIG_AV1_HIGHBITDEPTH
+#define WRAP(func, bd)                                                       \
+  static void wrap_##func##_##bd(                                            \
+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
+      ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride,    \
+      const int16_t *filter_y, int filter_y_stride, int w, int h) {          \
+    aom_highbd_##func(src, src_stride, dst, dst_stride, filter_x,            \
+                      filter_x_stride, filter_y, filter_y_stride, w, h, bd); \
+  }
+#if HAVE_SSE2 && ARCH_X86_64
+WRAP(convolve_copy_sse2, 8)
+WRAP(convolve_copy_sse2, 10)
+WRAP(convolve_copy_sse2, 12)
+WRAP(convolve8_horiz_sse2, 8)
+WRAP(convolve8_vert_sse2, 8)
+WRAP(convolve8_horiz_sse2, 10)
+WRAP(convolve8_vert_sse2, 10)
+WRAP(convolve8_horiz_sse2, 12)
+WRAP(convolve8_vert_sse2, 12)
+#endif  // HAVE_SSE2 && ARCH_X86_64
+
+WRAP(convolve_copy_c, 8)
+WRAP(convolve8_horiz_c, 8)
+WRAP(convolve8_vert_c, 8)
+WRAP(convolve_copy_c, 10)
+WRAP(convolve8_horiz_c, 10)
+WRAP(convolve8_vert_c, 10)
+WRAP(convolve_copy_c, 12)
+WRAP(convolve8_horiz_c, 12)
+WRAP(convolve8_vert_c, 12)
+
+#if HAVE_AVX2
+WRAP(convolve_copy_avx2, 8)
+WRAP(convolve8_horiz_avx2, 8)
+WRAP(convolve8_vert_avx2, 8)
+
+WRAP(convolve_copy_avx2, 10)
+WRAP(convolve8_horiz_avx2, 10)
+WRAP(convolve8_vert_avx2, 10)
+
+WRAP(convolve_copy_avx2, 12)
+WRAP(convolve8_horiz_avx2, 12)
+WRAP(convolve8_vert_avx2, 12)
+#endif  // HAVE_AVX2
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+#undef WRAP
+
+#if CONFIG_AV1_HIGHBITDEPTH
+const ConvolveFunctions wrap_convolve8_c(wrap_convolve_copy_c_8,
+                                         wrap_convolve8_horiz_c_8,
+                                         wrap_convolve8_vert_c_8, 8);
+const ConvolveFunctions wrap_convolve10_c(wrap_convolve_copy_c_10,
+                                          wrap_convolve8_horiz_c_10,
+                                          wrap_convolve8_vert_c_10, 10);
+const ConvolveFunctions wrap_convolve12_c(wrap_convolve_copy_c_12,
+                                          wrap_convolve8_horiz_c_12,
+                                          wrap_convolve8_vert_c_12, 12);
+const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(wrap_convolve8_c),
+                                           ALL_SIZES(wrap_convolve10_c),
+                                           ALL_SIZES(wrap_convolve12_c) };
+#else
+const ConvolveFunctions convolve8_c(aom_convolve_copy_c, aom_convolve8_horiz_c,
+                                    aom_convolve8_vert_c, 0);
+const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c) };
+#endif
+
+INSTANTIATE_TEST_SUITE_P(C, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_c));
+
+#if HAVE_SSE2 && ARCH_X86_64
+#if CONFIG_AV1_HIGHBITDEPTH
+const ConvolveFunctions wrap_convolve8_sse2(wrap_convolve_copy_sse2_8,
+                                            wrap_convolve8_horiz_sse2_8,
+                                            wrap_convolve8_vert_sse2_8, 8);
+const ConvolveFunctions wrap_convolve10_sse2(wrap_convolve_copy_sse2_10,
+                                             wrap_convolve8_horiz_sse2_10,
+                                             wrap_convolve8_vert_sse2_10, 10);
+const ConvolveFunctions wrap_convolve12_sse2(wrap_convolve_copy_sse2_12,
+                                             wrap_convolve8_horiz_sse2_12,
+                                             wrap_convolve8_vert_sse2_12, 12);
+const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(wrap_convolve8_sse2),
+                                              ALL_SIZES(wrap_convolve10_sse2),
+                                              ALL_SIZES(wrap_convolve12_sse2) };
+#else
+const ConvolveFunctions convolve8_sse2(aom_convolve_copy_c,
+                                       aom_convolve8_horiz_sse2,
+                                       aom_convolve8_vert_sse2, 0);
+const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2) };
+#endif
+INSTANTIATE_TEST_SUITE_P(SSE2, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_sse2));
+#endif
+
+#if HAVE_SSSE3
+const ConvolveFunctions convolve8_ssse3(aom_convolve_copy_c,
+                                        aom_convolve8_horiz_ssse3,
+                                        aom_convolve8_vert_ssse3, 0);
+
+const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) };
+INSTANTIATE_TEST_SUITE_P(SSSE3, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve8_ssse3));
+#endif
+
+#if HAVE_AVX2
+#if CONFIG_AV1_HIGHBITDEPTH
+const ConvolveFunctions wrap_convolve8_avx2(wrap_convolve_copy_avx2_8,
+                                            wrap_convolve8_horiz_avx2_8,
+                                            wrap_convolve8_vert_avx2_8, 8);
+const ConvolveFunctions wrap_convolve10_avx2(wrap_convolve_copy_avx2_10,
+                                             wrap_convolve8_horiz_avx2_10,
+                                             wrap_convolve8_vert_avx2_10, 10);
+const ConvolveFunctions wrap_convolve12_avx2(wrap_convolve_copy_avx2_12,
+                                             wrap_convolve8_horiz_avx2_12,
+                                             wrap_convolve8_vert_avx2_12, 12);
+const ConvolveParam kArray_Convolve8_avx2[] = {
+  ALL_SIZES_64(wrap_convolve8_avx2), ALL_SIZES_64(wrap_convolve10_avx2),
+  ALL_SIZES_64(wrap_convolve12_avx2)
+};
+#else
+const ConvolveFunctions convolve8_avx2(aom_convolve_copy_c,
+                                       aom_convolve8_horiz_avx2,
+                                       aom_convolve8_vert_avx2, 0);
+const ConvolveParam kArray_Convolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
+#endif
+
+INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest,
+                         ::testing::ValuesIn(kArray_Convolve8_avx2));
+#endif  // HAVE_AVX2
+
+}  // namespace
diff --git a/libs/libaom/src/test/corner_match_test.cc b/libs/libaom/src/test/corner_match_test.cc
new file mode 100644
index 000000000..c685dca80
--- /dev/null
+++ b/libs/libaom/src/test/corner_match_test.cc
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+
+#include "av1/encoder/corner_match.h"
+
+namespace test_libaom {
+
+namespace AV1CornerMatch {
+
+using libaom_test::ACMRandom;
+
+typedef double (*ComputeCrossCorrFunc)(unsigned char *im1, int stride1, int x1,
+                                       int y1, unsigned char *im2, int stride2,
+                                       int x2, int y2);
+
+using std::make_tuple;
+using std::tuple;
+typedef tuple<int, ComputeCrossCorrFunc> CornerMatchParam;
+
+class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> {
+ public:
+  virtual ~AV1CornerMatchTest();
+  virtual void SetUp();
+
+  virtual void TearDown();
+
+ protected:
+  void RunCheckOutput(int run_times);
+  ComputeCrossCorrFunc target_func;
+
+  libaom_test::ACMRandom rnd_;
+};
+
+AV1CornerMatchTest::~AV1CornerMatchTest() {}
+void AV1CornerMatchTest::SetUp() {
+  rnd_.Reset(ACMRandom::DeterministicSeed());
+  target_func = GET_PARAM(1);
+}
+void AV1CornerMatchTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1CornerMatchTest::RunCheckOutput(int run_times) {
+  const int w = 128, h = 128;
+  const int num_iters = 10000;
+  int i, j;
+  aom_usec_timer ref_timer, test_timer;
+
+  uint8_t *input1 = new uint8_t[w * h];
+  uint8_t *input2 = new uint8_t[w * h];
+
+  // Test the two extreme cases:
+  // i) Random data, should have correlation close to 0
+  // ii) Linearly related data + noise, should have correlation close to 1
+  int mode = GET_PARAM(0);
+  if (mode == 0) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        input1[i * w + j] = rnd_.Rand8();
+        input2[i * w + j] = rnd_.Rand8();
+      }
+  } else if (mode == 1) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        int v = rnd_.Rand8();
+        input1[i * w + j] = v;
+        input2[i * w + j] = (v / 2) + (rnd_.Rand8() & 15);
+      }
+  }
+
+  for (i = 0; i < num_iters; ++i) {
+    int x1 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w - 2 * MATCH_SZ_BY2);
+    int y1 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h - 2 * MATCH_SZ_BY2);
+    int x2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w - 2 * MATCH_SZ_BY2);
+    int y2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h - 2 * MATCH_SZ_BY2);
+
+    double res_c =
+        av1_compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2);
+    double res_simd = target_func(input1, w, x1, y1, input2, w, x2, y2);
+
+    if (run_times > 1) {
+      aom_usec_timer_start(&ref_timer);
+      for (j = 0; j < run_times; j++) {
+        av1_compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2);
+      }
+      aom_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+      aom_usec_timer_start(&test_timer);
+      for (j = 0; j < run_times; j++) {
+        target_func(input1, w, x1, y1, input2, w, x2, y2);
+      }
+      aom_usec_timer_mark(&test_timer);
+      const int elapsed_time_simd =
+          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+      printf(
+          "c_time=%d \t simd_time=%d \t "
+          "gain=%d\n",
+          elapsed_time_c, elapsed_time_simd,
+          (elapsed_time_c / elapsed_time_simd));
+    } else {
+      ASSERT_EQ(res_simd, res_c);
+    }
+  }
+  delete[] input1;
+  delete[] input2;
+}
+
+TEST_P(AV1CornerMatchTest, CheckOutput) { RunCheckOutput(1); }
+TEST_P(AV1CornerMatchTest, DISABLED_Speed) { RunCheckOutput(100000); }
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, AV1CornerMatchTest,
+    ::testing::Values(make_tuple(0, &av1_compute_cross_correlation_sse4_1),
+                      make_tuple(1, &av1_compute_cross_correlation_sse4_1)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1CornerMatchTest,
+    ::testing::Values(make_tuple(0, &av1_compute_cross_correlation_avx2),
+                      make_tuple(1, &av1_compute_cross_correlation_avx2)));
+#endif
+}  // namespace AV1CornerMatch
+
+}  // namespace test_libaom
diff --git a/libs/libaom/src/test/cpu_speed_test.cc b/libs/libaom/src/test/cpu_speed_test.cc
new file mode 100644
index 000000000..2a164974b
--- /dev/null
+++ b/libs/libaom/src/test/cpu_speed_test.cc
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+
+const int kMaxPSNR = 100;
+
+class CpuSpeedTest
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  CpuSpeedTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR),
+        tune_content_(AOM_CONTENT_DEFAULT) {}
+  virtual ~CpuSpeedTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libaom_test::kRealTime) {
+      cfg_.g_lag_in_frames = 25;
+      cfg_.rc_end_usage = AOM_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = AOM_CBR;
+    }
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) { min_psnr_ = kMaxPSNR; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_SET_TUNE_CONTENT, tune_content_);
+      if (encoding_mode_ != ::libaom_test::kRealTime) {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      }
+    }
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    if (pkt->data.psnr.psnr[0] < min_psnr_) min_psnr_ = pkt->data.psnr.psnr[0];
+  }
+
+  void TestQ0();
+  void TestScreencastQ0();
+  void TestTuneScreen();
+  void TestEncodeHighBitrate();
+  void TestLowBitrate();
+
+  ::libaom_test::TestMode encoding_mode_;
+  int set_cpu_used_;
+  double min_psnr_;
+  int tune_content_;
+};
+
+void CpuSpeedTest::TestQ0() {
+  // Validate that this non multiple of 64 wide clip encodes and decodes
+  // without a mismatch when passing in a very low max q. This pushes
+  // the encoder to producing lots of big partitions which will likely
+  // extend into the border and test the border condition.
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 400;
+  cfg_.rc_max_quantizer = 0;
+  cfg_.rc_min_quantizer = 0;
+
+  ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       10);
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GE(min_psnr_, kMaxPSNR);
+}
+
+void CpuSpeedTest::TestScreencastQ0() {
+  ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 3);
+  cfg_.g_timebase = video.timebase();
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 400;
+  cfg_.rc_max_quantizer = 0;
+  cfg_.rc_min_quantizer = 0;
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GE(min_psnr_, kMaxPSNR);
+}
+
+void CpuSpeedTest::TestTuneScreen() {
+  ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 3);
+  cfg_.g_timebase = video.timebase();
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_min_quantizer = 0;
+  tune_content_ = AOM_CONTENT_SCREEN;
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+void CpuSpeedTest::TestEncodeHighBitrate() {
+  // Validate that this non multiple of 64 wide clip encodes and decodes
+  // without a mismatch when passing in a very low max q. This pushes
+  // the encoder to producing lots of big partitions which will likely
+  // extend into the border and test the border condition.
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 12000;
+  cfg_.rc_max_quantizer = 10;
+  cfg_.rc_min_quantizer = 0;
+
+  ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       10);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+void CpuSpeedTest::TestLowBitrate() {
+  // Validate that this clip encodes and decodes without a mismatch
+  // when passing in a very high min q. This pushes the encoder to producing
+  // lots of small partitions which will test the other condition.
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.rc_min_quantizer = 40;
+
+  ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       10);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(CpuSpeedTest, TestQ0) { TestQ0(); }
+TEST_P(CpuSpeedTest, TestScreencastQ0) { TestScreencastQ0(); }
+TEST_P(CpuSpeedTest, TestTuneScreen) { TestTuneScreen(); }
+TEST_P(CpuSpeedTest, TestEncodeHighBitrate) { TestEncodeHighBitrate(); }
+TEST_P(CpuSpeedTest, TestLowBitrate) { TestLowBitrate(); }
+
+class CpuSpeedTestLarge : public CpuSpeedTest {};
+
+TEST_P(CpuSpeedTestLarge, TestQ0) { TestQ0(); }
+TEST_P(CpuSpeedTestLarge, TestScreencastQ0) { TestScreencastQ0(); }
+TEST_P(CpuSpeedTestLarge, TestTuneScreen) { TestTuneScreen(); }
+TEST_P(CpuSpeedTestLarge, TestEncodeHighBitrate) { TestEncodeHighBitrate(); }
+TEST_P(CpuSpeedTestLarge, TestLowBitrate) { TestLowBitrate(); }
+
+AV1_INSTANTIATE_TEST_CASE(CpuSpeedTest,
+                          ::testing::Values(::libaom_test::kTwoPassGood,
+                                            ::libaom_test::kOnePassGood),
+                          ::testing::Range(1, 3));
+AV1_INSTANTIATE_TEST_CASE(CpuSpeedTestLarge,
+                          ::testing::Values(::libaom_test::kTwoPassGood,
+                                            ::libaom_test::kOnePassGood),
+                          ::testing::Range(0, 1));
+}  // namespace
diff --git a/libs/libaom/src/test/datarate_test.cc b/libs/libaom/src/test/datarate_test.cc
new file mode 100644
index 000000000..053c05571
--- /dev/null
+++ b/libs/libaom/src/test/datarate_test.cc
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/datarate_test.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "aom/aom_codec.h"
+
+namespace datarate_test {
+namespace {
+
+// Params: test mode, speed, aq mode and index for bitrate array.
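+// For reference: GET_PARAM(1) supplies the test mode passed to SetMode(),
+// GET_PARAM(2) the cpu-used speed, GET_PARAM(3) the aq mode, and GET_PARAM(4)
+// the index into each test's local bitrate_array (GET_PARAM(0) is the codec).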
+// Params: test mode, speed, aq mode and index for bitrate array.
+class DatarateTestLarge
+    : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int,
+                                                 unsigned int, int>,
+      public DatarateTest {
+ public:
+  DatarateTestLarge() : DatarateTest(GET_PARAM(0)) {
+    set_cpu_used_ = GET_PARAM(2);
+    aq_mode_ = GET_PARAM(3);
+  }
+
+ protected:
+  virtual ~DatarateTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    ResetModel();
+  }
+
+  virtual void BasicRateTargetingVBRTest() {
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.g_error_resilient = 0;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.g_lag_in_frames = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 140);
+    const int bitrate_array[2] = { 400, 800 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.7)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.3)
+        << " The datarate for the file is greater than target by too much!";
+  }
+
+  virtual void BasicRateTargetingCBRTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 140);
+    const int bitrate_array[2] = { 150, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
+        << " The datarate for the file is greater than target by too much!";
+  }
+
+  virtual void BasicRateTargetingCBRPeriodicKeyFrameTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    // Periodic keyframe
+    cfg_.kf_max_dist = 50;
+
+    ::libaom_test::I420VideoSource video("pixel_capture_w320h240.yuv", 320,
+                                         240, 30, 1, 0, 310);
+    const int bitrate_array[2] = { 150, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
+        << " The datarate for the file is greater than target by too much!";
+  }
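The CBR tests above size the decoder buffer in milliseconds of the target rate (rc_buf_initial_sz / rc_buf_optimal_sz / rc_buf_sz = 500 / 500 / 1000), and datarate_test.h tracks a matching leaky-bucket model that must never underrun. A sketch of one generic bucket update, under our own naming (the harness inlines its version in FramePktHook):

    #include <cstddef>
    #include <cstdint>

    // Illustrative leaky-bucket step: the channel feeds in target-rate bits
    // for the frame's duration, and the frame drains its own coded size out.
    int64_t LeakyBucketStep(int64_t bits_in_buffer, double frame_secs,
                            int target_kbps, size_t frame_bytes) {
      bits_in_buffer += static_cast<int64_t>(frame_secs * target_kbps * 1000);
      bits_in_buffer -= static_cast<int64_t>(frame_bytes) * 8;
      return bits_in_buffer;  // CBR rate control keeps this non-negative
    }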
+
+  virtual void BasicRateTargetingAQModeOnOffCBRTest() {
+    if (GET_PARAM(4) > 0) return;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_pass = AOM_RC_ONE_PASS;
+    cfg_.g_usage = AOM_USAGE_REALTIME;
+    cfg_.kf_mode = AOM_KF_DISABLED;
+
+    ::libaom_test::I420VideoSource video("pixel_capture_w320h240.yuv", 320,
+                                         240, 30, 1, 0, 310);
+    const int bitrate_array[1] = { 60 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
+        << " The datarate for the file is greater than target by too much!";
+  }
+
+  virtual void BasicRateTargeting444CBRTest() {
+    ::libaom_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+
+    cfg_.g_profile = 1;
+    cfg_.g_timebase = video.timebase();
+
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+
+    const int bitrate_array[2] = { 250, 650 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_ * 0.85)
+        << " The datarate for the file exceeds the target by too much!";
+    ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_ * 1.15)
+        << " The datarate for the file missed the target!"
+        << cfg_.rc_target_bitrate << " " << effective_datarate_;
+  }
+};
+
+// Params: test mode, speed, aq mode.
+class DatarateTestFrameDropLarge
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+                                                 unsigned int>,
+      public DatarateTest {
+ public:
+  DatarateTestFrameDropLarge() : DatarateTest(GET_PARAM(0)) {
+    set_cpu_used_ = GET_PARAM(2);
+    aq_mode_ = GET_PARAM(3);
+  }
+
+ protected:
+  virtual ~DatarateTestFrameDropLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    ResetModel();
+  }
+
+  virtual void ChangingDropFrameThreshTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_undershoot_pct = 20;
+    cfg_.rc_overshoot_pct = 20;
+    cfg_.rc_dropframe_thresh = 10;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 50;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.rc_target_bitrate = 200;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    // TODO(marpan): Investigate datarate target failures with a smaller
+    // keyframe interval (128).
+    cfg_.kf_max_dist = 9999;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 100);
+
+    const int kDropFrameThreshTestStep = 30;
+    aom_codec_pts_t last_drop = 140;
+    int last_num_drops = 0;
+    for (int i = 40; i < 100; i += kDropFrameThreshTestStep) {
+      cfg_.rc_dropframe_thresh = i;
+      ResetModel();
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.17)
+          << " The datarate for the file is greater than target by too much!";
+      if (last_drop > 0) {
+        ASSERT_LE(first_drop_, last_drop)
+            << " The first dropped frame for drop_thresh " << i
+            << " > first dropped frame for drop_thresh "
+            << i - kDropFrameThreshTestStep;
+      }
+      ASSERT_GE(num_drops_, last_num_drops * 0.7)
+          << " The number of dropped frames for drop_thresh " << i
+          << " < number of dropped frames for drop_thresh "
+          << i - kDropFrameThreshTestStep;
+      last_drop = first_drop_;
+      last_num_drops = num_drops_;
+    }
+  }
+};
+
+// Check basic rate targeting for VBR mode.
+TEST_P(DatarateTestLarge, BasicRateTargetingVBR) {
+  BasicRateTargetingVBRTest();
+}
+
+// Check basic rate targeting for CBR.
+TEST_P(DatarateTestLarge, BasicRateTargetingCBR) {
+  BasicRateTargetingCBRTest();
+}
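ChangingDropFrameThreshTest above re-encodes the same clip at increasing rc_dropframe_thresh values and verifies two monotonicity properties across runs. A condensed sketch of the invariant, with our own variable names:

    #include <cassert>
    #include <cstdint>

    // Hypothetical check mirroring the loop above: as the drop threshold
    // rises, the first drop must come no later than before, and the drop
    // count must not shrink by more than the 30% slack the test allows.
    void CheckDropStats(int64_t prev_first_drop, int64_t cur_first_drop,
                        int prev_num_drops, int cur_num_drops) {
      assert(cur_first_drop <= prev_first_drop);
      assert(cur_num_drops >= prev_num_drops * 0.7);
    }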
+
+// Check basic rate targeting for periodic key frame.
+TEST_P(DatarateTestLarge, PeriodicKeyFrameCBR) {
+  BasicRateTargetingCBRPeriodicKeyFrameTest();
+}
+
+// Check basic rate targeting for CBR with 4:4:4 input.
+TEST_P(DatarateTestLarge, BasicRateTargeting444CBR) {
+  BasicRateTargeting444CBRTest();
+}
+
+// Check that (1) the first dropped frame gets earlier and earlier
+// as the drop frame threshold is increased, and (2) that the total number of
+// frame drops does not decrease as we increase frame drop threshold.
+// Use a lower qp-max to force some frame drops.
+TEST_P(DatarateTestFrameDropLarge, ChangingDropFrameThresh) {
+  ChangingDropFrameThreshTest();
+}
+
+TEST_P(DatarateTestLarge, BasicRateTargetingAQModeOnOffCBR) {
+  BasicRateTargetingAQModeOnOffCBRTest();
+}
+
+class DatarateTestRealtime : public DatarateTestLarge {};
+
+class DatarateTestFrameDropRealtime : public DatarateTestFrameDropLarge {};
+
+// Params: test mode, aq mode.
+class DatarateTestSpeedChangeRealtime
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
+                                                 unsigned int>,
+      public DatarateTest {
+ public:
+  DatarateTestSpeedChangeRealtime() : DatarateTest(GET_PARAM(0)) {
+    aq_mode_ = GET_PARAM(2);
+    speed_change_test_ = true;
+  }
+
+ protected:
+  virtual ~DatarateTestSpeedChangeRealtime() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    ResetModel();
+  }
+
+  virtual void ChangingSpeedTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_undershoot_pct = 20;
+    cfg_.rc_overshoot_pct = 20;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 50;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.rc_target_bitrate = 200;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    // TODO(marpan): Investigate datarate target failures with a smaller
+    // keyframe interval (128).
+    cfg_.kf_max_dist = 9999;
+    cfg_.rc_dropframe_thresh = 0;
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 100);
+
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.83)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.20)
+        << " The datarate for the file is greater than target by too much!";
+  }
+};
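When speed_change_test_ is set, the PreEncodeFrameHook in datarate_test.h re-issues AOME_SET_CPUUSED at frames 0/30/60/90 (speed 8, 7, 6, then back to 7). Outside the harness, the same mid-stream switch is just a codec control call between frames; a minimal sketch under that assumption (encoder setup elided, helper name ours):

    #include "aom/aom_encoder.h"
    #include "aom/aomcx.h"

    // Illustrative only: change encoder speed between frames via the public
    // control API, matching the schedule the speed-change test uses.
    void MaybeSwitchSpeed(aom_codec_ctx_t *ctx, unsigned int frame_index) {
      if (frame_index == 0) aom_codec_control(ctx, AOME_SET_CPUUSED, 8);
      if (frame_index == 30) aom_codec_control(ctx, AOME_SET_CPUUSED, 7);
      if (frame_index == 60) aom_codec_control(ctx, AOME_SET_CPUUSED, 6);
      if (frame_index == 90) aom_codec_control(ctx, AOME_SET_CPUUSED, 7);
    }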
+
+// Check basic rate targeting for VBR mode.
+TEST_P(DatarateTestRealtime, BasicRateTargetingVBR) {
+  BasicRateTargetingVBRTest();
+}
+
+// Check basic rate targeting for CBR.
+TEST_P(DatarateTestRealtime, BasicRateTargetingCBR) {
+  BasicRateTargetingCBRTest();
+}
+
+// Check basic rate targeting for periodic key frame.
+TEST_P(DatarateTestRealtime, PeriodicKeyFrameCBR) {
+  BasicRateTargetingCBRPeriodicKeyFrameTest();
+}
+
+// Check basic rate targeting for CBR with 4:4:4 input.
+TEST_P(DatarateTestRealtime, BasicRateTargeting444CBR) {
+  BasicRateTargeting444CBRTest();
+}
+
+// Check that (1) the first dropped frame gets earlier and earlier
+// as the drop frame threshold is increased, and (2) that the total number of
+// frame drops does not decrease as we increase frame drop threshold.
+// Use a lower qp-max to force some frame drops.
+TEST_P(DatarateTestFrameDropRealtime, ChangingDropFrameThresh) {
+  ChangingDropFrameThreshTest();
+}
+
+TEST_P(DatarateTestSpeedChangeRealtime, ChangingSpeedTest) {
+  ChangingSpeedTest();
+}
+
+AV1_INSTANTIATE_TEST_CASE(DatarateTestLarge,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(5, 7), ::testing::Values(0, 3),
+                          ::testing::Values(0, 1));
+
+AV1_INSTANTIATE_TEST_CASE(DatarateTestFrameDropLarge,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(5, 7), ::testing::Values(0, 3));
+
+AV1_INSTANTIATE_TEST_CASE(DatarateTestRealtime,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(7, 9), ::testing::Values(0, 3),
+                          ::testing::Values(0, 1));
+
+AV1_INSTANTIATE_TEST_CASE(DatarateTestFrameDropRealtime,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(7, 9), ::testing::Values(0, 3));
+
+AV1_INSTANTIATE_TEST_CASE(DatarateTestSpeedChangeRealtime,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Values(0, 3));
+
+} // namespace
+} // namespace datarate_test
diff --git a/libs/libaom/src/test/datarate_test.h b/libs/libaom/src/test/datarate_test.h
new file mode 100644
index 000000000..3c1573119
--- /dev/null
+++ b/libs/libaom/src/test/datarate_test.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "aom/aom_codec.h"
+
+namespace datarate_test {
+namespace {
+class DatarateTest : public ::libaom_test::EncoderTest {
+ public:
+  explicit DatarateTest(const ::libaom_test::CodecFactory *codec)
+      : EncoderTest(codec), set_cpu_used_(0), aq_mode_(0),
+        speed_change_test_(false) {}
+
+ protected:
+  virtual ~DatarateTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    ResetModel();
+  }
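ResetModel() below seeds bits_in_buffer_model_ as rc_target_bitrate * rc_buf_initial_sz, which needs no scaling because the units cancel: kilobits per second times milliseconds is exactly bits. A worked example under those units (names ours):

    // 150 kbit/s target with a 500 ms initial buffer:
    //   (kbit/s) * ms = (1000 bits / 1000 ms) * ms = bits
    static const int64_t kExampleInitialBufferBits =
        150 /* kbit/s */ * 500 /* ms */;  // == 75000 bits, no rescaling needed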
+
+  virtual void ResetModel() {
+    last_pts_ = 0;
+    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+    frame_number_ = 0;
+    tot_frame_number_ = 0;
+    first_drop_ = 0;
+    num_drops_ = 0;
+    // Denoiser is off by default.
+    denoiser_on_ = 0;
+    bits_total_ = 0;
+    denoiser_offon_test_ = 0;
+    denoiser_offon_period_ = -1;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 0);
+      if (cfg_.g_usage == AOM_USAGE_REALTIME) {
+        encoder->Control(AV1E_SET_DELTAQ_MODE, 0);
+        encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0);
+        encoder->Control(AV1E_SET_ENABLE_CDEF, 1);
+        encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2);
+        encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2);
+        encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 2);
+      }
+    }
+
+    if (speed_change_test_) {
+      if (video->frame() == 0) {
+        encoder->Control(AOME_SET_CPUUSED, 8);
+      }
+      if (video->frame() == 30) {
+        encoder->Control(AOME_SET_CPUUSED, 7);
+      }
+      if (video->frame() == 60) {
+        encoder->Control(AOME_SET_CPUUSED, 6);
+      }
+      if (video->frame() == 90) {
+        encoder->Control(AOME_SET_CPUUSED, 7);
+      }
+    }
+
+    if (denoiser_offon_test_) {
+      ASSERT_GT(denoiser_offon_period_, 0)
+          << "denoiser_offon_period_ is not positive.";
+      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+        // Flip denoiser_on_ periodically.
+        denoiser_on_ ^= 1;
+      }
+    }
+
+    encoder->Control(AV1E_SET_NOISE_SENSITIVITY, denoiser_on_);
+
+    const aom_rational_t tb = video->timebase();
+    timebase_ = static_cast<double>(tb.num) / tb.den;
+    duration_ = 0;
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    // Time since last timestamp = duration.
+    aom_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+
+    if (duration > 1) {
+      // If first drop is not set and we have a drop, set it to this time.
+      if (!first_drop_) first_drop_ = last_pts_ + 1;
+      // Update the number of frame drops.
+      num_drops_ += static_cast<int>(duration - 1);
+      // Update counter for total number of frames (#frames input to encoder).
+      // Needed for setting the proper layer_id below.
+      tot_frame_number_ += static_cast<int>(duration - 1);
+    }
+
+    // Add to the buffer the bits we'd expect from a constant bitrate server.
+    bits_in_buffer_model_ += static_cast<int64_t>(
+        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+
+    // Buffer should not go negative.
+    ASSERT_GE(bits_in_buffer_model_, 0)
+        << "Buffer Underrun at frame " << pkt->data.frame.pts;
+
+    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+
+    // Update the total encoded bits.
+    bits_total_ += frame_size_in_bits;
+
+    // Update the most recent pts.
+    last_pts_ = pkt->data.frame.pts;
+    ++frame_number_;
+    ++tot_frame_number_;
+  }
+
+  virtual void EndPassHook(void) {
+    duration_ = (last_pts_ + 1) * timebase_;
+    // Effective file datarate:
+    effective_datarate_ = (bits_total_ / 1000.0) / duration_;
+  }
+
+  aom_codec_pts_t last_pts_;
+  double timebase_;
+  int frame_number_;      // Counter for number of non-dropped/encoded frames.
+  int tot_frame_number_;  // Counter for total number of input frames.
+ int64_t bits_total_; + double duration_; + double effective_datarate_; + int set_cpu_used_; + int64_t bits_in_buffer_model_; + aom_codec_pts_t first_drop_; + int num_drops_; + int denoiser_on_; + int denoiser_offon_test_; + int denoiser_offon_period_; + unsigned int aq_mode_; + bool speed_change_test_; +}; + +} // namespace +} // namespace datarate_test diff --git a/libs/libaom/src/test/decode_api_test.cc b/libs/libaom/src/test/decode_api_test.cc new file mode 100644 index 000000000..910640df7 --- /dev/null +++ b/libs/libaom/src/test/decode_api_test.cc @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" + +#include "test/util.h" +#include "aom/aomdx.h" +#include "aom/aom_decoder.h" + +namespace { + +TEST(DecodeAPI, InvalidParams) { + static const aom_codec_iface_t *kCodecs[] = { +#if CONFIG_AV1_DECODER + aom_codec_av1_dx(), +#endif + }; + uint8_t buf[1] = { 0 }; + aom_codec_ctx_t dec; + + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_dec_init(NULL, NULL, NULL, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_dec_init(&dec, NULL, NULL, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(NULL, NULL, 0, NULL)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(NULL, buf, 0, NULL)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_decode(NULL, buf, sizeof(buf), NULL)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_decode(NULL, NULL, sizeof(buf), NULL)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_destroy(NULL)); + EXPECT_TRUE(aom_codec_error(NULL) != NULL); + + for (const aom_codec_iface_t *iface : kCodecs) { + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_dec_init(NULL, iface, NULL, 0)); + + EXPECT_EQ(AOM_CODEC_OK, aom_codec_dec_init(&dec, iface, NULL, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_decode(&dec, NULL, sizeof(buf), NULL)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(&dec, buf, 0, NULL)); + + EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&dec)); + } +} + +} // namespace diff --git a/libs/libaom/src/test/decode_multithreaded_test.cc b/libs/libaom/src/test/decode_multithreaded_test.cc new file mode 100644 index 000000000..92253ede8 --- /dev/null +++ b/libs/libaom/src/test/decode_multithreaded_test.cc @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "aom_mem/aom_mem.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/md5_helper.h" +#include "test/util.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +static const int kNumMultiThreadDecoders = 3; + +class AV1DecodeMultiThreadedTest + : public ::libaom_test::CodecTestWith5Params, + public ::libaom_test::EncoderTest { + protected: + AV1DecodeMultiThreadedTest() + : EncoderTest(GET_PARAM(0)), md5_single_thread_(), md5_multi_thread_(), + n_tile_cols_(GET_PARAM(1)), n_tile_rows_(GET_PARAM(2)), + n_tile_groups_(GET_PARAM(3)), set_cpu_used_(GET_PARAM(4)), + row_mt_(GET_PARAM(5)) { + init_flags_ = AOM_CODEC_USE_PSNR; + aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t(); + cfg.w = 704; + cfg.h = 576; + cfg.threads = 1; + cfg.allow_lowbitdepth = 1; + single_thread_dec_ = codec_->CreateDecoder(cfg, 0); + + // Test cfg.threads == powers of 2. + for (int i = 0; i < kNumMultiThreadDecoders; ++i) { + cfg.threads <<= 1; + multi_thread_dec_[i] = codec_->CreateDecoder(cfg, 0); + multi_thread_dec_[i]->Control(AV1D_SET_ROW_MT, row_mt_); + } + + if (single_thread_dec_->IsAV1()) { + single_thread_dec_->Control(AV1D_EXT_TILE_DEBUG, 1); + single_thread_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1); + single_thread_dec_->Control(AV1_SET_DECODE_TILE_COL, -1); + } + for (int i = 0; i < kNumMultiThreadDecoders; ++i) { + if (multi_thread_dec_[i]->IsAV1()) { + multi_thread_dec_[i]->Control(AV1D_EXT_TILE_DEBUG, 1); + multi_thread_dec_[i]->Control(AV1_SET_DECODE_TILE_ROW, -1); + multi_thread_dec_[i]->Control(AV1_SET_DECODE_TILE_COL, -1); + } + } + } + + virtual ~AV1DecodeMultiThreadedTest() { + delete single_thread_dec_; + for (int i = 0; i < kNumMultiThreadDecoders; ++i) + delete multi_thread_dec_[i]; + } + + virtual void SetUp() { + InitializeConfig(); + SetMode(libaom_test::kTwoPassGood); + } + + virtual void PreEncodeFrameHook(libaom_test::VideoSource *video, + libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_); + encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_); + encoder->Control(AV1E_SET_NUM_TG, n_tile_groups_); + encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); + } + } + + void UpdateMD5(::libaom_test::Decoder *dec, const aom_codec_cx_pkt_t *pkt, + ::libaom_test::MD5 *md5) { + const aom_codec_err_t res = dec->DecodeFrame( + reinterpret_cast(pkt->data.frame.buf), pkt->data.frame.sz); + if (res != AOM_CODEC_OK) { + abort_ = true; + ASSERT_EQ(AOM_CODEC_OK, res); + } + const aom_image_t *img = dec->GetDxData().Next(); + md5->Add(img); + } + + virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) { + UpdateMD5(single_thread_dec_, pkt, &md5_single_thread_); + + for (int i = 0; i < kNumMultiThreadDecoders; ++i) + UpdateMD5(multi_thread_dec_[i], pkt, &md5_multi_thread_[i]); + } + + void DoTest() { + const aom_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = 12; + cfg_.rc_end_usage = AOM_VBR; + + libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 576, + timebase.den, timebase.num, 0, 5); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + const char *md5_single_thread_str = md5_single_thread_.Get(); + + for (int i = 0; i < kNumMultiThreadDecoders; ++i) { + const char *md5_multi_thread_str = md5_multi_thread_[i].Get(); + ASSERT_STREQ(md5_single_thread_str, md5_multi_thread_str); + } + } + + 
::libaom_test::MD5 md5_single_thread_; + ::libaom_test::MD5 md5_multi_thread_[kNumMultiThreadDecoders]; + ::libaom_test::Decoder *single_thread_dec_; + ::libaom_test::Decoder *multi_thread_dec_[kNumMultiThreadDecoders]; + + private: + int n_tile_cols_; + int n_tile_rows_; + int n_tile_groups_; + int set_cpu_used_; + int row_mt_; +}; + +// run an encode and do the decode both in single thread +// and multi thread. Ensure that the MD5 of the output in both cases +// is identical. If so, the test passes. +TEST_P(AV1DecodeMultiThreadedTest, MD5Match) { + cfg_.large_scale_tile = 0; + single_thread_dec_->Control(AV1_SET_TILE_MODE, 0); + for (int i = 0; i < kNumMultiThreadDecoders; ++i) + multi_thread_dec_[i]->Control(AV1_SET_TILE_MODE, 0); + DoTest(); +} + +class AV1DecodeMultiThreadedTestLarge : public AV1DecodeMultiThreadedTest {}; + +TEST_P(AV1DecodeMultiThreadedTestLarge, MD5Match) { + cfg_.large_scale_tile = 0; + single_thread_dec_->Control(AV1_SET_TILE_MODE, 0); + for (int i = 0; i < kNumMultiThreadDecoders; ++i) + multi_thread_dec_[i]->Control(AV1_SET_TILE_MODE, 0); + DoTest(); +} + +// TODO(ranjit): More tests have to be added using pre-generated MD5. +AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedTest, ::testing::Values(1, 2), + ::testing::Values(1, 2), ::testing::Values(1), + ::testing::Values(3), ::testing::Values(0, 1)); +AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedTestLarge, + ::testing::Values(0, 1, 2, 6), + ::testing::Values(0, 1, 2, 6), + ::testing::Values(1, 4), ::testing::Values(0), + ::testing::Values(0, 1)); + +class AV1DecodeMultiThreadedLSTestLarge + : public AV1DecodeMultiThreadedTestLarge {}; + +TEST_P(AV1DecodeMultiThreadedLSTestLarge, MD5Match) { + cfg_.large_scale_tile = 1; + single_thread_dec_->Control(AV1_SET_TILE_MODE, 1); + for (int i = 0; i < kNumMultiThreadDecoders; ++i) + multi_thread_dec_[i]->Control(AV1_SET_TILE_MODE, 1); + DoTest(); +} + +AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedLSTestLarge, + ::testing::Values(6), ::testing::Values(6), + ::testing::Values(1), ::testing::Values(0, 3), + ::testing::Values(0, 1)); + +} // namespace diff --git a/libs/libaom/src/test/decode_perf_test.cc b/libs/libaom/src/test/decode_perf_test.cc new file mode 100644 index 000000000..691337cd6 --- /dev/null +++ b/libs/libaom/src/test/decode_perf_test.cc @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_version.h" + +#include "aom_ports/aom_timer.h" +#include "common/ivfenc.h" +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/ivf_video_source.h" +#include "test/md5_helper.h" +#include "test/util.h" +#include "test/webm_video_source.h" + +using std::make_tuple; + +namespace { + +#define VIDEO_NAME 0 +#define THREADS 1 + +const double kUsecsInSec = 1000000.0; +const char kNewEncodeOutputFile[] = "new_encode.ivf"; + +/* + DecodePerfTest takes a tuple of filename + number of threads to decode with + */ +typedef std::tuple DecodePerfParam; + +// TODO(jimbankoski): Add actual test vectors here when available. +// const DecodePerfParam kAV1DecodePerfVectors[] = {}; + +/* + In order to reflect real world performance as much as possible, Perf tests + *DO NOT* do any correctness checks. Please run them alongside correctness + tests to ensure proper codec integrity. Furthermore, in this test we + deliberately limit the amount of system calls we make to avoid OS + preemption. + + TODO(joshualitt) create a more detailed perf measurement test to collect + power/temp/min max frame decode times/etc + */ + +class DecodePerfTest : public ::testing::TestWithParam {}; + +TEST_P(DecodePerfTest, PerfTest) { + const char *const video_name = GET_PARAM(VIDEO_NAME); + const unsigned threads = GET_PARAM(THREADS); + + libaom_test::WebMVideoSource video(video_name); + video.Init(); + + aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t(); + cfg.threads = threads; + cfg.allow_lowbitdepth = 1; + libaom_test::AV1Decoder decoder(cfg, 0); + + aom_usec_timer t; + aom_usec_timer_start(&t); + + for (video.Begin(); video.cxdata() != NULL; video.Next()) { + decoder.DecodeFrame(video.cxdata(), video.frame_size()); + } + + aom_usec_timer_mark(&t); + const double elapsed_secs = double(aom_usec_timer_elapsed(&t)) / kUsecsInSec; + const unsigned frames = video.frame_number(); + const double fps = double(frames) / elapsed_secs; + + printf("{\n"); + printf("\t\"type\" : \"decode_perf_test\",\n"); + printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"videoName\" : \"%s\",\n", video_name); + printf("\t\"threadCount\" : %u,\n", threads); + printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs); + printf("\t\"totalFrames\" : %u,\n", frames); + printf("\t\"framesPerSecond\" : %f\n", fps); + printf("}\n"); +} + +// TODO(jimbankoski): Enabled when we have actual AV1 Decode vectors. 
+// INSTANTIATE_TEST_SUITE_P(AV1, DecodePerfTest, +// ::testing::ValuesIn(kAV1DecodePerfVectors)); + +class AV1NewEncodeDecodePerfTest + : public ::libaom_test::CodecTestWithParam, + public ::libaom_test::EncoderTest { + protected: + AV1NewEncodeDecodePerfTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0), + outfile_(0), out_frames_(0) {} + + virtual ~AV1NewEncodeDecodePerfTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + + cfg_.g_lag_in_frames = 25; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_end_usage = AOM_VBR; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, speed_); + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(AV1E_SET_TILE_COLUMNS, 2); + } + } + + virtual void BeginPassHook(unsigned int /*pass*/) { + const char *const env = getenv("LIBAOM_TEST_DATA_PATH"); + const std::string data_path(env ? env : "."); + const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile; + outfile_ = fopen(path_to_source.c_str(), "wb"); + ASSERT_TRUE(outfile_ != NULL); + } + + virtual void EndPassHook() { + if (outfile_ != NULL) { + if (!fseek(outfile_, 0, SEEK_SET)) + ivf_write_file_header(outfile_, &cfg_, AV1_FOURCC, out_frames_); + fclose(outfile_); + outfile_ = NULL; + } + } + + virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) { + ++out_frames_; + + // Write initial file header if first frame. + if (pkt->data.frame.pts == 0) + ivf_write_file_header(outfile_, &cfg_, AV1_FOURCC, out_frames_); + + // Write frame header and data. + ivf_write_frame_header(outfile_, out_frames_, pkt->data.frame.sz); + ASSERT_EQ(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_), + pkt->data.frame.sz); + } + + virtual bool DoDecode() const { return false; } + + void set_speed(unsigned int speed) { speed_ = speed; } + + private: + libaom_test::TestMode encoding_mode_; + uint32_t speed_; + FILE *outfile_; + uint32_t out_frames_; +}; + +struct EncodePerfTestVideo { + EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_, + uint32_t bitrate_, int frames_) + : name(name_), width(width_), height(height_), bitrate(bitrate_), + frames(frames_) {} + const char *name; + uint32_t width; + uint32_t height; + uint32_t bitrate; + int frames; +}; + +const EncodePerfTestVideo kAV1EncodePerfTestVectors[] = { + EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470), +}; + +TEST_P(AV1NewEncodeDecodePerfTest, PerfTest) { + SetUp(); + + // TODO(JBB): Make this work by going through the set of given files. 
+ const int i = 0; + const aom_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = kAV1EncodePerfTestVectors[i].bitrate; + + init_flags_ = AOM_CODEC_USE_PSNR; + + const char *video_name = kAV1EncodePerfTestVectors[i].name; + libaom_test::I420VideoSource video( + video_name, kAV1EncodePerfTestVectors[i].width, + kAV1EncodePerfTestVectors[i].height, timebase.den, timebase.num, 0, + kAV1EncodePerfTestVectors[i].frames); + set_speed(2); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + const uint32_t threads = 4; + + libaom_test::IVFVideoSource decode_video(kNewEncodeOutputFile); + decode_video.Init(); + + aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t(); + cfg.threads = threads; + cfg.allow_lowbitdepth = 1; + libaom_test::AV1Decoder decoder(cfg, 0); + + aom_usec_timer t; + aom_usec_timer_start(&t); + + for (decode_video.Begin(); decode_video.cxdata() != NULL; + decode_video.Next()) { + decoder.DecodeFrame(decode_video.cxdata(), decode_video.frame_size()); + } + + aom_usec_timer_mark(&t); + const double elapsed_secs = + static_cast(aom_usec_timer_elapsed(&t)) / kUsecsInSec; + const unsigned decode_frames = decode_video.frame_number(); + const double fps = static_cast(decode_frames) / elapsed_secs; + + printf("{\n"); + printf("\t\"type\" : \"decode_perf_test\",\n"); + printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"videoName\" : \"%s\",\n", kNewEncodeOutputFile); + printf("\t\"threadCount\" : %u,\n", threads); + printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs); + printf("\t\"totalFrames\" : %u,\n", decode_frames); + printf("\t\"framesPerSecond\" : %f\n", fps); + printf("}\n"); +} + +AV1_INSTANTIATE_TEST_CASE(AV1NewEncodeDecodePerfTest, + ::testing::Values(::libaom_test::kTwoPassGood)); +} // namespace diff --git a/libs/libaom/src/test/decode_test_driver.cc b/libs/libaom/src/test/decode_test_driver.cc new file mode 100644 index 000000000..70de0cff6 --- /dev/null +++ b/libs/libaom/src/test/decode_test_driver.cc @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/register_state_check.h" +#include "test/video_source.h" + +namespace libaom_test { + +const char kAV1Name[] = "AOMedia Project AV1 Decoder"; + +aom_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size, + aom_codec_stream_info_t *stream_info) { + return aom_codec_peek_stream_info(CodecInterface(), cxdata, size, + stream_info); +} + +aom_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size) { + return DecodeFrame(cxdata, size, NULL); +} + +aom_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size, + void *user_priv) { + aom_codec_err_t res_dec; + InitOnce(); + API_REGISTER_STATE_CHECK( + res_dec = aom_codec_decode(&decoder_, cxdata, size, user_priv)); + return res_dec; +} + +bool Decoder::IsAV1() const { + const char *codec_name = GetDecoderName(); + return strncmp(kAV1Name, codec_name, sizeof(kAV1Name) - 1) == 0; +} + +void DecoderTest::HandlePeekResult(Decoder *const /*decoder*/, + CompressedVideoSource * /*video*/, + const aom_codec_err_t res_peek) { + /* The Av1 implementation of PeekStream returns an error only if the + * data passed to it isn't a valid Av1 chunk. */ + ASSERT_EQ(AOM_CODEC_OK, res_peek) + << "Peek return failed: " << aom_codec_err_to_string(res_peek); +} + +void DecoderTest::RunLoop(CompressedVideoSource *video, + const aom_codec_dec_cfg_t &dec_cfg) { + Decoder *const decoder = codec_->CreateDecoder(dec_cfg, flags_); + ASSERT_TRUE(decoder != NULL); + bool end_of_file = false; + bool peeked_stream = false; + + // Decode frames. + for (video->Begin(); !::testing::Test::HasFailure() && !end_of_file; + video->Next()) { + PreDecodeFrameHook(*video, decoder); + + aom_codec_stream_info_t stream_info; + stream_info.is_annexb = 0; + + if (video->cxdata() != NULL) { + if (!peeked_stream) { + // TODO(yaowu): PeekStream returns error for non-sequence_header_obu, + // therefore should only be tried once per sequence, this shall be fixed + // once PeekStream is updated to properly operate on other obus. + const aom_codec_err_t res_peek = decoder->PeekStream( + video->cxdata(), video->frame_size(), &stream_info); + HandlePeekResult(decoder, video, res_peek); + ASSERT_FALSE(::testing::Test::HasFailure()); + peeked_stream = true; + } + + aom_codec_err_t res_dec = + decoder->DecodeFrame(video->cxdata(), video->frame_size()); + if (!HandleDecodeResult(res_dec, *video, decoder)) break; + } else { + // Signal end of the file to the decoder. 
+ const aom_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0); + ASSERT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError(); + end_of_file = true; + } + + DxDataIterator dec_iter = decoder->GetDxData(); + const aom_image_t *img = NULL; + + // Get decompressed data + while (!::testing::Test::HasFailure() && (img = dec_iter.Next())) + DecompressedFrameHook(*img, video->frame_number()); + } + delete decoder; +} + +void DecoderTest::RunLoop(CompressedVideoSource *video) { + aom_codec_dec_cfg_t dec_cfg = aom_codec_dec_cfg_t(); + RunLoop(video, dec_cfg); +} + +void DecoderTest::set_cfg(const aom_codec_dec_cfg_t &dec_cfg) { + memcpy(&cfg_, &dec_cfg, sizeof(cfg_)); +} + +void DecoderTest::set_flags(const aom_codec_flags_t flags) { flags_ = flags; } + +} // namespace libaom_test diff --git a/libs/libaom/src/test/decode_test_driver.h b/libs/libaom/src/test/decode_test_driver.h new file mode 100644 index 000000000..64722f43a --- /dev/null +++ b/libs/libaom/src/test/decode_test_driver.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_TEST_DECODE_TEST_DRIVER_H_ +#define AOM_TEST_DECODE_TEST_DRIVER_H_ +#include +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" + +#include "aom/aom_decoder.h" + +namespace libaom_test { + +class CodecFactory; +class CompressedVideoSource; + +// Provides an object to handle decoding output +class DxDataIterator { + public: + explicit DxDataIterator(aom_codec_ctx_t *decoder) + : decoder_(decoder), iter_(NULL) {} + + const aom_image_t *Next() { return aom_codec_get_frame(decoder_, &iter_); } + + private: + aom_codec_ctx_t *decoder_; + aom_codec_iter_t iter_; +}; + +// Provides a simplified interface to manage one video decoding. +// Similar to Encoder class, the exact services should be added +// as more tests are added. 
+class Decoder { + public: + explicit Decoder(aom_codec_dec_cfg_t cfg) + : cfg_(cfg), flags_(0), init_done_(false) { + memset(&decoder_, 0, sizeof(decoder_)); + } + + Decoder(aom_codec_dec_cfg_t cfg, const aom_codec_flags_t flag) + : cfg_(cfg), flags_(flag), init_done_(false) { + memset(&decoder_, 0, sizeof(decoder_)); + } + + virtual ~Decoder() { aom_codec_destroy(&decoder_); } + + aom_codec_err_t PeekStream(const uint8_t *cxdata, size_t size, + aom_codec_stream_info_t *stream_info); + + aom_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size); + + aom_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size, + void *user_priv); + + DxDataIterator GetDxData() { return DxDataIterator(&decoder_); } + + void Control(int ctrl_id, int arg) { Control(ctrl_id, arg, AOM_CODEC_OK); } + + void Control(int ctrl_id, const void *arg) { + InitOnce(); + const aom_codec_err_t res = aom_codec_control(&decoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << DecodeError(); + } + + void Control(int ctrl_id, int arg, aom_codec_err_t expected_value) { + InitOnce(); + const aom_codec_err_t res = aom_codec_control(&decoder_, ctrl_id, arg); + ASSERT_EQ(expected_value, res) << DecodeError(); + } + + const char *DecodeError() { + const char *detail = aom_codec_error_detail(&decoder_); + return detail ? detail : aom_codec_error(&decoder_); + } + + // Passes the external frame buffer information to libaom. + aom_codec_err_t SetFrameBufferFunctions( + aom_get_frame_buffer_cb_fn_t cb_get, + aom_release_frame_buffer_cb_fn_t cb_release, void *user_priv) { + InitOnce(); + return aom_codec_set_frame_buffer_functions(&decoder_, cb_get, cb_release, + user_priv); + } + + const char *GetDecoderName() const { + return aom_codec_iface_name(CodecInterface()); + } + + bool IsAV1() const; + + aom_codec_ctx_t *GetDecoder() { return &decoder_; } + + protected: + virtual aom_codec_iface_t *CodecInterface() const = 0; + + void InitOnce() { + if (!init_done_) { + const aom_codec_err_t res = + aom_codec_dec_init(&decoder_, CodecInterface(), &cfg_, flags_); + ASSERT_EQ(AOM_CODEC_OK, res) << DecodeError(); + init_done_ = true; + } + } + + aom_codec_ctx_t decoder_; + aom_codec_dec_cfg_t cfg_; + aom_codec_flags_t flags_; + bool init_done_; +}; + +// Common test functionality for all Decoder tests. +class DecoderTest { + public: + // Main decoding loop + virtual void RunLoop(CompressedVideoSource *video); + virtual void RunLoop(CompressedVideoSource *video, + const aom_codec_dec_cfg_t &dec_cfg); + + virtual void set_cfg(const aom_codec_dec_cfg_t &dec_cfg); + virtual void set_flags(const aom_codec_flags_t flags); + + // Hook to be called before decompressing every frame. + virtual void PreDecodeFrameHook(const CompressedVideoSource & /*video*/, + Decoder * /*decoder*/) {} + + // Hook to be called to handle decode result. Return true to continue. + virtual bool HandleDecodeResult(const aom_codec_err_t res_dec, + const CompressedVideoSource & /*video*/, + Decoder *decoder) { + EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError(); + return AOM_CODEC_OK == res_dec; + } + + // Hook to be called on every decompressed frame. 
+ virtual void DecompressedFrameHook(const aom_image_t & /*img*/, + const unsigned int /*frame_number*/) {} + + // Hook to be called on peek result + virtual void HandlePeekResult(Decoder *const decoder, + CompressedVideoSource *video, + const aom_codec_err_t res_peek); + + protected: + explicit DecoderTest(const CodecFactory *codec) + : codec_(codec), cfg_(), flags_(0) {} + + virtual ~DecoderTest() {} + + const CodecFactory *codec_; + aom_codec_dec_cfg_t cfg_; + aom_codec_flags_t flags_; +}; + +} // namespace libaom_test + +#endif // AOM_TEST_DECODE_TEST_DRIVER_H_ diff --git a/libs/libaom/src/test/decode_to_md5.sh b/libs/libaom/src/test/decode_to_md5.sh new file mode 100644 index 000000000..2edd1cb52 --- /dev/null +++ b/libs/libaom/src/test/decode_to_md5.sh @@ -0,0 +1,77 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the libaom decode_to_md5 example. To add new tests to this +## file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to decode_to_md5_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: Make sure input is available: +# $AV1_IVF_FILE is required. +decode_to_md5_verify_environment() { + if [ "$(av1_encode_available)" != "yes" ] && [ ! -e "${AV1_IVF_FILE}" ]; then + return 1 + fi +} + +# Runs decode_to_md5 on $1 and captures the md5 sum for the final frame. $2 is +# interpreted as codec name and used solely to name the output file. $3 is the +# expected md5 sum: It must match that of the final frame. +decode_to_md5() { + local decoder="$(aom_tool_path decode_to_md5)" + local input_file="$1" + local codec="$2" + local expected_md5="$3" + local output_file="${AOM_TEST_OUTPUT_DIR}/decode_to_md5_${codec}" + + if [ ! -x "${decoder}" ]; then + elog "${decoder} does not exist or is not executable." + return 1 + fi + + eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \ + ${devnull} + + [ -e "${output_file}" ] || return 1 + + local md5_last_frame="$(tail -n1 "${output_file}" | awk '{print $1}')" + local actual_md5="$(echo "${md5_last_frame}" | awk '{print $1}')" + if [ "${actual_md5}" = "${expected_md5}" ]; then + return 0 + else + elog "MD5 mismatch:" + elog "Expected: ${expected_md5}" + elog "Actual: ${actual_md5}" + return 1 + fi +} + +DISABLED_decode_to_md5_av1() { + # expected MD5 sum for the last frame. + local expected_md5="567dd6d4b7a7170edddbf58bbcc3aff1" + local file="${AV1_IVF_FILE}" + + # TODO(urvang): Check in the encoded file (like libvpx does) to avoid + # encoding every time. + if [ "$(av1_decode_available)" = "yes" ]; then + if [ ! -e "${AV1_IVF_FILE}" ]; then + file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf" + encode_yuv_raw_input_av1 "${file}" --ivf + fi + decode_to_md5 "${file}" "av1" "${expected_md5}" + fi +} + +# TODO(tomfinegan): Enable when the bitstream stabilizes. 
+decode_to_md5_tests="DISABLED_decode_to_md5_av1" + +run_tests decode_to_md5_verify_environment "${decode_to_md5_tests}" diff --git a/libs/libaom/src/test/decode_with_drops.sh b/libs/libaom/src/test/decode_with_drops.sh new file mode 100644 index 000000000..155ee9207 --- /dev/null +++ b/libs/libaom/src/test/decode_with_drops.sh @@ -0,0 +1,68 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the libaom decode_with_drops example. To add new tests to +## this file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to decode_with_drops_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: Make sure input is available: +# $AV1_IVF_FILE is required. +decode_with_drops_verify_environment() { + if [ "$(av1_encode_available)" != "yes" ] && [ ! -e "${AV1_IVF_FILE}" ]; then + return 1 + fi +} + +# Runs decode_with_drops on $1, $2 is interpreted as codec name and used solely +# to name the output file. $3 is the drop mode, and is passed directly to +# decode_with_drops. +decode_with_drops() { + local decoder="$(aom_tool_path decode_with_drops)" + local input_file="$1" + local codec="$2" + local output_file="${AOM_TEST_OUTPUT_DIR}/decode_with_drops_${codec}" + local drop_mode="$3" + + if [ ! -x "${decoder}" ]; then + elog "${decoder} does not exist or is not executable." + return 1 + fi + + eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \ + "${drop_mode}" ${devnull} + + [ -e "${output_file}" ] || return 1 +} + + +# Decodes $AV1_IVF_FILE while dropping frames, twice: once in sequence mode, +# and once in pattern mode. +DISABLED_decode_with_drops_av1() { + if [ "$(av1_decode_available)" = "yes" ]; then + local file="${AV1_IVF_FILE}" + if [ ! -e "${AV1_IVF_FILE}" ]; then + file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf" + encode_yuv_raw_input_av1 "${file}" --ivf + fi + # Drop frames 3 and 4. + decode_with_drops "${file}" "av1" "3-4" + + # Test pattern mode: Drop 3 of every 4 frames. + decode_with_drops "${file}" "av1" "3/4" + fi +} + +# TODO(yaowu): Disable this test as trailing_bit check is expected to fail +decode_with_drops_tests="DISABLED_decode_with_drops_av1" + +run_tests decode_with_drops_verify_environment "${decode_with_drops_tests}" diff --git a/libs/libaom/src/test/divu_small_test.cc b/libs/libaom/src/test/divu_small_test.cc new file mode 100644 index 000000000..f4d0846cf --- /dev/null +++ b/libs/libaom/src/test/divu_small_test.cc @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/acm_random.h" +#include "av1/common/odintrin.h" + +using libaom_test::ACMRandom; + +TEST(DivuSmallTest, TestDIVUuptoMAX) { + for (int d = 1; d <= OD_DIVU_DMAX; d++) { + for (uint32_t x = 1; x <= 1000000; x++) { + GTEST_ASSERT_EQ(x / d, OD_DIVU_SMALL(x, d)) + << "x=" << x << " d=" << d << " x/d=" << (x / d) + << " != " << OD_DIVU_SMALL(x, d); + } + } +} + +TEST(DivuSmallTest, TestDIVUrandI31) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int d = 1; d < OD_DIVU_DMAX; d++) { + for (int i = 0; i < 1000000; i++) { + uint32_t x = rnd.Rand31(); + GTEST_ASSERT_EQ(x / d, OD_DIVU_SMALL(x, d)) + << "x=" << x << " d=" << d << " x/d=" << (x / d) + << " != " << OD_DIVU_SMALL(x, d); + } + } +} diff --git a/libs/libaom/src/test/dr_prediction_test.cc b/libs/libaom/src/test/dr_prediction_test.cc new file mode 100644 index 000000000..e8865c02a --- /dev/null +++ b/libs/libaom/src/test/dr_prediction_test.cc @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_mem/aom_mem.h" +#include "aom_ports/aom_timer.h" +#include "av1/common/blockd.h" +#include "av1/common/pred_common.h" +#include "av1/common/reconintra.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +namespace { + +const int kZ1Start = 0; +const int kZ2Start = 90; +const int kZ3Start = 180; + +const TX_SIZE kTxSize[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, + TX_4X8, TX_8X4, TX_8X16, TX_16X8, TX_16X32, + TX_32X16, TX_32X64, TX_64X32, TX_4X16, TX_16X4, + TX_8X32, TX_32X8, TX_16X64, TX_64X16 }; + +const char *const kTxSizeStrings[] = { + "TX_4X4", "TX_8X8", "TX_16X16", "TX_32X32", "TX_64X64", + "TX_4X8", "TX_8X4", "TX_8X16", "TX_16X8", "TX_16X32", + "TX_32X16", "TX_32X64", "TX_64X32", "TX_4X16", "TX_16X4", + "TX_8X32", "TX_32X8", "TX_16X64", "TX_64X16" +}; + +using libaom_test::ACMRandom; + +typedef void (*DrPred_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh, + const uint16_t *above, const uint16_t *left, + int upsample_above, int upsample_left, int dx, + int dy, int bd); + +typedef void (*DrPred)(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, int dy, + int bd); + +typedef void (*Z1_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int dx, int dy); +template +void z1_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, int upsample_above, + int upsample_left, int dx, int dy, int bd) { + (void)bd; + (void)upsample_left; + fn(dst, stride, bw, bh, above, left, upsample_above, dx, dy); +} + +typedef void (*Z2_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const 
uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, int dy); +template +void z2_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, int upsample_above, + int upsample_left, int dx, int dy, int bd) { + (void)bd; + (void)upsample_left; + fn(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy); +} + +typedef void (*Z3_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_left, int dx, int dy); +template +void z3_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, int upsample_above, + int upsample_left, int dx, int dy, int bd) { + (void)bd; + (void)upsample_above; + fn(dst, stride, bw, bh, above, left, upsample_left, dx, dy); +} + +typedef void (*Z1_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh, + const uint16_t *above, const uint16_t *left, + int upsample_above, int dx, int dy, int bd); +template +void z1_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh, + const uint16_t *above, const uint16_t *left, + int upsample_above, int upsample_left, int dx, int dy, + int bd) { + (void)bd; + (void)upsample_left; + fn(dst, stride, bw, bh, above, left, upsample_above, dx, dy, bd); +} + +typedef void (*Z2_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh, + const uint16_t *above, const uint16_t *left, + int upsample_above, int upsample_left, int dx, int dy, + int bd); +template +void z2_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh, + const uint16_t *above, const uint16_t *left, + int upsample_above, int upsample_left, int dx, int dy, + int bd) { + (void)bd; + fn(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy, + bd); +} + +typedef void (*Z3_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh, + const uint16_t *above, const uint16_t *left, + int upsample_left, int dx, int dy, int bd); +template +void z3_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh, + const uint16_t *above, const uint16_t *left, + int upsample_above, int upsample_left, int dx, int dy, + int bd) { + (void)bd; + (void)upsample_above; + fn(dst, stride, bw, bh, above, left, upsample_left, dx, dy, bd); +} + +template +struct DrPredFunc { + DrPredFunc(FuncType pred = NULL, FuncType tst = NULL, int bit_depth_value = 0, + int start_angle_value = 0) + : ref_fn(pred), tst_fn(tst), bit_depth(bit_depth_value), + start_angle(start_angle_value) {} + + FuncType ref_fn; + FuncType tst_fn; + int bit_depth; + int start_angle; +}; + +template +class DrPredTest : public ::testing::TestWithParam > { + protected: + static const int kMaxNumTests = 10000; + static const int kIterations = 10; + static const int kDstStride = 64; + static const int kDstSize = kDstStride * kDstStride; + static const int kOffset = 16; + static const int kBufSize = ((2 * MAX_TX_SIZE) << 1) + 16; + + DrPredTest() + : enable_upsample_(0), upsample_above_(0), upsample_left_(0), bw_(0), + bh_(0), dx_(1), dy_(1), bd_(8), txsize_(TX_4X4) { + params_ = this->GetParam(); + start_angle_ = params_.start_angle; + stop_angle_ = start_angle_ + 90; + + dst_ref_ = &dst_ref_data_[0]; + dst_tst_ = &dst_tst_data_[0]; + dst_stride_ = kDstStride; + above_ = &above_data_[kOffset]; + left_ = &left_data_[kOffset]; + + for (int i = 0; i < kBufSize; ++i) { + above_data_[i] = rng_.Rand8(); + left_data_[i] = rng_.Rand8(); + } + + for (int i = 0; i < kDstSize; ++i) { + dst_ref_[i] = 0; + dst_tst_[i] = 0; + } + } + + virtual 
~DrPredTest() {} + + void Predict(bool speedtest, int tx) { + const int kNumTests = speedtest ? kMaxNumTests : 1; + aom_usec_timer timer; + int tst_time = 0; + + bd_ = params_.bit_depth; + + aom_usec_timer_start(&timer); + for (int k = 0; k < kNumTests; ++k) { + params_.ref_fn(dst_ref_, dst_stride_, bw_, bh_, above_, left_, + upsample_above_, upsample_left_, dx_, dy_, bd_); + } + aom_usec_timer_mark(&timer); + const int ref_time = static_cast(aom_usec_timer_elapsed(&timer)); + + if (params_.tst_fn) { + aom_usec_timer_start(&timer); + for (int k = 0; k < kNumTests; ++k) { + ASM_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst_, dst_stride_, bw_, bh_, + above_, left_, upsample_above_, + upsample_left_, dx_, dy_, bd_)); + } + aom_usec_timer_mark(&timer); + tst_time = static_cast(aom_usec_timer_elapsed(&timer)); + } else { + for (int i = 0; i < kDstSize; ++i) { + dst_ref_[i] = dst_tst_[i]; + } + } + + OutputTimes(kNumTests, ref_time, tst_time, tx); + } + + void RunTest(bool speedtest, bool needsaturation, int p_angle) { + bd_ = params_.bit_depth; + + if (needsaturation) { + for (int i = 0; i < kBufSize; ++i) { + above_data_[i] = left_data_[i] = (1 << bd_) - 1; + } + } + for (int tx = 0; tx < TX_SIZES_ALL; ++tx) { + if (params_.tst_fn == NULL) { + for (int i = 0; i < kDstSize; ++i) { + dst_tst_[i] = (1 << bd_) - 1; + dst_ref_[i] = (1 << bd_) - 1; + } + } else { + for (int i = 0; i < kDstSize; ++i) { + dst_ref_[i] = 0; + dst_tst_[i] = 0; + } + } + + bw_ = tx_size_wide[kTxSize[tx]]; + bh_ = tx_size_high[kTxSize[tx]]; + + if (enable_upsample_) { + upsample_above_ = + av1_use_intra_edge_upsample(bw_, bh_, p_angle - 90, 0); + upsample_left_ = + av1_use_intra_edge_upsample(bw_, bh_, p_angle - 180, 0); + } else { + upsample_above_ = upsample_left_ = 0; + } + + Predict(speedtest, tx); + + for (int r = 0; r < bh_; ++r) { + for (int c = 0; c < bw_; ++c) { + ASSERT_EQ(dst_ref_[r * dst_stride_ + c], + dst_tst_[r * dst_stride_ + c]) + << bw_ << "x" << bh_ << " r: " << r << " c: " << c + << " dx: " << dx_ << " dy: " << dy_ + << " upsample_above: " << upsample_above_ + << " upsample_left: " << upsample_left_; + } + } + } + } + + void OutputTimes(int num_tests, int ref_time, int tst_time, int tx) { + if (num_tests > 1) { + if (params_.tst_fn) { + const float x = static_cast(ref_time) / tst_time; + printf("\t[%8s] :: ref time %6d, tst time %6d %3.2f\n", + kTxSizeStrings[tx], ref_time, tst_time, x); + } else { + printf("\t[%8s] :: ref time %6d\n", kTxSizeStrings[tx], ref_time); + } + } + } + + Pixel dst_ref_data_[kDstSize]; + Pixel dst_tst_data_[kDstSize]; + + Pixel left_data_[kBufSize]; + Pixel dummy_data_[kBufSize]; + Pixel above_data_[kBufSize]; + + Pixel *dst_ref_; + Pixel *dst_tst_; + Pixel *above_; + Pixel *left_; + int dst_stride_; + + int enable_upsample_; + int upsample_above_; + int upsample_left_; + int bw_; + int bh_; + int dx_; + int dy_; + int bd_; + TX_SIZE txsize_; + + int start_angle_; + int stop_angle_; + + ACMRandom rng_; + + DrPredFunc params_; +}; + +class LowbdDrPredTest : public DrPredTest {}; + +TEST_P(LowbdDrPredTest, SaturatedValues) { + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int angle = start_angle_; angle < stop_angle_; ++angle) { + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + if (dx_ && dy_) RunTest(false, true, angle); + } + } +} + +using std::make_tuple; + +INSTANTIATE_TEST_SUITE_P( + C, LowbdDrPredTest, + ::testing::Values(DrPredFunc(&z1_wrapper, + NULL, AOM_BITS_8, kZ1Start), + DrPredFunc(&z2_wrapper, + NULL, AOM_BITS_8, kZ2Start), + 
DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>, + NULL, AOM_BITS_8, kZ3Start))); + +#if CONFIG_AV1_HIGHBITDEPTH +class HighbdDrPredTest : public DrPredTest<uint16_t, DrPred_Hbd> {}; + +TEST_P(HighbdDrPredTest, SaturatedValues) { + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int angle = start_angle_; angle < stop_angle_; ++angle) { + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + if (dx_ && dy_) RunTest(false, true, angle); + } + } +} + +INSTANTIATE_TEST_SUITE_P( + C, HighbdDrPredTest, + ::testing::Values( + DrPredFunc<DrPred_Hbd>(&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + NULL, AOM_BITS_8, kZ1Start), + DrPredFunc<DrPred_Hbd>(&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + NULL, AOM_BITS_10, kZ1Start), + DrPredFunc<DrPred_Hbd>(&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + NULL, AOM_BITS_12, kZ1Start), + DrPredFunc<DrPred_Hbd>(&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>, + NULL, AOM_BITS_8, kZ2Start), + DrPredFunc<DrPred_Hbd>(&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>, + NULL, AOM_BITS_10, kZ2Start), + DrPredFunc<DrPred_Hbd>(&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>, + NULL, AOM_BITS_12, kZ2Start), + DrPredFunc<DrPred_Hbd>(&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + NULL, AOM_BITS_8, kZ3Start), + DrPredFunc<DrPred_Hbd>(&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + NULL, AOM_BITS_10, kZ3Start), + DrPredFunc<DrPred_Hbd>(&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + NULL, AOM_BITS_12, kZ3Start))); +#endif // CONFIG_AV1_HIGHBITDEPTH + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, LowbdDrPredTest, + ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>, + &z1_wrapper<av1_dr_prediction_z1_avx2>, + AOM_BITS_8, kZ1Start), + DrPredFunc<DrPred>(&z2_wrapper<av1_dr_prediction_z2_c>, + &z2_wrapper<av1_dr_prediction_z2_avx2>, + AOM_BITS_8, kZ2Start), + DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>, + &z3_wrapper<av1_dr_prediction_z3_avx2>, + AOM_BITS_8, kZ3Start))); + +TEST_P(LowbdDrPredTest, DISABLED_Speed) { + const int angles[] = { 3, 45, 87 }; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int i = 0; i < 3; ++i) { + const int angle = angles[i] + start_angle_; + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n", + enable_upsample_, angle); + if (dx_ && dy_) RunTest(true, false, angle); + } + } +} + +TEST_P(LowbdDrPredTest, OperationCheck) { + if (params_.tst_fn == NULL) return; + // const int angles[] = { 3, 45, 81, 87, 93, 100, 145, 187, 199, 260 }; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int angle = start_angle_; angle < stop_angle_; ++angle) { + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + if (dx_ && dy_) RunTest(false, false, angle); + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + AVX2, HighbdDrPredTest, + ::testing::Values(DrPredFunc<DrPred_Hbd>( + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_avx2>, + AOM_BITS_8, kZ1Start), + DrPredFunc<DrPred_Hbd>( + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_avx2>, + AOM_BITS_10, kZ1Start), + DrPredFunc<DrPred_Hbd>( + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_avx2>, + AOM_BITS_12, kZ1Start), + DrPredFunc<DrPred_Hbd>( + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>, + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_avx2>, + AOM_BITS_8, kZ2Start), + DrPredFunc<DrPred_Hbd>( + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>, + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_avx2>, + AOM_BITS_10, kZ2Start), + DrPredFunc<DrPred_Hbd>( + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>, + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_avx2>, + AOM_BITS_12, kZ2Start), + DrPredFunc<DrPred_Hbd>( + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_avx2>, + AOM_BITS_8, kZ3Start), + DrPredFunc<DrPred_Hbd>( + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_avx2>, + AOM_BITS_10, kZ3Start), + DrPredFunc<DrPred_Hbd>( + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_avx2>, + AOM_BITS_12, kZ3Start))); + +TEST_P(HighbdDrPredTest, DISABLED_Speed) { + const int angles[] = { 3, 45, 87 }; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int i = 0; i < 3; ++i) { + int angle = angles[i] + start_angle_; + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n", + enable_upsample_, angle); + if (dx_ && dy_) RunTest(true, false, angle); + } + } +} + +TEST_P(HighbdDrPredTest, OperationCheck) { + if (params_.tst_fn == NULL) return; + // const
int angles[] = { 3, 45, 81, 87, 93, 100, 145, 187, 199, 260 }; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int angle = start_angle_; angle < stop_angle_; angle++) { + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + if (dx_ && dy_) RunTest(false, false, angle); + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // HAVE_AVX2 + +} // namespace diff --git a/libs/libaom/src/test/dump_obu.sh b/libs/libaom/src/test/dump_obu.sh new file mode 100644 index 000000000..da44dd7e6 --- /dev/null +++ b/libs/libaom/src/test/dump_obu.sh @@ -0,0 +1,70 @@ +#!/bin/sh +## Copyright (c) 2018, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the libaom dump_obu tool. To add new tests to this +## file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to dump_obu_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +readonly dump_obu_test_file="${AOM_TEST_OUTPUT_DIR}/av1_obu_test.ivf" + +dump_obu_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + elog "The file ${YUV_RAW_INPUT##*/} must exist in LIBAOM_TEST_DATA_PATH." + return 1 + fi + if [ "$(dump_obu_available)" = "yes" ]; then + if [ -z "$(aom_tool_path dump_obu)" ]; then + elog "dump_obu not found in LIBAOM_BIN_PATH, its parent, or child tools/." + fi + fi +} + +dump_obu_available() { + if [ "$(av1_decode_available)" = "yes" ] && \ + [ "$(av1_encode_available)" = "yes" ]; then + echo yes + fi +} + +aomenc_available() { + if [ -x "$(aom_tool_path aomenc)" ]; then + echo yes + fi +} + +encode_test_file() { + if [ "$(aomenc_available)" = "yes" ]; then + local encoder="$(aom_tool_path aomenc)" + + eval "${encoder}" \ + $(aomenc_encode_test_fast_params) \ + $(yuv_raw_input) \ + --ivf \ + --output=${dump_obu_test_file} \ + ${devnull} + + if [ ! -e "${dump_obu_test_file}" ]; then + elog "dump_obu test input encode failed." + return 1 + fi + fi +} + +dump_obu() { + encode_test_file + eval $(aom_tool_path dump_obu) "${dump_obu_test_file}" ${devnull} +} + +dump_obu_tests="dump_obu" + +run_tests dump_obu_verify_environment "${dump_obu_tests}" diff --git a/libs/libaom/src/test/ec_test.cc b/libs/libaom/src/test/ec_test.cc new file mode 100644 index 000000000..853abcbc5 --- /dev/null +++ b/libs/libaom/src/test/ec_test.cc @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include + +#include "aom_dsp/entenc.h" +#include "aom_dsp/entdec.h" + +TEST(EC_TEST, random_ec_test) { + od_ec_enc enc; + od_ec_dec dec; + int sz; + int i; + int ret; + unsigned int seed; + unsigned char *ptr; + uint32_t ptr_sz; + char *seed_str; + ret = 0; + seed_str = getenv("EC_TEST_SEED"); + if (seed_str) { + seed = atoi(seed_str); + } else { + seed = 0xdaa1a; + } + srand(seed); + od_ec_enc_init(&enc, 1); + /*Test compatibility between multiple different encode/decode routines.*/ + for (i = 0; i < 409600; i++) { + unsigned *fz; + unsigned *fts; + unsigned *data; + unsigned *tell; + unsigned *enc_method; + int j; + sz = rand() / ((RAND_MAX >> (rand() % 9U)) + 1U); + fz = (unsigned *)malloc(sz * sizeof(*fz)); + fts = (unsigned *)malloc(sz * sizeof(*fts)); + data = (unsigned *)malloc(sz * sizeof(*data)); + tell = (unsigned *)malloc((sz + 1) * sizeof(*tell)); + enc_method = (unsigned *)malloc(sz * sizeof(*enc_method)); + od_ec_enc_reset(&enc); + tell[0] = od_ec_enc_tell_frac(&enc); + for (j = 0; j < sz; j++) { + data[j] = rand() / ((RAND_MAX >> 1) + 1); + + fts[j] = CDF_PROB_BITS; + fz[j] = (rand() % (CDF_PROB_TOP - 2)) >> (CDF_PROB_BITS - fts[j]); + fz[j] = OD_MAXI(fz[j], 1); + enc_method[j] = 3 + (rand() & 1); + switch (enc_method[j]) { + case 3: { + od_ec_encode_bool_q15(&enc, data[j], + OD_ICDF(fz[j] << (CDF_PROB_BITS - fts[j]))); + break; + } + case 4: { + uint16_t cdf[2]; + cdf[0] = OD_ICDF(fz[j]); + cdf[1] = OD_ICDF(1U << fts[j]); + od_ec_encode_cdf_q15(&enc, data[j], cdf, 2); + break; + } + } + + tell[j + 1] = od_ec_enc_tell_frac(&enc); + } + ptr = od_ec_enc_done(&enc, &ptr_sz); + EXPECT_GE(((od_ec_enc_tell(&enc) + 7U) >> 3), ptr_sz) + << "od_ec_enc_tell() lied: " + "there's " + << ptr_sz << " bytes instead of " << ((od_ec_enc_tell(&enc) + 7) >> 3) + << " (Random seed: " << seed << ")\n"; + od_ec_dec_init(&dec, ptr, ptr_sz); + EXPECT_EQ(od_ec_dec_tell_frac(&dec), tell[0]) + << "od_ec_dec_tell() mismatch between encoder and decoder " + "at symbol 0: " + << (unsigned)od_ec_dec_tell_frac(&dec) << " instead of " << tell[0] + << " (Random seed: " << seed << ").\n"; + for (j = 0; j < sz; j++) { + int dec_method; + unsigned int sym = data[j] + 1; // Initialize sym to an invalid value. 
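+ // The decode method below is intentionally allowed to differ from
+ // enc_method[j]: when CDF_SHIFT == 0 the bool_q15 and cdf_q15 entry
+ // points are expected to be bitstream-compatible, so drawing the decode
+ // method at random exercises exactly the cross-routine compatibility
+ // this test is about (see the comment at the top of the loop).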
+ + if (CDF_SHIFT == 0) { + dec_method = 3 + (rand() & 1); + } else { + dec_method = enc_method[j]; + } + switch (dec_method) { + case 3: { + sym = od_ec_decode_bool_q15( + &dec, OD_ICDF(fz[j] << (CDF_PROB_BITS - fts[j]))); + break; + } + case 4: { + uint16_t cdf[2]; + cdf[0] = OD_ICDF(fz[j]); + cdf[1] = OD_ICDF(1U << fts[j]); + sym = od_ec_decode_cdf_q15(&dec, cdf, 2); + break; + } + } + + EXPECT_EQ(sym, data[j]) + << "Decoded " << sym << " instead of " << data[j] + << " with fz=" << fz[j] << " and ftb=" << fts[j] << "at position " + << j << " of " << sz << " (Random seed: " << seed << ").\n" + << "Encoding method: " << enc_method[j] + << " decoding method: " << dec_method << "\n"; + EXPECT_EQ(od_ec_dec_tell_frac(&dec), tell[j + 1]) + << "od_ec_dec_tell() mismatch between encoder and " + "decoder at symbol " + << j + 1 << ": " << (unsigned)od_ec_dec_tell_frac(&dec) + << " instead of " << tell[j + 1] << " (Random seed: " << seed + << ").\n"; + } + free(enc_method); + free(tell); + free(data); + free(fts); + free(fz); + } + od_ec_enc_reset(&enc); + if (CDF_SHIFT == 0) { + od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384)); + od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384)); + od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384)); + od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384)); + od_ec_encode_bool_q15(&enc, 0, OD_ICDF(24576)); + od_ec_enc_patch_initial_bits(&enc, 3, 2); + EXPECT_FALSE(enc.error) << "od_ec_enc_patch_initial_bits() failed.\n"; + od_ec_enc_patch_initial_bits(&enc, 0, 5); + EXPECT_TRUE(enc.error) + << "od_ec_enc_patch_initial_bits() didn't fail when it should have.\n"; + od_ec_enc_reset(&enc); + od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384)); + od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384)); + od_ec_encode_bool_q15(&enc, 1, OD_ICDF(32256)); + od_ec_encode_bool_q15(&enc, 0, OD_ICDF(24576)); + od_ec_enc_patch_initial_bits(&enc, 0, 2); + EXPECT_FALSE(enc.error) << "od_ec_enc_patch_initial_bits() failed.\n"; + ptr = od_ec_enc_done(&enc, &ptr_sz); + EXPECT_EQ(ptr_sz, 2u); + EXPECT_EQ(ptr[0], 63) + << "Got " << ptr[0] + << " when expecting 63 for od_ec_enc_patch_initial_bits().\n"; + } + od_ec_enc_clear(&enc); + EXPECT_EQ(ret, 0); +} diff --git a/libs/libaom/src/test/edge_detect_test.cc b/libs/libaom/src/test/edge_detect_test.cc new file mode 100644 index 000000000..33fbbc0bb --- /dev/null +++ b/libs/libaom/src/test/edge_detect_test.cc @@ -0,0 +1,409 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include "aom_mem/aom_mem.h" +#include "av1/encoder/rdopt.h" +#include "test/util.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +using std::get; +using std::tuple; + +static int get_pix(uint8_t *buf, int i, bool high_bd) { + if (high_bd) { + return *CONVERT_TO_SHORTPTR(buf + i); + } else { + return buf[i]; + } +} + +/** Get the (i, j) value from the input; if i or j is outside of the width + * or height, the nearest pixel value is returned. 
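+ * For example, with w = 8 and h = 4, a request for (i, j) = (-2, 5)
+ * clamps to (0, 3) and returns buf[24], the first pixel of the last
+ * row; this replication is what pads the image borders below.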
+ */ +static int get_nearest_pix(const int *buf, int w, int h, int i, int j) { + int offset = AOMMAX(AOMMIN(i, w - 1), 0) + w * AOMMAX(AOMMIN(j, h - 1), 0); + return buf[offset]; +} + +/** Given the image data, creates a new image with padded values, so an + * 8-tap filter can be convolved. The padded value is the same as the closest + * value in the image. Returns a pointer to the start of the image in the + * padded data. Must be freed with free_pad_8tap. The output will be either + * 8-bit or 16-bit, depending on the high bit-depth (high_bd) field. + */ +static uint8_t *pad_8tap_convolve(const int *data, int w, int h, bool high_bd) { + // SIMD optimizations require the width to be a multiple of 8 and the height + // to be multiples of 4. + assert(w % 8 == 0); + assert(h % 4 == 0); + // For an 8-tap filter, we need to pad with 3 lines on top and on the left, + // and 4 lines on the right and bottom, for 7 extra lines. + const int pad_w = w + 7; + const int pad_h = h + 7; + + uint8_t *dst; + if (high_bd) { + dst = + CONVERT_TO_BYTEPTR(aom_memalign(32, sizeof(uint16_t) * pad_w * pad_h)); + } else { + dst = (uint8_t *)aom_memalign(32, sizeof(uint8_t) * pad_w * pad_h); + } + if (dst == nullptr) { + EXPECT_NE(dst, nullptr); + return nullptr; + } + + for (int j = 0; j < pad_h; ++j) { + for (int i = 0; i < pad_w; ++i) { + const int v = get_nearest_pix(data, w, h, i - 3, j - 3); + if (high_bd) { + *CONVERT_TO_SHORTPTR(dst + i + j * pad_w) = v; + } else { + dst[i + j * pad_w] = static_cast<uint8_t>(v); + } + } + } + return dst + (w + 7) * 3 + 3; +} + +static int stride_8tap(int width) { return width + 7; } + +static void free_pad_8tap(uint8_t *padded, int width, bool high_bd) { + if (high_bd) { + aom_free(CONVERT_TO_SHORTPTR(padded - (width + 7) * 3 - 3)); + } else { + aom_free(padded - (width + 7) * 3 - 3); + } +} + +struct Pad8TapConvolveDeleter { + Pad8TapConvolveDeleter(const int width, const bool high_bd) + : width(width), high_bd(high_bd) {} + void operator()(uint8_t *p) { + if (p != nullptr) { + free_pad_8tap(p, width, high_bd); + } + } + const int width; + const bool high_bd; +}; + +static uint8_t *malloc_bd(int num_entries, bool high_bd) { + const int bytes_per_entry = high_bd ? sizeof(uint16_t) : sizeof(uint8_t); + + uint8_t *buf = (uint8_t *)aom_memalign(32, bytes_per_entry * num_entries); + if (high_bd) { + return CONVERT_TO_BYTEPTR(buf); + } else { + return buf; + } +} + +static void free_bd(uint8_t *p, bool high_bd) { + if (high_bd) { + aom_free(CONVERT_TO_SHORTPTR(p)); + } else { + aom_free(p); + } +} + +struct MallocBdDeleter { + explicit MallocBdDeleter(const bool high_bd) : high_bd(high_bd) {} + void operator()(uint8_t *p) { free_bd(p, high_bd); } + const bool high_bd; +}; + +class EdgeDetectBrightnessTest : + // Parameters are (brightness, width, height, high bit depth representation, + // bit depth). + public ::testing::TestWithParam<tuple<int, int, int, bool, int> > { + protected: + void SetUp() override { + // Allocate a (width by height) array of luma values in orig_. + // padded_ will be filled by the pad() call, which adds a border around + // the orig_. The output_ array has enough space for the computation. + const int brightness = GET_PARAM(0); + const int width = GET_PARAM(1); + const int height = GET_PARAM(2); + const bool high_bd = GET_PARAM(3); + + // Create the padded image of uniform brightness.
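+ // A sketch of the geometry set up by pad_8tap_convolve() above: the
+ // padded buffer has stride (width + 7), with three replicated rows and
+ // columns on the top/left and four on the bottom/right, so every pixel
+ // of the original image has full 8-tap filter support.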
std::unique_ptr<int[]> orig(new int[width * height]); + ASSERT_NE(orig, nullptr); + for (int i = 0; i < width * height; ++i) { + orig[i] = brightness; + } + input_ = pad_8tap_convolve(orig.get(), width, height, high_bd); + ASSERT_NE(input_, nullptr); + output_ = malloc_bd(width * height, high_bd); + ASSERT_NE(output_, nullptr); + } + + void TearDown() override { + const int width = GET_PARAM(1); + const bool high_bd = GET_PARAM(3); + free_pad_8tap(input_, width, high_bd); + free_bd(output_, high_bd); + } + + // Skip the tests where brightness exceeds the bit-depth; we run into this + // issue because of gtest's limitation on valid combinations of test + // parameters. Also skip the tests where bit depth is greater than 8, but + // high bit depth representation is not set. + bool should_skip() const { + const int brightness = GET_PARAM(0); + const int bd = GET_PARAM(4); + if (brightness >= (1 << bd)) { + return true; + } + const bool high_bd = GET_PARAM(3); + if (bd > 8 && !high_bd) { + return true; + } + return false; + } + + uint8_t *input_; + uint8_t *output_; +}; + +TEST_P(EdgeDetectBrightnessTest, BlurUniformBrightness) { + // Some combination of parameters are non-sensical, due to limitations + // of the testing framework. Ignore these. + if (should_skip()) { + return; + } + + // For varying levels of brightness, the algorithm should + // produce the same output. + const int brightness = GET_PARAM(0); + const int width = GET_PARAM(1); + const int height = GET_PARAM(2); + const bool high_bd = GET_PARAM(3); + const int bd = GET_PARAM(4); + + av1_gaussian_blur(input_, stride_8tap(width), width, height, output_, high_bd, + bd); + for (int i = 0; i < width * height; ++i) { + ASSERT_EQ(brightness, get_pix(output_, i, high_bd)); + } +} + +// No edges on a uniformly bright image. +TEST_P(EdgeDetectBrightnessTest, DetectUniformBrightness) { + if (should_skip()) { + return; + } + const int width = GET_PARAM(1); + const int height = GET_PARAM(2); + const bool high_bd = GET_PARAM(3); + const int bd = GET_PARAM(4); + + ASSERT_EQ( + 0, av1_edge_exists(input_, stride_8tap(width), width, height, high_bd, bd) + .magnitude); +} + +#if CONFIG_AV1_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(ImageBrightnessTests, EdgeDetectBrightnessTest, + ::testing::Combine( + // Brightness + ::testing::Values(0, 1, 2, 127, 128, 129, 254, 255, + 256, 511, 512, 1023, 1024, 2048, + 4095), + // Width + ::testing::Values(8, 16, 32), + // Height + ::testing::Values(4, 8, 12, 32), + // High bit depth representation + ::testing::Bool(), + // Bit depth + ::testing::Values(8, 10, 12))); +#else +INSTANTIATE_TEST_SUITE_P(ImageBrightnessTests, EdgeDetectBrightnessTest, + ::testing::Combine( + // Brightness + ::testing::Values(0, 1, 2, 127, 128, 129, 254, 255, + 256, 511, 512, 1023, 1024, 2048, + 4095), + // Width + ::testing::Values(8, 16, 32), + // Height + ::testing::Values(4, 8, 12, 32), + // High bit depth representation + ::testing::Values(false), + // Bit depth + ::testing::Values(8))); +#endif + +class EdgeDetectImageTest : + // Parameters are (width, height, high bit depth representation, bit depth). + public ::testing::TestWithParam<tuple<int, int, bool, int> > { + protected: + // Skip the tests where bit depth is greater than 8, but high bit depth + // representation is not set (limitation of testing framework). + bool should_skip() const { + const bool high_bd = GET_PARAM(2); + const int bd = GET_PARAM(3); + return bd > 8 && !high_bd; + } +}; + +// Generate images with black on one side and white on the other.
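+// Each row is 0 in its left half and (1 << bd) - 1 in its right half, so
+// av1_edge_exists() should report a strong vertical edge; the assertions
+// below pin the magnitude to [556, 560] across all width/height/bit-depth
+// combinations.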
+TEST_P(EdgeDetectImageTest, BlackWhite) { + // Some combination of parameters are non-sensical, due to limitations + // of the testing framework. Ignore these. + if (should_skip()) { + return; + } + + const int width = GET_PARAM(0); + const int height = GET_PARAM(1); + const bool high_bd = GET_PARAM(2); + const int bd = GET_PARAM(3); + + const int white = (1 << bd) - 1; + std::unique_ptr<int[]> orig(new int[width * height]); + for (int j = 0; j < height; ++j) { + for (int i = 0; i < width; ++i) { + if (i < width / 2) { + orig[i + j * width] = 0; + } else { + orig[i + j * width] = white; + } + } + } + + std::unique_ptr<uint8_t[], Pad8TapConvolveDeleter> padded( + pad_8tap_convolve(orig.get(), width, height, high_bd), + Pad8TapConvolveDeleter(width, high_bd)); + ASSERT_NE(padded, nullptr); + // Value should be between 556 and 560. + ASSERT_LE(556, av1_edge_exists(padded.get(), stride_8tap(width), width, + height, high_bd, bd) + .magnitude); + ASSERT_GE(560, av1_edge_exists(padded.get(), stride_8tap(width), width, + height, high_bd, bd) + .magnitude); +} + +// Hardcoded blur tests. +static const int luma[32] = { 241, 147, 7, 90, 184, 103, 28, 186, + 2, 248, 49, 242, 114, 146, 127, 22, + 121, 228, 167, 108, 158, 174, 41, 168, + 214, 99, 184, 109, 114, 247, 117, 119 }; +static const uint8_t expected[] = { 161, 138, 119, 118, 123, 118, 113, 122, + 143, 140, 134, 133, 134, 126, 116, 114, + 147, 149, 145, 142, 143, 138, 126, 118, + 164, 156, 148, 144, 148, 148, 138, 126 }; + +static void hardcoded_blur_test_aux(const bool high_bd) { + const int w = 8; + const int h = 4; + for (int bd = 8; bd <= 12; bd += 2) { + // Skip the tests where bit depth is greater than 8, but high bit depth + // representation is not set. + if (bd > 8 && !high_bd) { + break; + } + std::unique_ptr<uint8_t[], MallocBdDeleter> output( + malloc_bd(w * h, high_bd), MallocBdDeleter(high_bd)); + ASSERT_NE(output, nullptr); + std::unique_ptr<uint8_t[], Pad8TapConvolveDeleter> padded( + pad_8tap_convolve(luma, w, h, high_bd), + Pad8TapConvolveDeleter(w, high_bd)); + ASSERT_NE(padded, nullptr); + av1_gaussian_blur(padded.get(), stride_8tap(w), w, h, output.get(), high_bd, + bd); + for (int i = 0; i < w * h; ++i) { + ASSERT_EQ(expected[i], get_pix(output.get(), i, high_bd)); + } + + // If we multiply the inputs by a constant factor, the output should not + // vary more than 0.5 * factor. + for (int c = 2; c < (1 << (bd - 8)); ++c) { + int scaled_luma[32]; + for (int i = 0; i < 32; ++i) { + scaled_luma[i] = luma[i] * c; + } + padded.reset(pad_8tap_convolve(scaled_luma, w, h, high_bd)); + ASSERT_NE(padded, nullptr); + av1_gaussian_blur(padded.get(), stride_8tap(w), w, h, output.get(), + high_bd, bd); + for (int i = 0; i < w * h; ++i) { + ASSERT_GE(c / 2, + abs(expected[i] * c - get_pix(output.get(), i, high_bd))); + } + } + } +} + +TEST(EdgeDetectImageTest, HardcodedBlurTest) { + hardcoded_blur_test_aux(false); +#if CONFIG_AV1_HIGHBITDEPTH + hardcoded_blur_test_aux(true); +#endif +} + +TEST(EdgeDetectImageTest, SobelTest) { + // Randomly generated 3x3. Compute Sobel for middle value. + const uint8_t buf[9] = { 241, 147, 7, 90, 184, 103, 28, 186, 2 }; + const int stride = 3; + bool high_bd = false; + sobel_xy result = av1_sobel(buf, stride, 1, 1, high_bd); + ASSERT_EQ(234, result.x); + ASSERT_EQ(140, result.y); + +#if CONFIG_AV1_HIGHBITDEPTH + // Verify it works for 8-bit values in a high bit-depth buffer.
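+ // (Hand check, assuming the usual 3x3 Sobel taps: Gx = (241 - 7) +
+ // 2 * (90 - 103) + (28 - 2) = 234 and Gy = (241 + 2 * 147 + 7) -
+ // (28 + 2 * 186 + 2) = 140, matching the assertions above and below.)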
+ const uint16_t buf8_16[9] = { 241, 147, 7, 90, 184, 103, 28, 186, 2 }; + high_bd = true; + result = av1_sobel(CONVERT_TO_BYTEPTR(buf8_16), stride, 1, 1, high_bd); + ASSERT_EQ(234, result.x); + ASSERT_EQ(140, result.y); + + // Verify it works for high bit-depth values as well. + const uint16_t buf16[9] = { 241, 147, 7, 90, 184, 2003, 1028, 186, 2 }; + result = av1_sobel(CONVERT_TO_BYTEPTR(buf16), stride, 1, 1, high_bd); + ASSERT_EQ(-2566, result.x); + ASSERT_EQ(-860, result.y); +#endif +} + +#if CONFIG_AV1_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(EdgeDetectImages, EdgeDetectImageTest, + ::testing::Combine( + // Width + ::testing::Values(8, 16, 32), + // Height + ::testing::Values(4, 8, 12, 32), + // High bit depth representation + ::testing::Bool(), + // Bit depth + ::testing::Values(8, 10, 12))); +#else +INSTANTIATE_TEST_SUITE_P(EdgeDetectImages, EdgeDetectImageTest, + ::testing::Combine( + // Width + ::testing::Values(8, 16, 32), + // Height + ::testing::Values(4, 8, 12, 32), + // High bit depth representation + ::testing::Values(false), + // Bit depth + ::testing::Values(8))); +#endif +} // namespace diff --git a/libs/libaom/src/test/encode_api_test.cc b/libs/libaom/src/test/encode_api_test.cc new file mode 100644 index 000000000..25bdb5c3f --- /dev/null +++ b/libs/libaom/src/test/encode_api_test.cc @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" + +#include "test/util.h" +#include "aom/aomcx.h" +#include "aom/aom_encoder.h" + +namespace { + +TEST(EncodeAPI, InvalidParams) { + static const aom_codec_iface_t *kCodecs[] = { +#if CONFIG_AV1_ENCODER + aom_codec_av1_cx(), +#endif + }; + uint8_t buf[1] = { 0 }; + aom_image_t img; + aom_codec_ctx_t enc; + aom_codec_enc_cfg_t cfg; + + EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, 1, 1, 1, buf)); + + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(NULL, NULL, NULL, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, NULL, NULL, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_encode(NULL, NULL, 0, 0, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_encode(NULL, &img, 0, 0, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_destroy(NULL)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_enc_config_default(NULL, NULL, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_enc_config_default(NULL, &cfg, 0)); + EXPECT_TRUE(aom_codec_error(NULL) != NULL); + + for (const aom_codec_iface_t *iface : kCodecs) { + SCOPED_TRACE(aom_codec_iface_name(iface)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_enc_init(NULL, iface, NULL, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_enc_init(&enc, iface, NULL, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_enc_config_default(iface, &cfg, 2)); + + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0)); + + EXPECT_EQ(NULL, aom_codec_get_global_headers(NULL)); + + aom_fixed_buf_t *glob_headers = aom_codec_get_global_headers(&enc); + EXPECT_TRUE(glob_headers->buf != NULL); + if (glob_headers) { + free(glob_headers->buf); + free(glob_headers); + } + + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0)); + + EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); + } +} + +} // namespace diff --git a/libs/libaom/src/test/encode_perf_test.cc b/libs/libaom/src/test/encode_perf_test.cc new file mode 100644 index 000000000..390a6e0e6 --- /dev/null +++ b/libs/libaom/src/test/encode_perf_test.cc @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/aom_version.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "aom_ports/aom_timer.h" + +namespace { + +const int kMaxPsnr = 100; +const double kUsecsInSec = 1000000.0; + +struct EncodePerfTestVideo { + EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_, + uint32_t bitrate_, int frames_) + : name(name_), width(width_), height(height_), bitrate(bitrate_), + frames(frames_) {} + const char *name; + uint32_t width; + uint32_t height; + uint32_t bitrate; + int frames; +}; + +const EncodePerfTestVideo kAV1EncodePerfTestVectors[] = { + EncodePerfTestVideo("desktop_640_360_30.yuv", 640, 360, 200, 2484), + EncodePerfTestVideo("kirland_640_480_30.yuv", 640, 480, 200, 300), + EncodePerfTestVideo("macmarcomoving_640_480_30.yuv", 640, 480, 200, 987), + EncodePerfTestVideo("macmarcostationary_640_480_30.yuv", 640, 480, 200, 718), + EncodePerfTestVideo("niklas_640_480_30.yuv", 640, 480, 200, 471), + EncodePerfTestVideo("tacomanarrows_640_480_30.yuv", 640, 480, 200, 300), + EncodePerfTestVideo("tacomasmallcameramovement_640_480_30.yuv", 640, 480, 200, + 300), + EncodePerfTestVideo("thaloundeskmtg_640_480_30.yuv", 640, 480, 200, 300), + EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470), +}; + +const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8 }; +const int kEncodePerfTestThreads[] = { 1, 2, 4 }; + +class AV1EncodePerfTest + : public ::libaom_test::CodecTestWithParam, + public ::libaom_test::EncoderTest { + protected: + AV1EncodePerfTest() + : EncoderTest(GET_PARAM(0)), min_psnr_(kMaxPsnr), nframes_(0), + encoding_mode_(GET_PARAM(1)), speed_(0), threads_(1) {} + + virtual ~AV1EncodePerfTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + + cfg_.g_lag_in_frames = 0; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_error_resilient = 1; + cfg_.g_threads = threads_; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + const int log2_tile_columns = 3; + encoder->Control(AOME_SET_CPUUSED, speed_); + encoder->Control(AV1E_SET_TILE_COLUMNS, log2_tile_columns); + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0); + } + } + + virtual void BeginPassHook(unsigned int /*pass*/) { + min_psnr_ = kMaxPsnr; + nframes_ = 0; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + if (pkt->data.psnr.psnr[0] < min_psnr_) { + min_psnr_ = pkt->data.psnr.psnr[0]; + } + } + + // for performance reasons don't decode + virtual bool DoDecode() { return 0; } + + double min_psnr() const { return min_psnr_; } + + void set_speed(unsigned int speed) { speed_ = speed; } + + void set_threads(unsigned int threads) { threads_ = threads; } + + private: + double min_psnr_; + unsigned int nframes_; + libaom_test::TestMode encoding_mode_; + unsigned speed_; + unsigned int threads_; +}; + +TEST_P(AV1EncodePerfTest, PerfTest) { + for (const EncodePerfTestVideo &test_video : kAV1EncodePerfTestVectors) { + for (int speed : 
kEncodePerfTestSpeeds) { + for (int threads : kEncodePerfTestThreads) { + if (test_video.width < 512 && threads > 1) + continue; + else if (test_video.width < 1024 && threads > 2) + continue; + + set_threads(threads); + SetUp(); + + const aom_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = test_video.bitrate; + + init_flags_ = AOM_CODEC_USE_PSNR; + + const unsigned frames = test_video.frames; + const char *video_name = test_video.name; + libaom_test::I420VideoSource video(video_name, test_video.width, + test_video.height, timebase.den, + timebase.num, 0, test_video.frames); + set_speed(speed); + + aom_usec_timer t; + aom_usec_timer_start(&t); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + aom_usec_timer_mark(&t); + const double elapsed_secs = aom_usec_timer_elapsed(&t) / kUsecsInSec; + const double fps = frames / elapsed_secs; + const double minimum_psnr = min_psnr(); + std::string display_name(video_name); + if (threads > 1) { + char thread_count[32]; + snprintf(thread_count, sizeof(thread_count), "_t-%d", threads); + display_name += thread_count; + } + + printf("{\n"); + printf("\t\"type\" : \"encode_perf_test\",\n"); + printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"videoName\" : \"%s\",\n", display_name.c_str()); + printf("\t\"encodeTimeSecs\" : %f,\n", elapsed_secs); + printf("\t\"totalFrames\" : %u,\n", frames); + printf("\t\"framesPerSecond\" : %f,\n", fps); + printf("\t\"minPsnr\" : %f,\n", minimum_psnr); + printf("\t\"speed\" : %d,\n", speed); + printf("\t\"threads\" : %d\n", threads); + printf("}\n"); + } + } + } +} + +AV1_INSTANTIATE_TEST_CASE(AV1EncodePerfTest, + ::testing::Values(::libaom_test::kRealTime)); +} // namespace diff --git a/libs/libaom/src/test/encode_test_driver.cc b/libs/libaom/src/test/encode_test_driver.cc new file mode 100644 index 000000000..01f8d501a --- /dev/null +++ b/libs/libaom/src/test/encode_test_driver.cc @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/encode_test_driver.h" +#include "test/register_state_check.h" +#include "test/video_source.h" + +namespace libaom_test { +void Encoder::InitEncoder(VideoSource *video) { + aom_codec_err_t res; + const aom_image_t *img = video->img(); + + if (video->img() && !encoder_.priv) { + cfg_.g_w = img->d_w; + cfg_.g_h = img->d_h; + cfg_.g_timebase = video->timebase(); + cfg_.rc_twopass_stats_in = stats_->buf(); + + res = aom_codec_enc_init(&encoder_, CodecInterface(), &cfg_, init_flags_); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } +} + +void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) { + if (video->img()) + EncodeFrameInternal(*video, frame_flags); + else + Flush(); + + // Handle twopass stats + CxDataIterator iter = GetCxData(); + + while (const aom_codec_cx_pkt_t *pkt = iter.Next()) { + if (pkt->kind != AOM_CODEC_STATS_PKT) continue; + + stats_->Append(*pkt); + } +} + +void Encoder::EncodeFrameInternal(const VideoSource &video, + const unsigned long frame_flags) { + aom_codec_err_t res; + const aom_image_t *img = video.img(); + + // Handle frame resizing + if (cfg_.g_w != img->d_w || cfg_.g_h != img->d_h) { + cfg_.g_w = img->d_w; + cfg_.g_h = img->d_h; + res = aom_codec_enc_config_set(&encoder_, &cfg_); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } + + // Encode the frame + API_REGISTER_STATE_CHECK(res = + aom_codec_encode(&encoder_, img, video.pts(), + video.duration(), frame_flags)); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); +} + +void Encoder::Flush() { + const aom_codec_err_t res = aom_codec_encode(&encoder_, NULL, 0, 0, 0); + if (!encoder_.priv) + ASSERT_EQ(AOM_CODEC_ERROR, res) << EncoderError(); + else + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); +} + +void EncoderTest::InitializeConfig() { + const aom_codec_err_t res = codec_->DefaultEncoderConfig(&cfg_, 0); + ASSERT_EQ(AOM_CODEC_OK, res); +} + +void EncoderTest::SetMode(TestMode mode) { + switch (mode) { + case kOnePassGood: + case kTwoPassGood: break; + case kRealTime: { + cfg_.g_lag_in_frames = 0; + cfg_.g_usage = AOM_USAGE_REALTIME; + break; + } + default: ASSERT_TRUE(false) << "Unexpected mode " << mode; + } + mode_ = mode; + if (mode == kTwoPassGood) + passes_ = 2; + else + passes_ = 1; +} + +static bool compare_plane(const uint8_t *const buf1, int stride1, + const uint8_t *const buf2, int stride2, int w, int h, + int *const mismatch_row, int *const mismatch_col, + int *const mismatch_pix1, int *const mismatch_pix2) { + int r, c; + + for (r = 0; r < h; ++r) { + for (c = 0; c < w; ++c) { + const int pix1 = buf1[r * stride1 + c]; + const int pix2 = buf2[r * stride2 + c]; + + if (pix1 != pix2) { + if (mismatch_row != NULL) *mismatch_row = r; + if (mismatch_col != NULL) *mismatch_col = c; + if (mismatch_pix1 != NULL) *mismatch_pix1 = pix1; + if (mismatch_pix2 != NULL) *mismatch_pix2 = pix2; + return false; + } + } + } + + return true; +} + +// The function should return "true" most of the time, therefore no early +// break-out is implemented within the match checking process. 
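+// Usage sketch (hypothetical caller; RunLoop below is the real one):
+//   int row, col, plane, pix1, pix2;
+//   if (!compare_img(img_enc, img_dec, &row, &col, &plane, &pix1, &pix2)) {
+//     // The first differing sample is at (row, col) of `plane`.
+//   }
+// Any of the out-pointers may be NULL if that detail is not needed.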
+static bool compare_img(const aom_image_t *img1, const aom_image_t *img2, + int *const mismatch_row, int *const mismatch_col, + int *const mismatch_plane, int *const mismatch_pix1, + int *const mismatch_pix2) { + if (img1->fmt != img2->fmt || img1->cp != img2->cp || img1->tc != img2->tc || + img1->mc != img2->mc || img1->d_w != img2->d_w || + img1->d_h != img2->d_h || img1->monochrome != img2->monochrome) { + if (mismatch_row != NULL) *mismatch_row = -1; + if (mismatch_col != NULL) *mismatch_col = -1; + return false; + } + + const int num_planes = img1->monochrome ? 1 : 3; + for (int plane = 0; plane < num_planes; plane++) { + if (!compare_plane(img1->planes[plane], img1->stride[plane], + img2->planes[plane], img2->stride[plane], + aom_img_plane_width(img1, plane), + aom_img_plane_height(img1, plane), mismatch_row, + mismatch_col, mismatch_pix1, mismatch_pix2)) { + if (mismatch_plane != NULL) *mismatch_plane = plane; + return false; + } + } + + return true; +} + +void EncoderTest::MismatchHook(const aom_image_t *img_enc, + const aom_image_t *img_dec) { + int mismatch_row = 0; + int mismatch_col = 0; + int mismatch_plane = 0; + int mismatch_pix_enc = 0; + int mismatch_pix_dec = 0; + + ASSERT_FALSE(compare_img(img_enc, img_dec, &mismatch_row, &mismatch_col, + &mismatch_plane, &mismatch_pix_enc, + &mismatch_pix_dec)); + + GTEST_FAIL() << "Encode/Decode mismatch found:" << std::endl + << " pixel value enc/dec: " << mismatch_pix_enc << "/" + << mismatch_pix_dec << std::endl + << " plane: " << mismatch_plane << std::endl + << " row/col: " << mismatch_row << "/" + << mismatch_col << std::endl; +} + +void EncoderTest::RunLoop(VideoSource *video) { + aom_codec_dec_cfg_t dec_cfg = aom_codec_dec_cfg_t(); + dec_cfg.allow_lowbitdepth = 1; + + stats_.Reset(); + + ASSERT_TRUE(passes_ == 1 || passes_ == 2); + for (unsigned int pass = 0; pass < passes_; pass++) { + last_pts_ = 0; + + if (passes_ == 1) + cfg_.g_pass = AOM_RC_ONE_PASS; + else if (pass == 0) + cfg_.g_pass = AOM_RC_FIRST_PASS; + else + cfg_.g_pass = AOM_RC_LAST_PASS; + + BeginPassHook(pass); + std::unique_ptr<Encoder> encoder( + codec_->CreateEncoder(cfg_, init_flags_, &stats_)); + ASSERT_TRUE(encoder.get() != NULL); + + ASSERT_NO_FATAL_FAILURE(video->Begin()); + encoder->InitEncoder(video); + + if (mode_ == kRealTime) { + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0); + } + + ASSERT_FALSE(::testing::Test::HasFatalFailure()); + + std::unique_ptr<Decoder> decoder( + codec_->CreateDecoder(dec_cfg, 0 /* flags */)); +#if CONFIG_AV1_DECODER + if (decoder->IsAV1()) { + // Set dec_cfg.tile_row = -1 and dec_cfg.tile_col = -1 so that the whole + // frame is decoded.
+ decoder->Control(AV1_SET_TILE_MODE, cfg_.large_scale_tile); + decoder->Control(AV1D_EXT_TILE_DEBUG, 1); + decoder->Control(AV1_SET_DECODE_TILE_ROW, -1); + decoder->Control(AV1_SET_DECODE_TILE_COL, -1); + } +#endif + + number_spatial_layers_ = GetNumSpatialLayers(); + + bool again; + for (again = true; again; video->Next()) { + again = (video->img() != NULL); + + for (int sl = 0; sl < number_spatial_layers_; sl++) { + PreEncodeFrameHook(video); + PreEncodeFrameHook(video, encoder.get()); + encoder->EncodeFrame(video, frame_flags_); + + CxDataIterator iter = encoder->GetCxData(); + + bool has_cxdata = false; + bool has_dxdata = false; + while (const aom_codec_cx_pkt_t *pkt = iter.Next()) { + pkt = MutateEncoderOutputHook(pkt); + again = true; + switch (pkt->kind) { + case AOM_CODEC_CX_FRAME_PKT: + has_cxdata = true; + if (decoder.get() != NULL && DoDecode()) { + aom_codec_err_t res_dec; + if (DoDecodeInvisible()) { + res_dec = decoder->DecodeFrame( + (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz); + } else { + res_dec = decoder->DecodeFrame( + (const uint8_t *)pkt->data.frame.buf + + (pkt->data.frame.sz - pkt->data.frame.vis_frame_size), + pkt->data.frame.vis_frame_size); + } + + if (!HandleDecodeResult(res_dec, decoder.get())) break; + + has_dxdata = true; + } + ASSERT_GE(pkt->data.frame.pts, last_pts_); + if (sl == number_spatial_layers_) last_pts_ = pkt->data.frame.pts; + FramePktHook(pkt); + break; + + case AOM_CODEC_PSNR_PKT: PSNRPktHook(pkt); break; + + default: break; + } + } + + if (has_dxdata && has_cxdata) { + const aom_image_t *img_enc = encoder->GetPreviewFrame(); + DxDataIterator dec_iter = decoder->GetDxData(); + const aom_image_t *img_dec = dec_iter.Next(); + if (img_enc && img_dec) { + const bool res = + compare_img(img_enc, img_dec, NULL, NULL, NULL, NULL, NULL); + if (!res) { // Mismatch + MismatchHook(img_enc, img_dec); + } + } + if (img_dec) DecompressedFrameHook(*img_dec, video->pts()); + } + if (!Continue()) break; + } // Loop over spatial layers + } + + EndPassHook(); + + if (!Continue()) break; + } +} + +} // namespace libaom_test diff --git a/libs/libaom/src/test/encode_test_driver.h b/libs/libaom/src/test/encode_test_driver.h new file mode 100644 index 000000000..6319a5220 --- /dev/null +++ b/libs/libaom/src/test/encode_test_driver.h @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_TEST_ENCODE_TEST_DRIVER_H_ +#define AOM_TEST_ENCODE_TEST_DRIVER_H_ + +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" + +#if CONFIG_AV1_ENCODER +#include "aom/aomcx.h" +#endif +#include "aom/aom_encoder.h" + +namespace libaom_test { + +class CodecFactory; +class VideoSource; + +enum TestMode { kRealTime, kOnePassGood, kTwoPassGood }; +#define ALL_TEST_MODES \ + ::testing::Values(::libaom_test::kRealTime, ::libaom_test::kOnePassGood, \ + ::libaom_test::kTwoPassGood) + +#define ONE_PASS_TEST_MODES \ + ::testing::Values(::libaom_test::kRealTime, ::libaom_test::kOnePassGood) + +#define TWO_PASS_TEST_MODES ::testing::Values(::libaom_test::kTwoPassGood) + +#define NONREALTIME_TEST_MODES \ + ::testing::Values(::libaom_test::kOnePassGood, ::libaom_test::kTwoPassGood) + +// Provides an object to handle the libaom get_cx_data() iteration pattern +class CxDataIterator { + public: + explicit CxDataIterator(aom_codec_ctx_t *encoder) + : encoder_(encoder), iter_(NULL) {} + + const aom_codec_cx_pkt_t *Next() { + return aom_codec_get_cx_data(encoder_, &iter_); + } + + private: + aom_codec_ctx_t *encoder_; + aom_codec_iter_t iter_; +}; + +// Implements an in-memory store for libaom twopass statistics +class TwopassStatsStore { + public: + void Append(const aom_codec_cx_pkt_t &pkt) { + buffer_.append(reinterpret_cast(pkt.data.twopass_stats.buf), + pkt.data.twopass_stats.sz); + } + + aom_fixed_buf_t buf() { + const aom_fixed_buf_t buf = { &buffer_[0], buffer_.size() }; + return buf; + } + + void Reset() { buffer_.clear(); } + + protected: + std::string buffer_; +}; + +// Provides a simplified interface to manage one video encoding pass, given +// a configuration and video source. +// +// TODO(jkoleszar): The exact services it provides and the appropriate +// level of abstraction will be fleshed out as more tests are written. +class Encoder { + public: + Encoder(aom_codec_enc_cfg_t cfg, const aom_codec_flags_t init_flags, + TwopassStatsStore *stats) + : cfg_(cfg), init_flags_(init_flags), stats_(stats) { + memset(&encoder_, 0, sizeof(encoder_)); + } + + virtual ~Encoder() { aom_codec_destroy(&encoder_); } + + CxDataIterator GetCxData() { return CxDataIterator(&encoder_); } + + void InitEncoder(VideoSource *video); + + const aom_image_t *GetPreviewFrame() { + return aom_codec_get_preview_frame(&encoder_); + } + // This is a thin wrapper around aom_codec_encode(), so refer to + // aom_encoder.h for its semantics. 
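+ // A minimal driving loop (sketch; RunLoop in encode_test_driver.cc is
+ // the authoritative version):
+ //   video->Begin();
+ //   encoder->InitEncoder(video);
+ //   while (video->img() != NULL) {
+ //     encoder->EncodeFrame(video);
+ //     video->Next();
+ //   }
+ //   encoder->EncodeFrame(video);  // A NULL image flushes the encoder.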
+ void EncodeFrame(VideoSource *video, const unsigned long frame_flags); + + // Convenience wrapper for EncodeFrame() + void EncodeFrame(VideoSource *video) { EncodeFrame(video, 0); } + + void Control(int ctrl_id, int arg) { + const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, int *arg) { + const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, struct aom_scaling_mode *arg) { + const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, struct aom_svc_layer_id *arg) { + const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, struct aom_svc_ref_frame_config *arg) { + const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, struct aom_svc_params *arg) { + const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } + +#if CONFIG_AV1_ENCODER + void Control(int ctrl_id, aom_active_map_t *arg) { + const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } +#endif + + void Config(const aom_codec_enc_cfg_t *cfg) { + const aom_codec_err_t res = aom_codec_enc_config_set(&encoder_, cfg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + cfg_ = *cfg; + } + + protected: + virtual aom_codec_iface_t *CodecInterface() const = 0; + + const char *EncoderError() { + const char *detail = aom_codec_error_detail(&encoder_); + return detail ? detail : aom_codec_error(&encoder_); + } + + // Encode an image + void EncodeFrameInternal(const VideoSource &video, + const unsigned long frame_flags); + + // Flush the encoder on EOS + void Flush(); + + aom_codec_ctx_t encoder_; + aom_codec_enc_cfg_t cfg_; + aom_codec_flags_t init_flags_; + TwopassStatsStore *stats_; +}; + +// Common test functionality for all Encoder tests. +// +// This class is a mixin which provides the main loop common to all +// encoder tests. It provides hooks which can be overridden by subclasses +// to implement each test's specific behavior, while centralizing the bulk +// of the boilerplate. Note that it doesn't inherit the gtest testing +// classes directly, so that tests can be parameterized differently. +class EncoderTest { + protected: + explicit EncoderTest(const CodecFactory *codec) + : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0), + last_pts_(0), mode_(kRealTime), number_spatial_layers_(1) { + // Default to 1 thread. + cfg_.g_threads = 1; + } + + virtual ~EncoderTest() {} + + // Initialize the cfg_ member with the default configuration. + void InitializeConfig(); + + // Map the TestMode enum to the passes_ variables. + void SetMode(TestMode mode); + + // Set encoder flag. + void set_init_flags(aom_codec_flags_t flag) { init_flags_ = flag; } + + // Main loop + virtual void RunLoop(VideoSource *video); + + // Hook to be called at the beginning of a pass. + virtual void BeginPassHook(unsigned int /*pass*/) {} + + // Hook to be called at the end of a pass. + virtual void EndPassHook() {} + + // Hook to be called before encoding a frame. 
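+ // Subclasses typically override the two-argument form to push per-frame
+ // controls, e.g. (mirroring AV1EncodePerfTest above):
+ //   if (video->frame() == 0) encoder->Control(AOME_SET_CPUUSED, speed_);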
+ virtual void PreEncodeFrameHook(VideoSource * /*video*/) {} + virtual void PreEncodeFrameHook(VideoSource * /*video*/, + Encoder * /*encoder*/) {} + + // Hook to be called on every compressed data packet. + virtual void FramePktHook(const aom_codec_cx_pkt_t * /*pkt*/) {} + + // Hook to be called on every PSNR packet. + virtual void PSNRPktHook(const aom_codec_cx_pkt_t * /*pkt*/) {} + + // Hook to determine whether the encode loop should continue. + virtual bool Continue() const { + return !(::testing::Test::HasFatalFailure() || abort_); + } + + // Hook to determine whether to decode frame after encoding + virtual bool DoDecode() const { return true; } + + // Hook to determine whether to decode invisible frames after encoding + virtual bool DoDecodeInvisible() const { return true; } + + // Hook to handle encode/decode mismatch + virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2); + + // Hook to be called on every decompressed frame. + virtual void DecompressedFrameHook(const aom_image_t & /*img*/, + aom_codec_pts_t /*pts*/) {} + + // Hook to be called to handle decode result. Return true to continue. + virtual bool HandleDecodeResult(const aom_codec_err_t res_dec, + Decoder *decoder) { + EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError(); + return AOM_CODEC_OK == res_dec; + } + + virtual int GetNumSpatialLayers() { return 1; } + + // Hook that can modify the encoder's output data + virtual const aom_codec_cx_pkt_t *MutateEncoderOutputHook( + const aom_codec_cx_pkt_t *pkt) { + return pkt; + } + + const CodecFactory *codec_; + bool abort_; + aom_codec_enc_cfg_t cfg_; + unsigned int passes_; + TwopassStatsStore stats_; + aom_codec_flags_t init_flags_; + unsigned long frame_flags_; + aom_codec_pts_t last_pts_; + TestMode mode_; + int number_spatial_layers_; +}; + +} // namespace libaom_test + +#endif // AOM_TEST_ENCODE_TEST_DRIVER_H_ diff --git a/libs/libaom/src/test/encodetxb_test.cc b/libs/libaom/src/test/encodetxb_test.cc new file mode 100644 index 000000000..385d3f1a8 --- /dev/null +++ b/libs/libaom/src/test/encodetxb_test.cc @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/idct.h" +#include "av1/common/scan.h" +#include "av1/common/txb_common.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +namespace { +using libaom_test::ACMRandom; + +typedef void (*GetNzMapContextsFunc)(const uint8_t *const levels, + const int16_t *const scan, + const uint16_t eob, const TX_SIZE tx_size, + const TX_CLASS tx_class, + int8_t *const coeff_contexts); + +class EncodeTxbTest : public ::testing::TestWithParam<GetNzMapContextsFunc> { + public: + EncodeTxbTest() : get_nz_map_contexts_func_(GetParam()) {} + + virtual ~EncodeTxbTest() {} + + virtual void SetUp() { + coeff_contexts_ref_ = reinterpret_cast<int8_t *>( + aom_memalign(16, sizeof(*coeff_contexts_ref_) * MAX_TX_SQUARE)); + ASSERT_TRUE(coeff_contexts_ref_ != NULL); + coeff_contexts_ = reinterpret_cast<int8_t *>( + aom_memalign(16, sizeof(*coeff_contexts_) * MAX_TX_SQUARE)); + ASSERT_TRUE(coeff_contexts_ != NULL); + } + + virtual void TearDown() { + aom_free(coeff_contexts_ref_); + aom_free(coeff_contexts_); + libaom_test::ClearSystemState(); + } + + void GetNzMapContextsRun() { + const int kNumTests = 10; + int result = 0; + + for (int is_inter = 0; is_inter < 2; ++is_inter) { + for (int tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) { + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) { + const int bwl = get_txb_bwl((TX_SIZE)tx_size); + const int width = get_txb_wide((TX_SIZE)tx_size); + const int height = get_txb_high((TX_SIZE)tx_size); + const int real_width = tx_size_wide[tx_size]; + const int real_height = tx_size_high[tx_size]; + const int16_t *const scan = av1_scan_orders[tx_size][tx_type].scan; + + levels_ = set_levels(levels_buf_, width); + for (int i = 0; i < kNumTests && !result; ++i) { + for (int eob = 1; eob <= width * height && !result; ++eob) { + InitDataWithEob(scan, bwl, eob); + + av1_get_nz_map_contexts_c(levels_, scan, eob, (TX_SIZE)tx_size, + tx_class, coeff_contexts_ref_); + get_nz_map_contexts_func_(levels_, scan, eob, (TX_SIZE)tx_size, + tx_class, coeff_contexts_); + + result = Compare(scan, eob); + + EXPECT_EQ(result, 0) + << " tx_class " << tx_class << " width " << real_width + << " height " << real_height << " eob " << eob; + } + } + } + } + } + } + + void SpeedTestGetNzMapContextsRun() { + const int kNumTests = 2000000000; + aom_usec_timer timer; + + printf("Note: Only test the largest possible eob case!\n"); + for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) { + const int bwl = get_txb_bwl((TX_SIZE)tx_size); + const int width = get_txb_wide((TX_SIZE)tx_size); + const int height = get_txb_high((TX_SIZE)tx_size); + const int real_width = tx_size_wide[tx_size]; + const int real_height = tx_size_high[tx_size]; + const TX_TYPE tx_type = DCT_DCT; + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const int16_t *const scan = av1_scan_orders[tx_size][tx_type].scan; + const int eob = width * height; + const int numTests = kNumTests / (width * height); + + levels_ = set_levels(levels_buf_, width); + InitDataWithEob(scan, bwl, eob); + + aom_usec_timer_start(&timer); + for (int i = 0; i < numTests; ++i) { + get_nz_map_contexts_func_(levels_, scan, eob, (TX_SIZE)tx_size,
+ tx_class, coeff_contexts_); + } + aom_usec_timer_mark(&timer); + + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("get_nz_map_contexts_%2dx%2d: %7.1f ms\n", real_width, real_height, + elapsed_time / 1000.0); + } + } + + private: + void InitDataWithEob(const int16_t *const scan, const int bwl, + const int eob) { + memset(levels_buf_, 0, sizeof(levels_buf_)); + memset(coeff_contexts_, 0, sizeof(*coeff_contexts_) * MAX_TX_SQUARE); + + for (int c = 0; c < eob; ++c) { + levels_[get_padded_idx(scan[c], bwl)] = + static_cast<uint8_t>(clamp(rnd_.Rand8(), 0, INT8_MAX)); + coeff_contexts_[scan[c]] = static_cast<int8_t>(rnd_.Rand16() >> 1); + } + + memcpy(coeff_contexts_ref_, coeff_contexts_, + sizeof(*coeff_contexts_) * MAX_TX_SQUARE); + } + + bool Compare(const int16_t *const scan, const int eob) const { + bool result = false; + if (memcmp(coeff_contexts_, coeff_contexts_ref_, + sizeof(*coeff_contexts_ref_) * MAX_TX_SQUARE)) { + for (int i = 0; i < eob; i++) { + const int pos = scan[i]; + if (coeff_contexts_ref_[pos] != coeff_contexts_[pos]) { + printf("coeff_contexts_[%d] diff:%6d (ref),%6d (opt)\n", pos, + coeff_contexts_ref_[pos], coeff_contexts_[pos]); + result = true; + break; + } + } + } + return result; + } + + GetNzMapContextsFunc get_nz_map_contexts_func_; + ACMRandom rnd_; + uint8_t levels_buf_[TX_PAD_2D]; + uint8_t *levels_; + int8_t *coeff_contexts_ref_; + int8_t *coeff_contexts_; +}; + +TEST_P(EncodeTxbTest, GetNzMapContexts) { GetNzMapContextsRun(); } + +TEST_P(EncodeTxbTest, DISABLED_SpeedTestGetNzMapContexts) { + SpeedTestGetNzMapContextsRun(); +} + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, EncodeTxbTest, + ::testing::Values(av1_get_nz_map_contexts_sse2)); +#endif + +typedef void (*av1_txb_init_levels_func)(const tran_low_t *const coeff, + const int width, const int height, + uint8_t *const levels); + +typedef std::tuple<av1_txb_init_levels_func, int> TxbInitLevelParam; + +class EncodeTxbInitLevelTest + : public ::testing::TestWithParam<TxbInitLevelParam> { + public: + virtual ~EncodeTxbInitLevelTest() {} + virtual void TearDown() { libaom_test::ClearSystemState(); } + void RunTest(av1_txb_init_levels_func test_func, int tx_size, int is_speed); +}; + +void EncodeTxbInitLevelTest::RunTest(av1_txb_init_levels_func test_func, + int tx_size, int is_speed) { + const int width = get_txb_wide((TX_SIZE)tx_size); + const int height = get_txb_high((TX_SIZE)tx_size); + tran_low_t coeff[MAX_TX_SQUARE]; + + uint8_t levels_buf[2][TX_PAD_2D]; + uint8_t *const levels0 = set_levels(levels_buf[0], width); + uint8_t *const levels1 = set_levels(levels_buf[1], width); + + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int i = 0; i < width * height; i++) { + coeff[i] = rnd.Rand15Signed() + rnd.Rand15Signed(); + } + for (int i = 0; i < TX_PAD_2D; i++) { + levels_buf[0][i] = rnd.Rand8(); + levels_buf[1][i] = rnd.Rand8(); + } + const int run_times = is_speed ?
+  const int run_times = is_speed ? (width * height) * 10000 : 1;
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_times; ++i) {
+    av1_txb_init_levels_c(coeff, width, height, levels0);
+  }
+  const double t1 = get_time_mark(&timer);
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_times; ++i) {
+    test_func(coeff, width, height, levels1);
+  }
+  const double t2 = get_time_mark(&timer);
+  if (is_speed) {
+    printf("init %3dx%-3d:%7.2f/%7.2fns", width, height, t1, t2);
+    printf("(%3.2f)\n", t1 / t2);
+  }
+  const int stride = width + TX_PAD_HOR;
+  for (int r = 0; r < height + TX_PAD_VER; ++r) {
+    for (int c = 0; c < stride; ++c) {
+      ASSERT_EQ(levels_buf[0][c + r * stride], levels_buf[1][c + r * stride])
+          << "[" << r << "," << c << "] " << run_times << width << "x"
+          << height;
+    }
+  }
+}
+
+TEST_P(EncodeTxbInitLevelTest, match) {
+  RunTest(GET_PARAM(0), GET_PARAM(1), 0);
+}
+
+TEST_P(EncodeTxbInitLevelTest, DISABLED_Speed) {
+  RunTest(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, EncodeTxbInitLevelTest,
+    ::testing::Combine(::testing::Values(&av1_txb_init_levels_sse4_1),
+                       ::testing::Range(0, static_cast<int>(TX_SIZES_ALL),
+                                        1)));
+#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, EncodeTxbInitLevelTest,
+    ::testing::Combine(::testing::Values(&av1_txb_init_levels_avx2),
+                       ::testing::Range(0, static_cast<int>(TX_SIZES_ALL),
+                                        1)));
+#endif
+}  // namespace
diff --git a/libs/libaom/src/test/end_to_end_test.cc b/libs/libaom/src/test/end_to_end_test.cc
new file mode 100644
index 000000000..162a7c743
--- /dev/null
+++ b/libs/libaom/src/test/end_to_end_test.cc
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstring>
+#include <memory>
+#include <ostream>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+const unsigned int kWidth = 160;
+const unsigned int kHeight = 90;
+const unsigned int kFramerate = 50;
+const unsigned int kFrames = 10;
+const int kBitrate = 500;
+// List of psnr thresholds for speed settings 0-7 and 5 encoding modes
+const double kPsnrThreshold[][5] = {
+// Note:
+// AV1 HBD average PSNR is slightly lower than AV1.
+// We make two cases here to enable the testing and
+// guard picture quality.
+#if CONFIG_AV1_ENCODER
+  { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 31.0, 36.0, 36.0, 36.0, 36.0 },
+  { 31.0, 35.0, 35.0, 35.0, 35.0 }, { 31.0, 34.0, 34.0, 34.0, 34.0 },
+  { 31.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
+  { 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 },
+#else
+  { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 35.0, 36.0, 36.0, 36.0, 36.0 },
+  { 34.0, 35.0, 35.0, 35.0, 35.0 }, { 33.0, 34.0, 34.0, 34.0, 34.0 },
+  { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
+  { 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 },
+#endif  // CONFIG_AV1_ENCODER
+};
+
+typedef struct {
+  const char *filename;
+  unsigned int input_bit_depth;
+  aom_img_fmt fmt;
+  aom_bit_depth_t bit_depth;
+  unsigned int profile;
+} TestVideoParam;
+
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+  return os << "TestVideoParam { filename:" << test_arg.filename
+            << " input_bit_depth:" << test_arg.input_bit_depth
+            << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+            << " profile:" << test_arg.profile << " }";
+}
+
+const TestVideoParam kTestVectors[] = {
+  { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+  { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 },
+  { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 },
+#if CONFIG_AV1_HIGHBITDEPTH
+  { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 0 },
+  { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 2 },
+  { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 1 },
+  { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016, AOM_BITS_12, 2 },
+  { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 2 },
+  { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 2 },
+#endif
+};
+
+// Encoding modes tested
+const libaom_test::TestMode kEncodingModeVectors[] = {
+  ::libaom_test::kTwoPassGood,
+  ::libaom_test::kOnePassGood,
+  ::libaom_test::kRealTime,
+};
+
+// Speed settings tested
+const int kCpuUsedVectors[] = { 1, 2, 3, 5, 6 };
+
+int is_extension_y4m(const char *filename) {
+  const char *dot = strrchr(filename, '.');
+  if (!dot || dot == filename)
+    return 0;
+  else
+    return !strcmp(dot, ".y4m");
+}
+
+class EndToEndTest
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                 TestVideoParam, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  EndToEndTest()
+      : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(2)),
+        cpu_used_(GET_PARAM(3)), psnr_(0.0), nframes_(0),
+        encoding_mode_(GET_PARAM(1)) {}
+
+  virtual ~EndToEndTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libaom_test::kRealTime) {
+      cfg_.g_lag_in_frames = 5;
+      cfg_.rc_end_usage = AOM_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = AOM_CBR;
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    }
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      // Test the screen coding tools at cpu_used = 1 in two-pass good mode.
+      if (cpu_used_ == 1 && encoding_mode_ == ::libaom_test::kTwoPassGood)
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+      else
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+      if (encoding_mode_ != ::libaom_test::kRealTime) {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      }
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  double GetPsnrThreshold() {
+    return kPsnrThreshold[cpu_used_][encoding_mode_];
+  }
+
+  void DoTest() {
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    std::unique_ptr<libaom_test::VideoSource> video;
+    if (is_extension_y4m(test_video_param_.filename)) {
+      video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename,
+                                                  0, kFrames));
+    } else {
+      video.reset(new libaom_test::YUVVideoSource(
+          test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight,
+          kFramerate, 1, 0, kFrames));
+    }
+    ASSERT_TRUE(video.get() != NULL);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, GetPsnrThreshold())
+        << "cpu used = " << cpu_used_
+        << ", encoding mode = " << encoding_mode_;
+  }
+
+  TestVideoParam test_video_param_;
+  int cpu_used_;
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+  libaom_test::TestMode encoding_mode_;
+};
+
+class EndToEndTestLarge : public EndToEndTest {};
+
+TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) { DoTest(); }
+
+TEST_P(EndToEndTest, EndtoEndPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(EndToEndTestLarge,
+                          ::testing::ValuesIn(kEncodingModeVectors),
+                          ::testing::ValuesIn(kTestVectors),
+                          ::testing::ValuesIn(kCpuUsedVectors));
+
+AV1_INSTANTIATE_TEST_CASE(EndToEndTest,
+                          ::testing::Values(kEncodingModeVectors[0]),
+                          ::testing::Values(kTestVectors[2]),  // 444
+                          ::testing::Values(kCpuUsedVectors[2]));
+}  // namespace
diff --git a/libs/libaom/src/test/error_block_test.cc b/libs/libaom/src/test/error_block_test.cc
new file mode 100644
index 000000000..462661e61
--- /dev/null
+++ b/libs/libaom/src/test/error_block_test.cc
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/entropy.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int kNumIterations = 1000;
+
+typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
+                                  const tran_low_t *dqcoeff,
+                                  intptr_t block_size, int64_t *ssz, int bps);
+
+typedef int64_t (*ErrorBlockFunc8Bits)(const tran_low_t *coeff,
+                                       const tran_low_t *dqcoeff,
+                                       intptr_t block_size, int64_t *ssz);
+
+typedef std::tuple<ErrorBlockFunc, ErrorBlockFunc, aom_bit_depth_t>
+    ErrorBlockParam;
+
+template <ErrorBlockFunc8Bits fn>
+int64_t BlockError8BitWrapper(const tran_low_t *coeff,
+                              const tran_low_t *dqcoeff, intptr_t block_size,
+                              int64_t *ssz, int bps) {
+  EXPECT_EQ(bps, 8);
+  return fn(coeff, dqcoeff, block_size, ssz);
+}
+
+class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
+ public:
+  virtual ~ErrorBlockTest() {}
+  virtual void SetUp() {
+    error_block_op_ = GET_PARAM(0);
+    ref_error_block_op_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  aom_bit_depth_t bit_depth_;
+  ErrorBlockFunc error_block_op_;
+  ErrorBlockFunc ref_error_block_op_;
+};
+
+TEST_P(ErrorBlockTest, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+  int err_count_total = 0;
+  int first_failure = -1;
+  intptr_t block_size;
+  int64_t ssz;
+  int64_t ret;
+  int64_t ref_ssz;
+  int64_t ref_ret;
+  const int msb = bit_depth_ + 8 - 1;
+  for (int i = 0; i < kNumIterations; ++i) {
+    int err_count = 0;
+    block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
+    for (int j = 0; j < block_size; j++) {
+      // coeff and dqcoeff will always have at least the same sign, and this
+      // can be used for optimization, so generate test input precisely.
+      if (rnd(2)) {
+        // Positive number
+        coeff[j] = rnd(1 << msb);
+        dqcoeff[j] = rnd(1 << msb);
+      } else {
+        // Negative number
+        coeff[j] = -rnd(1 << msb);
+        dqcoeff[j] = -rnd(1 << msb);
+      }
+    }
+    ref_ret =
+        ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(
+        ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
+    err_count += (ref_ret != ret) | (ref_ssz != ssz);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
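+  // Failures are aggregated into one check to keep the log short;
+  // first_failure pinpoints the earliest mismatching iteration.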
" + << "First failed at test case " << first_failure; +} + +TEST_P(ErrorBlockTest, ExtremeValues) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, tran_low_t, coeff[4096]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]); + int err_count_total = 0; + int first_failure = -1; + intptr_t block_size; + int64_t ssz; + int64_t ret; + int64_t ref_ssz; + int64_t ref_ret; + const int msb = bit_depth_ + 8 - 1; + int max_val = ((1 << msb) - 1); + for (int i = 0; i < kNumIterations; ++i) { + int err_count = 0; + int k = (i / 9) % 9; + + // Change the maximum coeff value, to test different bit boundaries + if (k == 8 && (i % 9) == 0) { + max_val >>= 1; + } + block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64 + for (int j = 0; j < block_size; j++) { + if (k < 4) { + // Test at positive maximum values + coeff[j] = k % 2 ? max_val : 0; + dqcoeff[j] = (k >> 1) % 2 ? max_val : 0; + } else if (k < 8) { + // Test at negative maximum values + coeff[j] = k % 2 ? -max_val : 0; + dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0; + } else { + if (rnd(2)) { + // Positive number + coeff[j] = rnd(1 << 14); + dqcoeff[j] = rnd(1 << 14); + } else { + // Negative number + coeff[j] = -rnd(1 << 14); + dqcoeff[j] = -rnd(1 << 14); + } + } + } + ref_ret = + ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_); + ASM_REGISTER_STATE_CHECK( + ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_)); + err_count += (ref_ret != ret) | (ref_ssz != ssz); + if (err_count && !err_count_total) { + first_failure = i; + } + err_count_total += err_count; + } + EXPECT_EQ(0, err_count_total) + << "Error: Error Block Test, C output doesn't match optimized output. " + << "First failed at test case " << first_failure; +} + +TEST_P(ErrorBlockTest, DISABLED_Speed) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, tran_low_t, coeff[4096]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]); + intptr_t block_size; + int64_t ssz; + int num_iters = 100000; + int64_t ref_ssz; + int k; + const int msb = bit_depth_ + 8 - 1; + for (int i = 0; i < 9; ++i) { + block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64 + for (k = 0; k < 9; k++) { + for (int j = 0; j < block_size; j++) { + if (k < 5) { + if (rnd(2)) { + // Positive number + coeff[j] = rnd(1 << msb); + dqcoeff[j] = rnd(1 << msb); + } else { + // Negative number + coeff[j] = -rnd(1 << msb); + dqcoeff[j] = -rnd(1 << msb); + } + } else { + if (rnd(2)) { + // Positive number + coeff[j] = rnd(1 << 14); + dqcoeff[j] = rnd(1 << 14); + } else { + // Negative number + coeff[j] = -rnd(1 << 14); + dqcoeff[j] = -rnd(1 << 14); + } + } + } + aom_usec_timer ref_timer, test_timer; + + aom_usec_timer_start(&ref_timer); + for (int i = 0; i < num_iters; ++i) { + ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int i = 0; i < num_iters; ++i) { + error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_); + } + aom_usec_timer_mark(&test_timer); + + const int elapsed_time_simd = + static_cast(aom_usec_timer_elapsed(&test_timer)); + + printf( + " c_time=%d \t simd_time=%d \t " + "gain=%d \n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / elapsed_time_simd)); + } + } +} + +using std::make_tuple; + +#if (HAVE_SSE2) +const ErrorBlockParam kErrorBlockTestParamsSse2[] = { +#if CONFIG_AV1_HIGHBITDEPTH + 
+const ErrorBlockParam kErrorBlockTestParamsSse2[] = {
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(&av1_highbd_block_error_sse2, &av1_highbd_block_error_c,
+             AOM_BITS_10),
+  make_tuple(&av1_highbd_block_error_sse2, &av1_highbd_block_error_c,
+             AOM_BITS_12),
+  make_tuple(&av1_highbd_block_error_sse2, &av1_highbd_block_error_c,
+             AOM_BITS_8),
+#endif
+  make_tuple(&BlockError8BitWrapper<av1_block_error_sse2>,
+             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, ErrorBlockTest,
+                         ::testing::ValuesIn(kErrorBlockTestParamsSse2));
+#endif  // HAVE_SSE2
+
+#if (HAVE_AVX2)
+const ErrorBlockParam kErrorBlockTestParamsAvx2[] = {
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(&av1_highbd_block_error_avx2, &av1_highbd_block_error_c,
+             AOM_BITS_10),
+  make_tuple(&av1_highbd_block_error_avx2, &av1_highbd_block_error_c,
+             AOM_BITS_12),
+  make_tuple(&av1_highbd_block_error_avx2, &av1_highbd_block_error_c,
+             AOM_BITS_8),
+#endif
+  make_tuple(&BlockError8BitWrapper<av1_block_error_avx2>,
+             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, ErrorBlockTest,
+                         ::testing::ValuesIn(kErrorBlockTestParamsAvx2));
+#endif  // HAVE_AVX2
+
+#if (HAVE_MSA)
+INSTANTIATE_TEST_SUITE_P(
+    MSA, ErrorBlockTest,
+    ::testing::Values(make_tuple(&BlockError8BitWrapper<av1_block_error_msa>,
+                                 &BlockError8BitWrapper<av1_block_error_c>,
+                                 AOM_BITS_8)));
+#endif  // HAVE_MSA
+
+#if (HAVE_NEON)
+INSTANTIATE_TEST_SUITE_P(
+    NEON, ErrorBlockTest,
+    ::testing::Values(make_tuple(&BlockError8BitWrapper<av1_block_error_neon>,
+                                 &BlockError8BitWrapper<av1_block_error_c>,
+                                 AOM_BITS_8)));
+#endif  // HAVE_NEON
+}  // namespace
diff --git a/libs/libaom/src/test/error_resilience_test.cc b/libs/libaom/src/test/error_resilience_test.cc
new file mode 100644
index 000000000..1d52bb24a
--- /dev/null
+++ b/libs/libaom/src/test/error_resilience_test.cc
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const int kMaxErrorFrames = 12;
+const int kMaxInvisibleErrorFrames = 12;
+const int kMaxDroppableFrames = 12;
+const int kMaxErrorResilientFrames = 12;
+const int kMaxNoMFMVFrames = 12;
+const int kMaxPrimRefNoneFrames = 12;
+const int kMaxSFrames = 12;
+const int kCpuUsed = 1;
+
+class ErrorResilienceTestLarge
+    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ErrorResilienceTestLarge()
+      : EncoderTest(GET_PARAM(0)), psnr_(0.0), nframes_(0),
+        mismatch_psnr_(0.0), mismatch_nframes_(0),
+        encoding_mode_(GET_PARAM(1)), allow_mismatch_(0) {
+    Reset();
+  }
+
+  virtual ~ErrorResilienceTestLarge() {}
+
+  void Reset() {
+    error_nframes_ = 0;
+    invisible_error_nframes_ = 0;
+    droppable_nframes_ = 0;
+    error_resilient_nframes_ = 0;
+    nomfmv_nframes_ = 0;
+    prim_ref_none_nframes_ = 0;
+    s_nframes_ = 0;
+  }
+
+  void SetupEncoder(int bitrate, int lag) {
+    const aom_rational timebase = { 33333333, 1000000000 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_target_bitrate = bitrate;
+    cfg_.kf_mode = AOM_KF_DISABLED;
+    cfg_.g_lag_in_frames = lag;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+  }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+    decoded_nframes_ = 0;
+    mismatch_psnr_ = 0.0;
+    mismatch_nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                                  libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+    frame_flags_ &=
+        ~(AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+          AOM_EFLAG_NO_REF_FRAME_MVS | AOM_EFLAG_ERROR_RESILIENT |
+          AOM_EFLAG_SET_S_FRAME | AOM_EFLAG_SET_PRIMARY_REF_NONE);
+    if (droppable_nframes_ > 0 &&
+        (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+      for (unsigned int i = 0; i < droppable_nframes_; ++i) {
+        if (droppable_frames_[i] == video->frame()) {
+          std::cout << "             Encoding droppable frame: "
+                    << droppable_frames_[i] << "\n";
+          frame_flags_ |= (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+                           AOM_EFLAG_NO_UPD_ARF);
+          break;
+        }
+      }
+    }
+
+    if (error_resilient_nframes_ > 0 &&
+        (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+      for (unsigned int i = 0; i < error_resilient_nframes_; ++i) {
+        if (error_resilient_frames_[i] == video->frame()) {
+          std::cout << "             Encoding error_resilient frame: "
+                    << error_resilient_frames_[i] << "\n";
+          frame_flags_ |= AOM_EFLAG_ERROR_RESILIENT;
+          break;
+        }
+      }
+    }
+
+    if (nomfmv_nframes_ > 0 &&
+        (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+      for (unsigned int i = 0; i < nomfmv_nframes_; ++i) {
+        if (nomfmv_frames_[i] == video->frame()) {
+          std::cout << "             Encoding no mfmv frame: "
+                    << nomfmv_frames_[i] << "\n";
+          frame_flags_ |= AOM_EFLAG_NO_REF_FRAME_MVS;
+          break;
+        }
+      }
+    }
+
+    if (prim_ref_none_nframes_ > 0 &&
+        (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+      for (unsigned int i = 0; i < prim_ref_none_nframes_; ++i) {
+        if (prim_ref_none_frames_[i] == video->frame()) {
+          std::cout << "             Encoding no PRIMARY_REF_NONE frame: "
+                    << prim_ref_none_frames_[i] << "\n";
+          frame_flags_ |= AOM_EFLAG_SET_PRIMARY_REF_NONE;
+          break;
+        }
+      }
+    }
+
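+    // With S_FRAME_MODE set to 0, S-frames should only be produced on the
+    // frames explicitly flagged below.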
"\n"; + frame_flags_ |= AOM_EFLAG_SET_PRIMARY_REF_NONE; + break; + } + } + } + + encoder->Control(AV1E_SET_S_FRAME_MODE, 0); + if (s_nframes_ > 0 && + (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) { + for (unsigned int i = 0; i < s_nframes_; ++i) { + if (s_frames_[i] == video->frame()) { + std::cout << " Encoding S frame: " << s_frames_[i] + << "\n"; + frame_flags_ |= AOM_EFLAG_SET_S_FRAME; + break; + } + } + } + } + + virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) { + // Check that the encode frame flags are correctly reflected + // in the output frame flags. + const int encode_flags = pkt->data.frame.flags >> 16; + if ((encode_flags & (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | + AOM_EFLAG_NO_UPD_ARF)) == + (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF)) { + ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_DROPPABLE, + static_cast(AOM_FRAME_IS_DROPPABLE)); + } + if (encode_flags & AOM_EFLAG_SET_S_FRAME) { + ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_SWITCH, + static_cast(AOM_FRAME_IS_SWITCH)); + } + if (encode_flags & AOM_EFLAG_ERROR_RESILIENT) { + ASSERT_EQ( + pkt->data.frame.flags & AOM_FRAME_IS_ERROR_RESILIENT, + static_cast(AOM_FRAME_IS_ERROR_RESILIENT)); + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + double GetAverageMismatchPsnr() const { + if (mismatch_nframes_) return mismatch_psnr_ / mismatch_nframes_; + return 0.0; + } + + virtual bool DoDecode() const { + if (error_nframes_ > 0 && + (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) { + for (unsigned int i = 0; i < error_nframes_; ++i) { + if (error_frames_[i] == nframes_ - 1) { + std::cout << " Skipping decoding frame: " + << error_frames_[i] << "\n"; + return 0; + } + } + } + return 1; + } + + virtual bool DoDecodeInvisible() const { + if (invisible_error_nframes_ > 0 && + (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) { + for (unsigned int i = 0; i < invisible_error_nframes_; ++i) { + if (invisible_error_frames_[i] == nframes_ - 1) { + std::cout << " Skipping decoding all invisible frames in " + "frame pkt: " + << invisible_error_frames_[i] << "\n"; + return 0; + } + } + } + return 1; + } + + virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) { + if (allow_mismatch_) { + double mismatch_psnr = compute_psnr(img1, img2); + mismatch_psnr_ += mismatch_psnr; + ++mismatch_nframes_; + // std::cout << "Mismatch frame psnr: " << mismatch_psnr << "\n"; + } else { + ::libaom_test::EncoderTest::MismatchHook(img1, img2); + } + } + + virtual void DecompressedFrameHook(const aom_image_t &img, + aom_codec_pts_t pts) { + (void)img; + (void)pts; + ++decoded_nframes_; + } + + void SetErrorFrames(int num, unsigned int *list) { + if (num > kMaxErrorFrames) + num = kMaxErrorFrames; + else if (num < 0) + num = 0; + error_nframes_ = num; + for (unsigned int i = 0; i < error_nframes_; ++i) + error_frames_[i] = list[i]; + } + + void SetInvisibleErrorFrames(int num, unsigned int *list) { + if (num > kMaxInvisibleErrorFrames) + num = kMaxInvisibleErrorFrames; + else if (num < 0) + num = 0; + invisible_error_nframes_ = num; + for (unsigned int i = 0; i < invisible_error_nframes_; ++i) + invisible_error_frames_[i] = list[i]; + } + + void SetDroppableFrames(int num, unsigned int *list) { + if (num > kMaxDroppableFrames) + num = kMaxDroppableFrames; + else if (num < 0) + num = 0; + droppable_nframes_ = num; + for (unsigned int i = 0; i < droppable_nframes_; ++i) + 
+
+  void SetErrorResilientFrames(int num, unsigned int *list) {
+    if (num > kMaxErrorResilientFrames)
+      num = kMaxErrorResilientFrames;
+    else if (num < 0)
+      num = 0;
+    error_resilient_nframes_ = num;
+    for (unsigned int i = 0; i < error_resilient_nframes_; ++i)
+      error_resilient_frames_[i] = list[i];
+  }
+
+  void SetNoMFMVFrames(int num, unsigned int *list) {
+    if (num > kMaxNoMFMVFrames)
+      num = kMaxNoMFMVFrames;
+    else if (num < 0)
+      num = 0;
+    nomfmv_nframes_ = num;
+    for (unsigned int i = 0; i < nomfmv_nframes_; ++i)
+      nomfmv_frames_[i] = list[i];
+  }
+
+  void SetPrimaryRefNoneFrames(int num, unsigned int *list) {
+    if (num > kMaxPrimRefNoneFrames)
+      num = kMaxPrimRefNoneFrames;
+    else if (num < 0)
+      num = 0;
+    prim_ref_none_nframes_ = num;
+    for (unsigned int i = 0; i < prim_ref_none_nframes_; ++i)
+      prim_ref_none_frames_[i] = list[i];
+  }
+
+  void SetSFrames(int num, unsigned int *list) {
+    if (num > kMaxSFrames)
+      num = kMaxSFrames;
+    else if (num < 0)
+      num = 0;
+    s_nframes_ = num;
+    for (unsigned int i = 0; i < s_nframes_; ++i) s_frames_[i] = list[i];
+  }
+
+  unsigned int GetMismatchFrames() { return mismatch_nframes_; }
+  unsigned int GetEncodedFrames() { return nframes_; }
+  unsigned int GetDecodedFrames() { return decoded_nframes_; }
+
+  void SetAllowMismatch(int allow) { allow_mismatch_ = allow; }
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+  unsigned int decoded_nframes_;
+  unsigned int error_nframes_;
+  unsigned int invisible_error_nframes_;
+  unsigned int droppable_nframes_;
+  unsigned int error_resilient_nframes_;
+  unsigned int nomfmv_nframes_;
+  unsigned int prim_ref_none_nframes_;
+  unsigned int s_nframes_;
+  double mismatch_psnr_;
+  unsigned int mismatch_nframes_;
+  unsigned int error_frames_[kMaxErrorFrames];
+  unsigned int invisible_error_frames_[kMaxInvisibleErrorFrames];
+  unsigned int droppable_frames_[kMaxDroppableFrames];
+  unsigned int error_resilient_frames_[kMaxErrorResilientFrames];
+  unsigned int nomfmv_frames_[kMaxNoMFMVFrames];
+  unsigned int prim_ref_none_frames_[kMaxPrimRefNoneFrames];
+  unsigned int s_frames_[kMaxSFrames];
+  libaom_test::TestMode encoding_mode_;
+  int allow_mismatch_;
+};
+
+TEST_P(ErrorResilienceTestLarge, OnVersusOff) {
+  SetupEncoder(2000, 10);
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 12);
+
+  // Global error resilient mode OFF.
+  cfg_.g_error_resilient = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_resilience_off = GetAveragePsnr();
+  EXPECT_GT(psnr_resilience_off, 25.0);
+
+  Reset();
+  // Error resilient mode ON for certain frames
+  unsigned int num_error_resilient_frames = 5;
+  unsigned int error_resilient_frame_list[] = { 3, 5, 6, 9, 11 };
+  SetErrorResilientFrames(num_error_resilient_frames,
+                          error_resilient_frame_list);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_resilience_on = GetAveragePsnr();
+  EXPECT_GT(psnr_resilience_on, 25.0);
+
+  // Test that turning on error resilient mode hurts by 10% at most.
+  if (psnr_resilience_off > 0.0) {
+    const double psnr_ratio = psnr_resilience_on / psnr_resilience_off;
+    EXPECT_GE(psnr_ratio, 0.9);
+    EXPECT_LE(psnr_ratio, 1.1);
+  }
+}
+
+// Check for successful decoding and no encoder/decoder mismatch
+// if we lose (i.e., drop before decoding) a set of droppable
+// frames (i.e., frames that don't update any reference buffers).
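+// Since droppable frames leave all reference buffers untouched, every frame
+// that is actually decoded should still reconstruct bit-exactly.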
+TEST_P(ErrorResilienceTestLarge, DropFramesWithoutRecovery) {
+  SetupEncoder(500, 10);
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 20);
+
+  // Set an arbitrary set of error frames same as droppable frames.
+  unsigned int num_droppable_frames = 3;
+  unsigned int droppable_frame_list[] = { 5, 11, 13 };
+  SetDroppableFrames(num_droppable_frames, droppable_frame_list);
+  SetErrorFrames(num_droppable_frames, droppable_frame_list);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Test that no mismatches have been found
+  std::cout << "             Encoded frames: " << GetEncodedFrames() << "\n";
+  std::cout << "             Decoded frames: " << GetDecodedFrames() << "\n";
+  std::cout << "             Mismatch frames: " << GetMismatchFrames() << "\n";
+  EXPECT_EQ(GetEncodedFrames() - GetDecodedFrames(), num_droppable_frames);
+}
+
+// Check for ParseAbility property of an error-resilient frame.
+// Encode a frame in error-resilient mode (E-frame), and disallow all
+// subsequent frames from using MFMV. If frames are dropped before the
+// E frame, all frames starting from the E frame should be parse-able.
+TEST_P(ErrorResilienceTestLarge, ParseAbilityTest) {
+  SetupEncoder(500, 10);
+
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 15);
+
+  SetAllowMismatch(1);
+
+  // Note that an E-frame cannot be forced on a frame that is a
+  // show_existing_frame, or a frame that comes directly after an invisible
+  // frame. Currently, this will cause an assertion failure.
+  // Set an arbitrary error resilient (E) frame
+  unsigned int num_error_resilient_frames = 1;
+  unsigned int error_resilient_frame_list[] = { 8 };
+  SetErrorResilientFrames(num_error_resilient_frames,
+                          error_resilient_frame_list);
+  // Ensure that any invisible frames before the E frame are dropped
+  SetInvisibleErrorFrames(num_error_resilient_frames,
+                          error_resilient_frame_list);
+  // Set all frames after the error resilient frame to not allow MFMV
+  unsigned int num_post_error_resilient_frames = 6;
+  unsigned int post_error_resilient_frame_list[] = { 9, 10, 11, 12, 13, 14 };
+  SetNoMFMVFrames(num_post_error_resilient_frames,
+                  post_error_resilient_frame_list);
+
+  // Set a few frames before the E frame that are lost (not decoded)
+  unsigned int num_error_frames = 5;
+  unsigned int error_frame_list[] = { 3, 4, 5, 6, 7 };
+  SetErrorFrames(num_error_frames, error_frame_list);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  std::cout << "             Encoded frames: " << GetEncodedFrames() << "\n";
+  std::cout << "             Decoded frames: " << GetDecodedFrames() << "\n";
+  std::cout << "             Mismatch frames: " << GetMismatchFrames() << "\n";
+  EXPECT_EQ(GetEncodedFrames() - GetDecodedFrames(), num_error_frames);
+  // All frames following the E-frame and the E-frame are expected to have
+  // mismatches, but still be parse-able.
+  EXPECT_LE(GetMismatchFrames(), num_post_error_resilient_frames + 1);
+}
+
+// Check for ParseAbility property of an S frame.
+// Encode an S-frame. If frames are dropped before the S-frame, all frames
+// starting from the S frame should be parse-able.
+TEST_P(ErrorResilienceTestLarge, SFrameTest) {
+  SetupEncoder(500, 10);
+
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 15);
+
+  SetAllowMismatch(1);
+
+  // Note that an S-frame cannot be forced on a frame that is a
+  // show_existing_frame. This issue still needs to be addressed.
+  // Set an arbitrary S-frame
+  unsigned int num_s_frames = 1;
+  unsigned int s_frame_list[] = { 6 };
+  SetSFrames(num_s_frames, s_frame_list);
+  // Ensure that any invisible frames before the S frame are dropped
+  SetInvisibleErrorFrames(num_s_frames, s_frame_list);
+
+  // Set a few frames before the S frame that are lost (not decoded)
+  unsigned int num_error_frames = 4;
+  unsigned int error_frame_list[] = { 2, 3, 4, 5 };
+  SetErrorFrames(num_error_frames, error_frame_list);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  std::cout << "             Encoded frames: " << GetEncodedFrames() << "\n";
+  std::cout << "             Decoded frames: " << GetDecodedFrames() << "\n";
+  std::cout << "             Mismatch frames: " << GetMismatchFrames() << "\n";
+  EXPECT_EQ(GetEncodedFrames() - GetDecodedFrames(), num_error_frames);
+  // All frames following the S-frame and the S-frame are expected to have
+  // mismatches, but still be parse-able.
+  EXPECT_LE(GetMismatchFrames(), GetEncodedFrames() - s_frame_list[0]);
+}
+
+AV1_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, NONREALTIME_TEST_MODES);
+}  // namespace
diff --git a/libs/libaom/src/test/ethread_test.cc b/libs/libaom/src/test/ethread_test.cc
new file mode 100644
index 000000000..306cc2f3a
--- /dev/null
+++ b/libs/libaom/src/test/ethread_test.cc
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+class AVxEncoderThreadTest
+    : public ::libaom_test::CodecTestWith5Params<libaom_test::TestMode, int,
+                                                 int, int, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  AVxEncoderThreadTest()
+      : EncoderTest(GET_PARAM(0)), encoder_initialized_(false),
+        encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)),
+        tile_cols_(GET_PARAM(3)), tile_rows_(GET_PARAM(4)),
+        row_mt_(GET_PARAM(5)) {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.w = 1280;
+    cfg.h = 720;
+    cfg.allow_lowbitdepth = 1;
+    decoder_ = codec_->CreateDecoder(cfg, 0);
+    if (decoder_->IsAV1()) {
+      decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+      decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+    }
+
+    size_enc_.clear();
+    md5_dec_.clear();
+    md5_enc_.clear();
+  }
+  virtual ~AVxEncoderThreadTest() { delete decoder_; }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+
+    if (encoding_mode_ != ::libaom_test::kRealTime) {
+      cfg_.g_lag_in_frames = 5;
+      cfg_.rc_end_usage = AOM_VBR;
+      cfg_.rc_2pass_vbr_minsection_pct = 5;
+      cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = AOM_CBR;
+      cfg_.g_error_resilient = 1;
+    }
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    encoder_initialized_ = false;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
+                                  ::libaom_test::Encoder *encoder) {
+    if (!encoder_initialized_) {
+      SetTileSize(encoder);
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_SET_ROW_MT, row_mt_);
+      if (encoding_mode_ != ::libaom_test::kRealTime) {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+        encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0);
+      } else {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
+        encoder->Control(AV1E_SET_AQ_MODE, 3);
+      }
+      encoder_initialized_ = true;
+    }
+  }
+
+  virtual void SetTileSize(libaom_test::Encoder *encoder) {
+    encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
+    encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    size_enc_.push_back(pkt->data.frame.sz);
+
+    ::libaom_test::MD5 md5_enc;
+    md5_enc.Add(reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
+                pkt->data.frame.sz);
+    md5_enc_.push_back(md5_enc.Get());
+
+    const aom_codec_err_t res = decoder_->DecodeFrame(
+        reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+    if (res != AOM_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(AOM_CODEC_OK, res);
+    }
+    const aom_image_t *img = decoder_->GetDxData().Next();
+
+    if (img) {
+      ::libaom_test::MD5 md5_res;
+      md5_res.Add(img);
+      md5_dec_.push_back(md5_res.Get());
+    }
+  }
+
+  void DoTest() {
+    ::libaom_test::YUVVideoSource video(
+        "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, 640, 480, 30, 1, 15, 21);
+    cfg_.rc_target_bitrate = 1000;
+
+    if (row_mt_ == 0) {
+      // Encode using single thread.
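+      // This pass produces the reference output that the multi-threaded
+      // passes below must match bit-exactly in both size and MD5.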
+      cfg_.g_threads = 1;
+      init_flags_ = AOM_CODEC_USE_PSNR;
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      std::vector<size_t> single_thr_size_enc;
+      std::vector<std::string> single_thr_md5_enc;
+      std::vector<std::string> single_thr_md5_dec;
+      single_thr_size_enc = size_enc_;
+      single_thr_md5_enc = md5_enc_;
+      single_thr_md5_dec = md5_dec_;
+      size_enc_.clear();
+      md5_enc_.clear();
+      md5_dec_.clear();
+
+      // Encode using multiple threads.
+      cfg_.g_threads = 4;
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      std::vector<size_t> multi_thr_size_enc;
+      std::vector<std::string> multi_thr_md5_enc;
+      std::vector<std::string> multi_thr_md5_dec;
+      multi_thr_size_enc = size_enc_;
+      multi_thr_md5_enc = md5_enc_;
+      multi_thr_md5_dec = md5_dec_;
+      size_enc_.clear();
+      md5_enc_.clear();
+      md5_dec_.clear();
+
+      // Check that the vectors are equal.
+      ASSERT_EQ(single_thr_size_enc, multi_thr_size_enc);
+      ASSERT_EQ(single_thr_md5_enc, multi_thr_md5_enc);
+      ASSERT_EQ(single_thr_md5_dec, multi_thr_md5_dec);
+    } else if (row_mt_ == 1) {
+      // Encode using multiple threads row-mt enabled.
+      cfg_.g_threads = 2;
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      std::vector<size_t> multi_thr2_row_mt_size_enc;
+      std::vector<std::string> multi_thr2_row_mt_md5_enc;
+      std::vector<std::string> multi_thr2_row_mt_md5_dec;
+      multi_thr2_row_mt_size_enc = size_enc_;
+      multi_thr2_row_mt_md5_enc = md5_enc_;
+      multi_thr2_row_mt_md5_dec = md5_dec_;
+      size_enc_.clear();
+      md5_enc_.clear();
+      md5_dec_.clear();
+
+      // Disable threads=3 test for now to reduce the time so that the nightly
+      // test would not time out.
+      // cfg_.g_threads = 3;
+      // ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      // std::vector<size_t> multi_thr3_row_mt_size_enc;
+      // std::vector<std::string> multi_thr3_row_mt_md5_enc;
+      // std::vector<std::string> multi_thr3_row_mt_md5_dec;
+      // multi_thr3_row_mt_size_enc = size_enc_;
+      // multi_thr3_row_mt_md5_enc = md5_enc_;
+      // multi_thr3_row_mt_md5_dec = md5_dec_;
+      // size_enc_.clear();
+      // md5_enc_.clear();
+      // md5_dec_.clear();
+      // Check that the vectors are equal.
+      // ASSERT_EQ(multi_thr3_row_mt_size_enc, multi_thr2_row_mt_size_enc);
+      // ASSERT_EQ(multi_thr3_row_mt_md5_enc, multi_thr2_row_mt_md5_enc);
+      // ASSERT_EQ(multi_thr3_row_mt_md5_dec, multi_thr2_row_mt_md5_dec);
+
+      cfg_.g_threads = 4;
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      std::vector<size_t> multi_thr4_row_mt_size_enc;
+      std::vector<std::string> multi_thr4_row_mt_md5_enc;
+      std::vector<std::string> multi_thr4_row_mt_md5_dec;
+      multi_thr4_row_mt_size_enc = size_enc_;
+      multi_thr4_row_mt_md5_enc = md5_enc_;
+      multi_thr4_row_mt_md5_dec = md5_dec_;
+      size_enc_.clear();
+      md5_enc_.clear();
+      md5_dec_.clear();
+
+      // Check that the vectors are equal.
+      ASSERT_EQ(multi_thr4_row_mt_size_enc, multi_thr2_row_mt_size_enc);
+      ASSERT_EQ(multi_thr4_row_mt_md5_enc, multi_thr2_row_mt_md5_enc);
+      ASSERT_EQ(multi_thr4_row_mt_md5_dec, multi_thr2_row_mt_md5_dec);
+    }
+  }
+
+  bool encoder_initialized_;
+  ::libaom_test::TestMode encoding_mode_;
+  int set_cpu_used_;
+  int tile_cols_;
+  int tile_rows_;
+  int row_mt_;
+  ::libaom_test::Decoder *decoder_;
+  std::vector<size_t> size_enc_;
+  std::vector<std::string> md5_enc_;
+  std::vector<std::string> md5_dec_;
+};
+
+TEST_P(AVxEncoderThreadTest, EncoderResultTest) {
+  cfg_.large_scale_tile = 0;
+  decoder_->Control(AV1_SET_TILE_MODE, 0);
+  DoTest();
+}
+
+class AVxEncoderThreadTestLarge : public AVxEncoderThreadTest {};
+
+TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) {
+  cfg_.large_scale_tile = 0;
+  decoder_->Control(AV1_SET_TILE_MODE, 0);
+  DoTest();
+}
+
+// For AV1, only test speed 0 to 3.
+// Here test cpu_used 2 and 3
+AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTest,
+                          ::testing::Values(::libaom_test::kTwoPassGood),
+                          ::testing::Range(2, 4), ::testing::Values(0, 2),
+                          ::testing::Values(0, 1), ::testing::Values(0, 1));
+
+// Test cpu_used 0 and 1.
+AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTestLarge,
+                          ::testing::Values(::libaom_test::kTwoPassGood,
+                                            ::libaom_test::kOnePassGood),
+                          ::testing::Range(0, 2),
+                          ::testing::Values(0, 1, 2, 6),
+                          ::testing::Values(0, 1, 2, 6),
+                          ::testing::Values(0, 1));
+
+class AVxEncoderThreadLSTest : public AVxEncoderThreadTest {
+  virtual void SetTileSize(libaom_test::Encoder *encoder) {
+    encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
+    encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
+  }
+};
+
+TEST_P(AVxEncoderThreadLSTest, EncoderResultTest) {
+  cfg_.large_scale_tile = 1;
+  decoder_->Control(AV1_SET_TILE_MODE, 1);
+  decoder_->Control(AV1D_EXT_TILE_DEBUG, 1);
+  DoTest();
+}
+
+class AVxEncoderThreadLSTestLarge : public AVxEncoderThreadLSTest {};
+
+TEST_P(AVxEncoderThreadLSTestLarge, EncoderResultTest) {
+  cfg_.large_scale_tile = 1;
+  decoder_->Control(AV1_SET_TILE_MODE, 1);
+  decoder_->Control(AV1D_EXT_TILE_DEBUG, 1);
+  DoTest();
+}
+
+AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadLSTestLarge,
+                          ::testing::Values(::libaom_test::kTwoPassGood,
+                                            ::libaom_test::kOnePassGood),
+                          ::testing::Range(0, 4), ::testing::Values(0, 6),
+                          ::testing::Values(0, 6), ::testing::Values(0, 1));
+}  // namespace
diff --git a/libs/libaom/src/test/examples.sh b/libs/libaom/src/test/examples.sh
new file mode 100644
index 000000000..2cdb89dd0
--- /dev/null
+++ b/libs/libaom/src/test/examples.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file runs all of the tests for the libaom examples.
+##
+. $(dirname $0)/tools_common.sh
+
+example_tests=$(ls -r $(dirname $0)/*.sh)
+
+# List of script names to exclude.
+exclude_list="best_encode examples run_encodes tools_common"
+
+# Filter out the scripts in $exclude_list.
+for word in ${exclude_list}; do
+  example_tests=$(filter_strings "${example_tests}" "${word}" exclude)
+done
+
+for test in ${example_tests}; do
+  # Source each test script so that exporting variables can be avoided.
+  AOM_TEST_NAME="$(basename ${test%.*})"
+  . "${test}"
+done
diff --git a/libs/libaom/src/test/external_frame_buffer_test.cc b/libs/libaom/src/test/external_frame_buffer_test.cc
new file mode 100644
index 000000000..1d726a4f1
--- /dev/null
+++ b/libs/libaom/src/test/external_frame_buffer_test.cc
@@ -0,0 +1,540 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <memory>
+#include <string>
+#include "common/tools_common.h"
+#include "config/aom_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+
+namespace {
+
+const int kVideoNameParam = 1;
+
+struct ExternalFrameBuffer {
+  uint8_t *data;
+  size_t size;
+  int in_use;
+};
+
+// Class to manipulate a list of external frame buffers.
+class ExternalFrameBufferList {
+ public:
+  ExternalFrameBufferList()
+      : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(NULL) {}
+
+  virtual ~ExternalFrameBufferList() {
+    for (int i = 0; i < num_buffers_; ++i) {
+      delete[] ext_fb_list_[i].data;
+    }
+    delete[] ext_fb_list_;
+  }
+
+  // Creates the list to hold the external buffers. Returns true on success.
+  bool CreateBufferList(int num_buffers) {
+    if (num_buffers < 0) return false;
+
+    num_buffers_ = num_buffers;
+    ext_fb_list_ = new ExternalFrameBuffer[num_buffers_];
+    EXPECT_TRUE(ext_fb_list_ != NULL);
+    memset(ext_fb_list_, 0, sizeof(ext_fb_list_[0]) * num_buffers_);
+    return true;
+  }
+
+  // Searches the frame buffer list for a free frame buffer. Makes sure
+  // that the frame buffer is at least |min_size| in bytes. Marks that the
+  // frame buffer is in use by libaom. Finally sets |fb| to point to the
+  // external frame buffer. Returns < 0 on an error.
+  int GetFreeFrameBuffer(size_t min_size, aom_codec_frame_buffer_t *fb) {
+    EXPECT_TRUE(fb != NULL);
+    const int idx = FindFreeBufferIndex();
+    if (idx == num_buffers_) return -1;
+
+    if (ext_fb_list_[idx].size < min_size) {
+      delete[] ext_fb_list_[idx].data;
+      ext_fb_list_[idx].data = new uint8_t[min_size];
+      memset(ext_fb_list_[idx].data, 0, min_size);
+      ext_fb_list_[idx].size = min_size;
+    }
+
+    SetFrameBuffer(idx, fb);
+
+    num_used_buffers_++;
+    return 0;
+  }
+
+  // Test function that will not allocate any data for the frame buffer.
+  // Returns < 0 on an error.
+  int GetZeroFrameBuffer(size_t min_size, aom_codec_frame_buffer_t *fb) {
+    EXPECT_TRUE(fb != NULL);
+    const int idx = FindFreeBufferIndex();
+    if (idx == num_buffers_) return -1;
+
+    if (ext_fb_list_[idx].size < min_size) {
+      delete[] ext_fb_list_[idx].data;
+      ext_fb_list_[idx].data = NULL;
+      ext_fb_list_[idx].size = min_size;
+    }
+
+    SetFrameBuffer(idx, fb);
+    return 0;
+  }
+
+  // Marks the external frame buffer that |fb| is pointing to as free.
+  // Returns < 0 on an error.
+  int ReturnFrameBuffer(aom_codec_frame_buffer_t *fb) {
+    if (fb == NULL) {
+      EXPECT_TRUE(fb != NULL);
+      return -1;
+    }
+    ExternalFrameBuffer *const ext_fb =
+        reinterpret_cast<ExternalFrameBuffer *>(fb->priv);
+    if (ext_fb == NULL) {
+      EXPECT_TRUE(ext_fb != NULL);
+      return -1;
+    }
+    EXPECT_EQ(1, ext_fb->in_use);
+    ext_fb->in_use = 0;
+    num_used_buffers_--;
+    return 0;
+  }
+
+  // Checks that the aom_image_t data is contained within the external frame
+  // buffer private data passed back in the aom_image_t.
+  void CheckImageFrameBuffer(const aom_image_t *img) {
+    const struct ExternalFrameBuffer *const ext_fb =
+        reinterpret_cast<ExternalFrameBuffer *>(img->fb_priv);
+
+    ASSERT_TRUE(img->planes[0] >= ext_fb->data &&
+                img->planes[0] < (ext_fb->data + ext_fb->size));
+  }
+
+  int num_used_buffers() const { return num_used_buffers_; }
+
+ private:
+  // Returns the index of the first free frame buffer. Returns |num_buffers_|
+  // if there are no free frame buffers.
+  int FindFreeBufferIndex() {
+    int i;
+    // Find a free frame buffer.
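+    // A linear scan is fine here; the buffer lists used by these tests are
+    // small.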
+    for (i = 0; i < num_buffers_; ++i) {
+      if (!ext_fb_list_[i].in_use) break;
+    }
+    return i;
+  }
+
+  // Sets |fb| to an external frame buffer. idx is the index into the frame
+  // buffer list.
+  void SetFrameBuffer(int idx, aom_codec_frame_buffer_t *fb) {
+    ASSERT_TRUE(fb != NULL);
+    fb->data = ext_fb_list_[idx].data;
+    fb->size = ext_fb_list_[idx].size;
+    ASSERT_EQ(0, ext_fb_list_[idx].in_use);
+    ext_fb_list_[idx].in_use = 1;
+    fb->priv = &ext_fb_list_[idx];
+  }
+
+  int num_buffers_;
+  int num_used_buffers_;
+  ExternalFrameBuffer *ext_fb_list_;
+};
+
+#if CONFIG_WEBM_IO
+
+// Callback used by libaom to request the application to return a frame
+// buffer of at least |min_size| in bytes.
+int get_aom_frame_buffer(void *user_priv, size_t min_size,
+                         aom_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList *>(user_priv);
+  return fb_list->GetFreeFrameBuffer(min_size, fb);
+}
+
+// Callback used by libaom to tell the application that |fb| is not needed
+// anymore.
+int release_aom_frame_buffer(void *user_priv, aom_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList *>(user_priv);
+  return fb_list->ReturnFrameBuffer(fb);
+}
+
+// Callback will not allocate data for frame buffer.
+int get_aom_zero_frame_buffer(void *user_priv, size_t min_size,
+                              aom_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList *>(user_priv);
+  return fb_list->GetZeroFrameBuffer(min_size, fb);
+}
+
+// Callback will allocate one less byte than |min_size|.
+int get_aom_one_less_byte_frame_buffer(void *user_priv, size_t min_size,
+                                       aom_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList *>(user_priv);
+  return fb_list->GetFreeFrameBuffer(min_size - 1, fb);
+}
+
+// Callback will not release the external frame buffer.
+int do_not_release_aom_frame_buffer(void *user_priv,
+                                    aom_codec_frame_buffer_t *fb) {
+  (void)user_priv;
+  (void)fb;
+  return 0;
+}
+
+#endif  // CONFIG_WEBM_IO
+
+// Class for testing passing in external frame buffers to libaom.
+class ExternalFrameBufferMD5Test
+    : public ::libaom_test::DecoderTest,
+      public ::libaom_test::CodecTestWithParam<const char *> {
+ protected:
+  ExternalFrameBufferMD5Test()
+      : DecoderTest(GET_PARAM(::libaom_test::kCodecFactoryParam)),
+        md5_file_(NULL), num_buffers_(0) {}
+
+  virtual ~ExternalFrameBufferMD5Test() {
+    if (md5_file_ != NULL) fclose(md5_file_);
+  }
+
+  virtual void PreDecodeFrameHook(
+      const libaom_test::CompressedVideoSource &video,
+      libaom_test::Decoder *decoder) {
+    if (num_buffers_ > 0 && video.frame_number() == 0) {
+      // Have libaom use frame buffers we create.
+      ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_));
+      ASSERT_EQ(AOM_CODEC_OK,
+                decoder->SetFrameBufferFunctions(GetAV1FrameBuffer,
+                                                 ReleaseAV1FrameBuffer, this));
+    }
+  }
+
+  void OpenMD5File(const std::string &md5_file_name_) {
+    md5_file_ = libaom_test::OpenTestDataFile(md5_file_name_);
+    ASSERT_TRUE(md5_file_ != NULL)
+        << "Md5 file open failed. Filename: " << md5_file_name_;
+  }
+
+  virtual void DecompressedFrameHook(const aom_image_t &img,
+                                     const unsigned int frame_number) {
+    ASSERT_TRUE(md5_file_ != NULL);
+    char expected_md5[33];
+    char junk[128];
+
+    // Read correct md5 checksums.
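+    // Lines are in md5sum format: the checksum first, then a second token
+    // (the file name), which is read into |junk| and ignored.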
+    const int res = fscanf(md5_file_, "%s %s", expected_md5, junk);
+    ASSERT_NE(EOF, res) << "Read md5 data failed";
+    expected_md5[32] = '\0';
+
+    ::libaom_test::MD5 md5_res;
+#if FORCE_HIGHBITDEPTH_DECODING
+    const aom_img_fmt_t shifted_fmt =
+        (aom_img_fmt)(img.fmt & ~AOM_IMG_FMT_HIGHBITDEPTH);
+    if (img.bit_depth == 8 && shifted_fmt != img.fmt) {
+      aom_image_t *img_shifted =
+          aom_img_alloc(NULL, shifted_fmt, img.d_w, img.d_h, 16);
+      img_shifted->bit_depth = img.bit_depth;
+      img_shifted->monochrome = img.monochrome;
+      aom_img_downshift(img_shifted, &img, 0);
+      md5_res.Add(img_shifted);
+      aom_img_free(img_shifted);
+    } else {
+#endif
+      md5_res.Add(&img);
+#if FORCE_HIGHBITDEPTH_DECODING
+    }
+#endif
+    const char *const actual_md5 = md5_res.Get();
+
+    // Check md5 match.
+    ASSERT_STREQ(expected_md5, actual_md5)
+        << "Md5 checksums don't match: frame number = " << frame_number;
+
+    const struct ExternalFrameBuffer *const ext_fb =
+        reinterpret_cast<ExternalFrameBuffer *>(img.fb_priv);
+
+    ASSERT_TRUE(img.planes[0] >= ext_fb->data &&
+                img.planes[0] < (ext_fb->data + ext_fb->size));
+  }
+
+  // Callback to get a free external frame buffer. Return value < 0 is an
+  // error.
+  static int GetAV1FrameBuffer(void *user_priv, size_t min_size,
+                               aom_codec_frame_buffer_t *fb) {
+    ExternalFrameBufferMD5Test *const md5Test =
+        reinterpret_cast<ExternalFrameBufferMD5Test *>(user_priv);
+    return md5Test->fb_list_.GetFreeFrameBuffer(min_size, fb);
+  }
+
+  // Callback to release an external frame buffer. Return value < 0 is an
+  // error.
+  static int ReleaseAV1FrameBuffer(void *user_priv,
+                                   aom_codec_frame_buffer_t *fb) {
+    ExternalFrameBufferMD5Test *const md5Test =
+        reinterpret_cast<ExternalFrameBufferMD5Test *>(user_priv);
+    return md5Test->fb_list_.ReturnFrameBuffer(fb);
+  }
+
+  void set_num_buffers(int num_buffers) { num_buffers_ = num_buffers; }
+  int num_buffers() const { return num_buffers_; }
+
+ private:
+  FILE *md5_file_;
+  int num_buffers_;
+  ExternalFrameBufferList fb_list_;
+};
+
+#if CONFIG_WEBM_IO
+const char kAV1TestFile[] = "av1-1-b8-03-sizeup.mkv";
+const char kAV1NonRefTestFile[] = "av1-1-b8-01-size-226x226.ivf";
+
+// Class for testing passing in external frame buffers to libaom.
+class ExternalFrameBufferTest : public ::testing::Test {
+ protected:
+  ExternalFrameBufferTest() : video_(NULL), decoder_(NULL), num_buffers_(0) {}
+
+  virtual void SetUp() {
+    video_ = new libaom_test::WebMVideoSource(kAV1TestFile);
+    ASSERT_TRUE(video_ != NULL);
+    video_->Init();
+    video_->Begin();
+
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
+    decoder_ = new libaom_test::AV1Decoder(cfg, 0);
+    ASSERT_TRUE(decoder_ != NULL);
+  }
+
+  virtual void TearDown() {
+    delete decoder_;
+    decoder_ = NULL;
+    delete video_;
+    video_ = NULL;
+  }
+
+  // Passes the external frame buffer information to libaom.
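+  // fb_list_ is handed to the decoder as the user_priv pointer that is
+  // passed back on every get/release callback.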
+  aom_codec_err_t SetFrameBufferFunctions(
+      int num_buffers, aom_get_frame_buffer_cb_fn_t cb_get,
+      aom_release_frame_buffer_cb_fn_t cb_release) {
+    if (num_buffers > 0) {
+      num_buffers_ = num_buffers;
+      EXPECT_TRUE(fb_list_.CreateBufferList(num_buffers_));
+    }
+
+    return decoder_->SetFrameBufferFunctions(cb_get, cb_release, &fb_list_);
+  }
+
+  aom_codec_err_t DecodeOneFrame() {
+    const aom_codec_err_t res =
+        decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+    CheckDecodedFrames();
+    if (res == AOM_CODEC_OK) video_->Next();
+    return res;
+  }
+
+  aom_codec_err_t DecodeRemainingFrames() {
+    for (; video_->cxdata() != NULL; video_->Next()) {
+      const aom_codec_err_t res =
+          decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+      if (res != AOM_CODEC_OK) return res;
+      CheckDecodedFrames();
+    }
+    return AOM_CODEC_OK;
+  }
+
+ protected:
+  void CheckDecodedFrames() {
+    libaom_test::DxDataIterator dec_iter = decoder_->GetDxData();
+    const aom_image_t *img = NULL;
+
+    // Get decompressed data
+    while ((img = dec_iter.Next()) != NULL) {
+      fb_list_.CheckImageFrameBuffer(img);
+    }
+  }
+
+  libaom_test::CompressedVideoSource *video_;
+  libaom_test::AV1Decoder *decoder_;
+  int num_buffers_;
+  ExternalFrameBufferList fb_list_;
+};
+
+class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest {
+ protected:
+  virtual void SetUp() {
+    video_ = new libaom_test::IVFVideoSource(kAV1NonRefTestFile);
+    ASSERT_TRUE(video_ != NULL);
+    video_->Init();
+    video_->Begin();
+
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
+    decoder_ = new libaom_test::AV1Decoder(cfg, 0);
+    ASSERT_TRUE(decoder_ != NULL);
+  }
+
+  virtual void CheckFrameBufferRelease() {
+    TearDown();
+    ASSERT_EQ(0, fb_list_.num_used_buffers());
+  }
+};
+#endif  // CONFIG_WEBM_IO
+
+// This test runs through the set of test vectors, and decodes them.
+// Libaom will call into the application to allocate a frame buffer when
+// needed. The md5 checksums are computed for each frame in the video file.
+// If the md5 checksums match the correct md5 data, the test passes;
+// otherwise it fails.
+TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) {
+  const std::string filename = GET_PARAM(kVideoNameParam);
+  aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+
+  // Number of buffers equals #AOM_MAXIMUM_REF_BUFFERS +
+  // #AOM_MAXIMUM_WORK_BUFFERS + four jitter buffers.
+  const int jitter_buffers = 4;
+  const int num_buffers =
+      AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS + jitter_buffers;
+  set_num_buffers(num_buffers);
+
+  // Open compressed video file.
+  std::unique_ptr<libaom_test::CompressedVideoSource> video;
+  if (filename.substr(filename.length() - 3, 3) == "ivf") {
+    video.reset(new libaom_test::IVFVideoSource(filename));
+  } else {
+#if CONFIG_WEBM_IO
+    video.reset(new libaom_test::WebMVideoSource(filename));
+#else
+    fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
+            filename.c_str());
+    return;
+#endif
+  }
+  ASSERT_TRUE(video.get() != NULL);
+  video->Init();
+
+  // Construct md5 file name.
+  const std::string md5_filename = filename + ".md5";
+  OpenMD5File(md5_filename);
+
+  // Set decode config.
+  cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
+  set_cfg(cfg);
+
+  // Decode frame, and check the md5 matching.
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg));
+}
+
+#if CONFIG_WEBM_IO
+TEST_F(ExternalFrameBufferTest, MinFrameBuffers) {
+  // Minimum number of external frame buffers for AV1 is
+  // #AOM_MAXIMUM_REF_BUFFERS + #AOM_MAXIMUM_WORK_BUFFERS.
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(AOM_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, + release_aom_frame_buffer)); + ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames()); +} + +TEST_F(ExternalFrameBufferTest, EightJitterBuffers) { + // Number of buffers equals #AOM_MAXIMUM_REF_BUFFERS + + // #AOM_MAXIMUM_WORK_BUFFERS + eight jitter buffers. + const int jitter_buffers = 8; + const int num_buffers = + AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS + jitter_buffers; + ASSERT_EQ(AOM_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, + release_aom_frame_buffer)); + ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames()); +} + +TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) { + // Minimum number of external frame buffers for AV1 is + // #AOM_MAXIMUM_REF_BUFFERS + #AOM_MAXIMUM_WORK_BUFFERS. Most files will + // only use 5 frame buffers at one time. + const int num_buffers = 2; + ASSERT_EQ(AOM_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, + release_aom_frame_buffer)); + ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame()); + // Only run this on long clips. Decoding a very short clip will return + // AOM_CODEC_OK even with only 2 buffers. + ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeRemainingFrames()); +} + +TEST_F(ExternalFrameBufferTest, NoRelease) { + const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(AOM_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, + do_not_release_aom_frame_buffer)); + ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame()); + ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeRemainingFrames()); +} + +TEST_F(ExternalFrameBufferTest, NullRealloc) { + const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(AOM_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_aom_zero_frame_buffer, + release_aom_frame_buffer)); + ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeOneFrame()); +} + +TEST_F(ExternalFrameBufferTest, ReallocOneLessByte) { + const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(AOM_CODEC_OK, SetFrameBufferFunctions( + num_buffers, get_aom_one_less_byte_frame_buffer, + release_aom_frame_buffer)); + ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeOneFrame()); +} + +TEST_F(ExternalFrameBufferTest, NullGetFunction) { + const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ( + AOM_CODEC_INVALID_PARAM, + SetFrameBufferFunctions(num_buffers, NULL, release_aom_frame_buffer)); +} + +TEST_F(ExternalFrameBufferTest, NullReleaseFunction) { + const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(AOM_CODEC_INVALID_PARAM, + SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, NULL)); +} + +TEST_F(ExternalFrameBufferTest, SetAfterDecode) { + const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame()); + ASSERT_EQ(AOM_CODEC_ERROR, + SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, + release_aom_frame_buffer)); +} + +TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) { + const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(AOM_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, + release_aom_frame_buffer)); + ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames()); + CheckFrameBufferRelease(); +} +#endif // CONFIG_WEBM_IO + +AV1_INSTANTIATE_TEST_CASE( + ExternalFrameBufferMD5Test, + 
::testing::ValuesIn(libaom_test::kAV1TestVectors,
+                        libaom_test::kAV1TestVectors +
+                            libaom_test::kNumAV1TestVectors));
+}  // namespace
diff --git a/libs/libaom/src/test/fdct4x4_test.cc b/libs/libaom/src/test/fdct4x4_test.cc
new file mode 100644
index 000000000..6600f2c46
--- /dev/null
+++ b/libs/libaom/src/test/fdct4x4_test.cc
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tuple>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "av1/common/entropy.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+template <typename OutputType>
+using FdctFunc = void (*)(const int16_t *in, OutputType *out, int stride);
+
+template <typename OutputType>
+using FhtFunc = void (*)(const int16_t *in, OutputType *out, int stride,
+                         TxfmParam *txfm_param);
+
+template <typename OutputType>
+using Fdct4x4Param = std::tuple<FdctFunc<OutputType>, FhtFunc<OutputType>,
+                                aom_bit_depth_t, int>;
+
+#if HAVE_NEON || HAVE_SSE2
+void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+                 TxfmParam * /*txfm_param*/) {
+  aom_fdct4x4_c(in, out, stride);
+}
+
+void fdct4x4_lp_ref(const int16_t *in, int16_t *out, int stride,
+                    TxfmParam * /*txfm_param*/) {
+  aom_fdct4x4_lp_c(in, out, stride);
+}
+#endif
+
+template <typename OutputType>
+class Trans4x4FDCT : public libaom_test::TransformTestBase<OutputType>,
+                     public ::testing::TestWithParam<Fdct4x4Param<OutputType>> {
+ public:
+  virtual ~Trans4x4FDCT() {}
+
+  using TxfmBaseOutType = libaom_test::TransformTestBase<OutputType>;
+  virtual void SetUp() {
+    fwd_txfm_ = std::get<0>(this->GetParam());
+    TxfmBaseOutType::pitch_ = 4;
+    TxfmBaseOutType::height_ = 4;
+    TxfmBaseOutType::fwd_txfm_ref = std::get<1>(this->GetParam());
+    TxfmBaseOutType::bit_depth_ = std::get<2>(this->GetParam());
+    TxfmBaseOutType::mask_ = (1 << TxfmBaseOutType::bit_depth_) - 1;
+    TxfmBaseOutType::num_coeffs_ = std::get<3>(this->GetParam());
+  }
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, OutputType *out, int stride) {
+    fwd_txfm_(in, out, stride);
+  }
+
+  void RunInvTxfm(const OutputType *out, uint8_t *dst, int stride) {
+    (void)out;
+    (void)dst;
+    (void)stride;
+  }
+
+  FdctFunc<OutputType> fwd_txfm_;
+};
+
+using Trans4x4FDCTTranLow = Trans4x4FDCT<tran_low_t>;
+TEST_P(Trans4x4FDCTTranLow, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(Trans4x4FDCTTranLow, MemCheck) { RunMemCheck(); }
+
+using Trans4x4FDCTInt16 = Trans4x4FDCT<int16_t>;
+TEST_P(Trans4x4FDCTInt16, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(Trans4x4FDCTInt16, MemCheck) { RunMemCheck(); }
+
+using std::make_tuple;
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, Trans4x4FDCTTranLow,
+                         ::testing::Values(make_tuple(&aom_fdct4x4_neon,
+                                                      &fdct4x4_ref, AOM_BITS_8,
+                                                      16)));
+
+INSTANTIATE_TEST_SUITE_P(NEON, Trans4x4FDCTInt16,
+                         ::testing::Values(make_tuple(&aom_fdct4x4_lp_neon,
+                                                      &fdct4x4_lp_ref,
+                                                      AOM_BITS_8, 16)));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, Trans4x4FDCTTranLow,
+                         ::testing::Values(make_tuple(&aom_fdct4x4_sse2,
+                                                      &fdct4x4_ref, AOM_BITS_8,
+                                                      16)));
+
+INSTANTIATE_TEST_SUITE_P(SSE2, Trans4x4FDCTInt16,
+                         ::testing::Values(make_tuple(&aom_fdct4x4_lp_sse2,
+                                                      &fdct4x4_lp_ref,
+                                                      AOM_BITS_8, 16)));
+#endif
+}  // namespace
diff --git a/libs/libaom/src/test/fft_test.cc b/libs/libaom/src/test/fft_test.cc
new file mode 100644
index 000000000..d23aa012c
--- /dev/null
+++ b/libs/libaom/src/test/fft_test.cc
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <algorithm>
+#include <complex>
+#include <ostream>
+#include <vector>
+
+#include "aom_dsp/fft_common.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/common.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+typedef void (*tform_fun_t)(const float *input, float *temp, float *output);
+
+// Simple 1D FFT implementation
+template <typename InputType>
+void fft(const InputType *data, std::complex<float> *result, int n) {
+  if (n == 1) {
+    result[0] = data[0];
+    return;
+  }
+  std::vector<InputType> temp(n);
+  for (int k = 0; k < n / 2; ++k) {
+    temp[k] = data[2 * k];
+    temp[n / 2 + k] = data[2 * k + 1];
+  }
+  fft(&temp[0], result, n / 2);
+  fft(&temp[n / 2], result + n / 2, n / 2);
+  for (int k = 0; k < n / 2; ++k) {
+    std::complex<float> w = std::complex<float>((float)cos(2. * PI * k / n),
+                                                (float)-sin(2. * PI * k / n));
+    std::complex<float> a = result[k];
+    std::complex<float> b = result[n / 2 + k];
+    result[k] = a + w * b;
+    result[n / 2 + k] = a - w * b;
+  }
+}
+
+void transpose(std::vector<std::complex<float> > *data, int n) {
+  for (int y = 0; y < n; ++y) {
+    for (int x = y + 1; x < n; ++x) {
+      std::swap((*data)[y * n + x], (*data)[x * n + y]);
+    }
+  }
+}
+
+// Simple 2D FFT implementation
+template <typename InputType>
+std::vector<std::complex<float> > fft2d(const InputType *input, int n) {
+  std::vector<std::complex<float> > rowfft(n * n);
+  std::vector<std::complex<float> > result(n * n);
+  for (int y = 0; y < n; ++y) {
+    fft(input + y * n, &rowfft[y * n], n);
+  }
+  transpose(&rowfft, n);
+  for (int y = 0; y < n; ++y) {
+    fft(&rowfft[y * n], &result[y * n], n);
+  }
+  transpose(&result, n);
+  return result;
+}
+
+struct FFTTestArg {
+  int n;
+  void (*fft)(const float *input, float *temp, float *output);
+  FFTTestArg(int n_in, tform_fun_t fft_in) : n(n_in), fft(fft_in) {}
+};
+
+std::ostream &operator<<(std::ostream &os, const FFTTestArg &test_arg) {
+  return os << "fft_arg { n:" << test_arg.n << " fft:" << test_arg.fft << " }";
+}
+
+class FFT2DTest : public ::testing::TestWithParam<FFTTestArg> {
+ protected:
+  void SetUp() {
+    int n = GetParam().n;
+    input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n);
+    temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n);
+    output_ = (float *)aom_memalign(32, sizeof(*output_) * n * n * 2);
+    memset(input_, 0, sizeof(*input_) * n * n);
+    memset(temp_, 0, sizeof(*temp_) * n * n);
+    memset(output_, 0, sizeof(*output_) * n * n * 2);
+  }
+  void TearDown() {
+    aom_free(input_);
+    aom_free(temp_);
+    aom_free(output_);
+  }
+  float *input_;
+  float *temp_;
+  float *output_;
+};
+
+TEST_P(FFT2DTest, Correct) {
+  int n = GetParam().n;
+  for (int i = 0; i < n * n; ++i) {
+    input_[i] = 1;
+    std::vector<std::complex<float> > expected = fft2d(&input_[0], n);
+    GetParam().fft(&input_[0], &temp_[0], &output_[0]);
+    for (int y = 0; y < n; ++y) {
+      for (int x = 0; x < (n / 2) + 1; ++x) {
+        EXPECT_NEAR(expected[y * n + x].real(), output_[2 * (y * n + x)], 1e-5);
+        EXPECT_NEAR(expected[y * n + x].imag(), output_[2 * (y * n + x) + 1],
+                    1e-5);
+      }
+    }
+    input_[i] = 0;
+  }
+}
+
+TEST_P(FFT2DTest, Benchmark) {
+  int n = GetParam().n;
+  float sum = 0;
+  for (int i = 0; i < 1000 * (64 - n); ++i) {
+    input_[i % (n * n)] = 1;
+    GetParam().fft(&input_[0], &temp_[0], &output_[0]);
+    sum += output_[0];
+    input_[i % (n * n)] = 0;
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(C, FFT2DTest,
+                         ::testing::Values(FFTTestArg(2, aom_fft2x2_float_c),
+                                           FFTTestArg(4, aom_fft4x4_float_c),
+                                           FFTTestArg(8, aom_fft8x8_float_c),
+                                           FFTTestArg(16, aom_fft16x16_float_c),
+                                           FFTTestArg(32,
+                                                      aom_fft32x32_float_c)));
+#if ARCH_X86 || ARCH_X86_64
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, FFT2DTest,
+    ::testing::Values(FFTTestArg(4, aom_fft4x4_float_sse2),
+                      FFTTestArg(8, aom_fft8x8_float_sse2),
+                      FFTTestArg(16, aom_fft16x16_float_sse2),
+                      FFTTestArg(32, aom_fft32x32_float_sse2)));
+#endif  // HAVE_SSE2
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, FFT2DTest,
+    ::testing::Values(FFTTestArg(8, aom_fft8x8_float_avx2),
+                      FFTTestArg(16, aom_fft16x16_float_avx2),
+                      FFTTestArg(32, aom_fft32x32_float_avx2)));
+#endif  // HAVE_AVX2
+#endif  // ARCH_X86 || ARCH_X86_64
+
+struct IFFTTestArg {
+  int n;
+  tform_fun_t ifft;
+  IFFTTestArg(int n_in, tform_fun_t ifft_in) : n(n_in), ifft(ifft_in) {}
+};
+
+std::ostream &operator<<(std::ostream &os, const IFFTTestArg &test_arg) {
+  return os << "ifft_arg { n:" << test_arg.n << " fft:" << test_arg.ifft
+            << " }";
+}
+
+class IFFT2DTest : public ::testing::TestWithParam<IFFTTestArg> {
+ protected:
+  void SetUp() {
+    int n = GetParam().n;
+    input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n * 2);
+    temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n * 2);
+    output_ = (float *)aom_memalign(32, sizeof(*output_) * n * n);
+    memset(input_, 0, sizeof(*input_) * n * n * 2);
+    memset(temp_, 0, sizeof(*temp_) * n * n * 2);
+    memset(output_, 0, sizeof(*output_) * n * n);
+  }
+  void TearDown() {
+    aom_free(input_);
+    aom_free(temp_);
+    aom_free(output_);
+  }
+  float *input_;
+  float *temp_;
+  float *output_;
+};
+
+TEST_P(IFFT2DTest, Correctness) {
+  int n = GetParam().n;
+  ASSERT_GE(n, 2);
+  std::vector<float> expected(n * n);
+  std::vector<float> actual(n * n);
+  // Do forward transform then invert to make sure we get back expected
+  for (int y = 0; y < n; ++y) {
+    for (int x = 0; x < n; ++x) {
+      expected[y * n + x] = 1;
+      std::vector<std::complex<float> > input_c = fft2d(&expected[0], n);
+      for (int i = 0; i < n * n; ++i) {
+        input_[2 * i + 0] = input_c[i].real();
+        input_[2 * i + 1] = input_c[i].imag();
+      }
+      GetParam().ifft(&input_[0], &temp_[0], &output_[0]);
+
+      for (int yy = 0; yy < n; ++yy) {
+        for (int xx = 0; xx < n; ++xx) {
+          EXPECT_NEAR(expected[yy * n + xx], output_[yy * n + xx] / (n * n),
+                      1e-5);
+        }
+      }
+      expected[y * n + x] = 0;
+    }
+  }
+}
+
+TEST_P(IFFT2DTest, Benchmark) {
+  int n = GetParam().n;
+  float sum = 0;
+  for (int i = 0; i < 1000 * (64 - n); ++i) {
+    input_[i % (n * n)] = 1;
+    GetParam().ifft(&input_[0], &temp_[0], &output_[0]);
+    sum += output_[0];
+    input_[i % (n * n)] = 0;
+  }
+}
+INSTANTIATE_TEST_SUITE_P(
+    C, IFFT2DTest,
+    ::testing::Values(IFFTTestArg(2, aom_ifft2x2_float_c),
+                      IFFTTestArg(4, aom_ifft4x4_float_c),
+                      IFFTTestArg(8, aom_ifft8x8_float_c),
+                      IFFTTestArg(16, aom_ifft16x16_float_c),
+                      IFFTTestArg(32, aom_ifft32x32_float_c)));
+#if ARCH_X86 || ARCH_X86_64
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, IFFT2DTest,
+    ::testing::Values(IFFTTestArg(4, aom_ifft4x4_float_sse2),
+                      IFFTTestArg(8, aom_ifft8x8_float_sse2),
+                      IFFTTestArg(16, aom_ifft16x16_float_sse2),
+                      IFFTTestArg(32, aom_ifft32x32_float_sse2)));
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, IFFT2DTest,
+    ::testing::Values(IFFTTestArg(8, aom_ifft8x8_float_avx2),
+                      IFFTTestArg(16, aom_ifft16x16_float_avx2),
+                      IFFTTestArg(32, aom_ifft32x32_float_avx2)));
+#endif  // HAVE_AVX2
+#endif  // ARCH_X86 || ARCH_X86_64
+
+}  // namespace
diff --git a/libs/libaom/src/test/film_grain_table_test.cc b/libs/libaom/src/test/film_grain_table_test.cc
new file mode 100644
index 000000000..524d67d7b
--- /dev/null
+++ b/libs/libaom/src/test/film_grain_table_test.cc
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "aom_dsp/grain_table.h" +#include "aom/internal/aom_codec_internal.h" +#include "av1/encoder/grain_test_vectors.h" +#include "test/video_source.h" + +void grain_equal(const aom_film_grain_t *expected, + const aom_film_grain_t *actual) { + EXPECT_EQ(expected->apply_grain, actual->apply_grain); + EXPECT_EQ(expected->update_parameters, actual->update_parameters); + if (!expected->update_parameters) return; + EXPECT_EQ(expected->num_y_points, actual->num_y_points); + EXPECT_EQ(expected->num_cb_points, actual->num_cb_points); + EXPECT_EQ(expected->num_cr_points, actual->num_cr_points); + EXPECT_EQ(0, memcmp(expected->scaling_points_y, actual->scaling_points_y, + expected->num_y_points * + sizeof(expected->scaling_points_y[0]))); + EXPECT_EQ(0, memcmp(expected->scaling_points_cb, actual->scaling_points_cb, + expected->num_cb_points * + sizeof(expected->scaling_points_cb[0]))); + EXPECT_EQ(0, memcmp(expected->scaling_points_cr, actual->scaling_points_cr, + expected->num_cr_points * + sizeof(expected->scaling_points_cr[0]))); + EXPECT_EQ(expected->scaling_shift, actual->scaling_shift); + EXPECT_EQ(expected->ar_coeff_lag, actual->ar_coeff_lag); + EXPECT_EQ(expected->ar_coeff_shift, actual->ar_coeff_shift); + + const int num_pos_luma = + 2 * expected->ar_coeff_lag * (expected->ar_coeff_lag + 1); + const int num_pos_chroma = num_pos_luma; + EXPECT_EQ(0, memcmp(expected->ar_coeffs_y, actual->ar_coeffs_y, + sizeof(expected->ar_coeffs_y[0]) * num_pos_luma)); + if (actual->num_cb_points || actual->chroma_scaling_from_luma) { + EXPECT_EQ(0, memcmp(expected->ar_coeffs_cb, actual->ar_coeffs_cb, + sizeof(expected->ar_coeffs_cb[0]) * num_pos_chroma)); + } + if (actual->num_cr_points || actual->chroma_scaling_from_luma) { + EXPECT_EQ(0, memcmp(expected->ar_coeffs_cr, actual->ar_coeffs_cr, + sizeof(expected->ar_coeffs_cr[0]) * num_pos_chroma)); + } + EXPECT_EQ(expected->overlap_flag, actual->overlap_flag); + EXPECT_EQ(expected->chroma_scaling_from_luma, + actual->chroma_scaling_from_luma); + EXPECT_EQ(expected->grain_scale_shift, actual->grain_scale_shift); + // EXPECT_EQ(expected->random_seed, actual->random_seed); + + // clip_to_restricted and bit_depth aren't written + if (expected->num_cb_points) { + EXPECT_EQ(expected->cb_mult, actual->cb_mult); + EXPECT_EQ(expected->cb_luma_mult, actual->cb_luma_mult); + EXPECT_EQ(expected->cb_offset, actual->cb_offset); + } + if (expected->num_cr_points) { + EXPECT_EQ(expected->cr_mult, actual->cr_mult); + EXPECT_EQ(expected->cr_luma_mult, actual->cr_luma_mult); + EXPECT_EQ(expected->cr_offset, actual->cr_offset); + } +} + +TEST(FilmGrainTableTest, AddAndLookupSingleSegment) { + aom_film_grain_table_t table; + memset(&table, 0, sizeof(table)); + + aom_film_grain_t grain; + EXPECT_FALSE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain)); + + aom_film_grain_table_append(&table, 1000, 2000, film_grain_test_vectors + 0); + EXPECT_FALSE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain)); + EXPECT_FALSE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain)); + + EXPECT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain)); + + grain.bit_depth = film_grain_test_vectors[0].bit_depth; + EXPECT_EQ(0, memcmp(&grain, film_grain_test_vectors + 0, sizeof(table))); + + // Extend the existing segment + aom_film_grain_table_append(&table, 2000, 3000, film_grain_test_vectors + 0); + EXPECT_EQ(0, table.head->next); + + // Lookup and remove and check 
that the entry is no longer there + EXPECT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, true, &grain)); + EXPECT_FALSE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain)); + + EXPECT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, true, &grain)); + EXPECT_FALSE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain)); + + EXPECT_EQ(0, table.head); + EXPECT_EQ(0, table.tail); + aom_film_grain_table_free(&table); +} + +TEST(FilmGrainTableTest, SplitSingleSegment) { + aom_film_grain_table_t table; + aom_film_grain_t grain; + memset(&table, 0, sizeof(table)); + + aom_film_grain_table_append(&table, 0, 1000, film_grain_test_vectors + 0); + + // Test lookup and remove that adjusts start time + EXPECT_TRUE(aom_film_grain_table_lookup(&table, 0, 100, true, &grain)); + EXPECT_EQ(NULL, table.head->next); + EXPECT_EQ(100, table.head->start_time); + + // Test lookup and remove that adjusts end time + EXPECT_TRUE(aom_film_grain_table_lookup(&table, 900, 1000, true, &grain)); + EXPECT_EQ(NULL, table.head->next); + EXPECT_EQ(100, table.head->start_time); + EXPECT_EQ(900, table.head->end_time); + + // Test lookup and remove that splits the first entry + EXPECT_TRUE(aom_film_grain_table_lookup(&table, 400, 600, true, &grain)); + EXPECT_EQ(100, table.head->start_time); + EXPECT_EQ(400, table.head->end_time); + + ASSERT_NE((void *)NULL, table.head->next); + EXPECT_EQ(table.tail, table.head->next); + EXPECT_EQ(600, table.head->next->start_time); + EXPECT_EQ(900, table.head->next->end_time); + + aom_film_grain_table_free(&table); +} + +TEST(FilmGrainTableTest, AddAndLookupMultipleSegments) { + aom_film_grain_table_t table; + memset(&table, 0, sizeof(table)); + + aom_film_grain_t grain; + const int kNumTestVectors = + sizeof(film_grain_test_vectors) / sizeof(film_grain_test_vectors[0]); + for (int i = 0; i < kNumTestVectors; ++i) { + aom_film_grain_table_append(&table, i * 1000, (i + 1) * 1000, + film_grain_test_vectors + i); + } + + for (int i = kNumTestVectors - 1; i >= 0; --i) { + EXPECT_TRUE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000, + true, &grain)); + grain_equal(film_grain_test_vectors + i, &grain); + EXPECT_FALSE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000, + true, &grain)); + } + + // Verify that all the data has been removed + for (int i = 0; i < kNumTestVectors; ++i) { + EXPECT_FALSE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000, + true, &grain)); + } + aom_film_grain_table_free(&table); +} + +class FilmGrainTableIOTest : public ::testing::Test { + protected: + void SetUp() { memset(&error_, 0, sizeof(error_)); } + struct aom_internal_error_info error_; +}; + +TEST_F(FilmGrainTableIOTest, ReadMissingFile) { + aom_film_grain_table_t table; + memset(&table, 0, sizeof(table)); + ASSERT_EQ(AOM_CODEC_ERROR, aom_film_grain_table_read( + &table, "/path/to/missing/file", &error_)); +} + +TEST_F(FilmGrainTableIOTest, ReadTruncatedFile) { + aom_film_grain_table_t table; + memset(&table, 0, sizeof(table)); + + std::string grain_file; + FILE *file = libaom_test::GetTempOutFile(&grain_file); + fwrite("deadbeef", 8, 1, file); + fclose(file); + ASSERT_EQ(AOM_CODEC_ERROR, + aom_film_grain_table_read(&table, grain_file.c_str(), &error_)); + EXPECT_EQ(0, remove(grain_file.c_str())); +} + +TEST_F(FilmGrainTableIOTest, RoundTripReadWrite) { + aom_film_grain_table_t table; + memset(&table, 0, sizeof(table)); + + aom_film_grain_t expected_grain[16]; + const int kNumTestVectors = + sizeof(film_grain_test_vectors) / 
sizeof(film_grain_test_vectors[0]); + for (int i = 0; i < kNumTestVectors; ++i) { + expected_grain[i] = film_grain_test_vectors[i]; + expected_grain[i].random_seed = i; + expected_grain[i].update_parameters = i % 2; + expected_grain[i].apply_grain = (i + 1) % 2; + expected_grain[i].bit_depth = 0; + aom_film_grain_table_append(&table, i * 1000, (i + 1) * 1000, + expected_grain + i); + } + std::string grain_file; + fclose(libaom_test::GetTempOutFile(&grain_file)); + ASSERT_EQ(AOM_CODEC_OK, + aom_film_grain_table_write(&table, grain_file.c_str(), &error_)); + aom_film_grain_table_free(&table); + + memset(&table, 0, sizeof(table)); + ASSERT_EQ(AOM_CODEC_OK, + aom_film_grain_table_read(&table, grain_file.c_str(), &error_)); + for (int i = 0; i < kNumTestVectors; ++i) { + aom_film_grain_t grain; + EXPECT_TRUE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000, + true, &grain)); + grain_equal(expected_grain + i, &grain); + } + aom_film_grain_table_free(&table); + EXPECT_EQ(0, remove(grain_file.c_str())); +} + +TEST_F(FilmGrainTableIOTest, RoundTripSplit) { + std::string grain_file; + fclose(libaom_test::GetTempOutFile(&grain_file)); + + aom_film_grain_table_t table; + memset(&table, 0, sizeof(table)); + + aom_film_grain_t grain = film_grain_test_vectors[0]; + aom_film_grain_table_append(&table, 0, 3000, &grain); + ASSERT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, true, &grain)); + ASSERT_TRUE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain)); + EXPECT_FALSE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain)); + ASSERT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain)); + ASSERT_EQ(AOM_CODEC_OK, + aom_film_grain_table_write(&table, grain_file.c_str(), &error_)); + aom_film_grain_table_free(&table); + + memset(&table, 0, sizeof(table)); + ASSERT_EQ(AOM_CODEC_OK, + aom_film_grain_table_read(&table, grain_file.c_str(), &error_)); + ASSERT_TRUE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain)); + ASSERT_FALSE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain)); + ASSERT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain)); + aom_film_grain_table_free(&table); + + EXPECT_EQ(0, remove(grain_file.c_str())); +} diff --git a/libs/libaom/src/test/filterintra_test.cc b/libs/libaom/src/test/filterintra_test.cc new file mode 100644 index 000000000..284353c69 --- /dev/null +++ b/libs/libaom/src/test/filterintra_test.cc @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/av1_rtcd.h" + +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "av1/common/enums.h" + +namespace { + +using libaom_test::ACMRandom; +using std::tuple; + +typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, + const uint8_t *above, const uint8_t *left, int mode); + +// Note: +// Test parameter list: +// Reference predictor, optimized predictor, prediction mode, tx size +// +typedef tuple PredFuncMode; +typedef tuple PredParams; + +const int MaxTxSize = 32; + +const int MaxTestNum = 100; + +class AV1FilterIntraPredTest : public ::testing::TestWithParam { + public: + virtual ~AV1FilterIntraPredTest() {} + virtual void SetUp() { + PredFuncMode funcMode = GET_PARAM(0); + predFuncRef_ = std::get<0>(funcMode); + predFunc_ = std::get<1>(funcMode); + mode_ = std::get<2>(funcMode); + txSize_ = GET_PARAM(1); + + alloc_ = new uint8_t[2 * MaxTxSize + 1]; + predRef_ = new uint8_t[MaxTxSize * MaxTxSize]; + pred_ = new uint8_t[MaxTxSize * MaxTxSize]; + } + + virtual void TearDown() { + delete[] alloc_; + delete[] predRef_; + delete[] pred_; + libaom_test::ClearSystemState(); + } + + protected: + void RunTest() const { + int tstIndex = 0; + int stride = tx_size_wide[txSize_]; + uint8_t *left = alloc_; + uint8_t *above = alloc_ + MaxTxSize; + while (tstIndex < MaxTestNum) { + PrepareBuffer(); + predFuncRef_(predRef_, stride, txSize_, &above[1], left, mode_); + ASM_REGISTER_STATE_CHECK( + predFunc_(pred_, stride, txSize_, &above[1], left, mode_)); + DiffPred(tstIndex); + tstIndex += 1; + } + } + + private: + void PrepareBuffer() const { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + int i = 0; + while (i < (2 * MaxTxSize + 1)) { + alloc_[i] = rnd.Rand8(); + i++; + } + } + + void DiffPred(int testNum) const { + int i = 0; + while (i < tx_size_wide[txSize_] * tx_size_high[txSize_]) { + EXPECT_EQ(predRef_[i], pred_[i]) << "Error at position: " << i << " " + << "Tx size: " << tx_size_wide[txSize_] + << "x" << tx_size_high[txSize_] << " " + << "Test number: " << testNum; + i++; + } + } + + Predictor predFunc_; + Predictor predFuncRef_; + int mode_; + TX_SIZE txSize_; + uint8_t *alloc_; + uint8_t *pred_; + uint8_t *predRef_; +}; + +TEST_P(AV1FilterIntraPredTest, BitExactCheck) { RunTest(); } + +using std::make_tuple; + +const PredFuncMode kPredFuncMdArray[] = { + make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1, + FILTER_DC_PRED), + make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1, + FILTER_V_PRED), + make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1, + FILTER_H_PRED), + make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1, + FILTER_D157_PRED), + make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1, + FILTER_PAETH_PRED), +}; + +const TX_SIZE kTxSize[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_4X8, + TX_8X4, TX_8X16, TX_16X8, TX_16X32, TX_32X16, + TX_4X16, TX_16X4, TX_8X32, TX_32X8 }; + +INSTANTIATE_TEST_SUITE_P( + SSE4_1, AV1FilterIntraPredTest, + ::testing::Combine(::testing::ValuesIn(kPredFuncMdArray), + ::testing::ValuesIn(kTxSize))); +} // namespace diff --git a/libs/libaom/src/test/frame_error_test.cc b/libs/libaom/src/test/frame_error_test.cc new file mode 100644 index 000000000..6d74a68f2 --- /dev/null +++ b/libs/libaom/src/test/frame_error_test.cc @@ 
-0,0 +1,164 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+typedef int64_t (*frame_error_func)(const uint8_t *const ref, int stride,
+                                    const uint8_t *const dst, int p_width,
+                                    int p_height, int p_stride);
+#if HAVE_AVX2 || HAVE_SSE2
+const int kBlockWidth[] = {
+  832, 834, 640, 1280, 1920,
+};
+const int kBlockHeight[] = {
+  480, 482, 360, 720, 1080,
+};
+#endif
+typedef std::tuple<frame_error_func, int, int> FrameErrorParam;
+
+class AV1FrameErrorTest : public ::testing::TestWithParam<FrameErrorParam> {
+ public:
+  virtual ~AV1FrameErrorTest() {}
+  virtual void SetUp() {
+    rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  }
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RandomValues(frame_error_func test_impl, int width, int height);
+  void ExtremeValues(frame_error_func test_impl, int width, int height);
+  void RunSpeedTest(frame_error_func test_impl, int width, int height);
+  libaom_test::ACMRandom rnd_;
+};
+
+void AV1FrameErrorTest::RandomValues(frame_error_func test_impl, int width,
+                                     int height) {
+  const int stride = (((width * 3) / 2) + 15) & ~15;
+  const int max_blk_size = stride * height;
+  uint8_t *const dst =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
+  uint8_t *const ref =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
+  ASSERT_TRUE(dst != NULL);
+  ASSERT_TRUE(ref != NULL);
+  for (int i = 0; i < max_blk_size; ++i) {
+    dst[i] = rnd_.Rand8();
+    ref[i] = rnd_.Rand8();
+  }
+  const int64_t ref_error =
+      av1_calc_frame_error_c(ref, stride, dst, width, height, stride);
+  const int64_t test_error =
+      test_impl(ref, stride, dst, width, height, stride);
+  ASSERT_EQ(test_error, ref_error) << width << "x" << height;
+  aom_free(dst);
+  aom_free(ref);
+}
+
+void AV1FrameErrorTest::ExtremeValues(frame_error_func test_impl, int width,
+                                      int height) {
+  const int stride = (((width * 3) / 2) + 15) & ~15;
+  const int max_blk_size = stride * height;
+  uint8_t *const dst =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
+  uint8_t *const ref =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
+  ASSERT_TRUE(dst != NULL);
+  ASSERT_TRUE(ref != NULL);
+  for (int r = 0; r < 2; r++) {
+    if (r == 0) {
+      memset(dst, 0, max_blk_size);
+      memset(ref, 255, max_blk_size);
+    } else if (r == 1) {
+      memset(dst, 255, max_blk_size);
+      memset(ref, 0, max_blk_size);
+    }
+    const int64_t ref_error =
+        av1_calc_frame_error_c(ref, stride, dst, width, height, stride);
+    const int64_t test_error =
+        test_impl(ref, stride, dst, width, height, stride);
+    ASSERT_EQ(test_error, ref_error) << width << "x" << height;
+  }
+  aom_free(dst);
+  aom_free(ref);
+}
+
+void AV1FrameErrorTest::RunSpeedTest(frame_error_func test_impl, int width,
+                                     int height) {
+  const int stride = (((width * 3) / 2) + 15) & ~15;
+  const int max_blk_size = stride * height;
+  uint8_t *const dst =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
+  uint8_t *const ref =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
+  ASSERT_TRUE(dst != NULL);
+  ASSERT_TRUE(ref != NULL);
+  for (int i = 0; i < max_blk_size; ++i) {
+    dst[i] = ref[i] = rnd_.Rand8();
+  }
+  const int num_loops = 10000000 / (width + height);
+  frame_error_func funcs[2] = { av1_calc_frame_error_c, test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    frame_error_func func = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(ref, stride, dst, width, height, stride);
+    }
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  aom_free(dst);
+  aom_free(ref);
+  printf("av1_calc_frame_error %3dx%-3d: %7.2f/%7.2fns", width, height,
+         elapsed_time[0], elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1FrameErrorTest, CheckOutput) {
+  RandomValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+  ExtremeValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+TEST_P(AV1FrameErrorTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AV1FrameErrorTest,
+    ::testing::Combine(::testing::Values(&av1_calc_frame_error_sse2),
+                       ::testing::ValuesIn(kBlockWidth),
+                       ::testing::ValuesIn(kBlockHeight)));
+#endif
+
+#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1FrameErrorTest,
+    ::testing::Combine(::testing::Values(&av1_calc_frame_error_avx2),
+                       ::testing::ValuesIn(kBlockWidth),
+                       ::testing::ValuesIn(kBlockHeight)));
+#endif
+}  // namespace
diff --git a/libs/libaom/src/test/frame_size_tests.cc b/libs/libaom/src/test/frame_size_tests.cc
new file mode 100644
index 000000000..1546012a3
--- /dev/null
+++ b/libs/libaom/src/test/frame_size_tests.cc
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/video_source.h" + +namespace { + +class AV1FrameSizeTests : public ::testing::Test, + public ::libaom_test::EncoderTest { + protected: + AV1FrameSizeTests() + : EncoderTest(&::libaom_test::kAV1), expected_res_(AOM_CODEC_OK) {} + virtual ~AV1FrameSizeTests() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(::libaom_test::kRealTime); + } + + virtual bool HandleDecodeResult(const aom_codec_err_t res_dec, + libaom_test::Decoder *decoder) { + EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError(); + return !::testing::Test::HasFailure(); + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, 7); + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } + } + + int expected_res_; +}; + +#if CONFIG_SIZE_LIMIT +TEST_F(AV1FrameSizeTests, TestInvalidSizes) { + ::libaom_test::RandomVideoSource video; + + video.SetSize(DECODE_WIDTH_LIMIT + 16, DECODE_HEIGHT_LIMIT + 16); + video.set_limit(2); + expected_res_ = AOM_CODEC_CORRUPT_FRAME; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_F(AV1FrameSizeTests, LargeValidSizes) { + ::libaom_test::RandomVideoSource video; + + video.SetSize(DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT); + video.set_limit(2); + expected_res_ = AOM_CODEC_OK; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} +#endif + +TEST_F(AV1FrameSizeTests, OneByOneVideo) { + ::libaom_test::RandomVideoSource video; + + video.SetSize(1, 1); + video.set_limit(2); + expected_res_ = AOM_CODEC_OK; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} +#undef ONE_BY_ONE_VIDEO_NAME +} // namespace diff --git a/libs/libaom/src/test/function_equivalence_test.h b/libs/libaom/src/test/function_equivalence_test.h new file mode 100644 index 000000000..a299c48d4 --- /dev/null +++ b/libs/libaom/src/test/function_equivalence_test.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_ +#define AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/util.h" + +using libaom_test::ACMRandom; + +namespace libaom_test { +// Base class for tests that compare 2 implementations of the same function +// for equivalence. The template parameter should be pointer to a function +// that is being tested. +// +// The test takes a 3-parameters encapsulating struct 'FuncParam', containing: +// - Pointer to reference function +// - Pointer to tested function +// - Integer bit depth (default to 0). +// +// These values are then accessible in the tests as member of params_: +// params_.ref_func, params_.tst_func, and params_.bit_depth. 
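+//
+// A typical instantiation might look like this (names here are illustrative,
+// not part of this header):
+//   INSTANTIATE_TEST_SUITE_P(SSE2, MySadEquivalenceTest,
+//                            ::testing::Values(FuncParam<SadFn>(
+//                                &my_sad_c, &my_sad_sse2)));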
+// + +template +struct FuncParam { + FuncParam(T ref = NULL, T tst = NULL, int bit_depth = 0) + : ref_func(ref), tst_func(tst), bit_depth(bit_depth) {} + T ref_func; + T tst_func; + int bit_depth; +}; + +template +std::ostream &operator<<(std::ostream &os, const FuncParam &p) { + return os << "bit_depth:" << p.bit_depth + << " function:" << reinterpret_cast(p.ref_func) + << " function:" << reinterpret_cast(p.tst_func); +} + +template +class FunctionEquivalenceTest : public ::testing::TestWithParam > { + public: + FunctionEquivalenceTest() : rng_(ACMRandom::DeterministicSeed()) {} + + virtual ~FunctionEquivalenceTest() {} + + virtual void SetUp() { params_ = this->GetParam(); } + + virtual void TearDown() { libaom_test::ClearSystemState(); } + + protected: + ACMRandom rng_; + FuncParam params_; +}; + +} // namespace libaom_test +#endif // AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_ diff --git a/libs/libaom/src/test/fwd_kf_test.cc b/libs/libaom/src/test/fwd_kf_test.cc new file mode 100644 index 000000000..50c2f36d8 --- /dev/null +++ b/libs/libaom/src/test/fwd_kf_test.cc @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +typedef struct { + const int max_kf_dist; + const double psnr_thresh; +} FwdKfTestParam; + +const FwdKfTestParam kTestParams[] = { + { 4, 33.4 }, { 6, 32.9 }, { 8, 32.6 }, + { 12, 32.4 }, { 16, 32.3 }, { 18, 32.1 } +}; + +std::ostream &operator<<(std::ostream &os, const FwdKfTestParam &test_arg) { + return os << "FwdKfTestParam { max_kf_dist:" << test_arg.max_kf_dist + << " psnr_thresh:" << test_arg.psnr_thresh << " }"; +} + +class ForwardKeyTest + : public ::libaom_test::CodecTestWith2Params, + public ::libaom_test::EncoderTest { + protected: + ForwardKeyTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + kf_max_dist_param_(GET_PARAM(2)) {} + virtual ~ForwardKeyTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + const aom_rational timebase = { 1, 30 }; + cfg_.g_timebase = timebase; + cpu_used_ = 2; + kf_max_dist_ = kf_max_dist_param_.max_kf_dist; + psnr_threshold_ = kf_max_dist_param_.psnr_thresh; + cfg_.rc_end_usage = AOM_VBR; + cfg_.rc_target_bitrate = 200; + cfg_.g_lag_in_frames = 10; + cfg_.fwd_kf_enabled = 1; + cfg_.kf_max_dist = kf_max_dist_; + cfg_.g_threads = 0; + init_flags_ = AOM_CODEC_USE_PSNR; + } + + virtual void BeginPassHook(unsigned int) { + psnr_ = 0.0; + nframes_ = 0; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + if (encoding_mode_ != ::libaom_test::kRealTime) { + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + 
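+        // Alt-ref temporal filtering: up to a 7-frame ARNR window at
+        // filter strength 5.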
encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + double GetPsnrThreshold() { return psnr_threshold_; } + + ::libaom_test::TestMode encoding_mode_; + const FwdKfTestParam kf_max_dist_param_; + double psnr_threshold_; + int kf_max_dist_; + int cpu_used_; + int nframes_; + double psnr_; +}; + +TEST_P(ForwardKeyTest, ForwardKeyEncodeTest) { + libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + cfg_.g_timebase.den, cfg_.g_timebase.num, + 0, 20); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // TODO(sarahparker) Add functionality to assert the minimum number of + // keyframes were placed. + EXPECT_GT(GetAveragePsnr(), GetPsnrThreshold()) + << "kf max dist = " << kf_max_dist_; +} + +AV1_INSTANTIATE_TEST_CASE(ForwardKeyTest, + ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::ValuesIn(kTestParams)); +} // namespace diff --git a/libs/libaom/src/test/fwht4x4_test.cc b/libs/libaom/src/test/fwht4x4_test.cc new file mode 100644 index 000000000..d2f77b8d4 --- /dev/null +++ b/libs/libaom/src/test/fwht4x4_test.cc @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/transform_test_base.h" +#include "test/util.h" +#include "av1/common/entropy.h" +#include "aom/aom_codec.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +using libaom_test::ACMRandom; + +namespace { +typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); + +using libaom_test::FhtFunc; + +typedef std::tuple + Dct4x4Param; + +void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride, + TxfmParam * /*txfm_param*/) { + av1_fwht4x4_c(in, out, stride); +} + +void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) { + av1_highbd_iwht4x4_16_add_c(in, out, stride, 10); +} + +void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) { + av1_highbd_iwht4x4_16_add_c(in, out, stride, 12); +} + +class Trans4x4WHT : public libaom_test::TransformTestBase, + public ::testing::TestWithParam { + public: + virtual ~Trans4x4WHT() {} + + virtual void SetUp() { + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + pitch_ = 4; + height_ = 4; + fwd_txfm_ref = fwht4x4_ref; + bit_depth_ = GET_PARAM(3); + mask_ = (1 << bit_depth_) - 1; + num_coeffs_ = GET_PARAM(4); + } + virtual void TearDown() { libaom_test::ClearSystemState(); } + + protected: + void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { + fwd_txfm_(in, out, stride); + } + void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { + 
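+    // Unlike the forward-only fdct4x4 fixture, this one wires up a real
+    // inverse (iwht) so the InvAccuracyCheck round-trip below is meaningful.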
inv_txfm_(out, dst, stride); + } + + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; +}; + +TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0, 0.00001); } + +TEST_P(Trans4x4WHT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); } + +TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); } +using std::make_tuple; + +INSTANTIATE_TEST_SUITE_P( + C, Trans4x4WHT, + ::testing::Values(make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_10, DCT_DCT, + AOM_BITS_10, 16), + make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_12, DCT_DCT, + AOM_BITS_12, 16))); +} // namespace diff --git a/libs/libaom/src/test/gf_pyr_height_test.cc b/libs/libaom/src/test/gf_pyr_height_test.cc new file mode 100644 index 000000000..b1ade67a6 --- /dev/null +++ b/libs/libaom/src/test/gf_pyr_height_test.cc @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +static const struct GFPyrHeightTestParam { + int gf_min_pyr_height; + int gf_max_pyr_height; + double psnr_thresh; +} kTestParams[] = { + // gf_min_pyr_height = 0 + { 0, 0, 33.40 }, + { 0, 1, 34.00 }, + { 0, 2, 34.00 }, + { 0, 3, 34.20 }, + { 0, 4, 34.30 }, + { 0, 5, 34.40 }, + // gf_min_pyr_height = 1 + { 1, 1, 34.00 }, + { 1, 2, 34.00 }, + { 1, 3, 34.20 }, + { 1, 4, 34.30 }, + { 1, 5, 34.40 }, + // gf_min_pyr_height = 2 + { 2, 2, 34.00 }, + { 2, 3, 34.20 }, + { 2, 4, 34.30 }, + { 2, 5, 34.40 }, + // gf_min_pyr_height = 3 + { 3, 3, 34.20 }, + { 3, 4, 34.30 }, + { 3, 5, 34.40 }, + // gf_min_pyr_height = 4 + { 4, 4, 34.30 }, + { 4, 5, 34.40 }, + // gf_min_pyr_height = 5 + { 5, 5, 34.40 }, +}; + +// Compiler may decide to add some padding to the struct above for alignment, +// which the gtest may try to print (on error for example). This would cause +// valgrind to complain that the padding is uninitialized. To avoid that, we +// provide our own function to print the struct. +// This also makes '--gtest_list_tests' output more understandable. +std::ostream &operator<<(std::ostream &os, const GFPyrHeightTestParam &p) { + os << "GFPyrHeightTestParam { " + << "gf_min_pyr_height = " << p.gf_min_pyr_height << ", " + << "gf_max_pyr_height = " << p.gf_max_pyr_height << ", " + << "psnr_thresh = " << p.psnr_thresh << " }"; + return os; +} + +// Params: encoding mode, rate control mode and GFPyrHeightTestParam object. 
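+// GET_PARAM(0) is the codec factory supplied by AV1_INSTANTIATE_TEST_CASE;
+// the three parameters listed above arrive as GET_PARAM(1)..GET_PARAM(3).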
+class GFPyrHeightTest + : public ::libaom_test::CodecTestWith3Params< + libaom_test::TestMode, aom_rc_mode, GFPyrHeightTestParam>, + public ::libaom_test::EncoderTest { + protected: + GFPyrHeightTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + rc_mode_(GET_PARAM(2)) { + gf_min_pyr_height_ = GET_PARAM(3).gf_min_pyr_height; + gf_max_pyr_height_ = GET_PARAM(3).gf_max_pyr_height; + psnr_threshold_ = GET_PARAM(3).psnr_thresh; + } + virtual ~GFPyrHeightTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + const aom_rational timebase = { 1, 30 }; + cfg_.g_timebase = timebase; + cpu_used_ = 4; + cfg_.rc_end_usage = rc_mode_; + if (rc_mode_ == AOM_VBR) { + cfg_.rc_target_bitrate = 200; + } + cfg_.g_lag_in_frames = 19; + cfg_.g_threads = 0; + init_flags_ = AOM_CODEC_USE_PSNR; + } + + virtual void BeginPassHook(unsigned int) { + psnr_ = 0.0; + nframes_ = 0; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + if (rc_mode_ == AOM_Q) { + encoder->Control(AOME_SET_CQ_LEVEL, 32); + } + if (encoding_mode_ != ::libaom_test::kRealTime) { + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } + encoder->Control(AV1E_SET_GF_MIN_PYRAMID_HEIGHT, gf_min_pyr_height_); + encoder->Control(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, gf_max_pyr_height_); + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + double GetPsnrThreshold() { return psnr_threshold_; } + + ::libaom_test::TestMode encoding_mode_; + aom_rc_mode rc_mode_; + double psnr_threshold_; + int gf_min_pyr_height_; + int gf_max_pyr_height_; + int cpu_used_; + int nframes_; + double psnr_; +}; + +TEST_P(GFPyrHeightTest, EncodeAndVerifyPSNR) { + libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + cfg_.g_timebase.den, cfg_.g_timebase.num, + 0, 32); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + EXPECT_GT(GetAveragePsnr(), GetPsnrThreshold()) + << "GF Min Pyramid Height = " << gf_min_pyr_height_ << ", " + << "GF Max Pyramid Height = " << gf_max_pyr_height_; +} + +AV1_INSTANTIATE_TEST_CASE(GFPyrHeightTest, NONREALTIME_TEST_MODES, + ::testing::Values(AOM_Q, AOM_VBR), + ::testing::ValuesIn(kTestParams)); +} // namespace diff --git a/libs/libaom/src/test/gviz_api.py b/libs/libaom/src/test/gviz_api.py new file mode 100644 index 000000000..d3a443dab --- /dev/null +++ b/libs/libaom/src/test/gviz_api.py @@ -0,0 +1,1087 @@ +#!/usr/bin/python +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and +# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +# was not distributed with this source code in the LICENSE file, you can +# obtain it at www.aomedia.org/license/software. If the Alliance for Open +# Media Patent License 1.0 was not distributed with this source code in the +# PATENTS file, you can obtain it at www.aomedia.org/license/patent. +# + +"""Converts Python data into data for Google Visualization API clients. + +This library can be used to create a google.visualization.DataTable usable by +visualizations built on the Google Visualization API. 
Output formats are raw +JSON, JSON response, JavaScript, CSV, and HTML table. + +See http://code.google.com/apis/visualization/ for documentation on the +Google Visualization API. +""" + +__author__ = "Amit Weinstein, Misha Seltzer, Jacob Baskin" + +import cgi +import cStringIO +import csv +import datetime +try: + import json +except ImportError: + import simplejson as json +import types + + +class DataTableException(Exception): + """The general exception object thrown by DataTable.""" + pass + + +class DataTableJSONEncoder(json.JSONEncoder): + """JSON encoder that handles date/time/datetime objects correctly.""" + + def __init__(self): + json.JSONEncoder.__init__(self, + separators=(",", ":"), + ensure_ascii=False) + + def default(self, o): + if isinstance(o, datetime.datetime): + if o.microsecond == 0: + # If the time doesn't have ms-resolution, leave it out to keep + # things smaller. + return "Date(%d,%d,%d,%d,%d,%d)" % ( + o.year, o.month - 1, o.day, o.hour, o.minute, o.second) + else: + return "Date(%d,%d,%d,%d,%d,%d,%d)" % ( + o.year, o.month - 1, o.day, o.hour, o.minute, o.second, + o.microsecond / 1000) + elif isinstance(o, datetime.date): + return "Date(%d,%d,%d)" % (o.year, o.month - 1, o.day) + elif isinstance(o, datetime.time): + return [o.hour, o.minute, o.second] + else: + return super(DataTableJSONEncoder, self).default(o) + + +class DataTable(object): + """Wraps the data to convert to a Google Visualization API DataTable. + + Create this object, populate it with data, then call one of the ToJS... + methods to return a string representation of the data in the format described. + + You can clear all data from the object to reuse it, but you cannot clear + individual cells, rows, or columns. You also cannot modify the table schema + specified in the class constructor. + + You can add new data one or more rows at a time. All data added to an + instantiated DataTable must conform to the schema passed in to __init__(). + + You can reorder the columns in the output table, and also specify row sorting + order by column. The default column order is according to the original + table_description parameter. Default row sort order is ascending, by column + 1 values. For a dictionary, we sort the keys for order. + + The data and the table_description are closely tied, as described here: + + The table schema is defined in the class constructor's table_description + parameter. The user defines each column using a tuple of + (id[, type[, label[, custom_properties]]]). The default value for type is + string, label is the same as ID if not specified, and custom properties is + an empty dictionary if not specified. + + table_description is a dictionary or list, containing one or more column + descriptor tuples, nested dictionaries, and lists. Each dictionary key, list + element, or dictionary element must eventually be defined as + a column description tuple. Here's an example of a dictionary where the key + is a tuple, and the value is a list of two tuples: + {('a', 'number'): [('b', 'number'), ('c', 'string')]} + + This flexibility in data entry enables you to build and manipulate your data + in a Python structure that makes sense for your program. + + Add data to the table using the same nested design as the table's + table_description, replacing column descriptor tuples with cell data, and + each row is an element in the top level collection. 
This will be a bit + clearer after you look at the following examples showing the + table_description, matching data, and the resulting table: + + Columns as list of tuples [col1, col2, col3] + table_description: [('a', 'number'), ('b', 'string')] + AppendData( [[1, 'z'], [2, 'w'], [4, 'o'], [5, 'k']] ) + Table: + a b <--- these are column ids/labels + 1 z + 2 w + 4 o + 5 k + + Dictionary of columns, where key is a column, and value is a list of + columns {col1: [col2, col3]} + table_description: {('a', 'number'): [('b', 'number'), ('c', 'string')]} + AppendData( data: {1: [2, 'z'], 3: [4, 'w']} + Table: + a b c + 1 2 z + 3 4 w + + Dictionary where key is a column, and the value is itself a dictionary of + columns {col1: {col2, col3}} + table_description: {('a', 'number'): {'b': 'number', 'c': 'string'}} + AppendData( data: {1: {'b': 2, 'c': 'z'}, 3: {'b': 4, 'c': 'w'}} + Table: + a b c + 1 2 z + 3 4 w + """ + + def __init__(self, table_description, data=None, custom_properties=None): + """Initialize the data table from a table schema and (optionally) data. + + See the class documentation for more information on table schema and data + values. + + Args: + table_description: A table schema, following one of the formats described + in TableDescriptionParser(). Schemas describe the + column names, data types, and labels. See + TableDescriptionParser() for acceptable formats. + data: Optional. If given, fills the table with the given data. The data + structure must be consistent with schema in table_description. See + the class documentation for more information on acceptable data. You + can add data later by calling AppendData(). + custom_properties: Optional. A dictionary from string to string that + goes into the table's custom properties. This can be + later changed by changing self.custom_properties. + + Raises: + DataTableException: Raised if the data and the description did not match, + or did not use the supported formats. + """ + self.__columns = self.TableDescriptionParser(table_description) + self.__data = [] + self.custom_properties = {} + if custom_properties is not None: + self.custom_properties = custom_properties + if data: + self.LoadData(data) + + @staticmethod + def CoerceValue(value, value_type): + """Coerces a single value into the type expected for its column. + + Internal helper method. + + Args: + value: The value which should be converted + value_type: One of "string", "number", "boolean", "date", "datetime" or + "timeofday". + + Returns: + An item of the Python type appropriate to the given value_type. Strings + are also converted to Unicode using UTF-8 encoding if necessary. + If a tuple is given, it should be in one of the following forms: + - (value, formatted value) + - (value, formatted value, custom properties) + where the formatted value is a string, and custom properties is a + dictionary of the custom properties for this cell. + To specify custom properties without specifying formatted value, one can + pass None as the formatted value. + One can also have a null-valued cell with formatted value and/or custom + properties by specifying None for the value. + This method ignores the custom properties except for checking that it is a + dictionary. The custom properties are handled in the ToJSon and ToJSCode + methods. + The real type of the given value is not strictly checked. For example, + any type can be used for string - as we simply take its str( ) and for + boolean value we just check "if value". 
+ Examples: + CoerceValue(None, "string") returns None + CoerceValue((5, "5$"), "number") returns (5, "5$") + CoerceValue(100, "string") returns "100" + CoerceValue(0, "boolean") returns False + + Raises: + DataTableException: The value and type did not match in a not-recoverable + way, for example given value 'abc' for type 'number'. + """ + if isinstance(value, tuple): + # In case of a tuple, we run the same function on the value itself and + # add the formatted value. + if (len(value) not in [2, 3] or + (len(value) == 3 and not isinstance(value[2], dict))): + raise DataTableException("Wrong format for value and formatting - %s." % + str(value)) + if not isinstance(value[1], types.StringTypes + (types.NoneType,)): + raise DataTableException("Formatted value is not string, given %s." % + type(value[1])) + js_value = DataTable.CoerceValue(value[0], value_type) + return (js_value,) + value[1:] + + t_value = type(value) + if value is None: + return value + if value_type == "boolean": + return bool(value) + + elif value_type == "number": + if isinstance(value, (int, long, float)): + return value + raise DataTableException("Wrong type %s when expected number" % t_value) + + elif value_type == "string": + if isinstance(value, unicode): + return value + else: + return str(value).decode("utf-8") + + elif value_type == "date": + if isinstance(value, datetime.datetime): + return datetime.date(value.year, value.month, value.day) + elif isinstance(value, datetime.date): + return value + else: + raise DataTableException("Wrong type %s when expected date" % t_value) + + elif value_type == "timeofday": + if isinstance(value, datetime.datetime): + return datetime.time(value.hour, value.minute, value.second) + elif isinstance(value, datetime.time): + return value + else: + raise DataTableException("Wrong type %s when expected time" % t_value) + + elif value_type == "datetime": + if isinstance(value, datetime.datetime): + return value + else: + raise DataTableException("Wrong type %s when expected datetime" % + t_value) + # If we got here, it means the given value_type was not one of the + # supported types. + raise DataTableException("Unsupported type %s" % value_type) + + @staticmethod + def EscapeForJSCode(encoder, value): + if value is None: + return "null" + elif isinstance(value, datetime.datetime): + if value.microsecond == 0: + # If it's not ms-resolution, leave that out to save space. + return "new Date(%d,%d,%d,%d,%d,%d)" % (value.year, + value.month - 1, # To match JS + value.day, + value.hour, + value.minute, + value.second) + else: + return "new Date(%d,%d,%d,%d,%d,%d,%d)" % (value.year, + value.month - 1, # match JS + value.day, + value.hour, + value.minute, + value.second, + value.microsecond / 1000) + elif isinstance(value, datetime.date): + return "new Date(%d,%d,%d)" % (value.year, value.month - 1, value.day) + else: + return encoder.encode(value) + + @staticmethod + def ToString(value): + if value is None: + return "(empty)" + elif isinstance(value, (datetime.datetime, + datetime.date, + datetime.time)): + return str(value) + elif isinstance(value, unicode): + return value + elif isinstance(value, bool): + return str(value).lower() + else: + return str(value).decode("utf-8") + + @staticmethod + def ColumnTypeParser(description): + """Parses a single column description. Internal helper method. 
+ + Args: + description: a column description in the possible formats: + 'id' + ('id',) + ('id', 'type') + ('id', 'type', 'label') + ('id', 'type', 'label', {'custom_prop1': 'custom_val1'}) + Returns: + Dictionary with the following keys: id, label, type, and + custom_properties where: + - If label not given, it equals the id. + - If type not given, string is used by default. + - If custom properties are not given, an empty dictionary is used by + default. + + Raises: + DataTableException: The column description did not match the RE, or + unsupported type was passed. + """ + if not description: + raise DataTableException("Description error: empty description given") + + if not isinstance(description, (types.StringTypes, tuple)): + raise DataTableException("Description error: expected either string or " + "tuple, got %s." % type(description)) + + if isinstance(description, types.StringTypes): + description = (description,) + + # According to the tuple's length, we fill the keys + # We verify everything is of type string + for elem in description[:3]: + if not isinstance(elem, types.StringTypes): + raise DataTableException("Description error: expected tuple of " + "strings, current element of type %s." % + type(elem)) + desc_dict = {"id": description[0], + "label": description[0], + "type": "string", + "custom_properties": {}} + if len(description) > 1: + desc_dict["type"] = description[1].lower() + if len(description) > 2: + desc_dict["label"] = description[2] + if len(description) > 3: + if not isinstance(description[3], dict): + raise DataTableException("Description error: expected custom " + "properties of type dict, current element " + "of type %s." % type(description[3])) + desc_dict["custom_properties"] = description[3] + if len(description) > 4: + raise DataTableException("Description error: tuple of length > 4") + if desc_dict["type"] not in ["string", "number", "boolean", + "date", "datetime", "timeofday"]: + raise DataTableException( + "Description error: unsupported type '%s'" % desc_dict["type"]) + return desc_dict + + @staticmethod + def TableDescriptionParser(table_description, depth=0): + """Parses the table_description object for internal use. + + Parses the user-submitted table description into an internal format used + by the Python DataTable class. Returns the flat list of parsed columns. + + Args: + table_description: A description of the table which should comply + with one of the formats described below. + depth: Optional. The depth of the first level in the current description. + Used by recursive calls to this function. + + Returns: + List of columns, where each column represented by a dictionary with the + keys: id, label, type, depth, container which means the following: + - id: the id of the column + - name: The name of the column + - type: The datatype of the elements in this column. Allowed types are + described in ColumnTypeParser(). + - depth: The depth of this column in the table description + - container: 'dict', 'iter' or 'scalar' for parsing the format easily. + - custom_properties: The custom properties for this column. + The returned description is flattened regardless of how it was given. + + Raises: + DataTableException: Error in a column description or in the description + structure. 
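As a run-time cross-check of the flattening described above, the parsed result can be inspected through the read-only `columns` property defined later in this class (an editor's sketch; the schema is illustrative):

```python
# Hedged sketch: a nested description is flattened into a single column list.
table = DataTable({("a", "number"): [("b", "number"), ("c", "string")]})
for col in table.columns:
    # Expected per the Examples below: 'a' at depth 0 (container 'dict'),
    # then 'b' and 'c' at depth 1 (container 'iter').
    print col["id"], col["type"], col["container"], col["depth"]
```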
+
+    Examples:
+      A column description can be of the following forms:
+       'id'
+       ('id',)
+       ('id', 'type')
+       ('id', 'type', 'label')
+       ('id', 'type', 'label', {'custom_prop1': 'custom_val1'})
+      or as a dictionary:
+       'id': 'type'
+       'id': ('type',)
+       'id': ('type', 'label')
+       'id': ('type', 'label', {'custom_prop1': 'custom_val1'})
+      If the type is not specified, we treat it as string.
+      If no specific label is given, the label is simply the id.
+      If no custom properties are given, we use an empty dictionary.
+
+      input: [('a', 'date'), ('b', 'timeofday', 'b', {'foo': 'bar'})]
+      output: [{'id': 'a', 'label': 'a', 'type': 'date',
+                'depth': 0, 'container': 'iter', 'custom_properties': {}},
+               {'id': 'b', 'label': 'b', 'type': 'timeofday',
+                'depth': 0, 'container': 'iter',
+                'custom_properties': {'foo': 'bar'}}]
+
+      input: {'a': [('b', 'number'), ('c', 'string', 'column c')]}
+      output: [{'id': 'a', 'label': 'a', 'type': 'string',
+                'depth': 0, 'container': 'dict', 'custom_properties': {}},
+               {'id': 'b', 'label': 'b', 'type': 'number',
+                'depth': 1, 'container': 'iter', 'custom_properties': {}},
+               {'id': 'c', 'label': 'column c', 'type': 'string',
+                'depth': 1, 'container': 'iter', 'custom_properties': {}}]
+
+      input: {('a', 'number', 'column a'): { 'b': 'number', 'c': 'string'}}
+      output: [{'id': 'a', 'label': 'column a', 'type': 'number',
+                'depth': 0, 'container': 'dict', 'custom_properties': {}},
+               {'id': 'b', 'label': 'b', 'type': 'number',
+                'depth': 1, 'container': 'dict', 'custom_properties': {}},
+               {'id': 'c', 'label': 'c', 'type': 'string',
+                'depth': 1, 'container': 'dict', 'custom_properties': {}}]
+
+      input: { ('w', 'string', 'word'): ('c', 'number', 'count') }
+      output: [{'id': 'w', 'label': 'word', 'type': 'string',
+                'depth': 0, 'container': 'dict', 'custom_properties': {}},
+               {'id': 'c', 'label': 'count', 'type': 'number',
+                'depth': 1, 'container': 'scalar', 'custom_properties': {}}]
+
+      input: {'a': ('number', 'column a'), 'b': ('string', 'column b')}
+      output: [{'id': 'a', 'label': 'column a', 'type': 'number', 'depth': 0,
+                'container': 'dict', 'custom_properties': {}},
+               {'id': 'b', 'label': 'column b', 'type': 'string', 'depth': 0,
+                'container': 'dict', 'custom_properties': {}}]
+
+    NOTE: there might be ambiguity in the case of a dictionary representation
+    of a single column. For example, the following description can be parsed
+    in 2 different ways: {'a': ('b', 'c')} can be thought of as a single column
+    with the id 'a', of type 'b' and the label 'c', or as 2 columns: one named
+    'a', and the other named 'b' of type 'c'. We choose the first option by
+    default, and in case the second option is the right one, it is possible to
+    make the key into a tuple (i.e. {('a',): ('b', 'c')}) or add more info
+    into the tuple, thus making it look like this: {'a': ('b', 'c', 'b', {})}
+    -- second 'b' is the label, and {} is the custom properties field.
+    """
+    # For the recursion step, we check for a scalar object (string or tuple)
+    if isinstance(table_description, (types.StringTypes, tuple)):
+      parsed_col = DataTable.ColumnTypeParser(table_description)
+      parsed_col["depth"] = depth
+      parsed_col["container"] = "scalar"
+      return [parsed_col]
+
+    # Since it is not scalar, table_description must be iterable.
+    if not hasattr(table_description, "__iter__"):
+      raise DataTableException("Expected an iterable object, got %s" %
+                               type(table_description))
+    if not isinstance(table_description, dict):
+      # We expect a non-dictionary iterable item.
+      columns = []
+      for desc in table_description:
+        parsed_col = DataTable.ColumnTypeParser(desc)
+        parsed_col["depth"] = depth
+        parsed_col["container"] = "iter"
+        columns.append(parsed_col)
+      if not columns:
+        raise DataTableException("Description iterable objects should not"
+                                 " be empty.")
+      return columns
+    # The other case is a dictionary
+    if not table_description:
+      raise DataTableException("Empty dictionaries are not allowed inside"
+                               " description")
+
+    # To differentiate between the two cases of more levels below or this
+    # being the innermost dictionary, we consider the number of keys (more
+    # than one key is an indication of the innermost dictionary) and the type
+    # of the key and value in case of only 1 key (if the type of key is string
+    # and the type of the value is a tuple of 0-3 items, we assume this is the
+    # innermost dictionary).
+    # NOTE: this way of differentiating might create ambiguity. See docs.
+    if (len(table_description) != 1 or
+        (isinstance(table_description.keys()[0], types.StringTypes) and
+         isinstance(table_description.values()[0], tuple) and
+         len(table_description.values()[0]) < 4)):
+      # This is the innermost dictionary. Parsing types.
+      columns = []
+      # We sort the items, equivalent to sorting the keys since they are unique
+      for key, value in sorted(table_description.items()):
+        # We parse the column type as (key, type) or (key, type, label) using
+        # ColumnTypeParser.
+        if isinstance(value, tuple):
+          parsed_col = DataTable.ColumnTypeParser((key,) + value)
+        else:
+          parsed_col = DataTable.ColumnTypeParser((key, value))
+        parsed_col["depth"] = depth
+        parsed_col["container"] = "dict"
+        columns.append(parsed_col)
+      return columns
+    # This is an outer dictionary, must have at most one key.
+    parsed_col = DataTable.ColumnTypeParser(table_description.keys()[0])
+    parsed_col["depth"] = depth
+    parsed_col["container"] = "dict"
+    return ([parsed_col] +
+            DataTable.TableDescriptionParser(table_description.values()[0],
+                                             depth=depth + 1))
+
+  @property
+  def columns(self):
+    """Returns the parsed table description."""
+    return self.__columns
+
+  def NumberOfRows(self):
+    """Returns the number of rows in the current data stored in the table."""
+    return len(self.__data)
+
+  def SetRowsCustomProperties(self, rows, custom_properties):
+    """Sets the custom properties for given row(s).
+
+    Can accept a single row or an iterable of rows.
+    Sets the given custom properties for all specified rows.
+
+    Args:
+      rows: The row, or rows, to set the custom properties for.
+      custom_properties: A string to string dictionary of custom properties to
+                         set for all rows.
+    """
+    if not hasattr(rows, "__iter__"):
+      rows = [rows]
+    for row in rows:
+      self.__data[row] = (self.__data[row][0], custom_properties)
+
+  def LoadData(self, data, custom_properties=None):
+    """Loads new rows to the data table, clearing existing rows.
+
+    May also set the custom_properties for the added rows. The given custom
+    properties dictionary specifies the dictionary that will be used for *all*
+    given rows.
+
+    Args:
+      data: The rows that the table will contain.
+      custom_properties: A dictionary of string to string to set as the custom
+                         properties for all rows.
+    """
+    self.__data = []
+    self.AppendData(data, custom_properties)
+
+  def AppendData(self, data, custom_properties=None):
+    """Appends new data to the table.
+
+    Data is appended in rows. Data must comply with
+    the table schema passed in to __init__(). See CoerceValue() for a list
+    of acceptable data types.
See the class documentation for more information + and examples of schema and data values. + + Args: + data: The row to add to the table. The data must conform to the table + description format. + custom_properties: A dictionary of string to string, representing the + custom properties to add to all the rows. + + Raises: + DataTableException: The data structure does not match the description. + """ + # If the maximal depth is 0, we simply iterate over the data table + # lines and insert them using _InnerAppendData. Otherwise, we simply + # let the _InnerAppendData handle all the levels. + if not self.__columns[-1]["depth"]: + for row in data: + self._InnerAppendData(({}, custom_properties), row, 0) + else: + self._InnerAppendData(({}, custom_properties), data, 0) + + def _InnerAppendData(self, prev_col_values, data, col_index): + """Inner function to assist LoadData.""" + # We first check that col_index has not exceeded the columns size + if col_index >= len(self.__columns): + raise DataTableException("The data does not match description, too deep") + + # Dealing with the scalar case, the data is the last value. + if self.__columns[col_index]["container"] == "scalar": + prev_col_values[0][self.__columns[col_index]["id"]] = data + self.__data.append(prev_col_values) + return + + if self.__columns[col_index]["container"] == "iter": + if not hasattr(data, "__iter__") or isinstance(data, dict): + raise DataTableException("Expected iterable object, got %s" % + type(data)) + # We only need to insert the rest of the columns + # If there are less items than expected, we only add what there is. + for value in data: + if col_index >= len(self.__columns): + raise DataTableException("Too many elements given in data") + prev_col_values[0][self.__columns[col_index]["id"]] = value + col_index += 1 + self.__data.append(prev_col_values) + return + + # We know the current level is a dictionary, we verify the type. + if not isinstance(data, dict): + raise DataTableException("Expected dictionary at current level, got %s" % + type(data)) + # We check if this is the last level + if self.__columns[col_index]["depth"] == self.__columns[-1]["depth"]: + # We need to add the keys in the dictionary as they are + for col in self.__columns[col_index:]: + if col["id"] in data: + prev_col_values[0][col["id"]] = data[col["id"]] + self.__data.append(prev_col_values) + return + + # We have a dictionary in an inner depth level. + if not data.keys(): + # In case this is an empty dictionary, we add a record with the columns + # filled only until this point. + self.__data.append(prev_col_values) + else: + for key in sorted(data): + col_values = dict(prev_col_values[0]) + col_values[self.__columns[col_index]["id"]] = key + self._InnerAppendData((col_values, prev_col_values[1]), + data[key], col_index + 1) + + def _PreparedData(self, order_by=()): + """Prepares the data for enumeration - sorting it by order_by. + + Args: + order_by: Optional. Specifies the name of the column(s) to sort by, and + (optionally) which direction to sort in. Default sort direction + is asc. Following formats are accepted: + "string_col_name" -- For a single key in default (asc) order. + ("string_col_name", "asc|desc") -- For a single key. + [("col_1","asc|desc"), ("col_2","asc|desc")] -- For more than + one column, an array of tuples of (col_name, "asc|desc"). + + Returns: + The data sorted by the keys given. 
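For concreteness, the accepted order_by spellings look like this (an editor's sketch; `table` and the column ids are assumed):

```python
# Hedged sketch of equivalent order_by forms, shown via ToJSon(), which
# forwards order_by to _PreparedData().
table.ToJSon(order_by="a")                            # single key, ascending
table.ToJSon(order_by=("a", "desc"))                  # single key, descending
table.ToJSon(order_by=[("a", "desc"), ("b", "asc")])  # multiple keys
```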
+ + Raises: + DataTableException: Sort direction not in 'asc' or 'desc' + """ + if not order_by: + return self.__data + + proper_sort_keys = [] + if isinstance(order_by, types.StringTypes) or ( + isinstance(order_by, tuple) and len(order_by) == 2 and + order_by[1].lower() in ["asc", "desc"]): + order_by = (order_by,) + for key in order_by: + if isinstance(key, types.StringTypes): + proper_sort_keys.append((key, 1)) + elif (isinstance(key, (list, tuple)) and len(key) == 2 and + key[1].lower() in ("asc", "desc")): + proper_sort_keys.append((key[0], key[1].lower() == "asc" and 1 or -1)) + else: + raise DataTableException("Expected tuple with second value: " + "'asc' or 'desc'") + + def SortCmpFunc(row1, row2): + """cmp function for sorted. Compares by keys and 'asc'/'desc' keywords.""" + for key, asc_mult in proper_sort_keys: + cmp_result = asc_mult * cmp(row1[0].get(key), row2[0].get(key)) + if cmp_result: + return cmp_result + return 0 + + return sorted(self.__data, cmp=SortCmpFunc) + + def ToJSCode(self, name, columns_order=None, order_by=()): + """Writes the data table as a JS code string. + + This method writes a string of JS code that can be run to + generate a DataTable with the specified data. Typically used for debugging + only. + + Args: + name: The name of the table. The name would be used as the DataTable's + variable name in the created JS code. + columns_order: Optional. Specifies the order of columns in the + output table. Specify a list of all column IDs in the order + in which you want the table created. + Note that you must list all column IDs in this parameter, + if you use it. + order_by: Optional. Specifies the name of the column(s) to sort by. + Passed as is to _PreparedData. + + Returns: + A string of JS code that, when run, generates a DataTable with the given + name and the data stored in the DataTable object. + Example result: + "var tab1 = new google.visualization.DataTable(); + tab1.addColumn("string", "a", "a"); + tab1.addColumn("number", "b", "b"); + tab1.addColumn("boolean", "c", "c"); + tab1.addRows(10); + tab1.setCell(0, 0, "a"); + tab1.setCell(0, 1, 1, null, {"foo": "bar"}); + tab1.setCell(0, 2, true); + ... + tab1.setCell(9, 0, "c"); + tab1.setCell(9, 1, 3, "3$"); + tab1.setCell(9, 2, false);" + + Raises: + DataTableException: The data does not match the type. 
+ """ + + encoder = DataTableJSONEncoder() + + if columns_order is None: + columns_order = [col["id"] for col in self.__columns] + col_dict = dict([(col["id"], col) for col in self.__columns]) + + # We first create the table with the given name + jscode = "var %s = new google.visualization.DataTable();\n" % name + if self.custom_properties: + jscode += "%s.setTableProperties(%s);\n" % ( + name, encoder.encode(self.custom_properties)) + + # We add the columns to the table + for i, col in enumerate(columns_order): + jscode += "%s.addColumn(%s, %s, %s);\n" % ( + name, + encoder.encode(col_dict[col]["type"]), + encoder.encode(col_dict[col]["label"]), + encoder.encode(col_dict[col]["id"])) + if col_dict[col]["custom_properties"]: + jscode += "%s.setColumnProperties(%d, %s);\n" % ( + name, i, encoder.encode(col_dict[col]["custom_properties"])) + jscode += "%s.addRows(%d);\n" % (name, len(self.__data)) + + # We now go over the data and add each row + for (i, (row, cp)) in enumerate(self._PreparedData(order_by)): + # We add all the elements of this row by their order + for (j, col) in enumerate(columns_order): + if col not in row or row[col] is None: + continue + value = self.CoerceValue(row[col], col_dict[col]["type"]) + if isinstance(value, tuple): + cell_cp = "" + if len(value) == 3: + cell_cp = ", %s" % encoder.encode(row[col][2]) + # We have a formatted value or custom property as well + jscode += ("%s.setCell(%d, %d, %s, %s%s);\n" % + (name, i, j, + self.EscapeForJSCode(encoder, value[0]), + self.EscapeForJSCode(encoder, value[1]), cell_cp)) + else: + jscode += "%s.setCell(%d, %d, %s);\n" % ( + name, i, j, self.EscapeForJSCode(encoder, value)) + if cp: + jscode += "%s.setRowProperties(%d, %s);\n" % ( + name, i, encoder.encode(cp)) + return jscode + + def ToHtml(self, columns_order=None, order_by=()): + """Writes the data table as an HTML table code string. + + Args: + columns_order: Optional. Specifies the order of columns in the + output table. Specify a list of all column IDs in the order + in which you want the table created. + Note that you must list all column IDs in this parameter, + if you use it. + order_by: Optional. Specifies the name of the column(s) to sort by. + Passed as is to _PreparedData. + + Returns: + An HTML table code string. + Example result (the result is without the newlines): + + + + + + +
abc
1"z"2
"3$""w"
+ + Raises: + DataTableException: The data does not match the type. + """ + table_template = "%s
" + columns_template = "%s" + rows_template = "%s" + row_template = "%s" + header_cell_template = "%s" + cell_template = "%s" + + if columns_order is None: + columns_order = [col["id"] for col in self.__columns] + col_dict = dict([(col["id"], col) for col in self.__columns]) + + columns_list = [] + for col in columns_order: + columns_list.append(header_cell_template % + cgi.escape(col_dict[col]["label"])) + columns_html = columns_template % "".join(columns_list) + + rows_list = [] + # We now go over the data and add each row + for row, unused_cp in self._PreparedData(order_by): + cells_list = [] + # We add all the elements of this row by their order + for col in columns_order: + # For empty string we want empty quotes (""). + value = "" + if col in row and row[col] is not None: + value = self.CoerceValue(row[col], col_dict[col]["type"]) + if isinstance(value, tuple): + # We have a formatted value and we're going to use it + cells_list.append(cell_template % cgi.escape(self.ToString(value[1]))) + else: + cells_list.append(cell_template % cgi.escape(self.ToString(value))) + rows_list.append(row_template % "".join(cells_list)) + rows_html = rows_template % "".join(rows_list) + + return table_template % (columns_html + rows_html) + + def ToCsv(self, columns_order=None, order_by=(), separator=","): + """Writes the data table as a CSV string. + + Output is encoded in UTF-8 because the Python "csv" module can't handle + Unicode properly according to its documentation. + + Args: + columns_order: Optional. Specifies the order of columns in the + output table. Specify a list of all column IDs in the order + in which you want the table created. + Note that you must list all column IDs in this parameter, + if you use it. + order_by: Optional. Specifies the name of the column(s) to sort by. + Passed as is to _PreparedData. + separator: Optional. The separator to use between the values. + + Returns: + A CSV string representing the table. + Example result: + 'a','b','c' + 1,'z',2 + 3,'w','' + + Raises: + DataTableException: The data does not match the type. + """ + + csv_buffer = cStringIO.StringIO() + writer = csv.writer(csv_buffer, delimiter=separator) + + if columns_order is None: + columns_order = [col["id"] for col in self.__columns] + col_dict = dict([(col["id"], col) for col in self.__columns]) + + writer.writerow([col_dict[col]["label"].encode("utf-8") + for col in columns_order]) + + # We now go over the data and add each row + for row, unused_cp in self._PreparedData(order_by): + cells_list = [] + # We add all the elements of this row by their order + for col in columns_order: + value = "" + if col in row and row[col] is not None: + value = self.CoerceValue(row[col], col_dict[col]["type"]) + if isinstance(value, tuple): + # We have a formatted value. Using it only for date/time types. + if col_dict[col]["type"] in ["date", "datetime", "timeofday"]: + cells_list.append(self.ToString(value[1]).encode("utf-8")) + else: + cells_list.append(self.ToString(value[0]).encode("utf-8")) + else: + cells_list.append(self.ToString(value).encode("utf-8")) + writer.writerow(cells_list) + return csv_buffer.getvalue() + + def ToTsvExcel(self, columns_order=None, order_by=()): + """Returns a file in tab-separated-format readable by MS Excel. + + Returns a file in UTF-16 little endian encoding, with tabs separating the + values. + + Args: + columns_order: Delegated to ToCsv. + order_by: Delegated to ToCsv. + + Returns: + A tab-separated little endian UTF16 file representing the table. 
+ """ + return (self.ToCsv(columns_order, order_by, separator="\t") + .decode("utf-8").encode("UTF-16LE")) + + def _ToJSonObj(self, columns_order=None, order_by=()): + """Returns an object suitable to be converted to JSON. + + Args: + columns_order: Optional. A list of all column IDs in the order in which + you want them created in the output table. If specified, + all column IDs must be present. + order_by: Optional. Specifies the name of the column(s) to sort by. + Passed as is to _PreparedData(). + + Returns: + A dictionary object for use by ToJSon or ToJSonResponse. + """ + if columns_order is None: + columns_order = [col["id"] for col in self.__columns] + col_dict = dict([(col["id"], col) for col in self.__columns]) + + # Creating the column JSON objects + col_objs = [] + for col_id in columns_order: + col_obj = {"id": col_dict[col_id]["id"], + "label": col_dict[col_id]["label"], + "type": col_dict[col_id]["type"]} + if col_dict[col_id]["custom_properties"]: + col_obj["p"] = col_dict[col_id]["custom_properties"] + col_objs.append(col_obj) + + # Creating the rows jsons + row_objs = [] + for row, cp in self._PreparedData(order_by): + cell_objs = [] + for col in columns_order: + value = self.CoerceValue(row.get(col, None), col_dict[col]["type"]) + if value is None: + cell_obj = None + elif isinstance(value, tuple): + cell_obj = {"v": value[0]} + if len(value) > 1 and value[1] is not None: + cell_obj["f"] = value[1] + if len(value) == 3: + cell_obj["p"] = value[2] + else: + cell_obj = {"v": value} + cell_objs.append(cell_obj) + row_obj = {"c": cell_objs} + if cp: + row_obj["p"] = cp + row_objs.append(row_obj) + + json_obj = {"cols": col_objs, "rows": row_objs} + if self.custom_properties: + json_obj["p"] = self.custom_properties + + return json_obj + + def ToJSon(self, columns_order=None, order_by=()): + """Returns a string that can be used in a JS DataTable constructor. + + This method writes a JSON string that can be passed directly into a Google + Visualization API DataTable constructor. Use this output if you are + hosting the visualization HTML on your site, and want to code the data + table in Python. Pass this string into the + google.visualization.DataTable constructor, e.g,: + ... on my page that hosts my visualization ... + google.setOnLoadCallback(drawTable); + function drawTable() { + var data = new google.visualization.DataTable(_my_JSon_string, 0.6); + myTable.draw(data); + } + + Args: + columns_order: Optional. Specifies the order of columns in the + output table. Specify a list of all column IDs in the order + in which you want the table created. + Note that you must list all column IDs in this parameter, + if you use it. + order_by: Optional. Specifies the name of the column(s) to sort by. + Passed as is to _PreparedData(). + + Returns: + A JSon constructor string to generate a JS DataTable with the data + stored in the DataTable object. + Example result (the result is without the newlines): + {cols: [{id:"a",label:"a",type:"number"}, + {id:"b",label:"b",type:"string"}, + {id:"c",label:"c",type:"number"}], + rows: [{c:[{v:1},{v:"z"},{v:2}]}, c:{[{v:3,f:"3$"},{v:"w"},{v:null}]}], + p: {'foo': 'bar'}} + + Raises: + DataTableException: The data does not match the type. 
+ """ + + encoder = DataTableJSONEncoder() + return encoder.encode( + self._ToJSonObj(columns_order, order_by)).encode("utf-8") + + def ToJSonResponse(self, columns_order=None, order_by=(), req_id=0, + response_handler="google.visualization.Query.setResponse"): + """Writes a table as a JSON response that can be returned as-is to a client. + + This method writes a JSON response to return to a client in response to a + Google Visualization API query. This string can be processed by the calling + page, and is used to deliver a data table to a visualization hosted on + a different page. + + Args: + columns_order: Optional. Passed straight to self.ToJSon(). + order_by: Optional. Passed straight to self.ToJSon(). + req_id: Optional. The response id, as retrieved by the request. + response_handler: Optional. The response handler, as retrieved by the + request. + + Returns: + A JSON response string to be received by JS the visualization Query + object. This response would be translated into a DataTable on the + client side. + Example result (newlines added for readability): + google.visualization.Query.setResponse({ + 'version':'0.6', 'reqId':'0', 'status':'OK', + 'table': {cols: [...], rows: [...]}}); + + Note: The URL returning this string can be used as a data source by Google + Visualization Gadgets or from JS code. + """ + + response_obj = { + "version": "0.6", + "reqId": str(req_id), + "table": self._ToJSonObj(columns_order, order_by), + "status": "ok" + } + encoder = DataTableJSONEncoder() + return "%s(%s);" % (response_handler, + encoder.encode(response_obj).encode("utf-8")) + + def ToResponse(self, columns_order=None, order_by=(), tqx=""): + """Writes the right response according to the request string passed in tqx. + + This method parses the tqx request string (format of which is defined in + the documentation for implementing a data source of Google Visualization), + and returns the right response according to the request. + It parses out the "out" parameter of tqx, calls the relevant response + (ToJSonResponse() for "json", ToCsv() for "csv", ToHtml() for "html", + ToTsvExcel() for "tsv-excel") and passes the response function the rest of + the relevant request keys. + + Args: + columns_order: Optional. Passed as is to the relevant response function. + order_by: Optional. Passed as is to the relevant response function. + tqx: Optional. The request string as received by HTTP GET. Should be in + the format "key1:value1;key2:value2...". All keys have a default + value, so an empty string will just do the default (which is calling + ToJSonResponse() with no extra parameters). + + Returns: + A response string, as returned by the relevant response function. + + Raises: + DataTableException: One of the parameters passed in tqx is not supported. + """ + tqx_dict = {} + if tqx: + tqx_dict = dict(opt.split(":") for opt in tqx.split(";")) + if tqx_dict.get("version", "0.6") != "0.6": + raise DataTableException( + "Version (%s) passed by request is not supported." 
+          % tqx_dict["version"])
+
+    if tqx_dict.get("out", "json") == "json":
+      response_handler = tqx_dict.get("responseHandler",
+                                      "google.visualization.Query.setResponse")
+      return self.ToJSonResponse(columns_order, order_by,
+                                 req_id=tqx_dict.get("reqId", 0),
+                                 response_handler=response_handler)
+    elif tqx_dict["out"] == "html":
+      return self.ToHtml(columns_order, order_by)
+    elif tqx_dict["out"] == "csv":
+      return self.ToCsv(columns_order, order_by)
+    elif tqx_dict["out"] == "tsv-excel":
+      return self.ToTsvExcel(columns_order, order_by)
+    else:
+      raise DataTableException(
+          "'out' parameter: '%s' is not supported" % tqx_dict["out"])
diff --git a/libs/libaom/src/test/hadamard_test.cc b/libs/libaom/src/test/hadamard_test.cc
new file mode 100644
index 000000000..7903259e7
--- /dev/null
+++ b/libs/libaom/src/test/hadamard_test.cc
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <algorithm>
+#include <ostream>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+
+using libaom_test::ACMRandom;
+
+typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride,
+                             tran_low_t *b);
+
+void HadamardLoop(const tran_low_t *a, tran_low_t *out) {
+  tran_low_t b[8];
+  for (int i = 0; i < 8; i += 2) {
+    b[i + 0] = a[i * 8] + a[(i + 1) * 8];
+    b[i + 1] = a[i * 8] - a[(i + 1) * 8];
+  }
+  tran_low_t c[8];
+  for (int i = 0; i < 8; i += 4) {
+    c[i + 0] = b[i + 0] + b[i + 2];
+    c[i + 1] = b[i + 1] + b[i + 3];
+    c[i + 2] = b[i + 0] - b[i + 2];
+    c[i + 3] = b[i + 1] - b[i + 3];
+  }
+  out[0] = c[0] + c[4];
+  out[7] = c[1] + c[5];
+  out[3] = c[2] + c[6];
+  out[4] = c[3] + c[7];
+  out[2] = c[0] - c[4];
+  out[6] = c[1] - c[5];
+  out[1] = c[2] - c[6];
+  out[5] = c[3] - c[7];
+}
+
+void ReferenceHadamard8x8(const int16_t *a, int a_stride, tran_low_t *b) {
+  tran_low_t input[64];
+  tran_low_t buf[64];
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      input[i * 8 + j] = static_cast<tran_low_t>(a[i * a_stride + j]);
+    }
+  }
+  for (int i = 0; i < 8; ++i) HadamardLoop(input + i, buf + i * 8);
+  for (int i = 0; i < 8; ++i) HadamardLoop(buf + i, b + i * 8);
+}
+
+void ReferenceHadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) {
+  /* The source is a 16x16 block. The destination is rearranged to 8x32.
+   * Input is 9 bit. */
+  ReferenceHadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0);
+  ReferenceHadamard8x8(a + 8 + 0 * a_stride, a_stride, b + 64);
+  ReferenceHadamard8x8(a + 0 + 8 * a_stride, a_stride, b + 128);
+  ReferenceHadamard8x8(a + 8 + 8 * a_stride, a_stride, b + 192);
+
+  /* Overlay the 8x8 blocks and combine. */
+  for (int i = 0; i < 64; ++i) {
+    /* 8x8 steps the range up to 15 bits. */
+    const tran_low_t a0 = b[0];
+    const tran_low_t a1 = b[64];
+    const tran_low_t a2 = b[128];
+    const tran_low_t a3 = b[192];
+
+    /* Prevent the result from escaping int16_t. */
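+    /* The final 2x2 butterfly below recombines the four quadrant
+     * transforms; halving each intermediate first keeps the stored
+     * 16-bit outputs from overflowing. */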
+    const tran_low_t b0 = (a0 + a1) >> 1;
+    const tran_low_t b1 = (a0 - a1) >> 1;
+    const tran_low_t b2 = (a2 + a3) >> 1;
+    const tran_low_t b3 = (a2 - a3) >> 1;
+
+    /* Store a 16 bit value. */
+    b[0] = b0 + b2;
+    b[64] = b1 + b3;
+    b[128] = b0 - b2;
+    b[192] = b1 - b3;
+
+    ++b;
+  }
+}
+
+void ReferenceHadamard32x32(const int16_t *a, int a_stride, tran_low_t *b) {
+  ReferenceHadamard16x16(a + 0 + 0 * a_stride, a_stride, b + 0);
+  ReferenceHadamard16x16(a + 16 + 0 * a_stride, a_stride, b + 256);
+  ReferenceHadamard16x16(a + 0 + 16 * a_stride, a_stride, b + 512);
+  ReferenceHadamard16x16(a + 16 + 16 * a_stride, a_stride, b + 768);
+
+  for (int i = 0; i < 256; ++i) {
+    const tran_low_t a0 = b[0];
+    const tran_low_t a1 = b[256];
+    const tran_low_t a2 = b[512];
+    const tran_low_t a3 = b[768];
+
+    const tran_low_t b0 = (a0 + a1) >> 2;
+    const tran_low_t b1 = (a0 - a1) >> 2;
+    const tran_low_t b2 = (a2 + a3) >> 2;
+    const tran_low_t b3 = (a2 - a3) >> 2;
+
+    b[0] = b0 + b2;
+    b[256] = b1 + b3;
+    b[512] = b0 - b2;
+    b[768] = b1 - b3;
+
+    ++b;
+  }
+}
+
+struct HadamardFuncWithSize {
+  HadamardFuncWithSize(HadamardFunc f, int s) : func(f), block_size(s) {}
+  HadamardFunc func;
+  int block_size;
+};
+
+std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) {
+  return os << "block size: " << hfs.block_size;
+}
+
+class HadamardTestBase
+    : public ::testing::TestWithParam<HadamardFuncWithSize> {
+ public:
+  virtual void SetUp() {
+    h_func_ = GetParam().func;
+    bwh_ = GetParam().block_size;
+    block_size_ = bwh_ * bwh_;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  virtual int16_t Rand() = 0;
+
+  void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b,
+                         int bwh) {
+    if (bwh == 32)
+      ReferenceHadamard32x32(a, a_stride, b);
+    else if (bwh == 16)
+      ReferenceHadamard16x16(a, a_stride, b);
+    else
+      ReferenceHadamard8x8(a, a_stride, b);
+  }
+
+  void CompareReferenceRandom() {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    memset(a, 0, sizeof(a));
+    memset(b, 0, sizeof(b));
+
+    tran_low_t b_ref[kMaxBlockSize];
+    memset(b_ref, 0, sizeof(b_ref));
+
+    for (int i = 0; i < block_size_; ++i) a[i] = Rand();
+
+    ReferenceHadamard(a, bwh_, b_ref, bwh_);
+    ASM_REGISTER_STATE_CHECK(h_func_(a, bwh_, b));
+
+    // The order of the output is not important. Sort before checking.
+    std::sort(b, b + block_size_);
+    std::sort(b_ref, b_ref + block_size_);
+    EXPECT_EQ(memcmp(b, b_ref, sizeof(b)), 0);
+  }
+
+  void VaryStride() {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]);
+    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    memset(a, 0, sizeof(a));
+    for (int i = 0; i < block_size_ * 8; ++i) a[i] = Rand();
+
+    tran_low_t b_ref[kMaxBlockSize];
+    for (int i = 8; i < 64; i += 8) {
+      memset(b, 0, sizeof(b));
+      memset(b_ref, 0, sizeof(b_ref));
+
+      ReferenceHadamard(a, i, b_ref, bwh_);
+      ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
+
+      // The order of the output is not important. Sort before checking.
+      std::sort(b, b + block_size_);
+      std::sort(b_ref, b_ref + block_size_);
+      EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+    }
+  }
+
+  void SpeedTest(int times) {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, input[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, tran_low_t, output[kMaxBlockSize]);
+    memset(input, 1, sizeof(input));
+    memset(output, 0, sizeof(output));
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < times; ++i) {
+      h_func_(input, bwh_, output);
+    }
+    aom_usec_timer_mark(&timer);
+
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+    printf("Hadamard%dx%d[%12d runs]: %d us\n", bwh_, bwh_, times,
+           elapsed_time);
+  }
+
+  ACMRandom rnd_;
+
+ private:
+  int bwh_;
+  int block_size_;
+  HadamardFunc h_func_;
+};
+
+class HadamardLowbdTest : public HadamardTestBase {
+ public:
+  virtual int16_t Rand() { return rnd_.Rand9Signed(); }
+};
+
+TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); }
+
+TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_c, 8),
+                      HadamardFuncWithSize(&aom_hadamard_16x16_c, 16),
+                      HadamardFuncWithSize(&aom_hadamard_32x32_c, 32)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_sse2, 8),
+                      HadamardFuncWithSize(&aom_hadamard_16x16_sse2, 16),
+                      HadamardFuncWithSize(&aom_hadamard_32x32_sse2, 32)));
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_16x16_avx2, 16),
+                      HadamardFuncWithSize(&aom_hadamard_32x32_avx2, 32)));
+#endif  // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_neon, 8),
+                      HadamardFuncWithSize(&aom_hadamard_16x16_neon, 16)));
+#endif  // HAVE_NEON
+
+}  // namespace
diff --git a/libs/libaom/src/test/hash_test.cc b/libs/libaom/src/test/hash_test.cc
new file mode 100644
index 000000000..eb964ac5f
--- /dev/null
+++ b/libs/libaom/src/test/hash_test.cc
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_ports/aom_timer.h" +#include "av1/encoder/hash.h" +#include "test/acm_random.h" +#include "test/util.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +typedef uint32_t (*get_crc32c_value_func)(void *calculator, uint8_t *p, + size_t length); + +typedef std::tuple HashParam; + +class AV1Crc32cHashTest : public ::testing::TestWithParam { + public: + ~AV1Crc32cHashTest(); + void SetUp(); + + void TearDown(); + + protected: + void RunCheckOutput(get_crc32c_value_func test_impl); + void RunSpeedTest(get_crc32c_value_func test_impl); + + void RunZeroTest(get_crc32c_value_func test_impl); + + libaom_test::ACMRandom rnd_; + CRC32C calc_; + uint8_t *buffer_; + int bsize_; + size_t length_; +}; + +AV1Crc32cHashTest::~AV1Crc32cHashTest() { ; } + +void AV1Crc32cHashTest::SetUp() { + rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); + av1_crc32c_calculator_init(&calc_); + + bsize_ = GET_PARAM(1); + length_ = bsize_ * bsize_ * sizeof(uint16_t); + buffer_ = new uint8_t[length_]; + ASSERT_TRUE(buffer_ != NULL); + for (size_t i = 0; i < length_; ++i) { + buffer_[i] = rnd_.Rand8(); + } +} + +void AV1Crc32cHashTest::TearDown() { delete[] buffer_; } + +void AV1Crc32cHashTest::RunCheckOutput(get_crc32c_value_func test_impl) { + get_crc32c_value_func ref_impl = av1_get_crc32c_value_c; + // for the same buffer crc should be the same + uint32_t crc0 = test_impl(&calc_, buffer_, length_); + uint32_t crc1 = test_impl(&calc_, buffer_, length_); + uint32_t crc2 = ref_impl(&calc_, buffer_, length_); + ASSERT_EQ(crc0, crc1); + ASSERT_EQ(crc0, crc2); // should equal to software version + // modify buffer + buffer_[0] += 1; + uint32_t crc3 = test_impl(&calc_, buffer_, length_); + uint32_t crc4 = ref_impl(&calc_, buffer_, length_); + ASSERT_NE(crc0, crc3); // crc shoud not equal to previous one + ASSERT_EQ(crc3, crc4); +} + +void AV1Crc32cHashTest::RunSpeedTest(get_crc32c_value_func test_impl) { + get_crc32c_value_func impls[] = { av1_get_crc32c_value_c, test_impl }; + const int repeat = 10000000 / (bsize_ + bsize_); + + aom_usec_timer timer; + double time[2]; + for (int i = 0; i < 2; ++i) { + aom_usec_timer_start(&timer); + for (int j = 0; j < repeat; ++j) { + impls[i](&calc_, buffer_, length_); + } + aom_usec_timer_mark(&timer); + time[i] = static_cast(aom_usec_timer_elapsed(&timer)); + } + printf("hash %3dx%-3d:%7.2f/%7.2fus", bsize_, bsize_, time[0], time[1]); + printf("(%3.2f)\n", time[0] / time[1]); +} + +void AV1Crc32cHashTest::RunZeroTest(get_crc32c_value_func test_impl) { + uint8_t buffer0[1024] = { 0 }; + // for buffer with different size the crc should not be the same + const uint32_t crc0 = test_impl(&calc_, buffer0, 32); + const uint32_t crc1 = test_impl(&calc_, buffer0, 128); + const uint32_t crc2 = test_impl(&calc_, buffer0, 1024); + ASSERT_NE(crc0, crc1); + ASSERT_NE(crc0, crc2); + ASSERT_NE(crc1, crc2); +} + +TEST_P(AV1Crc32cHashTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); } + +TEST_P(AV1Crc32cHashTest, CheckZero) { RunZeroTest(GET_PARAM(0)); } + +TEST_P(AV1Crc32cHashTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } + +const int kValidBlockSize[] = { 64, 32, 8, 4 }; + +INSTANTIATE_TEST_SUITE_P( + C, AV1Crc32cHashTest, + ::testing::Combine(::testing::Values(&av1_get_crc32c_value_c), + ::testing::ValuesIn(kValidBlockSize))); + +#if HAVE_SSE4_2 +INSTANTIATE_TEST_SUITE_P( + SSE4_2, AV1Crc32cHashTest, + 
+    ::testing::Combine(::testing::Values(&av1_get_crc32c_value_sse4_2),
+                       ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+}  // namespace
diff --git a/libs/libaom/src/test/hbd_metrics_test.cc b/libs/libaom/src/test/hbd_metrics_test.cc
new file mode 100644
index 000000000..5b03beee7
--- /dev/null
+++ b/libs/libaom/src/test/hbd_metrics_test.cc
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/psnr.h"
+#include "aom_dsp/ssim.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/msvc.h"
+#include "aom_scale/yv12config.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+typedef double (*LBDMetricFunc)(const YV12_BUFFER_CONFIG *source,
+                                const YV12_BUFFER_CONFIG *dest);
+typedef double (*HBDMetricFunc)(const YV12_BUFFER_CONFIG *source,
+                                const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+                                uint32_t bd);
+
+double compute_hbd_psnr(const YV12_BUFFER_CONFIG *source,
+                        const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+                        uint32_t bd) {
+  PSNR_STATS psnr;
+  aom_calc_highbd_psnr(source, dest, &psnr, bd, in_bd);
+  return psnr.psnr[0];
+}
+
+double compute_psnr(const YV12_BUFFER_CONFIG *source,
+                    const YV12_BUFFER_CONFIG *dest) {
+  PSNR_STATS psnr;
+  aom_calc_psnr(source, dest, &psnr);
+  return psnr.psnr[0];
+}
+
+double compute_hbd_psnrhvs(const YV12_BUFFER_CONFIG *source,
+                           const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+                           uint32_t bd) {
+  double tempy, tempu, tempv;
+  return aom_psnrhvs(source, dest, &tempy, &tempu, &tempv, bd, in_bd);
+}
+
+double compute_psnrhvs(const YV12_BUFFER_CONFIG *source,
+                       const YV12_BUFFER_CONFIG *dest) {
+  double tempy, tempu, tempv;
+  return aom_psnrhvs(source, dest, &tempy, &tempu, &tempv, 8, 8);
+}
+
+double compute_hbd_fastssim(const YV12_BUFFER_CONFIG *source,
+                            const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+                            uint32_t bd) {
+  double tempy, tempu, tempv;
+  return aom_calc_fastssim(source, dest, &tempy, &tempu, &tempv, bd, in_bd);
+}
+
+double compute_fastssim(const YV12_BUFFER_CONFIG *source,
+                        const YV12_BUFFER_CONFIG *dest) {
+  double tempy, tempu, tempv;
+  return aom_calc_fastssim(source, dest, &tempy, &tempu, &tempv, 8, 8);
+}
+
+double compute_hbd_aomssim(const YV12_BUFFER_CONFIG *source,
+                           const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+                           uint32_t bd) {
+  double ssim, weight;
+  ssim = aom_highbd_calc_ssim(source, dest, &weight, bd, in_bd);
+  return 100 * pow(ssim / weight, 8.0);
+}
+
+double compute_aomssim(const YV12_BUFFER_CONFIG *source,
+                       const YV12_BUFFER_CONFIG *dest) {
+  double ssim, weight;
+  ssim = aom_calc_ssim(source, dest, &weight);
+  return 100 * pow(ssim / weight, 8.0);
+}
+
+class HBDMetricsTestBase {
+ public:
+  virtual ~HBDMetricsTestBase() {}
+
+ protected:
+  void RunAccuracyCheck() {
+    const int width = 1920;
+    const int height = 1080;
+    size_t i = 0;
+    const uint8_t kPixFiller = 128;
+    YV12_BUFFER_CONFIG lbd_src, lbd_dst;
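+    // The high-bit-depth pair below holds the same pixels as the 8-bit pair,
+    // shifted up to bit_depth_ bits in the fill loop further down, so both
+    // metric paths are fed identical content.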
+    YV12_BUFFER_CONFIG hbd_src, hbd_dst;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    double lbd_db, hbd_db;
+
+    memset(&lbd_src, 0, sizeof(lbd_src));
+    memset(&lbd_dst, 0, sizeof(lbd_dst));
+    memset(&hbd_src, 0, sizeof(hbd_src));
+    memset(&hbd_dst, 0, sizeof(hbd_dst));
+
+    aom_alloc_frame_buffer(&lbd_src, width, height, 1, 1, 0, 32, 16);
+    aom_alloc_frame_buffer(&lbd_dst, width, height, 1, 1, 0, 32, 16);
+    aom_alloc_frame_buffer(&hbd_src, width, height, 1, 1, 1, 32, 16);
+    aom_alloc_frame_buffer(&hbd_dst, width, height, 1, 1, 1, 32, 16);
+
+    memset(lbd_src.buffer_alloc, kPixFiller, lbd_src.buffer_alloc_sz);
+    while (i < lbd_src.buffer_alloc_sz) {
+      uint16_t spel, dpel;
+      spel = lbd_src.buffer_alloc[i];
+      // Create some distortion for dst buffer.
+      dpel = rnd.Rand8();
+      lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
+      ((uint16_t *)(hbd_src.buffer_alloc))[i] = spel << (bit_depth_ - 8);
+      ((uint16_t *)(hbd_dst.buffer_alloc))[i] = dpel << (bit_depth_ - 8);
+      i++;
+    }
+
+    lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
+    hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
+    EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
+
+    i = 0;
+    while (i < lbd_src.buffer_alloc_sz) {
+      uint16_t dpel;
+      // Create some small distortion for dst buffer.
+      dpel = 120 + (rnd.Rand8() >> 4);
+      lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
+      ((uint16_t *)(hbd_dst.buffer_alloc))[i] = dpel << (bit_depth_ - 8);
+      i++;
+    }
+
+    lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
+    hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
+    EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
+
+    i = 0;
+    while (i < lbd_src.buffer_alloc_sz) {
+      uint16_t dpel;
+      // Create some small distortion for dst buffer.
+      dpel = 126 + (rnd.Rand8() >> 6);
+      lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
+      ((uint16_t *)(hbd_dst.buffer_alloc))[i] = dpel << (bit_depth_ - 8);
+      i++;
+    }
+
+    lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
+    hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
+    EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
+
+    aom_free_frame_buffer(&lbd_src);
+    aom_free_frame_buffer(&lbd_dst);
+    aom_free_frame_buffer(&hbd_src);
+    aom_free_frame_buffer(&hbd_dst);
+  }
+
+  int input_bit_depth_;
+  int bit_depth_;
+  double threshold_;
+  LBDMetricFunc lbd_metric_;
+  HBDMetricFunc hbd_metric_;
+};
+
+typedef std::tuple<LBDMetricFunc, HBDMetricFunc, int, int, double>
+    MetricTestTParam;
+class HBDMetricsTest : public HBDMetricsTestBase,
+                       public ::testing::TestWithParam<MetricTestTParam> {
+ public:
+  virtual void SetUp() {
+    lbd_metric_ = GET_PARAM(0);
+    hbd_metric_ = GET_PARAM(1);
+    input_bit_depth_ = GET_PARAM(2);
+    bit_depth_ = GET_PARAM(3);
+    threshold_ = GET_PARAM(4);
+  }
+  virtual void TearDown() {}
+};
+
+TEST_P(HBDMetricsTest, RunAccuracyCheck) { RunAccuracyCheck(); }
+
+// Allow small variation due to floating point operations.
+static const double kSsim_thresh = 0.001;
+// Allow some additional errors accumulated in floating point operations.
+static const double kFSsim_thresh = 0.03;
+// Allow some extra variation due to rounding error accumulated in dct.
+static const double kPhvs_thresh = 0.3; + +INSTANTIATE_TEST_SUITE_P( + AOMSSIM, HBDMetricsTest, + ::testing::Values(MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim, + 8, 10, kSsim_thresh), + MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim, + 10, 10, kPhvs_thresh), + MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim, + 8, 12, kSsim_thresh), + MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim, + 12, 12, kPhvs_thresh))); +INSTANTIATE_TEST_SUITE_P( + FASTSSIM, HBDMetricsTest, + ::testing::Values(MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim, + 8, 10, kFSsim_thresh), + MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim, + 10, 10, kFSsim_thresh), + MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim, + 8, 12, kFSsim_thresh), + MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim, + 12, 12, kFSsim_thresh))); +INSTANTIATE_TEST_SUITE_P( + PSNRHVS, HBDMetricsTest, + ::testing::Values(MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs, + 8, 10, kPhvs_thresh), + MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs, + 10, 10, kPhvs_thresh), + MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs, + 8, 12, kPhvs_thresh), + MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs, + 12, 12, kPhvs_thresh))); +INSTANTIATE_TEST_SUITE_P( + PSNR, HBDMetricsTest, + ::testing::Values( + MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 8, 10, kPhvs_thresh), + MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 10, 10, + kPhvs_thresh), + MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 8, 12, kPhvs_thresh), + MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 12, 12, + kPhvs_thresh))); +} // namespace diff --git a/libs/libaom/src/test/hiprec_convolve_test.cc b/libs/libaom/src/test/hiprec_convolve_test.cc new file mode 100644 index 000000000..59d28e883 --- /dev/null +++ b/libs/libaom/src/test/hiprec_convolve_test.cc @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/hiprec_convolve_test_util.h" + +using libaom_test::ACMRandom; +#if CONFIG_AV1_HIGHBITDEPTH +using libaom_test::AV1HighbdHiprecConvolve::AV1HighbdHiprecConvolveTest; +#endif +using libaom_test::AV1HiprecConvolve::AV1HiprecConvolveTest; +using std::make_tuple; +using std::tuple; + +namespace { + +TEST_P(AV1HiprecConvolveTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); } +TEST_P(AV1HiprecConvolveTest, DISABLED_SpeedTest) { + RunSpeedTest(GET_PARAM(3)); +} +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, AV1HiprecConvolveTest, + libaom_test::AV1HiprecConvolve::BuildParams( + av1_wiener_convolve_add_src_sse2)); +#endif +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, AV1HiprecConvolveTest, + libaom_test::AV1HiprecConvolve::BuildParams( + av1_wiener_convolve_add_src_avx2)); +#endif +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, AV1HiprecConvolveTest, + libaom_test::AV1HiprecConvolve::BuildParams( + av1_wiener_convolve_add_src_neon)); +#endif + +#if CONFIG_AV1_HIGHBITDEPTH +#if HAVE_SSSE3 || HAVE_AVX2 +TEST_P(AV1HighbdHiprecConvolveTest, CheckOutput) { + RunCheckOutput(GET_PARAM(4)); +} +TEST_P(AV1HighbdHiprecConvolveTest, DISABLED_SpeedTest) { + RunSpeedTest(GET_PARAM(4)); +} +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P(SSSE3, AV1HighbdHiprecConvolveTest, + libaom_test::AV1HighbdHiprecConvolve::BuildParams( + av1_highbd_wiener_convolve_add_src_ssse3)); +#endif +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdHiprecConvolveTest, + libaom_test::AV1HighbdHiprecConvolve::BuildParams( + av1_highbd_wiener_convolve_add_src_avx2)); +#endif +#endif +#endif // CONFIG_AV1_HIGHBITDEPTH + +} // namespace diff --git a/libs/libaom/src/test/hiprec_convolve_test_util.cc b/libs/libaom/src/test/hiprec_convolve_test_util.cc new file mode 100644 index 000000000..956af7fc8 --- /dev/null +++ b/libs/libaom/src/test/hiprec_convolve_test_util.cc @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include "test/hiprec_convolve_test_util.h"
+
+#include "av1/common/restoration.h"
+
+using std::make_tuple;
+using std::tuple;
+
+namespace libaom_test {
+
+// Generate a random pair of filter kernels, using the ranges
+// of possible values from the loop-restoration experiment
+static void generate_kernels(ACMRandom *rnd, InterpKernel hkernel,
+                             InterpKernel vkernel, int kernel_type = 2) {
+  if (kernel_type == 0) {
+    // Low possible values for filter coefficients
+    hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = WIENER_FILT_TAP0_MINV;
+    hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MINV;
+    hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MINV;
+    hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+    hkernel[7] = vkernel[7] = 0;
+  } else if (kernel_type == 1) {
+    // Max possible values for filter coefficients
+    hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = WIENER_FILT_TAP0_MAXV;
+    hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MAXV;
+    hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MAXV;
+    hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+    hkernel[7] = vkernel[7] = 0;
+  } else {
+    // Randomly generated values for filter coefficients
+    hkernel[0] = hkernel[6] =
+        WIENER_FILT_TAP0_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP0_MAXV + 1 - WIENER_FILT_TAP0_MINV);
+    hkernel[1] = hkernel[5] =
+        WIENER_FILT_TAP1_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 1 - WIENER_FILT_TAP1_MINV);
+    hkernel[2] = hkernel[4] =
+        WIENER_FILT_TAP2_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
+    hkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+    hkernel[7] = 0;
+
+    vkernel[0] = vkernel[6] =
+        WIENER_FILT_TAP0_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP0_MAXV + 2 - WIENER_FILT_TAP0_MINV);
+    vkernel[1] = vkernel[5] =
+        WIENER_FILT_TAP1_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 2 - WIENER_FILT_TAP1_MINV);
+    vkernel[2] = vkernel[4] =
+        WIENER_FILT_TAP2_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 2 - WIENER_FILT_TAP2_MINV);
+    vkernel[3] = -2 * (vkernel[0] + vkernel[1] + vkernel[2]);
+    vkernel[7] = 0;
+  }
+}
+
+namespace AV1HiprecConvolve {
+
+::testing::internal::ParamGenerator<HiprecConvolveParam> BuildParams(
+    hiprec_convolve_func filter) {
+  const HiprecConvolveParam params[] = {
+    make_tuple(8, 8, 50000, filter),   make_tuple(8, 4, 50000, filter),
+    make_tuple(64, 24, 1000, filter),  make_tuple(64, 64, 1000, filter),
+    make_tuple(64, 56, 1000, filter),  make_tuple(32, 8, 10000, filter),
+    make_tuple(32, 28, 10000, filter), make_tuple(32, 32, 10000, filter),
+    make_tuple(16, 34, 10000, filter), make_tuple(32, 34, 10000, filter),
+    make_tuple(64, 34, 1000, filter),  make_tuple(8, 17, 10000, filter),
+    make_tuple(16, 17, 10000, filter), make_tuple(32, 17, 10000, filter)
+  };
+  return ::testing::ValuesIn(params);
+}
+
+AV1HiprecConvolveTest::~AV1HiprecConvolveTest() {}
+void AV1HiprecConvolveTest::SetUp() {
+  rnd_.Reset(ACMRandom::DeterministicSeed());
+}
+
+void AV1HiprecConvolveTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
+  const int w = 128, h = 128;
+  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+  const int num_iters = GET_PARAM(2);
+  int i, j, k, m;
+  const ConvolveParams conv_params = get_conv_params_wiener(8);
+
+  uint8_t *input_ = new uint8_t[h * w];
+  uint8_t *input = input_;
+
+  // The AVX2 convolve functions always write rows with widths that are
+  // multiples of 16. So to avoid a buffer overflow, we may need to pad
+  // rows to a multiple of 16.
+  int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
+  uint8_t *output = new uint8_t[output_n];
+  uint8_t *output2 = new uint8_t[output_n];
+
+  // Generate random filter kernels
+  DECLARE_ALIGNED(16, InterpKernel, hkernel);
+  DECLARE_ALIGNED(16, InterpKernel, vkernel);
+
+  for (int kernel_type = 0; kernel_type < 3; kernel_type++) {
+    generate_kernels(&rnd_, hkernel, vkernel, kernel_type);
+    for (i = 0; i < num_iters; ++i) {
+      for (k = 0; k < h; ++k)
+        for (m = 0; m < w; ++m) input[k * w + m] = rnd_.Rand8();
+      // Choose random locations within the source block
+      int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+      int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+      av1_wiener_convolve_add_src_c(input + offset_r * w + offset_c, w, output,
+                                    out_w, hkernel, 16, vkernel, 16, out_w,
+                                    out_h, &conv_params);
+      test_impl(input + offset_r * w + offset_c, w, output2, out_w, hkernel,
+                16, vkernel, 16, out_w, out_h, &conv_params);
+
+      for (j = 0; j < out_w * out_h; ++j)
+        ASSERT_EQ(output[j], output2[j])
+            << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
+            << (j / out_w) << ") on iteration " << i;
+    }
+  }
+  delete[] input_;
+  delete[] output;
+  delete[] output2;
+}
+
+void AV1HiprecConvolveTest::RunSpeedTest(hiprec_convolve_func test_impl) {
+  const int w = 128, h = 128;
+  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+  const int num_iters = GET_PARAM(2) / 500;
+  int i, j, k;
+  const ConvolveParams conv_params = get_conv_params_wiener(8);
+
+  uint8_t *input_ = new uint8_t[h * w];
+  uint8_t *input = input_;
+
+  // The AVX2 convolve functions always write rows with widths that are
+  // multiples of 16. So to avoid a buffer overflow, we may need to pad
+  // rows to a multiple of 16.
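+  // ALIGN_POWER_OF_TWO(out_w, 4) rounds out_w up to the next multiple of
+  // 1 << 4 = 16, e.g. a 24-wide block gets 32 samples of row storage.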
+  int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
+  uint8_t *output = new uint8_t[output_n];
+  uint8_t *output2 = new uint8_t[output_n];
+
+  // Generate random filter kernels
+  DECLARE_ALIGNED(16, InterpKernel, hkernel);
+  DECLARE_ALIGNED(16, InterpKernel, vkernel);
+
+  generate_kernels(&rnd_, hkernel, vkernel);
+
+  for (i = 0; i < h; ++i)
+    for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
+
+  aom_usec_timer ref_timer;
+  aom_usec_timer_start(&ref_timer);
+  for (i = 0; i < num_iters; ++i) {
+    for (j = 3; j < h - out_h - 4; j++) {
+      for (k = 3; k < w - out_w - 4; k++) {
+        av1_wiener_convolve_add_src_c(input + j * w + k, w, output, out_w,
+                                      hkernel, 16, vkernel, 16, out_w, out_h,
+                                      &conv_params);
+      }
+    }
+  }
+  aom_usec_timer_mark(&ref_timer);
+  const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+  aom_usec_timer tst_timer;
+  aom_usec_timer_start(&tst_timer);
+  for (i = 0; i < num_iters; ++i) {
+    for (j = 3; j < h - out_h - 4; j++) {
+      for (k = 3; k < w - out_w - 4; k++) {
+        test_impl(input + j * w + k, w, output2, out_w, hkernel, 16, vkernel,
+                  16, out_w, out_h, &conv_params);
+      }
+    }
+  }
+  aom_usec_timer_mark(&tst_timer);
+  const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+  std::cout << "[          ] C time = " << ref_time / 1000
+            << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+  EXPECT_GT(ref_time, tst_time)
+      << "Error: AV1HiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
+      << "C time: " << ref_time << " us\n"
+      << "SIMD time: " << tst_time << " us\n";
+
+  delete[] input_;
+  delete[] output;
+  delete[] output2;
+}
+}  // namespace AV1HiprecConvolve
+
+#if CONFIG_AV1_HIGHBITDEPTH
+namespace AV1HighbdHiprecConvolve {
+
+::testing::internal::ParamGenerator<HighbdHiprecConvolveParam> BuildParams(
+    highbd_hiprec_convolve_func filter) {
+  const HighbdHiprecConvolveParam params[] = {
+    make_tuple(8, 8, 50000, 8, filter),   make_tuple(64, 64, 1000, 8, filter),
+    make_tuple(32, 8, 10000, 8, filter),  make_tuple(8, 8, 50000, 10, filter),
+    make_tuple(64, 64, 1000, 10, filter), make_tuple(32, 8, 10000, 10, filter),
+    make_tuple(8, 8, 50000, 12, filter),  make_tuple(64, 64, 1000, 12, filter),
+    make_tuple(32, 8, 10000, 12, filter),
+  };
+  return ::testing::ValuesIn(params);
+}
+
+AV1HighbdHiprecConvolveTest::~AV1HighbdHiprecConvolveTest() {}
+void AV1HighbdHiprecConvolveTest::SetUp() {
+  rnd_.Reset(ACMRandom::DeterministicSeed());
+}
+
+void AV1HighbdHiprecConvolveTest::TearDown() {
+  libaom_test::ClearSystemState();
+}
+
+void AV1HighbdHiprecConvolveTest::RunCheckOutput(
+    highbd_hiprec_convolve_func test_impl) {
+  const int w = 128, h = 128;
+  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+  const int num_iters = GET_PARAM(2);
+  const int bd = GET_PARAM(3);
+  int i, j;
+  const ConvolveParams conv_params = get_conv_params_wiener(bd);
+
+  uint16_t *input = new uint16_t[h * w];
+
+  // The AVX2 convolve functions always write rows with widths that are
+  // multiples of 16. So to avoid a buffer overflow, we may need to pad
+  // rows to a multiple of 16.
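+  // The buffers below hold 16-bit samples, so output_n counts samples (not
+  // bytes); the multiple-of-16 row padding applies per sample row as well.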
+ int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h; + uint16_t *output = new uint16_t[output_n]; + uint16_t *output2 = new uint16_t[output_n]; + + // Generate random filter kernels + DECLARE_ALIGNED(16, InterpKernel, hkernel); + DECLARE_ALIGNED(16, InterpKernel, vkernel); + + for (i = 0; i < h; ++i) + for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + + uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input); + uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output); + uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2); + for (int kernel_type = 0; kernel_type < 3; kernel_type++) { + generate_kernels(&rnd_, hkernel, vkernel, kernel_type); + for (i = 0; i < num_iters; ++i) { + // Choose random locations within the source block + int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); + int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); + av1_highbd_wiener_convolve_add_src_c( + input_ptr + offset_r * w + offset_c, w, output_ptr, out_w, hkernel, + 16, vkernel, 16, out_w, out_h, &conv_params, bd); + test_impl(input_ptr + offset_r * w + offset_c, w, output2_ptr, out_w, + hkernel, 16, vkernel, 16, out_w, out_h, &conv_params, bd); + + for (j = 0; j < out_w * out_h; ++j) + ASSERT_EQ(output[j], output2[j]) + << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", " + << (j / out_w) << ") on iteration " << i; + } + } + delete[] input; + delete[] output; + delete[] output2; +} + +void AV1HighbdHiprecConvolveTest::RunSpeedTest( + highbd_hiprec_convolve_func test_impl) { + const int w = 128, h = 128; + const int out_w = GET_PARAM(0), out_h = GET_PARAM(1); + const int num_iters = GET_PARAM(2) / 500; + const int bd = GET_PARAM(3); + int i, j, k; + const ConvolveParams conv_params = get_conv_params_wiener(bd); + + uint16_t *input = new uint16_t[h * w]; + + // The AVX2 convolve functions always write rows with widths that are + // multiples of 16. So to avoid a buffer overflow, we may need to pad + // rows to a multiple of 16. 
+ int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h; + uint16_t *output = new uint16_t[output_n]; + uint16_t *output2 = new uint16_t[output_n]; + + // Generate random filter kernels + DECLARE_ALIGNED(16, InterpKernel, hkernel); + DECLARE_ALIGNED(16, InterpKernel, vkernel); + + generate_kernels(&rnd_, hkernel, vkernel); + + for (i = 0; i < h; ++i) + for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + + uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input); + uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output); + uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2); + + aom_usec_timer ref_timer; + aom_usec_timer_start(&ref_timer); + for (i = 0; i < num_iters; ++i) { + for (j = 3; j < h - out_h - 4; j++) { + for (k = 3; k < w - out_w - 4; k++) { + av1_highbd_wiener_convolve_add_src_c( + input_ptr + j * w + k, w, output_ptr, out_w, hkernel, 16, vkernel, + 16, out_w, out_h, &conv_params, bd); + } + } + } + aom_usec_timer_mark(&ref_timer); + const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer tst_timer; + aom_usec_timer_start(&tst_timer); + for (i = 0; i < num_iters; ++i) { + for (j = 3; j < h - out_h - 4; j++) { + for (k = 3; k < w - out_w - 4; k++) { + test_impl(input_ptr + j * w + k, w, output2_ptr, out_w, hkernel, 16, + vkernel, 16, out_w, out_h, &conv_params, bd); + } + } + } + aom_usec_timer_mark(&tst_timer); + const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); + + std::cout << "[ ] C time = " << ref_time / 1000 + << " ms, SIMD time = " << tst_time / 1000 << " ms\n"; + + EXPECT_GT(ref_time, tst_time) + << "Error: AV1HighbdHiprecConvolveTest.SpeedTest, SIMD slower than C.\n" + << "C time: " << ref_time << " us\n" + << "SIMD time: " << tst_time << " us\n"; + + delete[] input; + delete[] output; + delete[] output2; +} +} // namespace AV1HighbdHiprecConvolve +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace libaom_test diff --git a/libs/libaom/src/test/hiprec_convolve_test_util.h b/libs/libaom/src/test/hiprec_convolve_test_util.h new file mode 100644 index 000000000..6b6da4ee8 --- /dev/null +++ b/libs/libaom/src/test/hiprec_convolve_test_util.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
+#define AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
+
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/convolve.h"
+#include "av1/common/mv.h"
+
+namespace libaom_test {
+
+namespace AV1HiprecConvolve {
+
+typedef void (*hiprec_convolve_func)(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h,
+                                     const ConvolveParams *conv_params);
+
+typedef std::tuple<int, int, int, hiprec_convolve_func> HiprecConvolveParam;
+
+::testing::internal::ParamGenerator<HiprecConvolveParam> BuildParams(
+    hiprec_convolve_func filter);
+
+class AV1HiprecConvolveTest
+    : public ::testing::TestWithParam<HiprecConvolveParam> {
+ public:
+  virtual ~AV1HiprecConvolveTest();
+  virtual void SetUp();
+
+  virtual void TearDown();
+
+ protected:
+  void RunCheckOutput(hiprec_convolve_func test_impl);
+  void RunSpeedTest(hiprec_convolve_func test_impl);
+
+  libaom_test::ACMRandom rnd_;
+};
+
+}  // namespace AV1HiprecConvolve
+
+#if CONFIG_AV1_HIGHBITDEPTH
+namespace AV1HighbdHiprecConvolve {
+typedef void (*highbd_hiprec_convolve_func)(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+    const int16_t *filter_y, int y_step_q4, int w, int h,
+    const ConvolveParams *conv_params, int bps);
+
+typedef std::tuple<int, int, int, int, highbd_hiprec_convolve_func>
+    HighbdHiprecConvolveParam;
+
+::testing::internal::ParamGenerator<HighbdHiprecConvolveParam> BuildParams(
+    highbd_hiprec_convolve_func filter);
+
+class AV1HighbdHiprecConvolveTest
+    : public ::testing::TestWithParam<HighbdHiprecConvolveParam> {
+ public:
+  virtual ~AV1HighbdHiprecConvolveTest();
+  virtual void SetUp();
+
+  virtual void TearDown();
+
+ protected:
+  void RunCheckOutput(highbd_hiprec_convolve_func test_impl);
+  void RunSpeedTest(highbd_hiprec_convolve_func test_impl);
+
+  libaom_test::ACMRandom rnd_;
+};
+
+}  // namespace AV1HighbdHiprecConvolve
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+}  // namespace libaom_test
+
+#endif  // AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
diff --git a/libs/libaom/src/test/horver_correlation_test.cc b/libs/libaom/src/test/horver_correlation_test.cc
new file mode 100644
index 000000000..ccb8eddd0
--- /dev/null
+++ b/libs/libaom/src/test/horver_correlation_test.cc
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*HorverFunc)(const int16_t *diff, int stride, int w, int h,
+                           float *hcorr, float *vcorr);
+
+typedef std::tuple<HorverFunc> HorverTestParam;
+
+class HorverTest : public ::testing::TestWithParam<HorverTestParam> {
+ public:
+  virtual void SetUp() {
+    data_buf_ = (int16_t *)aom_malloc(MAX_SB_SQUARE * sizeof(int16_t));
+    ASSERT_NE(data_buf_, nullptr);
+    target_func_ = GET_PARAM(0);
+  }
+  virtual void TearDown() { aom_free(data_buf_); }
+  void RunHorverTest(void);
+  void RunHorverTest_ExtremeValues(void);
+  void RunHorverSpeedTest(int run_times);
+
+ private:
+  HorverFunc target_func_;
+  ACMRandom rng_;
+  int16_t *data_buf_;
+};
+
+void HorverTest::RunHorverTest(void) {
+  for (int block_size = 0; block_size < BLOCK_SIZES_ALL; block_size++) {
+    const int w = block_size_wide[block_size];
+    const int h = block_size_high[block_size];
+    for (int iter = 0; iter < 1000 && !HasFatalFailure(); ++iter) {
+      float hcorr_ref = 0.0, vcorr_ref = 0.0;
+      float hcorr_test = 0.0, vcorr_test = 0.0;
+
+      for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+        data_buf_[i] = (rng_.Rand16() % (1 << 12)) - (1 << 11);
+      }
+
+      av1_get_horver_correlation_full_c(data_buf_, MAX_SB_SIZE, w, h,
+                                        &hcorr_ref, &vcorr_ref);
+
+      target_func_(data_buf_, MAX_SB_SIZE, w, h, &hcorr_test, &vcorr_test);
+
+      ASSERT_LE(fabs(hcorr_ref - hcorr_test), 1e-6)
+          << "hcorr incorrect (" << w << "x" << h << ")";
+      ASSERT_LE(fabs(vcorr_ref - vcorr_test), 1e-6)
+          << "vcorr incorrect (" << w << "x" << h << ")";
+    }
+    // printf("(%3dx%-3d) passed\n", w, h);
+  }
+}
+
+void HorverTest::RunHorverSpeedTest(int run_times) {
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    data_buf_[i] = rng_.Rand16() % (1 << 12);
+  }
+
+  for (int block_size = 0; block_size < BLOCK_SIZES_ALL; block_size++) {
+    const int w = block_size_wide[block_size];
+    const int h = block_size_high[block_size];
+    float hcorr_ref = 0.0, vcorr_ref = 0.0;
+    float hcorr_test = 0.0, vcorr_test = 0.0;
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_get_horver_correlation_full_c(data_buf_, MAX_SB_SIZE, w, h,
+                                        &hcorr_ref, &vcorr_ref);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(data_buf_, MAX_SB_SIZE, w, h, &hcorr_test, &vcorr_test);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    printf("%3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", w, h, time1, time2,
+           time1 / time2);
+  }
+}
+
+void HorverTest::RunHorverTest_ExtremeValues(void) {
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    // Most of get_horver_test is squaring and summing, so simply saturating
+    // the whole buffer is most likely to cause an overflow.
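+    // (1 << 12) - 1 = 4095, the largest 12-bit sample value.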
+    data_buf_[i] = (1 << 12) - 1;
+  }
+
+  for (int block_size = 0; block_size < BLOCK_SIZES_ALL; block_size++) {
+    const int w = block_size_wide[block_size];
+    const int h = block_size_high[block_size];
+    float hcorr_ref = 0.0, vcorr_ref = 0.0;
+    float hcorr_test = 0.0, vcorr_test = 0.0;
+
+    av1_get_horver_correlation_full_c(data_buf_, MAX_SB_SIZE, w, h, &hcorr_ref,
+                                      &vcorr_ref);
+    target_func_(data_buf_, MAX_SB_SIZE, w, h, &hcorr_test, &vcorr_test);
+
+    ASSERT_LE(fabs(hcorr_ref - hcorr_test), 1e-6) << "hcorr incorrect";
+    ASSERT_LE(fabs(vcorr_ref - vcorr_test), 1e-6) << "vcorr incorrect";
+  }
+}
+
+TEST_P(HorverTest, RandomValues) { RunHorverTest(); }
+
+TEST_P(HorverTest, ExtremeValues) { RunHorverTest_ExtremeValues(); }
+
+TEST_P(HorverTest, DISABLED_Speed) { RunHorverSpeedTest(100000); }
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, HorverTest,
+    ::testing::Values(av1_get_horver_correlation_full_sse4_1));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, HorverTest, ::testing::Values(av1_get_horver_correlation_full_avx2));
+#endif  // HAVE_AVX2
+
+}  // namespace
diff --git a/libs/libaom/src/test/horz_superres_test.cc b/libs/libaom/src/test/horz_superres_test.cc
new file mode 100644
index 000000000..938b0b15a
--- /dev/null
+++ b/libs/libaom/src/test/horz_superres_test.cc
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+#include <ostream>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "av1/encoder/encoder.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+using std::make_tuple;
+using std::tuple;
+
+/* TESTING PARAMETERS */
+
+const int kBitrate = 40;
+
+typedef struct {
+  const char *filename;
+  aom_img_fmt fmt;
+  aom_bit_depth_t bit_depth;
+  unsigned int profile;
+  unsigned int limit;
+  unsigned int screen_content;
+  double psnr_threshold;
+} TestVideoParam;
+
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+  return os << "TestVideoParam { filename:" << test_arg.filename
+            << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+            << " profile:" << test_arg.profile << " limit:" << test_arg.limit
+            << " screen_content:" << test_arg.screen_content
+            << " psnr_threshold:" << test_arg.psnr_threshold << " }";
+}
+
+const TestVideoParam kTestVideoVectors[] = {
+  { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 25.5 },
+#if CONFIG_AV1_HIGHBITDEPTH
+  { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0,
+    28.0 },
+#endif
+  { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1, 20.0 },
+  // Image coding (single frame).
+  { "niklas_1280_720_30.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 1, 0, 32.0 },
+};
+
+// Modes with extra params have their own tests.
+const SUPERRES_MODE kSuperresModesWithoutParams[] = { SUPERRES_RANDOM,
+                                                      SUPERRES_AUTO };
+
+// Superres denominators and superres kf denominators to be tested
+typedef tuple<int, int> SuperresDenominatorPair;
+const SuperresDenominatorPair kSuperresDenominators[] = {
+  make_tuple(16, 9),  make_tuple(13, 11), make_tuple(9, 9),
+  make_tuple(13, 13), make_tuple(11, 16), make_tuple(8, 16),
+  make_tuple(16, 8),  make_tuple(8, 8),   make_tuple(9, 14),
+};
+
+// Superres q thresholds and superres kf q thresholds to be tested
+typedef tuple<int, int> SuperresQThresholdPair;
+const SuperresQThresholdPair kSuperresQThresholds[] = {
+  make_tuple(63, 63), make_tuple(63, 41), make_tuple(17, 63),
+  make_tuple(41, 11), make_tuple(1, 37),  make_tuple(11, 11),
+  make_tuple(1, 1),   make_tuple(17, 29), make_tuple(29, 11),
+};
+
+/* END (TESTING PARAMETERS) */
+
+// Test parameter list:
+// <[needed for EncoderTest], test_video_param_, superres_mode_>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam, SUPERRES_MODE>
+    HorzSuperresTestParam;
+
+class HorzSuperresEndToEndTest
+    : public ::testing::TestWithParam<HorzSuperresTestParam>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  HorzSuperresEndToEndTest()
+      : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+        superres_mode_(GET_PARAM(2)), psnr_(0.0), frame_count_(0) {}
+
+  virtual ~HorzSuperresEndToEndTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libaom_test::kTwoPassGood);
+    cfg_.g_lag_in_frames = 5;
+    cfg_.rc_end_usage = AOM_Q;
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    // Set superres parameters
+    cfg_.rc_superres_mode = superres_mode_;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    frame_count_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    frame_count_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+
+      // Set cpu-used = 8 for speed
+      encoder->Control(AOME_SET_CPUUSED, 8);
+
+      // Test screen coding tools
+      if (test_video_param_.screen_content)
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+      else
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (frame_count_) return psnr_ / frame_count_;
+    return 0.0;
+  }
+
+  void DoTest() {
+    std::unique_ptr<libaom_test::VideoSource> video;
+    video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                test_video_param_.limit));
+    ASSERT_TRUE(video.get() != NULL);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, test_video_param_.psnr_threshold)
+        << "superres_mode_ = " << superres_mode_;
+
+    EXPECT_EQ(test_video_param_.limit, frame_count_)
+        << "superres_mode_ = " << superres_mode_;
+  }
+
+  TestVideoParam test_video_param_;
+  SUPERRES_MODE superres_mode_;
+
+ private:
+  double psnr_;
+  unsigned int frame_count_;
+};
+
+TEST_P(HorzSuperresEndToEndTest, HorzSuperresEndToEndPSNRTest) { DoTest(); }
+
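+// AV1_INSTANTIATE_TEST_CASE instantiates the suite over the cross product of
+// the AV1 codec factory and the parameter lists passed below.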
+AV1_INSTANTIATE_TEST_CASE(HorzSuperresEndToEndTest,
+                          ::testing::ValuesIn(kTestVideoVectors),
+                          ::testing::ValuesIn(kSuperresModesWithoutParams));
+
+// Test parameter list:
+// <[needed for EncoderTest], test_video_param_, tuple(superres_denom_,
+// superres_kf_denom_)>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam,
+              SuperresDenominatorPair>
+    HorzSuperresFixedTestParam;
+
+class HorzSuperresFixedEndToEndTest
+    : public ::testing::TestWithParam<HorzSuperresFixedTestParam>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  HorzSuperresFixedEndToEndTest()
+      : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+        superres_mode_(SUPERRES_FIXED), psnr_(0.0), frame_count_(0) {
+    SuperresDenominatorPair denoms = GET_PARAM(2);
+    superres_denom_ = std::get<0>(denoms);
+    superres_kf_denom_ = std::get<1>(denoms);
+  }
+
+  virtual ~HorzSuperresFixedEndToEndTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libaom_test::kTwoPassGood);
+    cfg_.g_lag_in_frames = 5;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    // Set superres parameters
+    cfg_.rc_superres_mode = superres_mode_;
+    cfg_.rc_superres_denominator = superres_denom_;
+    cfg_.rc_superres_kf_denominator = superres_kf_denom_;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    frame_count_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    frame_count_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+
+      // Set cpu-used = 8 for speed
+      encoder->Control(AOME_SET_CPUUSED, 8);
+
+      // Test screen coding tools
+      if (test_video_param_.screen_content)
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+      else
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (frame_count_) return psnr_ / frame_count_;
+    return 0.0;
+  }
+
+  void DoTest() {
+    std::unique_ptr<libaom_test::VideoSource> video;
+    video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                test_video_param_.limit));
+    ASSERT_TRUE(video.get() != NULL);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, test_video_param_.psnr_threshold)
+        << "superres_mode_ = " << superres_mode_
+        << ", superres_denom_ = " << superres_denom_
+        << ", superres_kf_denom_ = " << superres_kf_denom_;
+
+    EXPECT_EQ(test_video_param_.limit, frame_count_)
+        << "superres_mode_ = " << superres_mode_
+        << ", superres_denom_ = " << superres_denom_
+        << ", superres_kf_denom_ = " << superres_kf_denom_;
+  }
+
+  TestVideoParam test_video_param_;
+  SUPERRES_MODE superres_mode_;
+  int superres_denom_;
+  int superres_kf_denom_;
+
+ private:
+  double psnr_;
+  unsigned int frame_count_;
+};
+
+TEST_P(HorzSuperresFixedEndToEndTest, HorzSuperresFixedTestParam) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(HorzSuperresFixedEndToEndTest,
+                          ::testing::ValuesIn(kTestVideoVectors),
+                          ::testing::ValuesIn(kSuperresDenominators));
+
+// Test parameter list:
+// <[needed for EncoderTest], test_video_param_,
+// tuple(superres_qthresh_,superres_kf_qthresh_)>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam,
+              SuperresQThresholdPair>
+    HorzSuperresQThreshTestParam;
+
+class HorzSuperresQThreshEndToEndTest
+    : public ::testing::TestWithParam<HorzSuperresQThreshTestParam>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  HorzSuperresQThreshEndToEndTest()
+      : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+        superres_mode_(SUPERRES_QTHRESH), psnr_(0.0), frame_count_(0) {
+    SuperresQThresholdPair qthresholds = GET_PARAM(2);
+    superres_qthresh_ = std::get<0>(qthresholds);
+    superres_kf_qthresh_ = std::get<1>(qthresholds);
+  }
+
+  virtual ~HorzSuperresQThreshEndToEndTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libaom_test::kTwoPassGood);
+    cfg_.g_lag_in_frames = 5;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    // Set superres parameters
+    cfg_.rc_superres_mode = superres_mode_;
+    cfg_.rc_superres_qthresh = superres_qthresh_;
+    cfg_.rc_superres_kf_qthresh = superres_kf_qthresh_;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    frame_count_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    frame_count_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 0);
+
+      // Set cpu-used = 8 for speed
+      encoder->Control(AOME_SET_CPUUSED, 8);
+
+      // Test screen coding tools
+      if (test_video_param_.screen_content)
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+      else
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (frame_count_) return psnr_ / frame_count_;
+    return 0.0;
+  }
+
+  void DoTest() {
+    std::unique_ptr<libaom_test::VideoSource> video;
+    video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                test_video_param_.limit));
+    ASSERT_TRUE(video.get() != NULL);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, test_video_param_.psnr_threshold)
+        << "superres_mode_ = " << superres_mode_
+        << ", superres_qthresh_ = " << superres_qthresh_
+        << ", superres_kf_qthresh_ = " << superres_kf_qthresh_;
+
+    EXPECT_EQ(test_video_param_.limit, frame_count_)
+        << "superres_mode_ = " << superres_mode_
+        << ", superres_qthresh_ = " << superres_qthresh_
+        << ", superres_kf_qthresh_ = " << superres_kf_qthresh_;
+  }
+
+  TestVideoParam test_video_param_;
+  SUPERRES_MODE superres_mode_;
+  int superres_qthresh_;
+  int superres_kf_qthresh_;
+
+ private:
+  double psnr_;
+  unsigned int frame_count_;
+};
+
+TEST_P(HorzSuperresQThreshEndToEndTest, HorzSuperresQThreshEndToEndPSNRTest) {
+  DoTest();
+}
+
+AV1_INSTANTIATE_TEST_CASE(HorzSuperresQThreshEndToEndTest,
+                          ::testing::ValuesIn(kTestVideoVectors),
+                          ::testing::ValuesIn(kSuperresQThresholds));
+
+}  // namespace
diff --git a/libs/libaom/src/test/i420_video_source.h b/libs/libaom/src/test/i420_video_source.h
new file mode 100644 index 000000000..233e7152b --- /dev/null +++ b/libs/libaom/src/test/i420_video_source.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_TEST_I420_VIDEO_SOURCE_H_ +#define AOM_TEST_I420_VIDEO_SOURCE_H_ +#include +#include +#include + +#include "test/yuv_video_source.h" + +namespace libaom_test { + +// This class extends VideoSource to allow parsing of raw yv12 +// so that we can do actual file encodes. +class I420VideoSource : public YUVVideoSource { + public: + I420VideoSource(const std::string &file_name, unsigned int width, + unsigned int height, int rate_numerator, int rate_denominator, + unsigned int start, int limit) + : YUVVideoSource(file_name, AOM_IMG_FMT_I420, width, height, + rate_numerator, rate_denominator, start, limit) {} +}; + +} // namespace libaom_test + +#endif // AOM_TEST_I420_VIDEO_SOURCE_H_ diff --git a/libs/libaom/src/test/intra_edge_test.cc b/libs/libaom/src/test/intra_edge_test.cc new file mode 100644 index 000000000..f7702c952 --- /dev/null +++ b/libs/libaom/src/test/intra_edge_test.cc @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include
+#include
+#include
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+#include "test/function_equivalence_test.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/enums.h"
+
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+template <typename F, typename T>
+class UpsampleTest : public FunctionEquivalenceTest<F> {
+ protected:
+  static const int kIterations = 1000000;
+  static const int kMinEdge = 4;
+  static const int kMaxEdge = 24;
+  static const int kBufSize = 2 * 64 + 32;
+  static const int kOffset = 16;
+
+  virtual ~UpsampleTest() {}
+
+  virtual void Execute(T *edge_tst) = 0;
+
+  void Common() {
+    edge_ref_ = &edge_ref_data_[kOffset];
+    edge_tst_ = &edge_tst_data_[kOffset];
+
+    Execute(edge_tst_);
+
+    const int max_idx = (size_ - 1) * 2;
+    for (int r = -2; r <= max_idx; ++r) {
+      ASSERT_EQ(edge_ref_[r], edge_tst_[r]);
+    }
+  }
+
+  T edge_ref_data_[kBufSize];
+  T edge_tst_data_[kBufSize];
+
+  T *edge_ref_;
+  T *edge_tst_;
+
+  int size_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*UP8B)(uint8_t *p, int size);
+typedef libaom_test::FuncParam<UP8B> TestFuncs;
+
+class UpsampleTest8B : public UpsampleTest<UP8B, uint8_t> {
+ protected:
+  void Execute(uint8_t *edge_tst) {
+    params_.ref_func(edge_ref_, size_);
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_));
+  }
+};
+
+TEST_P(UpsampleTest8B, RandomValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    size_ = 4 * (this->rng_(4) + 1);
+
+    int i, pix = 0;
+    for (i = 0; i < kOffset + size_; ++i) {
+      pix = rng_.Rand8();
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = edge_ref_data_[i];
+    }
+
+    // Extend final sample
+    while (i < kBufSize) {
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = pix;
+      i++;
+    }
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, UpsampleTest8B,
+    ::testing::Values(TestFuncs(av1_upsample_intra_edge_c,
+                                av1_upsample_intra_edge_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*UPHB)(uint16_t *p, int size, int bd);
+typedef libaom_test::FuncParam<UPHB> TestFuncsHBD;
+
+class UpsampleTestHB : public UpsampleTest<UPHB, uint16_t> {
+ protected:
+  void Execute(uint16_t *edge_tst) {
+    params_.ref_func(edge_ref_, size_, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, bit_depth_));
+  }
+  int bit_depth_;
+};
+
+TEST_P(UpsampleTestHB, RandomValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+      case 0: bit_depth_ = 8; break;
+      case 1: bit_depth_ = 10; break;
+      default: bit_depth_ = 12; break;
+    }
+    const int hi = 1 << bit_depth_;
+
+    size_ = 4 * (this->rng_(4) + 1);
+
+    int i, pix = 0;
+    for (i = 0; i < kOffset + size_; ++i) {
+      pix = rng_(hi);
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = pix;
+    }
+
+    // Extend final sample
+    while (i < kBufSize) {
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = pix;
+      i++;
+    }
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, UpsampleTestHB,
+    ::testing::Values(TestFuncsHBD(av1_upsample_intra_edge_high_c,
+                                   av1_upsample_intra_edge_high_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+template <typename F, typename T>
+class FilterEdgeTest : public FunctionEquivalenceTest<F> {
+ protected:
+  static const int kIterations = 1000000;
+  static const int kMaxEdge = 2 * 64;
+  static const int kBufSize = kMaxEdge + 32;
+  static const int kOffset = 15;
+
+  virtual ~FilterEdgeTest() {}
+
+  virtual void Execute(T *edge_tst) = 0;
+
+  void Common() {
+    edge_ref_ = &edge_ref_data_[kOffset];
+    edge_tst_ = &edge_tst_data_[kOffset];
+
+    Execute(edge_tst_);
+
+    for (int r = 0; r < size_; ++r) {
+      ASSERT_EQ(edge_ref_[r], edge_tst_[r]);
+    }
+  }
+
+  T edge_ref_data_[kBufSize];
+  T edge_tst_data_[kBufSize];
+
+  T *edge_ref_;
+  T *edge_tst_;
+
+  int size_;
+  int strength_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FE8B)(uint8_t *p, int size, int strength);
+typedef libaom_test::FuncParam<FE8B> FilterEdgeTestFuncs;
+
+class FilterEdgeTest8B : public FilterEdgeTest<FE8B, uint8_t> {
+ protected:
+  void Execute(uint8_t *edge_tst) {
+    params_.ref_func(edge_ref_, size_, strength_);
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
+  }
+};
+
+TEST_P(FilterEdgeTest8B, RandomValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    strength_ = this->rng_(4);
+    size_ = 4 * (this->rng_(128 / 4) + 1) + 1;
+
+    int i, pix = 0;
+    for (i = 0; i < kOffset + size_; ++i) {
+      pix = rng_.Rand8();
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = pix;
+    }
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, FilterEdgeTest8B,
+    ::testing::Values(FilterEdgeTestFuncs(av1_filter_intra_edge_c,
+                                          av1_filter_intra_edge_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FEHB)(uint16_t *p, int size, int strength);
+typedef libaom_test::FuncParam<FEHB> FilterEdgeTestFuncsHBD;
+
+class FilterEdgeTestHB : public FilterEdgeTest<FEHB, uint16_t> {
+ protected:
+  void Execute(uint16_t *edge_tst) {
+    params_.ref_func(edge_ref_, size_, strength_);
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
+  }
+  int bit_depth_;
+};
+
+TEST_P(FilterEdgeTestHB, RandomValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+      case 0: bit_depth_ = 8; break;
+      case 1: bit_depth_ = 10; break;
+      default: bit_depth_ = 12; break;
+    }
+    const int hi = 1 << bit_depth_;
+    strength_ = this->rng_(4);
+    size_ = 4 * (this->rng_(128 / 4) + 1) + 1;
+
+    int i, pix = 0;
+    for (i = 0; i < kOffset + size_; ++i) {
+      pix = rng_(hi);
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = pix;
+    }
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, FilterEdgeTestHB,
+                         ::testing::Values(FilterEdgeTestFuncsHBD(
+                             av1_filter_intra_edge_high_c,
+                             av1_filter_intra_edge_high_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+// Speed tests
+
+TEST_P(UpsampleTest8B, DISABLED_Speed) {
+  const int test_count = 10000000;
+  size_ = kMaxEdge;
+  for (int i = 0; i < kOffset + size_; ++i) {
+    edge_tst_data_[i] = rng_.Rand8();
+  }
+  edge_tst_ = &edge_tst_data_[kOffset];
+  for (int iter = 0; iter < test_count; ++iter) {
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_));
+  }
+}
+
+TEST_P(UpsampleTestHB, DISABLED_Speed) {
+  const int test_count = 10000000;
+  size_ = kMaxEdge;
+  bit_depth_ = 12;
+  const int hi = 1 << bit_depth_;
+  for (int i = 0; i < kOffset + size_; ++i) {
+    edge_tst_data_[i] = rng_(hi);
+  }
+  edge_tst_ = &edge_tst_data_[kOffset];
+  for (int iter = 0; iter < test_count; ++iter) {
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, bit_depth_));
+  }
+}
+
+TEST_P(FilterEdgeTest8B, DISABLED_Speed) {
+  const int test_count = 10000000;
+  size_ = kMaxEdge;
+  strength_ = 1;
+  for (int i = 0; i < kOffset + size_; ++i) {
+    edge_tst_data_[i] = rng_.Rand8();
+  }
+  edge_tst_ = &edge_tst_data_[kOffset];
+  for (int iter = 0; iter < test_count; ++iter) {
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
+    // iterate over filter strengths (1,2,3)
+    strength_ = (strength_ == 3) ? 1 : strength_ + 1;
+  }
+}
+
+TEST_P(FilterEdgeTestHB, DISABLED_Speed) {
+  const int test_count = 10000000;
+  size_ = kMaxEdge;
+  strength_ = 1;
+  bit_depth_ = 12;
+  const int hi = 1 << bit_depth_;
+  for (int i = 0; i < kOffset + size_; ++i) {
+    edge_tst_data_[i] = rng_(hi);
+  }
+  edge_tst_ = &edge_tst_data_[kOffset];
+  for (int iter = 0; iter < test_count; ++iter) {
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
+    // iterate over filter strengths (1,2,3)
+    strength_ = (strength_ == 3) ? 1 : strength_ + 1;
+  }
+}
+
+}  // namespace
diff --git a/libs/libaom/src/test/intrabc_test.cc b/libs/libaom/src/test/intrabc_test.cc
new file mode 100644
index 000000000..b57eb6fab
--- /dev/null
+++ b/libs/libaom/src/test/intrabc_test.cc
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/mv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/tile_common.h"
+
+namespace {
+TEST(IntrabcTest, DvValidation) {
+  struct DvTestCase {
+    MV dv;
+    int mi_row_offset;
+    int mi_col_offset;
+    BLOCK_SIZE bsize;
+    bool valid;
+  };
+  const int kSubPelScale = 8;
+  const int kTileMaxMibWidth = 8;
+  const DvTestCase kDvCases[] = {
+    { { 0, 0 }, 0, 0, BLOCK_128X128, false },
+    { { 0, 0 }, 0, 0, BLOCK_64X64, false },
+    { { 0, 0 }, 0, 0, BLOCK_32X32, false },
+    { { 0, 0 }, 0, 0, BLOCK_16X16, false },
+    { { 0, 0 }, 0, 0, BLOCK_8X8, false },
+    { { 0, 0 }, 0, 0, BLOCK_4X4, false },
+    { { -MAX_SB_SIZE * kSubPelScale, -MAX_SB_SIZE * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_16X16,
+      true },
+    { { 0, -MAX_SB_SIZE * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_16X16,
+      false },
+    { { -MAX_SB_SIZE * kSubPelScale, 0 },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_16X16,
+      true },
+    { { MAX_SB_SIZE * kSubPelScale, 0 },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_16X16,
+      false },
+    { { 0, MAX_SB_SIZE * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_16X16,
+      false },
+    { { -32 * kSubPelScale, -32 * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_32X32,
+      true },
+    { { -32 * kSubPelScale, -32 * kSubPelScale },
+      32 / MI_SIZE,
+      32 / MI_SIZE,
+      BLOCK_32X32,
+      false },
+    { { -32 * kSubPelScale - kSubPelScale / 2, -32 * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_32X32,
+      false },
+    { { -33 * kSubPelScale, -32 * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_32X32,
+      true },
+    { { -32 * kSubPelScale, -32 * kSubPelScale - kSubPelScale / 2 },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_32X32,
+      false },
+    { { -32 * kSubPelScale, -33 * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_32X32,
+      true },
+    { { -MAX_SB_SIZE * kSubPelScale, -MAX_SB_SIZE * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      true },
+    { { -(MAX_SB_SIZE + 1) * kSubPelScale, -MAX_SB_SIZE * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      false },
+    { { -MAX_SB_SIZE * kSubPelScale, -(MAX_SB_SIZE + 1) * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      false },
+    { { -(MAX_SB_SIZE - 1) * kSubPelScale, -MAX_SB_SIZE * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      false },
+    { { -MAX_SB_SIZE * kSubPelScale, -(MAX_SB_SIZE - 1) * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      true },
+    { { -(MAX_SB_SIZE - 1) * kSubPelScale, -(MAX_SB_SIZE - 1) * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      false },
+    { { -MAX_SB_SIZE * kSubPelScale, MAX_SB_SIZE * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      false },
+    { { -MAX_SB_SIZE * kSubPelScale,
+        (kTileMaxMibWidth - 2) * MAX_SB_SIZE * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      false },
+    { { -MAX_SB_SIZE * kSubPelScale,
+        ((kTileMaxMibWidth - 2) * MAX_SB_SIZE + 1) * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      false },
+  };
+
+  MACROBLOCKD xd;
+  memset(&xd, 0, sizeof(xd));
+  xd.tile.mi_row_start = 8 * MAX_MIB_SIZE;
+  xd.tile.mi_row_end = 16 * MAX_MIB_SIZE;
+  xd.tile.mi_col_start = 24 * MAX_MIB_SIZE;
+  xd.tile.mi_col_end = xd.tile.mi_col_start + kTileMaxMibWidth * MAX_MIB_SIZE;
+  xd.plane[1].subsampling_x = 1;
+  xd.plane[1].subsampling_y = 1;
+  xd.plane[2].subsampling_x = 1;
+  xd.plane[2].subsampling_y = 1;
+
+  AV1_COMMON cm;
+  memset(&cm, 0, sizeof(cm));
+
+  for (const DvTestCase &dv_case : kDvCases) {
+    const int mi_row = xd.tile.mi_row_start + dv_case.mi_row_offset;
+    const int mi_col = xd.tile.mi_col_start + dv_case.mi_col_offset;
+    xd.is_chroma_ref = is_chroma_reference(mi_row, mi_col, dv_case.bsize,
+                                           xd.plane[1].subsampling_x,
+                                           xd.plane[1].subsampling_y);
+    EXPECT_EQ(static_cast<int>(dv_case.valid),
+              av1_is_dv_valid(dv_case.dv, &cm, &xd, mi_row, mi_col,
+                              dv_case.bsize, MAX_MIB_SIZE_LOG2));
+  }
+}
+}  // namespace
diff --git a/libs/libaom/src/test/intrapred_test.cc b/libs/libaom/src/test/intrapred_test.cc
new file mode 100644
index 000000000..779cf9a5d
--- /dev/null
+++ b/libs/libaom/src/test/intrapred_test.cc
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/blockd.h"
+#include "av1/common/common.h"
+#include "av1/common/pred_common.h"
+#include "aom_mem/aom_mem.h"
+
+namespace {
+
+using libaom_test::ACMRandom;
+
+const int count_test_block = 100000;
+
+typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride,
+                                const uint16_t *above, const uint16_t *left,
+                                int bps);
+typedef void (*IntraPred)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
+                          const uint8_t *left);
+
+}  // namespace
+
+// NOTE: Under gcc version 7.3.0 (Debian 7.3.0-5), if this template is in the
+// anonymous namespace, then we get a strange compiler warning in
+// the begin() and end() methods of the ParamGenerator template class in
+// gtest/internal/gtest-param-util.h:
+//   warning: ‘<anonymous>’ is used uninitialized in this function
+// As a workaround, put this template outside the anonymous namespace.
+// See bug aomedia:2003.
+template <typename FuncType>
+struct IntraPredFunc {
+  IntraPredFunc(FuncType pred = NULL, FuncType ref = NULL,
+                int block_width_value = 0, int block_height_value = 0,
+                int bit_depth_value = 0)
+      : pred_fn(pred), ref_fn(ref), block_width(block_width_value),
+        block_height(block_height_value), bit_depth(bit_depth_value) {}
+
+  FuncType pred_fn;
+  FuncType ref_fn;
+  int block_width;
+  int block_height;
+  int bit_depth;
+};
+
+namespace {
+
+template <typename FuncType, typename Pixel>
+class AV1IntraPredTest
+    : public ::testing::TestWithParam<IntraPredFunc<FuncType> > {
+ public:
+  void RunTest(Pixel *left_col, Pixel *above_data, Pixel *dst,
+               Pixel *ref_dst) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int block_width = params_.block_width;
+    const int block_height = params_.block_height;
+    above_row_ = above_data + 16;
+    left_col_ = left_col;
+    dst_ = dst;
+    ref_dst_ = ref_dst;
+    int error_count = 0;
+    for (int i = 0; i < count_test_block; ++i) {
+      // Fill edges with random data, try first with saturated values.
+      for (int x = -1; x <= block_width * 2; x++) {
+        if (i == 0) {
+          above_row_[x] = mask_;
+        } else {
+          above_row_[x] = rnd.Rand16() & mask_;
+        }
+      }
+      for (int y = 0; y < block_height; y++) {
+        if (i == 0) {
+          left_col_[y] = mask_;
+        } else {
+          left_col_[y] = rnd.Rand16() & mask_;
+        }
+      }
+      Predict();
+      CheckPrediction(i, &error_count);
+    }
+    ASSERT_EQ(0, error_count);
+  }
+
+ protected:
+  virtual void SetUp() {
+    params_ = this->GetParam();
+    stride_ = params_.block_width * 3;
+    mask_ = (1 << params_.bit_depth) - 1;
+  }
+
+  virtual void Predict() = 0;
+
+  void CheckPrediction(int test_case_number, int *error_count) const {
+    // For each pixel ensure that the calculated value is the same as
+    // reference.
+    const int block_width = params_.block_width;
+    const int block_height = params_.block_height;
+    for (int y = 0; y < block_height; y++) {
+      for (int x = 0; x < block_width; x++) {
+        *error_count += ref_dst_[x + y * stride_] != dst_[x + y * stride_];
+        if (*error_count == 1) {
+          ASSERT_EQ(ref_dst_[x + y * stride_], dst_[x + y * stride_])
+              << " Failed on Test Case Number " << test_case_number
+              << " location: x = " << x << " y = " << y;
+        }
+      }
+    }
+  }
+
+  Pixel *above_row_;
+  Pixel *left_col_;
+  Pixel *dst_;
+  Pixel *ref_dst_;
+  ptrdiff_t stride_;
+  int mask_;
+
+  IntraPredFunc<FuncType> params_;
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+class HighbdIntraPredTest
+    : public AV1IntraPredTest<HighbdIntraPred, uint16_t> {
+ protected:
+  void Predict() {
+    const int bit_depth = params_.bit_depth;
+    params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
+    ASM_REGISTER_STATE_CHECK(
+        params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
+  }
+};
+#endif
+
+class LowbdIntraPredTest : public AV1IntraPredTest<IntraPred, uint8_t> {
+ protected:
+  void Predict() {
+    params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
+    ASM_REGISTER_STATE_CHECK(
+        params_.pred_fn(dst_, stride_, above_row_, left_col_));
+  }
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Suppress an uninitialized warning. Once there are implementations to test
+// then this can be restored.
+TEST_P(HighbdIntraPredTest, Bitexact) {
+  // max block size is 64
+  DECLARE_ALIGNED(16, uint16_t, left_col[2 * 64]);
+  DECLARE_ALIGNED(16, uint16_t, above_data[2 * 64 + 64]);
+  DECLARE_ALIGNED(16, uint16_t, dst[3 * 64 * 64]);
+  DECLARE_ALIGNED(16, uint16_t, ref_dst[3 * 64 * 64]);
+  av1_zero(left_col);
+  av1_zero(above_data);
+  RunTest(left_col, above_data, dst, ref_dst);
+}
+#endif
+
+// Same issue as above but for arm.
+#if !HAVE_NEON
+TEST_P(LowbdIntraPredTest, Bitexact) {
+  // max block size is 32
+  DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]);
+  DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]);
+  DECLARE_ALIGNED(16, uint8_t, dst[3 * 32 * 32]);
+  DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 32 * 32]);
+  av1_zero(left_col);
+  av1_zero(above_data);
+  RunTest(left_col, above_data, dst, ref_dst);
+}
+#endif  // !HAVE_NEON
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// -----------------------------------------------------------------------------
+// High Bit Depth Tests
+#define highbd_entry(type, width, height, opt, bd)                          \
+  IntraPredFunc<HighbdIntraPred>(                                           \
+      &aom_highbd_##type##_predictor_##width##x##height##_##opt,            \
+      &aom_highbd_##type##_predictor_##width##x##height##_c, width, height, \
+      bd)
+
+#if 0
+#define highbd_intrapred(type, opt, bd)                                       \
+  highbd_entry(type, 4, 4, opt, bd), highbd_entry(type, 4, 8, opt, bd),       \
+      highbd_entry(type, 8, 4, opt, bd), highbd_entry(type, 8, 8, opt, bd),   \
+      highbd_entry(type, 8, 16, opt, bd), highbd_entry(type, 16, 8, opt, bd), \
+      highbd_entry(type, 16, 16, opt, bd),                                    \
+      highbd_entry(type, 16, 32, opt, bd),                                    \
+      highbd_entry(type, 32, 16, opt, bd), highbd_entry(type, 32, 32, opt, bd)
+#endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+// ---------------------------------------------------------------------------
+// Low Bit Depth Tests
+
+#define lowbd_entry(type, width, height, opt)                               \
+  IntraPredFunc<IntraPred>(                                                 \
+      &aom_##type##_predictor_##width##x##height##_##opt,                   \
+      &aom_##type##_predictor_##width##x##height##_c, width, height, 8)
+
+#define lowbd_intrapred(type, opt)                                          \
+  lowbd_entry(type, 4, 4, opt), lowbd_entry(type, 4, 8, opt),               \
+      lowbd_entry(type, 8, 4, opt), lowbd_entry(type, 8, 8, opt),           \
+      lowbd_entry(type, 8, 16, opt), lowbd_entry(type, 16, 8, opt),         \
+      lowbd_entry(type, 16, 16, opt), lowbd_entry(type, 16, 32, opt),       \
+      lowbd_entry(type, 32, 16, opt), lowbd_entry(type, 32, 32, opt)
+
+#if HAVE_SSE2
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVector[] = {
+  lowbd_intrapred(dc, sse2),      lowbd_intrapred(dc_top, sse2),
+  lowbd_intrapred(dc_left, sse2), lowbd_intrapred(dc_128, sse2),
+  lowbd_intrapred(v, sse2),       lowbd_intrapred(h, sse2),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, LowbdIntraPredTest,
+                         ::testing::ValuesIn(LowbdIntraPredTestVector));
+
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorSsse3[] = {
+  lowbd_intrapred(paeth, ssse3),
+  lowbd_intrapred(smooth, ssse3),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, LowbdIntraPredTest,
+                         ::testing::ValuesIn(LowbdIntraPredTestVectorSsse3));
+
+#endif  // HAVE_SSSE3
+
+#if HAVE_AVX2
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorAvx2[] = {
+  lowbd_entry(dc, 32, 32, avx2),      lowbd_entry(dc_top, 32, 32, avx2),
+  lowbd_entry(dc_left, 32, 32, avx2), lowbd_entry(dc_128, 32, 32, avx2),
+  lowbd_entry(v, 32, 32, avx2),       lowbd_entry(h, 32, 32, avx2),
+  lowbd_entry(dc, 32, 16, avx2),      lowbd_entry(dc_top, 32, 16, avx2),
+  lowbd_entry(dc_left, 32, 16, avx2), lowbd_entry(dc_128, 32, 16, avx2),
+  lowbd_entry(v, 32, 16, avx2),       lowbd_entry(paeth, 16, 8, avx2),
+  lowbd_entry(paeth, 16, 16, avx2),   lowbd_entry(paeth, 16, 32, avx2),
+  lowbd_entry(paeth, 32, 16, avx2),   lowbd_entry(paeth, 32, 32, avx2),
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, LowbdIntraPredTest,
+                         ::testing::ValuesIn(LowbdIntraPredTestVectorAvx2));
+
+#endif  // HAVE_AVX2
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#if HAVE_NEON
+const IntraPredFunc<HighbdIntraPred> HighbdIntraPredTestVectorNeon[] = {
+  highbd_entry(dc, 4, 4, neon, 8),   highbd_entry(dc, 8, 8, neon, 8),
+  highbd_entry(dc, 16, 16, neon, 8), highbd_entry(dc, 32, 32, neon, 8),
+  highbd_entry(dc, 64, 64, neon, 8),
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, HighbdIntraPredTest,
+                         ::testing::ValuesIn(HighbdIntraPredTestVectorNeon));
+
+#endif  // HAVE_NEON
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+}  // namespace
diff --git a/libs/libaom/src/test/invalid_file_test.cc b/libs/libaom/src/test/invalid_file_test.cc
new file mode 100644
index 000000000..dd0956d0c
--- /dev/null
+++ b/libs/libaom/src/test/invalid_file_test.cc
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdio>
+#include <ostream>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/ivf_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+
+namespace {
+
+struct DecodeParam {
+  int threads;
+  const char *filename;
+  const char *res_filename;  // If NULL, the result filename is
+                             // filename + ".res".
+};
+
+// Constructs result file name.
+std::string GetResFilename(const DecodeParam &param) {
+  if (param.res_filename != NULL) return param.res_filename;
+  const std::string filename = param.filename;
+  return filename + ".res";
+}
+
+std::ostream &operator<<(std::ostream &os, const DecodeParam &dp) {
+  return os << "threads: " << dp.threads << " file: " << dp.filename
+            << " result file: " << GetResFilename(dp);
+}
+
+class InvalidFileTest : public ::libaom_test::DecoderTest,
+                        public ::libaom_test::CodecTestWithParam<DecodeParam> {
+ protected:
+  InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(NULL) {}
+
+  virtual ~InvalidFileTest() {
+    if (res_file_ != NULL) fclose(res_file_);
+  }
+
+  void OpenResFile(const std::string &res_file_name) {
+    res_file_ = libaom_test::OpenTestDataFile(res_file_name);
+    ASSERT_TRUE(res_file_ != NULL)
+        << "Result file open failed. Filename: " << res_file_name;
+  }
+
+  virtual void DecompressedFrameHook(const aom_image_t &img,
+                                     const unsigned int /*frame_number*/) {
+    EXPECT_NE(img.fb_priv, nullptr);
+  }
+
+  virtual bool HandleDecodeResult(
+      const aom_codec_err_t res_dec,
+      const libaom_test::CompressedVideoSource &video,
+      libaom_test::Decoder *decoder) {
+    EXPECT_TRUE(res_file_ != NULL);
+    int expected_res_dec = -1;
+
+    // Read integer result.
+    const int res = fscanf(res_file_, "%d", &expected_res_dec);
+    EXPECT_NE(res, EOF) << "Read result data failed";
+
+    if (expected_res_dec != -1) {
+      // Check results match.
+      const DecodeParam input = GET_PARAM(1);
+      if (input.threads > 1) {
+        // The serial decode check is too strict for tile-threaded decoding as
+        // there is no guarantee on the decode order nor which specific error
+        // will take precedence. Currently a tile-level error is not forwarded
+        // so the frame will simply be marked corrupt.
+        EXPECT_TRUE(res_dec == expected_res_dec ||
+                    res_dec == AOM_CODEC_CORRUPT_FRAME)
+            << "Results don't match: frame number = " << video.frame_number()
+            << ". (" << decoder->DecodeError()
+            << "). Expected: " << expected_res_dec << " or "
+            << AOM_CODEC_CORRUPT_FRAME;
+      } else {
+        EXPECT_EQ(expected_res_dec, res_dec)
+            << "Results don't match: frame number = " << video.frame_number()
+            << ". (" << decoder->DecodeError() << ")";
+      }
+    }
+
+    return !HasFailure();
+  }
+
+  virtual void HandlePeekResult(libaom_test::Decoder *const /*decoder*/,
+                                libaom_test::CompressedVideoSource * /*video*/,
+                                const aom_codec_err_t /*res_peek*/) {}
+
+  void RunTest() {
+    const DecodeParam input = GET_PARAM(1);
+    aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
+    cfg.threads = input.threads;
+    const std::string filename = input.filename;
+    libaom_test::IVFVideoSource decode_video(filename);
+    decode_video.Init();
+
+    // The result file holds a list of expected integer results, one for each
+    // decoded frame. Any result that doesn't match the file's list will
+    // cause a test failure.
+    const std::string res_filename = GetResFilename(input);
+    OpenResFile(res_filename);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&decode_video, cfg));
+  }
+
+ private:
+  FILE *res_file_;
+};
+
+TEST_P(InvalidFileTest, ReturnCode) { RunTest(); }
+
+// If res_filename (the third field) is NULL, then the result filename is
+// filename + ".res" by default. Set res_filename to a string if the result
+// filename differs from the default.
+const DecodeParam kAV1InvalidFileTests[] = {
+  // { threads, filename, res_filename }
+  { 1, "invalid-bug-1814.ivf", NULL },
+  { 1, "invalid-chromium-906381.ivf", NULL },
+  { 1, "invalid-google-142530197.ivf", NULL },
+  { 1, "invalid-google-142530197-1.ivf", NULL },
+  { 4, "invalid-oss-fuzz-9463.ivf", "invalid-oss-fuzz-9463.ivf.res.2" },
+  { 1, "invalid-oss-fuzz-9720.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10389.ivf", "invalid-oss-fuzz-10389.ivf.res.2" },
+  { 1, "invalid-oss-fuzz-11523.ivf", "invalid-oss-fuzz-11523.ivf.res.2" },
+  { 4, "invalid-oss-fuzz-15363.ivf", NULL },
+  { 1, "invalid-oss-fuzz-16437.ivf", NULL },
+#if CONFIG_AV1_HIGHBITDEPTH
+  // These test vectors contain 10-bit or 12-bit video.
+  { 1, "invalid-oss-fuzz-9288.ivf", NULL },
+  { 1, "invalid-oss-fuzz-9482.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10061.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10227.ivf", NULL },
+  { 4, "invalid-oss-fuzz-10555.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10705.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10723.ivf", "invalid-oss-fuzz-10723.ivf.res.2" },
+  { 1, "invalid-oss-fuzz-10779.ivf", NULL },
+  { 1, "invalid-oss-fuzz-11477.ivf", NULL },
+  { 1, "invalid-oss-fuzz-11479.ivf", "invalid-oss-fuzz-11479.ivf.res.2" },
+#endif
+};
+
+AV1_INSTANTIATE_TEST_CASE(InvalidFileTest,
+                          ::testing::ValuesIn(kAV1InvalidFileTests));
+
+}  // namespace
diff --git a/libs/libaom/src/test/ivf_video_source.h b/libs/libaom/src/test/ivf_video_source.h
new file mode 100644
index 000000000..ff2841445
--- /dev/null
+++ b/libs/libaom/src/test/ivf_video_source.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_IVF_VIDEO_SOURCE_H_
+#define AOM_TEST_IVF_VIDEO_SOURCE_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <new>
+#include <string>
+
+#include "aom_ports/sanitizer.h"
+#include "test/video_source.h"
+
+namespace libaom_test {
+const unsigned int kCodeBufferSize = 256 * 1024 * 1024;
+const unsigned int kIvfFileHdrSize = 32;
+const unsigned int kIvfFrameHdrSize = 12;
+
+// Reads a little-endian 32-bit value from |mem|.
+static unsigned int MemGetLe32(const uint8_t *mem) {
+  return (mem[3] << 24) | (mem[2] << 16) | (mem[1] << 8) | (mem[0]);
+}
+
+// This class extends VideoSource to allow parsing of ivf files,
+// so that we can do actual file decodes.
+class IVFVideoSource : public CompressedVideoSource {
+ public:
+  explicit IVFVideoSource(const std::string &file_name)
+      : file_name_(file_name), input_file_(NULL), compressed_frame_buf_(NULL),
+        frame_sz_(0), frame_(0), end_of_file_(false) {}
+
+  virtual ~IVFVideoSource() {
+    delete[] compressed_frame_buf_;
+
+    if (input_file_) fclose(input_file_);
+  }
+
+  virtual void Init() {
+    // Allocate a buffer to read the compressed video frames into.
+    compressed_frame_buf_ = new uint8_t[kCodeBufferSize];
+    ASSERT_TRUE(compressed_frame_buf_ != NULL)
+        << "Allocate frame buffer failed";
+    ASAN_POISON_MEMORY_REGION(compressed_frame_buf_, kCodeBufferSize);
+  }
+
+  virtual void Begin() {
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(input_file_ != NULL)
+        << "Input file open failed. Filename: " << file_name_;
+
+    // Read the file header.
+    uint8_t file_hdr[kIvfFileHdrSize];
+    ASSERT_EQ(kIvfFileHdrSize, fread(file_hdr, 1, kIvfFileHdrSize, input_file_))
+        << "File header read failed.";
+    // Check the "DKIF" magic that identifies an IVF file.
+    ASSERT_TRUE(file_hdr[0] == 'D' && file_hdr[1] == 'K' &&
+                file_hdr[2] == 'I' && file_hdr[3] == 'F')
+        << "Input is not an IVF file.";
+
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  void FillFrame() {
+    ASSERT_TRUE(input_file_ != NULL);
+    uint8_t frame_hdr[kIvfFrameHdrSize];
+    // Check the frame header and read a frame from input_file.
+    if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_) !=
+        kIvfFrameHdrSize) {
+      end_of_file_ = true;
+    } else {
+      end_of_file_ = false;
+
+      frame_sz_ = MemGetLe32(frame_hdr);
+      ASSERT_LE(frame_sz_, kCodeBufferSize)
+          << "Frame is too big for allocated code buffer";
+      ASAN_UNPOISON_MEMORY_REGION(compressed_frame_buf_, kCodeBufferSize);
+      ASSERT_EQ(frame_sz_,
+                fread(compressed_frame_buf_, 1, frame_sz_, input_file_))
+          << "Failed to read complete frame";
+      ASAN_POISON_MEMORY_REGION(compressed_frame_buf_ + frame_sz_,
+                                kCodeBufferSize - frame_sz_);
+    }
+  }
+
+  virtual const uint8_t *cxdata() const {
+    return end_of_file_ ? NULL : compressed_frame_buf_;
+  }
+  virtual size_t frame_size() const { return frame_sz_; }
+  virtual unsigned int frame_number() const { return frame_; }
+
+ protected:
+  std::string file_name_;
+  FILE *input_file_;
+  uint8_t *compressed_frame_buf_;
+  size_t frame_sz_;
+  unsigned int frame_;
+  bool end_of_file_;
+};
+
+}  // namespace libaom_test
+
+#endif  // AOM_TEST_IVF_VIDEO_SOURCE_H_
diff --git a/libs/libaom/src/test/level_test.cc b/libs/libaom/src/test/level_test.cc
new file mode 100644
index 000000000..a9613c5f7
--- /dev/null
+++ b/libs/libaom/src/test/level_test.cc
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <memory>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+const int kLevelMin = 0;
+const int kLevelMax = 31;
+const int kLevelKeepStats = 24;
+// Speed settings tested.
+static const int kCpuUsedVectors[] = {
+  1,
+  2,
+  3,
+  4,
+};
+
+class LevelTest
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  LevelTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), target_level_(31) {}
+
+  virtual ~LevelTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libaom_test::kRealTime) {
+      cfg_.g_lag_in_frames = 5;
+      cfg_.rc_end_usage = AOM_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = AOM_CBR;
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    }
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level_);
+      if (encoding_mode_ != ::libaom_test::kRealTime) {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      }
+    }
+
+    encoder->Control(AV1E_GET_SEQ_LEVEL_IDX, level_);
+    ASSERT_LE(level_[0], kLevelMax);
+    ASSERT_GE(level_[0], kLevelMin);
+  }
+
+  libaom_test::TestMode encoding_mode_;
+  int cpu_used_;
+  int target_level_;
+  int level_[32];
+};
+
+TEST_P(LevelTest, TestTargetLevelApi) {
+  static const aom_codec_iface_t *codec = &aom_codec_av1_cx_algo;
+  aom_codec_ctx_t enc;
+  aom_codec_enc_cfg_t cfg;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(codec, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, codec, &cfg, 0));
+  for (int operating_point = 0; operating_point <= 32; ++operating_point) {
+    for (int level = 0; level <= 32; ++level) {
+      const int target_level = operating_point * 100 + level;
+      // Valid values are the level indices defined by the AV1 spec (0, 1, 4,
+      // 5, 8, 9 and 12-19, i.e. levels 2.0-2.1, 3.0-3.1, 4.0-4.1, 5.0-5.3 and
+      // 6.0-6.3), plus 24 (keep level stats only) and 31 (no level
+      // constraint); target levels for operating points beyond 31 are
+      // ignored.
+      if ((level <= 24 && level != 2 && level != 3 && level != 6 &&
+           level != 7 && level != 10 && level != 11 && level != 20 &&
+           level != 21 && level != 22 && level != 23) ||
+          level == 31 || operating_point > 31) {
+        EXPECT_EQ(AOM_CODEC_OK,
+                  AOM_CODEC_CONTROL_TYPECHECKED(
+                      &enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level));
+      } else {
+        EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+                  AOM_CODEC_CONTROL_TYPECHECKED(
+                      &enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level));
+      }
+    }
+  }
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST_P(LevelTest, TestTargetLevel19) {
+  std::unique_ptr<libaom_test::Y4mVideoSource> video;
+  video.reset(new libaom_test::Y4mVideoSource("park_joy_90p_8_420.y4m", 0, 10));
+  ASSERT_TRUE(video.get() != NULL);
+  // Level index 19 corresponds to level 6.3.
+  target_level_ = 19;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+TEST_P(LevelTest, TestLevelMonitoringLowBitrate) {
+  // To save run time, we only test speed 4.
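+  // level_[0] is the level reported for operating point 0; level index 0
+  // corresponds to level 2.0 and index 1 to level 2.1.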
+ if (cpu_used_ == 4) { + libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 40); + target_level_ = kLevelKeepStats; + cfg_.rc_target_bitrate = 1000; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_EQ(level_[0], 0); + } +} + +TEST_P(LevelTest, TestLevelMonitoringHighBitrate) { + // To save run time, we only test speed 4. + if (cpu_used_ == 4) { + libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 40); + target_level_ = kLevelKeepStats; + cfg_.rc_target_bitrate = 4000; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_EQ(level_[0], 1); + } +} + +TEST_P(LevelTest, TestTargetLevel0) { + // To save run time, we only test speed 4. + if (cpu_used_ == 4) { + libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 50); + const int target_level = 0; + target_level_ = target_level; + cfg_.rc_target_bitrate = 4000; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_EQ(level_[0], target_level); + } +} + +AV1_INSTANTIATE_TEST_CASE(LevelTest, + ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::ValuesIn(kCpuUsedVectors)); +} // namespace diff --git a/libs/libaom/src/test/lightfield_test.sh b/libs/libaom/src/test/lightfield_test.sh new file mode 100644 index 000000000..3de88af87 --- /dev/null +++ b/libs/libaom/src/test/lightfield_test.sh @@ -0,0 +1,115 @@ +#!/bin/sh +## Copyright (c) 2018, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the lightfield example. +## +. $(dirname $0)/tools_common.sh + +# Environment check: $infile is required. +lightfield_test_verify_environment() { + local infile="${LIBAOM_TEST_DATA_PATH}/vase10x10.yuv" + if [ ! -e "${infile}" ]; then + echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH." + return 1 + fi +} + +# Run the lightfield example +lightfield_test() { + local img_width=1024 + local img_height=1024 + local lf_width=10 + local lf_height=10 + local lf_blocksize=5 + local num_references=4 + local num_tile_lists=2 + + # Encode the lightfield. + local encoder="${LIBAOM_BIN_PATH}/lightfield_encoder${AOM_TEST_EXE_SUFFIX}" + local yuv_file="${LIBAOM_TEST_DATA_PATH}/vase10x10.yuv" + local lf_file="${AOM_TEST_OUTPUT_DIR}/vase10x10.ivf" + if [ ! -x "${encoder}" ]; then + elog "${encoder} does not exist or is not executable." + return 1 + fi + + eval "${AOM_TEST_PREFIX}" "${encoder}" "${img_width}" "${img_height}" \ + "${yuv_file}" "${lf_file}" "${lf_width}" \ + "${lf_height}" "${lf_blocksize}" ${devnull} + + [ -e "${lf_file}" ] || return 1 + + # Check to ensure all camera frames have the identical frame header. If not identical, this test fails. + for i in ./fh*; do + diff ./fh004 $i > /dev/null + if [ $? -eq 1 ]; then + return 1 + fi + done + + # Check to ensure all camera frames use the identical frame context. If not identical, this test fails. + for i in ./fc*; do + diff ./fc004 $i > /dev/null + if [ $? -eq 1 ]; then + return 1 + fi + done + + # Parse lightfield bitstream to construct and output a new bitstream that can + # be decoded by an AV1 decoder. 
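+  # The parser takes the lightfield file, the tile-list output file, the
+  # number of reference frames, and a text file describing the tile lists to
+  # extract.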
+  local bs_decoder="${LIBAOM_BIN_PATH}/lightfield_bitstream_parsing${AOM_TEST_EXE_SUFFIX}"
+  local tl_file="${AOM_TEST_OUTPUT_DIR}/vase_tile_list.ivf"
+  local tl_text_file="${LIBAOM_TEST_DATA_PATH}/vase10x10_tiles.txt"
+  if [ ! -x "${bs_decoder}" ]; then
+    elog "${bs_decoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${AOM_TEST_PREFIX}" "${bs_decoder}" "${lf_file}" "${tl_file}" \
+      "${num_references}" "${tl_text_file}" ${devnull}
+
+  [ -e "${tl_file}" ] || return 1
+
+  # Run the lightfield tile list decoder.
+  local tl_decoder="${LIBAOM_BIN_PATH}/lightfield_tile_list_decoder${AOM_TEST_EXE_SUFFIX}"
+  local tl_outfile="${AOM_TEST_OUTPUT_DIR}/vase_tile_list.yuv"
+  if [ ! -x "${tl_decoder}" ]; then
+    elog "${tl_decoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${AOM_TEST_PREFIX}" "${tl_decoder}" "${tl_file}" "${tl_outfile}" \
+      "${num_references}" "${num_tile_lists}" ${devnull}
+
+  [ -e "${tl_outfile}" ] || return 1
+
+  # Run the reference lightfield decoder.
+  local ref_decoder="${LIBAOM_BIN_PATH}/lightfield_decoder${AOM_TEST_EXE_SUFFIX}"
+  local tl_reffile="${AOM_TEST_OUTPUT_DIR}/vase_reference.yuv"
+  if [ ! -x "${ref_decoder}" ]; then
+    elog "${ref_decoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${AOM_TEST_PREFIX}" "${ref_decoder}" "${lf_file}" "${tl_reffile}" \
+      "${num_references}" "${tl_text_file}" ${devnull}
+
+  [ -e "${tl_reffile}" ] || return 1
+
+  # The test fails if tl_outfile and tl_reffile are not identical.
+  diff ${tl_outfile} ${tl_reffile} > /dev/null
+  if [ $? -eq 1 ]; then
+    return 1
+  fi
+}
+
+lightfield_test_tests="lightfield_test"
+
+run_tests lightfield_test_verify_environment "${lightfield_test_tests}"
diff --git a/libs/libaom/src/test/log2_test.cc b/libs/libaom/src/test/log2_test.cc
new file mode 100644
index 000000000..d7840c68b
--- /dev/null
+++ b/libs/libaom/src/test/log2_test.cc
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "aom_ports/bitops.h"
+#include "av1/common/entropymode.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+TEST(Log2Test, GetMsb) {
+  // Test small numbers exhaustively.
+  for (unsigned int n = 1; n < 10000; n++) {
+    EXPECT_EQ(get_msb(n), static_cast<int>(floor(log2(n))));
+  }
+
+  // Test every power of 2 and the two adjacent numbers.
+  for (int exponent = 2; exponent < 32; exponent++) {
+    const unsigned int power_of_2 = 1U << exponent;
+    EXPECT_EQ(get_msb(power_of_2 - 1), exponent - 1);
+    EXPECT_EQ(get_msb(power_of_2), exponent);
+    EXPECT_EQ(get_msb(power_of_2 + 1), exponent);
+  }
+}
+
+TEST(Log2Test, Av1CeilLog2) {
+  // Test small numbers exhaustively.
+  EXPECT_EQ(av1_ceil_log2(0), 0);
+  for (int n = 1; n < 10000; n++) {
+    EXPECT_EQ(av1_ceil_log2(n), static_cast<int>(ceil(log2(n))));
+  }
+
+  // Test every power of 2 and the two adjacent numbers.
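+  // For example, av1_ceil_log2(7) == 3, av1_ceil_log2(8) == 3, and
+  // av1_ceil_log2(9) == 4.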
+  for (int exponent = 2; exponent < 31; exponent++) {
+    const int power_of_2 = 1 << exponent;
+    EXPECT_EQ(av1_ceil_log2(power_of_2 - 1), exponent);
+    EXPECT_EQ(av1_ceil_log2(power_of_2), exponent);
+    // The current implementation of av1_ceil_log2 only works up to 2^30.
+    if (exponent < 30) {
+      EXPECT_EQ(av1_ceil_log2(power_of_2 + 1), exponent + 1);
+    }
+  }
+}
diff --git a/libs/libaom/src/test/lossless_test.cc b/libs/libaom/src/test/lossless_test.cc
new file mode 100644
index 000000000..71ae5e72b
--- /dev/null
+++ b/libs/libaom/src/test/lossless_test.cc
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+
+const int kMaxPsnr = 100;
+
+class LosslessTestLarge
+    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  LosslessTestLarge()
+      : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0),
+        encoding_mode_(GET_PARAM(1)) {}
+
+  virtual ~LosslessTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      // Only call Control if the quantizer is > 0, to verify that using the
+      // quantizer alone will activate lossless mode.
+      if (cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0) {
+        encoder->Control(AV1E_SET_LOSSLESS, 1);
+      }
+    }
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    psnr_ = kMaxPsnr;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    if (pkt->data.psnr.psnr[0] < psnr_) psnr_ = pkt->data.psnr.psnr[0];
+  }
+
+  double GetMinPsnr() const { return psnr_; }
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+  libaom_test::TestMode encoding_mode_;
+};
+
+TEST_P(LosslessTestLarge, TestLossLessEncoding) {
+  const aom_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 0;
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  // The dimensions were intentionally changed for better testing coverage.
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 5);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_lossless = GetMinPsnr();
+  EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+
+TEST_P(LosslessTestLarge, TestLossLessEncoding444) {
+  libaom_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 5);
+
+  cfg_.g_profile = 1;
+  cfg_.g_timebase = video.timebase();
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 0;
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double 
psnr_lossless = GetMinPsnr();
+  EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+
+TEST_P(LosslessTestLarge, TestLossLessEncodingCtrl) {
+  const aom_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.g_lag_in_frames = 25;
+  // Intentionally set Q > 0, to make sure the control can be used to
+  // activate lossless mode.
+  cfg_.rc_min_quantizer = 10;
+  cfg_.rc_max_quantizer = 20;
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 5);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_lossless = GetMinPsnr();
+  EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+
+AV1_INSTANTIATE_TEST_CASE(LosslessTestLarge,
+                          ::testing::Values(::libaom_test::kOnePassGood,
+                                            ::libaom_test::kTwoPassGood));
+}  // namespace
diff --git a/libs/libaom/src/test/lpf_test.cc b/libs/libaom/src/test/lpf_test.cc
new file mode 100644
index 000000000..e8eeceb7c
--- /dev/null
+++ b/libs/libaom/src/test/lpf_test.cc
@@ -0,0 +1,645 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/entropy.h"
+#include "aom/aom_integer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+// Horizontally and vertically we need a 32x32 block: 8 coefficients preceding
+// the filtered section, 16 coefficients within it, and 8 coefficients
+// following it.
+const int kNumCoeffs = 1024;
+
+const int number_of_iterations = 10000;
+
+const int kSpeedTestNum = 500000;
+
+#define LOOP_PARAM \
+  int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh
+#define DUAL_LOOP_PARAM                                                      \
+  int p, const uint8_t *blimit0, const uint8_t *limit0,                     \
+      const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, \
+      const uint8_t *thresh1
+
+typedef void (*loop_op_t)(uint8_t *s, LOOP_PARAM);
+typedef void (*dual_loop_op_t)(uint8_t *s, DUAL_LOOP_PARAM);
+typedef void (*hbdloop_op_t)(uint16_t *s, LOOP_PARAM, int bd);
+typedef void (*hbddual_loop_op_t)(uint16_t *s, DUAL_LOOP_PARAM, int bd);
+
+typedef std::tuple<hbdloop_op_t, hbdloop_op_t, int> hbdloop_param_t;
+typedef std::tuple<hbddual_loop_op_t, hbddual_loop_op_t, int>
+    hbddual_loop_param_t;
+typedef std::tuple<loop_op_t, loop_op_t, int> loop_param_t;
+typedef std::tuple<dual_loop_op_t, dual_loop_op_t, int> dual_loop_param_t;
+
+template <typename Pixel_t>
+void InitInput(Pixel_t *s, Pixel_t *ref_s, ACMRandom *rnd, const uint8_t limit,
+               const int mask, const int32_t p, const int i) {
+  uint16_t tmp_s[kNumCoeffs];
+
+  for (int j = 0; j < kNumCoeffs;) {
+    const uint8_t val = rnd->Rand8();
+    if (val & 0x80) {  // 50% chance to choose a new value.
+      tmp_s[j] = rnd->Rand16();
+      j++;
+    } else {  // 50% chance to repeat previous value in row X times.
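+      // Here X = (val & 0x1f) + 1, i.e. between 1 and 32 repetitions.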
+      int k = 0;
+      while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+        if (j < 1) {
+          tmp_s[j] = rnd->Rand16();
+        } else if (val & 0x20) {  // Increment by a value within the limit.
+          tmp_s[j] = static_cast<uint16_t>(tmp_s[j - 1] + (limit - 1));
+        } else {  // Decrement by a value within the limit.
+          tmp_s[j] = static_cast<uint16_t>(tmp_s[j - 1] - (limit - 1));
+        }
+        j++;
+      }
+    }
+  }
+
+  for (int j = 0; j < kNumCoeffs;) {
+    const uint8_t val = rnd->Rand8();
+    if (val & 0x80) {
+      j++;
+    } else {  // 50% chance to repeat previous value in column X times.
+      int k = 0;
+      while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+        if (j < 1) {
+          tmp_s[j] = rnd->Rand16();
+        } else if (val & 0x20) {  // Increment by a value within the limit.
+          tmp_s[(j % 32) * 32 + j / 32] = static_cast<uint16_t>(
+              tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] + (limit - 1));
+        } else {  // Decrement by a value within the limit.
+          tmp_s[(j % 32) * 32 + j / 32] = static_cast<uint16_t>(
+              tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] - (limit - 1));
+        }
+        j++;
+      }
+    }
+  }
+
+  for (int j = 0; j < kNumCoeffs; j++) {
+    if (i % 2) {
+      s[j] = tmp_s[j] & mask;
+    } else {
+      s[j] = tmp_s[p * (j % p) + j / p] & mask;
+    }
+    ref_s[j] = s[j];
+  }
+}
+
+uint8_t GetOuterThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->PseudoUniform(3 * MAX_LOOP_FILTER + 5));
+}
+
+uint8_t GetInnerThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->PseudoUniform(MAX_LOOP_FILTER + 1));
+}
+
+uint8_t GetHevThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->PseudoUniform(MAX_LOOP_FILTER + 1) >> 4);
+}
+
+template <typename func_type_t, typename params_t>
+class LoopTestParam : public ::testing::TestWithParam<params_t> {
+ public:
+  virtual ~LoopTestParam() {}
+  virtual void SetUp() {
+    loopfilter_op_ = std::get<0>(this->GetParam());
+    ref_loopfilter_op_ = std::get<1>(this->GetParam());
+    bit_depth_ = std::get<2>(this->GetParam());
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  int bit_depth_;
+  int mask_;
+  func_type_t loopfilter_op_;
+  func_type_t ref_loopfilter_op_;
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void call_filter(uint16_t *s, LOOP_PARAM, int bd, hbdloop_op_t op) {
+  op(s, p, blimit, limit, thresh, bd);
+}
+void call_dualfilter(uint16_t *s, DUAL_LOOP_PARAM, int bd,
+                     hbddual_loop_op_t op) {
+  op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd);
+}
+#endif
+void call_filter(uint8_t *s, LOOP_PARAM, int bd, loop_op_t op) {
+  (void)bd;
+  op(s, p, blimit, limit, thresh);
+}
+void call_dualfilter(uint8_t *s, DUAL_LOOP_PARAM, int bd, dual_loop_op_t op) {
+  (void)bd;
+  op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef LoopTestParam<hbdloop_op_t, hbdloop_param_t> Loop8Test6Param_hbd;
+typedef LoopTestParam<hbddual_loop_op_t, hbddual_loop_param_t>
+    Loop8Test9Param_hbd;
+#endif
+typedef LoopTestParam<loop_op_t, loop_param_t> Loop8Test6Param_lbd;
+typedef LoopTestParam<dual_loop_op_t, dual_loop_param_t> Loop8Test9Param_lbd;
+
+#define OPCHECK(a, b)                                                       \
+  ACMRandom rnd(ACMRandom::DeterministicSeed());                            \
+  const int count_test_block = number_of_iterations;                        \
+  const int32_t p = kNumCoeffs / 32;                                        \
+  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                     \
+  DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                 \
+  int err_count_total = 0;                                                  \
+  int first_failure = -1;                                                   \
+  for (int i = 0; i < count_test_block; ++i) {                              \
+    int err_count = 0;                                                      \
+    uint8_t tmp = GetOuterThresh(&rnd);                                     \
+    DECLARE_ALIGNED(16, const uint8_t,                                      \
+                    blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+    tmp = GetInnerThresh(&rnd);                                             \
+    DECLARE_ALIGNED(16, const uint8_t,                                      \
+                    limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,  \
+                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, 
tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + InitInput(s, ref_s, &rnd, *limit, mask_, p, i); \ + call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_, \ + ref_loopfilter_op_); \ + ASM_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit, \ + thresh, bit_depth_, loopfilter_op_)); \ + for (int j = 0; j < kNumCoeffs; ++j) { \ + err_count += ref_s[j] != s[j]; \ + } \ + if (err_count && !err_count_total) { \ + first_failure = i; \ + } \ + err_count_total += err_count; \ + } \ + EXPECT_EQ(0, err_count_total) \ + << "Error: Loop8Test6Param, C output doesn't match SIMD " \ + "loopfilter output. " \ + << "First failed at test case " << first_failure; + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(Loop8Test6Param_hbd, OperationCheck) { OPCHECK(uint16_t, 16); } +#endif +TEST_P(Loop8Test6Param_lbd, OperationCheck) { OPCHECK(uint8_t, 8); } + +#define VALCHECK(a, b) \ + ACMRandom rnd(ACMRandom::DeterministicSeed()); \ + const int count_test_block = number_of_iterations; \ + DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \ + DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \ + int err_count_total = 0; \ + int first_failure = -1; \ + for (int i = 0; i < count_test_block; ++i) { \ + int err_count = 0; \ + uint8_t tmp = GetOuterThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetInnerThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + int32_t p = kNumCoeffs / 32; \ + for (int j = 0; j < kNumCoeffs; ++j) { \ + s[j] = rnd.Rand16() & mask_; \ + ref_s[j] = s[j]; \ + } \ + call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_, \ + ref_loopfilter_op_); \ + ASM_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit, \ + thresh, bit_depth_, loopfilter_op_)); \ + for (int j = 0; j < kNumCoeffs; ++j) { \ + err_count += ref_s[j] != s[j]; \ + } \ + if (err_count && !err_count_total) { \ + first_failure = i; \ + } \ + err_count_total += err_count; \ + } \ + EXPECT_EQ(0, err_count_total) \ + << "Error: Loop8Test6Param, C output doesn't match SIMD " \ + "loopfilter output. 
" \ + << "First failed at test case " << first_failure; + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(Loop8Test6Param_hbd, ValueCheck) { VALCHECK(uint16_t, 16); } +#endif +TEST_P(Loop8Test6Param_lbd, ValueCheck) { VALCHECK(uint8_t, 8); } + +#define SPEEDCHECK(a, b) \ + ACMRandom rnd(ACMRandom::DeterministicSeed()); \ + const int count_test_block = kSpeedTestNum; \ + const int32_t bd = bit_depth_; \ + DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \ + uint8_t tmp = GetOuterThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetInnerThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + int32_t p = kNumCoeffs / 32; \ + for (int j = 0; j < kNumCoeffs; ++j) { \ + s[j] = rnd.Rand16() & mask_; \ + } \ + for (int i = 0; i < count_test_block; ++i) { \ + call_filter(s + 8 + p * 8, p, blimit, limit, thresh, bd, loopfilter_op_); \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(Loop8Test6Param_hbd, DISABLED_Speed) { SPEEDCHECK(uint16_t, 16); } +#endif +TEST_P(Loop8Test6Param_lbd, DISABLED_Speed) { SPEEDCHECK(uint8_t, 8); } + +#define OPCHECKd(a, b) \ + ACMRandom rnd(ACMRandom::DeterministicSeed()); \ + const int count_test_block = number_of_iterations; \ + DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \ + DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \ + int err_count_total = 0; \ + int first_failure = -1; \ + for (int i = 0; i < count_test_block; ++i) { \ + int err_count = 0; \ + uint8_t tmp = GetOuterThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetInnerThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetOuterThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetInnerThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + int32_t p = kNumCoeffs / 32; \ + const uint8_t limit = *limit0 < *limit1 ? 
*limit0 : *limit1; \ + InitInput(s, ref_s, &rnd, limit, mask_, p, i); \ + call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \ + limit1, thresh1, bit_depth_, ref_loopfilter_op_); \ + ASM_REGISTER_STATE_CHECK( \ + call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \ + limit1, thresh1, bit_depth_, loopfilter_op_)); \ + for (int j = 0; j < kNumCoeffs; ++j) { \ + err_count += ref_s[j] != s[j]; \ + } \ + if (err_count && !err_count_total) { \ + first_failure = i; \ + } \ + err_count_total += err_count; \ + } \ + EXPECT_EQ(0, err_count_total) \ + << "Error: Loop8Test9Param, C output doesn't match SIMD " \ + "loopfilter output. " \ + << "First failed at test case " << first_failure; + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(Loop8Test9Param_hbd, OperationCheck) { OPCHECKd(uint16_t, 16); } +#endif +TEST_P(Loop8Test9Param_lbd, OperationCheck) { OPCHECKd(uint8_t, 8); } + +#define VALCHECKd(a, b) \ + ACMRandom rnd(ACMRandom::DeterministicSeed()); \ + const int count_test_block = number_of_iterations; \ + DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \ + DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \ + int err_count_total = 0; \ + int first_failure = -1; \ + for (int i = 0; i < count_test_block; ++i) { \ + int err_count = 0; \ + uint8_t tmp = GetOuterThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetInnerThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetOuterThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetInnerThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + int32_t p = kNumCoeffs / 32; \ + for (int j = 0; j < kNumCoeffs; ++j) { \ + s[j] = rnd.Rand16() & mask_; \ + ref_s[j] = s[j]; \ + } \ + call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \ + limit1, thresh1, bit_depth_, ref_loopfilter_op_); \ + ASM_REGISTER_STATE_CHECK( \ + call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \ + limit1, thresh1, bit_depth_, loopfilter_op_)); \ + for (int j = 0; j < kNumCoeffs; ++j) { \ + err_count += ref_s[j] != s[j]; \ + } \ + if (err_count && !err_count_total) { \ + first_failure = i; \ + } \ + err_count_total += err_count; \ + } \ + EXPECT_EQ(0, err_count_total) \ + << "Error: Loop8Test9Param, C output doesn't match SIMD " \ + "loopfilter output. 
" \ + << "First failed at test case " << first_failure; + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(Loop8Test9Param_hbd, ValueCheck) { VALCHECKd(uint16_t, 16); } +#endif +TEST_P(Loop8Test9Param_lbd, ValueCheck) { VALCHECKd(uint8_t, 8); } + +#define SPEEDCHECKd(a, b) \ + ACMRandom rnd(ACMRandom::DeterministicSeed()); \ + const int count_test_block = kSpeedTestNum; \ + DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \ + uint8_t tmp = GetOuterThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetInnerThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetOuterThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetInnerThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + int32_t p = kNumCoeffs / 32; \ + for (int j = 0; j < kNumCoeffs; ++j) { \ + s[j] = rnd.Rand16() & mask_; \ + } \ + for (int i = 0; i < count_test_block; ++i) { \ + call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \ + limit1, thresh1, bit_depth_, loopfilter_op_); \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(Loop8Test9Param_hbd, DISABLED_Speed) { SPEEDCHECKd(uint16_t, 16); } +#endif +TEST_P(Loop8Test9Param_lbd, DISABLED_Speed) { SPEEDCHECKd(uint8_t, 8); } + +using std::make_tuple; + +#if HAVE_SSE2 +#if CONFIG_AV1_HIGHBITDEPTH +const hbdloop_param_t kHbdLoop8Test6[] = { + make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c, + 8), + make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 8), + make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c, + 8), + make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c, + 8), + make_tuple(&aom_highbd_lpf_horizontal_14_sse2, + &aom_highbd_lpf_horizontal_14_c, 8), + make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 8), + make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 8), + + make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c, + 8), + make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c, + 10), + make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 10), + make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c, + 10), + make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c, + 10), + make_tuple(&aom_highbd_lpf_horizontal_14_sse2, + &aom_highbd_lpf_horizontal_14_c, 10), + make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 10), + make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 10), + make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c, + 10), + make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c, + 12), + make_tuple(&aom_highbd_lpf_vertical_4_sse2, 
&aom_highbd_lpf_vertical_4_c, 12), + make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c, + 12), + make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c, + 12), + make_tuple(&aom_highbd_lpf_horizontal_14_sse2, + &aom_highbd_lpf_horizontal_14_c, 12), + make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c, + 12), + make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 12), + make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 12) +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test6Param_hbd, + ::testing::ValuesIn(kHbdLoop8Test6)); +#endif // CONFIG_AV1_HIGHBITDEPTH + +const loop_param_t kLoop8Test6[] = { + make_tuple(&aom_lpf_horizontal_4_sse2, &aom_lpf_horizontal_4_c, 8), + make_tuple(&aom_lpf_horizontal_8_sse2, &aom_lpf_horizontal_8_c, 8), + make_tuple(&aom_lpf_horizontal_6_sse2, &aom_lpf_horizontal_6_c, 8), + make_tuple(&aom_lpf_vertical_6_sse2, &aom_lpf_vertical_6_c, 8), + make_tuple(&aom_lpf_horizontal_14_sse2, &aom_lpf_horizontal_14_c, 8), + make_tuple(&aom_lpf_vertical_4_sse2, &aom_lpf_vertical_4_c, 8), + make_tuple(&aom_lpf_vertical_8_sse2, &aom_lpf_vertical_8_c, 8), + make_tuple(&aom_lpf_vertical_14_sse2, &aom_lpf_vertical_14_c, 8), +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test6Param_lbd, + ::testing::ValuesIn(kLoop8Test6)); + +const dual_loop_param_t kLoop8Test9[] = { + make_tuple(&aom_lpf_horizontal_4_dual_sse2, &aom_lpf_horizontal_4_dual_c, 8), + make_tuple(&aom_lpf_vertical_4_dual_sse2, &aom_lpf_vertical_4_dual_c, 8), + make_tuple(&aom_lpf_horizontal_6_dual_sse2, &aom_lpf_horizontal_6_dual_c, 8), + make_tuple(&aom_lpf_vertical_6_dual_sse2, &aom_lpf_vertical_6_dual_c, 8), + make_tuple(&aom_lpf_horizontal_8_dual_sse2, &aom_lpf_horizontal_8_dual_c, 8), + make_tuple(&aom_lpf_vertical_8_dual_sse2, &aom_lpf_vertical_8_dual_c, 8), + make_tuple(&aom_lpf_horizontal_14_dual_sse2, &aom_lpf_horizontal_14_dual_c, + 8), + make_tuple(&aom_lpf_vertical_14_dual_sse2, &aom_lpf_vertical_14_dual_c, 8) +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test9Param_lbd, + ::testing::ValuesIn(kLoop8Test9)); + +#endif // HAVE_SSE2 + +#if HAVE_SSE2 && CONFIG_AV1_HIGHBITDEPTH +const hbddual_loop_param_t kHbdLoop8Test9[] = { + make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2, + &aom_highbd_lpf_horizontal_4_dual_c, 8), + make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2, + &aom_highbd_lpf_horizontal_6_dual_c, 8), + make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2, + &aom_highbd_lpf_horizontal_8_dual_c, 8), + make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2, + &aom_highbd_lpf_horizontal_14_dual_c, 8), + make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2, + &aom_highbd_lpf_vertical_4_dual_c, 8), + make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2, + &aom_highbd_lpf_vertical_6_dual_c, 8), + make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2, + &aom_highbd_lpf_vertical_8_dual_c, 8), + make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2, + &aom_highbd_lpf_vertical_14_dual_c, 8), + make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2, + &aom_highbd_lpf_horizontal_4_dual_c, 10), + make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2, + &aom_highbd_lpf_horizontal_6_dual_c, 10), + make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2, + &aom_highbd_lpf_horizontal_8_dual_c, 10), + make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2, + &aom_highbd_lpf_horizontal_14_dual_c, 10), + make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2, + &aom_highbd_lpf_vertical_4_dual_c, 10), + make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2, + 
&aom_highbd_lpf_vertical_6_dual_c, 10), + make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2, + &aom_highbd_lpf_vertical_8_dual_c, 10), + make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2, + &aom_highbd_lpf_vertical_14_dual_c, 10), + make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2, + &aom_highbd_lpf_horizontal_4_dual_c, 12), + make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2, + &aom_highbd_lpf_horizontal_6_dual_c, 12), + make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2, + &aom_highbd_lpf_horizontal_8_dual_c, 12), + make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2, + &aom_highbd_lpf_horizontal_14_dual_c, 12), + make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2, + &aom_highbd_lpf_vertical_4_dual_c, 12), + make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2, + &aom_highbd_lpf_vertical_6_dual_c, 12), + make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2, + &aom_highbd_lpf_vertical_8_dual_c, 12), + make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2, + &aom_highbd_lpf_vertical_14_dual_c, 12), +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test9Param_hbd, + ::testing::ValuesIn(kHbdLoop8Test9)); + +#endif // HAVE_SSE2 && CONFIG_AV1_HIGHBITDEPTH + +#if HAVE_NEON +const loop_param_t kLoop8Test6[] = { + make_tuple(&aom_lpf_vertical_14_neon, &aom_lpf_vertical_14_c, 8), + make_tuple(&aom_lpf_vertical_8_neon, &aom_lpf_vertical_8_c, 8), + make_tuple(&aom_lpf_vertical_6_neon, &aom_lpf_vertical_6_c, 8), + make_tuple(&aom_lpf_vertical_4_neon, &aom_lpf_vertical_4_c, 8), + make_tuple(&aom_lpf_horizontal_14_neon, &aom_lpf_horizontal_14_c, 8), + make_tuple(&aom_lpf_horizontal_8_neon, &aom_lpf_horizontal_8_c, 8), + make_tuple(&aom_lpf_horizontal_6_neon, &aom_lpf_horizontal_6_c, 8), + make_tuple(&aom_lpf_horizontal_4_neon, &aom_lpf_horizontal_4_c, 8) +}; + +INSTANTIATE_TEST_SUITE_P(NEON, Loop8Test6Param_lbd, + ::testing::ValuesIn(kLoop8Test6)); +#endif // HAVE_NEON + +#if HAVE_AVX2 && CONFIG_AV1_HIGHBITDEPTH +const hbddual_loop_param_t kHbdLoop8Test9Avx2[] = { + make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2, + &aom_highbd_lpf_horizontal_4_dual_c, 8), + make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2, + &aom_highbd_lpf_horizontal_4_dual_c, 10), + make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2, + &aom_highbd_lpf_horizontal_4_dual_c, 12), + make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2, + &aom_highbd_lpf_horizontal_8_dual_c, 8), + make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2, + &aom_highbd_lpf_horizontal_8_dual_c, 10), + make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2, + &aom_highbd_lpf_horizontal_8_dual_c, 12), + make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2, + &aom_highbd_lpf_vertical_4_dual_c, 8), + make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2, + &aom_highbd_lpf_vertical_4_dual_c, 10), + make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2, + &aom_highbd_lpf_vertical_4_dual_c, 12), + make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2, + &aom_highbd_lpf_vertical_8_dual_c, 8), + make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2, + &aom_highbd_lpf_vertical_8_dual_c, 10), + make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2, + &aom_highbd_lpf_vertical_8_dual_c, 12), +}; + +INSTANTIATE_TEST_SUITE_P(AVX2, Loop8Test9Param_hbd, + ::testing::ValuesIn(kHbdLoop8Test9Avx2)); +#endif +} // namespace diff --git a/libs/libaom/src/test/masked_sad_test.cc b/libs/libaom/src/test/masked_sad_test.cc new file mode 100644 index 000000000..aa4dd8341 --- /dev/null +++ b/libs/libaom/src/test/masked_sad_test.cc @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int number_of_iterations = 200;
+
+typedef unsigned int (*MaskedSADFunc)(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride,
+                                      const uint8_t *second_pred,
+                                      const uint8_t *msk, int msk_stride,
+                                      int invert_mask);
+typedef std::tuple<MaskedSADFunc, MaskedSADFunc> MaskedSADParam;
+
+typedef void (*MaskedSADx4Func)(const uint8_t *src, int src_stride,
+                                const uint8_t *ref[], int ref_stride,
+                                const uint8_t *second_pred, const uint8_t *msk,
+                                int msk_stride, int invert_mask,
+                                unsigned sads[]);
+
+typedef std::tuple<MaskedSADx4Func, MaskedSADx4Func> MaskedSADx4Param;
+
+class MaskedSADTestBase : public ::testing::Test {
+ public:
+  virtual ~MaskedSADTestBase() {}
+  virtual void SetUp() = 0;
+  virtual void runRef(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t *ref_ptr[], int ref_stride,
+                      const uint8_t *second_pred, const uint8_t *msk,
+                      int msk_stride, int inv_mask, unsigned sads[],
+                      int times) = 0;
+  virtual void runTest(const uint8_t *src_ptr, int src_stride,
+                       const uint8_t *ref_ptr[], int ref_stride,
+                       const uint8_t *second_pred, const uint8_t *msk,
+                       int msk_stride, int inv_mask, unsigned sads[],
+                       int times) = 0;
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  void runMaskedSADTest(int run_times);
+};
+
+class MaskedSADTest : public MaskedSADTestBase,
+                      public ::testing::WithParamInterface<MaskedSADParam> {
+ public:
+  virtual ~MaskedSADTest() {}
+  virtual void SetUp() {
+    maskedSAD_op_ = GET_PARAM(0);
+    ref_maskedSAD_op_ = GET_PARAM(1);
+  }
+
+  virtual void runRef(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t *ref_ptr[], int ref_stride,
+                      const uint8_t *second_pred, const uint8_t *msk,
+                      int msk_stride, int inv_mask, unsigned sads[], int times);
+  virtual void runTest(const uint8_t *src_ptr, int src_stride,
+                       const uint8_t *ref_ptr[], int ref_stride,
+                       const uint8_t *second_pred, const uint8_t *msk,
+                       int msk_stride, int inv_mask, unsigned sads[],
+                       int times);
+
+ protected:
+  MaskedSADFunc maskedSAD_op_;
+  MaskedSADFunc ref_maskedSAD_op_;
+};
+
+class MaskedSADx4Test : public MaskedSADTestBase,
+                        public ::testing::WithParamInterface<MaskedSADx4Param> {
+ public:
+  virtual ~MaskedSADx4Test() {}
+  virtual void SetUp() {
+    maskedSAD_op_ = GET_PARAM(0);
+    ref_maskedSAD_op_ = GET_PARAM(1);
+  }
+  virtual void runRef(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t *ref_ptr[], int ref_stride,
+                      const uint8_t *second_pred, const uint8_t *msk,
+                      int msk_stride, int inv_mask, unsigned sads[], int times);
+  virtual void runTest(const uint8_t *src_ptr, int src_stride,
+                       const uint8_t *ref_ptr[], int ref_stride,
+                       const uint8_t *second_pred, const uint8_t *msk,
+                       int msk_stride, int inv_mask, unsigned sads[],
+                       int times);
+
+ protected:
+  MaskedSADx4Func 
maskedSAD_op_; + MaskedSADx4Func ref_maskedSAD_op_; +}; + +void MaskedSADTest::runRef(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr[], int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, + int msk_stride, int invert_mask, unsigned sads[], + int times) { + for (int repeat = 0; repeat < times; ++repeat) { + sads[0] = ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride, + second_pred, msk, msk_stride, invert_mask); + } +} + +void MaskedSADTest::runTest(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr[], int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, + int msk_stride, int invert_mask, unsigned sads[], + int times) { + if (times == 1) { + sads[0] = maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride, + second_pred, msk, msk_stride, invert_mask); + } else { + for (int repeat = 0; repeat < times; ++repeat) { + ASM_REGISTER_STATE_CHECK( + sads[0] = maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride, + second_pred, msk, msk_stride, invert_mask)); + } + } +} + +void MaskedSADx4Test::runRef(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr[], int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, + int msk_stride, int invert_mask, unsigned sads[], + int times) { + for (int repeat = 0; repeat < times; ++repeat) { + ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, + msk, msk_stride, invert_mask, sads); + } +} + +void MaskedSADx4Test::runTest(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr[], int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, + int msk_stride, int invert_mask, unsigned sads[], + int times) { + if (times == 1) { + ASM_REGISTER_STATE_CHECK(maskedSAD_op_(src_ptr, src_stride, ref_ptr, + ref_stride, second_pred, msk, + msk_stride, invert_mask, sads)); + } else { + for (int repeat = 0; repeat < times; ++repeat) { + maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, msk, + msk_stride, invert_mask, sads); + } + } +} + +void MaskedSADTestBase::runMaskedSADTest(int run_times) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const unsigned kBlockSize = MAX_SB_SIZE * MAX_SB_SIZE; + DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE * 4]); + DECLARE_ALIGNED(16, uint8_t, second_pred_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); + + const uint8_t *refs[] = { ref_ptr, ref_ptr + kBlockSize, + ref_ptr + 2 * kBlockSize, + ref_ptr + 3 * kBlockSize }; + unsigned sads[] = { 0, 0, 0, 0 }; + unsigned sads_ref[] = { 0, 0, 0, 0 }; + int err_count = 0; + int first_failure = -1; + int src_stride = MAX_SB_SIZE; + int ref_stride = MAX_SB_SIZE; + int msk_stride = MAX_SB_SIZE; + const int iters = run_times == 1 ? number_of_iterations : 1; + for (int i = 0; i < iters; ++i) { + for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) { + src_ptr[j] = rnd.Rand8(); + ref_ptr[j] = rnd.Rand8(); + (ref_ptr + kBlockSize)[j] = rnd.Rand8(); + (ref_ptr + 2 * kBlockSize)[j] = rnd.Rand8(); + (ref_ptr + 3 * kBlockSize)[j] = rnd.Rand8(); + second_pred_ptr[j] = rnd.Rand8(); + msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? 
rnd.Rand8() & 0x3f : 64; + assert(msk_ptr[j] <= 64); + } + + for (int invert_mask = 0; invert_mask < 2; ++invert_mask) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + runRef(src_ptr, src_stride, refs, ref_stride, second_pred_ptr, msk_ptr, + msk_stride, invert_mask, sads_ref, run_times); + aom_usec_timer_mark(&timer); + const double time1 = static_cast(aom_usec_timer_elapsed(&timer)); + + aom_usec_timer_start(&timer); + runTest(src_ptr, src_stride, refs, ref_stride, second_pred_ptr, msk_ptr, + msk_stride, invert_mask, sads, run_times); + aom_usec_timer_mark(&timer); + const double time2 = static_cast(aom_usec_timer_elapsed(&timer)); + + if (run_times > 10) { + printf("%7.2f/%7.2fns", time1, time2); + printf("(%3.2f)\n", time1 / time2); + } + if (sads_ref[0] != sads[0] || sads_ref[1] != sads[1] || + sads_ref[2] != sads[2] || sads_ref[3] != sads[3]) { + err_count++; + if (first_failure == -1) first_failure = i; + } + } + } + EXPECT_EQ(0, err_count) << "Error: Masked SAD Test, output doesn't match. " + << "First failed at test case " << first_failure; +} + +TEST_P(MaskedSADTest, OperationCheck) { runMaskedSADTest(1); } + +TEST_P(MaskedSADTest, DISABLED_Speed) { runMaskedSADTest(2000000); } + +TEST_P(MaskedSADx4Test, OperationCheck) { runMaskedSADTest(1); } + +TEST_P(MaskedSADx4Test, DISABLED_Speed) { runMaskedSADTest(2000000); } + +#if CONFIG_AV1_HIGHBITDEPTH +typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + const uint8_t *msk, int msk_stride, + int invert_mask); +typedef std::tuple + HighbdMaskedSADParam; + +class HighbdMaskedSADTest + : public ::testing::TestWithParam { + public: + virtual ~HighbdMaskedSADTest() {} + virtual void SetUp() { + maskedSAD_op_ = GET_PARAM(0); + ref_maskedSAD_op_ = GET_PARAM(1); + } + + virtual void TearDown() { libaom_test::ClearSystemState(); } + void runHighbdMaskedSADTest(int run_times); + + protected: + HighbdMaskedSADFunc maskedSAD_op_; + HighbdMaskedSADFunc ref_maskedSAD_op_; +}; +void HighbdMaskedSADTest::runHighbdMaskedSADTest(int run_times) { + unsigned int ref_ret = 0, ret = 1; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint16_t, second_pred_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); + uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr); + uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr); + uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr); + int err_count = 0; + int first_failure = -1; + int src_stride = MAX_SB_SIZE; + int ref_stride = MAX_SB_SIZE; + int msk_stride = MAX_SB_SIZE; + const int iters = run_times == 1 ? number_of_iterations : 1; + for (int i = 0; i < iters; ++i) { + for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) { + src_ptr[j] = rnd.Rand16() & 0xfff; + ref_ptr[j] = rnd.Rand16() & 0xfff; + second_pred_ptr[j] = rnd.Rand16() & 0xfff; + msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? 
+
+    for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+      aom_usec_timer timer;
+      aom_usec_timer_start(&timer);
+      for (int repeat = 0; repeat < run_times; ++repeat) {
+        ref_ret = ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
+                                    second_pred8_ptr, msk_ptr, msk_stride,
+                                    invert_mask);
+      }
+      aom_usec_timer_mark(&timer);
+      const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+      aom_usec_timer_start(&timer);
+      if (run_times == 1) {
+        ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride,
+                                                     ref8_ptr, ref_stride,
+                                                     second_pred8_ptr, msk_ptr,
+                                                     msk_stride, invert_mask));
+      } else {
+        for (int repeat = 0; repeat < run_times; ++repeat) {
+          ret =
+              maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
+                            second_pred8_ptr, msk_ptr, msk_stride, invert_mask);
+        }
+      }
+      aom_usec_timer_mark(&timer);
+      const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+      if (run_times > 10) {
+        printf("%7.2f/%7.2fns", time1, time2);
+        printf("(%3.2f)\n", time1 / time2);
+      }
+      if (ret != ref_ret) {
+        err_count++;
+        if (first_failure == -1) first_failure = i;
+      }
+    }
+  }
+  EXPECT_EQ(0, err_count)
+      << "Error: High BD Masked SAD Test, output doesn't match. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(HighbdMaskedSADTest, OperationCheck) { runHighbdMaskedSADTest(1); }
+
+TEST_P(HighbdMaskedSADTest, DISABLED_Speed) { runHighbdMaskedSADTest(1000000); }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+using std::make_tuple;
+
+#if HAVE_SSSE3
+const MaskedSADParam msad_test[] = {
+  make_tuple(&aom_masked_sad4x4_ssse3, &aom_masked_sad4x4_c),
+  make_tuple(&aom_masked_sad4x8_ssse3, &aom_masked_sad4x8_c),
+  make_tuple(&aom_masked_sad8x4_ssse3, &aom_masked_sad8x4_c),
+  make_tuple(&aom_masked_sad8x8_ssse3, &aom_masked_sad8x8_c),
+  make_tuple(&aom_masked_sad8x16_ssse3, &aom_masked_sad8x16_c),
+  make_tuple(&aom_masked_sad16x8_ssse3, &aom_masked_sad16x8_c),
+  make_tuple(&aom_masked_sad16x16_ssse3, &aom_masked_sad16x16_c),
+  make_tuple(&aom_masked_sad16x32_ssse3, &aom_masked_sad16x32_c),
+  make_tuple(&aom_masked_sad32x16_ssse3, &aom_masked_sad32x16_c),
+  make_tuple(&aom_masked_sad32x32_ssse3, &aom_masked_sad32x32_c),
+  make_tuple(&aom_masked_sad32x64_ssse3, &aom_masked_sad32x64_c),
+  make_tuple(&aom_masked_sad64x32_ssse3, &aom_masked_sad64x32_c),
+  make_tuple(&aom_masked_sad64x64_ssse3, &aom_masked_sad64x64_c),
+  make_tuple(&aom_masked_sad64x128_ssse3, &aom_masked_sad64x128_c),
+  make_tuple(&aom_masked_sad128x64_ssse3, &aom_masked_sad128x64_c),
+  make_tuple(&aom_masked_sad128x128_ssse3, &aom_masked_sad128x128_c),
+  make_tuple(&aom_masked_sad4x16_ssse3, &aom_masked_sad4x16_c),
+  make_tuple(&aom_masked_sad16x4_ssse3, &aom_masked_sad16x4_c),
+  make_tuple(&aom_masked_sad8x32_ssse3, &aom_masked_sad8x32_c),
+  make_tuple(&aom_masked_sad32x8_ssse3, &aom_masked_sad32x8_c),
+  make_tuple(&aom_masked_sad16x64_ssse3, &aom_masked_sad16x64_c),
+  make_tuple(&aom_masked_sad64x16_ssse3, &aom_masked_sad64x16_c),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, MaskedSADTest, ::testing::ValuesIn(msad_test));
+
+const MaskedSADx4Param msadx4_test[] = {
+  make_tuple(&aom_masked_sad4x4x4d_ssse3, &aom_masked_sad4x4x4d_c),
+  make_tuple(&aom_masked_sad4x8x4d_ssse3, &aom_masked_sad4x8x4d_c),
+  make_tuple(&aom_masked_sad8x4x4d_ssse3, &aom_masked_sad8x4x4d_c),
+  make_tuple(&aom_masked_sad8x8x4d_ssse3, &aom_masked_sad8x8x4d_c),
+  make_tuple(&aom_masked_sad8x16x4d_ssse3, &aom_masked_sad8x16x4d_c),
+  make_tuple(&aom_masked_sad16x8x4d_ssse3, &aom_masked_sad16x8x4d_c),
+
make_tuple(&aom_masked_sad16x16x4d_ssse3, &aom_masked_sad16x16x4d_c), + make_tuple(&aom_masked_sad16x32x4d_ssse3, &aom_masked_sad16x32x4d_c), + make_tuple(&aom_masked_sad32x16x4d_ssse3, &aom_masked_sad32x16x4d_c), + make_tuple(&aom_masked_sad32x32x4d_ssse3, &aom_masked_sad32x32x4d_c), + make_tuple(&aom_masked_sad32x64x4d_ssse3, &aom_masked_sad32x64x4d_c), + make_tuple(&aom_masked_sad64x32x4d_ssse3, &aom_masked_sad64x32x4d_c), + make_tuple(&aom_masked_sad64x64x4d_ssse3, &aom_masked_sad64x64x4d_c), + make_tuple(&aom_masked_sad64x128x4d_ssse3, &aom_masked_sad64x128x4d_c), + make_tuple(&aom_masked_sad128x64x4d_ssse3, &aom_masked_sad128x64x4d_c), + make_tuple(&aom_masked_sad128x128x4d_ssse3, &aom_masked_sad128x128x4d_c), + make_tuple(&aom_masked_sad4x16x4d_ssse3, &aom_masked_sad4x16x4d_c), + make_tuple(&aom_masked_sad16x4x4d_ssse3, &aom_masked_sad16x4x4d_c), + make_tuple(&aom_masked_sad8x32x4d_ssse3, &aom_masked_sad8x32x4d_c), + make_tuple(&aom_masked_sad32x8x4d_ssse3, &aom_masked_sad32x8x4d_c), + make_tuple(&aom_masked_sad16x64x4d_ssse3, &aom_masked_sad16x64x4d_c), + make_tuple(&aom_masked_sad64x16x4d_ssse3, &aom_masked_sad64x16x4d_c), +}; + +INSTANTIATE_TEST_SUITE_P(SSSE3, MaskedSADx4Test, + ::testing::ValuesIn(msadx4_test)); + +#if CONFIG_AV1_HIGHBITDEPTH +const HighbdMaskedSADParam hbd_msad_test[] = { + make_tuple(&aom_highbd_masked_sad4x4_ssse3, &aom_highbd_masked_sad4x4_c), + make_tuple(&aom_highbd_masked_sad4x8_ssse3, &aom_highbd_masked_sad4x8_c), + make_tuple(&aom_highbd_masked_sad8x4_ssse3, &aom_highbd_masked_sad8x4_c), + make_tuple(&aom_highbd_masked_sad8x8_ssse3, &aom_highbd_masked_sad8x8_c), + make_tuple(&aom_highbd_masked_sad8x16_ssse3, &aom_highbd_masked_sad8x16_c), + make_tuple(&aom_highbd_masked_sad16x8_ssse3, &aom_highbd_masked_sad16x8_c), + make_tuple(&aom_highbd_masked_sad16x16_ssse3, &aom_highbd_masked_sad16x16_c), + make_tuple(&aom_highbd_masked_sad16x32_ssse3, &aom_highbd_masked_sad16x32_c), + make_tuple(&aom_highbd_masked_sad32x16_ssse3, &aom_highbd_masked_sad32x16_c), + make_tuple(&aom_highbd_masked_sad32x32_ssse3, &aom_highbd_masked_sad32x32_c), + make_tuple(&aom_highbd_masked_sad32x64_ssse3, &aom_highbd_masked_sad32x64_c), + make_tuple(&aom_highbd_masked_sad64x32_ssse3, &aom_highbd_masked_sad64x32_c), + make_tuple(&aom_highbd_masked_sad64x64_ssse3, &aom_highbd_masked_sad64x64_c), + make_tuple(&aom_highbd_masked_sad64x128_ssse3, + &aom_highbd_masked_sad64x128_c), + make_tuple(&aom_highbd_masked_sad128x64_ssse3, + &aom_highbd_masked_sad128x64_c), + make_tuple(&aom_highbd_masked_sad128x128_ssse3, + &aom_highbd_masked_sad128x128_c), + make_tuple(&aom_highbd_masked_sad4x16_ssse3, &aom_highbd_masked_sad4x16_c), + make_tuple(&aom_highbd_masked_sad16x4_ssse3, &aom_highbd_masked_sad16x4_c), + make_tuple(&aom_highbd_masked_sad8x32_ssse3, &aom_highbd_masked_sad8x32_c), + make_tuple(&aom_highbd_masked_sad32x8_ssse3, &aom_highbd_masked_sad32x8_c), + make_tuple(&aom_highbd_masked_sad16x64_ssse3, &aom_highbd_masked_sad16x64_c), + make_tuple(&aom_highbd_masked_sad64x16_ssse3, &aom_highbd_masked_sad64x16_c), +}; + +INSTANTIATE_TEST_SUITE_P(SSSE3, HighbdMaskedSADTest, + ::testing::ValuesIn(hbd_msad_test)); +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // HAVE_SSSE3 + +#if HAVE_AVX2 +const MaskedSADParam msad_avx2_test[] = { + make_tuple(&aom_masked_sad4x4_avx2, &aom_masked_sad4x4_ssse3), + make_tuple(&aom_masked_sad4x8_avx2, &aom_masked_sad4x8_ssse3), + make_tuple(&aom_masked_sad8x4_avx2, &aom_masked_sad8x4_ssse3), + make_tuple(&aom_masked_sad8x8_avx2, &aom_masked_sad8x8_ssse3), + 
make_tuple(&aom_masked_sad8x16_avx2, &aom_masked_sad8x16_ssse3), + make_tuple(&aom_masked_sad16x8_avx2, &aom_masked_sad16x8_ssse3), + make_tuple(&aom_masked_sad16x16_avx2, &aom_masked_sad16x16_ssse3), + make_tuple(&aom_masked_sad16x32_avx2, &aom_masked_sad16x32_ssse3), + make_tuple(&aom_masked_sad32x16_avx2, &aom_masked_sad32x16_ssse3), + make_tuple(&aom_masked_sad32x32_avx2, &aom_masked_sad32x32_ssse3), + make_tuple(&aom_masked_sad32x64_avx2, &aom_masked_sad32x64_ssse3), + make_tuple(&aom_masked_sad64x32_avx2, &aom_masked_sad64x32_ssse3), + make_tuple(&aom_masked_sad64x64_avx2, &aom_masked_sad64x64_ssse3), + make_tuple(&aom_masked_sad64x128_avx2, &aom_masked_sad64x128_ssse3), + make_tuple(&aom_masked_sad128x64_avx2, &aom_masked_sad128x64_ssse3), + make_tuple(&aom_masked_sad128x128_avx2, &aom_masked_sad128x128_ssse3), + make_tuple(&aom_masked_sad4x16_avx2, &aom_masked_sad4x16_ssse3), + make_tuple(&aom_masked_sad16x4_avx2, &aom_masked_sad16x4_ssse3), + make_tuple(&aom_masked_sad8x32_avx2, &aom_masked_sad8x32_ssse3), + make_tuple(&aom_masked_sad32x8_avx2, &aom_masked_sad32x8_ssse3), + make_tuple(&aom_masked_sad16x64_avx2, &aom_masked_sad16x64_ssse3), + make_tuple(&aom_masked_sad64x16_avx2, &aom_masked_sad64x16_ssse3) +}; + +INSTANTIATE_TEST_SUITE_P(AVX2, MaskedSADTest, + ::testing::ValuesIn(msad_avx2_test)); + +#if CONFIG_AV1_HIGHBITDEPTH +const HighbdMaskedSADParam hbd_msad_avx2_test[] = { + make_tuple(&aom_highbd_masked_sad4x4_avx2, &aom_highbd_masked_sad4x4_ssse3), + make_tuple(&aom_highbd_masked_sad4x8_avx2, &aom_highbd_masked_sad4x8_ssse3), + make_tuple(&aom_highbd_masked_sad8x4_avx2, &aom_highbd_masked_sad8x4_ssse3), + make_tuple(&aom_highbd_masked_sad8x8_avx2, &aom_highbd_masked_sad8x8_ssse3), + make_tuple(&aom_highbd_masked_sad8x16_avx2, &aom_highbd_masked_sad8x16_ssse3), + make_tuple(&aom_highbd_masked_sad16x8_avx2, &aom_highbd_masked_sad16x8_ssse3), + make_tuple(&aom_highbd_masked_sad16x16_avx2, + &aom_highbd_masked_sad16x16_ssse3), + make_tuple(&aom_highbd_masked_sad16x32_avx2, + &aom_highbd_masked_sad16x32_ssse3), + make_tuple(&aom_highbd_masked_sad32x16_avx2, + &aom_highbd_masked_sad32x16_ssse3), + make_tuple(&aom_highbd_masked_sad32x32_avx2, + &aom_highbd_masked_sad32x32_ssse3), + make_tuple(&aom_highbd_masked_sad32x64_avx2, + &aom_highbd_masked_sad32x64_ssse3), + make_tuple(&aom_highbd_masked_sad64x32_avx2, + &aom_highbd_masked_sad64x32_ssse3), + make_tuple(&aom_highbd_masked_sad64x64_avx2, + &aom_highbd_masked_sad64x64_ssse3), + make_tuple(&aom_highbd_masked_sad64x128_avx2, + &aom_highbd_masked_sad64x128_ssse3), + make_tuple(&aom_highbd_masked_sad128x64_avx2, + &aom_highbd_masked_sad128x64_ssse3), + make_tuple(&aom_highbd_masked_sad128x128_avx2, + &aom_highbd_masked_sad128x128_ssse3), + make_tuple(&aom_highbd_masked_sad4x16_avx2, &aom_highbd_masked_sad4x16_ssse3), + make_tuple(&aom_highbd_masked_sad16x4_avx2, &aom_highbd_masked_sad16x4_ssse3), + make_tuple(&aom_highbd_masked_sad8x32_avx2, &aom_highbd_masked_sad8x32_ssse3), + make_tuple(&aom_highbd_masked_sad32x8_avx2, &aom_highbd_masked_sad32x8_ssse3), + make_tuple(&aom_highbd_masked_sad16x64_avx2, + &aom_highbd_masked_sad16x64_ssse3), + make_tuple(&aom_highbd_masked_sad64x16_avx2, + &aom_highbd_masked_sad64x16_ssse3) +}; + +INSTANTIATE_TEST_SUITE_P(AVX2, HighbdMaskedSADTest, + ::testing::ValuesIn(hbd_msad_avx2_test)); +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // HAVE_AVX2 + +} // namespace diff --git a/libs/libaom/src/test/masked_variance_test.cc b/libs/libaom/src/test/masked_variance_test.cc new file mode 100644 index 
000000000..bf814cea2
--- /dev/null
+++ b/libs/libaom/src/test/masked_variance_test.cc
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_mem/aom_mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int number_of_iterations = 200;
+
+typedef unsigned int (*MaskedSubPixelVarianceFunc)(
+    const uint8_t *src, int src_stride, int xoffset, int yoffset,
+    const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
+    const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+
+typedef std::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc>
+    MaskedSubPixelVarianceParam;
+
+class MaskedSubPixelVarianceTest
+    : public ::testing::TestWithParam<MaskedSubPixelVarianceParam> {
+ public:
+  virtual ~MaskedSubPixelVarianceTest() {}
+  virtual void SetUp() {
+    opt_func_ = GET_PARAM(0);
+    ref_func_ = GET_PARAM(1);
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  MaskedSubPixelVarianceFunc opt_func_;
+  MaskedSubPixelVarianceFunc ref_func_;
+};
+
+TEST_P(MaskedSubPixelVarianceTest, OperationCheck) {
+  unsigned int ref_ret, opt_ret;
+  unsigned int ref_sse, opt_sse;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  // Note: We pad out the input array to a multiple of 16 bytes wide, so that
+  // consecutive rows keep the 16-byte alignment.
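+  // With MAX_SB_SIZE == 128, the stride used below is 128 + 16 = 144, itself
+  // a multiple of 16, so every row of these 16-byte-aligned buffers starts on
+  // a 16-byte boundary.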
+ DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]); + DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]); + DECLARE_ALIGNED(16, uint8_t, + second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]); + int err_count = 0; + int first_failure = -1; + int src_stride = (MAX_SB_SIZE + 16); + int ref_stride = (MAX_SB_SIZE + 16); + int msk_stride = (MAX_SB_SIZE + 16); + int xoffset; + int yoffset; + + for (int i = 0; i < number_of_iterations; ++i) { + int xoffsets[] = { 0, 4, rnd(BIL_SUBPEL_SHIFTS) }; + int yoffsets[] = { 0, 4, rnd(BIL_SUBPEL_SHIFTS) }; + for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16); j++) { + src_ptr[j] = rnd.Rand8(); + ref_ptr[j] = rnd.Rand8(); + second_pred_ptr[j] = rnd.Rand8(); + msk_ptr[j] = rnd(65); + } + for (int k = 0; k < 3; k++) { + for (int l = 0; l < 3; l++) { + xoffset = xoffsets[k]; + yoffset = yoffsets[l]; + for (int invert_mask = 0; invert_mask < 2; ++invert_mask) { + ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr, + ref_stride, second_pred_ptr, msk_ptr, msk_stride, + invert_mask, &ref_sse); + ASM_REGISTER_STATE_CHECK( + opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset, + ref_ptr, ref_stride, second_pred_ptr, msk_ptr, + msk_stride, invert_mask, &opt_sse)); + + if (opt_ret != ref_ret || opt_sse != ref_sse) { + err_count++; + if (first_failure == -1) first_failure = i; + } + } + } + } + } + + EXPECT_EQ(0, err_count) + << "Error: Masked Sub Pixel Variance Test OperationCheck," + << "C output doesn't match SSSE3 output. " + << "First failed at test case " << first_failure; +} + +TEST_P(MaskedSubPixelVarianceTest, ExtremeValues) { + unsigned int ref_ret, opt_ret; + unsigned int ref_sse, opt_sse; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]); + DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]); + DECLARE_ALIGNED(16, uint8_t, + second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]); + int first_failure_x = -1; + int first_failure_y = -1; + int err_count = 0; + int first_failure = -1; + int src_stride = (MAX_SB_SIZE + 16); + int ref_stride = (MAX_SB_SIZE + 16); + int msk_stride = (MAX_SB_SIZE + 16); + + for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) { + for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) { + for (int i = 0; i < 16; ++i) { + memset(src_ptr, (i & 0x1) ? 255 : 0, + (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)); + memset(ref_ptr, (i & 0x2) ? 255 : 0, + (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)); + memset(second_pred_ptr, (i & 0x4) ? 255 : 0, + (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)); + memset(msk_ptr, (i & 0x8) ? 
64 : 0,
+               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
+
+        for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+          ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
+                              ref_stride, second_pred_ptr, msk_ptr, msk_stride,
+                              invert_mask, &ref_sse);
+          ASM_REGISTER_STATE_CHECK(
+              opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset,
+                                  ref_ptr, ref_stride, second_pred_ptr,
+                                  msk_ptr, msk_stride, invert_mask, &opt_sse));
+
+          if (opt_ret != ref_ret || opt_sse != ref_sse) {
+            err_count++;
+            if (first_failure == -1) {
+              first_failure = i;
+              first_failure_x = xoffset;
+              first_failure_y = yoffset;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues,"
+                          << "C output doesn't match SSSE3 output. "
+                          << "First failed at test case " << first_failure
+                          << " x_offset = " << first_failure_x
+                          << " y_offset = " << first_failure_y;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef std::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc,
+                   aom_bit_depth_t>
+    HighbdMaskedSubPixelVarianceParam;
+
+class HighbdMaskedSubPixelVarianceTest
+    : public ::testing::TestWithParam<HighbdMaskedSubPixelVarianceParam> {
+ public:
+  virtual ~HighbdMaskedSubPixelVarianceTest() {}
+  virtual void SetUp() {
+    opt_func_ = GET_PARAM(0);
+    ref_func_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  MaskedSubPixelVarianceFunc opt_func_;
+  MaskedSubPixelVarianceFunc ref_func_;
+  aom_bit_depth_t bit_depth_;
+};
+
+TEST_P(HighbdMaskedSubPixelVarianceTest, OperationCheck) {
+  unsigned int ref_ret, opt_ret;
+  unsigned int ref_sse, opt_sse;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  DECLARE_ALIGNED(16, uint16_t,
+                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+  uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+  uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr);
+  int err_count = 0;
+  int first_failure = -1;
+  int first_failure_x = -1;
+  int first_failure_y = -1;
+  int src_stride = (MAX_SB_SIZE + 8);
+  int ref_stride = (MAX_SB_SIZE + 8);
+  int msk_stride = (MAX_SB_SIZE + 8);
+  int xoffset, yoffset;
+
+  for (int i = 0; i < number_of_iterations; ++i) {
+    for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8); j++) {
+      src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+      ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+      second_pred_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+      msk_ptr[j] = rnd(65);
+    }
+    for (xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
+      for (yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
+        for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+          ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
+                              ref_stride, second_pred8_ptr, msk_ptr, msk_stride,
+                              invert_mask, &ref_sse);
+          ASM_REGISTER_STATE_CHECK(
+              opt_ret = opt_func_(src8_ptr, src_stride, xoffset, yoffset,
+                                  ref8_ptr, ref_stride, second_pred8_ptr,
+                                  msk_ptr, msk_stride, invert_mask, &opt_sse));
+
+          if (opt_ret != ref_ret || opt_sse != ref_sse) {
+            err_count++;
+            if (first_failure == -1) {
+              first_failure = i;
+              first_failure_x = xoffset;
+              first_failure_y = yoffset;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  EXPECT_EQ(0, err_count)
+      << "Error: Masked Sub Pixel Variance Test OperationCheck,"
+      << "C output doesn't match SSSE3 output. 
" + << "First failed at test case " << first_failure + << " x_offset = " << first_failure_x << " y_offset = " << first_failure_y; +} + +TEST_P(HighbdMaskedSubPixelVarianceTest, ExtremeValues) { + unsigned int ref_ret, opt_ret; + unsigned int ref_sse, opt_sse; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]); + DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]); + DECLARE_ALIGNED(16, uint16_t, + second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]); + uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr); + uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr); + uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr); + int first_failure_x = -1; + int first_failure_y = -1; + int err_count = 0; + int first_failure = -1; + int src_stride = (MAX_SB_SIZE + 8); + int ref_stride = (MAX_SB_SIZE + 8); + int msk_stride = (MAX_SB_SIZE + 8); + + for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) { + for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) { + for (int i = 0; i < 16; ++i) { + aom_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0, + (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)); + aom_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0, + (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)); + aom_memset16(second_pred_ptr, (i & 0x4) ? ((1 << bit_depth_) - 1) : 0, + (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)); + memset(msk_ptr, (i & 0x8) ? 64 : 0, + (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)); + + for (int invert_mask = 0; invert_mask < 2; ++invert_mask) { + ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr, + ref_stride, second_pred8_ptr, msk_ptr, msk_stride, + invert_mask, &ref_sse); + ASM_REGISTER_STATE_CHECK( + opt_ret = opt_func_(src8_ptr, src_stride, xoffset, yoffset, + ref8_ptr, ref_stride, second_pred8_ptr, + msk_ptr, msk_stride, invert_mask, &opt_sse)); + + if (opt_ret != ref_ret || opt_sse != ref_sse) { + err_count++; + if (first_failure == -1) { + first_failure = i; + first_failure_x = xoffset; + first_failure_y = yoffset; + } + } + } + } + } + } + + EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues," + << "C output doesn't match SSSE3 output. 
" + << "First failed at test case " << first_failure + << " x_offset = " << first_failure_x + << " y_offset = " << first_failure_y; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +using std::make_tuple; + +#if HAVE_SSSE3 + +const MaskedSubPixelVarianceParam sub_pel_var_test[] = { + make_tuple(&aom_masked_sub_pixel_variance128x128_ssse3, + &aom_masked_sub_pixel_variance128x128_c), + make_tuple(&aom_masked_sub_pixel_variance128x64_ssse3, + &aom_masked_sub_pixel_variance128x64_c), + make_tuple(&aom_masked_sub_pixel_variance64x128_ssse3, + &aom_masked_sub_pixel_variance64x128_c), + make_tuple(&aom_masked_sub_pixel_variance64x64_ssse3, + &aom_masked_sub_pixel_variance64x64_c), + make_tuple(&aom_masked_sub_pixel_variance64x32_ssse3, + &aom_masked_sub_pixel_variance64x32_c), + make_tuple(&aom_masked_sub_pixel_variance32x64_ssse3, + &aom_masked_sub_pixel_variance32x64_c), + make_tuple(&aom_masked_sub_pixel_variance32x32_ssse3, + &aom_masked_sub_pixel_variance32x32_c), + make_tuple(&aom_masked_sub_pixel_variance32x16_ssse3, + &aom_masked_sub_pixel_variance32x16_c), + make_tuple(&aom_masked_sub_pixel_variance16x32_ssse3, + &aom_masked_sub_pixel_variance16x32_c), + make_tuple(&aom_masked_sub_pixel_variance16x16_ssse3, + &aom_masked_sub_pixel_variance16x16_c), + make_tuple(&aom_masked_sub_pixel_variance16x8_ssse3, + &aom_masked_sub_pixel_variance16x8_c), + make_tuple(&aom_masked_sub_pixel_variance8x16_ssse3, + &aom_masked_sub_pixel_variance8x16_c), + make_tuple(&aom_masked_sub_pixel_variance8x8_ssse3, + &aom_masked_sub_pixel_variance8x8_c), + make_tuple(&aom_masked_sub_pixel_variance8x4_ssse3, + &aom_masked_sub_pixel_variance8x4_c), + make_tuple(&aom_masked_sub_pixel_variance4x8_ssse3, + &aom_masked_sub_pixel_variance4x8_c), + make_tuple(&aom_masked_sub_pixel_variance4x4_ssse3, + &aom_masked_sub_pixel_variance4x4_c), + + make_tuple(&aom_masked_sub_pixel_variance64x16_ssse3, + &aom_masked_sub_pixel_variance64x16_c), + make_tuple(&aom_masked_sub_pixel_variance16x64_ssse3, + &aom_masked_sub_pixel_variance16x64_c), + make_tuple(&aom_masked_sub_pixel_variance32x8_ssse3, + &aom_masked_sub_pixel_variance32x8_c), + make_tuple(&aom_masked_sub_pixel_variance8x32_ssse3, + &aom_masked_sub_pixel_variance8x32_c), + make_tuple(&aom_masked_sub_pixel_variance16x4_ssse3, + &aom_masked_sub_pixel_variance16x4_c), + make_tuple(&aom_masked_sub_pixel_variance4x16_ssse3, + &aom_masked_sub_pixel_variance4x16_c), +}; + +INSTANTIATE_TEST_SUITE_P(SSSE3_C_COMPARE, MaskedSubPixelVarianceTest, + ::testing::ValuesIn(sub_pel_var_test)); + +#if CONFIG_AV1_HIGHBITDEPTH +const HighbdMaskedSubPixelVarianceParam hbd_sub_pel_var_test[] = { + make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x128_ssse3, + &aom_highbd_8_masked_sub_pixel_variance128x128_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x64_ssse3, + &aom_highbd_8_masked_sub_pixel_variance128x64_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x128_ssse3, + &aom_highbd_8_masked_sub_pixel_variance64x128_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x64_ssse3, + &aom_highbd_8_masked_sub_pixel_variance64x64_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x32_ssse3, + &aom_highbd_8_masked_sub_pixel_variance64x32_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x64_ssse3, + &aom_highbd_8_masked_sub_pixel_variance32x64_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x32_ssse3, + &aom_highbd_8_masked_sub_pixel_variance32x32_c, AOM_BITS_8), + 
make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x16_ssse3, + &aom_highbd_8_masked_sub_pixel_variance32x16_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x32_ssse3, + &aom_highbd_8_masked_sub_pixel_variance16x32_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x16_ssse3, + &aom_highbd_8_masked_sub_pixel_variance16x16_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x8_ssse3, + &aom_highbd_8_masked_sub_pixel_variance16x8_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x16_ssse3, + &aom_highbd_8_masked_sub_pixel_variance8x16_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x8_ssse3, + &aom_highbd_8_masked_sub_pixel_variance8x8_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x4_ssse3, + &aom_highbd_8_masked_sub_pixel_variance8x4_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x8_ssse3, + &aom_highbd_8_masked_sub_pixel_variance4x8_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x4_ssse3, + &aom_highbd_8_masked_sub_pixel_variance4x4_c, AOM_BITS_8), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x128_ssse3, + &aom_highbd_10_masked_sub_pixel_variance128x128_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x64_ssse3, + &aom_highbd_10_masked_sub_pixel_variance128x64_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x128_ssse3, + &aom_highbd_10_masked_sub_pixel_variance64x128_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x64_ssse3, + &aom_highbd_10_masked_sub_pixel_variance64x64_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x32_ssse3, + &aom_highbd_10_masked_sub_pixel_variance64x32_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x64_ssse3, + &aom_highbd_10_masked_sub_pixel_variance32x64_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x32_ssse3, + &aom_highbd_10_masked_sub_pixel_variance32x32_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x16_ssse3, + &aom_highbd_10_masked_sub_pixel_variance32x16_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x32_ssse3, + &aom_highbd_10_masked_sub_pixel_variance16x32_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x16_ssse3, + &aom_highbd_10_masked_sub_pixel_variance16x16_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x8_ssse3, + &aom_highbd_10_masked_sub_pixel_variance16x8_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x16_ssse3, + &aom_highbd_10_masked_sub_pixel_variance8x16_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x8_ssse3, + &aom_highbd_10_masked_sub_pixel_variance8x8_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x4_ssse3, + &aom_highbd_10_masked_sub_pixel_variance8x4_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x8_ssse3, + &aom_highbd_10_masked_sub_pixel_variance4x8_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x4_ssse3, + &aom_highbd_10_masked_sub_pixel_variance4x4_c, AOM_BITS_10), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x128_ssse3, + &aom_highbd_12_masked_sub_pixel_variance128x128_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x64_ssse3, + &aom_highbd_12_masked_sub_pixel_variance128x64_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x128_ssse3, + 
&aom_highbd_12_masked_sub_pixel_variance64x128_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x64_ssse3, + &aom_highbd_12_masked_sub_pixel_variance64x64_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x32_ssse3, + &aom_highbd_12_masked_sub_pixel_variance64x32_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x64_ssse3, + &aom_highbd_12_masked_sub_pixel_variance32x64_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x32_ssse3, + &aom_highbd_12_masked_sub_pixel_variance32x32_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x16_ssse3, + &aom_highbd_12_masked_sub_pixel_variance32x16_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x32_ssse3, + &aom_highbd_12_masked_sub_pixel_variance16x32_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x16_ssse3, + &aom_highbd_12_masked_sub_pixel_variance16x16_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x8_ssse3, + &aom_highbd_12_masked_sub_pixel_variance16x8_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x16_ssse3, + &aom_highbd_12_masked_sub_pixel_variance8x16_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x8_ssse3, + &aom_highbd_12_masked_sub_pixel_variance8x8_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x4_ssse3, + &aom_highbd_12_masked_sub_pixel_variance8x4_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x8_ssse3, + &aom_highbd_12_masked_sub_pixel_variance4x8_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x4_ssse3, + &aom_highbd_12_masked_sub_pixel_variance4x4_c, AOM_BITS_12), + + make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x16_ssse3, + &aom_highbd_8_masked_sub_pixel_variance64x16_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x64_ssse3, + &aom_highbd_8_masked_sub_pixel_variance16x64_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x8_ssse3, + &aom_highbd_8_masked_sub_pixel_variance32x8_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x32_ssse3, + &aom_highbd_8_masked_sub_pixel_variance8x32_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x4_ssse3, + &aom_highbd_8_masked_sub_pixel_variance16x4_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x16_ssse3, + &aom_highbd_8_masked_sub_pixel_variance4x16_c, AOM_BITS_8), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x16_ssse3, + &aom_highbd_10_masked_sub_pixel_variance64x16_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x64_ssse3, + &aom_highbd_10_masked_sub_pixel_variance16x64_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x8_ssse3, + &aom_highbd_10_masked_sub_pixel_variance32x8_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x32_ssse3, + &aom_highbd_10_masked_sub_pixel_variance8x32_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x4_ssse3, + &aom_highbd_10_masked_sub_pixel_variance16x4_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x16_ssse3, + &aom_highbd_10_masked_sub_pixel_variance4x16_c, AOM_BITS_10), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x16_ssse3, + &aom_highbd_12_masked_sub_pixel_variance64x16_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x64_ssse3, + &aom_highbd_12_masked_sub_pixel_variance16x64_c, AOM_BITS_12), + 
make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x8_ssse3,
+             &aom_highbd_12_masked_sub_pixel_variance32x8_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x32_ssse3,
+             &aom_highbd_12_masked_sub_pixel_variance8x32_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x4_ssse3,
+             &aom_highbd_12_masked_sub_pixel_variance16x4_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x16_ssse3,
+             &aom_highbd_12_masked_sub_pixel_variance4x16_c, AOM_BITS_12),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSSE3_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
+                         ::testing::ValuesIn(hbd_sub_pel_var_test));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+#endif  // HAVE_SSSE3
+}  // namespace
diff --git a/libs/libaom/src/test/md5_helper.h b/libs/libaom/src/test/md5_helper.h
new file mode 100644
index 000000000..9443cb262
--- /dev/null
+++ b/libs/libaom/src/test/md5_helper.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_MD5_HELPER_H_
+#define AOM_TEST_MD5_HELPER_H_
+
+#include "aom/aom_decoder.h"
+#include "common/md5_utils.h"
+
+namespace libaom_test {
+class MD5 {
+ public:
+  MD5() { MD5Init(&md5_); }
+
+  void Add(const aom_image_t *img) {
+    for (int plane = 0; plane < 3; ++plane) {
+      const uint8_t *buf = img->planes[plane];
+      // Calculate the width and height to do the md5 check. For the chroma
+      // plane, we never want to round down and thus skip a pixel so if
+      // we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
+      // This works only for chroma_shift of 0 and 1.
+      const int bytes_per_sample =
+          (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+      const int h =
+          plane ? (img->d_h + img->y_chroma_shift) >> img->y_chroma_shift
+                : img->d_h;
+      const int w =
+          (plane ? (img->d_w + img->x_chroma_shift) >> img->x_chroma_shift
+                 : img->d_w) *
+          bytes_per_sample;
+
+      for (int y = 0; y < h; ++y) {
+        MD5Update(&md5_, buf, w);
+        buf += img->stride[plane];
+      }
+    }
+  }
+
+  void Add(const uint8_t *data, size_t size) {
+    MD5Update(&md5_, data, static_cast<uint32_t>(size));
+  }
+
+  const char *Get(void) {
+    static const char hex[16] = {
+      '0', '1', '2', '3', '4', '5', '6', '7',
+      '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+    };
+    uint8_t tmp[16];
+    MD5Context ctx_tmp = md5_;
+
+    MD5Final(tmp, &ctx_tmp);
+    for (int i = 0; i < 16; i++) {
+      res_[i * 2 + 0] = hex[tmp[i] >> 4];
+      res_[i * 2 + 1] = hex[tmp[i] & 0xf];
+    }
+    res_[32] = 0;
+
+    return res_;
+  }
+
+ protected:
+  char res_[33];
+  MD5Context md5_;
+};
+
+}  // namespace libaom_test
+
+#endif  // AOM_TEST_MD5_HELPER_H_
diff --git a/libs/libaom/src/test/metadata_test.cc b/libs/libaom/src/test/metadata_test.cc
new file mode 100644
index 000000000..79e08a7a5
--- /dev/null
+++ b/libs/libaom/src/test/metadata_test.cc
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_image.h"
+#include "aom/internal/aom_image_internal.h"
+#include "aom_scale/yv12config.h"
+#include "av1/encoder/bitstream.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+
+namespace {
+const size_t kMetadataPayloadSizeT35 = 24;
+// 0xB5 stands for the itut t35 metadata country code for the United States
+const uint8_t kMetadataPayloadT35[kMetadataPayloadSizeT35] = {
+  0xB5, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,
+  0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+};
+
+const size_t kMetadataPayloadSizeCll = 4;
+const uint8_t kMetadataPayloadCll[kMetadataPayloadSizeCll] = { 0xB5, 0x01, 0x02,
+                                                               0x03 };
+
+#if CONFIG_AV1_ENCODER
+
+const size_t kMetadataObuSizeT35 = 28;
+const uint8_t kMetadataObuT35[kMetadataObuSizeT35] = {
+  0x2A, 0x1A, 0x02, 0xB5, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+  0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
+  0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x80
+};
+const size_t kMetadataObuSizeMdcv = 28;
+const uint8_t kMetadataObuMdcv[kMetadataObuSizeMdcv] = {
+  0x2A, 0x1A, 0x02, 0xB5, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+  0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
+  0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x80
+};
+const size_t kMetadataObuSizeCll = 8;
+const uint8_t kMetadataObuCll[kMetadataObuSizeCll] = { 0x2A, 0x06, 0x01, 0xB5,
+                                                       0x01, 0x02, 0x03, 0x80 };
+
+class MetadataEncodeTest
+    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  MetadataEncodeTest() : EncoderTest(GET_PARAM(0)) {}
+
+  virtual ~MetadataEncodeTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video) {
+    aom_image_t *current_frame = video->img();
+    if (current_frame) {
+      if (current_frame->metadata) aom_img_remove_metadata(current_frame);
+      ASSERT_EQ(aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_ITUT_T35,
+                                     kMetadataPayloadT35, 0, AOM_MIF_ANY_FRAME),
+                -1);
+      ASSERT_EQ(
+          aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_ITUT_T35, NULL,
+                               kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME),
+          -1);
+      ASSERT_EQ(aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_ITUT_T35,
+                                     NULL, 0, AOM_MIF_ANY_FRAME),
+                -1);
+      ASSERT_EQ(
+          aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_ITUT_T35,
+                               kMetadataPayloadT35, kMetadataPayloadSizeT35,
+                               AOM_MIF_ANY_FRAME),
+          0);
+
+      ASSERT_EQ(
+          aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_HDR_MDCV,
+                               kMetadataPayloadT35, kMetadataPayloadSizeT35,
+                               AOM_MIF_KEY_FRAME),
+          0);
+
+      ASSERT_EQ(
+          aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_HDR_CLL,
+                               kMetadataPayloadCll, kMetadataPayloadSizeCll,
+                               AOM_MIF_KEY_FRAME),
+          0);
+    }
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+      const size_t bitstream_size = pkt->data.frame.sz;
+      const uint8_t *bitstream =
+          static_cast<const uint8_t *>(pkt->data.frame.buf);
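+      // The kMetadataObu* arrays above are complete metadata OBUs: a one-byte
+      // OBU header, a one-byte payload size, the leb128 metadata_type, the
+      // payload bytes, and a trailing-bits byte.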
+      // look for valid metadata in the bitstream
+      bool itut_t35_metadata_found = false;
+      if (bitstream_size >= kMetadataObuSizeT35) {
+        for (size_t i = 0; i <= bitstream_size - kMetadataObuSizeT35; ++i) {
+          if (memcmp(bitstream + i, kMetadataObuT35, kMetadataObuSizeT35) ==
+              0) {
+            itut_t35_metadata_found = true;
+          }
+        }
+      }
+      ASSERT_TRUE(itut_t35_metadata_found);
+
+      // Testing for HDR MDCV metadata
+      bool hdr_mdcv_metadata_found = false;
+      if (bitstream_size >= kMetadataObuSizeMdcv) {
+        for (size_t i = 0; i <= bitstream_size - kMetadataObuSizeMdcv; ++i) {
+          if (memcmp(bitstream + i, kMetadataObuMdcv, kMetadataObuSizeMdcv) ==
+              0) {
+            hdr_mdcv_metadata_found = true;
+          }
+        }
+      }
+      ASSERT_TRUE(hdr_mdcv_metadata_found);
+
+      // Testing for HDR CLL metadata
+      bool hdr_cll_metadata_found = false;
+      if (bitstream_size >= kMetadataObuSizeCll) {
+        for (size_t i = 0; i <= bitstream_size - kMetadataObuSizeCll; ++i) {
+          if (memcmp(bitstream + i, kMetadataObuCll, kMetadataObuSizeCll) ==
+              0) {
+            hdr_cll_metadata_found = true;
+          }
+        }
+      }
+      ASSERT_TRUE(hdr_cll_metadata_found);
+    }
+  }
+
+  virtual void DecompressedFrameHook(const aom_image_t &img,
+                                     aom_codec_pts_t /*pts*/) {
+    ASSERT_TRUE(img.metadata != nullptr);
+
+    ASSERT_EQ(img.metadata->sz, 3u);
+
+    for (size_t i = 0; i < img.metadata->sz - 1; ++i) {
+      ASSERT_EQ(kMetadataPayloadSizeT35, img.metadata->metadata_array[i]->sz);
+      EXPECT_EQ(
+          memcmp(kMetadataPayloadT35, img.metadata->metadata_array[i]->payload,
+                 kMetadataPayloadSizeT35),
+          0);
+    }
+
+    ASSERT_EQ(kMetadataPayloadSizeCll, img.metadata->metadata_array[2]->sz);
+    EXPECT_EQ(
+        memcmp(kMetadataPayloadCll, img.metadata->metadata_array[2]->payload,
+               kMetadataPayloadSizeCll),
+        0);
+  }
+};
+
+TEST_P(MetadataEncodeTest, TestMetadataEncoding) {
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 5);
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 600;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_undershoot_pct = 50;
+  cfg_.rc_overshoot_pct = 50;
+  cfg_.rc_end_usage = AOM_CBR;
+  cfg_.kf_mode = AOM_KF_AUTO;
+  cfg_.g_lag_in_frames = 1;
+  cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
+  // Enable dropped frames.
+  cfg_.rc_dropframe_thresh = 1;
+  // Disable error_resilience mode.
+  cfg_.g_error_resilient = 0;
+  // Run at low bitrate.
+ cfg_.rc_target_bitrate = 40; + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +AV1_INSTANTIATE_TEST_CASE(MetadataEncodeTest, + ::testing::Values(::libaom_test::kOnePassGood)); + +#endif // CONFIG_AV1_ENCODER +} // namespace + +TEST(MetadataTest, MetadataAllocation) { + aom_metadata_t *metadata = + aom_img_metadata_alloc(OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35, + kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME); + ASSERT_NE(metadata, nullptr); + aom_img_metadata_free(metadata); +} + +TEST(MetadataTest, MetadataArrayAllocation) { + aom_metadata_array_t *metadata_array = aom_img_metadata_array_alloc(2); + ASSERT_NE(metadata_array, nullptr); + + metadata_array->metadata_array[0] = + aom_img_metadata_alloc(OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35, + kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME); + metadata_array->metadata_array[1] = + aom_img_metadata_alloc(OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35, + kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME); + + aom_img_metadata_array_free(metadata_array); +} + +TEST(MetadataTest, AddMetadataToImage) { + aom_image_t image; + image.metadata = NULL; + + ASSERT_EQ(aom_img_add_metadata(&image, OBU_METADATA_TYPE_ITUT_T35, + kMetadataPayloadT35, kMetadataPayloadSizeT35, + AOM_MIF_ANY_FRAME), + 0); + aom_img_metadata_array_free(image.metadata); + EXPECT_EQ(aom_img_add_metadata(NULL, OBU_METADATA_TYPE_ITUT_T35, + kMetadataPayloadT35, kMetadataPayloadSizeT35, + AOM_MIF_ANY_FRAME), + -1); +} + +TEST(MetadataTest, RemoveMetadataFromImage) { + aom_image_t image; + image.metadata = NULL; + + ASSERT_EQ(aom_img_add_metadata(&image, OBU_METADATA_TYPE_ITUT_T35, + kMetadataPayloadT35, kMetadataPayloadSizeT35, + AOM_MIF_ANY_FRAME), + 0); + aom_img_remove_metadata(&image); + aom_img_remove_metadata(NULL); +} + +TEST(MetadataTest, CopyMetadataToFrameBuffer) { + YV12_BUFFER_CONFIG yvBuf; + yvBuf.metadata = NULL; + + aom_metadata_array_t *metadata_array = aom_img_metadata_array_alloc(1); + ASSERT_NE(metadata_array, nullptr); + + metadata_array->metadata_array[0] = + aom_img_metadata_alloc(OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35, + kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME); + + // Metadata_array + int status = aom_copy_metadata_to_frame_buffer(&yvBuf, metadata_array); + EXPECT_EQ(status, 0); + status = aom_copy_metadata_to_frame_buffer(NULL, metadata_array); + EXPECT_EQ(status, -1); + aom_img_metadata_array_free(metadata_array); + + // Metadata_array_2 + aom_metadata_array_t *metadata_array_2 = aom_img_metadata_array_alloc(0); + ASSERT_NE(metadata_array_2, nullptr); + status = aom_copy_metadata_to_frame_buffer(&yvBuf, metadata_array_2); + EXPECT_EQ(status, -1); + aom_img_metadata_array_free(metadata_array_2); + + // YV12_BUFFER_CONFIG + status = aom_copy_metadata_to_frame_buffer(&yvBuf, NULL); + EXPECT_EQ(status, -1); + aom_remove_metadata_from_frame_buffer(&yvBuf); + aom_remove_metadata_from_frame_buffer(NULL); +} + +TEST(MetadataTest, GetMetadataFromImage) { + aom_image_t image; + image.metadata = NULL; + + ASSERT_EQ(aom_img_add_metadata(&image, OBU_METADATA_TYPE_ITUT_T35, + kMetadataPayloadT35, kMetadataPayloadSizeT35, + AOM_MIF_ANY_FRAME), + 0); + + EXPECT_TRUE(aom_img_get_metadata(NULL, 0) == NULL); + EXPECT_TRUE(aom_img_get_metadata(&image, 1u) == NULL); + EXPECT_TRUE(aom_img_get_metadata(&image, 10u) == NULL); + + const aom_metadata_t *metadata = aom_img_get_metadata(&image, 0); + ASSERT_TRUE(metadata != NULL); + ASSERT_EQ(metadata->sz, kMetadataPayloadSizeT35); + EXPECT_EQ( + memcmp(kMetadataPayloadT35, metadata->payload, 
kMetadataPayloadSizeT35),
+      0);
+
+  aom_img_metadata_array_free(image.metadata);
+}
+
+TEST(MetadataTest, ReadMetadatasFromImage) {
+  aom_image_t image;
+  image.metadata = NULL;
+
+  uint32_t types[3];
+  types[0] = OBU_METADATA_TYPE_ITUT_T35;
+  types[1] = OBU_METADATA_TYPE_HDR_CLL;
+  types[2] = OBU_METADATA_TYPE_HDR_MDCV;
+
+  ASSERT_EQ(aom_img_add_metadata(&image, types[0], kMetadataPayloadT35,
+                                 kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME),
+            0);
+  ASSERT_EQ(aom_img_add_metadata(&image, types[1], kMetadataPayloadT35,
+                                 kMetadataPayloadSizeT35, AOM_MIF_KEY_FRAME),
+            0);
+  ASSERT_EQ(aom_img_add_metadata(&image, types[2], kMetadataPayloadT35,
+                                 kMetadataPayloadSizeT35, AOM_MIF_KEY_FRAME),
+            0);
+
+  size_t number_metadata = aom_img_num_metadata(&image);
+  ASSERT_EQ(number_metadata, 3u);
+  for (size_t i = 0; i < number_metadata; ++i) {
+    const aom_metadata_t *metadata = aom_img_get_metadata(&image, i);
+    ASSERT_TRUE(metadata != NULL);
+    ASSERT_EQ(metadata->type, types[i]);
+    ASSERT_EQ(metadata->sz, kMetadataPayloadSizeT35);
+    EXPECT_EQ(
+        memcmp(kMetadataPayloadT35, metadata->payload, kMetadataPayloadSizeT35),
+        0);
+  }
+  aom_img_metadata_array_free(image.metadata);
+}
diff --git a/libs/libaom/src/test/metrics_template.html b/libs/libaom/src/test/metrics_template.html
new file mode 100644
index 000000000..b57c62314
--- /dev/null
+++ b/libs/libaom/src/test/metrics_template.html
@@ -0,0 +1,422 @@
+[422 lines of HTML elided: a "Video Codec Test Results" page presenting codec
+comparison results, with a control for the method of combining points (average
+of bitrate differences, BDSNR, or BDRATE) and an "Indicators" panel; the
+markup itself is not recoverable from this extraction.]
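(For reference, and not taken from the template itself: BDSNR and BDRATE are
the standard Bjøntegaard metrics. BD-rate averages the log-rate gap between
two fitted rate-distortion curves over their shared quality range,

\[
\text{BD-rate} = 10^{\frac{1}{D_h - D_l}\int_{D_l}^{D_h}\left(\log_{10} R_2(D) - \log_{10} R_1(D)\right)dD} - 1,
\]

where \(R_1\) and \(R_2\) are the interpolated rate curves and \([D_l, D_h]\)
is the overlapping PSNR interval; BDSNR instead averages the PSNR gap at equal
rate.)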
diff --git a/libs/libaom/src/test/monochrome_test.cc b/libs/libaom/src/test/monochrome_test.cc
new file mode 100644
index 000000000..ebccba584
--- /dev/null
+++ b/libs/libaom/src/test/monochrome_test.cc
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class MonochromeTest
+    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  MonochromeTest() : EncoderTest(GET_PARAM(0)), frame0_psnr_y_(0.) {}
+
+  virtual ~MonochromeTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+
+  virtual void DecompressedFrameHook(const aom_image_t &img,
+                                     aom_codec_pts_t pts) {
+    (void)pts;
+
+    // Get value of top-left corner pixel of U plane
+    int chroma_value = img.planes[AOM_PLANE_U][0];
+
+    bool is_chroma_constant =
+        ComparePlaneToValue(img, AOM_PLANE_U, chroma_value) &&
+        ComparePlaneToValue(img, AOM_PLANE_V, chroma_value);
+
+    // Chroma planes should be constant
+    EXPECT_TRUE(is_chroma_constant);
+
+    // Monochrome flag on image should be set
+    EXPECT_EQ(img.monochrome, 1);
+
+    chroma_value_list_.push_back(chroma_value);
+  }
+
+  // Returns true if all pixels on the plane are equal to value, and returns
+  // false otherwise.
+  bool ComparePlaneToValue(const aom_image_t &img, const int plane,
+                           const int value) {
+    const int w = aom_img_plane_width(&img, plane);
+    const int h = aom_img_plane_height(&img, plane);
+    const uint8_t *const buf = img.planes[plane];
+    const int stride = img.stride[plane];
+
+    for (int r = 0; r < h; ++r) {
+      for (int c = 0; c < w; ++c) {
+        if (buf[r * stride + c] != value) return false;
+      }
+    }
+    return true;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    // Check that the initial Y PSNR value is 'high enough', and check that
+    // subsequent Y PSNR values are 'close' to this initial value.
+    if (frame0_psnr_y_ == 0.) {
+      frame0_psnr_y_ = pkt->data.psnr.psnr[1];
+      EXPECT_GT(frame0_psnr_y_, 29.);
+    }
+    EXPECT_NEAR(pkt->data.psnr.psnr[1], frame0_psnr_y_, 2.5);
+  }
+
+  std::vector<int> chroma_value_list_;
+  double frame0_psnr_y_;
+};
+
+TEST_P(MonochromeTest, TestMonochromeEncoding) {
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 5);
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 600;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_undershoot_pct = 50;
+  cfg_.rc_overshoot_pct = 50;
+  cfg_.rc_end_usage = AOM_CBR;
+  cfg_.kf_mode = AOM_KF_AUTO;
+  cfg_.g_lag_in_frames = 1;
+  cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
+  // Enable dropped frames.
+  cfg_.rc_dropframe_thresh = 1;
+  // Disable error_resilience mode.
+  cfg_.g_error_resilient = 0;
+  // Run at low bitrate.
+  cfg_.rc_target_bitrate = 40;
+  // Set monochrome encoding flag.
+  cfg_.monochrome = 1;
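+  // With the monochrome flag set, only the luma plane is coded; every decoded
+  // frame should then report monochrome == 1 and carry the flat chroma planes
+  // that DecompressedFrameHook verifies above.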
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Check that the chroma planes are equal across all frames
+  std::vector<int>::const_iterator iter = chroma_value_list_.begin();
+  int initial_chroma_value = *iter;
+  for (; iter != chroma_value_list_.end(); ++iter) {
+    // Check that all decoded frames have the same constant chroma planes.
+    EXPECT_EQ(*iter, initial_chroma_value);
+  }
+}
+
+AV1_INSTANTIATE_TEST_CASE(MonochromeTest,
+                          ::testing::Values(::libaom_test::kTwoPassGood));
+
+}  // namespace
diff --git a/libs/libaom/src/test/motion_vector_test.cc b/libs/libaom/src/test/motion_vector_test.cc
new file mode 100644
index 000000000..2636c39aa
--- /dev/null
+++ b/libs/libaom/src/test/motion_vector_test.cc
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+#define MAX_EXTREME_MV 1
+#define MIN_EXTREME_MV 2
+
+// Encoding modes
+const libaom_test::TestMode kEncodingModeVectors[] = {
+  ::libaom_test::kTwoPassGood,
+  ::libaom_test::kOnePassGood,
+};
+
+// Encoding speeds
+const int kCpuUsedVectors[] = { 1, 5 };
+
+// MV test modes: 1 - always use maximum MV; 2 - always use minimum MV.
+const int kMVTestModes[] = { MAX_EXTREME_MV, MIN_EXTREME_MV };
+
+class MotionVectorTestLarge
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+                                                 int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  MotionVectorTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {}
+
+  virtual ~MotionVectorTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libaom_test::kRealTime) {
+      cfg_.g_lag_in_frames = 3;
+      cfg_.rc_end_usage = AOM_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = AOM_CBR;
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    }
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_);
+      if (encoding_mode_ != ::libaom_test::kRealTime) {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      }
+    }
+  }
+
+  libaom_test::TestMode encoding_mode_;
+  int cpu_used_;
+  int mv_test_mode_;
+};
+
+TEST_P(MotionVectorTestLarge, OverallTest) {
+  int width = 3840;
+  int height = 2160;
+
+  // Reduce the test clip's resolution while testing on 32-bit system.
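+  // (A single 3840x2160 I420 frame already takes about 12 MB, and the encoder
+  // keeps several such buffers alive, so a 32-bit address space is easily
+  // exhausted.)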
+  if (sizeof(void *) == 4) {
+    width = 2048;
+    height = 360;
+  }
+
+  cfg_.rc_target_bitrate = 24000;
+  cfg_.g_profile = 0;
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  std::unique_ptr<libaom_test::YUVVideoSource> video;
+  video.reset(new libaom_test::YUVVideoSource(
+      "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, width, height, 30, 1, 0, 3));
+
+  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+AV1_INSTANTIATE_TEST_CASE(MotionVectorTestLarge,
+                          ::testing::ValuesIn(kEncodingModeVectors),
+                          ::testing::ValuesIn(kCpuUsedVectors),
+                          ::testing::ValuesIn(kMVTestModes));
+}  // namespace
diff --git a/libs/libaom/src/test/noise_model_test.cc b/libs/libaom/src/test/noise_model_test.cc
new file mode 100644
index 000000000..5b61236f0
--- /dev/null
+++ b/libs/libaom/src/test/noise_model_test.cc
@@ -0,0 +1,1343 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <algorithm>
+#include <vector>
+
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// Return normally distributed values with standard deviation of sigma.
+double randn(libaom_test::ACMRandom *random, double sigma) {
+  while (1) {
+    const double u = 2.0 * ((double)random->Rand31() /
+                            testing::internal::Random::kMaxRange) -
+                     1.0;
+    const double v = 2.0 * ((double)random->Rand31() /
+                            testing::internal::Random::kMaxRange) -
+                     1.0;
+    const double s = u * u + v * v;
+    if (s > 0 && s < 1) {
+      return sigma * (u * sqrt(-2.0 * log(s) / s));
+    }
+  }
+  return 0;
+}
+
+// Synthesizes noise using the auto-regressive filter of the given lag,
+// with the provided n coefficients sampled at the given coords.
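+// Concretely: each padded sample ends up as its Gaussian innovation plus
+// sum_{i < n} coeffs[i] * padded[(y + coords[i][1]) * padded_w + (x + coords[i][0])],
+// i.e. an AR(lag) process of the same form the noise model estimates.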
+void noise_synth(libaom_test::ACMRandom *random, int lag, int n,
+                 const int (*coords)[2], const double *coeffs, double *data,
+                 int w, int h) {
+  const int pad_size = 3 * lag;
+  const int padded_w = w + pad_size;
+  const int padded_h = h + pad_size;
+  int x = 0, y = 0;
+  std::vector<double> padded(padded_w * padded_h);
+
+  for (y = 0; y < padded_h; ++y) {
+    for (x = 0; x < padded_w; ++x) {
+      padded[y * padded_w + x] = randn(random, 1.0);
+    }
+  }
+  for (y = lag; y < padded_h; ++y) {
+    for (x = lag; x < padded_w; ++x) {
+      double sum = 0;
+      int i = 0;
+      for (i = 0; i < n; ++i) {
+        const int dx = coords[i][0];
+        const int dy = coords[i][1];
+        sum += padded[(y + dy) * padded_w + (x + dx)] * coeffs[i];
+      }
+      padded[y * padded_w + x] += sum;
+    }
+  }
+  // Copy over the padded rows to the output
+  for (y = 0; y < h; ++y) {
+    memcpy(data + y * w, &padded[0] + y * padded_w, sizeof(*data) * w);
+  }
+}
+
+std::vector<float> get_noise_psd(double *noise, int width, int height,
+                                 int block_size) {
+  float *block =
+      (float *)aom_memalign(32, block_size * block_size * sizeof(block));
+  std::vector<float> psd(block_size * block_size);
+  int num_blocks = 0;
+  struct aom_noise_tx_t *tx = aom_noise_tx_malloc(block_size);
+  for (int y = 0; y <= height - block_size; y += block_size / 2) {
+    for (int x = 0; x <= width - block_size; x += block_size / 2) {
+      for (int yy = 0; yy < block_size; ++yy) {
+        for (int xx = 0; xx < block_size; ++xx) {
+          block[yy * block_size + xx] = (float)noise[(y + yy) * width + x + xx];
+        }
+      }
+      aom_noise_tx_forward(tx, &block[0]);
+      aom_noise_tx_add_energy(tx, &psd[0]);
+      num_blocks++;
+    }
+  }
+  for (int yy = 0; yy < block_size; ++yy) {
+    for (int xx = 0; xx <= block_size / 2; ++xx) {
+      psd[yy * block_size + xx] /= num_blocks;
+    }
+  }
+  // Fill in the data that is missing due to symmetries
+  for (int xx = 1; xx < block_size / 2; ++xx) {
+    psd[(block_size - xx)] = psd[xx];
+  }
+  for (int yy = 1; yy < block_size; ++yy) {
+    for (int xx = 1; xx < block_size / 2; ++xx) {
+      psd[(block_size - yy) * block_size + (block_size - xx)] =
+          psd[yy * block_size + xx];
+    }
+  }
+  aom_noise_tx_free(tx);
+  aom_free(block);
+  return psd;
+}
+
+}  // namespace
+
+TEST(NoiseStrengthSolver, GetCentersTwoBins) {
+  aom_noise_strength_solver_t solver;
+  aom_noise_strength_solver_init(&solver, 2, 8);
+  EXPECT_NEAR(0, aom_noise_strength_solver_get_center(&solver, 0), 1e-5);
+  EXPECT_NEAR(255, aom_noise_strength_solver_get_center(&solver, 1), 1e-5);
+  aom_noise_strength_solver_free(&solver);
+}
+
+TEST(NoiseStrengthSolver, GetCentersTwoBins10bit) {
+  aom_noise_strength_solver_t solver;
+  aom_noise_strength_solver_init(&solver, 2, 10);
+  EXPECT_NEAR(0, aom_noise_strength_solver_get_center(&solver, 0), 1e-5);
+  EXPECT_NEAR(1023, aom_noise_strength_solver_get_center(&solver, 1), 1e-5);
+  aom_noise_strength_solver_free(&solver);
+}
+
+TEST(NoiseStrengthSolver, GetCenters256Bins) {
+  const int num_bins = 256;
+  aom_noise_strength_solver_t solver;
+  aom_noise_strength_solver_init(&solver, num_bins, 8);
+
+  for (int i = 0; i < 256; ++i) {
+    EXPECT_NEAR(i, aom_noise_strength_solver_get_center(&solver, i), 1e-5);
+  }
+  aom_noise_strength_solver_free(&solver);
+}
+
+// Tests that the noise strength solver returns the identity transform when
+// given identity-like constraints.
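+// ("Identity" meaning the fitted strength function satisfies f(x) ~= x, so
+// after solving, bin i should hold a value near i and the piecewise fit
+// should collapse to the two endpoints (0, 0) and (255, 255).)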
+TEST(NoiseStrengthSolver, ObserveIdentity) {
+  const int num_bins = 256;
+  aom_noise_strength_solver_t solver;
+  EXPECT_EQ(1, aom_noise_strength_solver_init(&solver, num_bins, 8));
+
+  // We have to add a bit more strength to constraints at the boundary to
+  // overcome any regularization.
+  for (int j = 0; j < 5; ++j) {
+    aom_noise_strength_solver_add_measurement(&solver, 0, 0);
+    aom_noise_strength_solver_add_measurement(&solver, 255, 255);
+  }
+  for (int i = 0; i < 256; ++i) {
+    aom_noise_strength_solver_add_measurement(&solver, i, i);
+  }
+  EXPECT_EQ(1, aom_noise_strength_solver_solve(&solver));
+  for (int i = 2; i < num_bins - 2; ++i) {
+    EXPECT_NEAR(i, solver.eqns.x[i], 0.1);
+  }
+
+  aom_noise_strength_lut_t lut;
+  EXPECT_EQ(1, aom_noise_strength_solver_fit_piecewise(&solver, 2, &lut));
+
+  ASSERT_EQ(2, lut.num_points);
+  EXPECT_NEAR(0.0, lut.points[0][0], 1e-5);
+  EXPECT_NEAR(0.0, lut.points[0][1], 0.5);
+  EXPECT_NEAR(255.0, lut.points[1][0], 1e-5);
+  EXPECT_NEAR(255.0, lut.points[1][1], 0.5);
+
+  aom_noise_strength_lut_free(&lut);
+  aom_noise_strength_solver_free(&solver);
+}
+
+TEST(NoiseStrengthSolver, SimplifiesCurve) {
+  const int num_bins = 256;
+  aom_noise_strength_solver_t solver;
+  EXPECT_EQ(1, aom_noise_strength_solver_init(&solver, num_bins, 8));
+
+  // Create a parabolic input
+  for (int i = 0; i < 256; ++i) {
+    const double x = (i - 127.5) / 63.5;
+    aom_noise_strength_solver_add_measurement(&solver, i, x * x);
+  }
+  EXPECT_EQ(1, aom_noise_strength_solver_solve(&solver));
+
+  // First try to fit an unconstrained lut
+  aom_noise_strength_lut_t lut;
+  EXPECT_EQ(1, aom_noise_strength_solver_fit_piecewise(&solver, -1, &lut));
+  ASSERT_LE(20, lut.num_points);
+  aom_noise_strength_lut_free(&lut);
+
+  // Now constrain the maximum number of points
+  const int kMaxPoints = 9;
+  EXPECT_EQ(1,
+            aom_noise_strength_solver_fit_piecewise(&solver, kMaxPoints, &lut));
+  ASSERT_EQ(kMaxPoints, lut.num_points);
+
+  // Check that the input parabola is still well represented
+  EXPECT_NEAR(0.0, lut.points[0][0], 1e-5);
+  EXPECT_NEAR(4.0, lut.points[0][1], 0.1);
+  for (int i = 1; i < lut.num_points - 1; ++i) {
+    const double x = (lut.points[i][0] - 128.) / 64.;
+    EXPECT_NEAR(x * x, lut.points[i][1], 0.1);
+  }
+  EXPECT_NEAR(255.0, lut.points[kMaxPoints - 1][0], 1e-5);
+
+  EXPECT_NEAR(4.0, lut.points[kMaxPoints - 1][1], 0.1);
+  aom_noise_strength_lut_free(&lut);
+  aom_noise_strength_solver_free(&solver);
+}
+
+TEST(NoiseStrengthLut, LutEvalSinglePoint) {
+  aom_noise_strength_lut_t lut;
+  ASSERT_TRUE(aom_noise_strength_lut_init(&lut, 1));
+  ASSERT_EQ(1, lut.num_points);
+  lut.points[0][0] = 0;
+  lut.points[0][1] = 1;
+  EXPECT_EQ(1, aom_noise_strength_lut_eval(&lut, -1));
+  EXPECT_EQ(1, aom_noise_strength_lut_eval(&lut, 0));
+  EXPECT_EQ(1, aom_noise_strength_lut_eval(&lut, 1));
+  aom_noise_strength_lut_free(&lut);
+}
+
+TEST(NoiseStrengthLut, LutEvalMultiPointInterp) {
+  const double kEps = 1e-5;
+  aom_noise_strength_lut_t lut;
+  ASSERT_TRUE(aom_noise_strength_lut_init(&lut, 4));
+  ASSERT_EQ(4, lut.num_points);
+
+  lut.points[0][0] = 0;
+  lut.points[0][1] = 0;
+
+  lut.points[1][0] = 1;
+  lut.points[1][1] = 1;
+
+  lut.points[2][0] = 2;
+  lut.points[2][1] = 1;
+
+  lut.points[3][0] = 100;
+  lut.points[3][1] = 1001;
+
+  // Test lower boundary
+  EXPECT_EQ(0, aom_noise_strength_lut_eval(&lut, -1));
+  EXPECT_EQ(0, aom_noise_strength_lut_eval(&lut, 0));
+
+  // Test first part that should be identity
+  EXPECT_NEAR(0.25, aom_noise_strength_lut_eval(&lut, 0.25), kEps);
+  EXPECT_NEAR(0.75, aom_noise_strength_lut_eval(&lut, 0.75), kEps);
+
+  // This is a constant section (should evaluate to 1)
+  EXPECT_NEAR(1.0, aom_noise_strength_lut_eval(&lut, 1.25), kEps);
+  EXPECT_NEAR(1.0, aom_noise_strength_lut_eval(&lut, 1.75), kEps);
+
+  // Test interpolation between two non-zero y coords.
+  EXPECT_NEAR(1, aom_noise_strength_lut_eval(&lut, 2), kEps);
+  EXPECT_NEAR(251, aom_noise_strength_lut_eval(&lut, 26.5), kEps);
+  EXPECT_NEAR(751, aom_noise_strength_lut_eval(&lut, 75.5), kEps);
+
+  // Test upper boundary
+  EXPECT_EQ(1001, aom_noise_strength_lut_eval(&lut, 100));
+  EXPECT_EQ(1001, aom_noise_strength_lut_eval(&lut, 101));
+
+  aom_noise_strength_lut_free(&lut);
+}
+
+TEST(NoiseModel, InitSuccessWithValidSquareShape) {
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 2, 8, 0 };
+  aom_noise_model_t model;
+
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+  const int kNumCoords = 12;
+  const int kCoords[][2] = { { -2, -2 }, { -1, -2 }, { 0, -2 }, { 1, -2 },
+                             { 2, -2 },  { -2, -1 }, { -1, -1 }, { 0, -1 },
+                             { 1, -1 },  { 2, -1 },  { -2, 0 },  { -1, 0 } };
+  EXPECT_EQ(kNumCoords, model.n);
+  for (int i = 0; i < kNumCoords; ++i) {
+    const int *coord = kCoords[i];
+    EXPECT_EQ(coord[0], model.coords[i][0]);
+    EXPECT_EQ(coord[1], model.coords[i][1]);
+  }
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModel, InitSuccessWithValidDiamondShape) {
+  aom_noise_model_t model;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_DIAMOND, 2, 8, 0 };
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+  EXPECT_EQ(6, model.n);
+  const int kNumCoords = 6;
+  const int kCoords[][2] = { { 0, -2 }, { -1, -1 }, { 0, -1 },
+                             { 1, -1 }, { -2, 0 },  { -1, 0 } };
+  EXPECT_EQ(kNumCoords, model.n);
+  for (int i = 0; i < kNumCoords; ++i) {
+    const int *coord = kCoords[i];
+    EXPECT_EQ(coord[0], model.coords[i][0]);
+    EXPECT_EQ(coord[1], model.coords[i][1]);
+  }
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModel, InitFailsWithTooLargeLag) {
+  aom_noise_model_t model;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 10, 8, 0 };
+  EXPECT_FALSE(aom_noise_model_init(&model, params));
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModel, InitFailsWithTooSmallLag) {
+  aom_noise_model_t model;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 0, 8, 0 };
+  EXPECT_FALSE(aom_noise_model_init(&model, params));
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModel, InitFailsWithInvalidShape) {
+  aom_noise_model_t model;
+  aom_noise_model_params_t params = { aom_noise_shape(100), 3, 8, 0 };
+  EXPECT_FALSE(aom_noise_model_init(&model, params));
+  aom_noise_model_free(&model);
+}
+
+// A container template class to hold a data type and extra arguments.
+// All of these args are bundled into one struct so that we can use
+// parameterized tests on combinations of supported data types
+// (uint8_t and uint16_t) and bit depths (8, 10, 12).
+template <typename T, int bit_depth, bool use_highbd>
+struct BitDepthParams {
+  typedef T data_type_t;
+  static const int kBitDepth = bit_depth;
+  static const bool kUseHighBD = use_highbd;
+};
+
+template <typename T>
+class FlatBlockEstimatorTest : public ::testing::Test, public T {
+ public:
+  virtual void SetUp() { random_.Reset(171); }
+  typedef std::vector<typename T::data_type_t> VecType;
+  VecType data_;
+  libaom_test::ACMRandom random_;
+};
+
+TYPED_TEST_SUITE_P(FlatBlockEstimatorTest);
+
+TYPED_TEST_P(FlatBlockEstimatorTest, ExtractBlock) {
+  const int kBlockSize = 16;
+  aom_flat_block_finder_t flat_block_finder;
+  ASSERT_EQ(1, aom_flat_block_finder_init(&flat_block_finder, kBlockSize,
+                                          this->kBitDepth, this->kUseHighBD));
+  const double normalization = flat_block_finder.normalization;
+
+  // Test with an image of more than one block.
+  const int h = 2 * kBlockSize;
+  const int w = 2 * kBlockSize;
+  const int stride = 2 * kBlockSize;
+  this->data_.resize(h * stride, 128);
+
+  // Set up the (0,0) block to be a plane and the (0,1) block to be a
+  // checkerboard
+  const int shift = this->kBitDepth - 8;
+  for (int y = 0; y < kBlockSize; ++y) {
+    for (int x = 0; x < kBlockSize; ++x) {
+      this->data_[y * stride + x] = (-y + x + 128) << shift;
+      this->data_[y * stride + x + kBlockSize] =
+          ((x % 2 + y % 2) % 2 ? 128 - 20 : 128 + 20) << shift;
+    }
+  }
+  std::vector<double> block(kBlockSize * kBlockSize, 1);
+  std::vector<double> plane(kBlockSize * kBlockSize, 1);
+
+  // The block data should be a constant (zero) and the rest of the plane
+  // trend is covered in the plane data.
+  aom_flat_block_finder_extract_block(&flat_block_finder,
+                                      (uint8_t *)&this->data_[0], w, h, stride,
+                                      0, 0, &plane[0], &block[0]);
+  for (int y = 0; y < kBlockSize; ++y) {
+    for (int x = 0; x < kBlockSize; ++x) {
+      EXPECT_NEAR(0, block[y * kBlockSize + x], 1e-5);
+      EXPECT_NEAR((double)(this->data_[y * stride + x]) / normalization,
+                  plane[y * kBlockSize + x], 1e-5);
+    }
+  }
+
+  // The plane trend is a constant, and the block is a zero mean checkerboard.
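+  // (The +/-20 checkerboard written above has zero mean, so the fitted plane
+  // should reduce to the DC value mid = 128 << shift while the residual block
+  // keeps the full checkerboard amplitude.)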
+  aom_flat_block_finder_extract_block(&flat_block_finder,
+                                      (uint8_t *)&this->data_[0], w, h, stride,
+                                      kBlockSize, 0, &plane[0], &block[0]);
+  const int mid = 128 << shift;
+  for (int y = 0; y < kBlockSize; ++y) {
+    for (int x = 0; x < kBlockSize; ++x) {
+      EXPECT_NEAR(((double)this->data_[y * stride + x + kBlockSize] - mid) /
+                      normalization,
+                  block[y * kBlockSize + x], 1e-5);
+      EXPECT_NEAR(mid / normalization, plane[y * kBlockSize + x], 1e-5);
+    }
+  }
+  aom_flat_block_finder_free(&flat_block_finder);
+}
+
+TYPED_TEST_P(FlatBlockEstimatorTest, FindFlatBlocks) {
+  const int kBlockSize = 32;
+  aom_flat_block_finder_t flat_block_finder;
+  ASSERT_EQ(1, aom_flat_block_finder_init(&flat_block_finder, kBlockSize,
+                                          this->kBitDepth, this->kUseHighBD));
+
+  const int num_blocks_w = 8;
+  const int h = kBlockSize;
+  const int w = kBlockSize * num_blocks_w;
+  const int stride = w;
+  this->data_.resize(h * stride, 128);
+  std::vector<uint8_t> flat_blocks(num_blocks_w, 0);
+
+  const int shift = this->kBitDepth - 8;
+  for (int y = 0; y < kBlockSize; ++y) {
+    for (int x = 0; x < kBlockSize; ++x) {
+      // Block 0 (not flat): constant doesn't have enough variance to qualify
+      this->data_[y * stride + x + 0 * kBlockSize] = 128 << shift;
+
+      // Block 1 (not flat): too high a variance is hard to validate as flat
+      this->data_[y * stride + x + 1 * kBlockSize] =
+          ((uint8_t)(128 + randn(&this->random_, 5))) << shift;
+
+      // Block 2 (flat): slight checkerboard added to constant
+      const int check = (x % 2 + y % 2) % 2 ? -2 : 2;
+      this->data_[y * stride + x + 2 * kBlockSize] = (128 + check) << shift;
+
+      // Block 3 (flat): planar block with checkerboard pattern is also flat
+      this->data_[y * stride + x + 3 * kBlockSize] =
+          (y * 2 - x / 2 + 128 + check) << shift;
+
+      // Block 4 (flat): gaussian random with standard deviation 1.
+      this->data_[y * stride + x + 4 * kBlockSize] =
+          ((uint8_t)(randn(&this->random_, 1) + x + 128.0)) << shift;
+
+      // Block 5 (flat): gaussian random with standard deviation 2.
+      this->data_[y * stride + x + 5 * kBlockSize] =
+          ((uint8_t)(randn(&this->random_, 2) + y + 128.0)) << shift;
+
+      // Block 6 (not flat): too high a directional gradient.
+      const int strong_edge = x > kBlockSize / 2 ? 64 : 0;
+      this->data_[y * stride + x + 6 * kBlockSize] =
+          ((uint8_t)(randn(&this->random_, 1) + strong_edge + 128.0)) << shift;
+
+      // Block 7 (not flat): too high gradient.
+      const int big_check = ((x >> 2) % 2 + (y >> 2) % 2) % 2 ? -16 : 16;
+      this->data_[y * stride + x + 7 * kBlockSize] =
+          ((uint8_t)(randn(&this->random_, 1) + big_check + 128.0)) << shift;
+    }
+  }
+
+  EXPECT_EQ(4, aom_flat_block_finder_run(&flat_block_finder,
+                                         (uint8_t *)&this->data_[0], w, h,
+                                         stride, &flat_blocks[0]));
+
+  // First two blocks are not flat
+  EXPECT_EQ(0, flat_blocks[0]);
+  EXPECT_EQ(0, flat_blocks[1]);
+
+  // Next 4 blocks are flat.
+  EXPECT_EQ(255, flat_blocks[2]);
+  EXPECT_EQ(255, flat_blocks[3]);
+  EXPECT_EQ(255, flat_blocks[4]);
+  EXPECT_EQ(255, flat_blocks[5]);
+
+  // Last 2 are not flat by threshold
+  EXPECT_EQ(0, flat_blocks[6]);
+  EXPECT_EQ(0, flat_blocks[7]);
+
+  // Add the noise from non-flat block 1 to every block.
+  for (int y = 0; y < kBlockSize; ++y) {
+    for (int x = 0; x < kBlockSize * num_blocks_w; ++x) {
+      this->data_[y * stride + x] +=
+          (this->data_[y * stride + x % kBlockSize + kBlockSize] -
+           (128 << shift));
+    }
+  }
+  // Now the scored selection will pick the one that is most likely flat (block
+  // 0)
+  EXPECT_EQ(1, aom_flat_block_finder_run(&flat_block_finder,
+                                         (uint8_t *)&this->data_[0], w, h,
+                                         stride, &flat_blocks[0]));
+  EXPECT_EQ(1, flat_blocks[0]);
+  EXPECT_EQ(0, flat_blocks[1]);
+  EXPECT_EQ(0, flat_blocks[2]);
+  EXPECT_EQ(0, flat_blocks[3]);
+  EXPECT_EQ(0, flat_blocks[4]);
+  EXPECT_EQ(0, flat_blocks[5]);
+  EXPECT_EQ(0, flat_blocks[6]);
+  EXPECT_EQ(0, flat_blocks[7]);
+
+  aom_flat_block_finder_free(&flat_block_finder);
+}
+
+REGISTER_TYPED_TEST_SUITE_P(FlatBlockEstimatorTest, ExtractBlock,
+                            FindFlatBlocks);
+
+typedef ::testing::Types<BitDepthParams<uint8_t, 8, false>,   // lowbd
+                         BitDepthParams<uint16_t, 8, true>,   // lowbd in 16-bit
+                         BitDepthParams<uint16_t, 10, true>,  // highbd data
+                         BitDepthParams<uint16_t, 12, true> >
+    AllBitDepthParams;
+INSTANTIATE_TYPED_TEST_SUITE_P(FlatBlockInstatiation, FlatBlockEstimatorTest,
+                               AllBitDepthParams);
+
+template <typename T>
+class NoiseModelUpdateTest : public ::testing::Test, public T {
+ public:
+  static const int kWidth = 128;
+  static const int kHeight = 128;
+  static const int kBlockSize = 16;
+  static const int kNumBlocksX = kWidth / kBlockSize;
+  static const int kNumBlocksY = kHeight / kBlockSize;
+
+  virtual void SetUp() {
+    const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
+                                              T::kBitDepth, T::kUseHighBD };
+    ASSERT_TRUE(aom_noise_model_init(&model_, params));
+
+    random_.Reset(100171);
+
+    data_.resize(kWidth * kHeight * 3);
+    denoised_.resize(kWidth * kHeight * 3);
+    noise_.resize(kWidth * kHeight * 3);
+    renoise_.resize(kWidth * kHeight);
+    flat_blocks_.resize(kNumBlocksX * kNumBlocksY);
+
+    for (int c = 0, offset = 0; c < 3; ++c, offset += kWidth * kHeight) {
+      data_ptr_[c] = &data_[offset];
+      noise_ptr_[c] = &noise_[offset];
+      denoised_ptr_[c] = &denoised_[offset];
+      strides_[c] = kWidth;
+
+      data_ptr_raw_[c] = (uint8_t *)&data_[offset];
+      denoised_ptr_raw_[c] = (uint8_t *)&denoised_[offset];
+    }
+    chroma_sub_[0] = 0;
+    chroma_sub_[1] = 0;
+  }
+
+  int NoiseModelUpdate(int block_size = kBlockSize) {
+    return aom_noise_model_update(&model_, data_ptr_raw_, denoised_ptr_raw_,
+                                  kWidth, kHeight, strides_, chroma_sub_,
+                                  &flat_blocks_[0], block_size);
+  }
+
+  void TearDown() { aom_noise_model_free(&model_); }
+
+ protected:
+  aom_noise_model_t model_;
+  std::vector<typename T::data_type_t> data_;
+  std::vector<typename T::data_type_t> denoised_;
+
+  std::vector<double> noise_;
+  std::vector<double> renoise_;
+  std::vector<uint8_t> flat_blocks_;
+
+  typename T::data_type_t *data_ptr_[3];
+  typename T::data_type_t *denoised_ptr_[3];
+
+  double *noise_ptr_[3];
+  int strides_[3];
+  int chroma_sub_[2];
+  libaom_test::ACMRandom random_;
+
+ private:
+  uint8_t *data_ptr_raw_[3];
+  uint8_t *denoised_ptr_raw_[3];
+};
+
+TYPED_TEST_SUITE_P(NoiseModelUpdateTest);
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateFailsNoFlatBlocks) {
+  EXPECT_EQ(AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS,
+            this->NoiseModelUpdate());
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForZeroNoiseAllFlat) {
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  this->denoised_.assign(this->denoised_.size(), 128);
+  this->data_.assign(this->denoised_.size(), 128);
+  EXPECT_EQ(AOM_NOISE_STATUS_INTERNAL_ERROR, this->NoiseModelUpdate());
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateFailsBlockSizeTooSmall) {
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  this->denoised_.assign(this->denoised_.size(), 128);
+  this->data_.assign(this->denoised_.size(), 128);
+  EXPECT_EQ(AOM_NOISE_STATUS_INVALID_ARGUMENT,
+            this->NoiseModelUpdate(6 /* block_size=6 is too small */));
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForWhiteRandomNoise) {
+  aom_noise_model_t &model = this->model_;
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+
+  const int shift = this->kBitDepth - 8;
+  for (int y = 0; y < kHeight; ++y) {
+    for (int x = 0; x < kWidth; ++x) {
+      this->data_ptr_[0][y * kWidth + x] =
+          int(64 + y + randn(&this->random_, 1)) << shift;
+      this->denoised_ptr_[0][y * kWidth + x] = (64 + y) << shift;
+      // Make the chroma planes completely correlated with the Y plane
+      for (int c = 1; c < 3; ++c) {
+        this->data_ptr_[c][y * kWidth + x] =
+            this->data_ptr_[0][y * kWidth + x];
+        this->denoised_ptr_[c][y * kWidth + x] =
+            this->denoised_ptr_[0][y * kWidth + x];
+      }
+    }
+  }
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  const double kCoeffEps = 0.075;
+  const int n = model.n;
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < n; ++i) {
+      EXPECT_NEAR(0, model.latest_state[c].eqns.x[i], kCoeffEps);
+      EXPECT_NEAR(0, model.combined_state[c].eqns.x[i], kCoeffEps);
+    }
+    // The second and third channels are highly correlated with the first.
+    if (c > 0) {
+      ASSERT_EQ(n + 1, model.latest_state[c].eqns.n);
+      ASSERT_EQ(n + 1, model.combined_state[c].eqns.n);
+
+      EXPECT_NEAR(1, model.latest_state[c].eqns.x[n], kCoeffEps);
+      EXPECT_NEAR(1, model.combined_state[c].eqns.x[n], kCoeffEps);
+    }
+  }
+
+  // The fitted noise strength should be close to the standard deviation
+  // for all intensity bins.
+  const double kStdEps = 0.1;
+  const double normalize = 1 << shift;
+
+  for (int i = 0; i < model.latest_state[0].strength_solver.eqns.n; ++i) {
+    EXPECT_NEAR(1.0,
+                model.latest_state[0].strength_solver.eqns.x[i] / normalize,
+                kStdEps);
+    EXPECT_NEAR(1.0,
+                model.combined_state[0].strength_solver.eqns.x[i] / normalize,
+                kStdEps);
+  }
+
+  aom_noise_strength_lut_t lut;
+  aom_noise_strength_solver_fit_piecewise(
+      &model.latest_state[0].strength_solver, -1, &lut);
+  ASSERT_EQ(2, lut.num_points);
+  EXPECT_NEAR(0.0, lut.points[0][0], 1e-5);
+  EXPECT_NEAR(1.0, lut.points[0][1] / normalize, kStdEps);
+  EXPECT_NEAR((1 << this->kBitDepth) - 1, lut.points[1][0], 1e-5);
+  EXPECT_NEAR(1.0, lut.points[1][1] / normalize, kStdEps);
+  aom_noise_strength_lut_free(&lut);
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForScaledWhiteNoise) {
+  aom_noise_model_t &model = this->model_;
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+
+  const double kCoeffEps = 0.055;
+  const double kLowStd = 1;
+  const double kHighStd = 4;
+  const int shift = this->kBitDepth - 8;
+  for (int y = 0; y < kHeight; ++y) {
+    for (int x = 0; x < kWidth; ++x) {
+      for (int c = 0; c < 3; ++c) {
+        // The image data is bimodal:
+        // Bottom half has low intensity and low noise strength
+        // Top half has high intensity and high noise strength
+        const int avg = (y < kHeight / 2) ? 4 : 245;
+        const double std = (y < kHeight / 2) ? kLowStd : kHighStd;
+        this->data_ptr_[c][y * kWidth + x] =
+            ((uint8_t)std::min((int)255,
+                               (int)(2 + avg + randn(&this->random_, std))))
+            << shift;
+        this->denoised_ptr_[c][y * kWidth + x] = (2 + avg) << shift;
+      }
+    }
+  }
+  // Label all blocks as flat for the update
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  const int n = model.n;
+  // The noise is uncorrelated spatially and with the y channel.
+  // All coefficients should be reasonably close to zero.
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < n; ++i) {
+      EXPECT_NEAR(0, model.latest_state[c].eqns.x[i], kCoeffEps);
+      EXPECT_NEAR(0, model.combined_state[c].eqns.x[i], kCoeffEps);
+    }
+    if (c > 0) {
+      ASSERT_EQ(n + 1, model.latest_state[c].eqns.n);
+      ASSERT_EQ(n + 1, model.combined_state[c].eqns.n);
+
+      // The correlation to the y channel should be low (near zero)
+      EXPECT_NEAR(0, model.latest_state[c].eqns.x[n], kCoeffEps);
+      EXPECT_NEAR(0, model.combined_state[c].eqns.x[n], kCoeffEps);
+    }
+  }
+
+  // Noise strength should vary between kLowStd and kHighStd.
+  const double kStdEps = 0.15;
+  // We have to normalize fitted standard deviation based on bit depth.
+  const double normalize = (1 << shift);
+
+  ASSERT_EQ(20, model.latest_state[0].strength_solver.eqns.n);
+  for (int i = 0; i < model.latest_state[0].strength_solver.eqns.n; ++i) {
+    const double a = i / 19.0;
+    const double expected = (kLowStd * (1.0 - a) + kHighStd * a);
+    EXPECT_NEAR(expected,
+                model.latest_state[0].strength_solver.eqns.x[i] / normalize,
+                kStdEps);
+    EXPECT_NEAR(expected,
+                model.combined_state[0].strength_solver.eqns.x[i] / normalize,
+                kStdEps);
+  }
+
+  // If we fit a piecewise linear model, there should be two points:
+  // one near kLowStd at 0, and the other near kHighStd at 255.
+  aom_noise_strength_lut_t lut;
+  aom_noise_strength_solver_fit_piecewise(
+      &model.latest_state[0].strength_solver, 2, &lut);
+  ASSERT_EQ(2, lut.num_points);
+  EXPECT_NEAR(0, lut.points[0][0], 1e-4);
+  EXPECT_NEAR(kLowStd, lut.points[0][1] / normalize, kStdEps);
+  EXPECT_NEAR((1 << this->kBitDepth) - 1, lut.points[1][0], 1e-5);
+  EXPECT_NEAR(kHighStd, lut.points[1][1] / normalize, kStdEps);
+  aom_noise_strength_lut_free(&lut);
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForCorrelatedNoise) {
+  aom_noise_model_t &model = this->model_;
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+  const int kNumCoeffs = 24;
+  const double kStd = 4;
+  const double kStdEps = 0.3;
+  const double kCoeffEps = 0.065;
+  // Use different coefficients for each channel
+  const double kCoeffs[3][24] = {
+    { 0.02884, -0.03356, 0.00633,  0.01757,  0.02849,  -0.04620,
+      0.02833, -0.07178, 0.07076,  -0.11603, -0.10413, -0.16571,
+      0.05158, -0.07969, 0.02640,  -0.07191, 0.02530,  0.41968,
+      0.21450, -0.00702, -0.01401, -0.03676, -0.08713, 0.44196 },
+    { 0.00269, -0.01291, -0.01513, 0.07234,  0.03208,   0.00477,
+      0.00226, -0.00254, 0.03533,  0.12841,  -0.25970,  -0.06336,
+      0.05238, -0.00845, -0.03118, 0.09043,  -0.36558,  0.48903,
+      0.00595, -0.11938, 0.02106,  0.095956, -0.350139, 0.59305 },
+    { -0.00643, -0.01080, -0.01466, 0.06951, 0.03707,  -0.00482,
+      0.00817,  -0.00909, 0.02949,  0.12181, -0.25210, -0.07886,
+      0.06083,  -0.01210, -0.03108, 0.08944, -0.35875, 0.49150,
+      0.00415,  -0.12905, 0.02870,  0.09740, -0.34610, 0.58824 },
+  };
+
+  ASSERT_EQ(model.n, kNumCoeffs);
+  this->chroma_sub_[0] = this->chroma_sub_[1] = 1;
+
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+
+  // Add different noise onto each plane
+  const int shift = this->kBitDepth - 8;
+  for (int c = 0; c < 3; ++c) {
+    noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+                kCoeffs[c], this->noise_ptr_[c], kWidth, kHeight);
+    const int x_shift = c > 0 ? this->chroma_sub_[0] : 0;
+    const int y_shift = c > 0 ? this->chroma_sub_[1] : 0;
+    for (int y = 0; y < (kHeight >> y_shift); ++y) {
+      for (int x = 0; x < (kWidth >> x_shift); ++x) {
+        const uint8_t value = 64 + x / 2 + y / 4;
+        this->data_ptr_[c][y * kWidth + x] =
+            (uint8_t(value + this->noise_ptr_[c][y * kWidth + x] * kStd))
+            << shift;
+        this->denoised_ptr_[c][y * kWidth + x] = value << shift;
+      }
+    }
+  }
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  // For the Y plane, the solved coefficients should be close to the original
+  const int n = model.n;
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < n; ++i) {
+      EXPECT_NEAR(kCoeffs[c][i], model.latest_state[c].eqns.x[i], kCoeffEps);
+      EXPECT_NEAR(kCoeffs[c][i], model.combined_state[c].eqns.x[i], kCoeffEps);
+    }
+    // The chroma planes should be uncorrelated with the luma plane
+    if (c > 0) {
+      EXPECT_NEAR(0, model.latest_state[c].eqns.x[n], kCoeffEps);
+      EXPECT_NEAR(0, model.combined_state[c].eqns.x[n], kCoeffEps);
+    }
+    // Correlation between the coefficient vector and the fitted coefficients
+    // should be close to 1.
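+    // (Normalized cross correlation here is <a, b> / (||a|| * ||b||), so a
+    // value near 1 means the fitted vector matches the ground-truth direction
+    // up to a positive scale factor.)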
+    EXPECT_LT(0.98, aom_normalized_cross_correlation(
+                        model.latest_state[c].eqns.x, kCoeffs[c], kNumCoeffs));
+
+    noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+                model.latest_state[c].eqns.x, &this->renoise_[0], kWidth,
+                kHeight);
+
+    EXPECT_TRUE(aom_noise_data_validate(&this->renoise_[0], kWidth, kHeight));
+  }
+
+  // Check fitted noise strength
+  const double normalize = 1 << shift;
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < model.latest_state[c].strength_solver.eqns.n; ++i) {
+      EXPECT_NEAR(kStd,
+                  model.latest_state[c].strength_solver.eqns.x[i] / normalize,
+                  kStdEps);
+    }
+  }
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest,
+             NoiseStrengthChangeSignalsDifferentNoiseType) {
+  aom_noise_model_t &model = this->model_;
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+  const int kBlockSize = this->kBlockSize;
+  // Create a gradient image with std = 2 uncorrelated noise
+  const double kStd = 2;
+  const int shift = this->kBitDepth - 8;
+
+  for (int i = 0; i < kWidth * kHeight; ++i) {
+    const uint8_t val = (i % kWidth) < kWidth / 2 ? 64 : 192;
+    for (int c = 0; c < 3; ++c) {
+      this->noise_ptr_[c][i] = randn(&this->random_, 1);
+      this->data_ptr_[c][i] = ((uint8_t)(this->noise_ptr_[c][i] * kStd + val))
+                              << shift;
+      this->denoised_ptr_[c][i] = val << shift;
+    }
+  }
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  const int kNumBlocks = kWidth * kHeight / kBlockSize / kBlockSize;
+  EXPECT_EQ(kNumBlocks, model.latest_state[0].strength_solver.num_equations);
+  EXPECT_EQ(kNumBlocks, model.latest_state[1].strength_solver.num_equations);
+  EXPECT_EQ(kNumBlocks, model.latest_state[2].strength_solver.num_equations);
+  EXPECT_EQ(kNumBlocks, model.combined_state[0].strength_solver.num_equations);
+  EXPECT_EQ(kNumBlocks, model.combined_state[1].strength_solver.num_equations);
+  EXPECT_EQ(kNumBlocks, model.combined_state[2].strength_solver.num_equations);
+
+  // Bump up noise by an insignificant amount
+  for (int i = 0; i < kWidth * kHeight; ++i) {
+    const uint8_t val = (i % kWidth) < kWidth / 2 ? 64 : 192;
+    this->data_ptr_[0][i] =
+        ((uint8_t)(this->noise_ptr_[0][i] * (kStd + 0.085) + val)) << shift;
+  }
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  const double kARGainTolerance = 0.02;
+  for (int c = 0; c < 3; ++c) {
+    EXPECT_EQ(kNumBlocks, model.latest_state[c].strength_solver.num_equations);
+    EXPECT_EQ(15250, model.latest_state[c].num_observations);
+    EXPECT_NEAR(1, model.latest_state[c].ar_gain, kARGainTolerance);
+
+    EXPECT_EQ(2 * kNumBlocks,
+              model.combined_state[c].strength_solver.num_equations);
+    EXPECT_EQ(2 * 15250, model.combined_state[c].num_observations);
+    EXPECT_NEAR(1, model.combined_state[c].ar_gain, kARGainTolerance);
+  }
+
+  // Bump up the noise strength on half the image for one channel by a
+  // significant amount.
+  for (int i = 0; i < kWidth * kHeight; ++i) {
+    const uint8_t val = (i % kWidth) < kWidth / 2 ? 64 : 128;
+    if (i % kWidth < kWidth / 2) {
+      this->data_ptr_[0][i] =
+          ((uint8_t)(randn(&this->random_, kStd + 0.5) + val)) << shift;
+    }
+  }
+  EXPECT_EQ(AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, this->NoiseModelUpdate());
+
+  // Since we didn't update the combined state, it should still be at 2 *
+  // num_blocks
+  EXPECT_EQ(kNumBlocks, model.latest_state[0].strength_solver.num_equations);
+  EXPECT_EQ(2 * kNumBlocks,
+            model.combined_state[0].strength_solver.num_equations);
+
+  // In normal operation, the "latest" estimate can be saved to the "combined"
+  // state for continued updates.
+  aom_noise_model_save_latest(&model);
+  for (int c = 0; c < 3; ++c) {
+    EXPECT_EQ(kNumBlocks, model.latest_state[c].strength_solver.num_equations);
+    EXPECT_EQ(15250, model.latest_state[c].num_observations);
+    EXPECT_NEAR(1, model.latest_state[c].ar_gain, kARGainTolerance);
+
+    EXPECT_EQ(kNumBlocks,
+              model.combined_state[c].strength_solver.num_equations);
+    EXPECT_EQ(15250, model.combined_state[c].num_observations);
+    EXPECT_NEAR(1, model.combined_state[c].ar_gain, kARGainTolerance);
+  }
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, NoiseCoeffsSignalsDifferentNoiseType) {
+  aom_noise_model_t &model = this->model_;
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+  const double kCoeffs[2][24] = {
+    { 0.02884, -0.03356, 0.00633,  0.01757,  0.02849,  -0.04620,
+      0.02833, -0.07178, 0.07076,  -0.11603, -0.10413, -0.16571,
+      0.05158, -0.07969, 0.02640,  -0.07191, 0.02530,  0.41968,
+      0.21450, -0.00702, -0.01401, -0.03676, -0.08713, 0.44196 },
+    { 0.00269, -0.01291, -0.01513, 0.07234,  0.03208,   0.00477,
+      0.00226, -0.00254, 0.03533,  0.12841,  -0.25970,  -0.06336,
+      0.05238, -0.00845, -0.03118, 0.09043,  -0.36558,  0.48903,
+      0.00595, -0.11938, 0.02106,  0.095956, -0.350139, 0.59305 }
+  };
+
+  noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+              kCoeffs[0], this->noise_ptr_[0], kWidth, kHeight);
+  for (int i = 0; i < kWidth * kHeight; ++i) {
+    this->data_ptr_[0][i] = (uint8_t)(128 + this->noise_ptr_[0][i]);
+  }
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  // Now try with the second set of AR coefficients
+  noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+              kCoeffs[1], this->noise_ptr_[0], kWidth, kHeight);
+  for (int i = 0; i < kWidth * kHeight; ++i) {
+    this->data_ptr_[0][i] = (uint8_t)(128 + this->noise_ptr_[0][i]);
+  }
+  EXPECT_EQ(AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, this->NoiseModelUpdate());
+}
+
+REGISTER_TYPED_TEST_SUITE_P(NoiseModelUpdateTest, UpdateFailsNoFlatBlocks,
+                            UpdateSuccessForZeroNoiseAllFlat,
+                            UpdateFailsBlockSizeTooSmall,
+                            UpdateSuccessForWhiteRandomNoise,
+                            UpdateSuccessForScaledWhiteNoise,
+                            UpdateSuccessForCorrelatedNoise,
+                            NoiseStrengthChangeSignalsDifferentNoiseType,
+                            NoiseCoeffsSignalsDifferentNoiseType);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(NoiseModelUpdateTestInstatiation,
+                               NoiseModelUpdateTest, AllBitDepthParams);
+
+TEST(NoiseModelGetGrainParameters, TestLagSize) {
+  aom_film_grain_t film_grain;
+  for (int lag = 1; lag <= 3; ++lag) {
+    aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+    aom_noise_model_t model;
+    EXPECT_TRUE(aom_noise_model_init(&model, params));
+    EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+    EXPECT_EQ(lag, film_grain.ar_coeff_lag);
+    aom_noise_model_free(&model);
+  }
+
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 4, 8, 0 };
+  aom_noise_model_t model;
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+  EXPECT_FALSE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModelGetGrainParameters, TestARCoeffShiftBounds) {
+  struct TestCase {
+    double max_input_value;
+    int expected_ar_coeff_shift;
+    int expected_value;
+  };
+  const int lag = 1;
+  const int kNumTestCases = 19;
+  const TestCase test_cases[] = {
+    // Test cases for ar_coeff_shift = 9
+    { 0, 9, 0 },
+    { 0.125, 9, 64 },
+    { -0.125, 9, -64 },
+    { 0.2499, 9, 127 },
+    { -0.25, 9, -128 },
+    // Test cases for ar_coeff_shift = 8
+    { 0.25, 8, 64 },
+    { -0.2501, 8, -64 },
+    { 0.499, 8, 127 },
+    { -0.5, 8, -128 },
+    // Test cases for ar_coeff_shift = 7
+    { 0.5, 7, 64 },
+    { -0.5001, 7, -64 },
+    { 0.999, 7, 127 },
+    { -1, 7, -128 },
+    // Test cases for ar_coeff_shift = 6
+    { 1.0, 6, 64 },
+    { -1.0001, 6, -64 },
+    { 2.0, 6, 127 },
+    { -2.0, 6, -128 },
+    { 4, 6, 127 },
+    { -4, 6, -128 },
+  };
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+  aom_noise_model_t model;
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+  for (int i = 0; i < kNumTestCases; ++i) {
+    const TestCase &test_case = test_cases[i];
+    model.combined_state[0].eqns.x[0] = test_case.max_input_value;
+
+    aom_film_grain_t film_grain;
+    EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+    EXPECT_EQ(1, film_grain.ar_coeff_lag);
+    EXPECT_EQ(test_case.expected_ar_coeff_shift, film_grain.ar_coeff_shift);
+    EXPECT_EQ(test_case.expected_value, film_grain.ar_coeffs_y[0]);
+  }
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModelGetGrainParameters, TestNoiseStrengthShiftBounds) {
+  struct TestCase {
+    double max_input_value;
+    int expected_scaling_shift;
+    int expected_value;
+  };
+  const int kNumTestCases = 10;
+  const TestCase test_cases[] = {
+    { 0, 11, 0 },      { 1, 11, 64 },     { 2, 11, 128 }, { 3.99, 11, 255 },
+    { 4, 10, 128 },    { 7.99, 10, 255 }, { 8, 9, 128 },  { 16, 8, 128 },
+    { 31.99, 8, 255 }, { 64, 8, 255 },  // clipped
+  };
+  const int lag = 1;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+  aom_noise_model_t model;
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+  for (int i = 0; i < kNumTestCases; ++i) {
+    const TestCase &test_case = test_cases[i];
+    aom_equation_system_t &eqns = model.combined_state[0].strength_solver.eqns;
+    // Set the fitted scale parameters to be a constant value.
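+    // A constant strength curve should collapse the piecewise fit to a single
+    // flat segment, which isolates the scaling_shift quantization behavior
+    // exercised by this test.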
+    for (int j = 0; j < eqns.n; ++j) {
+      eqns.x[j] = test_case.max_input_value;
+    }
+    aom_film_grain_t film_grain;
+    EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+    // We expect a single constant segment
+    EXPECT_EQ(test_case.expected_scaling_shift, film_grain.scaling_shift);
+    EXPECT_EQ(test_case.expected_value, film_grain.scaling_points_y[0][1]);
+    EXPECT_EQ(test_case.expected_value, film_grain.scaling_points_y[1][1]);
+  }
+  aom_noise_model_free(&model);
+}
+
+// The AR coefficients are the same inputs used to generate "Test 2" in the
+// test vectors
+TEST(NoiseModelGetGrainParameters, GetGrainParametersReal) {
+  const double kInputCoeffsY[] = { 0.0315,  0.0073,  0.0218,  0.00235, 0.00511,
+                                   -0.0222, 0.0627,  -0.022,  0.05575, -0.1816,
+                                   0.0107,  -0.1966, 0.00065, -0.0809, 0.04934,
+                                   -0.1349, -0.0352, 0.41772, 0.27973, 0.04207,
+                                   -0.0429, -0.1372, 0.06193, 0.52032 };
+  const double kInputCoeffsCB[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5 };
+  const double kInputCoeffsCR[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.5 };
+  const int kExpectedARCoeffsY[] = { 4,  1,   3,  0,   1,  -3,  8,  -3,
+                                     7,  -23, 1,  -25, 0,  -10, 6,  -17,
+                                     -5, 53,  36, 5,   -5, -18, 8,  67 };
+  const int kExpectedARCoeffsCB[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84 };
+  const int kExpectedARCoeffsCR[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -126 };
+  // Scaling function is initialized analytically with a sqrt function.
+  const int kNumScalingPointsY = 12;
+  const int kExpectedScalingPointsY[][2] = {
+    { 0, 0 },     { 13, 44 },   { 27, 62 },   { 40, 76 },
+    { 54, 88 },   { 67, 98 },   { 94, 117 },  { 121, 132 },
+    { 148, 146 }, { 174, 159 }, { 201, 171 }, { 255, 192 },
+  };
+
+  const int lag = 3;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+  aom_noise_model_t model;
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+  // Setup the AR coeffs
+  memcpy(model.combined_state[0].eqns.x, kInputCoeffsY, sizeof(kInputCoeffsY));
+  memcpy(model.combined_state[1].eqns.x, kInputCoeffsCB,
+         sizeof(kInputCoeffsCB));
+  memcpy(model.combined_state[2].eqns.x, kInputCoeffsCR,
+         sizeof(kInputCoeffsCR));
+  for (int i = 0; i < model.combined_state[0].strength_solver.num_bins; ++i) {
+    const double x =
+        ((double)i) / (model.combined_state[0].strength_solver.num_bins - 1.0);
+    model.combined_state[0].strength_solver.eqns.x[i] = 6 * sqrt(x);
+    model.combined_state[1].strength_solver.eqns.x[i] = 3;
+    model.combined_state[2].strength_solver.eqns.x[i] = 2;
+
+    // Inject some observations into the strength solver, as during film grain
+    // parameter extraction an estimate of the average strength will be used to
+    // adjust correlation.
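+    // (Filling row i of the A matrix with ones below simply marks every bin
+    // as observed; the particular weights are not what this test checks.)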
+    const int n = model.combined_state[0].strength_solver.num_bins;
+    for (int j = 0; j < model.combined_state[0].strength_solver.num_bins;
+         ++j) {
+      model.combined_state[0].strength_solver.eqns.A[i * n + j] = 1;
+      model.combined_state[1].strength_solver.eqns.A[i * n + j] = 1;
+      model.combined_state[2].strength_solver.eqns.A[i * n + j] = 1;
+    }
+  }
+
+  aom_film_grain_t film_grain;
+  EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+  EXPECT_EQ(lag, film_grain.ar_coeff_lag);
+  EXPECT_EQ(3, film_grain.ar_coeff_lag);
+  EXPECT_EQ(7, film_grain.ar_coeff_shift);
+  EXPECT_EQ(10, film_grain.scaling_shift);
+  EXPECT_EQ(kNumScalingPointsY, film_grain.num_y_points);
+  EXPECT_EQ(1, film_grain.update_parameters);
+  EXPECT_EQ(1, film_grain.apply_grain);
+
+  const int kNumARCoeffs = 24;
+  for (int i = 0; i < kNumARCoeffs; ++i) {
+    EXPECT_EQ(kExpectedARCoeffsY[i], film_grain.ar_coeffs_y[i]);
+  }
+  for (int i = 0; i < kNumARCoeffs + 1; ++i) {
+    EXPECT_EQ(kExpectedARCoeffsCB[i], film_grain.ar_coeffs_cb[i]);
+  }
+  for (int i = 0; i < kNumARCoeffs + 1; ++i) {
+    EXPECT_EQ(kExpectedARCoeffsCR[i], film_grain.ar_coeffs_cr[i]);
+  }
+  for (int i = 0; i < kNumScalingPointsY; ++i) {
+    EXPECT_EQ(kExpectedScalingPointsY[i][0], film_grain.scaling_points_y[i][0]);
+    EXPECT_EQ(kExpectedScalingPointsY[i][1], film_grain.scaling_points_y[i][1]);
+  }
+
+  // CB strength should just be a piecewise segment
+  EXPECT_EQ(2, film_grain.num_cb_points);
+  EXPECT_EQ(0, film_grain.scaling_points_cb[0][0]);
+  EXPECT_EQ(255, film_grain.scaling_points_cb[1][0]);
+  EXPECT_EQ(96, film_grain.scaling_points_cb[0][1]);
+  EXPECT_EQ(96, film_grain.scaling_points_cb[1][1]);
+
+  // CR strength should just be a piecewise segment
+  EXPECT_EQ(2, film_grain.num_cr_points);
+  EXPECT_EQ(0, film_grain.scaling_points_cr[0][0]);
+  EXPECT_EQ(255, film_grain.scaling_points_cr[1][0]);
+  EXPECT_EQ(64, film_grain.scaling_points_cr[0][1]);
+  EXPECT_EQ(64, film_grain.scaling_points_cr[1][1]);
+
+  EXPECT_EQ(128, film_grain.cb_mult);
+  EXPECT_EQ(192, film_grain.cb_luma_mult);
+  EXPECT_EQ(256, film_grain.cb_offset);
+  EXPECT_EQ(128, film_grain.cr_mult);
+  EXPECT_EQ(192, film_grain.cr_luma_mult);
+  EXPECT_EQ(256, film_grain.cr_offset);
+  EXPECT_EQ(0, film_grain.chroma_scaling_from_luma);
+  EXPECT_EQ(0, film_grain.grain_scale_shift);
+
+  aom_noise_model_free(&model);
+}
+
+template <typename T>
+class WienerDenoiseTest : public ::testing::Test, public T {
+ public:
+  static void SetUpTestCase() { aom_dsp_rtcd(); }
+
+ protected:
+  void SetUp() {
+    static const float kNoiseLevel = 5.f;
+    static const float kStd = 4.0;
+    static const double kMaxValue = (1 << T::kBitDepth) - 1;
+
+    chroma_sub_[0] = 1;
+    chroma_sub_[1] = 1;
+    stride_[0] = kWidth;
+    stride_[1] = kWidth / 2;
+    stride_[2] = kWidth / 2;
+    for (int k = 0; k < 3; ++k) {
+      data_[k].resize(kWidth * kHeight);
+      denoised_[k].resize(kWidth * kHeight);
+      noise_psd_[k].resize(kBlockSize * kBlockSize);
+    }
+
+    const double kCoeffsY[] = { 0.0406, -0.116, -0.078, -0.152, 0.0033, -0.093,
+                                0.048,  0.404,  0.2353, -0.035, -0.093, 0.441 };
+    const int kCoords[12][2] = {
+      { -2, -2 }, { -1, -2 }, { 0, -2 }, { 1, -2 }, { 2, -2 }, { -2, -1 },
+      { -1, -1 }, { 0, -1 },  { 1, -1 }, { 2, -1 }, { -2, 0 }, { -1, 0 }
+    };
+    const int kLag = 2;
+    const int kLength = 12;
+    libaom_test::ACMRandom random;
+    std::vector<double> noise(kWidth * kHeight);
+    noise_synth(&random, kLag, kLength, kCoords, kCoeffsY, &noise[0], kWidth,
+                kHeight);
+    noise_psd_[0] = get_noise_psd(&noise[0], kWidth, kHeight, kBlockSize);
+    for (int i = 0; i < kBlockSize * kBlockSize; ++i) {
+      noise_psd_[0][i] = (float)(noise_psd_[0][i] * kStd * kStd * kScaleNoise *
+                                 kScaleNoise / (kMaxValue * kMaxValue));
+    }
+
+    float psd_value =
+        aom_noise_psd_get_default_value(kBlockSizeChroma, kNoiseLevel);
+    for (int i = 0; i < kBlockSizeChroma * kBlockSizeChroma; ++i) {
+      noise_psd_[1][i] = psd_value;
+      noise_psd_[2][i] = psd_value;
+    }
+    for (int y = 0; y < kHeight; ++y) {
+      for (int x = 0; x < kWidth; ++x) {
+        data_[0][y * stride_[0] + x] = (typename T::data_type_t)fclamp(
+            (x + noise[y * stride_[0] + x] * kStd) * kScaleNoise, 0, kMaxValue);
+      }
+    }
+
+    for (int c = 1; c < 3; ++c) {
+      for (int y = 0; y < (kHeight >> 1); ++y) {
+        for (int x = 0; x < (kWidth >> 1); ++x) {
+          data_[c][y * stride_[c] + x] = (typename T::data_type_t)fclamp(
+              (x + randn(&random, kStd)) * kScaleNoise, 0, kMaxValue);
+        }
+      }
+    }
+    for (int k = 0; k < 3; ++k) {
+      noise_psd_ptrs_[k] = &noise_psd_[k][0];
+    }
+  }
+  static const int kBlockSize = 32;
+  static const int kBlockSizeChroma = 16;
+  static const int kWidth = 256;
+  static const int kHeight = 256;
+  static const int kScaleNoise = 1 << (T::kBitDepth - 8);
+
+  std::vector<typename T::data_type_t> data_[3];
+  std::vector<typename T::data_type_t> denoised_[3];
+  std::vector<float> noise_psd_[3];
+  int chroma_sub_[2];
+  float *noise_psd_ptrs_[3];
+  int stride_[3];
+};
+
+TYPED_TEST_SUITE_P(WienerDenoiseTest);
+
+TYPED_TEST_P(WienerDenoiseTest, InvalidBlockSize) {
+  const uint8_t *const data_ptrs[3] = {
+    reinterpret_cast<const uint8_t *>(&this->data_[0][0]),
+    reinterpret_cast<const uint8_t *>(&this->data_[1][0]),
+    reinterpret_cast<const uint8_t *>(&this->data_[2][0]),
+  };
+  uint8_t *denoised_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+  };
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_,
+                                     this->chroma_sub_, this->noise_psd_ptrs_,
+                                     18, this->kBitDepth, this->kUseHighBD));
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_,
+                                     this->chroma_sub_, this->noise_psd_ptrs_,
+                                     48, this->kBitDepth, this->kUseHighBD));
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_,
+                                     this->chroma_sub_, this->noise_psd_ptrs_,
+                                     64, this->kBitDepth, this->kUseHighBD));
+}
+
+TYPED_TEST_P(WienerDenoiseTest, InvalidChromaSubsampling) {
+  const uint8_t *const data_ptrs[3] = {
+    reinterpret_cast<const uint8_t *>(&this->data_[0][0]),
+    reinterpret_cast<const uint8_t *>(&this->data_[1][0]),
+    reinterpret_cast<const uint8_t *>(&this->data_[2][0]),
+  };
+  uint8_t *denoised_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+  };
+  int chroma_sub[2] = { 1, 0 };
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_, chroma_sub,
+                                     this->noise_psd_ptrs_, 32, this->kBitDepth,
+                                     this->kUseHighBD));
+
+  chroma_sub[0] = 0;
+  chroma_sub[1] = 1;
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_, chroma_sub,
+                                     this->noise_psd_ptrs_, 32, this->kBitDepth,
+                                     this->kUseHighBD));
+}
+
+TYPED_TEST_P(WienerDenoiseTest, GradientTest) {
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+  const int kBlockSize = this->kBlockSize;
+  const uint8_t *const data_ptrs[3] = {
+    reinterpret_cast<const uint8_t *>(&this->data_[0][0]),
+    reinterpret_cast<const uint8_t *>(&this->data_[1][0]),
+    reinterpret_cast<const uint8_t *>(&this->data_[2][0]),
+  };
+  uint8_t *denoised_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+  };
+  const int ret = aom_wiener_denoise_2d(
+      data_ptrs, denoised_ptrs, kWidth, kHeight, this->stride_,
+      this->chroma_sub_, this->noise_psd_ptrs_, this->kBlockSize,
+      this->kBitDepth, this->kUseHighBD);
+  EXPECT_EQ(1, ret);
+
+  // Check the noise on the denoised image (from the analytical gradient)
+  // and make sure that it is less than what we added.
+  for (int c = 0; c < 3; ++c) {
+    std::vector<double> measured_noise(kWidth * kHeight);
+
+    double var = 0;
+    const int shift = (c > 0);
+    for (int x = 0; x < (kWidth >> shift); ++x) {
+      for (int y = 0; y < (kHeight >> shift); ++y) {
+        const double diff = this->denoised_[c][y * this->stride_[c] + x] -
+                            x * this->kScaleNoise;
+        var += diff * diff;
+        measured_noise[y * kWidth + x] = diff;
+      }
+    }
+    var /= (kWidth * kHeight);
+    const double std = sqrt(std::max(0.0, var));
+    EXPECT_LE(std, 1.25f * this->kScaleNoise);
+    if (c == 0) {
+      std::vector<float> measured_psd =
+          get_noise_psd(&measured_noise[0], kWidth, kHeight, kBlockSize);
+      std::vector<double> measured_psd_d(kBlockSize * kBlockSize);
+      std::vector<double> noise_psd_d(kBlockSize * kBlockSize);
+      std::copy(measured_psd.begin(), measured_psd.end(),
+                measured_psd_d.begin());
+      std::copy(this->noise_psd_[0].begin(), this->noise_psd_[0].end(),
+                noise_psd_d.begin());
+      EXPECT_LT(
+          aom_normalized_cross_correlation(&measured_psd_d[0], &noise_psd_d[0],
+                                           (int)(noise_psd_d.size())),
+          0.35);
+    }
+  }
+}
+
+REGISTER_TYPED_TEST_SUITE_P(WienerDenoiseTest, InvalidBlockSize,
+                            InvalidChromaSubsampling, GradientTest);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(WienerDenoiseTestInstatiation, WienerDenoiseTest,
+                               AllBitDepthParams);
diff --git a/libs/libaom/src/test/obmc_sad_test.cc b/libs/libaom/src/test/obmc_sad_test.cc
new file mode 100644
index 000000000..6b4382cd7
--- /dev/null
+++ b/libs/libaom/src/test/obmc_sad_test.cc
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
+
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+static const int kIterations = 1000;
+static const int kMaskMax = 64;
+
+typedef unsigned int (*ObmcSadF)(const uint8_t *pre, int pre_stride,
+                                 const int32_t *wsrc, const int32_t *mask);
+typedef libaom_test::FuncParam<ObmcSadF> TestFuncs;
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+class ObmcSadTest : public FunctionEquivalenceTest<ObmcSadF> {};
+
+TEST_P(ObmcSadTest, RandomValues) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    const int pre_stride = rng_(MAX_SB_SIZE + 1);
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = rng_.Rand8();
+      wsrc[i] = rng_.Rand8() * rng_(kMaskMax * kMaskMax + 1);
+      mask[i] = rng_(kMaskMax * kMaskMax + 1);
+    }
+
+    const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(tst_res =
+                                 params_.tst_func(pre, pre_stride, wsrc, mask));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+TEST_P(ObmcSadTest, ExtremeValues) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+    const int pre_stride = iter;
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = UINT8_MAX;
+      wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax;
+      mask[i] = kMaskMax * kMaskMax;
+    }
+
+    const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(tst_res =
+                                 params_.tst_func(pre, pre_stride, wsrc, mask));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+#if HAVE_SSE4_1
+const ObmcSadTest::ParamType sse4_functions[] = {
+  TestFuncs(aom_obmc_sad128x128_c, aom_obmc_sad128x128_sse4_1),
+  TestFuncs(aom_obmc_sad128x64_c, aom_obmc_sad128x64_sse4_1),
+  TestFuncs(aom_obmc_sad64x128_c, aom_obmc_sad64x128_sse4_1),
+  TestFuncs(aom_obmc_sad64x64_c, aom_obmc_sad64x64_sse4_1),
+  TestFuncs(aom_obmc_sad64x32_c, aom_obmc_sad64x32_sse4_1),
+  TestFuncs(aom_obmc_sad32x64_c, aom_obmc_sad32x64_sse4_1),
+  TestFuncs(aom_obmc_sad32x32_c, aom_obmc_sad32x32_sse4_1),
+  TestFuncs(aom_obmc_sad32x16_c, aom_obmc_sad32x16_sse4_1),
+  TestFuncs(aom_obmc_sad16x32_c, aom_obmc_sad16x32_sse4_1),
+  TestFuncs(aom_obmc_sad16x16_c, aom_obmc_sad16x16_sse4_1),
+  TestFuncs(aom_obmc_sad16x8_c, aom_obmc_sad16x8_sse4_1),
+  TestFuncs(aom_obmc_sad8x16_c, aom_obmc_sad8x16_sse4_1),
+  TestFuncs(aom_obmc_sad8x8_c, aom_obmc_sad8x8_sse4_1),
+  TestFuncs(aom_obmc_sad8x4_c, aom_obmc_sad8x4_sse4_1),
+  TestFuncs(aom_obmc_sad4x8_c, aom_obmc_sad4x8_sse4_1),
+  TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_sse4_1),
+
+  TestFuncs(aom_obmc_sad64x16_c, aom_obmc_sad64x16_sse4_1),
+  TestFuncs(aom_obmc_sad16x64_c, aom_obmc_sad16x64_sse4_1),
+  TestFuncs(aom_obmc_sad32x8_c, aom_obmc_sad32x8_sse4_1),
+  TestFuncs(aom_obmc_sad8x32_c, aom_obmc_sad8x32_sse4_1),
+  TestFuncs(aom_obmc_sad16x4_c, aom_obmc_sad16x4_sse4_1),
+  TestFuncs(aom_obmc_sad4x16_c, aom_obmc_sad4x16_sse4_1),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcSadTest,
+                         ::testing::ValuesIn(sse4_functions));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+const ObmcSadTest::ParamType avx2_functions[] = {
+  TestFuncs(aom_obmc_sad128x128_c, aom_obmc_sad128x128_avx2),
+  TestFuncs(aom_obmc_sad128x64_c, aom_obmc_sad128x64_avx2),
+  TestFuncs(aom_obmc_sad64x128_c, aom_obmc_sad64x128_avx2),
+  TestFuncs(aom_obmc_sad64x64_c, aom_obmc_sad64x64_avx2),
+  TestFuncs(aom_obmc_sad64x32_c, aom_obmc_sad64x32_avx2),
+  TestFuncs(aom_obmc_sad32x64_c, aom_obmc_sad32x64_avx2),
+  TestFuncs(aom_obmc_sad32x32_c, aom_obmc_sad32x32_avx2),
+  TestFuncs(aom_obmc_sad32x16_c, aom_obmc_sad32x16_avx2),
+  TestFuncs(aom_obmc_sad16x32_c, aom_obmc_sad16x32_avx2),
+  TestFuncs(aom_obmc_sad16x16_c, aom_obmc_sad16x16_avx2),
+  TestFuncs(aom_obmc_sad16x8_c, aom_obmc_sad16x8_avx2),
+  TestFuncs(aom_obmc_sad8x16_c, aom_obmc_sad8x16_avx2),
+  TestFuncs(aom_obmc_sad8x8_c, aom_obmc_sad8x8_avx2),
+  TestFuncs(aom_obmc_sad8x4_c, aom_obmc_sad8x4_avx2),
+  TestFuncs(aom_obmc_sad4x8_c, aom_obmc_sad4x8_avx2),
+  TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_avx2),
+
+  TestFuncs(aom_obmc_sad64x16_c, aom_obmc_sad64x16_avx2),
+  TestFuncs(aom_obmc_sad16x64_c, aom_obmc_sad16x64_avx2),
+  TestFuncs(aom_obmc_sad32x8_c, aom_obmc_sad32x8_avx2),
+  TestFuncs(aom_obmc_sad8x32_c, aom_obmc_sad8x32_avx2),
+  TestFuncs(aom_obmc_sad16x4_c, aom_obmc_sad16x4_avx2),
+  TestFuncs(aom_obmc_sad4x16_c, aom_obmc_sad4x16_avx2),
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, ObmcSadTest,
+                         ::testing::ValuesIn(avx2_functions));
+#endif  // HAVE_AVX2
+
+#if CONFIG_AV1_HIGHBITDEPTH
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+class ObmcSadHBDTest : public FunctionEquivalenceTest<ObmcSadF> {};
+
+TEST_P(ObmcSadHBDTest, RandomValues) {
+  DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    const int pre_stride = rng_(MAX_SB_SIZE + 1);
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = rng_(1 << 12);
+      wsrc[i] = rng_(1 << 12) * rng_(kMaskMax * kMaskMax + 1);
+      mask[i] = rng_(kMaskMax * kMaskMax + 1);
+    }
+
+    const unsigned int ref_res =
+        params_.ref_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(
+        tst_res =
+            params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+TEST_P(ObmcSadHBDTest, ExtremeValues) {
+  DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+    const int pre_stride = iter;
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = (1 << 12) - 1;
+      wsrc[i] = ((1 << 12) - 1) * kMaskMax * kMaskMax;
+      mask[i] = kMaskMax * kMaskMax;
+    }
+
+    const unsigned int ref_res =
+        params_.ref_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(
+        tst_res =
+            params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+#if HAVE_SSE4_1
+ObmcSadHBDTest::ParamType sse4_functions_hbd[] = {
+  TestFuncs(aom_highbd_obmc_sad128x128_c,
+            aom_highbd_obmc_sad128x128_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad128x64_c, aom_highbd_obmc_sad128x64_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad64x128_c, aom_highbd_obmc_sad64x128_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad64x64_c, aom_highbd_obmc_sad64x64_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad64x32_c, aom_highbd_obmc_sad64x32_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad32x64_c, aom_highbd_obmc_sad32x64_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad32x32_c, aom_highbd_obmc_sad32x32_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad32x16_c, aom_highbd_obmc_sad32x16_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x32_c, aom_highbd_obmc_sad16x32_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x16_c, aom_highbd_obmc_sad16x16_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x8_c, aom_highbd_obmc_sad16x8_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad8x16_c, aom_highbd_obmc_sad8x16_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad8x8_c, aom_highbd_obmc_sad8x8_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad8x4_c, aom_highbd_obmc_sad8x4_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad4x8_c, aom_highbd_obmc_sad4x8_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_sse4_1),
+
+  TestFuncs(aom_highbd_obmc_sad64x16_c, aom_highbd_obmc_sad64x16_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x64_c, aom_highbd_obmc_sad16x64_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad32x8_c, aom_highbd_obmc_sad32x8_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad8x32_c, aom_highbd_obmc_sad8x32_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x4_c, aom_highbd_obmc_sad16x4_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad4x16_c, aom_highbd_obmc_sad4x16_sse4_1),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcSadHBDTest,
+                         ::testing::ValuesIn(sse4_functions_hbd));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+ObmcSadHBDTest::ParamType avx2_functions_hbd[] = {
+  TestFuncs(aom_highbd_obmc_sad128x128_c, aom_highbd_obmc_sad128x128_avx2),
+  TestFuncs(aom_highbd_obmc_sad128x64_c, aom_highbd_obmc_sad128x64_avx2),
+  TestFuncs(aom_highbd_obmc_sad64x128_c, aom_highbd_obmc_sad64x128_avx2),
+  TestFuncs(aom_highbd_obmc_sad64x64_c, aom_highbd_obmc_sad64x64_avx2),
+  TestFuncs(aom_highbd_obmc_sad64x32_c, aom_highbd_obmc_sad64x32_avx2),
+  TestFuncs(aom_highbd_obmc_sad32x64_c, aom_highbd_obmc_sad32x64_avx2),
+  TestFuncs(aom_highbd_obmc_sad32x32_c, aom_highbd_obmc_sad32x32_avx2),
+  TestFuncs(aom_highbd_obmc_sad32x16_c, aom_highbd_obmc_sad32x16_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x32_c, aom_highbd_obmc_sad16x32_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x16_c, aom_highbd_obmc_sad16x16_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x8_c, aom_highbd_obmc_sad16x8_avx2),
+  TestFuncs(aom_highbd_obmc_sad8x16_c, aom_highbd_obmc_sad8x16_avx2),
+  TestFuncs(aom_highbd_obmc_sad8x8_c, aom_highbd_obmc_sad8x8_avx2),
+  TestFuncs(aom_highbd_obmc_sad8x4_c, aom_highbd_obmc_sad8x4_avx2),
+  TestFuncs(aom_highbd_obmc_sad4x8_c, aom_highbd_obmc_sad4x8_avx2),
+  TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_avx2),
+
+  TestFuncs(aom_highbd_obmc_sad64x16_c, aom_highbd_obmc_sad64x16_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x64_c, aom_highbd_obmc_sad16x64_avx2),
+  TestFuncs(aom_highbd_obmc_sad32x8_c, aom_highbd_obmc_sad32x8_avx2),
+  TestFuncs(aom_highbd_obmc_sad8x32_c, aom_highbd_obmc_sad8x32_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x4_c, aom_highbd_obmc_sad16x4_avx2),
+  TestFuncs(aom_highbd_obmc_sad4x16_c, aom_highbd_obmc_sad4x16_avx2),
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, ObmcSadHBDTest,
+                         ::testing::ValuesIn(avx2_functions_hbd));
+#endif  // HAVE_AVX2
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+}  // namespace
diff --git a/libs/libaom/src/test/obmc_variance_test.cc b/libs/libaom/src/test/obmc_variance_test.cc
new file mode 100644
index 000000000..fc281d70b
--- /dev/null
+++ b/libs/libaom/src/test/obmc_variance_test.cc
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+
+#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+static const int kIterations = 1000;
+static const int kMaskMax = 64;
+
+typedef unsigned int (*ObmcVarF)(const uint8_t *pre, int pre_stride,
+                                 const int32_t *wsrc, const int32_t *mask,
+                                 unsigned int *sse);
+typedef libaom_test::FuncParam<ObmcVarF> TestFuncs;
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+class ObmcVarianceTest : public FunctionEquivalenceTest<ObmcVarF> {};
+
+TEST_P(ObmcVarianceTest, RandomValues) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    const int pre_stride = this->rng_(MAX_SB_SIZE + 1);
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = this->rng_.Rand8();
+      wsrc[i] = this->rng_.Rand8() * this->rng_(kMaskMax * kMaskMax + 1);
+      mask[i] = this->rng_(kMaskMax * kMaskMax + 1);
+    }
+
+    unsigned int ref_sse, tst_sse;
+    const unsigned int ref_res =
+        params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(
+        tst_res = params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse));
+
+    ASSERT_EQ(ref_res, tst_res);
+    ASSERT_EQ(ref_sse, tst_sse);
+  }
+}
+
+TEST_P(ObmcVarianceTest, ExtremeValues) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+    const int pre_stride = iter;
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = UINT8_MAX;
+      wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax;
+      mask[i] = kMaskMax * kMaskMax;
+    }
+
+    unsigned int ref_sse, tst_sse;
+    const unsigned int ref_res =
+        params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(
+        tst_res = params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse));
+
+    ASSERT_EQ(ref_res, tst_res);
+    ASSERT_EQ(ref_sse, tst_sse);
+  }
+}
+
+TEST_P(ObmcVarianceTest, DISABLED_Speed) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  const int pre_stride = this->rng_(MAX_SB_SIZE + 1);
+
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
MAX_SB_SQUARE; ++i) { + pre[i] = this->rng_.Rand8(); + wsrc[i] = this->rng_.Rand8() * this->rng_(kMaskMax * kMaskMax + 1); + mask[i] = this->rng_(kMaskMax * kMaskMax + 1); + } + + const int num_loops = 1000000; + unsigned int ref_sse, tst_sse; + aom_usec_timer ref_timer, test_timer; + + aom_usec_timer_start(&ref_timer); + for (int i = 0; i < num_loops; ++i) { + params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast<int>(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int i = 0; i < num_loops; ++i) { + params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse); + } + aom_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast<int>(aom_usec_timer_elapsed(&test_timer)); + + printf("c_time=%d \t simd_time=%d \t gain=%d \n", elapsed_time_c, + elapsed_time_simd, (elapsed_time_c / elapsed_time_simd)); +} + +#if HAVE_SSE4_1 +const ObmcVarianceTest::ParamType sse4_functions[] = { + TestFuncs(aom_obmc_variance128x128_c, aom_obmc_variance128x128_sse4_1), + TestFuncs(aom_obmc_variance128x64_c, aom_obmc_variance128x64_sse4_1), + TestFuncs(aom_obmc_variance64x128_c, aom_obmc_variance64x128_sse4_1), + TestFuncs(aom_obmc_variance64x64_c, aom_obmc_variance64x64_sse4_1), + TestFuncs(aom_obmc_variance64x32_c, aom_obmc_variance64x32_sse4_1), + TestFuncs(aom_obmc_variance32x64_c, aom_obmc_variance32x64_sse4_1), + TestFuncs(aom_obmc_variance32x32_c, aom_obmc_variance32x32_sse4_1), + TestFuncs(aom_obmc_variance32x16_c, aom_obmc_variance32x16_sse4_1), + TestFuncs(aom_obmc_variance16x32_c, aom_obmc_variance16x32_sse4_1), + TestFuncs(aom_obmc_variance16x16_c, aom_obmc_variance16x16_sse4_1), + TestFuncs(aom_obmc_variance16x8_c, aom_obmc_variance16x8_sse4_1), + TestFuncs(aom_obmc_variance8x16_c, aom_obmc_variance8x16_sse4_1), + TestFuncs(aom_obmc_variance8x8_c, aom_obmc_variance8x8_sse4_1), + TestFuncs(aom_obmc_variance8x4_c, aom_obmc_variance8x4_sse4_1), + TestFuncs(aom_obmc_variance4x8_c, aom_obmc_variance4x8_sse4_1), + TestFuncs(aom_obmc_variance4x4_c, aom_obmc_variance4x4_sse4_1), + + TestFuncs(aom_obmc_variance64x16_c, aom_obmc_variance64x16_sse4_1), + TestFuncs(aom_obmc_variance16x64_c, aom_obmc_variance16x64_sse4_1), + TestFuncs(aom_obmc_variance32x8_c, aom_obmc_variance32x8_sse4_1), + TestFuncs(aom_obmc_variance8x32_c, aom_obmc_variance8x32_sse4_1), + TestFuncs(aom_obmc_variance16x4_c, aom_obmc_variance16x4_sse4_1), + TestFuncs(aom_obmc_variance4x16_c, aom_obmc_variance4x16_sse4_1), +}; + +INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcVarianceTest, + ::testing::ValuesIn(sse4_functions)); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +const ObmcVarianceTest::ParamType avx2_functions[] = { + TestFuncs(aom_obmc_variance128x128_c, aom_obmc_variance128x128_avx2), + TestFuncs(aom_obmc_variance128x64_c, aom_obmc_variance128x64_avx2), + TestFuncs(aom_obmc_variance64x128_c, aom_obmc_variance64x128_avx2), + TestFuncs(aom_obmc_variance64x64_c, aom_obmc_variance64x64_avx2), + TestFuncs(aom_obmc_variance64x32_c, aom_obmc_variance64x32_avx2), + TestFuncs(aom_obmc_variance32x64_c, aom_obmc_variance32x64_avx2), + TestFuncs(aom_obmc_variance32x32_c, aom_obmc_variance32x32_avx2), + TestFuncs(aom_obmc_variance32x16_c, aom_obmc_variance32x16_avx2), + TestFuncs(aom_obmc_variance16x32_c, aom_obmc_variance16x32_avx2), + TestFuncs(aom_obmc_variance16x16_c, aom_obmc_variance16x16_avx2), + TestFuncs(aom_obmc_variance16x8_c, aom_obmc_variance16x8_avx2), + TestFuncs(aom_obmc_variance8x16_c, aom_obmc_variance8x16_avx2), + 
TestFuncs(aom_obmc_variance8x8_c, aom_obmc_variance8x8_avx2), + TestFuncs(aom_obmc_variance8x4_c, aom_obmc_variance8x4_avx2), + TestFuncs(aom_obmc_variance4x8_c, aom_obmc_variance4x8_avx2), + TestFuncs(aom_obmc_variance4x4_c, aom_obmc_variance4x4_avx2), + + TestFuncs(aom_obmc_variance64x16_c, aom_obmc_variance64x16_avx2), + TestFuncs(aom_obmc_variance16x64_c, aom_obmc_variance16x64_avx2), + TestFuncs(aom_obmc_variance32x8_c, aom_obmc_variance32x8_avx2), + TestFuncs(aom_obmc_variance8x32_c, aom_obmc_variance8x32_avx2), + TestFuncs(aom_obmc_variance16x4_c, aom_obmc_variance16x4_avx2), + TestFuncs(aom_obmc_variance4x16_c, aom_obmc_variance4x16_avx2), +}; + +INSTANTIATE_TEST_SUITE_P(AVX2, ObmcVarianceTest, + ::testing::ValuesIn(avx2_functions)); +#endif // HAVE_AVX2 + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// +#if CONFIG_AV1_HIGHBITDEPTH +class ObmcVarianceHBDTest : public FunctionEquivalenceTest<ObmcVarF> {}; + +TEST_P(ObmcVarianceHBDTest, RandomValues) { + DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + const int pre_stride = this->rng_(MAX_SB_SIZE + 1); + + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + pre[i] = this->rng_(1 << params_.bit_depth); + wsrc[i] = this->rng_(1 << params_.bit_depth) * + this->rng_(kMaskMax * kMaskMax + 1); + mask[i] = this->rng_(kMaskMax * kMaskMax + 1); + } + + unsigned int ref_sse, tst_sse; + const unsigned int ref_res = params_.ref_func( + CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask, &ref_sse); + unsigned int tst_res; + ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre), + pre_stride, wsrc, mask, + &tst_sse)); + + ASSERT_EQ(ref_res, tst_res); + ASSERT_EQ(ref_sse, tst_sse); + } +} + +TEST_P(ObmcVarianceHBDTest, ExtremeValues) { + DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]); + + for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) { + const int pre_stride = iter; + + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + pre[i] = (1 << params_.bit_depth) - 1; + wsrc[i] = ((1 << params_.bit_depth) - 1) * kMaskMax * kMaskMax; + mask[i] = kMaskMax * kMaskMax; + } + + unsigned int ref_sse, tst_sse; + const unsigned int ref_res = params_.ref_func( + CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask, &ref_sse); + unsigned int tst_res; + ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre), + pre_stride, wsrc, mask, + &tst_sse)); + + ASSERT_EQ(ref_res, tst_res); + ASSERT_EQ(ref_sse, tst_sse); + } +} + +#if HAVE_SSE4_1 +ObmcVarianceHBDTest::ParamType sse4_functions_hbd[] = { + TestFuncs(aom_highbd_obmc_variance128x128_c, + aom_highbd_obmc_variance128x128_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance128x64_c, + aom_highbd_obmc_variance128x64_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance64x128_c, + aom_highbd_obmc_variance64x128_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance64x64_c, + aom_highbd_obmc_variance64x64_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance64x32_c, + aom_highbd_obmc_variance64x32_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance32x64_c, + aom_highbd_obmc_variance32x64_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance32x32_c, + aom_highbd_obmc_variance32x32_sse4_1, 8), + 
TestFuncs(aom_highbd_obmc_variance32x16_c, + aom_highbd_obmc_variance32x16_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance16x32_c, + aom_highbd_obmc_variance16x32_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance16x16_c, + aom_highbd_obmc_variance16x16_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance16x8_c, aom_highbd_obmc_variance16x8_sse4_1, + 8), + TestFuncs(aom_highbd_obmc_variance8x16_c, aom_highbd_obmc_variance8x16_sse4_1, + 8), + TestFuncs(aom_highbd_obmc_variance8x8_c, aom_highbd_obmc_variance8x8_sse4_1, + 8), + TestFuncs(aom_highbd_obmc_variance8x4_c, aom_highbd_obmc_variance8x4_sse4_1, + 8), + TestFuncs(aom_highbd_obmc_variance4x8_c, aom_highbd_obmc_variance4x8_sse4_1, + 8), + TestFuncs(aom_highbd_obmc_variance4x4_c, aom_highbd_obmc_variance4x4_sse4_1, + 8), + TestFuncs(aom_highbd_10_obmc_variance128x128_c, + aom_highbd_10_obmc_variance128x128_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance128x64_c, + aom_highbd_10_obmc_variance128x64_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance64x128_c, + aom_highbd_10_obmc_variance64x128_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance64x64_c, + aom_highbd_10_obmc_variance64x64_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance64x32_c, + aom_highbd_10_obmc_variance64x32_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance32x64_c, + aom_highbd_10_obmc_variance32x64_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance32x32_c, + aom_highbd_10_obmc_variance32x32_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance32x16_c, + aom_highbd_10_obmc_variance32x16_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance16x32_c, + aom_highbd_10_obmc_variance16x32_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance16x16_c, + aom_highbd_10_obmc_variance16x16_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance16x8_c, + aom_highbd_10_obmc_variance16x8_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance8x16_c, + aom_highbd_10_obmc_variance8x16_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance8x8_c, + aom_highbd_10_obmc_variance8x8_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance8x4_c, + aom_highbd_10_obmc_variance8x4_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance4x8_c, + aom_highbd_10_obmc_variance4x8_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance4x4_c, + aom_highbd_10_obmc_variance4x4_sse4_1, 10), + TestFuncs(aom_highbd_12_obmc_variance128x128_c, + aom_highbd_12_obmc_variance128x128_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance128x64_c, + aom_highbd_12_obmc_variance128x64_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance64x128_c, + aom_highbd_12_obmc_variance64x128_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance64x64_c, + aom_highbd_12_obmc_variance64x64_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance64x32_c, + aom_highbd_12_obmc_variance64x32_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance32x64_c, + aom_highbd_12_obmc_variance32x64_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance32x32_c, + aom_highbd_12_obmc_variance32x32_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance32x16_c, + aom_highbd_12_obmc_variance32x16_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance16x32_c, + aom_highbd_12_obmc_variance16x32_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance16x16_c, + aom_highbd_12_obmc_variance16x16_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance16x8_c, + aom_highbd_12_obmc_variance16x8_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance8x16_c, + aom_highbd_12_obmc_variance8x16_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance8x8_c, + aom_highbd_12_obmc_variance8x8_sse4_1, 12), + 
TestFuncs(aom_highbd_12_obmc_variance8x4_c, + aom_highbd_12_obmc_variance8x4_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance4x8_c, + aom_highbd_12_obmc_variance4x8_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance4x4_c, + aom_highbd_12_obmc_variance4x4_sse4_1, 12), + + TestFuncs(aom_highbd_obmc_variance64x16_c, + aom_highbd_obmc_variance64x16_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance16x64_c, + aom_highbd_obmc_variance16x64_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance32x8_c, aom_highbd_obmc_variance32x8_sse4_1, + 8), + TestFuncs(aom_highbd_obmc_variance8x32_c, aom_highbd_obmc_variance8x32_sse4_1, + 8), + TestFuncs(aom_highbd_obmc_variance16x4_c, aom_highbd_obmc_variance16x4_sse4_1, + 8), + TestFuncs(aom_highbd_obmc_variance4x16_c, aom_highbd_obmc_variance4x16_sse4_1, + 8), + TestFuncs(aom_highbd_10_obmc_variance64x16_c, + aom_highbd_10_obmc_variance64x16_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance16x64_c, + aom_highbd_10_obmc_variance16x64_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance32x8_c, + aom_highbd_10_obmc_variance32x8_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance8x32_c, + aom_highbd_10_obmc_variance8x32_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance16x4_c, + aom_highbd_10_obmc_variance16x4_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance4x16_c, + aom_highbd_10_obmc_variance4x16_sse4_1, 10), + TestFuncs(aom_highbd_12_obmc_variance64x16_c, + aom_highbd_12_obmc_variance64x16_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance16x64_c, + aom_highbd_12_obmc_variance16x64_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance32x8_c, + aom_highbd_12_obmc_variance32x8_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance8x32_c, + aom_highbd_12_obmc_variance8x32_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance16x4_c, + aom_highbd_12_obmc_variance16x4_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance4x16_c, + aom_highbd_12_obmc_variance4x16_sse4_1, 12), +}; + +INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcVarianceHBDTest, + ::testing::ValuesIn(sse4_functions_hbd)); +#endif // HAVE_SSE4_1 +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace diff --git a/libs/libaom/src/test/pickrst_test.cc b/libs/libaom/src/test/pickrst_test.cc new file mode 100644 index 000000000..9a2c5bcd4 --- /dev/null +++ b/libs/libaom/src/test/pickrst_test.cc @@ -0,0 +1,534 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tuple> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/register_state_check.h" +#include "test/acm_random.h" +#include "test/util.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_ports/aom_timer.h" +#include "av1/encoder/pickrst.h" + +#define MAX_DATA_BLOCK 384 + +namespace pickrst_test_lowbd { +static const int kIterations = 100; + +typedef int64_t (*lowbd_pixel_proj_error_func)( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params); + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +typedef std::tuple<lowbd_pixel_proj_error_func> PixelProjErrorTestParam; + +class PixelProjErrorTest + : public ::testing::TestWithParam<PixelProjErrorTestParam> { + public: + virtual void SetUp() { + target_func_ = GET_PARAM(0); + src_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * + sizeof(*src_))); + ASSERT_NE(src_, nullptr); + dgd_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * + sizeof(*dgd_))); + ASSERT_NE(dgd_, nullptr); + flt0_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * + sizeof(*flt0_))); + ASSERT_NE(flt0_, nullptr); + flt1_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * + sizeof(*flt1_))); + ASSERT_NE(flt1_, nullptr); + } + virtual void TearDown() { + aom_free(src_); + aom_free(dgd_); + aom_free(flt0_); + aom_free(flt1_); + } + void RunPixelProjErrorTest(int32_t run_times); + void RunPixelProjErrorTest_ExtremeValues(); + + private: + lowbd_pixel_proj_error_func target_func_; + libaom_test::ACMRandom rng_; + uint8_t *src_; + uint8_t *dgd_; + int32_t *flt0_; + int32_t *flt1_; +}; + +void PixelProjErrorTest::RunPixelProjErrorTest(int32_t run_times) { + int h_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1; + int v_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1; + const int dgd_stride = MAX_DATA_BLOCK; + const int src_stride = MAX_DATA_BLOCK; + const int flt0_stride = MAX_DATA_BLOCK; + const int flt1_stride = MAX_DATA_BLOCK; + sgr_params_type params; + int xq[2]; + const int iters = run_times == 1 ? kIterations : 4; + for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { + int64_t err_ref = 0, err_test = 1; + for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { + dgd_[i] = rng_.Rand8(); + src_[i] = rng_.Rand8(); + flt0_[i] = rng_.Rand15Signed(); + flt1_[i] = rng_.Rand15Signed(); + } + xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS); + xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS); + params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2); + params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2); + params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2); + params.s[1] = run_times == 1 ?
(rng_.Rand8() % MAX_RADIUS) : (iter / 2); + uint8_t *dgd = dgd_; + uint8_t *src = src_; + + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + err_ref = av1_lowbd_pixel_proj_error_c(src, h_end, v_end, src_stride, dgd, + dgd_stride, flt0_, flt0_stride, + flt1_, flt1_stride, xq, &params); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + err_test = + target_func_(src, h_end, v_end, src_stride, dgd, dgd_stride, flt0_, + flt0_stride, flt1_, flt1_stride, xq, &params); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + if (run_times > 10) { + printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", params.r[0], + params.r[1], h_end, v_end, time1, time2, time1 / time2); + } + ASSERT_EQ(err_ref, err_test); + } +} + +void PixelProjErrorTest::RunPixelProjErrorTest_ExtremeValues() { + const int h_start = 0; + int h_end = 192; + const int v_start = 0; + int v_end = 192; + const int dgd_stride = MAX_DATA_BLOCK; + const int src_stride = MAX_DATA_BLOCK; + const int flt0_stride = MAX_DATA_BLOCK; + const int flt1_stride = MAX_DATA_BLOCK; + sgr_params_type params; + int xq[2]; + const int iters = kIterations; + for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { + int64_t err_ref = 0, err_test = 1; + for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { + dgd_[i] = 0; + src_[i] = 255; + flt0_[i] = rng_.Rand15Signed(); + flt1_[i] = rng_.Rand15Signed(); + } + xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS); + xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS); + params.r[0] = rng_.Rand8() % MAX_RADIUS; + params.r[1] = rng_.Rand8() % MAX_RADIUS; + params.s[0] = rng_.Rand8() % MAX_RADIUS; + params.s[1] = rng_.Rand8() % MAX_RADIUS; + uint8_t *dgd = dgd_; + uint8_t *src = src_; + + err_ref = av1_lowbd_pixel_proj_error_c( + src, h_end - h_start, v_end - v_start, src_stride, dgd, dgd_stride, + flt0_, flt0_stride, flt1_, flt1_stride, xq, &params); + + err_test = target_func_(src, h_end - h_start, v_end - v_start, src_stride, + dgd, dgd_stride, flt0_, flt0_stride, flt1_, + flt1_stride, xq, &params); + + ASSERT_EQ(err_ref, err_test); + } +} + +TEST_P(PixelProjErrorTest, RandomValues) { RunPixelProjErrorTest(1); } + +TEST_P(PixelProjErrorTest, ExtremeValues) { + RunPixelProjErrorTest_ExtremeValues(); +} + +TEST_P(PixelProjErrorTest, DISABLED_Speed) { RunPixelProjErrorTest(200000); } + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE4_1, PixelProjErrorTest, + ::testing::Values(av1_lowbd_pixel_proj_error_sse4_1)); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 + +INSTANTIATE_TEST_SUITE_P(AVX2, PixelProjErrorTest, + ::testing::Values(av1_lowbd_pixel_proj_error_avx2)); +#endif // HAVE_AVX2 + +} // namespace pickrst_test_lowbd + +#if CONFIG_AV1_HIGHBITDEPTH +namespace pickrst_test_highbd { +static const int kIterations = 100; + +typedef int64_t (*highbd_pixel_proj_error_func)( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params); + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// + +typedef std::tuple<highbd_pixel_proj_error_func> PixelProjErrorTestParam; + +class PixelProjHighbdErrorTest + : public ::testing::TestWithParam<PixelProjErrorTestParam> { + public: + virtual void SetUp() {
+ target_func_ = GET_PARAM(0); + src_ = + (uint16_t *)aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_)); + ASSERT_NE(src_, nullptr); + dgd_ = + (uint16_t *)aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*dgd_)); + ASSERT_NE(dgd_, nullptr); + flt0_ = + (int32_t *)aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*flt0_)); + ASSERT_NE(flt0_, nullptr); + flt1_ = + (int32_t *)aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*flt1_)); + ASSERT_NE(flt1_, nullptr); + } + virtual void TearDown() { + aom_free(src_); + aom_free(dgd_); + aom_free(flt0_); + aom_free(flt1_); + } + void RunPixelProjErrorTest(int32_t run_times); + void RunPixelProjErrorTest_ExtremeValues(); + + private: + highbd_pixel_proj_error_func target_func_; + libaom_test::ACMRandom rng_; + uint16_t *src_; + uint16_t *dgd_; + int32_t *flt0_; + int32_t *flt1_; +}; + +void PixelProjHighbdErrorTest::RunPixelProjErrorTest(int32_t run_times) { + int h_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1; + int v_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1; + const int dgd_stride = MAX_DATA_BLOCK; + const int src_stride = MAX_DATA_BLOCK; + const int flt0_stride = MAX_DATA_BLOCK; + const int flt1_stride = MAX_DATA_BLOCK; + sgr_params_type params; + int xq[2]; + const int iters = run_times == 1 ? kIterations : 4; + for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { + int64_t err_ref = 0, err_test = 1; + for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { + dgd_[i] = rng_.Rand16() % (1 << 12); + src_[i] = rng_.Rand16() % (1 << 12); + flt0_[i] = rng_.Rand15Signed(); + flt1_[i] = rng_.Rand15Signed(); + } + xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS); + xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS); + params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2); + params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2); + params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2); + params.s[1] = run_times == 1 ? 
(rng_.Rand8() % MAX_RADIUS) : (iter / 2); + uint8_t *dgd8 = CONVERT_TO_BYTEPTR(dgd_); + uint8_t *src8 = CONVERT_TO_BYTEPTR(src_); + + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + err_ref = av1_highbd_pixel_proj_error_c( + src8, h_end, v_end, src_stride, dgd8, dgd_stride, flt0_, flt0_stride, + flt1_, flt1_stride, xq, &params); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + err_test = + target_func_(src8, h_end, v_end, src_stride, dgd8, dgd_stride, flt0_, + flt0_stride, flt1_, flt1_stride, xq, &params); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + if (run_times > 10) { + printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", params.r[0], + params.r[1], h_end, v_end, time1, time2, time1 / time2); + } + ASSERT_EQ(err_ref, err_test); + } +} + +void PixelProjHighbdErrorTest::RunPixelProjErrorTest_ExtremeValues() { + const int h_start = 0; + int h_end = 192; + const int v_start = 0; + int v_end = 192; + const int dgd_stride = MAX_DATA_BLOCK; + const int src_stride = MAX_DATA_BLOCK; + const int flt0_stride = MAX_DATA_BLOCK; + const int flt1_stride = MAX_DATA_BLOCK; + sgr_params_type params; + int xq[2]; + const int iters = kIterations; + for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { + int64_t err_ref = 0, err_test = 1; + for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { + dgd_[i] = 0; + src_[i] = (1 << 12) - 1; + flt0_[i] = rng_.Rand15Signed(); + flt1_[i] = rng_.Rand15Signed(); + } + xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS); + xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS); + params.r[0] = rng_.Rand8() % MAX_RADIUS; + params.r[1] = rng_.Rand8() % MAX_RADIUS; + params.s[0] = rng_.Rand8() % MAX_RADIUS; + params.s[1] = rng_.Rand8() % MAX_RADIUS; + uint8_t *dgd8 = CONVERT_TO_BYTEPTR(dgd_); + uint8_t *src8 = CONVERT_TO_BYTEPTR(src_); + + err_ref = av1_highbd_pixel_proj_error_c( + src8, h_end - h_start, v_end - v_start, src_stride, dgd8, dgd_stride, + flt0_, flt0_stride, flt1_, flt1_stride, xq, &params); + + err_test = target_func_(src8, h_end - h_start, v_end - v_start, src_stride, + dgd8, dgd_stride, flt0_, flt0_stride, flt1_, + flt1_stride, xq, &params); + + ASSERT_EQ(err_ref, err_test); + } +} + +TEST_P(PixelProjHighbdErrorTest, RandomValues) { RunPixelProjErrorTest(1); } + +TEST_P(PixelProjHighbdErrorTest, ExtremeValues) { + RunPixelProjErrorTest_ExtremeValues(); +} + +TEST_P(PixelProjHighbdErrorTest, DISABLED_Speed) { + RunPixelProjErrorTest(200000); +} + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE4_1, PixelProjHighbdErrorTest, + ::testing::Values(av1_highbd_pixel_proj_error_sse4_1)); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 + +INSTANTIATE_TEST_SUITE_P(AVX2, PixelProjHighbdErrorTest, + ::testing::Values(av1_highbd_pixel_proj_error_avx2)); +#endif // HAVE_AVX2 + +} // namespace pickrst_test_highbd + +//////////////////////////////////////////////////////////////////////////////// +// Get_proj_subspace_Test +//////////////////////////////////////////////////////////////////////////////// + +namespace get_proj_subspace_test_lowbd { +static const int kIterations = 100; + +typedef void (*set_get_proj_subspace)(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2], + const sgr_params_type *params); + 
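+// For orientation: the H and C checked below are the normal-equation terms
+// of the two-channel least-squares fit used by the self-guided restoration
+// search. A rough reference for the accumulation -- a sketch under
+// assumptions about the exact bit-depth shifts and final normalization,
+// not the library implementation:
+//
+//   for (int i = 0; i < height; ++i) {
+//     for (int j = 0; j < width; ++j) {
+//       const int32_t u = (int32_t)(dgd[i * dgd_stride + j] << SGRPROJ_RST_BITS);
+//       const int32_t s =
+//           (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+//       const int32_t f1 = flt0[i * flt0_stride + j] - u;
+//       const int32_t f2 = flt1[i * flt1_stride + j] - u;
+//       H[0][0] += (int64_t)f1 * f1;
+//       H[1][1] += (int64_t)f2 * f2;
+//       H[0][1] += (int64_t)f1 * f2;
+//       C[0] += (int64_t)f1 * s;
+//       C[1] += (int64_t)f2 * s;
+//     }
+//   }
+//   H[1][0] = H[0][1];  // the system is symmetric
+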
+typedef std::tuple<set_get_proj_subspace> GetProjSubspaceTestParam; + +class GetProjSubspaceTest + : public ::testing::TestWithParam<GetProjSubspaceTestParam> { + public: + virtual void SetUp() { + target_func_ = GET_PARAM(0); + src_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * + sizeof(*src_))); + ASSERT_NE(src_, nullptr); + dgd_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * + sizeof(*dgd_))); + ASSERT_NE(dgd_, nullptr); + flt0_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * + sizeof(*flt0_))); + ASSERT_NE(flt0_, nullptr); + flt1_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * + sizeof(*flt1_))); + ASSERT_NE(flt1_, nullptr); + } + virtual void TearDown() { + aom_free(src_); + aom_free(dgd_); + aom_free(flt0_); + aom_free(flt1_); + } + void RunGetProjSubspaceTest(int32_t run_times); + void RunGetProjSubspaceTest_ExtremeValues(); + + private: + set_get_proj_subspace target_func_; + libaom_test::ACMRandom rng_; + uint8_t *src_; + uint8_t *dgd_; + int32_t *flt0_; + int32_t *flt1_; +}; + +void GetProjSubspaceTest::RunGetProjSubspaceTest(int32_t run_times) { + int h_end = run_times != 1 + ? 128 + : ((rng_.Rand16() % MAX_DATA_BLOCK) & + 2147483640);  // We test for widths divisible by 8. + int v_end = + run_times != 1 ? 128 : ((rng_.Rand16() % MAX_DATA_BLOCK) & 2147483640); + const int dgd_stride = MAX_DATA_BLOCK; + const int src_stride = MAX_DATA_BLOCK; + const int flt0_stride = MAX_DATA_BLOCK; + const int flt1_stride = MAX_DATA_BLOCK; + sgr_params_type params; + const int iters = run_times == 1 ? kIterations : 4; + for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { + int64_t C_ref[2] = { 0 }, C_test[2] = { 0 }; + int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } }; + int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } }; + for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { + dgd_[i] = rng_.Rand8(); + src_[i] = rng_.Rand8(); + flt0_[i] = rng_.Rand15Signed(); + flt1_[i] = rng_.Rand15Signed(); + } + + params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1; + params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1; + params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2); + params.s[1] = run_times == 1 ?
(rng_.Rand8() % MAX_RADIUS) : (iter / 2); + uint8_t *dgd = dgd_; + uint8_t *src = src_; + + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + av1_calc_proj_params_c(src, v_end, h_end, src_stride, dgd, dgd_stride, + flt0_, flt0_stride, flt1_, flt1_stride, H_ref, + C_ref, &params); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + target_func_(src, v_end, h_end, src_stride, dgd, dgd_stride, flt0_, + flt0_stride, flt1_, flt1_stride, H_test, C_test, &params); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + if (run_times > 10) { + printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", params.r[0], + params.r[1], h_end, v_end, time1, time2, time1 / time2); + } else { + ASSERT_EQ(H_ref[0][0], H_test[0][0]); + ASSERT_EQ(H_ref[0][1], H_test[0][1]); + ASSERT_EQ(H_ref[1][0], H_test[1][0]); + ASSERT_EQ(H_ref[1][1], H_test[1][1]); + ASSERT_EQ(C_ref[0], C_test[0]); + ASSERT_EQ(C_ref[1], C_test[1]); + } + } +} + +void GetProjSubspaceTest::RunGetProjSubspaceTest_ExtremeValues() { + const int h_start = 0; + int h_end = MAX_DATA_BLOCK; + const int v_start = 0; + int v_end = MAX_DATA_BLOCK; + const int dgd_stride = MAX_DATA_BLOCK; + const int src_stride = MAX_DATA_BLOCK; + const int flt0_stride = MAX_DATA_BLOCK; + const int flt1_stride = MAX_DATA_BLOCK; + sgr_params_type params; + const int iters = kIterations; + for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { + int64_t C_ref[2] = { 0 }, C_test[2] = { 0 }; + int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } }; + int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } }; + for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { + dgd_[i] = 0; + src_[i] = 255; + flt0_[i] = rng_.Rand15Signed(); + flt1_[i] = rng_.Rand15Signed(); + } + params.r[0] = 1; + params.r[1] = 1; + params.s[0] = rng_.Rand8() % MAX_RADIUS; + params.s[1] = rng_.Rand8() % MAX_RADIUS; + uint8_t *dgd = dgd_; + uint8_t *src = src_; + + av1_calc_proj_params_c(src, h_end - h_start, v_end - v_start, src_stride, + dgd, dgd_stride, flt0_, flt0_stride, flt1_, + flt1_stride, H_ref, C_ref, &params); + + target_func_(src, h_end - h_start, v_end - v_start, src_stride, dgd, + dgd_stride, flt0_, flt0_stride, flt1_, flt1_stride, H_test, + C_test, &params); + + ASSERT_EQ(H_ref[0][0], H_test[0][0]); + ASSERT_EQ(H_ref[0][1], H_test[0][1]); + ASSERT_EQ(H_ref[1][0], H_test[1][0]); + ASSERT_EQ(H_ref[1][1], H_test[1][1]); + ASSERT_EQ(C_ref[0], C_test[0]); + ASSERT_EQ(C_ref[1], C_test[1]); + } +} + +TEST_P(GetProjSubspaceTest, RandomValues) { RunGetProjSubspaceTest(1); } + +TEST_P(GetProjSubspaceTest, ExtremeValues) { + RunGetProjSubspaceTest_ExtremeValues(); +} + +TEST_P(GetProjSubspaceTest, DISABLED_Speed) { RunGetProjSubspaceTest(200000); } + +#if HAVE_AVX2 + +INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTest, + ::testing::Values(av1_calc_proj_params_avx2)); +#endif // HAVE_AVX2 + +} // namespace get_proj_subspace_test_lowbd +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/test/qm_test.cc b/libs/libaom/src/test/qm_test.cc new file mode 100644 index 000000000..d1dfbb849 --- /dev/null +++ b/libs/libaom/src/test/qm_test.cc @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "config/aom_config.h" + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +class QMTest + : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>, + public ::libaom_test::EncoderTest { + protected: + QMTest() : EncoderTest(GET_PARAM(0)) {} + virtual ~QMTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(GET_PARAM(1)); + set_cpu_used_ = GET_PARAM(2); + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); + encoder->Control(AV1E_SET_ENABLE_QM, 1); + encoder->Control(AV1E_SET_QM_MIN, qm_min_); + encoder->Control(AV1E_SET_QM_MAX, qm_max_); + + encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100); + } + } + + void DoTest(int qm_min, int qm_max) { + qm_min_ = qm_min; + qm_max_ = qm_max; + cfg_.kf_max_dist = 12; + cfg_.rc_min_quantizer = 8; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 6; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_target_bitrate = 300; + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 15); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + int set_cpu_used_; + int qm_min_; + int qm_max_; +}; + +// encodes and decodes without a mismatch. +TEST_P(QMTest, TestNoMisMatchQM1) { DoTest(5, 9); } + +// encodes and decodes without a mismatch. +TEST_P(QMTest, TestNoMisMatchQM2) { DoTest(0, 8); } + +// encodes and decodes without a mismatch. +TEST_P(QMTest, TestNoMisMatchQM3) { DoTest(9, 15); } + +AV1_INSTANTIATE_TEST_CASE(QMTest, + ::testing::Values(::libaom_test::kRealTime, + ::libaom_test::kOnePassGood), + ::testing::Range(5, 9)); +} // namespace diff --git a/libs/libaom/src/test/quantize_func_test.cc b/libs/libaom/src/test/quantize_func_test.cc new file mode 100644 index 000000000..b40b38d5a --- /dev/null +++ b/libs/libaom/src/test/quantize_func_test.cc @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <tuple> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_codec.h" +#include "aom_ports/aom_timer.h" +#include "av1/encoder/encoder.h" +#include "av1/common/scan.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +namespace { +using libaom_test::ACMRandom; + +#define QUAN_PARAM_LIST \ + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, \ + const int16_t *round_ptr, const int16_t *quant_ptr, \ + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, \ + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, \ + const int16_t *scan, const int16_t *iscan + +typedef void (*QuantizeFunc)(QUAN_PARAM_LIST); +typedef void (*QuantizeFuncHbd)(QUAN_PARAM_LIST, int log_scale); + +#define HBD_QUAN_FUNC \ + fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, \ + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, log_scale) + +#define LBD_QUAN_FUNC \ + fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, \ + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan) + +template <QuantizeFuncHbd fn> +void highbd_quan16x16_wrapper(QUAN_PARAM_LIST) { + const int log_scale = 0; + HBD_QUAN_FUNC; +} + +template <QuantizeFuncHbd fn> +void highbd_quan32x32_wrapper(QUAN_PARAM_LIST) { + const int log_scale = 1; + HBD_QUAN_FUNC; +} + +template <QuantizeFuncHbd fn> +void highbd_quan64x64_wrapper(QUAN_PARAM_LIST) { + const int log_scale = 2; + HBD_QUAN_FUNC; +} + +enum QuantType { TYPE_B, TYPE_DC, TYPE_FP }; + +using std::tuple; +typedef tuple<QuantizeFunc, QuantizeFunc, TX_SIZE, QuantType, aom_bit_depth_t> + QuantizeParam; + +typedef struct { + QUANTS quant; + Dequants dequant; +} QuanTable; + +const int kTestNum = 1000; + +class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> { + protected: + QuantizeTest() + : quant_ref_(GET_PARAM(0)), quant_(GET_PARAM(1)), tx_size_(GET_PARAM(2)), + type_(GET_PARAM(3)), bd_(GET_PARAM(4)) {} + + virtual ~QuantizeTest() {} + + virtual void SetUp() { + qtab_ = reinterpret_cast<QuanTable *>(aom_memalign(32, sizeof(*qtab_))); + const int n_coeffs = coeff_num(); + coeff_ = reinterpret_cast<tran_low_t *>( + aom_memalign(32, 6 * n_coeffs * sizeof(tran_low_t))); + InitQuantizer(); + } + + virtual void TearDown() { + aom_free(qtab_); + qtab_ = NULL; + aom_free(coeff_); + coeff_ = NULL; + libaom_test::ClearSystemState(); + } + + void InitQuantizer() { + av1_build_quantizer(bd_, 0, 0, 0, 0, 0, &qtab_->quant, &qtab_->dequant); + } + + void QuantizeRun(bool is_loop, int q = 0, int test_num = 1) { + tran_low_t *coeff_ptr = coeff_; + const intptr_t n_coeffs = coeff_num(); + + tran_low_t *qcoeff_ref = coeff_ptr + n_coeffs; + tran_low_t *dqcoeff_ref = qcoeff_ref + n_coeffs; + + tran_low_t *qcoeff = dqcoeff_ref + n_coeffs; + tran_low_t *dqcoeff = qcoeff + n_coeffs; + uint16_t *eob = (uint16_t *)(dqcoeff + n_coeffs); + + // Testing uses 2-D DCT scan order table + const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT); + + // Testing uses luminance quantization table + const int16_t *zbin = qtab_->quant.y_zbin[q]; + + const int16_t *round = 0; + const int16_t *quant = 0; + if (type_ == TYPE_B) { + round = qtab_->quant.y_round[q]; + quant = qtab_->quant.y_quant[q]; + } else if (type_ == TYPE_FP) { + round = qtab_->quant.y_round_fp[q]; + quant = qtab_->quant.y_quant_fp[q]; + } + + const int16_t *quant_shift = qtab_->quant.y_quant_shift[q]; + const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q]; + + for (int i = 0; i <
test_num; ++i) { + if (is_loop) FillCoeffRandom(); + + memset(qcoeff_ref, 0, 5 * n_coeffs * sizeof(*qcoeff_ref)); + + quant_ref_(coeff_ptr, n_coeffs, zbin, round, quant, quant_shift, + qcoeff_ref, dqcoeff_ref, dequant, &eob[0], sc->scan, + sc->iscan); + + ASM_REGISTER_STATE_CHECK(quant_(coeff_ptr, n_coeffs, zbin, round, quant, + quant_shift, qcoeff, dqcoeff, dequant, + &eob[1], sc->scan, sc->iscan)); + + for (int j = 0; j < n_coeffs; ++j) { + ASSERT_EQ(qcoeff_ref[j], qcoeff[j]) + << "Q mismatch on test: " << i << " at position: " << j + << " Q: " << q << " coeff: " << coeff_ptr[j]; + } + + for (int j = 0; j < n_coeffs; ++j) { + ASSERT_EQ(dqcoeff_ref[j], dqcoeff[j]) + << "Dq mismatch on test: " << i << " at position: " << j + << " Q: " << q << " coeff: " << coeff_ptr[j]; + } + + ASSERT_EQ(eob[0], eob[1]) + << "eobs mismatch on test: " << i << " Q: " << q; + } + } + + void CompareResults(const tran_low_t *buf_ref, const tran_low_t *buf, + int size, const char *text, int q, int number) { + int i; + for (i = 0; i < size; ++i) { + ASSERT_EQ(buf_ref[i], buf[i]) << text << " mismatch on test: " << number + << " at position: " << i << " Q: " << q; + } + } + + int coeff_num() const { return av1_get_max_eob(tx_size_); } + + void FillCoeff(tran_low_t c) { + const int n_coeffs = coeff_num(); + for (int i = 0; i < n_coeffs; ++i) { + coeff_[i] = c; + } + } + + void FillCoeffRandom() { + const int n_coeffs = coeff_num(); + FillCoeffZero(); + int num = rnd_.Rand16() % n_coeffs; + for (int i = 0; i < num; ++i) { + coeff_[i] = GetRandomCoeff(); + } + } + + void FillCoeffRandomRows(int num) { + FillCoeffZero(); + for (int i = 0; i < num; ++i) { + coeff_[i] = GetRandomCoeff(); + } + } + + void FillCoeffZero() { FillCoeff(0); } + + void FillCoeffConstant() { + tran_low_t c = GetRandomCoeff(); + FillCoeff(c); + } + + void FillDcOnly() { + FillCoeffZero(); + coeff_[0] = GetRandomCoeff(); + } + + void FillDcLargeNegative() { + FillCoeffZero(); + // Generate a qcoeff which contains 512/-512 (0x0100/0xFE00) to catch issues + // like BUG=883 where the constant being compared was incorrectly + // initialized. + coeff_[0] = -8191; + } + + tran_low_t GetRandomCoeff() { + tran_low_t coeff; + if (bd_ == AOM_BITS_8) { + coeff = + clamp(static_cast<int16_t>(rnd_.Rand16()), INT16_MIN + 1, INT16_MAX); + } else { + tran_low_t min = -(1 << (7 + bd_)); + tran_low_t max = -min - 1; + coeff = clamp(static_cast<tran_low_t>(rnd_.Rand31()), min, max); + } + return coeff; + } + + ACMRandom rnd_; + QuanTable *qtab_; + tran_low_t *coeff_; + QuantizeFunc quant_ref_; + QuantizeFunc quant_; + TX_SIZE tx_size_; + QuantType type_; + aom_bit_depth_t bd_; +}; + +TEST_P(QuantizeTest, ZeroInput) { + FillCoeffZero(); + QuantizeRun(false); +} + +TEST_P(QuantizeTest, LargeNegativeInput) { + FillDcLargeNegative(); + QuantizeRun(false, 0, 1); +} + +TEST_P(QuantizeTest, DcOnlyInput) { + FillDcOnly(); + QuantizeRun(false, 0, 1); +} + +TEST_P(QuantizeTest, RandomInput) { QuantizeRun(true, 0, kTestNum); } + +TEST_P(QuantizeTest, MultipleQ) { + for (int q = 0; q < QINDEX_RANGE; ++q) { + QuantizeRun(true, q, kTestNum); + } +} + +// Force the coeff to be half the value of the dequant. This exposes a +// mismatch found in av1_quantize_fp_sse2().
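+// (A constant coefficient near half the dequant step lands on the rounding
+// boundary of the quantizer's fixed-point multiply, so a mis-initialized
+// rounding constant in a SIMD port is likely to flip the result here before
+// it shows up anywhere else.)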
+TEST_P(QuantizeTest, CoeffHalfDequant) { + FillCoeff(16); + QuantizeRun(false, 25, 1); +} + +TEST_P(QuantizeTest, DISABLED_Speed) { + tran_low_t *coeff_ptr = coeff_; + const intptr_t n_coeffs = coeff_num(); + + tran_low_t *qcoeff_ref = coeff_ptr + n_coeffs; + tran_low_t *dqcoeff_ref = qcoeff_ref + n_coeffs; + + tran_low_t *qcoeff = dqcoeff_ref + n_coeffs; + tran_low_t *dqcoeff = qcoeff + n_coeffs; + uint16_t *eob = (uint16_t *)(dqcoeff + n_coeffs); + + // Testing uses 2-D DCT scan order table + const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT); + + // Testing uses luminance quantization table + const int q = 22; + const int16_t *zbin = qtab_->quant.y_zbin[q]; + const int16_t *round_fp = qtab_->quant.y_round_fp[q]; + const int16_t *quant_fp = qtab_->quant.y_quant_fp[q]; + const int16_t *quant_shift = qtab_->quant.y_quant_shift[q]; + const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q]; + const int kNumTests = 5000000; + aom_usec_timer timer, simd_timer; + int rows = tx_size_high[tx_size_]; + int cols = tx_size_wide[tx_size_]; + rows = AOMMIN(32, rows); + cols = AOMMIN(32, cols); + for (int cnt = 0; cnt <= rows; cnt++) { + FillCoeffRandomRows(cnt * cols); + + aom_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + quant_ref_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, + qcoeff, dqcoeff, dequant, eob, sc->scan, sc->iscan); + } + aom_usec_timer_mark(&timer); + + aom_usec_timer_start(&simd_timer); + for (int n = 0; n < kNumTests; ++n) { + quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff, + dqcoeff, dequant, eob, sc->scan, sc->iscan); + } + aom_usec_timer_mark(&simd_timer); + + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + const int simd_elapsed_time = + static_cast<int>(aom_usec_timer_elapsed(&simd_timer)); + printf("c_time = %d \t simd_time = %d \t Gain = %d \n", elapsed_time, + simd_elapsed_time, (elapsed_time / simd_elapsed_time)); + } +} + +using std::make_tuple; + +#if HAVE_AVX2 +const QuantizeParam kQParamArrayAvx2[] = { + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, + static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, + static_cast<TX_SIZE>(TX_4X16), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, + static_cast<TX_SIZE>(TX_16X4), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, + static_cast<TX_SIZE>(TX_32X8), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, + static_cast<TX_SIZE>(TX_8X32), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, + static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, + static_cast<TX_SIZE>(TX_16X64), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, + static_cast<TX_SIZE>(TX_64X16), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_64x64_c, &av1_quantize_fp_64x64_avx2, + static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8), + make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_10), + make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_12), + make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8), + 
make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_10), + make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_12), + make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8), + make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_10), + make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_adaptive_c, + &aom_highbd_quantize_b_adaptive_avx2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_adaptive_c, + &aom_highbd_quantize_b_adaptive_avx2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_adaptive_c, + &aom_highbd_quantize_b_adaptive_avx2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c, + &aom_highbd_quantize_b_32x32_adaptive_avx2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c, + &aom_highbd_quantize_b_32x32_adaptive_avx2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c, + &aom_highbd_quantize_b_32x32_adaptive_avx2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12), +#endif + make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_avx2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_avx2, + static_cast<TX_SIZE>(TX_8X8), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_avx2, + static_cast<TX_SIZE>(TX_4X4), TYPE_B, AOM_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(AVX2, QuantizeTest, + ::testing::ValuesIn(kQParamArrayAvx2)); +#endif // HAVE_AVX2 + +#if HAVE_SSE2 +const QuantizeParam kQParamArraySSE2[] = { + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, + static_cast<TX_SIZE>(TX_4X16), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, + static_cast<TX_SIZE>(TX_16X4), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, + static_cast<TX_SIZE>(TX_8X32), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, + static_cast<TX_SIZE>(TX_32X8), TYPE_FP, AOM_BITS_8), + make_tuple(&aom_quantize_b_c, &aom_quantize_b_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_adaptive_c, + &aom_highbd_quantize_b_adaptive_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_adaptive_c, + &aom_highbd_quantize_b_adaptive_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B,
AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_adaptive_c, + &aom_highbd_quantize_b_adaptive_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c, + &aom_highbd_quantize_b_32x32_adaptive_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c, + &aom_highbd_quantize_b_32x32_adaptive_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c, + &aom_highbd_quantize_b_32x32_adaptive_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_64x64_adaptive_c, + &aom_highbd_quantize_b_64x64_adaptive_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_64x64_adaptive_c, + &aom_highbd_quantize_b_64x64_adaptive_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_64x64_adaptive_c, + &aom_highbd_quantize_b_64x64_adaptive_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_12), +#endif + make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, + static_cast<TX_SIZE>(TX_8X8), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, + static_cast<TX_SIZE>(TX_4X4), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_32x32_adaptive_c, + &aom_quantize_b_32x32_adaptive_sse2, + static_cast<TX_SIZE>(TX_32X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_32x32_adaptive_c, + &aom_quantize_b_32x32_adaptive_sse2, + static_cast<TX_SIZE>(TX_16X32), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_32x32_adaptive_c, + &aom_quantize_b_32x32_adaptive_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_64x64_adaptive_c, + &aom_quantize_b_64x64_adaptive_sse2, + static_cast<TX_SIZE>(TX_32X64), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_64x64_adaptive_c, + &aom_quantize_b_64x64_adaptive_sse2, + static_cast<TX_SIZE>(TX_64X32), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_64x64_adaptive_c, + &aom_quantize_b_64x64_adaptive_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, QuantizeTest, + ::testing::ValuesIn(kQParamArraySSE2)); +#endif + +#if HAVE_NEON +const QuantizeParam kQParamArrayNEON[] = { + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon, + static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon, + static_cast<TX_SIZE>(TX_4X16), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon, + static_cast<TX_SIZE>(TX_16X4), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon, + static_cast<TX_SIZE>(TX_8X32), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c,
&av1_quantize_fp_neon, + static_cast<TX_SIZE>(TX_32X8), TYPE_FP, AOM_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(NEON, QuantizeTest, + ::testing::ValuesIn(kQParamArrayNEON)); +#endif + +#if HAVE_SSSE3 && ARCH_X86_64 +INSTANTIATE_TEST_SUITE_P( + SSSE3, QuantizeTest, + ::testing::Values( + make_tuple(&aom_quantize_b_c, &aom_quantize_b_ssse3, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_32x32_c, &aom_quantize_b_32x32_ssse3, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_64x64_c, &aom_quantize_b_64x64_ssse3, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8))); + +#endif // HAVE_SSSE3 && ARCH_X86_64 + +#if HAVE_AVX && ARCH_X86_64 +INSTANTIATE_TEST_SUITE_P( + AVX, QuantizeTest, + ::testing::Values( + make_tuple(&aom_quantize_b_c, &aom_quantize_b_avx, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_32x32_c, &aom_quantize_b_32x32_avx, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8))); + +#endif // HAVE_AVX && ARCH_X86_64 +} // namespace diff --git a/libs/libaom/src/test/reconinter_test.cc b/libs/libaom/src/test/reconinter_test.cc new file mode 100644 index 000000000..51bec0eab --- /dev/null +++ b/libs/libaom/src/test/reconinter_test.cc @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <tuple> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_ports/mem.h" +#include "av1/common/scan.h" +#include "av1/common/txb_common.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { +using libaom_test::ACMRandom; + +typedef void (*buildcompdiffwtdmaskd_func)(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + int h, int w); + +typedef std::tuple<BLOCK_SIZE, buildcompdiffwtdmaskd_func> + BuildCompDiffwtdMaskDParam; + +#if HAVE_SSE4_1 +::testing::internal::ParamGenerator<BuildCompDiffwtdMaskDParam> BuildParams( + buildcompdiffwtdmaskd_func filter) { + return ::testing::Combine(::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL), + ::testing::Values(filter)); +} +#endif + +class BuildCompDiffwtdMaskTest + : public ::testing::TestWithParam<BuildCompDiffwtdMaskDParam> { + public: + virtual ~BuildCompDiffwtdMaskTest() {} + + virtual void TearDown() { libaom_test::ClearSystemState(); } + void RunTest(buildcompdiffwtdmaskd_func test_impl, const int is_speed, + const DIFFWTD_MASK_TYPE type); + + private: + ACMRandom rnd_; +}; + +typedef void (*buildcompdiffwtdmaskd16_func)( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd); + +typedef std::tuple<int, buildcompdiffwtdmaskd16_func, BLOCK_SIZE> + BuildCompDiffwtdMaskD16Param; + +#if HAVE_SSE4_1 || HAVE_NEON +::testing::internal::ParamGenerator<BuildCompDiffwtdMaskD16Param> BuildParams( + buildcompdiffwtdmaskd16_func filter) { + return ::testing::Combine(::testing::Range(8, 13, 2), + ::testing::Values(filter), + ::testing::Range(BLOCK_4X4,
BLOCK_SIZES_ALL)); +} +#endif +class BuildCompDiffwtdMaskD16Test + : public ::testing::TestWithParam<BuildCompDiffwtdMaskD16Param> { + public: + ~BuildCompDiffwtdMaskD16Test() {} + virtual void TearDown() { libaom_test::ClearSystemState(); } + void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } + + protected: + void RunCheckOutput(buildcompdiffwtdmaskd16_func test_impl); + void RunSpeedTest(buildcompdiffwtdmaskd16_func test_impl, + DIFFWTD_MASK_TYPE mask_type); + libaom_test::ACMRandom rnd_; +}; // class BuildCompDiffwtdMaskD16Test + +void BuildCompDiffwtdMaskD16Test::RunCheckOutput( + buildcompdiffwtdmaskd16_func test_impl) { + const int block_idx = GET_PARAM(2); + const int bd = GET_PARAM(0); + const int width = block_size_wide[block_idx]; + const int height = block_size_high[block_idx]; + DECLARE_ALIGNED(16, uint8_t, mask_ref[2 * MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, mask_test[2 * MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]); + + ConvolveParams conv_params = get_conv_params_no_round(0, 0, NULL, 0, 1, bd); + + int in_precision = + bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2; + + for (int i = 0; i < MAX_SB_SQUARE; i++) { + src0[i] = rnd_.Rand16() & ((1 << in_precision) - 1); + src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1); + } + + for (int mask_type = 0; mask_type < DIFFWTD_MASK_TYPES; mask_type++) { + av1_build_compound_diffwtd_mask_d16_c( + mask_ref, (DIFFWTD_MASK_TYPE)mask_type, src0, width, src1, width, + height, width, &conv_params, bd); + + test_impl(mask_test, (DIFFWTD_MASK_TYPE)mask_type, src0, width, src1, width, + height, width, &conv_params, bd); + + for (int r = 0; r < height; ++r) { + for (int c = 0; c < width; ++c) { + ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width]) + << "Mismatch at unit tests for BuildCompDiffwtdMaskD16Test\n" + << " Pixel mismatch at index " + << "[" << r << "," << c << "] " + << " @ " << width << "x" << height << " inv " << mask_type; + } + } + } +} + +void BuildCompDiffwtdMaskD16Test::RunSpeedTest( + buildcompdiffwtdmaskd16_func test_impl, DIFFWTD_MASK_TYPE mask_type) { + const int block_idx = GET_PARAM(2); + const int bd = GET_PARAM(0); + const int width = block_size_wide[block_idx]; + const int height = block_size_high[block_idx]; + DECLARE_ALIGNED(16, uint8_t, mask[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]); + + ConvolveParams conv_params = get_conv_params_no_round(0, 0, NULL, 0, 1, bd); + + int in_precision = + bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2; + + for (int i = 0; i < MAX_SB_SQUARE; i++) { + src0[i] = rnd_.Rand16() & ((1 << in_precision) - 1); + src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1); + } + + const int num_loops = 10000000 / (width + height); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + av1_build_compound_diffwtd_mask_d16_c(mask, mask_type, src0, width, src1, + width, height, width, &conv_params, + bd); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + + for (int i = 0; i < num_loops; ++i) + test_impl(mask, mask_type, src0, width, src1, width, height, width, + &conv_params, bd); + + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); + printf("av1_build_compound_diffwtd_mask_d16 %3dx%-3d: %7.2f
\n", width, + height, elapsed_time / double(elapsed_time1)); +} +#if HAVE_SSE4_1 +void BuildCompDiffwtdMaskTest::RunTest(buildcompdiffwtdmaskd_func test_impl, + const int is_speed, + const DIFFWTD_MASK_TYPE type) { + const int sb_type = GET_PARAM(0); + const int width = block_size_wide[sb_type]; + const int height = block_size_high[sb_type]; + DECLARE_ALIGNED(16, uint8_t, mask_ref[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, mask_test[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, src0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, src1[MAX_SB_SQUARE]); + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int i = 0; i < width * height; i++) { + src0[i] = rnd.Rand8(); + src1[i] = rnd.Rand8(); + } + const int run_times = is_speed ? (10000000 / (width + height)) : 1; + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + av1_build_compound_diffwtd_mask_c(mask_ref, type, src0, width, src1, width, + height, width); + } + const double t1 = get_time_mark(&timer); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + test_impl(mask_test, type, src0, width, src1, width, height, width); + } + const double t2 = get_time_mark(&timer); + if (is_speed) { + printf("mask %d %3dx%-3d:%7.2f/%7.2fns", type, width, height, t1, t2); + printf("(%3.2f)\n", t1 / t2); + } + for (int r = 0; r < height; ++r) { + for (int c = 0; c < width; ++c) { + ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width]) + << "[" << r << "," << c << "] " << run_times << " @ " << width << "x" + << height << " inv " << type; + } + } +} + +TEST_P(BuildCompDiffwtdMaskTest, match) { + RunTest(GET_PARAM(1), 0, DIFFWTD_38); + RunTest(GET_PARAM(1), 0, DIFFWTD_38_INV); +} +TEST_P(BuildCompDiffwtdMaskTest, DISABLED_Speed) { + RunTest(GET_PARAM(1), 1, DIFFWTD_38); + RunTest(GET_PARAM(1), 1, DIFFWTD_38_INV); +} +#endif +TEST_P(BuildCompDiffwtdMaskD16Test, CheckOutput) { + RunCheckOutput(GET_PARAM(1)); +} + +TEST_P(BuildCompDiffwtdMaskD16Test, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(1), DIFFWTD_38); + RunSpeedTest(GET_PARAM(1), DIFFWTD_38_INV); +} + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE4_1, BuildCompDiffwtdMaskTest, + BuildParams(av1_build_compound_diffwtd_mask_sse4_1)); + +INSTANTIATE_TEST_SUITE_P( + SSE4_1, BuildCompDiffwtdMaskD16Test, + BuildParams(av1_build_compound_diffwtd_mask_d16_sse4_1)); +#endif + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskTest, + BuildParams(av1_build_compound_diffwtd_mask_avx2)); + +INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskD16Test, + BuildParams(av1_build_compound_diffwtd_mask_d16_avx2)); +#endif + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, BuildCompDiffwtdMaskD16Test, + BuildParams(av1_build_compound_diffwtd_mask_d16_neon)); +#endif + +} // namespace diff --git a/libs/libaom/src/test/register_state_check.h b/libs/libaom/src/test/register_state_check.h new file mode 100644 index 000000000..d404621dd --- /dev/null +++ b/libs/libaom/src/test/register_state_check.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_TEST_REGISTER_STATE_CHECK_H_ +#define AOM_TEST_REGISTER_STATE_CHECK_H_ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +// ASM_REGISTER_STATE_CHECK(asm_function) +// Minimally validates the environment pre & post function execution. This +// variant should be used with assembly functions which are not expected to +// fully restore the system state. See platform implementations of +// RegisterStateCheck for details. +// +// API_REGISTER_STATE_CHECK(api_function) +// Performs all the checks done by ASM_REGISTER_STATE_CHECK() and any +// additional checks to ensure the environment is in a consistent state pre & +// post function execution. This variant should be used with API functions. +// See platform implementations of RegisterStateCheckXXX for details. +// + +#if defined(_WIN64) && ARCH_X86_64 + +#undef NOMINMAX +#define NOMINMAX +#define WIN32_LEAN_AND_MEAN +#include +#include + +inline bool operator==(const M128A &lhs, const M128A &rhs) { + return (lhs.Low == rhs.Low && lhs.High == rhs.High); +} + +namespace libaom_test { + +// Compares the state of xmm[6-15] at construction with their state at +// destruction. These registers should be preserved by the callee on +// Windows x64. +class RegisterStateCheck { + public: + RegisterStateCheck() { initialized_ = StoreRegisters(&pre_context_); } + ~RegisterStateCheck() { Check(); } + + private: + static bool StoreRegisters(CONTEXT *const context) { + const HANDLE this_thread = GetCurrentThread(); + EXPECT_TRUE(this_thread != NULL); + context->ContextFlags = CONTEXT_FLOATING_POINT; + const bool context_saved = GetThreadContext(this_thread, context) == TRUE; + EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError(); + return context_saved; + } + + // Compares the register state. Returns true if the states match. + void Check() const { + ASSERT_TRUE(initialized_); + CONTEXT post_context; + ASSERT_TRUE(StoreRegisters(&post_context)); + + const M128A *xmm_pre = &pre_context_.Xmm6; + const M128A *xmm_post = &post_context.Xmm6; + for (int i = 6; i <= 15; ++i) { + EXPECT_EQ(*xmm_pre, *xmm_post) << "xmm" << i << " has been modified!"; + ++xmm_pre; + ++xmm_post; + } + } + + bool initialized_; + CONTEXT pre_context_; +}; + +#define ASM_REGISTER_STATE_CHECK(statement) \ + do { \ + libaom_test::RegisterStateCheck reg_check; \ + statement; \ + } while (false) + +} // namespace libaom_test + +#else + +namespace libaom_test { + +class RegisterStateCheck {}; +#define ASM_REGISTER_STATE_CHECK(statement) statement + +} // namespace libaom_test + +#endif // _WIN64 && ARCH_X86_64 + +#if ARCH_X86 || ARCH_X86_64 +#if defined(__GNUC__) + +namespace libaom_test { + +// Checks the FPU tag word pre/post execution to ensure emms has been called. +class RegisterStateCheckMMX { + public: + RegisterStateCheckMMX() { + __asm__ volatile("fstenv %0" : "=rm"(pre_fpu_env_)); + } + ~RegisterStateCheckMMX() { Check(); } + + private: + // Checks the FPU tag word pre/post execution, returning false if not cleared + // to 0xffff. 
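+  // In the 28-byte fstenv image the control, status and tag words occupy
+  // consecutive 32-bit fields, so element [4] of the uint16_t[14] buffer
+  // below is the x87 tag word. A tag word of 0xffff marks all eight
+  // registers empty, which is the state a correct emms leaves behind.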
+ void Check() const { + EXPECT_EQ(0xffff, pre_fpu_env_[4]) + << "FPU was in an inconsistent state prior to call"; + + uint16_t post_fpu_env[14]; + __asm__ volatile("fstenv %0" : "=rm"(post_fpu_env)); + EXPECT_EQ(0xffff, post_fpu_env[4]) + << "FPU was left in an inconsistent state after call"; + } + + uint16_t pre_fpu_env_[14]; +}; + +#define API_REGISTER_STATE_CHECK(statement) \ + do { \ + libaom_test::RegisterStateCheckMMX reg_check; \ + ASM_REGISTER_STATE_CHECK(statement); \ + } while (false) + +} // namespace libaom_test + +#endif // __GNUC__ +#endif // ARCH_X86 || ARCH_X86_64 + +#ifndef API_REGISTER_STATE_CHECK +#define API_REGISTER_STATE_CHECK ASM_REGISTER_STATE_CHECK +#endif + +#endif // AOM_TEST_REGISTER_STATE_CHECK_H_ diff --git a/libs/libaom/src/test/resize_test.cc b/libs/libaom/src/test/resize_test.cc new file mode 100644 index 000000000..bcf6794d0 --- /dev/null +++ b/libs/libaom/src/test/resize_test.cc @@ -0,0 +1,644 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include "aom_dsp/aom_dsp_common.h" +#include "common/tools_common.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/video_source.h" +#include "test/util.h" + +// Enable(1) or Disable(0) writing of the compressed bitstream. 
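+// When toggled to 1, the hooks below dump the encoded stream as an IVF
+// file: one 32-byte file header, then a 12-byte header (frame size plus
+// 64-bit pts) in front of each frame payload.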
+#define WRITE_COMPRESSED_STREAM 0 + +namespace { + +#if WRITE_COMPRESSED_STREAM +static void mem_put_le16(char *const mem, unsigned int val) { + mem[0] = val; + mem[1] = val >> 8; +} + +static void mem_put_le32(char *const mem, unsigned int val) { + mem[0] = val; + mem[1] = val >> 8; + mem[2] = val >> 16; + mem[3] = val >> 24; +} + +static void write_ivf_file_header(const aom_codec_enc_cfg_t *const cfg, + int frame_cnt, FILE *const outfile) { + char header[32]; + + header[0] = 'D'; + header[1] = 'K'; + header[2] = 'I'; + header[3] = 'F'; + mem_put_le16(header + 4, 0); /* version */ + mem_put_le16(header + 6, 32); /* headersize */ + mem_put_le32(header + 8, AV1_FOURCC); /* fourcc (av1) */ + mem_put_le16(header + 12, cfg->g_w); /* width */ + mem_put_le16(header + 14, cfg->g_h); /* height */ + mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */ + mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */ + mem_put_le32(header + 24, frame_cnt); /* length */ + mem_put_le32(header + 28, 0); /* unused */ + + (void)fwrite(header, 1, 32, outfile); +} + +static void write_ivf_frame_size(FILE *const outfile, const size_t size) { + char header[4]; + mem_put_le32(header, static_cast(size)); + (void)fwrite(header, 1, 4, outfile); +} + +static void write_ivf_frame_header(const aom_codec_cx_pkt_t *const pkt, + FILE *const outfile) { + char header[12]; + aom_codec_pts_t pts; + + if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) return; + + pts = pkt->data.frame.pts; + mem_put_le32(header, static_cast(pkt->data.frame.sz)); + mem_put_le32(header + 4, pts & 0xFFFFFFFF); + mem_put_le32(header + 8, pts >> 32); + + (void)fwrite(header, 1, 12, outfile); +} +#endif // WRITE_COMPRESSED_STREAM + +const unsigned int kInitialWidth = 320; +const unsigned int kInitialHeight = 240; + +struct FrameInfo { + FrameInfo(aom_codec_pts_t _pts, unsigned int _w, unsigned int _h) + : pts(_pts), w(_w), h(_h) {} + + aom_codec_pts_t pts; + unsigned int w; + unsigned int h; +}; + +void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, + unsigned int initial_h, unsigned int *w, + unsigned int *h, int flag_codec) { + if (frame < 10) { + *w = initial_w; + *h = initial_h; + return; + } + if (frame < 20) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 30) { + *w = initial_w / 2; + *h = initial_h / 2; + return; + } + if (frame < 40) { + *w = initial_w; + *h = initial_h; + return; + } + if (frame < 50) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 60) { + *w = initial_w / 2; + *h = initial_h / 2; + return; + } + if (frame < 70) { + *w = initial_w; + *h = initial_h; + return; + } + if (frame < 80) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 90) { + *w = initial_w / 2; + *h = initial_h / 2; + return; + } + if (frame < 100) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 110) { + *w = initial_w; + *h = initial_h; + return; + } + // Go down very low + if (frame < 120) { + *w = initial_w / 4; + *h = initial_h / 4; + return; + } + if (flag_codec == 1) { + // Cases that only works for AV1. + // For AV1: Swap width and height of original. 
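+    // With the 320x240 (kInitialWidth x kInitialHeight) source used by
+    // these tests, frames 120-139 are therefore requested at 240x320.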
+ if (frame < 140) { + *w = initial_h; + *h = initial_w; + return; + } + } + *w = initial_w; + *h = initial_h; +} + +class ResizingVideoSource : public ::libaom_test::DummyVideoSource { + public: + ResizingVideoSource() { + SetSize(kInitialWidth, kInitialHeight); + limit_ = 150; + } + int flag_codec_; + virtual ~ResizingVideoSource() {} + + protected: + virtual void Next() { + ++frame_; + unsigned int width; + unsigned int height; + ScaleForFrameNumber(frame_, kInitialWidth, kInitialHeight, &width, &height, + flag_codec_); + SetSize(width, height); + FillFrame(); + } +}; + +class ResizeTest + : public ::libaom_test::CodecTestWithParam, + public ::libaom_test::EncoderTest { + protected: + ResizeTest() : EncoderTest(GET_PARAM(0)) {} + + virtual ~ResizeTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(GET_PARAM(1)); + } + + virtual void DecompressedFrameHook(const aom_image_t &img, + aom_codec_pts_t pts) { + frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); + } + + std::vector frame_info_list_; +}; + +TEST_P(ResizeTest, TestExternalResizeWorks) { + ResizingVideoSource video; + video.flag_codec_ = 0; + cfg_.g_lag_in_frames = 0; + // We use max(kInitialWidth, kInitialHeight) because during the test + // the width and height of the frame are swapped + cfg_.g_forced_max_frame_width = cfg_.g_forced_max_frame_height = + AOMMAX(kInitialWidth, kInitialHeight); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Check we decoded the same number of frames as we attempted to encode + ASSERT_EQ(frame_info_list_.size(), video.limit()); + + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + const unsigned int frame = static_cast(info->pts); + unsigned int expected_w; + unsigned int expected_h; + ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, + &expected_h, 0); + EXPECT_EQ(expected_w, info->w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info->h) + << "Frame " << frame << " had unexpected height"; + } +} + +const unsigned int kStepDownFrame = 3; +const unsigned int kStepUpFrame = 6; + +class ResizeInternalTestLarge : public ResizeTest { + protected: +#if WRITE_COMPRESSED_STREAM + ResizeInternalTestLarge() + : ResizeTest(), frame0_psnr_(0.0), outfile_(NULL), out_frames_(0) {} +#else + ResizeInternalTestLarge() : ResizeTest(), frame0_psnr_(0.0) {} +#endif + + virtual ~ResizeInternalTestLarge() {} + + virtual void BeginPassHook(unsigned int /*pass*/) { +#if WRITE_COMPRESSED_STREAM + outfile_ = fopen("av10-2-05-resize.ivf", "wb"); +#endif + } + + virtual void EndPassHook() { +#if WRITE_COMPRESSED_STREAM + if (outfile_) { + if (!fseek(outfile_, 0, SEEK_SET)) + write_ivf_file_header(&cfg_, out_frames_, outfile_); + fclose(outfile_); + outfile_ = NULL; + } +#endif + } + + virtual void PreEncodeFrameHook(libaom_test::VideoSource *video, + libaom_test::Encoder *encoder) { + if (change_config_) { + int new_q = 60; + if (video->frame() == 0) { + struct aom_scaling_mode mode = { AOME_ONETWO, AOME_ONETWO }; + encoder->Control(AOME_SET_SCALEMODE, &mode); + } + if (video->frame() == 1) { + struct aom_scaling_mode mode = { AOME_NORMAL, AOME_NORMAL }; + encoder->Control(AOME_SET_SCALEMODE, &mode); + cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = new_q; + encoder->Config(&cfg_); + } + } else { + if (video->frame() >= kStepDownFrame && video->frame() < kStepUpFrame) { + struct aom_scaling_mode mode = { AOME_FOURFIVE, AOME_THREEFIVE }; + encoder->Control(AOME_SET_SCALEMODE, &mode); + 
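+        // AOME_FOURFIVE / AOME_THREEFIVE request 4/5 horizontal and 3/5
+        // vertical scaling, so the 352x288 clip used below is coded at
+        // 282x173 for these frames (352 * 4 / 5 = 281.6 and
+        // 288 * 3 / 5 = 172.8, rounded), which is exactly what
+        // TestInternalResizeWorks asserts.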
} + if (video->frame() >= kStepUpFrame) { + struct aom_scaling_mode mode = { AOME_NORMAL, AOME_NORMAL }; + encoder->Control(AOME_SET_SCALEMODE, &mode); + } + } + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0]; + EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 3.0); + } + +#if WRITE_COMPRESSED_STREAM + virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) { + ++out_frames_; + + // Write initial file header if first frame. + if (pkt->data.frame.pts == 0) write_ivf_file_header(&cfg_, 0, outfile_); + + // Write frame header and data. + write_ivf_frame_header(pkt, outfile_); + (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_); + } +#endif + + double frame0_psnr_; + bool change_config_; +#if WRITE_COMPRESSED_STREAM + FILE *outfile_; + unsigned int out_frames_; +#endif +}; + +TEST_P(ResizeInternalTestLarge, TestInternalResizeWorks) { + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 10); + init_flags_ = AOM_CODEC_USE_PSNR; + change_config_ = false; + + // q picked such that initial keyframe on this clip is ~30dB PSNR + cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48; + + // If the number of frames being encoded is smaller than g_lag_in_frames + // the encoded frame is unavailable using the current API. Comparing + // frames to detect mismatch would then not be possible. Set + // g_lag_in_frames = 0 to get around this. + cfg_.g_lag_in_frames = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + } + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + const aom_codec_pts_t pts = info->pts; + if (pts >= kStepDownFrame && pts < kStepUpFrame) { + ASSERT_EQ(282U, info->w) << "Frame " << pts << " had unexpected width"; + ASSERT_EQ(173U, info->h) << "Frame " << pts << " had unexpected height"; + } else { + EXPECT_EQ(352U, info->w) << "Frame " << pts << " had unexpected width"; + EXPECT_EQ(288U, info->h) << "Frame " << pts << " had unexpected height"; + } + } +} + +TEST_P(ResizeInternalTestLarge, TestInternalResizeChangeConfig) { + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 10); + cfg_.g_w = 352; + cfg_.g_h = 288; + change_config_ = true; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +class ResizeRealtimeTest + : public ::libaom_test::CodecTestWith2Params, + public ::libaom_test::EncoderTest { + protected: + ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {} + virtual ~ResizeRealtimeTest() {} + + virtual void PreEncodeFrameHook(libaom_test::VideoSource *video, + libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AV1E_SET_AQ_MODE, 3); + encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + } + + if (change_bitrate_ && video->frame() == 120) { + change_bitrate_ = false; + cfg_.rc_target_bitrate = 500; + encoder->Config(&cfg_); + } + } + + virtual void SetUp() { + InitializeConfig(); + SetMode(GET_PARAM(1)); + set_cpu_used_ = GET_PARAM(2); + } + + virtual void DecompressedFrameHook(const aom_image_t &img, + aom_codec_pts_t pts) { + frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); + } + + virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) { + double mismatch_psnr = compute_psnr(img1, img2); + mismatch_psnr_ += mismatch_psnr; 
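+    // Every encoder/decoder mismatch adds one frame to this counter;
+    // GetMismatchFrames() is checked against zero by the tests below.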
+ ++mismatch_nframes_; + } + + unsigned int GetMismatchFrames() { return mismatch_nframes_; } + + void DefaultConfig() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_end_usage = AOM_CBR; + cfg_.kf_mode = AOM_KF_AUTO; + cfg_.g_lag_in_frames = 0; + cfg_.kf_min_dist = cfg_.kf_max_dist = 3000; + // Enable dropped frames. + cfg_.rc_dropframe_thresh = 1; + // Disable error_resilience mode. + cfg_.g_error_resilient = 0; + // Run at low bitrate. + cfg_.rc_target_bitrate = 200; + // We use max(kInitialWidth, kInitialHeight) because during the test + // the width and height of the frame are swapped + cfg_.g_forced_max_frame_width = cfg_.g_forced_max_frame_height = + AOMMAX(kInitialWidth, kInitialHeight); + } + + std::vector frame_info_list_; + int set_cpu_used_; + bool change_bitrate_; + double mismatch_psnr_; + int mismatch_nframes_; +}; + +TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { + ResizingVideoSource video; + video.flag_codec_ = 1; + DefaultConfig(); + change_bitrate_ = false; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Check we decoded the same number of frames as we attempted to encode + ASSERT_EQ(frame_info_list_.size(), video.limit()); + + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + const unsigned int frame = static_cast(info->pts); + unsigned int expected_w; + unsigned int expected_h; + ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, + &expected_h, 1); + EXPECT_EQ(expected_w, info->w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info->h) + << "Frame " << frame << " had unexpected height"; + EXPECT_EQ(static_cast(0), GetMismatchFrames()); + } +} + +// Verify the dynamic resizer behavior for real time, 1 pass CBR mode. +// Run at low bitrate, with resize_allowed = 1, and verify that we get +// one resize down event. +TEST_P(ResizeRealtimeTest, DISABLED_TestInternalResizeDown) { + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 299); + DefaultConfig(); + cfg_.g_w = 352; + cfg_.g_h = 288; + change_bitrate_ = false; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + unsigned int last_w = cfg_.g_w; + unsigned int last_h = cfg_.g_h; + int resize_count = 0; + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + if (info->w != last_w || info->h != last_h) { + // Verify that resize down occurs. + ASSERT_LT(info->w, last_w); + ASSERT_LT(info->h, last_h); + last_w = info->w; + last_h = info->h; + resize_count++; + } + } + +#if CONFIG_AV1_DECODER + // Verify that we get 1 resize down event in this test. + ASSERT_EQ(1, resize_count) << "Resizing should occur."; + EXPECT_EQ(static_cast(0), GetMismatchFrames()); +#else + printf("Warning: AV1 decoder unavailable, unable to check resize count!\n"); +#endif +} + +// Verify the dynamic resizer behavior for real time, 1 pass CBR mode. +// Start at low target bitrate, raise the bitrate in the middle of the clip, +// scaling-up should occur after bitrate changed. 
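+// The resize_count bookkeeping below can be read as the following event
+// counter (a minimal sketch, not compiled into the test; the helper name
+// is illustrative only):
+#if 0
+static void CountResizeEvents(const std::vector<FrameInfo> &frames,
+                              unsigned int initial_w, unsigned int initial_h,
+                              int *downs, int *ups) {
+  unsigned int last_w = initial_w, last_h = initial_h;
+  *downs = *ups = 0;
+  for (size_t i = 0; i < frames.size(); ++i) {
+    if (frames[i].w == last_w && frames[i].h == last_h) continue;
+    // Dimensions changed: classify the event by its direction.
+    if (frames[i].w < last_w && frames[i].h < last_h) ++*downs;
+    if (frames[i].w > last_w && frames[i].h > last_h) ++*ups;
+    last_w = frames[i].w;
+    last_h = frames[i].h;
+  }
+}
+#endif
+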
+TEST_P(ResizeRealtimeTest, DISABLED_TestInternalResizeDownUpChangeBitRate) { + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 359); + DefaultConfig(); + cfg_.g_w = 352; + cfg_.g_h = 288; + change_bitrate_ = true; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + // Disable dropped frames. + cfg_.rc_dropframe_thresh = 0; + // Starting bitrate low. + cfg_.rc_target_bitrate = 80; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + unsigned int last_w = cfg_.g_w; + unsigned int last_h = cfg_.g_h; + int resize_count = 0; + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + if (info->w != last_w || info->h != last_h) { + resize_count++; + if (resize_count == 1) { + // Verify that resize down occurs. + ASSERT_LT(info->w, last_w); + ASSERT_LT(info->h, last_h); + } else if (resize_count == 2) { + // Verify that resize up occurs. + ASSERT_GT(info->w, last_w); + ASSERT_GT(info->h, last_h); + } + last_w = info->w; + last_h = info->h; + } + } + +#if CONFIG_AV1_DECODER + // Verify that we get 2 resize events in this test. + ASSERT_EQ(resize_count, 2) << "Resizing should occur twice."; + EXPECT_EQ(static_cast(0), GetMismatchFrames()); +#else + printf("Warning: AV1 decoder unavailable, unable to check resize count!\n"); +#endif +} + +class ResizeCspTest : public ResizeTest { + protected: +#if WRITE_COMPRESSED_STREAM + ResizeCspTest() + : ResizeTest(), frame0_psnr_(0.0), outfile_(NULL), out_frames_(0) {} +#else + ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {} +#endif + + virtual ~ResizeCspTest() {} + + virtual void BeginPassHook(unsigned int /*pass*/) { +#if WRITE_COMPRESSED_STREAM + outfile_ = fopen("av11-2-05-cspchape.ivf", "wb"); +#endif + } + + virtual void EndPassHook() { +#if WRITE_COMPRESSED_STREAM + if (outfile_) { + if (!fseek(outfile_, 0, SEEK_SET)) + write_ivf_file_header(&cfg_, out_frames_, outfile_); + fclose(outfile_); + outfile_ = NULL; + } +#endif + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0]; + EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0); + } + +#if WRITE_COMPRESSED_STREAM + virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) { + ++out_frames_; + + // Write initial file header if first frame. + if (pkt->data.frame.pts == 0) write_ivf_file_header(&cfg_, 0, outfile_); + + // Write frame header and data. + write_ivf_frame_header(pkt, outfile_); + (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_); + } +#endif + + double frame0_psnr_; +#if WRITE_COMPRESSED_STREAM + FILE *outfile_; + unsigned int out_frames_; +#endif +}; + +class ResizingCspVideoSource : public ::libaom_test::DummyVideoSource { + public: + explicit ResizingCspVideoSource(aom_img_fmt_t image_format) { + SetSize(kInitialWidth, kInitialHeight); + SetImageFormat(image_format); + limit_ = 30; + } + + virtual ~ResizingCspVideoSource() {} +}; + +#if (defined(DISABLE_TRELLISQ_SEARCH) && DISABLE_TRELLISQ_SEARCH) +TEST_P(ResizeCspTest, DISABLED_TestResizeCspWorks) { +#else +TEST_P(ResizeCspTest, TestResizeCspWorks) { +#endif + const aom_img_fmt_t image_formats[] = { AOM_IMG_FMT_I420, AOM_IMG_FMT_I444 }; + for (const aom_img_fmt_t &img_format : image_formats) { + ResizingCspVideoSource video(img_format); + init_flags_ = AOM_CODEC_USE_PSNR; + cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48; + cfg_.g_lag_in_frames = 0; + cfg_.g_profile = (img_format == AOM_IMG_FMT_I420) ? 
0 : 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Check we decoded the same number of frames as we attempted to encode + ASSERT_EQ(frame_info_list_.size(), video.limit()); + frame_info_list_.clear(); + } +} + +AV1_INSTANTIATE_TEST_CASE(ResizeTest, + ::testing::Values(::libaom_test::kRealTime)); +AV1_INSTANTIATE_TEST_CASE(ResizeInternalTestLarge, + ::testing::Values(::libaom_test::kOnePassGood)); +AV1_INSTANTIATE_TEST_CASE(ResizeRealtimeTest, + ::testing::Values(::libaom_test::kRealTime), + ::testing::Range(5, 9)); +AV1_INSTANTIATE_TEST_CASE(ResizeCspTest, + ::testing::Values(::libaom_test::kRealTime)); +} // namespace diff --git a/libs/libaom/src/test/rt_end_to_end_test.cc b/libs/libaom/src/test/rt_end_to_end_test.cc new file mode 100644 index 000000000..f14d12474 --- /dev/null +++ b/libs/libaom/src/test/rt_end_to_end_test.cc @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "test/yuv_video_source.h" + +namespace { + +const unsigned int kFrames = 10; +const int kBitrate = 500; + +// List of psnr thresholds for speed settings 6-8 +// keys: video, speed, aq mode. +std::unordered_map>> + kPsnrThreshold = { { "park_joy_90p_8_420.y4m", + { { 5, { { 0, 35.4 }, { 3, 36.4 } } }, + { 6, { { 0, 35.3 }, { 3, 36.2 } } }, + { 7, { { 0, 34.9 }, { 3, 35.8 } } }, + { 8, { { 0, 35.0 }, { 3, 35.8 } } } } }, + { "paris_352_288_30.y4m", + { { 5, { { 0, 36.2 }, { 3, 36.7 } } }, + { 6, { { 0, 36.1 }, { 3, 36.6 } } }, + { 7, { { 0, 35.5 }, { 3, 36.0 } } }, + { 8, { { 0, 36.0 }, { 3, 36.5 } } } } }, + { "niklas_1280_720_30.y4m", + { { 5, { { 0, 34.6 }, { 3, 34.6 } } }, + { 6, { { 0, 34.2 }, { 3, 34.2 } } }, + { 7, { { 0, 33.7 }, { 3, 33.6 } } }, + { 8, { { 0, 33.6 }, { 3, 33.4 } } } } } }; + +typedef struct { + const char *filename; + unsigned int input_bit_depth; + aom_img_fmt fmt; + aom_bit_depth_t bit_depth; + unsigned int profile; +} TestVideoParam; + +std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) { + return os << "TestVideoParam { filename:" << test_arg.filename + << " input_bit_depth:" << test_arg.input_bit_depth + << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth + << " profile:" << test_arg.profile << " }"; +} + +const TestVideoParam kTestVectors[] = { + { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 }, + { "paris_352_288_30.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 }, + { "niklas_1280_720_30.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 }, +}; + +// Params: test video, speed, aq mode, threads, tile columns. 
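+// For example, kPsnrThreshold["park_joy_90p_8_420.y4m"][7][3] is 35.8, so a
+// speed-7, aq-mode-3 encode of that clip must average more than 35.8 dB
+// PSNR for EndtoEndPSNRTest to pass.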
+class RTEndToEndTest + : public ::libaom_test::CodecTestWith5Params, + public ::libaom_test::EncoderTest { + protected: + RTEndToEndTest() + : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)), + cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0), + aq_mode_(GET_PARAM(3)), threads_(GET_PARAM(4)), + tile_columns_(GET_PARAM(5)) {} + + virtual ~RTEndToEndTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(::libaom_test::kRealTime); + + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_threads = threads_; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + } + + virtual void BeginPassHook(unsigned int) { + psnr_ = 0.0; + nframes_ = 0; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_); + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT); + encoder->Control(AV1E_SET_AQ_MODE, aq_mode_); + encoder->Control(AV1E_SET_ROW_MT, 1); + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + double GetPsnrThreshold() { + return kPsnrThreshold[test_video_param_.filename][cpu_used_][aq_mode_]; + } + + void DoTest() { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 0; + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = test_video_param_.input_bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + init_flags_ = AOM_CODEC_USE_PSNR; + if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH; + + std::unique_ptr video; + video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0, + kFrames)); + ASSERT_TRUE(video.get() != NULL); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + const double psnr = GetAveragePsnr(); + EXPECT_GT(psnr, GetPsnrThreshold()) + << "cpu used = " << cpu_used_ << " aq mode = " << aq_mode_; + } + + TestVideoParam test_video_param_; + int cpu_used_; + + private: + double psnr_; + unsigned int nframes_; + unsigned int aq_mode_; + int threads_; + int tile_columns_; +}; + +class RTEndToEndTestThreaded : public RTEndToEndTest {}; + +TEST_P(RTEndToEndTest, EndtoEndPSNRTest) { DoTest(); } + +TEST_P(RTEndToEndTestThreaded, EndtoEndPSNRTest) { DoTest(); } + +AV1_INSTANTIATE_TEST_CASE(RTEndToEndTest, ::testing::ValuesIn(kTestVectors), + ::testing::Range(5, 9), + ::testing::Values(0, 3), + ::testing::Values(1), ::testing::Values(1)); + +AV1_INSTANTIATE_TEST_CASE(RTEndToEndTestThreaded, + ::testing::ValuesIn(kTestVectors), + ::testing::Range(5, 9), + ::testing::Values(0, 3), + ::testing::Range(2, 5), ::testing::Range(2, 5)); +} // namespace diff --git a/libs/libaom/src/test/run_encodes.sh b/libs/libaom/src/test/run_encodes.sh new file mode 100644 index 000000000..2096d8b15 --- /dev/null +++ b/libs/libaom/src/test/run_encodes.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved. +# +# This source code is subject to the terms of the BSD 2 Clause License and +# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +# was not distributed with this source code in the LICENSE file, you can +# obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+# Author: jimbankoski@google.com (Jim Bankoski)
+
+if [[ $# -ne 4 ]]; then
+  echo Encodes all the y4m files in the directory at the bitrates specified by
+  echo the first 3 parameters and stores the results in a subdirectory named by
+  echo the 4th parameter:
+  echo
+  echo Usage: run_encodes.sh start-kbps end-kbps step-kbps output-directory
+  echo Example: run_encodes.sh 200 500 50 baseline
+  exit
+fi
+
+s=$1
+e=$2
+step=$3
+newdir=$4
+
+for i in ./*y4m; do
+  for (( b=$s; b<= $e; b+= $step ))
+  do
+    best_encode.sh $i $b
+  done
+  mv opsnr.stt $i.stt
+done
+
+mkdir $newdir
+mv *.stt $newdir
+mv *.webm $newdir
diff --git a/libs/libaom/src/test/sad_test.cc b/libs/libaom/src/test/sad_test.cc
new file mode 100644
index 000000000..0bdbf3745
--- /dev/null
+++ b/libs/libaom/src/test/sad_test.cc
@@ -0,0 +1,1981 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include <limits.h>
+#include <stdio.h>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "aom/aom_codec.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_ptr, int ref_stride);
+typedef std::tuple<int, int, SadMxNFunc, int> SadMxNParam;
+
+typedef uint32_t (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *ref_ptr, int ref_stride,
+                                  const uint8_t *second_pred);
+typedef std::tuple<int, int, SadMxNAvgFunc, int> SadMxNAvgParam;
+
+typedef void (*DistWtdCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred,
+                                   int width, int height, const uint8_t *ref,
+                                   int ref_stride,
+                                   const DIST_WTD_COMP_PARAMS *jcp_param);
+typedef std::tuple<int, int, DistWtdCompAvgFunc, int> DistWtdCompAvgParam;
+
+typedef unsigned int (*DistWtdSadMxhFunc)(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride, int width,
+                                          int height);
+typedef std::tuple<int, int, DistWtdSadMxhFunc, int> DistWtdSadMxhParam;
+
+typedef uint32_t (*DistWtdSadMxNAvgFunc)(const uint8_t *src_ptr,
+                                         int src_stride,
+                                         const uint8_t *ref_ptr,
+                                         int ref_stride,
+                                         const uint8_t *second_pred,
+                                         const DIST_WTD_COMP_PARAMS *jcp_param);
+typedef std::tuple<int, int, DistWtdSadMxNAvgFunc, int> DistWtdSadMxNAvgParam;
+
+typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *const ref_ptr[], int ref_stride,
+                             uint32_t *sad_array);
+typedef std::tuple<int, int, SadMxNx4Func, int> SadMxNx4Param;
+
+typedef void (*SadMxNx4AvgFunc)(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *const ref_ptr[], int ref_stride,
+                                const uint8_t *second_pred,
+                                uint32_t *sad_array);
+typedef std::tuple<int, int, SadMxNx4AvgFunc, int> SadMxNx4AvgParam;
+
+using libaom_test::ACMRandom;
+
+namespace {
+class SADTestBase : public ::testing::Test {
+ public:
+  SADTestBase(int width, int height, int bit_depth)
+      : width_(width), height_(height), bd_(bit_depth) {}
+
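+  // All SAD test classes share one set of statically allocated buffers:
+  // kDataBufferSize bytes of reference data (room for the four candidate
+  // blocks the x4 variants compare at once) plus 128x128 scratch blocks
+  // for the second_pred/comp_pred inputs, each aligned to kDataAlignment
+  // so the SIMD implementations can use aligned loads.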
+  static void SetUpTestCase() {
+    source_data8_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kDataBlockSize));
+    reference_data8_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kDataBufferSize));
+    second_pred8_ =
+        reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+    comp_pred8_ =
+        reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+    comp_pred8_test_ =
+        reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+    source_data16_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, kDataBlockSize * sizeof(uint16_t)));
+    reference_data16_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, kDataBufferSize * sizeof(uint16_t)));
+    second_pred16_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+    comp_pred16_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+    comp_pred16_test_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+  }
+
+  static void TearDownTestCase() {
+    aom_free(source_data8_);
+    source_data8_ = NULL;
+    aom_free(reference_data8_);
+    reference_data8_ = NULL;
+    aom_free(second_pred8_);
+    second_pred8_ = NULL;
+    aom_free(comp_pred8_);
+    comp_pred8_ = NULL;
+    aom_free(comp_pred8_test_);
+    comp_pred8_test_ = NULL;
+    aom_free(source_data16_);
+    source_data16_ = NULL;
+    aom_free(reference_data16_);
+    reference_data16_ = NULL;
+    aom_free(second_pred16_);
+    second_pred16_ = NULL;
+    aom_free(comp_pred16_);
+    comp_pred16_ = NULL;
+    aom_free(comp_pred16_test_);
+    comp_pred16_test_ = NULL;
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  // Handle up to 4 128x128 blocks, with stride up to 256
+  static const int kDataAlignment = 16;
+  static const int kDataBlockSize = 128 * 256;
+  static const int kDataBufferSize = 4 * kDataBlockSize;
+
+  virtual void SetUp() {
+    if (bd_ == -1) {
+      use_high_bit_depth_ = false;
+      bit_depth_ = AOM_BITS_8;
+      source_data_ = source_data8_;
+      reference_data_ = reference_data8_;
+      second_pred_ = second_pred8_;
+      comp_pred_ = comp_pred8_;
+      comp_pred_test_ = comp_pred8_test_;
+    } else {
+      use_high_bit_depth_ = true;
+      bit_depth_ = static_cast<aom_bit_depth_t>(bd_);
+      source_data_ = CONVERT_TO_BYTEPTR(source_data16_);
+      reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_);
+      second_pred_ = CONVERT_TO_BYTEPTR(second_pred16_);
+      comp_pred_ = CONVERT_TO_BYTEPTR(comp_pred16_);
+      comp_pred_test_ = CONVERT_TO_BYTEPTR(comp_pred16_test_);
+    }
+    mask_ = (1 << bit_depth_) - 1;
+    source_stride_ = (width_ + 31) & ~31;
+    reference_stride_ = width_ * 2;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  virtual uint8_t *GetReference(int block_idx) {
+    if (use_high_bit_depth_)
+      return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) +
+                                block_idx * kDataBlockSize);
+    return reference_data_ + block_idx * kDataBlockSize;
+  }
+
+  // Sum of Absolute Differences. Given two blocks, calculate the absolute
+  // difference between two pixels in the same relative location; accumulate.
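+  // That is, for a width_ x height_ block:
+  //   sad = sum over 0 <= h < height_, 0 <= w < width_ of
+  //         |source[h * source_stride_ + w] -
+  //          reference[h * reference_stride_ + w]|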
+ unsigned int ReferenceSAD(int block_idx) { + unsigned int sad = 0; + const uint8_t *const reference8 = GetReference(block_idx); + const uint8_t *const source8 = source_data_; + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReference(block_idx)); + const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < width_; ++w) { + if (!use_high_bit_depth_) { + sad += abs(source8[h * source_stride_ + w] - + reference8[h * reference_stride_ + w]); + } else { + sad += abs(source16[h * source_stride_ + w] - + reference16[h * reference_stride_ + w]); + } + } + } + return sad; + } + + // Sum of Absolute Differences Average. Given two blocks, and a prediction + // calculate the absolute difference between one pixel and average of the + // corresponding and predicted pixels; accumulate. + unsigned int ReferenceSADavg(int block_idx) { + unsigned int sad = 0; + const uint8_t *const reference8 = GetReference(block_idx); + const uint8_t *const source8 = source_data_; + const uint8_t *const second_pred8 = second_pred_; + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReference(block_idx)); + const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); + const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_); + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < width_; ++w) { + if (!use_high_bit_depth_) { + const int tmp = second_pred8[h * width_ + w] + + reference8[h * reference_stride_ + w]; + const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1); + sad += abs(source8[h * source_stride_ + w] - comp_pred); + } else { + const int tmp = second_pred16[h * width_ + w] + + reference16[h * reference_stride_ + w]; + const uint16_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1); + sad += abs(source16[h * source_stride_ + w] - comp_pred); + } + } + } + return sad; + } + + void ReferenceDistWtdCompAvg(int block_idx) { + const uint8_t *const reference8 = GetReference(block_idx); + const uint8_t *const second_pred8 = second_pred_; + uint8_t *const comp_pred8 = comp_pred_; + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReference(block_idx)); + const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_); + uint16_t *const comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred_); + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < width_; ++w) { + if (!use_high_bit_depth_) { + const int tmp = + second_pred8[h * width_ + w] * jcp_param_.bck_offset + + reference8[h * reference_stride_ + w] * jcp_param_.fwd_offset; + comp_pred8[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4); + } else { + const int tmp = + second_pred16[h * width_ + w] * jcp_param_.bck_offset + + reference16[h * reference_stride_ + w] * jcp_param_.fwd_offset; + comp_pred16[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4); + } + } + } + } + + unsigned int ReferenceDistWtdSADavg(int block_idx) { + unsigned int sad = 0; + const uint8_t *const reference8 = GetReference(block_idx); + const uint8_t *const source8 = source_data_; + const uint8_t *const second_pred8 = second_pred_; + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReference(block_idx)); + const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); + const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_); + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < width_; ++w) { + if (!use_high_bit_depth_) { + const int tmp = + second_pred8[h * width_ + w] * jcp_param_.bck_offset + + reference8[h * reference_stride_ + w] * 
jcp_param_.fwd_offset; + const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 4); + sad += abs(source8[h * source_stride_ + w] - comp_pred); + } else { + const int tmp = + second_pred16[h * width_ + w] * jcp_param_.bck_offset + + reference16[h * reference_stride_ + w] * jcp_param_.fwd_offset; + const uint16_t comp_pred = ROUND_POWER_OF_TWO(tmp, 4); + sad += abs(source16[h * source_stride_ + w] - comp_pred); + } + } + } + return sad; + } + + void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) { + uint8_t *data8 = data; + uint16_t *data16 = CONVERT_TO_SHORTPTR(data); + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < width_; ++w) { + if (!use_high_bit_depth_) { + data8[h * stride + w] = static_cast(fill_constant); + } else { + data16[h * stride + w] = fill_constant; + } + } + } + } + + void FillRandom(uint8_t *data, int stride) { + uint8_t *data8 = data; + uint16_t *data16 = CONVERT_TO_SHORTPTR(data); + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < width_; ++w) { + if (!use_high_bit_depth_) { + data8[h * stride + w] = rnd_.Rand8(); + } else { + data16[h * stride + w] = rnd_.Rand16() & mask_; + } + } + } + } + + int width_, height_, mask_, bd_; + aom_bit_depth_t bit_depth_; + static uint8_t *source_data_; + static uint8_t *reference_data_; + static uint8_t *second_pred_; + int source_stride_; + bool use_high_bit_depth_; + static uint8_t *source_data8_; + static uint8_t *reference_data8_; + static uint8_t *second_pred8_; + static uint16_t *source_data16_; + static uint16_t *reference_data16_; + static uint16_t *second_pred16_; + int reference_stride_; + static uint8_t *comp_pred_; + static uint8_t *comp_pred8_; + static uint16_t *comp_pred16_; + static uint8_t *comp_pred_test_; + static uint8_t *comp_pred8_test_; + static uint16_t *comp_pred16_test_; + DIST_WTD_COMP_PARAMS jcp_param_; + + ACMRandom rnd_; +}; + +class SADx4Test : public ::testing::WithParamInterface, + public SADTestBase { + public: + SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} + + protected: + void SADs(unsigned int *results) { + const uint8_t *references[] = { GetReference(0), GetReference(1), + GetReference(2), GetReference(3) }; + + ASM_REGISTER_STATE_CHECK(GET_PARAM(2)( + source_data_, source_stride_, references, reference_stride_, results)); + } + + void CheckSADs() { + unsigned int reference_sad, exp_sad[4]; + + SADs(exp_sad); + for (int block = 0; block < 4; ++block) { + reference_sad = ReferenceSAD(block); + + EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block; + } + } +}; + +class SADx4AvgTest : public ::testing::WithParamInterface, + public SADTestBase { + public: + SADx4AvgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} + + protected: + void SADs(unsigned int *results) { + const uint8_t *references[] = { GetReference(0), GetReference(1), + GetReference(2), GetReference(3) }; + + ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_, + references, reference_stride_, + second_pred_, results)); + } + + void CheckSADs() { + unsigned int reference_sad, exp_sad[4]; + + SADs(exp_sad); + for (int block = 0; block < 4; ++block) { + reference_sad = ReferenceSADavg(block); + + EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block; + } + } + + void SpeedSAD() { + int test_count = 200000; + unsigned int exp_sad[4]; + while (test_count > 0) { + SADs(exp_sad); + test_count -= 1; + } + } +}; + +class SADTest : public ::testing::WithParamInterface, + public SADTestBase { + public: + SADTest() : SADTestBase(GET_PARAM(0), 
GET_PARAM(1), GET_PARAM(3)) {} + + protected: + unsigned int SAD(int block_idx) { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_, + reference, reference_stride_)); + return ret; + } + + void CheckSAD() { + const unsigned int reference_sad = ReferenceSAD(0); + const unsigned int exp_sad = SAD(0); + + ASSERT_EQ(reference_sad, exp_sad); + } + + void SpeedSAD() { + int test_count = 20000000; + while (test_count > 0) { + SAD(0); + test_count -= 1; + } + } +}; + +class SADavgTest : public ::testing::WithParamInterface, + public SADTestBase { + public: + SADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} + + protected: + unsigned int SAD_avg(int block_idx) { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_, + reference, reference_stride_, + second_pred_)); + return ret; + } + + void CheckSAD() { + const unsigned int reference_sad = ReferenceSADavg(0); + const unsigned int exp_sad = SAD_avg(0); + + ASSERT_EQ(reference_sad, exp_sad); + } +}; + +class DistWtdCompAvgTest + : public ::testing::WithParamInterface, + public SADTestBase { + public: + DistWtdCompAvgTest() + : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} + + protected: + void dist_wtd_comp_avg(int block_idx) { + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_, + height_, reference, reference_stride_, + &jcp_param_)); + } + + void CheckCompAvg() { + for (int j = 0; j < 2; ++j) { + for (int i = 0; i < 4; ++i) { + jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0]; + jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1]; + + ReferenceDistWtdCompAvg(0); + dist_wtd_comp_avg(0); + + for (int y = 0; y < height_; ++y) + for (int x = 0; x < width_; ++x) + ASSERT_EQ(comp_pred_[y * width_ + x], + comp_pred_test_[y * width_ + x]); + } + } + } +}; + +class DistWtdSADTest : public ::testing::WithParamInterface, + public SADTestBase { + public: + DistWtdSADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} + + protected: + unsigned int SAD(int block_idx) { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_, + reference, reference_stride_, + GET_PARAM(0), GET_PARAM(1))); + return ret; + } + + void CheckSAD() { + const unsigned int reference_sad = ReferenceSAD(0); + const unsigned int exp_sad = SAD(0); + + ASSERT_EQ(reference_sad, exp_sad); + } + + void SpeedSAD() { + int test_count = 20000000; + while (test_count > 0) { + SAD(0); + test_count -= 1; + } + } +}; + +class DistWtdSADavgTest + : public ::testing::WithParamInterface, + public SADTestBase { + public: + DistWtdSADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} + + protected: + unsigned int dist_wtd_SAD_avg(int block_idx) { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_, + reference, reference_stride_, + second_pred_, &jcp_param_)); + return ret; + } + + void CheckSAD() { + for (int j = 0; j < 2; ++j) { + for (int i = 0; i < 4; ++i) { + jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0]; + jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1]; + + const unsigned int reference_sad = 
ReferenceDistWtdSADavg(0); + const unsigned int exp_sad = dist_wtd_SAD_avg(0); + + ASSERT_EQ(reference_sad, exp_sad); + } + } + } +}; + +uint8_t *SADTestBase::source_data_ = NULL; +uint8_t *SADTestBase::reference_data_ = NULL; +uint8_t *SADTestBase::second_pred_ = NULL; +uint8_t *SADTestBase::comp_pred_ = NULL; +uint8_t *SADTestBase::comp_pred_test_ = NULL; +uint8_t *SADTestBase::source_data8_ = NULL; +uint8_t *SADTestBase::reference_data8_ = NULL; +uint8_t *SADTestBase::second_pred8_ = NULL; +uint8_t *SADTestBase::comp_pred8_ = NULL; +uint8_t *SADTestBase::comp_pred8_test_ = NULL; +uint16_t *SADTestBase::source_data16_ = NULL; +uint16_t *SADTestBase::reference_data16_ = NULL; +uint16_t *SADTestBase::second_pred16_ = NULL; +uint16_t *SADTestBase::comp_pred16_ = NULL; +uint16_t *SADTestBase::comp_pred16_test_ = NULL; + +TEST_P(SADTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + CheckSAD(); +} + +TEST_P(SADTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + CheckSAD(); +} + +TEST_P(SADTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + int test_count = 2000; + while (test_count > 0) { + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + test_count -= 1; + } + source_stride_ = tmp_stride; +} + +#define SPEED_TEST (0) +#if SPEED_TEST +TEST_P(SADTest, Speed) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + SpeedSAD(); + source_stride_ = tmp_stride; +} +#endif + +TEST_P(SADavgTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + FillConstant(second_pred_, width_, 0); + CheckSAD(); +} +TEST_P(SADavgTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + FillConstant(second_pred_, width_, 0); + CheckSAD(); +} + +TEST_P(SADavgTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADavgTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. 
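+  // Shrinking the stride by one makes successive reference rows start at
+  // alternating even/odd addresses, exercising the unaligned-load paths in
+  // the SIMD implementations.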
+ const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADavgTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + int test_count = 2000; + while (test_count > 0) { + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckSAD(); + test_count -= 1; + } + source_stride_ = tmp_stride; +} + +TEST_P(DistWtdCompAvgTest, MaxRef) { + FillConstant(reference_data_, reference_stride_, mask_); + FillConstant(second_pred_, width_, 0); + CheckCompAvg(); +} + +TEST_P(DistWtdCompAvgTest, MaxSecondPred) { + FillConstant(reference_data_, reference_stride_, 0); + FillConstant(second_pred_, width_, mask_); + CheckCompAvg(); +} + +TEST_P(DistWtdCompAvgTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckCompAvg(); + reference_stride_ = tmp_stride; +} + +TEST_P(DistWtdCompAvgTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckCompAvg(); + reference_stride_ = tmp_stride; +} + +TEST_P(DistWtdSADTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + CheckSAD(); +} + +TEST_P(DistWtdSADTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + CheckSAD(); +} + +TEST_P(DistWtdSADTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(DistWtdSADTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. 
+ const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(DistWtdSADTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + int test_count = 2000; + while (test_count > 0) { + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + test_count -= 1; + } + source_stride_ = tmp_stride; +} + +TEST_P(DistWtdSADavgTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + FillConstant(second_pred_, width_, 0); + CheckSAD(); +} +TEST_P(DistWtdSADavgTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + FillConstant(second_pred_, width_, 0); + CheckSAD(); +} + +TEST_P(DistWtdSADavgTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(DistWtdSADavgTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(DistWtdSADavgTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + int test_count = 2000; + while (test_count > 0) { + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckSAD(); + test_count -= 1; + } + source_stride_ = tmp_stride; +} + +TEST_P(SADx4Test, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(GetReference(0), reference_stride_, mask_); + FillConstant(GetReference(1), reference_stride_, mask_); + FillConstant(GetReference(2), reference_stride_, mask_); + FillConstant(GetReference(3), reference_stride_, mask_); + CheckSADs(); +} + +TEST_P(SADx4Test, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(GetReference(0), reference_stride_, 0); + FillConstant(GetReference(1), reference_stride_, 0); + FillConstant(GetReference(2), reference_stride_, 0); + FillConstant(GetReference(3), reference_stride_, 0); + CheckSADs(); +} + +TEST_P(SADx4Test, ShortRef) { + int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADx4Test, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. 
+ int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADx4Test, ShortSrc) { + int tmp_stride = source_stride_; + source_stride_ >>= 1; + int test_count = 1000; + while (test_count > 0) { + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + test_count -= 1; + } + source_stride_ = tmp_stride; +} + +TEST_P(SADx4Test, SrcAlignedByWidth) { + uint8_t *tmp_source_data = source_data_; + source_data_ += width_; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_data_ = tmp_source_data; +} + +using std::make_tuple; + +#if SPEED_TEST +TEST_P(SADx4AvgTest, Speed) { + int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + FillRandom(second_pred_, width_); + SpeedSAD(); + reference_stride_ = tmp_stride; +} +#endif + +TEST_P(SADx4AvgTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(GetReference(0), reference_stride_, mask_); + FillConstant(GetReference(1), reference_stride_, mask_); + FillConstant(GetReference(2), reference_stride_, mask_); + FillConstant(GetReference(3), reference_stride_, mask_); + FillConstant(second_pred_, width_, 0); + CheckSADs(); +} + +TEST_P(SADx4AvgTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(GetReference(0), reference_stride_, 0); + FillConstant(GetReference(1), reference_stride_, 0); + FillConstant(GetReference(2), reference_stride_, 0); + FillConstant(GetReference(3), reference_stride_, 0); + FillConstant(second_pred_, width_, 0); + CheckSADs(); +} + +TEST_P(SADx4AvgTest, ShortRef) { + int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + FillRandom(second_pred_, width_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADx4AvgTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. 
+ int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + FillRandom(second_pred_, width_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +//------------------------------------------------------------------------------ +// C functions +const SadMxNParam c_tests[] = { + make_tuple(128, 128, &aom_sad128x128_c, -1), + make_tuple(128, 64, &aom_sad128x64_c, -1), + make_tuple(64, 128, &aom_sad64x128_c, -1), + make_tuple(64, 64, &aom_sad64x64_c, -1), + make_tuple(64, 32, &aom_sad64x32_c, -1), + make_tuple(32, 64, &aom_sad32x64_c, -1), + make_tuple(32, 32, &aom_sad32x32_c, -1), + make_tuple(32, 16, &aom_sad32x16_c, -1), + make_tuple(16, 32, &aom_sad16x32_c, -1), + make_tuple(16, 16, &aom_sad16x16_c, -1), + make_tuple(16, 8, &aom_sad16x8_c, -1), + make_tuple(8, 16, &aom_sad8x16_c, -1), + make_tuple(8, 8, &aom_sad8x8_c, -1), + make_tuple(8, 4, &aom_sad8x4_c, -1), + make_tuple(4, 8, &aom_sad4x8_c, -1), + make_tuple(4, 4, &aom_sad4x4_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(128, 128, &aom_highbd_sad128x128_c, 8), + make_tuple(128, 64, &aom_highbd_sad128x64_c, 8), + make_tuple(64, 128, &aom_highbd_sad64x128_c, 8), + make_tuple(64, 64, &aom_highbd_sad64x64_c, 8), + make_tuple(64, 32, &aom_highbd_sad64x32_c, 8), + make_tuple(32, 64, &aom_highbd_sad32x64_c, 8), + make_tuple(32, 32, &aom_highbd_sad32x32_c, 8), + make_tuple(32, 16, &aom_highbd_sad32x16_c, 8), + make_tuple(16, 32, &aom_highbd_sad16x32_c, 8), + make_tuple(16, 16, &aom_highbd_sad16x16_c, 8), + make_tuple(16, 8, &aom_highbd_sad16x8_c, 8), + make_tuple(8, 16, &aom_highbd_sad8x16_c, 8), + make_tuple(8, 8, &aom_highbd_sad8x8_c, 8), + make_tuple(8, 4, &aom_highbd_sad8x4_c, 8), + make_tuple(4, 8, &aom_highbd_sad4x8_c, 8), + make_tuple(4, 4, &aom_highbd_sad4x4_c, 8), + make_tuple(128, 128, &aom_highbd_sad128x128_c, 10), + make_tuple(128, 64, &aom_highbd_sad128x64_c, 10), + make_tuple(64, 128, &aom_highbd_sad64x128_c, 10), + make_tuple(64, 64, &aom_highbd_sad64x64_c, 10), + make_tuple(64, 32, &aom_highbd_sad64x32_c, 10), + make_tuple(32, 64, &aom_highbd_sad32x64_c, 10), + make_tuple(32, 32, &aom_highbd_sad32x32_c, 10), + make_tuple(32, 16, &aom_highbd_sad32x16_c, 10), + make_tuple(16, 32, &aom_highbd_sad16x32_c, 10), + make_tuple(16, 16, &aom_highbd_sad16x16_c, 10), + make_tuple(16, 8, &aom_highbd_sad16x8_c, 10), + make_tuple(8, 16, &aom_highbd_sad8x16_c, 10), + make_tuple(8, 8, &aom_highbd_sad8x8_c, 10), + make_tuple(8, 4, &aom_highbd_sad8x4_c, 10), + make_tuple(4, 8, &aom_highbd_sad4x8_c, 10), + make_tuple(4, 4, &aom_highbd_sad4x4_c, 10), + make_tuple(128, 128, &aom_highbd_sad128x128_c, 12), + make_tuple(128, 64, &aom_highbd_sad128x64_c, 12), + make_tuple(64, 128, &aom_highbd_sad64x128_c, 12), + make_tuple(64, 64, &aom_highbd_sad64x64_c, 12), + make_tuple(64, 32, &aom_highbd_sad64x32_c, 12), + make_tuple(32, 64, &aom_highbd_sad32x64_c, 12), + make_tuple(32, 32, &aom_highbd_sad32x32_c, 12), + make_tuple(32, 16, &aom_highbd_sad32x16_c, 12), + make_tuple(16, 32, &aom_highbd_sad16x32_c, 12), + make_tuple(16, 16, &aom_highbd_sad16x16_c, 12), + make_tuple(16, 8, &aom_highbd_sad16x8_c, 12), + make_tuple(8, 16, &aom_highbd_sad8x16_c, 12), + make_tuple(8, 8, &aom_highbd_sad8x8_c, 12), + make_tuple(8, 4, &aom_highbd_sad8x4_c, 12), + make_tuple(4, 8, &aom_highbd_sad4x8_c, 12), + make_tuple(4, 4, 
&aom_highbd_sad4x4_c, 12), +#endif // CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 16, &aom_sad64x16_c, -1), + make_tuple(16, 64, &aom_sad16x64_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 16, &aom_highbd_sad64x16_c, 8), + make_tuple(16, 64, &aom_highbd_sad16x64_c, 8), + make_tuple(64, 16, &aom_highbd_sad64x16_c, 10), + make_tuple(16, 64, &aom_highbd_sad16x64_c, 10), + make_tuple(64, 16, &aom_highbd_sad64x16_c, 12), + make_tuple(16, 64, &aom_highbd_sad16x64_c, 12), +#endif + make_tuple(32, 8, &aom_sad32x8_c, -1), + make_tuple(8, 32, &aom_sad8x32_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(32, 8, &aom_highbd_sad32x8_c, 8), + make_tuple(8, 32, &aom_highbd_sad8x32_c, 8), + make_tuple(32, 8, &aom_highbd_sad32x8_c, 10), + make_tuple(8, 32, &aom_highbd_sad8x32_c, 10), + make_tuple(32, 8, &aom_highbd_sad32x8_c, 12), + make_tuple(8, 32, &aom_highbd_sad8x32_c, 12), +#endif + make_tuple(16, 4, &aom_sad16x4_c, -1), + make_tuple(4, 16, &aom_sad4x16_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(16, 4, &aom_highbd_sad16x4_c, 8), + make_tuple(4, 16, &aom_highbd_sad4x16_c, 8), + make_tuple(16, 4, &aom_highbd_sad16x4_c, 10), + make_tuple(4, 16, &aom_highbd_sad4x16_c, 10), + make_tuple(16, 4, &aom_highbd_sad16x4_c, 12), + make_tuple(4, 16, &aom_highbd_sad4x16_c, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests)); + +const SadMxNAvgParam avg_c_tests[] = { + make_tuple(128, 128, &aom_sad128x128_avg_c, -1), + make_tuple(128, 64, &aom_sad128x64_avg_c, -1), + make_tuple(64, 128, &aom_sad64x128_avg_c, -1), + make_tuple(64, 64, &aom_sad64x64_avg_c, -1), + make_tuple(64, 32, &aom_sad64x32_avg_c, -1), + make_tuple(32, 64, &aom_sad32x64_avg_c, -1), + make_tuple(32, 32, &aom_sad32x32_avg_c, -1), + make_tuple(32, 16, &aom_sad32x16_avg_c, -1), + make_tuple(16, 32, &aom_sad16x32_avg_c, -1), + make_tuple(16, 16, &aom_sad16x16_avg_c, -1), + make_tuple(16, 8, &aom_sad16x8_avg_c, -1), + make_tuple(8, 16, &aom_sad8x16_avg_c, -1), + make_tuple(8, 8, &aom_sad8x8_avg_c, -1), + make_tuple(8, 4, &aom_sad8x4_avg_c, -1), + make_tuple(4, 8, &aom_sad4x8_avg_c, -1), + make_tuple(4, 4, &aom_sad4x4_avg_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 8), + make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 8), + make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 8), + make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 8), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 8), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 8), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_c, 8), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_c, 8), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_c, 8), + make_tuple(16, 16, &aom_highbd_sad16x16_avg_c, 8), + make_tuple(16, 8, &aom_highbd_sad16x8_avg_c, 8), + make_tuple(8, 16, &aom_highbd_sad8x16_avg_c, 8), + make_tuple(8, 8, &aom_highbd_sad8x8_avg_c, 8), + make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 8), + make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 8), + make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 8), + make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 10), + make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 10), + make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 10), + make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 10), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 10), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 10), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_c, 10), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_c, 10), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_c, 10), + make_tuple(16, 16, 
&aom_highbd_sad16x16_avg_c, 10), + make_tuple(16, 8, &aom_highbd_sad16x8_avg_c, 10), + make_tuple(8, 16, &aom_highbd_sad8x16_avg_c, 10), + make_tuple(8, 8, &aom_highbd_sad8x8_avg_c, 10), + make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 10), + make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 10), + make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 10), + make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 12), + make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 12), + make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 12), + make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 12), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 12), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 12), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_c, 12), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_c, 12), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_c, 12), + make_tuple(16, 16, &aom_highbd_sad16x16_avg_c, 12), + make_tuple(16, 8, &aom_highbd_sad16x8_avg_c, 12), + make_tuple(8, 16, &aom_highbd_sad8x16_avg_c, 12), + make_tuple(8, 8, &aom_highbd_sad8x8_avg_c, 12), + make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 12), + make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 12), + make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 12), +#endif // CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 16, &aom_sad64x16_avg_c, -1), + make_tuple(16, 64, &aom_sad16x64_avg_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 16, &aom_highbd_sad64x16_avg_c, 8), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_c, 8), + make_tuple(64, 16, &aom_highbd_sad64x16_avg_c, 10), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_c, 10), + make_tuple(64, 16, &aom_highbd_sad64x16_avg_c, 12), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_c, 12), +#endif + make_tuple(32, 8, &aom_sad32x8_avg_c, -1), + make_tuple(8, 32, &aom_sad8x32_avg_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(32, 8, &aom_highbd_sad32x8_avg_c, 8), + make_tuple(8, 32, &aom_highbd_sad8x32_avg_c, 8), + make_tuple(32, 8, &aom_highbd_sad32x8_avg_c, 10), + make_tuple(8, 32, &aom_highbd_sad8x32_avg_c, 10), + make_tuple(32, 8, &aom_highbd_sad32x8_avg_c, 12), + make_tuple(8, 32, &aom_highbd_sad8x32_avg_c, 12), +#endif + make_tuple(16, 4, &aom_sad16x4_avg_c, -1), + make_tuple(4, 16, &aom_sad4x16_avg_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(16, 4, &aom_highbd_sad16x4_avg_c, 8), + make_tuple(4, 16, &aom_highbd_sad4x16_avg_c, 8), + make_tuple(16, 4, &aom_highbd_sad16x4_avg_c, 10), + make_tuple(4, 16, &aom_highbd_sad4x16_avg_c, 10), + make_tuple(16, 4, &aom_highbd_sad16x4_avg_c, 12), + make_tuple(4, 16, &aom_highbd_sad4x16_avg_c, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests)); + +// TODO(chengchen): add highbd tests +const DistWtdCompAvgParam dist_wtd_comp_avg_c_tests[] = { + make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(4, 8, 
&aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_c, -1), + + make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_c, -1), +}; + +INSTANTIATE_TEST_SUITE_P(C, DistWtdCompAvgTest, + ::testing::ValuesIn(dist_wtd_comp_avg_c_tests)); + +const DistWtdSadMxNAvgParam dist_wtd_avg_c_tests[] = { + make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_c, -1), + make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_c, -1), + make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_c, -1), + make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_c, -1), + make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_c, -1), + make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_c, -1), + make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_c, -1), + make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_c, -1), + make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_c, -1), + make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_c, -1), + make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_c, -1), + make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_c, -1), + make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_c, -1), + make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_c, -1), + make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_c, -1), + make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_c, -1), + + make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_c, -1), + make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_c, -1), + make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_c, -1), + make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_c, -1), + make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_c, -1), + make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_c, -1), +}; + +INSTANTIATE_TEST_SUITE_P(C, DistWtdSADavgTest, + ::testing::ValuesIn(dist_wtd_avg_c_tests)); + +const SadMxNx4Param x4d_c_tests[] = { + make_tuple(128, 128, &aom_sad128x128x4d_c, -1), + make_tuple(128, 64, &aom_sad128x64x4d_c, -1), + make_tuple(64, 128, &aom_sad64x128x4d_c, -1), + make_tuple(64, 64, &aom_sad64x64x4d_c, -1), + make_tuple(64, 32, &aom_sad64x32x4d_c, -1), + make_tuple(32, 64, &aom_sad32x64x4d_c, -1), + make_tuple(32, 32, &aom_sad32x32x4d_c, -1), + make_tuple(32, 16, &aom_sad32x16x4d_c, -1), + make_tuple(16, 32, &aom_sad16x32x4d_c, -1), + make_tuple(16, 16, &aom_sad16x16x4d_c, -1), + make_tuple(16, 8, &aom_sad16x8x4d_c, -1), + make_tuple(8, 16, &aom_sad8x16x4d_c, -1), + make_tuple(8, 8, &aom_sad8x8x4d_c, -1), + make_tuple(8, 4, &aom_sad8x4x4d_c, -1), + make_tuple(4, 8, &aom_sad4x8x4d_c, -1), + make_tuple(4, 4, &aom_sad4x4x4d_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 8), + make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 8), + make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 8), + make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 8), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 8), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 8), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_c, 8), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_c, 8), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_c, 8), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_c, 8), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_c, 8), + make_tuple(8, 16, &aom_highbd_sad8x16x4d_c, 8), + make_tuple(8, 8, &aom_highbd_sad8x8x4d_c, 8), + make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 8), + make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 8), + make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 8), + make_tuple(128, 128, 
&aom_highbd_sad128x128x4d_c, 10), + make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 10), + make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 10), + make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 10), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 10), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 10), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_c, 10), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_c, 10), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_c, 10), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_c, 10), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_c, 10), + make_tuple(8, 16, &aom_highbd_sad8x16x4d_c, 10), + make_tuple(8, 8, &aom_highbd_sad8x8x4d_c, 10), + make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 10), + make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 10), + make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 10), + make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 12), + make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 12), + make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 12), + make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 12), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 12), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 12), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_c, 12), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_c, 12), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_c, 12), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_c, 12), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_c, 12), + make_tuple(8, 16, &aom_highbd_sad8x16x4d_c, 12), + make_tuple(8, 8, &aom_highbd_sad8x8x4d_c, 12), + make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 12), + make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 12), + make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 12), +#endif + make_tuple(64, 16, &aom_sad64x16x4d_c, -1), + make_tuple(16, 64, &aom_sad16x64x4d_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 16, &aom_highbd_sad64x16x4d_c, 8), + make_tuple(16, 64, &aom_highbd_sad16x64x4d_c, 8), + make_tuple(64, 16, &aom_highbd_sad64x16x4d_c, 10), + make_tuple(16, 64, &aom_highbd_sad16x64x4d_c, 10), + make_tuple(64, 16, &aom_highbd_sad64x16x4d_c, 12), + make_tuple(16, 64, &aom_highbd_sad16x64x4d_c, 12), +#endif + make_tuple(32, 8, &aom_sad32x8x4d_c, -1), + make_tuple(8, 32, &aom_sad8x32x4d_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(32, 8, &aom_highbd_sad32x8x4d_c, 8), + make_tuple(8, 32, &aom_highbd_sad8x32x4d_c, 8), + make_tuple(32, 8, &aom_highbd_sad32x8x4d_c, 10), + make_tuple(8, 32, &aom_highbd_sad8x32x4d_c, 10), + make_tuple(32, 8, &aom_highbd_sad32x8x4d_c, 12), + make_tuple(8, 32, &aom_highbd_sad8x32x4d_c, 12), +#endif + make_tuple(16, 4, &aom_sad16x4x4d_c, -1), + make_tuple(4, 16, &aom_sad4x16x4d_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(16, 4, &aom_highbd_sad16x4x4d_c, 8), + make_tuple(4, 16, &aom_highbd_sad4x16x4d_c, 8), + make_tuple(16, 4, &aom_highbd_sad16x4x4d_c, 10), + make_tuple(4, 16, &aom_highbd_sad4x16x4d_c, 10), + make_tuple(16, 4, &aom_highbd_sad16x4x4d_c, 12), + make_tuple(4, 16, &aom_highbd_sad4x16x4d_c, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); + +const SadMxNx4AvgParam x4d_avg_c_tests[] = { + make_tuple(128, 128, &aom_sad128x128x4d_avg_c, -1), + make_tuple(128, 64, &aom_sad128x64x4d_avg_c, -1), + make_tuple(64, 128, &aom_sad64x128x4d_avg_c, -1), + make_tuple(64, 64, &aom_sad64x64x4d_avg_c, -1), + make_tuple(64, 32, &aom_sad64x32x4d_avg_c, -1), + make_tuple(32, 64, &aom_sad32x64x4d_avg_c, -1), + make_tuple(32, 32, &aom_sad32x32x4d_avg_c, -1), + make_tuple(32, 16, &aom_sad32x16x4d_avg_c, -1), + make_tuple(16, 32, &aom_sad16x32x4d_avg_c, -1), + 
make_tuple(16, 16, &aom_sad16x16x4d_avg_c, -1), + make_tuple(16, 8, &aom_sad16x8x4d_avg_c, -1), + make_tuple(8, 16, &aom_sad8x16x4d_avg_c, -1), + make_tuple(8, 8, &aom_sad8x8x4d_avg_c, -1), + make_tuple(8, 4, &aom_sad8x4x4d_avg_c, -1), + make_tuple(4, 8, &aom_sad4x8x4d_avg_c, -1), + make_tuple(4, 4, &aom_sad4x4x4d_avg_c, -1), + make_tuple(64, 16, &aom_sad64x16x4d_avg_c, -1), + make_tuple(16, 64, &aom_sad16x64x4d_avg_c, -1), + make_tuple(32, 8, &aom_sad32x8x4d_avg_c, -1), + make_tuple(8, 32, &aom_sad8x32x4d_avg_c, -1), + make_tuple(16, 4, &aom_sad16x4x4d_avg_c, -1), + make_tuple(4, 16, &aom_sad4x16x4d_avg_c, -1), +}; +INSTANTIATE_TEST_SUITE_P(C, SADx4AvgTest, ::testing::ValuesIn(x4d_avg_c_tests)); + +//------------------------------------------------------------------------------ +// ARM functions +#if HAVE_NEON +const SadMxNParam neon_tests[] = { + make_tuple(64, 64, &aom_sad64x64_neon, -1), + make_tuple(32, 32, &aom_sad32x32_neon, -1), + make_tuple(16, 16, &aom_sad16x16_neon, -1), + make_tuple(16, 8, &aom_sad16x8_neon, -1), + make_tuple(8, 16, &aom_sad8x16_neon, -1), + make_tuple(8, 8, &aom_sad8x8_neon, -1), + make_tuple(4, 4, &aom_sad4x4_neon, -1), +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); + +const SadMxNx4Param x4d_neon_tests[] = { + make_tuple(64, 64, &aom_sad64x64x4d_neon, -1), + make_tuple(32, 32, &aom_sad32x32x4d_neon, -1), + make_tuple(16, 16, &aom_sad16x16x4d_neon, -1), +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); +#endif // HAVE_NEON + +//------------------------------------------------------------------------------ +// x86 functions +#if HAVE_SSE2 +const SadMxNParam sse2_tests[] = { + make_tuple(128, 128, &aom_sad128x128_sse2, -1), + make_tuple(128, 64, &aom_sad128x64_sse2, -1), + make_tuple(64, 128, &aom_sad64x128_sse2, -1), + make_tuple(64, 64, &aom_sad64x64_sse2, -1), + make_tuple(64, 32, &aom_sad64x32_sse2, -1), + make_tuple(32, 64, &aom_sad32x64_sse2, -1), + make_tuple(32, 32, &aom_sad32x32_sse2, -1), + make_tuple(32, 16, &aom_sad32x16_sse2, -1), + make_tuple(16, 32, &aom_sad16x32_sse2, -1), + make_tuple(16, 16, &aom_sad16x16_sse2, -1), + make_tuple(16, 8, &aom_sad16x8_sse2, -1), + make_tuple(8, 16, &aom_sad8x16_sse2, -1), + make_tuple(8, 8, &aom_sad8x8_sse2, -1), + make_tuple(8, 4, &aom_sad8x4_sse2, -1), + make_tuple(4, 8, &aom_sad4x8_sse2, -1), + make_tuple(4, 4, &aom_sad4x4_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 8), + make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 8), + make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 8), + make_tuple(32, 32, &aom_highbd_sad32x32_sse2, 8), + make_tuple(32, 16, &aom_highbd_sad32x16_sse2, 8), + make_tuple(16, 32, &aom_highbd_sad16x32_sse2, 8), + make_tuple(16, 16, &aom_highbd_sad16x16_sse2, 8), + make_tuple(16, 8, &aom_highbd_sad16x8_sse2, 8), + make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 8), + make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 8), + make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 8), + make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 8), + make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 8), + make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 10), + make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 10), + make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 10), + make_tuple(32, 32, &aom_highbd_sad32x32_sse2, 10), + make_tuple(32, 16, &aom_highbd_sad32x16_sse2, 10), + make_tuple(16, 32, &aom_highbd_sad16x32_sse2, 10), + make_tuple(16, 16, &aom_highbd_sad16x16_sse2, 10), + make_tuple(16, 8, &aom_highbd_sad16x8_sse2, 10), + make_tuple(8, 16, 
&aom_highbd_sad8x16_sse2, 10), + make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 10), + make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 10), + make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 10), + make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 10), + make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 12), + make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 12), + make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 12), + make_tuple(32, 32, &aom_highbd_sad32x32_sse2, 12), + make_tuple(32, 16, &aom_highbd_sad32x16_sse2, 12), + make_tuple(16, 32, &aom_highbd_sad16x32_sse2, 12), + make_tuple(16, 16, &aom_highbd_sad16x16_sse2, 12), + make_tuple(16, 8, &aom_highbd_sad16x8_sse2, 12), + make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 12), + make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 12), + make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 12), + make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 12), + make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 12), +#endif + make_tuple(64, 16, &aom_sad64x16_sse2, -1), + make_tuple(16, 64, &aom_sad16x64_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 16, &aom_highbd_sad64x16_sse2, 8), + make_tuple(16, 64, &aom_highbd_sad16x64_sse2, 8), + make_tuple(64, 16, &aom_highbd_sad64x16_sse2, 10), + make_tuple(16, 64, &aom_highbd_sad16x64_sse2, 10), + make_tuple(64, 16, &aom_highbd_sad64x16_sse2, 12), + make_tuple(16, 64, &aom_highbd_sad16x64_sse2, 12), +#endif + make_tuple(32, 8, &aom_sad32x8_sse2, -1), + make_tuple(8, 32, &aom_sad8x32_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(32, 8, &aom_highbd_sad32x8_sse2, 8), + make_tuple(8, 32, &aom_highbd_sad8x32_sse2, 8), + make_tuple(32, 8, &aom_highbd_sad32x8_sse2, 10), + make_tuple(8, 32, &aom_highbd_sad8x32_sse2, 10), + make_tuple(32, 8, &aom_highbd_sad32x8_sse2, 12), + make_tuple(8, 32, &aom_highbd_sad8x32_sse2, 12), +#endif + make_tuple(16, 4, &aom_sad16x4_sse2, -1), + make_tuple(4, 16, &aom_sad4x16_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 8), + make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 8), + make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 10), + make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 10), + make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 12), + make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); + +const SadMxNAvgParam avg_sse2_tests[] = { + make_tuple(128, 128, &aom_sad128x128_avg_sse2, -1), + make_tuple(128, 64, &aom_sad128x64_avg_sse2, -1), + make_tuple(64, 128, &aom_sad64x128_avg_sse2, -1), + make_tuple(64, 64, &aom_sad64x64_avg_sse2, -1), + make_tuple(64, 32, &aom_sad64x32_avg_sse2, -1), + make_tuple(32, 64, &aom_sad32x64_avg_sse2, -1), + make_tuple(32, 32, &aom_sad32x32_avg_sse2, -1), + make_tuple(32, 16, &aom_sad32x16_avg_sse2, -1), + make_tuple(16, 32, &aom_sad16x32_avg_sse2, -1), + make_tuple(16, 16, &aom_sad16x16_avg_sse2, -1), + make_tuple(16, 8, &aom_sad16x8_avg_sse2, -1), + make_tuple(8, 16, &aom_sad8x16_avg_sse2, -1), + make_tuple(8, 8, &aom_sad8x8_avg_sse2, -1), + make_tuple(8, 4, &aom_sad8x4_avg_sse2, -1), + make_tuple(4, 8, &aom_sad4x8_avg_sse2, -1), + make_tuple(4, 4, &aom_sad4x4_avg_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 8), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 8), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 8), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_sse2, 8), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_sse2, 8), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_sse2, 8), + make_tuple(16, 16, &aom_highbd_sad16x16_avg_sse2, 8), + 
make_tuple(16, 8, &aom_highbd_sad16x8_avg_sse2, 8), + make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 8), + make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 8), + make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 8), + make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 8), + make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 8), + make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 10), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 10), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 10), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_sse2, 10), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_sse2, 10), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_sse2, 10), + make_tuple(16, 16, &aom_highbd_sad16x16_avg_sse2, 10), + make_tuple(16, 8, &aom_highbd_sad16x8_avg_sse2, 10), + make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 10), + make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 10), + make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 10), + make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 10), + make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 10), + make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 12), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 12), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 12), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_sse2, 12), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_sse2, 12), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_sse2, 12), + make_tuple(16, 16, &aom_highbd_sad16x16_avg_sse2, 12), + make_tuple(16, 8, &aom_highbd_sad16x8_avg_sse2, 12), + make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 12), + make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 12), + make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 12), + make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 12), + make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 12), +#endif + make_tuple(64, 16, &aom_sad64x16_avg_sse2, -1), + make_tuple(16, 64, &aom_sad16x64_avg_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 16, &aom_highbd_sad64x16_avg_sse2, 8), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_sse2, 8), + make_tuple(64, 16, &aom_highbd_sad64x16_avg_sse2, 10), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_sse2, 10), + make_tuple(64, 16, &aom_highbd_sad64x16_avg_sse2, 12), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_sse2, 12), +#endif + make_tuple(32, 8, &aom_sad32x8_avg_sse2, -1), + make_tuple(8, 32, &aom_sad8x32_avg_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(32, 8, &aom_highbd_sad32x8_avg_sse2, 8), + make_tuple(8, 32, &aom_highbd_sad8x32_avg_sse2, 8), + make_tuple(32, 8, &aom_highbd_sad32x8_avg_sse2, 10), + make_tuple(8, 32, &aom_highbd_sad8x32_avg_sse2, 10), + make_tuple(32, 8, &aom_highbd_sad32x8_avg_sse2, 12), + make_tuple(8, 32, &aom_highbd_sad8x32_avg_sse2, 12), +#endif + make_tuple(16, 4, &aom_sad16x4_avg_sse2, -1), + make_tuple(4, 16, &aom_sad4x16_avg_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 8), + make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 8), + make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 10), + make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 10), + make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 12), + make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests)); + +const SadMxNx4Param x4d_sse2_tests[] = { + make_tuple(128, 128, &aom_sad128x128x4d_sse2, -1), + make_tuple(128, 64, &aom_sad128x64x4d_sse2, -1), + make_tuple(64, 128, &aom_sad64x128x4d_sse2, -1), + make_tuple(64, 64, &aom_sad64x64x4d_sse2, -1), + make_tuple(64, 32, &aom_sad64x32x4d_sse2, -1), + 
make_tuple(32, 64, &aom_sad32x64x4d_sse2, -1), + make_tuple(32, 32, &aom_sad32x32x4d_sse2, -1), + make_tuple(32, 16, &aom_sad32x16x4d_sse2, -1), + make_tuple(16, 32, &aom_sad16x32x4d_sse2, -1), + make_tuple(16, 16, &aom_sad16x16x4d_sse2, -1), + make_tuple(16, 8, &aom_sad16x8x4d_sse2, -1), + make_tuple(8, 16, &aom_sad8x16x4d_sse2, -1), + make_tuple(8, 8, &aom_sad8x8x4d_sse2, -1), + make_tuple(8, 4, &aom_sad8x4x4d_sse2, -1), + make_tuple(4, 8, &aom_sad4x8x4d_sse2, -1), + make_tuple(4, 4, &aom_sad4x4x4d_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 64, &aom_highbd_sad64x64x4d_sse2, 8), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_sse2, 8), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_sse2, 8), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_sse2, 8), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_sse2, 8), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_sse2, 8), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_sse2, 8), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_sse2, 8), + make_tuple(8, 16, &aom_highbd_sad8x16x4d_sse2, 8), + make_tuple(8, 8, &aom_highbd_sad8x8x4d_sse2, 8), + make_tuple(8, 4, &aom_highbd_sad8x4x4d_sse2, 8), + make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 8), + make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 8), + make_tuple(64, 64, &aom_highbd_sad64x64x4d_sse2, 10), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_sse2, 10), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_sse2, 10), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_sse2, 10), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_sse2, 10), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_sse2, 10), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_sse2, 10), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_sse2, 10), + make_tuple(8, 16, &aom_highbd_sad8x16x4d_sse2, 10), + make_tuple(8, 8, &aom_highbd_sad8x8x4d_sse2, 10), + make_tuple(8, 4, &aom_highbd_sad8x4x4d_sse2, 10), + make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 10), + make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 10), + make_tuple(64, 64, &aom_highbd_sad64x64x4d_sse2, 12), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_sse2, 12), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_sse2, 12), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_sse2, 12), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_sse2, 12), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_sse2, 12), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_sse2, 12), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_sse2, 12), + make_tuple(8, 16, &aom_highbd_sad8x16x4d_sse2, 12), + make_tuple(8, 8, &aom_highbd_sad8x8x4d_sse2, 12), + make_tuple(8, 4, &aom_highbd_sad8x4x4d_sse2, 12), + make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 12), + make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 12), +#endif + make_tuple(64, 16, &aom_sad64x16x4d_sse2, -1), + make_tuple(16, 64, &aom_sad16x64x4d_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 16, &aom_highbd_sad64x16x4d_sse2, 8), + make_tuple(16, 64, &aom_highbd_sad16x64x4d_sse2, 8), + make_tuple(64, 16, &aom_highbd_sad64x16x4d_sse2, 10), + make_tuple(16, 64, &aom_highbd_sad16x64x4d_sse2, 10), + make_tuple(64, 16, &aom_highbd_sad64x16x4d_sse2, 12), + make_tuple(16, 64, &aom_highbd_sad16x64x4d_sse2, 12), +#endif + make_tuple(32, 8, &aom_sad32x8x4d_sse2, -1), + make_tuple(8, 32, &aom_sad8x32x4d_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(32, 8, &aom_highbd_sad32x8x4d_sse2, 8), + make_tuple(8, 32, &aom_highbd_sad8x32x4d_sse2, 8), + make_tuple(32, 8, &aom_highbd_sad32x8x4d_sse2, 10), + make_tuple(8, 32, &aom_highbd_sad8x32x4d_sse2, 10), + make_tuple(32, 8, &aom_highbd_sad32x8x4d_sse2, 12), + make_tuple(8, 32, 
&aom_highbd_sad8x32x4d_sse2, 12), +#endif + make_tuple(16, 4, &aom_sad16x4x4d_sse2, -1), + make_tuple(4, 16, &aom_sad4x16x4d_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(16, 4, &aom_highbd_sad16x4x4d_sse2, 8), + make_tuple(4, 16, &aom_highbd_sad4x16x4d_sse2, 8), + make_tuple(16, 4, &aom_highbd_sad16x4x4d_sse2, 10), + make_tuple(4, 16, &aom_highbd_sad4x16x4d_sse2, 10), + make_tuple(16, 4, &aom_highbd_sad16x4x4d_sse2, 12), + make_tuple(4, 16, &aom_highbd_sad4x16x4d_sse2, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); + +const SadMxNx4AvgParam x4d_avg_sse2_tests[] = { + make_tuple(128, 128, &aom_sad128x128x4d_avg_sse2, -1), + make_tuple(128, 64, &aom_sad128x64x4d_avg_sse2, -1), + make_tuple(64, 128, &aom_sad64x128x4d_avg_sse2, -1), + make_tuple(64, 64, &aom_sad64x64x4d_avg_sse2, -1), + make_tuple(64, 32, &aom_sad64x32x4d_avg_sse2, -1), + make_tuple(32, 64, &aom_sad32x64x4d_avg_sse2, -1), + make_tuple(32, 32, &aom_sad32x32x4d_avg_sse2, -1), + make_tuple(32, 16, &aom_sad32x16x4d_avg_sse2, -1), + make_tuple(16, 32, &aom_sad16x32x4d_avg_sse2, -1), + make_tuple(16, 16, &aom_sad16x16x4d_avg_sse2, -1), + make_tuple(16, 8, &aom_sad16x8x4d_avg_sse2, -1), + make_tuple(8, 16, &aom_sad8x16x4d_avg_sse2, -1), + make_tuple(8, 8, &aom_sad8x8x4d_avg_sse2, -1), + make_tuple(8, 4, &aom_sad8x4x4d_avg_sse2, -1), + make_tuple(4, 8, &aom_sad4x8x4d_avg_sse2, -1), + make_tuple(4, 4, &aom_sad4x4x4d_avg_sse2, -1), + make_tuple(64, 16, &aom_sad64x16x4d_avg_sse2, -1), + make_tuple(16, 64, &aom_sad16x64x4d_avg_sse2, -1), + make_tuple(32, 8, &aom_sad32x8x4d_avg_sse2, -1), + make_tuple(8, 32, &aom_sad8x32x4d_avg_sse2, -1), + make_tuple(16, 4, &aom_sad16x4x4d_avg_sse2, -1), + make_tuple(4, 16, &aom_sad4x16x4d_avg_sse2, -1), +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADx4AvgTest, + ::testing::ValuesIn(x4d_avg_sse2_tests)); +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +// Note: These are named sse2, but part of ssse3 file and only built and linked +// when ssse3 is enabled. +const DistWtdSadMxhParam dist_wtd_sad_sse2_tests[] = { + make_tuple(4, 4, &aom_sad4xh_sse2, -1), + make_tuple(4, 8, &aom_sad4xh_sse2, -1), + make_tuple(8, 4, &aom_sad8xh_sse2, -1), + make_tuple(8, 8, &aom_sad8xh_sse2, -1), + make_tuple(8, 16, &aom_sad8xh_sse2, -1), + make_tuple(16, 8, &aom_sad16xh_sse2, -1), + make_tuple(16, 16, &aom_sad16xh_sse2, -1), + make_tuple(16, 32, &aom_sad16xh_sse2, -1), + make_tuple(32, 16, &aom_sad32xh_sse2, -1), + make_tuple(32, 32, &aom_sad32xh_sse2, -1), + make_tuple(32, 64, &aom_sad32xh_sse2, -1), + make_tuple(64, 32, &aom_sad64xh_sse2, -1), + make_tuple(64, 64, &aom_sad64xh_sse2, -1), + make_tuple(128, 128, &aom_sad128xh_sse2, -1), + make_tuple(128, 64, &aom_sad128xh_sse2, -1), + make_tuple(64, 128, &aom_sad64xh_sse2, -1), + make_tuple(4, 16, &aom_sad4xh_sse2, -1), + make_tuple(16, 4, &aom_sad16xh_sse2, -1), + make_tuple(8, 32, &aom_sad8xh_sse2, -1), + make_tuple(32, 8, &aom_sad32xh_sse2, -1), + make_tuple(16, 64, &aom_sad16xh_sse2, -1), + make_tuple(64, 16, &aom_sad64xh_sse2, -1), + + make_tuple(16, 64, &aom_sad16xh_sse2, -1), + make_tuple(64, 16, &aom_sad64xh_sse2, -1), + make_tuple(8, 32, &aom_sad8xh_sse2, -1), + make_tuple(32, 8, &aom_sad32xh_sse2, -1), + make_tuple(4, 16, &aom_sad4xh_sse2, -1), + make_tuple(16, 4, &aom_sad16xh_sse2, -1), +}; +INSTANTIATE_TEST_SUITE_P(SSE2, DistWtdSADTest, + ::testing::ValuesIn(dist_wtd_sad_sse2_tests)); + +#endif // HAVE_SSSE3 + +#if HAVE_SSE3 +// Only functions are x3, which do not have tests. 
+#endif // HAVE_SSE3 + +#if HAVE_SSSE3 +const DistWtdCompAvgParam dist_wtd_comp_avg_ssse3_tests[] = { + make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + + make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1), +}; + +INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdCompAvgTest, + ::testing::ValuesIn(dist_wtd_comp_avg_ssse3_tests)); + +const DistWtdSadMxNAvgParam dist_wtd_avg_ssse3_tests[] = { + make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_ssse3, -1), + make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_ssse3, -1), + make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_ssse3, -1), + make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_ssse3, -1), + make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_ssse3, -1), + make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_ssse3, -1), + make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_ssse3, -1), + make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_ssse3, -1), + make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_ssse3, -1), + make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_ssse3, -1), + make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_ssse3, -1), + make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_ssse3, -1), + make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_ssse3, -1), + make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_ssse3, -1), + make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_ssse3, -1), + make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_ssse3, -1), + + make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_ssse3, -1), + make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_ssse3, -1), + make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_ssse3, -1), + make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_ssse3, -1), + make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_ssse3, -1), + make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_ssse3, -1), +}; +INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdSADavgTest, + ::testing::ValuesIn(dist_wtd_avg_ssse3_tests)); +#endif // HAVE_SSSE3 + +#if HAVE_SSE4_1 +// Only functions are x8, which do not have tests. 
+#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +const SadMxNParam avx2_tests[] = { + make_tuple(64, 128, &aom_sad64x128_avx2, -1), + make_tuple(128, 64, &aom_sad128x64_avx2, -1), + make_tuple(128, 128, &aom_sad128x128_avx2, -1), + make_tuple(64, 64, &aom_sad64x64_avx2, -1), + make_tuple(64, 32, &aom_sad64x32_avx2, -1), + make_tuple(32, 64, &aom_sad32x64_avx2, -1), + make_tuple(32, 32, &aom_sad32x32_avx2, -1), + make_tuple(32, 16, &aom_sad32x16_avx2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 8), + make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 10), + make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 12), + make_tuple(128, 64, &aom_highbd_sad128x64_avx2, 8), + make_tuple(128, 64, &aom_highbd_sad128x64_avx2, 10), + make_tuple(128, 64, &aom_highbd_sad128x64_avx2, 12), + make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 8), + make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 10), + make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 12), + make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 8), + make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 10), + make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 12), + make_tuple(64, 32, &aom_highbd_sad64x32_avx2, 8), + make_tuple(64, 32, &aom_highbd_sad64x32_avx2, 10), + make_tuple(64, 32, &aom_highbd_sad64x32_avx2, 12), + make_tuple(32, 64, &aom_highbd_sad32x64_avx2, 8), + make_tuple(32, 64, &aom_highbd_sad32x64_avx2, 10), + make_tuple(32, 64, &aom_highbd_sad32x64_avx2, 12), + make_tuple(32, 32, &aom_highbd_sad32x32_avx2, 8), + make_tuple(32, 32, &aom_highbd_sad32x32_avx2, 10), + make_tuple(32, 32, &aom_highbd_sad32x32_avx2, 12), + make_tuple(32, 16, &aom_highbd_sad32x16_avx2, 8), + make_tuple(32, 16, &aom_highbd_sad32x16_avx2, 10), + make_tuple(32, 16, &aom_highbd_sad32x16_avx2, 12), + make_tuple(16, 32, &aom_highbd_sad16x32_avx2, 8), + make_tuple(16, 32, &aom_highbd_sad16x32_avx2, 10), + make_tuple(16, 32, &aom_highbd_sad16x32_avx2, 12), + make_tuple(16, 16, &aom_highbd_sad16x16_avx2, 8), + make_tuple(16, 16, &aom_highbd_sad16x16_avx2, 10), + make_tuple(16, 16, &aom_highbd_sad16x16_avx2, 12), + make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 8), + make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 10), + make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 12), + + make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 8), + make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 10), + make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 12), + make_tuple(16, 64, &aom_highbd_sad16x64_avx2, 8), + make_tuple(16, 64, &aom_highbd_sad16x64_avx2, 10), + make_tuple(16, 64, &aom_highbd_sad16x64_avx2, 12), + make_tuple(32, 8, &aom_highbd_sad32x8_avx2, 8), + make_tuple(32, 8, &aom_highbd_sad32x8_avx2, 10), + make_tuple(32, 8, &aom_highbd_sad32x8_avx2, 12), + make_tuple(16, 4, &aom_highbd_sad16x4_avx2, 8), + make_tuple(16, 4, &aom_highbd_sad16x4_avx2, 10), + make_tuple(16, 4, &aom_highbd_sad16x4_avx2, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests)); + +const SadMxNAvgParam avg_avx2_tests[] = { + make_tuple(64, 128, &aom_sad64x128_avg_avx2, -1), + make_tuple(128, 64, &aom_sad128x64_avg_avx2, -1), + make_tuple(128, 128, &aom_sad128x128_avg_avx2, -1), + make_tuple(64, 64, &aom_sad64x64_avg_avx2, -1), + make_tuple(64, 32, &aom_sad64x32_avg_avx2, -1), + make_tuple(32, 64, &aom_sad32x64_avg_avx2, -1), + make_tuple(32, 32, &aom_sad32x32_avg_avx2, -1), + make_tuple(32, 16, &aom_sad32x16_avg_avx2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 8), + make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 10), + 
make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 12), + make_tuple(128, 64, &aom_highbd_sad128x64_avg_avx2, 8), + make_tuple(128, 64, &aom_highbd_sad128x64_avg_avx2, 10), + make_tuple(128, 64, &aom_highbd_sad128x64_avg_avx2, 12), + make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 8), + make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 10), + make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 12), + make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 8), + make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 10), + make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 12), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_avx2, 8), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_avx2, 10), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_avx2, 12), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_avx2, 8), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_avx2, 10), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_avx2, 12), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_avx2, 8), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_avx2, 10), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_avx2, 12), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_avx2, 8), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_avx2, 10), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_avx2, 12), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_avx2, 8), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_avx2, 10), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_avx2, 12), + make_tuple(16, 16, &aom_highbd_sad16x16_avg_avx2, 8), + make_tuple(16, 16, &aom_highbd_sad16x16_avg_avx2, 10), + make_tuple(16, 16, &aom_highbd_sad16x16_avg_avx2, 12), + make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 8), + make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 10), + make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 12), + + make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 8), + make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 10), + make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 12), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_avx2, 8), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_avx2, 10), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_avx2, 12), + make_tuple(32, 8, &aom_highbd_sad32x8_avg_avx2, 8), + make_tuple(32, 8, &aom_highbd_sad32x8_avg_avx2, 10), + make_tuple(32, 8, &aom_highbd_sad32x8_avg_avx2, 12), + make_tuple(16, 4, &aom_highbd_sad16x4_avg_avx2, 8), + make_tuple(16, 4, &aom_highbd_sad16x4_avg_avx2, 10), + make_tuple(16, 4, &aom_highbd_sad16x4_avg_avx2, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests)); + +const SadMxNx4Param x4d_avx2_tests[] = { + make_tuple(32, 64, &aom_sad32x64x4d_avx2, -1), + make_tuple(32, 32, &aom_sad32x32x4d_avx2, -1), + make_tuple(32, 16, &aom_sad32x16x4d_avx2, -1), + make_tuple(32, 8, &aom_sad32x8x4d_avx2, -1), + make_tuple(64, 128, &aom_sad64x128x4d_avx2, -1), + make_tuple(64, 64, &aom_sad64x64x4d_avx2, -1), + make_tuple(64, 32, &aom_sad64x32x4d_avx2, -1), + make_tuple(64, 16, &aom_sad64x16x4d_avx2, -1), + make_tuple(128, 128, &aom_sad128x128x4d_avx2, -1), + make_tuple(128, 64, &aom_sad128x64x4d_avx2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 8), + make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 10), + make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 12), + make_tuple(128, 64, &aom_highbd_sad128x64x4d_avx2, 8), + make_tuple(128, 64, &aom_highbd_sad128x64x4d_avx2, 10), + make_tuple(128, 64, &aom_highbd_sad128x64x4d_avx2, 12), + make_tuple(64, 128, &aom_highbd_sad64x128x4d_avx2, 8), + make_tuple(64, 128, 
&aom_highbd_sad64x128x4d_avx2, 10), + make_tuple(64, 128, &aom_highbd_sad64x128x4d_avx2, 12), + make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 8), + make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 10), + make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 12), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_avx2, 8), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_avx2, 10), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_avx2, 12), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_avx2, 8), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_avx2, 10), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_avx2, 12), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_avx2, 8), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_avx2, 10), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_avx2, 12), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_avx2, 8), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_avx2, 10), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_avx2, 12), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_avx2, 8), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_avx2, 10), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_avx2, 12), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_avx2, 8), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_avx2, 10), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_avx2, 12), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 8), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 10), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 12), + + make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 8), + make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 10), + make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 12), + make_tuple(64, 16, &aom_highbd_sad64x16x4d_avx2, 8), + make_tuple(64, 16, &aom_highbd_sad64x16x4d_avx2, 10), + make_tuple(64, 16, &aom_highbd_sad64x16x4d_avx2, 12), + make_tuple(32, 8, &aom_highbd_sad32x8x4d_avx2, 8), + make_tuple(32, 8, &aom_highbd_sad32x8x4d_avx2, 10), + make_tuple(32, 8, &aom_highbd_sad32x8x4d_avx2, 12), + make_tuple(16, 4, &aom_highbd_sad16x4x4d_avx2, 8), + make_tuple(16, 4, &aom_highbd_sad16x4x4d_avx2, 10), + make_tuple(16, 4, &aom_highbd_sad16x4x4d_avx2, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); +#endif // HAVE_AVX2 + +//------------------------------------------------------------------------------ +// MIPS functions +#if HAVE_MSA +const SadMxNParam msa_tests[] = { + make_tuple(64, 64, &aom_sad64x64_msa, -1), + make_tuple(64, 32, &aom_sad64x32_msa, -1), + make_tuple(32, 64, &aom_sad32x64_msa, -1), + make_tuple(32, 32, &aom_sad32x32_msa, -1), + make_tuple(32, 16, &aom_sad32x16_msa, -1), + make_tuple(16, 32, &aom_sad16x32_msa, -1), + make_tuple(16, 16, &aom_sad16x16_msa, -1), + make_tuple(16, 8, &aom_sad16x8_msa, -1), + make_tuple(8, 16, &aom_sad8x16_msa, -1), + make_tuple(8, 8, &aom_sad8x8_msa, -1), + make_tuple(8, 4, &aom_sad8x4_msa, -1), + make_tuple(4, 8, &aom_sad4x8_msa, -1), + make_tuple(4, 4, &aom_sad4x4_msa, -1), +}; +INSTANTIATE_TEST_SUITE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests)); + +const SadMxNAvgParam avg_msa_tests[] = { + make_tuple(64, 64, &aom_sad64x64_avg_msa, -1), + make_tuple(64, 32, &aom_sad64x32_avg_msa, -1), + make_tuple(32, 64, &aom_sad32x64_avg_msa, -1), + make_tuple(32, 32, &aom_sad32x32_avg_msa, -1), + make_tuple(32, 16, &aom_sad32x16_avg_msa, -1), + make_tuple(16, 32, &aom_sad16x32_avg_msa, -1), + make_tuple(16, 16, &aom_sad16x16_avg_msa, -1), + make_tuple(16, 8, &aom_sad16x8_avg_msa, -1), + make_tuple(8, 16, &aom_sad8x16_avg_msa, -1), + make_tuple(8, 8, &aom_sad8x8_avg_msa, -1), + make_tuple(8, 4, 
&aom_sad8x4_avg_msa, -1),
+  make_tuple(4, 8, &aom_sad4x8_avg_msa, -1),
+  make_tuple(4, 4, &aom_sad4x4_avg_msa, -1),
+};
+INSTANTIATE_TEST_SUITE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests));
+
+const SadMxNx4Param x4d_msa_tests[] = {
+  make_tuple(64, 64, &aom_sad64x64x4d_msa, -1),
+  make_tuple(64, 32, &aom_sad64x32x4d_msa, -1),
+  make_tuple(32, 64, &aom_sad32x64x4d_msa, -1),
+  make_tuple(32, 32, &aom_sad32x32x4d_msa, -1),
+  make_tuple(32, 16, &aom_sad32x16x4d_msa, -1),
+  make_tuple(16, 32, &aom_sad16x32x4d_msa, -1),
+  make_tuple(16, 16, &aom_sad16x16x4d_msa, -1),
+  make_tuple(16, 8, &aom_sad16x8x4d_msa, -1),
+  make_tuple(8, 16, &aom_sad8x16x4d_msa, -1),
+  make_tuple(8, 8, &aom_sad8x8x4d_msa, -1),
+  make_tuple(8, 4, &aom_sad8x4x4d_msa, -1),
+  make_tuple(4, 8, &aom_sad4x8x4d_msa, -1),
+  make_tuple(4, 4, &aom_sad4x4x4d_msa, -1),
+};
+INSTANTIATE_TEST_SUITE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests));
+#endif  // HAVE_MSA
+
+}  // namespace
diff --git a/libs/libaom/src/test/sb_multipass_test.cc b/libs/libaom/src/test/sb_multipass_test.cc
new file mode 100644
index 000000000..0ca76ab85
--- /dev/null
+++ b/libs/libaom/src/test/sb_multipass_test.cc
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <initializer_list>
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+class AV1SBMultipassTest
+    : public ::libaom_test::CodecTestWith2Params<int, bool>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  AV1SBMultipassTest()
+      : EncoderTest(GET_PARAM(0)), set_cpu_used_(GET_PARAM(1)),
+        row_mt_(GET_PARAM(2)) {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.w = 1280;
+    cfg.h = 720;
+    cfg.allow_lowbitdepth = 1;
+    decoder_ = codec_->CreateDecoder(cfg, 0);
+    if (decoder_->IsAV1()) {
+      decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+      decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+    }
+
+    size_enc_.clear();
+    md5_dec_.clear();
+    md5_enc_.clear();
+  }
+  virtual ~AV1SBMultipassTest() { delete decoder_; }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libaom_test::kTwoPassGood);
+
+    cfg_.g_lag_in_frames = 5;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_2pass_vbr_minsection_pct = 5;
+    cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      SetTileSize(encoder);
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, use_multipass_);
+      encoder->Control(AV1E_SET_ROW_MT, row_mt_);
+
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  virtual void SetTileSize(libaom_test::Encoder *encoder) {
+    encoder->Control(AV1E_SET_TILE_COLUMNS, 1);
+    encoder->Control(AV1E_SET_TILE_ROWS, 1);
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    size_enc_.push_back(pkt->data.frame.sz);
+
+    ::libaom_test::MD5 md5_enc;
+    md5_enc.Add(reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
+                pkt->data.frame.sz);
+    md5_enc_.push_back(md5_enc.Get());
+
+    const aom_codec_err_t res = decoder_->DecodeFrame(
+        reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+    if (res != AOM_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(AOM_CODEC_OK, res);
+    }
+    const aom_image_t *img = decoder_->GetDxData().Next();
+
+    if (img) {
+      ::libaom_test::MD5 md5_res;
+      md5_res.Add(img);
+      md5_dec_.push_back(md5_res.Get());
+    }
+  }
+
+  void DoTest() {
+    ::libaom_test::YUVVideoSource video(
+        "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, 640, 480, 30, 1, 0, 6);
+    cfg_.rc_target_bitrate = 1000;
+
+    // Encode while coding each sb once
+    use_multipass_ = false;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    std::vector<size_t> single_pass_size_enc;
+    std::vector<std::string> single_pass_md5_enc;
+    std::vector<std::string> single_pass_md5_dec;
+    single_pass_size_enc = size_enc_;
+    single_pass_md5_enc = md5_enc_;
+    single_pass_md5_dec = md5_dec_;
+    size_enc_.clear();
+    md5_enc_.clear();
+    md5_dec_.clear();
+
+    // Encode while coding each sb twice
+    use_multipass_ = true;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    std::vector<size_t> multi_pass_size_enc;
+    std::vector<std::string> multi_pass_md5_enc;
+    std::vector<std::string> multi_pass_md5_dec;
+    multi_pass_size_enc = size_enc_;
+    multi_pass_md5_enc = md5_enc_;
+    multi_pass_md5_dec = md5_dec_;
+    size_enc_.clear();
+    md5_enc_.clear();
+    md5_dec_.clear();
+
+    // Check that the vectors are equal.
+    ASSERT_EQ(single_pass_size_enc, multi_pass_size_enc);
+    ASSERT_EQ(single_pass_md5_enc, multi_pass_md5_enc);
+    ASSERT_EQ(single_pass_md5_dec, multi_pass_md5_dec);
+  }
+
+  bool use_multipass_;
+  int set_cpu_used_;
+  bool row_mt_;
+  ::libaom_test::Decoder *decoder_;
+  std::vector<size_t> size_enc_;
+  std::vector<std::string> md5_enc_;
+  std::vector<std::string> md5_dec_;
+};
+
+TEST_P(AV1SBMultipassTest, TwoPassMatchTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(AV1SBMultipassTest, ::testing::Range(0, 6),
+                          ::testing::Bool());
+
+}  // namespace
diff --git a/libs/libaom/src/test/scalability_test.cc b/libs/libaom/src/test/scalability_test.cc
new file mode 100644
index 000000000..b39918861
--- /dev/null
+++ b/libs/libaom/src/test/scalability_test.cc
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +const int kCpuUsed = 8; +const int kBaseLayerQp = 55; +const int kEnhancementLayerQp = 20; + +class ScalabilityTest + : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>, + public ::libaom_test::EncoderTest { + protected: + ScalabilityTest() : EncoderTest(GET_PARAM(0)) {} + virtual ~ScalabilityTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(GET_PARAM(1)); + num_spatial_layers_ = 2; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, kCpuUsed); + encoder->Control(AOME_SET_NUMBER_SPATIAL_LAYERS, num_spatial_layers_); + } else if (video->frame() % num_spatial_layers_) { + frame_flags_ = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | + AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY; + encoder->Control(AOME_SET_SPATIAL_LAYER_ID, 1); + encoder->Control(AOME_SET_CQ_LEVEL, kEnhancementLayerQp); + } else { + frame_flags_ = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF | + AOM_EFLAG_NO_UPD_ENTROPY; + encoder->Control(AOME_SET_SPATIAL_LAYER_ID, 0); + encoder->Control(AOME_SET_CQ_LEVEL, kBaseLayerQp); + } + } + + void DoTest(int num_spatial_layers) { + num_spatial_layers_ = num_spatial_layers; + cfg_.rc_end_usage = AOM_Q; + cfg_.g_lag_in_frames = 0; + + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 18); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + int num_spatial_layers_; +}; + +TEST_P(ScalabilityTest, TestNoMismatch2SpatialLayers) { DoTest(2); } + +TEST_P(ScalabilityTest, TestNoMismatch3SpatialLayers) { DoTest(3); } + +AV1_INSTANTIATE_TEST_CASE(ScalabilityTest, + ::testing::Values(::libaom_test::kRealTime)); + +} // namespace diff --git a/libs/libaom/src/test/scan_test.cc b/libs/libaom/src/test/scan_test.cc new file mode 100644 index 000000000..dee2ab5a6 --- /dev/null +++ b/libs/libaom/src/test/scan_test.cc @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "av1/common/scan.h" +#include "av1/common/txb_common.h" +#include "test/av1_txfm_test.h" + +static int scan_test(const int16_t *scan, const int16_t *iscan, int si, int r, + int c, int w) { + if (iscan[r * w + c] != si || scan[si] != r * w + c) { + printf("r %d c %d ref_iscan %d iscan %d ref_scan %d scan %d\n", r, c, si, + iscan[r * w + c], r * w + c, scan[si]); + return 1; + } else { + return 0; + } +} + +int scan_order_test(const SCAN_ORDER *scan_order, int w, int h, + SCAN_MODE mode) { + const int16_t *scan = scan_order->scan; + const int16_t *iscan = scan_order->iscan; + int dim = w + h - 1; + if (mode == SCAN_MODE_ZIG_ZAG) { + int si = 0; + for (int i = 0; i < dim; ++i) { + if (i % 2 == 0) { + for (int c = 0; c < w; ++c) { + int r = i - c; + if (r >= 0 && r < h) { + if (scan_test(scan, iscan, si, r, c, w)) return 1; + ++si; + } + } + } else { + for (int r = 0; r < h; ++r) { + int c = i - r; + if (c >= 0 && c < w) { + if (scan_test(scan, iscan, si, r, c, w)) return 1; + ++si; + } + } + } + } + } else if (mode == SCAN_MODE_COL_DIAG) { + int si = 0; + for (int i = 0; i < dim; ++i) { + for (int c = 0; c < w; ++c) { + int r = i - c; + if (r >= 0 && r < h) { + if (scan_test(scan, iscan, si, r, c, w)) return 1; + ++si; + } + } + } + } else if (mode == SCAN_MODE_ROW_DIAG) { + int si = 0; + for (int i = 0; i < dim; ++i) { + for (int r = 0; r < h; ++r) { + int c = i - r; + if (c >= 0 && c < w) { + if (scan_test(scan, iscan, si, r, c, w)) return 1; + ++si; + } + } + } + } else if (mode == SCAN_MODE_ROW_1D) { + int si = 0; + for (int r = 0; r < h; ++r) { + for (int c = 0; c < w; ++c) { + if (scan_test(scan, iscan, si, r, c, w)) return 1; + ++si; + } + } + } else { + assert(mode == SCAN_MODE_COL_1D); + int si = 0; + for (int c = 0; c < w; ++c) { + for (int r = 0; r < h; ++r) { + if (scan_test(scan, iscan, si, r, c, w)) return 1; + ++si; + } + } + } + return 0; +} + +TEST(Av1ScanTest, Dependency) { + for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) { + const int org_rows = tx_size_high[(TX_SIZE)tx_size]; + const int org_cols = tx_size_wide[(TX_SIZE)tx_size]; + const int rows = get_txb_high((TX_SIZE)tx_size); + const int cols = get_txb_wide((TX_SIZE)tx_size); + for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) { + if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(tx_size), + static_cast<TX_TYPE>(tx_type)) == + false) { + continue; + } + SCAN_MODE scan_mode; + TX_CLASS tx_class = tx_type_to_class[(TX_TYPE)tx_type]; + if (tx_class == TX_CLASS_2D) { + if (rows == cols) { + scan_mode = SCAN_MODE_ZIG_ZAG; + } else if (rows > cols) { + scan_mode = SCAN_MODE_ROW_DIAG; + } else { + scan_mode = SCAN_MODE_COL_DIAG; + } + } else if (tx_class == TX_CLASS_VERT) { + scan_mode = SCAN_MODE_ROW_1D; + } else { + assert(tx_class == TX_CLASS_HORIZ); + scan_mode = SCAN_MODE_COL_1D; + } + const SCAN_ORDER *scan_order = + get_default_scan((TX_SIZE)tx_size, (TX_TYPE)tx_type); + ASSERT_EQ(scan_order_test(scan_order, cols, rows, scan_mode), 0) + << "scan mismatch tx_class " << tx_class << " tx_type " << tx_type + << " tx_w " << org_cols << " tx_h " << org_rows << " scan_mode " + << scan_mode << "\n"; + } + } +} diff --git a/libs/libaom/src/test/segment_binarization_sync.cc b/libs/libaom/src/test/segment_binarization_sync.cc new file mode 100644 index 000000000..bd8cf1141 --- /dev/null +++ b/libs/libaom/src/test/segment_binarization_sync.cc @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2018, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/acm_random.h" + +using libaom_test::ACMRandom; + +extern "C" { +int av1_neg_interleave(int x, int ref, int max); +int av1_neg_deinterleave(int diff, int ref, int max); +} + +namespace { + +struct Segment { + int id; + int pred; + int last_id; +}; + +Segment GenerateSegment(int seed) { + static const int MAX_SEGMENTS = 8; + + ACMRandom rnd_(seed); + + Segment segment; + const int last_segid = rnd_.PseudoUniform(MAX_SEGMENTS); + segment.last_id = last_segid; + segment.pred = rnd_.PseudoUniform(MAX_SEGMENTS); + segment.id = rnd_.PseudoUniform(last_segid + 1); + + return segment; +} + +// Try to reveal a mismatch between segment binarization and debinarization +TEST(SegmentBinarizationSync, SearchForBinarizationMismatch) { + const int count_tests = 1000; + const int seed_init = 4321; + + for (int i = 0; i < count_tests; ++i) { + const Segment seg = GenerateSegment(seed_init + i); + + const int max_segid = seg.last_id + 1; + const int seg_diff = av1_neg_interleave(seg.id, seg.pred, max_segid); + const int decoded_segid = + av1_neg_deinterleave(seg_diff, seg.pred, max_segid); + + ASSERT_EQ(decoded_segid, seg.id); + } +} + +} // namespace diff --git a/libs/libaom/src/test/selfguided_filter_test.cc b/libs/libaom/src/test/selfguided_filter_test.cc new file mode 100644 index 000000000..d65cce58a --- /dev/null +++ b/libs/libaom/src/test/selfguided_filter_test.cc @@ -0,0 +1,420 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <ctime> +#include <utility> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/av1_rtcd.h" + +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +#include "aom_ports/aom_timer.h" +#include "av1/common/mv.h" +#include "av1/common/restoration.h" + +namespace { + +using libaom_test::ACMRandom; +using std::make_tuple; +using std::tuple; + +typedef void (*SgrFunc)(const uint8_t *dat8, int width, int height, int stride, + int eps, const int *xqd, uint8_t *dst8, int dst_stride, + int32_t *tmpbuf, int bit_depth, int highbd); + +// Test parameter list: +// <tst_fun_> +typedef tuple<SgrFunc> FilterTestParam; + +class AV1SelfguidedFilterTest + : public ::testing::TestWithParam<FilterTestParam> { + public: + virtual ~AV1SelfguidedFilterTest() {} + virtual void SetUp() {} + + virtual void TearDown() { libaom_test::ClearSystemState(); } + + protected: + void RunSpeedTest() { + tst_fun_ = GET_PARAM(0); + const int pu_width = RESTORATION_PROC_UNIT_SIZE; + const int pu_height = RESTORATION_PROC_UNIT_SIZE; + const int width = 256, height = 256, stride = 288, out_stride = 288; + const int NUM_ITERS = 2000; + int i, j, k; + + uint8_t *input_ = + (uint8_t *)aom_memalign(32, stride * (height + 32) * sizeof(uint8_t)); + uint8_t *output_ = (uint8_t *)aom_memalign( + 32, out_stride * (height + 32) * sizeof(uint8_t)); + int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE); + uint8_t *input = input_ + stride * 16 + 16; + uint8_t *output = output_ + out_stride * 16 + 16; + + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + for (i = -16; i < height + 16; ++i) + for (j = -16; j < width + 16; ++j) + input[i * stride + j] = rnd.Rand16() & 0xFF; + + int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - + SGRPROJ_PRJ_MIN0), + SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - + SGRPROJ_PRJ_MIN1) }; + // Fix a parameter set, since the speed depends slightly on r. + // Change this to test different combinations of values of r.
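+ // (The eps value indexes the SGRPROJ parameter table, which is what + // determines the radii r used by the filter.)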
+ int eps = 15; + + av1_loop_restoration_precal(); + + aom_usec_timer ref_timer; + aom_usec_timer_start(&ref_timer); + for (i = 0; i < NUM_ITERS; ++i) { + for (k = 0; k < height; k += pu_height) + for (j = 0; j < width; j += pu_width) { + int w = AOMMIN(pu_width, width - j); + int h = AOMMIN(pu_height, height - k); + uint8_t *input_p = input + k * stride + j; + uint8_t *output_p = output + k * out_stride + j; + av1_apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd, + output_p, out_stride, tmpbuf, 8, + 0); + } + } + aom_usec_timer_mark(&ref_timer); + const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer tst_timer; + aom_usec_timer_start(&tst_timer); + for (i = 0; i < NUM_ITERS; ++i) { + for (k = 0; k < height; k += pu_height) + for (j = 0; j < width; j += pu_width) { + int w = AOMMIN(pu_width, width - j); + int h = AOMMIN(pu_height, height - k); + uint8_t *input_p = input + k * stride + j; + uint8_t *output_p = output + k * out_stride + j; + tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride, + tmpbuf, 8, 0); + } + } + aom_usec_timer_mark(&tst_timer); + const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); + + std::cout << "[ ] C time = " << ref_time / 1000 + << " ms, SIMD time = " << tst_time / 1000 << " ms\n"; + + EXPECT_GT(ref_time, tst_time) + << "Error: AV1SelfguidedFilterTest.SpeedTest, SIMD slower than C.\n" + << "C time: " << ref_time << " us\n" + << "SIMD time: " << tst_time << " us\n"; + + aom_free(input_); + aom_free(output_); + aom_free(tmpbuf); + } + + void RunCorrectnessTest() { + tst_fun_ = GET_PARAM(0); + const int pu_width = RESTORATION_PROC_UNIT_SIZE; + const int pu_height = RESTORATION_PROC_UNIT_SIZE; + // Set the maximum width/height to test here. We actually test a small + // range of sizes *up to* this size, so that we can check, eg., + // the behaviour on tiles which are not a multiple of 4 wide. 
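+ // With NUM_ITERS == 81, the loop below sweeps a 9x9 grid of sizes, from + // (max_w - 8) x (max_h - 8) up to the full max_w x max_h.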
+ const int max_w = 260, max_h = 260, stride = 672, out_stride = 672; + const int NUM_ITERS = 81; + int i, j, k; + + uint8_t *input_ = + (uint8_t *)aom_memalign(32, stride * (max_h + 32) * sizeof(uint8_t)); + uint8_t *output_ = (uint8_t *)aom_memalign( + 32, out_stride * (max_h + 32) * sizeof(uint8_t)); + uint8_t *output2_ = (uint8_t *)aom_memalign( + 32, out_stride * (max_h + 32) * sizeof(uint8_t)); + int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE); + + uint8_t *input = input_ + stride * 16 + 16; + uint8_t *output = output_ + out_stride * 16 + 16; + uint8_t *output2 = output2_ + out_stride * 16 + 16; + + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + av1_loop_restoration_precal(); + + for (i = 0; i < NUM_ITERS; ++i) { + for (j = -16; j < max_h + 16; ++j) + for (k = -16; k < max_w + 16; ++k) + input[j * stride + k] = rnd.Rand16() & 0xFF; + + int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - + SGRPROJ_PRJ_MIN0), + SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - + SGRPROJ_PRJ_MIN1) }; + int eps = rnd.PseudoUniform(1 << SGRPROJ_PARAMS_BITS); + + // Test various tile sizes around 256x256 + int test_w = max_w - (i / 9); + int test_h = max_h - (i % 9); + + for (k = 0; k < test_h; k += pu_height) + for (j = 0; j < test_w; j += pu_width) { + int w = AOMMIN(pu_width, test_w - j); + int h = AOMMIN(pu_height, test_h - k); + uint8_t *input_p = input + k * stride + j; + uint8_t *output_p = output + k * out_stride + j; + uint8_t *output2_p = output2 + k * out_stride + j; + tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride, + tmpbuf, 8, 0); + av1_apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd, + output2_p, out_stride, tmpbuf, 8, + 0); + } + + for (j = 0; j < test_h; ++j) + for (k = 0; k < test_w; ++k) { + ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]); + } + } + + aom_free(input_); + aom_free(output_); + aom_free(output2_); + aom_free(tmpbuf); + } + + private: + SgrFunc tst_fun_; +}; + +TEST_P(AV1SelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); } +TEST_P(AV1SelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); } + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, AV1SelfguidedFilterTest, + ::testing::Values(av1_apply_selfguided_restoration_sse4_1)); +#endif + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1SelfguidedFilterTest, + ::testing::Values(av1_apply_selfguided_restoration_avx2)); +#endif + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, AV1SelfguidedFilterTest, + ::testing::Values(av1_apply_selfguided_restoration_neon)); +#endif + +#if CONFIG_AV1_HIGHBITDEPTH +// Test parameter list: +// <tst_fun_, bit_depth> +typedef tuple<SgrFunc, int> HighbdFilterTestParam; + +class AV1HighbdSelfguidedFilterTest + : public ::testing::TestWithParam<HighbdFilterTestParam> { + public: + virtual ~AV1HighbdSelfguidedFilterTest() {} + virtual void SetUp() {} + + virtual void TearDown() { libaom_test::ClearSystemState(); } + + protected: + void RunSpeedTest() { + tst_fun_ = GET_PARAM(0); + const int pu_width = RESTORATION_PROC_UNIT_SIZE; + const int pu_height = RESTORATION_PROC_UNIT_SIZE; + const int width = 256, height = 256, stride = 288, out_stride = 288; + const int NUM_ITERS = 2000; + int i, j, k; + int bit_depth = GET_PARAM(1); + int mask = (1 << bit_depth) - 1; + + uint16_t *input_ = + (uint16_t *)aom_memalign(32, stride * (height + 32) * sizeof(uint16_t)); + uint16_t *output_ = (uint16_t *)aom_memalign( + 32, out_stride * (height + 32) * sizeof(uint16_t)); + int32_t *tmpbuf = (int32_t *)aom_memalign(32,
RESTORATION_TMPBUF_SIZE); + uint16_t *input = input_ + stride * 16 + 16; + uint16_t *output = output_ + out_stride * 16 + 16; + + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + for (i = -16; i < height + 16; ++i) + for (j = -16; j < width + 16; ++j) + input[i * stride + j] = rnd.Rand16() & mask; + + int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - + SGRPROJ_PRJ_MIN0), + SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - + SGRPROJ_PRJ_MIN1) }; + // Fix a parameter set, since the speed depends slightly on r. + // Change this to test different combinations of values of r. + int eps = 15; + + av1_loop_restoration_precal(); + + aom_usec_timer ref_timer; + aom_usec_timer_start(&ref_timer); + for (i = 0; i < NUM_ITERS; ++i) { + for (k = 0; k < height; k += pu_height) + for (j = 0; j < width; j += pu_width) { + int w = AOMMIN(pu_width, width - j); + int h = AOMMIN(pu_height, height - k); + uint16_t *input_p = input + k * stride + j; + uint16_t *output_p = output + k * out_stride + j; + av1_apply_selfguided_restoration_c( + CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd, + CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, 1); + } + } + aom_usec_timer_mark(&ref_timer); + const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer tst_timer; + aom_usec_timer_start(&tst_timer); + for (i = 0; i < NUM_ITERS; ++i) { + for (k = 0; k < height; k += pu_height) + for (j = 0; j < width; j += pu_width) { + int w = AOMMIN(pu_width, width - j); + int h = AOMMIN(pu_height, height - k); + uint16_t *input_p = input + k * stride + j; + uint16_t *output_p = output + k * out_stride + j; + tst_fun_(CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd, + CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, + 1); + } + } + aom_usec_timer_mark(&tst_timer); + const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); + + std::cout << "[ ] C time = " << ref_time / 1000 + << " ms, SIMD time = " << tst_time / 1000 << " ms\n"; + + EXPECT_GT(ref_time, tst_time) + << "Error: AV1HighbdSelfguidedFilterTest.SpeedTest, SIMD slower than " + "C.\n" + << "C time: " << ref_time << " us\n" + << "SIMD time: " << tst_time << " us\n"; + + aom_free(input_); + aom_free(output_); + aom_free(tmpbuf); + } + + void RunCorrectnessTest() { + tst_fun_ = GET_PARAM(0); + const int pu_width = RESTORATION_PROC_UNIT_SIZE; + const int pu_height = RESTORATION_PROC_UNIT_SIZE; + // Set the maximum width/height to test here. We actually test a small + // range of sizes *up to* this size, so that we can check, eg., + // the behaviour on tiles which are not a multiple of 4 wide. 
+ const int max_w = 260, max_h = 260, stride = 672, out_stride = 672; + const int NUM_ITERS = 81; + int i, j, k; + int bit_depth = GET_PARAM(1); + int mask = (1 << bit_depth) - 1; + + uint16_t *input_ = + (uint16_t *)aom_memalign(32, stride * (max_h + 32) * sizeof(uint16_t)); + uint16_t *output_ = (uint16_t *)aom_memalign( + 32, out_stride * (max_h + 32) * sizeof(uint16_t)); + uint16_t *output2_ = (uint16_t *)aom_memalign( + 32, out_stride * (max_h + 32) * sizeof(uint16_t)); + int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE); + + uint16_t *input = input_ + stride * 16 + 16; + uint16_t *output = output_ + out_stride * 16 + 16; + uint16_t *output2 = output2_ + out_stride * 16 + 16; + + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + av1_loop_restoration_precal(); + + for (i = 0; i < NUM_ITERS; ++i) { + for (j = -16; j < max_h + 16; ++j) + for (k = -16; k < max_w + 16; ++k) + input[j * stride + k] = rnd.Rand16() & mask; + + int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - + SGRPROJ_PRJ_MIN0), + SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - + SGRPROJ_PRJ_MIN1) }; + int eps = rnd.PseudoUniform(1 << SGRPROJ_PARAMS_BITS); + + // Test various tile sizes around 256x256 + int test_w = max_w - (i / 9); + int test_h = max_h - (i % 9); + + for (k = 0; k < test_h; k += pu_height) + for (j = 0; j < test_w; j += pu_width) { + int w = AOMMIN(pu_width, test_w - j); + int h = AOMMIN(pu_height, test_h - k); + uint16_t *input_p = input + k * stride + j; + uint16_t *output_p = output + k * out_stride + j; + uint16_t *output2_p = output2 + k * out_stride + j; + tst_fun_(CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd, + CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, + 1); + av1_apply_selfguided_restoration_c( + CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd, + CONVERT_TO_BYTEPTR(output2_p), out_stride, tmpbuf, bit_depth, 1); + } + + for (j = 0; j < test_h; ++j) + for (k = 0; k < test_w; ++k) + ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]); + } + + aom_free(input_); + aom_free(output_); + aom_free(output2_); + aom_free(tmpbuf); + } + + private: + SgrFunc tst_fun_; +}; + +TEST_P(AV1HighbdSelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); } +TEST_P(AV1HighbdSelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); } + +#if HAVE_SSE4_1 +const int highbd_params_sse4_1[] = { 8, 10, 12 }; +INSTANTIATE_TEST_SUITE_P( + SSE4_1, AV1HighbdSelfguidedFilterTest, + ::testing::Combine( + ::testing::Values(av1_apply_selfguided_restoration_sse4_1), + ::testing::ValuesIn(highbd_params_sse4_1))); +#endif + +#if HAVE_AVX2 +const int highbd_params_avx2[] = { 8, 10, 12 }; +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1HighbdSelfguidedFilterTest, + ::testing::Combine(::testing::Values(av1_apply_selfguided_restoration_avx2), + ::testing::ValuesIn(highbd_params_avx2))); +#endif + +#if HAVE_NEON +const int highbd_params_neon[] = { 8, 10, 12 }; +INSTANTIATE_TEST_SUITE_P( + NEON, AV1HighbdSelfguidedFilterTest, + ::testing::Combine(::testing::Values(av1_apply_selfguided_restoration_neon), + ::testing::ValuesIn(highbd_params_neon))); +#endif +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace diff --git a/libs/libaom/src/test/set_maps.sh b/libs/libaom/src/test/set_maps.sh new file mode 100644 index 000000000..4f59b06d6 --- /dev/null +++ b/libs/libaom/src/test/set_maps.sh @@ -0,0 +1,52 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. 
All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the libaom set_maps example. To add new tests to this file, +## do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to set_maps_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: $YUV_RAW_INPUT is required, and set_maps must exist in +# $LIBAOM_BIN_PATH. +set_maps_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH." + return 1 + fi + if [ -z "$(aom_tool_path set_maps)" ]; then + elog "set_maps not found. It must exist in LIBAOM_BIN_PATH or its parent." + return 1 + fi +} + +# Runs set_maps using the codec specified by $1. +set_maps() { + local encoder="$(aom_tool_path set_maps)" + local codec="$1" + local output_file="${AOM_TEST_OUTPUT_DIR}/set_maps_${codec}.ivf" + + eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \ + ${devnull} + + [ -e "${output_file}" ] || return 1 +} + +set_maps_av1() { + if [ "$(av1_encode_available)" = "yes" ]; then + set_maps av1 || return 1 + fi +} + +set_maps_tests="set_maps_av1" + +run_tests set_maps_verify_environment "${set_maps_tests}" diff --git a/libs/libaom/src/test/simd_avx2_test.cc b/libs/libaom/src/test/simd_avx2_test.cc new file mode 100644 index 000000000..8a012bff8 --- /dev/null +++ b/libs/libaom/src/test/simd_avx2_test.cc @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#define ARCH AVX2 +#define ARCH_POSTFIX(name) name##_avx2 +#define SIMD_NAMESPACE simd_test_avx2 +#include "test/simd_impl.h" diff --git a/libs/libaom/src/test/simd_cmp_avx2.cc b/libs/libaom/src/test/simd_cmp_avx2.cc new file mode 100644 index 000000000..cda632bcd --- /dev/null +++ b/libs/libaom/src/test/simd_cmp_avx2.cc @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#define ARCH AVX2 +#define ARCH_POSTFIX(name) name##_avx2 +#define SIMD_NAMESPACE simd_test_avx2 +#include "test/simd_cmp_impl.h" diff --git a/libs/libaom/src/test/simd_cmp_impl.h b/libs/libaom/src/test/simd_cmp_impl.h new file mode 100644 index 000000000..d3eb33619 --- /dev/null +++ b/libs/libaom/src/test/simd_cmp_impl.h @@ -0,0 +1,2171 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <string> + +#include "config/aom_dsp_rtcd.h" + +#include "test/acm_random.h" +#include "aom_dsp/aom_simd.h" +#undef SIMD_INLINE +#define SIMD_INLINE static // Don't enforce inlining +#include "aom_dsp/simd/v256_intrinsics_c.h" + +// Machine tuned code goes into this file. This file is included from +// simd_cmp_sse2.cc, simd_cmp_ssse3.cc etc which define the macros +// ARCH (=neon, sse2, ssse3, etc), SIMD_NAMESPACE and ARCH_POSTFIX(). + +#ifdef _MSC_VER +// Disable "value of intrinsic immediate argument 'value' is out of range +// 'lowerbound - upperbound'" warning. Visual Studio emits this warning though +// the parameters are conditionally checked in e.g., v256_shr_n_byte. Adding a +// mask doesn't always appear to be sufficient. +#pragma warning(disable : 4556) +#endif + +using libaom_test::ACMRandom; + +namespace SIMD_NAMESPACE { + +// Wrap templates around intrinsics using immediate values +template <int shift> +v64 imm_v64_shl_n_byte(v64 a) { + return v64_shl_n_byte(a, shift); +} +template <int shift> +v64 imm_v64_shr_n_byte(v64 a) { + return v64_shr_n_byte(a, shift); +} +template <int shift> +v64 imm_v64_shl_n_8(v64 a) { + return v64_shl_n_8(a, shift); +} +template <int shift> +v64 imm_v64_shr_n_u8(v64 a) { + return v64_shr_n_u8(a, shift); +} +template <int shift> +v64 imm_v64_shr_n_s8(v64 a) { + return v64_shr_n_s8(a, shift); +} +template <int shift> +v64 imm_v64_shl_n_16(v64 a) { + return v64_shl_n_16(a, shift); +} +template <int shift> +v64 imm_v64_shr_n_u16(v64 a) { + return v64_shr_n_u16(a, shift); +} +template <int shift> +v64 imm_v64_shr_n_s16(v64 a) { + return v64_shr_n_s16(a, shift); +} +template <int shift> +v64 imm_v64_shl_n_32(v64 a) { + return v64_shl_n_32(a, shift); +} +template <int shift> +v64 imm_v64_shr_n_u32(v64 a) { + return v64_shr_n_u32(a, shift); +} +template <int shift> +v64 imm_v64_shr_n_s32(v64 a) { + return v64_shr_n_s32(a, shift); +} +template <int shift> +v64 imm_v64_align(v64 a, v64 b) { + return v64_align(a, b, shift); +} + +// Wrap templates around corresponding C implementations of the above +template <int shift> +c_v64 c_imm_v64_shl_n_byte(c_v64 a) { + return c_v64_shl_n_byte(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shr_n_byte(c_v64 a) { + return c_v64_shr_n_byte(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shl_n_8(c_v64 a) { + return c_v64_shl_n_8(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shr_n_u8(c_v64 a) { + return c_v64_shr_n_u8(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shr_n_s8(c_v64 a) { + return c_v64_shr_n_s8(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shl_n_16(c_v64 a) { + return c_v64_shl_n_16(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shr_n_u16(c_v64 a) { + return c_v64_shr_n_u16(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shr_n_s16(c_v64 a) { + return c_v64_shr_n_s16(a, shift); +} +template <int shift> +c_v64
c_imm_v64_shl_n_32(c_v64 a) { + return c_v64_shl_n_32(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shr_n_u32(c_v64 a) { + return c_v64_shr_n_u32(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shr_n_s32(c_v64 a) { + return c_v64_shr_n_s32(a, shift); +} +template <int shift> +c_v64 c_imm_v64_align(c_v64 a, c_v64 b) { + return c_v64_align(a, b, shift); +} + +template <int shift> +v128 imm_v128_shl_n_byte(v128 a) { + return v128_shl_n_byte(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_byte(v128 a) { + return v128_shr_n_byte(a, shift); +} +template <int shift> +v128 imm_v128_shl_n_8(v128 a) { + return v128_shl_n_8(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_u8(v128 a) { + return v128_shr_n_u8(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_s8(v128 a) { + return v128_shr_n_s8(a, shift); +} +template <int shift> +v128 imm_v128_shl_n_16(v128 a) { + return v128_shl_n_16(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_u16(v128 a) { + return v128_shr_n_u16(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_s16(v128 a) { + return v128_shr_n_s16(a, shift); +} +template <int shift> +v128 imm_v128_shl_n_32(v128 a) { + return v128_shl_n_32(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_u32(v128 a) { + return v128_shr_n_u32(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_s32(v128 a) { + return v128_shr_n_s32(a, shift); +} +template <int shift> +v128 imm_v128_shl_n_64(v128 a) { + return v128_shl_n_64(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_u64(v128 a) { + return v128_shr_n_u64(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_s64(v128 a) { + return v128_shr_n_s64(a, shift); +} +template <int shift> +v128 imm_v128_align(v128 a, v128 b) { + return v128_align(a, b, shift); +} + +template <int shift> +c_v128 c_imm_v128_shl_n_byte(c_v128 a) { + return c_v128_shl_n_byte(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_byte(c_v128 a) { + return c_v128_shr_n_byte(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shl_n_8(c_v128 a) { + return c_v128_shl_n_8(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_u8(c_v128 a) { + return c_v128_shr_n_u8(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_s8(c_v128 a) { + return c_v128_shr_n_s8(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shl_n_16(c_v128 a) { + return c_v128_shl_n_16(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_u16(c_v128 a) { + return c_v128_shr_n_u16(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_s16(c_v128 a) { + return c_v128_shr_n_s16(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shl_n_32(c_v128 a) { + return c_v128_shl_n_32(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_u32(c_v128 a) { + return c_v128_shr_n_u32(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_s32(c_v128 a) { + return c_v128_shr_n_s32(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shl_n_64(c_v128 a) { + return c_v128_shl_n_64(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_u64(c_v128 a) { + return c_v128_shr_n_u64(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_s64(c_v128 a) { + return c_v128_shr_n_s64(a, shift); +} +template <int shift> +c_v128 c_imm_v128_align(c_v128 a, c_v128 b) { + return c_v128_align(a, b, shift); +} + +template <int shift> +v256 imm_v256_shl_n_word(v256 a) { + return v256_shl_n_word(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_word(v256 a) { + return v256_shr_n_word(a, shift); +} +template <int shift> +v256 imm_v256_shl_n_byte(v256 a) { + return v256_shl_n_byte(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_byte(v256 a) { + return v256_shr_n_byte(a, shift); +} +template <int shift> +v256 imm_v256_shl_n_8(v256 a) { + return v256_shl_n_8(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_u8(v256 a) { + return v256_shr_n_u8(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_s8(v256 a) { + return v256_shr_n_s8(a, shift); +} +template <int shift> +v256
imm_v256_shl_n_16(v256 a) { + return v256_shl_n_16(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_u16(v256 a) { + return v256_shr_n_u16(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_s16(v256 a) { + return v256_shr_n_s16(a, shift); +} +template <int shift> +v256 imm_v256_shl_n_32(v256 a) { + return v256_shl_n_32(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_u32(v256 a) { + return v256_shr_n_u32(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_s32(v256 a) { + return v256_shr_n_s32(a, shift); +} +template <int shift> +v256 imm_v256_shl_n_64(v256 a) { + return v256_shl_n_64(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_u64(v256 a) { + return v256_shr_n_u64(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_s64(v256 a) { + return v256_shr_n_s64(a, shift); +} +template <int shift> +v256 imm_v256_align(v256 a, v256 b) { + return v256_align(a, b, shift); +} + +template <int shift> +c_v256 c_imm_v256_shl_n_word(c_v256 a) { + return c_v256_shl_n_word(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_word(c_v256 a) { + return c_v256_shr_n_word(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shl_n_byte(c_v256 a) { + return c_v256_shl_n_byte(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_byte(c_v256 a) { + return c_v256_shr_n_byte(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shl_n_8(c_v256 a) { + return c_v256_shl_n_8(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_u8(c_v256 a) { + return c_v256_shr_n_u8(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_s8(c_v256 a) { + return c_v256_shr_n_s8(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shl_n_16(c_v256 a) { + return c_v256_shl_n_16(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_u16(c_v256 a) { + return c_v256_shr_n_u16(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_s16(c_v256 a) { + return c_v256_shr_n_s16(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shl_n_32(c_v256 a) { + return c_v256_shl_n_32(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_u32(c_v256 a) { + return c_v256_shr_n_u32(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_s32(c_v256 a) { + return c_v256_shr_n_s32(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shl_n_64(c_v256 a) { + return c_v256_shl_n_64(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_u64(c_v256 a) { + return c_v256_shr_n_u64(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_s64(c_v256 a) { + return c_v256_shr_n_s64(a, shift); +} +template <int shift> +c_v256 c_imm_v256_align(c_v256 a, c_v256 b) { + return c_v256_align(a, b, shift); +} + +// Wrappers around the SAD and SSD functions +uint32_t v64_sad_u8(v64 a, v64 b) { + return v64_sad_u8_sum(::v64_sad_u8(v64_sad_u8_init(), a, b)); +} +uint32_t v64_ssd_u8(v64 a, v64 b) { + return v64_ssd_u8_sum(::v64_ssd_u8(v64_ssd_u8_init(), a, b)); +} + +uint32_t c_v64_sad_u8(c_v64 a, c_v64 b) { + return c_v64_sad_u8_sum(::c_v64_sad_u8(c_v64_sad_u8_init(), a, b)); +} +uint32_t c_v64_ssd_u8(c_v64 a, c_v64 b) { + return c_v64_ssd_u8_sum(::c_v64_ssd_u8(c_v64_ssd_u8_init(), a, b)); +} +uint32_t v128_sad_u8(v128 a, v128 b) { + return v128_sad_u8_sum(::v128_sad_u8(v128_sad_u8_init(), a, b)); +} +uint32_t v128_ssd_u8(v128 a, v128 b) { + return v128_ssd_u8_sum(::v128_ssd_u8(v128_ssd_u8_init(), a, b)); +} +uint32_t c_v128_sad_u8(c_v128 a, c_v128 b) { + return c_v128_sad_u8_sum(::c_v128_sad_u8(c_v128_sad_u8_init(), a, b)); +} +uint32_t c_v128_ssd_u8(c_v128 a, c_v128 b) { + return c_v128_ssd_u8_sum(::c_v128_ssd_u8(c_v128_ssd_u8_init(), a, b)); +} +uint32_t v128_sad_u16(v128 a, v128 b) { + return v128_sad_u16_sum(::v128_sad_u16(v128_sad_u16_init(), a, b)); +} +uint64_t v128_ssd_s16(v128 a, v128 b) { + return v128_ssd_s16_sum(::v128_ssd_s16(v128_ssd_s16_init(), a, b)); +} +uint32_t
c_v128_sad_u16(c_v128 a, c_v128 b) { + return c_v128_sad_u16_sum(::c_v128_sad_u16(c_v128_sad_u16_init(), a, b)); +} +uint64_t c_v128_ssd_s16(c_v128 a, c_v128 b) { + return c_v128_ssd_s16_sum(::c_v128_ssd_s16(c_v128_ssd_s16_init(), a, b)); +} +uint32_t v256_sad_u8(v256 a, v256 b) { + return v256_sad_u8_sum(::v256_sad_u8(v256_sad_u8_init(), a, b)); +} +uint32_t v256_ssd_u8(v256 a, v256 b) { + return v256_ssd_u8_sum(::v256_ssd_u8(v256_ssd_u8_init(), a, b)); +} +uint32_t c_v256_sad_u8(c_v256 a, c_v256 b) { + return c_v256_sad_u8_sum(::c_v256_sad_u8(c_v256_sad_u8_init(), a, b)); +} +uint32_t c_v256_ssd_u8(c_v256 a, c_v256 b) { + return c_v256_ssd_u8_sum(::c_v256_ssd_u8(c_v256_ssd_u8_init(), a, b)); +} +uint32_t v256_sad_u16(v256 a, v256 b) { + return v256_sad_u16_sum(::v256_sad_u16(v256_sad_u16_init(), a, b)); +} +uint64_t v256_ssd_s16(v256 a, v256 b) { + return v256_ssd_s16_sum(::v256_ssd_s16(v256_ssd_s16_init(), a, b)); +} +uint32_t c_v256_sad_u16(c_v256 a, c_v256 b) { + return c_v256_sad_u16_sum(::c_v256_sad_u16(c_v256_sad_u16_init(), a, b)); +} +uint64_t c_v256_ssd_s16(c_v256 a, c_v256 b) { + return c_v256_ssd_s16_sum(::c_v256_ssd_s16(c_v256_ssd_s16_init(), a, b)); +} + +namespace { + +typedef void (*fptr)(); + +typedef struct { + const char *name; + fptr ref; + fptr simd; +} mapping; + +#define MAP(name) \ + { #name, reinterpret_cast < fptr>(c_##name), reinterpret_cast < fptr>(name) } + +const mapping m[] = { MAP(v64_sad_u8), + MAP(v64_ssd_u8), + MAP(v64_add_8), + MAP(v64_add_16), + MAP(v64_sadd_s8), + MAP(v64_sadd_u8), + MAP(v64_sadd_s16), + MAP(v64_add_32), + MAP(v64_sub_8), + MAP(v64_ssub_u8), + MAP(v64_ssub_s8), + MAP(v64_sub_16), + MAP(v64_ssub_s16), + MAP(v64_ssub_u16), + MAP(v64_sub_32), + MAP(v64_ziplo_8), + MAP(v64_ziphi_8), + MAP(v64_ziplo_16), + MAP(v64_ziphi_16), + MAP(v64_ziplo_32), + MAP(v64_ziphi_32), + MAP(v64_pack_s32_u16), + MAP(v64_pack_s32_s16), + MAP(v64_pack_s16_u8), + MAP(v64_pack_s16_s8), + MAP(v64_unziphi_8), + MAP(v64_unziplo_8), + MAP(v64_unziphi_16), + MAP(v64_unziplo_16), + MAP(v64_or), + MAP(v64_xor), + MAP(v64_and), + MAP(v64_andn), + MAP(v64_mullo_s16), + MAP(v64_mulhi_s16), + MAP(v64_mullo_s32), + MAP(v64_madd_s16), + MAP(v64_madd_us8), + MAP(v64_avg_u8), + MAP(v64_rdavg_u8), + MAP(v64_rdavg_u16), + MAP(v64_avg_u16), + MAP(v64_min_u8), + MAP(v64_max_u8), + MAP(v64_min_s8), + MAP(v64_max_s8), + MAP(v64_min_s16), + MAP(v64_max_s16), + MAP(v64_cmpgt_s8), + MAP(v64_cmplt_s8), + MAP(v64_cmpeq_8), + MAP(v64_cmpgt_s16), + MAP(v64_cmplt_s16), + MAP(v64_cmpeq_16), + MAP(v64_shuffle_8), + MAP(imm_v64_align<1>), + MAP(imm_v64_align<2>), + MAP(imm_v64_align<3>), + MAP(imm_v64_align<4>), + MAP(imm_v64_align<5>), + MAP(imm_v64_align<6>), + MAP(imm_v64_align<7>), + MAP(v64_abs_s8), + MAP(v64_abs_s16), + MAP(v64_unpacklo_u8_s16), + MAP(v64_unpackhi_u8_s16), + MAP(v64_unpacklo_s8_s16), + MAP(v64_unpackhi_s8_s16), + MAP(v64_unpacklo_u16_s32), + MAP(v64_unpacklo_s16_s32), + MAP(v64_unpackhi_u16_s32), + MAP(v64_unpackhi_s16_s32), + MAP(imm_v64_shr_n_byte<1>), + MAP(imm_v64_shr_n_byte<2>), + MAP(imm_v64_shr_n_byte<3>), + MAP(imm_v64_shr_n_byte<4>), + MAP(imm_v64_shr_n_byte<5>), + MAP(imm_v64_shr_n_byte<6>), + MAP(imm_v64_shr_n_byte<7>), + MAP(imm_v64_shl_n_byte<1>), + MAP(imm_v64_shl_n_byte<2>), + MAP(imm_v64_shl_n_byte<3>), + MAP(imm_v64_shl_n_byte<4>), + MAP(imm_v64_shl_n_byte<5>), + MAP(imm_v64_shl_n_byte<6>), + MAP(imm_v64_shl_n_byte<7>), + MAP(imm_v64_shl_n_8<1>), + MAP(imm_v64_shl_n_8<2>), + MAP(imm_v64_shl_n_8<3>), + MAP(imm_v64_shl_n_8<4>), + MAP(imm_v64_shl_n_8<5>), + 
MAP(imm_v64_shl_n_8<6>), + MAP(imm_v64_shl_n_8<7>), + MAP(imm_v64_shr_n_u8<1>), + MAP(imm_v64_shr_n_u8<2>), + MAP(imm_v64_shr_n_u8<3>), + MAP(imm_v64_shr_n_u8<4>), + MAP(imm_v64_shr_n_u8<5>), + MAP(imm_v64_shr_n_u8<6>), + MAP(imm_v64_shr_n_u8<7>), + MAP(imm_v64_shr_n_s8<1>), + MAP(imm_v64_shr_n_s8<2>), + MAP(imm_v64_shr_n_s8<3>), + MAP(imm_v64_shr_n_s8<4>), + MAP(imm_v64_shr_n_s8<5>), + MAP(imm_v64_shr_n_s8<6>), + MAP(imm_v64_shr_n_s8<7>), + MAP(imm_v64_shl_n_16<1>), + MAP(imm_v64_shl_n_16<2>), + MAP(imm_v64_shl_n_16<4>), + MAP(imm_v64_shl_n_16<6>), + MAP(imm_v64_shl_n_16<8>), + MAP(imm_v64_shl_n_16<10>), + MAP(imm_v64_shl_n_16<12>), + MAP(imm_v64_shl_n_16<14>), + MAP(imm_v64_shr_n_u16<1>), + MAP(imm_v64_shr_n_u16<2>), + MAP(imm_v64_shr_n_u16<4>), + MAP(imm_v64_shr_n_u16<6>), + MAP(imm_v64_shr_n_u16<8>), + MAP(imm_v64_shr_n_u16<10>), + MAP(imm_v64_shr_n_u16<12>), + MAP(imm_v64_shr_n_u16<14>), + MAP(imm_v64_shr_n_s16<1>), + MAP(imm_v64_shr_n_s16<2>), + MAP(imm_v64_shr_n_s16<4>), + MAP(imm_v64_shr_n_s16<6>), + MAP(imm_v64_shr_n_s16<8>), + MAP(imm_v64_shr_n_s16<10>), + MAP(imm_v64_shr_n_s16<12>), + MAP(imm_v64_shr_n_s16<14>), + MAP(imm_v64_shl_n_32<1>), + MAP(imm_v64_shl_n_32<4>), + MAP(imm_v64_shl_n_32<8>), + MAP(imm_v64_shl_n_32<12>), + MAP(imm_v64_shl_n_32<16>), + MAP(imm_v64_shl_n_32<20>), + MAP(imm_v64_shl_n_32<24>), + MAP(imm_v64_shl_n_32<28>), + MAP(imm_v64_shr_n_u32<1>), + MAP(imm_v64_shr_n_u32<4>), + MAP(imm_v64_shr_n_u32<8>), + MAP(imm_v64_shr_n_u32<12>), + MAP(imm_v64_shr_n_u32<16>), + MAP(imm_v64_shr_n_u32<20>), + MAP(imm_v64_shr_n_u32<24>), + MAP(imm_v64_shr_n_u32<28>), + MAP(imm_v64_shr_n_s32<1>), + MAP(imm_v64_shr_n_s32<4>), + MAP(imm_v64_shr_n_s32<8>), + MAP(imm_v64_shr_n_s32<12>), + MAP(imm_v64_shr_n_s32<16>), + MAP(imm_v64_shr_n_s32<20>), + MAP(imm_v64_shr_n_s32<24>), + MAP(imm_v64_shr_n_s32<28>), + MAP(v64_shl_8), + MAP(v64_shr_u8), + MAP(v64_shr_s8), + MAP(v64_shl_16), + MAP(v64_shr_u16), + MAP(v64_shr_s16), + MAP(v64_shl_32), + MAP(v64_shr_u32), + MAP(v64_shr_s32), + MAP(v64_hadd_u8), + MAP(v64_hadd_s16), + MAP(v64_dotp_s16), + MAP(v64_dotp_su8), + MAP(v64_u64), + MAP(v64_low_u32), + MAP(v64_high_u32), + MAP(v64_low_s32), + MAP(v64_high_s32), + MAP(v64_dup_8), + MAP(v64_dup_16), + MAP(v64_dup_32), + MAP(v64_from_32), + MAP(v64_zero), + MAP(v64_from_16), + MAP(v128_sad_u8), + MAP(v128_ssd_u8), + MAP(v128_sad_u16), + MAP(v128_ssd_s16), + MAP(v128_add_8), + MAP(v128_add_16), + MAP(v128_sadd_s8), + MAP(v128_sadd_u8), + MAP(v128_sadd_s16), + MAP(v128_add_32), + MAP(v128_add_64), + MAP(v128_sub_8), + MAP(v128_ssub_u8), + MAP(v128_ssub_s8), + MAP(v128_sub_16), + MAP(v128_ssub_s16), + MAP(v128_ssub_u16), + MAP(v128_sub_32), + MAP(v128_sub_64), + MAP(v128_ziplo_8), + MAP(v128_ziphi_8), + MAP(v128_ziplo_16), + MAP(v128_ziphi_16), + MAP(v128_ziplo_32), + MAP(v128_ziphi_32), + MAP(v128_ziplo_64), + MAP(v128_ziphi_64), + MAP(v128_unziphi_8), + MAP(v128_unziplo_8), + MAP(v128_unziphi_16), + MAP(v128_unziplo_16), + MAP(v128_unziphi_32), + MAP(v128_unziplo_32), + MAP(v128_pack_s32_u16), + MAP(v128_pack_s32_s16), + MAP(v128_pack_s16_u8), + MAP(v128_pack_s16_s8), + MAP(v128_or), + MAP(v128_xor), + MAP(v128_and), + MAP(v128_andn), + MAP(v128_mullo_s16), + MAP(v128_mulhi_s16), + MAP(v128_mullo_s32), + MAP(v128_madd_s16), + MAP(v128_madd_us8), + MAP(v128_avg_u8), + MAP(v128_rdavg_u8), + MAP(v128_rdavg_u16), + MAP(v128_avg_u16), + MAP(v128_min_u8), + MAP(v128_max_u8), + MAP(v128_min_s8), + MAP(v128_max_s8), + MAP(v128_min_s16), + MAP(v128_max_s16), + MAP(v128_min_s32), + MAP(v128_max_s32), + 
MAP(v128_cmpgt_s8), + MAP(v128_cmplt_s8), + MAP(v128_cmpeq_8), + MAP(v128_cmpgt_s16), + MAP(v128_cmpeq_16), + MAP(v128_cmplt_s16), + MAP(v128_cmpgt_s32), + MAP(v128_cmpeq_32), + MAP(v128_cmplt_s32), + MAP(v128_shuffle_8), + MAP(imm_v128_align<1>), + MAP(imm_v128_align<2>), + MAP(imm_v128_align<3>), + MAP(imm_v128_align<4>), + MAP(imm_v128_align<5>), + MAP(imm_v128_align<6>), + MAP(imm_v128_align<7>), + MAP(imm_v128_align<8>), + MAP(imm_v128_align<9>), + MAP(imm_v128_align<10>), + MAP(imm_v128_align<11>), + MAP(imm_v128_align<12>), + MAP(imm_v128_align<13>), + MAP(imm_v128_align<14>), + MAP(imm_v128_align<15>), + MAP(v128_abs_s8), + MAP(v128_abs_s16), + MAP(v128_padd_u8), + MAP(v128_padd_s16), + MAP(v128_unpacklo_u16_s32), + MAP(v128_unpacklo_s16_s32), + MAP(v128_unpackhi_u16_s32), + MAP(v128_unpackhi_s16_s32), + MAP(imm_v128_shr_n_byte<1>), + MAP(imm_v128_shr_n_byte<2>), + MAP(imm_v128_shr_n_byte<3>), + MAP(imm_v128_shr_n_byte<4>), + MAP(imm_v128_shr_n_byte<5>), + MAP(imm_v128_shr_n_byte<6>), + MAP(imm_v128_shr_n_byte<7>), + MAP(imm_v128_shr_n_byte<8>), + MAP(imm_v128_shr_n_byte<9>), + MAP(imm_v128_shr_n_byte<10>), + MAP(imm_v128_shr_n_byte<11>), + MAP(imm_v128_shr_n_byte<12>), + MAP(imm_v128_shr_n_byte<13>), + MAP(imm_v128_shr_n_byte<14>), + MAP(imm_v128_shr_n_byte<15>), + MAP(imm_v128_shl_n_byte<1>), + MAP(imm_v128_shl_n_byte<2>), + MAP(imm_v128_shl_n_byte<3>), + MAP(imm_v128_shl_n_byte<4>), + MAP(imm_v128_shl_n_byte<5>), + MAP(imm_v128_shl_n_byte<6>), + MAP(imm_v128_shl_n_byte<7>), + MAP(imm_v128_shl_n_byte<8>), + MAP(imm_v128_shl_n_byte<9>), + MAP(imm_v128_shl_n_byte<10>), + MAP(imm_v128_shl_n_byte<11>), + MAP(imm_v128_shl_n_byte<12>), + MAP(imm_v128_shl_n_byte<13>), + MAP(imm_v128_shl_n_byte<14>), + MAP(imm_v128_shl_n_byte<15>), + MAP(imm_v128_shl_n_8<1>), + MAP(imm_v128_shl_n_8<2>), + MAP(imm_v128_shl_n_8<3>), + MAP(imm_v128_shl_n_8<4>), + MAP(imm_v128_shl_n_8<5>), + MAP(imm_v128_shl_n_8<6>), + MAP(imm_v128_shl_n_8<7>), + MAP(imm_v128_shr_n_u8<1>), + MAP(imm_v128_shr_n_u8<2>), + MAP(imm_v128_shr_n_u8<3>), + MAP(imm_v128_shr_n_u8<4>), + MAP(imm_v128_shr_n_u8<5>), + MAP(imm_v128_shr_n_u8<6>), + MAP(imm_v128_shr_n_u8<7>), + MAP(imm_v128_shr_n_s8<1>), + MAP(imm_v128_shr_n_s8<2>), + MAP(imm_v128_shr_n_s8<3>), + MAP(imm_v128_shr_n_s8<4>), + MAP(imm_v128_shr_n_s8<5>), + MAP(imm_v128_shr_n_s8<6>), + MAP(imm_v128_shr_n_s8<7>), + MAP(imm_v128_shl_n_16<1>), + MAP(imm_v128_shl_n_16<2>), + MAP(imm_v128_shl_n_16<4>), + MAP(imm_v128_shl_n_16<6>), + MAP(imm_v128_shl_n_16<8>), + MAP(imm_v128_shl_n_16<10>), + MAP(imm_v128_shl_n_16<12>), + MAP(imm_v128_shl_n_16<14>), + MAP(imm_v128_shr_n_u16<1>), + MAP(imm_v128_shr_n_u16<2>), + MAP(imm_v128_shr_n_u16<4>), + MAP(imm_v128_shr_n_u16<6>), + MAP(imm_v128_shr_n_u16<8>), + MAP(imm_v128_shr_n_u16<10>), + MAP(imm_v128_shr_n_u16<12>), + MAP(imm_v128_shr_n_u16<14>), + MAP(imm_v128_shr_n_s16<1>), + MAP(imm_v128_shr_n_s16<2>), + MAP(imm_v128_shr_n_s16<4>), + MAP(imm_v128_shr_n_s16<6>), + MAP(imm_v128_shr_n_s16<8>), + MAP(imm_v128_shr_n_s16<10>), + MAP(imm_v128_shr_n_s16<12>), + MAP(imm_v128_shr_n_s16<14>), + MAP(imm_v128_shl_n_32<1>), + MAP(imm_v128_shl_n_32<4>), + MAP(imm_v128_shl_n_32<8>), + MAP(imm_v128_shl_n_32<12>), + MAP(imm_v128_shl_n_32<16>), + MAP(imm_v128_shl_n_32<20>), + MAP(imm_v128_shl_n_32<24>), + MAP(imm_v128_shl_n_32<28>), + MAP(imm_v128_shr_n_u32<1>), + MAP(imm_v128_shr_n_u32<4>), + MAP(imm_v128_shr_n_u32<8>), + MAP(imm_v128_shr_n_u32<12>), + MAP(imm_v128_shr_n_u32<16>), + MAP(imm_v128_shr_n_u32<20>), + MAP(imm_v128_shr_n_u32<24>), + 
MAP(imm_v128_shr_n_u32<28>), + MAP(imm_v128_shr_n_s32<1>), + MAP(imm_v128_shr_n_s32<4>), + MAP(imm_v128_shr_n_s32<8>), + MAP(imm_v128_shr_n_s32<12>), + MAP(imm_v128_shr_n_s32<16>), + MAP(imm_v128_shr_n_s32<20>), + MAP(imm_v128_shr_n_s32<24>), + MAP(imm_v128_shr_n_s32<28>), + MAP(imm_v128_shl_n_64<1>), + MAP(imm_v128_shl_n_64<4>), + MAP(imm_v128_shl_n_64<8>), + MAP(imm_v128_shl_n_64<12>), + MAP(imm_v128_shl_n_64<16>), + MAP(imm_v128_shl_n_64<20>), + MAP(imm_v128_shl_n_64<24>), + MAP(imm_v128_shl_n_64<28>), + MAP(imm_v128_shl_n_64<32>), + MAP(imm_v128_shl_n_64<36>), + MAP(imm_v128_shl_n_64<40>), + MAP(imm_v128_shl_n_64<44>), + MAP(imm_v128_shl_n_64<48>), + MAP(imm_v128_shl_n_64<52>), + MAP(imm_v128_shl_n_64<56>), + MAP(imm_v128_shl_n_64<60>), + MAP(imm_v128_shr_n_u64<1>), + MAP(imm_v128_shr_n_u64<4>), + MAP(imm_v128_shr_n_u64<8>), + MAP(imm_v128_shr_n_u64<12>), + MAP(imm_v128_shr_n_u64<16>), + MAP(imm_v128_shr_n_u64<20>), + MAP(imm_v128_shr_n_u64<24>), + MAP(imm_v128_shr_n_u64<28>), + MAP(imm_v128_shr_n_u64<32>), + MAP(imm_v128_shr_n_u64<36>), + MAP(imm_v128_shr_n_u64<40>), + MAP(imm_v128_shr_n_u64<44>), + MAP(imm_v128_shr_n_u64<48>), + MAP(imm_v128_shr_n_u64<52>), + MAP(imm_v128_shr_n_u64<56>), + MAP(imm_v128_shr_n_u64<60>), + MAP(imm_v128_shr_n_s64<1>), + MAP(imm_v128_shr_n_s64<4>), + MAP(imm_v128_shr_n_s64<8>), + MAP(imm_v128_shr_n_s64<12>), + MAP(imm_v128_shr_n_s64<16>), + MAP(imm_v128_shr_n_s64<20>), + MAP(imm_v128_shr_n_s64<24>), + MAP(imm_v128_shr_n_s64<28>), + MAP(imm_v128_shr_n_s64<32>), + MAP(imm_v128_shr_n_s64<36>), + MAP(imm_v128_shr_n_s64<40>), + MAP(imm_v128_shr_n_s64<44>), + MAP(imm_v128_shr_n_s64<48>), + MAP(imm_v128_shr_n_s64<52>), + MAP(imm_v128_shr_n_s64<56>), + MAP(imm_v128_shr_n_s64<60>), + MAP(v128_from_v64), + MAP(v128_zip_8), + MAP(v128_zip_16), + MAP(v128_zip_32), + MAP(v128_mul_s16), + MAP(v128_unpack_u8_s16), + MAP(v128_unpack_s8_s16), + MAP(v128_unpack_u16_s32), + MAP(v128_unpack_s16_s32), + MAP(v128_shl_8), + MAP(v128_shr_u8), + MAP(v128_shr_s8), + MAP(v128_shl_16), + MAP(v128_shr_u16), + MAP(v128_shr_s16), + MAP(v128_shl_32), + MAP(v128_shr_u32), + MAP(v128_shr_s32), + MAP(v128_shl_64), + MAP(v128_shr_u64), + MAP(v128_shr_s64), + MAP(v128_hadd_u8), + MAP(v128_dotp_su8), + MAP(v128_dotp_s16), + MAP(v128_dotp_s32), + MAP(v128_low_u32), + MAP(v128_low_v64), + MAP(v128_high_v64), + MAP(v128_from_64), + MAP(v128_from_32), + MAP(v128_movemask_8), + MAP(v128_zero), + MAP(v128_dup_8), + MAP(v128_dup_16), + MAP(v128_dup_32), + MAP(v128_dup_64), + MAP(v128_unpacklo_u8_s16), + MAP(v128_unpackhi_u8_s16), + MAP(v128_unpacklo_s8_s16), + MAP(v128_unpackhi_s8_s16), + MAP(v128_blend_8), + MAP(u32_load_unaligned), + MAP(u32_store_unaligned), + MAP(v64_load_unaligned), + MAP(v64_store_unaligned), + MAP(v128_load_unaligned), + MAP(v128_store_unaligned), + MAP(v256_sad_u8), + MAP(v256_ssd_u8), + MAP(v256_sad_u16), + MAP(v256_ssd_s16), + MAP(v256_hadd_u8), + MAP(v256_low_u64), + MAP(v256_dotp_su8), + MAP(v256_dotp_s16), + MAP(v256_dotp_s32), + MAP(v256_add_8), + MAP(v256_add_16), + MAP(v256_sadd_s8), + MAP(v256_sadd_u8), + MAP(v256_sadd_s16), + MAP(v256_add_32), + MAP(v256_add_64), + MAP(v256_sub_8), + MAP(v256_ssub_u8), + MAP(v256_ssub_s8), + MAP(v256_sub_16), + MAP(v256_ssub_u16), + MAP(v256_ssub_s16), + MAP(v256_sub_32), + MAP(v256_sub_64), + MAP(v256_ziplo_8), + MAP(v256_ziphi_8), + MAP(v256_ziplo_16), + MAP(v256_ziphi_16), + MAP(v256_ziplo_32), + MAP(v256_ziphi_32), + MAP(v256_ziplo_64), + MAP(v256_ziphi_64), + MAP(v256_unziphi_8), + MAP(v256_unziplo_8), + MAP(v256_unziphi_16), 
+ MAP(v256_unziplo_16), + MAP(v256_unziphi_32), + MAP(v256_unziplo_32), + MAP(v256_unziphi_64), + MAP(v256_unziplo_64), + MAP(v256_pack_s32_u16), + MAP(v256_pack_s32_s16), + MAP(v256_pack_s16_u8), + MAP(v256_pack_s16_s8), + MAP(v256_or), + MAP(v256_xor), + MAP(v256_and), + MAP(v256_andn), + MAP(v256_mullo_s16), + MAP(v256_mulhi_s16), + MAP(v256_mullo_s32), + MAP(v256_madd_s16), + MAP(v256_madd_us8), + MAP(v256_avg_u8), + MAP(v256_rdavg_u8), + MAP(v256_rdavg_u16), + MAP(v256_avg_u16), + MAP(v256_min_u8), + MAP(v256_max_u8), + MAP(v256_min_s8), + MAP(v256_max_s8), + MAP(v256_min_s16), + MAP(v256_max_s16), + MAP(v256_min_s32), + MAP(v256_max_s32), + MAP(v256_cmpgt_s8), + MAP(v256_cmplt_s8), + MAP(v256_cmpeq_8), + MAP(v256_cmpgt_s16), + MAP(v256_cmplt_s16), + MAP(v256_cmpeq_16), + MAP(v256_cmpgt_s32), + MAP(v256_cmplt_s32), + MAP(v256_cmpeq_32), + MAP(v256_shuffle_8), + MAP(v256_pshuffle_8), + MAP(v256_wideshuffle_8), + MAP(imm_v256_align<1>), + MAP(imm_v256_align<2>), + MAP(imm_v256_align<3>), + MAP(imm_v256_align<4>), + MAP(imm_v256_align<5>), + MAP(imm_v256_align<6>), + MAP(imm_v256_align<7>), + MAP(imm_v256_align<8>), + MAP(imm_v256_align<9>), + MAP(imm_v256_align<10>), + MAP(imm_v256_align<11>), + MAP(imm_v256_align<12>), + MAP(imm_v256_align<13>), + MAP(imm_v256_align<14>), + MAP(imm_v256_align<15>), + MAP(imm_v256_align<16>), + MAP(imm_v256_align<17>), + MAP(imm_v256_align<18>), + MAP(imm_v256_align<19>), + MAP(imm_v256_align<20>), + MAP(imm_v256_align<21>), + MAP(imm_v256_align<22>), + MAP(imm_v256_align<23>), + MAP(imm_v256_align<24>), + MAP(imm_v256_align<25>), + MAP(imm_v256_align<26>), + MAP(imm_v256_align<27>), + MAP(imm_v256_align<28>), + MAP(imm_v256_align<29>), + MAP(imm_v256_align<30>), + MAP(imm_v256_align<31>), + MAP(v256_from_v128), + MAP(v256_zip_8), + MAP(v256_zip_16), + MAP(v256_zip_32), + MAP(v256_mul_s16), + MAP(v256_unpack_u8_s16), + MAP(v256_unpack_s8_s16), + MAP(v256_unpack_u16_s32), + MAP(v256_unpack_s16_s32), + MAP(v256_shl_8), + MAP(v256_shr_u8), + MAP(v256_shr_s8), + MAP(v256_shl_16), + MAP(v256_shr_u16), + MAP(v256_shr_s16), + MAP(v256_shl_32), + MAP(v256_shr_u32), + MAP(v256_shr_s32), + MAP(v256_shl_64), + MAP(v256_shr_u64), + MAP(v256_shr_s64), + MAP(v256_abs_s8), + MAP(v256_abs_s16), + MAP(v256_padd_u8), + MAP(v256_padd_s16), + MAP(v256_unpacklo_u16_s32), + MAP(v256_unpacklo_s16_s32), + MAP(v256_unpackhi_u16_s32), + MAP(v256_unpackhi_s16_s32), + MAP(imm_v256_shr_n_word<1>), + MAP(imm_v256_shr_n_word<2>), + MAP(imm_v256_shr_n_word<3>), + MAP(imm_v256_shr_n_word<4>), + MAP(imm_v256_shr_n_word<5>), + MAP(imm_v256_shr_n_word<6>), + MAP(imm_v256_shr_n_word<7>), + MAP(imm_v256_shr_n_word<8>), + MAP(imm_v256_shr_n_word<9>), + MAP(imm_v256_shr_n_word<10>), + MAP(imm_v256_shr_n_word<11>), + MAP(imm_v256_shr_n_word<12>), + MAP(imm_v256_shr_n_word<13>), + MAP(imm_v256_shr_n_word<14>), + MAP(imm_v256_shr_n_word<15>), + MAP(imm_v256_shl_n_word<1>), + MAP(imm_v256_shl_n_word<2>), + MAP(imm_v256_shl_n_word<3>), + MAP(imm_v256_shl_n_word<4>), + MAP(imm_v256_shl_n_word<5>), + MAP(imm_v256_shl_n_word<6>), + MAP(imm_v256_shl_n_word<7>), + MAP(imm_v256_shl_n_word<8>), + MAP(imm_v256_shl_n_word<9>), + MAP(imm_v256_shl_n_word<10>), + MAP(imm_v256_shl_n_word<11>), + MAP(imm_v256_shl_n_word<12>), + MAP(imm_v256_shl_n_word<13>), + MAP(imm_v256_shl_n_word<14>), + MAP(imm_v256_shl_n_word<15>), + MAP(imm_v256_shr_n_byte<1>), + MAP(imm_v256_shr_n_byte<2>), + MAP(imm_v256_shr_n_byte<3>), + MAP(imm_v256_shr_n_byte<4>), + MAP(imm_v256_shr_n_byte<5>), + MAP(imm_v256_shr_n_byte<6>), + 
MAP(imm_v256_shr_n_byte<7>), + MAP(imm_v256_shr_n_byte<8>), + MAP(imm_v256_shr_n_byte<9>), + MAP(imm_v256_shr_n_byte<10>), + MAP(imm_v256_shr_n_byte<11>), + MAP(imm_v256_shr_n_byte<12>), + MAP(imm_v256_shr_n_byte<13>), + MAP(imm_v256_shr_n_byte<14>), + MAP(imm_v256_shr_n_byte<15>), + MAP(imm_v256_shr_n_byte<16>), + MAP(imm_v256_shr_n_byte<17>), + MAP(imm_v256_shr_n_byte<18>), + MAP(imm_v256_shr_n_byte<19>), + MAP(imm_v256_shr_n_byte<20>), + MAP(imm_v256_shr_n_byte<21>), + MAP(imm_v256_shr_n_byte<22>), + MAP(imm_v256_shr_n_byte<23>), + MAP(imm_v256_shr_n_byte<24>), + MAP(imm_v256_shr_n_byte<25>), + MAP(imm_v256_shr_n_byte<26>), + MAP(imm_v256_shr_n_byte<27>), + MAP(imm_v256_shr_n_byte<28>), + MAP(imm_v256_shr_n_byte<29>), + MAP(imm_v256_shr_n_byte<30>), + MAP(imm_v256_shr_n_byte<31>), + MAP(imm_v256_shl_n_byte<1>), + MAP(imm_v256_shl_n_byte<2>), + MAP(imm_v256_shl_n_byte<3>), + MAP(imm_v256_shl_n_byte<4>), + MAP(imm_v256_shl_n_byte<5>), + MAP(imm_v256_shl_n_byte<6>), + MAP(imm_v256_shl_n_byte<7>), + MAP(imm_v256_shl_n_byte<8>), + MAP(imm_v256_shl_n_byte<9>), + MAP(imm_v256_shl_n_byte<10>), + MAP(imm_v256_shl_n_byte<11>), + MAP(imm_v256_shl_n_byte<12>), + MAP(imm_v256_shl_n_byte<13>), + MAP(imm_v256_shl_n_byte<14>), + MAP(imm_v256_shl_n_byte<15>), + MAP(imm_v256_shl_n_byte<16>), + MAP(imm_v256_shl_n_byte<17>), + MAP(imm_v256_shl_n_byte<18>), + MAP(imm_v256_shl_n_byte<19>), + MAP(imm_v256_shl_n_byte<20>), + MAP(imm_v256_shl_n_byte<21>), + MAP(imm_v256_shl_n_byte<22>), + MAP(imm_v256_shl_n_byte<23>), + MAP(imm_v256_shl_n_byte<24>), + MAP(imm_v256_shl_n_byte<25>), + MAP(imm_v256_shl_n_byte<26>), + MAP(imm_v256_shl_n_byte<27>), + MAP(imm_v256_shl_n_byte<28>), + MAP(imm_v256_shl_n_byte<29>), + MAP(imm_v256_shl_n_byte<30>), + MAP(imm_v256_shl_n_byte<31>), + MAP(imm_v256_shl_n_8<1>), + MAP(imm_v256_shl_n_8<2>), + MAP(imm_v256_shl_n_8<3>), + MAP(imm_v256_shl_n_8<4>), + MAP(imm_v256_shl_n_8<5>), + MAP(imm_v256_shl_n_8<6>), + MAP(imm_v256_shl_n_8<7>), + MAP(imm_v256_shr_n_u8<1>), + MAP(imm_v256_shr_n_u8<2>), + MAP(imm_v256_shr_n_u8<3>), + MAP(imm_v256_shr_n_u8<4>), + MAP(imm_v256_shr_n_u8<5>), + MAP(imm_v256_shr_n_u8<6>), + MAP(imm_v256_shr_n_u8<7>), + MAP(imm_v256_shr_n_s8<1>), + MAP(imm_v256_shr_n_s8<2>), + MAP(imm_v256_shr_n_s8<3>), + MAP(imm_v256_shr_n_s8<4>), + MAP(imm_v256_shr_n_s8<5>), + MAP(imm_v256_shr_n_s8<6>), + MAP(imm_v256_shr_n_s8<7>), + MAP(imm_v256_shl_n_16<1>), + MAP(imm_v256_shl_n_16<2>), + MAP(imm_v256_shl_n_16<4>), + MAP(imm_v256_shl_n_16<6>), + MAP(imm_v256_shl_n_16<8>), + MAP(imm_v256_shl_n_16<10>), + MAP(imm_v256_shl_n_16<12>), + MAP(imm_v256_shl_n_16<14>), + MAP(imm_v256_shr_n_u16<1>), + MAP(imm_v256_shr_n_u16<2>), + MAP(imm_v256_shr_n_u16<4>), + MAP(imm_v256_shr_n_u16<6>), + MAP(imm_v256_shr_n_u16<8>), + MAP(imm_v256_shr_n_u16<10>), + MAP(imm_v256_shr_n_u16<12>), + MAP(imm_v256_shr_n_u16<14>), + MAP(imm_v256_shr_n_s16<1>), + MAP(imm_v256_shr_n_s16<2>), + MAP(imm_v256_shr_n_s16<4>), + MAP(imm_v256_shr_n_s16<6>), + MAP(imm_v256_shr_n_s16<8>), + MAP(imm_v256_shr_n_s16<10>), + MAP(imm_v256_shr_n_s16<12>), + MAP(imm_v256_shr_n_s16<14>), + MAP(imm_v256_shl_n_32<1>), + MAP(imm_v256_shl_n_32<4>), + MAP(imm_v256_shl_n_32<8>), + MAP(imm_v256_shl_n_32<12>), + MAP(imm_v256_shl_n_32<16>), + MAP(imm_v256_shl_n_32<20>), + MAP(imm_v256_shl_n_32<24>), + MAP(imm_v256_shl_n_32<28>), + MAP(imm_v256_shr_n_u32<1>), + MAP(imm_v256_shr_n_u32<4>), + MAP(imm_v256_shr_n_u32<8>), + MAP(imm_v256_shr_n_u32<12>), + MAP(imm_v256_shr_n_u32<16>), + MAP(imm_v256_shr_n_u32<20>), + MAP(imm_v256_shr_n_u32<24>), + 
MAP(imm_v256_shr_n_u32<28>), + MAP(imm_v256_shr_n_s32<1>), + MAP(imm_v256_shr_n_s32<4>), + MAP(imm_v256_shr_n_s32<8>), + MAP(imm_v256_shr_n_s32<12>), + MAP(imm_v256_shr_n_s32<16>), + MAP(imm_v256_shr_n_s32<20>), + MAP(imm_v256_shr_n_s32<24>), + MAP(imm_v256_shr_n_s32<28>), + MAP(imm_v256_shl_n_64<1>), + MAP(imm_v256_shl_n_64<4>), + MAP(imm_v256_shl_n_64<8>), + MAP(imm_v256_shl_n_64<12>), + MAP(imm_v256_shl_n_64<16>), + MAP(imm_v256_shl_n_64<20>), + MAP(imm_v256_shl_n_64<24>), + MAP(imm_v256_shl_n_64<28>), + MAP(imm_v256_shl_n_64<32>), + MAP(imm_v256_shl_n_64<36>), + MAP(imm_v256_shl_n_64<40>), + MAP(imm_v256_shl_n_64<44>), + MAP(imm_v256_shl_n_64<48>), + MAP(imm_v256_shl_n_64<52>), + MAP(imm_v256_shl_n_64<56>), + MAP(imm_v256_shl_n_64<60>), + MAP(imm_v256_shr_n_u64<1>), + MAP(imm_v256_shr_n_u64<4>), + MAP(imm_v256_shr_n_u64<8>), + MAP(imm_v256_shr_n_u64<12>), + MAP(imm_v256_shr_n_u64<16>), + MAP(imm_v256_shr_n_u64<20>), + MAP(imm_v256_shr_n_u64<24>), + MAP(imm_v256_shr_n_u64<28>), + MAP(imm_v256_shr_n_u64<32>), + MAP(imm_v256_shr_n_u64<36>), + MAP(imm_v256_shr_n_u64<40>), + MAP(imm_v256_shr_n_u64<44>), + MAP(imm_v256_shr_n_u64<48>), + MAP(imm_v256_shr_n_u64<52>), + MAP(imm_v256_shr_n_u64<56>), + MAP(imm_v256_shr_n_u64<60>), + MAP(imm_v256_shr_n_s64<1>), + MAP(imm_v256_shr_n_s64<4>), + MAP(imm_v256_shr_n_s64<8>), + MAP(imm_v256_shr_n_s64<12>), + MAP(imm_v256_shr_n_s64<16>), + MAP(imm_v256_shr_n_s64<20>), + MAP(imm_v256_shr_n_s64<24>), + MAP(imm_v256_shr_n_s64<28>), + MAP(imm_v256_shr_n_s64<32>), + MAP(imm_v256_shr_n_s64<36>), + MAP(imm_v256_shr_n_s64<40>), + MAP(imm_v256_shr_n_s64<44>), + MAP(imm_v256_shr_n_s64<48>), + MAP(imm_v256_shr_n_s64<52>), + MAP(imm_v256_shr_n_s64<56>), + MAP(imm_v256_shr_n_s64<60>), + MAP(v256_movemask_8), + MAP(v256_zero), + MAP(v256_dup_8), + MAP(v256_dup_16), + MAP(v256_dup_32), + MAP(v256_dup_64), + MAP(v256_low_u32), + MAP(v256_low_v64), + MAP(v256_from_64), + MAP(v256_from_v64), + MAP(v256_ziplo_128), + MAP(v256_ziphi_128), + MAP(v256_unpacklo_u8_s16), + MAP(v256_unpackhi_u8_s16), + MAP(v256_unpacklo_s8_s16), + MAP(v256_unpackhi_s8_s16), + MAP(v256_blend_8), + { NULL, NULL, NULL } }; +#undef MAP + +// Map reference functions to machine tuned functions. Since the +// functions depend on machine tuned types, the non-machine tuned +// instantiations of the test can't refer to these functions directly, +// so we refer to them by name and do the mapping here. +void Map(const char *name, fptr *ref, fptr *simd) { + unsigned int i; + for (i = 0; m[i].name && strcmp(name, m[i].name); i++) { + } + + *ref = m[i].ref; + *simd = m[i].simd; +} + +// Used for printing errors in TestSimd1Arg, TestSimd2Args and TestSimd3Args +std::string Print(const uint8_t *a, int size) { + std::string text = "0x"; + for (int i = 0; i < size; i++) { + const uint8_t c = a[!CONFIG_BIG_ENDIAN ? 
size - 1 - i : i];
+    // Same as snprintf(..., ..., "%02x", c)
+    text += (c >> 4) + '0' + ((c >> 4) > 9) * ('a' - '0' - 10);
+    text += (c & 15) + '0' + ((c & 15) > 9) * ('a' - '0' - 10);
+  }
+
+  return text;
+}
+
+// Used in TestSimd1Arg, TestSimd2Args and TestSimd3Args to restrict argument
+// ranges
+void SetMask(uint8_t *s, int size, uint32_t mask, uint32_t maskwidth) {
+  switch (maskwidth) {
+    case 0: {
+      break;
+    }
+    case 8: {
+      for (int i = 0; i < size; i++) s[i] &= mask;
+      break;
+    }
+    case 16: {
+      uint16_t *t = reinterpret_cast<uint16_t *>(s);
+      assert(!(reinterpret_cast<uintptr_t>(s) & 1));
+      for (int i = 0; i < size / 2; i++) t[i] &= mask;
+      break;
+    }
+    case 32: {
+      uint32_t *t = reinterpret_cast<uint32_t *>(s);
+      assert(!(reinterpret_cast<uintptr_t>(s) & 3));
+      for (int i = 0; i < size / 4; i++) t[i] &= mask;
+      break;
+    }
+    case 64: {
+      uint64_t *t = reinterpret_cast<uint64_t *>(s);
+      assert(!(reinterpret_cast<uintptr_t>(s) & 7));
+      for (int i = 0; i < size / 8; i++) t[i] &= mask;
+      break;
+    }
+    default: {
+      FAIL() << "Unsupported mask width";
+      break;
+    }
+  }
+}
+
+// We need some extra load/store functions
+void u64_store_aligned(void *p, uint64_t a) {
+  v64_store_aligned(p, v64_from_64(a));
+}
+void s32_store_aligned(void *p, int32_t a) {
+  u32_store_aligned(p, static_cast<uint32_t>(a));
+}
+void s64_store_aligned(void *p, int64_t a) {
+  v64_store_aligned(p, v64_from_64(static_cast<uint64_t>(a)));
+}
+
+void c_u64_store_aligned(void *p, uint64_t a) {
+  c_v64_store_aligned(p, c_v64_from_64(a));
+}
+
+void c_s32_store_aligned(void *p, int32_t a) {
+  c_u32_store_aligned(p, static_cast<uint32_t>(a));
+}
+
+void c_s64_store_aligned(void *p, int64_t a) {
+  c_v64_store_aligned(p, c_v64_from_64(static_cast<uint64_t>(a)));
+}
+
+uint64_t u64_load_aligned(const void *p) {
+  return v64_u64(v64_load_aligned(p));
+}
+uint16_t u16_load_aligned(const void *p) {
+  return *(reinterpret_cast<const uint16_t *>(p));
+}
+uint8_t u8_load_aligned(const void *p) {
+  return *(reinterpret_cast<const uint8_t *>(p));
+}
+
+uint64_t c_u64_load_aligned(const void *p) {
+  return c_v64_u64(c_v64_load_aligned(p));
+}
+uint16_t c_u16_load_aligned(const void *p) {
+  return *(reinterpret_cast<const uint16_t *>(p));
+}
+uint8_t c_u8_load_aligned(const void *p) {
+  return *(reinterpret_cast<const uint8_t *>(p));
+}
+
+// CompareSimd1Arg, CompareSimd2Args and CompareSimd3Args compare
+// intrinsics taking 1, 2 or 3 arguments respectively with their
+// corresponding C reference. Ideally, the loads and stores should
+// have gone into the template parameter list, but v64 and v128 could
+// be typedef'ed to the same type (which is the case on x86) and then
+// we can't instantiate both v64 and v128, so the function return and
+// argument types, including the always differing types in the C
+// equivalent, are used instead. The function arguments must be void
+// pointers and then go through a cast to avoid matching errors in the
+// branches eliminated by the typeid tests in the calling function.
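The cast-back rule this comment relies on can be shown in isolation: C++ permits converting a function pointer to a different function-pointer type and back, and only a call through the original type is well defined. A standalone sketch, with illustrative names that are not part of the patch:

    // fptr is the same kind of type-erased pointer the harness passes around.
    #include <cstdio>

    typedef void (*fptr)(void);

    static int add_one(int x) { return x + 1; }

    int main() {
      fptr erased = reinterpret_cast<fptr>(add_one);  // erase the signature
      int (*typed)(int) = reinterpret_cast<int (*)(int)>(erased);  // restore it
      std::printf("%d\n", typed(41));  // prints 42; calling 'erased' itself is UB
      return 0;
    }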
+template <typename Ret, typename Arg, typename CRet, typename CArg>
+int CompareSimd1Arg(fptr store, fptr load, fptr simd, void *d, fptr c_store,
+                    fptr c_load, fptr c_simd, void *ref_d, const void *a) {
+  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
+  Arg (*const my_load)(const void *) = (Arg(*const)(const void *))load;
+  Ret (*const my_simd)(Arg) = (Ret(*const)(Arg))simd;
+  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
+  CArg (*const my_c_load)(const void *) = (CArg(*const)(const void *))c_load;
+  CRet (*const my_c_simd)(CArg) = (CRet(*const)(CArg))c_simd;
+
+  // Call reference and intrinsic
+  my_c_store(ref_d, my_c_simd(my_c_load(a)));
+  my_store(d, my_simd(my_load(a)));
+
+  // Compare results
+  return memcmp(ref_d, d, sizeof(CRet));
+}
+
+template <typename Ret, typename Arg1, typename Arg2, typename CRet,
+          typename CArg1, typename CArg2>
+int CompareSimd2Args(fptr store, fptr load1, fptr load2, fptr simd, void *d,
+                     fptr c_store, fptr c_load1, fptr c_load2, fptr c_simd,
+                     void *ref_d, const void *a, const void *b) {
+  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
+  Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
+  Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
+  Ret (*const my_simd)(Arg1, Arg2) = (Ret(*const)(Arg1, Arg2))simd;
+  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
+  CArg1 (*const my_c_load1)(const void *) =
+      (CArg1(*const)(const void *))c_load1;
+  CArg2 (*const my_c_load2)(const void *) =
+      (CArg2(*const)(const void *))c_load2;
+  CRet (*const my_c_simd)(CArg1, CArg2) = (CRet(*const)(CArg1, CArg2))c_simd;
+
+  // Call reference and intrinsic
+  my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b)));
+  my_store(d, my_simd(my_load1(a), my_load2(b)));
+
+  // Compare results
+  return memcmp(ref_d, d, sizeof(CRet));
+}
+
+template <typename Ret, typename Arg1, typename Arg2, typename Arg3,
+          typename CRet, typename CArg1, typename CArg2, typename CArg3>
+int CompareSimd3Args(fptr store, fptr load1, fptr load2, fptr load3, fptr simd,
+                     void *d, fptr c_store, fptr c_load1, fptr c_load2,
+                     fptr c_load3, fptr c_simd, void *ref_d, const void *a,
+                     const void *b, const void *c) {
+  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
+  Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
+  Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
+  Arg3 (*const my_load3)(const void *) = (Arg3(*const)(const void *))load3;
+  Ret (*const my_simd)(Arg1, Arg2, Arg3) = (Ret(*const)(Arg1, Arg2, Arg3))simd;
+  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
+  CArg1 (*const my_c_load1)(const void *) =
+      (CArg1(*const)(const void *))c_load1;
+  CArg2 (*const my_c_load2)(const void *) =
+      (CArg2(*const)(const void *))c_load2;
+  CArg3 (*const my_c_load3)(const void *) =
+      (CArg3(*const)(const void *))c_load3;
+  CRet (*const my_c_simd)(CArg1, CArg2, CArg3) =
+      (CRet(*const)(CArg1, CArg2, CArg3))c_simd;
+
+  // Call reference and intrinsic
+  my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b), my_c_load3(c)));
+  my_store(d, my_simd(my_load1(a), my_load2(b), my_load3(c)));
+
+  // Compare results
+  return memcmp(ref_d, d, sizeof(CRet));
+}
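What the TestSimd* drivers below do with these comparators is plain differential testing: feed both implementations identical pseudo-random inputs and memcmp the results. A self-contained sketch of that loop shape, assuming a hypothetical ref/fast pair and the GCC/Clang builtin __builtin_popcount; none of these names come from the patch:

    #include <cstdint>
    #include <cstdio>

    static uint32_t ref_popcnt(uint32_t x) {  // simple, obviously-correct reference
      uint32_t n = 0;
      while (x) { n += x & 1; x >>= 1; }
      return n;
    }
    static uint32_t fast_popcnt(uint32_t x) { return __builtin_popcount(x); }

    int main() {
      uint32_t seed = 1;
      for (int i = 0; i < 65536; i++) {
        seed = seed * 1664525u + 1013904223u;  // LCG stand-in for ACMRandom
        if (ref_popcnt(seed) != fast_popcnt(seed)) {
          std::printf("mismatch at 0x%08x\n", seed);
          return 1;
        }
      }
      return 0;
    }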
+} // namespace
+
+template <typename CRet, typename CArg>
+void TestSimd1Arg(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                  const char *name) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  fptr ref_simd;
+  fptr simd;
+  int error = 0;
+  DECLARE_ALIGNED(32, uint8_t, s[32]);
+  DECLARE_ALIGNED(32, uint8_t, d[32]);
+  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
+  assert(sizeof(CArg) <= 32 && sizeof(CRet) <= 32);
+  memset(ref_d, 0, sizeof(ref_d));
+  memset(d, 0, sizeof(d));
+
+  Map(name, &ref_simd, &simd);
+  if (simd == NULL || ref_simd == NULL) {
+    FAIL() << "Internal error: Unknown intrinsic function " << name;
+  }
+  for (unsigned int count = 0;
+       count < iterations && !error && !testing::Test::HasFailure(); count++) {
+    for (unsigned int c = 0; c < sizeof(CArg); c++) s[c] = rnd.Rand8();
+
+    if (maskwidth) {
+      SetMask(s, sizeof(CArg), mask, maskwidth);
+    }
+
+    if (typeid(CRet) == typeid(c_v64) && typeid(CArg) == typeid(c_v64)) {
+      // V64_V64
+      error = CompareSimd1Arg<v64, v64, CRet, CArg>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v64) &&
+               typeid(CArg) == typeid(uint8_t)) {
+      // V64_U8
+      error = CompareSimd1Arg<v64, uint8_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v64) &&
+               typeid(CArg) == typeid(uint16_t)) {
+      // V64_U16
+      error = CompareSimd1Arg<v64, uint16_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v64) &&
+               typeid(CArg) == typeid(uint32_t)) {
+      // V64_U32
+      error = CompareSimd1Arg<v64, uint32_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(uint64_t) &&
+               typeid(CArg) == typeid(c_v64)) {
+      // U64_V64
+      error = CompareSimd1Arg<uint64_t, v64, CRet, CArg>(
+          reinterpret_cast<fptr>(u64_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u64_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(int64_t) &&
+               typeid(CArg) == typeid(c_v64)) {
+      // S64_V64
+      error = CompareSimd1Arg<int64_t, v64, CRet, CArg>(
+          reinterpret_cast<fptr>(s64_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_s64_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(uint32_t) &&
+               typeid(CArg) == typeid(c_v64)) {
+      // U32_V64
+      error = CompareSimd1Arg<uint32_t, v64, CRet, CArg>(
+          reinterpret_cast<fptr>(u32_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u32_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(int32_t) &&
+               typeid(CArg) == typeid(c_v64)) {
+      // S32_V64
+      error = CompareSimd1Arg<int32_t, v64, CRet, CArg>(
+          reinterpret_cast<fptr>(s32_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_s32_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(uint32_t) &&
+               typeid(CArg) == typeid(c_v128)) {
+      // U32_V128
+      error = CompareSimd1Arg<uint32_t, v128, CRet, CArg>(
+          reinterpret_cast<fptr>(u32_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u32_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(uint64_t) &&
+               typeid(CArg) == typeid(c_v128)) {
+      // U64_V128
+      error = CompareSimd1Arg<uint64_t, v128, CRet, CArg>(
+          reinterpret_cast<fptr>(u64_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u64_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(uint64_t) &&
+               typeid(CArg) == typeid(c_v256)) {
+      // U64_V256
+      error = CompareSimd1Arg<uint64_t, v256, CRet, CArg>(
+          reinterpret_cast<fptr>(u64_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u64_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v64) &&
+               typeid(CArg) == typeid(c_v128)) {
+      // V64_V128
+      error = CompareSimd1Arg<v64, v128, CRet, CArg>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg) == typeid(c_v128)) {
+      // V128_V128
+      error = CompareSimd1Arg<v128, v128, CRet, CArg>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg) == typeid(c_v64)) {
+      // V128_V64
+      error = CompareSimd1Arg<v128, v64, CRet, CArg>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg) == typeid(uint8_t)) {
+      // V128_U8
+      error = CompareSimd1Arg<v128, uint8_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg) == typeid(uint16_t)) {
+      // V128_U16
+      error = CompareSimd1Arg<v128, uint16_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg) == typeid(uint32_t)) {
+      // V128_U32
+      error = CompareSimd1Arg<v128, uint32_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg) == typeid(uint64_t)) {
+      // V128_U64
+      error = CompareSimd1Arg<v128, uint64_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg) == typeid(c_v256)) {
+      // V256_V256
+      error = CompareSimd1Arg<v256, v256, CRet, CArg>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg) == typeid(c_v128)) {
+      // V256_V128
+      error = CompareSimd1Arg<v256, v128, CRet, CArg>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg) == typeid(uint8_t)) {
+      // V256_U8
+      error = CompareSimd1Arg<v256, uint8_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg) == typeid(uint16_t)) {
+      // V256_U16
+      error = CompareSimd1Arg<v256, uint16_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg) == typeid(uint32_t)) {
+      // V256_U32
+      error = CompareSimd1Arg<v256, uint32_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg) == typeid(uint64_t)) {
+      // V256_U64
+      error = CompareSimd1Arg<v256, uint64_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(uint32_t) &&
+               typeid(CArg) == typeid(c_v256)) {
+      // U32_V256
+      error = CompareSimd1Arg<uint32_t, v256, CRet, CArg>(
+          reinterpret_cast<fptr>(u32_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u32_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v64) &&
+               typeid(CArg) == typeid(c_v256)) {
+      // V64_V256
+      error = CompareSimd1Arg<v64, v256, CRet, CArg>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
+    } else {
+      FAIL() << "Internal error: Unknown intrinsic function "
+             << typeid(CRet).name() << " " << name << "(" << typeid(CArg).name()
+             << ")";
+    }
+  }
+
+  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
+                      << Print(s, sizeof(CArg)) << ") -> "
+                      << Print(d, sizeof(CRet)) << " (simd), "
+                      << Print(ref_d, sizeof(CRet)) << " (ref)";
+}
+
+template <typename CRet, typename CArg1, typename CArg2>
+void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                   const char *name) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  fptr ref_simd;
+  fptr simd;
+  int error = 0;
+  DECLARE_ALIGNED(32, uint8_t, s1[32]);
+  DECLARE_ALIGNED(32, uint8_t, s2[32]);
+  DECLARE_ALIGNED(32, uint8_t, d[32]);
+  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
+  assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CRet) <= 32);
+  memset(ref_d, 0, sizeof(ref_d));
+  memset(d, 0, sizeof(d));
+
+  Map(name, &ref_simd, &simd);
+  if (simd == NULL || ref_simd == NULL) {
+    FAIL() << "Internal error: Unknown intrinsic function " << name;
+  }
+
+  for (unsigned int count = 0;
+       count < iterations && !error && !testing::Test::HasFailure(); count++) {
+    for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();
+
+    for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();
+
+    if (maskwidth) SetMask(s2, sizeof(CArg2), mask, maskwidth);
+
+    if (typeid(CRet) == typeid(c_v64) && typeid(CArg1) == typeid(c_v64) &&
+        typeid(CArg2) == typeid(c_v64)) {
+      // V64_V64V64
+      error = CompareSimd2Args<v64, v64, v64, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v64) &&
+               typeid(CArg1) == typeid(uint32_t) &&
+               typeid(CArg2) == typeid(uint32_t)) {
+      // V64_U32U32
+      error = CompareSimd2Args<v64, uint32_t, uint32_t, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(u32_load_aligned),
+          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_u32_load_aligned),
+          reinterpret_cast<fptr>(c_u32_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(uint32_t) &&
+               typeid(CArg1) == typeid(c_v64) &&
+               typeid(CArg2) == typeid(c_v64)) {
+      // U32_V64V64
+      error = CompareSimd2Args<uint32_t, v64, v64, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(u32_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u32_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(int64_t) &&
+               typeid(CArg1) == typeid(c_v64) &&
+               typeid(CArg2) == typeid(c_v64)) {
+      // S64_V64V64
+      error = CompareSimd2Args<int64_t, v64, v64, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(s64_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_s64_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v64) &&
+               typeid(CArg1) == typeid(c_v64) &&
+               typeid(CArg2) == typeid(uint32_t)) {
+      // V64_V64U32
+      error = CompareSimd2Args<v64, v64, uint32_t, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned),
+          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(c_u32_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg1) == typeid(c_v128) &&
+               typeid(CArg2) == typeid(c_v128)) {
+      // V128_V128V128
+      error = CompareSimd2Args<v128, v128, v128, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(uint32_t) &&
+               typeid(CArg1) == typeid(c_v128) &&
+               typeid(CArg2) == typeid(c_v128)) {
+      // U32_V128V128
+      error = CompareSimd2Args<uint32_t, v128, v128, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(u32_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u32_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(uint64_t) &&
+               typeid(CArg1) == typeid(c_v128) &&
+               typeid(CArg2) == typeid(c_v128)) {
+      // U64_V128V128
+      error = CompareSimd2Args<uint64_t, v128, v128, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(u64_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u64_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(int64_t) &&
+               typeid(CArg1) == typeid(c_v128) &&
+               typeid(CArg2) == typeid(c_v128)) {
+      // S64_V128V128
+      error = CompareSimd2Args<int64_t, v128, v128, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(s64_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_s64_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg1) == typeid(uint64_t) &&
+               typeid(CArg2) == typeid(uint64_t)) {
+      // V128_U64U64
+      error = CompareSimd2Args<v128, uint64_t, uint64_t, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(u64_load_aligned),
+          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_u64_load_aligned),
+          reinterpret_cast<fptr>(c_u64_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg1) == typeid(c_v64) &&
+               typeid(CArg2) == typeid(c_v64)) {
+      // V128_V64V64
+      error = CompareSimd2Args<v128, v64, v64, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg1) == typeid(c_v128) &&
+               typeid(CArg2) == typeid(uint32_t)) {
+      // V128_V128U32
+      error = CompareSimd2Args<v128, v128, uint32_t, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned),
+          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(c_u32_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(c_v256)) {
+      // V256_V256V256
+      error = CompareSimd2Args<v256, v256, v256, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(uint64_t) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(c_v256)) {
+      // U64_V256V256
+      error = CompareSimd2Args<uint64_t, v256, v256, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(u64_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u64_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(int64_t) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(c_v256)) {
+      // S64_V256V256
+      error = CompareSimd2Args<int64_t, v256, v256, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(s64_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_s64_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(uint32_t) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(c_v256)) {
+      // U32_V256V256
+      error = CompareSimd2Args<uint32_t, v256, v256, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(u32_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u32_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg1) == typeid(c_v128) &&
+               typeid(CArg2) == typeid(c_v128)) {
+      // V256_V128V128
+      error = CompareSimd2Args<v256, v128, v128, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(uint32_t)) {
+      // V256_V256U32
+      error = CompareSimd2Args<v256, v256, uint32_t, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned),
+          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(c_u32_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else {
+      FAIL() << "Internal error: Unknown intrinsic function "
+             << typeid(CRet).name() << " " << name << "("
+             << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ")";
+    }
+  }
+
+  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
+                      << Print(s1, sizeof(CArg1)) << ", "
+                      << Print(s2, sizeof(CArg2)) << ") -> "
+                      << Print(d, sizeof(CRet)) << " (simd), "
+                      << Print(ref_d, sizeof(CRet)) << " (ref)";
+}
+
+template <typename CRet, typename CArg1, typename CArg2, typename CArg3>
+void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                   const char *name) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  fptr ref_simd;
+  fptr simd;
+  int error = 0;
+  DECLARE_ALIGNED(32, uint8_t, s1[32]);
+  DECLARE_ALIGNED(32, uint8_t, s2[32]);
+  DECLARE_ALIGNED(32, uint8_t, s3[32]);
+  DECLARE_ALIGNED(32, uint8_t, d[32]);
+  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
+  assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CArg3) <= 32 &&
+         sizeof(CRet) <= 32);
+  memset(ref_d, 0, sizeof(ref_d));
+  memset(d, 0, sizeof(d));
+
+  Map(name, &ref_simd, &simd);
+  if (simd == NULL || ref_simd == NULL) {
+    FAIL() << "Internal error: Unknown intrinsic function " << name;
+  }
+
+  for (unsigned int count = 0;
+       count < iterations && !error && !testing::Test::HasFailure(); count++) {
+    for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();
+
+    for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();
+
+    for (unsigned int c = 0; c < sizeof(CArg3); c++) s3[c] = rnd.Rand8();
+
+    if (maskwidth) SetMask(s3, sizeof(CArg3), mask, maskwidth);
+
+    if (typeid(CRet) == typeid(c_v128) && typeid(CArg1) == typeid(c_v128) &&
+        typeid(CArg2) == typeid(c_v128) && typeid(CArg3) == typeid(c_v128)) {
+      // V128_V128V128V128
+      error =
+          CompareSimd3Args<v128, v128, v128, v128, CRet, CArg1, CArg2, CArg3>(
+              reinterpret_cast<fptr>(v128_store_aligned),
+              reinterpret_cast<fptr>(v128_load_aligned),
+              reinterpret_cast<fptr>(v128_load_aligned),
+              reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+              reinterpret_cast<fptr>(c_v128_store_aligned),
+              reinterpret_cast<fptr>(c_v128_load_aligned),
+              reinterpret_cast<fptr>(c_v128_load_aligned),
+              reinterpret_cast<fptr>(c_v128_load_aligned),
+              reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(c_v256) &&
+               typeid(CArg3) == typeid(c_v256)) {
+      // V256_V256V256V256
+      error =
+          CompareSimd3Args<v256, v256, v256, v256, CRet, CArg1, CArg2, CArg3>(
+              reinterpret_cast<fptr>(v256_store_aligned),
+              reinterpret_cast<fptr>(v256_load_aligned),
+              reinterpret_cast<fptr>(v256_load_aligned),
+              reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+              reinterpret_cast<fptr>(c_v256_store_aligned),
+              reinterpret_cast<fptr>(c_v256_load_aligned),
+              reinterpret_cast<fptr>(c_v256_load_aligned),
+              reinterpret_cast<fptr>(c_v256_load_aligned),
+              reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
+    } else {
+      FAIL() << "Internal error: Unknown intrinsic function "
+             << typeid(CRet).name() << " " << name << "("
+             << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ", "
+             << typeid(CArg3).name() << ")";
+    }
+  }
+
+  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
+                      << Print(s1, sizeof(CArg1)) << ", "
+                      << Print(s2, sizeof(CArg2)) << ", "
+                      << Print(s3, sizeof(CArg3)) << ") -> "
+                      << Print(d, sizeof(CRet)) << " (simd), "
+                      << Print(ref_d, sizeof(CRet)) << " (ref)";
+}
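The explicit instantiations that follow are needed because the template bodies above live only in this translation unit, while the per-arch test files call them through bare declarations; without a matching "template void TestSimd...<...>(...);" the linker would find no symbol. A miniature of the same split, with hypothetical names (the two halves are shown in one block and also compile as a single translation unit):

    // impl.cc -- definition plus explicit instantiation, emitted into impl.o
    template <typename T> T twice(T v) { return v + v; }
    template int twice<int>(int);  // forces code generation for T = int

    // user.cc -- declaration only; the call resolves against impl.o at link time
    template <typename T> T twice(T);
    int use() { return twice<int>(21); }  // returns 42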
+// Instantiations to make the functions callable from another file
+template void TestSimd1Arg<c_v64, uint8_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v64, uint16_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v64, uint32_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v64, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<uint32_t, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<int32_t, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<uint64_t, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<int64_t, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v64, uint32_t, uint32_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v64, c_v64, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<int64_t, c_v64, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v64, c_v64, uint32_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<uint32_t, c_v64, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v128, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v128, uint8_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v128, uint16_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v128, uint32_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v128, uint64_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v128, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<uint32_t, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<uint64_t, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v64, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v128, c_v128, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v128, uint64_t, uint64_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v128, c_v64, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v128, c_v128, uint32_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<uint32_t, c_v128, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<uint64_t, c_v128, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<int64_t, c_v128, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v256, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v256, uint8_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v256, uint16_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v256, uint32_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v256, uint64_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v256, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<uint32_t, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<uint64_t, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v64, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v256, c_v256, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v256, c_v128, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v256, c_v256, uint32_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<uint32_t, c_v256, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<uint64_t, c_v256, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<int64_t, c_v256, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+
+} // namespace SIMD_NAMESPACE
diff --git a/libs/libaom/src/test/simd_cmp_neon.cc b/libs/libaom/src/test/simd_cmp_neon.cc
new file mode 100644
index 000000000..53c1e2a07
--- /dev/null
+++ b/libs/libaom/src/test/simd_cmp_neon.cc
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if defined(__OPTIMIZE__) && __OPTIMIZE__
+#define ARCH NEON
+#define ARCH_POSTFIX(name) name##_neon
+#define SIMD_NAMESPACE simd_test_neon
+#include "test/simd_cmp_impl.h"
+#endif
diff --git a/libs/libaom/src/test/simd_cmp_sse2.cc b/libs/libaom/src/test/simd_cmp_sse2.cc
new file mode 100644
index 000000000..f7827a7fa
--- /dev/null
+++ b/libs/libaom/src/test/simd_cmp_sse2.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \
+    (!defined(__GNUC__) && !defined(_DEBUG))
+#define ARCH SSE2
+#define ARCH_POSTFIX(name) name##_sse2
+#define SIMD_NAMESPACE simd_test_sse2
+#include "test/simd_cmp_impl.h"
+#endif
diff --git a/libs/libaom/src/test/simd_cmp_sse4.cc b/libs/libaom/src/test/simd_cmp_sse4.cc
new file mode 100644
index 000000000..3566764b6
--- /dev/null
+++ b/libs/libaom/src/test/simd_cmp_sse4.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \
+    (!defined(__GNUC__) && !defined(_DEBUG))
+#define ARCH SSE4_1
+#define ARCH_POSTFIX(name) name##_sse4_1
+#define SIMD_NAMESPACE simd_test_sse4_1
+#include "test/simd_cmp_impl.h"
+#endif
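Each of these per-arch .cc files is a thin wrapper: it selects the flavor through three macros and re-includes the shared implementation, so the same tests land in simd_test_neon, simd_test_sse2, and so on, each with an arch suffix pasted onto the test-case names; the __OPTIMIZE__/_DEBUG guards skip the tests in unoptimized builds, where the C intrinsic emulation is too slow. The token-pasting trick in miniature, with a hypothetical function name:

    #include <cstdio>

    #define ARCH_POSTFIX(name) name##_sse2
    #define SIMD_NAMESPACE simd_test_sse2

    namespace SIMD_NAMESPACE {
    static const char *ARCH_POSTFIX(tag)(void) { return "sse2"; }  // defines tag_sse2
    }

    int main() {
      std::printf("%s\n", simd_test_sse2::tag_sse2());
      return 0;
    }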
diff --git a/libs/libaom/src/test/simd_cmp_ssse3.cc b/libs/libaom/src/test/simd_cmp_ssse3.cc
new file mode 100644
index 000000000..57bf135dd
--- /dev/null
+++ b/libs/libaom/src/test/simd_cmp_ssse3.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \
+    (!defined(__GNUC__) && !defined(_DEBUG))
+#define ARCH SSSE3
+#define ARCH_POSTFIX(name) name##_ssse3
+#define SIMD_NAMESPACE simd_test_ssse3
+#include "test/simd_cmp_impl.h"
+#endif
diff --git a/libs/libaom/src/test/simd_impl.h b/libs/libaom/src/test/simd_impl.h
new file mode 100644
index 000000000..61fda009f
--- /dev/null
+++ b/libs/libaom/src/test/simd_impl.h
@@ -0,0 +1,1143 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#define SIMD_CHECK 1
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "aom_dsp/aom_simd_inline.h"
+#include "aom_dsp/simd/v256_intrinsics_c.h"
+
+namespace SIMD_NAMESPACE {
+
+template <typename param_signature>
+class TestIntrinsic : public ::testing::TestWithParam<param_signature> {
+ public:
+  virtual ~TestIntrinsic() {}
+  virtual void SetUp() {
+    mask = std::get<0>(this->GetParam());
+    maskwidth = std::get<1>(this->GetParam());
+    name = std::get<2>(this->GetParam());
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  uint32_t mask, maskwidth;
+  const char *name;
+};
+
+// Create one typedef for each function signature
+#define TYPEDEF_SIMD(name)                                             \
+  typedef TestIntrinsic<std::tuple<uint32_t, uint32_t, const char *> > \
+      ARCH_POSTFIX(name)
+
+TYPEDEF_SIMD(V64_U8);
+TYPEDEF_SIMD(V64_U16);
+TYPEDEF_SIMD(V64_U32);
+TYPEDEF_SIMD(V64_V64);
+TYPEDEF_SIMD(U32_V64);
+TYPEDEF_SIMD(S32_V64);
+TYPEDEF_SIMD(U64_V64);
+TYPEDEF_SIMD(S64_V64);
+TYPEDEF_SIMD(V64_U32U32);
+TYPEDEF_SIMD(V64_V64V64);
+TYPEDEF_SIMD(S64_V64V64);
+TYPEDEF_SIMD(V64_V64U32);
+TYPEDEF_SIMD(U32_V64V64);
+TYPEDEF_SIMD(V128_V64);
+TYPEDEF_SIMD(V128_V128);
+TYPEDEF_SIMD(U32_V128);
+TYPEDEF_SIMD(U64_V128);
+TYPEDEF_SIMD(V64_V128);
+TYPEDEF_SIMD(V128_U8);
+TYPEDEF_SIMD(V128_U16);
+TYPEDEF_SIMD(V128_U32);
+TYPEDEF_SIMD(V128_U64);
+TYPEDEF_SIMD(V128_U64U64);
+TYPEDEF_SIMD(V128_V64V64);
+TYPEDEF_SIMD(V128_V128V128);
+TYPEDEF_SIMD(V128_V128V128V128);
+TYPEDEF_SIMD(S64_V128V128);
+TYPEDEF_SIMD(V128_V128U32);
+TYPEDEF_SIMD(U32_V128V128);
+TYPEDEF_SIMD(U64_V128V128);
+TYPEDEF_SIMD(V256_V128);
+TYPEDEF_SIMD(V256_V256);
+TYPEDEF_SIMD(U64_V256);
+TYPEDEF_SIMD(V256_V128V128);
+TYPEDEF_SIMD(V256_V256V256);
+TYPEDEF_SIMD(V256_V256V256V256);
+TYPEDEF_SIMD(U64_V256V256);
+TYPEDEF_SIMD(S64_V256V256);
+TYPEDEF_SIMD(V256_V256U32);
+TYPEDEF_SIMD(U32_V256V256);
+TYPEDEF_SIMD(V256_U8);
+TYPEDEF_SIMD(V256_U16);
+TYPEDEF_SIMD(V256_U32);
+TYPEDEF_SIMD(V256_U64);
+TYPEDEF_SIMD(U32_V256);
+TYPEDEF_SIMD(V64_V256);
+
+// Google Test allows up to 50 tests per case, so split the largest
+typedef ARCH_POSTFIX(V64_V64) ARCH_POSTFIX(V64_V64_Part2);
+typedef ARCH_POSTFIX(V64_V64V64) ARCH_POSTFIX(V64_V64V64_Part2);
+typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part2);
+typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part3);
+typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part4);
+typedef ARCH_POSTFIX(V128_V128V128) ARCH_POSTFIX(V128_V128V128_Part2);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part2);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part3);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part4);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part5);
+typedef ARCH_POSTFIX(V256_V256V256) ARCH_POSTFIX(V256_V256V256_Part2);
+
+// These functions are machine tuned and located elsewhere
+template <typename CRet, typename CArg>
+void TestSimd1Arg(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                  const char *name);
+
+template <typename CRet, typename CArg1, typename CArg2>
+void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                   const char *name);
+
+template <typename CRet, typename CArg1, typename CArg2, typename CArg3>
+void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                   const char *name);
+
+const int kIterations = 65536;
+
+// Add a macro layer since TEST_P will quote the name so we need to
+// expand it first with the prefix.
+#define MY_TEST_P(name, test) TEST_P(name, test)
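The reason for the extra hop: the preprocessor applies # and ## to macro arguments before expanding them, so TEST_P on its own would register the literal token ARCH_POSTFIX(...) as the test name. A compilable sketch of the difference, using hypothetical macros:

    #include <cstdio>

    #define POSTFIX(name) name##_sse2
    #define DIRECT(name) #name           // stringizes the unexpanded argument
    #define INDIRECT(name) DIRECT(name)  // expands first, then stringizes

    int main() {
      std::printf("%s vs %s\n", DIRECT(POSTFIX(V64_U8)), INDIRECT(POSTFIX(V64_U8)));
      // prints: POSTFIX(V64_U8) vs V64_U8_sse2
      return 0;
    }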
+
+MY_TEST_P(ARCH_POSTFIX(V64_U8), TestIntrinsics) {
+  TestSimd1Arg<c_v64, uint8_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_U16), TestIntrinsics) {
+  TestSimd1Arg<c_v64, uint16_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_U32), TestIntrinsics) {
+  TestSimd1Arg<c_v64, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V64), TestIntrinsics) {
+  TestSimd1Arg<c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V64), TestIntrinsics) {
+  TestSimd1Arg<uint64_t, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S64_V64), TestIntrinsics) {
+  TestSimd1Arg<int64_t, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V64), TestIntrinsics) {
+  TestSimd1Arg<uint32_t, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S32_V64), TestIntrinsics) {
+  TestSimd1Arg<int32_t, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_U32U32), TestIntrinsics) {
+  TestSimd2Args<c_v64, uint32_t, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V64V64), TestIntrinsics) {
+  TestSimd2Args<c_v64, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S64_V64V64), TestIntrinsics) {
+  TestSimd2Args<int64_t, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V64V64), TestIntrinsics) {
+  TestSimd2Args<uint32_t, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V64U32), TestIntrinsics) {
+  TestSimd2Args<c_v64, c_v64, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+// Google Test allows up to 50 tests per case, so split the largest
+MY_TEST_P(ARCH_POSTFIX(V64_V64_Part2), TestIntrinsics) {
+  TestSimd1Arg<c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V64V64_Part2), TestIntrinsics) {
+  TestSimd2Args<c_v64, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V128), TestIntrinsics) {
+  TestSimd1Arg<uint32_t, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V128), TestIntrinsics) {
+  TestSimd1Arg<uint64_t, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V128), TestIntrinsics) {
+  TestSimd1Arg<c_v64, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128), TestIntrinsics) {
+  TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U8), TestIntrinsics) {
+  TestSimd1Arg<c_v128, uint8_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U16), TestIntrinsics) {
+  TestSimd1Arg<c_v128, uint16_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U32), TestIntrinsics) {
+  TestSimd1Arg<c_v128, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U64), TestIntrinsics) {
+  TestSimd1Arg<c_v128, uint64_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V64), TestIntrinsics) {
+  TestSimd1Arg<c_v128, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128V128), TestIntrinsics) {
+  TestSimd2Args<c_v128, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128V128V128), TestIntrinsics) {
+  TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(kIterations, mask, maskwidth,
+                                                name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V128V128), TestIntrinsics) {
+  TestSimd2Args<uint32_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V128V128), TestIntrinsics) {
+  TestSimd2Args<uint64_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S64_V128V128), TestIntrinsics) {
+  TestSimd2Args<int64_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U64U64), TestIntrinsics) {
+  TestSimd2Args<c_v128, uint64_t, uint64_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V64V64), TestIntrinsics) {
+  TestSimd2Args<c_v128, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128U32), TestIntrinsics) {
+  TestSimd2Args<c_v128, c_v128, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128V128_Part2), TestIntrinsics) {
+  TestSimd2Args<c_v128, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128_Part2), TestIntrinsics) {
+  TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128_Part3), TestIntrinsics) {
+  TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128_Part4), TestIntrinsics) {
+  TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V256), TestIntrinsics) {
+  TestSimd1Arg<uint64_t, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256), TestIntrinsics) {
+  TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V128), TestIntrinsics) {
+  TestSimd1Arg<c_v256, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256V256), TestIntrinsics) {
+  TestSimd2Args<c_v256, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256V256V256), TestIntrinsics) {
+  TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(kIterations, mask, maskwidth,
+                                                name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V128V128), TestIntrinsics) {
+  TestSimd2Args<c_v256, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V256V256), TestIntrinsics) {
+  TestSimd2Args<uint32_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V256V256), TestIntrinsics) {
+  TestSimd2Args<uint64_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S64_V256V256), TestIntrinsics) {
+  TestSimd2Args<int64_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256V256_Part2), TestIntrinsics) {
+  TestSimd2Args<c_v256, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256U32), TestIntrinsics) {
+  TestSimd2Args<c_v256, c_v256, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part2), TestIntrinsics) {
+  TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part3), TestIntrinsics) {
+  TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part4), TestIntrinsics) {
+  TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part5), TestIntrinsics) {
+  TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_U8), TestIntrinsics) {
+  TestSimd1Arg<c_v256, uint8_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_U16), TestIntrinsics) {
+  TestSimd1Arg<c_v256, uint16_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_U32), TestIntrinsics) {
+  TestSimd1Arg<c_v256, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_U64), TestIntrinsics) {
+  TestSimd1Arg<c_v256, uint64_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V256), TestIntrinsics) {
+  TestSimd1Arg<uint32_t, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V256), TestIntrinsics) {
+  TestSimd1Arg<c_v64, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+// Add a macro layer since INSTANTIATE_TEST_SUITE_P will quote the name
+// so we need to expand it first with the prefix
+#define INSTANTIATE(name, type, ...) \
+  INSTANTIATE_TEST_SUITE_P(name, type, ::testing::Values(__VA_ARGS__))
+
+#define SIMD_TUPLE(name, mask, maskwidth) \
+  std::make_tuple(mask, maskwidth, static_cast<const char *>(#name))
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V64V64), SIMD_TUPLE(v64_sad_u8, 0U, 0U),
+            SIMD_TUPLE(v64_ssd_u8, 0U, 0U));
+
+INSTANTIATE(
+    ARCH, ARCH_POSTFIX(V64_V64V64), SIMD_TUPLE(v64_add_8, 0U, 0U),
+    SIMD_TUPLE(v64_add_16, 0U, 0U), SIMD_TUPLE(v64_sadd_s16, 0U, 0U),
+    SIMD_TUPLE(v64_add_32, 0U, 0U), SIMD_TUPLE(v64_sub_8, 0U, 0U),
+    SIMD_TUPLE(v64_ssub_u8, 0U, 0U), SIMD_TUPLE(v64_ssub_s8, 0U, 0U),
+    SIMD_TUPLE(v64_sub_16, 0U, 0U), SIMD_TUPLE(v64_ssub_s16, 0U, 0U),
+    SIMD_TUPLE(v64_ssub_u16, 0U, 0U), SIMD_TUPLE(v64_sub_32, 0U, 0U),
+    SIMD_TUPLE(v64_ziplo_8, 0U, 0U), SIMD_TUPLE(v64_ziphi_8, 0U, 0U),
+    SIMD_TUPLE(v64_ziplo_16, 0U, 0U), SIMD_TUPLE(v64_ziphi_16, 0U, 0U),
+    SIMD_TUPLE(v64_ziplo_32, 0U, 0U), SIMD_TUPLE(v64_ziphi_32, 0U, 0U),
+    SIMD_TUPLE(v64_pack_s32_s16, 0U, 0U), SIMD_TUPLE(v64_pack_s16_u8, 0U, 0U),
+    SIMD_TUPLE(v64_pack_s16_s8, 0U, 0U), SIMD_TUPLE(v64_unziphi_8, 0U, 0U),
+    SIMD_TUPLE(v64_unziplo_8, 0U, 0U), SIMD_TUPLE(v64_unziphi_16, 0U, 0U),
+    SIMD_TUPLE(v64_unziplo_16, 0U, 0U), SIMD_TUPLE(v64_or, 0U, 0U),
+    SIMD_TUPLE(v64_xor, 0U, 0U), SIMD_TUPLE(v64_and, 0U, 0U),
+    SIMD_TUPLE(v64_andn, 0U, 0U), SIMD_TUPLE(v64_mullo_s16, 0U, 0U),
+    SIMD_TUPLE(v64_mulhi_s16, 0U, 0U), SIMD_TUPLE(v64_mullo_s32, 0U, 0U),
+    SIMD_TUPLE(v64_madd_s16, 0U, 0U), SIMD_TUPLE(v64_madd_us8, 0U, 0U),
+    SIMD_TUPLE(v64_avg_u8, 0U, 0U), SIMD_TUPLE(v64_rdavg_u8, 0U, 0U),
+    SIMD_TUPLE(v64_avg_u16, 0U, 0U), SIMD_TUPLE(v64_min_u8, 0U, 0U),
+    SIMD_TUPLE(v64_max_u8, 0U, 0U), SIMD_TUPLE(v64_min_s8, 0U, 0U),
+    SIMD_TUPLE(v64_max_s8, 0U, 0U), SIMD_TUPLE(v64_min_s16, 0U, 0U),
+    SIMD_TUPLE(v64_max_s16, 0U, 0U), SIMD_TUPLE(v64_cmpgt_s8, 0U, 0U),
+    SIMD_TUPLE(v64_cmplt_s8, 0U, 0U), SIMD_TUPLE(v64_cmpeq_8, 0U, 0U),
+    SIMD_TUPLE(v64_cmpgt_s16, 0U, 0U), SIMD_TUPLE(v64_cmplt_s16, 0U, 0U),
+    SIMD_TUPLE(v64_cmpeq_16, 0U, 0U));
+
+INSTANTIATE(
+    ARCH, ARCH_POSTFIX(V64_V64V64_Part2), SIMD_TUPLE(v64_shuffle_8, 7U, 8U),
+    SIMD_TUPLE(v64_pack_s32_u16, 0U, 0U), SIMD_TUPLE(v64_rdavg_u16, 0U, 0U),
+    SIMD_TUPLE(v64_sadd_s8, 0U, 0U), SIMD_TUPLE(v64_sadd_u8, 0U, 0U),
+    SIMD_TUPLE(imm_v64_align<1>, 0U, 0U), SIMD_TUPLE(imm_v64_align<2>, 0U, 0U),
+    SIMD_TUPLE(imm_v64_align<3>, 0U, 0U), SIMD_TUPLE(imm_v64_align<4>, 0U, 0U),
+    SIMD_TUPLE(imm_v64_align<5>, 0U, 0U), SIMD_TUPLE(imm_v64_align<6>, 0U, 0U),
+    SIMD_TUPLE(imm_v64_align<7>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64), SIMD_TUPLE(v64_abs_s8, 0U, 0U),
+            SIMD_TUPLE(v64_abs_s16, 0U, 0U),
+            SIMD_TUPLE(v64_unpacklo_u8_s16, 0U, 0U),
+            SIMD_TUPLE(v64_unpackhi_u8_s16, 0U, 0U),
+            SIMD_TUPLE(v64_unpacklo_s8_s16, 0U, 0U),
SIMD_TUPLE(v64_unpackhi_s8_s16, 0U, 0U), + SIMD_TUPLE(v64_unpacklo_u16_s32, 0U, 0U), + SIMD_TUPLE(v64_unpacklo_s16_s32, 0U, 0U), + SIMD_TUPLE(v64_unpackhi_u16_s32, 0U, 0U), + SIMD_TUPLE(v64_unpackhi_s16_s32, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_byte<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_byte<2>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_byte<3>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_byte<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_byte<5>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_byte<6>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_byte<7>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_byte<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_byte<2>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_byte<3>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_byte<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_byte<5>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_byte<6>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_byte<7>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_8<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_8<2>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_8<3>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_8<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_8<5>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_8<6>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_8<7>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u8<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u8<2>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u8<3>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u8<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u8<5>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u8<6>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u8<7>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s8<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s8<2>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s8<3>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s8<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s8<5>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s8<6>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s8<7>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_16<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_16<2>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_16<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_16<6>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_16<8>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64_Part2), + SIMD_TUPLE(imm_v64_shl_n_16<10>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_16<12>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_16<14>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u16<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u16<2>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u16<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u16<6>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u16<8>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u16<10>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u16<12>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u16<14>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s16<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s16<2>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s16<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s16<6>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s16<8>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s16<10>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s16<12>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s16<14>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_32<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_32<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_32<8>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_32<12>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_32<16>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_32<20>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_32<24>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_32<28>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u32<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u32<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u32<8>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u32<12>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u32<16>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u32<20>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u32<24>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u32<28>, 0U, 
0U), + SIMD_TUPLE(imm_v64_shr_n_s32<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s32<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s32<8>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s32<12>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s32<16>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s32<20>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s32<24>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s32<28>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64U32), SIMD_TUPLE(v64_shl_8, 7U, 32U), + SIMD_TUPLE(v64_shr_u8, 7U, 32U), SIMD_TUPLE(v64_shr_s8, 7U, 32U), + SIMD_TUPLE(v64_shl_16, 15U, 32U), SIMD_TUPLE(v64_shr_u16, 15U, 32U), + SIMD_TUPLE(v64_shr_s16, 15U, 32U), SIMD_TUPLE(v64_shl_32, 31U, 32U), + SIMD_TUPLE(v64_shr_u32, 31U, 32U), + SIMD_TUPLE(v64_shr_s32, 31U, 32U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V64), SIMD_TUPLE(v64_hadd_u8, 0U, 0U), + SIMD_TUPLE(v64_u64, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V64), SIMD_TUPLE(v64_hadd_s16, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V64), SIMD_TUPLE(v64_low_u32, 0U, 0U), + SIMD_TUPLE(v64_high_u32, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(S32_V64), SIMD_TUPLE(v64_low_s32, 0U, 0U), + SIMD_TUPLE(v64_high_s32, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V64V64), SIMD_TUPLE(v64_dotp_s16, 0U, 0U), + SIMD_TUPLE(v64_dotp_su8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U8), SIMD_TUPLE(v64_dup_8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U16), SIMD_TUPLE(v64_dup_16, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U32), SIMD_TUPLE(v64_dup_32, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U32U32), SIMD_TUPLE(v64_from_32, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V128V128), SIMD_TUPLE(v128_sad_u8, 0U, 0U), + SIMD_TUPLE(v128_ssd_u8, 0U, 0U), SIMD_TUPLE(v128_sad_u16, 0U, 0U)); +INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V128V128), SIMD_TUPLE(v128_ssd_s16, 0U, 0U)); + +INSTANTIATE( + ARCH, ARCH_POSTFIX(V128_V128V128), SIMD_TUPLE(v128_add_8, 0U, 0U), + SIMD_TUPLE(v128_add_16, 0U, 0U), SIMD_TUPLE(v128_sadd_s16, 0U, 0U), + SIMD_TUPLE(v128_add_32, 0U, 0U), SIMD_TUPLE(v128_sub_8, 0U, 0U), + SIMD_TUPLE(v128_ssub_u8, 0U, 0U), SIMD_TUPLE(v128_ssub_s8, 0U, 0U), + SIMD_TUPLE(v128_sub_16, 0U, 0U), SIMD_TUPLE(v128_ssub_s16, 0U, 0U), + SIMD_TUPLE(v128_ssub_u16, 0U, 0U), SIMD_TUPLE(v128_sub_32, 0U, 0U), + SIMD_TUPLE(v128_ziplo_8, 0U, 0U), SIMD_TUPLE(v128_ziphi_8, 0U, 0U), + SIMD_TUPLE(v128_ziplo_16, 0U, 0U), SIMD_TUPLE(v128_ziphi_16, 0U, 0U), + SIMD_TUPLE(v128_ziplo_32, 0U, 0U), SIMD_TUPLE(v128_ziphi_32, 0U, 0U), + SIMD_TUPLE(v128_ziplo_64, 0U, 0U), SIMD_TUPLE(v128_ziphi_64, 0U, 0U), + SIMD_TUPLE(v128_unziphi_8, 0U, 0U), SIMD_TUPLE(v128_unziplo_8, 0U, 0U), + SIMD_TUPLE(v128_unziphi_16, 0U, 0U), SIMD_TUPLE(v128_unziplo_16, 0U, 0U), + SIMD_TUPLE(v128_unziphi_32, 0U, 0U), SIMD_TUPLE(v128_unziplo_32, 0U, 0U), + SIMD_TUPLE(v128_pack_s32_s16, 0U, 0U), SIMD_TUPLE(v128_pack_s16_u8, 0U, 0U), + SIMD_TUPLE(v128_pack_s16_s8, 0U, 0U), SIMD_TUPLE(v128_or, 0U, 0U), + SIMD_TUPLE(v128_xor, 0U, 0U), SIMD_TUPLE(v128_and, 0U, 0U), + SIMD_TUPLE(v128_andn, 0U, 0U), SIMD_TUPLE(v128_mullo_s16, 0U, 0U), + SIMD_TUPLE(v128_mulhi_s16, 0U, 0U), SIMD_TUPLE(v128_mullo_s32, 0U, 0U), + SIMD_TUPLE(v128_madd_s16, 0U, 0U), SIMD_TUPLE(v128_madd_us8, 0U, 0U), + SIMD_TUPLE(v128_avg_u8, 0U, 0U), SIMD_TUPLE(v128_rdavg_u8, 0U, 0U), + SIMD_TUPLE(v128_avg_u16, 0U, 0U), SIMD_TUPLE(v128_min_u8, 0U, 0U), + SIMD_TUPLE(v128_max_u8, 0U, 0U), SIMD_TUPLE(v128_min_s8, 0U, 0U), + SIMD_TUPLE(v128_max_s8, 0U, 0U), SIMD_TUPLE(v128_min_s16, 0U, 0U), + SIMD_TUPLE(v128_max_s16, 0U, 0U), SIMD_TUPLE(v128_cmpgt_s8, 0U, 0U), + SIMD_TUPLE(v128_cmplt_s8, 0U, 
0U), SIMD_TUPLE(v128_cmpeq_8, 0U, 0U), + SIMD_TUPLE(v128_cmpgt_s16, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128V128_Part2), + SIMD_TUPLE(v128_pack_s32_u16, 0U, 0U), + SIMD_TUPLE(v128_rdavg_u16, 0U, 0U), SIMD_TUPLE(v128_add_64, 0U, 0U), + SIMD_TUPLE(v128_sub_64, 0U, 0U), SIMD_TUPLE(v128_sadd_s8, 0U, 0U), + SIMD_TUPLE(v128_sadd_u8, 0U, 0U), SIMD_TUPLE(v128_cmpeq_16, 0U, 0U), + SIMD_TUPLE(v128_cmplt_s16, 0U, 0U), + SIMD_TUPLE(v128_cmplt_s32, 0U, 0U), + SIMD_TUPLE(v128_cmpeq_32, 0U, 0U), + SIMD_TUPLE(v128_cmpgt_s32, 0U, 0U), + SIMD_TUPLE(v128_shuffle_8, 15U, 8U), + SIMD_TUPLE(v128_min_s32, 0U, 0U), SIMD_TUPLE(v128_max_s32, 0U, 0U), + SIMD_TUPLE(imm_v128_align<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<3>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<5>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<7>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<9>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<10>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<11>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<13>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<14>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<15>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128V128V128), + SIMD_TUPLE(v128_blend_8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128), SIMD_TUPLE(v128_abs_s8, 0U, 0U), + SIMD_TUPLE(v128_abs_s16, 0U, 0U), SIMD_TUPLE(v128_padd_s16, 0U, 0U), + SIMD_TUPLE(v128_unpacklo_u8_s16, 0U, 0U), + SIMD_TUPLE(v128_unpacklo_s8_s16, 0U, 0U), + SIMD_TUPLE(v128_unpacklo_u16_s32, 0U, 0U), + SIMD_TUPLE(v128_unpacklo_s16_s32, 0U, 0U), + SIMD_TUPLE(v128_unpackhi_u8_s16, 0U, 0U), + SIMD_TUPLE(v128_unpackhi_s8_s16, 0U, 0U), + SIMD_TUPLE(v128_unpackhi_u16_s32, 0U, 0U), + SIMD_TUPLE(v128_unpackhi_s16_s32, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<3>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<5>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<7>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<9>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<10>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<11>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<13>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<14>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<15>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<3>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<5>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<7>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<9>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<10>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<11>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<13>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<14>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<15>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_8<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_8<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_8<3>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_8<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_8<5>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_8<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_8<7>, 0U, 
0U), + SIMD_TUPLE(imm_v128_shr_n_u8<1>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part2), + SIMD_TUPLE(imm_v128_shr_n_u8<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u8<3>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u8<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u8<5>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u8<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u8<7>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s8<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s8<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s8<3>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s8<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s8<5>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s8<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s8<7>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_16<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_16<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_16<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_16<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_16<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_16<10>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_16<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_16<14>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u16<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u16<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u16<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u16<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u16<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u16<10>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u16<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u16<14>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s16<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s16<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s16<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s16<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s16<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s16<10>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s16<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s16<14>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_32<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_32<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_32<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_32<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_32<16>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_32<20>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_32<24>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_32<28>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u32<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u32<4>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part3), + SIMD_TUPLE(imm_v128_shr_n_u32<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u32<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u32<16>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u32<20>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u32<24>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u32<28>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s32<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s32<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s32<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s32<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s32<16>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s32<20>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s32<24>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s32<28>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part4), + SIMD_TUPLE(imm_v128_shl_n_64<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<16>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<20>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<24>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<28>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<32>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<36>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<40>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<44>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<48>, 0U, 0U), + 
SIMD_TUPLE(imm_v128_shl_n_64<52>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<56>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<60>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<16>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<20>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<24>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<28>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<32>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<36>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<40>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<44>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<48>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<52>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<56>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<60>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<16>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<20>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<24>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<28>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<32>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<36>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<40>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<44>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<48>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<52>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<56>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<60>, 0U, 0U), + SIMD_TUPLE(v128_padd_u8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V64V64), SIMD_TUPLE(v128_from_v64, 0U, 0U), + SIMD_TUPLE(v128_zip_8, 0U, 0U), SIMD_TUPLE(v128_zip_16, 0U, 0U), + SIMD_TUPLE(v128_zip_32, 0U, 0U), SIMD_TUPLE(v128_mul_s16, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U64U64), SIMD_TUPLE(v128_from_64, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V64), + SIMD_TUPLE(v128_unpack_u8_s16, 0U, 0U), + SIMD_TUPLE(v128_unpack_s8_s16, 0U, 0U), + SIMD_TUPLE(v128_unpack_u16_s32, 0U, 0U), + SIMD_TUPLE(v128_unpack_s16_s32, 0U, 0U)); + +INSTANTIATE( + ARCH, ARCH_POSTFIX(V128_V128U32), SIMD_TUPLE(v128_shl_8, 7U, 32U), + SIMD_TUPLE(v128_shr_u8, 7U, 32U), SIMD_TUPLE(v128_shr_s8, 7U, 32U), + SIMD_TUPLE(v128_shl_16, 15U, 32U), SIMD_TUPLE(v128_shr_u16, 15U, 32U), + SIMD_TUPLE(v128_shr_s16, 15U, 32U), SIMD_TUPLE(v128_shl_32, 31U, 32U), + SIMD_TUPLE(v128_shr_u32, 31U, 32U), SIMD_TUPLE(v128_shr_s32, 31U, 32U), + SIMD_TUPLE(v128_shl_64, 63U, 32U), SIMD_TUPLE(v128_shr_u64, 63U, 32U), + SIMD_TUPLE(v128_shr_s64, 63U, 32U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V128), SIMD_TUPLE(v128_low_u32, 0U, 0U), + SIMD_TUPLE(v128_movemask_8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V128), SIMD_TUPLE(v128_hadd_u8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V128), SIMD_TUPLE(v128_low_v64, 0U, 0U), + SIMD_TUPLE(v128_high_v64, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U8), SIMD_TUPLE(v128_dup_8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U16), SIMD_TUPLE(v128_dup_16, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U32), SIMD_TUPLE(v128_dup_32, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U64), SIMD_TUPLE(v128_dup_64, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V128V128), SIMD_TUPLE(v128_dotp_s16, 0U, 0U), + SIMD_TUPLE(v128_dotp_s32, 0U, 0U), + SIMD_TUPLE(v128_dotp_su8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V256V256), SIMD_TUPLE(v256_sad_u8, 0U, 0U), + SIMD_TUPLE(v256_ssd_u8, 0U, 0U), 
SIMD_TUPLE(v256_sad_u16, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V256), SIMD_TUPLE(v256_hadd_u8, 0U, 0U), + SIMD_TUPLE(v256_low_u64, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V256V256), SIMD_TUPLE(v256_dotp_s16, 0U, 0U), + SIMD_TUPLE(v256_dotp_s32, 0U, 0U), + SIMD_TUPLE(v256_dotp_su8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V256V256), SIMD_TUPLE(v256_ssd_s16, 0U, 0U)); + +INSTANTIATE( + ARCH, ARCH_POSTFIX(V256_V256V256), SIMD_TUPLE(v256_add_8, 0U, 0U), + SIMD_TUPLE(v256_add_16, 0U, 0U), SIMD_TUPLE(v256_sadd_s16, 0U, 0U), + SIMD_TUPLE(v256_add_32, 0U, 0U), SIMD_TUPLE(v256_sub_8, 0U, 0U), + SIMD_TUPLE(v256_ssub_u8, 0U, 0U), SIMD_TUPLE(v256_ssub_s8, 0U, 0U), + SIMD_TUPLE(v256_sub_16, 0U, 0U), SIMD_TUPLE(v256_ssub_s16, 0U, 0U), + SIMD_TUPLE(v256_ssub_u16, 0U, 0U), SIMD_TUPLE(v256_sub_32, 0U, 0U), + SIMD_TUPLE(v256_ziplo_8, 0U, 0U), SIMD_TUPLE(v256_ziphi_8, 0U, 0U), + SIMD_TUPLE(v256_ziplo_16, 0U, 0U), SIMD_TUPLE(v256_ziphi_16, 0U, 0U), + SIMD_TUPLE(v256_ziplo_32, 0U, 0U), SIMD_TUPLE(v256_ziphi_32, 0U, 0U), + SIMD_TUPLE(v256_ziplo_64, 0U, 0U), SIMD_TUPLE(v256_ziphi_64, 0U, 0U), + SIMD_TUPLE(v256_ziplo_128, 0U, 0U), SIMD_TUPLE(v256_ziphi_128, 0U, 0U), + SIMD_TUPLE(v256_unziphi_8, 0U, 0U), SIMD_TUPLE(v256_unziplo_8, 0U, 0U), + SIMD_TUPLE(v256_unziphi_16, 0U, 0U), SIMD_TUPLE(v256_unziplo_16, 0U, 0U), + SIMD_TUPLE(v256_unziphi_32, 0U, 0U), SIMD_TUPLE(v256_unziplo_32, 0U, 0U), + SIMD_TUPLE(v256_pack_s32_s16, 0U, 0U), SIMD_TUPLE(v256_pack_s16_u8, 0U, 0U), + SIMD_TUPLE(v256_pack_s16_s8, 0U, 0U), SIMD_TUPLE(v256_or, 0U, 0U), + SIMD_TUPLE(v256_xor, 0U, 0U), SIMD_TUPLE(v256_and, 0U, 0U), + SIMD_TUPLE(v256_andn, 0U, 0U), SIMD_TUPLE(v256_mullo_s16, 0U, 0U), + SIMD_TUPLE(v256_mulhi_s16, 0U, 0U), SIMD_TUPLE(v256_mullo_s32, 0U, 0U), + SIMD_TUPLE(v256_madd_s16, 0U, 0U), SIMD_TUPLE(v256_madd_us8, 0U, 0U), + SIMD_TUPLE(v256_avg_u8, 0U, 0U), SIMD_TUPLE(v256_rdavg_u8, 0U, 0U), + SIMD_TUPLE(v256_avg_u16, 0U, 0U), SIMD_TUPLE(v256_min_u8, 0U, 0U), + SIMD_TUPLE(v256_max_u8, 0U, 0U), SIMD_TUPLE(v256_min_s8, 0U, 0U), + SIMD_TUPLE(v256_max_s8, 0U, 0U), SIMD_TUPLE(v256_min_s16, 0U, 0U), + SIMD_TUPLE(v256_max_s16, 0U, 0U), SIMD_TUPLE(v256_cmpgt_s8, 0U, 0U), + SIMD_TUPLE(v256_cmplt_s8, 0U, 0U)); + +INSTANTIATE( + ARCH, ARCH_POSTFIX(V256_V256V256_Part2), SIMD_TUPLE(v256_cmpeq_8, 0U, 0U), + SIMD_TUPLE(v256_min_s32, 0U, 0U), SIMD_TUPLE(v256_max_s32, 0U, 0U), + SIMD_TUPLE(v256_add_64, 0U, 0U), SIMD_TUPLE(v256_sub_64, 0U, 0U), + SIMD_TUPLE(v256_cmpgt_s16, 0U, 0U), SIMD_TUPLE(v256_cmplt_s16, 0U, 0U), + SIMD_TUPLE(v256_cmpeq_16, 0U, 0U), SIMD_TUPLE(v256_cmpgt_s32, 0U, 0U), + SIMD_TUPLE(v256_cmplt_s32, 0U, 0U), SIMD_TUPLE(v256_cmpeq_32, 0U, 0U), + SIMD_TUPLE(v256_shuffle_8, 31U, 8U), SIMD_TUPLE(v256_pshuffle_8, 15U, 8U), + SIMD_TUPLE(imm_v256_align<1>, 0U, 0U), SIMD_TUPLE(v256_sadd_s8, 0U, 0U), + SIMD_TUPLE(v256_sadd_u8, 0U, 0U), SIMD_TUPLE(v256_pack_s32_u16, 0U, 0U), + SIMD_TUPLE(v256_rdavg_u16, 0U, 0U), SIMD_TUPLE(imm_v256_align<2>, 0U, 0U), + SIMD_TUPLE(v256_unziphi_64, 0U, 0U), SIMD_TUPLE(v256_unziplo_64, 0U, 0U), + SIMD_TUPLE(imm_v256_align<3>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<5>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<7>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<9>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<10>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<11>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<13>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<14>, 0U, 0U), + 
SIMD_TUPLE(imm_v256_align<15>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<17>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<18>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<19>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<21>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<22>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<23>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<25>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<26>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<27>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<28>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<29>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<30>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<31>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V128V128), + SIMD_TUPLE(v256_from_v128, 0U, 0U), SIMD_TUPLE(v256_zip_8, 0U, 0U), + SIMD_TUPLE(v256_zip_16, 0U, 0U), SIMD_TUPLE(v256_zip_32, 0U, 0U), + SIMD_TUPLE(v256_mul_s16, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V128), + SIMD_TUPLE(v256_unpack_u8_s16, 0U, 0U), + SIMD_TUPLE(v256_unpack_s8_s16, 0U, 0U), + SIMD_TUPLE(v256_unpack_u16_s32, 0U, 0U), + SIMD_TUPLE(v256_unpack_s16_s32, 0U, 0U)); + +INSTANTIATE( + ARCH, ARCH_POSTFIX(V256_V256U32), SIMD_TUPLE(v256_shl_8, 7U, 32U), + SIMD_TUPLE(v256_shr_u8, 7U, 32U), SIMD_TUPLE(v256_shr_s8, 7U, 32U), + SIMD_TUPLE(v256_shl_16, 15U, 32U), SIMD_TUPLE(v256_shr_u16, 15U, 32U), + SIMD_TUPLE(v256_shr_s16, 15U, 32U), SIMD_TUPLE(v256_shl_32, 31U, 32U), + SIMD_TUPLE(v256_shr_u32, 31U, 32U), SIMD_TUPLE(v256_shr_s32, 31U, 32U), + SIMD_TUPLE(v256_shl_64, 63U, 32U), SIMD_TUPLE(v256_shr_u64, 63U, 32U), + SIMD_TUPLE(v256_shr_s64, 63U, 32U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256), SIMD_TUPLE(v256_abs_s8, 0U, 0U), + SIMD_TUPLE(v256_abs_s16, 0U, 0U), SIMD_TUPLE(v256_padd_s16, 0U, 0U), + SIMD_TUPLE(v256_unpacklo_u8_s16, 0U, 0U), + SIMD_TUPLE(v256_unpacklo_s8_s16, 0U, 0U), + SIMD_TUPLE(v256_unpacklo_u16_s32, 0U, 0U), + SIMD_TUPLE(v256_unpacklo_s16_s32, 0U, 0U), + SIMD_TUPLE(v256_unpackhi_u8_s16, 0U, 0U), + SIMD_TUPLE(v256_unpackhi_s8_s16, 0U, 0U), + SIMD_TUPLE(v256_unpackhi_u16_s32, 0U, 0U), + SIMD_TUPLE(v256_unpackhi_s16_s32, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<3>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<5>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<7>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<9>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<10>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<11>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<13>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<14>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<15>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<17>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<18>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<19>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<21>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<22>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<23>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<25>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<26>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<27>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<28>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<29>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<30>, 0U, 0U), + 
SIMD_TUPLE(imm_v256_shr_n_byte<31>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<3>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<5>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<7>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<8>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part2), + SIMD_TUPLE(imm_v256_shl_n_byte<9>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<10>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<11>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<13>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<14>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<15>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<17>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<18>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<19>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<21>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<22>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<23>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<25>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<26>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<27>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<28>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<29>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<30>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<31>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_8<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_8<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_8<3>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_8<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_8<5>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_8<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_8<7>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u8<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u8<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u8<3>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u8<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u8<5>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u8<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u8<7>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s8<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s8<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s8<3>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s8<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s8<5>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s8<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s8<7>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_16<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_16<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_16<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_16<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_16<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_16<10>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part3), + SIMD_TUPLE(imm_v256_shl_n_16<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_16<14>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u16<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u16<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u16<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u16<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u16<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u16<10>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u16<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u16<14>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s16<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s16<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s16<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s16<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s16<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s16<10>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s16<12>, 0U, 0U), + 
SIMD_TUPLE(imm_v256_shr_n_s16<14>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_32<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_32<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_32<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_32<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_32<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_32<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_32<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_32<28>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u32<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u32<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u32<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u32<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u32<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u32<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u32<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u32<28>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s32<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s32<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s32<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s32<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s32<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s32<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s32<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s32<28>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part4), + SIMD_TUPLE(imm_v256_shl_n_64<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<28>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<32>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<36>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<40>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<44>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<48>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<52>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<56>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<60>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<28>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<32>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<36>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<40>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<44>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<48>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<52>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<56>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<60>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<28>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<32>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<36>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<40>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<44>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<48>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<52>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<56>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<60>, 0U, 0U), + SIMD_TUPLE(v256_padd_u8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part5), + SIMD_TUPLE(imm_v256_shr_n_word<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<2>, 0U, 0U), + 
SIMD_TUPLE(imm_v256_shr_n_word<3>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<5>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<7>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<9>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<10>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<11>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<13>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<14>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<15>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<3>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<5>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<7>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<9>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<10>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<11>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<13>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<14>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<15>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256V256V256), + SIMD_TUPLE(v256_blend_8, 0U, 0U), + SIMD_TUPLE(v256_wideshuffle_8, 63U, 8U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U8), SIMD_TUPLE(v256_dup_8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U16), SIMD_TUPLE(v256_dup_16, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U32), SIMD_TUPLE(v256_dup_32, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U64), SIMD_TUPLE(v256_dup_64, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V256), SIMD_TUPLE(v256_low_u32, 0U, 0U), + SIMD_TUPLE(v256_movemask_8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V256), SIMD_TUPLE(v256_low_v64, 0U, 0U)); + +} // namespace SIMD_NAMESPACE diff --git a/libs/libaom/src/test/simd_neon_test.cc b/libs/libaom/src/test/simd_neon_test.cc new file mode 100644 index 000000000..b67b18895 --- /dev/null +++ b/libs/libaom/src/test/simd_neon_test.cc @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#if defined(__OPTIMIZE__) && __OPTIMIZE__ +#define ARCH NEON +#define ARCH_POSTFIX(name) name##_neon +#define SIMD_NAMESPACE simd_test_neon +#include "test/simd_impl.h" +#endif diff --git a/libs/libaom/src/test/simd_sse2_test.cc b/libs/libaom/src/test/simd_sse2_test.cc new file mode 100644 index 000000000..b37a931b3 --- /dev/null +++ b/libs/libaom/src/test/simd_sse2_test.cc @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \ + (!defined(__GNUC__) && !defined(_DEBUG)) +#define ARCH SSE2 +#define ARCH_POSTFIX(name) name##_sse2 +#define SIMD_NAMESPACE simd_test_sse2 +#include "test/simd_impl.h" +#endif diff --git a/libs/libaom/src/test/simd_sse4_test.cc b/libs/libaom/src/test/simd_sse4_test.cc new file mode 100644 index 000000000..b1c9d5cd8 --- /dev/null +++ b/libs/libaom/src/test/simd_sse4_test.cc @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \ + (!defined(__GNUC__) && !defined(_DEBUG)) +#define ARCH SSE4_1 +#define ARCH_POSTFIX(name) name##_sse4_1 +#define SIMD_NAMESPACE simd_test_sse4_1 +#include "test/simd_impl.h" +#endif diff --git a/libs/libaom/src/test/simd_ssse3_test.cc b/libs/libaom/src/test/simd_ssse3_test.cc new file mode 100644 index 000000000..d95c26fb5 --- /dev/null +++ b/libs/libaom/src/test/simd_ssse3_test.cc @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \ + (!defined(__GNUC__) && !defined(_DEBUG)) +#define ARCH SSSE3 +#define ARCH_POSTFIX(name) name##_ssse3 +#define SIMD_NAMESPACE simd_test_ssse3 +#include "test/simd_impl.h" +#endif diff --git a/libs/libaom/src/test/simple_decoder.sh b/libs/libaom/src/test/simple_decoder.sh new file mode 100644 index 000000000..5f39ad206 --- /dev/null +++ b/libs/libaom/src/test/simple_decoder.sh @@ -0,0 +1,58 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the libaom simple_decoder example code. To add new tests to +## this file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to simple_decoder_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: Make sure input is available: +simple_decoder_verify_environment() { + if [ ! 
"$(av1_encode_available)" = "yes" ] && [ ! -e "${AV1_IVF_FILE}" ]; then + return 1 + fi +} + +# Runs simple_decoder using $1 as input file. $2 is the codec name, and is used +# solely to name the output file. +simple_decoder() { + local decoder="$(aom_tool_path simple_decoder)" + local input_file="$1" + local codec="$2" + local output_file="${AOM_TEST_OUTPUT_DIR}/simple_decoder_${codec}.raw" + + if [ ! -x "${decoder}" ]; then + elog "${decoder} does not exist or is not executable." + return 1 + fi + + eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \ + ${devnull} + + [ -e "${output_file}" ] || return 1 +} + +simple_decoder_av1() { + if [ "$(av1_decode_available)" = "yes" ]; then + if [ ! -e "${AV1_IVF_FILE}" ]; then + local file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf" + encode_yuv_raw_input_av1 "${file}" --ivf + simple_decoder "${file}" av1 || return 1 + else + simple_decoder "${AV1_IVF_FILE}" av1 || return 1 + fi + fi +} + +simple_decoder_tests="simple_decoder_av1" + +run_tests simple_decoder_verify_environment "${simple_decoder_tests}" diff --git a/libs/libaom/src/test/simple_encoder.sh b/libs/libaom/src/test/simple_encoder.sh new file mode 100644 index 000000000..5cd6b46a1 --- /dev/null +++ b/libs/libaom/src/test/simple_encoder.sh @@ -0,0 +1,53 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the libaom simple_encoder example. To add new tests to this +## file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to simple_encoder_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: $YUV_RAW_INPUT is required. +simple_encoder_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH." + return 1 + fi +} + +# Runs simple_encoder using the codec specified by $1 with a frame limit of 100. +simple_encoder() { + local encoder="${LIBAOM_BIN_PATH}/simple_encoder${AOM_TEST_EXE_SUFFIX}" + local codec="$1" + local output_file="${AOM_TEST_OUTPUT_DIR}/simple_encoder_${codec}.ivf" + + if [ ! -x "${encoder}" ]; then + elog "${encoder} does not exist or is not executable." + return 1 + fi + + eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 0 5 \ + ${devnull} + + [ -e "${output_file}" ] || return 1 +} + + +simple_encoder_av1() { + if [ "$(av1_encode_available)" = "yes" ]; then + simple_encoder av1 || return 1 + fi +} + +simple_encoder_tests="simple_encoder_av1" + +run_tests simple_encoder_verify_environment "${simple_encoder_tests}" diff --git a/libs/libaom/src/test/subtract_test.cc b/libs/libaom/src/test/subtract_test.cc new file mode 100644 index 000000000..4001e8b7a --- /dev/null +++ b/libs/libaom/src/test/subtract_test.cc @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdlib.h> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "av1/common/blockd.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride); + +namespace { + +class AV1SubtractBlockTest : public ::testing::TestWithParam<SubtractFunc> { + public: + virtual void TearDown() { libaom_test::ClearSystemState(); } +}; + +using libaom_test::ACMRandom; + +TEST_P(AV1SubtractBlockTest, SimpleSubtract) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + // FIXME(rbultje) split in its own file + for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES; + bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) { + const int block_width = block_size_wide[bsize]; + const int block_height = block_size_high[bsize]; + int16_t *diff = reinterpret_cast<int16_t *>( + aom_memalign(16, sizeof(*diff) * block_width * block_height * 2)); + uint8_t *pred = reinterpret_cast<uint8_t *>( + aom_memalign(16, block_width * block_height * 2)); + uint8_t *src = reinterpret_cast<uint8_t *>( + aom_memalign(16, block_width * block_height * 2)); + + for (int n = 0; n < 100; n++) { + for (int r = 0; r < block_height; ++r) { + for (int c = 0; c < block_width * 2; ++c) { + src[r * block_width * 2 + c] = rnd.Rand8(); + pred[r * block_width * 2 + c] = rnd.Rand8(); + } + } + + GetParam()(block_height, block_width, diff, block_width, src, block_width, + pred, block_width); + + for (int r = 0; r < block_height; ++r) { + for (int c = 0; c < block_width; ++c) { + EXPECT_EQ(diff[r * block_width + c], + (src[r * block_width + c] - pred[r * block_width + c])) + << "r = " << r << ", c = " << c << ", bs = " << bsize; + } + } + + GetParam()(block_height, block_width, diff, block_width * 2, src, + block_width * 2, pred, block_width * 2); + + for (int r = 0; r < block_height; ++r) { + for (int c = 0; c < block_width; ++c) { + EXPECT_EQ( + diff[r * block_width * 2 + c], + (src[r * block_width * 2 + c] - pred[r * block_width * 2 + c])) + << "r = " << r << ", c = " << c << ", bs = " << bsize; + } + } + } + aom_free(diff); + aom_free(pred); + aom_free(src); + } +} + +INSTANTIATE_TEST_SUITE_P(C, AV1SubtractBlockTest, + ::testing::Values(aom_subtract_block_c)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, AV1SubtractBlockTest, + ::testing::Values(aom_subtract_block_sse2)); +#endif +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, AV1SubtractBlockTest, + ::testing::Values(aom_subtract_block_neon)); +#endif +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P(MSA, AV1SubtractBlockTest, + ::testing::Values(aom_subtract_block_msa)); +#endif + +#if CONFIG_AV1_HIGHBITDEPTH +typedef void (*HBDSubtractFunc)(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride, int bd);
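Both subtract typedefs share one contract, which the EXPECT_EQ checks in SimpleSubtract spell out: each output sample is source minus prediction. A minimal scalar sketch of that contract, for orientation only (the name subtract_block_ref is made up; this is not libaom's optimized implementation):

    static void subtract_block_ref(int rows, int cols, int16_t *diff,
                                   ptrdiff_t diff_stride, const uint8_t *src,
                                   ptrdiff_t src_stride, const uint8_t *pred,
                                   ptrdiff_t pred_stride) {
      // diff[r][c] = src[r][c] - pred[r][c]; int16_t holds the full
      // [-255, 255] range of an 8-bit difference.
      for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
          diff[r * diff_stride + c] =
              (int16_t)(src[r * src_stride + c] - pred[r * pred_stride + c]);
    }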
using std::get; +using std::make_tuple; +using std::tuple; + +// <width, height, bit_depth, subtract func> +typedef tuple<int, int, int, HBDSubtractFunc> Params; + +class AV1HBDSubtractBlockTest : public ::testing::TestWithParam<Params> { + public: + virtual void SetUp() { + block_width_ = GET_PARAM(0); + block_height_ = GET_PARAM(1); + bit_depth_ = static_cast<aom_bit_depth_t>(GET_PARAM(2)); + func_ = GET_PARAM(3); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + + const size_t max_width = 128; + const size_t max_block_size = max_width * max_width; + src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>( + aom_memalign(16, max_block_size * sizeof(uint16_t)))); + pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>( + aom_memalign(16, max_block_size * sizeof(uint16_t)))); + diff_ = reinterpret_cast<int16_t *>( + aom_memalign(16, max_block_size * sizeof(int16_t))); + } + + virtual void TearDown() { + aom_free(CONVERT_TO_SHORTPTR(src_)); + aom_free(CONVERT_TO_SHORTPTR(pred_)); + aom_free(diff_); + } + + protected: + void CheckResult(); + void RunForSpeed(); + + private: + ACMRandom rnd_; + int block_height_; + int block_width_; + aom_bit_depth_t bit_depth_; + HBDSubtractFunc func_; + uint8_t *src_; + uint8_t *pred_; + int16_t *diff_; +}; + +void AV1HBDSubtractBlockTest::CheckResult() { + const int test_num = 100; + const size_t max_width = 128; + const int max_block_size = max_width * max_width; + const int mask = (1 << bit_depth_) - 1; + int i, j; + + for (i = 0; i < test_num; ++i) { + for (j = 0; j < max_block_size; ++j) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask; + CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask; + } + + func_(block_height_, block_width_, diff_, block_width_, src_, block_width_, + pred_, block_width_, bit_depth_); + + for (int r = 0; r < block_height_; ++r) { + for (int c = 0; c < block_width_; ++c) { + EXPECT_EQ(diff_[r * block_width_ + c], + (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] - + CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c])) + << "r = " << r << ", c = " << c << ", test: " << i; + } + } + } +} + +TEST_P(AV1HBDSubtractBlockTest, CheckResult) { CheckResult(); } + +void AV1HBDSubtractBlockTest::RunForSpeed() { + const int test_num = 200000; + const size_t max_width = 128; + const int max_block_size = max_width * max_width; + const int mask = (1 << bit_depth_) - 1; + int i, j; + + for (j = 0; j < max_block_size; ++j) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask; + CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask; + } + + for (i = 0; i < test_num; ++i) { + func_(block_height_, block_width_, diff_, block_width_, src_, block_width_, + pred_, block_width_, bit_depth_); + } +} + +TEST_P(AV1HBDSubtractBlockTest, DISABLED_Speed) { RunForSpeed(); }
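The high-bitdepth tests route 16-bit buffers through uint8_t* parameters without casting the data: CONVERT_TO_BYTEPTR and CONVERT_TO_SHORTPTR (from aom_ports/mem.h) encode and decode the pointer value itself, so a tagged pointer round-trips but must never be dereferenced directly. Roughly, as a sketch of the idea rather than a verbatim copy of the macros:

    uint16_t *storage = buf16;                      // real 16-bit samples (buf16 is a placeholder)
    uint8_t *handle = CONVERT_TO_BYTEPTR(storage);  // opaque handle, not dereferenceable
    uint16_t *again = CONVERT_TO_SHORTPTR(handle);  // == storage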
#if HAVE_SSE2 +const Params kAV1HBDSubtractBlock_sse2[] = { + make_tuple(4, 4, 12, &aom_highbd_subtract_block_sse2), + make_tuple(4, 4, 12, &aom_highbd_subtract_block_c), + make_tuple(4, 8, 12, &aom_highbd_subtract_block_sse2), + make_tuple(4, 8, 12, &aom_highbd_subtract_block_c), + make_tuple(8, 4, 12, &aom_highbd_subtract_block_sse2), + make_tuple(8, 4, 12, &aom_highbd_subtract_block_c), + make_tuple(8, 8, 12, &aom_highbd_subtract_block_sse2), + make_tuple(8, 8, 12, &aom_highbd_subtract_block_c), + make_tuple(8, 16, 12, &aom_highbd_subtract_block_sse2), + make_tuple(8, 16, 12, &aom_highbd_subtract_block_c), + make_tuple(16, 8, 12, &aom_highbd_subtract_block_sse2), + make_tuple(16, 8, 12, &aom_highbd_subtract_block_c), + make_tuple(16, 16, 12, &aom_highbd_subtract_block_sse2), + make_tuple(16, 16, 12, &aom_highbd_subtract_block_c), + make_tuple(16, 32, 12, &aom_highbd_subtract_block_sse2), + make_tuple(16, 32, 12, &aom_highbd_subtract_block_c), + make_tuple(32, 16, 12, &aom_highbd_subtract_block_sse2), + make_tuple(32, 16, 12, &aom_highbd_subtract_block_c), + make_tuple(32, 32, 12, &aom_highbd_subtract_block_sse2), + make_tuple(32, 32, 12, &aom_highbd_subtract_block_c), + make_tuple(32, 64, 12, &aom_highbd_subtract_block_sse2), + make_tuple(32, 64, 12, &aom_highbd_subtract_block_c), + make_tuple(64, 32, 12, &aom_highbd_subtract_block_sse2), + make_tuple(64, 32, 12, &aom_highbd_subtract_block_c), + make_tuple(64, 64, 12, &aom_highbd_subtract_block_sse2), + make_tuple(64, 64, 12, &aom_highbd_subtract_block_c), + make_tuple(64, 128, 12, &aom_highbd_subtract_block_sse2), + make_tuple(64, 128, 12, &aom_highbd_subtract_block_c), + make_tuple(128, 64, 12, &aom_highbd_subtract_block_sse2), + make_tuple(128, 64, 12, &aom_highbd_subtract_block_c), + make_tuple(128, 128, 12, &aom_highbd_subtract_block_sse2), + make_tuple(128, 128, 12, &aom_highbd_subtract_block_c) +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, AV1HBDSubtractBlockTest, + ::testing::ValuesIn(kAV1HBDSubtractBlock_sse2)); +#endif // HAVE_SSE2 +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace diff --git a/libs/libaom/src/test/sum_squares_test.cc b/libs/libaom/src/test/sum_squares_test.cc new file mode 100644 index 000000000..8845466b8 --- /dev/null +++ b/libs/libaom/src/test/sum_squares_test.cc @@ -0,0 +1,839 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <cmath> +#include <cstdlib> +#include <string> +#include <tuple> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "test/function_equivalence_test.h" + +using libaom_test::ACMRandom; +using libaom_test::FunctionEquivalenceTest; +using ::testing::Combine; +using ::testing::Range; +using ::testing::Values; +using ::testing::ValuesIn; + +namespace { +const int kNumIterations = 10000; + +static const int16_t kInt13Max = (1 << 12) - 1; + +typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int width, + int height); +typedef libaom_test::FuncParam<SSI16Func> TestFuncs; + +class SumSquaresTest : public ::testing::TestWithParam<TestFuncs> { + public: + virtual ~SumSquaresTest() {} + virtual void SetUp() { + params_ = this->GetParam(); + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast<int16_t *>(aom_memalign(16, 256 * 256 * 2)); + ASSERT_TRUE(src_ != NULL); + } + + virtual void TearDown() { + libaom_test::ClearSystemState(); + aom_free(src_); + } + void RunTest(int isRandom); + void RunSpeedTest(); + + void GenRandomData(int width, int height, int stride) { + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + src_[ii * stride + jj] = rnd_(2) ? rnd_(limit) : -rnd_(limit); + } + } + } + + void GenExtremeData(int width, int height, int stride) { + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + const int val = rnd_(2) ? limit - 1 : -(limit - 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + src_[ii * stride + jj] = val; + } + } + } + + protected: + TestFuncs params_; + int16_t *src_; + ACMRandom rnd_; +};
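The 2-D kernels exercised by this fixture reduce a strided block of int16 residuals to a single sum of squares. A scalar model of the contract, illustrative only (the function name is made up; the signature matches SSI16Func):

    static uint64_t sum_squares_2d_ref(const int16_t *src, int stride,
                                       int width, int height) {
      uint64_t ss = 0;
      for (int r = 0; r < height; ++r)
        for (int c = 0; c < width; ++c) {
          const int64_t v = src[r * stride + c];  // at most 13-bit magnitude
          ss += (uint64_t)(v * v);                // 26-bit terms, safe in 64 bits
        }
      return ss;
    }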
void SumSquaresTest::RunTest(int isRandom) { + int failed = 0; + for (int k = 0; k < kNumIterations; k++) { + const int width = 4 * (rnd_(31) + 1); // Up to 128x128 + const int height = 4 * (rnd_(31) + 1); // Up to 128x128 + int stride = 4 << rnd_(7); // Up to 256 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(7); + } + if (isRandom) { + GenRandomData(width, height, stride); + } else { + GenExtremeData(width, height, stride); + } + const uint64_t res_ref = params_.ref_func(src_, stride, width, height); + uint64_t res_tst; + ASM_REGISTER_STATE_CHECK(res_tst = + params_.tst_func(src_, stride, width, height)); + + if (!failed) { + failed = res_ref != res_tst; + EXPECT_EQ(res_ref, res_tst) + << "Error: Sum Squares Test [" << width << "x" << height + << "] C output does not match optimized output."; + } + } +} + +void SumSquaresTest::RunSpeedTest() { + for (int block = BLOCK_4X4; block < BLOCK_SIZES_ALL; block++) { + const int width = block_size_wide[block]; // Up to 128x128 + const int height = block_size_high[block]; // Up to 128x128 + int stride = 4 << rnd_(7); // Up to 256 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(7); + } + GenExtremeData(width, height, stride); + const int num_loops = 1000000000 / (width + height); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + params_.ref_func(src_, stride, width, height); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("SumSquaresTest C %3dx%-3d: %7.2f ns\n", width, height, + 1000.0 * elapsed_time / num_loops); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + for (int i = 0; i < num_loops; ++i) + params_.tst_func(src_, stride, width, height); + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); + printf("SumSquaresTest Test %3dx%-3d: %7.2f ns\n", width, height, + 1000.0 * elapsed_time1 / num_loops); + } +} + +TEST_P(SumSquaresTest, OperationCheck) { + RunTest(1); // GenRandomData +} + +TEST_P(SumSquaresTest, ExtremeValues) { + RunTest(0); // GenExtremeData +} + +TEST_P(SumSquaresTest, DISABLED_Speed) { RunSpeedTest(); } + +#if HAVE_SSE2 + +INSTANTIATE_TEST_SUITE_P( + SSE2, SumSquaresTest, + ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c, + &aom_sum_squares_2d_i16_sse2))); + +#endif // HAVE_SSE2 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, SumSquaresTest, + ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c, + &aom_sum_squares_2d_i16_avx2))); +#endif // HAVE_AVX2 + +////////////////////////////////////////////////////////////////////////////// +// 1D version +////////////////////////////////////////////////////////////////////////////// + +typedef uint64_t (*F1D)(const int16_t *src, uint32_t N); +typedef libaom_test::FuncParam<F1D> TestFuncs1D; + +class SumSquares1DTest : public FunctionEquivalenceTest<F1D> { + protected: + static const int kIterations = 1000; + static const int kMaxSize = 256; +};
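FunctionEquivalenceTest<F1D> supplies the fixture state used below: params_ pairs a C reference with an optimized candidate, and rng_ drives the inputs. Registering a pair for a new extension takes one instantiation; a sketch with a hypothetical suffix myext (the SSE2 block further down is the real instance of this pattern):

    INSTANTIATE_TEST_SUITE_P(MYEXT, SumSquares1DTest,
                             ::testing::Values(TestFuncs1D(
                                 aom_sum_squares_i16_c,
                                 aom_sum_squares_i16_myext)));  // hypothetical name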
TEST_P(SumSquares1DTest, RandomValues) { + DECLARE_ALIGNED(16, int16_t, src[kMaxSize * kMaxSize]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0; i < kMaxSize * kMaxSize; ++i) + src[i] = rng_(kInt13Max * 2 + 1) - kInt13Max; + + const int N = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize + : rng_(kMaxSize) + 1; + + const uint64_t ref_res = params_.ref_func(src, N); + uint64_t tst_res; + ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N)); + + ASSERT_EQ(ref_res, tst_res); + } +} + +TEST_P(SumSquares1DTest, ExtremeValues) { + DECLARE_ALIGNED(16, int16_t, src[kMaxSize * kMaxSize]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + if (rng_(2)) { + for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = kInt13Max; + } else { + for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = -kInt13Max; + } + + const int N = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize + : rng_(kMaxSize) + 1; + + const uint64_t ref_res = params_.ref_func(src, N); + uint64_t tst_res; + ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N)); + + ASSERT_EQ(ref_res, tst_res); + } +} + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, SumSquares1DTest, + ::testing::Values(TestFuncs1D( + aom_sum_squares_i16_c, aom_sum_squares_i16_sse2))); + +#endif // HAVE_SSE2 + +typedef int64_t (*sse_func)(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height); +typedef libaom_test::FuncParam<sse_func> TestSSEFuncs; + +typedef std::tuple<TestSSEFuncs, int> SSETestParam; + +class SSETest : public ::testing::TestWithParam<SSETestParam> { + public: + virtual ~SSETest() {} + virtual void SetUp() { + params_ = GET_PARAM(0); + width_ = GET_PARAM(1); + isHbd_ = +#if CONFIG_AV1_HIGHBITDEPTH + params_.ref_func == aom_highbd_sse_c; +#else + 0; +#endif + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast<uint8_t *>(aom_memalign(32, 256 * 256 * 2)); + ref_ = reinterpret_cast<uint8_t *>(aom_memalign(32, 256 * 256 * 2)); + ASSERT_TRUE(src_ != NULL); + ASSERT_TRUE(ref_ != NULL); + } + + virtual void TearDown() { + libaom_test::ClearSystemState(); + aom_free(src_); + aom_free(ref_); + } + void RunTest(int isRandom, int width, int height, int run_times); + + void GenRandomData(int width, int height, int stride) { + uint16_t *pSrc = (uint16_t *)src_; + uint16_t *pRef = (uint16_t *)ref_; + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + if (!isHbd_) { + src_[ii * stride + jj] = rnd_.Rand8(); + ref_[ii * stride + jj] = rnd_.Rand8(); + } else { + pSrc[ii * stride + jj] = rnd_(limit); + pRef[ii * stride + jj] = rnd_(limit); + } + } + } + } + + void GenExtremeData(int width, int height, int stride, uint8_t *data, + int16_t val) { + uint16_t *pData = (uint16_t *)data; + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + if (!isHbd_) { + data[ii * stride + jj] = (uint8_t)val; + } else { + pData[ii * stride + jj] = val; + } + } + } + } + + protected: + int isHbd_; + int width_; + TestSSEFuncs params_; + uint8_t *src_; + uint8_t *ref_; + ACMRandom rnd_; +};
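aom_sse and its highbd variant total the squared error between two strided buffers. A minimal scalar model of the contract, illustrative only (the name sse_ref is made up; the signature matches sse_func):

    static int64_t sse_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                           int b_stride, int width, int height) {
      int64_t sse = 0;
      for (int r = 0; r < height; ++r)
        for (int c = 0; c < width; ++c) {
          const int d = a[r * a_stride + c] - b[r * b_stride + c];
          sse += d * d;  // per-pixel squared error
        }
      return sse;
    }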
void SSETest::RunTest(int isRandom, int width, int height, int run_times) { + int failed = 0; + aom_usec_timer ref_timer, test_timer; + for (int k = 0; k < 3; k++) { + int stride = 4 << rnd_(7); // Up to 256 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(7); + } + if (isRandom) { + GenRandomData(width, height, stride); + } else { + const int msb = isHbd_ ? 12 : 8; // Up to 12 bit input + const int limit = (1 << msb) - 1; + if (k == 0) { + GenExtremeData(width, height, stride, src_, 0); + GenExtremeData(width, height, stride, ref_, limit); + } else { + GenExtremeData(width, height, stride, src_, limit); + GenExtremeData(width, height, stride, ref_, 0); + } + } + int64_t res_ref, res_tst; + uint8_t *pSrc = src_; + uint8_t *pRef = ref_; + if (isHbd_) { + pSrc = CONVERT_TO_BYTEPTR(src_); + pRef = CONVERT_TO_BYTEPTR(ref_); + } + res_ref = params_.ref_func(pSrc, stride, pRef, stride, width, height); + res_tst = params_.tst_func(pSrc, stride, pRef, stride, width, height); + if (run_times > 1) { + aom_usec_timer_start(&ref_timer); + for (int j = 0; j < run_times; j++) { + params_.ref_func(pSrc, stride, pRef, stride, width, height); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast<int>(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int j = 0; j < run_times; j++) { + params_.tst_func(pSrc, stride, pRef, stride, width, height); + } + aom_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast<int>(aom_usec_timer_elapsed(&test_timer)); + + printf( + "c_time=%d \t simd_time=%d \t " + "gain=%d\n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / elapsed_time_simd)); + } else { + if (!failed) { + failed = res_ref != res_tst; + EXPECT_EQ(res_ref, res_tst) + << "Error:" << (isHbd_ ? "hbd " : " ") << k << " SSE Test [" + << width << "x" << height + << "] C output does not match optimized output."; + } + } + } +} + +TEST_P(SSETest, OperationCheck) { + for (int height = 4; height <= 128; height += 4) { + RunTest(1, width_, height, 1); // GenRandomData + } +} + +TEST_P(SSETest, ExtremeValues) { + for (int height = 4; height <= 128; height += 4) { + RunTest(0, width_, height, 1); + } +} + +TEST_P(SSETest, DISABLED_Speed) { + for (int height = 4; height <= 128; height += 4) { + RunTest(1, width_, height, 100); + } +} + +#if HAVE_NEON +TestSSEFuncs sse_neon[] = { + TestSSEFuncs(&aom_sse_c, &aom_sse_neon), +#if CONFIG_AV1_HIGHBITDEPTH + TestSSEFuncs(&aom_highbd_sse_c, &aom_highbd_sse_neon) +#endif +}; +INSTANTIATE_TEST_SUITE_P(NEON, SSETest, + Combine(ValuesIn(sse_neon), Range(4, 129, 4))); +#endif // HAVE_NEON + +#if HAVE_SSE4_1 +TestSSEFuncs sse_sse4[] = { + TestSSEFuncs(&aom_sse_c, &aom_sse_sse4_1), +#if CONFIG_AV1_HIGHBITDEPTH + TestSSEFuncs(&aom_highbd_sse_c, &aom_highbd_sse_sse4_1) +#endif +}; +INSTANTIATE_TEST_SUITE_P(SSE4_1, SSETest, + Combine(ValuesIn(sse_sse4), Range(4, 129, 4))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 + +TestSSEFuncs sse_avx2[] = { + TestSSEFuncs(&aom_sse_c, &aom_sse_avx2), +#if CONFIG_AV1_HIGHBITDEPTH + TestSSEFuncs(&aom_highbd_sse_c, &aom_highbd_sse_avx2) +#endif +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SSETest, + Combine(ValuesIn(sse_avx2), Range(4, 129, 4))); +#endif // HAVE_AVX2 + +////////////////////////////////////////////////////////////////////////////// +// get_blk sum squares test functions +////////////////////////////////////////////////////////////////////////////// + +typedef void (*sse_sum_func)(const int16_t *data, int stride, int bw, int bh, + int *x_sum, int64_t *x2_sum); +typedef libaom_test::FuncParam<sse_sum_func> TestSSE_SumFuncs; + +typedef std::tuple<TestSSE_SumFuncs, int> SSE_SumTestParam; + +class SSE_Sum_Test : public ::testing::TestWithParam<SSE_SumTestParam> { + public: + virtual ~SSE_Sum_Test() {} + virtual void SetUp() { + params_ = GET_PARAM(0); + width_ = GET_PARAM(1); + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast<int16_t *>(aom_memalign(32, 256 * 256 * 2)); +
ASSERT_TRUE(src_ != NULL); + } + + virtual void TearDown() { + libaom_test::ClearSystemState(); + aom_free(src_); + } + void RunTest(int isRandom, int width, int height, int run_times); + + void GenRandomData(int width, int height, int stride) { + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + src_[ii * stride + jj] = rnd_(limit); + } + } + } + + void GenExtremeData(int width, int height, int stride, int16_t *data, + int16_t val) { + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + data[ii * stride + jj] = val; + } + } + } + + protected: + int width_; + TestSSE_SumFuncs params_; + int16_t *src_; + ACMRandom rnd_; +}; + +void SSE_Sum_Test::RunTest(int isRandom, int width, int height, int run_times) { + aom_usec_timer ref_timer, test_timer; + for (int k = 0; k < 3; k++) { + int stride = 4 << rnd_(7); // Up to 256 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(7); + } + if (isRandom) { + GenRandomData(width, height, stride); + } else { + const int msb = 12; // Up to 12 bit input + const int limit = (1 << msb) - 1; + if (k == 0) { + GenExtremeData(width, height, stride, src_, limit); + } else { + GenExtremeData(width, height, stride, src_, -limit); + } + } + int sum_c = 0; + int64_t sse_intr = 0; + int sum_intr = 0; + int64_t sse_c = 0; + + params_.ref_func(src_, stride, width, height, &sum_c, &sse_c); + params_.tst_func(src_, stride, width, height, &sum_intr, &sse_intr); + + if (run_times > 1) { + aom_usec_timer_start(&ref_timer); + for (int j = 0; j < run_times; j++) { + params_.ref_func(src_, stride, width, height, &sum_c, &sse_c); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast<int>(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int j = 0; j < run_times; j++) { + params_.tst_func(src_, stride, width, height, &sum_intr, &sse_intr); + } + aom_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast<int>(aom_usec_timer_elapsed(&test_timer)); + + printf( + "c_time=%d \t simd_time=%d \t " + "gain=%f\t width=%d\t height=%d \n", + elapsed_time_c, elapsed_time_simd, + (float)((float)elapsed_time_c / (float)elapsed_time_simd), width, + height); + + } else { + EXPECT_EQ(sum_c, sum_intr) + << "Error:" << k << " SSE Sum Test [" << width << "x" << height + << "] C output does not match optimized output."; + EXPECT_EQ(sse_c, sse_intr) + << "Error:" << k << " SSE Sum Test [" << width << "x" << height + << "] C output does not match optimized output."; + } + } +} + +TEST_P(SSE_Sum_Test, OperationCheck) { + for (int height = 4; height <= 64; height = height * 2) { + RunTest(1, width_, height, 1); // GenRandomData + } +} + +TEST_P(SSE_Sum_Test, ExtremeValues) { + for (int height = 4; height <= 64; height = height * 2) { + RunTest(0, width_, height, 1); + } +} + +TEST_P(SSE_Sum_Test, DISABLED_Speed) { + for (int height = 4; height <= 64; height = height * 2) { + RunTest(1, width_, height, 10000); + } +}
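aom_get_blk_sse_sum produces the block sum and the sum of squares in one pass, which is what the EXPECT_EQ pair above compares across implementations. A scalar model, illustrative only (the name blk_sse_sum_ref is made up; the signature matches sse_sum_func):

    static void blk_sse_sum_ref(const int16_t *data, int stride, int bw, int bh,
                                int *x_sum, int64_t *x2_sum) {
      int sum = 0;
      int64_t sse = 0;
      for (int r = 0; r < bh; ++r)
        for (int c = 0; c < bw; ++c) {
          const int v = data[r * stride + c];
          sum += v;               // first moment
          sse += (int64_t)v * v;  // second moment
        }
      *x_sum = sum;
      *x2_sum = sse;
    }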
65, 4))); +#endif // HAVE_AVX2 + +////////////////////////////////////////////////////////////////////////////// +// 2D Variance test functions +////////////////////////////////////////////////////////////////////////////// + +typedef uint64_t (*Var2DFunc)(uint8_t *src, int stride, int width, int height); +typedef libaom_test::FuncParam TestFuncVar2D; + +const uint16_t test_block_size[2] = { 128, 256 }; + +class Lowbd2dVarTest : public ::testing::TestWithParam { + public: + virtual ~Lowbd2dVarTest() {} + virtual void SetUp() { + params_ = this->GetParam(); + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast( + aom_memalign(16, 512 * 512 * sizeof(uint8_t))); + ASSERT_TRUE(src_ != NULL); + } + + virtual void TearDown() { + libaom_test::ClearSystemState(); + aom_free(src_); + } + void RunTest(int isRandom); + void RunSpeedTest(); + + void GenRandomData(int width, int height, int stride) { + const int msb = 7; // Up to 8 bit input + const int limit = 1 << (msb + 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + src_[ii * stride + jj] = rnd_(limit); + } + } + } + + void GenExtremeData(int width, int height, int stride) { + const int msb = 7; // Up to 8 bit input + const int limit = 1 << (msb + 1); + const int val = rnd_(2) ? limit - 1 : 0; + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + src_[ii * stride + jj] = val; + } + } + } + + protected: + TestFuncVar2D params_; + uint8_t *src_; + ACMRandom rnd_; +}; + +void Lowbd2dVarTest::RunTest(int isRandom) { + int failed = 0; + for (int k = 0; k < kNumIterations; k++) { + const int width = 4 * (rnd_(63) + 1); // Up to 256x256 + const int height = 4 * (rnd_(63) + 1); // Up to 256x256 + int stride = 4 << rnd_(8); // Up to 512 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(8); + } + if (isRandom) { + GenRandomData(width, height, stride); + } else { + GenExtremeData(width, height, stride); + } + + const uint64_t res_ref = params_.ref_func(src_, stride, width, height); + uint64_t res_tst; + ASM_REGISTER_STATE_CHECK(res_tst = + params_.tst_func(src_, stride, width, height)); + + if (!failed) { + failed = res_ref != res_tst; + EXPECT_EQ(res_ref, res_tst) + << "Error: Sum Squares Test [" << width << "x" << height + << "] C output does not match optimized output."; + } + } +} + +void Lowbd2dVarTest::RunSpeedTest() { + for (int block = 0; block < 2; block++) { + const int width = test_block_size[block]; + const int height = test_block_size[block]; + int stride = 4 << rnd_(8); // Up to 512 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(8); + } + GenExtremeData(width, height, stride); + const int num_loops = 1000000000 / (width + height); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + params_.ref_func(src_, stride, width, height); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast(aom_usec_timer_elapsed(&timer)); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + for (int i = 0; i < num_loops; ++i) + params_.tst_func(src_, stride, width, height); + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast(aom_usec_timer_elapsed(&timer1)); + printf("%3dx%-3d: Scaling = %.2f\n", width, height, + (double)elapsed_time / elapsed_time1); + } +} + +TEST_P(Lowbd2dVarTest, OperationCheck) { + RunTest(1); // GenRandomData +} + +TEST_P(Lowbd2dVarTest, ExtremeValues) { + RunTest(0); // GenExtremeData +} + +TEST_P(Lowbd2dVarTest, 
DISABLED_Speed) { RunSpeedTest(); } + +#if HAVE_SSE2 + +INSTANTIATE_TEST_SUITE_P(SSE2, Lowbd2dVarTest, + ::testing::Values(TestFuncVar2D(&aom_var_2d_u8_c, + &aom_var_2d_u8_sse2))); + +#endif // HAVE_SSE2 + +#if HAVE_AVX2 + +INSTANTIATE_TEST_SUITE_P(AVX2, Lowbd2dVarTest, + ::testing::Values(TestFuncVar2D(&aom_var_2d_u8_c, + &aom_var_2d_u8_avx2))); + +#endif // HAVE_SSE2 + +class Highbd2dVarTest : public ::testing::TestWithParam { + public: + virtual ~Highbd2dVarTest() {} + virtual void SetUp() { + params_ = this->GetParam(); + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast( + aom_memalign(16, 512 * 512 * sizeof(uint16_t))); + ASSERT_TRUE(src_ != NULL); + } + + virtual void TearDown() { + libaom_test::ClearSystemState(); + aom_free(src_); + } + void RunTest(int isRandom); + void RunSpeedTest(); + + void GenRandomData(int width, int height, int stride) { + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + src_[ii * stride + jj] = rnd_(limit); + } + } + } + + void GenExtremeData(int width, int height, int stride) { + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + const int val = rnd_(2) ? limit - 1 : 0; + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + src_[ii * stride + jj] = val; + } + } + } + + protected: + TestFuncVar2D params_; + uint16_t *src_; + ACMRandom rnd_; +}; + +void Highbd2dVarTest::RunTest(int isRandom) { + int failed = 0; + for (int k = 0; k < kNumIterations; k++) { + const int width = 4 * (rnd_(63) + 1); // Up to 256x256 + const int height = 4 * (rnd_(63) + 1); // Up to 256x256 + int stride = 4 << rnd_(8); // Up to 512 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(8); + } + if (isRandom) { + GenRandomData(width, height, stride); + } else { + GenExtremeData(width, height, stride); + } + + const uint64_t res_ref = + params_.ref_func(CONVERT_TO_BYTEPTR(src_), stride, width, height); + uint64_t res_tst; + ASM_REGISTER_STATE_CHECK( + res_tst = + params_.tst_func(CONVERT_TO_BYTEPTR(src_), stride, width, height)); + + if (!failed) { + failed = res_ref != res_tst; + EXPECT_EQ(res_ref, res_tst) + << "Error: Sum Squares Test [" << width << "x" << height + << "] C output does not match optimized output."; + } + } +} + +void Highbd2dVarTest::RunSpeedTest() { + for (int block = 0; block < 2; block++) { + const int width = test_block_size[block]; + const int height = test_block_size[block]; + int stride = 4 << rnd_(8); // Up to 512 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(8); + } + GenExtremeData(width, height, stride); + const int num_loops = 1000000000 / (width + height); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + params_.ref_func(CONVERT_TO_BYTEPTR(src_), stride, width, height); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast(aom_usec_timer_elapsed(&timer)); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + for (int i = 0; i < num_loops; ++i) + params_.tst_func(CONVERT_TO_BYTEPTR(src_), stride, width, height); + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast(aom_usec_timer_elapsed(&timer1)); + printf("%3dx%-3d: Scaling = %.2f\n", width, height, + (double)elapsed_time / elapsed_time1); + } +} + +TEST_P(Highbd2dVarTest, OperationCheck) { + RunTest(1); // GenRandomData +} + +TEST_P(Highbd2dVarTest, ExtremeValues) { + 
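  // Constant all-zero / all-maximum blocks stress the worst-case
+  // accumulator magnitudes that uniformly random inputs rarely reach.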
+  RunTest(0);  // GenExtremeData
+}
+
+TEST_P(Highbd2dVarTest, DISABLED_Speed) { RunSpeedTest(); }
+
+#if HAVE_SSE2
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, Highbd2dVarTest,
+    ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c, &aom_var_2d_u16_sse2)));
+
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, Highbd2dVarTest,
+    ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c, &aom_var_2d_u16_avx2)));
+
+#endif  // HAVE_AVX2
+}  // namespace
diff --git a/libs/libaom/src/test/superframe_test.cc b/libs/libaom/src/test/superframe_test.cc
new file mode 100644
index 000000000..024a18b97
--- /dev/null
+++ b/libs/libaom/src/test/superframe_test.cc
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <tuple>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const int kTestMode = 0;
+const int kTileCols = 1;
+const int kTileRows = 2;
+
+typedef std::tuple<libaom_test::TestMode, int, int> SuperframeTestParam;
+
+class SuperframeTest
+    : public ::libaom_test::CodecTestWithParam<SuperframeTestParam>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  SuperframeTest() : EncoderTest(GET_PARAM(0)), last_sf_pts_(0) {}
+  virtual ~SuperframeTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    const SuperframeTestParam input = GET_PARAM(1);
+    const libaom_test::TestMode mode = std::get<kTestMode>(input);
+    SetMode(mode);
+    sf_count_ = 0;
+    sf_count_max_ = INT_MAX;
+    n_tile_cols_ = std::get<kTileCols>(input);
+    n_tile_rows_ = std::get<kTileRows>(input);
+  }
+
+  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                                  libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_CPUUSED, 2);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
+      encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
+    }
+  }
+
+  virtual const aom_codec_cx_pkt_t *MutateEncoderOutputHook(
+      const aom_codec_cx_pkt_t *pkt) {
+    if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) return pkt;
+
+    const uint8_t *buffer = reinterpret_cast<uint8_t *>(pkt->data.frame.buf);
+    const uint8_t marker = buffer[0];
+    const int frames = (marker & 0x7) + 1;
+    const int mag = ((marker >> 3) & 3) + 1;
+    const unsigned int index_sz = 2 + mag * (frames - 1);
+    if ((marker & 0xe0) == 0xc0 && pkt->data.frame.sz >= index_sz &&
+        buffer[index_sz - 1] == marker) {
+      // frame is a superframe. strip off the index.
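+      // Example: marker = 0xc1 (0b11000001) gives frames = 2, mag = 1 and
+      // index_sz = 2 + 1 * (2 - 1) = 3: the marker byte, one 1-byte frame
+      // size, and a trailing copy of the marker byte.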
+      modified_buf_.resize(pkt->data.frame.sz - index_sz);
+      memcpy(&modified_buf_[0], (uint8_t *)pkt->data.frame.buf + index_sz,
+             pkt->data.frame.sz - index_sz);
+      modified_pkt_ = *pkt;
+      modified_pkt_.data.frame.buf = &modified_buf_[0];
+      modified_pkt_.data.frame.sz -= index_sz;
+
+      sf_count_++;
+      last_sf_pts_ = pkt->data.frame.pts;
+      return &modified_pkt_;
+    }
+
+    // Make sure we do a few frames after the last SF
+    abort_ |=
+        sf_count_ > sf_count_max_ && pkt->data.frame.pts - last_sf_pts_ >= 5;
+    return pkt;
+  }
+
+  int sf_count_;
+  int sf_count_max_;
+  aom_codec_cx_pkt_t modified_pkt_;
+  std::vector<uint8_t> modified_buf_;
+  aom_codec_pts_t last_sf_pts_;
+
+ private:
+  int n_tile_cols_;
+  int n_tile_rows_;
+};
+
+TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) {
+  sf_count_max_ = 0;  // early exit on successful test.
+  cfg_.g_lag_in_frames = 25;
+  cfg_.large_scale_tile = 1;
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 40);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // NOTE: The use of BWDREF_FRAME will enable the coding of more non-show
+  // frames besides ALTREF_FRAME.
+  EXPECT_GE(sf_count_, 1);
+}
+
+}  // namespace
diff --git a/libs/libaom/src/test/svc_datarate_test.cc b/libs/libaom/src/test/svc_datarate_test.cc
new file mode 100644
index 000000000..28e517ba1
--- /dev/null
+++ b/libs/libaom/src/test/svc_datarate_test.cc
@@ -0,0 +1,609 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/datarate_test.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "aom/aom_codec.h"
+#include "av1/common/enums.h"
+
+namespace datarate_test {
+namespace {
+
+class DatarateTestSVC
+    : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int,
+                                                 unsigned int, int>,
+      public DatarateTest {
+ public:
+  DatarateTestSVC() : DatarateTest(GET_PARAM(0)) {
+    set_cpu_used_ = GET_PARAM(2);
+    aq_mode_ = GET_PARAM(3);
+  }
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    ResetModel();
+  }
+
+  virtual int GetNumSpatialLayers() { return number_spatial_layers_; }
+
+  virtual void ResetModel() {
+    DatarateTest::ResetModel();
+    layer_frame_cnt_ = 0;
+    superframe_cnt_ = 0;
+    number_temporal_layers_ = 1;
+    number_spatial_layers_ = 1;
+    for (int i = 0; i < AOM_MAX_LAYERS; i++) {
+      target_layer_bitrate_[i] = 0;
+      effective_datarate_tl[i] = 0.0;
+    }
+    memset(&layer_id_, 0, sizeof(aom_svc_layer_id_t));
+    memset(&svc_params_, 0, sizeof(aom_svc_params_t));
+    memset(&ref_frame_config_, 0, sizeof(aom_svc_ref_frame_config_t));
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    int spatial_layer_id = 0;
+    if (video->frame() == 0) {
+      initialize_svc(number_temporal_layers_, number_spatial_layers_,
+                     &svc_params_);
+      encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+      encoder->Control(AV1E_SET_ENABLE_ORDER_HINT, 0);
+      encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0);
+      encoder->Control(AV1E_SET_DELTAQ_MODE, 0);
+    }
+    if (number_spatial_layers_ == 2) {
+      spatial_layer_id = (layer_frame_cnt_ % 2 == 0) ? 0 : 1;
+    } else if (number_spatial_layers_ == 3) {
+      spatial_layer_id = (layer_frame_cnt_ % 3 == 0)
+                             ? 0
+                             : ((layer_frame_cnt_ - 1) % 3 == 0) ? 1 : 2;
+    }
+    // Set the reference/update flags, layer_id, and reference_map
+    // buffer index.
+    frame_flags_ = set_layer_pattern(video->frame(), &layer_id_,
+                                     &ref_frame_config_, spatial_layer_id);
+    encoder->Control(AV1E_SET_SVC_LAYER_ID, &layer_id_);
+    encoder->Control(AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
+    layer_frame_cnt_++;
+    DatarateTest::PreEncodeFrameHook(video, encoder);
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+    // Update the layer cumulative bitrate.
+    for (int i = layer_id_.temporal_layer_id; i < number_temporal_layers_;
+         i++) {
+      int layer = layer_id_.spatial_layer_id * number_temporal_layers_ + i;
+      effective_datarate_tl[layer] += 1.0 * frame_size_in_bits;
+    }
+    if (layer_id_.spatial_layer_id == number_spatial_layers_ - 1) {
+      last_pts_ = pkt->data.frame.pts;
+      superframe_cnt_++;
+    }
+  }
+
+  virtual void EndPassHook(void) {
+    duration_ = ((last_pts_ + 1) * timebase_);
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_;
+         i++) {
+      effective_datarate_tl[i] = (effective_datarate_tl[i] / 1000) / duration_;
+    }
+  }
+
+  // Layer pattern configuration.
+  virtual int set_layer_pattern(int frame_cnt, aom_svc_layer_id_t *layer_id,
+                                aom_svc_ref_frame_config_t *ref_frame_config,
+                                int spatial_layer) {
+    layer_id->spatial_layer_id = spatial_layer;
+    // Set the reference map buffer idx for the 7 references:
+    // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+    // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
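+    // Illustration: ref_idx[0] = 3 points LAST_FRAME at buffer slot 3,
+    // reference[0] = 1 allows prediction from LAST_FRAME, and
+    // refresh[3] = 1 writes the current reconstruction back into slot 3.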
+ for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { + ref_frame_config->ref_idx[i] = i; + ref_frame_config->reference[i] = 0; + } + for (int i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0; + // Set layer_flags to 0 when using ref_frame_config->reference. + int layer_flags = 0; + // Always reference LAST. + ref_frame_config->reference[0] = 1; + if (number_temporal_layers_ == 3 && number_spatial_layers_ == 1) { + // 3-layer: + // 1 3 5 7 + // 2 6 + // 0 4 8 + if (frame_cnt % 4 == 0) { + // Base layer. + layer_id->temporal_layer_id = 0; + // Update LAST on layer 0, reference LAST and GF. + ref_frame_config->refresh[0] = 1; + ref_frame_config->reference[3] = 1; + } else if ((frame_cnt - 1) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // First top layer: no updates, only reference LAST (TL0). + } else if ((frame_cnt - 2) % 4 == 0) { + layer_id->temporal_layer_id = 1; + // Middle layer (TL1): update LAST2, only reference LAST (TL0). + ref_frame_config->refresh[1] = 1; + } else if ((frame_cnt - 3) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // Second top layer: no updates, only reference LAST. + // Set buffer idx for LAST to slot 1, since that was the slot + // updated in previous frame. So LAST is TL1 frame. + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->ref_idx[1] = 0; + } + } else if (number_temporal_layers_ == 1 && number_spatial_layers_ == 2) { + layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST, update LAST. Keep LAST and GOLDEN in slots 0 and 3. + ref_frame_config->ref_idx[0] = 0; + ref_frame_config->ref_idx[3] = 3; + ref_frame_config->refresh[0] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 3 + // and GOLDEN to slot 0. Update slot 3 (LAST). + ref_frame_config->ref_idx[0] = 3; + ref_frame_config->ref_idx[3] = 0; + ref_frame_config->refresh[3] = 1; + } + // Reference GOLDEN. + if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1; + } else if (number_temporal_layers_ == 1 && number_spatial_layers_ == 3) { + // 3 spatial layers, 1 temporal. + // Note for this case , we set the buffer idx for all references to be + // either LAST or GOLDEN, which are always valid references, since decoder + // will check if any of the 7 references is valid scale in + // valid_ref_frame_size(). + layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST, update LAST. Set all other buffer_idx to 0. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->refresh[0] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1 + // and GOLDEN (and all other refs) to slot 0. + // Update slot 1 (LAST). + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->refresh[1] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2 + // and GOLDEN (and all other refs) to slot 1. + // Update slot 2 (LAST). + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 1; + ref_frame_config->ref_idx[0] = 2; + ref_frame_config->refresh[2] = 1; + } + // Reference GOLDEN. + if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1; + } else if (number_temporal_layers_ == 3 && number_spatial_layers_ == 3) { + // 3 spatial and 3 temporal layer. + if (superframe_cnt_ % 4 == 0) { + // Base temporal layer. 
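+        // Across each group of 4 superframes the branches below produce the
+        // temporal_layer_id sequence 0, 2, 1, 2, the same 3-layer pyramid
+        // diagrammed for the 1-spatial-layer case above.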
+ layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST, update LAST. + // Set all buffer_idx to 0. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->refresh[0] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 0. + // Update slot 1 (LAST). + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->refresh[1] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 1. + // Update slot 2 (LAST). + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 1; + ref_frame_config->ref_idx[0] = 2; + ref_frame_config->refresh[2] = 1; + } + } else if ((superframe_cnt_ - 1) % 4 == 0) { + // First top temporal enhancement layer. + layer_id->temporal_layer_id = 2; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST (slot 0). + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to slot 0. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[3] = 3; + ref_frame_config->refresh[3] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 3. + // Set LAST2 to slot 4 and Update slot 4. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 3; + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->ref_idx[1] = 4; + ref_frame_config->refresh[4] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 4. + // No update. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 4; + ref_frame_config->ref_idx[0] = 2; + } + } else if ((superframe_cnt_ - 2) % 4 == 0) { + // Middle temporal enhancement layer. + layer_id->temporal_layer_id = 1; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST. + // Set all buffer_idx to 0. + // Set GOLDEN to slot 5 and update slot 5. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[3] = 5; + ref_frame_config->refresh[5] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 5. + // Set LAST2 to slot 6 and update slot 6. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 5; + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->ref_idx[2] = 6; + ref_frame_config->refresh[6] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 6. + // Set LAST2 to slot 6 and update slot 7. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 6; + ref_frame_config->ref_idx[0] = 2; + ref_frame_config->ref_idx[2] = 7; + ref_frame_config->refresh[7] = 1; + } + } else if ((superframe_cnt_ - 3) % 4 == 0) { + // Second top temporal enhancement layer. + layer_id->temporal_layer_id = 2; + if (layer_id->spatial_layer_id == 0) { + // Set LAST to slot 5 and reference LAST. + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to 0. 
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 5; + ref_frame_config->ref_idx[3] = 3; + ref_frame_config->refresh[3] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6, + // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 6; + ref_frame_config->ref_idx[3] = 3; + ref_frame_config->ref_idx[1] = 4; + ref_frame_config->refresh[4] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7, + // GOLDEN to slot 4. No update. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 7; + ref_frame_config->ref_idx[3] = 4; + } + } + // Reference GOLDEN. + if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1; + } + return layer_flags; + } + + virtual void initialize_svc(int number_temporal_layers, + int number_spatial_layers, + aom_svc_params *svc_params) { + svc_params->number_spatial_layers = number_spatial_layers; + svc_params->number_temporal_layers = number_temporal_layers; + for (int i = 0; i < number_temporal_layers * number_spatial_layers; ++i) { + svc_params->max_quantizers[i] = 60; + svc_params->min_quantizers[i] = 2; + svc_params->layer_target_bitrate[i] = target_layer_bitrate_[i]; + } + // Do at most 3 spatial or temporal layers here. + svc_params->framerate_factor[0] = 1; + if (number_temporal_layers == 2) { + svc_params->framerate_factor[0] = 2; + svc_params->framerate_factor[1] = 1; + } else if (number_temporal_layers == 3) { + svc_params->framerate_factor[0] = 4; + svc_params->framerate_factor[1] = 2; + svc_params->framerate_factor[2] = 1; + } + svc_params->scaling_factor_num[0] = 1; + svc_params->scaling_factor_den[0] = 1; + if (number_spatial_layers == 2) { + svc_params->scaling_factor_num[0] = 1; + svc_params->scaling_factor_den[0] = 2; + svc_params->scaling_factor_num[1] = 1; + svc_params->scaling_factor_den[1] = 1; + } else if (number_spatial_layers == 3) { + svc_params->scaling_factor_num[0] = 1; + svc_params->scaling_factor_den[0] = 4; + svc_params->scaling_factor_num[1] = 1; + svc_params->scaling_factor_den[1] = 2; + svc_params->scaling_factor_num[2] = 1; + svc_params->scaling_factor_den[2] = 1; + } + } + + virtual void BasicRateTargetingSVC3TL1SLTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 300); + const int bitrate_array[2] = { 200, 550 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + number_temporal_layers_ = 3; + target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100; + target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100; + target_layer_bitrate_[2] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30) + << " The datarate for the file is greater than target by too much!"; + } + 
} + + virtual void BasicRateTargetingSVC1TL2SLTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 300); + const int bitrate_array[2] = { 300, 600 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + number_temporal_layers_ = 1; + number_spatial_layers_ = 2; + target_layer_bitrate_[0] = 2 * cfg_.rc_target_bitrate / 4; + target_layer_bitrate_[1] = 2 * cfg_.rc_target_bitrate / 4; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35) + << " The datarate for the file is greater than target by too much!"; + } + } + + virtual void BasicRateTargetingSVC1TL3SLTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 300); + const int bitrate_array[2] = { 500, 1000 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + number_temporal_layers_ = 1; + number_spatial_layers_ = 3; + target_layer_bitrate_[0] = 1 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[1] = 3 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[2] = 4 * cfg_.rc_target_bitrate / 8; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38) + << " The datarate for the file is greater than target by too much!"; + } + } + + virtual void BasicRateTargetingSVC3TL3SLTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 300); + const int bitrate_array[2] = { 600, 1200 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + number_temporal_layers_ = 3; + number_spatial_layers_ = 3; + // SL0 + const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100; + target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100; + target_layer_bitrate_[2] = bitrate_sl0; + // SL1 + const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100; + target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100; + target_layer_bitrate_[5] = bitrate_sl1; + // SL2 + const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100; + target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100; + 
target_layer_bitrate_[8] = bitrate_sl2; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38) + << " The datarate for the file is greater than target by too much!"; + } + } + + virtual void BasicRateTargetingSVC3TL3SLHDTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + + ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + const int bitrate_array[2] = { 600, 1200 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + number_temporal_layers_ = 3; + number_spatial_layers_ = 3; + // SL0 + const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100; + target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100; + target_layer_bitrate_[2] = bitrate_sl0; + // SL1 + const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100; + target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100; + target_layer_bitrate_[5] = bitrate_sl1; + // SL2 + const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100; + target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100; + target_layer_bitrate_[8] = bitrate_sl2; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.4) + << " The datarate for the file is greater than target by too much!"; + } + } + + virtual void BasicRateTargetingSVC3TL3SLKfTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.kf_mode = AOM_KF_AUTO; + cfg_.kf_min_dist = cfg_.kf_max_dist = 100; + + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 300); + const int bitrate_array[2] = { 600, 1200 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + number_temporal_layers_ = 3; + number_spatial_layers_ = 3; + // SL0 + const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100; + target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100; + target_layer_bitrate_[2] = bitrate_sl0; + // SL1 + const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100; + target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100; + target_layer_bitrate_[5] = bitrate_sl1; + // SL2 + const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100; + target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100; + target_layer_bitrate_[8] = bitrate_sl2; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + 
ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.75) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.4) + << " The datarate for the file is greater than target by too much!"; + } + } + + int layer_frame_cnt_; + int superframe_cnt_; + int number_temporal_layers_; + int number_spatial_layers_; + // Allow for up to 3 temporal layers. + int target_layer_bitrate_[AOM_MAX_LAYERS]; + aom_svc_params_t svc_params_; + aom_svc_ref_frame_config_t ref_frame_config_; + aom_svc_layer_id_t layer_id_; + double effective_datarate_tl[AOM_MAX_LAYERS]; +}; + +// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial. +TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SL) { + BasicRateTargetingSVC3TL1SLTest(); +} + +// Check basic rate targeting for CBR, for 2 spatial layers, 1 temporal. +TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL2SL) { + BasicRateTargetingSVC1TL2SLTest(); +} + +// Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal. +TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SL) { + BasicRateTargetingSVC1TL3SLTest(); +} + +// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers. +TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SL) { + BasicRateTargetingSVC3TL3SLTest(); +} + +// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers. +TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHD) { + BasicRateTargetingSVC3TL3SLHDTest(); +} + +// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers, +// for auto key frame mode with short key frame period. +TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLKf) { + BasicRateTargetingSVC3TL3SLKfTest(); +} + +AV1_INSTANTIATE_TEST_CASE(DatarateTestSVC, + ::testing::Values(::libaom_test::kRealTime), + ::testing::Range(7, 9), + ::testing::Range(0, 4), + ::testing::Values(0, 1)); + +} // namespace +} // namespace datarate_test diff --git a/libs/libaom/src/test/temporal_filter_planewise_test.cc b/libs/libaom/src/test/temporal_filter_planewise_test.cc new file mode 100644 index 000000000..c3f3e9e05 --- /dev/null +++ b/libs/libaom/src/test/temporal_filter_planewise_test.cc @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/function_equivalence_test.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+#if !CONFIG_REALTIME_ONLY
+namespace {
+
+typedef void (*TemporalFilterPlanewiseFunc)(
+    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_level, const int use_subblock,
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count);
+typedef libaom_test::FuncParam<TemporalFilterPlanewiseFunc>
+    TemporalFilterPlanewiseFuncParam;
+
+typedef std::tuple<TemporalFilterPlanewiseFuncParam, int>
+    TemporalFilterPlanewiseWithParam;
+
+class TemporalFilterPlanewiseTest
+    : public ::testing::TestWithParam<TemporalFilterPlanewiseWithParam> {
+ public:
+  virtual ~TemporalFilterPlanewiseTest() {}
+  virtual void SetUp() {
+    params_ = GET_PARAM(0);
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src1_ = reinterpret_cast<uint8_t *>(aom_memalign(8, 256 * 256));
+    src2_ = reinterpret_cast<uint8_t *>(aom_memalign(8, 256 * 256));
+
+    ASSERT_TRUE(src1_ != NULL);
+    ASSERT_TRUE(src2_ != NULL);
+  }
+
+  virtual void TearDown() {
+    libaom_test::ClearSystemState();
+    aom_free(src1_);
+    aom_free(src2_);
+  }
+  void RunTest(int isRandom, int width, int height, int run_times);
+
+  void GenRandomData(int width, int height, int stride, int stride2) {
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        src1_[ii * stride + jj] = rnd_.Rand8();
+        src2_[ii * stride2 + jj] = rnd_.Rand8();
+      }
+    }
+  }
+
+  void GenExtremeData(int width, int height, int stride, uint8_t *data,
+                      int stride2, uint8_t *data2, uint8_t val) {
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        data[ii * stride + jj] = val;
+        data2[ii * stride2 + jj] = (255 - val);
+      }
+    }
+  }
+
+ protected:
+  TemporalFilterPlanewiseFuncParam params_;
+  uint8_t *src1_;
+  uint8_t *src2_;
+  ACMRandom rnd_;
+};
+
+void TemporalFilterPlanewiseTest::RunTest(int isRandom, int width, int height,
+                                          int run_times) {
+  aom_usec_timer ref_timer, test_timer;
+  for (int k = 0; k < 3; k++) {
+    const int stride = width;
+    const int stride2 = width;
+    if (isRandom) {
+      GenRandomData(width, height, stride, stride2);
+    } else {
+      const int msb = 8;  // Up to 8 bit input
+      const int limit = (1 << msb) - 1;
+      if (k == 0) {
+        GenExtremeData(width, height, stride, src1_, stride2, src2_, limit);
+      } else {
+        GenExtremeData(width, height, stride, src1_, stride2, src2_, 0);
+      }
+    }
+    double sigma[1] = { 2.1002103677063437 };
+    DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]);
+    memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0]));
+    memset(count_ref, 0, 1024 * 3 * sizeof(count_ref[0]));
+    DECLARE_ALIGNED(16, unsigned int, accumulator_mod[1024 * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_mod[1024 * 3]);
+    memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
+    memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
+
+    assert(width == 32 && height == 32);
+    const BLOCK_SIZE block_size =
BLOCK_32X32; + const int use_subblock = 0; + const int block_mse = 20; + const int subblock_mses[4] = { 15, 16, 17, 18 }; + const int q_factor = 12; + const int mb_row = 0; + const int mb_col = 0; + const int num_planes = 1; + YV12_BUFFER_CONFIG *ref_frame = + (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG)); + ref_frame->heights[0] = height; + ref_frame->strides[0] = stride; + DECLARE_ALIGNED(16, uint8_t, src[1024 * 3]); + ref_frame->buffer_alloc = src; + ref_frame->buffers[0] = ref_frame->buffer_alloc; + ref_frame->flags = 0; // Only support low bit-depth test. + memcpy(src, src1_, 1024 * 3 * sizeof(uint8_t)); + + MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD)); + mbd->plane[0].subsampling_y = 0; + mbd->plane[0].subsampling_x = 0; + mbd->bd = 8; + + params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes, + sigma, use_subblock, block_mse, subblock_mses, q_factor, + src2_, accumulator_ref, count_ref); + params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes, + sigma, use_subblock, block_mse, subblock_mses, q_factor, + src2_, accumulator_mod, count_mod); + + if (run_times > 1) { + aom_usec_timer_start(&ref_timer); + for (int j = 0; j < run_times; j++) { + params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes, + sigma, use_subblock, block_mse, subblock_mses, + q_factor, src2_, accumulator_ref, count_ref); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int j = 0; j < run_times; j++) { + params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes, + sigma, use_subblock, block_mse, subblock_mses, + q_factor, src2_, accumulator_mod, count_mod); + } + aom_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast(aom_usec_timer_elapsed(&test_timer)); + + printf( + "c_time=%d \t simd_time=%d \t " + "gain=%f\t width=%d\t height=%d \n", + elapsed_time_c, elapsed_time_simd, + (float)((float)elapsed_time_c / (float)elapsed_time_simd), width, + height); + + } else { + for (int i = 0, l = 0; i < height; i++) { + for (int j = 0; j < width; j++, l++) { + EXPECT_EQ(accumulator_ref[l], accumulator_mod[l]) + << "Error:" << k << " SSE Sum Test [" << width << "x" << height + << "] C accumulator does not match optimized accumulator."; + EXPECT_EQ(count_ref[l], count_mod[l]) + << "Error:" << k << " SSE Sum Test [" << width << "x" << height + << "] C count does not match optimized count."; + } + } + } + + free(ref_frame); + free(mbd); + } +} + +TEST_P(TemporalFilterPlanewiseTest, OperationCheck) { + for (int height = 32; height <= 32; height = height * 2) { + RunTest(1, height, height, 1); // GenRandomData + } +} + +TEST_P(TemporalFilterPlanewiseTest, ExtremeValues) { + for (int height = 32; height <= 32; height = height * 2) { + RunTest(0, height, height, 1); + } +} + +TEST_P(TemporalFilterPlanewiseTest, DISABLED_Speed) { + for (int height = 32; height <= 32; height = height * 2) { + RunTest(1, height, height, 100000); + } +} + +#if HAVE_AVX2 +TemporalFilterPlanewiseFuncParam temporal_filter_planewise_test_avx2[] = { + TemporalFilterPlanewiseFuncParam(&av1_apply_temporal_filter_planewise_c, + &av1_apply_temporal_filter_planewise_avx2) +}; +INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterPlanewiseTest, + Combine(ValuesIn(temporal_filter_planewise_test_avx2), + Range(64, 65, 4))); +#endif // HAVE_AVX2 + +#if HAVE_SSE2 +TemporalFilterPlanewiseFuncParam temporal_filter_planewise_test_sse2[] = { + 
    TemporalFilterPlanewiseFuncParam(&av1_apply_temporal_filter_planewise_c,
+                                   &av1_apply_temporal_filter_planewise_sse2)
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterPlanewiseTest,
+                         Combine(ValuesIn(temporal_filter_planewise_test_sse2),
+                                 Range(64, 65, 4)));
+#endif  // HAVE_SSE2
+
+}  // namespace
+#endif
diff --git a/libs/libaom/src/test/temporal_filter_yuv_test.cc b/libs/libaom/src/test/temporal_filter_yuv_test.cc
new file mode 100644
index 000000000..dc17aaaf7
--- /dev/null
+++ b/libs/libaom/src/test/temporal_filter_yuv_test.cc
@@ -0,0 +1,841 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <ostream>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+
+namespace {
+
+using ::libaom_test::ACMRandom;
+
+const int MAX_WIDTH = 32;
+const int MAX_HEIGHT = 32;
+
+typedef void (*TemporalFilterYUVFunc)(
+    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const int strength, const int use_subblock,
+    const int *blk_fw, const uint8_t *pred, uint32_t *accum, uint16_t *count);
+
+struct TemporalFilterWithBd {
+  TemporalFilterWithBd(TemporalFilterYUVFunc func, int bitdepth)
+      : temporal_filter(func), bd(bitdepth) {}
+
+  TemporalFilterYUVFunc temporal_filter;
+  int bd;
+};
+
+std::ostream &operator<<(std::ostream &os, const TemporalFilterWithBd &tf) {
+  return os << "Bitdepth: " << tf.bd;
+}
+
+int GetFilterWeight(unsigned int row, unsigned int col,
+                    unsigned int block_height, unsigned int block_width,
+                    const int *const blk_fw, int use_32x32) {
+  if (use_32x32) {
+    return blk_fw[0];
+  }
+
+  return blk_fw[2 * (row >= block_height / 2) + (col >= block_width / 2)];
+}
+
+template <class PixelType>
+int GetModIndex(int sum_dist, int index, int rounding, int strength,
+                int filter_weight) {
+  int mod = sum_dist * 3 / index;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = AOMMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+// Lowbitdepth version
+template <>
+int GetModIndex<uint8_t>(int sum_dist, int index, int rounding, int strength,
+                         int filter_weight) {
+  unsigned int index_mult[14] = { 0,     0,     0,     0,     49152,
+                                  39322, 32768, 28087, 24576, 21846,
+                                  19661, 17874, 0,     15124 };
+
+  assert(index >= 0 && index <= 13);
+  assert(index_mult[index] != 0);
+
+  int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = AOMMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+// Highbitdepth version
+template <>
+int GetModIndex<uint16_t>(int sum_dist, int index, int rounding, int strength,
+                          int filter_weight) {
+  int64_t index_mult[14] = { 0U,          0U,          0U,          0U,
+                             3221225472U, 2576980378U, 2147483648U,
+                             1840700270U, 1610612736U, 1431655766U,
+                             1288490189U, 1171354718U, 0U,          991146300U };
+
+  assert(index >= 0 && index <= 13);
+  assert(index_mult[index] != 0);
+
+  int mod = static_cast<int>((sum_dist * index_mult[index]) >> 32);
+  mod += rounding;
+  mod >>= strength;
+
+  mod = AOMMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+template <class PixelType>
+void SetArray(PixelType *pixel_array, int width, int height, int stride,
+              int val) {
+  for (int row = 0; row < height; row++) {
+    for (int col = 0; col < width; col++) {
+      pixel_array[col] = val;
+    }
+    pixel_array += stride;
+  }
+}
+
+template <class PixelType>
+void SetArray(PixelType *pixel_array, int width, int height, int stride,
+              ACMRandom *rnd, int low_val, int high_val) {
+  EXPECT_LE(low_val, high_val);
+
+  for (int row = 0; row < height; row++) {
+    for (int col = 0; col < width; col++) {
+      const int val =
+          static_cast<int>((*rnd).PseudoUniform(high_val - low_val));
+      pixel_array[col] = low_val + val;
+    }
+    pixel_array += stride;
+  }
+}
+
+template <typename ValueType>
+bool CheckArrayEqual(const ValueType *arr_1, const ValueType *arr_2, int width,
+                     int height, int stride_1, int stride_2) {
+  for (int row = 0; row < height; row++) {
+    for (int col = 0; col < width; col++) {
+      if (arr_1[col] != arr_2[col]) {
+        return false;
+      }
+    }
+    arr_1 += stride_1;
+    arr_2 += stride_2;
+  }
+  return true;
+}
+
+template <typename ValueType>
+void PrintArrayDiff(const ValueType *arr_1, const ValueType *arr_2, int width,
+                    int height, int stride_1, int stride_2) {
+  const ValueType *arr_1_start = arr_1, *arr_2_start = arr_2;
+
+  printf("Array 1:\n");
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      if (arr_1[col] != arr_2[col]) {
+        printf("*%3d", arr_1[col]);
+      } else {
+        printf("%4d", arr_1[col]);
+      }
+    }
+    printf("\n");
+    arr_1 += stride_1;
+    arr_2 += stride_2;
+  }
+
+  arr_1 = arr_1_start;
+  arr_2 = arr_2_start;
+
+  printf("Array 2:\n");
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      if (arr_1[col] != arr_2[col]) {
+        printf("*%3d", arr_2[col]);
+      } else {
+        printf("%4d", arr_2[col]);
+      }
+    }
+    printf("\n");
+    arr_1 += stride_1;
+    arr_2 += stride_2;
+  }
+
+  arr_1 = arr_1_start;
+  arr_2 = arr_2_start;
+  printf("Difference:\n");
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      printf("%4d", arr_1[col] - arr_2[col]);
+    }
+    printf("\n");
+    arr_1 += stride_1;
+    arr_2 += stride_2;
+  }
+}
+
+template <class PixelType>
+void ApplyReferenceFilter(const PixelType *y_src, const PixelType *y_pre,
+                          const PixelType *u_src, const PixelType *v_src,
+                          const PixelType *u_pre, const PixelType *v_pre,
+                          unsigned int block_width, unsigned int block_height,
+                          int ss_x, int ss_y, int strength,
+                          const int *const blk_fw, int use_32x32,
+                          uint32_t *y_accum, uint16_t *y_count,
+                          uint32_t *u_accum, uint16_t *u_count,
+                          uint32_t *v_accum, uint16_t *v_count) {
+  const int uv_block_width = block_width >> ss_x,
+            uv_block_height = block_height >> ss_y;
+  const int y_src_stride = block_width, y_pre_stride = block_width;
+  const int uv_src_stride = uv_block_width, uv_pre_stride = uv_block_width;
+  const int y_diff_stride = block_width, uv_diff_stride = uv_block_width;
+  const int y_count_stride = block_width, u_count_stride = uv_block_width,
+            v_count_stride = uv_block_width;
+  const int y_accum_stride = block_width, u_accum_stride = uv_block_width,
+            v_accum_stride = uv_block_width;
+
+  int y_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+  int u_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+  int v_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+
+  const int rounding = (1 << strength) >> 1;
+
+  // Get the square diffs
+  for (int row = 0; row < (int)block_height; row++) {
+    for (int col = 0; col < (int)block_width; col++) {
+
const int diff = + y_src[row * y_src_stride + col] - y_pre[row * y_pre_stride + col]; + y_dif[row * y_diff_stride + col] = diff * diff; + } + } + + for (int row = 0; row < (int)uv_block_height; row++) { + for (int col = 0; col < (int)uv_block_width; col++) { + const int u_diff = + u_src[row * uv_src_stride + col] - u_pre[row * uv_pre_stride + col]; + const int v_diff = + v_src[row * uv_src_stride + col] - v_pre[row * uv_pre_stride + col]; + u_dif[row * uv_diff_stride + col] = u_diff * u_diff; + v_dif[row * uv_diff_stride + col] = v_diff * v_diff; + } + } + + // Apply the filter to luma + for (int row = 0; row < (int)block_height; row++) { + for (int col = 0; col < (int)block_width; col++) { + const int uv_row = row >> ss_y; + const int uv_col = col >> ss_x; + const int filter_weight = GetFilterWeight(row, col, block_height, + block_width, blk_fw, use_32x32); + + // First we get the modifier for the current y pixel + const int y_pixel = y_pre[row * y_pre_stride + col]; + int y_num_used = 0; + int y_mod = 0; + + // Sum the neighboring 3x3 y pixels + for (int row_step = -1; row_step <= 1; row_step++) { + for (int col_step = -1; col_step <= 1; col_step++) { + const int sub_row = row + row_step; + const int sub_col = col + col_step; + + if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 && + sub_col < (int)block_width) { + y_mod += y_dif[sub_row * y_diff_stride + sub_col]; + y_num_used++; + } + } + } + + // Sum the corresponding uv pixels to the current y modifier + // Note we are rounding down instead of rounding to the nearest pixel. + y_mod += u_dif[uv_row * uv_diff_stride + uv_col]; + y_mod += v_dif[uv_row * uv_diff_stride + uv_col]; + + y_num_used += 2; + + // Set the modifier + y_mod = GetModIndex(y_mod, y_num_used, rounding, strength, + filter_weight); + + // Accumulate the result + y_count[row * y_count_stride + col] += y_mod; + y_accum[row * y_accum_stride + col] += y_mod * y_pixel; + } + } + + // Apply the filter to chroma + for (int uv_row = 0; uv_row < (int)uv_block_height; uv_row++) { + for (int uv_col = 0; uv_col < (int)uv_block_width; uv_col++) { + const int y_row = uv_row << ss_y; + const int y_col = uv_col << ss_x; + const int filter_weight = GetFilterWeight( + uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32); + + const int u_pixel = u_pre[uv_row * uv_pre_stride + uv_col]; + const int v_pixel = v_pre[uv_row * uv_pre_stride + uv_col]; + + int uv_num_used = 0; + int u_mod = 0, v_mod = 0; + + // Sum the neighboring 3x3 chromal pixels to the chroma modifier + for (int row_step = -1; row_step <= 1; row_step++) { + for (int col_step = -1; col_step <= 1; col_step++) { + const int sub_row = uv_row + row_step; + const int sub_col = uv_col + col_step; + + if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 && + sub_col < uv_block_width) { + u_mod += u_dif[sub_row * uv_diff_stride + sub_col]; + v_mod += v_dif[sub_row * uv_diff_stride + sub_col]; + uv_num_used++; + } + } + } + + // Sum all the luma pixels associated with the current luma pixel + for (int row_step = 0; row_step < 1 + ss_y; row_step++) { + for (int col_step = 0; col_step < 1 + ss_x; col_step++) { + const int sub_row = y_row + row_step; + const int sub_col = y_col + col_step; + const int y_diff = y_dif[sub_row * y_diff_stride + sub_col]; + + u_mod += y_diff; + v_mod += y_diff; + uv_num_used++; + } + } + + // Set the modifier + u_mod = GetModIndex(u_mod, uv_num_used, rounding, strength, + filter_weight); + v_mod = GetModIndex(v_mod, uv_num_used, rounding, strength, + filter_weight); + 
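      // Each mod value is a weight in [0, 16 * filter_weight]; the filter
+      // output is later normalized as roughly accum / count, so larger mod
+      // values pull the result toward the predicted pixel.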
+ // Accumulate the result + u_count[uv_row * u_count_stride + uv_col] += u_mod; + u_accum[uv_row * u_accum_stride + uv_col] += u_mod * u_pixel; + v_count[uv_row * v_count_stride + uv_col] += v_mod; + v_accum[uv_row * v_accum_stride + uv_col] += v_mod * v_pixel; + } + } +} + +class TemporalFilterYUVTest + : public ::testing::TestWithParam { + public: + virtual void SetUp() { + filter_func_ = GetParam().temporal_filter; + bd_ = GetParam().bd; + use_highbd_ = (bd_ != 8); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + saturate_test_ = 0; + num_repeats_ = 10; + + ASSERT_TRUE(bd_ == 8 || bd_ == 10 || bd_ == 12); + } + + protected: + template + void CompareTestWithParam(int width, int height, int ss_x, int ss_y, + int filter_strength, int use_32x32, + const int *filter_weight); + template + void RunTestFilterWithParam(int width, int height, int ss_x, int ss_y, + int filter_strength, int use_32x32, + const int *filter_weight); + template + void ApplyTestFilter(const PixelType *y_src, int y_src_stride, + const PixelType *y_pre, int y_pre_stride, + const PixelType *u_src, const PixelType *v_src, + int uv_src_stride, const PixelType *u_pre, + const PixelType *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, + int use_32x32, uint32_t *y_accum, uint16_t *y_count, + uint32_t *u_accumu, uint16_t *u_count, uint32_t *v_accum, + uint16_t *v_count); + + TemporalFilterYUVFunc filter_func_; + ACMRandom rnd_; + int saturate_test_; + int num_repeats_; + int use_highbd_; + int bd_; +}; + +template <> +void TemporalFilterYUVTest::ApplyTestFilter( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32, + uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count) { + (void)block_width; + (void)block_height; + (void)y_src_stride; + (void)uv_src_stride; + + assert(block_width == MAX_WIDTH && MAX_WIDTH == 32); + assert(block_height == MAX_HEIGHT && MAX_HEIGHT == 32); + const BLOCK_SIZE block_size = BLOCK_32X32; + const int num_planes = 3; + const int mb_pels = MAX_WIDTH * MAX_HEIGHT; + const int mb_row = 0; + const int mb_col = 0; + const int use_subblock = !(use_32x32); + + YV12_BUFFER_CONFIG *ref_frame = + (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG)); + ref_frame->strides[0] = y_pre_stride; + ref_frame->strides[1] = uv_pre_stride; + const int alloc_size = MAX_MB_PLANE * mb_pels; + DECLARE_ALIGNED(16, uint8_t, src[alloc_size]); + ref_frame->buffer_alloc = src; + ref_frame->buffers[0] = ref_frame->buffer_alloc + 0 * mb_pels; + ref_frame->buffers[1] = ref_frame->buffer_alloc + 1 * mb_pels; + ref_frame->buffers[2] = ref_frame->buffer_alloc + 2 * mb_pels; + ref_frame->flags = bd_ > 8 ? 
YV12_FLAG_HIGHBITDEPTH : 0; + + MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD)); + mbd->plane[0].subsampling_y = 0; + mbd->plane[0].subsampling_x = 0; + mbd->plane[1].subsampling_y = ss_y; + mbd->plane[1].subsampling_x = ss_x; + mbd->plane[2].subsampling_y = ss_y; + mbd->plane[2].subsampling_x = ss_x; + + DECLARE_ALIGNED(16, uint8_t, pred[alloc_size]); + DECLARE_ALIGNED(16, uint32_t, accum[alloc_size]); + DECLARE_ALIGNED(16, uint16_t, count[alloc_size]); + memcpy(src + 0 * mb_pels, y_src, mb_pels * sizeof(uint8_t)); + memcpy(src + 1 * mb_pels, u_src, mb_pels * sizeof(uint8_t)); + memcpy(src + 2 * mb_pels, v_src, mb_pels * sizeof(uint8_t)); + memcpy(pred + 0 * mb_pels, y_pre, mb_pels * sizeof(uint8_t)); + memcpy(pred + 1 * mb_pels, u_pre, mb_pels * sizeof(uint8_t)); + memcpy(pred + 2 * mb_pels, v_pre, mb_pels * sizeof(uint8_t)); + memcpy(accum + 0 * mb_pels, y_accum, mb_pels * sizeof(uint32_t)); + memcpy(accum + 1 * mb_pels, u_accum, mb_pels * sizeof(uint32_t)); + memcpy(accum + 2 * mb_pels, v_accum, mb_pels * sizeof(uint32_t)); + memcpy(count + 0 * mb_pels, y_count, mb_pels * sizeof(uint16_t)); + memcpy(count + 1 * mb_pels, u_count, mb_pels * sizeof(uint16_t)); + memcpy(count + 2 * mb_pels, v_count, mb_pels * sizeof(uint16_t)); + + ASM_REGISTER_STATE_CHECK( + filter_func_(ref_frame, mbd, block_size, mb_row, mb_col, num_planes, + strength, use_subblock, blk_fw, pred, accum, count)); + + memcpy(y_accum, accum + 0 * mb_pels, mb_pels * sizeof(uint32_t)); + memcpy(u_accum, accum + 1 * mb_pels, mb_pels * sizeof(uint32_t)); + memcpy(v_accum, accum + 2 * mb_pels, mb_pels * sizeof(uint32_t)); + memcpy(y_count, count + 0 * mb_pels, mb_pels * sizeof(uint16_t)); + memcpy(u_count, count + 1 * mb_pels, mb_pels * sizeof(uint16_t)); + memcpy(v_count, count + 2 * mb_pels, mb_pels * sizeof(uint16_t)); + + free(ref_frame); + free(mbd); +} + +template <> +void TemporalFilterYUVTest::ApplyTestFilter( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32, + uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count) { + (void)block_width; + (void)block_height; + (void)y_src_stride; + (void)uv_src_stride; + + assert(block_width == MAX_WIDTH && MAX_WIDTH == 32); + assert(block_height == MAX_HEIGHT && MAX_HEIGHT == 32); + const BLOCK_SIZE block_size = BLOCK_32X32; + const int num_planes = 3; + const int mb_pels = MAX_WIDTH * MAX_HEIGHT; + const int mb_row = 0; + const int mb_col = 0; + const int use_subblock = !(use_32x32); + + YV12_BUFFER_CONFIG *ref_frame = + (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG)); + ref_frame->strides[0] = y_pre_stride; + ref_frame->strides[1] = uv_pre_stride; + const int alloc_size = MAX_MB_PLANE * mb_pels; + DECLARE_ALIGNED(16, uint16_t, src16[alloc_size]); + ref_frame->buffer_alloc = CONVERT_TO_BYTEPTR(src16); + ref_frame->buffers[0] = ref_frame->buffer_alloc + 0 * mb_pels; + ref_frame->buffers[1] = ref_frame->buffer_alloc + 1 * mb_pels; + ref_frame->buffers[2] = ref_frame->buffer_alloc + 2 * mb_pels; + ref_frame->flags = bd_ > 8 ? 
ref_frame->flags = bd_ > 8 ? YV12_FLAG_HIGHBITDEPTH : 0; + + MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD)); + mbd->plane[0].subsampling_y = 0; + mbd->plane[0].subsampling_x = 0; + mbd->plane[1].subsampling_y = ss_y; + mbd->plane[1].subsampling_x = ss_x; + mbd->plane[2].subsampling_y = ss_y; + mbd->plane[2].subsampling_x = ss_x; + + DECLARE_ALIGNED(16, uint16_t, pred16[alloc_size]); + DECLARE_ALIGNED(16, uint32_t, accum[alloc_size]); + DECLARE_ALIGNED(16, uint16_t, count[alloc_size]); + memcpy(src16 + 0 * mb_pels, y_src, mb_pels * sizeof(uint16_t)); + memcpy(src16 + 1 * mb_pels, u_src, mb_pels * sizeof(uint16_t)); + memcpy(src16 + 2 * mb_pels, v_src, mb_pels * sizeof(uint16_t)); + memcpy(pred16 + 0 * mb_pels, y_pre, mb_pels * sizeof(uint16_t)); + memcpy(pred16 + 1 * mb_pels, u_pre, mb_pels * sizeof(uint16_t)); + memcpy(pred16 + 2 * mb_pels, v_pre, mb_pels * sizeof(uint16_t)); + memcpy(accum + 0 * mb_pels, y_accum, mb_pels * sizeof(uint32_t)); + memcpy(accum + 1 * mb_pels, u_accum, mb_pels * sizeof(uint32_t)); + memcpy(accum + 2 * mb_pels, v_accum, mb_pels * sizeof(uint32_t)); + memcpy(count + 0 * mb_pels, y_count, mb_pels * sizeof(uint16_t)); + memcpy(count + 1 * mb_pels, u_count, mb_pels * sizeof(uint16_t)); + memcpy(count + 2 * mb_pels, v_count, mb_pels * sizeof(uint16_t)); + const uint8_t *pred = CONVERT_TO_BYTEPTR(pred16); + + ASM_REGISTER_STATE_CHECK( + filter_func_(ref_frame, mbd, block_size, mb_row, mb_col, num_planes, + strength, use_subblock, blk_fw, pred, accum, count)); + + memcpy(y_accum, accum + 0 * mb_pels, mb_pels * sizeof(uint32_t)); + memcpy(u_accum, accum + 1 * mb_pels, mb_pels * sizeof(uint32_t)); + memcpy(v_accum, accum + 2 * mb_pels, mb_pels * sizeof(uint32_t)); + memcpy(y_count, count + 0 * mb_pels, mb_pels * sizeof(uint16_t)); + memcpy(u_count, count + 1 * mb_pels, mb_pels * sizeof(uint16_t)); + memcpy(v_count, count + 2 * mb_pels, mb_pels * sizeof(uint16_t)); + + free(ref_frame); + free(mbd); +} + +template <typename PixelType> +void TemporalFilterYUVTest::CompareTestWithParam(int width, int height, + int ss_x, int ss_y, + int filter_strength, + int use_32x32, + const int *filter_weight) { + const int uv_width = width >> ss_x, uv_height = height >> ss_y; + const int y_stride = width, uv_stride = uv_width; + + DECLARE_ALIGNED(16, PixelType, y_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, PixelType, y_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, y_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, y_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, y_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, y_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + + DECLARE_ALIGNED(16, PixelType, u_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, PixelType, u_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + + DECLARE_ALIGNED(16, PixelType, v_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, PixelType, v_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
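+ // The *_ref planes collect the scalar reference filter's output and the + // *_tst planes the output of the function under test; every iteration + // below compares them element by element.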
+ + for (int repeats = 0; repeats < num_repeats_; repeats++) { + if (saturate_test_) { + const int max_val = (1 << bd_) - 1; + SetArray(y_src, width, height, y_stride, max_val); + SetArray(y_pre, width, height, y_stride, 0); + SetArray(u_src, uv_width, uv_height, uv_stride, max_val); + SetArray(u_pre, uv_width, uv_height, uv_stride, 0); + SetArray(v_src, uv_width, uv_height, uv_stride, max_val); + SetArray(v_pre, uv_width, uv_height, uv_stride, 0); + } else { + const int max_val = 7 << (bd_ - 8); + SetArray(y_src, width, height, y_stride, &rnd_, 0, max_val); + SetArray(y_pre, width, height, y_stride, &rnd_, 0, max_val); + SetArray(u_src, uv_width, uv_height, uv_stride, &rnd_, 0, max_val); + SetArray(u_pre, uv_width, uv_height, uv_stride, &rnd_, 0, max_val); + SetArray(v_src, uv_width, uv_height, uv_stride, &rnd_, 0, max_val); + SetArray(v_pre, uv_width, uv_height, uv_stride, &rnd_, 0, max_val); + } + + ApplyReferenceFilter( + y_src, y_pre, u_src, v_src, u_pre, v_pre, width, height, ss_x, ss_y, + filter_strength, filter_weight, use_32x32, y_accum_ref, y_count_ref, + u_accum_ref, u_count_ref, v_accum_ref, v_count_ref); + + ApplyTestFilter(y_src, y_stride, y_pre, y_stride, u_src, v_src, uv_stride, + u_pre, v_pre, uv_stride, width, height, ss_x, ss_y, + filter_strength, filter_weight, use_32x32, y_accum_tst, + y_count_tst, u_accum_tst, u_count_tst, v_accum_tst, + v_count_tst); + + EXPECT_TRUE(CheckArrayEqual(y_accum_tst, y_accum_ref, width, height, + y_stride, y_stride)); + EXPECT_TRUE(CheckArrayEqual(y_count_tst, y_count_ref, width, height, + y_stride, y_stride)); + EXPECT_TRUE(CheckArrayEqual(u_accum_tst, u_accum_ref, uv_width, uv_height, + uv_stride, uv_stride)); + EXPECT_TRUE(CheckArrayEqual(u_count_tst, u_count_ref, uv_width, uv_height, + uv_stride, uv_stride)); + EXPECT_TRUE(CheckArrayEqual(v_accum_tst, v_accum_ref, uv_width, uv_height, + uv_stride, uv_stride)); + EXPECT_TRUE(CheckArrayEqual(v_count_tst, v_count_ref, uv_width, uv_height, + uv_stride, uv_stride)); + + if (HasFailure()) { + if (use_32x32) { + printf("SS_X: %d, SS_Y: %d, Strength: %d, Weight: %d\n", ss_x, ss_y, + filter_strength, *filter_weight); + } else { + printf("SS_X: %d, SS_Y: %d, Strength: %d, Weights: %d,%d,%d,%d\n", ss_x, + ss_y, filter_strength, filter_weight[0], filter_weight[1], + filter_weight[2], filter_weight[3]); + } + + PrintArrayDiff(y_accum_ref, y_accum_tst, width, height, y_stride, + y_stride); + PrintArrayDiff(y_count_ref, y_count_tst, width, height, y_stride, + y_stride); + PrintArrayDiff(u_accum_ref, u_accum_tst, uv_width, uv_height, uv_stride, + uv_stride); + PrintArrayDiff(u_count_ref, u_count_tst, uv_width, uv_height, uv_stride, + uv_stride); + PrintArrayDiff(v_accum_ref, v_accum_tst, uv_width, uv_height, uv_stride, + uv_stride); + PrintArrayDiff(v_count_ref, v_count_tst, uv_width, uv_height, uv_stride, + uv_stride); + + return; + } + } +} + +template <typename PixelType> +void TemporalFilterYUVTest::RunTestFilterWithParam(int width, int height, + int ss_x, int ss_y, + int filter_strength, + int use_32x32, + const int *filter_weight) { + PixelType y_src[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + PixelType y_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint16_t y_count[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint32_t y_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + + PixelType u_src[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + PixelType u_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint16_t u_count[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint32_t u_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 }; +
+ PixelType v_src[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + PixelType v_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint16_t v_count[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint32_t v_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + + SetArray(y_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8)); + SetArray(y_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8)); + SetArray(u_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8)); + SetArray(u_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8)); + SetArray(v_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8)); + SetArray(v_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8)); + + for (int repeats = 0; repeats < num_repeats_; repeats++) { + ApplyTestFilter(y_src, MAX_WIDTH, y_pre, MAX_WIDTH, u_src, v_src, MAX_WIDTH, + u_pre, v_pre, MAX_WIDTH, width, height, ss_x, ss_y, + filter_strength, filter_weight, use_32x32, y_accum, y_count, + u_accum, u_count, v_accum, v_count); + } +} + +TEST_P(TemporalFilterYUVTest, Use32x32) { + const int width = 32, height = 32; + const int use_32x32 = 1; + + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + for (int filter_weight = 0; filter_weight <= 2; filter_weight++) { + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + &filter_weight); + } else { + CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + &filter_weight); + } + ASSERT_FALSE(HasFailure()); + } + } + } + } +} + +TEST_P(TemporalFilterYUVTest, Use16x16) { + const int width = 32, height = 32; + const int use_32x32 = 0; + + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_idx = 0; filter_idx < 3 * 3 * 3 * 3; filter_idx++) { + // Set up the filter + int filter_weight[4]; + int filter_idx_cp = filter_idx; + for (int idx = 0; idx < 4; idx++) { + filter_weight[idx] = filter_idx_cp % 3; + filter_idx_cp /= 3; + } + + // Test each parameter + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + filter_weight); + } else { + CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); + } + + ASSERT_FALSE(HasFailure()); + } + } + } + } +} + +TEST_P(TemporalFilterYUVTest, SaturationTest) { + const int width = 32, height = 32; + const int use_32x32 = 1; + const int filter_weight = 1; + saturate_test_ = 1; + + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + &filter_weight); + } else { + CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + &filter_weight); + } + + ASSERT_FALSE(HasFailure()); + } + } + } +} + +TEST_P(TemporalFilterYUVTest, DISABLED_Speed) { + const int width = 32, height = 32; + num_repeats_ = 1000;
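+ // filter_idx below encodes four base-3 subblock weights, least-significant + // digit first; e.g. filter_idx = 5 decodes to blk_fw = {2, 1, 0, 0} since + // 5 = 2 + 1 * 3.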
+ + for (int use_32x32 = 0; use_32x32 <= 1; use_32x32++) { + const int num_filter_weights = use_32x32 ? 3 : 3 * 3 * 3 * 3; + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_idx = 0; filter_idx < num_filter_weights; + filter_idx++) { + // Set up the filter + int filter_weight[4]; + int filter_idx_cp = filter_idx; + for (int idx = 0; idx < 4; idx++) { + filter_weight[idx] = filter_idx_cp % 3; + filter_idx_cp /= 3; + } + + // Test each parameter + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + if (use_highbd_) { + RunTestFilterWithParam<uint16_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); + } else { + RunTestFilterWithParam<uint8_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); + } + + aom_usec_timer_mark(&timer); + const int elapsed_time = + static_cast<int>(aom_usec_timer_elapsed(&timer)); + + printf( + "Bitdepth: %d, Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: " + "%d, Strength: %d, Time: %5d\n", + bd_, use_32x32, ss_x, ss_y, filter_idx, filter_strength, + elapsed_time); + } + } + } + } + } +} + +INSTANTIATE_TEST_SUITE_P( + C, TemporalFilterYUVTest, + ::testing::Values( + TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 8), + TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 10), + TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 12))); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, TemporalFilterYUVTest, + ::testing::Values( + TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 8), + TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 10), + TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 12))); +#endif // HAVE_SSE4_1 + +} // namespace diff --git a/libs/libaom/src/test/test-data.sha1 b/libs/libaom/src/test/test-data.sha1 new file mode 100644 index 000000000..383ae79c1 --- /dev/null +++ b/libs/libaom/src/test/test-data.sha1 @@ -0,0 +1,559 @@ +d5dfb0151c9051f8c85999255645d7a23916d3c0 *hantro_collage_w352h288.yuv +b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv +26b7f64399b84db4b4c9c915d743ec5c2619d4b9 *invalid-bug-1814.ivf +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-1814.ivf.res +09aa07e5325b3bb5462182eb30b8ecc914630740 *invalid-chromium-906381.ivf +09d2af8dd22201dd8d48e5dcfcaed281ff9422c7 *invalid-chromium-906381.ivf.res +f7c83c14aa35b928ba8b70f3eaa3b92070be4519 *invalid-google-142530197-1.ivf +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-google-142530197-1.ivf.res +703c05720d5d67053bcee44987635cd78af2f971 *invalid-google-142530197.ivf +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-google-142530197.ivf.res +fa06784f23751d8c37be94160fb821e855199af4 *invalid-oss-fuzz-10061.ivf +b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10061.ivf.res +c9e06c4c7fb7d69fd635a1f606a5e478d60e99cf *invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf +88e18e61bd2b7457b4c71ebefbdff0029c41cc04 *invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf.res +91a5bedeb4832c1c2900736cc0f644bb63971bbc *invalid-oss-fuzz-10227.ivf +b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10227.ivf.res +b2d0a29a65879436bf483d04865faca7d11cc2ee *invalid-oss-fuzz-10389.ivf +9655e6275888547ecd1f14e20e08ce4891372e76 *invalid-oss-fuzz-10389.ivf.res +e5fe0e8984c42d53d4ff734c3fbfd57d5c5c25cf *invalid-oss-fuzz-10389.ivf.res.2 +11df8e9a068669c678097d460b63609d3da73828 *invalid-oss-fuzz-10555.ivf +b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10555.ivf.res +cf5945085fe85456a1f74bf4cc7998b88b3f4b62 *invalid-oss-fuzz-10705.ivf
+758671858368ffd2a2c0727898de5661f7cf7d68 *invalid-oss-fuzz-10705.ivf.res +88e29851122cca3f336824f7fa4d9f757f91110c *invalid-oss-fuzz-10723.ivf +1af486cd2cc83ebeddc76ca7a1c512cc0ec568d5 *invalid-oss-fuzz-10723.ivf.res +64f8a208dec7f1580fbe0371aa15e62bb1262715 *invalid-oss-fuzz-10723.ivf.res.2 +0784acc8931090ec24eba752d6c27e359e68fe7d *invalid-oss-fuzz-10779.ivf +5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-oss-fuzz-10779.ivf.res +7d37be9357f89a100ced694aee1ca5a6fad35ba9 *invalid-oss-fuzz-11477.ivf +15932651aacfc4622f0910f728f3f95e08e1753d *invalid-oss-fuzz-11477.ivf.res +1674787c38ddf82a2e5c804203f04f56a304e8e0 *invalid-oss-fuzz-11479.ivf +1af486cd2cc83ebeddc76ca7a1c512cc0ec568d5 *invalid-oss-fuzz-11479.ivf.res +64f8a208dec7f1580fbe0371aa15e62bb1262715 *invalid-oss-fuzz-11479.ivf.res.2 +b1a45514f0c59be03c9991cd04882426b9b930fa *invalid-oss-fuzz-11523.ivf +7c44ac1723c14d98bcb888fbf118c959511519ba *invalid-oss-fuzz-11523.ivf.res +3198c7af55a7d50173ce3c369c0cf2d9cdfface6 *invalid-oss-fuzz-11523.ivf.res.2 +cb445173be760c3554f1740ce4d119f57a7be043 *invalid-oss-fuzz-15363.ivf +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-15363.ivf.res +5b697360bf0f02de31bae9b8da78e93570958fa4 *invalid-oss-fuzz-16437.ivf +09d2af8dd22201dd8d48e5dcfcaed281ff9422c7 *invalid-oss-fuzz-16437.ivf.res +ccbe4081557eb44820a0e6337c4a094421826b9a *invalid-oss-fuzz-9288.ivf +67c54283fe1a26ccf02cc991e4f9a1eea3ac5e78 *invalid-oss-fuzz-9288.ivf.res +c0960f032484579f967881cc025b71cfd7a79ee1 *invalid-oss-fuzz-9463.ivf +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-9463.ivf.res +5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-oss-fuzz-9463.ivf.res.2 +f448caf378e250b7eea4fa2d1c3cd7ef4a3211ce *invalid-oss-fuzz-9482.ivf +b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-9482.ivf.res +a686989de79af89136f631fd630df639c7861851 *invalid-oss-fuzz-9720.ivf +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-9720.ivf.res +a432f96ff0a787268e2f94a8092ab161a18d1b06 *park_joy_90p_10_420.y4m +0b194cc312c3a2e84d156a221b0a5eb615dfddc5 *park_joy_90p_10_422.y4m +ff0e0a21dc2adc95b8c1b37902713700655ced17 *park_joy_90p_10_444.y4m +c934da6fb8cc54ee2a8c17c54cf6076dac37ead0 *park_joy_90p_10_440.yuv +614c32ae1eca391e867c70d19974f0d62664dd99 *park_joy_90p_12_420.y4m +c92825f1ea25c5c37855083a69faac6ac4641a9e *park_joy_90p_12_422.y4m +b592189b885b6cc85db55cc98512a197d73d3b34 *park_joy_90p_12_444.y4m +82c1bfcca368c2f22bad7d693d690d5499ecdd11 *park_joy_90p_12_440.yuv +b9e1e90aece2be6e2c90d89e6ab2372d5f8c792d *park_joy_90p_8_420_a10-1.y4m +4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c *park_joy_90p_8_420.y4m +7a193ff7dfeb96ba5f82b2afd7afa9e1fe83d947 *park_joy_90p_8_422.y4m +bdb7856e6bc93599bdda05c2e773a9f22b6c6d03 *park_joy_90p_8_444.y4m +81e1f3843748438b8f2e71db484eb22daf72e939 *park_joy_90p_8_440.yuv +b1f1c3ec79114b9a0651af24ce634afb44a9a419 *rush_hour_444.y4m +eb438c6540eb429f74404eedfa3228d409c57874 *desktop_640_360_30.yuv +89e70ebd22c27d275fe14dc2f1a41841a6d8b9ab *kirland_640_480_30.yuv +33c533192759e5bb4f07abfbac389dc259db4686 *macmarcomoving_640_480_30.yuv +8bfaab121080821b8f03b23467911e59ec59b8fe *macmarcostationary_640_480_30.yuv +70894878d916a599842d9ad0dcd24e10c13e5467 *niklas_640_480_30.yuv +8784b6df2d8cc946195a90ac00540500d2e522e4 *tacomanarrows_640_480_30.yuv +edd86a1f5e62fd9da9a9d46078247759c2638009 *tacomasmallcameramovement_640_480_30.yuv +9a70e8b7d14fba9234d0e51dce876635413ce444 *thaloundeskmtg_640_480_30.yuv +e7d315dbf4f3928779e0dc624311196d44491d32 *niklas_1280_720_30.yuv 
+717da707afcaa1f692ff1946f291054eb75a4f06 *screendata.y4m +9cfc855459e7549fd015c79e8eca512b2f2cb7e3 *niklas_1280_720_30.y4m +5b5763b388b1b52a81bb82b39f7ec25c4bd3d0e1 *desktop_credits.y4m +36ddab9b99eb7545aa0bf362d6f498212d596516 *vase10x10.yuv +c542890ac929749000f7b3883174f2202070d834 *pixel_capture_w320h240.yuv +c2e1ec9936b95254187a359e94aa32a9f3dad1b7 *av1-1-b8-00-quantizer-00.ivf +26cd2a0321d01d9db5f6dace8b43a40cd5b9d58d *av1-1-b8-00-quantizer-00.ivf.md5 +a56dd02c0258d4afea1ee358a22b54e99e39d5e1 *av1-1-b8-00-quantizer-01.ivf +b3d24124d81f1fbb26f5eb0036accb54f3ec69b2 *av1-1-b8-00-quantizer-01.ivf.md5 +3466327cb842a91d69839b11ef930a74f086f4c6 *av1-1-b8-00-quantizer-02.ivf +c111dce946100efeaad34203080eee1d55464df6 *av1-1-b8-00-quantizer-02.ivf.md5 +d3f1f32de5e2c0c19a58bb8ef096108388c6a820 *av1-1-b8-00-quantizer-03.ivf +6265321b31130545b4454982ca93e412a56845b8 *av1-1-b8-00-quantizer-03.ivf.md5 +f37c393ebe73266a5ec8508a2ca33c586ff28e64 *av1-1-b8-00-quantizer-04.ivf +c6e979da71aecc593c0abb40135dd304152b00dd *av1-1-b8-00-quantizer-04.ivf.md5 +ac9c5e93cb19942a9be259d0567ec96c54dcdc7c *av1-1-b8-00-quantizer-05.ivf +49e35a7399568a0e4f015ce323d5a45ea780ca87 *av1-1-b8-00-quantizer-05.ivf.md5 +461142b1b50ae74c6b698d23f5ed3b764eadfb89 *av1-1-b8-00-quantizer-06.ivf +6477ff260624e0f76c94ac872d1e7d5576af4177 *av1-1-b8-00-quantizer-06.ivf.md5 +7f8113cd13d8faaa06fdbaaa50dc328daf037e6d *av1-1-b8-00-quantizer-07.ivf +b26795c6cb408487c20737977cd6b77311772bf7 *av1-1-b8-00-quantizer-07.ivf.md5 +4218f7945a172e1fe4f9e77ec35085a394eda9f4 *av1-1-b8-00-quantizer-08.ivf +ea5d7d501e9a69d805251e4871515d28468d8676 *av1-1-b8-00-quantizer-08.ivf.md5 +837f3bcadfe56cf302db2ebaf9a990446fb35801 *av1-1-b8-00-quantizer-09.ivf +eede995cdac5fd01a411da2e74e86e8394138be1 *av1-1-b8-00-quantizer-09.ivf.md5 +adc229b3780a4968c18ded1bcbe72e3f04643833 *av1-1-b8-00-quantizer-10.ivf +0799b7e54e54ee97bf0e8aad2b75509ce59c7097 *av1-1-b8-00-quantizer-10.ivf.md5 +44bac8247160a8d9a0ab19f890fc89cc9298de1d *av1-1-b8-00-quantizer-11.ivf +cc6b2bf167e114599b242aba574e8c6f1fa2f047 *av1-1-b8-00-quantizer-11.ivf.md5 +ebb3af7dfc15567188bcb617021cdc95ebc560e3 *av1-1-b8-00-quantizer-12.ivf +b716ae29d56cd0c052dbfa1b5dcf850cd0fa8ca7 *av1-1-b8-00-quantizer-12.ivf.md5 +46159641f981a26fb9c374a5ca41e44f0ce0a9f0 *av1-1-b8-00-quantizer-13.ivf +c6db1b8b4a74f83e4a0647e053cea0fc00f6abab *av1-1-b8-00-quantizer-13.ivf.md5 +fadc909d18eb640760fbb075f922fb050e715470 *av1-1-b8-00-quantizer-14.ivf +e36bb6b23273633ba3ef7d28160a7258840a1476 *av1-1-b8-00-quantizer-14.ivf.md5 +8befbd9cc1601dcd36ec6911613855f68e6fd40e *av1-1-b8-00-quantizer-15.ivf +cfc2334b76fb5e7aa9d8607e89d37cbc7716d62e *av1-1-b8-00-quantizer-15.ivf.md5 +ca42e00ae27c6b7f684fe3d2a787d50d2827cb3f *av1-1-b8-00-quantizer-16.ivf +f11278218a7c3c73cfaab2332bab55f06cedcc81 *av1-1-b8-00-quantizer-16.ivf.md5 +05270d365bdc067f9446eda3029a6f41571a5229 *av1-1-b8-00-quantizer-17.ivf +fb6482f35e7ad04bf231ea1806226760abcb3c26 *av1-1-b8-00-quantizer-17.ivf.md5 +617bc72037165efbff478d5a0d342b3c20ffcafd *av1-1-b8-00-quantizer-18.ivf +1ff68d5424f91322123fe0d58f436b8e49cfa99d *av1-1-b8-00-quantizer-18.ivf.md5 +821c3b1ae6054c7a91b2f64428806e57f1157ca6 *av1-1-b8-00-quantizer-19.ivf +f2fd118e786697553d6987f786660a2bb9f00680 *av1-1-b8-00-quantizer-19.ivf.md5 +48bcf17c27d9a4eb73632a68c09f42eff9f9af99 *av1-1-b8-00-quantizer-20.ivf +64d55e4c858414bc2837c9c3e2d5fb6d2208c4b8 *av1-1-b8-00-quantizer-20.ivf.md5 +d61ecdd4f0950bc5c8bae1270b22e711bdd22763 *av1-1-b8-00-quantizer-21.ivf +9d447938596096704fd5f4d41bcdf6fabf9cdfb9 
*av1-1-b8-00-quantizer-21.ivf.md5 +59b4b65d8e56ccdd1bddff26a03e991a63409334 *av1-1-b8-00-quantizer-22.ivf +aa1be0c7c7622d612af85f9bf96a212f6fe5ab56 *av1-1-b8-00-quantizer-22.ivf.md5 +95ed96988eb9916cad956db9b929718769de49f1 *av1-1-b8-00-quantizer-23.ivf +596b8a3aea468996d609624367465c412751f52b *av1-1-b8-00-quantizer-23.ivf.md5 +e6c2dc4ce725003152797b3d7b34d7eb34da50c8 *av1-1-b8-00-quantizer-24.ivf +1cd3d7e8b3813a9e5591b94eaeb72d471780e64a *av1-1-b8-00-quantizer-24.ivf.md5 +6734e353008824e523939d1a18daa3f2ab2d8ec6 *av1-1-b8-00-quantizer-25.ivf +c45cf440a05802c1f9e29472175ed397d130d988 *av1-1-b8-00-quantizer-25.ivf.md5 +3372b1c69fb39811156adcea4f6dba802c0918c2 *av1-1-b8-00-quantizer-26.ivf +b1751d55bb3fb788751fe28fb7434bee153bda68 *av1-1-b8-00-quantizer-26.ivf.md5 +e7ddb19a6e2a798d6a4e7dfdfc10b4df777b60e3 *av1-1-b8-00-quantizer-27.ivf +0e19d6b79cd71de69d03e0455349568af979b170 *av1-1-b8-00-quantizer-27.ivf.md5 +7f1c90a35543d6b673e353b3702baf3aa1caeaa7 *av1-1-b8-00-quantizer-28.ivf +d9a4f9cb88103249a05a7e6aa616bf0c16bf9c95 *av1-1-b8-00-quantizer-28.ivf.md5 +28d741b923011c7fcc50a7318256a638d3110a07 *av1-1-b8-00-quantizer-29.ivf +c68cacf2b2ff2694945a99ad836dcf1ee3961c09 *av1-1-b8-00-quantizer-29.ivf.md5 +9a5d9ea4bc76dd40d04e92f33f45e9c2e120e85d *av1-1-b8-00-quantizer-30.ivf +eb02bb8c16c4c0368ddff83e05e516e84ec9eaf3 *av1-1-b8-00-quantizer-30.ivf.md5 +20193c372f44f522e094c2c05fc7e4aaa0717fa8 *av1-1-b8-00-quantizer-31.ivf +a4c1a4ac332f4911f0d5abbd826ebecfb8432d6c *av1-1-b8-00-quantizer-31.ivf.md5 +9617bbd691f093d259dbc8a642a57a153c1fc00c *av1-1-b8-00-quantizer-32.ivf +73d60a348454b126ea6368ea604954bc23f210ae *av1-1-b8-00-quantizer-32.ivf.md5 +d9aea9d72a686c59b60584d827f60ca1ee8eee26 *av1-1-b8-00-quantizer-33.ivf +fbf64de376a63d2d3051da83b0e4e56579b55c0a *av1-1-b8-00-quantizer-33.ivf.md5 +791aaf067f125e5cf4a247cf06a2e29ab071ec90 *av1-1-b8-00-quantizer-34.ivf +8e2e6efe4c069e54844da19125c4280b95990c69 *av1-1-b8-00-quantizer-34.ivf.md5 +01ba67bba5cbf7c94c65da8f4c9bd6e7db24cf3a *av1-1-b8-00-quantizer-35.ivf +0c5e60704a4a6bd27e67b6fd72ca7d2cf7fff50f *av1-1-b8-00-quantizer-35.ivf.md5 +3e255b4a320c9522dcec539fef770b6920b9a102 *av1-1-b8-00-quantizer-36.ivf +1241aab865fd7b4bae73736cbeec1866ea9c90ec *av1-1-b8-00-quantizer-36.ivf.md5 +44fa6fca109747d8f43f6c6aa46d782e5d476d54 *av1-1-b8-00-quantizer-37.ivf +947f0f887c5ac9149cf85e8114a709d6f410fc32 *av1-1-b8-00-quantizer-37.ivf.md5 +8319ac1ddd6ce3279da5780175dff7a3a5fa1054 *av1-1-b8-00-quantizer-38.ivf +5f571b7f88678eab9e54f162cc9898f14e437770 *av1-1-b8-00-quantizer-38.ivf.md5 +5975e7056e17608593a8c40619b68e6576d373d9 *av1-1-b8-00-quantizer-39.ivf +7c870192d6eb70ce5367147a3d2c6a52e11f7bec *av1-1-b8-00-quantizer-39.ivf.md5 +47da942f1e455f1422fc65f06dd57304541d16ac *av1-1-b8-00-quantizer-40.ivf +6ea7116c9ce3a1641c7060bab2f5e06fd0910d61 *av1-1-b8-00-quantizer-40.ivf.md5 +ab35c15dfde21c2572b14e04dbfd5fac1adae449 *av1-1-b8-00-quantizer-41.ivf +19596f9849653b913186b9d6b7072984ede96177 *av1-1-b8-00-quantizer-41.ivf.md5 +23a5fa6c3d0eaffaf13f6402465f5dd33d8ea7f1 *av1-1-b8-00-quantizer-42.ivf +5a2726f0d1b1799d4f70883f1bfe5c9d976c6cf5 *av1-1-b8-00-quantizer-42.ivf.md5 +86cddfc463d2b186ec5a1aa25c4562c05201e3c3 *av1-1-b8-00-quantizer-43.ivf +674c64ec8487ee774ad09350380fa6ac43815807 *av1-1-b8-00-quantizer-43.ivf.md5 +6894c154eb56c4f3fe44d54fc4f9af468b03d175 *av1-1-b8-00-quantizer-44.ivf +eca679a2781eb894d18b3d578e3aaf4f48019a15 *av1-1-b8-00-quantizer-44.ivf.md5 +0960bf018ada4224b8344519cf091850d50a57bd *av1-1-b8-00-quantizer-45.ivf +291bb43b9e1ab167040b51019daf1ccf94fd1e50 
*av1-1-b8-00-quantizer-45.ivf.md5 +ea644a4732f1a2534332802c2fa5073344f3c356 *av1-1-b8-00-quantizer-46.ivf +4c7915382b1d6d08709c95525b04ab8830f20ca1 *av1-1-b8-00-quantizer-46.ivf.md5 +d1f8832d33234e2c74a2280090850153ea24ea82 *av1-1-b8-00-quantizer-47.ivf +90eb9959e612602934dcc512fe6f54abf0c88d9c *av1-1-b8-00-quantizer-47.ivf.md5 +69c93f760e8b666eb5b98f510e09d90f9230ac9b *av1-1-b8-00-quantizer-48.ivf +931f869e14bd455de9dac2101b383c29e7d6f04c *av1-1-b8-00-quantizer-48.ivf.md5 +8b660c577d95c031d6711c1134b8d115097f8d7e *av1-1-b8-00-quantizer-49.ivf +0e3fe8b49d497050dc1a0eac5f3ad60f5fe068fe *av1-1-b8-00-quantizer-49.ivf.md5 +d40bb21448a6da0fc9b88cbcf76d2f4226573acb *av1-1-b8-00-quantizer-50.ivf +bcd2a9c9a021ba44fc5dc74ae02194fe49ca76a4 *av1-1-b8-00-quantizer-50.ivf.md5 +3b5a1d464aa89b0f1a6ad4f5a03602292b826172 *av1-1-b8-00-quantizer-51.ivf +49bcde0c56cf8b7fbe429336981be22d39025b74 *av1-1-b8-00-quantizer-51.ivf.md5 +38970a02fb38ddb4954fe4240164cb75de5fc744 *av1-1-b8-00-quantizer-52.ivf +fd02b034d79d4be150efb02bd4349edfd0e41311 *av1-1-b8-00-quantizer-52.ivf.md5 +2fde7a7cf3014d5196d011c47de4a144227ed122 *av1-1-b8-00-quantizer-53.ivf +0cb66e6d8fbb29962a69ae1703e22da50db2c92b *av1-1-b8-00-quantizer-53.ivf.md5 +89a69e9b9a601e40cb491ac3a1d32491f2468ac8 *av1-1-b8-00-quantizer-54.ivf +2f8af51acc73c99b5af81db2bdd1883b611ad311 *av1-1-b8-00-quantizer-54.ivf.md5 +31ee4f56fcb0043e95fff7af49e4ef82aafa5543 *av1-1-b8-00-quantizer-55.ivf +04a7104e02bdd0fa38c118202dbbecdbd11ace02 *av1-1-b8-00-quantizer-55.ivf.md5 +f262f0b234006a2652fceb77b1a8711aa53abb54 *av1-1-b8-00-quantizer-56.ivf +bdd54dc25bc5a147c76163af0bced45c56435d79 *av1-1-b8-00-quantizer-56.ivf.md5 +1ef00617091db4b2b839de623bd6b4fb0b2f5f83 *av1-1-b8-00-quantizer-57.ivf +714c65363a87ed5e6e4ad75c79ddb6af57d41fd9 *av1-1-b8-00-quantizer-57.ivf.md5 +43c9b02feccbb3c709d96015f126b7e3d4c24c64 *av1-1-b8-00-quantizer-58.ivf +bae22b8d6377862bff8219470c0d87205d186a68 *av1-1-b8-00-quantizer-58.ivf.md5 +ca5f780abe4c02e48cceb9c804f3625723c359bf *av1-1-b8-00-quantizer-59.ivf +c60a20bbf60b0b0a442ef3f7b682979053909d6e *av1-1-b8-00-quantizer-59.ivf.md5 +1f6f047e9f0e1da22fb514370d92c3c7c66dcf89 *av1-1-b8-00-quantizer-60.ivf +86dc7fa59d363cf1ae4b027a57b119bda893c1c1 *av1-1-b8-00-quantizer-60.ivf.md5 +bcf0c3353568c47a043f2dc34c9abd3fc04eebd4 *av1-1-b8-00-quantizer-61.ivf +66fc4f729c5915aa19939d1b6e28e5b398e747bb *av1-1-b8-00-quantizer-61.ivf.md5 +ac8d3c54451b52cf557ef435d33e7638088d66df *av1-1-b8-00-quantizer-62.ivf +b57f4e1276ead626a3662339a86111ae6fda49d2 *av1-1-b8-00-quantizer-62.ivf.md5 +2a8aa33513d8e01ae9410c4bf5fe1e471b775482 *av1-1-b8-00-quantizer-63.ivf +9f646ec35a168f495e144c64ba7ce9aeb41cd0a2 *av1-1-b8-00-quantizer-63.ivf.md5 +838388fbda4a1d91be81ff62694c3bf13c460d38 *av1-1-b8-01-size-16x16.ivf +4229c1caf8e25eb3073456fb90ceed206753901e *av1-1-b8-01-size-16x16.ivf.md5 +23f4253bf71e02b2e8ead66da4b3de875e879ef2 *av1-1-b8-01-size-18x16.ivf +af125644436d4b6897dade68336cedad663b6610 *av1-1-b8-01-size-18x16.ivf.md5 +94e4a75bd93052f79998e9e08e6b5dd73dc27e50 *av1-1-b8-01-size-32x16.ivf +e7b3fbc5e4b2469838e7ae36512bd3ce0a81040c *av1-1-b8-01-size-32x16.ivf.md5 +f297bde01c05ec5c07ff8118a0280bd36c52b246 *av1-1-b8-01-size-34x16.ivf +f6bbd94d6063c689de3c7cf94afa2c68b969d12c *av1-1-b8-01-size-34x16.ivf.md5 +1e18bdf68bab7e7282aacc77e423bc7d93d04a8e *av1-1-b8-01-size-64x16.ivf +de75732fccfb385294b23c17f0f1a57b455edcf7 *av1-1-b8-01-size-64x16.ivf.md5 +26b1f6ae80b161e971468085778cc1ece502b330 *av1-1-b8-01-size-66x16.ivf +48bd99813557c314d398e6952da78da07c79d416 
*av1-1-b8-01-size-66x16.ivf.md5 +ff213ecf31b982a3a7f009c9739f64e066e1ffe9 *av1-1-b8-01-size-16x18.ivf +86b20a13b1939dc5f678e80491f190d376233d58 *av1-1-b8-01-size-16x18.ivf.md5 +c90bd878c59263a15c6a6f515d1c7e071f141559 *av1-1-b8-01-size-18x18.ivf +6f659036ffcd3dd380cf970cf1a06f7755e0b2de *av1-1-b8-01-size-18x18.ivf.md5 +e16a1411381b34817a4c0d8e5eeaeb8cddcc9c46 *av1-1-b8-01-size-32x18.ivf +fdb1c4ec56f5aa690eadbe897340fee86a06ae2f *av1-1-b8-01-size-32x18.ivf.md5 +fac7052b39bd2d0ae107e0e94050226712c770c2 *av1-1-b8-01-size-34x18.ivf +adb0d5a99228027eaa3b016963df447c9818c447 *av1-1-b8-01-size-34x18.ivf.md5 +b8be5e55d9be42746c2b547d0e26e80b21c9802a *av1-1-b8-01-size-64x18.ivf +8f8f6da34cdf78c5a6551c637e1afe279cc3884e *av1-1-b8-01-size-64x18.ivf.md5 +9e066bdcc2cd789cdf551bd4c9c85c178887b880 *av1-1-b8-01-size-66x18.ivf +e8ec6effa936423ae2eec2b60a3160720d2de912 *av1-1-b8-01-size-66x18.ivf.md5 +6ebe45085cdeebc2acd6da5abd542a59312c0ff4 *av1-1-b8-01-size-16x32.ivf +044695669103dbf158591dce9c649317a177d5f6 *av1-1-b8-01-size-16x32.ivf.md5 +9fabb4f60641b8c7995d1dc451419165d41258ff *av1-1-b8-01-size-18x32.ivf +7263764680dfec864c3fad5df824ab1973489a14 *av1-1-b8-01-size-18x32.ivf.md5 +3f72841a24a13e601d79cf029aa1fdb02970ce0b *av1-1-b8-01-size-32x32.ivf +bbe1ae2888d291ec6bc98cd0784937580c554103 *av1-1-b8-01-size-32x32.ivf.md5 +392131a7c7609acd0dba88fee14f1ed042d23ab1 *av1-1-b8-01-size-34x32.ivf +eea68165ebe9acd28693374bf2266374b9c77786 *av1-1-b8-01-size-34x32.ivf.md5 +78afdd96265811ab9466e906347b57161e5c010d *av1-1-b8-01-size-64x32.ivf +47b317af582700b67f6e77659db1dfaa26c8cde6 *av1-1-b8-01-size-64x32.ivf.md5 +2b4d01f2c9f23044c0d886482c7073bd4d5d37d1 *av1-1-b8-01-size-66x32.ivf +3ad5a58a0ee5086af370b22ab2b5b7592a4f33e7 *av1-1-b8-01-size-66x32.ivf.md5 +78ddae04eb8277ae605bd7017ad7ad27bfc82d39 *av1-1-b8-01-size-16x34.ivf +d0c18e679f1fc51e4f7409831321eed9c4858f6f *av1-1-b8-01-size-16x34.ivf.md5 +38d8ed885f46aead6ec1271d8a5d4aee79b8eb68 *av1-1-b8-01-size-18x34.ivf +097ddbd69b8f54826a35efeb0b8b07ec198bba6b *av1-1-b8-01-size-18x34.ivf.md5 +91a42720bc2e7ba701f4d97b463a098b6707cdbd *av1-1-b8-01-size-32x34.ivf +c590d43d37095bd2e8f8d12c9278477419b72d1a *av1-1-b8-01-size-32x34.ivf.md5 +4cc2a437dba56e8878113d9b390b980522542028 *av1-1-b8-01-size-34x34.ivf +57eeb971f00e64abde25be69dbcb4e3ce5065a57 *av1-1-b8-01-size-34x34.ivf.md5 +b36fee1b6ad69d1206466615d69c05e0a4407939 *av1-1-b8-01-size-64x34.ivf +a78aea0250d0b32657dc0eaf2d8394bc766c0e35 *av1-1-b8-01-size-64x34.ivf.md5 +10e441209262e082e31fef8c15b51579c9e81509 *av1-1-b8-01-size-66x34.ivf +558b46f6ef1662c208012d0b66d1857eeff3244e *av1-1-b8-01-size-66x34.ivf.md5 +dd44aad500c7ca0fc97e3d8f0abed3c83b24c79c *av1-1-b8-01-size-16x64.ivf +a5b64e8063abcf3e4872dc4baf1c32384dc5cf83 *av1-1-b8-01-size-16x64.ivf.md5 +aa849f0d09bcb2ead44719d63043536932d5c9f2 *av1-1-b8-01-size-18x64.ivf +bcdf2dea3590c7031158ffe7b907d9ee35e2fe57 *av1-1-b8-01-size-18x64.ivf.md5 +36e856d30e160ba2fbb00510296202f61afaae49 *av1-1-b8-01-size-32x64.ivf +99299f75b82c40c13f168adf2d124f57044a39a2 *av1-1-b8-01-size-32x64.ivf.md5 +e3e03ec5d38eb25e97e4ec3adc6ed40ecdebd278 *av1-1-b8-01-size-34x64.ivf +84625abf8a200a7d20dd3dd3b277b50b3d62ce32 *av1-1-b8-01-size-34x64.ivf.md5 +7d017daebef2d39ed42a505a8e6103ab0c0988c1 *av1-1-b8-01-size-64x64.ivf +1ff38d5ecba82fb2e6ac3b09c29c9fe74885ac29 *av1-1-b8-01-size-64x64.ivf.md5 +e1b58ba0b462508593399a2ed84db5f1c59ffcd2 *av1-1-b8-01-size-66x64.ivf +a6b2c84c94fe79ab0373d157d1203f8d66de0706 *av1-1-b8-01-size-66x64.ivf.md5 +7b4faa7eb7b73392b62de6613282a98dddc13bb6 
*av1-1-b8-01-size-16x66.ivf +a2dacf2bae3c4ab352af66a9600946d29ab9a6ee *av1-1-b8-01-size-16x66.ivf.md5 +0f97805fa30497d4cf39665150f00dfdea52d862 *av1-1-b8-01-size-18x66.ivf +33d8ea0765953250f998da3fe161f2a8cfca2353 *av1-1-b8-01-size-18x66.ivf.md5 +c8bb00256de973e3b3ee31b924f554336d310cdb *av1-1-b8-01-size-32x66.ivf +6a6588e6edc68ff7739968a9e7cc6d9eaaeed356 *av1-1-b8-01-size-32x66.ivf.md5 +75ec54fec5c36eecde6d0a16e0389a5f7ad8ec22 *av1-1-b8-01-size-34x66.ivf +36101dfa9495c18696c0d7d61f25e748f4de7425 *av1-1-b8-01-size-34x66.ivf.md5 +7e5491716e70f8199156b8843513c935667b281e *av1-1-b8-01-size-64x66.ivf +da38755bb0c9ef56b81617835ddf1340242c6dce *av1-1-b8-01-size-64x66.ivf.md5 +68b47b386f61d67cb5b824a7e6bf87c8b9c2bf7b *av1-1-b8-01-size-66x66.ivf +25974893956ebd92df474325946130c34f880ea7 *av1-1-b8-01-size-66x66.ivf.md5 +9f386d19c87dbfd6ac84a06d2393dd88863ac003 *av1-1-b8-01-size-196x196.ivf +788f77f655f55de3db94dd69870316134c149116 *av1-1-b8-01-size-196x196.ivf.md5 +ed3bb2bb52a9d1786e233ef38142b15b85097875 *av1-1-b8-01-size-198x196.ivf +3bb6b6721ad9b2838b2d07e47b29d6c0117526b1 *av1-1-b8-01-size-198x196.ivf.md5 +49461772caaaa7b824d48f4e9c77a906b0dc02d5 *av1-1-b8-01-size-200x196.ivf +f1cba00c36909c56097c8785df476d42bc91f259 *av1-1-b8-01-size-200x196.ivf.md5 +44a656a22958e26ed169a69deb8f373117224f06 *av1-1-b8-01-size-202x196.ivf +69be876b52fe42811bba52d36d0bcc88d6c25b3f *av1-1-b8-01-size-202x196.ivf.md5 +0a6fe9b478363faedbfd465a75790b4c2661b9ba *av1-1-b8-01-size-208x196.ivf +fc8e95a6860a8a37ccdf1dfe49828502fcf96a08 *av1-1-b8-01-size-208x196.ivf.md5 +8e05b5a20ec95afd92bb615a7daa2e17a7ef55a8 *av1-1-b8-01-size-210x196.ivf +0add512bffbda3300d8f684a53b13b996fe2e46d *av1-1-b8-01-size-210x196.ivf.md5 +a15f12652c6b4d0c30f13a439c941bfc4a431d1a *av1-1-b8-01-size-224x196.ivf +b904b93252175f79e0e2b28896131ce93d5fc925 *av1-1-b8-01-size-224x196.ivf.md5 +1a57b913443b267f4a31a6925c39f5b58022f550 *av1-1-b8-01-size-226x196.ivf +7cf3087de5804763a82d2a798243a66459664772 *av1-1-b8-01-size-226x196.ivf.md5 +2cc28541a2a72e8b45a368f71e70fc294e2de3ab *av1-1-b8-01-size-196x198.ivf +bb736eedb4bd1e39bf9d60435b4b27a12842e112 *av1-1-b8-01-size-196x198.ivf.md5 +c4ebf93fbf3ae52108fd7b39ddef3afae48188ea *av1-1-b8-01-size-198x198.ivf +fa4de6881511728bafa15b5f441a0cfdf683cc75 *av1-1-b8-01-size-198x198.ivf.md5 +55fce983186d454b0eb15527393bb2465ba41c6b *av1-1-b8-01-size-200x198.ivf +1ac8fb1ee622cbc4aa1b83cb46b4731c85efae62 *av1-1-b8-01-size-200x198.ivf.md5 +67d276c67886f0a91a7ee06751a64f95eeb7bc1f *av1-1-b8-01-size-202x198.ivf +1633b62d9e4ea41737c42f70cbde9a5671da0cef *av1-1-b8-01-size-202x198.ivf.md5 +081cb3f29d3956d4d858d9661fd3d62c94b68867 *av1-1-b8-01-size-208x198.ivf +871d1c99167408dd32fa7603a7296c9b99ccda15 *av1-1-b8-01-size-208x198.ivf.md5 +b2d80b42468d5f296ae240cfb1fc0b3dd3d96bbc *av1-1-b8-01-size-210x198.ivf +6a3382656cb17b532a97b1061697f9a878fc58d1 *av1-1-b8-01-size-210x198.ivf.md5 +84d7994fa20fcf6c1d8dbd4c2060c988a6fce831 *av1-1-b8-01-size-224x198.ivf +42ea12e15de81f2e8617b6de7bae76de2da4d648 *av1-1-b8-01-size-224x198.ivf.md5 +c74a9281cf98c597121df6bff0ac5312b887f969 *av1-1-b8-01-size-226x198.ivf +4133aae0001804e2bbc7928fc065517a6dd8b288 *av1-1-b8-01-size-226x198.ivf.md5 +27adbf148c63f807bd617cfd78aeaedb8b0f2304 *av1-1-b8-01-size-196x200.ivf +9253e525e6207ef1ce0839b8f88ea781e9abe41e *av1-1-b8-01-size-196x200.ivf.md5 +21c9ea4d882e48353d3df66fcde0e4746168163f *av1-1-b8-01-size-198x200.ivf +3d5ee59fde9194f0eaff736051cfd1d7b7daeff1 *av1-1-b8-01-size-198x200.ivf.md5 +c27b0b57667910847122a0309c703315e444110f 
*av1-1-b8-01-size-200x200.ivf +7b2a15a17b421ef07e285ca4e8a224f0512c434d *av1-1-b8-01-size-200x200.ivf.md5 +780de549e4163a52590f7c0f488e027a8a4aa053 *av1-1-b8-01-size-202x200.ivf +cb0ec0969522ca60d79a639e9b9509363468ffd0 *av1-1-b8-01-size-202x200.ivf.md5 +2c59821904863e264ae61401cbd494a79bc04f13 *av1-1-b8-01-size-208x200.ivf +9963955966a52b65cdd13465c9fb2ba3b5356755 *av1-1-b8-01-size-208x200.ivf.md5 +ff63121611ea9c0628c7e5af13de5e7786611ca6 *av1-1-b8-01-size-210x200.ivf +2a5993be234e3af2af6d185b2a6f3aaf1979b83a *av1-1-b8-01-size-210x200.ivf.md5 +b8485ada95440d78b51153227231b1aced1a8273 *av1-1-b8-01-size-224x200.ivf +9c3cd32ea6c006a91eb37d69dbeccf878de5d214 *av1-1-b8-01-size-224x200.ivf.md5 +1aa0ce3e3a74f9b600a146e98b05547a0b454c48 *av1-1-b8-01-size-226x200.ivf +e045be96c3af16a9ddc10a9933e8ddfb3319d716 *av1-1-b8-01-size-226x200.ivf.md5 +e92b76480f4339855d998b97182f36b28deadcfa *av1-1-b8-01-size-196x202.ivf +480c707abcd2a650e2160ec397f8348cecb45770 *av1-1-b8-01-size-196x202.ivf.md5 +137b9c0d10a3bdbdf6f97b3e6331f3e8acaf8f91 *av1-1-b8-01-size-198x202.ivf +7429642146d0da55161ab13024a261094ee2ce87 *av1-1-b8-01-size-198x202.ivf.md5 +9cea71c44ad015ac702d675bacca17876e65cb1a *av1-1-b8-01-size-200x202.ivf +76b1ec6c42da55f47e389a561590d1a7c713e495 *av1-1-b8-01-size-200x202.ivf.md5 +26dffdcd0dac9becf68d12e31fcd91eddf1f7154 *av1-1-b8-01-size-202x202.ivf +ddb75e99123fed4ef05d9b85200cefd8985bc84c *av1-1-b8-01-size-202x202.ivf.md5 +04007e83bb66ba547d09f8926ea5bfc7fd9e4b2a *av1-1-b8-01-size-208x202.ivf +5b72eb58db22087ad416c499119f41e718395b52 *av1-1-b8-01-size-208x202.ivf.md5 +721ff7c0ae0e2ed896b5acac230113f1404e769c *av1-1-b8-01-size-210x202.ivf +187d2ef939fc26e1a1c7de65abe8e058d8aae17a *av1-1-b8-01-size-210x202.ivf.md5 +dba41421cc938bcf0234254f96be0325ab66186e *av1-1-b8-01-size-224x202.ivf +58856038c1eb13a7bf0353a30b1affe844cd31b1 *av1-1-b8-01-size-224x202.ivf.md5 +55eba14878d25dcc351ee5e92fa06e559035b409 *av1-1-b8-01-size-226x202.ivf +e295b3d791d40d7c1fff2c40a260078dccaef24a *av1-1-b8-01-size-226x202.ivf.md5 +6c777223990ddfd92040a8526646ed0f39299b0d *av1-1-b8-01-size-196x208.ivf +5210daff766cddaf3945610ee05ff242aef8175a *av1-1-b8-01-size-196x208.ivf.md5 +252831abfb9f4a9a8556c21cc3bf60adfe88210f *av1-1-b8-01-size-198x208.ivf +35ed9601e608a829980cec81e41b7bd3e5f4c2ce *av1-1-b8-01-size-198x208.ivf.md5 +e800ed893a88704a4576d4984957f3664560daa9 *av1-1-b8-01-size-200x208.ivf +82c038f9072a2fcf8d55fb4a474fdd791ba9a290 *av1-1-b8-01-size-200x208.ivf.md5 +9ce7bb932dd99f86da8ff2ab89fa4d3089a78da8 *av1-1-b8-01-size-202x208.ivf +0611bf0179abe3c820a447a2bd3a04c3790f3a87 *av1-1-b8-01-size-202x208.ivf.md5 +e5900d9150c8bebc49776227afd3b0a21f5a6ac6 *av1-1-b8-01-size-208x208.ivf +86d6b9a3840aa0a77938547c905bd6f45d069681 *av1-1-b8-01-size-208x208.ivf.md5 +2758ba5dad16f4a91334f2ed07a4a037201bb873 *av1-1-b8-01-size-210x208.ivf +78453b1fda2ccc6f35e0d762567807757bcddb16 *av1-1-b8-01-size-210x208.ivf.md5 +fff88fb8e833f6b4ad64cb591b219c7cceb7f2d2 *av1-1-b8-01-size-224x208.ivf +87266fc34aaed82cdb98cbc309b221ad52eccd81 *av1-1-b8-01-size-224x208.ivf.md5 +dec839fe64046461015b56cda191835284f42a52 *av1-1-b8-01-size-226x208.ivf +d7a15264fc3fd55d3aec0ccfaa7c434c6d90969f *av1-1-b8-01-size-226x208.ivf.md5 +584782e93ed1cb7797a90fece44becdd1e23bf0d *av1-1-b8-01-size-196x210.ivf +ed76ec841b18a457853e368576967c4768fc2730 *av1-1-b8-01-size-196x210.ivf.md5 +dab625599b9f01398b593e865d9a4a95a029d60f *av1-1-b8-01-size-198x210.ivf +b90e8d96a1f5b329b088b467a11fed2d055d74ca *av1-1-b8-01-size-198x210.ivf.md5 +6774bee17b9e50d2d8630e2e1afc30ded67e662d 
*av1-1-b8-01-size-200x210.ivf +343a86bd54eb3dd5e9902eb62a3d776dcff2f4f3 *av1-1-b8-01-size-200x210.ivf.md5 +0456c3b8e242eeee019ca97d155f81124de62c90 *av1-1-b8-01-size-202x210.ivf +5a6a6428c9858a0d3561db42ceaf981c143fe479 *av1-1-b8-01-size-202x210.ivf.md5 +6a3a8f65bf806b1be7726b983427880f772c9986 *av1-1-b8-01-size-208x210.ivf +5563ea6d8c65887553ff3000addc6418913f1650 *av1-1-b8-01-size-208x210.ivf.md5 +5a8b69489f8e9b917ea7718ad2645101cdbe5644 *av1-1-b8-01-size-210x210.ivf +f4b01604036fa23000d44fbf42097ae1181bcd62 *av1-1-b8-01-size-210x210.ivf.md5 +fb6f5b08a048698cfe324557ee8cd840c4a3f6ce *av1-1-b8-01-size-224x210.ivf +3ce5c404e3ca09c8e994b3043bad42cd555b00c0 *av1-1-b8-01-size-224x210.ivf.md5 +2e9fc8510d2131b2f3c9a93bececac985e4426d2 *av1-1-b8-01-size-226x210.ivf +897c537e259331ca86cdd6e4d2bd343f8538402e *av1-1-b8-01-size-226x210.ivf.md5 +8300512106fce3424eb74b5d4bc0f4f19f7c9af8 *av1-1-b8-01-size-196x224.ivf +43662ea025ea79afe4964fd4d12a77f4aa4e565e *av1-1-b8-01-size-196x224.ivf.md5 +640f8fda7ade8f2850e2275a9f5e233e33a0ba8d *av1-1-b8-01-size-198x224.ivf +9ac690bdbbce47d7b169128b568f955e70076f8c *av1-1-b8-01-size-198x224.ivf.md5 +ce2e9379c72fc924e364d5727605394a1438a211 *av1-1-b8-01-size-200x224.ivf +1ec35a53d88072b96b255202f678178bc7e5bb20 *av1-1-b8-01-size-200x224.ivf.md5 +5d3af7921623deccb578115c8ce207c019f97f50 *av1-1-b8-01-size-202x224.ivf +14eafd55b0cda3a3476cae7ad500dbd5ee899dd5 *av1-1-b8-01-size-202x224.ivf.md5 +6b6d78e466cf94a5ef8dfe252caa0948dd2ec175 *av1-1-b8-01-size-208x224.ivf +e178b0c272dfcfe614c6b49cb28dad11781af0b6 *av1-1-b8-01-size-208x224.ivf.md5 +dd2232b9e18971d7e19650a1e3218aef1010247f *av1-1-b8-01-size-210x224.ivf +40a66198c47820f5fa2d2e389ec0c1191ea4ffcc *av1-1-b8-01-size-210x224.ivf.md5 +9ec028b81a5ea311683328d856f436e6d0b0e6a0 *av1-1-b8-01-size-224x224.ivf +143b9530ce722385db2c2d883daa649ed42b8d40 *av1-1-b8-01-size-224x224.ivf.md5 +bf833947e62935c54e1e727ccb36157f7c1e9e5d *av1-1-b8-01-size-226x224.ivf +ca4f3b44463106e4f0bb54e490c3bd457d7d780b *av1-1-b8-01-size-226x224.ivf.md5 +5525f7e312ec073f480ed5a2be5bdc4f0ce51a09 *av1-1-b8-01-size-196x226.ivf +062d4b240741184458d2d2abd243ed7877631de8 *av1-1-b8-01-size-196x226.ivf.md5 +e6b911142394b94c23191eaa63c9eb41a00f80b0 *av1-1-b8-01-size-198x226.ivf +3b580d903dddf47082f5e055bfb01a4f05c09b7d *av1-1-b8-01-size-198x226.ivf.md5 +70feb5efeb28df25f7d1a661c73bf013c5ada9b4 *av1-1-b8-01-size-200x226.ivf +f0b894e7f787e62f1492be62f3dedeb065062160 *av1-1-b8-01-size-200x226.ivf.md5 +7f9a10831e2389b31497fad50080b4d5452d6e91 *av1-1-b8-01-size-202x226.ivf +45b7194eba9367c8059403c23ca4ae49e988dfaf *av1-1-b8-01-size-202x226.ivf.md5 +967837a2cfbf9aa3131f73aec6a52dcdd82926c7 *av1-1-b8-01-size-208x226.ivf +c8baedb48fd5d4c956aa8d73fd957370f718f047 *av1-1-b8-01-size-208x226.ivf.md5 +9c926226b9f6b015501d8ac1e3f95e8570283a05 *av1-1-b8-01-size-210x226.ivf +57d4837667fd4c5a7aeb908626d701b632852c60 *av1-1-b8-01-size-210x226.ivf.md5 +25a4940922761239809d82c45c2be1c5e4f48785 *av1-1-b8-01-size-224x226.ivf +87ae7e7558241bf3575a333f56fbad4dfdade8ff *av1-1-b8-01-size-224x226.ivf.md5 +40dd208eb525cd90d7c0674cf787097fb909afae *av1-1-b8-01-size-226x226.ivf +34bdef682a4eae0e0a05e4486a968af1df8b220a *av1-1-b8-01-size-226x226.ivf.md5 +9bbe8499796aa588ff02e313fb0d4349940d2fea *av1-1-b10-00-quantizer-00.ivf +36b402eedad2bacee8ac09acce44e2fc356dd80b *av1-1-b10-00-quantizer-00.ivf.md5 +1d5e1d2827624f328020bf123df213bb175577e0 *av1-1-b10-00-quantizer-01.ivf +16c529be5502369e43ce9c6fe99a9709968e3daf *av1-1-b10-00-quantizer-01.ivf.md5 
+39abc20739242a8f05efd4b35d7603c8ad7ff45d *av1-1-b10-00-quantizer-02.ivf +81faa72c3d43b003966fe09ffaae51b07b1059be *av1-1-b10-00-quantizer-02.ivf.md5 +92ebf349b803333a43824a83d997b8cf76f656f9 *av1-1-b10-00-quantizer-03.ivf +5e7556dc998cb8b506a43cc078e30802d7e600e6 *av1-1-b10-00-quantizer-03.ivf.md5 +1c496177c66e49f2e3556af87ec67afb5060170b *av1-1-b10-00-quantizer-04.ivf +560fea4800a44fe19ed8d3e74f425bdbf1fb8abd *av1-1-b10-00-quantizer-04.ivf.md5 +7de864b8475ce0acd0ecb01827f2c9add815352b *av1-1-b10-00-quantizer-05.ivf +1c1aea3db3f54a91866d89fd3b1a0d285ca10310 *av1-1-b10-00-quantizer-05.ivf.md5 +b6501c165619b036d0f7864fd4739973d2d18970 *av1-1-b10-00-quantizer-06.ivf +d758c8eff275651006c41e7dd447cac13b489ad7 *av1-1-b10-00-quantizer-06.ivf.md5 +e4df6f588f156dffaafd9517b64f753cfc9ccf05 *av1-1-b10-00-quantizer-07.ivf +3c577f67dade4537de642fd457ea2b367424f336 *av1-1-b10-00-quantizer-07.ivf.md5 +07e9c4c18abb36c8699c1c12bebcc727f090b525 *av1-1-b10-00-quantizer-08.ivf +4981568ade3170f311cb114fa2689edc4bc35e67 *av1-1-b10-00-quantizer-08.ivf.md5 +2268ecd2899f1b41ae9898925b1d62cfefa30282 *av1-1-b10-00-quantizer-09.ivf +029b03029b65b7c4c208961f0820467ad42fd3d6 *av1-1-b10-00-quantizer-09.ivf.md5 +3d2adaf6441cfa9585dcbf7d19d65bf6992a29a3 *av1-1-b10-00-quantizer-10.ivf +017b7fb4c3ba0747c2d5688d493da33ef993d110 *av1-1-b10-00-quantizer-10.ivf.md5 +006535760bd7dc1cfc95e648b05215954a2e76c2 *av1-1-b10-00-quantizer-11.ivf +c0ae083deb8e820aa49034af4d100944dd977018 *av1-1-b10-00-quantizer-11.ivf.md5 +840e0cbfe1acc8a7a45c823dc55ab44a0b6b553e *av1-1-b10-00-quantizer-12.ivf +49232ea38bdef650c94808f53834f1137cd4bf39 *av1-1-b10-00-quantizer-12.ivf.md5 +04b0e5a7387e07474f51be4b2c3e05211b40f0d0 *av1-1-b10-00-quantizer-13.ivf +a51b5ec4b890df3a64f9f0d866b8c41296c9e081 *av1-1-b10-00-quantizer-13.ivf.md5 +5dc47a140fbcbf08bf91481ee3585e9e067561ab *av1-1-b10-00-quantizer-14.ivf +2625319eef69d6225e6ab6e5ce7790491406cb5d *av1-1-b10-00-quantizer-14.ivf.md5 +f866be86d8d8aa08ded30e42988b0936c1a16064 *av1-1-b10-00-quantizer-15.ivf +03b7c1eefb54d99e30051c7123c0453f04a6579d *av1-1-b10-00-quantizer-15.ivf.md5 +548df2371dfb485419ed9baf28e3f495c64f364a *av1-1-b10-00-quantizer-16.ivf +8a0d6bf1626b05b65c77331305414fe9be54e8c6 *av1-1-b10-00-quantizer-16.ivf.md5 +0077c82f96a2e095a3cb8de9bfa63715e3c9f438 *av1-1-b10-00-quantizer-17.ivf +5d85f77f3087f4b206930722a945c60039262be4 *av1-1-b10-00-quantizer-17.ivf.md5 +1e0f1245ecb4c903b5dc7072d959fc43a7bba381 *av1-1-b10-00-quantizer-18.ivf +06316ae2b45f2359a70cc3855ffd6ab81048b41a *av1-1-b10-00-quantizer-18.ivf.md5 +f197198f7ec058110185fda5297a1a43993654df *av1-1-b10-00-quantizer-19.ivf +bac522c7f234d506c75b5495d74b3fa57c83a4df *av1-1-b10-00-quantizer-19.ivf.md5 +c2f57324d000b349323f37d5ebebde8c2b861f30 *av1-1-b10-00-quantizer-20.ivf +999c6110786cbc25e67792234a5a02f2cb4553c7 *av1-1-b10-00-quantizer-20.ivf.md5 +2ffad9adfd19286fe2166ba877289d201c9a634f *av1-1-b10-00-quantizer-21.ivf +d55713eaa791cfd7bf69b6c26d5032029d9a0f06 *av1-1-b10-00-quantizer-21.ivf.md5 +382528db53328c1a38976f5d9b579eef35d839f4 *av1-1-b10-00-quantizer-22.ivf +cb5bd459e1a90126da9264cff4281515f95755b2 *av1-1-b10-00-quantizer-22.ivf.md5 +b52cc6160fc66f72ad66c198d275a1c73f925022 *av1-1-b10-00-quantizer-23.ivf +c0f9d6659e1f283e9356fd7b4ac9f7cc5544cdc2 *av1-1-b10-00-quantizer-23.ivf.md5 +e11f15e3b63e7606b1122bb3670ee77c09c04840 *av1-1-b10-00-quantizer-24.ivf +e9f141b924440e044270c81a68458fe498599a8e *av1-1-b10-00-quantizer-24.ivf.md5 +fb91793b69824c99b0218788dcea0a74ebd7e84e *av1-1-b10-00-quantizer-25.ivf 
+434e33d609b2683c3cfbcc3a2cdfc26339590fb6 *av1-1-b10-00-quantizer-25.ivf.md5 +d82e38f31cdcf8b43479e6ddaa83373de38f70a2 *av1-1-b10-00-quantizer-26.ivf +183943b851ba383a536f13c83b93f61ac8961ad5 *av1-1-b10-00-quantizer-26.ivf.md5 +6bf5e4e8e0aca699e493b9eb3672d2117494d74d *av1-1-b10-00-quantizer-27.ivf +f0fb7e0a99180828b0e38b2cfe0622eecc2d26b8 *av1-1-b10-00-quantizer-27.ivf.md5 +d5adee2567544c3ae4223b3f3528a770377878d2 *av1-1-b10-00-quantizer-28.ivf +14edf588efc67570e529b0ff8aeb8e7a0c69238b *av1-1-b10-00-quantizer-28.ivf.md5 +e6dcdc106847956035e3f00aabf4470f97e1887e *av1-1-b10-00-quantizer-29.ivf +413c5cb778611c7c1a810b53861b9ab1fb391f17 *av1-1-b10-00-quantizer-29.ivf.md5 +b5e98b3f6b1db04d46bf43064c6ac64f797aff00 *av1-1-b10-00-quantizer-30.ivf +d1a603661d76c28658c7cd2892b408e91d77893e *av1-1-b10-00-quantizer-30.ivf.md5 +80168371d1150e82e3f46bcbbcabba458b835b19 *av1-1-b10-00-quantizer-31.ivf +904ecd033d4af5239c4d5b3f86e51ed5c3c2e3fb *av1-1-b10-00-quantizer-31.ivf.md5 +96291f6ace85980892d135a5b74188cd629c325f *av1-1-b10-00-quantizer-32.ivf +a5ceace390d4a75d48281fe29060c21557e4f5ae *av1-1-b10-00-quantizer-32.ivf.md5 +0f80495de34eae07c4905b72573a315a879390ec *av1-1-b10-00-quantizer-33.ivf +72b8f662973a660412946687dff878b276ae518e *av1-1-b10-00-quantizer-33.ivf.md5 +24905e3be7db320994b7fb8311dfd50a7c9e54da *av1-1-b10-00-quantizer-34.ivf +cea514bb1b7b064c4d31914a2cb266611c278577 *av1-1-b10-00-quantizer-34.ivf.md5 +083012960dd7c17d3b00fa0e807759c98faded8f *av1-1-b10-00-quantizer-35.ivf +de5fdb9e1e581484af1cc7d2dd3c3e84c90cebb2 *av1-1-b10-00-quantizer-35.ivf.md5 +f725f179aeee5b413620c0dd81b007b245c2a7ed *av1-1-b10-00-quantizer-36.ivf +246b1931c04c02df1f168090e2650827cd5dbabd *av1-1-b10-00-quantizer-36.ivf.md5 +f6aa824156e9848f237481889a8103eb6130f31d *av1-1-b10-00-quantizer-37.ivf +a8f78dd15fc2994369a08c2ddddcd0760c62ea5b *av1-1-b10-00-quantizer-37.ivf.md5 +a8dd662338c493aea266b99203e70af25982633f *av1-1-b10-00-quantizer-38.ivf +09f36d998e85d0450060f540e50b075ae1432fc6 *av1-1-b10-00-quantizer-38.ivf.md5 +d97428871720ed658da6ed0e3f7c15da83387e4c *av1-1-b10-00-quantizer-39.ivf +8c5230048909ee8f86f87c116f153cd910d0141f *av1-1-b10-00-quantizer-39.ivf.md5 +86e754e55e9b63c6e0a4fef01761414f8a6b61ca *av1-1-b10-00-quantizer-40.ivf +99a71accf6457264e45ca80d3b1f082ee5acdecc *av1-1-b10-00-quantizer-40.ivf.md5 +9d18b7236506ab7e107c062620b64096ec0cf423 *av1-1-b10-00-quantizer-41.ivf +5771159a9a7c7b66c9e13bb13ec3d53b37860208 *av1-1-b10-00-quantizer-41.ivf.md5 +54b72bc879a80e66613f421e67db62bba1c0041b *av1-1-b10-00-quantizer-42.ivf +bf958236883ee7209ef4cb0b7503b430634a291e *av1-1-b10-00-quantizer-42.ivf.md5 +a06d5321a51d90404dd7085ae511d7df5d5e1e05 *av1-1-b10-00-quantizer-43.ivf +ddb25723d976043d863634b9dc3b5fb84a245803 *av1-1-b10-00-quantizer-43.ivf.md5 +2ea0b64c170d7299dae1c14a8a49349aee8e0d08 *av1-1-b10-00-quantizer-44.ivf +d18bde1b4893792173fa2014665e9364395ad5e9 *av1-1-b10-00-quantizer-44.ivf.md5 +73e506a32d3518e23424f231c7b5323d7a34a3d6 *av1-1-b10-00-quantizer-45.ivf +be6224ebc77a3e5fb9c1645b876007e584a09d89 *av1-1-b10-00-quantizer-45.ivf.md5 +841223871374464194edc739c48dc7cefd1ff255 *av1-1-b10-00-quantizer-46.ivf +4766d616f923496a8dc113c9b7f875f0c0735f9a *av1-1-b10-00-quantizer-46.ivf.md5 +8bbbbea130aaea453f7b826956a5520d10a0eccf *av1-1-b10-00-quantizer-47.ivf +3ea21fac0c492b03d8ec25e4ee0971cd57e5f71a *av1-1-b10-00-quantizer-47.ivf.md5 +3ce83e0f1e1835b9a6c10fe502a16fd3650839e0 *av1-1-b10-00-quantizer-48.ivf +b468de2c09fca5a6b2bb7a20bab4afd8d192c31d *av1-1-b10-00-quantizer-48.ivf.md5 
+f3a757c678aa00f9a9c4c4658d37733fd935925a *av1-1-b10-00-quantizer-49.ivf +f888dc88db576122695d4eb41c486aacd28a2d1d *av1-1-b10-00-quantizer-49.ivf.md5 +a9d78aaef105cc5a95b7ebb54783f37e75673123 *av1-1-b10-00-quantizer-50.ivf +06d0c5e79cc794030c4be022089b1d12c1383f71 *av1-1-b10-00-quantizer-50.ivf.md5 +165c20ee372f83682d094541097e375227353239 *av1-1-b10-00-quantizer-51.ivf +b3d90214b8c6e6f6d9357bb5784d10081325c356 *av1-1-b10-00-quantizer-51.ivf.md5 +5b3ea7a18654d943065f5c176974c3960b56664e *av1-1-b10-00-quantizer-52.ivf +dc61a6e4e2549074130023b14b137fb4fe442ce3 *av1-1-b10-00-quantizer-52.ivf.md5 +74c3b5851b6a94d33b575a689eb8d34592e95d5f *av1-1-b10-00-quantizer-53.ivf +a80e43a0fb2b852426bd941b8d4b8f56690e9bc9 *av1-1-b10-00-quantizer-53.ivf.md5 +d05b8dea2cddd4f0d9e792f42f71afbd29f7811c *av1-1-b10-00-quantizer-54.ivf +432937893321f4bd25fa400b8988c5788cb06ecf *av1-1-b10-00-quantizer-54.ivf.md5 +4eaee0f1970426be0bbeb7d4fccdc7e804e9bea4 *av1-1-b10-00-quantizer-55.ivf +710ab95ce1dcd2540db4477ff4ee6ab771fe0759 *av1-1-b10-00-quantizer-55.ivf.md5 +fe637930c9faa8744cba37effc4cb5510315d1c0 *av1-1-b10-00-quantizer-56.ivf +2f9431b30523fb6a3e4122f22c6c3ff7b96a7987 *av1-1-b10-00-quantizer-56.ivf.md5 +ed54fc7fcec194eef1f50adbbe12a6a36ab6836b *av1-1-b10-00-quantizer-57.ivf +43bccac7800b399210cf15520a83739c23a5d9c7 *av1-1-b10-00-quantizer-57.ivf.md5 +a7b8d628ba3e4c5f37aa6a3d7b82afda73ac89dc *av1-1-b10-00-quantizer-58.ivf +b26638272b787df54f45a46629b852acbcb73e3d *av1-1-b10-00-quantizer-58.ivf.md5 +c077f22ff547fb5ffd020e8dac91d05942fb52df *av1-1-b10-00-quantizer-59.ivf +4efd99cc0891bf345b8cd2ae8e21709d61be497b *av1-1-b10-00-quantizer-59.ivf.md5 +301ab53039d75e1ffa8cc6a0874d9ea94e4a6a0d *av1-1-b10-00-quantizer-60.ivf +4729bd734a6edd2d8d0432a3f66b3d91d565050e *av1-1-b10-00-quantizer-60.ivf.md5 +c78640d3211034df9fcb273bdfc18625819652f2 *av1-1-b10-00-quantizer-61.ivf +3d823eb2b33ccfea68db506626bcbecf49b0f167 *av1-1-b10-00-quantizer-61.ivf.md5 +bf241a449a28773b93e6e529a06dfc28109577e4 *av1-1-b10-00-quantizer-62.ivf +75457d8476f1927f737d089dcf3d0f7f99f3c4fb *av1-1-b10-00-quantizer-62.ivf.md5 +8b6eb3fff2e0db7eac775b08c745250ca591e2d9 *av1-1-b10-00-quantizer-63.ivf +63ea689d025593e5d91760785b8e446d04d4671e *av1-1-b10-00-quantizer-63.ivf.md5 +a9f7ea6312a533cc6426a6145edd190d45813c37 *av1-1-b8-02-allintra.ivf +8fd8f789cfee1069d20f3e2c241f5cad7292239e *av1-1-b8-02-allintra.ivf.md5 +e69e41fee40b408b6eebcc79f266a95f2ee24f9e *av1-1-b8-03-sizedown.mkv +8c528fb3ccda959a29721566e132f730935ca32b *av1-1-b8-03-sizedown.mkv.md5 +1889da5ee1708007e47bb887470ac477e1d7ba01 *av1-1-b8-03-sizeup.mkv +8de81b170635d456602dc8923a8b39c534d01fa8 *av1-1-b8-03-sizeup.mkv.md5 +d3ed7de0aa8c155fe35e0f5f4203240710d31383 *park_joy_90p_8_420_monochrome.y4m +5b3f0907407b809aa66b62cb080feda8c92454ca *park_joy_90p_8_420_vertical_csp.y4m +caf8b6a5f1a5bcb38afae8a54a08c4f4459aafa3 *vase10x10_tiles.txt +e14825f50ff845b8a6932c64cb254007a0b5e3a1 *av1-1-b8-22-svc-L2T1.ivf +0f75f2ac44e61fc83be70c955410fa378e433237 *av1-1-b8-22-svc-L2T1.ivf.md5 +e94687eb0e90179b3800b6d5e11eb7e9bfb34eec *av1-1-b8-22-svc-L1T2.ivf +2bc12b16385ea14323bc79607fb8dfbd7edaf8ef *av1-1-b8-22-svc-L1T2.ivf.md5 +32ef2f14ee9cb11a24a22934f4c065e926e5d236 *av1-1-b8-22-svc-L2T2.ivf +f476a10ff06d750129f8229755d51e17ff141b2a *av1-1-b8-22-svc-L2T2.ivf.md5 +afca5502a489692b0a3c120370b0f43b8fc572a1 *av1-1-b8-04-cdfupdate.ivf +13b9423155a08d5e3a2fd9ae4a973bb046718cdf *av1-1-b8-04-cdfupdate.ivf.md5 +f064290d7fcd3b3de19020e8aec6c43c88d3a505 *av1-1-b8-05-mv.ivf +bff316e63ded5559116bdc2fa4aa97ad7b1a1761 
*av1-1-b8-05-mv.ivf.md5 +b48a717c7c003b8dd23c3c2caed1ac673380fdb3 *av1-1-b8-06-mfmv.ivf +1424e3cb53e00eb56b94f4c725826274212c42b6 *av1-1-b8-06-mfmv.ivf.md5 +f8724ed96272ddbc35776908f2df7cb9955766a9 *paris_352_288_30.y4m +11bb40026103182c23a88133edafca369e5575e2 *av1-1-b8-23-film_grain-50.ivf +c58ccf7ff04711acc559c06f0bfce3c5b14800c3 *av1-1-b8-23-film_grain-50.ivf.md5 +2f883c7e11c21a31f79bd9c809541be90b0c7c4a *av1-1-b10-23-film_grain-50.ivf +83f2094fca597ad38b4fd623b807de1774c53ffb *av1-1-b10-23-film_grain-50.ivf.md5 diff --git a/libs/libaom/src/test/test.cmake b/libs/libaom/src/test/test.cmake new file mode 100644 index 000000000..d4d3b298d --- /dev/null +++ b/libs/libaom/src/test/test.cmake @@ -0,0 +1,471 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_TEST_TEST_CMAKE_) + return() +endif() # AOM_TEST_TEST_CMAKE_ +set(AOM_TEST_TEST_CMAKE_ 1) + +include(FindPythonInterp) +include(ProcessorCount) + +include("${AOM_ROOT}/test/test_data_util.cmake") + +set(AOM_UNIT_TEST_DATA_LIST_FILE "${AOM_ROOT}/test/test-data.sha1") + +list(APPEND AOM_UNIT_TEST_WRAPPER_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c" + "${AOM_ROOT}/test/test_libaom.cc") + +list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/acm_random.h" + "${AOM_ROOT}/test/aom_integer_test.cc" + "${AOM_ROOT}/test/av1_config_test.cc" + "${AOM_ROOT}/test/blockd_test.cc" + "${AOM_ROOT}/test/clear_system_state.h" + "${AOM_ROOT}/test/codec_factory.h" + "${AOM_ROOT}/test/decode_test_driver.cc" + "${AOM_ROOT}/test/decode_test_driver.h" + "${AOM_ROOT}/test/function_equivalence_test.h" + "${AOM_ROOT}/test/log2_test.cc" + "${AOM_ROOT}/test/md5_helper.h" + "${AOM_ROOT}/test/metadata_test.cc" + "${AOM_ROOT}/test/register_state_check.h" + "${AOM_ROOT}/test/test_vectors.cc" + "${AOM_ROOT}/test/test_vectors.h" + "${AOM_ROOT}/test/transform_test_base.h" + "${AOM_ROOT}/test/util.h" + "${AOM_ROOT}/test/video_source.h") + +if(CONFIG_INTERNAL_STATS) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/hbd_metrics_test.cc") +endif() + +list(APPEND AOM_UNIT_TEST_DECODER_SOURCES "${AOM_ROOT}/test/decode_api_test.cc" + "${AOM_ROOT}/test/external_frame_buffer_test.cc" + "${AOM_ROOT}/test/invalid_file_test.cc" + "${AOM_ROOT}/test/test_vector_test.cc" + "${AOM_ROOT}/test/ivf_video_source.h") + +list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES + "${AOM_ROOT}/test/active_map_test.cc" + "${AOM_ROOT}/test/altref_test.cc" + "${AOM_ROOT}/test/aq_segment_test.cc" + "${AOM_ROOT}/test/borders_test.cc" + "${AOM_ROOT}/test/cpu_speed_test.cc" + "${AOM_ROOT}/test/datarate_test.cc" + "${AOM_ROOT}/test/datarate_test.h" + "${AOM_ROOT}/test/svc_datarate_test.cc" + "${AOM_ROOT}/test/encode_api_test.cc" + "${AOM_ROOT}/test/encode_test_driver.cc" + "${AOM_ROOT}/test/encode_test_driver.h" + "${AOM_ROOT}/test/end_to_end_test.cc" + "${AOM_ROOT}/test/fwd_kf_test.cc" + "${AOM_ROOT}/test/gf_pyr_height_test.cc" + "${AOM_ROOT}/test/rt_end_to_end_test.cc" + "${AOM_ROOT}/test/error_resilience_test.cc" + "${AOM_ROOT}/test/frame_size_tests.cc" + "${AOM_ROOT}/test/horz_superres_test.cc" + 
"${AOM_ROOT}/test/i420_video_source.h" + "${AOM_ROOT}/test/level_test.cc" + "${AOM_ROOT}/test/lossless_test.cc" + "${AOM_ROOT}/test/monochrome_test.cc" + "${AOM_ROOT}/test/qm_test.cc" + "${AOM_ROOT}/test/resize_test.cc" + "${AOM_ROOT}/test/scalability_test.cc" + "${AOM_ROOT}/test/y4m_test.cc" + "${AOM_ROOT}/test/y4m_video_source.h" + "${AOM_ROOT}/test/yuv_video_source.h" + "${AOM_ROOT}/test/time_stamp_test.cc") + +list(APPEND AOM_DECODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/decode_perf_test.cc") +list(APPEND AOM_ENCODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/encode_perf_test.cc") +list(APPEND AOM_UNIT_TEST_WEBM_SOURCES "${AOM_ROOT}/test/webm_video_source.h") +list(APPEND AOM_TEST_INTRA_PRED_SPEED_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c" + "${AOM_ROOT}/test/test_intra_pred_speed.cc") + +if(NOT BUILD_SHARED_LIBS) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/av1_common_int_test.cc" + "${AOM_ROOT}/test/cdef_test.cc" + "${AOM_ROOT}/test/cfl_test.cc" + "${AOM_ROOT}/test/convolve_test.cc" + "${AOM_ROOT}/test/hiprec_convolve_test.cc" + "${AOM_ROOT}/test/hiprec_convolve_test_util.cc" + "${AOM_ROOT}/test/hiprec_convolve_test_util.h" + "${AOM_ROOT}/test/intrabc_test.cc" + "${AOM_ROOT}/test/intrapred_test.cc" + "${AOM_ROOT}/test/lpf_test.cc" + "${AOM_ROOT}/test/scan_test.cc" + "${AOM_ROOT}/test/selfguided_filter_test.cc" + "${AOM_ROOT}/test/simd_cmp_impl.h" + "${AOM_ROOT}/test/simd_impl.h") + + if(CONFIG_ACCOUNTING) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/accounting_test.cc") + endif() + + if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/av1_encoder_parms_get_to_decoder.cc" + "${AOM_ROOT}/test/av1_ext_tile_test.cc" + "${AOM_ROOT}/test/binary_codes_test.cc" + "${AOM_ROOT}/test/boolcoder_test.cc" + "${AOM_ROOT}/test/cnn_test.cc" + "${AOM_ROOT}/test/coding_path_sync.cc" + "${AOM_ROOT}/test/decode_multithreaded_test.cc" + "${AOM_ROOT}/test/divu_small_test.cc" + "${AOM_ROOT}/test/dr_prediction_test.cc" + "${AOM_ROOT}/test/ec_test.cc" + "${AOM_ROOT}/test/ethread_test.cc" + "${AOM_ROOT}/test/film_grain_table_test.cc" + "${AOM_ROOT}/test/sb_multipass_test.cc" + "${AOM_ROOT}/test/segment_binarization_sync.cc" + "${AOM_ROOT}/test/superframe_test.cc" + "${AOM_ROOT}/test/tile_independence_test.cc" + "${AOM_ROOT}/test/temporal_filter_planewise_test.cc" + "${AOM_ROOT}/test/temporal_filter_yuv_test.cc") + if(CONFIG_REALTIME_ONLY) + list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/cnn_test.cc" + "${AOM_ROOT}/test/temporal_filter_yuv_test.cc") + endif() + if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/coding_path_sync.cc") + endif() + endif() + + list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_NEON + "${AOM_ROOT}/test/simd_cmp_neon.cc") + if(HAVE_NEON) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/simd_neon_test.cc") + endif() + + list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/test/simd_cmp_sse2.cc") + if(HAVE_SSE2) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/simd_sse2_test.cc") + endif() + + list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/test/simd_cmp_ssse3.cc") + if(HAVE_SSSE3) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/simd_ssse3_test.cc") + endif() + + if(HAVE_SSE4) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/simd_sse4_test.cc") + endif() + + if(HAVE_SSE4_1) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/filterintra_test.cc") + endif() + + 
list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/test/simd_cmp_avx2.cc") + if(HAVE_AVX2) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/simd_avx2_test.cc") + endif() + + list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES + "${AOM_ROOT}/test/arf_freq_test.cc" + "${AOM_ROOT}/test/av1_convolve_2d_test.cc" + "${AOM_ROOT}/test/av1_convolve_2d_test_util.cc" + "${AOM_ROOT}/test/av1_convolve_2d_test_util.h" + "${AOM_ROOT}/test/av1_fwd_txfm1d_test.cc" + "${AOM_ROOT}/test/av1_fwd_txfm2d_test.cc" + "${AOM_ROOT}/test/av1_inv_txfm1d_test.cc" + "${AOM_ROOT}/test/av1_inv_txfm2d_test.cc" + "${AOM_ROOT}/test/av1_nn_predict_test.cc" + "${AOM_ROOT}/test/av1_round_shift_array_test.cc" + "${AOM_ROOT}/test/av1_txfm_test.cc" + "${AOM_ROOT}/test/av1_txfm_test.h" + "${AOM_ROOT}/test/av1_wedge_utils_test.cc" + "${AOM_ROOT}/test/avg_test.cc" + "${AOM_ROOT}/test/blend_a64_mask_1d_test.cc" + "${AOM_ROOT}/test/blend_a64_mask_test.cc" + "${AOM_ROOT}/test/comp_avg_pred_test.cc" + "${AOM_ROOT}/test/comp_avg_pred_test.h" + "${AOM_ROOT}/test/comp_mask_variance_test.cc" + "${AOM_ROOT}/test/edge_detect_test.cc" + "${AOM_ROOT}/test/encodetxb_test.cc" + "${AOM_ROOT}/test/error_block_test.cc" + "${AOM_ROOT}/test/fft_test.cc" + "${AOM_ROOT}/test/fwht4x4_test.cc" + "${AOM_ROOT}/test/fdct4x4_test.cc" + "${AOM_ROOT}/test/hadamard_test.cc" + "${AOM_ROOT}/test/horver_correlation_test.cc" + "${AOM_ROOT}/test/masked_sad_test.cc" + "${AOM_ROOT}/test/masked_variance_test.cc" + "${AOM_ROOT}/test/motion_vector_test.cc" + "${AOM_ROOT}/test/noise_model_test.cc" + "${AOM_ROOT}/test/obmc_sad_test.cc" + "${AOM_ROOT}/test/obmc_variance_test.cc" + "${AOM_ROOT}/test/pickrst_test.cc" + "${AOM_ROOT}/test/quantize_func_test.cc" + "${AOM_ROOT}/test/sad_test.cc" + "${AOM_ROOT}/test/subtract_test.cc" + "${AOM_ROOT}/test/reconinter_test.cc" + "${AOM_ROOT}/test/sum_squares_test.cc" + "${AOM_ROOT}/test/variance_test.cc" + "${AOM_ROOT}/test/wiener_test.cc" + "${AOM_ROOT}/test/frame_error_test.cc" + "${AOM_ROOT}/test/warp_filter_test.cc" + "${AOM_ROOT}/test/warp_filter_test_util.cc" + "${AOM_ROOT}/test/warp_filter_test_util.h") + + list(APPEND AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/test/av1_highbd_iht_test.cc" + "${AOM_ROOT}/test/av1_quantize_test.cc" + "${AOM_ROOT}/test/corner_match_test.cc" + "${AOM_ROOT}/test/simd_cmp_sse4.cc") + + if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/test/av1_quantize_test.cc") + endif() + + if(NOT (HAVE_SSE2 OR HAVE_NEON)) + list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES + "${AOM_ROOT}/test/quantize_func_test.cc") + endif() + + if(HAVE_SSE4_1) + list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES + "${AOM_ROOT}/test/av1_convolve_scale_test.cc" + "${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc" + "${AOM_ROOT}/test/intra_edge_test.cc") + + endif() + + if(HAVE_SSE4_2) + list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES "${AOM_ROOT}/test/hash_test.cc") + endif() + +endif() + +if(ENABLE_TESTS) + find_package(PythonInterp) + if(NOT PYTHONINTERP_FOUND) + message( + FATAL_ERROR "--- Unit tests require Python, rerun cmake with " + "-DENABLE_TESTS=0 to avoid this error, or install Python and " + "make sure it's in your PATH.") + endif() + + if(BUILD_SHARED_LIBS AND APPLE) # Silence an RPATH warning. 
+    set(CMAKE_MACOSX_RPATH 1)
+  endif()
+
+  include_directories(
+    "${AOM_ROOT}/third_party/googletest/src/googletest/include")
+
+  include_directories("${AOM_ROOT}/third_party/googletest/src/googletest")
+  add_library(
+    aom_gtest STATIC
+    "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc")
+  if(MSVC OR WIN32)
+    target_compile_definitions(aom_gtest PRIVATE GTEST_OS_WINDOWS=1)
+  elseif(CONFIG_MULTITHREAD AND CMAKE_USE_PTHREADS_INIT)
+    target_compile_definitions(aom_gtest PRIVATE GTEST_HAS_PTHREAD=1)
+  else()
+    target_compile_definitions(aom_gtest PRIVATE GTEST_HAS_PTHREAD=0)
+  endif()
+endif()
+
+# Setup testdata download targets, test build targets, and test run targets.
+# The libaom and app util targets must exist before this function is called.
+function(setup_aom_test_targets)
+
+  # TODO(tomfinegan): Build speed optimization. $AOM_UNIT_TEST_COMMON_SOURCES
+  # and $AOM_UNIT_TEST_ENCODER_SOURCES are very large. The build of test
+  # targets could be sped up (on multicore build machines) by compiling
+  # sources in each list into separate object library targets, and then
+  # linking them into test_libaom.
+  add_library(test_aom_common OBJECT ${AOM_UNIT_TEST_COMMON_SOURCES})
+  add_dependencies(test_aom_common aom)
+
+  if(CONFIG_AV1_DECODER)
+    add_library(test_aom_decoder OBJECT ${AOM_UNIT_TEST_DECODER_SOURCES})
+    add_dependencies(test_aom_decoder aom)
+  endif()
+
+  if(CONFIG_AV1_ENCODER)
+    add_library(test_aom_encoder OBJECT ${AOM_UNIT_TEST_ENCODER_SOURCES})
+    add_dependencies(test_aom_encoder aom)
+  endif()
+
+  add_executable(test_libaom ${AOM_UNIT_TEST_WRAPPER_SOURCES}
+                 $<TARGET_OBJECTS:aom_common_app_util>
+                 $<TARGET_OBJECTS:test_aom_common>)
+  list(APPEND AOM_APP_TARGETS test_libaom)
+
+  if(CONFIG_AV1_DECODER)
+    target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:aom_decoder_app_util>
+                                       $<TARGET_OBJECTS:test_aom_decoder>)
+
+    if(ENABLE_DECODE_PERF_TESTS AND CONFIG_WEBM_IO)
+      target_sources(test_libaom PRIVATE ${AOM_DECODE_PERF_TEST_SOURCES})
+    endif()
+  endif()
+
+  if(CONFIG_AV1_ENCODER)
+    target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:aom_encoder_app_util>
+                                       $<TARGET_OBJECTS:test_aom_encoder>)
+
+    if(ENABLE_ENCODE_PERF_TESTS)
+      target_sources(test_libaom PRIVATE ${AOM_ENCODE_PERF_TEST_SOURCES})
+    endif()
+
+    if(NOT BUILD_SHARED_LIBS)
+      add_executable(test_intra_pred_speed
+                     ${AOM_TEST_INTRA_PRED_SPEED_SOURCES}
+                     $<TARGET_OBJECTS:aom_common_app_util>)
+      target_link_libraries(test_intra_pred_speed ${AOM_LIB_LINK_TYPE} aom
+                            aom_gtest)
+      list(APPEND AOM_APP_TARGETS test_intra_pred_speed)
+    endif()
+  endif()
+
+  target_link_libraries(test_libaom ${AOM_LIB_LINK_TYPE} aom aom_gtest)
+
+  if(CONFIG_LIBYUV)
+    target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:yuv>)
+  endif()
+  if(CONFIG_WEBM_IO)
+    target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:webm>)
+  endif()
+  if(HAVE_SSE2)
+    add_intrinsics_source_to_target("-msse2" "test_libaom"
+                                    "AOM_UNIT_TEST_COMMON_INTRIN_SSE2")
+  endif()
+  if(HAVE_SSSE3)
+    add_intrinsics_source_to_target("-mssse3" "test_libaom"
+                                    "AOM_UNIT_TEST_COMMON_INTRIN_SSSE3")
+  endif()
+  if(HAVE_SSE4_1)
+    add_intrinsics_source_to_target("-msse4.1" "test_libaom"
+                                    "AOM_UNIT_TEST_COMMON_INTRIN_SSE4_1")
+    if(CONFIG_AV1_ENCODER)
+      if(AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1)
+        add_intrinsics_source_to_target("-msse4.1" "test_libaom"
+                                        "AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1")
+      endif()
+    endif()
+  endif()
+  if(HAVE_AVX2)
+    add_intrinsics_source_to_target("-mavx2" "test_libaom"
+                                    "AOM_UNIT_TEST_COMMON_INTRIN_AVX2")
+  endif()
+  if(HAVE_NEON)
+    add_intrinsics_source_to_target("${AOM_NEON_INTRIN_FLAG}" "test_libaom"
+                                    "AOM_UNIT_TEST_COMMON_INTRIN_NEON")
+  endif()
+
+  if(ENABLE_TESTDATA)
+    make_test_data_lists("${AOM_UNIT_TEST_DATA_LIST_FILE}" test_files
+                         test_file_checksums)
+    list(LENGTH test_files num_test_files)
+    list(LENGTH test_file_checksums num_test_file_checksums)
+
+    math(EXPR max_file_index "${num_test_files} - 1")
+    foreach(test_index RANGE ${max_file_index})
+      list(GET test_files ${test_index} test_file)
+      list(GET test_file_checksums ${test_index} test_file_checksum)
+      add_custom_target(
+        testdata_${test_index}
+        COMMAND ${CMAKE_COMMAND}
+                -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}" -DAOM_ROOT="${AOM_ROOT}"
+                -DAOM_TEST_FILE="${test_file}"
+                -DAOM_TEST_CHECKSUM=${test_file_checksum} -P
+                "${AOM_ROOT}/test/test_data_download_worker.cmake")
+      list(APPEND testdata_targets testdata_${test_index})
+    endforeach()
+
+    # Create a custom build target for running each test data download target.
+    add_custom_target(testdata)
+    add_dependencies(testdata ${testdata_targets})
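
[Editor's note -- annotation, not part of the upstream patch.] Each testdata_<N> target above just re-runs CMake in script mode on test_data_download_worker.cmake, so building the aggregate `testdata` target fetches every file selected from test-data.sha1. A single file can be fetched by hand with a small driver script; in this sketch the two directory paths are placeholders, while the file name and checksum are copied from the test-data.sha1 hunk earlier in this patch:

  # fetch_one.cmake -- illustrative sketch only; run with: cmake -P fetch_one.cmake
  set(AOM_ROOT "/path/to/libs/libaom/src")   # placeholder, adjust locally
  set(AOM_CONFIG_DIR "/path/to/build")       # placeholder, adjust locally
  set(AOM_TEST_FILE "paris_352_288_30.y4m")  # entry taken from test-data.sha1
  set(AOM_TEST_CHECKSUM "f8724ed96272ddbc35776908f2df7cb9955766a9")
  # The worker validates the four variables above, honors
  # $LIBAOM_TEST_DATA_PATH, compares the local SHA1, and downloads only when
  # the file is missing or stale.
  include("${AOM_ROOT}/test/test_data_download_worker.cmake")
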
+
+    # Skip creation of test run targets when generating for Visual Studio and
+    # Xcode unless the user explicitly requests IDE test hosting. This is done
+    # to make build cycles in the IDE tolerable when the IDE command for build
+    # project is used to build AOM. Default behavior in IDEs is to build all
+    # targets, and the test run takes hours.
+    if(((NOT MSVC) AND (NOT XCODE)) OR ENABLE_IDE_TEST_HOSTING)
+
+      # Pick a reasonable number of targets (this controls parallelization).
+      processorcount(num_test_targets)
+      if(num_test_targets EQUAL 0) # Just default to 10 targets when there's no
+                                   # processor count available.
+        set(num_test_targets 10)
+      endif()
+
+      math(EXPR max_shard_index "${num_test_targets} - 1")
+      foreach(shard_index RANGE ${max_shard_index})
+        set(test_name "test_${shard_index}")
+        add_custom_target(${test_name}
+                          COMMAND ${CMAKE_COMMAND}
+                                  -DGTEST_SHARD_INDEX=${shard_index}
+                                  -DGTEST_TOTAL_SHARDS=${num_test_targets}
+                                  -DTEST_LIBAOM=$<TARGET_FILE:test_libaom> -P
+                                  "${AOM_ROOT}/test/test_runner.cmake"
+                          DEPENDS testdata test_libaom)
+        list(APPEND test_targets ${test_name})
+      endforeach()
+      add_custom_target(runtests)
+      add_dependencies(runtests ${test_targets})
+    endif()
+  endif()
+
+  # Collect all variables containing libaom test source files.
+  get_cmake_property(all_cmake_vars VARIABLES)
+  foreach(var ${all_cmake_vars})
+
+    # https://github.com/cheshirekow/cmake_format/issues/34
+    # cmake-format: off
+    if (("${var}" MATCHES "_TEST_" AND NOT
+         "${var}" MATCHES
+         "_DATA_\|_CMAKE_\|INTRA_PRED\|_COMPILED\|_HOSTING\|_PERF_\|CODER_")
+        OR (CONFIG_AV1_ENCODER AND ENABLE_ENCODE_PERF_TESTS AND
+            "${var}" MATCHES "_ENCODE_PERF_TEST_")
+        OR (CONFIG_AV1_DECODER AND ENABLE_DECODE_PERF_TESTS AND
+            "${var}" MATCHES "_DECODE_PERF_TEST_")
+        OR (CONFIG_AV1_ENCODER AND "${var}" MATCHES "_TEST_ENCODER_")
+        OR (CONFIG_AV1_DECODER AND "${var}" MATCHES "_TEST_DECODER_"))
+      list(APPEND aom_test_source_vars ${var})
+    endif()
+    # cmake-format: on
+  endforeach()
+
+  # Libaom_test_srcs.txt generation.
+  set(libaom_test_srcs_txt_file "${AOM_CONFIG_DIR}/libaom_test_srcs.txt")
+  file(WRITE "${libaom_test_srcs_txt_file}"
+       "# This file is generated. DO NOT EDIT.\n")
+
+  # Static source file list first.
+ foreach(aom_test_source_var ${aom_test_source_vars}) + foreach(file ${${aom_test_source_var}}) + if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}") + string(REPLACE "${AOM_ROOT}/" "" file "${file}") + file(APPEND "${libaom_test_srcs_txt_file}" "${file}\n") + endif() + endforeach() + endforeach() + + set(AOM_APP_TARGETS ${AOM_APP_TARGETS} PARENT_SCOPE) +endfunction() diff --git a/libs/libaom/src/test/test_data_download_worker.cmake b/libs/libaom/src/test/test_data_download_worker.cmake new file mode 100644 index 000000000..a49038888 --- /dev/null +++ b/libs/libaom/src/test/test_data_download_worker.cmake @@ -0,0 +1,46 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +include("${AOM_ROOT}/test/test_data_util.cmake") + +# https://github.com/cheshirekow/cmake_format/issues/34 +# cmake-format: off +if (NOT AOM_ROOT OR NOT AOM_CONFIG_DIR OR NOT AOM_TEST_FILE + OR NOT AOM_TEST_CHECKSUM) + message(FATAL_ERROR + "AOM_ROOT, AOM_CONFIG_DIR, AOM_TEST_FILE and AOM_TEST_CHECKSUM must be + defined.") +endif () +# cmake-format: on + +set(AOM_TEST_DATA_URL "https://storage.googleapis.com/aom-test-data") + +if(NOT AOM_TEST_DATA_PATH) + set(AOM_TEST_DATA_PATH "$ENV{LIBAOM_TEST_DATA_PATH}") +endif() + +if("${AOM_TEST_DATA_PATH}" STREQUAL "") + message( + WARNING "Writing test data to ${AOM_CONFIG_DIR}, set " + "$LIBAOM_TEST_DATA_PATH in your environment to avoid this warning.") + set(AOM_TEST_DATA_PATH "${AOM_CONFIG_DIR}") +endif() + +if(NOT EXISTS "${AOM_TEST_DATA_PATH}") + file(MAKE_DIRECTORY "${AOM_TEST_DATA_PATH}") +endif() + +expand_test_file_paths("AOM_TEST_FILE" "${AOM_TEST_DATA_PATH}" "filepath") +expand_test_file_paths("AOM_TEST_FILE" "${AOM_TEST_DATA_URL}" "url") + +check_file("${filepath}" "${AOM_TEST_CHECKSUM}" "needs_download") +if(needs_download) + download_test_file("${url}" "${AOM_TEST_CHECKSUM}" "${filepath}") +endif() diff --git a/libs/libaom/src/test/test_data_util.cmake b/libs/libaom/src/test/test_data_util.cmake new file mode 100644 index 000000000..050600e13 --- /dev/null +++ b/libs/libaom/src/test/test_data_util.cmake @@ -0,0 +1,650 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. 
+# + +list(APPEND AOM_TEST_DATA_FILE_NAMES + "hantro_collage_w352h288.yuv" + "hantro_odd.yuv" + "paris_352_288_30.y4m" + "park_joy_90p_10_420.y4m" + "park_joy_90p_10_422.y4m" + "park_joy_90p_10_444.y4m" + "park_joy_90p_12_420.y4m" + "park_joy_90p_12_422.y4m" + "park_joy_90p_12_444.y4m" + "park_joy_90p_8_420_a10-1.y4m" + "park_joy_90p_8_420.y4m" + "park_joy_90p_8_420_monochrome.y4m" + "park_joy_90p_8_420_vertical_csp.y4m" + "park_joy_90p_8_422.y4m" + "park_joy_90p_8_444.y4m" + "pixel_capture_w320h240.yuv" + "desktop_credits.y4m" + "niklas_1280_720_30.y4m" + "rush_hour_444.y4m" + "screendata.y4m" + "niklas_640_480_30.yuv" + "vase10x10.yuv" + "vase10x10_tiles.txt") + +if(ENABLE_DECODE_PERF_TESTS AND CONFIG_AV1_ENCODER) + list(APPEND AOM_TEST_DATA_FILE_NAMES "niklas_1280_720_30.yuv") +endif() + +if(CONFIG_AV1_DECODER) + list(APPEND AOM_TEST_DATA_FILE_NAMES + "av1-1-b8-00-quantizer-00.ivf" + "av1-1-b8-00-quantizer-00.ivf.md5" + "av1-1-b8-00-quantizer-01.ivf" + "av1-1-b8-00-quantizer-01.ivf.md5" + "av1-1-b8-00-quantizer-02.ivf" + "av1-1-b8-00-quantizer-02.ivf.md5" + "av1-1-b8-00-quantizer-03.ivf" + "av1-1-b8-00-quantizer-03.ivf.md5" + "av1-1-b8-00-quantizer-04.ivf" + "av1-1-b8-00-quantizer-04.ivf.md5" + "av1-1-b8-00-quantizer-05.ivf" + "av1-1-b8-00-quantizer-05.ivf.md5" + "av1-1-b8-00-quantizer-06.ivf" + "av1-1-b8-00-quantizer-06.ivf.md5" + "av1-1-b8-00-quantizer-07.ivf" + "av1-1-b8-00-quantizer-07.ivf.md5" + "av1-1-b8-00-quantizer-08.ivf" + "av1-1-b8-00-quantizer-08.ivf.md5" + "av1-1-b8-00-quantizer-09.ivf" + "av1-1-b8-00-quantizer-09.ivf.md5" + "av1-1-b8-00-quantizer-10.ivf" + "av1-1-b8-00-quantizer-10.ivf.md5" + "av1-1-b8-00-quantizer-11.ivf" + "av1-1-b8-00-quantizer-11.ivf.md5" + "av1-1-b8-00-quantizer-12.ivf" + "av1-1-b8-00-quantizer-12.ivf.md5" + "av1-1-b8-00-quantizer-13.ivf" + "av1-1-b8-00-quantizer-13.ivf.md5" + "av1-1-b8-00-quantizer-14.ivf" + "av1-1-b8-00-quantizer-14.ivf.md5" + "av1-1-b8-00-quantizer-15.ivf" + "av1-1-b8-00-quantizer-15.ivf.md5" + "av1-1-b8-00-quantizer-16.ivf" + "av1-1-b8-00-quantizer-16.ivf.md5" + "av1-1-b8-00-quantizer-17.ivf" + "av1-1-b8-00-quantizer-17.ivf.md5" + "av1-1-b8-00-quantizer-18.ivf" + "av1-1-b8-00-quantizer-18.ivf.md5" + "av1-1-b8-00-quantizer-19.ivf" + "av1-1-b8-00-quantizer-19.ivf.md5" + "av1-1-b8-00-quantizer-20.ivf" + "av1-1-b8-00-quantizer-20.ivf.md5" + "av1-1-b8-00-quantizer-21.ivf" + "av1-1-b8-00-quantizer-21.ivf.md5" + "av1-1-b8-00-quantizer-22.ivf" + "av1-1-b8-00-quantizer-22.ivf.md5" + "av1-1-b8-00-quantizer-23.ivf" + "av1-1-b8-00-quantizer-23.ivf.md5" + "av1-1-b8-00-quantizer-24.ivf" + "av1-1-b8-00-quantizer-24.ivf.md5" + "av1-1-b8-00-quantizer-25.ivf" + "av1-1-b8-00-quantizer-25.ivf.md5" + "av1-1-b8-00-quantizer-26.ivf" + "av1-1-b8-00-quantizer-26.ivf.md5" + "av1-1-b8-00-quantizer-27.ivf" + "av1-1-b8-00-quantizer-27.ivf.md5" + "av1-1-b8-00-quantizer-28.ivf" + "av1-1-b8-00-quantizer-28.ivf.md5" + "av1-1-b8-00-quantizer-29.ivf" + "av1-1-b8-00-quantizer-29.ivf.md5" + "av1-1-b8-00-quantizer-30.ivf" + "av1-1-b8-00-quantizer-30.ivf.md5" + "av1-1-b8-00-quantizer-31.ivf" + "av1-1-b8-00-quantizer-31.ivf.md5" + "av1-1-b8-00-quantizer-32.ivf" + "av1-1-b8-00-quantizer-32.ivf.md5" + "av1-1-b8-00-quantizer-33.ivf" + "av1-1-b8-00-quantizer-33.ivf.md5" + "av1-1-b8-00-quantizer-34.ivf" + "av1-1-b8-00-quantizer-34.ivf.md5" + "av1-1-b8-00-quantizer-35.ivf" + "av1-1-b8-00-quantizer-35.ivf.md5" + "av1-1-b8-00-quantizer-36.ivf" + "av1-1-b8-00-quantizer-36.ivf.md5" + "av1-1-b8-00-quantizer-37.ivf" + "av1-1-b8-00-quantizer-37.ivf.md5" + 
"av1-1-b8-00-quantizer-38.ivf" + "av1-1-b8-00-quantizer-38.ivf.md5" + "av1-1-b8-00-quantizer-39.ivf" + "av1-1-b8-00-quantizer-39.ivf.md5" + "av1-1-b8-00-quantizer-40.ivf" + "av1-1-b8-00-quantizer-40.ivf.md5" + "av1-1-b8-00-quantizer-41.ivf" + "av1-1-b8-00-quantizer-41.ivf.md5" + "av1-1-b8-00-quantizer-42.ivf" + "av1-1-b8-00-quantizer-42.ivf.md5" + "av1-1-b8-00-quantizer-43.ivf" + "av1-1-b8-00-quantizer-43.ivf.md5" + "av1-1-b8-00-quantizer-44.ivf" + "av1-1-b8-00-quantizer-44.ivf.md5" + "av1-1-b8-00-quantizer-45.ivf" + "av1-1-b8-00-quantizer-45.ivf.md5" + "av1-1-b8-00-quantizer-46.ivf" + "av1-1-b8-00-quantizer-46.ivf.md5" + "av1-1-b8-00-quantizer-47.ivf" + "av1-1-b8-00-quantizer-47.ivf.md5" + "av1-1-b8-00-quantizer-48.ivf" + "av1-1-b8-00-quantizer-48.ivf.md5" + "av1-1-b8-00-quantizer-49.ivf" + "av1-1-b8-00-quantizer-49.ivf.md5" + "av1-1-b8-00-quantizer-50.ivf" + "av1-1-b8-00-quantizer-50.ivf.md5" + "av1-1-b8-00-quantizer-51.ivf" + "av1-1-b8-00-quantizer-51.ivf.md5" + "av1-1-b8-00-quantizer-52.ivf" + "av1-1-b8-00-quantizer-52.ivf.md5" + "av1-1-b8-00-quantizer-53.ivf" + "av1-1-b8-00-quantizer-53.ivf.md5" + "av1-1-b8-00-quantizer-54.ivf" + "av1-1-b8-00-quantizer-54.ivf.md5" + "av1-1-b8-00-quantizer-55.ivf" + "av1-1-b8-00-quantizer-55.ivf.md5" + "av1-1-b8-00-quantizer-56.ivf" + "av1-1-b8-00-quantizer-56.ivf.md5" + "av1-1-b8-00-quantizer-57.ivf" + "av1-1-b8-00-quantizer-57.ivf.md5" + "av1-1-b8-00-quantizer-58.ivf" + "av1-1-b8-00-quantizer-58.ivf.md5" + "av1-1-b8-00-quantizer-59.ivf" + "av1-1-b8-00-quantizer-59.ivf.md5" + "av1-1-b8-00-quantizer-60.ivf" + "av1-1-b8-00-quantizer-60.ivf.md5" + "av1-1-b8-00-quantizer-61.ivf" + "av1-1-b8-00-quantizer-61.ivf.md5" + "av1-1-b8-00-quantizer-62.ivf" + "av1-1-b8-00-quantizer-62.ivf.md5" + "av1-1-b8-00-quantizer-63.ivf" + "av1-1-b8-00-quantizer-63.ivf.md5" + "av1-1-b10-00-quantizer-00.ivf" + "av1-1-b10-00-quantizer-00.ivf.md5" + "av1-1-b10-00-quantizer-01.ivf" + "av1-1-b10-00-quantizer-01.ivf.md5" + "av1-1-b10-00-quantizer-02.ivf" + "av1-1-b10-00-quantizer-02.ivf.md5" + "av1-1-b10-00-quantizer-03.ivf" + "av1-1-b10-00-quantizer-03.ivf.md5" + "av1-1-b10-00-quantizer-04.ivf" + "av1-1-b10-00-quantizer-04.ivf.md5" + "av1-1-b10-00-quantizer-05.ivf" + "av1-1-b10-00-quantizer-05.ivf.md5" + "av1-1-b10-00-quantizer-06.ivf" + "av1-1-b10-00-quantizer-06.ivf.md5" + "av1-1-b10-00-quantizer-07.ivf" + "av1-1-b10-00-quantizer-07.ivf.md5" + "av1-1-b10-00-quantizer-08.ivf" + "av1-1-b10-00-quantizer-08.ivf.md5" + "av1-1-b10-00-quantizer-09.ivf" + "av1-1-b10-00-quantizer-09.ivf.md5" + "av1-1-b10-00-quantizer-10.ivf" + "av1-1-b10-00-quantizer-10.ivf.md5" + "av1-1-b10-00-quantizer-11.ivf" + "av1-1-b10-00-quantizer-11.ivf.md5" + "av1-1-b10-00-quantizer-12.ivf" + "av1-1-b10-00-quantizer-12.ivf.md5" + "av1-1-b10-00-quantizer-13.ivf" + "av1-1-b10-00-quantizer-13.ivf.md5" + "av1-1-b10-00-quantizer-14.ivf" + "av1-1-b10-00-quantizer-14.ivf.md5" + "av1-1-b10-00-quantizer-15.ivf" + "av1-1-b10-00-quantizer-15.ivf.md5" + "av1-1-b10-00-quantizer-16.ivf" + "av1-1-b10-00-quantizer-16.ivf.md5" + "av1-1-b10-00-quantizer-17.ivf" + "av1-1-b10-00-quantizer-17.ivf.md5" + "av1-1-b10-00-quantizer-18.ivf" + "av1-1-b10-00-quantizer-18.ivf.md5" + "av1-1-b10-00-quantizer-19.ivf" + "av1-1-b10-00-quantizer-19.ivf.md5" + "av1-1-b10-00-quantizer-20.ivf" + "av1-1-b10-00-quantizer-20.ivf.md5" + "av1-1-b10-00-quantizer-21.ivf" + "av1-1-b10-00-quantizer-21.ivf.md5" + "av1-1-b10-00-quantizer-22.ivf" + "av1-1-b10-00-quantizer-22.ivf.md5" + "av1-1-b10-00-quantizer-23.ivf" + "av1-1-b10-00-quantizer-23.ivf.md5" + 
"av1-1-b10-00-quantizer-24.ivf" + "av1-1-b10-00-quantizer-24.ivf.md5" + "av1-1-b10-00-quantizer-25.ivf" + "av1-1-b10-00-quantizer-25.ivf.md5" + "av1-1-b10-00-quantizer-26.ivf" + "av1-1-b10-00-quantizer-26.ivf.md5" + "av1-1-b10-00-quantizer-27.ivf" + "av1-1-b10-00-quantizer-27.ivf.md5" + "av1-1-b10-00-quantizer-28.ivf" + "av1-1-b10-00-quantizer-28.ivf.md5" + "av1-1-b10-00-quantizer-29.ivf" + "av1-1-b10-00-quantizer-29.ivf.md5" + "av1-1-b10-00-quantizer-30.ivf" + "av1-1-b10-00-quantizer-30.ivf.md5" + "av1-1-b10-00-quantizer-31.ivf" + "av1-1-b10-00-quantizer-31.ivf.md5" + "av1-1-b10-00-quantizer-32.ivf" + "av1-1-b10-00-quantizer-32.ivf.md5" + "av1-1-b10-00-quantizer-33.ivf" + "av1-1-b10-00-quantizer-33.ivf.md5" + "av1-1-b10-00-quantizer-34.ivf" + "av1-1-b10-00-quantizer-34.ivf.md5" + "av1-1-b10-00-quantizer-35.ivf" + "av1-1-b10-00-quantizer-35.ivf.md5" + "av1-1-b10-00-quantizer-36.ivf" + "av1-1-b10-00-quantizer-36.ivf.md5" + "av1-1-b10-00-quantizer-37.ivf" + "av1-1-b10-00-quantizer-37.ivf.md5" + "av1-1-b10-00-quantizer-38.ivf" + "av1-1-b10-00-quantizer-38.ivf.md5" + "av1-1-b10-00-quantizer-39.ivf" + "av1-1-b10-00-quantizer-39.ivf.md5" + "av1-1-b10-00-quantizer-40.ivf" + "av1-1-b10-00-quantizer-40.ivf.md5" + "av1-1-b10-00-quantizer-41.ivf" + "av1-1-b10-00-quantizer-41.ivf.md5" + "av1-1-b10-00-quantizer-42.ivf" + "av1-1-b10-00-quantizer-42.ivf.md5" + "av1-1-b10-00-quantizer-43.ivf" + "av1-1-b10-00-quantizer-43.ivf.md5" + "av1-1-b10-00-quantizer-44.ivf" + "av1-1-b10-00-quantizer-44.ivf.md5" + "av1-1-b10-00-quantizer-45.ivf" + "av1-1-b10-00-quantizer-45.ivf.md5" + "av1-1-b10-00-quantizer-46.ivf" + "av1-1-b10-00-quantizer-46.ivf.md5" + "av1-1-b10-00-quantizer-47.ivf" + "av1-1-b10-00-quantizer-47.ivf.md5" + "av1-1-b10-00-quantizer-48.ivf" + "av1-1-b10-00-quantizer-48.ivf.md5" + "av1-1-b10-00-quantizer-49.ivf" + "av1-1-b10-00-quantizer-49.ivf.md5" + "av1-1-b10-00-quantizer-50.ivf" + "av1-1-b10-00-quantizer-50.ivf.md5" + "av1-1-b10-00-quantizer-51.ivf" + "av1-1-b10-00-quantizer-51.ivf.md5" + "av1-1-b10-00-quantizer-52.ivf" + "av1-1-b10-00-quantizer-52.ivf.md5" + "av1-1-b10-00-quantizer-53.ivf" + "av1-1-b10-00-quantizer-53.ivf.md5" + "av1-1-b10-00-quantizer-54.ivf" + "av1-1-b10-00-quantizer-54.ivf.md5" + "av1-1-b10-00-quantizer-55.ivf" + "av1-1-b10-00-quantizer-55.ivf.md5" + "av1-1-b10-00-quantizer-56.ivf" + "av1-1-b10-00-quantizer-56.ivf.md5" + "av1-1-b10-00-quantizer-57.ivf" + "av1-1-b10-00-quantizer-57.ivf.md5" + "av1-1-b10-00-quantizer-58.ivf" + "av1-1-b10-00-quantizer-58.ivf.md5" + "av1-1-b10-00-quantizer-59.ivf" + "av1-1-b10-00-quantizer-59.ivf.md5" + "av1-1-b10-00-quantizer-60.ivf" + "av1-1-b10-00-quantizer-60.ivf.md5" + "av1-1-b10-00-quantizer-61.ivf" + "av1-1-b10-00-quantizer-61.ivf.md5" + "av1-1-b10-00-quantizer-62.ivf" + "av1-1-b10-00-quantizer-62.ivf.md5" + "av1-1-b10-00-quantizer-63.ivf" + "av1-1-b10-00-quantizer-63.ivf.md5" + "av1-1-b10-23-film_grain-50.ivf" + "av1-1-b10-23-film_grain-50.ivf.md5" + "av1-1-b8-01-size-16x16.ivf" + "av1-1-b8-01-size-16x16.ivf.md5" + "av1-1-b8-01-size-16x18.ivf" + "av1-1-b8-01-size-16x18.ivf.md5" + "av1-1-b8-01-size-16x32.ivf" + "av1-1-b8-01-size-16x32.ivf.md5" + "av1-1-b8-01-size-16x34.ivf" + "av1-1-b8-01-size-16x34.ivf.md5" + "av1-1-b8-01-size-16x64.ivf" + "av1-1-b8-01-size-16x64.ivf.md5" + "av1-1-b8-01-size-16x66.ivf" + "av1-1-b8-01-size-16x66.ivf.md5" + "av1-1-b8-01-size-18x16.ivf" + "av1-1-b8-01-size-18x16.ivf.md5" + "av1-1-b8-01-size-18x18.ivf" + "av1-1-b8-01-size-18x18.ivf.md5" + "av1-1-b8-01-size-18x32.ivf" + "av1-1-b8-01-size-18x32.ivf.md5" + 
"av1-1-b8-01-size-18x34.ivf" + "av1-1-b8-01-size-18x34.ivf.md5" + "av1-1-b8-01-size-18x64.ivf" + "av1-1-b8-01-size-18x64.ivf.md5" + "av1-1-b8-01-size-18x66.ivf" + "av1-1-b8-01-size-18x66.ivf.md5" + "av1-1-b8-01-size-196x196.ivf" + "av1-1-b8-01-size-196x196.ivf.md5" + "av1-1-b8-01-size-196x198.ivf" + "av1-1-b8-01-size-196x198.ivf.md5" + "av1-1-b8-01-size-196x200.ivf" + "av1-1-b8-01-size-196x200.ivf.md5" + "av1-1-b8-01-size-196x202.ivf" + "av1-1-b8-01-size-196x202.ivf.md5" + "av1-1-b8-01-size-196x208.ivf" + "av1-1-b8-01-size-196x208.ivf.md5" + "av1-1-b8-01-size-196x210.ivf" + "av1-1-b8-01-size-196x210.ivf.md5" + "av1-1-b8-01-size-196x224.ivf" + "av1-1-b8-01-size-196x224.ivf.md5" + "av1-1-b8-01-size-196x226.ivf" + "av1-1-b8-01-size-196x226.ivf.md5" + "av1-1-b8-01-size-198x196.ivf" + "av1-1-b8-01-size-198x196.ivf.md5" + "av1-1-b8-01-size-198x198.ivf" + "av1-1-b8-01-size-198x198.ivf.md5" + "av1-1-b8-01-size-198x200.ivf" + "av1-1-b8-01-size-198x200.ivf.md5" + "av1-1-b8-01-size-198x202.ivf" + "av1-1-b8-01-size-198x202.ivf.md5" + "av1-1-b8-01-size-198x208.ivf" + "av1-1-b8-01-size-198x208.ivf.md5" + "av1-1-b8-01-size-198x210.ivf" + "av1-1-b8-01-size-198x210.ivf.md5" + "av1-1-b8-01-size-198x224.ivf" + "av1-1-b8-01-size-198x224.ivf.md5" + "av1-1-b8-01-size-198x226.ivf" + "av1-1-b8-01-size-198x226.ivf.md5" + "av1-1-b8-01-size-200x196.ivf" + "av1-1-b8-01-size-200x196.ivf.md5" + "av1-1-b8-01-size-200x198.ivf" + "av1-1-b8-01-size-200x198.ivf.md5" + "av1-1-b8-01-size-200x200.ivf" + "av1-1-b8-01-size-200x200.ivf.md5" + "av1-1-b8-01-size-200x202.ivf" + "av1-1-b8-01-size-200x202.ivf.md5" + "av1-1-b8-01-size-200x208.ivf" + "av1-1-b8-01-size-200x208.ivf.md5" + "av1-1-b8-01-size-200x210.ivf" + "av1-1-b8-01-size-200x210.ivf.md5" + "av1-1-b8-01-size-200x224.ivf" + "av1-1-b8-01-size-200x224.ivf.md5" + "av1-1-b8-01-size-200x226.ivf" + "av1-1-b8-01-size-200x226.ivf.md5" + "av1-1-b8-01-size-202x196.ivf" + "av1-1-b8-01-size-202x196.ivf.md5" + "av1-1-b8-01-size-202x198.ivf" + "av1-1-b8-01-size-202x198.ivf.md5" + "av1-1-b8-01-size-202x200.ivf" + "av1-1-b8-01-size-202x200.ivf.md5" + "av1-1-b8-01-size-202x202.ivf" + "av1-1-b8-01-size-202x202.ivf.md5" + "av1-1-b8-01-size-202x208.ivf" + "av1-1-b8-01-size-202x208.ivf.md5" + "av1-1-b8-01-size-202x210.ivf" + "av1-1-b8-01-size-202x210.ivf.md5" + "av1-1-b8-01-size-202x224.ivf" + "av1-1-b8-01-size-202x224.ivf.md5" + "av1-1-b8-01-size-202x226.ivf" + "av1-1-b8-01-size-202x226.ivf.md5" + "av1-1-b8-01-size-208x196.ivf" + "av1-1-b8-01-size-208x196.ivf.md5" + "av1-1-b8-01-size-208x198.ivf" + "av1-1-b8-01-size-208x198.ivf.md5" + "av1-1-b8-01-size-208x200.ivf" + "av1-1-b8-01-size-208x200.ivf.md5" + "av1-1-b8-01-size-208x202.ivf" + "av1-1-b8-01-size-208x202.ivf.md5" + "av1-1-b8-01-size-208x208.ivf" + "av1-1-b8-01-size-208x208.ivf.md5" + "av1-1-b8-01-size-208x210.ivf" + "av1-1-b8-01-size-208x210.ivf.md5" + "av1-1-b8-01-size-208x224.ivf" + "av1-1-b8-01-size-208x224.ivf.md5" + "av1-1-b8-01-size-208x226.ivf" + "av1-1-b8-01-size-208x226.ivf.md5" + "av1-1-b8-01-size-210x196.ivf" + "av1-1-b8-01-size-210x196.ivf.md5" + "av1-1-b8-01-size-210x198.ivf" + "av1-1-b8-01-size-210x198.ivf.md5" + "av1-1-b8-01-size-210x200.ivf" + "av1-1-b8-01-size-210x200.ivf.md5" + "av1-1-b8-01-size-210x202.ivf" + "av1-1-b8-01-size-210x202.ivf.md5" + "av1-1-b8-01-size-210x208.ivf" + "av1-1-b8-01-size-210x208.ivf.md5" + "av1-1-b8-01-size-210x210.ivf" + "av1-1-b8-01-size-210x210.ivf.md5" + "av1-1-b8-01-size-210x224.ivf" + "av1-1-b8-01-size-210x224.ivf.md5" + "av1-1-b8-01-size-210x226.ivf" + 
"av1-1-b8-01-size-210x226.ivf.md5" + "av1-1-b8-01-size-224x196.ivf" + "av1-1-b8-01-size-224x196.ivf.md5" + "av1-1-b8-01-size-224x198.ivf" + "av1-1-b8-01-size-224x198.ivf.md5" + "av1-1-b8-01-size-224x200.ivf" + "av1-1-b8-01-size-224x200.ivf.md5" + "av1-1-b8-01-size-224x202.ivf" + "av1-1-b8-01-size-224x202.ivf.md5" + "av1-1-b8-01-size-224x208.ivf" + "av1-1-b8-01-size-224x208.ivf.md5" + "av1-1-b8-01-size-224x210.ivf" + "av1-1-b8-01-size-224x210.ivf.md5" + "av1-1-b8-01-size-224x224.ivf" + "av1-1-b8-01-size-224x224.ivf.md5" + "av1-1-b8-01-size-224x226.ivf" + "av1-1-b8-01-size-224x226.ivf.md5" + "av1-1-b8-01-size-226x196.ivf" + "av1-1-b8-01-size-226x196.ivf.md5" + "av1-1-b8-01-size-226x198.ivf" + "av1-1-b8-01-size-226x198.ivf.md5" + "av1-1-b8-01-size-226x200.ivf" + "av1-1-b8-01-size-226x200.ivf.md5" + "av1-1-b8-01-size-226x202.ivf" + "av1-1-b8-01-size-226x202.ivf.md5" + "av1-1-b8-01-size-226x208.ivf" + "av1-1-b8-01-size-226x208.ivf.md5" + "av1-1-b8-01-size-226x210.ivf" + "av1-1-b8-01-size-226x210.ivf.md5" + "av1-1-b8-01-size-226x224.ivf" + "av1-1-b8-01-size-226x224.ivf.md5" + "av1-1-b8-01-size-226x226.ivf" + "av1-1-b8-01-size-226x226.ivf.md5" + "av1-1-b8-01-size-32x16.ivf" + "av1-1-b8-01-size-32x16.ivf.md5" + "av1-1-b8-01-size-32x18.ivf" + "av1-1-b8-01-size-32x18.ivf.md5" + "av1-1-b8-01-size-32x32.ivf" + "av1-1-b8-01-size-32x32.ivf.md5" + "av1-1-b8-01-size-32x34.ivf" + "av1-1-b8-01-size-32x34.ivf.md5" + "av1-1-b8-01-size-32x64.ivf" + "av1-1-b8-01-size-32x64.ivf.md5" + "av1-1-b8-01-size-32x66.ivf" + "av1-1-b8-01-size-32x66.ivf.md5" + "av1-1-b8-01-size-34x16.ivf" + "av1-1-b8-01-size-34x16.ivf.md5" + "av1-1-b8-01-size-34x18.ivf" + "av1-1-b8-01-size-34x18.ivf.md5" + "av1-1-b8-01-size-34x32.ivf" + "av1-1-b8-01-size-34x32.ivf.md5" + "av1-1-b8-01-size-34x34.ivf" + "av1-1-b8-01-size-34x34.ivf.md5" + "av1-1-b8-01-size-34x64.ivf" + "av1-1-b8-01-size-34x64.ivf.md5" + "av1-1-b8-01-size-34x66.ivf" + "av1-1-b8-01-size-34x66.ivf.md5" + "av1-1-b8-01-size-64x16.ivf" + "av1-1-b8-01-size-64x16.ivf.md5" + "av1-1-b8-01-size-64x18.ivf" + "av1-1-b8-01-size-64x18.ivf.md5" + "av1-1-b8-01-size-64x32.ivf" + "av1-1-b8-01-size-64x32.ivf.md5" + "av1-1-b8-01-size-64x34.ivf" + "av1-1-b8-01-size-64x34.ivf.md5" + "av1-1-b8-01-size-64x64.ivf" + "av1-1-b8-01-size-64x64.ivf.md5" + "av1-1-b8-01-size-64x66.ivf" + "av1-1-b8-01-size-64x66.ivf.md5" + "av1-1-b8-01-size-66x16.ivf" + "av1-1-b8-01-size-66x16.ivf.md5" + "av1-1-b8-01-size-66x18.ivf" + "av1-1-b8-01-size-66x18.ivf.md5" + "av1-1-b8-01-size-66x32.ivf" + "av1-1-b8-01-size-66x32.ivf.md5" + "av1-1-b8-01-size-66x34.ivf" + "av1-1-b8-01-size-66x34.ivf.md5" + "av1-1-b8-01-size-66x64.ivf" + "av1-1-b8-01-size-66x64.ivf.md5" + "av1-1-b8-01-size-66x66.ivf" + "av1-1-b8-01-size-66x66.ivf.md5" + "av1-1-b8-02-allintra.ivf" + "av1-1-b8-02-allintra.ivf.md5" + "av1-1-b8-03-sizeup.mkv" + "av1-1-b8-03-sizeup.mkv.md5" + "av1-1-b8-03-sizedown.mkv" + "av1-1-b8-03-sizedown.mkv.md5" + "av1-1-b8-04-cdfupdate.ivf" + "av1-1-b8-04-cdfupdate.ivf.md5" + "av1-1-b8-05-mv.ivf" + "av1-1-b8-05-mv.ivf.md5" + "av1-1-b8-06-mfmv.ivf" + "av1-1-b8-06-mfmv.ivf.md5" + "av1-1-b8-22-svc-L2T1.ivf" + "av1-1-b8-22-svc-L2T1.ivf.md5" + "av1-1-b8-22-svc-L1T2.ivf" + "av1-1-b8-22-svc-L1T2.ivf.md5" + "av1-1-b8-22-svc-L2T2.ivf" + "av1-1-b8-22-svc-L2T2.ivf.md5" + "av1-1-b8-23-film_grain-50.ivf" + "av1-1-b8-23-film_grain-50.ivf.md5" + "invalid-bug-1814.ivf" + "invalid-bug-1814.ivf.res" + "invalid-chromium-906381.ivf" + "invalid-chromium-906381.ivf.res" + "invalid-google-142530197-1.ivf" + "invalid-google-142530197-1.ivf.res" + 
"invalid-google-142530197.ivf" + "invalid-google-142530197.ivf.res" + "invalid-oss-fuzz-10061.ivf" + "invalid-oss-fuzz-10061.ivf.res" + "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf" + "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf.res" + "invalid-oss-fuzz-10227.ivf" + "invalid-oss-fuzz-10227.ivf.res" + "invalid-oss-fuzz-10389.ivf" + "invalid-oss-fuzz-10389.ivf.res" + "invalid-oss-fuzz-10389.ivf.res.2" + "invalid-oss-fuzz-10555.ivf" + "invalid-oss-fuzz-10555.ivf.res" + "invalid-oss-fuzz-10705.ivf" + "invalid-oss-fuzz-10705.ivf.res" + "invalid-oss-fuzz-10723.ivf" + "invalid-oss-fuzz-10723.ivf.res" + "invalid-oss-fuzz-10723.ivf.res.2" + "invalid-oss-fuzz-10779.ivf" + "invalid-oss-fuzz-10779.ivf.res" + "invalid-oss-fuzz-11477.ivf" + "invalid-oss-fuzz-11477.ivf.res" + "invalid-oss-fuzz-11479.ivf" + "invalid-oss-fuzz-11479.ivf.res" + "invalid-oss-fuzz-11479.ivf.res.2" + "invalid-oss-fuzz-11523.ivf" + "invalid-oss-fuzz-11523.ivf.res" + "invalid-oss-fuzz-11523.ivf.res.2" + "invalid-oss-fuzz-15363.ivf" + "invalid-oss-fuzz-15363.ivf.res" + "invalid-oss-fuzz-16437.ivf" + "invalid-oss-fuzz-16437.ivf.res" + "invalid-oss-fuzz-9288.ivf" + "invalid-oss-fuzz-9288.ivf.res" + "invalid-oss-fuzz-9463.ivf" + "invalid-oss-fuzz-9463.ivf.res" + "invalid-oss-fuzz-9463.ivf.res.2" + "invalid-oss-fuzz-9482.ivf" + "invalid-oss-fuzz-9482.ivf.res" + "invalid-oss-fuzz-9720.ivf" + "invalid-oss-fuzz-9720.ivf.res") +endif() + +if(ENABLE_ENCODE_PERF_TESTS AND CONFIG_AV1_ENCODER) + list(APPEND AOM_TEST_DATA_FILE_NAMES "desktop_640_360_30.yuv" + "kirland_640_480_30.yuv" "macmarcomoving_640_480_30.yuv" + "macmarcostationary_640_480_30.yuv" "niklas_1280_720_30.yuv" + "tacomanarrows_640_480_30.yuv" + "tacomasmallcameramovement_640_480_30.yuv" + "thaloundeskmtg_640_480_30.yuv") +endif() + +# Parses test/test-data.sha1 and writes captured file names and checksums to +# $out_files and $out_checksums as lists. +function(make_test_data_lists test_data_file out_files out_checksums) + if(NOT test_data_file OR NOT EXISTS "${test_data_file}") + message(FATAL_ERROR "Test info file missing or empty (${test_data_file})") + endif() + + # Read $test_data_file into $files_and_checksums. $files_and_checksums becomes + # a list with an entry for each line from $test_data_file. + file(STRINGS "${test_data_file}" files_and_checksums) + + # Iterate over the list of lines and split it into $checksums and $filenames. + foreach(line ${files_and_checksums}) + string(FIND "${line}" " *" delim_pos) + + math(EXPR filename_pos "${delim_pos} + 2") + string(SUBSTRING "${line}" 0 ${delim_pos} checksum) + string(SUBSTRING "${line}" ${filename_pos} -1 filename) + + list(FIND AOM_TEST_DATA_FILE_NAMES ${filename} list_index) + if(NOT ${list_index} EQUAL -1) + + # Include the name and checksum in output only when the file is needed. + set(checksums ${checksums} ${checksum}) + set(filenames ${filenames} ${filename}) + endif() + endforeach() + + list(LENGTH filenames num_files) + list(LENGTH checksums num_checksums) + if(NOT checksums OR NOT filenames OR NOT num_files EQUAL num_checksums) + message(FATAL_ERROR "Parsing of ${test_data_file} failed.") + endif() + + set(${out_checksums} ${checksums} PARENT_SCOPE) + set(${out_files} ${filenames} PARENT_SCOPE) +endfunction() + +# Appends each file name in $test_files to $test_dir and adds the result path to +# $out_path_list. 
+function(expand_test_file_paths test_files test_dir out_path_list)
+  foreach(filename ${${test_files}})
+    set(path_list ${path_list} "${test_dir}/${filename}")
+  endforeach()
+  set(${out_path_list} ${path_list} PARENT_SCOPE)
+endfunction()
+
+function(check_file local_path expected_checksum out_needs_update)
+  if(EXISTS "${local_path}")
+    file(SHA1 "${local_path}" file_checksum)
+  else()
+    set(${out_needs_update} 1 PARENT_SCOPE)
+    return()
+  endif()
+
+  if("${file_checksum}" STREQUAL "${expected_checksum}")
+    unset(${out_needs_update} PARENT_SCOPE)
+  else()
+    set(${out_needs_update} 1 PARENT_SCOPE)
+    return()
+  endif()
+  message("${local_path} up to date.")
+endfunction()
+
+# Downloads data from $file_url, confirms that $file_checksum matches, and
+# writes it to $local_path.
+function(download_test_file file_url file_checksum local_path)
+  message("Downloading ${file_url} ...")
+  file(DOWNLOAD "${file_url}" "${local_path}" SHOW_PROGRESS EXPECTED_HASH
+       SHA1=${file_checksum})
+  message("Download of ${file_url} complete.")
+endfunction()
diff --git a/libs/libaom/src/test/test_intra_pred_speed.cc b/libs/libaom/src/test/test_intra_pred_speed.cc
new file mode 100644
index 000000000..25c50d022
--- /dev/null
+++ b/libs/libaom/src/test/test_intra_pred_speed.cc
@@ -0,0 +1,1467 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Test and time AOM intra-predictor functions
+
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/md5_helper.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/common/common_data.h"
+
+// -----------------------------------------------------------------------------
+
+namespace {
+
+// Note:
+// APPLY_UNIT_TESTS
+// 1: Do unit tests
+// 0: Generate MD5 array as required
+#define APPLY_UNIT_TESTS 1
+
+typedef void (*AvxPredFunc)(uint8_t *dst, ptrdiff_t y_stride,
+                            const uint8_t *above, const uint8_t *left);
+
+const int kBPS = 64;
+const int kTotalPixels = kBPS * kBPS;
+// 4 DC variants, V, H, PAETH, SMOOTH, SMOOTH_V, SMOOTH_H
+const int kNumAv1IntraFuncs = 10;
+
+#if APPLY_UNIT_TESTS
+const char *kAv1IntraPredNames[kNumAv1IntraFuncs] = {
+  "DC_PRED",       "DC_LEFT_PRED",  "DC_TOP_PRED",  "DC_128_PRED",
+  "V_PRED",        "H_PRED",        "PAETH_PRED",   "SMOOTH_PRED",
+  "SMOOTH_V_PRED", "SMOOTH_H_PRED",
+};
+#endif  // APPLY_UNIT_TESTS
+
+template <typename Pixel>
+struct IntraPredTestMem {
+  void Init(int block_width, int block_height, int bd) {
+    ASSERT_LE(block_width, kBPS);
+    ASSERT_LE(block_height, kBPS);
+    // Note: for blocks having width <= 32 and height <= 32, we generate 32x32
+    // random pixels as before to avoid having to recalculate all hashes again.
+    const int block_size_upto_32 = (block_width <= 32) && (block_height <= 32);
+    stride = block_size_upto_32 ? 32 : kBPS;
+    num_pixels = stride * stride;
+    libaom_test::ACMRandom rnd(libaom_test::ACMRandom::DeterministicSeed());
+    above = above_mem + 16;
+    const int mask = (1 << bd) - 1;
+    for (int i = 0; i < num_pixels; ++i) ref_src[i] = rnd.Rand16() & mask;
+    for (int i = 0; i < stride; ++i) left[i] = rnd.Rand16() & mask;
+    for (int i = -1; i < stride; ++i) above[i] = rnd.Rand16() & mask;
+
+    for (int i = stride; i < 2 * stride; ++i) {
+      left[i] = rnd.Rand16() & mask;
+      above[i] = rnd.Rand16() & mask;
+    }
+  }
+
+  DECLARE_ALIGNED(16, Pixel, src[kTotalPixels]);
+  DECLARE_ALIGNED(16, Pixel, ref_src[kTotalPixels]);
+  DECLARE_ALIGNED(16, Pixel, left[2 * kBPS]);
+  Pixel *above;
+  int stride;
+  int num_pixels;
+
+ private:
+  DECLARE_ALIGNED(16, Pixel, above_mem[2 * kBPS + 16]);
+};
+
+// -----------------------------------------------------------------------------
+// Low Bitdepth
+
+typedef IntraPredTestMem<uint8_t> Av1IntraPredTestMem;
+
+static const char *const kTxSizeStrings[TX_SIZES_ALL] = {
+  "4X4",  "8X8",  "16X16", "32X32", "64X64", "4X8",   "8X4",
+  "8X16", "16X8", "16X32", "32X16", "32X64", "64X32", "4X16",
+  "16X4", "8X32", "32X8",  "16X64", "64X16",
+};
+
+void CheckMd5Signature(TX_SIZE tx_size, bool is_hbd,
+                       const char *const signatures[], const void *data,
+                       size_t data_size, int elapsed_time, int idx) {
+  const std::string hbd_str = is_hbd ? "Hbd " : "";
+  const std::string name_str = hbd_str + "Intra" + kTxSizeStrings[tx_size];
+  libaom_test::MD5 md5;
+  md5.Add(reinterpret_cast<const uint8_t *>(data), data_size);
+#if APPLY_UNIT_TESTS
+  printf("Mode %s[%13s]: %5d ms MD5: %s\n", name_str.c_str(),
+         kAv1IntraPredNames[idx], elapsed_time, md5.Get());
+  EXPECT_STREQ(signatures[idx], md5.Get());
+#else
+  (void)signatures;
+  (void)elapsed_time;
+  (void)idx;
+  printf("\"%s\",\n", md5.Get());
+#endif
+}
+
+void TestIntraPred(TX_SIZE tx_size, AvxPredFunc const *pred_funcs,
+                   const char *const signatures[]) {
+  const int block_width = tx_size_wide[tx_size];
+  const int block_height = tx_size_high[tx_size];
+  const int num_pixels_per_test =
+      block_width * block_height * kNumAv1IntraFuncs;
+  const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
+  Av1IntraPredTestMem intra_pred_test_mem;
+  intra_pred_test_mem.Init(block_width, block_height, 8);
+
+  for (int k = 0; k < kNumAv1IntraFuncs; ++k) {
+    if (pred_funcs[k] == NULL) continue;
+    memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
+           sizeof(intra_pred_test_mem.src));
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
+      pred_funcs[k](intra_pred_test_mem.src, intra_pred_test_mem.stride,
+                    intra_pred_test_mem.above, intra_pred_test_mem.left);
+    }
+    libaom_test::ClearSystemState();
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time =
+        static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
+    CheckMd5Signature(
+        tx_size, false, signatures, intra_pred_test_mem.src,
+        intra_pred_test_mem.num_pixels * sizeof(*intra_pred_test_mem.src),
+        elapsed_time, k);
+  }
+}
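
[Editor's note -- annotation, not part of the upstream patch.] The 2.e10 constant in TestIntraPred above sizes the timing loop so that every (predictor, transform size) pair processes roughly the same number of pixels, which keeps the reported millisecond figures comparable across block sizes. A worked example using the definitions above:

  // For TX_4X4, with kNumAv1IntraFuncs == 10:
  //   num_pixels_per_test = 4 * 4 * 10 = 160
  //   kNumTests = static_cast<int>(2.e10 / 160) = 125000000 calls per function
  // i.e. about 125000000 * 16 = 2e9 pixels per predictor. For TX_32X32 the
  // loop runs 2.e10 / (32 * 32 * 10) ~= 1953125 times, again ~2e9 pixels.
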
+
+static const char *const kSignatures[TX_SIZES_ALL][kNumAv1IntraFuncs] = {
+  {
+      // 4X4
+      "e7ed7353c3383fff942e500e9bfe82fe",
+      "2a4a26fcc6ce005eadc08354d196c8a9",
+      "269d92eff86f315d9c38fe7640d85b15",
+      "ae2960eea9f71ee3dabe08b282ec1773",
+      "6c1abcc44e90148998b51acd11144e9c",
+      "f7bb3186e1ef8a2b326037ff898cad8e",
+      "59fc0e923a08cfac0a493fb38988e2bb",
+      "9ff8bb37d9c830e6ab8ecb0c435d3c91",
+      "de6937fca02354f2874dbc5dbec5d5b3",
+      "723cf948137f7d8c7860d814e55ae67d",
+  },
+  {
+      // 8X8
+      "d8bbae5d6547cfc17e4f5f44c8730e88",
"373bab6d931868d41a601d9d88ce9ac3", + "6fdd5ff4ff79656c14747598ca9e3706", + "d9661c2811d6a73674f40ffb2b841847", + "7c722d10b19ccff0b8c171868e747385", + "f81dd986eb2b50f750d3a7da716b7e27", + "064404361748dd111a890a1470d7f0ea", + "dc29b7e1f78cc8e7525d5ea4c0ab9b78", + "97111eb1bc26bade6272015df829f1ae", + "d19a8a73cc46b807f2c5e817576cc1e1", + }, + { + // 16X16 + "50971c07ce26977d30298538fffec619", + "527a6b9e0dc5b21b98cf276305432bef", + "7eff2868f80ebc2c43a4f367281d80f7", + "67cd60512b54964ef6aff1bd4816d922", + "48371c87dc95c08a33b2048f89cf6468", + "b0acf2872ee411d7530af6d2625a7084", + "93d6b5352b571805ab16a55e1bbed86a", + "03764e4c0aebbc180e4e2c68fb06df2b", + "bb6c74c9076c9f266ab11fb57060d8e6", + "0c5162bc28489756ddb847b5678e6f07", + }, + { + // 32X32 + "a0a618c900e65ae521ccc8af789729f2", + "985aaa7c72b4a6c2fb431d32100cf13a", + "10662d09febc3ca13ee4e700120daeb5", + "b3b01379ba08916ef6b1b35f7d9ad51c", + "9f4261755795af97e34679c333ec7004", + "bc2c9da91ad97ef0d1610fb0a9041657", + "ef1653982b69e1f64bee3759f3e1ec45", + "1a51a675deba2c83282142eb48d3dc3d", + "866c224746dc260cda861a7b1b383fb3", + "cea23799fc3526e1b6a6ff02b42b82af", + }, + { + // 64X64 + "6e1094fa7b50bc813aa2ba29f5df8755", + "afe020786b83b793c2bbd9468097ff6e", + "be91585259bc37bf4dc1651936e90b3e", + "a1650dbcd56e10288c3e269eca37967d", + "9e5c34f3797e0cdd3cd9d4c05b0d8950", + "bc87be7ac899cc6a28f399d7516c49fe", + "9811fd0d2dd515f06122f5d1bd18b784", + "3c140e466f2c2c0d9cb7d2157ab8dc27", + "9543de76c925a8f6adc884cc7f98dc91", + "df1df0376cc944afe7e74e94f53e575a", + }, + { + // 4X8 + "d9fbebdc85f71ab1e18461b2db4a2adc", + "5ccb2a68284bc9714d94b8a06ccadbb2", + "735d059abc2744f3ff3f9590f7191b37", + "d9fbebdc85f71ab1e18461b2db4a2adc", + "6819497c44cd0ace120add83672996ee", + "7e3244f5a2d3edf81c7e962a842b97f9", + "809350f164cd4d1650850bb0f59c3260", + "1b60a394331eeab6927a6f8aaff57040", + "5307de1bd7329ba6b281d2c1b0b457f9", + "24c58a8138339846d95568efb91751db", + }, + { + // 8X4 + "23f9fc11344426c9bee2e06d57dfd628", + "2d71a26d1bae1fb34734de7b42fc5eb7", + "5af9c1b2fd9d5721fad67b67b3f7c816", + "00d71b17be662753813d515f197d145e", + "bef10ec984427e28f4390f43809d10af", + "77773cdfb7ed6bc882ab202a64b0a470", + "2cc48bd66d6b0121b5221d52ccd732af", + "b302155e1c9eeeafe2ba2bf68e807a46", + "561bc8d0e76d5041ebd5168fc6a115e1", + "81d0113fb1d0a9a24ffd6f1987b77948", + }, + { + // 8X16 + "c849de88b24f773dfcdd1d48d1209796", + "6cb807c1897b94866a0f3d3c56ed8695", + "d56db05a8ac7981762f5b877f486c4ef", + "b4bc01eb6e59a40922ad17715cafb04b", + "09d178439534f4062ae687c351f66d64", + "644501399cf73080ac606e5cef7ca09b", + "278076495180e17c065a95ab7278539a", + "9dd7f324816f242be408ffeb0c673732", + "f520c4a20acfa0bea1d253c6f0f040fd", + "85f38df809df2c2d7c8b4a157a65cd44", + }, + { + // 16X8 + "b4cbdbdf10ce13300b4063a3daf99e04", + "3731e1e6202064a9d0604d7c293ecee4", + "6c856188c4256a06452f0d5d70cac436", + "1f2192b4c8c497589484ea7bf9c944e8", + "84011bd4b7f565119d06787840e333a0", + "0e48949f7a6aa36f0d76b5d01f91124a", + "60eff8064634b6c73b10681356baeee9", + "1559aeb081a9c0c71111d6093c2ff9fd", + "c15479b739713773e5cabb748451987b", + "72e33ec12c9b67aea26d8d005fb82de2", + }, + { + // 16X32 + "abe5233d189cdbf79424721571bbaa7b", + "282759f81e3cfb2e2d396fe406b72a8b", + "e2224926c264f6f174cbc3167a233168", + "6814e85c2b33f8c9415d62e80394b47b", + "99cbbb60459c08a3061d72c4e4f6276a", + "1d1567d40b8e816f8c1f71e576fe0f87", + "36fdd371b624a075814d497c4832ec85", + "8ab8da61b727442b6ff692b40d0df018", + "e35a10ad7fdf2327e821504a90f6a6eb", + "1f7211e727dc1de7d6a55d082fbdd821", + }, + { + // 32X16 
+ "d1aeb8d5fdcfd3307922af01a798a4dc", + "b0bcb514ebfbee065faea9d34c12ae75", + "d6a18c63b4e909871c0137ca652fad23", + "fd047f2fc1b8ffb95d0eeef3e8796a45", + "645ab60779ea348fd93c81561c31bab9", + "4409633c9db8dff41ade4292a3a56e7f", + "5e36a11e069b31c2a739f3a9c7b37c24", + "e83b9483d702cfae496991c3c7fa92c0", + "12f6ddf98c7f30a277307f1ea935b030", + "354321d6c32bbdb0739e4fa2acbf41e1", + }, + { + // 32X64 + "0ce332b343934b34cd4417725faa85cb", + "4e2a2cfd8f56f15939bdfc753145b303", + "0f46d124ba9f48cdd5d5290acf786d6d", + "e1e8ed803236367821981500a3d9eebe", + "1d2f8e48e3adb7c448be05d9f66f4954", + "9fb2e176636a5689b26f73ca73fcc512", + "e720ebccae7e25e36f23da53ae5b5d6a", + "86fe4364734169aaa4520d799890d530", + "b1870290764bb1b100d1974e2bd70f1d", + "ce5b238e19d85ef69d85badfab4e63ae", + }, + { + // 64X32 + "a6c5aeb722615089efbca80b02951ceb", + "538424b24bd0830f21788e7238ca762f", + "80c15b303235f9bc2259027bb92dfdc4", + "e48e1ac15e97191a8fda08d62fff343e", + "12604b37875533665078405ef4582e35", + "0048afa17bd3e1632d68b96048836530", + "07a0cfcb56a5eed50c4bd6c26814336b", + "529d8a070de5bc6531fa3ee8f450c233", + "33c50a11c7d78f72434064f634305e95", + "e0ef7f0559c1a50ec5a8c12011b962f7", + }, + { + // 4X16 + "750491056568eb8fe15387b86bdf06b8", + "3a52dae9f599f08cfb3bd1b910dc0e11", + "af79f71e3e03dbeca44e2e13561f70c7", + "ca7dfd7624afc0c06fb5552f44398535", + "b591af115444bf43140c29c269f68fb2", + "483d942ae36e69e62f31eb215331416f", + "f14b58525e81870bc5d95c7ac71a347f", + "371208bb4027d9badb04095d1590bbc4", + "c7049c21b2924d70c7c12784d6b6b796", + "7d87233f4b5b0f12086045e5d7b2d4c2", + }, + { + // 16X4 + "7c6e325a65e77e732b3adbe237e045e4", + "24478f93ffcec47852e004d0fe948464", + "258d042c67d4ba3ecfa667f0adc9aebf", + "b2cd21d06959f159a1f3c4d9768ee7fb", + "b4e1f38157bf8410e7c3da02f687a343", + "869e703729eb0fc0711c254944ff5d5a", + "9638dd77105a640b146a8201ea7a0801", + "919d932c6af8a1cc7486e8ce996dd487", + "e1c9be493b6714c7ae48f30044c43140", + "bf0fe3889d654b2f6eb98c8fc751f9e4", + }, + { + // 8X32 + "8dfac4319fe0bd40013ffb3102da8c72", + "feb46b6dc4e2ca0a09533bfc51d4dcb0", + "850837ec714c37262216527aaf4cbbe9", + "4603c7800fb08361f163daca876e8bda", + "1ff95e7d2debc27b05806fb25abfd624", + "d81b9a51a062b23ca7823804cb7bec22", + "f1d8978158766f46335203608cb807e7", + "f3527096256258c0878d644a9d7d53ca", + "cbde98ac8b009953eb112807ad2ea29e", + "654fb1153415747feae599f538122af5", + }, + { + // 32X8 + "3d4ee16fab374357474f60b845327bc7", + "bc17c5059473a476df4e85f56395ad55", + "3d4ee16fab374357474f60b845327bc7", + "c14b8db34dc2355b84e3735c9ba16c7f", + "a71d25b5d47a92a8b9223c98f18458ee", + "6c1cfe2b1893f4576a80675687cb6426", + "92d11bbef8b85bb48d799bb055de3514", + "bcf81d1db8ae5cc03360467f44f498ec", + "79f8c564163555592e808e145eaf5c60", + "46fff139cef2ef773938bcc8b0e5abb8", + }, + { + // 16X64 + "3b2a053ee8b05a8ac35ad23b0422a151", + "12b0c69595328c465e0b25e0c9e3e9fc", + "f77c544ac8035e01920deae40cee7b07", + "727797ef15ccd8d325476fe8f12006a3", + "f3be77c0fe67eb5d9d515e92bec21eb7", + "f1ece6409e01e9dd98b800d49628247d", + "efd2ec9bfbbd4fd1f6604ea369df1894", + "ec703de918422b9e03197ba0ed60a199", + "739418efb89c07f700895deaa5d0b3e3", + "9943ae1bbeeebfe1d3a92dc39e049d63", + }, + { + // 64X16 + "821b76b1494d4f84d20817840f719a1a", + "69e462c3338a9aaf993c3f7cfbc15649", + "516d8f6eb054d74d150e7b444185b6b9", + "de1b736e9d99129609d6ef3a491507a0", + "fd9b4276e7affe1e0e4ce4f428058994", + "cd82fd361a4767ac29a9f406b480b8f3", + "2792c2f810157a4a6cb13c28529ff779", + "1220442d90c4255ba0969d28b91e93a6", + "c7253e10b45f7f67dfee3256c9b94825", + 
"879792198071c7e0b50b9b5010d8c18f", + }, +}; + +} // namespace + +// Defines a test case for |arch| (e.g., C, SSE2, ...) passing the predictors +// to TestIntraPred. The test name is 'arch.TestIntraPred_tx_size', e.g., +// C.TestIntraPred.0 +#define INTRA_PRED_TEST(arch, tx_size, dc, dc_left, dc_top, dc_128, v, h, \ + paeth, smooth, smooth_v, smooth_h) \ + TEST(arch, DISABLED_##TestIntraPred_##tx_size) { \ + static const AvxPredFunc aom_intra_pred[] = { \ + dc, dc_left, dc_top, dc_128, v, h, paeth, smooth, smooth_v, smooth_h \ + }; \ + TestIntraPred(tx_size, aom_intra_pred, kSignatures[tx_size]); \ + } + +// ----------------------------------------------------------------------------- +// 4x4, 4x8, 4x16 + +INTRA_PRED_TEST(C_1, TX_4X4, aom_dc_predictor_4x4_c, + aom_dc_left_predictor_4x4_c, aom_dc_top_predictor_4x4_c, + aom_dc_128_predictor_4x4_c, aom_v_predictor_4x4_c, + aom_h_predictor_4x4_c, aom_paeth_predictor_4x4_c, + aom_smooth_predictor_4x4_c, aom_smooth_v_predictor_4x4_c, + aom_smooth_h_predictor_4x4_c) + +INTRA_PRED_TEST(C_2, TX_4X8, aom_dc_predictor_4x8_c, + aom_dc_left_predictor_4x8_c, aom_dc_top_predictor_4x8_c, + aom_dc_128_predictor_4x8_c, aom_v_predictor_4x8_c, + aom_h_predictor_4x8_c, aom_paeth_predictor_4x8_c, + aom_smooth_predictor_4x8_c, aom_smooth_v_predictor_4x8_c, + aom_smooth_h_predictor_4x8_c) + +INTRA_PRED_TEST(C_3, TX_4X16, aom_dc_predictor_4x16_c, + aom_dc_left_predictor_4x16_c, aom_dc_top_predictor_4x16_c, + aom_dc_128_predictor_4x16_c, aom_v_predictor_4x16_c, + aom_h_predictor_4x16_c, aom_paeth_predictor_4x16_c, + aom_smooth_predictor_4x16_c, aom_smooth_v_predictor_4x16_c, + aom_smooth_h_predictor_4x16_c) + +#if HAVE_SSE2 +INTRA_PRED_TEST(SSE2_1, TX_4X4, aom_dc_predictor_4x4_sse2, + aom_dc_left_predictor_4x4_sse2, aom_dc_top_predictor_4x4_sse2, + aom_dc_128_predictor_4x4_sse2, aom_v_predictor_4x4_sse2, + aom_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_2, TX_4X8, aom_dc_predictor_4x8_sse2, + aom_dc_left_predictor_4x8_sse2, aom_dc_top_predictor_4x8_sse2, + aom_dc_128_predictor_4x8_sse2, aom_v_predictor_4x8_sse2, + aom_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_3, TX_4X16, aom_dc_predictor_4x16_sse2, + aom_dc_left_predictor_4x16_sse2, aom_dc_top_predictor_4x16_sse2, + aom_dc_128_predictor_4x16_sse2, aom_v_predictor_4x16_sse2, + aom_h_predictor_4x16_sse2, NULL, NULL, NULL, NULL) +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INTRA_PRED_TEST(SSSE3_1, TX_4X4, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_4x4_ssse3, aom_smooth_predictor_4x4_ssse3, + aom_smooth_v_predictor_4x4_ssse3, + aom_smooth_h_predictor_4x4_ssse3) +INTRA_PRED_TEST(SSSE3_2, TX_4X8, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_4x8_ssse3, aom_smooth_predictor_4x8_ssse3, + aom_smooth_v_predictor_4x8_ssse3, + aom_smooth_h_predictor_4x8_ssse3) +INTRA_PRED_TEST(SSSE3_3, TX_4X16, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_4x16_ssse3, aom_smooth_predictor_4x16_ssse3, + aom_smooth_v_predictor_4x16_ssse3, + aom_smooth_h_predictor_4x16_ssse3) +#endif // HAVE_SSSE3 + +#if HAVE_DSPR2 +INTRA_PRED_TEST(DSPR2, TX_4X4, aom_dc_predictor_4x4_dspr2, NULL, NULL, NULL, + NULL, aom_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL) +#endif // HAVE_DSPR2 + +#if HAVE_NEON +INTRA_PRED_TEST(NEON, TX_4X4, aom_dc_predictor_4x4_neon, + aom_dc_left_predictor_4x4_neon, aom_dc_top_predictor_4x4_neon, + aom_dc_128_predictor_4x4_neon, aom_v_predictor_4x4_neon, + aom_h_predictor_4x4_neon, NULL, NULL, NULL, NULL) +#endif // HAVE_NEON + +#if HAVE_MSA 
+INTRA_PRED_TEST(MSA, TX_4X4, aom_dc_predictor_4x4_msa, + aom_dc_left_predictor_4x4_msa, aom_dc_top_predictor_4x4_msa, + aom_dc_128_predictor_4x4_msa, aom_v_predictor_4x4_msa, + aom_h_predictor_4x4_msa, NULL, NULL, NULL, NULL) +#endif // HAVE_MSA + +// ----------------------------------------------------------------------------- +// 8x8, 8x4, 8x16, 8x32 + +INTRA_PRED_TEST(C_1, TX_8X8, aom_dc_predictor_8x8_c, + aom_dc_left_predictor_8x8_c, aom_dc_top_predictor_8x8_c, + aom_dc_128_predictor_8x8_c, aom_v_predictor_8x8_c, + aom_h_predictor_8x8_c, aom_paeth_predictor_8x8_c, + aom_smooth_predictor_8x8_c, aom_smooth_v_predictor_8x8_c, + aom_smooth_h_predictor_8x8_c) + +INTRA_PRED_TEST(C_2, TX_8X4, aom_dc_predictor_8x4_c, + aom_dc_left_predictor_8x4_c, aom_dc_top_predictor_8x4_c, + aom_dc_128_predictor_8x4_c, aom_v_predictor_8x4_c, + aom_h_predictor_8x4_c, aom_paeth_predictor_8x4_c, + aom_smooth_predictor_8x4_c, aom_smooth_v_predictor_8x4_c, + aom_smooth_h_predictor_8x4_c) + +INTRA_PRED_TEST(C_3, TX_8X16, aom_dc_predictor_8x16_c, + aom_dc_left_predictor_8x16_c, aom_dc_top_predictor_8x16_c, + aom_dc_128_predictor_8x16_c, aom_v_predictor_8x16_c, + aom_h_predictor_8x16_c, aom_paeth_predictor_8x16_c, + aom_smooth_predictor_8x16_c, aom_smooth_v_predictor_8x16_c, + aom_smooth_h_predictor_8x16_c) + +INTRA_PRED_TEST(C_4, TX_8X32, aom_dc_predictor_8x32_c, + aom_dc_left_predictor_8x32_c, aom_dc_top_predictor_8x32_c, + aom_dc_128_predictor_8x32_c, aom_v_predictor_8x32_c, + aom_h_predictor_8x32_c, aom_paeth_predictor_8x32_c, + aom_smooth_predictor_8x32_c, aom_smooth_v_predictor_8x32_c, + aom_smooth_h_predictor_8x32_c) + +#if HAVE_SSE2 +INTRA_PRED_TEST(SSE2_1, TX_8X8, aom_dc_predictor_8x8_sse2, + aom_dc_left_predictor_8x8_sse2, aom_dc_top_predictor_8x8_sse2, + aom_dc_128_predictor_8x8_sse2, aom_v_predictor_8x8_sse2, + aom_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_2, TX_8X4, aom_dc_predictor_8x4_sse2, + aom_dc_left_predictor_8x4_sse2, aom_dc_top_predictor_8x4_sse2, + aom_dc_128_predictor_8x4_sse2, aom_v_predictor_8x4_sse2, + aom_h_predictor_8x4_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_3, TX_8X16, aom_dc_predictor_8x16_sse2, + aom_dc_left_predictor_8x16_sse2, aom_dc_top_predictor_8x16_sse2, + aom_dc_128_predictor_8x16_sse2, aom_v_predictor_8x16_sse2, + aom_h_predictor_8x16_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_4, TX_8X32, aom_dc_predictor_8x32_sse2, + aom_dc_left_predictor_8x32_sse2, aom_dc_top_predictor_8x32_sse2, + aom_dc_128_predictor_8x32_sse2, aom_v_predictor_8x32_sse2, + aom_h_predictor_8x32_sse2, NULL, NULL, NULL, NULL) +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INTRA_PRED_TEST(SSSE3_1, TX_8X8, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_8x8_ssse3, aom_smooth_predictor_8x8_ssse3, + aom_smooth_v_predictor_8x8_ssse3, + aom_smooth_h_predictor_8x8_ssse3) +INTRA_PRED_TEST(SSSE3_2, TX_8X4, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_8x4_ssse3, aom_smooth_predictor_8x4_ssse3, + aom_smooth_v_predictor_8x4_ssse3, + aom_smooth_h_predictor_8x4_ssse3) +INTRA_PRED_TEST(SSSE3_3, TX_8X16, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_8x16_ssse3, aom_smooth_predictor_8x16_ssse3, + aom_smooth_v_predictor_8x16_ssse3, + aom_smooth_h_predictor_8x16_ssse3) +INTRA_PRED_TEST(SSSE3_4, TX_8X32, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_8x32_ssse3, aom_smooth_predictor_8x32_ssse3, + aom_smooth_v_predictor_8x32_ssse3, + aom_smooth_h_predictor_8x32_ssse3) +#endif // HAVE_SSSE3 + +#if HAVE_DSPR2 +INTRA_PRED_TEST(DSPR2, TX_8X8, 
aom_dc_predictor_8x8_dspr2, NULL, NULL, NULL, + NULL, aom_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL) +#endif // HAVE_DSPR2 + +#if HAVE_NEON +INTRA_PRED_TEST(NEON, TX_8X8, aom_dc_predictor_8x8_neon, + aom_dc_left_predictor_8x8_neon, aom_dc_top_predictor_8x8_neon, + aom_dc_128_predictor_8x8_neon, aom_v_predictor_8x8_neon, + aom_h_predictor_8x8_neon, NULL, NULL, NULL, NULL) +#endif // HAVE_NEON + +#if HAVE_MSA +INTRA_PRED_TEST(MSA, TX_8X8, aom_dc_predictor_8x8_msa, + aom_dc_left_predictor_8x8_msa, aom_dc_top_predictor_8x8_msa, + aom_dc_128_predictor_8x8_msa, aom_v_predictor_8x8_msa, + aom_h_predictor_8x8_msa, NULL, NULL, NULL, NULL) +#endif // HAVE_MSA + +// ----------------------------------------------------------------------------- +// 16x16, 16x8, 16x32, 16x4, 16x64 + +INTRA_PRED_TEST(C_1, TX_16X16, aom_dc_predictor_16x16_c, + aom_dc_left_predictor_16x16_c, aom_dc_top_predictor_16x16_c, + aom_dc_128_predictor_16x16_c, aom_v_predictor_16x16_c, + aom_h_predictor_16x16_c, aom_paeth_predictor_16x16_c, + aom_smooth_predictor_16x16_c, aom_smooth_v_predictor_16x16_c, + aom_smooth_h_predictor_16x16_c) + +INTRA_PRED_TEST(C_2, TX_16X8, aom_dc_predictor_16x8_c, + aom_dc_left_predictor_16x8_c, aom_dc_top_predictor_16x8_c, + aom_dc_128_predictor_16x8_c, aom_v_predictor_16x8_c, + aom_h_predictor_16x8_c, aom_paeth_predictor_16x8_c, + aom_smooth_predictor_16x8_c, aom_smooth_v_predictor_16x8_c, + aom_smooth_h_predictor_16x8_c) + +INTRA_PRED_TEST(C_3, TX_16X32, aom_dc_predictor_16x32_c, + aom_dc_left_predictor_16x32_c, aom_dc_top_predictor_16x32_c, + aom_dc_128_predictor_16x32_c, aom_v_predictor_16x32_c, + aom_h_predictor_16x32_c, aom_paeth_predictor_16x32_c, + aom_smooth_predictor_16x32_c, aom_smooth_v_predictor_16x32_c, + aom_smooth_h_predictor_16x32_c) + +INTRA_PRED_TEST(C_4, TX_16X4, aom_dc_predictor_16x4_c, + aom_dc_left_predictor_16x4_c, aom_dc_top_predictor_16x4_c, + aom_dc_128_predictor_16x4_c, aom_v_predictor_16x4_c, + aom_h_predictor_16x4_c, aom_paeth_predictor_16x4_c, + aom_smooth_predictor_16x4_c, aom_smooth_v_predictor_16x4_c, + aom_smooth_h_predictor_16x4_c) + +INTRA_PRED_TEST(C_5, TX_16X64, aom_dc_predictor_16x64_c, + aom_dc_left_predictor_16x64_c, aom_dc_top_predictor_16x64_c, + aom_dc_128_predictor_16x64_c, aom_v_predictor_16x64_c, + aom_h_predictor_16x64_c, aom_paeth_predictor_16x64_c, + aom_smooth_predictor_16x64_c, aom_smooth_v_predictor_16x64_c, + aom_smooth_h_predictor_16x64_c) + +#if HAVE_SSE2 +INTRA_PRED_TEST(SSE2_1, TX_16X16, aom_dc_predictor_16x16_sse2, + aom_dc_left_predictor_16x16_sse2, + aom_dc_top_predictor_16x16_sse2, + aom_dc_128_predictor_16x16_sse2, aom_v_predictor_16x16_sse2, + aom_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_2, TX_16X8, aom_dc_predictor_16x8_sse2, + aom_dc_left_predictor_16x8_sse2, aom_dc_top_predictor_16x8_sse2, + aom_dc_128_predictor_16x8_sse2, aom_v_predictor_16x8_sse2, + aom_h_predictor_16x8_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_3, TX_16X32, aom_dc_predictor_16x32_sse2, + aom_dc_left_predictor_16x32_sse2, + aom_dc_top_predictor_16x32_sse2, + aom_dc_128_predictor_16x32_sse2, aom_v_predictor_16x32_sse2, + aom_h_predictor_16x32_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_4, TX_16X64, aom_dc_predictor_16x64_sse2, + aom_dc_left_predictor_16x64_sse2, + aom_dc_top_predictor_16x64_sse2, + aom_dc_128_predictor_16x64_sse2, aom_v_predictor_16x64_sse2, + aom_h_predictor_16x64_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_5, TX_16X4, aom_dc_predictor_16x4_sse2, + aom_dc_left_predictor_16x4_sse2, 
aom_dc_top_predictor_16x4_sse2, + aom_dc_128_predictor_16x4_sse2, aom_v_predictor_16x4_sse2, + aom_h_predictor_16x4_sse2, NULL, NULL, NULL, NULL) +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INTRA_PRED_TEST(SSSE3_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x16_ssse3, + aom_smooth_predictor_16x16_ssse3, + aom_smooth_v_predictor_16x16_ssse3, + aom_smooth_h_predictor_16x16_ssse3) +INTRA_PRED_TEST(SSSE3_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x8_ssse3, aom_smooth_predictor_16x8_ssse3, + aom_smooth_v_predictor_16x8_ssse3, + aom_smooth_h_predictor_16x8_ssse3) +INTRA_PRED_TEST(SSSE3_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x32_ssse3, + aom_smooth_predictor_16x32_ssse3, + aom_smooth_v_predictor_16x32_ssse3, + aom_smooth_h_predictor_16x32_ssse3) +INTRA_PRED_TEST(SSSE3_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x64_ssse3, + aom_smooth_predictor_16x64_ssse3, + aom_smooth_v_predictor_16x64_ssse3, + aom_smooth_h_predictor_16x64_ssse3) +INTRA_PRED_TEST(SSSE3_5, TX_16X4, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x4_ssse3, aom_smooth_predictor_16x4_ssse3, + aom_smooth_v_predictor_16x4_ssse3, + aom_smooth_h_predictor_16x4_ssse3) +#endif // HAVE_SSSE3 + +#if HAVE_AVX2 +INTRA_PRED_TEST(AVX2_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x16_avx2, NULL, NULL, NULL) +INTRA_PRED_TEST(AVX2_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x8_avx2, NULL, NULL, NULL) +INTRA_PRED_TEST(AVX2_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x32_avx2, NULL, NULL, NULL) +INTRA_PRED_TEST(AVX2_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x64_avx2, NULL, NULL, NULL) +#endif // HAVE_AVX2 + +#if HAVE_DSPR2 +INTRA_PRED_TEST(DSPR2, TX_16X16, aom_dc_predictor_16x16_dspr2, NULL, NULL, NULL, + NULL, aom_h_predictor_16x16_dspr2, NULL, NULL, NULL, NULL) +#endif // HAVE_DSPR2 + +#if HAVE_NEON +INTRA_PRED_TEST(NEON, TX_16X16, aom_dc_predictor_16x16_neon, + aom_dc_left_predictor_16x16_neon, + aom_dc_top_predictor_16x16_neon, + aom_dc_128_predictor_16x16_neon, aom_v_predictor_16x16_neon, + aom_h_predictor_16x16_neon, NULL, NULL, NULL, NULL) +#endif // HAVE_NEON + +#if HAVE_MSA +INTRA_PRED_TEST(MSA, TX_16X16, aom_dc_predictor_16x16_msa, + aom_dc_left_predictor_16x16_msa, aom_dc_top_predictor_16x16_msa, + aom_dc_128_predictor_16x16_msa, aom_v_predictor_16x16_msa, + aom_h_predictor_16x16_msa, NULL, NULL, NULL, NULL) +#endif // HAVE_MSA + +// ----------------------------------------------------------------------------- +// 32x32, 32x16, 32x64, 32x8 + +INTRA_PRED_TEST(C_1, TX_32X32, aom_dc_predictor_32x32_c, + aom_dc_left_predictor_32x32_c, aom_dc_top_predictor_32x32_c, + aom_dc_128_predictor_32x32_c, aom_v_predictor_32x32_c, + aom_h_predictor_32x32_c, aom_paeth_predictor_32x32_c, + aom_smooth_predictor_32x32_c, aom_smooth_v_predictor_32x32_c, + aom_smooth_h_predictor_32x32_c) + +INTRA_PRED_TEST(C_2, TX_32X16, aom_dc_predictor_32x16_c, + aom_dc_left_predictor_32x16_c, aom_dc_top_predictor_32x16_c, + aom_dc_128_predictor_32x16_c, aom_v_predictor_32x16_c, + aom_h_predictor_32x16_c, aom_paeth_predictor_32x16_c, + aom_smooth_predictor_32x16_c, aom_smooth_v_predictor_32x16_c, + aom_smooth_h_predictor_32x16_c) + +INTRA_PRED_TEST(C_3, TX_32X64, aom_dc_predictor_32x64_c, + aom_dc_left_predictor_32x64_c, aom_dc_top_predictor_32x64_c, + aom_dc_128_predictor_32x64_c, aom_v_predictor_32x64_c, + aom_h_predictor_32x64_c, 
aom_paeth_predictor_32x64_c, + aom_smooth_predictor_32x64_c, aom_smooth_v_predictor_32x64_c, + aom_smooth_h_predictor_32x64_c) + +INTRA_PRED_TEST(C_4, TX_32X8, aom_dc_predictor_32x8_c, + aom_dc_left_predictor_32x8_c, aom_dc_top_predictor_32x8_c, + aom_dc_128_predictor_32x8_c, aom_v_predictor_32x8_c, + aom_h_predictor_32x8_c, aom_paeth_predictor_32x8_c, + aom_smooth_predictor_32x8_c, aom_smooth_v_predictor_32x8_c, + aom_smooth_h_predictor_32x8_c) + +#if HAVE_SSE2 +INTRA_PRED_TEST(SSE2_1, TX_32X32, aom_dc_predictor_32x32_sse2, + aom_dc_left_predictor_32x32_sse2, + aom_dc_top_predictor_32x32_sse2, + aom_dc_128_predictor_32x32_sse2, aom_v_predictor_32x32_sse2, + aom_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_2, TX_32X16, aom_dc_predictor_32x16_sse2, + aom_dc_left_predictor_32x16_sse2, + aom_dc_top_predictor_32x16_sse2, + aom_dc_128_predictor_32x16_sse2, aom_v_predictor_32x16_sse2, + aom_h_predictor_32x16_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_3, TX_32X64, aom_dc_predictor_32x64_sse2, + aom_dc_left_predictor_32x64_sse2, + aom_dc_top_predictor_32x64_sse2, + aom_dc_128_predictor_32x64_sse2, aom_v_predictor_32x64_sse2, + aom_h_predictor_32x64_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_4, TX_32X8, aom_dc_predictor_32x8_sse2, + aom_dc_left_predictor_32x8_sse2, aom_dc_top_predictor_32x8_sse2, + aom_dc_128_predictor_32x8_sse2, aom_v_predictor_32x8_sse2, + aom_h_predictor_32x8_sse2, NULL, NULL, NULL, NULL) +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INTRA_PRED_TEST(SSSE3_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_32x32_ssse3, + aom_smooth_predictor_32x32_ssse3, + aom_smooth_v_predictor_32x32_ssse3, + aom_smooth_h_predictor_32x32_ssse3) +INTRA_PRED_TEST(SSSE3_2, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_32x16_ssse3, + aom_smooth_predictor_32x16_ssse3, + aom_smooth_v_predictor_32x16_ssse3, + aom_smooth_h_predictor_32x16_ssse3) +INTRA_PRED_TEST(SSSE3_3, TX_32X64, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_32x64_ssse3, + aom_smooth_predictor_32x64_ssse3, + aom_smooth_v_predictor_32x64_ssse3, + aom_smooth_h_predictor_32x64_ssse3) +INTRA_PRED_TEST(SSSE3_4, TX_32X8, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_32x8_ssse3, aom_smooth_predictor_32x8_ssse3, + aom_smooth_v_predictor_32x8_ssse3, + aom_smooth_h_predictor_32x8_ssse3) +#endif // HAVE_SSSE3 + +#if HAVE_AVX2 +INTRA_PRED_TEST(AVX2_1, TX_32X32, aom_dc_predictor_32x32_avx2, + aom_dc_left_predictor_32x32_avx2, + aom_dc_top_predictor_32x32_avx2, + aom_dc_128_predictor_32x32_avx2, aom_v_predictor_32x32_avx2, + aom_h_predictor_32x32_avx2, aom_paeth_predictor_32x32_avx2, + NULL, NULL, NULL) +INTRA_PRED_TEST(AVX2_2, TX_32X16, aom_dc_predictor_32x16_avx2, + aom_dc_left_predictor_32x16_avx2, + aom_dc_top_predictor_32x16_avx2, + aom_dc_128_predictor_32x16_avx2, aom_v_predictor_32x16_avx2, + NULL, aom_paeth_predictor_32x16_avx2, NULL, NULL, NULL) +INTRA_PRED_TEST(AVX2_3, TX_32X64, aom_dc_predictor_32x64_avx2, + aom_dc_left_predictor_32x64_avx2, + aom_dc_top_predictor_32x64_avx2, + aom_dc_128_predictor_32x64_avx2, aom_v_predictor_32x64_avx2, + NULL, aom_paeth_predictor_32x64_avx2, NULL, NULL, NULL) +#endif // HAVE_AVX2 + +#if HAVE_NEON +INTRA_PRED_TEST(NEON, TX_32X32, aom_dc_predictor_32x32_neon, + aom_dc_left_predictor_32x32_neon, + aom_dc_top_predictor_32x32_neon, + aom_dc_128_predictor_32x32_neon, aom_v_predictor_32x32_neon, + aom_h_predictor_32x32_neon, NULL, NULL, NULL, NULL) +#endif // HAVE_NEON + +#if HAVE_MSA +INTRA_PRED_TEST(MSA, 
TX_32X32, aom_dc_predictor_32x32_msa, + aom_dc_left_predictor_32x32_msa, aom_dc_top_predictor_32x32_msa, + aom_dc_128_predictor_32x32_msa, aom_v_predictor_32x32_msa, + aom_h_predictor_32x32_msa, NULL, NULL, NULL, NULL) +#endif // HAVE_MSA + +// ----------------------------------------------------------------------------- +// 64x64, 64x32, 64x16 + +INTRA_PRED_TEST(C_1, TX_64X64, aom_dc_predictor_64x64_c, + aom_dc_left_predictor_64x64_c, aom_dc_top_predictor_64x64_c, + aom_dc_128_predictor_64x64_c, aom_v_predictor_64x64_c, + aom_h_predictor_64x64_c, aom_paeth_predictor_64x64_c, + aom_smooth_predictor_64x64_c, aom_smooth_v_predictor_64x64_c, + aom_smooth_h_predictor_64x64_c) + +INTRA_PRED_TEST(C_2, TX_64X32, aom_dc_predictor_64x32_c, + aom_dc_left_predictor_64x32_c, aom_dc_top_predictor_64x32_c, + aom_dc_128_predictor_64x32_c, aom_v_predictor_64x32_c, + aom_h_predictor_64x32_c, aom_paeth_predictor_64x32_c, + aom_smooth_predictor_64x32_c, aom_smooth_v_predictor_64x32_c, + aom_smooth_h_predictor_64x32_c) + +INTRA_PRED_TEST(C_3, TX_64X16, aom_dc_predictor_64x16_c, + aom_dc_left_predictor_64x16_c, aom_dc_top_predictor_64x16_c, + aom_dc_128_predictor_64x16_c, aom_v_predictor_64x16_c, + aom_h_predictor_64x16_c, aom_paeth_predictor_64x16_c, + aom_smooth_predictor_64x16_c, aom_smooth_v_predictor_64x16_c, + aom_smooth_h_predictor_64x16_c) + +#if HAVE_SSE2 +INTRA_PRED_TEST(SSE2_4, TX_64X64, aom_dc_predictor_64x64_sse2, + aom_dc_left_predictor_64x64_sse2, + aom_dc_top_predictor_64x64_sse2, + aom_dc_128_predictor_64x64_sse2, aom_v_predictor_64x64_sse2, + aom_h_predictor_64x64_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_5, TX_64X32, aom_dc_predictor_64x32_sse2, + aom_dc_left_predictor_64x32_sse2, + aom_dc_top_predictor_64x32_sse2, + aom_dc_128_predictor_64x32_sse2, aom_v_predictor_64x32_sse2, + aom_h_predictor_64x32_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_6, TX_64X16, aom_dc_predictor_64x16_sse2, + aom_dc_left_predictor_64x16_sse2, + aom_dc_top_predictor_64x16_sse2, + aom_dc_128_predictor_64x16_sse2, aom_v_predictor_64x16_sse2, + aom_h_predictor_64x16_sse2, NULL, NULL, NULL, NULL) +#endif + +#if HAVE_SSSE3 +INTRA_PRED_TEST(SSSE3_4, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_64x64_ssse3, + aom_smooth_predictor_64x64_ssse3, + aom_smooth_v_predictor_64x64_ssse3, + aom_smooth_h_predictor_64x64_ssse3) +INTRA_PRED_TEST(SSSE3_5, TX_64X32, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_64x32_ssse3, + aom_smooth_predictor_64x32_ssse3, + aom_smooth_v_predictor_64x32_ssse3, + aom_smooth_h_predictor_64x32_ssse3) +INTRA_PRED_TEST(SSSE3_6, TX_64X16, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_64x16_ssse3, + aom_smooth_predictor_64x16_ssse3, + aom_smooth_v_predictor_64x16_ssse3, + aom_smooth_h_predictor_64x16_ssse3) +#endif + +#if HAVE_AVX2 +INTRA_PRED_TEST(AVX2_4, TX_64X64, aom_dc_predictor_64x64_avx2, + aom_dc_left_predictor_64x64_avx2, + aom_dc_top_predictor_64x64_avx2, + aom_dc_128_predictor_64x64_avx2, aom_v_predictor_64x64_avx2, + NULL, aom_paeth_predictor_64x64_avx2, NULL, NULL, NULL) +INTRA_PRED_TEST(AVX2_5, TX_64X32, aom_dc_predictor_64x32_avx2, + aom_dc_left_predictor_64x32_avx2, + aom_dc_top_predictor_64x32_avx2, + aom_dc_128_predictor_64x32_avx2, aom_v_predictor_64x32_avx2, + NULL, aom_paeth_predictor_64x32_avx2, NULL, NULL, NULL) +INTRA_PRED_TEST(AVX2_6, TX_64X16, aom_dc_predictor_64x16_avx2, + aom_dc_left_predictor_64x16_avx2, + aom_dc_top_predictor_64x16_avx2, + aom_dc_128_predictor_64x16_avx2, aom_v_predictor_64x16_avx2, + NULL, 
aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL)
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// -----------------------------------------------------------------------------
+// High Bitdepth
+namespace {
+
+typedef void (*AvxHighbdPredFunc)(uint16_t *dst, ptrdiff_t y_stride,
+                                  const uint16_t *above, const uint16_t *left,
+                                  int bd);
+
+typedef IntraPredTestMem<uint16_t> Av1HighbdIntraPredTestMem;
+
+void TestHighbdIntraPred(TX_SIZE tx_size, AvxHighbdPredFunc const *pred_funcs,
+                         const char *const signatures[]) {
+  const int block_width = tx_size_wide[tx_size];
+  const int block_height = tx_size_high[tx_size];
+  const int num_pixels_per_test =
+      block_width * block_height * kNumAv1IntraFuncs;
+  const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
+  Av1HighbdIntraPredTestMem intra_pred_test_mem;
+  const int bd = 12;
+  intra_pred_test_mem.Init(block_width, block_height, bd);
+
+  for (int k = 0; k < kNumAv1IntraFuncs; ++k) {
+    if (pred_funcs[k] == NULL) continue;
+    memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
+           sizeof(intra_pred_test_mem.src));
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
+      pred_funcs[k](intra_pred_test_mem.src, intra_pred_test_mem.stride,
+                    intra_pred_test_mem.above, intra_pred_test_mem.left, bd);
+    }
+    libaom_test::ClearSystemState();
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time =
+        static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
+    CheckMd5Signature(
+        tx_size, true, signatures, intra_pred_test_mem.src,
+        intra_pred_test_mem.num_pixels * sizeof(*intra_pred_test_mem.src),
+        elapsed_time, k);
+  }
+}
+
+static const char *const kHighbdSignatures[TX_SIZES_ALL][kNumAv1IntraFuncs] = {
+  {
+    // 4X4
+    "11f74af6c5737df472f3275cbde062fa",
+    "51bea056b6447c93f6eb8f6b7e8f6f71",
+    "27e97f946766331795886f4de04c5594",
+    "53ab15974b049111fb596c5168ec7e3f",
+    "f0b640bb176fbe4584cf3d32a9b0320a",
+    "729783ca909e03afd4b47111c80d967b",
+    "6e30009c45474a22032678b1bd579c8f",
+    "e57cba016d808aa8a35619df2a65f049",
+    "55a6c37f39afcbbf5abca4a985b96459",
+    "a623d45b37dafec1f8a75c4c5218913d",
+  },
+  {
+    // 8X8
+    "03da8829fe94663047fd108c5fcaa71d",
+    "ecdb37b8120a2d3a4c706b016bd1bfd7",
+    "1d4543ed8d2b9368cb96898095fe8a75",
+    "f791c9a67b913cbd82d9da8ecede30e2",
+    "065c70646f4dbaff913282f55a45a441",
+    "51f87123616662ef7c35691497dfd0ba",
+    "85c01ba03df68f9ece7bd3fa0f8980e6",
+    "ad19b7dac092f56df6d054e1f67f21e7",
+    "0edc415b5dd7299f7a34fb9f71d31d78",
+    "2bc8ec19e9f4b77a64b8a0a1f6aec7e7",
+  },
+  {
+    // 16X16
+    "e33cb3f56a878e2fddb1b2fc51cdd275",
+    "c7bff6f04b6052c8ab335d726dbbd52d",
+    "d0b0b47b654a9bcc5c6008110a44589b",
+    "78f5da7b10b2b9ab39f114a33b6254e9",
+    "c78e31d23831abb40d6271a318fdd6f3",
+    "90d1347f4ec9198a0320daecb6ff90b8",
+    "e63ded54ab3d0e8728b6f24d4f01e53f",
+    "35ce21fbe0ea114c089fc3489a78155d",
+    "f277f6ef8e4d717f1f0dfe2706ac197d",
+    "e8014d3f41256976c02e0f1e622ba2b9",
+  },
+  {
+    // 32X32
+    "a3e8056ba7e36628cce4917cd956fedd",
+    "cc7d3024fe8748b512407edee045377e",
+    "2aab0a0f330a1d3e19b8ecb8f06387a3",
+    "a547bc3fb7b06910bf3973122a426661",
+    "26f712514da95042f93d6e8dc8e431dc",
+    "bb08c6e16177081daa3d936538dbc2e3",
+    "84bf83f94a51b33654ca940c6f8bc057",
+    "7168b03fc31bf29596a344d6a35d007c",
+    "b073a70d3672f1282236994f5d12e94b",
+    "c51607aebad5dcb3c1e3b58ef9e5b84e",
+  },
+  {
+    // 64X64
+    "a6baa0d4bfb2269a94c7a38f86a4bccf",
+    "3f1ef5f473a49eba743f17a3324adf9d",
+    "12ac11889ae5f55b7781454efd706a6a",
+    "d9a906c0e692b22e1b4414e71a704b7e",
+    "47d4cadd56f70c11ff8f3e5d8df81161",
+
"de997744cf24c16c5ac2a36b02b351cc", + "23781211ae178ddeb6c4bb97a6bd7d83", + "a79d2e28340ca34b9e37daabbf030f63", + "0372bd3ddfc258750a6ac106b70587f4", + "228ef625d9460cbf6fa253a16a730976", + }, + { + // 4X8 + "22d519b796d59644043466320e4ccd14", + "09513a738c49b3f9542d27f34abbe1d5", + "807ae5e8813443ff01e71be6efacfb69", + "cbfa18d0293430b6e9708b0be1fd2394", + "346c354c34ec7fa780b576db355dab88", + "f97dae85c35359632380b09ca98d611e", + "698ae351d8896d89ed9e4e67b6e53eda", + "dcc197034a9c45a3d8238bf085835f4e", + "7a35e2c42ffdc2efc2d6d1d75a100fc7", + "41ab6cebd4516c87a91b2a593e2c2506", + }, + { + // 8X4 + "d58cd4c4bf3b7bbaa5db5e1a5622ec78", + "6e572c35aa782d00cafcb99e9ea047ea", + "e8c22a3702b416dc9ab974505afbed09", + "aaa4e4762a795aad7ad74de0c662c4e4", + "a19f9101967383c3dcbd516dc317a291", + "9ab8cb91f1a595b9ebe3fe8de58031aa", + "2cf9021d5f1169268699807ee118b65f", + "ee9605fcbd6fb871f1c5cd81a6989327", + "b4871af8316089e3e23522175df7e93f", + "d33301e1c2cb173be46792a22d19881a", + }, + { + // 8X16 + "4562de1d0336610880fdd5685498a9ec", + "16310fa7076394f16fc85c4b149d89c9", + "0e94af88e1dc573b6f0f499cddd1f530", + "dfd245ee20d091c67809160340365aa9", + "d3562504327f70c096c5be23fd8a3747", + "601b853558502acbb5135eadd2da117a", + "3c624345a723a1b2b1bea05a6a08bc99", + "2a9c781de609e0184cc7ab442050f4e5", + "0ddc5035c22252747126b61fc238c74d", + "e43f5d83bab759af69c7b6773fc8f9b2", + }, + { + // 16X8 + "a57d6b5a9bfd30c29591d8717ace9c51", + "f5907ba97ee6c53e339e953fc8d845ee", + "ea3aa727913ce45af06f89dd1808db5f", + "408af4f23e48d14b48ee35ae094fcd18", + "85c41cbcb5d744f7961e8950026fbffe", + "8a4e588a837638887ba671f8d4910485", + "b792d8826b67a21757ea7097cff9e05b", + "f94ce7101bb87fd3bb9312112527dbf4", + "688c6660a6dc6fa61fa1aa38e708c209", + "0cdf641b4f81d69509c92ae0b93ef5ff", + }, + { + // 16X32 + "aee4b3b0e3cc02d48e2c40d77f807927", + "8baef2b2e789f79c8df9d90ad10f34a4", + "038c38ee3c4f090bb8d736eab136aafc", + "1a3de2aaeaffd68a9fd6c7f6557b83f3", + "385c6e0ea29421dd81011a2934641e26", + "6cf96c285d1a2d4787f955dad715b08c", + "2d7f75dcd73b9528c8396279ff09ff3a", + "5a63cd1841e4ed470e4ca5ef845f2281", + "610d899ca945fbead33287d4335a8b32", + "6bafaad81fce37be46730187e78d8b11", + }, + { + // 32X16 + "290b23c9f5a1de7905bfa71a942da29b", + "701e7b82593c66da5052fc4b6afd79ce", + "4da828c5455cd246735a663fbb204989", + "e3fbeaf234efece8dbd752b77226200c", + "4d1d8c969f05155a7e7e84cf7aad021b", + "c22e4877c2c946d5bdc0d542e29e70cf", + "8ac1ce815e7780500f842b0beb0bb980", + "9fee2e2502b507f25bfad30a55b0b610", + "4ced9c212ec6f9956e27f68a91b59fef", + "4a7a0b93f138bb0863e4e465b01ec0b1", + }, + { + // 32X64 + "ad9cfc395a5c5644a21d958c7274ac14", + "f29d6d03c143ddf96fef04c19f2c8333", + "a8bdc852ef704dd4975c61893e8fbc3f", + "7d0bd7dea26226741dbca9a97f27fa74", + "45c27c5cca9a91b6ae8379feb0881c9f", + "8a0b78df1e001b85c874d686eac4aa1b", + "ce9fa75fac54a3f6c0cc3f2083b938f1", + "c0dca10d88762c954af18dc9e3791a39", + "61df229eddfccab913b8fda4bb02f9ac", + "4f4df6bc8d50a5600b573f0e44d70e66", + }, + { + // 64X32 + "db9d82921fd88b24fdff6f849f2f9c87", + "5ecc7fdc52d2f575ad4f2d0e9e6b1e11", + "b4581311a0a73d95dfac7f8f44591032", + "68bd283cfd1a125f6b2ee47cee874d36", + "804179f05c032908a5e36077bb87c994", + "fc5fd041a8ee779015394d0c066ee43c", + "68f5579ccadfe9a1baafb158334a3db2", + "fe237e45e215ab06d79046da9ad71e84", + "9a8a938a6824551bf7d21b8fd1d70ea1", + "eb7332f2017cd96882c76e7136aeaf53", + }, + { + // 4X16 + "7bafa307d507747b8132e7735b7f1c73", + "e58bc2d8213a97d1fea9cfb73d7a9633", + "435f8a8e8bbf14dbf2fe16b2be9e97aa", + "1d0e767b68d84acbfb50b7a04e633836", + 
"5f713bd7b324fe73bb7063e35ee14e5e", + "0dac4e1fa3d59814202715468c01ed56", + "47709d1db4a330c7a8900f450e6fddd1", + "258e0b930bb27db28f05da9cf7d1ee7c", + "36cf030fbae767912593efea045bfff5", + "248d7aceabb7499febae663fae41a920", + }, + { + // 16X4 + "04dde98e632670e393704742c89f9067", + "8c72543f1664651ae1fa08e2ac0adb9b", + "2354a2cdc2773aa2df8ab4010db1be39", + "6300ad3221c26da39b10e0e6d87ee3be", + "8ea30b661c6ba60b28d3167f19e449b8", + "fb6c1e4ff101a371cede63c2955cdb7e", + "a517c06433d6d7927b16a72184a23e92", + "393828be5d62ab6c48668bea5e2f801a", + "b1e510c542013eb9d6fb188dea2ce90a", + "569a8f2fe01679ca216535ecbcdccb62", + }, + { + // 8X32 + "9d541865c185ca7607852852613ac1fc", + "b96be67f08c6b5fa5ebd3411299c2f7c", + "75a2dcf50004b9d188849b048239767e", + "429492ff415c9fd9b050d73b2ad500f8", + "64b3606c1ccd036bd766bd5711392cf4", + "cb59844a0f01660ac955bae3511f1100", + "3e076155b7a70e8828618e3f33b51e3d", + "ed2d1f597ab7c50beff690f737cf9726", + "7909c6a26aaf20c59d996d3e5b5f9c29", + "965798807240c98c6f7cc9b457ed0773", + }, + { + // 32X8 + "36f391aa31619eec1f4d9ee95ea454cc", + "b82648f14eeba2527357cb50bc3223cb", + "7a7b2adf429125e8bee9d1d00a66e13f", + "4198e4d6ba503b7cc2d7e96bb845f661", + "96c160d2ec1be9fe0cdea9682f14d257", + "19a450bcebaa75afb4fc6bd1fd6434af", + "2bd2e35967d43d0ec1c6587a36f204d5", + "49799a99aa4ccfbd989bee92a99422f1", + "955530e99813812a74659edeac3f5475", + "f0316b84e378a19cd11b19a6e40b2914", + }, + { + // 16X64 + "8cba1b70a0bde29e8ef235cedc5faa7d", + "96d00ddc7537bf7f196006591b733b4e", + "cbf69d5d157c9f3355a4757b1d6e3414", + "3ac1f642019493dec1b737d7a3a1b4e5", + "35f9ee300d7fa3c97338e81a6f21dcd4", + "aae335442e77c8ebc280f16ea50ba9c7", + "a6140fdac2278644328be094d88731db", + "2df93621b6ff100f7008432d509f4161", + "c77bf5aee39e7ed4a3dd715f816f452a", + "02109bd63557d90225c32a8f1338258e", + }, + { + // 64X16 + "a5e2f9fb685d5f4a048e9a96affd25a4", + "1348f249690d9eefe09d9ad7ead2c801", + "525da4b187acd81b1ff1116b60461141", + "e99d072de858094c98b01bd4a6772634", + "873bfa9dc24693f19721f7c8d527f7d3", + "0acfc6507bd3468e9679efc127d6e4b9", + "57d03f8d079c7264854e22ac1157cfae", + "6c2c4036f70c7d957a9399b5436c0774", + "42b8e4a97b7f8416c72a5148c031c0b1", + "a38a2c5f79993dfae8530e9e25800893", + }, +}; + +} // namespace + +#define HIGHBD_INTRA_PRED_TEST(arch, tx_size, dc, dc_left, dc_top, dc_128, v, \ + h, paeth, smooth, smooth_v, smooth_h) \ + TEST(arch, DISABLED_##TestHighbdIntraPred_##tx_size) { \ + static const AvxHighbdPredFunc aom_intra_pred[] = { \ + dc, dc_left, dc_top, dc_128, v, h, paeth, smooth, smooth_v, smooth_h \ + }; \ + TestHighbdIntraPred(tx_size, aom_intra_pred, kHighbdSignatures[tx_size]); \ + } + +// ----------------------------------------------------------------------------- +// 4x4, 4x8, 4x16 + +HIGHBD_INTRA_PRED_TEST( + C_1, TX_4X4, aom_highbd_dc_predictor_4x4_c, + aom_highbd_dc_left_predictor_4x4_c, aom_highbd_dc_top_predictor_4x4_c, + aom_highbd_dc_128_predictor_4x4_c, aom_highbd_v_predictor_4x4_c, + aom_highbd_h_predictor_4x4_c, aom_highbd_paeth_predictor_4x4_c, + aom_highbd_smooth_predictor_4x4_c, aom_highbd_smooth_v_predictor_4x4_c, + aom_highbd_smooth_h_predictor_4x4_c) + +HIGHBD_INTRA_PRED_TEST( + C_2, TX_4X8, aom_highbd_dc_predictor_4x8_c, + aom_highbd_dc_left_predictor_4x8_c, aom_highbd_dc_top_predictor_4x8_c, + aom_highbd_dc_128_predictor_4x8_c, aom_highbd_v_predictor_4x8_c, + aom_highbd_h_predictor_4x8_c, aom_highbd_paeth_predictor_4x8_c, + aom_highbd_smooth_predictor_4x8_c, aom_highbd_smooth_v_predictor_4x8_c, + aom_highbd_smooth_h_predictor_4x8_c) + 
+HIGHBD_INTRA_PRED_TEST( + C_3, TX_4X16, aom_highbd_dc_predictor_4x16_c, + aom_highbd_dc_left_predictor_4x16_c, aom_highbd_dc_top_predictor_4x16_c, + aom_highbd_dc_128_predictor_4x16_c, aom_highbd_v_predictor_4x16_c, + aom_highbd_h_predictor_4x16_c, aom_highbd_paeth_predictor_4x16_c, + aom_highbd_smooth_predictor_4x16_c, aom_highbd_smooth_v_predictor_4x16_c, + aom_highbd_smooth_h_predictor_4x16_c) + +#if HAVE_SSE2 +HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_4X4, aom_highbd_dc_predictor_4x4_sse2, + aom_highbd_dc_left_predictor_4x4_sse2, + aom_highbd_dc_top_predictor_4x4_sse2, + aom_highbd_dc_128_predictor_4x4_sse2, + aom_highbd_v_predictor_4x4_sse2, + aom_highbd_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL) + +HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_4X8, aom_highbd_dc_predictor_4x8_sse2, + aom_highbd_dc_left_predictor_4x8_sse2, + aom_highbd_dc_top_predictor_4x8_sse2, + aom_highbd_dc_128_predictor_4x8_sse2, + aom_highbd_v_predictor_4x8_sse2, + aom_highbd_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL) +#endif + +// ----------------------------------------------------------------------------- +// 8x8, 8x4, 8x16, 8x32 + +HIGHBD_INTRA_PRED_TEST( + C_1, TX_8X8, aom_highbd_dc_predictor_8x8_c, + aom_highbd_dc_left_predictor_8x8_c, aom_highbd_dc_top_predictor_8x8_c, + aom_highbd_dc_128_predictor_8x8_c, aom_highbd_v_predictor_8x8_c, + aom_highbd_h_predictor_8x8_c, aom_highbd_paeth_predictor_8x8_c, + aom_highbd_smooth_predictor_8x8_c, aom_highbd_smooth_v_predictor_8x8_c, + aom_highbd_smooth_h_predictor_8x8_c) + +HIGHBD_INTRA_PRED_TEST( + C_2, TX_8X4, aom_highbd_dc_predictor_8x4_c, + aom_highbd_dc_left_predictor_8x4_c, aom_highbd_dc_top_predictor_8x4_c, + aom_highbd_dc_128_predictor_8x4_c, aom_highbd_v_predictor_8x4_c, + aom_highbd_h_predictor_8x4_c, aom_highbd_paeth_predictor_8x4_c, + aom_highbd_smooth_predictor_8x4_c, aom_highbd_smooth_v_predictor_8x4_c, + aom_highbd_smooth_h_predictor_8x4_c) + +HIGHBD_INTRA_PRED_TEST( + C_3, TX_8X16, aom_highbd_dc_predictor_8x16_c, + aom_highbd_dc_left_predictor_8x16_c, aom_highbd_dc_top_predictor_8x16_c, + aom_highbd_dc_128_predictor_8x16_c, aom_highbd_v_predictor_8x16_c, + aom_highbd_h_predictor_8x16_c, aom_highbd_paeth_predictor_8x16_c, + aom_highbd_smooth_predictor_8x16_c, aom_highbd_smooth_v_predictor_8x16_c, + aom_highbd_smooth_h_predictor_8x16_c) + +HIGHBD_INTRA_PRED_TEST( + C_4, TX_8X32, aom_highbd_dc_predictor_8x32_c, + aom_highbd_dc_left_predictor_8x32_c, aom_highbd_dc_top_predictor_8x32_c, + aom_highbd_dc_128_predictor_8x32_c, aom_highbd_v_predictor_8x32_c, + aom_highbd_h_predictor_8x32_c, aom_highbd_paeth_predictor_8x32_c, + aom_highbd_smooth_predictor_8x32_c, aom_highbd_smooth_v_predictor_8x32_c, + aom_highbd_smooth_h_predictor_8x32_c) + +#if HAVE_SSE2 +HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_8X8, aom_highbd_dc_predictor_8x8_sse2, + aom_highbd_dc_left_predictor_8x8_sse2, + aom_highbd_dc_top_predictor_8x8_sse2, + aom_highbd_dc_128_predictor_8x8_sse2, + aom_highbd_v_predictor_8x8_sse2, + aom_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL) +HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_8X4, aom_highbd_dc_predictor_8x4_sse2, + aom_highbd_dc_left_predictor_8x4_sse2, + aom_highbd_dc_top_predictor_8x4_sse2, + aom_highbd_dc_128_predictor_8x4_sse2, + aom_highbd_v_predictor_8x4_sse2, + aom_highbd_h_predictor_8x4_sse2, NULL, NULL, NULL, NULL) +HIGHBD_INTRA_PRED_TEST(SSE2_3, TX_8X16, aom_highbd_dc_predictor_8x16_sse2, + aom_highbd_dc_left_predictor_8x16_sse2, + aom_highbd_dc_top_predictor_8x16_sse2, + aom_highbd_dc_128_predictor_8x16_sse2, + aom_highbd_v_predictor_8x16_sse2, + 
aom_highbd_h_predictor_8x16_sse2, NULL, NULL, NULL, NULL) +#endif + +#if HAVE_SSSE3 +HIGHBD_INTRA_PRED_TEST(SSSE3, TX_8X8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL) +#endif + +// ----------------------------------------------------------------------------- +// 16x16, 16x8, 16x32, 16x4, 16x64 + +HIGHBD_INTRA_PRED_TEST( + C_1, TX_16X16, aom_highbd_dc_predictor_16x16_c, + aom_highbd_dc_left_predictor_16x16_c, aom_highbd_dc_top_predictor_16x16_c, + aom_highbd_dc_128_predictor_16x16_c, aom_highbd_v_predictor_16x16_c, + aom_highbd_h_predictor_16x16_c, aom_highbd_paeth_predictor_16x16_c, + aom_highbd_smooth_predictor_16x16_c, aom_highbd_smooth_v_predictor_16x16_c, + aom_highbd_smooth_h_predictor_16x16_c) + +HIGHBD_INTRA_PRED_TEST( + C_2, TX_16X8, aom_highbd_dc_predictor_16x8_c, + aom_highbd_dc_left_predictor_16x8_c, aom_highbd_dc_top_predictor_16x8_c, + aom_highbd_dc_128_predictor_16x8_c, aom_highbd_v_predictor_16x8_c, + aom_highbd_h_predictor_16x8_c, aom_highbd_paeth_predictor_16x8_c, + aom_highbd_smooth_predictor_16x8_c, aom_highbd_smooth_v_predictor_16x8_c, + aom_highbd_smooth_h_predictor_16x8_c) + +HIGHBD_INTRA_PRED_TEST( + C_3, TX_16X32, aom_highbd_dc_predictor_16x32_c, + aom_highbd_dc_left_predictor_16x32_c, aom_highbd_dc_top_predictor_16x32_c, + aom_highbd_dc_128_predictor_16x32_c, aom_highbd_v_predictor_16x32_c, + aom_highbd_h_predictor_16x32_c, aom_highbd_paeth_predictor_16x32_c, + aom_highbd_smooth_predictor_16x32_c, aom_highbd_smooth_v_predictor_16x32_c, + aom_highbd_smooth_h_predictor_16x32_c) + +HIGHBD_INTRA_PRED_TEST( + C_4, TX_16X4, aom_highbd_dc_predictor_16x4_c, + aom_highbd_dc_left_predictor_16x4_c, aom_highbd_dc_top_predictor_16x4_c, + aom_highbd_dc_128_predictor_16x4_c, aom_highbd_v_predictor_16x4_c, + aom_highbd_h_predictor_16x4_c, aom_highbd_paeth_predictor_16x4_c, + aom_highbd_smooth_predictor_16x4_c, aom_highbd_smooth_v_predictor_16x4_c, + aom_highbd_smooth_h_predictor_16x4_c) + +HIGHBD_INTRA_PRED_TEST( + C_5, TX_16X64, aom_highbd_dc_predictor_16x64_c, + aom_highbd_dc_left_predictor_16x64_c, aom_highbd_dc_top_predictor_16x64_c, + aom_highbd_dc_128_predictor_16x64_c, aom_highbd_v_predictor_16x64_c, + aom_highbd_h_predictor_16x64_c, aom_highbd_paeth_predictor_16x64_c, + aom_highbd_smooth_predictor_16x64_c, aom_highbd_smooth_v_predictor_16x64_c, + aom_highbd_smooth_h_predictor_16x64_c) + +#if HAVE_SSE2 +HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_16X16, aom_highbd_dc_predictor_16x16_sse2, + aom_highbd_dc_left_predictor_16x16_sse2, + aom_highbd_dc_top_predictor_16x16_sse2, + aom_highbd_dc_128_predictor_16x16_sse2, + aom_highbd_v_predictor_16x16_sse2, + aom_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL, + NULL) +HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_16X8, aom_highbd_dc_predictor_16x8_sse2, + aom_highbd_dc_left_predictor_16x8_sse2, + aom_highbd_dc_top_predictor_16x8_sse2, + aom_highbd_dc_128_predictor_16x8_sse2, + aom_highbd_v_predictor_16x8_sse2, + aom_highbd_h_predictor_16x8_sse2, NULL, NULL, NULL, NULL) +HIGHBD_INTRA_PRED_TEST(SSE2_3, TX_16X32, aom_highbd_dc_predictor_16x32_sse2, + aom_highbd_dc_left_predictor_16x32_sse2, + aom_highbd_dc_top_predictor_16x32_sse2, + aom_highbd_dc_128_predictor_16x32_sse2, + aom_highbd_v_predictor_16x32_sse2, + aom_highbd_h_predictor_16x32_sse2, NULL, NULL, NULL, + NULL) +#endif + +#if HAVE_SSSE3 +HIGHBD_INTRA_PRED_TEST(SSSE3_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL) +#endif + +#if HAVE_AVX2 +HIGHBD_INTRA_PRED_TEST(AVX2_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL) + 
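// Editorial note, not in the original source: an all-NULL row such as the
// AVX2 registrations here still generates the (disabled) speed test, but the
// NULL check in TestHighbdIntraPred() skips every slot, so nothing is
// measured until AVX2 versions of these predictors are wired in.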
+HIGHBD_INTRA_PRED_TEST(AVX2_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL) + +HIGHBD_INTRA_PRED_TEST(AVX2_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL) +#endif + +// ----------------------------------------------------------------------------- +// 32x32, 32x16, 32x64, 32x8 + +HIGHBD_INTRA_PRED_TEST( + C_1, TX_32X32, aom_highbd_dc_predictor_32x32_c, + aom_highbd_dc_left_predictor_32x32_c, aom_highbd_dc_top_predictor_32x32_c, + aom_highbd_dc_128_predictor_32x32_c, aom_highbd_v_predictor_32x32_c, + aom_highbd_h_predictor_32x32_c, aom_highbd_paeth_predictor_32x32_c, + aom_highbd_smooth_predictor_32x32_c, aom_highbd_smooth_v_predictor_32x32_c, + aom_highbd_smooth_h_predictor_32x32_c) + +HIGHBD_INTRA_PRED_TEST( + C_2, TX_32X16, aom_highbd_dc_predictor_32x16_c, + aom_highbd_dc_left_predictor_32x16_c, aom_highbd_dc_top_predictor_32x16_c, + aom_highbd_dc_128_predictor_32x16_c, aom_highbd_v_predictor_32x16_c, + aom_highbd_h_predictor_32x16_c, aom_highbd_paeth_predictor_32x16_c, + aom_highbd_smooth_predictor_32x16_c, aom_highbd_smooth_v_predictor_32x16_c, + aom_highbd_smooth_h_predictor_32x16_c) + +HIGHBD_INTRA_PRED_TEST( + C_3, TX_32X64, aom_highbd_dc_predictor_32x64_c, + aom_highbd_dc_left_predictor_32x64_c, aom_highbd_dc_top_predictor_32x64_c, + aom_highbd_dc_128_predictor_32x64_c, aom_highbd_v_predictor_32x64_c, + aom_highbd_h_predictor_32x64_c, aom_highbd_paeth_predictor_32x64_c, + aom_highbd_smooth_predictor_32x64_c, aom_highbd_smooth_v_predictor_32x64_c, + aom_highbd_smooth_h_predictor_32x64_c) + +HIGHBD_INTRA_PRED_TEST( + C_4, TX_32X8, aom_highbd_dc_predictor_32x8_c, + aom_highbd_dc_left_predictor_32x8_c, aom_highbd_dc_top_predictor_32x8_c, + aom_highbd_dc_128_predictor_32x8_c, aom_highbd_v_predictor_32x8_c, + aom_highbd_h_predictor_32x8_c, aom_highbd_paeth_predictor_32x8_c, + aom_highbd_smooth_predictor_32x8_c, aom_highbd_smooth_v_predictor_32x8_c, + aom_highbd_smooth_h_predictor_32x8_c) + +#if HAVE_SSE2 +HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_32X32, aom_highbd_dc_predictor_32x32_sse2, + aom_highbd_dc_left_predictor_32x32_sse2, + aom_highbd_dc_top_predictor_32x32_sse2, + aom_highbd_dc_128_predictor_32x32_sse2, + aom_highbd_v_predictor_32x32_sse2, + aom_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL, + NULL) +HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_32X16, aom_highbd_dc_predictor_32x16_sse2, + aom_highbd_dc_left_predictor_32x16_sse2, + aom_highbd_dc_top_predictor_32x16_sse2, + aom_highbd_dc_128_predictor_32x16_sse2, + aom_highbd_v_predictor_32x16_sse2, + aom_highbd_h_predictor_32x16_sse2, NULL, NULL, NULL, + NULL) +#endif + +#if HAVE_SSSE3 +HIGHBD_INTRA_PRED_TEST(SSSE3_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL) +#endif + +#if HAVE_AVX2 +HIGHBD_INTRA_PRED_TEST(AVX2_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL) + +HIGHBD_INTRA_PRED_TEST(AVX2_2, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL) +#endif + +// ----------------------------------------------------------------------------- +// 64x64, 64x32, 64x16 + +HIGHBD_INTRA_PRED_TEST( + C_1, TX_64X64, aom_highbd_dc_predictor_64x64_c, + aom_highbd_dc_left_predictor_64x64_c, aom_highbd_dc_top_predictor_64x64_c, + aom_highbd_dc_128_predictor_64x64_c, aom_highbd_v_predictor_64x64_c, + aom_highbd_h_predictor_64x64_c, aom_highbd_paeth_predictor_64x64_c, + aom_highbd_smooth_predictor_64x64_c, aom_highbd_smooth_v_predictor_64x64_c, + aom_highbd_smooth_h_predictor_64x64_c) + +HIGHBD_INTRA_PRED_TEST( + C_2, TX_64X32, 
aom_highbd_dc_predictor_64x32_c,
+    aom_highbd_dc_left_predictor_64x32_c, aom_highbd_dc_top_predictor_64x32_c,
+    aom_highbd_dc_128_predictor_64x32_c, aom_highbd_v_predictor_64x32_c,
+    aom_highbd_h_predictor_64x32_c, aom_highbd_paeth_predictor_64x32_c,
+    aom_highbd_smooth_predictor_64x32_c, aom_highbd_smooth_v_predictor_64x32_c,
+    aom_highbd_smooth_h_predictor_64x32_c)
+
+HIGHBD_INTRA_PRED_TEST(
+    C_3, TX_64X16, aom_highbd_dc_predictor_64x16_c,
+    aom_highbd_dc_left_predictor_64x16_c, aom_highbd_dc_top_predictor_64x16_c,
+    aom_highbd_dc_128_predictor_64x16_c, aom_highbd_v_predictor_64x16_c,
+    aom_highbd_h_predictor_64x16_c, aom_highbd_paeth_predictor_64x16_c,
+    aom_highbd_smooth_predictor_64x16_c, aom_highbd_smooth_v_predictor_64x16_c,
+    aom_highbd_smooth_h_predictor_64x16_c)
+
+// -----------------------------------------------------------------------------
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#include "test/test_libaom.cc"
diff --git a/libs/libaom/src/test/test_libaom.cc b/libs/libaom/src/test/test_libaom.cc
new file mode 100644
index 000000000..b55d76237
--- /dev/null
+++ b/libs/libaom/src/test/test_libaom.cc
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#if ARCH_X86 || ARCH_X86_64
+#include "aom_ports/x86.h"
+#endif
+extern "C" {
+extern void av1_rtcd();
+extern void aom_dsp_rtcd();
+extern void aom_scale_rtcd();
+}
+
+#if ARCH_X86 || ARCH_X86_64
+static void append_negative_gtest_filter(const char *str) {
+  std::string filter = ::testing::FLAGS_gtest_filter;
+  // Negative patterns begin with one '-' followed by a ':' separated list.
+  if (filter.find('-') == std::string::npos) filter += '-';
+  // OPT.* matches TEST() functions
+  // OPT/* matches TEST_P() functions
+  // OPT_* matches tests which have been manually sharded.
+  // We do not match OPT* because of SSE/SSE2 collisions.
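// Worked example (editorial, not in the original source): with str = "SSE2"
// the loop below appends ":SSE2.*:SSE2/*:SSE2_*" to the negative half of
// --gtest_filter, so SSE2-suffixed tests are skipped on CPUs without SSE2
// while plain SSE tests are left alone.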
+  const char *search_terminators = "./_";
+  for (size_t pos = 0; pos < strlen(search_terminators); ++pos) {
+    filter += ":";
+    filter += str;
+    filter += search_terminators[pos];
+    filter += "*";
+  }
+  ::testing::FLAGS_gtest_filter = filter;
+}
+#endif // ARCH_X86 || ARCH_X86_64
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+#if ARCH_X86 || ARCH_X86_64
+  const int simd_caps = x86_simd_caps();
+  if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter("MMX");
+  if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter("SSE");
+  if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter("SSE2");
+  if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter("SSE3");
+  if (!(simd_caps & HAS_SSSE3)) append_negative_gtest_filter("SSSE3");
+  if (!(simd_caps & HAS_SSE4_1)) append_negative_gtest_filter("SSE4_1");
+  if (!(simd_caps & HAS_SSE4_2)) append_negative_gtest_filter("SSE4_2");
+  if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter("AVX");
+  if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter("AVX2");
+#endif // ARCH_X86 || ARCH_X86_64
+
+// Shared library builds don't support whitebox tests that exercise internal
+// symbols.
+#if !CONFIG_SHARED
+  av1_rtcd();
+  aom_dsp_rtcd();
+  aom_scale_rtcd();
+#endif // !CONFIG_SHARED
+
+  return RUN_ALL_TESTS();
+}
diff --git a/libs/libaom/src/test/test_runner.cmake b/libs/libaom/src/test/test_runner.cmake
new file mode 100644
index 000000000..f0648d16b
--- /dev/null
+++ b/libs/libaom/src/test/test_runner.cmake
@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(NOT GTEST_TOTAL_SHARDS
+   OR "${GTEST_SHARD_INDEX}" STREQUAL ""
+   OR NOT TEST_LIBAOM)
+  message(
+    FATAL_ERROR
+      "The variables GTEST_SHARD_INDEX, GTEST_TOTAL_SHARDS and TEST_LIBAOM
+      must be defined.")
+endif()
+
+set(ENV{GTEST_SHARD_INDEX} ${GTEST_SHARD_INDEX})
+set(ENV{GTEST_TOTAL_SHARDS} ${GTEST_TOTAL_SHARDS})
+execute_process(COMMAND ${TEST_LIBAOM} RESULT_VARIABLE test_result)
+set(test_message "Test shard ${GTEST_SHARD_INDEX}/${GTEST_TOTAL_SHARDS} result")
+message("${test_message}: ${test_result}")
+
+if(NOT "${test_result}" STREQUAL "0")
+  message(FATAL_ERROR "${test_message}: FAILED, non-zero exit code.")
+endif()
diff --git a/libs/libaom/src/test/test_vector_test.cc b/libs/libaom/src/test/test_vector_test.cc
new file mode 100644
index 000000000..eab92b685
--- /dev/null
+++ b/libs/libaom/src/test/test_vector_test.cc
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <set>
+#include <string>
+#include <tuple>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "common/tools_common.h"
+#include "config/aom_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+
+namespace {
+
+const int kThreads = 0;
+const int kFileName = 1;
+const int kRowMT = 2;
+
+typedef std::tuple<int, const char *, int> DecodeParam;
+
+class TestVectorTest : public ::libaom_test::DecoderTest,
+                       public ::libaom_test::CodecTestWithParam<DecodeParam> {
+ protected:
+  TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(NULL) {}
+
+  virtual ~TestVectorTest() {
+    if (md5_file_) fclose(md5_file_);
+  }
+
+  void OpenMD5File(const std::string &md5_file_name_) {
+    md5_file_ = libaom_test::OpenTestDataFile(md5_file_name_);
+    ASSERT_TRUE(md5_file_ != NULL)
+        << "Md5 file open failed. Filename: " << md5_file_name_;
+  }
+
+  virtual void PreDecodeFrameHook(
+      const libaom_test::CompressedVideoSource &video,
+      libaom_test::Decoder *decoder) {
+    if (video.frame_number() == 0) decoder->Control(AV1D_SET_ROW_MT, row_mt_);
+  }
+
+  virtual void DecompressedFrameHook(const aom_image_t &img,
+                                     const unsigned int frame_number) {
+    ASSERT_TRUE(md5_file_ != NULL);
+    char expected_md5[33];
+    char junk[128];
+
+    // Read correct md5 checksums.
+    const int res = fscanf(md5_file_, "%s %s", expected_md5, junk);
+    ASSERT_NE(res, EOF) << "Read md5 data failed";
+    expected_md5[32] = '\0';
+
+    ::libaom_test::MD5 md5_res;
+#if FORCE_HIGHBITDEPTH_DECODING
+    const aom_img_fmt_t shifted_fmt =
+        (aom_img_fmt)(img.fmt & ~AOM_IMG_FMT_HIGHBITDEPTH);
+    if (img.bit_depth == 8 && shifted_fmt != img.fmt) {
+      aom_image_t *img_shifted =
+          aom_img_alloc(NULL, shifted_fmt, img.d_w, img.d_h, 16);
+      img_shifted->bit_depth = img.bit_depth;
+      img_shifted->monochrome = img.monochrome;
+      aom_img_downshift(img_shifted, &img, 0);
+      md5_res.Add(img_shifted);
+      aom_img_free(img_shifted);
+    } else {
+#endif
+      md5_res.Add(&img);
+#if FORCE_HIGHBITDEPTH_DECODING
+    }
+#endif
+
+    const char *actual_md5 = md5_res.Get();
+    // Check md5 match.
+    ASSERT_STREQ(expected_md5, actual_md5)
+        << "Md5 checksums don't match: frame number = " << frame_number;
+  }
+
+  unsigned int row_mt_;
+
+ private:
+  FILE *md5_file_;
+};
+
+// This test runs through the whole set of test vectors, and decodes them.
+// The md5 checksums are computed for each frame in the video file. If md5
+// checksums match the correct md5 data, then the test passes. Otherwise,
+// the test fails.
+TEST_P(TestVectorTest, MD5Match) {
+  const DecodeParam input = GET_PARAM(1);
+  const std::string filename = std::get<kFileName>(input);
+  aom_codec_flags_t flags = 0;
+  aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+  char str[256];
+
+  cfg.threads = std::get<kThreads>(input);
+  row_mt_ = std::get<kRowMT>(input);
+
+  snprintf(str, sizeof(str) / sizeof(str[0]) - 1, "file: %s threads: %d",
+           filename.c_str(), cfg.threads);
+  SCOPED_TRACE(str);
+
+  // Open compressed video file.
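+  // Editorial note, not in the original source: the extension checks below
+  // select the IVF reader for .ivf inputs and the WebM reader for .webm/.mkv,
+  // and the reference checksums are read from a sibling "<vector>.md5" file.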
+  std::unique_ptr<libaom_test::CompressedVideoSource> video;
+  if (filename.substr(filename.length() - 3, 3) == "ivf") {
+    video.reset(new libaom_test::IVFVideoSource(filename));
+  } else if (filename.substr(filename.length() - 4, 4) == "webm" ||
+             filename.substr(filename.length() - 3, 3) == "mkv") {
+#if CONFIG_WEBM_IO
+    video.reset(new libaom_test::WebMVideoSource(filename));
+#else
+    fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
+            filename.c_str());
+    return;
+#endif
+  }
+  ASSERT_TRUE(video.get() != NULL);
+  video->Init();
+
+  // Construct md5 file name.
+  const std::string md5_filename = filename + ".md5";
+  OpenMD5File(md5_filename);
+
+  // Set decode config and flags.
+  cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
+  set_cfg(cfg);
+  set_flags(flags);
+
+  // Decode frame, and check the md5 matching.
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg));
+}
+
+#if CONFIG_AV1_DECODER
+AV1_INSTANTIATE_TEST_CASE(
+    TestVectorTest,
+    ::testing::Combine(::testing::Values(1),  // Single thread.
+                       ::testing::ValuesIn(libaom_test::kAV1TestVectors,
+                                           libaom_test::kAV1TestVectors +
+                                               libaom_test::kNumAV1TestVectors),
+                       ::testing::Values(0)));
+
+// Test AV1 decode with different numbers of threads.
+INSTANTIATE_TEST_SUITE_P(
+    AV1MultiThreaded, TestVectorTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)),
+        ::testing::Combine(
+            ::testing::Range(2, 9),  // With 2 ~ 8 threads.
+            ::testing::ValuesIn(libaom_test::kAV1TestVectors,
+                                libaom_test::kAV1TestVectors +
+                                    libaom_test::kNumAV1TestVectors),
+            ::testing::Range(0, 2))));
+
+#endif  // CONFIG_AV1_DECODER
+
+}  // namespace
diff --git a/libs/libaom/src/test/test_vectors.cc b/libs/libaom/src/test/test_vectors.cc
new file mode 100644
index 000000000..991667a08
--- /dev/null
+++ b/libs/libaom/src/test/test_vectors.cc
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "test/test_vectors.h"
+
+namespace libaom_test {
+
+#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
+
+#if CONFIG_AV1_DECODER
+const char *const kAV1TestVectors[] = { "av1-1-b8-00-quantizer-00.ivf",
+                                        "av1-1-b8-00-quantizer-01.ivf",
+                                        "av1-1-b8-00-quantizer-02.ivf",
+                                        "av1-1-b8-00-quantizer-03.ivf",
+                                        "av1-1-b8-00-quantizer-04.ivf",
+                                        "av1-1-b8-00-quantizer-05.ivf",
+                                        "av1-1-b8-00-quantizer-06.ivf",
+                                        "av1-1-b8-00-quantizer-07.ivf",
+                                        "av1-1-b8-00-quantizer-08.ivf",
+                                        "av1-1-b8-00-quantizer-09.ivf",
+                                        "av1-1-b8-00-quantizer-10.ivf",
+                                        "av1-1-b8-00-quantizer-11.ivf",
+                                        "av1-1-b8-00-quantizer-12.ivf",
+                                        "av1-1-b8-00-quantizer-13.ivf",
+                                        "av1-1-b8-00-quantizer-14.ivf",
+                                        "av1-1-b8-00-quantizer-15.ivf",
+                                        "av1-1-b8-00-quantizer-16.ivf",
+                                        "av1-1-b8-00-quantizer-17.ivf",
+                                        "av1-1-b8-00-quantizer-18.ivf",
+                                        "av1-1-b8-00-quantizer-19.ivf",
+                                        "av1-1-b8-00-quantizer-20.ivf",
+                                        "av1-1-b8-00-quantizer-21.ivf",
+                                        "av1-1-b8-00-quantizer-22.ivf",
+                                        "av1-1-b8-00-quantizer-23.ivf",
+                                        "av1-1-b8-00-quantizer-24.ivf",
+                                        "av1-1-b8-00-quantizer-25.ivf",
+                                        "av1-1-b8-00-quantizer-26.ivf",
+                                        "av1-1-b8-00-quantizer-27.ivf",
+                                        "av1-1-b8-00-quantizer-28.ivf",
+                                        "av1-1-b8-00-quantizer-29.ivf",
+                                        "av1-1-b8-00-quantizer-30.ivf",
+                                        "av1-1-b8-00-quantizer-31.ivf",
+                                        "av1-1-b8-00-quantizer-32.ivf",
+                                        "av1-1-b8-00-quantizer-33.ivf",
+                                        "av1-1-b8-00-quantizer-34.ivf",
+                                        "av1-1-b8-00-quantizer-35.ivf",
+                                        "av1-1-b8-00-quantizer-36.ivf",
+                                        "av1-1-b8-00-quantizer-37.ivf",
+                                        "av1-1-b8-00-quantizer-38.ivf",
+                                        "av1-1-b8-00-quantizer-39.ivf",
+                                        "av1-1-b8-00-quantizer-40.ivf",
+                                        "av1-1-b8-00-quantizer-41.ivf",
+                                        "av1-1-b8-00-quantizer-42.ivf",
+                                        "av1-1-b8-00-quantizer-43.ivf",
+                                        "av1-1-b8-00-quantizer-44.ivf",
+                                        "av1-1-b8-00-quantizer-45.ivf",
+                                        "av1-1-b8-00-quantizer-46.ivf",
+                                        "av1-1-b8-00-quantizer-47.ivf",
+                                        "av1-1-b8-00-quantizer-48.ivf",
+                                        "av1-1-b8-00-quantizer-49.ivf",
+                                        "av1-1-b8-00-quantizer-50.ivf",
+                                        "av1-1-b8-00-quantizer-51.ivf",
+                                        "av1-1-b8-00-quantizer-52.ivf",
+                                        "av1-1-b8-00-quantizer-53.ivf",
+                                        "av1-1-b8-00-quantizer-54.ivf",
+                                        "av1-1-b8-00-quantizer-55.ivf",
+                                        "av1-1-b8-00-quantizer-56.ivf",
+                                        "av1-1-b8-00-quantizer-57.ivf",
+                                        "av1-1-b8-00-quantizer-58.ivf",
+                                        "av1-1-b8-00-quantizer-59.ivf",
+                                        "av1-1-b8-00-quantizer-60.ivf",
+                                        "av1-1-b8-00-quantizer-61.ivf",
+                                        "av1-1-b8-00-quantizer-62.ivf",
+                                        "av1-1-b8-00-quantizer-63.ivf",
+#if CONFIG_AV1_HIGHBITDEPTH
+                                        "av1-1-b10-00-quantizer-00.ivf",
+                                        "av1-1-b10-00-quantizer-01.ivf",
+                                        "av1-1-b10-00-quantizer-02.ivf",
+                                        "av1-1-b10-00-quantizer-03.ivf",
+                                        "av1-1-b10-00-quantizer-04.ivf",
+                                        "av1-1-b10-00-quantizer-05.ivf",
+                                        "av1-1-b10-00-quantizer-06.ivf",
+                                        "av1-1-b10-00-quantizer-07.ivf",
+                                        "av1-1-b10-00-quantizer-08.ivf",
+                                        "av1-1-b10-00-quantizer-09.ivf",
+                                        "av1-1-b10-00-quantizer-10.ivf",
+                                        "av1-1-b10-00-quantizer-11.ivf",
+                                        "av1-1-b10-00-quantizer-12.ivf",
+                                        "av1-1-b10-00-quantizer-13.ivf",
+                                        "av1-1-b10-00-quantizer-14.ivf",
+                                        "av1-1-b10-00-quantizer-15.ivf",
+                                        "av1-1-b10-00-quantizer-16.ivf",
+                                        "av1-1-b10-00-quantizer-17.ivf",
+                                        "av1-1-b10-00-quantizer-18.ivf",
+                                        "av1-1-b10-00-quantizer-19.ivf",
+                                        "av1-1-b10-00-quantizer-20.ivf",
+                                        "av1-1-b10-00-quantizer-21.ivf",
+                                        "av1-1-b10-00-quantizer-22.ivf",
+                                        "av1-1-b10-00-quantizer-23.ivf",
+                                        "av1-1-b10-00-quantizer-24.ivf",
+                                        "av1-1-b10-00-quantizer-25.ivf",
+                                        "av1-1-b10-00-quantizer-26.ivf",
+                                        "av1-1-b10-00-quantizer-27.ivf",
+                                        "av1-1-b10-00-quantizer-28.ivf",
+                                        "av1-1-b10-00-quantizer-29.ivf",
+                                        "av1-1-b10-00-quantizer-30.ivf",
+                                        "av1-1-b10-00-quantizer-31.ivf",
+                                        "av1-1-b10-00-quantizer-32.ivf",
+
"av1-1-b10-00-quantizer-33.ivf", + "av1-1-b10-00-quantizer-34.ivf", + "av1-1-b10-00-quantizer-35.ivf", + "av1-1-b10-00-quantizer-36.ivf", + "av1-1-b10-00-quantizer-37.ivf", + "av1-1-b10-00-quantizer-38.ivf", + "av1-1-b10-00-quantizer-39.ivf", + "av1-1-b10-00-quantizer-40.ivf", + "av1-1-b10-00-quantizer-41.ivf", + "av1-1-b10-00-quantizer-42.ivf", + "av1-1-b10-00-quantizer-43.ivf", + "av1-1-b10-00-quantizer-44.ivf", + "av1-1-b10-00-quantizer-45.ivf", + "av1-1-b10-00-quantizer-46.ivf", + "av1-1-b10-00-quantizer-47.ivf", + "av1-1-b10-00-quantizer-48.ivf", + "av1-1-b10-00-quantizer-49.ivf", + "av1-1-b10-00-quantizer-50.ivf", + "av1-1-b10-00-quantizer-51.ivf", + "av1-1-b10-00-quantizer-52.ivf", + "av1-1-b10-00-quantizer-53.ivf", + "av1-1-b10-00-quantizer-54.ivf", + "av1-1-b10-00-quantizer-55.ivf", + "av1-1-b10-00-quantizer-56.ivf", + "av1-1-b10-00-quantizer-57.ivf", + "av1-1-b10-00-quantizer-58.ivf", + "av1-1-b10-00-quantizer-59.ivf", + "av1-1-b10-00-quantizer-60.ivf", + "av1-1-b10-00-quantizer-61.ivf", + "av1-1-b10-00-quantizer-62.ivf", + "av1-1-b10-00-quantizer-63.ivf", + "av1-1-b10-23-film_grain-50.ivf", +#endif // CONFIG_AV1_HIGHBITDEPTH + "av1-1-b8-01-size-16x16.ivf", + "av1-1-b8-01-size-16x18.ivf", + "av1-1-b8-01-size-16x32.ivf", + "av1-1-b8-01-size-16x34.ivf", + "av1-1-b8-01-size-16x64.ivf", + "av1-1-b8-01-size-16x66.ivf", + "av1-1-b8-01-size-18x16.ivf", + "av1-1-b8-01-size-18x18.ivf", + "av1-1-b8-01-size-18x32.ivf", + "av1-1-b8-01-size-18x34.ivf", + "av1-1-b8-01-size-18x64.ivf", + "av1-1-b8-01-size-18x66.ivf", + "av1-1-b8-01-size-196x196.ivf", + "av1-1-b8-01-size-196x198.ivf", + "av1-1-b8-01-size-196x200.ivf", + "av1-1-b8-01-size-196x202.ivf", + "av1-1-b8-01-size-196x208.ivf", + "av1-1-b8-01-size-196x210.ivf", + "av1-1-b8-01-size-196x224.ivf", + "av1-1-b8-01-size-196x226.ivf", + "av1-1-b8-01-size-198x196.ivf", + "av1-1-b8-01-size-198x198.ivf", + "av1-1-b8-01-size-198x200.ivf", + "av1-1-b8-01-size-198x202.ivf", + "av1-1-b8-01-size-198x208.ivf", + "av1-1-b8-01-size-198x210.ivf", + "av1-1-b8-01-size-198x224.ivf", + "av1-1-b8-01-size-198x226.ivf", + "av1-1-b8-01-size-200x196.ivf", + "av1-1-b8-01-size-200x198.ivf", + "av1-1-b8-01-size-200x200.ivf", + "av1-1-b8-01-size-200x202.ivf", + "av1-1-b8-01-size-200x208.ivf", + "av1-1-b8-01-size-200x210.ivf", + "av1-1-b8-01-size-200x224.ivf", + "av1-1-b8-01-size-200x226.ivf", + "av1-1-b8-01-size-202x196.ivf", + "av1-1-b8-01-size-202x198.ivf", + "av1-1-b8-01-size-202x200.ivf", + "av1-1-b8-01-size-202x202.ivf", + "av1-1-b8-01-size-202x208.ivf", + "av1-1-b8-01-size-202x210.ivf", + "av1-1-b8-01-size-202x224.ivf", + "av1-1-b8-01-size-202x226.ivf", + "av1-1-b8-01-size-208x196.ivf", + "av1-1-b8-01-size-208x198.ivf", + "av1-1-b8-01-size-208x200.ivf", + "av1-1-b8-01-size-208x202.ivf", + "av1-1-b8-01-size-208x208.ivf", + "av1-1-b8-01-size-208x210.ivf", + "av1-1-b8-01-size-208x224.ivf", + "av1-1-b8-01-size-208x226.ivf", + "av1-1-b8-01-size-210x196.ivf", + "av1-1-b8-01-size-210x198.ivf", + "av1-1-b8-01-size-210x200.ivf", + "av1-1-b8-01-size-210x202.ivf", + "av1-1-b8-01-size-210x208.ivf", + "av1-1-b8-01-size-210x210.ivf", + "av1-1-b8-01-size-210x224.ivf", + "av1-1-b8-01-size-210x226.ivf", + "av1-1-b8-01-size-224x196.ivf", + "av1-1-b8-01-size-224x198.ivf", + "av1-1-b8-01-size-224x200.ivf", + "av1-1-b8-01-size-224x202.ivf", + "av1-1-b8-01-size-224x208.ivf", + "av1-1-b8-01-size-224x210.ivf", + "av1-1-b8-01-size-224x224.ivf", + "av1-1-b8-01-size-224x226.ivf", + "av1-1-b8-01-size-226x196.ivf", + "av1-1-b8-01-size-226x198.ivf", + "av1-1-b8-01-size-226x200.ivf", + 
"av1-1-b8-01-size-226x202.ivf", + "av1-1-b8-01-size-226x208.ivf", + "av1-1-b8-01-size-226x210.ivf", + "av1-1-b8-01-size-226x224.ivf", + "av1-1-b8-01-size-226x226.ivf", + "av1-1-b8-01-size-32x16.ivf", + "av1-1-b8-01-size-32x18.ivf", + "av1-1-b8-01-size-32x32.ivf", + "av1-1-b8-01-size-32x34.ivf", + "av1-1-b8-01-size-32x64.ivf", + "av1-1-b8-01-size-32x66.ivf", + "av1-1-b8-01-size-34x16.ivf", + "av1-1-b8-01-size-34x18.ivf", + "av1-1-b8-01-size-34x32.ivf", + "av1-1-b8-01-size-34x34.ivf", + "av1-1-b8-01-size-34x64.ivf", + "av1-1-b8-01-size-34x66.ivf", + "av1-1-b8-01-size-64x16.ivf", + "av1-1-b8-01-size-64x18.ivf", + "av1-1-b8-01-size-64x32.ivf", + "av1-1-b8-01-size-64x34.ivf", + "av1-1-b8-01-size-64x64.ivf", + "av1-1-b8-01-size-64x66.ivf", + "av1-1-b8-01-size-66x16.ivf", + "av1-1-b8-01-size-66x18.ivf", + "av1-1-b8-01-size-66x32.ivf", + "av1-1-b8-01-size-66x34.ivf", + "av1-1-b8-01-size-66x64.ivf", + "av1-1-b8-01-size-66x66.ivf", + "av1-1-b8-02-allintra.ivf", + "av1-1-b8-03-sizedown.mkv", + "av1-1-b8-03-sizeup.mkv", + "av1-1-b8-04-cdfupdate.ivf", + "av1-1-b8-05-mv.ivf", + "av1-1-b8-06-mfmv.ivf", + "av1-1-b8-22-svc-L1T2.ivf", + "av1-1-b8-22-svc-L2T1.ivf", + "av1-1-b8-22-svc-L2T2.ivf", + "av1-1-b8-23-film_grain-50.ivf" }; +const int kNumAV1TestVectors = NELEMENTS(kAV1TestVectors); +#endif // CONFIG_AV1_DECODER + +} // namespace libaom_test diff --git a/libs/libaom/src/test/test_vectors.h b/libs/libaom/src/test/test_vectors.h new file mode 100644 index 000000000..be37f6e37 --- /dev/null +++ b/libs/libaom/src/test/test_vectors.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_TEST_TEST_VECTORS_H_ +#define AOM_TEST_TEST_VECTORS_H_ + +#include "config/aom_config.h" + +namespace libaom_test { + +#if CONFIG_AV1_DECODER +extern const int kNumAV1TestVectors; +extern const char *const kAV1TestVectors[]; +#endif + +} // namespace libaom_test + +#endif // AOM_TEST_TEST_VECTORS_H_ diff --git a/libs/libaom/src/test/tile_independence_test.cc b/libs/libaom/src/test/tile_independence_test.cc new file mode 100644 index 000000000..4f7c4a475 --- /dev/null +++ b/libs/libaom/src/test/tile_independence_test.cc @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/md5_helper.h"
+#include "aom_mem/aom_mem.h"
+
+namespace {
+class TileIndependenceTest
+    : public ::libaom_test::CodecTestWith3Params<int, int, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  TileIndependenceTest()
+      : EncoderTest(GET_PARAM(0)), md5_fw_order_(), md5_inv_order_(),
+        n_tile_cols_(GET_PARAM(1)), n_tile_rows_(GET_PARAM(2)),
+        n_tile_groups_(GET_PARAM(3)) {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.w = 704;
+    cfg.h = 576;
+    cfg.threads = 1;
+    cfg.allow_lowbitdepth = 1;
+    fw_dec_ = codec_->CreateDecoder(cfg, 0);
+    inv_dec_ = codec_->CreateDecoder(cfg, 0);
+    inv_dec_->Control(AV1_INVERT_TILE_DECODE_ORDER, 1);
+
+    if (fw_dec_->IsAV1() && inv_dec_->IsAV1()) {
+      fw_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+      fw_dec_->Control(AV1_SET_DECODE_TILE_COL, -1);
+      inv_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+      inv_dec_->Control(AV1_SET_DECODE_TILE_COL, -1);
+    }
+  }
+
+  virtual ~TileIndependenceTest() {
+    delete fw_dec_;
+    delete inv_dec_;
+  }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(libaom_test::kTwoPassGood);
+  }
+
+  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                                  libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
+      encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
+      SetCpuUsed(encoder);
+    } else if (video->frame() == 3) {
+      encoder->Control(AV1E_SET_NUM_TG, n_tile_groups_);
+    }
+  }
+
+  virtual void SetCpuUsed(libaom_test::Encoder *encoder) {
+    static const int kCpuUsed = 3;
+    encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+  }
+
+  void UpdateMD5(::libaom_test::Decoder *dec, const aom_codec_cx_pkt_t *pkt,
+                 ::libaom_test::MD5 *md5) {
+    const aom_codec_err_t res = dec->DecodeFrame(
+        reinterpret_cast<const uint8_t *>(pkt->data.frame.buf),
+        pkt->data.frame.sz);
+    if (res != AOM_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(AOM_CODEC_OK, res);
+    }
+    const aom_image_t *img = dec->GetDxData().Next();
+    md5->Add(img);
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    UpdateMD5(fw_dec_, pkt, &md5_fw_order_);
+    UpdateMD5(inv_dec_, pkt, &md5_inv_order_);
+  }
+
+  void DoTest() {
+    const aom_rational timebase = { 33333333, 1000000000 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_target_bitrate = 500;
+    cfg_.g_lag_in_frames = 12;
+    cfg_.rc_end_usage = AOM_VBR;
+
+    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 576,
+                                       timebase.den, timebase.num, 0, 5);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+    const char *md5_fw_str = md5_fw_order_.Get();
+    const char *md5_inv_str = md5_inv_order_.Get();
+    ASSERT_STREQ(md5_fw_str, md5_inv_str);
+  }
+
+  ::libaom_test::MD5 md5_fw_order_, md5_inv_order_;
+  ::libaom_test::Decoder *fw_dec_, *inv_dec_;
+
+ private:
+  int n_tile_cols_;
+  int n_tile_rows_;
+  int n_tile_groups_;
+};
+
+// Run an encode with 2 or 4 tiles, and do the decode both in normal and
+// inverted tile ordering. Ensure that the MD5 of the output in both cases
+// is identical. If so, tiles are considered independent and the test passes.
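// Editorial note, not in the original source: the 0/1 values used to
// instantiate these tests below are log2 tile counts passed to
// AV1E_SET_TILE_COLUMNS and AV1E_SET_TILE_ROWS, so the encoder runs with
// 1 or 2 tile rows and columns, i.e. up to 4 tiles, as the comment above says.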
+TEST_P(TileIndependenceTest, MD5Match) { + cfg_.large_scale_tile = 0; + fw_dec_->Control(AV1_SET_TILE_MODE, 0); + inv_dec_->Control(AV1_SET_TILE_MODE, 0); + DoTest(); +} + +class TileIndependenceTestLarge : public TileIndependenceTest { + virtual void SetCpuUsed(libaom_test::Encoder *encoder) { + static const int kCpuUsed = 0; + encoder->Control(AOME_SET_CPUUSED, kCpuUsed); + } +}; + +TEST_P(TileIndependenceTestLarge, MD5Match) { + cfg_.large_scale_tile = 0; + fw_dec_->Control(AV1_SET_TILE_MODE, 0); + inv_dec_->Control(AV1_SET_TILE_MODE, 0); + DoTest(); +} + +AV1_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Values(0, 1), + ::testing::Values(0, 1), ::testing::Values(1, 2, 4)); +AV1_INSTANTIATE_TEST_CASE(TileIndependenceTestLarge, ::testing::Values(0, 1), + ::testing::Values(0, 1), ::testing::Values(1, 2, 4)); + +class TileIndependenceLSTest : public TileIndependenceTest {}; + +TEST_P(TileIndependenceLSTest, MD5Match) { + cfg_.large_scale_tile = 1; + fw_dec_->Control(AV1_SET_TILE_MODE, 1); + fw_dec_->Control(AV1D_EXT_TILE_DEBUG, 1); + inv_dec_->Control(AV1_SET_TILE_MODE, 1); + inv_dec_->Control(AV1D_EXT_TILE_DEBUG, 1); + DoTest(); +} + +class TileIndependenceLSTestLarge : public TileIndependenceTestLarge {}; + +TEST_P(TileIndependenceLSTestLarge, MD5Match) { + cfg_.large_scale_tile = 1; + fw_dec_->Control(AV1_SET_TILE_MODE, 1); + fw_dec_->Control(AV1D_EXT_TILE_DEBUG, 1); + inv_dec_->Control(AV1_SET_TILE_MODE, 1); + inv_dec_->Control(AV1D_EXT_TILE_DEBUG, 1); + DoTest(); +} + +AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTest, ::testing::Values(6), + ::testing::Values(6), ::testing::Values(1)); +AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTestLarge, ::testing::Values(6), + ::testing::Values(6), ::testing::Values(1)); +} // namespace diff --git a/libs/libaom/src/test/time_stamp_test.cc b/libs/libaom/src/test/time_stamp_test.cc new file mode 100644 index 000000000..679e4da29 --- /dev/null +++ b/libs/libaom/src/test/time_stamp_test.cc @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Test AOM timestamp handling + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/video_source.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +const int kVideoSourceWidth = 320; +const int kVideoSourceHeight = 240; +const int kFramesToEncode = 3; + +// A video source that exposes functions to set the timebase, framerate and +// starting pts. +class DummyTimebaseVideoSource : public ::libaom_test::DummyVideoSource { + public: + // Parameters num and den set the timebase for the video source. 
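+ // For example, DummyTimebaseVideoSource(1, 1000) (a millisecond timebase) + // with the default 30 fps framerate gives FrameDuration() = + // (1000 / 1) / (30 / 1) = 33.33 timebase units, so pts() returns + // 0, 33, 67, 100, ... for successive frames after rounding.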
+ DummyTimebaseVideoSource(int num, int den) + : framerate_numerator_(30), framerate_denominator_(1), starting_pts_(0) { + SetSize(kVideoSourceWidth, kVideoSourceHeight); + set_limit(kFramesToEncode); + timebase_.num = num; + timebase_.den = den; + } + + void SetFramerate(int numerator, int denominator) { + framerate_numerator_ = numerator; + framerate_denominator_ = denominator; + } + + // Returns one frame's duration in timebase units as a double. + double FrameDuration() const { + return (static_cast<double>(timebase_.den) / timebase_.num) / + (static_cast<double>(framerate_numerator_) / framerate_denominator_); + } + + virtual aom_codec_pts_t pts() const { + return static_cast<aom_codec_pts_t>(frame_ * FrameDuration() + + starting_pts_ + 0.5); + } + + virtual unsigned long duration() const { + return static_cast<unsigned long>(FrameDuration() + 0.5); + } + + virtual aom_rational_t timebase() const { return timebase_; } + + void set_starting_pts(int64_t starting_pts) { starting_pts_ = starting_pts; } + + private: + aom_rational_t timebase_; + int framerate_numerator_; + int framerate_denominator_; + int64_t starting_pts_; +}; + +class TimestampTest + : public ::libaom_test::EncoderTest, + public ::libaom_test::CodecTestWithParam<libaom_test::TestMode> { + protected: + TimestampTest() : EncoderTest(GET_PARAM(0)) {} + virtual ~TimestampTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(GET_PARAM(1)); + } +}; + +// Tests encoding in millisecond timebase. +TEST_P(TimestampTest, EncodeFrames) { + DummyTimebaseVideoSource video(1, 1000); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(TimestampTest, TestMicrosecondTimebase) { + // Set the timebase to microseconds. + DummyTimebaseVideoSource video(1, 1000000); + video.set_limit(1); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(TimestampTest, TestAv1Rollover) { + DummyTimebaseVideoSource video(1, 1000); + video.set_starting_pts(922337170351ll); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +AV1_INSTANTIATE_TEST_CASE(TimestampTest, + ::testing::Values(::libaom_test::kTwoPassGood)); + +} // namespace diff --git a/libs/libaom/src/test/tools_common.sh b/libs/libaom/src/test/tools_common.sh new file mode 100644 index 000000000..c08710606 --- /dev/null +++ b/libs/libaom/src/test/tools_common.sh @@ -0,0 +1,477 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file contains shell code shared by test scripts for libaom tools. + +# Use $AOM_TEST_TOOLS_COMMON_SH as a pseudo include guard. +if [ -z "${AOM_TEST_TOOLS_COMMON_SH}" ]; then +AOM_TEST_TOOLS_COMMON_SH=included + +set -e +devnull='> /dev/null 2>&1' +AOM_TEST_PREFIX="" + +elog() { + echo "$@" 1>&2 +} + +vlog() { + if [ "${AOM_TEST_VERBOSE_OUTPUT}" = "yes" ]; then + echo "$@" + fi +} + +# Sets $AOM_TOOL_TEST to the name specified by positional parameter one. +test_begin() { + AOM_TOOL_TEST="${1}" +} + +# Clears the AOM_TOOL_TEST variable after confirming that $AOM_TOOL_TEST matches +# positional parameter one. +test_end() { + if [ "$1" != "${AOM_TOOL_TEST}" ]; then + echo "FAIL completed test mismatch!"
+ echo " completed test: ${1}" + echo " active test: ${AOM_TOOL_TEST}." + return 1 + fi + AOM_TOOL_TEST='' +} + +# Echoes the target configuration being tested. +test_configuration_target() { + aom_config_c="${LIBAOM_CONFIG_PATH}/config/aom_config.c" + # Clean up the cfg pointer line from aom_config.c for easier re-use by + # someone examining a failure in the example tests. + # 1. Run grep on aom_config.c for cfg and limit the results to 1. + # 2. Split the line using ' = ' as separator. + # 3. Abuse sed to consume the leading " and trailing "; from the assignment + # to the cfg pointer. + cmake_config=$(awk -F ' = ' '/cfg/ { print $NF; exit }' "${aom_config_c}" \ + | sed -e s/\"// -e s/\"\;//) + echo cmake generated via command: cmake path/to/aom ${cmake_config} +} + +# Trap function used for failure reports and tool output directory removal. +# When the contents of $AOM_TOOL_TEST do not match the string '', reports +# failure of test stored in $AOM_TOOL_TEST. +cleanup() { + if [ -n "${AOM_TOOL_TEST}" ] && [ "${AOM_TOOL_TEST}" != '' ]; then + echo "FAIL: $AOM_TOOL_TEST" + fi + if [ "${AOM_TEST_PRESERVE_OUTPUT}" = "yes" ]; then + return + fi + if [ -n "${AOM_TEST_OUTPUT_DIR}" ] && [ -d "${AOM_TEST_OUTPUT_DIR}" ]; then + rm -rf "${AOM_TEST_OUTPUT_DIR}" + fi +} + +# Echoes the version string assigned to the VERSION_STRING_NOSP variable defined +# in $LIBAOM_CONFIG_PATH/config/aom_version.h to stdout. +cmake_version() { + aom_version_h="${LIBAOM_CONFIG_PATH}/config/aom_version.h" + + # Find VERSION_STRING_NOSP line, split it with '"' and print the next to last + # field to output the version string to stdout. + aom_version=$(awk -F \" '/VERSION_STRING_NOSP/ {print $(NF-1)}' \ + "${aom_version_h}") + echo "v${aom_version}" +} + +# Echoes current git version as reported by running 'git describe', or the +# version used by the cmake build when git is unavailable. +source_version() { + if git --version > /dev/null 2>&1; then + (cd "$(dirname "${0}")" + git describe) + else + cmake_version + fi +} + +# Echoes warnings to stdout when source version and CMake build generated +# version are out of sync. +check_version_strings() { + cmake_version=$(cmake_version) + source_version=$(source_version) + + if [ "${cmake_version}" != "${source_version}" ]; then + echo "Warning: version has changed since last cmake run." + vlog " cmake version: ${cmake_version} version now: ${source_version}" + fi +} + +# $1 is the name of an environment variable containing a directory name to +# test. +test_env_var_dir() { + local dir=$(eval echo "\${$1}") + if [ ! -d "${dir}" ]; then + elog "'${dir}': No such directory" + elog "The $1 environment variable must be set to a valid directory." + return 1 + fi +} + +# This script requires that the LIBAOM_BIN_PATH, LIBAOM_CONFIG_PATH, and +# LIBAOM_TEST_DATA_PATH variables are in the environment: Confirm that +# the variables are set and that they all evaluate to directory paths. +verify_aom_test_environment() { + test_env_var_dir "LIBAOM_BIN_PATH" \ + && test_env_var_dir "LIBAOM_CONFIG_PATH" \ + && test_env_var_dir "LIBAOM_TEST_DATA_PATH" +} + +# Greps aom_config.h in LIBAOM_CONFIG_PATH for positional parameter one, which +# should be a LIBAOM preprocessor flag. Echoes yes to stdout when the feature +# is available. 
+aom_config_option_enabled() { + aom_config_option="${1}" + aom_config_file="${LIBAOM_CONFIG_PATH}/config/aom_config.h" + config_line=$(grep "${aom_config_option}" "${aom_config_file}") + if echo "${config_line}" | egrep -q '1$'; then + echo yes + fi +} + +# Echoes yes when output of test_configuration_target() contains win32 or win64. +is_windows_target() { + if test_configuration_target \ + | grep -q -e win32 -e win64 > /dev/null 2>&1; then + echo yes + fi +} + +# Echoes path to $1 when it's executable and exists in one of the directories +# included in $tool_paths, or an empty string. Caller is responsible for testing +# the string once the function returns. +aom_tool_path() { + local tool_name="$1" + local root_path="${LIBAOM_BIN_PATH}" + local suffix="${AOM_TEST_EXE_SUFFIX}" + local tool_paths="\ + ${root_path}/${tool_name}${suffix} \ + ${root_path}/../${tool_name}${suffix} \ + ${root_path}/tools/${tool_name}${suffix} \ + ${root_path}/../tools/${tool_name}${suffix}" + + local toolpath="" + + for tool_path in ${tool_paths}; do + if [ -x "${tool_path}" ] && [ -f "${tool_path}" ]; then + echo "${tool_path}" + return 0 + fi + done + + return 1 +} + +# Echoes yes to stdout when the file named by positional parameter one exists +# in LIBAOM_BIN_PATH, and is executable. +aom_tool_available() { + local tool_name="$1" + local tool="${LIBAOM_BIN_PATH}/${tool_name}${AOM_TEST_EXE_SUFFIX}" + [ -x "${tool}" ] && echo yes +} + +# Echoes yes to stdout when aom_config_option_enabled() reports yes for +# CONFIG_AV1_DECODER. +av1_decode_available() { + [ "$(aom_config_option_enabled CONFIG_AV1_DECODER)" = "yes" ] && echo yes +} + +# Echoes yes to stdout when aom_config_option_enabled() reports yes for +# CONFIG_AV1_ENCODER. +av1_encode_available() { + [ "$(aom_config_option_enabled CONFIG_AV1_ENCODER)" = "yes" ] && echo yes +} + +# Echoes "fast" encode params for use with aomenc. +aomenc_encode_test_fast_params() { + echo "--cpu-used=1 + --limit=${AV1_ENCODE_TEST_FRAME_LIMIT} + --lag-in-frames=0 + --test-decode=fatal" +} + +# Echoes yes to stdout when aom_config_option_enabled() reports yes for +# CONFIG_WEBM_IO. +webm_io_available() { + [ "$(aom_config_option_enabled CONFIG_WEBM_IO)" = "yes" ] && echo yes +} + +# Filters strings from $1 using the filter specified by $2. Filter behavior +# depends on the presence of $3. When $3 is present, strings that match the +# filter are excluded. When $3 is omitted, strings matching the filter are +# included. +# The filtered result is echoed to stdout. +filter_strings() { + strings=${1} + filter=${2} + exclude=${3} + + if [ -n "${exclude}" ]; then + # When positional parameter three exists the caller wants to remove strings. + # Tell grep to invert matches using the -v argument. + exclude='-v' + else + unset exclude + fi + + if [ -n "${filter}" ]; then + for s in ${strings}; do + if echo "${s}" | egrep -q ${exclude} "${filter}" > /dev/null 2>&1; then + filtered_strings="${filtered_strings} ${s}" + fi + done + else + filtered_strings="${strings}" + fi + echo "${filtered_strings}" +} + +# Runs user test functions passed via positional parameters one and two. +# Functions in positional parameter one are treated as environment verification +# functions and are run unconditionally. Functions in positional parameter two +# are run according to the rules specified in aom_test_usage(). 
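+# A minimal caller, modeled on the example test scripts that source this file +# (the my_* names are illustrative only): +# my_test_verify_environment() { [ -e "${YUV_RAW_INPUT}" ] || return 1; } +# my_test() { vlog "hello from my_test"; } +# run_tests my_test_verify_environment "my_test"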
+run_tests() { + local env_tests="verify_aom_test_environment $1" + local tests_to_filter="$2" + local test_name="${AOM_TEST_NAME}" + + if [ -z "${test_name}" ]; then + test_name="$(basename "${0%.*}")" + fi + + if [ "${AOM_TEST_RUN_DISABLED_TESTS}" != "yes" ]; then + # Filter out DISABLED tests. + tests_to_filter=$(filter_strings "${tests_to_filter}" ^DISABLED exclude) + fi + + if [ -n "${AOM_TEST_FILTER}" ]; then + # Remove tests not matching the user's filter. + tests_to_filter=$(filter_strings "${tests_to_filter}" ${AOM_TEST_FILTER}) + fi + + # User requested test listing: Dump test names and return. + if [ "${AOM_TEST_LIST_TESTS}" = "yes" ]; then + for test_name in $tests_to_filter; do + echo ${test_name} + done + return + fi + + # Don't bother with the environment tests if everything else was disabled. + [ -z "${tests_to_filter}" ] && return + + # Combine environment and actual tests. + local tests_to_run="${env_tests} ${tests_to_filter}" + + check_version_strings + + # Run tests. + for test in ${tests_to_run}; do + test_begin "${test}" + vlog " RUN ${test}" + "${test}" + vlog " PASS ${test}" + test_end "${test}" + done + + local tested_config="$(test_configuration_target) @ $(source_version)" + echo "${test_name}: Done, all tests pass for ${tested_config}." +} + +aom_test_usage() { +cat << EOF + Usage: ${0##*/} [arguments] + --bin-path <path to libaom binaries directory> + --config-path <path to libaom config directory> + --filter <filter>: User test filter. Only tests matching filter are run. + --run-disabled-tests: Run disabled tests. + --help: Display this message and exit. + --test-data-path <path to libaom test data directory> + --show-program-output: Shows output from all programs being tested. + --prefix: Allows for a user specified prefix to be inserted before all test + programs. Grants the ability, for example, to run test programs + within valgrind. + --list-tests: List all test names and exit without actually running tests. + --verbose: Verbose output. + + When the --bin-path option is not specified the script attempts to use + \$LIBAOM_BIN_PATH and then the current directory. + + When the --config-path option is not specified the script attempts to use + \$LIBAOM_CONFIG_PATH and then the current directory. + + When the --test-data-path option is not specified the script attempts to use + \$LIBAOM_TEST_DATA_PATH and then the current directory. +EOF +} + +# Returns non-zero (failure) when required environment variables are empty +# strings. +aom_test_check_environment() { + if [ -z "${LIBAOM_BIN_PATH}" ] || \ + [ -z "${LIBAOM_CONFIG_PATH}" ] || \ + [ -z "${LIBAOM_TEST_DATA_PATH}" ]; then + return 1 + fi +} + +# Echoes aomenc command line parameters allowing use of a raw yuv file as +# input to aomenc. +yuv_raw_input() { + echo ""${YUV_RAW_INPUT}" + --width="${YUV_RAW_INPUT_WIDTH}" + --height="${YUV_RAW_INPUT_HEIGHT}"" +} + +# Do a small encode for testing decoders. +encode_yuv_raw_input_av1() { + if [ "$(av1_encode_available)" = "yes" ]; then + local output="$1" + local encoder="$(aom_tool_path aomenc)" + shift + eval "${encoder}" $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --output="${output}" \ + $@ \ + ${devnull} + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +# Parse the command line.
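+# (A typical invocation of a script built on this file might look like: +# LIBAOM_TEST_DATA_PATH=/path/to/testdata sh ./twopass_encoder.sh \ +# --bin-path /path/to/build --verbose +# with paths adjusted to the local checkout.)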
+while [ -n "$1" ]; do + case "$1" in + --bin-path) + LIBAOM_BIN_PATH="$2" + shift + ;; + --config-path) + LIBAOM_CONFIG_PATH="$2" + shift + ;; + --filter) + AOM_TEST_FILTER="$2" + shift + ;; + --run-disabled-tests) + AOM_TEST_RUN_DISABLED_TESTS=yes + ;; + --help) + aom_test_usage + exit + ;; + --test-data-path) + LIBAOM_TEST_DATA_PATH="$2" + shift + ;; + --prefix) + AOM_TEST_PREFIX="$2" + shift + ;; + --verbose) + AOM_TEST_VERBOSE_OUTPUT=yes + ;; + --show-program-output) + devnull= + ;; + --list-tests) + AOM_TEST_LIST_TESTS=yes + ;; + *) + aom_test_usage + exit 1 + ;; + esac + shift +done + +# Handle running the tests from a build directory without arguments when running +# the tests on *nix/macosx. +LIBAOM_BIN_PATH="${LIBAOM_BIN_PATH:-.}" +LIBAOM_CONFIG_PATH="${LIBAOM_CONFIG_PATH:-.}" +LIBAOM_TEST_DATA_PATH="${LIBAOM_TEST_DATA_PATH:-.}" + +# Create a temporary directory for output files, and a trap to clean it up. +if [ -n "${TMPDIR}" ]; then + AOM_TEST_TEMP_ROOT="${TMPDIR}" +elif [ -n "${TEMPDIR}" ]; then + AOM_TEST_TEMP_ROOT="${TEMPDIR}" +else + AOM_TEST_TEMP_ROOT=/tmp +fi + +AOM_TEST_OUTPUT_DIR="${AOM_TEST_OUTPUT_DIR:-${AOM_TEST_TEMP_ROOT}/aom_test_$$}" + +if ! mkdir -p "${AOM_TEST_OUTPUT_DIR}" || \ + [ ! -d "${AOM_TEST_OUTPUT_DIR}" ]; then + echo "${0##*/}: Cannot create output directory, giving up." + echo "${0##*/}: AOM_TEST_OUTPUT_DIR=${AOM_TEST_OUTPUT_DIR}" + exit 1 +fi + +AOM_TEST_PRESERVE_OUTPUT=${AOM_TEST_PRESERVE_OUTPUT:-no} + +if [ "$(is_windows_target)" = "yes" ]; then + AOM_TEST_EXE_SUFFIX=".exe" +fi + +# Variables shared by tests. +AV1_ENCODE_CPU_USED=${AV1_ENCODE_CPU_USED:-1} +AV1_ENCODE_TEST_FRAME_LIMIT=${AV1_ENCODE_TEST_FRAME_LIMIT:-5} +AV1_IVF_FILE="${AV1_IVF_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.ivf}" +AV1_OBU_ANNEXB_FILE="${AV1_OBU_ANNEXB_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.annexb.obu}" +AV1_OBU_SEC5_FILE="${AV1_OBU_SEC5_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.section5.obu}" +AV1_WEBM_FILE="${AV1_WEBM_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.webm}" + +YUV_RAW_INPUT="${LIBAOM_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" +YUV_RAW_INPUT_WIDTH=352 +YUV_RAW_INPUT_HEIGHT=288 + +Y4M_NOSQ_PAR_INPUT="${LIBAOM_TEST_DATA_PATH}/park_joy_90p_8_420_a10-1.y4m" +Y4M_720P_INPUT="${LIBAOM_TEST_DATA_PATH}/niklas_1280_720_30.y4m" + +# Setup a trap function to clean up after tests complete. +trap cleanup EXIT + +vlog "$(basename "${0%.*}") test configuration: + LIBAOM_BIN_PATH=${LIBAOM_BIN_PATH} + LIBAOM_CONFIG_PATH=${LIBAOM_CONFIG_PATH} + LIBAOM_TEST_DATA_PATH=${LIBAOM_TEST_DATA_PATH} + AOM_TEST_EXE_SUFFIX=${AOM_TEST_EXE_SUFFIX} + AOM_TEST_FILTER=${AOM_TEST_FILTER} + AOM_TEST_LIST_TESTS=${AOM_TEST_LIST_TESTS} + AOM_TEST_OUTPUT_DIR=${AOM_TEST_OUTPUT_DIR} + AOM_TEST_PREFIX=${AOM_TEST_PREFIX} + AOM_TEST_PRESERVE_OUTPUT=${AOM_TEST_PRESERVE_OUTPUT} + AOM_TEST_RUN_DISABLED_TESTS=${AOM_TEST_RUN_DISABLED_TESTS} + AOM_TEST_SHOW_PROGRAM_OUTPUT=${AOM_TEST_SHOW_PROGRAM_OUTPUT} + AOM_TEST_TEMP_ROOT=${AOM_TEST_TEMP_ROOT} + AOM_TEST_VERBOSE_OUTPUT=${AOM_TEST_VERBOSE_OUTPUT} + AV1_ENCODE_CPU_USED=${AV1_ENCODE_CPU_USED} + AV1_ENCODE_TEST_FRAME_LIMIT=${AV1_ENCODE_TEST_FRAME_LIMIT} + AV1_IVF_FILE=${AV1_IVF_FILE} + AV1_OBU_ANNEXB_FILE=${AV1_OBU_ANNEXB_FILE} + AV1_OBU_SEC5_FILE=${AV1_OBU_SEC5_FILE} + AV1_WEBM_FILE=${AV1_WEBM_FILE} + YUV_RAW_INPUT=${YUV_RAW_INPUT} + YUV_RAW_INPUT_WIDTH=${YUV_RAW_INPUT_WIDTH} + YUV_RAW_INPUT_HEIGHT=${YUV_RAW_INPUT_HEIGHT} + Y4M_NOSQ_PAR_INPUT=${Y4M_NOSQ_PAR_INPUT}" + +fi # End $AOM_TEST_TOOLS_COMMON_SH pseudo include guard. 
diff --git a/libs/libaom/src/test/transform_test_base.h b/libs/libaom/src/test/transform_test_base.h new file mode 100644 index 000000000..68f5cc74d --- /dev/null +++ b/libs/libaom/src/test/transform_test_base.h @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_TEST_TRANSFORM_TEST_BASE_H_ +#define AOM_TEST_TRANSFORM_TEST_BASE_H_ + +#include "config/aom_config.h" + +#include "aom_mem/aom_mem.h" +#include "aom/aom_codec.h" +#include "aom_dsp/txfm_common.h" + +namespace libaom_test { + +// Note: +// The same constant is defined in av1/common/av1_entropy.h and +// av1/common/entropy.h. The goal is to keep this base class usable for +// transform testing in future codecs, but including either of those headers +// would cause a compile error when unit testing another codec, so the value +// is duplicated here. It should eventually move to a shared aom header file. +const int kDctMaxValue = 16384; + +template <typename OutputType> +using FhtFunc = void (*)(const int16_t *in, OutputType *out, int stride, + TxfmParam *txfm_param); + +template <typename OutputType> +using IhtFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride, + const TxfmParam *txfm_param); + +template <typename OutType> +class TransformTestBase { + public: + virtual ~TransformTestBase() {} + + protected: + virtual void RunFwdTxfm(const int16_t *in, OutType *out, int stride) = 0; + + virtual void RunInvTxfm(const OutType *out, uint8_t *dst, int stride) = 0; + + void RunAccuracyCheck(uint32_t ref_max_error, double ref_avg_error) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + uint32_t max_error = 0; + int64_t total_error = 0; + const int count_test_block = 10000; + + int16_t *test_input_block = reinterpret_cast<int16_t *>( + aom_memalign(16, sizeof(int16_t) * num_coeffs_)); + OutType *test_temp_block = reinterpret_cast<OutType *>( + aom_memalign(16, sizeof(test_temp_block[0]) * num_coeffs_)); + uint8_t *dst = reinterpret_cast<uint8_t *>( + aom_memalign(16, sizeof(uint8_t) * num_coeffs_)); + uint8_t *src = reinterpret_cast<uint8_t *>( + aom_memalign(16, sizeof(uint8_t) * num_coeffs_)); + uint16_t *dst16 = reinterpret_cast<uint16_t *>( + aom_memalign(16, sizeof(uint16_t) * num_coeffs_)); + uint16_t *src16 = reinterpret_cast<uint16_t *>( + aom_memalign(16, sizeof(uint16_t) * num_coeffs_)); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < num_coeffs_; ++j) { + if (bit_depth_ == AOM_BITS_8) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + test_input_block[j] = src[j] - dst[j]; + } else { + src16[j] = rnd.Rand16() & mask_; + dst16[j] = rnd.Rand16() & mask_; + test_input_block[j] = src16[j] - dst16[j]; + } + } + + ASM_REGISTER_STATE_CHECK( + RunFwdTxfm(test_input_block, test_temp_block, pitch_)); + if (bit_depth_ == AOM_BITS_8) { + ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_)); + } else { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + } + + for (int j = 0; j < num_coeffs_; ++j) { + const int diff = + bit_depth_ == AOM_BITS_8 ?
dst[j] - src[j] : dst16[j] - src16[j]; + const uint32_t error = diff * diff; + if (max_error < error) max_error = error; + total_error += error; + } + } + + double avg_error = total_error * 1. / count_test_block / num_coeffs_; + + EXPECT_GE(ref_max_error, max_error) + << "Error: FHT/IHT has an individual round trip error > " + << ref_max_error; + + EXPECT_GE(ref_avg_error, avg_error) + << "Error: FHT/IHT has average round trip error > " << ref_avg_error + << " per block"; + + aom_free(test_input_block); + aom_free(test_temp_block); + aom_free(dst); + aom_free(src); + aom_free(dst16); + aom_free(src16); + } + + void RunCoeffCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + + // Use a stride value which is not the width of any transform, to catch + // cases where the transforms use the stride incorrectly. + int stride = 96; + + int16_t *input_block = reinterpret_cast( + aom_memalign(16, sizeof(int16_t) * stride * height_)); + OutType *output_ref_block = reinterpret_cast( + aom_memalign(16, sizeof(output_ref_block[0]) * num_coeffs_)); + OutType *output_block = reinterpret_cast( + aom_memalign(16, sizeof(output_block[0]) * num_coeffs_)); + + for (int i = 0; i < count_test_block; ++i) { + int j, k; + for (j = 0; j < height_; ++j) { + for (k = 0; k < pitch_; ++k) { + int in_idx = j * stride + k; + int out_idx = j * pitch_ + k; + input_block[in_idx] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); + if (bit_depth_ == AOM_BITS_8) { + output_block[out_idx] = output_ref_block[out_idx] = rnd.Rand8(); + } else { + output_block[out_idx] = output_ref_block[out_idx] = + rnd.Rand16() & mask_; + } + } + } + + fwd_txfm_ref(input_block, output_ref_block, stride, &txfm_param_); + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, stride)); + + // The minimum quant value is 4. + for (j = 0; j < height_; ++j) { + for (k = 0; k < pitch_; ++k) { + int out_idx = j * pitch_ + k; + ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx]) + << "Error: not bit-exact result at index: " << out_idx + << " at test block: " << i; + } + } + } + aom_free(input_block); + aom_free(output_ref_block); + aom_free(output_block); + } + + void RunInvCoeffCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + + // Use a stride value which is not the width of any transform, to catch + // cases where the transforms use the stride incorrectly. + int stride = 96; + + int16_t *input_block = reinterpret_cast( + aom_memalign(16, sizeof(int16_t) * num_coeffs_)); + OutType *trans_block = reinterpret_cast( + aom_memalign(16, sizeof(trans_block[0]) * num_coeffs_)); + uint8_t *output_block = reinterpret_cast( + aom_memalign(16, sizeof(uint8_t) * stride * height_)); + uint8_t *output_ref_block = reinterpret_cast( + aom_memalign(16, sizeof(uint8_t) * stride * height_)); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-mask_, mask_]. 
+ int j, k; + for (j = 0; j < height_; ++j) { + for (k = 0; k < pitch_; ++k) { + int in_idx = j * pitch_ + k; + int out_idx = j * stride + k; + input_block[in_idx] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); + output_ref_block[out_idx] = rnd.Rand16() & mask_; + output_block[out_idx] = output_ref_block[out_idx]; + } + } + + fwd_txfm_ref(input_block, trans_block, pitch_, &txfm_param_); + + inv_txfm_ref(trans_block, output_ref_block, stride, &txfm_param_); + ASM_REGISTER_STATE_CHECK(RunInvTxfm(trans_block, output_block, stride)); + + for (j = 0; j < height_; ++j) { + for (k = 0; k < pitch_; ++k) { + int out_idx = j * stride + k; + ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx]) + << "Error: not bit-exact result at index: " << out_idx + << " j = " << j << " k = " << k << " at test block: " << i; + } + } + } + aom_free(input_block); + aom_free(trans_block); + aom_free(output_ref_block); + aom_free(output_block); + } + + void RunMemCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + + int16_t *input_extreme_block = reinterpret_cast( + aom_memalign(16, sizeof(int16_t) * num_coeffs_)); + OutType *output_ref_block = reinterpret_cast( + aom_memalign(16, sizeof(output_ref_block[0]) * num_coeffs_)); + OutType *output_block = reinterpret_cast( + aom_memalign(16, sizeof(output_block[0]) * num_coeffs_)); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-mask_, mask_]. + for (int j = 0; j < num_coeffs_; ++j) { + input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_; + } + if (i == 0) { + for (int j = 0; j < num_coeffs_; ++j) input_extreme_block[j] = mask_; + } else if (i == 1) { + for (int j = 0; j < num_coeffs_; ++j) input_extreme_block[j] = -mask_; + } + + fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, &txfm_param_); + ASM_REGISTER_STATE_CHECK( + RunFwdTxfm(input_extreme_block, output_block, pitch_)); + + int row_length = FindRowLength(); + // The minimum quant value is 4. + for (int j = 0; j < num_coeffs_; ++j) { + ASSERT_EQ(output_block[j], output_ref_block[j]) + << "Not bit-exact at test index: " << i << ", " + << "j = " << j << std::endl; + EXPECT_GE(row_length * kDctMaxValue << (bit_depth_ - 8), + abs(output_block[j])) + << "Error: NxN FDCT has coefficient larger than N*DCT_MAX_VALUE"; + } + } + aom_free(input_extreme_block); + aom_free(output_ref_block); + aom_free(output_block); + } + + void RunInvAccuracyCheck(int limit) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + + int16_t *in = reinterpret_cast( + aom_memalign(16, sizeof(int16_t) * num_coeffs_)); + OutType *coeff = reinterpret_cast( + aom_memalign(16, sizeof(coeff[0]) * num_coeffs_)); + uint8_t *dst = reinterpret_cast( + aom_memalign(16, sizeof(uint8_t) * num_coeffs_)); + uint8_t *src = reinterpret_cast( + aom_memalign(16, sizeof(uint8_t) * num_coeffs_)); + + uint16_t *dst16 = reinterpret_cast( + aom_memalign(16, sizeof(uint16_t) * num_coeffs_)); + uint16_t *src16 = reinterpret_cast( + aom_memalign(16, sizeof(uint16_t) * num_coeffs_)); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-mask_, mask_]. 
+ for (int j = 0; j < num_coeffs_; ++j) { + if (bit_depth_ == AOM_BITS_8) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + in[j] = src[j] - dst[j]; + } else { + src16[j] = rnd.Rand16() & mask_; + dst16[j] = rnd.Rand16() & mask_; + in[j] = src16[j] - dst16[j]; + } + } + + fwd_txfm_ref(in, coeff, pitch_, &txfm_param_); + + if (bit_depth_ == AOM_BITS_8) { + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); + } else { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); + } + + for (int j = 0; j < num_coeffs_; ++j) { + const int diff = + bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; + const uint32_t error = diff * diff; + ASSERT_GE(static_cast(limit), error) + << "Error: 4x4 IDCT has error " << error << " at index " << j; + } + } + aom_free(in); + aom_free(coeff); + aom_free(dst); + aom_free(src); + aom_free(src16); + aom_free(dst16); + } + + int pitch_; + int height_; + FhtFunc fwd_txfm_ref; + IhtFunc inv_txfm_ref; + aom_bit_depth_t bit_depth_; + int mask_; + int num_coeffs_; + TxfmParam txfm_param_; + + private: + // Assume transform size is 4x4, 8x8, 16x16,... + int FindRowLength() const { + int row = 4; + if (16 == num_coeffs_) { + row = 4; + } else if (64 == num_coeffs_) { + row = 8; + } else if (256 == num_coeffs_) { + row = 16; + } else if (1024 == num_coeffs_) { + row = 32; + } + return row; + } +}; + +} // namespace libaom_test + +#endif // AOM_TEST_TRANSFORM_TEST_BASE_H_ diff --git a/libs/libaom/src/test/twopass_encoder.sh b/libs/libaom/src/test/twopass_encoder.sh new file mode 100644 index 000000000..cca44ced8 --- /dev/null +++ b/libs/libaom/src/test/twopass_encoder.sh @@ -0,0 +1,54 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the libaom twopass_encoder example. To add new tests to this +## file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to twopass_encoder_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: $YUV_RAW_INPUT is required. +twopass_encoder_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH." + return 1 + fi +} + +# Runs twopass_encoder using the codec specified by $1 with a frame limit of +# 100. +twopass_encoder() { + local encoder="$(aom_tool_path twopass_encoder)" + local codec="$1" + local output_file="${AOM_TEST_OUTPUT_DIR}/twopass_encoder_${codec}.ivf" + local limit=7 + + if [ ! -x "${encoder}" ]; then + elog "${encoder} does not exist or is not executable." 
+ return 1 + fi + + eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" "${limit}" \ + ${devnull} + + [ -e "${output_file}" ] || return 1 +} + +twopass_encoder_av1() { + if [ "$(av1_encode_available)" = "yes" ]; then + twopass_encoder av1 || return 1 + fi +} + +twopass_encoder_tests="twopass_encoder_av1" + +run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}" diff --git a/libs/libaom/src/test/util.h b/libs/libaom/src/test/util.h new file mode 100644 index 000000000..aa4b106e4 --- /dev/null +++ b/libs/libaom/src/test/util.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_TEST_UTIL_H_ +#define AOM_TEST_UTIL_H_ + +#include <stdio.h> +#include <math.h> +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "aom/aom_integer.h" +#include "aom/aom_image.h" +#include "aom_ports/aom_timer.h" + +// Macros +#define GET_PARAM(k) std::get<k>(GetParam()) + +inline double compute_psnr(const aom_image_t *img1, const aom_image_t *img2) { + assert((img1->fmt == img2->fmt) && (img1->d_w == img2->d_w) && + (img1->d_h == img2->d_h)); + + const unsigned int width_y = img1->d_w; + const unsigned int height_y = img1->d_h; + unsigned int i, j; + + int64_t sqrerr = 0; + for (i = 0; i < height_y; ++i) + for (j = 0; j < width_y; ++j) { + int64_t d = img1->planes[AOM_PLANE_Y][i * img1->stride[AOM_PLANE_Y] + j] - + img2->planes[AOM_PLANE_Y][i * img2->stride[AOM_PLANE_Y] + j]; + sqrerr += d * d; + } + double mse = static_cast<double>(sqrerr) / (width_y * height_y); + double psnr = 100.0; + if (mse > 0.0) { + psnr = 10 * log10(255.0 * 255.0 / mse); + } + return psnr; +} + +static INLINE double get_time_mark(aom_usec_timer *t) { + aom_usec_timer_mark(t); + return static_cast<double>(aom_usec_timer_elapsed(t)); +} + +#endif // AOM_TEST_UTIL_H_ diff --git a/libs/libaom/src/test/variance_test.cc b/libs/libaom/src/test/variance_test.cc new file mode 100644 index 000000000..1458ece28 --- /dev/null +++ b/libs/libaom/src/test/variance_test.cc @@ -0,0 +1,2410 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "aom/aom_codec.h" +#include "aom/aom_integer.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" + +namespace { + +typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse); +typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + const uint8_t *b, int b_stride, + unsigned int *sse); +typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + const uint8_t *b, int b_stride, + uint32_t *sse, + const uint8_t *second_pred); +typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride); +typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src); +typedef unsigned int (*DistWtdSubpixAvgVarMxNFunc)( + const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, + int b_stride, uint32_t *sse, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param); +typedef uint32_t (*ObmcSubpelVarFunc)(const uint8_t *pre, int pre_stride, + int xoffset, int yoffset, + const int32_t *wsrc, const int32_t *mask, + unsigned int *sse); + +using libaom_test::ACMRandom; + +// Truncate high bit depth results by downshifting (with rounding) by: +// 2 * (bit_depth - 8) for sse +// (bit_depth - 8) for se +static void RoundHighBitDepth(int bit_depth, int64_t *se, uint64_t *sse) { + switch (bit_depth) { + case AOM_BITS_12: + *sse = (*sse + 128) >> 8; + *se = (*se + 8) >> 4; + break; + case AOM_BITS_10: + *sse = (*sse + 8) >> 4; + *se = (*se + 2) >> 2; + break; + case AOM_BITS_8: + default: break; + } +} + +static unsigned int mb_ss_ref(const int16_t *src) { + unsigned int res = 0; + for (int i = 0; i < 256; ++i) { + res += src[i] * src[i]; + } + return res; +} + +/* Note: + * Our codebase calculates the "diff" value in the variance algorithm by + * (src - ref). + */ +static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w, + int l2h, int src_stride, int ref_stride, + uint32_t *sse_ptr, bool use_high_bit_depth_, + aom_bit_depth_t bit_depth) { + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + int diff; + if (!use_high_bit_depth_) { + diff = src[y * src_stride + x] - ref[y * ref_stride + x]; + se += diff; + sse += diff * diff; + } else { + diff = CONVERT_TO_SHORTPTR(src)[y * src_stride + x] - + CONVERT_TO_SHORTPTR(ref)[y * ref_stride + x]; + se += diff; + sse += diff * diff; + } + } + } + RoundHighBitDepth(bit_depth, &se, &sse); + *sse_ptr = static_cast(sse); + return static_cast(sse - ((se * se) >> (l2w + l2h))); +} + +/* The subpel reference functions differ from the codec version in one aspect: + * they calculate the bilinear factors directly instead of using a lookup table + * and therefore upshift xoff and yoff by 1. Only every other calculated value + * is used so the codec version shrinks the table to save space. 
+ */ +static uint32_t subpel_variance_ref(const uint8_t *ref, const uint8_t *src, + int l2w, int l2h, int xoff, int yoff, + uint32_t *sse_ptr, bool use_high_bit_depth_, + aom_bit_depth_t bit_depth) { + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + + xoff <<= 1; + yoff <<= 1; + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // Bilinear interpolation at a 16th pel step. + if (!use_high_bit_depth_) { + const int a1 = ref[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = r - src[w * y + x]; + se += diff; + sse += diff * diff; + } else { + uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref); + uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + const int a1 = ref16[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref16[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref16[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref16[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = r - src16[w * y + x]; + se += diff; + sse += diff * diff; + } + } + } + RoundHighBitDepth(bit_depth, &se, &sse); + *sse_ptr = static_cast(sse); + return static_cast(sse - ((se * se) >> (l2w + l2h))); +} + +static uint32_t subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src, + const uint8_t *second_pred, int l2w, + int l2h, int xoff, int yoff, + uint32_t *sse_ptr, + bool use_high_bit_depth, + aom_bit_depth_t bit_depth) { + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + + xoff <<= 1; + yoff <<= 1; + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // bilinear interpolation at a 16th pel step + if (!use_high_bit_depth) { + const int a1 = ref[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = + ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x]; + se += diff; + sse += diff * diff; + } else { + const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref); + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + const uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred); + const int a1 = ref16[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref16[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref16[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref16[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x]; + se += diff; + sse += diff * diff; + } + } + } + RoundHighBitDepth(bit_depth, &se, &sse); + *sse_ptr = static_cast(sse); + return static_cast(sse - ((se * se) >> (l2w + l2h))); +} + +static uint32_t dist_wtd_subpel_avg_variance_ref( + const uint8_t *ref, const uint8_t *src, const uint8_t *second_pred, int l2w, + int l2h, int xoff, int yoff, uint32_t *sse_ptr, bool use_high_bit_depth, + aom_bit_depth_t bit_depth, 
DIST_WTD_COMP_PARAMS *jcp_param) { + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + + xoff <<= 1; + yoff <<= 1; + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // bilinear interpolation at a 16th pel step + if (!use_high_bit_depth) { + const int a1 = ref[(w + 0) * (y + 0) + x + 0]; + const int a2 = ref[(w + 0) * (y + 0) + x + 1]; + const int b1 = ref[(w + 0) * (y + 1) + x + 0]; + const int b2 = ref[(w + 0) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int avg = ROUND_POWER_OF_TWO( + r * jcp_param->fwd_offset + + second_pred[w * y + x] * jcp_param->bck_offset, + DIST_PRECISION_BITS); + const int diff = avg - src[w * y + x]; + + se += diff; + sse += diff * diff; + } else { + const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref); + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + const uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred); + const int a1 = ref16[(w + 0) * (y + 0) + x + 0]; + const int a2 = ref16[(w + 0) * (y + 0) + x + 1]; + const int b1 = ref16[(w + 0) * (y + 1) + x + 0]; + const int b2 = ref16[(w + 0) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int avg = + ROUND_POWER_OF_TWO(r * jcp_param->fwd_offset + + sec16[w * y + x] * jcp_param->bck_offset, + DIST_PRECISION_BITS); + const int diff = avg - src16[w * y + x]; + + se += diff; + sse += diff * diff; + } + } + } + RoundHighBitDepth(bit_depth, &se, &sse); + *sse_ptr = static_cast(sse); + return static_cast(sse - ((se * se) >> (l2w + l2h))); +} + +static uint32_t obmc_subpel_variance_ref(const uint8_t *pre, int l2w, int l2h, + int xoff, int yoff, + const int32_t *wsrc, + const int32_t *mask, uint32_t *sse_ptr, + bool use_high_bit_depth_, + aom_bit_depth_t bit_depth) { + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + + xoff <<= 1; + yoff <<= 1; + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // Bilinear interpolation at a 16th pel step. 
+ if (!use_high_bit_depth_) { + const int a1 = pre[(w + 1) * (y + 0) + x + 0]; + const int a2 = pre[(w + 1) * (y + 0) + x + 1]; + const int b1 = pre[(w + 1) * (y + 1) + x + 0]; + const int b2 = pre[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = ROUND_POWER_OF_TWO_SIGNED( + wsrc[w * y + x] - r * mask[w * y + x], 12); + se += diff; + sse += diff * diff; + } else { + uint16_t *pre16 = CONVERT_TO_SHORTPTR(pre); + const int a1 = pre16[(w + 1) * (y + 0) + x + 0]; + const int a2 = pre16[(w + 1) * (y + 0) + x + 1]; + const int b1 = pre16[(w + 1) * (y + 1) + x + 0]; + const int b2 = pre16[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = ROUND_POWER_OF_TWO_SIGNED( + wsrc[w * y + x] - r * mask[w * y + x], 12); + se += diff; + sse += diff * diff; + } + } + } + RoundHighBitDepth(bit_depth, &se, &sse); + *sse_ptr = static_cast(sse); + return static_cast(sse - ((se * se) >> (l2w + l2h))); +} + +//////////////////////////////////////////////////////////////////////////////// + +class SumOfSquaresTest : public ::testing::TestWithParam { + public: + SumOfSquaresTest() : func_(GetParam()) {} + + virtual ~SumOfSquaresTest() { libaom_test::ClearSystemState(); } + + protected: + void ConstTest(); + void RefTest(); + + SumOfSquaresFunction func_; + ACMRandom rnd_; +}; + +void SumOfSquaresTest::ConstTest() { + int16_t mem[256]; + unsigned int res; + for (int v = 0; v < 256; ++v) { + for (int i = 0; i < 256; ++i) { + mem[i] = v; + } + ASM_REGISTER_STATE_CHECK(res = func_(mem)); + EXPECT_EQ(256u * (v * v), res); + } +} + +void SumOfSquaresTest::RefTest() { + int16_t mem[256]; + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 256; ++j) { + mem[j] = rnd_.Rand8() - rnd_.Rand8(); + } + + const unsigned int expected = mb_ss_ref(mem); + unsigned int res; + ASM_REGISTER_STATE_CHECK(res = func_(mem)); + EXPECT_EQ(expected, res); + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Encapsulating struct to store the function to test along with +// some testing context. +// Can be used for MSE, SSE, Variance, etc. + +template +struct TestParams { + TestParams(int log2w = 0, int log2h = 0, Func function = NULL, + int bit_depth_value = 0) + : log2width(log2w), log2height(log2h), func(function) { + use_high_bit_depth = (bit_depth_value > 0); + if (use_high_bit_depth) { + bit_depth = static_cast(bit_depth_value); + } else { + bit_depth = AOM_BITS_8; + } + width = 1 << log2width; + height = 1 << log2height; + block_size = width * height; + mask = (1u << bit_depth) - 1; + } + + int log2width, log2height; + int width, height; + int block_size; + Func func; + aom_bit_depth_t bit_depth; + bool use_high_bit_depth; + uint32_t mask; +}; + +template +std::ostream &operator<<(std::ostream &os, const TestParams &p) { + return os << "width/height:" << p.width << "/" << p.height + << " function:" << reinterpret_cast(p.func) + << " bit-depth:" << p.bit_depth; +} + +// Main class for testing a function type +template +class MainTestClass + : public ::testing::TestWithParam > { + public: + virtual void SetUp() { + params_ = this->GetParam(); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + const size_t unit = + use_high_bit_depth() ? 
sizeof(uint16_t) : sizeof(uint8_t); + src_ = reinterpret_cast(aom_memalign(16, block_size() * unit)); + ref_ = new uint8_t[block_size() * unit]; + ASSERT_TRUE(src_ != NULL); + ASSERT_TRUE(ref_ != NULL); + if (use_high_bit_depth()) { + // TODO(skal): remove! + src_ = CONVERT_TO_BYTEPTR(src_); + ref_ = CONVERT_TO_BYTEPTR(ref_); + } + } + + virtual void TearDown() { + if (use_high_bit_depth()) { + // TODO(skal): remove! + src_ = reinterpret_cast(CONVERT_TO_SHORTPTR(src_)); + ref_ = reinterpret_cast(CONVERT_TO_SHORTPTR(ref_)); + } + + aom_free(src_); + delete[] ref_; + src_ = NULL; + ref_ = NULL; + libaom_test::ClearSystemState(); + } + + protected: + // We could sub-class MainTestClass into dedicated class for Variance + // and MSE/SSE, but it involves a lot of 'this->xxx' dereferencing + // to access top class fields xxx. That's cumbersome, so for now we'll just + // implement the testing methods here: + + // Variance tests + void ZeroTest(); + void RefTest(); + void RefStrideTest(); + void OneQuarterTest(); + void SpeedTest(); + + // MSE/SSE tests + void RefTestMse(); + void RefTestSse(); + void MaxTestMse(); + void MaxTestSse(); + + protected: + ACMRandom rnd_; + uint8_t *src_; + uint8_t *ref_; + TestParams params_; + + // some relay helpers + bool use_high_bit_depth() const { return params_.use_high_bit_depth; } + int byte_shift() const { return params_.bit_depth - 8; } + int block_size() const { return params_.block_size; } + int width() const { return params_.width; } + int height() const { return params_.height; } + uint32_t mask() const { return params_.mask; } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Tests related to variance. + +template +void MainTestClass::ZeroTest() { + for (int i = 0; i <= 255; ++i) { + if (!use_high_bit_depth()) { + memset(src_, i, block_size()); + } else { + uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_); + for (int k = 0; k < block_size(); ++k) src16[k] = i << byte_shift(); + } + for (int j = 0; j <= 255; ++j) { + if (!use_high_bit_depth()) { + memset(ref_, j, block_size()); + } else { + uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_); + for (int k = 0; k < block_size(); ++k) ref16[k] = j << byte_shift(); + } + unsigned int sse, var; + ASM_REGISTER_STATE_CHECK( + var = params_.func(src_, width(), ref_, width(), &sse)); + EXPECT_EQ(0u, var) << "src values: " << i << " ref values: " << j; + } + } +} + +template +void MainTestClass::RefTest() { + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < block_size(); j++) { + if (!use_high_bit_depth()) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); + } else { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); + } + } + unsigned int sse1, sse2, var1, var2; + const int stride = width(); + ASM_REGISTER_STATE_CHECK( + var1 = params_.func(src_, stride, ref_, stride, &sse1)); + var2 = + variance_ref(src_, ref_, params_.log2width, params_.log2height, stride, + stride, &sse2, use_high_bit_depth(), params_.bit_depth); + EXPECT_EQ(sse1, sse2) << "Error at test index: " << i; + EXPECT_EQ(var1, var2) << "Error at test index: " << i; + } +} + +template +void MainTestClass::RefStrideTest() { + for (int i = 0; i < 10; ++i) { + const int ref_stride = (i & 1) * width(); + const int src_stride = ((i >> 1) & 1) * width(); + for (int j = 0; j < block_size(); j++) { + const int ref_ind = (j / width()) * ref_stride + j % width(); + const int src_ind = (j / width()) * src_stride + j % width(); + if 
(!use_high_bit_depth()) { + src_[src_ind] = rnd_.Rand8(); + ref_[ref_ind] = rnd_.Rand8(); + } else { + CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask(); + } + } + unsigned int sse1, sse2; + unsigned int var1, var2; + + ASM_REGISTER_STATE_CHECK( + var1 = params_.func(src_, src_stride, ref_, ref_stride, &sse1)); + var2 = variance_ref(src_, ref_, params_.log2width, params_.log2height, + src_stride, ref_stride, &sse2, use_high_bit_depth(), + params_.bit_depth); + EXPECT_EQ(sse1, sse2) << "Error at test index: " << i; + EXPECT_EQ(var1, var2) << "Error at test index: " << i; + } +} + +template +void MainTestClass::OneQuarterTest() { + const int half = block_size() / 2; + if (!use_high_bit_depth()) { + memset(src_, 255, block_size()); + memset(ref_, 255, half); + memset(ref_ + half, 0, half); + } else { + aom_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size()); + aom_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << byte_shift(), half); + aom_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half); + } + unsigned int sse, var, expected; + ASM_REGISTER_STATE_CHECK( + var = params_.func(src_, width(), ref_, width(), &sse)); + expected = block_size() * 255 * 255 / 4; + EXPECT_EQ(expected, var); +} + +template +void MainTestClass::SpeedTest() { + for (int j = 0; j < block_size(); j++) { + if (!use_high_bit_depth()) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); + } else { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); + } + } + unsigned int sse; + const int stride = width(); + int run_time = 1000000000 / block_size(); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_time; ++i) { + params_.func(src_, stride, ref_, stride, &sse); + } + + aom_usec_timer_mark(&timer); + const double elapsed_time = + static_cast(aom_usec_timer_elapsed(&timer)); + printf("Variance %dx%d : %7.2fns\n", width(), height(), elapsed_time); +} + +//////////////////////////////////////////////////////////////////////////////// +// Tests related to MSE / SSE. 
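+// (Reminder of the identity the reference code above relies on: over +// N = width * height pixels, variance = SSE - SE^2 / N, where SE is the sum of +// the signed src/ref differences and SSE the sum of their squares. The MSE and +// SSE tests below exercise only the SSE term, which is why variance_ref is +// called purely for its sse output.)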
+ +template +void MainTestClass::RefTestMse() { + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < block_size(); ++j) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); + } + unsigned int sse1, sse2; + const int stride = width(); + ASM_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1)); + variance_ref(src_, ref_, params_.log2width, params_.log2height, stride, + stride, &sse2, false, AOM_BITS_8); + EXPECT_EQ(sse1, sse2); + } +} + +template +void MainTestClass::RefTestSse() { + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < block_size(); ++j) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); + } + unsigned int sse2; + unsigned int var1; + const int stride = width(); + ASM_REGISTER_STATE_CHECK(var1 = params_.func(src_, stride, ref_, stride)); + variance_ref(src_, ref_, params_.log2width, params_.log2height, stride, + stride, &sse2, false, AOM_BITS_8); + EXPECT_EQ(var1, sse2); + } +} + +template +void MainTestClass::MaxTestMse() { + memset(src_, 255, block_size()); + memset(ref_, 0, block_size()); + unsigned int sse; + ASM_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse)); + const unsigned int expected = block_size() * 255 * 255; + EXPECT_EQ(expected, sse); +} + +template +void MainTestClass::MaxTestSse() { + memset(src_, 255, block_size()); + memset(ref_, 0, block_size()); + unsigned int var; + ASM_REGISTER_STATE_CHECK(var = params_.func(src_, width(), ref_, width())); + const unsigned int expected = block_size() * 255 * 255; + EXPECT_EQ(expected, var); +} + +//////////////////////////////////////////////////////////////////////////////// + +using std::get; +using std::make_tuple; +using std::tuple; + +template +class SubpelVarianceTest + : public ::testing::TestWithParam > { + public: + virtual void SetUp() { + params_ = this->GetParam(); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + if (!use_high_bit_depth()) { + src_ = reinterpret_cast(aom_memalign(32, block_size())); + sec_ = reinterpret_cast(aom_memalign(32, block_size())); + ref_ = reinterpret_cast( + aom_memalign(32, block_size() + width() + height() + 1)); + } else { + src_ = CONVERT_TO_BYTEPTR(reinterpret_cast( + aom_memalign(32, block_size() * sizeof(uint16_t)))); + sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast( + aom_memalign(32, block_size() * sizeof(uint16_t)))); + ref_ = CONVERT_TO_BYTEPTR(aom_memalign( + 32, (block_size() + width() + height() + 1) * sizeof(uint16_t))); + } + ASSERT_TRUE(src_ != NULL); + ASSERT_TRUE(sec_ != NULL); + ASSERT_TRUE(ref_ != NULL); + } + + virtual void TearDown() { + if (!use_high_bit_depth()) { + aom_free(src_); + aom_free(ref_); + aom_free(sec_); + } else { + aom_free(CONVERT_TO_SHORTPTR(src_)); + aom_free(CONVERT_TO_SHORTPTR(ref_)); + aom_free(CONVERT_TO_SHORTPTR(sec_)); + } + libaom_test::ClearSystemState(); + } + + protected: + void RefTest(); + void ExtremeRefTest(); + void SpeedTest(); + + ACMRandom rnd_; + uint8_t *src_; + uint8_t *ref_; + uint8_t *sec_; + TestParams params_; + DIST_WTD_COMP_PARAMS jcp_param_; + + // some relay helpers + bool use_high_bit_depth() const { return params_.use_high_bit_depth; } + int byte_shift() const { return params_.bit_depth - 8; } + int block_size() const { return params_.block_size; } + int width() const { return params_.width; } + int height() const { return params_.height; } + uint32_t mask() const { return params_.mask; } +}; + +template +void SubpelVarianceTest::RefTest() { + for (int x = 0; x < 8; ++x) { + for (int y = 0; y < 8; ++y) { + if (!use_high_bit_depth()) { + for (int j = 0; j < 
+
+template <typename FunctionType>
+void SubpelVarianceTest<FunctionType>::RefTest() {
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      if (!use_high_bit_depth()) {
+        for (int j = 0; j < block_size(); j++) {
+          src_[j] = rnd_.Rand8();
+        }
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          ref_[j] = rnd_.Rand8();
+        }
+      } else {
+        for (int j = 0; j < block_size(); j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+        }
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+        }
+      }
+      unsigned int sse1, sse2;
+      unsigned int var1;
+      ASM_REGISTER_STATE_CHECK(
+          var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
+      const unsigned int var2 = subpel_variance_ref(
+          ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
+          use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+      EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+    }
+  }
+}
+
+template <typename FunctionType>
+void SubpelVarianceTest<FunctionType>::ExtremeRefTest() {
+  // Compare against reference.
+  // Src: Set the first half of values to 0, the second half to the maximum.
+  // Ref: Set the first half of values to the maximum, the second half to 0.
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      const int half = block_size() / 2;
+      if (!use_high_bit_depth()) {
+        memset(src_, 0, half);
+        memset(src_ + half, 255, half);
+        memset(ref_, 255, half);
+        memset(ref_ + half, 0, half + width() + height() + 1);
+      } else {
+        aom_memset16(CONVERT_TO_SHORTPTR(src_), mask(), half);
+        aom_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half);
+        aom_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half);
+        aom_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask(),
+                     half + width() + height() + 1);
+      }
+      unsigned int sse1, sse2;
+      unsigned int var1;
+      ASM_REGISTER_STATE_CHECK(
+          var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
+      const unsigned int var2 = subpel_variance_ref(
+          ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
+          use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
+      EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
+    }
+  }
+}
+
+template <typename FunctionType>
+void SubpelVarianceTest<FunctionType>::SpeedTest() {
+  if (!use_high_bit_depth()) {
+    for (int j = 0; j < block_size(); j++) {
+      src_[j] = rnd_.Rand8();
+    }
+    for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+      ref_[j] = rnd_.Rand8();
+    }
+  } else {
+    for (int j = 0; j < block_size(); j++) {
+      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+    }
+    for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+      CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+    }
+  }
+
+  unsigned int sse1, sse2;
+  int run_time = 1000000000 / block_size();
+  aom_usec_timer timer;
+
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_time; ++i) {
+    int x = rnd_(8);
+    int y = rnd_(8);
+    params_.func(ref_, width() + 1, x, y, src_, width(), &sse1);
+  }
+  aom_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+  aom_usec_timer timer_c;
+
+  aom_usec_timer_start(&timer_c);
+  for (int i = 0; i < run_time; ++i) {
+    int x = rnd_(8);
+    int y = rnd_(8);
+    subpel_variance_ref(ref_, src_, params_.log2width, params_.log2height, x,
+                        y, &sse2, use_high_bit_depth(), params_.bit_depth);
+  }
+  aom_usec_timer_mark(&timer_c);
+
+  const int elapsed_time_c =
+      static_cast<int>(aom_usec_timer_elapsed(&timer_c));
+
+  printf(
+      "sub_pixel_variance_%dx%d_%d: ref_time=%d us opt_time=%d us gain=%d \n",
+      width(), height(), params_.bit_depth, elapsed_time_c, elapsed_time,
+      elapsed_time_c / elapsed_time);
+}
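+
+// The two RefTest specializations below exercise the compound kernels: the
+// filtered reference is blended with a second predictor (sec_) before the
+// variance is taken. For the plain average the per-pixel blend is
+//   comp = (pred + sec + 1) >> 1;
+// for the distance-weighted variant the weights come from
+// quant_dist_lookup_table through jcp_param_. Sketch only, assuming the
+// usual 4-bit distance-weight precision (fwd_offset + bck_offset == 16):
+//   comp = (pred * fwd_offset + sec * bck_offset + 8) >> 4;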
+
+template <>
+void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      if (!use_high_bit_depth()) {
+        for (int j = 0; j < block_size(); j++) {
+          src_[j] = rnd_.Rand8();
+          sec_[j] = rnd_.Rand8();
+        }
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          ref_[j] = rnd_.Rand8();
+        }
+      } else {
+        for (int j = 0; j < block_size(); j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+          CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask();
+        }
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+        }
+      }
+      uint32_t sse1, sse2;
+      uint32_t var1, var2;
+      ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y,
+                                                   src_, width(), &sse1,
+                                                   sec_));
+      var2 = subpel_avg_variance_ref(ref_, src_, sec_, params_.log2width,
+                                     params_.log2height, x, y, &sse2,
+                                     use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+      EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+    }
+  }
+}
+
+template <>
+void SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>::RefTest() {
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      if (!use_high_bit_depth()) {
+        for (int j = 0; j < block_size(); j++) {
+          src_[j] = rnd_.Rand8();
+          sec_[j] = rnd_.Rand8();
+        }
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          ref_[j] = rnd_.Rand8();
+        }
+      } else {
+        for (int j = 0; j < block_size(); j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+          CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask();
+        }
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+        }
+      }
+      for (int x0 = 0; x0 < 2; ++x0) {
+        for (int y0 = 0; y0 < 4; ++y0) {
+          uint32_t sse1, sse2;
+          uint32_t var1, var2;
+          jcp_param_.fwd_offset = quant_dist_lookup_table[x0][y0][0];
+          jcp_param_.bck_offset = quant_dist_lookup_table[x0][y0][1];
+          ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 0, x, y,
+                                                       src_, width(), &sse1,
+                                                       sec_, &jcp_param_));
+          var2 = dist_wtd_subpel_avg_variance_ref(
+              ref_, src_, sec_, params_.log2width, params_.log2height, x, y,
+              &sse2, use_high_bit_depth(), params_.bit_depth, &jcp_param_);
+          EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+          EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+        }
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+static const int kMaskMax = 64;
+
+typedef TestParams<ObmcSubpelVarFunc> ObmcSubpelVarianceParams;
+
+template <typename FunctionType>
+class ObmcVarianceTest
+    : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+  virtual void SetUp() {
+    params_ = this->GetParam();
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    if (!use_high_bit_depth()) {
+      pre_ = reinterpret_cast<uint8_t *>(
+          aom_memalign(32, block_size() + width() + height() + 1));
+    } else {
+      // The buffer holds uint16_t samples, so the whole padded size must be
+      // scaled by sizeof(uint16_t).
+      pre_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(aom_memalign(
+          32, (block_size() + width() + height() + 1) * sizeof(uint16_t))));
+    }
+    wsrc_ = reinterpret_cast<int32_t *>(
+        aom_memalign(32, block_size() * sizeof(uint32_t)));
+    mask_ = reinterpret_cast<int32_t *>(
+        aom_memalign(32, block_size() * sizeof(uint32_t)));
+    ASSERT_TRUE(pre_ != NULL);
+    ASSERT_TRUE(wsrc_ != NULL);
+    ASSERT_TRUE(mask_ != NULL);
+  }
+
+  virtual void TearDown() {
+    if (!use_high_bit_depth()) {
+      aom_free(pre_);
+    } else {
+      aom_free(CONVERT_TO_SHORTPTR(pre_));
+    }
+    aom_free(wsrc_);
+    aom_free(mask_);
+    libaom_test::ClearSystemState();
+  }
+
+ protected:
+  void RefTest();
+  void ExtremeRefTest();
+  void SpeedTest();
+
+  ACMRandom rnd_;
+  uint8_t *pre_;
+  int32_t *wsrc_;
+  int32_t *mask_;
+  TestParams<FunctionType> params_;
+
+  // some relay helpers
+  bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
+  int byte_shift() const { return params_.bit_depth - 8; }
+  int block_size() const { return params_.block_size; }
+  int width() const { return params_.width; }
+  int height() const { return params_.height; }
+  uint32_t bd_mask() const { return params_.mask; }
+};
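+
+// OBMC variance takes a pre-weighted source: wsrc_ holds samples already
+// scaled by the blending weight, and mask_ holds weights in
+// [0, kMaskMax * kMaskMax]. The reference helper undoes the scaling with a
+// signed 12-bit rounding shift (12 == 2 * log2(kMaskMax)), conceptually
+//   diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - mask[j] * pred[j], 12);
+// which is why RefTest below draws wsrc_ values as
+//   (rnd_.Rand16() & bd_mask()) * rnd_(kMaskMax * kMaskMax + 1).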
+
+template <>
+void ObmcVarianceTest<ObmcSubpelVarFunc>::RefTest() {
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      if (!use_high_bit_depth())
+        for (int j = 0; j < block_size() + width() + height() + 1; j++)
+          pre_[j] = rnd_.Rand8();
+      else
+        for (int j = 0; j < block_size() + width() + height() + 1; j++)
+          CONVERT_TO_SHORTPTR(pre_)[j] = rnd_.Rand16() & bd_mask();
+      for (int j = 0; j < block_size(); j++) {
+        wsrc_[j] = (rnd_.Rand16() & bd_mask()) * rnd_(kMaskMax * kMaskMax + 1);
+        mask_[j] = rnd_(kMaskMax * kMaskMax + 1);
+      }
+
+      uint32_t sse1, sse2;
+      uint32_t var1, var2;
+      ASM_REGISTER_STATE_CHECK(
+          var1 = params_.func(pre_, width() + 1, x, y, wsrc_, mask_, &sse1));
+      var2 = obmc_subpel_variance_ref(
+          pre_, params_.log2width, params_.log2height, x, y, wsrc_, mask_,
+          &sse2, use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
+      EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
+    }
+  }
+}
+
+template <>
+void ObmcVarianceTest<ObmcSubpelVarFunc>::ExtremeRefTest() {
+  // Pre: Set the first half of values to the maximum, the second half to 0.
+  // Mask: same as above
+  // WSrc: Set the first half of values to 0, the second half to the maximum.
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      const int half = block_size() / 2;
+      if (!use_high_bit_depth()) {
+        memset(pre_, 255, half);
+        memset(pre_ + half, 0, half + width() + height() + 1);
+      } else {
+        aom_memset16(CONVERT_TO_SHORTPTR(pre_), bd_mask(), half);
+        aom_memset16(CONVERT_TO_SHORTPTR(pre_) + half, 0, half);
+      }
+      for (int j = 0; j < half; j++) {
+        wsrc_[j] = bd_mask() * kMaskMax * kMaskMax;
+        mask_[j] = 0;
+      }
+      for (int j = half; j < block_size(); j++) {
+        wsrc_[j] = 0;
+        mask_[j] = kMaskMax * kMaskMax;
+      }
+
+      uint32_t sse1, sse2;
+      uint32_t var1, var2;
+      ASM_REGISTER_STATE_CHECK(
+          var1 = params_.func(pre_, width() + 1, x, y, wsrc_, mask_, &sse1));
+      var2 = obmc_subpel_variance_ref(
+          pre_, params_.log2width, params_.log2height, x, y, wsrc_, mask_,
+          &sse2, use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
+      EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
+    }
+  }
+}
+
+template <>
+void ObmcVarianceTest<ObmcSubpelVarFunc>::SpeedTest() {
+  if (!use_high_bit_depth())
+    for (int j = 0; j < block_size() + width() + height() + 1; j++)
+      pre_[j] = rnd_.Rand8();
+  else
+    for (int j = 0; j < block_size() + width() + height() + 1; j++)
+      CONVERT_TO_SHORTPTR(pre_)[j] = rnd_.Rand16() & bd_mask();
+  for (int j = 0; j < block_size(); j++) {
+    wsrc_[j] = (rnd_.Rand16() & bd_mask()) * rnd_(kMaskMax * kMaskMax + 1);
+    mask_[j] = rnd_(kMaskMax * kMaskMax + 1);
+  }
+  unsigned int sse1;
+  const int stride = width() + 1;
+  int run_time = 1000000000 / block_size();
+  aom_usec_timer timer;
+
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_time; ++i) {
+    int x = rnd_(8);
+    int y = rnd_(8);
+    ASM_REGISTER_STATE_CHECK(
+        params_.func(pre_, stride, x, y, wsrc_, mask_, &sse1));
+  }
+  aom_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+  printf("obmc_sub_pixel_variance_%dx%d_%d: %d us\n", width(), height(),
+         params_.bit_depth, elapsed_time);
+}
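+
+// Wiring: each typedef below binds a test fixture to one function-pointer
+// type, and INSTANTIATE_TEST_SUITE_P fans the fixture out over a list of
+// TestParams(log2width, log2height, function[, bit_depth]) values, so a new
+// SIMD kernel is registered with a single extra entry. Sketch only (the
+// suite/function pairing here is hypothetical):
+//   INSTANTIATE_TEST_SUITE_P(NEON, AvxVarianceTest,
+//                            ::testing::Values(VarianceParams(
+//                                6, 6, &aom_variance64x64_neon)));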
+
+typedef MainTestClass<Get4x4SseFunc> AvxSseTest;
+typedef MainTestClass<VarianceMxNFunc> AvxMseTest;
+typedef MainTestClass<VarianceMxNFunc> AvxVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxSubpelAvgVarianceTest;
+typedef SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>
+    AvxDistWtdSubpelAvgVarianceTest;
+typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxObmcSubpelVarianceTest;
+
+TEST_P(AvxSseTest, RefSse) { RefTestSse(); }
+TEST_P(AvxSseTest, MaxSse) { MaxTestSse(); }
+TEST_P(AvxMseTest, RefMse) { RefTestMse(); }
+TEST_P(AvxMseTest, MaxMse) { MaxTestMse(); }
+TEST_P(AvxVarianceTest, Zero) { ZeroTest(); }
+TEST_P(AvxVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxVarianceTest, RefStride) { RefStrideTest(); }
+TEST_P(AvxVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(AvxVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
+TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
+TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(AvxSubpelAvgVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxDistWtdSubpelAvgVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxObmcSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxObmcSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxObmcSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, SumOfSquaresTest,
+                         ::testing::Values(aom_get_mb_ss_c));
+
+typedef TestParams<Get4x4SseFunc> SseParams;
+INSTANTIATE_TEST_SUITE_P(C, AvxSseTest,
+                         ::testing::Values(SseParams(2, 2,
+                                                     &aom_get4x4sse_cs_c)));
+
+typedef TestParams<VarianceMxNFunc> MseParams;
+INSTANTIATE_TEST_SUITE_P(C, AvxMseTest,
+                         ::testing::Values(MseParams(4, 4, &aom_mse16x16_c),
+                                           MseParams(4, 3, &aom_mse16x8_c),
+                                           MseParams(3, 4, &aom_mse8x16_c),
+                                           MseParams(3, 3, &aom_mse8x8_c)));
+
+typedef TestParams<VarianceMxNFunc> VarianceParams;
+INSTANTIATE_TEST_SUITE_P(
+    C, AvxVarianceTest,
+    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_c),
+                      VarianceParams(7, 6, &aom_variance128x64_c),
+                      VarianceParams(6, 7, &aom_variance64x128_c),
+                      VarianceParams(6, 6, &aom_variance64x64_c),
+                      VarianceParams(6, 5, &aom_variance64x32_c),
+                      VarianceParams(5, 6, &aom_variance32x64_c),
+                      VarianceParams(5, 5, &aom_variance32x32_c),
+                      VarianceParams(5, 4, &aom_variance32x16_c),
+                      VarianceParams(4, 5, &aom_variance16x32_c),
+                      VarianceParams(4, 4, &aom_variance16x16_c),
+                      VarianceParams(4, 3, &aom_variance16x8_c),
+                      VarianceParams(3, 4, &aom_variance8x16_c),
+                      VarianceParams(3, 3, &aom_variance8x8_c),
+                      VarianceParams(3, 2, &aom_variance8x4_c),
+                      VarianceParams(2, 3, &aom_variance4x8_c),
+                      VarianceParams(2, 2, &aom_variance4x4_c),
+
+                      VarianceParams(6, 4, &aom_variance64x16_c),
+                      VarianceParams(4, 6, &aom_variance16x64_c),
+                      VarianceParams(5, 3, &aom_variance32x8_c),
+                      VarianceParams(3, 5, &aom_variance8x32_c),
+                      VarianceParams(4, 2, &aom_variance16x4_c),
+                      VarianceParams(2, 4, &aom_variance4x16_c)));
+
+typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams;
+INSTANTIATE_TEST_SUITE_P(
+    C, AvxSubpelVarianceTest,
+    ::testing::Values(
+        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_c, 0),
+        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_c, 0),
+        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_c, 0),
+        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_c, 0),
+        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_c, 0),
+        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_c, 0),
+        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_c, 0),
+        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_c, 0),
+        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_c, 0),
+        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_c, 0),
+        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_c, 0),
+        SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_c, 0),
+        SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_c, 0),
+        SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_c, 0),
+        SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_c, 0),
+        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_c, 0),
+
+        SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_c, 0),
+        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_c, 0),
+        SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_c, 0),
+        SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_c, 0),
+        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_c, 0),
+        SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_c, 0)));
+
+typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams;
+INSTANTIATE_TEST_SUITE_P(
+    C, AvxSubpelAvgVarianceTest,
+    ::testing::Values(
+        SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_c, 0),
+        SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_c, 0),
+        SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_c, 0),
+        SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_c, 0),
+        SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_c, 0),
+        SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_c, 0),
+        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_c, 0),
+        SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_c, 0),
+        SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_c, 0),
+        SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_c, 0),
+        SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_c, 0),
+        SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_c, 0),
+        SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_c, 0),
+        SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_c, 0),
+        SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_c, 0),
+        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_c, 0),
+
+        SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_c, 0),
+        SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_c, 0),
+        SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_c, 0),
+        SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_c, 0),
+        SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_c, 0),
+        SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_c, 0)));
+
+typedef TestParams<DistWtdSubpixAvgVarMxNFunc> DistWtdSubpelAvgVarianceParams;
+INSTANTIATE_TEST_SUITE_P(
+    C, AvxDistWtdSubpelAvgVarianceTest,
+    ::testing::Values(DistWtdSubpelAvgVarianceParams(
+                          6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_c, 0),
+
+                      DistWtdSubpelAvgVarianceParams(
+                          6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_c,
+                          0)));
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AvxObmcSubpelVarianceTest,
+    ::testing::Values(
+        ObmcSubpelVarianceParams(7, 7, &aom_obmc_sub_pixel_variance128x128_c,
+                                 0),
+        ObmcSubpelVarianceParams(7, 6, &aom_obmc_sub_pixel_variance128x64_c, 0),
+        ObmcSubpelVarianceParams(6, 7, &aom_obmc_sub_pixel_variance64x128_c, 0),
+        ObmcSubpelVarianceParams(6, 6, &aom_obmc_sub_pixel_variance64x64_c, 0),
+        ObmcSubpelVarianceParams(6, 5, &aom_obmc_sub_pixel_variance64x32_c, 0),
+        ObmcSubpelVarianceParams(5, 6, &aom_obmc_sub_pixel_variance32x64_c, 0),
+        ObmcSubpelVarianceParams(5, 5, &aom_obmc_sub_pixel_variance32x32_c, 0),
+        ObmcSubpelVarianceParams(5, 4, &aom_obmc_sub_pixel_variance32x16_c, 0),
+        ObmcSubpelVarianceParams(4, 5, &aom_obmc_sub_pixel_variance16x32_c, 0),
+        ObmcSubpelVarianceParams(4, 4, &aom_obmc_sub_pixel_variance16x16_c, 0),
+        ObmcSubpelVarianceParams(4, 3, &aom_obmc_sub_pixel_variance16x8_c, 0),
+        ObmcSubpelVarianceParams(3, 4, &aom_obmc_sub_pixel_variance8x16_c, 0),
+        ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_c, 0),
+        ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_c, 0),
+        ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_c, 0),
+        ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_c, 0),
+
+        ObmcSubpelVarianceParams(6, 4, &aom_obmc_sub_pixel_variance64x16_c, 0),
+        ObmcSubpelVarianceParams(4, 6, &aom_obmc_sub_pixel_variance16x64_c, 0),
+        ObmcSubpelVarianceParams(5, 3, &aom_obmc_sub_pixel_variance32x8_c, 0),
+        ObmcSubpelVarianceParams(3, 5, &aom_obmc_sub_pixel_variance8x32_c, 0),
+        ObmcSubpelVarianceParams(4, 2, &aom_obmc_sub_pixel_variance16x4_c, 0),
+        ObmcSubpelVarianceParams(2, 4, &aom_obmc_sub_pixel_variance4x16_c, 0)));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef MainTestClass<VarianceMxNFunc> AvxHBDMseTest;
+typedef MainTestClass<VarianceMxNFunc> AvxHBDVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxHBDSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxHBDSubpelAvgVarianceTest;
+typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxHBDObmcSubpelVarianceTest;
+
+TEST_P(AvxHBDMseTest, RefMse) { RefTestMse(); }
+TEST_P(AvxHBDMseTest, MaxMse) { MaxTestMse(); }
+TEST_P(AvxHBDVarianceTest, Zero) { ZeroTest(); }
+TEST_P(AvxHBDVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxHBDVarianceTest, RefStride) { RefStrideTest(); }
+TEST_P(AvxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(AvxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(AvxHBDSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } +TEST_P(AvxHBDSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); } +TEST_P(AvxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } + +/* TODO(debargha): This test does not support the highbd version +INSTANTIATE_TEST_SUITE_P( + C, AvxHBDMseTest, + ::testing::Values(make_tuple(4, 4, &aom_highbd_12_mse16x16_c), + make_tuple(4, 4, &aom_highbd_12_mse16x8_c), + make_tuple(4, 4, &aom_highbd_12_mse8x16_c), + make_tuple(4, 4, &aom_highbd_12_mse8x8_c), + make_tuple(4, 4, &aom_highbd_10_mse16x16_c), + make_tuple(4, 4, &aom_highbd_10_mse16x8_c), + make_tuple(4, 4, &aom_highbd_10_mse8x16_c), + make_tuple(4, 4, &aom_highbd_10_mse8x8_c), + make_tuple(4, 4, &aom_highbd_8_mse16x16_c), + make_tuple(4, 4, &aom_highbd_8_mse16x8_c), + make_tuple(4, 4, &aom_highbd_8_mse8x16_c), + make_tuple(4, 4, &aom_highbd_8_mse8x8_c))); +*/ + +const VarianceParams kArrayHBDVariance_c[] = { + VarianceParams(7, 7, &aom_highbd_12_variance128x128_c, 12), + VarianceParams(7, 6, &aom_highbd_12_variance128x64_c, 12), + VarianceParams(6, 7, &aom_highbd_12_variance64x128_c, 12), + VarianceParams(6, 6, &aom_highbd_12_variance64x64_c, 12), + VarianceParams(6, 5, &aom_highbd_12_variance64x32_c, 12), + VarianceParams(5, 6, &aom_highbd_12_variance32x64_c, 12), + VarianceParams(5, 5, &aom_highbd_12_variance32x32_c, 12), + VarianceParams(5, 4, &aom_highbd_12_variance32x16_c, 12), + VarianceParams(4, 5, &aom_highbd_12_variance16x32_c, 12), + VarianceParams(4, 4, &aom_highbd_12_variance16x16_c, 12), + VarianceParams(4, 3, &aom_highbd_12_variance16x8_c, 12), + VarianceParams(3, 4, &aom_highbd_12_variance8x16_c, 12), + VarianceParams(3, 3, &aom_highbd_12_variance8x8_c, 12), + VarianceParams(3, 2, &aom_highbd_12_variance8x4_c, 12), + VarianceParams(2, 3, &aom_highbd_12_variance4x8_c, 12), + VarianceParams(2, 2, &aom_highbd_12_variance4x4_c, 12), + VarianceParams(7, 7, &aom_highbd_10_variance128x128_c, 10), + VarianceParams(7, 6, &aom_highbd_10_variance128x64_c, 10), + VarianceParams(6, 7, &aom_highbd_10_variance64x128_c, 10), + VarianceParams(6, 6, &aom_highbd_10_variance64x64_c, 10), + VarianceParams(6, 5, &aom_highbd_10_variance64x32_c, 10), + VarianceParams(5, 6, &aom_highbd_10_variance32x64_c, 10), + VarianceParams(5, 5, &aom_highbd_10_variance32x32_c, 10), + VarianceParams(5, 4, &aom_highbd_10_variance32x16_c, 10), + VarianceParams(4, 5, &aom_highbd_10_variance16x32_c, 10), + VarianceParams(4, 4, &aom_highbd_10_variance16x16_c, 10), + VarianceParams(4, 3, &aom_highbd_10_variance16x8_c, 10), + VarianceParams(3, 4, &aom_highbd_10_variance8x16_c, 10), + VarianceParams(3, 3, &aom_highbd_10_variance8x8_c, 10), + VarianceParams(3, 2, &aom_highbd_10_variance8x4_c, 10), + VarianceParams(2, 3, &aom_highbd_10_variance4x8_c, 10), + VarianceParams(2, 2, &aom_highbd_10_variance4x4_c, 10), + VarianceParams(7, 7, &aom_highbd_8_variance128x128_c, 8), + VarianceParams(7, 6, &aom_highbd_8_variance128x64_c, 8), + VarianceParams(6, 7, &aom_highbd_8_variance64x128_c, 8), + VarianceParams(6, 6, &aom_highbd_8_variance64x64_c, 8), + VarianceParams(6, 5, &aom_highbd_8_variance64x32_c, 8), + VarianceParams(5, 6, &aom_highbd_8_variance32x64_c, 8), + VarianceParams(5, 5, &aom_highbd_8_variance32x32_c, 8), + VarianceParams(5, 4, &aom_highbd_8_variance32x16_c, 8), + VarianceParams(4, 5, &aom_highbd_8_variance16x32_c, 8), + VarianceParams(4, 4, &aom_highbd_8_variance16x16_c, 8), + VarianceParams(4, 3, &aom_highbd_8_variance16x8_c, 8), + VarianceParams(3, 4, &aom_highbd_8_variance8x16_c, 8), + VarianceParams(3, 
3, &aom_highbd_8_variance8x8_c, 8), + VarianceParams(3, 2, &aom_highbd_8_variance8x4_c, 8), + VarianceParams(2, 3, &aom_highbd_8_variance4x8_c, 8), + VarianceParams(2, 2, &aom_highbd_8_variance4x4_c, 8), + + VarianceParams(6, 4, &aom_highbd_12_variance64x16_c, 12), + VarianceParams(4, 6, &aom_highbd_12_variance16x64_c, 12), + VarianceParams(5, 3, &aom_highbd_12_variance32x8_c, 12), + VarianceParams(3, 5, &aom_highbd_12_variance8x32_c, 12), + VarianceParams(4, 2, &aom_highbd_12_variance16x4_c, 12), + VarianceParams(2, 4, &aom_highbd_12_variance4x16_c, 12), + VarianceParams(6, 4, &aom_highbd_10_variance64x16_c, 10), + VarianceParams(4, 6, &aom_highbd_10_variance16x64_c, 10), + VarianceParams(5, 3, &aom_highbd_10_variance32x8_c, 10), + VarianceParams(3, 5, &aom_highbd_10_variance8x32_c, 10), + VarianceParams(4, 2, &aom_highbd_10_variance16x4_c, 10), + VarianceParams(2, 4, &aom_highbd_10_variance4x16_c, 10), + VarianceParams(6, 4, &aom_highbd_8_variance64x16_c, 8), + VarianceParams(4, 6, &aom_highbd_8_variance16x64_c, 8), + VarianceParams(5, 3, &aom_highbd_8_variance32x8_c, 8), + VarianceParams(3, 5, &aom_highbd_8_variance8x32_c, 8), + VarianceParams(4, 2, &aom_highbd_8_variance16x4_c, 8), + VarianceParams(2, 4, &aom_highbd_8_variance4x16_c, 8), +}; +INSTANTIATE_TEST_SUITE_P(C, AvxHBDVarianceTest, + ::testing::ValuesIn(kArrayHBDVariance_c)); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, AvxHBDVarianceTest, + ::testing::Values( + VarianceParams(2, 2, &aom_highbd_8_variance4x4_sse4_1, 8), + VarianceParams(2, 2, &aom_highbd_10_variance4x4_sse4_1, 10), + VarianceParams(2, 2, &aom_highbd_12_variance4x4_sse4_1, 12))); +#endif // HAVE_SSE4_1 + +const SubpelVarianceParams kArrayHBDSubpelVariance_c[] = { + SubpelVarianceParams(7, 7, &aom_highbd_8_sub_pixel_variance128x128_c, 8), + SubpelVarianceParams(7, 6, &aom_highbd_8_sub_pixel_variance128x64_c, 8), + SubpelVarianceParams(6, 7, &aom_highbd_8_sub_pixel_variance64x128_c, 8), + SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_c, 8), + SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_c, 8), + SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_c, 8), + SubpelVarianceParams(5, 5, &aom_highbd_8_sub_pixel_variance32x32_c, 8), + SubpelVarianceParams(5, 4, &aom_highbd_8_sub_pixel_variance32x16_c, 8), + SubpelVarianceParams(4, 5, &aom_highbd_8_sub_pixel_variance16x32_c, 8), + SubpelVarianceParams(4, 4, &aom_highbd_8_sub_pixel_variance16x16_c, 8), + SubpelVarianceParams(4, 3, &aom_highbd_8_sub_pixel_variance16x8_c, 8), + SubpelVarianceParams(3, 4, &aom_highbd_8_sub_pixel_variance8x16_c, 8), + SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_c, 8), + SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_c, 8), + SubpelVarianceParams(2, 3, &aom_highbd_8_sub_pixel_variance4x8_c, 8), + SubpelVarianceParams(2, 2, &aom_highbd_8_sub_pixel_variance4x4_c, 8), + SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_c, 10), + SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_c, 10), + SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_c, 10), + SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_c, 10), + SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_c, 10), + SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_c, 10), + SubpelVarianceParams(5, 5, &aom_highbd_10_sub_pixel_variance32x32_c, 10), + SubpelVarianceParams(5, 4, &aom_highbd_10_sub_pixel_variance32x16_c, 10), + SubpelVarianceParams(4, 5, 
&aom_highbd_10_sub_pixel_variance16x32_c, 10), + SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_c, 10), + SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_c, 10), + SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_c, 10), + SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_c, 10), + SubpelVarianceParams(3, 2, &aom_highbd_10_sub_pixel_variance8x4_c, 10), + SubpelVarianceParams(2, 3, &aom_highbd_10_sub_pixel_variance4x8_c, 10), + SubpelVarianceParams(2, 2, &aom_highbd_10_sub_pixel_variance4x4_c, 10), + SubpelVarianceParams(7, 7, &aom_highbd_12_sub_pixel_variance128x128_c, 12), + SubpelVarianceParams(7, 6, &aom_highbd_12_sub_pixel_variance128x64_c, 12), + SubpelVarianceParams(6, 7, &aom_highbd_12_sub_pixel_variance64x128_c, 12), + SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_c, 12), + SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_c, 12), + SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_c, 12), + SubpelVarianceParams(5, 5, &aom_highbd_12_sub_pixel_variance32x32_c, 12), + SubpelVarianceParams(5, 4, &aom_highbd_12_sub_pixel_variance32x16_c, 12), + SubpelVarianceParams(4, 5, &aom_highbd_12_sub_pixel_variance16x32_c, 12), + SubpelVarianceParams(4, 4, &aom_highbd_12_sub_pixel_variance16x16_c, 12), + SubpelVarianceParams(4, 3, &aom_highbd_12_sub_pixel_variance16x8_c, 12), + SubpelVarianceParams(3, 4, &aom_highbd_12_sub_pixel_variance8x16_c, 12), + SubpelVarianceParams(3, 3, &aom_highbd_12_sub_pixel_variance8x8_c, 12), + SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_c, 12), + SubpelVarianceParams(2, 3, &aom_highbd_12_sub_pixel_variance4x8_c, 12), + SubpelVarianceParams(2, 2, &aom_highbd_12_sub_pixel_variance4x4_c, 12), + + SubpelVarianceParams(6, 4, &aom_highbd_8_sub_pixel_variance64x16_c, 8), + SubpelVarianceParams(4, 6, &aom_highbd_8_sub_pixel_variance16x64_c, 8), + SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_c, 8), + SubpelVarianceParams(3, 5, &aom_highbd_8_sub_pixel_variance8x32_c, 8), + SubpelVarianceParams(4, 2, &aom_highbd_8_sub_pixel_variance16x4_c, 8), + SubpelVarianceParams(2, 4, &aom_highbd_8_sub_pixel_variance4x16_c, 8), + SubpelVarianceParams(6, 4, &aom_highbd_10_sub_pixel_variance64x16_c, 10), + SubpelVarianceParams(4, 6, &aom_highbd_10_sub_pixel_variance16x64_c, 10), + SubpelVarianceParams(5, 3, &aom_highbd_10_sub_pixel_variance32x8_c, 10), + SubpelVarianceParams(3, 5, &aom_highbd_10_sub_pixel_variance8x32_c, 10), + SubpelVarianceParams(4, 2, &aom_highbd_10_sub_pixel_variance16x4_c, 10), + SubpelVarianceParams(2, 4, &aom_highbd_10_sub_pixel_variance4x16_c, 10), + SubpelVarianceParams(6, 4, &aom_highbd_12_sub_pixel_variance64x16_c, 12), + SubpelVarianceParams(4, 6, &aom_highbd_12_sub_pixel_variance16x64_c, 12), + SubpelVarianceParams(5, 3, &aom_highbd_12_sub_pixel_variance32x8_c, 12), + SubpelVarianceParams(3, 5, &aom_highbd_12_sub_pixel_variance8x32_c, 12), + SubpelVarianceParams(4, 2, &aom_highbd_12_sub_pixel_variance16x4_c, 12), + SubpelVarianceParams(2, 4, &aom_highbd_12_sub_pixel_variance4x16_c, 12), +}; +INSTANTIATE_TEST_SUITE_P(C, AvxHBDSubpelVarianceTest, + ::testing::ValuesIn(kArrayHBDSubpelVariance_c)); + +const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_c[] = { + SubpelAvgVarianceParams(7, 7, &aom_highbd_8_sub_pixel_avg_variance128x128_c, + 8), + SubpelAvgVarianceParams(7, 6, &aom_highbd_8_sub_pixel_avg_variance128x64_c, + 8), + SubpelAvgVarianceParams(6, 7, &aom_highbd_8_sub_pixel_avg_variance64x128_c, + 
8), + SubpelAvgVarianceParams(6, 6, &aom_highbd_8_sub_pixel_avg_variance64x64_c, 8), + SubpelAvgVarianceParams(6, 5, &aom_highbd_8_sub_pixel_avg_variance64x32_c, 8), + SubpelAvgVarianceParams(5, 6, &aom_highbd_8_sub_pixel_avg_variance32x64_c, 8), + SubpelAvgVarianceParams(5, 5, &aom_highbd_8_sub_pixel_avg_variance32x32_c, 8), + SubpelAvgVarianceParams(5, 4, &aom_highbd_8_sub_pixel_avg_variance32x16_c, 8), + SubpelAvgVarianceParams(4, 5, &aom_highbd_8_sub_pixel_avg_variance16x32_c, 8), + SubpelAvgVarianceParams(4, 4, &aom_highbd_8_sub_pixel_avg_variance16x16_c, 8), + SubpelAvgVarianceParams(4, 3, &aom_highbd_8_sub_pixel_avg_variance16x8_c, 8), + SubpelAvgVarianceParams(3, 4, &aom_highbd_8_sub_pixel_avg_variance8x16_c, 8), + SubpelAvgVarianceParams(3, 3, &aom_highbd_8_sub_pixel_avg_variance8x8_c, 8), + SubpelAvgVarianceParams(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_c, 8), + SubpelAvgVarianceParams(2, 3, &aom_highbd_8_sub_pixel_avg_variance4x8_c, 8), + SubpelAvgVarianceParams(2, 2, &aom_highbd_8_sub_pixel_avg_variance4x4_c, 8), + SubpelAvgVarianceParams(7, 7, &aom_highbd_10_sub_pixel_avg_variance128x128_c, + 10), + SubpelAvgVarianceParams(7, 6, &aom_highbd_10_sub_pixel_avg_variance128x64_c, + 10), + SubpelAvgVarianceParams(6, 7, &aom_highbd_10_sub_pixel_avg_variance64x128_c, + 10), + SubpelAvgVarianceParams(6, 6, &aom_highbd_10_sub_pixel_avg_variance64x64_c, + 10), + SubpelAvgVarianceParams(6, 5, &aom_highbd_10_sub_pixel_avg_variance64x32_c, + 10), + SubpelAvgVarianceParams(5, 6, &aom_highbd_10_sub_pixel_avg_variance32x64_c, + 10), + SubpelAvgVarianceParams(5, 5, &aom_highbd_10_sub_pixel_avg_variance32x32_c, + 10), + SubpelAvgVarianceParams(5, 4, &aom_highbd_10_sub_pixel_avg_variance32x16_c, + 10), + SubpelAvgVarianceParams(4, 5, &aom_highbd_10_sub_pixel_avg_variance16x32_c, + 10), + SubpelAvgVarianceParams(4, 4, &aom_highbd_10_sub_pixel_avg_variance16x16_c, + 10), + SubpelAvgVarianceParams(4, 3, &aom_highbd_10_sub_pixel_avg_variance16x8_c, + 10), + SubpelAvgVarianceParams(3, 4, &aom_highbd_10_sub_pixel_avg_variance8x16_c, + 10), + SubpelAvgVarianceParams(3, 3, &aom_highbd_10_sub_pixel_avg_variance8x8_c, 10), + SubpelAvgVarianceParams(3, 2, &aom_highbd_10_sub_pixel_avg_variance8x4_c, 10), + SubpelAvgVarianceParams(2, 3, &aom_highbd_10_sub_pixel_avg_variance4x8_c, 10), + SubpelAvgVarianceParams(2, 2, &aom_highbd_10_sub_pixel_avg_variance4x4_c, 10), + SubpelAvgVarianceParams(7, 7, &aom_highbd_12_sub_pixel_avg_variance128x128_c, + 12), + SubpelAvgVarianceParams(7, 6, &aom_highbd_12_sub_pixel_avg_variance128x64_c, + 12), + SubpelAvgVarianceParams(6, 7, &aom_highbd_12_sub_pixel_avg_variance64x128_c, + 12), + SubpelAvgVarianceParams(6, 6, &aom_highbd_12_sub_pixel_avg_variance64x64_c, + 12), + SubpelAvgVarianceParams(6, 5, &aom_highbd_12_sub_pixel_avg_variance64x32_c, + 12), + SubpelAvgVarianceParams(5, 6, &aom_highbd_12_sub_pixel_avg_variance32x64_c, + 12), + SubpelAvgVarianceParams(5, 5, &aom_highbd_12_sub_pixel_avg_variance32x32_c, + 12), + SubpelAvgVarianceParams(5, 4, &aom_highbd_12_sub_pixel_avg_variance32x16_c, + 12), + SubpelAvgVarianceParams(4, 5, &aom_highbd_12_sub_pixel_avg_variance16x32_c, + 12), + SubpelAvgVarianceParams(4, 4, &aom_highbd_12_sub_pixel_avg_variance16x16_c, + 12), + SubpelAvgVarianceParams(4, 3, &aom_highbd_12_sub_pixel_avg_variance16x8_c, + 12), + SubpelAvgVarianceParams(3, 4, &aom_highbd_12_sub_pixel_avg_variance8x16_c, + 12), + SubpelAvgVarianceParams(3, 3, &aom_highbd_12_sub_pixel_avg_variance8x8_c, 12), + SubpelAvgVarianceParams(3, 2, 
&aom_highbd_12_sub_pixel_avg_variance8x4_c, 12), + SubpelAvgVarianceParams(2, 3, &aom_highbd_12_sub_pixel_avg_variance4x8_c, 12), + SubpelAvgVarianceParams(2, 2, &aom_highbd_12_sub_pixel_avg_variance4x4_c, 12), + + SubpelAvgVarianceParams(6, 4, &aom_highbd_8_sub_pixel_avg_variance64x16_c, 8), + SubpelAvgVarianceParams(4, 6, &aom_highbd_8_sub_pixel_avg_variance16x64_c, 8), + SubpelAvgVarianceParams(5, 3, &aom_highbd_8_sub_pixel_avg_variance32x8_c, 8), + SubpelAvgVarianceParams(3, 5, &aom_highbd_8_sub_pixel_avg_variance8x32_c, 8), + SubpelAvgVarianceParams(4, 2, &aom_highbd_8_sub_pixel_avg_variance16x4_c, 8), + SubpelAvgVarianceParams(2, 4, &aom_highbd_8_sub_pixel_avg_variance4x16_c, 8), + SubpelAvgVarianceParams(6, 4, &aom_highbd_10_sub_pixel_avg_variance64x16_c, + 10), + SubpelAvgVarianceParams(4, 6, &aom_highbd_10_sub_pixel_avg_variance16x64_c, + 10), + SubpelAvgVarianceParams(5, 3, &aom_highbd_10_sub_pixel_avg_variance32x8_c, + 10), + SubpelAvgVarianceParams(3, 5, &aom_highbd_10_sub_pixel_avg_variance8x32_c, + 10), + SubpelAvgVarianceParams(4, 2, &aom_highbd_10_sub_pixel_avg_variance16x4_c, + 10), + SubpelAvgVarianceParams(2, 4, &aom_highbd_10_sub_pixel_avg_variance4x16_c, + 10), + SubpelAvgVarianceParams(6, 4, &aom_highbd_12_sub_pixel_avg_variance64x16_c, + 12), + SubpelAvgVarianceParams(4, 6, &aom_highbd_12_sub_pixel_avg_variance16x64_c, + 12), + SubpelAvgVarianceParams(5, 3, &aom_highbd_12_sub_pixel_avg_variance32x8_c, + 12), + SubpelAvgVarianceParams(3, 5, &aom_highbd_12_sub_pixel_avg_variance8x32_c, + 12), + SubpelAvgVarianceParams(4, 2, &aom_highbd_12_sub_pixel_avg_variance16x4_c, + 12), + SubpelAvgVarianceParams(2, 4, &aom_highbd_12_sub_pixel_avg_variance4x16_c, + 12), +}; +INSTANTIATE_TEST_SUITE_P(C, AvxHBDSubpelAvgVarianceTest, + ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c)); + +const ObmcSubpelVarianceParams kArrayHBDObmcSubpelVariance_c[] = { + ObmcSubpelVarianceParams(7, 7, &aom_highbd_obmc_sub_pixel_variance128x128_c, + 8), + ObmcSubpelVarianceParams(7, 6, &aom_highbd_obmc_sub_pixel_variance128x64_c, + 8), + ObmcSubpelVarianceParams(6, 7, &aom_highbd_obmc_sub_pixel_variance64x128_c, + 8), + ObmcSubpelVarianceParams(6, 6, &aom_highbd_obmc_sub_pixel_variance64x64_c, 8), + ObmcSubpelVarianceParams(6, 5, &aom_highbd_obmc_sub_pixel_variance64x32_c, 8), + ObmcSubpelVarianceParams(5, 6, &aom_highbd_obmc_sub_pixel_variance32x64_c, 8), + ObmcSubpelVarianceParams(5, 5, &aom_highbd_obmc_sub_pixel_variance32x32_c, 8), + ObmcSubpelVarianceParams(5, 4, &aom_highbd_obmc_sub_pixel_variance32x16_c, 8), + ObmcSubpelVarianceParams(4, 5, &aom_highbd_obmc_sub_pixel_variance16x32_c, 8), + ObmcSubpelVarianceParams(4, 4, &aom_highbd_obmc_sub_pixel_variance16x16_c, 8), + ObmcSubpelVarianceParams(4, 3, &aom_highbd_obmc_sub_pixel_variance16x8_c, 8), + ObmcSubpelVarianceParams(3, 4, &aom_highbd_obmc_sub_pixel_variance8x16_c, 8), + ObmcSubpelVarianceParams(3, 3, &aom_highbd_obmc_sub_pixel_variance8x8_c, 8), + ObmcSubpelVarianceParams(3, 2, &aom_highbd_obmc_sub_pixel_variance8x4_c, 8), + ObmcSubpelVarianceParams(2, 3, &aom_highbd_obmc_sub_pixel_variance4x8_c, 8), + ObmcSubpelVarianceParams(2, 2, &aom_highbd_obmc_sub_pixel_variance4x4_c, 8), + ObmcSubpelVarianceParams(7, 7, + &aom_highbd_10_obmc_sub_pixel_variance128x128_c, 10), + ObmcSubpelVarianceParams(7, 6, &aom_highbd_10_obmc_sub_pixel_variance128x64_c, + 10), + ObmcSubpelVarianceParams(6, 7, &aom_highbd_10_obmc_sub_pixel_variance64x128_c, + 10), + ObmcSubpelVarianceParams(6, 6, &aom_highbd_10_obmc_sub_pixel_variance64x64_c, + 10), + 
ObmcSubpelVarianceParams(6, 5, &aom_highbd_10_obmc_sub_pixel_variance64x32_c, + 10), + ObmcSubpelVarianceParams(5, 6, &aom_highbd_10_obmc_sub_pixel_variance32x64_c, + 10), + ObmcSubpelVarianceParams(5, 5, &aom_highbd_10_obmc_sub_pixel_variance32x32_c, + 10), + ObmcSubpelVarianceParams(5, 4, &aom_highbd_10_obmc_sub_pixel_variance32x16_c, + 10), + ObmcSubpelVarianceParams(4, 5, &aom_highbd_10_obmc_sub_pixel_variance16x32_c, + 10), + ObmcSubpelVarianceParams(4, 4, &aom_highbd_10_obmc_sub_pixel_variance16x16_c, + 10), + ObmcSubpelVarianceParams(4, 3, &aom_highbd_10_obmc_sub_pixel_variance16x8_c, + 10), + ObmcSubpelVarianceParams(3, 4, &aom_highbd_10_obmc_sub_pixel_variance8x16_c, + 10), + ObmcSubpelVarianceParams(3, 3, &aom_highbd_10_obmc_sub_pixel_variance8x8_c, + 10), + ObmcSubpelVarianceParams(3, 2, &aom_highbd_10_obmc_sub_pixel_variance8x4_c, + 10), + ObmcSubpelVarianceParams(2, 3, &aom_highbd_10_obmc_sub_pixel_variance4x8_c, + 10), + ObmcSubpelVarianceParams(2, 2, &aom_highbd_10_obmc_sub_pixel_variance4x4_c, + 10), + ObmcSubpelVarianceParams(7, 7, + &aom_highbd_12_obmc_sub_pixel_variance128x128_c, 12), + ObmcSubpelVarianceParams(7, 6, &aom_highbd_12_obmc_sub_pixel_variance128x64_c, + 12), + ObmcSubpelVarianceParams(6, 7, &aom_highbd_12_obmc_sub_pixel_variance64x128_c, + 12), + ObmcSubpelVarianceParams(6, 6, &aom_highbd_12_obmc_sub_pixel_variance64x64_c, + 12), + ObmcSubpelVarianceParams(6, 5, &aom_highbd_12_obmc_sub_pixel_variance64x32_c, + 12), + ObmcSubpelVarianceParams(5, 6, &aom_highbd_12_obmc_sub_pixel_variance32x64_c, + 12), + ObmcSubpelVarianceParams(5, 5, &aom_highbd_12_obmc_sub_pixel_variance32x32_c, + 12), + ObmcSubpelVarianceParams(5, 4, &aom_highbd_12_obmc_sub_pixel_variance32x16_c, + 12), + ObmcSubpelVarianceParams(4, 5, &aom_highbd_12_obmc_sub_pixel_variance16x32_c, + 12), + ObmcSubpelVarianceParams(4, 4, &aom_highbd_12_obmc_sub_pixel_variance16x16_c, + 12), + ObmcSubpelVarianceParams(4, 3, &aom_highbd_12_obmc_sub_pixel_variance16x8_c, + 12), + ObmcSubpelVarianceParams(3, 4, &aom_highbd_12_obmc_sub_pixel_variance8x16_c, + 12), + ObmcSubpelVarianceParams(3, 3, &aom_highbd_12_obmc_sub_pixel_variance8x8_c, + 12), + ObmcSubpelVarianceParams(3, 2, &aom_highbd_12_obmc_sub_pixel_variance8x4_c, + 12), + ObmcSubpelVarianceParams(2, 3, &aom_highbd_12_obmc_sub_pixel_variance4x8_c, + 12), + ObmcSubpelVarianceParams(2, 2, &aom_highbd_12_obmc_sub_pixel_variance4x4_c, + 12), + + ObmcSubpelVarianceParams(6, 4, &aom_highbd_obmc_sub_pixel_variance64x16_c, 8), + ObmcSubpelVarianceParams(4, 6, &aom_highbd_obmc_sub_pixel_variance16x64_c, 8), + ObmcSubpelVarianceParams(5, 3, &aom_highbd_obmc_sub_pixel_variance32x8_c, 8), + ObmcSubpelVarianceParams(3, 5, &aom_highbd_obmc_sub_pixel_variance8x32_c, 8), + ObmcSubpelVarianceParams(4, 2, &aom_highbd_obmc_sub_pixel_variance16x4_c, 8), + ObmcSubpelVarianceParams(2, 4, &aom_highbd_obmc_sub_pixel_variance4x16_c, 8), + ObmcSubpelVarianceParams(6, 4, &aom_highbd_10_obmc_sub_pixel_variance64x16_c, + 10), + ObmcSubpelVarianceParams(4, 6, &aom_highbd_10_obmc_sub_pixel_variance16x64_c, + 10), + ObmcSubpelVarianceParams(5, 3, &aom_highbd_10_obmc_sub_pixel_variance32x8_c, + 10), + ObmcSubpelVarianceParams(3, 5, &aom_highbd_10_obmc_sub_pixel_variance8x32_c, + 10), + ObmcSubpelVarianceParams(4, 2, &aom_highbd_10_obmc_sub_pixel_variance16x4_c, + 10), + ObmcSubpelVarianceParams(2, 4, &aom_highbd_10_obmc_sub_pixel_variance4x16_c, + 10), + ObmcSubpelVarianceParams(6, 4, &aom_highbd_12_obmc_sub_pixel_variance64x16_c, + 12), + ObmcSubpelVarianceParams(4, 6, 
&aom_highbd_12_obmc_sub_pixel_variance16x64_c, + 12), + ObmcSubpelVarianceParams(5, 3, &aom_highbd_12_obmc_sub_pixel_variance32x8_c, + 12), + ObmcSubpelVarianceParams(3, 5, &aom_highbd_12_obmc_sub_pixel_variance8x32_c, + 12), + ObmcSubpelVarianceParams(4, 2, &aom_highbd_12_obmc_sub_pixel_variance16x4_c, + 12), + ObmcSubpelVarianceParams(2, 4, &aom_highbd_12_obmc_sub_pixel_variance4x16_c, + 12), +}; +INSTANTIATE_TEST_SUITE_P(C, AvxHBDObmcSubpelVarianceTest, + ::testing::ValuesIn(kArrayHBDObmcSubpelVariance_c)); +#endif // CONFIG_AV1_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, SumOfSquaresTest, + ::testing::Values(aom_get_mb_ss_sse2)); + +INSTANTIATE_TEST_SUITE_P(SSE2, AvxMseTest, + ::testing::Values(MseParams(4, 4, &aom_mse16x16_sse2), + MseParams(4, 3, &aom_mse16x8_sse2), + MseParams(3, 4, &aom_mse8x16_sse2), + MseParams(3, 3, &aom_mse8x8_sse2))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, AvxVarianceTest, + ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_sse2), + VarianceParams(7, 6, &aom_variance128x64_sse2), + VarianceParams(6, 7, &aom_variance64x128_sse2), + VarianceParams(6, 6, &aom_variance64x64_sse2), + VarianceParams(6, 5, &aom_variance64x32_sse2), + VarianceParams(6, 4, &aom_variance64x16_sse2), + VarianceParams(5, 6, &aom_variance32x64_sse2), + VarianceParams(5, 5, &aom_variance32x32_sse2), + VarianceParams(5, 4, &aom_variance32x16_sse2), + VarianceParams(5, 3, &aom_variance32x8_sse2), + VarianceParams(4, 6, &aom_variance16x64_sse2), + VarianceParams(4, 5, &aom_variance16x32_sse2), + VarianceParams(4, 4, &aom_variance16x16_sse2), + VarianceParams(4, 3, &aom_variance16x8_sse2), + VarianceParams(4, 2, &aom_variance16x4_sse2), + VarianceParams(3, 5, &aom_variance8x32_sse2), + VarianceParams(3, 4, &aom_variance8x16_sse2), + VarianceParams(3, 3, &aom_variance8x8_sse2), + VarianceParams(3, 2, &aom_variance8x4_sse2), + VarianceParams(2, 4, &aom_variance4x16_sse2), + VarianceParams(2, 3, &aom_variance4x8_sse2), + VarianceParams(2, 2, &aom_variance4x4_sse2))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, AvxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_sse2, 0), + SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_sse2, 0), + SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_sse2, 0), + SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_sse2, 0), + SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_sse2, 0), + SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_sse2, 0), + SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_sse2, 0), + SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_sse2, 0), + SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_sse2, 0), + SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_sse2, 0), + SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_sse2, 0), + SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_sse2, 0), + SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_sse2, 0), + SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_sse2, 0), + SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_sse2, 0), + SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_sse2, 0), + + SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_sse2, 0), + SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_sse2, 0), + SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_sse2, 0), + SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_sse2, 0), + SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_sse2, 0), + SubpelVarianceParams(2, 4, 
&aom_sub_pixel_variance4x16_sse2, 0))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, AvxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_sse2, + 0), + SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_sse2, + 0), + SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_sse2, + 0), + SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_sse2, 0), + SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_sse2, 0), + SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_sse2, 0), + SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_sse2, 0), + SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_sse2, 0), + SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_sse2, 0), + SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_sse2, 0), + SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_sse2, 0), + SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_sse2, 0), + SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_sse2, 0), + SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_sse2, 0), + SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_sse2, 0), + SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_sse2, 0), + + SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_sse2, 0), + SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_sse2, 0), + SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_sse2, 0), + SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_sse2, 0), + SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_sse2, 0), + SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_sse2, + 0))); + +#if CONFIG_AV1_HIGHBITDEPTH +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, AvxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(2, 2, &aom_highbd_8_sub_pixel_variance4x4_sse4_1, + 8), + SubpelVarianceParams(2, 2, &aom_highbd_10_sub_pixel_variance4x4_sse4_1, + 10), + SubpelVarianceParams(2, 2, &aom_highbd_12_sub_pixel_variance4x4_sse4_1, + 12))); + +INSTANTIATE_TEST_SUITE_P( + SSE4_1, AvxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(2, 2, + &aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1, + 8), + SubpelAvgVarianceParams(2, 2, + &aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1, + 10), + SubpelAvgVarianceParams(2, 2, + &aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1, + 12))); +#endif // HAVE_SSE4_1 + +/* TODO(debargha): This test does not support the highbd version +INSTANTIATE_TEST_SUITE_P( + SSE2, AvxHBDMseTest, + ::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_sse2), + MseParams(4, 3, &aom_highbd_12_mse16x8_sse2), + MseParams(3, 4, &aom_highbd_12_mse8x16_sse2), + MseParams(3, 3, &aom_highbd_12_mse8x8_sse2), + MseParams(4, 4, &aom_highbd_10_mse16x16_sse2), + MseParams(4, 3, &aom_highbd_10_mse16x8_sse2), + MseParams(3, 4, &aom_highbd_10_mse8x16_sse2), + MseParams(3, 3, &aom_highbd_10_mse8x8_sse2), + MseParams(4, 4, &aom_highbd_8_mse16x16_sse2), + MseParams(4, 3, &aom_highbd_8_mse16x8_sse2), + MseParams(3, 4, &aom_highbd_8_mse8x16_sse2), + MseParams(3, 3, &aom_highbd_8_mse8x8_sse2))); +*/ + +const VarianceParams kArrayHBDVariance_sse2[] = { + VarianceParams(7, 7, &aom_highbd_12_variance128x128_sse2, 12), + VarianceParams(7, 6, &aom_highbd_12_variance128x64_sse2, 12), + VarianceParams(6, 7, &aom_highbd_12_variance64x128_sse2, 12), + VarianceParams(6, 6, &aom_highbd_12_variance64x64_sse2, 12), + 
VarianceParams(6, 5, &aom_highbd_12_variance64x32_sse2, 12), + VarianceParams(5, 6, &aom_highbd_12_variance32x64_sse2, 12), + VarianceParams(5, 5, &aom_highbd_12_variance32x32_sse2, 12), + VarianceParams(5, 4, &aom_highbd_12_variance32x16_sse2, 12), + VarianceParams(4, 5, &aom_highbd_12_variance16x32_sse2, 12), + VarianceParams(4, 4, &aom_highbd_12_variance16x16_sse2, 12), + VarianceParams(4, 3, &aom_highbd_12_variance16x8_sse2, 12), + VarianceParams(3, 4, &aom_highbd_12_variance8x16_sse2, 12), + VarianceParams(3, 3, &aom_highbd_12_variance8x8_sse2, 12), + VarianceParams(7, 7, &aom_highbd_10_variance128x128_sse2, 10), + VarianceParams(7, 6, &aom_highbd_10_variance128x64_sse2, 10), + VarianceParams(6, 7, &aom_highbd_10_variance64x128_sse2, 10), + VarianceParams(6, 6, &aom_highbd_10_variance64x64_sse2, 10), + VarianceParams(6, 5, &aom_highbd_10_variance64x32_sse2, 10), + VarianceParams(5, 6, &aom_highbd_10_variance32x64_sse2, 10), + VarianceParams(5, 5, &aom_highbd_10_variance32x32_sse2, 10), + VarianceParams(5, 4, &aom_highbd_10_variance32x16_sse2, 10), + VarianceParams(4, 5, &aom_highbd_10_variance16x32_sse2, 10), + VarianceParams(4, 4, &aom_highbd_10_variance16x16_sse2, 10), + VarianceParams(4, 3, &aom_highbd_10_variance16x8_sse2, 10), + VarianceParams(3, 4, &aom_highbd_10_variance8x16_sse2, 10), + VarianceParams(3, 3, &aom_highbd_10_variance8x8_sse2, 10), + VarianceParams(7, 7, &aom_highbd_8_variance128x128_sse2, 8), + VarianceParams(7, 6, &aom_highbd_8_variance128x64_sse2, 8), + VarianceParams(6, 7, &aom_highbd_8_variance64x128_sse2, 8), + VarianceParams(6, 6, &aom_highbd_8_variance64x64_sse2, 8), + VarianceParams(6, 5, &aom_highbd_8_variance64x32_sse2, 8), + VarianceParams(5, 6, &aom_highbd_8_variance32x64_sse2, 8), + VarianceParams(5, 5, &aom_highbd_8_variance32x32_sse2, 8), + VarianceParams(5, 4, &aom_highbd_8_variance32x16_sse2, 8), + VarianceParams(4, 5, &aom_highbd_8_variance16x32_sse2, 8), + VarianceParams(4, 4, &aom_highbd_8_variance16x16_sse2, 8), + VarianceParams(4, 3, &aom_highbd_8_variance16x8_sse2, 8), + VarianceParams(3, 4, &aom_highbd_8_variance8x16_sse2, 8), + VarianceParams(3, 3, &aom_highbd_8_variance8x8_sse2, 8), + + VarianceParams(6, 4, &aom_highbd_12_variance64x16_sse2, 12), + VarianceParams(4, 6, &aom_highbd_12_variance16x64_sse2, 12), + VarianceParams(5, 3, &aom_highbd_12_variance32x8_sse2, 12), + VarianceParams(3, 5, &aom_highbd_12_variance8x32_sse2, 12), + // VarianceParams(4, 2, &aom_highbd_12_variance16x4_sse2, 12), + // VarianceParams(2, 4, &aom_highbd_12_variance4x16_sse2, 12), + VarianceParams(6, 4, &aom_highbd_10_variance64x16_sse2, 10), + VarianceParams(4, 6, &aom_highbd_10_variance16x64_sse2, 10), + VarianceParams(5, 3, &aom_highbd_10_variance32x8_sse2, 10), + VarianceParams(3, 5, &aom_highbd_10_variance8x32_sse2, 10), + // VarianceParams(4, 2, &aom_highbd_10_variance16x4_sse2, 10), + // VarianceParams(2, 4, &aom_highbd_10_variance4x16_sse2, 10), + VarianceParams(6, 4, &aom_highbd_8_variance64x16_sse2, 8), + VarianceParams(4, 6, &aom_highbd_8_variance16x64_sse2, 8), + VarianceParams(5, 3, &aom_highbd_8_variance32x8_sse2, 8), + VarianceParams(3, 5, &aom_highbd_8_variance8x32_sse2, 8), + // VarianceParams(4, 2, &aom_highbd_8_variance16x4_sse2, 8), + // VarianceParams(2, 4, &aom_highbd_8_variance4x16_sse2, 8), +}; +INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDVarianceTest, + ::testing::ValuesIn(kArrayHBDVariance_sse2)); + +#if HAVE_AVX2 + +const VarianceParams kArrayHBDVariance_avx2[] = { + VarianceParams(7, 7, &aom_highbd_10_variance128x128_avx2, 10), + 
VarianceParams(7, 6, &aom_highbd_10_variance128x64_avx2, 10), + VarianceParams(6, 7, &aom_highbd_10_variance64x128_avx2, 10), + VarianceParams(6, 6, &aom_highbd_10_variance64x64_avx2, 10), + VarianceParams(6, 5, &aom_highbd_10_variance64x32_avx2, 10), + VarianceParams(5, 6, &aom_highbd_10_variance32x64_avx2, 10), + VarianceParams(5, 5, &aom_highbd_10_variance32x32_avx2, 10), + VarianceParams(5, 4, &aom_highbd_10_variance32x16_avx2, 10), + VarianceParams(4, 5, &aom_highbd_10_variance16x32_avx2, 10), + VarianceParams(4, 4, &aom_highbd_10_variance16x16_avx2, 10), + VarianceParams(4, 3, &aom_highbd_10_variance16x8_avx2, 10), + VarianceParams(3, 4, &aom_highbd_10_variance8x16_avx2, 10), + VarianceParams(3, 3, &aom_highbd_10_variance8x8_avx2, 10), +}; + +INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDVarianceTest, + ::testing::ValuesIn(kArrayHBDVariance_avx2)); +#endif // HAVE_AVX2 + +const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = { + SubpelVarianceParams(7, 7, &aom_highbd_12_sub_pixel_variance128x128_sse2, 12), + SubpelVarianceParams(7, 6, &aom_highbd_12_sub_pixel_variance128x64_sse2, 12), + SubpelVarianceParams(6, 7, &aom_highbd_12_sub_pixel_variance64x128_sse2, 12), + SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_sse2, 12), + SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_sse2, 12), + SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_sse2, 12), + SubpelVarianceParams(5, 5, &aom_highbd_12_sub_pixel_variance32x32_sse2, 12), + SubpelVarianceParams(5, 4, &aom_highbd_12_sub_pixel_variance32x16_sse2, 12), + SubpelVarianceParams(4, 5, &aom_highbd_12_sub_pixel_variance16x32_sse2, 12), + SubpelVarianceParams(4, 4, &aom_highbd_12_sub_pixel_variance16x16_sse2, 12), + SubpelVarianceParams(4, 3, &aom_highbd_12_sub_pixel_variance16x8_sse2, 12), + SubpelVarianceParams(3, 4, &aom_highbd_12_sub_pixel_variance8x16_sse2, 12), + SubpelVarianceParams(3, 3, &aom_highbd_12_sub_pixel_variance8x8_sse2, 12), + SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_sse2, 12), + SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_sse2, 10), + SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_sse2, 10), + SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_sse2, 10), + SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_sse2, 10), + SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_sse2, 10), + SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_sse2, 10), + SubpelVarianceParams(5, 5, &aom_highbd_10_sub_pixel_variance32x32_sse2, 10), + SubpelVarianceParams(5, 4, &aom_highbd_10_sub_pixel_variance32x16_sse2, 10), + SubpelVarianceParams(4, 5, &aom_highbd_10_sub_pixel_variance16x32_sse2, 10), + SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_sse2, 10), + SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_sse2, 10), + SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_sse2, 10), + SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_sse2, 10), + SubpelVarianceParams(3, 2, &aom_highbd_10_sub_pixel_variance8x4_sse2, 10), + SubpelVarianceParams(7, 7, &aom_highbd_8_sub_pixel_variance128x128_sse2, 8), + SubpelVarianceParams(7, 6, &aom_highbd_8_sub_pixel_variance128x64_sse2, 8), + SubpelVarianceParams(6, 7, &aom_highbd_8_sub_pixel_variance64x128_sse2, 8), + SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_sse2, 8), + SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_sse2, 8), + 
SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_sse2, 8), + SubpelVarianceParams(5, 5, &aom_highbd_8_sub_pixel_variance32x32_sse2, 8), + SubpelVarianceParams(5, 4, &aom_highbd_8_sub_pixel_variance32x16_sse2, 8), + SubpelVarianceParams(4, 5, &aom_highbd_8_sub_pixel_variance16x32_sse2, 8), + SubpelVarianceParams(4, 4, &aom_highbd_8_sub_pixel_variance16x16_sse2, 8), + SubpelVarianceParams(4, 3, &aom_highbd_8_sub_pixel_variance16x8_sse2, 8), + SubpelVarianceParams(3, 4, &aom_highbd_8_sub_pixel_variance8x16_sse2, 8), + SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_sse2, 8), + SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_sse2, 8), + + SubpelVarianceParams(6, 4, &aom_highbd_12_sub_pixel_variance64x16_sse2, 12), + SubpelVarianceParams(4, 6, &aom_highbd_12_sub_pixel_variance16x64_sse2, 12), + SubpelVarianceParams(5, 3, &aom_highbd_12_sub_pixel_variance32x8_sse2, 12), + SubpelVarianceParams(3, 5, &aom_highbd_12_sub_pixel_variance8x32_sse2, 12), + SubpelVarianceParams(4, 2, &aom_highbd_12_sub_pixel_variance16x4_sse2, 12), + // SubpelVarianceParams(2, 4, &aom_highbd_12_sub_pixel_variance4x16_sse2, 12), + SubpelVarianceParams(6, 4, &aom_highbd_10_sub_pixel_variance64x16_sse2, 10), + SubpelVarianceParams(4, 6, &aom_highbd_10_sub_pixel_variance16x64_sse2, 10), + SubpelVarianceParams(5, 3, &aom_highbd_10_sub_pixel_variance32x8_sse2, 10), + SubpelVarianceParams(3, 5, &aom_highbd_10_sub_pixel_variance8x32_sse2, 10), + SubpelVarianceParams(4, 2, &aom_highbd_10_sub_pixel_variance16x4_sse2, 10), + // SubpelVarianceParams(2, 4, &aom_highbd_10_sub_pixel_variance4x16_sse2, 10), + SubpelVarianceParams(6, 4, &aom_highbd_8_sub_pixel_variance64x16_sse2, 8), + SubpelVarianceParams(4, 6, &aom_highbd_8_sub_pixel_variance16x64_sse2, 8), + SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_sse2, 8), + SubpelVarianceParams(3, 5, &aom_highbd_8_sub_pixel_variance8x32_sse2, 8), + SubpelVarianceParams(4, 2, &aom_highbd_8_sub_pixel_variance16x4_sse2, 8), + // SubpelVarianceParams(2, 4, &aom_highbd_8_sub_pixel_variance4x16_sse2, 8), +}; +INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDSubpelVarianceTest, + ::testing::ValuesIn(kArrayHBDSubpelVariance_sse2)); + +const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_sse2[] = { + SubpelAvgVarianceParams(6, 6, &aom_highbd_12_sub_pixel_avg_variance64x64_sse2, + 12), + SubpelAvgVarianceParams(6, 5, &aom_highbd_12_sub_pixel_avg_variance64x32_sse2, + 12), + SubpelAvgVarianceParams(5, 6, &aom_highbd_12_sub_pixel_avg_variance32x64_sse2, + 12), + SubpelAvgVarianceParams(5, 5, &aom_highbd_12_sub_pixel_avg_variance32x32_sse2, + 12), + SubpelAvgVarianceParams(5, 4, &aom_highbd_12_sub_pixel_avg_variance32x16_sse2, + 12), + SubpelAvgVarianceParams(4, 5, &aom_highbd_12_sub_pixel_avg_variance16x32_sse2, + 12), + SubpelAvgVarianceParams(4, 4, &aom_highbd_12_sub_pixel_avg_variance16x16_sse2, + 12), + SubpelAvgVarianceParams(4, 3, &aom_highbd_12_sub_pixel_avg_variance16x8_sse2, + 12), + SubpelAvgVarianceParams(3, 4, &aom_highbd_12_sub_pixel_avg_variance8x16_sse2, + 12), + SubpelAvgVarianceParams(3, 3, &aom_highbd_12_sub_pixel_avg_variance8x8_sse2, + 12), + SubpelAvgVarianceParams(3, 2, &aom_highbd_12_sub_pixel_avg_variance8x4_sse2, + 12), + SubpelAvgVarianceParams(6, 6, &aom_highbd_10_sub_pixel_avg_variance64x64_sse2, + 10), + SubpelAvgVarianceParams(6, 5, &aom_highbd_10_sub_pixel_avg_variance64x32_sse2, + 10), + SubpelAvgVarianceParams(5, 6, &aom_highbd_10_sub_pixel_avg_variance32x64_sse2, + 10), + SubpelAvgVarianceParams(5, 5, 
&aom_highbd_10_sub_pixel_avg_variance32x32_sse2, + 10), + SubpelAvgVarianceParams(5, 4, &aom_highbd_10_sub_pixel_avg_variance32x16_sse2, + 10), + SubpelAvgVarianceParams(4, 5, &aom_highbd_10_sub_pixel_avg_variance16x32_sse2, + 10), + SubpelAvgVarianceParams(4, 4, &aom_highbd_10_sub_pixel_avg_variance16x16_sse2, + 10), + SubpelAvgVarianceParams(4, 3, &aom_highbd_10_sub_pixel_avg_variance16x8_sse2, + 10), + SubpelAvgVarianceParams(3, 4, &aom_highbd_10_sub_pixel_avg_variance8x16_sse2, + 10), + SubpelAvgVarianceParams(3, 3, &aom_highbd_10_sub_pixel_avg_variance8x8_sse2, + 10), + SubpelAvgVarianceParams(3, 2, &aom_highbd_10_sub_pixel_avg_variance8x4_sse2, + 10), + SubpelAvgVarianceParams(6, 6, &aom_highbd_8_sub_pixel_avg_variance64x64_sse2, + 8), + SubpelAvgVarianceParams(6, 5, &aom_highbd_8_sub_pixel_avg_variance64x32_sse2, + 8), + SubpelAvgVarianceParams(5, 6, &aom_highbd_8_sub_pixel_avg_variance32x64_sse2, + 8), + SubpelAvgVarianceParams(5, 5, &aom_highbd_8_sub_pixel_avg_variance32x32_sse2, + 8), + SubpelAvgVarianceParams(5, 4, &aom_highbd_8_sub_pixel_avg_variance32x16_sse2, + 8), + SubpelAvgVarianceParams(4, 5, &aom_highbd_8_sub_pixel_avg_variance16x32_sse2, + 8), + SubpelAvgVarianceParams(4, 4, &aom_highbd_8_sub_pixel_avg_variance16x16_sse2, + 8), + SubpelAvgVarianceParams(4, 3, &aom_highbd_8_sub_pixel_avg_variance16x8_sse2, + 8), + SubpelAvgVarianceParams(3, 4, &aom_highbd_8_sub_pixel_avg_variance8x16_sse2, + 8), + SubpelAvgVarianceParams(3, 3, &aom_highbd_8_sub_pixel_avg_variance8x8_sse2, + 8), + SubpelAvgVarianceParams(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_sse2, + 8), + + SubpelAvgVarianceParams(6, 4, &aom_highbd_12_sub_pixel_avg_variance64x16_sse2, + 12), + SubpelAvgVarianceParams(4, 6, &aom_highbd_12_sub_pixel_avg_variance16x64_sse2, + 12), + SubpelAvgVarianceParams(5, 3, &aom_highbd_12_sub_pixel_avg_variance32x8_sse2, + 12), + SubpelAvgVarianceParams(3, 5, &aom_highbd_12_sub_pixel_avg_variance8x32_sse2, + 12), + SubpelAvgVarianceParams(4, 2, &aom_highbd_12_sub_pixel_avg_variance16x4_sse2, + 12), + // SubpelAvgVarianceParams(2, 4, + // &aom_highbd_12_sub_pixel_avg_variance4x16_sse2, 12), + SubpelAvgVarianceParams(6, 4, &aom_highbd_10_sub_pixel_avg_variance64x16_sse2, + 10), + SubpelAvgVarianceParams(4, 6, &aom_highbd_10_sub_pixel_avg_variance16x64_sse2, + 10), + SubpelAvgVarianceParams(5, 3, &aom_highbd_10_sub_pixel_avg_variance32x8_sse2, + 10), + SubpelAvgVarianceParams(3, 5, &aom_highbd_10_sub_pixel_avg_variance8x32_sse2, + 10), + SubpelAvgVarianceParams(4, 2, &aom_highbd_10_sub_pixel_avg_variance16x4_sse2, + 10), + // SubpelAvgVarianceParams(2, 4, + // &aom_highbd_10_sub_pixel_avg_variance4x16_sse2, 10), + SubpelAvgVarianceParams(6, 4, &aom_highbd_8_sub_pixel_avg_variance64x16_sse2, + 8), + SubpelAvgVarianceParams(4, 6, &aom_highbd_8_sub_pixel_avg_variance16x64_sse2, + 8), + SubpelAvgVarianceParams(5, 3, &aom_highbd_8_sub_pixel_avg_variance32x8_sse2, + 8), + SubpelAvgVarianceParams(3, 5, &aom_highbd_8_sub_pixel_avg_variance8x32_sse2, + 8), + SubpelAvgVarianceParams(4, 2, &aom_highbd_8_sub_pixel_avg_variance16x4_sse2, + 8), + // SubpelAvgVarianceParams(2, 4, + // &aom_highbd_8_sub_pixel_avg_variance4x16_sse2, 8), +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDSubpelAvgVarianceTest, + ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_sse2)); +#endif // HAVE_SSE2 +#endif // CONFIG_AV1_HIGHBITDEPTH + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, AvxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_ssse3, 0), + 
SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_ssse3, 0), + SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_ssse3, 0), + SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_ssse3, 0), + SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_ssse3, 0), + SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_ssse3, 0), + SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_ssse3, 0), + SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_ssse3, 0), + SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_ssse3, 0), + SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_ssse3, 0), + SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_ssse3, 0), + SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_ssse3, 0), + SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_ssse3, 0), + SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_ssse3, 0), + SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_ssse3, 0), + SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_ssse3, 0), + + SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_ssse3, 0), + SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_ssse3, 0), + SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_ssse3, 0), + SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_ssse3, 0), + SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_ssse3, 0), + SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_ssse3, 0))); + +INSTANTIATE_TEST_SUITE_P( + SSSE3, AvxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_ssse3, + 0), + SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_ssse3, + 0), + SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_ssse3, + 0), + SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_ssse3, + 0), + SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_ssse3, + 0), + SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_ssse3, + 0), + SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_ssse3, + 0), + SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_ssse3, + 0), + SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_ssse3, 0), + SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_ssse3, 0), + SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_ssse3, 0), + SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_ssse3, 0), + SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_ssse3, 0), + SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_ssse3, 0), + + SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_ssse3, + 0), + SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_ssse3, 0), + SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_ssse3, 0), + SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_ssse3, 0), + SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_ssse3, + 0))); + +INSTANTIATE_TEST_SUITE_P( + SSSE3, AvxDistWtdSubpelAvgVarianceTest, + ::testing::Values( + DistWtdSubpelAvgVarianceParams( + 7, 7, &aom_dist_wtd_sub_pixel_avg_variance128x128_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 7, 6, &aom_dist_wtd_sub_pixel_avg_variance128x64_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 6, 7, 
&aom_dist_wtd_sub_pixel_avg_variance64x128_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_ssse3, 0), + + DistWtdSubpelAvgVarianceParams( + 6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_ssse3, 0))); +#endif // HAVE_SSSE3 + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, AvxObmcSubpelVarianceTest, + ::testing::Values( + ObmcSubpelVarianceParams(7, 7, + &aom_obmc_sub_pixel_variance128x128_sse4_1, 0), + ObmcSubpelVarianceParams(7, 6, + &aom_obmc_sub_pixel_variance128x64_sse4_1, 0), + ObmcSubpelVarianceParams(6, 7, + &aom_obmc_sub_pixel_variance64x128_sse4_1, 0), + ObmcSubpelVarianceParams(6, 6, &aom_obmc_sub_pixel_variance64x64_sse4_1, + 0), + ObmcSubpelVarianceParams(6, 5, &aom_obmc_sub_pixel_variance64x32_sse4_1, + 0), + ObmcSubpelVarianceParams(5, 6, &aom_obmc_sub_pixel_variance32x64_sse4_1, + 0), + ObmcSubpelVarianceParams(5, 5, &aom_obmc_sub_pixel_variance32x32_sse4_1, + 0), + ObmcSubpelVarianceParams(5, 4, &aom_obmc_sub_pixel_variance32x16_sse4_1, + 0), + ObmcSubpelVarianceParams(4, 5, &aom_obmc_sub_pixel_variance16x32_sse4_1, + 0), + ObmcSubpelVarianceParams(4, 4, &aom_obmc_sub_pixel_variance16x16_sse4_1, + 0), + ObmcSubpelVarianceParams(4, 3, &aom_obmc_sub_pixel_variance16x8_sse4_1, + 0), + ObmcSubpelVarianceParams(3, 4, &aom_obmc_sub_pixel_variance8x16_sse4_1, + 0), + ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_sse4_1, + 0), + ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_sse4_1, + 0), + ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_sse4_1, + 0), + ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_sse4_1, + 0), + + ObmcSubpelVarianceParams(6, 4, &aom_obmc_sub_pixel_variance64x16_sse4_1, + 0), + ObmcSubpelVarianceParams(4, 6, &aom_obmc_sub_pixel_variance16x64_sse4_1, + 0), + ObmcSubpelVarianceParams(5, 3, &aom_obmc_sub_pixel_variance32x8_sse4_1, + 0), + ObmcSubpelVarianceParams(3, 5, 
&aom_obmc_sub_pixel_variance8x32_sse4_1, + 0), + ObmcSubpelVarianceParams(4, 2, &aom_obmc_sub_pixel_variance16x4_sse4_1, + 0), + ObmcSubpelVarianceParams(2, 4, &aom_obmc_sub_pixel_variance4x16_sse4_1, + 0))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, AvxMseTest, + ::testing::Values(MseParams(4, 4, + &aom_mse16x16_avx2))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, AvxVarianceTest, + ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_avx2), + VarianceParams(7, 6, &aom_variance128x64_avx2), + VarianceParams(6, 7, &aom_variance64x128_avx2), + VarianceParams(6, 6, &aom_variance64x64_avx2), + VarianceParams(6, 5, &aom_variance64x32_avx2), + VarianceParams(6, 4, &aom_variance64x16_avx2), + VarianceParams(5, 6, &aom_variance32x64_avx2), + VarianceParams(5, 5, &aom_variance32x32_avx2), + VarianceParams(5, 4, &aom_variance32x16_avx2), + VarianceParams(5, 3, &aom_variance32x8_avx2), + VarianceParams(4, 6, &aom_variance16x64_avx2), + VarianceParams(4, 5, &aom_variance16x32_avx2), + VarianceParams(4, 4, &aom_variance16x16_avx2), + VarianceParams(4, 3, &aom_variance16x8_avx2), + VarianceParams(4, 2, &aom_variance16x4_avx2))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, AvxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_avx2, 0), + SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_avx2, 0), + SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_avx2, 0), + SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_avx2, 0), + SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_avx2, 0), + SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_avx2, 0), + SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_avx2, 0), + SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_avx2, 0), + SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_avx2, 0), + SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_avx2, 0), + SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_avx2, 0), + SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_avx2, 0), + SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_avx2, 0))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, AvxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_avx2, + 0), + SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_avx2, + 0), + SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_avx2, + 0), + SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_avx2, 0), + SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_avx2, 0), + SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_avx2, 0), + SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_avx2, 0), + SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_avx2, + 0))); +#endif // HAVE_AVX2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, AvxSseTest, + ::testing::Values(SseParams(2, 2, + &aom_get4x4sse_cs_neon))); + +INSTANTIATE_TEST_SUITE_P(NEON, AvxMseTest, + ::testing::Values(MseParams(4, 4, + &aom_mse16x16_neon))); + +INSTANTIATE_TEST_SUITE_P( + NEON, AvxVarianceTest, + ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_neon), + VarianceParams(6, 6, &aom_variance64x64_neon), + VarianceParams(6, 5, &aom_variance64x32_neon), + VarianceParams(5, 6, &aom_variance32x64_neon), + VarianceParams(5, 5, &aom_variance32x32_neon), + VarianceParams(4, 4, &aom_variance16x16_neon), + VarianceParams(4, 3, &aom_variance16x8_neon), + VarianceParams(3, 4, &aom_variance8x16_neon), + 
VarianceParams(3, 3, &aom_variance8x8_neon))); + +INSTANTIATE_TEST_SUITE_P( + NEON, AvxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_neon, 0), + SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_neon, 0), + SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_neon, 0), + SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_neon, 0))); +#endif // HAVE_NEON + +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P(MSA, SumOfSquaresTest, + ::testing::Values(aom_get_mb_ss_msa)); + +INSTANTIATE_TEST_SUITE_P(MSA, AvxSseTest, + ::testing::Values(SseParams(2, 2, + &aom_get4x4sse_cs_msa))); + +INSTANTIATE_TEST_SUITE_P(MSA, AvxMseTest, + ::testing::Values(MseParams(4, 4, &aom_mse16x16_msa), + MseParams(4, 3, &aom_mse16x8_msa), + MseParams(3, 4, &aom_mse8x16_msa), + MseParams(3, 3, &aom_mse8x8_msa))); + +INSTANTIATE_TEST_SUITE_P( + MSA, AvxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_msa), + VarianceParams(6, 5, &aom_variance64x32_msa), + VarianceParams(5, 6, &aom_variance32x64_msa), + VarianceParams(5, 5, &aom_variance32x32_msa), + VarianceParams(5, 4, &aom_variance32x16_msa), + VarianceParams(4, 5, &aom_variance16x32_msa), + VarianceParams(4, 4, &aom_variance16x16_msa), + VarianceParams(4, 3, &aom_variance16x8_msa), + VarianceParams(3, 4, &aom_variance8x16_msa), + VarianceParams(3, 3, &aom_variance8x8_msa), + VarianceParams(3, 2, &aom_variance8x4_msa), + VarianceParams(2, 3, &aom_variance4x8_msa), + VarianceParams(2, 2, &aom_variance4x4_msa))); + +INSTANTIATE_TEST_SUITE_P( + MSA, AvxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_msa, 0), + SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_msa, 0), + SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_msa, 0), + SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_msa, 0), + SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_msa, 0), + SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_msa, 0), + SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_msa, 0), + SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_msa, 0), + SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_msa, 0), + SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_msa, 0), + SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_msa, 0), + SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_msa, 0), + SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_msa, 0))); + +INSTANTIATE_TEST_SUITE_P( + MSA, AvxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_msa, 0), + SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_msa, 0), + SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_msa, 0), + SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_msa, 0), + SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_msa, 0), + SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_msa, 0), + SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_msa, 0), + SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_msa, 0), + SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_msa, 0), + SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_msa, 0), + SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_msa, 0), + SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_msa, 0), + SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_msa, 0))); +#endif // HAVE_MSA +} // namespace 
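The tables above only wire architecture-specific kernels into a shared, parameterized harness; the first two arguments of each entry are log2(width) and log2(height), so VarianceParams(7, 7, ...) exercises a 128x128 block, and each test essentially checks that the optimized kernel matches the C reference bit-exactly. A minimal sketch of that reference computation for the 8-bit case (illustrative names, not libaom's API):

```cpp
#include <cstdint>

// Sketch of the reference computation a VarianceParams(log2w, log2h, fn)
// entry is compared against; names are illustrative, not libaom's API.
// Block dimensions come from the first two params: w = 1 << log2w,
// h = 1 << log2h.
static uint32_t ReferenceVariance(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  int w, int h, uint32_t *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      const int diff = src[y * src_stride + x] - ref[y * ref_stride + x];
      sum += diff;                                  // signed sum of diffs
      sse64 += static_cast<uint64_t>(diff * diff);  // sum of squared diffs
    }
  }
  *sse = static_cast<uint32_t>(sse64);
  // variance = SSE - sum^2 / N, with N = w * h samples.
  return static_cast<uint32_t>(sse64 -
                               static_cast<uint64_t>((sum * sum) / (w * h)));
}
```

The trailing 8/10/12 argument in the high-bit-depth tables plays the same role at higher bit depths, with 16-bit samples in place of uint8_t.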
diff --git a/libs/libaom/src/test/video_source.h b/libs/libaom/src/test/video_source.h new file mode 100644 index 000000000..3c1c5e559 --- /dev/null +++ b/libs/libaom/src/test/video_source.h @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_TEST_VIDEO_SOURCE_H_ +#define AOM_TEST_VIDEO_SOURCE_H_ + +#if defined(_WIN32) +#undef NOMINMAX +#define NOMINMAX +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#endif +#include <cstdio> +#include <cstdlib> +#include <string> +#include "test/acm_random.h" +#include "aom/aom_encoder.h" + +namespace libaom_test { + +// Helper macros to ensure LIBAOM_TEST_DATA_PATH is a quoted string. +// These are undefined right below GetDataPath. +// NOTE: LIBAOM_TEST_DATA_PATH MUST NOT be a quoted string before +// stringification, or GetDataPath will fail at runtime. +#define TO_STRING(S) #S +#define STRINGIFY(S) TO_STRING(S) + +// A simple function to encapsulate cross-platform retrieval of the test data +// path. +static std::string GetDataPath() { + const char *const data_path = getenv("LIBAOM_TEST_DATA_PATH"); + if (data_path == NULL) { +#ifdef LIBAOM_TEST_DATA_PATH + // In some environments, we cannot set environment variables. + // Instead, we set the data path by using a preprocessor symbol + // which can be set from make files. + return STRINGIFY(LIBAOM_TEST_DATA_PATH); +#else + return "."; +#endif + } + return data_path; +} + +// Undefine the stringification macros because they are not used elsewhere. +#undef TO_STRING +#undef STRINGIFY + +inline FILE *OpenTestDataFile(const std::string &file_name) { + const std::string path_to_source = GetDataPath() + "/" + file_name; + return fopen(path_to_source.c_str(), "rb"); +} + +static FILE *GetTempOutFile(std::string *file_name) { + file_name->clear(); +#if defined(_WIN32) + char fname[MAX_PATH]; + char tmppath[MAX_PATH]; + if (GetTempPathA(MAX_PATH, tmppath)) { + // Assume for now that the filename generated is unique per process + if (GetTempFileNameA(tmppath, "lvx", 0, fname)) { + file_name->assign(fname); + return fopen(fname, "wb+"); + } + } + return NULL; +#else + char name_template[] = "/tmp/libaomtest.XXXXXX"; + const int fd = mkstemp(name_template); + *file_name = name_template; + return fdopen(fd, "wb+"); +#endif +} + +class TempOutFile { + public: + TempOutFile() { file_ = GetTempOutFile(&file_name_); } + ~TempOutFile() { + CloseFile(); + if (!file_name_.empty()) { + EXPECT_EQ(0, remove(file_name_.c_str())); + } + } + FILE *file() { return file_; } + const std::string &file_name() { return file_name_; } + + protected: + void CloseFile() { + if (file_) { + fclose(file_); + file_ = NULL; + } + } + FILE *file_; + std::string file_name_; +}; + +// Abstract base class for test video sources, which provide a stream of +// aom_image_t images with associated timestamps and duration. +class VideoSource { + public: + virtual ~VideoSource() {} + + // Prepare the stream for reading, rewind/open as necessary.
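+ // A typical encoder test drives the source roughly like this (a sketch; + // the real loop lives in the encode test driver): + // + // for (video->Begin(); video->img() != NULL; video->Next()) { + // EncodeFrame(video->img(), video->pts()); // EncodeFrame illustrative + // } + //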
+ virtual void Begin() = 0; + + // Advance the cursor to the next frame + virtual void Next() = 0; + + // Get the current video frame, or NULL on End-Of-Stream. + virtual aom_image_t *img() const = 0; + + // Get the presentation timestamp of the current frame. + virtual aom_codec_pts_t pts() const = 0; + + // Get the current frame's duration + virtual unsigned long duration() const = 0; + + // Get the timebase for the stream + virtual aom_rational_t timebase() const = 0; + + // Get the current frame counter, starting at 0. + virtual unsigned int frame() const = 0; + + // Get the current file limit. + virtual unsigned int limit() const = 0; +}; + +class DummyVideoSource : public VideoSource { + public: + DummyVideoSource() + : img_(NULL), limit_(100), width_(80), height_(64), + format_(AOM_IMG_FMT_I420) { + ReallocImage(); + } + + virtual ~DummyVideoSource() { aom_img_free(img_); } + + virtual void Begin() { + frame_ = 0; + FillFrame(); + } + + virtual void Next() { + ++frame_; + FillFrame(); + } + + virtual aom_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; } + + // Models a stream where Timebase = 1/FPS, so pts == frame. + virtual aom_codec_pts_t pts() const { return frame_; } + + virtual unsigned long duration() const { return 1; } + + virtual aom_rational_t timebase() const { + const aom_rational_t t = { 1, 30 }; + return t; + } + + virtual unsigned int frame() const { return frame_; } + + virtual unsigned int limit() const { return limit_; } + + void set_limit(unsigned int limit) { limit_ = limit; } + + void SetSize(unsigned int width, unsigned int height) { + if (width != width_ || height != height_) { + width_ = width; + height_ = height; + ReallocImage(); + } + } + + void SetImageFormat(aom_img_fmt_t format) { + if (format_ != format) { + format_ = format; + ReallocImage(); + } + } + + protected: + virtual void FillFrame() { + if (img_) memset(img_->img_data, 0, raw_sz_); + } + + void ReallocImage() { + aom_img_free(img_); + img_ = aom_img_alloc(NULL, format_, width_, height_, 32); + raw_sz_ = ((img_->w + 31) & ~31) * img_->h * img_->bps / 8; + } + + aom_image_t *img_; + size_t raw_sz_; + unsigned int limit_; + unsigned int frame_; + unsigned int width_; + unsigned int height_; + aom_img_fmt_t format_; +}; + +class RandomVideoSource : public DummyVideoSource { + public: + RandomVideoSource(int seed = ACMRandom::DeterministicSeed()) + : rnd_(seed), seed_(seed) {} + + protected: + // Reset the RNG to get a matching stream for the second pass + virtual void Begin() { + frame_ = 0; + rnd_.Reset(seed_); + FillFrame(); + } + + // 15 frames of noise, followed by 15 static frames. Reset to 0 rather + // than holding previous frames to encourage keyframes to be thrown. + virtual void FillFrame() { + if (img_) { + if (frame_ % 30 < 15) + for (size_t i = 0; i < raw_sz_; ++i) img_->img_data[i] = rnd_.Rand8(); + else + memset(img_->img_data, 0, raw_sz_); + } + } + + ACMRandom rnd_; + int seed_; +}; + +// Abstract base class for test video sources, which provide a stream of +// compressed data to the decoder. +class CompressedVideoSource { + public: + virtual ~CompressedVideoSource() {} + + virtual void Init() = 0; + + // Prepare the stream for reading, rewind/open as necessary.
+ virtual void Begin() = 0; + + // Advance the cursor to the next frame + virtual void Next() = 0; + + virtual const uint8_t *cxdata() const = 0; + + virtual size_t frame_size() const = 0; + + virtual unsigned int frame_number() const = 0; +}; + +} // namespace libaom_test + +#endif // AOM_TEST_VIDEO_SOURCE_H_ diff --git a/libs/libaom/src/test/visual_metrics.py b/libs/libaom/src/test/visual_metrics.py new file mode 100644 index 000000000..9055feb33 --- /dev/null +++ b/libs/libaom/src/test/visual_metrics.py @@ -0,0 +1,466 @@ +#!/usr/bin/python +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and +# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +# was not distributed with this source code in the LICENSE file, you can +# obtain it at www.aomedia.org/license/software. If the Alliance for Open +# Media Patent License 1.0 was not distributed with this source code in the +# PATENTS file, you can obtain it at www.aomedia.org/license/patent. +# + +"""Converts video encoding result data from text files to a visualization +data source.""" + +__author__ = "jzern@google.com (James Zern)," +__author__ += "jimbankoski@google.com (Jim Bankoski)" + +import fnmatch +import numpy as np +import scipy as sp +import scipy.interpolate +import os +import re +import string +import sys +import math +import warnings + +import gviz_api + +from os.path import basename +from os.path import splitext + +warnings.simplefilter('ignore', np.RankWarning) +warnings.simplefilter('ignore', RuntimeWarning) + +def bdsnr2(metric_set1, metric_set2): + """ + BJONTEGAARD Bjontegaard metric calculation, adapted. + Bjontegaard's SNR metric allows one to compute the average gain in + decibels between two rate-distortion curves [1]. This is an adaptation of + that method which fixes inconsistencies when the curve fit goes awry: the + curve fit is replaced with a Piecewise Cubic Hermite Interpolating + Polynomial, which is then integrated by evaluating it at small intervals + and applying the trapezoid method. + + metric_set1 - list of tuples ( bitrate, metric ) for first graph + metric_set2 - list of tuples ( bitrate, metric ) for second graph + """ + + if not metric_set1 or not metric_set2: + return 0.0 + + try: + + # pchip_interpolate requires keys sorted by the x axis. Here the x axis + # is the (log) bitrate, so sort by bitrate, the first tuple element. + metric_set1.sort() + metric_set2.sort() + + # Pull the log of the rate and clamped psnr from metric_sets. + log_rate1 = [math.log(x[0]) for x in metric_set1] + metric1 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set1] + log_rate2 = [math.log(x[0]) for x in metric_set2] + metric2 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set2] + + # Integration interval. This metric only works on the area that's + # overlapping. Extrapolation of these things is sketchy, so we avoid it. + min_int = max([min(log_rate1), min(log_rate2)]) + max_int = min([max(log_rate1), max(log_rate2)]) + + # No overlap means no sensible metric possible. + if max_int <= min_int: + return 0.0 + + # Use Piecewise Cubic Hermite Interpolating Polynomial interpolation to + # create 100 new sample points separated by a fixed interval.
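+ # Concretely, letting f1 and f2 be the PCHIP fits of the two curves over + # the overlap [min_int, max_int], the quantity computed below is + # avg_exp_diff = (trapz(f2) - trapz(f1)) / (max_int - min_int), + # i.e. the average vertical gap between the two fitted curves.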
+ lin = np.linspace(min_int, max_int, num=100, retstep=True) + interval = lin[1] + samples = lin[0] + v1 = scipy.interpolate.pchip_interpolate(log_rate1, metric1, samples) + v2 = scipy.interpolate.pchip_interpolate(log_rate2, metric2, samples) + + # Calculate the integral using the trapezoid method on the samples. + int_v1 = np.trapz(v1, dx=interval) + int_v2 = np.trapz(v2, dx=interval) + + # Calculate the average improvement. + avg_exp_diff = (int_v2 - int_v1) / (max_int - min_int) + + except (TypeError, ZeroDivisionError, ValueError, np.RankWarning): + return 0.0 + + return avg_exp_diff + +def bdrate2(metric_set1, metric_set2): + """ + BJONTEGAARD Bjontegaard metric calculation, adapted. + Bjontegaard's metric allows one to compute the average % saving in bitrate + between two rate-distortion curves [1]. This is an adaptation of that + method which fixes inconsistencies when the curve fit goes awry: the curve + fit is replaced with a Piecewise Cubic Hermite Interpolating Polynomial, + which is then integrated by evaluating it at small intervals and applying + the trapezoid method. + + metric_set1 - list of tuples ( bitrate, metric ) for first graph + metric_set2 - list of tuples ( bitrate, metric ) for second graph + """ + + if not metric_set1 or not metric_set2: + return 0.0 + + try: + + # pchip_interpolate requires keys sorted by the x axis. Here the x axis + # is the metric, not the bitrate, so sort by metric. + metric_set1.sort(key=lambda tup: tup[1]) + metric_set2.sort(key=lambda tup: tup[1]) + + # Pull the log of the rate and clamped psnr from metric_sets. + log_rate1 = [math.log(x[0]) for x in metric_set1] + metric1 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set1] + log_rate2 = [math.log(x[0]) for x in metric_set2] + metric2 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set2] + + # Integration interval. This metric only works on the area that's + # overlapping. Extrapolation of these things is sketchy, so we avoid it. + min_int = max([min(metric1), min(metric2)]) + max_int = min([max(metric1), max(metric2)]) + + # No overlap means no sensible metric possible. + if max_int <= min_int: + return 0.0 + + # Use Piecewise Cubic Hermite Interpolating Polynomial interpolation to + # create 100 new sample points separated by a fixed interval. + lin = np.linspace(min_int, max_int, num=100, retstep=True) + interval = lin[1] + samples = lin[0] + v1 = scipy.interpolate.pchip_interpolate(metric1, log_rate1, samples) + v2 = scipy.interpolate.pchip_interpolate(metric2, log_rate2, samples) + + # Calculate the integral using the trapezoid method on the samples. + int_v1 = np.trapz(v1, dx=interval) + int_v2 = np.trapz(v2, dx=interval) + + # Calculate the average improvement. + avg_exp_diff = (int_v2 - int_v1) / (max_int - min_int) + + except (TypeError, ZeroDivisionError, ValueError, np.RankWarning): + return 0.0 + + # Convert to a percentage. + avg_diff = (math.exp(avg_exp_diff) - 1) * 100 + + return avg_diff + + + +def FillForm(string_for_substitution, dictionary_of_vars): + """ + Substitutes each //%%NAME%%// marker in the template string with the value + of the corresponding variable NAME from dictionary_of_vars. + """ + return_string = string_for_substitution + for i in re.findall("//%%(.*)%%//", string_for_substitution): + return_string = re.sub("//%%" + i + "%%//", dictionary_of_vars[i], + return_string) + return return_string + + +def HasMetrics(line): + """ + Returns true if the line looks like a data row. Header lines in the + metrics files produced by aomenc start with a 'B' (for "Bitrate"), while + data rows start with a digit.
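+ + For example, the line "25.911 38.242 38.104 38.258 38.121 75.790 14103" + from the sample metrics file shown under HandleFiles is a data row: its + first token begins with a digit.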
+ """ + # If the first char of the first word on the line is a digit + if len(line) == 0: + return False + if len(line.split()) == 0: + return False + if line.split()[0][0:1].isdigit(): + return True + return False + +def GetMetrics(file_name): + metric_file = open(file_name, "r") + return metric_file.readline().split(); + +def ParseMetricFile(file_name, metric_column): + metric_set1 = set([]) + metric_file = open(file_name, "r") + for line in metric_file: + metrics = string.split(line) + if HasMetrics(line): + if metric_column < len(metrics): + try: + tuple = float(metrics[0]), float(metrics[metric_column]) + except: + tuple = float(metrics[0]), 0 + else: + tuple = float(metrics[0]), 0 + metric_set1.add(tuple) + metric_set1_sorted = sorted(metric_set1) + return metric_set1_sorted + + +def FileBetter(file_name_1, file_name_2, metric_column, method): + """ + Compares two data files and determines which is better and by how + much. Also produces a histogram of how much better, by PSNR. + metric_column is the metric. + """ + # Store and parse our two files into lists of unique tuples. + + # Read the two files, parsing out lines starting with bitrate. + metric_set1_sorted = ParseMetricFile(file_name_1, metric_column) + metric_set2_sorted = ParseMetricFile(file_name_2, metric_column) + + + def GraphBetter(metric_set1_sorted, metric_set2_sorted, base_is_set_2): + """ + Search through the sorted metric file for metrics on either side of + the metric from file 1. Since both lists are sorted we really + should not have to search through the entire range, but these + are small files.""" + total_bitrate_difference_ratio = 0.0 + count = 0 + for bitrate, metric in metric_set1_sorted: + if bitrate == 0: + continue + for i in range(len(metric_set2_sorted) - 1): + s2_bitrate_0, s2_metric_0 = metric_set2_sorted[i] + s2_bitrate_1, s2_metric_1 = metric_set2_sorted[i + 1] + # We have a point on either side of our metric range. + if metric > s2_metric_0 and metric <= s2_metric_1: + + # Calculate a slope. + if s2_metric_1 - s2_metric_0 != 0: + metric_slope = ((s2_bitrate_1 - s2_bitrate_0) / + (s2_metric_1 - s2_metric_0)) + else: + metric_slope = 0 + + estimated_s2_bitrate = (s2_bitrate_0 + (metric - s2_metric_0) * + metric_slope) + + if estimated_s2_bitrate == 0: + continue + # Calculate percentage difference as given by base. + if base_is_set_2 == 0: + bitrate_difference_ratio = ((bitrate - estimated_s2_bitrate) / + bitrate) + else: + bitrate_difference_ratio = ((bitrate - estimated_s2_bitrate) / + estimated_s2_bitrate) + + total_bitrate_difference_ratio += bitrate_difference_ratio + count += 1 + break + + # Calculate the average improvement between graphs. + if count != 0: + avg = total_bitrate_difference_ratio / count + + else: + avg = 0.0 + + return avg + + # Be fair to both graphs by testing all the points in each. + if method == 'avg': + avg_improvement = 50 * ( + GraphBetter(metric_set1_sorted, metric_set2_sorted, 1) - + GraphBetter(metric_set2_sorted, metric_set1_sorted, 0)) + elif method == 'dsnr': + avg_improvement = bdsnr2(metric_set1_sorted, metric_set2_sorted) + else: + avg_improvement = bdrate2(metric_set2_sorted, metric_set1_sorted) + + return avg_improvement + + +def HandleFiles(variables): + """ + This script creates html for displaying metric data produced from data + in a video stats file, as created by the AOM project when enable_psnr + is turned on: + + Usage: visual_metrics.py template.html pattern base_dir sub_dir [ sub_dir2 ..] 
+ + The script parses each metrics file [see below] that matches the + statfile_pattern in the baseline directory and looks for the file that + matches that same file in each of the sub_dirs, and compares the resultant + metrics bitrate, avg psnr, glb psnr, and ssim. " + + It provides a table in which each row is a file in the line directory, + and a column for each subdir, with the cells representing how that clip + compares to baseline for that subdir. A graph is given for each which + compares filesize to that metric. If you click on a point in the graph it + zooms in on that point. + + a SAMPLE metrics file: + + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 25.911 38.242 38.104 38.258 38.121 75.790 14103 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 49.982 41.264 41.129 41.255 41.122 83.993 19817 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 74.967 42.911 42.767 42.899 42.756 87.928 17332 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 100.012 43.983 43.838 43.881 43.738 89.695 25389 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 149.980 45.338 45.203 45.184 45.043 91.591 25438 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 199.852 46.225 46.123 46.113 45.999 92.679 28302 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 249.922 46.864 46.773 46.777 46.673 93.334 27244 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 299.998 47.366 47.281 47.317 47.220 93.844 27137 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 349.769 47.746 47.677 47.722 47.648 94.178 32226 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 399.773 48.032 47.971 48.013 47.946 94.362 36203 + + sample use: + visual_metrics.py template.html "*stt" aom aom_b aom_c > metrics.html + """ + + # The template file is the html file into which we will write the + # data from the stats file, formatted correctly for the gviz_api. + template_file = open(variables[1], "r") + page_template = template_file.read() + template_file.close() + + # This is the path match pattern for finding stats files amongst + # all the other files it could be. eg: *.stt + file_pattern = variables[2] + + # This is the directory with files that we will use to do the comparison + # against. + baseline_dir = variables[3] + snrs = '' + filestable = {} + + filestable['dsnr'] = '' + filestable['drate'] = '' + filestable['avg'] = '' + + # Dirs is directories after the baseline to compare to the base. + dirs = variables[4:len(variables)] + + # Find the metric files in the baseline directory. + dir_list = sorted(fnmatch.filter(os.listdir(baseline_dir), file_pattern)) + + metrics = GetMetrics(baseline_dir + "/" + dir_list[0]) + + metrics_js = 'metrics = ["' + '", "'.join(metrics) + '"];' + + for column in range(1, len(metrics)): + + for metric in ['avg','dsnr','drate']: + description = {"file": ("string", "File")} + + # Go through each directory and add a column header to our description. + countoverall = {} + sumoverall = {} + + for directory in dirs: + description[directory] = ("number", directory) + countoverall[directory] = 0 + sumoverall[directory] = 0 + + # Data holds the data for the visualization, name given comes from + # gviz_api sample code. + data = [] + for filename in dir_list: + row = {'file': splitext(basename(filename))[0] } + baseline_file_name = baseline_dir + "/" + filename + + # Read the metric file from each of the directories in our list. 
+ for directory in dirs: + metric_file_name = directory + "/" + filename + + # If there is a metric file in the current directory, open it + # and calculate its overall difference between it and the baseline + # directory's metric file. + if os.path.isfile(metric_file_name): + overall = FileBetter(baseline_file_name, metric_file_name, + column, metric) + row[directory] = overall + + sumoverall[directory] += overall + countoverall[directory] += 1 + + data.append(row) + + # Add the overall numbers. + row = {"file": "OVERALL" } + for directory in dirs: + row[directory] = sumoverall[directory] / countoverall[directory] + data.append(row) + + # write the tables out + data_table = gviz_api.DataTable(description) + data_table.LoadData(data) + + filestable[metric] = ( filestable[metric] + "filestable_" + metric + + "[" + str(column) + "]=" + + data_table.ToJSon(columns_order=["file"]+dirs) + "\n" ) + + filestable_avg = filestable['avg'] + filestable_dpsnr = filestable['dsnr'] + filestable_drate = filestable['drate'] + + # Now we collect all the data for all the graphs. First the column + # headers which will be Datarate and then each directory. + columns = ("datarate",baseline_dir) + description = {"datarate":("number", "Datarate")} + for directory in dirs: + description[directory] = ("number", directory) + + description[baseline_dir] = ("number", baseline_dir) + + snrs = snrs + "snrs[" + str(column) + "] = [" + + # Now collect the data for the graphs, file by file. + for filename in dir_list: + + data = [] + + # Collect the file in each directory and store all of its metrics + # in the associated gviz metrics table. + all_dirs = dirs + [baseline_dir] + for directory in all_dirs: + + metric_file_name = directory + "/" + filename + if not os.path.isfile(metric_file_name): + continue + + # Read and parse the metrics file storing it to the data we'll + # use for the gviz_api.Datatable. + metrics = ParseMetricFile(metric_file_name, column) + for bitrate, metric in metrics: + data.append({"datarate": bitrate, directory: metric}) + + data_table = gviz_api.DataTable(description) + data_table.LoadData(data) + snrs = snrs + "'" + data_table.ToJSon( + columns_order=tuple(["datarate",baseline_dir]+dirs)) + "'," + + snrs = snrs + "]\n" + + formatters = "" + for i in range(len(dirs)): + formatters = "%s formatter.format(better, %d);" % (formatters, i+1) + + print FillForm(page_template, vars()) + return + +if len(sys.argv) < 3: + print HandleFiles.__doc__ +else: + HandleFiles(sys.argv) diff --git a/libs/libaom/src/test/warp_filter_test.cc b/libs/libaom/src/test/warp_filter_test.cc new file mode 100644 index 000000000..c5e87f085 --- /dev/null +++ b/libs/libaom/src/test/warp_filter_test.cc @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include <tuple> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/warp_filter_test_util.h" +using libaom_test::ACMRandom; +#if CONFIG_AV1_HIGHBITDEPTH +using libaom_test::AV1HighbdWarpFilter::AV1HighbdWarpFilterTest; +#endif +using libaom_test::AV1WarpFilter::AV1WarpFilterTest; +using std::make_tuple; +using std::tuple; + +namespace { + +TEST_P(AV1WarpFilterTest, CheckOutput) { + RunCheckOutput(std::get<3>(GET_PARAM(0))); +} +TEST_P(AV1WarpFilterTest, DISABLED_Speed) { + RunSpeedTest(std::get<3>(GET_PARAM(0))); +} + +INSTANTIATE_TEST_SUITE_P( + C, AV1WarpFilterTest, + libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_c)); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, AV1WarpFilterTest, + libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sse4_1)); + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(AV1HighbdWarpFilterTest, CheckOutput) { + RunCheckOutput(std::get<4>(GET_PARAM(0))); +} +TEST_P(AV1HighbdWarpFilterTest, DISABLED_Speed) { + RunSpeedTest(std::get<4>(GET_PARAM(0))); +} + +INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdWarpFilterTest, + libaom_test::AV1HighbdWarpFilter::BuildParams( + av1_highbd_warp_affine_sse4_1)); +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1WarpFilterTest, + libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_avx2)); +#endif // HAVE_AVX2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, AV1WarpFilterTest, + libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_neon)); +#endif // HAVE_NEON + +} // namespace diff --git a/libs/libaom/src/test/warp_filter_test_util.cc b/libs/libaom/src/test/warp_filter_test_util.cc new file mode 100644 index 000000000..bcb0c1859 --- /dev/null +++ b/libs/libaom/src/test/warp_filter_test_util.cc @@ -0,0 +1,483 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ +#include "aom_ports/aom_timer.h" +#include "test/warp_filter_test_util.h" + +using std::make_tuple; +using std::tuple; + +namespace libaom_test { + +int32_t random_warped_param(libaom_test::ACMRandom *rnd, int bits) { + // 1 in 8 chance of generating zero (arbitrarily chosen) + if (((rnd->Rand8()) & 7) == 0) return 0; + // Otherwise, generate uniform values in the range + // [-(1 << bits), -1] U [1, 1 << bits]. + int32_t v = 1 + (rnd->Rand16() & ((1 << bits) - 1)); + if ((rnd->Rand8()) & 1) return -v; + return v; +} + +void generate_warped_model(libaom_test::ACMRandom *rnd, int32_t *mat, + int16_t *alpha, int16_t *beta, int16_t *gamma, + int16_t *delta, const int is_alpha_zero, + const int is_beta_zero, const int is_gamma_zero, + const int is_delta_zero) { + while (1) { + int rnd8 = rnd->Rand8() & 3; + mat[0] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6); + mat[1] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6); + mat[2] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) + + (1 << WARPEDMODEL_PREC_BITS); + mat[3] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3); + + if (rnd8 <= 1) { + // AFFINE + mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3); + mat[5] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) + + (1 << WARPEDMODEL_PREC_BITS); + } else if (rnd8 == 2) { + mat[4] = -mat[3]; + mat[5] = mat[2]; + } else { + mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3); + mat[5] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) + + (1 << WARPEDMODEL_PREC_BITS); + if (is_alpha_zero == 1) mat[2] = 1 << WARPEDMODEL_PREC_BITS; + if (is_beta_zero == 1) mat[3] = 0; + if (is_gamma_zero == 1) mat[4] = 0; + if (is_delta_zero == 1) + mat[5] = static_cast<int32_t>( + ((static_cast<int64_t>(mat[3]) * mat[4] + (mat[2] / 2)) / mat[2]) + + (1 << WARPEDMODEL_PREC_BITS)); + } + + // Calculate the derived parameters and check that they are suitable + // for the warp filter.
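+ // (alpha, beta) describe how the horizontal filter kernel varies across + // the block and (gamma, delta) the vertical one, after normalising the + // model by mat[2]; this mirrors av1_get_shear_params() in + // av1/common/warped_motion.c. Models that fail the shear limits below + // are discarded and regenerated.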
+ assert(mat[2] != 0); + + *alpha = clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX); + *beta = clamp(mat[3], INT16_MIN, INT16_MAX); + *gamma = static_cast<int16_t>(clamp64( + (static_cast<int64_t>(mat[4]) * (1 << WARPEDMODEL_PREC_BITS)) / mat[2], + INT16_MIN, INT16_MAX)); + *delta = static_cast<int16_t>(clamp64( + mat[5] - + ((static_cast<int64_t>(mat[3]) * mat[4] + (mat[2] / 2)) / mat[2]) - + (1 << WARPEDMODEL_PREC_BITS), + INT16_MIN, INT16_MAX)); + + if ((4 * abs(*alpha) + 7 * abs(*beta) >= (1 << WARPEDMODEL_PREC_BITS)) || + (4 * abs(*gamma) + 4 * abs(*delta) >= (1 << WARPEDMODEL_PREC_BITS))) + continue; + + *alpha = ROUND_POWER_OF_TWO_SIGNED(*alpha, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + *beta = ROUND_POWER_OF_TWO_SIGNED(*beta, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + *gamma = ROUND_POWER_OF_TWO_SIGNED(*gamma, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + *delta = ROUND_POWER_OF_TWO_SIGNED(*delta, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + + // We have a valid model, so finish + return; + } +} + +namespace AV1WarpFilter { +::testing::internal::ParamGenerator<WarpTestParams> BuildParams( + warp_affine_func filter) { + WarpTestParam params[] = { + make_tuple(4, 4, 50000, filter), make_tuple(8, 8, 50000, filter), + make_tuple(64, 64, 1000, filter), make_tuple(4, 16, 20000, filter), + make_tuple(32, 8, 10000, filter), + }; + return ::testing::Combine(::testing::ValuesIn(params), + ::testing::Values(0, 1), ::testing::Values(0, 1), + ::testing::Values(0, 1), ::testing::Values(0, 1)); +} + +AV1WarpFilterTest::~AV1WarpFilterTest() {} +void AV1WarpFilterTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } + +void AV1WarpFilterTest::TearDown() { libaom_test::ClearSystemState(); } + +void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) { + const int w = 128, h = 128; + const int border = 16; + const int stride = w + 2 * border; + WarpTestParam params = GET_PARAM(0); + const int out_w = std::get<0>(params), out_h = std::get<1>(params); + const int is_alpha_zero = GET_PARAM(1); + const int is_beta_zero = GET_PARAM(2); + const int is_gamma_zero = GET_PARAM(3); + const int is_delta_zero = GET_PARAM(4); + int sub_x, sub_y; + const int bd = 8; + + uint8_t *input_ = new uint8_t[h * stride]; + uint8_t *input = input_ + border; + + // The warp functions always write rows with widths that are multiples of 8. + // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
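+ // ((out_w + 7) & ~7) rounds out_w up to the next multiple of 8 by adding + // 7 and clearing the low three bits: for the 4x16 case above, + // (4 + 7) & ~7 = 8, so output_n is 8 * 16 rather than 4 * 16.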
+ int output_n = ((out_w + 7) & ~7) * out_h; + uint8_t *output = new uint8_t[output_n]; + int32_t mat[8]; + int16_t alpha, beta, gamma, delta; + ConvolveParams conv_params = get_conv_params(0, 0, bd); + CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n]; + generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta, + is_alpha_zero, is_beta_zero, is_gamma_zero, + is_delta_zero); + + for (int r = 0; r < h; ++r) + for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand8(); + for (int r = 0; r < h; ++r) { + memset(input + r * stride - border, input[r * stride], border); + memset(input + r * stride + w, input[r * stride + (w - 1)], border); + } + + sub_x = 0; + sub_y = 0; + int do_average = 0; + + conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd); + conv_params.use_dist_wtd_comp_avg = 0; + + const int num_loops = 1000000000 / (out_w + out_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < num_loops; ++i) + test_impl(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w, + sub_x, sub_y, &conv_params, alpha, beta, gamma, delta); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("warp %3dx%-3d: %7.2f ns\n", out_w, out_h, + 1000.0 * elapsed_time / num_loops); + + delete[] input_; + delete[] output; + delete[] dsta; +} + +void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) { + const int w = 128, h = 128; + const int border = 16; + const int stride = w + 2 * border; + WarpTestParam params = GET_PARAM(0); + const int is_alpha_zero = GET_PARAM(1); + const int is_beta_zero = GET_PARAM(2); + const int is_gamma_zero = GET_PARAM(3); + const int is_delta_zero = GET_PARAM(4); + const int out_w = std::get<0>(params), out_h = std::get<1>(params); + const int num_iters = std::get<2>(params); + int i, j, sub_x, sub_y; + const int bd = 8; + + // The warp functions always write rows with widths that are multiples of 8. + // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
+ int output_n = ((out_w + 7) & ~7) * out_h; + uint8_t *input_ = new uint8_t[h * stride]; + uint8_t *input = input_ + border; + uint8_t *output = new uint8_t[output_n]; + uint8_t *output2 = new uint8_t[output_n]; + int32_t mat[8]; + int16_t alpha, beta, gamma, delta; + ConvolveParams conv_params = get_conv_params(0, 0, bd); + CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n]; + CONV_BUF_TYPE *dstb = new CONV_BUF_TYPE[output_n]; + for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand8(); + + for (i = 0; i < num_iters; ++i) { + // Generate an input block and extend its borders horizontally + for (int r = 0; r < h; ++r) + for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand8(); + for (int r = 0; r < h; ++r) { + memset(input + r * stride - border, input[r * stride], border); + memset(input + r * stride + w, input[r * stride + (w - 1)], border); + } + const int use_no_round = rnd_.Rand8() & 1; + for (sub_x = 0; sub_x < 2; ++sub_x) + for (sub_y = 0; sub_y < 2; ++sub_y) { + generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta, + is_alpha_zero, is_beta_zero, is_gamma_zero, + is_delta_zero); + + for (int ii = 0; ii < 2; ++ii) { + for (int jj = 0; jj < 5; ++jj) { + for (int do_average = 0; do_average <= 1; ++do_average) { + if (use_no_round) { + conv_params = + get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd); + } else { + conv_params = get_conv_params(0, 0, bd); + } + if (jj >= 4) { + conv_params.use_dist_wtd_comp_avg = 0; + } else { + conv_params.use_dist_wtd_comp_avg = 1; + conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; + conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + } + av1_warp_affine_c(mat, input, w, h, stride, output, 32, 32, out_w, + out_h, out_w, sub_x, sub_y, &conv_params, alpha, + beta, gamma, delta); + if (use_no_round) { + conv_params = + get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd); + } + if (jj >= 4) { + conv_params.use_dist_wtd_comp_avg = 0; + } else { + conv_params.use_dist_wtd_comp_avg = 1; + conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; + conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + } + test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h, + out_w, sub_x, sub_y, &conv_params, alpha, beta, gamma, + delta); + if (use_no_round) { + for (j = 0; j < out_w * out_h; ++j) + ASSERT_EQ(dsta[j], dstb[j]) + << "Pixel mismatch at index " << j << " = (" + << (j % out_w) << ", " << (j / out_w) << ") on iteration " + << i; + for (j = 0; j < out_w * out_h; ++j) + ASSERT_EQ(output[j], output2[j]) + << "Pixel mismatch at index " << j << " = (" + << (j % out_w) << ", " << (j / out_w) << ") on iteration " + << i; + } else { + for (j = 0; j < out_w * out_h; ++j) + ASSERT_EQ(output[j], output2[j]) + << "Pixel mismatch at index " << j << " = (" + << (j % out_w) << ", " << (j / out_w) << ") on iteration " + << i; + } + } + } + } + } + } + delete[] input_; + delete[] output; + delete[] output2; + delete[] dsta; + delete[] dstb; +} +} // namespace AV1WarpFilter + +#if CONFIG_AV1_HIGHBITDEPTH +namespace AV1HighbdWarpFilter { +::testing::internal::ParamGenerator<HighbdWarpTestParams> BuildParams( + highbd_warp_affine_func filter) { + const HighbdWarpTestParam params[] = { + make_tuple(4, 4, 100, 8, filter), make_tuple(8, 8, 100, 8, filter), + make_tuple(64, 64, 100, 8, filter), make_tuple(4, 16, 100, 8, filter), + make_tuple(32, 8, 100, 8, filter), make_tuple(4, 4, 100, 10, filter), + make_tuple(8, 8, 100, 10, filter), make_tuple(64, 64, 100, 10, filter), + make_tuple(4, 16, 100, 10, filter),
make_tuple(32, 8, 100, 10, filter), + make_tuple(4, 4, 100, 12, filter), make_tuple(8, 8, 100, 12, filter), + make_tuple(64, 64, 100, 12, filter), make_tuple(4, 16, 100, 12, filter), + make_tuple(32, 8, 100, 12, filter), + }; + return ::testing::Combine(::testing::ValuesIn(params), + ::testing::Values(0, 1), ::testing::Values(0, 1), + ::testing::Values(0, 1), ::testing::Values(0, 1)); +} + +AV1HighbdWarpFilterTest::~AV1HighbdWarpFilterTest() {} +void AV1HighbdWarpFilterTest::SetUp() { + rnd_.Reset(ACMRandom::DeterministicSeed()); +} + +void AV1HighbdWarpFilterTest::TearDown() { libaom_test::ClearSystemState(); } + +void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) { + const int w = 128, h = 128; + const int border = 16; + const int stride = w + 2 * border; + HighbdWarpTestParam param = GET_PARAM(0); + const int is_alpha_zero = GET_PARAM(1); + const int is_beta_zero = GET_PARAM(2); + const int is_gamma_zero = GET_PARAM(3); + const int is_delta_zero = GET_PARAM(4); + const int out_w = std::get<0>(param), out_h = std::get<1>(param); + const int bd = std::get<3>(param); + const int mask = (1 << bd) - 1; + int sub_x, sub_y; + + // The warp functions always write rows with widths that are multiples of 8. + // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8. + int output_n = ((out_w + 7) & ~7) * out_h; + uint16_t *input_ = new uint16_t[h * stride]; + uint16_t *input = input_ + border; + uint16_t *output = new uint16_t[output_n]; + int32_t mat[8]; + int16_t alpha, beta, gamma, delta; + ConvolveParams conv_params = get_conv_params(0, 0, bd); + CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n]; + + generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta, + is_alpha_zero, is_beta_zero, is_gamma_zero, + is_delta_zero); + // Generate an input block and extend its borders horizontally + for (int r = 0; r < h; ++r) + for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand16() & mask; + for (int r = 0; r < h; ++r) { + for (int c = 0; c < border; ++c) { + input[r * stride - border + c] = input[r * stride]; + input[r * stride + w + c] = input[r * stride + (w - 1)]; + } + } + + sub_x = 0; + sub_y = 0; + int do_average = 0; + conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd); + conv_params.use_dist_wtd_comp_avg = 0; + + const int num_loops = 1000000000 / (out_w + out_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + test_impl(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w, + sub_x, sub_y, bd, &conv_params, alpha, beta, gamma, delta); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("highbd warp %3dx%-3d: %7.2f ns\n", out_w, out_h, + 1000.0 * elapsed_time / num_loops); + + delete[] input_; + delete[] output; + delete[] dsta; +} + +void AV1HighbdWarpFilterTest::RunCheckOutput( + highbd_warp_affine_func test_impl) { + const int w = 128, h = 128; + const int border = 16; + const int stride = w + 2 * border; + HighbdWarpTestParam param = GET_PARAM(0); + const int is_alpha_zero = GET_PARAM(1); + const int is_beta_zero = GET_PARAM(2); + const int is_gamma_zero = GET_PARAM(3); + const int is_delta_zero = GET_PARAM(4); + const int out_w = std::get<0>(param), out_h = std::get<1>(param); + const int bd = std::get<3>(param); + const int num_iters = std::get<2>(param); + const int mask = (1 << bd) - 1; + int i, j, sub_x, sub_y; + + // The warp functions always write rows with widths that are multiples
of 8. + // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8. + int output_n = ((out_w + 7) & ~7) * out_h; + uint16_t *input_ = new uint16_t[h * stride]; + uint16_t *input = input_ + border; + uint16_t *output = new uint16_t[output_n]; + uint16_t *output2 = new uint16_t[output_n]; + int32_t mat[8]; + int16_t alpha, beta, gamma, delta; + ConvolveParams conv_params = get_conv_params(0, 0, bd); + CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n]; + CONV_BUF_TYPE *dstb = new CONV_BUF_TYPE[output_n]; + for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand16(); + + for (i = 0; i < num_iters; ++i) { + // Generate an input block and extend its borders horizontally + for (int r = 0; r < h; ++r) + for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand16() & mask; + for (int r = 0; r < h; ++r) { + for (int c = 0; c < border; ++c) { + input[r * stride - border + c] = input[r * stride]; + input[r * stride + w + c] = input[r * stride + (w - 1)]; + } + } + const int use_no_round = rnd_.Rand8() & 1; + for (sub_x = 0; sub_x < 2; ++sub_x) + for (sub_y = 0; sub_y < 2; ++sub_y) { + generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta, + is_alpha_zero, is_beta_zero, is_gamma_zero, + is_delta_zero); + for (int ii = 0; ii < 2; ++ii) { + for (int jj = 0; jj < 5; ++jj) { + for (int do_average = 0; do_average <= 1; ++do_average) { + if (use_no_round) { + conv_params = + get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd); + } else { + conv_params = get_conv_params(0, 0, bd); + } + if (jj >= 4) { + conv_params.use_dist_wtd_comp_avg = 0; + } else { + conv_params.use_dist_wtd_comp_avg = 1; + conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; + conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + } + + av1_highbd_warp_affine_c(mat, input, w, h, stride, output, 32, 32, + out_w, out_h, out_w, sub_x, sub_y, bd, + &conv_params, alpha, beta, gamma, delta); + if (use_no_round) { + // TODO(angiebird): Change this to test_impl once we have SIMD + // implementation + conv_params = + get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd); + } + if (jj >= 4) { + conv_params.use_dist_wtd_comp_avg = 0; + } else { + conv_params.use_dist_wtd_comp_avg = 1; + conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; + conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + } + test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h, + out_w, sub_x, sub_y, bd, &conv_params, alpha, beta, + gamma, delta); + + if (use_no_round) { + for (j = 0; j < out_w * out_h; ++j) + ASSERT_EQ(dsta[j], dstb[j]) + << "Pixel mismatch at index " << j << " = (" + << (j % out_w) << ", " << (j / out_w) << ") on iteration " + << i; + for (j = 0; j < out_w * out_h; ++j) + ASSERT_EQ(output[j], output2[j]) + << "Pixel mismatch at index " << j << " = (" + << (j % out_w) << ", " << (j / out_w) << ") on iteration " + << i; + } else { + for (j = 0; j < out_w * out_h; ++j) + ASSERT_EQ(output[j], output2[j]) + << "Pixel mismatch at index " << j << " = (" + << (j % out_w) << ", " << (j / out_w) << ") on iteration " + << i; + } + } + } + } + } + } + + delete[] input_; + delete[] output; + delete[] output2; + delete[] dsta; + delete[] dstb; +} +} // namespace AV1HighbdWarpFilter +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace libaom_test diff --git a/libs/libaom/src/test/warp_filter_test_util.h b/libs/libaom/src/test/warp_filter_test_util.h new file mode 100644 index 000000000..66a6e244b --- /dev/null +++ b/libs/libaom/src/test/warp_filter_test_util.h @@ -0,0 
+1,107 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_WARP_FILTER_TEST_UTIL_H_
+#define AOM_TEST_WARP_FILTER_TEST_UTIL_H_
+
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+
+#include "av1/common/mv.h"
+#include "av1/common/common_data.h"
+
+namespace libaom_test {
+
+void generate_warped_model(libaom_test::ACMRandom *rnd, int32_t *mat,
+                           int16_t *alpha, int16_t *beta, int16_t *gamma,
+                           int16_t *delta, int is_alpha_zero, int is_beta_zero,
+                           int is_gamma_zero, int is_delta_zero);
+
+namespace AV1WarpFilter {
+
+typedef void (*warp_affine_func)(const int32_t *mat, const uint8_t *ref,
+                                 int width, int height, int stride,
+                                 uint8_t *pred, int p_col, int p_row,
+                                 int p_width, int p_height, int p_stride,
+                                 int subsampling_x, int subsampling_y,
+                                 ConvolveParams *conv_params, int16_t alpha,
+                                 int16_t beta, int16_t gamma, int16_t delta);
+
+typedef std::tuple<int, int, int, int, warp_affine_func> WarpTestParam;
+typedef std::tuple<WarpTestParam, int, int, int, int> WarpTestParams;
+
+::testing::internal::ParamGenerator<WarpTestParams> BuildParams(
+    warp_affine_func filter);
+
+class AV1WarpFilterTest : public ::testing::TestWithParam<WarpTestParams> {
+ public:
+  virtual ~AV1WarpFilterTest();
+  virtual void SetUp();
+
+  virtual void TearDown();
+
+ protected:
+  void RunCheckOutput(warp_affine_func test_impl);
+  void RunSpeedTest(warp_affine_func test_impl);
+
+  libaom_test::ACMRandom rnd_;
+};
+
+}  // namespace AV1WarpFilter
+
+#if CONFIG_AV1_HIGHBITDEPTH
+namespace AV1HighbdWarpFilter {
+typedef void (*highbd_warp_affine_func)(const int32_t *mat, const uint16_t *ref,
+                                        int width, int height, int stride,
+                                        uint16_t *pred, int p_col, int p_row,
+                                        int p_width, int p_height, int p_stride,
+                                        int subsampling_x, int subsampling_y,
+                                        int bd, ConvolveParams *conv_params,
+                                        int16_t alpha, int16_t beta,
+                                        int16_t gamma, int16_t delta);
+
+typedef std::tuple<int, int, int, int, highbd_warp_affine_func>
+    HighbdWarpTestParam;
+typedef std::tuple<HighbdWarpTestParam, int, int, int, int>
+    HighbdWarpTestParams;
+
+::testing::internal::ParamGenerator<HighbdWarpTestParams> BuildParams(
+    highbd_warp_affine_func filter);
+
+class AV1HighbdWarpFilterTest
+    : public ::testing::TestWithParam<HighbdWarpTestParams> {
+ public:
+  virtual ~AV1HighbdWarpFilterTest();
+  virtual void SetUp();
+
+  virtual void TearDown();
+
+ protected:
+  void RunCheckOutput(highbd_warp_affine_func test_impl);
+  void RunSpeedTest(highbd_warp_affine_func test_impl);
+
+  libaom_test::ACMRandom rnd_;
+};
+
+}  // namespace AV1HighbdWarpFilter
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+}  // namespace libaom_test
+
+#endif  // AOM_TEST_WARP_FILTER_TEST_UTIL_H_
diff --git a/libs/libaom/src/test/webm_video_source.h b/libs/libaom/src/test/webm_video_source.h
new file mode 100644
index 000000000..bb3d11735
--- /dev/null
+++ b/libs/libaom/src/test/webm_video_source.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media.
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_WEBM_VIDEO_SOURCE_H_
+#define AOM_TEST_WEBM_VIDEO_SOURCE_H_
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include "common/tools_common.h"
+#include "common/webmdec.h"
+#include "test/video_source.h"
+
+namespace libaom_test {
+
+// This class extends VideoSource to allow parsing of WebM files,
+// so that we can do actual file decodes.
+class WebMVideoSource : public CompressedVideoSource {
+ public:
+  explicit WebMVideoSource(const std::string &file_name)
+      : file_name_(file_name), aom_ctx_(new AvxInputContext()),
+        webm_ctx_(new WebmInputContext()), buf_(NULL), buf_sz_(0), frame_sz_(0),
+        frame_number_(0), end_of_file_(false) {}
+
+  virtual ~WebMVideoSource() {
+    if (aom_ctx_->file != NULL) fclose(aom_ctx_->file);
+    webm_free(webm_ctx_);
+    delete aom_ctx_;
+    delete webm_ctx_;
+  }
+
+  virtual void Init() {}
+
+  virtual void Begin() {
+    aom_ctx_->file = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(aom_ctx_->file != NULL)
+        << "Input file open failed. Filename: " << file_name_;
+
+    ASSERT_EQ(file_is_webm(webm_ctx_, aom_ctx_), 1) << "file is not WebM";
+
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_number_;
+    FillFrame();
+  }
+
+  void FillFrame() {
+    ASSERT_TRUE(aom_ctx_->file != NULL);
+    const int status = webm_read_frame(webm_ctx_, &buf_, &frame_sz_, &buf_sz_);
+    ASSERT_GE(status, 0) << "webm_read_frame failed";
+    if (status == 1) {
+      end_of_file_ = true;
+    }
+  }
+
+  void SeekToNextKeyFrame() {
+    ASSERT_TRUE(aom_ctx_->file != NULL);
+    do {
+      const int status =
+          webm_read_frame(webm_ctx_, &buf_, &frame_sz_, &buf_sz_);
+      ASSERT_GE(status, 0) << "webm_read_frame failed";
+      ++frame_number_;
+      if (status == 1) {
+        end_of_file_ = true;
+      }
+    } while (!webm_ctx_->is_key_frame && !end_of_file_);
+  }
+
+  virtual const uint8_t *cxdata() const { return end_of_file_ ? NULL : buf_; }
+  virtual size_t frame_size() const { return frame_sz_; }
+  virtual unsigned int frame_number() const { return frame_number_; }
+
+ protected:
+  std::string file_name_;
+  AvxInputContext *aom_ctx_;
+  WebmInputContext *webm_ctx_;
+  uint8_t *buf_;  // Owned by webm_ctx_ and freed when webm_ctx_ is freed.
+  size_t buf_sz_;
+  size_t frame_sz_;
+  unsigned int frame_number_;
+  bool end_of_file_;
+};
+
+}  // namespace libaom_test
+
+#endif  // AOM_TEST_WEBM_VIDEO_SOURCE_H_
diff --git a/libs/libaom/src/test/wiener_test.cc b/libs/libaom/src/test/wiener_test.cc
new file mode 100644
index 000000000..81839fd56
--- /dev/null
+++ b/libs/libaom/src/test/wiener_test.cc
@@ -0,0 +1,587 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/register_state_check.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/pickrst.h"
+
+#define MAX_WIENER_BLOCK 384
+#define MAX_DATA_BLOCK (MAX_WIENER_BLOCK + WIENER_WIN)
+
+// 8-bit-depth tests
+namespace wiener_lowbd {
+
+static void compute_stats_win_opt_c(int wiener_win, const uint8_t *dgd,
+                                    const uint8_t *src, int h_start, int h_end,
+                                    int v_start, int v_end, int dgd_stride,
+                                    int src_stride, int64_t *M, int64_t *H) {
+  ASSERT_TRUE(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+  int i, j, k, l, m, n;
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  std::vector<std::vector<int64_t> > M_int(wiener_win,
+                                           std::vector<int64_t>(wiener_win, 0));
+  std::vector<std::vector<int64_t> > H_int(
+      wiener_win * wiener_win, std::vector<int64_t>(wiener_win * 8, 0));
+  std::vector<std::vector<int32_t> > sumY(wiener_win,
+                                          std::vector<int32_t>(wiener_win, 0));
+  int32_t sumX = 0;
+  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+  for (i = v_start; i < v_end; i++) {
+    for (j = h_start; j < h_end; j += 2) {
+      const uint8_t X1 = src[i * src_stride + j];
+      const uint8_t X2 = src[i * src_stride + j + 1];
+      sumX += X1 + X2;
+
+      const uint8_t *dgd_ij = dgd_win + i * dgd_stride + j;
+      for (k = 0; k < wiener_win; k++) {
+        for (l = 0; l < wiener_win; l++) {
+          const uint8_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
+          int64_t *H_int_temp = &H_int[(l * wiener_win + k)][0];
+          const uint8_t D1 = dgd_ijkl[0];
+          const uint8_t D2 = dgd_ijkl[1];
+          sumY[k][l] += D1 + D2;
+          M_int[l][k] += D1 * X1 + D2 * X2;
+          for (m = 0; m < wiener_win; m++) {
+            for (n = 0; n < wiener_win; n++) {
+              H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m] +
+                                       D2 * dgd_ij[n + dgd_stride * m + 1];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+  for (k = 0; k < wiener_win; k++) {
+    for (l = 0; l < wiener_win; l++) {
+      M[l * wiener_win + k] =
+          M_int[l][k] + avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]);
+      for (m = 0; m < wiener_win; m++) {
+        for (n = 0; n < wiener_win; n++) {
+          H[(l * wiener_win + k) * wiener_win2 + m * wiener_win + n] =
+              H_int[(l * wiener_win + k)][n * 8 + m] + avg_square_sum -
+              (int64_t)avg * (sumY[k][l] + sumY[n][m]);
+        }
+      }
+    }
+  }
+}
+
+void compute_stats_opt_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+                         int h_start, int h_end, int v_start, int v_end,
+                         int dgd_stride, int src_stride, int64_t *M,
+                         int64_t *H) {
+  if (wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA) {
+    compute_stats_win_opt_c(wiener_win, dgd, src, h_start, h_end, v_start,
+                            v_end, dgd_stride, src_stride, M, H);
+  } else {
+    av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                        dgd_stride, src_stride, M, H);
+  }
+}
+
+static const int kIterations = 100;
+typedef void (*compute_stats_Func)(int wiener_win, const uint8_t *dgd,
+                                   const uint8_t *src, int h_start, int h_end,
+                                   int v_start, int v_end, int dgd_stride,
+                                   int src_stride, int64_t *M, int64_t *H);
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+typedef std::tuple<const compute_stats_Func> WienerTestParam;
+
+class WienerTest : public ::testing::TestWithParam<WienerTestParam> {
+ public:
+  virtual void SetUp() {
+    src_buf = (uint8_t *)aom_memalign(
+        32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_buf));
+    dgd_buf = (uint8_t *)aom_memalign(
+        32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*dgd_buf));
+    target_func_ = GET_PARAM(0);
+  }
+  virtual void TearDown() {
+    aom_free(src_buf);
+    aom_free(dgd_buf);
+  }
+  void RunWienerTest(const int32_t wiener_win, int32_t run_times);
+  void RunWienerTest_ExtremeValues(const int32_t wiener_win);
+
+ private:
+  compute_stats_Func target_func_;
+  libaom_test::ACMRandom rng_;
+  uint8_t *src_buf;
+  uint8_t *dgd_buf;
+};
+
+void WienerTest::RunWienerTest(const int32_t wiener_win, int32_t run_times) {
+  const int32_t wiener_halfwin = wiener_win >> 1;
+  const int32_t wiener_win2 = wiener_win * wiener_win;
+  DECLARE_ALIGNED(32, int64_t, M_ref[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]);
+  const int h_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
+  int h_end =
+      run_times != 1 ? 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
+  const int v_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
+  int v_end =
+      run_times != 1 ? 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
+  const int dgd_stride = h_end;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int iters = run_times == 1 ? kIterations : 2;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_buf[i] = rng_.Rand8();
+      src_buf[i] = rng_.Rand8();
+    }
+    uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
+    uint8_t *src = src_buf;
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                          dgd_stride, src_stride, M_ref, H_ref);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                   dgd_stride, src_stride, M_test, H_test);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 10) {
+      printf("win %d %3dx%-3d:%7.2f/%7.2fns", wiener_win, h_end, v_end, time1,
+             time2);
+      printf("(%3.2f)\n", time1 / time2);
+    }
+    int failed = 0;
+    for (int i = 0; i < wiener_win2; ++i) {
+      if (M_ref[i] != M_test[i]) {
+        failed = 1;
+        printf("win %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
+               wiener_win, iter, i, M_ref[i], M_test[i]);
+        break;
+      }
+    }
+    for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
+      if (H_ref[i] != H_test[i]) {
+        failed = 1;
+        printf("win %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
+               wiener_win, iter, i, H_ref[i], H_test[i]);
+        break;
+      }
+    }
+    ASSERT_EQ(failed, 0);
+  }
+}
+
+void WienerTest::RunWienerTest_ExtremeValues(const int32_t wiener_win) {
+  const int32_t wiener_halfwin = wiener_win >> 1;
+  const int32_t wiener_win2 = wiener_win * wiener_win;
+  DECLARE_ALIGNED(32, int64_t, M_ref[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]);
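+  // Note: the M_* buffers hold the WIENER_WIN2 cross-correlation terms and
+  // the H_* buffers the WIENER_WIN2 x WIENER_WIN2 autocorrelation matrix
+  // from which the encoder derives the Wiener filter taps; the reference
+  // implementation (av1_compute_stats_c) and the function under test fill
+  // separate buffers that are compared element-wise below.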
+  DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]);
+  const int h_start = 16;
+  const int h_end = MAX_WIENER_BLOCK;
+  const int v_start = 16;
+  const int v_end = MAX_WIENER_BLOCK;
+  const int dgd_stride = h_end;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int iters = 1;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_buf[i] = 255;
+      src_buf[i] = 255;
+    }
+    uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
+    uint8_t *src = src_buf;
+
+    av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                        dgd_stride, src_stride, M_ref, H_ref);
+
+    target_func_(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                 dgd_stride, src_stride, M_test, H_test);
+
+    int failed = 0;
+    for (int i = 0; i < wiener_win2; ++i) {
+      if (M_ref[i] != M_test[i]) {
+        failed = 1;
+        printf("win %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
+               wiener_win, iter, i, M_ref[i], M_test[i]);
+        break;
+      }
+    }
+    for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
+      if (H_ref[i] != H_test[i]) {
+        failed = 1;
+        printf("win %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
+               wiener_win, iter, i, H_ref[i], H_test[i]);
+        break;
+      }
+    }
+    ASSERT_EQ(failed, 0);
+  }
+}
+
+TEST_P(WienerTest, RandomValues) {
+  RunWienerTest(WIENER_WIN, 1);
+  RunWienerTest(WIENER_WIN_CHROMA, 1);
+}
+
+TEST_P(WienerTest, ExtremeValues) {
+  RunWienerTest_ExtremeValues(WIENER_WIN);
+  RunWienerTest_ExtremeValues(WIENER_WIN_CHROMA);
+}
+
+TEST_P(WienerTest, DISABLED_Speed) {
+  RunWienerTest(WIENER_WIN, 200);
+  RunWienerTest(WIENER_WIN_CHROMA, 200);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, WienerTest, ::testing::Values(compute_stats_opt_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, WienerTest,
+                         ::testing::Values(av1_compute_stats_sse4_1));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(AVX2, WienerTest,
+                         ::testing::Values(av1_compute_stats_avx2));
+#endif  // HAVE_AVX2
+
+}  // namespace wiener_lowbd
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High bit-depth tests:
+namespace wiener_highbd {
+
+static void compute_stats_highbd_win_opt_c(int wiener_win, const uint8_t *dgd8,
+                                           const uint8_t *src8, int h_start,
+                                           int h_end, int v_start, int v_end,
+                                           int dgd_stride, int src_stride,
+                                           int64_t *M, int64_t *H,
+                                           aom_bit_depth_t bit_depth) {
+  ASSERT_TRUE(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+  int i, j, k, l, m, n;
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+  const uint16_t avg =
+      find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  std::vector<std::vector<int64_t> > M_int(wiener_win,
+                                           std::vector<int64_t>(wiener_win, 0));
+  std::vector<std::vector<int64_t> > H_int(
+      wiener_win * wiener_win, std::vector<int64_t>(wiener_win * 8, 0));
+  std::vector<std::vector<int32_t> > sumY(wiener_win,
+                                          std::vector<int32_t>(wiener_win, 0));
+
+  memset(M, 0, sizeof(*M) * wiener_win2);
+  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+
+  int64_t sumX = 0;
+  const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+  for (i = v_start; i < v_end; i++) {
+    for (j = h_start; j < h_end; j += 2) {
+      const uint16_t X1 = src[i * src_stride + j];
+      const uint16_t X2 = src[i * src_stride + j + 1];
+      sumX += X1 + X2;
+
+      const uint16_t *dgd_ij = dgd_win + i * dgd_stride + j;
+      for (k = 0; k < wiener_win; k++) {
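+        // For each (k, l) tap position in the Wiener window, accumulate the
+        // pixel sums (sumY), the cross-correlation terms (M_int) and the
+        // autocorrelation terms (H_int); pixels are consumed two columns at
+        // a time (X1/X2 and D1/D2).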
+        for (l = 0; l < wiener_win; l++) {
+          const uint16_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
+          int64_t *H_int_temp = &H_int[(l * wiener_win + k)][0];
+          const uint16_t D1 = dgd_ijkl[0];
+          const uint16_t D2 = dgd_ijkl[1];
+          sumY[k][l] += D1 + D2;
+          M_int[l][k] += D1 * X1 + D2 * X2;
+          for (m = 0; m < wiener_win; m++) {
+            for (n = 0; n < wiener_win; n++) {
+              H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m] +
+                                       D2 * dgd_ij[n + dgd_stride * m + 1];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  uint8_t bit_depth_divider = 1;
+  if (bit_depth == AOM_BITS_12)
+    bit_depth_divider = 16;
+  else if (bit_depth == AOM_BITS_10)
+    bit_depth_divider = 4;
+
+  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+  for (k = 0; k < wiener_win; k++) {
+    for (l = 0; l < wiener_win; l++) {
+      M[l * wiener_win + k] =
+          (M_int[l][k] +
+           (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) /
+          bit_depth_divider;
+      for (m = 0; m < wiener_win; m++) {
+        for (n = 0; n < wiener_win; n++) {
+          H[(l * wiener_win + k) * wiener_win2 + m * wiener_win + n] =
+              (H_int[(l * wiener_win + k)][n * 8 + m] +
+               (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) /
+              bit_depth_divider;
+        }
+      }
+    }
+  }
+}
+
+void compute_stats_highbd_opt_c(int wiener_win, const uint8_t *dgd,
+                                const uint8_t *src, int h_start, int h_end,
+                                int v_start, int v_end, int dgd_stride,
+                                int src_stride, int64_t *M, int64_t *H,
+                                aom_bit_depth_t bit_depth) {
+  if (wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA) {
+    compute_stats_highbd_win_opt_c(wiener_win, dgd, src, h_start, h_end,
+                                   v_start, v_end, dgd_stride, src_stride, M, H,
+                                   bit_depth);
+  } else {
+    av1_compute_stats_highbd_c(wiener_win, dgd, src, h_start, h_end, v_start,
+                               v_end, dgd_stride, src_stride, M, H, bit_depth);
+  }
+}
+
+static const int kIterations = 100;
+typedef void (*compute_stats_Func)(int wiener_win, const uint8_t *dgd,
+                                   const uint8_t *src, int h_start, int h_end,
+                                   int v_start, int v_end, int dgd_stride,
+                                   int src_stride, int64_t *M, int64_t *H,
+                                   aom_bit_depth_t bit_depth);
+
+typedef std::tuple<const compute_stats_Func> WienerTestParam;
+
+class WienerTestHighbd : public ::testing::TestWithParam<WienerTestParam> {
+ public:
+  virtual void SetUp() {
+    src_buf = (uint16_t *)aom_memalign(
+        32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_buf));
+    dgd_buf = (uint16_t *)aom_memalign(
+        32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*dgd_buf));
+    target_func_ = GET_PARAM(0);
+  }
+  virtual void TearDown() {
+    aom_free(src_buf);
+    aom_free(dgd_buf);
+  }
+  void RunWienerTest(const int32_t wiener_win, int32_t run_times,
+                     aom_bit_depth_t bit_depth);
+  void RunWienerTest_ExtremeValues(const int32_t wiener_win,
+                                   aom_bit_depth_t bit_depth);
+
+ private:
+  compute_stats_Func target_func_;
+  libaom_test::ACMRandom rng_;
+  uint16_t *src_buf;
+  uint16_t *dgd_buf;
+};
+
+void WienerTestHighbd::RunWienerTest(const int32_t wiener_win,
+                                     int32_t run_times,
+                                     aom_bit_depth_t bit_depth) {
+  const int32_t wiener_halfwin = wiener_win >> 1;
+  const int32_t wiener_win2 = wiener_win * wiener_win;
+  DECLARE_ALIGNED(32, int64_t, M_ref[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]);
+  const int h_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
+  const int h_end =
+      run_times != 1 ? 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
+  const int v_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
+  const int v_end =
+      run_times != 1 ?
 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
+  const int dgd_stride = h_end;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int iters = run_times == 1 ? kIterations : 2;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_buf[i] = rng_.Rand16() % (1 << bit_depth);
+      src_buf[i] = rng_.Rand16() % (1 << bit_depth);
+    }
+    const uint8_t *dgd8 = CONVERT_TO_BYTEPTR(
+        dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin);
+    const uint8_t *src8 = CONVERT_TO_BYTEPTR(src_buf);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end,
+                                 v_start, v_end, dgd_stride, src_stride, M_ref,
+                                 H_ref, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(wiener_win, dgd8, src8, h_start, h_end, v_start, v_end,
+                   dgd_stride, src_stride, M_test, H_test, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 10) {
+      printf("win %d bd %d %3dx%-3d:%7.2f/%7.2fns", wiener_win, bit_depth,
+             h_end, v_end, time1, time2);
+      printf("(%3.2f)\n", time1 / time2);
+    }
+    int failed = 0;
+    for (int i = 0; i < wiener_win2; ++i) {
+      if (M_ref[i] != M_test[i]) {
+        failed = 1;
+        printf("win %d bd %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64
+               " \n",
+               wiener_win, bit_depth, iter, i, M_ref[i], M_test[i]);
+        break;
+      }
+    }
+    for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
+      if (H_ref[i] != H_test[i]) {
+        failed = 1;
+        printf("win %d bd %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64
+               " \n",
+               wiener_win, bit_depth, iter, i, H_ref[i], H_test[i]);
+        break;
+      }
+    }
+    ASSERT_EQ(failed, 0);
+  }
+}
+
+void WienerTestHighbd::RunWienerTest_ExtremeValues(const int32_t wiener_win,
+                                                   aom_bit_depth_t bit_depth) {
+  const int32_t wiener_halfwin = wiener_win >> 1;
+  const int32_t wiener_win2 = wiener_win * wiener_win;
+  DECLARE_ALIGNED(32, int64_t, M_ref[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]);
+  const int h_start = 16;
+  const int h_end = MAX_WIENER_BLOCK;
+  const int v_start = 16;
+  const int v_end = MAX_WIENER_BLOCK;
+  const int dgd_stride = h_end;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int iters = 1;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_buf[i] = ((uint16_t)1 << bit_depth) - 1;
+      src_buf[i] = ((uint16_t)1 << bit_depth) - 1;
+    }
+    const uint8_t *dgd8 = CONVERT_TO_BYTEPTR(
+        dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin);
+    const uint8_t *src8 = CONVERT_TO_BYTEPTR(src_buf);
+
+    av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start,
+                               v_end, dgd_stride, src_stride, M_ref, H_ref,
+                               bit_depth);
+
+    target_func_(wiener_win, dgd8, src8, h_start, h_end, v_start, v_end,
+                 dgd_stride, src_stride, M_test, H_test, bit_depth);
+
+    int failed = 0;
+    for (int i = 0; i < wiener_win2; ++i) {
+      if (M_ref[i] != M_test[i]) {
+        failed = 1;
+        printf("win %d bd %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64
+               " \n",
+               wiener_win, bit_depth, iter, i, M_ref[i], M_test[i]);
+        break;
+      }
+    }
+    for (int i = 0; i < wiener_win2
* wiener_win2; ++i) { + if (H_ref[i] != H_test[i]) { + failed = 1; + printf("win %d bd %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64 + " \n", + wiener_win, bit_depth, iter, i, H_ref[i], H_test[i]); + break; + } + } + ASSERT_EQ(failed, 0); + } +} + +TEST_P(WienerTestHighbd, RandomValues) { + RunWienerTest(WIENER_WIN, 1, AOM_BITS_8); + RunWienerTest(WIENER_WIN_CHROMA, 1, AOM_BITS_8); + RunWienerTest(WIENER_WIN, 1, AOM_BITS_10); + RunWienerTest(WIENER_WIN_CHROMA, 1, AOM_BITS_10); + RunWienerTest(WIENER_WIN, 1, AOM_BITS_12); + RunWienerTest(WIENER_WIN_CHROMA, 1, AOM_BITS_12); +} + +TEST_P(WienerTestHighbd, ExtremeValues) { + RunWienerTest_ExtremeValues(WIENER_WIN, AOM_BITS_8); + RunWienerTest_ExtremeValues(WIENER_WIN_CHROMA, AOM_BITS_8); + RunWienerTest_ExtremeValues(WIENER_WIN, AOM_BITS_10); + RunWienerTest_ExtremeValues(WIENER_WIN_CHROMA, AOM_BITS_10); + RunWienerTest_ExtremeValues(WIENER_WIN, AOM_BITS_12); + RunWienerTest_ExtremeValues(WIENER_WIN_CHROMA, AOM_BITS_12); +} + +TEST_P(WienerTestHighbd, DISABLED_Speed) { + RunWienerTest(WIENER_WIN, 200, AOM_BITS_8); + RunWienerTest(WIENER_WIN_CHROMA, 200, AOM_BITS_8); + RunWienerTest(WIENER_WIN, 200, AOM_BITS_10); + RunWienerTest(WIENER_WIN_CHROMA, 200, AOM_BITS_10); + RunWienerTest(WIENER_WIN, 200, AOM_BITS_12); + RunWienerTest(WIENER_WIN_CHROMA, 200, AOM_BITS_12); +} + +INSTANTIATE_TEST_SUITE_P(C, WienerTestHighbd, + ::testing::Values(compute_stats_highbd_opt_c)); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE4_1, WienerTestHighbd, + ::testing::Values(av1_compute_stats_highbd_sse4_1)); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, WienerTestHighbd, + ::testing::Values(av1_compute_stats_highbd_avx2)); +#endif // HAVE_AVX2 + +} // namespace wiener_highbd +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/test/y4m_test.cc b/libs/libaom/src/test/y4m_test.cc new file mode 100644 index 000000000..5d795fad9 --- /dev/null +++ b/libs/libaom/src/test/y4m_test.cc @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <string>
+
+#include "config/aom_config.h"
+
+#include "common/y4menc.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+using std::string;
+
+static const unsigned int kWidth = 160;
+static const unsigned int kHeight = 90;
+static const unsigned int kFrames = 10;
+
+struct Y4mTestParam {
+  const char *filename;
+  unsigned int bit_depth;
+  aom_img_fmt format;
+  const char *md5raw;
+};
+
+const Y4mTestParam kY4mTestVectors[] = {
+  { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420,
+    "e5406275b9fc6bb3436c31d4a05c1cab" },
+  { "park_joy_90p_8_420_monochrome.y4m", 8, AOM_IMG_FMT_I420,
+    "95ef5bf6218580588be24a5271bb6a7f" },
+  { "park_joy_90p_8_420_vertical_csp.y4m", 8, AOM_IMG_FMT_I420,
+    "f53a40fec15254ac312527339d9c686b" },
+  { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422,
+    "284a47a47133b12884ec3a14e959a0b6" },
+  { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444,
+    "90517ff33843d85de712fd4fe60dbed0" },
+  { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016,
+    "63f21f9f717d8b8631bd2288ee87137b" },
+  { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216,
+    "48ab51fb540aed07f7ff5af130c9b605" },
+  { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416,
+    "067bfd75aa85ff9bae91fa3e0edd1e3e" },
+  { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016,
+    "9e6d8f6508c6e55625f6b697bc461cef" },
+  { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216,
+    "b239c6b301c0b835485be349ca83a7e3" },
+  { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416,
+    "5a6481a550821dab6d0192f5c63845e9" },
+};
+
+static const int PLANES_YUV[] = { AOM_PLANE_Y, AOM_PLANE_U, AOM_PLANE_V };
+
+class Y4mVideoSourceTest : public ::testing::TestWithParam<Y4mTestParam>,
+                           public ::libaom_test::Y4mVideoSource {
+ protected:
+  Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {}
+
+  virtual ~Y4mVideoSourceTest() { CloseSource(); }
+
+  virtual void Init(const std::string &file_name, int limit) {
+    file_name_ = file_name;
+    start_ = 0;
+    limit_ = limit;
+    frame_ = 0;
+    Begin();
+  }
+
+  // Checks y4m header information
+  void HeaderChecks(unsigned int bit_depth, aom_img_fmt_t fmt) {
+    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_EQ(y4m_.pic_w, (int)kWidth);
+    ASSERT_EQ(y4m_.pic_h, (int)kHeight);
+    ASSERT_EQ(img()->d_w, kWidth);
+    ASSERT_EQ(img()->d_h, kHeight);
+    ASSERT_EQ(y4m_.bit_depth, bit_depth);
+    ASSERT_EQ(y4m_.aom_fmt, fmt);
+    if (fmt == AOM_IMG_FMT_I420 || fmt == AOM_IMG_FMT_I42016) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3 / 2);
+      ASSERT_EQ(img()->x_chroma_shift, 1U);
+      ASSERT_EQ(img()->y_chroma_shift, 1U);
+    }
+    if (fmt == AOM_IMG_FMT_I422 || fmt == AOM_IMG_FMT_I42216) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 2);
+      ASSERT_EQ(img()->x_chroma_shift, 1U);
+      ASSERT_EQ(img()->y_chroma_shift, 0U);
+    }
+    if (fmt == AOM_IMG_FMT_I444 || fmt == AOM_IMG_FMT_I44416) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3);
+      ASSERT_EQ(img()->x_chroma_shift, 0U);
+      ASSERT_EQ(img()->y_chroma_shift, 0U);
+    }
+  }
+
+  // Checks MD5 of the raw frame data
+  void Md5Check(const string &expected_md5) {
+    ASSERT_TRUE(input_file_ != NULL);
+    libaom_test::MD5 md5;
+    for (unsigned int i = start_; i < limit_; i++) {
+      md5.Add(img());
+      Next();
+    }
+    ASSERT_EQ(string(md5.Get()), expected_md5);
+  }
+};
+
+TEST_P(Y4mVideoSourceTest, SourceTest) {
+  const Y4mTestParam t = GetParam();
+  Init(t.filename, kFrames);
+  HeaderChecks(t.bit_depth, t.format);
+  Md5Check(t.md5raw);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, Y4mVideoSourceTest,
+                         ::testing::ValuesIn(kY4mTestVectors));
+
+class Y4mVideoWriteTest : public Y4mVideoSourceTest {
+ protected:
+  Y4mVideoWriteTest() : tmpfile_(NULL) {}
+
+  virtual ~Y4mVideoWriteTest() {
+    delete tmpfile_;
+    input_file_ = NULL;
+  }
+
+  void ReplaceInputFile(FILE *input_file) {
+    CloseSource();
+    frame_ = 0;
+    input_file_ = input_file;
+    rewind(input_file_);
+    ReadSourceToStart();
+  }
+
+  // Writes out a y4m file and then reads it back
+  void WriteY4mAndReadBack() {
+    ASSERT_TRUE(input_file_ != NULL);
+    char buf[Y4M_BUFFER_SIZE] = { 0 };
+    const struct AvxRational framerate = { y4m_.fps_n, y4m_.fps_d };
+    tmpfile_ = new libaom_test::TempOutFile;
+    ASSERT_TRUE(tmpfile_->file() != NULL);
+    y4m_write_file_header(buf, sizeof(buf), kWidth, kHeight, &framerate,
+                          img()->monochrome, img()->csp, y4m_.aom_fmt,
+                          y4m_.bit_depth);
+    fputs(buf, tmpfile_->file());
+    for (unsigned int i = start_; i < limit_; i++) {
+      y4m_write_frame_header(buf, sizeof(buf));
+      fputs(buf, tmpfile_->file());
+      y4m_write_image_file(img(), PLANES_YUV, tmpfile_->file());
+      Next();
+    }
+    ReplaceInputFile(tmpfile_->file());
+  }
+
+  virtual void Init(const std::string &file_name, int limit) {
+    Y4mVideoSourceTest::Init(file_name, limit);
+    WriteY4mAndReadBack();
+  }
+  libaom_test::TempOutFile *tmpfile_;
+};
+
+TEST_P(Y4mVideoWriteTest, WriteTest) {
+  const Y4mTestParam t = GetParam();
+  Init(t.filename, kFrames);
+  HeaderChecks(t.bit_depth, t.format);
+  Md5Check(t.md5raw);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, Y4mVideoWriteTest,
+                         ::testing::ValuesIn(kY4mTestVectors));
+}  // namespace
diff --git a/libs/libaom/src/test/y4m_video_source.h b/libs/libaom/src/test/y4m_video_source.h
new file mode 100644
index 000000000..63f74f567
--- /dev/null
+++ b/libs/libaom/src/test/y4m_video_source.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_Y4M_VIDEO_SOURCE_H_
+#define AOM_TEST_Y4M_VIDEO_SOURCE_H_
+#include <algorithm>
+#include <memory>
+#include <string>
+
+#include "common/y4minput.h"
+#include "test/video_source.h"
+
+namespace libaom_test {
+
+// This class extends VideoSource to allow parsing of raw yv12
+// so that we can do actual file encodes.
+class Y4mVideoSource : public VideoSource {
+ public:
+  Y4mVideoSource(const std::string &file_name, unsigned int start, int limit)
+      : file_name_(file_name), input_file_(NULL), img_(new aom_image_t()),
+        start_(start), limit_(limit), frame_(0), framerate_numerator_(0),
+        framerate_denominator_(0), y4m_() {}
+
+  virtual ~Y4mVideoSource() {
+    aom_img_free(img_.get());
+    CloseSource();
+  }
+
+  virtual void OpenSource() {
+    CloseSource();
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(input_file_ != NULL)
+        << "Input file open failed. Filename: " << file_name_;
+  }
+
+  virtual void ReadSourceToStart() {
+    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_FALSE(
+        y4m_input_open(&y4m_, input_file_, NULL, 0, AOM_CSP_UNKNOWN, 0));
+    framerate_numerator_ = y4m_.fps_n;
+    framerate_denominator_ = y4m_.fps_d;
+    frame_ = 0;
+    for (unsigned int i = 0; i < start_; i++) {
+      Next();
+    }
+    FillFrame();
+  }
+
+  virtual void Begin() {
+    OpenSource();
+    ReadSourceToStart();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  virtual aom_image_t *img() const {
+    return (frame_ < limit_) ? img_.get() : NULL;
+  }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  virtual aom_codec_pts_t pts() const { return frame_; }
+
+  virtual unsigned long duration() const { return 1; }
+
+  virtual aom_rational_t timebase() const {
+    const aom_rational_t t = { framerate_denominator_, framerate_numerator_ };
+    return t;
+  }
+
+  virtual unsigned int frame() const { return frame_; }
+
+  virtual unsigned int limit() const { return limit_; }
+
+  virtual void FillFrame() {
+    ASSERT_TRUE(input_file_ != NULL);
+    // Read a frame from input_file.
+    y4m_input_fetch_frame(&y4m_, input_file_, img_.get());
+  }
+
+  // Swap buffers with another y4m source. This allows reading a new frame
+  // while keeping the old frame around. A whole Y4mVideoSource is required
+  // and not just an aom_image_t because of how the y4m reader manipulates
+  // aom_image_t internals.
+  void SwapBuffers(Y4mVideoSource *other) {
+    std::swap(other->y4m_.dst_buf, y4m_.dst_buf);
+    aom_image_t *tmp;
+    tmp = other->img_.release();
+    other->img_.reset(img_.release());
+    img_.reset(tmp);
+  }
+
+ protected:
+  void CloseSource() {
+    y4m_input_close(&y4m_);
+    y4m_ = y4m_input();
+    if (input_file_ != NULL) {
+      fclose(input_file_);
+      input_file_ = NULL;
+    }
+  }
+
+  std::string file_name_;
+  FILE *input_file_;
+  std::unique_ptr<aom_image_t> img_;
+  unsigned int start_;
+  unsigned int limit_;
+  unsigned int frame_;
+  int framerate_numerator_;
+  int framerate_denominator_;
+  y4m_input y4m_;
+};
+
+}  // namespace libaom_test
+
+#endif  // AOM_TEST_Y4M_VIDEO_SOURCE_H_
diff --git a/libs/libaom/src/test/yuv_video_source.h b/libs/libaom/src/test/yuv_video_source.h
new file mode 100644
index 000000000..774ecc008
--- /dev/null
+++ b/libs/libaom/src/test/yuv_video_source.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_YUV_VIDEO_SOURCE_H_
+#define AOM_TEST_YUV_VIDEO_SOURCE_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "test/video_source.h"
+#include "aom/aom_image.h"
+
+namespace libaom_test {
+
+// This class extends VideoSource to allow parsing of raw YUV
+// formats of various color sampling and bit-depths so that we can
+// do actual file encodes.
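+//
+// A typical instantiation looks like this (illustrative only; the file name
+// and numbers below are hypothetical, not part of this patch):
+//   YUVVideoSource video("clip_640x480.yuv", AOM_IMG_FMT_I420, 640, 480,
+//                        /*rate_numerator=*/30, /*rate_denominator=*/1,
+//                        /*start=*/0, /*limit=*/30);
+//   video.Begin();
+//   while (video.img() != NULL) {
+//     // feed video.img() to the encoder under test
+//     video.Next();
+//   }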
+class YUVVideoSource : public VideoSource {
+ public:
+  YUVVideoSource(const std::string &file_name, aom_img_fmt format,
+                 unsigned int width, unsigned int height, int rate_numerator,
+                 int rate_denominator, unsigned int start, int limit)
+      : file_name_(file_name), input_file_(NULL), img_(NULL), start_(start),
+        limit_(limit), frame_(0), width_(0), height_(0),
+        format_(AOM_IMG_FMT_NONE), framerate_numerator_(rate_numerator),
+        framerate_denominator_(rate_denominator) {
+    // This initializes format_, raw_size_, width_, height_ and allocates img.
+    SetSize(width, height, format);
+  }
+
+  virtual ~YUVVideoSource() {
+    aom_img_free(img_);
+    if (input_file_) fclose(input_file_);
+  }
+
+  virtual void Begin() {
+    if (input_file_) fclose(input_file_);
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(input_file_ != NULL)
+        << "Input file open failed. Filename: " << file_name_;
+    if (start_)
+      fseek(input_file_, static_cast<long>(raw_size_) * start_, SEEK_SET);
+
+    frame_ = start_;
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  virtual aom_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  virtual aom_codec_pts_t pts() const { return frame_; }
+
+  virtual unsigned long duration() const { return 1; }
+
+  virtual aom_rational_t timebase() const {
+    const aom_rational_t t = { framerate_denominator_, framerate_numerator_ };
+    return t;
+  }
+
+  virtual unsigned int frame() const { return frame_; }
+
+  virtual unsigned int limit() const { return limit_; }
+
+  virtual void SetSize(unsigned int width, unsigned int height,
+                       aom_img_fmt format) {
+    if (width != width_ || height != height_ || format != format_) {
+      aom_img_free(img_);
+      img_ = aom_img_alloc(NULL, format, width, height, 1);
+      ASSERT_TRUE(img_ != NULL);
+      width_ = width;
+      height_ = height;
+      format_ = format;
+      switch (format) {
+        case AOM_IMG_FMT_I420: raw_size_ = width * height * 3 / 2; break;
+        case AOM_IMG_FMT_I422: raw_size_ = width * height * 2; break;
+        case AOM_IMG_FMT_I444: raw_size_ = width * height * 3; break;
+        case AOM_IMG_FMT_I42016: raw_size_ = width * height * 3; break;
+        case AOM_IMG_FMT_I42216: raw_size_ = width * height * 4; break;
+        case AOM_IMG_FMT_I44416: raw_size_ = width * height * 6; break;
+        default: ASSERT_TRUE(0);
+      }
+    }
+  }
+
+  virtual void FillFrame() {
+    ASSERT_TRUE(input_file_ != NULL);
+    // Read a frame from input_file.
+    if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) {
+      limit_ = frame_;
+    }
+  }
+
+ protected:
+  std::string file_name_;
+  FILE *input_file_;
+  aom_image_t *img_;
+  size_t raw_size_;
+  unsigned int start_;
+  unsigned int limit_;
+  unsigned int frame_;
+  unsigned int width_;
+  unsigned int height_;
+  aom_img_fmt format_;
+  int framerate_numerator_;
+  int framerate_denominator_;
+};
+
+}  // namespace libaom_test
+
+#endif  // AOM_TEST_YUV_VIDEO_SOURCE_H_
diff --git a/libs/libaom/src/third_party/fastfeat/LICENSE b/libs/libaom/src/third_party/fastfeat/LICENSE
new file mode 100644
index 000000000..f347008d6
--- /dev/null
+++ b/libs/libaom/src/third_party/fastfeat/LICENSE
@@ -0,0 +1,30 @@
+Copyright (c) 2006, 2008 Edward Rosten
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+
+ *Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+ + *Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + *Neither the name of the University of Cambridge nor the names of + its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/libs/libaom/src/third_party/fastfeat/README.libaom b/libs/libaom/src/third_party/fastfeat/README.libaom new file mode 100644 index 000000000..a732b0d93 --- /dev/null +++ b/libs/libaom/src/third_party/fastfeat/README.libaom @@ -0,0 +1,40 @@ +URL: https://github.com/edrosten/fast-C-src +Version: 391d5e939eb1545d24c10533d7de424db8d9c191 +License: BSD +License File: LICENSE + +Description: +Library to compute FAST features with non-maximum suppression. + +The files are valid C and C++ code, and have no special requirements for +compiling, and they do not depend on any libraries. Just compile them along with +the rest of your project. + +To use the functions, #include "fast.h" + +The corner detectors have the following prototype (where X is 9, 10, 11 or 12): + +xy* fastX_detect_nonmax(const unsigned char * data, int xsize, int ysize, int stride, int threshold, int* numcorners) + +Where xy is the following simple struct typedef: + +typedef struct +{ + int x, y; +} xy; + +The image is passed in as a block of data and dimensions, and the list of +corners is returned as an array of xy structs, and an integer (numcorners) +with the number of corners returned. The data can be deallocated with free(). +Nonmaximal suppression is performed on the corners. Note that the stride +is the number of bytes between rows. If your image has no padding, then this +is the same as xsize. + +The detection, scoring and nonmaximal suppression are available as individual +functions. 
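+
+For example, the combined entry point can be invoked as follows (an
+illustrative sketch, not part of the library sources; it assumes an 8-bit
+grayscale buffer "img" with no row padding, so stride == xsize, and a
+hypothetical threshold of 20; the aom_ prefix reflects the local
+modifications listed below):
+
+    int num_corners = 0;
+    xy* corners = aom_fast9_detect_nonmax(img, xsize, ysize, xsize, 20,
+                                          &num_corners);
+    /* corners[0..num_corners-1] now hold the detected positions. */
+    free(corners);
+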
To see how to use the individual functions, see fast.c
+
+Local Modifications:
+Add lines to turn off clang formatting for these files
+Remove Fast 10, 11 and 12
+Convert tabs to spaces
+Prefix global functions with "aom_"
diff --git a/libs/libaom/src/third_party/fastfeat/fast.c b/libs/libaom/src/third_party/fastfeat/fast.c
new file mode 100644
index 000000000..f29ac8f72
--- /dev/null
+++ b/libs/libaom/src/third_party/fastfeat/fast.c
@@ -0,0 +1,22 @@
+// clang-format off
+#include <stdlib.h>
+#include "fast.h"
+
+
+xy* aom_fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners)
+{
+  xy* corners;
+  int num_corners;
+  int* scores;
+  xy* nonmax;
+
+  corners = aom_fast9_detect(im, xsize, ysize, stride, b, &num_corners);
+  scores = aom_fast9_score(im, stride, corners, num_corners, b);
+  nonmax = aom_nonmax_suppression(corners, scores, num_corners, ret_num_corners);
+
+  free(corners);
+  free(scores);
+
+  return nonmax;
+}
+// clang-format on
diff --git a/libs/libaom/src/third_party/fastfeat/fast.h b/libs/libaom/src/third_party/fastfeat/fast.h
new file mode 100644
index 000000000..a65d5a5d1
--- /dev/null
+++ b/libs/libaom/src/third_party/fastfeat/fast.h
@@ -0,0 +1,20 @@
+// clang-format off
+#ifndef FAST_H
+#define FAST_H
+
+typedef struct { int x, y; } xy;
+typedef unsigned char byte;
+
+int aom_fast9_corner_score(const byte* p, const int pixel[], int bstart);
+
+xy* aom_fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners);
+
+int* aom_fast9_score(const byte* i, int stride, xy* corners, int num_corners, int b);
+
+xy* aom_fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners);
+
+xy* aom_nonmax_suppression(const xy* corners, const int* scores, int num_corners, int* ret_num_nonmax);
+
+
+#endif
+// clang-format on
diff --git a/libs/libaom/src/third_party/fastfeat/fast_9.c b/libs/libaom/src/third_party/fastfeat/fast_9.c
new file mode 100644
index 000000000..61c654c47
--- /dev/null
+++ b/libs/libaom/src/third_party/fastfeat/fast_9.c
@@ -0,0 +1,5911 @@
+// clang-format off
+/*This is mechanically generated code*/
+#include <stdlib.h>
+
+typedef struct { int x, y; } xy;
+typedef unsigned char byte;
+
+int aom_fast9_corner_score(const byte* p, const int pixel[], int bstart)
+{
+    int bmin = bstart;
+    int bmax = 255;
+    int b = (bmax + bmin)/2;
+
+    /*Compute the score using binary search*/
+    for(;;)
+    {
+        int cb = *p + b;
+        int c_b= *p - b;
+
+
+        if( p[pixel[0]] > cb)
+         if( p[pixel[1]] > cb)
+          if( p[pixel[2]] > cb)
+           if( p[pixel[3]] > cb)
+            if( p[pixel[4]] > cb)
+             if( p[pixel[5]] > cb)
+              if( p[pixel[6]] > cb)
+               if( p[pixel[7]] > cb)
+                if( p[pixel[8]] > cb)
+                 goto is_a_corner;
+                else
+                 if( p[pixel[15]] > cb)
+                  goto is_a_corner;
+                 else
+                  goto is_not_a_corner;
+               else if( p[pixel[7]] < c_b)
+                if( p[pixel[14]] > cb)
+                 if( p[pixel[15]] > cb)
+                  goto is_a_corner;
+                 else
+                  goto is_not_a_corner;
+                else if( p[pixel[14]] < c_b)
+                 if( p[pixel[8]] < c_b)
+                  if( p[pixel[9]] < c_b)
+                   if( p[pixel[10]] < c_b)
+                    if( p[pixel[11]] < c_b)
+                     if( p[pixel[12]] < c_b)
+                      if( p[pixel[13]] < c_b)
+                       if( p[pixel[15]] < c_b)
+                        goto is_a_corner;
+                       else
+                        goto is_not_a_corner;
+                      else
+                       goto is_not_a_corner;
+                     else
+                      goto is_not_a_corner;
+                    else
+                     goto is_not_a_corner;
+                   else
+                    goto is_not_a_corner;
+                  else
+                   goto is_not_a_corner;
+                 else
+                  goto is_not_a_corner;
+                else
+                 goto is_not_a_corner;
+               else
+                if( p[pixel[14]] > cb)
+                 if( p[pixel[15]] > cb)
+                  goto is_a_corner;
+                 else
+                  goto is_not_a_corner;
+                else
+                 goto is_not_a_corner;
+              else if(
p[pixel[6]] < c_b) + if( p[pixel[15]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[13]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[13]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[5]] < c_b) + if( p[pixel[14]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[12]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[14]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[6]] < c_b) + goto is_a_corner; + else + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < 
c_b) + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[12]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[6]] < c_b) + goto is_a_corner; + else + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[4]] < c_b) + if( p[pixel[13]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[11]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[12]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[13]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + goto is_a_corner; + else + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( 
p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[11]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + goto is_a_corner; + else + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[3]] < c_b) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[10]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[4]] < c_b) + goto is_a_corner; + else + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < 
c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[10]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[4]] < c_b) + goto is_a_corner; + else + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[2]] < c_b) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[3]] > cb) + if( p[pixel[4]] > cb) 
+ if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[9]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[3]] < c_b) + goto is_a_corner; + else + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[3]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[9]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[3]] < c_b) + goto is_a_corner; + else + if( p[pixel[12]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + goto 
is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[1]] < c_b) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[3]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[2]] > cb) + if( p[pixel[3]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[8]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[3]] < c_b) + if( p[pixel[2]] < c_b) + goto is_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto 
is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[3]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[2]] > cb) + if( p[pixel[3]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[8]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[3]] < c_b) + if( p[pixel[2]] < c_b) + goto is_a_corner; + else + if( p[pixel[11]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[0]] < c_b) + if( p[pixel[1]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[3]] > cb) + if( p[pixel[2]] > cb) + goto is_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto 
is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[3]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[2]] < c_b) + if( p[pixel[3]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[1]] < c_b) + if( p[pixel[2]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[3]] > cb) + goto is_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( 
p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[3]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[2]] < c_b) + if( p[pixel[3]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[4]] > cb) + goto is_a_corner; + else + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + 
if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[3]] < c_b) + if( p[pixel[4]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + goto is_a_corner; + else + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[13]] < c_b) + if( p[pixel[11]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[12]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[4]] < c_b) + if( p[pixel[5]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[6]] > cb) + goto is_a_corner; + else + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[14]] < c_b) + if( p[pixel[12]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( 
p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[13]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[5]] < c_b) + if( p[pixel[6]] > cb) + if( p[pixel[15]] < c_b) + if( p[pixel[13]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[6]] < c_b) + if( p[pixel[7]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + goto is_a_corner; + else + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[13]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( 
p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[6]] > cb) + goto is_a_corner; + else + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + goto is_a_corner; + else + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[4]] > cb) + goto is_a_corner; + else + if( p[pixel[13]] > cb) + goto 
is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[9]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[3]] > cb) + goto is_a_corner; + else + if( p[pixel[12]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + 
goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[3]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[8]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[3]] > cb) + if( p[pixel[2]] > cb) + goto is_a_corner; + else + if( p[pixel[11]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[3]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[2]] < c_b) + if( p[pixel[3]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[7]] > cb) + 
if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[3]] > cb) + if( p[pixel[2]] > cb) + if( p[pixel[1]] > cb) + goto is_a_corner; + else + if( p[pixel[10]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[3]] < c_b) + if( p[pixel[2]] < c_b) + if( p[pixel[1]] < c_b) + goto is_a_corner; + else + if( p[pixel[10]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + +is_a_corner: + bmin=b; + goto end_if; + +is_not_a_corner: + bmax=b; + goto end_if; + +end_if: + + if(bmin == bmax - 1 || bmin == bmax) + return bmin; + b = (bmin + bmax) / 2; + } +} + +static void make_offsets(int pixel[], int row_stride) 
+{
+  pixel[0] = 0 + row_stride * 3;
+  pixel[1] = 1 + row_stride * 3;
+  pixel[2] = 2 + row_stride * 2;
+  pixel[3] = 3 + row_stride * 1;
+  pixel[4] = 3 + row_stride * 0;
+  pixel[5] = 3 + row_stride * -1;
+  pixel[6] = 2 + row_stride * -2;
+  pixel[7] = 1 + row_stride * -3;
+  pixel[8] = 0 + row_stride * -3;
+  pixel[9] = -1 + row_stride * -3;
+  pixel[10] = -2 + row_stride * -2;
+  pixel[11] = -3 + row_stride * -1;
+  pixel[12] = -3 + row_stride * 0;
+  pixel[13] = -3 + row_stride * 1;
+  pixel[14] = -2 + row_stride * 2;
+  pixel[15] = -1 + row_stride * 3;
+}
+
+
+
+/* Score each detected corner: aom_fast9_corner_score() above bisects on
+   the threshold, so the returned score is the largest b at which the
+   pixel still passes the FAST-9 segment test. */
+int* aom_fast9_score(const byte* i, int stride, xy* corners, int num_corners, int b)
+{
+  int* scores = (int*)malloc(sizeof(int)* num_corners);
+  int n;
+
+  int pixel[16];
+  make_offsets(pixel, stride);
+
+  for(n=0; n < num_corners; n++)
+    scores[n] = aom_fast9_corner_score(i + corners[n].y*stride + corners[n].x, pixel, b);
+
+  return scores;
+}
+
+
+/* Scan the image, skipping the 3-pixel border the circle needs, and keep
+   every pixel with 9 contiguous circle samples all brighter than cb or
+   all darker than c_b; the exhaustive decision tree below enumerates the
+   ways such an arc can occur. */
+xy* aom_fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners)
+{
+  int num_corners=0;
+  xy* ret_corners;
+  int rsize=512;
+  int pixel[16];
+  int x, y;
+
+  ret_corners = (xy*)malloc(sizeof(xy)*rsize);
+  make_offsets(pixel, stride);
+
+  for(y=3; y < ysize - 3; y++)
+    for(x=3; x < xsize - 3; x++)
+    {
+      const byte* p = im + y*stride + x;
+
+      int cb = *p + b;
+      int c_b= *p - b;
+      if(p[pixel[0]] > cb) + if(p[pixel[1]] > cb) + if(p[pixel[2]] > cb) + if(p[pixel[3]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + if(p[pixel[15]] > cb) + {} + else + continue; + else if(p[pixel[7]] < c_b) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else if(p[pixel[14]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else if(p[pixel[6]] < c_b) + if(p[pixel[15]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else if(p[pixel[13]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else if(p[pixel[13]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[5]] < c_b) +
if(p[pixel[14]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[12]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[13]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[14]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[6]] < c_b) + {} + else + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[12]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[6]] < c_b) + {} + else + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[4]] < c_b) + if(p[pixel[13]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[11]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[12]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[13]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + 
if(p[pixel[12]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + {} + else + if(p[pixel[14]] < c_b) + {} + else + continue; + else + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[11]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + {} + else + if(p[pixel[14]] < c_b) + {} + else + continue; + else + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[3]] < c_b) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[10]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[4]] < c_b) + {} + else + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + 
if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[10]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[4]] < c_b) + {} + else + if(p[pixel[13]] < c_b) + {} + else + continue; + else + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[2]] < c_b) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[3]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[9]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[3]] < c_b) + {} + else + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > 
cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[3]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[9]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[3]] < c_b) + {} + else + if(p[pixel[12]] < c_b) + {} + else + continue; + else + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[1]] < c_b) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[3]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[2]] > cb) + if(p[pixel[3]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[8]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[3]] < c_b) + if(p[pixel[2]] < c_b) + {} + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] < c_b) + 
if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[3]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[2]] > cb) + if(p[pixel[3]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[8]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[3]] < c_b) + if(p[pixel[2]] < c_b) + {} + else + if(p[pixel[11]] < c_b) + {} + else + continue; + else + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[0]] < c_b) + if(p[pixel[1]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[3]] > cb) + if(p[pixel[2]] > cb) + {} + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + continue; + else 
+ continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[3]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[2]] < c_b) + if(p[pixel[3]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[1]] < c_b) + if(p[pixel[2]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[3]] > cb) + {} + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[3]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else 
if(p[pixel[2]] < c_b) + if(p[pixel[3]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[4]] > cb) + {} + else + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[3]] < c_b) + if(p[pixel[4]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + {} + else + if(p[pixel[14]] > cb) + {} + else + continue; + else + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[13]] < c_b) + if(p[pixel[11]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[12]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[4]] < c_b) + 
if(p[pixel[5]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[6]] > cb) + {} + else + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[14]] < c_b) + if(p[pixel[12]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[13]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[5]] < c_b) + if(p[pixel[6]] > cb) + if(p[pixel[15]] < c_b) + if(p[pixel[13]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[6]] < c_b) + if(p[pixel[7]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + if(p[pixel[15]] < c_b) + {} + else + continue; + else + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[13]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + 
continue; + else + continue; + else + continue; + else + if(p[pixel[12]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[6]] > cb) + {} + else + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + {} + else + if(p[pixel[14]] > cb) + {} + else + continue; + else + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[4]] > cb) + {} + else + if(p[pixel[13]] > cb) + {} + else + continue; + else + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + 
else + continue; + else + continue; + else + if(p[pixel[9]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[3]] > cb) + {} + else + if(p[pixel[12]] > cb) + {} + else + continue; + else + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[3]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[8]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[3]] > cb) + if(p[pixel[2]] > cb) + {} + else + if(p[pixel[11]] > cb) + {} + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[3]] < c_b) + if(p[pixel[4]] < c_b) + 
if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[2]] < c_b) + if(p[pixel[3]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[3]] > cb) + if(p[pixel[2]] > cb) + if(p[pixel[1]] > cb) + {} + else + if(p[pixel[10]] > cb) + {} + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[3]] < c_b) + if(p[pixel[2]] < c_b) + if(p[pixel[1]] < c_b) + {} + else + if(p[pixel[10]] < c_b) + {} + else + continue; + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + if(num_corners == rsize) + { + rsize*=2; + ret_corners = (xy*)realloc(ret_corners, sizeof(xy)*rsize); + } + ret_corners[num_corners].x = x; + ret_corners[num_corners].y = y; + num_corners++; + + } + + *ret_num_corners = num_corners; + return ret_corners; + +} + +// clang-format on diff --git a/libs/libaom/src/third_party/fastfeat/nonmax.c b/libs/libaom/src/third_party/fastfeat/nonmax.c new file mode 100644 index 000000000..0dbc660cb --- /dev/null +++ b/libs/libaom/src/third_party/fastfeat/nonmax.c @@ -0,0 +1,121 @@ +// clang-format off 
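The new file below implements non-maximal suppression for the FAST corners emitted by the detector above: because corners arrive in raster-scan order, a per-row start index lets each corner be checked against its eight neighbours without building a grid. A minimal sketch of how the three fastfeat stages are typically chained, assuming the aom_-prefixed detect/score entry points declared in fast.h (their exact signatures are an assumption here, following the upstream fast9 API):

    // Sketch only -- not part of the patch. Assumes fast.h declares
    // aom_fast9_detect() and aom_fast9_score() with the upstream fast9
    // signatures; aom_nonmax_suppression() is defined in the file below.
    #include <stdlib.h>
    #include "fast.h"

    static xy *detect_corners(const byte *img, int w, int h, int stride,
                              int threshold, int *n_out) {
      int n_raw = 0;
      xy *raw = aom_fast9_detect(img, w, h, stride, threshold, &n_raw);
      int *scores = aom_fast9_score(img, stride, raw, n_raw, threshold);
      xy *kept = aom_nonmax_suppression(raw, scores, n_raw, n_out);
      free(scores);  // fastfeat returns malloc'd buffers; the caller owns them
      free(raw);
      return kept;   // free() when done
    }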
+#include <stdlib.h> +#include "fast.h" + + +#define Compare(X, Y) ((X)>=(Y)) + +xy* aom_nonmax_suppression(const xy* corners, const int* scores, int num_corners, int* ret_num_nonmax) +{ + int num_nonmax=0; + int last_row; + int* row_start; + int i, j; + xy* ret_nonmax; + const int sz = (int)num_corners; + + /*Point above points (roughly) to the pixel above the one of interest, if there + is a feature there.*/ + int point_above = 0; + int point_below = 0; + + + if(num_corners < 1) + { + *ret_num_nonmax = 0; + return 0; + } + + ret_nonmax = (xy*)malloc(num_corners * sizeof(xy)); + + /* Find where each row begins + (the corners are output in raster scan order). A beginning of -1 signifies + that there are no corners on that row. */ + last_row = corners[num_corners-1].y; + row_start = (int*)malloc((last_row+1)*sizeof(int)); + + for(i=0; i < last_row+1; i++) + row_start[i] = -1; + + { + int prev_row = -1; + for(i=0; i< num_corners; i++) + if(corners[i].y != prev_row) + { + row_start[corners[i].y] = i; + prev_row = corners[i].y; + } + } + + + + for(i=0; i < sz; i++) + { + int score = scores[i]; + xy pos = corners[i]; + + /*Check left */ + if(i > 0) + if(corners[i-1].x == pos.x-1 && corners[i-1].y == pos.y && Compare(scores[i-1], score)) + continue; + + /*Check right*/ + if(i < (sz - 1)) + if(corners[i+1].x == pos.x+1 && corners[i+1].y == pos.y && Compare(scores[i+1], score)) + continue; + + /*Check above (if there is a valid row above)*/ + if(pos.y > 0) + if (row_start[pos.y - 1] != -1) + { + /*Make sure that current point_above is one + row above.*/ + if(corners[point_above].y < pos.y - 1) + point_above = row_start[pos.y-1]; + + /*Make point_above point to the first of the pixels above the current point, + if it exists.*/ + for(; corners[point_above].y < pos.y && corners[point_above].x < pos.x - 1; point_above++) + {} + + + for(j=point_above; corners[j].y < pos.y && corners[j].x <= pos.x + 1; j++) + { + int x = corners[j].x; + if( (x == pos.x - 1 || x == pos.x || x == pos.x+1) && Compare(scores[j], score)) + goto cont; + } + + } + + /*Check below (if there is anything below)*/ + if(pos.y >= 0) + if (pos.y != last_row && row_start[pos.y + 1] != -1 && point_below < sz) /*Nothing below*/ + { + if(corners[point_below].y < pos.y + 1) + point_below = row_start[pos.y+1]; + + /* Make point_below point to one of the pixels below the current point, if it + exists.*/ + for(; point_below < sz && corners[point_below].y == pos.y+1 && corners[point_below].x < pos.x - 1; point_below++) + {} + + for(j=point_below; j < sz && corners[j].y == pos.y+1 && corners[j].x <= pos.x + 1; j++) + { + int x = corners[j].x; + if( (x == pos.x - 1 || x == pos.x || x == pos.x+1) && Compare(scores[j], score)) + goto cont; + } + } + + ret_nonmax[num_nonmax++] = corners[i]; +cont: + ; + } + + free(row_start); + *ret_num_nonmax = num_nonmax; + return ret_nonmax; +} + +// clang-format on diff --git a/libs/libaom/src/third_party/googletest/README.libaom b/libs/libaom/src/third_party/googletest/README.libaom new file mode 100644 index 000000000..9b8a86398 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/README.libaom @@ -0,0 +1,17 @@ +URL: https://github.com/google/googletest +Version: 1.10.x +License: BSD +License File: LICENSE + +Description: +Google's framework for writing C++ tests on a variety of platforms +(Linux, Mac OS X, Windows, Windows CE, Symbian, etc). Based on the +xUnit architecture.
Supports automatic test discovery, a rich set of +assertions, user-defined assertions, death tests, fatal and non-fatal +failures, various options for running the tests, and XML test report +generation. + +Local Modifications: +- Replace everything in: + third_party/googletest/src/googletest/src/ + third_party/googletest/src/googletest/include/ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/CHANGES b/libs/libaom/src/third_party/googletest/src/googletest/CHANGES new file mode 100644 index 000000000..055213242 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/CHANGES @@ -0,0 +1,157 @@ +Changes for 1.7.0: + +* New feature: death tests are supported on OpenBSD and in iOS + simulator now. +* New feature: Google Test now implements a protocol to allow + a test runner to detect that a test program has exited + prematurely and report it as a failure (before it would be + falsely reported as a success if the exit code is 0). +* New feature: Test::RecordProperty() can now be used outside of the + lifespan of a test method, in which case it will be attributed to + the current test case or the test program in the XML report. +* New feature (potentially breaking): --gtest_list_tests now prints + the type parameters and value parameters for each test. +* Improvement: char pointers and char arrays are now escaped properly + in failure messages. +* Improvement: failure summary in XML reports now includes file and + line information. +* Improvement: the <testsuites> XML element now has a timestamp attribute. +* Improvement: When --gtest_filter is specified, XML report now doesn't + contain information about tests that are filtered out. +* Fixed the bug where long --gtest_filter flag values are truncated in + death tests. +* Potentially breaking change: RUN_ALL_TESTS() is now implemented as a + function instead of a macro in order to work better with Clang. +* Compatibility fixes with C++ 11 and various platforms. +* Bug/warning fixes. + +Changes for 1.6.0: + +* New feature: ADD_FAILURE_AT() for reporting a test failure at the + given source location -- useful for writing testing utilities. +* New feature: the universal value printer is moved from Google Mock + to Google Test. +* New feature: type parameters and value parameters are reported in + the XML report now. +* A gtest_disable_pthreads CMake option. +* Colored output works in GNU Screen sessions now. +* Parameters of value-parameterized tests are now printed in the + textual output. +* Failures from ad hoc test assertions run before RUN_ALL_TESTS() are + now correctly reported. +* Arguments of ASSERT_XY and EXPECT_XY no longer need to support << to + ostream. +* More complete handling of exceptions. +* GTEST_ASSERT_XY can be used instead of ASSERT_XY in case the latter + name is already used by another library. +* --gtest_catch_exceptions is now true by default, allowing a test + program to continue after an exception is thrown. +* Value-parameterized test fixtures can now derive from Test and + WithParamInterface separately, easing conversion of legacy tests. +* Death test messages are clearly marked to make them more + distinguishable from other messages. +* Compatibility fixes for Android, Google Native Client, MinGW, HP UX, + PowerPC, Lucid autotools, libCStd, Sun C++, Borland C++ Builder (Code Gear), + IBM XL C++ (Visual Age C++), and C++0x. +* Bug fixes and implementation clean-ups. +* Potentially incompatible changes: disables the harmful 'make install' + command in autotools.
+ +Changes for 1.5.0: + + * New feature: assertions can be safely called in multiple threads + where the pthreads library is available. + * New feature: predicates used inside EXPECT_TRUE() and friends + can now generate custom failure messages. + * New feature: Google Test can now be compiled as a DLL. + * New feature: fused source files are included. + * New feature: prints help when encountering unrecognized Google Test flags. + * Experimental feature: CMake build script (requires CMake 2.6.4+). + * Experimental feature: the Pump script for meta programming. + * double values streamed to an assertion are printed with enough precision + to differentiate any two different values. + * Google Test now works on Solaris and AIX. + * Build and test script improvements. + * Bug fixes and implementation clean-ups. + + Potentially breaking changes: + + * Stopped supporting VC++ 7.1 with exceptions disabled. + * Dropped support for 'make install'. + +Changes for 1.4.0: + + * New feature: the event listener API + * New feature: test shuffling + * New feature: the XML report format is closer to junitreport and can + be parsed by Hudson now. + * New feature: when a test runs under Visual Studio, its failures are + integrated in the IDE. + * New feature: /MD(d) versions of VC++ projects. + * New feature: elapsed time for the tests is printed by default. + * New feature: comes with a TR1 tuple implementation such that Boost + is no longer needed for Combine(). + * New feature: EXPECT_DEATH_IF_SUPPORTED macro and friends. + * New feature: the Xcode project can now produce static gtest + libraries in addition to a framework. + * Compatibility fixes for Solaris, Cygwin, minGW, Windows Mobile, + Symbian, gcc, and C++Builder. + * Bug fixes and implementation clean-ups. + +Changes for 1.3.0: + + * New feature: death tests on Windows, Cygwin, and Mac. + * New feature: ability to use Google Test assertions in other testing + frameworks. + * New feature: ability to run disabled test via + --gtest_also_run_disabled_tests. + * New feature: the --help flag for printing the usage. + * New feature: access to Google Test flag values in user code. + * New feature: a script that packs Google Test into one .h and one + .cc file for easy deployment. + * New feature: support for distributing test functions to multiple + machines (requires support from the test runner). + * Bug fixes and implementation clean-ups. + +Changes for 1.2.1: + + * Compatibility fixes for Linux IA-64 and IBM z/OS. + * Added support for using Boost and other TR1 implementations. + * Changes to the build scripts to support upcoming release of Google C++ + Mocking Framework. + * Added Makefile to the distribution package. + * Improved build instructions in README. + +Changes for 1.2.0: + + * New feature: value-parameterized tests. + * New feature: the ASSERT/EXPECT_(NON)FATAL_FAILURE(_ON_ALL_THREADS) + macros. + * Changed the XML report format to match JUnit/Ant's. + * Added tests to the Xcode project. + * Added scons/SConscript for building with SCons. + * Added src/gtest-all.cc for building Google Test from a single file. + * Fixed compatibility with Solaris and z/OS. + * Enabled running Python tests on systems with python 2.3 installed, + e.g. Mac OS X 10.4. + * Bug fixes. + +Changes for 1.1.0: + + * New feature: type-parameterized tests. + * New feature: exception assertions. + * New feature: printing elapsed time of tests. + * Improved the robustness of death tests. + * Added an Xcode project and samples. 
+ * Adjusted the output format on Windows to be understandable by Visual Studio. + * Minor bug fixes. + +Changes for 1.0.1: + + * Added project files for Visual Studio 7.1. + * Fixed issues with compiling on Mac OS X. + * Fixed issues with compiling on Cygwin. + +Changes for 1.0.0: + + * Initial Open Source release of Google Test diff --git a/libs/libaom/src/third_party/googletest/src/googletest/CMakeLists.txt b/libs/libaom/src/third_party/googletest/src/googletest/CMakeLists.txt new file mode 100644 index 000000000..9ee79408c --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/CMakeLists.txt @@ -0,0 +1,331 @@ +######################################################################## +# CMake build script for Google Test. +# +# To run the tests for Google Test itself on Linux, use 'make test' or +# ctest. You can select which tests to run using 'ctest -R regex'. +# For more options, run 'ctest --help'. + +# When other libraries are using a shared version of runtime libraries, +# Google Test also has to use one. +option( + gtest_force_shared_crt + "Use shared (DLL) run-time lib even when Google Test is built as static lib." + OFF) + +option(gtest_build_tests "Build all of gtest's own tests." OFF) + +option(gtest_build_samples "Build gtest's sample programs." OFF) + +option(gtest_disable_pthreads "Disable uses of pthreads in gtest." OFF) + +option( + gtest_hide_internal_symbols + "Build gtest with internal symbols hidden in shared libraries." + OFF) + +# Defines pre_project_set_up_hermetic_build() and set_up_hermetic_build(). +include(cmake/hermetic_build.cmake OPTIONAL) + +if (COMMAND pre_project_set_up_hermetic_build) + pre_project_set_up_hermetic_build() +endif() + +######################################################################## +# +# Project-wide settings + +# Name of the project. +# +# CMake files in this project can refer to the root source directory +# as ${gtest_SOURCE_DIR} and to the root binary directory as +# ${gtest_BINARY_DIR}. +# Language "C" is required for find_package(Threads). +if (CMAKE_VERSION VERSION_LESS 3.0) + project(gtest CXX C) +else() + cmake_policy(SET CMP0048 NEW) + project(gtest VERSION ${GOOGLETEST_VERSION} LANGUAGES CXX C) +endif() +cmake_minimum_required(VERSION 2.6.4) + +if (POLICY CMP0063) # Visibility + cmake_policy(SET CMP0063 NEW) +endif (POLICY CMP0063) + +if (COMMAND set_up_hermetic_build) + set_up_hermetic_build() +endif() + +# These commands only run if this is the main project +if(CMAKE_PROJECT_NAME STREQUAL "gtest" OR CMAKE_PROJECT_NAME STREQUAL "googletest-distribution") + + # BUILD_SHARED_LIBS is a standard CMake variable, but we declare it here to + # make it prominent in the GUI. + option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)." OFF) + +else() + + mark_as_advanced( + gtest_force_shared_crt + gtest_build_tests + gtest_build_samples + gtest_disable_pthreads + gtest_hide_internal_symbols) + +endif() + + +if (gtest_hide_internal_symbols) + set(CMAKE_CXX_VISIBILITY_PRESET hidden) + set(CMAKE_VISIBILITY_INLINES_HIDDEN 1) +endif() + +# Define helper functions and macros used by Google Test. +include(cmake/internal_utils.cmake) + +config_compiler_and_linker() # Defined in internal_utils.cmake. + +# Create the CMake package file descriptors. 
+if (INSTALL_GTEST) + include(CMakePackageConfigHelpers) + set(cmake_package_name GTest) + set(targets_export_name ${cmake_package_name}Targets CACHE INTERNAL "") + set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated" CACHE INTERNAL "") + set(cmake_files_install_dir "${CMAKE_INSTALL_LIBDIR}/cmake/${cmake_package_name}") + set(version_file "${generated_dir}/${cmake_package_name}ConfigVersion.cmake") + write_basic_package_version_file(${version_file} COMPATIBILITY AnyNewerVersion) + install(EXPORT ${targets_export_name} + NAMESPACE ${cmake_package_name}:: + DESTINATION ${cmake_files_install_dir}) + set(config_file "${generated_dir}/${cmake_package_name}Config.cmake") + configure_package_config_file("${gtest_SOURCE_DIR}/cmake/Config.cmake.in" + "${config_file}" INSTALL_DESTINATION ${cmake_files_install_dir}) + install(FILES ${version_file} ${config_file} + DESTINATION ${cmake_files_install_dir}) +endif() + +# Where Google Test's .h files can be found. +set(gtest_build_include_dirs + "${gtest_SOURCE_DIR}/include" + "${gtest_SOURCE_DIR}") +include_directories(${gtest_build_include_dirs}) + +# Summary of tuple support for Microsoft Visual Studio: +# Compiler version(MS) version(cmake) Support +# ---------- ----------- -------------- ----------------------------- +# <= VS 2010 <= 10 <= 1600 Use Google Test's own tuple. +# VS 2012 11 1700 std::tr1::tuple + _VARIADIC_MAX=10 +# VS 2013 12 1800 std::tr1::tuple +# VS 2015 14 1900 std::tuple +# VS 2017 15 >= 1910 std::tuple +if (MSVC AND MSVC_VERSION EQUAL 1700) + add_definitions(/D _VARIADIC_MAX=10) +endif() + +######################################################################## +# +# Defines the gtest & gtest_main libraries. User tests should link +# with one of them. + +# Google Test libraries. We build them using more strict warnings than what +# are used for other targets, to ensure that gtest can be compiled by a user +# aggressive about warnings. +cxx_library(gtest "${cxx_strict}" src/gtest-all.cc) +cxx_library(gtest_main "${cxx_strict}" src/gtest_main.cc) +# If the CMake version supports it, attach header directory information +# to the targets for when we are part of a parent build (i.e. being pulled +# in via add_subdirectory() rather than being a standalone build). +if (DEFINED CMAKE_VERSION AND NOT "${CMAKE_VERSION}" VERSION_LESS "2.8.11") + target_include_directories(gtest SYSTEM INTERFACE + "$<BUILD_INTERFACE:${gtest_build_include_dirs}>" + "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>") + target_include_directories(gtest_main SYSTEM INTERFACE + "$<BUILD_INTERFACE:${gtest_build_include_dirs}>" + "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>") +endif() +target_link_libraries(gtest_main PUBLIC gtest) + +######################################################################## +# +# Install rules +install_project(gtest gtest_main) + +######################################################################## +# +# Samples on how to link user tests with gtest or gtest_main. +# +# They are not built by default. To build them, set the +# gtest_build_samples option to ON. You can do it by running ccmake +# or specifying the -Dgtest_build_samples=ON flag when running cmake.
+ +if (gtest_build_samples) + cxx_executable(sample1_unittest samples gtest_main samples/sample1.cc) + cxx_executable(sample2_unittest samples gtest_main samples/sample2.cc) + cxx_executable(sample3_unittest samples gtest_main) + cxx_executable(sample4_unittest samples gtest_main samples/sample4.cc) + cxx_executable(sample5_unittest samples gtest_main samples/sample1.cc) + cxx_executable(sample6_unittest samples gtest_main) + cxx_executable(sample7_unittest samples gtest_main) + cxx_executable(sample8_unittest samples gtest_main) + cxx_executable(sample9_unittest samples gtest) + cxx_executable(sample10_unittest samples gtest) +endif() + +######################################################################## +# +# Google Test's own tests. +# +# You can skip this section if you aren't interested in testing +# Google Test itself. +# +# The tests are not built by default. To build them, set the +# gtest_build_tests option to ON. You can do it by running ccmake +# or specifying the -Dgtest_build_tests=ON flag when running cmake. + +if (gtest_build_tests) + # This must be set in the root directory for the tests to be run by + # 'make test' or ctest. + enable_testing() + + ############################################################ + # C++ tests built with standard compiler flags. + + cxx_test(googletest-death-test-test gtest_main) + cxx_test(gtest_environment_test gtest) + cxx_test(googletest-filepath-test gtest_main) + cxx_test(googletest-linked-ptr-test gtest_main) + cxx_test(googletest-listener-test gtest_main) + cxx_test(gtest_main_unittest gtest_main) + cxx_test(googletest-message-test gtest_main) + cxx_test(gtest_no_test_unittest gtest) + cxx_test(googletest-options-test gtest_main) + cxx_test(googletest-param-test-test gtest + test/googletest-param-test2-test.cc) + cxx_test(googletest-port-test gtest_main) + cxx_test(gtest_pred_impl_unittest gtest_main) + cxx_test(gtest_premature_exit_test gtest + test/gtest_premature_exit_test.cc) + cxx_test(googletest-printers-test gtest_main) + cxx_test(gtest_prod_test gtest_main + test/production.cc) + cxx_test(gtest_repeat_test gtest) + cxx_test(gtest_sole_header_test gtest_main) + cxx_test(gtest_stress_test gtest) + cxx_test(googletest-test-part-test gtest_main) + cxx_test(gtest_throw_on_failure_ex_test gtest) + cxx_test(gtest-typed-test_test gtest_main + test/gtest-typed-test2_test.cc) + cxx_test(gtest_unittest gtest_main) + cxx_test(gtest-unittest-api_test gtest) + + ############################################################ + # C++ tests built with non-standard compiler flags. + + # MSVC 7.1 does not support STL with exceptions disabled. 
+ if (NOT MSVC OR MSVC_VERSION GREATER 1310) + cxx_library(gtest_no_exception "${cxx_no_exception}" + src/gtest-all.cc) + cxx_library(gtest_main_no_exception "${cxx_no_exception}" + src/gtest-all.cc src/gtest_main.cc) + endif() + cxx_library(gtest_main_no_rtti "${cxx_no_rtti}" + src/gtest-all.cc src/gtest_main.cc) + + cxx_test_with_flags(gtest-death-test_ex_nocatch_test + "${cxx_exception} -DGTEST_ENABLE_CATCH_EXCEPTIONS_=0" + gtest test/googletest-death-test_ex_test.cc) + cxx_test_with_flags(gtest-death-test_ex_catch_test + "${cxx_exception} -DGTEST_ENABLE_CATCH_EXCEPTIONS_=1" + gtest test/googletest-death-test_ex_test.cc) + + cxx_test_with_flags(gtest_no_rtti_unittest "${cxx_no_rtti}" + gtest_main_no_rtti test/gtest_unittest.cc) + + cxx_shared_library(gtest_dll "${cxx_default}" + src/gtest-all.cc src/gtest_main.cc) + + cxx_executable_with_flags(gtest_dll_test_ "${cxx_default}" + gtest_dll test/gtest_all_test.cc) + set_target_properties(gtest_dll_test_ + PROPERTIES + COMPILE_DEFINITIONS "GTEST_LINKED_AS_SHARED_LIBRARY=1") + + if (NOT MSVC OR MSVC_VERSION LESS 1600) # 1600 is Visual Studio 2010. + # Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that + # conflict with our own definitions. Therefore using our own tuple does not + # work on those compilers. + cxx_library(gtest_main_use_own_tuple "${cxx_use_own_tuple}" + src/gtest-all.cc src/gtest_main.cc) + + cxx_test_with_flags(googletest-tuple-test "${cxx_use_own_tuple}" + gtest_main_use_own_tuple test/googletest-tuple-test.cc) + + cxx_test_with_flags(gtest_use_own_tuple_test "${cxx_use_own_tuple}" + gtest_main_use_own_tuple + test/googletest-param-test-test.cc test/googletest-param-test2-test.cc) + endif() + + ############################################################ + # Python tests. + + cxx_executable(googletest-break-on-failure-unittest_ test gtest) + py_test(googletest-break-on-failure-unittest) + + # Visual Studio .NET 2003 does not support STL with exceptions disabled. + if (NOT MSVC OR MSVC_VERSION GREATER 1310) # 1310 is Visual Studio .NET 2003 + cxx_executable_with_flags( + googletest-catch-exceptions-no-ex-test_ + "${cxx_no_exception}" + gtest_main_no_exception + test/googletest-catch-exceptions-test_.cc) + endif() + + cxx_executable_with_flags( + googletest-catch-exceptions-ex-test_ + "${cxx_exception}" + gtest_main + test/googletest-catch-exceptions-test_.cc) + py_test(googletest-catch-exceptions-test) + + cxx_executable(googletest-color-test_ test gtest) + py_test(googletest-color-test) + + cxx_executable(googletest-env-var-test_ test gtest) + py_test(googletest-env-var-test) + + cxx_executable(googletest-filter-unittest_ test gtest) + py_test(googletest-filter-unittest) + + cxx_executable(gtest_help_test_ test gtest_main) + py_test(gtest_help_test) + + cxx_executable(googletest-list-tests-unittest_ test gtest) + py_test(googletest-list-tests-unittest) + + cxx_executable(googletest-output-test_ test gtest) + py_test(googletest-output-test --no_stacktrace_support) + + cxx_executable(googletest-shuffle-test_ test gtest) + py_test(googletest-shuffle-test) + + # MSVC 7.1 does not support STL with exceptions disabled. 
+ if (NOT MSVC OR MSVC_VERSION GREATER 1310) + cxx_executable(googletest-throw-on-failure-test_ test gtest_no_exception) + set_target_properties(googletest-throw-on-failure-test_ + PROPERTIES + COMPILE_FLAGS "${cxx_no_exception}") + py_test(googletest-throw-on-failure-test) + endif() + + cxx_executable(googletest-uninitialized-test_ test gtest) + py_test(googletest-uninitialized-test) + + cxx_executable(gtest_xml_outfile1_test_ test gtest_main) + cxx_executable(gtest_xml_outfile2_test_ test gtest_main) + py_test(gtest_xml_outfiles_test) + py_test(googletest-json-outfiles-test) + + cxx_executable(gtest_xml_output_unittest_ test gtest) + py_test(gtest_xml_output_unittest --no_stacktrace_support) + py_test(googletest-json-output-unittest --no_stacktrace_support) +endif() diff --git a/libs/libaom/src/third_party/googletest/src/googletest/CONTRIBUTORS b/libs/libaom/src/third_party/googletest/src/googletest/CONTRIBUTORS new file mode 100644 index 000000000..feae2fc04 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/CONTRIBUTORS @@ -0,0 +1,37 @@ +# This file contains a list of people who've made non-trivial +# contribution to the Google C++ Testing Framework project. People +# who commit code to the project are encouraged to add their names +# here. Please keep the list sorted by first names. + +Ajay Joshi +Balázs Dán +Bharat Mediratta +Chandler Carruth +Chris Prince +Chris Taylor +Dan Egnor +Eric Roman +Hady Zalek +Jeffrey Yasskin +Jói Sigurðsson +Keir Mierle +Keith Ray +Kenton Varda +Manuel Klimek +Markus Heule +Mika Raento +Miklós Fazekas +Pasi Valminen +Patrick Hanna +Patrick Riley +Peter Kaminski +Preston Jackson +Rainer Klaffenboeck +Russ Cox +Russ Rufer +Sean Mcafee +Sigurður Ásgeirsson +Tracy Bialik +Vadim Berman +Vlad Losev +Zhanyong Wan diff --git a/libs/libaom/src/third_party/googletest/src/googletest/LICENSE b/libs/libaom/src/third_party/googletest/src/googletest/LICENSE new file mode 100644 index 000000000..1941a11f8 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/LICENSE @@ -0,0 +1,28 @@ +Copyright 2008, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/libs/libaom/src/third_party/googletest/src/googletest/README.md b/libs/libaom/src/third_party/googletest/src/googletest/README.md new file mode 100644 index 000000000..e30fe8047 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/README.md @@ -0,0 +1,341 @@ +### Generic Build Instructions + +#### Setup + +To build Google Test and your tests that use it, you need to tell your build +system where to find its headers and source files. The exact way to do it +depends on which build system you use, and is usually straightforward. + +#### Build + +Suppose you put Google Test in directory `${GTEST_DIR}`. To build it, create a +library build target (or a project as called by Visual Studio and Xcode) to +compile + + ${GTEST_DIR}/src/gtest-all.cc + +with `${GTEST_DIR}/include` in the system header search path and `${GTEST_DIR}` +in the normal header search path. Assuming a Linux-like system and gcc, +something like the following will do: + + g++ -isystem ${GTEST_DIR}/include -I${GTEST_DIR} \ + -pthread -c ${GTEST_DIR}/src/gtest-all.cc + ar -rv libgtest.a gtest-all.o + +(We need `-pthread` as Google Test uses threads.) + +Next, you should compile your test source file with `${GTEST_DIR}/include` in +the system header search path, and link it with gtest and any other necessary +libraries: + + g++ -isystem ${GTEST_DIR}/include -pthread path/to/your_test.cc libgtest.a \ + -o your_test + +As an example, the make/ directory contains a Makefile that you can use to build +Google Test on systems where GNU make is available (e.g. Linux, Mac OS X, and +Cygwin). It doesn't try to build Google Test's own tests. Instead, it just +builds the Google Test library and a sample test. You can use it as a starting +point for your own build script. + +If the default settings are correct for your environment, the following commands +should succeed: + + cd ${GTEST_DIR}/make + make + ./sample1_unittest + +If you see errors, try to tweak the contents of `make/Makefile` to make them go +away. There are instructions in `make/Makefile` on how to do it. + +### Using CMake + +Google Test comes with a CMake build script ( +[CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt)) +that can be used on a wide range of platforms ("C" stands for cross-platform.). +If you don't have CMake installed already, you can download it for free from +<http://www.cmake.org/>. + +CMake works by generating native makefiles or build projects that can be used in +the compiler environment of your choice. You can either build Google Test as a +standalone project or it can be incorporated into an existing CMake build for +another project. + +#### Standalone CMake Project + +When building Google Test as a standalone project, the typical workflow starts +with: + + mkdir mybuild # Create a directory to hold the build output. + cd mybuild + cmake ${GTEST_DIR} # Generate native build scripts.
+ +If you want to build Google Test's samples, you should replace the last command +with + + cmake -Dgtest_build_samples=ON ${GTEST_DIR} + +If you are on a \*nix system, you should now see a Makefile in the current +directory. Just type 'make' to build gtest. + +If you use Windows and have Visual Studio installed, a `gtest.sln` file and +several `.vcproj` files will be created. You can then build them using Visual +Studio. + +On Mac OS X with Xcode installed, a `.xcodeproj` file will be generated. + +#### Incorporating Into An Existing CMake Project + +If you want to use gtest in a project which already uses CMake, then a more +robust and flexible approach is to build gtest as part of that project directly. +This is done by making the GoogleTest source code available to the main build +and adding it using CMake's `add_subdirectory()` command. This has the +significant advantage that the same compiler and linker settings are used +between gtest and the rest of your project, so issues associated with using +incompatible libraries (eg debug/release), etc. are avoided. This is +particularly useful on Windows. Making GoogleTest's source code available to the +main build can be done a few different ways: + +* Download the GoogleTest source code manually and place it at a known + location. This is the least flexible approach and can make it more difficult + to use with continuous integration systems, etc. +* Embed the GoogleTest source code as a direct copy in the main project's + source tree. This is often the simplest approach, but is also the hardest to + keep up to date. Some organizations may not permit this method. +* Add GoogleTest as a git submodule or equivalent. This may not always be + possible or appropriate. Git submodules, for example, have their own set of + advantages and drawbacks. +* Use CMake to download GoogleTest as part of the build's configure step. This + is just a little more complex, but doesn't have the limitations of the other + methods. + +The last of the above methods is implemented with a small piece of CMake code in +a separate file (e.g. `CMakeLists.txt.in`) which is copied to the build area and +then invoked as a sub-build _during the CMake stage_. That directory is then +pulled into the main build with `add_subdirectory()`. For example: + +New file `CMakeLists.txt.in`: + + cmake_minimum_required(VERSION 2.8.2) + + project(googletest-download NONE) + + include(ExternalProject) + ExternalProject_Add(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG master + SOURCE_DIR "${CMAKE_BINARY_DIR}/googletest-src" + BINARY_DIR "${CMAKE_BINARY_DIR}/googletest-build" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) + +Existing build's `CMakeLists.txt`: + + # Download and unpack googletest at configure time + configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt) + execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download ) + if(result) + message(FATAL_ERROR "CMake step for googletest failed: ${result}") + endif() + execute_process(COMMAND ${CMAKE_COMMAND} --build . 
+ RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download ) + if(result) + message(FATAL_ERROR "Build step for googletest failed: ${result}") + endif() + + # Prevent overriding the parent project's compiler/linker + # settings on Windows + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + + # Add googletest directly to our build. This defines + # the gtest and gtest_main targets. + add_subdirectory(${CMAKE_BINARY_DIR}/googletest-src + ${CMAKE_BINARY_DIR}/googletest-build + EXCLUDE_FROM_ALL) + + # The gtest/gtest_main targets carry header search path + # dependencies automatically when using CMake 2.8.11 or + # later. Otherwise we have to add them here ourselves. + if (CMAKE_VERSION VERSION_LESS 2.8.11) + include_directories("${gtest_SOURCE_DIR}/include") + endif() + + # Now simply link against gtest or gtest_main as needed. Eg + add_executable(example example.cpp) + target_link_libraries(example gtest_main) + add_test(NAME example_test COMMAND example) + +Note that this approach requires CMake 2.8.2 or later due to its use of the +`ExternalProject_Add()` command. The above technique is discussed in more detail +in [this separate article](http://crascit.com/2015/07/25/cmake-gtest/) which +also contains a link to a fully generalized implementation of the technique. + +##### Visual Studio Dynamic vs Static Runtimes + +By default, new Visual Studio projects link the C runtimes dynamically but +Google Test links them statically. This will generate an error that looks +something like the following: gtest.lib(gtest-all.obj) : error LNK2038: mismatch +detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value +'MDd_DynamicDebug' in main.obj + +Google Test already has a CMake option for this: `gtest_force_shared_crt` + +Enabling this option will make gtest link the runtimes dynamically too, and +match the project in which it is included. + +### Legacy Build Scripts + +Before settling on CMake, we have been providing hand-maintained build +projects/scripts for Visual Studio, Xcode, and Autotools. While we continue to +provide them for convenience, they are not actively maintained any more. We +highly recommend that you follow the instructions in the above sections to +integrate Google Test with your existing build system. + +If you still need to use the legacy build scripts, here's how: + +The msvc\ folder contains two solutions with Visual C++ projects. Open the +`gtest.sln` or `gtest-md.sln` file using Visual Studio, and you are ready to +build Google Test the same way you build any Visual Studio project. Files that +have names ending with -md use DLL versions of Microsoft runtime libraries (the +/MD or the /MDd compiler option). Files without that suffix use static versions +of the runtime libraries (the /MT or the /MTd option). Please note that one must +use the same option to compile both gtest and the test code. If you use Visual +Studio 2005 or above, we recommend the -md version as /MD is the default for new +projects in these versions of Visual Studio. + +On Mac OS X, open the `gtest.xcodeproj` in the `xcode/` folder using Xcode. +Build the "gtest" target. The universal binary framework will end up in your +selected build directory (selected in the Xcode "Preferences..." -> "Building" +pane and defaults to xcode/build). Alternatively, at the command line, enter: + + xcodebuild + +This will build the "Release" configuration of gtest.framework in your default +build location. 
See the "xcodebuild" man page for more information about +building different configurations and building in different locations. + +If you wish to use the Google Test Xcode project with Xcode 4.x and above, you +need to either: + +* update the SDK configuration options in xcode/Config/General.xconfig. + Comment options `SDKROOT`, `MACOS_DEPLOYMENT_TARGET`, and `GCC_VERSION`. If + you choose this route you lose the ability to target earlier versions of + MacOS X. +* Install an SDK for an earlier version. This doesn't appear to be supported + by Apple, but has been reported to work + (http://stackoverflow.com/questions/5378518). + +### Tweaking Google Test + +Google Test can be used in diverse environments. The default configuration may +not work (or may not work well) out of the box in some environments. However, +you can easily tweak Google Test by defining control macros on the compiler +command line. Generally, these macros are named like `GTEST_XYZ` and you define +them to either 1 or 0 to enable or disable a certain feature. + +We list the most frequently used macros below. For a complete list, see file +[include/gtest/internal/gtest-port.h](https://github.com/google/googletest/blob/master/include/gtest/internal/gtest-port.h). + +### Choosing a TR1 Tuple Library + +Some Google Test features require the C++ Technical Report 1 (TR1) tuple +library, which is not yet available with all compilers. The good news is that +Google Test implements a subset of TR1 tuple that's enough for its own need, and +will automatically use this when the compiler doesn't provide TR1 tuple. + +Usually you don't need to care about which tuple library Google Test uses. +However, if your project already uses TR1 tuple, you need to tell Google Test to +use the same TR1 tuple library the rest of your project uses, or the two tuple +implementations will clash. To do that, add + + -DGTEST_USE_OWN_TR1_TUPLE=0 + +to the compiler flags while compiling Google Test and your tests. If you want to +force Google Test to use its own tuple library, just add + + -DGTEST_USE_OWN_TR1_TUPLE=1 + +to the compiler flags instead. + +If you don't want Google Test to use tuple at all, add + + -DGTEST_HAS_TR1_TUPLE=0 + +and all features using tuple will be disabled. + +### Multi-threaded Tests + +Google Test is thread-safe where the pthread library is available. After +`#include "gtest/gtest.h"`, you can check the `GTEST_IS_THREADSAFE` macro to see +whether this is the case (yes if the macro is `#defined` to 1, no if it's +undefined.). + +If Google Test doesn't correctly detect whether pthread is available in your +environment, you can force it with + + -DGTEST_HAS_PTHREAD=1 + +or + + -DGTEST_HAS_PTHREAD=0 + +When Google Test uses pthread, you may need to add flags to your compiler and/or +linker to select the pthread library, or you'll get link errors. If you use the +CMake script or the deprecated Autotools script, this is taken care of for you. +If you use your own build script, you'll need to read your compiler and linker's +manual to figure out what flags to add. + +### As a Shared Library (DLL) + +Google Test is compact, so most users can build and link it as a static library +for the simplicity. You can choose to use Google Test as a shared library (known +as a DLL on Windows) if you prefer. + +To compile *gtest* as a shared library, add + + -DGTEST_CREATE_SHARED_LIBRARY=1 + +to the compiler flags. You'll also need to tell the linker to produce a shared +library instead - consult your linker's manual for how to do it. 
+
+To compile your *tests* that use the gtest shared library, add
+
+    -DGTEST_LINKED_AS_SHARED_LIBRARY=1
+
+to the compiler flags.
+
+Note: while the above steps aren't technically necessary today when using some
+compilers (e.g. GCC), they may become necessary in the future, if we decide to
+improve the speed of loading the library (see
+<http://gcc.gnu.org/wiki/Visibility> for details). Therefore you are
+recommended to always add the above flags when using Google Test as a shared
+library. Otherwise a future release of Google Test may break your build script.
+
+### Avoiding Macro Name Clashes
+
+In C++, macros don't obey namespaces. Therefore two libraries that both define
+a macro of the same name will clash if you `#include` both definitions. In
+case a Google Test macro clashes with another library, you can force Google
+Test to rename its macro to avoid the conflict.
+
+Specifically, if both Google Test and some other code define macro FOO, you
+can add
+
+    -DGTEST_DONT_DEFINE_FOO=1
+
+to the compiler flags to tell Google Test to change the macro's name from `FOO`
+to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, or `TEST`. For
+example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write
+
+    GTEST_TEST(SomeTest, DoesThis) { ... }
+
+instead of
+
+    TEST(SomeTest, DoesThis) { ... }
+
+in order to define a test.
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/cmake/Config.cmake.in b/libs/libaom/src/third_party/googletest/src/googletest/cmake/Config.cmake.in
new file mode 100644
index 000000000..12be4498b
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/cmake/Config.cmake.in
@@ -0,0 +1,9 @@
+@PACKAGE_INIT@
+include(CMakeFindDependencyMacro)
+if (@GTEST_HAS_PTHREAD@)
+  set(THREADS_PREFER_PTHREAD_FLAG @THREADS_PREFER_PTHREAD_FLAG@)
+  find_dependency(Threads)
+endif()
+
+include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
+check_required_components("@project_name@")
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/cmake/gtest.pc.in b/libs/libaom/src/third_party/googletest/src/googletest/cmake/gtest.pc.in
new file mode 100644
index 000000000..e7967ad56
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/cmake/gtest.pc.in
@@ -0,0 +1,9 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+
+Name: gtest
+Description: GoogleTest (without main() function)
+Version: @PROJECT_VERSION@
+URL: https://github.com/google/googletest
+Libs: -L${libdir} -lgtest @CMAKE_THREAD_LIBS_INIT@
+Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@ @CMAKE_THREAD_LIBS_INIT@
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/cmake/gtest_main.pc.in b/libs/libaom/src/third_party/googletest/src/googletest/cmake/gtest_main.pc.in
new file mode 100644
index 000000000..fe25d9c73
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/cmake/gtest_main.pc.in
@@ -0,0 +1,10 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+
+Name: gtest_main
+Description: GoogleTest (with main() function)
+Version: @PROJECT_VERSION@
+URL: https://github.com/google/googletest
+Requires: gtest
+Libs: -L${libdir} -lgtest_main @CMAKE_THREAD_LIBS_INIT@
+Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@ @CMAKE_THREAD_LIBS_INIT@
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/cmake/internal_utils.cmake b/libs/libaom/src/third_party/googletest/src/googletest/cmake/internal_utils.cmake
new file mode 100644
index 000000000..8c1f9ba99
--- /dev/null
+++
b/libs/libaom/src/third_party/googletest/src/googletest/cmake/internal_utils.cmake @@ -0,0 +1,318 @@ +# Defines functions and macros useful for building Google Test and +# Google Mock. +# +# Note: +# +# - This file will be run twice when building Google Mock (once via +# Google Test's CMakeLists.txt, and once via Google Mock's). +# Therefore it shouldn't have any side effects other than defining +# the functions and macros. +# +# - The functions/macros defined in this file may depend on Google +# Test and Google Mock's option() definitions, and thus must be +# called *after* the options have been defined. + +# Tweaks CMake's default compiler/linker settings to suit Google Test's needs. +# +# This must be a macro(), as inside a function string() can only +# update variables in the function scope. +macro(fix_default_compiler_settings_) + if (MSVC) + # For MSVC, CMake sets certain flags to defaults we want to override. + # This replacement code is taken from sample in the CMake Wiki at + # https://gitlab.kitware.com/cmake/community/wikis/FAQ#dynamic-replace. + foreach (flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if (NOT BUILD_SHARED_LIBS AND NOT gtest_force_shared_crt) + # When Google Test is built as a shared library, it should also use + # shared runtime libraries. Otherwise, it may end up with multiple + # copies of runtime library data in different modules, resulting in + # hard-to-find crashes. When it is built as a static library, it is + # preferable to use CRT as static libraries, as we don't have to rely + # on CRT DLLs being available. CMake always defaults to using shared + # CRT libraries, so we override that default here. + string(REPLACE "/MD" "-MT" ${flag_var} "${${flag_var}}") + endif() + + # We prefer more strict warning checking for building Google Test. + # Replaces /W3 with /W4 in defaults. + string(REPLACE "/W3" "/W4" ${flag_var} "${${flag_var}}") + + # Prevent D9025 warning for targets that have exception handling + # turned off (/EHs-c- flag). Where required, exceptions are explicitly + # re-enabled using the cxx_exception_flags variable. + string(REPLACE "/EHsc" "" ${flag_var} "${${flag_var}}") + endforeach() + endif() +endmacro() + +# Defines the compiler/linker flags used to build Google Test and +# Google Mock. You can tweak these definitions to suit your need. A +# variable's value is empty before it's explicitly assigned to. +macro(config_compiler_and_linker) + # Note: pthreads on MinGW is not supported, even if available + # instead, we use windows threading primitives + unset(GTEST_HAS_PTHREAD) + if (NOT gtest_disable_pthreads AND NOT MINGW) + # Defines CMAKE_USE_PTHREADS_INIT and CMAKE_THREAD_LIBS_INIT. + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads) + if (CMAKE_USE_PTHREADS_INIT) + set(GTEST_HAS_PTHREAD ON) + endif() + endif() + + fix_default_compiler_settings_() + if (MSVC) + # Newlines inside flags variables break CMake's NMake generator. + # TODO(vladl@google.com): Add -RTCs and -RTCu to debug builds. + set(cxx_base_flags "-GS -W4 -WX -wd4251 -wd4275 -nologo -J -Zi") + if (MSVC_VERSION LESS 1400) # 1400 is Visual Studio 2005 + # Suppress spurious warnings MSVC 7.1 sometimes issues. + # Forcing value to bool. + set(cxx_base_flags "${cxx_base_flags} -wd4800") + # Copy constructor and assignment operator could not be generated. + set(cxx_base_flags "${cxx_base_flags} -wd4511 -wd4512") + # Compatibility warnings not applicable to Google Test. 
+ # Resolved overload was found by argument-dependent lookup. + set(cxx_base_flags "${cxx_base_flags} -wd4675") + endif() + if (MSVC_VERSION LESS 1500) # 1500 is Visual Studio 2008 + # Conditional expression is constant. + # When compiling with /W4, we get several instances of C4127 + # (Conditional expression is constant). In our code, we disable that + # warning on a case-by-case basis. However, on Visual Studio 2005, + # the warning fires on std::list. Therefore on that compiler and earlier, + # we disable the warning project-wide. + set(cxx_base_flags "${cxx_base_flags} -wd4127") + endif() + if (NOT (MSVC_VERSION LESS 1700)) # 1700 is Visual Studio 2012. + # Suppress "unreachable code" warning on VS 2012 and later. + # http://stackoverflow.com/questions/3232669 explains the issue. + set(cxx_base_flags "${cxx_base_flags} -wd4702") + endif() + + set(cxx_base_flags "${cxx_base_flags} -D_UNICODE -DUNICODE -DWIN32 -D_WIN32") + set(cxx_base_flags "${cxx_base_flags} -DSTRICT -DWIN32_LEAN_AND_MEAN") + set(cxx_exception_flags "-EHsc -D_HAS_EXCEPTIONS=1") + set(cxx_no_exception_flags "-EHs-c- -D_HAS_EXCEPTIONS=0") + set(cxx_no_rtti_flags "-GR-") + elseif (CMAKE_COMPILER_IS_GNUCXX) + set(cxx_base_flags "-Wall -Wshadow -Werror") + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0.0) + set(cxx_base_flags "${cxx_base_flags} -Wno-error=dangling-else") + endif() + set(cxx_exception_flags "-fexceptions") + set(cxx_no_exception_flags "-fno-exceptions") + # Until version 4.3.2, GCC doesn't define a macro to indicate + # whether RTTI is enabled. Therefore we define GTEST_HAS_RTTI + # explicitly. + set(cxx_no_rtti_flags "-fno-rtti -DGTEST_HAS_RTTI=0") + set(cxx_strict_flags + "-Wextra -Wno-unused-parameter -Wno-missing-field-initializers") + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "SunPro") + set(cxx_exception_flags "-features=except") + # Sun Pro doesn't provide macros to indicate whether exceptions and + # RTTI are enabled, so we define GTEST_HAS_* explicitly. + set(cxx_no_exception_flags "-features=no%except -DGTEST_HAS_EXCEPTIONS=0") + set(cxx_no_rtti_flags "-features=no%rtti -DGTEST_HAS_RTTI=0") + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "VisualAge" OR + CMAKE_CXX_COMPILER_ID STREQUAL "XL") + # CMake 2.8 changes Visual Age's compiler ID to "XL". + set(cxx_exception_flags "-qeh") + set(cxx_no_exception_flags "-qnoeh") + # Until version 9.0, Visual Age doesn't define a macro to indicate + # whether RTTI is enabled. Therefore we define GTEST_HAS_RTTI + # explicitly. + set(cxx_no_rtti_flags "-qnortti -DGTEST_HAS_RTTI=0") + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "HP") + set(cxx_base_flags "-AA -mt") + set(cxx_exception_flags "-DGTEST_HAS_EXCEPTIONS=1") + set(cxx_no_exception_flags "+noeh -DGTEST_HAS_EXCEPTIONS=0") + # RTTI can not be disabled in HP aCC compiler. + set(cxx_no_rtti_flags "") + endif() + + # The pthreads library is available and allowed? + if (DEFINED GTEST_HAS_PTHREAD) + set(GTEST_HAS_PTHREAD_MACRO "-DGTEST_HAS_PTHREAD=1") + else() + set(GTEST_HAS_PTHREAD_MACRO "-DGTEST_HAS_PTHREAD=0") + endif() + set(cxx_base_flags "${cxx_base_flags} ${GTEST_HAS_PTHREAD_MACRO}") + + # For building gtest's own tests and samples. + set(cxx_exception "${cxx_base_flags} ${cxx_exception_flags}") + set(cxx_no_exception + "${CMAKE_CXX_FLAGS} ${cxx_base_flags} ${cxx_no_exception_flags}") + set(cxx_default "${cxx_exception}") + set(cxx_no_rtti "${cxx_default} ${cxx_no_rtti_flags}") + set(cxx_use_own_tuple "${cxx_default} -DGTEST_USE_OWN_TR1_TUPLE=1") + + # For building the gtest libraries. 
+  set(cxx_strict "${cxx_default} ${cxx_strict_flags}")
+endmacro()
+
+# Defines the gtest & gtest_main libraries. User tests should link
+# with one of them.
+function(cxx_library_with_type name type cxx_flags)
+  # type can be either STATIC or SHARED to denote a static or shared library.
+  # ARGN refers to additional arguments after 'cxx_flags'.
+  add_library(${name} ${type} ${ARGN})
+  set_target_properties(${name}
+    PROPERTIES
+    COMPILE_FLAGS "${cxx_flags}")
+  # Generate debug library name with a postfix.
+  set_target_properties(${name}
+    PROPERTIES
+    DEBUG_POSTFIX "d")
+  if (BUILD_SHARED_LIBS OR type STREQUAL "SHARED")
+    set_target_properties(${name}
+      PROPERTIES
+      COMPILE_DEFINITIONS "GTEST_CREATE_SHARED_LIBRARY=1")
+    if (NOT "${CMAKE_VERSION}" VERSION_LESS "2.8.11")
+      target_compile_definitions(${name} INTERFACE
+        $<INSTALL_INTERFACE:GTEST_LINKED_AS_SHARED_LIBRARY=1>)
+    endif()
+  endif()
+  if (DEFINED GTEST_HAS_PTHREAD)
+    if ("${CMAKE_VERSION}" VERSION_LESS "3.1.0")
+      set(threads_spec ${CMAKE_THREAD_LIBS_INIT})
+    else()
+      set(threads_spec Threads::Threads)
+    endif()
+    target_link_libraries(${name} PUBLIC ${threads_spec})
+  endif()
+endfunction()
+
+########################################################################
+#
+# Helper functions for creating build targets.
+
+function(cxx_shared_library name cxx_flags)
+  cxx_library_with_type(${name} SHARED "${cxx_flags}" ${ARGN})
+endfunction()
+
+function(cxx_library name cxx_flags)
+  cxx_library_with_type(${name} "" "${cxx_flags}" ${ARGN})
+endfunction()
+
+# cxx_executable_with_flags(name cxx_flags libs srcs...)
+#
+# creates a named C++ executable that depends on the given libraries and
+# is built from the given source files with the given compiler flags.
+function(cxx_executable_with_flags name cxx_flags libs)
+  add_executable(${name} ${ARGN})
+  if (MSVC AND (NOT (MSVC_VERSION LESS 1700)))  # 1700 is Visual Studio 2012.
+    # BigObj required for tests.
+    set(cxx_flags "${cxx_flags} -bigobj")
+  endif()
+  if (cxx_flags)
+    set_target_properties(${name}
+      PROPERTIES
+      COMPILE_FLAGS "${cxx_flags}")
+  endif()
+  if (BUILD_SHARED_LIBS)
+    set_target_properties(${name}
+      PROPERTIES
+      COMPILE_DEFINITIONS "GTEST_LINKED_AS_SHARED_LIBRARY=1")
+  endif()
+  # To support mixing linking in static and dynamic libraries, link each
+  # library in with an extra call to target_link_libraries.
+  foreach (lib "${libs}")
+    target_link_libraries(${name} ${lib})
+  endforeach()
+endfunction()
+
+# cxx_executable(name dir lib srcs...)
+#
+# creates a named target that depends on the given libs and is built
+# from the given source files. dir/name.cc is implicitly included in
+# the source file list.
+function(cxx_executable name dir libs)
+  cxx_executable_with_flags(
+    ${name} "${cxx_default}" "${libs}" "${dir}/${name}.cc" ${ARGN})
+endfunction()
+
+# Sets PYTHONINTERP_FOUND and PYTHON_EXECUTABLE.
+find_package(PythonInterp)
+
+# cxx_test_with_flags(name cxx_flags libs srcs...)
+#
+# creates a named C++ test that depends on the given libs and is built
+# from the given source files with the given compiler flags.
+function(cxx_test_with_flags name cxx_flags libs)
+  cxx_executable_with_flags(${name} "${cxx_flags}" "${libs}" ${ARGN})
+  add_test(NAME ${name} COMMAND ${name})
+endfunction()
+
+# cxx_test(name libs srcs...)
+#
+# creates a named test target that depends on the given libs and is
+# built from the given source files. Unlike cxx_test_with_flags,
+# test/name.cc is already implicitly included in the source file list.
+function(cxx_test name libs)
+  cxx_test_with_flags("${name}" "${cxx_default}" "${libs}"
+    "test/${name}.cc" ${ARGN})
+endfunction()
+
+# py_test(name)
+#
+# creates a Python test with the given name whose main module is in
+# test/name.py. It does nothing if Python is not installed.
+function(py_test name)
+  if (PYTHONINTERP_FOUND)
+    if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.1)
+      if (CMAKE_CONFIGURATION_TYPES)
+        # Multi-configuration build generators as for Visual Studio save
+        # output in a subdirectory of CMAKE_CURRENT_BINARY_DIR (Debug,
+        # Release etc.), so we have to provide it here.
+        add_test(
+          NAME ${name}
+          COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
+            --build_dir=${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG> ${ARGN})
+      else (CMAKE_CONFIGURATION_TYPES)
+        # Single-configuration build generators like Makefile generators
+        # don't have subdirs below CMAKE_CURRENT_BINARY_DIR.
+        add_test(
+          NAME ${name}
+          COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
+            --build_dir=${CMAKE_CURRENT_BINARY_DIR} ${ARGN})
+      endif (CMAKE_CONFIGURATION_TYPES)
+    else (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.1)
+      # ${CMAKE_CURRENT_BINARY_DIR} is known at configuration time, so we can
+      # directly bind it from cmake. ${CTEST_CONFIGURATION_TYPE} is known
+      # only at ctest runtime (by calling ctest -c <Configuration>), so
+      # we have to escape $ to delay variable substitution here.
+      add_test(
+        ${name}
+        ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
+          --build_dir=${CMAKE_CURRENT_BINARY_DIR}/\${CTEST_CONFIGURATION_TYPE} ${ARGN})
+    endif (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.1)
+  endif(PYTHONINTERP_FOUND)
+endfunction()
+
+# install_project(targets...)
+#
+# Installs the specified targets and configures the associated pkgconfig files.
+function(install_project)
+  if(INSTALL_GTEST)
+    install(DIRECTORY "${PROJECT_SOURCE_DIR}/include/"
+      DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
+    # Install the project targets.
+    install(TARGETS ${ARGN}
+      EXPORT ${targets_export_name}
+      RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
+      ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+      LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+    # Configure and install pkgconfig files.
+    foreach(t ${ARGN})
+      set(configured_pc "${generated_dir}/${t}.pc")
+      configure_file("${PROJECT_SOURCE_DIR}/cmake/${t}.pc.in"
+        "${configured_pc}" @ONLY)
+      install(FILES "${configured_pc}"
+        DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+    endforeach()
+  endif()
+endfunction()
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
new file mode 100644
index 000000000..39f0ded1b
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
@@ -0,0 +1,342 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc.
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the public API for death tests. It is +// #included by gtest.h so a user doesn't need to include this +// directly. +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ + +#include "gtest/internal/gtest-death-test-internal.h" + +namespace testing { + +// This flag controls the style of death tests. Valid values are "threadsafe", +// meaning that the death test child process will re-execute the test binary +// from the start, running only a single death test, or "fast", +// meaning that the child process will execute the test logic immediately +// after forking. +GTEST_DECLARE_string_(death_test_style); + +#if GTEST_HAS_DEATH_TEST + +namespace internal { + +// Returns a Boolean value indicating whether the caller is currently +// executing in the context of the death test child process. Tools such as +// Valgrind heap checkers may need this to modify their behavior in death +// tests. IMPORTANT: This is an internal utility. Using it may break the +// implementation of death tests. User code MUST NOT use it. +GTEST_API_ bool InDeathTestChild(); + +} // namespace internal + +// The following macros are useful for writing death tests. + +// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is +// executed: +// +// 1. It generates a warning if there is more than one active +// thread. This is because it's safe to fork() or clone() only +// when there is a single thread. +// +// 2. The parent process clone()s a sub-process and runs the death +// test in it; the sub-process exits with code 0 at the end of the +// death test, if it hasn't exited already. +// +// 3. The parent process waits for the sub-process to terminate. +// +// 4. The parent process checks the exit code and error message of +// the sub-process. 
+//
+// Examples:
+//
+//   ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number");
+//   for (int i = 0; i < 5; i++) {
+//     EXPECT_DEATH(server.ProcessRequest(i),
+//                  "Invalid request .* in ProcessRequest()")
+//         << "Failed to die on request " << i;
+//   }
+//
+//   ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting");
+//
+//   bool KilledBySIGHUP(int exit_code) {
+//     return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP;
+//   }
+//
+//   ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
+//
+// On the regular expressions used in death tests:
+//
+//   GOOGLETEST_CM0005 DO NOT DELETE
+//   On POSIX-compliant systems (*nix), we use the <regex.h> library,
+//   which uses the POSIX extended regex syntax.
+//
+//   On other platforms (e.g. Windows or Mac), we only support a simple regex
+//   syntax implemented as part of Google Test. This limited
+//   implementation should be enough most of the time when writing
+//   death tests; though it lacks many features you can find in PCRE
+//   or POSIX extended regex syntax. For example, we don't support
+//   union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and
+//   repetition count ("x{5,7}"), among others.
+//
+//   Below is the syntax that we do support. We chose it to be a
+//   subset of both PCRE and POSIX extended regex, so it's easy to
+//   learn wherever you come from. In the following: 'A' denotes a
+//   literal character, period (.), or a single \\ escape sequence;
+//   'x' and 'y' denote regular expressions; 'm' and 'n' are for
+//   natural numbers.
+//
+//     c     matches any literal character c
+//     \\d   matches any decimal digit
+//     \\D   matches any character that's not a decimal digit
+//     \\f   matches \f
+//     \\n   matches \n
+//     \\r   matches \r
+//     \\s   matches any ASCII whitespace, including \n
+//     \\S   matches any character that's not a whitespace
+//     \\t   matches \t
+//     \\v   matches \v
+//     \\w   matches any letter, _, or decimal digit
+//     \\W   matches any character that \\w doesn't match
+//     \\c   matches any literal character c, which must be a punctuation
+//     .     matches any single character except \n
+//     A?    matches 0 or 1 occurrences of A
+//     A*    matches 0 or many occurrences of A
+//     A+    matches 1 or many occurrences of A
+//     ^     matches the beginning of a string (not that of each line)
+//     $     matches the end of a string (not that of each line)
+//     xy    matches x followed by y
+//
+//   If you accidentally use PCRE or POSIX extended regex features
+//   not implemented by us, you will get a run-time failure. In that
+//   case, please try to rewrite your regular expression within the
+//   above syntax.
+//
+//   This implementation is *not* meant to be as highly tuned or robust
+//   as a compiled regex library, but should perform well enough for a
+//   death test, which already incurs significant overhead by launching
+//   a child process.
+//
+// Known caveats:
+//
+//   A "threadsafe" style death test obtains the path to the test
+//   program from argv[0] and re-executes it in the sub-process. For
+//   simplicity, the current implementation doesn't search the PATH
+//   when launching the sub-process. This means that the user must
+//   invoke the test program via a path that contains at least one
+//   path separator (e.g. path/to/foo_test and
+//   /absolute/path/to/bar_test are fine, but foo_test is not). This
+//   is rarely a problem as people usually don't put the test binary
+//   directory in PATH.
+// + +// Asserts that a given statement causes the program to exit, with an +// integer exit status that satisfies predicate, and emitting error output +// that matches regex. +#define ASSERT_EXIT(statement, predicate, regex) \ + GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_) + +// Like ASSERT_EXIT, but continues on to successive tests in the +// test suite, if any: +#define EXPECT_EXIT(statement, predicate, regex) \ + GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_) + +// Asserts that a given statement causes the program to exit, either by +// explicitly exiting with a nonzero exit code or being killed by a +// signal, and emitting error output that matches regex. +#define ASSERT_DEATH(statement, regex) \ + ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) + +// Like ASSERT_DEATH, but continues on to successive tests in the +// test suite, if any: +#define EXPECT_DEATH(statement, regex) \ + EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) + +// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: + +// Tests that an exit code describes a normal exit with a given exit code. +class GTEST_API_ ExitedWithCode { + public: + explicit ExitedWithCode(int exit_code); + bool operator()(int exit_status) const; + + private: + // No implementation - assignment is unsupported. + void operator=(const ExitedWithCode &other); + + const int exit_code_; +}; + +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +// Tests that an exit code describes an exit due to termination by a +// given signal. +// GOOGLETEST_CM0006 DO NOT DELETE +class GTEST_API_ KilledBySignal { + public: + explicit KilledBySignal(int signum); + bool operator()(int exit_status) const; + + private: + const int signum_; +}; +#endif // !GTEST_OS_WINDOWS + +// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode. +// The death testing framework causes this to have interesting semantics, +// since the sideeffects of the call are only visible in opt mode, and not +// in debug mode. +// +// In practice, this can be used to test functions that utilize the +// LOG(DFATAL) macro using the following style: +// +// int DieInDebugOr12(int* sideeffect) { +// if (sideeffect) { +// *sideeffect = 12; +// } +// LOG(DFATAL) << "death"; +// return 12; +// } +// +// TEST(TestSuite, TestDieOr12WorksInDgbAndOpt) { +// int sideeffect = 0; +// // Only asserts in dbg. +// EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death"); +// +// #ifdef NDEBUG +// // opt-mode has sideeffect visible. +// EXPECT_EQ(12, sideeffect); +// #else +// // dbg-mode no visible sideeffect. +// EXPECT_EQ(0, sideeffect); +// #endif +// } +// +// This will assert that DieInDebugReturn12InOpt() crashes in debug +// mode, usually due to a DCHECK or LOG(DFATAL), but returns the +// appropriate fallback value (12 in this case) in opt mode. If you +// need to test that a function has appropriate side-effects in opt +// mode, include assertions against the side-effects. A general +// pattern for this is: +// +// EXPECT_DEBUG_DEATH({ +// // Side-effects here will have an effect after this statement in +// // opt mode, but none in debug mode. 
+// EXPECT_EQ(12, DieInDebugOr12(&sideeffect)); +// }, "death"); +// +#ifdef NDEBUG + +#define EXPECT_DEBUG_DEATH(statement, regex) \ + GTEST_EXECUTE_STATEMENT_(statement, regex) + +#define ASSERT_DEBUG_DEATH(statement, regex) \ + GTEST_EXECUTE_STATEMENT_(statement, regex) + +#else + +#define EXPECT_DEBUG_DEATH(statement, regex) EXPECT_DEATH(statement, regex) + +#define ASSERT_DEBUG_DEATH(statement, regex) ASSERT_DEATH(statement, regex) + +#endif // NDEBUG for EXPECT_DEBUG_DEATH +#endif // GTEST_HAS_DEATH_TEST + +// This macro is used for implementing macros such as +// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where +// death tests are not supported. Those macros must compile on such systems +// if and only if EXPECT_DEATH and ASSERT_DEATH compile with the same parameters +// on systems that support death tests. This allows one to write such a macro on +// a system that does not support death tests and be sure that it will compile +// on a death-test supporting system. It is exposed publicly so that systems +// that have death-tests with stricter requirements than GTEST_HAS_DEATH_TEST +// can write their own equivalent of EXPECT_DEATH_IF_SUPPORTED and +// ASSERT_DEATH_IF_SUPPORTED. +// +// Parameters: +// statement - A statement that a macro such as EXPECT_DEATH would test +// for program termination. This macro has to make sure this +// statement is compiled but not executed, to ensure that +// EXPECT_DEATH_IF_SUPPORTED compiles with a certain +// parameter if and only if EXPECT_DEATH compiles with it. +// regex - A regex that a macro such as EXPECT_DEATH would use to test +// the output of statement. This parameter has to be +// compiled but not evaluated by this macro, to ensure that +// this macro only accepts expressions that a macro such as +// EXPECT_DEATH would accept. +// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED +// and a return statement for ASSERT_DEATH_IF_SUPPORTED. +// This ensures that ASSERT_DEATH_IF_SUPPORTED will not +// compile inside functions where ASSERT_DEATH doesn't +// compile. +// +// The branch that has an always false condition is used to ensure that +// statement and regex are compiled (and thus syntactically correct) but +// never executed. The unreachable code macro protects the terminator +// statement from generating an 'unreachable code' warning in case +// statement unconditionally returns or throws. The Message constructor at +// the end allows the syntax of streaming additional messages into the +// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. +#define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_LOG_(WARNING) << "Death tests are not supported on this platform.\n" \ + << "Statement '" #statement "' cannot be verified."; \ + } else if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::RE::PartialMatch(".*", (regex)); \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + terminator; \ + } else \ + ::testing::Message() + +// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and +// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if +// death tests are supported; otherwise they just issue a warning. This is +// useful when you are combining death test assertions with normal test +// assertions in one test. 
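+//
+// For example (a hypothetical test, not part of this header; the test
+// name and tested function are made up for illustration):
+//
+//   TEST(FooDeathTest, DiesOnNullInput) {
+//     // Runs as a real death test where supported; otherwise only
+//     // issues a warning and skips the check.
+//     EXPECT_DEATH_IF_SUPPORTED(ProcessInput(nullptr), "null input");
+//   }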
+#if GTEST_HAS_DEATH_TEST
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+  EXPECT_DEATH(statement, regex)
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+  ASSERT_DEATH(statement, regex)
+#else
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+  GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+  GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return )
+#endif
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h
new file mode 100644
index 000000000..20be24f43
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h
@@ -0,0 +1,769 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements just enough of the matcher interface to allow
+// EXPECT_DEATH and friends to accept a matcher argument.
+
+// IWYU pragma: private, include "testing/base/public/gunit.h"
+// IWYU pragma: friend third_party/googletest/googlemock/.*
+// IWYU pragma: friend third_party/googletest/googletest/.*
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+#define GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+
+#include <memory>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "gtest/gtest-printers.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
+// MSVC warning C5046 is new as of VS2017 version 15.8.
+#if defined(_MSC_VER) && _MSC_VER >= 1915
+#define GTEST_MAYBE_5046_ 5046
+#else
+#define GTEST_MAYBE_5046_
+#endif
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(
+    4251 GTEST_MAYBE_5046_ /* class A needs to have dll-interface to be used by
+                              clients of class B */
+    /* Symbol involving type with internal linkage not defined */)
+
+namespace testing {
+
+// To implement a matcher Foo for type T, define:
+//   1. a class FooMatcherImpl<T> that implements the
+//      MatcherInterface<T> interface, and
+//   2. a factory function that creates a Matcher<T> object from a
+//      FooMatcherImpl<T>*.
+//
+// The two-level delegation design makes it possible to allow a user
+// to write "v" instead of "Eq(v)" where a Matcher is expected, which
+// is impossible if we pass matchers by pointers. It also eases
+// ownership management as Matcher objects can now be copied like
+// plain values.
+
+// MatchResultListener is an abstract class. Its << operator can be
+// used by a matcher to explain why a value matches or doesn't match.
+//
+class MatchResultListener {
+ public:
+  // Creates a listener object with the given underlying ostream. The
+  // listener does not own the ostream, and does not dereference it
+  // in the constructor or destructor.
+  explicit MatchResultListener(::std::ostream *os) : stream_(os) {}
+  virtual ~MatchResultListener() = 0;  // Makes this class abstract.
+
+  // Streams x to the underlying ostream; does nothing if the ostream
+  // is NULL.
+  template <typename T>
+  MatchResultListener &operator<<(const T &x) {
+    if (stream_ != nullptr) *stream_ << x;
+    return *this;
+  }
+
+  // Returns the underlying ostream.
+  ::std::ostream *stream() { return stream_; }
+
+  // Returns true if and only if the listener is interested in an explanation
+  // of the match result. A matcher's MatchAndExplain() method can use
+  // this information to avoid generating the explanation when no one
+  // intends to hear it.
+  bool IsInterested() const { return stream_ != nullptr; }
+
+ private:
+  ::std::ostream *const stream_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(MatchResultListener);
+};
+
+inline MatchResultListener::~MatchResultListener() {}
+
+// An instance of a subclass of this knows how to describe itself as a
+// matcher.
+class MatcherDescriberInterface {
+ public:
+  virtual ~MatcherDescriberInterface() {}
+
+  // Describes this matcher to an ostream. The function should print
+  // a verb phrase that describes the property a value matching this
+  // matcher should have. The subject of the verb phrase is the value
+  // being matched. For example, the DescribeTo() method of the Gt(7)
+  // matcher prints "is greater than 7".
+  virtual void DescribeTo(::std::ostream *os) const = 0;
+
+  // Describes the negation of this matcher to an ostream. For
+  // example, if the description of this matcher is "is greater than
+  // 7", the negated description could be "is not greater than 7".
+  // You are not required to override this when implementing
+  // MatcherInterface, but it is highly advised so that your matcher
+  // can produce good error messages.
+  virtual void DescribeNegationTo(::std::ostream *os) const {
+    *os << "not (";
+    DescribeTo(os);
+    *os << ")";
+  }
+};
+
+// The implementation of a matcher.
+template <typename T>
+class MatcherInterface : public MatcherDescriberInterface {
+ public:
+  // Returns true if and only if the matcher matches x; also explains the
+  // match result to 'listener' if necessary (see the next paragraph), in
+  // the form of a non-restrictive relative clause ("which ...",
+  // "whose ...", etc) that describes x. For example, the
+  // MatchAndExplain() method of the Pointee(...) matcher should
+  // generate an explanation like "which points to ...".
+  //
+  // Implementations of MatchAndExplain() should add an explanation of
+  // the match result *if and only if* they can provide additional
+  // information that's not already present (or not obvious) in the
+  // print-out of x and the matcher's description. Whether the match
+  // succeeds is not a factor in deciding whether an explanation is
+  // needed, as sometimes the caller needs to print a failure message
+  // when the match succeeds (e.g. when the matcher is used inside
+  // Not()).
+  //
+  // For example, a "has at least 10 elements" matcher should explain
+  // what the actual element count is, regardless of the match result,
+  // as it is useful information to the reader; on the other hand, an
+  // "is empty" matcher probably only needs to explain what the actual
+  // size is when the match fails, as it's redundant to say that the
+  // size is 0 when the value is already known to be empty.
+  //
+  // You should override this method when defining a new matcher.
+  //
+  // It's the responsibility of the caller (Google Test) to guarantee
+  // that 'listener' is not NULL. This helps to simplify a matcher's
+  // implementation when it doesn't care about the performance, as it
+  // can talk to 'listener' without checking its validity first.
+  // However, in order to implement dummy listeners efficiently,
+  // listener->stream() may be NULL.
+  virtual bool MatchAndExplain(T x, MatchResultListener *listener) const = 0;
+
+  // Inherits these methods from MatcherDescriberInterface:
+  //   virtual void DescribeTo(::std::ostream* os) const = 0;
+  //   virtual void DescribeNegationTo(::std::ostream* os) const;
+};
+
+namespace internal {
+
+// Converts a MatcherInterface<T> to a MatcherInterface<const T &>.
+template <typename T>
+class MatcherInterfaceAdapter : public MatcherInterface<const T &> {
+ public:
+  explicit MatcherInterfaceAdapter(const MatcherInterface<T> *impl)
+      : impl_(impl) {}
+  ~MatcherInterfaceAdapter() override { delete impl_; }
+
+  void DescribeTo(::std::ostream *os) const override { impl_->DescribeTo(os); }
+
+  void DescribeNegationTo(::std::ostream *os) const override {
+    impl_->DescribeNegationTo(os);
+  }
+
+  bool MatchAndExplain(const T &x,
+                       MatchResultListener *listener) const override {
+    return impl_->MatchAndExplain(x, listener);
+  }
+
+ private:
+  const MatcherInterface<T> *const impl_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(MatcherInterfaceAdapter);
+};
+
+struct AnyEq {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a == b;
+  }
+};
+struct AnyNe {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a != b;
+  }
+};
+struct AnyLt {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a < b;
+  }
+};
+struct AnyGt {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a > b;
+  }
+};
+struct AnyLe {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a <= b;
+  }
+};
+struct AnyGe {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a >= b;
+  }
+};
+
+// A match result listener that ignores the explanation.
+class DummyMatchResultListener : public MatchResultListener {
+ public:
+  DummyMatchResultListener() : MatchResultListener(nullptr) {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DummyMatchResultListener);
+};
+
+// A match result listener that forwards the explanation to a given
+// ostream. The difference between this and MatchResultListener is
+// that the former is concrete.
+class StreamMatchResultListener : public MatchResultListener {
+ public:
+  explicit StreamMatchResultListener(::std::ostream *os)
+      : MatchResultListener(os) {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener);
+};
+
+// An internal class for implementing Matcher<T>, which will derive
+// from it.
+// We put functionalities common to all Matcher<T>
+// specializations here to avoid code duplication.
+template <typename T>
+class MatcherBase {
+ public:
+  // Returns true if and only if the matcher matches x; also explains the
+  // match result to 'listener'.
+  bool MatchAndExplain(const T &x, MatchResultListener *listener) const {
+    return impl_->MatchAndExplain(x, listener);
+  }
+
+  // Returns true if and only if this matcher matches x.
+  bool Matches(const T &x) const {
+    DummyMatchResultListener dummy;
+    return MatchAndExplain(x, &dummy);
+  }
+
+  // Describes this matcher to an ostream.
+  void DescribeTo(::std::ostream *os) const { impl_->DescribeTo(os); }
+
+  // Describes the negation of this matcher to an ostream.
+  void DescribeNegationTo(::std::ostream *os) const {
+    impl_->DescribeNegationTo(os);
+  }
+
+  // Explains why x matches, or doesn't match, the matcher.
+  void ExplainMatchResultTo(const T &x, ::std::ostream *os) const {
+    StreamMatchResultListener listener(os);
+    MatchAndExplain(x, &listener);
+  }
+
+  // Returns the describer for this matcher object; retains ownership
+  // of the describer, which is only guaranteed to be alive when
+  // this matcher object is alive.
+  const MatcherDescriberInterface *GetDescriber() const { return impl_.get(); }
+
+ protected:
+  MatcherBase() {}
+
+  // Constructs a matcher from its implementation.
+  explicit MatcherBase(const MatcherInterface<const T &> *impl)
+      : impl_(impl) {}
+
+  template <typename U>
+  explicit MatcherBase(
+      const MatcherInterface<U> *impl,
+      typename std::enable_if<!std::is_same<U, const U &>::value>::type * =
+          nullptr)
+      : impl_(new internal::MatcherInterfaceAdapter<U>(impl)) {}
+
+  MatcherBase(const MatcherBase &) = default;
+  MatcherBase &operator=(const MatcherBase &) = default;
+  MatcherBase(MatcherBase &&) = default;
+  MatcherBase &operator=(MatcherBase &&) = default;
+
+  virtual ~MatcherBase() {}
+
+ private:
+  std::shared_ptr<const MatcherInterface<const T &>> impl_;
+};
+
+}  // namespace internal
+
+// A Matcher<T> is a copyable and IMMUTABLE (except by assignment)
+// object that can check whether a value of type T matches. The
+// implementation of Matcher<T> is just a std::shared_ptr to const
+// MatcherInterface<T>. Don't inherit from Matcher!
+template <typename T>
+class Matcher : public internal::MatcherBase<T> {
+ public:
+  // Constructs a null matcher. Needed for storing Matcher objects in STL
+  // containers. A default-constructed matcher is not yet initialized. You
+  // cannot use it until a valid value has been assigned to it.
+  explicit Matcher() {}  // NOLINT
+
+  // Constructs a matcher from its implementation.
+  explicit Matcher(const MatcherInterface<const T &> *impl)
+      : internal::MatcherBase<T>(impl) {}
+
+  template <typename U>
+  explicit Matcher(
+      const MatcherInterface<U> *impl,
+      typename std::enable_if<!std::is_same<U, const U &>::value>::type * =
+          nullptr)
+      : internal::MatcherBase<T>(impl) {}
+
+  // Implicit constructor here allows people to write
+  // EXPECT_CALL(foo, Bar(5)) instead of EXPECT_CALL(foo, Bar(Eq(5))) sometimes
+  Matcher(T value);  // NOLINT
+};
+
+// The following two specializations allow the user to write str
+// instead of Eq(str) and "foo" instead of Eq("foo") when a std::string
+// matcher is expected.
+template <>
+class GTEST_API_ Matcher<const std::string &>
+    : public internal::MatcherBase<const std::string &> {
+ public:
+  Matcher() {}
+
+  explicit Matcher(const MatcherInterface<const std::string &> *impl)
+      : internal::MatcherBase<const std::string &>(impl) {}
+
+  // Allows the user to write str instead of Eq(str) sometimes, where
+  // str is a std::string object.
+  Matcher(const std::string &s);  // NOLINT
+
+  // Allows the user to write "foo" instead of Eq("foo") sometimes.
+  Matcher(const char *s);  // NOLINT
+};
+
+template <>
+class GTEST_API_ Matcher<std::string>
+    : public internal::MatcherBase<std::string> {
+ public:
+  Matcher() {}
+
+  explicit Matcher(const MatcherInterface<const std::string &> *impl)
+      : internal::MatcherBase<std::string>(impl) {}
+  explicit Matcher(const MatcherInterface<std::string> *impl)
+      : internal::MatcherBase<std::string>(impl) {}
+
+  // Allows the user to write str instead of Eq(str) sometimes, where
+  // str is a string object.
+  Matcher(const std::string &s);  // NOLINT
+
+  // Allows the user to write "foo" instead of Eq("foo") sometimes.
+  Matcher(const char *s);  // NOLINT
+};
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+// The following two specializations allow the user to write str
+// instead of Eq(str) and "foo" instead of Eq("foo") when an absl::string_view
+// matcher is expected.
+template <>
+class GTEST_API_ Matcher<const internal::StringView &>
+    : public internal::MatcherBase<const internal::StringView &> {
+ public:
+  Matcher() {}
+
+  explicit Matcher(const MatcherInterface<const internal::StringView &> *impl)
+      : internal::MatcherBase<const internal::StringView &>(impl) {}
+
+  // Allows the user to write str instead of Eq(str) sometimes, where
+  // str is a std::string object.
+  Matcher(const std::string &s);  // NOLINT
+
+  // Allows the user to write "foo" instead of Eq("foo") sometimes.
+  Matcher(const char *s);  // NOLINT
+
+  // Allows the user to pass absl::string_views or std::string_views directly.
+  Matcher(internal::StringView s);  // NOLINT
+};
+
+template <>
+class GTEST_API_ Matcher<internal::StringView>
+    : public internal::MatcherBase<internal::StringView> {
+ public:
+  Matcher() {}
+
+  explicit Matcher(const MatcherInterface<const internal::StringView &> *impl)
+      : internal::MatcherBase<internal::StringView>(impl) {}
+  explicit Matcher(const MatcherInterface<internal::StringView> *impl)
+      : internal::MatcherBase<internal::StringView>(impl) {}
+
+  // Allows the user to write str instead of Eq(str) sometimes, where
+  // str is a std::string object.
+  Matcher(const std::string &s);  // NOLINT
+
+  // Allows the user to write "foo" instead of Eq("foo") sometimes.
+  Matcher(const char *s);  // NOLINT
+
+  // Allows the user to pass absl::string_views or std::string_views directly.
+  Matcher(internal::StringView s);  // NOLINT
+};
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+// Prints a matcher in a human-readable format.
+template <typename T>
+std::ostream &operator<<(std::ostream &os, const Matcher<T> &matcher) {
+  matcher.DescribeTo(&os);
+  return os;
+}
+
+// The PolymorphicMatcher class template makes it easy to implement a
+// polymorphic matcher (i.e. a matcher that can match values of more
+// than one type, e.g. Eq(n) and NotNull()).
+//
+// To define a polymorphic matcher, a user should provide an Impl
+// class that has a DescribeTo() method and a DescribeNegationTo()
+// method, and define a member function (or member function template)
+//
+//   bool MatchAndExplain(const Value& value,
+//                        MatchResultListener* listener) const;
+//
+// See the definition of NotNull() for a complete example.
+template <class Impl>
+class PolymorphicMatcher {
+ public:
+  explicit PolymorphicMatcher(const Impl &an_impl) : impl_(an_impl) {}
+
+  // Returns a mutable reference to the underlying matcher
+  // implementation object.
+  Impl &mutable_impl() { return impl_; }
+
+  // Returns an immutable reference to the underlying matcher
+  // implementation object.
+ const Impl &impl() const { return impl_; } + + template + operator Matcher() const { + return Matcher(new MonomorphicImpl(impl_)); + } + + private: + template + class MonomorphicImpl : public MatcherInterface { + public: + explicit MonomorphicImpl(const Impl &impl) : impl_(impl) {} + + void DescribeTo(::std::ostream *os) const override { impl_.DescribeTo(os); } + + void DescribeNegationTo(::std::ostream *os) const override { + impl_.DescribeNegationTo(os); + } + + bool MatchAndExplain(T x, MatchResultListener *listener) const override { + return impl_.MatchAndExplain(x, listener); + } + + private: + const Impl impl_; + }; + + Impl impl_; +}; + +// Creates a matcher from its implementation. +// DEPRECATED: Especially in the generic code, prefer: +// Matcher(new MyMatcherImpl(...)); +// +// MakeMatcher may create a Matcher that accepts its argument by value, which +// leads to unnecessary copies & lack of support for non-copyable types. +template +inline Matcher MakeMatcher(const MatcherInterface *impl) { + return Matcher(impl); +} + +// Creates a polymorphic matcher from its implementation. This is +// easier to use than the PolymorphicMatcher constructor as it +// doesn't require you to explicitly write the template argument, e.g. +// +// MakePolymorphicMatcher(foo); +// vs +// PolymorphicMatcher(foo); +template +inline PolymorphicMatcher MakePolymorphicMatcher(const Impl &impl) { + return PolymorphicMatcher(impl); +} + +namespace internal { +// Implements a matcher that compares a given value with a +// pre-supplied value using one of the ==, <=, <, etc, operators. The +// two values being compared don't have to have the same type. +// +// The matcher defined here is polymorphic (for example, Eq(5) can be +// used to match an int, a short, a double, etc). Therefore we use +// a template type conversion operator in the implementation. +// +// The following template definition assumes that the Rhs parameter is +// a "bare" type (i.e. neither 'const T' nor 'T&'). 
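+//
+// For example, Eq(5) stores the int 5 and can convert, on demand, to
+// Matcher<int>, Matcher<long>, or Matcher<double>:
+//
+//   Matcher<double> m = Eq(5);  // matches any double equal to 5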
+template <typename D, typename Rhs, typename Op>
+class ComparisonBase {
+ public:
+  explicit ComparisonBase(const Rhs &rhs) : rhs_(rhs) {}
+  template <typename Lhs>
+  operator Matcher<Lhs>() const {
+    return Matcher<Lhs>(new Impl<Lhs>(rhs_));
+  }
+
+ private:
+  template <typename T>
+  static const T &Unwrap(const T &v) { return v; }
+  template <typename T>
+  static const T &Unwrap(std::reference_wrapper<T> v) { return v; }
+
+  template <typename Lhs>
+  class Impl : public MatcherInterface<Lhs> {
+   public:
+    explicit Impl(const Rhs &rhs) : rhs_(rhs) {}
+    bool MatchAndExplain(Lhs lhs,
+                         MatchResultListener * /* listener */) const override {
+      return Op()(lhs, Unwrap(rhs_));
+    }
+    void DescribeTo(::std::ostream *os) const override {
+      *os << D::Desc() << " ";
+      UniversalPrint(Unwrap(rhs_), os);
+    }
+    void DescribeNegationTo(::std::ostream *os) const override {
+      *os << D::NegatedDesc() << " ";
+      UniversalPrint(Unwrap(rhs_), os);
+    }
+
+   private:
+    Rhs rhs_;
+  };
+  Rhs rhs_;
+};
+
+template <typename Rhs>
+class EqMatcher : public ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq> {
+ public:
+  explicit EqMatcher(const Rhs &rhs)
+      : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) {}
+  static const char *Desc() { return "is equal to"; }
+  static const char *NegatedDesc() { return "isn't equal to"; }
+};
+template <typename Rhs>
+class NeMatcher : public ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe> {
+ public:
+  explicit NeMatcher(const Rhs &rhs)
+      : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) {}
+  static const char *Desc() { return "isn't equal to"; }
+  static const char *NegatedDesc() { return "is equal to"; }
+};
+template <typename Rhs>
+class LtMatcher : public ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt> {
+ public:
+  explicit LtMatcher(const Rhs &rhs)
+      : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) {}
+  static const char *Desc() { return "is <"; }
+  static const char *NegatedDesc() { return "isn't <"; }
+};
+template <typename Rhs>
+class GtMatcher : public ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt> {
+ public:
+  explicit GtMatcher(const Rhs &rhs)
+      : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) {}
+  static const char *Desc() { return "is >"; }
+  static const char *NegatedDesc() { return "isn't >"; }
+};
+template <typename Rhs>
+class LeMatcher : public ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe> {
+ public:
+  explicit LeMatcher(const Rhs &rhs)
+      : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) {}
+  static const char *Desc() { return "is <="; }
+  static const char *NegatedDesc() { return "isn't <="; }
+};
+template <typename Rhs>
+class GeMatcher : public ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe> {
+ public:
+  explicit GeMatcher(const Rhs &rhs)
+      : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) {}
+  static const char *Desc() { return "is >="; }
+  static const char *NegatedDesc() { return "isn't >="; }
+};
+
+// Implements polymorphic matchers MatchesRegex(regex) and
+// ContainsRegex(regex), which can be used as a Matcher<T> as long as
+// T can be converted to a string.
+class MatchesRegexMatcher {
+ public:
+  MatchesRegexMatcher(const RE *regex, bool full_match)
+      : regex_(regex), full_match_(full_match) {}
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+  bool MatchAndExplain(const internal::StringView &s,
+                       MatchResultListener *listener) const {
+    return MatchAndExplain(std::string(s), listener);
+  }
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+  // Accepts pointer types, particularly:
+  //   const char*
+  //   char*
+  //   const wchar_t*
+  //   wchar_t*
+  template <typename CharType>
+  bool MatchAndExplain(CharType *s, MatchResultListener *listener) const {
+    return s != nullptr && MatchAndExplain(std::string(s), listener);
+  }
+
+  // Matches anything that can convert to std::string.
+  //
+  // This is a template, not just a plain function with const std::string&,
+  // because absl::string_view has some interfering non-explicit constructors.
+ template + bool MatchAndExplain(const MatcheeStringType &s, + MatchResultListener * /* listener */) const { + const std::string &s2(s); + return full_match_ ? RE::FullMatch(s2, *regex_) + : RE::PartialMatch(s2, *regex_); + } + + void DescribeTo(::std::ostream *os) const { + *os << (full_match_ ? "matches" : "contains") << " regular expression "; + UniversalPrinter::Print(regex_->pattern(), os); + } + + void DescribeNegationTo(::std::ostream *os) const { + *os << "doesn't " << (full_match_ ? "match" : "contain") + << " regular expression "; + UniversalPrinter::Print(regex_->pattern(), os); + } + + private: + const std::shared_ptr regex_; + const bool full_match_; +}; +} // namespace internal + +// Matches a string that fully matches regular expression 'regex'. +// The matcher takes ownership of 'regex'. +inline PolymorphicMatcher MatchesRegex( + const internal::RE *regex) { + return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, true)); +} +inline PolymorphicMatcher MatchesRegex( + const std::string ®ex) { + return MatchesRegex(new internal::RE(regex)); +} + +// Matches a string that contains regular expression 'regex'. +// The matcher takes ownership of 'regex'. +inline PolymorphicMatcher ContainsRegex( + const internal::RE *regex) { + return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, false)); +} +inline PolymorphicMatcher ContainsRegex( + const std::string ®ex) { + return ContainsRegex(new internal::RE(regex)); +} + +// Creates a polymorphic matcher that matches anything equal to x. +// Note: if the parameter of Eq() were declared as const T&, Eq("foo") +// wouldn't compile. +template +inline internal::EqMatcher Eq(T x) { + return internal::EqMatcher(x); +} + +// Constructs a Matcher from a 'value' of type T. The constructed +// matcher matches any value that's equal to 'value'. +template +Matcher::Matcher(T value) { + *this = Eq(value); +} + +// Creates a monomorphic matcher that matches anything with type Lhs +// and equal to rhs. A user may need to use this instead of Eq(...) +// in order to resolve an overloading ambiguity. +// +// TypedEq(x) is just a convenient short-hand for Matcher(Eq(x)) +// or Matcher(x), but more readable than the latter. +// +// We could define similar monomorphic matchers for other comparison +// operations (e.g. TypedLt, TypedGe, and etc), but decided not to do +// it yet as those are used much less than Eq() in practice. A user +// can always write Matcher(Lt(5)) to be explicit about the type, +// for example. +template +inline Matcher TypedEq(const Rhs &rhs) { + return Eq(rhs); +} + +// Creates a polymorphic matcher that matches anything >= x. +template +inline internal::GeMatcher Ge(Rhs x) { + return internal::GeMatcher(x); +} + +// Creates a polymorphic matcher that matches anything > x. +template +inline internal::GtMatcher Gt(Rhs x) { + return internal::GtMatcher(x); +} + +// Creates a polymorphic matcher that matches anything <= x. +template +inline internal::LeMatcher Le(Rhs x) { + return internal::LeMatcher(x); +} + +// Creates a polymorphic matcher that matches anything < x. +template +inline internal::LtMatcher Lt(Rhs x) { + return internal::LtMatcher(x); +} + +// Creates a polymorphic matcher that matches anything != x. 
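+// For example, Ne(5) can serve as a Matcher<int>, Matcher<long>, or
+// Matcher<double> that matches any value other than 5.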
+template +inline internal::NeMatcher Ne(Rhs x) { + return internal::NeMatcher(x); +} +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 5046 + +#endif // GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-message.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-message.h new file mode 100644 index 000000000..713facae8 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-message.h @@ -0,0 +1,217 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the Message class. +// +// IMPORTANT NOTE: Due to limitation of the C++ language, we have to +// leave some internal implementation details in this header file. +// They are clearly marked by comments like this: +// +// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// +// Such code is NOT meant to be used by a user directly, and is subject +// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user +// program! + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ + +#include +#include +#include + +#include "gtest/internal/gtest-port.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +// Ensures that there is at least one operator<< in the global namespace. +// See Message& operator<<(...) below for why. +void operator<<(const testing::internal::Secret &, int); + +namespace testing { + +// The Message class works like an ostream repeater. +// +// Typical usage: +// +// 1. You stream a bunch of values to a Message object. +// It will remember the text in a stringstream. +// 2. Then you stream the Message object to an ostream. +// This causes the text in the Message to be streamed +// to the ostream. 
+// +// For example; +// +// testing::Message foo; +// foo << 1 << " != " << 2; +// std::cout << foo; +// +// will print "1 != 2". +// +// Message is not intended to be inherited from. In particular, its +// destructor is not virtual. +// +// Note that stringstream behaves differently in gcc and in MSVC. You +// can stream a NULL char pointer to it in the former, but not in the +// latter (it causes an access violation if you do). The Message +// class hides this difference by treating a NULL char pointer as +// "(null)". +class GTEST_API_ Message { + private: + // The type of basic IO manipulators (endl, ends, and flush) for + // narrow streams. + typedef std::ostream &(*BasicNarrowIoManip)(std::ostream &); + + public: + // Constructs an empty Message. + Message(); + + // Copy constructor. + Message(const Message &msg) : ss_(new ::std::stringstream) { // NOLINT + *ss_ << msg.GetString(); + } + + // Constructs a Message from a C-string. + explicit Message(const char *str) : ss_(new ::std::stringstream) { + *ss_ << str; + } + + // Streams a non-pointer value to this object. + template + inline Message &operator<<(const T &val) { + // Some libraries overload << for STL containers. These + // overloads are defined in the global namespace instead of ::std. + // + // C++'s symbol lookup rule (i.e. Koenig lookup) says that these + // overloads are visible in either the std namespace or the global + // namespace, but not other namespaces, including the testing + // namespace which Google Test's Message class is in. + // + // To allow STL containers (and other types that has a << operator + // defined in the global namespace) to be used in Google Test + // assertions, testing::Message must access the custom << operator + // from the global namespace. With this using declaration, + // overloads of << defined in the global namespace and those + // visible via Koenig lookup are both exposed in this function. + using ::operator<<; + *ss_ << val; + return *this; + } + + // Streams a pointer value to this object. + // + // This function is an overload of the previous one. When you + // stream a pointer to a Message, this definition will be used as it + // is more specialized. (The C++ Standard, section + // [temp.func.order].) If you stream a non-pointer, then the + // previous definition will be used. + // + // The reason for this overload is that streaming a NULL pointer to + // ostream is undefined behavior. Depending on the compiler, you + // may get "0", "(nil)", "(null)", or an access violation. To + // ensure consistent result across compilers, we always treat NULL + // as "(null)". + template + inline Message &operator<<(T *const &pointer) { // NOLINT + if (pointer == nullptr) { + *ss_ << "(null)"; + } else { + *ss_ << pointer; + } + return *this; + } + + // Since the basic IO manipulators are overloaded for both narrow + // and wide streams, we have to provide this specialized definition + // of operator <<, even though its body is the same as the + // templatized version above. Without this definition, streaming + // endl or other basic IO manipulators to Message will confuse the + // compiler. + Message &operator<<(BasicNarrowIoManip val) { + *ss_ << val; + return *this; + } + + // Instead of 1/0, we want to see true/false for bool values. + Message &operator<<(bool b) { return *this << (b ? "true" : "false"); } + + // These two overloads allow streaming a wide C string to a Message + // using the UTF-8 encoding. 
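+  // For example, (Message() << L"café").GetString() yields the UTF-8
+  // encoding of "café".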
+ Message &operator<<(const wchar_t *wide_c_str); + Message &operator<<(wchar_t *wide_c_str); + +#if GTEST_HAS_STD_WSTRING + // Converts the given wide string to a narrow string using the UTF-8 + // encoding, and streams the result to this Message object. + Message &operator<<(const ::std::wstring &wstr); +#endif // GTEST_HAS_STD_WSTRING + + // Gets the text streamed to this object so far as an std::string. + // Each '\0' character in the buffer is replaced with "\\0". + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + std::string GetString() const; + + private: + // We'll hold the text streamed to this object here. + const std::unique_ptr< ::std::stringstream> ss_; + + // We declare (but don't implement) this to prevent the compiler + // from implementing the assignment operator. + void operator=(const Message &); +}; + +// Streams a Message to an ostream. +inline std::ostream &operator<<(std::ostream &os, const Message &sb) { + return os << sb.GetString(); +} + +namespace internal { + +// Converts a streamable value to an std::string. A NULL pointer is +// converted to "(null)". When the input value is a ::string, +// ::std::string, ::wstring, or ::std::wstring object, each NUL +// character in it is replaced with "\\0". +template +std::string StreamableToString(const T &streamable) { + return (Message() << streamable).GetString(); +} + +} // namespace internal +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h new file mode 100644 index 000000000..8d01df525 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h @@ -0,0 +1,507 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Macros and functions for implementing parameterized tests +// in Google C++ Testing and Mocking Framework (Google Test) +// +// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +// GOOGLETEST_CM0001 DO NOT DELETE +#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ + +// Value-parameterized tests allow you to test your code with different +// parameters without writing multiple copies of the same test. +// +// Here is how you use value-parameterized tests: + +#if 0 + +// To write value-parameterized tests, first you should define a fixture +// class. It is usually derived from testing::TestWithParam (see below for +// another inheritance scheme that's sometimes useful in more complicated +// class hierarchies), where the type of your parameter values. +// TestWithParam is itself derived from testing::Test. T can be any +// copyable type. If it's a raw pointer, you are responsible for managing the +// lifespan of the pointed values. + +class FooTest : public ::testing::TestWithParam { + // You can implement all the usual class fixture members here. +}; + +// Then, use the TEST_P macro to define as many parameterized tests +// for this fixture as you want. The _P suffix is for "parameterized" +// or "pattern", whichever you prefer to think. + +TEST_P(FooTest, DoesBlah) { + // Inside a test, access the test parameter with the GetParam() method + // of the TestWithParam class: + EXPECT_TRUE(foo.Blah(GetParam())); + ... +} + +TEST_P(FooTest, HasBlahBlah) { + ... +} + +// Finally, you can use INSTANTIATE_TEST_SUITE_P to instantiate the test +// case with any set of parameters you want. Google Test defines a number +// of functions for generating test parameters. They return what we call +// (surprise!) parameter generators. Here is a summary of them, which +// are all in the testing namespace: +// +// +// Range(begin, end [, step]) - Yields values {begin, begin+step, +// begin+step+step, ...}. The values do not +// include end. step defaults to 1. +// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}. +// ValuesIn(container) - Yields values from a C-style array, an STL +// ValuesIn(begin,end) container, or an iterator range [begin, end). +// Bool() - Yields sequence {false, true}. +// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product +// for the math savvy) of the values generated +// by the N generators. +// +// For more details, see comments at the definitions of these functions below +// in this file. +// +// The following statement will instantiate tests from the FooTest test suite +// each with parameter values "meeny", "miny", and "moe". + +INSTANTIATE_TEST_SUITE_P(InstantiationName, + FooTest, + Values("meeny", "miny", "moe")); + +// To distinguish different instances of the pattern, (yes, you +// can instantiate it more than once) the first argument to the +// INSTANTIATE_TEST_SUITE_P macro is a prefix that will be added to the +// actual test suite name. Remember to pick unique prefixes for different +// instantiations. The tests from the instantiation above will have +// these names: +// +// * InstantiationName/FooTest.DoesBlah/0 for "meeny" +// * InstantiationName/FooTest.DoesBlah/1 for "miny" +// * InstantiationName/FooTest.DoesBlah/2 for "moe" +// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny" +// * InstantiationName/FooTest.HasBlahBlah/1 for "miny" +// * InstantiationName/FooTest.HasBlahBlah/2 for "moe" +// +// You can use these names in --gtest_filter. 
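+//
+// For example, --gtest_filter=InstantiationName/FooTest.DoesBlah/* selects
+// the three DoesBlah runs above, while --gtest_filter=*/FooTest.DoesBlah/1
+// selects just the "miny" one.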
+// +// This statement will instantiate all tests from FooTest again, each +// with parameter values "cat" and "dog": + +const char* pets[] = {"cat", "dog"}; +INSTANTIATE_TEST_SUITE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); + +// The tests from the instantiation above will have these names: +// +// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog" +// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog" +// +// Please note that INSTANTIATE_TEST_SUITE_P will instantiate all tests +// in the given test suite, whether their definitions come before or +// AFTER the INSTANTIATE_TEST_SUITE_P statement. +// +// Please also note that generator expressions (including parameters to the +// generators) are evaluated in InitGoogleTest(), after main() has started. +// This allows the user on one hand, to adjust generator parameters in order +// to dynamically determine a set of tests to run and on the other hand, +// give the user a chance to inspect the generated tests with Google Test +// reflection API before RUN_ALL_TESTS() is executed. +// +// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc +// for more examples. +// +// In the future, we plan to publish the API for defining new parameter +// generators. But for now this interface remains part of the internal +// implementation and is subject to change. +// +// +// A parameterized test fixture must be derived from testing::Test and from +// testing::WithParamInterface, where T is the type of the parameter +// values. Inheriting from TestWithParam satisfies that requirement because +// TestWithParam inherits from both Test and WithParamInterface. In more +// complicated hierarchies, however, it is occasionally useful to inherit +// separately from Test and WithParamInterface. For example: + +class BaseTest : public ::testing::Test { + // You can inherit all the usual members for a non-parameterized test + // fixture here. +}; + +class DerivedTest : public BaseTest, public ::testing::WithParamInterface { + // The usual test fixture members go here too. +}; + +TEST_F(BaseTest, HasFoo) { + // This is an ordinary non-parameterized test. +} + +TEST_P(DerivedTest, DoesBlah) { + // GetParam works just the same here as if you inherit from TestWithParam. + EXPECT_TRUE(foo.Blah(GetParam())); +} + +#endif // 0 + +#include +#include + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-param-util.h" +#include "gtest/internal/gtest-port.h" + +namespace testing { + +// Functions producing parameter generators. +// +// Google Test uses these generators to produce parameters for value- +// parameterized tests. When a parameterized test suite is instantiated +// with a particular generator, Google Test creates and runs tests +// for each element in the sequence produced by the generator. +// +// In the following sample, tests from test suite FooTest are instantiated +// each three times with parameter values 3, 5, and 8: +// +// class FooTest : public TestWithParam { ... }; +// +// TEST_P(FooTest, TestThis) { +// } +// TEST_P(FooTest, TestThat) { +// } +// INSTANTIATE_TEST_SUITE_P(TestSequence, FooTest, Values(3, 5, 8)); +// + +// Range() returns generators providing sequences of values in a range. +// +// Synopsis: +// Range(start, end) +// - returns a generator producing a sequence of values {start, start+1, +// start+2, ..., }. 
+// Range(start, end, step) +// - returns a generator producing a sequence of values {start, start+step, +// start+step+step, ..., }. +// Notes: +// * The generated sequences never include end. For example, Range(1, 5) +// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2) +// returns a generator producing {1, 3, 5, 7}. +// * start and end must have the same type. That type may be any integral or +// floating-point type or a user defined type satisfying these conditions: +// * It must be assignable (have operator=() defined). +// * It must have operator+() (operator+(int-compatible type) for +// two-operand version). +// * It must have operator<() defined. +// Elements in the resulting sequences will also have that type. +// * Condition start < end must be satisfied in order for resulting sequences +// to contain any elements. +// +template +internal::ParamGenerator Range(T start, T end, IncrementT step) { + return internal::ParamGenerator( + new internal::RangeGenerator(start, end, step)); +} + +template +internal::ParamGenerator Range(T start, T end) { + return Range(start, end, 1); +} + +// ValuesIn() function allows generation of tests with parameters coming from +// a container. +// +// Synopsis: +// ValuesIn(const T (&array)[N]) +// - returns a generator producing sequences with elements from +// a C-style array. +// ValuesIn(const Container& container) +// - returns a generator producing sequences with elements from +// an STL-style container. +// ValuesIn(Iterator begin, Iterator end) +// - returns a generator producing sequences with elements from +// a range [begin, end) defined by a pair of STL-style iterators. These +// iterators can also be plain C pointers. +// +// Please note that ValuesIn copies the values from the containers +// passed in and keeps them to generate tests in RUN_ALL_TESTS(). +// +// Examples: +// +// This instantiates tests from test suite StringTest +// each with C-string values of "foo", "bar", and "baz": +// +// const char* strings[] = {"foo", "bar", "baz"}; +// INSTANTIATE_TEST_SUITE_P(StringSequence, StringTest, ValuesIn(strings)); +// +// This instantiates tests from test suite StlStringTest +// each with STL strings with values "a" and "b": +// +// ::std::vector< ::std::string> GetParameterStrings() { +// ::std::vector< ::std::string> v; +// v.push_back("a"); +// v.push_back("b"); +// return v; +// } +// +// INSTANTIATE_TEST_SUITE_P(CharSequence, +// StlStringTest, +// ValuesIn(GetParameterStrings())); +// +// +// This will also instantiate tests from CharTest +// each with parameter values 'a' and 'b': +// +// ::std::list GetParameterChars() { +// ::std::list list; +// list.push_back('a'); +// list.push_back('b'); +// return list; +// } +// ::std::list l = GetParameterChars(); +// INSTANTIATE_TEST_SUITE_P(CharSequence2, +// CharTest, +// ValuesIn(l.begin(), l.end())); +// +template +internal::ParamGenerator< + typename std::iterator_traits::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end) { + typedef typename std::iterator_traits::value_type ParamType; + return internal::ParamGenerator( + new internal::ValuesInIteratorRangeGenerator(begin, end)); +} + +template +internal::ParamGenerator ValuesIn(const T (&array)[N]) { + return ValuesIn(array, array + N); +} + +template +internal::ParamGenerator ValuesIn( + const Container &container) { + return ValuesIn(container.begin(), container.end()); +} + +// Values() allows generating tests from explicitly specified list of +// parameters. 
+// +// Synopsis: +// Values(T v1, T v2, ..., T vN) +// - returns a generator producing sequences with elements v1, v2, ..., vN. +// +// For example, this instantiates tests from test suite BarTest each +// with values "one", "two", and "three": +// +// INSTANTIATE_TEST_SUITE_P(NumSequence, +// BarTest, +// Values("one", "two", "three")); +// +// This instantiates tests from test suite BazTest each with values 1, 2, 3.5. +// The exact type of values will depend on the type of parameter in BazTest. +// +// INSTANTIATE_TEST_SUITE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); +// +// +template +internal::ValueArray Values(T... v) { + return internal::ValueArray(std::move(v)...); +} + +// Bool() allows generating tests with parameters in a set of (false, true). +// +// Synopsis: +// Bool() +// - returns a generator producing sequences with elements {false, true}. +// +// It is useful when testing code that depends on Boolean flags. Combinations +// of multiple flags can be tested when several Bool()'s are combined using +// Combine() function. +// +// In the following example all tests in the test suite FlagDependentTest +// will be instantiated twice with parameters false and true. +// +// class FlagDependentTest : public testing::TestWithParam { +// virtual void SetUp() { +// external_flag = GetParam(); +// } +// } +// INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, Bool()); +// +inline internal::ParamGenerator Bool() { return Values(false, true); } + +// Combine() allows the user to combine two or more sequences to produce +// values of a Cartesian product of those sequences' elements. +// +// Synopsis: +// Combine(gen1, gen2, ..., genN) +// - returns a generator producing sequences with elements coming from +// the Cartesian product of elements from the sequences generated by +// gen1, gen2, ..., genN. The sequence elements will have a type of +// std::tuple where T1, T2, ..., TN are the types +// of elements from sequences produces by gen1, gen2, ..., genN. +// +// Combine can have up to 10 arguments. +// +// Example: +// +// This will instantiate tests in test suite AnimalTest each one with +// the parameter values tuple("cat", BLACK), tuple("cat", WHITE), +// tuple("dog", BLACK), and tuple("dog", WHITE): +// +// enum Color { BLACK, GRAY, WHITE }; +// class AnimalTest +// : public testing::TestWithParam > {...}; +// +// TEST_P(AnimalTest, AnimalLooksNice) {...} +// +// INSTANTIATE_TEST_SUITE_P(AnimalVariations, AnimalTest, +// Combine(Values("cat", "dog"), +// Values(BLACK, WHITE))); +// +// This will instantiate tests in FlagDependentTest with all variations of two +// Boolean flags: +// +// class FlagDependentTest +// : public testing::TestWithParam > { +// virtual void SetUp() { +// // Assigns external_flag_1 and external_flag_2 values from the tuple. +// std::tie(external_flag_1, external_flag_2) = GetParam(); +// } +// }; +// +// TEST_P(FlagDependentTest, TestFeature1) { +// // Test your code using external_flag_1 and external_flag_2 here. +// } +// INSTANTIATE_TEST_SUITE_P(TwoBoolSequence, FlagDependentTest, +// Combine(Bool(), Bool())); +// +template +internal::CartesianProductHolder Combine(const Generator &... 
g) { + return internal::CartesianProductHolder(g...); +} + +#define TEST_P(test_suite_name, test_name) \ + class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + : public test_suite_name { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {} \ + void TestBody() override; \ + \ + private: \ + static int AddToRegistry() { \ + ::testing::UnitTest::GetInstance() \ + ->parameterized_test_registry() \ + .GetTestSuitePatternHolder( \ + GTEST_STRINGIFY_(test_suite_name), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ + ->AddTestPattern( \ + GTEST_STRINGIFY_(test_suite_name), GTEST_STRINGIFY_(test_name), \ + new ::testing::internal::TestMetaFactory()); \ + return 0; \ + } \ + static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)); \ + }; \ + int GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)::gtest_registering_dummy_ = \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::AddToRegistry(); \ + void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody() + +// The last argument to INSTANTIATE_TEST_SUITE_P allows the user to specify +// generator and an optional function or functor that generates custom test name +// suffixes based on the test parameters. Such a function or functor should +// accept one argument of type testing::TestParamInfo, and +// return std::string. +// +// testing::PrintToStringParamName is a builtin test suffix generator that +// returns the value of testing::PrintToString(GetParam()). +// +// Note: test names must be non-empty, unique, and may only contain ASCII +// alphanumeric characters or underscore. Because PrintToString adds quotes +// to std::string and C strings, it won't work for these types. + +#define GTEST_EXPAND_(arg) arg +#define GTEST_GET_FIRST_(first, ...) first +#define GTEST_GET_SECOND_(first, second, ...) second + +#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \ + static ::testing::internal::ParamGenerator \ + gtest_##prefix##test_suite_name##_EvalGenerator_() { \ + return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \ + } \ + static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \ + const ::testing::TestParamInfo &info) { \ + if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \ + __VA_ARGS__, \ + ::testing::internal::DefaultParamName, \ + DUMMY_PARAM_))); \ + auto t = std::make_tuple(__VA_ARGS__); \ + static_assert(std::tuple_size::value <= 2, \ + "Too Many Args!"); \ + } \ + return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \ + __VA_ARGS__, \ + ::testing::internal::DefaultParamName, \ + DUMMY_PARAM_))))(info); \ + } \ + static int gtest_##prefix##test_suite_name##_dummy_ \ + GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::UnitTest::GetInstance() \ + ->parameterized_test_registry() \ + .GetTestSuitePatternHolder( \ + GTEST_STRINGIFY_(test_suite_name), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ + ->AddTestSuiteInstantiation( \ + GTEST_STRINGIFY_(prefix), \ + >est_##prefix##test_suite_name##_EvalGenerator_, \ + >est_##prefix##test_suite_name##_EvalGenerateName_, \ + __FILE__, __LINE__) + +// Allow Marking a Parameterized test class as not needing to be instantiated. 
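+// For example:
+//   GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(FooTest);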
+#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \ + namespace gtest_do_not_use_outside_namespace_scope {} \ + static const ::testing::internal::MarkAsIgnored gtest_allow_ignore_##T( \ + GTEST_STRINGIFY_(T)) + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define INSTANTIATE_TEST_CASE_P \ + static_assert(::testing::internal::InstantiateTestCase_P_IsDeprecated(), \ + ""); \ + INSTANTIATE_TEST_SUITE_P +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-printers.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-printers.h new file mode 100644 index 000000000..950247cf6 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-printers.h @@ -0,0 +1,925 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Google Test - The Google C++ Testing and Mocking Framework +// +// This file implements a universal value printer that can print a +// value of any type T: +// +// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); +// +// A user can teach this function how to print a class type T by +// defining either operator<<() or PrintTo() in the namespace that +// defines T. More specifically, the FIRST defined function in the +// following list will be used (assuming T is defined in namespace +// foo): +// +// 1. foo::PrintTo(const T&, ostream*) +// 2. operator<<(ostream&, const T&) defined in either foo or the +// global namespace. +// +// However if T is an STL-style container then it is printed element-wise +// unless foo::PrintTo(const T&, ostream*) is defined. Note that +// operator<<() is ignored for container types. +// +// If none of the above is defined, it will print the debug string of +// the value if it is a protocol buffer, or print the raw bytes in the +// value otherwise. 
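+//
+// For example (a sketch; foo::Bar and its 'id' field are hypothetical),
+// option 1 above could look like:
+//
+//   namespace foo {
+//   void PrintTo(const Bar& bar, ::std::ostream* os) {
+//     *os << "Bar(" << bar.id << ")";  // 'id' is a hypothetical field
+//   }
+//   }  // namespace foo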
+// +// To aid debugging: when T is a reference type, the address of the +// value is also printed; when T is a (const) char pointer, both the +// pointer value and the NUL-terminated string it points to are +// printed. +// +// We also provide some convenient wrappers: +// +// // Prints a value to a string. For a (const or not) char +// // pointer, the NUL-terminated string (but not the pointer) is +// // printed. +// std::string ::testing::PrintToString(const T& value); +// +// // Prints a value tersely: for a reference type, the referenced +// // value (but not the address) is printed; for a (const or not) char +// // pointer, the NUL-terminated string (but not the pointer) is +// // printed. +// void ::testing::internal::UniversalTersePrint(const T& value, ostream*); +// +// // Prints value using the type inferred by the compiler. The difference +// // from UniversalTersePrint() is that this function prints both the +// // pointer and the NUL-terminated string for a (const or not) char pointer. +// void ::testing::internal::UniversalPrint(const T& value, ostream*); +// +// // Prints the fields of a tuple tersely to a string vector, one +// // element for each field. Tuple support must be enabled in +// // gtest-port.h. +// std::vector UniversalTersePrintTupleFieldsToStrings( +// const Tuple& value); +// +// Known limitation: +// +// The print primitives print the elements of an STL-style container +// using the compiler-inferred type of *iter where iter is a +// const_iterator of the container. When const_iterator is an input +// iterator but not a forward iterator, this inferred type may not +// match value_type, and the print output may be incorrect. In +// practice, this is rarely a problem as for most containers +// const_iterator is a forward iterator. We'll fix this if there's an +// actual need for it. Note that this fix cannot rely on value_type +// being defined as many user-defined container types don't have +// value_type. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ + +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" + +#if GTEST_HAS_ABSL +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "absl/types/variant.h" +#endif // GTEST_HAS_ABSL + +namespace testing { + +// Definitions in the 'internal' and 'internal2' name spaces are +// subject to change without notice. DO NOT USE THEM IN USER CODE! +namespace internal2 { + +// Prints the given number of bytes in the given object to the given +// ostream. +GTEST_API_ void PrintBytesInObjectTo(const unsigned char *obj_bytes, + size_t count, ::std::ostream *os); + +// For selecting which printer to use when a given type has neither << +// nor PrintTo(). +enum TypeKind { + kProtobuf, // a protobuf type + kConvertibleToInteger, // a type implicitly convertible to BiggestInt + // (e.g. a named or unnamed enum type) +#if GTEST_INTERNAL_HAS_STRING_VIEW + kConvertibleToStringView, // a type implicitly convertible to + // absl::string_view or std::string_view +#endif + kOtherType // anything else +}; + +// TypeWithoutFormatter::PrintValue(value, os) is called +// by the universal printer to print a value of type T when neither +// operator<< nor PrintTo() is defined for T, where kTypeKind is the +// "kind" of T as defined by enum TypeKind. 
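+// For instance, a value of an unnamed enum type has neither operator<< nor
+// PrintTo(), but is implicitly convertible to an integer, so it falls under
+// kConvertibleToInteger and is printed as its integral value.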
+template +class TypeWithoutFormatter { + public: + // This default version is called when kTypeKind is kOtherType. + static void PrintValue(const T &value, ::std::ostream *os) { + PrintBytesInObjectTo( + static_cast( + reinterpret_cast(std::addressof(value))), + sizeof(value), os); + } +}; + +// We print a protobuf using its ShortDebugString() when the string +// doesn't exceed this many characters; otherwise we print it using +// DebugString() for better readability. +const size_t kProtobufOneLinerMaxLength = 50; + +template +class TypeWithoutFormatter { + public: + static void PrintValue(const T &value, ::std::ostream *os) { + std::string pretty_str = value.ShortDebugString(); + if (pretty_str.length() > kProtobufOneLinerMaxLength) { + pretty_str = "\n" + value.DebugString(); + } + *os << ("<" + pretty_str + ">"); + } +}; + +template +class TypeWithoutFormatter { + public: + // Since T has no << operator or PrintTo() but can be implicitly + // converted to BiggestInt, we print it as a BiggestInt. + // + // Most likely T is an enum type (either named or unnamed), in which + // case printing it as an integer is the desired behavior. In case + // T is not an enum, printing it as an integer is the best we can do + // given that it has no user-defined printer. + static void PrintValue(const T &value, ::std::ostream *os) { + const internal::BiggestInt kBigInt = value; + *os << kBigInt; + } +}; + +#if GTEST_INTERNAL_HAS_STRING_VIEW +template +class TypeWithoutFormatter { + public: + // Since T has neither operator<< nor PrintTo() but can be implicitly + // converted to absl::string_view, we print it as a absl::string_view + // (or std::string_view). + // + // Note: the implementation is further below, as it depends on + // internal::PrintTo symbol which is defined later in the file. + static void PrintValue(const T &value, ::std::ostream *os); +}; +#endif + +// Prints the given value to the given ostream. If the value is a +// protocol message, its debug string is printed; if it's an enum or +// of a type implicitly convertible to BiggestInt, it's printed as an +// integer; otherwise the bytes in the value are printed. This is +// what UniversalPrinter::Print() does when it knows nothing about +// type T and T has neither << operator nor PrintTo(). +// +// A user can override this behavior for a class type Foo by defining +// a << operator in the namespace where Foo is defined. +// +// We put this operator in namespace 'internal2' instead of 'internal' +// to simplify the implementation, as much code in 'internal' needs to +// use << in STL, which would conflict with our own << were it defined +// in 'internal'. +// +// Note that this operator<< takes a generic std::basic_ostream type instead of the more restricted std::ostream. If +// we define it to take an std::ostream instead, we'll get an +// "ambiguous overloads" compiler error when trying to print a type +// Foo that supports streaming to std::basic_ostream, as the compiler cannot tell whether +// operator<<(std::ostream&, const T&) or +// operator<<(std::basic_stream, const Foo&) is more +// specific. +template +::std::basic_ostream &operator<<( + ::std::basic_ostream &os, const T &x) { + TypeWithoutFormatter< + T, (internal::IsAProtocolMessage::value + ? kProtobuf + : std::is_convertible::value + ? kConvertibleToInteger + : +#if GTEST_INTERNAL_HAS_STRING_VIEW + std::is_convertible::value + ? 
kConvertibleToStringView + : +#endif + kOtherType)>::PrintValue(x, &os); + return os; +} + +} // namespace internal2 +} // namespace testing + +// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up +// magic needed for implementing UniversalPrinter won't work. +namespace testing_internal { + +// Used to print a value that is not an STL-style container when the +// user doesn't define PrintTo() for it. +template +void DefaultPrintNonContainerTo(const T &value, ::std::ostream *os) { + // With the following statement, during unqualified name lookup, + // testing::internal2::operator<< appears as if it was declared in + // the nearest enclosing namespace that contains both + // ::testing_internal and ::testing::internal2, i.e. the global + // namespace. For more details, refer to the C++ Standard section + // 7.3.4-1 [namespace.udir]. This allows us to fall back onto + // testing::internal2::operator<< in case T doesn't come with a << + // operator. + + using ::testing::internal2::operator<<; + + // Assuming T is defined in namespace foo, in the next statement, + // the compiler will consider all of: + // + // 1. foo::operator<< (thanks to Koenig look-up), + // 2. ::operator<< (as the current namespace is enclosed in ::), + // 3. testing::internal2::operator<< (thanks to the using statement above). + // + // The operator<< whose type matches T best will be picked. + // + // We deliberately allow #2 to be a candidate, as sometimes it's + // impossible to define #1 (e.g. when foo is ::std, defining + // anything in it is undefined behavior unless you are a compiler + // vendor.). + *os << value; +} + +} // namespace testing_internal + +namespace testing { +namespace internal { + +// FormatForComparison::Format(value) formats a +// value of type ToPrint that is an operand of a comparison assertion +// (e.g. ASSERT_EQ). OtherOperand is the type of the other operand in +// the comparison, and is used to help determine the best way to +// format the value. In particular, when the value is a C string +// (char pointer) and the other operand is an STL string object, we +// want to format the C string as a string, since we know it is +// compared by value with the string object. If the value is a char +// pointer but the other operand is not an STL string object, we don't +// know whether the pointer is supposed to point to a NUL-terminated +// string, and thus want to print it as a pointer to be safe. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + +// The default case. +template +class FormatForComparison { + public: + static ::std::string Format(const ToPrint &value) { + return ::testing::PrintToString(value); + } +}; + +// Array. +template +class FormatForComparison { + public: + static ::std::string Format(const ToPrint *value) { + return FormatForComparison::Format(value); + } +}; + +// By default, print C string as pointers to be safe, as we don't know +// whether they actually point to a NUL-terminated string. 
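+// For example, when EXPECT_EQ compares two char* operands, each side is
+// formatted as an address rather than as the text it may point to.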
+ +#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType) \ + template \ + class FormatForComparison { \ + public: \ + static ::std::string Format(CharType *value) { \ + return ::testing::PrintToString(static_cast(value)); \ + } \ + } + +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); + +#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_ + +// If a C string is compared with an STL string object, we know it's meant +// to point to a NUL-terminated string, and thus can print it as a string. + +#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \ + template <> \ + class FormatForComparison { \ + public: \ + static ::std::string Format(CharType *value) { \ + return ::testing::PrintToString(value); \ + } \ + } + +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string); + +#if GTEST_HAS_STD_WSTRING +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring); +#endif + +#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_ + +// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, and etc) +// operand to be used in a failure message. The type (but not value) +// of the other operand may affect the format. This allows us to +// print a char* as a raw pointer when it is compared against another +// char* or void*, and print it as a C string when it is compared +// against an std::string object, for example. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +template +std::string FormatForComparisonFailureMessage(const T1 &value, + const T2 & /* other_operand */) { + return FormatForComparison::Format(value); +} + +// UniversalPrinter::Print(value, ostream_ptr) prints the given +// value to the given ostream. The caller must ensure that +// 'ostream_ptr' is not NULL, or the behavior is undefined. +// +// We define UniversalPrinter as a class template (as opposed to a +// function template), as we need to partially specialize it for +// reference types, which cannot be done with function templates. +template +class UniversalPrinter; + +template +void UniversalPrint(const T &value, ::std::ostream *os); + +enum DefaultPrinterType { + kPrintContainer, + kPrintPointer, + kPrintFunctionPointer, + kPrintOther, +}; +template +struct WrapPrinterType {}; + +// Used to print an STL-style container when the user doesn't define +// a PrintTo() for it. +template +void DefaultPrintTo(WrapPrinterType /* dummy */, + const C &container, ::std::ostream *os) { + const size_t kMaxCount = 32; // The maximum number of elements to print. + *os << '{'; + size_t count = 0; + for (typename C::const_iterator it = container.begin(); it != container.end(); + ++it, ++count) { + if (count > 0) { + *os << ','; + if (count == kMaxCount) { // Enough has been printed. + *os << " ..."; + break; + } + } + *os << ' '; + // We cannot call PrintTo(*it, os) here as PrintTo() doesn't + // handle *it being a native array. + internal::UniversalPrint(*it, os); + } + + if (count > 0) { + *os << ' '; + } + *os << '}'; +} + +// Used to print a pointer that is neither a char pointer nor a member +// pointer, when the user doesn't define PrintTo() for it. (A member +// variable pointer or member function pointer doesn't really point to +// a location in the address space. Their representation is +// implementation-defined. 
Therefore they will be printed as raw +// bytes.) +template +void DefaultPrintTo(WrapPrinterType /* dummy */, T *p, + ::std::ostream *os) { + if (p == nullptr) { + *os << "NULL"; + } else { + // T is not a function type. We just call << to print p, + // relying on ADL to pick up user-defined << for their pointer + // types, if any. + *os << p; + } +} +template +void DefaultPrintTo(WrapPrinterType /* dummy */, T *p, + ::std::ostream *os) { + if (p == nullptr) { + *os << "NULL"; + } else { + // T is a function type, so '*os << p' doesn't do what we want + // (it just prints p as bool). We want to print p as a const + // void*. + *os << reinterpret_cast(p); + } +} + +// Used to print a non-container, non-pointer value when the user +// doesn't define PrintTo() for it. +template +void DefaultPrintTo(WrapPrinterType /* dummy */, const T &value, + ::std::ostream *os) { + ::testing_internal::DefaultPrintNonContainerTo(value, os); +} + +// Prints the given value using the << operator if it has one; +// otherwise prints the bytes in it. This is what +// UniversalPrinter::Print() does when PrintTo() is not specialized +// or overloaded for type T. +// +// A user can override this behavior for a class type Foo by defining +// an overload of PrintTo() in the namespace where Foo is defined. We +// give the user this option as sometimes defining a << operator for +// Foo is not desirable (e.g. the coding style may prevent doing it, +// or there is already a << operator but it doesn't do what the user +// wants). +template +void PrintTo(const T &value, ::std::ostream *os) { + // DefaultPrintTo() is overloaded. The type of its first argument + // determines which version will be picked. + // + // Note that we check for container types here, prior to we check + // for protocol message types in our operator<<. The rationale is: + // + // For protocol messages, we want to give people a chance to + // override Google Mock's format by defining a PrintTo() or + // operator<<. For STL containers, other formats can be + // incompatible with Google Mock's format for the container + // elements; therefore we check for container types here to ensure + // that our format is used. + // + // Note that MSVC and clang-cl do allow an implicit conversion from + // pointer-to-function to pointer-to-object, but clang-cl warns on it. + // So don't use ImplicitlyConvertible if it can be helped since it will + // cause this warning, and use a separate overload of DefaultPrintTo for + // function pointers so that the `*os << p` in the object pointer overload + // doesn't cause that warning either. + DefaultPrintTo( + WrapPrinterType < + (sizeof(IsContainerTest(0)) == sizeof(IsContainer)) && + !IsRecursiveContainer::value + ? kPrintContainer + : !std::is_pointer::value + ? kPrintOther + : std::is_function::type>::value + ? kPrintFunctionPointer + : kPrintPointer > (), + value, os); +} + +// The following list of PrintTo() overloads tells +// UniversalPrinter::Print() how to print standard types (built-in +// types, strings, plain arrays, and pointers). + +// Overloads for various char types. +GTEST_API_ void PrintTo(unsigned char c, ::std::ostream *os); +GTEST_API_ void PrintTo(signed char c, ::std::ostream *os); +inline void PrintTo(char c, ::std::ostream *os) { + // When printing a plain char, we always treat it as unsigned. This + // way, the output won't be affected by whether the compiler thinks + // char is signed or not. + PrintTo(static_cast(c), os); +} + +// Overloads for other simple built-in types. 
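+// Instead of 1/0, bool values are printed as "true"/"false":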
+inline void PrintTo(bool x, ::std::ostream *os) { + *os << (x ? "true" : "false"); +} + +// Overload for wchar_t type. +// Prints a wchar_t as a symbol if it is printable or as its internal +// code otherwise and also as its decimal code (except for L'\0'). +// The L'\0' char is printed as "L'\\0'". The decimal code is printed +// as signed integer when wchar_t is implemented by the compiler +// as a signed type and is printed as an unsigned integer when wchar_t +// is implemented as an unsigned type. +GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream *os); + +// Overloads for C strings. +GTEST_API_ void PrintTo(const char *s, ::std::ostream *os); +inline void PrintTo(char *s, ::std::ostream *os) { + PrintTo(ImplicitCast_(s), os); +} + +// signed/unsigned char is often used for representing binary data, so +// we print pointers to it as void* to be safe. +inline void PrintTo(const signed char *s, ::std::ostream *os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(signed char *s, ::std::ostream *os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(const unsigned char *s, ::std::ostream *os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(unsigned char *s, ::std::ostream *os) { + PrintTo(ImplicitCast_(s), os); +} + +// MSVC can be configured to define wchar_t as a typedef of unsigned +// short. It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native +// type. When wchar_t is a typedef, defining an overload for const +// wchar_t* would cause unsigned short* be printed as a wide string, +// possibly causing invalid memory accesses. +#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) +// Overloads for wide C strings +GTEST_API_ void PrintTo(const wchar_t *s, ::std::ostream *os); +inline void PrintTo(wchar_t *s, ::std::ostream *os) { + PrintTo(ImplicitCast_(s), os); +} +#endif + +// Overload for C arrays. Multi-dimensional arrays are printed +// properly. + +// Prints the given number of elements in an array, without printing +// the curly braces. +template +void PrintRawArrayTo(const T a[], size_t count, ::std::ostream *os) { + UniversalPrint(a[0], os); + for (size_t i = 1; i != count; i++) { + *os << ", "; + UniversalPrint(a[i], os); + } +} + +// Overloads for ::std::string. +GTEST_API_ void PrintStringTo(const ::std::string &s, ::std::ostream *os); +inline void PrintTo(const ::std::string &s, ::std::ostream *os) { + PrintStringTo(s, os); +} + +// Overloads for ::std::wstring. +#if GTEST_HAS_STD_WSTRING +GTEST_API_ void PrintWideStringTo(const ::std::wstring &s, ::std::ostream *os); +inline void PrintTo(const ::std::wstring &s, ::std::ostream *os) { + PrintWideStringTo(s, os); +} +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_INTERNAL_HAS_STRING_VIEW +// Overload for internal::StringView. +inline void PrintTo(internal::StringView sp, ::std::ostream *os) { + PrintTo(::std::string(sp), os); +} +#endif // GTEST_INTERNAL_HAS_STRING_VIEW + +inline void PrintTo(std::nullptr_t, ::std::ostream *os) { *os << "(nullptr)"; } + +template +void PrintTo(std::reference_wrapper ref, ::std::ostream *os) { + UniversalPrinter::Print(ref.get(), os); +} + +// Helper function for printing a tuple. T must be instantiated with +// a tuple type. 
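+// Fields are printed recursively, so std::make_tuple(1, std::make_pair(2, 3))
+// prints as (1, (2, 3)).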
+// Helper function for printing a tuple.  T must be instantiated with
+// a tuple type.
+template <typename T>
+void PrintTupleTo(const T &, std::integral_constant<size_t, 0>,
+                  ::std::ostream *) {}
+
+template <typename T, size_t I>
+void PrintTupleTo(const T &t, std::integral_constant<size_t, I>,
+                  ::std::ostream *os) {
+  PrintTupleTo(t, std::integral_constant<size_t, I - 1>(), os);
+  GTEST_INTENTIONAL_CONST_COND_PUSH_()
+  if (I > 1) {
+    GTEST_INTENTIONAL_CONST_COND_POP_()
+    *os << ", ";
+  }
+  UniversalPrinter<typename std::tuple_element<I - 1, T>::type>::Print(
+      std::get<I - 1>(t), os);
+}
+
+template <typename... Types>
+void PrintTo(const ::std::tuple<Types...> &t, ::std::ostream *os) {
+  *os << "(";
+  PrintTupleTo(t, std::integral_constant<size_t, sizeof...(Types)>(), os);
+  *os << ")";
+}
+
+// Overload for std::pair.
+template <typename T1, typename T2>
+void PrintTo(const ::std::pair<T1, T2> &value, ::std::ostream *os) {
+  *os << '(';
+  // We cannot use UniversalPrint(value.first, os) here, as T1 may be
+  // a reference type.  The same for printing value.second.
+  UniversalPrinter<T1>::Print(value.first, os);
+  *os << ", ";
+  UniversalPrinter<T2>::Print(value.second, os);
+  *os << ')';
+}
+
+// Implements printing a non-reference type T by letting the compiler
+// pick the right overload of PrintTo() for T.
+template <typename T>
+class UniversalPrinter {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
+
+  // Note: we deliberately don't call this PrintTo(), as that name
+  // conflicts with ::testing::internal::PrintTo in the body of the
+  // function.
+  static void Print(const T &value, ::std::ostream *os) {
+    // By default, ::testing::internal::PrintTo() is used for printing
+    // the value.
+    //
+    // Thanks to Koenig look-up, if T is a class and has its own
+    // PrintTo() function defined in its namespace, that function will
+    // be visible here.  Since it is more specific than the generic ones
+    // in ::testing::internal, it will be picked by the compiler in the
+    // following statement - exactly what we want.
+    PrintTo(value, os);
+  }
+
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+};
+
+#if GTEST_HAS_ABSL
+
+// Printer for absl::optional
+
+template <typename T>
+class UniversalPrinter<::absl::optional<T>> {
+ public:
+  static void Print(const ::absl::optional<T> &value, ::std::ostream *os) {
+    *os << '(';
+    if (!value) {
+      *os << "nullopt";
+    } else {
+      UniversalPrint(*value, os);
+    }
+    *os << ')';
+  }
+};
+
+// Printer for absl::variant
+
+template <typename... T>
+class UniversalPrinter<::absl::variant<T...>> {
+ public:
+  static void Print(const ::absl::variant<T...> &value, ::std::ostream *os) {
+    *os << '(';
+    absl::visit(Visitor{ os }, value);
+    *os << ')';
+  }
+
+ private:
+  struct Visitor {
+    template <typename U>
+    void operator()(const U &u) const {
+      *os << "'" << GetTypeName<U>() << "' with value ";
+      UniversalPrint(u, os);
+    }
+    ::std::ostream *os;
+  };
+};
+
+#endif  // GTEST_HAS_ABSL
+
+// UniversalPrintArray(begin, len, os) prints an array of 'len'
+// elements, starting at address 'begin'.
+template <typename T>
+void UniversalPrintArray(const T *begin, size_t len, ::std::ostream *os) {
+  if (len == 0) {
+    *os << "{}";
+  } else {
+    *os << "{ ";
+    const size_t kThreshold = 18;
+    const size_t kChunkSize = 8;
+    // If the array has more than kThreshold elements, we'll have to
+    // omit some details by printing only the first and the last
+    // kChunkSize elements.
+    if (len <= kThreshold) {
+      PrintRawArrayTo(begin, len, os);
+    } else {
+      PrintRawArrayTo(begin, kChunkSize, os);
+      *os << ", ..., ";
+      PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os);
+    }
+    *os << " }";
+  }
+}
+
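+// Worked example of the elision above: an int[20] holding 0..19 prints as
+//
+//   { 0, 1, 2, 3, 4, 5, 6, 7, ..., 12, 13, 14, 15, 16, 17, 18, 19 }
+//
+// because len (20) exceeds kThreshold (18), so only the first and last
+// kChunkSize (8) elements are shown.
+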
+// This overload prints a (const) char array compactly.
+GTEST_API_ void UniversalPrintArray(const char *begin, size_t len,
+                                    ::std::ostream *os);
+
+// This overload prints a (const) wchar_t array compactly.
+GTEST_API_ void UniversalPrintArray(const wchar_t *begin, size_t len,
+                                    ::std::ostream *os);
+
+// Implements printing an array type T[N].
+template <typename T, size_t N>
+class UniversalPrinter<T[N]> {
+ public:
+  // Prints the given array, omitting some elements when there are too
+  // many.
+  static void Print(const T (&a)[N], ::std::ostream *os) {
+    UniversalPrintArray(a, N, os);
+  }
+};
+
+// Implements printing a reference type T&.
+template <typename T>
+class UniversalPrinter<T &> {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
+
+  static void Print(const T &value, ::std::ostream *os) {
+    // Prints the address of the value.  We use reinterpret_cast here
+    // as static_cast doesn't compile when T is a function type.
+    *os << "@" << reinterpret_cast<const void *>(&value) << " ";
+
+    // Then prints the value itself.
+    UniversalPrint(value, os);
+  }
+
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+};
+
+// Prints a value tersely: for a reference type, the referenced value
+// (but not the address) is printed; for a (const) char pointer, the
+// NUL-terminated string (but not the pointer) is printed.
+
+template <typename T>
+class UniversalTersePrinter {
+ public:
+  static void Print(const T &value, ::std::ostream *os) {
+    UniversalPrint(value, os);
+  }
+};
+template <typename T>
+class UniversalTersePrinter<T &> {
+ public:
+  static void Print(const T &value, ::std::ostream *os) {
+    UniversalPrint(value, os);
+  }
+};
+template <typename T, size_t N>
+class UniversalTersePrinter<T[N]> {
+ public:
+  static void Print(const T (&value)[N], ::std::ostream *os) {
+    UniversalPrinter<T[N]>::Print(value, os);
+  }
+};
+template <>
+class UniversalTersePrinter<const char *> {
+ public:
+  static void Print(const char *str, ::std::ostream *os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(std::string(str), os);
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char *> {
+ public:
+  static void Print(char *str, ::std::ostream *os) {
+    UniversalTersePrinter<const char *>::Print(str, os);
+  }
+};
+
+#if GTEST_HAS_STD_WSTRING
+template <>
+class UniversalTersePrinter<const wchar_t *> {
+ public:
+  static void Print(const wchar_t *str, ::std::ostream *os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::wstring(str), os);
+    }
+  }
+};
+#endif
+
+template <>
+class UniversalTersePrinter<wchar_t *> {
+ public:
+  static void Print(wchar_t *str, ::std::ostream *os) {
+    UniversalTersePrinter<const wchar_t *>::Print(str, os);
+  }
+};
+
+template <typename T>
+void UniversalTersePrint(const T &value, ::std::ostream *os) {
+  UniversalTersePrinter<T>::Print(value, os);
+}
+
+// Prints a value using the type inferred by the compiler.  The
+// difference between this and UniversalTersePrint() is that for a
+// (const) char pointer, this prints both the pointer and the
+// NUL-terminated string.
+template <typename T>
+void UniversalPrint(const T &value, ::std::ostream *os) {
+  // A workaround for the bug in VC++ 7.1 that prevents us from instantiating
+  // UniversalPrinter with T directly.
+  typedef T T1;
+  UniversalPrinter<T1>::Print(value, os);
+}
+
+typedef ::std::vector<::std::string> Strings;
+
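+// For instance (a sketch): for std::tuple<const char *, int> t("foo", 5),
+// the helpers below produce the vector { "\"foo\"", "5" } -- each field
+// printed tersely, one string per field.
+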
+// Tersely prints the first N fields of a tuple to a string vector,
+// one element for each field.
+template <typename Tuple>
+void TersePrintPrefixToStrings(const Tuple &,
+                               std::integral_constant<size_t, 0>,
+                               Strings *) {}
+template <typename Tuple, size_t I>
+void TersePrintPrefixToStrings(const Tuple &t,
+                               std::integral_constant<size_t, I>,
+                               Strings *strings) {
+  TersePrintPrefixToStrings(t, std::integral_constant<size_t, I - 1>(),
+                            strings);
+  ::std::stringstream ss;
+  UniversalTersePrint(std::get<I - 1>(t), &ss);
+  strings->push_back(ss.str());
+}
+
+// Prints the fields of a tuple tersely to a string vector, one
+// element for each field.  See the comment before
+// UniversalTersePrint() for how we define "tersely".
+template <typename Tuple>
+Strings UniversalTersePrintTupleFieldsToStrings(const Tuple &value) {
+  Strings result;
+  TersePrintPrefixToStrings(
+      value, std::integral_constant<size_t, std::tuple_size<Tuple>::value>(),
+      &result);
+  return result;
+}
+
+}  // namespace internal
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+namespace internal2 {
+template <typename T>
+void TypeWithoutFormatter<T, kConvertibleToStringView>::PrintValue(
+    const T &value, ::std::ostream *os) {
+  internal::PrintTo(internal::StringView(value), os);
+}
+}  // namespace internal2
+#endif
+
+template <typename T>
+::std::string PrintToString(const T &value) {
+  ::std::stringstream ss;
+  internal::UniversalTersePrinter<T>::Print(value, &ss);
+  return ss.str();
+}
+
+}  // namespace testing
+
+// Include any custom printer added by the local installation.
+// We must include this header at the end to make sure it can use the
+// declarations from this file.
+#include "gtest/internal/custom/gtest-printers.h"
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-spi.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
new file mode 100644
index 000000000..e263b1033
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
@@ -0,0 +1,245 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// Utilities for testing Google Test itself and code that uses Google Test
+// (e.g. frameworks built on top of Google Test).
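+//
+// A typical interception pattern, sketched (see
+// ScopedFakeTestPartResultReporter below; the failing statement is only
+// an illustration):
+//
+//   ::testing::TestPartResultArray results;
+//   {
+//     ::testing::ScopedFakeTestPartResultReporter reporter(
+//         ::testing::ScopedFakeTestPartResultReporter::
+//             INTERCEPT_ONLY_CURRENT_THREAD,
+//         &results);
+//     EXPECT_TRUE(false) << "captured, not reported";  // intercepted
+//   }
+//   // 'results' now holds the intercepted TestPartResult.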
+ +// GOOGLETEST_CM0004 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_ +#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_ + +#include "gtest/gtest.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { + +// This helper class can be used to mock out Google Test failure reporting +// so that we can test Google Test or code that builds on Google Test. +// +// An object of this class appends a TestPartResult object to the +// TestPartResultArray object given in the constructor whenever a Google Test +// failure is reported. It can either intercept only failures that are +// generated in the same thread that created this object or it can intercept +// all generated failures. The scope of this mock object can be controlled with +// the second argument to the two arguments constructor. +class GTEST_API_ ScopedFakeTestPartResultReporter + : public TestPartResultReporterInterface { + public: + // The two possible mocking modes of this object. + enum InterceptMode { + INTERCEPT_ONLY_CURRENT_THREAD, // Intercepts only thread local failures. + INTERCEPT_ALL_THREADS // Intercepts all failures. + }; + + // The c'tor sets this object as the test part result reporter used + // by Google Test. The 'result' parameter specifies where to report the + // results. This reporter will only catch failures generated in the current + // thread. DEPRECATED + explicit ScopedFakeTestPartResultReporter(TestPartResultArray *result); + + // Same as above, but you can choose the interception scope of this object. + ScopedFakeTestPartResultReporter(InterceptMode intercept_mode, + TestPartResultArray *result); + + // The d'tor restores the previous test part result reporter. + ~ScopedFakeTestPartResultReporter() override; + + // Appends the TestPartResult object to the TestPartResultArray + // received in the constructor. + // + // This method is from the TestPartResultReporterInterface + // interface. + void ReportTestPartResult(const TestPartResult &result) override; + + private: + void Init(); + + const InterceptMode intercept_mode_; + TestPartResultReporterInterface *old_reporter_; + TestPartResultArray *const result_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter); +}; + +namespace internal { + +// A helper class for implementing EXPECT_FATAL_FAILURE() and +// EXPECT_NONFATAL_FAILURE(). Its destructor verifies that the given +// TestPartResultArray contains exactly one failure that has the given +// type and contains the given substring. If that's not the case, a +// non-fatal failure will be generated. +class GTEST_API_ SingleFailureChecker { + public: + // The constructor remembers the arguments. + SingleFailureChecker(const TestPartResultArray *results, + TestPartResult::Type type, const std::string &substr); + ~SingleFailureChecker(); + + private: + const TestPartResultArray *const results_; + const TestPartResult::Type type_; + const std::string substr_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker); +}; + +} // namespace internal + +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +// A set of macros for testing Google Test assertions or code that's expected +// to generate Google Test fatal failures. It verifies that the given +// statement will cause exactly one fatal Google Test failure with 'substr' +// being part of the failure message. +// +// There are two different versions of this macro. 
EXPECT_FATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - 'statement' cannot reference local non-static variables or
+//     non-static members of the current object.
+//   - 'statement' cannot return a value.
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  The AcceptsMacroThatExpandsToUnprotectedComma test in
+// gtest_unittest.cc will fail to compile if we do that.
+#define EXPECT_FATAL_FAILURE(statement, substr)                               \
+  do {                                                                        \
+    class GTestExpectFatalFailureHelper {                                     \
+     public:                                                                  \
+      static void Execute() { statement; }                                    \
+    };                                                                        \
+    ::testing::TestPartResultArray gtest_failures;                            \
+    ::testing::internal::SingleFailureChecker gtest_checker(                  \
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+    {                                                                         \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
+          ::testing::ScopedFakeTestPartResultReporter::                       \
+              INTERCEPT_ONLY_CURRENT_THREAD,                                  \
+          &gtest_failures);                                                   \
+      GTestExpectFatalFailureHelper::Execute();                               \
+    }                                                                         \
+  } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr)                \
+  do {                                                                        \
+    class GTestExpectFatalFailureHelper {                                     \
+     public:                                                                  \
+      static void Execute() { statement; }                                    \
+    };                                                                        \
+    ::testing::TestPartResultArray gtest_failures;                            \
+    ::testing::internal::SingleFailureChecker gtest_checker(                  \
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+    {                                                                         \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
+          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+          &gtest_failures);                                                   \
+      GTestExpectFatalFailureHelper::Execute();                               \
+    }                                                                         \
+  } while (::testing::internal::AlwaysFalse())
+
+// A macro for testing Google Test assertions or code that's expected to
+// generate Google Test non-fatal failures.  It asserts that the given
+// statement will cause exactly one non-fatal Google Test failure with 'substr'
+// being part of the failure message.
+//
+// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// 'statement' is allowed to reference local variables and members of
+// the current object.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  If we do that, the code won't compile when the user gives
+// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that
+// expands to code containing an unprotected comma.  The
+// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc
+// catches that.
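+//
+// A usage sketch (the helper and test names are illustrative):
+//
+//   void Helper() { ADD_FAILURE() << "deliberate non-fatal failure"; }
+//
+//   TEST(SelfTest, CatchesNonFatalFailure) {
+//     EXPECT_NONFATAL_FAILURE(Helper(), "deliberate");
+//   }
+//
+// The statement must produce exactly one non-fatal failure whose message
+// contains the given substring, or this assertion itself fails.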
+//
+// For the same reason (the preprocessor peculiarity noted above), we
+// have to write
+//   if (::testing::internal::AlwaysTrue()) { statement; }
+// instead of
+//   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+// to avoid an MSVC warning on unreachable code.
#define EXPECT_NONFATAL_FAILURE(statement, substr)                    \
+  do {                                                                \
+    ::testing::TestPartResultArray gtest_failures;                    \
+    ::testing::internal::SingleFailureChecker gtest_checker(          \
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr));                                                    \
+    {                                                                 \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(     \
+          ::testing::ScopedFakeTestPartResultReporter::               \
+              INTERCEPT_ONLY_CURRENT_THREAD,                          \
+          &gtest_failures);                                           \
+      if (::testing::internal::AlwaysTrue()) {                        \
+        statement;                                                    \
+      }                                                               \
+    }                                                                 \
+  } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr)             \
+  do {                                                                        \
+    ::testing::TestPartResultArray gtest_failures;                            \
+    ::testing::internal::SingleFailureChecker gtest_checker(                  \
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure,         \
+        (substr));                                                            \
+    {                                                                         \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
+          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+          &gtest_failures);                                                   \
+      if (::testing::internal::AlwaysTrue()) {                                \
+        statement;                                                            \
+      }                                                                       \
+    }                                                                         \
+  } while (::testing::internal::AlwaysFalse())
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_SPI_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
new file mode 100644
index 000000000..a28afb309
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
@@ -0,0 +1,183 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ + +#include +#include +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-string.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { + +// A copyable object representing the result of a test part (i.e. an +// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()). +// +// Don't inherit from TestPartResult as its destructor is not virtual. +class GTEST_API_ TestPartResult { + public: + // The possible outcomes of a test part (i.e. an assertion or an + // explicit SUCCEED(), FAIL(), or ADD_FAILURE()). + enum Type { + kSuccess, // Succeeded. + kNonFatalFailure, // Failed but the test can continue. + kFatalFailure, // Failed and the test should be terminated. + kSkip // Skipped. + }; + + // C'tor. TestPartResult does NOT have a default constructor. + // Always use this constructor (with parameters) to create a + // TestPartResult object. + TestPartResult(Type a_type, const char *a_file_name, int a_line_number, + const char *a_message) + : type_(a_type), file_name_(a_file_name == nullptr ? "" : a_file_name), + line_number_(a_line_number), summary_(ExtractSummary(a_message)), + message_(a_message) {} + + // Gets the outcome of the test part. + Type type() const { return type_; } + + // Gets the name of the source file where the test part took place, or + // NULL if it's unknown. + const char *file_name() const { + return file_name_.empty() ? nullptr : file_name_.c_str(); + } + + // Gets the line in the source file where the test part took place, + // or -1 if it's unknown. + int line_number() const { return line_number_; } + + // Gets the summary of the failure message. + const char *summary() const { return summary_.c_str(); } + + // Gets the message associated with the test part. + const char *message() const { return message_.c_str(); } + + // Returns true if and only if the test part was skipped. + bool skipped() const { return type_ == kSkip; } + + // Returns true if and only if the test part passed. + bool passed() const { return type_ == kSuccess; } + + // Returns true if and only if the test part non-fatally failed. + bool nonfatally_failed() const { return type_ == kNonFatalFailure; } + + // Returns true if and only if the test part fatally failed. + bool fatally_failed() const { return type_ == kFatalFailure; } + + // Returns true if and only if the test part failed. + bool failed() const { return fatally_failed() || nonfatally_failed(); } + + private: + Type type_; + + // Gets the summary of the failure message by omitting the stack + // trace in it. + static std::string ExtractSummary(const char *message); + + // The name of the source file where the test part took place, or + // "" if the source file is unknown. + std::string file_name_; + // The line in the source file where the test part took place, or -1 + // if the line number is unknown. + int line_number_; + std::string summary_; // The test failure summary. + std::string message_; // The test failure message. +}; + +// Prints a TestPartResult object. +std::ostream &operator<<(std::ostream &os, const TestPartResult &result); + +// An array of TestPartResult objects. +// +// Don't inherit from TestPartResultArray as its destructor is not +// virtual. 
+class GTEST_API_ TestPartResultArray { + public: + TestPartResultArray() {} + + // Appends the given TestPartResult to the array. + void Append(const TestPartResult &result); + + // Returns the TestPartResult at the given index (0-based). + const TestPartResult &GetTestPartResult(int index) const; + + // Returns the number of TestPartResult objects in the array. + int size() const; + + private: + std::vector array_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray); +}; + +// This interface knows how to report a test part result. +class GTEST_API_ TestPartResultReporterInterface { + public: + virtual ~TestPartResultReporterInterface() {} + + virtual void ReportTestPartResult(const TestPartResult &result) = 0; +}; + +namespace internal { + +// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a +// statement generates new fatal failures. To do so it registers itself as the +// current test part result reporter. Besides checking if fatal failures were +// reported, it only delegates the reporting to the former result reporter. +// The original result reporter is restored in the destructor. +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +class GTEST_API_ HasNewFatalFailureHelper + : public TestPartResultReporterInterface { + public: + HasNewFatalFailureHelper(); + ~HasNewFatalFailureHelper() override; + void ReportTestPartResult(const TestPartResult &result) override; + bool has_new_fatal_failure() const { return has_new_fatal_failure_; } + + private: + bool has_new_fatal_failure_; + TestPartResultReporterInterface *original_reporter_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper); +}; + +} // namespace internal + +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h new file mode 100644 index 000000000..f5afc4db8 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h @@ -0,0 +1,337 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ + +// This header implements typed tests and type-parameterized tests. + +// Typed (aka type-driven) tests repeat the same test for types in a +// list. You must know which types you want to test with when writing +// typed tests. Here's how you do it: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template +class FooTest : public testing::Test { + public: + ... + typedef std::list List; + static T shared_; + T value_; +}; + +// Next, associate a list of types with the test suite, which will be +// repeated for each type in the list. The typedef is necessary for +// the macro to parse correctly. +typedef testing::Types MyTypes; +TYPED_TEST_SUITE(FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// TYPED_TEST_SUITE(FooTest, int); + +// Then, use TYPED_TEST() instead of TEST_F() to define as many typed +// tests for this test suite as you want. +TYPED_TEST(FooTest, DoesBlah) { + // Inside a test, refer to the special name TypeParam to get the type + // parameter. Since we are inside a derived class template, C++ requires + // us to visit the members of FooTest via 'this'. + TypeParam n = this->value_; + + // To visit static members of the fixture, add the TestFixture:: + // prefix. + n += TestFixture::shared_; + + // To refer to typedefs in the fixture, add the "typename + // TestFixture::" prefix. + typename TestFixture::List values; + values.push_back(n); + ... +} + +TYPED_TEST(FooTest, HasPropertyA) { ... } + +// TYPED_TEST_SUITE takes an optional third argument which allows to specify a +// class that generates custom test name suffixes based on the type. This should +// be a class which has a static template function GetName(int index) returning +// a string for each type. The provided integer index equals the index of the +// type in the provided type list. In many cases the index can be ignored. +// +// For example: +// class MyTypeNames { +// public: +// template +// static std::string GetName(int) { +// if (std::is_same()) return "char"; +// if (std::is_same()) return "int"; +// if (std::is_same()) return "unsignedInt"; +// } +// }; +// TYPED_TEST_SUITE(FooTest, MyTypes, MyTypeNames); + +#endif // 0 + +// Type-parameterized tests are abstract test patterns parameterized +// by a type. Compared with typed tests, type-parameterized tests +// allow you to define the test pattern without knowing what the type +// parameters are. The defined pattern can be instantiated with +// different types any number of times, in any number of translation +// units. +// +// If you are designing an interface or concept, you can define a +// suite of type-parameterized tests to verify properties that any +// valid implementation of the interface/concept should have. 
Then, +// each implementation can easily instantiate the test suite to verify +// that it conforms to the requirements, without having to write +// similar tests repeatedly. Here's an example: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template +class FooTest : public testing::Test { + ... +}; + +// Next, declare that you will define a type-parameterized test suite +// (the _P suffix is for "parameterized" or "pattern", whichever you +// prefer): +TYPED_TEST_SUITE_P(FooTest); + +// Then, use TYPED_TEST_P() to define as many type-parameterized tests +// for this type-parameterized test suite as you want. +TYPED_TEST_P(FooTest, DoesBlah) { + // Inside a test, refer to TypeParam to get the type parameter. + TypeParam n = 0; + ... +} + +TYPED_TEST_P(FooTest, HasPropertyA) { ... } + +// Now the tricky part: you need to register all test patterns before +// you can instantiate them. The first argument of the macro is the +// test suite name; the rest are the names of the tests in this test +// case. +REGISTER_TYPED_TEST_SUITE_P(FooTest, + DoesBlah, HasPropertyA); + +// Finally, you are free to instantiate the pattern with the types you +// want. If you put the above code in a header file, you can #include +// it in multiple C++ source files and instantiate it multiple times. +// +// To distinguish different instances of the pattern, the first +// argument to the INSTANTIATE_* macro is a prefix that will be added +// to the actual test suite name. Remember to pick unique prefixes for +// different instances. +typedef testing::Types MyTypes; +INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, int); +// +// Similar to the optional argument of TYPED_TEST_SUITE above, +// INSTANTIATE_TEST_SUITE_P takes an optional fourth argument which allows to +// generate custom names. +// INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes, MyTypeNames); + +#endif // 0 + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" +#include "gtest/internal/gtest-type-util.h" + +// Implements typed tests. + +#if GTEST_HAS_TYPED_TEST + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the name of the typedef for the type parameters of the +// given test suite. +#define GTEST_TYPE_PARAMS_(TestSuiteName) gtest_type_params_##TestSuiteName##_ + +// Expands to the name of the typedef for the NameGenerator, responsible for +// creating the suffixes of the name. +#define GTEST_NAME_GENERATOR_(TestSuiteName) \ + gtest_type_params_##TestSuiteName##_NameGenerator + +#define TYPED_TEST_SUITE(CaseName, Types, ...) 
 \
+  typedef ::testing::internal::GenerateTypeList<Types>::type            \
+      GTEST_TYPE_PARAMS_(CaseName);                                     \
+  typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \
+      GTEST_NAME_GENERATOR_(CaseName)
+
+#define TYPED_TEST(CaseName, TestName)                                       \
+  static_assert(sizeof(GTEST_STRINGIFY_(TestName)) > 1,                      \
+                "test-name must not be empty");                              \
+  template <typename gtest_TypeParam_>                                       \
+  class GTEST_TEST_CLASS_NAME_(CaseName, TestName)                           \
+      : public CaseName<gtest_TypeParam_> {                                  \
+   private:                                                                  \
+    typedef CaseName<gtest_TypeParam_> TestFixture;                          \
+    typedef gtest_TypeParam_ TypeParam;                                      \
+    void TestBody() override;                                                \
+  };                                                                         \
+  static bool gtest_##CaseName##_##TestName##_registered_                    \
+      GTEST_ATTRIBUTE_UNUSED_ = ::testing::internal::TypeParameterizedTest<  \
+          CaseName,                                                          \
+          ::testing::internal::TemplateSel<GTEST_TEST_CLASS_NAME_(           \
+              CaseName, TestName)>,                                          \
+          GTEST_TYPE_PARAMS_(                                                \
+              CaseName)>::Register("",                                       \
+                                   ::testing::internal::CodeLocation(        \
+                                       __FILE__, __LINE__),                  \
+                                   GTEST_STRINGIFY_(CaseName),               \
+                                   GTEST_STRINGIFY_(TestName), 0,            \
+                                   ::testing::internal::GenerateNames<       \
+                                       GTEST_NAME_GENERATOR_(CaseName),      \
+                                       GTEST_TYPE_PARAMS_(CaseName)>());     \
+  template <typename gtest_TypeParam_>                                       \
+  void GTEST_TEST_CLASS_NAME_(CaseName,                                      \
+                              TestName)<gtest_TypeParam_>::TestBody()
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define TYPED_TEST_CASE                                                \
+  static_assert(::testing::internal::TypedTestCaseIsDeprecated(), ""); \
+  TYPED_TEST_SUITE
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+#endif  // GTEST_HAS_TYPED_TEST
+
+// Implements type-parameterized tests.
+
+#if GTEST_HAS_TYPED_TEST_P
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the namespace name that the type-parameterized tests for
+// the given type-parameterized test suite are defined in.  The exact
+// name of the namespace is subject to change without notice.
+#define GTEST_SUITE_NAMESPACE_(TestSuiteName) gtest_suite_##TestSuiteName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the variable used to remember the names of
+// the defined tests in the given test suite.
+#define GTEST_TYPED_TEST_SUITE_P_STATE_(TestSuiteName) \
+  gtest_typed_test_suite_p_state_##TestSuiteName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY.
+//
+// Expands to the name of the variable used to remember the names of
+// the registered tests in the given test suite.
+#define GTEST_REGISTERED_TEST_NAMES_(TestSuiteName) \
+  gtest_registered_test_names_##TestSuiteName##_
+
+// The variables defined in the type-parameterized test macros are
+// static as typically these macros are used in a .h file that can be
+// #included in multiple translation units linked together.
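+// For instance, a sketch of that multiple-translation-unit pattern
+// (file names are illustrative):
+//
+//   // foo_test.h -- defines and registers the pattern once.
+//   TYPED_TEST_SUITE_P(FooTest);
+//   TYPED_TEST_P(FooTest, DoesBlah) { ... }
+//   REGISTER_TYPED_TEST_SUITE_P(FooTest, DoesBlah);
+//
+//   // a_test.cc and b_test.cc may each #include "foo_test.h" and
+//   // instantiate it with their own prefix and type list, e.g.:
+//   INSTANTIATE_TYPED_TEST_SUITE_P(A, FooTest, testing::Types<int>);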
+#define TYPED_TEST_SUITE_P(SuiteName)              \
+  static ::testing::internal::TypedTestSuitePState \
+      GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName)
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define TYPED_TEST_CASE_P                                                 \
+  static_assert(::testing::internal::TypedTestCase_P_IsDeprecated(), ""); \
+  TYPED_TEST_SUITE_P
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+#define TYPED_TEST_P(SuiteName, TestName)                           \
+  namespace GTEST_SUITE_NAMESPACE_(SuiteName) {                     \
+  template <typename gtest_TypeParam_>                              \
+  class TestName : public SuiteName<gtest_TypeParam_> {             \
+   private:                                                         \
+    typedef SuiteName<gtest_TypeParam_> TestFixture;                \
+    typedef gtest_TypeParam_ TypeParam;                             \
+    void TestBody() override;                                       \
+  };                                                                \
+  static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
+      GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).AddTestName(       \
+          __FILE__, __LINE__, GTEST_STRINGIFY_(SuiteName),          \
+          GTEST_STRINGIFY_(TestName));                              \
+  }                                                                 \
+  template <typename gtest_TypeParam_>                              \
+  void GTEST_SUITE_NAMESPACE_(                                      \
+      SuiteName)::TestName<gtest_TypeParam_>::TestBody()
+
+// Note: this won't work correctly if the trailing arguments are macros.
+#define REGISTER_TYPED_TEST_SUITE_P(SuiteName, ...)                         \
+  namespace GTEST_SUITE_NAMESPACE_(SuiteName) {                             \
+  typedef ::testing::internal::Templates<__VA_ARGS__> gtest_AllTests_;      \
+  }                                                                         \
+  static const char *const GTEST_REGISTERED_TEST_NAMES_(                    \
+      SuiteName) GTEST_ATTRIBUTE_UNUSED_ =                                  \
+      GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).VerifyRegisteredTestNames( \
+          GTEST_STRINGIFY_(SuiteName), __FILE__, __LINE__, #__VA_ARGS__)
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define REGISTER_TYPED_TEST_CASE_P                                           \
+  static_assert(::testing::internal::RegisterTypedTestCase_P_IsDeprecated(), \
+                "");                                                         \
+  REGISTER_TYPED_TEST_SUITE_P
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...)     \
+  static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1,                     \
+                "test-suite-prefix must not be empty");                   \
+  static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ =      \
+      ::testing::internal::TypeParameterizedTestSuite<                    \
+          SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_,  \
+          ::testing::internal::GenerateTypeList<Types>::type>::           \
+          Register(GTEST_STRINGIFY_(Prefix),                              \
+                   ::testing::internal::CodeLocation(__FILE__, __LINE__), \
+                   &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName),           \
+                   GTEST_STRINGIFY_(SuiteName),                           \
+                   GTEST_REGISTERED_TEST_NAMES_(SuiteName),               \
+                   ::testing::internal::GenerateNames<                    \
+                       ::testing::internal::NameGeneratorSelector<        \
+                           __VA_ARGS__>::type,                            \
+                       ::testing::internal::GenerateTypeList<Types>::type>())
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define INSTANTIATE_TYPED_TEST_CASE_P                                      \
+  static_assert(                                                           \
+      ::testing::internal::InstantiateTypedTestCase_P_IsDeprecated(), ""); \
+  INSTANTIATE_TYPED_TEST_SUITE_P
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+#endif  // GTEST_HAS_TYPED_TEST_P
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest.h
new file mode 100644
index 000000000..8fd7eea1e
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest.h
@@ -0,0 +1,2454 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the public API for Google Test. It should be +// included by any test program that uses Google Test. +// +// IMPORTANT NOTE: Due to limitation of the C++ language, we have to +// leave some internal implementation details in this header file. +// They are clearly marked by comments like this: +// +// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// +// Such code is NOT meant to be used by a user directly, and is subject +// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user +// program! +// +// Acknowledgment: Google Test borrowed the idea of automatic test +// registration from Barthelemy Dagenais' (barthelemy@prologique.com) +// easyUnit framework. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_H_ + +#include +#include +#include +#include +#include +#include + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-string.h" +#include "gtest/gtest-death-test.h" +#include "gtest/gtest-matchers.h" +#include "gtest/gtest-message.h" +#include "gtest/gtest-param-test.h" +#include "gtest/gtest-printers.h" +#include "gtest/gtest_prod.h" +#include "gtest/gtest-test-part.h" +#include "gtest/gtest-typed-test.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { + +// Silence C4100 (unreferenced formal parameter) and 4805 +// unsafe mix of type 'const int' and type 'const bool' +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4805) +#pragma warning(disable : 4100) +#endif + +// Declares the flags. + +// This flag temporary enables the disabled tests. +GTEST_DECLARE_bool_(also_run_disabled_tests); + +// This flag brings the debugger on an assertion failure. 
+GTEST_DECLARE_bool_(break_on_failure); + +// This flag controls whether Google Test catches all test-thrown exceptions +// and logs them as failures. +GTEST_DECLARE_bool_(catch_exceptions); + +// This flag enables using colors in terminal output. Available values are +// "yes" to enable colors, "no" (disable colors), or "auto" (the default) +// to let Google Test decide. +GTEST_DECLARE_string_(color); + +// This flag sets up the filter to select by name using a glob pattern +// the tests to run. If the filter is not given all tests are executed. +GTEST_DECLARE_string_(filter); + +// This flag controls whether Google Test installs a signal handler that dumps +// debugging information when fatal signals are raised. +GTEST_DECLARE_bool_(install_failure_signal_handler); + +// This flag causes the Google Test to list tests. None of the tests listed +// are actually run if the flag is provided. +GTEST_DECLARE_bool_(list_tests); + +// This flag controls whether Google Test emits a detailed XML report to a file +// in addition to its normal textual output. +GTEST_DECLARE_string_(output); + +// This flags control whether Google Test prints the elapsed time for each +// test. +GTEST_DECLARE_bool_(print_time); + +// This flags control whether Google Test prints UTF8 characters as text. +GTEST_DECLARE_bool_(print_utf8); + +// This flag specifies the random number seed. +GTEST_DECLARE_int32_(random_seed); + +// This flag sets how many times the tests are repeated. The default value +// is 1. If the value is -1 the tests are repeating forever. +GTEST_DECLARE_int32_(repeat); + +// This flag controls whether Google Test includes Google Test internal +// stack frames in failure stack traces. +GTEST_DECLARE_bool_(show_internal_stack_frames); + +// When this flag is specified, tests' order is randomized on every iteration. +GTEST_DECLARE_bool_(shuffle); + +// This flag specifies the maximum number of stack frames to be +// printed in a failure message. +GTEST_DECLARE_int32_(stack_trace_depth); + +// When this flag is specified, a failed assertion will throw an +// exception if exceptions are enabled, or exit the program with a +// non-zero code otherwise. For use with an external test framework. +GTEST_DECLARE_bool_(throw_on_failure); + +// When this flag is set with a "host:port" string, on supported +// platforms test results are streamed to the specified port on +// the specified host machine. +GTEST_DECLARE_string_(stream_result_to); + +#if GTEST_USE_OWN_FLAGFILE_FLAG_ +GTEST_DECLARE_string_(flagfile); +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + +// The upper limit for valid stack trace depths. +const int kMaxStackTraceDepth = 100; + +namespace internal { + +class AssertHelper; +class DefaultGlobalTestPartResultReporter; +class ExecDeathTest; +class NoExecDeathTest; +class FinalSuccessChecker; +class GTestFlagSaver; +class StreamingListenerTest; +class TestResultAccessor; +class TestEventListenersAccessor; +class TestEventRepeater; +class UnitTestRecordPropertyTestHelper; +class WindowsDeathTest; +class FuchsiaDeathTest; +class UnitTestImpl *GetUnitTestImpl(); +void ReportFailureInUnknownLocation(TestPartResult::Type result_type, + const std::string &message); +std::set *GetIgnoredParameterizedTestSuites(); + +} // namespace internal + +// The friend relationship of some of these classes is cyclic. +// If we don't forward declare them the compiler might confuse the classes +// in friendship clauses with same named classes on the scope. 
+class Test; +class TestSuite; + +// Old API is still available but deprecated +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +using TestCase = TestSuite; +#endif +class TestInfo; +class UnitTest; + +// A class for indicating whether an assertion was successful. When +// the assertion wasn't successful, the AssertionResult object +// remembers a non-empty message that describes how it failed. +// +// To create an instance of this class, use one of the factory functions +// (AssertionSuccess() and AssertionFailure()). +// +// This class is useful for two purposes: +// 1. Defining predicate functions to be used with Boolean test assertions +// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts +// 2. Defining predicate-format functions to be +// used with predicate assertions (ASSERT_PRED_FORMAT*, etc). +// +// For example, if you define IsEven predicate: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5))) +// will print the message +// +// Value of: IsEven(Fib(5)) +// Actual: false (5 is odd) +// Expected: true +// +// instead of a more opaque +// +// Value of: IsEven(Fib(5)) +// Actual: false +// Expected: true +// +// in case IsEven is a simple Boolean predicate. +// +// If you expect your predicate to be reused and want to support informative +// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up +// about half as often as positive ones in our tests), supply messages for +// both success and failure cases: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess() << n << " is even"; +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print +// +// Value of: IsEven(Fib(6)) +// Actual: true (8 is even) +// Expected: false +// +// NB: Predicates that support negative Boolean assertions have reduced +// performance in positive ones so be careful not to use them in tests +// that have lots (tens of thousands) of positive Boolean assertions. +// +// To use this class with EXPECT_PRED_FORMAT assertions such as: +// +// // Verifies that Foo() returns an even number. +// EXPECT_PRED_FORMAT1(IsEven, Foo()); +// +// you need to define: +// +// testing::AssertionResult IsEven(const char* expr, int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() +// << "Expected: " << expr << " is even\n Actual: it's " << n; +// } +// +// If Foo() returns 5, you will see the following message: +// +// Expected: Foo() is even +// Actual: it's 5 +// +class GTEST_API_ AssertionResult { + public: + // Copy constructor. + // Used in EXPECT_TRUE/FALSE(assertion_result). + AssertionResult(const AssertionResult &other); + +// C4800 is a level 3 warning in Visual Studio 2015 and earlier. +// This warning is not emitted in Visual Studio 2017. +// This warning is off by default starting in Visual Studio 2019 but can be +// enabled with command-line options. +#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */) +#endif + + // Used in the EXPECT_TRUE/FALSE(bool_expression). + // + // T must be contextually convertible to bool. 
+ // + // The second parameter prevents this overload from being considered if + // the argument is implicitly convertible to AssertionResult. In that case + // we want AssertionResult's copy constructor to be used. + template + explicit AssertionResult( + const T &success, + typename std::enable_if< + !std::is_convertible::value>::type * + /*enabler*/ + = nullptr) + : success_(success) {} + +#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) + GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + + // Assignment operator. + AssertionResult &operator=(AssertionResult other) { + swap(other); + return *this; + } + + // Returns true if and only if the assertion succeeded. + operator bool() const { return success_; } // NOLINT + + // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. + AssertionResult operator!() const; + + // Returns the text streamed into this AssertionResult. Test assertions + // use it when they fail (i.e., the predicate's outcome doesn't match the + // assertion's expectation). When nothing has been streamed into the + // object, returns an empty string. + const char *message() const { + return message_.get() != nullptr ? message_->c_str() : ""; + } + // Deprecated; please use message() instead. + const char *failure_message() const { return message(); } + + // Streams a custom failure message into this object. + template + AssertionResult &operator<<(const T &value) { + AppendMessage(Message() << value); + return *this; + } + + // Allows streaming basic output manipulators such as endl or flush into + // this object. + AssertionResult &operator<<( + ::std::ostream &(*basic_manipulator)(::std::ostream &stream)) { + AppendMessage(Message() << basic_manipulator); + return *this; + } + + private: + // Appends the contents of message to message_. + void AppendMessage(const Message &a_message) { + if (message_.get() == nullptr) message_.reset(new ::std::string); + message_->append(a_message.GetString().c_str()); + } + + // Swap the contents of this AssertionResult with other. + void swap(AssertionResult &other); + + // Stores result of the assertion predicate. + bool success_; + // Stores the message describing the condition in case the expectation + // construct is not satisfied with the predicate's outcome. + // Referenced via a pointer to avoid taking too much stack frame space + // with test assertions. + std::unique_ptr< ::std::string> message_; +}; + +// Makes a successful assertion result. +GTEST_API_ AssertionResult AssertionSuccess(); + +// Makes a failed assertion result. +GTEST_API_ AssertionResult AssertionFailure(); + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << msg. +GTEST_API_ AssertionResult AssertionFailure(const Message &msg); + +} // namespace testing + +// Includes the auto-generated header that implements a family of generic +// predicate assertion macros. This include comes late because it relies on +// APIs declared above. +#include "gtest/gtest_pred_impl.h" + +namespace testing { + +// The abstract class that all tests inherit from. +// +// In Google Test, a unit test program contains one or many TestSuites, and +// each TestSuite contains one or many Tests. +// +// When you define a test using the TEST macro, you don't need to +// explicitly derive from Test - the TEST macro automatically does +// this for you. +// +// The only time you derive from Test is when defining a test fixture +// to be used in a TEST_F. 
For example: +// +// class FooTest : public testing::Test { +// protected: +// void SetUp() override { ... } +// void TearDown() override { ... } +// ... +// }; +// +// TEST_F(FooTest, Bar) { ... } +// TEST_F(FooTest, Baz) { ... } +// +// Test is not copyable. +class GTEST_API_ Test { + public: + friend class TestInfo; + + // The d'tor is virtual as we intend to inherit from Test. + virtual ~Test(); + + // Sets up the stuff shared by all tests in this test case. + // + // Google Test will call Foo::SetUpTestSuite() before running the first + // test in test case Foo. Hence a sub-class can define its own + // SetUpTestSuite() method to shadow the one defined in the super + // class. + static void SetUpTestSuite() {} + + // Tears down the stuff shared by all tests in this test suite. + // + // Google Test will call Foo::TearDownTestSuite() after running the last + // test in test case Foo. Hence a sub-class can define its own + // TearDownTestSuite() method to shadow the one defined in the super + // class. + static void TearDownTestSuite() {} + + // Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + static void TearDownTestCase() {} + static void SetUpTestCase() {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + // Returns true if and only if the current test has a fatal failure. + static bool HasFatalFailure(); + + // Returns true if and only if the current test has a non-fatal failure. + static bool HasNonfatalFailure(); + + // Returns true if and only if the current test was skipped. + static bool IsSkipped(); + + // Returns true if and only if the current test has a (either fatal or + // non-fatal) failure. + static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); } + + // Logs a property for the current test, test suite, or for the entire + // invocation of the test program when used outside of the context of a + // test suite. Only the last value for a given key is remembered. These + // are public static so they can be called from utility functions that are + // not members of the test fixture. Calls to RecordProperty made during + // lifespan of the test (from the moment its constructor starts to the + // moment its destructor finishes) will be output in XML as attributes of + // the element. Properties recorded from fixture's + // SetUpTestSuite or TearDownTestSuite are logged as attributes of the + // corresponding element. Calls to RecordProperty made in the + // global context (before or after invocation of RUN_ALL_TESTS and from + // SetUp/TearDown method of Environment objects registered with Google + // Test) will be output as attributes of the element. + static void RecordProperty(const std::string &key, const std::string &value); + static void RecordProperty(const std::string &key, int value); + + protected: + // Creates a Test object. + Test(); + + // Sets up the test fixture. + virtual void SetUp(); + + // Tears down the test fixture. + virtual void TearDown(); + + private: + // Returns true if and only if the current test has the same fixture class + // as the first test in the current test suite. + static bool HasSameFixtureClass(); + + // Runs the test after the test fixture has been set up. + // + // A sub-class must implement this to define the test logic. + // + // DO NOT OVERRIDE THIS FUNCTION DIRECTLY IN A USER PROGRAM. + // Instead, use the TEST or TEST_F macro. + virtual void TestBody() = 0; + + // Sets up, executes, and tears down the test. + void Run(); + + // Deletes self. 
We deliberately pick an unusual name for this + // internal method to avoid clashing with names used in user TESTs. + void DeleteSelf_() { delete this; } + + const std::unique_ptr gtest_flag_saver_; + + // Often a user misspells SetUp() as Setup() and spends a long time + // wondering why it is never called by Google Test. The declaration of + // the following method is solely for catching such an error at + // compile time: + // + // - The return type is deliberately chosen to be not void, so it + // will be a conflict if void Setup() is declared in the user's + // test fixture. + // + // - This method is private, so it will be another compiler error + // if the method is called from the user's test fixture. + // + // DO NOT OVERRIDE THIS FUNCTION. + // + // If you see an error about overriding the following function or + // about it being private, you have mis-spelled SetUp() as Setup(). + struct Setup_should_be_spelled_SetUp {}; + virtual Setup_should_be_spelled_SetUp *Setup() { return nullptr; } + + // We disallow copying Tests. + GTEST_DISALLOW_COPY_AND_ASSIGN_(Test); +}; + +typedef internal::TimeInMillis TimeInMillis; + +// A copyable object representing a user specified test property which can be +// output as a key/value string pair. +// +// Don't inherit from TestProperty as its destructor is not virtual. +class TestProperty { + public: + // C'tor. TestProperty does NOT have a default constructor. + // Always use this constructor (with parameters) to create a + // TestProperty object. + TestProperty(const std::string &a_key, const std::string &a_value) + : key_(a_key), value_(a_value) {} + + // Gets the user supplied key. + const char *key() const { return key_.c_str(); } + + // Gets the user supplied value. + const char *value() const { return value_.c_str(); } + + // Sets a new value, overriding the one supplied in the constructor. + void SetValue(const std::string &new_value) { value_ = new_value; } + + private: + // The key supplied by the user. + std::string key_; + // The value supplied by the user. + std::string value_; +}; + +// The result of a single Test. This includes a list of +// TestPartResults, a list of TestProperties, a count of how many +// death tests there are in the Test, and how much time it took to run +// the Test. +// +// TestResult is not copyable. +class GTEST_API_ TestResult { + public: + // Creates an empty TestResult. + TestResult(); + + // D'tor. Do not inherit from TestResult. + ~TestResult(); + + // Gets the number of all test parts. This is the sum of the number + // of successful test parts and the number of failed test parts. + int total_part_count() const; + + // Returns the number of the test properties. + int test_property_count() const; + + // Returns true if and only if the test passed (i.e. no test part failed). + bool Passed() const { return !Skipped() && !Failed(); } + + // Returns true if and only if the test was skipped. + bool Skipped() const; + + // Returns true if and only if the test failed. + bool Failed() const; + + // Returns true if and only if the test fatally failed. + bool HasFatalFailure() const; + + // Returns true if and only if the test has a non-fatal failure. + bool HasNonfatalFailure() const; + + // Returns the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const { return elapsed_time_; } + + // Gets the time of the test case start, in ms from the start of the + // UNIX epoch. 
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+  // Returns the i-th test part result among all the results.  i can range
+  // from 0 to total_part_count() - 1.  If i is not in that range, aborts
+  // the program.
+  const TestPartResult &GetTestPartResult(int i) const;
+
+  // Returns the i-th test property.  i can range from 0 to
+  // test_property_count() - 1.  If i is not in that range, aborts the
+  // program.
+  const TestProperty &GetTestProperty(int i) const;
+
+ private:
+  friend class TestInfo;
+  friend class TestSuite;
+  friend class UnitTest;
+  friend class internal::DefaultGlobalTestPartResultReporter;
+  friend class internal::ExecDeathTest;
+  friend class internal::TestResultAccessor;
+  friend class internal::UnitTestImpl;
+  friend class internal::WindowsDeathTest;
+  friend class internal::FuchsiaDeathTest;
+
+  // Gets the vector of TestPartResults.
+  const std::vector<TestPartResult> &test_part_results() const {
+    return test_part_results_;
+  }
+
+  // Gets the vector of TestProperties.
+  const std::vector<TestProperty> &test_properties() const {
+    return test_properties_;
+  }
+
+  // Sets the start time.
+  void set_start_timestamp(TimeInMillis start) { start_timestamp_ = start; }
+
+  // Sets the elapsed time.
+  void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; }
+
+  // Adds a test property to the list.  The property is validated and may add
+  // a non-fatal failure if invalid (e.g., if it conflicts with reserved
+  // key names).  If a property is already recorded for the same key, the
+  // value will be updated, rather than storing multiple values for the same
+  // key.  xml_element specifies the element for which the property is being
+  // recorded and is used for validation.
+  void RecordProperty(const std::string &xml_element,
+                      const TestProperty &test_property);
+
+  // Adds a failure if the key is a reserved attribute of Google Test
+  // testsuite tags.  Returns true if the property is valid.
+  // FIXME: Validate attribute names are legal and human readable.
+  static bool ValidateTestProperty(const std::string &xml_element,
+                                   const TestProperty &test_property);
+
+  // Adds a test part result to the list.
+  void AddTestPartResult(const TestPartResult &test_part_result);
+
+  // Returns the death test count.
+  int death_test_count() const { return death_test_count_; }
+
+  // Increments the death test count, returning the new count.
+  int increment_death_test_count() { return ++death_test_count_; }
+
+  // Clears the test part results.
+  void ClearTestPartResults();
+
+  // Clears the object.
+  void Clear();
+
+  // Protects mutable state of the property vector and of owned
+  // properties, whose values may be updated.
+  internal::Mutex test_properites_mutex_;
+
+  // The vector of TestPartResults.
+  std::vector<TestPartResult> test_part_results_;
+  // The vector of TestProperties.
+  std::vector<TestProperty> test_properties_;
+  // Running count of death tests.
+  int death_test_count_;
+  // The start time, in milliseconds since UNIX Epoch.
+  TimeInMillis start_timestamp_;
+  // The elapsed time, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+  // We disallow copying TestResult.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult);
+};  // class TestResult
+
+// A TestInfo object stores the following information about a test:
+//
+//   Test suite name
+//   Test name
+//   Whether the test should be run
+//   A function pointer that creates the test object when invoked
+//   Test result
+//
+// The constructor of TestInfo registers itself with the UnitTest
+// singleton such that the RUN_ALL_TESTS() macro knows which tests to
+// run.
+class GTEST_API_ TestInfo {
+ public:
+  // Destructs a TestInfo object.  This function is not virtual, so
+  // don't inherit from TestInfo.
+  ~TestInfo();
+
+  // Returns the test suite name.
+  const char *test_suite_name() const { return test_suite_name_.c_str(); }
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  const char *test_case_name() const { return test_suite_name(); }
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Returns the test name.
+  const char *name() const { return name_.c_str(); }
+
+  // Returns the name of the parameter type, or NULL if this is not a typed
+  // or a type-parameterized test.
+  const char *type_param() const {
+    if (type_param_.get() != nullptr) return type_param_->c_str();
+    return nullptr;
+  }
+
+  // Returns the text representation of the value parameter, or NULL if this
+  // is not a value-parameterized test.
+  const char *value_param() const {
+    if (value_param_.get() != nullptr) return value_param_->c_str();
+    return nullptr;
+  }
+
+  // Returns the file name where this test is defined.
+  const char *file() const { return location_.file.c_str(); }
+
+  // Returns the line where this test is defined.
+  int line() const { return location_.line; }
+
+  // Returns true if this test should not be run because it's in another shard.
+  bool is_in_another_shard() const { return is_in_another_shard_; }
+
+  // Returns true if this test should run, that is, if the test is not
+  // disabled (or it is disabled but the also_run_disabled_tests flag has
+  // been specified) and its full name matches the user-specified filter.
+  //
+  // Google Test allows the user to filter the tests by their full names.
+  // The full name of a test Bar in test suite Foo is defined as
+  // "Foo.Bar".  Only the tests that match the filter will run.
+  //
+  // A filter is a colon-separated list of glob (not regex) patterns,
+  // optionally followed by a '-' and a colon-separated list of
+  // negative patterns (tests to exclude).  A test is run if it
+  // matches one of the positive patterns and does not match any of
+  // the negative patterns.
+  //
+  // For example, *A*:Foo.* is a filter that matches any string that
+  // contains the character 'A' or starts with "Foo.".
+  bool should_run() const { return should_run_; }
+
+  // Returns true if and only if this test will appear in the XML report.
+  bool is_reportable() const {
+    // The XML report includes tests matching the filter, excluding those
+    // run in other shards.
+    return matches_filter_ && !is_in_another_shard_;
+  }
+
+  // Returns the result of the test.
+  const TestResult *result() const { return &result_; }
+
+ private:
+#if GTEST_HAS_DEATH_TEST
+  friend class internal::DefaultDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+  friend class Test;
+  friend class TestSuite;
+  friend class internal::UnitTestImpl;
+  friend class internal::StreamingListenerTest;
+  friend TestInfo *internal::MakeAndRegisterTestInfo(
+      const char *test_suite_name, const char *name, const char *type_param,
+      const char *value_param, internal::CodeLocation code_location,
+      internal::TypeId fixture_class_id, internal::SetUpTestSuiteFunc set_up_tc,
+      internal::TearDownTestSuiteFunc tear_down_tc,
+      internal::TestFactoryBase *factory);
+
+  // Constructs a TestInfo object.  The newly constructed instance assumes
+  // ownership of the factory object.
+  TestInfo(const std::string &test_suite_name, const std::string &name,
+           const char *a_type_param,   // NULL if not a type-parameterized test
+           const char *a_value_param,  // NULL if not a value-parameterized test
+           internal::CodeLocation a_code_location,
+           internal::TypeId fixture_class_id,
+           internal::TestFactoryBase *factory);
+
+  // Increments the number of death tests encountered in this test so
+  // far.
+  int increment_death_test_count() {
+    return result_.increment_death_test_count();
+  }
+
+  // Creates the test object, runs it, records its result, and then
+  // deletes it.
+  void Run();
+
+  static void ClearTestResult(TestInfo *test_info) {
+    test_info->result_.Clear();
+  }
+
+  // These fields are immutable properties of the test.
+  const std::string test_suite_name_;  // test suite name
+  const std::string name_;             // Test name
+  // Name of the parameter type, or NULL if this is not a typed or a
+  // type-parameterized test.
+  const std::unique_ptr<const ::std::string> type_param_;
+  // Text representation of the value parameter, or NULL if this is not a
+  // value-parameterized test.
+  const std::unique_ptr<const ::std::string> value_param_;
+  internal::CodeLocation location_;
+  const internal::TypeId fixture_class_id_;  // ID of the test fixture class
+  bool should_run_;           // True if and only if this test should run
+  bool is_disabled_;          // True if and only if this test is disabled
+  bool matches_filter_;       // True if this test matches the
+                              // user-specified filter.
+  bool is_in_another_shard_;  // Will be run in another shard.
+  internal::TestFactoryBase *const factory_;  // The factory that creates
+                                              // the test object
+
+  // This field is mutable and needs to be reset before running the
+  // test for the second time.
+  TestResult result_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo);
+};
+
+// A test suite, which consists of a vector of TestInfos.
+//
+// TestSuite is not copyable.
+class GTEST_API_ TestSuite {
+ public:
+  // Creates a TestSuite with the given name.
+  //
+  // TestSuite does NOT have a default constructor.  Always use this
+  // constructor to create a TestSuite object.
+  //
+  // Arguments:
+  //
+  //   name:          name of the test suite
+  //   a_type_param:  the name of the test's type parameter, or NULL if
+  //                  this is not a type-parameterized test.
+  //   set_up_tc:     pointer to the function that sets up the test suite
+  //   tear_down_tc:  pointer to the function that tears down the test suite
+  TestSuite(const char *name, const char *a_type_param,
+            internal::SetUpTestSuiteFunc set_up_tc,
+            internal::TearDownTestSuiteFunc tear_down_tc);
+
+  // Destructor of TestSuite.
+  virtual ~TestSuite();
+
+  // Gets the name of the TestSuite.
+  const char *name() const { return name_.c_str(); }
+
+  // Returns the name of the parameter type, or NULL if this is not a
+  // type-parameterized test suite.
+  const char *type_param() const {
+    if (type_param_.get() != nullptr) return type_param_->c_str();
+    return nullptr;
+  }
+
+  // Returns true if any test in this test suite should run.
+  bool should_run() const { return should_run_; }
+
+  // Gets the number of successful tests in this test suite.
+  int successful_test_count() const;
+
+  // Gets the number of skipped tests in this test suite.
+  int skipped_test_count() const;
+
+  // Gets the number of failed tests in this test suite.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests in this test suite.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of tests in this test suite that should run.
+  int test_to_run_count() const;
+
+  // Gets the number of all tests in this test suite.
+  int total_test_count() const;
+
+  // Returns true if and only if the test suite passed.
+  bool Passed() const { return !Failed(); }
+
+  // Returns true if and only if the test suite failed.
+  bool Failed() const {
+    return failed_test_count() > 0 || ad_hoc_test_result().Failed();
+  }
+
+  // Returns the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Gets the time of the test suite start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+  // Returns the i-th test among all the tests.  i can range from 0 to
+  // total_test_count() - 1.  If i is not in that range, returns NULL.
+  const TestInfo *GetTestInfo(int i) const;
+
+  // Returns the TestResult that holds test properties recorded during
+  // execution of SetUpTestSuite and TearDownTestSuite.
+  const TestResult &ad_hoc_test_result() const { return ad_hoc_test_result_; }
+
+ private:
+  friend class Test;
+  friend class internal::UnitTestImpl;
+
+  // Gets the (mutable) vector of TestInfos in this TestSuite.
+  std::vector<TestInfo *> &test_info_list() { return test_info_list_; }
+
+  // Gets the (immutable) vector of TestInfos in this TestSuite.
+  const std::vector<TestInfo *> &test_info_list() const {
+    return test_info_list_;
+  }
+
+  // Returns the i-th test among all the tests.  i can range from 0 to
+  // total_test_count() - 1.  If i is not in that range, returns NULL.
+  TestInfo *GetMutableTestInfo(int i);
+
+  // Sets the should_run member.
+  void set_should_run(bool should) { should_run_ = should; }
+
+  // Adds a TestInfo to this test suite.  Will delete the TestInfo upon
+  // destruction of the TestSuite object.
+  void AddTestInfo(TestInfo *test_info);
+
+  // Clears the results of all tests in this test suite.
+  void ClearResult();
+
+  // Clears the results of all tests in the given test suite.
+  static void ClearTestSuiteResult(TestSuite *test_suite) {
+    test_suite->ClearResult();
+  }
+
+  // Runs every test in this TestSuite.
+  void Run();
+
+  // Runs SetUpTestSuite() for this TestSuite.  This wrapper is needed
+  // for catching exceptions thrown from SetUpTestSuite().
+  void RunSetUpTestSuite() {
+    if (set_up_tc_ != nullptr) {
+      (*set_up_tc_)();
+    }
+  }
+
+  // Runs TearDownTestSuite() for this TestSuite.  This wrapper is
+  // needed for catching exceptions thrown from TearDownTestSuite().
+  void RunTearDownTestSuite() {
+    if (tear_down_tc_ != nullptr) {
+      (*tear_down_tc_)();
+    }
+  }
+
+  // Returns true if and only if the test passed.
+  static bool TestPassed(const TestInfo *test_info) {
+    return test_info->should_run() && test_info->result()->Passed();
+  }
+
+  // Returns true if and only if the test was skipped.
+  static bool TestSkipped(const TestInfo *test_info) {
+    return test_info->should_run() && test_info->result()->Skipped();
+  }
+
+  // Returns true if and only if the test failed.
+  static bool TestFailed(const TestInfo *test_info) {
+    return test_info->should_run() && test_info->result()->Failed();
+  }
+
+  // Returns true if and only if the test is disabled and will be reported in
+  // the XML report.
+  static bool TestReportableDisabled(const TestInfo *test_info) {
+    return test_info->is_reportable() && test_info->is_disabled_;
+  }
+
+  // Returns true if and only if the test is disabled.
+  static bool TestDisabled(const TestInfo *test_info) {
+    return test_info->is_disabled_;
+  }
+
+  // Returns true if and only if this test will appear in the XML report.
+  static bool TestReportable(const TestInfo *test_info) {
+    return test_info->is_reportable();
+  }
+
+  // Returns true if the given test should run.
+  static bool ShouldRunTest(const TestInfo *test_info) {
+    return test_info->should_run();
+  }
+
+  // Shuffles the tests in this test suite.
+  void ShuffleTests(internal::Random *random);
+
+  // Restores the test order to before the first shuffle.
+  void UnshuffleTests();
+
+  // Name of the test suite.
+  std::string name_;
+  // Name of the parameter type, or NULL if this is not a typed or a
+  // type-parameterized test.
+  const std::unique_ptr<const ::std::string> type_param_;
+  // The vector of TestInfos in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestInfo *> test_info_list_;
+  // Provides a level of indirection for the test list to allow easy
+  // shuffling and restoring the test order.  The i-th element in this
+  // vector is the index of the i-th test in the shuffled test list.
+  std::vector<int> test_indices_;
+  // Pointer to the function that sets up the test suite.
+  internal::SetUpTestSuiteFunc set_up_tc_;
+  // Pointer to the function that tears down the test suite.
+  internal::TearDownTestSuiteFunc tear_down_tc_;
+  // True if and only if any test in this test suite should run.
+  bool should_run_;
+  // The start time, in milliseconds since UNIX Epoch.
+  TimeInMillis start_timestamp_;
+  // Elapsed time, in milliseconds.
+  TimeInMillis elapsed_time_;
+  // Holds test properties recorded during execution of SetUpTestSuite and
+  // TearDownTestSuite.
+  TestResult ad_hoc_test_result_;
+
+  // We disallow copying TestSuites.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestSuite);
+};
+
+// An Environment object is capable of setting up and tearing down an
+// environment.  You should subclass this to define your own
+// environment(s).
+//
+// An Environment object does the set-up and tear-down in virtual
+// methods SetUp() and TearDown() instead of the constructor and the
+// destructor, as:
+//
+//   1. You cannot safely throw from a destructor.  This is a problem
+//      as in some cases Google Test is used where exceptions are
+//      enabled, and we may want to implement ASSERT_* using exceptions
+//      where they are available.
+//   2. You cannot use ASSERT_* directly in a constructor or
+//      destructor.
+class Environment {
+ public:
+  // The d'tor is virtual as we need to subclass Environment.
+  virtual ~Environment() {}
+
+  // Override this to define how to set up the environment.
+  virtual void SetUp() {}
+
+  // Override this to define how to tear down the environment.
+  virtual void TearDown() {}
+
+ private:
+  // If you see an error about overriding the following function or
+  // about it being private, you have mis-spelled SetUp() as Setup().
+  struct Setup_should_be_spelled_SetUp {};
+  virtual Setup_should_be_spelled_SetUp *Setup() { return nullptr; }
+};
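+
+// For illustration, a minimal sketch of a custom environment
+// (FooEnvironment is a hypothetical name; the set-up/tear-down bodies
+// are up to the user):
+//
+//   class FooEnvironment : public testing::Environment {
+//    public:
+//     void SetUp() override { /* acquire resources shared by all tests */ }
+//     void TearDown() override { /* release them */ }
+//   };
+//
+// See AddGlobalTestEnvironment() below for how to register one.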
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Exception which can be thrown from TestEventListener::OnTestPartResult.
+class GTEST_API_ AssertionException
+    : public internal::GoogleTestFailureException {
+ public:
+  explicit AssertionException(const TestPartResult &result)
+      : GoogleTestFailureException(result) {}
+};
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// The interface for tracing execution of tests.  The methods are organized in
+// the order the corresponding events are fired.
+class TestEventListener {
+ public:
+  virtual ~TestEventListener() {}
+
+  // Fired before any test activity starts.
+  virtual void OnTestProgramStart(const UnitTest &unit_test) = 0;
+
+  // Fired before each iteration of tests starts.  There may be more than
+  // one iteration if GTEST_FLAG(repeat) is set.  iteration is the iteration
+  // index, starting from 0.
+  virtual void OnTestIterationStart(const UnitTest &unit_test,
+                                    int iteration) = 0;
+
+  // Fired before environment set-up for each iteration of tests starts.
+  virtual void OnEnvironmentsSetUpStart(const UnitTest &unit_test) = 0;
+
+  // Fired after environment set-up for each iteration of tests ends.
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest &unit_test) = 0;
+
+  // Fired before the test suite starts.
+  virtual void OnTestSuiteStart(const TestSuite & /*test_suite*/) {}
+
+  // Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  virtual void OnTestCaseStart(const TestCase & /*test_case*/) {}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Fired before the test starts.
+  virtual void OnTestStart(const TestInfo &test_info) = 0;
+
+  // Fired after a failed assertion or a SUCCEED() invocation.
+  // If you want to throw an exception from this function to skip to the next
+  // TEST, it must be an AssertionException as defined above, or inherited
+  // from it.
+  virtual void OnTestPartResult(const TestPartResult &test_part_result) = 0;
+
+  // Fired after the test ends.
+  virtual void OnTestEnd(const TestInfo &test_info) = 0;
+
+  // Fired after the test suite ends.
+  virtual void OnTestSuiteEnd(const TestSuite & /*test_suite*/) {}
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  virtual void OnTestCaseEnd(const TestCase & /*test_case*/) {}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Fired before environment tear-down for each iteration of tests starts.
+  virtual void OnEnvironmentsTearDownStart(const UnitTest &unit_test) = 0;
+
+  // Fired after environment tear-down for each iteration of tests ends.
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest &unit_test) = 0;
+
+  // Fired after each iteration of tests finishes.
+  virtual void OnTestIterationEnd(const UnitTest &unit_test, int iteration) = 0;
+
+  // Fired after all test activities have ended.
+  virtual void OnTestProgramEnd(const UnitTest &unit_test) = 0;
+};
+
+// The convenience class for users who need to override just one or two
+// methods and are not concerned that a possible change to a signature of
+// the methods they override will not be caught during the build.  For
+// comments about each method please see the definition of TestEventListener
+// above.
+class EmptyTestEventListener : public TestEventListener {
+ public:
+  void OnTestProgramStart(const UnitTest & /*unit_test*/) override {}
+  void OnTestIterationStart(const UnitTest & /*unit_test*/,
+                            int /*iteration*/) override {}
+  void OnEnvironmentsSetUpStart(const UnitTest & /*unit_test*/) override {}
+  void OnEnvironmentsSetUpEnd(const UnitTest & /*unit_test*/) override {}
+  void OnTestSuiteStart(const TestSuite & /*test_suite*/) override {}
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseStart(const TestCase & /*test_case*/) override {}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnTestStart(const TestInfo & /*test_info*/) override {}
+  void OnTestPartResult(const TestPartResult & /*test_part_result*/) override {}
+  void OnTestEnd(const TestInfo & /*test_info*/) override {}
+  void OnTestSuiteEnd(const TestSuite & /*test_suite*/) override {}
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseEnd(const TestCase & /*test_case*/) override {}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnEnvironmentsTearDownStart(const UnitTest & /*unit_test*/) override {}
+  void OnEnvironmentsTearDownEnd(const UnitTest & /*unit_test*/) override {}
+  void OnTestIterationEnd(const UnitTest & /*unit_test*/,
+                          int /*iteration*/) override {}
+  void OnTestProgramEnd(const UnitTest & /*unit_test*/) override {}
+};
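+
+// For illustration, a minimal sketch of a listener that only reacts to test
+// starts (MinimalistPrinter is a hypothetical name), registered via the
+// TestEventListeners class declared just below:
+//
+//   class MinimalistPrinter : public testing::EmptyTestEventListener {
+//     void OnTestStart(const testing::TestInfo &test_info) override {
+//       printf("*** Test %s.%s starting.\n",
+//              test_info.test_suite_name(), test_info.name());
+//     }
+//   };
+//
+//   testing::UnitTest::GetInstance()->listeners().Append(
+//       new MinimalistPrinter);  // Google Test takes ownership.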
+
+// TestEventListeners lets users add listeners to track events in Google Test.
+class GTEST_API_ TestEventListeners {
+ public:
+  TestEventListeners();
+  ~TestEventListeners();
+
+  // Appends an event listener to the end of the list.  Google Test assumes
+  // ownership of the listener (i.e. it will delete the listener when
+  // the test program finishes).
+  void Append(TestEventListener *listener);
+
+  // Removes the given event listener from the list and returns it.  It then
+  // becomes the caller's responsibility to delete the listener.  Returns
+  // NULL if the listener is not found in the list.
+  TestEventListener *Release(TestEventListener *listener);
+
+  // Returns the standard listener responsible for the default console
+  // output.  Can be removed from the listeners list to shut down default
+  // console output.  Note that removing this object from the listener list
+  // with Release transfers its ownership to the caller and makes this
+  // function return NULL the next time.
+  TestEventListener *default_result_printer() const {
+    return default_result_printer_;
+  }
+
+  // Returns the standard listener responsible for the default XML output
+  // controlled by the --gtest_output=xml flag.  Can be removed from the
+  // listeners list by users who want to shut down the default XML output
+  // controlled by this flag and substitute it with a custom one.  Note that
+  // removing this object from the listener list with Release transfers its
+  // ownership to the caller and makes this function return NULL the next
+  // time.
+  TestEventListener *default_xml_generator() const {
+    return default_xml_generator_;
+  }
+
+ private:
+  friend class TestSuite;
+  friend class TestInfo;
+  friend class internal::DefaultGlobalTestPartResultReporter;
+  friend class internal::NoExecDeathTest;
+  friend class internal::TestEventListenersAccessor;
+  friend class internal::UnitTestImpl;
+
+  // Returns the repeater that broadcasts the TestEventListener events to all
+  // subscribers.
+  TestEventListener *repeater();
+
+  // Sets the default_result_printer attribute to the provided listener.
+  // The listener is also added to the listener list, and the previous
+  // default_result_printer is removed from it and deleted.  The listener can
+  // also be NULL, in which case it will not be added to the list.  Does
+  // nothing if the previous and the current listener objects are the same.
+  void SetDefaultResultPrinter(TestEventListener *listener);
+
+  // Sets the default_xml_generator attribute to the provided listener.  The
+  // listener is also added to the listener list, and the previous
+  // default_xml_generator is removed from it and deleted.  The listener can
+  // also be NULL, in which case it will not be added to the list.  Does
+  // nothing if the previous and the current listener objects are the same.
+  void SetDefaultXmlGenerator(TestEventListener *listener);
+
+  // Controls whether events will be forwarded by the repeater to the
+  // listeners in the list.
+  bool EventForwardingEnabled() const;
+  void SuppressEventForwarding();
+
+  // The actual list of listeners.
+  internal::TestEventRepeater *repeater_;
+  // Listener responsible for the standard result output.
+  TestEventListener *default_result_printer_;
+  // Listener responsible for the creation of the XML output file.
+  TestEventListener *default_xml_generator_;
+
+  // We disallow copying TestEventListeners.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners);
+};
+
+// A UnitTest consists of a vector of TestSuites.
+//
+// This is a singleton class.  The only instance of UnitTest is
+// created when UnitTest::GetInstance() is first called.  This
+// instance is never deleted.
+//
+// UnitTest is not copyable.
+//
+// This class is thread-safe as long as the methods are called
+// according to their specification.
+class GTEST_API_ UnitTest {
+ public:
+  // Gets the singleton UnitTest object.  The first time this method
+  // is called, a UnitTest object is constructed and returned.
+  // Consecutive calls will return the same object.
+  static UnitTest *GetInstance();
+
+  // Runs all tests in this UnitTest object and prints the result.
+  // Returns 0 if successful, or 1 otherwise.
+  //
+  // This method can only be called from the main thread.
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  int Run() GTEST_MUST_USE_RESULT_;
+
+  // Returns the working directory when the first TEST() or TEST_F()
+  // was executed.  The UnitTest object owns the string.
+  const char *original_working_dir() const;
+
+  // Returns the TestSuite object for the test that's currently running,
+  // or NULL if no test is running.
+  const TestSuite *current_test_suite() const GTEST_LOCK_EXCLUDED_(mutex_);
+
+// Legacy API is still available but deprecated
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  const TestCase *current_test_case() const GTEST_LOCK_EXCLUDED_(mutex_);
+#endif
+
+  // Returns the TestInfo object for the test that's currently running,
+  // or NULL if no test is running.
+  const TestInfo *current_test_info() const GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Returns the random seed used at the start of the current test run.
+  int random_seed() const;
+
+  // Returns the ParameterizedTestSuiteRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  internal::ParameterizedTestSuiteRegistry &parameterized_test_registry()
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Gets the number of successful test suites.
+  int successful_test_suite_count() const;
+
+  // Gets the number of failed test suites.
+  int failed_test_suite_count() const;
+
+  // Gets the number of all test suites.
+  int total_test_suite_count() const;
+
+  // Gets the number of all test suites that contain at least one test
+  // that should run.
+  int test_suite_to_run_count() const;
+
+  // Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  int successful_test_case_count() const;
+  int failed_test_case_count() const;
+  int total_test_case_count() const;
+  int test_case_to_run_count() const;
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of skipped tests.
+  int skipped_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const;
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const;
+
+  // Returns true if and only if the unit test passed (i.e. all test suites
+  // passed).
+  bool Passed() const;
+
+  // Returns true if and only if the unit test failed (i.e. some test suite
+  // failed or something outside of all tests failed).
+  bool Failed() const;
+
+  // Gets the i-th test suite among all the test suites.  i can range from 0 to
+  // total_test_suite_count() - 1.  If i is not in that range, returns NULL.
+  const TestSuite *GetTestSuite(int i) const;
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  const TestCase *GetTestCase(int i) const;
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Returns the TestResult containing information on test failures and
+  // properties logged outside of individual test suites.
+  const TestResult &ad_hoc_test_result() const;
+
+  // Returns the list of event listeners that can be used to track events
+  // inside Google Test.
+  TestEventListeners &listeners();
+
+ private:
+  // Registers and returns a global test environment.  When a test
+  // program is run, all global test environments will be set-up in
+  // the order they were registered.  After all tests in the program
+  // have finished, all global test environments will be torn-down in
+  // the *reverse* order they were registered.
+  //
+  // The UnitTest object takes ownership of the given environment.
+  //
+  // This method can only be called from the main thread.
+  Environment *AddEnvironment(Environment *env);
+
+  // Adds a TestPartResult to the current TestResult object.  All
+  // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc)
+  // eventually call this to report their results.  The user code
+  // should use the assertion macros instead of calling this directly.
+  void AddTestPartResult(TestPartResult::Type result_type,
+                         const char *file_name, int line_number,
+                         const std::string &message,
+                         const std::string &os_stack_trace)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Adds a TestProperty to the current TestResult object when invoked from
+  // inside a test, to the current TestSuite's ad_hoc_test_result_ when invoked
+  // from SetUpTestSuite or TearDownTestSuite, or to the global property set
+  // when invoked elsewhere.  If the result already contains a property with
+  // the same key, the value will be updated.
+  void RecordProperty(const std::string &key, const std::string &value);
+
+  // Gets the i-th test suite among all the test suites.  i can range from 0 to
+  // total_test_suite_count() - 1.  If i is not in that range, returns NULL.
+  TestSuite *GetMutableTestSuite(int i);
+
+  // Accessors for the implementation object.
+  internal::UnitTestImpl *impl() { return impl_; }
+  const internal::UnitTestImpl *impl() const { return impl_; }
+
+  // These classes and functions are friends as they need to access private
+  // members of UnitTest.
+  friend class ScopedTrace;
+  friend class Test;
+  friend class internal::AssertHelper;
+  friend class internal::StreamingListenerTest;
+  friend class internal::UnitTestRecordPropertyTestHelper;
+  friend Environment *AddGlobalTestEnvironment(Environment *env);
+  friend std::set<std::string> *internal::GetIgnoredParameterizedTestSuites();
+  friend internal::UnitTestImpl *internal::GetUnitTestImpl();
+  friend void internal::ReportFailureInUnknownLocation(
+      TestPartResult::Type result_type, const std::string &message);
+
+  // Creates an empty UnitTest.
+  UnitTest();
+
+  // D'tor
+  virtual ~UnitTest();
+
+  // Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+  // Google Test trace stack.
+  void PushGTestTrace(const internal::TraceInfo &trace)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Pops a trace from the per-thread Google Test trace stack.
+  void PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Protects mutable state in *impl_.  This is mutable as some const
+  // methods need to lock it too.
+  mutable internal::Mutex mutex_;
+
+  // Opaque implementation object.  This field is never changed once
+  // the object is constructed.  We don't mark it as const here, as
+  // doing so will cause a warning in the constructor of UnitTest.
+  // Mutable state in *impl_ is protected by mutex_.
+  internal::UnitTestImpl *impl_;
+
+  // We disallow copying UnitTest.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest);
+};
+
+// A convenient wrapper for adding an environment for the test
+// program.
+//
+// You should call this before RUN_ALL_TESTS() is called, probably in
+// main().  If you use gtest_main, you need to call this before main()
+// starts for it to take effect.  For example, you can define a global
+// variable like this:
+//
+//   testing::Environment* const foo_env =
+//       testing::AddGlobalTestEnvironment(new FooEnvironment);
+//
+// However, we strongly recommend that you write your own main() and
+// call AddGlobalTestEnvironment() there, as relying on initialization
+// of global variables makes the code harder to read and may cause
+// problems when you register multiple environments from different
+// translation units and the environments have dependencies among them
+// (remember that the compiler doesn't guarantee the order in which
+// global variables from different translation units are initialized).
+inline Environment *AddGlobalTestEnvironment(Environment *env) {
+  return UnitTest::GetInstance()->AddEnvironment(env);
+}
+
+// Initializes Google Test.  This must be called before calling
+// RUN_ALL_TESTS().  In particular, it parses a command line for the
+// flags that Google Test recognizes.  Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+GTEST_API_ void InitGoogleTest(int *argc, char **argv);
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+GTEST_API_ void InitGoogleTest(int *argc, wchar_t **argv);
+
+// This overloaded version can be used on Arduino/embedded platforms where
+// there is no argc/argv.
+GTEST_API_ void InitGoogleTest();
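+
+// For illustration, the typical way to wire this up in a user-written
+// main() (FooEnvironment is a hypothetical Environment subclass; the
+// AddGlobalTestEnvironment call is optional):
+//
+//   int main(int argc, char **argv) {
+//     testing::InitGoogleTest(&argc, argv);
+//     testing::AddGlobalTestEnvironment(new FooEnvironment);
+//     return RUN_ALL_TESTS();
+//   }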
+
+namespace internal {
+
+// Separate the error generating code from the code path to reduce the stack
+// frame size of CmpHelperEQ.  This helps reduce the overhead of some
+// sanitizers when calling EXPECT_* in a tight loop.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQFailure(const char *lhs_expression,
+                                   const char *rhs_expression, const T1 &lhs,
+                                   const T2 &rhs) {
+  return EqFailure(lhs_expression, rhs_expression,
+                   FormatForComparisonFailureMessage(lhs, rhs),
+                   FormatForComparisonFailureMessage(rhs, lhs), false);
+}
+
+// This block of code defines operator==/!=
+// to block lexical scope lookup.
+// It prevents using invalid operator==/!= defined at namespace scope.
+struct faketype {};
+inline bool operator==(faketype, faketype) { return true; }
+inline bool operator!=(faketype, faketype) { return false; }
+
+// The helper function for {ASSERT|EXPECT}_EQ.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQ(const char *lhs_expression,
+                            const char *rhs_expression, const T1 &lhs,
+                            const T2 &rhs) {
+  if (lhs == rhs) {
+    return AssertionSuccess();
+  }
+
+  return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs);
+}
+
+// With this overloaded version, we allow anonymous enums to be used
+// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums
+// can be implicitly cast to BiggestInt.
+GTEST_API_ AssertionResult CmpHelperEQ(const char *lhs_expression,
+                                       const char *rhs_expression,
+                                       BiggestInt lhs, BiggestInt rhs);
+
+class EqHelper {
+ public:
+  // This templatized version is for the general case.
+  template <
+      typename T1, typename T2,
+      // Disable this overload for cases where one argument is a pointer
+      // and the other is the null pointer constant.
+      typename std::enable_if<!std::is_integral<T1>::value ||
+                              !std::is_pointer<T2>::value>::type * = nullptr>
+  static AssertionResult Compare(const char *lhs_expression,
+                                 const char *rhs_expression, const T1 &lhs,
+                                 const T2 &rhs) {
+    return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
+  }
+
+  // With this overloaded version, we allow anonymous enums to be used
+  // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous
+  // enums can be implicitly cast to BiggestInt.
+  //
+  // Even though its body looks the same as the above version, we
+  // cannot merge the two, as it will make anonymous enums unhappy.
+  static AssertionResult Compare(const char *lhs_expression,
+                                 const char *rhs_expression, BiggestInt lhs,
+                                 BiggestInt rhs) {
+    return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
+  }
+
+  template <typename T>
+  static AssertionResult Compare(
+      const char *lhs_expression, const char *rhs_expression,
+      // Handle cases where '0' is used as a null pointer literal.
+      std::nullptr_t /* lhs */, T *rhs) {
+    // We already know that 'lhs' is a null pointer.
+    return CmpHelperEQ(lhs_expression, rhs_expression,
+                       static_cast<T *>(nullptr), rhs);
+  }
+};
+
+// Separate the error generating code from the code path to reduce the stack
+// frame size of CmpHelperOP.  This helps reduce the overhead of some
+// sanitizers when calling EXPECT_OP in a tight loop.
+template <typename T1, typename T2>
+AssertionResult CmpHelperOpFailure(const char *expr1, const char *expr2,
+                                   const T1 &val1, const T2 &val2,
+                                   const char *op) {
+  return AssertionFailure()
+         << "Expected: (" << expr1 << ") " << op << " (" << expr2
+         << "), actual: " << FormatForComparisonFailureMessage(val1, val2)
+         << " vs " << FormatForComparisonFailureMessage(val2, val1);
+}
+
+// A macro for implementing the helper functions needed to implement
+// ASSERT_?? and EXPECT_??.  It is here just to avoid copy-and-paste
+// of similar code.
+//
+// For each templatized helper function, we also define an overloaded
+// version for BiggestInt in order to reduce code bloat and allow
+// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled
+// with gcc 4.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)                                 \
+  template <typename T1, typename T2>                                      \
+  AssertionResult CmpHelper##op_name(const char *expr1, const char *expr2, \
+                                     const T1 &val1, const T2 &val2) {     \
+    if (val1 op val2) {                                                    \
+      return AssertionSuccess();                                           \
+    } else {                                                               \
+      return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);            \
+    }                                                                      \
+  }                                                                        \
+  GTEST_API_ AssertionResult CmpHelper##op_name(                           \
+      const char *expr1, const char *expr2, BiggestInt val1, BiggestInt val2)
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// Implements the helper function for {ASSERT|EXPECT}_NE
+GTEST_IMPL_CMP_HELPER_(NE, !=);
+// Implements the helper function for {ASSERT|EXPECT}_LE
+GTEST_IMPL_CMP_HELPER_(LE, <=);
+// Implements the helper function for {ASSERT|EXPECT}_LT
+GTEST_IMPL_CMP_HELPER_(LT, <);
+// Implements the helper function for {ASSERT|EXPECT}_GE
+GTEST_IMPL_CMP_HELPER_(GE, >=);
+// Implements the helper function for {ASSERT|EXPECT}_GT
+GTEST_IMPL_CMP_HELPER_(GT, >);
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char *s1_expression,
+                                          const char *s2_expression,
+                                          const char *s1, const char *s2);
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char *s1_expression,
+                                              const char *s2_expression,
+                                              const char *s1, const char *s2);
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char *s1_expression,
+                                          const char *s2_expression,
+                                          const char *s1, const char *s2);
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char *s1_expression,
+                                              const char *s2_expression,
+                                              const char *s1, const char *s2);
+
+// Helper function for *_STREQ on wide strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char *s1_expression,
+                                          const char *s2_expression,
+                                          const wchar_t *s1, const wchar_t *s2);
+
+// Helper function for *_STRNE on wide strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char *s1_expression,
+                                          const char *s2_expression,
+                                          const wchar_t *s1, const wchar_t *s2);
+
+}  // namespace internal
+
+// IsSubstring() and IsNotSubstring() are intended to be used as the
+// first argument to {EXPECT,ASSERT}_PRED_FORMAT2(), not by
+// themselves.  They check whether needle is a substring of haystack
+// (NULL is considered a substring of itself only), and return an
+// appropriate error message when they fail.
+//
+// The {needle,haystack}_expr arguments are the stringified
+// expressions that generated the two real arguments.
+GTEST_API_ AssertionResult IsSubstring(const char *needle_expr,
+                                       const char *haystack_expr,
+                                       const char *needle,
+                                       const char *haystack);
+GTEST_API_ AssertionResult IsSubstring(const char *needle_expr,
+                                       const char *haystack_expr,
+                                       const wchar_t *needle,
+                                       const wchar_t *haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char *needle_expr,
+                                          const char *haystack_expr,
+                                          const char *needle,
+                                          const char *haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char *needle_expr,
+                                          const char *haystack_expr,
+                                          const wchar_t *needle,
+                                          const wchar_t *haystack);
+GTEST_API_ AssertionResult IsSubstring(const char *needle_expr,
+                                       const char *haystack_expr,
+                                       const ::std::string &needle,
+                                       const ::std::string &haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char *needle_expr,
+                                          const char *haystack_expr,
+                                          const ::std::string &needle,
+                                          const ::std::string &haystack);
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ AssertionResult IsSubstring(const char *needle_expr,
+                                       const char *haystack_expr,
+                                       const ::std::wstring &needle,
+                                       const ::std::wstring &haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char *needle_expr,
+                                          const char *haystack_expr,
+                                          const ::std::wstring &needle,
+                                          const ::std::wstring &haystack);
+#endif  // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+// Helper template function for comparing floating-points.
+//
+// Template parameter:
+//
+//   RawType: the raw floating-point type (either float or double)
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename RawType>
+AssertionResult CmpHelperFloatingPointEQ(const char *lhs_expression,
+                                         const char *rhs_expression,
+                                         RawType lhs_value,
+                                         RawType rhs_value) {
+  const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
+
+  if (lhs.AlmostEquals(rhs)) {
+    return AssertionSuccess();
+  }
+
+  ::std::stringstream lhs_ss;
+  lhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+         << lhs_value;
+
+  ::std::stringstream rhs_ss;
+  rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+         << rhs_value;
+
+  return EqFailure(lhs_expression, rhs_expression,
+                   StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss),
+                   false);
+}
+
+// Helper function for implementing ASSERT_NEAR.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult DoubleNearPredFormat(const char *expr1,
+                                                const char *expr2,
+                                                const char *abs_error_expr,
+                                                double val1, double val2,
+                                                double abs_error);
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+// A class that enables one to stream messages to assertion macros
+class GTEST_API_ AssertHelper {
+ public:
+  // Constructor.
+  AssertHelper(TestPartResult::Type type, const char *file, int line,
+               const char *message);
+  ~AssertHelper();
+
+  // Message assignment is a semantic trick to enable assertion
+  // streaming; see the GTEST_MESSAGE_ macro below.
+  void operator=(const Message &message) const;
+
+ private:
+  // We put our data in a struct so that the size of the AssertHelper class can
+  // be as small as possible.  This is important because gcc is incapable of
+  // re-using stack space even for temporary variables, so every EXPECT_EQ
+  // reserves stack space for another AssertHelper.
+  struct AssertHelperData {
+    AssertHelperData(TestPartResult::Type t, const char *srcfile, int line_num,
+                     const char *msg)
+        : type(t), file(srcfile), line(line_num), message(msg) {}
+
+    TestPartResult::Type const type;
+    const char *const file;
+    int const line;
+    std::string const message;
+
+   private:
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
+  };
+
+  AssertHelperData *const data_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
+};
+
+enum GTestColor { COLOR_DEFAULT, COLOR_RED, COLOR_GREEN, COLOR_YELLOW };
+
+GTEST_API_ GTEST_ATTRIBUTE_PRINTF_(2, 3) void ColoredPrintf(GTestColor color,
+                                                            const char *fmt,
+                                                            ...);
+
+}  // namespace internal
+
+// The pure interface class that all value-parameterized tests inherit from.
+// A value-parameterized class must inherit from both ::testing::Test and
+// ::testing::WithParamInterface.  In most cases that just means inheriting
+// from ::testing::TestWithParam<T>, but more complicated test hierarchies
+// may need to inherit from Test and WithParamInterface at different levels.
+//
+// This interface has support for accessing the test parameter value via
+// the GetParam() method.
+//
+// Use it with one of the parameter generator defining functions, like Range(),
+// Values(), ValuesIn(), Bool(), and Combine().
+//
+// class FooTest : public ::testing::TestWithParam<int> {
+//  protected:
+//   FooTest() {
+//     // Can use GetParam() here.
+//   }
+//   ~FooTest() override {
+//     // Can use GetParam() here.
+//   }
+//   void SetUp() override {
+//     // Can use GetParam() here.
+//   }
+//   void TearDown() override {
+//     // Can use GetParam() here.
+//   }
+// };
+// TEST_P(FooTest, DoesBar) {
+//   // Can use GetParam() method here.
+//   Foo foo;
+//   ASSERT_TRUE(foo.DoesBar(GetParam()));
+// }
+// INSTANTIATE_TEST_SUITE_P(OneToTenRange, FooTest, ::testing::Range(1, 10));
+
+template <typename T>
+class WithParamInterface {
+ public:
+  typedef T ParamType;
+  virtual ~WithParamInterface() {}
+
+  // The current parameter value.  Is also available in the test fixture's
+  // constructor.
+  static const ParamType &GetParam() {
+    GTEST_CHECK_(parameter_ != nullptr)
+        << "GetParam() can only be called inside a value-parameterized test "
+        << "-- did you intend to write TEST_P instead of TEST_F?";
+    return *parameter_;
+  }
+
+ private:
+  // Sets parameter value.  The caller is responsible for making sure the value
+  // remains alive and unchanged throughout the current test.
+  static void SetParam(const ParamType *parameter) { parameter_ = parameter; }
+
+  // Static value used for accessing parameter during a test lifetime.
+  static const ParamType *parameter_;
+
+  // TestClass must be a subclass of WithParamInterface<T> and Test.
+  template <class TestClass>
+  friend class internal::ParameterizedTestFactory;
+};
+
+template <typename T>
+const T *WithParamInterface<T>::parameter_ = nullptr;
+
+// Most value-parameterized classes can ignore the existence of
+// WithParamInterface, and can just inherit from ::testing::TestWithParam.
+
+template <typename T>
+class TestWithParam : public Test, public WithParamInterface<T> {};
+
+// Macros for indicating success/failure in test code.
+
+// Skips the test at runtime.  Skipping aborts the current function.
+// Skipped tests are neither successful nor failed.
+#define GTEST_SKIP() GTEST_SKIP_("")
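+
+// For illustration, a minimal sketch of skipping a test when a precondition
+// does not hold (ExternalResourceAvailable() is a hypothetical check):
+//
+//   TEST(SkipTest, DoesSkip) {
+//     if (!ExternalResourceAvailable()) GTEST_SKIP();
+//     // The rest of the body does not run when the test is skipped.
+//     ...
+//   }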
+
+// ADD_FAILURE unconditionally adds a failure to the current test.
+// SUCCEED generates a success - it doesn't automatically make the
+// current test successful, as a test is only successful when it has
+// no failure.
+//
+// EXPECT_* verifies that a certain condition is satisfied.  If not,
+// it behaves like ADD_FAILURE.  In particular:
+//
+//   EXPECT_TRUE  verifies that a Boolean condition is true.
+//   EXPECT_FALSE verifies that a Boolean condition is false.
+//
+// FAIL and ASSERT_* are similar to ADD_FAILURE and EXPECT_*, except
+// that they will also abort the current function on failure.  People
+// usually want the fail-fast behavior of FAIL and ASSERT_*, but those
+// writing data-driven tests often find themselves using ADD_FAILURE
+// and EXPECT_* more.
+
+// Generates a nonfatal failure with a generic message.
+#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed")
+
+// Generates a nonfatal failure at the given source file location with
+// a generic message.
+#define ADD_FAILURE_AT(file, line)        \
+  GTEST_MESSAGE_AT_(file, line, "Failed", \
+                    ::testing::TestPartResult::kNonFatalFailure)
+
+// Generates a fatal failure with a generic message.
+#define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed")
+
+// Like GTEST_FAIL(), but at the given source file location.
+#define GTEST_FAIL_AT(file, line)         \
+  GTEST_MESSAGE_AT_(file, line, "Failed", \
+                    ::testing::TestPartResult::kFatalFailure)
+
+// Define this macro to 1 to omit the definition of FAIL(), which is a
+// generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_FAIL
+#define FAIL() GTEST_FAIL()
+#endif
+
+// Generates a success with a generic message.
+#define GTEST_SUCCEED() GTEST_SUCCESS_("Succeeded")
+
+// Define this macro to 1 to omit the definition of SUCCEED(), which
+// is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_SUCCEED
+#define SUCCEED() GTEST_SUCCEED()
+#endif
+
+// Macros for testing exceptions.
+//
+//   * {ASSERT|EXPECT}_THROW(statement, expected_exception):
+//       Tests that the statement throws the expected exception.
+//   * {ASSERT|EXPECT}_NO_THROW(statement):
+//       Tests that the statement doesn't throw any exception.
+//   * {ASSERT|EXPECT}_ANY_THROW(statement):
+//       Tests that the statement throws an exception.
+
+#define EXPECT_THROW(statement, expected_exception) \
+  GTEST_TEST_THROW_(statement, expected_exception, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_NO_THROW(statement) \
+  GTEST_TEST_NO_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_ANY_THROW(statement) \
+  GTEST_TEST_ANY_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_THROW(statement, expected_exception) \
+  GTEST_TEST_THROW_(statement, expected_exception, GTEST_FATAL_FAILURE_)
+#define ASSERT_NO_THROW(statement) \
+  GTEST_TEST_NO_THROW_(statement, GTEST_FATAL_FAILURE_)
+#define ASSERT_ANY_THROW(statement) \
+  GTEST_TEST_ANY_THROW_(statement, GTEST_FATAL_FAILURE_)
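+
+// For illustration, a sketch of these macros, assuming an empty
+// std::vector<int> named v (at() range-checks and throws; size() doesn't):
+//
+//   EXPECT_THROW(v.at(10), std::out_of_range);  // the expected type
+//   EXPECT_ANY_THROW(v.at(10));                 // any exception type is fine
+//   EXPECT_NO_THROW(v.size());                  // must not throw at all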
+
+// Boolean assertions.  Condition can be either a Boolean expression or an
+// AssertionResult.  For more information on how to use AssertionResult with
+// these macros see comments on that class.
+#define EXPECT_TRUE(condition)                            \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
+                      GTEST_NONFATAL_FAILURE_)
+#define EXPECT_FALSE(condition)                              \
+  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+                      GTEST_NONFATAL_FAILURE_)
+#define ASSERT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, GTEST_FATAL_FAILURE_)
+#define ASSERT_FALSE(condition)                              \
+  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+                      GTEST_FATAL_FAILURE_)
+
+// Macros for testing equalities and inequalities.
+//
+//   * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2
+//   * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2
+//   * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2
+//   * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2
+//   * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2
+//   * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2
+//
+// When they are not, Google Test prints both the tested expressions and
+// their actual values.  The values must be compatible built-in types,
+// or you will get a compiler error.  By "compatible" we mean that the
+// values can be compared by the respective operator.
+//
+// Note:
+//
+//   1. It is possible to make a user-defined type work with
+//   {ASSERT|EXPECT}_??(), but that requires overloading the
+//   comparison operators and is thus discouraged by the Google C++
+//   Usage Guide.  Therefore, you are advised to use the
+//   {ASSERT|EXPECT}_TRUE() macro to assert that two objects are
+//   equal.
+//
+//   2. The {ASSERT|EXPECT}_??() macros do pointer comparisons on
+//   pointers (in particular, C strings).  Therefore, if you use it
+//   with two C strings, you are testing how their locations in memory
+//   are related, not how their content is related.  To compare two C
+//   strings by content, use {ASSERT|EXPECT}_STR*().
+//
+//   3. {ASSERT|EXPECT}_EQ(v1, v2) is preferred to
+//   {ASSERT|EXPECT}_TRUE(v1 == v2), as the former tells you
+//   what the actual value is when it fails, and similarly for the
+//   other comparisons.
+//
+//   4. Do not depend on the order in which {ASSERT|EXPECT}_??()
+//   evaluate their arguments, which is undefined.
+//
+//   5. These macros evaluate their arguments exactly once.
+//
+// Examples:
+//
+//   EXPECT_NE(Foo(), 5);
+//   EXPECT_EQ(a_pointer, NULL);
+//   ASSERT_LT(i, array_size);
+//   ASSERT_GT(records.size(), 0) << "There is no record left.";
+
+#define EXPECT_EQ(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2)
+#define EXPECT_NE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
+#define EXPECT_LE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define EXPECT_LT(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define EXPECT_GE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define EXPECT_GT(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+#define GTEST_ASSERT_EQ(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2)
+#define GTEST_ASSERT_NE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
+#define GTEST_ASSERT_LE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define GTEST_ASSERT_LT(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define GTEST_ASSERT_GE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define GTEST_ASSERT_GT(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+// Define macro GTEST_DONT_DEFINE_ASSERT_XY to 1 to omit the definition of
+// ASSERT_XY(), which clashes with some users' own code.
+
+#if !GTEST_DONT_DEFINE_ASSERT_EQ
+#define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_NE
+#define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LE
+#define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LT
+#define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GE
+#define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GT
+#define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+#endif
+
+// C-string Comparisons.  All tests treat NULL and any non-NULL string
+// as different.  Two NULLs are equal.
+//
+//   * {ASSERT|EXPECT}_STREQ(s1, s2):     Tests that s1 == s2
+//   * {ASSERT|EXPECT}_STRNE(s1, s2):     Tests that s1 != s2
+//   * {ASSERT|EXPECT}_STRCASEEQ(s1, s2): Tests that s1 == s2, ignoring case
+//   * {ASSERT|EXPECT}_STRCASENE(s1, s2): Tests that s1 != s2, ignoring case
+//
+// For wide or narrow string objects, you can use the
+// {ASSERT|EXPECT}_??() macros.
+//
+// Don't depend on the order in which the arguments are evaluated,
+// which is undefined.
+//
+// These macros evaluate their arguments exactly once.
+
+#define EXPECT_STREQ(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2)
+#define EXPECT_STRNE(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define EXPECT_STRCASEEQ(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
+#define EXPECT_STRCASENE(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
+#define ASSERT_STREQ(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2)
+#define ASSERT_STRNE(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define ASSERT_STRCASEEQ(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
+#define ASSERT_STRCASENE(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
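+
+// For illustration, a sketch of comparing by content rather than by pointer
+// (ComputeGreeting() is a hypothetical function returning a C string):
+//
+//   EXPECT_STREQ("hello", ComputeGreeting());      // compares the contents
+//   EXPECT_STRCASEEQ("HELLO", ComputeGreeting());  // same, ignoring case
+//   // EXPECT_EQ would compare the pointer values instead.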
+
+// Macros for comparing floating-point numbers.
+//
+//   * {ASSERT|EXPECT}_FLOAT_EQ(val1, val2):
+//       Tests that two float values are almost equal.
+//   * {ASSERT|EXPECT}_DOUBLE_EQ(val1, val2):
+//       Tests that two double values are almost equal.
+//   * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error):
+//       Tests that v1 and v2 are within the given distance to each other.
+//
+// Google Test uses ULP-based comparison to automatically pick a default
+// error bound that is appropriate for the operands.  See the
+// FloatingPoint template class in gtest-internal.h if you are
+// interested in the implementation details.
+
+#define EXPECT_FLOAT_EQ(val1, val2)                                         \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+                      val1, val2)
+
+#define EXPECT_DOUBLE_EQ(val1, val2)                                         \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+                      val1, val2)
+
+#define ASSERT_FLOAT_EQ(val1, val2)                                         \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+                      val1, val2)
+
+#define ASSERT_DOUBLE_EQ(val1, val2)                                         \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+                      val1, val2)
+
+#define EXPECT_NEAR(val1, val2, abs_error)                                   \
+  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+                      abs_error)
+
+#define ASSERT_NEAR(val1, val2, abs_error)                                   \
+  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+                      abs_error)
+
+// These predicate format functions work on floating-point values, and
+// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
+//
+//   EXPECT_PRED_FORMAT2(testing::DoubleLE, Foo(), 5.0);
+
+// Asserts that val1 is less than, or almost equal to, val2.  Fails
+// otherwise.  In particular, it fails if either val1 or val2 is NaN.
+GTEST_API_ AssertionResult FloatLE(const char *expr1, const char *expr2,
+                                   float val1, float val2);
+GTEST_API_ AssertionResult DoubleLE(const char *expr1, const char *expr2,
+                                    double val1, double val2);
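+
+// For illustration, a sketch of the floating-point assertions above
+// (pi_estimate is a hypothetical double computed by the test):
+//
+//   EXPECT_FLOAT_EQ(2.0f, std::sqrt(4.0f));             // ULP-based bound
+//   EXPECT_NEAR(3.141592653589793, pi_estimate, 1e-6);  // explicit bound
+//   EXPECT_PRED_FORMAT2(testing::DoubleLE, pi_estimate, 4.0);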
+
+#if GTEST_OS_WINDOWS
+
+// Macros that test for HRESULT failure and success. These are only useful
+// on Windows, and rely on Windows SDK macros and APIs to compile.
+//
+//    * {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}(expr)
+//
+// When expr unexpectedly fails or succeeds, Google Test prints the
+// expected result and the actual result with both a human-readable
+// string representation of the error, if available, as well as the
+// hex result code.
+#define EXPECT_HRESULT_SUCCEEDED(expr) \
+  EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+#define ASSERT_HRESULT_SUCCEEDED(expr) \
+  ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+#define EXPECT_HRESULT_FAILED(expr) \
+  EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+#define ASSERT_HRESULT_FAILED(expr) \
+  ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+#endif  // GTEST_OS_WINDOWS
+
+// Macros that execute statement and check that it doesn't generate new fatal
+// failures in the current thread.
+//
+//   * {ASSERT|EXPECT}_NO_FATAL_FAILURE(statement);
+//
+// Examples:
+//
+//   EXPECT_NO_FATAL_FAILURE(Process());
+//   ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
+//
+#define ASSERT_NO_FATAL_FAILURE(statement) \
+  GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+#define EXPECT_NO_FATAL_FAILURE(statement) \
+  GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+
+// Causes a trace (including the given source file path and line number,
+// and the given message) to be included in every test failure message
+// generated by code in the scope of the lifetime of an instance of this
+// class. The effect is undone with the destruction of the instance.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// Example:
+//   testing::ScopedTrace trace("file.cc", 123, "message");
+//
+class GTEST_API_ ScopedTrace {
+ public:
+  // The c'tor pushes the given source file location and message onto
+  // a trace stack maintained by Google Test.
+
+  // Template version. Uses Message() to convert the values into strings.
+  // Slow, but flexible.
+  template <typename T>
+  ScopedTrace(const char *file, int line, const T &message) {
+    PushTrace(file, line, (Message() << message).GetString());
+  }
+
+  // Optimize for some known types.
+  ScopedTrace(const char *file, int line, const char *message) {
+    PushTrace(file, line, message ? message : "(null)");
+  }
+
+  ScopedTrace(const char *file, int line, const std::string &message) {
+    PushTrace(file, line, message);
+  }
+
+  // The d'tor pops the info pushed by the c'tor.
+  //
+  // Note that the d'tor is not virtual in order to be efficient.
+  // Don't inherit from ScopedTrace!
+  ~ScopedTrace();
+
+ private:
+  void PushTrace(const char *file, int line, std::string message);
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
+} GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
+                            // c'tor and d'tor. Therefore it doesn't
+                            // need to be used otherwise.
+
+// Causes a trace (including the source file path, the current line
+// number, and the given message) to be included in every test failure
+// message generated by code in the current scope. The effect is
+// undone when the control leaves the current scope.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// In the implementation, we include the current line number as part
+// of the dummy variable name, thus allowing multiple SCOPED_TRACE()s
+// to appear in the same block - as long as they are on different
+// lines.
+//
+// Each thread maintains its own stack of traces, so a SCOPED_TRACE()
+// (correctly) affects only the assertions in its own thread.
+#define SCOPED_TRACE(message)                                         \
+  ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)( \
+      __FILE__, __LINE__, (message))
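+
+// A minimal usage sketch (Sub1() is a hypothetical helper, for illustration
+// only); any assertion failure inside the scope is reported with the trace:
+//
+//   void CheckValue(int n) {
+//     SCOPED_TRACE("checking n = " + std::to_string(n));
+//     EXPECT_GT(Sub1(n), 0);  // on failure, the message includes the trace
+//   }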
+
+// Compile-time assertion for type equality.
+// StaticAssertTypeEq<type1, type2>() compiles if and only if type1 and type2
+// are the same type. The value it returns is not interesting.
+//
+// Instead of making StaticAssertTypeEq a class template, we make it a
+// function template that invokes a helper class template. This
+// prevents a user from misusing StaticAssertTypeEq<T1, T2> by
+// defining objects of that type.
+//
+// CAVEAT:
+//
+// When used inside a method of a class template,
+// StaticAssertTypeEq<T1, T2>() is effective ONLY IF the method is
+// instantiated. For example, given:
+//
+//   template <typename T> class Foo {
+//    public:
+//     void Bar() { testing::StaticAssertTypeEq<int, T>(); }
+//   };
+//
+// the code:
+//
+//   void Test1() { Foo<bool> foo; }
+//
+// will NOT generate a compiler error, as Foo<bool>::Bar() is never
+// actually instantiated. Instead, you need:
+//
+//   void Test2() { Foo<bool> foo; foo.Bar(); }
+//
+// to cause a compiler error.
+template <typename T1, typename T2>
+constexpr bool StaticAssertTypeEq() noexcept {
+  static_assert(std::is_same<T1, T2>::value,
+                "T1 and T2 are not the same type");
+  return true;
+}
+
+// Defines a test.
+//
+// The first parameter is the name of the test suite, and the second
+// parameter is the name of the test within the test suite.
+//
+// The convention is to end the test suite name with "Test". For
+// example, a test suite for the Foo class can be named FooTest.
+//
+// Test code should appear between braces after an invocation of
+// this macro. Example:
+//
+//   TEST(FooTest, InitializesCorrectly) {
+//     Foo foo;
+//     EXPECT_TRUE(foo.StatusIsOK());
+//   }
+
+// Note that we call GetTestTypeId() instead of GetTypeId<
+// ::testing::Test>() here to get the type ID of testing::Test. This
+// is to work around a suspected linker bug when using Google Test as
+// a framework on Mac OS X. The bug causes GetTypeId<
+// ::testing::Test>() to return different values depending on whether
+// the call is from the Google Test framework itself or from user test
+// code. GetTestTypeId() is guaranteed to always return the same
+// value, as it always calls GetTypeId<>() from the Google Test
+// framework.
+#define GTEST_TEST(test_suite_name, test_name)             \
+  GTEST_TEST_(test_suite_name, test_name, ::testing::Test, \
+              ::testing::internal::GetTestTypeId())
+
+// Define this macro to 1 to omit the definition of TEST(), which
+// is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_TEST
+#define TEST(test_suite_name, test_name) GTEST_TEST(test_suite_name, test_name)
+#endif
+
+// Defines a test that uses a test fixture.
+//
+// The first parameter is the name of the test fixture class, which
+// also doubles as the test suite name. The second parameter is the
+// name of the test within the test suite.
+//
+// A test fixture class must be declared earlier. The user should put
+// the test code between braces after using this macro. Example:
+//
+//   class FooTest : public testing::Test {
+//    protected:
+//     void SetUp() override { b_.AddElement(3); }
+//
+//     Foo a_;
+//     Foo b_;
+//   };
+//
+//   TEST_F(FooTest, InitializesCorrectly) {
+//     EXPECT_TRUE(a_.StatusIsOK());
+//   }
+//
+//   TEST_F(FooTest, ReturnsElementCountCorrectly) {
+//     EXPECT_EQ(a_.size(), 0);
+//     EXPECT_EQ(b_.size(), 1);
+//   }
+//
+// GOOGLETEST_CM0011 DO NOT DELETE
+#if !GTEST_DONT_DEFINE_TEST
+#define TEST_F(test_fixture, test_name)              \
+  GTEST_TEST_(test_fixture, test_name, test_fixture, \
+              ::testing::internal::GetTypeId<test_fixture>())
+#endif  // !GTEST_DONT_DEFINE_TEST
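+
+// For example, a project whose own headers already define a TEST macro can
+// be built with -DGTEST_DONT_DEFINE_TEST=1 and use the GTEST_TEST spelling
+// directly (a sketch; FooTest is a hypothetical suite name):
+//
+//   GTEST_TEST(FooTest, StillCompiles) { EXPECT_TRUE(true); }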
+
+// Returns a path to a temporary directory.
+// Tries to determine an appropriate directory for the platform.
+GTEST_API_ std::string TempDir();
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+// Dynamically registers a test with the framework.
+//
+// This is an advanced API only to be used when the `TEST` macros are
+// insufficient. The macros should be preferred when possible, as they avoid
+// most of the complexity of calling this function.
+//
+// The `factory` argument is a factory callable (move-constructible) object or
+// function pointer that creates a new instance of the Test object. It hands
+// ownership of the created object to the caller. The signature of the
+// callable is `Fixture*()`, where `Fixture` is the test fixture class for the
+// test. All tests registered with the same `test_suite_name` must return the
+// same fixture type. This is checked at runtime.
+//
+// The framework will infer the fixture class from the factory and will call
+// the `SetUpTestSuite` and `TearDownTestSuite` for it.
+//
+// Must be called before `RUN_ALL_TESTS()` is invoked, otherwise behavior is
+// undefined.
+//
+// Use case example:
+//
+// class MyFixture : public ::testing::Test {
+//  public:
+//   // All of these optional, just like in regular macro usage.
+//   static void SetUpTestSuite() { ... }
+//   static void TearDownTestSuite() { ... }
+//   void SetUp() override { ... }
+//   void TearDown() override { ... }
+// };
+//
+// class MyTest : public MyFixture {
+//  public:
+//   explicit MyTest(int data) : data_(data) {}
+//   void TestBody() override { ... }
+//
+//  private:
+//   int data_;
+// };
+//
+// void RegisterMyTests(const std::vector<int>& values) {
+//   for (int v : values) {
+//     ::testing::RegisterTest(
+//         "MyFixture", ("Test" + std::to_string(v)).c_str(), nullptr,
+//         std::to_string(v).c_str(),
+//         __FILE__, __LINE__,
+//         // Important to use the fixture type as the return type here.
+//         [=]() -> MyFixture* { return new MyTest(v); });
+//   }
+// }
+// ...
+// int main(int argc, char** argv) {
+//   std::vector<int> values_to_test = LoadValuesFromConfig();
+//   RegisterMyTests(values_to_test);
+//   ...
+//   return RUN_ALL_TESTS();
+// }
+//
+template <int &... ExplicitParameterBarrier, typename Factory>
+TestInfo *RegisterTest(const char *test_suite_name, const char *test_name,
+                       const char *type_param, const char *value_param,
+                       const char *file, int line, Factory factory) {
+  using TestT = typename std::remove_pointer<decltype(factory())>::type;
+
+  class FactoryImpl : public internal::TestFactoryBase {
+   public:
+    explicit FactoryImpl(Factory f) : factory_(std::move(f)) {}
+    Test *CreateTest() override { return factory_(); }
+
+   private:
+    Factory factory_;
+  };
+
+  return internal::MakeAndRegisterTestInfo(
+      test_suite_name, test_name, type_param, value_param,
+      internal::CodeLocation(file, line), internal::GetTypeId<TestT>(),
+      internal::SuiteApiResolver<TestT>::GetSetUpCaseOrSuite(file, line),
+      internal::SuiteApiResolver<TestT>::GetTearDownCaseOrSuite(file, line),
+      new FactoryImpl{ std::move(factory) });
+}
+
+}  // namespace testing
+
+// Use this function in main() to run all tests. It returns 0 if all
+// tests are successful, or 1 otherwise.
+//
+// RUN_ALL_TESTS() should be invoked after the command line has been
+// parsed by InitGoogleTest().
+//
+// This function was formerly a macro; thus, it is in the global
+// namespace and has an all-caps name.
+int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_; + +inline int RUN_ALL_TESTS() { return ::testing::UnitTest::GetInstance()->Run(); } + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GTEST_INCLUDE_GTEST_GTEST_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h new file mode 100644 index 000000000..1fc21910b --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h @@ -0,0 +1,277 @@ +// Copyright 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is AUTOMATICALLY GENERATED on 01/02/2019 by command +// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND! +// +// Implements a family of generic predicate assertion macros. +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ + +#include "gtest/gtest.h" + +namespace testing { + +// This header implements a family of generic predicate assertion +// macros: +// +// ASSERT_PRED_FORMAT1(pred_format, v1) +// ASSERT_PRED_FORMAT2(pred_format, v1, v2) +// ... +// +// where pred_format is a function or functor that takes n (in the +// case of ASSERT_PRED_FORMATn) values and their source expression +// text, and returns a testing::AssertionResult. See the definition +// of ASSERT_EQ in gtest.h for an example. +// +// If you don't care about formatting, you can use the more +// restrictive version: +// +// ASSERT_PRED1(pred, v1) +// ASSERT_PRED2(pred, v1, v2) +// ... +// +// where pred is an n-ary function or functor that returns bool, +// and the values v1, v2, ..., must support the << operator for +// streaming to std::ostream. +// +// We also define the EXPECT_* variations. +// +// For now we only support predicates whose arity is at most 5. +// Please email googletestframework@googlegroups.com if you need +// support for higher arities. 
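+//
+// For example, with a plain bool-returning predicate (IsPositive() is
+// hypothetical, for illustration only):
+//
+//   bool IsPositive(int n) { return n > 0; }
+//   EXPECT_PRED1(IsPositive, 5);   // passes
+//   EXPECT_PRED1(IsPositive, -5);  // fails, printing IsPositive(-5)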
+
+// GTEST_ASSERT_ is the basic statement to which all of the assertions
+// in this file reduce. Don't use this in your code.
+
+#define GTEST_ASSERT_(expression, on_failure)                   \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                 \
+  if (const ::testing::AssertionResult gtest_ar = (expression)) \
+    ;                                                           \
+  else                                                          \
+    on_failure(gtest_ar.failure_message())
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use
+// this in your code.
+template <typename Pred, typename T1>
+AssertionResult AssertPred1Helper(const char *pred_text, const char *e1,
+                                  Pred pred, const T1 &v1) {
+  if (pred(v1)) return AssertionSuccess();
+
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, v1), on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use
+// this in your code.
+#define GTEST_PRED1_(pred, v1, on_failure) \
+  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, #v1, pred, v1), on_failure)
+
+// Unary predicate assertion macros.
+#define EXPECT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use
+// this in your code.
+template <typename Pred, typename T1, typename T2>
+AssertionResult AssertPred2Helper(const char *pred_text, const char *e1,
+                                  const char *e2, Pred pred, const T1 &v1,
+                                  const T2 &v2) {
+  if (pred(v1, v2)) return AssertionSuccess();
+
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ", " << e2
+         << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+         << e2 << " evaluates to " << ::testing::PrintToString(v2);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use
+// this in your code.
+#define GTEST_PRED2_(pred, v1, v2, on_failure)                               \
+  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, #v1, #v2, pred, v1, v2), \
+                on_failure)
+
+// Binary predicate assertion macros.
+#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
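+
+// For example, a user-defined predicate-formatter for the binary case could
+// look like this (a sketch; IsDivisibleBy is hypothetical):
+//
+//   testing::AssertionResult IsDivisibleBy(const char* m_expr,
+//                                          const char* n_expr, int m, int n) {
+//     if (n != 0 && m % n == 0) return testing::AssertionSuccess();
+//     return testing::AssertionFailure()
+//            << m_expr << " is not divisible by " << n_expr;
+//   }
+//
+//   EXPECT_PRED_FORMAT2(IsDivisibleBy, 10, 5);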
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use
+// this in your code.
+template <typename Pred, typename T1, typename T2, typename T3>
+AssertionResult AssertPred3Helper(const char *pred_text, const char *e1,
+                                  const char *e2, const char *e3, Pred pred,
+                                  const T1 &v1, const T2 &v2, const T3 &v3) {
+  if (pred(v1, v2, v3)) return AssertionSuccess();
+
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ", " << e2 << ", " << e3
+         << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+         << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
+         << e3 << " evaluates to " << ::testing::PrintToString(v3);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use
+// this in your code.
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)                          \
+  GTEST_ASSERT_(                                                            \
+      ::testing::AssertPred3Helper(#pred, #v1, #v2, #v3, pred, v1, v2, v3), \
+      on_failure)
+
+// Ternary predicate assertion macros.
+#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
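+
+// As a concrete example from this header's own users, EXPECT_NEAR in gtest.h
+// is built directly on the ternary predicate-formatter form:
+//
+//   EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1,
+//                       val2, abs_error)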
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use
+// this in your code.
+template <typename Pred, typename T1, typename T2, typename T3, typename T4>
+AssertionResult AssertPred4Helper(const char *pred_text, const char *e1,
+                                  const char *e2, const char *e3,
+                                  const char *e4, Pred pred, const T1 &v1,
+                                  const T2 &v2, const T3 &v3, const T4 &v4) {
+  if (pred(v1, v2, v3, v4)) return AssertionSuccess();
+
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4
+         << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+         << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
+         << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n"
+         << e4 << " evaluates to " << ::testing::PrintToString(v4);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use
+// this in your code.
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)                        \
+  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, #v1, #v2, #v3, #v4, pred, \
+                                             v1, v2, v3, v4),                 \
+                on_failure)
+
+// 4-ary predicate assertion macros.
+#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use
+// this in your code.
+template <typename Pred, typename T1, typename T2, typename T3, typename T4,
+          typename T5>
+AssertionResult AssertPred5Helper(const char *pred_text, const char *e1,
+                                  const char *e2, const char *e3,
+                                  const char *e4, const char *e5, Pred pred,
+                                  const T1 &v1, const T2 &v2, const T3 &v3,
+                                  const T4 &v4, const T5 &v5) {
+  if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
+
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4
+         << ", " << e5 << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+         << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
+         << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n"
+         << e4 << " evaluates to " << ::testing::PrintToString(v4) << "\n"
+         << e5 << " evaluates to " << ::testing::PrintToString(v5);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)  \
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use
+// this in your code.
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)                   \
+  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, #v1, #v2, #v3, #v4, #v5, \
+                                             pred, v1, v2, v3, v4, v5),      \
+                on_failure)
+
+// 5-ary predicate assertion macros.
+#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_prod.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
new file mode 100644
index 000000000..3dc5b2386
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
@@ -0,0 +1,61 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// Google C++ Testing and Mocking Framework definitions useful in production +// code. GOOGLETEST_CM0003 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_ + +// When you need to test the private or protected members of a class, +// use the FRIEND_TEST macro to declare your tests as friends of the +// class. For example: +// +// class MyClass { +// private: +// void PrivateMethod(); +// FRIEND_TEST(MyClassTest, PrivateMethodWorks); +// }; +// +// class MyClassTest : public testing::Test { +// // ... +// }; +// +// TEST_F(MyClassTest, PrivateMethodWorks) { +// // Can call MyClass::PrivateMethod() here. +// } +// +// Note: The test class must be in the same namespace as the class being tested. +// For example, putting MyClassTest in an anonymous namespace will not work. + +#define FRIEND_TEST(test_case_name, test_name) \ + friend class test_case_name##_##test_name##_Test + +#endif // GTEST_INCLUDE_GTEST_GTEST_PROD_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md new file mode 100644 index 000000000..ff391fb4e --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md @@ -0,0 +1,56 @@ +# Customization Points + +The custom directory is an injection point for custom user configurations. + +## Header `gtest.h` + +### The following macros can be defined: + +* `GTEST_OS_STACK_TRACE_GETTER_` - The name of an implementation of + `OsStackTraceGetterInterface`. +* `GTEST_CUSTOM_TEMPDIR_FUNCTION_` - An override for `testing::TempDir()`. See + `testing::TempDir` for semantics and signature. + +## Header `gtest-port.h` + +The following macros can be defined: + +### Flag related macros: + +* `GTEST_FLAG(flag_name)` +* `GTEST_USE_OWN_FLAGFILE_FLAG_` - Define to 0 when the system provides its + own flagfile flag parsing. +* `GTEST_DECLARE_bool_(name)` +* `GTEST_DECLARE_int32_(name)` +* `GTEST_DECLARE_string_(name)` +* `GTEST_DEFINE_bool_(name, default_val, doc)` +* `GTEST_DEFINE_int32_(name, default_val, doc)` +* `GTEST_DEFINE_string_(name, default_val, doc)` + +### Logging: + +* `GTEST_LOG_(severity)` +* `GTEST_CHECK_(condition)` +* Functions `LogToStderr()` and `FlushInfoLog()` have to be provided too. + +### Threading: + +* `GTEST_HAS_NOTIFICATION_` - Enabled if Notification is already provided. +* `GTEST_HAS_MUTEX_AND_THREAD_LOCAL_` - Enabled if `Mutex` and `ThreadLocal` + are already provided. Must also provide `GTEST_DECLARE_STATIC_MUTEX_(mutex)` + and `GTEST_DEFINE_STATIC_MUTEX_(mutex)` +* `GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)` +* `GTEST_LOCK_EXCLUDED_(locks)` + +### Underlying library support features + +* `GTEST_HAS_CXXABI_H_` + +### Exporting API symbols: + +* `GTEST_API_` - Specifier for exported symbols. 
+ +## Header `gtest-printers.h` + +* See documentation at `gtest/gtest-printers.h` for details on how to define a + custom printer. diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h new file mode 100644 index 000000000..cd85d956d --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h @@ -0,0 +1,37 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Injection point for custom user configurations. See README for details +// +// ** Custom implementation starts here ** + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h new file mode 100644 index 000000000..eb4467abc --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h @@ -0,0 +1,42 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// This file provides an injection point for custom printers in a local +// installation of gTest. +// It will be included from gtest-printers.h and the overrides in this file +// will be visible to everyone. +// +// Injection point for custom user configurations. See README for details +// +// ** Custom implementation starts here ** + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h new file mode 100644 index 000000000..4c8e07be2 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h @@ -0,0 +1,37 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Injection point for custom user configurations. 
See README for details
+//
+// ** Custom implementation starts here **
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
new file mode 100644
index 000000000..3e9497d45
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
@@ -0,0 +1,301 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file defines internal utilities needed for implementing
+// death tests. They are subject to change without notice.
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+#include "gtest/gtest-matchers.h"
+#include "gtest/internal/gtest-internal.h"
+
+#include <stdio.h>
+#include <memory>
+
+namespace testing {
+namespace internal {
+
+GTEST_DECLARE_string_(internal_run_death_test);
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kDeathTestStyleFlag[] = "death_test_style";
+const char kDeathTestUseFork[] = "death_test_use_fork";
+const char kInternalRunDeathTestFlag[] = "internal_run_death_test";
+
+#if GTEST_HAS_DEATH_TEST
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// DeathTest is a class that hides much of the complexity of the
+// GTEST_DEATH_TEST_ macro. It is abstract; its static Create method
+// returns a concrete class that depends on the prevailing death test
+// style, as defined by the --gtest_death_test_style and/or
+// --gtest_internal_run_death_test flags.
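+//
+// For reference, the public macros built on this machinery are used like
+// this (DoFatalThing() and Shutdown() are hypothetical helpers, for
+// illustration only):
+//
+//   EXPECT_DEATH(DoFatalThing(), "regex matched against stderr");
+//   EXPECT_EXIT(Shutdown(), testing::ExitedWithCode(0), "bye");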
+
+// In describing the results of death tests, these terms are used with
+// the corresponding definitions:
+//
+// exit status:  The integer exit information in the format specified
+//               by wait(2)
+// exit code:    The integer code passed to exit(3), _exit(2), or
+//               returned from main()
+class GTEST_API_ DeathTest {
+ public:
+  // Create returns false if there was an error determining the
+  // appropriate action to take for the current death test; for example,
+  // if the gtest_death_test_style flag is set to an invalid value.
+  // The LastMessage method will return a more detailed message in that
+  // case. Otherwise, the DeathTest pointer pointed to by the "test"
+  // argument is set. If the death test should be skipped, the pointer
+  // is set to NULL; otherwise, it is set to the address of a new concrete
+  // DeathTest object that controls the execution of the current test.
+  static bool Create(const char *statement,
+                     Matcher<const std::string &> matcher, const char *file,
+                     int line, DeathTest **test);
+  DeathTest();
+  virtual ~DeathTest() {}
+
+  // A helper class that aborts a death test when it's deleted.
+  class ReturnSentinel {
+   public:
+    explicit ReturnSentinel(DeathTest *test) : test_(test) {}
+    ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
+
+   private:
+    DeathTest *const test_;
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
+  } GTEST_ATTRIBUTE_UNUSED_;
+
+  // An enumeration of possible roles that may be taken when a death
+  // test is encountered. EXECUTE means that the death test logic should
+  // be executed immediately. OVERSEE means that the program should prepare
+  // the appropriate environment for a child process to execute the death
+  // test, then wait for it to complete.
+  enum TestRole { OVERSEE_TEST, EXECUTE_TEST };
+
+  // An enumeration of the three reasons that a test might be aborted.
+  enum AbortReason {
+    TEST_ENCOUNTERED_RETURN_STATEMENT,
+    TEST_THREW_EXCEPTION,
+    TEST_DID_NOT_DIE
+  };
+
+  // Assumes one of the above roles.
+  virtual TestRole AssumeRole() = 0;
+
+  // Waits for the death test to finish and returns its status.
+  virtual int Wait() = 0;
+
+  // Returns true if the death test passed; that is, the test process
+  // exited during the test, its exit status matches a user-supplied
+  // predicate, and its stderr output matches a user-supplied regular
+  // expression.
+  // The user-supplied predicate may be a macro expression rather
+  // than a function pointer or functor, or else Wait and Passed could
+  // be combined.
+  virtual bool Passed(bool exit_status_ok) = 0;
+
+  // Signals that the death test did not die as expected.
+  virtual void Abort(AbortReason reason) = 0;
+
+  // Returns a human-readable message regarding the outcome of
+  // the last death test.
+  static const char *LastMessage();
+
+  static void set_last_death_test_message(const std::string &message);
+
+ private:
+  // A string containing a description of the outcome of the last death test.
+  static std::string last_death_test_message_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest);
+};
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  // 4251
+
+// Factory interface for death tests. May be mocked out for testing.
+class DeathTestFactory {
+ public:
+  virtual ~DeathTestFactory() {}
+  virtual bool Create(const char *statement,
+                      Matcher<const std::string &> matcher, const char *file,
+                      int line, DeathTest **test) = 0;
+};
+
+// A concrete DeathTestFactory implementation for normal use.
+class DefaultDeathTestFactory : public DeathTestFactory {
+ public:
+  bool Create(const char *statement, Matcher<const std::string &> matcher,
+              const char *file, int line, DeathTest **test) override;
+};
+
+// Returns true if exit_status describes a process that was terminated
+// by a signal, or exited normally with a nonzero exit code.
+GTEST_API_ bool ExitedUnsuccessfully(int exit_status);
+
+// A string passed to EXPECT_DEATH (etc.) is caught by one of these overloads
+// and interpreted as a regex (rather than an Eq matcher) for legacy
+// compatibility.
+inline Matcher<const ::std::string &> MakeDeathTestMatcher(
+    ::testing::internal::RE regex) {
+  return ContainsRegex(regex.pattern());
+}
+inline Matcher<const ::std::string &> MakeDeathTestMatcher(const char *regex) {
+  return ContainsRegex(regex);
+}
+inline Matcher<const ::std::string &> MakeDeathTestMatcher(
+    const ::std::string &regex) {
+  return ContainsRegex(regex);
+}
+
+// If a Matcher<const ::std::string &> is passed to EXPECT_DEATH (etc.), it's
+// used directly.
+inline Matcher<const ::std::string &> MakeDeathTestMatcher(
+    Matcher<const ::std::string &> matcher) {
+  return matcher;
+}
+
+// Traps C++ exceptions escaping statement and reports them as test
+// failures. Note that trapping SEH exceptions is not implemented here.
+#if GTEST_HAS_EXCEPTIONS
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test)           \
+  try {                                                                      \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);               \
+  } catch (const ::std::exception &gtest_exception) {                        \
+    fprintf(                                                                 \
+        stderr,                                                              \
+        "\n%s: Caught std::exception-derived exception escaping the "        \
+        "death test statement. Exception message: %s\n",                     \
+        ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
+        gtest_exception.what());                                             \
+    fflush(stderr);                                                          \
+    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+  } catch (...) {                                                            \
+    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+  }
+
+#else
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+  GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+
+#endif
+
+// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
+// ASSERT_EXIT*, and EXPECT_EXIT*.
+#define GTEST_DEATH_TEST_(statement, predicate, regex_or_matcher, fail)        \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                                \
+  if (::testing::internal::AlwaysTrue()) {                                     \
+    ::testing::internal::DeathTest *gtest_dt;                                  \
+    if (!::testing::internal::DeathTest::Create(                               \
+            #statement,                                                        \
+            ::testing::internal::MakeDeathTestMatcher(regex_or_matcher),       \
+            __FILE__, __LINE__, &gtest_dt)) {                                  \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__);                        \
+    }                                                                          \
+    if (gtest_dt != nullptr) {                                                 \
+      std::unique_ptr< ::testing::internal::DeathTest> gtest_dt_ptr(gtest_dt); \
+      switch (gtest_dt->AssumeRole()) {                                        \
+        case ::testing::internal::DeathTest::OVERSEE_TEST:                     \
+          if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) {                \
+            goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__);                  \
+          }                                                                    \
+          break;                                                               \
+        case ::testing::internal::DeathTest::EXECUTE_TEST: {                   \
+          ::testing::internal::DeathTest::ReturnSentinel gtest_sentinel(       \
+              gtest_dt);                                                       \
+          GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt);            \
+          gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE);   \
+          break;                                                               \
+        }                                                                      \
+        default: break;                                                        \
+      }                                                                        \
+    }                                                                          \
+  } else                                                                       \
+    GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__)                                \
+        : fail(::testing::internal::DeathTest::LastMessage())
+// The symbol "fail" here expands to something into which a message
+// can be streamed.
+
+// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in
+// NDEBUG mode.
In this case we need the statements to be executed and the macro +// must accept a streamed message even though the message is never printed. +// The regex object is not evaluated, but it is used to prevent "unused" +// warnings and to avoid an expression that doesn't compile in debug mode. +#define GTEST_EXECUTE_STATEMENT_(statement, regex_or_matcher) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } else if (!::testing::internal::AlwaysTrue()) { \ + ::testing::internal::MakeDeathTestMatcher(regex_or_matcher); \ + } else \ + ::testing::Message() + +// A class representing the parsed contents of the +// --gtest_internal_run_death_test flag, as it existed when +// RUN_ALL_TESTS was called. +class InternalRunDeathTestFlag { + public: + InternalRunDeathTestFlag(const std::string &a_file, int a_line, int an_index, + int a_write_fd) + : file_(a_file), line_(a_line), index_(an_index), write_fd_(a_write_fd) {} + + ~InternalRunDeathTestFlag() { + if (write_fd_ >= 0) posix::Close(write_fd_); + } + + const std::string &file() const { return file_; } + int line() const { return line_; } + int index() const { return index_; } + int write_fd() const { return write_fd_; } + + private: + std::string file_; + int line_; + int index_; + int write_fd_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag); +}; + +// Returns a newly created InternalRunDeathTestFlag object with fields +// initialized from the GTEST_FLAG(internal_run_death_test) flag if +// the flag is specified; otherwise returns NULL. +InternalRunDeathTestFlag *ParseInternalRunDeathTestFlag(); + +#endif // GTEST_HAS_DEATH_TEST + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h new file mode 100644 index 000000000..b228d4734 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h @@ -0,0 +1,208 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Google Test filepath utilities +// +// This header file declares classes and functions used internally by +// Google Test. They are subject to change without notice. +// +// This file is #included in gtest/internal/gtest-internal.h. +// Do not include this header file separately! + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ + +#include "gtest/internal/gtest-string.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { +namespace internal { + +// FilePath - a class for file and directory pathname manipulation which +// handles platform-specific conventions (like the pathname separator). +// Used for helper functions for naming files in a directory for xml output. +// Except for Set methods, all methods are const or static, which provides an +// "immutable value object" -- useful for peace of mind. +// A FilePath with a value ending in a path separator ("like/this/") represents +// a directory, otherwise it is assumed to represent a file. In either case, +// it may or may not represent an actual file or directory in the file system. +// Names are NOT checked for syntax correctness -- no checking for illegal +// characters, malformed paths, etc. + +class GTEST_API_ FilePath { + public: + FilePath() : pathname_("") {} + FilePath(const FilePath &rhs) : pathname_(rhs.pathname_) {} + + explicit FilePath(const std::string &pathname) : pathname_(pathname) { + Normalize(); + } + + FilePath &operator=(const FilePath &rhs) { + Set(rhs); + return *this; + } + + void Set(const FilePath &rhs) { pathname_ = rhs.pathname_; } + + const std::string &string() const { return pathname_; } + const char *c_str() const { return pathname_.c_str(); } + + // Returns the current working directory, or "" if unsuccessful. + static FilePath GetCurrentDir(); + + // Given directory = "dir", base_name = "test", number = 0, + // extension = "xml", returns "dir/test.xml". If number is greater + // than zero (e.g., 12), returns "dir/test_12.xml". + // On Windows platform, uses \ as the separator rather than /. + static FilePath MakeFileName(const FilePath &directory, + const FilePath &base_name, int number, + const char *extension); + + // Given directory = "dir", relative_path = "test.xml", + // returns "dir/test.xml". + // On Windows, uses \ as the separator rather than /. + static FilePath ConcatPaths(const FilePath &directory, + const FilePath &relative_path); + + // Returns a pathname for a file that does not currently exist. The pathname + // will be directory/base_name.extension or + // directory/base_name_.extension if directory/base_name.extension + // already exists. The number will be incremented until a pathname is found + // that does not already exist. + // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. 
+ // There could be a race condition if two or more processes are calling this + // function at the same time -- they could both pick the same filename. + static FilePath GenerateUniqueFileName(const FilePath &directory, + const FilePath &base_name, + const char *extension); + + // Returns true if and only if the path is "". + bool IsEmpty() const { return pathname_.empty(); } + + // If input name has a trailing separator character, removes it and returns + // the name, otherwise return the name string unmodified. + // On Windows platform, uses \ as the separator, other platforms use /. + FilePath RemoveTrailingPathSeparator() const; + + // Returns a copy of the FilePath with the directory part removed. + // Example: FilePath("path/to/file").RemoveDirectoryName() returns + // FilePath("file"). If there is no directory part ("just_a_file"), it returns + // the FilePath unmodified. If there is no file part ("just_a_dir/") it + // returns an empty FilePath (""). + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveDirectoryName() const; + + // RemoveFileName returns the directory path with the filename removed. + // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". + // If the FilePath is "a_file" or "/a_file", RemoveFileName returns + // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does + // not have a file, like "just/a/dir/", it returns the FilePath unmodified. + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveFileName() const; + + // Returns a copy of the FilePath with the case-insensitive extension removed. + // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns + // FilePath("dir/file"). If a case-insensitive extension is not + // found, returns a copy of the original FilePath. + FilePath RemoveExtension(const char *extension) const; + + // Creates directories so that path exists. Returns true if successful or if + // the directories already exist; returns false if unable to create + // directories for any reason. Will also return false if the FilePath does + // not represent a directory (that is, it doesn't end with a path separator). + bool CreateDirectoriesRecursively() const; + + // Create the directory so that path exists. Returns true if successful or + // if the directory already exists; returns false if unable to create the + // directory for any reason, including if the parent directory does not + // exist. Not named "CreateDirectory" because that's a macro on Windows. + bool CreateFolder() const; + + // Returns true if FilePath describes something in the file-system, + // either a file, directory, or whatever, and that something exists. + bool FileOrDirectoryExists() const; + + // Returns true if pathname describes a directory in the file-system + // that exists. + bool DirectoryExists() const; + + // Returns true if FilePath ends with a path separator, which indicates that + // it is intended to represent a directory. Returns false otherwise. + // This does NOT check that a directory (or file) actually exists. + bool IsDirectory() const; + + // Returns true if pathname describes a root directory. (Windows has one + // root directory per disk drive.) + bool IsRootDirectory() const; + + // Returns true if pathname describes an absolute path. + bool IsAbsolutePath() const; + + private: + // Replaces multiple consecutive separators with a single separator. + // For example, "bar///foo" becomes "bar/foo". 
Does not eliminate other
+  // redundancies that might be in a pathname involving "." or "..".
+  //
+  // A pathname with multiple consecutive separators may occur either through
+  // user error or as a result of some scripts or APIs that generate a
+  // pathname with a trailing separator. On other platforms the same API or
+  // script may NOT generate a pathname with a trailing "/". Then elsewhere
+  // that pathname may have another "/" and pathname components added to it,
+  // without checking for the separator already being there.
+  // The script language and operating system may allow paths like "foo//bar"
+  // but some of the functions in FilePath will not handle that correctly. In
+  // particular, RemoveTrailingPathSeparator() only removes one separator, and
+  // it is called in CreateDirectoriesRecursively() assuming that it will
+  // change a pathname from directory syntax (trailing separator) to filename
+  // syntax.
+  //
+  // On Windows this method also replaces the alternate path separator '/'
+  // with the primary path separator '\\', so that for example "bar\\/\\foo"
+  // becomes "bar\\foo".
+
+  void Normalize();
+
+  // Returns a pointer to the last occurrence of a valid path separator in
+  // the FilePath. On Windows, for example, both '/' and '\' are valid path
+  // separators. Returns NULL if no path separator was found.
+  const char *FindLastPathSeparator() const;
+
+  std::string pathname_;
+};  // class FilePath
+
+}  // namespace internal
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  // 4251
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
new file mode 100644
index 000000000..9640aba83
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
@@ -0,0 +1,1441 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file declares functions and macros used internally by
+// Google Test. They are subject to change without notice.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_OS_LINUX
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+#include <stdexcept>
+#endif
+
+#include <ctype.h>
+#include <float.h>
+#include <string.h>
+#include <cstdint>
+#include <iomanip>
+#include <limits>
+#include <map>
+#include <set>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-filepath.h"
+#include "gtest/internal/gtest-string.h"
+#include "gtest/internal/gtest-type-util.h"
+
+// Due to C++ preprocessor weirdness, we need double indirection to
+// concatenate two tokens when one of them is __LINE__. Writing
+//
+// foo ## __LINE__
+//
+// will result in the token foo__LINE__, instead of foo followed by
+// the current line number. For more details, see
+// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
+#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo##bar
+
+// Stringifies its argument.
+// Works around a bug in Visual Studio, which doesn't accept code like this:
+//
+// #define GTEST_STRINGIFY_(name) #name
+// #define MACRO(a, b, c) ... GTEST_STRINGIFY_(a) ...
+// MACRO(, x, y)
+//
+// complaining about the argument to GTEST_STRINGIFY_ being empty.
+// This is allowed by the spec.
+#define GTEST_STRINGIFY_HELPER_(name, ...) #name
+#define GTEST_STRINGIFY_(...) GTEST_STRINGIFY_HELPER_(__VA_ARGS__, )
+
+namespace proto2 {
+class Message;
+}
+
+namespace testing {
+
+// Forward declarations.
+
+class AssertionResult;  // Result of an assertion.
+class Message;          // Represents a failure message.
+class Test;             // Represents a test.
+class TestInfo;         // Information about a test.
+class TestPartResult;   // Result of a test part.
+class UnitTest;         // A collection of test suites.
+
+template <typename T>
+::std::string PrintToString(const T &value);
+
+namespace internal {
+
+struct TraceInfo;     // Information about a trace point.
+class TestInfoImpl;   // Opaque implementation of TestInfo
+class UnitTestImpl;   // Opaque implementation of UnitTest
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+GTEST_API_ extern const char kStackTraceMarker[];
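The double-indirection trick behind GTEST_CONCAT_TOKEN_ above can be demonstrated standalone; the MY_CONCAT names below are illustrative, not gtest's:

```cpp
#include <iostream>

// Direct pasting (a##b) would glue the literal token "__LINE__" onto the
// prefix; routing through a second macro lets __LINE__ expand to a number
// before the paste happens.
#define MY_CONCAT_IMPL(a, b) a##b
#define MY_CONCAT(a, b) MY_CONCAT_IMPL(a, b)

// Expands to something like 'int unique_10 = 0;' depending on the line.
int MY_CONCAT(unique_, __LINE__) = 0;

int main() { std::cout << "token pasted after __LINE__ expansion\n"; }
```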
+
+// An IgnoredValue object can be implicitly constructed from ANY value.
+class IgnoredValue {
+  struct Sink {};
+
+ public:
+  // This constructor template allows any value to be implicitly
+  // converted to IgnoredValue. The object has no data member and
+  // doesn't try to remember anything about the argument. We
+  // deliberately omit the 'explicit' keyword in order to allow the
+  // conversion to be implicit.
+  // Disable the conversion if T already has a magical conversion operator.
+  // Otherwise we get ambiguity.
+  template <typename T,
+            typename std::enable_if<!std::is_convertible<T, Sink>::value,
+                                    int>::type = 0>
+  IgnoredValue(const T & /* ignored */) {}  // NOLINT(runtime/explicit)
+};
+
+// Appends the user-supplied message to the Google-Test-generated message.
+GTEST_API_ std::string AppendUserMessage(const std::string &gtest_msg,
+                                         const Message &user_msg);
+
+#if GTEST_HAS_EXCEPTIONS
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(
+    4275 /* an exported class was derived from a class that was not exported */)
+
+// This exception is thrown by (and only by) a failed Google Test
+// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
+// are enabled). We derive it from std::runtime_error, which is for
+// errors presumably detectable only at run time. Since
+// std::runtime_error inherits from std::exception, many testing
+// frameworks know how to extract and print the message inside it.
+class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error {
+ public:
+  explicit GoogleTestFailureException(const TestPartResult &failure);
+};
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  // 4275
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+namespace edit_distance {
+// Returns the optimal edits to go from 'left' to 'right'.
+// All edits cost the same, with replace having lower priority than
+// add/remove.
+// Simple implementation of the Wagner-Fischer algorithm.
+// See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm
+enum EditType { kMatch, kAdd, kRemove, kReplace };
+GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<size_t> &left, const std::vector<size_t> &right);
+
+// Same as above, but the input is represented as strings.
+GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<std::string> &left,
+    const std::vector<std::string> &right);
+
+// Create a diff of the input strings in Unified diff format.
+GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string> &left,
+                                         const std::vector<std::string> &right,
+                                         size_t context = 2);
+
+}  // namespace edit_distance
+
+// Calculate the diff between 'left' and 'right' and return it in unified diff
+// format.
+// If not null, stores in 'total_line_count' the total number of lines found
+// in left + right.
+GTEST_API_ std::string DiffStrings(const std::string &left,
+                                   const std::string &right,
+                                   size_t *total_line_count);
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings. For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+// expected_expression: "foo"
+// actual_expression: "bar"
+// expected_value: "5"
+// actual_value: "6"
+//
+// The ignoring_case parameter is true if and only if the assertion is a
+// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will
+// be inserted into the message.
+GTEST_API_ AssertionResult EqFailure(const char *expected_expression,
+                                     const char *actual_expression,
+                                     const std::string &expected_value,
+                                     const std::string &actual_value,
+                                     bool ignoring_case);
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+GTEST_API_ std::string GetBoolAssertionFailureMessage(
+    const AssertionResult &assertion_result, const char *expression_text,
+    const char *actual_predicate_value, const char *expected_predicate_value);
+
+// This template class represents an IEEE floating-point number
+// (either single-precision or double-precision, depending on the
+// template parameters).
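For intuition about CalculateOptimalEdits() above, the underlying Wagner-Fischer recurrence fits in a few lines. A minimal distance-only sketch (gtest additionally recovers the full edit sequence to render diffs):

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Classic Wagner-Fischer dynamic program: d[i][j] is the edit distance
// between the first i chars of 'left' and the first j chars of 'right'.
size_t EditDistance(const std::string &left, const std::string &right) {
  std::vector<std::vector<size_t> > d(left.size() + 1,
                                      std::vector<size_t>(right.size() + 1));
  for (size_t i = 0; i <= left.size(); ++i) d[i][0] = i;
  for (size_t j = 0; j <= right.size(); ++j) d[0][j] = j;
  for (size_t i = 1; i <= left.size(); ++i) {
    for (size_t j = 1; j <= right.size(); ++j) {
      const size_t replace = d[i - 1][j - 1] + (left[i - 1] != right[j - 1]);
      d[i][j] = std::min(replace, std::min(d[i - 1][j] + 1, d[i][j - 1] + 1));
    }
  }
  return d[left.size()][right.size()];
}

int main() { std::cout << EditDistance("kitten", "sitting") << "\n"; }  // 3
```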
+// +// The purpose of this class is to do more sophisticated number +// comparison. (Due to round-off error, etc, it's very unlikely that +// two floating-points will be equal exactly. Hence a naive +// comparison by the == operation often doesn't work.) +// +// Format of IEEE floating-point: +// +// The most-significant bit being the leftmost, an IEEE +// floating-point looks like +// +// sign_bit exponent_bits fraction_bits +// +// Here, sign_bit is a single bit that designates the sign of the +// number. +// +// For float, there are 8 exponent bits and 23 fraction bits. +// +// For double, there are 11 exponent bits and 52 fraction bits. +// +// More details can be found at +// http://en.wikipedia.org/wiki/IEEE_floating-point_standard. +// +// Template parameter: +// +// RawType: the raw floating-point type (either float or double) +template +class FloatingPoint { + public: + // Defines the unsigned integer type that has the same size as the + // floating point number. + typedef typename TypeWithSize::UInt Bits; + + // Constants. + + // # of bits in a number. + static const size_t kBitCount = 8 * sizeof(RawType); + + // # of fraction bits in a number. + static const size_t kFractionBitCount = + std::numeric_limits::digits - 1; + + // # of exponent bits in a number. + static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount; + + // The mask for the sign bit. + static const Bits kSignBitMask = static_cast(1) << (kBitCount - 1); + + // The mask for the fraction bits. + static const Bits kFractionBitMask = ~static_cast(0) >> + (kExponentBitCount + 1); + + // The mask for the exponent bits. + static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask); + + // How many ULP's (Units in the Last Place) we want to tolerate when + // comparing two numbers. The larger the value, the more error we + // allow. A 0 value means that two numbers must be exactly the same + // to be considered equal. + // + // The maximum error of a single floating-point operation is 0.5 + // units in the last place. On Intel CPU's, all floating-point + // calculations are done with 80-bit precision, while double has 64 + // bits. Therefore, 4 should be enough for ordinary use. + // + // See the following article for more details on ULP: + // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ + static const size_t kMaxUlps = 4; + + // Constructs a FloatingPoint from a raw floating-point number. + // + // On an Intel CPU, passing a non-normalized NAN (Not a Number) + // around may change its bits, although the new value is guaranteed + // to be also a NAN. Therefore, don't expect this constructor to + // preserve the bits in x when x is a NAN. + explicit FloatingPoint(const RawType &x) { u_.value_ = x; } + + // Static methods + + // Reinterprets a bit pattern as a floating-point number. + // + // This function is needed to test the AlmostEquals() method. + static RawType ReinterpretBits(const Bits bits) { + FloatingPoint fp(0); + fp.u_.bits_ = bits; + return fp.u_.value_; + } + + // Returns the floating-point number that represent positive infinity. + static RawType Infinity() { return ReinterpretBits(kExponentBitMask); } + + // Returns the maximum representable finite floating-point number. + static RawType Max(); + + // Non-static methods + + // Returns the bits that represents this number. + const Bits &bits() const { return u_.bits_; } + + // Returns the exponent bits of this number. 
+ Bits exponent_bits() const { return kExponentBitMask & u_.bits_; } + + // Returns the fraction bits of this number. + Bits fraction_bits() const { return kFractionBitMask & u_.bits_; } + + // Returns the sign bit of this number. + Bits sign_bit() const { return kSignBitMask & u_.bits_; } + + // Returns true if and only if this is NAN (not a number). + bool is_nan() const { + // It's a NAN if the exponent bits are all ones and the fraction + // bits are not entirely zeros. + return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0); + } + + // Returns true if and only if this number is at most kMaxUlps ULP's away + // from rhs. In particular, this function: + // + // - returns false if either number is (or both are) NAN. + // - treats really large numbers as almost equal to infinity. + // - thinks +0.0 and -0.0 are 0 DLP's apart. + bool AlmostEquals(const FloatingPoint &rhs) const { + // The IEEE standard says that any comparison operation involving + // a NAN must return false. + if (is_nan() || rhs.is_nan()) return false; + + return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) <= + kMaxUlps; + } + + private: + // The data type used to store the actual floating-point number. + union FloatingPointUnion { + RawType value_; // The raw floating-point number. + Bits bits_; // The bits that represent the number. + }; + + // Converts an integer from the sign-and-magnitude representation to + // the biased representation. More precisely, let N be 2 to the + // power of (kBitCount - 1), an integer x is represented by the + // unsigned number x + N. + // + // For instance, + // + // -N + 1 (the most negative number representable using + // sign-and-magnitude) is represented by 1; + // 0 is represented by N; and + // N - 1 (the biggest number representable using + // sign-and-magnitude) is represented by 2N - 1. + // + // Read http://en.wikipedia.org/wiki/Signed_number_representations + // for more details on signed number representations. + static Bits SignAndMagnitudeToBiased(const Bits &sam) { + if (kSignBitMask & sam) { + // sam represents a negative number. + return ~sam + 1; + } else { + // sam represents a positive number. + return kSignBitMask | sam; + } + } + + // Given two numbers in the sign-and-magnitude representation, + // returns the distance between them as an unsigned number. + static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1, + const Bits &sam2) { + const Bits biased1 = SignAndMagnitudeToBiased(sam1); + const Bits biased2 = SignAndMagnitudeToBiased(sam2); + return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1); + } + + FloatingPointUnion u_; +}; + +// We cannot use std::numeric_limits::max() as it clashes with the max() +// macro defined by . +template <> +inline float FloatingPoint::Max() { + return FLT_MAX; +} +template <> +inline double FloatingPoint::Max() { + return DBL_MAX; +} + +// Typedefs the instances of the FloatingPoint template class that we +// care to use. +typedef FloatingPoint Float; +typedef FloatingPoint Double; + +// In order to catch the mistake of putting tests that use different +// test fixture classes in the same test suite, we need to assign +// unique IDs to fixture classes and compare them. The TypeId type is +// used to hold such IDs. The user should treat TypeId as an opaque +// type: the only operation allowed on TypeId values is to compare +// them for equality using the == operator. 
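The sign-and-magnitude-to-biased mapping and the ULP distance used by AlmostEquals() above can be sketched standalone for float. This is a minimal illustration using std::memcpy for the bit reinterpretation, not the gtest class:

```cpp
#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>

// Maps a sign-and-magnitude bit pattern to a biased (monotonic) integer,
// exactly as SignAndMagnitudeToBiased() above describes.
uint32_t BiasedFromSignAndMagnitude(uint32_t sam) {
  const uint32_t kSignBit = 1u << 31;
  return (sam & kSignBit) ? ~sam + 1 : kSignBit | sam;
}

uint32_t UlpDistance(float a, float b) {
  uint32_t ba, bb;
  std::memcpy(&ba, &a, sizeof(a));  // well-defined bit reinterpretation
  std::memcpy(&bb, &b, sizeof(b));
  const uint32_t x = BiasedFromSignAndMagnitude(ba);
  const uint32_t y = BiasedFromSignAndMagnitude(bb);
  return x >= y ? x - y : y - x;
}

int main() {
  const float a = 1.0f;
  const float b = std::nextafterf(a, 2.0f);       // one representable step up
  std::cout << UlpDistance(a, b) << "\n";         // prints 1
  std::cout << UlpDistance(0.0f, -0.0f) << "\n";  // prints 0
}
```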
+typedef const void *TypeId;
+
+template <typename T>
+class TypeIdHelper {
+ public:
+  // dummy_ must not have a const type. Otherwise an overly eager
+  // compiler (e.g. MSVC 7.1 & 8.0) may try to merge
+  // TypeIdHelper<T>::dummy_ for different Ts as an "optimization".
+  static bool dummy_;
+};
+
+template <typename T>
+bool TypeIdHelper<T>::dummy_ = false;
+
+// GetTypeId<T>() returns the ID of type T. Different values will be
+// returned for different types. Calling the function twice with the
+// same type argument is guaranteed to return the same ID.
+template <typename T>
+TypeId GetTypeId() {
+  // The compiler is required to allocate a different
+  // TypeIdHelper<T>::dummy_ variable for each T used to instantiate
+  // the template. Therefore, the address of dummy_ is guaranteed to
+  // be unique.
+  return &(TypeIdHelper<T>::dummy_);
+}
+
+// Returns the type ID of ::testing::Test. Always call this instead
+// of GetTypeId< ::testing::Test>() to get the type ID of
+// ::testing::Test, as the latter may give the wrong result due to a
+// suspected linker bug when compiling Google Test as a Mac OS X
+// framework.
+GTEST_API_ TypeId GetTestTypeId();
+
+// Defines the abstract factory interface that creates instances
+// of a Test object.
+class TestFactoryBase {
+ public:
+  virtual ~TestFactoryBase() {}
+
+  // Creates a test instance to run. The instance is both created and destroyed
+  // within TestInfoImpl::Run()
+  virtual Test *CreateTest() = 0;
+
+ protected:
+  TestFactoryBase() {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase);
+};
+
+// This class provides an implementation of the TestFactoryBase interface.
+// It is used in TEST and TEST_F macros.
+template <class TestClass>
+class TestFactoryImpl : public TestFactoryBase {
+ public:
+  Test *CreateTest() override { return new TestClass; }
+};
+
+#if GTEST_OS_WINDOWS
+
+// Predicate-formatters for implementing the HRESULT checking macros
+// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}
+// We pass a long instead of HRESULT to avoid causing an
+// include dependency for the HRESULT type.
+GTEST_API_ AssertionResult IsHRESULTSuccess(const char *expr,
+                                            long hr);  // NOLINT
+GTEST_API_ AssertionResult IsHRESULTFailure(const char *expr,
+                                            long hr);  // NOLINT
+
+#endif  // GTEST_OS_WINDOWS
+
+// Types of SetUpTestSuite() and TearDownTestSuite() functions.
+using SetUpTestSuiteFunc = void (*)();
+using TearDownTestSuiteFunc = void (*)();
+
+struct CodeLocation {
+  CodeLocation(const std::string &a_file, int a_line)
+      : file(a_file), line(a_line) {}
+
+  std::string file;
+  int line;
+};
+
+// Helper to identify which setup function for TestCase / TestSuite to call.
+// Only one function is allowed, either TestCase or TestSuite but not both.
+
+// Utility functions to help SuiteApiResolver
+using SetUpTearDownSuiteFuncType = void (*)();
+
+inline SetUpTearDownSuiteFuncType GetNotDefaultOrNull(
+    SetUpTearDownSuiteFuncType a, SetUpTearDownSuiteFuncType def) {
+  return a == def ? nullptr : a;
+}
+
+template <typename T>
+// Note that SuiteApiResolver inherits from T because
+// SetUpTestSuite()/TearDownTestSuite() could be protected. This way
+// SuiteApiResolver can access them.
+struct SuiteApiResolver : T {
+  // testing::Test is only forward declared at this point. So we make it a
+  // dependent class for the compiler to be OK with it.
+ using Test = + typename std::conditional::type; + + static SetUpTearDownSuiteFuncType GetSetUpCaseOrSuite(const char *filename, + int line_num) { + SetUpTearDownSuiteFuncType test_case_fp = + GetNotDefaultOrNull(&T::SetUpTestCase, &Test::SetUpTestCase); + SetUpTearDownSuiteFuncType test_suite_fp = + GetNotDefaultOrNull(&T::SetUpTestSuite, &Test::SetUpTestSuite); + + GTEST_CHECK_(!test_case_fp || !test_suite_fp) + << "Test can not provide both SetUpTestSuite and SetUpTestCase, please " + "make sure there is only one present at " + << filename << ":" << line_num; + + return test_case_fp != nullptr ? test_case_fp : test_suite_fp; + } + + static SetUpTearDownSuiteFuncType GetTearDownCaseOrSuite(const char *filename, + int line_num) { + SetUpTearDownSuiteFuncType test_case_fp = + GetNotDefaultOrNull(&T::TearDownTestCase, &Test::TearDownTestCase); + SetUpTearDownSuiteFuncType test_suite_fp = + GetNotDefaultOrNull(&T::TearDownTestSuite, &Test::TearDownTestSuite); + + GTEST_CHECK_(!test_case_fp || !test_suite_fp) + << "Test can not provide both TearDownTestSuite and TearDownTestCase," + " please make sure there is only one present at" + << filename << ":" << line_num; + + return test_case_fp != nullptr ? test_case_fp : test_suite_fp; + } +}; + +// Creates a new TestInfo object and registers it with Google Test; +// returns the created object. +// +// Arguments: +// +// test_suite_name: name of the test suite +// name: name of the test +// type_param the name of the test's type parameter, or NULL if +// this is not a typed or a type-parameterized test. +// value_param text representation of the test's value parameter, +// or NULL if this is not a type-parameterized test. +// code_location: code location where the test is defined +// fixture_class_id: ID of the test fixture class +// set_up_tc: pointer to the function that sets up the test suite +// tear_down_tc: pointer to the function that tears down the test suite +// factory: pointer to the factory that creates a test object. +// The newly created TestInfo instance will assume +// ownership of the factory object. +GTEST_API_ TestInfo *MakeAndRegisterTestInfo( + const char *test_suite_name, const char *name, const char *type_param, + const char *value_param, CodeLocation code_location, + TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc, + TearDownTestSuiteFunc tear_down_tc, TestFactoryBase *factory); + +// If *pstr starts with the given prefix, modifies *pstr to be right +// past the prefix and returns true; otherwise leaves *pstr unchanged +// and returns false. None of pstr, *pstr, and prefix can be NULL. +GTEST_API_ bool SkipPrefix(const char *prefix, const char **pstr); + +#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +// State of the definition of a type-parameterized test suite. +class GTEST_API_ TypedTestSuitePState { + public: + TypedTestSuitePState() : registered_(false) {} + + // Adds the given test name to defined_test_names_ and return true + // if the test suite hasn't been registered; otherwise aborts the + // program. 
+ bool AddTestName(const char *file, int line, const char *case_name, + const char *test_name) { + if (registered_) { + fprintf(stderr, + "%s Test %s must be defined before " + "REGISTER_TYPED_TEST_SUITE_P(%s, ...).\n", + FormatFileLocation(file, line).c_str(), test_name, case_name); + fflush(stderr); + posix::Abort(); + } + registered_tests_.insert( + ::std::make_pair(test_name, CodeLocation(file, line))); + return true; + } + + bool TestExists(const std::string &test_name) const { + return registered_tests_.count(test_name) > 0; + } + + const CodeLocation &GetCodeLocation(const std::string &test_name) const { + RegisteredTestsMap::const_iterator it = registered_tests_.find(test_name); + GTEST_CHECK_(it != registered_tests_.end()); + return it->second; + } + + // Verifies that registered_tests match the test names in + // defined_test_names_; returns registered_tests if successful, or + // aborts the program otherwise. + const char *VerifyRegisteredTestNames(const char *test_suite_name, + const char *file, int line, + const char *registered_tests); + + private: + typedef ::std::map RegisteredTestsMap; + + bool registered_; + RegisteredTestsMap registered_tests_; +}; + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +using TypedTestCasePState = TypedTestSuitePState; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +// Skips to the first non-space char after the first comma in 'str'; +// returns NULL if no comma is found in 'str'. +inline const char *SkipComma(const char *str) { + const char *comma = strchr(str, ','); + if (comma == nullptr) { + return nullptr; + } + while (IsSpace(*(++comma))) { + } + return comma; +} + +// Returns the prefix of 'str' before the first comma in it; returns +// the entire string if it contains no comma. +inline std::string GetPrefixUntilComma(const char *str) { + const char *comma = strchr(str, ','); + return comma == nullptr ? str : std::string(str, comma); +} + +// Splits a given string on a given delimiter, populating a given +// vector with the fields. +void SplitString(const ::std::string &str, char delimiter, + ::std::vector<::std::string> *dest); + +// The default argument to the template below for the case when the user does +// not provide a name generator. +struct DefaultNameGenerator { + template + static std::string GetName(int i) { + return StreamableToString(i); + } +}; + +template +struct NameGeneratorSelector { + typedef Provided type; +}; + +template +void GenerateNamesRecursively(internal::None, std::vector *, int) { +} + +template +void GenerateNamesRecursively(Types, std::vector *result, int i) { + result->push_back(NameGenerator::template GetName(i)); + GenerateNamesRecursively(typename Types::Tail(), result, + i + 1); +} + +template +std::vector GenerateNames() { + std::vector result; + GenerateNamesRecursively(Types(), &result, 0); + return result; +} + +// TypeParameterizedTest::Register() +// registers a list of type-parameterized tests with Google Test. The +// return value is insignificant - we just need to return something +// such that we can call this function in a namespace scope. +// +// Implementation note: The GTEST_TEMPLATE_ macro declares a template +// template parameter. It's defined in gtest-type-util.h. +template +class TypeParameterizedTest { + public: + // 'index' is the index of the test in the type list 'Types' + // specified in INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, TestSuite, + // Types). 
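SkipComma() and GetPrefixUntilComma() above drive how a REGISTER_TYPED_TEST_SUITE_P comma-separated name list is consumed one test at a time. A standalone sketch of that walk, using a plain space check where gtest uses IsSpace():

```cpp
#include <cstring>
#include <iostream>
#include <string>

// Stand-ins for the helpers above: peel "DoesFoo, DoesBar, DoesBaz"
// into individual names.
std::string PrefixUntilComma(const char *str) {
  const char *comma = std::strchr(str, ',');
  return comma == nullptr ? std::string(str) : std::string(str, comma);
}

const char *SkipPastComma(const char *str) {
  const char *comma = std::strchr(str, ',');
  if (comma == nullptr) return nullptr;
  while (*(++comma) == ' ') {
  }
  return comma;
}

int main() {
  for (const char *names = "DoesFoo, DoesBar, DoesBaz"; names != nullptr;
       names = SkipPastComma(names)) {
    std::cout << PrefixUntilComma(names) << "\n";
  }
}
```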
Valid values for 'index' are [0, N - 1] where N is the + // length of Types. + static bool Register(const char *prefix, const CodeLocation &code_location, + const char *case_name, const char *test_names, int index, + const std::vector &type_names = + GenerateNames()) { + typedef typename Types::Head Type; + typedef Fixture FixtureClass; + typedef typename GTEST_BIND_(TestSel, Type) TestClass; + + // First, registers the first type-parameterized test in the type + // list. + MakeAndRegisterTestInfo( + (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + + "/" + type_names[static_cast(index)]) + .c_str(), + StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(), + GetTypeName().c_str(), + nullptr, // No value parameter. + code_location, GetTypeId(), + SuiteApiResolver::GetSetUpCaseOrSuite( + code_location.file.c_str(), code_location.line), + SuiteApiResolver::GetTearDownCaseOrSuite( + code_location.file.c_str(), code_location.line), + new TestFactoryImpl); + + // Next, recurses (at compile time) with the tail of the type list. + return TypeParameterizedTest::Register(prefix, + code_location, + case_name, + test_names, + index + 1, + type_names); + } +}; + +// The base case for the compile time recursion. +template +class TypeParameterizedTest { + public: + static bool Register(const char * /*prefix*/, const CodeLocation &, + const char * /*case_name*/, const char * /*test_names*/, + int /*index*/, + const std::vector & = + std::vector() /*type_names*/) { + return true; + } +}; + +GTEST_API_ void RegisterTypeParameterizedTestSuite(const char *test_suite_name, + CodeLocation code_location); +GTEST_API_ void RegisterTypeParameterizedTestSuiteInstantiation( + const char *case_name); + +// TypeParameterizedTestSuite::Register() +// registers *all combinations* of 'Tests' and 'Types' with Google +// Test. The return value is insignificant - we just need to return +// something such that we can call this function in a namespace scope. +template +class TypeParameterizedTestSuite { + public: + static bool Register(const char *prefix, CodeLocation code_location, + const TypedTestSuitePState *state, const char *case_name, + const char *test_names, + const std::vector &type_names = + GenerateNames()) { + RegisterTypeParameterizedTestSuiteInstantiation(case_name); + std::string test_name = + StripTrailingSpaces(GetPrefixUntilComma(test_names)); + if (!state->TestExists(test_name)) { + fprintf(stderr, "Failed to get code location for test %s.%s at %s.", + case_name, test_name.c_str(), + FormatFileLocation(code_location.file.c_str(), code_location.line) + .c_str()); + fflush(stderr); + posix::Abort(); + } + const CodeLocation &test_location = state->GetCodeLocation(test_name); + + typedef typename Tests::Head Head; + + // First, register the first test in 'Test' for each type in 'Types'. + TypeParameterizedTest::Register( + prefix, test_location, case_name, test_names, 0, type_names); + + // Next, recurses (at compile time) with the tail of the test list. + return TypeParameterizedTestSuite::Register(prefix, code_location, + state, case_name, + SkipComma(test_names), + type_names); + } +}; + +// The base case for the compile time recursion. 
+template +class TypeParameterizedTestSuite { + public: + static bool Register(const char * /*prefix*/, const CodeLocation &, + const TypedTestSuitePState * /*state*/, + const char * /*case_name*/, const char * /*test_names*/, + const std::vector & = + std::vector() /*type_names*/) { + return true; + } +}; + +#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in +// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. +GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(UnitTest *unit_test, + int skip_count); + +// Helpers for suppressing warnings on unreachable code or constant +// condition. + +// Always returns true. +GTEST_API_ bool AlwaysTrue(); + +// Always returns false. +inline bool AlwaysFalse() { return !AlwaysTrue(); } + +// Helper for suppressing false warning from Clang on a const char* +// variable declared in a conditional expression always being NULL in +// the else branch. +struct GTEST_API_ ConstCharPtr { + ConstCharPtr(const char *str) : value(str) {} + operator bool() const { return true; } + const char *value; +}; + +// Helper for declaring std::string within 'if' statement +// in pre C++17 build environment. +struct TrueWithString { + TrueWithString() = default; + explicit TrueWithString(const char *str) : value(str) {} + explicit TrueWithString(const std::string &str) : value(str) {} + explicit operator bool() const { return true; } + std::string value; +}; + +// A simple Linear Congruential Generator for generating random +// numbers with a uniform distribution. Unlike rand() and srand(), it +// doesn't use global state (and therefore can't interfere with user +// code). Unlike rand_r(), it's portable. An LCG isn't very random, +// but it's good enough for our purposes. +class GTEST_API_ Random { + public: + static const uint32_t kMaxRange = 1u << 31; + + explicit Random(uint32_t seed) : state_(seed) {} + + void Reseed(uint32_t seed) { state_ = seed; } + + // Generates a random number from [0, range). Crashes if 'range' is + // 0 or greater than kMaxRange. + uint32_t Generate(uint32_t range); + + private: + uint32_t state_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(Random); +}; + +// Turns const U&, U&, const U, and U all into U. +#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \ + typename std::remove_const::type>::type + +// IsAProtocolMessage::value is a compile-time bool constant that's +// true if and only if T is type proto2::Message or a subclass of it. +template +struct IsAProtocolMessage + : public std::is_convertible {}; + +// When the compiler sees expression IsContainerTest(0), if C is an +// STL-style container class, the first overload of IsContainerTest +// will be viable (since both C::iterator* and C::const_iterator* are +// valid types and NULL can be implicitly converted to them). It will +// be picked over the second overload as 'int' is a perfect match for +// the type of argument 0. If C::iterator or C::const_iterator is not +// a valid type, the first overload is not viable, and the second +// overload will be picked. 
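The Random class above is only declared here; its Generate() lives elsewhere in gtest. As a rough illustration, a linear congruential generator with the classic rand()-style constants behaves like this (the multiplier and increment below are assumptions for the sketch, not gtest's actual constants):

```cpp
#include <cstdint>
#include <iostream>

// Self-contained LCG in the same spirit as internal::Random: no global
// state, portable, "random enough" for shuffling tests.
class Lcg {
 public:
  static const uint32_t kMaxRange = 1u << 31;
  explicit Lcg(uint32_t seed) : state_(seed) {}
  // Callers must keep 0 < range <= kMaxRange, as Random::Generate demands.
  uint32_t Generate(uint32_t range) {
    state_ = 1103515245u * state_ + 12345u;  // classic LCG step (assumed)
    return (state_ % kMaxRange) % range;
  }

 private:
  uint32_t state_;
};

int main() {
  Lcg rng(42);
  for (int i = 0; i < 3; ++i) std::cout << rng.Generate(100) << " ";
  std::cout << "\n";
}
```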
Therefore, we can determine whether C is +// a container class by checking the type of IsContainerTest(0). +// The value of the expression is insignificant. +// +// In C++11 mode we check the existence of a const_iterator and that an +// iterator is properly implemented for the container. +// +// For pre-C++11 that we look for both C::iterator and C::const_iterator. +// The reason is that C++ injects the name of a class as a member of the +// class itself (e.g. you can refer to class iterator as either +// 'iterator' or 'iterator::iterator'). If we look for C::iterator +// only, for example, we would mistakenly think that a class named +// iterator is an STL container. +// +// Also note that the simpler approach of overloading +// IsContainerTest(typename C::const_iterator*) and +// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++. +typedef int IsContainer; +template ().begin()), + class = decltype(::std::declval().end()), + class = decltype(++::std::declval()), + class = decltype(*::std::declval()), + class = typename C::const_iterator> +IsContainer IsContainerTest(int /* dummy */) { + return 0; +} + +typedef char IsNotContainer; +template +IsNotContainer IsContainerTest(long /* dummy */) { + return '\0'; +} + +// Trait to detect whether a type T is a hash table. +// The heuristic used is that the type contains an inner type `hasher` and does +// not contain an inner type `reverse_iterator`. +// If the container is iterable in reverse, then order might actually matter. +template +struct IsHashTable { + private: + template + static char test(typename U::hasher *, typename U::reverse_iterator *); + template + static int test(typename U::hasher *, ...); + template + static char test(...); + + public: + static const bool value = sizeof(test(nullptr, nullptr)) == sizeof(int); +}; + +template +const bool IsHashTable::value; + +template (0)) == sizeof(IsContainer)> +struct IsRecursiveContainerImpl; + +template +struct IsRecursiveContainerImpl : public std::false_type {}; + +// Since the IsRecursiveContainerImpl depends on the IsContainerTest we need to +// obey the same inconsistencies as the IsContainerTest, namely check if +// something is a container is relying on only const_iterator in C++11 and +// is relying on both const_iterator and iterator otherwise +template +struct IsRecursiveContainerImpl { + using value_type = decltype(*std::declval()); + using type = + std::is_same::type>::type, + C>; +}; + +// IsRecursiveContainer is a unary compile-time predicate that +// evaluates whether C is a recursive container type. A recursive container +// type is a container type whose value_type is equal to the container type +// itself. An example for a recursive container type is +// boost::filesystem::path, whose iterator has a value_type that is equal to +// boost::filesystem::path. +template +struct IsRecursiveContainer : public IsRecursiveContainerImpl::type {}; + +// Utilities for native arrays. + +// ArrayEq() compares two k-dimensional native arrays using the +// elements' operator==, where k can be any integer >= 0. When k is +// 0, ArrayEq() degenerates into comparing a single pair of values. + +template +bool ArrayEq(const T *lhs, size_t size, const U *rhs); + +// This generic version is used when k is 0. +template +inline bool ArrayEq(const T &lhs, const U &rhs) { + return lhs == rhs; +} + +// This overload is used when k >= 1. 
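The ArrayEq() overload set described above peels one array dimension per recursion step until it reaches scalar comparison. An isolated sketch of the same pattern:

```cpp
#include <cstddef>
#include <iostream>

// Base case: scalar comparison via operator==.
template <typename T, typename U>
bool Eq(const T &lhs, const U &rhs) {
  return lhs == rhs;
}

// Array case: the more specialized overload recurses element-wise,
// peeling one dimension per call.
template <typename T, typename U, size_t N>
bool Eq(const T (&lhs)[N], const U (&rhs)[N]) {
  for (size_t i = 0; i != N; i++) {
    if (!Eq(lhs[i], rhs[i])) return false;
  }
  return true;
}

int main() {
  int a[2][3] = {{1, 2, 3}, {4, 5, 6}};
  int b[2][3] = {{1, 2, 3}, {4, 5, 6}};
  std::cout << Eq(a, b) << "\n";  // 1
}
```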
+template +inline bool ArrayEq(const T (&lhs)[N], const U (&rhs)[N]) { + return internal::ArrayEq(lhs, N, rhs); +} + +// This helper reduces code bloat. If we instead put its logic inside +// the previous ArrayEq() function, arrays with different sizes would +// lead to different copies of the template code. +template +bool ArrayEq(const T *lhs, size_t size, const U *rhs) { + for (size_t i = 0; i != size; i++) { + if (!internal::ArrayEq(lhs[i], rhs[i])) return false; + } + return true; +} + +// Finds the first element in the iterator range [begin, end) that +// equals elem. Element may be a native array type itself. +template +Iter ArrayAwareFind(Iter begin, Iter end, const Element &elem) { + for (Iter it = begin; it != end; ++it) { + if (internal::ArrayEq(*it, elem)) return it; + } + return end; +} + +// CopyArray() copies a k-dimensional native array using the elements' +// operator=, where k can be any integer >= 0. When k is 0, +// CopyArray() degenerates into copying a single value. + +template +void CopyArray(const T *from, size_t size, U *to); + +// This generic version is used when k is 0. +template +inline void CopyArray(const T &from, U *to) { + *to = from; +} + +// This overload is used when k >= 1. +template +inline void CopyArray(const T (&from)[N], U (*to)[N]) { + internal::CopyArray(from, N, *to); +} + +// This helper reduces code bloat. If we instead put its logic inside +// the previous CopyArray() function, arrays with different sizes +// would lead to different copies of the template code. +template +void CopyArray(const T *from, size_t size, U *to) { + for (size_t i = 0; i != size; i++) { + internal::CopyArray(from[i], to + i); + } +} + +// The relation between an NativeArray object (see below) and the +// native array it represents. +// We use 2 different structs to allow non-copyable types to be used, as long +// as RelationToSourceReference() is passed. +struct RelationToSourceReference {}; +struct RelationToSourceCopy {}; + +// Adapts a native array to a read-only STL-style container. Instead +// of the complete STL container concept, this adaptor only implements +// members useful for Google Mock's container matchers. New members +// should be added as needed. To simplify the implementation, we only +// support Element being a raw type (i.e. having no top-level const or +// reference modifier). It's the client's responsibility to satisfy +// this requirement. Element can be an array type itself (hence +// multi-dimensional arrays are supported). +template +class NativeArray { + public: + // STL-style container typedefs. + typedef Element value_type; + typedef Element *iterator; + typedef const Element *const_iterator; + + // Constructs from a native array. References the source. + NativeArray(const Element *array, size_t count, RelationToSourceReference) { + InitRef(array, count); + } + + // Constructs from a native array. Copies the source. + NativeArray(const Element *array, size_t count, RelationToSourceCopy) { + InitCopy(array, count); + } + + // Copy constructor. + NativeArray(const NativeArray &rhs) { + (this->*rhs.clone_)(rhs.array_, rhs.size_); + } + + ~NativeArray() { + if (clone_ != &NativeArray::InitRef) delete[] array_; + } + + // STL-style container methods. 
+ size_t size() const { return size_; } + const_iterator begin() const { return array_; } + const_iterator end() const { return array_ + size_; } + bool operator==(const NativeArray &rhs) const { + return size() == rhs.size() && ArrayEq(begin(), size(), rhs.begin()); + } + + private: + static_assert(!std::is_const::value, "Type must not be const"); + static_assert(!std::is_reference::value, + "Type must not be a reference"); + + // Initializes this object with a copy of the input. + void InitCopy(const Element *array, size_t a_size) { + Element *const copy = new Element[a_size]; + CopyArray(array, a_size, copy); + array_ = copy; + size_ = a_size; + clone_ = &NativeArray::InitCopy; + } + + // Initializes this object with a reference of the input. + void InitRef(const Element *array, size_t a_size) { + array_ = array; + size_ = a_size; + clone_ = &NativeArray::InitRef; + } + + const Element *array_; + size_t size_; + void (NativeArray::*clone_)(const Element *, size_t); + + GTEST_DISALLOW_ASSIGN_(NativeArray); +}; + +// Backport of std::index_sequence. +template +struct IndexSequence { + using type = IndexSequence; +}; + +// Double the IndexSequence, and one if plus_one is true. +template +struct DoubleSequence; +template +struct DoubleSequence, sizeofT> { + using type = IndexSequence; +}; +template +struct DoubleSequence, sizeofT> { + using type = IndexSequence; +}; + +// Backport of std::make_index_sequence. +// It uses O(ln(N)) instantiation depth. +template +struct MakeIndexSequence + : DoubleSequence::type, + N / 2>::type {}; + +template <> +struct MakeIndexSequence<0> : IndexSequence<> {}; + +template +struct Ignore { + Ignore(...); // NOLINT +}; + +template +struct ElemFromListImpl; +template +struct ElemFromListImpl> { + // We make Ignore a template to solve a problem with MSVC. + // A non-template Ignore would work fine with `decltype(Ignore(I))...`, but + // MSVC doesn't understand how to deal with that pack expansion. + // Use `0 * I` to have a single instantiation of Ignore. + template + static R Apply(Ignore<0 * I>..., R (*)(), ...); +}; + +template +struct ElemFromList { + using type = + decltype(ElemFromListImpl::type>::Apply( + static_cast(nullptr)...)); +}; + +template +class FlatTuple; + +template +struct FlatTupleElemBase; + +template +struct FlatTupleElemBase, I> { + using value_type = typename ElemFromList::type; + FlatTupleElemBase() = default; + explicit FlatTupleElemBase(value_type t) : value(std::move(t)) {} + value_type value; +}; + +template +struct FlatTupleBase; + +template +struct FlatTupleBase, IndexSequence> + : FlatTupleElemBase, Idx>... { + using Indices = IndexSequence; + FlatTupleBase() = default; + explicit FlatTupleBase(T... t) + : FlatTupleElemBase, Idx>(std::move(t))... {} +}; + +// Analog to std::tuple but with different tradeoffs. +// This class minimizes the template instantiation depth, thus allowing more +// elements than std::tuple would. std::tuple has been seen to require an +// instantiation depth of more than 10x the number of elements in some +// implementations. +// FlatTuple and ElemFromList are not recursive and have a fixed depth +// regardless of T... +// MakeIndexSequence, on the other hand, it is recursive but with an +// instantiation depth of O(ln(N)). +template +class FlatTuple + : private FlatTupleBase, + typename MakeIndexSequence::type> { + using Indices = typename FlatTupleBase< + FlatTuple, typename MakeIndexSequence::type>::Indices; + + public: + FlatTuple() = default; + explicit FlatTuple(T... 
t) : FlatTuple::FlatTupleBase(std::move(t)...) {} + + template + const typename ElemFromList::type &Get() const { + return static_cast *>(this)->value; + } + + template + typename ElemFromList::type &Get() { + return static_cast *>(this)->value; + } +}; + +// Utility functions to be called with static_assert to induce deprecation +// warnings. +GTEST_INTERNAL_DEPRECATED( + "INSTANTIATE_TEST_CASE_P is deprecated, please use " + "INSTANTIATE_TEST_SUITE_P") +constexpr bool InstantiateTestCase_P_IsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "TYPED_TEST_CASE_P is deprecated, please use " + "TYPED_TEST_SUITE_P") +constexpr bool TypedTestCase_P_IsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "TYPED_TEST_CASE is deprecated, please use " + "TYPED_TEST_SUITE") +constexpr bool TypedTestCaseIsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "REGISTER_TYPED_TEST_CASE_P is deprecated, please use " + "REGISTER_TYPED_TEST_SUITE_P") +constexpr bool RegisterTypedTestCase_P_IsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "INSTANTIATE_TYPED_TEST_CASE_P is deprecated, please use " + "INSTANTIATE_TYPED_TEST_SUITE_P") +constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } + +} // namespace internal +} // namespace testing + +#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ + ::testing::internal::AssertHelper(result_type, file, line, message) = \ + ::testing::Message() + +#define GTEST_MESSAGE_(message, result_type) \ + GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type) + +#define GTEST_FATAL_FAILURE_(message) \ + return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure) + +#define GTEST_NONFATAL_FAILURE_(message) \ + GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure) + +#define GTEST_SUCCESS_(message) \ + GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess) + +#define GTEST_SKIP_(message) \ + return GTEST_MESSAGE_(message, ::testing::TestPartResult::kSkip) + +// Suppress MSVC warning 4072 (unreachable code) for the code following +// statement if it returns or throws (or doesn't return or throw in some +// situations). +#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \ + if (::testing::internal::AlwaysTrue()) { \ + statement; \ + } + +#define GTEST_TEST_THROW_(statement, expected_exception, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::ConstCharPtr gtest_msg = "") { \ + bool gtest_caught_expected = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (expected_exception const &) { \ + gtest_caught_expected = true; \ + } catch (...) 
{ \ + gtest_msg.value = "Expected: " #statement \ + " throws an exception of type " #expected_exception \ + ".\n Actual: it throws a different type."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + if (!gtest_caught_expected) { \ + gtest_msg.value = "Expected: " #statement \ + " throws an exception of type " #expected_exception \ + ".\n Actual: it throws nothing."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__) \ + : fail(gtest_msg.value) + +#if GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ + catch (std::exception const &e) { \ + gtest_msg.value = \ + ("it throws std::exception-derived exception with description: \""); \ + gtest_msg.value += e.what(); \ + gtest_msg.value += "\"."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } + +#else // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() + +#endif // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_NO_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::TrueWithString gtest_msg{}) { \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ + catch (...) { \ + gtest_msg.value = "it throws."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__) \ + : fail(("Expected: " #statement " doesn't throw an exception.\n" \ + " Actual: " + \ + gtest_msg.value) \ + .c_str()) + +#define GTEST_TEST_ANY_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + bool gtest_caught_any = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (...) { \ + gtest_caught_any = true; \ + } \ + if (!gtest_caught_any) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__) \ + : fail("Expected: " #statement \ + " throws an exception.\n" \ + " Actual: it doesn't.") + +// Implements Boolean test assertions such as EXPECT_TRUE. expression can be +// either a boolean expression or an AssertionResult. text is a textual +// represenation of expression as it was passed into the EXPECT_TRUE. +#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar_ = \ + ::testing::AssertionResult(expression)) \ + ; \ + else \ + fail(::testing::internal::GetBoolAssertionFailureMessage( \ + gtest_ar_, text, #actual, #expected) \ + .c_str()) + +#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__) \ + : fail("Expected: " #statement \ + " doesn't generate new fatal " \ + "failures in the current thread.\n" \ + " Actual: it does.") + +// Expands to the name of the class that implements the given test. 
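The naming scheme implemented by the macro defined just below is simple token pasting. A stand-in macro mirroring GTEST_TEST_CLASS_NAME_ shows the resulting identifier:

```cpp
#include <iostream>

// GTEST_TEST_CLASS_NAME_(Suite, Name)-style pasting: the two identifiers
// are joined with an underscore and given a _Test suffix.
#define TEST_CLASS_NAME(suite, name) suite##_##name##_Test

class TEST_CLASS_NAME(FooSuite, DoesBar) {};  // class FooSuite_DoesBar_Test

int main() {
  FooSuite_DoesBar_Test instance;
  (void)instance;
  std::cout << "FooSuite_DoesBar_Test defined\n";
}
```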
+#define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + test_suite_name##_##test_name##_Test + +// Helper macro for defining tests. +#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \ + static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \ + "test_suite_name must not be empty"); \ + static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \ + "test_name must not be empty"); \ + class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + : public parent_class { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {} \ + ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \ + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)); \ + GTEST_DISALLOW_MOVE_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)); \ + \ + private: \ + void TestBody() override; \ + static ::testing::TestInfo *const test_info_ GTEST_ATTRIBUTE_UNUSED_; \ + }; \ + \ + ::testing::TestInfo *const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)::test_info_ = \ + ::testing::internal::MakeAndRegisterTestInfo( \ + #test_suite_name, #test_name, nullptr, nullptr, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \ + ::testing::internal::SuiteApiResolver< \ + parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \ + ::testing::internal::SuiteApiResolver< \ + parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \ + new ::testing::internal::TestFactoryImpl); \ + void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody() + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h new file mode 100644 index 000000000..0d8fc71ce --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h @@ -0,0 +1,922 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Type and function utilities for implementing parameterized tests. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" +#include "gtest/gtest-printers.h" +#include "gtest/gtest-test-part.h" + +namespace testing { +// Input to a parameterized test name generator, describing a test parameter. +// Consists of the parameter value and the integer parameter index. +template +struct TestParamInfo { + TestParamInfo(const ParamType &a_param, size_t an_index) + : param(a_param), index(an_index) {} + ParamType param; + size_t index; +}; + +// A builtin parameterized test name generator which returns the result of +// testing::PrintToString. +struct PrintToStringParamName { + template + std::string operator()(const TestParamInfo &info) const { + return PrintToString(info.param); + } +}; + +namespace internal { + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// Utility Functions + +// Outputs a message explaining invalid registration of different +// fixture class for the same test suite. This may happen when +// TEST_P macro is used to define two tests with the same name +// but in different namespaces. +GTEST_API_ void ReportInvalidTestSuiteType(const char *test_suite_name, + CodeLocation code_location); + +template +class ParamGeneratorInterface; +template +class ParamGenerator; + +// Interface for iterating over elements provided by an implementation +// of ParamGeneratorInterface. +template +class ParamIteratorInterface { + public: + virtual ~ParamIteratorInterface() {} + // A pointer to the base generator instance. + // Used only for the purposes of iterator comparison + // to make sure that two iterators belong to the same generator. + virtual const ParamGeneratorInterface *BaseGenerator() const = 0; + // Advances iterator to point to the next element + // provided by the generator. The caller is responsible + // for not calling Advance() on an iterator equal to + // BaseGenerator()->End(). + virtual void Advance() = 0; + // Clones the iterator object. Used for implementing copy semantics + // of ParamIterator. + virtual ParamIteratorInterface *Clone() const = 0; + // Dereferences the current iterator and provides (read-only) access + // to the pointed value. It is the caller's responsibility not to call + // Current() on an iterator equal to BaseGenerator()->End(). + // Used for implementing ParamGenerator::operator*(). + virtual const T *Current() const = 0; + // Determines whether the given iterator and other point to the same + // element in the sequence generated by the generator. + // Used for implementing ParamGenerator::operator==(). 
+ virtual bool Equals(const ParamIteratorInterface &other) const = 0; +}; + +// Class iterating over elements provided by an implementation of +// ParamGeneratorInterface. It wraps ParamIteratorInterface +// and implements the const forward iterator concept. +template +class ParamIterator { + public: + typedef T value_type; + typedef const T &reference; + typedef ptrdiff_t difference_type; + + // ParamIterator assumes ownership of the impl_ pointer. + ParamIterator(const ParamIterator &other) : impl_(other.impl_->Clone()) {} + ParamIterator &operator=(const ParamIterator &other) { + if (this != &other) impl_.reset(other.impl_->Clone()); + return *this; + } + + const T &operator*() const { return *impl_->Current(); } + const T *operator->() const { return impl_->Current(); } + // Prefix version of operator++. + ParamIterator &operator++() { + impl_->Advance(); + return *this; + } + // Postfix version of operator++. + ParamIterator operator++(int /*unused*/) { + ParamIteratorInterface *clone = impl_->Clone(); + impl_->Advance(); + return ParamIterator(clone); + } + bool operator==(const ParamIterator &other) const { + return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_); + } + bool operator!=(const ParamIterator &other) const { + return !(*this == other); + } + + private: + friend class ParamGenerator; + explicit ParamIterator(ParamIteratorInterface *impl) : impl_(impl) {} + std::unique_ptr> impl_; +}; + +// ParamGeneratorInterface is the binary interface to access generators +// defined in other translation units. +template +class ParamGeneratorInterface { + public: + typedef T ParamType; + + virtual ~ParamGeneratorInterface() {} + + // Generator interface definition + virtual ParamIteratorInterface *Begin() const = 0; + virtual ParamIteratorInterface *End() const = 0; +}; + +// Wraps ParamGeneratorInterface and provides general generator syntax +// compatible with the STL Container concept. +// This class implements copy initialization semantics and the contained +// ParamGeneratorInterface instance is shared among all copies +// of the original object. This is possible because that instance is immutable. +template +class ParamGenerator { + public: + typedef ParamIterator iterator; + + explicit ParamGenerator(ParamGeneratorInterface *impl) : impl_(impl) {} + ParamGenerator(const ParamGenerator &other) : impl_(other.impl_) {} + + ParamGenerator &operator=(const ParamGenerator &other) { + impl_ = other.impl_; + return *this; + } + + iterator begin() const { return iterator(impl_->Begin()); } + iterator end() const { return iterator(impl_->End()); } + + private: + std::shared_ptr> impl_; +}; + +// Generates values from a range of two comparable values. Can be used to +// generate sequences of user-defined types that implement operator+() and +// operator<(). +// This class is used in the Range() function. 
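The RangeGenerator that follows precomputes its end index by stepping through the sequence once, so Range(begin, end, step) yields begin, begin+step, ... strictly below end. The counting rule in isolation, assuming int parameters:

```cpp
#include <iostream>

// Mirrors the generator's end-index computation: count how many elements
// the range produces before reaching 'end'.
int CountRangeElements(int begin, int end, int step) {
  int count = 0;
  for (int i = begin; i < end; i += step) ++count;
  return count;
}

int main() {
  std::cout << CountRangeElements(0, 10, 3) << "\n";  // 4: 0, 3, 6, 9
}
```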
+template +class RangeGenerator : public ParamGeneratorInterface { + public: + RangeGenerator(T begin, T end, IncrementT step) + : begin_(begin), end_(end), step_(step), + end_index_(CalculateEndIndex(begin, end, step)) {} + ~RangeGenerator() override {} + + ParamIteratorInterface *Begin() const override { + return new Iterator(this, begin_, 0, step_); + } + ParamIteratorInterface *End() const override { + return new Iterator(this, end_, end_index_, step_); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface *base, T value, int index, + IncrementT step) + : base_(base), value_(value), index_(index), step_(step) {} + ~Iterator() override {} + + const ParamGeneratorInterface *BaseGenerator() const override { + return base_; + } + void Advance() override { + value_ = static_cast(value_ + step_); + index_++; + } + ParamIteratorInterface *Clone() const override { + return new Iterator(*this); + } + const T *Current() const override { return &value_; } + bool Equals(const ParamIteratorInterface &other) const override { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const int other_index = + CheckedDowncastToActualType(&other)->index_; + return index_ == other_index; + } + + private: + Iterator(const Iterator &other) + : ParamIteratorInterface(), base_(other.base_), value_(other.value_), + index_(other.index_), step_(other.step_) {} + + // No implementation - assignment is unsupported. + void operator=(const Iterator &other); + + const ParamGeneratorInterface *const base_; + T value_; + int index_; + const IncrementT step_; + }; // class RangeGenerator::Iterator + + static int CalculateEndIndex(const T &begin, const T &end, + const IncrementT &step) { + int end_index = 0; + for (T i = begin; i < end; i = static_cast(i + step)) end_index++; + return end_index; + } + + // No implementation - assignment is unsupported. + void operator=(const RangeGenerator &other); + + const T begin_; + const T end_; + const IncrementT step_; + // The index for the end() iterator. All the elements in the generated + // sequence are indexed (0-based) to aid iterator comparison. + const int end_index_; +}; // class RangeGenerator + +// Generates values from a pair of STL-style iterators. Used in the +// ValuesIn() function. The elements are copied from the source range +// since the source can be located on the stack, and the generator +// is likely to persist beyond that stack frame. 
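The generator defined next copies the elements out of the iterator range precisely because the source may be a stack-local container that dies before the tests run. The same idea in miniature (plain vector copy, not the gtest class):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// The stack array goes out of scope here, but the returned copy is safe
// to iterate later -- the property ValuesIn() relies on.
std::vector<int> MakeGeneratorBackingStore() {
  int values[] = {2, 3, 5};  // stack storage, like a ValuesIn() argument
  return std::vector<int>(values, values + 3);
}

int main() {
  const std::vector<int> params = MakeGeneratorBackingStore();
  for (size_t i = 0; i < params.size(); ++i) std::cout << params[i] << "\n";
}
```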
+template <typename T>
+class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+  template <typename ForwardIterator>
+  ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end)
+      : container_(begin, end) {}
+  ~ValuesInIteratorRangeGenerator() override {}
+
+  ParamIteratorInterface<T> *Begin() const override {
+    return new Iterator(this, container_.begin());
+  }
+  ParamIteratorInterface<T> *End() const override {
+    return new Iterator(this, container_.end());
+  }
+
+ private:
+  typedef typename ::std::vector<T> ContainerType;
+
+  class Iterator : public ParamIteratorInterface<T> {
+   public:
+    Iterator(const ParamGeneratorInterface<T> *base,
+             typename ContainerType::const_iterator iterator)
+        : base_(base), iterator_(iterator) {}
+    ~Iterator() override {}
+
+    const ParamGeneratorInterface<T> *BaseGenerator() const override {
+      return base_;
+    }
+    void Advance() override {
+      ++iterator_;
+      value_.reset();
+    }
+    ParamIteratorInterface<T> *Clone() const override {
+      return new Iterator(*this);
+    }
+    // We need to use the cached value referenced by iterator_ because
+    // *iterator_ can return a temporary object (and of a type other than T),
+    // so just having "return &*iterator_;" doesn't work.
+    // value_ is updated here and not in Advance() because Advance()
+    // can advance iterator_ beyond the end of the range, and we cannot
+    // detect that fact. The client code, on the other hand, is
+    // responsible for not calling Current() on an out-of-range iterator.
+    const T *Current() const override {
+      if (value_.get() == nullptr) value_.reset(new T(*iterator_));
+      return value_.get();
+    }
+    bool Equals(const ParamIteratorInterface<T> &other) const override {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      return iterator_ ==
+             CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
+    }
+
+   private:
+    Iterator(const Iterator &other)
+        // The explicit constructor call suppresses a false warning
+        // emitted by gcc when supplied with the -Wextra option.
+        : ParamIteratorInterface<T>(), base_(other.base_),
+          iterator_(other.iterator_) {}
+
+    const ParamGeneratorInterface<T> *const base_;
+    typename ContainerType::const_iterator iterator_;
+    // A cached value of *iterator_. We keep it here to allow access by
+    // pointer in the wrapping iterator's operator->().
+    // value_ needs to be mutable to be accessed in Current().
+    // Use of std::unique_ptr helps manage the cached value's lifetime,
+    // which is bound by the lifespan of the iterator itself.
+    mutable std::unique_ptr<const T> value_;
+  };  // class ValuesInIteratorRangeGenerator::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const ValuesInIteratorRangeGenerator &other);
+
+  const ContainerType container_;
+};  // class ValuesInIteratorRangeGenerator
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Default parameterized test name generator, returns a string containing the
+// integer test parameter index.
+template <class ParamType>
+std::string DefaultParamName(const TestParamInfo<ParamType> &info) {
+  Message name_stream;
+  name_stream << info.index;
+  return name_stream.GetString();
+}
+
+template <typename T = void>
+void TestNotEmpty() {
+  static_assert(sizeof(T) == 0, "Empty arguments are not allowed.");
+}
+template <typename T = void>
+void TestNotEmpty(const T &) {}
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Stores a parameter value and later creates tests parameterized with that
+// value.
+template <class TestClass>
+class ParameterizedTestFactory : public TestFactoryBase {
+ public:
+  typedef typename TestClass::ParamType ParamType;
+  explicit ParameterizedTestFactory(ParamType parameter)
+      : parameter_(parameter) {}
+  Test *CreateTest() override {
+    TestClass::SetParam(&parameter_);
+    return new TestClass();
+  }
+
+ private:
+  const ParamType parameter_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactoryBase is a base class for meta-factories that create
+// test factories for passing into the MakeAndRegisterTestInfo function.
+template <class ParamType>
+class TestMetaFactoryBase {
+ public:
+  virtual ~TestMetaFactoryBase() {}
+
+  virtual TestFactoryBase *CreateTestFactory(ParamType parameter) = 0;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactory creates test factories for passing into the
+// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives
+// ownership of the test factory pointer, the same factory object cannot be
+// passed into that method twice. But ParameterizedTestSuiteInfo is going to
+// call it for each Test/Parameter value combination. Thus it needs a meta
+// factory creator class.
+template <class TestSuite>
+class TestMetaFactory
+    : public TestMetaFactoryBase<typename TestSuite::ParamType> {
+ public:
+  using ParamType = typename TestSuite::ParamType;
+
+  TestMetaFactory() {}
+
+  TestFactoryBase *CreateTestFactory(ParamType parameter) override {
+    return new ParameterizedTestFactory<TestSuite>(parameter);
+  }
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestSuiteInfoBase is a generic interface
+// to ParameterizedTestSuiteInfo classes. ParameterizedTestSuiteInfoBase
+// accumulates test information provided by TEST_P macro invocations
+// and generators provided by INSTANTIATE_TEST_SUITE_P macro invocations
+// and uses that information to register all resulting test instances
+// in the RegisterTests method. The ParameterizedTestSuiteRegistry class holds
+// a collection of pointers to the ParameterizedTestSuiteInfo objects
+// and calls RegisterTests() on each of them when asked.
+class ParameterizedTestSuiteInfoBase {
+ public:
+  virtual ~ParameterizedTestSuiteInfoBase() {}
+
+  // Base part of test suite name for display purposes.
+  virtual const std::string &GetTestSuiteName() const = 0;
+  // Test suite id to verify identity.
+  virtual TypeId GetTestSuiteTypeId() const = 0;
+  // UnitTest class invokes this method to register tests in this
+  // test suite right before running them in RUN_ALL_TESTS macro.
+  // This method should not be called more than once on any single
+  // instance of a ParameterizedTestSuiteInfoBase derived class.
+  virtual void RegisterTests() = 0;
+
+ protected:
+  ParameterizedTestSuiteInfoBase() {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfoBase);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Reports the name of a test suite as safe to ignore
+// as the side effect of construction of this type.
+struct MarkAsIgnored {
+  explicit MarkAsIgnored(const char *test_suite);
+};
+
+GTEST_API_ void InsertSyntheticTestCase(const std::string &name,
+                                        CodeLocation location, bool has_test_p);
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestSuiteInfo accumulates tests obtained from TEST_P
+// macro invocations for a particular test suite and generators
+// obtained from INSTANTIATE_TEST_SUITE_P macro invocations for that
+// test suite. It registers tests with all values generated by all
+// generators when asked.
+template <class TestSuite>
+class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
+ public:
+  // ParamType and GeneratorCreationFunc are private types but are required
+  // for declarations of public methods AddTestPattern() and
+  // AddTestSuiteInstantiation().
+  using ParamType = typename TestSuite::ParamType;
+  // A function that returns an instance of the appropriate generator type.
+  typedef ParamGenerator<ParamType>(GeneratorCreationFunc)();
+  using ParamNameGeneratorFunc = std::string(const TestParamInfo<ParamType> &);
+
+  explicit ParameterizedTestSuiteInfo(const char *name,
+                                      CodeLocation code_location)
+      : test_suite_name_(name), code_location_(code_location) {}
+
+  // Test suite base name for display purposes.
+  const std::string &GetTestSuiteName() const override {
+    return test_suite_name_;
+  }
+  // Test suite id to verify identity.
+  TypeId GetTestSuiteTypeId() const override { return GetTypeId<TestSuite>(); }
+  // TEST_P macro uses AddTestPattern() to record information
+  // about a single test in a LocalTestInfo structure.
+  // test_suite_name is the base name of the test suite (without invocation
+  // prefix). test_base_name is the name of an individual test without
+  // parameter index. For the test SequenceA/FooTest.DoBar/1, FooTest is
+  // the test suite base name and DoBar is the test base name.
+  void AddTestPattern(const char *test_suite_name, const char *test_base_name,
+                      TestMetaFactoryBase<ParamType> *meta_factory) {
+    tests_.push_back(std::shared_ptr<TestInfo>(
+        new TestInfo(test_suite_name, test_base_name, meta_factory)));
+  }
+  // INSTANTIATE_TEST_SUITE_P macro uses AddTestSuiteInstantiation() to record
+  // information about a generator.
+  int AddTestSuiteInstantiation(const std::string &instantiation_name,
+                                GeneratorCreationFunc *func,
+                                ParamNameGeneratorFunc *name_func,
+                                const char *file, int line) {
+    instantiations_.push_back(
+        InstantiationInfo(instantiation_name, func, name_func, file, line));
+    return 0;  // Return value used only to run this method in namespace scope.
+  }
+  // UnitTest class invokes this method to register tests in this test suite
+  // right before running tests in RUN_ALL_TESTS macro.
+  // This method should not be called more than once on any single
+  // instance of a ParameterizedTestSuiteInfoBase derived class.
+  // UnitTest has a guard to prevent from calling this method more than once.
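+  //
+  // For illustration (a sketch; FooTest is a hypothetical TEST_P fixture):
+  // given
+  //
+  //   TEST_P(FooTest, DoBar) { ... }
+  //   INSTANTIATE_TEST_SUITE_P(SequenceA, FooTest, testing::Values(1, 2));
+  //
+  // RegisterTests() registers SequenceA/FooTest.DoBar/0 and
+  // SequenceA/FooTest.DoBar/1; the default name generator appends the
+  // zero-based parameter index.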
+  void RegisterTests() override {
+    bool generated_instantiations = false;
+
+    for (typename TestInfoContainer::iterator test_it = tests_.begin();
+         test_it != tests_.end(); ++test_it) {
+      std::shared_ptr<TestInfo> test_info = *test_it;
+      for (typename InstantiationContainer::iterator gen_it =
+               instantiations_.begin();
+           gen_it != instantiations_.end(); ++gen_it) {
+        const std::string &instantiation_name = gen_it->name;
+        ParamGenerator<ParamType> generator((*gen_it->generator)());
+        ParamNameGeneratorFunc *name_func = gen_it->name_func;
+        const char *file = gen_it->file;
+        int line = gen_it->line;
+
+        std::string test_suite_name;
+        if (!instantiation_name.empty())
+          test_suite_name = instantiation_name + "/";
+        test_suite_name += test_info->test_suite_base_name;
+
+        size_t i = 0;
+        std::set<std::string> test_param_names;
+        for (typename ParamGenerator<ParamType>::iterator param_it =
+                 generator.begin();
+             param_it != generator.end(); ++param_it, ++i) {
+          generated_instantiations = true;
+
+          Message test_name_stream;
+
+          std::string param_name =
+              name_func(TestParamInfo<ParamType>(*param_it, i));
+
+          GTEST_CHECK_(IsValidParamName(param_name))
+              << "Parameterized test name '" << param_name
+              << "' is invalid, in " << file << " line " << line << std::endl;
+
+          GTEST_CHECK_(test_param_names.count(param_name) == 0)
+              << "Duplicate parameterized test name '" << param_name
+              << "', in " << file << " line " << line << std::endl;
+
+          test_param_names.insert(param_name);
+
+          if (!test_info->test_base_name.empty()) {
+            test_name_stream << test_info->test_base_name << "/";
+          }
+          test_name_stream << param_name;
+          MakeAndRegisterTestInfo(
+              test_suite_name.c_str(), test_name_stream.GetString().c_str(),
+              nullptr,  // No type parameter.
+              PrintToString(*param_it).c_str(), code_location_,
+              GetTestSuiteTypeId(),
+              SuiteApiResolver<TestSuite>::GetSetUpCaseOrSuite(file, line),
+              SuiteApiResolver<TestSuite>::GetTearDownCaseOrSuite(file, line),
+              test_info->test_meta_factory->CreateTestFactory(*param_it));
+        }  // for param_it
+      }    // for gen_it
+    }      // for test_it
+
+    if (!generated_instantiations) {
+      // There are no generators, or they all generate nothing ...
+      InsertSyntheticTestCase(GetTestSuiteName(), code_location_,
+                              !tests_.empty());
+    }
+  }  // RegisterTests
+
+ private:
+  // LocalTestInfo structure keeps information about a single test registered
+  // with TEST_P macro.
+ struct TestInfo { + TestInfo(const char *a_test_suite_base_name, const char *a_test_base_name, + TestMetaFactoryBase *a_test_meta_factory) + : test_suite_base_name(a_test_suite_base_name), + test_base_name(a_test_base_name), + test_meta_factory(a_test_meta_factory) {} + + const std::string test_suite_base_name; + const std::string test_base_name; + const std::unique_ptr> test_meta_factory; + }; + using TestInfoContainer = ::std::vector>; + // Records data received from INSTANTIATE_TEST_SUITE_P macros: + // + struct InstantiationInfo { + InstantiationInfo(const std::string &name_in, + GeneratorCreationFunc *generator_in, + ParamNameGeneratorFunc *name_func_in, const char *file_in, + int line_in) + : name(name_in), generator(generator_in), name_func(name_func_in), + file(file_in), line(line_in) {} + + std::string name; + GeneratorCreationFunc *generator; + ParamNameGeneratorFunc *name_func; + const char *file; + int line; + }; + typedef ::std::vector InstantiationContainer; + + static bool IsValidParamName(const std::string &name) { + // Check for empty string + if (name.empty()) return false; + + // Check for invalid characters + for (std::string::size_type index = 0; index < name.size(); ++index) { + if (!isalnum(name[index]) && name[index] != '_') return false; + } + + return true; + } + + const std::string test_suite_name_; + CodeLocation code_location_; + TestInfoContainer tests_; + InstantiationContainer instantiations_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfo); +}; // class ParameterizedTestSuiteInfo + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +template +using ParameterizedTestCaseInfo = ParameterizedTestSuiteInfo; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// ParameterizedTestSuiteRegistry contains a map of +// ParameterizedTestSuiteInfoBase classes accessed by test suite names. TEST_P +// and INSTANTIATE_TEST_SUITE_P macros use it to locate their corresponding +// ParameterizedTestSuiteInfo descriptors. +class ParameterizedTestSuiteRegistry { + public: + ParameterizedTestSuiteRegistry() {} + ~ParameterizedTestSuiteRegistry() { + for (auto &test_suite_info : test_suite_infos_) { + delete test_suite_info; + } + } + + // Looks up or creates and returns a structure containing information about + // tests and instantiations of a particular test suite. + template + ParameterizedTestSuiteInfo *GetTestSuitePatternHolder( + const char *test_suite_name, CodeLocation code_location) { + ParameterizedTestSuiteInfo *typed_test_info = nullptr; + for (auto &test_suite_info : test_suite_infos_) { + if (test_suite_info->GetTestSuiteName() == test_suite_name) { + if (test_suite_info->GetTestSuiteTypeId() != GetTypeId()) { + // Complain about incorrect usage of Google Test facilities + // and terminate the program since we cannot guaranty correct + // test suite setup and tear-down in this case. + ReportInvalidTestSuiteType(test_suite_name, code_location); + posix::Abort(); + } else { + // At this point we are sure that the object we found is of the same + // type we are looking for, so we downcast it to that type + // without further checks. 
+          typed_test_info = CheckedDowncastToActualType<
+              ParameterizedTestSuiteInfo<TestSuite>>(test_suite_info);
+        }
+        break;
+      }
+    }
+    if (typed_test_info == nullptr) {
+      typed_test_info = new ParameterizedTestSuiteInfo<TestSuite>(
+          test_suite_name, code_location);
+      test_suite_infos_.push_back(typed_test_info);
+    }
+    return typed_test_info;
+  }
+  void RegisterTests() {
+    for (auto &test_suite_info : test_suite_infos_) {
+      test_suite_info->RegisterTests();
+    }
+  }
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  template <class TestCase>
+  ParameterizedTestCaseInfo<TestCase> *GetTestCasePatternHolder(
+      const char *test_case_name, CodeLocation code_location) {
+    return GetTestSuitePatternHolder<TestCase>(test_case_name, code_location);
+  }
+
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ private:
+  using TestSuiteInfoContainer =
+      ::std::vector<ParameterizedTestSuiteInfoBase *>;
+
+  TestSuiteInfoContainer test_suite_infos_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteRegistry);
+};
+
+// Keeps track of which type-parameterized test suites are defined and
+// where, as well as which are instantiated. This allows subsequently
+// identifying suites that are defined but never used.
+class TypeParameterizedTestSuiteRegistry {
+ public:
+  // Add a suite definition.
+  void RegisterTestSuite(const char *test_suite_name,
+                         CodeLocation code_location);
+
+  // Add an instantiation of a suite.
+  void RegisterInstantiation(const char *test_suite_name);
+
+  // For each suite reported as defined but not reported as instantiated,
+  // emit a test that reports that fact (configurably, as an error).
+  void CheckForInstantiations();
+
+ private:
+  struct TypeParameterizedTestSuiteInfo {
+    explicit TypeParameterizedTestSuiteInfo(CodeLocation c)
+        : code_location(c), instantiated(false) {}
+
+    CodeLocation code_location;
+    bool instantiated;
+  };
+
+  std::map<std::string, TypeParameterizedTestSuiteInfo> suites_;
+};
+
+}  // namespace internal
+
+// Forward declaration of ValuesIn(), which is implemented in
+// include/gtest/gtest-param-test.h.
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+    const Container &container);
+
+namespace internal {
+// Used in the Values() function to provide polymorphic capabilities.
+
+template <typename... Ts>
+class ValueArray {
+ public:
+  ValueArray(Ts... v) : v_{ std::move(v)... } {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // NOLINT
+    return ValuesIn(MakeVector<T>(MakeIndexSequence<sizeof...(Ts)>()));
+  }
+
+ private:
+  template <typename T, size_t... I>
+  std::vector<T> MakeVector(IndexSequence<I...>) const {
+    return std::vector<T>{ static_cast<T>(v_.template Get<I>())... };
+  }
+
+  FlatTuple<Ts...> v_;
+};
+
+template <typename... T>
+class CartesianProductGenerator
+    : public ParamGeneratorInterface<::std::tuple<T...>> {
+ public:
+  typedef ::std::tuple<T...> ParamType;
+
+  CartesianProductGenerator(const std::tuple<ParamGenerator<T>...> &g)
+      : generators_(g) {}
+  ~CartesianProductGenerator() override {}
+
+  ParamIteratorInterface<ParamType> *Begin() const override {
+    return new Iterator(this, generators_, false);
+  }
+  ParamIteratorInterface<ParamType> *End() const override {
+    return new Iterator(this, generators_, true);
+  }
+
+ private:
+  template <class I>
+  class IteratorImpl;
+  template <size_t... I>
+  class IteratorImpl<IndexSequence<I...>>
+      : public ParamIteratorInterface<ParamType> {
+   public:
+    IteratorImpl(const ParamGeneratorInterface<ParamType> *base,
+                 const std::tuple<ParamGenerator<T>...> &generators,
+                 bool is_end)
+        : base_(base), begin_(std::get<I>(generators).begin()...),
+          end_(std::get<I>(generators).end()...),
+          current_(is_end ?
end_ : begin_) { + ComputeCurrentValue(); + } + ~IteratorImpl() override {} + + const ParamGeneratorInterface *BaseGenerator() const override { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + void Advance() override { + assert(!AtEnd()); + // Advance the last iterator. + ++std::get(current_); + // if that reaches end, propagate that up. + AdvanceIfEnd(); + ComputeCurrentValue(); + } + ParamIteratorInterface *Clone() const override { + return new IteratorImpl(*this); + } + + const ParamType *Current() const override { return current_value_.get(); } + + bool Equals(const ParamIteratorInterface &other) const override { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const IteratorImpl *typed_other = + CheckedDowncastToActualType(&other); + + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + if (AtEnd() && typed_other->AtEnd()) return true; + + bool same = true; + bool dummy[] = { (same = same && + std::get(current_) == + std::get(typed_other->current_))... }; + (void)dummy; + return same; + } + + private: + template + void AdvanceIfEnd() { + if (std::get(current_) != std::get(end_)) return; + + bool last = ThisI == 0; + if (last) { + // We are done. Nothing else to propagate. + return; + } + + constexpr size_t NextI = ThisI - (ThisI != 0); + std::get(current_) = std::get(begin_); + ++std::get(current_); + AdvanceIfEnd(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = std::make_shared(*std::get(current_)...); + } + bool AtEnd() const { + bool at_end = false; + bool dummy[] = { (at_end = at_end || std::get(current_) == + std::get(end_))... }; + (void)dummy; + return at_end; + } + + const ParamGeneratorInterface *const base_; + std::tuple::iterator...> begin_; + std::tuple::iterator...> end_; + std::tuple::iterator...> current_; + std::shared_ptr current_value_; + }; + + using Iterator = IteratorImpl::type>; + + std::tuple...> generators_; +}; + +template +class CartesianProductHolder { + public: + CartesianProductHolder(const Gen &... g) : generators_(g...) {} + template + operator ParamGenerator<::std::tuple>() const { + return ParamGenerator<::std::tuple>( + new CartesianProductGenerator(generators_)); + } + + private: + std::tuple generators_; +}; + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h new file mode 100644 index 000000000..f803a19be --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h @@ -0,0 +1,111 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the GTEST_OS_* macro. +// It is separate from gtest-port.h so that custom/gtest-port.h can include it. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ + +// Determines the platform on which Google Test is compiled. +#ifdef __CYGWIN__ +#define GTEST_OS_CYGWIN 1 +#elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__) +#define GTEST_OS_WINDOWS_MINGW 1 +#define GTEST_OS_WINDOWS 1 +#elif defined _WIN32 +#define GTEST_OS_WINDOWS 1 +#ifdef _WIN32_WCE +#define GTEST_OS_WINDOWS_MOBILE 1 +#elif defined(WINAPI_FAMILY) +#include +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define GTEST_OS_WINDOWS_DESKTOP 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP) +#define GTEST_OS_WINDOWS_PHONE 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) +#define GTEST_OS_WINDOWS_RT 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE) +#define GTEST_OS_WINDOWS_PHONE 1 +#define GTEST_OS_WINDOWS_TV_TITLE 1 +#else +// WINAPI_FAMILY defined but no known partition matched. +// Default to desktop. 
+#define GTEST_OS_WINDOWS_DESKTOP 1 +#endif +#else +#define GTEST_OS_WINDOWS_DESKTOP 1 +#endif // _WIN32_WCE +#elif defined __OS2__ +#define GTEST_OS_OS2 1 +#elif defined __APPLE__ +#define GTEST_OS_MAC 1 +#if TARGET_OS_IPHONE +#define GTEST_OS_IOS 1 +#endif +#elif defined __DragonFly__ +#define GTEST_OS_DRAGONFLY 1 +#elif defined __FreeBSD__ +#define GTEST_OS_FREEBSD 1 +#elif defined __Fuchsia__ +#define GTEST_OS_FUCHSIA 1 +#elif defined(__GLIBC__) && defined(__FreeBSD_kernel__) +#define GTEST_OS_GNU_KFREEBSD 1 +#elif defined __linux__ +#define GTEST_OS_LINUX 1 +#if defined __ANDROID__ +#define GTEST_OS_LINUX_ANDROID 1 +#endif +#elif defined __MVS__ +#define GTEST_OS_ZOS 1 +#elif defined(__sun) && defined(__SVR4) +#define GTEST_OS_SOLARIS 1 +#elif defined(_AIX) +#define GTEST_OS_AIX 1 +#elif defined(__hpux) +#define GTEST_OS_HPUX 1 +#elif defined __native_client__ +#define GTEST_OS_NACL 1 +#elif defined __NetBSD__ +#define GTEST_OS_NETBSD 1 +#elif defined __OpenBSD__ +#define GTEST_OS_OPENBSD 1 +#elif defined __QNX__ +#define GTEST_OS_QNX 1 +#elif defined(__HAIKU__) +#define GTEST_OS_HAIKU 1 +#elif defined ESP8266 +#define GTEST_OS_ESP8266 1 +#elif defined ESP32 +#define GTEST_OS_ESP32 1 +#endif // __CYGWIN__ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h new file mode 100644 index 000000000..083da569f --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h @@ -0,0 +1,2232 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Low-level types and utilities for porting Google Test to various +// platforms. All macros ending with _ and symbols defined in an +// internal namespace are subject to change without notice. Code +// outside Google Test MUST NOT USE THEM DIRECTLY. 
Macros that don't +// end with _ are part of Google Test's public API and can be used by +// code outside Google Test. +// +// This file is fundamental to Google Test. All other Google Test source +// files are expected to #include this. Therefore, it cannot #include +// any other Google Test header. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ + +// Environment-describing macros +// ----------------------------- +// +// Google Test can be used in many different environments. Macros in +// this section tell Google Test what kind of environment it is being +// used in, such that Google Test can provide environment-specific +// features and implementations. +// +// Google Test tries to automatically detect the properties of its +// environment, so users usually don't need to worry about these +// macros. However, the automatic detection is not perfect. +// Sometimes it's necessary for a user to define some of the following +// macros in the build script to override Google Test's decisions. +// +// If the user doesn't define a macro in the list, Google Test will +// provide a default definition. After this header is #included, all +// macros in this list will be defined to either 1 or 0. +// +// Notes to maintainers: +// - Each macro here is a user-tweakable knob; do not grow the list +// lightly. +// - Use #if to key off these macros. Don't use #ifdef or "#if +// defined(...)", which will not work as these macros are ALWAYS +// defined. +// +// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2) +// is/isn't available. +// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions +// are enabled. +// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular +// expressions are/aren't available. +// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that +// is/isn't available. +// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't +// enabled. +// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that +// std::wstring does/doesn't work (Google Test can +// be used where std::wstring is unavailable). +// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the +// compiler supports Microsoft's "Structured +// Exception Handling". +// GTEST_HAS_STREAM_REDIRECTION +// - Define it to 1/0 to indicate whether the +// platform supports I/O stream redirection using +// dup() and dup2(). +// GTEST_LINKED_AS_SHARED_LIBRARY +// - Define to 1 when compiling tests that use +// Google Test as a shared library (known as +// DLL on Windows). +// GTEST_CREATE_SHARED_LIBRARY +// - Define to 1 when compiling Google Test itself +// as a shared library. +// GTEST_DEFAULT_DEATH_TEST_STYLE +// - The default value of --gtest_death_test_style. +// The legacy default has been "fast" in the open +// source version since 2008. The recommended value +// is "threadsafe", and can be set in +// custom/gtest-port.h. + +// Platform-indicating macros +// -------------------------- +// +// Macros indicating the platform on which Google Test is being used +// (a macro is defined to 1 if compiled on the given platform; +// otherwise UNDEFINED -- it's never defined to 0.). Google Test +// defines these macros automatically. Code outside Google Test MUST +// NOT define them. 
+// +// GTEST_OS_AIX - IBM AIX +// GTEST_OS_CYGWIN - Cygwin +// GTEST_OS_DRAGONFLY - DragonFlyBSD +// GTEST_OS_FREEBSD - FreeBSD +// GTEST_OS_FUCHSIA - Fuchsia +// GTEST_OS_GNU_KFREEBSD - GNU/kFreeBSD +// GTEST_OS_HAIKU - Haiku +// GTEST_OS_HPUX - HP-UX +// GTEST_OS_LINUX - Linux +// GTEST_OS_LINUX_ANDROID - Google Android +// GTEST_OS_MAC - Mac OS X +// GTEST_OS_IOS - iOS +// GTEST_OS_NACL - Google Native Client (NaCl) +// GTEST_OS_NETBSD - NetBSD +// GTEST_OS_OPENBSD - OpenBSD +// GTEST_OS_OS2 - OS/2 +// GTEST_OS_QNX - QNX +// GTEST_OS_SOLARIS - Sun Solaris +// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile) +// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop +// GTEST_OS_WINDOWS_MINGW - MinGW +// GTEST_OS_WINDOWS_MOBILE - Windows Mobile +// GTEST_OS_WINDOWS_PHONE - Windows Phone +// GTEST_OS_WINDOWS_RT - Windows Store App/WinRT +// GTEST_OS_ZOS - z/OS +// +// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the +// most stable support. Since core members of the Google Test project +// don't have access to other platforms, support for them may be less +// stable. If you notice any problems on your platform, please notify +// googletestframework@googlegroups.com (patches for fixing them are +// even more welcome!). +// +// It is possible that none of the GTEST_OS_* macros are defined. + +// Feature-indicating macros +// ------------------------- +// +// Macros indicating which Google Test features are available (a macro +// is defined to 1 if the corresponding feature is supported; +// otherwise UNDEFINED -- it's never defined to 0.). Google Test +// defines these macros automatically. Code outside Google Test MUST +// NOT define them. +// +// These macros are public so that portable tests can be written. +// Such tests typically surround code using a feature with an #if +// which controls that code. For example: +// +// #if GTEST_HAS_DEATH_TEST +// EXPECT_DEATH(DoSomethingDeadly()); +// #endif +// +// GTEST_HAS_DEATH_TEST - death tests +// GTEST_HAS_TYPED_TEST - typed tests +// GTEST_HAS_TYPED_TEST_P - type-parameterized tests +// GTEST_IS_THREADSAFE - Google Test is thread-safe. +// GOOGLETEST_CM0007 DO NOT DELETE +// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with +// GTEST_HAS_POSIX_RE (see above) which users can +// define themselves. +// GTEST_USES_SIMPLE_RE - our own simple regex is used; +// the above RE\b(s) are mutually exclusive. + +// Misc public macros +// ------------------ +// +// GTEST_FLAG(flag_name) - references the variable corresponding to +// the given Google Test flag. + +// Internal utilities +// ------------------ +// +// The following macros and utilities are for Google Test's INTERNAL +// use only. Code outside Google Test MUST NOT USE THEM DIRECTLY. +// +// Macros for basic C++ coding: +// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. +// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a +// variable don't have to be used. +// GTEST_DISALLOW_ASSIGN_ - disables copy operator=. +// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=. +// GTEST_DISALLOW_MOVE_ASSIGN_ - disables move operator=. +// GTEST_DISALLOW_MOVE_AND_ASSIGN_ - disables move ctor and operator=. +// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. +// GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is +// suppressed (constant conditional). +// GTEST_INTENTIONAL_CONST_COND_POP_ - finish code section where MSVC C4127 +// is suppressed. 
+// GTEST_INTERNAL_HAS_STRING_VIEW - for enabling Matcher or +// Matcher +// specializations. +// +// Synchronization: +// Mutex, MutexLock, ThreadLocal, GetThreadCount() +// - synchronization primitives. +// +// Regular expressions: +// RE - a simple regular expression class using the POSIX +// Extended Regular Expression syntax on UNIX-like platforms +// GOOGLETEST_CM0008 DO NOT DELETE +// or a reduced regular exception syntax on other +// platforms, including Windows. +// Logging: +// GTEST_LOG_() - logs messages at the specified severity level. +// LogToStderr() - directs all log messages to stderr. +// FlushInfoLog() - flushes informational log messages. +// +// Stdout and stderr capturing: +// CaptureStdout() - starts capturing stdout. +// GetCapturedStdout() - stops capturing stdout and returns the captured +// string. +// CaptureStderr() - starts capturing stderr. +// GetCapturedStderr() - stops capturing stderr and returns the captured +// string. +// +// Integer types: +// TypeWithSize - maps an integer to a int type. +// TimeInMillis - integers of known sizes. +// BiggestInt - the biggest signed integer type. +// +// Command-line utilities: +// GTEST_DECLARE_*() - declares a flag. +// GTEST_DEFINE_*() - defines a flag. +// GetInjectableArgvs() - returns the command line as a vector of strings. +// +// Environment variable utilities: +// GetEnv() - gets the value of an environment variable. +// BoolFromGTestEnv() - parses a bool environment variable. +// Int32FromGTestEnv() - parses an int32_t environment variable. +// StringFromGTestEnv() - parses a string environment variable. +// +// Deprecation warnings: +// GTEST_INTERNAL_DEPRECATED(message) - attribute marking a function as +// deprecated; calling a marked function +// should generate a compiler warning + +#include // for isspace, etc +#include // for ptrdiff_t +#include +#include +#include +#include +#include +#include + +#ifndef _WIN32_WCE +#include +#include +#endif // !_WIN32_WCE + +#if defined __APPLE__ +#include +#include +#endif + +#include // NOLINT +#include +#include // NOLINT +#include +#include // NOLINT + +#include "gtest/internal/custom/gtest-port.h" +#include "gtest/internal/gtest-port-arch.h" + +#if !defined(GTEST_DEV_EMAIL_) +#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" +#define GTEST_FLAG_PREFIX_ "gtest_" +#define GTEST_FLAG_PREFIX_DASH_ "gtest-" +#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" +#define GTEST_NAME_ "Google Test" +#define GTEST_PROJECT_URL_ "https://github.com/google/googletest/" +#endif // !defined(GTEST_DEV_EMAIL_) + +#if !defined(GTEST_INIT_GOOGLE_TEST_NAME_) +#define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest" +#endif // !defined(GTEST_INIT_GOOGLE_TEST_NAME_) + +// Determines the version of gcc that is used to compile this. +#ifdef __GNUC__ +// 40302 means version 4.3.2. +#define GTEST_GCC_VER_ \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#endif // __GNUC__ + +// Macros for disabling Microsoft Visual C++ warnings. 
+// +// GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385) +// /* code that triggers warnings C4800 and C4385 */ +// GTEST_DISABLE_MSC_WARNINGS_POP_() +#if defined(_MSC_VER) +#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \ + __pragma(warning(push)) __pragma(warning(disable : warnings)) +#define GTEST_DISABLE_MSC_WARNINGS_POP_() __pragma(warning(pop)) +#else +// Not all compilers are MSVC +#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) +#define GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + +// Clang on Windows does not understand MSVC's pragma warning. +// We need clang-specific way to disable function deprecation warning. +#ifdef __clang__ +#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"") +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() _Pragma("clang diagnostic pop") +#else +#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + +// Brings in definitions for functions used in the testing::internal::posix +// namespace (read, write, close, chdir, isatty, stat). We do not currently +// use them on Windows Mobile. +#if GTEST_OS_WINDOWS +#if !GTEST_OS_WINDOWS_MOBILE +#include +#include +#endif +// In order to avoid having to include , use forward declaration +#if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR) +// MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two +// separate (equivalent) structs, instead of using typedef +typedef struct _CRITICAL_SECTION GTEST_CRITICAL_SECTION; +#else +// Assume CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION. +// This assumption is verified by +// WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION. +typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; +#endif +#else +// This assumes that non-Windows OSes provide unistd.h. For OSes where this +// is not the case, we need to include headers that provide the functions +// mentioned above. +#include +#include +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_LINUX_ANDROID +// Used to define __ANDROID_API__ matching the target NDK API level. +#include // NOLINT +#endif + +// Defines this to true if and only if Google Test can use POSIX regular +// expressions. +#ifndef GTEST_HAS_POSIX_RE +#if GTEST_OS_LINUX_ANDROID +// On Android, is only available starting with Gingerbread. +#define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) +#else +#define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS) +#endif +#endif + +#if GTEST_USES_PCRE +// The appropriate headers have already been included. + +#elif GTEST_HAS_POSIX_RE + +// On some platforms, needs someone to define size_t, and +// won't compile otherwise. We can #include it here as we already +// included , which is guaranteed to define size_t through +// . +#include // NOLINT + +#define GTEST_USES_POSIX_RE 1 + +#elif GTEST_OS_WINDOWS + +// is not available on Windows. Use our own simple regex +// implementation instead. +#define GTEST_USES_SIMPLE_RE 1 + +#else + +// may not be available on this platform. Use our own +// simple regex implementation instead. +#define GTEST_USES_SIMPLE_RE 1 + +#endif // GTEST_USES_PCRE + +#ifndef GTEST_HAS_EXCEPTIONS +// The user didn't tell us whether exceptions are enabled, so we need +// to figure it out. 
+#if defined(_MSC_VER) && defined(_CPPUNWIND)
+// MSVC defines _CPPUNWIND to 1 if and only if exceptions are enabled.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__BORLANDC__)
+// C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS
+// macro to enable exceptions, so we'll do the same.
+// Assumes that exceptions are enabled by default.
+#ifndef _HAS_EXCEPTIONS
+#define _HAS_EXCEPTIONS 1
+#endif  // _HAS_EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+#elif defined(__clang__)
+// clang defines __EXCEPTIONS if and only if exceptions are enabled before
+// clang r220714, but if and only if cleanups are enabled after that. In
+// Obj-C++ files, there can be cleanups for ObjC exceptions which also need
+// cleanups, even if C++ exceptions are disabled. clang has
+// __has_feature(cxx_exceptions) which checks for C++ exceptions starting at
+// clang r206352, but which checked for cleanups prior to that. To reliably
+// check for C++ exception availability with clang, check for
+// __EXCEPTIONS && __has_feature(cxx_exceptions).
+#define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
+#elif defined(__GNUC__) && __EXCEPTIONS
+// gcc defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__SUNPRO_CC)
+// Sun Pro CC supports exceptions. However, there is no compile-time way of
+// detecting whether they are enabled or not. Therefore, we assume that
+// they are enabled unless the user tells us otherwise.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__IBMCPP__) && __EXCEPTIONS
+// xlC defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__HP_aCC)
+// Exception handling is in effect by default in the HP aCC compiler. It has
+// to be turned off by the +noeh compiler option if desired.
+#define GTEST_HAS_EXCEPTIONS 1
+#else
+// For other compilers, we assume exceptions are disabled to be
+// conservative.
+#define GTEST_HAS_EXCEPTIONS 0
+#endif  // defined(_MSC_VER) || defined(__BORLANDC__)
+#endif  // GTEST_HAS_EXCEPTIONS
+
+#ifndef GTEST_HAS_STD_WSTRING
+// The user didn't tell us whether ::std::wstring is available, so we need
+// to figure it out.
+// Cygwin 1.7 and below doesn't support ::std::wstring.
+// Solaris' libc++ doesn't support it either. Android has
+// no support for it at least as recent as Froyo (2.2).
+#define GTEST_HAS_STD_WSTRING                                         \
+  (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
+     GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266))
+
+#endif  // GTEST_HAS_STD_WSTRING
+
+// Determines whether RTTI is available.
+#ifndef GTEST_HAS_RTTI
+// The user didn't tell us whether RTTI is enabled, so we need to
+// figure it out.
+
+#ifdef _MSC_VER
+
+#ifdef _CPPRTTI  // MSVC defines this macro if and only if RTTI is enabled.
+#define GTEST_HAS_RTTI 1
+#else
+#define GTEST_HAS_RTTI 0
+#endif
+
+// Starting with version 4.3.2, gcc defines __GXX_RTTI if and only if RTTI
+// is enabled.
+#elif defined(__GNUC__)
+
+#ifdef __GXX_RTTI
+// When building against STLport with the Android NDK and with
+// -frtti -fno-exceptions, the build fails at link time with undefined
+// references to __cxa_bad_typeid. Not sure if this is an STL or toolchain
+// bug, so disable RTTI when detected.
+#if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && !defined(__EXCEPTIONS) +#define GTEST_HAS_RTTI 0 +#else +#define GTEST_HAS_RTTI 1 +#endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS +#else +#define GTEST_HAS_RTTI 0 +#endif // __GXX_RTTI + +// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends +// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the +// first version with C++ support. +#elif defined(__clang__) + +#define GTEST_HAS_RTTI __has_feature(cxx_rtti) + +// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if +// both the typeid and dynamic_cast features are present. +#elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) + +#ifdef __RTTI_ALL__ +#define GTEST_HAS_RTTI 1 +#else +#define GTEST_HAS_RTTI 0 +#endif + +#else + +// For all other compilers, we assume RTTI is enabled. +#define GTEST_HAS_RTTI 1 + +#endif // _MSC_VER + +#endif // GTEST_HAS_RTTI + +// It's this header's responsibility to #include when RTTI +// is enabled. +#if GTEST_HAS_RTTI +#include +#endif + +// Determines whether Google Test can use the pthreads library. +#ifndef GTEST_HAS_PTHREAD +// The user didn't tell us explicitly, so we make reasonable assumptions about +// which platforms have pthreads support. +// +// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0 +// to your compiler flags. +#define GTEST_HAS_PTHREAD \ + (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \ + GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \ + GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_OPENBSD || \ + GTEST_OS_HAIKU) +#endif // GTEST_HAS_PTHREAD + +#if GTEST_HAS_PTHREAD +// gtest-port.h guarantees to #include when GTEST_HAS_PTHREAD is +// true. +#include // NOLINT + +// For timespec and nanosleep, used below. +#include // NOLINT +#endif + +// Determines whether clone(2) is supported. +// Usually it will only be available on Linux, excluding +// Linux on the Itanium architecture. +// Also see http://linux.die.net/man/2/clone. +#ifndef GTEST_HAS_CLONE +// The user didn't tell us, so we need to figure it out. + +#if GTEST_OS_LINUX && !defined(__ia64__) +#if GTEST_OS_LINUX_ANDROID +// On Android, clone() became available at different API levels for each 32-bit +// architecture. +#if defined(__LP64__) || (defined(__arm__) && __ANDROID_API__ >= 9) || \ + (defined(__mips__) && __ANDROID_API__ >= 12) || \ + (defined(__i386__) && __ANDROID_API__ >= 17) +#define GTEST_HAS_CLONE 1 +#else +#define GTEST_HAS_CLONE 0 +#endif +#else +#define GTEST_HAS_CLONE 1 +#endif +#else +#define GTEST_HAS_CLONE 0 +#endif // GTEST_OS_LINUX && !defined(__ia64__) + +#endif // GTEST_HAS_CLONE + +// Determines whether to support stream redirection. This is used to test +// output correctness and to implement death tests. +#ifndef GTEST_HAS_STREAM_REDIRECTION +// By default, we assume that stream redirection is supported on all +// platforms except known mobile ones. +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 +#define GTEST_HAS_STREAM_REDIRECTION 0 +#else +#define GTEST_HAS_STREAM_REDIRECTION 1 +#endif // !GTEST_OS_WINDOWS_MOBILE +#endif // GTEST_HAS_STREAM_REDIRECTION + +// Determines whether to support death tests. +// pops up a dialog window that cannot be suppressed programmatically. 
+#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ + (GTEST_OS_MAC && !GTEST_OS_IOS) || \ + (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER) || GTEST_OS_WINDOWS_MINGW || \ + GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \ + GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \ + GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU) +#define GTEST_HAS_DEATH_TEST 1 +#endif + +// Determines whether to support type-driven tests. + +// Typed tests need and variadic macros, which GCC, VC++ 8.0, +// Sun Pro CC, IBM Visual Age, and HP aCC support. +#if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \ + defined(__IBMCPP__) || defined(__HP_aCC) +#define GTEST_HAS_TYPED_TEST 1 +#define GTEST_HAS_TYPED_TEST_P 1 +#endif + +// Determines whether the system compiler uses UTF-16 for encoding wide strings. +#define GTEST_WIDE_STRING_USES_UTF16_ \ + (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_AIX || GTEST_OS_OS2) + +// Determines whether test results can be streamed to a socket. +#if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \ + GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD +#define GTEST_CAN_STREAM_RESULTS_ 1 +#endif + +// Defines some utility macros. + +// The GNU compiler emits a warning if nested "if" statements are followed by +// an "else" statement and braces are not used to explicitly disambiguate the +// "else" binding. This leads to problems with code like: +// +// if (gate) +// ASSERT_*(condition) << "Some message"; +// +// The "switch (0) case 0:" idiom is used to suppress this. +#ifdef __INTEL_COMPILER +#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ +#else +#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + switch (0) \ + case 0: \ + default: // NOLINT +#endif + +// Use this annotation at the end of a struct/class definition to +// prevent the compiler from optimizing away instances that are never +// used. This is useful when all interesting logic happens inside the +// c'tor and / or d'tor. Example: +// +// struct Foo { +// Foo() { ... } +// } GTEST_ATTRIBUTE_UNUSED_; +// +// Also use it after a variable or parameter declaration to tell the +// compiler the variable/parameter does not have to be used. +#if defined(__GNUC__) && !defined(COMPILER_ICC) +#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused)) +#elif defined(__clang__) +#if __has_attribute(unused) +#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused)) +#endif +#endif +#ifndef GTEST_ATTRIBUTE_UNUSED_ +#define GTEST_ATTRIBUTE_UNUSED_ +#endif + +// Use this annotation before a function that takes a printf format string. +#if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC) +#if defined(__MINGW_PRINTF_FORMAT) +// MinGW has two different printf implementations. Ensure the format macro +// matches the selected implementation. See +// https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/. +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__( \ + (__format__(__MINGW_PRINTF_FORMAT, string_index, first_to_check))) +#else +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#else +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) +#endif + +// A macro to disallow copy operator= +// This should be used in the private: declarations for a class. 
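+// For example (a usage sketch; Widget is a hypothetical class):
+//
+//   class Widget {
+//    public:
+//     Widget();
+//
+//    private:
+//     GTEST_DISALLOW_ASSIGN_(Widget);  // deletes Widget's copy operator=
+//   };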
+#define GTEST_DISALLOW_ASSIGN_(type) type &operator=(type const &) = delete + +// A macro to disallow copy constructor and operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \ + type(type const &) = delete; \ + GTEST_DISALLOW_ASSIGN_(type) + +// A macro to disallow move operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_MOVE_ASSIGN_(type) \ + type &operator=(type &&) noexcept = delete + +// A macro to disallow move constructor and operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_MOVE_AND_ASSIGN_(type) \ + type(type &&) noexcept = delete; \ + GTEST_DISALLOW_MOVE_ASSIGN_(type) + +// Tell the compiler to warn about unused return values for functions declared +// with this macro. The macro should be used on function declarations +// following the argument list: +// +// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; +#if defined(__GNUC__) && !defined(COMPILER_ICC) +#define GTEST_MUST_USE_RESULT_ __attribute__((warn_unused_result)) +#else +#define GTEST_MUST_USE_RESULT_ +#endif // __GNUC__ && !COMPILER_ICC + +// MS C++ compiler emits warning when a conditional expression is compile time +// constant. In some contexts this warning is false positive and needs to be +// suppressed. Use the following two macros in such cases: +// +// GTEST_INTENTIONAL_CONST_COND_PUSH_() +// while (true) { +// GTEST_INTENTIONAL_CONST_COND_POP_() +// } +#define GTEST_INTENTIONAL_CONST_COND_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127) +#define GTEST_INTENTIONAL_CONST_COND_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_() + +// Determine whether the compiler supports Microsoft's Structured Exception +// Handling. This is supported by several Windows compilers but generally +// does not exist on any other system. +#ifndef GTEST_HAS_SEH +// The user didn't tell us, so we need to figure it out. + +#if defined(_MSC_VER) || defined(__BORLANDC__) +// These two compilers are known to support SEH. +#define GTEST_HAS_SEH 1 +#else +// Assume no SEH. +#define GTEST_HAS_SEH 0 +#endif + +#endif // GTEST_HAS_SEH + +#ifndef GTEST_IS_THREADSAFE + +#define GTEST_IS_THREADSAFE \ + (GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ || \ + (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) || \ + GTEST_HAS_PTHREAD) + +#endif // GTEST_IS_THREADSAFE + +// GTEST_API_ qualifies all symbols that must be exported. The definitions below +// are guarded by #ifndef to give embedders a chance to define GTEST_API_ in +// gtest/internal/custom/gtest-port.h +#ifndef GTEST_API_ + +#ifdef _MSC_VER +#if GTEST_LINKED_AS_SHARED_LIBRARY +#define GTEST_API_ __declspec(dllimport) +#elif GTEST_CREATE_SHARED_LIBRARY +#define GTEST_API_ __declspec(dllexport) +#endif +#elif __GNUC__ >= 4 || defined(__clang__) +#define GTEST_API_ __attribute__((visibility("default"))) +#endif // _MSC_VER + +#endif // GTEST_API_ + +#ifndef GTEST_API_ +#define GTEST_API_ +#endif // GTEST_API_ + +#ifndef GTEST_DEFAULT_DEATH_TEST_STYLE +#define GTEST_DEFAULT_DEATH_TEST_STYLE "fast" +#endif // GTEST_DEFAULT_DEATH_TEST_STYLE + +#ifdef __GNUC__ +// Ask the compiler to never inline a given function. +#define GTEST_NO_INLINE_ __attribute__((noinline)) +#else +#define GTEST_NO_INLINE_ +#endif + +// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project. 
+#if !defined(GTEST_HAS_CXXABI_H_) +#if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) +#define GTEST_HAS_CXXABI_H_ 1 +#else +#define GTEST_HAS_CXXABI_H_ 0 +#endif +#endif + +// A function level attribute to disable checking for use of uninitialized +// memory when built with MemorySanitizer. +#if defined(__clang__) +#if __has_feature(memory_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ __attribute__((no_sanitize_memory)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#endif // __has_feature(memory_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#endif // __clang__ + +// A function level attribute to disable AddressSanitizer instrumentation. +#if defined(__clang__) +#if __has_feature(address_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \ + __attribute__((no_sanitize_address)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#endif // __has_feature(address_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#endif // __clang__ + +// A function level attribute to disable HWAddressSanitizer instrumentation. +#if defined(__clang__) +#if __has_feature(hwaddress_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \ + __attribute__((no_sanitize("hwaddress"))) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +#endif // __has_feature(hwaddress_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +#endif // __clang__ + +// A function level attribute to disable ThreadSanitizer instrumentation. +#if defined(__clang__) +#if __has_feature(thread_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ __attribute__((no_sanitize_thread)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#endif // __has_feature(thread_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#endif // __clang__ + +namespace testing { + +class Message; + +// Legacy imports for backwards compatibility. +// New code should use std:: names directly. +using std::get; +using std::make_tuple; +using std::tuple; +using std::tuple_element; +using std::tuple_size; + +namespace internal { + +// A secret type that Google Test users don't know about. It has no +// definition on purpose. Therefore it's impossible to create a +// Secret object, which is what we want. +class Secret; + +// The GTEST_COMPILE_ASSERT_ is a legacy macro used to verify that a compile +// time expression is true (in new code, use static_assert instead). For +// example, you could use it to verify the size of a static array: +// +// GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES, +// names_incorrect_size); +// +// The second argument to the macro must be a valid C++ identifier. If the +// expression is false, compiler will issue an error containing this identifier. +#define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg) + +// A helper for suppressing warnings on constant condition. It just +// returns 'condition'. +GTEST_API_ bool IsTrue(bool condition); + +// Defines RE. + +#if GTEST_USES_PCRE +// if used, PCRE is injected by custom/gtest-port.h +#elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE + +// A simple C++ wrapper for . It uses the POSIX Extended +// Regular Expression syntax. +class GTEST_API_ RE { + public: + // A copy constructor is required by the Standard to initialize object + // references from r-values. + RE(const RE &other) { Init(other.pattern()); } + + // Constructs an RE from a string. 
+  RE(const ::std::string &regex) { Init(regex.c_str()); }  // NOLINT
+
+  RE(const char *regex) { Init(regex); }  // NOLINT
+  ~RE();
+
+  // Returns the string representation of the regex.
+  const char *pattern() const { return pattern_; }
+
+  // FullMatch(str, re) returns true if and only if regular expression re
+  // matches the entire str.
+  // PartialMatch(str, re) returns true if and only if regular expression re
+  // matches a substring of str (including str itself).
+  static bool FullMatch(const ::std::string &str, const RE &re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::std::string &str, const RE &re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+  static bool FullMatch(const char *str, const RE &re);
+  static bool PartialMatch(const char *str, const RE &re);
+
+ private:
+  void Init(const char *regex);
+  const char *pattern_;
+  bool is_valid_;
+
+#if GTEST_USES_POSIX_RE
+
+  regex_t full_regex_;     // For FullMatch().
+  regex_t partial_regex_;  // For PartialMatch().
+
+#else  // GTEST_USES_SIMPLE_RE
+
+  const char *full_pattern_;  // For FullMatch().
+
+#endif
+
+  GTEST_DISALLOW_ASSIGN_(RE);
+};
+
+#endif  // GTEST_USES_PCRE
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char *file, int line);
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char *file,
+                                                               int line);
+
+// Defines logging utilities:
+//   GTEST_LOG_(severity) - logs messages at the specified severity level. The
+//                          message itself is streamed into the macro.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+
+enum GTestLogSeverity { GTEST_INFO, GTEST_WARNING, GTEST_ERROR, GTEST_FATAL };
+
+// Formats log entry severity, provides a stream object for streaming the
+// log message, and terminates the message with a newline when going out of
+// scope.
+class GTEST_API_ GTestLog {
+ public:
+  GTestLog(GTestLogSeverity severity, const char *file, int line);
+
+  // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+  ~GTestLog();
+
+  ::std::ostream &GetStream() { return ::std::cerr; }
+
+ private:
+  const GTestLogSeverity severity_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog);
+};
+
+#if !defined(GTEST_LOG_)
+
+#define GTEST_LOG_(severity)                                           \
+  ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+                                __FILE__, __LINE__)                    \
+      .GetStream()
+
+inline void LogToStderr() {}
+inline void FlushInfoLog() { fflush(nullptr); }
+
+#endif  // !defined(GTEST_LOG_)
+
+#if !defined(GTEST_CHECK_)
+// INTERNAL IMPLEMENTATION - DO NOT USE.
+//
+// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
+// is not satisfied.
+//   Synopsis:
+//     GTEST_CHECK_(boolean_condition);
+//     or
+//     GTEST_CHECK_(boolean_condition) << "Additional message";
+//
+//   This checks the condition, and if the condition is not satisfied
+//   it prints a message about the condition violation, including the
+//   condition itself plus any additional message streamed into it,
+//   and then aborts the program. It aborts the program whether or not
+//   it is built in debug mode.
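+//
+//   A usage sketch (illustrative only; the file name is hypothetical):
+//     FILE *f = posix::FOpen("gtest_output.txt", "r");
+//     GTEST_CHECK_(f != nullptr) << "cannot open gtest_output.txt";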
+#define GTEST_CHECK_(condition)               \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_               \
+  if (::testing::internal::IsTrue(condition)) \
+    ;                                         \
+  else                                        \
+    GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+#endif  // !defined(GTEST_CHECK_)
+
+// An all-mode assert to verify that the given POSIX-style function
+// call returns 0 (indicating success). Known limitation: this
+// doesn't expand to a balanced 'if' statement, so enclose the macro
+// in {} if you need to use it as the only statement in an 'if'
+// branch.
+#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
+  if (const int gtest_error = (posix_call))    \
+  GTEST_LOG_(FATAL) << #posix_call << " failed with error " << gtest_error
+
+// Transforms "T" into "const T&" according to standard reference collapsing
+// rules (this is only needed as a backport for C++98 compilers that do not
+// support reference collapsing). Specifically, it transforms:
+//
+//   char         ==> const char&
+//   const char   ==> const char&
+//   char&        ==> char&
+//   const char&  ==> const char&
+//
+// Note that the non-const reference will not have "const" added. This is
+// standard, and necessary so that "T" can always bind to "const T&".
+template <typename T>
+struct ConstRef {
+  typedef const T &type;
+};
+template <typename T>
+struct ConstRef<T &> {
+  typedef T &type;
+};
+
+// The argument T must depend on some template parameters.
+#define GTEST_REFERENCE_TO_CONST_(T) \
+  typename ::testing::internal::ConstRef<T>::type
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Use ImplicitCast_ as a safe version of static_cast for upcasting in
+// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a
+// const Foo*). When you use ImplicitCast_, the compiler checks that
+// the cast is safe. Such explicit ImplicitCast_s are necessary in
+// surprisingly many situations where C++ demands an exact type match
+// instead of an argument type convertible to a target type.
+//
+// The syntax for using ImplicitCast_ is the same as for static_cast:
+//
+//   ImplicitCast_<ToType>(expr)
+//
+// ImplicitCast_ would have been part of the C++ standard library,
+// but the proposal was submitted too late. It will probably make
+// its way into the language in the future.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., implicit_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template <typename To>
+inline To ImplicitCast_(To x) {
+  return x;
+}
+
+// When you upcast (that is, cast a pointer from type Foo to type
+// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
+// always succeed. When you downcast (that is, cast a pointer from
+// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
+// how do you know the pointer is really of type SubclassOfFoo? It
+// could be a bare Foo, or of type DifferentSubclassOfFoo. Thus,
+// when you downcast, you should use this macro. In debug mode, we
+// use dynamic_cast<> to double-check the downcast is legal (we die
+// if it's not). In normal mode, we do the efficient static_cast<>
+// instead. Thus, it's important to test in debug mode to make sure
+// the cast is legal!
+//    This is the only place in the code we should use dynamic_cast<>.
+// In particular, you SHOULDN'T be using dynamic_cast<> in order to
+// do RTTI (eg code like this:
+//   if (dynamic_cast<Subclass1 *>(foo)) HandleASubclass1Object(foo);
+//   if (dynamic_cast<Subclass2 *>(foo)) HandleASubclass2Object(foo);
+// You should design the code some other way not to need this.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., down_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template <class To, class From>  // use like this: DownCast_<T *>(foo);
+inline To DownCast_(From *f) {  // so we only accept pointers
+  // Ensures that To is a sub-type of From *. This test is here only
+  // for compile-time type checking, and has no overhead in an
+  // optimized build at run-time, as it will be optimized away
+  // completely.
+  GTEST_INTENTIONAL_CONST_COND_PUSH_()
+  if (false) {
+    GTEST_INTENTIONAL_CONST_COND_POP_()
+    const To to = nullptr;
+    ::testing::internal::ImplicitCast_<From *>(to);
+  }
+
+#if GTEST_HAS_RTTI
+  // RTTI: debug mode only!
+  GTEST_CHECK_(f == nullptr || dynamic_cast<To>(f) != nullptr);
+#endif
+  return static_cast<To>(f);
+}
+
+// Downcasts the pointer of type Base to Derived.
+// Derived must be a subclass of Base. The parameter MUST
+// point to a class of type Derived, not any subclass of it.
+// When RTTI is available, the function performs a runtime
+// check to enforce this.
+template <class Derived, class Base>
+Derived *CheckedDowncastToActualType(Base *base) {
+#if GTEST_HAS_RTTI
+  GTEST_CHECK_(typeid(*base) == typeid(Derived));
+#endif
+
+#if GTEST_HAS_DOWNCAST_
+  return ::down_cast<Derived *>(base);
+#elif GTEST_HAS_RTTI
+  return dynamic_cast<Derived *>(base);  // NOLINT
+#else
+  return static_cast<Derived *>(base);  // Poor man's downcast.
+#endif
+}
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Defines the stdout/stderr capturer:
+//   CaptureStdout     - starts capturing stdout.
+//   GetCapturedStdout - stops capturing stdout and returns the captured string.
+//   CaptureStderr     - starts capturing stderr.
+//   GetCapturedStderr - stops capturing stderr and returns the captured string.
+//
+GTEST_API_ void CaptureStdout();
+GTEST_API_ std::string GetCapturedStdout();
+GTEST_API_ void CaptureStderr();
+GTEST_API_ std::string GetCapturedStderr();
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+// Returns the size (in bytes) of a file.
+GTEST_API_ size_t GetFileSize(FILE *file);
+
+// Reads the entire content of a file as a string.
+GTEST_API_ std::string ReadEntireFile(FILE *file);
+
+// All command line arguments.
+GTEST_API_ std::vector<std::string> GetArgvs();
+
+#if GTEST_HAS_DEATH_TEST
+
+std::vector<std::string> GetInjectableArgvs();
+// Deprecated: pass the args vector by value instead.
+void SetInjectableArgvs(const std::vector<std::string> *new_argvs);
+void SetInjectableArgvs(const std::vector<std::string> &new_argvs);
+void ClearInjectableArgvs();
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Defines synchronization primitives.
+#if GTEST_IS_THREADSAFE
+#if GTEST_HAS_PTHREAD
+// Sleeps for (roughly) n milliseconds. This function is only for testing
+// Google Test's own constructs. Don't use it in user tests, either
+// directly or indirectly.
+inline void SleepMilliseconds(int n) {
+  const timespec time = {
+    0,                  // 0 seconds.
+    n * 1000L * 1000L,  // And n ms.
+  };
+  nanosleep(&time, nullptr);
+}
+#endif  // GTEST_HAS_PTHREAD
+
+#if GTEST_HAS_NOTIFICATION_
+// Notification has already been imported into the namespace.
+// Nothing to do here.
+
+#elif GTEST_HAS_PTHREAD
+// Allows a controller thread to pause execution of newly created
+// threads until notified. Instances of this class must be created
+// and destroyed in the controller thread.
+//
+// This class is only for testing Google Test's own constructs. Do not
+// use it in user tests, either directly or indirectly.
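+//
+// A usage sketch (illustrative only):
+//   Notification thread_can_start;
+//   // ... create test threads that call WaitForNotification() ...
+//   thread_can_start.Notify();  // releases every waiting thread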
+class Notification { + public: + Notification() : notified_(false) { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr)); + } + ~Notification() { pthread_mutex_destroy(&mutex_); } + + // Notifies all threads created with this notification to start. Must + // be called from the controller thread. + void Notify() { + pthread_mutex_lock(&mutex_); + notified_ = true; + pthread_mutex_unlock(&mutex_); + } + + // Blocks until the controller thread notifies. Must be called from a test + // thread. + void WaitForNotification() { + for (;;) { + pthread_mutex_lock(&mutex_); + const bool notified = notified_; + pthread_mutex_unlock(&mutex_); + if (notified) break; + SleepMilliseconds(10); + } + } + + private: + pthread_mutex_t mutex_; + bool notified_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); +}; + +#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + +GTEST_API_ void SleepMilliseconds(int n); + +// Provides leak-safe Windows kernel handle ownership. +// Used in death tests and in threading support. +class GTEST_API_ AutoHandle { + public: + // Assume that Win32 HANDLE type is equivalent to void*. Doing so allows us to + // avoid including in this header file. Including is + // undesirable because it defines a lot of symbols and macros that tend to + // conflict with client code. This assumption is verified by + // WindowsTypesTest.HANDLEIsVoidStar. + typedef void *Handle; + AutoHandle(); + explicit AutoHandle(Handle handle); + + ~AutoHandle(); + + Handle Get() const; + void Reset(); + void Reset(Handle handle); + + private: + // Returns true if and only if the handle is a valid handle object that can be + // closed. + bool IsCloseable() const; + + Handle handle_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle); +}; + +// Allows a controller thread to pause execution of newly created +// threads until notified. Instances of this class must be created +// and destroyed in the controller thread. +// +// This class is only for testing Google Test's own constructs. Do not +// use it in user tests, either directly or indirectly. +class GTEST_API_ Notification { + public: + Notification(); + void Notify(); + void WaitForNotification(); + + private: + AutoHandle event_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); +}; +#endif // GTEST_HAS_NOTIFICATION_ + +// On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD +// defined, but we don't want to use MinGW's pthreads implementation, which +// has conformance problems with some versions of the POSIX standard. +#if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW + +// As a C-function, ThreadFuncWithCLinkage cannot be templated itself. +// Consequently, it cannot select a correct instantiation of ThreadWithParam +// in order to call its Run(). Introducing ThreadWithParamBase as a +// non-templated base class for ThreadWithParam allows us to bypass this +// problem. +class ThreadWithParamBase { + public: + virtual ~ThreadWithParamBase() {} + virtual void Run() = 0; +}; + +// pthread_create() accepts a pointer to a function type with the C linkage. +// According to the Standard (7.5/1), function types with different linkages +// are different even if they are otherwise identical. Some compilers (for +// example, SunStudio) treat them as different types. Since class methods +// cannot be defined with C-linkage we need to define a free C-function to +// pass into pthread_create(). 
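+// For example (illustrative; CThreadFunc and SomeClass are hypothetical):
+//   extern "C" typedef void *(*CThreadFunc)(void *);
+//   CThreadFunc f = &SomeClass::Run;  // ill-formed: a member function cannot
+//                                     // have C linkage.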
+extern "C" inline void *ThreadFuncWithCLinkage(void *thread) { + static_cast(thread)->Run(); + return nullptr; +} + +// Helper class for testing Google Test's multi-threading constructs. +// To use it, write: +// +// void ThreadFunc(int param) { /* Do things with param */ } +// Notification thread_can_start; +// ... +// // The thread_can_start parameter is optional; you can supply NULL. +// ThreadWithParam thread(&ThreadFunc, 5, &thread_can_start); +// thread_can_start.Notify(); +// +// These classes are only for testing Google Test's own constructs. Do +// not use them in user tests, either directly or indirectly. +template +class ThreadWithParam : public ThreadWithParamBase { + public: + typedef void UserThreadFunc(T); + + ThreadWithParam(UserThreadFunc *func, T param, Notification *thread_can_start) + : func_(func), param_(param), thread_can_start_(thread_can_start), + finished_(false) { + ThreadWithParamBase *const base = this; + // The thread can be created only after all fields except thread_ + // have been initialized. + GTEST_CHECK_POSIX_SUCCESS_( + pthread_create(&thread_, nullptr, &ThreadFuncWithCLinkage, base)); + } + ~ThreadWithParam() override { Join(); } + + void Join() { + if (!finished_) { + GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, nullptr)); + finished_ = true; + } + } + + void Run() override { + if (thread_can_start_ != nullptr) thread_can_start_->WaitForNotification(); + func_(param_); + } + + private: + UserThreadFunc *const func_; // User-supplied thread function. + const T param_; // User-supplied parameter to the thread function. + // When non-NULL, used to block execution until the controller thread + // notifies. + Notification *const thread_can_start_; + bool finished_; // true if and only if we know that the thread function has + // finished. + pthread_t thread_; // The native thread object. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); +}; +#endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD || + // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ + +#if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ +// Mutex and ThreadLocal have already been imported into the namespace. +// Nothing to do here. + +#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + +// Mutex implements mutex on Windows platforms. It is used in conjunction +// with class MutexLock: +// +// Mutex mutex; +// ... +// MutexLock lock(&mutex); // Acquires the mutex and releases it at the +// // end of the current scope. +// +// A static Mutex *must* be defined or declared using one of the following +// macros: +// GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex); +// GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex); +// +// (A non-static Mutex is defined/declared in the usual way). +class GTEST_API_ Mutex { + public: + enum MutexType { kStatic = 0, kDynamic = 1 }; + // We rely on kStaticMutex being 0 as it is to what the linker initializes + // type_ in static mutexes. critical_section_ will be initialized lazily + // in ThreadSafeLazyInit(). + enum StaticConstructorSelector { kStaticMutex = 0 }; + + // This constructor intentionally does nothing. It relies on type_ being + // statically initialized to 0 (effectively setting it to kStatic) and on + // ThreadSafeLazyInit() to lazily initialize the rest of the members. + explicit Mutex(StaticConstructorSelector /*dummy*/) {} + + Mutex(); + ~Mutex(); + + void Lock(); + + void Unlock(); + + // Does nothing if the current thread holds the mutex. Otherwise, crashes + // with high probability. 
+ void AssertHeld(); + + private: + // Initializes owner_thread_id_ and critical_section_ in static mutexes. + void ThreadSafeLazyInit(); + + // Per https://blogs.msdn.microsoft.com/oldnewthing/20040223-00/?p=40503, + // we assume that 0 is an invalid value for thread IDs. + unsigned int owner_thread_id_; + + // For static mutexes, we rely on these members being initialized to zeros + // by the linker. + MutexType type_; + long critical_section_init_phase_; // NOLINT + GTEST_CRITICAL_SECTION *critical_section_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); +}; + +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex + +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex) + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(Mutex *mutex) : mutex_(mutex) { mutex_->Lock(); } + + ~GTestMutexLock() { mutex_->Unlock(); } + + private: + Mutex *const mutex_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); +}; + +typedef GTestMutexLock MutexLock; + +// Base class for ValueHolder. Allows a caller to hold and delete a value +// without knowing its type. +class ThreadLocalValueHolderBase { + public: + virtual ~ThreadLocalValueHolderBase() {} +}; + +// Provides a way for a thread to send notifications to a ThreadLocal +// regardless of its parameter type. +class ThreadLocalBase { + public: + // Creates a new ValueHolder object holding a default value passed to + // this ThreadLocal's constructor and returns it. It is the caller's + // responsibility not to call this when the ThreadLocal instance already + // has a value on the current thread. + virtual ThreadLocalValueHolderBase *NewValueForCurrentThread() const = 0; + + protected: + ThreadLocalBase() {} + virtual ~ThreadLocalBase() {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocalBase); +}; + +// Maps a thread to a set of ThreadLocals that have values instantiated on that +// thread and notifies them when the thread exits. A ThreadLocal instance is +// expected to persist until all threads it has values on have terminated. +class GTEST_API_ ThreadLocalRegistry { + public: + // Registers thread_local_instance as having value on the current thread. + // Returns a value that can be used to identify the thread from other threads. + static ThreadLocalValueHolderBase *GetValueOnCurrentThread( + const ThreadLocalBase *thread_local_instance); + + // Invoked when a ThreadLocal instance is destroyed. + static void OnThreadLocalDestroyed( + const ThreadLocalBase *thread_local_instance); +}; + +class GTEST_API_ ThreadWithParamBase { + public: + void Join(); + + protected: + class Runnable { + public: + virtual ~Runnable() {} + virtual void Run() = 0; + }; + + ThreadWithParamBase(Runnable *runnable, Notification *thread_can_start); + virtual ~ThreadWithParamBase(); + + private: + AutoHandle thread_; +}; + +// Helper class for testing Google Test's multi-threading constructs. 
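+// Usage mirrors the pthreads version above (illustrative):
+//   ThreadWithParam<int> thread(&ThreadFunc, 5, &thread_can_start);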
+template +class ThreadWithParam : public ThreadWithParamBase { + public: + typedef void UserThreadFunc(T); + + ThreadWithParam(UserThreadFunc *func, T param, Notification *thread_can_start) + : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {} + virtual ~ThreadWithParam() {} + + private: + class RunnableImpl : public Runnable { + public: + RunnableImpl(UserThreadFunc *func, T param) : func_(func), param_(param) {} + virtual ~RunnableImpl() {} + virtual void Run() { func_(param_); } + + private: + UserThreadFunc *const func_; + const T param_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl); + }; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); +}; + +// Implements thread-local storage on Windows systems. +// +// // Thread 1 +// ThreadLocal tl(100); // 100 is the default value for each thread. +// +// // Thread 2 +// tl.set(150); // Changes the value for thread 2 only. +// EXPECT_EQ(150, tl.get()); +// +// // Thread 1 +// EXPECT_EQ(100, tl.get()); // In thread 1, tl has the original value. +// tl.set(200); +// EXPECT_EQ(200, tl.get()); +// +// The template type argument T must have a public copy constructor. +// In addition, the default ThreadLocal constructor requires T to have +// a public default constructor. +// +// The users of a TheadLocal instance have to make sure that all but one +// threads (including the main one) using that instance have exited before +// destroying it. Otherwise, the per-thread objects managed for them by the +// ThreadLocal instance are not guaranteed to be destroyed on all platforms. +// +// Google Test only uses global ThreadLocal objects. That means they +// will die after main() has returned. Therefore, no per-thread +// object managed by Google Test will be leaked as long as all threads +// using Google Test have exited when main() returns. +template +class ThreadLocal : public ThreadLocalBase { + public: + ThreadLocal() : default_factory_(new DefaultValueHolderFactory()) {} + explicit ThreadLocal(const T &value) + : default_factory_(new InstanceValueHolderFactory(value)) {} + + ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); } + + T *pointer() { return GetOrCreateValue(); } + const T *pointer() const { return GetOrCreateValue(); } + const T &get() const { return *pointer(); } + void set(const T &value) { *pointer() = value; } + + private: + // Holds a value of T. Can be deleted via its base class without the caller + // knowing the type of T. 
+ class ValueHolder : public ThreadLocalValueHolderBase { + public: + ValueHolder() : value_() {} + explicit ValueHolder(const T &value) : value_(value) {} + + T *pointer() { return &value_; } + + private: + T value_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + }; + + T *GetOrCreateValue() const { + return static_cast( + ThreadLocalRegistry::GetValueOnCurrentThread(this)) + ->pointer(); + } + + virtual ThreadLocalValueHolderBase *NewValueForCurrentThread() const { + return default_factory_->MakeNewHolder(); + } + + class ValueHolderFactory { + public: + ValueHolderFactory() {} + virtual ~ValueHolderFactory() {} + virtual ValueHolder *MakeNewHolder() const = 0; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory); + }; + + class DefaultValueHolderFactory : public ValueHolderFactory { + public: + DefaultValueHolderFactory() {} + ValueHolder *MakeNewHolder() const override { return new ValueHolder(); } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); + }; + + class InstanceValueHolderFactory : public ValueHolderFactory { + public: + explicit InstanceValueHolderFactory(const T &value) : value_(value) {} + ValueHolder *MakeNewHolder() const override { + return new ValueHolder(value_); + } + + private: + const T value_; // The value for each thread. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); + }; + + std::unique_ptr default_factory_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); +}; + +#elif GTEST_HAS_PTHREAD + +// MutexBase and Mutex implement mutex on pthreads-based platforms. +class MutexBase { + public: + // Acquires this mutex. + void Lock() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_)); + owner_ = pthread_self(); + has_owner_ = true; + } + + // Releases this mutex. + void Unlock() { + // Since the lock is being released the owner_ field should no longer be + // considered valid. We don't protect writing to has_owner_ here, as it's + // the caller's responsibility to ensure that the current thread holds the + // mutex when this is called. + has_owner_ = false; + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_)); + } + + // Does nothing if the current thread holds the mutex. Otherwise, crashes + // with high probability. + void AssertHeld() const { + GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self())) + << "The current thread is not holding the mutex @" << this; + } + + // A static mutex may be used before main() is entered. It may even + // be used before the dynamic initialization stage. Therefore we + // must be able to initialize a static mutex object at link time. + // This means MutexBase has to be a POD and its member variables + // have to be public. + public: + pthread_mutex_t mutex_; // The underlying pthread mutex. + // has_owner_ indicates whether the owner_ field below contains a valid thread + // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All + // accesses to the owner_ field should be protected by a check of this field. + // An alternative might be to memset() owner_ to all zeros, but there's no + // guarantee that a zero'd pthread_t is necessarily invalid or even different + // from pthread_self(). + bool has_owner_; + pthread_t owner_; // The thread holding the mutex. +}; + +// Forward-declares a static mutex. +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::MutexBase mutex + +// Defines and statically (i.e. at link time) initializes a static mutex. 
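+// A usage sketch (illustrative; g_some_mutex is a hypothetical name):
+//   GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex);
+//   ...
+//   { MutexLock l(&g_some_mutex); /* critical section */ }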
+// The initialization list here does not explicitly initialize each field, +// instead relying on default initialization for the unspecified fields. In +// particular, the owner_ field (a pthread_t) is not explicitly initialized. +// This allows initialization to work whether pthread_t is a scalar or struct. +// The flag -Wmissing-field-initializers must not be specified for this to work. +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false, 0 } + +// The Mutex class can only be used for mutexes created at runtime. It +// shares its API with MutexBase otherwise. +class Mutex : public MutexBase { + public: + Mutex() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr)); + has_owner_ = false; + } + ~Mutex() { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); +}; + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(MutexBase *mutex) : mutex_(mutex) { mutex_->Lock(); } + + ~GTestMutexLock() { mutex_->Unlock(); } + + private: + MutexBase *const mutex_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); +}; + +typedef GTestMutexLock MutexLock; + +// Helpers for ThreadLocal. + +// pthread_key_create() requires DeleteThreadLocalValue() to have +// C-linkage. Therefore it cannot be templatized to access +// ThreadLocal. Hence the need for class +// ThreadLocalValueHolderBase. +class ThreadLocalValueHolderBase { + public: + virtual ~ThreadLocalValueHolderBase() {} +}; + +// Called by pthread to delete thread-local data stored by +// pthread_setspecific(). +extern "C" inline void DeleteThreadLocalValue(void *value_holder) { + delete static_cast(value_holder); +} + +// Implements thread-local storage on pthreads-based systems. +template +class GTEST_API_ ThreadLocal { + public: + ThreadLocal() + : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {} + explicit ThreadLocal(const T &value) + : key_(CreateKey()), + default_factory_(new InstanceValueHolderFactory(value)) {} + + ~ThreadLocal() { + // Destroys the managed object for the current thread, if any. + DeleteThreadLocalValue(pthread_getspecific(key_)); + + // Releases resources associated with the key. This will *not* + // delete managed objects for other threads. + GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_)); + } + + T *pointer() { return GetOrCreateValue(); } + const T *pointer() const { return GetOrCreateValue(); } + const T &get() const { return *pointer(); } + void set(const T &value) { *pointer() = value; } + + private: + // Holds a value of type T. + class ValueHolder : public ThreadLocalValueHolderBase { + public: + ValueHolder() : value_() {} + explicit ValueHolder(const T &value) : value_(value) {} + + T *pointer() { return &value_; } + + private: + T value_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + }; + + static pthread_key_t CreateKey() { + pthread_key_t key; + // When a thread exits, DeleteThreadLocalValue() will be called on + // the object managed for that thread. 
+ GTEST_CHECK_POSIX_SUCCESS_( + pthread_key_create(&key, &DeleteThreadLocalValue)); + return key; + } + + T *GetOrCreateValue() const { + ThreadLocalValueHolderBase *const holder = + static_cast(pthread_getspecific(key_)); + if (holder != nullptr) { + return CheckedDowncastToActualType(holder)->pointer(); + } + + ValueHolder *const new_holder = default_factory_->MakeNewHolder(); + ThreadLocalValueHolderBase *const holder_base = new_holder; + GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base)); + return new_holder->pointer(); + } + + class ValueHolderFactory { + public: + ValueHolderFactory() {} + virtual ~ValueHolderFactory() {} + virtual ValueHolder *MakeNewHolder() const = 0; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory); + }; + + class DefaultValueHolderFactory : public ValueHolderFactory { + public: + DefaultValueHolderFactory() {} + ValueHolder *MakeNewHolder() const override { return new ValueHolder(); } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); + }; + + class InstanceValueHolderFactory : public ValueHolderFactory { + public: + explicit InstanceValueHolderFactory(const T &value) : value_(value) {} + ValueHolder *MakeNewHolder() const override { + return new ValueHolder(value_); + } + + private: + const T value_; // The value for each thread. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); + }; + + // A key pthreads uses for looking up per-thread values. + const pthread_key_t key_; + std::unique_ptr default_factory_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); +}; + +#endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ + +#else // GTEST_IS_THREADSAFE + +// A dummy implementation of synchronization primitives (mutex, lock, +// and thread-local variable). Necessary for compiling Google Test where +// mutex is not supported - using Google Test in multiple threads is not +// supported on such platforms. + +class Mutex { + public: + Mutex() {} + void Lock() {} + void Unlock() {} + void AssertHeld() const {} +}; + +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex + +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(Mutex *) {} // NOLINT +}; + +typedef GTestMutexLock MutexLock; + +template +class GTEST_API_ ThreadLocal { + public: + ThreadLocal() : value_() {} + explicit ThreadLocal(const T &value) : value_(value) {} + T *pointer() { return &value_; } + const T *pointer() const { return &value_; } + const T &get() const { return value_; } + void set(const T &value) { value_ = value; } + + private: + T value_; +}; + +#endif // GTEST_IS_THREADSAFE + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +GTEST_API_ size_t GetThreadCount(); + +#if GTEST_OS_WINDOWS +#define GTEST_PATH_SEP_ "\\" +#define GTEST_HAS_ALT_PATH_SEP_ 1 +#else +#define GTEST_PATH_SEP_ "/" +#define GTEST_HAS_ALT_PATH_SEP_ 0 +#endif // GTEST_OS_WINDOWS + +// Utilities for char. + +// isspace(int ch) and friends accept an unsigned char or EOF. char +// may be signed, depending on the compiler (or compiler flags). 
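+// (For example, on a platform where char is signed, passing a character such
+// as '\xE9' straight to isspace() yields a negative argument other than EOF,
+// which is undefined behavior.)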
+// Therefore we need to cast a char to unsigned char before calling
+// isspace(), etc.
+
+inline bool IsAlpha(char ch) {
+  return isalpha(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsAlNum(char ch) {
+  return isalnum(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsDigit(char ch) {
+  return isdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsLower(char ch) {
+  return islower(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsSpace(char ch) {
+  return isspace(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsUpper(char ch) {
+  return isupper(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(char ch) {
+  return isxdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(wchar_t ch) {
+  const unsigned char low_byte = static_cast<unsigned char>(ch);
+  return ch == low_byte && isxdigit(low_byte) != 0;
+}
+
+inline char ToLower(char ch) {
+  return static_cast<char>(tolower(static_cast<unsigned char>(ch)));
+}
+inline char ToUpper(char ch) {
+  return static_cast<char>(toupper(static_cast<unsigned char>(ch)));
+}
+
+inline std::string StripTrailingSpaces(std::string str) {
+  std::string::iterator it = str.end();
+  while (it != str.begin() && IsSpace(*--it)) it = str.erase(it);
+  return str;
+}
+
+// The testing::internal::posix namespace holds wrappers for common
+// POSIX functions. These wrappers hide the differences between
+// Windows/MSVC and POSIX systems. Since some compilers define these
+// standard functions as macros, the wrapper cannot have the same name
+// as the wrapped function.
+
+namespace posix {
+
+// Functions with a different name on Windows.
+
+#if GTEST_OS_WINDOWS
+
+typedef struct _stat StatStruct;
+
+#ifdef __BORLANDC__
+inline int IsATTY(int fd) { return isatty(fd); }
+inline int StrCaseCmp(const char *s1, const char *s2) {
+  return stricmp(s1, s2);
+}
+inline char *StrDup(const char *src) { return strdup(src); }
+#else  // !__BORLANDC__
+#if GTEST_OS_WINDOWS_MOBILE
+inline int IsATTY(int /* fd */) { return 0; }
+#else
+inline int IsATTY(int fd) { return _isatty(fd); }
+#endif  // GTEST_OS_WINDOWS_MOBILE
+inline int StrCaseCmp(const char *s1, const char *s2) {
+  return _stricmp(s1, s2);
+}
+inline char *StrDup(const char *src) { return _strdup(src); }
+#endif  // __BORLANDC__
+
+#if GTEST_OS_WINDOWS_MOBILE
+inline int FileNo(FILE *file) { return reinterpret_cast<int>(_fileno(file)); }
+// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
+// time and thus not defined there.
+#else +inline int FileNo(FILE *file) { return _fileno(file); } +inline int Stat(const char *path, StatStruct *buf) { return _stat(path, buf); } +inline int RmDir(const char *dir) { return _rmdir(dir); } +inline bool IsDir(const StatStruct &st) { return (_S_IFDIR & st.st_mode) != 0; } +#endif // GTEST_OS_WINDOWS_MOBILE + +#elif GTEST_OS_ESP8266 +typedef struct stat StatStruct; + +inline int FileNo(FILE *file) { return fileno(file); } +inline int IsATTY(int fd) { return isatty(fd); } +inline int Stat(const char *path, StatStruct *buf) { + // stat function not implemented on ESP8266 + return 0; +} +inline int StrCaseCmp(const char *s1, const char *s2) { + return strcasecmp(s1, s2); +} +inline char *StrDup(const char *src) { return strdup(src); } +inline int RmDir(const char *dir) { return rmdir(dir); } +inline bool IsDir(const StatStruct &st) { return S_ISDIR(st.st_mode); } + +#else + +typedef struct stat StatStruct; + +inline int FileNo(FILE *file) { return fileno(file); } +inline int IsATTY(int fd) { return isatty(fd); } +inline int Stat(const char *path, StatStruct *buf) { return stat(path, buf); } +inline int StrCaseCmp(const char *s1, const char *s2) { + return strcasecmp(s1, s2); +} +inline char *StrDup(const char *src) { return strdup(src); } +inline int RmDir(const char *dir) { return rmdir(dir); } +inline bool IsDir(const StatStruct &st) { return S_ISDIR(st.st_mode); } + +#endif // GTEST_OS_WINDOWS + +// Functions deprecated by MSVC 8.0. + +GTEST_DISABLE_MSC_DEPRECATED_PUSH_() + +// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and +// StrError() aren't needed on Windows CE at this time and thus not +// defined there. + +#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT +inline int ChDir(const char *dir) { return chdir(dir); } +#endif +inline FILE *FOpen(const char *path, const char *mode) { + return fopen(path, mode); +} +#if !GTEST_OS_WINDOWS_MOBILE +inline FILE *FReopen(const char *path, const char *mode, FILE *stream) { + return freopen(path, mode, stream); +} +inline FILE *FDOpen(int fd, const char *mode) { return fdopen(fd, mode); } +#endif +inline int FClose(FILE *fp) { return fclose(fp); } +#if !GTEST_OS_WINDOWS_MOBILE +inline int Read(int fd, void *buf, unsigned int count) { + return static_cast(read(fd, buf, count)); +} +inline int Write(int fd, const void *buf, unsigned int count) { + return static_cast(write(fd, buf, count)); +} +inline int Close(int fd) { return close(fd); } +inline const char *StrError(int errnum) { return strerror(errnum); } +#endif +inline const char *GetEnv(const char *name) { +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 + // We are on an embedded platform, which has no environment variables. + static_cast(name); // To prevent 'unused argument' warning. + return nullptr; +#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) + // Environment variables which we programmatically clear will be set to the + // empty string rather than unset (NULL). Handle that case. + const char *const env = getenv(name); + return (env != nullptr && env[0] != '\0') ? env : nullptr; +#else + return getenv(name); +#endif +} + +GTEST_DISABLE_MSC_DEPRECATED_POP_() + +#if GTEST_OS_WINDOWS_MOBILE +// Windows CE has no C library. The abort() function is used in +// several places in Google Test. This implementation provides a reasonable +// imitation of standard behaviour. 
+[[noreturn]] void Abort(); +#else +[[noreturn]] inline void Abort() { + abort(); +} +#endif // GTEST_OS_WINDOWS_MOBILE + +} // namespace posix + +// MSVC "deprecates" snprintf and issues warnings wherever it is used. In +// order to avoid these warnings, we need to use _snprintf or _snprintf_s on +// MSVC-based platforms. We map the GTEST_SNPRINTF_ macro to the appropriate +// function in order to achieve that. We use macro definition here because +// snprintf is a variadic function. +#if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE +// MSVC 2005 and above support variadic macros. +#define GTEST_SNPRINTF_(buffer, size, format, ...) \ + _snprintf_s(buffer, size, size, format, __VA_ARGS__) +#elif defined(_MSC_VER) +// Windows CE does not define _snprintf_s +#define GTEST_SNPRINTF_ _snprintf +#else +#define GTEST_SNPRINTF_ snprintf +#endif + +// The biggest signed integer type the compiler supports. +// +// long long is guaranteed to be at least 64-bits in C++11. +using BiggestInt = long long; // NOLINT + +// The maximum number a BiggestInt can represent. +constexpr BiggestInt kMaxBiggestInt = (std::numeric_limits::max)(); + +// This template class serves as a compile-time function from size to +// type. It maps a size in bytes to a primitive type with that +// size. e.g. +// +// TypeWithSize<4>::UInt +// +// is typedef-ed to be unsigned int (unsigned integer made up of 4 +// bytes). +// +// Such functionality should belong to STL, but I cannot find it +// there. +// +// Google Test uses this class in the implementation of floating-point +// comparison. +// +// For now it only handles UInt (unsigned int) as that's all Google Test +// needs. Other types can be easily added in the future if need +// arises. +template +class TypeWithSize { + public: + // This prevents the user from using TypeWithSize with incorrect + // values of N. + using UInt = void; +}; + +// The specialization for size 4. +template <> +class TypeWithSize<4> { + public: + using Int = std::int32_t; + using UInt = std::uint32_t; +}; + +// The specialization for size 8. +template <> +class TypeWithSize<8> { + public: + using Int = std::int64_t; + using UInt = std::uint64_t; +}; + +// Integer types of known sizes. +using TimeInMillis = int64_t; // Represents time in milliseconds. + +// Utilities for command line flags and environment variables. + +// Macro for referencing flags. +#if !defined(GTEST_FLAG) +#define GTEST_FLAG(name) FLAGS_gtest_##name +#endif // !defined(GTEST_FLAG) + +#if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_) +#define GTEST_USE_OWN_FLAGFILE_FLAG_ 1 +#endif // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_) + +#if !defined(GTEST_DECLARE_bool_) +#define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver + +// Macros for declaring flags. +#define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name) +#define GTEST_DECLARE_int32_(name) \ + GTEST_API_ extern std::int32_t GTEST_FLAG(name) +#define GTEST_DECLARE_string_(name) \ + GTEST_API_ extern ::std::string GTEST_FLAG(name) + +// Macros for defining flags. 
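+//
+// A usage sketch (illustrative; "my_flag" is a hypothetical flag):
+//   GTEST_DECLARE_bool_(my_flag);                          // in a header
+//   GTEST_DEFINE_bool_(my_flag, false, "Enables my_flag"); // in a .cc file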
+#define GTEST_DEFINE_bool_(name, default_val, doc) \ + GTEST_API_ bool GTEST_FLAG(name) = (default_val) +#define GTEST_DEFINE_int32_(name, default_val, doc) \ + GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val) +#define GTEST_DEFINE_string_(name, default_val, doc) \ + GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val) + +#endif // !defined(GTEST_DECLARE_bool_) + +// Thread annotations +#if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) +#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) +#define GTEST_LOCK_EXCLUDED_(locks) +#endif // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) + +// Parses 'str' for a 32-bit signed integer. If successful, writes the result +// to *value and returns true; otherwise leaves *value unchanged and returns +// false. +bool ParseInt32(const Message &src_text, const char *str, int32_t *value); + +// Parses a bool/int32_t/string from the environment variable +// corresponding to the given Google Test flag. +bool BoolFromGTestEnv(const char *flag, bool default_val); +GTEST_API_ int32_t Int32FromGTestEnv(const char *flag, int32_t default_val); +std::string OutputFlagAlsoCheckEnvVar(); +const char *StringFromGTestEnv(const char *flag, const char *default_val); + +} // namespace internal +} // namespace testing + +#if !defined(GTEST_INTERNAL_DEPRECATED) + +// Internal Macro to mark an API deprecated, for googletest usage only +// Usage: class GTEST_INTERNAL_DEPRECATED(message) MyClass or +// GTEST_INTERNAL_DEPRECATED(message) myFunction(); Every usage of +// a deprecated entity will trigger a warning when compiled with +// `-Wdeprecated-declarations` option (clang, gcc, any __GNUC__ compiler). +// For msvc /W3 option will need to be used +// Note that for 'other' compilers this macro evaluates to nothing to prevent +// compilations errors. +#if defined(_MSC_VER) +#define GTEST_INTERNAL_DEPRECATED(message) __declspec(deprecated(message)) +#elif defined(__GNUC__) +#define GTEST_INTERNAL_DEPRECATED(message) __attribute__((deprecated(message))) +#else +#define GTEST_INTERNAL_DEPRECATED(message) +#endif + +#endif // !defined(GTEST_INTERNAL_DEPRECATED) + +#if GTEST_HAS_ABSL +// Always use absl::string_view for Matcher<> specializations if googletest +// is built with absl support. +#define GTEST_INTERNAL_HAS_STRING_VIEW 1 +#include "absl/strings/string_view.h" +namespace testing { +namespace internal { +using StringView = ::absl::string_view; +} // namespace internal +} // namespace testing +#else +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::string_view for Matcher<> +// specializations. +#define GTEST_INTERNAL_HAS_STRING_VIEW 1 +#include +namespace testing { +namespace internal { +using StringView = ::std::string_view; +} // namespace internal +} // namespace testing + // The case where absl is configured NOT to alias std::string_view is not + // supported. +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include +#endif // GTEST_HAS_ABSL + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h new file mode 100644 index 000000000..f1f933097 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h @@ -0,0 +1,171 @@ +// Copyright 2005, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file declares the String class and functions used internally by +// Google Test. They are subject to change without notice. They should not used +// by code external to Google Test. +// +// This header file is #included by gtest-internal.h. +// It should not be #included by other files. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ + +#ifdef __BORLANDC__ +// string.h is not guaranteed to provide strcpy on C++ Builder. +#include +#endif + +#include +#include +#include + +#include "gtest/internal/gtest-port.h" + +namespace testing { +namespace internal { + +// String - an abstract class holding static string utilities. +class GTEST_API_ String { + public: + // Static utility methods + + // Clones a 0-terminated C string, allocating memory using new. The + // caller is responsible for deleting the return value using + // delete[]. Returns the cloned string, or NULL if the input is + // NULL. + // + // This is different from strdup() in string.h, which allocates + // memory using malloc(). + static const char *CloneCString(const char *c_str); + +#if GTEST_OS_WINDOWS_MOBILE + // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be + // able to pass strings to Win32 APIs on CE we need to convert them + // to 'Unicode', UTF-16. + + // Creates a UTF-16 wide string from the given ANSI string, allocating + // memory using new. The caller is responsible for deleting the return + // value using delete[]. Returns the wide string, or NULL if the + // input is NULL. + // + // The wide string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static LPCWSTR AnsiToUtf16(const char *c_str); + + // Creates an ANSI string from the given wide string, allocating + // memory using new. 
The caller is responsible for deleting the return + // value using delete[]. Returns the ANSI string, or NULL if the + // input is NULL. + // + // The returned string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static const char *Utf16ToAnsi(LPCWSTR utf16_str); +#endif + + // Compares two C strings. Returns true if and only if they have the same + // content. + // + // Unlike strcmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CStringEquals(const char *lhs, const char *rhs); + + // Converts a wide C string to a String using the UTF-8 encoding. + // NULL will be converted to "(null)". If an error occurred during + // the conversion, "(failed to convert from wide string)" is + // returned. + static std::string ShowWideCString(const wchar_t *wide_c_str); + + // Compares two wide C strings. Returns true if and only if they have the + // same content. + // + // Unlike wcscmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool WideCStringEquals(const wchar_t *lhs, const wchar_t *rhs); + + // Compares two C strings, ignoring case. Returns true if and only if + // they have the same content. + // + // Unlike strcasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CaseInsensitiveCStringEquals(const char *lhs, const char *rhs); + + // Compares two wide C strings, ignoring case. Returns true if and only if + // they have the same content. + // + // Unlike wcscasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL wide C string, + // including the empty string. + // NB: The implementations on different platforms slightly differ. + // On windows, this method uses _wcsicmp which compares according to LC_CTYPE + // environment variable. On GNU platform this method uses wcscasecmp + // which compares according to LC_CTYPE category of the current locale. + // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the + // current locale. + static bool CaseInsensitiveWideCStringEquals(const wchar_t *lhs, + const wchar_t *rhs); + + // Returns true if and only if the given string ends with the given suffix, + // ignoring case. Any string is considered to end with an empty suffix. + static bool EndsWithCaseInsensitive(const std::string &str, + const std::string &suffix); + + // Formats an int value as "%02d". + static std::string FormatIntWidth2(int value); // "%02d" for width == 2 + + // Formats an int value as "%X". + static std::string FormatHexInt(int value); + + // Formats an int value as "%X". + static std::string FormatHexUInt32(uint32_t value); + + // Formats a byte as "%02X". + static std::string FormatByte(unsigned char value); + + private: + String(); // Not meant to be instantiated. +}; // class String + +// Gets the content of the stringstream's buffer as an std::string. Each '\0' +// character in the buffer is replaced with "\\0". 
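+// For example (illustrative):
+//   ::std::stringstream ss;
+//   ss << "a" << '\0' << "b";
+//   StringStreamToString(&ss);  // returns "a\\0b"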
+GTEST_API_ std::string StringStreamToString(::std::stringstream *stream); + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h new file mode 100644 index 000000000..3b3a651dc --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h @@ -0,0 +1,184 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Type utilities needed for implementing typed and type-parameterized +// tests. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ + +#include "gtest/internal/gtest-port.h" + +// #ifdef __GNUC__ is too general here. It is possible to use gcc without using +// libstdc++ (which is where cxxabi.h comes from). +#if GTEST_HAS_CXXABI_H_ +#include +#elif defined(__HP_aCC) +#include +#endif // GTEST_HASH_CXXABI_H_ + +namespace testing { +namespace internal { + +// Canonicalizes a given name with respect to the Standard C++ Library. +// This handles removing the inline namespace within `std` that is +// used by various standard libraries (e.g., `std::__1`). Names outside +// of namespace std are returned unmodified. +inline std::string CanonicalizeForStdLibVersioning(std::string s) { + static const char prefix[] = "std::__"; + if (s.compare(0, strlen(prefix), prefix) == 0) { + std::string::size_type end = s.find("::", strlen(prefix)); + if (end != s.npos) { + // Erase everything between the initial `std` and the second `::`. + s.erase(strlen("std"), end - strlen("std")); + } + } + return s; +} + +// GetTypeName() returns a human-readable name of type T. +// NB: This function is also used in Google Mock, so don't move it inside of +// the typed-test-only section below. 
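+//
+// For example (illustrative): GetTypeName<int>() typically returns "int" when
+// RTTI is enabled, and the placeholder "<type>" when it is not.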
+template <typename T>
+std::string GetTypeName() {
+#if GTEST_HAS_RTTI
+
+  const char *const name = typeid(T).name();
+#if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
+  int status = 0;
+  // gcc's implementation of typeid(T).name() mangles the type name,
+  // so we have to demangle it.
+#if GTEST_HAS_CXXABI_H_
+  using abi::__cxa_demangle;
+#endif  // GTEST_HAS_CXXABI_H_
+  char *const readable_name = __cxa_demangle(name, nullptr, nullptr, &status);
+  const std::string name_str(status == 0 ? readable_name : name);
+  free(readable_name);
+  return CanonicalizeForStdLibVersioning(name_str);
+#else
+  return name;
+#endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
+
+#else
+
+  return "<type>";
+
+#endif  // GTEST_HAS_RTTI
+}
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// A unique type indicating an empty node
+struct None {};
+
+#define GTEST_TEMPLATE_ \
+  template <typename T> \
+  class
+
+// The template "selector" struct TemplateSel<Tmpl> is used to
+// represent Tmpl, which must be a class template with one type
+// parameter, as a type. TemplateSel<Tmpl>::Bind<T>::type is defined
+// as the type Tmpl<T>. This allows us to actually instantiate the
+// template "selected" by TemplateSel<Tmpl>.
+//
+// This trick is necessary for simulating typedef for class templates,
+// which C++ doesn't support directly.
+template <GTEST_TEMPLATE_ Tmpl>
+struct TemplateSel {
+  template <typename T>
+  struct Bind {
+    typedef Tmpl<T> type;
+  };
+};
+
+#define GTEST_BIND_(TmplSel, T) TmplSel::template Bind<T>::type
+
+template <GTEST_TEMPLATE_ Head_, GTEST_TEMPLATE_... Tail_>
+struct Templates {
+  using Head = TemplateSel<Head_>;
+  using Tail = Templates<Tail_...>;
+};
+
+template <GTEST_TEMPLATE_ Head_>
+struct Templates<Head_> {
+  using Head = TemplateSel<Head_>;
+  using Tail = None;
+};
+
+// Tuple-like type lists
+template <typename Head_, typename... Tail_>
+struct Types {
+  using Head = Head_;
+  using Tail = Types<Tail_...>;
+};
+
+template <typename Head_>
+struct Types<Head_> {
+  using Head = Head_;
+  using Tail = None;
+};
+
+// Helper metafunctions to tell apart a single type from types
+// generated by ::testing::Types
+template <typename... Ts>
+struct ProxyTypeList {
+  using type = Types<Ts...>;
+};
+
+template <typename>
+struct is_proxy_type_list : std::false_type {};
+
+template <typename... Ts>
+struct is_proxy_type_list<ProxyTypeList<Ts...>> : std::true_type {};
+
+// Generator which conditionally creates type lists.
+// It recognizes if a requested type list should be created
+// and prevents creating a new type list nested within another one.
+template <typename T>
+struct GenerateTypeList {
+ private:
+  using proxy = typename std::conditional<is_proxy_type_list<T>::value, T,
+                                          ProxyTypeList<T>>::type;
+
+ public:
+  using type = typename proxy::type;
+};
+
+#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+}  // namespace internal
+
+template <typename... Ts>
+using Types = internal::ProxyTypeList<Ts...>;
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-all.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-all.cc
new file mode 100644
index 000000000..ad292905c
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-all.cc
@@ -0,0 +1,48 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// Google C++ Testing and Mocking Framework (Google Test) +// +// Sometimes it's desirable to build Google Test by compiling a single file. +// This file serves this purpose. + +// This line ensures that gtest.h can be compiled on its own, even +// when it's fused. +#include "gtest/gtest.h" + +// The following lines pull in the real gtest *.cc files. +#include "src/gtest.cc" +#include "src/gtest-death-test.cc" +#include "src/gtest-filepath.cc" +#include "src/gtest-matchers.cc" +#include "src/gtest-port.cc" +#include "src/gtest-printers.cc" +#include "src/gtest-test-part.cc" +#include "src/gtest-typed-test.cc" diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-death-test.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-death-test.cc new file mode 100644 index 000000000..c38551cda --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-death-test.cc @@ -0,0 +1,1614 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
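Aside, not part of the upstream file: the implementation below is what runs when a test uses the public death-test assertions. A minimal usage sketch; standard googletest API, with invented test and helper names:

    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include "gtest/gtest.h"

    static void Crash() {
      fprintf(stderr, "fatal: invariant violated\n");
      abort();
    }

    TEST(MyDeathTest, CrashesWithMessage) {
      // The statement runs in a child process; stderr must match the regex.
      EXPECT_DEATH(Crash(), "invariant violated");
      // Predicates give finer control over the expected exit status.
      EXPECT_EXIT(exit(2), ::testing::ExitedWithCode(2), "");
    #if !defined(_WIN32)
      EXPECT_EXIT(Crash(), ::testing::KilledBySignal(SIGABRT), "invariant");
    #endif
    }

The file that follows implements the parent/child choreography behind these assertions.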
+
+//
+// This file implements death tests.
+
+#include "gtest/gtest-death-test.h"
+
+#include <utility>
+
+#include "gtest/internal/gtest-port.h"
+#include "gtest/internal/custom/gtest.h"
+
+#if GTEST_HAS_DEATH_TEST
+
+#if GTEST_OS_MAC
+#include <crt_externs.h>
+#endif  // GTEST_OS_MAC
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+
+#if GTEST_OS_LINUX
+#include <signal.h>
+#endif  // GTEST_OS_LINUX
+
+#include <stdarg.h>
+
+#if GTEST_OS_WINDOWS
+#include <windows.h>
+#else
+#include <sys/mman.h>
+#include <sys/wait.h>
+#endif  // GTEST_OS_WINDOWS
+
+#if GTEST_OS_QNX
+#include <spawn.h>
+#endif  // GTEST_OS_QNX
+
+#if GTEST_OS_FUCHSIA
+#include <lib/fdio/fd.h>
+#include <lib/fdio/io.h>
+#include <lib/fdio/spawn.h>
+#include <lib/zx/channel.h>
+#include <lib/zx/port.h>
+#include <lib/zx/process.h>
+#include <lib/zx/socket.h>
+#include <zircon/processargs.h>
+#include <zircon/syscalls.h>
+#include <zircon/syscalls/policy.h>
+#include <zircon/syscalls/port.h>
+#endif  // GTEST_OS_FUCHSIA
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-string.h"
+#include "src/gtest-internal-inl.h"
+
+namespace testing {
+
+// Constants.
+
+// The default death test style.
+//
+// This is defined in internal/gtest-port.h as "fast", but can be overridden by
+// a definition in internal/custom/gtest-port.h. The recommended value, which is
+// used internally at Google, is "threadsafe".
+static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE;
+
+GTEST_DEFINE_string_(
+    death_test_style,
+    internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
+    "Indicates how to run a death test in a forked child process: "
+    "\"threadsafe\" (child process re-executes the test binary "
+    "from the beginning, running only the specific death test) or "
+    "\"fast\" (child process runs the death test immediately "
+    "after forking).");
+
+GTEST_DEFINE_bool_(
+    death_test_use_fork,
+    internal::BoolFromGTestEnv("death_test_use_fork", false),
+    "Instructs to use fork()/_exit() instead of clone() in death tests. "
+    "Ignored and always uses fork() on POSIX systems where clone() is not "
+    "implemented. Useful when running under valgrind or similar tools if "
+    "those do not support clone(). Valgrind 3.3.1 will just fail if "
+    "it sees an unsupported combination of clone() flags. "
+    "It is not recommended to use this flag w/o valgrind though it will "
+    "work in 99% of the cases. Once valgrind is fixed, this flag will "
+    "most likely be removed.");
+
+namespace internal {
+GTEST_DEFINE_string_(
+    internal_run_death_test, "",
+    "Indicates the file, line number, temporal index of "
+    "the single death test to run, and a file descriptor to "
+    "which a success code may be sent, all separated by "
+    "the '|' characters. This flag is specified if and only if the "
+    "current process is a sub-process launched for running a thread-safe "
+    "death test. FOR INTERNAL USE ONLY.");
+}  // namespace internal
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Valid only for fast death tests. Indicates the code is running in the
+// child process of a fast style death test.
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+static bool g_in_fast_death_test_child = false;
+#endif
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process. Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests. IMPORTANT: This is an internal utility. Using it may break the
+// implementation of death tests. User code MUST NOT use it.
+bool InDeathTestChild() {
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+
+  // On Windows and Fuchsia, death tests are thread-safe regardless of the
+  // value of the death_test_style flag.
+ return !GTEST_FLAG(internal_run_death_test).empty(); + +#else + + if (GTEST_FLAG(death_test_style) == "threadsafe") + return !GTEST_FLAG(internal_run_death_test).empty(); + else + return g_in_fast_death_test_child; +#endif +} + +} // namespace internal + +// ExitedWithCode constructor. +ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {} + +// ExitedWithCode function-call operator. +bool ExitedWithCode::operator()(int exit_status) const { +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA + + return exit_status == exit_code_; + +#else + + return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; + +#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +} + +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +// KilledBySignal constructor. +KilledBySignal::KilledBySignal(int signum) : signum_(signum) {} + +// KilledBySignal function-call operator. +bool KilledBySignal::operator()(int exit_status) const { +#if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) + { + bool result; + if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) { + return result; + } + } +#endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) + return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; +} +#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA + +namespace internal { + +// Utilities needed for death tests. + +// Generates a textual description of a given exit code, in the format +// specified by wait(2). +static std::string ExitSummary(int exit_code) { + Message m; + +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA + + m << "Exited with exit status " << exit_code; + +#else + + if (WIFEXITED(exit_code)) { + m << "Exited with exit status " << WEXITSTATUS(exit_code); + } else if (WIFSIGNALED(exit_code)) { + m << "Terminated by signal " << WTERMSIG(exit_code); + } +#ifdef WCOREDUMP + if (WCOREDUMP(exit_code)) { + m << " (core dumped)"; + } +#endif +#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA + + return m.GetString(); +} + +// Returns true if exit_status describes a process that was terminated +// by a signal, or exited normally with a nonzero exit code. +bool ExitedUnsuccessfully(int exit_status) { + return !ExitedWithCode(0)(exit_status); +} + +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +// Generates a textual failure message when a death test finds more than +// one thread running, or cannot determine the number of threads, prior +// to executing the given statement. It is the responsibility of the +// caller not to pass a thread_count of 1. +static std::string DeathTestThreadWarning(size_t thread_count) { + Message msg; + msg << "Death tests use fork(), which is unsafe particularly" + << " in a threaded context. For this test, " << GTEST_NAME_ << " "; + if (thread_count == 0) { + msg << "couldn't detect the number of threads."; + } else { + msg << "detected " << thread_count << " threads."; + } + msg << " See " + "https://github.com/google/googletest/blob/master/googletest/docs/" + "advanced.md#death-tests-and-threads" + << " for more explanation and suggested solutions, especially if" + << " this is the last message you see before your test times out."; + return msg.GetString(); +} +#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA + +// Flag characters for reporting a death test that did not die. +static const char kDeathTestLived = 'L'; +static const char kDeathTestReturned = 'R'; +static const char kDeathTestThrew = 'T'; +static const char kDeathTestInternalError = 'I'; + +#if GTEST_OS_FUCHSIA + +// File descriptor used for the pipe in the child process. 
+static const int kFuchsiaReadPipeFd = 3; + +#endif + +// An enumeration describing all of the possible ways that a death test can +// conclude. DIED means that the process died while executing the test +// code; LIVED means that process lived beyond the end of the test code; +// RETURNED means that the test statement attempted to execute a return +// statement, which is not allowed; THREW means that the test statement +// returned control by throwing an exception. IN_PROGRESS means the test +// has not yet concluded. +enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW }; + +// Routine for aborting the program which is safe to call from an +// exec-style death test child process, in which case the error +// message is propagated back to the parent process. Otherwise, the +// message is simply printed to stderr. In either case, the program +// then exits with status 1. +static void DeathTestAbort(const std::string &message) { + // On a POSIX system, this function may be called from a threadsafe-style + // death test child process, which operates on a very small stack. Use + // the heap for any additional non-minuscule memory requirements. + const InternalRunDeathTestFlag *const flag = + GetUnitTestImpl()->internal_run_death_test_flag(); + if (flag != nullptr) { + FILE *parent = posix::FDOpen(flag->write_fd(), "w"); + fputc(kDeathTestInternalError, parent); + fprintf(parent, "%s", message.c_str()); + fflush(parent); + _exit(1); + } else { + fprintf(stderr, "%s", message.c_str()); + fflush(stderr); + posix::Abort(); + } +} + +// A replacement for CHECK that calls DeathTestAbort if the assertion +// fails. +#define GTEST_DEATH_TEST_CHECK_(expression) \ + do { \ + if (!::testing::internal::IsTrue(expression)) { \ + DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \ + ", line " + \ + ::testing::internal::StreamableToString(__LINE__) + \ + ": " + #expression); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for +// evaluating any system call that fulfills two conditions: it must return +// -1 on failure, and set errno to EINTR when it is interrupted and +// should be tried again. The macro expands to a loop that repeatedly +// evaluates the expression as long as it evaluates to -1 and sets +// errno to EINTR. If the expression evaluates to -1 but errno is +// something other than EINTR, DeathTestAbort is called. +#define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ + do { \ + int gtest_retval; \ + do { \ + gtest_retval = (expression); \ + } while (gtest_retval == -1 && errno == EINTR); \ + if (gtest_retval == -1) { \ + DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \ + ", line " + \ + ::testing::internal::StreamableToString(__LINE__) + \ + ": " + #expression + " != -1"); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// Returns the message describing the last system error in errno. +std::string GetLastErrnoDescription() { + return errno == 0 ? "" : posix::StrError(errno); +} + +// This is called from a death test parent process to read a failure +// message from the death test child process and log it with the FATAL +// severity. On Windows, the message is read from a pipe handle. On other +// platforms, it is read from a file descriptor. 
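Aside, not part of the upstream file: GTEST_DEATH_TEST_CHECK_SYSCALL_ above encodes the standard POSIX retry-on-EINTR idiom. Outside a macro it reduces to a loop like this; retry_read is an invented name:

    #include <errno.h>
    #include <unistd.h>

    // Retries a read() interrupted by a signal; any other failure (-1 with
    // errno != EINTR) is returned to the caller unchanged.
    static ssize_t retry_read(int fd, void *buf, size_t count) {
      ssize_t result;
      do {
        result = read(fd, buf, count);
      } while (result == -1 && errno == EINTR);
      return result;
    }

FailFromInternalError(), next, wraps the same pattern around posix::Read() while collecting a child's error message.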
+static void FailFromInternalError(int fd) {
+  Message error;
+  char buffer[256];
+  int num_read;
+
+  do {
+    while ((num_read = posix::Read(fd, buffer, 255)) > 0) {
+      buffer[num_read] = '\0';
+      error << buffer;
+    }
+  } while (num_read == -1 && errno == EINTR);
+
+  if (num_read == 0) {
+    GTEST_LOG_(FATAL) << error.GetString();
+  } else {
+    const int last_error = errno;
+    GTEST_LOG_(FATAL) << "Error while reading death test internal: "
+                      << GetLastErrnoDescription() << " [" << last_error << "]";
+  }
+}
+
+// Death test constructor. Increments the running death test count
+// for the current test.
+DeathTest::DeathTest() {
+  TestInfo *const info = GetUnitTestImpl()->current_test_info();
+  if (info == nullptr) {
+    DeathTestAbort(
+        "Cannot run a death test outside of a TEST or "
+        "TEST_F construct");
+  }
+}
+
+// Creates and returns a death test by dispatching to the current
+// death test factory.
+bool DeathTest::Create(const char *statement,
+                       Matcher<const std::string &> matcher, const char *file,
+                       int line, DeathTest **test) {
+  return GetUnitTestImpl()->death_test_factory()->Create(
+      statement, std::move(matcher), file, line, test);
+}
+
+const char *DeathTest::LastMessage() {
+  return last_death_test_message_.c_str();
+}
+
+void DeathTest::set_last_death_test_message(const std::string &message) {
+  last_death_test_message_ = message;
+}
+
+std::string DeathTest::last_death_test_message_;
+
+// Provides cross platform implementation for some death functionality.
+class DeathTestImpl : public DeathTest {
+ protected:
+  DeathTestImpl(const char *a_statement, Matcher<const std::string &> matcher)
+      : statement_(a_statement), matcher_(std::move(matcher)), spawned_(false),
+        status_(-1), outcome_(IN_PROGRESS), read_fd_(-1), write_fd_(-1) {}
+
+  // read_fd_ is expected to be closed and cleared by a derived class.
+  ~DeathTestImpl() override { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }
+
+  void Abort(AbortReason reason) override;
+  bool Passed(bool status_ok) override;
+
+  const char *statement() const { return statement_; }
+  bool spawned() const { return spawned_; }
+  void set_spawned(bool is_spawned) { spawned_ = is_spawned; }
+  int status() const { return status_; }
+  void set_status(int a_status) { status_ = a_status; }
+  DeathTestOutcome outcome() const { return outcome_; }
+  void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; }
+  int read_fd() const { return read_fd_; }
+  void set_read_fd(int fd) { read_fd_ = fd; }
+  int write_fd() const { return write_fd_; }
+  void set_write_fd(int fd) { write_fd_ = fd; }
+
+  // Called in the parent process only. Reads the result code of the death
+  // test child process via a pipe, interprets it to set the outcome_
+  // member, and closes read_fd_. Outputs diagnostics and terminates in
+  // case of unexpected codes.
+  void ReadAndInterpretStatusByte();
+
+  // Returns stderr output from the child process.
+  virtual std::string GetErrorLogs();
+
+ private:
+  // The textual content of the code this object is testing. This class
+  // doesn't own this string and should not attempt to delete it.
+  const char *const statement_;
+  // A matcher that's expected to match the stderr output by the child process.
+  Matcher<const std::string &> matcher_;
+  // True if the death test child process has been successfully spawned.
+  bool spawned_;
+  // The exit status of the child process.
+  int status_;
+  // How the death test concluded.
+  DeathTestOutcome outcome_;
+  // Descriptor to the read end of the pipe to the child process. It is
+  // always -1 in the child process. The child keeps its write end of the
+  // pipe in write_fd_.
+  int read_fd_;
+  // Descriptor to the child's write end of the pipe to the parent process.
+  // It is always -1 in the parent process. The parent keeps its end of the
+  // pipe in read_fd_.
+  int write_fd_;
+};
+
+// Called in the parent process only. Reads the result code of the death
+// test child process via a pipe, interprets it to set the outcome_
+// member, and closes read_fd_. Outputs diagnostics and terminates in
+// case of unexpected codes.
+void DeathTestImpl::ReadAndInterpretStatusByte() {
+  char flag;
+  int bytes_read;
+
+  // The read() here blocks until data is available (signifying the
+  // failure of the death test) or until the pipe is closed (signifying
+  // its success), so it's okay to call this in the parent before
+  // the child process has exited.
+  do {
+    bytes_read = posix::Read(read_fd(), &flag, 1);
+  } while (bytes_read == -1 && errno == EINTR);
+
+  if (bytes_read == 0) {
+    set_outcome(DIED);
+  } else if (bytes_read == 1) {
+    switch (flag) {
+      case kDeathTestReturned: set_outcome(RETURNED); break;
+      case kDeathTestThrew: set_outcome(THREW); break;
+      case kDeathTestLived: set_outcome(LIVED); break;
+      case kDeathTestInternalError:
+        FailFromInternalError(read_fd());  // Does not return.
+        break;
+      default:
+        GTEST_LOG_(FATAL) << "Death test child process reported "
+                          << "unexpected status byte ("
+                          << static_cast<unsigned int>(flag) << ")";
+    }
+  } else {
+    GTEST_LOG_(FATAL) << "Read from death test child process failed: "
+                      << GetLastErrnoDescription();
+  }
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd()));
+  set_read_fd(-1);
+}
+
+std::string DeathTestImpl::GetErrorLogs() { return GetCapturedStderr(); }
+
+// Signals that the death test code which should have exited, didn't.
+// Should be called only in a death test child process.
+// Writes a status byte to the child's status file descriptor, then
+// calls _exit(1).
+void DeathTestImpl::Abort(AbortReason reason) {
+  // The parent process considers the death test to be a failure if
+  // it finds any data in our pipe. So, here we write a single flag byte
+  // to the pipe, then exit.
+  const char status_ch = reason == TEST_DID_NOT_DIE
+                             ? kDeathTestLived
+                             : reason == TEST_THREW_EXCEPTION
+                                   ? kDeathTestThrew
+                                   : kDeathTestReturned;
+
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
+  // We are leaking the descriptor here because on some platforms (i.e.,
+  // when built as Windows DLL), destructors of global objects will still
+  // run after calling _exit(). On such systems, write_fd_ will be
+  // indirectly closed from the destructor of UnitTestImpl, causing double
+  // close if it is also closed here. On debug configurations, double close
+  // may assert. As there are no in-process buffers to flush here, we are
+  // relying on the OS to close the descriptor after the process terminates
+  // when the destructors are not run.
+  _exit(1);  // Exits w/o any normal exit hooks (we were supposed to crash)
+}
+
+// Returns an indented copy of stderr output for a death test.
+// This makes distinguishing death test output lines from regular log lines
+// much easier.
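Aside, not part of the upstream file: ReadAndInterpretStatusByte() and Abort() above define a one-byte protocol over the pipe. Reading EOF means the child died as expected, while 'L', 'R', 'T' or 'I' reports that it lived, returned, threw, or hit an internal error. A condensed child-side sketch; write_fd and the helper name are illustrative:

    #include <unistd.h>

    // Any byte in the pipe marks a failed death test; 'L' mirrors the
    // kDeathTestLived flag character defined earlier.
    static void ReportDidNotDie(int write_fd) {
      const char status_ch = 'L';
      write(write_fd, &status_ch, 1);
      _exit(1);  // Skip normal exit hooks; the statement was supposed to die.
    }

FormatDeathTestOutput(), next, indents each line of the captured stderr for reporting.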
+static ::std::string FormatDeathTestOutput(const ::std::string &output) {
+  ::std::string ret;
+  for (size_t at = 0;;) {
+    const size_t line_end = output.find('\n', at);
+    ret += "[  DEATH   ] ";
+    if (line_end == ::std::string::npos) {
+      ret += output.substr(at);
+      break;
+    }
+    ret += output.substr(at, line_end + 1 - at);
+    at = line_end + 1;
+  }
+  return ret;
+}
+
+// Assesses the success or failure of a death test, using both private
+// members which have previously been set, and one argument:
+//
+// Private data members:
+//   outcome:  An enumeration describing how the death test
+//             concluded: DIED, LIVED, THREW, or RETURNED. The death test
+//             fails in the latter three cases.
+//   status:   The exit status of the child process. On *nix, it is in the
+//             format specified by wait(2). On Windows, this is the
+//             value supplied to the ExitProcess() API or a numeric code
+//             of the exception that terminated the program.
+//   matcher_: A matcher that's expected to match the stderr output by the
+//             child process.
+//
+// Argument:
+//   status_ok: true if exit_status is acceptable in the context of
+//              this particular death test, which fails if it is false
+//
+// Returns true if and only if all of the above conditions are met. Otherwise,
+// the first failing condition, in the order given above, is the one that is
+// reported. Also sets the last death test message string.
+bool DeathTestImpl::Passed(bool status_ok) {
+  if (!spawned()) return false;
+
+  const std::string error_message = GetErrorLogs();
+
+  bool success = false;
+  Message buffer;
+
+  buffer << "Death test: " << statement() << "\n";
+  switch (outcome()) {
+    case LIVED:
+      buffer << "    Result: failed to die.\n"
+             << " Error msg:\n"
+             << FormatDeathTestOutput(error_message);
+      break;
+    case THREW:
+      buffer << "    Result: threw an exception.\n"
+             << " Error msg:\n"
+             << FormatDeathTestOutput(error_message);
+      break;
+    case RETURNED:
+      buffer << "    Result: illegal return in test statement.\n"
+             << " Error msg:\n"
+             << FormatDeathTestOutput(error_message);
+      break;
+    case DIED:
+      if (status_ok) {
+        if (matcher_.Matches(error_message)) {
+          success = true;
+        } else {
+          std::ostringstream stream;
+          matcher_.DescribeTo(&stream);
+          buffer << "    Result: died but not with expected error.\n"
+                 << "  Expected: " << stream.str() << "\n"
+                 << "Actual msg:\n"
+                 << FormatDeathTestOutput(error_message);
+        }
+      } else {
+        buffer << "    Result: died but not with expected exit code:\n"
+               << "            " << ExitSummary(status()) << "\n"
+               << "Actual msg:\n"
+               << FormatDeathTestOutput(error_message);
+      }
+      break;
+    case IN_PROGRESS:
+    default:
+      GTEST_LOG_(FATAL)
+          << "DeathTest::Passed somehow called before conclusion of test";
+  }
+
+  DeathTest::set_last_death_test_message(buffer.GetString());
+  return success;
+}
+
+#if GTEST_OS_WINDOWS
+// WindowsDeathTest implements death tests on Windows. Due to the
+// specifics of starting new processes on Windows, death tests there are
+// always threadsafe, and Google Test considers the
+// --gtest_death_test_style=fast setting to be equivalent to
+// --gtest_death_test_style=threadsafe there.
+//
+// A few implementation notes: Like the Linux version, the Windows
+// implementation uses pipes for child-to-parent communication. But due to
+// the specifics of pipes on Windows, some extra steps are required:
+//
+// 1. The parent creates a communication pipe and stores handles to both
+//    ends of it.
+// 2. The parent starts the child and provides it with the information
+//    necessary to acquire the handle to the write end of the pipe.
+// 3. The child acquires the write end of the pipe and signals the parent
+//    using a Windows event.
+// 4. Now the parent can release the write end of the pipe on its side. If
+//    this is done before step 3, the object's reference count goes down to
+//    0 and it is destroyed, preventing the child from acquiring it. The
+//    parent now has to release it, or read operations on the read end of
+//    the pipe will not return when the child terminates.
+// 5. The parent reads the child's output through the pipe (outcome code and
+//    any possible error messages), and its stderr, and then determines
+//    whether to fail the test.
+//
+// Note: to distinguish Win32 API calls from the local method and function
+// calls, the former are explicitly resolved in the global namespace.
+//
+class WindowsDeathTest : public DeathTestImpl {
+ public:
+  WindowsDeathTest(const char *a_statement,
+                   Matcher<const std::string &> matcher, const char *file,
+                   int line)
+      : DeathTestImpl(a_statement, std::move(matcher)), file_(file),
+        line_(line) {}
+
+  // All of these virtual functions are inherited from DeathTest.
+  virtual int Wait();
+  virtual TestRole AssumeRole();
+
+ private:
+  // The name of the file in which the death test is located.
+  const char *const file_;
+  // The line number on which the death test is located.
+  const int line_;
+  // Handle to the write end of the pipe to the child process.
+  AutoHandle write_handle_;
+  // Child process handle.
+  AutoHandle child_handle_;
+  // Event the child process uses to signal the parent that it has
+  // acquired the handle to the write end of the pipe. After seeing this
+  // event the parent can release its own handles to make sure its
+  // ReadFile() calls return when the child terminates.
+  AutoHandle event_handle_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists. As a side effect, sets the
+// outcome data member.
+int WindowsDeathTest::Wait() {
+  if (!spawned()) return 0;
+
+  // Wait until the child either signals that it has acquired the write end
+  // of the pipe or it dies.
+  const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
+  switch (::WaitForMultipleObjects(2, wait_handles,
+                                   FALSE,  // Waits for any of the handles.
+                                   INFINITE)) {
+    case WAIT_OBJECT_0:
+    case WAIT_OBJECT_0 + 1: break;
+    default: GTEST_DEATH_TEST_CHECK_(false);  // Should not get here.
+  }
+
+  // The child has acquired the write end of the pipe or exited.
+  // We release the handle on our side and continue.
+  write_handle_.Reset();
+  event_handle_.Reset();
+
+  ReadAndInterpretStatusByte();
+
+  // Waits for the child process to exit if it hasn't already. This
+  // returns immediately if the child has already exited, regardless of
+  // whether previous calls to WaitForMultipleObjects synchronized on this
+  // handle or not.
+  GTEST_DEATH_TEST_CHECK_(WAIT_OBJECT_0 ==
+                          ::WaitForSingleObject(child_handle_.Get(), INFINITE));
+  DWORD status_code;
+  GTEST_DEATH_TEST_CHECK_(
+      ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
+  child_handle_.Reset();
+  set_status(static_cast<int>(status_code));
+  return status();
+}
+
+// The AssumeRole process for a Windows death test. It creates a child
+// process with the same executable as the current process to run the
+// death test. The child process is given the --gtest_filter and
+// --gtest_internal_run_death_test flags such that it knows to run the
+// current death test only.
+DeathTest::TestRole WindowsDeathTest::AssumeRole() {
+  const UnitTestImpl *const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag *const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo *const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != nullptr) {
+    // ParseInternalRunDeathTestFlag() has performed all the necessary
+    // processing.
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  // WindowsDeathTest uses an anonymous pipe to communicate results of
+  // a death test.
+  SECURITY_ATTRIBUTES handles_are_inheritable = { sizeof(SECURITY_ATTRIBUTES),
+                                                  nullptr, TRUE };
+  HANDLE read_handle, write_handle;
+  GTEST_DEATH_TEST_CHECK_(::CreatePipe(&read_handle, &write_handle,
+                                       &handles_are_inheritable,
+                                       0)  // Default buffer size.
+                          != FALSE);
+  set_read_fd(
+      ::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle), O_RDONLY));
+  write_handle_.Reset(write_handle);
+  event_handle_.Reset(::CreateEvent(
+      &handles_are_inheritable,
+      TRUE,       // The event will automatically reset to non-signaled state.
+      FALSE,      // The initial state is non-signalled.
+      nullptr));  // The event is unnamed.
+  GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != nullptr);
+  const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                  kFilterFlag + "=" + info->test_suite_name() +
+                                  "." + info->name();
+  const std::string internal_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" +
+      file_ + "|" + StreamableToString(line_) + "|" +
+      StreamableToString(death_test_index) + "|" +
+      StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
+      // size_t has the same width as pointers on both 32-bit and 64-bit
+      // Windows platforms.
+      // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
+      "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) + "|" +
+      StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
+
+  char executable_path[_MAX_PATH + 1];  // NOLINT
+  GTEST_DEATH_TEST_CHECK_(_MAX_PATH + 1 != ::GetModuleFileNameA(nullptr,
+                                                                executable_path,
+                                                                _MAX_PATH));
+
+  std::string command_line = std::string(::GetCommandLineA()) + " " +
+                             filter_flag + " \"" + internal_flag + "\"";
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // Flush the log buffers since the log streams are shared with the child.
+  FlushInfoLog();
+
+  // The child process will share the standard handles with the parent.
+  STARTUPINFOA startup_info;
+  memset(&startup_info, 0, sizeof(STARTUPINFO));
+  startup_info.dwFlags = STARTF_USESTDHANDLES;
+  startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE);
+  startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE);
+  startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE);
+
+  PROCESS_INFORMATION process_info;
+  GTEST_DEATH_TEST_CHECK_(
+      ::CreateProcessA(
+          executable_path, const_cast<char *>(command_line.c_str()),
+          nullptr,  // Returned process handle is not inheritable.
+          nullptr,  // Returned thread handle is not inheritable.
+          TRUE,  // Child inherits all inheritable handles (for write_handle_).
+          0x0,   // Default creation flags.
+          nullptr,  // Inherit the parent's environment.
+          UnitTest::GetInstance()->original_working_dir(), &startup_info,
+          &process_info) != FALSE);
+  child_handle_.Reset(process_info.hProcess);
+  ::CloseHandle(process_info.hThread);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+#elif GTEST_OS_FUCHSIA
+
+class FuchsiaDeathTest : public DeathTestImpl {
+ public:
+  FuchsiaDeathTest(const char *a_statement,
+                   Matcher<const std::string &> matcher, const char *file,
+                   int line)
+      : DeathTestImpl(a_statement, std::move(matcher)), file_(file),
+        line_(line) {}
+
+  // All of these virtual functions are inherited from DeathTest.
+  int Wait() override;
+  TestRole AssumeRole() override;
+  std::string GetErrorLogs() override;
+
+ private:
+  // The name of the file in which the death test is located.
+  const char *const file_;
+  // The line number on which the death test is located.
+  const int line_;
+  // The stderr data captured by the child process.
+  std::string captured_stderr_;
+
+  zx::process child_process_;
+  zx::channel exception_channel_;
+  zx::socket stderr_socket_;
+};
+
+// Utility class for accumulating command-line arguments.
+class Arguments {
+ public:
+  Arguments() { args_.push_back(nullptr); }
+
+  ~Arguments() {
+    for (std::vector<char *>::iterator i = args_.begin(); i != args_.end();
+         ++i) {
+      free(*i);
+    }
+  }
+  void AddArgument(const char *argument) {
+    args_.insert(args_.end() - 1, posix::StrDup(argument));
+  }
+
+  template <typename Str>
+  void AddArguments(const ::std::vector<Str> &arguments) {
+    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
+         i != arguments.end(); ++i) {
+      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
+    }
+  }
+  char *const *Argv() { return &args_[0]; }
+
+  int size() { return args_.size() - 1; }
+
+ private:
+  std::vector<char *> args_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists. As a side effect, sets the
+// outcome data member.
+int FuchsiaDeathTest::Wait() {
+  const int kProcessKey = 0;
+  const int kSocketKey = 1;
+  const int kExceptionKey = 2;
+
+  if (!spawned()) return 0;
+
+  // Create a port to wait for socket/task/exception events.
+  zx_status_t status_zx;
+  zx::port port;
+  status_zx = zx::port::create(0, &port);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  // Register to wait for the child process to terminate.
+  status_zx = child_process_.wait_async(
+      port, kProcessKey, ZX_PROCESS_TERMINATED, ZX_WAIT_ASYNC_ONCE);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  // Register to wait for the socket to be readable or closed.
+  status_zx = stderr_socket_.wait_async(
+      port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED,
+      ZX_WAIT_ASYNC_ONCE);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  // Register to wait for an exception.
+  status_zx = exception_channel_.wait_async(
+      port, kExceptionKey, ZX_CHANNEL_READABLE, ZX_WAIT_ASYNC_ONCE);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  bool process_terminated = false;
+  bool socket_closed = false;
+  do {
+    zx_port_packet_t packet = {};
+    status_zx = port.wait(zx::time::infinite(), &packet);
+    GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+    if (packet.key == kExceptionKey) {
+      // Process encountered an exception. Kill it directly rather than
+      // letting other handlers process the event. We will get a kProcessKey
+      // event when the process actually terminates.
+      status_zx = child_process_.kill();
+      GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+    } else if (packet.key == kProcessKey) {
+      // Process terminated.
+ GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type)); + GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_PROCESS_TERMINATED); + process_terminated = true; + } else if (packet.key == kSocketKey) { + GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type)); + if (packet.signal.observed & ZX_SOCKET_READABLE) { + // Read data from the socket. + constexpr size_t kBufferSize = 1024; + do { + size_t old_length = captured_stderr_.length(); + size_t bytes_read = 0; + captured_stderr_.resize(old_length + kBufferSize); + status_zx = + stderr_socket_.read(0, &captured_stderr_.front() + old_length, + kBufferSize, &bytes_read); + captured_stderr_.resize(old_length + bytes_read); + } while (status_zx == ZX_OK); + if (status_zx == ZX_ERR_PEER_CLOSED) { + socket_closed = true; + } else { + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_ERR_SHOULD_WAIT); + status_zx = stderr_socket_.wait_async( + port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, + ZX_WAIT_ASYNC_ONCE); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + } + } else { + GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_SOCKET_PEER_CLOSED); + socket_closed = true; + } + } + } while (!process_terminated && !socket_closed); + + ReadAndInterpretStatusByte(); + + zx_info_process_t buffer; + status_zx = child_process_.get_info(ZX_INFO_PROCESS, &buffer, sizeof(buffer), + nullptr, nullptr); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + GTEST_DEATH_TEST_CHECK_(buffer.exited); + set_status(buffer.return_code); + return status(); +} + +// The AssumeRole process for a Fuchsia death test. It creates a child +// process with the same executable as the current process to run the +// death test. The child process is given the --gtest_filter and +// --gtest_internal_run_death_test flags such that it knows to run the +// current death test only. +DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { + const UnitTestImpl *const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag *const flag = + impl->internal_run_death_test_flag(); + const TestInfo *const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != nullptr) { + // ParseInternalRunDeathTestFlag() has performed all the necessary + // processing. + set_write_fd(kFuchsiaReadPipeFd); + return EXECUTE_TEST; + } + + // Flush the log buffers since the log streams are shared with the child. + FlushInfoLog(); + + // Build the child process command line. + const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + kFilterFlag + "=" + info->test_suite_name() + + "." + info->name(); + const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + kInternalRunDeathTestFlag + "=" + file_ + + "|" + StreamableToString(line_) + "|" + + StreamableToString(death_test_index); + Arguments args; + args.AddArguments(GetInjectableArgvs()); + args.AddArgument(filter_flag.c_str()); + args.AddArgument(internal_flag.c_str()); + + // Build the pipe for communication with the child. + zx_status_t status; + zx_handle_t child_pipe_handle; + int child_pipe_fd; + status = fdio_pipe_half(&child_pipe_fd, &child_pipe_handle); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + set_read_fd(child_pipe_fd); + + // Set the pipe handle for the child. 
+  fdio_spawn_action_t spawn_actions[2] = {};
+  fdio_spawn_action_t *add_handle_action = &spawn_actions[0];
+  add_handle_action->action = FDIO_SPAWN_ACTION_ADD_HANDLE;
+  add_handle_action->h.id = PA_HND(PA_FD, kFuchsiaReadPipeFd);
+  add_handle_action->h.handle = child_pipe_handle;
+
+  // Create a socket pair that will be used to receive the child process'
+  // stderr.
+  zx::socket stderr_producer_socket;
+  status = zx::socket::create(0, &stderr_producer_socket, &stderr_socket_);
+  GTEST_DEATH_TEST_CHECK_(status >= 0);
+  int stderr_producer_fd = -1;
+  status =
+      fdio_fd_create(stderr_producer_socket.release(), &stderr_producer_fd);
+  GTEST_DEATH_TEST_CHECK_(status >= 0);
+
+  // Make the stderr socket nonblocking.
+  GTEST_DEATH_TEST_CHECK_(fcntl(stderr_producer_fd, F_SETFL, 0) == 0);
+
+  fdio_spawn_action_t *add_stderr_action = &spawn_actions[1];
+  add_stderr_action->action = FDIO_SPAWN_ACTION_CLONE_FD;
+  add_stderr_action->fd.local_fd = stderr_producer_fd;
+  add_stderr_action->fd.target_fd = STDERR_FILENO;
+
+  // Create a child job.
+  zx_handle_t child_job = ZX_HANDLE_INVALID;
+  status = zx_job_create(zx_job_default(), 0, &child_job);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+  zx_policy_basic_t policy;
+  policy.condition = ZX_POL_NEW_ANY;
+  policy.policy = ZX_POL_ACTION_ALLOW;
+  status = zx_job_set_policy(child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC,
+                             &policy, 1);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+  // Create an exception channel attached to the |child_job|, to allow
+  // us to suppress the system default exception handler from firing.
+  status = zx_task_create_exception_channel(
+      child_job, 0, exception_channel_.reset_and_get_address());
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+  // Spawn the child process.
+  status = fdio_spawn_etc(child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0],
+                          args.Argv(), nullptr, 2, spawn_actions,
+                          child_process_.reset_and_get_address(), nullptr);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+std::string FuchsiaDeathTest::GetErrorLogs() { return captured_stderr_; }
+
+#else  // We are neither on Windows, nor on Fuchsia.
+
+// ForkingDeathTest provides implementations for most of the abstract
+// methods of the DeathTest interface. Only the AssumeRole method is
+// left undefined.
+class ForkingDeathTest : public DeathTestImpl {
+ public:
+  ForkingDeathTest(const char *statement, Matcher<const std::string &> matcher);
+
+  // All of these virtual functions are inherited from DeathTest.
+  int Wait() override;
+
+ protected:
+  void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; }
+
+ private:
+  // PID of child process during death test; 0 in the child process itself.
+  pid_t child_pid_;
+};
+
+// Constructs a ForkingDeathTest.
+ForkingDeathTest::ForkingDeathTest(const char *a_statement,
+                                   Matcher<const std::string &> matcher)
+    : DeathTestImpl(a_statement, std::move(matcher)), child_pid_(-1) {}
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists. As a side effect, sets the
+// outcome data member.
+int ForkingDeathTest::Wait() {
+  if (!spawned()) return 0;
+
+  ReadAndInterpretStatusByte();
+
+  int status_value;
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0));
+  set_status(status_value);
+  return status_value;
+}
+
+// A concrete death test class that forks, then immediately runs the test
+// in the child process.
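Aside, not part of the upstream file: which fork strategy runs is selected by the --gtest_death_test_style flag defined near the top of this file; it can also be set per test through the standard googletest flag variable:

    #include <stdlib.h>
    #include "gtest/gtest.h"

    TEST(MyDeathTest, UsesThreadsafeStyle) {
      // "threadsafe" re-executes the whole binary for the child process;
      // "fast" forks and runs the statement immediately.
      ::testing::FLAGS_gtest_death_test_style = "threadsafe";
      EXPECT_DEATH(abort(), "");
    }

NoExecDeathTest, declared next, implements the "fast" variant.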
+class NoExecDeathTest : public ForkingDeathTest {
+ public:
+  NoExecDeathTest(const char *a_statement, Matcher<const std::string &> matcher)
+      : ForkingDeathTest(a_statement, std::move(matcher)) {}
+  TestRole AssumeRole() override;
+};
+
+// The AssumeRole process for a fork-and-run death test. It implements a
+// straightforward fork, with a simple pipe to transmit the status byte.
+DeathTest::TestRole NoExecDeathTest::AssumeRole() {
+  const size_t thread_count = GetThreadCount();
+  if (thread_count != 1) {
+    GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count);
+  }
+
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+
+  DeathTest::set_last_death_test_message("");
+  CaptureStderr();
+  // When we fork the process below, the log file buffers are copied, but the
+  // file descriptors are shared. We flush all log files here so that closing
+  // the file descriptors in the child process doesn't throw off the
+  // synchronization between descriptors and buffers in the parent process.
+  // This is as close to the fork as possible to avoid a race condition in case
+  // there are multiple threads running before the death test, and another
+  // thread writes to the log file.
+  FlushInfoLog();
+
+  const pid_t child_pid = fork();
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  set_child_pid(child_pid);
+  if (child_pid == 0) {
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0]));
+    set_write_fd(pipe_fd[1]);
+    // Redirects all logging to stderr in the child process to prevent
+    // concurrent writes to the log files. We capture stderr in the parent
+    // process and append the child process' output to a log.
+    LogToStderr();
+    // Event forwarding to the listeners of the event listener API must be
+    // shut down in death test subprocesses.
+    GetUnitTestImpl()->listeners()->SuppressEventForwarding();
+    g_in_fast_death_test_child = true;
+    return EXECUTE_TEST;
+  } else {
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+    set_read_fd(pipe_fd[0]);
+    set_spawned(true);
+    return OVERSEE_TEST;
+  }
+}
+
+// A concrete death test class that forks and re-executes the main
+// program from the beginning, with command-line flags set that cause
+// only this specific death test to be run.
+class ExecDeathTest : public ForkingDeathTest {
+ public:
+  ExecDeathTest(const char *a_statement, Matcher<const std::string &> matcher,
+                const char *file, int line)
+      : ForkingDeathTest(a_statement, std::move(matcher)), file_(file),
+        line_(line) {}
+  TestRole AssumeRole() override;
+
+ private:
+  static ::std::vector<std::string> GetArgvsForDeathTestChildProcess() {
+    ::std::vector<std::string> args = GetInjectableArgvs();
+#if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+    ::std::vector<std::string> extra_args =
+        GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_();
+    args.insert(args.end(), extra_args.begin(), extra_args.end());
+#endif  // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+    return args;
+  }
+  // The name of the file in which the death test is located.
+  const char *const file_;
+  // The line number on which the death test is located.
+  const int line_;
+};
+
+// Utility class for accumulating command-line arguments.
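Aside, not part of the upstream file: GetArgvsForDeathTestChildProcess() above exposes a porting hook; if an embedder defines GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_ (normally in the internal/custom/ headers), its strings are appended to the child's argv. A hypothetical definition; the helper name and flag are invented:

    #include <string>
    #include <vector>

    // Returns extra flags the re-executed death-test child should receive.
    inline std::vector<std::string> ExtraDeathTestArgs() {
      return { "--my_embedder_flag=1" };
    }
    #define GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_() ExtraDeathTestArgs()

The Arguments helper, next, copies all of these strings into the NULL-terminated argv array handed to exec.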
+class Arguments {
+ public:
+  Arguments() { args_.push_back(nullptr); }
+
+  ~Arguments() {
+    for (std::vector<char *>::iterator i = args_.begin(); i != args_.end();
+         ++i) {
+      free(*i);
+    }
+  }
+  void AddArgument(const char *argument) {
+    args_.insert(args_.end() - 1, posix::StrDup(argument));
+  }
+
+  template <typename Str>
+  void AddArguments(const ::std::vector<Str> &arguments) {
+    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
+         i != arguments.end(); ++i) {
+      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
+    }
+  }
+  char *const *Argv() { return &args_[0]; }
+
+ private:
+  std::vector<char *> args_;
+};
+
+// A struct that encompasses the arguments to the child process of a
+// threadsafe-style death test process.
+struct ExecDeathTestArgs {
+  char *const *argv;  // Command-line arguments for the child's call to exec
+  int close_fd;       // File descriptor to close; the read end of a pipe
+};
+
+#if GTEST_OS_MAC
+inline char **GetEnviron() {
+  // When Google Test is built as a framework on MacOS X, the environ variable
+  // is unavailable. Apple's documentation (man environ) recommends using
+  // _NSGetEnviron() instead.
+  return *_NSGetEnviron();
+}
+#else
+// Some POSIX platforms expect you to declare environ. extern "C" makes
+// it reside in the global namespace.
+extern "C" char **environ;
+inline char **GetEnviron() { return environ; }
+#endif  // GTEST_OS_MAC
+
+#if !GTEST_OS_QNX
+// The main function for a threadsafe-style death test child process.
+// This function is called in a clone()-ed process and thus must avoid
+// any potentially unsafe operations like malloc or libc functions.
+static int ExecDeathTestChildMain(void *child_arg) {
+  ExecDeathTestArgs *const args = static_cast<ExecDeathTestArgs *>(child_arg);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));
+
+  // We need to execute the test program in the same environment where
+  // it was originally invoked. Therefore we change to the original
+  // working directory first.
+  const char *const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(std::string("chdir(\"") + original_dir +
+                   "\") failed: " + GetLastErrnoDescription());
+    return EXIT_FAILURE;
+  }
+
+  // We can safely call execve() as it's a direct system call. We
+  // cannot use execvp() as it's a libc function and thus potentially
+  // unsafe. Since execve() doesn't search the PATH, the user must
+  // invoke the test program via a valid path that contains at least
+  // one path separator.
+  execve(args->argv[0], args->argv, GetEnviron());
+  DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " +
+                 original_dir + " failed: " + GetLastErrnoDescription());
+  return EXIT_FAILURE;
+}
+#endif  // !GTEST_OS_QNX
+
+#if GTEST_HAS_CLONE
+// Two utility routines that together determine the direction the stack
+// grows.
+// This could be accomplished more elegantly by a single recursive
+// function, but we want to guard against the unlikely possibility of
+// a smart compiler optimizing the recursion away.
+//
+// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
+// StackLowerThanAddress into StackGrowsDown, which then doesn't give
+// the correct answer.
+static void StackLowerThanAddress(const void *ptr,
+                                  bool *result) GTEST_NO_INLINE_;
+// HWAddressSanitizer adds a random tag to the MSB of the local variable
+// address, making the comparison result unpredictable.
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+static void StackLowerThanAddress(const void *ptr, bool *result) {
+  int dummy;
+  *result = (&dummy < ptr);
+}
+
+// Make sure AddressSanitizer does not tamper with the stack here.
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+static bool StackGrowsDown() {
+  int dummy;
+  bool result;
+  StackLowerThanAddress(&dummy, &result);
+  return result;
+}
+#endif  // GTEST_HAS_CLONE
+
+// Spawns a child process with the same executable as the current process in
+// a thread-safe manner and instructs it to run the death test. The
+// implementation uses fork(2) + exec. On systems where clone(2) is
+// available, it is used instead, being slightly more thread-safe. On QNX,
+// fork supports only single-threaded environments, so this function uses
+// spawn(2) there instead. The function dies with an error message if
+// anything goes wrong.
+static pid_t ExecDeathTestSpawnChild(char *const *argv, int close_fd) {
+  ExecDeathTestArgs args = { argv, close_fd };
+  pid_t child_pid = -1;
+
+#if GTEST_OS_QNX
+  // Obtains the current directory and sets it to be closed in the child
+  // process.
+  const int cwd_fd = open(".", O_RDONLY);
+  GTEST_DEATH_TEST_CHECK_(cwd_fd != -1);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC));
+  // We need to execute the test program in the same environment where
+  // it was originally invoked. Therefore we change to the original
+  // working directory first.
+  const char *const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(std::string("chdir(\"") + original_dir +
+                   "\") failed: " + GetLastErrnoDescription());
+    return EXIT_FAILURE;
+  }
+
+  int fd_flags;
+  // Set close_fd to be closed after spawn.
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      fcntl(close_fd, F_SETFD, fd_flags | FD_CLOEXEC));
+  struct inheritance inherit = { 0 };
+  // spawn is a system call.
+  child_pid =
+      spawn(args.argv[0], 0, nullptr, &inherit, args.argv, GetEnviron());
+  // Restores the current working directory.
+  GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
+
+#else  // GTEST_OS_QNX
+#if GTEST_OS_LINUX
+  // When a SIGPROF signal is received while fork() or clone() are executing,
+  // the process may hang. To avoid this, we ignore SIGPROF here and re-enable
+  // it after the call to fork()/clone() is complete.
+  struct sigaction saved_sigprof_action;
+  struct sigaction ignore_sigprof_action;
+  memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
+  sigemptyset(&ignore_sigprof_action.sa_mask);
+  ignore_sigprof_action.sa_handler = SIG_IGN;
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      sigaction(SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_CLONE
+  const bool use_fork = GTEST_FLAG(death_test_use_fork);
+
+  if (!use_fork) {
+    static const bool stack_grows_down = StackGrowsDown();
+    const auto stack_size = static_cast<size_t>(getpagesize() * 2);
+    // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
+    void *const stack = mmap(nullptr, stack_size, PROT_READ | PROT_WRITE,
+                             MAP_ANON | MAP_PRIVATE, -1, 0);
+    GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);
+
+    // Maximum stack alignment in bytes: For a downward-growing stack, this
+    // amount is subtracted from size of the stack space to get an address
+    // that is within the stack space and is aligned on all systems we care
+    // about. As far as I know there is no ABI with stack alignment greater
+    // than 64. We assume stack and stack_size already have alignment of
+    // kMaxStackAlignment.
+    const size_t kMaxStackAlignment = 64;
+    void *const stack_top =
+        static_cast<char *>(stack) +
+        (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
+    GTEST_DEATH_TEST_CHECK_(
+        static_cast<intptr_t>(stack_size) > kMaxStackAlignment &&
+        reinterpret_cast<intptr_t>(stack_top) % kMaxStackAlignment == 0);
+
+    child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args);
+
+    GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
+  }
+#else
+  const bool use_fork = true;
+#endif  // GTEST_HAS_CLONE
+
+  if (use_fork && (child_pid = fork()) == 0) {
+    ExecDeathTestChildMain(&args);
+    _exit(0);
+  }
+#endif  // GTEST_OS_QNX
+#if GTEST_OS_LINUX
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      sigaction(SIGPROF, &saved_sigprof_action, nullptr));
+#endif  // GTEST_OS_LINUX
+
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  return child_pid;
+}
+
+// The AssumeRole process for a fork-and-exec death test. It re-executes the
+// main program from the beginning, setting the --gtest_filter
+// and --gtest_internal_run_death_test flags to cause only the current
+// death test to be re-run.
+DeathTest::TestRole ExecDeathTest::AssumeRole() {
+  const UnitTestImpl *const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag *const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo *const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != nullptr) {
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+  // Clear the close-on-exec flag on the write end of the pipe, lest
+  // it be closed when the child process does an exec:
+  GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
+
+  const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                  kFilterFlag + "=" + info->test_suite_name() +
+                                  "." + info->name();
+  const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                    kInternalRunDeathTestFlag + "=" + file_ +
+                                    "|" + StreamableToString(line_) + "|" +
+                                    StreamableToString(death_test_index) + "|" +
+                                    StreamableToString(pipe_fd[1]);
+  Arguments args;
+  args.AddArguments(GetArgvsForDeathTestChildProcess());
+  args.AddArgument(filter_flag.c_str());
+  args.AddArgument(internal_flag.c_str());
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // See the comment in NoExecDeathTest::AssumeRole for why the next line
+  // is necessary.
+  FlushInfoLog();
+
+  const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+  set_child_pid(child_pid);
+  set_read_fd(pipe_fd[0]);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+#endif  // !GTEST_OS_WINDOWS
+
+// Creates a concrete DeathTest-derived class that depends on the
+// --gtest_death_test_style flag, and sets the pointer pointed to
+// by the "test" argument to its address. If the test should be
+// skipped, sets that pointer to NULL. Returns true, unless the
+// flag is set to an invalid value.
+bool DefaultDeathTestFactory::Create(const char *statement,
+                                     Matcher<const std::string &> matcher,
+                                     const char *file, int line,
+                                     DeathTest **test) {
+  UnitTestImpl *const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag *const flag =
+      impl->internal_run_death_test_flag();
+  const int death_test_index =
+      impl->current_test_info()->increment_death_test_count();
+
+  if (flag != nullptr) {
+    if (death_test_index > flag->index()) {
+      DeathTest::set_last_death_test_message(
+          "Death test count (" + StreamableToString(death_test_index) +
+          ") somehow exceeded expected maximum (" +
+          StreamableToString(flag->index()) + ")");
+      return false;
+    }
+
+    if (!(flag->file() == file && flag->line() == line &&
+          flag->index() == death_test_index)) {
+      *test = nullptr;
+      return true;
+    }
+  }
+
+#if GTEST_OS_WINDOWS
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
+      GTEST_FLAG(death_test_style) == "fast") {
+    *test = new WindowsDeathTest(statement, std::move(matcher), file, line);
+  }
+
+#elif GTEST_OS_FUCHSIA
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
+      GTEST_FLAG(death_test_style) == "fast") {
+    *test = new FuchsiaDeathTest(statement, std::move(matcher), file, line);
+  }
+
+#else
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe") {
+    *test = new ExecDeathTest(statement, std::move(matcher), file, line);
+  } else if (GTEST_FLAG(death_test_style) == "fast") {
+    *test = new NoExecDeathTest(statement, std::move(matcher));
+  }
+
+#endif  // GTEST_OS_WINDOWS
+
+  else {  // NOLINT - this is more readable than unbalanced brackets inside #if.
+    DeathTest::set_last_death_test_message("Unknown death test style \"" +
+                                           GTEST_FLAG(death_test_style) +
+                                           "\" encountered");
+    return false;
+  }
+
+  return true;
+}
+
+#if GTEST_OS_WINDOWS
+// Recreates the pipe and event handles from the provided parameters,
+// signals the event, and returns a file descriptor wrapped around the pipe
+// handle. This function is called in the child process only.
+static int GetStatusFileDescriptor(unsigned int parent_process_id,
+                                   size_t write_handle_as_size_t,
+                                   size_t event_handle_as_size_t) {
+  AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
+                                                 FALSE,  // Non-inheritable.
+                                                 parent_process_id));
+  if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
+    DeathTestAbort("Unable to open parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
+
+  const HANDLE write_handle = reinterpret_cast<HANDLE>(write_handle_as_size_t);
+  HANDLE dup_write_handle;
+
+  // The newly initialized handle is accessible only in the parent
+  // process. To obtain one accessible within the child, we need to use
+  // DuplicateHandle.
+  if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
+                         ::GetCurrentProcess(), &dup_write_handle,
+                         0x0,    // Requested privileges ignored since
+                                 // DUPLICATE_SAME_ACCESS is used.
+                         FALSE,  // Request non-inheritable handle.
+#if GTEST_OS_WINDOWS
+// Recreates the pipe and event handles from the provided parameters,
+// signals the event, and returns a file descriptor wrapped around the pipe
+// handle.  This function is called in the child process only.
+static int GetStatusFileDescriptor(unsigned int parent_process_id,
+                                   size_t write_handle_as_size_t,
+                                   size_t event_handle_as_size_t) {
+  AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
+                                                 FALSE,  // Non-inheritable.
+                                                 parent_process_id));
+  if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
+    DeathTestAbort("Unable to open parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
+
+  const HANDLE write_handle =
+      reinterpret_cast<HANDLE>(write_handle_as_size_t);
+  HANDLE dup_write_handle;
+
+  // The newly initialized handle is accessible only in the parent
+  // process.  To obtain one accessible within the child, we need to use
+  // DuplicateHandle.
+  if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
+                         ::GetCurrentProcess(), &dup_write_handle,
+                         0x0,    // Requested privileges ignored since
+                                 // DUPLICATE_SAME_ACCESS is used.
+                         FALSE,  // Request non-inheritable handle.
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  const HANDLE event_handle =
+      reinterpret_cast<HANDLE>(event_handle_as_size_t);
+  HANDLE dup_event_handle;
+
+  if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
+                         ::GetCurrentProcess(), &dup_event_handle, 0x0, FALSE,
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the event handle " +
+                   StreamableToString(event_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  const int write_fd =
+      ::_open_osfhandle(reinterpret_cast<intptr_t>(dup_write_handle), O_APPEND);
+  if (write_fd == -1) {
+    DeathTestAbort("Unable to convert pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " to a file descriptor");
+  }
+
+  // Signals the parent that the write end of the pipe has been acquired
+  // so the parent can release its own write end.
+  ::SetEvent(dup_event_handle);
+
+  return write_fd;
+}
+#endif  // GTEST_OS_WINDOWS
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag *ParseInternalRunDeathTestFlag() {
+  if (GTEST_FLAG(internal_run_death_test) == "") return nullptr;
+
+  // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
+  // can use it here.
+  int line = -1;
+  int index = -1;
+  ::std::vector< ::std::string> fields;
+  SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
+  int write_fd = -1;
+
+#if GTEST_OS_WINDOWS
+
+  unsigned int parent_process_id = 0;
+  size_t write_handle_as_size_t = 0;
+  size_t event_handle_as_size_t = 0;
+
+  if (fields.size() != 6 || !ParseNaturalNumber(fields[1], &line) ||
+      !ParseNaturalNumber(fields[2], &index) ||
+      !ParseNaturalNumber(fields[3], &parent_process_id) ||
+      !ParseNaturalNumber(fields[4], &write_handle_as_size_t) ||
+      !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG(internal_run_death_test));
+  }
+  write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t,
+                                     event_handle_as_size_t);
+
+#elif GTEST_OS_FUCHSIA
+
+  if (fields.size() != 3 || !ParseNaturalNumber(fields[1], &line) ||
+      !ParseNaturalNumber(fields[2], &index)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG(internal_run_death_test));
+  }
+
+#else
+
+  if (fields.size() != 4 || !ParseNaturalNumber(fields[1], &line) ||
+      !ParseNaturalNumber(fields[2], &index) ||
+      !ParseNaturalNumber(fields[3], &write_fd)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG(internal_run_death_test));
+  }
+
+#endif  // GTEST_OS_WINDOWS
+
+  return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
+}
+
+}  // namespace internal
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+}  // namespace testing
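ParseInternalRunDeathTestFlag() above is the receiving end of the internal_flag string built in ExecDeathTest::AssumeRole(). A freestanding sketch of the POSIX decoding, with a hypothetical flag value standing in for GTEST_FLAG(internal_run_death_test):

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  const std::string flag = "my_test.cc|42|3|7";  // hypothetical flag value
  std::vector<std::string> fields;
  std::stringstream ss(flag);
  for (std::string piece; std::getline(ss, piece, '|');)
    fields.push_back(piece);
  // fields: source file, line, death test index, write end of the pipe.
  std::cout << "file=" << fields[0] << " line=" << std::stoi(fields[1])
            << " index=" << std::stoi(fields[2])
            << " fd=" << std::stoi(fields[3]) << "\n";
}
```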
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-filepath.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-filepath.cc
new file mode 100644
index 000000000..f9427e0f1
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-filepath.cc
@@ -0,0 +1,377 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "gtest/internal/gtest-filepath.h"
+
+#include <stdlib.h>
+#include "gtest/internal/gtest-port.h"
+#include "gtest/gtest-message.h"
+
+#if GTEST_OS_WINDOWS_MOBILE
+#include <windows.h>
+#elif GTEST_OS_WINDOWS
+#include <direct.h>
+#include <io.h>
+#else
+#include <limits.h>
+#include <climits>  // Some Linux distributions define PATH_MAX here.
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+#include "gtest/internal/gtest-string.h"
+
+#if GTEST_OS_WINDOWS
+#define GTEST_PATH_MAX_ _MAX_PATH
+#elif defined(PATH_MAX)
+#define GTEST_PATH_MAX_ PATH_MAX
+#elif defined(_XOPEN_PATH_MAX)
+#define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
+#else
+#define GTEST_PATH_MAX_ _POSIX_PATH_MAX
+#endif  // GTEST_OS_WINDOWS
+
+namespace testing {
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+// On Windows, '\\' is the standard path separator, but many tools and the
+// Windows API also accept '/' as an alternate path separator.  Unless
+// otherwise noted, a file path can contain either kind of path separators,
+// or a mixture of them.
+const char kPathSeparator = '\\';
+const char kAlternatePathSeparator = '/';
+const char kAlternatePathSeparatorString[] = "/";
+#if GTEST_OS_WINDOWS_MOBILE
+// Windows CE doesn't have a current directory.  You should not use
+// the current directory in tests on Windows CE, but this at least
+// provides a reasonable fallback.
+const char kCurrentDirectoryString[] = "\\";
+// Windows CE doesn't define INVALID_FILE_ATTRIBUTES
+const DWORD kInvalidFileAttributes = 0xffffffff;
+#else
+const char kCurrentDirectoryString[] = ".\\";
+#endif  // GTEST_OS_WINDOWS_MOBILE
+#else
+const char kPathSeparator = '/';
+const char kCurrentDirectoryString[] = "./";
+#endif  // GTEST_OS_WINDOWS
+
+// Returns whether the given character is a valid path separator.
+static bool IsPathSeparator(char c) {
+#if GTEST_HAS_ALT_PATH_SEP_
+  return (c == kPathSeparator) || (c == kAlternatePathSeparator);
+#else
+  return c == kPathSeparator;
+#endif
+}
+
+// Returns the current working directory, or "" if unsuccessful.
+FilePath FilePath::GetCurrentDir() {
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32
+  // These platforms do not have a current directory, so we just return
+  // something reasonable.
+  return FilePath(kCurrentDirectoryString);
+#elif GTEST_OS_WINDOWS
+  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+  return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd);
+#else
+  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+  char *result = getcwd(cwd, sizeof(cwd));
+#if GTEST_OS_NACL
+  // getcwd will likely fail in NaCl due to the sandbox, so return something
+  // reasonable.  The user may have provided a shim implementation for getcwd,
+  // however, so fallback only when failure is detected.
+  return FilePath(result == nullptr ? kCurrentDirectoryString : cwd);
+#endif  // GTEST_OS_NACL
+  return FilePath(result == nullptr ? "" : cwd);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Returns a copy of the FilePath with the case-insensitive extension removed.
+// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
+// FilePath("dir/file").  If a case-insensitive extension is not
+// found, returns a copy of the original FilePath.
+FilePath FilePath::RemoveExtension(const char *extension) const {
+  const std::string dot_extension = std::string(".") + extension;
+  if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) {
+    return FilePath(
+        pathname_.substr(0, pathname_.length() - dot_extension.length()));
+  }
+  return *this;
+}
+
+// Returns a pointer to the last occurrence of a valid path separator in
+// the FilePath.  On Windows, for example, both '/' and '\' are valid path
+// separators.  Returns NULL if no path separator was found.
+const char *FilePath::FindLastPathSeparator() const {
+  const char *const last_sep = strrchr(c_str(), kPathSeparator);
+#if GTEST_HAS_ALT_PATH_SEP_
+  const char *const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator);
+  // Comparing two pointers of which only one is NULL is undefined.
+  if (last_alt_sep != nullptr &&
+      (last_sep == nullptr || last_alt_sep > last_sep)) {
+    return last_alt_sep;
+  }
+#endif
+  return last_sep;
+}
+
+// Returns a copy of the FilePath with the directory part removed.
+// Example: FilePath("path/to/file").RemoveDirectoryName() returns
+// FilePath("file").  If there is no directory part ("just_a_file"), it returns
+// the FilePath unmodified.  If there is no file part ("just_a_dir/") it
+// returns an empty FilePath ("").
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveDirectoryName() const {
+  const char *const last_sep = FindLastPathSeparator();
+  return last_sep ? FilePath(last_sep + 1) : *this;
+}
+
+// RemoveFileName returns the directory path with the filename removed.
+// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+// If the FilePath is "a_file" or "/a_file", RemoveFileName returns
+// FilePath("./") or, on Windows, FilePath(".\\").  If the filepath does
+// not have a file, like "just/a/dir/", it returns the FilePath unmodified.
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveFileName() const {
+  const char *const last_sep = FindLastPathSeparator();
+  std::string dir;
+  if (last_sep) {
+    dir = std::string(c_str(), static_cast<size_t>(last_sep + 1 - c_str()));
+  } else {
+    dir = kCurrentDirectoryString;
+  }
+  return FilePath(dir);
+}
+
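The examples promised in the comments above, as a small driver. This assumes a build that links gtest and is willing to include the internal header, which ordinary user code should not do; it is shown purely to make the splitting semantics concrete.

```cpp
#include <iostream>
#include "gtest/internal/gtest-filepath.h"

using testing::internal::FilePath;

int main() {
  std::cout << FilePath("path/to/file").RemoveDirectoryName().string()
            << "\n";  // "file"
  std::cout << FilePath("path/to/file").RemoveFileName().string()
            << "\n";  // "path/to/"
  std::cout << FilePath("dir/file.exe").RemoveExtension("EXE").string()
            << "\n";  // "dir/file" (the extension match is case-insensitive)
}
```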
+// Helper functions for naming files in a directory for xml output.
+
+// Given directory = "dir", base_name = "test", number = 0,
+// extension = "xml", returns "dir/test.xml".  If number is greater
+// than zero (e.g., 12), returns "dir/test_12.xml".
+// On Windows platform, uses \ as the separator rather than /.
+FilePath FilePath::MakeFileName(const FilePath &directory,
+                                const FilePath &base_name, int number,
+                                const char *extension) {
+  std::string file;
+  if (number == 0) {
+    file = base_name.string() + "." + extension;
+  } else {
+    file =
+        base_name.string() + "_" + StreamableToString(number) + "." + extension;
+  }
+  return ConcatPaths(directory, FilePath(file));
+}
+
+// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml".
+// On Windows, uses \ as the separator rather than /.
+FilePath FilePath::ConcatPaths(const FilePath &directory,
+                               const FilePath &relative_path) {
+  if (directory.IsEmpty()) return relative_path;
+  const FilePath dir(directory.RemoveTrailingPathSeparator());
+  return FilePath(dir.string() + kPathSeparator + relative_path.string());
+}
+
+// Returns true if pathname describes something findable in the file-system,
+// either a file, directory, or whatever.
+bool FilePath::FileOrDirectoryExists() const {
+#if GTEST_OS_WINDOWS_MOBILE
+  LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
+  const DWORD attributes = GetFileAttributes(unicode);
+  delete[] unicode;
+  return attributes != kInvalidFileAttributes;
+#else
+  posix::StatStruct file_stat;
+  return posix::Stat(pathname_.c_str(), &file_stat) == 0;
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Returns true if pathname describes a directory in the file-system
+// that exists.
+bool FilePath::DirectoryExists() const {
+  bool result = false;
+#if GTEST_OS_WINDOWS
+  // Don't strip off trailing separator if path is a root directory on
+  // Windows (like "C:\\").
+  const FilePath &path(IsRootDirectory() ? *this
+                                         : RemoveTrailingPathSeparator());
+#else
+  const FilePath &path(*this);
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+  LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
+  const DWORD attributes = GetFileAttributes(unicode);
+  delete[] unicode;
+  if ((attributes != kInvalidFileAttributes) &&
+      (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+    result = true;
+  }
+#else
+  posix::StatStruct file_stat;
+  result =
+      posix::Stat(path.c_str(), &file_stat) == 0 && posix::IsDir(file_stat);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  return result;
+}
+
+// Returns true if pathname describes a root directory.  (Windows has one
+// root directory per disk drive.)
+bool FilePath::IsRootDirectory() const {
+#if GTEST_OS_WINDOWS
+  return pathname_.length() == 3 && IsAbsolutePath();
+#else
+  return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]);
+#endif
+}
+
+// Returns true if pathname describes an absolute path.
+bool FilePath::IsAbsolutePath() const {
+  const char *const name = pathname_.c_str();
+#if GTEST_OS_WINDOWS
+  return pathname_.length() >= 3 &&
+         ((name[0] >= 'a' && name[0] <= 'z') ||
+          (name[0] >= 'A' && name[0] <= 'Z')) &&
+         name[1] == ':' && IsPathSeparator(name[2]);
+#else
+  return IsPathSeparator(name[0]);
+#endif
+}
+
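The numbering rule in MakeFileName() is easiest to see by example; as before, the internal header is included purely for illustration.

```cpp
#include <iostream>
#include "gtest/internal/gtest-filepath.h"

using testing::internal::FilePath;

int main() {
  // number == 0 omits the suffix; any larger number inserts "_<number>".
  std::cout << FilePath::MakeFileName(FilePath("dir"), FilePath("test"), 0,
                                      "xml").string()
            << "\n";  // "dir/test.xml"
  std::cout << FilePath::MakeFileName(FilePath("dir"), FilePath("test"), 12,
                                      "xml").string()
            << "\n";  // "dir/test_12.xml"
}
```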
+// Returns a pathname for a file that does not currently exist.  The pathname
+// will be directory/base_name.extension or
+// directory/base_name_<number>.extension if directory/base_name.extension
+// already exists.  The number will be incremented until a pathname is found
+// that does not already exist.
+// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+// There could be a race condition if two or more processes are calling this
+// function at the same time -- they could both pick the same filename.
+FilePath FilePath::GenerateUniqueFileName(const FilePath &directory,
+                                          const FilePath &base_name,
+                                          const char *extension) {
+  FilePath full_pathname;
+  int number = 0;
+  do {
+    full_pathname.Set(MakeFileName(directory, base_name, number++, extension));
+  } while (full_pathname.FileOrDirectoryExists());
+  return full_pathname;
+}
+
+// Returns true if FilePath ends with a path separator, which indicates that
+// it is intended to represent a directory.  Returns false otherwise.
+// This does NOT check that a directory (or file) actually exists.
+bool FilePath::IsDirectory() const {
+  return !pathname_.empty() &&
+         IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]);
+}
+
+// Create directories so that path exists.  Returns true if successful or if
+// the directories already exist; returns false if unable to create directories
+// for any reason.
+bool FilePath::CreateDirectoriesRecursively() const {
+  if (!this->IsDirectory()) {
+    return false;
+  }
+
+  if (pathname_.length() == 0 || this->DirectoryExists()) {
+    return true;
+  }
+
+  const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName());
+  return parent.CreateDirectoriesRecursively() && this->CreateFolder();
+}
+
+// Create the directory so that path exists.  Returns true if successful or
+// if the directory already exists; returns false if unable to create the
+// directory for any reason, including if the parent directory does not
+// exist.  Not named "CreateDirectory" because that's a macro on Windows.
+bool FilePath::CreateFolder() const {
+#if GTEST_OS_WINDOWS_MOBILE
+  FilePath removed_sep(this->RemoveTrailingPathSeparator());
+  LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
+  int result = CreateDirectory(unicode, nullptr) ? 0 : -1;
+  delete[] unicode;
+#elif GTEST_OS_WINDOWS
+  int result = _mkdir(pathname_.c_str());
+#elif GTEST_OS_ESP8266
+  // do nothing
+  int result = 0;
+#else
+  int result = mkdir(pathname_.c_str(), 0777);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  if (result == -1) {
+    return this->DirectoryExists();  // An error is OK if the directory exists.
+  }
+  return true;  // No error.
+}
+
+// If input name has a trailing separator character, remove it and return the
+// name, otherwise return the name string unmodified.
+// On Windows platform, uses \ as the separator, other platforms use /.
+FilePath FilePath::RemoveTrailingPathSeparator() const {
+  return IsDirectory() ? FilePath(pathname_.substr(0, pathname_.length() - 1))
+                       : *this;
+}
+
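CreateDirectoriesRecursively() above works parent-first: recurse on the parent path, then create the leaf. A freestanding sketch of the same idea, with POSIX mkdir(2) standing in for the platform-specific CreateFolder() (and a cruder "EEXIST means fine" check than the DirectoryExists() test the real code uses):

```cpp
#include <cerrno>
#include <string>
#include <sys/stat.h>
#include <sys/types.h>

bool CreateDirsRecursively(const std::string &path) {
  if (path.empty()) return true;
  if (mkdir(path.c_str(), 0777) == 0 || errno == EEXIST) return true;
  const std::string::size_type slash = path.find_last_of('/');
  if (slash == std::string::npos) return false;
  // Create the parent first, then retry this directory.
  return CreateDirsRecursively(path.substr(0, slash)) &&
         (mkdir(path.c_str(), 0777) == 0 || errno == EEXIST);
}
```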
+// Removes any redundant separators that might be in the pathname.
+// For example, "bar///foo" becomes "bar/foo".  Does not eliminate other
+// redundancies that might be in a pathname involving "." or "..".
+void FilePath::Normalize() {
+  if (pathname_.c_str() == nullptr) {
+    pathname_ = "";
+    return;
+  }
+  const char *src = pathname_.c_str();
+  char *const dest = new char[pathname_.length() + 1];
+  char *dest_ptr = dest;
+  memset(dest_ptr, 0, pathname_.length() + 1);
+
+  while (*src != '\0') {
+    *dest_ptr = *src;
+    if (!IsPathSeparator(*src)) {
+      src++;
+    } else {
+#if GTEST_HAS_ALT_PATH_SEP_
+      if (*dest_ptr == kAlternatePathSeparator) {
+        *dest_ptr = kPathSeparator;
+      }
+#endif
+      while (IsPathSeparator(*src)) src++;
+    }
+    dest_ptr++;
+  }
+  *dest_ptr = '\0';
+  pathname_ = dest;
+  delete[] dest;
+}
+
+}  // namespace internal
+}  // namespace testing
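Since the FilePath constructor runs Normalize() on the string it is given, the separator collapsing is observable at construction time. A tiny check, again using the internal header only for illustration:

```cpp
#include <iostream>
#include "gtest/internal/gtest-filepath.h"

int main() {
  using testing::internal::FilePath;
  std::cout << FilePath("bar///foo").string() << "\n";  // "bar/foo"
  std::cout << FilePath("foo//").string() << "\n";      // "foo/"
}
```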
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-internal-inl.h b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-internal-inl.h
new file mode 100644
index 000000000..16d8cde66
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-internal-inl.h
@@ -0,0 +1,1213 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Utility functions and classes used by the Google C++ testing framework.
+//
+// This file contains purely Google Test's internal implementation.  Please
+// DO NOT #INCLUDE IT IN A USER PROGRAM.
+
+#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
+#define GTEST_SRC_GTEST_INTERNAL_INL_H_
+
+#ifndef _WIN32_WCE
+#include <errno.h>
+#endif  // !_WIN32_WCE
+#include <stddef.h>
+#include <stdlib.h>  // For strtoll/_strtoul64/malloc/free.
+#include <string.h>  // For memmove.
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_CAN_STREAM_RESULTS_
+#include <arpa/inet.h>  // NOLINT
+#include <netdb.h>      // NOLINT
+#endif
+
+#if GTEST_OS_WINDOWS
+#include <windows.h>  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+
+#include "gtest/gtest.h"
+#include "gtest/gtest-spi.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// Declares the flags.
+//
+// We don't want the users to modify this flag in the code, but want
+// Google Test's own unit tests to be able to access it.  Therefore we
+// declare it here as opposed to in gtest.h.
+GTEST_DECLARE_bool_(death_test_use_fork);
+
+namespace internal {
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
+const char kBreakOnFailureFlag[] = "break_on_failure";
+const char kCatchExceptionsFlag[] = "catch_exceptions";
+const char kColorFlag[] = "color";
+const char kFilterFlag[] = "filter";
+const char kListTestsFlag[] = "list_tests";
+const char kOutputFlag[] = "output";
+const char kPrintTimeFlag[] = "print_time";
+const char kPrintUTF8Flag[] = "print_utf8";
+const char kRandomSeedFlag[] = "random_seed";
+const char kRepeatFlag[] = "repeat";
+const char kShuffleFlag[] = "shuffle";
+const char kStackTraceDepthFlag[] = "stack_trace_depth";
+const char kStreamResultToFlag[] = "stream_result_to";
+const char kThrowOnFailureFlag[] = "throw_on_failure";
+const char kFlagfileFlag[] = "flagfile";
+
+// A valid random seed must be in [1, kMaxRandomSeed].
+const int kMaxRandomSeed = 99999;
+
+// g_help_flag is true if and only if the --help flag or an equivalent form
+// is specified on the command line.
+GTEST_API_ extern bool g_help_flag;
+
+// Returns the current time in milliseconds.
+GTEST_API_ TimeInMillis GetTimeInMillis();
+
+// Returns true if and only if Google Test should use colors in the output.
+GTEST_API_ bool ShouldUseColor(bool stdout_is_tty);
+
+// Formats the given time in milliseconds as seconds.
+GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms);
+
+// Converts the given time in milliseconds to a date string in the ISO 8601
+// format, without the timezone information.  N.B.: due to the use of the
+// non-reentrant localtime() function, this function is not thread safe.  Do
+// not use it in any code that can be called from multiple threads.
+GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
+
+// Parses a string for an Int32 flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+GTEST_API_ bool ParseInt32Flag(const char *str, const char *flag,
+                               int32_t *value);
+
+// Returns a random seed in range [1, kMaxRandomSeed] based on the
+// given --gtest_random_seed flag value.
+inline int GetRandomSeedFromFlag(int32_t random_seed_flag) {
+  const unsigned int raw_seed =
+      (random_seed_flag == 0) ? static_cast<unsigned int>(GetTimeInMillis())
+                              : static_cast<unsigned int>(random_seed_flag);
+
+  // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
+  // it's easy to type.
+  const int normalized_seed =
+      static_cast<int>((raw_seed - 1U) %
+                       static_cast<unsigned int>(kMaxRandomSeed)) +
+      1;
+  return normalized_seed;
+}
+
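The normalization arithmetic above is a wrap-around into [1, kMaxRandomSeed]. The same expression applied to a few sample raw seeds:

```cpp
#include <iostream>

int main() {
  const int kMaxRandomSeed = 99999;
  // Mirror of GetRandomSeedFromFlag()'s arithmetic; note how a raw seed of
  // 100000 wraps around to 1.
  for (unsigned int raw_seed : {1u, 99999u, 100000u}) {
    const int normalized =
        static_cast<int>((raw_seed - 1U) %
                         static_cast<unsigned int>(kMaxRandomSeed)) +
        1;
    std::cout << raw_seed << " -> " << normalized << "\n";  // 1, 99999, 1
  }
}
```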
+// Returns the first valid random seed after 'seed'.  The behavior is
+// undefined if 'seed' is invalid.  The seed after kMaxRandomSeed is
+// considered to be 1.
+inline int GetNextRandomSeed(int seed) {
+  GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed)
+      << "Invalid random seed " << seed << " - must be in [1, "
+      << kMaxRandomSeed << "].";
+  const int next_seed = seed + 1;
+  return (next_seed > kMaxRandomSeed) ? 1 : next_seed;
+}
+
+// This class saves the values of all Google Test flags in its c'tor, and
+// restores them in its d'tor.
+class GTestFlagSaver {
+ public:
+  // The c'tor.
+  GTestFlagSaver() {
+    also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests);
+    break_on_failure_ = GTEST_FLAG(break_on_failure);
+    catch_exceptions_ = GTEST_FLAG(catch_exceptions);
+    color_ = GTEST_FLAG(color);
+    death_test_style_ = GTEST_FLAG(death_test_style);
+    death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
+    filter_ = GTEST_FLAG(filter);
+    internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
+    list_tests_ = GTEST_FLAG(list_tests);
+    output_ = GTEST_FLAG(output);
+    print_time_ = GTEST_FLAG(print_time);
+    print_utf8_ = GTEST_FLAG(print_utf8);
+    random_seed_ = GTEST_FLAG(random_seed);
+    repeat_ = GTEST_FLAG(repeat);
+    shuffle_ = GTEST_FLAG(shuffle);
+    stack_trace_depth_ = GTEST_FLAG(stack_trace_depth);
+    stream_result_to_ = GTEST_FLAG(stream_result_to);
+    throw_on_failure_ = GTEST_FLAG(throw_on_failure);
+  }
+
+  // The d'tor is not virtual.  DO NOT INHERIT FROM THIS CLASS.
+  ~GTestFlagSaver() {
+    GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
+    GTEST_FLAG(break_on_failure) = break_on_failure_;
+    GTEST_FLAG(catch_exceptions) = catch_exceptions_;
+    GTEST_FLAG(color) = color_;
+    GTEST_FLAG(death_test_style) = death_test_style_;
+    GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
+    GTEST_FLAG(filter) = filter_;
+    GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
+    GTEST_FLAG(list_tests) = list_tests_;
+    GTEST_FLAG(output) = output_;
+    GTEST_FLAG(print_time) = print_time_;
+    GTEST_FLAG(print_utf8) = print_utf8_;
+    GTEST_FLAG(random_seed) = random_seed_;
+    GTEST_FLAG(repeat) = repeat_;
+    GTEST_FLAG(shuffle) = shuffle_;
+    GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
+    GTEST_FLAG(stream_result_to) = stream_result_to_;
+    GTEST_FLAG(throw_on_failure) = throw_on_failure_;
+  }
+
+ private:
+  // Fields for saving the original values of flags.
+  bool also_run_disabled_tests_;
+  bool break_on_failure_;
+  bool catch_exceptions_;
+  std::string color_;
+  std::string death_test_style_;
+  bool death_test_use_fork_;
+  std::string filter_;
+  std::string internal_run_death_test_;
+  bool list_tests_;
+  std::string output_;
+  bool print_time_;
+  bool print_utf8_;
+  int32_t random_seed_;
+  int32_t repeat_;
+  bool shuffle_;
+  int32_t stack_trace_depth_;
+  std::string stream_result_to_;
+  bool throw_on_failure_;
+} GTEST_ATTRIBUTE_UNUSED_;
+
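GTestFlagSaver is scoped save/restore in the RAII style: snapshot in the constructor, write back in the destructor. Because the class lives in this internal header, user code cannot name it directly; the self-contained mirror below (with a plain struct standing in for the gtest flags) shows the same mechanism.

```cpp
// A self-contained mirror of GTestFlagSaver's save/restore idea.
struct Flags { int repeat = 1; bool shuffle = false; };
Flags g_flags;

class FlagSaver {
 public:
  FlagSaver() : saved_(g_flags) {}       // snapshot in the c'tor
  ~FlagSaver() { g_flags = saved_; }     // restore in the d'tor
 private:
  Flags saved_;
};

int main() {
  {
    FlagSaver saver;
    g_flags.repeat = 5;                  // scribble on the flags freely...
    g_flags.shuffle = true;
  }                                      // ...snapshot written back here
  return g_flags.repeat == 1 ? 0 : 1;
}
```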
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+GTEST_API_ std::string CodePointToUtf8(uint32_t code_point);
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed.  -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'.  If the string is in UTF-16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from the Basic
+// Multilingual Plane.
+GTEST_API_ std::string WideStringToUtf8(const wchar_t *str, int num_chars);
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present.  If a file already exists at this location, this
+// function will write over it.  If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded();
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values.  If the variables are present,
+// but inconsistent (e.g., shard_index >= total_shards), prints
+// an error and exits.  If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process.  Otherwise, we could filter out death tests we intended to execute.
+GTEST_API_ bool ShouldShard(const char *total_shards_str,
+                            const char *shard_index_str,
+                            bool in_subprocess_for_death_test);
+
+// Parses the environment variable var as a 32-bit integer.  If it is unset,
+// returns default_val.  If it is not a 32-bit integer, prints an error
+// and aborts.
+GTEST_API_ int32_t Int32FromEnvOrDie(const char *env_var, int32_t default_val);
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true if and only if the test should be run on this shard.  The test
+// id is some arbitrary but unique non-negative integer assigned to each test
+// method.  Assumes that 0 <= shard_index < total_shards.
+GTEST_API_ bool ShouldRunTestOnShard(int total_shards, int shard_index,
+                                     int test_id);
+
+// STL container utilities.
+
+// Returns the number of elements in the given container that satisfy
+// the given predicate.
+template <class Container, typename Predicate>
+inline int CountIf(const Container &c, Predicate predicate) {
+  // Implemented as an explicit loop since std::count_if() in libCstd on
+  // Solaris has a non-standard signature.
+  int count = 0;
+  for (typename Container::const_iterator it = c.begin(); it != c.end();
+       ++it) {
+    if (predicate(*it)) ++count;
+  }
+  return count;
+}
+
+// Applies a function/functor to each element in the container.
+template <class Container, typename Functor>
+void ForEach(const Container &c, Functor functor) {
+  std::for_each(c.begin(), c.end(), functor);
+}
+
+// Returns the i-th element of the vector, or default_value if i is not
+// in range [0, v.size()).
+template <typename E>
+inline E GetElementOr(const std::vector<E> &v, int i, E default_value) {
+  return (i < 0 || i >= static_cast<int>(v.size()))
+             ? default_value
+             : v[static_cast<size_t>(i)];
+}
+
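ShouldRunTestOnShard() declared above reduces to a modulus check: each test id lands on exactly one shard. A runnable sketch of that rule:

```cpp
#include <iostream>

// Mirrors the documented contract: test ids are assigned round-robin.
bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
  return (test_id % total_shards) == shard_index;
}

int main() {
  // With GTEST_TOTAL_SHARDS=3 and GTEST_SHARD_INDEX=1, ids 1 and 4 run here.
  for (int id = 0; id < 6; ++id)
    std::cout << "test " << id << ": " << ShouldRunTestOnShard(3, 1, id)
              << "\n";
}
```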
+// Performs an in-place shuffle of a range of the vector's elements.
+// 'begin' and 'end' are element indices as an STL-style range;
+// i.e. [begin, end) are shuffled, where 'end' == size() means to
+// shuffle to the end of the vector.
+template <typename E>
+void ShuffleRange(internal::Random *random, int begin, int end,
+                  std::vector<E> *v) {
+  const int size = static_cast<int>(v->size());
+  GTEST_CHECK_(0 <= begin && begin <= size)
+      << "Invalid shuffle range start " << begin << ": must be in range [0, "
+      << size << "].";
+  GTEST_CHECK_(begin <= end && end <= size)
+      << "Invalid shuffle range finish " << end << ": must be in range ["
+      << begin << ", " << size << "].";
+
+  // Fisher-Yates shuffle, from
+  // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
+  for (int range_width = end - begin; range_width >= 2; range_width--) {
+    const int last_in_range = begin + range_width - 1;
+    const int selected =
+        begin +
+        static_cast<int>(random->Generate(static_cast<uint32_t>(range_width)));
+    std::swap((*v)[static_cast<size_t>(selected)],
+              (*v)[static_cast<size_t>(last_in_range)]);
+  }
+}
+
+// Performs an in-place shuffle of the vector's elements.
+template <typename E>
+inline void Shuffle(internal::Random *random, std::vector<E> *v) {
+  ShuffleRange(random, 0, static_cast<int>(v->size()), v);
+}
+
+// A function for deleting an object.  Handy for being used as a
+// functor.
+template <typename T>
+static void Delete(T *x) {
+  delete x;
+}
+
+// A predicate that checks the key of a TestProperty against a known key.
+//
+// TestPropertyKeyIs is copyable.
+class TestPropertyKeyIs {
+ public:
+  // Constructor.
+  //
+  // TestPropertyKeyIs has NO default constructor.
+  explicit TestPropertyKeyIs(const std::string &key) : key_(key) {}
+
+  // Returns true if and only if the key of the test property matches key_.
+  bool operator()(const TestProperty &test_property) const {
+    return test_property.key() == key_;
+  }
+
+ private:
+  std::string key_;
+};
+
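The same back-to-front Fisher-Yates pass as ShuffleRange() above, freestanding, with `<random>` standing in for gtest's internal::Random:

```cpp
#include <cstddef>
#include <iostream>
#include <random>
#include <vector>

int main() {
  std::vector<int> v = {0, 1, 2, 3, 4};
  std::mt19937 rng(12345);
  // Pick a random element in [0, width) and swap it to the back of the
  // shrinking range, exactly as the loop in ShuffleRange() does.
  for (int width = static_cast<int>(v.size()); width >= 2; --width) {
    const int last = width - 1;
    std::uniform_int_distribution<int> pick(0, last);
    std::swap(v[static_cast<size_t>(pick(rng))],
              v[static_cast<size_t>(last)]);
  }
  for (int x : v) std::cout << x << ' ';
  std::cout << "\n";
}
```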
+// Class UnitTestOptions.
+//
+// This class contains functions for processing options the user
+// specifies when running the tests.  It has only static members.
+//
+// In most cases, the user can specify an option using either an
+// environment variable or a command line flag.  E.g. you can set the
+// test filter using either GTEST_FILTER or --gtest_filter.  If both
+// the variable and the flag are present, the latter overrides the
+// former.
+class GTEST_API_ UnitTestOptions {
+ public:
+  // Functions for processing the gtest_output flag.
+
+  // Returns the output format, or "" for normal printed output.
+  static std::string GetOutputFormat();
+
+  // Returns the absolute path of the requested output file, or the
+  // default (test_detail.xml in the original working directory) if
+  // none was explicitly specified.
+  static std::string GetAbsolutePathToOutputFile();
+
+  // Functions for processing the gtest_filter flag.
+
+  // Returns true if and only if the wildcard pattern matches the string.
+  // The first ':' or '\0' character in pattern marks the end of it.
+  //
+  // This recursive algorithm isn't very efficient, but is clear and
+  // works well enough for matching test names, which are short.
+  static bool PatternMatchesString(const char *pattern, const char *str);
+
+  // Returns true if and only if the user-specified filter matches the test
+  // suite name and the test name.
+  static bool FilterMatchesTest(const std::string &test_suite_name,
+                                const std::string &test_name);
+
+#if GTEST_OS_WINDOWS
+  // Function for supporting the gtest_catch_exception flag.
+
+  // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+  // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+  // This function is useful as an __except condition.
+  static int GTestShouldProcessSEH(DWORD exception_code);
+#endif  // GTEST_OS_WINDOWS
+
+  // Returns true if "name" matches the ':' separated list of glob-style
+  // filters in "filter".
+  static bool MatchesFilter(const std::string &name, const char *filter);
+};
+
+// Returns the current application's name, removing directory path if that
+// is present.  Used by UnitTestOptions::GetOutputFile.
+GTEST_API_ FilePath GetCurrentExecutableName();
+
+// The role interface for getting the OS stack trace as a string.
+class OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetterInterface() {}
+  virtual ~OsStackTraceGetterInterface() {}
+
+  // Returns the current OS stack trace as an std::string.  Parameters:
+  //
+  //   max_depth  - the maximum number of stack frames to be included
+  //                in the trace.
+  //   skip_count - the number of top frames to be skipped; doesn't count
+  //                against max_depth.
+  virtual std::string CurrentStackTrace(int max_depth, int skip_count) = 0;
+
+  // UponLeavingGTest() should be called immediately before Google Test calls
+  // user code.  It saves some information about the current stack that
+  // CurrentStackTrace() will use to find and hide Google Test stack frames.
+  virtual void UponLeavingGTest() = 0;
+
+  // This string is inserted in place of stack frames that are part of
+  // Google Test's implementation.
+  static const char *const kElidedFramesMarker;
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
+};
+
+// A working implementation of the OsStackTraceGetterInterface interface.
+class OsStackTraceGetter : public OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetter() {}
+
+  std::string CurrentStackTrace(int max_depth, int skip_count) override;
+  void UponLeavingGTest() override;
+
+ private:
+#if GTEST_HAS_ABSL
+  Mutex mutex_;  // Protects all internal state.
+
+  // We save the stack frame below the frame that calls user code.
+  // We do this because the address of the frame immediately below
+  // the user code changes between the call to UponLeavingGTest()
+  // and any calls to the stack trace code from within the user code.
+  void *caller_frame_ = nullptr;
+#endif  // GTEST_HAS_ABSL
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
+};
+
+// Information about a Google Test trace point.
+struct TraceInfo {
+  const char *file;
+  int line;
+  std::string message;
+};
+
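UnitTestOptions::PatternMatchesString declared above is the recursive wildcard matcher behind --gtest_filter: '?' matches one character, '*' any string, and ':' terminates a pattern. A freestanding version of the same algorithm:

```cpp
#include <iostream>

bool PatternMatches(const char *pattern, const char *str) {
  switch (*pattern) {
    case '\0':
    case ':':
      return *str == '\0';
    case '?':  // any single character
      return *str != '\0' && PatternMatches(pattern + 1, str + 1);
    case '*':  // any string, including the empty one
      return (*str != '\0' && PatternMatches(pattern, str + 1)) ||
             PatternMatches(pattern + 1, str);
    default:
      return *pattern == *str && PatternMatches(pattern + 1, str + 1);
  }
}

int main() {
  std::cout << PatternMatches("Foo*.?ar", "FooTest.Bar") << "\n";     // 1
  std::cout << PatternMatches("Foo*.?ar", "FooTest.Bazaar") << "\n";  // 0
}
```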
+// This is the default global test part result reporter used in UnitTestImpl.
+// This class should only be used by UnitTestImpl.
+class DefaultGlobalTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl *unit_test);
+  // Implements the TestPartResultReporterInterface.  Reports the test part
+  // result in the current test.
+  void ReportTestPartResult(const TestPartResult &result) override;
+
+ private:
+  UnitTestImpl *const unit_test_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
+};
+
+// This is the default per thread test part result reporter used in
+// UnitTestImpl.  This class should only be used by UnitTestImpl.
+class DefaultPerThreadTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl *unit_test);
+  // Implements the TestPartResultReporterInterface.  The implementation just
+  // delegates to the current global test part result reporter of *unit_test_.
+  void ReportTestPartResult(const TestPartResult &result) override;
+
+ private:
+  UnitTestImpl *const unit_test_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
+};
+
+// The private implementation of the UnitTest class.  We don't protect
+// the methods under a mutex, as this class is not accessible by a
+// user and the UnitTest class that delegates work to this class does
+// proper locking.
+class GTEST_API_ UnitTestImpl {
+ public:
+  explicit UnitTestImpl(UnitTest *parent);
+  virtual ~UnitTestImpl();
+
+  // There are two different ways to register your own TestPartResultReporter.
+  // You can register your own reporter to listen either only for test results
+  // from the current thread or for results from all threads.
+  // By default, each per-thread test result reporter just passes a new
+  // TestPartResult to the global test result reporter, which registers the
+  // test part result for the currently running test.
+
+  // Returns the global test part result reporter.
+  TestPartResultReporterInterface *GetGlobalTestPartResultReporter();
+
+  // Sets the global test part result reporter.
+  void SetGlobalTestPartResultReporter(
+      TestPartResultReporterInterface *reporter);
+
+  // Returns the test part result reporter for the current thread.
+  TestPartResultReporterInterface *GetTestPartResultReporterForCurrentThread();
+
+  // Sets the test part result reporter for the current thread.
+  void SetTestPartResultReporterForCurrentThread(
+      TestPartResultReporterInterface *reporter);
+
+  // Gets the number of successful test suites.
+  int successful_test_suite_count() const;
+
+  // Gets the number of failed test suites.
+  int failed_test_suite_count() const;
+
+  // Gets the number of all test suites.
+  int total_test_suite_count() const;
+
+  // Gets the number of all test suites that contain at least one test
+  // that should run.
+  int test_suite_to_run_count() const;
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of skipped tests.
+  int skipped_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns true if and only if the unit test passed (i.e. all test suites
+  // passed).
+  bool Passed() const { return !Failed(); }
+
+  // Returns true if and only if the unit test failed (i.e. some test suite
+  // failed or something outside of all tests failed).
+  bool Failed() const {
+    return failed_test_suite_count() > 0 || ad_hoc_test_result()->Failed();
+  }
+
+  // Gets the i-th test suite among all the test suites.  i can range from 0 to
+  // total_test_suite_count() - 1.  If i is not in that range, returns NULL.
+  const TestSuite *GetTestSuite(int i) const {
+    const int index = GetElementOr(test_suite_indices_, i, -1);
+    return index < 0 ? nullptr : test_suites_[static_cast<size_t>(i)];
+  }
+
+  //  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  const TestCase *GetTestCase(int i) const { return GetTestSuite(i); }
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Gets the i-th test suite among all the test suites.  i can range from 0 to
+  // total_test_suite_count() - 1.  If i is not in that range, returns NULL.
+  TestSuite *GetMutableSuiteCase(int i) {
+    const int index = GetElementOr(test_suite_indices_, i, -1);
+    return index < 0 ? nullptr : test_suites_[static_cast<size_t>(index)];
+  }
+
+  // Provides access to the event listener list.
+  TestEventListeners *listeners() { return &listeners_; }
+
+  // Returns the TestResult for the test that's currently running, or
+  // the TestResult for the ad hoc test if no test is running.
+  TestResult *current_test_result();
+
+  // Returns the TestResult for the ad hoc test.
+  const TestResult *ad_hoc_test_result() const { return &ad_hoc_test_result_; }
+
+  // Sets the OS stack trace getter.
+  //
+  // Does nothing if the input and the current OS stack trace getter
+  // are the same; otherwise, deletes the old getter and makes the
+  // input the current getter.
+  void set_os_stack_trace_getter(OsStackTraceGetterInterface *getter);
+
+  // Returns the current OS stack trace getter if it is not NULL;
+  // otherwise, creates an OsStackTraceGetter, makes it the current
+  // getter, and returns it.
+  OsStackTraceGetterInterface *os_stack_trace_getter();
+
+  // Returns the current OS stack trace as an std::string.
+  //
+  // The maximum number of stack frames to be included is specified by
+  // the gtest_stack_trace_depth flag.  The skip_count parameter
+  // specifies the number of top frames to be skipped, which doesn't
+  // count against the number of frames to be included.
+  //
+  // For example, if Foo() calls Bar(), which in turn calls
+  // CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+  // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+  std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_;
+
+  // Finds and returns a TestSuite with the given name.  If one doesn't
+  // exist, creates one and returns it.
+  //
+  // Arguments:
+  //
+  //   test_suite_name: name of the test suite
+  //   type_param:      the name of the test's type parameter, or NULL if
+  //                    this is not a typed or a type-parameterized test.
+  //   set_up_tc:       pointer to the function that sets up the test suite
+  //   tear_down_tc:    pointer to the function that tears down the test suite
+  TestSuite *GetTestSuite(const char *test_suite_name, const char *type_param,
+                          internal::SetUpTestSuiteFunc set_up_tc,
+                          internal::TearDownTestSuiteFunc tear_down_tc);
+
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  TestCase *GetTestCase(const char *test_case_name, const char *type_param,
+                        internal::SetUpTestSuiteFunc set_up_tc,
+                        internal::TearDownTestSuiteFunc tear_down_tc) {
+    return GetTestSuite(test_case_name, type_param, set_up_tc, tear_down_tc);
+  }
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Adds a TestInfo to the unit test.
+  //
+  // Arguments:
+  //
+  //   set_up_tc:    pointer to the function that sets up the test suite
+  //   tear_down_tc: pointer to the function that tears down the test suite
+  //   test_info:    the TestInfo object
+  void AddTestInfo(internal::SetUpTestSuiteFunc set_up_tc,
+                   internal::TearDownTestSuiteFunc tear_down_tc,
+                   TestInfo *test_info) {
+    // In order to support thread-safe death tests, we need to
+    // remember the original working directory when the test program
+    // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
+    // the user may have changed the current directory before calling
+    // RUN_ALL_TESTS().  Therefore we capture the current directory in
+    // AddTestInfo(), which is called to register a TEST or TEST_F
+    // before main() is reached.
+    if (original_working_dir_.IsEmpty()) {
+      original_working_dir_.Set(FilePath::GetCurrentDir());
+      GTEST_CHECK_(!original_working_dir_.IsEmpty())
+          << "Failed to get the current working directory.";
+    }
+
+    GetTestSuite(test_info->test_suite_name(), test_info->type_param(),
+                 set_up_tc, tear_down_tc)
+        ->AddTestInfo(test_info);
+  }
+
+  // Returns ParameterizedTestSuiteRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  internal::ParameterizedTestSuiteRegistry &parameterized_test_registry() {
+    return parameterized_test_registry_;
+  }
+
+  std::set<std::string> *ignored_parameterized_test_suites() {
+    return &ignored_parameterized_test_suites_;
+  }
+
+  // Returns TypeParameterizedTestSuiteRegistry object used to keep track of
+  // type-parameterized tests and instantiations of them.
+  internal::TypeParameterizedTestSuiteRegistry &
+  type_parameterized_test_registry() {
+    return type_parameterized_test_registry_;
+  }
+
+  // Sets the TestSuite object for the test that's currently running.
+  void set_current_test_suite(TestSuite *a_current_test_suite) {
+    current_test_suite_ = a_current_test_suite;
+  }
+
+  // Sets the TestInfo object for the test that's currently running.  If
+  // current_test_info is NULL, the assertion results will be stored in
+  // ad_hoc_test_result_.
+  void set_current_test_info(TestInfo *a_current_test_info) {
+    current_test_info_ = a_current_test_info;
+  }
+
+  // Registers all parameterized tests defined using TEST_P and
+  // INSTANTIATE_TEST_SUITE_P, creating regular tests for each test/parameter
+  // combination.  This method can be called more than once; it has guards
+  // protecting from registering the tests more than once.  If
+  // value-parameterized tests are disabled, RegisterParameterizedTests is
+  // present but does nothing.
+  void RegisterParameterizedTests();
+
+  // Runs all tests in this UnitTest object, prints the result, and
+  // returns true if all tests are successful.  If any exception is
+  // thrown during a test, this test is considered to be failed, but
+  // the rest of the tests will still be run.
+  bool RunAllTests();
+
+  // Clears the results of all tests, except the ad hoc tests.
+  void ClearNonAdHocTestResult() {
+    ForEach(test_suites_, TestSuite::ClearTestSuiteResult);
+  }
+
+  // Clears the results of ad-hoc test assertions.
+  void ClearAdHocTestResult() { ad_hoc_test_result_.Clear(); }
+
+  // Adds a TestProperty to the current TestResult object when invoked in a
+  // context of a test or a test suite, or to the global property set.  If the
+  // result already contains a property with the same key, the value will be
+  // updated.
+  void RecordProperty(const TestProperty &test_property);
+
+  enum ReactionToSharding { HONOR_SHARDING_PROTOCOL, IGNORE_SHARDING_PROTOCOL };
+
+  // Matches the full name of each test against the user-specified
+  // filter to decide whether the test should run, then records the
+  // result in each TestSuite and TestInfo object.
+  // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
+  // based on sharding variables in the environment.
+  // Returns the number of tests that should run.
+  int FilterTests(ReactionToSharding shard_tests);
+
+  // Prints the names of the tests matching the user-specified filter flag.
+  void ListTestsMatchingFilter();
+
+  const TestSuite *current_test_suite() const { return current_test_suite_; }
+  TestInfo *current_test_info() { return current_test_info_; }
+  const TestInfo *current_test_info() const { return current_test_info_; }
+
+  // Returns the vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment *> &environments() { return environments_; }
+
+  // Getters for the per-thread Google Test trace stack.
+  std::vector<TraceInfo> &gtest_trace_stack() {
+    return *(gtest_trace_stack_.pointer());
+  }
+  const std::vector<TraceInfo> &gtest_trace_stack() const {
+    return gtest_trace_stack_.get();
+  }
+
+#if GTEST_HAS_DEATH_TEST
+  void InitDeathTestSubprocessControlInfo() {
+    internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
+  }
+  // Returns a pointer to the parsed --gtest_internal_run_death_test
+  // flag, or NULL if that flag was not specified.
+  // This information is useful only in a death test child process.
+  // Must not be called before a call to InitGoogleTest.
+  const InternalRunDeathTestFlag *internal_run_death_test_flag() const {
+    return internal_run_death_test_flag_.get();
+  }
+
+  // Returns a pointer to the current death test factory.
+  internal::DeathTestFactory *death_test_factory() {
+    return death_test_factory_.get();
+  }
+
+  void SuppressTestEventsIfInSubprocess();
+
+  friend class ReplaceDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // Initializes the event listener performing XML output as specified by
+  // UnitTestOptions.  Must not be called before InitGoogleTest.
+  void ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+  // Initializes the event listener for streaming test results to a socket.
+  // Must not be called before InitGoogleTest.
+  void ConfigureStreamingOutput();
+#endif
+
+  // Performs initialization dependent upon flag values obtained in
+  // ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+  // ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
+  // this function is also called from RunAllTests.  Since this function can be
+  // called more than once, it has to be idempotent.
+  void PostFlagParsingInit();
+
+  // Gets the random seed used at the start of the current test iteration.
+  int random_seed() const { return random_seed_; }
+
+  // Gets the random number generator.
+  internal::Random *random() { return &random_; }
+
+  // Shuffles all test suites, and the tests within each test suite,
+  // making sure that death tests are still run first.
+  void ShuffleTests();
+
+  // Restores the test suites and tests to their order before the first
+  // shuffle.
+  void UnshuffleTests();
+
+  // Returns the value of GTEST_FLAG(catch_exceptions) at the moment
+  // UnitTest::Run() starts.
+  bool catch_exceptions() const { return catch_exceptions_; }
+
+ private:
+  friend class ::testing::UnitTest;
+
+  // Used by UnitTest::Run() to capture the state of
+  // GTEST_FLAG(catch_exceptions) at the moment it starts.
+  void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
+
+  // The UnitTest object that owns this implementation object.
+  UnitTest *const parent_;
+
+  // The working directory when the first TEST() or TEST_F() was
+  // executed.
+  internal::FilePath original_working_dir_;
+
+  // The default test part result reporters.
+  DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_;
+  DefaultPerThreadTestPartResultReporter
+      default_per_thread_test_part_result_reporter_;
+
+  // Points to (but doesn't own) the global test part result reporter.
+  TestPartResultReporterInterface *global_test_part_result_repoter_;
+
+  // Protects read and write access to global_test_part_result_reporter_.
+  internal::Mutex global_test_part_result_reporter_mutex_;
+
+  // Points to (but doesn't own) the per-thread test part result reporter.
+  internal::ThreadLocal<TestPartResultReporterInterface *>
+      per_thread_test_part_result_reporter_;
+
+  // The vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment *> environments_;
+
+  // The vector of TestSuites in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestSuite *> test_suites_;
+
+  // Provides a level of indirection for the test suite list to allow
+  // easy shuffling and restoring the test suite order.  The i-th
+  // element of this vector is the index of the i-th test suite in the
+  // shuffled order.
+  std::vector<int> test_suite_indices_;
+
+  // ParameterizedTestRegistry object used to register value-parameterized
+  // tests.
+  internal::ParameterizedTestSuiteRegistry parameterized_test_registry_;
+  internal::TypeParameterizedTestSuiteRegistry
+      type_parameterized_test_registry_;
+
+  // The set holding the name of parameterized
+  // test suites that may go uninstantiated.
+  std::set<std::string> ignored_parameterized_test_suites_;
+
+  // Indicates whether RegisterParameterizedTests() has been called already.
+  bool parameterized_tests_registered_;
+
+  // Index of the last death test suite registered.  Initially -1.
+  int last_death_test_suite_;
+
+  // This points to the TestSuite for the currently running test.  It
+  // changes as Google Test goes through one test suite after another.
+  // When no test is running, this is set to NULL and Google Test
+  // stores assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestSuite *current_test_suite_;
+
+  // This points to the TestInfo for the currently running test.  It
+  // changes as Google Test goes through one test after another.  When
+  // no test is running, this is set to NULL and Google Test stores
+  // assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestInfo *current_test_info_;
+
+  // Normally, a user only writes assertions inside a TEST or TEST_F,
+  // or inside a function called by a TEST or TEST_F.  Since Google
+  // Test keeps track of which test is currently running, it can
+  // associate such an assertion with the test it belongs to.
+  //
+  // If an assertion is encountered when no TEST or TEST_F is running,
+  // Google Test attributes the assertion result to an imaginary "ad hoc"
+  // test, and records the result in ad_hoc_test_result_.
+  TestResult ad_hoc_test_result_;
+
+  // The list of event listeners that can be used to track events inside
+  // Google Test.
+  TestEventListeners listeners_;
+
+  // The OS stack trace getter.  Will be deleted when the UnitTest
+  // object is destructed.  By default, an OsStackTraceGetter is used,
+  // but the user can set this field to use a custom getter if that is
+  // desired.
+  OsStackTraceGetterInterface *os_stack_trace_getter_;
+
+  // True if and only if PostFlagParsingInit() has been called.
+  bool post_flag_parse_init_performed_;
+
+  // The random number seed used at the beginning of the test run.
+  int random_seed_;
+
+  // Our random number generator.
+  internal::Random random_;
+
+  // The time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp_;
+
+  // How long the test took to run, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+#if GTEST_HAS_DEATH_TEST
+  // The decomposed components of the gtest_internal_run_death_test flag,
+  // parsed when RUN_ALL_TESTS is called.
+  std::unique_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
+  std::unique_ptr<internal::DeathTestFactory> death_test_factory_;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // A per-thread stack of traces created by the SCOPED_TRACE() macro.
+  internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_;
+
+  // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests()
+  // starts.
+  bool catch_exceptions_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
+};  // class UnitTestImpl
+
+// Convenience function for accessing the global UnitTest
+// implementation object.
+inline UnitTestImpl *GetUnitTestImpl() {
+  return UnitTest::GetInstance()->impl();
+}
+
+#if GTEST_USES_SIMPLE_RE
+
+// Internal helper functions for implementing the simple regular
+// expression matcher.
+GTEST_API_ bool IsInSet(char ch, const char *str);
+GTEST_API_ bool IsAsciiDigit(char ch);
+GTEST_API_ bool IsAsciiPunct(char ch);
+GTEST_API_ bool IsRepeat(char ch);
+GTEST_API_ bool IsAsciiWhiteSpace(char ch);
+GTEST_API_ bool IsAsciiWordChar(char ch);
+GTEST_API_ bool IsValidEscape(char ch);
+GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
+GTEST_API_ bool ValidateRegex(const char *regex);
+GTEST_API_ bool MatchRegexAtHead(const char *regex, const char *str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(bool escaped, char ch,
+                                              char repeat, const char *regex,
+                                              const char *str);
+GTEST_API_ bool MatchRegexAnywhere(const char *regex, const char *str);
+
+#endif  // GTEST_USES_SIMPLE_RE
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+GTEST_API_ void ParseGoogleTestFlagsOnly(int *argc, char **argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int *argc, wchar_t **argv);
+
+#if GTEST_HAS_DEATH_TEST
+
+// Returns the message describing the last system error, regardless of the
+// platform.
+GTEST_API_ std::string GetLastErrnoDescription();
+
+  using BiggestConvertible = unsigned long long;  // NOLINT
+
+  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);  // NOLINT
+  const bool parse_success = *end == '\0' && errno == 0;
+
+  GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
+
+  const Integer result = static_cast<Integer>(parsed);
+  if (parse_success && static_cast<BiggestConvertible>(result) == parsed) {
+    *number = result;
+    return true;
+  }
+  return false;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// TestResult contains some private methods that should be hidden from
+// Google Test user but are required for testing.  This class allows our tests
+// to access them.
+//
+// This class is supplied only for the purpose of testing Google Test's own
+// constructs.  Do not use it in user tests, either directly or indirectly.
+class TestResultAccessor {
+ public:
+  static void RecordProperty(TestResult *test_result,
+                             const std::string &xml_element,
+                             const TestProperty &property) {
+    test_result->RecordProperty(xml_element, property);
+  }
+
+  static void ClearTestPartResults(TestResult *test_result) {
+    test_result->ClearTestPartResults();
+  }
+
+  static const std::vector<TestPartResult> &test_part_results(
+      const TestResult &test_result) {
+    return test_result.test_part_results();
+  }
+};
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Streams test results to the given port on the given host machine.
+class StreamingListener : public EmptyTestEventListener {
+ public:
+  // Abstract base class for writing strings to a socket.
+  class AbstractSocketWriter {
+   public:
+    virtual ~AbstractSocketWriter() {}
+
+    // Sends a string to the socket.
+    virtual void Send(const std::string &message) = 0;
+
+    // Closes the socket.
+    virtual void CloseConnection() {}
+
+    // Sends a string and a newline to the socket.
+    void SendLn(const std::string &message) { Send(message + "\n"); }
+  };
+
+  // Concrete class for actually writing strings to a socket.
+  class SocketWriter : public AbstractSocketWriter {
+   public:
+    SocketWriter(const std::string &host, const std::string &port)
+        : sockfd_(-1), host_name_(host), port_num_(port) {
+      MakeConnection();
+    }
+
+    ~SocketWriter() override {
+      if (sockfd_ != -1) CloseConnection();
+    }
+
+    // Sends a string to the socket.
+    void Send(const std::string &message) override {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "Send() can be called only when there is a connection.";
+
+      const auto len = static_cast<size_t>(message.length());
+      if (write(sockfd_, message.c_str(), len) != static_cast<ssize_t>(len)) {
+        GTEST_LOG_(WARNING) << "stream_result_to: failed to stream to "
+                            << host_name_ << ":" << port_num_;
+      }
+    }
+
+   private:
+    // Creates a client socket and connects to the server.
+    void MakeConnection();
+
+    // Closes the socket.
+    void CloseConnection() override {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "CloseConnection() can be called only when there is a connection.";
+
+      close(sockfd_);
+      sockfd_ = -1;
+    }
+
+    int sockfd_;  // socket file descriptor
+    const std::string host_name_;
+    const std::string port_num_;
+
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter);
+  };  // class SocketWriter
+
+  // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
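+  // For example, assuming the "%xx" encoding described above,
+  // UrlEncode("a=b\n") would yield "a%3Db%0A".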
+  static std::string UrlEncode(const char *str);
+
+  StreamingListener(const std::string &host, const std::string &port)
+      : socket_writer_(new SocketWriter(host, port)) {
+    Start();
+  }
+
+  explicit StreamingListener(AbstractSocketWriter *socket_writer)
+      : socket_writer_(socket_writer) {
+    Start();
+  }
+
+  void OnTestProgramStart(const UnitTest & /* unit_test */) override {
+    SendLn("event=TestProgramStart");
+  }
+
+  void OnTestProgramEnd(const UnitTest &unit_test) override {
+    // Note that Google Test currently only reports elapsed time for each
+    // test iteration, not for the entire test program.
+    SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed()));
+
+    // Notify the streaming server to stop.
+    socket_writer_->CloseConnection();
+  }
+
+  void OnTestIterationStart(const UnitTest & /* unit_test */,
+                            int iteration) override {
+    SendLn("event=TestIterationStart&iteration=" +
+           StreamableToString(iteration));
+  }
+
+  void OnTestIterationEnd(const UnitTest &unit_test,
+                          int /* iteration */) override {
+    SendLn("event=TestIterationEnd&passed=" + FormatBool(unit_test.Passed()) +
+           "&elapsed_time=" + StreamableToString(unit_test.elapsed_time()) +
+           "ms");
+  }
+
+  // Note that "event=TestCaseStart" is a wire format and has to remain
+  // "case" for compatibility
+  void OnTestCaseStart(const TestCase &test_case) override {
+    SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
+  }
+
+  // Note that "event=TestCaseEnd" is a wire format and has to remain
+  // "case" for compatibility
+  void OnTestCaseEnd(const TestCase &test_case) override {
+    SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) +
+           "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) +
+           "ms");
+  }
+
+  void OnTestStart(const TestInfo &test_info) override {
+    SendLn(std::string("event=TestStart&name=") + test_info.name());
+  }
+
+  void OnTestEnd(const TestInfo &test_info) override {
+    SendLn("event=TestEnd&passed=" +
+           FormatBool((test_info.result())->Passed()) + "&elapsed_time=" +
+           StreamableToString((test_info.result())->elapsed_time()) + "ms");
+  }
+
+  void OnTestPartResult(const TestPartResult &test_part_result) override {
+    const char *file_name = test_part_result.file_name();
+    if (file_name == nullptr) file_name = "";
+    SendLn("event=TestPartResult&file=" + UrlEncode(file_name) +
+           "&line=" + StreamableToString(test_part_result.line_number()) +
+           "&message=" + UrlEncode(test_part_result.message()));
+  }
+
+ private:
+  // Sends the given message and a newline to the socket.
+  void SendLn(const std::string &message) { socket_writer_->SendLn(message); }
+
+  // Called at the start of streaming to notify the receiver what
+  // protocol we are using.
+  void Start() { SendLn("gtest_streaming_protocol_version=1.0"); }
+
+  std::string FormatBool(bool value) { return value ? "1" : "0"; }
+
+  const std::unique_ptr<AbstractSocketWriter> socket_writer_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
+};  // class StreamingListener
+
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+}  // namespace internal
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  // 4251
+
+#endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-matchers.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-matchers.cc
new file mode 100644
index 000000000..27aaa2b7c
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-matchers.cc
@@ -0,0 +1,97 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements just enough of the matcher interface to allow
+// EXPECT_DEATH and friends to accept a matcher argument.
+
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+#include "gtest/gtest-matchers.h"
+
+#include <string>
+
+namespace testing {
+
+// Constructs a matcher that matches a const std::string& whose value is
+// equal to s.
+Matcher<const std::string &>::Matcher(const std::string &s) { *this = Eq(s); }
+
+// Constructs a matcher that matches a const std::string& whose value is
+// equal to s.
+Matcher<const std::string &>::Matcher(const char *s) {
+  *this = Eq(std::string(s));
+}
+
+// Constructs a matcher that matches a std::string whose value is equal to
+// s.
+Matcher<std::string>::Matcher(const std::string &s) { *this = Eq(s); }
+
+// Constructs a matcher that matches a std::string whose value is equal to
+// s.
+Matcher<std::string>::Matcher(const char *s) { *this = Eq(std::string(s)); }
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+// Constructs a matcher that matches a const StringView& whose value is
+// equal to s.
+Matcher<const internal::StringView &>::Matcher(const std::string &s) {
+  *this = Eq(s);
+}
+
+// Constructs a matcher that matches a const StringView& whose value is
+// equal to s.
+Matcher<const internal::StringView &>::Matcher(const char *s) {
+  *this = Eq(std::string(s));
+}
+
+// Constructs a matcher that matches a const StringView& whose value is
+// equal to s.
+Matcher<const internal::StringView &>::Matcher(internal::StringView s) {
+  *this = Eq(std::string(s));
+}
+
+// Constructs a matcher that matches a StringView whose value is equal to
+// s.
+Matcher<internal::StringView>::Matcher(const std::string &s) { *this = Eq(s); }
+
+// Constructs a matcher that matches a StringView whose value is equal to
+// s.
+Matcher<internal::StringView>::Matcher(const char *s) {
+  *this = Eq(std::string(s));
+}
+
+// Constructs a matcher that matches a StringView whose value is equal to
+// s.
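+// For example (an illustrative sketch only):
+//   Matcher<internal::StringView> m(internal::StringView("abc"));
+//   // m matches exactly the string "abc".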
+Matcher<internal::StringView>::Matcher(internal::StringView s) {
+  *this = Eq(std::string(s));
+}
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+}  // namespace testing
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-port.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-port.cc
new file mode 100644
index 000000000..adfdbef9c
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-port.cc
@@ -0,0 +1,1361 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "gtest/internal/gtest-port.h"
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <cstdint>
+#include <fstream>
+#include <memory>
+
+#if GTEST_OS_WINDOWS
+#include <windows.h>
+#include <io.h>
+#include <sys/stat.h>
+#include <map>  // Used in ThreadLocal.
+#ifdef _MSC_VER
+#include <crtdbg.h>
+#endif  // _MSC_VER
+#else
+#include <unistd.h>
+#endif  // GTEST_OS_WINDOWS
+
+#if GTEST_OS_MAC
+#include <mach/mach_init.h>
+#include <mach/task.h>
+#include <mach/vm_map.h>
+#endif  // GTEST_OS_MAC
+
+#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
+    GTEST_OS_NETBSD || GTEST_OS_OPENBSD
+#include <sys/sysctl.h>
+#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
+#include <sys/user.h>
+#endif
+#endif
+
+#if GTEST_OS_QNX
+#include <devctl.h>
+#include <fcntl.h>
+#include <sys/procfs.h>
+#endif  // GTEST_OS_QNX
+
+#if GTEST_OS_AIX
+#include <procinfo.h>
+#include <sys/types.h>
+#endif  // GTEST_OS_AIX
+
+#if GTEST_OS_FUCHSIA
+#include <zircon/process.h>
+#include <zircon/syscalls.h>
+#endif  // GTEST_OS_FUCHSIA
+
+#include "gtest/gtest-spi.h"
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
+#include "src/gtest-internal-inl.h"
+
+namespace testing {
+namespace internal {
+
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
+const int kStdOutFileno = 1;
+const int kStdErrFileno = 2;
+#else
+const int kStdOutFileno = STDOUT_FILENO;
+const int kStdErrFileno = STDERR_FILENO;
+#endif  // _MSC_VER
+
+#if GTEST_OS_LINUX
+
+namespace {
+template <typename T>
+T ReadProcFileField(const std::string &filename, int field) {
+  std::string dummy;
+  std::ifstream file(filename.c_str());
+  while (field-- > 0) {
+    file >> dummy;
+  }
+  T output = 0;
+  file >> output;
+  return output;
+}
+}  // namespace
+
+// Returns the number of active threads, or 0 when there is an error.
+size_t GetThreadCount() {
+  const std::string filename =
+      (Message() << "/proc/" << getpid() << "/stat").GetString();
+  return ReadProcFileField<size_t>(filename, 19);
+}
+
+#elif GTEST_OS_MAC
+
+size_t GetThreadCount() {
+  const task_t task = mach_task_self();
+  mach_msg_type_number_t thread_count;
+  thread_act_array_t thread_list;
+  const kern_return_t status = task_threads(task, &thread_list, &thread_count);
+  if (status == KERN_SUCCESS) {
+    // task_threads allocates resources in thread_list and we need to free them
+    // to avoid leaks.
+    vm_deallocate(task, reinterpret_cast<vm_address_t>(thread_list),
+                  sizeof(thread_t) * thread_count);
+    return static_cast<size_t>(thread_count);
+  } else {
+    return 0;
+  }
+}
+
+#elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
+    GTEST_OS_NETBSD
+
+#if GTEST_OS_NETBSD
+#undef KERN_PROC
+#define KERN_PROC KERN_PROC2
+#define kinfo_proc kinfo_proc2
+#endif
+
+#if GTEST_OS_DRAGONFLY
+#define KP_NLWP(kp) (kp.kp_nthreads)
+#elif GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
+#define KP_NLWP(kp) (kp.ki_numthreads)
+#elif GTEST_OS_NETBSD
+#define KP_NLWP(kp) (kp.p_nlwps)
+#endif
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  int mib[] = {
+    CTL_KERN,
+    KERN_PROC,
+    KERN_PROC_PID,
+    getpid(),
+#if GTEST_OS_NETBSD
+    sizeof(struct kinfo_proc),
+    1,
+#endif
+  };
+  u_int miblen = sizeof(mib) / sizeof(mib[0]);
+  struct kinfo_proc info;
+  size_t size = sizeof(info);
+  if (sysctl(mib, miblen, &info, &size, NULL, 0)) {
+    return 0;
+  }
+  return static_cast<size_t>(KP_NLWP(info));
+}
+#elif GTEST_OS_OPENBSD
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  int mib[] = {
+    CTL_KERN,
+    KERN_PROC,
+    KERN_PROC_PID | KERN_PROC_SHOW_THREADS,
+    getpid(),
+    sizeof(struct kinfo_proc),
+    0,
+  };
+  u_int miblen = sizeof(mib) / sizeof(mib[0]);
+
+  // get number of structs
+  size_t size;
+  if (sysctl(mib, miblen, NULL, &size, NULL, 0)) {
+    return 0;
+  }
+  mib[5] = size / mib[4];
+
+  // populate array of structs
+  struct kinfo_proc info[mib[5]];
+  if (sysctl(mib, miblen, &info, &size, NULL, 0)) {
+    return 0;
+  }
+
+  // exclude empty members
+  int nthreads = 0;
+  for (int i = 0; i < size / mib[4]; i++) {
+    if (info[i].p_tid != -1) nthreads++;
+  }
+  return nthreads;
+}
+
+#elif GTEST_OS_QNX
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
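+// On QNX the count is read from the process's own address-space entry
+// (/proc/self/as) via devctl(DCMD_PROC_INFO), as shown below.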
+size_t GetThreadCount() {
+  const int fd = open("/proc/self/as", O_RDONLY);
+  if (fd < 0) {
+    return 0;
+  }
+  procfs_info process_info;
+  const int status =
+      devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), nullptr);
+  close(fd);
+  if (status == EOK) {
+    return static_cast<size_t>(process_info.num_threads);
+  } else {
+    return 0;
+  }
+}
+
+#elif GTEST_OS_AIX
+
+size_t GetThreadCount() {
+  struct procentry64 entry;
+  pid_t pid = getpid();
+  int status = getprocs64(&entry, sizeof(entry), nullptr, 0, &pid, 1);
+  if (status == 1) {
+    return entry.pi_thcount;
+  } else {
+    return 0;
+  }
+}
+
+#elif GTEST_OS_FUCHSIA
+
+size_t GetThreadCount() {
+  int dummy_buffer;
+  size_t avail;
+  zx_status_t status =
+      zx_object_get_info(zx_process_self(), ZX_INFO_PROCESS_THREADS,
+                         &dummy_buffer, 0, nullptr, &avail);
+  if (status == ZX_OK) {
+    return avail;
+  } else {
+    return 0;
+  }
+}
+
+#else
+
+size_t GetThreadCount() {
+  // There's no portable way to detect the number of threads, so we just
+  // return 0 to indicate that we cannot detect it.
+  return 0;
+}
+
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
+
+void SleepMilliseconds(int n) { ::Sleep(static_cast<DWORD>(n)); }
+
+AutoHandle::AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
+
+AutoHandle::AutoHandle(Handle handle) : handle_(handle) {}
+
+AutoHandle::~AutoHandle() { Reset(); }
+
+AutoHandle::Handle AutoHandle::Get() const { return handle_; }
+
+void AutoHandle::Reset() { Reset(INVALID_HANDLE_VALUE); }
+
+void AutoHandle::Reset(HANDLE handle) {
+  // Resetting with the same handle we already own is invalid.
+  if (handle_ != handle) {
+    if (IsCloseable()) {
+      ::CloseHandle(handle_);
+    }
+    handle_ = handle;
+  } else {
+    GTEST_CHECK_(!IsCloseable())
+        << "Resetting a valid handle to itself is likely a programmer error "
+           "and thus not allowed.";
+  }
+}
+
+bool AutoHandle::IsCloseable() const {
+  // Different Windows APIs may use either of these values to represent an
+  // invalid handle.
+  return handle_ != nullptr && handle_ != INVALID_HANDLE_VALUE;
+}
+
+Notification::Notification()
+    : event_(::CreateEvent(nullptr,   // Default security attributes.
+                           TRUE,      // Do not reset automatically.
+                           FALSE,     // Initially unset.
+                           nullptr)) {  // Anonymous event.
+  GTEST_CHECK_(event_.Get() != nullptr);
+}
+
+void Notification::Notify() { GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE); }
+
+void Notification::WaitForNotification() {
+  GTEST_CHECK_(::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0);
+}
+
+Mutex::Mutex()
+    : owner_thread_id_(0), type_(kDynamic), critical_section_init_phase_(0),
+      critical_section_(new CRITICAL_SECTION) {
+  ::InitializeCriticalSection(critical_section_);
+}
+
+Mutex::~Mutex() {
+  // Static mutexes are leaked intentionally. It is not thread-safe to try
+  // to clean them up.
+  if (type_ == kDynamic) {
+    ::DeleteCriticalSection(critical_section_);
+    delete critical_section_;
+    critical_section_ = nullptr;
+  }
+}
+
+void Mutex::Lock() {
+  ThreadSafeLazyInit();
+  ::EnterCriticalSection(critical_section_);
+  owner_thread_id_ = ::GetCurrentThreadId();
+}
+
+void Mutex::Unlock() {
+  ThreadSafeLazyInit();
+  // We don't protect writing to owner_thread_id_ here, as it's the
+  // caller's responsibility to ensure that the current thread holds the
+  // mutex when this is called.
+  owner_thread_id_ = 0;
+  ::LeaveCriticalSection(critical_section_);
+}
+
+// Does nothing if the current thread holds the mutex. Otherwise, crashes
+// with high probability.
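+// A typical use (sketch) is to document the locking contract of a helper
+// that must only run while the lock is held:
+//
+//   static Foo *GetFooLocked() {  // hypothetical helper
+//     mutex_.AssertHeld();
+//     ...
+//   }
+//
+// GetThreadLocalsMapLocked() below uses exactly this pattern.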
+void Mutex::AssertHeld() { + ThreadSafeLazyInit(); + GTEST_CHECK_(owner_thread_id_ == ::GetCurrentThreadId()) + << "The current thread is not holding the mutex @" << this; +} + +namespace { + +#ifdef _MSC_VER +// Use the RAII idiom to flag mem allocs that are intentionally never +// deallocated. The motivation is to silence the false positive mem leaks +// that are reported by the debug version of MS's CRT which can only detect +// if an alloc is missing a matching deallocation. +// Example: +// MemoryIsNotDeallocated memory_is_not_deallocated; +// critical_section_ = new CRITICAL_SECTION; +// +class MemoryIsNotDeallocated { + public: + MemoryIsNotDeallocated() : old_crtdbg_flag_(0) { + old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG); + // Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT + // doesn't report mem leak if there's no matching deallocation. + _CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF); + } + + ~MemoryIsNotDeallocated() { + // Restore the original _CRTDBG_ALLOC_MEM_DF flag + _CrtSetDbgFlag(old_crtdbg_flag_); + } + + private: + int old_crtdbg_flag_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(MemoryIsNotDeallocated); +}; +#endif // _MSC_VER + +} // namespace + +// Initializes owner_thread_id_ and critical_section_ in static mutexes. +void Mutex::ThreadSafeLazyInit() { + // Dynamic mutexes are initialized in the constructor. + if (type_ == kStatic) { + switch ( + ::InterlockedCompareExchange(&critical_section_init_phase_, 1L, 0L)) { + case 0: + // If critical_section_init_phase_ was 0 before the exchange, we + // are the first to test it and need to perform the initialization. + owner_thread_id_ = 0; + { + // Use RAII to flag that following mem alloc is never deallocated. +#ifdef _MSC_VER + MemoryIsNotDeallocated memory_is_not_deallocated; +#endif // _MSC_VER + critical_section_ = new CRITICAL_SECTION; + } + ::InitializeCriticalSection(critical_section_); + // Updates the critical_section_init_phase_ to 2 to signal + // initialization complete. + GTEST_CHECK_(::InterlockedCompareExchange(&critical_section_init_phase_, + 2L, 1L) == 1L); + break; + case 1: + // Somebody else is already initializing the mutex; spin until they + // are done. + while (::InterlockedCompareExchange(&critical_section_init_phase_, 2L, + 2L) != 2L) { + // Possibly yields the rest of the thread's time slice to other + // threads. + ::Sleep(0); + } + break; + + case 2: break; // The mutex is already initialized and ready for use. + + default: + GTEST_CHECK_(false) + << "Unexpected value of critical_section_init_phase_ " + << "while initializing a static mutex."; + } + } +} + +namespace { + +class ThreadWithParamSupport : public ThreadWithParamBase { + public: + static HANDLE CreateThread(Runnable *runnable, + Notification *thread_can_start) { + ThreadMainParam *param = new ThreadMainParam(runnable, thread_can_start); + DWORD thread_id; + HANDLE thread_handle = ::CreateThread( + nullptr, // Default security. + 0, // Default stack size. + &ThreadWithParamSupport::ThreadMain, + param, // Parameter to ThreadMainStatic + 0x0, // Default creation flags. + &thread_id); // Need a valid pointer for the call to work under Win98. 
+    GTEST_CHECK_(thread_handle != nullptr)
+        << "CreateThread failed with error " << ::GetLastError() << ".";
+    if (thread_handle == nullptr) {
+      delete param;
+    }
+    return thread_handle;
+  }
+
+ private:
+  struct ThreadMainParam {
+    ThreadMainParam(Runnable *runnable, Notification *thread_can_start)
+        : runnable_(runnable), thread_can_start_(thread_can_start) {}
+    std::unique_ptr<Runnable> runnable_;
+    // Does not own.
+    Notification *thread_can_start_;
+  };
+
+  static DWORD WINAPI ThreadMain(void *ptr) {
+    // Transfers ownership.
+    std::unique_ptr<ThreadMainParam> param(static_cast<ThreadMainParam *>(ptr));
+    if (param->thread_can_start_ != nullptr)
+      param->thread_can_start_->WaitForNotification();
+    param->runnable_->Run();
+    return 0;
+  }
+
+  // Prohibit instantiation.
+  ThreadWithParamSupport();
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParamSupport);
+};
+
+}  // namespace
+
+ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable,
+                                         Notification *thread_can_start)
+    : thread_(
+          ThreadWithParamSupport::CreateThread(runnable, thread_can_start)) {}
+
+ThreadWithParamBase::~ThreadWithParamBase() { Join(); }
+
+void ThreadWithParamBase::Join() {
+  GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0)
+      << "Failed to join the thread with error " << ::GetLastError() << ".";
+}
+
+// Maps a thread to a set of ThreadIdToThreadLocals that have values
+// instantiated on that thread and notifies them when the thread exits.  A
+// ThreadLocal instance is expected to persist until all threads it has
+// values on have terminated.
+class ThreadLocalRegistryImpl {
+ public:
+  // Registers thread_local_instance as having value on the current thread.
+  // Returns a value that can be used to identify the thread from other threads.
+  static ThreadLocalValueHolderBase *GetValueOnCurrentThread(
+      const ThreadLocalBase *thread_local_instance) {
+#ifdef _MSC_VER
+    MemoryIsNotDeallocated memory_is_not_deallocated;
+#endif  // _MSC_VER
+    DWORD current_thread = ::GetCurrentThreadId();
+    MutexLock lock(&mutex_);
+    ThreadIdToThreadLocals *const thread_to_thread_locals =
+        GetThreadLocalsMapLocked();
+    ThreadIdToThreadLocals::iterator thread_local_pos =
+        thread_to_thread_locals->find(current_thread);
+    if (thread_local_pos == thread_to_thread_locals->end()) {
+      thread_local_pos =
+          thread_to_thread_locals
+              ->insert(std::make_pair(current_thread, ThreadLocalValues()))
+              .first;
+      StartWatcherThreadFor(current_thread);
+    }
+    ThreadLocalValues &thread_local_values = thread_local_pos->second;
+    ThreadLocalValues::iterator value_pos =
+        thread_local_values.find(thread_local_instance);
+    if (value_pos == thread_local_values.end()) {
+      value_pos =
+          thread_local_values
+              .insert(std::make_pair(
+                  thread_local_instance,
+                  std::shared_ptr<ThreadLocalValueHolderBase>(
+                      thread_local_instance->NewValueForCurrentThread())))
+              .first;
+    }
+    return value_pos->second.get();
+  }
+
+  static void OnThreadLocalDestroyed(
+      const ThreadLocalBase *thread_local_instance) {
+    std::vector<std::shared_ptr<ThreadLocalValueHolderBase> > value_holders;
+    // Clean up the ThreadLocalValues data structure while holding the lock, but
+    // defer the destruction of the ThreadLocalValueHolderBases.
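+    // (Destroying a holder can run arbitrary user code via the value's
+    // destructor, which could conceivably re-enter this registry, so the
+    // destruction must happen outside the lock.)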
+    {
+      MutexLock lock(&mutex_);
+      ThreadIdToThreadLocals *const thread_to_thread_locals =
+          GetThreadLocalsMapLocked();
+      for (ThreadIdToThreadLocals::iterator it =
+               thread_to_thread_locals->begin();
+           it != thread_to_thread_locals->end(); ++it) {
+        ThreadLocalValues &thread_local_values = it->second;
+        ThreadLocalValues::iterator value_pos =
+            thread_local_values.find(thread_local_instance);
+        if (value_pos != thread_local_values.end()) {
+          value_holders.push_back(value_pos->second);
+          thread_local_values.erase(value_pos);
+          // This 'if' can only be successful at most once, so theoretically we
+          // could break out of the loop here, but we don't bother doing so.
+        }
+      }
+    }
+    // Outside the lock, let the destructor for 'value_holders' deallocate the
+    // ThreadLocalValueHolderBases.
+  }
+
+  static void OnThreadExit(DWORD thread_id) {
+    GTEST_CHECK_(thread_id != 0) << ::GetLastError();
+    std::vector<std::shared_ptr<ThreadLocalValueHolderBase> > value_holders;
+    // Clean up the ThreadIdToThreadLocals data structure while holding the
+    // lock, but defer the destruction of the ThreadLocalValueHolderBases.
+    {
+      MutexLock lock(&mutex_);
+      ThreadIdToThreadLocals *const thread_to_thread_locals =
+          GetThreadLocalsMapLocked();
+      ThreadIdToThreadLocals::iterator thread_local_pos =
+          thread_to_thread_locals->find(thread_id);
+      if (thread_local_pos != thread_to_thread_locals->end()) {
+        ThreadLocalValues &thread_local_values = thread_local_pos->second;
+        for (ThreadLocalValues::iterator value_pos =
+                 thread_local_values.begin();
+             value_pos != thread_local_values.end(); ++value_pos) {
+          value_holders.push_back(value_pos->second);
+        }
+        thread_to_thread_locals->erase(thread_local_pos);
+      }
+    }
+    // Outside the lock, let the destructor for 'value_holders' deallocate the
+    // ThreadLocalValueHolderBases.
+  }
+
+ private:
+  // In a particular thread, maps a ThreadLocal object to its value.
+  typedef std::map<const ThreadLocalBase *,
+                   std::shared_ptr<ThreadLocalValueHolderBase> >
+      ThreadLocalValues;
+  // Stores all ThreadIdToThreadLocals having values in a thread, indexed by
+  // thread's ID.
+  typedef std::map<DWORD, ThreadLocalValues> ThreadIdToThreadLocals;
+
+  // Holds the thread id and thread handle that we pass from
+  // StartWatcherThreadFor to WatcherThreadFunc.
+  typedef std::pair<DWORD, HANDLE> ThreadIdAndHandle;
+
+  static void StartWatcherThreadFor(DWORD thread_id) {
+    // The returned handle will be kept in thread_map and closed by
+    // watcher_thread in WatcherThreadFunc.
+    HANDLE thread =
+        ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, FALSE, thread_id);
+    GTEST_CHECK_(thread != nullptr);
+    // We need to pass a valid thread ID pointer into CreateThread for it
+    // to work correctly under Win98.
+    DWORD watcher_thread_id;
+    HANDLE watcher_thread = ::CreateThread(
+        nullptr,  // Default security.
+        0,        // Default stack size
+        &ThreadLocalRegistryImpl::WatcherThreadFunc,
+        reinterpret_cast<LPVOID>(new ThreadIdAndHandle(thread_id, thread)),
+        CREATE_SUSPENDED, &watcher_thread_id);
+    GTEST_CHECK_(watcher_thread != nullptr);
+    // Give the watcher thread the same priority as ours to avoid being
+    // blocked by it.
+    ::SetThreadPriority(watcher_thread,
+                        ::GetThreadPriority(::GetCurrentThread()));
+    ::ResumeThread(watcher_thread);
+    ::CloseHandle(watcher_thread);
+  }
+
+  // Monitors exit from a given thread and notifies those
+  // ThreadIdToThreadLocals about thread termination.
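+  // The watcher thread (see below) simply blocks on the watched thread's
+  // handle; the wait returns when that thread terminates, at which point
+  // OnThreadExit() runs and the handle is closed.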
+  static DWORD WINAPI WatcherThreadFunc(LPVOID param) {
+    const ThreadIdAndHandle *tah =
+        reinterpret_cast<const ThreadIdAndHandle *>(param);
+    GTEST_CHECK_(::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
+    OnThreadExit(tah->first);
+    ::CloseHandle(tah->second);
+    delete tah;
+    return 0;
+  }
+
+  // Returns map of thread local instances.
+  static ThreadIdToThreadLocals *GetThreadLocalsMapLocked() {
+    mutex_.AssertHeld();
+#ifdef _MSC_VER
+    MemoryIsNotDeallocated memory_is_not_deallocated;
+#endif  // _MSC_VER
+    static ThreadIdToThreadLocals *map = new ThreadIdToThreadLocals();
+    return map;
+  }
+
+  // Protects access to GetThreadLocalsMapLocked() and its return value.
+  static Mutex mutex_;
+  // Protects access to GetThreadMapLocked() and its return value.
+  static Mutex thread_map_mutex_;
+};
+
+Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex);
+Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex);
+
+ThreadLocalValueHolderBase *ThreadLocalRegistry::GetValueOnCurrentThread(
+    const ThreadLocalBase *thread_local_instance) {
+  return ThreadLocalRegistryImpl::GetValueOnCurrentThread(
+      thread_local_instance);
+}
+
+void ThreadLocalRegistry::OnThreadLocalDestroyed(
+    const ThreadLocalBase *thread_local_instance) {
+  ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance);
+}
+
+#endif  // GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
+
+#if GTEST_USES_POSIX_RE
+
+// Implements RE.  Currently only needed for death tests.
+
+RE::~RE() {
+  if (is_valid_) {
+    // regfree'ing an invalid regex might crash because the content
+    // of the regex is undefined.  Since the regexes are essentially
+    // the same, one cannot be valid (or invalid) without the other
+    // being so too.
+    regfree(&partial_regex_);
+    regfree(&full_regex_);
+  }
+  free(const_cast<char *>(pattern_));
+}
+
+// Returns true if and only if regular expression re matches the entire str.
+bool RE::FullMatch(const char *str, const RE &re) {
+  if (!re.is_valid_) return false;
+
+  regmatch_t match;
+  return regexec(&re.full_regex_, str, 1, &match, 0) == 0;
+}
+
+// Returns true if and only if regular expression re matches a substring of
+// str (including str itself).
+bool RE::PartialMatch(const char *str, const RE &re) {
+  if (!re.is_valid_) return false;
+
+  regmatch_t match;
+  return regexec(&re.partial_regex_, str, 1, &match, 0) == 0;
+}
+
+// Initializes an RE from its string representation.
+void RE::Init(const char *regex) {
+  pattern_ = posix::StrDup(regex);
+
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match.
+  const size_t full_regex_len = strlen(regex) + 10;
+  char *const full_pattern = new char[full_regex_len];
+
+  snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
+  is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
+  // We want to call regcomp(&partial_regex_, ...) even if the
+  // previous expression returns false.  Otherwise partial_regex_ may
+  // not be properly initialized and may cause trouble when it's
+  // freed.
+  //
+  // Some implementations of POSIX regex (e.g. on at least some
+  // versions of Cygwin) don't accept the empty string as a valid
+  // regex.  We change it to an equivalent form "()" to be safe.
+  if (is_valid_) {
+    const char *const partial_regex = (*regex == '\0') ? "()" : regex;
+    is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0;
+  }
+  EXPECT_TRUE(is_valid_)
+      << "Regular expression \"" << regex
+      << "\" is not a valid POSIX Extended regular expression.";
+
+  delete[] full_pattern;
+}
+
+#elif GTEST_USES_SIMPLE_RE
+
+// Returns true if and only if ch appears anywhere in str (excluding the
+// terminating '\0' character).
+bool IsInSet(char ch, const char *str) {
+  return ch != '\0' && strchr(str, ch) != nullptr;
+}
+
+// Returns true if and only if ch belongs to the given classification.
+// Unlike similar functions in <ctype.h>, these aren't affected by the
+// current locale.
+bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; }
+bool IsAsciiPunct(char ch) {
+  return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~");
+}
+bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
+bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
+bool IsAsciiWordChar(char ch) {
+  return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
+         ('0' <= ch && ch <= '9') || ch == '_';
+}
+
+// Returns true if and only if "\\c" is a supported escape sequence.
+bool IsValidEscape(char c) {
+  return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW"));
+}
+
+// Returns true if and only if the given atom (specified by escaped and
+// pattern) matches ch.  The result is undefined if the atom is invalid.
+bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
+  if (escaped) {  // "\\p" where p is pattern_char.
+    switch (pattern_char) {
+      case 'd': return IsAsciiDigit(ch);
+      case 'D': return !IsAsciiDigit(ch);
+      case 'f': return ch == '\f';
+      case 'n': return ch == '\n';
+      case 'r': return ch == '\r';
+      case 's': return IsAsciiWhiteSpace(ch);
+      case 'S': return !IsAsciiWhiteSpace(ch);
+      case 't': return ch == '\t';
+      case 'v': return ch == '\v';
+      case 'w': return IsAsciiWordChar(ch);
+      case 'W': return !IsAsciiWordChar(ch);
+    }
+    return IsAsciiPunct(pattern_char) && pattern_char == ch;
+  }
+
+  return (pattern_char == '.' && ch != '\n') || pattern_char == ch;
+}
+
+// Helper function used by ValidateRegex() to format error messages.
+static std::string FormatRegexSyntaxError(const char *regex, int index) {
+  return (Message() << "Syntax error at index " << index
+                    << " in simple regular expression \"" << regex << "\": ")
+      .GetString();
+}
+
+// Generates non-fatal failures and returns false if regex is invalid;
+// otherwise returns true.
+bool ValidateRegex(const char *regex) {
+  if (regex == nullptr) {
+    ADD_FAILURE() << "NULL is not a valid simple regular expression.";
+    return false;
+  }
+
+  bool is_valid = true;
+
+  // True if and only if ?, *, or + can follow the previous atom.
+  bool prev_repeatable = false;
+  for (int i = 0; regex[i]; i++) {
+    if (regex[i] == '\\') {  // An escape sequence
+      i++;
+      if (regex[i] == '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+                      << "'\\' cannot appear at the end.";
+        return false;
+      }
+
+      if (!IsValidEscape(regex[i])) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+                      << "invalid escape sequence \"\\" << regex[i] << "\".";
+        is_valid = false;
+      }
+      prev_repeatable = true;
+    } else {  // Not an escape sequence.
+      const char ch = regex[i];
+
+      if (ch == '^' && i > 0) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'^' can only appear at the beginning.";
+        is_valid = false;
+      } else if (ch == '$' && regex[i + 1] != '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'$' can only appear at the end.";
+        is_valid = false;
+      } else if (IsInSet(ch, "()[]{}|")) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
+                      << "' is unsupported.";
+        is_valid = false;
+      } else if (IsRepeat(ch) && !prev_repeatable) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
+                      << "' can only follow a repeatable token.";
+        is_valid = false;
+      }
+
+      prev_repeatable = !IsInSet(ch, "^$?*+");
+    }
+  }
+
+  return is_valid;
+}
+
+// Matches a repeated regex atom followed by a valid simple regular
+// expression.  The regex atom is defined as c if escaped is false,
+// or \c otherwise.  repeat is the repetition meta character (?, *,
+// or +).  The behavior is undefined if str contains too many
+// characters to be indexable by size_t, in which case the test will
+// probably time out anyway.  We are fine with this limitation as
+// std::string has it too.
+bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat,
+                                   const char *regex, const char *str) {
+  const size_t min_count = (repeat == '+') ? 1 : 0;
+  const size_t max_count = (repeat == '?') ? 1 : static_cast<size_t>(-1) - 1;
+  // We cannot call numeric_limits<size_t>::max() as it conflicts with the
+  // max() macro on Windows.
+
+  for (size_t i = 0; i <= max_count; ++i) {
+    // We know that the atom matches each of the first i characters in str.
+    if (i >= min_count && MatchRegexAtHead(regex, str + i)) {
+      // We have enough matches at the head, and the tail matches too.
+      // Since we only care about *whether* the pattern matches str
+      // (as opposed to *how* it matches), there is no need to find a
+      // greedy match.
+      return true;
+    }
+    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) return false;
+  }
+  return false;
+}
+
+// Returns true if and only if regex matches a prefix of str.  regex must
+// be a valid simple regular expression and not start with "^", or the
+// result is undefined.
+bool MatchRegexAtHead(const char *regex, const char *str) {
+  if (*regex == '\0')  // An empty regex matches a prefix of anything.
+    return true;
+
+  // "$" only matches the end of a string.  Note that regex being
+  // valid guarantees that there's nothing after "$" in it.
+  if (*regex == '$') return *str == '\0';
+
+  // Is the first thing in regex an escape sequence?
+  const bool escaped = *regex == '\\';
+  if (escaped) ++regex;
+  if (IsRepeat(regex[1])) {
+    // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
+    // here's an indirect recursion.  It terminates as the regex gets
+    // shorter in each recursion.
+    return MatchRepetitionAndRegexAtHead(escaped, regex[0], regex[1], regex + 2,
+                                         str);
+  } else {
+    // regex isn't empty, isn't "$", and doesn't start with a
+    // repetition.  We match the first atom of regex with the first
+    // character of str and recurse.
+    return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
+           MatchRegexAtHead(regex + 1, str + 1);
+  }
+}
+
+// Returns true if and only if regex matches any substring of str.  regex must
+// be a valid simple regular expression, or the result is undefined.
+//
+// The algorithm is recursive, but the recursion depth doesn't exceed
+// the regex length, so we won't need to worry about running out of
+// stack space normally.
+// In rare cases the time complexity can be exponential with respect to
+// the regex length + the string length, but usually it's much faster
+// (often close to linear).
+bool MatchRegexAnywhere(const char *regex, const char *str) {
+  if (regex == nullptr || str == nullptr) return false;
+
+  if (*regex == '^') return MatchRegexAtHead(regex + 1, str);
+
+  // A successful match can be anywhere in str.
+  do {
+    if (MatchRegexAtHead(regex, str)) return true;
+  } while (*str++ != '\0');
+  return false;
+}
+
+// Implements the RE class.
+
+RE::~RE() {
+  free(const_cast<char *>(pattern_));
+  free(const_cast<char *>(full_pattern_));
+}
+
+// Returns true if and only if regular expression re matches the entire str.
+bool RE::FullMatch(const char *str, const RE &re) {
+  return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str);
+}
+
+// Returns true if and only if regular expression re matches a substring of
+// str (including str itself).
+bool RE::PartialMatch(const char *str, const RE &re) {
+  return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str);
+}
+
+// Initializes an RE from its string representation.
+void RE::Init(const char *regex) {
+  pattern_ = full_pattern_ = nullptr;
+  if (regex != nullptr) {
+    pattern_ = posix::StrDup(regex);
+  }
+
+  is_valid_ = ValidateRegex(regex);
+  if (!is_valid_) {
+    // No need to calculate the full pattern when the regex is invalid.
+    return;
+  }
+
+  const size_t len = strlen(regex);
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match: we need space to prepend a '^', append a '$', and
+  // terminate the string with '\0'.
+  char *buffer = static_cast<char *>(malloc(len + 3));
+  full_pattern_ = buffer;
+
+  if (*regex != '^')
+    *buffer++ = '^';  // Makes sure full_pattern_ starts with '^'.
+
+  // We don't use snprintf or strncpy, as they trigger a warning when
+  // compiled with VC++ 8.0.
+  memcpy(buffer, regex, len);
+  buffer += len;
+
+  if (len == 0 || regex[len - 1] != '$')
+    *buffer++ = '$';  // Makes sure full_pattern_ ends with '$'.
+
+  *buffer = '\0';
+}
+
+#endif  // GTEST_USES_POSIX_RE
+
+const char kUnknownFile[] = "unknown file";
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char *file, int line) {
+  const std::string file_name(file == nullptr ? kUnknownFile : file);
+
+  if (line < 0) {
+    return file_name + ":";
+  }
+#ifdef _MSC_VER
+  return file_name + "(" + StreamableToString(line) + "):";
+#else
+  return file_name + ":" + StreamableToString(line) + ":";
+#endif  // _MSC_VER
+}
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+// Note that FormatCompilerIndependentFileLocation() does NOT append colon
+// to the file location it produces, unlike FormatFileLocation().
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char *file,
+                                                               int line) {
+  const std::string file_name(file == nullptr ? kUnknownFile : file);
+
+  if (line < 0)
+    return file_name;
+  else
+    return file_name + ":" + StreamableToString(line);
+}
+
+GTestLog::GTestLog(GTestLogSeverity severity, const char *file, int line)
+    : severity_(severity) {
+  const char *const marker =
+      severity == GTEST_INFO
+          ? "[  INFO ]"
+          : severity == GTEST_WARNING
+                ? "[WARNING]"
+                : severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]";
+  GetStream() << ::std::endl
+              << marker << " " << FormatFileLocation(file, line).c_str()
+              << ": ";
+}
+
+// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+GTestLog::~GTestLog() {
+  GetStream() << ::std::endl;
+  if (severity_ == GTEST_FATAL) {
+    fflush(stderr);
+    posix::Abort();
+  }
+}
+
+// Disable Microsoft deprecation warnings for POSIX functions called from
+// this class (creat, dup, dup2, and close)
+GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Object that captures an output stream (stdout/stderr).
+class CapturedStream {
+ public:
+  // The ctor redirects the stream to a temporary file.
+  explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
+#if GTEST_OS_WINDOWS
+    char temp_dir_path[MAX_PATH + 1] = { '\0' };   // NOLINT
+    char temp_file_path[MAX_PATH + 1] = { '\0' };  // NOLINT
+
+    ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
+    const UINT success = ::GetTempFileNameA(temp_dir_path, "gtest_redir",
+                                            0,  // Generate unique file name.
+                                            temp_file_path);
+    GTEST_CHECK_(success != 0)
+        << "Unable to create a temporary file in " << temp_dir_path;
+    const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
+    GTEST_CHECK_(captured_fd != -1)
+        << "Unable to open temporary file " << temp_file_path;
+    filename_ = temp_file_path;
+#else
+    // There's no guarantee that a test has write access to the current
+    // directory, so we create the temporary file in the /tmp directory
+    // instead.  We use /tmp on most systems, and /sdcard on Android.
+    // That's because Android doesn't have /tmp.
+#if GTEST_OS_LINUX_ANDROID
+    // Note: Android applications are expected to call the framework's
+    // Context.getExternalStorageDirectory() method through JNI to get
+    // the location of the world-writable SD Card directory.  However,
+    // this requires a Context handle, which cannot be retrieved
+    // globally from native code.  Doing so also precludes running the
+    // code as part of a regular standalone executable, which doesn't
+    // run in a Dalvik process (e.g. when running it through 'adb shell').
+    //
+    // The location /data/local/tmp is directly accessible from native code.
+    // '/sdcard' and other variants cannot be relied on, as they are not
+    // guaranteed to be mounted, or may have a delay in mounting.
+    char name_template[] = "/data/local/tmp/gtest_captured_stream.XXXXXX";
+#else
+    char name_template[] = "/tmp/captured_stream.XXXXXX";
+#endif  // GTEST_OS_LINUX_ANDROID
+    const int captured_fd = mkstemp(name_template);
+    if (captured_fd == -1) {
+      GTEST_LOG_(WARNING)
+          << "Failed to create tmp file " << name_template
+          << " for test; does the test have access to the /tmp directory?";
+    }
+    filename_ = name_template;
+#endif  // GTEST_OS_WINDOWS
+    fflush(nullptr);
+    dup2(captured_fd, fd_);
+    close(captured_fd);
+  }
+
+  ~CapturedStream() { remove(filename_.c_str()); }
+
+  std::string GetCapturedString() {
+    if (uncaptured_fd_ != -1) {
+      // Restores the original stream.
+      fflush(nullptr);
+      dup2(uncaptured_fd_, fd_);
+      close(uncaptured_fd_);
+      uncaptured_fd_ = -1;
+    }
+
+    FILE *const file = posix::FOpen(filename_.c_str(), "r");
+    if (file == nullptr) {
+      GTEST_LOG_(FATAL) << "Failed to open tmp file " << filename_
+                        << " for capturing stream.";
+    }
+    const std::string content = ReadEntireFile(file);
+    posix::FClose(file);
+    return content;
+  }
+
+ private:
+  const int fd_;  // A stream to capture.
+  int uncaptured_fd_;
+  // Name of the temporary file holding the captured stream.
+  ::std::string filename_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
+};
+
+GTEST_DISABLE_MSC_DEPRECATED_POP_()
+
+static CapturedStream *g_captured_stderr = nullptr;
+static CapturedStream *g_captured_stdout = nullptr;
+
+// Starts capturing an output stream (stdout/stderr).
+static void CaptureStream(int fd, const char *stream_name,
+                          CapturedStream **stream) {
+  if (*stream != nullptr) {
+    GTEST_LOG_(FATAL) << "Only one " << stream_name
+                      << " capturer can exist at a time.";
+  }
+  *stream = new CapturedStream(fd);
+}
+
+// Stops capturing the output stream and returns the captured string.
+static std::string GetCapturedStream(CapturedStream **captured_stream) {
+  const std::string content = (*captured_stream)->GetCapturedString();
+
+  delete *captured_stream;
+  *captured_stream = nullptr;
+
+  return content;
+}
+
+// Starts capturing stdout.
+void CaptureStdout() {
+  CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
+}
+
+// Starts capturing stderr.
+void CaptureStderr() {
+  CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr);
+}
+
+// Stops capturing stdout and returns the captured string.
+std::string GetCapturedStdout() {
+  return GetCapturedStream(&g_captured_stdout);
+}
+
+// Stops capturing stderr and returns the captured string.
+std::string GetCapturedStderr() {
+  return GetCapturedStream(&g_captured_stderr);
+}
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+size_t GetFileSize(FILE *file) {
+  fseek(file, 0, SEEK_END);
+  return static_cast<size_t>(ftell(file));
+}
+
+std::string ReadEntireFile(FILE *file) {
+  const size_t file_size = GetFileSize(file);
+  char *const buffer = new char[file_size];
+
+  size_t bytes_last_read = 0;  // # of bytes read in the last fread()
+  size_t bytes_read = 0;       // # of bytes read so far
+
+  fseek(file, 0, SEEK_SET);
+
+  // Keeps reading the file until we cannot read further or the
+  // pre-determined file size is reached.
+  do {
+    bytes_last_read =
+        fread(buffer + bytes_read, 1, file_size - bytes_read, file);
+    bytes_read += bytes_last_read;
+  } while (bytes_last_read > 0 && bytes_read < file_size);
+
+  const std::string content(buffer, bytes_read);
+  delete[] buffer;
+
+  return content;
+}
+
+#if GTEST_HAS_DEATH_TEST
+static const std::vector<std::string> *g_injected_test_argvs =
+    nullptr;  // Owned.
+
+std::vector<std::string> GetInjectableArgvs() {
+  if (g_injected_test_argvs != nullptr) {
+    return *g_injected_test_argvs;
+  }
+  return GetArgvs();
+}
+
+void SetInjectableArgvs(const std::vector<std::string> *new_argvs) {
+  if (g_injected_test_argvs != new_argvs) delete g_injected_test_argvs;
+  g_injected_test_argvs = new_argvs;
+}
+
+void SetInjectableArgvs(const std::vector<std::string> &new_argvs) {
+  SetInjectableArgvs(
+      new std::vector<std::string>(new_argvs.begin(), new_argvs.end()));
+}
+
+void ClearInjectableArgvs() {
+  delete g_injected_test_argvs;
+  g_injected_test_argvs = nullptr;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+#if GTEST_OS_WINDOWS_MOBILE
+namespace posix {
+void Abort() {
+  DebugBreak();
+  TerminateProcess(GetCurrentProcess(), 1);
+}
+}  // namespace posix
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+// Returns the name of the environment variable corresponding to the
+// given flag.  For example, FlagToEnvVar("foo") will return
+// "GTEST_FOO" in the open-source version.
+static std::string FlagToEnvVar(const char *flag) {
+  const std::string full_flag =
+      (Message() << GTEST_FLAG_PREFIX_ << flag).GetString();
+
+  Message env_var;
+  for (size_t i = 0; i != full_flag.length(); i++) {
+    env_var << ToUpper(full_flag.c_str()[i]);
+  }
+
+  return env_var.GetString();
+}
+
+// Parses 'str' for a 32-bit signed integer.  If successful, writes
+// the result to *value and returns true; otherwise leaves *value
+// unchanged and returns false.
+bool ParseInt32(const Message &src_text, const char *str, int32_t *value) {
+  // Parses the environment variable as a decimal integer.
+  char *end = nullptr;
+  const long long_value = strtol(str, &end, 10);  // NOLINT
+
+  // Has strtol() consumed all characters in the string?
+  if (*end != '\0') {
+    // No - an invalid character was encountered.
+    Message msg;
+    msg << "WARNING: " << src_text
+        << " is expected to be a 32-bit integer, but actually"
+        << " has value \"" << str << "\".\n";
+    printf("%s", msg.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  // Is the parsed value in the range of an int32_t?
+  const auto result = static_cast<int32_t>(long_value);
+  if (long_value == LONG_MAX || long_value == LONG_MIN ||
+      // The parsed value overflows as a long.  (strtol() returns
+      // LONG_MAX or LONG_MIN when the input overflows.)
+      result != long_value
+      // The parsed value overflows as an int32_t.
+  ) {
+    Message msg;
+    msg << "WARNING: " << src_text
+        << " is expected to be a 32-bit integer, but actually"
+        << " has value " << str << ", which overflows.\n";
+    printf("%s", msg.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  *value = result;
+  return true;
+}
+
+// Reads and returns the Boolean environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+//
+// The value is considered true if and only if it's not "0".
+bool BoolFromGTestEnv(const char *flag, bool default_value) {
+#if defined(GTEST_GET_BOOL_FROM_ENV_)
+  return GTEST_GET_BOOL_FROM_ENV_(flag, default_value);
+#else
+  const std::string env_var = FlagToEnvVar(flag);
+  const char *const string_value = posix::GetEnv(env_var.c_str());
+  return string_value == nullptr ? default_value
+                                 : strcmp(string_value, "0") != 0;
+#endif  // defined(GTEST_GET_BOOL_FROM_ENV_)
+}
+
+// Reads and returns a 32-bit integer stored in the environment
+// variable corresponding to the given flag; if it isn't set or
+// doesn't represent a valid 32-bit integer, returns default_value.
+int32_t Int32FromGTestEnv(const char *flag, int32_t default_value) {
+#if defined(GTEST_GET_INT32_FROM_ENV_)
+  return GTEST_GET_INT32_FROM_ENV_(flag, default_value);
+#else
+  const std::string env_var = FlagToEnvVar(flag);
+  const char *const string_value = posix::GetEnv(env_var.c_str());
+  if (string_value == nullptr) {
+    // The environment variable is not set.
+    return default_value;
+  }
+
+  int32_t result = default_value;
+  if (!ParseInt32(Message() << "Environment variable " << env_var, string_value,
+                  &result)) {
+    printf("The default value %s is used.\n",
+           (Message() << default_value).GetString().c_str());
+    fflush(stdout);
+    return default_value;
+  }
+
+  return result;
+#endif  // defined(GTEST_GET_INT32_FROM_ENV_)
+}
+
+// As a special case for the 'output' flag, if GTEST_OUTPUT is not
+// set, we look for XML_OUTPUT_FILE, which is set by the Bazel build
+// system.  The value of XML_OUTPUT_FILE is a filename without the
+// "xml:" prefix of GTEST_OUTPUT.
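+// For example, running a test binary with XML_OUTPUT_FILE=/tmp/out.xml set
+// behaves like passing --gtest_output=xml:/tmp/out.xml.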
+// Note that this is meant to be called at the call site so it does
+// not check that the flag is 'output'.
+// In essence this checks an env variable called XML_OUTPUT_FILE
+// and if it is set we prepend "xml:" to its value; if it is not set
+// we return "".
+std::string OutputFlagAlsoCheckEnvVar() {
+  std::string default_value_for_output_flag = "";
+  const char *xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE");
+  if (nullptr != xml_output_file_env) {
+    default_value_for_output_flag = std::string("xml:") + xml_output_file_env;
+  }
+  return default_value_for_output_flag;
+}
+
+// Reads and returns the string environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+const char *StringFromGTestEnv(const char *flag, const char *default_value) {
+#if defined(GTEST_GET_STRING_FROM_ENV_)
+  return GTEST_GET_STRING_FROM_ENV_(flag, default_value);
+#else
+  const std::string env_var = FlagToEnvVar(flag);
+  const char *const value = posix::GetEnv(env_var.c_str());
+  return value == nullptr ? default_value : value;
+#endif  // defined(GTEST_GET_STRING_FROM_ENV_)
+}
+
+}  // namespace internal
+}  // namespace testing
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-printers.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-printers.cc
new file mode 100644
index 000000000..8399386a9
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-printers.cc
@@ -0,0 +1,400 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Test - The Google C++ Testing and Mocking Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// It uses the << operator when possible, and prints the bytes in the
+// object otherwise.
+// A user can override its behavior for a class type Foo by defining
+// either operator<<(::std::ostream&, const Foo&) or
+// void PrintTo(const Foo&, ::std::ostream*) in the namespace that
+// defines Foo.
+
+#include "gtest/gtest-printers.h"
+#include <stdio.h>
+#include <cctype>
+#include <cwchar>
+#include <ostream>  // NOLINT
+#include <string>
+#include "gtest/internal/gtest-port.h"
+#include "src/gtest-internal-inl.h"
+
+namespace testing {
+
+namespace {
+
+using ::std::ostream;
+
+// Prints a segment of bytes in the given object.
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+void PrintByteSegmentInObjectTo(const unsigned char *obj_bytes, size_t start,
+                                size_t count, ostream *os) {
+  char text[5] = "";
+  for (size_t i = 0; i != count; i++) {
+    const size_t j = start + i;
+    if (i != 0) {
+      // Organizes the bytes into groups of 2 for easy parsing by
+      // humans.
+      if ((j % 2) == 0)
+        *os << ' ';
+      else
+        *os << '-';
+    }
+    GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]);
+    *os << text;
+  }
+}
+
+// Prints the bytes in the given value to the given ostream.
+void PrintBytesInObjectToImpl(const unsigned char *obj_bytes, size_t count,
+                              ostream *os) {
+  // Tells the user how big the object is.
+  *os << count << "-byte object <";
+
+  const size_t kThreshold = 132;
+  const size_t kChunkSize = 64;
+  // If the object size is bigger than kThreshold, we'll have to omit
+  // some details by printing only the first and the last kChunkSize
+  // bytes.
+  if (count < kThreshold) {
+    PrintByteSegmentInObjectTo(obj_bytes, 0, count, os);
+  } else {
+    PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
+    *os << " ... ";
+    // Rounds up to 2-byte boundary.
+    const size_t resume_pos = (count - kChunkSize + 1) / 2 * 2;
+    PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os);
+  }
+  *os << ">";
+}
+
+}  // namespace
+
+namespace internal2 {
+
+// Delegates to PrintBytesInObjectToImpl() to print the bytes in the
+// given object.  The delegation simplifies the implementation, which
+// uses the << operator and thus is easier done outside of the
+// ::testing::internal namespace, which contains a << operator that
+// sometimes conflicts with the one in STL.
+void PrintBytesInObjectTo(const unsigned char *obj_bytes, size_t count,
+                          ostream *os) {
+  PrintBytesInObjectToImpl(obj_bytes, count, os);
+}
+
+}  // namespace internal2
+
+namespace internal {
+
+// Depending on the value of a char (or wchar_t), we print it in one
+// of three formats:
+//   - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
+//   - as a hexadecimal escape sequence (e.g. '\x7F'), or
+//   - as a special escape sequence (e.g. '\r', '\n').
+enum CharFormat { kAsIs, kHexEscape, kSpecialEscape };
+
+// Returns true if c is a printable ASCII character.  We test the
+// value of c directly instead of calling isprint(), which is buggy on
+// Windows Mobile.
+inline bool IsPrintableAscii(wchar_t c) { return 0x20 <= c && c <= 0x7E; }
+
+// Prints a wide or narrow char c as a character literal without the
+// quotes, escaping it when necessary; returns how c was formatted.
+// The template argument UnsignedChar is the unsigned version of Char,
+// which is the type of c.
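+// For example (illustrative): printing '\n' emits "\n" and returns
+// kSpecialEscape, while printing '\x7F' emits "\x7F" and returns kHexEscape.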
+template +static CharFormat PrintAsCharLiteralTo(Char c, ostream *os) { + wchar_t w_c = static_cast(c); + switch (w_c) { + case L'\0': *os << "\\0"; break; + case L'\'': *os << "\\'"; break; + case L'\\': *os << "\\\\"; break; + case L'\a': *os << "\\a"; break; + case L'\b': *os << "\\b"; break; + case L'\f': *os << "\\f"; break; + case L'\n': *os << "\\n"; break; + case L'\r': *os << "\\r"; break; + case L'\t': *os << "\\t"; break; + case L'\v': *os << "\\v"; break; + default: + if (IsPrintableAscii(w_c)) { + *os << static_cast(c); + return kAsIs; + } else { + ostream::fmtflags flags = os->flags(); + *os << "\\x" << std::hex << std::uppercase + << static_cast(static_cast(c)); + os->flags(flags); + return kHexEscape; + } + } + return kSpecialEscape; +} + +// Prints a wchar_t c as if it's part of a string literal, escaping it when +// necessary; returns how c was formatted. +static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream *os) { + switch (c) { + case L'\'': *os << "'"; return kAsIs; + case L'"': *os << "\\\""; return kSpecialEscape; + default: return PrintAsCharLiteralTo(c, os); + } +} + +// Prints a char c as if it's part of a string literal, escaping it when +// necessary; returns how c was formatted. +static CharFormat PrintAsStringLiteralTo(char c, ostream *os) { + return PrintAsStringLiteralTo( + static_cast(static_cast(c)), os); +} + +// Prints a wide or narrow character c and its code. '\0' is printed +// as "'\\0'", other unprintable characters are also properly escaped +// using the standard C++ escape sequence. The template argument +// UnsignedChar is the unsigned version of Char, which is the type of c. +template +void PrintCharAndCodeTo(Char c, ostream *os) { + // First, print c as a literal in the most readable form we can find. + *os << ((sizeof(c) > 1) ? "L'" : "'"); + const CharFormat format = PrintAsCharLiteralTo(c, os); + *os << "'"; + + // To aid user debugging, we also print c's code in decimal, unless + // it's 0 (in which case c was printed as '\\0', making the code + // obvious). + if (c == 0) return; + *os << " (" << static_cast(c); + + // For more convenience, we print c's code again in hexadecimal, + // unless c was already printed in the form '\x##' or the code is in + // [1, 9]. + if (format == kHexEscape || (1 <= c && c <= 9)) { + // Do nothing. + } else { + *os << ", 0x" << String::FormatHexInt(static_cast(c)); + } + *os << ")"; +} + +void PrintTo(unsigned char c, ::std::ostream *os) { + PrintCharAndCodeTo(c, os); +} +void PrintTo(signed char c, ::std::ostream *os) { + PrintCharAndCodeTo(c, os); +} + +// Prints a wchar_t as a symbol if it is printable or as its internal +// code otherwise and also as its code. L'\0' is printed as "L'\\0'". +void PrintTo(wchar_t wc, ostream *os) { PrintCharAndCodeTo(wc, os); } + +// Prints the given array of characters to the ostream. CharType must be either +// char or wchar_t. +// The array starts at begin, the length is len, it may include '\0' characters +// and may not be NUL-terminated. +template +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat + PrintCharsAsStringTo(const CharType *begin, size_t len, ostream *os) { + const char *const kQuoteBegin = sizeof(CharType) == 1 ? 
"\"" : "L\""; + *os << kQuoteBegin; + bool is_previous_hex = false; + CharFormat print_format = kAsIs; + for (size_t index = 0; index < len; ++index) { + const CharType cur = begin[index]; + if (is_previous_hex && IsXDigit(cur)) { + // Previous character is of '\x..' form and this character can be + // interpreted as another hexadecimal digit in its number. Break string to + // disambiguate. + *os << "\" " << kQuoteBegin; + } + is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape; + // Remember if any characters required hex escaping. + if (is_previous_hex) { + print_format = kHexEscape; + } + } + *os << "\""; + return print_format; +} + +// Prints a (const) char/wchar_t array of 'len' elements, starting at address +// 'begin'. CharType must be either char or wchar_t. +template +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static void + UniversalPrintCharArray(const CharType *begin, size_t len, + ostream *os) { + // The code + // const char kFoo[] = "foo"; + // generates an array of 4, not 3, elements, with the last one being '\0'. + // + // Therefore when printing a char array, we don't print the last element if + // it's '\0', such that the output matches the string literal as it's + // written in the source code. + if (len > 0 && begin[len - 1] == '\0') { + PrintCharsAsStringTo(begin, len - 1, os); + return; + } + + // If, however, the last element in the array is not '\0', e.g. + // const char kFoo[] = { 'f', 'o', 'o' }; + // we must print the entire array. We also print a message to indicate + // that the array is not NUL-terminated. + PrintCharsAsStringTo(begin, len, os); + *os << " (no terminating NUL)"; +} + +// Prints a (const) char array of 'len' elements, starting at address 'begin'. +void UniversalPrintArray(const char *begin, size_t len, ostream *os) { + UniversalPrintCharArray(begin, len, os); +} + +// Prints a (const) wchar_t array of 'len' elements, starting at address +// 'begin'. +void UniversalPrintArray(const wchar_t *begin, size_t len, ostream *os) { + UniversalPrintCharArray(begin, len, os); +} + +// Prints the given C string to the ostream. +void PrintTo(const char *s, ostream *os) { + if (s == nullptr) { + *os << "NULL"; + } else { + *os << ImplicitCast_(s) << " pointing to "; + PrintCharsAsStringTo(s, strlen(s), os); + } +} + +// MSVC compiler can be configured to define whar_t as a typedef +// of unsigned short. Defining an overload for const wchar_t* in that case +// would cause pointers to unsigned shorts be printed as wide strings, +// possibly accessing more memory than intended and causing invalid +// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when +// wchar_t is implemented as a native type. +#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) +// Prints the given wide C string to the ostream. 
+void PrintTo(const wchar_t *s, ostream *os) { + if (s == nullptr) { + *os << "NULL"; + } else { + *os << ImplicitCast_(s) << " pointing to "; + PrintCharsAsStringTo(s, wcslen(s), os); + } +} +#endif // wchar_t is native + +namespace { + +bool ContainsUnprintableControlCodes(const char *str, size_t length) { + const unsigned char *s = reinterpret_cast(str); + + for (size_t i = 0; i < length; i++) { + unsigned char ch = *s++; + if (std::iscntrl(ch)) { + switch (ch) { + case '\t': + case '\n': + case '\r': break; + default: return true; + } + } + } + return false; +} + +bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t <= 0xbf; } + +bool IsValidUTF8(const char *str, size_t length) { + const unsigned char *s = reinterpret_cast(str); + + for (size_t i = 0; i < length;) { + unsigned char lead = s[i++]; + + if (lead <= 0x7f) { + continue; // single-byte character (ASCII) 0..7F + } + if (lead < 0xc2) { + return false; // trail byte or non-shortest form + } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) { + ++i; // 2-byte character + } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length && + IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) && + // check for non-shortest form and surrogate + (lead != 0xe0 || s[i] >= 0xa0) && + (lead != 0xed || s[i] < 0xa0)) { + i += 2; // 3-byte character + } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length && + IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) && + IsUTF8TrailByte(s[i + 2]) && + // check for non-shortest form + (lead != 0xf0 || s[i] >= 0x90) && + (lead != 0xf4 || s[i] < 0x90)) { + i += 3; // 4-byte character + } else { + return false; + } + } + return true; +} + +void ConditionalPrintAsText(const char *str, size_t length, ostream *os) { + if (!ContainsUnprintableControlCodes(str, length) && + IsValidUTF8(str, length)) { + *os << "\n As Text: \"" << str << "\""; + } +} + +} // anonymous namespace + +void PrintStringTo(const ::std::string &s, ostream *os) { + if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) { + if (GTEST_FLAG(print_utf8)) { + ConditionalPrintAsText(s.data(), s.size(), os); + } + } +} + +#if GTEST_HAS_STD_WSTRING +void PrintWideStringTo(const ::std::wstring &s, ostream *os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif // GTEST_HAS_STD_WSTRING + +} // namespace internal + +} // namespace testing diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-test-part.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-test-part.cc new file mode 100644 index 000000000..44b0e2b3f --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-test-part.cc @@ -0,0 +1,107 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
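IsValidUTF8() above is driven by the standard lead-byte ranges (C2..DF start 2-byte sequences, E0..EF 3-byte, F0..F4 4-byte) plus extra checks rejecting overlong forms and UTF-16 surrogates. A reduced sketch of the length bookkeeping only (the non-shortest-form and surrogate checks from the function above are intentionally omitted here):

    #include <cassert>
    #include <cstddef>

    // Trail bytes are 0x80..0xBF.
    static bool IsTrail(unsigned char b) { return (b & 0xC0) == 0x80; }

    static bool LooksLikeUtf8(const unsigned char *s, size_t n) {
      for (size_t i = 0; i < n;) {
        const unsigned char lead = s[i++];
        size_t trail;
        if (lead <= 0x7F) trail = 0;                        // ASCII
        else if (lead >= 0xC2 && lead <= 0xDF) trail = 1;   // 2-byte sequence
        else if (lead >= 0xE0 && lead <= 0xEF) trail = 2;   // 3-byte sequence
        else if (lead >= 0xF0 && lead <= 0xF4) trail = 3;   // 4-byte sequence
        else return false;                                  // 0x80..0xC1, 0xF5..0xFF
        if (i + trail > n) return false;                    // truncated sequence
        for (size_t k = 0; k < trail; ++k)
          if (!IsTrail(s[i + k])) return false;
        i += trail;
      }
      return true;
    }

    int main() {
      const unsigned char ok[] = { 'h', 0xC3, 0xA9 };  // "he" with e-acute
      const unsigned char bad[] = { 0xC0, 0xAF };      // invalid lead 0xC0
      assert(LooksLikeUtf8(ok, sizeof(ok)));
      assert(!LooksLikeUtf8(bad, sizeof(bad)));
    }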
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// The Google C++ Testing and Mocking Framework (Google Test) + +#include "gtest/gtest-test-part.h" + +#include "gtest/internal/gtest-port.h" +#include "src/gtest-internal-inl.h" + +namespace testing { + +using internal::GetUnitTestImpl; + +// Gets the summary of the failure message by omitting the stack trace +// in it. +std::string TestPartResult::ExtractSummary(const char *message) { + const char *const stack_trace = strstr(message, internal::kStackTraceMarker); + return stack_trace == nullptr ? message : std::string(message, stack_trace); +} + +// Prints a TestPartResult object. +std::ostream &operator<<(std::ostream &os, const TestPartResult &result) { + return os << internal::FormatFileLocation(result.file_name(), + result.line_number()) + << " " + << (result.type() == TestPartResult::kSuccess + ? "Success" + : result.type() == TestPartResult::kSkip + ? "Skipped" + : result.type() == TestPartResult::kFatalFailure + ? "Fatal failure" + : "Non-fatal failure") + << ":\n" + << result.message() << std::endl; +} + +// Appends a TestPartResult to the array. +void TestPartResultArray::Append(const TestPartResult &result) { + array_.push_back(result); +} + +// Returns the TestPartResult at the given index (0-based). +const TestPartResult &TestPartResultArray::GetTestPartResult(int index) const { + if (index < 0 || index >= size()) { + printf("\nInvalid index (%d) into TestPartResultArray.\n", index); + internal::posix::Abort(); + } + + return array_[static_cast(index)]; +} + +// Returns the number of TestPartResult objects in the array. +int TestPartResultArray::size() const { + return static_cast(array_.size()); +} + +namespace internal { + +HasNewFatalFailureHelper::HasNewFatalFailureHelper() + : has_new_fatal_failure_(false), + original_reporter_( + GetUnitTestImpl()->GetTestPartResultReporterForCurrentThread()) { + GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this); +} + +HasNewFatalFailureHelper::~HasNewFatalFailureHelper() { + GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread( + original_reporter_); +} + +void HasNewFatalFailureHelper::ReportTestPartResult( + const TestPartResult &result) { + if (result.fatally_failed()) has_new_fatal_failure_ = true; + original_reporter_->ReportTestPartResult(result); +} + +} // namespace internal + +} // namespace testing diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-typed-test.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-typed-test.cc new file mode 100644 index 000000000..04effad17 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-typed-test.cc @@ -0,0 +1,117 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "gtest/gtest-typed-test.h" + +#include "gtest/gtest.h" + +namespace testing { +namespace internal { + +#if GTEST_HAS_TYPED_TEST_P + +// Skips to the first non-space char in str. Returns an empty string if str +// contains only whitespace characters. +static const char *SkipSpaces(const char *str) { + while (IsSpace(*str)) str++; + return str; +} + +static std::vector SplitIntoTestNames(const char *src) { + std::vector name_vec; + src = SkipSpaces(src); + for (; src != nullptr; src = SkipComma(src)) { + name_vec.push_back(StripTrailingSpaces(GetPrefixUntilComma(src))); + } + return name_vec; +} + +// Verifies that registered_tests match the test names in +// registered_tests_; returns registered_tests if successful, or +// aborts the program otherwise. 
+const char *TypedTestSuitePState::VerifyRegisteredTestNames( + const char *test_suite_name, const char *file, int line, + const char *registered_tests) { + RegisterTypeParameterizedTestSuite(test_suite_name, CodeLocation(file, line)); + + typedef RegisteredTestsMap::const_iterator RegisteredTestIter; + registered_ = true; + + std::vector name_vec = SplitIntoTestNames(registered_tests); + + Message errors; + + std::set tests; + for (std::vector::const_iterator name_it = name_vec.begin(); + name_it != name_vec.end(); ++name_it) { + const std::string &name = *name_it; + if (tests.count(name) != 0) { + errors << "Test " << name << " is listed more than once.\n"; + continue; + } + + bool found = false; + for (RegisteredTestIter it = registered_tests_.begin(); + it != registered_tests_.end(); ++it) { + if (name == it->first) { + found = true; + break; + } + } + + if (found) { + tests.insert(name); + } else { + errors << "No test named " << name + << " can be found in this test suite.\n"; + } + } + + for (RegisteredTestIter it = registered_tests_.begin(); + it != registered_tests_.end(); ++it) { + if (tests.count(it->first) == 0) { + errors << "You forgot to list test " << it->first << ".\n"; + } + } + + const std::string &errors_str = errors.GetString(); + if (errors_str != "") { + fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), + errors_str.c_str()); + fflush(stderr); + posix::Abort(); + } + + return registered_tests; +} + +#endif // GTEST_HAS_TYPED_TEST_P + +} // namespace internal +} // namespace testing diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest.cc new file mode 100644 index 000000000..5b4037fec --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest.cc @@ -0,0 +1,6240 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
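VerifyRegisteredTestNames() above reduces to set arithmetic over two name collections: duplicates among the listed names, listed names that were never defined, and defined tests that were never listed. The core checks in isolation (the test names here are made up for the demo):

    #include <iostream>
    #include <set>
    #include <string>
    #include <vector>

    int main() {
      const std::set<std::string> defined = { "Works", "HandlesEmpty", "Overflows" };
      const std::vector<std::string> listed = { "Works", "Works", "Nope" };

      std::set<std::string> seen;
      for (const std::string &name : listed) {
        if (!seen.insert(name).second)
          std::cout << "Test " << name << " is listed more than once.\n";
        else if (defined.count(name) == 0)
          std::cout << "No test named " << name << " can be found.\n";
      }
      for (const std::string &name : defined)
        if (seen.count(name) == 0)
          std::cout << "You forgot to list test " << name << ".\n";
    }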
+ +// +// The Google C++ Testing and Mocking Framework (Google Test) + +#include "gtest/gtest.h" +#include "gtest/internal/custom/gtest.h" +#include "gtest/gtest-spi.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include + +#if GTEST_OS_LINUX + +#define GTEST_HAS_GETTIMEOFDAY_ 1 + +#include // NOLINT +#include // NOLINT +#include // NOLINT +// Declares vsnprintf(). This header is not available on Windows. +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include + +#elif GTEST_OS_ZOS +#define GTEST_HAS_GETTIMEOFDAY_ 1 +#include // NOLINT + +// On z/OS we additionally need strings.h for strcasecmp. +#include // NOLINT + +#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE. + +#include // NOLINT +#undef min + +#elif GTEST_OS_WINDOWS // We are on Windows proper. + +#include // NOLINT +#undef min + +#ifdef _MSC_VER +#include // NOLINT +#include // NOLINT +#endif + +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT + +#if GTEST_OS_WINDOWS_MINGW +// MinGW has gettimeofday() but not _ftime64(). +#define GTEST_HAS_GETTIMEOFDAY_ 1 +#include // NOLINT +#endif // GTEST_OS_WINDOWS_MINGW + +#else + +// Assume other platforms have gettimeofday(). +#define GTEST_HAS_GETTIMEOFDAY_ 1 + +// cpplint thinks that the header is already included, so we want to +// silence it. +#include // NOLINT +#include // NOLINT + +#endif // GTEST_OS_LINUX + +#if GTEST_HAS_EXCEPTIONS +#include +#endif + +#if GTEST_CAN_STREAM_RESULTS_ +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#endif + +#include "src/gtest-internal-inl.h" + +#if GTEST_OS_WINDOWS +#define vsnprintf _vsnprintf +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_MAC +#ifndef GTEST_OS_IOS +#include +#endif +#endif + +#if GTEST_HAS_ABSL +#include "absl/debugging/failure_signal_handler.h" +#include "absl/debugging/stacktrace.h" +#include "absl/debugging/symbolize.h" +#include "absl/strings/str_cat.h" +#endif // GTEST_HAS_ABSL + +namespace testing { + +using internal::CountIf; +using internal::ForEach; +using internal::GetElementOr; +using internal::Shuffle; + +// Constants. + +// A test whose test suite name or test name matches this filter is +// disabled and not run. +static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*"; + +// A test suite whose name matches this filter is considered a death +// test suite and will be run before test suites whose name doesn't +// match this filter. +static const char kDeathTestSuiteFilter[] = "*DeathTest:*DeathTest/*"; + +// A test filter that matches everything. +static const char kUniversalFilter[] = "*"; + +// The default output format. +static const char kDefaultOutputFormat[] = "xml"; +// The default output file. +static const char kDefaultOutputFile[] = "test_detail"; + +// The environment variable name for the test shard index. +static const char kTestShardIndex[] = "GTEST_SHARD_INDEX"; +// The environment variable name for the total number of test shards. +static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS"; +// The environment variable name for the test shard status file. +static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE"; + +namespace internal { + +// The text used in failure messages to indicate the start of the +// stack trace. 
+const char kStackTraceMarker[] = "\nStack trace:\n"; + +// g_help_flag is true if and only if the --help flag or an equivalent form +// is specified on the command line. +bool g_help_flag = false; + +// Utilty function to Open File for Writing +static FILE *OpenFileForWriting(const std::string &output_file) { + FILE *fileout = nullptr; + FilePath output_file_path(output_file); + FilePath output_dir(output_file_path.RemoveFileName()); + + if (output_dir.CreateDirectoriesRecursively()) { + fileout = posix::FOpen(output_file.c_str(), "w"); + } + if (fileout == nullptr) { + GTEST_LOG_(FATAL) << "Unable to open file \"" << output_file << "\""; + } + return fileout; +} + +} // namespace internal + +// Bazel passes in the argument to '--test_filter' via the TESTBRIDGE_TEST_ONLY +// environment variable. +static const char *GetDefaultFilter() { + const char *const testbridge_test_only = + internal::posix::GetEnv("TESTBRIDGE_TEST_ONLY"); + if (testbridge_test_only != nullptr) { + return testbridge_test_only; + } + return kUniversalFilter; +} + +GTEST_DEFINE_bool_( + also_run_disabled_tests, + internal::BoolFromGTestEnv("also_run_disabled_tests", false), + "Run disabled tests too, in addition to the tests normally being run."); + +GTEST_DEFINE_bool_( + break_on_failure, internal::BoolFromGTestEnv("break_on_failure", false), + "True if and only if a failed assertion should be a debugger " + "break-point."); + +GTEST_DEFINE_bool_(catch_exceptions, + internal::BoolFromGTestEnv("catch_exceptions", true), + "True if and only if " GTEST_NAME_ + " should catch exceptions and treat them as test failures."); + +GTEST_DEFINE_string_( + color, internal::StringFromGTestEnv("color", "auto"), + "Whether to use colors in the output. Valid values: yes, no, " + "and auto. 'auto' means to use colors if the output is " + "being sent to a terminal and the TERM environment variable " + "is set to a terminal type that supports colors."); + +GTEST_DEFINE_string_( + filter, internal::StringFromGTestEnv("filter", GetDefaultFilter()), + "A colon-separated list of glob (not regex) patterns " + "for filtering the tests to run, optionally followed by a " + "'-' and a : separated list of negative patterns (tests to " + "exclude). A test is run if it matches one of the positive " + "patterns and does not match any of the negative patterns."); + +GTEST_DEFINE_bool_( + install_failure_signal_handler, + internal::BoolFromGTestEnv("install_failure_signal_handler", false), + "If true and supported on the current platform, " GTEST_NAME_ + " should " + "install a signal handler that dumps debugging information when fatal " + "signals are raised."); + +GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them."); + +// The net priority order after flag processing is thus: +// --gtest_output command line flag +// GTEST_OUTPUT environment variable +// XML_OUTPUT_FILE environment variable +// '' +GTEST_DEFINE_string_( + output, + internal::StringFromGTestEnv("output", + internal::OutputFlagAlsoCheckEnvVar().c_str()), + "A format (defaults to \"xml\" but can be specified to be \"json\"), " + "optionally followed by a colon and an output file name or directory. " + "A directory is indicated by a trailing pathname separator. " + "Examples: \"xml:filename.xml\", \"xml::directoryname/\". 
" + "If a directory is specified, output files will be created " + "within that directory, with file-names based on the test " + "executable's name and, if necessary, made unique by adding " + "digits."); + +GTEST_DEFINE_bool_(print_time, internal::BoolFromGTestEnv("print_time", true), + "True if and only if " GTEST_NAME_ + " should display elapsed time in text output."); + +GTEST_DEFINE_bool_(print_utf8, internal::BoolFromGTestEnv("print_utf8", true), + "True if and only if " GTEST_NAME_ + " prints UTF8 characters as text."); + +GTEST_DEFINE_int32_( + random_seed, internal::Int32FromGTestEnv("random_seed", 0), + "Random number seed to use when shuffling test orders. Must be in range " + "[1, 99999], or 0 to use a seed based on the current time."); + +GTEST_DEFINE_int32_( + repeat, internal::Int32FromGTestEnv("repeat", 1), + "How many times to repeat each test. Specify a negative number " + "for repeating forever. Useful for shaking out flaky tests."); + +GTEST_DEFINE_bool_(show_internal_stack_frames, false, + "True if and only if " GTEST_NAME_ + " should include internal stack frames when " + "printing test failure stack traces."); + +GTEST_DEFINE_bool_(shuffle, internal::BoolFromGTestEnv("shuffle", false), + "True if and only if " GTEST_NAME_ + " should randomize tests' order on every run."); + +GTEST_DEFINE_int32_( + stack_trace_depth, + internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth), + "The maximum number of stack frames to print when an " + "assertion fails. The valid range is 0 through 100, inclusive."); + +GTEST_DEFINE_string_( + stream_result_to, internal::StringFromGTestEnv("stream_result_to", ""), + "This flag specifies the host name and the port number on which to stream " + "test results. Example: \"localhost:555\". The flag is effective only on " + "Linux."); + +GTEST_DEFINE_bool_( + throw_on_failure, internal::BoolFromGTestEnv("throw_on_failure", false), + "When this flag is specified, a failed assertion will throw an exception " + "if exceptions are enabled or exit the program with a non-zero code " + "otherwise. For use with an external test framework."); + +#if GTEST_USE_OWN_FLAGFILE_FLAG_ +GTEST_DEFINE_string_( + flagfile, internal::StringFromGTestEnv("flagfile", ""), + "This flag specifies the flagfile to read command-line flags from."); +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + +namespace internal { + +// Generates a random number from [0, range), using a Linear +// Congruential Generator (LCG). Crashes if 'range' is 0 or greater +// than kMaxRange. +uint32_t Random::Generate(uint32_t range) { + // These constants are the same as are used in glibc's rand(3). + // Use wider types than necessary to prevent unsigned overflow diagnostics. + state_ = static_cast(1103515245ULL * state_ + 12345U) % kMaxRange; + + GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0)."; + GTEST_CHECK_(range <= kMaxRange) + << "Generation of a number in [0, " << range << ") was requested, " + << "but this can only generate numbers in [0, " << kMaxRange << ")."; + + // Converting via modulus introduces a bit of downward bias, but + // it's simple, and a linear congruential generator isn't too good + // to begin with. + return state_ % range; +} + +// GTestIsInitialized() returns true if and only if the user has initialized +// Google Test. Useful for catching the user mistake of not initializing +// Google Test before calling RUN_ALL_TESTS(). 
+static bool GTestIsInitialized() { return GetArgvs().size() > 0; }
+
+// Iterates over a vector of TestSuites, keeping a running sum of the
+// results of calling a given int-returning method on each.
+// Returns the sum.
+static int SumOverTestSuiteList(const std::vector<TestSuite *> &case_list,
+                                int (TestSuite::*method)() const) {
+  int sum = 0;
+  for (size_t i = 0; i < case_list.size(); i++) {
+    sum += (case_list[i]->*method)();
+  }
+  return sum;
+}
+
+// Returns true if and only if the test suite passed.
+static bool TestSuitePassed(const TestSuite *test_suite) {
+  return test_suite->should_run() && test_suite->Passed();
+}
+
+// Returns true if and only if the test suite failed.
+static bool TestSuiteFailed(const TestSuite *test_suite) {
+  return test_suite->should_run() && test_suite->Failed();
+}
+
+// Returns true if and only if test_suite contains at least one test that
+// should run.
+static bool ShouldRunTestSuite(const TestSuite *test_suite) {
+  return test_suite->should_run();
+}
+
+// AssertHelper constructor.
+AssertHelper::AssertHelper(TestPartResult::Type type, const char *file,
+                           int line, const char *message)
+    : data_(new AssertHelperData(type, file, line, message)) {}
+
+AssertHelper::~AssertHelper() { delete data_; }
+
+// Message assignment, for assertion streaming support.
+void AssertHelper::operator=(const Message &message) const {
+  UnitTest::GetInstance()->AddTestPartResult(
+      data_->type, data_->file, data_->line,
+      AppendUserMessage(data_->message, message),
+      UnitTest::GetInstance()->impl()->CurrentOsStackTraceExceptTop(1)
+      // Skips the stack frame for this function itself.
+      );  // NOLINT
+}
+
+namespace {
+
+// When TEST_P is found without a matching INSTANTIATE_TEST_SUITE_P
+// to create test cases for it, a synthetic test case is
+// inserted to report either an error or a log message.
+//
+// This configuration bit will likely be removed at some point.
+constexpr bool kErrorOnUninstantiatedParameterizedTest = false;
+constexpr bool kErrorOnUninstantiatedTypeParameterizedTest = false;
+
+// A test that fails at a given file/line location with a given message.
+class FailureTest : public Test {
+ public:
+  explicit FailureTest(const CodeLocation &loc, std::string error_message,
+                       bool as_error)
+      : loc_(loc), error_message_(std::move(error_message)),
+        as_error_(as_error) {}
+
+  void TestBody() override {
+    if (as_error_) {
+      AssertHelper(TestPartResult::kNonFatalFailure, loc_.file.c_str(),
+                   loc_.line, "") = Message() << error_message_;
+    } else {
+      std::cout << error_message_ << std::endl;
+    }
+  }
+
+ private:
+  const CodeLocation loc_;
+  const std::string error_message_;
+  const bool as_error_;
+};
+
+}  // namespace
+
+std::set<std::string> *GetIgnoredParameterizedTestSuites() {
+  return UnitTest::GetInstance()->impl()->ignored_parameterized_test_suites();
+}
+
+// Adds a given test_suite to the list of those allowed to go uninstantiated.
+MarkAsIgnored::MarkAsIgnored(const char *test_suite) {
+  GetIgnoredParameterizedTestSuites()->insert(test_suite);
+}
+
+// If this parameterized test suite has no instantiations (and that
+// has not been marked as okay), emit a test case reporting that.
+void InsertSyntheticTestCase(const std::string &name, CodeLocation location,
+                             bool has_test_p) {
+  const auto &ignored = *GetIgnoredParameterizedTestSuites();
+  if (ignored.find(name) != ignored.end()) return;
+
+  const char kMissingInstantiation[] =  //
+      " is defined via TEST_P, but never instantiated. None of the test cases "
+      "will run. Either no INSTANTIATE_TEST_SUITE_P is provided or the only "
+      "ones provided expand to nothing."
+      "\n\n"
+      "Ideally, TEST_P definitions should only ever be included as part of "
+      "binaries that intend to use them. (As opposed to, for example, being "
+      "placed in a library that may be linked in to get other utilities.)";
+
+  const char kMissingTestCase[] =  //
+      " is instantiated via INSTANTIATE_TEST_SUITE_P, but no tests are "
+      "defined via TEST_P. No test cases will run."
+      "\n\n"
+      "Ideally, INSTANTIATE_TEST_SUITE_P should only ever be invoked from "
+      "code that always depends on code that provides TEST_P. Failing to do "
+      "so is often an indication of dead code, e.g. the last TEST_P was "
+      "removed but the rest got left behind.";
+
+  std::string message =
+      "Parameterized test suite " + name +
+      (has_test_p ? kMissingInstantiation : kMissingTestCase) +
+      "\n\n"
+      "To suppress this error for this test suite, insert the following line "
+      "(in a non-header) in the namespace it is defined in:"
+      "\n\n"
+      "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + name + ");";
+
+  std::string full_name = "UninstantiatedParameterizedTestSuite<" + name + ">";
+  RegisterTest(  //
+      "GoogleTestVerification", full_name.c_str(),
+      nullptr,  // No type parameter.
+      nullptr,  // No value parameter.
+      location.file.c_str(), location.line, [message, location] {
+        return new FailureTest(location, message,
+                               kErrorOnUninstantiatedParameterizedTest);
+      });
+}
+
+void RegisterTypeParameterizedTestSuite(const char *test_suite_name,
+                                        CodeLocation code_location) {
+  GetUnitTestImpl()->type_parameterized_test_registry().RegisterTestSuite(
+      test_suite_name, code_location);
+}
+
+void RegisterTypeParameterizedTestSuiteInstantiation(const char *case_name) {
+  GetUnitTestImpl()->type_parameterized_test_registry().RegisterInstantiation(
+      case_name);
+}
+
+void TypeParameterizedTestSuiteRegistry::RegisterTestSuite(
+    const char *test_suite_name, CodeLocation code_location) {
+  suites_.emplace(std::string(test_suite_name),
+                  TypeParameterizedTestSuiteInfo(code_location));
+}
+
+void TypeParameterizedTestSuiteRegistry::RegisterInstantiation(
+    const char *test_suite_name) {
+  auto it = suites_.find(std::string(test_suite_name));
+  if (it != suites_.end()) {
+    it->second.instantiated = true;
+  } else {
+    GTEST_LOG_(ERROR) << "Unknown type parameterized test suite '"
+                      << test_suite_name << "'";
+  }
+}
+
+void TypeParameterizedTestSuiteRegistry::CheckForInstantiations() {
+  const auto &ignored = *GetIgnoredParameterizedTestSuites();
+  for (const auto &testcase : suites_) {
+    if (testcase.second.instantiated) continue;
+    if (ignored.find(testcase.first) != ignored.end()) continue;
+
+    std::string message =
+        "Type parameterized test suite " + testcase.first +
+        " is defined via REGISTER_TYPED_TEST_SUITE_P, but never instantiated "
+        "via INSTANTIATE_TYPED_TEST_SUITE_P. None of the test cases will run."
+        "\n\n"
+        "Ideally, TYPED_TEST_P definitions should only ever be included as "
+        "part of binaries that intend to use them. (As opposed to, for "
+        "example, being placed in a library that may be linked in to get other "
+        "utilities.)"
+        "\n\n"
+        "To suppress this error for this test suite, insert the following line "
+        "(in a non-header) in the namespace it is defined in:"
+        "\n\n"
+        "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" +
+        testcase.first + ");";
+
+    std::string full_name =
+        "UninstantiatedTypeParameterizedTestSuite<" + testcase.first + ">";
+    RegisterTest(  //
+        "GoogleTestVerification", full_name.c_str(),
+        nullptr,  // No type parameter.
+        nullptr,  // No value parameter.
+        testcase.second.code_location.file.c_str(),
+        testcase.second.code_location.line, [message, testcase] {
+          return new FailureTest(testcase.second.code_location, message,
+                                 kErrorOnUninstantiatedTypeParameterizedTest);
+        });
+  }
+}
+
+// A copy of all command line arguments. Set by InitGoogleTest().
+static ::std::vector<std::string> g_argvs;
+
+::std::vector<std::string> GetArgvs() {
+#if defined(GTEST_CUSTOM_GET_ARGVS_)
+  // GTEST_CUSTOM_GET_ARGVS_() may return a container of std::string or
+  // ::string. This code converts it to the appropriate type.
+  const auto &custom = GTEST_CUSTOM_GET_ARGVS_();
+  return ::std::vector<std::string>(custom.begin(), custom.end());
+#else   // defined(GTEST_CUSTOM_GET_ARGVS_)
+  return g_argvs;
+#endif  // defined(GTEST_CUSTOM_GET_ARGVS_)
+}
+
+// Returns the current application's name, removing directory path if that
+// is present.
+FilePath GetCurrentExecutableName() {
+  FilePath result;
+
+#if GTEST_OS_WINDOWS || GTEST_OS_OS2
+  result.Set(FilePath(GetArgvs()[0]).RemoveExtension("exe"));
+#else
+  result.Set(FilePath(GetArgvs()[0]));
+#endif  // GTEST_OS_WINDOWS
+
+  return result.RemoveDirectoryName();
+}
+
+// Functions for processing the gtest_output flag.
+
+// Returns the output format, or "" for normal printed output.
+std::string UnitTestOptions::GetOutputFormat() {
+  const char *const gtest_output_flag = GTEST_FLAG(output).c_str();
+  const char *const colon = strchr(gtest_output_flag, ':');
+  return (colon == nullptr)
+             ? std::string(gtest_output_flag)
+             : std::string(gtest_output_flag,
+                           static_cast<size_t>(colon - gtest_output_flag));
+}
+
+// Returns the name of the requested output file, or the default if none
+// was explicitly specified.
+std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
+  const char *const gtest_output_flag = GTEST_FLAG(output).c_str();
+
+  std::string format = GetOutputFormat();
+  if (format.empty()) format = std::string(kDefaultOutputFormat);
+
+  const char *const colon = strchr(gtest_output_flag, ':');
+  if (colon == nullptr)
+    return internal::FilePath::MakeFileName(
+               internal::FilePath(
+                   UnitTest::GetInstance()->original_working_dir()),
+               internal::FilePath(kDefaultOutputFile), 0, format.c_str())
+        .string();
+
+  internal::FilePath output_name(colon + 1);
+  if (!output_name.IsAbsolutePath())
+    output_name = internal::FilePath::ConcatPaths(
+        internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(colon + 1));
+
+  if (!output_name.IsDirectory()) return output_name.string();
+
+  internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
+      output_name, internal::GetCurrentExecutableName(),
+      GetOutputFormat().c_str()));
+  return result.string();
+}
+
+// Returns true if and only if the wildcard pattern matches the string.
+// The first ':' or '\0' character in pattern marks the end of it.
+//
+// This recursive algorithm isn't very efficient, but is clear and
+// works well enough for matching test names, which are short.
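As a standalone illustration of that recursion, here is a sketch with the same '?' and '*' semantics, written over std::string rather than the ':'-terminated C strings the real implementation below consumes:

    #include <cassert>
    #include <string>

    // Matches gtest-style glob patterns: '?' = any one char, '*' = any run.
    static bool GlobMatch(const std::string &pat, const std::string &str,
                          size_t p = 0, size_t s = 0) {
      if (p == pat.size()) return s == str.size();
      switch (pat[p]) {
        case '?':  // consume exactly one character
          return s < str.size() && GlobMatch(pat, str, p + 1, s + 1);
        case '*':  // consume one subject char, or move past the star
          return (s < str.size() && GlobMatch(pat, str, p, s + 1)) ||
                 GlobMatch(pat, str, p + 1, s);
        default:   // ordinary character matches itself
          return s < str.size() && pat[p] == str[s] &&
                 GlobMatch(pat, str, p + 1, s + 1);
      }
    }

    int main() {
      assert(GlobMatch("FooTest.*", "FooTest.Bar"));
      assert(GlobMatch("*/DISABLED_*", "Ints/DISABLED_Overflow"));
      assert(!GlobMatch("FooTest.?", "FooTest.Bar"));
    }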
+bool UnitTestOptions::PatternMatchesString(const char *pattern,
+                                           const char *str) {
+  switch (*pattern) {
+    case '\0':
+    case ':':  // Either ':' or '\0' marks the end of the pattern.
+      return *str == '\0';
+    case '?':  // Matches any single character.
+      return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
+    case '*':  // Matches any string (possibly empty) of characters.
+      return (*str != '\0' && PatternMatchesString(pattern, str + 1)) ||
+             PatternMatchesString(pattern + 1, str);
+    default:  // Non-special character. Matches itself.
+      return *pattern == *str && PatternMatchesString(pattern + 1, str + 1);
+  }
+}
+
+bool UnitTestOptions::MatchesFilter(const std::string &name,
+                                    const char *filter) {
+  const char *cur_pattern = filter;
+  for (;;) {
+    if (PatternMatchesString(cur_pattern, name.c_str())) {
+      return true;
+    }
+
+    // Finds the next pattern in the filter.
+    cur_pattern = strchr(cur_pattern, ':');
+
+    // Returns if no more pattern can be found.
+    if (cur_pattern == nullptr) {
+      return false;
+    }
+
+    // Skips the pattern separator (the ':' character).
+    cur_pattern++;
+  }
+}
+
+// Returns true if and only if the user-specified filter matches the test
+// suite name and the test name.
+bool UnitTestOptions::FilterMatchesTest(const std::string &test_suite_name,
+                                        const std::string &test_name) {
+  const std::string &full_name = test_suite_name + "." + test_name.c_str();
+
+  // Split --gtest_filter at '-', if there is one, to separate into
+  // positive filter and negative filter portions
+  const char *const p = GTEST_FLAG(filter).c_str();
+  const char *const dash = strchr(p, '-');
+  std::string positive;
+  std::string negative;
+  if (dash == nullptr) {
+    positive = GTEST_FLAG(filter).c_str();  // Whole string is a positive filter
+    negative = "";
+  } else {
+    positive = std::string(p, dash);   // Everything up to the dash
+    negative = std::string(dash + 1);  // Everything after the dash
+    if (positive.empty()) {
+      // Treat '-test1' as the same as '*-test1'
+      positive = kUniversalFilter;
+    }
+  }
+
+  // A filter is a colon-separated list of patterns. It matches a
+  // test if any pattern in it matches the test.
+  return (MatchesFilter(full_name, positive.c_str()) &&
+          !MatchesFilter(full_name, negative.c_str()));
+}
+
+#if GTEST_HAS_SEH
+// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+// This function is useful as an __except condition.
+int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
+  // Google Test should handle a SEH exception if:
+  //   1. the user wants it to, AND
+  //   2. this is not a breakpoint exception, AND
+  //   3. this is not a C++ exception (VC++ implements them via SEH,
+  //      apparently).
+  //
+  // SEH exception code for C++ exceptions.
+  // (see http://support.microsoft.com/kb/185294 for more information).
+  const DWORD kCxxExceptionCode = 0xe06d7363;
+
+  bool should_handle = true;
+
+  if (!GTEST_FLAG(catch_exceptions))
+    should_handle = false;
+  else if (exception_code == EXCEPTION_BREAKPOINT)
+    should_handle = false;
+  else if (exception_code == kCxxExceptionCode)
+    should_handle = false;
+
+  return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH;
+}
+#endif  // GTEST_HAS_SEH
+
+}  // namespace internal
+
+// The c'tor sets this object as the test part result reporter used by
+// Google Test. The 'result' parameter specifies where to report the
+// results. Intercepts only failures from the current thread.
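The positive/negative split performed by FilterMatchesTest() above can be seen in isolation. This sketch reproduces just the '-' handling, including the rule that an empty positive half means '*':

    #include <iostream>
    #include <string>

    int main() {
      const std::string filter = "-FooTest.Crashes:BarTest.*";
      const size_t dash = filter.find('-');
      std::string positive =
          dash == std::string::npos ? filter : filter.substr(0, dash);
      std::string negative =
          dash == std::string::npos ? "" : filter.substr(dash + 1);
      if (positive.empty()) positive = "*";  // '-X' means '*-X'
      std::cout << "positive: " << positive << "\n"   // *
                << "negative: " << negative << "\n";  // FooTest.Crashes:BarTest.*
    }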
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( + TestPartResultArray *result) + : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) { + Init(); +} + +// The c'tor sets this object as the test part result reporter used by +// Google Test. The 'result' parameter specifies where to report the +// results. +ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( + InterceptMode intercept_mode, TestPartResultArray *result) + : intercept_mode_(intercept_mode), result_(result) { + Init(); +} + +void ScopedFakeTestPartResultReporter::Init() { + internal::UnitTestImpl *const impl = internal::GetUnitTestImpl(); + if (intercept_mode_ == INTERCEPT_ALL_THREADS) { + old_reporter_ = impl->GetGlobalTestPartResultReporter(); + impl->SetGlobalTestPartResultReporter(this); + } else { + old_reporter_ = impl->GetTestPartResultReporterForCurrentThread(); + impl->SetTestPartResultReporterForCurrentThread(this); + } +} + +// The d'tor restores the test part result reporter used by Google Test +// before. +ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() { + internal::UnitTestImpl *const impl = internal::GetUnitTestImpl(); + if (intercept_mode_ == INTERCEPT_ALL_THREADS) { + impl->SetGlobalTestPartResultReporter(old_reporter_); + } else { + impl->SetTestPartResultReporterForCurrentThread(old_reporter_); + } +} + +// Increments the test part result count and remembers the result. +// This method is from the TestPartResultReporterInterface interface. +void ScopedFakeTestPartResultReporter::ReportTestPartResult( + const TestPartResult &result) { + result_->Append(result); +} + +namespace internal { + +// Returns the type ID of ::testing::Test. We should always call this +// instead of GetTypeId< ::testing::Test>() to get the type ID of +// testing::Test. This is to work around a suspected linker bug when +// using Google Test as a framework on Mac OS X. The bug causes +// GetTypeId< ::testing::Test>() to return different values depending +// on whether the call is from the Google Test framework itself or +// from user test code. GetTestTypeId() is guaranteed to always +// return the same value, as it always calls GetTypeId<>() from the +// gtest.cc, which is within the Google Test framework. +TypeId GetTestTypeId() { return GetTypeId(); } + +// The value of GetTestTypeId() as seen from within the Google Test +// library. This is solely for testing GetTestTypeId(). +extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId(); + +// This predicate-formatter checks that 'results' contains a test part +// failure of the given type and that the failure message contains the +// given substring. +static AssertionResult HasOneFailure(const char * /* results_expr */, + const char * /* type_expr */, + const char * /* substr_expr */, + const TestPartResultArray &results, + TestPartResult::Type type, + const std::string &substr) { + const std::string expected(type == TestPartResult::kFatalFailure + ? 
"1 fatal failure" + : "1 non-fatal failure"); + Message msg; + if (results.size() != 1) { + msg << "Expected: " << expected << "\n" + << " Actual: " << results.size() << " failures"; + for (int i = 0; i < results.size(); i++) { + msg << "\n" << results.GetTestPartResult(i); + } + return AssertionFailure() << msg; + } + + const TestPartResult &r = results.GetTestPartResult(0); + if (r.type() != type) { + return AssertionFailure() << "Expected: " << expected << "\n" + << " Actual:\n" + << r; + } + + if (strstr(r.message(), substr.c_str()) == nullptr) { + return AssertionFailure() + << "Expected: " << expected << " containing \"" << substr << "\"\n" + << " Actual:\n" + << r; + } + + return AssertionSuccess(); +} + +// The constructor of SingleFailureChecker remembers where to look up +// test part results, what type of failure we expect, and what +// substring the failure message should contain. +SingleFailureChecker::SingleFailureChecker(const TestPartResultArray *results, + TestPartResult::Type type, + const std::string &substr) + : results_(results), type_(type), substr_(substr) {} + +// The destructor of SingleFailureChecker verifies that the given +// TestPartResultArray contains exactly one failure that has the given +// type and contains the given substring. If that's not the case, a +// non-fatal failure will be generated. +SingleFailureChecker::~SingleFailureChecker() { + EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_); +} + +DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter( + UnitTestImpl *unit_test) + : unit_test_(unit_test) {} + +void DefaultGlobalTestPartResultReporter::ReportTestPartResult( + const TestPartResult &result) { + unit_test_->current_test_result()->AddTestPartResult(result); + unit_test_->listeners()->repeater()->OnTestPartResult(result); +} + +DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter( + UnitTestImpl *unit_test) + : unit_test_(unit_test) {} + +void DefaultPerThreadTestPartResultReporter::ReportTestPartResult( + const TestPartResult &result) { + unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result); +} + +// Returns the global test part result reporter. +TestPartResultReporterInterface * +UnitTestImpl::GetGlobalTestPartResultReporter() { + internal::MutexLock lock(&global_test_part_result_reporter_mutex_); + return global_test_part_result_repoter_; +} + +// Sets the global test part result reporter. +void UnitTestImpl::SetGlobalTestPartResultReporter( + TestPartResultReporterInterface *reporter) { + internal::MutexLock lock(&global_test_part_result_reporter_mutex_); + global_test_part_result_repoter_ = reporter; +} + +// Returns the test part result reporter for the current thread. +TestPartResultReporterInterface * +UnitTestImpl::GetTestPartResultReporterForCurrentThread() { + return per_thread_test_part_result_reporter_.get(); +} + +// Sets the test part result reporter for the current thread. +void UnitTestImpl::SetTestPartResultReporterForCurrentThread( + TestPartResultReporterInterface *reporter) { + per_thread_test_part_result_reporter_.set(reporter); +} + +// Gets the number of successful test suites. +int UnitTestImpl::successful_test_suite_count() const { + return CountIf(test_suites_, TestSuitePassed); +} + +// Gets the number of failed test suites. +int UnitTestImpl::failed_test_suite_count() const { + return CountIf(test_suites_, TestSuiteFailed); +} + +// Gets the number of all test suites. 
+int UnitTestImpl::total_test_suite_count() const { + return static_cast(test_suites_.size()); +} + +// Gets the number of all test suites that contain at least one test +// that should run. +int UnitTestImpl::test_suite_to_run_count() const { + return CountIf(test_suites_, ShouldRunTestSuite); +} + +// Gets the number of successful tests. +int UnitTestImpl::successful_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::successful_test_count); +} + +// Gets the number of skipped tests. +int UnitTestImpl::skipped_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::skipped_test_count); +} + +// Gets the number of failed tests. +int UnitTestImpl::failed_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::failed_test_count); +} + +// Gets the number of disabled tests that will be reported in the XML report. +int UnitTestImpl::reportable_disabled_test_count() const { + return SumOverTestSuiteList(test_suites_, + &TestSuite::reportable_disabled_test_count); +} + +// Gets the number of disabled tests. +int UnitTestImpl::disabled_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::disabled_test_count); +} + +// Gets the number of tests to be printed in the XML report. +int UnitTestImpl::reportable_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::reportable_test_count); +} + +// Gets the number of all tests. +int UnitTestImpl::total_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::total_test_count); +} + +// Gets the number of tests that should run. +int UnitTestImpl::test_to_run_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::test_to_run_count); +} + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// CurrentOsStackTraceExceptTop(1), Foo() will be included in the +// trace but Bar() and CurrentOsStackTraceExceptTop() won't. +std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) { + return os_stack_trace_getter()->CurrentStackTrace( + static_cast(GTEST_FLAG(stack_trace_depth)), skip_count + 1 + // Skips the user-specified number of frames plus this function + // itself. + ); // NOLINT +} + +// Returns the current time in milliseconds. +TimeInMillis GetTimeInMillis() { +#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__) + // Difference between 1970-01-01 and 1601-01-01 in milliseconds. + // http://analogous.blogspot.com/2005/04/epoch.html + const TimeInMillis kJavaEpochToWinFileTimeDelta = + static_cast(116444736UL) * 100000UL; + const DWORD kTenthMicrosInMilliSecond = 10000; + + SYSTEMTIME now_systime; + FILETIME now_filetime; + ULARGE_INTEGER now_int64; + GetSystemTime(&now_systime); + if (SystemTimeToFileTime(&now_systime, &now_filetime)) { + now_int64.LowPart = now_filetime.dwLowDateTime; + now_int64.HighPart = now_filetime.dwHighDateTime; + now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) - + kJavaEpochToWinFileTimeDelta; + return now_int64.QuadPart; + } + return 0; +#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_ + __timeb64 now; + + // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996 + // (deprecated function) there. 
+ GTEST_DISABLE_MSC_DEPRECATED_PUSH_() + _ftime64(&now); + GTEST_DISABLE_MSC_DEPRECATED_POP_() + + return static_cast(now.time) * 1000 + now.millitm; +#elif GTEST_HAS_GETTIMEOFDAY_ + struct timeval now; + gettimeofday(&now, nullptr); + return static_cast(now.tv_sec) * 1000 + now.tv_usec / 1000; +#else +#error "Don't know how to get the current time on your system." +#endif +} + +// Utilities + +// class String. + +#if GTEST_OS_WINDOWS_MOBILE +// Creates a UTF-16 wide string from the given ANSI string, allocating +// memory using new. The caller is responsible for deleting the return +// value using delete[]. Returns the wide string, or NULL if the +// input is NULL. +LPCWSTR String::AnsiToUtf16(const char *ansi) { + if (!ansi) return nullptr; + const int length = strlen(ansi); + const int unicode_length = + MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0); + WCHAR *unicode = new WCHAR[unicode_length + 1]; + MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length); + unicode[unicode_length] = 0; + return unicode; +} + +// Creates an ANSI string from the given wide string, allocating +// memory using new. The caller is responsible for deleting the return +// value using delete[]. Returns the ANSI string, or NULL if the +// input is NULL. +const char *String::Utf16ToAnsi(LPCWSTR utf16_str) { + if (!utf16_str) return nullptr; + const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr, + 0, nullptr, nullptr); + char *ansi = new char[ansi_length + 1]; + WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, ansi, ansi_length, nullptr, + nullptr); + ansi[ansi_length] = 0; + return ansi; +} + +#endif // GTEST_OS_WINDOWS_MOBILE + +// Compares two C strings. Returns true if and only if they have the same +// content. +// +// Unlike strcmp(), this function can handle NULL argument(s). A NULL +// C string is considered different to any non-NULL C string, +// including the empty string. +bool String::CStringEquals(const char *lhs, const char *rhs) { + if (lhs == nullptr) return rhs == nullptr; + + if (rhs == nullptr) return false; + + return strcmp(lhs, rhs) == 0; +} + +#if GTEST_HAS_STD_WSTRING + +// Converts an array of wide chars to a narrow string using the UTF-8 +// encoding, and streams the result to the given Message object. +static void StreamWideCharsToMessage(const wchar_t *wstr, size_t length, + Message *msg) { + for (size_t i = 0; i != length;) { // NOLINT + if (wstr[i] != L'\0') { + *msg << WideStringToUtf8(wstr + i, static_cast(length - i)); + while (i != length && wstr[i] != L'\0') i++; + } else { + *msg << '\0'; + i++; + } + } +} + +#endif // GTEST_HAS_STD_WSTRING + +void SplitString(const ::std::string &str, char delimiter, + ::std::vector< ::std::string> *dest) { + ::std::vector< ::std::string> parsed; + ::std::string::size_type pos = 0; + while (::testing::internal::AlwaysTrue()) { + const ::std::string::size_type colon = str.find(delimiter, pos); + if (colon == ::std::string::npos) { + parsed.push_back(str.substr(pos)); + break; + } else { + parsed.push_back(str.substr(pos, colon - pos)); + pos = colon + 1; + } + } + dest->swap(parsed); +} + +} // namespace internal + +// Constructs an empty Message. +// We allocate the stringstream separately because otherwise each use of +// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's +// stack frame leading to huge stack frames in some cases; gcc does not reuse +// the stack space. 
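SplitString() above scans with find(), emitting the piece before each delimiter and always a final (possibly empty) piece, so "a:b::c" yields four parts. Equivalent logic in brief (an illustrative re-implementation, not the gtest function itself):

    #include <iostream>
    #include <string>
    #include <vector>

    static std::vector<std::string> Split(const std::string &str, char delim) {
      std::vector<std::string> out;
      std::string::size_type pos = 0;
      for (;;) {
        const std::string::size_type next = str.find(delim, pos);
        if (next == std::string::npos) {
          out.push_back(str.substr(pos));  // trailing piece, may be empty
          return out;
        }
        out.push_back(str.substr(pos, next - pos));
        pos = next + 1;
      }
    }

    int main() {
      for (const std::string &part : Split("a:b::c", ':'))
        std::cout << '[' << part << "] ";  // [a] [b] [] [c]
      std::cout << '\n';
    }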
+Message::Message() : ss_(new ::std::stringstream) { + // By default, we want there to be enough precision when printing + // a double to a Message. + *ss_ << std::setprecision(std::numeric_limits::digits10 + 2); +} + +// These two overloads allow streaming a wide C string to a Message +// using the UTF-8 encoding. +Message &Message::operator<<(const wchar_t *wide_c_str) { + return *this << internal::String::ShowWideCString(wide_c_str); +} +Message &Message::operator<<(wchar_t *wide_c_str) { + return *this << internal::String::ShowWideCString(wide_c_str); +} + +#if GTEST_HAS_STD_WSTRING +// Converts the given wide string to a narrow string using the UTF-8 +// encoding, and streams the result to this Message object. +Message &Message::operator<<(const ::std::wstring &wstr) { + internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); + return *this; +} +#endif // GTEST_HAS_STD_WSTRING + +// Gets the text streamed to this object so far as an std::string. +// Each '\0' character in the buffer is replaced with "\\0". +std::string Message::GetString() const { + return internal::StringStreamToString(ss_.get()); +} + +// AssertionResult constructors. +// Used in EXPECT_TRUE/FALSE(assertion_result). +AssertionResult::AssertionResult(const AssertionResult &other) + : success_(other.success_), + message_(other.message_.get() != nullptr + ? new ::std::string(*other.message_) + : static_cast< ::std::string *>(nullptr)) {} + +// Swaps two AssertionResults. +void AssertionResult::swap(AssertionResult &other) { + using std::swap; + swap(success_, other.success_); + swap(message_, other.message_); +} + +// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. +AssertionResult AssertionResult::operator!() const { + AssertionResult negation(!success_); + if (message_.get() != nullptr) negation << *message_; + return negation; +} + +// Makes a successful assertion result. +AssertionResult AssertionSuccess() { return AssertionResult(true); } + +// Makes a failed assertion result. +AssertionResult AssertionFailure() { return AssertionResult(false); } + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << message. +AssertionResult AssertionFailure(const Message &message) { + return AssertionFailure() << message; +} + +namespace internal { + +namespace edit_distance { +std::vector CalculateOptimalEdits(const std::vector &left, + const std::vector &right) { + std::vector > costs( + left.size() + 1, std::vector(right.size() + 1)); + std::vector > best_move( + left.size() + 1, std::vector(right.size() + 1)); + + // Populate for empty right. + for (size_t l_i = 0; l_i < costs.size(); ++l_i) { + costs[l_i][0] = static_cast(l_i); + best_move[l_i][0] = kRemove; + } + // Populate for empty left. + for (size_t r_i = 1; r_i < costs[0].size(); ++r_i) { + costs[0][r_i] = static_cast(r_i); + best_move[0][r_i] = kAdd; + } + + for (size_t l_i = 0; l_i < left.size(); ++l_i) { + for (size_t r_i = 0; r_i < right.size(); ++r_i) { + if (left[l_i] == right[r_i]) { + // Found a match. Consume it. 
+
+namespace {
+
+// Helper class to convert strings into ids with deduplication.
+class InternalStrings {
+ public:
+  size_t GetId(const std::string &str) {
+    IdMap::iterator it = ids_.find(str);
+    if (it != ids_.end()) return it->second;
+    size_t id = ids_.size();
+    return ids_[str] = id;
+  }
+
+ private:
+  typedef std::map<std::string, size_t> IdMap;
+  IdMap ids_;
+};
+
+}  // namespace
+
+std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<std::string> &left,
+    const std::vector<std::string> &right) {
+  std::vector<size_t> left_ids, right_ids;
+  {
+    InternalStrings intern_table;
+    for (size_t i = 0; i < left.size(); ++i) {
+      left_ids.push_back(intern_table.GetId(left[i]));
+    }
+    for (size_t i = 0; i < right.size(); ++i) {
+      right_ids.push_back(intern_table.GetId(right[i]));
+    }
+  }
+  return CalculateOptimalEdits(left_ids, right_ids);
+}
+
+namespace {
+
+// Helper class that holds the state for one hunk and prints it out to the
+// stream.
+// It reorders adds/removes when possible to group all removes before all
+// adds. It also adds the hunk header before printing into the stream.
+class Hunk {
+ public:
+  Hunk(size_t left_start, size_t right_start)
+      : left_start_(left_start), right_start_(right_start), adds_(),
+        removes_(), common_() {}
+
+  void PushLine(char edit, const char *line) {
+    switch (edit) {
+      case ' ':
+        ++common_;
+        FlushEdits();
+        hunk_.push_back(std::make_pair(' ', line));
+        break;
+      case '-':
+        ++removes_;
+        hunk_removes_.push_back(std::make_pair('-', line));
+        break;
+      case '+':
+        ++adds_;
+        hunk_adds_.push_back(std::make_pair('+', line));
+        break;
+    }
+  }
+
+  void PrintTo(std::ostream *os) {
+    PrintHeader(os);
+    FlushEdits();
+    for (std::list<std::pair<char, const char *> >::const_iterator it =
+             hunk_.begin();
+         it != hunk_.end(); ++it) {
+      *os << it->first << it->second << "\n";
+    }
+  }
+
+  bool has_edits() const { return adds_ || removes_; }
+
+ private:
+  void FlushEdits() {
+    hunk_.splice(hunk_.end(), hunk_removes_);
+    hunk_.splice(hunk_.end(), hunk_adds_);
+  }
+
+  // Print a unified diff header for one hunk.
+  // The format is
+  //   "@@ -<left_start>,<left_length> +<right_start>,<right_length> @@"
+  // where the left/right parts are omitted if unnecessary.
+  void PrintHeader(std::ostream *ss) const {
+    *ss << "@@ ";
+    if (removes_) {
+      *ss << "-" << left_start_ << "," << (removes_ + common_);
+    }
+    if (removes_ && adds_) {
+      *ss << " ";
+    }
+    if (adds_) {
+      *ss << "+" << right_start_ << "," << (adds_ + common_);
+    }
+    *ss << " @@\n";
+  }
+
+  size_t left_start_, right_start_;
+  size_t adds_, removes_, common_;
+  std::list<std::pair<char, const char *> > hunk_, hunk_adds_, hunk_removes_;
+};
+
+}  // namespace
+
+// Create a list of diff hunks in Unified diff format.
+// Each hunk has a header generated by PrintHeader above plus a body with
+// lines prefixed with ' ' for no change, '-' for deletion and '+' for
+// addition.
+// 'context' represents the desired unchanged prefix/suffix around the diff.
+// If two hunks are close enough that their contexts overlap, then they are
+// joined into one hunk.
+std::string CreateUnifiedDiff(const std::vector<std::string> &left,
+                              const std::vector<std::string> &right,
+                              size_t context) {
+  const std::vector<EditType> edits = CalculateOptimalEdits(left, right);
+
+  size_t l_i = 0, r_i = 0, edit_i = 0;
+  std::stringstream ss;
+  while (edit_i < edits.size()) {
+    // Find first edit.
+    while (edit_i < edits.size() && edits[edit_i] == kMatch) {
+      ++l_i;
+      ++r_i;
+      ++edit_i;
+    }
+
+    // Find the first line to include in the hunk.
+    const size_t prefix_context = std::min(l_i, context);
+    Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1);
+    for (size_t i = prefix_context; i > 0; --i) {
+      hunk.PushLine(' ', left[l_i - i].c_str());
+    }
+
+    // Iterate the edits until we find enough suffix for the hunk or the input
+    // is over.
+    size_t n_suffix = 0;
+    for (; edit_i < edits.size(); ++edit_i) {
+      if (n_suffix >= context) {
+        // Continue only if the next hunk is very close.
+        auto it = edits.begin() + static_cast<ptrdiff_t>(edit_i);
+        while (it != edits.end() && *it == kMatch) ++it;
+        if (it == edits.end() ||
+            static_cast<size_t>(it - edits.begin()) - edit_i >= context) {
+          // There is no next edit or it is too far away.
+          break;
+        }
+      }
+
+      EditType edit = edits[edit_i];
+      // Reset count when a non match is found.
+      n_suffix = edit == kMatch ? n_suffix + 1 : 0;
+
+      if (edit == kMatch || edit == kRemove || edit == kReplace) {
+        hunk.PushLine(edit == kMatch ? ' ' : '-', left[l_i].c_str());
+      }
+      if (edit == kAdd || edit == kReplace) {
+        hunk.PushLine('+', right[r_i].c_str());
+      }
+
+      // Advance indices, depending on edit type.
+      l_i += edit != kAdd;
+      r_i += edit != kRemove;
+    }
+
+    if (!hunk.has_edits()) {
+      // We are done. We don't want this hunk.
+      break;
+    }
+
+    hunk.PrintTo(&ss);
+  }
+  return ss.str();
+}
+
+}  // namespace edit_distance
+
+namespace {
+
+// The string representations of the values received in EqFailure() are
+// already escaped. Split them on escaped '\n' boundaries. Leave all other
+// escaped characters the same.
+std::vector<std::string> SplitEscapedString(const std::string &str) {
+  std::vector<std::string> lines;
+  size_t start = 0, end = str.size();
+  if (end > 2 && str[0] == '"' && str[end - 1] == '"') {
+    ++start;
+    --end;
+  }
+  bool escaped = false;
+  for (size_t i = start; i + 1 < end; ++i) {
+    if (escaped) {
+      escaped = false;
+      if (str[i] == 'n') {
+        lines.push_back(str.substr(start, i - start - 1));
+        start = i + 1;
+      }
+    } else {
+      escaped = str[i] == '\\';
+    }
+  }
+  lines.push_back(str.substr(start, end - start));
+  return lines;
+}
+
+}  // namespace
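[Editorial aside.] A worked example of the output format, assuming CreateUnifiedDiff is called directly (it is internal; EXPECT_EQ failures reach it through EqFailure below):

// left = {"a", "b", "c"}, right = {"a", "B", "c"}, context = 1 yields:
//
//   @@ -1,3 +1,3 @@
//    a
//   -b
//   +B
//    c
std::vector<std::string> left = {"a", "b", "c"};
std::vector<std::string> right = {"a", "B", "c"};
std::string diff =
    testing::internal::edit_distance::CreateUnifiedDiff(left, right, 1);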
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings. For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+//   lhs_expression: "foo"
+//   rhs_expression: "bar"
+//   lhs_value:      "5"
+//   rhs_value:      "6"
+//
+// The ignoring_case parameter is true if and only if the assertion is a
+// *_STRCASEEQ*. When it's true, the string "Ignoring case" will
+// be inserted into the message.
+AssertionResult EqFailure(const char *lhs_expression,
+                          const char *rhs_expression,
+                          const std::string &lhs_value,
+                          const std::string &rhs_value, bool ignoring_case) {
+  Message msg;
+  msg << "Expected equality of these values:";
+  msg << "\n  " << lhs_expression;
+  if (lhs_value != lhs_expression) {
+    msg << "\n    Which is: " << lhs_value;
+  }
+  msg << "\n  " << rhs_expression;
+  if (rhs_value != rhs_expression) {
+    msg << "\n    Which is: " << rhs_value;
+  }
+
+  if (ignoring_case) {
+    msg << "\nIgnoring case";
+  }
+
+  if (!lhs_value.empty() && !rhs_value.empty()) {
+    const std::vector<std::string> lhs_lines = SplitEscapedString(lhs_value);
+    const std::vector<std::string> rhs_lines = SplitEscapedString(rhs_value);
+    if (lhs_lines.size() > 1 || rhs_lines.size() > 1) {
+      msg << "\nWith diff:\n"
+          << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines);
+    }
+  }
+
+  return AssertionFailure() << msg;
+}
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+std::string GetBoolAssertionFailureMessage(
+    const AssertionResult &assertion_result, const char *expression_text,
+    const char *actual_predicate_value, const char *expected_predicate_value) {
+  const char *actual_message = assertion_result.message();
+  Message msg;
+  msg << "Value of: " << expression_text
+      << "\n  Actual: " << actual_predicate_value;
+  if (actual_message[0] != '\0') msg << " (" << actual_message << ")";
+  msg << "\nExpected: " << expected_predicate_value;
+  return msg.GetString();
+}
+
+// Helper function for implementing ASSERT_NEAR.
+AssertionResult DoubleNearPredFormat(const char *expr1, const char *expr2,
+                                     const char *abs_error_expr, double val1,
+                                     double val2, double abs_error) {
+  const double diff = fabs(val1 - val2);
+  if (diff <= abs_error) return AssertionSuccess();
+
+  return AssertionFailure()
+         << "The difference between " << expr1 << " and " << expr2 << " is "
+         << diff << ", which exceeds " << abs_error_expr << ", where\n"
+         << expr1 << " evaluates to " << val1 << ",\n"
+         << expr2 << " evaluates to " << val2 << ", and\n"
+         << abs_error_expr << " evaluates to " << abs_error << ".";
+}
+
+// Helper template for implementing FloatLE() and DoubleLE().
+template <typename RawType>
+AssertionResult FloatingPointLE(const char *expr1, const char *expr2,
+                                RawType val1, RawType val2) {
+  // Returns success if val1 is less than val2,
+  if (val1 < val2) {
+    return AssertionSuccess();
+  }
+
+  // or if val1 is almost equal to val2.
+  const FloatingPoint<RawType> lhs(val1), rhs(val2);
+  if (lhs.AlmostEquals(rhs)) {
+    return AssertionSuccess();
+  }
+
+  // Note that the above two checks will both fail if either val1 or
+  // val2 is NaN, as the IEEE floating-point standard requires that
+  // any predicate involving a NaN must return false.
+
+  ::std::stringstream val1_ss;
+  val1_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+          << val1;
+
+  ::std::stringstream val2_ss;
+  val2_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+          << val2;
+
+  return AssertionFailure()
+         << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+         << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
+         << StringStreamToString(&val2_ss);
+}
+
+}  // namespace internal
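[Editorial aside.] FloatLE below therefore accepts a val1 slightly above val2: FloatingPoint<T>::AlmostEquals treats values within a few units in the last place (4 ULPs in gtest's implementation) as equal. A sketch of that boundary case, assuming only the standard library:

#include <cmath>

void FloatLeUlpSketch() {
  const float v = 1.0f;
  // Two representable steps above v: greater than v, but within 4 ULPs.
  const float v_plus_2ulp = std::nextafter(std::nextafter(v, 2.0f), 2.0f);
  // EXPECT_PRED_FORMAT2(::testing::FloatLE, v_plus_2ulp, v) would pass:
  // v_plus_2ulp > v numerically, yet the two are almost equal in ULPs.
  (void)v_plus_2ulp;
}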
+
+// Asserts that val1 is less than, or almost equal to, val2. Fails
+// otherwise. In particular, it fails if either val1 or val2 is NaN.
+AssertionResult FloatLE(const char *expr1, const char *expr2, float val1,
+                        float val2) {
+  return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
+}
+
+// Asserts that val1 is less than, or almost equal to, val2. Fails
+// otherwise. In particular, it fails if either val1 or val2 is NaN.
+AssertionResult DoubleLE(const char *expr1, const char *expr2, double val1,
+                         double val2) {
+  return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
+}
+
+namespace internal {
+
+// The helper function for {ASSERT|EXPECT}_EQ with int or enum
+// arguments.
+AssertionResult CmpHelperEQ(const char *lhs_expression,
+                            const char *rhs_expression, BiggestInt lhs,
+                            BiggestInt rhs) {
+  if (lhs == rhs) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(lhs_expression, rhs_expression,
+                   FormatForComparisonFailureMessage(lhs, rhs),
+                   FormatForComparisonFailureMessage(rhs, lhs), false);
+}
+
+// A macro for implementing the helper functions needed to implement
+// ASSERT_?? and EXPECT_?? with integer or enum arguments. It is here
+// just to avoid copy-and-paste of similar code.
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)                                   \
+  AssertionResult CmpHelper##op_name(const char *expr1, const char *expr2,    \
+                                     BiggestInt val1, BiggestInt val2) {      \
+    if (val1 op val2) {                                                       \
+      return AssertionSuccess();                                              \
+    } else {                                                                  \
+      return AssertionFailure()                                               \
+             << "Expected: (" << expr1 << ") " #op " (" << expr2              \
+             << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
+             << " vs " << FormatForComparisonFailureMessage(val2, val1);      \
+    }                                                                         \
+  }
+
+// Implements the helper function for {ASSERT|EXPECT}_NE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(NE, !=)
+// Implements the helper function for {ASSERT|EXPECT}_LE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(LE, <=)
+// Implements the helper function for {ASSERT|EXPECT}_LT with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(LT, <)
+// Implements the helper function for {ASSERT|EXPECT}_GE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(GE, >=)
+// Implements the helper function for {ASSERT|EXPECT}_GT with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(GT, >)
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+AssertionResult CmpHelperSTREQ(const char *lhs_expression,
+                               const char *rhs_expression, const char *lhs,
+                               const char *rhs) {
+  if (String::CStringEquals(lhs, rhs)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), false);
+}
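[Editorial aside.] For readability, this is approximately what one instantiation of the macro above expands to; GTEST_IMPL_CMP_HELPER_(LE, <=) generates:

AssertionResult CmpHelperLE(const char *expr1, const char *expr2,
                            BiggestInt val1, BiggestInt val2) {
  if (val1 <= val2) {
    return AssertionSuccess();
  } else {
    return AssertionFailure()
           << "Expected: (" << expr1 << ") <= (" << expr2
           << "), actual: " << FormatForComparisonFailureMessage(val1, val2)
           << " vs " << FormatForComparisonFailureMessage(val2, val1);
  }
}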
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+AssertionResult CmpHelperSTRCASEEQ(const char *lhs_expression,
+                                   const char *rhs_expression, const char *lhs,
+                                   const char *rhs) {
+  if (String::CaseInsensitiveCStringEquals(lhs, rhs)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), true);
+}
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+AssertionResult CmpHelperSTRNE(const char *s1_expression,
+                               const char *s2_expression, const char *s1,
+                               const char *s2) {
+  if (!String::CStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  } else {
+    return AssertionFailure()
+           << "Expected: (" << s1_expression << ") != (" << s2_expression
+           << "), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
+  }
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+AssertionResult CmpHelperSTRCASENE(const char *s1_expression,
+                                   const char *s2_expression, const char *s1,
+                                   const char *s2) {
+  if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  } else {
+    return AssertionFailure()
+           << "Expected: (" << s1_expression << ") != (" << s2_expression
+           << ") (ignoring case), actual: \"" << s1 << "\" vs \"" << s2
+           << "\"";
+  }
+}
+
+}  // namespace internal
+
+namespace {
+
+// Helper functions for implementing IsSubString() and IsNotSubstring().
+
+// This group of overloaded functions return true if and only if needle
+// is a substring of haystack. NULL is considered a substring of
+// itself only.
+
+bool IsSubstringPred(const char *needle, const char *haystack) {
+  if (needle == nullptr || haystack == nullptr) return needle == haystack;
+
+  return strstr(haystack, needle) != nullptr;
+}
+
+bool IsSubstringPred(const wchar_t *needle, const wchar_t *haystack) {
+  if (needle == nullptr || haystack == nullptr) return needle == haystack;
+
+  return wcsstr(haystack, needle) != nullptr;
+}
+
+// StringType here can be either ::std::string or ::std::wstring.
+template <typename StringType>
+bool IsSubstringPred(const StringType &needle, const StringType &haystack) {
+  return haystack.find(needle) != StringType::npos;
+}
+
+// This function implements either IsSubstring() or IsNotSubstring(),
+// depending on the value of the expected_to_be_substring parameter.
+// StringType here can be const char*, const wchar_t*, ::std::string,
+// or ::std::wstring.
+template <typename StringType>
+AssertionResult IsSubstringImpl(bool expected_to_be_substring,
+                                const char *needle_expr,
+                                const char *haystack_expr,
+                                const StringType &needle,
+                                const StringType &haystack) {
+  if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
+    return AssertionSuccess();
+
+  const bool is_wide_string = sizeof(needle[0]) > 1;
+  const char *const begin_string_quote = is_wide_string ? "L\"" : "\"";
+  return AssertionFailure()
+         << "Value of: " << needle_expr << "\n"
+         << "  Actual: " << begin_string_quote << needle << "\"\n"
+         << "Expected: " << (expected_to_be_substring ? "" : "not ")
+         << "a substring of " << haystack_expr << "\n"
+         << "Which is: " << begin_string_quote << haystack << "\"";
+}
+
+}  // namespace
+
+// IsSubstring() and IsNotSubstring() check whether needle is a
+// substring of haystack (NULL is considered a substring of itself
+// only), and return an appropriate error message when they fail.
+
+AssertionResult IsSubstring(const char *needle_expr, const char *haystack_expr,
+                            const char *needle, const char *haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(const char *needle_expr, const char *haystack_expr,
+                            const wchar_t *needle, const wchar_t *haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(const char *needle_expr,
+                               const char *haystack_expr, const char *needle,
+                               const char *haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(const char *needle_expr,
+                               const char *haystack_expr, const wchar_t *needle,
+                               const wchar_t *haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(const char *needle_expr, const char *haystack_expr,
+                            const ::std::string &needle,
+                            const ::std::string &haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(const char *needle_expr,
+                               const char *haystack_expr,
+                               const ::std::string &needle,
+                               const ::std::string &haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+#if GTEST_HAS_STD_WSTRING
+AssertionResult IsSubstring(const char *needle_expr, const char *haystack_expr,
+                            const ::std::wstring &needle,
+                            const ::std::wstring &haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(const char *needle_expr,
+                               const char *haystack_expr,
+                               const ::std::wstring &needle,
+                               const ::std::wstring &haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+
+namespace {
+
+// Helper function for IsHRESULT{SuccessFailure} predicates
+AssertionResult HRESULTFailureHelper(const char *expr, const char *expected,
+                                     long hr) {  // NOLINT
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
+
+  // Windows CE doesn't support FormatMessage.
+  const char error_text[] = "";
+
+#else
+
+  // Looks up the human-readable system message for the HRESULT code
+  // and since we're not passing any params to FormatMessage, we don't
+  // want inserts expanded.
+  const DWORD kFlags =
+      FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS;
+  const DWORD kBufSize = 4096;
+  // Gets the system's human readable message string for this HRESULT.
+  char error_text[kBufSize] = { '\0' };
+  DWORD message_length = ::FormatMessageA(kFlags,
+                                          0,  // no source, we're asking system
+                                          static_cast<DWORD>(hr),  // the error
+                                          0,  // no line width restrictions
+                                          error_text,  // output buffer
+                                          kBufSize,    // buf size
+                                          nullptr);  // no arguments for inserts
+  // Trims trailing white space (FormatMessage leaves a trailing CR-LF)
+  for (; message_length && IsSpace(error_text[message_length - 1]);
+       --message_length) {
+    error_text[message_length - 1] = '\0';
+  }
+
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  const std::string error_hex("0x" + String::FormatHexInt(hr));
+  return ::testing::AssertionFailure()
+         << "Expected: " << expr << " " << expected << ".\n"
+         << "  Actual: " << error_hex << " " << error_text << "\n";
+}
+
+}  // namespace
+
+AssertionResult IsHRESULTSuccess(const char *expr, long hr) {  // NOLINT
+  if (SUCCEEDED(hr)) {
+    return AssertionSuccess();
+  }
+  return HRESULTFailureHelper(expr, "succeeds", hr);
+}
+
+AssertionResult IsHRESULTFailure(const char *expr, long hr) {  // NOLINT
+  if (FAILED(hr)) {
+    return AssertionSuccess();
+  }
+  return HRESULTFailureHelper(expr, "fails", hr);
+}
+
+#endif  // GTEST_OS_WINDOWS
+
+// Utility functions for encoding Unicode text (wide strings) in
+// UTF-8.
+
+// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8
+// like this:
+//
+// Code-point length   Encoding
+//   0 -  7 bits       0xxxxxxx
+//   8 - 11 bits       110xxxxx 10xxxxxx
+//  12 - 16 bits       1110xxxx 10xxxxxx 10xxxxxx
+//  17 - 21 bits       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+// The maximum code-point a one-byte UTF-8 sequence can represent.
+constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) << 7) - 1;
+
+// The maximum code-point a two-byte UTF-8 sequence can represent.
+constexpr uint32_t kMaxCodePoint2 = (static_cast<uint32_t>(1) << (5 + 6)) - 1;
+
+// The maximum code-point a three-byte UTF-8 sequence can represent.
+constexpr uint32_t kMaxCodePoint3 =
+    (static_cast<uint32_t>(1) << (4 + 2 * 6)) - 1;
+
+// The maximum code-point a four-byte UTF-8 sequence can represent.
+constexpr uint32_t kMaxCodePoint4 =
+    (static_cast<uint32_t>(1) << (3 + 3 * 6)) - 1;
+
+// Chops off the n lowest bits from a bit pattern. Returns the n
+// lowest bits. As a side effect, the original bit pattern will be
+// shifted to the right by n bits.
+inline uint32_t ChopLowBits(uint32_t *bits, int n) {
+  const uint32_t low_bits = *bits & ((static_cast<uint32_t>(1) << n) - 1);
+  *bits >>= n;
+  return low_bits;
+}
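[Editorial aside.] A worked example of the bit manipulation used below: U+00E9 ('é') needs 8 bits, so it takes the two-byte form.

uint32_t code_point = 0xE9;  // 1110 1001 in binary
const uint32_t low6 = ChopLowBits(&code_point, 6);  // low6 = 0x29, rest = 0x03
const unsigned char byte1 = static_cast<unsigned char>(0x80 | low6);       // 0xA9 (10xxxxxx)
const unsigned char byte0 = static_cast<unsigned char>(0xC0 | code_point); // 0xC3 (110xxxxx)
// Emitted leading byte first as 0xC3 0xA9 -- the UTF-8 encoding of U+00E9.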
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type uint32_t because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+std::string CodePointToUtf8(uint32_t code_point) {
+  if (code_point > kMaxCodePoint4) {
+    return "(Invalid Unicode 0x" + String::FormatHexUInt32(code_point) + ")";
+  }
+
+  char str[5];  // Big enough for the largest valid code point.
+  if (code_point <= kMaxCodePoint1) {
+    str[1] = '\0';
+    str[0] = static_cast<char>(code_point);  // 0xxxxxxx
+  } else if (code_point <= kMaxCodePoint2) {
+    str[2] = '\0';
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xC0 | code_point);  // 110xxxxx
+  } else if (code_point <= kMaxCodePoint3) {
+    str[3] = '\0';
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xE0 | code_point);  // 1110xxxx
+  } else {  // code_point <= kMaxCodePoint4
+    str[4] = '\0';
+    str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xF0 | code_point);  // 11110xxx
+  }
+  return str;
+}
+
+// The following two functions only make sense if the system
+// uses UTF-16 for wide string encoding. All supported systems
+// with 16 bit wchar_t (Windows, Cygwin) do use UTF-16.
+
+// Determines if the arguments constitute UTF-16 surrogate pair
+// and thus should be combined into a single Unicode code point
+// using CreateCodePointFromUtf16SurrogatePair.
+inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
+  return sizeof(wchar_t) == 2 && (first & 0xFC00) == 0xD800 &&
+         (second & 0xFC00) == 0xDC00;
+}
+
+// Creates a Unicode code point from UTF16 surrogate pair.
+inline uint32_t CreateCodePointFromUtf16SurrogatePair(wchar_t first,
+                                                      wchar_t second) {
+  const auto first_u = static_cast<uint32_t>(first);
+  const auto second_u = static_cast<uint32_t>(second);
+  const uint32_t mask = (1 << 10) - 1;
+  return (sizeof(wchar_t) == 2)
+             ? (((first_u & mask) << 10) | (second_u & mask)) + 0x10000
+             :
+             // This function should not be called when the condition is
+             // false, but we provide a sensible default in case it is.
+             first_u;
+}
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from the Basic
+// Multilingual Plane.
+std::string WideStringToUtf8(const wchar_t *str, int num_chars) {
+  if (num_chars == -1) num_chars = static_cast<int>(wcslen(str));
+
+  ::std::stringstream stream;
+  for (int i = 0; i < num_chars; ++i) {
+    uint32_t unicode_code_point;
+
+    if (str[i] == L'\0') {
+      break;
+    } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
+      unicode_code_point =
+          CreateCodePointFromUtf16SurrogatePair(str[i], str[i + 1]);
+      i++;
+    } else {
+      unicode_code_point = static_cast<uint32_t>(str[i]);
+    }
+
+    stream << CodePointToUtf8(unicode_code_point);
+  }
+  return StringStreamToString(&stream);
+}
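[Editorial aside.] The surrogate-pair arithmetic above, worked for U+1F600 (the UTF-16 pair 0xD83D, 0xDE00):

const uint32_t mask = (1 << 10) - 1;           // 0x3FF
const uint32_t high = 0xD83D & mask;           // 0x03D (low 10 bits of lead)
const uint32_t low = 0xDE00 & mask;            // 0x200 (low 10 bits of trail)
const uint32_t cp = ((high << 10) | low) + 0x10000;  // 0xF600 + 0x10000
// cp == 0x1F600, i.e. U+1F600.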
+
+// Converts a wide C string to an std::string using the UTF-8 encoding.
+// NULL will be converted to "(null)".
+std::string String::ShowWideCString(const wchar_t *wide_c_str) {
+  if (wide_c_str == nullptr) return "(null)";
+
+  return internal::WideStringToUtf8(wide_c_str, -1);
+}
+
+// Compares two wide C strings. Returns true if and only if they have the
+// same content.
+//
+// Unlike wcscmp(), this function can handle NULL argument(s). A NULL
+// C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::WideCStringEquals(const wchar_t *lhs, const wchar_t *rhs) {
+  if (lhs == nullptr) return rhs == nullptr;
+
+  if (rhs == nullptr) return false;
+
+  return wcscmp(lhs, rhs) == 0;
+}
+
+// Helper function for *_STREQ on wide strings.
+AssertionResult CmpHelperSTREQ(const char *lhs_expression,
+                               const char *rhs_expression, const wchar_t *lhs,
+                               const wchar_t *rhs) {
+  if (String::WideCStringEquals(lhs, rhs)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), false);
+}
+
+// Helper function for *_STRNE on wide strings.
+AssertionResult CmpHelperSTRNE(const char *s1_expression,
+                               const char *s2_expression, const wchar_t *s1,
+                               const wchar_t *s2) {
+  if (!String::WideCStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  }
+
+  return AssertionFailure()
+         << "Expected: (" << s1_expression << ") != (" << s2_expression
+         << "), actual: " << PrintToString(s1) << " vs " << PrintToString(s2);
+}
+
+// Compares two C strings, ignoring case. Returns true if and only if they have
+// the same content.
+//
+// Unlike strcasecmp(), this function can handle NULL argument(s). A
+// NULL C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::CaseInsensitiveCStringEquals(const char *lhs, const char *rhs) {
+  if (lhs == nullptr) return rhs == nullptr;
+  if (rhs == nullptr) return false;
+  return posix::StrCaseCmp(lhs, rhs) == 0;
+}
+
+// Compares two wide C strings, ignoring case. Returns true if and only if they
+// have the same content.
+//
+// Unlike wcscasecmp(), this function can handle NULL argument(s).
+// A NULL C string is considered different to any non-NULL wide C string,
+// including the empty string.
+// NB: The implementations on different platforms slightly differ.
+// On Windows, this method uses _wcsicmp which compares according to LC_CTYPE
+// environment variable. On GNU platform this method uses wcscasecmp
+// which compares according to LC_CTYPE category of the current locale.
+// On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
+// current locale.
+bool String::CaseInsensitiveWideCStringEquals(const wchar_t *lhs,
+                                              const wchar_t *rhs) {
+  if (lhs == nullptr) return rhs == nullptr;
+
+  if (rhs == nullptr) return false;
+
+#if GTEST_OS_WINDOWS
+  return _wcsicmp(lhs, rhs) == 0;
+#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID
+  return wcscasecmp(lhs, rhs) == 0;
+#else
+  // Android, Mac OS X and Cygwin don't define wcscasecmp.
+  // Other unknown OSes may not define it either.
+  wint_t left, right;
+  do {
+    left = towlower(static_cast<wint_t>(*lhs++));
+    right = towlower(static_cast<wint_t>(*rhs++));
+  } while (left && left == right);
+  return left == right;
+#endif  // OS selector
+}
+
+// Returns true if and only if str ends with the given suffix, ignoring case.
+// Any string is considered to end with an empty suffix.
+bool String::EndsWithCaseInsensitive(const std::string &str,
+                                     const std::string &suffix) {
+  const size_t str_len = str.length();
+  const size_t suffix_len = suffix.length();
+  return (str_len >= suffix_len) &&
+         CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len,
+                                      suffix.c_str());
+}
+
+// Formats an int value as "%02d".
+std::string String::FormatIntWidth2(int value) {
+  std::stringstream ss;
+  ss << std::setfill('0') << std::setw(2) << value;
+  return ss.str();
+}
+
+// Formats an int value as "%X".
+std::string String::FormatHexUInt32(uint32_t value) {
+  std::stringstream ss;
+  ss << std::hex << std::uppercase << value;
+  return ss.str();
+}
+
+// Formats an int value as "%X".
+std::string String::FormatHexInt(int value) {
+  return FormatHexUInt32(static_cast<uint32_t>(value));
+}
+
+// Formats a byte as "%02X".
+std::string String::FormatByte(unsigned char value) {
+  std::stringstream ss;
+  ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase
+     << static_cast<int>(value);
+  return ss.str();
+}
+
+// Converts the buffer in a stringstream to an std::string, converting NUL
+// bytes to "\\0" along the way.
+std::string StringStreamToString(::std::stringstream *ss) {
+  const ::std::string &str = ss->str();
+  const char *const start = str.c_str();
+  const char *const end = start + str.length();
+
+  std::string result;
+  result.reserve(static_cast<size_t>(2 * (end - start)));
+  for (const char *ch = start; ch != end; ++ch) {
+    if (*ch == '\0') {
+      result += "\\0";  // Replaces NUL with "\\0";
+    } else {
+      result += *ch;
+    }
+  }
+
+  return result;
+}
+
+// Appends the user-supplied message to the Google-Test-generated message.
+std::string AppendUserMessage(const std::string &gtest_msg,
+                              const Message &user_msg) {
+  // Appends the user message if it's non-empty.
+  const std::string user_msg_string = user_msg.GetString();
+  if (user_msg_string.empty()) {
+    return gtest_msg;
+  }
+
+  return gtest_msg + "\n" + user_msg_string;
+}
+
+}  // namespace internal
+
+// class TestResult
+
+// Creates an empty TestResult.
+TestResult::TestResult()
+    : death_test_count_(0), start_timestamp_(0), elapsed_time_(0) {}
+
+// D'tor.
+TestResult::~TestResult() {}
+
+// Returns the i-th test part result among all the results. i can
+// range from 0 to total_part_count() - 1. If i is not in that range,
+// aborts the program.
+const TestPartResult &TestResult::GetTestPartResult(int i) const {
+  if (i < 0 || i >= total_part_count()) internal::posix::Abort();
+  return test_part_results_.at(static_cast<size_t>(i));
+}
+
+// Returns the i-th test property. i can range from 0 to
+// test_property_count() - 1. If i is not in that range, aborts the
+// program.
+const TestProperty &TestResult::GetTestProperty(int i) const {
+  if (i < 0 || i >= test_property_count()) internal::posix::Abort();
+  return test_properties_.at(static_cast<size_t>(i));
+}
+
+// Clears the test part results.
+void TestResult::ClearTestPartResults() { test_part_results_.clear(); }
+
+// Adds a test part result to the list.
+void TestResult::AddTestPartResult(const TestPartResult &test_part_result) {
+  test_part_results_.push_back(test_part_result);
+}
+
+// Adds a test property to the list. If a property with the same key as the
+// supplied property is already represented, the value of this test_property
+// replaces the old value for that key.
+void TestResult::RecordProperty(const std::string &xml_element,
+                                const TestProperty &test_property) {
+  if (!ValidateTestProperty(xml_element, test_property)) {
+    return;
+  }
+  internal::MutexLock lock(&test_properites_mutex_);
+  const std::vector<TestProperty>::iterator property_with_matching_key =
+      std::find_if(test_properties_.begin(), test_properties_.end(),
+                   internal::TestPropertyKeyIs(test_property.key()));
+  if (property_with_matching_key == test_properties_.end()) {
+    test_properties_.push_back(test_property);
+    return;
+  }
+  property_with_matching_key->SetValue(test_property.value());
+}
+
+// The list of reserved attributes used in the <testsuites> element of XML
+// output.
+static const char *const kReservedTestSuitesAttributes[] = {
+  "disabled", "errors", "failures", "name",
+  "random_seed", "tests", "time", "timestamp"
+};
+
+// The list of reserved attributes used in the <testsuite> element of XML
+// output.
+static const char *const kReservedTestSuiteAttributes[] = {
+  "disabled", "errors", "failures", "name", "tests", "time", "timestamp"
+};
+
+// The list of reserved attributes used in the <testcase> element of XML
+// output.
+static const char *const kReservedTestCaseAttributes[] = {
+  "classname", "name", "status", "time",
+  "type_param", "value_param", "file", "line"
+};
+
+// Use a slightly different set for allowed output to ensure existing tests can
+// still RecordProperty("result") or RecordProperty("timestamp").
+static const char *const kReservedOutputTestCaseAttributes[] = {
+  "classname", "name", "status", "time", "type_param",
+  "value_param", "file", "line", "result", "timestamp"
+};
+
+template <size_t kSize>
+std::vector<std::string> ArrayAsVector(const char *const (&array)[kSize]) {
+  return std::vector<std::string>(array, array + kSize);
+}
+
+static std::vector<std::string> GetReservedAttributesForElement(
+    const std::string &xml_element) {
+  if (xml_element == "testsuites") {
+    return ArrayAsVector(kReservedTestSuitesAttributes);
+  } else if (xml_element == "testsuite") {
+    return ArrayAsVector(kReservedTestSuiteAttributes);
+  } else if (xml_element == "testcase") {
+    return ArrayAsVector(kReservedTestCaseAttributes);
+  } else {
+    GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
+  }
+  // This code is unreachable but some compilers may not realize that.
+  return std::vector<std::string>();
+}
+
+// TODO(jdesprez): Merge the two getReserved attributes once skip is improved
+static std::vector<std::string> GetReservedOutputAttributesForElement(
+    const std::string &xml_element) {
+  if (xml_element == "testsuites") {
+    return ArrayAsVector(kReservedTestSuitesAttributes);
+  } else if (xml_element == "testsuite") {
+    return ArrayAsVector(kReservedTestSuiteAttributes);
+  } else if (xml_element == "testcase") {
+    return ArrayAsVector(kReservedOutputTestCaseAttributes);
+  } else {
+    GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
+  }
+  // This code is unreachable but some compilers may not realize that.
+  return std::vector<std::string>();
+}
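[Editorial aside.] The practical effect of the tables above, sketched with a hypothetical test: reserved keys are rejected with an ADD_FAILURE, while arbitrary keys pass through to the XML/JSON output.

TEST(RecordPropertySketch, ReservedVsCustomKeys) {  // hypothetical test
  RecordProperty("bug_id", 12345);  // OK: "bug_id" is not reserved
  // RecordProperty("time", 1);     // would fail validation: "time" is
  //                                // reserved for the <testcase> element
}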
+
+static std::string FormatWordList(const std::vector<std::string> &words) {
+  Message word_list;
+  for (size_t i = 0; i < words.size(); ++i) {
+    if (i > 0 && words.size() > 2) {
+      word_list << ", ";
+    }
+    if (i == words.size() - 1) {
+      word_list << "and ";
+    }
+    word_list << "'" << words[i] << "'";
+  }
+  return word_list.GetString();
+}
+
+static bool ValidateTestPropertyName(
+    const std::string &property_name,
+    const std::vector<std::string> &reserved_names) {
+  if (std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
+      reserved_names.end()) {
+    ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
+                  << " (" << FormatWordList(reserved_names)
+                  << " are reserved by " << GTEST_NAME_ << ")";
+    return false;
+  }
+  return true;
+}
+
+// Adds a failure if the key is a reserved attribute of the element named
+// xml_element. Returns true if the property is valid.
+bool TestResult::ValidateTestProperty(const std::string &xml_element,
+                                      const TestProperty &test_property) {
+  return ValidateTestPropertyName(test_property.key(),
+                                  GetReservedAttributesForElement(xml_element));
+}
+
+// Clears the object.
+void TestResult::Clear() {
+  test_part_results_.clear();
+  test_properties_.clear();
+  death_test_count_ = 0;
+  elapsed_time_ = 0;
+}
+
+// Returns true if and only if the test part was skipped.
+static bool TestPartSkipped(const TestPartResult &result) {
+  return result.skipped();
+}
+
+// Returns true if and only if the test was skipped.
+bool TestResult::Skipped() const {
+  return !Failed() && CountIf(test_part_results_, TestPartSkipped) > 0;
+}
+
+// Returns true if and only if the test failed.
+bool TestResult::Failed() const {
+  for (int i = 0; i < total_part_count(); ++i) {
+    if (GetTestPartResult(i).failed()) return true;
+  }
+  return false;
+}
+
+// Returns true if and only if the test part fatally failed.
+static bool TestPartFatallyFailed(const TestPartResult &result) {
+  return result.fatally_failed();
+}
+
+// Returns true if and only if the test fatally failed.
+bool TestResult::HasFatalFailure() const {
+  return CountIf(test_part_results_, TestPartFatallyFailed) > 0;
+}
+
+// Returns true if and only if the test part non-fatally failed.
+static bool TestPartNonfatallyFailed(const TestPartResult &result) {
+  return result.nonfatally_failed();
+}
+
+// Returns true if and only if the test has a non-fatal failure.
+bool TestResult::HasNonfatalFailure() const {
+  return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0;
+}
+
+// Gets the number of all test parts. This is the sum of the number
+// of successful test parts and the number of failed test parts.
+int TestResult::total_part_count() const {
+  return static_cast<int>(test_part_results_.size());
+}
+
+// Returns the number of the test properties.
+int TestResult::test_property_count() const {
+  return static_cast<int>(test_properties_.size());
+}
+
+// class Test
+
+// Creates a Test object.
+
+// The c'tor saves the states of all flags.
+Test::Test() : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {}
+
+// The d'tor restores the states of all flags. The actual work is
+// done by the d'tor of the gtest_flag_saver_ field, and thus not
+// visible here.
+Test::~Test() {}
+
+// Sets up the test fixture.
+//
+// A sub-class may override this.
+void Test::SetUp() {}
+
+// Tears down the test fixture.
+//
+// A sub-class may override this.
+void Test::TearDown() {}
+
+// Allows user supplied key value pairs to be recorded for later output.
+void Test::RecordProperty(const std::string &key, const std::string &value) {
+  UnitTest::GetInstance()->RecordProperty(key, value);
+}
+
+// Allows user supplied key value pairs to be recorded for later output.
+void Test::RecordProperty(const std::string &key, int value) {
+  Message value_message;
+  value_message << value;
+  RecordProperty(key, value_message.GetString().c_str());
+}
+
+namespace internal {
+
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+                                    const std::string &message) {
+  // This function is a friend of UnitTest and as such has access to
+  // AddTestPartResult.
+  UnitTest::GetInstance()->AddTestPartResult(
+      result_type,
+      nullptr,  // No info about the source file where the exception occurred.
+      -1,       // We have no info on which line caused the exception.
+      message,
+      "");  // No stack trace, either.
+}
+
+}  // namespace internal
+
+// Google Test requires all tests in the same test suite to use the same test
+// fixture class. This function checks if the current test has the
+// same fixture class as the first test in the current test suite. If
+// yes, it returns true; otherwise it generates a Google Test failure and
+// returns false.
+bool Test::HasSameFixtureClass() {
+  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
+  const TestSuite *const test_suite = impl->current_test_suite();
+
+  // Info about the first test in the current test suite.
+  const TestInfo *const first_test_info = test_suite->test_info_list()[0];
+  const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_;
+  const char *const first_test_name = first_test_info->name();
+
+  // Info about the current test.
+  const TestInfo *const this_test_info = impl->current_test_info();
+  const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_;
+  const char *const this_test_name = this_test_info->name();
+
+  if (this_fixture_id != first_fixture_id) {
+    // Is the first test defined using TEST?
+    const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId();
+    // Is this test defined using TEST?
+    const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId();
+
+    if (first_is_TEST || this_is_TEST) {
+      // Both TEST and TEST_F appear in same test suite, which is incorrect.
+      // Tell the user how to fix this.
+
+      // Gets the name of the TEST and the name of the TEST_F. Note
+      // that first_is_TEST and this_is_TEST cannot both be true, as
+      // the fixture IDs are different for the two tests.
+      const char *const TEST_name =
+          first_is_TEST ? first_test_name : this_test_name;
+      const char *const TEST_F_name =
+          first_is_TEST ? this_test_name : first_test_name;
+
+      ADD_FAILURE()
+          << "All tests in the same test suite must use the same test fixture\n"
+          << "class, so mixing TEST_F and TEST in the same test suite is\n"
+          << "illegal. In test suite " << this_test_info->test_suite_name()
+          << ",\n"
+          << "test " << TEST_F_name << " is defined using TEST_F but\n"
+          << "test " << TEST_name << " is defined using TEST. You probably\n"
+          << "want to change the TEST to TEST_F or move it to another test\n"
+          << "case.";
+    } else {
+      // Two fixture classes with the same name appear in two different
+      // namespaces, which is not allowed. Tell the user how to fix this.
+      ADD_FAILURE()
+          << "All tests in the same test suite must use the same test fixture\n"
+          << "class. However, in test suite "
+          << this_test_info->test_suite_name() << ",\n"
+          << "you defined test " << first_test_name << " and test "
+          << this_test_name << "\n"
+          << "using two different test fixture classes. This can happen if\n"
+          << "the two classes are from different namespaces or translation\n"
+          << "units and have the same name. You should probably rename one\n"
+          << "of the classes to put the tests into different test suites.";
+    }
+    return false;
+  }
+
+  return true;
+}
+
+#if GTEST_HAS_SEH
+
+// Adds an "exception thrown" fatal failure to the current test. This
+// function returns its result via an output parameter pointer because VC++
+// prohibits creation of objects with destructors on stack in functions
+// using __try (see error C2712).
+static std::string *FormatSehExceptionMessage(DWORD exception_code,
+                                              const char *location) {
+  Message message;
+  message << "SEH exception with code 0x" << std::setbase(16) << exception_code
+          << std::setbase(10) << " thrown in " << location << ".";
+
+  return new std::string(message.GetString());
+}
+
+#endif  // GTEST_HAS_SEH
+
+namespace internal {
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Adds an "exception thrown" fatal failure to the current test.
+static std::string FormatCxxExceptionMessage(const char *description,
+                                             const char *location) {
+  Message message;
+  if (description != nullptr) {
+    message << "C++ exception with description \"" << description << "\"";
+  } else {
+    message << "Unknown C++ exception";
+  }
+  message << " thrown in " << location << ".";
+
+  return message.GetString();
+}
+
+static std::string PrintTestPartResultToString(
+    const TestPartResult &test_part_result);
+
+GoogleTestFailureException::GoogleTestFailureException(
+    const TestPartResult &failure)
+    : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {}
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// We put these helper functions in the internal namespace as IBM's xlC
+// compiler rejects the code if they were declared static.
+
+// Runs the given method and handles SEH exceptions it throws, when
+// SEH is supported; returns the 0-value for type Result in case of an
+// SEH exception. (Microsoft compilers cannot handle SEH and C++
+// exceptions in the same function. Therefore, we provide a separate
+// wrapper function for handling SEH exceptions.)
+template <class T, typename Result>
+Result HandleSehExceptionsInMethodIfSupported(T *object, Result (T::*method)(),
+                                              const char *location) {
+#if GTEST_HAS_SEH
+  __try {
+    return (object->*method)();
+  } __except (internal::UnitTestOptions::GTestShouldProcessSEH(  // NOLINT
+      GetExceptionCode())) {
+    // We create the exception message on the heap because VC++ prohibits
+    // creation of objects with destructors on stack in functions using __try
+    // (see error C2712).
+    std::string *exception_message =
+        FormatSehExceptionMessage(GetExceptionCode(), location);
+    internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
+                                             *exception_message);
+    delete exception_message;
+    return static_cast<Result>(0);
+  }
+#else
+  (void)location;
+  return (object->*method)();
+#endif  // GTEST_HAS_SEH
+}
+
+// Runs the given method and catches and reports C++ and/or SEH-style
+// exceptions, if they are supported; returns the 0-value for type
+// Result in case of an SEH exception.
+template <class T, typename Result>
+Result HandleExceptionsInMethodIfSupported(T *object, Result (T::*method)(),
+                                           const char *location) {
+  // NOTE: The user code can affect the way in which Google Test handles
+  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
+  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
+  // after the exception is caught and either report or re-throw the
+  // exception based on the flag's value:
+  //
+  // try {
+  //   // Perform the test method.
+  // } catch (...) {
+  //   if (GTEST_FLAG(catch_exceptions))
+  //     // Report the exception as failure.
+  //   else
+  //     throw;  // Re-throws the original exception.
+  // }
+  //
+  // However, the purpose of this flag is to allow the program to drop into
+  // the debugger when the exception is thrown. On most platforms, once the
+  // control enters the catch block, the exception origin information is
+  // lost and the debugger will stop the program at the point of the
+  // re-throw in this function -- instead of at the point of the original
+  // throw statement in the code under test. For this reason, we perform
+  // the check early, sacrificing the ability to affect Google Test's
+  // exception handling in the method where the exception is thrown.
+  if (internal::GetUnitTestImpl()->catch_exceptions()) {
+#if GTEST_HAS_EXCEPTIONS
+    try {
+      return HandleSehExceptionsInMethodIfSupported(object, method, location);
+    } catch (const AssertionException &) {  // NOLINT
+      // This failure was reported already.
+    } catch (const internal::GoogleTestFailureException &) {  // NOLINT
+      // This exception type can only be thrown by a failed Google
+      // Test assertion with the intention of letting another testing
+      // framework catch it. Therefore we just re-throw it.
+      throw;
+    } catch (const std::exception &e) {  // NOLINT
+      internal::ReportFailureInUnknownLocation(
+          TestPartResult::kFatalFailure,
+          FormatCxxExceptionMessage(e.what(), location));
+    } catch (...) {  // NOLINT
+      internal::ReportFailureInUnknownLocation(
+          TestPartResult::kFatalFailure,
+          FormatCxxExceptionMessage(nullptr, location));
+    }
+    return static_cast<Result>(0);
+#else
+    return HandleSehExceptionsInMethodIfSupported(object, method, location);
+#endif  // GTEST_HAS_EXCEPTIONS
+  } else {
+    return (object->*method)();
+  }
+}
+
+}  // namespace internal
+
+// Runs the test and updates the test result.
+void Test::Run() {
+  if (!HasSameFixtureClass()) return;
+
+  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
+  // We will run the test only if SetUp() was successful and didn't call
+  // GTEST_SKIP().
+  if (!HasFatalFailure() && !IsSkipped()) {
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(this, &Test::TestBody,
+                                                  "the test body");
+  }
+
+  // However, we want to clean up as much as possible. Hence we will
+  // always call TearDown(), even if SetUp() or the test body has
+  // failed.
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(this, &Test::TearDown,
+                                                "TearDown()");
+}
+
+// Returns true if and only if the current test has a fatal failure.
+bool Test::HasFatalFailure() {
+  return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure();
+}
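[Editorial aside.] The wrapper takes a pointer to any zero-argument member function, so the pattern generalizes beyond SetUp/TestBody/TearDown. A sketch under the assumption of a user-defined fixture (MyFixture and Warmup are hypothetical):

class MyFixture : public testing::Test {
 public:
  void Warmup() { /* may throw; would be reported as a fatal failure */ }
};

void RunWarmupGuarded(MyFixture *f) {
  // T = MyFixture and Result = void are deduced from the member pointer.
  testing::internal::HandleExceptionsInMethodIfSupported(
      f, &MyFixture::Warmup, "Warmup()");
}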
+
+// Returns true if and only if the current test has a non-fatal failure.
+bool Test::HasNonfatalFailure() {
+  return internal::GetUnitTestImpl()
+      ->current_test_result()
+      ->HasNonfatalFailure();
+}
+
+// Returns true if and only if the current test was skipped.
+bool Test::IsSkipped() {
+  return internal::GetUnitTestImpl()->current_test_result()->Skipped();
+}
+
+// class TestInfo
+
+// Constructs a TestInfo object. It assumes ownership of the test factory
+// object.
+TestInfo::TestInfo(const std::string &a_test_suite_name,
+                   const std::string &a_name, const char *a_type_param,
+                   const char *a_value_param,
+                   internal::CodeLocation a_code_location,
+                   internal::TypeId fixture_class_id,
+                   internal::TestFactoryBase *factory)
+    : test_suite_name_(a_test_suite_name), name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : nullptr),
+      value_param_(a_value_param ? new std::string(a_value_param) : nullptr),
+      location_(a_code_location), fixture_class_id_(fixture_class_id),
+      should_run_(false), is_disabled_(false), matches_filter_(false),
+      factory_(factory), result_() {}
+
+// Destructs a TestInfo object.
+TestInfo::~TestInfo() { delete factory_; }
+
+namespace internal {
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+//   test_suite_name:  name of the test suite
+//   name:             name of the test
+//   type_param:       the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param:      text representation of the test's value parameter,
+//                     or NULL if this is not a value-parameterized test.
+//   code_location:    code location where the test is defined
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test suite
+//   tear_down_tc:     pointer to the function that tears down the test suite
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+TestInfo *MakeAndRegisterTestInfo(
+    const char *test_suite_name, const char *name, const char *type_param,
+    const char *value_param, CodeLocation code_location,
+    TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc,
+    TearDownTestSuiteFunc tear_down_tc, TestFactoryBase *factory) {
+  TestInfo *const test_info =
+      new TestInfo(test_suite_name, name, type_param, value_param,
+                   code_location, fixture_class_id, factory);
+  GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info);
+  return test_info;
+}
+
+void ReportInvalidTestSuiteType(const char *test_suite_name,
+                                CodeLocation code_location) {
+  Message errors;
+  errors
+      << "Attempted redefinition of test suite " << test_suite_name << ".\n"
+      << "All tests in the same test suite must use the same test fixture\n"
+      << "class. However, in test suite " << test_suite_name << ", you tried\n"
+      << "to define a test using a fixture class different from the one\n"
+      << "used earlier. This can happen if the two fixture classes are\n"
+      << "from different namespaces and have the same name. You should\n"
+      << "probably rename one of the classes to put the tests into different\n"
+      << "test suites.";
+
+  GTEST_LOG_(ERROR) << FormatFileLocation(code_location.file.c_str(),
+                                          code_location.line)
+                    << " " << errors.GetString();
+}
+}  // namespace internal
+
+namespace {
+
+// A predicate that checks the test name of a TestInfo against a known
+// value.
+//
+// This is used for implementation of the TestSuite class only. We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestNameIs is copyable.
+class TestNameIs {
+ public:
+  // Constructor.
+  //
+  // TestNameIs has NO default constructor.
+  explicit TestNameIs(const char *name) : name_(name) {}
+
+  // Returns true if and only if the test name of test_info matches name_.
+  bool operator()(const TestInfo *test_info) const {
+    return test_info && test_info->name() == name_;
+  }
+
+ private:
+  std::string name_;
+};
+
+}  // namespace
+
+namespace internal {
+
+// This method expands all parameterized tests registered with macros TEST_P
+// and INSTANTIATE_TEST_SUITE_P into regular tests and registers those.
+// This will be done just once during the program runtime.
+void UnitTestImpl::RegisterParameterizedTests() {
+  if (!parameterized_tests_registered_) {
+    parameterized_test_registry_.RegisterTests();
+    type_parameterized_test_registry_.CheckForInstantiations();
+    parameterized_tests_registered_ = true;
+  }
+}
+
+}  // namespace internal
+
+// Creates the test object, runs it, records its result, and then
+// deletes it.
+void TestInfo::Run() {
+  if (!should_run_) return;
+
+  // Tells UnitTest where to store test result.
+  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_info(this);
+
+  TestEventListener *repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Notifies the unit test event listeners that a test is about to start.
+  repeater->OnTestStart(*this);
+
+  const TimeInMillis start = internal::GetTimeInMillis();
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+
+  // Creates the test object.
+  Test *const test = internal::HandleExceptionsInMethodIfSupported(
+      factory_, &internal::TestFactoryBase::CreateTest,
+      "the test fixture's constructor");
+
+  // Runs the test if the constructor didn't generate a fatal failure or invoke
+  // GTEST_SKIP().
+  // Note that the object will not be null
+  if (!Test::HasFatalFailure() && !Test::IsSkipped()) {
+    // This doesn't throw as all user code that can throw is wrapped into
+    // exception handling code.
+    test->Run();
+  }
+
+  if (test != nullptr) {
+    // Deletes the test object.
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(
+        test, &Test::DeleteSelf_, "the test fixture's destructor");
+  }
+
+  result_.set_start_timestamp(start);
+  result_.set_elapsed_time(internal::GetTimeInMillis() - start);
+
+  // Notifies the unit test event listener that a test has just finished.
+  repeater->OnTestEnd(*this);
+
+  // Tells UnitTest to stop associating assertion results to this
+  // test.
+  impl->set_current_test_info(nullptr);
+}
+
+// class TestSuite
+
+// Gets the number of successful tests in this test suite.
+int TestSuite::successful_test_count() const {
+  return CountIf(test_info_list_, TestPassed);
+}
+
+// Gets the number of successful tests in this test suite.
+int TestSuite::skipped_test_count() const {
+  return CountIf(test_info_list_, TestSkipped);
+}
+
+// Gets the number of failed tests in this test suite.
+int TestSuite::failed_test_count() const {
+  return CountIf(test_info_list_, TestFailed);
+}
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int TestSuite::reportable_disabled_test_count() const {
+  return CountIf(test_info_list_, TestReportableDisabled);
+}
+
+// Gets the number of disabled tests in this test suite.
+int TestSuite::disabled_test_count() const {
+  return CountIf(test_info_list_, TestDisabled);
+}
+
+// Gets the number of tests to be printed in the XML report.
+int TestSuite::reportable_test_count() const {
+  return CountIf(test_info_list_, TestReportable);
+}
+
+// Get the number of tests in this test suite that should run.
+int TestSuite::test_to_run_count() const {
+  return CountIf(test_info_list_, ShouldRunTest);
+}
+
+// Gets the number of all tests.
+int TestSuite::total_test_count() const {
+  return static_cast<int>(test_info_list_.size());
+}
+
+// Creates a TestSuite with the given name.
+//
+// Arguments:
+//
+//   name:         name of the test suite
+//   a_type_param: the name of the test suite's type parameter, or NULL if
+//                 this is not a typed or a type-parameterized test suite.
+//   set_up_tc:    pointer to the function that sets up the test suite
+//   tear_down_tc: pointer to the function that tears down the test suite
+TestSuite::TestSuite(const char *a_name, const char *a_type_param,
+                     internal::SetUpTestSuiteFunc set_up_tc,
+                     internal::TearDownTestSuiteFunc tear_down_tc)
+    : name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : nullptr),
+      set_up_tc_(set_up_tc), tear_down_tc_(tear_down_tc), should_run_(false),
+      start_timestamp_(0), elapsed_time_(0) {}
+
+// Destructor of TestSuite.
+TestSuite::~TestSuite() {
+  // Deletes every Test in the collection.
+  ForEach(test_info_list_, internal::Delete<TestInfo>);
+}
+
+// Returns the i-th test among all the tests. i can range from 0 to
+// total_test_count() - 1. If i is not in that range, returns NULL.
+const TestInfo *TestSuite::GetTestInfo(int i) const {
+  const int index = GetElementOr(test_indices_, i, -1);
+  return index < 0 ? nullptr : test_info_list_[static_cast<size_t>(index)];
+}
+
+// Returns the i-th test among all the tests. i can range from 0 to
+// total_test_count() - 1. If i is not in that range, returns NULL.
+TestInfo *TestSuite::GetMutableTestInfo(int i) {
+  const int index = GetElementOr(test_indices_, i, -1);
+  return index < 0 ? nullptr : test_info_list_[static_cast<size_t>(index)];
+}
+
+// Adds a test to this test suite. Will delete the test upon
+// destruction of the TestSuite object.
+void TestSuite::AddTestInfo(TestInfo *test_info) {
+  test_info_list_.push_back(test_info);
+  test_indices_.push_back(static_cast<int>(test_indices_.size()));
+}
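[Editorial aside.] The two containers above form an index indirection: test_info_list_ keeps registration order and owns the TestInfo objects, while test_indices_ is the layer that ShuffleTests()/UnshuffleTests() (below) permute, so iteration order can change without moving the owned pointers. With hypothetical values:

// After three AddTestInfo() calls: test_indices_ == {0, 1, 2}.
// After ShuffleTests(), it might be {2, 0, 1}, so GetTestInfo(0)
// returns test_info_list_[2]. UnshuffleTests() restores {0, 1, 2}.
std::vector<int> test_indices = {0, 1, 2};  // registration order
test_indices = {2, 0, 1};                   // one possible shuffle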
+void TestSuite::Run() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_suite(this);
+
+  TestEventListener *repeater =
+      UnitTest::GetInstance()->listeners().repeater();
+
+  // Call both legacy and the new API
+  repeater->OnTestSuiteStart(*this);
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  repeater->OnTestCaseStart(*this);
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()");
+
+  start_timestamp_ = internal::GetTimeInMillis();
+  for (int i = 0; i < total_test_count(); i++) {
+    GetMutableTestInfo(i)->Run();
+  }
+  elapsed_time_ = internal::GetTimeInMillis() - start_timestamp_;
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestSuite::RunTearDownTestSuite, "TearDownTestSuite()");
+
+  // Call both legacy and the new API
+  repeater->OnTestSuiteEnd(*this);
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  repeater->OnTestCaseEnd(*this);
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  impl->set_current_test_suite(nullptr);
+}
+
+// Clears the results of all tests in this test suite.
+void TestSuite::ClearResult() {
+  ad_hoc_test_result_.Clear();
+  ForEach(test_info_list_, TestInfo::ClearTestResult);
+}
+
+// Shuffles the tests in this test suite.
+void TestSuite::ShuffleTests(internal::Random *random) {
+  Shuffle(random, &test_indices_);
+}
+
+// Restores the test order to before the first shuffle.
+void TestSuite::UnshuffleTests() {
+  for (size_t i = 0; i < test_indices_.size(); i++) {
+    test_indices_[i] = static_cast<int>(i);
+  }
+}
+
+// Formats a countable noun. Depending on its quantity, either the
+// singular form or the plural form is used. e.g.
+//
+// FormatCountableNoun(1, "formula", "formulae") returns "1 formula".
+// FormatCountableNoun(5, "book", "books") returns "5 books".
+static std::string FormatCountableNoun(int count, const char *singular_form,
+                                       const char *plural_form) {
+  return internal::StreamableToString(count) + " " +
+         (count == 1 ? singular_form : plural_form);
+}
+
+// Formats the count of tests.
+static std::string FormatTestCount(int test_count) {
+  return FormatCountableNoun(test_count, "test", "tests");
+}
+
+// Formats the count of test suites.
+static std::string FormatTestSuiteCount(int test_suite_count) {
+  return FormatCountableNoun(test_suite_count, "test suite", "test suites");
+}
+
+// Converts a TestPartResult::Type enum to human-friendly string
+// representation. Both kNonFatalFailure and kFatalFailure are translated
+// to "Failure", as the user usually doesn't care about the difference
+// between the two when viewing the test result.
+static const char *TestPartResultTypeToString(TestPartResult::Type type) {
+  switch (type) {
+    case TestPartResult::kSkip: return "Skipped";
+    case TestPartResult::kSuccess: return "Success";
+
+    case TestPartResult::kNonFatalFailure:
+    case TestPartResult::kFatalFailure:
+#ifdef _MSC_VER
+      return "error: ";
+#else
+      return "Failure\n";
+#endif
+    default: return "Unknown result type";
+  }
+}
+
+namespace internal {
+
+// Prints a TestPartResult to an std::string.
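+// For a rough sense of the format (a sketch, not normative): a failed
+// assertion recorded at foo_test.cc:42 produces a string along the lines of
+// "foo_test.cc:42: Failure\nExpected equality of these values: ...", with
+// "error: " substituted for "Failure\n" on MSVC builds (see
+// TestPartResultTypeToString above; the file name and message here are
+// hypothetical).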
+static std::string PrintTestPartResultToString(
+    const TestPartResult &test_part_result) {
+  return (Message() << internal::FormatFileLocation(
+                           test_part_result.file_name(),
+                           test_part_result.line_number())
+                    << " "
+                    << TestPartResultTypeToString(test_part_result.type())
+                    << test_part_result.message())
+      .GetString();
+}
+
+// Prints a TestPartResult.
+static void PrintTestPartResult(const TestPartResult &test_part_result) {
+  const std::string &result = PrintTestPartResultToString(test_part_result);
+  printf("%s\n", result.c_str());
+  fflush(stdout);
+  // If the test program runs in Visual Studio or a debugger, the
+  // following statements add the test part result message to the Output
+  // window such that the user can double-click on it to jump to the
+  // corresponding source code location; otherwise they do nothing.
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  // We don't call OutputDebugString*() on Windows Mobile, as printing
+  // to stdout is done by OutputDebugString() there already - we don't
+  // want the same message printed twice.
+  ::OutputDebugStringA(result.c_str());
+  ::OutputDebugStringA("\n");
+#endif
+}
+
+// class PrettyUnitTestResultPrinter
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+    !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+
+// Returns the character attribute for the given color.
+static WORD GetColorAttribute(GTestColor color) {
+  switch (color) {
+    case COLOR_RED: return FOREGROUND_RED;
+    case COLOR_GREEN: return FOREGROUND_GREEN;
+    case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
+    default: return 0;
+  }
+}
+
+static int GetBitOffset(WORD color_mask) {
+  if (color_mask == 0) return 0;
+
+  int bitOffset = 0;
+  while ((color_mask & 1) == 0) {
+    color_mask >>= 1;
+    ++bitOffset;
+  }
+  return bitOffset;
+}
+
+static WORD GetNewColor(GTestColor color, WORD old_color_attrs) {
+  // Let's reuse the BG
+  static const WORD background_mask = BACKGROUND_BLUE | BACKGROUND_GREEN |
+                                      BACKGROUND_RED | BACKGROUND_INTENSITY;
+  static const WORD foreground_mask = FOREGROUND_BLUE | FOREGROUND_GREEN |
+                                      FOREGROUND_RED | FOREGROUND_INTENSITY;
+  const WORD existing_bg = old_color_attrs & background_mask;
+
+  WORD new_color =
+      GetColorAttribute(color) | existing_bg | FOREGROUND_INTENSITY;
+  static const int bg_bitOffset = GetBitOffset(background_mask);
+  static const int fg_bitOffset = GetBitOffset(foreground_mask);
+
+  if (((new_color & background_mask) >> bg_bitOffset) ==
+      ((new_color & foreground_mask) >> fg_bitOffset)) {
+    new_color ^= FOREGROUND_INTENSITY;  // invert intensity
+  }
+  return new_color;
+}
+
+#else
+
+// Returns the ANSI color code for the given color. COLOR_DEFAULT is
+// an invalid input.
+static const char *GetAnsiColorCode(GTestColor color) {
+  switch (color) {
+    case COLOR_RED: return "1";
+    case COLOR_GREEN: return "2";
+    case COLOR_YELLOW: return "3";
+    default: return nullptr;
+  }
+}
+
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+
+// Returns true if and only if Google Test should use colors in the output.
+bool ShouldUseColor(bool stdout_is_tty) {
+  const char *const gtest_color = GTEST_FLAG(color).c_str();
+
+  if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+    // On Windows the TERM variable is usually not set, but the
+    // console there does support colors.
+    return stdout_is_tty;
+#else
+    // On non-Windows platforms, we rely on the TERM variable.
+    const char *const term = posix::GetEnv("TERM");
+    const bool term_supports_color =
+        String::CStringEquals(term, "xterm") ||
+        String::CStringEquals(term, "xterm-color") ||
+        String::CStringEquals(term, "xterm-256color") ||
+        String::CStringEquals(term, "screen") ||
+        String::CStringEquals(term, "screen-256color") ||
+        String::CStringEquals(term, "tmux") ||
+        String::CStringEquals(term, "tmux-256color") ||
+        String::CStringEquals(term, "rxvt-unicode") ||
+        String::CStringEquals(term, "rxvt-unicode-256color") ||
+        String::CStringEquals(term, "linux") ||
+        String::CStringEquals(term, "cygwin");
+    return stdout_is_tty && term_supports_color;
+#endif  // GTEST_OS_WINDOWS
+  }
+
+  return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
+         String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
+         String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
+         String::CStringEquals(gtest_color, "1");
+  // We take "yes", "true", "t", and "1" as meaning "yes". If the
+  // value is neither one of these nor "auto", we treat it as "no" to
+  // be conservative.
+}
+
+// Helpers for printing colored strings to stdout. Note that on Windows, we
+// cannot simply emit special characters and have the terminal change colors.
+// This routine must actually emit the characters rather than return a string
+// that would be colored when printed, as can be done on Linux.
+void ColoredPrintf(GTestColor color, const char *fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \
+    GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM)
+  const bool use_color = AlwaysFalse();
+#else
+  static const bool in_color_mode =
+      ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
+  const bool use_color = in_color_mode && (color != COLOR_DEFAULT);
+#endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS
+
+  if (!use_color) {
+    vprintf(fmt, args);
+    va_end(args);
+    return;
+  }
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+    !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
+
+  // Gets the current text color.
+  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
+  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
+  const WORD old_color_attrs = buffer_info.wAttributes;
+  const WORD new_color = GetNewColor(color, old_color_attrs);
+
+  // We need to flush the stream buffers into the console before each
+  // SetConsoleTextAttribute call lest it affect the text that is already
+  // printed but has not yet reached the console.
+  fflush(stdout);
+  SetConsoleTextAttribute(stdout_handle, new_color);
+
+  vprintf(fmt, args);
+
+  fflush(stdout);
+  // Restores the text color.
+  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+#else
+  printf("\033[0;3%sm", GetAnsiColorCode(color));
+  vprintf(fmt, args);
+  printf("\033[m");  // Resets the terminal to default.
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  va_end(args);
+}
+
+// Text printed in Google Test's text output and --gtest_list_tests
+// output to label the type parameter and value parameter for a test.
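+// For example, a typed, value-parameterized test would be labeled roughly as
+// "FooTest.DoesBar, where TypeParam = int and GetParam() = 42" in the output
+// below (the suite name, test name, and values here are hypothetical).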
+static const char kTypeParamLabel[] = "TypeParam";
+static const char kValueParamLabel[] = "GetParam()";
+
+static void PrintFullTestCommentIfPresent(const TestInfo &test_info) {
+  const char *const type_param = test_info.type_param();
+  const char *const value_param = test_info.value_param();
+
+  if (type_param != nullptr || value_param != nullptr) {
+    printf(", where ");
+    if (type_param != nullptr) {
+      printf("%s = %s", kTypeParamLabel, type_param);
+      if (value_param != nullptr) printf(" and ");
+    }
+    if (value_param != nullptr) {
+      printf("%s = %s", kValueParamLabel, value_param);
+    }
+  }
+}
+
+// This class implements the TestEventListener interface.
+//
+// Class PrettyUnitTestResultPrinter is copyable.
+class PrettyUnitTestResultPrinter : public TestEventListener {
+ public:
+  PrettyUnitTestResultPrinter() {}
+  static void PrintTestName(const char *test_suite, const char *test) {
+    printf("%s.%s", test_suite, test);
+  }
+
+  // The following methods override what's in the TestEventListener class.
+  void OnTestProgramStart(const UnitTest & /*unit_test*/) override {}
+  void OnTestIterationStart(const UnitTest &unit_test, int iteration) override;
+  void OnEnvironmentsSetUpStart(const UnitTest &unit_test) override;
+  void OnEnvironmentsSetUpEnd(const UnitTest & /*unit_test*/) override {}
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseStart(const TestCase &test_case) override;
+#else
+  void OnTestSuiteStart(const TestSuite &test_suite) override;
+#endif  // OnTestCaseStart
+
+  void OnTestStart(const TestInfo &test_info) override;
+
+  void OnTestPartResult(const TestPartResult &result) override;
+  void OnTestEnd(const TestInfo &test_info) override;
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseEnd(const TestCase &test_case) override;
+#else
+  void OnTestSuiteEnd(const TestSuite &test_suite) override;
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnEnvironmentsTearDownStart(const UnitTest &unit_test) override;
+  void OnEnvironmentsTearDownEnd(const UnitTest & /*unit_test*/) override {}
+  void OnTestIterationEnd(const UnitTest &unit_test, int iteration) override;
+  void OnTestProgramEnd(const UnitTest & /*unit_test*/) override {}
+
+ private:
+  static void PrintFailedTests(const UnitTest &unit_test);
+  static void PrintFailedTestSuites(const UnitTest &unit_test);
+  static void PrintSkippedTests(const UnitTest &unit_test);
+};
+
+// Fired before each iteration of tests starts.
+void PrettyUnitTestResultPrinter::OnTestIterationStart(
+    const UnitTest &unit_test, int iteration) {
+  if (GTEST_FLAG(repeat) != 1)
+    printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
+
+  const char *const filter = GTEST_FLAG(filter).c_str();
+
+  // Prints the filter if it's not *. This reminds the user that some
+  // tests may be skipped.
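+  // For example, running with --gtest_filter=FooTest.* prints approximately
+  //   "Note: Google Test filter = FooTest.*"
+  // since GTEST_NAME_ expands to "Google Test" (filter value hypothetical).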
+  if (!String::CStringEquals(filter, kUniversalFilter)) {
+    ColoredPrintf(COLOR_YELLOW, "Note: %s filter = %s\n", GTEST_NAME_, filter);
+  }
+
+  if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
+    const int32_t shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
+    ColoredPrintf(COLOR_YELLOW, "Note: This is test shard %d of %s.\n",
+                  static_cast<int>(shard_index) + 1,
+                  internal::posix::GetEnv(kTestTotalShards));
+  }
+
+  if (GTEST_FLAG(shuffle)) {
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: Randomizing tests' orders with a seed of %d .\n",
+                  unit_test.random_seed());
+  }
+
+  ColoredPrintf(COLOR_GREEN, "[==========] ");
+  printf("Running %s from %s.\n",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
+    const UnitTest & /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("Global test environment set-up.\n");
+  fflush(stdout);
+}
+
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase &test_case) {
+  const std::string counts =
+      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s", counts.c_str(), test_case.name());
+  if (test_case.type_param() == nullptr) {
+    printf("\n");
+  } else {
+    printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param());
+  }
+  fflush(stdout);
+}
+#else
+void PrettyUnitTestResultPrinter::OnTestSuiteStart(
+    const TestSuite &test_suite) {
+  const std::string counts =
+      FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s", counts.c_str(), test_suite.name());
+  if (test_suite.type_param() == nullptr) {
+    printf("\n");
+  } else {
+    printf(", where %s = %s\n", kTypeParamLabel, test_suite.type_param());
+  }
+  fflush(stdout);
+}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo &test_info) {
+  ColoredPrintf(COLOR_GREEN, "[ RUN      ] ");
+  PrintTestName(test_info.test_suite_name(), test_info.name());
+  printf("\n");
+  fflush(stdout);
+}
+
+// Called after an assertion failure.
+void PrettyUnitTestResultPrinter::OnTestPartResult(
+    const TestPartResult &result) {
+  switch (result.type()) {
+    // If the test part succeeded, we don't need to do anything.
+    case TestPartResult::kSuccess: return;
+    default:
+      // Print failure message from the assertion
+      // (e.g. expected this and got that).
+      PrintTestPartResult(result);
+      fflush(stdout);
+  }
+}
+
+void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo &test_info) {
+  if (test_info.result()->Passed()) {
+    ColoredPrintf(COLOR_GREEN, "[       OK ] ");
+  } else if (test_info.result()->Skipped()) {
+    ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+  } else {
+    ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+  }
+  PrintTestName(test_info.test_suite_name(), test_info.name());
+  if (test_info.result()->Failed()) PrintFullTestCommentIfPresent(test_info);
+
+  if (GTEST_FLAG(print_time)) {
+    printf(" (%s ms)\n",
+           internal::StreamableToString(test_info.result()->elapsed_time())
+               .c_str());
+  } else {
+    printf("\n");
+  }
+  fflush(stdout);
+}
+
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase &test_case) {
+  if (!GTEST_FLAG(print_time)) return;
+
+  const std::string counts =
+      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_case.name(),
+         internal::StreamableToString(test_case.elapsed_time()).c_str());
+  fflush(stdout);
+}
+#else
+void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite &test_suite) {
+  if (!GTEST_FLAG(print_time)) return;
+
+  const std::string counts =
+      FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_suite.name(),
+         internal::StreamableToString(test_suite.elapsed_time()).c_str());
+  fflush(stdout);
+}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
+    const UnitTest & /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("Global test environment tear-down\n");
+  fflush(stdout);
+}
+
+// Internal helper for printing the list of failed tests.
+void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest &unit_test) {
+  const int failed_test_count = unit_test.failed_test_count();
+  ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+  printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
+
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    const TestSuite &test_suite = *unit_test.GetTestSuite(i);
+    if (!test_suite.should_run() || (test_suite.failed_test_count() == 0)) {
+      continue;
+    }
+    for (int j = 0; j < test_suite.total_test_count(); ++j) {
+      const TestInfo &test_info = *test_suite.GetTestInfo(j);
+      if (!test_info.should_run() || !test_info.result()->Failed()) {
+        continue;
+      }
+      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+      printf("%s.%s", test_suite.name(), test_info.name());
+      PrintFullTestCommentIfPresent(test_info);
+      printf("\n");
+    }
+  }
+  printf("\n%2d FAILED %s\n", failed_test_count,
+         failed_test_count == 1 ? "TEST" : "TESTS");
+}
+
+// Internal helper for printing the list of test suite failures not covered by
+// PrintFailedTests.
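+// For example, a suite whose SetUpTestSuite() or TearDownTestSuite() failed
+// is reported roughly as
+//   "[  FAILED  ] MySuite: SetUpTestSuite or TearDownTestSuite"
+// even when every individual test in the suite passed (suite name
+// hypothetical).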
+void PrettyUnitTestResultPrinter::PrintFailedTestSuites(
+    const UnitTest &unit_test) {
+  int suite_failure_count = 0;
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    const TestSuite &test_suite = *unit_test.GetTestSuite(i);
+    if (!test_suite.should_run()) {
+      continue;
+    }
+    if (test_suite.ad_hoc_test_result().Failed()) {
+      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+      printf("%s: SetUpTestSuite or TearDownTestSuite\n", test_suite.name());
+      ++suite_failure_count;
+    }
+  }
+  if (suite_failure_count > 0) {
+    printf("\n%2d FAILED TEST %s\n", suite_failure_count,
+           suite_failure_count == 1 ? "SUITE" : "SUITES");
+  }
+}
+
+// Internal helper for printing the list of skipped tests.
+void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest &unit_test) {
+  const int skipped_test_count = unit_test.skipped_test_count();
+  if (skipped_test_count == 0) {
+    return;
+  }
+
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    const TestSuite &test_suite = *unit_test.GetTestSuite(i);
+    if (!test_suite.should_run() || (test_suite.skipped_test_count() == 0)) {
+      continue;
+    }
+    for (int j = 0; j < test_suite.total_test_count(); ++j) {
+      const TestInfo &test_info = *test_suite.GetTestInfo(j);
+      if (!test_info.should_run() || !test_info.result()->Skipped()) {
+        continue;
+      }
+      ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+      printf("%s.%s", test_suite.name(), test_info.name());
+      printf("\n");
+    }
+  }
+}
+
+void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
+                                                     int /*iteration*/) {
+  ColoredPrintf(COLOR_GREEN, "[==========] ");
+  printf("%s from %s ran.",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
+  if (GTEST_FLAG(print_time)) {
+    printf(" (%s ms total)",
+           internal::StreamableToString(unit_test.elapsed_time()).c_str());
+  }
+  printf("\n");
+  ColoredPrintf(COLOR_GREEN, "[  PASSED  ] ");
+  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
+
+  const int skipped_test_count = unit_test.skipped_test_count();
+  if (skipped_test_count > 0) {
+    ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+    printf("%s, listed below:\n", FormatTestCount(skipped_test_count).c_str());
+    PrintSkippedTests(unit_test);
+  }
+
+  if (!unit_test.Passed()) {
+    PrintFailedTests(unit_test);
+    PrintFailedTestSuites(unit_test);
+  }
+
+  int num_disabled = unit_test.reportable_disabled_test_count();
+  if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
+    if (unit_test.Passed()) {
+      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
+    }
+    ColoredPrintf(COLOR_YELLOW, "  YOU HAVE %d DISABLED %s\n\n", num_disabled,
+                  num_disabled == 1 ? "TEST" : "TESTS");
+  }
+  // Ensure that Google Test output is printed before, e.g., heapchecker output.
+  fflush(stdout);
+}
+
+// End PrettyUnitTestResultPrinter
+
+// class TestEventRepeater
+//
+// This class forwards events to other event listeners.
+class TestEventRepeater : public TestEventListener {
+ public:
+  TestEventRepeater() : forwarding_enabled_(true) {}
+  ~TestEventRepeater() override;
+  void Append(TestEventListener *listener);
+  TestEventListener *Release(TestEventListener *listener);
+
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled() const { return forwarding_enabled_; }
+  void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
+
+  void OnTestProgramStart(const UnitTest &unit_test) override;
+  void OnTestIterationStart(const UnitTest &unit_test, int iteration) override;
+  void OnEnvironmentsSetUpStart(const UnitTest &unit_test) override;
+  void OnEnvironmentsSetUpEnd(const UnitTest &unit_test) override;
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseStart(const TestSuite &parameter) override;
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestSuiteStart(const TestSuite &parameter) override;
+  void OnTestStart(const TestInfo &test_info) override;
+  void OnTestPartResult(const TestPartResult &result) override;
+  void OnTestEnd(const TestInfo &test_info) override;
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseEnd(const TestCase &parameter) override;
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestSuiteEnd(const TestSuite &parameter) override;
+  void OnEnvironmentsTearDownStart(const UnitTest &unit_test) override;
+  void OnEnvironmentsTearDownEnd(const UnitTest &unit_test) override;
+  void OnTestIterationEnd(const UnitTest &unit_test, int iteration) override;
+  void OnTestProgramEnd(const UnitTest &unit_test) override;
+
+ private:
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled_;
+  // The list of listeners that receive events.
+  std::vector<TestEventListener *> listeners_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
+};
+
+TestEventRepeater::~TestEventRepeater() {
+  ForEach(listeners_, Delete<TestEventListener>);
+}
+
+void TestEventRepeater::Append(TestEventListener *listener) {
+  listeners_.push_back(listener);
+}
+
+TestEventListener *TestEventRepeater::Release(TestEventListener *listener) {
+  for (size_t i = 0; i < listeners_.size(); ++i) {
+    if (listeners_[i] == listener) {
+      listeners_.erase(listeners_.begin() + static_cast<int>(i));
+      return listener;
+    }
+  }
+
+  return nullptr;
+}
+
+// Since most methods are very similar, use macros to reduce boilerplate.
+// This defines a member that forwards the call to all listeners.
+#define GTEST_REPEATER_METHOD_(Name, Type)              \
+  void TestEventRepeater::Name(const Type &parameter) { \
+    if (forwarding_enabled_) {                          \
+      for (size_t i = 0; i < listeners_.size(); i++) {  \
+        listeners_[i]->Name(parameter);                 \
+      }                                                 \
+    }                                                   \
+  }
+// This defines a member that forwards the call to all listeners in reverse
+// order.
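+// For example, GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo) expands
+// (roughly) to a TestEventRepeater::OnTestEnd(const TestInfo &) that walks
+// listeners_ from back to front, so "end" events unwind in the opposite
+// order of the corresponding "start" events.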
+#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type)      \
+  void TestEventRepeater::Name(const Type &parameter) { \
+    if (forwarding_enabled_) {                          \
+      for (size_t i = listeners_.size(); i != 0; i--) { \
+        listeners_[i - 1]->Name(parameter);             \
+      }                                                 \
+    }                                                   \
+  }
+
+GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REPEATER_METHOD_(OnTestCaseStart, TestSuite)
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REPEATER_METHOD_(OnTestSuiteStart, TestSuite)
+GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
+GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestSuite)
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REVERSE_REPEATER_METHOD_(OnTestSuiteEnd, TestSuite)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
+
+#undef GTEST_REPEATER_METHOD_
+#undef GTEST_REVERSE_REPEATER_METHOD_
+
+void TestEventRepeater::OnTestIterationStart(const UnitTest &unit_test,
+                                             int iteration) {
+  if (forwarding_enabled_) {
+    for (size_t i = 0; i < listeners_.size(); i++) {
+      listeners_[i]->OnTestIterationStart(unit_test, iteration);
+    }
+  }
+}
+
+void TestEventRepeater::OnTestIterationEnd(const UnitTest &unit_test,
+                                           int iteration) {
+  if (forwarding_enabled_) {
+    for (size_t i = listeners_.size(); i > 0; i--) {
+      listeners_[i - 1]->OnTestIterationEnd(unit_test, iteration);
+    }
+  }
+}
+
+// End TestEventRepeater
+
+// This class generates an XML output file.
+class XmlUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+  explicit XmlUnitTestResultPrinter(const char *output_file);
+
+  void OnTestIterationEnd(const UnitTest &unit_test, int iteration) override;
+  void ListTestsMatchingFilter(const std::vector<TestSuite *> &test_suites);
+
+  // Prints an XML summary of all unit tests.
+  static void PrintXmlTestsList(std::ostream *stream,
+                                const std::vector<TestSuite *> &test_suites);
+
+ private:
+  // Is c a whitespace character that is normalized to a space character
+  // when it appears in an XML attribute value?
+  static bool IsNormalizableWhitespace(char c) {
+    return c == 0x9 || c == 0xA || c == 0xD;
+  }
+
+  // May c appear in a well-formed XML document?
+  static bool IsValidXmlCharacter(char c) {
+    return IsNormalizableWhitespace(c) || c >= 0x20;
+  }
+
+  // Returns an XML-escaped copy of the input string str. If
+  // is_attribute is true, the text is meant to appear as an attribute
+  // value, and normalizable whitespace is preserved by replacing it
+  // with character references.
+  static std::string EscapeXml(const std::string &str, bool is_attribute);
+
+  // Returns the given string with all characters invalid in XML removed.
+  static std::string RemoveInvalidXmlCharacters(const std::string &str);
+
+  // Convenience wrapper around EscapeXml when str is an attribute value.
+  static std::string EscapeXmlAttribute(const std::string &str) {
+    return EscapeXml(str, true);
+  }
+
+  // Convenience wrapper around EscapeXml when str is not an attribute value.
+  static std::string EscapeXmlText(const char *str) {
+    return EscapeXml(str, false);
+  }
+
+  // Verifies that the given attribute belongs to the given element and
+  // streams the attribute as XML.
+  static void OutputXmlAttribute(std::ostream *stream,
+                                 const std::string &element_name,
+                                 const std::string &name,
+                                 const std::string &value);
+
+  // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+  static void OutputXmlCDataSection(::std::ostream *stream, const char *data);
+
+  // Streams an XML representation of a TestInfo object.
+  static void OutputXmlTestInfo(::std::ostream *stream,
+                                const char *test_suite_name,
+                                const TestInfo &test_info);
+
+  // Prints an XML representation of a TestSuite object
+  static void PrintXmlTestSuite(::std::ostream *stream,
+                                const TestSuite &test_suite);
+
+  // Prints an XML summary of unit_test to output stream out.
+  static void PrintXmlUnitTest(::std::ostream *stream,
+                               const UnitTest &unit_test);
+
+  // Produces a string representing the test properties in a result as space
+  // delimited XML attributes based on the property key="value" pairs.
+  // When the std::string is not empty, it includes a space at the beginning,
+  // to delimit this attribute from prior attributes.
+  static std::string TestPropertiesAsXmlAttributes(const TestResult &result);
+
+  // Streams an XML representation of the test properties of a TestResult
+  // object.
+  static void OutputXmlTestProperties(std::ostream *stream,
+                                      const TestResult &result);
+
+  // The output file.
+  const std::string output_file_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
+};
+
+// Creates a new XmlUnitTestResultPrinter.
+XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char *output_file)
+    : output_file_(output_file) {
+  if (output_file_.empty()) {
+    GTEST_LOG_(FATAL) << "XML output file may not be null";
+  }
+}
+
+// Called after the unit test ends.
+void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
+                                                  int /*iteration*/) {
+  FILE *xmlout = OpenFileForWriting(output_file_);
+  std::stringstream stream;
+  PrintXmlUnitTest(&stream, unit_test);
+  fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
+  fclose(xmlout);
+}
+
+void XmlUnitTestResultPrinter::ListTestsMatchingFilter(
+    const std::vector<TestSuite *> &test_suites) {
+  FILE *xmlout = OpenFileForWriting(output_file_);
+  std::stringstream stream;
+  PrintXmlTestsList(&stream, test_suites);
+  fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
+  fclose(xmlout);
+}
+
+// Returns an XML-escaped copy of the input string str. If is_attribute
+// is true, the text is meant to appear as an attribute value, and
+// normalizable whitespace is preserved by replacing it with character
+// references.
+//
+// Invalid XML characters in str, if any, are stripped from the output.
+// It is expected that most, if not all, of the text processed by this
+// module will consist of ordinary English text.
+// If this module is ever modified to produce version 1.1 XML output,
+// most invalid characters can be retained using character references.
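+// For example (illustrative input): EscapeXml("a<b & \"c\"", true) yields
+// "a&lt;b &amp; &quot;c&quot;", while with is_attribute == false the quotes
+// are passed through unchanged.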
+std::string XmlUnitTestResultPrinter::EscapeXml(const std::string &str,
+                                                bool is_attribute) {
+  Message m;
+
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char ch = str[i];
+    switch (ch) {
+      case '<': m << "&lt;"; break;
+      case '>': m << "&gt;"; break;
+      case '&': m << "&amp;"; break;
+      case '\'':
+        if (is_attribute)
+          m << "&apos;";
+        else
+          m << '\'';
+        break;
+      case '"':
+        if (is_attribute)
+          m << "&quot;";
+        else
+          m << '"';
+        break;
+      default:
+        if (IsValidXmlCharacter(ch)) {
+          if (is_attribute && IsNormalizableWhitespace(ch))
+            m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
+              << ";";
+          else
+            m << ch;
+        }
+        break;
+    }
+  }
+
+  return m.GetString();
+}
+
+// Returns the given string with all characters invalid in XML removed.
+// Currently invalid characters are dropped from the string. An
+// alternative is to replace them with certain characters such as . or ?.
+std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
+    const std::string &str) {
+  std::string output;
+  output.reserve(str.size());
+  for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
+    if (IsValidXmlCharacter(*it)) output.push_back(*it);
+
+  return output;
+}
+
+// The following routines generate an XML representation of a UnitTest
+// object.
+// GOOGLETEST_CM0009 DO NOT DELETE
+//
+// This is how Google Test concepts map to the DTD:
+//
+// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
+//   <testsuite name="testcase-name">  <-- corresponds to a TestSuite object
+//     <testcase name="test-name">     <-- corresponds to a TestInfo object
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//                                     <-- individual assertion failures
+//     </testcase>
+//   </testsuite>
+// </testsuites>
+
+// Formats the given time in milliseconds as seconds.
+std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
+  ::std::stringstream ss;
+  ss << (static_cast<double>(ms) * 1e-3);
+  return ss.str();
+}
+
+static bool PortableLocaltime(time_t seconds, struct tm *out) {
+#if defined(_MSC_VER)
+  return localtime_s(out, &seconds) == 0;
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+  // MINGW <time.h> provides neither localtime_r nor localtime_s, but uses
+  // Windows' localtime(), which has a thread-local tm buffer.
+  struct tm *tm_ptr = localtime(&seconds);  // NOLINT
+  if (tm_ptr == nullptr) return false;
+  *out = *tm_ptr;
+  return true;
+#else
+  return localtime_r(&seconds, out) != nullptr;
+#endif
+}
+
+// Converts the given epoch time in milliseconds to a date string in the ISO
+// 8601 format, without the timezone information.
+std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
+  struct tm time_struct;
+  if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
+    return "";
+  // YYYY-MM-DDThh:mm:ss
+  return StreamableToString(time_struct.tm_year + 1900) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+         String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+         String::FormatIntWidth2(time_struct.tm_min) + ":" +
+         String::FormatIntWidth2(time_struct.tm_sec);
+}
+
+// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
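+// A CDATA section cannot contain its own terminator "]]>", so any occurrence
+// in data is split across two sections; e.g. "x]]>y" is emitted roughly as
+//   <![CDATA[x]]>]]&gt;<![CDATA[y]]>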
+void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream *stream,
+                                                     const char *data) {
+  const char *segment = data;
+  *stream << "<![CDATA[";
+  for (;;) {
+    const char *const next_segment = strstr(segment, "]]>");
+    if (next_segment != nullptr) {
+      stream->write(segment,
+                    static_cast<std::streamsize>(next_segment - segment));
+      *stream << "]]>]]&gt;<![CDATA[";
+      segment = next_segment + strlen("]]>");
+    } else {
+      *stream << segment;
+      break;
+    }
+  }
+  *stream << "]]>";
+}
+
+void XmlUnitTestResultPrinter::OutputXmlAttribute(
+    std::ostream *stream, const std::string &element_name,
+    const std::string &name, const std::string &value) {
+  const std::vector<std::string> &allowed_names =
+      GetReservedOutputAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+               allowed_names.end())
+      << "Attribute " << name << " is not allowed for element <" << element_name
+      << ">.";
+
+  *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
+}
+
+// Prints an XML representation of a TestInfo object.
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream *stream,
+                                                 const char *test_suite_name,
+                                                 const TestInfo &test_info) {
+  const TestResult &result = *test_info.result();
+  const std::string kTestsuite = "testcase";
+
+  if (test_info.is_in_another_shard()) {
+    return;
+  }
+
+  *stream << "    <" << kTestsuite;
+  OutputXmlAttribute(stream, kTestsuite, "name", test_info.name());
+
+  if (test_info.value_param() != nullptr) {
+    OutputXmlAttribute(stream, kTestsuite, "value_param",
+                       test_info.value_param());
+  }
+  if (test_info.type_param() != nullptr) {
+    OutputXmlAttribute(stream, kTestsuite, "type_param",
+                       test_info.type_param());
+  }
+  if (GTEST_FLAG(list_tests)) {
+    OutputXmlAttribute(stream, kTestsuite, "file", test_info.file());
+    OutputXmlAttribute(stream, kTestsuite, "line",
+                       StreamableToString(test_info.line()));
+    *stream << " />\n";
+    return;
+  }
+
+  OutputXmlAttribute(stream, kTestsuite, "status",
+                     test_info.should_run() ? "run" : "notrun");
+  OutputXmlAttribute(stream, kTestsuite, "result",
+                     test_info.should_run()
+                         ? (result.Skipped() ? "skipped" : "completed")
+                         : "suppressed");
+  OutputXmlAttribute(stream, kTestsuite, "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(
+      stream, kTestsuite, "timestamp",
+      FormatEpochTimeInMillisAsIso8601(result.start_timestamp()));
+  OutputXmlAttribute(stream, kTestsuite, "classname", test_suite_name);
+
+  int failures = 0;
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult &part = result.GetTestPartResult(i);
+    if (part.failed()) {
+      if (++failures == 1) {
+        *stream << ">\n";
+      }
+      const std::string location =
+          internal::FormatCompilerIndependentFileLocation(part.file_name(),
+                                                          part.line_number());
+      const std::string summary = location + "\n" + part.summary();
+      *stream << "      <failure message=\"" << EscapeXmlAttribute(summary)
+              << "\" type=\"\">";
+      const std::string detail = location + "\n" + part.message();
+      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+      *stream << "</failure>\n";
+    }
+  }
+
+  if (failures == 0 && result.test_property_count() == 0) {
+    *stream << " />\n";
+  } else {
+    if (failures == 0) {
+      *stream << ">\n";
+    }
+    OutputXmlTestProperties(stream, result);
+    *stream << "    </" << kTestsuite << ">\n";
+  }
+}
+
+// Prints an XML representation of a TestSuite object
+void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream *stream,
+                                                 const TestSuite &test_suite) {
+  const std::string kTestsuite = "testsuite";
+  *stream << "  <" << kTestsuite;
+  OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name());
+  OutputXmlAttribute(stream, kTestsuite, "tests",
+                     StreamableToString(test_suite.reportable_test_count()));
+  if (!GTEST_FLAG(list_tests)) {
+    OutputXmlAttribute(stream, kTestsuite, "failures",
+                       StreamableToString(test_suite.failed_test_count()));
+    OutputXmlAttribute(
+        stream, kTestsuite, "disabled",
+        StreamableToString(test_suite.reportable_disabled_test_count()));
+    OutputXmlAttribute(stream, kTestsuite, "errors", "0");
+    OutputXmlAttribute(stream, kTestsuite, "time",
+                       FormatTimeInMillisAsSeconds(test_suite.elapsed_time()));
+    OutputXmlAttribute(
+        stream, kTestsuite, "timestamp",
+        FormatEpochTimeInMillisAsIso8601(test_suite.start_timestamp()));
+    *stream << TestPropertiesAsXmlAttributes(test_suite.ad_hoc_test_result());
+  }
+  *stream << ">\n";
+  for (int i = 0; i < test_suite.total_test_count(); ++i) {
+    if (test_suite.GetTestInfo(i)->is_reportable())
+      OutputXmlTestInfo(stream, test_suite.name(), *test_suite.GetTestInfo(i));
+  }
+  *stream << "  </" << kTestsuite << ">\n";
+}
+
+// Prints an XML summary of unit_test to output stream out.
+void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream *stream,
+                                                const UnitTest &unit_test) {
+  const std::string kTestsuites = "testsuites";
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+  *stream << "<" << kTestsuites;
+
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(unit_test.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "failures",
+                     StreamableToString(unit_test.failed_test_count()));
+  OutputXmlAttribute(
+      stream, kTestsuites, "disabled",
+      StreamableToString(unit_test.reportable_disabled_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "errors", "0");
+  OutputXmlAttribute(stream, kTestsuites, "time",
+                     FormatTimeInMillisAsSeconds(unit_test.elapsed_time()));
+  OutputXmlAttribute(
+      stream, kTestsuites, "timestamp",
+      FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
+
+  if (GTEST_FLAG(shuffle)) {
+    OutputXmlAttribute(stream, kTestsuites, "random_seed",
+                       StreamableToString(unit_test.random_seed()));
+  }
+  *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result());
+
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    if (unit_test.GetTestSuite(i)->reportable_test_count() > 0)
+      PrintXmlTestSuite(stream, *unit_test.GetTestSuite(i));
+  }
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+void XmlUnitTestResultPrinter::PrintXmlTestsList(
+    std::ostream *stream, const std::vector<TestSuite *> &test_suites) {
+  const std::string kTestsuites = "testsuites";
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+  *stream << "<" << kTestsuites;
+
+  int total_tests = 0;
+  for (auto test_suite : test_suites) {
+    total_tests += test_suite->total_test_count();
+  }
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(total_tests));
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  for (auto test_suite : test_suites) {
+    PrintXmlTestSuite(stream, *test_suite);
+  }
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+// Produces a string representing the test properties in a result as space
+// delimited XML attributes based on the property key="value" pairs.
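+// For example, a test that called RecordProperty("bug", "1234") contributes
+// the attribute string " bug=\"1234\"" (note the leading space; the key and
+// value here are hypothetical).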
+std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
+    const TestResult &result) {
+  Message attributes;
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty &property = result.GetTestProperty(i);
+    attributes << " " << property.key() << "="
+               << "\"" << EscapeXmlAttribute(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
+
+void XmlUnitTestResultPrinter::OutputXmlTestProperties(
+    std::ostream *stream, const TestResult &result) {
+  const std::string kProperties = "properties";
+  const std::string kProperty = "property";
+
+  if (result.test_property_count() <= 0) {
+    return;
+  }
+
+  *stream << "<" << kProperties << ">\n";
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty &property = result.GetTestProperty(i);
+    *stream << "<" << kProperty;
+    *stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\"";
+    *stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\"";
+    *stream << "/>\n";
+  }
+  *stream << "</" << kProperties << ">\n";
+}
+
+// End XmlUnitTestResultPrinter
+
+// This class generates a JSON output file.
+class JsonUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+  explicit JsonUnitTestResultPrinter(const char *output_file);
+
+  void OnTestIterationEnd(const UnitTest &unit_test, int iteration) override;
+
+  // Prints a JSON summary of all unit tests.
+  static void PrintJsonTestList(::std::ostream *stream,
+                                const std::vector<TestSuite *> &test_suites);
+
+ private:
+  // Returns a JSON-escaped copy of the input string str.
+  static std::string EscapeJson(const std::string &str);
+
+  // Verifies that the given attribute belongs to the given element and
+  // streams the attribute as JSON.
+  static void OutputJsonKey(std::ostream *stream,
+                            const std::string &element_name,
+                            const std::string &name, const std::string &value,
+                            const std::string &indent, bool comma = true);
+  static void OutputJsonKey(std::ostream *stream,
+                            const std::string &element_name,
+                            const std::string &name, int value,
+                            const std::string &indent, bool comma = true);
+
+  // Streams a JSON representation of a TestInfo object.
+  static void OutputJsonTestInfo(::std::ostream *stream,
+                                 const char *test_suite_name,
+                                 const TestInfo &test_info);
+
+  // Prints a JSON representation of a TestSuite object
+  static void PrintJsonTestSuite(::std::ostream *stream,
+                                 const TestSuite &test_suite);
+
+  // Prints a JSON summary of unit_test to output stream out.
+  static void PrintJsonUnitTest(::std::ostream *stream,
+                                const UnitTest &unit_test);
+
+  // Produces a string representing the test properties in a result as
+  // a JSON dictionary.
+  static std::string TestPropertiesAsJson(const TestResult &result,
+                                          const std::string &indent);
+
+  // The output file.
+  const std::string output_file_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(JsonUnitTestResultPrinter);
+};
+
+// Creates a new JsonUnitTestResultPrinter.
+JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char *output_file)
+    : output_file_(output_file) {
+  if (output_file_.empty()) {
+    GTEST_LOG_(FATAL) << "JSON output file may not be null";
+  }
+}
+
+void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
+                                                   int /*iteration*/) {
+  FILE *jsonout = OpenFileForWriting(output_file_);
+  std::stringstream stream;
+  PrintJsonUnitTest(&stream, unit_test);
+  fprintf(jsonout, "%s", StringStreamToString(&stream).c_str());
+  fclose(jsonout);
+}
+
+// Returns a JSON-escaped copy of the input string str.
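+// For example, a double quote becomes \", a newline becomes \n, and control
+// characters below 0x20 that lack a shorthand become \u00XX escapes.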
+std::string JsonUnitTestResultPrinter::EscapeJson(const std::string &str) {
+  Message m;
+
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char ch = str[i];
+    switch (ch) {
+      case '\\':
+      case '"':
+      case '/': m << '\\' << ch; break;
+      case '\b': m << "\\b"; break;
+      case '\t': m << "\\t"; break;
+      case '\n': m << "\\n"; break;
+      case '\f': m << "\\f"; break;
+      case '\r': m << "\\r"; break;
+      default:
+        if (ch < ' ') {
+          m << "\\u00" << String::FormatByte(static_cast<unsigned char>(ch));
+        } else {
+          m << ch;
+        }
+        break;
+    }
+  }
+
+  return m.GetString();
+}
+
+// The following routines generate a JSON representation of a UnitTest
+// object.
+
+// Formats the given time in milliseconds as a duration in seconds,
+// e.g. "0.005s".
+static std::string FormatTimeInMillisAsDuration(TimeInMillis ms) {
+  ::std::stringstream ss;
+  ss << (static_cast<double>(ms) * 1e-3) << "s";
+  return ss.str();
+}
+
+// Converts the given epoch time in milliseconds to a date string in the
+// RFC3339 format, without the timezone information.
+static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) {
+  struct tm time_struct;
+  if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
+    return "";
+  // YYYY-MM-DDThh:mm:ss
+  return StreamableToString(time_struct.tm_year + 1900) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+         String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+         String::FormatIntWidth2(time_struct.tm_min) + ":" +
+         String::FormatIntWidth2(time_struct.tm_sec) + "Z";
+}
+
+static inline std::string Indent(size_t width) {
+  return std::string(width, ' ');
+}
+
+void JsonUnitTestResultPrinter::OutputJsonKey(std::ostream *stream,
+                                              const std::string &element_name,
+                                              const std::string &name,
+                                              const std::string &value,
+                                              const std::string &indent,
+                                              bool comma) {
+  const std::vector<std::string> &allowed_names =
+      GetReservedOutputAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+               allowed_names.end())
+      << "Key \"" << name << "\" is not allowed for value \"" << element_name
+      << "\".";
+
+  *stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\"";
+  if (comma) *stream << ",\n";
+}
+
+void JsonUnitTestResultPrinter::OutputJsonKey(
+    std::ostream *stream, const std::string &element_name,
+    const std::string &name, int value, const std::string &indent, bool comma) {
+  const std::vector<std::string> &allowed_names =
+      GetReservedOutputAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+               allowed_names.end())
+      << "Key \"" << name << "\" is not allowed for value \"" << element_name
+      << "\".";
+
+  *stream << indent << "\"" << name << "\": " << StreamableToString(value);
+  if (comma) *stream << ",\n";
+}
+
+// Prints a JSON representation of a TestInfo object.
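+// The emitted object has roughly the shape
+//   { "name": "...", "status": "RUN", "result": "COMPLETED",
+//     "timestamp": "...", "time": "0.005s", "classname": "...",
+//     "failures": [ ... ] }
+// where "failures" is present only when the test recorded failed parts
+// (values shown are illustrative).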
+void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream *stream,
+                                                   const char *test_suite_name,
+                                                   const TestInfo &test_info) {
+  const TestResult &result = *test_info.result();
+  const std::string kTestsuite = "testcase";
+  const std::string kIndent = Indent(10);
+
+  *stream << Indent(8) << "{\n";
+  OutputJsonKey(stream, kTestsuite, "name", test_info.name(), kIndent);
+
+  if (test_info.value_param() != nullptr) {
+    OutputJsonKey(stream, kTestsuite, "value_param", test_info.value_param(),
+                  kIndent);
+  }
+  if (test_info.type_param() != nullptr) {
+    OutputJsonKey(stream, kTestsuite, "type_param", test_info.type_param(),
+                  kIndent);
+  }
+  if (GTEST_FLAG(list_tests)) {
+    OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent);
+    OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false);
+    *stream << "\n" << Indent(8) << "}";
+    return;
+  }
+
+  OutputJsonKey(stream, kTestsuite, "status",
+                test_info.should_run() ? "RUN" : "NOTRUN", kIndent);
+  OutputJsonKey(stream, kTestsuite, "result",
+                test_info.should_run()
+                    ? (result.Skipped() ? "SKIPPED" : "COMPLETED")
+                    : "SUPPRESSED",
+                kIndent);
+  OutputJsonKey(stream, kTestsuite, "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()),
+                kIndent);
+  OutputJsonKey(stream, kTestsuite, "time",
+                FormatTimeInMillisAsDuration(result.elapsed_time()), kIndent);
+  OutputJsonKey(stream, kTestsuite, "classname", test_suite_name, kIndent,
+                false);
+  *stream << TestPropertiesAsJson(result, kIndent);
+
+  int failures = 0;
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult &part = result.GetTestPartResult(i);
+    if (part.failed()) {
+      *stream << ",\n";
+      if (++failures == 1) {
+        *stream << kIndent << "\""
+                << "failures"
+                << "\": [\n";
+      }
+      const std::string location =
+          internal::FormatCompilerIndependentFileLocation(part.file_name(),
+                                                          part.line_number());
+      const std::string message = EscapeJson(location + "\n" + part.message());
+      *stream << kIndent << "  {\n"
+              << kIndent << "    \"failure\": \"" << message << "\",\n"
+              << kIndent << "    \"type\": \"\"\n"
+              << kIndent << "  }";
+    }
+  }
+
+  if (failures > 0) *stream << "\n" << kIndent << "]";
+  *stream << "\n" << Indent(8) << "}";
+}
+
+// Prints a JSON representation of a TestSuite object
+void JsonUnitTestResultPrinter::PrintJsonTestSuite(
+    std::ostream *stream, const TestSuite &test_suite) {
+  const std::string kTestsuite = "testsuite";
+  const std::string kIndent = Indent(6);
+
+  *stream << Indent(4) << "{\n";
+  OutputJsonKey(stream, kTestsuite, "name", test_suite.name(), kIndent);
+  OutputJsonKey(stream, kTestsuite, "tests", test_suite.reportable_test_count(),
+                kIndent);
+  if (!GTEST_FLAG(list_tests)) {
+    OutputJsonKey(stream, kTestsuite, "failures",
+                  test_suite.failed_test_count(), kIndent);
+    OutputJsonKey(stream, kTestsuite, "disabled",
+                  test_suite.reportable_disabled_test_count(), kIndent);
+    OutputJsonKey(stream, kTestsuite, "errors", 0, kIndent);
+    OutputJsonKey(
+        stream, kTestsuite, "timestamp",
+        FormatEpochTimeInMillisAsRFC3339(test_suite.start_timestamp()),
+        kIndent);
+    OutputJsonKey(stream, kTestsuite, "time",
+                  FormatTimeInMillisAsDuration(test_suite.elapsed_time()),
+                  kIndent, false);
+    *stream << TestPropertiesAsJson(test_suite.ad_hoc_test_result(), kIndent)
+            << ",\n";
+  }
+
+  *stream << kIndent << "\"" << kTestsuite << "\": [\n";
+
+  bool comma = false;
+  for (int i = 0; i < test_suite.total_test_count(); ++i) {
+    if (test_suite.GetTestInfo(i)->is_reportable()) {
+      if (comma) {
+        *stream << ",\n";
+      } else {
+        comma = true;
+      }
+      OutputJsonTestInfo(stream, test_suite.name(), *test_suite.GetTestInfo(i));
+    }
+  }
+  *stream << "\n" << kIndent << "]\n" << Indent(4) << "}";
+}
+
+// Prints a JSON summary of unit_test to output stream out.
+void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream *stream,
+                                                  const UnitTest &unit_test) {
+  const std::string kTestsuites = "testsuites";
+  const std::string kIndent = Indent(2);
+  *stream << "{\n";
+
+  OutputJsonKey(stream, kTestsuites, "tests", unit_test.reportable_test_count(),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "failures", unit_test.failed_test_count(),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "disabled",
+                unit_test.reportable_disabled_test_count(), kIndent);
+  OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent);
+  if (GTEST_FLAG(shuffle)) {
+    OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(),
+                  kIndent);
+  }
+  OutputJsonKey(stream, kTestsuites, "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(unit_test.start_timestamp()),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "time",
+                FormatTimeInMillisAsDuration(unit_test.elapsed_time()), kIndent,
+                false);
+
+  *stream << TestPropertiesAsJson(unit_test.ad_hoc_test_result(), kIndent)
+          << ",\n";
+
+  OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent);
+  *stream << kIndent << "\"" << kTestsuites << "\": [\n";
+
+  bool comma = false;
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    if (unit_test.GetTestSuite(i)->reportable_test_count() > 0) {
+      if (comma) {
+        *stream << ",\n";
+      } else {
+        comma = true;
+      }
+      PrintJsonTestSuite(stream, *unit_test.GetTestSuite(i));
+    }
+  }
+
+  *stream << "\n"
+          << kIndent << "]\n"
+          << "}\n";
+}
+
+void JsonUnitTestResultPrinter::PrintJsonTestList(
+    std::ostream *stream, const std::vector<TestSuite *> &test_suites) {
+  const std::string kTestsuites = "testsuites";
+  const std::string kIndent = Indent(2);
+  *stream << "{\n";
+  int total_tests = 0;
+  for (auto test_suite : test_suites) {
+    total_tests += test_suite->total_test_count();
+  }
+  OutputJsonKey(stream, kTestsuites, "tests", total_tests, kIndent);
+
+  OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent);
+  *stream << kIndent << "\"" << kTestsuites << "\": [\n";
+
+  for (size_t i = 0; i < test_suites.size(); ++i) {
+    if (i != 0) {
+      *stream << ",\n";
+    }
+    PrintJsonTestSuite(stream, *test_suites[i]);
+  }
+
+  *stream << "\n"
+          << kIndent << "]\n"
+          << "}\n";
+}
+
+// Produces a string representing the test properties in a result as
+// a JSON dictionary.
+std::string JsonUnitTestResultPrinter::TestPropertiesAsJson(
+    const TestResult &result, const std::string &indent) {
+  Message attributes;
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty &property = result.GetTestProperty(i);
+    attributes << ",\n"
+               << indent << "\"" << property.key() << "\": "
+               << "\"" << EscapeJson(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
+
+// End JsonUnitTestResultPrinter
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Checks if str contains '=', '&', '%' or '\n' characters. If yes,
+// replaces them by "%xx" where xx is their hexadecimal value. For
+// example, replaces "=" with "%3D". This algorithm is O(strlen(str))
+// in both time and space -- important as the input str may contain an
+// arbitrarily long test failure message and stack trace.
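+// For example (illustrative input): UrlEncode("a=b&c%d") returns
+// "a%3Db%26c%25d".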
+std::string StreamingListener::UrlEncode(const char *str) {
+  std::string result;
+  result.reserve(strlen(str) + 1);
+  for (char ch = *str; ch != '\0'; ch = *++str) {
+    switch (ch) {
+      case '%':
+      case '=':
+      case '&':
+      case '\n':
+        result.append("%" + String::FormatByte(static_cast<unsigned char>(ch)));
+        break;
+      default: result.push_back(ch); break;
+    }
+  }
+  return result;
+}
+
+void StreamingListener::SocketWriter::MakeConnection() {
+  GTEST_CHECK_(sockfd_ == -1)
+      << "MakeConnection() can't be called when there is already a connection.";
+
+  addrinfo hints;
+  memset(&hints, 0, sizeof(hints));
+  hints.ai_family = AF_UNSPEC;  // To allow both IPv4 and IPv6 addresses.
+  hints.ai_socktype = SOCK_STREAM;
+  addrinfo *servinfo = nullptr;
+
+  // Use getaddrinfo() to get a linked list of IP addresses for
+  // the given host name.
+  const int error_num =
+      getaddrinfo(host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+  if (error_num != 0) {
+    GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
+                        << gai_strerror(error_num);
+  }
+
+  // Loop through all the results and connect to the first we can.
+  for (addrinfo *cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr;
+       cur_addr = cur_addr->ai_next) {
+    sockfd_ = socket(cur_addr->ai_family, cur_addr->ai_socktype,
+                     cur_addr->ai_protocol);
+    if (sockfd_ != -1) {
+      // Connect the client socket to the server socket.
+      if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
+        close(sockfd_);
+        sockfd_ = -1;
+      }
+    }
+  }
+
+  freeaddrinfo(servinfo);  // all done with this structure
+
+  if (sockfd_ == -1) {
+    GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to "
+                        << host_name_ << ":" << port_num_;
+  }
+}
+
+// End of class Streaming Listener
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+// class OsStackTraceGetter
+
+const char *const OsStackTraceGetterInterface::kElidedFramesMarker =
+    "... " GTEST_NAME_ " internal frames ...";
+
+std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+#if GTEST_HAS_ABSL
+  std::string result;
+
+  if (max_depth <= 0) {
+    return result;
+  }
+
+  max_depth = std::min(max_depth, kMaxStackTraceDepth);
+
+  std::vector<void *> raw_stack(max_depth);
+  // Skips the frames requested by the caller, plus this function.
+  const int raw_stack_size =
+      absl::GetStackTrace(&raw_stack[0], max_depth, skip_count + 1);
+
+  void *caller_frame = nullptr;
+  {
+    MutexLock lock(&mutex_);
+    caller_frame = caller_frame_;
+  }
+
+  for (int i = 0; i < raw_stack_size; ++i) {
+    if (raw_stack[i] == caller_frame &&
+        !GTEST_FLAG(show_internal_stack_frames)) {
+      // Add a marker to the trace and stop adding frames.
+      absl::StrAppend(&result, kElidedFramesMarker, "\n");
+      break;
+    }
+
+    char tmp[1024];
+    const char *symbol = "(unknown)";
+    if (absl::Symbolize(raw_stack[i], tmp, sizeof(tmp))) {
+      symbol = tmp;
+    }
+
+    char line[1024];
+    snprintf(line, sizeof(line), "  %p: %s\n", raw_stack[i], symbol);
+    result += line;
+  }
+
+  return result;
+
+#else   // !GTEST_HAS_ABSL
+  static_cast<void>(max_depth);
+  static_cast<void>(skip_count);
+  return "";
+#endif  // GTEST_HAS_ABSL
+}
+
+void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) {
+#if GTEST_HAS_ABSL
+  void *caller_frame = nullptr;
+  if (absl::GetStackTrace(&caller_frame, 1, 3) <= 0) {
+    caller_frame = nullptr;
+  }
+
+  MutexLock lock(&mutex_);
+  caller_frame_ = caller_frame;
+#endif  // GTEST_HAS_ABSL
+}
+
+// A helper class that creates the premature-exit file in its
+// constructor and deletes the file in its destructor.
+class ScopedPrematureExitFile {
+ public:
+  explicit ScopedPrematureExitFile(const char *premature_exit_filepath)
+      : premature_exit_filepath_(
+            premature_exit_filepath ? premature_exit_filepath : "") {
+    // If a path to the premature-exit file is specified...
+    if (!premature_exit_filepath_.empty()) {
+      // create the file with a single "0" character in it. I/O
+      // errors are ignored as there's nothing better we can do and we
+      // don't want to fail the test because of this.
+      FILE *pfile = posix::FOpen(premature_exit_filepath, "w");
+      fwrite("0", 1, 1, pfile);
+      fclose(pfile);
+    }
+  }
+
+  ~ScopedPrematureExitFile() {
+#if !defined GTEST_OS_ESP8266
+    if (!premature_exit_filepath_.empty()) {
+      int retval = remove(premature_exit_filepath_.c_str());
+      if (retval) {
+        GTEST_LOG_(ERROR) << "Failed to remove premature exit filepath \""
+                          << premature_exit_filepath_ << "\" with error "
+                          << retval;
+      }
+    }
+#endif
+  }
+
+ private:
+  const std::string premature_exit_filepath_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile);
+};
+
+}  // namespace internal
+
+// class TestEventListeners
+
+TestEventListeners::TestEventListeners()
+    : repeater_(new internal::TestEventRepeater()),
+      default_result_printer_(nullptr), default_xml_generator_(nullptr) {}
+
+TestEventListeners::~TestEventListeners() { delete repeater_; }
+
+// Appends an event listener to the end of the list. Google Test assumes
+// ownership of the listener (i.e. it will delete the listener when the
+// test program finishes).
+void TestEventListeners::Append(TestEventListener *listener) {
+  repeater_->Append(listener);
+}
+
+// Removes the given event listener from the list and returns it. It then
+// becomes the caller's responsibility to delete the listener. Returns
+// NULL if the listener is not found in the list.
+TestEventListener *TestEventListeners::Release(TestEventListener *listener) {
+  if (listener == default_result_printer_)
+    default_result_printer_ = nullptr;
+  else if (listener == default_xml_generator_)
+    default_xml_generator_ = nullptr;
+  return repeater_->Release(listener);
+}
+
+// Returns the repeater that broadcasts the TestEventListener events to all
+// subscribers.
+TestEventListener *TestEventListeners::repeater() { return repeater_; }
+
+// Sets the default_result_printer attribute to the provided listener.
+// The listener is also added to the listener list and previous
+// default_result_printer is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultResultPrinter(TestEventListener *listener) {
+  if (default_result_printer_ != listener) {
+    // It is an error to pass this method a listener that is already in the
+    // list.
+    delete Release(default_result_printer_);
+    default_result_printer_ = listener;
+    if (listener != nullptr) Append(listener);
+  }
+}
+
+// Sets the default_xml_generator attribute to the provided listener. The
+// listener is also added to the listener list and previous
+// default_xml_generator is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultXmlGenerator(TestEventListener *listener) {
+  if (default_xml_generator_ != listener) {
+    // It is an error to pass this method a listener that is already in the
+    // list.
+    delete Release(default_xml_generator_);
+    default_xml_generator_ = listener;
+    if (listener != nullptr) Append(listener);
+  }
+}
+
+// Controls whether events will be forwarded by the repeater to the
+// listeners in the list.
+bool TestEventListeners::EventForwardingEnabled() const {
+  return repeater_->forwarding_enabled();
+}
+
+void TestEventListeners::SuppressEventForwarding() {
+  repeater_->set_forwarding_enabled(false);
+}
+
+// class UnitTest
+
+// Gets the singleton UnitTest object.  The first time this method is
+// called, a UnitTest object is constructed and returned.  Consecutive
+// calls will return the same object.
+//
+// We don't protect this under mutex_ as a user is not supposed to
+// call this before main() starts, from which point on the return
+// value will never change.
+UnitTest *UnitTest::GetInstance() {
+  // CodeGear C++Builder insists on a public destructor for the
+  // default implementation.  Use this implementation to keep good OO
+  // design with private destructor.
+
+#if defined(__BORLANDC__)
+  static UnitTest *const instance = new UnitTest;
+  return instance;
+#else
+  static UnitTest instance;
+  return &instance;
+#endif  // defined(__BORLANDC__)
+}
+
+// Gets the number of successful test suites.
+int UnitTest::successful_test_suite_count() const {
+  return impl()->successful_test_suite_count();
+}
+
+// Gets the number of failed test suites.
+int UnitTest::failed_test_suite_count() const {
+  return impl()->failed_test_suite_count();
+}
+
+// Gets the number of all test suites.
+int UnitTest::total_test_suite_count() const {
+  return impl()->total_test_suite_count();
+}
+
+// Gets the number of all test suites that contain at least one test
+// that should run.
+int UnitTest::test_suite_to_run_count() const {
+  return impl()->test_suite_to_run_count();
+}
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+int UnitTest::successful_test_case_count() const {
+  return impl()->successful_test_suite_count();
+}
+int UnitTest::failed_test_case_count() const {
+  return impl()->failed_test_suite_count();
+}
+int UnitTest::total_test_case_count() const {
+  return impl()->total_test_suite_count();
+}
+int UnitTest::test_case_to_run_count() const {
+  return impl()->test_suite_to_run_count();
+}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+// Gets the number of successful tests.
+int UnitTest::successful_test_count() const {
+  return impl()->successful_test_count();
+}
+
+// Gets the number of skipped tests.
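+// A test counts as skipped when it bails out via GTEST_SKIP(); a minimal
+// sketch (HasNetwork() is a hypothetical helper, not part of this file):
+//
+//   TEST(Connectivity, ResolvesHost) {
+//     if (!HasNetwork()) GTEST_SKIP() << "no network on this machine";
+//     // ... assertions that need the network ...
+//   }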
+int UnitTest::skipped_test_count() const {
+  return impl()->skipped_test_count();
+}
+
+// Gets the number of failed tests.
+int UnitTest::failed_test_count() const { return impl()->failed_test_count(); }
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int UnitTest::reportable_disabled_test_count() const {
+  return impl()->reportable_disabled_test_count();
+}
+
+// Gets the number of disabled tests.
+int UnitTest::disabled_test_count() const {
+  return impl()->disabled_test_count();
+}
+
+// Gets the number of tests to be printed in the XML report.
+int UnitTest::reportable_test_count() const {
+  return impl()->reportable_test_count();
+}
+
+// Gets the number of all tests.
+int UnitTest::total_test_count() const { return impl()->total_test_count(); }
+
+// Gets the number of tests that should run.
+int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
+
+// Gets the time of the test program start, in ms from the start of the
+// UNIX epoch.
+internal::TimeInMillis UnitTest::start_timestamp() const {
+  return impl()->start_timestamp();
+}
+
+// Gets the elapsed time, in milliseconds.
+internal::TimeInMillis UnitTest::elapsed_time() const {
+  return impl()->elapsed_time();
+}
+
+// Returns true if and only if the unit test passed (i.e. all test suites
+// passed).
+bool UnitTest::Passed() const { return impl()->Passed(); }
+
+// Returns true if and only if the unit test failed (i.e. some test suite
+// failed or something outside of all tests failed).
+bool UnitTest::Failed() const { return impl()->Failed(); }
+
+// Gets the i-th test suite among all the test suites. i can range from 0 to
+// total_test_suite_count() - 1. If i is not in that range, returns NULL.
+const TestSuite *UnitTest::GetTestSuite(int i) const {
+  return impl()->GetTestSuite(i);
+}
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+const TestCase *UnitTest::GetTestCase(int i) const {
+  return impl()->GetTestCase(i);
+}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+// Returns the TestResult containing information on test failures and
+// properties logged outside of individual test suites.
+const TestResult &UnitTest::ad_hoc_test_result() const {
+  return *impl()->ad_hoc_test_result();
+}
+
+// Gets the i-th test suite among all the test suites. i can range from 0 to
+// total_test_suite_count() - 1. If i is not in that range, returns NULL.
+TestSuite *UnitTest::GetMutableTestSuite(int i) {
+  return impl()->GetMutableSuiteCase(i);
+}
+
+// Returns the list of event listeners that can be used to track events
+// inside Google Test.
+TestEventListeners &UnitTest::listeners() { return *impl()->listeners(); }
+
+// Registers and returns a global test environment.  When a test
+// program is run, all global test environments will be set-up in the
+// order they were registered.  After all tests in the program have
+// finished, all global test environments will be torn-down in the
+// *reverse* order they were registered.
+//
+// The UnitTest object takes ownership of the given environment.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+Environment *UnitTest::AddEnvironment(Environment *env) {
+  if (env == nullptr) {
+    return nullptr;
+  }
+
+  impl_->environments().push_back(env);
+  return env;
+}
+
+// Adds a TestPartResult to the current TestResult object.  All Google Test
+// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
+// this to report their results.  The user code should use the
+// assertion macros instead of calling this directly.
+void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
+                                 const char *file_name, int line_number,
+                                 const std::string &message,
+                                 const std::string &os_stack_trace)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  Message msg;
+  msg << message;
+
+  internal::MutexLock lock(&mutex_);
+  if (impl_->gtest_trace_stack().size() > 0) {
+    msg << "\n" << GTEST_NAME_ << " trace:";
+
+    for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) {
+      const internal::TraceInfo &trace = impl_->gtest_trace_stack()[i - 1];
+      msg << "\n"
+          << internal::FormatFileLocation(trace.file, trace.line) << " "
+          << trace.message;
+    }
+  }
+
+  if (os_stack_trace.c_str() != nullptr && !os_stack_trace.empty()) {
+    msg << internal::kStackTraceMarker << os_stack_trace;
+  }
+
+  const TestPartResult result = TestPartResult(
+      result_type, file_name, line_number, msg.GetString().c_str());
+  impl_->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult(
+      result);
+
+  if (result_type != TestPartResult::kSuccess &&
+      result_type != TestPartResult::kSkip) {
+    // gtest_break_on_failure takes precedence over
+    // gtest_throw_on_failure.  This allows a user to set the latter
+    // in the code (perhaps in order to use Google Test assertions
+    // with another testing framework) and specify the former on the
+    // command line for debugging.
+    if (GTEST_FLAG(break_on_failure)) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+      // Using DebugBreak on Windows allows gtest to still break into a debugger
+      // when a failure happens and both the --gtest_break_on_failure and
+      // the --gtest_catch_exceptions flags are specified.
+      DebugBreak();
+#elif (!defined(__native_client__)) &&            \
+    ((defined(__clang__) || defined(__GNUC__)) && \
+     (defined(__x86_64__) || defined(__i386__)))
+      // with clang/gcc we can achieve the same effect on x86 by invoking int3
+      asm("int3");
+#else
+      // Dereference nullptr through a volatile pointer to prevent the compiler
+      // from removing.  We use this rather than abort() or __builtin_trap() for
+      // portability: some debuggers don't correctly trap abort().
+      *static_cast<volatile int *>(nullptr) = 1;
+#endif  // GTEST_OS_WINDOWS
+    } else if (GTEST_FLAG(throw_on_failure)) {
+#if GTEST_HAS_EXCEPTIONS
+      throw internal::GoogleTestFailureException(result);
+#else
+      // We cannot call abort() as it generates a pop-up in debug mode
+      // that cannot be suppressed in VC 7.1 or below.
+      exit(1);
+#endif
+    }
+  }
+}
+
+// Adds a TestProperty to the current TestResult object when invoked from
+// inside a test, to current TestSuite's ad_hoc_test_result_ when invoked
+// from SetUpTestSuite or TearDownTestSuite, or to the global property set
+// when invoked elsewhere.  If the result already contains a property with
+// the same key, the value will be updated.
+void UnitTest::RecordProperty(const std::string &key,
+                              const std::string &value) {
+  impl_->RecordProperty(TestProperty(key, value));
+}
+
+// Runs all tests in this UnitTest object and prints the result.
+// Returns 0 if successful, or 1 otherwise.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
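+//
+// For orientation: RUN_ALL_TESTS() boils down to a call of this method, so
+// a typical embedding (mirroring gtest_main.cc further below) is simply:
+//
+//   int main(int argc, char **argv) {
+//     testing::InitGoogleTest(&argc, argv);
+//     return RUN_ALL_TESTS();  // i.e. UnitTest::GetInstance()->Run()
+//   }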
+int UnitTest::Run() {
+  const bool in_death_test_child_process =
+      internal::GTEST_FLAG(internal_run_death_test).length() > 0;
+
+  // Google Test implements this protocol for catching that a test
+  // program exits before returning control to Google Test:
+  //
+  //   1. Upon start, Google Test creates a file whose absolute path
+  //      is specified by the environment variable
+  //      TEST_PREMATURE_EXIT_FILE.
+  //   2. When Google Test has finished its work, it deletes the file.
+  //
+  // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before
+  // running a Google-Test-based test program and check the existence
+  // of the file at the end of the test execution to see if it has
+  // exited prematurely.
+
+  // If we are in the child process of a death test, don't
+  // create/delete the premature exit file, as doing so is unnecessary
+  // and will confuse the parent process.  Otherwise, create/delete
+  // the file upon entering/leaving this function.  If the program
+  // somehow exits before this function has a chance to return, the
+  // premature-exit file will be left undeleted, causing a test runner
+  // that understands the premature-exit-file protocol to report the
+  // test as having failed.
+  const internal::ScopedPrematureExitFile premature_exit_file(
+      in_death_test_child_process
+          ? nullptr
+          : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE"));
+
+  // Captures the value of GTEST_FLAG(catch_exceptions).  This value will be
+  // used for the duration of the program.
+  impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
+
+#if GTEST_OS_WINDOWS
+  // Either the user wants Google Test to catch exceptions thrown by the
+  // tests or this is executing in the context of death test child
+  // process. In either case the user does not want to see pop-up dialogs
+  // about crashes - they are expected.
+  if (impl()->catch_exceptions() || in_death_test_child_process) {
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+    // SetErrorMode doesn't exist on CE.
+    SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
+                 SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
+#endif  // !GTEST_OS_WINDOWS_MOBILE
+
+#if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+    // Death test children can be terminated with _abort().  On Windows,
+    // _abort() can show a dialog with a warning message.  This forces the
+    // abort message to go to stderr instead.
+    _set_error_mode(_OUT_TO_STDERR);
+#endif
+
+#if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE
+    // In the debug version, Visual Studio pops up a separate dialog
+    // offering a choice to debug the aborted program. We need to suppress
+    // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
+    // executed. Google Test will notify the user of any unexpected
+    // failure via stderr.
+    if (!GTEST_FLAG(break_on_failure))
+      _set_abort_behavior(
+          0x0,                                    // Clear the following flags:
+          _WRITE_ABORT_MSG | _CALL_REPORTFAULT);  // pop-up window, core dump.
+
+    // In debug mode, the Windows CRT can crash with an assertion over invalid
+    // input (e.g. passing an invalid file descriptor).  The default handling
+    // for these assertions is to pop up a dialog and wait for user input.
+    // Instead ask the CRT to dump such assertions to stderr non-interactively.
+    if (!IsDebuggerPresent()) {
+      (void)_CrtSetReportMode(_CRT_ASSERT,
+                              _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG);
+      (void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR);
+    }
+#endif
+  }
+#endif  // GTEST_OS_WINDOWS
+
+  return internal::HandleExceptionsInMethodIfSupported(
+             impl(), &internal::UnitTestImpl::RunAllTests,
+             "auxiliary test code (environments or event listeners)")
+             ? 0
+             : 1;
+}
+
+// Returns the working directory when the first TEST() or TEST_F() was
+// executed.
+const char *UnitTest::original_working_dir() const {
+  return impl_->original_working_dir_.c_str();
+}
+
+// Returns the TestSuite object for the test that's currently running,
+// or NULL if no test is running.
+const TestSuite *UnitTest::current_test_suite() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_suite();
+}
+
+// Legacy API is still available but deprecated
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+const TestCase *UnitTest::current_test_case() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_suite();
+}
+#endif
+
+// Returns the TestInfo object for the test that's currently running,
+// or NULL if no test is running.
+const TestInfo *UnitTest::current_test_info() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_info();
+}
+
+// Returns the random seed used at the start of the current test run.
+int UnitTest::random_seed() const { return impl_->random_seed(); }
+
+// Returns ParameterizedTestSuiteRegistry object used to keep track of
+// value-parameterized tests and instantiate and register them.
+internal::ParameterizedTestSuiteRegistry &
+UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) {
+  return impl_->parameterized_test_registry();
+}
+
+// Creates an empty UnitTest.
+UnitTest::UnitTest() { impl_ = new internal::UnitTestImpl(this); }
+
+// Destructor of UnitTest.
+UnitTest::~UnitTest() { delete impl_; }
+
+// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+// Google Test trace stack.
+void UnitTest::PushGTestTrace(const internal::TraceInfo &trace)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().push_back(trace);
+}
+
+// Pops a trace from the per-thread Google Test trace stack.
+void UnitTest::PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().pop_back();
+}
+
+namespace internal {
+
+UnitTestImpl::UnitTestImpl(UnitTest *parent)
+    : parent_(parent),
+      GTEST_DISABLE_MSC_WARNINGS_PUSH_(4355 /* using this in initializer */)
+          default_global_test_part_result_reporter_(this),
+      default_per_thread_test_part_result_reporter_(this),
+      GTEST_DISABLE_MSC_WARNINGS_POP_() global_test_part_result_repoter_(
+          &default_global_test_part_result_reporter_),
+      per_thread_test_part_result_reporter_(
+          &default_per_thread_test_part_result_reporter_),
+      parameterized_test_registry_(), parameterized_tests_registered_(false),
+      last_death_test_suite_(-1), current_test_suite_(nullptr),
+      current_test_info_(nullptr), ad_hoc_test_result_(),
+      os_stack_trace_getter_(nullptr), post_flag_parse_init_performed_(false),
+      random_seed_(0),  // Will be overridden by the flag before first use.
+      random_(0),       // Will be reseeded before first use.
+      start_timestamp_(0), elapsed_time_(0),
+#if GTEST_HAS_DEATH_TEST
+      death_test_factory_(new DefaultDeathTestFactory),
+#endif
+      // Will be overridden by the flag before first use.
+      catch_exceptions_(false) {
+  listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter);
+}
+
+UnitTestImpl::~UnitTestImpl() {
+  // Deletes every TestSuite.
+  ForEach(test_suites_, internal::Delete<TestSuite>);
+
+  // Deletes every Environment.
+  ForEach(environments_, internal::Delete<Environment>);
+
+  delete os_stack_trace_getter_;
+}
+
+// Adds a TestProperty to the current TestResult object when invoked in a
+// context of a test, to current test suite's ad_hoc_test_result when invoked
+// from SetUpTestSuite/TearDownTestSuite, or to the global property set
+// otherwise.  If the result already contains a property with the same key,
+// the value will be updated.
+void UnitTestImpl::RecordProperty(const TestProperty &test_property) {
+  std::string xml_element;
+  TestResult *test_result;  // TestResult appropriate for property recording.
+
+  if (current_test_info_ != nullptr) {
+    xml_element = "testcase";
+    test_result = &(current_test_info_->result_);
+  } else if (current_test_suite_ != nullptr) {
+    xml_element = "testsuite";
+    test_result = &(current_test_suite_->ad_hoc_test_result_);
+  } else {
+    xml_element = "testsuites";
+    test_result = &ad_hoc_test_result_;
+  }
+  test_result->RecordProperty(xml_element, test_property);
+}
+
+#if GTEST_HAS_DEATH_TEST
+// Disables event forwarding if the control is currently in a death test
+// subprocess. Must not be called before InitGoogleTest.
+void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
+  if (internal_run_death_test_flag_.get() != nullptr)
+    listeners()->SuppressEventForwarding();
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Initializes event listeners performing XML output as specified by
+// UnitTestOptions. Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureXmlOutput() {
+  const std::string &output_format = UnitTestOptions::GetOutputFormat();
+  if (output_format == "xml") {
+    listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
+  } else if (output_format == "json") {
+    listeners()->SetDefaultXmlGenerator(new JsonUnitTestResultPrinter(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
+  } else if (output_format != "") {
+    GTEST_LOG_(WARNING) << "WARNING: unrecognized output format \""
+                        << output_format << "\" ignored.";
+  }
+}
+
+#if GTEST_CAN_STREAM_RESULTS_
+// Initializes event listeners for streaming test results in string form.
+// Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureStreamingOutput() {
+  const std::string &target = GTEST_FLAG(stream_result_to);
+  if (!target.empty()) {
+    const size_t pos = target.find(':');
+    if (pos != std::string::npos) {
+      listeners()->Append(
+          new StreamingListener(target.substr(0, pos), target.substr(pos + 1)));
+    } else {
+      GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target
+                          << "\" ignored.";
+    }
+  }
+}
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+// Performs initialization dependent upon flag values obtained in
+// ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+// ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
+// this function is also called from RunAllTests.  Since this function can be
+// called more than once, it has to be idempotent.
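+// Both public entry points funnel into it, e.g.:
+//
+//   testing::InitGoogleTest(&argc, argv);  // parses flags, then calls this
+//   RUN_ALL_TESTS();                       // RunAllTests() calls it again,
+//                                          // which is a no-op the second time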
+void UnitTestImpl::PostFlagParsingInit() {
+  // Ensures that this function does not execute more than once.
+  if (!post_flag_parse_init_performed_) {
+    post_flag_parse_init_performed_ = true;
+
+#if defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_)
+    // Register to send notifications about key process state changes.
+    listeners()->Append(new GTEST_CUSTOM_TEST_EVENT_LISTENER_());
+#endif  // defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_)
+
+#if GTEST_HAS_DEATH_TEST
+    InitDeathTestSubprocessControlInfo();
+    SuppressTestEventsIfInSubprocess();
+#endif  // GTEST_HAS_DEATH_TEST
+
+    // Registers parameterized tests.  This makes parameterized tests
+    // available to the UnitTest reflection API without running
+    // RUN_ALL_TESTS.
+    RegisterParameterizedTests();
+
+    // Configures listeners for XML output.  This makes it possible for users
+    // to shut down the default XML output before invoking RUN_ALL_TESTS.
+    ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+    // Configures listeners for streaming test results to the specified server.
+    ConfigureStreamingOutput();
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+#if GTEST_HAS_ABSL
+    if (GTEST_FLAG(install_failure_signal_handler)) {
+      absl::FailureSignalHandlerOptions options;
+      absl::InstallFailureSignalHandler(options);
+    }
+#endif  // GTEST_HAS_ABSL
+  }
+}
+
+// A predicate that checks the name of a TestSuite against a known
+// value.
+//
+// This is used for implementation of the UnitTest class only.  We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestSuiteNameIs is copyable.
+class TestSuiteNameIs {
+ public:
+  // Constructor.
+  explicit TestSuiteNameIs(const std::string &name) : name_(name) {}
+
+  // Returns true if and only if the name of test_suite matches name_.
+  bool operator()(const TestSuite *test_suite) const {
+    return test_suite != nullptr &&
+           strcmp(test_suite->name(), name_.c_str()) == 0;
+  }
+
+ private:
+  std::string name_;
+};
+
+// Finds and returns a TestSuite with the given name.  If one doesn't
+// exist, creates one and returns it.  It's the CALLER'S
+// RESPONSIBILITY to ensure that this function is only called WHEN THE
+// TESTS ARE NOT SHUFFLED.
+//
+// Arguments:
+//
+//   test_suite_name: name of the test suite
+//   type_param:      the name of the test suite's type parameter, or NULL if
+//                    this is not a typed or a type-parameterized test suite.
+//   set_up_tc:       pointer to the function that sets up the test suite
+//   tear_down_tc:    pointer to the function that tears down the test suite
+TestSuite *UnitTestImpl::GetTestSuite(
+    const char *test_suite_name, const char *type_param,
+    internal::SetUpTestSuiteFunc set_up_tc,
+    internal::TearDownTestSuiteFunc tear_down_tc) {
+  // Can we find a TestSuite with the given name?
+  const auto test_suite =
+      std::find_if(test_suites_.rbegin(), test_suites_.rend(),
+                   TestSuiteNameIs(test_suite_name));
+
+  if (test_suite != test_suites_.rend()) return *test_suite;
+
+  // No.  Let's create one.
+  auto *const new_test_suite =
+      new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc);
+
+  // Is this a death test suite?
+  if (internal::UnitTestOptions::MatchesFilter(test_suite_name,
+                                               kDeathTestSuiteFilter)) {
+    // Yes.  Inserts the test suite after the last death test suite
+    // defined so far.  This only works when the test suites haven't
+    // been shuffled.  Otherwise we may end up running a death test
+    // after a non-death test.
+    ++last_death_test_suite_;
+    test_suites_.insert(test_suites_.begin() + last_death_test_suite_,
+                        new_test_suite);
+  } else {
+    // No.  Appends to the end of the list.
+    test_suites_.push_back(new_test_suite);
+  }
+
+  test_suite_indices_.push_back(static_cast<int>(test_suite_indices_.size()));
+  return new_test_suite;
+}
+
+// Helpers for setting up / tearing down the given environment.  They
+// are for use in the ForEach() function.
+static void SetUpEnvironment(Environment *env) { env->SetUp(); }
+static void TearDownEnvironment(Environment *env) { env->TearDown(); }
+
+// Runs all tests in this UnitTest object, prints the result, and
+// returns true if all tests are successful.  If any exception is
+// thrown during a test, the test is considered to be failed, but the
+// rest of the tests will still be run.
+//
+// When parameterized tests are enabled, it expands and registers
+// parameterized tests first in RegisterParameterizedTests().
+// All other functions called from RunAllTests() may safely assume that
+// parameterized tests are ready to be counted and run.
+bool UnitTestImpl::RunAllTests() {
+  // True if and only if Google Test is initialized before RUN_ALL_TESTS() is
+  // called.
+  const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized();
+
+  // Do not run any test if the --help flag was specified.
+  if (g_help_flag) return true;
+
+  // Repeats the call to the post-flag parsing initialization in case the
+  // user didn't call InitGoogleTest.
+  PostFlagParsingInit();
+
+  // Even if sharding is not on, test runners may want to use the
+  // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding
+  // protocol.
+  internal::WriteToShardStatusFileIfNeeded();
+
+  // True if and only if we are in a subprocess for running a thread-safe-style
+  // death test.
+  bool in_subprocess_for_death_test = false;
+
+#if GTEST_HAS_DEATH_TEST
+  in_subprocess_for_death_test =
+      (internal_run_death_test_flag_.get() != nullptr);
+#if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+  if (in_subprocess_for_death_test) {
+    GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_();
+  }
+#endif  // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+#endif  // GTEST_HAS_DEATH_TEST
+
+  const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
+                                        in_subprocess_for_death_test);
+
+  // Compares the full test names with the filter to decide which
+  // tests to run.
+  const bool has_tests_to_run =
+      FilterTests(should_shard ? HONOR_SHARDING_PROTOCOL
+                               : IGNORE_SHARDING_PROTOCOL) > 0;
+
+  // Lists the tests and exits if the --gtest_list_tests flag was specified.
+  if (GTEST_FLAG(list_tests)) {
+    // This must be called *after* FilterTests() has been called.
+    ListTestsMatchingFilter();
+    return true;
+  }
+
+  random_seed_ =
+      GTEST_FLAG(shuffle) ? GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
+
+  // True if and only if at least one test has failed.
+  bool failed = false;
+
+  TestEventListener *repeater = listeners()->repeater();
+
+  start_timestamp_ = GetTimeInMillis();
+  repeater->OnTestProgramStart(*parent_);
+
+  // How many times to repeat the tests?  We don't want to repeat them
+  // when we are inside the subprocess of a death test.
+  const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
+  // Repeats forever if the repeat count is negative.
+  const bool gtest_repeat_forever = repeat < 0;
+  for (int i = 0; gtest_repeat_forever || i != repeat; i++) {
+    // We want to preserve failures generated by ad-hoc test
+    // assertions executed before RUN_ALL_TESTS().
+    ClearNonAdHocTestResult();
+
+    const TimeInMillis start = GetTimeInMillis();
+
+    // Shuffles test suites and tests if requested.
+    if (has_tests_to_run && GTEST_FLAG(shuffle)) {
+      random()->Reseed(static_cast<uint32_t>(random_seed_));
+      // This should be done before calling OnTestIterationStart(),
+      // such that a test event listener can see the actual test order
+      // in the event.
+      ShuffleTests();
+    }
+
+    // Tells the unit test event listeners that the tests are about to start.
+    repeater->OnTestIterationStart(*parent_, i);
+
+    // Runs each test suite if there is at least one test to run.
+    if (has_tests_to_run) {
+      // Sets up all environments beforehand.
+      repeater->OnEnvironmentsSetUpStart(*parent_);
+      ForEach(environments_, SetUpEnvironment);
+      repeater->OnEnvironmentsSetUpEnd(*parent_);
+
+      // Runs the tests only if there was no fatal failure or skip triggered
+      // during global set-up.
+      if (Test::IsSkipped()) {
+        // Emit diagnostics when global set-up calls skip, as it will not be
+        // emitted by default.
+        TestResult &test_result =
+            *internal::GetUnitTestImpl()->current_test_result();
+        for (int j = 0; j < test_result.total_part_count(); ++j) {
+          const TestPartResult &test_part_result =
+              test_result.GetTestPartResult(j);
+          if (test_part_result.type() == TestPartResult::kSkip) {
+            const std::string &result = test_part_result.message();
+            printf("%s\n", result.c_str());
+          }
+        }
+        fflush(stdout);
+      } else if (!Test::HasFatalFailure()) {
+        for (int test_index = 0; test_index < total_test_suite_count();
+             test_index++) {
+          GetMutableSuiteCase(test_index)->Run();
+        }
+      }
+
+      // Tears down all environments in reverse order afterwards.
+      repeater->OnEnvironmentsTearDownStart(*parent_);
+      std::for_each(environments_.rbegin(), environments_.rend(),
+                    TearDownEnvironment);
+      repeater->OnEnvironmentsTearDownEnd(*parent_);
+    }
+
+    elapsed_time_ = GetTimeInMillis() - start;
+
+    // Tells the unit test event listener that the tests have just finished.
+    repeater->OnTestIterationEnd(*parent_, i);
+
+    // Gets the result and clears it.
+    if (!Passed()) {
+      failed = true;
+    }
+
+    // Restores the original test order after the iteration.  This
+    // allows the user to quickly repro a failure that happens in the
+    // N-th iteration without repeating the first (N - 1) iterations.
+    // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in
+    // case the user somehow changes the value of the flag somewhere
+    // (it's always safe to unshuffle the tests).
+    UnshuffleTests();
+
+    if (GTEST_FLAG(shuffle)) {
+      // Picks a new random seed for each iteration.
+      random_seed_ = GetNextRandomSeed(random_seed_);
+    }
+  }
+
+  repeater->OnTestProgramEnd(*parent_);
+
+  if (!gtest_is_initialized_before_run_all_tests) {
+    ColoredPrintf(
+        COLOR_RED,
+        "\nIMPORTANT NOTICE - DO NOT IGNORE:\n"
+        "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_
+        "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_
+        " will start to enforce the valid usage. "
+        "Please fix it ASAP, or IT WILL START TO FAIL.\n");  // NOLINT
+#if GTEST_FOR_GOOGLE_
+    ColoredPrintf(COLOR_RED,
+                  "For more details, see http://wiki/Main/ValidGUnitMain.\n");
+#endif  // GTEST_FOR_GOOGLE_
+  }
+
+  return !failed;
+}
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present.  If a file already exists at this location, this
+// function will write over it.  If the variable is present, but the file cannot
+// be created, prints an error and exits.
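+// A runner-side sketch of that handshake (the file path is illustrative,
+// and the POSIX calls are just for brevity):
+//
+//   setenv("GTEST_SHARD_STATUS_FILE", "/tmp/gtest_shard_status", 1);
+//   /* ...spawn the test binary once and wait for it... */
+//   const bool supports_sharding =
+//       access("/tmp/gtest_shard_status", F_OK) == 0;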
+void WriteToShardStatusFileIfNeeded() {
+  const char *const test_shard_file = posix::GetEnv(kTestShardStatusFile);
+  if (test_shard_file != nullptr) {
+    FILE *const file = posix::FOpen(test_shard_file, "w");
+    if (file == nullptr) {
+      ColoredPrintf(COLOR_RED,
+                    "Could not write to the test shard status file \"%s\" "
+                    "specified by the %s environment variable.\n",
+                    test_shard_file, kTestShardStatusFile);
+      fflush(stdout);
+      exit(EXIT_FAILURE);
+    }
+    fclose(file);
+  }
+}
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values.  If the variables are present,
+// but inconsistent (i.e., shard_index >= total_shards), prints
+// an error and exits.  If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process.  Otherwise, we could filter out death tests we intended to execute.
+bool ShouldShard(const char *total_shards_env, const char *shard_index_env,
+                 bool in_subprocess_for_death_test) {
+  if (in_subprocess_for_death_test) {
+    return false;
+  }
+
+  const int32_t total_shards = Int32FromEnvOrDie(total_shards_env, -1);
+  const int32_t shard_index = Int32FromEnvOrDie(shard_index_env, -1);
+
+  if (total_shards == -1 && shard_index == -1) {
+    return false;
+  } else if (total_shards == -1 && shard_index != -1) {
+    const Message msg = Message() << "Invalid environment variables: you have "
+                                  << kTestShardIndex << " = " << shard_index
+                                  << ", but have left " << kTestTotalShards
+                                  << " unset.\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (total_shards != -1 && shard_index == -1) {
+    const Message msg = Message()
+                        << "Invalid environment variables: you have "
+                        << kTestTotalShards << " = " << total_shards
+                        << ", but have left " << kTestShardIndex << " unset.\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (shard_index < 0 || shard_index >= total_shards) {
+    const Message msg =
+        Message() << "Invalid environment variables: we require 0 <= "
+                  << kTestShardIndex << " < " << kTestTotalShards
+                  << ", but you have " << kTestShardIndex << "=" << shard_index
+                  << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  }
+
+  return total_shards > 1;
+}
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error
+// and aborts.
+int32_t Int32FromEnvOrDie(const char *var, int32_t default_val) {
+  const char *str_val = posix::GetEnv(var);
+  if (str_val == nullptr) {
+    return default_val;
+  }
+
+  int32_t result;
+  if (!ParseInt32(Message() << "The value of environment variable " << var,
+                  str_val, &result)) {
+    exit(EXIT_FAILURE);
+  }
+  return result;
+}
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true if and only if the test should be run on this shard.  The test
+// id is some arbitrary but unique non-negative integer assigned to each test
+// method.  Assumes that 0 <= shard_index < total_shards.
+bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
+  return (test_id % total_shards) == shard_index;
+}
+
+// Compares the name of each test with the user-specified filter to
+// decide whether the test should be run, then records the result in
+// each TestSuite and TestInfo object.
+// If shard_tests == true, further filters tests based on sharding
+// variables in the environment - see
+// https://github.com/google/googletest/blob/master/googletest/docs/advanced.md
+// . Returns the number of tests that should run.
+int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
+  const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL
+                                   ? Int32FromEnvOrDie(kTestTotalShards, -1)
+                                   : -1;
+  const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL
+                                  ? Int32FromEnvOrDie(kTestShardIndex, -1)
+                                  : -1;
+
+  // num_runnable_tests are the number of tests that will
+  // run across all shards (i.e., match filter and are not disabled).
+  // num_selected_tests are the number of tests to be run on
+  // this shard.
+  int num_runnable_tests = 0;
+  int num_selected_tests = 0;
+  for (auto *test_suite : test_suites_) {
+    const std::string &test_suite_name = test_suite->name();
+    test_suite->set_should_run(false);
+
+    for (size_t j = 0; j < test_suite->test_info_list().size(); j++) {
+      TestInfo *const test_info = test_suite->test_info_list()[j];
+      const std::string test_name(test_info->name());
+      // A test is disabled if test suite name or test name matches
+      // kDisableTestFilter.
+      const bool is_disabled = internal::UnitTestOptions::MatchesFilter(
+                                   test_suite_name, kDisableTestFilter) ||
+                               internal::UnitTestOptions::MatchesFilter(
+                                   test_name, kDisableTestFilter);
+      test_info->is_disabled_ = is_disabled;
+
+      const bool matches_filter = internal::UnitTestOptions::FilterMatchesTest(
+          test_suite_name, test_name);
+      test_info->matches_filter_ = matches_filter;
+
+      const bool is_runnable =
+          (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
+          matches_filter;
+
+      const bool is_in_another_shard =
+          shard_tests != IGNORE_SHARDING_PROTOCOL &&
+          !ShouldRunTestOnShard(total_shards, shard_index, num_runnable_tests);
+      test_info->is_in_another_shard_ = is_in_another_shard;
+      const bool is_selected = is_runnable && !is_in_another_shard;
+
+      num_runnable_tests += is_runnable;
+      num_selected_tests += is_selected;
+
+      test_info->should_run_ = is_selected;
+      test_suite->set_should_run(test_suite->should_run() || is_selected);
+    }
+  }
+  return num_selected_tests;
+}
+
+// Prints the given C-string on a single line by replacing all '\n'
+// characters with string "\\n".  If the output takes more than
+// max_length characters, only prints the first max_length characters
+// and "...".
+static void PrintOnOneLine(const char *str, int max_length) {
+  if (str != nullptr) {
+    for (int i = 0; *str != '\0'; ++str) {
+      if (i >= max_length) {
+        printf("...");
+        break;
+      }
+      if (*str == '\n') {
+        printf("\\n");
+        i += 2;
+      } else {
+        printf("%c", *str);
+        ++i;
+      }
+    }
+  }
+}
+
+// Prints the names of the tests matching the user-specified filter flag.
+void UnitTestImpl::ListTestsMatchingFilter() {
+  // Print at most this many characters for each type/value parameter.
+  const int kMaxParamLength = 250;
+
+  for (auto *test_suite : test_suites_) {
+    bool printed_test_suite_name = false;
+
+    for (size_t j = 0; j < test_suite->test_info_list().size(); j++) {
+      const TestInfo *const test_info = test_suite->test_info_list()[j];
+      if (test_info->matches_filter_) {
+        if (!printed_test_suite_name) {
+          printed_test_suite_name = true;
+          printf("%s.", test_suite->name());
+          if (test_suite->type_param() != nullptr) {
+            printf("  # %s = ", kTypeParamLabel);
+            // We print the type parameter on a single line to make
+            // the output easy to parse by a program.
+            PrintOnOneLine(test_suite->type_param(), kMaxParamLength);
+          }
+          printf("\n");
+        }
+        printf("  %s", test_info->name());
+        if (test_info->value_param() != nullptr) {
+          printf("  # %s = ", kValueParamLabel);
+          // We print the value parameter on a single line to make the
+          // output easy to parse by a program.
+          PrintOnOneLine(test_info->value_param(), kMaxParamLength);
+        }
+        printf("\n");
+      }
+    }
+  }
+  fflush(stdout);
+  const std::string &output_format = UnitTestOptions::GetOutputFormat();
+  if (output_format == "xml" || output_format == "json") {
+    FILE *fileout = OpenFileForWriting(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str());
+    std::stringstream stream;
+    if (output_format == "xml") {
+      XmlUnitTestResultPrinter(
+          UnitTestOptions::GetAbsolutePathToOutputFile().c_str())
+          .PrintXmlTestsList(&stream, test_suites_);
+    } else if (output_format == "json") {
+      JsonUnitTestResultPrinter(
+          UnitTestOptions::GetAbsolutePathToOutputFile().c_str())
+          .PrintJsonTestList(&stream, test_suites_);
+    }
+    fprintf(fileout, "%s", StringStreamToString(&stream).c_str());
+    fclose(fileout);
+  }
+}
+
+// Sets the OS stack trace getter.
+//
+// Does nothing if the input and the current OS stack trace getter are
+// the same; otherwise, deletes the old getter and makes the input the
+// current getter.
+void UnitTestImpl::set_os_stack_trace_getter(
+    OsStackTraceGetterInterface *getter) {
+  if (os_stack_trace_getter_ != getter) {
+    delete os_stack_trace_getter_;
+    os_stack_trace_getter_ = getter;
+  }
+}
+
+// Returns the current OS stack trace getter if it is not NULL;
+// otherwise, creates an OsStackTraceGetter, makes it the current
+// getter, and returns it.
+OsStackTraceGetterInterface *UnitTestImpl::os_stack_trace_getter() {
+  if (os_stack_trace_getter_ == nullptr) {
+#ifdef GTEST_OS_STACK_TRACE_GETTER_
+    os_stack_trace_getter_ = new GTEST_OS_STACK_TRACE_GETTER_;
+#else
+    os_stack_trace_getter_ = new OsStackTraceGetter;
+#endif  // GTEST_OS_STACK_TRACE_GETTER_
+  }
+
+  return os_stack_trace_getter_;
+}
+
+// Returns the most specific TestResult currently running.
+TestResult *UnitTestImpl::current_test_result() {
+  if (current_test_info_ != nullptr) {
+    return &current_test_info_->result_;
+  }
+  if (current_test_suite_ != nullptr) {
+    return &current_test_suite_->ad_hoc_test_result_;
+  }
+  return &ad_hoc_test_result_;
+}
+
+// Shuffles all test suites, and the tests within each test suite,
+// making sure that death tests are still run first.
+void UnitTestImpl::ShuffleTests() {
+  // Shuffles the death test suites.
+  ShuffleRange(random(), 0, last_death_test_suite_ + 1, &test_suite_indices_);
+
+  // Shuffles the non-death test suites.
+  ShuffleRange(random(), last_death_test_suite_ + 1,
+               static_cast<int>(test_suites_.size()), &test_suite_indices_);
+
+  // Shuffles the tests inside each test suite.
+  for (auto &test_suite : test_suites_) {
+    test_suite->ShuffleTests(random());
+  }
+}
+
+// Restores the test suites and tests to their order before the first shuffle.
+void UnitTestImpl::UnshuffleTests() {
+  for (size_t i = 0; i < test_suites_.size(); i++) {
+    // Unshuffles the tests in each test suite.
+    test_suites_[i]->UnshuffleTests();
+    // Resets the index of each test suite.
+    test_suite_indices_[i] = static_cast<int>(i);
+  }
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+std::string GetCurrentOsStackTraceExceptTop(UnitTest * /*unit_test*/,
+                                            int skip_count) {
+  // We pass skip_count + 1 to skip this wrapper function in addition
+  // to what the user really wants to skip.
+  return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
+}
+
+// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to
+// suppress unreachable code warnings.
+namespace {
+class ClassUniqueToAlwaysTrue {};
+}  // namespace
+
+bool IsTrue(bool condition) { return condition; }
+
+bool AlwaysTrue() {
+#if GTEST_HAS_EXCEPTIONS
+  // This condition is always false so AlwaysTrue() never actually throws,
+  // but it makes the compiler think that it may throw.
+  if (IsTrue(false)) throw ClassUniqueToAlwaysTrue();
+#endif  // GTEST_HAS_EXCEPTIONS
+  return true;
+}
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false.  None of pstr, *pstr, and prefix can be NULL.
+bool SkipPrefix(const char *prefix, const char **pstr) {
+  const size_t prefix_len = strlen(prefix);
+  if (strncmp(*pstr, prefix, prefix_len) == 0) {
+    *pstr += prefix_len;
+    return true;
+  }
+  return false;
+}
+
+// Parses a string as a command line flag.  The string should have
+// the format "--flag=value".  When def_optional is true, the "=value"
+// part can be omitted.
+//
+// Returns the value of the flag, or NULL if the parsing failed.
+static const char *ParseFlagValue(const char *str, const char *flag,
+                                  bool def_optional) {
+  // str and flag must not be NULL.
+  if (str == nullptr || flag == nullptr) return nullptr;
+
+  // The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
+  const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag;
+  const size_t flag_len = flag_str.length();
+  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
+
+  // Skips the flag name.
+  const char *flag_end = str + flag_len;
+
+  // When def_optional is true, it's OK to not have a "=value" part.
+  if (def_optional && (flag_end[0] == '\0')) {
+    return flag_end;
+  }
+
+  // If def_optional is true and there are more characters after the
+  // flag name, or if def_optional is false, there must be a '=' after
+  // the flag name.
+  if (flag_end[0] != '=') return nullptr;
+
+  // Returns the string after "=".
+  return flag_end + 1;
+}
+
+// Parses a string for a bool flag, in the form of either
+// "--flag=value" or "--flag".
+//
+// In the former case, the value is taken as true as long as it does
+// not start with '0', 'f', or 'F'.
+//
+// In the latter case, the value is taken as true.
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+static bool ParseBoolFlag(const char *str, const char *flag, bool *value) {
+  // Gets the value of the flag as a string.
+  const char *const value_str = ParseFlagValue(str, flag, true);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Converts the string value to a bool.
+  *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
+  return true;
+}
+
+// Parses a string for an int32_t flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseInt32Flag(const char *str, const char *flag, int32_t *value) {
+  // Gets the value of the flag as a string.
+  const char *const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Sets *value to the value of the flag.
+  return ParseInt32(Message() << "The value of flag --" << flag, value_str,
+                    value);
+}
+
+// Parses a string for a string flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+template <typename String>
+static bool ParseStringFlag(const char *str, const char *flag, String *value) {
+  // Gets the value of the flag as a string.
+  const char *const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Sets *value to the value of the flag.
+  *value = value_str;
+  return true;
+}
+
+// Determines whether a string has a prefix that Google Test uses for its
+// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_.
+// If Google Test detects that a command line flag has its prefix but is not
+// recognized, it will print its help message. Flags starting with
+// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
+// internal flags and do not trigger the help message.
+static bool HasGoogleTestFlagPrefix(const char *str) {
+  return (SkipPrefix("--", &str) || SkipPrefix("-", &str) ||
+          SkipPrefix("/", &str)) &&
+         !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
+         (SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
+          SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str));
+}
+
+// Prints a string containing code-encoded text.  The following escape
+// sequences can be used in the string to control the text color:
+//
+//   @@    prints a single '@' character.
+//   @R    changes the color to red.
+//   @G    changes the color to green.
+//   @Y    changes the color to yellow.
+//   @D    changes to the default terminal text color.
+//
+static void PrintColorEncoded(const char *str) {
+  GTestColor color = COLOR_DEFAULT;  // The current color.
+
+  // Conceptually, we split the string into segments divided by escape
+  // sequences.  Then we print one segment at a time.  At the end of
+  // each iteration, the str pointer advances to the beginning of the
+  // next segment.
+  for (;;) {
+    const char *p = strchr(str, '@');
+    if (p == nullptr) {
+      ColoredPrintf(color, "%s", str);
+      return;
+    }
+
+    ColoredPrintf(color, "%s", std::string(str, p).c_str());
+
+    const char ch = p[1];
+    str = p + 2;
+    if (ch == '@') {
+      ColoredPrintf(color, "@");
+    } else if (ch == 'D') {
+      color = COLOR_DEFAULT;
+    } else if (ch == 'R') {
+      color = COLOR_RED;
+    } else if (ch == 'G') {
+      color = COLOR_GREEN;
+    } else if (ch == 'Y') {
+      color = COLOR_YELLOW;
+    } else {
+      --str;
+    }
+  }
+}
+
+static const char kColorEncodedHelpMessage[] =
+    "This program contains tests written using " GTEST_NAME_
+    ". You can use the\n"
+    "following command line flags to control its behavior:\n"
+    "\n"
+    "Test Selection:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "list_tests@D\n"
+    "      List the names of all tests instead of running them. The name of\n"
+    "      TEST(Foo, Bar) is \"Foo.Bar\".\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "filter=@YPOSITIVE_PATTERNS"
+    "[@G-@YNEGATIVE_PATTERNS]@D\n"
+    "      Run only the tests whose name matches one of the positive patterns "
+    "but\n"
+    "      none of the negative patterns. '?' matches any single character; "
+    "'*'\n"
+    "      matches any substring; ':' separates two patterns.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "also_run_disabled_tests@D\n"
+    "      Run all disabled tests too.\n"
+    "\n"
+    "Test Execution:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "repeat=@Y[COUNT]@D\n"
+    "      Run the tests repeatedly; use a negative count to repeat forever.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "shuffle@D\n"
+    "      Randomize tests' orders on every iteration.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "random_seed=@Y[NUMBER]@D\n"
+    "      Random number seed to use for shuffling test orders (between 1 and\n"
+    "      99999, or 0 to use a seed based on the current time).\n"
+    "\n"
+    "Test Output:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
+    "      Enable/disable colored output. The default is @Gauto@D.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "print_time=0@D\n"
+    "      Don't print the elapsed time of each test.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G" GTEST_PATH_SEP_
+    "@Y|@G:@YFILE_PATH]@D\n"
+    "      Generate a JSON or XML report in the given directory or with the "
+    "given\n"
+    "      file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n"
+#if GTEST_CAN_STREAM_RESULTS_
+    "  @G--" GTEST_FLAG_PREFIX_
+    "stream_result_to=@YHOST@G:@YPORT@D\n"
+    "      Stream test results to the given server.\n"
+#endif  // GTEST_CAN_STREAM_RESULTS_
+    "\n"
+    "Assertion Behavior:\n"
+#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+    "  @G--" GTEST_FLAG_PREFIX_
+    "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
+    "      Set the default death test style.\n"
+#endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+    "  @G--" GTEST_FLAG_PREFIX_
+    "break_on_failure@D\n"
+    "      Turn assertion failures into debugger break-points.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "throw_on_failure@D\n"
+    "      Turn assertion failures into C++ exceptions for use by an external\n"
+    "      test framework.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "catch_exceptions=0@D\n"
+    "      Do not report exceptions as test failures. Instead, allow them\n"
+    "      to crash the program or throw a pop-up (on Windows).\n"
+    "\n"
+    "Except for @G--" GTEST_FLAG_PREFIX_
+    "list_tests@D, you can alternatively set "
+    "the corresponding\n"
+    "environment variable of a flag (all letters in upper-case). For example, "
+    "to\n"
+    "disable colored text output, you can either specify "
+    "@G--" GTEST_FLAG_PREFIX_
+    "color=no@D or set\n"
+    "the @G" GTEST_FLAG_PREFIX_UPPER_
+    "COLOR@D environment variable to @Gno@D.\n"
+    "\n"
+    "For more information, please read the " GTEST_NAME_
+    " documentation at\n"
+    "@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
+    "(not one in your own code or tests), please report it to\n"
+    "@G<" GTEST_DEV_EMAIL_ ">@D.\n";
+
+static bool ParseGoogleTestFlag(const char *const arg) {
+  return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
+                       &GTEST_FLAG(also_run_disabled_tests)) ||
+         ParseBoolFlag(arg, kBreakOnFailureFlag,
+                       &GTEST_FLAG(break_on_failure)) ||
+         ParseBoolFlag(arg, kCatchExceptionsFlag,
+                       &GTEST_FLAG(catch_exceptions)) ||
+         ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
+         ParseStringFlag(arg, kDeathTestStyleFlag,
+                         &GTEST_FLAG(death_test_style)) ||
+         ParseBoolFlag(arg, kDeathTestUseFork,
+                       &GTEST_FLAG(death_test_use_fork)) ||
+         ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
+         ParseStringFlag(arg, kInternalRunDeathTestFlag,
+                         &GTEST_FLAG(internal_run_death_test)) ||
+         ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
+         ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
+         ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
+         ParseBoolFlag(arg, kPrintUTF8Flag, &GTEST_FLAG(print_utf8)) ||
+         ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
+         ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
+         ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
+         ParseInt32Flag(arg, kStackTraceDepthFlag,
+                        &GTEST_FLAG(stack_trace_depth)) ||
+         ParseStringFlag(arg, kStreamResultToFlag,
+                         &GTEST_FLAG(stream_result_to)) ||
+         ParseBoolFlag(arg, kThrowOnFailureFlag, &GTEST_FLAG(throw_on_failure));
+}
+
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+static void LoadFlagsFromFile(const std::string &path) {
+  FILE *flagfile = posix::FOpen(path.c_str(), "r");
+  if (!flagfile) {
+    GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG(flagfile)
+                      << "\"";
+  }
+  std::string contents(ReadEntireFile(flagfile));
+  posix::FClose(flagfile);
+  std::vector<std::string> lines;
+  SplitString(contents, '\n', &lines);
+  for (size_t i = 0; i < lines.size(); ++i) {
+    if (lines[i].empty()) continue;
+    if (!ParseGoogleTestFlag(lines[i].c_str())) g_help_flag = true;
+  }
+}
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.  The type parameter CharType can be
+// instantiated to either char or wchar_t.
+template <typename CharType>
+void ParseGoogleTestFlagsOnlyImpl(int *argc, CharType **argv) {
+  for (int i = 1; i < *argc; i++) {
+    const std::string arg_string = StreamableToString(argv[i]);
+    const char *const arg = arg_string.c_str();
+
+    using internal::ParseBoolFlag;
+    using internal::ParseInt32Flag;
+    using internal::ParseStringFlag;
+
+    bool remove_flag = false;
+    if (ParseGoogleTestFlag(arg)) {
+      remove_flag = true;
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+    } else if (ParseStringFlag(arg, kFlagfileFlag, &GTEST_FLAG(flagfile))) {
+      LoadFlagsFromFile(GTEST_FLAG(flagfile));
+      remove_flag = true;
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+    } else if (arg_string == "--help" || arg_string == "-h" ||
+               arg_string == "-?" || arg_string == "/?" ||
+               HasGoogleTestFlagPrefix(arg)) {
+      // Both help flag and unrecognized Google Test flags (excluding
+      // internal ones) trigger help display.
+      g_help_flag = true;
+    }
+
+    if (remove_flag) {
+      // Shift the remainder of the argv list left by one.  Note
+      // that argv has (*argc + 1) elements, the last one always being
+      // NULL.  The following loop moves the trailing NULL element as
+      // well.
+      for (int j = i; j != *argc; j++) {
+        argv[j] = argv[j + 1];
+      }
+
+      // Decrements the argument count.
+      (*argc)--;
+
+      // We also need to decrement the iterator as we just removed
+      // an element.
+      i--;
+    }
+  }
+
+  if (g_help_flag) {
+    // We print the help here instead of in RUN_ALL_TESTS(), as the
+    // latter may not be called at all if the user is using Google
+    // Test with another testing framework.
+    PrintColorEncoded(kColorEncodedHelpMessage);
+  }
+}
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+void ParseGoogleTestFlagsOnly(int *argc, char **argv) {
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+
+  // Fix the value of *_NSGetArgc() on macOS, but if and only if
+  // *_NSGetArgv() == argv
+  // Only applicable to char** version of argv
+#if GTEST_OS_MAC
+#ifndef GTEST_OS_IOS
+  if (*_NSGetArgv() == argv) {
+    *_NSGetArgc() = *argc;
+  }
+#endif
+#endif
+}
+void ParseGoogleTestFlagsOnly(int *argc, wchar_t **argv) {
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+}
+
+// The internal implementation of InitGoogleTest().
+//
+// The type parameter CharType can be instantiated to either char or
+// wchar_t.
+template <typename CharType>
+void InitGoogleTestImpl(int *argc, CharType **argv) {
+  // We don't want to run the initialization code twice.
+  if (GTestIsInitialized()) return;
+
+  if (*argc <= 0) return;
+
+  g_argvs.clear();
+  for (int i = 0; i != *argc; i++) {
+    g_argvs.push_back(StreamableToString(argv[i]));
+  }
+
+#if GTEST_HAS_ABSL
+  absl::InitializeSymbolizer(g_argvs[0].c_str());
+#endif  // GTEST_HAS_ABSL
+
+  ParseGoogleTestFlagsOnly(argc, argv);
+  GetUnitTestImpl()->PostFlagParsingInit();
+}
+
+}  // namespace internal
+
+// Initializes Google Test.  This must be called before calling
+// RUN_ALL_TESTS().  In particular, it parses a command line for the
+// flags that Google Test recognizes.  Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+void InitGoogleTest(int *argc, char **argv) {
+#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
+#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(argc, argv);
+#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+void InitGoogleTest(int *argc, wchar_t **argv) {
+#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
+#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(argc, argv);
+#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+// This overloaded version can be used on Arduino/embedded platforms where
+// there is no argc/argv.
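+// On those platforms the expected entry points look like this (these are
+// exactly what gtest_main.cc below provides):
+//
+//   void setup() { testing::InitGoogleTest(); }
+//   void loop() { RUN_ALL_TESTS(); }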
+void InitGoogleTest() {
+  // Since Arduino doesn't have a command line, fake out the argc/argv arguments
+  int argc = 1;
+  const auto arg0 = "dummy";
+  char *argv0 = const_cast<char *>(arg0);
+  char **argv = &argv0;
+
+#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&argc, argv);
+#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(&argc, argv);
+#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+std::string TempDir() {
+#if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
+  return GTEST_CUSTOM_TEMPDIR_FUNCTION_();
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+  return "\\temp\\";
+#elif GTEST_OS_WINDOWS
+  const char *temp_dir = internal::posix::GetEnv("TEMP");
+  if (temp_dir == nullptr || temp_dir[0] == '\0')
+    return "\\temp\\";
+  else if (temp_dir[strlen(temp_dir) - 1] == '\\')
+    return temp_dir;
+  else
+    return std::string(temp_dir) + "\\";
+#elif GTEST_OS_LINUX_ANDROID
+  const char *temp_dir = internal::posix::GetEnv("TEST_TMPDIR");
+  if (temp_dir == nullptr || temp_dir[0] == '\0')
+    return "/data/local/tmp/";
+  else
+    return temp_dir;
+#else
+  return "/tmp/";
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Class ScopedTrace
+
+// Pushes the given source file location and message onto a per-thread
+// trace stack maintained by Google Test.
+void ScopedTrace::PushTrace(const char *file, int line, std::string message) {
+  internal::TraceInfo trace;
+  trace.file = file;
+  trace.line = line;
+  trace.message.swap(message);
+
+  UnitTest::GetInstance()->PushGTestTrace(trace);
+}
+
+// Pops the info pushed by the c'tor.
+ScopedTrace::~ScopedTrace() GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+  UnitTest::GetInstance()->PopGTestTrace();
+}
+
+}  // namespace testing
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest_main.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest_main.cc
new file mode 100644
index 000000000..77c90ce61
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest_main.cc
@@ -0,0 +1,52 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <stdio.h>
+#include "gtest/gtest.h"
+
+#if GTEST_OS_ESP8266 || GTEST_OS_ESP32
+#if GTEST_OS_ESP8266
+extern "C" {
+#endif
+void setup() { testing::InitGoogleTest(); }
+
+void loop() { RUN_ALL_TESTS(); }
+
+#if GTEST_OS_ESP8266
+}
+#endif
+
+#else
+
+GTEST_API_ int main(int argc, char **argv) {
+  printf("Running main() from %s\n", __FILE__);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+#endif
diff --git a/libs/libaom/src/third_party/libwebm/AUTHORS.TXT b/libs/libaom/src/third_party/libwebm/AUTHORS.TXT
new file mode 100644
index 000000000..9686ac13e
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/AUTHORS.TXT
@@ -0,0 +1,4 @@
+# Names should be added to this file like so:
+# Name or Organization <email address>
+
+Google Inc.
diff --git a/libs/libaom/src/third_party/libwebm/Android.mk b/libs/libaom/src/third_party/libwebm/Android.mk
new file mode 100644
index 000000000..b46ba101d
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/Android.mk
@@ -0,0 +1,17 @@
+LOCAL_PATH:= $(call my-dir)
+
+include $(CLEAR_VARS)
+LOCAL_MODULE:= libwebm
+LOCAL_CPPFLAGS:=-D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS
+LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -std=c++11
+LOCAL_C_INCLUDES:= $(LOCAL_PATH)
+LOCAL_EXPORT_C_INCLUDES:= $(LOCAL_PATH)
+
+LOCAL_SRC_FILES:= common/file_util.cc \
+                  common/hdr_util.cc \
+                  mkvparser/mkvparser.cc \
+                  mkvparser/mkvreader.cc \
+                  mkvmuxer/mkvmuxer.cc \
+                  mkvmuxer/mkvmuxerutil.cc \
+                  mkvmuxer/mkvwriter.cc
+include $(BUILD_STATIC_LIBRARY)
diff --git a/libs/libaom/src/third_party/libwebm/LICENSE.TXT b/libs/libaom/src/third_party/libwebm/LICENSE.TXT
new file mode 100644
index 000000000..7a6f99547
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/LICENSE.TXT
@@ -0,0 +1,30 @@
+Copyright (c) 2010, Google Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution.
+
+  * Neither the name of Google nor the names of its contributors may
+    be used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/libs/libaom/src/third_party/libwebm/PATENTS.TXT b/libs/libaom/src/third_party/libwebm/PATENTS.TXT
new file mode 100644
index 000000000..caedf607e
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/PATENTS.TXT
@@ -0,0 +1,23 @@
+Additional IP Rights Grant (Patents)
+------------------------------------
+
+"These implementations" means the copyrightable works that implement the WebM
+codecs distributed by Google as part of the WebM Project.
+
+Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
+royalty-free, irrevocable (except as stated in this section) patent license to
+make, have made, use, offer to sell, sell, import, transfer, and otherwise
+run, modify and propagate the contents of these implementations of WebM, where
+such license applies only to those patent claims, both currently owned by
+Google and acquired in the future, licensable by Google that are necessarily
+infringed by these implementations of WebM. This grant does not include claims
+that would be infringed only as a consequence of further modification of these
+implementations. If you or your agent or exclusive licensee institute or order
+or agree to the institution of patent litigation or any other patent
+enforcement activity against any entity (including a cross-claim or
+counterclaim in a lawsuit) alleging that any of these implementations of WebM
+or any code incorporated within any of these implementations of WebM
+constitute direct or contributory patent infringement, or inducement of
+patent infringement, then any patent rights granted to you under this License
+for these implementations of WebM shall terminate as of the date such
+litigation is filed.
diff --git a/libs/libaom/src/third_party/libwebm/README.libaom b/libs/libaom/src/third_party/libwebm/README.libaom
new file mode 100644
index 000000000..1e87afd3d
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/README.libaom
@@ -0,0 +1,20 @@
+URL: https://chromium.googlesource.com/webm/libwebm
+Version: 37d9b860ebbf40cb0f6dcb7a6fef452d798062da
+License: BSD
+License File: LICENSE.txt
+
+Description:
+libwebm is used to handle WebM container I/O.
+
+Local Changes:
+Only keep:
+ - Android.mk
+ - AUTHORS.TXT
+ - common/
+    file_util.cc/h
+    hdr_util.cc/h
+    webmids.h
+ - LICENSE.TXT
+ - mkvmuxer/
+ - mkvparser/
+ - PATENTS.TXT
diff --git a/libs/libaom/src/third_party/libwebm/common/file_util.cc b/libs/libaom/src/third_party/libwebm/common/file_util.cc
new file mode 100644
index 000000000..6eb6428b9
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/common/file_util.cc
@@ -0,0 +1,93 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#include "common/file_util.h" + +#include +#ifndef _MSC_VER +#include // close() +#endif + +#include +#include +#include +#include +#include +#include + +namespace libwebm { + +std::string GetTempFileName() { +#if !defined _MSC_VER && !defined __MINGW32__ + std::string temp_file_name_template_str = + std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR") + : ".") + + "/libwebm_temp.XXXXXX"; + char* temp_file_name_template = + new char[temp_file_name_template_str.length() + 1]; + memset(temp_file_name_template, 0, temp_file_name_template_str.length() + 1); + temp_file_name_template_str.copy(temp_file_name_template, + temp_file_name_template_str.length(), 0); + int fd = mkstemp(temp_file_name_template); + std::string temp_file_name = + (fd != -1) ? std::string(temp_file_name_template) : std::string(); + delete[] temp_file_name_template; + if (fd != -1) { + close(fd); + } + return temp_file_name; +#else + char tmp_file_name[_MAX_PATH]; +#if defined _MSC_VER || defined MINGW_HAS_SECURE_API + errno_t err = tmpnam_s(tmp_file_name); +#else + char* fname_pointer = tmpnam(tmp_file_name); + int err = (fname_pointer == &tmp_file_name[0]) ? 0 : -1; +#endif + if (err == 0) { + return std::string(tmp_file_name); + } + return std::string(); +#endif +} + +uint64_t GetFileSize(const std::string& file_name) { + uint64_t file_size = 0; +#ifndef _MSC_VER + struct stat st; + st.st_size = 0; + if (stat(file_name.c_str(), &st) == 0) { +#else + struct _stat st; + st.st_size = 0; + if (_stat(file_name.c_str(), &st) == 0) { +#endif + file_size = st.st_size; + } + return file_size; +} + +bool GetFileContents(const std::string& file_name, std::string* contents) { + std::ifstream file(file_name.c_str()); + *contents = std::string(static_cast(GetFileSize(file_name)), 0); + if (file.good() && contents->size()) { + file.read(&(*contents)[0], contents->size()); + } + return !file.fail(); +} + +TempFileDeleter::TempFileDeleter() { file_name_ = GetTempFileName(); } + +TempFileDeleter::~TempFileDeleter() { + std::ifstream file(file_name_.c_str()); + if (file.good()) { + file.close(); + std::remove(file_name_.c_str()); + } +} + +} // namespace libwebm diff --git a/libs/libaom/src/third_party/libwebm/common/file_util.h b/libs/libaom/src/third_party/libwebm/common/file_util.h new file mode 100644 index 000000000..a87373464 --- /dev/null +++ b/libs/libaom/src/third_party/libwebm/common/file_util.h @@ -0,0 +1,44 @@ +// Copyright (c) 2016 The WebM project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +#ifndef LIBWEBM_COMMON_FILE_UTIL_H_ +#define LIBWEBM_COMMON_FILE_UTIL_H_ + +#include + +#include + +#include "mkvmuxer/mkvmuxertypes.h" // LIBWEBM_DISALLOW_COPY_AND_ASSIGN() + +namespace libwebm { + +// Returns a temporary file name. +std::string GetTempFileName(); + +// Returns size of file specified by |file_name|, or 0 upon failure. +uint64_t GetFileSize(const std::string& file_name); + +// Gets the contents file_name as a string. Returns false on error. +bool GetFileContents(const std::string& file_name, std::string* contents); + +// Manages life of temporary file specified at time of construction. Deletes +// file upon destruction. 
+class TempFileDeleter {
+ public:
+  TempFileDeleter();
+  explicit TempFileDeleter(std::string file_name) : file_name_(file_name) {}
+  ~TempFileDeleter();
+  const std::string& name() const { return file_name_; }
+
+ private:
+  std::string file_name_;
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(TempFileDeleter);
+};
+
+}  // namespace libwebm
+
+#endif  // LIBWEBM_COMMON_FILE_UTIL_H_
diff --git a/libs/libaom/src/third_party/libwebm/common/hdr_util.cc b/libs/libaom/src/third_party/libwebm/common/hdr_util.cc
new file mode 100644
index 000000000..916f7170b
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/common/hdr_util.cc
@@ -0,0 +1,220 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#include "hdr_util.h"
+
+#include <climits>
+#include <cstddef>
+#include <new>
+
+#include "mkvparser/mkvparser.h"
+
+namespace libwebm {
+const int Vp9CodecFeatures::kValueNotPresent = INT_MAX;
+
+bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
+                             PrimaryChromaticityPtr* muxer_pc) {
+  muxer_pc->reset(new (std::nothrow)
+                      mkvmuxer::PrimaryChromaticity(parser_pc.x, parser_pc.y));
+  if (!muxer_pc->get())
+    return false;
+  return true;
+}
+
+bool MasteringMetadataValuePresent(double value) {
+  return value != mkvparser::MasteringMetadata::kValueNotPresent;
+}
+
+bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm,
+                           mkvmuxer::MasteringMetadata* muxer_mm) {
+  if (MasteringMetadataValuePresent(parser_mm.luminance_max))
+    muxer_mm->set_luminance_max(parser_mm.luminance_max);
+  if (MasteringMetadataValuePresent(parser_mm.luminance_min))
+    muxer_mm->set_luminance_min(parser_mm.luminance_min);
+
+  PrimaryChromaticityPtr r_ptr(nullptr);
+  PrimaryChromaticityPtr g_ptr(nullptr);
+  PrimaryChromaticityPtr b_ptr(nullptr);
+  PrimaryChromaticityPtr wp_ptr(nullptr);
+
+  if (parser_mm.r) {
+    if (!CopyPrimaryChromaticity(*parser_mm.r, &r_ptr))
+      return false;
+  }
+  if (parser_mm.g) {
+    if (!CopyPrimaryChromaticity(*parser_mm.g, &g_ptr))
+      return false;
+  }
+  if (parser_mm.b) {
+    if (!CopyPrimaryChromaticity(*parser_mm.b, &b_ptr))
+      return false;
+  }
+  if (parser_mm.white_point) {
+    if (!CopyPrimaryChromaticity(*parser_mm.white_point, &wp_ptr))
+      return false;
+  }
+
+  if (!muxer_mm->SetChromaticity(r_ptr.get(), g_ptr.get(), b_ptr.get(),
+                                 wp_ptr.get())) {
+    return false;
+  }
+
+  return true;
+}
+
+bool ColourValuePresent(long long value) {
+  return value != mkvparser::Colour::kValueNotPresent;
+}
+
+bool CopyColour(const mkvparser::Colour& parser_colour,
+                mkvmuxer::Colour* muxer_colour) {
+  if (!muxer_colour)
+    return false;
+
+  if (ColourValuePresent(parser_colour.matrix_coefficients))
+    muxer_colour->set_matrix_coefficients(parser_colour.matrix_coefficients);
+  if (ColourValuePresent(parser_colour.bits_per_channel))
+    muxer_colour->set_bits_per_channel(parser_colour.bits_per_channel);
+  if (ColourValuePresent(parser_colour.chroma_subsampling_horz)) {
+    muxer_colour->set_chroma_subsampling_horz(
+        parser_colour.chroma_subsampling_horz);
+  }
+  if (ColourValuePresent(parser_colour.chroma_subsampling_vert)) {
+    muxer_colour->set_chroma_subsampling_vert(
+        parser_colour.chroma_subsampling_vert);
+  }
+  if (ColourValuePresent(parser_colour.cb_subsampling_horz))
+    muxer_colour->set_cb_subsampling_horz(parser_colour.cb_subsampling_horz);
+  if (ColourValuePresent(parser_colour.cb_subsampling_vert))
+    muxer_colour->set_cb_subsampling_vert(parser_colour.cb_subsampling_vert);
+  if (ColourValuePresent(parser_colour.chroma_siting_horz))
+    muxer_colour->set_chroma_siting_horz(parser_colour.chroma_siting_horz);
+  if (ColourValuePresent(parser_colour.chroma_siting_vert))
+    muxer_colour->set_chroma_siting_vert(parser_colour.chroma_siting_vert);
+  if (ColourValuePresent(parser_colour.range))
+    muxer_colour->set_range(parser_colour.range);
+  if (ColourValuePresent(parser_colour.transfer_characteristics)) {
+    muxer_colour->set_transfer_characteristics(
+        parser_colour.transfer_characteristics);
+  }
+  if (ColourValuePresent(parser_colour.primaries))
+    muxer_colour->set_primaries(parser_colour.primaries);
+  if (ColourValuePresent(parser_colour.max_cll))
+    muxer_colour->set_max_cll(parser_colour.max_cll);
+  if (ColourValuePresent(parser_colour.max_fall))
+    muxer_colour->set_max_fall(parser_colour.max_fall);
+
+  if (parser_colour.mastering_metadata) {
+    mkvmuxer::MasteringMetadata muxer_mm;
+    if (!CopyMasteringMetadata(*parser_colour.mastering_metadata, &muxer_mm))
+      return false;
+    if (!muxer_colour->SetMasteringMetadata(muxer_mm))
+      return false;
+  }
+  return true;
+}
+
+// Format of VPx private data:
+//
+//    0                   1                   2                   3
+//    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//   |    ID Byte    |             Length            |               |
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+                               |
+//   |                                                               |
+//   :               Bytes 1..Length of Codec Feature                :
+//   |                                                               |
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//
+// ID Byte Format
+// ID byte is an unsigned byte.
+//   0 1 2 3 4 5 6 7
+//  +-+-+-+-+-+-+-+-+
+//  |X|    ID       |
+//  +-+-+-+-+-+-+-+-+
+//
+// The X bit is reserved.
+//
+// See the following link for more information:
+// http://www.webmproject.org/vp9/profiles/
+bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length,
+                          Vp9CodecFeatures* features) {
+  const int kVpxCodecPrivateMinLength = 3;
+  if (!private_data || !features || length < kVpxCodecPrivateMinLength)
+    return false;
+
+  const uint8_t kVp9ProfileId = 1;
+  const uint8_t kVp9LevelId = 2;
+  const uint8_t kVp9BitDepthId = 3;
+  const uint8_t kVp9ChromaSubsamplingId = 4;
+  const int kVpxFeatureLength = 1;
+  int offset = 0;
+
+  // Set features to not set.
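+  // For example, a CodecPrivate of {0x01, 0x01, 0x00, 0x03, 0x01, 0x0A}
+  // (illustrative bytes, not taken from the spec) parses as profile 0 with
+  // bit depth 10, leaving level and chroma subsampling at kValueNotPresent.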
+  features->profile = Vp9CodecFeatures::kValueNotPresent;
+  features->level = Vp9CodecFeatures::kValueNotPresent;
+  features->bit_depth = Vp9CodecFeatures::kValueNotPresent;
+  features->chroma_subsampling = Vp9CodecFeatures::kValueNotPresent;
+  do {
+    const uint8_t id_byte = private_data[offset++];
+    const uint8_t length_byte = private_data[offset++];
+    if (length_byte != kVpxFeatureLength)
+      return false;
+    if (id_byte == kVp9ProfileId) {
+      const int priv_profile = static_cast<int>(private_data[offset++]);
+      if (priv_profile < 0 || priv_profile > 3)
+        return false;
+      if (features->profile != Vp9CodecFeatures::kValueNotPresent &&
+          features->profile != priv_profile) {
+        return false;
+      }
+      features->profile = priv_profile;
+    } else if (id_byte == kVp9LevelId) {
+      const int priv_level = static_cast<int>(private_data[offset++]);
+
+      const int kNumLevels = 14;
+      const int levels[kNumLevels] = {10, 11, 20, 21, 30, 31, 40,
+                                      41, 50, 51, 52, 60, 61, 62};
+
+      for (int i = 0; i < kNumLevels; ++i) {
+        if (priv_level == levels[i]) {
+          if (features->level != Vp9CodecFeatures::kValueNotPresent &&
+              features->level != priv_level) {
+            return false;
+          }
+          features->level = priv_level;
+          break;
+        }
+      }
+      if (features->level == Vp9CodecFeatures::kValueNotPresent)
+        return false;
+    } else if (id_byte == kVp9BitDepthId) {
+      const int priv_profile = static_cast<int>(private_data[offset++]);
+      if (priv_profile != 8 && priv_profile != 10 && priv_profile != 12)
+        return false;
+      if (features->bit_depth != Vp9CodecFeatures::kValueNotPresent &&
+          features->bit_depth != priv_profile) {
+        return false;
+      }
+      features->bit_depth = priv_profile;
+    } else if (id_byte == kVp9ChromaSubsamplingId) {
+      const int priv_profile = static_cast<int>(private_data[offset++]);
+      if (priv_profile != 0 && priv_profile != 2 && priv_profile != 3)
+        return false;
+      if (features->chroma_subsampling != Vp9CodecFeatures::kValueNotPresent &&
+          features->chroma_subsampling != priv_profile) {
+        return false;
+      }
+      features->chroma_subsampling = priv_profile;
+    } else {
+      // Invalid ID.
+      return false;
+    }
+  } while (offset + kVpxCodecPrivateMinLength <= length);
+
+  return true;
+}
+}  // namespace libwebm
diff --git a/libs/libaom/src/third_party/libwebm/common/hdr_util.h b/libs/libaom/src/third_party/libwebm/common/hdr_util.h
new file mode 100644
index 000000000..78e2eeb70
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/common/hdr_util.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef LIBWEBM_COMMON_HDR_UTIL_H_
+#define LIBWEBM_COMMON_HDR_UTIL_H_
+
+#include <stdint.h>
+
+#include <memory>
+
+#include "mkvmuxer/mkvmuxer.h"
+
+namespace mkvparser {
+struct Colour;
+struct MasteringMetadata;
+struct PrimaryChromaticity;
+}  // namespace mkvparser
+
+namespace libwebm {
+// Utility types and functions for working with the Colour element and its
+// children. Copiers return true upon success. Presence functions return true
+// when the specified element is present.
+
+// TODO(tomfinegan): These should be moved to libwebm_utils once c++11 is
+// required by libwebm.
+
+// Features of the VP9 codec that may be set in the CodecPrivate of a VP9 video
+// stream. A value of kValueNotPresent represents that the value was not set in
+// the CodecPrivate.
+struct Vp9CodecFeatures {
+  static const int kValueNotPresent;
+
+  Vp9CodecFeatures()
+      : profile(kValueNotPresent),
+        level(kValueNotPresent),
+        bit_depth(kValueNotPresent),
+        chroma_subsampling(kValueNotPresent) {}
+  ~Vp9CodecFeatures() {}
+
+  int profile;
+  int level;
+  int bit_depth;
+  int chroma_subsampling;
+};
+
+typedef std::unique_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr;
+
+bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
+                             PrimaryChromaticityPtr* muxer_pc);
+
+bool MasteringMetadataValuePresent(double value);
+
+bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm,
+                           mkvmuxer::MasteringMetadata* muxer_mm);
+
+bool ColourValuePresent(long long value);
+
+bool CopyColour(const mkvparser::Colour& parser_colour,
+                mkvmuxer::Colour* muxer_colour);
+
+// Returns true if |features| is set to one or more valid values.
+bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length,
+                          Vp9CodecFeatures* features);
+
+}  // namespace libwebm
+
+#endif  // LIBWEBM_COMMON_HDR_UTIL_H_
diff --git a/libs/libaom/src/third_party/libwebm/common/webmids.h b/libs/libaom/src/third_party/libwebm/common/webmids.h
new file mode 100644
index 000000000..fc0c20814
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/common/webmids.h
@@ -0,0 +1,193 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
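+// (The MkvId values below are the EBML/Matroska element IDs shared by the
+// muxer and the parser.)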
+
+#ifndef COMMON_WEBMIDS_H_
+#define COMMON_WEBMIDS_H_
+
+namespace libwebm {
+
+enum MkvId {
+  kMkvEBML = 0x1A45DFA3,
+  kMkvEBMLVersion = 0x4286,
+  kMkvEBMLReadVersion = 0x42F7,
+  kMkvEBMLMaxIDLength = 0x42F2,
+  kMkvEBMLMaxSizeLength = 0x42F3,
+  kMkvDocType = 0x4282,
+  kMkvDocTypeVersion = 0x4287,
+  kMkvDocTypeReadVersion = 0x4285,
+  kMkvVoid = 0xEC,
+  kMkvSignatureSlot = 0x1B538667,
+  kMkvSignatureAlgo = 0x7E8A,
+  kMkvSignatureHash = 0x7E9A,
+  kMkvSignaturePublicKey = 0x7EA5,
+  kMkvSignature = 0x7EB5,
+  kMkvSignatureElements = 0x7E5B,
+  kMkvSignatureElementList = 0x7E7B,
+  kMkvSignedElement = 0x6532,
+  // segment
+  kMkvSegment = 0x18538067,
+  // Meta Seek Information
+  kMkvSeekHead = 0x114D9B74,
+  kMkvSeek = 0x4DBB,
+  kMkvSeekID = 0x53AB,
+  kMkvSeekPosition = 0x53AC,
+  // Segment Information
+  kMkvInfo = 0x1549A966,
+  kMkvTimecodeScale = 0x2AD7B1,
+  kMkvDuration = 0x4489,
+  kMkvDateUTC = 0x4461,
+  kMkvTitle = 0x7BA9,
+  kMkvMuxingApp = 0x4D80,
+  kMkvWritingApp = 0x5741,
+  // Cluster
+  kMkvCluster = 0x1F43B675,
+  kMkvTimecode = 0xE7,
+  kMkvPrevSize = 0xAB,
+  kMkvBlockGroup = 0xA0,
+  kMkvBlock = 0xA1,
+  kMkvBlockDuration = 0x9B,
+  kMkvReferenceBlock = 0xFB,
+  kMkvLaceNumber = 0xCC,
+  kMkvSimpleBlock = 0xA3,
+  kMkvBlockAdditions = 0x75A1,
+  kMkvBlockMore = 0xA6,
+  kMkvBlockAddID = 0xEE,
+  kMkvBlockAdditional = 0xA5,
+  kMkvDiscardPadding = 0x75A2,
+  // Track
+  kMkvTracks = 0x1654AE6B,
+  kMkvTrackEntry = 0xAE,
+  kMkvTrackNumber = 0xD7,
+  kMkvTrackUID = 0x73C5,
+  kMkvTrackType = 0x83,
+  kMkvFlagEnabled = 0xB9,
+  kMkvFlagDefault = 0x88,
+  kMkvFlagForced = 0x55AA,
+  kMkvFlagLacing = 0x9C,
+  kMkvDefaultDuration = 0x23E383,
+  kMkvMaxBlockAdditionID = 0x55EE,
+  kMkvName = 0x536E,
+  kMkvLanguage = 0x22B59C,
+  kMkvCodecID = 0x86,
+  kMkvCodecPrivate = 0x63A2,
+  kMkvCodecName = 0x258688,
+  kMkvCodecDelay = 0x56AA,
+  kMkvSeekPreRoll = 0x56BB,
+  // video
+  kMkvVideo = 0xE0,
+  kMkvFlagInterlaced = 0x9A,
+  kMkvStereoMode = 0x53B8,
+  kMkvAlphaMode = 0x53C0,
+  kMkvPixelWidth = 0xB0,
+  kMkvPixelHeight = 0xBA,
+  kMkvPixelCropBottom = 0x54AA,
+  kMkvPixelCropTop = 0x54BB,
+  kMkvPixelCropLeft = 0x54CC,
+  kMkvPixelCropRight = 0x54DD,
+  kMkvDisplayWidth = 0x54B0,
+  kMkvDisplayHeight = 0x54BA,
+  kMkvDisplayUnit = 0x54B2,
+  kMkvAspectRatioType = 0x54B3,
+  kMkvColourSpace = 0x2EB524,
+  kMkvFrameRate = 0x2383E3,
+  // end video
+  // colour
+  kMkvColour = 0x55B0,
+  kMkvMatrixCoefficients = 0x55B1,
+  kMkvBitsPerChannel = 0x55B2,
+  kMkvChromaSubsamplingHorz = 0x55B3,
+  kMkvChromaSubsamplingVert = 0x55B4,
+  kMkvCbSubsamplingHorz = 0x55B5,
+  kMkvCbSubsamplingVert = 0x55B6,
+  kMkvChromaSitingHorz = 0x55B7,
+  kMkvChromaSitingVert = 0x55B8,
+  kMkvRange = 0x55B9,
+  kMkvTransferCharacteristics = 0x55BA,
+  kMkvPrimaries = 0x55BB,
+  kMkvMaxCLL = 0x55BC,
+  kMkvMaxFALL = 0x55BD,
+  // mastering metadata
+  kMkvMasteringMetadata = 0x55D0,
+  kMkvPrimaryRChromaticityX = 0x55D1,
+  kMkvPrimaryRChromaticityY = 0x55D2,
+  kMkvPrimaryGChromaticityX = 0x55D3,
+  kMkvPrimaryGChromaticityY = 0x55D4,
+  kMkvPrimaryBChromaticityX = 0x55D5,
+  kMkvPrimaryBChromaticityY = 0x55D6,
+  kMkvWhitePointChromaticityX = 0x55D7,
+  kMkvWhitePointChromaticityY = 0x55D8,
+  kMkvLuminanceMax = 0x55D9,
+  kMkvLuminanceMin = 0x55DA,
+  // end mastering metadata
+  // end colour
+  // projection
+  kMkvProjection = 0x7670,
+  kMkvProjectionType = 0x7671,
+  kMkvProjectionPrivate = 0x7672,
+  kMkvProjectionPoseYaw = 0x7673,
+  kMkvProjectionPosePitch = 0x7674,
+  kMkvProjectionPoseRoll = 0x7675,
+  // end projection
+  // audio
+  kMkvAudio = 0xE1,
+  kMkvSamplingFrequency = 0xB5,
+  kMkvOutputSamplingFrequency = 0x78B5,
+  kMkvChannels = 0x9F,
+  kMkvBitDepth = 0x6264,
+  // end audio
+  // ContentEncodings
+  kMkvContentEncodings = 0x6D80,
+  kMkvContentEncoding = 0x6240,
+  kMkvContentEncodingOrder = 0x5031,
+  kMkvContentEncodingScope = 0x5032,
+  kMkvContentEncodingType = 0x5033,
+  kMkvContentCompression = 0x5034,
+  kMkvContentCompAlgo = 0x4254,
+  kMkvContentCompSettings = 0x4255,
+  kMkvContentEncryption = 0x5035,
+  kMkvContentEncAlgo = 0x47E1,
+  kMkvContentEncKeyID = 0x47E2,
+  kMkvContentSignature = 0x47E3,
+  kMkvContentSigKeyID = 0x47E4,
+  kMkvContentSigAlgo = 0x47E5,
+  kMkvContentSigHashAlgo = 0x47E6,
+  kMkvContentEncAESSettings = 0x47E7,
+  kMkvAESSettingsCipherMode = 0x47E8,
+  kMkvAESSettingsCipherInitData = 0x47E9,
+  // end ContentEncodings
+  // Cueing Data
+  kMkvCues = 0x1C53BB6B,
+  kMkvCuePoint = 0xBB,
+  kMkvCueTime = 0xB3,
+  kMkvCueTrackPositions = 0xB7,
+  kMkvCueTrack = 0xF7,
+  kMkvCueClusterPosition = 0xF1,
+  kMkvCueBlockNumber = 0x5378,
+  // Chapters
+  kMkvChapters = 0x1043A770,
+  kMkvEditionEntry = 0x45B9,
+  kMkvChapterAtom = 0xB6,
+  kMkvChapterUID = 0x73C4,
+  kMkvChapterStringUID = 0x5654,
+  kMkvChapterTimeStart = 0x91,
+  kMkvChapterTimeEnd = 0x92,
+  kMkvChapterDisplay = 0x80,
+  kMkvChapString = 0x85,
+  kMkvChapLanguage = 0x437C,
+  kMkvChapCountry = 0x437E,
+  // Tags
+  kMkvTags = 0x1254C367,
+  kMkvTag = 0x7373,
+  kMkvSimpleTag = 0x67C8,
+  kMkvTagName = 0x45A3,
+  kMkvTagString = 0x4487
+};
+
+}  // namespace libwebm
+
+#endif  // COMMON_WEBMIDS_H_
diff --git a/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.cc
new file mode 100644
index 000000000..512031211
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -0,0 +1,4221 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvmuxer/mkvmuxer.h"
+
+#include <stdint.h>
+
+#include <cfloat>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <memory>
+#include <new>
+#include <string>
+#include <vector>
+
+#include "common/webmids.h"
+#include "mkvmuxer/mkvmuxerutil.h"
+#include "mkvmuxer/mkvwriter.h"
+#include "mkvparser/mkvparser.h"
+
+namespace mkvmuxer {
+
+const float PrimaryChromaticity::kChromaticityMin = 0.0f;
+const float PrimaryChromaticity::kChromaticityMax = 1.0f;
+const float MasteringMetadata::kMinLuminance = 0.0f;
+const float MasteringMetadata::kMinLuminanceMax = 999.99f;
+const float MasteringMetadata::kMaxLuminanceMax = 9999.99f;
+const float MasteringMetadata::kValueNotPresent = FLT_MAX;
+const uint64_t Colour::kValueNotPresent = UINT64_MAX;
+
+namespace {
+
+const char kDocTypeWebm[] = "webm";
+const char kDocTypeMatroska[] = "matroska";
+
+// Deallocate the string designated by |dst|, and then copy the |src|
+// string to |dst|.  The caller owns both the |src| string and the
+// |dst| copy (hence the caller is responsible for eventually
+// deallocating the strings, either directly, or indirectly via
+// StrCpy).  Returns true if the source string was successfully copied
+// to the destination.
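+// For example, StrCpy("webm", &dst) leaves |dst| pointing at a fresh heap
+// copy of "webm", while StrCpy(NULL, &dst) frees |dst| and sets it to NULL.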
+bool StrCpy(const char* src, char** dst_ptr) {
+  if (dst_ptr == NULL)
+    return false;
+
+  char*& dst = *dst_ptr;
+
+  delete[] dst;
+  dst = NULL;
+
+  if (src == NULL)
+    return true;
+
+  const size_t size = strlen(src) + 1;
+
+  dst = new (std::nothrow) char[size];  // NOLINT
+  if (dst == NULL)
+    return false;
+
+  strcpy(dst, src);  // NOLINT
+  return true;
+}
+
+typedef std::unique_ptr<PrimaryChromaticity> PrimaryChromaticityPtr;
+bool CopyChromaticity(const PrimaryChromaticity* src,
+                      PrimaryChromaticityPtr* dst) {
+  if (!dst)
+    return false;
+
+  dst->reset(new (std::nothrow) PrimaryChromaticity(src->x(), src->y()));
+  if (!dst->get())
+    return false;
+
+  return true;
+}
+
+}  // namespace
+
+///////////////////////////////////////////////////////////////
+//
+// IMkvWriter Class
+
+IMkvWriter::IMkvWriter() {}
+
+IMkvWriter::~IMkvWriter() {}
+
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version,
+                     const char* const doc_type) {
+  // Level 0
+  uint64_t size =
+      EbmlElementSize(libwebm::kMkvEBMLVersion, static_cast<uint64>(1));
+  size += EbmlElementSize(libwebm::kMkvEBMLReadVersion, static_cast<uint64>(1));
+  size += EbmlElementSize(libwebm::kMkvEBMLMaxIDLength, static_cast<uint64>(4));
+  size +=
+      EbmlElementSize(libwebm::kMkvEBMLMaxSizeLength, static_cast<uint64>(8));
+  size += EbmlElementSize(libwebm::kMkvDocType, doc_type);
+  size += EbmlElementSize(libwebm::kMkvDocTypeVersion,
+                          static_cast<uint64>(doc_type_version));
+  size +=
+      EbmlElementSize(libwebm::kMkvDocTypeReadVersion, static_cast<uint64>(2));
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvEBML, size))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvEBMLVersion,
+                        static_cast<uint64>(1))) {
+    return false;
+  }
+  if (!WriteEbmlElement(writer, libwebm::kMkvEBMLReadVersion,
+                        static_cast<uint64>(1))) {
+    return false;
+  }
+  if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxIDLength,
+                        static_cast<uint64>(4))) {
+    return false;
+  }
+  if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxSizeLength,
+                        static_cast<uint64>(8))) {
+    return false;
+  }
+  if (!WriteEbmlElement(writer, libwebm::kMkvDocType, doc_type))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeVersion,
+                        static_cast<uint64>(doc_type_version))) {
+    return false;
+  }
+  if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeReadVersion,
+                        static_cast<uint64>(2))) {
+    return false;
+  }
+
+  return true;
+}
+
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version) {
+  return WriteEbmlHeader(writer, doc_type_version, kDocTypeWebm);
+}
+
+bool WriteEbmlHeader(IMkvWriter* writer) {
+  return WriteEbmlHeader(writer, mkvmuxer::Segment::kDefaultDocTypeVersion);
+}
+
+bool ChunkedCopy(mkvparser::IMkvReader* source, mkvmuxer::IMkvWriter* dst,
+                 int64_t start, int64_t size) {
+  // TODO(vigneshv): Check if this is a reasonable value.
+  const uint32_t kBufSize = 2048;
+  uint8_t* buf = new uint8_t[kBufSize];
+  int64_t offset = start;
+  while (size > 0) {
+    const int64_t read_len = (size > kBufSize) ? kBufSize : size;
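+    // Copy at most kBufSize bytes per pass so the scratch buffer is reused.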
+    if (source->Read(offset, static_cast<long>(read_len), buf))
+      return false;
+    dst->Write(buf, static_cast<uint32_t>(read_len));
+    offset += read_len;
+    size -= read_len;
+  }
+  delete[] buf;
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Frame Class
+
+Frame::Frame()
+    : add_id_(0),
+      additional_(NULL),
+      additional_length_(0),
+      duration_(0),
+      duration_set_(false),
+      frame_(NULL),
+      is_key_(false),
+      length_(0),
+      track_number_(0),
+      timestamp_(0),
+      discard_padding_(0),
+      reference_block_timestamp_(0),
+      reference_block_timestamp_set_(false) {}
+
+Frame::~Frame() {
+  delete[] frame_;
+  delete[] additional_;
+}
+
+bool Frame::CopyFrom(const Frame& frame) {
+  delete[] frame_;
+  frame_ = NULL;
+  length_ = 0;
+  if (frame.length() > 0 && frame.frame() != NULL &&
+      !Init(frame.frame(), frame.length())) {
+    return false;
+  }
+  add_id_ = 0;
+  delete[] additional_;
+  additional_ = NULL;
+  additional_length_ = 0;
+  if (frame.additional_length() > 0 && frame.additional() != NULL &&
+      !AddAdditionalData(frame.additional(), frame.additional_length(),
+                         frame.add_id())) {
+    return false;
+  }
+  duration_ = frame.duration();
+  duration_set_ = frame.duration_set();
+  is_key_ = frame.is_key();
+  track_number_ = frame.track_number();
+  timestamp_ = frame.timestamp();
+  discard_padding_ = frame.discard_padding();
+  reference_block_timestamp_ = frame.reference_block_timestamp();
+  reference_block_timestamp_set_ = frame.reference_block_timestamp_set();
+  return true;
+}
+
+bool Frame::Init(const uint8_t* frame, uint64_t length) {
+  uint8_t* const data =
+      new (std::nothrow) uint8_t[static_cast<size_t>(length)];  // NOLINT
+  if (!data)
+    return false;
+
+  delete[] frame_;
+  frame_ = data;
+  length_ = length;
+
+  memcpy(frame_, frame, static_cast<size_t>(length_));
+  return true;
+}
+
+bool Frame::AddAdditionalData(const uint8_t* additional, uint64_t length,
+                              uint64_t add_id) {
+  uint8_t* const data =
+      new (std::nothrow) uint8_t[static_cast<size_t>(length)];  // NOLINT
+  if (!data)
+    return false;
+
+  delete[] additional_;
+  additional_ = data;
+  additional_length_ = length;
+  add_id_ = add_id;
+
+  memcpy(additional_, additional, static_cast<size_t>(additional_length_));
+  return true;
+}
+
+bool Frame::IsValid() const {
+  if (length_ == 0 || !frame_) {
+    return false;
+  }
+  if ((additional_length_ != 0 && !additional_) ||
+      (additional_ != NULL && additional_length_ == 0)) {
+    return false;
+  }
+  if (track_number_ == 0 || track_number_ > kMaxTrackNumber) {
+    return false;
+  }
+  if (!CanBeSimpleBlock() && !is_key_ && !reference_block_timestamp_set_) {
+    return false;
+  }
+  return true;
+}
+
+bool Frame::CanBeSimpleBlock() const {
+  return additional_ == NULL && discard_padding_ == 0 && duration_ == 0;
+}
+
+void Frame::set_duration(uint64_t duration) {
+  duration_ = duration;
+  duration_set_ = true;
+}
+
+void Frame::set_reference_block_timestamp(int64_t reference_block_timestamp) {
+  reference_block_timestamp_ = reference_block_timestamp;
+  reference_block_timestamp_set_ = true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// CuePoint Class
+
+CuePoint::CuePoint()
+    : time_(0),
+      track_(0),
+      cluster_pos_(0),
+      block_number_(1),
+      output_block_number_(true) {}
+
+CuePoint::~CuePoint() {}
+
+bool CuePoint::Write(IMkvWriter* writer) const {
+  if (!writer || track_ < 1 || cluster_pos_ < 1)
+    return false;
+
+  uint64_t size = EbmlElementSize(libwebm::kMkvCueClusterPosition,
+                                  static_cast<uint64>(cluster_pos_));
+  size += EbmlElementSize(libwebm::kMkvCueTrack, static_cast<uint64>(track_));
+  if (output_block_number_ && block_number_ > 1)
+    size += EbmlElementSize(libwebm::kMkvCueBlockNumber,
+                            static_cast<uint64>(block_number_));
+  const uint64_t track_pos_size =
+      EbmlMasterElementSize(libwebm::kMkvCueTrackPositions, size) + size;
+  const uint64_t payload_size =
+      EbmlElementSize(libwebm::kMkvCueTime, static_cast<uint64>(time_)) +
+      track_pos_size;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvCuePoint, payload_size))
+    return false;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvCueTime,
+                        static_cast<uint64>(time_))) {
+    return false;
+  }
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvCueTrackPositions, size))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvCueTrack,
+                        static_cast<uint64>(track_))) {
+    return false;
+  }
+  if (!WriteEbmlElement(writer, libwebm::kMkvCueClusterPosition,
+                        static_cast<uint64>(cluster_pos_))) {
+    return false;
+  }
+  if (output_block_number_ && block_number_ > 1) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvCueBlockNumber,
+                          static_cast<uint64>(block_number_))) {
+      return false;
+    }
+  }
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0)
+    return false;
+
+  if (stop_position - payload_position != static_cast<int64_t>(payload_size))
+    return false;
+
+  return true;
+}
+
+uint64_t CuePoint::PayloadSize() const {
+  uint64_t size = EbmlElementSize(libwebm::kMkvCueClusterPosition,
+                                  static_cast<uint64>(cluster_pos_));
+  size += EbmlElementSize(libwebm::kMkvCueTrack, static_cast<uint64>(track_));
+  if (output_block_number_ && block_number_ > 1)
+    size += EbmlElementSize(libwebm::kMkvCueBlockNumber,
+                            static_cast<uint64>(block_number_));
+  const uint64_t track_pos_size =
+      EbmlMasterElementSize(libwebm::kMkvCueTrackPositions, size) + size;
+  const uint64_t payload_size =
+      EbmlElementSize(libwebm::kMkvCueTime, static_cast<uint64>(time_)) +
+      track_pos_size;
+
+  return payload_size;
+}
+
+uint64_t CuePoint::Size() const {
+  const uint64_t payload_size = PayloadSize();
+  return EbmlMasterElementSize(libwebm::kMkvCuePoint, payload_size) +
+         payload_size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Cues Class
+
+Cues::Cues()
+    : cue_entries_capacity_(0),
+      cue_entries_size_(0),
+      cue_entries_(NULL),
+      output_block_number_(true) {}
+
+Cues::~Cues() {
+  if (cue_entries_) {
+    for (int32_t i = 0; i < cue_entries_size_; ++i) {
+      CuePoint* const cue = cue_entries_[i];
+      delete cue;
+    }
+    delete[] cue_entries_;
+  }
+}
+
+bool Cues::AddCue(CuePoint* cue) {
+  if (!cue)
+    return false;
+
+  if ((cue_entries_size_ + 1) > cue_entries_capacity_) {
+    // Add more CuePoints.
+    const int32_t new_capacity =
+        (!cue_entries_capacity_) ? 2 : cue_entries_capacity_ * 2;
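+    // The cue list grows geometrically (doubling), starting from a
+    // capacity of 2.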
+    if (new_capacity < 1)
+      return false;
+
+    CuePoint** const cues =
+        new (std::nothrow) CuePoint*[new_capacity];  // NOLINT
+    if (!cues)
+      return false;
+
+    for (int32_t i = 0; i < cue_entries_size_; ++i) {
+      cues[i] = cue_entries_[i];
+    }
+
+    delete[] cue_entries_;
+
+    cue_entries_ = cues;
+    cue_entries_capacity_ = new_capacity;
+  }
+
+  cue->set_output_block_number(output_block_number_);
+  cue_entries_[cue_entries_size_++] = cue;
+  return true;
+}
+
+CuePoint* Cues::GetCueByIndex(int32_t index) const {
+  if (cue_entries_ == NULL)
+    return NULL;
+
+  if (index >= cue_entries_size_)
+    return NULL;
+
+  return cue_entries_[index];
+}
+
+uint64_t Cues::Size() {
+  uint64_t size = 0;
+  for (int32_t i = 0; i < cue_entries_size_; ++i)
+    size += GetCueByIndex(i)->Size();
+  size += EbmlMasterElementSize(libwebm::kMkvCues, size);
+  return size;
+}
+
+bool Cues::Write(IMkvWriter* writer) const {
+  if (!writer)
+    return false;
+
+  uint64_t size = 0;
+  for (int32_t i = 0; i < cue_entries_size_; ++i) {
+    const CuePoint* const cue = GetCueByIndex(i);
+
+    if (!cue)
+      return false;
+
+    size += cue->Size();
+  }
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvCues, size))
+    return false;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  for (int32_t i = 0; i < cue_entries_size_; ++i) {
+    const CuePoint* const cue = GetCueByIndex(i);
+
+    if (!cue->Write(writer))
+      return false;
+  }
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0)
+    return false;
+
+  if (stop_position - payload_position != static_cast<int64_t>(size))
+    return false;
+
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// ContentEncAESSettings Class
+
+ContentEncAESSettings::ContentEncAESSettings() : cipher_mode_(kCTR) {}
+
+uint64_t ContentEncAESSettings::Size() const {
+  const uint64_t payload = PayloadSize();
+  const uint64_t size =
+      EbmlMasterElementSize(libwebm::kMkvContentEncAESSettings, payload) +
+      payload;
+  return size;
+}
+
+bool ContentEncAESSettings::Write(IMkvWriter* writer) const {
+  const uint64_t payload = PayloadSize();
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncAESSettings,
+                              payload))
+    return false;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvAESSettingsCipherMode,
+                        static_cast<uint64>(cipher_mode_))) {
+    return false;
+  }
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64_t>(payload))
+    return false;
+
+  return true;
+}
+
+uint64_t ContentEncAESSettings::PayloadSize() const {
+  uint64_t size = EbmlElementSize(libwebm::kMkvAESSettingsCipherMode,
+                                  static_cast<uint64>(cipher_mode_));
+  return size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// ContentEncoding Class
+
+ContentEncoding::ContentEncoding()
+    : enc_algo_(5),
+      enc_key_id_(NULL),
+      encoding_order_(0),
+      encoding_scope_(1),
+      encoding_type_(1),
+      enc_key_id_length_(0) {}
+
+ContentEncoding::~ContentEncoding() { delete[] enc_key_id_; }
+
+bool ContentEncoding::SetEncryptionID(const uint8_t* id, uint64_t length) {
+  if (!id || length < 1)
+    return false;
+
+  delete[] enc_key_id_;
+
+  enc_key_id_ =
+      new (std::nothrow) uint8_t[static_cast<size_t>(length)];  // NOLINT
+  if (!enc_key_id_)
+    return false;
+
+  memcpy(enc_key_id_, id, static_cast<size_t>(length));
+  enc_key_id_length_ = length;
+
+  return true;
+}
+uint64_t ContentEncoding::Size() const {
+  const uint64_t encryption_size = EncryptionSize();
+  const uint64_t encoding_size = EncodingSize(0, encryption_size);
+  const uint64_t encodings_size =
+      EbmlMasterElementSize(libwebm::kMkvContentEncoding, encoding_size) +
+      encoding_size;
+
+  return encodings_size;
+}
+
+bool ContentEncoding::Write(IMkvWriter* writer) const {
+  const uint64_t encryption_size = EncryptionSize();
+  const uint64_t encoding_size = EncodingSize(0, encryption_size);
+  const uint64_t size =
+      EbmlMasterElementSize(libwebm::kMkvContentEncoding, encoding_size) +
+      encoding_size;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncoding,
+                              encoding_size))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingOrder,
+                        static_cast<uint64>(encoding_order_)))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingScope,
+                        static_cast<uint64>(encoding_scope_)))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingType,
+                        static_cast<uint64>(encoding_type_)))
+    return false;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncryption,
+                              encryption_size))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncAlgo,
+                        static_cast<uint64>(enc_algo_))) {
+    return false;
+  }
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncKeyID, enc_key_id_,
+                        enc_key_id_length_))
+    return false;
+
+  if (!enc_aes_settings_.Write(writer))
+    return false;
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64_t>(size))
+    return false;
+
+  return true;
+}
+
+uint64_t ContentEncoding::EncodingSize(uint64_t compresion_size,
+                                       uint64_t encryption_size) const {
+  // TODO(fgalligan): Add support for compression settings.
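+  // Until compression is supported, a nonzero compression size cannot be
+  // represented, so the element size is reported as 0.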
+  if (compresion_size != 0)
+    return 0;
+
+  uint64_t encoding_size = 0;
+
+  if (encryption_size > 0) {
+    encoding_size +=
+        EbmlMasterElementSize(libwebm::kMkvContentEncryption, encryption_size) +
+        encryption_size;
+  }
+  encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingType,
+                                   static_cast<uint64>(encoding_type_));
+  encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingScope,
+                                   static_cast<uint64>(encoding_scope_));
+  encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingOrder,
+                                   static_cast<uint64>(encoding_order_));
+
+  return encoding_size;
+}
+
+uint64_t ContentEncoding::EncryptionSize() const {
+  const uint64_t aes_size = enc_aes_settings_.Size();
+
+  uint64_t encryption_size = EbmlElementSize(libwebm::kMkvContentEncKeyID,
+                                             enc_key_id_, enc_key_id_length_);
+  encryption_size += EbmlElementSize(libwebm::kMkvContentEncAlgo,
+                                     static_cast<uint64>(enc_algo_));
+
+  return encryption_size + aes_size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Track Class
+
+Track::Track(unsigned int* seed)
+    : codec_id_(NULL),
+      codec_private_(NULL),
+      language_(NULL),
+      max_block_additional_id_(0),
+      name_(NULL),
+      number_(0),
+      type_(0),
+      uid_(MakeUID(seed)),
+      codec_delay_(0),
+      seek_pre_roll_(0),
+      default_duration_(0),
+      codec_private_length_(0),
+      content_encoding_entries_(NULL),
+      content_encoding_entries_size_(0) {}
+
+Track::~Track() {
+  delete[] codec_id_;
+  delete[] codec_private_;
+  delete[] language_;
+  delete[] name_;
+
+  if (content_encoding_entries_) {
+    for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+      ContentEncoding* const encoding = content_encoding_entries_[i];
+      delete encoding;
+    }
+    delete[] content_encoding_entries_;
+  }
+}
+
+bool Track::AddContentEncoding() {
+  const uint32_t count = content_encoding_entries_size_ + 1;
+
+  ContentEncoding** const content_encoding_entries =
+      new (std::nothrow) ContentEncoding*[count];  // NOLINT
+  if (!content_encoding_entries)
+    return false;
+
+  ContentEncoding* const content_encoding =
+      new (std::nothrow) ContentEncoding();  // NOLINT
+  if (!content_encoding) {
+    delete[] content_encoding_entries;
+    return false;
+  }
+
+  for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+    content_encoding_entries[i] = content_encoding_entries_[i];
+  }
+
+  delete[] content_encoding_entries_;
+
+  content_encoding_entries_ = content_encoding_entries;
+  content_encoding_entries_[content_encoding_entries_size_] = content_encoding;
+  content_encoding_entries_size_ = count;
+  return true;
+}
+
+ContentEncoding* Track::GetContentEncodingByIndex(uint32_t index) const {
+  if (content_encoding_entries_ == NULL)
+    return NULL;
+
+  if (index >= content_encoding_entries_size_)
+    return NULL;
+
+  return content_encoding_entries_[index];
+}
+
+uint64_t Track::PayloadSize() const {
+  uint64_t size =
+      EbmlElementSize(libwebm::kMkvTrackNumber, static_cast<uint64>(number_));
+  size += EbmlElementSize(libwebm::kMkvTrackUID, static_cast<uint64>(uid_));
+  size += EbmlElementSize(libwebm::kMkvTrackType, static_cast<uint64>(type_));
+  if (codec_id_)
+    size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_);
+  if (codec_private_)
+    size += EbmlElementSize(libwebm::kMkvCodecPrivate, codec_private_,
+                            codec_private_length_);
+  if (language_)
+    size += EbmlElementSize(libwebm::kMkvLanguage, language_);
+  if (name_)
+    size += EbmlElementSize(libwebm::kMkvName, name_);
+  if (max_block_additional_id_) {
+    size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID,
+                            static_cast<uint64>(max_block_additional_id_));
+  }
+  if (codec_delay_) {
+    size += EbmlElementSize(libwebm::kMkvCodecDelay,
+                            static_cast<uint64>(codec_delay_));
+  }
+  if (seek_pre_roll_) {
+    size += EbmlElementSize(libwebm::kMkvSeekPreRoll,
+                            static_cast<uint64>(seek_pre_roll_));
+  }
+  if (default_duration_) {
+    size += EbmlElementSize(libwebm::kMkvDefaultDuration,
+                            static_cast<uint64>(default_duration_));
+  }
+
+  if (content_encoding_entries_size_ > 0) {
+    uint64_t content_encodings_size = 0;
+    for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+      ContentEncoding* const encoding = content_encoding_entries_[i];
+      content_encodings_size += encoding->Size();
+    }
+
+    size += EbmlMasterElementSize(libwebm::kMkvContentEncodings,
+                                  content_encodings_size) +
+            content_encodings_size;
+  }
+
+  return size;
+}
+
+uint64_t Track::Size() const {
+  uint64_t size = PayloadSize();
+  size += EbmlMasterElementSize(libwebm::kMkvTrackEntry, size);
+  return size;
+}
+
+bool Track::Write(IMkvWriter* writer) const {
+  if (!writer)
+    return false;
+
+  // mandatory elements without a default value.
+  if (!type_ || !codec_id_)
+    return false;
+
+  // AV1 tracks require a CodecPrivate. See
+  // https://github.com/Matroska-Org/matroska-specification/blob/av1-mappin/codec/av1.md
+  // TODO(tomfinegan): Update the above link to the AV1 Matroska mappings to
+  // point to a stable version once it is finalized, or our own WebM mappings
+  // page on webmproject.org should we decide to release them.
+  if (!strcmp(codec_id_, Tracks::kAv1CodecId) && !codec_private_)
+    return false;
+
+  // |size| may be bigger than what is written out in this function because
+  // derived classes may write out more data in the Track element.
+  const uint64_t payload_size = PayloadSize();
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvTrackEntry, payload_size))
+    return false;
+
+  uint64_t size =
+      EbmlElementSize(libwebm::kMkvTrackNumber, static_cast<uint64>(number_));
+  size += EbmlElementSize(libwebm::kMkvTrackUID, static_cast<uint64>(uid_));
+  size += EbmlElementSize(libwebm::kMkvTrackType, static_cast<uint64>(type_));
+  if (codec_id_)
+    size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_);
+  if (codec_private_)
+    size += EbmlElementSize(libwebm::kMkvCodecPrivate, codec_private_,
+                            static_cast<uint64>(codec_private_length_));
+  if (language_)
+    size += EbmlElementSize(libwebm::kMkvLanguage, language_);
+  if (name_)
+    size += EbmlElementSize(libwebm::kMkvName, name_);
+  if (max_block_additional_id_)
+    size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID,
+                            static_cast<uint64>(max_block_additional_id_));
+  if (codec_delay_)
+    size += EbmlElementSize(libwebm::kMkvCodecDelay,
+                            static_cast<uint64>(codec_delay_));
+  if (seek_pre_roll_)
+    size += EbmlElementSize(libwebm::kMkvSeekPreRoll,
+                            static_cast<uint64>(seek_pre_roll_));
+  if (default_duration_)
+    size += EbmlElementSize(libwebm::kMkvDefaultDuration,
+                            static_cast<uint64>(default_duration_));
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvTrackNumber,
+                        static_cast<uint64>(number_)))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvTrackUID,
+                        static_cast<uint64>(uid_)))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvTrackType,
+                        static_cast<uint64>(type_)))
+    return false;
+  if (max_block_additional_id_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvMaxBlockAdditionID,
+                          static_cast<uint64>(max_block_additional_id_))) {
+      return false;
+    }
+  }
+  if (codec_delay_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvCodecDelay,
+                          static_cast<uint64>(codec_delay_)))
+      return false;
+  }
+  if (seek_pre_roll_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvSeekPreRoll,
+                          static_cast<uint64>(seek_pre_roll_)))
+      return false;
+  }
+  if (default_duration_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvDefaultDuration,
+                          static_cast<uint64>(default_duration_)))
+      return false;
+  }
+  if (codec_id_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvCodecID, codec_id_))
+      return false;
+  }
+  if (codec_private_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvCodecPrivate, codec_private_,
+                          static_cast<uint64>(codec_private_length_)))
+      return false;
+  }
+  if (language_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvLanguage, language_))
+      return false;
+  }
+  if (name_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvName, name_))
+      return false;
+  }
+
+  int64_t stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64_t>(size))
+    return false;
+
+  if (content_encoding_entries_size_ > 0) {
+    uint64_t content_encodings_size = 0;
+    for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+      ContentEncoding* const encoding = content_encoding_entries_[i];
+      content_encodings_size += encoding->Size();
+    }
+
+    if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncodings,
+                                content_encodings_size))
+      return false;
+
+    for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+      ContentEncoding* const encoding = content_encoding_entries_[i];
+      if (!encoding->Write(writer))
+        return false;
+    }
+  }
+
+  stop_position = writer->Position();
+  if (stop_position < 0)
+    return false;
+  return true;
+}
+
+bool Track::SetCodecPrivate(const uint8_t* codec_private, uint64_t length) {
+  if (!codec_private || length < 1)
+    return false;
+
+  delete[] codec_private_;
+
+  codec_private_ =
+      new (std::nothrow) uint8_t[static_cast<size_t>(length)];  // NOLINT
+  if (!codec_private_)
+    return false;
+
+  memcpy(codec_private_, codec_private, static_cast<size_t>(length));
+  codec_private_length_ = length;
+
+  return true;
+}
+
+void Track::set_codec_id(const char* codec_id) {
+  if (codec_id) {
+    delete[] codec_id_;
+
+    const size_t length = strlen(codec_id) + 1;
+    codec_id_ = new (std::nothrow) char[length];  // NOLINT
+    if (codec_id_) {
+#ifdef _MSC_VER
+      strcpy_s(codec_id_, length, codec_id);
+#else
+      strcpy(codec_id_, codec_id);
+#endif
+    }
+  }
+}
+
+// TODO(fgalligan): Vet the language parameter.
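+// For example, set_language("und") replaces any previously stored value
+// with a heap-allocated copy of "und".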
+void Track::set_language(const char* language) {
+  if (language) {
+    delete[] language_;
+
+    const size_t length = strlen(language) + 1;
+    language_ = new (std::nothrow) char[length];  // NOLINT
+    if (language_) {
+#ifdef _MSC_VER
+      strcpy_s(language_, length, language);
+#else
+      strcpy(language_, language);
+#endif
+    }
+  }
+}
+
+void Track::set_name(const char* name) {
+  if (name) {
+    delete[] name_;
+
+    const size_t length = strlen(name) + 1;
+    name_ = new (std::nothrow) char[length];  // NOLINT
+    if (name_) {
+#ifdef _MSC_VER
+      strcpy_s(name_, length, name);
+#else
+      strcpy(name_, name);
+#endif
+    }
+  }
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Colour and its child elements
+
+uint64_t PrimaryChromaticity::PrimaryChromaticitySize(
+    libwebm::MkvId x_id, libwebm::MkvId y_id) const {
+  return EbmlElementSize(x_id, x_) + EbmlElementSize(y_id, y_);
+}
+
+bool PrimaryChromaticity::Write(IMkvWriter* writer, libwebm::MkvId x_id,
+                                libwebm::MkvId y_id) const {
+  if (!Valid()) {
+    return false;
+  }
+  return WriteEbmlElement(writer, x_id, x_) &&
+         WriteEbmlElement(writer, y_id, y_);
+}
+
+bool PrimaryChromaticity::Valid() const {
+  return (x_ >= kChromaticityMin && x_ <= kChromaticityMax &&
+          y_ >= kChromaticityMin && y_ <= kChromaticityMax);
+}
+
+uint64_t MasteringMetadata::MasteringMetadataSize() const {
+  uint64_t size = PayloadSize();
+
+  if (size > 0)
+    size += EbmlMasterElementSize(libwebm::kMkvMasteringMetadata, size);
+
+  return size;
+}
+
+bool MasteringMetadata::Valid() const {
+  if (luminance_min_ != kValueNotPresent) {
+    if (luminance_min_ < kMinLuminance || luminance_min_ > kMinLuminanceMax ||
+        luminance_min_ > luminance_max_) {
+      return false;
+    }
+  }
+  if (luminance_max_ != kValueNotPresent) {
+    if (luminance_max_ < kMinLuminance || luminance_max_ > kMaxLuminanceMax ||
+        luminance_max_ < luminance_min_) {
+      return false;
+    }
+  }
+  if (r_ && !r_->Valid())
+    return false;
+  if (g_ && !g_->Valid())
+    return false;
+  if (b_ && !b_->Valid())
+    return false;
+  if (white_point_ && !white_point_->Valid())
+    return false;
+
+  return true;
+}
+
+bool MasteringMetadata::Write(IMkvWriter* writer) const {
+  const uint64_t size = PayloadSize();
+
+  // Don't write an empty element.
+ if (size == 0) + return true; + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvMasteringMetadata, size)) + return false; + if (luminance_max_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvLuminanceMax, luminance_max_)) { + return false; + } + if (luminance_min_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min_)) { + return false; + } + if (r_ && !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX, + libwebm::kMkvPrimaryRChromaticityY)) { + return false; + } + if (g_ && !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX, + libwebm::kMkvPrimaryGChromaticityY)) { + return false; + } + if (b_ && !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX, + libwebm::kMkvPrimaryBChromaticityY)) { + return false; + } + if (white_point_ && + !white_point_->Write(writer, libwebm::kMkvWhitePointChromaticityX, + libwebm::kMkvWhitePointChromaticityY)) { + return false; + } + + return true; +} + +bool MasteringMetadata::SetChromaticity( + const PrimaryChromaticity* r, const PrimaryChromaticity* g, + const PrimaryChromaticity* b, const PrimaryChromaticity* white_point) { + PrimaryChromaticityPtr r_ptr(nullptr); + if (r) { + if (!CopyChromaticity(r, &r_ptr)) + return false; + } + PrimaryChromaticityPtr g_ptr(nullptr); + if (g) { + if (!CopyChromaticity(g, &g_ptr)) + return false; + } + PrimaryChromaticityPtr b_ptr(nullptr); + if (b) { + if (!CopyChromaticity(b, &b_ptr)) + return false; + } + PrimaryChromaticityPtr wp_ptr(nullptr); + if (white_point) { + if (!CopyChromaticity(white_point, &wp_ptr)) + return false; + } + + r_ = r_ptr.release(); + g_ = g_ptr.release(); + b_ = b_ptr.release(); + white_point_ = wp_ptr.release(); + return true; +} + +uint64_t MasteringMetadata::PayloadSize() const { + uint64_t size = 0; + + if (luminance_max_ != kValueNotPresent) + size += EbmlElementSize(libwebm::kMkvLuminanceMax, luminance_max_); + if (luminance_min_ != kValueNotPresent) + size += EbmlElementSize(libwebm::kMkvLuminanceMin, luminance_min_); + + if (r_) { + size += r_->PrimaryChromaticitySize(libwebm::kMkvPrimaryRChromaticityX, + libwebm::kMkvPrimaryRChromaticityY); + } + if (g_) { + size += g_->PrimaryChromaticitySize(libwebm::kMkvPrimaryGChromaticityX, + libwebm::kMkvPrimaryGChromaticityY); + } + if (b_) { + size += b_->PrimaryChromaticitySize(libwebm::kMkvPrimaryBChromaticityX, + libwebm::kMkvPrimaryBChromaticityY); + } + if (white_point_) { + size += white_point_->PrimaryChromaticitySize( + libwebm::kMkvWhitePointChromaticityX, + libwebm::kMkvWhitePointChromaticityY); + } + + return size; +} + +uint64_t Colour::ColourSize() const { + uint64_t size = PayloadSize(); + + if (size > 0) + size += EbmlMasterElementSize(libwebm::kMkvColour, size); + + return size; +} + +bool Colour::Valid() const { + if (mastering_metadata_ && !mastering_metadata_->Valid()) + return false; + if (matrix_coefficients_ != kValueNotPresent && + !IsMatrixCoefficientsValueValid(matrix_coefficients_)) { + return false; + } + if (chroma_siting_horz_ != kValueNotPresent && + !IsChromaSitingHorzValueValid(chroma_siting_horz_)) { + return false; + } + if (chroma_siting_vert_ != kValueNotPresent && + !IsChromaSitingVertValueValid(chroma_siting_vert_)) { + return false; + } + if (range_ != kValueNotPresent && !IsColourRangeValueValid(range_)) + return false; + if (transfer_characteristics_ != kValueNotPresent && + !IsTransferCharacteristicsValueValid(transfer_characteristics_)) { + return false; + } + if (primaries_ != kValueNotPresent && !IsPrimariesValueValid(primaries_)) + 
return false;
+
+  return true;
+}
+
+bool Colour::Write(IMkvWriter* writer) const {
+  const uint64_t size = PayloadSize();
+
+  // Don't write an empty element.
+  if (size == 0)
+    return true;
+
+  // Don't write an invalid element.
+  if (!Valid())
+    return false;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvColour, size))
+    return false;
+
+  if (matrix_coefficients_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvMatrixCoefficients,
+                        static_cast<uint64>(matrix_coefficients_))) {
+    return false;
+  }
+  if (bits_per_channel_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvBitsPerChannel,
+                        static_cast<uint64>(bits_per_channel_))) {
+    return false;
+  }
+  if (chroma_subsampling_horz_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvChromaSubsamplingHorz,
+                        static_cast<uint64>(chroma_subsampling_horz_))) {
+    return false;
+  }
+  if (chroma_subsampling_vert_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvChromaSubsamplingVert,
+                        static_cast<uint64>(chroma_subsampling_vert_))) {
+    return false;
+  }
+
+  if (cb_subsampling_horz_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvCbSubsamplingHorz,
+                        static_cast<uint64>(cb_subsampling_horz_))) {
+    return false;
+  }
+  if (cb_subsampling_vert_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvCbSubsamplingVert,
+                        static_cast<uint64>(cb_subsampling_vert_))) {
+    return false;
+  }
+  if (chroma_siting_horz_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvChromaSitingHorz,
+                        static_cast<uint64>(chroma_siting_horz_))) {
+    return false;
+  }
+  if (chroma_siting_vert_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvChromaSitingVert,
+                        static_cast<uint64>(chroma_siting_vert_))) {
+    return false;
+  }
+  if (range_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvRange,
+                        static_cast<uint64>(range_))) {
+    return false;
+  }
+  if (transfer_characteristics_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvTransferCharacteristics,
+                        static_cast<uint64>(transfer_characteristics_))) {
+    return false;
+  }
+  if (primaries_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvPrimaries,
+                        static_cast<uint64>(primaries_))) {
+    return false;
+  }
+  if (max_cll_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvMaxCLL,
+                        static_cast<uint64>(max_cll_))) {
+    return false;
+  }
+  if (max_fall_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvMaxFALL,
+                        static_cast<uint64>(max_fall_))) {
+    return false;
+  }
+
+  if (mastering_metadata_ && !mastering_metadata_->Write(writer))
+    return false;
+
+  return true;
+}
+
+bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) {
+  std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
+  if (!mm_ptr.get())
+    return false;
+
+  mm_ptr->set_luminance_max(mastering_metadata.luminance_max());
+  mm_ptr->set_luminance_min(mastering_metadata.luminance_min());
+
+  if (!mm_ptr->SetChromaticity(mastering_metadata.r(), mastering_metadata.g(),
+                               mastering_metadata.b(),
+                               mastering_metadata.white_point())) {
+    return false;
+  }
+
+  delete mastering_metadata_;
+  mastering_metadata_ = mm_ptr.release();
+  return true;
+}
+
+uint64_t Colour::PayloadSize() const {
+  uint64_t size = 0;
+
+  if (matrix_coefficients_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvMatrixCoefficients,
+                            static_cast<uint64>(matrix_coefficients_));
+  }
+  if (bits_per_channel_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvBitsPerChannel,
+                            static_cast<uint64>(bits_per_channel_));
+  }
+  if (chroma_subsampling_horz_ != kValueNotPresent) {
+    size +=
+        EbmlElementSize(libwebm::kMkvChromaSubsamplingHorz,
+                        static_cast<uint64>(chroma_subsampling_horz_));
+  }
+  if (chroma_subsampling_vert_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvChromaSubsamplingVert,
+                            static_cast<uint64>(chroma_subsampling_vert_));
+  }
+  if (cb_subsampling_horz_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvCbSubsamplingHorz,
+                            static_cast<uint64>(cb_subsampling_horz_));
+  }
+  if (cb_subsampling_vert_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvCbSubsamplingVert,
+                            static_cast<uint64>(cb_subsampling_vert_));
+  }
+  if (chroma_siting_horz_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvChromaSitingHorz,
+                            static_cast<uint64>(chroma_siting_horz_));
+  }
+  if (chroma_siting_vert_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvChromaSitingVert,
+                            static_cast<uint64>(chroma_siting_vert_));
+  }
+  if (range_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvRange, static_cast<uint64>(range_));
+  }
+  if (transfer_characteristics_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvTransferCharacteristics,
+                            static_cast<uint64>(transfer_characteristics_));
+  }
+  if (primaries_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvPrimaries,
+                            static_cast<uint64>(primaries_));
+  }
+  if (max_cll_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvMaxCLL, static_cast<uint64>(max_cll_));
+  }
+  if (max_fall_ != kValueNotPresent) {
+    size +=
+        EbmlElementSize(libwebm::kMkvMaxFALL, static_cast<uint64>(max_fall_));
+  }
+
+  if (mastering_metadata_)
+    size += mastering_metadata_->MasteringMetadataSize();
+
+  return size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Projection element
+
+uint64_t Projection::ProjectionSize() const {
+  uint64_t size = PayloadSize();
+
+  if (size > 0)
+    size += EbmlMasterElementSize(libwebm::kMkvProjection, size);
+
+  return size;
+}
+
+bool Projection::Write(IMkvWriter* writer) const {
+  const uint64_t size = PayloadSize();
+
+  // Don't write an empty element.
+  if (size == 0)
+    return true;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvProjection, size))
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvProjectionType,
+                        static_cast<uint64>(type_))) {
+    return false;
+  }
+
+  if (private_data_length_ > 0 && private_data_ != NULL &&
+      !WriteEbmlElement(writer, libwebm::kMkvProjectionPrivate, private_data_,
+                        private_data_length_)) {
+    return false;
+  }
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPoseYaw, pose_yaw_))
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPosePitch,
+                        pose_pitch_)) {
+    return false;
+  }
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPoseRoll, pose_roll_)) {
+    return false;
+  }
+
+  return true;
+}
+
+bool Projection::SetProjectionPrivate(const uint8_t* data,
+                                      uint64_t data_length) {
+  if (data == NULL || data_length == 0) {
+    return false;
+  }
+
+  // Guards against truncation when size_t is narrower than 64 bits.
+  if (data_length != static_cast<size_t>(data_length)) {
+    return false;
+  }
+
+  uint8_t* new_private_data =
+      new (std::nothrow) uint8_t[static_cast<size_t>(data_length)];
+  if (new_private_data == NULL) {
+    return false;
+  }
+
+  delete[] private_data_;
+  private_data_ = new_private_data;
+  private_data_length_ = data_length;
+  memcpy(private_data_, data, static_cast<size_t>(data_length));
+
+  return true;
+}
+
+uint64_t Projection::PayloadSize() const {
+  uint64_t size =
+      EbmlElementSize(libwebm::kMkvProjection, static_cast<uint64>(type_));
+
+  if (private_data_length_ > 0 && private_data_ != NULL) {
+    size += EbmlElementSize(libwebm::kMkvProjectionPrivate, private_data_,
+                            private_data_length_);
+  }
+
+  size += EbmlElementSize(libwebm::kMkvProjectionPoseYaw, pose_yaw_);
+  size += EbmlElementSize(libwebm::kMkvProjectionPosePitch, pose_pitch_);
+  size += EbmlElementSize(libwebm::kMkvProjectionPoseRoll, pose_roll_);
+
+  return size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// VideoTrack Class
+
+VideoTrack::VideoTrack(unsigned int* seed)
+    : Track(seed),
+      display_height_(0),
+      display_width_(0),
+      pixel_height_(0),
+      pixel_width_(0),
+      crop_left_(0),
+      crop_right_(0),
+      crop_top_(0),
+      crop_bottom_(0),
+      frame_rate_(0.0),
+      height_(0),
+      stereo_mode_(0),
+      alpha_mode_(0),
+      width_(0),
+      colour_space_(NULL),
+      colour_(NULL),
+      projection_(NULL) {}
+
+VideoTrack::~VideoTrack() {
+  delete colour_;
+  delete projection_;
+}
+
+bool VideoTrack::SetStereoMode(uint64_t stereo_mode) {
+  if (stereo_mode != kMono && stereo_mode != kSideBySideLeftIsFirst &&
+      stereo_mode != kTopBottomRightIsFirst &&
+      stereo_mode != kTopBottomLeftIsFirst &&
+      stereo_mode != kSideBySideRightIsFirst)
+    return false;
+
+  stereo_mode_ = stereo_mode;
+  return true;
+}
+
+bool VideoTrack::SetAlphaMode(uint64_t alpha_mode) {
+  if (alpha_mode != kNoAlpha && alpha_mode != kAlpha)
+    return false;
+
+  alpha_mode_ = alpha_mode;
+  return true;
+}
+
+uint64_t VideoTrack::PayloadSize() const {
+  const uint64_t parent_size = Track::PayloadSize();
+
+  uint64_t size = VideoPayloadSize();
+  size += EbmlMasterElementSize(libwebm::kMkvVideo, size);
+
+  return parent_size + size;
+}
+
+bool VideoTrack::Write(IMkvWriter* writer) const {
+  if (!Track::Write(writer))
+    return false;
+
+  const uint64_t size = VideoPayloadSize();
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvVideo, size))
+    return false;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlElement(
+          writer, libwebm::kMkvPixelWidth,
+          static_cast<uint64>((pixel_width_ > 0) ? pixel_width_ : width_)))
+    return false;
+  if (!WriteEbmlElement(
+          writer, libwebm::kMkvPixelHeight,
+          static_cast<uint64>((pixel_height_ > 0) ? pixel_height_ : height_)))
+    return false;
+  if (display_width_ > 0) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvDisplayWidth,
+                          static_cast<uint64>(display_width_)))
+      return false;
+  }
+  if (display_height_ > 0) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvDisplayHeight,
+                          static_cast<uint64>(display_height_)))
+      return false;
+  }
+  if (crop_left_ > 0) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropLeft,
+                          static_cast<uint64>(crop_left_)))
+      return false;
+  }
+  if (crop_right_ > 0) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropRight,
+                          static_cast<uint64>(crop_right_)))
+      return false;
+  }
+  if (crop_top_ > 0) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropTop,
+                          static_cast<uint64>(crop_top_)))
+      return false;
+  }
+  if (crop_bottom_ > 0) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropBottom,
+                          static_cast<uint64>(crop_bottom_)))
+      return false;
+  }
+  if (stereo_mode_ > kMono) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvStereoMode,
+                          static_cast<uint64>(stereo_mode_)))
+      return false;
+  }
+  if (alpha_mode_ > kNoAlpha) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvAlphaMode,
+                          static_cast<uint64>(alpha_mode_)))
+      return false;
+  }
+  if (colour_space_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvColourSpace, colour_space_))
+      return false;
+  }
+  if (frame_rate_ > 0.0) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvFrameRate,
+                          static_cast<float>(frame_rate_))) {
+      return false;
+    }
+  }
+  if (colour_) {
+    if (!colour_->Write(writer))
+      return false;
+  }
+  if (projection_) {
+    if (!projection_->Write(writer))
+      return false;
+  }
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64_t>(size)) {
+    return false;
+  }
+
+  return true;
+}
+
+void VideoTrack::set_colour_space(const char* colour_space) {
+  if (colour_space) {
+    delete[] colour_space_;
+
+    const size_t length = strlen(colour_space) + 1;
+    colour_space_ = new (std::nothrow) char[length];  // NOLINT
+    if (colour_space_) {
+#ifdef _MSC_VER
+      strcpy_s(colour_space_, length, colour_space);
+#else
+      strcpy(colour_space_, colour_space);
+#endif
+    }
+  }
+}
+
+bool VideoTrack::SetColour(const Colour& colour) {
+  std::unique_ptr<Colour> colour_ptr(new Colour());
+  if (!colour_ptr.get())
+    return false;
+
+  if (colour.mastering_metadata()) {
+    if (!colour_ptr->SetMasteringMetadata(*colour.mastering_metadata()))
+      return false;
+  }
+
+  colour_ptr->set_matrix_coefficients(colour.matrix_coefficients());
+  colour_ptr->set_bits_per_channel(colour.bits_per_channel());
+  colour_ptr->set_chroma_subsampling_horz(colour.chroma_subsampling_horz());
+  colour_ptr->set_chroma_subsampling_vert(colour.chroma_subsampling_vert());
+  colour_ptr->set_cb_subsampling_horz(colour.cb_subsampling_horz());
+  colour_ptr->set_cb_subsampling_vert(colour.cb_subsampling_vert());
+  colour_ptr->set_chroma_siting_horz(colour.chroma_siting_horz());
+  colour_ptr->set_chroma_siting_vert(colour.chroma_siting_vert());
+  colour_ptr->set_range(colour.range());
+  colour_ptr->set_transfer_characteristics(colour.transfer_characteristics());
+  colour_ptr->set_primaries(colour.primaries());
+  colour_ptr->set_max_cll(colour.max_cll());
+  colour_ptr->set_max_fall(colour.max_fall());
+  delete colour_;
+  colour_ = colour_ptr.release();
+  return true;
+}
+
+bool VideoTrack::SetProjection(const Projection& projection) {
+  std::unique_ptr<Projection> projection_ptr(new Projection());
+  if (!projection_ptr.get())
+    return false;
+
+  if (projection.private_data()) {
+    if (!projection_ptr->SetProjectionPrivate(
+            projection.private_data(), projection.private_data_length())) {
+      return false;
+    }
+  }
+
+  projection_ptr->set_type(projection.type());
+  projection_ptr->set_pose_yaw(projection.pose_yaw());
+  projection_ptr->set_pose_pitch(projection.pose_pitch());
+  projection_ptr->set_pose_roll(projection.pose_roll());
+  delete projection_;
+  projection_ = projection_ptr.release();
+  return true;
+}
+
+uint64_t VideoTrack::VideoPayloadSize() const {
+  uint64_t size = EbmlElementSize(
+      libwebm::kMkvPixelWidth,
+      static_cast<uint64>((pixel_width_ > 0) ? pixel_width_ : width_));
+  size += EbmlElementSize(
+      libwebm::kMkvPixelHeight,
+      static_cast<uint64>((pixel_height_ > 0) ? pixel_height_ : height_));
+  if (display_width_ > 0)
+    size += EbmlElementSize(libwebm::kMkvDisplayWidth,
+                            static_cast<uint64>(display_width_));
+  if (display_height_ > 0)
+    size += EbmlElementSize(libwebm::kMkvDisplayHeight,
+                            static_cast<uint64>(display_height_));
+  if (crop_left_ > 0)
+    size += EbmlElementSize(libwebm::kMkvPixelCropLeft,
+                            static_cast<uint64>(crop_left_));
+  if (crop_right_ > 0)
+    size += EbmlElementSize(libwebm::kMkvPixelCropRight,
+                            static_cast<uint64>(crop_right_));
+  if (crop_top_ > 0)
+    size += EbmlElementSize(libwebm::kMkvPixelCropTop,
+                            static_cast<uint64>(crop_top_));
+  if (crop_bottom_ > 0)
+    size += EbmlElementSize(libwebm::kMkvPixelCropBottom,
+                            static_cast<uint64>(crop_bottom_));
+  if (stereo_mode_ > kMono)
+    size += EbmlElementSize(libwebm::kMkvStereoMode,
+                            static_cast<uint64>(stereo_mode_));
+  if (alpha_mode_ > kNoAlpha)
+    size += EbmlElementSize(libwebm::kMkvAlphaMode,
+                            static_cast<uint64>(alpha_mode_));
+  if (frame_rate_ > 0.0)
+    size += EbmlElementSize(libwebm::kMkvFrameRate,
+                            static_cast<float>(frame_rate_));
+  if (colour_space_)
+    size += EbmlElementSize(libwebm::kMkvColourSpace, colour_space_);
+  if (colour_)
+    size += colour_->ColourSize();
+  if (projection_)
+    size += projection_->ProjectionSize();
+
+  return size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// AudioTrack Class
+
+AudioTrack::AudioTrack(unsigned int* seed)
+    : Track(seed), bit_depth_(0), channels_(1), sample_rate_(0.0) {}
+
+AudioTrack::~AudioTrack() {}
+
+uint64_t AudioTrack::PayloadSize() const {
+  const uint64_t parent_size = Track::PayloadSize();
+
+  uint64_t size = EbmlElementSize(libwebm::kMkvSamplingFrequency,
+                                  static_cast<float>(sample_rate_));
+  size +=
+      EbmlElementSize(libwebm::kMkvChannels, static_cast<uint64>(channels_));
+  if (bit_depth_ > 0)
+    size +=
+        EbmlElementSize(libwebm::kMkvBitDepth, static_cast<uint64>(bit_depth_));
+  size += EbmlMasterElementSize(libwebm::kMkvAudio, size);
+
+  return parent_size + size;
+}
+
+bool AudioTrack::Write(IMkvWriter* writer) const {
+  if (!Track::Write(writer))
+    return false;
+
+  // Calculate AudioSettings size.
+  uint64_t size = EbmlElementSize(libwebm::kMkvSamplingFrequency,
+                                  static_cast<float>(sample_rate_));
+  size +=
+      EbmlElementSize(libwebm::kMkvChannels, static_cast<uint64>(channels_));
+  if (bit_depth_ > 0)
+    size +=
+        EbmlElementSize(libwebm::kMkvBitDepth, static_cast<uint64>(bit_depth_));
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvAudio, size))
+    return false;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvSamplingFrequency,
+                        static_cast<float>(sample_rate_)))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvChannels,
+                        static_cast<uint64>(channels_)))
+    return false;
+  if (bit_depth_ > 0)
+    if (!WriteEbmlElement(writer, libwebm::kMkvBitDepth,
+                          static_cast<uint64>(bit_depth_)))
+      return false;
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64_t>(size))
+    return false;
+
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Tracks Class
+
+const char Tracks::kOpusCodecId[] = "A_OPUS";
+const char Tracks::kVorbisCodecId[] = "A_VORBIS";
+const char Tracks::kAv1CodecId[] = "V_AV1";
+const char Tracks::kVp8CodecId[] = "V_VP8";
+const char Tracks::kVp9CodecId[] = "V_VP9";
+const char Tracks::kWebVttCaptionsId[] = "D_WEBVTT/CAPTIONS";
+const char Tracks::kWebVttDescriptionsId[] = "D_WEBVTT/DESCRIPTIONS";
+const char Tracks::kWebVttMetadataId[] = "D_WEBVTT/METADATA";
+const char Tracks::kWebVttSubtitlesId[] = "D_WEBVTT/SUBTITLES";
+
+Tracks::Tracks()
+    : track_entries_(NULL), track_entries_size_(0), wrote_tracks_(false) {}
+
+Tracks::~Tracks() {
+  if (track_entries_) {
+    for (uint32_t i = 0; i < track_entries_size_; ++i) {
+      Track* const track = track_entries_[i];
+      delete track;
+    }
+    delete[] track_entries_;
+  }
+}
+
+bool Tracks::AddTrack(Track* track, int32_t number) {
+  if (number < 0 || wrote_tracks_)
+    return false;
+
+  // This muxer only supports track numbers in the range [1, 126], in
+  // order to be able (using the Matroska integer representation) to
+  // serialize the block header (of which the track number is a part)
+  // for a frame using exactly 4 bytes.
+
+  if (number > 0x7E)
+    return false;
+
+  uint32_t track_num = number;
+
+  if (track_num > 0) {
+    // Check to make sure a track does not already have |track_num|.
+    for (uint32_t i = 0; i < track_entries_size_; ++i) {
+      if (track_entries_[i]->number() == track_num)
+        return false;
+    }
+  }
+
+  const uint32_t count = track_entries_size_ + 1;
+
+  Track** const track_entries = new (std::nothrow) Track*[count];  // NOLINT
+  if (!track_entries)
+    return false;
+
+  for (uint32_t i = 0; i < track_entries_size_; ++i) {
+    track_entries[i] = track_entries_[i];
+  }
+
+  delete[] track_entries_;
+
+  // Find the lowest available track number > 0.
+  if (track_num == 0) {
+    track_num = count;
+
+    // Check to make sure a track does not already have |track_num|.
+    bool exit = false;
+    do {
+      exit = true;
+      for (uint32_t i = 0; i < track_entries_size_; ++i) {
+        if (track_entries[i]->number() == track_num) {
+          track_num++;
+          exit = false;
+          break;
+        }
+      }
+    } while (!exit);
+  }
+  track->set_number(track_num);
+
+  track_entries_ = track_entries;
+  track_entries_[track_entries_size_] = track;
+  track_entries_size_ = count;
+  return true;
+}
+
+const Track* Tracks::GetTrackByIndex(uint32_t index) const {
+  if (track_entries_ == NULL)
+    return NULL;
+
+  if (index >= track_entries_size_)
+    return NULL;
+
+  return track_entries_[index];
+}
+
+Track* Tracks::GetTrackByNumber(uint64_t track_number) const {
+  const int32_t count = track_entries_size();
+  for (int32_t i = 0; i < count; ++i) {
+    if (track_entries_[i]->number() == track_number)
+      return track_entries_[i];
+  }
+
+  return NULL;
+}
+
+bool Tracks::TrackIsAudio(uint64_t track_number) const {
+  const Track* const track = GetTrackByNumber(track_number);
+
+  if (track->type() == kAudio)
+    return true;
+
+  return false;
+}
+
+bool Tracks::TrackIsVideo(uint64_t track_number) const {
+  const Track* const track = GetTrackByNumber(track_number);
+
+  if (track->type() == kVideo)
+    return true;
+
+  return false;
+}
+
+bool Tracks::Write(IMkvWriter* writer) const {
+  uint64_t size = 0;
+  const int32_t count = track_entries_size();
+  for (int32_t i = 0; i < count; ++i) {
+    const Track* const track = GetTrackByIndex(i);
+
+    if (!track)
+      return false;
+
+    size += track->Size();
+  }
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvTracks, size))
+    return false;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  for (int32_t i = 0; i < count; ++i) {
+    const Track* const track = GetTrackByIndex(i);
+    if (!track->Write(writer))
+      return false;
+  }
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64_t>(size))
+    return false;
+
+  wrote_tracks_ = true;
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Chapter Class
+
+bool Chapter::set_id(const char* id) { return StrCpy(id, &id_); }
+
+void Chapter::set_time(const Segment& segment, uint64_t start_ns,
+                       uint64_t end_ns) {
+  const SegmentInfo* const info = segment.GetSegmentInfo();
+  const uint64_t timecode_scale = info->timecode_scale();
+  start_timecode_ = start_ns / timecode_scale;
+  end_timecode_ = end_ns / timecode_scale;
+}
+
+bool Chapter::add_string(const char* title, const char* language,
+                         const char* country) {
+  if (!ExpandDisplaysArray())
+    return false;
+
+  Display& d = displays_[displays_count_++];
+  d.Init();
+
+  if (!d.set_title(title))
+    return false;
+
+  if (!d.set_language(language))
+    return false;
+
+  if (!d.set_country(country))
+    return false;
+
+  return true;
+}
+
+Chapter::Chapter() {
+  // This ctor only constructs the object. Proper initialization is
+  // done in Init() (called in Chapters::AddChapter()). The only
+  // reason we bother implementing this ctor is because we had to
+  // declare it as private (along with the dtor), in order to prevent
+  // clients from creating Chapter instances (a privilege we grant
+  // only to the Chapters class). Doing no initialization here also
+  // means that creating arrays of chapter objects is more efficient,
+  // because we only initialize each new chapter object as it becomes
+  // active on the array.
+}
+
+Chapter::~Chapter() {}
+
+void Chapter::Init(unsigned int* seed) {
+  id_ = NULL;
+  start_timecode_ = 0;
+  end_timecode_ = 0;
+  displays_ = NULL;
+  displays_size_ = 0;
+  displays_count_ = 0;
+  uid_ = MakeUID(seed);
+}
+
+void Chapter::ShallowCopy(Chapter* dst) const {
+  dst->id_ = id_;
+  dst->start_timecode_ = start_timecode_;
+  dst->end_timecode_ = end_timecode_;
+  dst->uid_ = uid_;
+  dst->displays_ = displays_;
+  dst->displays_size_ = displays_size_;
+  dst->displays_count_ = displays_count_;
+}
+
+void Chapter::Clear() {
+  StrCpy(NULL, &id_);
+
+  while (displays_count_ > 0) {
+    Display& d = displays_[--displays_count_];
+    d.Clear();
+  }
+
+  delete[] displays_;
+  displays_ = NULL;
+
+  displays_size_ = 0;
+}
+
+bool Chapter::ExpandDisplaysArray() {
+  if (displays_size_ > displays_count_)
+    return true;  // nothing to do yet
+
+  const int size = (displays_size_ == 0) ? 1 : 2 * displays_size_;
+
+  Display* const displays = new (std::nothrow) Display[size];  // NOLINT
+  if (displays == NULL)
+    return false;
+
+  for (int idx = 0; idx < displays_count_; ++idx) {
+    displays[idx] = displays_[idx];  // shallow copy
+  }
+
+  delete[] displays_;
+
+  displays_ = displays;
+  displays_size_ = size;
+
+  return true;
+}
+
+uint64_t Chapter::WriteAtom(IMkvWriter* writer) const {
+  uint64_t payload_size =
+      EbmlElementSize(libwebm::kMkvChapterStringUID, id_) +
+      EbmlElementSize(libwebm::kMkvChapterUID, static_cast<uint64>(uid_)) +
+      EbmlElementSize(libwebm::kMkvChapterTimeStart,
+                      static_cast<uint64>(start_timecode_)) +
+      EbmlElementSize(libwebm::kMkvChapterTimeEnd,
+                      static_cast<uint64>(end_timecode_));
+
+  for (int idx = 0; idx < displays_count_; ++idx) {
+    const Display& d = displays_[idx];
+    payload_size += d.WriteDisplay(NULL);
+  }
+
+  const uint64_t atom_size =
+      EbmlMasterElementSize(libwebm::kMkvChapterAtom, payload_size) +
+      payload_size;
+
+  if (writer == NULL)
+    return atom_size;
+
+  const int64_t start = writer->Position();
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvChapterAtom, payload_size))
+    return 0;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvChapterStringUID, id_))
+    return 0;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvChapterUID,
+                        static_cast<uint64>(uid_)))
+    return 0;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeStart,
+                        static_cast<uint64>(start_timecode_)))
+    return 0;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeEnd,
+                        static_cast<uint64>(end_timecode_)))
+    return 0;
+
+  for (int idx = 0; idx < displays_count_; ++idx) {
+    const Display& d = displays_[idx];
+
+    if (!d.WriteDisplay(writer))
+      return 0;
+  }
+
+  const int64_t stop = writer->Position();
+
+  if (stop >= start && uint64_t(stop - start) != atom_size)
+    return 0;
+
+  return atom_size;
+}
+
+void Chapter::Display::Init() {
+  title_ = NULL;
+  language_ = NULL;
+  country_ = NULL;
+}
+
+void Chapter::Display::Clear() {
+  StrCpy(NULL, &title_);
+  StrCpy(NULL, &language_);
+  StrCpy(NULL, &country_);
+}
+
+bool Chapter::Display::set_title(const char* title) {
+  return StrCpy(title, &title_);
+}
+
+bool Chapter::Display::set_language(const char* language) {
+  return StrCpy(language, &language_);
+}
+
+bool Chapter::Display::set_country(const char* country) {
+  return StrCpy(country, &country_);
+}
+
+uint64_t Chapter::Display::WriteDisplay(IMkvWriter* writer) const {
+  uint64_t payload_size = EbmlElementSize(libwebm::kMkvChapString, title_);
+
+  if (language_)
+    payload_size += EbmlElementSize(libwebm::kMkvChapLanguage, language_);
+
+  if (country_)
+    payload_size += EbmlElementSize(libwebm::kMkvChapCountry, country_);
+
+  const
uint64_t display_size = + EbmlMasterElementSize(libwebm::kMkvChapterDisplay, payload_size) + + payload_size; + + if (writer == NULL) + return display_size; + + const int64_t start = writer->Position(); + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvChapterDisplay, + payload_size)) + return 0; + + if (!WriteEbmlElement(writer, libwebm::kMkvChapString, title_)) + return 0; + + if (language_) { + if (!WriteEbmlElement(writer, libwebm::kMkvChapLanguage, language_)) + return 0; + } + + if (country_) { + if (!WriteEbmlElement(writer, libwebm::kMkvChapCountry, country_)) + return 0; + } + + const int64_t stop = writer->Position(); + + if (stop >= start && uint64_t(stop - start) != display_size) + return 0; + + return display_size; +} + +/////////////////////////////////////////////////////////////// +// +// Chapters Class + +Chapters::Chapters() : chapters_size_(0), chapters_count_(0), chapters_(NULL) {} + +Chapters::~Chapters() { + while (chapters_count_ > 0) { + Chapter& chapter = chapters_[--chapters_count_]; + chapter.Clear(); + } + + delete[] chapters_; + chapters_ = NULL; +} + +int Chapters::Count() const { return chapters_count_; } + +Chapter* Chapters::AddChapter(unsigned int* seed) { + if (!ExpandChaptersArray()) + return NULL; + + Chapter& chapter = chapters_[chapters_count_++]; + chapter.Init(seed); + + return &chapter; +} + +bool Chapters::Write(IMkvWriter* writer) const { + if (writer == NULL) + return false; + + const uint64_t payload_size = WriteEdition(NULL); // return size only + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvChapters, payload_size)) + return false; + + const int64_t start = writer->Position(); + + if (WriteEdition(writer) == 0) // error + return false; + + const int64_t stop = writer->Position(); + + if (stop >= start && uint64_t(stop - start) != payload_size) + return false; + + return true; +} + +bool Chapters::ExpandChaptersArray() { + if (chapters_size_ > chapters_count_) + return true; // nothing to do yet + + const int size = (chapters_size_ == 0) ? 
1 : 2 * chapters_size_; + + Chapter* const chapters = new (std::nothrow) Chapter[size]; // NOLINT + if (chapters == NULL) + return false; + + for (int idx = 0; idx < chapters_count_; ++idx) { + const Chapter& src = chapters_[idx]; + Chapter* const dst = chapters + idx; + src.ShallowCopy(dst); + } + + delete[] chapters_; + + chapters_ = chapters; + chapters_size_ = size; + + return true; +} + +uint64_t Chapters::WriteEdition(IMkvWriter* writer) const { + uint64_t payload_size = 0; + + for (int idx = 0; idx < chapters_count_; ++idx) { + const Chapter& chapter = chapters_[idx]; + payload_size += chapter.WriteAtom(NULL); + } + + const uint64_t edition_size = + EbmlMasterElementSize(libwebm::kMkvEditionEntry, payload_size) + + payload_size; + + if (writer == NULL) // return size only + return edition_size; + + const int64_t start = writer->Position(); + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvEditionEntry, payload_size)) + return 0; // error + + for (int idx = 0; idx < chapters_count_; ++idx) { + const Chapter& chapter = chapters_[idx]; + + const uint64_t chapter_size = chapter.WriteAtom(writer); + if (chapter_size == 0) // error + return 0; + } + + const int64_t stop = writer->Position(); + + if (stop >= start && uint64_t(stop - start) != edition_size) + return 0; + + return edition_size; +} + +// Tag Class + +bool Tag::add_simple_tag(const char* tag_name, const char* tag_string) { + if (!ExpandSimpleTagsArray()) + return false; + + SimpleTag& st = simple_tags_[simple_tags_count_++]; + st.Init(); + + if (!st.set_tag_name(tag_name)) + return false; + + if (!st.set_tag_string(tag_string)) + return false; + + return true; +} + +Tag::Tag() { + simple_tags_ = NULL; + simple_tags_size_ = 0; + simple_tags_count_ = 0; +} + +Tag::~Tag() {} + +void Tag::ShallowCopy(Tag* dst) const { + dst->simple_tags_ = simple_tags_; + dst->simple_tags_size_ = simple_tags_size_; + dst->simple_tags_count_ = simple_tags_count_; +} + +void Tag::Clear() { + while (simple_tags_count_ > 0) { + SimpleTag& st = simple_tags_[--simple_tags_count_]; + st.Clear(); + } + + delete[] simple_tags_; + simple_tags_ = NULL; + + simple_tags_size_ = 0; +} + +bool Tag::ExpandSimpleTagsArray() { + if (simple_tags_size_ > simple_tags_count_) + return true; // nothing to do yet + + const int size = (simple_tags_size_ == 0) ? 
1 : 2 * simple_tags_size_; + + SimpleTag* const simple_tags = new (std::nothrow) SimpleTag[size]; // NOLINT + if (simple_tags == NULL) + return false; + + for (int idx = 0; idx < simple_tags_count_; ++idx) { + simple_tags[idx] = simple_tags_[idx]; // shallow copy + } + + delete[] simple_tags_; + + simple_tags_ = simple_tags; + simple_tags_size_ = size; + + return true; +} + +uint64_t Tag::Write(IMkvWriter* writer) const { + uint64_t payload_size = 0; + + for (int idx = 0; idx < simple_tags_count_; ++idx) { + const SimpleTag& st = simple_tags_[idx]; + payload_size += st.Write(NULL); + } + + const uint64_t tag_size = + EbmlMasterElementSize(libwebm::kMkvTag, payload_size) + payload_size; + + if (writer == NULL) + return tag_size; + + const int64_t start = writer->Position(); + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvTag, payload_size)) + return 0; + + for (int idx = 0; idx < simple_tags_count_; ++idx) { + const SimpleTag& st = simple_tags_[idx]; + + if (!st.Write(writer)) + return 0; + } + + const int64_t stop = writer->Position(); + + if (stop >= start && uint64_t(stop - start) != tag_size) + return 0; + + return tag_size; +} + +// Tag::SimpleTag + +void Tag::SimpleTag::Init() { + tag_name_ = NULL; + tag_string_ = NULL; +} + +void Tag::SimpleTag::Clear() { + StrCpy(NULL, &tag_name_); + StrCpy(NULL, &tag_string_); +} + +bool Tag::SimpleTag::set_tag_name(const char* tag_name) { + return StrCpy(tag_name, &tag_name_); +} + +bool Tag::SimpleTag::set_tag_string(const char* tag_string) { + return StrCpy(tag_string, &tag_string_); +} + +uint64_t Tag::SimpleTag::Write(IMkvWriter* writer) const { + uint64_t payload_size = EbmlElementSize(libwebm::kMkvTagName, tag_name_); + + payload_size += EbmlElementSize(libwebm::kMkvTagString, tag_string_); + + const uint64_t simple_tag_size = + EbmlMasterElementSize(libwebm::kMkvSimpleTag, payload_size) + + payload_size; + + if (writer == NULL) + return simple_tag_size; + + const int64_t start = writer->Position(); + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvSimpleTag, payload_size)) + return 0; + + if (!WriteEbmlElement(writer, libwebm::kMkvTagName, tag_name_)) + return 0; + + if (!WriteEbmlElement(writer, libwebm::kMkvTagString, tag_string_)) + return 0; + + const int64_t stop = writer->Position(); + + if (stop >= start && uint64_t(stop - start) != simple_tag_size) + return 0; + + return simple_tag_size; +} + +// Tags Class + +Tags::Tags() : tags_size_(0), tags_count_(0), tags_(NULL) {} + +Tags::~Tags() { + while (tags_count_ > 0) { + Tag& tag = tags_[--tags_count_]; + tag.Clear(); + } + + delete[] tags_; + tags_ = NULL; +} + +int Tags::Count() const { return tags_count_; } + +Tag* Tags::AddTag() { + if (!ExpandTagsArray()) + return NULL; + + Tag& tag = tags_[tags_count_++]; + + return &tag; +} + +bool Tags::Write(IMkvWriter* writer) const { + if (writer == NULL) + return false; + + uint64_t payload_size = 0; + + for (int idx = 0; idx < tags_count_; ++idx) { + const Tag& tag = tags_[idx]; + payload_size += tag.Write(NULL); + } + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvTags, payload_size)) + return false; + + const int64_t start = writer->Position(); + + for (int idx = 0; idx < tags_count_; ++idx) { + const Tag& tag = tags_[idx]; + + const uint64_t tag_size = tag.Write(writer); + if (tag_size == 0) // error + return 0; + } + + const int64_t stop = writer->Position(); + + if (stop >= start && uint64_t(stop - start) != payload_size) + return false; + + return true; +} + +bool Tags::ExpandTagsArray() { + if (tags_size_ > tags_count_) 
+ return true; // nothing to do yet + + const int size = (tags_size_ == 0) ? 1 : 2 * tags_size_; + + Tag* const tags = new (std::nothrow) Tag[size]; // NOLINT + if (tags == NULL) + return false; + + for (int idx = 0; idx < tags_count_; ++idx) { + const Tag& src = tags_[idx]; + Tag* const dst = tags + idx; + src.ShallowCopy(dst); + } + + delete[] tags_; + + tags_ = tags; + tags_size_ = size; + + return true; +} + +/////////////////////////////////////////////////////////////// +// +// Cluster class + +Cluster::Cluster(uint64_t timecode, int64_t cues_pos, uint64_t timecode_scale, + bool write_last_frame_with_duration, bool fixed_size_timecode) + : blocks_added_(0), + finalized_(false), + fixed_size_timecode_(fixed_size_timecode), + header_written_(false), + payload_size_(0), + position_for_cues_(cues_pos), + size_position_(-1), + timecode_(timecode), + timecode_scale_(timecode_scale), + write_last_frame_with_duration_(write_last_frame_with_duration), + writer_(NULL) {} + +Cluster::~Cluster() { + // Delete any stored frames that are left behind. This will happen if the + // Cluster was not Finalized for whatever reason. + while (!stored_frames_.empty()) { + while (!stored_frames_.begin()->second.empty()) { + delete stored_frames_.begin()->second.front(); + stored_frames_.begin()->second.pop_front(); + } + stored_frames_.erase(stored_frames_.begin()->first); + } +} + +bool Cluster::Init(IMkvWriter* ptr_writer) { + if (!ptr_writer) { + return false; + } + writer_ = ptr_writer; + return true; +} + +bool Cluster::AddFrame(const Frame* const frame) { + return QueueOrWriteFrame(frame); +} + +bool Cluster::AddFrame(const uint8_t* data, uint64_t length, + uint64_t track_number, uint64_t abs_timecode, + bool is_key) { + Frame frame; + if (!frame.Init(data, length)) + return false; + frame.set_track_number(track_number); + frame.set_timestamp(abs_timecode); + frame.set_is_key(is_key); + return QueueOrWriteFrame(&frame); +} + +bool Cluster::AddFrameWithAdditional(const uint8_t* data, uint64_t length, + const uint8_t* additional, + uint64_t additional_length, + uint64_t add_id, uint64_t track_number, + uint64_t abs_timecode, bool is_key) { + if (!additional || additional_length == 0) { + return false; + } + Frame frame; + if (!frame.Init(data, length) || + !frame.AddAdditionalData(additional, additional_length, add_id)) { + return false; + } + frame.set_track_number(track_number); + frame.set_timestamp(abs_timecode); + frame.set_is_key(is_key); + return QueueOrWriteFrame(&frame); +} + +bool Cluster::AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length, + int64_t discard_padding, + uint64_t track_number, + uint64_t abs_timecode, bool is_key) { + Frame frame; + if (!frame.Init(data, length)) + return false; + frame.set_discard_padding(discard_padding); + frame.set_track_number(track_number); + frame.set_timestamp(abs_timecode); + frame.set_is_key(is_key); + return QueueOrWriteFrame(&frame); +} + +bool Cluster::AddMetadata(const uint8_t* data, uint64_t length, + uint64_t track_number, uint64_t abs_timecode, + uint64_t duration_timecode) { + Frame frame; + if (!frame.Init(data, length)) + return false; + frame.set_track_number(track_number); + frame.set_timestamp(abs_timecode); + frame.set_duration(duration_timecode); + frame.set_is_key(true); // All metadata blocks are keyframes. 
+ return QueueOrWriteFrame(&frame); +} + +void Cluster::AddPayloadSize(uint64_t size) { payload_size_ += size; } + +bool Cluster::Finalize() { + return !write_last_frame_with_duration_ && Finalize(false, 0); +} + +bool Cluster::Finalize(bool set_last_frame_duration, uint64_t duration) { + if (!writer_ || finalized_) + return false; + + if (write_last_frame_with_duration_) { + // Write out held back Frames. This essentially performs a k-way merge + // across all tracks in the increasing order of timestamps. + while (!stored_frames_.empty()) { + Frame* frame = stored_frames_.begin()->second.front(); + + // Get the next frame to write (frame with least timestamp across all + // tracks). + for (FrameMapIterator frames_iterator = ++stored_frames_.begin(); + frames_iterator != stored_frames_.end(); ++frames_iterator) { + if (frames_iterator->second.front()->timestamp() < frame->timestamp()) { + frame = frames_iterator->second.front(); + } + } + + // Set the duration if it's the last frame for the track. + if (set_last_frame_duration && + stored_frames_[frame->track_number()].size() == 1 && + !frame->duration_set()) { + frame->set_duration(duration - frame->timestamp()); + if (!frame->is_key() && !frame->reference_block_timestamp_set()) { + frame->set_reference_block_timestamp( + last_block_timestamp_[frame->track_number()]); + } + } + + // Write the frame and remove it from |stored_frames_|. + const bool wrote_frame = DoWriteFrame(frame); + stored_frames_[frame->track_number()].pop_front(); + if (stored_frames_[frame->track_number()].empty()) { + stored_frames_.erase(frame->track_number()); + } + delete frame; + if (!wrote_frame) + return false; + } + } + + if (size_position_ == -1) + return false; + + if (writer_->Seekable()) { + const int64_t pos = writer_->Position(); + + if (writer_->Position(size_position_)) + return false; + + if (WriteUIntSize(writer_, payload_size(), 8)) + return false; + + if (writer_->Position(pos)) + return false; + } + + finalized_ = true; + + return true; +} + +uint64_t Cluster::Size() const { + const uint64_t element_size = + EbmlMasterElementSize(libwebm::kMkvCluster, 0xFFFFFFFFFFFFFFFFULL) + + payload_size_; + return element_size; +} + +bool Cluster::PreWriteBlock() { + if (finalized_) + return false; + + if (!header_written_) { + if (!WriteClusterHeader()) + return false; + } + + return true; +} + +void Cluster::PostWriteBlock(uint64_t element_size) { + AddPayloadSize(element_size); + ++blocks_added_; +} + +int64_t Cluster::GetRelativeTimecode(int64_t abs_timecode) const { + const int64_t cluster_timecode = this->Cluster::timecode(); + const int64_t rel_timecode = + static_cast(abs_timecode) - cluster_timecode; + + if (rel_timecode < 0 || rel_timecode > kMaxBlockTimecode) + return -1; + + return rel_timecode; +} + +bool Cluster::DoWriteFrame(const Frame* const frame) { + if (!frame || !frame->IsValid()) + return false; + + if (!PreWriteBlock()) + return false; + + const uint64_t element_size = WriteFrame(writer_, frame, this); + if (element_size == 0) + return false; + + PostWriteBlock(element_size); + last_block_timestamp_[frame->track_number()] = frame->timestamp(); + return true; +} + +bool Cluster::QueueOrWriteFrame(const Frame* const frame) { + if (!frame || !frame->IsValid()) + return false; + + // If |write_last_frame_with_duration_| is not set, then write the frame right + // away. + if (!write_last_frame_with_duration_) { + return DoWriteFrame(frame); + } + + // Queue the current frame. 
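+  // (Illustrative note, not part of the original source: the invariant
+  // enforced below is that a queued frame may be written once every other
+  // track's oldest queued frame has a timestamp >= its own. For example, if
+  // track 1 holds frames at t=10 and t=20 while track 2 holds t=15, then
+  // t=10 may be written out (15 >= 10), but t=20 stays queued, since track 2
+  // could still supply a frame at t < 20.)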
+  uint64_t track_number = frame->track_number();
+  Frame* const frame_to_store = new Frame();
+  frame_to_store->CopyFrom(*frame);
+  stored_frames_[track_number].push_back(frame_to_store);
+
+  // Iterate through all queued frames in the current track except the last
+  // one and write each if it is okay to do so (i.e., no other track has a
+  // held-back frame with timestamp <= the timestamp of the frame in question).
+  std::vector<std::list<Frame*>::iterator> frames_to_erase;
+  for (std::list<Frame*>::iterator
+           current_track_iterator = stored_frames_[track_number].begin(),
+           end = --stored_frames_[track_number].end();
+       current_track_iterator != end; ++current_track_iterator) {
+    const Frame* const frame_to_write = *current_track_iterator;
+    bool okay_to_write = true;
+    for (FrameMapIterator track_iterator = stored_frames_.begin();
+         track_iterator != stored_frames_.end(); ++track_iterator) {
+      if (track_iterator->first == track_number) {
+        continue;
+      }
+      if (track_iterator->second.front()->timestamp() <
+          frame_to_write->timestamp()) {
+        okay_to_write = false;
+        break;
+      }
+    }
+    if (okay_to_write) {
+      const bool wrote_frame = DoWriteFrame(frame_to_write);
+      delete frame_to_write;
+      if (!wrote_frame)
+        return false;
+      frames_to_erase.push_back(current_track_iterator);
+    } else {
+      break;
+    }
+  }
+  for (std::vector<std::list<Frame*>::iterator>::iterator iterator =
+           frames_to_erase.begin();
+       iterator != frames_to_erase.end(); ++iterator) {
+    stored_frames_[track_number].erase(*iterator);
+  }
+  return true;
+}
+
+bool Cluster::WriteClusterHeader() {
+  if (finalized_)
+    return false;
+
+  if (WriteID(writer_, libwebm::kMkvCluster))
+    return false;
+
+  // Save for later.
+  size_position_ = writer_->Position();
+
+  // Write "unknown" (EBML coded -1) as cluster size value. We need to write 8
+  // bytes because we do not know how big our cluster will be.
+  if (SerializeInt(writer_, kEbmlUnknownValue, 8))
+    return false;
+
+  if (!WriteEbmlElement(writer_, libwebm::kMkvTimecode, timecode(),
+                        fixed_size_timecode_ ? 8 : 0)) {
+    return false;
+  }
+  AddPayloadSize(EbmlElementSize(libwebm::kMkvTimecode, timecode(),
+                                 fixed_size_timecode_ ? 8 : 0));
+  header_written_ = true;
+
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// SeekHead Class
+
+SeekHead::SeekHead() : start_pos_(0ULL) {
+  for (int32_t i = 0; i < kSeekEntryCount; ++i) {
+    seek_entry_id_[i] = 0;
+    seek_entry_pos_[i] = 0;
+  }
+}
+
+SeekHead::~SeekHead() {}
+
+bool SeekHead::Finalize(IMkvWriter* writer) const {
+  if (writer->Seekable()) {
+    if (start_pos_ == -1)
+      return false;
+
+    uint64_t payload_size = 0;
+    uint64_t entry_size[kSeekEntryCount];
+
+    for (int32_t i = 0; i < kSeekEntryCount; ++i) {
+      if (seek_entry_id_[i] != 0) {
+        entry_size[i] = EbmlElementSize(
+            libwebm::kMkvSeekID, static_cast<uint64>(seek_entry_id_[i]));
+        entry_size[i] += EbmlElementSize(
+            libwebm::kMkvSeekPosition, static_cast<uint64>(seek_entry_pos_[i]));
+
+        payload_size +=
+            EbmlMasterElementSize(libwebm::kMkvSeek, entry_size[i]) +
+            entry_size[i];
+      }
+    }
+
+    // No SeekHead elements.
+    if (payload_size == 0)
+      return true;
+
+    const int64_t pos = writer->Position();
+    if (writer->Position(start_pos_))
+      return false;
+
+    if (!WriteEbmlMasterElement(writer, libwebm::kMkvSeekHead, payload_size))
+      return false;
+
+    for (int32_t i = 0; i < kSeekEntryCount; ++i) {
+      if (seek_entry_id_[i] != 0) {
+        if (!WriteEbmlMasterElement(writer, libwebm::kMkvSeek, entry_size[i]))
+          return false;
+
+        if (!WriteEbmlElement(writer, libwebm::kMkvSeekID,
+                              static_cast<uint64>(seek_entry_id_[i])))
+          return false;
+
+        if (!WriteEbmlElement(writer, libwebm::kMkvSeekPosition,
+                              static_cast<uint64>(seek_entry_pos_[i])))
+          return false;
+      }
+    }
+
+    const uint64_t total_entry_size = kSeekEntryCount * MaxEntrySize();
+    const uint64_t total_size =
+        EbmlMasterElementSize(libwebm::kMkvSeekHead, total_entry_size) +
+        total_entry_size;
+    const int64_t size_left = total_size - (writer->Position() - start_pos_);
+
+    const uint64_t bytes_written = WriteVoidElement(writer, size_left);
+    if (!bytes_written)
+      return false;
+
+    if (writer->Position(pos))
+      return false;
+  }
+
+  return true;
+}
+
+bool SeekHead::Write(IMkvWriter* writer) {
+  const uint64_t entry_size = kSeekEntryCount * MaxEntrySize();
+  const uint64_t size =
+      EbmlMasterElementSize(libwebm::kMkvSeekHead, entry_size);
+
+  start_pos_ = writer->Position();
+
+  const uint64_t bytes_written = WriteVoidElement(writer, size + entry_size);
+  if (!bytes_written)
+    return false;
+
+  return true;
+}
+
+bool SeekHead::AddSeekEntry(uint32_t id, uint64_t pos) {
+  for (int32_t i = 0; i < kSeekEntryCount; ++i) {
+    if (seek_entry_id_[i] == 0) {
+      seek_entry_id_[i] = id;
+      seek_entry_pos_[i] = pos;
+      return true;
+    }
+  }
+  return false;
+}
+
+uint32_t SeekHead::GetId(int index) const {
+  if (index < 0 || index >= kSeekEntryCount)
+    return UINT_MAX;
+  return seek_entry_id_[index];
+}
+
+uint64_t SeekHead::GetPosition(int index) const {
+  if (index < 0 || index >= kSeekEntryCount)
+    return ULLONG_MAX;
+  return seek_entry_pos_[index];
+}
+
+bool SeekHead::SetSeekEntry(int index, uint32_t id, uint64_t position) {
+  if (index < 0 || index >= kSeekEntryCount)
+    return false;
+  seek_entry_id_[index] = id;
+  seek_entry_pos_[index] = position;
+  return true;
+}
+
+uint64_t SeekHead::MaxEntrySize() const {
+  const uint64_t max_entry_payload_size =
+      EbmlElementSize(libwebm::kMkvSeekID,
+                      static_cast<uint64>(UINT64_C(0xffffffff))) +
+      EbmlElementSize(libwebm::kMkvSeekPosition,
+                      static_cast<uint64>(UINT64_C(0xffffffffffffffff)));
+  const uint64_t max_entry_size =
+      EbmlMasterElementSize(libwebm::kMkvSeek, max_entry_payload_size) +
+      max_entry_payload_size;
+
+  return max_entry_size;
+}
+
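SeekHead is a two-phase protocol: Write() only reserves a fixed-size hole (a Void element big enough for kSeekEntryCount worst-case entries), and Finalize() later seeks back, writes the real entries, and re-voids the slack. A rough self-contained sketch of that reserve-and-backpatch pattern, reduced to a byte buffer with invented sizes, follows:

    // Sketch only: reserve-then-backpatch over a plain byte buffer.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
      std::vector<uint8_t> file;

      // Phase 1 (cf. SeekHead::Write): reserve a worst-case hole up front.
      const size_t reserved_pos = file.size();
      const size_t reserved_size = 32;  // cf. kSeekEntryCount * MaxEntrySize()
      file.insert(file.end(), reserved_size, 0xEC);  // 0xEC is the EBML Void ID byte.

      // ...clusters and cues are appended here, so their offsets become known...
      file.insert(file.end(), {0xAA, 0xBB, 0xCC});

      // Phase 2 (cf. SeekHead::Finalize): seek back and fill in the real
      // entries, leaving the remainder of the hole voided.
      const uint8_t entries[] = {0x11, 0x22, 0x33, 0x44};
      std::memcpy(&file[reserved_pos], entries, sizeof(entries));

      std::printf("patched %zu of %zu reserved bytes; %zu stay void\n",
                  sizeof(entries), reserved_size, reserved_size - sizeof(entries));
      return 0;
    }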
+///////////////////////////////////////////////////////////////
+//
+// SegmentInfo Class
+
+SegmentInfo::SegmentInfo()
+    : duration_(-1.0),
+      muxing_app_(NULL),
+      timecode_scale_(1000000ULL),
+      writing_app_(NULL),
+      date_utc_(LLONG_MIN),
+      duration_pos_(-1) {}
+
+SegmentInfo::~SegmentInfo() {
+  delete[] muxing_app_;
+  delete[] writing_app_;
+}
+
+bool SegmentInfo::Init() {
+  int32_t major;
+  int32_t minor;
+  int32_t build;
+  int32_t revision;
+  GetVersion(&major, &minor, &build, &revision);
+  char temp[256];
+#ifdef _MSC_VER
+  sprintf_s(temp, sizeof(temp) / sizeof(temp[0]), "libwebm-%d.%d.%d.%d", major,
+            minor, build, revision);
+#else
+  snprintf(temp, sizeof(temp) / sizeof(temp[0]), "libwebm-%d.%d.%d.%d", major,
+           minor, build, revision);
+#endif
+
+  const size_t app_len = strlen(temp) + 1;
+
+  delete[] muxing_app_;
+
+  muxing_app_ = new (std::nothrow) char[app_len];  // NOLINT
+  if (!muxing_app_)
+    return false;
+
+#ifdef _MSC_VER
+  strcpy_s(muxing_app_, app_len, temp);
+#else
+  strcpy(muxing_app_, temp);
+#endif
+
+  set_writing_app(temp);
+  if (!writing_app_)
+    return false;
+  return true;
+}
+
+bool SegmentInfo::Finalize(IMkvWriter* writer) const {
+  if (!writer)
+    return false;
+
+  if (duration_ > 0.0) {
+    if (writer->Seekable()) {
+      if (duration_pos_ == -1)
+        return false;
+
+      const int64_t pos = writer->Position();
+
+      if (writer->Position(duration_pos_))
+        return false;
+
+      if (!WriteEbmlElement(writer, libwebm::kMkvDuration,
+                            static_cast<float>(duration_)))
+        return false;
+
+      if (writer->Position(pos))
+        return false;
+    }
+  }
+
+  return true;
+}
+
+bool SegmentInfo::Write(IMkvWriter* writer) {
+  if (!writer || !muxing_app_ || !writing_app_)
+    return false;
+
+  uint64_t size = EbmlElementSize(libwebm::kMkvTimecodeScale,
+                                  static_cast<uint64>(timecode_scale_));
+  if (duration_ > 0.0)
+    size +=
+        EbmlElementSize(libwebm::kMkvDuration, static_cast<float>(duration_));
+  if (date_utc_ != LLONG_MIN)
+    size += EbmlDateElementSize(libwebm::kMkvDateUTC);
+  size += EbmlElementSize(libwebm::kMkvMuxingApp, muxing_app_);
+  size += EbmlElementSize(libwebm::kMkvWritingApp, writing_app_);
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvInfo, size))
+    return false;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvTimecodeScale,
+                        static_cast<uint64>(timecode_scale_)))
+    return false;
+
+  if (duration_ > 0.0) {
+    // Save for later.
+    duration_pos_ = writer->Position();
+
+    if (!WriteEbmlElement(writer, libwebm::kMkvDuration,
+                          static_cast<float>(duration_)))
+      return false;
+  }
+
+  if (date_utc_ != LLONG_MIN)
+    WriteEbmlDateElement(writer, libwebm::kMkvDateUTC, date_utc_);
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvMuxingApp, muxing_app_))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvWritingApp, writing_app_))
+    return false;
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64_t>(size))
+    return false;
+
+  return true;
+}
+
+void SegmentInfo::set_muxing_app(const char* app) {
+  if (app) {
+    const size_t length = strlen(app) + 1;
+    char* temp_str = new (std::nothrow) char[length];  // NOLINT
+    if (!temp_str)
+      return;
+
+#ifdef _MSC_VER
+    strcpy_s(temp_str, length, app);
+#else
+    strcpy(temp_str, app);
+#endif
+
+    delete[] muxing_app_;
+    muxing_app_ = temp_str;
+  }
+}
+
+void SegmentInfo::set_writing_app(const char* app) {
+  if (app) {
+    const size_t length = strlen(app) + 1;
+    char* temp_str = new (std::nothrow) char[length];  // NOLINT
+
if (!temp_str) + return; + +#ifdef _MSC_VER + strcpy_s(temp_str, length, app); +#else + strcpy(temp_str, app); +#endif + + delete[] writing_app_; + writing_app_ = temp_str; + } +} + +/////////////////////////////////////////////////////////////// +// +// Segment Class + +Segment::Segment() + : chunk_count_(0), + chunk_name_(NULL), + chunk_writer_cluster_(NULL), + chunk_writer_cues_(NULL), + chunk_writer_header_(NULL), + chunking_(false), + chunking_base_name_(NULL), + cluster_list_(NULL), + cluster_list_capacity_(0), + cluster_list_size_(0), + cues_position_(kAfterClusters), + cues_track_(0), + force_new_cluster_(false), + frames_(NULL), + frames_capacity_(0), + frames_size_(0), + has_video_(false), + header_written_(false), + last_block_duration_(0), + last_timestamp_(0), + max_cluster_duration_(kDefaultMaxClusterDuration), + max_cluster_size_(0), + mode_(kFile), + new_cuepoint_(false), + output_cues_(true), + accurate_cluster_duration_(false), + fixed_size_cluster_timecode_(false), + estimate_file_duration_(false), + payload_pos_(0), + size_position_(0), + doc_type_version_(kDefaultDocTypeVersion), + doc_type_version_written_(0), + duration_(0.0), + writer_cluster_(NULL), + writer_cues_(NULL), + writer_header_(NULL) { + const time_t curr_time = time(NULL); + seed_ = static_cast(curr_time); +#ifdef _WIN32 + srand(seed_); +#endif +} + +Segment::~Segment() { + if (cluster_list_) { + for (int32_t i = 0; i < cluster_list_size_; ++i) { + Cluster* const cluster = cluster_list_[i]; + delete cluster; + } + delete[] cluster_list_; + } + + if (frames_) { + for (int32_t i = 0; i < frames_size_; ++i) { + Frame* const frame = frames_[i]; + delete frame; + } + delete[] frames_; + } + + delete[] chunk_name_; + delete[] chunking_base_name_; + + if (chunk_writer_cluster_) { + chunk_writer_cluster_->Close(); + delete chunk_writer_cluster_; + } + if (chunk_writer_cues_) { + chunk_writer_cues_->Close(); + delete chunk_writer_cues_; + } + if (chunk_writer_header_) { + chunk_writer_header_->Close(); + delete chunk_writer_header_; + } +} + +void Segment::MoveCuesBeforeClustersHelper(uint64_t diff, int32_t index, + uint64_t* cues_size) { + CuePoint* const cue_point = cues_.GetCueByIndex(index); + if (cue_point == NULL) + return; + const uint64_t old_cue_point_size = cue_point->Size(); + const uint64_t cluster_pos = cue_point->cluster_pos() + diff; + cue_point->set_cluster_pos(cluster_pos); // update the new cluster position + // New size of the cue is computed as follows + // Let a = current sum of size of all CuePoints + // Let b = Increase in Cue Point's size due to this iteration + // Let c = Increase in size of Cues Element's length due to this iteration + // (This is computed as CodedSize(a + b) - CodedSize(a)) + // Let d = b + c. Now d is the |diff| passed to the next recursive call. + // Let e = a + b. Now e is the |cues_size| passed to the next recursive + // call. 
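+  // (Illustrative note, not part of the original source: a worked instance
+  // of the recurrence above, assuming libwebm's GetCodedUIntSize thresholds,
+  // where values up to 126 code in one byte. If a = 126 and this pass grows
+  // the cue point by b = 1, then CodedSize(127) - CodedSize(126) = 2 - 1
+  // gives c = 1, so d = b + c = 2 is the |diff| for the next pass and
+  // e = a + b = 127 is its |cues_size|.)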
+ const uint64_t cue_point_size_diff = cue_point->Size() - old_cue_point_size; + const uint64_t cue_size_diff = + GetCodedUIntSize(*cues_size + cue_point_size_diff) - + GetCodedUIntSize(*cues_size); + *cues_size += cue_point_size_diff; + diff = cue_size_diff + cue_point_size_diff; + if (diff > 0) { + for (int32_t i = 0; i < cues_.cue_entries_size(); ++i) { + MoveCuesBeforeClustersHelper(diff, i, cues_size); + } + } +} + +void Segment::MoveCuesBeforeClusters() { + const uint64_t current_cue_size = cues_.Size(); + uint64_t cue_size = 0; + for (int32_t i = 0; i < cues_.cue_entries_size(); ++i) + cue_size += cues_.GetCueByIndex(i)->Size(); + for (int32_t i = 0; i < cues_.cue_entries_size(); ++i) + MoveCuesBeforeClustersHelper(current_cue_size, i, &cue_size); + + // Adjust the Seek Entry to reflect the change in position + // of Cluster and Cues + int32_t cluster_index = 0; + int32_t cues_index = 0; + for (int32_t i = 0; i < SeekHead::kSeekEntryCount; ++i) { + if (seek_head_.GetId(i) == libwebm::kMkvCluster) + cluster_index = i; + if (seek_head_.GetId(i) == libwebm::kMkvCues) + cues_index = i; + } + seek_head_.SetSeekEntry(cues_index, libwebm::kMkvCues, + seek_head_.GetPosition(cluster_index)); + seek_head_.SetSeekEntry(cluster_index, libwebm::kMkvCluster, + cues_.Size() + seek_head_.GetPosition(cues_index)); +} + +bool Segment::Init(IMkvWriter* ptr_writer) { + if (!ptr_writer) { + return false; + } + writer_cluster_ = ptr_writer; + writer_cues_ = ptr_writer; + writer_header_ = ptr_writer; + memset(&track_frames_written_, 0, + sizeof(track_frames_written_[0]) * kMaxTrackNumber); + memset(&last_track_timestamp_, 0, + sizeof(last_track_timestamp_[0]) * kMaxTrackNumber); + return segment_info_.Init(); +} + +bool Segment::CopyAndMoveCuesBeforeClusters(mkvparser::IMkvReader* reader, + IMkvWriter* writer) { + if (!writer->Seekable() || chunking_) + return false; + const int64_t cluster_offset = + cluster_list_[0]->size_position() - GetUIntSize(libwebm::kMkvCluster); + + // Copy the headers. + if (!ChunkedCopy(reader, writer, 0, cluster_offset)) + return false; + + // Recompute cue positions and seek entries. + MoveCuesBeforeClusters(); + + // Write cues and seek entries. + // TODO(vigneshv): As of now, it's safe to call seek_head_.Finalize() for the + // second time with a different writer object. But the name Finalize() doesn't + // indicate something we want to call more than once. So consider renaming it + // to write() or some such. + if (!cues_.Write(writer) || !seek_head_.Finalize(writer)) + return false; + + // Copy the Clusters. + if (!ChunkedCopy(reader, writer, cluster_offset, + cluster_end_offset_ - cluster_offset)) + return false; + + // Update the Segment size in case the Cues size has changed. + const int64_t pos = writer->Position(); + const int64_t segment_size = writer->Position() - payload_pos_; + if (writer->Position(size_position_) || + WriteUIntSize(writer, segment_size, 8) || writer->Position(pos)) + return false; + return true; +} + +bool Segment::Finalize() { + if (WriteFramesAll() < 0) + return false; + + // In kLive mode, call Cluster::Finalize only if |accurate_cluster_duration_| + // is set. In all other modes, always call Cluster::Finalize. + if ((mode_ == kLive ? 
accurate_cluster_duration_ : true) && + cluster_list_size_ > 0) { + // Update last cluster's size + Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1]; + + // For the last frame of the last Cluster, we don't write it as a BlockGroup + // with Duration unless the frame itself has duration set explicitly. + if (!old_cluster || !old_cluster->Finalize(false, 0)) + return false; + } + + if (mode_ == kFile) { + if (chunking_ && chunk_writer_cluster_) { + chunk_writer_cluster_->Close(); + chunk_count_++; + } + + double duration = + (static_cast<double>(last_timestamp_) + last_block_duration_) / + segment_info_.timecode_scale(); + if (duration_ > 0.0) { + duration = duration_; + } else { + if (last_block_duration_ == 0 && estimate_file_duration_) { + const int num_tracks = static_cast<int>(tracks_.track_entries_size()); + for (int i = 0; i < num_tracks; ++i) { + if (track_frames_written_[i] < 2) + continue; + + // Estimate the duration for the last block of a Track. + const double nano_per_frame = + static_cast<double>(last_track_timestamp_[i]) / + (track_frames_written_[i] - 1); + const double track_duration = + (last_track_timestamp_[i] + nano_per_frame) / + segment_info_.timecode_scale(); + if (track_duration > duration) + duration = track_duration; + } + } + } + segment_info_.set_duration(duration); + if (!segment_info_.Finalize(writer_header_)) + return false; + + if (output_cues_) + if (!seek_head_.AddSeekEntry(libwebm::kMkvCues, MaxOffset())) + return false; + + if (chunking_) { + if (!chunk_writer_cues_) + return false; + + char* name = NULL; + if (!UpdateChunkName("cues", &name)) + return false; + + const bool cues_open = chunk_writer_cues_->Open(name); + delete[] name; + if (!cues_open) + return false; + } + + cluster_end_offset_ = writer_cluster_->Position(); + + // Write the seek headers and cues + if (output_cues_) + if (!cues_.Write(writer_cues_)) + return false; + + if (!seek_head_.Finalize(writer_header_)) + return false; + + if (writer_header_->Seekable()) { + if (size_position_ == -1) + return false; + + const int64_t segment_size = MaxOffset(); + if (segment_size < 1) + return false; + + const int64_t pos = writer_header_->Position(); + UpdateDocTypeVersion(); + if (doc_type_version_ != doc_type_version_written_) { + if (writer_header_->Position(0)) + return false; + + const char* const doc_type = + DocTypeIsWebm() ? kDocTypeWebm : kDocTypeMatroska; + if (!WriteEbmlHeader(writer_header_, doc_type_version_, doc_type)) + return false; + if (writer_header_->Position() != ebml_header_size_) + return false; + + doc_type_version_written_ = doc_type_version_; + } + + if (writer_header_->Position(size_position_)) + return false; + + if (WriteUIntSize(writer_header_, segment_size, 8)) + return false; + + if (writer_header_->Position(pos)) + return false; + } + + if (chunking_) { + // Do not close any writers until the segment size has been written, + // otherwise the size may be off.
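+ // (The cluster chunk writer was already closed near the top of this + // function, once the last cluster was finalized; only the cues and + // header chunk writers remain open at this point.)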
+ if (!chunk_writer_cues_ || !chunk_writer_header_) + return false; + + chunk_writer_cues_->Close(); + chunk_writer_header_->Close(); + } + } + + return true; +} + +Track* Segment::AddTrack(int32_t number) { + Track* const track = new (std::nothrow) Track(&seed_); // NOLINT + + if (!track) + return NULL; + + if (!tracks_.AddTrack(track, number)) { + delete track; + return NULL; + } + + return track; +} + +Chapter* Segment::AddChapter() { return chapters_.AddChapter(&seed_); } + +Tag* Segment::AddTag() { return tags_.AddTag(); } + +uint64_t Segment::AddVideoTrack(int32_t width, int32_t height, int32_t number) { + VideoTrack* const track = new (std::nothrow) VideoTrack(&seed_); // NOLINT + if (!track) + return 0; + + track->set_type(Tracks::kVideo); + track->set_codec_id(Tracks::kVp8CodecId); + track->set_width(width); + track->set_height(height); + + if (!tracks_.AddTrack(track, number)) { + delete track; + return 0; + } + has_video_ = true; + + return track->number(); +} + +bool Segment::AddCuePoint(uint64_t timestamp, uint64_t track) { + if (cluster_list_size_ < 1) + return false; + + const Cluster* const cluster = cluster_list_[cluster_list_size_ - 1]; + if (!cluster) + return false; + + CuePoint* const cue = new (std::nothrow) CuePoint(); // NOLINT + if (!cue) + return false; + + cue->set_time(timestamp / segment_info_.timecode_scale()); + cue->set_block_number(cluster->blocks_added()); + cue->set_cluster_pos(cluster->position_for_cues()); + cue->set_track(track); + if (!cues_.AddCue(cue)) { + delete cue; + return false; + } + + new_cuepoint_ = false; + return true; +} + +uint64_t Segment::AddAudioTrack(int32_t sample_rate, int32_t channels, + int32_t number) { + AudioTrack* const track = new (std::nothrow) AudioTrack(&seed_); // NOLINT + if (!track) + return 0; + + track->set_type(Tracks::kAudio); + track->set_codec_id(Tracks::kVorbisCodecId); + track->set_sample_rate(sample_rate); + track->set_channels(channels); + + if (!tracks_.AddTrack(track, number)) { + delete track; + return 0; + } + + return track->number(); +} + +bool Segment::AddFrame(const uint8_t* data, uint64_t length, + uint64_t track_number, uint64_t timestamp, bool is_key) { + if (!data) + return false; + + Frame frame; + if (!frame.Init(data, length)) + return false; + frame.set_track_number(track_number); + frame.set_timestamp(timestamp); + frame.set_is_key(is_key); + return AddGenericFrame(&frame); +} + +bool Segment::AddFrameWithAdditional(const uint8_t* data, uint64_t length, + const uint8_t* additional, + uint64_t additional_length, + uint64_t add_id, uint64_t track_number, + uint64_t timestamp, bool is_key) { + if (!data || !additional) + return false; + + Frame frame; + if (!frame.Init(data, length) || + !frame.AddAdditionalData(additional, additional_length, add_id)) { + return false; + } + frame.set_track_number(track_number); + frame.set_timestamp(timestamp); + frame.set_is_key(is_key); + return AddGenericFrame(&frame); +} + +bool Segment::AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length, + int64_t discard_padding, + uint64_t track_number, + uint64_t timestamp, bool is_key) { + if (!data) + return false; + + Frame frame; + if (!frame.Init(data, length)) + return false; + frame.set_discard_padding(discard_padding); + frame.set_track_number(track_number); + frame.set_timestamp(timestamp); + frame.set_is_key(is_key); + return AddGenericFrame(&frame); +} + +bool Segment::AddMetadata(const uint8_t* data, uint64_t length, + uint64_t track_number, uint64_t timestamp_ns, + uint64_t duration_ns) { + if 
(!data) + return false; + + Frame frame; + if (!frame.Init(data, length)) + return false; + frame.set_track_number(track_number); + frame.set_timestamp(timestamp_ns); + frame.set_duration(duration_ns); + frame.set_is_key(true); // All metadata blocks are keyframes. + return AddGenericFrame(&frame); +} + +bool Segment::AddGenericFrame(const Frame* frame) { + if (!frame) + return false; + + if (!CheckHeaderInfo()) + return false; + + // Check for non-monotonically increasing timestamps. + if (frame->timestamp() < last_timestamp_) + return false; + + // Check if the track number is valid. + if (!tracks_.GetTrackByNumber(frame->track_number())) + return false; + + if (frame->discard_padding() != 0) + doc_type_version_ = 4; + + if (cluster_list_size_ > 0) { + const uint64_t timecode_scale = segment_info_.timecode_scale(); + const uint64_t frame_timecode = frame->timestamp() / timecode_scale; + + const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1]; + const uint64_t last_cluster_timecode = last_cluster->timecode(); + + const uint64_t rel_timecode = frame_timecode - last_cluster_timecode; + if (rel_timecode > kMaxBlockTimecode) { + force_new_cluster_ = true; + } + } + + // If the segment has a video track hold onto audio frames to make sure the + // audio that is associated with the start time of a video key-frame is + // muxed into the same cluster. + if (has_video_ && tracks_.TrackIsAudio(frame->track_number()) && + !force_new_cluster_) { + Frame* const new_frame = new (std::nothrow) Frame(); + if (!new_frame || !new_frame->CopyFrom(*frame)) { + delete new_frame; + return false; + } + if (!QueueFrame(new_frame)) { + delete new_frame; + return false; + } + track_frames_written_[frame->track_number() - 1]++; + return true; + } + + if (!DoNewClusterProcessing(frame->track_number(), frame->timestamp(), + frame->is_key())) { + return false; + } + + if (cluster_list_size_ < 1) + return false; + + Cluster* const cluster = cluster_list_[cluster_list_size_ - 1]; + if (!cluster) + return false; + + // If the Frame is not a SimpleBlock, then set the reference_block_timestamp + // if it is not set already. 
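+ // (A non-key frame written as a BlockGroup is expected to carry a + // ReferenceBlock element, so fall back to the last timestamp written on + // this track. The copy below is needed because |frame| is const here.)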
+ bool frame_created = false; + if (!frame->CanBeSimpleBlock() && !frame->is_key() && + !frame->reference_block_timestamp_set()) { + Frame* const new_frame = new (std::nothrow) Frame(); + if (!new_frame || !new_frame->CopyFrom(*frame)) { + delete new_frame; + return false; + } + new_frame->set_reference_block_timestamp( + last_track_timestamp_[frame->track_number() - 1]); + frame = new_frame; + frame_created = true; + } + + if (!cluster->AddFrame(frame)) + return false; + + if (new_cuepoint_ && cues_track_ == frame->track_number()) { + if (!AddCuePoint(frame->timestamp(), cues_track_)) + return false; + } + + last_timestamp_ = frame->timestamp(); + last_track_timestamp_[frame->track_number() - 1] = frame->timestamp(); + last_block_duration_ = frame->duration(); + track_frames_written_[frame->track_number() - 1]++; + + if (frame_created) + delete frame; + return true; +} + +void Segment::OutputCues(bool output_cues) { output_cues_ = output_cues; } + +void Segment::AccurateClusterDuration(bool accurate_cluster_duration) { + accurate_cluster_duration_ = accurate_cluster_duration; +} + +void Segment::UseFixedSizeClusterTimecode(bool fixed_size_cluster_timecode) { + fixed_size_cluster_timecode_ = fixed_size_cluster_timecode; +} + +bool Segment::SetChunking(bool chunking, const char* filename) { + if (chunk_count_ > 0) + return false; + + if (chunking) { + if (!filename) + return false; + + // Check if we are being set to what is already set. + if (chunking_ && !strcmp(filename, chunking_base_name_)) + return true; + + const size_t name_length = strlen(filename) + 1; + char* const temp = new (std::nothrow) char[name_length]; // NOLINT + if (!temp) + return false; + +#ifdef _MSC_VER + strcpy_s(temp, name_length, filename); +#else + strcpy(temp, filename); +#endif + + delete[] chunking_base_name_; + chunking_base_name_ = temp; + + if (!UpdateChunkName("chk", &chunk_name_)) + return false; + + if (!chunk_writer_cluster_) { + chunk_writer_cluster_ = new (std::nothrow) MkvWriter(); // NOLINT + if (!chunk_writer_cluster_) + return false; + } + + if (!chunk_writer_cues_) { + chunk_writer_cues_ = new (std::nothrow) MkvWriter(); // NOLINT + if (!chunk_writer_cues_) + return false; + } + + if (!chunk_writer_header_) { + chunk_writer_header_ = new (std::nothrow) MkvWriter(); // NOLINT + if (!chunk_writer_header_) + return false; + } + + if (!chunk_writer_cluster_->Open(chunk_name_)) + return false; + + const size_t header_length = strlen(filename) + strlen(".hdr") + 1; + char* const header = new (std::nothrow) char[header_length]; // NOLINT + if (!header) + return false; + +#ifdef _MSC_VER + strcpy_s(header, header_length - strlen(".hdr"), chunking_base_name_); + strcat_s(header, header_length, ".hdr"); +#else + strcpy(header, chunking_base_name_); + strcat(header, ".hdr"); +#endif + if (!chunk_writer_header_->Open(header)) { + delete[] header; + return false; + } + + writer_cluster_ = chunk_writer_cluster_; + writer_cues_ = chunk_writer_cues_; + writer_header_ = chunk_writer_header_; + + delete[] header; + } + + chunking_ = chunking; + + return true; +} + +bool Segment::CuesTrack(uint64_t track_number) { + const Track* const track = GetTrackByNumber(track_number); + if (!track) + return false; + + cues_track_ = track_number; + return true; +} + +void Segment::ForceNewClusterOnNextFrame() { force_new_cluster_ = true; } + +Track* Segment::GetTrackByNumber(uint64_t track_number) const { + return tracks_.GetTrackByNumber(track_number); +} + +bool Segment::WriteSegmentHeader() { + UpdateDocTypeVersion(); + + 
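+ // The DocType is "webm" only when every track uses a codec from the WebM + // subset; otherwise fall back to "matroska" (see DocTypeIsWebm()).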
const char* const doc_type = + DocTypeIsWebm() ? kDocTypeWebm : kDocTypeMatroska; + if (!WriteEbmlHeader(writer_header_, doc_type_version_, doc_type)) + return false; + doc_type_version_written_ = doc_type_version_; + ebml_header_size_ = static_cast<int32_t>(writer_header_->Position()); + + // Write "unknown" (-1) as segment size value. If mode is kFile, Segment + // will overwrite this size value when the file is finalized. + if (WriteID(writer_header_, libwebm::kMkvSegment)) + return false; + + // Save for later. + size_position_ = writer_header_->Position(); + + // Write "unknown" (EBML coded -1) as segment size value. We need to write 8 + // bytes because if we are going to overwrite the segment size later we do + // not know how big our segment will be. + if (SerializeInt(writer_header_, kEbmlUnknownValue, 8)) + return false; + + payload_pos_ = writer_header_->Position(); + + if (mode_ == kFile && writer_header_->Seekable()) { + // Set the duration > 0.0 so SegmentInfo will write out the duration. When + // the muxer is done writing we will set the correct duration and have + // SegmentInfo update it. + segment_info_.set_duration(1.0); + + if (!seek_head_.Write(writer_header_)) + return false; + } + + if (!seek_head_.AddSeekEntry(libwebm::kMkvInfo, MaxOffset())) + return false; + if (!segment_info_.Write(writer_header_)) + return false; + + if (!seek_head_.AddSeekEntry(libwebm::kMkvTracks, MaxOffset())) + return false; + if (!tracks_.Write(writer_header_)) + return false; + + if (chapters_.Count() > 0) { + if (!seek_head_.AddSeekEntry(libwebm::kMkvChapters, MaxOffset())) + return false; + if (!chapters_.Write(writer_header_)) + return false; + } + + if (tags_.Count() > 0) { + if (!seek_head_.AddSeekEntry(libwebm::kMkvTags, MaxOffset())) + return false; + if (!tags_.Write(writer_header_)) + return false; + } + + if (chunking_ && (mode_ == kLive || !writer_header_->Seekable())) { + if (!chunk_writer_header_) + return false; + + chunk_writer_header_->Close(); + } + + header_written_ = true; + + return true; +} + +// Here we are testing whether to create a new cluster, given a frame +// having time frame_timestamp_ns. +// +int Segment::TestFrame(uint64_t track_number, uint64_t frame_timestamp_ns, + bool is_key) const { + if (force_new_cluster_) + return 1; + + // If no clusters have been created yet, then create a new cluster + // and write this frame immediately, in the new cluster. This path + // should only be followed once, the first time we attempt to write + // a frame. + + if (cluster_list_size_ <= 0) + return 1; + + // There exists at least one cluster. We must compare the frame to + // the last cluster, in order to determine whether the frame is + // written to the existing cluster, or whether a new cluster should + // be created. + + const uint64_t timecode_scale = segment_info_.timecode_scale(); + const uint64_t frame_timecode = frame_timestamp_ns / timecode_scale; + + const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1]; + const uint64_t last_cluster_timecode = last_cluster->timecode(); + + // For completeness we test for the case when the frame's timecode + // is less than the cluster's timecode. Although in principle that + // is allowed, this muxer doesn't actually write clusters like that, + // so this indicates a bug somewhere in our algorithm.
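+ // (A negative return propagates up: DoNewClusterProcessing() treats it + // as an error, and AddGenericFrame() then returns false.)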
+ + if (frame_timecode < last_cluster_timecode) // should never happen + return -1; + + // If the frame has a timestamp significantly larger than the last + // cluster (in Matroska, cluster-relative timestamps are serialized + // using a 16-bit signed integer), then we cannot write this frame + // to that cluster, and so we must create a new cluster. + + const int64_t delta_timecode = frame_timecode - last_cluster_timecode; + + if (delta_timecode > kMaxBlockTimecode) + return 2; + + // We decide to create a new cluster when we have a video keyframe. + // This will flush queued (audio) frames, and write the keyframe + // immediately, in the newly-created cluster. + + if (is_key && tracks_.TrackIsVideo(track_number)) + return 1; + + // Create a new cluster if we have accumulated too many frames + // already, where "too many" is defined as "the total time of frames + // in the cluster exceeds a threshold". + + const uint64_t delta_ns = delta_timecode * timecode_scale; + + if (max_cluster_duration_ > 0 && delta_ns >= max_cluster_duration_) + return 1; + + // This is similar to the case above, with the difference that a new + // cluster is created when the size of the current cluster exceeds a + // threshold. + + const uint64_t cluster_size = last_cluster->payload_size(); + + if (max_cluster_size_ > 0 && cluster_size >= max_cluster_size_) + return 1; + + // There's no need to create a new cluster, so emit this frame now. + + return 0; +} + +bool Segment::MakeNewCluster(uint64_t frame_timestamp_ns) { + const int32_t new_size = cluster_list_size_ + 1; + + if (new_size > cluster_list_capacity_) { + // Add more clusters. + const int32_t new_capacity = + (cluster_list_capacity_ <= 0) ? 1 : cluster_list_capacity_ * 2; + Cluster** const clusters = + new (std::nothrow) Cluster*[new_capacity]; // NOLINT + if (!clusters) + return false; + + for (int32_t i = 0; i < cluster_list_size_; ++i) { + clusters[i] = cluster_list_[i]; + } + + delete[] cluster_list_; + + cluster_list_ = clusters; + cluster_list_capacity_ = new_capacity; + } + + if (!WriteFramesLessThan(frame_timestamp_ns)) + return false; + + if (cluster_list_size_ > 0) { + // Update old cluster's size + Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1]; + + if (!old_cluster || !old_cluster->Finalize(true, frame_timestamp_ns)) + return false; + } + + if (output_cues_) + new_cuepoint_ = true; + + if (chunking_ && cluster_list_size_ > 0) { + chunk_writer_cluster_->Close(); + chunk_count_++; + + if (!UpdateChunkName("chk", &chunk_name_)) + return false; + if (!chunk_writer_cluster_->Open(chunk_name_)) + return false; + } + + const uint64_t timecode_scale = segment_info_.timecode_scale(); + const uint64_t frame_timecode = frame_timestamp_ns / timecode_scale; + + uint64_t cluster_timecode = frame_timecode; + + if (frames_size_ > 0) { + const Frame* const f = frames_[0]; // earliest queued frame + const uint64_t ns = f->timestamp(); + const uint64_t tc = ns / timecode_scale; + + if (tc < cluster_timecode) + cluster_timecode = tc; + } + + Cluster*& cluster = cluster_list_[cluster_list_size_]; + const int64_t offset = MaxOffset(); + cluster = new (std::nothrow) + Cluster(cluster_timecode, offset, segment_info_.timecode_scale(), + accurate_cluster_duration_, fixed_size_cluster_timecode_); + if (!cluster) + return false; + + if (!cluster->Init(writer_cluster_)) + return false; + + cluster_list_size_ = new_size; + return true; +} + +bool Segment::DoNewClusterProcessing(uint64_t track_number, + uint64_t frame_timestamp_ns, bool is_key) { + for 
(;;) { + // Based on the characteristics of the current frame and current + // cluster, decide whether to create a new cluster. + const int result = TestFrame(track_number, frame_timestamp_ns, is_key); + if (result < 0) // error + return false; + + // Always set force_new_cluster_ to false after TestFrame. + force_new_cluster_ = false; + + // A non-zero result means create a new cluster. + if (result > 0 && !MakeNewCluster(frame_timestamp_ns)) + return false; + + // Write queued (audio) frames. + const int frame_count = WriteFramesAll(); + if (frame_count < 0) // error + return false; + + // Write the current frame to the current cluster (if TestFrame + // returns 0) or to a newly created cluster (TestFrame returns 1). + if (result <= 1) + return true; + + // TestFrame returned 2, which means there was a large time + // difference between the cluster and the frame itself. Do the + // test again, comparing the frame to the new cluster. + } +} + +bool Segment::CheckHeaderInfo() { + if (!header_written_) { + if (!WriteSegmentHeader()) + return false; + + if (!seek_head_.AddSeekEntry(libwebm::kMkvCluster, MaxOffset())) + return false; + + if (output_cues_ && cues_track_ == 0) { + // Check for a video track + for (uint32_t i = 0; i < tracks_.track_entries_size(); ++i) { + const Track* const track = tracks_.GetTrackByIndex(i); + if (!track) + return false; + + if (tracks_.TrackIsVideo(track->number())) { + cues_track_ = track->number(); + break; + } + } + + // Set first track found + if (cues_track_ == 0) { + const Track* const track = tracks_.GetTrackByIndex(0); + if (!track) + return false; + + cues_track_ = track->number(); + } + } + } + return true; +} + +void Segment::UpdateDocTypeVersion() { + for (uint32_t index = 0; index < tracks_.track_entries_size(); ++index) { + const Track* track = tracks_.GetTrackByIndex(index); + if (track == NULL) + break; + if ((track->codec_delay() || track->seek_pre_roll()) && + doc_type_version_ < 4) { + doc_type_version_ = 4; + break; + } + } +} + +bool Segment::UpdateChunkName(const char* ext, char** name) const { + if (!name || !ext) + return false; + + char ext_chk[64]; +#ifdef _MSC_VER + sprintf_s(ext_chk, sizeof(ext_chk), "_%06d.%s", chunk_count_, ext); +#else + snprintf(ext_chk, sizeof(ext_chk), "_%06d.%s", chunk_count_, ext); +#endif + + const size_t length = strlen(chunking_base_name_) + strlen(ext_chk) + 1; + char* const str = new (std::nothrow) char[length]; // NOLINT + if (!str) + return false; + +#ifdef _MSC_VER + strcpy_s(str, length - strlen(ext_chk), chunking_base_name_); + strcat_s(str, length, ext_chk); +#else + strcpy(str, chunking_base_name_); + strcat(str, ext_chk); +#endif + + delete[] * name; + *name = str; + + return true; +} + +int64_t Segment::MaxOffset() { + if (!writer_header_) + return -1; + + int64_t offset = writer_header_->Position() - payload_pos_; + + if (chunking_) { + for (int32_t i = 0; i < cluster_list_size_; ++i) { + Cluster* const cluster = cluster_list_[i]; + offset += cluster->Size(); + } + + if (writer_cues_) + offset += writer_cues_->Position(); + } + + return offset; +} + +bool Segment::QueueFrame(Frame* frame) { + const int32_t new_size = frames_size_ + 1; + + if (new_size > frames_capacity_) { + // Add more frames. + const int32_t new_capacity = (!frames_capacity_) ? 
2 : frames_capacity_ * 2; + + if (new_capacity < 1) + return false; + + Frame** const frames = new (std::nothrow) Frame*[new_capacity]; // NOLINT + if (!frames) + return false; + + for (int32_t i = 0; i < frames_size_; ++i) { + frames[i] = frames_[i]; + } + + delete[] frames_; + frames_ = frames; + frames_capacity_ = new_capacity; + } + + frames_[frames_size_++] = frame; + + return true; +} + +int Segment::WriteFramesAll() { + if (frames_ == NULL) + return 0; + + if (cluster_list_size_ < 1) + return -1; + + Cluster* const cluster = cluster_list_[cluster_list_size_ - 1]; + + if (!cluster) + return -1; + + for (int32_t i = 0; i < frames_size_; ++i) { + Frame*& frame = frames_[i]; + // TODO(jzern/vigneshv): using Segment::AddGenericFrame here would limit the + // places where |doc_type_version_| needs to be updated. + if (frame->discard_padding() != 0) + doc_type_version_ = 4; + if (!cluster->AddFrame(frame)) + return -1; + + if (new_cuepoint_ && cues_track_ == frame->track_number()) { + if (!AddCuePoint(frame->timestamp(), cues_track_)) + return -1; + } + + if (frame->timestamp() > last_timestamp_) { + last_timestamp_ = frame->timestamp(); + last_track_timestamp_[frame->track_number() - 1] = frame->timestamp(); + } + + delete frame; + frame = NULL; + } + + const int result = frames_size_; + frames_size_ = 0; + + return result; +} + +bool Segment::WriteFramesLessThan(uint64_t timestamp) { + // Check |cluster_list_size_| to see if this is the first cluster. If it is + // the first cluster, the audio frames that are less than the first video + // timestamp will be written in a later step. + if (frames_size_ > 0 && cluster_list_size_ > 0) { + if (!frames_) + return false; + + Cluster* const cluster = cluster_list_[cluster_list_size_ - 1]; + if (!cluster) + return false; + + int32_t shift_left = 0; + + // TODO(fgalligan): Change this to use the durations of frames instead of + // the next frame's start time if the duration is accurate. + for (int32_t i = 1; i < frames_size_; ++i) { + const Frame* const frame_curr = frames_[i]; + + if (frame_curr->timestamp() > timestamp) + break; + + const Frame* const frame_prev = frames_[i - 1]; + if (frame_prev->discard_padding() != 0) + doc_type_version_ = 4; + if (!cluster->AddFrame(frame_prev)) + return false; + + if (new_cuepoint_ && cues_track_ == frame_prev->track_number()) { + if (!AddCuePoint(frame_prev->timestamp(), cues_track_)) + return false; + } + + ++shift_left; + if (frame_prev->timestamp() > last_timestamp_) { + last_timestamp_ = frame_prev->timestamp(); + last_track_timestamp_[frame_prev->track_number() - 1] = + frame_prev->timestamp(); + } + + delete frame_prev; + } + + if (shift_left > 0) { + if (shift_left >= frames_size_) + return false; + + const int32_t new_frames_size = frames_size_ - shift_left; + for (int32_t i = 0; i < new_frames_size; ++i) { + frames_[i] = frames_[i + shift_left]; + } + + frames_size_ = new_frames_size; + } + } + + return true; +} + +bool Segment::DocTypeIsWebm() const { + const int kNumCodecIds = 9; + + // TODO(vigneshv): Tweak .clang-format.
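+ // Codec IDs permitted by the WebM subset of Matroska; a track using any + // other codec ID forces the "matroska" DocType.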
+ const char* kWebmCodecIds[kNumCodecIds] = { + Tracks::kOpusCodecId, Tracks::kVorbisCodecId, + Tracks::kAv1CodecId, Tracks::kVp8CodecId, + Tracks::kVp9CodecId, Tracks::kWebVttCaptionsId, + Tracks::kWebVttDescriptionsId, Tracks::kWebVttMetadataId, + Tracks::kWebVttSubtitlesId}; + + const int num_tracks = static_cast<int>(tracks_.track_entries_size()); + for (int track_index = 0; track_index < num_tracks; ++track_index) { + const Track* const track = tracks_.GetTrackByIndex(track_index); + const std::string codec_id = track->codec_id(); + + bool id_is_webm = false; + for (int id_index = 0; id_index < kNumCodecIds; ++id_index) { + if (codec_id == kWebmCodecIds[id_index]) { + id_is_webm = true; + break; + } + } + + if (!id_is_webm) + return false; + } + + return true; +} + +} // namespace mkvmuxer diff --git a/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.h b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.h new file mode 100644 index 000000000..f2db37714 --- /dev/null +++ b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.h @@ -0,0 +1,1924 @@ +// Copyright (c) 2012 The WebM project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. + +#ifndef MKVMUXER_MKVMUXER_H_ +#define MKVMUXER_MKVMUXER_H_ + +#include <stdint.h> + +#include <cstddef> +#include <list> +#include <map> + +#include "common/webmids.h" +#include "mkvmuxer/mkvmuxertypes.h" + +// For a description of the WebM elements see +// http://www.webmproject.org/code/specs/container/. + +namespace mkvparser { +class IMkvReader; +} // namespace mkvparser + +namespace mkvmuxer { + +class MkvWriter; +class Segment; + +const uint64_t kMaxTrackNumber = 126; + +/////////////////////////////////////////////////////////////// +// Interface used by the mkvmuxer to write out the Mkv data. +class IMkvWriter { + public: + // Writes out |len| bytes of |buf|. Returns 0 on success. + virtual int32 Write(const void* buf, uint32 len) = 0; + + // Returns the offset of the output position from the beginning of the + // output. + virtual int64 Position() const = 0; + + // Set the current File position. Returns 0 on success. + virtual int32 Position(int64 position) = 0; + + // Returns true if the writer is seekable. + virtual bool Seekable() const = 0; + + // Element start notification. Called whenever an element identifier is about + // to be written to the stream. |element_id| is the element identifier, and + // |position| is the location in the WebM stream where the first octet of the + // element identifier will be written. + // Note: the |MkvId| enumeration in webmids.hpp defines element values. + virtual void ElementStartNotify(uint64 element_id, int64 position) = 0; + + protected: + IMkvWriter(); + virtual ~IMkvWriter(); + + private: + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(IMkvWriter); +}; + +// Writes out the EBML header for a WebM file, but allows caller to specify +// DocType. This function must be called before any other libwebm writing +// functions are called. +bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version, + const char* const doc_type); + +// Writes out the EBML header for a WebM file. This function must be called +// before any other libwebm writing functions are called.
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version); + +// Deprecated. Writes out EBML header with doc_type_version as +// kDefaultDocTypeVersion. Exists for backward compatibility. +bool WriteEbmlHeader(IMkvWriter* writer); + +// Copies in Chunk from source to destination between the given byte positions. +bool ChunkedCopy(mkvparser::IMkvReader* source, IMkvWriter* dst, int64_t start, + int64_t size); + +/////////////////////////////////////////////////////////////// +// Class to hold data that will be written to a block. +class Frame { + public: + Frame(); + ~Frame(); + + // Sets this frame's contents based on |frame|. Returns true on success. On + // failure, this frame's existing contents may be lost. + bool CopyFrom(const Frame& frame); + + // Copies |frame| data into |frame_|. Returns true on success. + bool Init(const uint8_t* frame, uint64_t length); + + // Copies |additional| data into |additional_|. Returns true on success. + bool AddAdditionalData(const uint8_t* additional, uint64_t length, + uint64_t add_id); + + // Returns true if the frame has valid parameters. + bool IsValid() const; + + // Returns true if the frame can be written as a SimpleBlock based on current + // parameters. + bool CanBeSimpleBlock() const; + + uint64_t add_id() const { return add_id_; } + const uint8_t* additional() const { return additional_; } + uint64_t additional_length() const { return additional_length_; } + void set_duration(uint64_t duration); + uint64_t duration() const { return duration_; } + bool duration_set() const { return duration_set_; } + const uint8_t* frame() const { return frame_; } + void set_is_key(bool key) { is_key_ = key; } + bool is_key() const { return is_key_; } + uint64_t length() const { return length_; } + void set_track_number(uint64_t track_number) { track_number_ = track_number; } + uint64_t track_number() const { return track_number_; } + void set_timestamp(uint64_t timestamp) { timestamp_ = timestamp; } + uint64_t timestamp() const { return timestamp_; } + void set_discard_padding(int64_t discard_padding) { + discard_padding_ = discard_padding; + } + int64_t discard_padding() const { return discard_padding_; } + void set_reference_block_timestamp(int64_t reference_block_timestamp); + int64_t reference_block_timestamp() const { + return reference_block_timestamp_; + } + bool reference_block_timestamp_set() const { + return reference_block_timestamp_set_; + } + + private: + // Id of the Additional data. + uint64_t add_id_; + + // Pointer to additional data. Owned by this class. + uint8_t* additional_; + + // Length of the additional data. + uint64_t additional_length_; + + // Duration of the frame in nanoseconds. + uint64_t duration_; + + // Flag indicating that |duration_| has been set. Setting duration causes the + // frame to be written out as a Block with BlockDuration instead of as a + // SimpleBlock. + bool duration_set_; + + // Pointer to the data. Owned by this class. + uint8_t* frame_; + + // Flag telling if the data should set the key flag of a block. + bool is_key_; + + // Length of the data. + uint64_t length_; + + // Mkv track number the data is associated with. + uint64_t track_number_; + + // Timestamp of the data in nanoseconds. + uint64_t timestamp_; + + // Discard padding for the frame. + int64_t discard_padding_; + + // Reference block timestamp. + int64_t reference_block_timestamp_; + + // Flag indicating if |reference_block_timestamp_| has been set.
+ bool reference_block_timestamp_set_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Frame); +}; + +/////////////////////////////////////////////////////////////// +// Class to hold one cue point in a Cues element. +class CuePoint { + public: + CuePoint(); + ~CuePoint(); + + // Returns the size in bytes for the entire CuePoint element. + uint64_t Size() const; + + // Output the CuePoint element to the writer. Returns true on success. + bool Write(IMkvWriter* writer) const; + + void set_time(uint64_t time) { time_ = time; } + uint64_t time() const { return time_; } + void set_track(uint64_t track) { track_ = track; } + uint64_t track() const { return track_; } + void set_cluster_pos(uint64_t cluster_pos) { cluster_pos_ = cluster_pos; } + uint64_t cluster_pos() const { return cluster_pos_; } + void set_block_number(uint64_t block_number) { block_number_ = block_number; } + uint64_t block_number() const { return block_number_; } + void set_output_block_number(bool output_block_number) { + output_block_number_ = output_block_number; + } + bool output_block_number() const { return output_block_number_; } + + private: + // Returns the size in bytes for the payload of the CuePoint element. + uint64_t PayloadSize() const; + + // Absolute timecode according to the segment time base. + uint64_t time_; + + // The Track element associated with the CuePoint. + uint64_t track_; + + // The position of the Cluster containing the Block. + uint64_t cluster_pos_; + + // Number of the Block within the Cluster, starting from 1. + uint64_t block_number_; + + // If true the muxer will write out the block number for the cue if the + // block number is different than the default of 1. Default is set to true. + bool output_block_number_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(CuePoint); +}; + +/////////////////////////////////////////////////////////////// +// Cues element. +class Cues { + public: + Cues(); + ~Cues(); + + // Adds a cue point to the Cues element. Returns true on success. + bool AddCue(CuePoint* cue); + + // Returns the cue point by index. Returns NULL if there is no cue point + // match. + CuePoint* GetCueByIndex(int32_t index) const; + + // Returns the total size of the Cues element + uint64_t Size(); + + // Output the Cues element to the writer. Returns true on success. + bool Write(IMkvWriter* writer) const; + + int32_t cue_entries_size() const { return cue_entries_size_; } + void set_output_block_number(bool output_block_number) { + output_block_number_ = output_block_number; + } + bool output_block_number() const { return output_block_number_; } + + private: + // Number of allocated elements in |cue_entries_|. + int32_t cue_entries_capacity_; + + // Number of CuePoints in |cue_entries_|. + int32_t cue_entries_size_; + + // CuePoint list. + CuePoint** cue_entries_; + + // If true the muxer will write out the block number for the cue if the + // block number is different than the default of 1. Default is set to true. + bool output_block_number_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Cues); +}; + +/////////////////////////////////////////////////////////////// +// ContentEncAESSettings element +class ContentEncAESSettings { + public: + enum { kCTR = 1 }; + + ContentEncAESSettings(); + ~ContentEncAESSettings() {} + + // Returns the size in bytes for the ContentEncAESSettings element. + uint64_t Size() const; + + // Writes out the ContentEncAESSettings element to |writer|. Returns true on + // success. 
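+ // (kCTR, declared above, is the only cipher mode this muxer supports.)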
+ bool Write(IMkvWriter* writer) const; + + uint64_t cipher_mode() const { return cipher_mode_; } + + private: + // Returns the size in bytes for the payload of the ContentEncAESSettings + // element. + uint64_t PayloadSize() const; + + // Sub elements + uint64_t cipher_mode_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncAESSettings); +}; + +/////////////////////////////////////////////////////////////// +// ContentEncoding element +// Elements used to describe if the track data has been encrypted or +// compressed with zlib or header stripping. +// Currently only whole frames can be encrypted with AES. This dictates that +// ContentEncodingOrder will be 0, ContentEncodingScope will be 1, +// ContentEncodingType will be 1, and ContentEncAlgo will be 5. +class ContentEncoding { + public: + ContentEncoding(); + ~ContentEncoding(); + + // Sets the content encryption id. Copies |length| bytes from |id| to + // |enc_key_id_|. Returns true on success. + bool SetEncryptionID(const uint8_t* id, uint64_t length); + + // Returns the size in bytes for the ContentEncoding element. + uint64_t Size() const; + + // Writes out the ContentEncoding element to |writer|. Returns true on + // success. + bool Write(IMkvWriter* writer) const; + + uint64_t enc_algo() const { return enc_algo_; } + uint64_t encoding_order() const { return encoding_order_; } + uint64_t encoding_scope() const { return encoding_scope_; } + uint64_t encoding_type() const { return encoding_type_; } + ContentEncAESSettings* enc_aes_settings() { return &enc_aes_settings_; } + + private: + // Returns the size in bytes for the encoding elements. + uint64_t EncodingSize(uint64_t compression_size, + uint64_t encryption_size) const; + + // Returns the size in bytes for the encryption elements. + uint64_t EncryptionSize() const; + + // Track element names + uint64_t enc_algo_; + uint8_t* enc_key_id_; + uint64_t encoding_order_; + uint64_t encoding_scope_; + uint64_t encoding_type_; + + // ContentEncAESSettings element. + ContentEncAESSettings enc_aes_settings_; + + // Size of the ContentEncKeyID data in bytes. + uint64_t enc_key_id_length_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncoding); +}; + +/////////////////////////////////////////////////////////////// +// Colour element. +class PrimaryChromaticity { + public: + static const float kChromaticityMin; + static const float kChromaticityMax; + + PrimaryChromaticity(float x_val, float y_val) : x_(x_val), y_(y_val) {} + PrimaryChromaticity() : x_(0), y_(0) {} + ~PrimaryChromaticity() {} + + // Returns sum of |x_id| and |y_id| element id sizes and payload sizes. + uint64_t PrimaryChromaticitySize(libwebm::MkvId x_id, + libwebm::MkvId y_id) const; + bool Valid() const; + bool Write(IMkvWriter* writer, libwebm::MkvId x_id, + libwebm::MkvId y_id) const; + + float x() const { return x_; } + void set_x(float new_x) { x_ = new_x; } + float y() const { return y_; } + void set_y(float new_y) { y_ = new_y; } + + private: + float x_; + float y_; +}; + +class MasteringMetadata { + public: + static const float kValueNotPresent; + static const float kMinLuminance; + static const float kMinLuminanceMax; + static const float kMaxLuminanceMax; + + MasteringMetadata() + : luminance_max_(kValueNotPresent), + luminance_min_(kValueNotPresent), + r_(NULL), + g_(NULL), + b_(NULL), + white_point_(NULL) {} + ~MasteringMetadata() { + delete r_; + delete g_; + delete b_; + delete white_point_; + } + + // Returns total size of the MasteringMetadata element.
+ uint64_t MasteringMetadataSize() const; + bool Valid() const; + bool Write(IMkvWriter* writer) const; + + // Copies non-null chromaticity. + bool SetChromaticity(const PrimaryChromaticity* r, + const PrimaryChromaticity* g, + const PrimaryChromaticity* b, + const PrimaryChromaticity* white_point); + const PrimaryChromaticity* r() const { return r_; } + const PrimaryChromaticity* g() const { return g_; } + const PrimaryChromaticity* b() const { return b_; } + const PrimaryChromaticity* white_point() const { return white_point_; } + + float luminance_max() const { return luminance_max_; } + void set_luminance_max(float luminance_max) { + luminance_max_ = luminance_max; + } + float luminance_min() const { return luminance_min_; } + void set_luminance_min(float luminance_min) { + luminance_min_ = luminance_min; + } + + private: + // Returns size of MasteringMetadata child elements. + uint64_t PayloadSize() const; + + float luminance_max_; + float luminance_min_; + PrimaryChromaticity* r_; + PrimaryChromaticity* g_; + PrimaryChromaticity* b_; + PrimaryChromaticity* white_point_; +}; + +class Colour { + public: + enum MatrixCoefficients { + kGbr = 0, + kBt709 = 1, + kUnspecifiedMc = 2, + kReserved = 3, + kFcc = 4, + kBt470bg = 5, + kSmpte170MMc = 6, + kSmpte240MMc = 7, + kYcocg = 8, + kBt2020NonConstantLuminance = 9, + kBt2020ConstantLuminance = 10, + }; + enum ChromaSitingHorz { + kUnspecifiedCsh = 0, + kLeftCollocated = 1, + kHalfCsh = 2, + }; + enum ChromaSitingVert { + kUnspecifiedCsv = 0, + kTopCollocated = 1, + kHalfCsv = 2, + }; + enum Range { + kUnspecifiedCr = 0, + kBroadcastRange = 1, + kFullRange = 2, + kMcTcDefined = 3, // Defined by MatrixCoefficients/TransferCharacteristics. + }; + enum TransferCharacteristics { + kIturBt709Tc = 1, + kUnspecifiedTc = 2, + kReservedTc = 3, + kGamma22Curve = 4, + kGamma28Curve = 5, + kSmpte170MTc = 6, + kSmpte240MTc = 7, + kLinear = 8, + kLog = 9, + kLogSqrt = 10, + kIec6196624 = 11, + kIturBt1361ExtendedColourGamut = 12, + kIec6196621 = 13, + kIturBt202010bit = 14, + kIturBt202012bit = 15, + kSmpteSt2084 = 16, + kSmpteSt4281Tc = 17, + kAribStdB67Hlg = 18, + }; + enum Primaries { + kReservedP0 = 0, + kIturBt709P = 1, + kUnspecifiedP = 2, + kReservedP3 = 3, + kIturBt470M = 4, + kIturBt470Bg = 5, + kSmpte170MP = 6, + kSmpte240MP = 7, + kFilm = 8, + kIturBt2020 = 9, + kSmpteSt4281P = 10, + kJedecP22Phosphors = 22, + }; + static const uint64_t kValueNotPresent; + Colour() + : matrix_coefficients_(kValueNotPresent), + bits_per_channel_(kValueNotPresent), + chroma_subsampling_horz_(kValueNotPresent), + chroma_subsampling_vert_(kValueNotPresent), + cb_subsampling_horz_(kValueNotPresent), + cb_subsampling_vert_(kValueNotPresent), + chroma_siting_horz_(kValueNotPresent), + chroma_siting_vert_(kValueNotPresent), + range_(kValueNotPresent), + transfer_characteristics_(kValueNotPresent), + primaries_(kValueNotPresent), + max_cll_(kValueNotPresent), + max_fall_(kValueNotPresent), + mastering_metadata_(NULL) {} + ~Colour() { delete mastering_metadata_; } + + // Returns total size of the Colour element. + uint64_t ColourSize() const; + bool Valid() const; + bool Write(IMkvWriter* writer) const; + + // Deep copies |mastering_metadata|. 
+ bool SetMasteringMetadata(const MasteringMetadata& mastering_metadata); + + const MasteringMetadata* mastering_metadata() const { + return mastering_metadata_; + } + + uint64_t matrix_coefficients() const { return matrix_coefficients_; } + void set_matrix_coefficients(uint64_t matrix_coefficients) { + matrix_coefficients_ = matrix_coefficients; + } + uint64_t bits_per_channel() const { return bits_per_channel_; } + void set_bits_per_channel(uint64_t bits_per_channel) { + bits_per_channel_ = bits_per_channel; + } + uint64_t chroma_subsampling_horz() const { return chroma_subsampling_horz_; } + void set_chroma_subsampling_horz(uint64_t chroma_subsampling_horz) { + chroma_subsampling_horz_ = chroma_subsampling_horz; + } + uint64_t chroma_subsampling_vert() const { return chroma_subsampling_vert_; } + void set_chroma_subsampling_vert(uint64_t chroma_subsampling_vert) { + chroma_subsampling_vert_ = chroma_subsampling_vert; + } + uint64_t cb_subsampling_horz() const { return cb_subsampling_horz_; } + void set_cb_subsampling_horz(uint64_t cb_subsampling_horz) { + cb_subsampling_horz_ = cb_subsampling_horz; + } + uint64_t cb_subsampling_vert() const { return cb_subsampling_vert_; } + void set_cb_subsampling_vert(uint64_t cb_subsampling_vert) { + cb_subsampling_vert_ = cb_subsampling_vert; + } + uint64_t chroma_siting_horz() const { return chroma_siting_horz_; } + void set_chroma_siting_horz(uint64_t chroma_siting_horz) { + chroma_siting_horz_ = chroma_siting_horz; + } + uint64_t chroma_siting_vert() const { return chroma_siting_vert_; } + void set_chroma_siting_vert(uint64_t chroma_siting_vert) { + chroma_siting_vert_ = chroma_siting_vert; + } + uint64_t range() const { return range_; } + void set_range(uint64_t range) { range_ = range; } + uint64_t transfer_characteristics() const { + return transfer_characteristics_; + } + void set_transfer_characteristics(uint64_t transfer_characteristics) { + transfer_characteristics_ = transfer_characteristics; + } + uint64_t primaries() const { return primaries_; } + void set_primaries(uint64_t primaries) { primaries_ = primaries; } + uint64_t max_cll() const { return max_cll_; } + void set_max_cll(uint64_t max_cll) { max_cll_ = max_cll; } + uint64_t max_fall() const { return max_fall_; } + void set_max_fall(uint64_t max_fall) { max_fall_ = max_fall; } + + private: + // Returns size of Colour child elements. + uint64_t PayloadSize() const; + + uint64_t matrix_coefficients_; + uint64_t bits_per_channel_; + uint64_t chroma_subsampling_horz_; + uint64_t chroma_subsampling_vert_; + uint64_t cb_subsampling_horz_; + uint64_t cb_subsampling_vert_; + uint64_t chroma_siting_horz_; + uint64_t chroma_siting_vert_; + uint64_t range_; + uint64_t transfer_characteristics_; + uint64_t primaries_; + uint64_t max_cll_; + uint64_t max_fall_; + + MasteringMetadata* mastering_metadata_; +}; + +/////////////////////////////////////////////////////////////// +// Projection element. 
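+ // Holds spherical/360 video metadata: the projection type, an optional + // codec-specific ProjectionPrivate payload, and the pose yaw/pitch/roll.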
+class Projection { + public: + enum ProjectionType { + kTypeNotPresent = -1, + kRectangular = 0, + kEquirectangular = 1, + kCubeMap = 2, + kMesh = 3, + }; + static const uint64_t kValueNotPresent; + Projection() + : type_(kRectangular), + pose_yaw_(0.0), + pose_pitch_(0.0), + pose_roll_(0.0), + private_data_(NULL), + private_data_length_(0) {} + ~Projection() { delete[] private_data_; } + + uint64_t ProjectionSize() const; + bool Write(IMkvWriter* writer) const; + + bool SetProjectionPrivate(const uint8_t* private_data, + uint64_t private_data_length); + + ProjectionType type() const { return type_; } + void set_type(ProjectionType type) { type_ = type; } + float pose_yaw() const { return pose_yaw_; } + void set_pose_yaw(float pose_yaw) { pose_yaw_ = pose_yaw; } + float pose_pitch() const { return pose_pitch_; } + void set_pose_pitch(float pose_pitch) { pose_pitch_ = pose_pitch; } + float pose_roll() const { return pose_roll_; } + void set_pose_roll(float pose_roll) { pose_roll_ = pose_roll; } + uint8_t* private_data() const { return private_data_; } + uint64_t private_data_length() const { return private_data_length_; } + + private: + // Returns size of VideoProjection child elements. + uint64_t PayloadSize() const; + + ProjectionType type_; + float pose_yaw_; + float pose_pitch_; + float pose_roll_; + uint8_t* private_data_; + uint64_t private_data_length_; +}; + +/////////////////////////////////////////////////////////////// +// Track element. +class Track { + public: + // The |seed| parameter is used to synthesize a UID for the track. + explicit Track(unsigned int* seed); + virtual ~Track(); + + // Adds a ContentEncoding element to the Track. Returns true on success. + virtual bool AddContentEncoding(); + + // Returns the ContentEncoding by index. Returns NULL if there is no + // ContentEncoding match. + ContentEncoding* GetContentEncodingByIndex(uint32_t index) const; + + // Returns the size in bytes for the payload of the Track element. + virtual uint64_t PayloadSize() const; + + // Returns the size in bytes of the Track element. + virtual uint64_t Size() const; + + // Output the Track element to the writer. Returns true on success. + virtual bool Write(IMkvWriter* writer) const; + + // Sets the CodecPrivate element of the Track element. Copies |length| + // bytes from |codec_private| to |codec_private_|. Returns true on success. 
+ bool SetCodecPrivate(const uint8_t* codec_private, uint64_t length); + + void set_codec_id(const char* codec_id); + const char* codec_id() const { return codec_id_; } + const uint8_t* codec_private() const { return codec_private_; } + void set_language(const char* language); + const char* language() const { return language_; } + void set_max_block_additional_id(uint64_t max_block_additional_id) { + max_block_additional_id_ = max_block_additional_id; + } + uint64_t max_block_additional_id() const { return max_block_additional_id_; } + void set_name(const char* name); + const char* name() const { return name_; } + void set_number(uint64_t number) { number_ = number; } + uint64_t number() const { return number_; } + void set_type(uint64_t type) { type_ = type; } + uint64_t type() const { return type_; } + void set_uid(uint64_t uid) { uid_ = uid; } + uint64_t uid() const { return uid_; } + void set_codec_delay(uint64_t codec_delay) { codec_delay_ = codec_delay; } + uint64_t codec_delay() const { return codec_delay_; } + void set_seek_pre_roll(uint64_t seek_pre_roll) { + seek_pre_roll_ = seek_pre_roll; + } + uint64_t seek_pre_roll() const { return seek_pre_roll_; } + void set_default_duration(uint64_t default_duration) { + default_duration_ = default_duration; + } + uint64_t default_duration() const { return default_duration_; } + + uint64_t codec_private_length() const { return codec_private_length_; } + uint32_t content_encoding_entries_size() const { + return content_encoding_entries_size_; + } + + private: + // Track element names. + char* codec_id_; + uint8_t* codec_private_; + char* language_; + uint64_t max_block_additional_id_; + char* name_; + uint64_t number_; + uint64_t type_; + uint64_t uid_; + uint64_t codec_delay_; + uint64_t seek_pre_roll_; + uint64_t default_duration_; + + // Size of the CodecPrivate data in bytes. + uint64_t codec_private_length_; + + // ContentEncoding element list. + ContentEncoding** content_encoding_entries_; + + // Number of ContentEncoding elements added. + uint32_t content_encoding_entries_size_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Track); +}; + +/////////////////////////////////////////////////////////////// +// Track that has video specific elements. +class VideoTrack : public Track { + public: + // Supported modes for stereo 3D. + enum StereoMode { + kMono = 0, + kSideBySideLeftIsFirst = 1, + kTopBottomRightIsFirst = 2, + kTopBottomLeftIsFirst = 3, + kSideBySideRightIsFirst = 11 + }; + + enum AlphaMode { kNoAlpha = 0, kAlpha = 1 }; + + // The |seed| parameter is used to synthesize a UID for the track. + explicit VideoTrack(unsigned int* seed); + virtual ~VideoTrack(); + + // Returns the size in bytes for the payload of the Track element plus the + // video specific elements. + virtual uint64_t PayloadSize() const; + + // Output the VideoTrack element to the writer. Returns true on success. + virtual bool Write(IMkvWriter* writer) const; + + // Sets the video's stereo mode. Returns true on success. + bool SetStereoMode(uint64_t stereo_mode); + + // Sets the video's alpha mode. Returns true on success. 
+ bool SetAlphaMode(uint64_t alpha_mode); + + void set_display_height(uint64_t height) { display_height_ = height; } + uint64_t display_height() const { return display_height_; } + void set_display_width(uint64_t width) { display_width_ = width; } + uint64_t display_width() const { return display_width_; } + void set_pixel_height(uint64_t height) { pixel_height_ = height; } + uint64_t pixel_height() const { return pixel_height_; } + void set_pixel_width(uint64_t width) { pixel_width_ = width; } + uint64_t pixel_width() const { return pixel_width_; } + + void set_crop_left(uint64_t crop_left) { crop_left_ = crop_left; } + uint64_t crop_left() const { return crop_left_; } + void set_crop_right(uint64_t crop_right) { crop_right_ = crop_right; } + uint64_t crop_right() const { return crop_right_; } + void set_crop_top(uint64_t crop_top) { crop_top_ = crop_top; } + uint64_t crop_top() const { return crop_top_; } + void set_crop_bottom(uint64_t crop_bottom) { crop_bottom_ = crop_bottom; } + uint64_t crop_bottom() const { return crop_bottom_; } + + void set_frame_rate(double frame_rate) { frame_rate_ = frame_rate; } + double frame_rate() const { return frame_rate_; } + void set_height(uint64_t height) { height_ = height; } + uint64_t height() const { return height_; } + uint64_t stereo_mode() { return stereo_mode_; } + uint64_t alpha_mode() { return alpha_mode_; } + void set_width(uint64_t width) { width_ = width; } + uint64_t width() const { return width_; } + void set_colour_space(const char* colour_space); + const char* colour_space() const { return colour_space_; } + + Colour* colour() { return colour_; } + + // Deep copies |colour|. + bool SetColour(const Colour& colour); + + Projection* projection() { return projection_; } + + // Deep copies |projection|. + bool SetProjection(const Projection& projection); + + private: + // Returns the size in bytes of the Video element. + uint64_t VideoPayloadSize() const; + + // Video track element names. + uint64_t display_height_; + uint64_t display_width_; + uint64_t pixel_height_; + uint64_t pixel_width_; + uint64_t crop_left_; + uint64_t crop_right_; + uint64_t crop_top_; + uint64_t crop_bottom_; + double frame_rate_; + uint64_t height_; + uint64_t stereo_mode_; + uint64_t alpha_mode_; + uint64_t width_; + char* colour_space_; + + Colour* colour_; + Projection* projection_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(VideoTrack); +}; + +/////////////////////////////////////////////////////////////// +// Track that has audio specific elements. +class AudioTrack : public Track { + public: + // The |seed| parameter is used to synthesize a UID for the track. + explicit AudioTrack(unsigned int* seed); + virtual ~AudioTrack(); + + // Returns the size in bytes for the payload of the Track element plus the + // audio specific elements. + virtual uint64_t PayloadSize() const; + + // Output the AudioTrack element to the writer. Returns true on success. + virtual bool Write(IMkvWriter* writer) const; + + void set_bit_depth(uint64_t bit_depth) { bit_depth_ = bit_depth; } + uint64_t bit_depth() const { return bit_depth_; } + void set_channels(uint64_t channels) { channels_ = channels; } + uint64_t channels() const { return channels_; } + void set_sample_rate(double sample_rate) { sample_rate_ = sample_rate; } + double sample_rate() const { return sample_rate_; } + + private: + // Audio track element names. 
+ uint64_t bit_depth_; + uint64_t channels_; + double sample_rate_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(AudioTrack); +}; + +/////////////////////////////////////////////////////////////// +// Tracks element +class Tracks { + public: + // Audio and video type defined by the Matroska specs. + enum { kVideo = 0x1, kAudio = 0x2 }; + + static const char kOpusCodecId[]; + static const char kVorbisCodecId[]; + static const char kAv1CodecId[]; + static const char kVp8CodecId[]; + static const char kVp9CodecId[]; + static const char kWebVttCaptionsId[]; + static const char kWebVttDescriptionsId[]; + static const char kWebVttMetadataId[]; + static const char kWebVttSubtitlesId[]; + + Tracks(); + ~Tracks(); + + // Adds a Track element to the Tracks object. |track| will be owned and + // deleted by the Tracks object. Returns true on success. |number| is the + // number to use for the track. |number| must be >= 0. If |number| == 0 + // then the muxer will decide on the track number. + bool AddTrack(Track* track, int32_t number); + + // Returns the track by index. Returns NULL if there is no track match. + const Track* GetTrackByIndex(uint32_t idx) const; + + // Search the Tracks and return the track that matches |tn|. Returns NULL + // if there is no track match. + Track* GetTrackByNumber(uint64_t track_number) const; + + // Returns true if the track number is an audio track. + bool TrackIsAudio(uint64_t track_number) const; + + // Returns true if the track number is a video track. + bool TrackIsVideo(uint64_t track_number) const; + + // Output the Tracks element to the writer. Returns true on success. + bool Write(IMkvWriter* writer) const; + + uint32_t track_entries_size() const { return track_entries_size_; } + + private: + // Track element list. + Track** track_entries_; + + // Number of Track elements added. + uint32_t track_entries_size_; + + // Whether or not Tracks element has already been written via IMkvWriter. + mutable bool wrote_tracks_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tracks); +}; + +/////////////////////////////////////////////////////////////// +// Chapter element +// +class Chapter { + public: + // Set the identifier for this chapter. (This corresponds to the + // Cue Identifier line in WebVTT.) + // TODO(matthewjheaney): the actual serialization of this item in + // MKV is pending. + bool set_id(const char* id); + + // Converts the nanosecond start and stop times of this chapter to + // their corresponding timecode values, and stores them that way. + void set_time(const Segment& segment, uint64_t start_time_ns, + uint64_t end_time_ns); + + // Sets the uid for this chapter. Primarily used to enable + // deterministic output from the muxer. + void set_uid(const uint64_t uid) { uid_ = uid; } + + // Add a title string to this chapter, per the semantics described + // here: + // http://www.matroska.org/technical/specs/index.html + // + // The title ("chapter string") is a UTF-8 string. + // + // The language has ISO 639-2 representation, described here: + // http://www.loc.gov/standards/iso639-2/englangn.html + // http://www.loc.gov/standards/iso639-2/php/English_list.php + // If you specify NULL as the language value, this implies + // English ("eng"). + // + // The country value corresponds to the codes listed here: + // http://www.iana.org/domains/root/db/ + // + // The function returns false if the string could not be allocated. 
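+ // Hypothetical usage sketch, using only names declared in this header: + // Chapter* const chapter = segment.AddChapter(); + // chapter->set_id("intro"); + // chapter->set_time(segment, 0, 5000000000); // 0 ns to 5e9 ns + // chapter->add_string("Introduction", "eng", "us");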
+  bool add_string(const char* title, const char* language, const char* country);
+
+ private:
+  friend class Chapters;
+
+  // For storage of chapter titles that differ by language.
+  class Display {
+   public:
+    // Establish representation invariant for new Display object.
+    void Init();
+
+    // Reclaim resources, in anticipation of destruction.
+    void Clear();
+
+    // Copies the title to the |title_| member. Returns false on
+    // error.
+    bool set_title(const char* title);
+
+    // Copies the language to the |language_| member. Returns false
+    // on error.
+    bool set_language(const char* language);
+
+    // Copies the country to the |country_| member. Returns false on
+    // error.
+    bool set_country(const char* country);
+
+    // If |writer| is non-NULL, serialize the Display sub-element of
+    // the Atom into the stream. Returns the Display element size on
+    // success, 0 if error.
+    uint64_t WriteDisplay(IMkvWriter* writer) const;
+
+   private:
+    char* title_;
+    char* language_;
+    char* country_;
+  };
+
+  Chapter();
+  ~Chapter();
+
+  // Establish the representation invariant for a newly-created
+  // Chapter object. The |seed| parameter is used to create the UID
+  // for this chapter atom.
+  void Init(unsigned int* seed);
+
+  // Copies this Chapter object to a different one. This is used when
+  // expanding a plain array of Chapter objects (see Chapters).
+  void ShallowCopy(Chapter* dst) const;
+
+  // Reclaim resources used by this Chapter object, pending its
+  // destruction.
+  void Clear();
+
+  // If there is no storage remaining on the |displays_| array for a
+  // new display object, creates a new, longer array and copies the
+  // existing Display objects to the new array. Returns false if the
+  // array cannot be expanded.
+  bool ExpandDisplaysArray();
+
+  // If |writer| is non-NULL, serialize the Atom sub-element into the
+  // stream. Returns the total size of the element on success, 0 if
+  // error.
+  uint64_t WriteAtom(IMkvWriter* writer) const;
+
+  // The string identifier for this chapter (corresponds to WebVTT cue
+  // identifier).
+  char* id_;
+
+  // Start timecode of the chapter.
+  uint64_t start_timecode_;
+
+  // Stop timecode of the chapter.
+  uint64_t end_timecode_;
+
+  // The binary identifier for this chapter.
+  uint64_t uid_;
+
+  // The Atom element can contain multiple Display sub-elements, as
+  // the same logical title can be rendered in different languages.
+  Display* displays_;
+
+  // The physical length (total size) of the |displays_| array.
+  int displays_size_;
+
+  // The logical length (number of active elements) on the |displays_|
+  // array.
+  int displays_count_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Chapter);
+};
+
+///////////////////////////////////////////////////////////////
+// Chapters element
+//
+class Chapters {
+ public:
+  Chapters();
+  ~Chapters();
+
+  Chapter* AddChapter(unsigned int* seed);
+
+  // Returns the number of chapters that have been added.
+  int Count() const;
+
+  // Output the Chapters element to the writer. Returns true on success.
+  bool Write(IMkvWriter* writer) const;
+
+ private:
+  // Expands the chapters_ array if there is not enough space to contain
+  // another chapter object. Returns true on success.
+  bool ExpandChaptersArray();
+
+  // If |writer| is non-NULL, serialize the Edition sub-element of the
+  // Chapters element into the stream. Returns the Edition element
+  // size on success, 0 if error.
+  uint64_t WriteEdition(IMkvWriter* writer) const;
+
+  // Total length of the chapters_ array.
+  int chapters_size_;
+
+  // Number of active chapters on the chapters_ array.
+  int chapters_count_;
+
+  // Array for storage of chapter objects.
+  Chapter* chapters_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Chapters);
+};
+
+///////////////////////////////////////////////////////////////
+// Tag element
+//
+class Tag {
+ public:
+  bool add_simple_tag(const char* tag_name, const char* tag_string);
+
+ private:
+  // Tags calls Clear and the destructor of Tag
+  friend class Tags;
+
+  // For storage of simple tags
+  class SimpleTag {
+   public:
+    // Establish representation invariant for new SimpleTag object.
+    void Init();
+
+    // Reclaim resources, in anticipation of destruction.
+    void Clear();
+
+    // Copies the tag name to the |tag_name_| member. Returns false on
+    // error.
+    bool set_tag_name(const char* tag_name);
+
+    // Copies the tag string to the |tag_string_| member. Returns false
+    // on error.
+    bool set_tag_string(const char* tag_string);
+
+    // If |writer| is non-NULL, serialize the SimpleTag sub-element of
+    // the Tag into the stream. Returns the SimpleTag element size on
+    // success, 0 if error.
+    uint64_t Write(IMkvWriter* writer) const;
+
+   private:
+    char* tag_name_;
+    char* tag_string_;
+  };
+
+  Tag();
+  ~Tag();
+
+  // Copies this Tag object to a different one. This is used when
+  // expanding a plain array of Tag objects (see Tags).
+  void ShallowCopy(Tag* dst) const;
+
+  // Reclaim resources used by this Tag object, pending its
+  // destruction.
+  void Clear();
+
+  // If there is no storage remaining on the |simple_tags_| array for a
+  // new SimpleTag object, creates a new, longer array and copies the
+  // existing SimpleTag objects to the new array. Returns false if the
+  // array cannot be expanded.
+  bool ExpandSimpleTagsArray();
+
+  // If |writer| is non-NULL, serialize the Tag sub-element into the
+  // stream. Returns the total size of the element on success, 0 if
+  // error.
+  uint64_t Write(IMkvWriter* writer) const;
+
+  // The Tag element can contain multiple SimpleTag sub-elements
+  SimpleTag* simple_tags_;
+
+  // The physical length (total size) of the |simple_tags_| array.
+  int simple_tags_size_;
+
+  // The logical length (number of active elements) on the |simple_tags_|
+  // array.
+  int simple_tags_count_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tag);
+};
+
+///////////////////////////////////////////////////////////////
+// Tags element
+//
+class Tags {
+ public:
+  Tags();
+  ~Tags();
+
+  Tag* AddTag();
+
+  // Returns the number of tags that have been added.
+  int Count() const;
+
+  // Output the Tags element to the writer. Returns true on success.
+  bool Write(IMkvWriter* writer) const;
+
+ private:
+  // Expands the tags_ array if there is not enough space to contain
+  // another tag object. Returns true on success.
+  bool ExpandTagsArray();
+
+  // Total length of the tags_ array.
+  int tags_size_;
+
+  // Number of active tags on the tags_ array.
+  int tags_count_;
+
+  // Array for storage of tag objects.
+  Tag* tags_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tags);
+};
+
+///////////////////////////////////////////////////////////////
+// Cluster element
+//
+// Notes:
+//  |Init| must be called before any other method in this class.
+class Cluster {
+ public:
+  // |timecode| is the absolute timecode of the cluster. |cues_pos| is the
+  // position for the cluster within the segment that should be written in
+  // the cues element. |timecode_scale| is the timecode scale of the segment.
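+  //
+  // A hedged construction sketch (values are illustrative; 1000000 is the
+  // default nanosecond timecode scale, and clusters are normally created by
+  // Segment rather than instantiated directly):
+  //   Cluster cluster(0 /* timecode */, 0 /* cues_pos */, 1000000);
+  //   cluster.Init(&writer);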
+  Cluster(uint64_t timecode, int64_t cues_pos, uint64_t timecode_scale,
+          bool write_last_frame_with_duration = false,
+          bool fixed_size_timecode = false);
+  ~Cluster();
+
+  bool Init(IMkvWriter* ptr_writer);
+
+  // Adds a frame to be output in the file. The frame is written out through
+  // |writer_| if successful. Returns true on success.
+  bool AddFrame(const Frame* frame);
+
+  // Adds a frame to be output in the file. The frame is written out through
+  // |writer_| if successful. Returns true on success.
+  // Inputs:
+  //   data: Pointer to the data
+  //   length: Length of the data
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions. The range of allowed values is [1, 126].
+  //   timecode:     Absolute (not relative to cluster) timestamp of the
+  //                 frame, expressed in timecode units.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrame(const uint8_t* data, uint64_t length, uint64_t track_number,
+                uint64_t timecode,  // timecode units (absolute)
+                bool is_key);
+
+  // Adds a frame to be output in the file. The frame is written out through
+  // |writer_| if successful. Returns true on success.
+  // Inputs:
+  //   data: Pointer to the data
+  //   length: Length of the data
+  //   additional: Pointer to the additional data
+  //   additional_length: Length of the additional data
+  //   add_id: Value of BlockAddID element
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions. The range of allowed values is [1, 126].
+  //   abs_timecode: Absolute (not relative to cluster) timestamp of the
+  //                 frame, expressed in timecode units.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrameWithAdditional(const uint8_t* data, uint64_t length,
+                              const uint8_t* additional,
+                              uint64_t additional_length, uint64_t add_id,
+                              uint64_t track_number, uint64_t abs_timecode,
+                              bool is_key);
+
+  // Adds a frame to be output in the file. The frame is written out through
+  // |writer_| if successful. Returns true on success.
+  // Inputs:
+  //   data: Pointer to the data.
+  //   length: Length of the data.
+  //   discard_padding: DiscardPadding element value.
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions. The range of allowed values is [1, 126].
+  //   abs_timecode: Absolute (not relative to cluster) timestamp of the
+  //                 frame, expressed in timecode units.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length,
+                                  int64_t discard_padding,
+                                  uint64_t track_number, uint64_t abs_timecode,
+                                  bool is_key);
+
+  // Writes a frame of metadata to the output medium; returns true on
+  // success.
+  // Inputs:
+  //   data: Pointer to the data
+  //   length: Length of the data
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions. The range of allowed values is [1, 126].
+  //   timecode:     Absolute (not relative to cluster) timestamp of the
+  //                 metadata frame, expressed in timecode units.
+  //   duration:     Duration of metadata frame, in timecode units.
+  //
+  // The metadata frame is written as a block group, with a duration
+  // sub-element but no reference time sub-elements (indicating that
+  // it is considered a keyframe, per Matroska semantics).
+  bool AddMetadata(const uint8_t* data, uint64_t length, uint64_t track_number,
+                   uint64_t timecode, uint64_t duration);
+
+  // Increments the size of the cluster's data in bytes.
+  void AddPayloadSize(uint64_t size);
+
+  // Closes the cluster so no more data can be written to it. Will update the
+  // cluster's size if |writer_| is seekable. Returns true on success. This
+  // variant of Finalize() fails when |write_last_frame_with_duration_| is set
+  // to true.
+  bool Finalize();
+
+  // Closes the cluster so no more data can be written to it. Will update the
+  // cluster's size if |writer_| is seekable. Returns true on success.
+  // Inputs:
+  //   set_last_frame_duration: Boolean indicating whether or not the duration
+  //                            of the last frame should be set. If set to
+  //                            false, the |duration| value is ignored and
+  //                            |write_last_frame_with_duration_| will not be
+  //                            honored.
+  //   duration: Duration of the Cluster in timecode scale.
+  bool Finalize(bool set_last_frame_duration, uint64_t duration);
+
+  // Returns the size in bytes for the entire Cluster element.
+  uint64_t Size() const;
+
+  // Given |abs_timecode|, calculates timecode relative to most recent timecode.
+  // Returns -1 on failure, or a relative timecode.
+  int64_t GetRelativeTimecode(int64_t abs_timecode) const;
+
+  int64_t size_position() const { return size_position_; }
+  int32_t blocks_added() const { return blocks_added_; }
+  uint64_t payload_size() const { return payload_size_; }
+  int64_t position_for_cues() const { return position_for_cues_; }
+  uint64_t timecode() const { return timecode_; }
+  uint64_t timecode_scale() const { return timecode_scale_; }
+  void set_write_last_frame_with_duration(bool write_last_frame_with_duration) {
+    write_last_frame_with_duration_ = write_last_frame_with_duration;
+  }
+  bool write_last_frame_with_duration() const {
+    return write_last_frame_with_duration_;
+  }
+
+ private:
+  // Iterator type for the |stored_frames_| map.
+  typedef std::map<uint64_t, std::list<Frame*> >::iterator FrameMapIterator;
+
+  // Utility method that confirms that blocks can still be added, and that the
+  // cluster header has been written. Used by |DoWriteFrame*|. Returns true
+  // when successful.
+  bool PreWriteBlock();
+
+  // Utility method used by the |DoWriteFrame*| methods that handles the book
+  // keeping required after each block is written.
+  void PostWriteBlock(uint64_t element_size);
+
+  // Does some verification and calls WriteFrame.
+  bool DoWriteFrame(const Frame* const frame);
+
+  // Either holds back the given frame, or writes it out depending on whether
+  // or not |write_last_frame_with_duration_| is set.
+  bool QueueOrWriteFrame(const Frame* const frame);
+
+  // Outputs the Cluster header to |writer_|. Returns true on success.
+  bool WriteClusterHeader();
+
+  // Number of blocks added to the cluster.
+  int32_t blocks_added_;
+
+  // Flag telling if the cluster has been closed.
+  bool finalized_;
+
+  // Flag indicating whether the cluster's timecode will always be written out
+  // using 8 bytes.
+  bool fixed_size_timecode_;
+
+  // Flag telling if the cluster's header has been written.
+  bool header_written_;
+
+  // The size of the cluster elements in bytes.
+  uint64_t payload_size_;
+
+  // The file position used for cue points.
+  const int64_t position_for_cues_;
+
+  // The file position of the cluster's size element.
+  int64_t size_position_;
+
+  // The absolute timecode of the cluster.
+  const uint64_t timecode_;
+
+  // The timecode scale of the Segment containing the cluster.
+  const uint64_t timecode_scale_;
+
+  // Flag indicating whether the last frame of the cluster should be written as
+  // a Block with Duration. If set to true, then it will result in holding back
+  // of frames and the parameterized version of Finalize() must be called to
+  // finish writing the Cluster.
+  bool write_last_frame_with_duration_;
+
+  // Map used to hold back frames, if required. Track number is the key.
+  std::map<uint64_t, std::list<Frame*> > stored_frames_;
+
+  // Map from track number to the timestamp of the last block written for that
+  // track.
+  std::map<uint64_t, int64_t> last_block_timestamp_;
+
+  // Pointer to the writer object. Not owned by this class.
+  IMkvWriter* writer_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Cluster);
+};
+
+///////////////////////////////////////////////////////////////
+// SeekHead element
+class SeekHead {
+ public:
+  SeekHead();
+  ~SeekHead();
+
+  // TODO(fgalligan): Change this to reserve a certain size. Then check how
+  // big the seek entry to be added is as not every seek entry will be the
+  // maximum size it could be.
+  // Adds a seek entry to be written out when the element is finalized. |id|
+  // must be the coded mkv element id. |pos| is the file position of the
+  // element. Returns true on success.
+  bool AddSeekEntry(uint32_t id, uint64_t pos);
+
+  // Writes out SeekHead and SeekEntry elements. Returns true on success.
+  bool Finalize(IMkvWriter* writer) const;
+
+  // Returns the id of the Seek Entry at the given index. Returns -1 if index
+  // is out of range.
+  uint32_t GetId(int index) const;
+
+  // Returns the position of the Seek Entry at the given index. Returns -1 if
+  // index is out of range.
+  uint64_t GetPosition(int index) const;
+
+  // Sets the Seek Entry id and position at given index.
+  // Returns true on success.
+  bool SetSeekEntry(int index, uint32_t id, uint64_t position);
+
+  // Reserves space by writing out a Void element which will be updated with
+  // a SeekHead element later. Returns true on success.
+  bool Write(IMkvWriter* writer);
+
+  // We are going to put a cap on the number of Seek Entries.
+  const static int32_t kSeekEntryCount = 5;
+
+ private:
+  // Returns the maximum size in bytes of one seek entry.
+  uint64_t MaxEntrySize() const;
+
+  // Seek entry id element list.
+  uint32_t seek_entry_id_[kSeekEntryCount];
+
+  // Seek entry pos element list.
+  uint64_t seek_entry_pos_[kSeekEntryCount];
+
+  // The file position of SeekHead element.
+  int64_t start_pos_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(SeekHead);
+};
+
+///////////////////////////////////////////////////////////////
+// Segment Information element
+class SegmentInfo {
+ public:
+  SegmentInfo();
+  ~SegmentInfo();
+
+  // Will update the duration if |duration_| is > 0.0. Returns true on success.
+  bool Finalize(IMkvWriter* writer) const;
+
+  // Sets |muxing_app_| and |writing_app_|.
+  bool Init();
+
+  // Output the Segment Information element to the writer. Returns true on
+  // success.
+  bool Write(IMkvWriter* writer);
+
+  void set_duration(double duration) { duration_ = duration; }
+  double duration() const { return duration_; }
+  void set_muxing_app(const char* app);
+  const char* muxing_app() const { return muxing_app_; }
+  void set_timecode_scale(uint64_t scale) { timecode_scale_ = scale; }
+  uint64_t timecode_scale() const { return timecode_scale_; }
+  void set_writing_app(const char* app);
+  const char* writing_app() const { return writing_app_; }
+  void set_date_utc(int64_t date_utc) { date_utc_ = date_utc; }
+  int64_t date_utc() const { return date_utc_; }
+
+ private:
+  // Segment Information element names.
+  // Initially set to -1 to signify that a duration has not been set and should
+  // not be written out.
+  double duration_;
+  // Set to libwebm-%d.%d.%d.%d, major, minor, build, revision.
+  char* muxing_app_;
+  uint64_t timecode_scale_;
+  // Initially set to libwebm-%d.%d.%d.%d, major, minor, build, revision.
+  char* writing_app_;
+  // LLONG_MIN when DateUTC is not set.
+  int64_t date_utc_;
+
+  // The file position of the duration element.
+  int64_t duration_pos_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(SegmentInfo);
+};
+
+///////////////////////////////////////////////////////////////
+// This class represents the main segment in a WebM file. Currently only
+// supports one Segment element.
+//
+// Notes:
+//  |Init| must be called before any other method in this class.
+class Segment {
+ public:
+  enum Mode { kLive = 0x1, kFile = 0x2 };
+
+  enum CuesPosition {
+    kAfterClusters = 0x0,  // Position Cues after Clusters - Default
+    kBeforeClusters = 0x1  // Position Cues before Clusters
+  };
+
+  static const uint32_t kDefaultDocTypeVersion = 4;
+  static const uint64_t kDefaultMaxClusterDuration = 30000000000ULL;
+
+  Segment();
+  ~Segment();
+
+  // Initializes |SegmentInfo| and returns result. Always returns false when
+  // |ptr_writer| is NULL.
+  bool Init(IMkvWriter* ptr_writer);
+
+  // Adds a generic track to the segment. Returns the newly-allocated
+  // track object (which is owned by the segment) on success, NULL on
+  // error. |number| is the number to use for the track. |number|
+  // must be >= 0. If |number| == 0 then the muxer will decide on the
+  // track number.
+  Track* AddTrack(int32_t number);
+
+  // Adds a Vorbis audio track to the segment. Returns the number of the track
+  // on success, 0 on error. |number| is the number to use for the audio track.
+  // |number| must be >= 0. If |number| == 0 then the muxer will decide on
+  // the track number.
+  uint64_t AddAudioTrack(int32_t sample_rate, int32_t channels, int32_t number);
+
+  // Adds an empty chapter to the chapters of this segment. Returns
+  // non-NULL on success. After adding the chapter, the caller should
+  // populate its fields via the Chapter member functions.
+  Chapter* AddChapter();
+
+  // Adds an empty tag to the tags of this segment. Returns
+  // non-NULL on success. After adding the tag, the caller should
+  // populate its fields via the Tag member functions.
+  Tag* AddTag();
+
+  // Adds a cue point to the Cues element. |timestamp| is the time in
+  // nanoseconds of the cue's time. |track| is the Track of the Cue. This
+  // function must be called after AddFrame to calculate the correct
+  // BlockNumber for the CuePoint. Returns true on success.
+  bool AddCuePoint(uint64_t timestamp, uint64_t track);
+
+  // Adds a frame to be output in the file. Returns true on success.
+  // Inputs:
+  //   data: Pointer to the data
+  //   length: Length of the data
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions.
+  //   timestamp:    Timestamp of the frame in nanoseconds from 0.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrame(const uint8_t* data, uint64_t length, uint64_t track_number,
+                uint64_t timestamp_ns, bool is_key);
+
+  // Writes a frame of metadata to the output medium; returns true on
+  // success.
+  // Inputs:
+  //   data: Pointer to the data
+  //   length: Length of the data
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions.
+  //   timecode:     Absolute timestamp of the metadata frame, expressed
+  //                 in nanosecond units.
+  //   duration:     Duration of metadata frame, in nanosecond units.
+  //
+  // The metadata frame is written as a block group, with a duration
+  // sub-element but no reference time sub-elements (indicating that
+  // it is considered a keyframe, per Matroska semantics).
+  bool AddMetadata(const uint8_t* data, uint64_t length, uint64_t track_number,
+                   uint64_t timestamp_ns, uint64_t duration_ns);
+
+  // Writes a frame with additional data to the output medium; returns true on
+  // success.
+  // Inputs:
+  //   data: Pointer to the data.
+  //   length: Length of the data.
+  //   additional: Pointer to additional data.
+  //   additional_length: Length of additional data.
+  //   add_id: Additional ID which identifies the type of additional data.
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions.
+  //   timestamp:    Absolute timestamp of the frame, expressed in nanosecond
+  //                 units.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrameWithAdditional(const uint8_t* data, uint64_t length,
+                              const uint8_t* additional,
+                              uint64_t additional_length, uint64_t add_id,
+                              uint64_t track_number, uint64_t timestamp,
+                              bool is_key);
+
+  // Writes a frame with DiscardPadding to the output medium; returns true on
+  // success.
+  // Inputs:
+  //   data: Pointer to the data.
+  //   length: Length of the data.
+  //   discard_padding: DiscardPadding element value.
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions.
+  //   timestamp:    Absolute timestamp of the frame, expressed in nanosecond
+  //                 units.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length,
+                                  int64_t discard_padding,
+                                  uint64_t track_number, uint64_t timestamp,
+                                  bool is_key);
+
+  // Writes a Frame to the output medium. Chooses the correct way of writing
+  // the frame (Block vs SimpleBlock) based on the parameters passed.
+  // Inputs:
+  //   frame: frame object
+  bool AddGenericFrame(const Frame* frame);
+
+  // Adds a VP8 video track to the segment. Returns the number of the track on
+  // success, 0 on error. |number| is the number to use for the video track.
+  // |number| must be >= 0. If |number| == 0 then the muxer will decide on
+  // the track number.
+  uint64_t AddVideoTrack(int32_t width, int32_t height, int32_t number);
+
+  // This function must be called after Finalize() if you need a copy of the
+  // output with Cues written before the Clusters. It will return false if the
+  // writer is not seekable or if chunking is set to true.
+  // Input parameters:
+  // reader - an IMkvReader object created with the same underlying file of the
+  //          current writer object. Make sure to close the existing writer
+  //          object before creating this so that all the data is properly
+  //          flushed and available for reading.
+  // writer - an IMkvWriter object pointing to a *different* file than the one
+  //          pointed by the current writer object. This file will contain the
+  //          Cues element before the Clusters.
+  bool CopyAndMoveCuesBeforeClusters(mkvparser::IMkvReader* reader,
+                                     IMkvWriter* writer);
+
+  // Sets which track to use for the Cues element. Must have added the track
+  // before calling this function. Returns true on success. |track_number| is
+  // returned by the Add track functions.
+  bool CuesTrack(uint64_t track_number);
+
+  // This will force the muxer to create a new Cluster when the next frame is
+  // added.
+  void ForceNewClusterOnNextFrame();
+
+  // Writes out any frames that have not been written out. Finalizes the last
+  // cluster. May update the size and duration of the segment. May output the
+  // Cues element. May finalize the SeekHead element. Returns true on success.
+  bool Finalize();
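+
+  // A minimal end-to-end sketch (|frame_data| and |frame_size| are a
+  // hypothetical encoded frame; error handling omitted):
+  //   MkvWriter writer;
+  //   writer.Open("out.webm");
+  //   Segment segment;
+  //   segment.Init(&writer);
+  //   const uint64_t video_track = segment.AddVideoTrack(640, 360, 0);
+  //   segment.AddFrame(frame_data, frame_size, video_track, 0, true);
+  //   segment.Finalize();
+  //   writer.Close();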
+
+  // Returns the Cues object.
+  Cues* GetCues() { return &cues_; }
+
+  // Returns the Segment Information object.
+  const SegmentInfo* GetSegmentInfo() const { return &segment_info_; }
+  SegmentInfo* GetSegmentInfo() { return &segment_info_; }
+
+  // Search the Tracks and return the track that matches |track_number|.
+  // Returns NULL if there is no track match.
+  Track* GetTrackByNumber(uint64_t track_number) const;
+
+  // Toggles whether to output a cues element.
+  void OutputCues(bool output_cues);
+
+  // Toggles whether to write the last frame in each Cluster with Duration.
+  void AccurateClusterDuration(bool accurate_cluster_duration);
+
+  // Toggles whether to write the Cluster Timecode using exactly 8 bytes.
+  void UseFixedSizeClusterTimecode(bool fixed_size_cluster_timecode);
+
+  // Sets if the muxer will output files in chunks or not. |chunking| is a
+  // flag telling whether or not to turn on chunking. |filename| is the base
+  // filename for the chunk files. The header chunk file will be named
+  // |filename|.hdr and the data chunks will be named
+  // |filename|_XXXXXX.chk. Chunking implies that the muxer will be writing
+  // to files so the muxer will use the default MkvWriter class to control
+  // what data is written to what files. Returns true on success.
+  // TODO: Should we change the IMkvWriter Interface to add Open and Close?
+  // That will force the interface to be dependent on files.
+  bool SetChunking(bool chunking, const char* filename);
+
+  bool chunking() const { return chunking_; }
+  uint64_t cues_track() const { return cues_track_; }
+  void set_max_cluster_duration(uint64_t max_cluster_duration) {
+    max_cluster_duration_ = max_cluster_duration;
+  }
+  uint64_t max_cluster_duration() const { return max_cluster_duration_; }
+  void set_max_cluster_size(uint64_t max_cluster_size) {
+    max_cluster_size_ = max_cluster_size;
+  }
+  uint64_t max_cluster_size() const { return max_cluster_size_; }
+  void set_mode(Mode mode) { mode_ = mode; }
+  Mode mode() const { return mode_; }
+  CuesPosition cues_position() const { return cues_position_; }
+  bool output_cues() const { return output_cues_; }
+  void set_estimate_file_duration(bool estimate_duration) {
+    estimate_file_duration_ = estimate_duration;
+  }
+  bool estimate_file_duration() const { return estimate_file_duration_; }
+  const SegmentInfo* segment_info() const { return &segment_info_; }
+  void set_duration(double duration) { duration_ = duration; }
+  double duration() const { return duration_; }
+
+  // Returns true when codec IDs are valid for WebM.
+  bool DocTypeIsWebm() const;
+
+ private:
+  // Checks if header information has been output and initialized. If not it
+  // will output the Segment element and initialize the SeekHead element and
+  // Cues elements.
+  bool CheckHeaderInfo();
+
+  // Sets |doc_type_version_| based on the current element requirements.
+  void UpdateDocTypeVersion();
+
+  // Sets |name| according to how many chunks have been written. |ext| is the
+  // file extension. |name| must be deleted by the calling app. Returns true
+  // on success.
+  bool UpdateChunkName(const char* ext, char** name) const;
+
+  // Returns the maximum offset within the segment's payload. When chunking
+  // this function is needed to determine offsets of elements within the
+  // chunked files. Returns -1 on error.
+  int64_t MaxOffset();
+
+  // Adds the frame to our frame array.
+  bool QueueFrame(Frame* frame);
+
+  // Output all frames that are queued. Returns -1 on error, otherwise
+  // it returns the number of frames written.
+  int WriteFramesAll();
+
+  // Output all frames that are queued that have an end time that is less
+  // than |timestamp|. Returns true on success and if there are no frames
+  // queued.
+  bool WriteFramesLessThan(uint64_t timestamp);
+
+  // Outputs the segment header, Segment Information element, SeekHead element,
+  // and Tracks element to |writer_|.
+  bool WriteSegmentHeader();
+
+  // Given a frame with the specified timestamp (nanosecond units) and
+  // keyframe status, determine whether a new cluster should be
+  // created, before writing enqueued frames and the frame itself. The
+  // function returns one of the following values:
+  //  -1 = error: an out-of-order frame was detected
+  //   0 = do not create a new cluster, and write frame to the existing cluster
+  //   1 = create a new cluster, and write frame to that new cluster
+  //   2 = create a new cluster, and re-run test
+  int TestFrame(uint64_t track_num, uint64_t timestamp_ns, bool key) const;
+
+  // Create a new cluster, using the earlier of the first enqueued
+  // frame, or the indicated time. Returns true on success.
+  bool MakeNewCluster(uint64_t timestamp_ns);
+
+  // Checks whether a new cluster needs to be created, and if so
+  // creates a new cluster. Returns false if creation of a new cluster
+  // was necessary but creation was not successful.
+  bool DoNewClusterProcessing(uint64_t track_num, uint64_t timestamp_ns,
+                              bool key);
+
+  // Adjusts Cue Point values (to place Cues before Clusters) so that they
+  // reflect the correct offsets.
+  void MoveCuesBeforeClusters();
+
+  // This function recursively computes the correct cluster offsets (this is
+  // done to move the Cues before Clusters). It recursively updates the change
+  // in size (which indicates a change in cluster offset) until no sizes change.
+  // Parameters:
+  // diff - indicates the difference in size of the Cues element that needs to
+  //        be accounted for.
+  // index - index in the list of Cues which is currently being adjusted.
+  // cue_size - sum of size of all the CuePoint elements.
+  void MoveCuesBeforeClustersHelper(uint64_t diff, int index,
+                                    uint64_t* cue_size);
+
+  // Seeds the random number generator used to make UIDs.
+  unsigned int seed_;
+
+  // WebM elements
+  Cues cues_;
+  SeekHead seek_head_;
+  SegmentInfo segment_info_;
+  Tracks tracks_;
+  Chapters chapters_;
+  Tags tags_;
+
+  // Number of chunks written.
+  int chunk_count_;
+
+  // Current chunk filename.
+  char* chunk_name_;
+
+  // Default MkvWriter object created by this class used for writing clusters
+  // out in separate files.
+  MkvWriter* chunk_writer_cluster_;
+
+  // Default MkvWriter object created by this class used for writing Cues
+  // element out to a file.
+  MkvWriter* chunk_writer_cues_;
+
+  // Default MkvWriter object created by this class used for writing the
+  // Matroska header out to a file.
+  MkvWriter* chunk_writer_header_;
+
+  // Flag telling whether or not the muxer is chunking output to multiple
+  // files.
+  bool chunking_;
+
+  // Base filename for the chunked files.
+  char* chunking_base_name_;
+
+  // File position offset where the Clusters end.
+  int64_t cluster_end_offset_;
+
+  // List of clusters.
+  Cluster** cluster_list_;
+
+  // Number of cluster pointers allocated in the cluster list.
+  int32_t cluster_list_capacity_;
+
+  // Number of clusters in the cluster list.
+  int32_t cluster_list_size_;
+
+  // Indicates whether Cues should be written before or after Clusters.
+  CuesPosition cues_position_;
+
+  // Track number that is associated with the cues element for this segment.
+  uint64_t cues_track_;
+
+  // Tells the muxer to force a new cluster on the next Block.
+  bool force_new_cluster_;
+
+  // List of stored audio frames. These variables are used to store frames so
+  // the muxer can follow the guideline "Audio blocks that contain the video
+  // key frame's timecode should be in the same cluster as the video key frame
+  // block."
+  Frame** frames_;
+
+  // Number of frame pointers allocated in the frame list.
+  int32_t frames_capacity_;
+
+  // Number of frames in the frame list.
+  int32_t frames_size_;
+
+  // Flag telling if a video track has been added to the segment.
+  bool has_video_;
+
+  // Flag telling if the segment's header has been written.
+  bool header_written_;
+
+  // Duration of the last block in nanoseconds.
+  uint64_t last_block_duration_;
+
+  // Last timestamp in nanoseconds added to a cluster.
+  uint64_t last_timestamp_;
+
+  // Last timestamp in nanoseconds by track number added to a cluster.
+  uint64_t last_track_timestamp_[kMaxTrackNumber];
+
+  // Number of frames written per track.
+  uint64_t track_frames_written_[kMaxTrackNumber];
+
+  // Maximum time in nanoseconds for a cluster duration. This variable is a
+  // guideline and some clusters may have a longer duration. Default is 30
+  // seconds.
+  uint64_t max_cluster_duration_;
+
+  // Maximum size in bytes for a cluster. This variable is a guideline and
+  // some clusters may have a larger size. Default is 0 which signifies that
+  // the muxer will decide the size.
+  uint64_t max_cluster_size_;
+
+  // The mode that segment is in. If set to |kLive| the writer must not
+  // seek backwards.
+  Mode mode_;
+
+  // Flag telling the muxer that a new cue point should be added.
+  bool new_cuepoint_;
+
+  // TODO(fgalligan): Should we add support for more than one Cues element?
+  // Flag whether or not the muxer should output a Cues element.
+  bool output_cues_;
+
+  // Flag whether or not the last frame in each Cluster will have a Duration
+  // element in it.
+  bool accurate_cluster_duration_;
+
+  // Flag whether or not to write the Cluster Timecode using exactly 8 bytes.
+  bool fixed_size_cluster_timecode_;
+
+  // Flag whether or not to estimate the file duration.
+  bool estimate_file_duration_;
+
+  // The size of the EBML header, used to validate the header if
+  // WriteEbmlHeader() is called more than once.
+  int32_t ebml_header_size_;
+
+  // The file position of the segment's payload.
+  int64_t payload_pos_;
+
+  // The file position of the element's size.
+  int64_t size_position_;
+
+  // Current DocTypeVersion (|doc_type_version_|) and that written in
+  // WriteSegmentHeader().
+  // WriteEbmlHeader() will be called from Finalize() if |doc_type_version_|
+  // differs from |doc_type_version_written_|.
+  uint32_t doc_type_version_;
+  uint32_t doc_type_version_written_;
+
+  // If |duration_| is > 0, then explicitly set the duration of the segment.
+  double duration_;
+
+  // Pointer to the writer objects. Not owned by this class.
+  IMkvWriter* writer_cluster_;
+  IMkvWriter* writer_cues_;
+  IMkvWriter* writer_header_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Segment);
+};
+
+}  // namespace mkvmuxer
+
+#endif  // MKVMUXER_MKVMUXER_H_
diff --git a/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxertypes.h b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxertypes.h
new file mode 100644
index 000000000..e5db12160
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxertypes.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVMUXER_MKVMUXERTYPES_H_
+#define MKVMUXER_MKVMUXERTYPES_H_
+
+namespace mkvmuxer {
+typedef unsigned char uint8;
+typedef short int16;
+typedef int int32;
+typedef unsigned int uint32;
+typedef long long int64;
+typedef unsigned long long uint64;
+}  // namespace mkvmuxer
+
+// Copied from Chromium basictypes.h
+// A macro to disallow the copy constructor and operator= functions
+// This should be used in the private: declarations for a class
+#define LIBWEBM_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&);                       \
+  void operator=(const TypeName&)
+
+#endif  // MKVMUXER_MKVMUXERTYPES_H_
diff --git a/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
new file mode 100644
index 000000000..6436817c9
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
@@ -0,0 +1,743 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvmuxer/mkvmuxerutil.h"
+
+#ifdef __ANDROID__
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <new>
+
+#include "common/webmids.h"
+#include "mkvmuxer/mkvmuxer.h"
+#include "mkvmuxer/mkvwriter.h"
+
+namespace mkvmuxer {
+
+namespace {
+
+// Date elements are always 8 octets in size.
+const int kDateElementSize = 8;
+
+uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode,
+                  uint64 timecode_scale) {
+  uint64 block_additional_elem_size = 0;
+  uint64 block_addid_elem_size = 0;
+  uint64 block_more_payload_size = 0;
+  uint64 block_more_elem_size = 0;
+  uint64 block_additions_payload_size = 0;
+  uint64 block_additions_elem_size = 0;
+  if (frame->additional()) {
+    block_additional_elem_size =
+        EbmlElementSize(libwebm::kMkvBlockAdditional, frame->additional(),
+                        frame->additional_length());
+    block_addid_elem_size = EbmlElementSize(
+        libwebm::kMkvBlockAddID, static_cast<uint64>(frame->add_id()));
+
+    block_more_payload_size =
+        block_addid_elem_size + block_additional_elem_size;
+    block_more_elem_size =
+        EbmlMasterElementSize(libwebm::kMkvBlockMore, block_more_payload_size) +
+        block_more_payload_size;
+    block_additions_payload_size = block_more_elem_size;
+    block_additions_elem_size =
+        EbmlMasterElementSize(libwebm::kMkvBlockAdditions,
+                              block_additions_payload_size) +
+        block_additions_payload_size;
+  }
+
+  uint64 discard_padding_elem_size = 0;
+  if (frame->discard_padding() != 0) {
+    discard_padding_elem_size =
+        EbmlElementSize(libwebm::kMkvDiscardPadding,
+                        static_cast<int64>(frame->discard_padding()));
+  }
+
+  const uint64 reference_block_timestamp =
+      frame->reference_block_timestamp() / timecode_scale;
+  uint64 reference_block_elem_size = 0;
+  if (!frame->is_key()) {
+    reference_block_elem_size =
+        EbmlElementSize(libwebm::kMkvReferenceBlock, reference_block_timestamp);
+  }
+
+  const uint64 duration = frame->duration() / timecode_scale;
+  uint64 block_duration_elem_size = 0;
+  if (duration > 0)
+    block_duration_elem_size =
+        EbmlElementSize(libwebm::kMkvBlockDuration, duration);
+
+  const uint64 block_payload_size = 4 + frame->length();
+  const uint64 block_elem_size =
+      EbmlMasterElementSize(libwebm::kMkvBlock, block_payload_size) +
+      block_payload_size;
+
+  const uint64 block_group_payload_size =
+      block_elem_size + block_additions_elem_size + block_duration_elem_size +
+      discard_padding_elem_size + reference_block_elem_size;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockGroup,
+                              block_group_payload_size)) {
+    return 0;
+  }
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlock, block_payload_size))
+    return 0;
+
+  if (WriteUInt(writer, frame->track_number()))
+    return 0;
+
+  if (SerializeInt(writer, timecode, 2))
+    return 0;
+
+  // For a Block, flags is always 0.
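+  // (The "4 +" in block_payload_size above is this fixed Block header: a
+  // 1-byte coded track number, valid for track numbers 1-126, plus the
+  // 2-byte signed relative timecode and this 1-byte flags field.)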
+  if (SerializeInt(writer, 0, 1))
+    return 0;
+
+  if (writer->Write(frame->frame(), static_cast<uint32>(frame->length())))
+    return 0;
+
+  if (frame->additional()) {
+    if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockAdditions,
+                                block_additions_payload_size)) {
+      return 0;
+    }
+
+    if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockMore,
+                                block_more_payload_size))
+      return 0;
+
+    if (!WriteEbmlElement(writer, libwebm::kMkvBlockAddID,
+                          static_cast<uint64>(frame->add_id())))
+      return 0;
+
+    if (!WriteEbmlElement(writer, libwebm::kMkvBlockAdditional,
+                          frame->additional(), frame->additional_length())) {
+      return 0;
+    }
+  }
+
+  if (frame->discard_padding() != 0 &&
+      !WriteEbmlElement(writer, libwebm::kMkvDiscardPadding,
+                        static_cast<int64>(frame->discard_padding()))) {
+    return 0;
+  }
+
+  if (!frame->is_key() && !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock,
+                                            reference_block_timestamp)) {
+    return 0;
+  }
+
+  if (duration > 0 &&
+      !WriteEbmlElement(writer, libwebm::kMkvBlockDuration, duration)) {
+    return 0;
+  }
+  return EbmlMasterElementSize(libwebm::kMkvBlockGroup,
+                               block_group_payload_size) +
+         block_group_payload_size;
+}
+
+uint64 WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame,
+                        int64 timecode) {
+  if (WriteID(writer, libwebm::kMkvSimpleBlock))
+    return 0;
+
+  const int32 size = static_cast<int32>(frame->length()) + 4;
+  if (WriteUInt(writer, size))
+    return 0;
+
+  if (WriteUInt(writer, static_cast<uint64>(frame->track_number())))
+    return 0;
+
+  if (SerializeInt(writer, timecode, 2))
+    return 0;
+
+  uint64 flags = 0;
+  if (frame->is_key())
+    flags |= 0x80;
+
+  if (SerializeInt(writer, flags, 1))
+    return 0;
+
+  if (writer->Write(frame->frame(), static_cast<uint32>(frame->length())))
+    return 0;
+
+  return GetUIntSize(libwebm::kMkvSimpleBlock) + GetCodedUIntSize(size) + 4 +
+         frame->length();
+}
+
+}  // namespace
+
+int32 GetCodedUIntSize(uint64 value) {
+  if (value < 0x000000000000007FULL)
+    return 1;
+  else if (value < 0x0000000000003FFFULL)
+    return 2;
+  else if (value < 0x00000000001FFFFFULL)
+    return 3;
+  else if (value < 0x000000000FFFFFFFULL)
+    return 4;
+  else if (value < 0x00000007FFFFFFFFULL)
+    return 5;
+  else if (value < 0x000003FFFFFFFFFFULL)
+    return 6;
+  else if (value < 0x0001FFFFFFFFFFFFULL)
+    return 7;
+  return 8;
+}
+
+int32 GetUIntSize(uint64 value) {
+  if (value < 0x0000000000000100ULL)
+    return 1;
+  else if (value < 0x0000000000010000ULL)
+    return 2;
+  else if (value < 0x0000000001000000ULL)
+    return 3;
+  else if (value < 0x0000000100000000ULL)
+    return 4;
+  else if (value < 0x0000010000000000ULL)
+    return 5;
+  else if (value < 0x0001000000000000ULL)
+    return 6;
+  else if (value < 0x0100000000000000ULL)
+    return 7;
+  return 8;
+}
+
+int32 GetIntSize(int64 value) {
+  // Doubling the requested value ensures positive values with their high bit
+  // set are written with 0-padding to avoid flipping the signedness.
+  const uint64 v = (value < 0) ? value ^ -1LL : value;
+  return GetUIntSize(2 * v);
+}
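+
+// For illustration: the 7-bit-per-byte EBML coded form means
+// GetCodedUIntSize(126) == 1 while GetCodedUIntSize(127) == 2, whereas the
+// plain big-endian form gives GetUIntSize(255) == 1 and GetUIntSize(256) == 2.
+// GetIntSize maps both signs onto one magnitude first: value ^ -1LL is
+// ~value, so GetIntSize(-1) == GetIntSize(0) == 1.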
+
+uint64 EbmlMasterElementSize(uint64 type, uint64 value) {
+  // Size of EBML ID
+  int32 ebml_size = GetUIntSize(type);
+
+  // Datasize
+  ebml_size += GetCodedUIntSize(value);
+
+  return ebml_size;
+}
+
+uint64 EbmlElementSize(uint64 type, int64 value) {
+  // Size of EBML ID
+  int32 ebml_size = GetUIntSize(type);
+
+  // Datasize
+  ebml_size += GetIntSize(value);
+
+  // Size of Datasize
+  ebml_size++;
+
+  return ebml_size;
+}
+
+uint64 EbmlElementSize(uint64 type, uint64 value) {
+  return EbmlElementSize(type, value, 0);
+}
+
+uint64 EbmlElementSize(uint64 type, uint64 value, uint64 fixed_size) {
+  // Size of EBML ID
+  uint64 ebml_size = GetUIntSize(type);
+
+  // Datasize
+  ebml_size += (fixed_size > 0) ? fixed_size : GetUIntSize(value);
+
+  // Size of Datasize
+  ebml_size++;
+
+  return ebml_size;
+}
+
+uint64 EbmlElementSize(uint64 type, float /* value */) {
+  // Size of EBML ID
+  uint64 ebml_size = GetUIntSize(type);
+
+  // Datasize
+  ebml_size += sizeof(float);
+
+  // Size of Datasize
+  ebml_size++;
+
+  return ebml_size;
+}
+
+uint64 EbmlElementSize(uint64 type, const char* value) {
+  if (!value)
+    return 0;
+
+  // Size of EBML ID
+  uint64 ebml_size = GetUIntSize(type);
+
+  // Datasize
+  ebml_size += strlen(value);
+
+  // Size of Datasize
+  ebml_size += GetCodedUIntSize(strlen(value));
+
+  return ebml_size;
+}
+
+uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size) {
+  if (!value)
+    return 0;
+
+  // Size of EBML ID
+  uint64 ebml_size = GetUIntSize(type);
+
+  // Datasize
+  ebml_size += size;
+
+  // Size of Datasize
+  ebml_size += GetCodedUIntSize(size);
+
+  return ebml_size;
+}
+
+uint64 EbmlDateElementSize(uint64 type) {
+  // Size of EBML ID
+  uint64 ebml_size = GetUIntSize(type);
+
+  // Datasize
+  ebml_size += kDateElementSize;
+
+  // Size of Datasize
+  ebml_size++;
+
+  return ebml_size;
+}
+
+int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size) {
+  if (!writer || size < 1 || size > 8)
+    return -1;
+
+  for (int32 i = 1; i <= size; ++i) {
+    const int32 byte_count = size - i;
+    const int32 bit_count = byte_count * 8;
+
+    const int64 bb = value >> bit_count;
+    const uint8 b = static_cast<uint8>(bb);
+
+    const int32 status = writer->Write(&b, 1);
+
+    if (status < 0)
+      return status;
+  }
+
+  return 0;
+}
+
+int32 SerializeFloat(IMkvWriter* writer, float f) {
+  if (!writer)
+    return -1;
+
+  assert(sizeof(uint32) == sizeof(float));
+  // This union is merely used to avoid a reinterpret_cast from float& to
+  // uint32&, which would violate strict aliasing.
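+  // (Copying the bytes with memcpy into a uint32 would be an equally valid
+  // alternative to the union.)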
+  union U32 {
+    uint32 u32;
+    float f;
+  } value;
+  value.f = f;
+
+  for (int32 i = 1; i <= 4; ++i) {
+    const int32 byte_count = 4 - i;
+    const int32 bit_count = byte_count * 8;
+
+    const uint8 byte = static_cast<uint8>(value.u32 >> bit_count);
+
+    const int32 status = writer->Write(&byte, 1);
+
+    if (status < 0)
+      return status;
+  }
+
+  return 0;
+}
+
+int32 WriteUInt(IMkvWriter* writer, uint64 value) {
+  if (!writer)
+    return -1;
+
+  int32 size = GetCodedUIntSize(value);
+
+  return WriteUIntSize(writer, value, size);
+}
+
+int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size) {
+  if (!writer || size < 0 || size > 8)
+    return -1;
+
+  if (size > 0) {
+    const uint64 bit = 1LL << (size * 7);
+
+    if (value > (bit - 2))
+      return -1;
+
+    value |= bit;
+  } else {
+    size = 1;
+    int64 bit;
+
+    for (;;) {
+      bit = 1LL << (size * 7);
+      const uint64 max = bit - 2;
+
+      if (value <= max)
+        break;
+
+      ++size;
+    }
+
+    if (size > 8)
+      return -1;
+
+    value |= bit;
+  }
+
+  return SerializeInt(writer, value, size);
+}
+
+int32 WriteID(IMkvWriter* writer, uint64 type) {
+  if (!writer)
+    return -1;
+
+  writer->ElementStartNotify(type, writer->Position());
+
+  const int32 size = GetUIntSize(type);
+
+  return SerializeInt(writer, type, size);
+}
+
+bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 type, uint64 size) {
+  if (!writer)
+    return false;
+
+  if (WriteID(writer, type))
+    return false;
+
+  if (WriteUInt(writer, size))
+    return false;
+
+  return true;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value) {
+  return WriteEbmlElement(writer, type, value, 0);
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value,
+                      uint64 fixed_size) {
+  if (!writer)
+    return false;
+
+  if (WriteID(writer, type))
+    return false;
+
+  uint64 size = GetUIntSize(value);
+  if (fixed_size > 0) {
+    if (size > fixed_size)
+      return false;
+    size = fixed_size;
+  }
+  if (WriteUInt(writer, size))
+    return false;
+
+  if (SerializeInt(writer, value, static_cast<int32>(size)))
+    return false;
+
+  return true;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value) {
+  if (!writer)
+    return false;
+
+  if (WriteID(writer, type))
+    return false;
+
+  const uint64 size = GetIntSize(value);
+  if (WriteUInt(writer, size))
+    return false;
+
+  if (SerializeInt(writer, value, static_cast<int32>(size)))
+    return false;
+
+  return true;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value) {
+  if (!writer)
+    return false;
+
+  if (WriteID(writer, type))
+    return false;
+
+  if (WriteUInt(writer, 4))
+    return false;
+
+  if (SerializeFloat(writer, value))
+    return false;
+
+  return true;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value) {
+  if (!writer || !value)
+    return false;
+
+  if (WriteID(writer, type))
+    return false;
+
+  const uint64 length = strlen(value);
+  if (WriteUInt(writer, length))
+    return false;
+
+  if (writer->Write(value, static_cast<uint32>(length)))
+    return false;
+
+  return true;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
+                      uint64 size) {
+  if (!writer || !value || size < 1)
+    return false;
+
+  if (WriteID(writer, type))
+    return false;
+
+  if (WriteUInt(writer, size))
+    return false;
+
+  if (writer->Write(value, static_cast<uint32>(size)))
+    return false;
+
+  return true;
+}
+
+bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value) {
+  if (!writer)
+    return false;
+
+  if (WriteID(writer, type))
+    return false;
+
+  if (WriteUInt(writer, kDateElementSize))
+    return false;
+
+  if (SerializeInt(writer, value, kDateElementSize))
+    return false;
+
+  return true;
+}
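+
+// A hedged usage sketch for the date helper (Matroska's DateUTC element
+// stores signed nanoseconds relative to 2001-01-01T00:00:00 UTC;
+// |date_utc_ns| is a hypothetical value in that epoch):
+//   WriteEbmlDateElement(writer, libwebm::kMkvDateUTC, date_utc_ns);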
+
+uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
+                  Cluster* cluster) {
+  if (!writer || !frame || !frame->IsValid() || !cluster ||
+      !cluster->timecode_scale())
+    return 0;
+
+  // Technically the timecode for a block can be less than the
+  // timecode for the cluster itself (remember that block timecode
+  // is a signed, 16-bit integer). However, as a simplification we
+  // only permit non-negative cluster-relative timecodes for blocks.
+  const int64 relative_timecode = cluster->GetRelativeTimecode(
+      frame->timestamp() / cluster->timecode_scale());
+  if (relative_timecode < 0 || relative_timecode > kMaxBlockTimecode)
+    return 0;
+
+  return frame->CanBeSimpleBlock()
+             ? WriteSimpleBlock(writer, frame, relative_timecode)
+             : WriteBlock(writer, frame, relative_timecode,
+                          cluster->timecode_scale());
+}
+
+uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) {
+  if (!writer)
+    return 0;
+
+  // Subtract one for the void ID and the coded size.
+  uint64 void_entry_size = size - 1 - GetCodedUIntSize(size - 1);
+  uint64 void_size = EbmlMasterElementSize(libwebm::kMkvVoid, void_entry_size) +
+                     void_entry_size;
+
+  if (void_size != size)
+    return 0;
+
+  const int64 payload_position = writer->Position();
+  if (payload_position < 0)
+    return 0;
+
+  if (WriteID(writer, libwebm::kMkvVoid))
+    return 0;
+
+  if (WriteUInt(writer, void_entry_size))
+    return 0;
+
+  const uint8 value = 0;
+  for (int32 i = 0; i < static_cast<int32>(void_entry_size); ++i) {
+    if (writer->Write(&value, 1))
+      return 0;
+  }
+
+  const int64 stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64>(void_size))
+    return 0;
+
+  return void_size;
+}
+
+void GetVersion(int32* major, int32* minor, int32* build, int32* revision) {
+  *major = 0;
+  *minor = 2;
+  *build = 1;
+  *revision = 0;
+}
+
+uint64 MakeUID(unsigned int* seed) {
+  uint64 uid = 0;
+
+#ifdef __MINGW32__
+  srand(*seed);
+#endif
+
+  for (int i = 0; i < 7; ++i) {  // avoid problems with 8-byte values
+    uid <<= 8;
+
+// TODO(fgalligan): Move random number generation to platform specific code.
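+// (Entropy source by platform, as the branches below show: MSVC and MinGW
+// fall back to rand(), Android reads from /dev/urandom, and other POSIX
+// systems use rand_r() with the caller's |seed|.)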
+#ifdef _MSC_VER
+    (void)seed;
+    const int32 nn = rand();
+#elif __ANDROID__
+    (void)seed;
+    int32 temp_num = 1;
+    int fd = open("/dev/urandom", O_RDONLY);
+    if (fd != -1) {
+      read(fd, &temp_num, sizeof(temp_num));
+      close(fd);
+    }
+    const int32 nn = temp_num;
+#elif defined __MINGW32__
+    const int32 nn = rand();
+#else
+    const int32 nn = rand_r(seed);
+#endif
+    const int32 n = 0xFF & (nn >> 4);  // throw away low-order bits
+
+    uid |= n;
+  }
+
+  return uid;
+}
+
+bool IsMatrixCoefficientsValueValid(uint64_t value) {
+  switch (value) {
+    case mkvmuxer::Colour::kGbr:
+    case mkvmuxer::Colour::kBt709:
+    case mkvmuxer::Colour::kUnspecifiedMc:
+    case mkvmuxer::Colour::kReserved:
+    case mkvmuxer::Colour::kFcc:
+    case mkvmuxer::Colour::kBt470bg:
+    case mkvmuxer::Colour::kSmpte170MMc:
+    case mkvmuxer::Colour::kSmpte240MMc:
+    case mkvmuxer::Colour::kYcocg:
+    case mkvmuxer::Colour::kBt2020NonConstantLuminance:
+    case mkvmuxer::Colour::kBt2020ConstantLuminance:
+      return true;
+  }
+  return false;
+}
+
+bool IsChromaSitingHorzValueValid(uint64_t value) {
+  switch (value) {
+    case mkvmuxer::Colour::kUnspecifiedCsh:
+    case mkvmuxer::Colour::kLeftCollocated:
+    case mkvmuxer::Colour::kHalfCsh:
+      return true;
+  }
+  return false;
+}
+
+bool IsChromaSitingVertValueValid(uint64_t value) {
+  switch (value) {
+    case mkvmuxer::Colour::kUnspecifiedCsv:
+    case mkvmuxer::Colour::kTopCollocated:
+    case mkvmuxer::Colour::kHalfCsv:
+      return true;
+  }
+  return false;
+}
+
+bool IsColourRangeValueValid(uint64_t value) {
+  switch (value) {
+    case mkvmuxer::Colour::kUnspecifiedCr:
+    case mkvmuxer::Colour::kBroadcastRange:
+    case mkvmuxer::Colour::kFullRange:
+    case mkvmuxer::Colour::kMcTcDefined:
+      return true;
+  }
+  return false;
+}
+
+bool IsTransferCharacteristicsValueValid(uint64_t value) {
+  switch (value) {
+    case mkvmuxer::Colour::kIturBt709Tc:
+    case mkvmuxer::Colour::kUnspecifiedTc:
+    case mkvmuxer::Colour::kReservedTc:
+    case mkvmuxer::Colour::kGamma22Curve:
+    case mkvmuxer::Colour::kGamma28Curve:
+    case mkvmuxer::Colour::kSmpte170MTc:
+    case mkvmuxer::Colour::kSmpte240MTc:
+    case mkvmuxer::Colour::kLinear:
+    case mkvmuxer::Colour::kLog:
+    case mkvmuxer::Colour::kLogSqrt:
+    case mkvmuxer::Colour::kIec6196624:
+    case mkvmuxer::Colour::kIturBt1361ExtendedColourGamut:
+    case mkvmuxer::Colour::kIec6196621:
+    case mkvmuxer::Colour::kIturBt202010bit:
+    case mkvmuxer::Colour::kIturBt202012bit:
+    case mkvmuxer::Colour::kSmpteSt2084:
+    case mkvmuxer::Colour::kSmpteSt4281Tc:
+    case mkvmuxer::Colour::kAribStdB67Hlg:
+      return true;
+  }
+  return false;
+}
+
+bool IsPrimariesValueValid(uint64_t value) {
+  switch (value) {
+    case mkvmuxer::Colour::kReservedP0:
+    case mkvmuxer::Colour::kIturBt709P:
+    case mkvmuxer::Colour::kUnspecifiedP:
+    case mkvmuxer::Colour::kReservedP3:
+    case mkvmuxer::Colour::kIturBt470M:
+    case mkvmuxer::Colour::kIturBt470Bg:
+    case mkvmuxer::Colour::kSmpte170MP:
+    case mkvmuxer::Colour::kSmpte240MP:
+    case mkvmuxer::Colour::kFilm:
+    case mkvmuxer::Colour::kIturBt2020:
+    case mkvmuxer::Colour::kSmpteSt4281P:
+    case mkvmuxer::Colour::kJedecP22Phosphors:
+      return true;
+  }
+  return false;
+}
+
+}  // namespace mkvmuxer
diff --git a/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.h b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
new file mode 100644
index 000000000..3355428bd
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
@@ -0,0 +1,115 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef MKVMUXER_MKVMUXERUTIL_H_
+#define MKVMUXER_MKVMUXERUTIL_H_
+
+#include "mkvmuxertypes.h"
+
+#include "stdint.h"
+
+namespace mkvmuxer {
+class Cluster;
+class Frame;
+class IMkvWriter;
+
+// TODO(tomfinegan): mkvmuxer:: integer types continue to be used here because
+// changing them causes pain for downstream projects. It would be nice to find
+// a solution that allows removal of the mkvmuxer:: integer types while
+// avoiding pain for downstream users of libwebm. Considering that
+// mkvmuxerutil.{cc,h} are really, for the great majority of cases, EBML size
+// calculation and writer functions, perhaps a more EBML focused utility would
+// be the way to go as a first step.
+
+const uint64 kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL;
+const int64 kMaxBlockTimecode = 0x07FFFLL;
+
+// Writes out |value| in Big Endian order. Returns 0 on success.
+int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size);
+
+// Writes out |f| in Big Endian order. Returns 0 on success.
+int32 SerializeFloat(IMkvWriter* writer, float f);
+
+// Returns the size in bytes of the element.
+int32 GetUIntSize(uint64 value);
+int32 GetIntSize(int64 value);
+int32 GetCodedUIntSize(uint64 value);
+uint64 EbmlMasterElementSize(uint64 type, uint64 value);
+uint64 EbmlElementSize(uint64 type, int64 value);
+uint64 EbmlElementSize(uint64 type, uint64 value);
+uint64 EbmlElementSize(uint64 type, float value);
+uint64 EbmlElementSize(uint64 type, const char* value);
+uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size);
+uint64 EbmlDateElementSize(uint64 type);
+
+// Returns the size in bytes of the element assuming that the element was
+// written using |fixed_size| bytes. If |fixed_size| is set to zero, then it
+// computes the necessary number of bytes based on |value|.
+uint64 EbmlElementSize(uint64 type, uint64 value, uint64 fixed_size);
+
+// Creates an EBML coded number from |value| and writes it out. The size of
+// the coded number is determined by the value of |value|. |value| must not
+// be in a coded form. Returns 0 on success.
+int32 WriteUInt(IMkvWriter* writer, uint64 value);
+
+// Creates an EBML coded number from |value| and writes it out. The size of
+// the coded number is determined by the value of |size|. |value| must not
+// be in a coded form. Returns 0 on success.
+int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size);
+
+// Output an Mkv master element. Returns true if the element was written.
+bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 value, uint64 size);
+
+// Outputs an Mkv ID, calls |IMkvWriter::ElementStartNotify|, and passes the
+// ID to |SerializeInt|. Returns 0 on success.
+int32 WriteID(IMkvWriter* writer, uint64 type);
+
+// Output an Mkv non-master element. Returns true if the element was written.
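+// For example, writing the default nanosecond timecode scale might look like
+// this (a hedged sketch):
+//   WriteEbmlElement(writer, libwebm::kMkvTimecodeScale, 1000000ULL);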
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
+                      uint64 size);
+bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value);
+
+// Output an Mkv non-master element using fixed size. The element will be
+// written out using exactly |fixed_size| bytes. If |fixed_size| is set to
+// zero, then it computes the necessary number of bytes based on |value|.
+// Returns true if the element was written.
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value,
+                      uint64 fixed_size);
+
+// Output an Mkv Frame. It decides the correct element to write (Block vs
+// SimpleBlock) based on the parameters of the Frame.
+uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
+                  Cluster* cluster);
+
+// Output a void element. |size| must be the entire size in bytes that will be
+// void. The function will calculate the size of the void header and subtract
+// it from |size|.
+uint64 WriteVoidElement(IMkvWriter* writer, uint64 size);
+
+// Returns the version number of the muxer in |major|, |minor|, |build|,
+// and |revision|.
+void GetVersion(int32* major, int32* minor, int32* build, int32* revision);
+
+// Returns a random number to be used for UID, using |seed| to seed
+// the random-number generator (see POSIX rand_r() for semantics).
+uint64 MakeUID(unsigned int* seed);
+
+// Colour field validation helpers. All return true when |value| is valid.
+bool IsMatrixCoefficientsValueValid(uint64_t value);
+bool IsChromaSitingHorzValueValid(uint64_t value);
+bool IsChromaSitingVertValueValid(uint64_t value);
+bool IsColourRangeValueValid(uint64_t value);
+bool IsTransferCharacteristicsValueValid(uint64_t value);
+bool IsPrimariesValueValid(uint64_t value);
+
+}  // namespace mkvmuxer
+
+#endif  // MKVMUXER_MKVMUXERUTIL_H_
diff --git a/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvwriter.cc b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvwriter.cc
new file mode 100644
index 000000000..d668384d8
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvwriter.cc
@@ -0,0 +1,92 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvmuxer/mkvwriter.h"
+
+#include <sys/types.h>
+
+#ifdef _MSC_VER
+#include <share.h>  // for _SH_DENYWR
+#endif
+
+namespace mkvmuxer {
+
+MkvWriter::MkvWriter() : file_(NULL), writer_owns_file_(true) {}
+
+MkvWriter::MkvWriter(FILE* fp) : file_(fp), writer_owns_file_(false) {}
+
+MkvWriter::~MkvWriter() { Close(); }
+
+int32 MkvWriter::Write(const void* buffer, uint32 length) {
+  if (!file_)
+    return -1;
+
+  if (length == 0)
+    return 0;
+
+  if (buffer == NULL)
+    return -1;
+
+  const size_t bytes_written = fwrite(buffer, 1, length, file_);
+
+  return (bytes_written == length) ? 0 : -1;
+}
+
+bool MkvWriter::Open(const char* filename) {
+  if (filename == NULL)
+    return false;
+
+  if (file_)
+    return false;
+
+#ifdef _MSC_VER
+  file_ = _fsopen(filename, "wb", _SH_DENYWR);
+#else
+  file_ = fopen(filename, "wb");
+#endif
+  if (file_ == NULL)
+    return false;
+  return true;
+}
+
+void MkvWriter::Close() {
+  if (file_ && writer_owns_file_) {
+    fclose(file_);
+  }
+  file_ = NULL;
+}
+
+int64 MkvWriter::Position() const {
+  if (!file_)
+    return 0;
+
+#ifdef _MSC_VER
+  return _ftelli64(file_);
+#else
+  return ftell(file_);
+#endif
+}
+
+int32 MkvWriter::Position(int64 position) {
+  if (!file_)
+    return -1;
+
+#ifdef _MSC_VER
+  return _fseeki64(file_, position, SEEK_SET);
+#elif defined(_WIN32)
+  return fseeko64(file_, static_cast<off64_t>(position), SEEK_SET);
+#else
+  return fseeko(file_, static_cast<off_t>(position), SEEK_SET);
+#endif
+}
+
+bool MkvWriter::Seekable() const { return true; }
+
+void MkvWriter::ElementStartNotify(uint64, int64) {}
+
+}  // namespace mkvmuxer
diff --git a/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvwriter.h b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvwriter.h
new file mode 100644
index 000000000..4227c6374
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvwriter.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVMUXER_MKVWRITER_H_
+#define MKVMUXER_MKVWRITER_H_
+
+#include <stdio.h>
+
+#include "mkvmuxer/mkvmuxer.h"
+#include "mkvmuxer/mkvmuxertypes.h"
+
+namespace mkvmuxer {
+
+// Default implementation of the IMkvWriter interface on Windows.
+class MkvWriter : public IMkvWriter {
+ public:
+  MkvWriter();
+  explicit MkvWriter(FILE* fp);
+  virtual ~MkvWriter();
+
+  // IMkvWriter interface
+  virtual int64 Position() const;
+  virtual int32 Position(int64 position);
+  virtual bool Seekable() const;
+  virtual int32 Write(const void* buffer, uint32 length);
+  virtual void ElementStartNotify(uint64 element_id, int64 position);
+
+  // Creates and opens a file for writing. |filename| is the name of the file
+  // to open. This function will overwrite the contents of |filename|. Returns
+  // true on success.
+  bool Open(const char* filename);
+
+  // Closes an opened file.
+  void Close();
+
+ private:
+  // File handle to output file.
+  FILE* file_;
+  bool writer_owns_file_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(MkvWriter);
+};
+
+}  // namespace mkvmuxer
+
+#endif  // MKVMUXER_MKVWRITER_H_
diff --git a/libs/libaom/src/third_party/libwebm/mkvparser/mkvparser.cc b/libs/libaom/src/third_party/libwebm/mkvparser/mkvparser.cc
new file mode 100644
index 000000000..ace65bd59
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/mkvparser/mkvparser.cc
@@ -0,0 +1,8071 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
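+
+// mkvparser implements a pull parser for Matroska/WebM files: every element
+// is read as an (ID, size, payload) triple, with the variable-length ID and
+// size fields decoded by ReadID() and ReadUInt() below.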
+#include "mkvparser/mkvparser.h"
+
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#include <float.h>  // _isnan() / _finite()
+#define MSC_COMPAT
+#endif
+
+#include <cassert>
+#include <cfloat>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <memory>
+#include <new>
+
+#include "common/webmids.h"
+
+namespace mkvparser {
+const long long kStringElementSizeLimit = 20 * 1000 * 1000;
+const float MasteringMetadata::kValueNotPresent = FLT_MAX;
+const long long Colour::kValueNotPresent = LLONG_MAX;
+const float Projection::kValueNotPresent = FLT_MAX;
+
+#ifdef MSC_COMPAT
+inline bool isnan(double val) { return !!_isnan(val); }
+inline bool isinf(double val) { return !_finite(val); }
+#else
+inline bool isnan(double val) { return std::isnan(val); }
+inline bool isinf(double val) { return std::isinf(val); }
+#endif  // MSC_COMPAT
+
+template <typename Type>
+Type* SafeArrayAlloc(unsigned long long num_elements,
+                     unsigned long long element_size) {
+  if (num_elements == 0 || element_size == 0)
+    return NULL;
+
+  const size_t kMaxAllocSize = 0x80000000;  // 2GiB
+  const unsigned long long num_bytes = num_elements * element_size;
+  if (element_size > (kMaxAllocSize / num_elements))
+    return NULL;
+  if (num_bytes != static_cast<size_t>(num_bytes))
+    return NULL;
+
+  return new (std::nothrow) Type[static_cast<size_t>(num_bytes)];
+}
+
+void GetVersion(int& major, int& minor, int& build, int& revision) {
+  major = 1;
+  minor = 0;
+  build = 0;
+  revision = 30;
+}
+
+long long ReadUInt(IMkvReader* pReader, long long pos, long& len) {
+  if (!pReader || pos < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  len = 1;
+  unsigned char b;
+  int status = pReader->Read(pos, 1, &b);
+
+  if (status < 0)  // error or underflow
+    return status;
+
+  if (status > 0)  // interpreted as "underflow"
+    return E_BUFFER_NOT_FULL;
+
+  if (b == 0)  // we can't handle u-int values larger than 8 bytes
+    return E_FILE_FORMAT_INVALID;
+
+  unsigned char m = 0x80;
+
+  while (!(b & m)) {
+    m >>= 1;
+    ++len;
+  }
+
+  long long result = b & (~m);
+  ++pos;
+
+  for (int i = 1; i < len; ++i) {
+    status = pReader->Read(pos, 1, &b);
+
+    if (status < 0) {
+      len = 1;
+      return status;
+    }
+
+    if (status > 0) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result <<= 8;
+    result |= b;
+
+    ++pos;
+  }
+
+  return result;
+}
+
+// Reads an EBML ID and returns it.
+// An ID must be at least 1 byte long, cannot exceed 4, and its value must be
+// greater than 0.
+// See known EBML values and EBMLMaxIDLength:
+// http://www.matroska.org/technical/specs/index.html
+// Returns the ID, or a value less than 0 to report an error while reading the
+// ID.
+long long ReadID(IMkvReader* pReader, long long pos, long& len) {
+  if (pReader == NULL || pos < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  // Read the first byte. The length in bytes of the ID is determined by
+  // finding the first set bit in the first byte of the ID.
+  unsigned char temp_byte = 0;
+  int read_status = pReader->Read(pos, 1, &temp_byte);
+
+  if (read_status < 0)
+    return E_FILE_FORMAT_INVALID;
+  else if (read_status > 0)  // No data to read.
+    return E_BUFFER_NOT_FULL;
+
+  if (temp_byte == 0)  // ID length > 8 bytes; invalid file.
+    return E_FILE_FORMAT_INVALID;
+
+  int bit_pos = 0;
+  const int kMaxIdLengthInBytes = 4;
+  const int kCheckByte = 0x80;
+
+  // Find the first bit that's set.
+  bool found_bit = false;
+  for (; bit_pos < kMaxIdLengthInBytes; ++bit_pos) {
+    if ((kCheckByte >> bit_pos) & temp_byte) {
+      found_bit = true;
+      break;
+    }
+  }
+
+  if (!found_bit) {
+    // The value is too large to be a valid ID.
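+    // (Matroska/WebM caps element IDs at kMaxIdLengthInBytes == 4; a first
+    // byte with no marker bit in its top four positions would imply a
+    // longer ID.)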
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  // Read the remaining bytes of the ID (if any).
+  const int id_length = bit_pos + 1;
+  long long ebml_id = temp_byte;
+  for (int i = 1; i < id_length; ++i) {
+    ebml_id <<= 8;
+    read_status = pReader->Read(pos + i, 1, &temp_byte);
+
+    if (read_status < 0)
+      return E_FILE_FORMAT_INVALID;
+    else if (read_status > 0)
+      return E_BUFFER_NOT_FULL;
+
+    ebml_id |= temp_byte;
+  }
+
+  len = id_length;
+  return ebml_id;
+}
+
+long long GetUIntLength(IMkvReader* pReader, long long pos, long& len) {
+  if (!pReader || pos < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  long long total, available;
+
+  int status = pReader->Length(&total, &available);
+  if (status < 0 || (total >= 0 && available > total))
+    return E_FILE_FORMAT_INVALID;
+
+  len = 1;
+
+  if (pos >= available)
+    return pos;  // too few bytes available
+
+  unsigned char b;
+
+  status = pReader->Read(pos, 1, &b);
+
+  if (status != 0)
+    return status;
+
+  if (b == 0)  // we can't handle u-int values larger than 8 bytes
+    return E_FILE_FORMAT_INVALID;
+
+  unsigned char m = 0x80;
+
+  while (!(b & m)) {
+    m >>= 1;
+    ++len;
+  }
+
+  return 0;  // success
+}
+
+// TODO(vigneshv): This function assumes that unsigned values never have their
+// high bit set.
+long long UnserializeUInt(IMkvReader* pReader, long long pos, long long size) {
+  if (!pReader || pos < 0 || (size <= 0) || (size > 8))
+    return E_FILE_FORMAT_INVALID;
+
+  long long result = 0;
+
+  for (long long i = 0; i < size; ++i) {
+    unsigned char b;
+
+    const long status = pReader->Read(pos, 1, &b);
+
+    if (status < 0)
+      return status;
+
+    result <<= 8;
+    result |= b;
+
+    ++pos;
+  }
+
+  return result;
+}
+
+long UnserializeFloat(IMkvReader* pReader, long long pos, long long size_,
+                      double& result) {
+  if (!pReader || pos < 0 || ((size_ != 4) && (size_ != 8)))
+    return E_FILE_FORMAT_INVALID;
+
+  const long size = static_cast<long>(size_);
+
+  unsigned char buf[8];
+
+  const int status = pReader->Read(pos, size, buf);
+
+  if (status < 0)  // error
+    return status;
+
+  if (size == 4) {
+    union {
+      float f;
+      unsigned long ff;
+    };
+
+    ff = 0;
+
+    for (int i = 0;;) {
+      ff |= buf[i];
+
+      if (++i >= 4)
+        break;
+
+      ff <<= 8;
+    }
+
+    result = f;
+  } else {
+    union {
+      double d;
+      unsigned long long dd;
+    };
+
+    dd = 0;
+
+    for (int i = 0;;) {
+      dd |= buf[i];
+
+      if (++i >= 8)
+        break;
+
+      dd <<= 8;
+    }
+
+    result = d;
+  }
+
+  if (mkvparser::isinf(result) || mkvparser::isnan(result))
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+long UnserializeInt(IMkvReader* pReader, long long pos, long long size,
+                    long long& result_ref) {
+  if (!pReader || pos < 0 || size < 1 || size > 8)
+    return E_FILE_FORMAT_INVALID;
+
+  signed char first_byte = 0;
+  const long status = pReader->Read(pos, 1, (unsigned char*)&first_byte);
+
+  if (status < 0)
+    return status;
+
+  unsigned long long result = first_byte;
+  ++pos;
+
+  for (long i = 1; i < size; ++i) {
+    unsigned char b;
+
+    const long status = pReader->Read(pos, 1, &b);
+
+    if (status < 0)
+      return status;
+
+    result <<= 8;
+    result |= b;
+
+    ++pos;
+  }
+
+  result_ref = static_cast<long long>(result);
+  return 0;
+}
+
+long UnserializeString(IMkvReader* pReader, long long pos, long long size,
+                       char*& str) {
+  delete[] str;
+  str = NULL;
+
+  if (size >= LONG_MAX || size < 0 || size > kStringElementSizeLimit)
+    return E_FILE_FORMAT_INVALID;
+
+  // +1 for '\0' terminator
+  const long required_size = static_cast<long>(size) + 1;
+
+  str = SafeArrayAlloc<char>(1, required_size);
+  if (str == NULL)
+    return E_FILE_FORMAT_INVALID;
+
+  unsigned char* const buf = reinterpret_cast<unsigned char*>(str);
+
+  const long status = pReader->Read(pos, static_cast<long>(size), buf);
+
+  if (status) {
+    delete[] str;
+    str = NULL;
+
+    return status;
+  }
+
+  str[required_size - 1] = '\0';
+  return 0;
+}
+
+long ParseElementHeader(IMkvReader* pReader, long long& pos, long long stop,
+                        long long& id, long long& size) {
+  if (stop >= 0 && pos >= stop)
+    return E_FILE_FORMAT_INVALID;
+
+  long len;
+
+  id = ReadID(pReader, pos, len);
+
+  if (id < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume id
+
+  if (stop >= 0 && pos >= stop)
+    return E_FILE_FORMAT_INVALID;
+
+  size = ReadUInt(pReader, pos, len);
+
+  if (size < 0 || len < 1 || len > 8) {
+    // Invalid: Negative payload size, negative or 0 length integer, or integer
+    // larger than 64 bits (libwebm cannot handle them).
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  // Avoid rolling over pos when very close to LLONG_MAX.
+  const unsigned long long rollover_check =
+      static_cast<unsigned long long>(pos) + len;
+  if (rollover_check > LLONG_MAX)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume length of size
+
+  // pos now designates payload
+
+  if (stop >= 0 && pos > stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;  // success
+}
+
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+           long long& val) {
+  if (!pReader || pos < 0)
+    return false;
+
+  long long total = 0;
+  long long available = 0;
+
+  const long status = pReader->Length(&total, &available);
+  if (status < 0 || (total >= 0 && available > total))
+    return false;
+
+  long len = 0;
+
+  const long long id = ReadID(pReader, pos, len);
+  if (id < 0 || (available - pos) > len)
+    return false;
+
+  if (static_cast<unsigned long>(id) != expected_id)
+    return false;
+
+  pos += len;  // consume id
+
+  const long long size = ReadUInt(pReader, pos, len);
+  if (size < 0 || size > 8 || len < 1 || len > 8 || (available - pos) > len)
+    return false;
+
+  pos += len;  // consume length of size of payload
+
+  val = UnserializeUInt(pReader, pos, size);
+  if (val < 0)
+    return false;
+
+  pos += size;  // consume size of payload
+
+  return true;
+}
+
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+           unsigned char*& buf, size_t& buflen) {
+  if (!pReader || pos < 0)
+    return false;
+
+  long long total = 0;
+  long long available = 0;
+
+  long status = pReader->Length(&total, &available);
+  if (status < 0 || (total >= 0 && available > total))
+    return false;
+
+  long len = 0;
+  const long long id = ReadID(pReader, pos, len);
+  if (id < 0 || (available - pos) > len)
+    return false;
+
+  if (static_cast<unsigned long>(id) != expected_id)
+    return false;
+
+  pos += len;  // consume id
+
+  const long long size = ReadUInt(pReader, pos, len);
+  if (size < 0 || len <= 0 || len > 8 || (available - pos) > len)
+    return false;
+
+  unsigned long long rollover_check =
+      static_cast<unsigned long long>(pos) + len;
+  if (rollover_check > LLONG_MAX)
+    return false;
+
+  pos += len;  // consume length of size of payload
+
+  rollover_check = static_cast<unsigned long long>(pos) + size;
+  if (rollover_check > LLONG_MAX)
+    return false;
+
+  if ((pos + size) > available)
+    return false;
+
+  if (size >= LONG_MAX)
+    return false;
+
+  const long buflen_ = static_cast<long>(size);
+
+  buf = SafeArrayAlloc<unsigned char>(1, buflen_);
+  if (!buf)
+    return false;
+
+  status = pReader->Read(pos, buflen_, buf);
+  if (status != 0)
+    return false;
+
+  buflen = buflen_;
+
+  pos += size;  // consume size of payload
+  return true;
+}
+
+EBMLHeader::EBMLHeader() : m_docType(NULL) { Init(); }
+
+EBMLHeader::~EBMLHeader() { delete[] m_docType; }
+
+void EBMLHeader::Init() {
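+  // Reset to the library defaults: EBML version 1, IDs of at most 4 bytes,
+  // sizes of at most 8 bytes, and no DocType.
+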
m_version = 1; + m_readVersion = 1; + m_maxIdLength = 4; + m_maxSizeLength = 8; + + if (m_docType) { + delete[] m_docType; + m_docType = NULL; + } + + m_docTypeVersion = 1; + m_docTypeReadVersion = 1; +} + +long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) { + if (!pReader) + return E_FILE_FORMAT_INVALID; + + long long total, available; + + long status = pReader->Length(&total, &available); + + if (status < 0) // error + return status; + + pos = 0; + + // Scan until we find what looks like the first byte of the EBML header. + const long long kMaxScanBytes = (available >= 1024) ? 1024 : available; + const unsigned char kEbmlByte0 = 0x1A; + unsigned char scan_byte = 0; + + while (pos < kMaxScanBytes) { + status = pReader->Read(pos, 1, &scan_byte); + + if (status < 0) // error + return status; + else if (status > 0) + return E_BUFFER_NOT_FULL; + + if (scan_byte == kEbmlByte0) + break; + + ++pos; + } + + long len = 0; + const long long ebml_id = ReadID(pReader, pos, len); + + if (ebml_id == E_BUFFER_NOT_FULL) + return E_BUFFER_NOT_FULL; + + if (len != 4 || ebml_id != libwebm::kMkvEBML) + return E_FILE_FORMAT_INVALID; + + // Move read pos forward to the EBML header size field. + pos += 4; + + // Read length of size field. + long long result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return E_FILE_FORMAT_INVALID; + else if (result > 0) // need more data + return E_BUFFER_NOT_FULL; + + if (len < 1 || len > 8) + return E_FILE_FORMAT_INVALID; + + if ((total >= 0) && ((total - pos) < len)) + return E_FILE_FORMAT_INVALID; + + if ((available - pos) < len) + return pos + len; // try again later + + // Read the EBML header size. + result = ReadUInt(pReader, pos, len); + + if (result < 0) // error + return result; + + pos += len; // consume size field + + // pos now designates start of payload + + if ((total >= 0) && ((total - pos) < result)) + return E_FILE_FORMAT_INVALID; + + if ((available - pos) < result) + return pos + result; + + const long long end = pos + result; + + Init(); + + while (pos < end) { + long long id, size; + + status = ParseElementHeader(pReader, pos, end, id, size); + + if (status < 0) // error + return status; + + if (size == 0) + return E_FILE_FORMAT_INVALID; + + if (id == libwebm::kMkvEBMLVersion) { + m_version = UnserializeUInt(pReader, pos, size); + + if (m_version <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvEBMLReadVersion) { + m_readVersion = UnserializeUInt(pReader, pos, size); + + if (m_readVersion <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvEBMLMaxIDLength) { + m_maxIdLength = UnserializeUInt(pReader, pos, size); + + if (m_maxIdLength <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvEBMLMaxSizeLength) { + m_maxSizeLength = UnserializeUInt(pReader, pos, size); + + if (m_maxSizeLength <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvDocType) { + if (m_docType) + return E_FILE_FORMAT_INVALID; + + status = UnserializeString(pReader, pos, size, m_docType); + + if (status) // error + return status; + } else if (id == libwebm::kMkvDocTypeVersion) { + m_docTypeVersion = UnserializeUInt(pReader, pos, size); + + if (m_docTypeVersion <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvDocTypeReadVersion) { + m_docTypeReadVersion = UnserializeUInt(pReader, pos, size); + + if (m_docTypeReadVersion <= 0) + return E_FILE_FORMAT_INVALID; + } + + pos += size; + } + + if (pos != end) + return E_FILE_FORMAT_INVALID; + + // Make sure DocType, 
DocTypeReadVersion, and DocTypeVersion are valid.
+  if (m_docType == NULL || m_docTypeReadVersion <= 0 || m_docTypeVersion <= 0)
+    return E_FILE_FORMAT_INVALID;
+
+  // Make sure EBMLMaxIDLength and EBMLMaxSizeLength are valid.
+  if (m_maxIdLength <= 0 || m_maxIdLength > 4 || m_maxSizeLength <= 0 ||
+      m_maxSizeLength > 8)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+Segment::Segment(IMkvReader* pReader, long long elem_start,
+                 // long long elem_size,
+                 long long start, long long size)
+    : m_pReader(pReader),
+      m_element_start(elem_start),
+      // m_element_size(elem_size),
+      m_start(start),
+      m_size(size),
+      m_pos(start),
+      m_pUnknownSize(0),
+      m_pSeekHead(NULL),
+      m_pInfo(NULL),
+      m_pTracks(NULL),
+      m_pCues(NULL),
+      m_pChapters(NULL),
+      m_pTags(NULL),
+      m_clusters(NULL),
+      m_clusterCount(0),
+      m_clusterPreloadCount(0),
+      m_clusterSize(0) {}
+
+Segment::~Segment() {
+  const long count = m_clusterCount + m_clusterPreloadCount;
+
+  Cluster** i = m_clusters;
+  Cluster** j = m_clusters + count;
+
+  while (i != j) {
+    Cluster* const p = *i++;
+    delete p;
+  }
+
+  delete[] m_clusters;
+
+  delete m_pTracks;
+  delete m_pInfo;
+  delete m_pCues;
+  delete m_pChapters;
+  delete m_pTags;
+  delete m_pSeekHead;
+}
+
+long long Segment::CreateInstance(IMkvReader* pReader, long long pos,
+                                  Segment*& pSegment) {
+  if (pReader == NULL || pos < 0)
+    return E_PARSE_FAILED;
+
+  pSegment = NULL;
+
+  long long total, available;
+
+  const long status = pReader->Length(&total, &available);
+
+  if (status < 0)  // error
+    return status;
+
+  if (available < 0)
+    return -1;
+
+  if ((total >= 0) && (available > total))
+    return -1;
+
+  // I would assume that in practice this loop would execute
+  // exactly once, but we allow for other elements (e.g. Void)
+  // to immediately follow the EBML header. This is fine for
+  // the source filter case (since the entire file is available),
+  // but in the splitter case over a network we should probably
+  // just give up early. We could for example decide only to
+  // execute this loop a maximum of, say, 10 times.
+  // TODO:
+  // There is an implied "give up early" by only parsing up
+  // to the available limit. We do do that, but only if the
+  // total file size is unknown. We could decide to always
+  // use what's available as our limit (irrespective of whether
+  // we happen to know the total file length). This would have
+  // as its sense "parse this much of the file before giving up",
+  // which is a slightly different sense from "try to parse up to
+  // 10 EBML elements before giving up".
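+  //
+  // Note on the "unknown size" checks in the loop below: an EBML size field
+  // whose value bits are all ones is reserved to mean "size unknown". For a
+  // size field of len bytes that reserved value is (1LL << (7 * len)) - 1;
+  // e.g. for len == 1 the coded byte 0xFF decodes to 127.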
+ + for (;;) { + if ((total >= 0) && (pos >= total)) + return E_FILE_FORMAT_INVALID; + + // Read ID + long len; + long long result = GetUIntLength(pReader, pos, len); + + if (result) // error, or too few available bytes + return result; + + if ((total >= 0) && ((pos + len) > total)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > available) + return pos + len; + + const long long idpos = pos; + const long long id = ReadID(pReader, pos, len); + + if (id < 0) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume ID + + // Read Size + + result = GetUIntLength(pReader, pos, len); + + if (result) // error, or too few available bytes + return result; + + if ((total >= 0) && ((pos + len) > total)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > available) + return pos + len; + + long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return size; + + pos += len; // consume length of size of element + + // Pos now points to start of payload + + // Handle "unknown size" for live streaming of webm files. + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (id == libwebm::kMkvSegment) { + if (size == unknown_size) + size = -1; + + else if (total < 0) + size = -1; + + else if ((pos + size) > total) + size = -1; + + pSegment = new (std::nothrow) Segment(pReader, idpos, pos, size); + if (pSegment == NULL) + return E_PARSE_FAILED; + + return 0; // success + } + + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; + + if ((total >= 0) && ((pos + size) > total)) + return E_FILE_FORMAT_INVALID; + + if ((pos + size) > available) + return pos + size; + + pos += size; // consume payload + } +} + +long long Segment::ParseHeaders() { + // Outermost (level 0) segment object has been constructed, + // and pos designates start of payload. We need to find the + // inner (level 1) elements. + long long total, available; + + const int status = m_pReader->Length(&total, &available); + + if (status < 0) // error + return status; + + if (total > 0 && available > total) + return E_FILE_FORMAT_INVALID; + + const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size; + + if ((segment_stop >= 0 && total >= 0 && segment_stop > total) || + (segment_stop >= 0 && m_pos > segment_stop)) { + return E_FILE_FORMAT_INVALID; + } + + for (;;) { + if ((total >= 0) && (m_pos >= total)) + break; + + if ((segment_stop >= 0) && (m_pos >= segment_stop)) + break; + + long long pos = m_pos; + const long long element_start = pos; + + // Avoid rolling over pos when very close to LLONG_MAX. + unsigned long long rollover_check = pos + 1ULL; + if (rollover_check > LLONG_MAX) + return E_FILE_FORMAT_INVALID; + + if ((pos + 1) > available) + return (pos + 1); + + long len; + long long result = GetUIntLength(m_pReader, pos, len); + + if (result < 0) // error + return result; + + if (result > 0) { + // MkvReader doesn't have enough data to satisfy this read attempt. + return (pos + 1); + } + + if ((segment_stop >= 0) && ((pos + len) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > available) + return pos + len; + + const long long idpos = pos; + const long long id = ReadID(m_pReader, idpos, len); + + if (id < 0) + return E_FILE_FORMAT_INVALID; + + if (id == libwebm::kMkvCluster) + break; + + pos += len; // consume ID + + if ((pos + 1) > available) + return (pos + 1); + + // Read Size + result = GetUIntLength(m_pReader, pos, len); + + if (result < 0) // error + return result; + + if (result > 0) { + // MkvReader doesn't have enough data to satisfy this read attempt. 
+      return (pos + 1);
+    }
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > available)
+      return pos + len;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0 || len < 1 || len > 8) {
+      // TODO(tomfinegan): ReadUInt should return an error when len is < 1 or
+      // len > 8 is true instead of checking this _everywhere_.
+      return size;
+    }
+
+    pos += len;  // consume length of size of element
+
+    // Avoid rolling over pos when very close to LLONG_MAX.
+    rollover_check = static_cast<unsigned long long>(pos) + size;
+    if (rollover_check > LLONG_MAX)
+      return E_FILE_FORMAT_INVALID;
+
+    const long long element_size = size + pos - element_start;
+
+    // Pos now points to start of payload
+
+    if ((segment_stop >= 0) && ((pos + size) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    // We read EBML elements either in total or nothing at all.
+
+    if ((pos + size) > available)
+      return pos + size;
+
+    if (id == libwebm::kMkvInfo) {
+      if (m_pInfo)
+        return E_FILE_FORMAT_INVALID;
+
+      m_pInfo = new (std::nothrow)
+          SegmentInfo(this, pos, size, element_start, element_size);
+
+      if (m_pInfo == NULL)
+        return -1;
+
+      const long status = m_pInfo->Parse();
+
+      if (status)
+        return status;
+    } else if (id == libwebm::kMkvTracks) {
+      if (m_pTracks)
+        return E_FILE_FORMAT_INVALID;
+
+      m_pTracks = new (std::nothrow)
+          Tracks(this, pos, size, element_start, element_size);
+
+      if (m_pTracks == NULL)
+        return -1;
+
+      const long status = m_pTracks->Parse();
+
+      if (status)
+        return status;
+    } else if (id == libwebm::kMkvCues) {
+      if (m_pCues == NULL) {
+        m_pCues = new (std::nothrow)
+            Cues(this, pos, size, element_start, element_size);
+
+        if (m_pCues == NULL)
+          return -1;
+      }
+    } else if (id == libwebm::kMkvSeekHead) {
+      if (m_pSeekHead == NULL) {
+        m_pSeekHead = new (std::nothrow)
+            SeekHead(this, pos, size, element_start, element_size);
+
+        if (m_pSeekHead == NULL)
+          return -1;
+
+        const long status = m_pSeekHead->Parse();
+
+        if (status)
+          return status;
+      }
+    } else if (id == libwebm::kMkvChapters) {
+      if (m_pChapters == NULL) {
+        m_pChapters = new (std::nothrow)
+            Chapters(this, pos, size, element_start, element_size);
+
+        if (m_pChapters == NULL)
+          return -1;
+
+        const long status = m_pChapters->Parse();
+
+        if (status)
+          return status;
+      }
+    } else if (id == libwebm::kMkvTags) {
+      if (m_pTags == NULL) {
+        m_pTags = new (std::nothrow)
+            Tags(this, pos, size, element_start, element_size);
+
+        if (m_pTags == NULL)
+          return -1;
+
+        const long status = m_pTags->Parse();
+
+        if (status)
+          return status;
+      }
+    }
+
+    m_pos = pos + size;  // consume payload
+  }
+
+  if (segment_stop >= 0 && m_pos > segment_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if (m_pInfo == NULL)  // TODO: liberalize this behavior
+    return E_FILE_FORMAT_INVALID;
+
+  if (m_pTracks == NULL)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;  // success
+}
+
+long Segment::LoadCluster(long long& pos, long& len) {
+  for (;;) {
+    const long result = DoLoadCluster(pos, len);
+
+    if (result <= 1)
+      return result;
+  }
+}
+
+long Segment::DoLoadCluster(long long& pos, long& len) {
+  if (m_pos < 0)
+    return DoLoadClusterUnknownSize(pos, len);
+
+  long long total, avail;
+
+  long status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  if (total >= 0 && avail > total)
+    return E_FILE_FORMAT_INVALID;
+
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  long long cluster_off = -1;  // offset relative to start of segment
+  long long cluster_size = -1;  // size of cluster payload
+
+  for (;;) {
+    if ((total >= 0) && (m_pos >= total))
+      return 1;  // no more clusters
+
+    if ((segment_stop >= 0) && (m_pos >= segment_stop))
+      return 1;  // no more clusters
+
+    pos = m_pos;
+
+    // Read ID
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long idpos = pos;
+    const long long id = ReadID(m_pReader, idpos, len);
+
+    if (id < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume ID
+
+    // Read Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    pos += len;  // consume length of size of element
+
+    // pos now points to start of payload
+
+    if (size == 0) {
+      // Missing element payload: move on.
+      m_pos = pos;
+      continue;
+    }
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if ((segment_stop >= 0) && (size != unknown_size) &&
+        ((pos + size) > segment_stop)) {
+      return E_FILE_FORMAT_INVALID;
+    }
+
+    if (id == libwebm::kMkvCues) {
+      if (size == unknown_size) {
+        // Cues element of unknown size: Not supported.
+        return E_FILE_FORMAT_INVALID;
+      }
+
+      if (m_pCues == NULL) {
+        const long long element_size = (pos - idpos) + size;
+
+        m_pCues = new (std::nothrow) Cues(this, pos, size, idpos, element_size);
+        if (m_pCues == NULL)
+          return -1;
+      }
+
+      m_pos = pos + size;  // consume payload
+      continue;
+    }
+
+    if (id != libwebm::kMkvCluster) {
+      // Besides the Segment, Libwebm allows only cluster elements of unknown
+      // size. Fail the parse upon encountering a non-cluster element reporting
+      // unknown size.
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;
+
+      m_pos = pos + size;  // consume payload
+      continue;
+    }
+
+    // We have a cluster.
+
+    cluster_off = idpos - m_start;  // relative pos
+
+    if (size != unknown_size)
+      cluster_size = size;
+
+    break;
+  }
+
+  if (cluster_off < 0) {
+    // No cluster, die.
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  long long pos_;
+  long len_;
+
+  status = Cluster::HasBlockEntries(this, cluster_off, pos_, len_);
+
+  if (status < 0) {  // error, or underflow
+    pos = pos_;
+    len = len_;
+
+    return status;
+  }
+
+  // status == 0 means "no block entries found"
+  // status > 0 means "found at least one block entry"
+
+  // TODO:
+  // The issue here is that the segment increments its own
+  // pos ptr past the most recent cluster parsed, and then
+  // starts from there to parse the next cluster. If we
+  // don't know the size of the current cluster, then we
+  // must parse its payload (as we do below), looking
+  // for the cluster (or cues) ID to terminate the parse.
+  // This isn't really what we want: rather, we really need
+  // a way to create the curr cluster object immediately.
+ // The pity is that cluster::parse can determine its own + // boundary, and we largely duplicate that same logic here. + // + // Maybe we need to get rid of our look-ahead preloading + // in source::parse??? + // + // As we're parsing the blocks in the curr cluster + //(in cluster::parse), we should have some way to signal + // to the segment that we have determined the boundary, + // so it can adjust its own segment::m_pos member. + // + // The problem is that we're asserting in asyncreadinit, + // because we adjust the pos down to the curr seek pos, + // and the resulting adjusted len is > 2GB. I'm suspicious + // that this is even correct, but even if it is, we can't + // be loading that much data in the cache anyway. + + const long idx = m_clusterCount; + + if (m_clusterPreloadCount > 0) { + if (idx >= m_clusterSize) + return E_FILE_FORMAT_INVALID; + + Cluster* const pCluster = m_clusters[idx]; + if (pCluster == NULL || pCluster->m_index >= 0) + return E_FILE_FORMAT_INVALID; + + const long long off = pCluster->GetPosition(); + if (off < 0) + return E_FILE_FORMAT_INVALID; + + if (off == cluster_off) { // preloaded already + if (status == 0) // no entries found + return E_FILE_FORMAT_INVALID; + + if (cluster_size >= 0) + pos += cluster_size; + else { + const long long element_size = pCluster->GetElementSize(); + + if (element_size <= 0) + return E_FILE_FORMAT_INVALID; // TODO: handle this case + + pos = pCluster->m_element_start + element_size; + } + + pCluster->m_index = idx; // move from preloaded to loaded + ++m_clusterCount; + --m_clusterPreloadCount; + + m_pos = pos; // consume payload + if (segment_stop >= 0 && m_pos > segment_stop) + return E_FILE_FORMAT_INVALID; + + return 0; // success + } + } + + if (status == 0) { // no entries found + if (cluster_size >= 0) + pos += cluster_size; + + if ((total >= 0) && (pos >= total)) { + m_pos = total; + return 1; // no more clusters + } + + if ((segment_stop >= 0) && (pos >= segment_stop)) { + m_pos = segment_stop; + return 1; // no more clusters + } + + m_pos = pos; + return 2; // try again + } + + // status > 0 means we have an entry + + Cluster* const pCluster = Cluster::Create(this, idx, cluster_off); + if (pCluster == NULL) + return -1; + + if (!AppendCluster(pCluster)) { + delete pCluster; + return -1; + } + + if (cluster_size >= 0) { + pos += cluster_size; + + m_pos = pos; + + if (segment_stop > 0 && m_pos > segment_stop) + return E_FILE_FORMAT_INVALID; + + return 0; + } + + m_pUnknownSize = pCluster; + m_pos = -pos; + + return 0; // partial success, since we have a new cluster + + // status == 0 means "no block entries found" + // pos designates start of payload + // m_pos has NOT been adjusted yet (in case we need to come back here) +} + +long Segment::DoLoadClusterUnknownSize(long long& pos, long& len) { + if (m_pos >= 0 || m_pUnknownSize == NULL) + return E_PARSE_FAILED; + + const long status = m_pUnknownSize->Parse(pos, len); + + if (status < 0) // error or underflow + return status; + + if (status == 0) // parsed a block + return 2; // continue parsing + + const long long start = m_pUnknownSize->m_element_start; + const long long size = m_pUnknownSize->GetElementSize(); + + if (size < 0) + return E_FILE_FORMAT_INVALID; + + pos = start + size; + m_pos = pos; + + m_pUnknownSize = 0; + + return 2; // continue parsing +} + +bool Segment::AppendCluster(Cluster* pCluster) { + if (pCluster == NULL || pCluster->m_index < 0) + return false; + + const long count = m_clusterCount + m_clusterPreloadCount; + + long& size = m_clusterSize; 
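+  // m_clusters stores the m_clusterCount loaded clusters first, followed by
+  // the m_clusterPreloadCount preloaded (not yet indexed) clusters.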
+ const long idx = pCluster->m_index; + + if (size < count || idx != m_clusterCount) + return false; + + if (count >= size) { + const long n = (size <= 0) ? 2048 : 2 * size; + + Cluster** const qq = new (std::nothrow) Cluster*[n]; + if (qq == NULL) + return false; + + Cluster** q = qq; + Cluster** p = m_clusters; + Cluster** const pp = p + count; + + while (p != pp) + *q++ = *p++; + + delete[] m_clusters; + + m_clusters = qq; + size = n; + } + + if (m_clusterPreloadCount > 0) { + Cluster** const p = m_clusters + m_clusterCount; + if (*p == NULL || (*p)->m_index >= 0) + return false; + + Cluster** q = p + m_clusterPreloadCount; + if (q >= (m_clusters + size)) + return false; + + for (;;) { + Cluster** const qq = q - 1; + if ((*qq)->m_index >= 0) + return false; + + *q = *qq; + q = qq; + + if (q == p) + break; + } + } + + m_clusters[idx] = pCluster; + ++m_clusterCount; + return true; +} + +bool Segment::PreloadCluster(Cluster* pCluster, ptrdiff_t idx) { + if (pCluster == NULL || pCluster->m_index >= 0 || idx < m_clusterCount) + return false; + + const long count = m_clusterCount + m_clusterPreloadCount; + + long& size = m_clusterSize; + if (size < count) + return false; + + if (count >= size) { + const long n = (size <= 0) ? 2048 : 2 * size; + + Cluster** const qq = new (std::nothrow) Cluster*[n]; + if (qq == NULL) + return false; + Cluster** q = qq; + + Cluster** p = m_clusters; + Cluster** const pp = p + count; + + while (p != pp) + *q++ = *p++; + + delete[] m_clusters; + + m_clusters = qq; + size = n; + } + + if (m_clusters == NULL) + return false; + + Cluster** const p = m_clusters + idx; + + Cluster** q = m_clusters + count; + if (q < p || q >= (m_clusters + size)) + return false; + + while (q > p) { + Cluster** const qq = q - 1; + + if ((*qq)->m_index >= 0) + return false; + + *q = *qq; + q = qq; + } + + m_clusters[idx] = pCluster; + ++m_clusterPreloadCount; + return true; +} + +long Segment::Load() { + if (m_clusters != NULL || m_clusterSize != 0 || m_clusterCount != 0) + return E_PARSE_FAILED; + + // Outermost (level 0) segment object has been constructed, + // and pos designates start of payload. We need to find the + // inner (level 1) elements. 
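+  //
+  // Load() is the eager path: it parses all the level-1 headers up front and
+  // then calls LoadCluster() repeatedly until every cluster is consumed.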
+
+  const long long header_status = ParseHeaders();
+
+  if (header_status < 0)  // error
+    return static_cast<long>(header_status);
+
+  if (header_status > 0)  // underflow
+    return E_BUFFER_NOT_FULL;
+
+  if (m_pInfo == NULL || m_pTracks == NULL)
+    return E_FILE_FORMAT_INVALID;
+
+  for (;;) {
+    const long status = LoadCluster();
+
+    if (status < 0)  // error
+      return status;
+
+    if (status >= 1)  // no more clusters
+      return 0;
+  }
+}
+
+SeekHead::Entry::Entry() : id(0), pos(0), element_start(0), element_size(0) {}
+
+SeekHead::SeekHead(Segment* pSegment, long long start, long long size_,
+                   long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_start(start),
+      m_size(size_),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_entries(0),
+      m_entry_count(0),
+      m_void_elements(0),
+      m_void_element_count(0) {}
+
+SeekHead::~SeekHead() {
+  delete[] m_entries;
+  delete[] m_void_elements;
+}
+
+long SeekHead::Parse() {
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long pos = m_start;
+  const long long stop = m_start + m_size;
+
+  // first count the seek head entries
+
+  int entry_count = 0;
+  int void_element_count = 0;
+
+  while (pos < stop) {
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (id == libwebm::kMkvSeek)
+      ++entry_count;
+    else if (id == libwebm::kMkvVoid)
+      ++void_element_count;
+
+    pos += size;  // consume payload
+
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if (entry_count > 0) {
+    m_entries = new (std::nothrow) Entry[entry_count];
+
+    if (m_entries == NULL)
+      return -1;
+  }
+
+  if (void_element_count > 0) {
+    m_void_elements = new (std::nothrow) VoidElement[void_element_count];
+
+    if (m_void_elements == NULL)
+      return -1;
+  }
+
+  // now parse the entries and void elements
+
+  Entry* pEntry = m_entries;
+  VoidElement* pVoidElement = m_void_elements;
+
+  pos = m_start;
+
+  while (pos < stop) {
+    const long long idpos = pos;
+
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (id == libwebm::kMkvSeek && entry_count > 0) {
+      if (ParseEntry(pReader, pos, size, pEntry)) {
+        Entry& e = *pEntry++;
+
+        e.element_start = idpos;
+        e.element_size = (pos + size) - idpos;
+      }
+    } else if (id == libwebm::kMkvVoid && void_element_count > 0) {
+      VoidElement& e = *pVoidElement++;
+
+      e.element_start = idpos;
+      e.element_size = (pos + size) - idpos;
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries);
+  assert(count_ >= 0);
+  assert(count_ <= entry_count);
+
+  m_entry_count = static_cast<int>(count_);
+
+  count_ = ptrdiff_t(pVoidElement - m_void_elements);
+  assert(count_ >= 0);
+  assert(count_ <= void_element_count);
+
+  m_void_element_count = static_cast<int>(count_);
+
+  return 0;
+}
+
+int SeekHead::GetCount() const { return m_entry_count; }
+
+const SeekHead::Entry* SeekHead::GetEntry(int idx) const {
+  if (idx < 0)
+    return 0;
+
+  if (idx >= m_entry_count)
+    return 0;
+
+  return m_entries + idx;
+}
+
+int SeekHead::GetVoidElementCount() const { return m_void_element_count; }
+
+const SeekHead::VoidElement* SeekHead::GetVoidElement(int idx) const {
+  if (idx < 0)
+    return 0;
+
+  if (idx >= m_void_element_count)
+    return 0;
+
+  return m_void_elements + idx;
+}
+
+long Segment::ParseCues(long long off, long long& pos, long& len) {
+  if (m_pCues)
+    return 0;  // success
+
+  if (off < 0)
+    return -1;
+
+  long long total, avail;
+
+  const int status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  pos = m_start + off;
+
+  if ((total < 0) || (pos >= total))
+    return 1;  // don't bother parsing cues
+
+  const long long element_start = pos;
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  long long result = GetUIntLength(m_pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)  // underflow (weird)
+  {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long idpos = pos;
+
+  const long long id = ReadID(m_pReader, idpos, len);
+
+  if (id != libwebm::kMkvCues)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume ID
+  assert((segment_stop < 0) || (pos <= segment_stop));
+
+  // Read Size
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  result = GetUIntLength(m_pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)  // underflow (weird)
+  {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long size = ReadUInt(m_pReader, pos, len);
+
+  if (size < 0)  // error
+    return static_cast<long>(size);
+
+  if (size == 0)  // weird, although technically not illegal
+    return 1;  // done
+
+  pos += len;  // consume length of size of element
+  assert((segment_stop < 0) || (pos <= segment_stop));
+
+  // Pos now points to start of payload
+
+  const long long element_stop = pos + size;
+
+  if ((segment_stop >= 0) && (element_stop > segment_stop))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((total >= 0) && (element_stop > total))
+    return 1;  // don't bother parsing anymore
+
+  len = static_cast<long>(size);
+
+  if (element_stop > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long element_size = element_stop - element_start;
+
+  m_pCues =
+      new (std::nothrow) Cues(this, pos, size, element_start, element_size);
+  if (m_pCues == NULL)
+    return -1;
+
+  return 0;  // success
+}
+
+bool SeekHead::ParseEntry(IMkvReader* pReader, long long start, long long size_,
+                          Entry* pEntry) {
+  if (size_ <= 0)
+    return false;
+
+  long long pos = start;
+  const long long stop = start + size_;
+
+  long len;
+
+  // parse the container for the level-1 element ID
+
+  const long long seekIdId = ReadID(pReader, pos, len);
+  if (seekIdId < 0)
+    return false;
+
+  if (seekIdId != libwebm::kMkvSeekID)
+    return false;
+
+  if ((pos + len) > stop)
+    return false;
+
+  pos += len;  // consume SeekID id
+
+  const long long seekIdSize = ReadUInt(pReader, pos, len);
+
+  if (seekIdSize <= 0)
+    return false;
+
+  if ((pos + len) > stop)
+    return false;
+
+  pos += len;  // consume size of field
+
+  if ((pos + seekIdSize) > stop)
+    return false;
+
+  pEntry->id = ReadID(pReader, pos, len);  // payload
+
+  if (pEntry->id <= 0)
+    return false;
+
+  if (len != seekIdSize)
+    return false;
+
+  pos += seekIdSize;  // consume SeekID payload
+
+  const long long seekPosId = ReadID(pReader, pos, len);
+
+  if (seekPosId !=
libwebm::kMkvSeekPosition) + return false; + + if ((pos + len) > stop) + return false; + + pos += len; // consume id + + const long long seekPosSize = ReadUInt(pReader, pos, len); + + if (seekPosSize <= 0) + return false; + + if ((pos + len) > stop) + return false; + + pos += len; // consume size + + if ((pos + seekPosSize) > stop) + return false; + + pEntry->pos = UnserializeUInt(pReader, pos, seekPosSize); + + if (pEntry->pos < 0) + return false; + + pos += seekPosSize; // consume payload + + if (pos != stop) + return false; + + return true; +} + +Cues::Cues(Segment* pSegment, long long start_, long long size_, + long long element_start, long long element_size) + : m_pSegment(pSegment), + m_start(start_), + m_size(size_), + m_element_start(element_start), + m_element_size(element_size), + m_cue_points(NULL), + m_count(0), + m_preload_count(0), + m_pos(start_) {} + +Cues::~Cues() { + const long n = m_count + m_preload_count; + + CuePoint** p = m_cue_points; + CuePoint** const q = p + n; + + while (p != q) { + CuePoint* const pCP = *p++; + assert(pCP); + + delete pCP; + } + + delete[] m_cue_points; +} + +long Cues::GetCount() const { + if (m_cue_points == NULL) + return -1; + + return m_count; // TODO: really ignore preload count? +} + +bool Cues::DoneParsing() const { + const long long stop = m_start + m_size; + return (m_pos >= stop); +} + +bool Cues::Init() const { + if (m_cue_points) + return true; + + if (m_count != 0 || m_preload_count != 0) + return false; + + IMkvReader* const pReader = m_pSegment->m_pReader; + + const long long stop = m_start + m_size; + long long pos = m_start; + + long cue_points_size = 0; + + while (pos < stop) { + const long long idpos = pos; + + long len; + + const long long id = ReadID(pReader, pos, len); + if (id < 0 || (pos + len) > stop) { + return false; + } + + pos += len; // consume ID + + const long long size = ReadUInt(pReader, pos, len); + if (size < 0 || (pos + len > stop)) { + return false; + } + + pos += len; // consume Size field + if (pos + size > stop) { + return false; + } + + if (id == libwebm::kMkvCuePoint) { + if (!PreloadCuePoint(cue_points_size, idpos)) + return false; + } + + pos += size; // skip payload + } + return true; +} + +bool Cues::PreloadCuePoint(long& cue_points_size, long long pos) const { + if (m_count != 0) + return false; + + if (m_preload_count >= cue_points_size) { + const long n = (cue_points_size <= 0) ? 
2048 : 2 * cue_points_size; + + CuePoint** const qq = new (std::nothrow) CuePoint*[n]; + if (qq == NULL) + return false; + + CuePoint** q = qq; // beginning of target + + CuePoint** p = m_cue_points; // beginning of source + CuePoint** const pp = p + m_preload_count; // end of source + + while (p != pp) + *q++ = *p++; + + delete[] m_cue_points; + + m_cue_points = qq; + cue_points_size = n; + } + + CuePoint* const pCP = new (std::nothrow) CuePoint(m_preload_count, pos); + if (pCP == NULL) + return false; + + m_cue_points[m_preload_count++] = pCP; + return true; +} + +bool Cues::LoadCuePoint() const { + const long long stop = m_start + m_size; + + if (m_pos >= stop) + return false; // nothing else to do + + if (!Init()) { + m_pos = stop; + return false; + } + + IMkvReader* const pReader = m_pSegment->m_pReader; + + while (m_pos < stop) { + const long long idpos = m_pos; + + long len; + + const long long id = ReadID(pReader, m_pos, len); + if (id < 0 || (m_pos + len) > stop) + return false; + + m_pos += len; // consume ID + + const long long size = ReadUInt(pReader, m_pos, len); + if (size < 0 || (m_pos + len) > stop) + return false; + + m_pos += len; // consume Size field + if ((m_pos + size) > stop) + return false; + + if (id != libwebm::kMkvCuePoint) { + m_pos += size; // consume payload + if (m_pos > stop) + return false; + + continue; + } + + if (m_preload_count < 1) + return false; + + CuePoint* const pCP = m_cue_points[m_count]; + if (!pCP || (pCP->GetTimeCode() < 0 && (-pCP->GetTimeCode() != idpos))) + return false; + + if (!pCP->Load(pReader)) { + m_pos = stop; + return false; + } + ++m_count; + --m_preload_count; + + m_pos += size; // consume payload + if (m_pos > stop) + return false; + + return true; // yes, we loaded a cue point + } + + return false; // no, we did not load a cue point +} + +bool Cues::Find(long long time_ns, const Track* pTrack, const CuePoint*& pCP, + const CuePoint::TrackPosition*& pTP) const { + if (time_ns < 0 || pTrack == NULL || m_cue_points == NULL || m_count == 0) + return false; + + CuePoint** const ii = m_cue_points; + CuePoint** i = ii; + + CuePoint** const jj = ii + m_count; + CuePoint** j = jj; + + pCP = *i; + if (pCP == NULL) + return false; + + if (time_ns <= pCP->GetTime(m_pSegment)) { + pTP = pCP->Find(pTrack); + return (pTP != NULL); + } + + while (i < j) { + // INVARIANT: + //[ii, i) <= time_ns + //[i, j) ? + //[j, jj) > time_ns + + CuePoint** const k = i + (j - i) / 2; + if (k >= jj) + return false; + + CuePoint* const pCP = *k; + if (pCP == NULL) + return false; + + const long long t = pCP->GetTime(m_pSegment); + + if (t <= time_ns) + i = k + 1; + else + j = k; + + if (i > j) + return false; + } + + if (i != j || i > jj || i <= ii) + return false; + + pCP = *--i; + + if (pCP == NULL || pCP->GetTime(m_pSegment) > time_ns) + return false; + + // TODO: here and elsewhere, it's probably not correct to search + // for the cue point with this time, and then search for a matching + // track. In principle, the matching track could be on some earlier + // cue point, and with our current algorithm, we'd miss it. To make + // this bullet-proof, we'd need to create a secondary structure, + // with a list of cue points that apply to a track, and then search + // that track-based structure for a matching cue point. 
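+  //
+  // To illustrate the invariant above: with cue times {0, 40, 80} and
+  // time_ns == 50, the loop narrows [i, j) until i points just past the
+  // cue with time 40, so *--i (that cue) is the closest preceding cue point.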
+ + pTP = pCP->Find(pTrack); + return (pTP != NULL); +} + +const CuePoint* Cues::GetFirst() const { + if (m_cue_points == NULL || m_count == 0) + return NULL; + + CuePoint* const* const pp = m_cue_points; + if (pp == NULL) + return NULL; + + CuePoint* const pCP = pp[0]; + if (pCP == NULL || pCP->GetTimeCode() < 0) + return NULL; + + return pCP; +} + +const CuePoint* Cues::GetLast() const { + if (m_cue_points == NULL || m_count <= 0) + return NULL; + + const long index = m_count - 1; + + CuePoint* const* const pp = m_cue_points; + if (pp == NULL) + return NULL; + + CuePoint* const pCP = pp[index]; + if (pCP == NULL || pCP->GetTimeCode() < 0) + return NULL; + + return pCP; +} + +const CuePoint* Cues::GetNext(const CuePoint* pCurr) const { + if (pCurr == NULL || pCurr->GetTimeCode() < 0 || m_cue_points == NULL || + m_count < 1) { + return NULL; + } + + long index = pCurr->m_index; + if (index >= m_count) + return NULL; + + CuePoint* const* const pp = m_cue_points; + if (pp == NULL || pp[index] != pCurr) + return NULL; + + ++index; + + if (index >= m_count) + return NULL; + + CuePoint* const pNext = pp[index]; + + if (pNext == NULL || pNext->GetTimeCode() < 0) + return NULL; + + return pNext; +} + +const BlockEntry* Cues::GetBlock(const CuePoint* pCP, + const CuePoint::TrackPosition* pTP) const { + if (pCP == NULL || pTP == NULL) + return NULL; + + return m_pSegment->GetBlock(*pCP, *pTP); +} + +const BlockEntry* Segment::GetBlock(const CuePoint& cp, + const CuePoint::TrackPosition& tp) { + Cluster** const ii = m_clusters; + Cluster** i = ii; + + const long count = m_clusterCount + m_clusterPreloadCount; + + Cluster** const jj = ii + count; + Cluster** j = jj; + + while (i < j) { + // INVARIANT: + //[ii, i) < pTP->m_pos + //[i, j) ? + //[j, jj) > pTP->m_pos + + Cluster** const k = i + (j - i) / 2; + assert(k < jj); + + Cluster* const pCluster = *k; + assert(pCluster); + + // const long long pos_ = pCluster->m_pos; + // assert(pos_); + // const long long pos = pos_ * ((pos_ < 0) ? -1 : 1); + + const long long pos = pCluster->GetPosition(); + assert(pos >= 0); + + if (pos < tp.m_pos) + i = k + 1; + else if (pos > tp.m_pos) + j = k; + else + return pCluster->GetEntry(cp, tp); + } + + assert(i == j); + // assert(Cluster::HasBlockEntries(this, tp.m_pos)); + + Cluster* const pCluster = Cluster::Create(this, -1, tp.m_pos); //, -1); + if (pCluster == NULL) + return NULL; + + const ptrdiff_t idx = i - m_clusters; + + if (!PreloadCluster(pCluster, idx)) { + delete pCluster; + return NULL; + } + assert(m_clusters); + assert(m_clusterPreloadCount > 0); + assert(m_clusters[idx] == pCluster); + + return pCluster->GetEntry(cp, tp); +} + +const Cluster* Segment::FindOrPreloadCluster(long long requested_pos) { + if (requested_pos < 0) + return 0; + + Cluster** const ii = m_clusters; + Cluster** i = ii; + + const long count = m_clusterCount + m_clusterPreloadCount; + + Cluster** const jj = ii + count; + Cluster** j = jj; + + while (i < j) { + // INVARIANT: + //[ii, i) < pTP->m_pos + //[i, j) ? + //[j, jj) > pTP->m_pos + + Cluster** const k = i + (j - i) / 2; + assert(k < jj); + + Cluster* const pCluster = *k; + assert(pCluster); + + // const long long pos_ = pCluster->m_pos; + // assert(pos_); + // const long long pos = pos_ * ((pos_ < 0) ? 
-1 : 1); + + const long long pos = pCluster->GetPosition(); + assert(pos >= 0); + + if (pos < requested_pos) + i = k + 1; + else if (pos > requested_pos) + j = k; + else + return pCluster; + } + + assert(i == j); + // assert(Cluster::HasBlockEntries(this, tp.m_pos)); + + Cluster* const pCluster = Cluster::Create(this, -1, requested_pos); + if (pCluster == NULL) + return NULL; + + const ptrdiff_t idx = i - m_clusters; + + if (!PreloadCluster(pCluster, idx)) { + delete pCluster; + return NULL; + } + assert(m_clusters); + assert(m_clusterPreloadCount > 0); + assert(m_clusters[idx] == pCluster); + + return pCluster; +} + +CuePoint::CuePoint(long idx, long long pos) + : m_element_start(0), + m_element_size(0), + m_index(idx), + m_timecode(-1 * pos), + m_track_positions(NULL), + m_track_positions_count(0) { + assert(pos > 0); +} + +CuePoint::~CuePoint() { delete[] m_track_positions; } + +bool CuePoint::Load(IMkvReader* pReader) { + // odbgstream os; + // os << "CuePoint::Load(begin): timecode=" << m_timecode << endl; + + if (m_timecode >= 0) // already loaded + return true; + + assert(m_track_positions == NULL); + assert(m_track_positions_count == 0); + + long long pos_ = -m_timecode; + const long long element_start = pos_; + + long long stop; + + { + long len; + + const long long id = ReadID(pReader, pos_, len); + if (id != libwebm::kMkvCuePoint) + return false; + + pos_ += len; // consume ID + + const long long size = ReadUInt(pReader, pos_, len); + assert(size >= 0); + + pos_ += len; // consume Size field + // pos_ now points to start of payload + + stop = pos_ + size; + } + + const long long element_size = stop - element_start; + + long long pos = pos_; + + // First count number of track positions + + while (pos < stop) { + long len; + + const long long id = ReadID(pReader, pos, len); + if ((id < 0) || (pos + len > stop)) { + return false; + } + + pos += len; // consume ID + + const long long size = ReadUInt(pReader, pos, len); + if ((size < 0) || (pos + len > stop)) { + return false; + } + + pos += len; // consume Size field + if ((pos + size) > stop) { + return false; + } + + if (id == libwebm::kMkvCueTime) + m_timecode = UnserializeUInt(pReader, pos, size); + + else if (id == libwebm::kMkvCueTrackPositions) + ++m_track_positions_count; + + pos += size; // consume payload + } + + if (m_timecode < 0 || m_track_positions_count <= 0) { + return false; + } + + // os << "CuePoint::Load(cont'd): idpos=" << idpos + // << " timecode=" << m_timecode + // << endl; + + m_track_positions = new (std::nothrow) TrackPosition[m_track_positions_count]; + if (m_track_positions == NULL) + return false; + + // Now parse track positions + + TrackPosition* p = m_track_positions; + pos = pos_; + + while (pos < stop) { + long len; + + const long long id = ReadID(pReader, pos, len); + if (id < 0 || (pos + len) > stop) + return false; + + pos += len; // consume ID + + const long long size = ReadUInt(pReader, pos, len); + assert(size >= 0); + assert((pos + len) <= stop); + + pos += len; // consume Size field + assert((pos + size) <= stop); + + if (id == libwebm::kMkvCueTrackPositions) { + TrackPosition& tp = *p++; + if (!tp.Parse(pReader, pos, size)) { + return false; + } + } + + pos += size; // consume payload + if (pos > stop) + return false; + } + + assert(size_t(p - m_track_positions) == m_track_positions_count); + + m_element_start = element_start; + m_element_size = element_size; + + return true; +} + +bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_, + long long size_) { + const long 
long stop = start_ + size_; + long long pos = start_; + + m_track = -1; + m_pos = -1; + m_block = 1; // default + + while (pos < stop) { + long len; + + const long long id = ReadID(pReader, pos, len); + if ((id < 0) || ((pos + len) > stop)) { + return false; + } + + pos += len; // consume ID + + const long long size = ReadUInt(pReader, pos, len); + if ((size < 0) || ((pos + len) > stop)) { + return false; + } + + pos += len; // consume Size field + if ((pos + size) > stop) { + return false; + } + + if (id == libwebm::kMkvCueTrack) + m_track = UnserializeUInt(pReader, pos, size); + else if (id == libwebm::kMkvCueClusterPosition) + m_pos = UnserializeUInt(pReader, pos, size); + else if (id == libwebm::kMkvCueBlockNumber) + m_block = UnserializeUInt(pReader, pos, size); + + pos += size; // consume payload + } + + if ((m_pos < 0) || (m_track <= 0)) { + return false; + } + + return true; +} + +const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const { + if (pTrack == NULL) { + return NULL; + } + + const long long n = pTrack->GetNumber(); + + const TrackPosition* i = m_track_positions; + const TrackPosition* const j = i + m_track_positions_count; + + while (i != j) { + const TrackPosition& p = *i++; + + if (p.m_track == n) + return &p; + } + + return NULL; // no matching track number found +} + +long long CuePoint::GetTimeCode() const { return m_timecode; } + +long long CuePoint::GetTime(const Segment* pSegment) const { + assert(pSegment); + assert(m_timecode >= 0); + + const SegmentInfo* const pInfo = pSegment->GetInfo(); + assert(pInfo); + + const long long scale = pInfo->GetTimeCodeScale(); + assert(scale >= 1); + + const long long time = scale * m_timecode; + + return time; +} + +bool Segment::DoneParsing() const { + if (m_size < 0) { + long long total, avail; + + const int status = m_pReader->Length(&total, &avail); + + if (status < 0) // error + return true; // must assume done + + if (total < 0) + return false; // assume live stream + + return (m_pos >= total); + } + + const long long stop = m_start + m_size; + + return (m_pos >= stop); +} + +const Cluster* Segment::GetFirst() const { + if ((m_clusters == NULL) || (m_clusterCount <= 0)) + return &m_eos; + + Cluster* const pCluster = m_clusters[0]; + assert(pCluster); + + return pCluster; +} + +const Cluster* Segment::GetLast() const { + if ((m_clusters == NULL) || (m_clusterCount <= 0)) + return &m_eos; + + const long idx = m_clusterCount - 1; + + Cluster* const pCluster = m_clusters[idx]; + assert(pCluster); + + return pCluster; +} + +unsigned long Segment::GetCount() const { return m_clusterCount; } + +const Cluster* Segment::GetNext(const Cluster* pCurr) { + assert(pCurr); + assert(pCurr != &m_eos); + assert(m_clusters); + + long idx = pCurr->m_index; + + if (idx >= 0) { + assert(m_clusterCount > 0); + assert(idx < m_clusterCount); + assert(pCurr == m_clusters[idx]); + + ++idx; + + if (idx >= m_clusterCount) + return &m_eos; // caller will LoadCluster as desired + + Cluster* const pNext = m_clusters[idx]; + assert(pNext); + assert(pNext->m_index >= 0); + assert(pNext->m_index == idx); + + return pNext; + } + + assert(m_clusterPreloadCount > 0); + + long long pos = pCurr->m_element_start; + + assert(m_size >= 0); // TODO + const long long stop = m_start + m_size; // end of segment + + { + long len; + + long long result = GetUIntLength(m_pReader, pos, len); + assert(result == 0); + assert((pos + len) <= stop); // TODO + if (result != 0) + return NULL; + + const long long id = ReadID(m_pReader, pos, len); + if (id != 
libwebm::kMkvCluster) + return NULL; + + pos += len; // consume ID + + // Read Size + result = GetUIntLength(m_pReader, pos, len); + assert(result == 0); // TODO + assert((pos + len) <= stop); // TODO + + const long long size = ReadUInt(m_pReader, pos, len); + assert(size > 0); // TODO + // assert((pCurr->m_size <= 0) || (pCurr->m_size == size)); + + pos += len; // consume length of size of element + assert((pos + size) <= stop); // TODO + + // Pos now points to start of payload + + pos += size; // consume payload + } + + long long off_next = 0; + + while (pos < stop) { + long len; + + long long result = GetUIntLength(m_pReader, pos, len); + assert(result == 0); + assert((pos + len) <= stop); // TODO + if (result != 0) + return NULL; + + const long long idpos = pos; // pos of next (potential) cluster + + const long long id = ReadID(m_pReader, idpos, len); + if (id < 0) + return NULL; + + pos += len; // consume ID + + // Read Size + result = GetUIntLength(m_pReader, pos, len); + assert(result == 0); // TODO + assert((pos + len) <= stop); // TODO + + const long long size = ReadUInt(m_pReader, pos, len); + assert(size >= 0); // TODO + + pos += len; // consume length of size of element + assert((pos + size) <= stop); // TODO + + // Pos now points to start of payload + + if (size == 0) // weird + continue; + + if (id == libwebm::kMkvCluster) { + const long long off_next_ = idpos - m_start; + + long long pos_; + long len_; + + const long status = Cluster::HasBlockEntries(this, off_next_, pos_, len_); + + assert(status >= 0); + + if (status > 0) { + off_next = off_next_; + break; + } + } + + pos += size; // consume payload + } + + if (off_next <= 0) + return 0; + + Cluster** const ii = m_clusters + m_clusterCount; + Cluster** i = ii; + + Cluster** const jj = ii + m_clusterPreloadCount; + Cluster** j = jj; + + while (i < j) { + // INVARIANT: + //[0, i) < pos_next + //[i, j) ? + //[j, jj) > pos_next + + Cluster** const k = i + (j - i) / 2; + assert(k < jj); + + Cluster* const pNext = *k; + assert(pNext); + assert(pNext->m_index < 0); + + // const long long pos_ = pNext->m_pos; + // assert(pos_); + // pos = pos_ * ((pos_ < 0) ? 
-1 : 1); + + pos = pNext->GetPosition(); + + if (pos < off_next) + i = k + 1; + else if (pos > off_next) + j = k; + else + return pNext; + } + + assert(i == j); + + Cluster* const pNext = Cluster::Create(this, -1, off_next); + if (pNext == NULL) + return NULL; + + const ptrdiff_t idx_next = i - m_clusters; // insertion position + + if (!PreloadCluster(pNext, idx_next)) { + delete pNext; + return NULL; + } + assert(m_clusters); + assert(idx_next < m_clusterSize); + assert(m_clusters[idx_next] == pNext); + + return pNext; +} + +long Segment::ParseNext(const Cluster* pCurr, const Cluster*& pResult, + long long& pos, long& len) { + assert(pCurr); + assert(!pCurr->EOS()); + assert(m_clusters); + + pResult = 0; + + if (pCurr->m_index >= 0) { // loaded (not merely preloaded) + assert(m_clusters[pCurr->m_index] == pCurr); + + const long next_idx = pCurr->m_index + 1; + + if (next_idx < m_clusterCount) { + pResult = m_clusters[next_idx]; + return 0; // success + } + + // curr cluster is last among loaded + + const long result = LoadCluster(pos, len); + + if (result < 0) // error or underflow + return result; + + if (result > 0) // no more clusters + { + // pResult = &m_eos; + return 1; + } + + pResult = GetLast(); + return 0; // success + } + + assert(m_pos > 0); + + long long total, avail; + + long status = m_pReader->Length(&total, &avail); + + if (status < 0) // error + return status; + + assert((total < 0) || (avail <= total)); + + const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size; + + // interrogate curr cluster + + pos = pCurr->m_element_start; + + if (pCurr->m_element_size >= 0) + pos += pCurr->m_element_size; + else { + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(m_pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((segment_stop >= 0) && ((pos + len) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long id = ReadUInt(m_pReader, pos, len); + + if (id != libwebm::kMkvCluster) + return -1; + + pos += len; // consume ID + + // Read Size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(m_pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((segment_stop >= 0) && ((pos + len) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(m_pReader, pos, len); + + if (size < 0) // error + return static_cast<long>(size); + + pos += len; // consume size field + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size == unknown_size) // TODO: should never happen + return E_FILE_FORMAT_INVALID; // TODO: resolve this + + // assert((pCurr->m_size <= 0) || (pCurr->m_size == size)); + + if ((segment_stop >= 0) && ((pos + size) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + // Pos now points to start of payload + + pos += size; // consume payload (that is, the current cluster) + if (segment_stop >= 0 && pos > segment_stop) + return E_FILE_FORMAT_INVALID; + + // By consuming the payload, we are assuming that the curr + // cluster isn't interesting. That is, we don't bother checking + // whether the payload of the curr cluster is less than what + // happens to be available (obtained via IMkvReader::Length).
+ // Presumably the caller has already dispensed with the current + // cluster, and really does want the next cluster. + } + + // pos now points to just beyond the last fully-loaded cluster + + for (;;) { + const long status = DoParseNext(pResult, pos, len); + + if (status <= 1) + return status; + } +} + +long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) { + long long total, avail; + + long status = m_pReader->Length(&total, &avail); + + if (status < 0) // error + return status; + + assert((total < 0) || (avail <= total)); + + const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size; + + // Parse next cluster. This is strictly a parsing activity. + // Creation of a new cluster object happens later, after the + // parsing is done. + + long long off_next = 0; + long long cluster_size = -1; + + for (;;) { + if ((total >= 0) && (pos >= total)) + return 1; // EOF + + if ((segment_stop >= 0) && (pos >= segment_stop)) + return 1; // EOF + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(m_pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((segment_stop >= 0) && ((pos + len) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long idpos = pos; // absolute + const long long idoff = pos - m_start; // relative + + const long long id = ReadID(m_pReader, idpos, len); // absolute + + if (id < 0) // error + return static_cast<long>(id); + + if (id == 0) // weird + return -1; // generic error + + pos += len; // consume ID + + // Read Size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(m_pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((segment_stop >= 0) && ((pos + len) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(m_pReader, pos, len); + + if (size < 0) // error + return static_cast<long>(size); + + pos += len; // consume length of size of element + + // Pos now points to start of payload + + if (size == 0) // weird + continue; + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if ((segment_stop >= 0) && (size != unknown_size) && + ((pos + size) > segment_stop)) { + return E_FILE_FORMAT_INVALID; + } + + if (id == libwebm::kMkvCues) { + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; + + const long long element_stop = pos + size; + + if ((segment_stop >= 0) && (element_stop > segment_stop)) + return E_FILE_FORMAT_INVALID; + + const long long element_start = idpos; + const long long element_size = element_stop - element_start; + + if (m_pCues == NULL) { + m_pCues = new (std::nothrow) + Cues(this, pos, size, element_start, element_size); + if (m_pCues == NULL) + return false; + } + + pos += size; // consume payload + if (segment_stop >= 0 && pos > segment_stop) + return E_FILE_FORMAT_INVALID; + + continue; + } + + if (id != libwebm::kMkvCluster) { // not a Cluster ID + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; + + pos += size; // consume payload + if (segment_stop >= 0 && pos > segment_stop) + return E_FILE_FORMAT_INVALID; + + continue; + } + + // We have a cluster.
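+ + // Note: idoff (computed above as pos - m_start) is relative to the start + // of the segment payload, the same convention Cluster::GetPosition() + // uses; an absolute file position would be m_start + GetPosition().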
+ off_next = idoff; + + if (size != unknown_size) + cluster_size = size; + + break; + } + + assert(off_next > 0); // have cluster + + // We have parsed the next cluster. + // We have not created a cluster object yet. What we need + // to do now is determine whether it has already been preloaded + //(in which case, an object for this cluster has already been + // created), and if not, create a new cluster object. + + Cluster** const ii = m_clusters + m_clusterCount; + Cluster** i = ii; + + Cluster** const jj = ii + m_clusterPreloadCount; + Cluster** j = jj; + + while (i < j) { + // INVARIANT: + //[0, i) < pos_next + //[i, j) ? + //[j, jj) > pos_next + + Cluster** const k = i + (j - i) / 2; + assert(k < jj); + + const Cluster* const pNext = *k; + assert(pNext); + assert(pNext->m_index < 0); + + pos = pNext->GetPosition(); + assert(pos >= 0); + + if (pos < off_next) + i = k + 1; + else if (pos > off_next) + j = k; + else { + pResult = pNext; + return 0; // success + } + } + + assert(i == j); + + long long pos_; + long len_; + + status = Cluster::HasBlockEntries(this, off_next, pos_, len_); + + if (status < 0) { // error or underflow + pos = pos_; + len = len_; + + return status; + } + + if (status > 0) { // means "found at least one block entry" + Cluster* const pNext = Cluster::Create(this, + -1, // preloaded + off_next); + if (pNext == NULL) + return -1; + + const ptrdiff_t idx_next = i - m_clusters; // insertion position + + if (!PreloadCluster(pNext, idx_next)) { + delete pNext; + return -1; + } + assert(m_clusters); + assert(idx_next < m_clusterSize); + assert(m_clusters[idx_next] == pNext); + + pResult = pNext; + return 0; // success + } + + // status == 0 means "no block entries found" + + if (cluster_size < 0) { // unknown size + const long long payload_pos = pos; // absolute pos of cluster payload + + for (;;) { // determine cluster size + if ((total >= 0) && (pos >= total)) + break; + + if ((segment_stop >= 0) && (pos >= segment_stop)) + break; // no more clusters + + // Read ID + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(m_pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((segment_stop >= 0) && ((pos + len) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long idpos = pos; + const long long id = ReadID(m_pReader, idpos, len); + + if (id < 0) // error (or underflow) + return static_cast<long>(id); + + // This is the distinguished set of ID's we use to determine + // that we have exhausted the sub-elements inside the cluster + // whose ID we parsed earlier.
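+ // + // Worked example of the unknown-size convention handled here: a one-byte + // EBML size field of 0xFF decodes to VINT_DATA 0x7F, which equals + // (1LL << (7 * 1)) - 1, i.e. "unknown size". Such a cluster has no + // explicit end, so it is terminated by the first ID that cannot occur + // inside it -- hence the test against Cluster and Cues below.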
+ + if (id == libwebm::kMkvCluster || id == libwebm::kMkvCues) + break; + + pos += len; // consume ID (of sub-element) + + // Read Size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(m_pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((segment_stop >= 0) && ((pos + len) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(m_pReader, pos, len); + + if (size < 0) // error + return static_cast<long>(size); + + pos += len; // consume size field of element + + // pos now points to start of sub-element's payload + + if (size == 0) // weird + continue; + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; // not allowed for sub-elements + + if ((segment_stop >= 0) && ((pos + size) > segment_stop)) // weird + return E_FILE_FORMAT_INVALID; + + pos += size; // consume payload of sub-element + if (segment_stop >= 0 && pos > segment_stop) + return E_FILE_FORMAT_INVALID; + } // determine cluster size + + cluster_size = pos - payload_pos; + assert(cluster_size >= 0); // TODO: handle cluster_size = 0 + + pos = payload_pos; // reset and re-parse original cluster + } + + pos += cluster_size; // consume payload + if (segment_stop >= 0 && pos > segment_stop) + return E_FILE_FORMAT_INVALID; + + return 2; // try to find a cluster that follows next +} + +const Cluster* Segment::FindCluster(long long time_ns) const { + if ((m_clusters == NULL) || (m_clusterCount <= 0)) + return &m_eos; + + { + Cluster* const pCluster = m_clusters[0]; + assert(pCluster); + assert(pCluster->m_index == 0); + + if (time_ns <= pCluster->GetTime()) + return pCluster; + } + + // Binary search of cluster array + + long i = 0; + long j = m_clusterCount; + + while (i < j) { + // INVARIANT: + //[0, i) <= time_ns + //[i, j) ?
+ //[j, m_clusterCount) > time_ns + + const long k = i + (j - i) / 2; + assert(k < m_clusterCount); + + Cluster* const pCluster = m_clusters[k]; + assert(pCluster); + assert(pCluster->m_index == k); + + const long long t = pCluster->GetTime(); + + if (t <= time_ns) + i = k + 1; + else + j = k; + + assert(i <= j); + } + + assert(i == j); + assert(i > 0); + assert(i <= m_clusterCount); + + const long k = i - 1; + + Cluster* const pCluster = m_clusters[k]; + assert(pCluster); + assert(pCluster->m_index == k); + assert(pCluster->GetTime() <= time_ns); + + return pCluster; +} + +const Tracks* Segment::GetTracks() const { return m_pTracks; } +const SegmentInfo* Segment::GetInfo() const { return m_pInfo; } +const Cues* Segment::GetCues() const { return m_pCues; } +const Chapters* Segment::GetChapters() const { return m_pChapters; } +const Tags* Segment::GetTags() const { return m_pTags; } +const SeekHead* Segment::GetSeekHead() const { return m_pSeekHead; } + +long long Segment::GetDuration() const { + assert(m_pInfo); + return m_pInfo->GetDuration(); +} + +Chapters::Chapters(Segment* pSegment, long long payload_start, + long long payload_size, long long element_start, + long long element_size) + : m_pSegment(pSegment), + m_start(payload_start), + m_size(payload_size), + m_element_start(element_start), + m_element_size(element_size), + m_editions(NULL), + m_editions_size(0), + m_editions_count(0) {} + +Chapters::~Chapters() { + while (m_editions_count > 0) { + Edition& e = m_editions[--m_editions_count]; + e.Clear(); + } + delete[] m_editions; +} + +long Chapters::Parse() { + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long pos = m_start; // payload start + const long long stop = pos + m_size; // payload stop + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) // weird + continue; + + if (id == libwebm::kMkvEditionEntry) { + status = ParseEdition(pos, size); + + if (status < 0) // error + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +int Chapters::GetEditionCount() const { return m_editions_count; } + +const Chapters::Edition* Chapters::GetEdition(int idx) const { + if (idx < 0) + return NULL; + + if (idx >= m_editions_count) + return NULL; + + return m_editions + idx; +} + +bool Chapters::ExpandEditionsArray() { + if (m_editions_size > m_editions_count) + return true; // nothing else to do + + const int size = (m_editions_size == 0) ? 
1 : 2 * m_editions_size; + + Edition* const editions = new (std::nothrow) Edition[size]; + + if (editions == NULL) + return false; + + for (int idx = 0; idx < m_editions_count; ++idx) { + m_editions[idx].ShallowCopy(editions[idx]); + } + + delete[] m_editions; + m_editions = editions; + + m_editions_size = size; + return true; +} + +long Chapters::ParseEdition(long long pos, long long size) { + if (!ExpandEditionsArray()) + return -1; + + Edition& e = m_editions[m_editions_count++]; + e.Init(); + + return e.Parse(m_pSegment->m_pReader, pos, size); +} + +Chapters::Edition::Edition() {} + +Chapters::Edition::~Edition() {} + +int Chapters::Edition::GetAtomCount() const { return m_atoms_count; } + +const Chapters::Atom* Chapters::Edition::GetAtom(int index) const { + if (index < 0) + return NULL; + + if (index >= m_atoms_count) + return NULL; + + return m_atoms + index; +} + +void Chapters::Edition::Init() { + m_atoms = NULL; + m_atoms_size = 0; + m_atoms_count = 0; +} + +void Chapters::Edition::ShallowCopy(Edition& rhs) const { + rhs.m_atoms = m_atoms; + rhs.m_atoms_size = m_atoms_size; + rhs.m_atoms_count = m_atoms_count; +} + +void Chapters::Edition::Clear() { + while (m_atoms_count > 0) { + Atom& a = m_atoms[--m_atoms_count]; + a.Clear(); + } + + delete[] m_atoms; + m_atoms = NULL; + + m_atoms_size = 0; +} + +long Chapters::Edition::Parse(IMkvReader* pReader, long long pos, + long long size) { + const long long stop = pos + size; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) + continue; + + if (id == libwebm::kMkvChapterAtom) { + status = ParseAtom(pReader, pos, size); + + if (status < 0) // error + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +long Chapters::Edition::ParseAtom(IMkvReader* pReader, long long pos, + long long size) { + if (!ExpandAtomsArray()) + return -1; + + Atom& a = m_atoms[m_atoms_count++]; + a.Init(); + + return a.Parse(pReader, pos, size); +} + +bool Chapters::Edition::ExpandAtomsArray() { + if (m_atoms_size > m_atoms_count) + return true; // nothing else to do + + const int size = (m_atoms_size == 0) ? 
1 : 2 * m_atoms_size; + + Atom* const atoms = new (std::nothrow) Atom[size]; + + if (atoms == NULL) + return false; + + for (int idx = 0; idx < m_atoms_count; ++idx) { + m_atoms[idx].ShallowCopy(atoms[idx]); + } + + delete[] m_atoms; + m_atoms = atoms; + + m_atoms_size = size; + return true; +} + +Chapters::Atom::Atom() {} + +Chapters::Atom::~Atom() {} + +unsigned long long Chapters::Atom::GetUID() const { return m_uid; } + +const char* Chapters::Atom::GetStringUID() const { return m_string_uid; } + +long long Chapters::Atom::GetStartTimecode() const { return m_start_timecode; } + +long long Chapters::Atom::GetStopTimecode() const { return m_stop_timecode; } + +long long Chapters::Atom::GetStartTime(const Chapters* pChapters) const { + return GetTime(pChapters, m_start_timecode); +} + +long long Chapters::Atom::GetStopTime(const Chapters* pChapters) const { + return GetTime(pChapters, m_stop_timecode); +} + +int Chapters::Atom::GetDisplayCount() const { return m_displays_count; } + +const Chapters::Display* Chapters::Atom::GetDisplay(int index) const { + if (index < 0) + return NULL; + + if (index >= m_displays_count) + return NULL; + + return m_displays + index; +} + +void Chapters::Atom::Init() { + m_string_uid = NULL; + m_uid = 0; + m_start_timecode = -1; + m_stop_timecode = -1; + + m_displays = NULL; + m_displays_size = 0; + m_displays_count = 0; +} + +void Chapters::Atom::ShallowCopy(Atom& rhs) const { + rhs.m_string_uid = m_string_uid; + rhs.m_uid = m_uid; + rhs.m_start_timecode = m_start_timecode; + rhs.m_stop_timecode = m_stop_timecode; + + rhs.m_displays = m_displays; + rhs.m_displays_size = m_displays_size; + rhs.m_displays_count = m_displays_count; +} + +void Chapters::Atom::Clear() { + delete[] m_string_uid; + m_string_uid = NULL; + + while (m_displays_count > 0) { + Display& d = m_displays[--m_displays_count]; + d.Clear(); + } + + delete[] m_displays; + m_displays = NULL; + + m_displays_size = 0; +} + +long Chapters::Atom::Parse(IMkvReader* pReader, long long pos, long long size) { + const long long stop = pos + size; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) // 0 length payload, skip. 
+ continue; + + if (id == libwebm::kMkvChapterDisplay) { + status = ParseDisplay(pReader, pos, size); + + if (status < 0) // error + return status; + } else if (id == libwebm::kMkvChapterStringUID) { + status = UnserializeString(pReader, pos, size, m_string_uid); + + if (status < 0) // error + return status; + } else if (id == libwebm::kMkvChapterUID) { + long long val; + status = UnserializeInt(pReader, pos, size, val); + + if (status < 0) // error + return status; + + m_uid = static_cast<unsigned long long>(val); + } else if (id == libwebm::kMkvChapterTimeStart) { + const long long val = UnserializeUInt(pReader, pos, size); + + if (val < 0) // error + return static_cast<long>(val); + + m_start_timecode = val; + } else if (id == libwebm::kMkvChapterTimeEnd) { + const long long val = UnserializeUInt(pReader, pos, size); + + if (val < 0) // error + return static_cast<long>(val); + + m_stop_timecode = val; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +long long Chapters::Atom::GetTime(const Chapters* pChapters, + long long timecode) { + if (pChapters == NULL) + return -1; + + Segment* const pSegment = pChapters->m_pSegment; + + if (pSegment == NULL) // weird + return -1; + + const SegmentInfo* const pInfo = pSegment->GetInfo(); + + if (pInfo == NULL) + return -1; + + const long long timecode_scale = pInfo->GetTimeCodeScale(); + + if (timecode_scale < 1) // weird + return -1; + + if (timecode < 0) + return -1; + + const long long result = timecode_scale * timecode; + + return result; +} + +long Chapters::Atom::ParseDisplay(IMkvReader* pReader, long long pos, + long long size) { + if (!ExpandDisplaysArray()) + return -1; + + Display& d = m_displays[m_displays_count++]; + d.Init(); + + return d.Parse(pReader, pos, size); +} + +bool Chapters::Atom::ExpandDisplaysArray() { + if (m_displays_size > m_displays_count) + return true; // nothing else to do + + const int size = (m_displays_size == 0) ? 1 : 2 * m_displays_size; + + Display* const displays = new (std::nothrow) Display[size]; + + if (displays == NULL) + return false; + + for (int idx = 0; idx < m_displays_count; ++idx) { + m_displays[idx].ShallowCopy(displays[idx]); + } + + delete[] m_displays; + m_displays = displays; + + m_displays_size = size; + return true; +} + +Chapters::Display::Display() {} + +Chapters::Display::~Display() {} + +const char* Chapters::Display::GetString() const { return m_string; } + +const char* Chapters::Display::GetLanguage() const { return m_language; } + +const char* Chapters::Display::GetCountry() const { return m_country; } + +void Chapters::Display::Init() { + m_string = NULL; + m_language = NULL; + m_country = NULL; +} + +void Chapters::Display::ShallowCopy(Display& rhs) const { + rhs.m_string = m_string; + rhs.m_language = m_language; + rhs.m_country = m_country; +} + +void Chapters::Display::Clear() { + delete[] m_string; + m_string = NULL; + + delete[] m_language; + m_language = NULL; + + delete[] m_country; + m_country = NULL; +} + +long Chapters::Display::Parse(IMkvReader* pReader, long long pos, + long long size) { + const long long stop = pos + size; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) // No payload.
+ continue; + + if (id == libwebm::kMkvChapString) { + status = UnserializeString(pReader, pos, size, m_string); + + if (status) + return status; + } else if (id == libwebm::kMkvChapLanguage) { + status = UnserializeString(pReader, pos, size, m_language); + + if (status) + return status; + } else if (id == libwebm::kMkvChapCountry) { + status = UnserializeString(pReader, pos, size, m_country); + + if (status) + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +Tags::Tags(Segment* pSegment, long long payload_start, long long payload_size, + long long element_start, long long element_size) + : m_pSegment(pSegment), + m_start(payload_start), + m_size(payload_size), + m_element_start(element_start), + m_element_size(element_size), + m_tags(NULL), + m_tags_size(0), + m_tags_count(0) {} + +Tags::~Tags() { + while (m_tags_count > 0) { + Tag& t = m_tags[--m_tags_count]; + t.Clear(); + } + delete[] m_tags; +} + +long Tags::Parse() { + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long pos = m_start; // payload start + const long long stop = pos + m_size; // payload stop + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) + return status; + + if (size == 0) // 0 length tag, read another + continue; + + if (id == libwebm::kMkvTag) { + status = ParseTag(pos, size); + + if (status < 0) + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + return 0; +} + +int Tags::GetTagCount() const { return m_tags_count; } + +const Tags::Tag* Tags::GetTag(int idx) const { + if (idx < 0) + return NULL; + + if (idx >= m_tags_count) + return NULL; + + return m_tags + idx; +} + +bool Tags::ExpandTagsArray() { + if (m_tags_size > m_tags_count) + return true; // nothing else to do + + const int size = (m_tags_size == 0) ? 
1 : 2 * m_tags_size; + + Tag* const tags = new (std::nothrow) Tag[size]; + + if (tags == NULL) + return false; + + for (int idx = 0; idx < m_tags_count; ++idx) { + m_tags[idx].ShallowCopy(tags[idx]); + } + + delete[] m_tags; + m_tags = tags; + + m_tags_size = size; + return true; +} + +long Tags::ParseTag(long long pos, long long size) { + if (!ExpandTagsArray()) + return -1; + + Tag& t = m_tags[m_tags_count++]; + t.Init(); + + return t.Parse(m_pSegment->m_pReader, pos, size); +} + +Tags::Tag::Tag() {} + +Tags::Tag::~Tag() {} + +int Tags::Tag::GetSimpleTagCount() const { return m_simple_tags_count; } + +const Tags::SimpleTag* Tags::Tag::GetSimpleTag(int index) const { + if (index < 0) + return NULL; + + if (index >= m_simple_tags_count) + return NULL; + + return m_simple_tags + index; +} + +void Tags::Tag::Init() { + m_simple_tags = NULL; + m_simple_tags_size = 0; + m_simple_tags_count = 0; +} + +void Tags::Tag::ShallowCopy(Tag& rhs) const { + rhs.m_simple_tags = m_simple_tags; + rhs.m_simple_tags_size = m_simple_tags_size; + rhs.m_simple_tags_count = m_simple_tags_count; +} + +void Tags::Tag::Clear() { + while (m_simple_tags_count > 0) { + SimpleTag& d = m_simple_tags[--m_simple_tags_count]; + d.Clear(); + } + + delete[] m_simple_tags; + m_simple_tags = NULL; + + m_simple_tags_size = 0; +} + +long Tags::Tag::Parse(IMkvReader* pReader, long long pos, long long size) { + const long long stop = pos + size; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) + return status; + + if (size == 0) // 0 length tag, read another + continue; + + if (id == libwebm::kMkvSimpleTag) { + status = ParseSimpleTag(pReader, pos, size); + + if (status < 0) + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +long Tags::Tag::ParseSimpleTag(IMkvReader* pReader, long long pos, + long long size) { + if (!ExpandSimpleTagsArray()) + return -1; + + SimpleTag& st = m_simple_tags[m_simple_tags_count++]; + st.Init(); + + return st.Parse(pReader, pos, size); +} + +bool Tags::Tag::ExpandSimpleTagsArray() { + if (m_simple_tags_size > m_simple_tags_count) + return true; // nothing else to do + + const int size = (m_simple_tags_size == 0) ? 
1 : 2 * m_simple_tags_size; + + SimpleTag* const displays = new (std::nothrow) SimpleTag[size]; + + if (displays == NULL) + return false; + + for (int idx = 0; idx < m_simple_tags_count; ++idx) { + m_simple_tags[idx].ShallowCopy(displays[idx]); + } + + delete[] m_simple_tags; + m_simple_tags = displays; + + m_simple_tags_size = size; + return true; +} + +Tags::SimpleTag::SimpleTag() {} + +Tags::SimpleTag::~SimpleTag() {} + +const char* Tags::SimpleTag::GetTagName() const { return m_tag_name; } + +const char* Tags::SimpleTag::GetTagString() const { return m_tag_string; } + +void Tags::SimpleTag::Init() { + m_tag_name = NULL; + m_tag_string = NULL; +} + +void Tags::SimpleTag::ShallowCopy(SimpleTag& rhs) const { + rhs.m_tag_name = m_tag_name; + rhs.m_tag_string = m_tag_string; +} + +void Tags::SimpleTag::Clear() { + delete[] m_tag_name; + m_tag_name = NULL; + + delete[] m_tag_string; + m_tag_string = NULL; +} + +long Tags::SimpleTag::Parse(IMkvReader* pReader, long long pos, + long long size) { + const long long stop = pos + size; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) // weird + continue; + + if (id == libwebm::kMkvTagName) { + status = UnserializeString(pReader, pos, size, m_tag_name); + + if (status) + return status; + } else if (id == libwebm::kMkvTagString) { + status = UnserializeString(pReader, pos, size, m_tag_string); + + if (status) + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +SegmentInfo::SegmentInfo(Segment* pSegment, long long start, long long size_, + long long element_start, long long element_size) + : m_pSegment(pSegment), + m_start(start), + m_size(size_), + m_element_start(element_start), + m_element_size(element_size), + m_pMuxingAppAsUTF8(NULL), + m_pWritingAppAsUTF8(NULL), + m_pTitleAsUTF8(NULL) {} + +SegmentInfo::~SegmentInfo() { + delete[] m_pMuxingAppAsUTF8; + m_pMuxingAppAsUTF8 = NULL; + + delete[] m_pWritingAppAsUTF8; + m_pWritingAppAsUTF8 = NULL; + + delete[] m_pTitleAsUTF8; + m_pTitleAsUTF8 = NULL; +} + +long SegmentInfo::Parse() { + assert(m_pMuxingAppAsUTF8 == NULL); + assert(m_pWritingAppAsUTF8 == NULL); + assert(m_pTitleAsUTF8 == NULL); + + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long pos = m_start; + const long long stop = m_start + m_size; + + m_timecodeScale = 1000000; + m_duration = -1; + + while (pos < stop) { + long long id, size; + + const long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (id == libwebm::kMkvTimecodeScale) { + m_timecodeScale = UnserializeUInt(pReader, pos, size); + + if (m_timecodeScale <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvDuration) { + const long status = UnserializeFloat(pReader, pos, size, m_duration); + + if (status < 0) + return status; + + if (m_duration < 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvMuxingApp) { + const long status = + UnserializeString(pReader, pos, size, m_pMuxingAppAsUTF8); + + if (status) + return status; + } else if (id == libwebm::kMkvWritingApp) { + const long status = + UnserializeString(pReader, pos, size, m_pWritingAppAsUTF8); + + if (status) + return status; + } else if (id == libwebm::kMkvTitle) { + const long status = UnserializeString(pReader, pos, size, m_pTitleAsUTF8); + + if (status) + return status; + 
} + + pos += size; + + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + const double rollover_check = m_duration * m_timecodeScale; + if (rollover_check > static_cast<double>(LLONG_MAX)) + return E_FILE_FORMAT_INVALID; + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + return 0; +} + +long long SegmentInfo::GetTimeCodeScale() const { return m_timecodeScale; } + +long long SegmentInfo::GetDuration() const { + if (m_duration < 0) + return -1; + + assert(m_timecodeScale >= 1); + + const double dd = double(m_duration) * double(m_timecodeScale); + const long long d = static_cast<long long>(dd); + + return d; +} + +const char* SegmentInfo::GetMuxingAppAsUTF8() const { + return m_pMuxingAppAsUTF8; +} + +const char* SegmentInfo::GetWritingAppAsUTF8() const { + return m_pWritingAppAsUTF8; +} + +const char* SegmentInfo::GetTitleAsUTF8() const { return m_pTitleAsUTF8; } + +/////////////////////////////////////////////////////////////// +// ContentEncoding element +ContentEncoding::ContentCompression::ContentCompression() + : algo(0), settings(NULL), settings_len(0) {} + +ContentEncoding::ContentCompression::~ContentCompression() { + delete[] settings; +} + +ContentEncoding::ContentEncryption::ContentEncryption() + : algo(0), + key_id(NULL), + key_id_len(0), + signature(NULL), + signature_len(0), + sig_key_id(NULL), + sig_key_id_len(0), + sig_algo(0), + sig_hash_algo(0) {} + +ContentEncoding::ContentEncryption::~ContentEncryption() { + delete[] key_id; + delete[] signature; + delete[] sig_key_id; +} + +ContentEncoding::ContentEncoding() + : compression_entries_(NULL), + compression_entries_end_(NULL), + encryption_entries_(NULL), + encryption_entries_end_(NULL), + encoding_order_(0), + encoding_scope_(1), + encoding_type_(0) {} + +ContentEncoding::~ContentEncoding() { + ContentCompression** comp_i = compression_entries_; + ContentCompression** const comp_j = compression_entries_end_; + + while (comp_i != comp_j) { + ContentCompression* const comp = *comp_i++; + delete comp; + } + + delete[] compression_entries_; + + ContentEncryption** enc_i = encryption_entries_; + ContentEncryption** const enc_j = encryption_entries_end_; + + while (enc_i != enc_j) { + ContentEncryption* const enc = *enc_i++; + delete enc; + } + + delete[] encryption_entries_; +} + +const ContentEncoding::ContentCompression* +ContentEncoding::GetCompressionByIndex(unsigned long idx) const { + const ptrdiff_t count = compression_entries_end_ - compression_entries_; + assert(count >= 0); + + if (idx >= static_cast<unsigned long>(count)) + return NULL; + + return compression_entries_[idx]; +} + +unsigned long ContentEncoding::GetCompressionCount() const { + const ptrdiff_t count = compression_entries_end_ - compression_entries_; + assert(count >= 0); + + return static_cast<unsigned long>(count); +} + +const ContentEncoding::ContentEncryption* ContentEncoding::GetEncryptionByIndex( + unsigned long idx) const { + const ptrdiff_t count = encryption_entries_end_ - encryption_entries_; + assert(count >= 0); + + if (idx >= static_cast<unsigned long>(count)) + return NULL; + + return encryption_entries_[idx]; +} + +unsigned long ContentEncoding::GetEncryptionCount() const { + const ptrdiff_t count = encryption_entries_end_ - encryption_entries_; + assert(count >= 0); + + return static_cast<unsigned long>(count); +} + +long ContentEncoding::ParseContentEncAESSettingsEntry( + long long start, long long size, IMkvReader* pReader, + ContentEncAESSettings* aes) { + assert(pReader); + assert(aes); + + long long pos = start; + const long long stop = start + size; + + while (pos < stop) { + long long id, size; +
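// The only child element handled in this loop is AESSettingsCipherMode; + // the value 1 (AES-CTR, the only cipher mode the WebM encryption spec + // defines) is required, and any other value is rejected as invalid. +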
const long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + if (id == libwebm::kMkvAESSettingsCipherMode) { + aes->cipher_mode = UnserializeUInt(pReader, pos, size); + if (aes->cipher_mode != 1) + return E_FILE_FORMAT_INVALID; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + return 0; +} + +long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, + IMkvReader* pReader) { + assert(pReader); + + long long pos = start; + const long long stop = start + size; + + // Count ContentCompression and ContentEncryption elements. + int compression_count = 0; + int encryption_count = 0; + + while (pos < stop) { + long long id, size; + const long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + if (id == libwebm::kMkvContentCompression) + ++compression_count; + + if (id == libwebm::kMkvContentEncryption) + ++encryption_count; + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (compression_count <= 0 && encryption_count <= 0) + return -1; + + if (compression_count > 0) { + compression_entries_ = + new (std::nothrow) ContentCompression*[compression_count]; + if (!compression_entries_) + return -1; + compression_entries_end_ = compression_entries_; + } + + if (encryption_count > 0) { + encryption_entries_ = + new (std::nothrow) ContentEncryption*[encryption_count]; + if (!encryption_entries_) { + delete[] compression_entries_; + compression_entries_ = NULL; + return -1; + } + encryption_entries_end_ = encryption_entries_; + } + + pos = start; + while (pos < stop) { + long long id, size; + long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + if (id == libwebm::kMkvContentEncodingOrder) { + encoding_order_ = UnserializeUInt(pReader, pos, size); + } else if (id == libwebm::kMkvContentEncodingScope) { + encoding_scope_ = UnserializeUInt(pReader, pos, size); + if (encoding_scope_ < 1) + return -1; + } else if (id == libwebm::kMkvContentEncodingType) { + encoding_type_ = UnserializeUInt(pReader, pos, size); + } else if (id == libwebm::kMkvContentCompression) { + ContentCompression* const compression = + new (std::nothrow) ContentCompression(); + if (!compression) + return -1; + + status = ParseCompressionEntry(pos, size, pReader, compression); + if (status) { + delete compression; + return status; + } + assert(compression_count > 0); + *compression_entries_end_++ = compression; + } else if (id == libwebm::kMkvContentEncryption) { + ContentEncryption* const encryption = + new (std::nothrow) ContentEncryption(); + if (!encryption) + return -1; + + status = ParseEncryptionEntry(pos, size, pReader, encryption); + if (status) { + delete encryption; + return status; + } + assert(encryption_count > 0); + *encryption_entries_end_++ = encryption; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +long ContentEncoding::ParseCompressionEntry(long long start, long long size, + IMkvReader* pReader, + ContentCompression* compression) { + assert(pReader); + assert(compression); + + long long pos = start; + const long long stop = start + size; + + bool valid = false; + + while (pos < stop) { + long long id, size; + const long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + 
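// ContentCompAlgo values defined by Matroska: 0 = zlib, 1 = bzlib, + // 2 = lzo1x, 3 = header stripping; only non-negativity is enforced here, + // and the value is recorded for the caller to interpret. +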
+ if (id == libwebm::kMkvContentCompAlgo) { + long long algo = UnserializeUInt(pReader, pos, size); + if (algo < 0) + return E_FILE_FORMAT_INVALID; + compression->algo = algo; + valid = true; + } else if (id == libwebm::kMkvContentCompSettings) { + if (size <= 0) + return E_FILE_FORMAT_INVALID; + + const size_t buflen = static_cast<size_t>(size); + unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen); + if (buf == NULL) + return -1; + + const int read_status = + pReader->Read(pos, static_cast<long>(buflen), buf); + if (read_status) { + delete[] buf; + return status; + } + + // There should be only one settings element per content compression. + if (compression->settings != NULL) { + delete[] buf; + return E_FILE_FORMAT_INVALID; + } + + compression->settings = buf; + compression->settings_len = buflen; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + // ContentCompAlgo is mandatory + if (!valid) + return E_FILE_FORMAT_INVALID; + + return 0; +} + +long ContentEncoding::ParseEncryptionEntry(long long start, long long size, + IMkvReader* pReader, + ContentEncryption* encryption) { + assert(pReader); + assert(encryption); + + long long pos = start; + const long long stop = start + size; + + while (pos < stop) { + long long id, size; + const long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + if (id == libwebm::kMkvContentEncAlgo) { + encryption->algo = UnserializeUInt(pReader, pos, size); + if (encryption->algo != 5) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvContentEncKeyID) { + delete[] encryption->key_id; + encryption->key_id = NULL; + encryption->key_id_len = 0; + + if (size <= 0) + return E_FILE_FORMAT_INVALID; + + const size_t buflen = static_cast<size_t>(size); + unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen); + if (buf == NULL) + return -1; + + const int read_status = + pReader->Read(pos, static_cast<long>(buflen), buf); + if (read_status) { + delete[] buf; + return status; + } + + encryption->key_id = buf; + encryption->key_id_len = buflen; + } else if (id == libwebm::kMkvContentSignature) { + delete[] encryption->signature; + encryption->signature = NULL; + encryption->signature_len = 0; + + if (size <= 0) + return E_FILE_FORMAT_INVALID; + + const size_t buflen = static_cast<size_t>(size); + unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen); + if (buf == NULL) + return -1; + + const int read_status = + pReader->Read(pos, static_cast<long>(buflen), buf); + if (read_status) { + delete[] buf; + return status; + } + + encryption->signature = buf; + encryption->signature_len = buflen; + } else if (id == libwebm::kMkvContentSigKeyID) { + delete[] encryption->sig_key_id; + encryption->sig_key_id = NULL; + encryption->sig_key_id_len = 0; + + if (size <= 0) + return E_FILE_FORMAT_INVALID; + + const size_t buflen = static_cast<size_t>(size); + unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen); + if (buf == NULL) + return -1; + + const int read_status = + pReader->Read(pos, static_cast<long>(buflen), buf); + if (read_status) { + delete[] buf; + return status; + } + + encryption->sig_key_id = buf; + encryption->sig_key_id_len = buflen; + } else if (id == libwebm::kMkvContentSigAlgo) { + encryption->sig_algo = UnserializeUInt(pReader, pos, size); + } else if (id == libwebm::kMkvContentSigHashAlgo) { + encryption->sig_hash_algo = UnserializeUInt(pReader, pos, size); + } else if (id == libwebm::kMkvContentEncAESSettings) { + const long status = ParseContentEncAESSettingsEntry( + pos, size, pReader, &encryption->aes_settings); + if
(status) + return status; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + return 0; +} + +Track::Track(Segment* pSegment, long long element_start, long long element_size) + : m_pSegment(pSegment), + m_element_start(element_start), + m_element_size(element_size), + content_encoding_entries_(NULL), + content_encoding_entries_end_(NULL) {} + +Track::~Track() { + Info& info = const_cast<Info&>(m_info); + info.Clear(); + + ContentEncoding** i = content_encoding_entries_; + ContentEncoding** const j = content_encoding_entries_end_; + + while (i != j) { + ContentEncoding* const encoding = *i++; + delete encoding; + } + + delete[] content_encoding_entries_; +} + +long Track::Create(Segment* pSegment, const Info& info, long long element_start, + long long element_size, Track*& pResult) { + if (pResult) + return -1; + + Track* const pTrack = + new (std::nothrow) Track(pSegment, element_start, element_size); + + if (pTrack == NULL) + return -1; // generic error + + const int status = info.Copy(pTrack->m_info); + + if (status) { // error + delete pTrack; + return status; + } + + pResult = pTrack; + return 0; // success +} + +Track::Info::Info() + : uid(0), + defaultDuration(0), + codecDelay(0), + seekPreRoll(0), + nameAsUTF8(NULL), + language(NULL), + codecId(NULL), + codecNameAsUTF8(NULL), + codecPrivate(NULL), + codecPrivateSize(0), + lacing(false) {} + +Track::Info::~Info() { Clear(); } + +void Track::Info::Clear() { + delete[] nameAsUTF8; + nameAsUTF8 = NULL; + + delete[] language; + language = NULL; + + delete[] codecId; + codecId = NULL; + + delete[] codecPrivate; + codecPrivate = NULL; + codecPrivateSize = 0; + + delete[] codecNameAsUTF8; + codecNameAsUTF8 = NULL; +} + +int Track::Info::CopyStr(char* Info::*str, Info& dst_) const { + if (str == static_cast<char* Info::*>(NULL)) + return -1; + + char*& dst = dst_.*str; + + if (dst) // should be NULL already + return -1; + + const char* const src = this->*str; + + if (src == NULL) + return 0; + + const size_t len = strlen(src); + + dst = SafeArrayAlloc<char>(1, len + 1); + + if (dst == NULL) + return -1; + + strcpy(dst, src); + + return 0; +} + +int Track::Info::Copy(Info& dst) const { + if (&dst == this) + return 0; + + dst.type = type; + dst.number = number; + dst.defaultDuration = defaultDuration; + dst.codecDelay = codecDelay; + dst.seekPreRoll = seekPreRoll; + dst.uid = uid; + dst.lacing = lacing; + dst.settings = settings; + + // We now copy the string member variables from src to dst. + // This involves memory allocation so in principle the operation + // can fail (indeed, that's why we have Info::Copy), so we must + // report this to the caller. An error return from this function + // therefore implies that the copy was only partially successful.
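+ // + // The calls below rely on CopyStr taking a pointer-to-data-member + // (char* Info::*), so one helper deep-copies any of the string fields; + // e.g. CopyStr(&Info::language, dst) reads this->*str from the source + // object and installs a freshly allocated copy in dst.language.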
+ + if (int status = CopyStr(&Info::nameAsUTF8, dst)) + return status; + + if (int status = CopyStr(&Info::language, dst)) + return status; + + if (int status = CopyStr(&Info::codecId, dst)) + return status; + + if (int status = CopyStr(&Info::codecNameAsUTF8, dst)) + return status; + + if (codecPrivateSize > 0) { + if (codecPrivate == NULL) + return -1; + + if (dst.codecPrivate) + return -1; + + if (dst.codecPrivateSize != 0) + return -1; + + dst.codecPrivate = SafeArrayAlloc<unsigned char>(1, codecPrivateSize); + + if (dst.codecPrivate == NULL) + return -1; + + memcpy(dst.codecPrivate, codecPrivate, codecPrivateSize); + dst.codecPrivateSize = codecPrivateSize; + } + + return 0; +} + +const BlockEntry* Track::GetEOS() const { return &m_eos; } + +long Track::GetType() const { return m_info.type; } + +long Track::GetNumber() const { return m_info.number; } + +unsigned long long Track::GetUid() const { return m_info.uid; } + +const char* Track::GetNameAsUTF8() const { return m_info.nameAsUTF8; } + +const char* Track::GetLanguage() const { return m_info.language; } + +const char* Track::GetCodecNameAsUTF8() const { return m_info.codecNameAsUTF8; } + +const char* Track::GetCodecId() const { return m_info.codecId; } + +const unsigned char* Track::GetCodecPrivate(size_t& size) const { + size = m_info.codecPrivateSize; + return m_info.codecPrivate; +} + +bool Track::GetLacing() const { return m_info.lacing; } + +unsigned long long Track::GetDefaultDuration() const { + return m_info.defaultDuration; +} + +unsigned long long Track::GetCodecDelay() const { return m_info.codecDelay; } + +unsigned long long Track::GetSeekPreRoll() const { return m_info.seekPreRoll; } + +long Track::GetFirst(const BlockEntry*& pBlockEntry) const { + const Cluster* pCluster = m_pSegment->GetFirst(); + + for (int i = 0;;) { + if (pCluster == NULL) { + pBlockEntry = GetEOS(); + return 1; + } + + if (pCluster->EOS()) { + if (m_pSegment->DoneParsing()) { + pBlockEntry = GetEOS(); + return 1; + } + + pBlockEntry = 0; + return E_BUFFER_NOT_FULL; + } + + long status = pCluster->GetFirst(pBlockEntry); + + if (status < 0) // error + return status; + + if (pBlockEntry == 0) { // empty cluster + pCluster = m_pSegment->GetNext(pCluster); + continue; + } + + for (;;) { + const Block* const pBlock = pBlockEntry->GetBlock(); + assert(pBlock); + + const long long tn = pBlock->GetTrackNumber(); + + if ((tn == m_info.number) && VetEntry(pBlockEntry)) + return 0; + + const BlockEntry* pNextEntry; + + status = pCluster->GetNext(pBlockEntry, pNextEntry); + + if (status < 0) // error + return status; + + if (pNextEntry == 0) + break; + + pBlockEntry = pNextEntry; + } + + ++i; + + if (i >= 100) + break; + + pCluster = m_pSegment->GetNext(pCluster); + } + + // NOTE: if we get here, it means that we didn't find a block with + // a matching track number. We interpret that as an error (which + // might be too conservative). + + pBlockEntry = GetEOS(); // so we can return a non-NULL value + return 1; +} + +long Track::GetNext(const BlockEntry* pCurrEntry, + const BlockEntry*& pNextEntry) const { + assert(pCurrEntry); + assert(!pCurrEntry->EOS()); //?
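+ + // Illustrative call pattern (a sketch, not code from this file): given a + // Track* pTrack, e.g. obtained via the Tracks object returned by + // Segment::GetTracks(), a caller walks this track's blocks as: + // + // const BlockEntry* pEntry; + // long status = pTrack->GetFirst(pEntry); + // while (status == 0 && !pEntry->EOS()) { + // const Block* const pBlock = pEntry->GetBlock(); + // // ... consume pBlock ... + // status = pTrack->GetNext(pEntry, pEntry); + // } + // + // Passing pEntry as both arguments is safe because pCurrEntry is taken + // by value.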
+ + const Block* const pCurrBlock = pCurrEntry->GetBlock(); + assert(pCurrBlock && pCurrBlock->GetTrackNumber() == m_info.number); + if (!pCurrBlock || pCurrBlock->GetTrackNumber() != m_info.number) + return -1; + + const Cluster* pCluster = pCurrEntry->GetCluster(); + assert(pCluster); + assert(!pCluster->EOS()); + + long status = pCluster->GetNext(pCurrEntry, pNextEntry); + + if (status < 0) // error + return status; + + for (int i = 0;;) { + while (pNextEntry) { + const Block* const pNextBlock = pNextEntry->GetBlock(); + assert(pNextBlock); + + if (pNextBlock->GetTrackNumber() == m_info.number) + return 0; + + pCurrEntry = pNextEntry; + + status = pCluster->GetNext(pCurrEntry, pNextEntry); + + if (status < 0) // error + return status; + } + + pCluster = m_pSegment->GetNext(pCluster); + + if (pCluster == NULL) { + pNextEntry = GetEOS(); + return 1; + } + + if (pCluster->EOS()) { + if (m_pSegment->DoneParsing()) { + pNextEntry = GetEOS(); + return 1; + } + + // TODO: there is a potential O(n^2) problem here: we tell the + // caller to (pre)load another cluster, which he does, but then he + // calls GetNext again, which repeats the same search. This is + // a pathological case, since the only way it can happen is if + // there exists a long sequence of clusters none of which contain a + // block from this track. One way around this problem is for the + // caller to be smarter when he loads another cluster: don't call + // us back until you have a cluster that contains a block from this + // track. (Of course, that's not cheap either, since our caller + // would have to scan each cluster as it's loaded, so that + // would just push back the problem.) + + pNextEntry = NULL; + return E_BUFFER_NOT_FULL; + } + + status = pCluster->GetFirst(pNextEntry); + + if (status < 0) // error + return status; + + if (pNextEntry == NULL) // empty cluster + continue; + + ++i; + + if (i >= 100) + break; + } + + // NOTE: if we get here, it means that we didn't find a block with + // a matching track number after lots of searching, so we give + // up trying. + + pNextEntry = GetEOS(); // so we can return a non-NULL value + return 1; +} + +bool Track::VetEntry(const BlockEntry* pBlockEntry) const { + assert(pBlockEntry); + const Block* const pBlock = pBlockEntry->GetBlock(); + assert(pBlock); + assert(pBlock->GetTrackNumber() == m_info.number); + if (!pBlock || pBlock->GetTrackNumber() != m_info.number) + return false; + + // This function is used during a seek to determine whether the + // frame is a valid seek target. This default function simply + // returns true, which means all frames are valid seek targets. + // It gets overridden by the VideoTrack class, because only video + // keyframes can be used as seek target.
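+ // + // For reference, the video override amounts to something like the + // following (a sketch; the actual VideoTrack implementation appears + // elsewhere in this file): + // + // return Track::VetEntry(pBlockEntry) && + // pBlockEntry->GetBlock()->IsKey();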
+ + return true; +} + +long Track::Seek(long long time_ns, const BlockEntry*& pResult) const { + const long status = GetFirst(pResult); + + if (status < 0) // buffer underflow, etc + return status; + + assert(pResult); + + if (pResult->EOS()) + return 0; + + const Cluster* pCluster = pResult->GetCluster(); + assert(pCluster); + assert(pCluster->GetIndex() >= 0); + + if (time_ns <= pResult->GetBlock()->GetTime(pCluster)) + return 0; + + Cluster** const clusters = m_pSegment->m_clusters; + assert(clusters); + + const long count = m_pSegment->GetCount(); // loaded only, not preloaded + assert(count > 0); + + Cluster** const i = clusters + pCluster->GetIndex(); + assert(i); + assert(*i == pCluster); + assert(pCluster->GetTime() <= time_ns); + + Cluster** const j = clusters + count; + + Cluster** lo = i; + Cluster** hi = j; + + while (lo < hi) { + // INVARIANT: + //[i, lo) <= time_ns + //[lo, hi) ? + //[hi, j) > time_ns + + Cluster** const mid = lo + (hi - lo) / 2; + assert(mid < hi); + + pCluster = *mid; + assert(pCluster); + assert(pCluster->GetIndex() >= 0); + assert(pCluster->GetIndex() == long(mid - m_pSegment->m_clusters)); + + const long long t = pCluster->GetTime(); + + if (t <= time_ns) + lo = mid + 1; + else + hi = mid; + + assert(lo <= hi); + } + + assert(lo == hi); + assert(lo > i); + assert(lo <= j); + + while (lo > i) { + pCluster = *--lo; + assert(pCluster); + assert(pCluster->GetTime() <= time_ns); + + pResult = pCluster->GetEntry(this); + + if ((pResult != 0) && !pResult->EOS()) + return 0; + + // landed on empty cluster (no entries) + } + + pResult = GetEOS(); // weird + return 0; +} + +const ContentEncoding* Track::GetContentEncodingByIndex( + unsigned long idx) const { + const ptrdiff_t count = + content_encoding_entries_end_ - content_encoding_entries_; + assert(count >= 0); + + if (idx >= static_cast<unsigned long>(count)) + return NULL; + + return content_encoding_entries_[idx]; +} + +unsigned long Track::GetContentEncodingCount() const { + const ptrdiff_t count = + content_encoding_entries_end_ - content_encoding_entries_; + assert(count >= 0); + + return static_cast<unsigned long>(count); +} + +long Track::ParseContentEncodingsEntry(long long start, long long size) { + IMkvReader* const pReader = m_pSegment->m_pReader; + assert(pReader); + + long long pos = start; + const long long stop = start + size; + + // Count ContentEncoding elements.
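+ // (This is the same two-pass pattern used by ParseContentEncodingEntry + // above: walk the payload once just to count children, allocate an + // exactly-sized array, then walk it again to parse each child into place.)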
+ int count = 0; + while (pos < stop) { + long long id, size; + const long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + // pos now designates start of element + if (id == libwebm::kMkvContentEncoding) + ++count; + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (count <= 0) + return -1; + + content_encoding_entries_ = new (std::nothrow) ContentEncoding*[count]; + if (!content_encoding_entries_) + return -1; + + content_encoding_entries_end_ = content_encoding_entries_; + + pos = start; + while (pos < stop) { + long long id, size; + long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + // pos now designates start of element + if (id == libwebm::kMkvContentEncoding) { + ContentEncoding* const content_encoding = + new (std::nothrow) ContentEncoding(); + if (!content_encoding) + return -1; + + status = content_encoding->ParseContentEncodingEntry(pos, size, pReader); + if (status) { + delete content_encoding; + return status; + } + + *content_encoding_entries_end_++ = content_encoding; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + return 0; +} + +Track::EOSBlock::EOSBlock() : BlockEntry(NULL, LONG_MIN) {} + +BlockEntry::Kind Track::EOSBlock::GetKind() const { return kBlockEOS; } + +const Block* Track::EOSBlock::GetBlock() const { return NULL; } + +bool PrimaryChromaticity::Parse(IMkvReader* reader, long long read_pos, + long long value_size, bool is_x, + PrimaryChromaticity** chromaticity) { + if (!reader) + return false; + + if (!*chromaticity) + *chromaticity = new PrimaryChromaticity(); + + if (!*chromaticity) + return false; + + PrimaryChromaticity* pc = *chromaticity; + float* value = is_x ? &pc->x : &pc->y; + + double parser_value = 0; + const long long parse_status = + UnserializeFloat(reader, read_pos, value_size, parser_value); + + // Valid range is [0, 1]. Make sure the double is representable as a float + // before casting. 
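+  // (A positive value below FLT_MIN, e.g. 1e-39 when FLT_MIN is about
+  // 1.18e-38, would turn into a subnormal float when cast, so such
+  // inputs are rejected as well.)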
+  if (parse_status < 0 || parser_value < 0.0 || parser_value > 1.0 ||
+      (parser_value > 0.0 && parser_value < FLT_MIN))
+    return false;
+
+  *value = static_cast<float>(parser_value);
+
+  return true;
+}
+
+bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start,
+                              long long mm_size, MasteringMetadata** mm) {
+  if (!reader || *mm)
+    return false;
+
+  std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
+  if (!mm_ptr.get())
+    return false;
+
+  const long long mm_end = mm_start + mm_size;
+  long long read_pos = mm_start;
+
+  while (read_pos < mm_end) {
+    long long child_id = 0;
+    long long child_size = 0;
+
+    const long long status =
+        ParseElementHeader(reader, read_pos, mm_end, child_id, child_size);
+    if (status < 0)
+      return false;
+
+    if (child_id == libwebm::kMkvLuminanceMax) {
+      double value = 0;
+      const long long value_parse_status =
+          UnserializeFloat(reader, read_pos, child_size, value);
+      if (value < -FLT_MAX || value > FLT_MAX ||
+          (value > 0.0 && value < FLT_MIN)) {
+        return false;
+      }
+      mm_ptr->luminance_max = static_cast<float>(value);
+      if (value_parse_status < 0 || mm_ptr->luminance_max < 0.0 ||
+          mm_ptr->luminance_max > 9999.99) {
+        return false;
+      }
+    } else if (child_id == libwebm::kMkvLuminanceMin) {
+      double value = 0;
+      const long long value_parse_status =
+          UnserializeFloat(reader, read_pos, child_size, value);
+      if (value < -FLT_MAX || value > FLT_MAX ||
+          (value > 0.0 && value < FLT_MIN)) {
+        return false;
+      }
+      mm_ptr->luminance_min = static_cast<float>(value);
+      if (value_parse_status < 0 || mm_ptr->luminance_min < 0.0 ||
+          mm_ptr->luminance_min > 999.9999) {
+        return false;
+      }
+    } else {
+      bool is_x = false;
+      PrimaryChromaticity** chromaticity;
+      switch (child_id) {
+        case libwebm::kMkvPrimaryRChromaticityX:
+        case libwebm::kMkvPrimaryRChromaticityY:
+          is_x = child_id == libwebm::kMkvPrimaryRChromaticityX;
+          chromaticity = &mm_ptr->r;
+          break;
+        case libwebm::kMkvPrimaryGChromaticityX:
+        case libwebm::kMkvPrimaryGChromaticityY:
+          is_x = child_id == libwebm::kMkvPrimaryGChromaticityX;
+          chromaticity = &mm_ptr->g;
+          break;
+        case libwebm::kMkvPrimaryBChromaticityX:
+        case libwebm::kMkvPrimaryBChromaticityY:
+          is_x = child_id == libwebm::kMkvPrimaryBChromaticityX;
+          chromaticity = &mm_ptr->b;
+          break;
+        case libwebm::kMkvWhitePointChromaticityX:
+        case libwebm::kMkvWhitePointChromaticityY:
+          is_x = child_id == libwebm::kMkvWhitePointChromaticityX;
+          chromaticity = &mm_ptr->white_point;
+          break;
+        default:
+          return false;
+      }
+      const bool value_parse_status = PrimaryChromaticity::Parse(
+          reader, read_pos, child_size, is_x, chromaticity);
+      if (!value_parse_status)
+        return false;
+    }
+
+    read_pos += child_size;
+    if (read_pos > mm_end)
+      return false;
+  }
+
+  *mm = mm_ptr.release();
+  return true;
+}
+
+bool Colour::Parse(IMkvReader* reader, long long colour_start,
+                   long long colour_size, Colour** colour) {
+  if (!reader || *colour)
+    return false;
+
+  std::unique_ptr<Colour> colour_ptr(new Colour());
+  if (!colour_ptr.get())
+    return false;
+
+  const long long colour_end = colour_start + colour_size;
+  long long read_pos = colour_start;
+
+  while (read_pos < colour_end) {
+    long long child_id = 0;
+    long long child_size = 0;
+
+    const long status =
+        ParseElementHeader(reader, read_pos, colour_end, child_id, child_size);
+    if (status < 0)
+      return false;
+
+    if (child_id == libwebm::kMkvMatrixCoefficients) {
+      colour_ptr->matrix_coefficients =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->matrix_coefficients < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvBitsPerChannel) {
+      colour_ptr->bits_per_channel =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->bits_per_channel < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvChromaSubsamplingHorz) {
+      colour_ptr->chroma_subsampling_horz =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->chroma_subsampling_horz < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvChromaSubsamplingVert) {
+      colour_ptr->chroma_subsampling_vert =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->chroma_subsampling_vert < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvCbSubsamplingHorz) {
+      colour_ptr->cb_subsampling_horz =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->cb_subsampling_horz < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvCbSubsamplingVert) {
+      colour_ptr->cb_subsampling_vert =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->cb_subsampling_vert < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvChromaSitingHorz) {
+      colour_ptr->chroma_siting_horz =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->chroma_siting_horz < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvChromaSitingVert) {
+      colour_ptr->chroma_siting_vert =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->chroma_siting_vert < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvRange) {
+      colour_ptr->range = UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->range < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvTransferCharacteristics) {
+      colour_ptr->transfer_characteristics =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->transfer_characteristics < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvPrimaries) {
+      colour_ptr->primaries = UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->primaries < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvMaxCLL) {
+      colour_ptr->max_cll = UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->max_cll < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvMaxFALL) {
+      colour_ptr->max_fall = UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->max_fall < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvMasteringMetadata) {
+      if (!MasteringMetadata::Parse(reader, read_pos, child_size,
+                                    &colour_ptr->mastering_metadata))
+        return false;
+    } else {
+      return false;
+    }
+
+    read_pos += child_size;
+    if (read_pos > colour_end)
+      return false;
+  }
+  *colour = colour_ptr.release();
+  return true;
+}
+
+bool Projection::Parse(IMkvReader* reader, long long start, long long size,
+                       Projection** projection) {
+  if (!reader || *projection)
+    return false;
+
+  std::unique_ptr<Projection> projection_ptr(new Projection());
+  if (!projection_ptr.get())
+    return false;
+
+  const long long end = start + size;
+  long long read_pos = start;
+
+  while (read_pos < end) {
+    long long child_id = 0;
+    long long child_size = 0;
+
+    const long long status =
+        ParseElementHeader(reader, read_pos, end, child_id, child_size);
+    if (status < 0)
+      return false;
+
+    if (child_id == libwebm::kMkvProjectionType) {
+      long long projection_type = kTypeNotPresent;
+      projection_type = UnserializeUInt(reader, read_pos, child_size);
+      if (projection_type < 0)
+        return false;
+
+      projection_ptr->type = static_cast<ProjectionType>(projection_type);
+    } else if (child_id == libwebm::kMkvProjectionPrivate) {
+      unsigned char* data = SafeArrayAlloc<unsigned char>(1, child_size);
+
+      if (data == NULL)
+        return false;
+
+      const int status =
+          reader->Read(read_pos, static_cast<long>(child_size), data);
+
+      if (status) {
+        delete[] data;
+        return false;
+      }
+
+      projection_ptr->private_data = data;
+      projection_ptr->private_data_length = static_cast<size_t>(child_size);
+    } else {
+      double value = 0;
+      const long long value_parse_status =
+          UnserializeFloat(reader, read_pos, child_size, value);
+      // Make sure value is representable as a float before casting.
+      if (value_parse_status < 0 || value < -FLT_MAX || value > FLT_MAX ||
+          (value > 0.0 && value < FLT_MIN)) {
+        return false;
+      }
+
+      switch (child_id) {
+        case libwebm::kMkvProjectionPoseYaw:
+          projection_ptr->pose_yaw = static_cast<float>(value);
+          break;
+        case libwebm::kMkvProjectionPosePitch:
+          projection_ptr->pose_pitch = static_cast<float>(value);
+          break;
+        case libwebm::kMkvProjectionPoseRoll:
+          projection_ptr->pose_roll = static_cast<float>(value);
+          break;
+        default:
+          return false;
+      }
+    }
+
+    read_pos += child_size;
+    if (read_pos > end)
+      return false;
+  }
+
+  *projection = projection_ptr.release();
+  return true;
+}
+
+VideoTrack::VideoTrack(Segment* pSegment, long long element_start,
+                       long long element_size)
+    : Track(pSegment, element_start, element_size),
+      m_colour_space(NULL),
+      m_colour(NULL),
+      m_projection(NULL) {}
+
+VideoTrack::~VideoTrack() {
+  delete m_colour;
+  delete m_projection;
+}
+
+long VideoTrack::Parse(Segment* pSegment, const Info& info,
+                       long long element_start, long long element_size,
+                       VideoTrack*& pResult) {
+  if (pResult)
+    return -1;
+
+  if (info.type != Track::kVideo)
+    return -1;
+
+  long long width = 0;
+  long long height = 0;
+  long long display_width = 0;
+  long long display_height = 0;
+  long long display_unit = 0;
+  long long stereo_mode = 0;
+
+  double rate = 0.0;
+  char* colour_space = NULL;
+
+  IMkvReader* const pReader = pSegment->m_pReader;
+
+  const Settings& s = info.settings;
+  assert(s.start >= 0);
+  assert(s.size >= 0);
+
+  long long pos = s.start;
+  assert(pos >= 0);
+
+  const long long stop = pos + s.size;
+
+  std::unique_ptr<Colour> colour_ptr;
+  std::unique_ptr<Projection> projection_ptr;
+
+  while (pos < stop) {
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (id == libwebm::kMkvPixelWidth) {
+      width = UnserializeUInt(pReader, pos, size);
+
+      if (width <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvPixelHeight) {
+      height = UnserializeUInt(pReader, pos, size);
+
+      if (height <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvDisplayWidth) {
+      display_width = UnserializeUInt(pReader, pos, size);
+
+      if (display_width <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvDisplayHeight) {
+      display_height = UnserializeUInt(pReader, pos, size);
+
+      if (display_height <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvDisplayUnit) {
+      display_unit = UnserializeUInt(pReader, pos, size);
+
+      if (display_unit < 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvStereoMode) {
+      stereo_mode = UnserializeUInt(pReader, pos, size);
+
+      if (stereo_mode < 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvFrameRate) {
+      const long status = UnserializeFloat(pReader, pos, size, rate);
+
+      if (status < 0)
+        return status;
+
+      if (rate <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvColour) {
+      Colour* colour = NULL;
+      if (!Colour::Parse(pReader, pos, size, &colour)) {
+        return E_FILE_FORMAT_INVALID;
+      }
else { + colour_ptr.reset(colour); + } + } else if (id == libwebm::kMkvProjection) { + Projection* projection = NULL; + if (!Projection::Parse(pReader, pos, size, &projection)) { + return E_FILE_FORMAT_INVALID; + } else { + projection_ptr.reset(projection); + } + } else if (id == libwebm::kMkvColourSpace) { + const long status = UnserializeString(pReader, pos, size, colour_space); + if (status < 0) + return status; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + VideoTrack* const pTrack = + new (std::nothrow) VideoTrack(pSegment, element_start, element_size); + + if (pTrack == NULL) + return -1; // generic error + + const int status = info.Copy(pTrack->m_info); + + if (status) { // error + delete pTrack; + return status; + } + + pTrack->m_width = width; + pTrack->m_height = height; + pTrack->m_display_width = display_width; + pTrack->m_display_height = display_height; + pTrack->m_display_unit = display_unit; + pTrack->m_stereo_mode = stereo_mode; + pTrack->m_rate = rate; + pTrack->m_colour = colour_ptr.release(); + pTrack->m_colour_space = colour_space; + pTrack->m_projection = projection_ptr.release(); + + pResult = pTrack; + return 0; // success +} + +bool VideoTrack::VetEntry(const BlockEntry* pBlockEntry) const { + return Track::VetEntry(pBlockEntry) && pBlockEntry->GetBlock()->IsKey(); +} + +long VideoTrack::Seek(long long time_ns, const BlockEntry*& pResult) const { + const long status = GetFirst(pResult); + + if (status < 0) // buffer underflow, etc + return status; + + assert(pResult); + + if (pResult->EOS()) + return 0; + + const Cluster* pCluster = pResult->GetCluster(); + assert(pCluster); + assert(pCluster->GetIndex() >= 0); + + if (time_ns <= pResult->GetBlock()->GetTime(pCluster)) + return 0; + + Cluster** const clusters = m_pSegment->m_clusters; + assert(clusters); + + const long count = m_pSegment->GetCount(); // loaded only, not pre-loaded + assert(count > 0); + + Cluster** const i = clusters + pCluster->GetIndex(); + assert(i); + assert(*i == pCluster); + assert(pCluster->GetTime() <= time_ns); + + Cluster** const j = clusters + count; + + Cluster** lo = i; + Cluster** hi = j; + + while (lo < hi) { + // INVARIANT: + //[i, lo) <= time_ns + //[lo, hi) ? 
+ //[hi, j) > time_ns + + Cluster** const mid = lo + (hi - lo) / 2; + assert(mid < hi); + + pCluster = *mid; + assert(pCluster); + assert(pCluster->GetIndex() >= 0); + assert(pCluster->GetIndex() == long(mid - m_pSegment->m_clusters)); + + const long long t = pCluster->GetTime(); + + if (t <= time_ns) + lo = mid + 1; + else + hi = mid; + + assert(lo <= hi); + } + + assert(lo == hi); + assert(lo > i); + assert(lo <= j); + + pCluster = *--lo; + assert(pCluster); + assert(pCluster->GetTime() <= time_ns); + + pResult = pCluster->GetEntry(this, time_ns); + + if ((pResult != 0) && !pResult->EOS()) // found a keyframe + return 0; + + while (lo != i) { + pCluster = *--lo; + assert(pCluster); + assert(pCluster->GetTime() <= time_ns); + + pResult = pCluster->GetEntry(this, time_ns); + + if ((pResult != 0) && !pResult->EOS()) + return 0; + } + + // weird: we're on the first cluster, but no keyframe found + // should never happen but we must return something anyway + + pResult = GetEOS(); + return 0; +} + +Colour* VideoTrack::GetColour() const { return m_colour; } + +Projection* VideoTrack::GetProjection() const { return m_projection; } + +long long VideoTrack::GetWidth() const { return m_width; } + +long long VideoTrack::GetHeight() const { return m_height; } + +long long VideoTrack::GetDisplayWidth() const { + return m_display_width > 0 ? m_display_width : GetWidth(); +} + +long long VideoTrack::GetDisplayHeight() const { + return m_display_height > 0 ? m_display_height : GetHeight(); +} + +long long VideoTrack::GetDisplayUnit() const { return m_display_unit; } + +long long VideoTrack::GetStereoMode() const { return m_stereo_mode; } + +double VideoTrack::GetFrameRate() const { return m_rate; } + +AudioTrack::AudioTrack(Segment* pSegment, long long element_start, + long long element_size) + : Track(pSegment, element_start, element_size) {} + +long AudioTrack::Parse(Segment* pSegment, const Info& info, + long long element_start, long long element_size, + AudioTrack*& pResult) { + if (pResult) + return -1; + + if (info.type != Track::kAudio) + return -1; + + IMkvReader* const pReader = pSegment->m_pReader; + + const Settings& s = info.settings; + assert(s.start >= 0); + assert(s.size >= 0); + + long long pos = s.start; + assert(pos >= 0); + + const long long stop = pos + s.size; + + double rate = 8000.0; // MKV default + long long channels = 1; + long long bit_depth = 0; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (id == libwebm::kMkvSamplingFrequency) { + status = UnserializeFloat(pReader, pos, size, rate); + + if (status < 0) + return status; + + if (rate <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvChannels) { + channels = UnserializeUInt(pReader, pos, size); + + if (channels <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvBitDepth) { + bit_depth = UnserializeUInt(pReader, pos, size); + + if (bit_depth <= 0) + return E_FILE_FORMAT_INVALID; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + AudioTrack* const pTrack = + new (std::nothrow) AudioTrack(pSegment, element_start, element_size); + + if (pTrack == NULL) + return -1; // generic error + + const int status = info.Copy(pTrack->m_info); + + if (status) { + delete pTrack; + return status; + } + + pTrack->m_rate = rate; + pTrack->m_channels = channels; + pTrack->m_bitDepth = 
bit_depth; + + pResult = pTrack; + return 0; // success +} + +double AudioTrack::GetSamplingRate() const { return m_rate; } + +long long AudioTrack::GetChannels() const { return m_channels; } + +long long AudioTrack::GetBitDepth() const { return m_bitDepth; } + +Tracks::Tracks(Segment* pSegment, long long start, long long size_, + long long element_start, long long element_size) + : m_pSegment(pSegment), + m_start(start), + m_size(size_), + m_element_start(element_start), + m_element_size(element_size), + m_trackEntries(NULL), + m_trackEntriesEnd(NULL) {} + +long Tracks::Parse() { + assert(m_trackEntries == NULL); + assert(m_trackEntriesEnd == NULL); + + const long long stop = m_start + m_size; + IMkvReader* const pReader = m_pSegment->m_pReader; + + int count = 0; + long long pos = m_start; + + while (pos < stop) { + long long id, size; + + const long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) // weird + continue; + + if (id == libwebm::kMkvTrackEntry) + ++count; + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + if (count <= 0) + return 0; // success + + m_trackEntries = new (std::nothrow) Track*[count]; + + if (m_trackEntries == NULL) + return -1; + + m_trackEntriesEnd = m_trackEntries; + + pos = m_start; + + while (pos < stop) { + const long long element_start = pos; + + long long id, payload_size; + + const long status = + ParseElementHeader(pReader, pos, stop, id, payload_size); + + if (status < 0) // error + return status; + + if (payload_size == 0) // weird + continue; + + const long long payload_stop = pos + payload_size; + assert(payload_stop <= stop); // checked in ParseElement + + const long long element_size = payload_stop - element_start; + + if (id == libwebm::kMkvTrackEntry) { + Track*& pTrack = *m_trackEntriesEnd; + pTrack = NULL; + + const long status = ParseTrackEntry(pos, payload_size, element_start, + element_size, pTrack); + if (status) + return status; + + if (pTrack) + ++m_trackEntriesEnd; + } + + pos = payload_stop; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + return 0; // success +} + +unsigned long Tracks::GetTracksCount() const { + const ptrdiff_t result = m_trackEntriesEnd - m_trackEntries; + assert(result >= 0); + + return static_cast(result); +} + +long Tracks::ParseTrackEntry(long long track_start, long long track_size, + long long element_start, long long element_size, + Track*& pResult) const { + if (pResult) + return -1; + + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long pos = track_start; + const long long track_stop = track_start + track_size; + + Track::Info info; + + info.type = 0; + info.number = 0; + info.uid = 0; + info.defaultDuration = 0; + + Track::Settings v; + v.start = -1; + v.size = -1; + + Track::Settings a; + a.start = -1; + a.size = -1; + + Track::Settings e; // content_encodings_settings; + e.start = -1; + e.size = -1; + + long long lacing = 1; // default is true + + while (pos < track_stop) { + long long id, size; + + const long status = ParseElementHeader(pReader, pos, track_stop, id, size); + + if (status < 0) // error + return status; + + if (size < 0) + return E_FILE_FORMAT_INVALID; + + const long long start = pos; + + if (id == libwebm::kMkvVideo) { + v.start = start; + v.size = size; + } else if (id == libwebm::kMkvAudio) { + a.start = start; + a.size = size; + } else if (id 
== libwebm::kMkvContentEncodings) { + e.start = start; + e.size = size; + } else if (id == libwebm::kMkvTrackUID) { + if (size > 8) + return E_FILE_FORMAT_INVALID; + + info.uid = 0; + + long long pos_ = start; + const long long pos_end = start + size; + + while (pos_ != pos_end) { + unsigned char b; + + const int status = pReader->Read(pos_, 1, &b); + + if (status) + return status; + + info.uid <<= 8; + info.uid |= b; + + ++pos_; + } + } else if (id == libwebm::kMkvTrackNumber) { + const long long num = UnserializeUInt(pReader, pos, size); + + if ((num <= 0) || (num > 127)) + return E_FILE_FORMAT_INVALID; + + info.number = static_cast(num); + } else if (id == libwebm::kMkvTrackType) { + const long long type = UnserializeUInt(pReader, pos, size); + + if ((type <= 0) || (type > 254)) + return E_FILE_FORMAT_INVALID; + + info.type = static_cast(type); + } else if (id == libwebm::kMkvName) { + const long status = + UnserializeString(pReader, pos, size, info.nameAsUTF8); + + if (status) + return status; + } else if (id == libwebm::kMkvLanguage) { + const long status = UnserializeString(pReader, pos, size, info.language); + + if (status) + return status; + } else if (id == libwebm::kMkvDefaultDuration) { + const long long duration = UnserializeUInt(pReader, pos, size); + + if (duration < 0) + return E_FILE_FORMAT_INVALID; + + info.defaultDuration = static_cast(duration); + } else if (id == libwebm::kMkvCodecID) { + const long status = UnserializeString(pReader, pos, size, info.codecId); + + if (status) + return status; + } else if (id == libwebm::kMkvFlagLacing) { + lacing = UnserializeUInt(pReader, pos, size); + + if ((lacing < 0) || (lacing > 1)) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvCodecPrivate) { + delete[] info.codecPrivate; + info.codecPrivate = NULL; + info.codecPrivateSize = 0; + + const size_t buflen = static_cast(size); + + if (buflen) { + unsigned char* buf = SafeArrayAlloc(1, buflen); + + if (buf == NULL) + return -1; + + const int status = pReader->Read(pos, static_cast(buflen), buf); + + if (status) { + delete[] buf; + return status; + } + + info.codecPrivate = buf; + info.codecPrivateSize = buflen; + } + } else if (id == libwebm::kMkvCodecName) { + const long status = + UnserializeString(pReader, pos, size, info.codecNameAsUTF8); + + if (status) + return status; + } else if (id == libwebm::kMkvCodecDelay) { + info.codecDelay = UnserializeUInt(pReader, pos, size); + } else if (id == libwebm::kMkvSeekPreRoll) { + info.seekPreRoll = UnserializeUInt(pReader, pos, size); + } + + pos += size; // consume payload + if (pos > track_stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != track_stop) + return E_FILE_FORMAT_INVALID; + + if (info.number <= 0) // not specified + return E_FILE_FORMAT_INVALID; + + if (GetTrackByNumber(info.number)) + return E_FILE_FORMAT_INVALID; + + if (info.type <= 0) // not specified + return E_FILE_FORMAT_INVALID; + + info.lacing = (lacing > 0) ? 
true : false; + + if (info.type == Track::kVideo) { + if (v.start < 0) + return E_FILE_FORMAT_INVALID; + + if (a.start >= 0) + return E_FILE_FORMAT_INVALID; + + info.settings = v; + + VideoTrack* pTrack = NULL; + + const long status = VideoTrack::Parse(m_pSegment, info, element_start, + element_size, pTrack); + + if (status) + return status; + + pResult = pTrack; + assert(pResult); + + if (e.start >= 0) + pResult->ParseContentEncodingsEntry(e.start, e.size); + } else if (info.type == Track::kAudio) { + if (a.start < 0) + return E_FILE_FORMAT_INVALID; + + if (v.start >= 0) + return E_FILE_FORMAT_INVALID; + + info.settings = a; + + AudioTrack* pTrack = NULL; + + const long status = AudioTrack::Parse(m_pSegment, info, element_start, + element_size, pTrack); + + if (status) + return status; + + pResult = pTrack; + assert(pResult); + + if (e.start >= 0) + pResult->ParseContentEncodingsEntry(e.start, e.size); + } else { + // neither video nor audio - probably metadata or subtitles + + if (a.start >= 0) + return E_FILE_FORMAT_INVALID; + + if (v.start >= 0) + return E_FILE_FORMAT_INVALID; + + if (info.type == Track::kMetadata && e.start >= 0) + return E_FILE_FORMAT_INVALID; + + info.settings.start = -1; + info.settings.size = 0; + + Track* pTrack = NULL; + + const long status = + Track::Create(m_pSegment, info, element_start, element_size, pTrack); + + if (status) + return status; + + pResult = pTrack; + assert(pResult); + } + + return 0; // success +} + +Tracks::~Tracks() { + Track** i = m_trackEntries; + Track** const j = m_trackEntriesEnd; + + while (i != j) { + Track* const pTrack = *i++; + delete pTrack; + } + + delete[] m_trackEntries; +} + +const Track* Tracks::GetTrackByNumber(long tn) const { + if (tn < 0) + return NULL; + + Track** i = m_trackEntries; + Track** const j = m_trackEntriesEnd; + + while (i != j) { + Track* const pTrack = *i++; + + if (pTrack == NULL) + continue; + + if (tn == pTrack->GetNumber()) + return pTrack; + } + + return NULL; // not found +} + +const Track* Tracks::GetTrackByIndex(unsigned long idx) const { + const ptrdiff_t count = m_trackEntriesEnd - m_trackEntries; + + if (idx >= static_cast(count)) + return NULL; + + return m_trackEntries[idx]; +} + +long Cluster::Load(long long& pos, long& len) const { + if (m_pSegment == NULL) + return E_PARSE_FAILED; + + if (m_timecode >= 0) // at least partially loaded + return 0; + + if (m_pos != m_element_start || m_element_size >= 0) + return E_PARSE_FAILED; + + IMkvReader* const pReader = m_pSegment->m_pReader; + long long total, avail; + const int status = pReader->Length(&total, &avail); + + if (status < 0) // error + return status; + + if (total >= 0 && (avail > total || m_pos > total)) + return E_FILE_FORMAT_INVALID; + + pos = m_pos; + + long long cluster_size = -1; + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error or underflow + return static_cast(result); + + if (result > 0) + return E_BUFFER_NOT_FULL; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long id_ = ReadID(pReader, pos, len); + + if (id_ < 0) // error + return static_cast(id_); + + if (id_ != libwebm::kMkvCluster) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume id + + // read cluster size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast(result); + + if (result > 0) + return E_BUFFER_NOT_FULL; + + 
if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return static_cast(cluster_size); + + if (size == 0) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume length of size of element + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size != unknown_size) + cluster_size = size; + + // pos points to start of payload + long long timecode = -1; + long long new_pos = -1; + bool bBlock = false; + + long long cluster_stop = (cluster_size < 0) ? -1 : pos + cluster_size; + + for (;;) { + if ((cluster_stop >= 0) && (pos >= cluster_stop)) + break; + + // Parse ID + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast(result); + + if (result > 0) + return E_BUFFER_NOT_FULL; + + if ((cluster_stop >= 0) && ((pos + len) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long id = ReadID(pReader, pos, len); + + if (id < 0) // error + return static_cast(id); + + if (id == 0) + return E_FILE_FORMAT_INVALID; + + // This is the distinguished set of ID's we use to determine + // that we have exhausted the sub-element's inside the cluster + // whose ID we parsed earlier. + + if (id == libwebm::kMkvCluster) + break; + + if (id == libwebm::kMkvCues) + break; + + pos += len; // consume ID field + + // Parse Size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast(result); + + if (result > 0) + return E_BUFFER_NOT_FULL; + + if ((cluster_stop >= 0) && ((pos + len) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return static_cast(size); + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume size field + + if ((cluster_stop >= 0) && (pos > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + // pos now points to start of payload + + if (size == 0) + continue; + + if ((cluster_stop >= 0) && ((pos + size) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if (id == libwebm::kMkvTimecode) { + len = static_cast(size); + + if ((pos + size) > avail) + return E_BUFFER_NOT_FULL; + + timecode = UnserializeUInt(pReader, pos, size); + + if (timecode < 0) // error (or underflow) + return static_cast(timecode); + + new_pos = pos + size; + + if (bBlock) + break; + } else if (id == libwebm::kMkvBlockGroup) { + bBlock = true; + break; + } else if (id == libwebm::kMkvSimpleBlock) { + bBlock = true; + break; + } + + pos += size; // consume payload + if (cluster_stop >= 0 && pos > cluster_stop) + return E_FILE_FORMAT_INVALID; + } + + if (cluster_stop >= 0 && pos > cluster_stop) + return E_FILE_FORMAT_INVALID; + + if (timecode < 0) // no timecode found + return E_FILE_FORMAT_INVALID; + + if (!bBlock) + return E_FILE_FORMAT_INVALID; + + m_pos = new_pos; // designates position just beyond timecode payload + m_timecode = timecode; // m_timecode >= 0 means we're partially loaded + + if (cluster_size >= 0) + m_element_size = cluster_stop - m_element_start; + + return 0; +} + +long Cluster::Parse(long long& pos, long& len) const { + long status = Load(pos, len); + + if (status < 0) + return status; 
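+  // Load() succeeded, so the cluster's ID, size, and timecode are known;
+  // Parse() now resumes at m_pos and parses at most one new block per
+  // call, returning E_BUFFER_NOT_FULL whenever the reader has too few
+  // bytes buffered.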
+ + if (m_pos < m_element_start || m_timecode < 0) + return E_PARSE_FAILED; + + const long long cluster_stop = + (m_element_size < 0) ? -1 : m_element_start + m_element_size; + + if ((cluster_stop >= 0) && (m_pos >= cluster_stop)) + return 1; // nothing else to do + + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long total, avail; + + status = pReader->Length(&total, &avail); + + if (status < 0) // error + return status; + + if (total >= 0 && avail > total) + return E_FILE_FORMAT_INVALID; + + pos = m_pos; + + for (;;) { + if ((cluster_stop >= 0) && (pos >= cluster_stop)) + break; + + if ((total >= 0) && (pos >= total)) { + if (m_element_size < 0) + m_element_size = pos - m_element_start; + + break; + } + + // Parse ID + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast(result); + + if (result > 0) + return E_BUFFER_NOT_FULL; + + if ((cluster_stop >= 0) && ((pos + len) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long id = ReadID(pReader, pos, len); + + if (id < 0) + return E_FILE_FORMAT_INVALID; + + // This is the distinguished set of ID's we use to determine + // that we have exhausted the sub-element's inside the cluster + // whose ID we parsed earlier. + + if ((id == libwebm::kMkvCluster) || (id == libwebm::kMkvCues)) { + if (m_element_size < 0) + m_element_size = pos - m_element_start; + + break; + } + + pos += len; // consume ID field + + // Parse Size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast(result); + + if (result > 0) + return E_BUFFER_NOT_FULL; + + if ((cluster_stop >= 0) && ((pos + len) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return static_cast(size); + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume size field + + if ((cluster_stop >= 0) && (pos > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + // pos now points to start of payload + + if (size == 0) + continue; + + // const long long block_start = pos; + const long long block_stop = pos + size; + + if (cluster_stop >= 0) { + if (block_stop > cluster_stop) { + if (id == libwebm::kMkvBlockGroup || id == libwebm::kMkvSimpleBlock) { + return E_FILE_FORMAT_INVALID; + } + + pos = cluster_stop; + break; + } + } else if ((total >= 0) && (block_stop > total)) { + m_element_size = total - m_element_start; + pos = total; + break; + } else if (block_stop > avail) { + len = static_cast(size); + return E_BUFFER_NOT_FULL; + } + + Cluster* const this_ = const_cast(this); + + if (id == libwebm::kMkvBlockGroup) + return this_->ParseBlockGroup(size, pos, len); + + if (id == libwebm::kMkvSimpleBlock) + return this_->ParseSimpleBlock(size, pos, len); + + pos += size; // consume payload + if (cluster_stop >= 0 && pos > cluster_stop) + return E_FILE_FORMAT_INVALID; + } + + if (m_element_size < 1) + return E_FILE_FORMAT_INVALID; + + m_pos = pos; + if (cluster_stop >= 0 && m_pos > cluster_stop) + return E_FILE_FORMAT_INVALID; + + if (m_entries_count > 0) { + const long idx = m_entries_count - 1; + + const BlockEntry* const pLast = m_entries[idx]; + if (pLast == 
NULL)
+      return E_PARSE_FAILED;
+
+    const Block* const pBlock = pLast->GetBlock();
+    if (pBlock == NULL)
+      return E_PARSE_FAILED;
+
+    const long long start = pBlock->m_start;
+
+    if ((total >= 0) && (start > total))
+      return E_PARSE_FAILED;  // defend against truncated stream
+
+    const long long size = pBlock->m_size;
+
+    const long long stop = start + size;
+    if (cluster_stop >= 0 && stop > cluster_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((total >= 0) && (stop > total))
+      return E_PARSE_FAILED;  // defend against truncated stream
+  }
+
+  return 1;  // no more entries
+}
+
+long Cluster::ParseSimpleBlock(long long block_size, long long& pos,
+                               long& len) {
+  const long long block_start = pos;
+  const long long block_stop = pos + block_size;
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long total, avail;
+
+  long status = pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  // parse track number
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  long long result = GetUIntLength(pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)  // weird
+    return E_BUFFER_NOT_FULL;
+
+  if ((pos + len) > block_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long track = ReadUInt(pReader, pos, len);
+
+  if (track < 0)  // error
+    return static_cast<long>(track);
+
+  if (track == 0)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume track number
+
+  if ((pos + 2) > block_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + 2) > avail) {
+    len = 2;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  pos += 2;  // consume timecode
+
+  if ((pos + 1) > block_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  unsigned char flags;
+
+  status = pReader->Read(pos, 1, &flags);
+
+  if (status < 0) {  // error or underflow
+    len = 1;
+    return status;
+  }
+
+  ++pos;  // consume flags byte
+  assert(pos <= avail);
+
+  if (pos >= block_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  const int lacing = int(flags & 0x06) >> 1;
+
+  if ((lacing != 0) && (block_stop > avail)) {
+    len = static_cast<long>(block_stop - pos);
+    return E_BUFFER_NOT_FULL;
+  }
+
+  status = CreateBlock(libwebm::kMkvSimpleBlock, block_start, block_size,
+                       0);  // DiscardPadding
+
+  if (status != 0)
+    return status;
+
+  m_pos = block_stop;
+
+  return 0;  // success
+}
+
+long Cluster::ParseBlockGroup(long long payload_size, long long& pos,
+                              long& len) {
+  const long long payload_start = pos;
+  const long long payload_stop = pos + payload_size;
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long total, avail;
+
+  long status = pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  if ((total >= 0) && (payload_stop > total))
+    return E_FILE_FORMAT_INVALID;
+
+  if (payload_stop > avail) {
+    len = static_cast<long>(payload_size);
+    return E_BUFFER_NOT_FULL;
+  }
+
+  long long discard_padding = 0;
+
+  while (pos < payload_stop) {
+    // parse sub-block element ID
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((pos + len) > payload_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
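+    // The ID bytes are known to be buffered at this point. Within a
+    // BlockGroup only Block and DiscardPadding are consumed here; other
+    // children are skipped, and ReferenceBlock is examined later by
+    // CreateBlockGroup.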
+ const long long id = ReadID(pReader, pos, len); + + if (id < 0) // error + return static_cast(id); + + if (id == 0) // not a valid ID + return E_FILE_FORMAT_INVALID; + + pos += len; // consume ID field + + // Parse Size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((pos + len) > payload_stop) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return static_cast(size); + + pos += len; // consume size field + + // pos now points to start of sub-block group payload + + if (pos > payload_stop) + return E_FILE_FORMAT_INVALID; + + if (size == 0) // weird + continue; + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; + + if (id == libwebm::kMkvDiscardPadding) { + status = UnserializeInt(pReader, pos, size, discard_padding); + + if (status < 0) // error + return status; + } + + if (id != libwebm::kMkvBlock) { + pos += size; // consume sub-part of block group + + if (pos > payload_stop) + return E_FILE_FORMAT_INVALID; + + continue; + } + + const long long block_stop = pos + size; + + if (block_stop > payload_stop) + return E_FILE_FORMAT_INVALID; + + // parse track number + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((pos + len) > block_stop) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long track = ReadUInt(pReader, pos, len); + + if (track < 0) // error + return static_cast(track); + + if (track == 0) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume track number + + if ((pos + 2) > block_stop) + return E_FILE_FORMAT_INVALID; + + if ((pos + 2) > avail) { + len = 2; + return E_BUFFER_NOT_FULL; + } + + pos += 2; // consume timecode + + if ((pos + 1) > block_stop) + return E_FILE_FORMAT_INVALID; + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + unsigned char flags; + + status = pReader->Read(pos, 1, &flags); + + if (status < 0) { // error or underflow + len = 1; + return status; + } + + ++pos; // consume flags byte + assert(pos <= avail); + + if (pos >= block_stop) + return E_FILE_FORMAT_INVALID; + + const int lacing = int(flags & 0x06) >> 1; + + if ((lacing != 0) && (block_stop > avail)) { + len = static_cast(block_stop - pos); + return E_BUFFER_NOT_FULL; + } + + pos = block_stop; // consume block-part of block group + if (pos > payload_stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != payload_stop) + return E_FILE_FORMAT_INVALID; + + status = CreateBlock(libwebm::kMkvBlockGroup, payload_start, payload_size, + discard_padding); + if (status != 0) + return status; + + m_pos = payload_stop; + + return 0; // success +} + +long Cluster::GetEntry(long index, const mkvparser::BlockEntry*& pEntry) const { + assert(m_pos >= m_element_start); + + pEntry = NULL; + + if (index < 0) + return -1; // generic error + + if (m_entries_count < 0) + return E_BUFFER_NOT_FULL; + + assert(m_entries); + assert(m_entries_size > 0); + assert(m_entries_count <= m_entries_size); + + if (index < m_entries_count) { + pEntry = m_entries[index]; + 
assert(pEntry); + + return 1; // found entry + } + + if (m_element_size < 0) // we don't know cluster end yet + return E_BUFFER_NOT_FULL; // underflow + + const long long element_stop = m_element_start + m_element_size; + + if (m_pos >= element_stop) + return 0; // nothing left to parse + + return E_BUFFER_NOT_FULL; // underflow, since more remains to be parsed +} + +Cluster* Cluster::Create(Segment* pSegment, long idx, long long off) { + if (!pSegment || off < 0) + return NULL; + + const long long element_start = pSegment->m_start + off; + + Cluster* const pCluster = + new (std::nothrow) Cluster(pSegment, idx, element_start); + + return pCluster; +} + +Cluster::Cluster() + : m_pSegment(NULL), + m_element_start(0), + m_index(0), + m_pos(0), + m_element_size(0), + m_timecode(0), + m_entries(NULL), + m_entries_size(0), + m_entries_count(0) // means "no entries" +{} + +Cluster::Cluster(Segment* pSegment, long idx, long long element_start + /* long long element_size */) + : m_pSegment(pSegment), + m_element_start(element_start), + m_index(idx), + m_pos(element_start), + m_element_size(-1 /* element_size */), + m_timecode(-1), + m_entries(NULL), + m_entries_size(0), + m_entries_count(-1) // means "has not been parsed yet" +{} + +Cluster::~Cluster() { + if (m_entries_count <= 0) { + delete[] m_entries; + return; + } + + BlockEntry** i = m_entries; + BlockEntry** const j = m_entries + m_entries_count; + + while (i != j) { + BlockEntry* p = *i++; + assert(p); + + delete p; + } + + delete[] m_entries; +} + +bool Cluster::EOS() const { return (m_pSegment == NULL); } + +long Cluster::GetIndex() const { return m_index; } + +long long Cluster::GetPosition() const { + const long long pos = m_element_start - m_pSegment->m_start; + assert(pos >= 0); + + return pos; +} + +long long Cluster::GetElementSize() const { return m_element_size; } + +long Cluster::HasBlockEntries( + const Segment* pSegment, + long long off, // relative to start of segment payload + long long& pos, long& len) { + assert(pSegment); + assert(off >= 0); // relative to segment + + IMkvReader* const pReader = pSegment->m_pReader; + + long long total, avail; + + long status = pReader->Length(&total, &avail); + + if (status < 0) // error + return status; + + assert((total < 0) || (avail <= total)); + + pos = pSegment->m_start + off; // absolute + + if ((total >= 0) && (pos >= total)) + return 0; // we don't even have a complete cluster + + const long long segment_stop = + (pSegment->m_size < 0) ? 
-1 : pSegment->m_start + pSegment->m_size;
+
+  long long cluster_stop = -1;  // interpreted later to mean "unknown size"
+
+  {
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // need more data
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((total >= 0) && ((pos + len) > total))
+      return 0;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long id = ReadID(pReader, pos, len);
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    if (id != libwebm::kMkvCluster)
+      return E_PARSE_FAILED;
+
+    pos += len;  // consume Cluster ID field
+
+    // read size field
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((total >= 0) && ((pos + len) > total))
+      return 0;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    if (size == 0)
+      return 0;  // cluster does not have entries
+
+    pos += len;  // consume size field
+
+    // pos now points to start of payload
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (size != unknown_size) {
+      cluster_stop = pos + size;
+      assert(cluster_stop >= 0);
+
+      if ((segment_stop >= 0) && (cluster_stop > segment_stop))
+        return E_FILE_FORMAT_INVALID;
+
+      if ((total >= 0) && (cluster_stop > total))
+        // return E_FILE_FORMAT_INVALID;  //too conservative
+        return 0;  // cluster does not have any entries
+    }
+  }
+
+  for (;;) {
+    if ((cluster_stop >= 0) && (pos >= cluster_stop))
+      return 0;  // no entries detected
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // need more data
+      return E_BUFFER_NOT_FULL;
+
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long id = ReadID(pReader, pos, len);
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    // This is the distinguished set of ID's we use to determine
+    // that we have exhausted the sub-elements inside the cluster
+    // whose ID we parsed earlier.
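+    // (When the cluster has unknown size, a following Cluster or Cues
+    // element is what terminates it, so reaching one here means this
+    // cluster held no block entries at all.)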
+ + if (id == libwebm::kMkvCluster) + return 0; // no entries found + + if (id == libwebm::kMkvCues) + return 0; // no entries found + + pos += len; // consume id field + + if ((cluster_stop >= 0) && (pos >= cluster_stop)) + return E_FILE_FORMAT_INVALID; + + // read size field + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast(result); + + if (result > 0) // underflow + return E_BUFFER_NOT_FULL; + + if ((cluster_stop >= 0) && ((pos + len) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return static_cast(size); + + pos += len; // consume size field + + // pos now points to start of payload + + if ((cluster_stop >= 0) && (pos > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if (size == 0) // weird + continue; + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; // not supported inside cluster + + if ((cluster_stop >= 0) && ((pos + size) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if (id == libwebm::kMkvBlockGroup) + return 1; // have at least one entry + + if (id == libwebm::kMkvSimpleBlock) + return 1; // have at least one entry + + pos += size; // consume payload + if (cluster_stop >= 0 && pos > cluster_stop) + return E_FILE_FORMAT_INVALID; + } +} + +long long Cluster::GetTimeCode() const { + long long pos; + long len; + + const long status = Load(pos, len); + + if (status < 0) // error + return status; + + return m_timecode; +} + +long long Cluster::GetTime() const { + const long long tc = GetTimeCode(); + + if (tc < 0) + return tc; + + const SegmentInfo* const pInfo = m_pSegment->GetInfo(); + assert(pInfo); + + const long long scale = pInfo->GetTimeCodeScale(); + assert(scale >= 1); + + const long long t = m_timecode * scale; + + return t; +} + +long long Cluster::GetFirstTime() const { + const BlockEntry* pEntry; + + const long status = GetFirst(pEntry); + + if (status < 0) // error + return status; + + if (pEntry == NULL) // empty cluster + return GetTime(); + + const Block* const pBlock = pEntry->GetBlock(); + assert(pBlock); + + return pBlock->GetTime(this); +} + +long long Cluster::GetLastTime() const { + const BlockEntry* pEntry; + + const long status = GetLast(pEntry); + + if (status < 0) // error + return status; + + if (pEntry == NULL) // empty cluster + return GetTime(); + + const Block* const pBlock = pEntry->GetBlock(); + assert(pBlock); + + return pBlock->GetTime(this); +} + +long Cluster::CreateBlock(long long id, + long long pos, // absolute pos of payload + long long size, long long discard_padding) { + if (id != libwebm::kMkvBlockGroup && id != libwebm::kMkvSimpleBlock) + return E_PARSE_FAILED; + + if (m_entries_count < 0) { // haven't parsed anything yet + assert(m_entries == NULL); + assert(m_entries_size == 0); + + m_entries_size = 1024; + m_entries = new (std::nothrow) BlockEntry*[m_entries_size]; + if (m_entries == NULL) + return -1; + + m_entries_count = 0; + } else { + assert(m_entries); + assert(m_entries_size > 0); + assert(m_entries_count <= m_entries_size); + + if (m_entries_count >= m_entries_size) { + const long entries_size = 2 * m_entries_size; + + BlockEntry** const entries = new (std::nothrow) BlockEntry*[entries_size]; + if (entries == NULL) + return -1; + + BlockEntry** src = m_entries; + BlockEntry** const src_end 
= src + m_entries_count;
+
+      BlockEntry** dst = entries;
+
+      while (src != src_end)
+        *dst++ = *src++;
+
+      delete[] m_entries;
+
+      m_entries = entries;
+      m_entries_size = entries_size;
+    }
+  }
+
+  if (id == libwebm::kMkvBlockGroup)
+    return CreateBlockGroup(pos, size, discard_padding);
+  else
+    return CreateSimpleBlock(pos, size);
+}
+
+long Cluster::CreateBlockGroup(long long start_offset, long long size,
+                               long long discard_padding) {
+  assert(m_entries);
+  assert(m_entries_size > 0);
+  assert(m_entries_count >= 0);
+  assert(m_entries_count < m_entries_size);
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long pos = start_offset;
+  const long long stop = start_offset + size;
+
+  // For WebM files, there is a bias towards previous reference times
+  // (in order to support alt-ref frames, which refer back to the previous
+  // keyframe).  Normally a 0 value is not possible, but here we tentatively
+  // allow 0 as the value of a reference frame, with the interpretation
+  // that this is a "previous" reference time.
+
+  long long prev = 1;  // nonce
+  long long next = 0;  // nonce
+  long long duration = -1;  // really, this is unsigned
+
+  long long bpos = -1;
+  long long bsize = -1;
+
+  while (pos < stop) {
+    long len;
+    const long long id = ReadID(pReader, pos, len);
+    if (id < 0 || (pos + len) > stop)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume ID
+
+    const long long size = ReadUInt(pReader, pos, len);
+    assert(size >= 0);  // TODO
+    assert((pos + len) <= stop);
+
+    pos += len;  // consume size
+
+    if (id == libwebm::kMkvBlock) {
+      if (bpos < 0) {  // Block ID
+        bpos = pos;
+        bsize = size;
+      }
+    } else if (id == libwebm::kMkvBlockDuration) {
+      if (size > 8)
+        return E_FILE_FORMAT_INVALID;
+
+      duration = UnserializeUInt(pReader, pos, size);
+
+      if (duration < 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvReferenceBlock) {
+      if (size > 8 || size <= 0)
+        return E_FILE_FORMAT_INVALID;
+      const long size_ = static_cast<long>(size);
+
+      long long time;
+
+      long status = UnserializeInt(pReader, pos, size_, time);
+      assert(status == 0);
+      if (status != 0)
+        return -1;
+
+      if (time <= 0)  // see note above
+        prev = time;
+      else
+        next = time;
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+  if (bpos < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+  assert(bsize >= 0);
+
+  const long idx = m_entries_count;
+
+  BlockEntry** const ppEntry = m_entries + idx;
+  BlockEntry*& pEntry = *ppEntry;
+
+  pEntry = new (std::nothrow)
+      BlockGroup(this, idx, bpos, bsize, prev, next, duration, discard_padding);
+
+  if (pEntry == NULL)
+    return -1;  // generic error
+
+  BlockGroup* const p = static_cast<BlockGroup*>(pEntry);
+
+  const long status = p->Parse();
+
+  if (status == 0) {  // success
+    ++m_entries_count;
+    return 0;
+  }
+
+  delete pEntry;
+  pEntry = 0;
+
+  return status;
+}
+
+long Cluster::CreateSimpleBlock(long long st, long long sz) {
+  assert(m_entries);
+  assert(m_entries_size > 0);
+  assert(m_entries_count >= 0);
+  assert(m_entries_count < m_entries_size);
+
+  const long idx = m_entries_count;
+
+  BlockEntry** const ppEntry = m_entries + idx;
+  BlockEntry*& pEntry = *ppEntry;
+
+  pEntry = new (std::nothrow) SimpleBlock(this, idx, st, sz);
+
+  if (pEntry == NULL)
+    return -1;  // generic error
+
+  SimpleBlock* const p = static_cast<SimpleBlock*>(pEntry);
+
+  const long status = p->Parse();
+
+  if (status == 0) {
+    ++m_entries_count;
+    return 0;
+  }
+
+  delete pEntry;
+  pEntry = 0;
+
+  return status;
+}
+
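+// The entries array managed by CreateBlock above grows by amortized
+// doubling: capacity starts at 1024 and doubles each time it fills, so N
+// appends cost O(N) pointer copies overall. A minimal sketch of the same
+// pattern (editorial illustration only; grow_entries is not part of
+// libwebm):
+//
+//   BlockEntry** grow_entries(BlockEntry** old, long count, long& capacity) {
+//     capacity *= 2;
+//     BlockEntry** const entries = new (std::nothrow) BlockEntry*[capacity];
+//     if (entries == NULL)
+//       return NULL;
+//     for (long k = 0; k < count; ++k)  // copy the old pointers across
+//       entries[k] = old[k];
+//     delete[] old;
+//     return entries;
+//   }
+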
+long Cluster::GetFirst(const BlockEntry*& pFirst) const { + if (m_entries_count <= 0) { + long long pos; + long len; + + const long status = Parse(pos, len); + + if (status < 0) { // error + pFirst = NULL; + return status; + } + + if (m_entries_count <= 0) { // empty cluster + pFirst = NULL; + return 0; + } + } + + assert(m_entries); + + pFirst = m_entries[0]; + assert(pFirst); + + return 0; // success +} + +long Cluster::GetLast(const BlockEntry*& pLast) const { + for (;;) { + long long pos; + long len; + + const long status = Parse(pos, len); + + if (status < 0) { // error + pLast = NULL; + return status; + } + + if (status > 0) // no new block + break; + } + + if (m_entries_count <= 0) { + pLast = NULL; + return 0; + } + + assert(m_entries); + + const long idx = m_entries_count - 1; + + pLast = m_entries[idx]; + assert(pLast); + + return 0; +} + +long Cluster::GetNext(const BlockEntry* pCurr, const BlockEntry*& pNext) const { + assert(pCurr); + assert(m_entries); + assert(m_entries_count > 0); + + size_t idx = pCurr->GetIndex(); + assert(idx < size_t(m_entries_count)); + assert(m_entries[idx] == pCurr); + + ++idx; + + if (idx >= size_t(m_entries_count)) { + long long pos; + long len; + + const long status = Parse(pos, len); + + if (status < 0) { // error + pNext = NULL; + return status; + } + + if (status > 0) { + pNext = NULL; + return 0; + } + + assert(m_entries); + assert(m_entries_count > 0); + assert(idx < size_t(m_entries_count)); + } + + pNext = m_entries[idx]; + assert(pNext); + + return 0; +} + +long Cluster::GetEntryCount() const { return m_entries_count; } + +const BlockEntry* Cluster::GetEntry(const Track* pTrack, + long long time_ns) const { + assert(pTrack); + + if (m_pSegment == NULL) // this is the special EOS cluster + return pTrack->GetEOS(); + + const BlockEntry* pResult = pTrack->GetEOS(); + + long index = 0; + + for (;;) { + if (index >= m_entries_count) { + long long pos; + long len; + + const long status = Parse(pos, len); + assert(status >= 0); + + if (status > 0) // completely parsed, and no more entries + return pResult; + + if (status < 0) // should never happen + return 0; + + assert(m_entries); + assert(index < m_entries_count); + } + + const BlockEntry* const pEntry = m_entries[index]; + assert(pEntry); + assert(!pEntry->EOS()); + + const Block* const pBlock = pEntry->GetBlock(); + assert(pBlock); + + if (pBlock->GetTrackNumber() != pTrack->GetNumber()) { + ++index; + continue; + } + + if (pTrack->VetEntry(pEntry)) { + if (time_ns < 0) // just want first candidate block + return pEntry; + + const long long ns = pBlock->GetTime(this); + + if (ns > time_ns) + return pResult; + + pResult = pEntry; // have a candidate + } else if (time_ns >= 0) { + const long long ns = pBlock->GetTime(this); + + if (ns > time_ns) + return pResult; + } + + ++index; + } +} + +const BlockEntry* Cluster::GetEntry(const CuePoint& cp, + const CuePoint::TrackPosition& tp) const { + assert(m_pSegment); + const long long tc = cp.GetTimeCode(); + + if (tp.m_block > 0) { + const long block = static_cast(tp.m_block); + const long index = block - 1; + + while (index >= m_entries_count) { + long long pos; + long len; + + const long status = Parse(pos, len); + + if (status < 0) // TODO: can this happen? 
+ return NULL; + + if (status > 0) // nothing remains to be parsed + return NULL; + } + + const BlockEntry* const pEntry = m_entries[index]; + assert(pEntry); + assert(!pEntry->EOS()); + + const Block* const pBlock = pEntry->GetBlock(); + assert(pBlock); + + if ((pBlock->GetTrackNumber() == tp.m_track) && + (pBlock->GetTimeCode(this) == tc)) { + return pEntry; + } + } + + long index = 0; + + for (;;) { + if (index >= m_entries_count) { + long long pos; + long len; + + const long status = Parse(pos, len); + + if (status < 0) // TODO: can this happen? + return NULL; + + if (status > 0) // nothing remains to be parsed + return NULL; + + assert(m_entries); + assert(index < m_entries_count); + } + + const BlockEntry* const pEntry = m_entries[index]; + assert(pEntry); + assert(!pEntry->EOS()); + + const Block* const pBlock = pEntry->GetBlock(); + assert(pBlock); + + if (pBlock->GetTrackNumber() != tp.m_track) { + ++index; + continue; + } + + const long long tc_ = pBlock->GetTimeCode(this); + + if (tc_ < tc) { + ++index; + continue; + } + + if (tc_ > tc) + return NULL; + + const Tracks* const pTracks = m_pSegment->GetTracks(); + assert(pTracks); + + const long tn = static_cast<long>(tp.m_track); + const Track* const pTrack = pTracks->GetTrackByNumber(tn); + + if (pTrack == NULL) + return NULL; + + const long long type = pTrack->GetType(); + + if (type == 2) // audio + return pEntry; + + if (type != 1) // not video + return NULL; + + if (!pBlock->IsKey()) + return NULL; + + return pEntry; + } +} + +BlockEntry::BlockEntry(Cluster* p, long idx) : m_pCluster(p), m_index(idx) {} +BlockEntry::~BlockEntry() {} +const Cluster* BlockEntry::GetCluster() const { return m_pCluster; } +long BlockEntry::GetIndex() const { return m_index; } + +SimpleBlock::SimpleBlock(Cluster* pCluster, long idx, long long start, + long long size) + : BlockEntry(pCluster, idx), m_block(start, size, 0) {} + +long SimpleBlock::Parse() { return m_block.Parse(m_pCluster); } +BlockEntry::Kind SimpleBlock::GetKind() const { return kBlockSimple; } +const Block* SimpleBlock::GetBlock() const { return &m_block; } + +BlockGroup::BlockGroup(Cluster* pCluster, long idx, long long block_start, + long long block_size, long long prev, long long next, + long long duration, long long discard_padding) + : BlockEntry(pCluster, idx), + m_block(block_start, block_size, discard_padding), + m_prev(prev), + m_next(next), + m_duration(duration) {} + +long BlockGroup::Parse() { + const long status = m_block.Parse(m_pCluster); + + if (status) + return status; + + m_block.SetKey((m_prev > 0) && (m_next <= 0)); + + return 0; +} + +BlockEntry::Kind BlockGroup::GetKind() const { return kBlockGroup; } +const Block* BlockGroup::GetBlock() const { return &m_block; } +long long BlockGroup::GetPrevTimeCode() const { return m_prev; } +long long BlockGroup::GetNextTimeCode() const { return m_next; } +long long BlockGroup::GetDurationTimeCode() const { return m_duration; } + +Block::Block(long long start, long long size_, long long discard_padding) + : m_start(start), + m_size(size_), + m_track(0), + m_timecode(-1), + m_flags(0), + m_frames(NULL), + m_frame_count(-1), + m_discard_padding(discard_padding) {} + +Block::~Block() { delete[] m_frames; } + +long Block::Parse(const Cluster* pCluster) { + if (pCluster == NULL) + return -1; + + if (pCluster->m_pSegment == NULL) + return -1; + + assert(m_start >= 0); + assert(m_size >= 0); + assert(m_track <= 0); + assert(m_frames == NULL); + assert(m_frame_count <= 0); + + long long pos = m_start; + const long long stop = m_start 
+ m_size; + + long len; + + IMkvReader* const pReader = pCluster->m_pSegment->m_pReader; + + m_track = ReadUInt(pReader, pos, len); + + if (m_track <= 0) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > stop) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume track number + + if ((stop - pos) < 2) + return E_FILE_FORMAT_INVALID; + + long status; + long long value; + + status = UnserializeInt(pReader, pos, 2, value); + + if (status) + return E_FILE_FORMAT_INVALID; + + if (value < SHRT_MIN) + return E_FILE_FORMAT_INVALID; + + if (value > SHRT_MAX) + return E_FILE_FORMAT_INVALID; + + m_timecode = static_cast<short>(value); + + pos += 2; + + if ((stop - pos) <= 0) + return E_FILE_FORMAT_INVALID; + + status = pReader->Read(pos, 1, &m_flags); + + if (status) + return E_FILE_FORMAT_INVALID; + + const int lacing = int(m_flags & 0x06) >> 1; + + ++pos; // consume flags byte + + if (lacing == 0) { // no lacing + if (pos > stop) + return E_FILE_FORMAT_INVALID; + + m_frame_count = 1; + m_frames = new (std::nothrow) Frame[m_frame_count]; + if (m_frames == NULL) + return -1; + + Frame& f = m_frames[0]; + f.pos = pos; + + const long long frame_size = stop - pos; + + if (frame_size > LONG_MAX || frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + f.len = static_cast<long>(frame_size); + + return 0; // success + } + + if (pos >= stop) + return E_FILE_FORMAT_INVALID; + + unsigned char biased_count; + + status = pReader->Read(pos, 1, &biased_count); + + if (status) + return E_FILE_FORMAT_INVALID; + + ++pos; // consume frame count + if (pos > stop) + return E_FILE_FORMAT_INVALID; + + m_frame_count = int(biased_count) + 1; + + m_frames = new (std::nothrow) Frame[m_frame_count]; + if (m_frames == NULL) + return -1; + + if (!m_frames) + return E_FILE_FORMAT_INVALID; + + if (lacing == 1) { // Xiph + Frame* pf = m_frames; + Frame* const pf_end = pf + m_frame_count; + + long long size = 0; + int frame_count = m_frame_count; + + while (frame_count > 1) { + long frame_size = 0; + + for (;;) { + unsigned char val; + + if (pos >= stop) + return E_FILE_FORMAT_INVALID; + + status = pReader->Read(pos, 1, &val); + + if (status) + return E_FILE_FORMAT_INVALID; + + ++pos; // consume xiph size byte + + frame_size += val; + + if (val < 255) + break; + } + + Frame& f = *pf++; + assert(pf < pf_end); + if (pf >= pf_end) + return E_FILE_FORMAT_INVALID; + + f.pos = 0; // patch later + + if (frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + f.len = frame_size; + size += frame_size; // contribution of this frame + + --frame_count; + } + + if (pf >= pf_end || pos > stop) + return E_FILE_FORMAT_INVALID; + + { + Frame& f = *pf++; + + if (pf != pf_end) + return E_FILE_FORMAT_INVALID; + + f.pos = 0; // patch later + + const long long total_size = stop - pos; + + if (total_size < size) + return E_FILE_FORMAT_INVALID; + + const long long frame_size = total_size - size; + + if (frame_size > LONG_MAX || frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + f.len = static_cast<long>(frame_size); + } + + pf = m_frames; + while (pf != pf_end) { + Frame& f = *pf++; + assert((pos + f.len) <= stop); + + if ((pos + f.len) > stop) + return E_FILE_FORMAT_INVALID; + + f.pos = pos; + pos += f.len; + } + + assert(pos == stop); + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + } else if (lacing == 2) { // fixed-size lacing + if (pos >= stop) + return E_FILE_FORMAT_INVALID; + + const long long total_size = stop - pos; + + if ((total_size % m_frame_count) != 0) + return E_FILE_FORMAT_INVALID; + + const long long frame_size = total_size / m_frame_count; + + 
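+ // Editor's note (added comment): with fixed-size lacing the remaining + // payload must divide evenly among the frames; e.g. a 300-byte payload + // carrying 3 frames yields exactly 100 bytes per frame, and the modulus + // check above rejects anything else. + 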
if (frame_size > LONG_MAX || frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + Frame* pf = m_frames; + Frame* const pf_end = pf + m_frame_count; + + while (pf != pf_end) { + assert((pos + frame_size) <= stop); + if ((pos + frame_size) > stop) + return E_FILE_FORMAT_INVALID; + + Frame& f = *pf++; + + f.pos = pos; + f.len = static_cast<long>(frame_size); + + pos += frame_size; + } + + assert(pos == stop); + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + } else { + assert(lacing == 3); // EBML lacing + + if (pos >= stop) + return E_FILE_FORMAT_INVALID; + + long long size = 0; + int frame_count = m_frame_count; + + long long frame_size = ReadUInt(pReader, pos, len); + + if (frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + if (frame_size > LONG_MAX) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > stop) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume length of size of first frame + + if ((pos + frame_size) > stop) + return E_FILE_FORMAT_INVALID; + + Frame* pf = m_frames; + Frame* const pf_end = pf + m_frame_count; + + { + Frame& curr = *pf; + + curr.pos = 0; // patch later + + curr.len = static_cast<long>(frame_size); + size += curr.len; // contribution of this frame + } + + --frame_count; + + while (frame_count > 1) { + if (pos >= stop) + return E_FILE_FORMAT_INVALID; + + assert(pf < pf_end); + if (pf >= pf_end) + return E_FILE_FORMAT_INVALID; + + const Frame& prev = *pf++; + assert(prev.len == frame_size); + if (prev.len != frame_size) + return E_FILE_FORMAT_INVALID; + + assert(pf < pf_end); + if (pf >= pf_end) + return E_FILE_FORMAT_INVALID; + + Frame& curr = *pf; + + curr.pos = 0; // patch later + + const long long delta_size_ = ReadUInt(pReader, pos, len); + + if (delta_size_ < 0) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > stop) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume length of (delta) size + if (pos > stop) + return E_FILE_FORMAT_INVALID; + + const long exp = 7 * len - 1; + const long long bias = (1LL << exp) - 1LL; + const long long delta_size = delta_size_ - bias; + + frame_size += delta_size; + + if (frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + if (frame_size > LONG_MAX) + return E_FILE_FORMAT_INVALID; + + curr.len = static_cast<long>(frame_size); + // Check if size + curr.len could overflow. 
+ if (size > LLONG_MAX - curr.len) { + return E_FILE_FORMAT_INVALID; + } + size += curr.len; // contribution of this frame + + --frame_count; + } + + // parse last frame + if (frame_count > 0) { + if (pos > stop || pf >= pf_end) + return E_FILE_FORMAT_INVALID; + + const Frame& prev = *pf++; + assert(prev.len == frame_size); + if (prev.len != frame_size) + return E_FILE_FORMAT_INVALID; + + if (pf >= pf_end) + return E_FILE_FORMAT_INVALID; + + Frame& curr = *pf++; + if (pf != pf_end) + return E_FILE_FORMAT_INVALID; + + curr.pos = 0; // patch later + + const long long total_size = stop - pos; + + if (total_size < size) + return E_FILE_FORMAT_INVALID; + + frame_size = total_size - size; + + if (frame_size > LONG_MAX || frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + curr.len = static_cast<long>(frame_size); + } + + pf = m_frames; + while (pf != pf_end) { + Frame& f = *pf++; + if ((pos + f.len) > stop) + return E_FILE_FORMAT_INVALID; + + f.pos = pos; + pos += f.len; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + } + + return 0; // success +} + +long long Block::GetTimeCode(const Cluster* pCluster) const { + if (pCluster == 0) + return m_timecode; + + const long long tc0 = pCluster->GetTimeCode(); + assert(tc0 >= 0); + + // Check if tc0 + m_timecode would overflow. + if (tc0 < 0 || LLONG_MAX - tc0 < m_timecode) { + return -1; + } + + const long long tc = tc0 + m_timecode; + + return tc; // unscaled timecode units +} + +long long Block::GetTime(const Cluster* pCluster) const { + assert(pCluster); + + const long long tc = GetTimeCode(pCluster); + + const Segment* const pSegment = pCluster->m_pSegment; + const SegmentInfo* const pInfo = pSegment->GetInfo(); + assert(pInfo); + + const long long scale = pInfo->GetTimeCodeScale(); + assert(scale >= 1); + + // Check if tc * scale could overflow. + if (tc != 0 && scale > LLONG_MAX / tc) { + return -1; + } + const long long ns = tc * scale; + + return ns; +} + +long long Block::GetTrackNumber() const { return m_track; } + +bool Block::IsKey() const { + return ((m_flags & static_cast<unsigned char>(1 << 7)) != 0); +} + +void Block::SetKey(bool bKey) { + if (bKey) + m_flags |= static_cast<unsigned char>(1 << 7); + else + m_flags &= 0x7F; +} + +bool Block::IsInvisible() const { return bool(int(m_flags & 0x08) != 0); } + +Block::Lacing Block::GetLacing() const { + const int value = int(m_flags & 0x06) >> 1; + return static_cast<Lacing>(value); +} + +int Block::GetFrameCount() const { return m_frame_count; } + +const Block::Frame& Block::GetFrame(int idx) const { + assert(idx >= 0); + assert(idx < m_frame_count); + + const Frame& f = m_frames[idx]; + assert(f.pos > 0); + assert(f.len > 0); + + return f; +} + +long Block::Frame::Read(IMkvReader* pReader, unsigned char* buf) const { + assert(pReader); + assert(buf); + + const long status = pReader->Read(pos, len, buf); + return status; +} + +long long Block::GetDiscardPadding() const { return m_discard_padding; } + +} // namespace mkvparser diff --git a/libs/libaom/src/third_party/libwebm/mkvparser/mkvparser.h b/libs/libaom/src/third_party/libwebm/mkvparser/mkvparser.h new file mode 100644 index 000000000..848d01f03 --- --- /dev/null +++ b/libs/libaom/src/third_party/libwebm/mkvparser/mkvparser.h @@ -0,0 +1,1147 @@ +// Copyright (c) 2012 The WebM project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. 
All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +#ifndef MKVPARSER_MKVPARSER_H_ +#define MKVPARSER_MKVPARSER_H_ + +#include <cstddef> + +namespace mkvparser { + +const int E_PARSE_FAILED = -1; +const int E_FILE_FORMAT_INVALID = -2; +const int E_BUFFER_NOT_FULL = -3; + +class IMkvReader { + public: + virtual int Read(long long pos, long len, unsigned char* buf) = 0; + virtual int Length(long long* total, long long* available) = 0; + + protected: + virtual ~IMkvReader() {} +}; + +template <typename Type> +Type* SafeArrayAlloc(unsigned long long num_elements, + unsigned long long element_size); +long long GetUIntLength(IMkvReader*, long long, long&); +long long ReadUInt(IMkvReader*, long long, long&); +long long ReadID(IMkvReader* pReader, long long pos, long& len); +long long UnserializeUInt(IMkvReader*, long long pos, long long size); + +long UnserializeFloat(IMkvReader*, long long pos, long long size, double&); +long UnserializeInt(IMkvReader*, long long pos, long long size, + long long& result); + +long UnserializeString(IMkvReader*, long long pos, long long size, char*& str); + +long ParseElementHeader(IMkvReader* pReader, + long long& pos, // consume id and size fields + long long stop, // if you know size of element's parent + long long& id, long long& size); + +bool Match(IMkvReader*, long long&, unsigned long, long long&); +bool Match(IMkvReader*, long long&, unsigned long, unsigned char*&, size_t&); + +void GetVersion(int& major, int& minor, int& build, int& revision); + +struct EBMLHeader { + EBMLHeader(); + ~EBMLHeader(); + long long m_version; + long long m_readVersion; + long long m_maxIdLength; + long long m_maxSizeLength; + char* m_docType; + long long m_docTypeVersion; + long long m_docTypeReadVersion; + + long long Parse(IMkvReader*, long long&); + void Init(); +}; + +class Segment; +class Track; +class Cluster; + +class Block { + Block(const Block&); + Block& operator=(const Block&); + + public: + const long long m_start; + const long long m_size; + + Block(long long start, long long size, long long discard_padding); + ~Block(); + + long Parse(const Cluster*); + + long long GetTrackNumber() const; + long long GetTimeCode(const Cluster*) const; // absolute, but not scaled + long long GetTime(const Cluster*) const; // absolute, and scaled (ns) + bool IsKey() const; + void SetKey(bool); + bool IsInvisible() const; + + enum Lacing { kLacingNone, kLacingXiph, kLacingFixed, kLacingEbml }; + Lacing GetLacing() const; + + int GetFrameCount() const; // to index frames: [0, count) + + struct Frame { + long long pos; // absolute offset + long len; + + long Read(IMkvReader*, unsigned char*) const; + }; + + const Frame& GetFrame(int frame_index) const; + + long long GetDiscardPadding() const; + + private: + long long m_track; // Track::Number() + short m_timecode; // relative to cluster + unsigned char m_flags; + + Frame* m_frames; + int m_frame_count; + + protected: + const long long m_discard_padding; +}; + +class BlockEntry { + BlockEntry(const BlockEntry&); + BlockEntry& operator=(const BlockEntry&); + + protected: + BlockEntry(Cluster*, long index); + + public: + virtual ~BlockEntry(); + + bool EOS() const { return (GetKind() == kBlockEOS); } + const Cluster* GetCluster() const; + long GetIndex() const; + virtual const Block* GetBlock() const = 0; + + enum Kind { kBlockEOS, kBlockSimple, kBlockGroup }; + virtual Kind GetKind() const = 0; + + protected: + Cluster* const m_pCluster; + const long m_index; +}; + +class SimpleBlock : public BlockEntry { + 
SimpleBlock(const SimpleBlock&); + SimpleBlock& operator=(const SimpleBlock&); + + public: + SimpleBlock(Cluster*, long index, long long start, long long size); + long Parse(); + + Kind GetKind() const; + const Block* GetBlock() const; + + protected: + Block m_block; +}; + +class BlockGroup : public BlockEntry { + BlockGroup(const BlockGroup&); + BlockGroup& operator=(const BlockGroup&); + + public: + BlockGroup(Cluster*, long index, + long long block_start, // absolute pos of block's payload + long long block_size, // size of block's payload + long long prev, long long next, long long duration, + long long discard_padding); + + long Parse(); + + Kind GetKind() const; + const Block* GetBlock() const; + + long long GetPrevTimeCode() const; // relative to block's time + long long GetNextTimeCode() const; // as above + long long GetDurationTimeCode() const; + + private: + Block m_block; + const long long m_prev; + const long long m_next; + const long long m_duration; +}; + +/////////////////////////////////////////////////////////////// +// ContentEncoding element +// Elements used to describe if the track data has been encrypted or +// compressed with zlib or header stripping. +class ContentEncoding { + public: + enum { kCTR = 1 }; + + ContentEncoding(); + ~ContentEncoding(); + + // ContentCompression element names + struct ContentCompression { + ContentCompression(); + ~ContentCompression(); + + unsigned long long algo; + unsigned char* settings; + long long settings_len; + }; + + // ContentEncAESSettings element names + struct ContentEncAESSettings { + ContentEncAESSettings() : cipher_mode(kCTR) {} + ~ContentEncAESSettings() {} + + unsigned long long cipher_mode; + }; + + // ContentEncryption element names + struct ContentEncryption { + ContentEncryption(); + ~ContentEncryption(); + + unsigned long long algo; + unsigned char* key_id; + long long key_id_len; + unsigned char* signature; + long long signature_len; + unsigned char* sig_key_id; + long long sig_key_id_len; + unsigned long long sig_algo; + unsigned long long sig_hash_algo; + + ContentEncAESSettings aes_settings; + }; + + // Returns ContentCompression represented by |idx|. Returns NULL if |idx| + // is out of bounds. + const ContentCompression* GetCompressionByIndex(unsigned long idx) const; + + // Returns number of ContentCompression elements in this ContentEncoding + // element. + unsigned long GetCompressionCount() const; + + // Parses the ContentCompression element from |pReader|. |start| is the + // starting offset of the ContentCompression payload. |size| is the size in + // bytes of the ContentCompression payload. |compression| is where the parsed + // values will be stored. + long ParseCompressionEntry(long long start, long long size, + IMkvReader* pReader, + ContentCompression* compression); + + // Returns ContentEncryption represented by |idx|. Returns NULL if |idx| + // is out of bounds. + const ContentEncryption* GetEncryptionByIndex(unsigned long idx) const; + + // Returns number of ContentEncryption elements in this ContentEncoding + // element. + unsigned long GetEncryptionCount() const; + + // Parses the ContentEncAESSettings element from |pReader|. |start| is the + // starting offset of the ContentEncAESSettings payload. |size| is the + // size in bytes of the ContentEncAESSettings payload. |encryption| is + // where the parsed values will be stored. 
+ long ParseContentEncAESSettingsEntry(long long start, long long size, + IMkvReader* pReader, + ContentEncAESSettings* aes); + + // Parses the ContentEncoding element from |pReader|. |start| is the + // starting offset of the ContentEncoding payload. |size| is the size in + // bytes of the ContentEncoding payload. Returns true on success. + long ParseContentEncodingEntry(long long start, long long size, + IMkvReader* pReader); + + // Parses the ContentEncryption element from |pReader|. |start| is the + // starting offset of the ContentEncryption payload. |size| is the size in + // bytes of the ContentEncryption payload. |encryption| is where the parsed + // values will be stored. + long ParseEncryptionEntry(long long start, long long size, + IMkvReader* pReader, ContentEncryption* encryption); + + unsigned long long encoding_order() const { return encoding_order_; } + unsigned long long encoding_scope() const { return encoding_scope_; } + unsigned long long encoding_type() const { return encoding_type_; } + + private: + // Member variables for list of ContentCompression elements. + ContentCompression** compression_entries_; + ContentCompression** compression_entries_end_; + + // Member variables for list of ContentEncryption elements. + ContentEncryption** encryption_entries_; + ContentEncryption** encryption_entries_end_; + + // ContentEncoding element names + unsigned long long encoding_order_; + unsigned long long encoding_scope_; + unsigned long long encoding_type_; + + // LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncoding); + ContentEncoding(const ContentEncoding&); + ContentEncoding& operator=(const ContentEncoding&); +}; + +class Track { + Track(const Track&); + Track& operator=(const Track&); + + public: + class Info; + static long Create(Segment*, const Info&, long long element_start, + long long element_size, Track*&); + + enum Type { kVideo = 1, kAudio = 2, kSubtitle = 0x11, kMetadata = 0x21 }; + + Segment* const m_pSegment; + const long long m_element_start; + const long long m_element_size; + virtual ~Track(); + + long GetType() const; + long GetNumber() const; + unsigned long long GetUid() const; + const char* GetNameAsUTF8() const; + const char* GetLanguage() const; + const char* GetCodecNameAsUTF8() const; + const char* GetCodecId() const; + const unsigned char* GetCodecPrivate(size_t&) const; + bool GetLacing() const; + unsigned long long GetDefaultDuration() const; + unsigned long long GetCodecDelay() const; + unsigned long long GetSeekPreRoll() const; + + const BlockEntry* GetEOS() const; + + struct Settings { + long long start; + long long size; + }; + + class Info { + public: + Info(); + ~Info(); + int Copy(Info&) const; + void Clear(); + long type; + long number; + unsigned long long uid; + unsigned long long defaultDuration; + unsigned long long codecDelay; + unsigned long long seekPreRoll; + char* nameAsUTF8; + char* language; + char* codecId; + char* codecNameAsUTF8; + unsigned char* codecPrivate; + size_t codecPrivateSize; + bool lacing; + Settings settings; + + private: + Info(const Info&); + Info& operator=(const Info&); + int CopyStr(char* Info::*str, Info&) const; + }; + + long GetFirst(const BlockEntry*&) const; + long GetNext(const BlockEntry* pCurr, const BlockEntry*& pNext) const; + virtual bool VetEntry(const BlockEntry*) const; + virtual long Seek(long long time_ns, const BlockEntry*&) const; + + const ContentEncoding* GetContentEncodingByIndex(unsigned long idx) const; + unsigned long GetContentEncodingCount() const; + + long 
ParseContentEncodingsEntry(long long start, long long size); + + protected: + Track(Segment*, long long element_start, long long element_size); + + Info m_info; + + class EOSBlock : public BlockEntry { + public: + EOSBlock(); + + Kind GetKind() const; + const Block* GetBlock() const; + }; + + EOSBlock m_eos; + + private: + ContentEncoding** content_encoding_entries_; + ContentEncoding** content_encoding_entries_end_; +}; + +struct PrimaryChromaticity { + PrimaryChromaticity() : x(0), y(0) {} + ~PrimaryChromaticity() {} + static bool Parse(IMkvReader* reader, long long read_pos, + long long value_size, bool is_x, + PrimaryChromaticity** chromaticity); + float x; + float y; +}; + +struct MasteringMetadata { + static const float kValueNotPresent; + + MasteringMetadata() + : r(NULL), + g(NULL), + b(NULL), + white_point(NULL), + luminance_max(kValueNotPresent), + luminance_min(kValueNotPresent) {} + ~MasteringMetadata() { + delete r; + delete g; + delete b; + delete white_point; + } + + static bool Parse(IMkvReader* reader, long long element_start, + long long element_size, + MasteringMetadata** mastering_metadata); + + PrimaryChromaticity* r; + PrimaryChromaticity* g; + PrimaryChromaticity* b; + PrimaryChromaticity* white_point; + float luminance_max; + float luminance_min; +}; + +struct Colour { + static const long long kValueNotPresent; + + // Unless otherwise noted all values assigned upon construction are the + // equivalent of unspecified/default. + Colour() + : matrix_coefficients(kValueNotPresent), + bits_per_channel(kValueNotPresent), + chroma_subsampling_horz(kValueNotPresent), + chroma_subsampling_vert(kValueNotPresent), + cb_subsampling_horz(kValueNotPresent), + cb_subsampling_vert(kValueNotPresent), + chroma_siting_horz(kValueNotPresent), + chroma_siting_vert(kValueNotPresent), + range(kValueNotPresent), + transfer_characteristics(kValueNotPresent), + primaries(kValueNotPresent), + max_cll(kValueNotPresent), + max_fall(kValueNotPresent), + mastering_metadata(NULL) {} + ~Colour() { + delete mastering_metadata; + mastering_metadata = NULL; + } + + static bool Parse(IMkvReader* reader, long long element_start, + long long element_size, Colour** colour); + + long long matrix_coefficients; + long long bits_per_channel; + long long chroma_subsampling_horz; + long long chroma_subsampling_vert; + long long cb_subsampling_horz; + long long cb_subsampling_vert; + long long chroma_siting_horz; + long long chroma_siting_vert; + long long range; + long long transfer_characteristics; + long long primaries; + long long max_cll; + long long max_fall; + + MasteringMetadata* mastering_metadata; +}; + +struct Projection { + enum ProjectionType { + kTypeNotPresent = -1, + kRectangular = 0, + kEquirectangular = 1, + kCubeMap = 2, + kMesh = 3, + }; + static const float kValueNotPresent; + Projection() + : type(kTypeNotPresent), + private_data(NULL), + private_data_length(0), + pose_yaw(kValueNotPresent), + pose_pitch(kValueNotPresent), + pose_roll(kValueNotPresent) {} + ~Projection() { delete[] private_data; } + static bool Parse(IMkvReader* reader, long long element_start, + long long element_size, Projection** projection); + + ProjectionType type; + unsigned char* private_data; + size_t private_data_length; + float pose_yaw; + float pose_pitch; + float pose_roll; +}; + +class VideoTrack : public Track { + VideoTrack(const VideoTrack&); + VideoTrack& operator=(const VideoTrack&); + + VideoTrack(Segment*, long long element_start, long long element_size); + + public: + virtual ~VideoTrack(); + static 
long Parse(Segment*, const Info&, long long element_start, + long long element_size, VideoTrack*&); + + long long GetWidth() const; + long long GetHeight() const; + long long GetDisplayWidth() const; + long long GetDisplayHeight() const; + long long GetDisplayUnit() const; + long long GetStereoMode() const; + double GetFrameRate() const; + + bool VetEntry(const BlockEntry*) const; + long Seek(long long time_ns, const BlockEntry*&) const; + + Colour* GetColour() const; + + Projection* GetProjection() const; + + const char* GetColourSpace() const { return m_colour_space; } + + private: + long long m_width; + long long m_height; + long long m_display_width; + long long m_display_height; + long long m_display_unit; + long long m_stereo_mode; + char* m_colour_space; + double m_rate; + + Colour* m_colour; + Projection* m_projection; +}; + +class AudioTrack : public Track { + AudioTrack(const AudioTrack&); + AudioTrack& operator=(const AudioTrack&); + + AudioTrack(Segment*, long long element_start, long long element_size); + + public: + static long Parse(Segment*, const Info&, long long element_start, + long long element_size, AudioTrack*&); + + double GetSamplingRate() const; + long long GetChannels() const; + long long GetBitDepth() const; + + private: + double m_rate; + long long m_channels; + long long m_bitDepth; +}; + +class Tracks { + Tracks(const Tracks&); + Tracks& operator=(const Tracks&); + + public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + Tracks(Segment*, long long start, long long size, long long element_start, + long long element_size); + + ~Tracks(); + + long Parse(); + + unsigned long GetTracksCount() const; + + const Track* GetTrackByNumber(long tn) const; + const Track* GetTrackByIndex(unsigned long idx) const; + + private: + Track** m_trackEntries; + Track** m_trackEntriesEnd; + + long ParseTrackEntry(long long payload_start, long long payload_size, + long long element_start, long long element_size, + Track*&) const; +}; + +class Chapters { + Chapters(const Chapters&); + Chapters& operator=(const Chapters&); + + public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + Chapters(Segment*, long long payload_start, long long payload_size, + long long element_start, long long element_size); + + ~Chapters(); + + long Parse(); + + class Atom; + class Edition; + + class Display { + friend class Atom; + Display(); + Display(const Display&); + ~Display(); + Display& operator=(const Display&); + + public: + const char* GetString() const; + const char* GetLanguage() const; + const char* GetCountry() const; + + private: + void Init(); + void ShallowCopy(Display&) const; + void Clear(); + long Parse(IMkvReader*, long long pos, long long size); + + char* m_string; + char* m_language; + char* m_country; + }; + + class Atom { + friend class Edition; + Atom(); + Atom(const Atom&); + ~Atom(); + Atom& operator=(const Atom&); + + public: + unsigned long long GetUID() const; + const char* GetStringUID() const; + + long long GetStartTimecode() const; + long long GetStopTimecode() const; + + long long GetStartTime(const Chapters*) const; + long long GetStopTime(const Chapters*) const; + + int GetDisplayCount() const; + const Display* GetDisplay(int index) const; + + private: + void Init(); + void ShallowCopy(Atom&) const; + void Clear(); + long Parse(IMkvReader*, long long pos, long 
long size); + static long long GetTime(const Chapters*, long long timecode); + + long ParseDisplay(IMkvReader*, long long pos, long long size); + bool ExpandDisplaysArray(); + + char* m_string_uid; + unsigned long long m_uid; + long long m_start_timecode; + long long m_stop_timecode; + + Display* m_displays; + int m_displays_size; + int m_displays_count; + }; + + class Edition { + friend class Chapters; + Edition(); + Edition(const Edition&); + ~Edition(); + Edition& operator=(const Edition&); + + public: + int GetAtomCount() const; + const Atom* GetAtom(int index) const; + + private: + void Init(); + void ShallowCopy(Edition&) const; + void Clear(); + long Parse(IMkvReader*, long long pos, long long size); + + long ParseAtom(IMkvReader*, long long pos, long long size); + bool ExpandAtomsArray(); + + Atom* m_atoms; + int m_atoms_size; + int m_atoms_count; + }; + + int GetEditionCount() const; + const Edition* GetEdition(int index) const; + + private: + long ParseEdition(long long pos, long long size); + bool ExpandEditionsArray(); + + Edition* m_editions; + int m_editions_size; + int m_editions_count; +}; + +class Tags { + Tags(const Tags&); + Tags& operator=(const Tags&); + + public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + Tags(Segment*, long long payload_start, long long payload_size, + long long element_start, long long element_size); + + ~Tags(); + + long Parse(); + + class Tag; + class SimpleTag; + + class SimpleTag { + friend class Tag; + SimpleTag(); + SimpleTag(const SimpleTag&); + ~SimpleTag(); + SimpleTag& operator=(const SimpleTag&); + + public: + const char* GetTagName() const; + const char* GetTagString() const; + + private: + void Init(); + void ShallowCopy(SimpleTag&) const; + void Clear(); + long Parse(IMkvReader*, long long pos, long long size); + + char* m_tag_name; + char* m_tag_string; + }; + + class Tag { + friend class Tags; + Tag(); + Tag(const Tag&); + ~Tag(); + Tag& operator=(const Tag&); + + public: + int GetSimpleTagCount() const; + const SimpleTag* GetSimpleTag(int index) const; + + private: + void Init(); + void ShallowCopy(Tag&) const; + void Clear(); + long Parse(IMkvReader*, long long pos, long long size); + + long ParseSimpleTag(IMkvReader*, long long pos, long long size); + bool ExpandSimpleTagsArray(); + + SimpleTag* m_simple_tags; + int m_simple_tags_size; + int m_simple_tags_count; + }; + + int GetTagCount() const; + const Tag* GetTag(int index) const; + + private: + long ParseTag(long long pos, long long size); + bool ExpandTagsArray(); + + Tag* m_tags; + int m_tags_size; + int m_tags_count; +}; + +class SegmentInfo { + SegmentInfo(const SegmentInfo&); + SegmentInfo& operator=(const SegmentInfo&); + + public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + SegmentInfo(Segment*, long long start, long long size, + long long element_start, long long element_size); + + ~SegmentInfo(); + + long Parse(); + + long long GetTimeCodeScale() const; + long long GetDuration() const; // scaled + const char* GetMuxingAppAsUTF8() const; + const char* GetWritingAppAsUTF8() const; + const char* GetTitleAsUTF8() const; + + private: + long long m_timecodeScale; + double m_duration; + char* m_pMuxingAppAsUTF8; + char* m_pWritingAppAsUTF8; + char* m_pTitleAsUTF8; +}; + +class SeekHead { + SeekHead(const SeekHead&); + SeekHead& operator=(const SeekHead&); + 
+ public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + SeekHead(Segment*, long long start, long long size, long long element_start, + long long element_size); + + ~SeekHead(); + + long Parse(); + + struct Entry { + Entry(); + + // the SeekHead entry payload + long long id; + long long pos; + + // absolute pos of SeekEntry ID + long long element_start; + + // SeekEntry ID size + size size + payload + long long element_size; + }; + + int GetCount() const; + const Entry* GetEntry(int idx) const; + + struct VoidElement { + // absolute pos of Void ID + long long element_start; + + // ID size + size size + payload size + long long element_size; + }; + + int GetVoidElementCount() const; + const VoidElement* GetVoidElement(int idx) const; + + private: + Entry* m_entries; + int m_entry_count; + + VoidElement* m_void_elements; + int m_void_element_count; + + static bool ParseEntry(IMkvReader*, + long long pos, // payload + long long size, Entry*); +}; + +class Cues; +class CuePoint { + friend class Cues; + + CuePoint(long, long long); + ~CuePoint(); + + CuePoint(const CuePoint&); + CuePoint& operator=(const CuePoint&); + + public: + long long m_element_start; + long long m_element_size; + + bool Load(IMkvReader*); + + long long GetTimeCode() const; // absolute but unscaled + long long GetTime(const Segment*) const; // absolute and scaled (ns units) + + struct TrackPosition { + long long m_track; + long long m_pos; // of cluster + long long m_block; + // codec_state //defaults to 0 + // reference = clusters containing req'd referenced blocks + // reftime = timecode of the referenced block + + bool Parse(IMkvReader*, long long, long long); + }; + + const TrackPosition* Find(const Track*) const; + + private: + const long m_index; + long long m_timecode; + TrackPosition* m_track_positions; + size_t m_track_positions_count; +}; + +class Cues { + friend class Segment; + + Cues(Segment*, long long start, long long size, long long element_start, + long long element_size); + ~Cues(); + + Cues(const Cues&); + Cues& operator=(const Cues&); + + public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + bool Find( // lower bound of time_ns + long long time_ns, const Track*, const CuePoint*&, + const CuePoint::TrackPosition*&) const; + + const CuePoint* GetFirst() const; + const CuePoint* GetLast() const; + const CuePoint* GetNext(const CuePoint*) const; + + const BlockEntry* GetBlock(const CuePoint*, + const CuePoint::TrackPosition*) const; + + bool LoadCuePoint() const; + long GetCount() const; // loaded only + // long GetTotal() const; //loaded + preloaded + bool DoneParsing() const; + + private: + bool Init() const; + bool PreloadCuePoint(long&, long long) const; + + mutable CuePoint** m_cue_points; + mutable long m_count; + mutable long m_preload_count; + mutable long long m_pos; +}; + +class Cluster { + friend class Segment; + + Cluster(const Cluster&); + Cluster& operator=(const Cluster&); + + public: + Segment* const m_pSegment; + + public: + static Cluster* Create(Segment*, + long index, // index in segment + long long off); // offset relative to segment + // long long element_size); + + Cluster(); // EndOfStream + ~Cluster(); + + bool EOS() const; + + long long GetTimeCode() const; // absolute, but not scaled + long long GetTime() const; // absolute, and scaled (nanosecond units) + long long 
GetFirstTime() const; // time (ns) of first (earliest) block + long long GetLastTime() const; // time (ns) of last (latest) block + + long GetFirst(const BlockEntry*&) const; + long GetLast(const BlockEntry*&) const; + long GetNext(const BlockEntry* curr, const BlockEntry*& next) const; + + const BlockEntry* GetEntry(const Track*, long long ns = -1) const; + const BlockEntry* GetEntry(const CuePoint&, + const CuePoint::TrackPosition&) const; + // const BlockEntry* GetMaxKey(const VideoTrack*) const; + + // static bool HasBlockEntries(const Segment*, long long); + + static long HasBlockEntries(const Segment*, long long idoff, long long& pos, + long& size); + + long GetEntryCount() const; + + long Load(long long& pos, long& size) const; + + long Parse(long long& pos, long& size) const; + long GetEntry(long index, const mkvparser::BlockEntry*&) const; + + protected: + Cluster(Segment*, long index, long long element_start); + // long long element_size); + + public: + const long long m_element_start; + long long GetPosition() const; // offset relative to segment + + long GetIndex() const; + long long GetElementSize() const; + // long long GetPayloadSize() const; + + // long long Unparsed() const; + + private: + long m_index; + mutable long long m_pos; + // mutable long long m_size; + mutable long long m_element_size; + mutable long long m_timecode; + mutable BlockEntry** m_entries; + mutable long m_entries_size; + mutable long m_entries_count; + + long ParseSimpleBlock(long long, long long&, long&); + long ParseBlockGroup(long long, long long&, long&); + + long CreateBlock(long long id, long long pos, long long size, + long long discard_padding); + long CreateBlockGroup(long long start_offset, long long size, + long long discard_padding); + long CreateSimpleBlock(long long, long long); +}; + +class Segment { + friend class Cues; + friend class Track; + friend class VideoTrack; + + Segment(const Segment&); + Segment& operator=(const Segment&); + + private: + Segment(IMkvReader*, long long elem_start, + // long long elem_size, + long long pos, long long size); + + public: + IMkvReader* const m_pReader; + const long long m_element_start; + // const long long m_element_size; + const long long m_start; // posn of segment payload + const long long m_size; // size of segment payload + Cluster m_eos; // TODO: make private? 
+ + static long long CreateInstance(IMkvReader*, long long, Segment*&); + ~Segment(); + + long Load(); // loads headers and all clusters + + // for incremental loading + // long long Unparsed() const; + bool DoneParsing() const; + long long ParseHeaders(); // stops when first cluster is found + // long FindNextCluster(long long& pos, long& size) const; + long LoadCluster(long long& pos, long& size); // load one cluster + long LoadCluster(); + + long ParseNext(const Cluster* pCurr, const Cluster*& pNext, long long& pos, + long& size); + + const SeekHead* GetSeekHead() const; + const Tracks* GetTracks() const; + const SegmentInfo* GetInfo() const; + const Cues* GetCues() const; + const Chapters* GetChapters() const; + const Tags* GetTags() const; + + long long GetDuration() const; + + unsigned long GetCount() const; + const Cluster* GetFirst() const; + const Cluster* GetLast() const; + const Cluster* GetNext(const Cluster*); + + const Cluster* FindCluster(long long time_nanoseconds) const; + // const BlockEntry* Seek(long long time_nanoseconds, const Track*) const; + + const Cluster* FindOrPreloadCluster(long long pos); + + long ParseCues(long long cues_off, // offset relative to start of segment + long long& parse_pos, long& parse_len); + + private: + long long m_pos; // absolute file posn; what has been consumed so far + Cluster* m_pUnknownSize; + + SeekHead* m_pSeekHead; + SegmentInfo* m_pInfo; + Tracks* m_pTracks; + Cues* m_pCues; + Chapters* m_pChapters; + Tags* m_pTags; + Cluster** m_clusters; + long m_clusterCount; // number of entries for which m_index >= 0 + long m_clusterPreloadCount; // number of entries for which m_index < 0 + long m_clusterSize; // array size + + long DoLoadCluster(long long&, long&); + long DoLoadClusterUnknownSize(long long&, long&); + long DoParseNext(const Cluster*&, long long&, long&); + + bool AppendCluster(Cluster*); + bool PreloadCluster(Cluster*, ptrdiff_t); + + // void ParseSeekHead(long long pos, long long size); + // void ParseSeekEntry(long long pos, long long size); + // void ParseCues(long long); + + const BlockEntry* GetBlock(const CuePoint&, const CuePoint::TrackPosition&); +}; + +} // namespace mkvparser + +inline long mkvparser::Segment::LoadCluster() { + long long pos; + long size; + + return LoadCluster(pos, size); +} + +#endif // MKVPARSER_MKVPARSER_H_ diff --git a/libs/libaom/src/third_party/libwebm/mkvparser/mkvreader.cc b/libs/libaom/src/third_party/libwebm/mkvparser/mkvreader.cc new file mode 100644 index 000000000..9d19c1be5 --- /dev/null +++ b/libs/libaom/src/third_party/libwebm/mkvparser/mkvreader.cc @@ -0,0 +1,135 @@ +// Copyright (c) 2010 The WebM project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. 
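+// Editor's note: a minimal, assumed usage sketch (not part of the original +// file) combining MkvReader with the parser entry points declared in +// mkvparser.h; "input.webm" is a placeholder path and error checks are +// elided: +// +// mkvparser::MkvReader reader; +// if (reader.Open("input.webm") == 0) { +// long long pos = 0; +// mkvparser::EBMLHeader ebml_header; +// ebml_header.Parse(&reader, pos); // consumes the EBML header +// mkvparser::Segment* segment = NULL; +// mkvparser::Segment::CreateInstance(&reader, pos, segment); +// segment->Load(); // parses headers and clusters +// delete segment; +// } + 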
+#include "mkvparser/mkvreader.h" + +#include <sys/types.h> + +#include <cassert> + +namespace mkvparser { + +MkvReader::MkvReader() : m_file(NULL), reader_owns_file_(true) {} + +MkvReader::MkvReader(FILE* fp) : m_file(fp), reader_owns_file_(false) { + GetFileSize(); +} + +MkvReader::~MkvReader() { + if (reader_owns_file_) + Close(); + m_file = NULL; +} + +int MkvReader::Open(const char* fileName) { + if (fileName == NULL) + return -1; + + if (m_file) + return -1; + +#ifdef _MSC_VER + const errno_t e = fopen_s(&m_file, fileName, "rb"); + + if (e) + return -1; // error +#else + m_file = fopen(fileName, "rb"); + + if (m_file == NULL) + return -1; +#endif + return !GetFileSize(); +} + +bool MkvReader::GetFileSize() { + if (m_file == NULL) + return false; +#ifdef _MSC_VER + int status = _fseeki64(m_file, 0L, SEEK_END); + + if (status) + return false; // error + + m_length = _ftelli64(m_file); +#else + fseek(m_file, 0L, SEEK_END); + m_length = ftell(m_file); +#endif + assert(m_length >= 0); + + if (m_length < 0) + return false; + +#ifdef _MSC_VER + status = _fseeki64(m_file, 0L, SEEK_SET); + + if (status) + return false; // error +#else + fseek(m_file, 0L, SEEK_SET); +#endif + + return true; +} + +void MkvReader::Close() { + if (m_file != NULL) { + fclose(m_file); + m_file = NULL; + } +} + +int MkvReader::Length(long long* total, long long* available) { + if (m_file == NULL) + return -1; + + if (total) + *total = m_length; + + if (available) + *available = m_length; + + return 0; +} + +int MkvReader::Read(long long offset, long len, unsigned char* buffer) { + if (m_file == NULL) + return -1; + + if (offset < 0) + return -1; + + if (len < 0) + return -1; + + if (len == 0) + return 0; + + if (offset >= m_length) + return -1; + +#ifdef _MSC_VER + const int status = _fseeki64(m_file, offset, SEEK_SET); + + if (status) + return -1; // error +#elif defined(_WIN32) + fseeko64(m_file, static_cast<off_t>(offset), SEEK_SET); +#else + fseeko(m_file, static_cast<off_t>(offset), SEEK_SET); +#endif + + const size_t size = fread(buffer, 1, len, m_file); + + if (size < size_t(len)) + return -1; // error + + return 0; // success +} + +} // namespace mkvparser diff --git a/libs/libaom/src/third_party/libwebm/mkvparser/mkvreader.h b/libs/libaom/src/third_party/libwebm/mkvparser/mkvreader.h new file mode 100644 index 000000000..9831ecf64 --- /dev/null +++ b/libs/libaom/src/third_party/libwebm/mkvparser/mkvreader.h @@ -0,0 +1,45 @@ +// Copyright (c) 2010 The WebM project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +#ifndef MKVPARSER_MKVREADER_H_ +#define MKVPARSER_MKVREADER_H_ + +#include <cstdio> + +#include "mkvparser/mkvparser.h" + +namespace mkvparser { + +class MkvReader : public IMkvReader { + public: + MkvReader(); + explicit MkvReader(FILE* fp); + virtual ~MkvReader(); + + int Open(const char*); + void Close(); + + virtual int Read(long long position, long length, unsigned char* buffer); + virtual int Length(long long* total, long long* available); + + private: + MkvReader(const MkvReader&); + MkvReader& operator=(const MkvReader&); + + // Determines the size of the file. This is called either by the constructor + // or by the Open function depending on file ownership. Returns true on + // success. 
+ bool GetFileSize(); + + long long m_length; + FILE* m_file; + bool reader_owns_file_; +}; + +} // namespace mkvparser + +#endif // MKVPARSER_MKVREADER_H_ diff --git a/libs/libaom/src/third_party/libyuv/README.libaom b/libs/libaom/src/third_party/libyuv/README.libaom new file mode 100644 index 000000000..09693c1f2 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/README.libaom @@ -0,0 +1,15 @@ +Name: libyuv +URL: http://code.google.com/p/libyuv/ +Version: 1456 +License: BSD +License File: LICENSE + +Description: +libyuv is an open source project that includes YUV conversion and scaling +functionality. + +The optimized scaler in libyuv is used in the multiple resolution encoder example, +which down-samples the original input video (e.g. 1280x720) a number of times +in order to encode multiple resolution bit streams. + +Local Modifications: diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/basic_types.h b/libs/libaom/src/third_party/libyuv/include/libyuv/basic_types.h new file mode 100644 index 000000000..66e68536c --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/basic_types.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ // NOLINT +#define INCLUDE_LIBYUV_BASIC_TYPES_H_ + +#include <stddef.h> // for NULL, size_t + +#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600)) +#include <sys/types.h> // for uintptr_t on x86 +#else +#include <stdint.h> // for uintptr_t +#endif + +#ifndef GG_LONGLONG +#ifndef INT_TYPES_DEFINED +#define INT_TYPES_DEFINED +#ifdef COMPILER_MSVC +typedef unsigned __int64 uint64; +typedef __int64 int64; +#ifndef INT64_C +#define INT64_C(x) x ## I64 +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## UI64 +#endif +#define INT64_F "I64" +#else // COMPILER_MSVC +#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) +typedef unsigned long uint64; // NOLINT +typedef long int64; // NOLINT +#ifndef INT64_C +#define INT64_C(x) x ## L +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## UL +#endif +#define INT64_F "l" +#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) +typedef unsigned long long uint64; // NOLINT +typedef long long int64; // NOLINT +#ifndef INT64_C +#define INT64_C(x) x ## LL +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## ULL +#endif +#define INT64_F "ll" +#endif // __LP64__ +#endif // COMPILER_MSVC +typedef unsigned int uint32; +typedef int int32; +typedef unsigned short uint16; // NOLINT +typedef short int16; // NOLINT +typedef unsigned char uint8; +typedef signed char int8; +#endif // INT_TYPES_DEFINED +#endif // GG_LONGLONG + +// Detect compiler is for x86 or x64. +#if defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86) +#define CPU_X86 1 +#endif +// Detect compiler is for ARM. 
+#if defined(__arm__) || defined(_M_ARM) +#define CPU_ARM 1 +#endif + +#ifndef ALIGNP +#ifdef __cplusplus +#define ALIGNP(p, t) \ + (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \ + ((t) - 1)) & ~((t) - 1)))) +#else +#define ALIGNP(p, t) \ + ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1)))) /* NOLINT */ +#endif +#endif + +#if !defined(LIBYUV_API) +#if defined(_WIN32) || defined(__CYGWIN__) +#if defined(LIBYUV_BUILDING_SHARED_LIBRARY) +#define LIBYUV_API __declspec(dllexport) +#elif defined(LIBYUV_USING_SHARED_LIBRARY) +#define LIBYUV_API __declspec(dllimport) +#else +#define LIBYUV_API +#endif // LIBYUV_BUILDING_SHARED_LIBRARY +#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \ + (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ + defined(LIBYUV_USING_SHARED_LIBRARY)) +#define LIBYUV_API __attribute__ ((visibility ("default"))) +#else +#define LIBYUV_API +#endif // __GNUC__ +#endif // LIBYUV_API + +#define LIBYUV_BOOL int +#define LIBYUV_FALSE 0 +#define LIBYUV_TRUE 1 + +// Visual C x86 or GCC little endian. +#if defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86) || \ + defined(__arm__) || defined(_M_ARM) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define LIBYUV_LITTLE_ENDIAN +#endif + +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/compare.h b/libs/libaom/src/third_party/libyuv/include/libyuv/compare.h new file mode 100644 index 000000000..2a9f1560c --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/compare.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT +#define INCLUDE_LIBYUV_COMPARE_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Compute a hash for specified memory. Seed of 5381 recommended. +LIBYUV_API +uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed); + +// Scan an opaque argb image and return fourcc based on alpha offset. +// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. +LIBYUV_API +uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height); + +// Sum Square Error - used to compute Mean Square Error or PSNR. 
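+// Editor's note (added comment): for 8-bit data the helpers below relate as +// MSE = sse / count and PSNR = 10 * log10(255^2 / MSE); i.e. +// SumSquareErrorToPsnr(sse, count) evaluates 10 * log10(255.0 * 255.0 * +// count / sse), clamped to kMaxPsnr (which is also returned when sse is 0). 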
+LIBYUV_API +uint64 ComputeSumSquareError(const uint8* src_a, + const uint8* src_b, int count); + +LIBYUV_API +uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +static const int kMaxPsnr = 128; + +LIBYUV_API +double SumSquareErrorToPsnr(uint64 sse, uint64 count); + +LIBYUV_API +double CalcFramePsnr(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +LIBYUV_API +double I420Psnr(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height); + +LIBYUV_API +double CalcFrameSsim(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +LIBYUV_API +double I420Ssim(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/convert.h b/libs/libaom/src/third_party/libyuv/include/libyuv/convert.h new file mode 100644 index 000000000..d6f206c10 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/convert.h @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_H_ + +#include "libyuv/basic_types.h" +// TODO(fbarchard): Remove the following headers includes. +#include "libyuv/convert_from.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert I444 to I420. +LIBYUV_API +int I444ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I422 to I420. +LIBYUV_API +int I422ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I411 to I420. +LIBYUV_API +int I411ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Copy I420 to I420. 
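+// (Editor's note: in the I420 layout assumed throughout this header, the Y +// plane is width x height bytes and the U and V planes are each +// ((width + 1) / 2) x ((height + 1) / 2) bytes, i.e. 4:2:0 chroma +// subsampling.) 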
+#define I420ToI420 I420Copy +LIBYUV_API +int I420Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I400 (grey) to I420. +LIBYUV_API +int I400ToI420(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +#define J400ToJ420 I400ToI420 + +// Convert NV12 to I420. +LIBYUV_API +int NV12ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert NV21 to I420. +LIBYUV_API +int NV21ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert YUY2 to I420. +LIBYUV_API +int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert UYVY to I420. +LIBYUV_API +int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert M420 to I420. +LIBYUV_API +int M420ToI420(const uint8* src_m420, int src_stride_m420, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// ARGB little endian (bgra in memory) to I420. +LIBYUV_API +int ARGBToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// BGRA little endian (argb in memory) to I420. +LIBYUV_API +int BGRAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// ABGR little endian (rgba in memory) to I420. +LIBYUV_API +int ABGRToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGBA little endian (abgr in memory) to I420. +LIBYUV_API +int RGBAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB little endian (bgr in memory) to I420. +LIBYUV_API +int RGB24ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB big endian (rgb in memory) to I420. +LIBYUV_API +int RAWToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB16 (RGBP fourcc) little endian to I420. 
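+// (Editor's note: RGB565 packs each pixel into 16 bits as 5 bits red, +// 6 bits green and 5 bits blue, hence the RGB16 name.) 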
+LIBYUV_API +int RGB565ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB15 (RGBO fourcc) little endian to I420. +LIBYUV_API +int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB12 (R444 fourcc) little endian to I420. +LIBYUV_API +int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +#ifdef HAVE_JPEG +// src_width/height provided by capture. +// dst_width/height for clipping determine final size. +LIBYUV_API +int MJPGToI420(const uint8* sample, size_t sample_size, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, + int dst_width, int dst_height); + +// Query size of MJPG in pixels. +LIBYUV_API +int MJPGSize(const uint8* sample, size_t sample_size, + int* width, int* height); +#endif + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// "src_size" is needed to parse MJPG. +// "dst_stride_y" number of bytes in a row of the dst_y plane. +// Normally this would be the same as dst_width, with recommended alignment +// to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. The caller should +// allocate the I420 buffer according to rotation. +// "dst_stride_u" number of bytes in a row of the dst_u plane. +// Normally this would be the same as (dst_width + 1) / 2, with +// recommended alignment to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. +// "crop_x" and "crop_y" are starting position for cropping. +// To center, crop_x = (src_width - dst_width) / 2 +// crop_y = (src_height - dst_height) / 2 +// "src_width" / "src_height" is size of src_frame in pixels. +// "src_height" can be negative indicating a vertically flipped image source. +// "crop_width" / "crop_height" is the size to crop the src to. +// Must be less than or equal to src_width/src_height +// Cropping parameters are pre-rotation. +// "rotation" can be 0, 90, 180 or 270. +// "format" is a fourcc. ie 'I420', 'YUY2' +// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. +LIBYUV_API +int ConvertToI420(const uint8* src_frame, size_t src_size, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int crop_x, int crop_y, + int src_width, int src_height, + int crop_width, int crop_height, + enum RotationMode rotation, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/convert_argb.h b/libs/libaom/src/third_party/libyuv/include/libyuv/convert_argb.h new file mode 100644 index 000000000..ea75c0b26 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/convert_argb.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_ARGB_H_ + +#include "libyuv/basic_types.h" +// TODO(fbarchard): Remove the following headers includes +#include "libyuv/convert_from.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" + +// TODO(fbarchard): This set of functions should exactly match convert.h +// TODO(fbarchard): Add tests. Create random content of right size and convert +// with C vs Opt and or to I420 and compare. +// TODO(fbarchard): Some of these functions lack parameter setting. + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Alias. +#define ARGBToARGB ARGBCopy + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I420 to ARGB. +LIBYUV_API +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I411 to ARGB. +LIBYUV_API +int I411ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I400 (grey) to ARGB. Reverse of ARGBToI400. +LIBYUV_API +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert J400 (jpeg grey) to ARGB. +LIBYUV_API +int J400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Alias. +#define YToARGB I400ToARGB + +// Convert NV12 to ARGB. +LIBYUV_API +int NV12ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert NV21 to ARGB. +LIBYUV_API +int NV21ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert M420 to ARGB. +LIBYUV_API +int M420ToARGB(const uint8* src_m420, int src_stride_m420, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert YUY2 to ARGB. +LIBYUV_API +int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert UYVY to ARGB. +LIBYUV_API +int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert J420 to ARGB. 
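+// J420 is I420 with full-range (JPEG) luma/chroma levels rather than
+// limited-range video levels, so a different set of YUV-to-RGB coefficients
+// is applied. A sketch of a call (buffer names are illustrative; dst_argb
+// holds width * 4 bytes per row, bgra byte order in memory):
+//   J420ToARGB(src_y, width,
+//              src_u, (width + 1) / 2,
+//              src_v, (width + 1) / 2,
+//              dst_argb, width * 4, width, height);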
+LIBYUV_API +int J420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert J422 to ARGB. +LIBYUV_API +int J422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// BGRA little endian (argb in memory) to ARGB. +LIBYUV_API +int BGRAToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// ABGR little endian (rgba in memory) to ARGB. +LIBYUV_API +int ABGRToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGBA little endian (abgr in memory) to ARGB. +LIBYUV_API +int RGBAToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Deprecated function name. +#define BG24ToARGB RGB24ToARGB + +// RGB little endian (bgr in memory) to ARGB. +LIBYUV_API +int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB big endian (rgb in memory) to ARGB. +LIBYUV_API +int RAWToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB16 (RGBP fourcc) little endian to ARGB. +LIBYUV_API +int RGB565ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB15 (RGBO fourcc) little endian to ARGB. +LIBYUV_API +int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB12 (R444 fourcc) little endian to ARGB. +LIBYUV_API +int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +#ifdef HAVE_JPEG +// src_width/height provided by capture +// dst_width/height for clipping determine final size. +LIBYUV_API +int MJPGToARGB(const uint8* sample, size_t sample_size, + uint8* dst_argb, int dst_stride_argb, + int src_width, int src_height, + int dst_width, int dst_height); +#endif + +// Convert camera sample to ARGB with cropping, rotation and vertical flip. +// "src_size" is needed to parse MJPG. +// "dst_stride_argb" number of bytes in a row of the dst_argb plane. +// Normally this would be the same as dst_width, with recommended alignment +// to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. The caller should +// allocate the I420 buffer according to rotation. +// "dst_stride_u" number of bytes in a row of the dst_u plane. +// Normally this would be the same as (dst_width + 1) / 2, with +// recommended alignment to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. +// "crop_x" and "crop_y" are starting position for cropping. +// To center, crop_x = (src_width - dst_width) / 2 +// crop_y = (src_height - dst_height) / 2 +// "src_width" / "src_height" is size of src_frame in pixels. +// "src_height" can be negative indicating a vertically flipped image source. +// "crop_width" / "crop_height" is the size to crop the src to. +// Must be less than or equal to src_width/src_height +// Cropping parameters are pre-rotation. +// "rotation" can be 0, 90, 180 or 270. +// "format" is a fourcc. 
ie 'I420', 'YUY2' +// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. +LIBYUV_API +int ConvertToARGB(const uint8* src_frame, size_t src_size, + uint8* dst_argb, int dst_stride_argb, + int crop_x, int crop_y, + int src_width, int src_height, + int crop_width, int crop_height, + enum RotationMode rotation, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/convert_from.h b/libs/libaom/src/third_party/libyuv/include/libyuv/convert_from.h new file mode 100644 index 000000000..3591b4fd6 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/convert_from.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_FROM_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/rotate.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// See Also convert.h for conversions from formats to I420. + +// I420Copy in convert to I420ToI420. + +LIBYUV_API +int I420ToI422(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int I420ToI444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int I420ToI411(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. 
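+// All of those sources work because they share the same 8 bit Y plane
+// layout; only the Y pointer and stride are passed and chroma is ignored.
+// Sketch (names are illustrative):
+//   I400Copy(src_y, src_stride_y, dst_y, width, width, height);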
+LIBYUV_API +int I400Copy(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// TODO(fbarchard): I420ToM420 + +LIBYUV_API +int I420ToNV12(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +LIBYUV_API +int I420ToNV21(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height); + +LIBYUV_API +int I420ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + +LIBYUV_API +int I420ToRGB24(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToRAW(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes). +// Values in dither matrix from 0 to 7 recommended. +// The order of the dither matrix is first byte is upper left. + +LIBYUV_API +int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + const uint8* dither4x4, int width, int height); + +LIBYUV_API +int I420ToARGB1555(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToARGB4444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Convert I420 to specified format. 
+// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the +// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal. +LIBYUV_API +int ConvertFromI420(const uint8* y, int y_stride, + const uint8* u, int u_stride, + const uint8* v, int v_stride, + uint8* dst_sample, int dst_sample_stride, + int width, int height, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/convert_from_argb.h b/libs/libaom/src/third_party/libyuv/include/libyuv/convert_from_argb.h new file mode 100644 index 000000000..4a6226813 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/convert_from_argb.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB to ARGB. +#define ARGBToARGB ARGBCopy +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ARGB To BGRA. +LIBYUV_API +int ARGBToBGRA(const uint8* src_argb, int src_stride_argb, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height); + +// Convert ARGB To ABGR. +LIBYUV_API +int ARGBToABGR(const uint8* src_argb, int src_stride_argb, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height); + +// Convert ARGB To RGBA. +LIBYUV_API +int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height); + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb, int dst_stride_rgb, + int width, int height); + +// Convert ARGB To RGB565. +LIBYUV_API +int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). +// Values in dither matrix from 0 to 7 recommended. +// The order of the dither matrix is first byte is upper left. +// TODO(fbarchard): Consider pointer to 2d array for dither4x4. +// const uint8(*dither)[4][4]; +LIBYUV_API +int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + const uint8* dither4x4, int width, int height); + +// Convert ARGB To ARGB1555. +LIBYUV_API +int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height); + +// Convert ARGB To ARGB4444. 
+LIBYUV_API +int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height); + +// Convert ARGB To I444. +LIBYUV_API +int ARGBToI444(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I422. +LIBYUV_API +int ARGBToI422(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I420. (also in convert.h) +LIBYUV_API +int ARGBToI420(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB to J420. (JPeg full range I420). +LIBYUV_API +int ARGBToJ420(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB to J422. +LIBYUV_API +int ARGBToJ422(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I411. +LIBYUV_API +int ARGBToI411(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB to J400. (JPeg full range). +LIBYUV_API +int ARGBToJ400(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + int width, int height); + +// Convert ARGB to I400. +LIBYUV_API +int ARGBToI400(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB) +LIBYUV_API +int ARGBToG(const uint8* src_argb, int src_stride_argb, + uint8* dst_g, int dst_stride_g, + int width, int height); + +// Convert ARGB To NV12. +LIBYUV_API +int ARGBToNV12(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +// Convert ARGB To NV21. +LIBYUV_API +int ARGBToNV21(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height); + +// Convert ARGB To YUY2. +LIBYUV_API +int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height); + +// Convert ARGB To UYVY. +LIBYUV_API +int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/cpu_id.h b/libs/libaom/src/third_party/libyuv/include/libyuv/cpu_id.h new file mode 100644 index 000000000..870e94e8c --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/cpu_id.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2016, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT +#define INCLUDE_LIBYUV_CPU_ID_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// TODO(fbarchard): Consider overlapping bits for different architectures. +// Internal flag to indicate cpuid requires initialization. +#define kCpuInit 0x1 + +// These flags are only valid on ARM processors. +static const int kCpuHasARM = 0x2; +static const int kCpuHasNEON = 0x4; +// 0x8 reserved for future ARM flag. + +// These flags are only valid on x86 processors. +static const int kCpuHasX86 = 0x10; +static const int kCpuHasSSE2 = 0x20; +static const int kCpuHasSSSE3 = 0x40; +static const int kCpuHasSSE41 = 0x80; +static const int kCpuHasSSE42 = 0x100; +static const int kCpuHasAVX = 0x200; +static const int kCpuHasAVX2 = 0x400; +static const int kCpuHasERMS = 0x800; +static const int kCpuHasFMA3 = 0x1000; +// 0x2000, 0x4000, 0x8000 reserved for future X86 flags. + +// These flags are only valid on MIPS processors. +static const int kCpuHasMIPS = 0x10000; +static const int kCpuHasMIPS_DSP = 0x20000; +static const int kCpuHasMIPS_DSPR2 = 0x40000; + +// Internal function used to auto-init. +LIBYUV_API +int InitCpuFlags(void); + +// Internal function for parsing /proc/cpuinfo. +LIBYUV_API +int ArmCpuCaps(const char* cpuinfo_name); + +// Detect CPU has SSE2 etc. +// Test_flag parameter should be one of kCpuHas constants above. +// returns non-zero if instruction set is detected +static __inline int TestCpuFlag(int test_flag) { + LIBYUV_API extern int cpu_info_; + return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag; +} + +// For testing, allow CPU flags to be disabled. +// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. +// MaskCpuFlags(-1) to enable all cpu specific optimizations. +// MaskCpuFlags(0) to disable all cpu specific optimizations. +LIBYUV_API +void MaskCpuFlags(int enable_flags); + +// Low level cpuid for X86. Returns zeros on other CPUs. +// eax is the info type that you want. +// ecx is typically the cpu number, and should normally be zero. +LIBYUV_API +void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/libs/libaom/src/third_party/libyuv/include/libyuv/mjpeg_decoder.h new file mode 100644 index 000000000..fa1e51f9a --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/mjpeg_decoder.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT +#define INCLUDE_LIBYUV_MJPEG_DECODER_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +// NOTE: For a simplified public API use convert.h MJPGToI420(). + +struct jpeg_common_struct; +struct jpeg_decompress_struct; +struct jpeg_source_mgr; + +namespace libyuv { + +#ifdef __cplusplus +extern "C" { +#endif + +LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size); + +#ifdef __cplusplus +} // extern "C" +#endif + +static const uint32 kUnknownDataSize = 0xFFFFFFFF; + +enum JpegSubsamplingType { + kJpegYuv420, + kJpegYuv422, + kJpegYuv411, + kJpegYuv444, + kJpegYuv400, + kJpegUnknown +}; + +struct Buffer { + const uint8* data; + int len; +}; + +struct BufferVector { + Buffer* buffers; + int len; + int pos; +}; + +struct SetJmpErrorMgr; + +// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are +// simply independent JPEG images with a fixed huffman table (which is omitted). +// It is rarely used in video transmission, but is common as a camera capture +// format, especially in Logitech devices. This class implements a decoder for +// MJPEG frames. +// +// See http://tools.ietf.org/html/rfc2435 +class LIBYUV_API MJpegDecoder { + public: + typedef void (*CallbackFunction)(void* opaque, + const uint8* const* data, + const int* strides, + int rows); + + static const int kColorSpaceUnknown; + static const int kColorSpaceGrayscale; + static const int kColorSpaceRgb; + static const int kColorSpaceYCbCr; + static const int kColorSpaceCMYK; + static const int kColorSpaceYCCK; + + MJpegDecoder(); + ~MJpegDecoder(); + + // Loads a new frame, reads its headers, and determines the uncompressed + // image format. + // Returns LIBYUV_TRUE if image looks valid and format is supported. + // If return value is LIBYUV_TRUE, then the values for all the following + // getters are populated. + // src_len is the size of the compressed mjpeg frame in bytes. + LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len); + + // Returns width of the last loaded frame in pixels. + int GetWidth(); + + // Returns height of the last loaded frame in pixels. + int GetHeight(); + + // Returns format of the last loaded frame. The return value is one of the + // kColorSpace* constants. + int GetColorSpace(); + + // Number of color components in the color space. + int GetNumComponents(); + + // Sample factors of the n-th component. + int GetHorizSampFactor(int component); + + int GetVertSampFactor(int component); + + int GetHorizSubSampFactor(int component); + + int GetVertSubSampFactor(int component); + + // Public for testability. + int GetImageScanlinesPerImcuRow(); + + // Public for testability. + int GetComponentScanlinesPerImcuRow(int component); + + // Width of a component in bytes. + int GetComponentWidth(int component); + + // Height of a component. + int GetComponentHeight(int component); + + // Width of a component in bytes with padding for DCTSIZE. Public for testing. + int GetComponentStride(int component); + + // Size of a component in bytes. + int GetComponentSize(int component); + + // Call this after LoadFrame() if you decide you don't want to decode it + // after all. + LIBYUV_BOOL UnloadFrame(); + + // Decodes the entire image into a one-buffer-per-color-component format. + // dst_width must match exactly. 
dst_height must be <= to image height; if + // less, the image is cropped. "planes" must have size equal to at least + // GetNumComponents() and they must point to non-overlapping buffers of size + // at least GetComponentSize(i). The pointers in planes are incremented + // to point to after the end of the written data. + // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. + LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height); + + // Decodes the entire image and passes the data via repeated calls to a + // callback function. Each call will get the data for a whole number of + // image scanlines. + // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. + LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque, + int dst_width, int dst_height); + + // The helper function which recognizes the jpeg sub-sampling type. + static JpegSubsamplingType JpegSubsamplingTypeHelper( + int* subsample_x, int* subsample_y, int number_of_components); + + private: + void AllocOutputBuffers(int num_outbufs); + void DestroyOutputBuffers(); + + LIBYUV_BOOL StartDecode(); + LIBYUV_BOOL FinishDecode(); + + void SetScanlinePointers(uint8** data); + LIBYUV_BOOL DecodeImcuRow(); + + int GetComponentScanlinePadding(int component); + + // A buffer holding the input data for a frame. + Buffer buf_; + BufferVector buf_vec_; + + jpeg_decompress_struct* decompress_struct_; + jpeg_source_mgr* source_mgr_; + SetJmpErrorMgr* error_mgr_; + + // LIBYUV_TRUE iff at least one component has scanline padding. (i.e., + // GetComponentScanlinePadding() != 0.) + LIBYUV_BOOL has_scanline_padding_; + + // Temporaries used to point to scanline outputs. + int num_outbufs_; // Outermost size of all arrays below. + uint8*** scanlines_; + int* scanlines_sizes_; + // Temporary buffer used for decoding when we can't decode directly to the + // output buffers. Large enough for just one iMCU row. + uint8** databuf_; + int* databuf_strides_; +}; + +} // namespace libyuv + +#endif // __cplusplus +#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/planar_functions.h b/libs/libaom/src/third_party/libyuv/include/libyuv/planar_functions.h new file mode 100644 index 000000000..7fe4d8eed --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/planar_functions.h @@ -0,0 +1,454 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT +#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ + +#include "libyuv/basic_types.h" + +// TODO(fbarchard): Remove the following headers includes. +#include "libyuv/convert.h" +#include "libyuv/convert_argb.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy a plane of data. 
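+// Because the plane functions take a pointer plus a byte stride, a
+// rectangular sub-region can be addressed without copying the whole image.
+// Sketch of copying a w x h rect at (x, y) (names are illustrative):
+//   CopyPlane(src + y * src_stride + x, src_stride,
+//             dst, dst_stride, w, h);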
+LIBYUV_API +void CopyPlane(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + +LIBYUV_API +void CopyPlane_16(const uint16* src_y, int src_stride_y, + uint16* dst_y, int dst_stride_y, + int width, int height); + +// Set a plane of data to a 32 bit value. +LIBYUV_API +void SetPlane(uint8* dst_y, int dst_stride_y, + int width, int height, + uint32 value); + +// Copy I400. Supports inverting. +LIBYUV_API +int I400ToI400(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + +#define J400ToJ400 I400ToI400 + +// Copy I422 to I422. +#define I422ToI422 I422Copy +LIBYUV_API +int I422Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Copy I444 to I444. +#define I444ToI444 I444Copy +LIBYUV_API +int I444Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert UYVY to I422. +LIBYUV_API +int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +LIBYUV_API +int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +// Convert I420 to I400. (calls CopyPlane ignoring u/v). +LIBYUV_API +int I420ToI400(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Alias +#define J420ToJ400 I420ToI400 +#define I420ToI420Mirror I420Mirror + +// I420 mirror. +LIBYUV_API +int I420Mirror(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Alias +#define I400ToI400Mirror I400Mirror + +// I400 mirror. A single plane is mirrored horizontally. +// Pass negative height to achieve 180 degree rotation. +LIBYUV_API +int I400Mirror(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Alias +#define ARGBToARGBMirror ARGBMirror + +// ARGB mirror. +LIBYUV_API +int ARGBMirror(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert NV12 to RGB565. +LIBYUV_API +int NV12ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// Convert NV21 to RGB565. 
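+// NV21 differs from NV12 only in the byte order of the interleaved chroma
+// plane (VU instead of UV); both use one chroma stride covering a U and a V
+// byte per 2x2 block. Sketch (names are illustrative):
+//   NV21ToRGB565(src_y, width,
+//                src_vu, ((width + 1) / 2) * 2,
+//                dst_rgb565, width * 2, width, height);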
+LIBYUV_API +int NV21ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// I422ToARGB is in convert_argb.h +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height); + +// Convert I422 to ABGR. +LIBYUV_API +int I422ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height); + +// Convert I422 to RGBA. +LIBYUV_API +int I422ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + +// Draw a rectangle into I420. +LIBYUV_API +int I420Rect(uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int x, int y, int width, int height, + int value_y, int value_u, int value_v); + +// Draw a rectangle into ARGB. +LIBYUV_API +int ARGBRect(uint8* dst_argb, int dst_stride_argb, + int x, int y, int width, int height, uint32 value); + +// Convert ARGB to gray scale ARGB. +LIBYUV_API +int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Make a rectangle of ARGB gray scale. +LIBYUV_API +int ARGBGray(uint8* dst_argb, int dst_stride_argb, + int x, int y, int width, int height); + +// Make a rectangle of ARGB Sepia tone. +LIBYUV_API +int ARGBSepia(uint8* dst_argb, int dst_stride_argb, + int x, int y, int width, int height); + +// Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The next 4 coefficients apply to B, G, R, A and produce R of the output. +// The last 4 coefficients apply to B, G, R, A and produce A of the output. +LIBYUV_API +int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const int8* matrix_argb, + int width, int height); + +// Deprecated. Use ARGBColorMatrix instead. +// Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The last 4 coefficients apply to B, G, R, A and produce R of the output. +LIBYUV_API +int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, + const int8* matrix_rgb, + int x, int y, int width, int height); + +// Apply a color table each ARGB pixel. +// Table contains 256 ARGB values. +LIBYUV_API +int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int x, int y, int width, int height); + +// Apply a color table each ARGB pixel but preserve destination alpha. +// Table contains 256 ARGB values. +LIBYUV_API +int RGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int x, int y, int width, int height); + +// Apply a luma/color table each ARGB pixel but preserve destination alpha. 
+// Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from +// RGB (YJ style) and C is an 8 bit color component (R, G or B). +LIBYUV_API +int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const uint8* luma_rgb_table, + int width, int height); + +// Apply a 3 term polynomial to ARGB values. +// poly points to a 4x4 matrix. The first row is constants. The 2nd row is +// coefficients for b, g, r and a. The 3rd row is coefficients for b squared, +// g squared, r squared and a squared. The 4th row is coefficients for b to +// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and +// result clamped to 0 to 255. +// A polynomial approximation can be derived using software such as 'R'. + +LIBYUV_API +int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const float* poly, + int width, int height); + +// Quantize a rectangle of ARGB. Alpha unaffected. +// scale is a 16 bit fractional fixed point scaler between 0 and 65535. +// interval_size should be a value between 1 and 255. +// interval_offset should be a value between 0 and 255. +LIBYUV_API +int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, + int scale, int interval_size, int interval_offset, + int x, int y, int width, int height); + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Copy the alpha channel of ARGB to the alpha channel of ARGB. +LIBYUV_API +int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Copy a Y plane into the alpha channel of ARGB. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); + +// Get function to Alpha Blend ARGB pixels and store to destination. +LIBYUV_API +ARGBBlendRow GetARGBBlend(); + +// Alpha Blend ARGB images and store to destination. +// Alpha of destination is set to 255. +LIBYUV_API +int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. +LIBYUV_API +int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Add ARGB image with ARGB image. Saturates to 255. +LIBYUV_API +int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0. +LIBYUV_API +int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I422 to YUY2. +LIBYUV_API +int I422ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Convert I422 to UYVY.
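+// YUY2 and UYVY are the same packed 4:2:2 layout with different byte order:
+// YUY2 stores Y0 U Y1 V per pixel pair, UYVY stores U Y0 V Y1; a packed row
+// is width * 2 bytes for even widths. Sketch (names are illustrative):
+//   I422ToUYVY(src_y, width,
+//              src_u, (width + 1) / 2,
+//              src_v, (width + 1) / 2,
+//              dst, width * 2, width, height);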
+LIBYUV_API +int I422ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Convert unattenuated ARGB to preattenuated ARGB. +LIBYUV_API +int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert preattenuated ARGB to unattenuated ARGB. +LIBYUV_API +int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert MJPG to ARGB. +LIBYUV_API +int MJPGToARGB(const uint8* sample, size_t sample_size, + uint8* argb, int argb_stride, + int w, int h, int dw, int dh); + +// Internal function - do not call directly. +// Computes table of cumulative sum for image where the value is the sum +// of all values above and to the left of the entry. Used by ARGBBlur. +LIBYUV_API +int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height); + +// Blur ARGB image. +// dst_cumsum table of width * (height + 1) * 16 bytes aligned to +// 16 byte boundary. +// dst_stride32_cumsum is number of ints in a row (width * 4). +// radius is number of pixels around the center. e.g. 1 = 3x3. 2 = 5x5. +// Blur is optimized for radius of 5 (11x11) or less. +LIBYUV_API +int ARGBBlur(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height, int radius); + +// Multiply ARGB image by ARGB value. +LIBYUV_API +int ARGBShade(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, uint32 value); + +// Interpolate between two ARGB images using specified amount of interpolation +// (0 to 255) and store to destination. +// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0 +// and 255 means 1% src_argb0 and 99% src_argb1. +// Internally uses ARGBScale bilinear filtering. +// Caveat: This function will write up to 16 bytes beyond the end of dst_argb. +LIBYUV_API +int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height, int interpolation); + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__i386__) && !defined(__SSE2__)) +#define LIBYUV_DISABLE_X86 +#endif +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_ARGBAFFINEROW_SSE2 +#endif + +// Row function for copying pixels from a source with a slope to a row +// of destination. Useful for scaling, rotation, mirror, texture mapping. +LIBYUV_API +void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); + +// Shuffle ARGB channel order. e.g. BGRA to ARGB. +// shuffler is 16 bytes and must be aligned. +LIBYUV_API +int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + const uint8* shuffler, int width, int height); + +// Sobel ARGB effect with planar output.
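+// The planar variant writes one 8 bit edge-strength byte per pixel rather
+// than replicating it into an ARGB pixel, so the destination is sized like
+// a Y plane. Sketch (names are illustrative):
+//   ARGBSobelToPlane(src_argb, width * 4,
+//                    dst_edges, width, width, height);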
+LIBYUV_API +int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Sobel ARGB effect. +LIBYUV_API +int ARGBSobel(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB. +LIBYUV_API +int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/rotate.h b/libs/libaom/src/third_party/libyuv/include/libyuv/rotate.h new file mode 100644 index 000000000..8a9673f28 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/rotate.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT +#define INCLUDE_LIBYUV_ROTATE_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Supported rotation. +typedef enum RotationMode { + kRotate0 = 0, // No rotation. + kRotate90 = 90, // Rotate 90 degrees clockwise. + kRotate180 = 180, // Rotate 180 degrees. + kRotate270 = 270, // Rotate 270 degrees clockwise. + + // Deprecated. + kRotateNone = 0, + kRotateClockwise = 90, + kRotateCounterClockwise = 270, +} RotationModeEnum; + +// Rotate I420 frame. +LIBYUV_API +int I420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, enum RotationMode mode); + +// Rotate NV12 input and store in I420. +LIBYUV_API +int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, enum RotationMode mode); + +// Rotate a plane by 0, 90, 180, or 270. +LIBYUV_API +int RotatePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int src_width, int src_height, enum RotationMode mode); + +// Rotate planes by 90, 180, 270. Deprecated. +LIBYUV_API +void RotatePlane90(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotatePlane180(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotatePlane270(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotateUV90(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +// Rotations for when U and V are interleaved. +// These functions take one input pointer and +// split the data into two buffers while +// rotating them. 
Deprecated. +LIBYUV_API +void RotateUV180(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +LIBYUV_API +void RotateUV270(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +// The 90 and 270 functions are based on transposes. +// Doing a transpose with reversing the read/write +// order will result in a rotation by +- 90 degrees. +// Deprecated. +LIBYUV_API +void TransposePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void TransposeUV(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/rotate_argb.h b/libs/libaom/src/third_party/libyuv/include/libyuv/rotate_argb.h new file mode 100644 index 000000000..2bdc8ec6b --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/rotate_argb.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_ROTATE_ARGB_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/rotate.h" // For RotationMode. + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Rotate ARGB frame +LIBYUV_API +int ARGBRotate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int src_width, int src_height, enum RotationMode mode); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/rotate_row.h b/libs/libaom/src/third_party/libyuv/include/libyuv/rotate_row.h new file mode 100644 index 000000000..d0bfbdd2b --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/rotate_row.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ // NOLINT +#define INCLUDE_LIBYUV_ROTATE_ROW_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__i386__) && !defined(__SSE2__)) +#define LIBYUV_DISABLE_X86 +#endif + +// Visual C 2012 required for AVX2. 
+#if defined(_M_IX86) && !defined(__clang__) && \ + defined(_MSC_VER) && _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + +// TODO(fbarchard): switch to standard form of inline; fails on clangcl. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#if defined(__APPLE__) && defined(__i386__) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".private_extern _" #name " \n" \ + ".align 4,0x90 \n" \ +"_" #name ": \n" +#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".align 4,0x90 \n" \ +"_" #name ": \n" +#else +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".align 4,0x90 \n" \ +#name ": \n" +#endif +#endif + +// The following are available for Visual C: +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ + defined(_MSC_VER) && !defined(__clang__) +#define HAS_TRANSPOSEWX8_SSSE3 +#define HAS_TRANSPOSEUVWX8_SSE2 +#endif + +// The following are available for GCC but not NaCL: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) +#define HAS_TRANSPOSEWX8_SSSE3 +#endif + +// The following are available for 32 bit GCC: +#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__) +#define HAS_TRANSPOSEUVWX8_SSE2 +#endif + +// The following are available for 64 bit GCC but not NaCL: +#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ + defined(__x86_64__) +#define HAS_TRANSPOSEWX8_FAST_SSSE3 +#define HAS_TRANSPOSEUVWX8_SSE2 +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) +#define HAS_TRANSPOSEWX8_NEON +#define HAS_TRANSPOSEUVWX8_NEON +#endif + +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ + defined(__mips__) && \ + defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#define HAS_TRANSPOSEWX8_MIPS_DSPR2 +#define HAS_TRANSPOSEUVWx8_MIPS_DSPR2 +#endif // defined(__mips__) + +void TransposeWxH_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width, int height); + +void TransposeWx8_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); + +void TransposeWx8_Any_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_Any_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); + +void TransposeUVWxH_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +void TransposeUVWx8_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); +void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, 
int width); +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/row.h b/libs/libaom/src/third_party/libyuv/include/libyuv/row.h new file mode 100644 index 000000000..5c3187ef7 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/row.h @@ -0,0 +1,1857 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_ROW_H_ // NOLINT +#define INCLUDE_LIBYUV_ROW_H_ + +#include <stdlib.h> // For malloc. + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) + +#ifdef __cplusplus +#define align_buffer_64(var, size) \ + uint8* var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63)); \ + uint8* var = reinterpret_cast<uint8*> \ + ((reinterpret_cast<intptr_t>(var##_mem) + 63) & ~63) +#else +#define align_buffer_64(var, size) \ + uint8* var##_mem = (uint8*)(malloc((size) + 63)); /* NOLINT */ \ + uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ +#endif + +#define free_aligned_buffer_64(var) \ + free(var##_mem); \ + var = 0 + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__i386__) && !defined(__SSE2__)) +#define LIBYUV_DISABLE_X86 +#endif +// True if compiling for SSSE3 as a requirement. +#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3)) +#define LIBYUV_SSSE3_ONLY +#endif + +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif +// clang >= 3.5.0 required for Arm64.
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON) +#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5)) +#define LIBYUV_DISABLE_NEON +#endif // clang >= 3.5 +#endif // __clang__ + +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +// Conversions: +#define HAS_ABGRTOUVROW_SSSE3 +#define HAS_ABGRTOYROW_SSSE3 +#define HAS_ARGB1555TOARGBROW_SSE2 +#define HAS_ARGB4444TOARGBROW_SSE2 +#define HAS_ARGBSETROW_X86 +#define HAS_ARGBSHUFFLEROW_SSE2 +#define HAS_ARGBSHUFFLEROW_SSSE3 +#define HAS_ARGBTOARGB1555ROW_SSE2 +#define HAS_ARGBTOARGB4444ROW_SSE2 +#define HAS_ARGBTORAWROW_SSSE3 +#define HAS_ARGBTORGB24ROW_SSSE3 +#define HAS_ARGBTORGB565ROW_SSE2 +#define HAS_ARGBTOUV422ROW_SSSE3 +#define HAS_ARGBTOUV444ROW_SSSE3 +#define HAS_ARGBTOUVJROW_SSSE3 +#define HAS_ARGBTOUVROW_SSSE3 +#define HAS_ARGBTOYJROW_SSSE3 +#define HAS_ARGBTOYROW_SSSE3 +#define HAS_BGRATOUVROW_SSSE3 +#define HAS_BGRATOYROW_SSSE3 +#define HAS_COPYROW_ERMS +#define HAS_COPYROW_SSE2 +#define HAS_I400TOARGBROW_SSE2 +#define HAS_I411TOARGBROW_SSSE3 +#define HAS_I422TOABGRROW_SSSE3 +#define HAS_I422TOARGB1555ROW_SSSE3 +#define HAS_I422TOARGB4444ROW_SSSE3 +#define HAS_I422TOARGBROW_SSSE3 +#define HAS_I422TOBGRAROW_SSSE3 +#define HAS_I422TORAWROW_SSSE3 +#define HAS_I422TORGB24ROW_SSSE3 +#define HAS_I422TORGB565ROW_SSSE3 +#define HAS_I422TORGBAROW_SSSE3 +#define HAS_I422TOUYVYROW_SSE2 +#define HAS_I422TOYUY2ROW_SSE2 +#define HAS_I444TOARGBROW_SSSE3 +#define HAS_J400TOARGBROW_SSE2 +#define HAS_J422TOARGBROW_SSSE3 +#define HAS_MERGEUVROW_SSE2 +#define HAS_MIRRORROW_SSE2 +#define HAS_MIRRORROW_SSSE3 +#define HAS_MIRRORROW_UV_SSSE3 +#define HAS_MIRRORUVROW_SSSE3 +#define HAS_NV12TOARGBROW_SSSE3 +#define HAS_NV12TORGB565ROW_SSSE3 +#define HAS_NV21TOARGBROW_SSSE3 +#define HAS_NV21TORGB565ROW_SSSE3 +#define HAS_RAWTOARGBROW_SSSE3 +#define HAS_RAWTOYROW_SSSE3 +#define HAS_RGB24TOARGBROW_SSSE3 +#define HAS_RGB24TOYROW_SSSE3 +#define HAS_RGB565TOARGBROW_SSE2 +#define HAS_RGBATOUVROW_SSSE3 +#define HAS_RGBATOYROW_SSSE3 +#define HAS_SETROW_ERMS +#define HAS_SETROW_X86 +#define HAS_SPLITUVROW_SSE2 +#define HAS_UYVYTOARGBROW_SSSE3 +#define HAS_UYVYTOUV422ROW_SSE2 +#define HAS_UYVYTOUVROW_SSE2 +#define HAS_UYVYTOYROW_SSE2 +#define HAS_YUY2TOARGBROW_SSSE3 +#define HAS_YUY2TOUV422ROW_SSE2 +#define HAS_YUY2TOUVROW_SSE2 +#define HAS_YUY2TOYROW_SSE2 + +// Effects: +#define HAS_ARGBADDROW_SSE2 +#define HAS_ARGBAFFINEROW_SSE2 +#define HAS_ARGBATTENUATEROW_SSSE3 +#define HAS_ARGBBLENDROW_SSSE3 +#define HAS_ARGBCOLORMATRIXROW_SSSE3 +#define HAS_ARGBCOLORTABLEROW_X86 +#define HAS_ARGBCOPYALPHAROW_SSE2 +#define HAS_ARGBCOPYYTOALPHAROW_SSE2 +#define HAS_ARGBGRAYROW_SSSE3 +#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 +#define HAS_ARGBMIRRORROW_SSE2 +#define HAS_ARGBMULTIPLYROW_SSE2 +#define HAS_ARGBPOLYNOMIALROW_SSE2 +#define HAS_ARGBQUANTIZEROW_SSE2 +#define HAS_ARGBSEPIAROW_SSSE3 +#define HAS_ARGBSHADEROW_SSE2 +#define HAS_ARGBSUBTRACTROW_SSE2 +#define HAS_ARGBUNATTENUATEROW_SSE2 +#define HAS_COMPUTECUMULATIVESUMROW_SSE2 +#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +#define HAS_INTERPOLATEROW_SSE2 +#define HAS_INTERPOLATEROW_SSSE3 +#define HAS_RGBCOLORTABLEROW_X86 +#define HAS_SOBELROW_SSE2 +#define HAS_SOBELTOPLANEROW_SSE2 +#define HAS_SOBELXROW_SSE2 +#define HAS_SOBELXYROW_SSE2 +#define HAS_SOBELYROW_SSE2 +#endif + +// The following are available on x64 Visual C and clangcl. 
+#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \ + (!defined(__clang__) || defined(__SSSE3__)) +#define HAS_I422TOARGBROW_SSSE3 +#endif + +// GCC >= 4.7.0 required for AVX2. +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) +#define GCC_HAS_AVX2 1 +#endif // GNUC >= 4.7 +#endif // __GNUC__ + +// clang >= 3.4.0 required for AVX2. +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) +#define CLANG_HAS_AVX2 1 +#endif // clang >= 3.4 +#endif // __clang__ + +// Visual C 2012 required for AVX2. +#if defined(_M_IX86) && !defined(__clang__) && \ + defined(_MSC_VER) && _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + +// The following are available require VS2012. Port to GCC. +#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) +#define HAS_ARGB1555TOARGBROW_AVX2 +#define HAS_ARGB4444TOARGBROW_AVX2 +#define HAS_ARGBTOARGB1555ROW_AVX2 +#define HAS_ARGBTOARGB4444ROW_AVX2 +#define HAS_ARGBTORGB565DITHERROW_AVX2 +#define HAS_ARGBTORGB565DITHERROW_SSE2 +#define HAS_ARGBTORGB565ROW_AVX2 +#define HAS_I411TOARGBROW_AVX2 +#define HAS_I422TOARGB1555ROW_AVX2 +#define HAS_I422TOARGB4444ROW_AVX2 +#define HAS_I422TORGB565ROW_AVX2 +#define HAS_I444TOARGBROW_AVX2 +#define HAS_J400TOARGBROW_AVX2 +#define HAS_NV12TOARGBROW_AVX2 +#define HAS_NV12TORGB565ROW_AVX2 +#define HAS_NV21TOARGBROW_AVX2 +#define HAS_NV21TORGB565ROW_AVX2 +#define HAS_RGB565TOARGBROW_AVX2 +#endif + +// The following are available on all x86 platforms, but +// require VS2012, clang 3.4 or gcc 4.7. +// The code supports NaCL but requires a new compiler and validator. +#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \ + defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#define HAS_ARGBCOPYALPHAROW_AVX2 +#define HAS_ARGBCOPYYTOALPHAROW_AVX2 +#define HAS_ARGBMIRRORROW_AVX2 +#define HAS_ARGBPOLYNOMIALROW_AVX2 +#define HAS_ARGBSHUFFLEROW_AVX2 +#define HAS_ARGBTOUVROW_AVX2 +#define HAS_ARGBTOYJROW_AVX2 +#define HAS_ARGBTOYROW_AVX2 +#define HAS_COPYROW_AVX +#define HAS_I400TOARGBROW_AVX2 +#define HAS_I422TOABGRROW_AVX2 +#define HAS_I422TOARGBROW_AVX2 +#define HAS_I422TOBGRAROW_AVX2 +#define HAS_I422TORAWROW_AVX2 +#define HAS_I422TORGB24ROW_AVX2 +#define HAS_I422TORGBAROW_AVX2 +#define HAS_INTERPOLATEROW_AVX2 +#define HAS_J422TOARGBROW_AVX2 +#define HAS_MERGEUVROW_AVX2 +#define HAS_MIRRORROW_AVX2 +#define HAS_SPLITUVROW_AVX2 +#define HAS_UYVYTOARGBROW_AVX2 +#define HAS_UYVYTOUV422ROW_AVX2 +#define HAS_UYVYTOUVROW_AVX2 +#define HAS_UYVYTOYROW_AVX2 +#define HAS_YUY2TOARGBROW_AVX2 +#define HAS_YUY2TOUV422ROW_AVX2 +#define HAS_YUY2TOUVROW_AVX2 +#define HAS_YUY2TOYROW_AVX2 + +// Effects: +#define HAS_ARGBADDROW_AVX2 +#define HAS_ARGBATTENUATEROW_AVX2 +#define HAS_ARGBMULTIPLYROW_AVX2 +#define HAS_ARGBSUBTRACTROW_AVX2 +#define HAS_ARGBUNATTENUATEROW_AVX2 +#endif + +// The following are disabled when SSSE3 is available: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ + !defined(LIBYUV_SSSE3_ONLY) +#define HAS_ARGBATTENUATEROW_SSE2 +#define HAS_ARGBBLENDROW_SSE2 +#define HAS_MIRRORROW_SSE2 +#endif + +// The following are available on Neon platforms: +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_ABGRTOUVROW_NEON +#define HAS_ABGRTOYROW_NEON +#define HAS_ARGB1555TOARGBROW_NEON 
+#define HAS_ARGB1555TOUVROW_NEON +#define HAS_ARGB1555TOYROW_NEON +#define HAS_ARGB4444TOARGBROW_NEON +#define HAS_ARGB4444TOUVROW_NEON +#define HAS_ARGB4444TOYROW_NEON +#define HAS_ARGBTOARGB1555ROW_NEON +#define HAS_ARGBTOARGB4444ROW_NEON +#define HAS_ARGBTORAWROW_NEON +#define HAS_ARGBTORGB24ROW_NEON +#define HAS_ARGBTORGB565ROW_NEON +#define HAS_ARGBTOUV411ROW_NEON +#define HAS_ARGBTOUV422ROW_NEON +#define HAS_ARGBTOUV444ROW_NEON +#define HAS_ARGBTOUVJROW_NEON +#define HAS_ARGBTOUVROW_NEON +#define HAS_ARGBTOYJROW_NEON +#define HAS_ARGBTOYROW_NEON +#define HAS_BGRATOUVROW_NEON +#define HAS_BGRATOYROW_NEON +#define HAS_COPYROW_NEON +#define HAS_J400TOARGBROW_NEON +#define HAS_I411TOARGBROW_NEON +#define HAS_I422TOABGRROW_NEON +#define HAS_I422TOARGB1555ROW_NEON +#define HAS_I422TOARGB4444ROW_NEON +#define HAS_I422TOARGBROW_NEON +#define HAS_I422TOBGRAROW_NEON +#define HAS_I422TORAWROW_NEON +#define HAS_I422TORGB24ROW_NEON +#define HAS_I422TORGB565ROW_NEON +#define HAS_I422TORGBAROW_NEON +#define HAS_I422TOUYVYROW_NEON +#define HAS_I422TOYUY2ROW_NEON +#define HAS_I444TOARGBROW_NEON +#define HAS_MERGEUVROW_NEON +#define HAS_MIRRORROW_NEON +#define HAS_MIRRORUVROW_NEON +#define HAS_NV12TOARGBROW_NEON +#define HAS_NV12TORGB565ROW_NEON +#define HAS_NV21TOARGBROW_NEON +#define HAS_NV21TORGB565ROW_NEON +#define HAS_RAWTOARGBROW_NEON +#define HAS_RAWTOUVROW_NEON +#define HAS_RAWTOYROW_NEON +#define HAS_RGB24TOARGBROW_NEON +#define HAS_RGB24TOUVROW_NEON +#define HAS_RGB24TOYROW_NEON +#define HAS_RGB565TOARGBROW_NEON +#define HAS_RGB565TOUVROW_NEON +#define HAS_RGB565TOYROW_NEON +#define HAS_RGBATOUVROW_NEON +#define HAS_RGBATOYROW_NEON +#define HAS_SETROW_NEON +#define HAS_ARGBSETROW_NEON +#define HAS_SPLITUVROW_NEON +#define HAS_UYVYTOARGBROW_NEON +#define HAS_UYVYTOUV422ROW_NEON +#define HAS_UYVYTOUVROW_NEON +#define HAS_UYVYTOYROW_NEON +#define HAS_I400TOARGBROW_NEON +#define HAS_YUY2TOARGBROW_NEON +#define HAS_YUY2TOUV422ROW_NEON +#define HAS_YUY2TOUVROW_NEON +#define HAS_YUY2TOYROW_NEON +#define HAS_ARGBTORGB565DITHERROW_NEON + +// Effects: +#define HAS_ARGBADDROW_NEON +#define HAS_ARGBATTENUATEROW_NEON +#define HAS_ARGBBLENDROW_NEON +#define HAS_ARGBGRAYROW_NEON +#define HAS_ARGBMIRRORROW_NEON +#define HAS_ARGBMULTIPLYROW_NEON +#define HAS_ARGBQUANTIZEROW_NEON +#define HAS_ARGBSEPIAROW_NEON +#define HAS_ARGBSHADEROW_NEON +#define HAS_ARGBSUBTRACTROW_NEON +#define HAS_INTERPOLATEROW_NEON +#define HAS_SOBELROW_NEON +#define HAS_SOBELTOPLANEROW_NEON +#define HAS_SOBELXROW_NEON +#define HAS_SOBELXYROW_NEON +#define HAS_SOBELYROW_NEON +#define HAS_ARGBCOLORMATRIXROW_NEON +#define HAS_ARGBSHUFFLEROW_NEON +#endif + +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) +#define HAS_COPYROW_MIPS +#if defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#define HAS_I422TOABGRROW_MIPS_DSPR2 +#define HAS_I422TOARGBROW_MIPS_DSPR2 +#define HAS_I422TOBGRAROW_MIPS_DSPR2 +#define HAS_INTERPOLATEROW_MIPS_DSPR2 +#define HAS_MIRRORROW_MIPS_DSPR2 +#define HAS_MIRRORUVROW_MIPS_DSPR2 +#define HAS_SPLITUVROW_MIPS_DSPR2 +#endif +#endif + +#if defined(_MSC_VER) && !defined(__CLR_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +#define SIMD_ALIGNED32(var) __declspec(align(64)) var +typedef __declspec(align(16)) int16 vec16[8]; +typedef __declspec(align(16)) int32 vec32[4]; +typedef __declspec(align(16)) int8 vec8[16]; +typedef __declspec(align(16)) uint16 uvec16[8]; +typedef __declspec(align(16)) 
uint32 uvec32[4]; +typedef __declspec(align(16)) uint8 uvec8[16]; +typedef __declspec(align(32)) int16 lvec16[16]; +typedef __declspec(align(32)) int32 lvec32[8]; +typedef __declspec(align(32)) int8 lvec8[32]; +typedef __declspec(align(32)) uint16 ulvec16[16]; +typedef __declspec(align(32)) uint32 ulvec32[8]; +typedef __declspec(align(32)) uint8 ulvec8[32]; +#elif defined(__GNUC__) +// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +#define SIMD_ALIGNED32(var) var __attribute__((aligned(64))) +typedef int16 __attribute__((vector_size(16))) vec16; +typedef int32 __attribute__((vector_size(16))) vec32; +typedef int8 __attribute__((vector_size(16))) vec8; +typedef uint16 __attribute__((vector_size(16))) uvec16; +typedef uint32 __attribute__((vector_size(16))) uvec32; +typedef uint8 __attribute__((vector_size(16))) uvec8; +typedef int16 __attribute__((vector_size(32))) lvec16; +typedef int32 __attribute__((vector_size(32))) lvec32; +typedef int8 __attribute__((vector_size(32))) lvec8; +typedef uint16 __attribute__((vector_size(32))) ulvec16; +typedef uint32 __attribute__((vector_size(32))) ulvec32; +typedef uint8 __attribute__((vector_size(32))) ulvec8; +#else +#define SIMD_ALIGNED(var) var +#define SIMD_ALIGNED32(var) var +typedef int16 vec16[8]; +typedef int32 vec32[4]; +typedef int8 vec8[16]; +typedef uint16 uvec16[8]; +typedef uint32 uvec32[4]; +typedef uint8 uvec8[16]; +typedef int16 lvec16[16]; +typedef int32 lvec32[8]; +typedef int8 lvec8[32]; +typedef uint16 ulvec16[16]; +typedef uint32 ulvec32[8]; +typedef uint8 ulvec8[32]; +#endif + +#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) +#define OMITFP +#else +#define OMITFP __attribute__((optimize("omit-frame-pointer"))) +#endif + +// NaCL macros for GCC x86 and x64. +#if defined(__native_client__) +#define LABELALIGN ".p2align 5\n" +#else +#define LABELALIGN +#endif +#if defined(__native_client__) && defined(__x86_64__) +// r14 is used for MEMOP macros. 
+#define NACL_R14 "r14", +#define BUNDLELOCK ".bundle_lock\n" +#define BUNDLEUNLOCK ".bundle_unlock\n" +#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" +#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" +#define MEMLEA(offset, base) #offset "(%q" #base ")" +#define MEMLEA3(offset, index, scale) \ + #offset "(,%q" #index "," #scale ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%q" #base ",%q" #index "," #scale ")" +#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15" +#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15" +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%%" #reg "\n" \ + BUNDLEUNLOCK +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " %%" #reg ",(%%r15,%%r14)\n" \ + BUNDLEUNLOCK +#define MEMOPARG(opcode, offset, base, index, scale, arg) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%" #arg "\n" \ + BUNDLEUNLOCK +#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \ + BUNDLEUNLOCK +#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \ + BUNDLEUNLOCK +#else // defined(__native_client__) && defined(__x86_64__) +#define NACL_R14 +#define BUNDLEALIGN +#define MEMACCESS(base) "(%" #base ")" +#define MEMACCESS2(offset, base) #offset "(%" #base ")" +#define MEMLEA(offset, base) #offset "(%" #base ")" +#define MEMLEA3(offset, index, scale) \ + #offset "(,%" #index "," #scale ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%" #base ",%" #index "," #scale ")" +#define MEMMOVESTRING(s, d) +#define MEMSTORESTRING(reg, d) +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" +#define MEMOPARG(opcode, offset, base, index, scale, arg) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n" +#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \ + #reg2 "\n" +#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \ + #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" +#endif // defined(__native_client__) && defined(__x86_64__) + +#if defined(__arm__) || defined(__aarch64__) +#undef MEMACCESS +#if defined(__native_client__) +#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n" +#else +#define MEMACCESS(base) +#endif +#endif + +void I444ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* 
src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToRGB24Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width); +void I422ToRAWRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width); +void I422ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width); +void I422ToARGB1555Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width); +void I422ToARGB4444Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width); +void NV12ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_NEON(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width); +void NV21ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_vu, + uint8* dst_rgb565, + int width); +void YUY2ToARGBRow_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width); + +void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix); +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); +void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); +void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix); +void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix); +void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix); +void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix); +void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix); +void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix); +void 
ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix); +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix); +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix); +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix); +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix); +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix); +void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int pix); +void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int pix); +void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int pix); +void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int pix); +void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int pix); +void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int pix); +void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix); +void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix); +void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix); + +void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width); +void 
ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV422Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); +void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); +void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix); +void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix); +void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix); +void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix); +void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix); +void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix); +void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555, + int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix); +void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444, + int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix); +void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width); +void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int width); +void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int width); +void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int width); +void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int width); +void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int width); + +void ARGBToUV444Row_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); + +void ARGBToUV422Row_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); + +void ARGBToUV444Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); 
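[Editorial reading aid, not part of the patch: the declaration lists above and below follow one libyuv convention. Each row operation has a portable _C reference, exact-width SIMD kernels (SSE2/SSSE3/AVX2/NEON/MIPS_DSPR2, gated by the HAS_* defines earlier in this header), and _Any_ wrappers that accept arbitrary widths by running the kernel on the bulk of the row and handling the ragged tail separately. A minimal C sketch of how a caller typically selects among them follows; MyRow_*, HAS_MYROW_SSSE3 and PickMyRow are hypothetical stand-ins rather than libyuv symbols, the step of 16 is illustrative, and libyuv proper additionally gates the choice on runtime CPU detection (TestCpuFlag).

#include <stdint.h>

#ifndef IS_ALIGNED
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
#endif

typedef void (*RowFn)(const uint8_t* src, uint8_t* dst, int width);

/* Portable reference row: always correct, never the fastest. */
static void MyRow_C(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) dst[x] = src[x];
}

#if defined(HAS_MYROW_SSSE3)
/* Exact-width kernel: assumes width is a multiple of its step (16 here). */
void MyRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
/* Any-width wrapper: kernel on the bulk, scalar code on the tail. */
void MyRow_Any_SSSE3(const uint8_t* src, uint8_t* dst, int width);
#endif

static RowFn PickMyRow(int width) {
  RowFn row = MyRow_C;
#if defined(HAS_MYROW_SSSE3)
  row = MyRow_Any_SSSE3;   /* tolerates any width */
  if (IS_ALIGNED(width, 16)) {
    row = MyRow_SSSE3;     /* fast path for multiple-of-16 widths */
  }
#endif
  return row;
}

End of editorial aid; the patch resumes below.]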
+void ARGBToUV422Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV411Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJ422Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); + +void MirrorRow_AVX2(const uint8* src, uint8* dst, int width); +void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width); +void MirrorRow_SSE2(const uint8* src, uint8* dst, int width); +void MirrorRow_NEON(const uint8* src, uint8* dst, int width); +void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width); +void MirrorRow_C(const uint8* src, uint8* dst, int width); +void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width); +void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width); +void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width); +void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width); + +void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); +void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); +void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); +void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); + +void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width); + +void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); + +void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); + +void CopyRow_SSE2(const uint8* src, uint8* dst, int count); +void CopyRow_AVX(const uint8* src, uint8* dst, int count); +void CopyRow_ERMS(const uint8* src, uint8* dst, int count); +void CopyRow_NEON(const uint8* src, uint8* dst, int count); +void CopyRow_MIPS(const uint8* src, uint8* dst, int count); +void CopyRow_C(const uint8* src, uint8* dst, int count); +void CopyRow_Any_SSE2(const 
uint8* src, uint8* dst, int count); +void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count); +void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count); + +void CopyRow_16_C(const uint16* src, uint16* dst, int count); + +void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); + +void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width); +void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); +void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); + +void SetRow_C(uint8* dst, uint8 v8, int count); +void SetRow_X86(uint8* dst, uint8 v8, int count); +void SetRow_ERMS(uint8* dst, uint8 v8, int count); +void SetRow_NEON(uint8* dst, uint8 v8, int count); +void SetRow_Any_X86(uint8* dst, uint8 v8, int count); +void SetRow_Any_NEON(uint8* dst, uint8 v8, int count); + +void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count); +void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count); +void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count); +void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count); + +// ARGBShufflers for BGRAToARGB etc. +void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); + +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, + int pix); +void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, + int pix); + +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix); +void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_C(const uint8* 
src_argb, uint8* dst_argb, int pix); +void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); +void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix); + +void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb, + int pix); +void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb, + int pix); +void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb, + int pix); +void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb, + int pix); + +void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb, + int pix); +void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix); + +void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix); +void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix); +void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix); + +void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int width); + +void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); + +void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix); +void 
J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix); + +void I444ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void NV12ToARGBRow_C(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_C(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_C(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_C(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_C(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_C(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void J422ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToRGB24Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width); +void I422ToRAWRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width); +void I422ToARGB4444Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width); +void I422ToARGB1555Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width); +void I422ToRGB565Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width); +void I422ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGBARow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I444ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I444ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void NV12ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_vu, + uint8* 
dst_argb, + int width); +void NV12ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_AVX2(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void J422ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void J422ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGB4444Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGB4444Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGB1555Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGB1555Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB24Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width); +void I422ToRGB24Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width); +void I422ToRAWRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width); +void I422ToRAWRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width); +void I422ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGBARow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I444ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I444ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_Any_SSSE3(const uint8* 
src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void NV12ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_Any_AVX2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_Any_AVX2(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void J422ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void J422ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGB4444Row_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGB1555Row_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToRGB565Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToRGB565Row_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToRGB24Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB24Row_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRAWRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRAWRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); + +void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); +void 
I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); + +// ARGB preattenuated alpha blend. +void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + +// ARGB multiply images. Same API as Blend, but these require +// pointer and width alignment for SSE2. +void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + +// ARGB add images. +void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + +// ARGB subtract images. Same API as Blend, but these require +// pointer and width alignment for SSE2. 
+void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + +void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix); +void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix); + +void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int width); + +void I444ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGBARow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB24Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRAWRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGB4444Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGB1555Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB565Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void 
NV12ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_Any_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_Any_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); + +void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_C(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix); +void 
UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix);
+
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
+                   uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int pix);
+
+void I422ToYUY2Row_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_yuy2, int width);
+void I422ToUYVYRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_uyvy, int width);
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width);
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_uyvy, int width);
+void I422ToYUY2Row_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width);
+void I422ToUYVYRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_uyvy, int width);
+
+// Effects related row functions.
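+// (Attenuation premultiplies each color channel by alpha.  Roughly, as a
+// sketch of the C behavior rather than the exact rounding of each SIMD
+// version:
+//   dst_b = src_b * src_a / 255;  // likewise for g and r; alpha unchanged.)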
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, + int width); + +// Inverse table for unattenuate, shared by C and SSE2. +extern const uint32 fixed_invtbl8[256]; +void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, + int width); + +void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); + +void ARGBSepiaRow_C(uint8* dst_argb, int width); +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width); +void ARGBSepiaRow_NEON(uint8* dst_argb, int width); + +void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); +void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); + +void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); + +void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); + +void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width); +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width); +void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width); + +void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value); +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value); +void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value); + +// Used for blur. 
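+// A box blur can be built from cumulative (integral) sums: once each row of
+// running sums exists, any window average is a handful of adds.  In sketch
+// form, for a window spanning rows top..bot and columns left..right:
+//   sum = bot_right - bot_left - top_right + top_left;
+//   avg = sum / area;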
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count); +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width); + +void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count); +void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width); + +LIBYUV_API +void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); + +// Used for I420Scale, ARGBScale, and ARGBInterpolate. +void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); + +void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); + +// Sobel images. 
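+// The Sobel row functions below are built on the standard 3x3 Sobel
+// kernels (shown for illustration):
+//   Gx: -1 0 +1   Gy: -1 -2 -1
+//       -2 0 +2        0  0  0
+//       -1 0 +1       +1 +2 +1
+// SobelXRow/SobelYRow emit per-pixel absolute gradient magnitudes, and
+// SobelRow adds the two, clamped to the 0..255 byte range.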
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, + uint8* dst_sobelx, int width); +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width); +void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width); +void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); +void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); +void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); + +void ARGBPolynomialRow_C(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); +void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); + +void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, + const uint8* luma, uint32 lumacoeff); +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, + const uint8* luma, uint32 lumacoeff); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROW_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/scale.h b/libs/libaom/src/third_party/libyuv/include/libyuv/scale.h new file mode 100644 index 000000000..3974aba34 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/scale.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_H_  // NOLINT
+#define INCLUDE_LIBYUV_SCALE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported filtering.
+typedef enum FilterMode {
+  kFilterNone = 0,      // Point sample; Fastest.
+  kFilterLinear = 1,    // Filter horizontally only.
+  kFilterBilinear = 2,  // Faster than box, but lower quality scaling down.
+  kFilterBox = 3        // Highest quality.
+} FilterModeEnum;
+
+// Scale a YUV plane.
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+                int src_width, int src_height,
+                uint8* dst, int dst_stride,
+                int dst_width, int dst_height,
+                enum FilterMode filtering);
+
+LIBYUV_API
+void ScalePlane_16(const uint16* src, int src_stride,
+                   int src_width, int src_height,
+                   uint16* dst, int dst_stride,
+                   int dst_width, int dst_height,
+                   enum FilterMode filtering);
+
+// Scales a YUV 4:2:0 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce an even better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+
+LIBYUV_API
+int I420Scale(const uint8* src_y, int src_stride_y,
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              int src_width, int src_height,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int dst_width, int dst_height,
+              enum FilterMode filtering);
+
+LIBYUV_API
+int I420Scale_16(const uint16* src_y, int src_stride_y,
+                 const uint16* src_u, int src_stride_u,
+                 const uint16* src_v, int src_stride_v,
+                 int src_width, int src_height,
+                 uint16* dst_y, int dst_stride_y,
+                 uint16* dst_u, int dst_stride_u,
+                 uint16* dst_v, int dst_stride_v,
+                 int dst_width, int dst_height,
+                 enum FilterMode filtering);
+
+#ifdef __cplusplus
+// Legacy API. Deprecated.
+LIBYUV_API
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+          int src_stride_y, int src_stride_u, int src_stride_v,
+          int src_width, int src_height,
+          uint8* dst_y, uint8* dst_u, uint8* dst_v,
+          int dst_stride_y, int dst_stride_u, int dst_stride_v,
+          int dst_width, int dst_height,
+          LIBYUV_BOOL interpolate);
+
+// Legacy API. Deprecated.
+LIBYUV_API
+int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
+                uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
+                LIBYUV_BOOL interpolate);
+
+// For testing, allow disabling of specialized scalers.
+LIBYUV_API
+void SetUseReferenceImpl(LIBYUV_BOOL use);
+#endif  // __cplusplus
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_SCALE_H_  NOLINT
diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/scale_argb.h b/libs/libaom/src/third_party/libyuv/include/libyuv/scale_argb.h
new file mode 100644
index 000000000..22563837d
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/include/libyuv/scale_argb.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_SCALE_ARGB_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/scale.h" // For FilterMode + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +LIBYUV_API +int ARGBScale(const uint8* src_argb, int src_stride_argb, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + enum FilterMode filtering); + +// Clipped scale takes destination rectangle coordinates for clip values. +LIBYUV_API +int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering); + +// TODO(fbarchard): Implement this. +// Scale with YUV conversion to ARGB and clipping. +LIBYUV_API +int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint32 src_fourcc, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + uint32 dst_fourcc, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/scale_row.h b/libs/libaom/src/third_party/libyuv/include/libyuv/scale_row.h new file mode 100644 index 000000000..a46b5ce69 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/scale_row.h @@ -0,0 +1,479 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT +#define INCLUDE_LIBYUV_SCALE_ROW_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/scale.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__i386__) && !defined(__SSE2__)) +#define LIBYUV_DISABLE_X86 +#endif + +// Visual C 2012 required for AVX2. 
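+// (_MSC_VER 1700 below corresponds to Visual Studio 2012, the first MSVC
+// release to ship the AVX2 intrinsics these code paths rely on.)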
+#if defined(_M_IX86) && !defined(__clang__) && \ + defined(_MSC_VER) && _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_FIXEDDIV1_X86 +#define HAS_FIXEDDIV_X86 +#define HAS_SCALEARGBCOLS_SSE2 +#define HAS_SCALEARGBCOLSUP2_SSE2 +#define HAS_SCALEARGBFILTERCOLS_SSSE3 +#define HAS_SCALEARGBROWDOWN2_SSE2 +#define HAS_SCALEARGBROWDOWNEVEN_SSE2 +#define HAS_SCALECOLSUP2_SSE2 +#define HAS_SCALEFILTERCOLS_SSSE3 +#define HAS_SCALEROWDOWN2_SSE2 +#define HAS_SCALEROWDOWN34_SSSE3 +#define HAS_SCALEROWDOWN38_SSSE3 +#define HAS_SCALEROWDOWN4_SSE2 +#endif + +// The following are available on VS2012: +#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) +#define HAS_SCALEADDROW_AVX2 +#define HAS_SCALEROWDOWN2_AVX2 +#define HAS_SCALEROWDOWN4_AVX2 +#endif + +// The following are available on Visual C: +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__) +#define HAS_SCALEADDROW_SSE2 +#endif + +// The following are available on Neon platforms: +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) +#define HAS_SCALEARGBCOLS_NEON +#define HAS_SCALEARGBROWDOWN2_NEON +#define HAS_SCALEARGBROWDOWNEVEN_NEON +#define HAS_SCALEFILTERCOLS_NEON +#define HAS_SCALEROWDOWN2_NEON +#define HAS_SCALEROWDOWN34_NEON +#define HAS_SCALEROWDOWN38_NEON +#define HAS_SCALEROWDOWN4_NEON +#define HAS_SCALEARGBFILTERCOLS_NEON +#endif + +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ + defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#define HAS_SCALEROWDOWN2_MIPS_DSPR2 +#define HAS_SCALEROWDOWN4_MIPS_DSPR2 +#define HAS_SCALEROWDOWN34_MIPS_DSPR2 +#define HAS_SCALEROWDOWN38_MIPS_DSPR2 +#endif + +// Scale ARGB vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int y, int dy, + int bpp, enum FilterMode filtering); + +void ScalePlaneVertical_16(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_argb, uint16* dst_argb, + int x, int y, int dy, + int wpp, enum FilterMode filtering); + +// Simplify the filtering based on scale factors. +enum FilterMode ScaleFilterReduce(int src_width, int src_height, + int dst_width, int dst_height, + enum FilterMode filtering); + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_C(int num, int div); +int FixedDiv_X86(int num, int div); +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. +int FixedDiv1_C(int num, int div); +int FixedDiv1_X86(int num, int div); +#ifdef HAS_FIXEDDIV_X86 +#define FixedDiv FixedDiv_X86 +#define FixedDiv1 FixedDiv1_X86 +#else +#define FixedDiv FixedDiv_C +#define FixedDiv1 FixedDiv1_C +#endif + +// Compute slope values for stepping. 
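+// Positions and steps below are 16.16 fixed point.  A sketch of how a
+// point-sampling column loop consumes them (illustrative, not the exact
+// library code):
+//   int x = 0;                                // or seeded by ScaleSlope().
+//   int dx = FixedDiv(src_width, dst_width);  // source step per dst pixel.
+//   for (j = 0; j < dst_width; ++j) {
+//     dst_ptr[j] = src_ptr[x >> 16];
+//     x += dx;
+//   }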
+void ScaleSlope(int src_width, int src_height, + int dst_width, int dst_height, + enum FilterMode filtering, + int* x, int* y, int* dx, int* dy); + +void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* d, int dst_width); +void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* d, int dst_width); +void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx); +void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int, int); +void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int, int); +void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx); +void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx); +void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown38_3_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); +void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width); +void ScaleARGBRowDown2_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void 
ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int, int); +void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); + +// Specialized scalers for x86. +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + 
uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width); + +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); + + +// ARGB Column functions +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); + +// ARGB Row functions +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); + +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int 
src_stepx, uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
+
+// ScaleRowDown2Box also used by planar functions
+// NEON downscalers with interpolation.
+
+// Note - not static due to reuse in convert for 444 to 420.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width);
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width);
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width);
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+
+// Downscale from 4 to 3 pixels. Uses the NEON multilane read/write to
+// load every 4th pixel into a different register (4 registers in all).
+// Point samples 32 pixels to 24 pixels.
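+// For illustration, the equivalent C pattern (assuming behavior matching
+// ScaleRowDown34_C) emits 3 pixels per group of 4:
+//   dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[3];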
+void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x3 -> 12x1 +void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32 -> 12 +void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x3 -> 12x1 +void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width); + +void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); + +void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); + + +void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ NOLINT diff 
--git a/libs/libaom/src/third_party/libyuv/include/libyuv/version.h b/libs/libaom/src/third_party/libyuv/include/libyuv/version.h
new file mode 100644
index 000000000..287b98ebf
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/include/libyuv/version.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
+#define INCLUDE_LIBYUV_VERSION_H_
+
+#define LIBYUV_VERSION 1456
+
+#endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/video_common.h b/libs/libaom/src/third_party/libyuv/include/libyuv/video_common.h
new file mode 100644
index 000000000..7b0a19cc9
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/include/libyuv/video_common.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Common definitions for video, including fourcc and VideoFormat.
+
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_  // NOLINT
+#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+// Definition of FourCC codes
+//////////////////////////////////////////////////////////////////////////////
+
+// Convert four characters to a FourCC code.
+// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
+// constants are used in a switch.
+#ifdef __cplusplus
+#define FOURCC(a, b, c, d) ( \
+    (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
+    (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+#else
+#define FOURCC(a, b, c, d) ( \
+    ((uint32)(a)) | ((uint32)(b) << 8) |  /* NOLINT */ \
+    ((uint32)(c) << 16) | ((uint32)(d) << 24))  /* NOLINT */
+#endif
+
+// Some pages discussing FourCC codes:
+// http://www.fourcc.org/yuv.php
+// http://v4l2spec.bytesex.org/spec/book1.htm
+// http://developer.apple.com/quicktime/icefloe/dispatch020.html
+// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
+// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
+
+// FourCC codes grouped according to implementation efficiency.
+// Primary formats should convert in 1 efficient step.
+// Secondary formats are converted in 2 steps.
+// Auxiliary formats call primary converters.
+enum FourCC {
+  // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
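+  // For example, FOURCC('I', '4', '2', '0') evaluates to 0x30323449: 'I'
+  // lands in the low byte, so the four bytes read "I420" in memory on a
+  // little-endian machine.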
+  FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+  FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+  FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+  FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+  FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+  FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+  FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+  FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+  FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+
+  // 2 Secondary YUV formats: row biplanar.
+  FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),  // deprecated.
+
+  // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+  FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+  FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+  FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+  FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+  FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
+  FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+  FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'),  // rgb565 LE.
+  FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
+  FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.
+
+  // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
+  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+  FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+
+  // 1 Primary Compressed YUV format.
+  FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+
+  // 6 Auxiliary YUV variations: 3 with U and V planes swapped, 1 alias,
+  // 2 full range.
+  FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+  FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+  FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+  FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Linux version of I420.
+  FOURCC_J420 = FOURCC('J', '4', '2', '0'),
+  FOURCC_J400 = FOURCC('J', '4', '0', '0'),
+
+  // 17 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
+  FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420.
+  FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'),  // Alias for I422.
+  FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'),  // Alias for I444.
+  FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'),  // Alias for YUY2.
+  FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'),  // Alias for YUY2 on Mac.
+  FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'),  // Alias for UYVY.
+  FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'),  // Alias for UYVY on Mac.
+  FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'),  // Alias for MJPG.
+  FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'),  // Alias for MJPG on Mac.
+  FOURCC_BA81 = FOURCC('B', 'A', '8', '1'),  // Alias for BGGR.
+  FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'),  // Alias for RAW.
+  FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'),  // Alias for 24BG.
+  FOURCC_CM32 = FOURCC(0, 0, 0, 32),  // Alias for BGRA kCMPixelFormat_32ARGB
+  FOURCC_CM24 = FOURCC(0, 0, 0, 24),  // Alias for RAW kCMPixelFormat_24RGB
+  FOURCC_L555 = FOURCC('L', '5', '5', '5'),  // Alias for RGBO.
+  FOURCC_L565 = FOURCC('L', '5', '6', '5'),  // Alias for RGBP.
+  FOURCC_5551 = FOURCC('5', '5', '5', '1'),  // Alias for RGBO.
+
+  // 1 Auxiliary compressed YUV format set aside for capturer.
+  FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+
+  // Match any fourcc.
+  FOURCC_ANY = -1,
+};
+
+enum FourCCBpp {
+  // Canonical fourcc codes used in our code.
+  FOURCC_BPP_I420 = 12,
+  FOURCC_BPP_I422 = 16,
+  FOURCC_BPP_I444 = 24,
+  FOURCC_BPP_I411 = 12,
+  FOURCC_BPP_I400 = 8,
+  FOURCC_BPP_NV21 = 12,
+  FOURCC_BPP_NV12 = 12,
+  FOURCC_BPP_YUY2 = 16,
+  FOURCC_BPP_UYVY = 16,
+  FOURCC_BPP_M420 = 12,
+  FOURCC_BPP_Q420 = 12,
+  FOURCC_BPP_ARGB = 32,
+  FOURCC_BPP_BGRA = 32,
+  FOURCC_BPP_ABGR = 32,
+  FOURCC_BPP_RGBA = 32,
+  FOURCC_BPP_24BG = 24,
+  FOURCC_BPP_RAW = 24,
+  FOURCC_BPP_RGBP = 16,
+  FOURCC_BPP_RGBO = 16,
+  FOURCC_BPP_R444 = 16,
+  FOURCC_BPP_RGGB = 8,
+  FOURCC_BPP_BGGR = 8,
+  FOURCC_BPP_GRBG = 8,
+  FOURCC_BPP_GBRG = 8,
+  FOURCC_BPP_YV12 = 12,
+  FOURCC_BPP_YV16 = 16,
+  FOURCC_BPP_YV24 = 24,
+  FOURCC_BPP_YU12 = 12,
+  FOURCC_BPP_J420 = 12,
+  FOURCC_BPP_J400 = 8,
+  FOURCC_BPP_MJPG = 0,  // 0 means unknown.
+  FOURCC_BPP_H264 = 0,
+  FOURCC_BPP_IYUV = 12,
+  FOURCC_BPP_YU16 = 16,
+  FOURCC_BPP_YU24 = 24,
+  FOURCC_BPP_YUYV = 16,
+  FOURCC_BPP_YUVS = 16,
+  FOURCC_BPP_HDYC = 16,
+  FOURCC_BPP_2VUY = 16,
+  FOURCC_BPP_JPEG = 1,
+  FOURCC_BPP_DMB1 = 1,
+  FOURCC_BPP_BA81 = 8,
+  FOURCC_BPP_RGB3 = 24,
+  FOURCC_BPP_BGR3 = 24,
+  FOURCC_BPP_CM32 = 32,
+  FOURCC_BPP_CM24 = 24,
+
+  // Match any fourcc.
+  FOURCC_BPP_ANY = 0,  // 0 means unknown.
+};
+
+// Converts fourcc aliases into canonical ones.
+LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_VIDEO_COMMON_H_  NOLINT
diff --git a/libs/libaom/src/third_party/libyuv/source/compare.cc b/libs/libaom/src/third_party/libyuv/source/compare.cc
new file mode 100644
index 000000000..46aa8473d
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/compare.cc
@@ -0,0 +1,373 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/compare.h"
+
+#include <float.h>  // For DBL_MAX.
+#include <math.h>   // For log10().
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
+
+// This module is for Visual C x86
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
+#define HAS_HASHDJB2_SSE41
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
+
+#ifdef VISUALC_HAS_AVX2
+#define HAS_HASHDJB2_AVX2
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
+#endif
+
+#endif  // HAS_HASHDJB2_SSE41
+
+// hash seed of 5381 recommended.
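+// The djb2 recurrence is hash = hash * 33 + byte.  As a reference sketch of
+// what the accelerated versions compute (HashDjb2_C in compare_common.cc
+// uses the equivalent hash += (hash << 5) + src[i] form):
+//   uint32 h = seed;  // 5381 for the classic djb2 hash.
+//   for (i = 0; i < count; ++i) h = h * 33 + src[i];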
+LIBYUV_API +uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { + const int kBlockSize = 1 << 15; // 32768; + int remainder; + uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C; +#if defined(HAS_HASHDJB2_SSE41) + if (TestCpuFlag(kCpuHasSSE41)) { + HashDjb2_SSE = HashDjb2_SSE41; + } +#endif +#if defined(HAS_HASHDJB2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HashDjb2_SSE = HashDjb2_AVX2; + } +#endif + + while (count >= (uint64)(kBlockSize)) { + seed = HashDjb2_SSE(src, kBlockSize, seed); + src += kBlockSize; + count -= kBlockSize; + } + remainder = (int)(count) & ~15; + if (remainder) { + seed = HashDjb2_SSE(src, remainder, seed); + src += remainder; + count -= remainder; + } + remainder = (int)(count) & 15; + if (remainder) { + seed = HashDjb2_C(src, remainder, seed); + } + return seed; +} + +static uint32 ARGBDetectRow_C(const uint8* argb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. + return FOURCC_BGRA; + } + if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA. + return FOURCC_ARGB; + } + if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255. + return FOURCC_BGRA; + } + if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255. + return FOURCC_ARGB; + } + argb += 8; + } + if (width & 1) { + if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. + return FOURCC_BGRA; + } + if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA. + return FOURCC_ARGB; + } + } + return 0; +} + +// Scan an opaque argb image and return fourcc based on alpha offset. +// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. +LIBYUV_API +uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) { + uint32 fourcc = 0; + int h; + + // Coalesce rows. + if (stride_argb == width * 4) { + width *= height; + height = 1; + stride_argb = 0; + } + for (h = 0; h < height && fourcc == 0; ++h) { + fourcc = ARGBDetectRow_C(argb, width); + argb += stride_argb; + } + return fourcc; +} + +uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) +#define HAS_SUMSQUAREERROR_NEON +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); +#endif +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_SUMSQUAREERROR_SSE2 +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count); +#endif + +#ifdef VISUALC_HAS_AVX2 +#define HAS_SUMSQUAREERROR_AVX2 +uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count); +#endif + +// TODO(fbarchard): Refactor into row function. +LIBYUV_API +uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, + int count) { + // SumSquareError returns values 0 to 65535 for each squared difference. + // Up to 65536 of those can be summed and remain within a uint32. + // After each block of 65536 pixels, accumulate into a uint64. 
+  const int kBlockSize = 65536;
+  int remainder = count & (kBlockSize - 1) & ~31;
+  uint64 sse = 0;
+  int i;
+  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+      SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SumSquareError = SumSquareError_NEON;
+  }
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    // Note only used for multiples of 16 so count is not checked.
+    SumSquareError = SumSquareError_SSE2;
+  }
+#endif
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    // Note only used for multiples of 32 so count is not checked.
+    SumSquareError = SumSquareError_AVX2;
+  }
+#endif
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sse)
+#endif
+  for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+  }
+  src_a += count & ~(kBlockSize - 1);
+  src_b += count & ~(kBlockSize - 1);
+  if (remainder) {
+    sse += SumSquareError(src_a, src_b, remainder);
+    src_a += remainder;
+    src_b += remainder;
+  }
+  remainder = count & 31;
+  if (remainder) {
+    sse += SumSquareError_C(src_a, src_b, remainder);
+  }
+  return sse;
+}
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+                                  const uint8* src_b, int stride_b,
+                                  int width, int height) {
+  uint64 sse = 0;
+  int h;
+  // Coalesce rows.
+  if (stride_a == width &&
+      stride_b == width) {
+    width *= height;
+    height = 1;
+    stride_a = stride_b = 0;
+  }
+  for (h = 0; h < height; ++h) {
+    sse += ComputeSumSquareError(src_a, src_b, width);
+    src_a += stride_a;
+    src_b += stride_b;
+  }
+  return sse;
+}
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
+  double psnr;
+  if (sse > 0) {
+    // PSNR = 10 * log10(255^2 / mse), where mse = sse / count.
+    double mse = (double)(sse) / (double)(count);
+    psnr = 10.0 * log10(255.0 * 255.0 / mse);
+  } else {
+    psnr = kMaxPsnr;  // Limit to prevent divide by 0
+  }
+
+  if (psnr > kMaxPsnr)
+    psnr = kMaxPsnr;
+
+  return psnr;
+}
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+                     const uint8* src_b, int stride_b,
+                     int width, int height) {
+  const uint64 samples = width * height;
+  const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
+                                                src_b, stride_b,
+                                                width, height);
+  return SumSquareErrorToPsnr(sse, samples);
+}
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+                const uint8* src_u_a, int stride_u_a,
+                const uint8* src_v_a, int stride_v_a,
+                const uint8* src_y_b, int stride_y_b,
+                const uint8* src_u_b, int stride_u_b,
+                const uint8* src_v_b, int stride_v_b,
+                int width, int height) {
+  const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
+                                                  src_y_b, stride_y_b,
+                                                  width, height);
+  const int width_uv = (width + 1) >> 1;
+  const int height_uv = (height + 1) >> 1;
+  const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
+                                                  src_u_b, stride_u_b,
+                                                  width_uv, height_uv);
+  const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
+                                                  src_v_b, stride_v_b,
+                                                  width_uv, height_uv);
+  const uint64 samples = width * height + 2 * (width_uv * height_uv);
+  const uint64 sse = sse_y + sse_u + sse_v;
+  return SumSquareErrorToPsnr(sse, samples);
+}
+
+static const int64 cc1 = 26634;   // (64^2*(.01*255)^2)
+static const int64 cc2 = 239708;  // (64^2*(.03*255)^2)
+
+static double Ssim8x8_C(const uint8* src_a, int stride_a,
+                        const uint8* src_b, int stride_b) {
+  int64 sum_a = 0;
+  int64 sum_b = 0;
+  int64 sum_sq_a = 0;
+  int64 sum_sq_b = 0;
+  int64 sum_axb = 0;
+
+  int i;
+  for (i = 0; i <
8; ++i) {
+    int j;
+    for (j = 0; j < 8; ++j) {
+      sum_a += src_a[j];
+      sum_b += src_b[j];
+      sum_sq_a += src_a[j] * src_a[j];
+      sum_sq_b += src_b[j] * src_b[j];
+      sum_axb += src_a[j] * src_b[j];
+    }
+
+    src_a += stride_a;
+    src_b += stride_b;
+  }
+
+  {
+    const int64 count = 64;
+    // scale the constants by number of pixels
+    const int64 c1 = (cc1 * count * count) >> 12;
+    const int64 c2 = (cc2 * count * count) >> 12;
+
+    const int64 sum_a_x_sum_b = sum_a * sum_b;
+
+    const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
+                         (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+
+    const int64 sum_a_sq = sum_a*sum_a;
+    const int64 sum_b_sq = sum_b*sum_b;
+
+    const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
+                         (count * sum_sq_a - sum_a_sq +
+                          count * sum_sq_b - sum_b_sq + c2);
+
+    if (ssim_d == 0) {
+      return DBL_MAX;
+    }
+    return ssim_n * 1.0 / ssim_d;
+  }
+}
+
+// We are using an 8x8 moving window whose starting locations step on a 4x4
+// pixel grid. This arrangement allows the windows to overlap block
+// boundaries and so penalize blocking artifacts.
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+                     const uint8* src_b, int stride_b,
+                     int width, int height) {
+  int samples = 0;
+  double ssim_total = 0;
+  double (*Ssim8x8)(const uint8* src_a, int stride_a,
+                    const uint8* src_b, int stride_b) = Ssim8x8_C;
+
+  // Sample points start at each 4x4 location.
+  int i;
+  for (i = 0; i < height - 8; i += 4) {
+    int j;
+    for (j = 0; j < width - 8; j += 4) {
+      ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
+      samples++;
+    }
+
+    src_a += stride_a * 4;
+    src_b += stride_b * 4;
+  }
+
+  ssim_total /= samples;
+  return ssim_total;
+}
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+                const uint8* src_u_a, int stride_u_a,
+                const uint8* src_v_a, int stride_v_a,
+                const uint8* src_y_b, int stride_y_b,
+                const uint8* src_u_b, int stride_u_b,
+                const uint8* src_v_b, int stride_v_b,
+                int width, int height) {
+  const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
+                                      src_y_b, stride_y_b, width, height);
+  const int width_uv = (width + 1) >> 1;
+  const int height_uv = (height + 1) >> 1;
+  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
+                                      src_u_b, stride_u_b,
+                                      width_uv, height_uv);
+  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
+                                      src_v_b, stride_v_b,
+                                      width_uv, height_uv);
+  return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/compare_common.cc b/libs/libaom/src/third_party/libyuv/source/compare_common.cc
new file mode 100644
index 000000000..c546b5182
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/compare_common.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 sse = 0u;
+  int i;
+  for (i = 0; i < count; ++i) {
+    int diff = src_a[i] - src_b[i];
+    sse += (uint32)(diff * diff);
+  }
+  return sse;
+}
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency. +uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) { + uint32 hash = seed; + int i; + for (i = 0; i < count; ++i) { + hash += (hash << 5) + src[i]; + } + return hash; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/compare_gcc.cc b/libs/libaom/src/third_party/libyuv/source/compare_gcc.cc new file mode 100644 index 000000000..247cb33bb --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/compare_gcc.cc @@ -0,0 +1,152 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { + uint32 sse; + asm volatile ( // NOLINT + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "lea " MEMLEA(0x10, 0) ",%0 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" + "lea " MEMLEA(0x10, 1) ",%1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" + + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); // NOLINT + return sse; +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) +#define HAS_HASHDJB2_SSE41 +static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { + uint32 hash; + asm volatile ( // NOLINT + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "lea " MEMLEA(0x10, 0) ",%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 
\n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "sub $0x10,%1 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" + : "+r"(src), // %0 + "+r"(count), // %1 + "+rm"(seed), // %2 + "=g"(hash) // %3 + : "m"(kHash16x33), // %4 + "m"(kHashMul0), // %5 + "m"(kHashMul1), // %6 + "m"(kHashMul2), // %7 + "m"(kHashMul3) // %8 + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); // NOLINT + return hash; +} +#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/libs/libaom/src/third_party/libyuv/source/compare_neon.cc b/libs/libaom/src/third_party/libyuv/source/compare_neon.cc new file mode 100644 index 000000000..ef006ec41 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/compare_neon.cc @@ -0,0 +1,65 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" + + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + return sse; +} + +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/compare_neon64.cc b/libs/libaom/src/third_party/libyuv/source/compare_neon64.cc new file mode 100644 index 000000000..6d1e5e1bc --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/compare_neon64.cc @@ -0,0 +1,63 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" + "subs %w2, %w2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" + + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + return sse; +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/compare_win.cc b/libs/libaom/src/third_party/libyuv/source/compare_win.cc new file mode 100644 index 000000000..19806f275 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/compare_win.cc @@ -0,0 +1,229 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for Visual C x86. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ + defined(_MSC_VER) && !defined(__clang__) + +__declspec(naked) +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + pxor xmm0, xmm0 + pxor xmm5, xmm5 + + wloop: + movdqu xmm1, [eax] + lea eax, [eax + 16] + movdqu xmm2, [edx] + lea edx, [edx + 16] + movdqa xmm3, xmm1 // abs trick + psubusb xmm1, xmm2 + psubusb xmm2, xmm3 + por xmm1, xmm2 + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm0, xmm1 + paddd xmm0, xmm2 + sub ecx, 16 + jg wloop + + pshufd xmm1, xmm0, 0xee + paddd xmm0, xmm1 + pshufd xmm1, xmm0, 0x01 + paddd xmm0, xmm1 + movd eax, xmm0 + ret + } +} + +// Visual C 2012 required for AVX2. +#if _MSC_VER >= 1700 +// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. 
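The SSE2 loop above (and the AVX2 variant that follows) computes per-byte absolute differences without an absolute-value instruction: two saturating unsigned subtracts, OR-ed together. Since psubusb clamps negative results to zero, exactly one direction survives. The squared terms are then formed by pmaddwd, which squares and pairwise-adds 16-bit lanes in one instruction. A scalar model of the subtract trick, for reference:

#include <stdint.h>

/* Scalar model of the psubusb/psubusb/por idiom in SumSquareError_SSE2:
   unsigned saturating subtraction clamps negatives to 0, so exactly one of
   the two differences is nonzero and their OR equals |a - b|. */
static uint8_t AbsDiffU8(uint8_t a, uint8_t b) {
  uint8_t d0 = (a > b) ? (uint8_t)(a - b) : 0;  /* psubusb a, b */
  uint8_t d1 = (b > a) ? (uint8_t)(b - a) : 0;  /* psubusb b, a */
  return d0 | d1;                               /* por */
}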
+#pragma warning(disable: 4752) +__declspec(naked) +uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + vpxor ymm0, ymm0, ymm0 // sum + vpxor ymm5, ymm5, ymm5 // constant 0 for unpck + sub edx, eax + + wloop: + vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + edx] + lea eax, [eax + 32] + vpsubusb ymm3, ymm1, ymm2 // abs difference trick + vpsubusb ymm2, ymm2, ymm1 + vpor ymm1, ymm2, ymm3 + vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order. + vpunpckhbw ymm1, ymm1, ymm5 + vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32. + vpmaddwd ymm1, ymm1, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm2 + sub ecx, 32 + jg wloop + + vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpermq ymm1, ymm0, 0x02 // high + low lane. + vpaddd ymm0, ymm0, ymm1 + vmovd eax, xmm0 + vzeroupper + ret + } +} +#endif // _MSC_VER >= 1700 + +#define HAS_HASHDJB2_SSE41 +static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 +// 44: 66 0F 38 40 DD pmulld xmm3,xmm5 +// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5 +// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 +// 83: 66 0F 38 40 CD pmulld xmm1,xmm5 +#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ + _asm _emit 0x40 _asm _emit reg + +__declspec(naked) +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + movd xmm0, [esp + 12] // seed + + pxor xmm7, xmm7 // constant 0 for unpck + movdqa xmm6, kHash16x33 + + wloop: + movdqu xmm1, [eax] // src[0-15] + lea eax, [eax + 16] + pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16 + movdqa xmm5, kHashMul0 + movdqa xmm2, xmm1 + punpcklbw xmm2, xmm7 // src[0-7] + movdqa xmm3, xmm2 + punpcklwd xmm3, xmm7 // src[0-3] + pmulld(0xdd) // pmulld xmm3, xmm5 + movdqa xmm5, kHashMul1 + movdqa xmm4, xmm2 + punpckhwd xmm4, xmm7 // src[4-7] + pmulld(0xe5) // pmulld xmm4, xmm5 + movdqa xmm5, kHashMul2 + punpckhbw xmm1, xmm7 // src[8-15] + movdqa xmm2, xmm1 + punpcklwd xmm2, xmm7 // src[8-11] + pmulld(0xd5) // pmulld xmm2, xmm5 + movdqa xmm5, kHashMul3 + punpckhwd xmm1, xmm7 // src[12-15] + pmulld(0xcd) // pmulld xmm1, xmm5 + paddd xmm3, xmm4 // add 16 results + paddd xmm1, xmm2 + paddd xmm1, xmm3 + + pshufd xmm2, xmm1, 0x0e // upper 2 dwords + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0x01 + paddd xmm1, xmm2 + paddd xmm0, xmm1 + sub ecx, 16 + jg wloop + + movd eax, xmm0 // return hash + ret + } +} + +// Visual C 2012 required for AVX2. 
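The kHashMul tables above are simply powers of 33 mod 2^32, laid out so the four dword lanes of each vector weight four consecutive source bytes, and the pmulld(reg) macro hand-emits the SSE4.1 opcode bytes (66 0F 38 40 /r) because the inline assembler in older Visual C++ releases did not accept the mnemonic. A quick generator, useful as a sanity check against the tables:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Prints 33^0 .. 33^16 mod 2^32; the values should line up with the
   kHashMul tables (e.g. 33^4 = 0x00121881) and with kHash16x33 (33^16). */
int main(void) {
  uint32_t p = 1;
  int e;
  for (e = 0; e <= 16; ++e) {
    printf("33^%-2d mod 2^32 = 0x%08" PRIx32 "\n", e, p);
    p *= 33u;
  }
  return 0;
}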
+#if _MSC_VER >= 1700
+__declspec(naked)
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+  __asm {
+    mov        eax, [esp + 4]    // src
+    mov        ecx, [esp + 8]    // count
+    movd       xmm0, [esp + 12]  // seed
+    movdqa     xmm6, kHash16x33
+
+  wloop:
+    vpmovzxbd  xmm3, dword ptr [eax]  // src[0-3]
+    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
+    vpmovzxbd  xmm4, dword ptr [eax + 4]  // src[4-7]
+    pmulld     xmm3, kHashMul0
+    vpmovzxbd  xmm2, dword ptr [eax + 8]  // src[8-11]
+    pmulld     xmm4, kHashMul1
+    vpmovzxbd  xmm1, dword ptr [eax + 12]  // src[12-15]
+    pmulld     xmm2, kHashMul2
+    lea        eax, [eax + 16]
+    pmulld     xmm1, kHashMul3
+    paddd      xmm3, xmm4  // add 16 results
+    paddd      xmm1, xmm2
+    paddd      xmm1, xmm3
+    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
+    paddd      xmm1, xmm2
+    pshufd     xmm2, xmm1, 0x01
+    paddd      xmm1, xmm2
+    paddd      xmm0, xmm1
+    sub        ecx, 16
+    jg         wloop
+
+    movd       eax, xmm0  // return hash
+    ret
+  }
+}
+#endif  // _MSC_VER >= 1700
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/convert.cc b/libs/libaom/src/third_party/libyuv/source/convert.cc
new file mode 100644
index 000000000..3ad6bd7a4
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/convert.cc
@@ -0,0 +1,1389 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/scale.h"  // For ScalePlane()
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+// Any I4xx to I420 format with mirroring.
+static int I4xxToI420(const uint8* src_y, int src_stride_y,
+                      const uint8* src_u, int src_stride_u,
+                      const uint8* src_v, int src_stride_v,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int src_y_width, int src_y_height,
+                      int src_uv_width, int src_uv_height) {
+  const int dst_y_width = Abs(src_y_width);
+  const int dst_y_height = Abs(src_y_height);
+  const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
+  const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+  if (src_y_width == 0 || src_y_height == 0 ||
+      src_uv_width == 0 || src_uv_height == 0) {
+    return -1;
+  }
+  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
+             dst_y, dst_stride_y, dst_y_width, dst_y_height,
+             kFilterBilinear);
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
+             kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
+             kFilterBilinear);
+  return 0;
+}
+
+// Copy I420 with optional flipping.
+// TODO(fbarchard): Use ScalePlane, which supports mirroring, but ensure
+// it does row coalescing.
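The SUBSAMPLE macro defined at the top of convert.cc is round-up division by a power of two that stays symmetric for negative values, so inverted (negative-height) images get the same plane sizes as upright ones. A few spot checks, as a compilable sketch:

#include <assert.h>

/* Same definition as in convert.cc. Note the expansion is an
   unparenthesized ternary, so uses must be wrapped in parentheses. */
#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)

int main(void) {
  /* Half-size chroma dimensions round up, and the sign is preserved. */
  assert((SUBSAMPLE(5, 1, 1)) == 3);
  assert((SUBSAMPLE(4, 1, 1)) == 2);
  assert((SUBSAMPLE(-5, 1, 1)) == -3);
  /* I411 uses quarter-width chroma: SUBSAMPLE(width, 3, 2). */
  assert((SUBSAMPLE(9, 3, 2)) == 3);
  return 0;
}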
+LIBYUV_API +int I420Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// 422 chroma is 1/2 width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I422ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int src_uv_width = SUBSAMPLE(width, 1, 1); + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + src_uv_width, height); +} + +// 444 chroma is 1x width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I444ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + width, height); +} + +// 411 chroma is 1/4 width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I411ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int src_uv_width = SUBSAMPLE(width, 3, 2); + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + src_uv_width, height); +} + +// I400 is greyscale typically used in MJPG +LIBYUV_API +int I400ToI420(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
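The "negative height means invert" convention recurs in every converter in this file: re-point the source (or destination) at its last row, negate the stride, and run the normal top-down loop. In isolation, assuming a plain byte-plane copy, the idiom looks like this:

#include <stdint.h>
#include <string.h>

/* A stripped-down model of the inversion idiom: repoint src at its last
   row and negate the stride, then copy top-down as usual. */
static void CopyPlaneMaybeFlipped(const uint8_t* src, int src_stride,
                                  uint8_t* dst, int dst_stride,
                                  int width, int height) {
  int y;
  if (height < 0) {
    height = -height;
    src = src + (height - 1) * src_stride;  /* start at the last row */
    src_stride = -src_stride;               /* walk upward */
  }
  for (y = 0; y < height; ++y) {
    memcpy(dst, src, (size_t)width);
    src += src_stride;
    dst += dst_stride;
  }
}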
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128); + SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128); + return 0; +} + +static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, + uint8* dst, int dst_stride, + int width, int height) { + int y; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Copy plane + for (y = 0; y < height - 1; y += 2) { + CopyRow(src, dst, width); + CopyRow(src + src_stride_0, dst + dst_stride, width); + src += src_stride_0 + src_stride_1; + dst += dst_stride * 2; + } + if (height & 1) { + CopyRow(src, dst, width); + } +} + +// Support converting from FOURCC_M420 +// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for +// easy conversion to I420. +// M420 format description: +// M420 is row biplanar 420: 2 rows of Y and 1 row of UV. +// Chroma is half width / half height. (420) +// src_stride_m420 is row planar. Normally this will be the width in pixels. +// The UV plane is half width, but 2 values, so src_stride_m420 applies to +// this as well as the two Y planes. +static int X420ToI420(const uint8* src_y, + int src_stride_y0, int src_stride_y1, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = + SplitUVRow_C; + if (!src_y || !src_uv || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // Coalesce rows. + if (src_stride_y0 == width && + src_stride_y1 == width && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y0 = src_stride_y1 = dst_stride_y = 0; + } + // Coalesce rows. 
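X420ToI420 coalesces twice: the Y-plane check just above, and the UV-plane check that this second "Coalesce rows." comment introduces. Both exploit the fact that a plane whose stride equals its row width is one contiguous block, so width and height can be folded into a single pass and the strides zeroed, since they are never advanced again. The shape of the transform, factored out as a sketch:

/* Row coalescing in isolation: if every row is contiguous, fold the whole
   plane into one long "row" so the inner kernel runs exactly once. */
static void CoalesceRows(int* width, int* height,
                         int* src_stride, int* dst_stride) {
  if (*src_stride == *width && *dst_stride == *width) {
    *width *= *height;
    *height = 1;
    *src_stride = *dst_stride = 0;  /* strides are no longer needed */
  }
}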
+ if (src_stride_uv == halfwidth * 2 && + dst_stride_u == halfwidth && + dst_stride_v == halfwidth) { + halfwidth *= halfheight; + halfheight = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_SPLITUVROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) && + IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) && + IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) { + SplitUVRow = SplitUVRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(halfwidth, 16)) { + SplitUVRow = SplitUVRow_MIPS_DSPR2; + } + } +#endif + + if (dst_y) { + if (src_stride_y0 == src_stride_y1) { + CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height); + } else { + CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, + width, height); + } + } + + for (y = 0; y < halfheight; ++y) { + // Copy a row of UV. + SplitUVRow(src_uv, dst_u, dst_v, halfwidth); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += src_stride_uv; + } + return 0; +} + +// Convert NV12 to I420. +LIBYUV_API +int NV12ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, + src_uv, src_stride_uv, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); +} + +// Convert NV21 to I420. Same as NV12 but u and v pointers swapped. +LIBYUV_API +int NV21ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, + src_vu, src_stride_vu, + dst_y, dst_stride_y, + dst_v, dst_stride_v, + dst_u, dst_stride_u, + width, height); +} + +// Convert M420 to I420. +LIBYUV_API +int M420ToI420(const uint8* src_m420, int src_stride_m420, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, + src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); +} + +// Convert YUY2 to I420. +LIBYUV_API +int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C; + void (*YUY2ToYRow)(const uint8* src_yuy2, + uint8* dst_y, int pix) = YUY2ToYRow_C; + // Negative height means invert the image. 
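Before the packed 4:2:2 inputs below, note how the biplanar formats are handled: NV12 carries chroma as interleaved U,V byte pairs and NV21 as V,U. Both funnel through X420ToI420, which deinterleaves with SplitUVRow, and NV21ToI420 handles the ordering simply by exchanging the dst_u and dst_v arguments. A scalar equivalent of the row kernel:

#include <stdint.h>

/* Scalar equivalent of SplitUVRow_C: split interleaved UVUV... into two
   planes. NV21ToI420 reuses the same kernel with dst_u and dst_v swapped. */
static void SplitUVRowModel(const uint8_t* src_uv,
                            uint8_t* dst_u, uint8_t* dst_v, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x + 0];
    dst_v[x] = src_uv[2 * x + 1];
  }
}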
+ if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUVRow = YUY2ToUVRow_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToUVRow = YUY2ToUVRow_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUVRow = YUY2ToUVRow_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + YUY2ToUVRow = YUY2ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUVRow = YUY2ToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + } + return 0; +} + +// Convert UYVY to I420. +LIBYUV_API +int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C; + void (*UYVYToYRow)(const uint8* src_uyvy, + uint8* dst_y, int pix) = UYVYToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + UYVYToUVRow = UYVYToUVRow_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + UYVYToUVRow = UYVYToUVRow_SSE2; + UYVYToYRow = UYVYToYRow_SSE2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToUVRow = UYVYToUVRow_Any_AVX2; + UYVYToYRow = UYVYToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToUVRow = UYVYToUVRow_AVX2; + UYVYToYRow = UYVYToYRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UYVYToYRow = UYVYToYRow_Any_NEON; + UYVYToUVRow = UYVYToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUVRow = UYVYToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width); + src_uyvy += src_stride_uyvy * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + } + return 0; +} + +// Convert ARGB to I420. 
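YUY2 packs 4:2:2 pixels as Y0,U,Y1,V and UYVY as U,Y0,V,Y1. The two converters above pull luma from every row and reach 4:2:0 by averaging each chroma sample with the one a row below; the stride-0 call on an odd final row averages the row with itself. Scalar models of the two YUY2 row kernels, under those layout assumptions:

#include <stdint.h>

/* Luma sits on even bytes in YUY2 (odd bytes in UYVY). */
static void YUY2ToYRowModel(const uint8_t* src_yuy2, uint8_t* dst_y,
                            int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = src_yuy2[2 * x];
  }
}

/* Rounded average of the chroma from a pair of rows, src_stride apart. */
static void YUY2ToUVRowModel(const uint8_t* src_yuy2, int src_stride_yuy2,
                             uint8_t* dst_u, uint8_t* dst_v, int width) {
  const uint8_t* next = src_yuy2 + src_stride_yuy2;
  int x;
  for (x = 0; x < width / 2; ++x) {
    dst_u[x] = (uint8_t)((src_yuy2[4 * x + 1] + next[4 * x + 1] + 1) >> 1);
    dst_v[x] = (uint8_t)((src_yuy2[4 * x + 3] + next[4 * x + 3] + 1) >> 1);
  }
}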
+LIBYUV_API +int ARGBToI420(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + } + return 0; +} + +// Convert BGRA to I420. +LIBYUV_API +int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C; + void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) = + BGRAToYRow_C; + if (!src_bgra || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
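ARGBToI420 above, and every RGB-family converter that follows, share one loop shape: two source rows per iteration yield two Y rows but a single U and V row, and an odd trailing row passes stride 0 so the UV kernel sees the same row twice. The shared skeleton, abstracted over the row functions (a sketch, not an API in this patch):

#include <stdint.h>

typedef void (*ToYRowFn)(const uint8_t* src, uint8_t* dst_y, int width);
typedef void (*ToUVRowFn)(const uint8_t* src, int src_stride,
                          uint8_t* dst_u, uint8_t* dst_v, int width);

/* Two luma rows and one chroma row per iteration; an odd trailing row
   passes stride 0 so the UV kernel averages the last row with itself. */
static void ToI420Skeleton(const uint8_t* src, int src_stride,
                           uint8_t* dst_y, int dst_stride_y,
                           uint8_t* dst_u, int dst_stride_u,
                           uint8_t* dst_v, int dst_stride_v,
                           int width, int height,
                           ToYRowFn ToYRow, ToUVRowFn ToUVRow) {
  int y;
  for (y = 0; y < height - 1; y += 2) {
    ToUVRow(src, src_stride, dst_u, dst_v, width);
    ToYRow(src, dst_y, width);
    ToYRow(src + src_stride, dst_y + dst_stride_y, width);
    src += src_stride * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    ToUVRow(src, 0, dst_u, dst_v, width);
    ToYRow(src, dst_y, width);
  }
}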
+ if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } +#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BGRAToUVRow = BGRAToUVRow_Any_SSSE3; + BGRAToYRow = BGRAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_SSSE3; + BGRAToYRow = BGRAToYRow_SSSE3; + } + } +#endif +#if defined(HAS_BGRATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + BGRAToYRow = BGRAToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + BGRAToYRow = BGRAToYRow_NEON; + } + } +#endif +#if defined(HAS_BGRATOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + BGRAToUVRow = BGRAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width); + src_bgra += src_stride_bgra * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + } + return 0; +} + +// Convert ABGR to I420. +LIBYUV_API +int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) = + ABGRToYRow_C; + if (!src_abgr || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + ABGRToYRow = ABGRToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + ABGRToYRow = ABGRToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYRow = ABGRToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ABGRToYRow = ABGRToYRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVRow = ABGRToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); + src_abgr += src_stride_abgr * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + } + return 0; +} + +// Convert RGBA to I420. 
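The BGRA, ABGR and RGBA converters differ from ARGBToI420 only in which byte each row kernel reads as R, G and B; the luma math itself is fixed-point BT.601 studio swing. A sketch of the weighting, assuming the coefficient set used by libyuv's C reference kernels (66/129/25, with the +16 offset and rounding folded into 0x1080):

#include <stdint.h>

/* Fixed-point BT.601 studio-swing luma: maps (0,0,0) to 16 and
   (255,255,255) to 235, matching libyuv's C reference kernels. */
static uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}

/* E.g. for ABGR, whose memory order in libyuv's naming is R,G,B,A, a pixel
   at p yields RGBToY(p[0], p[1], p[2]); the other variants just permute. */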
+LIBYUV_API +int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C; + void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) = + RGBAToYRow_C; + if (!src_rgba || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgba = src_rgba + (height - 1) * src_stride_rgba; + src_stride_rgba = -src_stride_rgba; + } +#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGBAToUVRow = RGBAToUVRow_Any_SSSE3; + RGBAToYRow = RGBAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_SSSE3; + RGBAToYRow = RGBAToYRow_SSSE3; + } + } +#endif +#if defined(HAS_RGBATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGBAToYRow = RGBAToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGBAToYRow = RGBAToYRow_NEON; + } + } +#endif +#if defined(HAS_RGBATOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGBAToUVRow = RGBAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); + RGBAToYRow(src_rgba, dst_y, width); + RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width); + src_rgba += src_stride_rgba * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width); + RGBAToYRow(src_rgba, dst_y, width); + } + return 0; +} + +// Convert RGB24 to I420. +LIBYUV_API +int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_RGB24TOYROW_NEON) + void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C; + void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) = + RGB24ToYRow_C; +#else + void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RGB24ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; +#endif + if (!src_rgb24 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + +// Neon version does direct RGB24 to YUV. +#if defined(HAS_RGB24TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToUVRow = RGB24ToUVRow_Any_NEON; + RGB24ToYRow = RGB24ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToYRow = RGB24ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24ToUVRow = RGB24ToUVRow_NEON; + } + } + } +// Other platforms do intermediate conversion from RGB24 to ARGB. 
+#else +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif + { + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RGB24TOYROW_NEON) + RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); + RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_rgb24 += src_stride_rgb24 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RGB24TOYROW_NEON) + RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RGB24TOYROW_NEON) + free_aligned_buffer_64(row); + } +#endif + return 0; +} + +// Convert RAW to I420. +LIBYUV_API +int RAWToI420(const uint8* src_raw, int src_stride_raw, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_RAWTOYROW_NEON) + void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C; + void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) = + RAWToYRow_C; +#else + void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RAWToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; +#endif + if (!src_raw || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + +// Neon version does direct RAW to YUV. +#if defined(HAS_RAWTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVRow = RAWToUVRow_Any_NEON; + RAWToYRow = RAWToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToYRow = RAWToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToUVRow = RAWToUVRow_NEON; + } + } + } +// Other platforms do intermediate conversion from RAW to ARGB. 
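align_buffer_64 and free_aligned_buffer_64 are libyuv utility macros declared in row.h, which this hunk does not show; a typical implementation over-allocates and rounds the pointer up to the next 64-byte boundary. A plausible stand-in, offered as an assumption rather than the real macro:

#include <stdint.h>
#include <stdlib.h>

/* Over-allocate by 63 bytes and round the pointer up to 64-byte alignment.
   The caller must free() the base pointer, not the aligned one. */
static uint8_t* AllocAligned64(size_t size, void** out_base) {
  uint8_t* base = (uint8_t*)malloc(size + 63);
  *out_base = base;
  if (!base) return NULL;
  return (uint8_t*)(((uintptr_t)base + 63) & ~(uintptr_t)63);
}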
+#else +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif + { + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RAWTOYROW_NEON) + RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); + RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RAWTOYROW_NEON) + RAWToUVRow(src_raw, 0, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); +#else + RAWToARGBRow(src_raw, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RAWTOYROW_NEON) + free_aligned_buffer_64(row); + } +#endif + return 0; +} + +// Convert RGB565 to I420. +LIBYUV_API +int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_RGB565TOYROW_NEON) + void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C; + void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) = + RGB565ToYRow_C; +#else + void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RGB565ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; +#endif + if (!src_rgb565 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + +// Neon version does direct RGB565 to YUV. +#if defined(HAS_RGB565TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB565ToUVRow = RGB565ToUVRow_Any_NEON; + RGB565ToYRow = RGB565ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB565ToYRow = RGB565ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + RGB565ToUVRow = RGB565ToUVRow_NEON; + } + } + } +// Other platforms do intermediate conversion from RGB565 to ARGB. 
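RGB565ToARGBRow expands 5- and 6-bit fields to 8 bits. The standard trick, which these kernels implement with shifts or equivalent multiplies, replicates each field's high bits into the vacated low bits so that 0 stays 0 and the field maximum maps to 255. A scalar model of one pixel, using libyuv's B,G,R,A memory order for ARGB:

#include <stdint.h>

/* Expand one RGB565 pixel: r5/g6/b5 fields widened by bit replication. */
static void RGB565ToARGBPixel(uint16_t p, uint8_t argb[4]) {
  uint8_t b5 = (uint8_t)(p & 0x1f);
  uint8_t g6 = (uint8_t)((p >> 5) & 0x3f);
  uint8_t r5 = (uint8_t)(p >> 11);
  argb[0] = (uint8_t)((b5 << 3) | (b5 >> 2));  /* B: 31 -> 255 */
  argb[1] = (uint8_t)((g6 << 2) | (g6 >> 4));  /* G: 63 -> 255 */
  argb[2] = (uint8_t)((r5 << 3) | (r5 >> 2));  /* R */
  argb[3] = 255;                               /* A: opaque */
}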
+#else +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif + { + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RGB565TOYROW_NEON) + RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); + RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); +#else + RGB565ToARGBRow(src_rgb565, row, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_rgb565 += src_stride_rgb565 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RGB565TOYROW_NEON) + RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); +#else + RGB565ToARGBRow(src_rgb565, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RGB565TOYROW_NEON) + free_aligned_buffer_64(row); + } +#endif + return 0; +} + +// Convert ARGB1555 to I420. +LIBYUV_API +int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_ARGB1555TOYROW_NEON) + void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C; + void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) = + ARGB1555ToYRow_C; +#else + void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + ARGB1555ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; +#endif + if (!src_argb1555 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + +// Neon version does direct ARGB1555 to YUV. 
+#if defined(HAS_ARGB1555TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; + ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToYRow = ARGB1555ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; + } + } + } +// Other platforms do intermediate conversion from ARGB1555 to ARGB. +#else +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif + { + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_ARGB1555TOYROW_NEON) + ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); + ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, + width); +#else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_argb1555 += src_stride_argb1555 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_ARGB1555TOYROW_NEON) + ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); +#else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_ARGB1555TOYROW_NEON) + free_aligned_buffer_64(row); + } +#endif + return 0; +} + +// Convert ARGB4444 to I420. 
+LIBYUV_API +int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_ARGB4444TOYROW_NEON) + void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C; + void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) = + ARGB4444ToYRow_C; +#else + void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + ARGB4444ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; +#endif + if (!src_argb4444 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + +// Neon version does direct ARGB4444 to YUV. +#if defined(HAS_ARGB4444TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; + ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToYRow = ARGB4444ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; + } + } + } +// Other platforms do intermediate conversion from ARGB4444 to ARGB. +#else +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif + { + // Allocate 2 rows of ARGB. 
+ const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); + ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, + width); +#else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_argb4444 += src_stride_argb4444 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); +#else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_ARGB4444TOYROW_NEON) + free_aligned_buffer_64(row); + } +#endif + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/convert_argb.cc b/libs/libaom/src/third_party/libyuv/source/convert_argb.cc new file mode 100644 index 000000000..44756bc41 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/convert_argb.cc @@ -0,0 +1,1155 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_argb.h" + +#include "libyuv/cpu_id.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/rotate_argb.h" +#include "libyuv/row.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB with optional flipping +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width * 4, height); + return 0; +} + +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I444ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I444ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_y == width && + src_stride_u == width && + src_stride_v == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I444TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToARGBRow = I444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444ToARGBRow = I444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToARGBRow = I444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I444ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I411 to ARGB. 
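+// (I411 is 4:1:1 sampling: the chroma planes are quarter width at full
+// height, which is why the coalesce test below checks
+// src_stride_u * 4 == width.)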
+LIBYUV_API +int I411ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I411ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I411ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 4 == width && + src_stride_v * 4 == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I411TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I411ToARGBRow = I411ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I411ToARGBRow = I411ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I411TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I411ToARGBRow = I411ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I411ToARGBRow = I411ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I411TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I411ToARGBRow = I411ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I411ToARGBRow = I411ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I411ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I400 to ARGB. +LIBYUV_API +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I400ToARGBRow)(const uint8* y_buf, + uint8* rgb_buf, + int width) = I400ToARGBRow_C; + if (!src_y || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_I400TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I400ToARGBRow = I400ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_I400TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I400ToARGBRow = I400ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I400ToARGBRow = I400ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I400TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I400ToARGBRow = I400ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I400ToARGBRow(src_y, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + } + return 0; +} + +// Convert J400 to ARGB. +LIBYUV_API +int J400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = + J400ToARGBRow_C; + if (!src_y || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
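+  // (The convention used throughout this file: passing height = -N requests
+  // a vertically flipped result. The affected pointer is moved to the last
+  // row and its stride negated, so the per-row loop is unchanged; e.g. for
+  // an assumed 4-row plane with stride 64: ptr += 3 * 64, stride = -64.)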
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_argb = 0;
+  }
+#if defined(HAS_J400TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    J400ToARGBRow = J400ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      J400ToARGBRow = J400ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    J400ToARGBRow = J400ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      J400ToARGBRow = J400ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_J400TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    J400ToARGBRow = J400ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      J400ToARGBRow = J400ToARGBRow_NEON;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    J400ToARGBRow(src_y, dst_argb, width);
+    src_y += src_stride_y;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Shuffle table for converting BGRA to ARGB.
+static uvec8 kShuffleMaskBGRAToARGB = {
+  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
+
+// Shuffle table for converting ABGR to ARGB.
+static uvec8 kShuffleMaskABGRToARGB = {
+  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Shuffle table for converting RGBA to ARGB.
+static uvec8 kShuffleMaskRGBAToARGB = {
+  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
+};
+
+// Convert BGRA to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskBGRAToARGB),
+                     width, height);
+}
+
+// Convert ARGB to BGRA (same as BGRAToARGB).
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskBGRAToARGB),
+                     width, height);
+}
+
+// Convert ABGR to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB),
+                     width, height);
+}
+
+// Convert ARGB to ABGR (same as ABGRToARGB).
+LIBYUV_API
+int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB),
+                     width, height);
+}
+
+// Convert RGBA to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_rgba, src_stride_rgba,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskRGBAToARGB),
+                     width, height);
+}
+
+// Convert RGB24 to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height) {
+  int y;
+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RGB24ToARGBRow_C;
+  if (!src_rgb24 || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+ if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + // Coalesce rows. + if (src_stride_rgb24 == width * 3 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_rgb24 = dst_stride_argb = 0; + } +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToARGBRow = RGB24ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + RGB24ToARGBRow(src_rgb24, dst_argb, width); + src_rgb24 += src_stride_rgb24; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RAW to ARGB. +LIBYUV_API +int RAWToARGB(const uint8* src_raw, int src_stride_raw, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RAWToARGBRow_C; + if (!src_raw || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + // Coalesce rows. + if (src_stride_raw == width * 3 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_raw = dst_stride_argb = 0; + } +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToARGBRow = RAWToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToARGBRow = RAWToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + RAWToARGBRow(src_raw, dst_argb, width); + src_raw += src_stride_raw; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RGB565 to ARGB. +LIBYUV_API +int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) = + RGB565ToARGBRow_C; + if (!src_rgb565 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + // Coalesce rows. 
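+  // (RGB565 packs one pixel into 16 bits (5 red, 6 green, 5 blue), so a
+  // contiguous source row is width * 2 bytes versus width * 4 for the ARGB
+  // output.)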
+ if (src_stride_rgb565 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_rgb565 = dst_stride_argb = 0; + } +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + RGB565ToARGBRow(src_rgb565, dst_argb, width); + src_rgb565 += src_stride_rgb565; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB1555 to ARGB. +LIBYUV_API +int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, + int pix) = ARGB1555ToARGBRow_C; + if (!src_argb1555 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + // Coalesce rows. + if (src_stride_argb1555 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb1555 = dst_stride_argb = 0; + } +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGB1555ToARGBRow(src_argb1555, dst_argb, width); + src_argb1555 += src_stride_argb1555; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB4444 to ARGB. +LIBYUV_API +int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, + int pix) = ARGB4444ToARGBRow_C; + if (!src_argb4444 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + // Coalesce rows. 
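+  // (ARGB4444 stores 4 bits per channel, 2 bytes per pixel, so the same
+  // width * 2 contiguity test applies as for the other 16-bit formats above.)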
+ if (src_stride_argb4444 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb4444 = dst_stride_argb = 0; + } +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGB4444ToARGBRow(src_argb4444, dst_argb, width); + src_argb4444 += src_stride_argb4444; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert NV12 to ARGB. +LIBYUV_API +int NV12ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*NV12ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToARGBRow = NV12ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToARGBRow = NV12ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToARGBRow(src_y, src_uv, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to ARGB. +LIBYUV_API +int NV21ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*NV21ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV21ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
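+  // (As in NV12ToARGB above, only the destination is flipped; the interleaved
+  // VU plane is read top-down and advanced every second row via the (y & 1)
+  // test, because 4:2:0 chroma has half the vertical resolution of luma.)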
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV21TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_NV21TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToARGBRow = NV21ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV21ToARGBRow = NV21ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_NV21TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV21ToARGBRow = NV21ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV21ToARGBRow(src_y, src_uv, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert M420 to ARGB. +LIBYUV_API +int M420ToARGB(const uint8* src_m420, int src_stride_m420, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*NV12ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToARGBRow_C; + if (!src_m420 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToARGBRow = NV12ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToARGBRow = NV12ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); + NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2, + dst_argb + dst_stride_argb, width); + dst_argb += dst_stride_argb * 2; + src_m420 += src_stride_m420 * 3; + } + if (height & 1) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); + } + return 0; +} + +// Convert YUY2 to ARGB. +LIBYUV_API +int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) = + YUY2ToARGBRow_C; + if (!src_yuy2 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. 
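+  // (YUY2 is packed 4:2:2: Y0 U Y1 V occupies 4 bytes per 2 pixels, an
+  // average of 2 bytes per pixel, hence the width * 2 contiguity test below.)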
+ if (src_stride_yuy2 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_argb = 0; + } +#if defined(HAS_YUY2TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + YUY2ToARGBRow = YUY2ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_YUY2TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToARGBRow = YUY2ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_NEON; + } + } +#endif + for (y = 0; y < height; ++y) { + YUY2ToARGBRow(src_yuy2, dst_argb, width); + src_yuy2 += src_stride_yuy2; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert UYVY to ARGB. +LIBYUV_API +int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) = + UYVYToARGBRow_C; + if (!src_uyvy || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. + if (src_stride_uyvy == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_argb = 0; + } +#if defined(HAS_UYVYTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + UYVYToARGBRow = UYVYToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_UYVYTOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToARGBRow = UYVYToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToARGBRow = UYVYToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UYVYToARGBRow = UYVYToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_NEON; + } + } +#endif + for (y = 0; y < height; ++y) { + UYVYToARGBRow(src_uyvy, dst_argb, width); + src_uyvy += src_stride_uyvy; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert J420 to ARGB. +LIBYUV_API +int J420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*J422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = J422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
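+  // (J420 carries full-range "JPEG" YUV, so the row functions selected below
+  // are the J422 variants with full-range coefficients rather than the
+  // limited-range I422 ones.)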
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_J422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + J422ToARGBRow = J422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + J422ToARGBRow = J422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_J422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + J422ToARGBRow = J422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + J422ToARGBRow = J422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_J422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + J422ToARGBRow = J422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + J422ToARGBRow = J422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_J422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + J422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert J422 to ARGB. +LIBYUV_API +int J422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*J422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = J422ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_J422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + J422ToARGBRow = J422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + J422ToARGBRow = J422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_J422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + J422ToARGBRow = J422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + J422ToARGBRow = J422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_J422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + J422ToARGBRow = J422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + J422ToARGBRow = J422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_J422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + J422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/convert_from.cc b/libs/libaom/src/third_party/libyuv/source/convert_from.cc new file mode 100644 index 000000000..31f1ac992 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/convert_from.cc @@ -0,0 +1,1348 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_from.h" + +#include "libyuv/basic_types.h" +#include "libyuv/convert.h" // For I420Copy +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/scale.h" // For ScalePlane() +#include "libyuv/video_common.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// I420 To any I4xx YUV format with mirroring. 
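+// SUBSAMPLE rounds away from zero so odd dimensions keep their extra sample,
+// and mirrored (negative) dimensions subsample symmetrically. Worked
+// examples: SUBSAMPLE(5, 1, 1) = (5 + 1) >> 1 = 3, and
+// SUBSAMPLE(-5, 1, 1) = -((5 + 1) >> 1) = -3.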
+static int I420ToI4xx(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_y_width, int src_y_height, + int dst_uv_width, int dst_uv_height) { + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); + const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); + if (src_y_width == 0 || src_y_height == 0 || + dst_uv_width <= 0 || dst_uv_height <= 0) { + return -1; + } + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, + dst_y, dst_stride_y, dst_y_width, dst_y_height, + kFilterBilinear); + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, + dst_u, dst_stride_u, dst_uv_width, dst_uv_height, + kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, + dst_v, dst_stride_v, dst_uv_width, dst_uv_height, + kFilterBilinear); + return 0; +} + +// 420 chroma is 1/2 width, 1/2 height +// 422 chroma is 1/2 width, 1x height +LIBYUV_API +int I420ToI422(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int dst_uv_width = (Abs(width) + 1) >> 1; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); +} + +// 420 chroma is 1/2 width, 1/2 height +// 444 chroma is 1x width, 1x height +LIBYUV_API +int I420ToI444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int dst_uv_width = Abs(width); + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); +} + +// 420 chroma is 1/2 width, 1/2 height +// 411 chroma is 1/4 width, 1x height +LIBYUV_API +int I420ToI411(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int dst_uv_width = (Abs(width) + 3) >> 2; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); +} + +// Copy to I400. Source can be I420,422,444,400,NV12,NV21 +LIBYUV_API +int I400Copy(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +LIBYUV_API +int I422ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height) { + int y; + void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_yuy2, int width) = + I422ToYUY2Row_C; + if (!src_y || !src_u || !src_v || !dst_yuy2 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_yuy2 == width * 2) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; + } +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_yuy2 += dst_stride_yuy2; + } + return 0; +} + +LIBYUV_API +int I420ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height) { + int y; + void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_yuy2, int width) = + I422ToYUY2Row_C; + if (!src_y || !src_u || !src_v || !dst_yuy2 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
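+  // (No row coalescing here: the loop below consumes two luma rows per
+  // chroma row, so the planes cannot be folded into one long row as in
+  // I422ToYUY2 above.)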
+ if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + I422ToYUY2Row(src_y + src_stride_y, src_u, src_v, + dst_yuy2 + dst_stride_yuy2, width); + src_y += src_stride_y * 2; + src_u += src_stride_u; + src_v += src_stride_v; + dst_yuy2 += dst_stride_yuy2 * 2; + } + if (height & 1) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + } + return 0; +} + +LIBYUV_API +int I422ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height) { + int y; + void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_uyvy, int width) = + I422ToUYVYRow_C; + if (!src_y || !src_u || !src_v || !dst_uyvy || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_uyvy == width * 2) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; + } +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_uyvy += dst_stride_uyvy; + } + return 0; +} + +LIBYUV_API +int I420ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height) { + int y; + void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_uyvy, int width) = + I422ToUYVYRow_C; + if (!src_y || !src_u || !src_v || !dst_uyvy || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
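+  // (Same two-rows-per-iteration structure as I420ToYUY2 above: each U/V row
+  // is reused for a pair of luma rows when expanding 4:2:0 to packed 4:2:2
+  // UYVY.)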
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+    I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
+                  dst_uyvy + dst_stride_uyvy, width);
+    src_y += src_stride_y * 2;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uyvy += dst_stride_uyvy * 2;
+  }
+  if (height & 1) {
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+  }
+  return 0;
+}
+
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_uv = -dst_stride_uv;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Coalesce rows.
+  if (src_stride_u == halfwidth &&
+      src_stride_v == halfwidth &&
+      dst_stride_uv == halfwidth * 2) {
+    halfwidth *= halfheight;
+    halfheight = 1;
+    src_stride_u = src_stride_v = dst_stride_uv = 0;
+  }
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  for (y = 0; y < halfheight; ++y) {
+    // Merge a row of U and V into a row of UV.
+    MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += dst_stride_uv;
+  }
+  return 0;
+}
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height) {
+  return I420ToNV12(src_y, src_stride_y,
+                    src_v, src_stride_v,
+                    src_u, src_stride_u,
+                    dst_y, dst_stride_y,
+                    dst_vu, dst_stride_vu,
+                    width, height);
+}
+
+// Convert I420 to ARGB.
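+// A minimal usage sketch (illustrative only; the buffers and sizes here are
+// assumptions, not part of the library):
+//   uint8 y[64 * 48], u[32 * 24], v[32 * 24], argb[64 * 48 * 4];
+//   I420ToARGB(y, 64, u, 32, v, 32, argb, 64 * 4, 64, 48);
+// The call returns 0 on success and -1 on a NULL pointer or bad size.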
+LIBYUV_API +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to BGRA. +LIBYUV_API +int I420ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height) { + int y; + void (*I422ToBGRARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToBGRARow_C; + if (!src_y || !src_u || !src_v || !dst_bgra || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; + dst_stride_bgra = -dst_stride_bgra; + } +#if defined(HAS_I422TOBGRAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToBGRARow = I422ToBGRARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOBGRAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToBGRARow = I422ToBGRARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToBGRARow = I422ToBGRARow_AVX2; + } + } +#endif +#if defined(HAS_I422TOBGRAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToBGRARow = I422ToBGRARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_NEON; + } + } +#endif +#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) { + I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width); + dst_bgra += dst_stride_bgra; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ABGR. +LIBYUV_API +int I420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + int y; + void (*I422ToABGRRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToABGRRow_C; + if (!src_y || !src_u || !src_v || !dst_abgr || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; + dst_stride_abgr = -dst_stride_abgr; + } +#if defined(HAS_I422TOABGRROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToABGRRow = I422ToABGRRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOABGRROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToABGRRow = I422ToABGRRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToABGRRow = I422ToABGRRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOABGRROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToABGRRow = I422ToABGRRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width); + dst_abgr += dst_stride_abgr; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGBA. +LIBYUV_API +int I420ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height) { + int y; + void (*I422ToRGBARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGBAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGBARow = I422ToRGBARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_AVX2; + } + } +#endif +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB24. +LIBYUV_API +int I420ToRGB24(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height) { + int y; + void (*I422ToRGB24Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGB24Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb24 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I422TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RAW. +LIBYUV_API +int I420ToRAW(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_raw, int dst_stride_raw, + int width, int height) { + int y; + void (*I422ToRAWRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRAWRow_C; + if (!src_y || !src_u || !src_v || !dst_raw || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
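+  // (RAW is 3 bytes per pixel in R, G, B memory order; RGB24 above is the
+  // same layout in B, G, R order, so only the row functions differ.)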
+ if (height < 0) { + height = -height; + dst_raw = dst_raw + (height - 1) * dst_stride_raw; + dst_stride_raw = -dst_stride_raw; + } +#if defined(HAS_I422TORAWROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRAWRow = I422ToRAWRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRAWRow = I422ToRAWRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORAWROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRAWRow = I422ToRAWRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRAWRow = I422ToRAWRow_AVX2; + } + } +#endif +#if defined(HAS_I422TORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRAWRow = I422ToRAWRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRAWRow = I422ToRAWRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRAWRow(src_y, src_u, src_v, dst_raw, width); + dst_raw += dst_stride_raw; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ARGB1555. +LIBYUV_API +int I420ToARGB1555(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height) { + int y; + void (*I422ToARGB1555Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGB1555Row_C; + if (!src_y || !src_u || !src_v || !dst_argb1555 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; + dst_stride_argb1555 = -dst_stride_argb1555; + } +#if defined(HAS_I422TOARGB1555ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB1555Row = I422ToARGB1555Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width); + dst_argb1555 += dst_stride_argb1555; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + + +// Convert I420 to ARGB4444. +LIBYUV_API +int I420ToARGB4444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height) { + int y; + void (*I422ToARGB4444Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGB4444Row_C; + if (!src_y || !src_u || !src_v || !dst_argb4444 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+  if (height < 0) {
+    height = -height;
+    dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+    dst_stride_argb4444 = -dst_stride_argb4444;
+  }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width);
+    dst_argb4444 += dst_stride_argb4444;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_u, int src_stride_u,
+                 const uint8* src_v, int src_stride_v,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  int y;
+  void (*I422ToRGB565Row)(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width) = I422ToRGB565Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB565Row = I422ToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+  0, 4, 1, 5,
+  6, 2, 7, 3,
+  1, 5, 0, 4,
+  7, 3, 6, 2,
+};
+
+// Convert I420 to RGB565 with dithering.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither4x4, int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;
+  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+      const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
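+  // (Each output scanline is converted to ARGB in a scratch row and then
+  // dithered down to 565. The dither constant is one 4-byte row of the 4x4
+  // matrix, selected as (y & 3) << 2, so the pattern repeats every 4 rows.)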
+ if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } + if (!dither4x4) { + dither4x4 = kDither565_4x4; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; + } + } +#endif + { + // Allocate a row of argb. + align_buffer_64(row_argb, width * 4); + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, row_argb, width); + ARGBToRGB565DitherRow(row_argb, dst_rgb565, + *(uint32*)(dither4x4 + ((y & 3) << 2)), width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + free_aligned_buffer_64(row_argb); + } + return 0; +} + +// Convert I420 to specified format +LIBYUV_API +int ConvertFromI420(const uint8* y, int y_stride, + const uint8* u, int u_stride, + const uint8* v, int v_stride, + uint8* dst_sample, int dst_sample_stride, + int width, int height, + uint32 fourcc) { + uint32 format = CanonicalFourCC(fourcc); + int r = 0; + if (!y || !u|| !v || !dst_sample || + width <= 0 || height == 0) { + return -1; + } + switch (format) { + // Single plane formats + case FOURCC_YUY2: + r = I420ToYUY2(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_UYVY: + r = I420ToUYVY(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_RGBP: + r = I420ToRGB565(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_RGBO: + r = I420ToARGB1555(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_R444: + r = I420ToARGB4444(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? 
dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_24BG: + r = I420ToRGB24(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, + width, height); + break; + case FOURCC_RAW: + r = I420ToRAW(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, + width, height); + break; + case FOURCC_ARGB: + r = I420ToARGB(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_BGRA: + r = I420ToBGRA(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_ABGR: + r = I420ToABGR(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_RGBA: + r = I420ToRGBA(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_I400: + r = I400Copy(y, y_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_NV12: { + uint8* dst_uv = dst_sample + width * height; + r = I420ToNV12(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + dst_uv, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + } + case FOURCC_NV21: { + uint8* dst_vu = dst_sample + width * height; + r = I420ToNV21(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + dst_vu, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + } + // TODO(fbarchard): Add M420. 
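+    // Illustrative note (added sketch): in the biplanar cases above,
+    // dst_sample holds the Y plane followed directly by the interleaved
+    // chroma plane.  For a hypothetical 640x480 NV12 frame at the default
+    // stride:
+    //   dst_y  = dst_sample;               // 480 rows of 640 luma bytes
+    //   dst_uv = dst_sample + 640 * 480;   // 240 rows of 640 bytes, UVUV...
+    // which is why dst_uv is computed as dst_sample + width * height.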
+ // Triplanar formats + // TODO(fbarchard): halfstride instead of halfwidth + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + int halfwidth = (width + 1) / 2; + int halfheight = (height + 1) / 2; + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV12) { + dst_v = dst_sample + width * height; + dst_u = dst_v + halfwidth * halfheight; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + halfwidth * halfheight; + } + r = I420Copy(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, halfwidth, + dst_v, halfwidth, + width, height); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + int halfwidth = (width + 1) / 2; + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV16) { + dst_v = dst_sample + width * height; + dst_u = dst_v + halfwidth * height; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + halfwidth * height; + } + r = I420ToI422(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, halfwidth, + dst_v, halfwidth, + width, height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV24) { + dst_v = dst_sample + width * height; + dst_u = dst_v + width * height; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + width * height; + } + r = I420ToI444(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, width, + dst_v, width, + width, height); + break; + } + case FOURCC_I411: { + int quarterwidth = (width + 3) / 4; + uint8* dst_u = dst_sample + width * height; + uint8* dst_v = dst_u + quarterwidth * height; + r = I420ToI411(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, quarterwidth, + dst_v, quarterwidth, + width, height); + break; + } + + // Formats not supported - MJPG, biplanar, some rgb formats. + default: + return -1; // unknown fourcc - return failure code. + } + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/convert_from_argb.cc b/libs/libaom/src/third_party/libyuv/source/convert_from_argb.cc new file mode 100644 index 000000000..8d1e97aec --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/convert_from_argb.cc @@ -0,0 +1,1301 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/convert_from_argb.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// ARGB little endian (bgra in memory) to I444 +LIBYUV_API +int ARGBToI444(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV444Row_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u == width && + dst_stride_v == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOUV444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUV444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUV444Row = ARGBToUV444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToUV444Row = ARGBToUV444Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUV444Row(src_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// ARGB little endian (bgra in memory) to I422 +LIBYUV_API +int ARGBToI422(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV422Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
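+  // Illustrative note (added sketch): when each plane's stride equals its
+  // packed row width, the rows are contiguous in memory and the whole image
+  // can be processed as one long row.  For a hypothetical 64x32 frame:
+  //   width  = 64 * 32;   // 2048 pixels, a single pass per row function
+  //   height = 1;         // strides no longer matter and are zeroed
+  // The output is identical; only the per-row loop overhead disappears.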
+ if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOUV422ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUV422Row(src_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// ARGB little endian (bgra in memory) to I411 +LIBYUV_API +int ARGBToI411(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV411Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u * 4 == width && + dst_stride_v * 4 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUV411ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUV411Row = ARGBToUV411Row_Any_NEON; + if (IS_ALIGNED(width, 32)) { + ARGBToUV411Row = ARGBToUV411Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUV411Row(src_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +LIBYUV_API +int ARGBToNV12(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) = MergeUVRow_C; + if (!src_argb || + !dst_y || !dst_uv || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
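+  // Illustrative note (added sketch): NV12 is produced in two stages per
+  // pair of rows.  ARGBToUVRow subsamples 2x2 blocks into separate planar
+  // U and V rows (halfwidth bytes each) in a temporary buffer, then
+  // MergeUVRow_ interleaves them into the semi-planar chroma plane:
+  //   ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+  //   MergeUVRow_(row_u, row_v, dst_uv, halfwidth);   // U0 V0 U1 V1 ...
+  // Two Y rows are converted for every one UV row, giving 4:2:0 sampling.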
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif + { + // Allocate a rows of uv. + align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8* row_v = row_u + ((halfwidth + 31) & ~31); + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); + } + return 0; +} + +// Same as NV12 but U and V swapped. +LIBYUV_API +int ARGBToNV21(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) = MergeUVRow_C; + if (!src_argb || + !dst_y || !dst_uv || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif + { + // Allocate a rows of uv. + align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8* row_v = row_u + ((halfwidth + 31) & ~31); + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); + } + return 0; +} + +// Convert ARGB to YUY2. +LIBYUV_API +int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height) { + int y; + void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV422Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C; + + if (!src_argb || !dst_yuy2 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_yuy2 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yuy2 = 0; + } +#if defined(HAS_ARGBTOUV422ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif + + { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8* row_u = row_y + ((width + 63) & ~63); + uint8* row_v = row_u + ((width + 63) & ~63) / 2; + + for (y = 0; y < height; ++y) { + ARGBToUV422Row(src_argb, row_u, row_v, width); + ARGBToYRow(src_argb, row_y, width); + I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width); + src_argb += src_stride_argb; + dst_yuy2 += dst_stride_yuy2; + } + + free_aligned_buffer_64(row_y); + } + return 0; +} + +// Convert ARGB to UYVY. +LIBYUV_API +int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height) { + int y; + void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV422Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C; + + if (!src_argb || !dst_uyvy || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_uyvy == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_uyvy = 0; + } +#if defined(HAS_ARGBTOUV422ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif + + { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8* row_u = row_y + ((width + 63) & ~63); + uint8* row_v = row_u + ((width + 63) & ~63) / 2; + + for (y = 0; y < height; ++y) { + ARGBToUV422Row(src_argb, row_u, row_v, width); + ARGBToYRow(src_argb, row_y, width); + I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width); + src_argb += src_stride_argb; + dst_uyvy += dst_stride_uyvy; + } + + free_aligned_buffer_64(row_y); + } + return 0; +} + +// Convert ARGB to I400. +LIBYUV_API +int ARGBToI400(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height) { + int y; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || !dst_y || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + } + return 0; +} + +// Shuffle table for converting ARGB to RGBA. +static uvec8 kShuffleMaskARGBToRGBA = { + 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u +}; + +// Convert ARGB to RGBA. 
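+// Illustrative note (added sketch): ARGBShuffle applies the mask as a
+// per-byte gather -- output byte i of each 16-byte group is input byte
+// mask[i].  libyuv ARGB is b,g,r,a in memory, so the leading entries
+// {3, 0, 1, 2} move the alpha byte in front of b,g,r, producing RGBA byte
+// order a,b,g,r; the pattern repeats for the remaining three pixels.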
+LIBYUV_API +int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height) { + return ARGBShuffle(src_argb, src_stride_argb, + dst_rgba, dst_stride_rgba, + (const uint8*)(&kShuffleMaskARGBToRGBA), + width, height); +} + +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height) { + int y; + void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRGB24Row_C; + if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_rgb24 == width * 3) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb24 = 0; + } +#if defined(HAS_ARGBTORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB24Row = ARGBToRGB24Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB24Row(src_argb, dst_rgb24, width); + src_argb += src_stride_argb; + dst_rgb24 += dst_stride_rgb24; + } + return 0; +} + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8* src_argb, int src_stride_argb, + uint8* dst_raw, int dst_stride_raw, + int width, int height) { + int y; + void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRAWRow_C; + if (!src_argb || !dst_raw || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_raw == width * 3) { + width *= height; + height = 1; + src_stride_argb = dst_stride_raw = 0; + } +#if defined(HAS_ARGBTORAWROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRAWRow = ARGBToRAWRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRAWRow = ARGBToRAWRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRAWRow(src_argb, dst_raw, width); + src_argb += src_stride_argb; + dst_raw += dst_stride_raw; + } + return 0; +} + +// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. +static const uint8 kDither565_4x4[16] = { + 0, 4, 1, 5, + 6, 2, 7, 3, + 1, 5, 0, 4, + 7, 3, 6, 2, +}; + +// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). 
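+// Illustrative note (added sketch): this is ordered (Bayer-style)
+// dithering.  A small position-dependent bias from the 4x4 table is added
+// before each channel is truncated to 5 or 6 bits, trading banding for
+// high-frequency noise.  Per-pixel math, with d in 0..7:
+//   r5 = clamp(r + d, 0, 255) >> 3;   // 8 -> 5 bits
+//   g6 = clamp(g + d, 0, 255) >> 2;   // 8 -> 6 bits
+//   b5 = clamp(b + d, 0, 255) >> 3;
+//   pix = (r5 << 11) | (g6 << 5) | b5;
+// One 4-byte row of the table is chosen per scanline via (y & 3) << 2.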
+LIBYUV_API +int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + const uint8* dither4x4, int width, int height) { + int y; + void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C; + if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + if (!dither4x4) { + dither4x4 = kDither565_4x4; + } +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; + } + } +#endif + for (y = 0; y < height; ++y) { + ARGBToRGB565DitherRow(src_argb, dst_rgb565, + *(uint32*)(dither4x4 + ((y & 3) << 2)), width); + src_argb += src_stride_argb; + dst_rgb565 += dst_stride_rgb565; + } + return 0; +} + +// Convert ARGB To RGB565. +// TODO(fbarchard): Consider using dither function low level with zeros. +LIBYUV_API +int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + int y; + void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRGB565Row_C; + if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_rgb565 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb565 = 0; + } +#if defined(HAS_ARGBTORGB565ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB565Row(src_argb, dst_rgb565, width); + src_argb += src_stride_argb; + dst_rgb565 += dst_stride_rgb565; + } + return 0; +} + +// Convert ARGB To ARGB1555. 
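+// Illustrative note (added sketch): ARGB1555 keeps a 1-bit alpha in the top
+// bit with five bits per color channel below it:
+//   pix = (a >> 7) << 15 | (r >> 3) << 10 | (g >> 3) << 5 | (b >> 3);
+// so any source alpha of 128 or more becomes opaque.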
+LIBYUV_API +int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height) { + int y; + void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToARGB1555Row_C; + if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb1555 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb1555 = 0; + } +#if defined(HAS_ARGBTOARGB1555ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToARGB1555Row(src_argb, dst_argb1555, width); + src_argb += src_stride_argb; + dst_argb1555 += dst_stride_argb1555; + } + return 0; +} + +// Convert ARGB To ARGB4444. +LIBYUV_API +int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height) { + int y; + void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToARGB4444Row_C; + if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb4444 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb4444 = 0; + } +#if defined(HAS_ARGBTOARGB4444ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOARGB4444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOARGB4444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToARGB4444Row(src_argb, dst_argb4444, width); + src_argb += src_stride_argb; + dst_argb4444 += dst_stride_argb4444; + } + return 0; +} + +// Convert ARGB to J420. (JPeg full range I420). 
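+// Illustrative note (added sketch): the J variants use full-range (JPEG)
+// BT.601 coefficients, so luma spans 0..255 rather than the studio-swing
+// 16..235 of the plain converters.  Approximately:
+//   Y  = 0.299 R + 0.587 G + 0.114 B        // ARGBToYJRow, full range
+//   Y' = 0.257 R + 0.504 G + 0.098 B + 16   // ARGBToYRow, video range
+// Chroma is likewise rescaled to cover the full 0..255 swing.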
+LIBYUV_API +int ARGBToJ420(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = + ARGBToYJRow_C; + if (!src_argb || + !dst_yj || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); + ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width); + src_argb += src_stride_argb * 2; + dst_yj += dst_stride_yj * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); + } + return 0; +} + +// ARGB little endian (bgra in memory) to J422 +LIBYUV_API +int ARGBToJ422(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUVJ422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUVJ422Row_C; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYJRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOUVJ422ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJ422Row = ARGBToUVJ422Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJ422Row = ARGBToUVJ422Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVJ422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJ422Row = ARGBToUVJ422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJ422Row = ARGBToUVJ422Row_NEON; + } + } +#endif + +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUVJ422Row(src_argb, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Convert ARGB to J400. +LIBYUV_API +int ARGBToJ400(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + int width, int height) { + int y; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yj = 0; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToYJRow(src_argb, dst_yj, width); + src_argb += src_stride_argb; + dst_yj += dst_stride_yj; + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/convert_jpeg.cc b/libs/libaom/src/third_party/libyuv/source/convert_jpeg.cc new file mode 100644 index 000000000..bcb980f7f --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/convert_jpeg.cc @@ -0,0 +1,392 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert.h" + +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#ifdef HAVE_JPEG +struct I420Buffers { + uint8* y; + int y_stride; + uint8* u; + int u_stride; + uint8* v; + int v_stride; + int w; + int h; +}; + +static void JpegCopyI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I420Copy(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI422ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I422ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI444ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I444ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI411ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I411ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI400ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I400ToI420(data[0], strides[0], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +// Query size of MJPG in pixels. +LIBYUV_API +int MJPGSize(const uint8* sample, size_t sample_size, + int* width, int* height) { + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret) { + *width = mjpeg_decoder.GetWidth(); + *height = mjpeg_decoder.GetHeight(); + } + mjpeg_decoder.UnloadFrame(); + return ret ? 0 : -1; // -1 for runtime failure. +} + +// MJPG (Motion JPeg) to I420 +// TODO(fbarchard): review w and h requirement. dw and dh may be enough. 
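+// Illustrative usage sketch (hypothetical caller and buffers, not part of
+// the library): probe the compressed frame for its dimensions, then decode
+// into preallocated I420 planes of that size:
+//   int w = 0, h = 0;
+//   if (MJPGSize(jpg, jpg_size, &w, &h) == 0) {
+//     MJPGToI420(jpg, jpg_size,
+//                dst_y, w, dst_u, (w + 1) / 2, dst_v, (w + 1) / 2,
+//                w, h, w, h);   // w,h: expected size; last pair: dest size
+//   }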
+LIBYUV_API +int MJPGToI420(const uint8* sample, + size_t sample_size, + uint8* y, int y_stride, + uint8* u, int u_stride, + uint8* v, int v_stride, + int w, int h, + int dw, int dh) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != w || + mjpeg_decoder.GetHeight() != h)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh }; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh); + // YUV411 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 4 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh); + } else { + // TODO(fbarchard): Implement conversion for any other colorspace/sample + // factors that occur in practice. 411 is supported by libjpeg + // ERROR: Unable to convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 
0 : 1; +} + +#ifdef HAVE_JPEG +struct ARGBBuffers { + uint8* argb; + int argb_stride; + int w; + int h; +}; + +static void JpegI420ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I420ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI422ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I422ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI444ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I444ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI411ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I411ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI400ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I400ToARGB(data[0], strides[0], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +// MJPG (Motion JPeg) to ARGB +// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +LIBYUV_API +int MJPGToARGB(const uint8* sample, + size_t sample_size, + uint8* argb, int argb_stride, + int w, int h, + int dw, int dh) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. 
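+  // Illustrative note (added sketch): the chain of tests below infers the
+  // JPEG chroma subsampling from the per-component sampling factors and
+  // picks the matching row converter, mirroring MJPGToI420 above:
+  //   component 0 horiz x vert:  2x2 -> YUV420,  2x1 -> YUV422,
+  //   1x1 -> YUV444,  4x1 -> YUV411;  one grayscale component -> YUV400.
+  // Anything else is rejected as an unsupported MJPEG format.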
+ MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != w || + mjpeg_decoder.GetHeight() != h)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + ARGBBuffers bufs = { argb, argb_stride, dw, dh }; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh); + // YUV411 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 4 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh); + } else { + // TODO(fbarchard): Implement conversion for any other colorspace/sample + // factors that occur in practice. 411 is supported by libjpeg + // ERROR: Unable to convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 0 : 1; +} +#endif + +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/convert_to_argb.cc b/libs/libaom/src/third_party/libyuv/source/convert_to_argb.cc new file mode 100644 index 000000000..af829fbd3 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/convert_to_argb.cc @@ -0,0 +1,306 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_argb.h" + +#include "libyuv/cpu_id.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/rotate_argb.h" +#include "libyuv/row.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// src_width is used for source stride computation +// src_height is used to compute location of planes, and indicate inversion +// sample_size is measured in bytes and is the size of the frame. +// With MJPEG it is the compressed size of the frame. +LIBYUV_API +int ConvertToARGB(const uint8* sample, size_t sample_size, + uint8* crop_argb, int argb_stride, + int crop_x, int crop_y, + int src_width, int src_height, + int crop_width, int crop_height, + enum RotationMode rotation, + uint32 fourcc) { + uint32 format = CanonicalFourCC(fourcc); + int aligned_src_width = (src_width + 1) & ~1; + const uint8* src; + const uint8* src_uv; + int abs_src_height = (src_height < 0) ? -src_height : src_height; + int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height; + int r = 0; + + // One pass rotation is available for some formats. For the rest, convert + // to I420 (with optional vertical flipping) into a temporary I420 buffer, + // and then rotate the I420 to the final destination buffer. + // For in-place conversion, if destination crop_argb is same as source sample, + // also enable temporary buffer. + LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) || + crop_argb == sample; + uint8* tmp_argb = crop_argb; + int tmp_argb_stride = argb_stride; + uint8* rotate_buffer = NULL; + int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; + + if (crop_argb == NULL || sample == NULL || + src_width <= 0 || crop_width <= 0 || + src_height == 0 || crop_height == 0) { + return -1; + } + if (src_height < 0) { + inv_crop_height = -inv_crop_height; + } + + if (need_buf) { + int argb_size = crop_width * abs_crop_height * 4; + rotate_buffer = (uint8*)malloc(argb_size); + if (!rotate_buffer) { + return 1; // Out of memory runtime error. 
+    }
+    crop_argb = rotate_buffer;
+    argb_stride = crop_width * 4;  // ARGB strides are in bytes: 4 per pixel.
+  }
+
+  switch (format) {
+    // Single plane formats
+    case FOURCC_YUY2:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = YUY2ToARGB(src, aligned_src_width * 2,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_UYVY:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = UYVYToARGB(src, aligned_src_width * 2,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_24BG:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RGB24ToARGB(src, src_width * 3,
+                      crop_argb, argb_stride,
+                      crop_width, inv_crop_height);
+      break;
+    case FOURCC_RAW:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RAWToARGB(src, src_width * 3,
+                    crop_argb, argb_stride,
+                    crop_width, inv_crop_height);
+      break;
+    case FOURCC_ARGB:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ARGBToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_BGRA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = BGRAToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_ABGR:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ABGRToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = RGBAToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBP:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = RGB565ToARGB(src, src_width * 2,
+                       crop_argb, argb_stride,
+                       crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBO:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB1555ToARGB(src, src_width * 2,
+                         crop_argb, argb_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_R444:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB4444ToARGB(src, src_width * 2,
+                         crop_argb, argb_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_I400:
+      src = sample + src_width * crop_y + crop_x;
+      r = I400ToARGB(src, src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+
+    // Biplanar formats
+    case FOURCC_NV12:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      r = NV12ToARGB(src, src_width,
+                     src_uv, aligned_src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_NV21:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      // NV21 stores its chroma plane in VU order; NV21ToARGB reads the
+      // swapped ordering directly.
+ r = NV21ToARGB(src, src_width, + src_uv, aligned_src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + r = M420ToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + const uint8* src_y = sample + (src_width * crop_y + crop_x); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420ToARGB(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + + case FOURCC_J420: { + const uint8* src_y = sample + (src_width * crop_y + crop_x); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + r = J420ToARGB(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + + case FOURCC_I422: + case FOURCC_YV16: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } + r = I422ToARGB(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + if (format == FOURCC_YV24) { + src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } else { + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } + r = I444ToARGB(src_y, src_width, + src_u, src_width, + src_v, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I411: { + int quarterwidth = (src_width + 3) / 4; + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u = sample + src_width * abs_src_height + + quarterwidth * crop_y + crop_x / 4; + const uint8* src_v = sample + src_width * abs_src_height + + quarterwidth * (abs_src_height + crop_y) + crop_x / 4; + r = I411ToARGB(src_y, src_width, + src_u, quarterwidth, + src_v, quarterwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } 
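+    // (Illustrative note, not part of the original source: each triplanar
+    // case above offsets into the chroma planes with the same pattern.
+    // For a 640x480 I420 frame cropped at (crop_x, crop_y) = (16, 8):
+    // halfwidth = 320, the U plane begins at 640 * 480 = 307200 bytes, and
+    // the cropped U pointer is sample + 307200 + (320 * 8 + 16) / 2 =
+    // sample + 308488, i.e. crop_y / 2 rows of 320 bytes plus crop_x / 2
+    // bytes into the plane.)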
+#ifdef HAVE_JPEG
+    case FOURCC_MJPG:
+      r = MJPGToARGB(sample, sample_size,
+                     crop_argb, argb_stride,
+                     src_width, abs_src_height, crop_width, inv_crop_height);
+      break;
+#endif
+    default:
+      r = -1;  // unknown fourcc - return failure code.
+  }
+
+  if (need_buf) {
+    if (!r) {
+      r = ARGBRotate(crop_argb, argb_stride,
+                     tmp_argb, tmp_argb_stride,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/convert_to_i420.cc b/libs/libaom/src/third_party/libyuv/source/convert_to_i420.cc
new file mode 100644
index 000000000..5e75369b5
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/convert_to_i420.cc
@@ -0,0 +1,339 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "libyuv/convert.h"
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation.
+// src_height is used to compute location of planes, and indicate inversion.
+// sample_size is measured in bytes and is the size of the frame.
+//   With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToI420(const uint8* sample,
+                  size_t sample_size,
+                  uint8* y, int y_stride,
+                  uint8* u, int u_stride,
+                  uint8* v, int v_stride,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
+  const uint8* src;
+  const uint8* src_uv;
+  int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
+      format != FOURCC_NV12 && format != FOURCC_NV21 &&
+      format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
+  uint8* tmp_y = y;
+  uint8* tmp_u = u;
+  uint8* tmp_v = v;
+  int tmp_y_stride = y_stride;
+  int tmp_u_stride = u_stride;
+  int tmp_v_stride = v_stride;
+  uint8* rotate_buffer = NULL;
+  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+  if (!y || !u || !v || !sample ||
+      src_width <= 0 || crop_width <= 0 ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+  if (src_height < 0) {
+    inv_crop_height = -inv_crop_height;
+  }
+
+  // One pass rotation is available for some formats. For the rest, convert
+  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+  // and then rotate the I420 to the final destination buffer.
+  // For in-place conversion, if the destination y is the same as the source
+  // sample, also enable the temporary buffer.
+  if (need_buf) {
+    int y_size = crop_width * abs_crop_height;
+    int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+    rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
+    if (!rotate_buffer) {
+      return 1;  // Out of memory runtime error.
+ } + y = rotate_buffer; + u = y + y_size; + v = u + uv_size; + y_stride = crop_width; + u_stride = v_stride = ((crop_width + 1) / 2); + } + + switch (format) { + // Single plane formats + case FOURCC_YUY2: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = YUY2ToI420(src, aligned_src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_UYVY: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = UYVYToI420(src, aligned_src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBO: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB1555ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_R444: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB4444ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_24BG: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RGB24ToI420(src, src_width * 3, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RAW: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RAWToI420(src, src_width * 3, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_ARGB: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_BGRA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = BGRAToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = RGBAToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_I400: + src = sample + src_width * crop_y + crop_x; + r = I400ToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + // Biplanar formats + case FOURCC_NV12: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + (src_width * src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + r = NV12ToI420Rotate(src, src_width, + src_uv, aligned_src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height, rotation); + break; + case FOURCC_NV21: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + (src_width * src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + // Call NV12 but with u and v parameters swapped. 
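+      // (Editor's note: NV21 interleaves chroma as VU rather than UV, so
+      // handing the destination u and v pointers to the NV12 routine in
+      // swapped order routes each chroma byte to the correct plane.)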
+ r = NV12ToI420Rotate(src, src_width, + src_uv, aligned_src_width, + y, y_stride, + v, v_stride, + u, u_stride, + crop_width, inv_crop_height, rotation); + break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + r = M420ToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + const uint8* src_y = sample + (src_width * crop_y + crop_x); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420Rotate(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height, rotation); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } + r = I422ToI420(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + if (format == FOURCC_YV24) { + src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } else { + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } + r = I444ToI420(src_y, src_width, + src_u, src_width, + src_v, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I411: { + int quarterwidth = (src_width + 3) / 4; + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u = sample + src_width * abs_src_height + + quarterwidth * crop_y + crop_x / 4; + const uint8* src_v = sample + src_width * abs_src_height + + quarterwidth * (abs_src_height + crop_y) + crop_x / 4; + r = I411ToI420(src_y, src_width, + src_u, quarterwidth, + src_v, quarterwidth, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + } +#ifdef HAVE_JPEG + case FOURCC_MJPG: + r = MJPGToI420(sample, sample_size, + y, y_stride, + u, u_stride, + v, v_stride, + src_width, abs_src_height, crop_width, inv_crop_height); + break; +#endif + default: + r = -1; // unknown fourcc - return failure code. 
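+      // (Editor's note: throughout this file, negative return values signal
+      // invalid parameters and positive values signal runtime failures such
+      // as the out-of-memory case above.)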
+  }
+
+  if (need_buf) {
+    if (!r) {
+      r = I420Rotate(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     tmp_y, tmp_y_stride,
+                     tmp_u, tmp_u_stride,
+                     tmp_v, tmp_v_stride,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/cpu_id.cc b/libs/libaom/src/third_party/libyuv/source/cpu_id.cc
new file mode 100644
index 000000000..72f686e3b
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/cpu_id.cc
@@ -0,0 +1,307 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/cpu_id.h"
+
+#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+#include <intrin.h>  // For __cpuidex()
+#endif
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
+    defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219)
+#include <immintrin.h>  // For _xgetbv()
+#endif
+
+#if !defined(__native_client__)
+#include <stdlib.h>  // For getenv()
+#endif
+
+// For ArmCpuCaps() but unittested on all platforms
+#include <stdio.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h"  // For CPU_X86
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// For functions that use the stack and have runtime checks for overflow,
+// use SAFEBUFFERS to avoid additional check.
+#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
+#define SAFEBUFFERS __declspec(safebuffers)
+#else
+#define SAFEBUFFERS
+#endif
+
+// Low level cpuid for X86.
+#if (defined(_M_IX86) || defined(_M_X64) || \
+    defined(__i386__) || defined(__x86_64__)) && \
+    !defined(__pnacl__) && !defined(__CLR_VER)
+LIBYUV_API
+void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
+#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+// Visual C version uses intrinsic or inline x86 assembly.
+#if (_MSC_FULL_VER >= 160040219)
+  __cpuidex((int*)(cpu_info), info_eax, info_ecx);
+#elif defined(_M_IX86)
+  __asm {
+    mov        eax, info_eax
+    mov        ecx, info_ecx
+    mov        edi, cpu_info
+    cpuid
+    mov        [edi], eax
+    mov        [edi + 4], ebx
+    mov        [edi + 8], ecx
+    mov        [edi + 12], edx
+  }
+#else
+  if (info_ecx == 0) {
+    __cpuid((int*)(cpu_info), info_eax);
+  } else {
+    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
+  }
+#endif
+// GCC version uses inline x86 assembly.
+#else  // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+  uint32 info_ebx, info_edx;
+  asm volatile (  // NOLINT
+#if defined( __i386__) && defined(__PIC__)
+    // Preserve ebx for fpic 32 bit.
+    "mov %%ebx, %%edi                          \n"
+    "cpuid                                     \n"
+    "xchg %%edi, %%ebx                         \n"
+    : "=D" (info_ebx),
+#else
+    "cpuid                                     \n"
+    : "=b" (info_ebx),
+#endif  //  defined( __i386__) && defined(__PIC__)
+      "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
+  cpu_info[0] = info_eax;
+  cpu_info[1] = info_ebx;
+  cpu_info[2] = info_ecx;
+  cpu_info[3] = info_edx;
+#endif  // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+}
+#else  // (defined(_M_IX86) || defined(_M_X64) ...
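+// (Illustrative note, not part of the original source: on targets without
+// cpuid, the stub below reports all-zero registers, so every x86 feature
+// bit stays clear and the portable C code paths are used.)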
+LIBYUV_API +void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { + cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; +} +#endif + +// TODO(fbarchard): Enable xgetbv when validator supports it. +#if (defined(_M_IX86) || defined(_M_X64) || \ + defined(__i386__) || defined(__x86_64__)) && \ + !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) +#define HAS_XGETBV +// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. +int TestOsSaveYmm() { + uint32 xcr0 = 0u; +#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219) + xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required. +#elif defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__) + __asm { + xor ecx, ecx // xcr 0 + _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. + mov xcr0, eax + } +#elif defined(__i386__) || defined(__x86_64__) + asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx"); +#endif // defined(__i386__) || defined(__x86_64__) + return((xcr0 & 6) == 6); // Is ymm saved? +} +#endif // defined(_M_IX86) || defined(_M_X64) .. + +// based on libaom arm_cpudetect.c +// For Arm, but public to allow testing on any CPU +LIBYUV_API SAFEBUFFERS +int ArmCpuCaps(const char* cpuinfo_name) { + char cpuinfo_line[512]; + FILE* f = fopen(cpuinfo_name, "r"); + if (!f) { + // Assume Neon if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return kCpuHasNEON; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "Features", 8) == 0) { + char* p = strstr(cpuinfo_line, " neon"); + if (p && (p[5] == ' ' || p[5] == '\n')) { + fclose(f); + return kCpuHasNEON; + } + // aarch64 uses asimd for Neon. + p = strstr(cpuinfo_line, " asimd"); + if (p && (p[6] == ' ' || p[6] == '\n')) { + fclose(f); + return kCpuHasNEON; + } + } + } + fclose(f); + return 0; +} + +#if defined(__mips__) && defined(__linux__) +static int MipsCpuCaps(const char* search_string) { + char cpuinfo_line[512]; + const char* file_name = "/proc/cpuinfo"; + FILE* f = fopen(file_name, "r"); + if (!f) { + // Assume DSP if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return kCpuHasMIPS_DSP; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) { + if (strstr(cpuinfo_line, search_string) != NULL) { + fclose(f); + return kCpuHasMIPS_DSP; + } + } + fclose(f); + return 0; +} +#endif + +// CPU detect function for SIMD instruction sets. +LIBYUV_API +int cpu_info_ = kCpuInit; // cpu_info is not initialized yet. + +// Test environment variable for disabling CPU features. Any non-zero value +// to disable. Zero ignored to make it easy to set the variable on/off. +#if !defined(__native_client__) && !defined(_M_ARM) + +static LIBYUV_BOOL TestEnv(const char* name) { + const char* var = getenv(name); + if (var) { + if (var[0] != '0') { + return LIBYUV_TRUE; + } + } + return LIBYUV_FALSE; +} +#else // nacl does not support getenv(). +static LIBYUV_BOOL TestEnv(const char*) { + return LIBYUV_FALSE; +} +#endif + +LIBYUV_API SAFEBUFFERS +int InitCpuFlags(void) { +#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86) + + uint32 cpu_info0[4] = { 0, 0, 0, 0 }; + uint32 cpu_info1[4] = { 0, 0, 0, 0 }; + uint32 cpu_info7[4] = { 0, 0, 0, 0 }; + CpuId(0, 0, cpu_info0); + CpuId(1, 0, cpu_info1); + if (cpu_info0[0] >= 7) { + CpuId(7, 0, cpu_info7); + } + cpu_info_ = ((cpu_info1[3] & 0x04000000) ? 
kCpuHasSSE2 : 0) | + ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | + ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | + ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | + ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | + ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | + kCpuHasX86; + +#ifdef HAS_XGETBV + if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave + TestOsSaveYmm()) { // Saves YMM. + cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | + kCpuHasAVX; + } +#endif + // Environment variable overrides for testing. + if (TestEnv("LIBYUV_DISABLE_X86")) { + cpu_info_ &= ~kCpuHasX86; + } + if (TestEnv("LIBYUV_DISABLE_SSE2")) { + cpu_info_ &= ~kCpuHasSSE2; + } + if (TestEnv("LIBYUV_DISABLE_SSSE3")) { + cpu_info_ &= ~kCpuHasSSSE3; + } + if (TestEnv("LIBYUV_DISABLE_SSE41")) { + cpu_info_ &= ~kCpuHasSSE41; + } + if (TestEnv("LIBYUV_DISABLE_SSE42")) { + cpu_info_ &= ~kCpuHasSSE42; + } + if (TestEnv("LIBYUV_DISABLE_AVX")) { + cpu_info_ &= ~kCpuHasAVX; + } + if (TestEnv("LIBYUV_DISABLE_AVX2")) { + cpu_info_ &= ~kCpuHasAVX2; + } + if (TestEnv("LIBYUV_DISABLE_ERMS")) { + cpu_info_ &= ~kCpuHasERMS; + } + if (TestEnv("LIBYUV_DISABLE_FMA3")) { + cpu_info_ &= ~kCpuHasFMA3; + } +#endif +#if defined(__mips__) && defined(__linux__) + // Linux mips parse text file for dsp detect. + cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP. +#if defined(__mips_dspr2) + cpu_info_ |= kCpuHasMIPS_DSPR2; +#endif + cpu_info_ |= kCpuHasMIPS; + + if (getenv("LIBYUV_DISABLE_MIPS")) { + cpu_info_ &= ~kCpuHasMIPS; + } + if (getenv("LIBYUV_DISABLE_MIPS_DSP")) { + cpu_info_ &= ~kCpuHasMIPS_DSP; + } + if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) { + cpu_info_ &= ~kCpuHasMIPS_DSPR2; + } +#endif +#if defined(__arm__) || defined(__aarch64__) +// gcc -mfpu=neon defines __ARM_NEON__ +// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. +// For Linux, /proc/cpuinfo can be tested but without that assume Neon. +#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) + cpu_info_ = kCpuHasNEON; +// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon +// flag in it. +// So for aarch64, neon enabling is hard coded here. +#endif +#if defined(__aarch64__) + cpu_info_ = kCpuHasNEON; +#else + // Linux arm parse text file for neon detect. + cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); +#endif + cpu_info_ |= kCpuHasARM; + if (TestEnv("LIBYUV_DISABLE_NEON")) { + cpu_info_ &= ~kCpuHasNEON; + } +#endif // __arm__ + if (TestEnv("LIBYUV_DISABLE_ASM")) { + cpu_info_ = 0; + } + return cpu_info_; +} + +LIBYUV_API +void MaskCpuFlags(int enable_flags) { + cpu_info_ = InitCpuFlags() & enable_flags; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/mjpeg_decoder.cc b/libs/libaom/src/third_party/libyuv/source/mjpeg_decoder.cc new file mode 100644 index 000000000..75f8a610e --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/mjpeg_decoder.cc @@ -0,0 +1,572 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef HAVE_JPEG
+#include <assert.h>
+
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+// Must be included before jpeglib.
+#include <setjmp.h>
+#define HAVE_SETJMP
+
+#if defined(_MSC_VER)
+// disable warning 4324: structure was padded due to __declspec(align())
+#pragma warning(disable:4324)
+#endif
+
+#endif
+struct FILE;  // For jpeglib.h.
+
+// C++ build requires extern C for jpeg internals.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <jpeglib.h>
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#include "libyuv/planar_functions.h"  // For CopyPlane().
+
+namespace libyuv {
+
+#ifdef HAVE_SETJMP
+struct SetJmpErrorMgr {
+  jpeg_error_mgr base;  // Must be at the top
+  jmp_buf setjmp_buffer;
+};
+#endif
+
+const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
+const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
+const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
+const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
+const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
+const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
+
+// Methods that are passed to jpeglib.
+boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
+void init_source(jpeg_decompress_struct* cinfo);
+void skip_input_data(jpeg_decompress_struct* cinfo,
+                     long num_bytes);  // NOLINT
+void term_source(jpeg_decompress_struct* cinfo);
+void ErrorHandler(jpeg_common_struct* cinfo);
+
+MJpegDecoder::MJpegDecoder()
+    : has_scanline_padding_(LIBYUV_FALSE),
+      num_outbufs_(0),
+      scanlines_(NULL),
+      scanlines_sizes_(NULL),
+      databuf_(NULL),
+      databuf_strides_(NULL) {
+  decompress_struct_ = new jpeg_decompress_struct;
+  source_mgr_ = new jpeg_source_mgr;
+#ifdef HAVE_SETJMP
+  error_mgr_ = new SetJmpErrorMgr;
+  decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
+  // Override standard exit()-based error handler.
+  error_mgr_->base.error_exit = &ErrorHandler;
+#endif
+  decompress_struct_->client_data = NULL;
+  source_mgr_->init_source = &init_source;
+  source_mgr_->fill_input_buffer = &fill_input_buffer;
+  source_mgr_->skip_input_data = &skip_input_data;
+  source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
+  source_mgr_->term_source = &term_source;
+  jpeg_create_decompress(decompress_struct_);
+  decompress_struct_->src = source_mgr_;
+  buf_vec_.buffers = &buf_;
+  buf_vec_.len = 1;
+}
+
+MJpegDecoder::~MJpegDecoder() {
+  jpeg_destroy_decompress(decompress_struct_);
+  delete decompress_struct_;
+  delete source_mgr_;
+#ifdef HAVE_SETJMP
+  delete error_mgr_;
+#endif
+  DestroyOutputBuffers();
+}
+
+LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+  if (!ValidateJpeg(src, src_len)) {
+    return LIBYUV_FALSE;
+  }
+
+  buf_.data = src;
+  buf_.len = static_cast<int>(src_len);
+  buf_vec_.pos = 0;
+  decompress_struct_->client_data = &buf_vec_;
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called jpeg_read_header, it experienced an error, and we called
+    // longjmp() and rewound the stack to here. Return error.
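+    // (Editor's sketch of the recovery pattern, not part of the original
+    // source; error_mgr_ is the SetJmpErrorMgr installed in the constructor:
+    //   if (setjmp(error_mgr_->setjmp_buffer)) return LIBYUV_FALSE;
+    //   jpeg_read_header(...);  // on error, ErrorHandler() longjmp()s back
+    // Control then re-enters the setjmp() call above with a nonzero value.)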
+    return LIBYUV_FALSE;
+  }
+#endif
+  if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
+    // ERROR: Bad MJPEG header
+    return LIBYUV_FALSE;
+  }
+  AllocOutputBuffers(GetNumComponents());
+  for (int i = 0; i < num_outbufs_; ++i) {
+    int scanlines_size = GetComponentScanlinesPerImcuRow(i);
+    if (scanlines_sizes_[i] != scanlines_size) {
+      if (scanlines_[i]) {
+        delete [] scanlines_[i];
+      }
+      scanlines_[i] = new uint8* [scanlines_size];
+      scanlines_sizes_[i] = scanlines_size;
+    }
+
+    // We allocate padding for the final scanline to pad it up to DCTSIZE
+    // bytes to avoid memory errors, since jpeglib only reads full MCU
+    // blocks. For the preceding scanlines, the padding is not needed/wanted
+    // because the following addresses will already be valid (they are the
+    // initial bytes of the next scanline) and will be overwritten when
+    // jpeglib writes out that next scanline.
+    int databuf_stride = GetComponentStride(i);
+    int databuf_size = scanlines_size * databuf_stride;
+    if (databuf_strides_[i] != databuf_stride) {
+      if (databuf_[i]) {
+        delete [] databuf_[i];
+      }
+      databuf_[i] = new uint8[databuf_size];
+      databuf_strides_[i] = databuf_stride;
+    }
+
+    if (GetComponentStride(i) != GetComponentWidth(i)) {
+      has_scanline_padding_ = LIBYUV_TRUE;
+    }
+  }
+  return LIBYUV_TRUE;
+}
+
+static int DivideAndRoundUp(int numerator, int denominator) {
+  return (numerator + denominator - 1) / denominator;
+}
+
+static int DivideAndRoundDown(int numerator, int denominator) {
+  return numerator / denominator;
+}
+
+// Returns width of the last loaded frame.
+int MJpegDecoder::GetWidth() {
+  return decompress_struct_->image_width;
+}
+
+// Returns height of the last loaded frame.
+int MJpegDecoder::GetHeight() {
+  return decompress_struct_->image_height;
+}
+
+// Returns format of the last loaded frame. The return value is one of the
+// kColorSpace* constants.
+int MJpegDecoder::GetColorSpace() {
+  return decompress_struct_->jpeg_color_space;
+}
+
+// Number of color components in the color space.
+int MJpegDecoder::GetNumComponents() {
+  return decompress_struct_->num_components;
+}
+
+// Sample factors of the n-th component.
+int MJpegDecoder::GetHorizSampFactor(int component) { + return decompress_struct_->comp_info[component].h_samp_factor; +} + +int MJpegDecoder::GetVertSampFactor(int component) { + return decompress_struct_->comp_info[component].v_samp_factor; +} + +int MJpegDecoder::GetHorizSubSampFactor(int component) { + return decompress_struct_->max_h_samp_factor / + GetHorizSampFactor(component); +} + +int MJpegDecoder::GetVertSubSampFactor(int component) { + return decompress_struct_->max_v_samp_factor / + GetVertSampFactor(component); +} + +int MJpegDecoder::GetImageScanlinesPerImcuRow() { + return decompress_struct_->max_v_samp_factor * DCTSIZE; +} + +int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) { + int vs = GetVertSubSampFactor(component); + return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs); +} + +int MJpegDecoder::GetComponentWidth(int component) { + int hs = GetHorizSubSampFactor(component); + return DivideAndRoundUp(GetWidth(), hs); +} + +int MJpegDecoder::GetComponentHeight(int component) { + int vs = GetVertSubSampFactor(component); + return DivideAndRoundUp(GetHeight(), vs); +} + +// Get width in bytes padded out to a multiple of DCTSIZE +int MJpegDecoder::GetComponentStride(int component) { + return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1); +} + +int MJpegDecoder::GetComponentSize(int component) { + return GetComponentWidth(component) * GetComponentHeight(component); +} + +LIBYUV_BOOL MJpegDecoder::UnloadFrame() { +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called jpeg_abort_decompress, it experienced an error, and we called + // longjmp() and rewound the stack to here. Return error. + return LIBYUV_FALSE; + } +#endif + jpeg_abort_decompress(decompress_struct_); + return LIBYUV_TRUE; +} + +// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height. +LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( + uint8** planes, int dst_width, int dst_height) { + if (dst_width != GetWidth() || + dst_height > GetHeight()) { + // ERROR: Bad dimensions + return LIBYUV_FALSE; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. + // Return error. + return LIBYUV_FALSE; + } +#endif + if (!StartDecode()) { + return LIBYUV_FALSE; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // Compute amount of lines to skip to implement vertical crop. + // TODO(fbarchard): Ensure skip is a multiple of maximum component + // subsample. ie 2 + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + // There is no API to skip lines in the output data, so we read them + // into the temp buffer. + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. Must read it and then + // copy the parts we want into the destination. 
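+      // (Illustrative note, not part of the original source: with 4:2:0
+      // sampling an iMCU row is 2 * DCTSIZE = 16 luma rows; a leftover skip
+      // of 6 therefore copies luma rows 6..15 of the freshly decoded
+      // buffers and, for the half-height chroma planes, chroma rows 3..7.)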
+ if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = + DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) - + rows_to_skip; + int data_to_skip = rows_to_skip * GetComponentStride(i); + CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), + planes[i], GetComponentWidth(i), + GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + lines_left -= (GetImageScanlinesPerImcuRow() - skip); + } + } + + // Read full MCUs but cropped horizontally + for (; lines_left > GetImageScanlinesPerImcuRow(); + lines_left -= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i); + CopyPlane(databuf_[i], GetComponentStride(i), + planes[i], GetComponentWidth(i), + GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + + if (lines_left > 0) { + // Have a partial iMCU row left over to decode. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = + DivideAndRoundUp(lines_left, GetVertSubSampFactor(i)); + CopyPlane(databuf_[i], GetComponentStride(i), + planes[i], GetComponentWidth(i), + GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + return FinishDecode(); +} + +LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque, + int dst_width, int dst_height) { + if (dst_width != GetWidth() || + dst_height > GetHeight()) { + // ERROR: Bad dimensions + return LIBYUV_FALSE; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. + // Return error. + return LIBYUV_FALSE; + } +#endif + if (!StartDecode()) { + return LIBYUV_FALSE; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int data_to_skip = rows_to_skip * GetComponentStride(i); + // Change our own data buffer pointers so we can pass them to the + // callback. + databuf_[i] += data_to_skip; + } + int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip; + (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy); + // Now change them back. 
+      for (int i = 0; i < num_outbufs_; ++i) {
+        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int data_to_skip = rows_to_skip * GetComponentStride(i);
+        databuf_[i] -= data_to_skip;
+      }
+      lines_left -= scanlines_to_copy;
+    }
+  }
+  // Read full MCUs until we get to the crop point.
+  for (; lines_left >= GetImageScanlinesPerImcuRow();
+         lines_left -= GetImageScanlinesPerImcuRow()) {
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
+  }
+  if (lines_left > 0) {
+    // Have a partial iMCU row left over to decode.
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    (*fn)(opaque, databuf_, databuf_strides_, lines_left);
+  }
+  return FinishDecode();
+}
+
+void init_source(j_decompress_ptr cinfo) {
+  fill_input_buffer(cinfo);
+}
+
+boolean fill_input_buffer(j_decompress_ptr cinfo) {
+  BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
+  if (buf_vec->pos >= buf_vec->len) {
+    assert(0 && "No more data");
+    // ERROR: No more data
+    return FALSE;
+  }
+  cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data;
+  cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len;
+  ++buf_vec->pos;
+  return TRUE;
+}
+
+void skip_input_data(j_decompress_ptr cinfo,
+                     long num_bytes) {  // NOLINT
+  cinfo->src->next_input_byte += num_bytes;
+}
+
+void term_source(j_decompress_ptr cinfo) {
+  // Nothing to do.
+}
+
+#ifdef HAVE_SETJMP
+void ErrorHandler(j_common_ptr cinfo) {
+  // This is called when a jpeglib command experiences an error.
+  // Unfortunately jpeglib's error handling model is not very flexible,
+  // because it expects the error handler to not return--i.e., it wants the
+  // program to terminate. To recover from errors we use setjmp() as shown
+  // in their example. setjmp() is C's implementation for the "call with
+  // current continuation" functionality seen in some functional programming
+  // languages.
+  // A formatted message can be output, but is unsafe for release.
+#ifdef DEBUG
+  char buf[JMSG_LENGTH_MAX];
+  (*cinfo->err->format_message)(cinfo, buf);
+  // ERROR: Error in jpeglib: buf
+#endif
+
+  SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
+  // This rewinds the call stack to the point of the corresponding setjmp()
+  // and causes it to return (for a second time) with value 1.
+  longjmp(mgr->setjmp_buffer, 1);
+}
+#endif
+
+void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
+  if (num_outbufs != num_outbufs_) {
+    // We could perhaps optimize this case to resize the output buffers
+    // without necessarily having to delete and recreate each one, but it's
+    // not worth it.
+    DestroyOutputBuffers();
+
+    scanlines_ = new uint8** [num_outbufs];
+    scanlines_sizes_ = new int[num_outbufs];
+    databuf_ = new uint8* [num_outbufs];
+    databuf_strides_ = new int[num_outbufs];
+
+    for (int i = 0; i < num_outbufs; ++i) {
+      scanlines_[i] = NULL;
+      scanlines_sizes_[i] = 0;
+      databuf_[i] = NULL;
+      databuf_strides_[i] = 0;
+    }
+
+    num_outbufs_ = num_outbufs;
+  }
+}
+
+void MJpegDecoder::DestroyOutputBuffers() {
+  for (int i = 0; i < num_outbufs_; ++i) {
+    delete [] scanlines_[i];
+    delete [] databuf_[i];
+  }
+  delete [] scanlines_;
+  delete [] databuf_;
+  delete [] scanlines_sizes_;
+  delete [] databuf_strides_;
+  scanlines_ = NULL;
+  databuf_ = NULL;
+  scanlines_sizes_ = NULL;
+  databuf_strides_ = NULL;
+  num_outbufs_ = 0;
+}
+
+// JDCT_IFAST and do_block_smoothing improve performance substantially.
+LIBYUV_BOOL MJpegDecoder::StartDecode() {
+  decompress_struct_->raw_data_out = TRUE;
+  decompress_struct_->dct_method = JDCT_IFAST;  // JDCT_ISLOW is default
+  decompress_struct_->dither_mode = JDITHER_NONE;
+  // Not applicable to 'raw':
+  decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
+  // Only for buffered mode:
+  decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
+  // Blocky but fast:
+  decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
+
+  if (!jpeg_start_decompress(decompress_struct_)) {
+    // ERROR: Couldn't start JPEG decompressor
+    return LIBYUV_FALSE;
+  }
+  return LIBYUV_TRUE;
+}
+
+LIBYUV_BOOL MJpegDecoder::FinishDecode() {
+  // jpeglib considers it an error if we finish without decoding the whole
+  // image, so we call "abort" rather than "finish".
+  jpeg_abort_decompress(decompress_struct_);
+  return LIBYUV_TRUE;
+}
+
+void MJpegDecoder::SetScanlinePointers(uint8** data) {
+  for (int i = 0; i < num_outbufs_; ++i) {
+    uint8* data_i = data[i];
+    for (int j = 0; j < scanlines_sizes_[i]; ++j) {
+      scanlines_[i][j] = data_i;
+      data_i += GetComponentStride(i);
+    }
+  }
+}
+
+inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
+  return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
+      jpeg_read_raw_data(decompress_struct_,
+                         scanlines_,
+                         GetImageScanlinesPerImcuRow());
+}
+
+// Helper function that recognizes the jpeg sub-sampling type.
+JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
+    int* subsample_x, int* subsample_y, int number_of_components) {
+  if (number_of_components == 3) {  // Color images.
+    if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+        subsample_x[1] == 2 && subsample_y[1] == 2 &&
+        subsample_x[2] == 2 && subsample_y[2] == 2) {
+      return kJpegYuv420;
+    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+        subsample_x[1] == 2 && subsample_y[1] == 1 &&
+        subsample_x[2] == 2 && subsample_y[2] == 1) {
+      return kJpegYuv422;
+    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+        subsample_x[1] == 1 && subsample_y[1] == 1 &&
+        subsample_x[2] == 1 && subsample_y[2] == 1) {
+      return kJpegYuv444;
+    }
+  } else if (number_of_components == 1) {  // Grey-scale images.
+    if (subsample_x[0] == 1 && subsample_y[0] == 1) {
+      return kJpegYuv400;
+    }
+  }
+  return kJpegUnknown;
+}
+
+}  // namespace libyuv
+#endif  // HAVE_JPEG
+
diff --git a/libs/libaom/src/third_party/libyuv/source/mjpeg_validate.cc b/libs/libaom/src/third_party/libyuv/source/mjpeg_validate.cc
new file mode 100644
index 000000000..8edfbe1e7
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/mjpeg_validate.cc
@@ -0,0 +1,101 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#include <string.h>  // For memchr.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Enable this to try scasb implementation.
+// #define ENABLE_SCASB 1
+
+#ifdef ENABLE_SCASB
+
+// Multiple of 1.
+__declspec(naked)
+const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
+  __asm {
+    mov        edx, edi
+    mov        edi, [esp + 4]   // src
+    mov        eax, [esp + 8]   // val
+    mov        ecx, [esp + 12]  // count
+    repne scasb
+    jne        sr99
+    mov        eax, edi
+    sub        eax, 1
+    mov        edi, edx
+    ret
+
+  sr99:
+    mov        eax, 0
+    mov        edi, edx
+    ret
+  }
+}
+#endif
+
+// Helper function to scan for EOI marker.
+static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
+  const uint8* end = sample + sample_size - 1;
+  const uint8* it = sample;
+  for (;;) {
+#ifdef ENABLE_SCASB
+    it = ScanRow_ERMS(it, 0xff, end - it);
+#else
+    it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
+#endif
+    if (it == NULL) {
+      break;
+    }
+    if (it[1] == 0xd9) {
+      return LIBYUV_TRUE;  // Success: Valid jpeg.
+    }
+    ++it;  // Skip over current 0xff.
+  }
+  // ERROR: Invalid jpeg end code not found. Size sample_size
+  return LIBYUV_FALSE;
+}
+
+// Helper function to validate the jpeg appears intact.
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+  const size_t kBackSearchSize = 1024;
+  if (sample_size < 64) {
+    // ERROR: Invalid jpeg size: sample_size
+    return LIBYUV_FALSE;
+  }
+  if (sample[0] != 0xff || sample[1] != 0xd8) {  // Start Of Image
+    // ERROR: Invalid jpeg initial start code
+    return LIBYUV_FALSE;
+  }
+  // Step over SOI marker.
+  sample += 2;
+  sample_size -= 2;
+
+  // Look for the End Of Image (EOI) marker in the end kilobyte of the buffer.
+  if (sample_size > kBackSearchSize) {
+    if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
+      return LIBYUV_TRUE;  // Success: Valid jpeg.
+    }
+    // Reduce search size for forward search.
+    sample_size = sample_size - kBackSearchSize + 1;
+  }
+  return ScanEOI(sample, sample_size);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/libs/libaom/src/third_party/libyuv/source/planar_functions.cc b/libs/libaom/src/third_party/libyuv/source/planar_functions.cc
new file mode 100644
index 000000000..b96bd5020
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/planar_functions.cc
@@ -0,0 +1,2555 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/planar_functions.h"
+
+#include <string.h>  // for memset()
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of data
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  int y;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Nothing to do.
+  if (src_y == dst_y && src_stride_y == dst_stride_y) {
+    return;
+  }
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ?
CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Copy plane + for (y = 0; y < height; ++y) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +LIBYUV_API +void CopyPlane_16(const uint16* src_y, int src_stride_y, + uint16* dst_y, int dst_stride_y, + int width, int height) { + int y; + void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C; + // Coalesce rows. + if (src_stride_y == width && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_COPYROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_16_SSE2; + } +#endif +#if defined(HAS_COPYROW_16_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_16_ERMS; + } +#endif +#if defined(HAS_COPYROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_16_NEON; + } +#endif +#if defined(HAS_COPYROW_16_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_16_MIPS; + } +#endif + + // Copy plane + for (y = 0; y < height; ++y) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Copy I422. +LIBYUV_API +int I422Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int halfwidth = (width + 1) >> 1; + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); + return 0; +} + +// Copy I444. +LIBYUV_API +int I444Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; +} + +// Copy I400. 
+LIBYUV_API +int I400ToI400(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Convert I420 to I400. +LIBYUV_API +int I420ToI400(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Mirror a plane of data. +void MirrorPlane(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + int y; + void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MirrorRow = MirrorRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_NEON; + } + } +#endif +#if defined(HAS_MIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MirrorRow = MirrorRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSE2; + } + } +#endif +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorRow = MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSSE3; + } + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorRow = MirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } + } +#endif +// TODO(fbarchard): Mirror on mips handle unaligned memory. +#if defined(HAS_MIRRORROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) { + MirrorRow = MirrorRow_MIPS_DSPR2; + } +#endif + + // Mirror plane + for (y = 0; y < height; ++y) { + MirrorRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*YUY2ToUV422Row)(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) = + YUY2ToUV422Row_C; + void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) = + YUY2ToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. 
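+  // (Illustrative note, not part of the original source: when every stride
+  // equals the packed row width, the rows are contiguous in memory, so the
+  // whole image can be processed as a single row of width * height pixels,
+  // avoiding per-row dispatch overhead.)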
+ if (src_stride_yuy2 == width * 2 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUV422Row = YUY2ToUV422Row_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (width >= 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUV422Row = YUY2ToUV422Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Convert UYVY to I422. +LIBYUV_API +int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*UYVYToUV422Row)(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) = + UYVYToUV422Row_C; + void (*UYVYToYRow)(const uint8* src_uyvy, + uint8* dst_y, int pix) = UYVYToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. + if (src_stride_uyvy == width * 2 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + UYVYToUV422Row = UYVYToUV422Row_SSE2; + UYVYToYRow = UYVYToYRow_SSE2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToUV422Row = UYVYToUV422Row_Any_AVX2; + UYVYToYRow = UYVYToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToUV422Row = UYVYToUV422Row_AVX2; + UYVYToYRow = UYVYToYRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UYVYToYRow = UYVYToYRow_Any_NEON; + if (width >= 16) { + UYVYToUV422Row = UYVYToUV422Row_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUV422Row = UYVYToUV422Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + src_uyvy += src_stride_uyvy; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Mirror I400 with optional flipping +LIBYUV_API +int I400Mirror(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
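+  // (Editor's note: the idiom below points src_y at the last row and
+  // negates the stride, so iterating rows 0..height-1 walks the source
+  // bottom-up.)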
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+
+  MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+// Mirror I420 with optional flipping
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  if (dst_y) {
+    MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+  MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+  return 0;
+}
+
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+      ARGBMirrorRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBMIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMirrorRow = ARGBMirrorRow_AVX2;
+    }
+  }
+#endif
+
+  // Mirror plane
+  for (y = 0; y < height; ++y) {
+    ARGBMirrorRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Get a blender that is optimized for the CPU and pixel count.
+// As there are 6 blenders to choose from, the caller should try to use
+// the same blend function for all pixels if possible.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend() {
+  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width) = ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBBlendRow = ARGBBlendRow_SSSE3;
+    return ARGBBlendRow;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBBlendRow = ARGBBlendRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBBlendRow = ARGBBlendRow_NEON;
+  }
+#endif
+  return ARGBBlendRow;
+}
+
+// Alpha Blend 2 ARGB images and store to destination.
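+// A minimal usage sketch (buffer names are illustrative): ARGBBlend
+// expects the foreground in premultiplied (attenuated) form, per the
+// blend formulas documented later in this file, so a straight-alpha
+// source would first pass through ARGBAttenuate:
+//   ARGBAttenuate(fg, fg_stride, fg_pre, fg_stride, width, height);
+//   ARGBBlend(fg_pre, fg_stride, bg, bg_stride, dst, dst_stride,
+//             width, height);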
+LIBYUV_API +int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width) = GetARGBBlend(); + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } + + for (y = 0; y < height; ++y) { + ARGBBlendRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Multiply 2 ARGB images and store to destination. +LIBYUV_API +int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst, + int width) = ARGBMultiplyRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBMULTIPLYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_NEON; + } + } +#endif + + // Multiply plane + for (y = 0; y < height; ++y) { + ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Add 2 ARGB images and store to destination. +LIBYUV_API +int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst, + int width) = ARGBAddRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
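+  // Note: after the row coalesce below there are two alternative SSE2
+  // blocks. Visual C++ builds take the full-width ARGBAddRow_SSE2 kernel
+  // directly, while other compilers route odd widths through the
+  // ARGBAddRow_Any_SSE2 wrapper first. Each channel adds with unsigned
+  // saturation: dst = min(a + b, 255).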
+ if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__)) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBAddRow = ARGBAddRow_SSE2; + } +#endif +#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__)) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBAddRow = ARGBAddRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBAddRow = ARGBAddRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBADDROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAddRow = ARGBAddRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBADDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAddRow = ARGBAddRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_NEON; + } + } +#endif + + // Add plane + for (y = 0; y < height; ++y) { + ARGBAddRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Subtract 2 ARGB images and store to destination. +LIBYUV_API +int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst, + int width) = ARGBSubtractRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSUBTRACTROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBSubtractRow = ARGBSubtractRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBSubtractRow = ARGBSubtractRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBSubtractRow = ARGBSubtractRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBSubtractRow = ARGBSubtractRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_NEON; + } + } +#endif + + // Subtract plane + for (y = 0; y < height; ++y) { + ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height) { + int y; + void (*I422ToBGRARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToBGRARow_C; + if (!src_y || !src_u || !src_v || + !dst_bgra || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
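+  // The RGB converters implement the flip on the output side: the
+  // destination pointer starts at the last row and its stride is negated,
+  // which yields the same vertical flip as inverting the source.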
+ if (height < 0) { + height = -height; + dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; + dst_stride_bgra = -dst_stride_bgra; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_bgra == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0; + } +#if defined(HAS_I422TOBGRAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToBGRARow = I422ToBGRARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOBGRAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToBGRARow = I422ToBGRARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToBGRARow = I422ToBGRARow_AVX2; + } + } +#endif +#if defined(HAS_I422TOBGRAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToBGRARow = I422ToBGRARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_NEON; + } + } +#endif +#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) { + I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width); + dst_bgra += dst_stride_bgra; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to ABGR. +LIBYUV_API +int I422ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + int y; + void (*I422ToABGRRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToABGRRow_C; + if (!src_y || !src_u || !src_v || + !dst_abgr || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; + dst_stride_abgr = -dst_stride_abgr; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_abgr == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0; + } +#if defined(HAS_I422TOABGRROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToABGRRow = I422ToABGRRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_NEON; + } + } +#endif +#if defined(HAS_I422TOABGRROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToABGRRow = I422ToABGRRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOABGRROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToABGRRow = I422ToABGRRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToABGRRow = I422ToABGRRow_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width); + dst_abgr += dst_stride_abgr; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to RGBA. 
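+// The I422To* converters all share this structure; only the row
+// function's output channel order differs. For reference, the row kernels
+// implement (approximately) the BT.601 studio-swing matrix:
+//   R = 1.164 * (Y - 16) + 1.596 * (V - 128)
+//   G = 1.164 * (Y - 16) - 0.813 * (V - 128) - 0.391 * (U - 128)
+//   B = 1.164 * (Y - 16) + 2.018 * (U - 128)
+// with results clamped to [0, 255]; the exact fixed-point constants live
+// in the row implementations.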
+LIBYUV_API +int I422ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height) { + int y; + void (*I422ToRGBARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || + !dst_rgba || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_rgba == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0; + } +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGBAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGBARow = I422ToRGBARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert NV12 to RGB565. +LIBYUV_API +int NV12ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + int y; + void (*NV12ToRGB565Row)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToRGB565Row_C; + if (!src_y || !src_uv || !dst_rgb565 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_NV12TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_NV12TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToRGB565Row = NV12ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_NV12TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToRGB565Row(src_y, src_uv, dst_rgb565, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to RGB565. 
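+// NV21 is NV12 with the chroma bytes swapped (VU instead of UV). As in
+// NV12ToRGB565 above, chroma is subsampled 2x2, so the chroma row pointer
+// advances only after odd luma rows (the y & 1 step below): each VU row
+// is shared by two consecutive luma rows.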
+LIBYUV_API +int NV21ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + int y; + void (*NV21ToRGB565Row)(const uint8* y_buf, + const uint8* src_vu, + uint8* rgb_buf, + int width) = NV21ToRGB565Row_C; + if (!src_y || !src_vu || !dst_rgb565 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_NV21TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV21ToRGB565Row = NV21ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_NV21TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToRGB565Row = NV21ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV21ToRGB565Row = NV21ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_NV21TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToRGB565Row = NV21ToRGB565Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV21ToRGB565Row(src_y, src_vu, dst_rgb565, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + +LIBYUV_API +void SetPlane(uint8* dst_y, int dst_stride_y, + int width, int height, + uint32 value) { + int y; + void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C; + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (dst_stride_y == width) { + width *= height; + height = 1; + dst_stride_y = 0; + } +#if defined(HAS_SETROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SetRow = SetRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SetRow = SetRow_NEON; + } + } +#endif +#if defined(HAS_SETROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + SetRow = SetRow_Any_X86; + if (IS_ALIGNED(width, 4)) { + SetRow = SetRow_X86; + } + } +#endif +#if defined(HAS_SETROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + SetRow = SetRow_ERMS; + } +#endif + + // Set plane + for (y = 0; y < height; ++y) { + SetRow(dst_y, value, width); + dst_y += dst_stride_y; + } +} + +// Draw a rectangle into I420 +LIBYUV_API +int I420Rect(uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int x, int y, + int width, int height, + int value_y, int value_u, int value_v) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + uint8* start_y = dst_y + y * dst_stride_y + x; + uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); + uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); + if (!dst_y || !dst_u || !dst_v || + width <= 0 || height == 0 || + x < 0 || y < 0 || + value_y < 0 || value_y > 255 || + value_u < 0 || value_u > 255 || + value_v < 0 || value_v > 255) { + return -1; + } + + SetPlane(start_y, dst_stride_y, width, height, value_y); + SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u); + SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v); + return 0; +} + +// Draw a rectangle into ARGB +LIBYUV_API +int ARGBRect(uint8* dst_argb, int dst_stride_argb, + int dst_x, int dst_y, + int width, int height, + uint32 value) { + int y; + void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C; + if (!dst_argb || + width 
<= 0 || height == 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  dst_argb += dst_y * dst_stride_argb + dst_x * 4;
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+
+#if defined(HAS_ARGBSETROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBSetRow = ARGBSetRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSETROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    ARGBSetRow = ARGBSetRow_X86;
+  }
+#endif
+
+  // Set plane
+  for (y = 0; y < height; ++y) {
+    ARGBSetRow(dst_argb, value, width);
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+// An unattenuated ARGB alpha blend uses the formula
+// p = a * f + (1 - a) * b
+// where
+//   p is output pixel
+//   f is foreground pixel
+//   b is background pixel
+//   a is alpha value from foreground pixel
+// A preattenuated ARGB alpha blend uses the formula
+// p = f + (1 - a) * b
+// where
+//   f is foreground pixel premultiplied by alpha
+
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height) {
+  int y;
+  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+                           int width) = ARGBAttenuateRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBATTENUATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBAttenuateRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height) {
+  int y;
+  void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+                             int width) = ARGBUnattenuateRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+ if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBUNATTENUATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBUNATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2; + } + } +#endif +// TODO(fbarchard): Neon version. + + for (y = 0; y < height; ++y) { + ARGBUnattenuateRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB to Grayed ARGB. +LIBYUV_API +int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBGrayRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBGRAYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_SSSE3; + } +#endif +#if defined(HAS_ARGBGRAYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_NEON; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBGrayRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Make a rectangle of ARGB gray scale. +LIBYUV_API +int ARGBGray(uint8* dst_argb, int dst_stride_argb, + int dst_x, int dst_y, + int width, int height) { + int y; + void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBGrayRow_C; + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBGRAYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_SSSE3; + } +#endif +#if defined(HAS_ARGBGRAYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_NEON; + } +#endif + for (y = 0; y < height; ++y) { + ARGBGrayRow(dst, dst, width); + dst += dst_stride_argb; + } + return 0; +} + +// Make a rectangle of ARGB Sepia tone. +LIBYUV_API +int ARGBSepia(uint8* dst_argb, int dst_stride_argb, + int dst_x, int dst_y, int width, int height) { + int y; + void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C; + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. 
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSEPIAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    ARGBSepiaRow = ARGBSepiaRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBSEPIAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBSepiaRow = ARGBSepiaRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBSepiaRow(dst, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a 4x4 matrix to each ARGB pixel.
+// Note: Normally for shading, but can be used to swizzle or invert.
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    const int8* matrix_argb,
+                    int width, int height) {
+  int y;
+  void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
+      const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
+  if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a 4x3 matrix to each ARGB pixel.
+// Deprecated.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+                   const int8* matrix_rgb,
+                   int dst_x, int dst_y, int width, int height) {
+  SIMD_ALIGNED(int8 matrix_argb[16]);
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+
+  // Convert 4x3 7 bit matrix to 4x4 6 bit matrix.
+  matrix_argb[0] = matrix_rgb[0] / 2;
+  matrix_argb[1] = matrix_rgb[1] / 2;
+  matrix_argb[2] = matrix_rgb[2] / 2;
+  matrix_argb[3] = matrix_rgb[3] / 2;
+  matrix_argb[4] = matrix_rgb[4] / 2;
+  matrix_argb[5] = matrix_rgb[5] / 2;
+  matrix_argb[6] = matrix_rgb[6] / 2;
+  matrix_argb[7] = matrix_rgb[7] / 2;
+  matrix_argb[8] = matrix_rgb[8] / 2;
+  matrix_argb[9] = matrix_rgb[9] / 2;
+  matrix_argb[10] = matrix_rgb[10] / 2;
+  matrix_argb[11] = matrix_rgb[11] / 2;
+  matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
+  matrix_argb[15] = 64;  // 1.0
+
+  return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
+                         dst, dst_stride_argb,
+                         &matrix_argb[0], width, height);
+}
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                   const uint8* table_argb,
+                   int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+                            int width) = ARGBColorTableRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    ARGBColorTableRow = ARGBColorTableRow_X86;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBColorTableRow(dst, table_argb, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a color table to each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                  const uint8* table_argb,
+                  int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+                           int width) = RGBColorTableRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_RGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    RGBColorTableRow = RGBColorTableRow_X86;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    RGBColorTableRow(dst, table_argb, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// ARGBQuantize is used to posterize art.
+// e.g. rgb / qvalue * qvalue + qvalue / 2
+// But the low levels implement it efficiently with 3 parameters, and could
+// be used for other high level operations.
+// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+// where scale is 1 / interval_size as a fixed point value.
+// The divide is replaced with a fixed point multiply by the reciprocal.
+// Caveat - although SSE2 saturates, the C function does not and should be
+// used with care if doing anything but quantization.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+                 int scale, int interval_size, int interval_offset,
+                 int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) = ARGBQuantizeRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
+      interval_size < 1 || interval_size > 255) {
+    return -1;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBQUANTIZEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+    ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBQUANTIZEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBQuantizeRow = ARGBQuantizeRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry. Used by ARGBBlur.
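+// With the cumulative sum C(x, y) = sum of all pixels at (i <= x, j <= y),
+// any axis-aligned box can be summed in constant time:
+//   sum(x0..x1, y0..y1) = C(x1, y1) - C(x0 - 1, y1)
+//                       - C(x1, y0 - 1) + C(x0 - 1, y0 - 1)
+// which is what lets ARGBBlur average a (2 * radius + 1) wide window per
+// pixel without rescanning it.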
+LIBYUV_API +int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height) { + int y; + void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; + int32* previous_cumsum = dst_cumsum; + if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { + return -1; + } +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + } +#endif + memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel. + for (y = 0; y < height; ++y) { + ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width); + previous_cumsum = dst_cumsum; + dst_cumsum += dst_stride32_cumsum; + src_argb += src_stride_argb; + } + return 0; +} + +// Blur ARGB image. +// Caller should allocate CumulativeSum table of width * height * 16 bytes +// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory +// as the buffer is treated as circular. +LIBYUV_API +int ARGBBlur(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height, int radius) { + int y; + void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum, + const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; + void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C; + int32* cumsum_bot_row; + int32* max_cumsum_bot_row; + int32* cumsum_top_row; + + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + if (radius > height) { + radius = height; + } + if (radius > (width / 2 - 1)) { + radius = width / 2 - 1; + } + if (radius <= 0) { + return -1; + } +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; + } +#endif + // Compute enough CumulativeSum for first row to be blurred. After this + // one row of CumulativeSum is updated at a time. + ARGBComputeCumulativeSum(src_argb, src_stride_argb, + dst_cumsum, dst_stride32_cumsum, + width, radius); + + src_argb = src_argb + radius * src_stride_argb; + cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum]; + + max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum]; + cumsum_top_row = &dst_cumsum[0]; + + for (y = 0; y < height; ++y) { + int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0; + int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1); + int area = radius * (bot_y - top_y); + int boxwidth = radius * 4; + int x; + int n; + + // Increment cumsum_top_row pointer with circular buffer wrap around. + if (top_y) { + cumsum_top_row += dst_stride32_cumsum; + if (cumsum_top_row >= max_cumsum_bot_row) { + cumsum_top_row = dst_cumsum; + } + } + // Increment cumsum_bot_row pointer with circular buffer wrap around and + // then fill in a row of CumulativeSum. 
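+    // Only radius * 2 + 2 cumulative-sum rows are live at once, so both
+    // row pointers wrap back to dst_cumsum when they reach
+    // max_cumsum_bot_row; the table is reused as a circular buffer.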
+ if ((y + radius) < height) { + const int32* prev_cumsum_bot_row = cumsum_bot_row; + cumsum_bot_row += dst_stride32_cumsum; + if (cumsum_bot_row >= max_cumsum_bot_row) { + cumsum_bot_row = dst_cumsum; + } + ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row, + width); + src_argb += src_stride_argb; + } + + // Left clipped. + for (x = 0; x < radius + 1; ++x) { + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, + boxwidth, area, &dst_argb[x * 4], 1); + area += (bot_y - top_y); + boxwidth += 4; + } + + // Middle unclipped. + n = (width - 1) - radius - x + 1; + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, + boxwidth, area, &dst_argb[x * 4], n); + + // Right clipped. + for (x += n; x <= width - 1; ++x) { + area -= (bot_y - top_y); + boxwidth -= 4; + CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, + cumsum_bot_row + (x - radius - 1) * 4, + boxwidth, area, &dst_argb[x * 4], 1); + } + dst_argb += dst_stride_argb; + } + return 0; +} + +// Multiply ARGB image by a specified ARGB value. +LIBYUV_API +int ARGBShade(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, uint32 value) { + int y; + void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, + int width, uint32 value) = ARGBShadeRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSHADEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_SSE2; + } +#endif +#if defined(HAS_ARGBSHADEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBShadeRow = ARGBShadeRow_NEON; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBShadeRow(src_argb, dst_argb, width, value); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Interpolate 2 ARGB images by specified amount (0 to 255). +LIBYUV_API +int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height, int interpolation) { + int y; + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) && + IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + InterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0, + width * 4, interpolation); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Shuffle ARGB channel order. e.g. BGRA to ARGB. +LIBYUV_API +int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + const uint8* shuffler, int width, int height) { + int y; + void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb, + const uint8* shuffler, int pix) = ARGBShuffleRow_C; + if (!src_bgra || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } + // Coalesce rows. + if (src_stride_bgra == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_bgra = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSHUFFLEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBShuffleRow = ARGBShuffleRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBShuffleRow = ARGBShuffleRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBShuffleRow = ARGBShuffleRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGBShuffleRow = ARGBShuffleRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBShuffleRow = ARGBShuffleRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBShuffleRow = ARGBShuffleRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); + src_bgra += src_stride_bgra; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Sobel ARGB effect. 
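+// The shared helper below converts each row to luma (full-range Y via
+// ARGBToYJRow), keeps a 3-row window, and feeds it to the Sobel row
+// kernels. For reference, the classic 3x3 operators are
+//   Gx = [-1 0 1; -2 0 2; -1 0 1]   and   Gy = transpose(Gx),
+// and the row functions approximate gradient magnitude as |gx| + |gy|,
+// clamped to 255.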
+static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + void (*SobelRow)(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst, int width)) { + int y; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int pix) = + ARGBToYJRow_C; + void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) = SobelYRow_C; + void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobely, int width) = + SobelXRow_C; + const int kEdge = 16; // Extra pixels at start of row for extrude/align. + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif + +#if defined(HAS_SOBELYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelYRow = SobelYRow_SSE2; + } +#endif +#if defined(HAS_SOBELYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelYRow = SobelYRow_NEON; + } +#endif +#if defined(HAS_SOBELXROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXRow = SobelXRow_SSE2; + } +#endif +#if defined(HAS_SOBELXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelXRow = SobelXRow_NEON; + } +#endif + { + // 3 rows with edges before/after. + const int kRowSize = (width + kEdge + 31) & ~31; + align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); + uint8* row_sobelx = rows; + uint8* row_sobely = rows + kRowSize; + uint8* row_y = rows + kRowSize * 2; + + // Convert first row. + uint8* row_y0 = row_y + kEdge; + uint8* row_y1 = row_y0 + kRowSize; + uint8* row_y2 = row_y1 + kRowSize; + ARGBToYJRow(src_argb, row_y0, width); + row_y0[-1] = row_y0[0]; + memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. + ARGBToYJRow(src_argb, row_y1, width); + row_y1[-1] = row_y1[0]; + memset(row_y1 + width, row_y1[width - 1], 16); + memset(row_y2 + width, 0, 16); + + for (y = 0; y < height; ++y) { + // Convert next row of ARGB to G. + if (y < (height - 1)) { + src_argb += src_stride_argb; + } + ARGBToYJRow(src_argb, row_y2, width); + row_y2[-1] = row_y2[0]; + row_y2[width] = row_y2[width - 1]; + + SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width); + SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width); + SobelRow(row_sobelx, row_sobely, dst_argb, width); + + // Cycle thru circular queue of 3 row_y buffers. + { + uint8* row_yt = row_y0; + row_y0 = row_y1; + row_y1 = row_y2; + row_y2 = row_yt; + } + + dst_argb += dst_stride_argb; + } + free_aligned_buffer_64(rows); + } + return 0; +} + +// Sobel ARGB effect. 
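+// ARGBSobelize takes the final row blender as a callback, so the three
+// public variants below (ARGBSobel, ARGBSobelToPlane, ARGBSobelXY) share
+// the luma conversion and row windowing and differ only in how the X and
+// Y gradients are packed into the output.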
+LIBYUV_API +int ARGBSobel(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) = SobelRow_C; +#if defined(HAS_SOBELROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelRow = SobelRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelRow = SobelRow_SSE2; + } + } +#endif +#if defined(HAS_SOBELROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelRow = SobelRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + SobelRow = SobelRow_NEON; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelRow); +} + +// Sobel ARGB effect with planar output. +LIBYUV_API +int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height) { + void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_, int width) = SobelToPlaneRow_C; +#if defined(HAS_SOBELTOPLANEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelToPlaneRow = SobelToPlaneRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelToPlaneRow = SobelToPlaneRow_SSE2; + } + } +#endif +#if defined(HAS_SOBELTOPLANEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelToPlaneRow = SobelToPlaneRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SobelToPlaneRow = SobelToPlaneRow_NEON; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, + width, height, SobelToPlaneRow); +} + +// SobelXY ARGB effect. +// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. +LIBYUV_API +int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) = SobelXYRow_C; +#if defined(HAS_SOBELXYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXYRow = SobelXYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelXYRow = SobelXYRow_SSE2; + } + } +#endif +#if defined(HAS_SOBELXYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelXYRow = SobelXYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + SobelXYRow = SobelXYRow_NEON; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelXYRow); +} + +// Apply a 4x4 polynomial to each ARGB pixel. +LIBYUV_API +int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const float* poly, + int width, int height) { + int y; + void (*ARGBPolynomialRow)(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) = ARGBPolynomialRow_C; + if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBPOLYNOMIALROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_SSE2; + } +#endif +#if defined(HAS_ARGBPOLYNOMIALROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) && + IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_AVX2; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBPolynomialRow(src_argb, dst_argb, poly, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Apply a lumacolortable to each ARGB pixel. +LIBYUV_API +int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const uint8* luma, + int width, int height) { + int y; + void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb, + int width, const uint8* luma, const uint32 lumacoeff) = + ARGBLumaColorTableRow_C; + if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { + ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Copy Alpha from one ARGB image to another. +LIBYUV_API +int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) = + ARGBCopyAlphaRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOPYALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; + } +#endif +#if defined(HAS_ARGBCOPYALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBCopyAlphaRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Copy a planar Y channel to the alpha channel of a destination ARGB image. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) = + ARGBCopyYToAlphaRow_C; + if (!src_y || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; + } +#endif +#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBCopyYToAlphaRow(src_y, dst_argb, width); + src_y += src_stride_y; + dst_argb += dst_stride_argb; + } + return 0; +} + +LIBYUV_API +int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = + SplitUVRow_C; + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + if (!src_yuy2 || + !dst_y || !dst_uv || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif + + { + int awidth = halfwidth * 2; + // 2 rows of uv + align_buffer_64(rows, awidth * 2); + + for (y = 0; y < height - 1; y += 2) { + // Split Y from UV. + SplitUVRow(src_yuy2, dst_y, rows, awidth); + SplitUVRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, + rows + awidth, awidth); + InterpolateRow(dst_uv, rows, awidth, awidth, 128); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + // Split Y from UV. 
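+      // The loop above downsamples 4:2:2 chroma to 4:2:0 by averaging
+      // each pair of UV rows (InterpolateRow with fraction 128 is a
+      // 50/50 blend). For an odd height the last row has no partner, so
+      // its UV samples are written out directly.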
+      SplitUVRow(src_yuy2, dst_y, dst_uv, width);
+    }
+    free_aligned_buffer_64(rows);
+  }
+  return 0;
+}
+
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+      SplitUVRow_C;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  if (!src_uyvy ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitUVRow = SplitUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+
+  {
+    int awidth = halfwidth * 2;
+    // 2 rows of uv
+    align_buffer_64(rows, awidth * 2);
+
+    for (y = 0; y < height - 1; y += 2) {
+      // Split Y from UV.
+      SplitUVRow(src_uyvy, rows, dst_y, awidth);
+      SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth,
+                 dst_y + dst_stride_y, awidth);
+      InterpolateRow(dst_uv, rows, awidth, awidth, 128);
+      src_uyvy += src_stride_uyvy * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      // Split UV from Y. UYVY stores chroma first, as in the loop above.
+      SplitUVRow(src_uyvy, dst_uv, dst_y, width);
+    }
+    free_aligned_buffer_64(rows);
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/rotate.cc b/libs/libaom/src/third_party/libyuv/source/rotate.cc
new file mode 100644
index 000000000..be3d58920
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/rotate.cc
@@ -0,0 +1,496 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include "libyuv/rotate.h" + +#include "libyuv/cpu_id.h" +#include "libyuv/convert.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +LIBYUV_API +void TransposePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + int i = height; + void (*TransposeWx8)(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) = TransposeWx8_C; +#if defined(HAS_TRANSPOSEWX8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + TransposeWx8 = TransposeWx8_NEON; + } +#endif +#if defined(HAS_TRANSPOSEWX8_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + TransposeWx8 = TransposeWx8_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + TransposeWx8 = TransposeWx8_SSSE3; + } + } +#endif +#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + TransposeWx8 = TransposeWx8_Fast_SSSE3; + } + } +#endif +#if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { + if (IS_ALIGNED(width, 4) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2; + } else { + TransposeWx8 = TransposeWx8_MIPS_DSPR2; + } + } +#endif + + // Work across the source in 8x8 tiles + while (i >= 8) { + TransposeWx8(src, src_stride, dst, dst_stride, width); + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. + i -= 8; + } + + if (i > 0) { + TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); + } +} + +LIBYUV_API +void RotatePlane90(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Rotate by 90 is a transpose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src += src_stride * (height - 1); + src_stride = -src_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane270(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Rotate by 270 is a transpose with the destination written + // from bottom to top. So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. + dst += dst_stride * (width - 1); + dst_stride = -dst_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane180(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Swap first and last row and mirror the content. Uses a temporary row. 
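+  // The loop below walks a top pointer and a bottom pointer toward the
+  // middle: mirror the top row into the scratch buffer, mirror the bottom
+  // row into the top slot, then copy the scratch row into the bottom slot.
+  // Routing the top row through the scratch buffer keeps the swap correct
+  // even if src and dst refer to the same plane.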
+ align_buffer_64(row, width); + const uint8* src_bot = src + src_stride * (height - 1); + uint8* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MirrorRow = MirrorRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_NEON; + } + } +#endif +#if defined(HAS_MIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MirrorRow = MirrorRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSE2; + } + } +#endif +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorRow = MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSSE3; + } + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorRow = MirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } + } +#endif +// TODO(fbarchard): Mirror on mips handle unaligned memory. +#if defined(HAS_MIRRORROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) { + MirrorRow = MirrorRow_MIPS_DSPR2; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Odd height will harmlessly mirror the middle row twice. + for (y = 0; y < half_height; ++y) { + MirrorRow(src, row, width); // Mirror first row into a buffer + src += src_stride; + MirrorRow(src_bot, dst, width); // Mirror last row into first row + dst += dst_stride; + CopyRow(row, dst_bot, width); // Copy first mirrored row into last + src_bot -= src_stride; + dst_bot -= dst_stride; + } + free_aligned_buffer_64(row); +} + +LIBYUV_API +void TransposeUV(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i = height; + void (*TransposeUVWx8)(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) = TransposeUVWx8_C; +#if defined(HAS_TRANSPOSEUVWX8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + TransposeUVWx8 = TransposeUVWx8_NEON; + } +#endif +#if defined(HAS_TRANSPOSEUVWX8_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { + TransposeUVWx8 = TransposeUVWx8_SSE2; + } +#endif +#if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; + } +#endif + + // Work through the source in 8x8 tiles. + while (i >= 8) { + TransposeUVWx8(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width); + src += 8 * src_stride; // Go down 8 rows. + dst_a += 8; // Move over 8 columns. + dst_b += 8; // Move over 8 columns. 
+ i -= 8; + } + + if (i > 0) { + TransposeUVWxH_C(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, i); + } +} + +LIBYUV_API +void RotateUV90(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + src += src_stride * (height - 1); + src_stride = -src_stride; + + TransposeUV(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, height); +} + +LIBYUV_API +void RotateUV270(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + dst_a += dst_stride_a * (width - 1); + dst_b += dst_stride_b * (width - 1); + dst_stride_a = -dst_stride_a; + dst_stride_b = -dst_stride_b; + + TransposeUV(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, height); +} + +// Rotate 180 is a horizontal and vertical flip. +LIBYUV_API +void RotateUV180(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i; + void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) = + MirrorUVRow_C; +#if defined(HAS_MIRRORUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + MirrorRowUV = MirrorUVRow_NEON; + } +#endif +#if defined(HAS_MIRRORROW_UV_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { + MirrorRowUV = MirrorUVRow_SSSE3; + } +#endif +#if defined(HAS_MIRRORUVROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + MirrorRowUV = MirrorUVRow_MIPS_DSPR2; + } +#endif + + dst_a += dst_stride_a * (height - 1); + dst_b += dst_stride_b * (height - 1); + + for (i = 0; i < height; ++i) { + MirrorRowUV(src, dst_a, dst_b, width); + src += src_stride; + dst_a -= dst_stride_a; + dst_b -= dst_stride_b; + } +} + +LIBYUV_API +int RotatePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height, + enum RotationMode mode) { + if (!src || width <= 0 || height == 0 || !dst) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src = src + (height - 1) * src_stride; + src_stride = -src_stride; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane(src, src_stride, + dst, dst_stride, + width, height); + return 0; + case kRotate90: + RotatePlane90(src, src_stride, + dst, dst_stride, + width, height); + return 0; + case kRotate270: + RotatePlane270(src, src_stride, + dst, dst_stride, + width, height); + return 0; + case kRotate180: + RotatePlane180(src, src_stride, + dst, dst_stride, + width, height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int I420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || + !dst_y || !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. 
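+  // (For example, height = -480 becomes height = 480 with each source
+  // pointer advanced to its last row and each source stride negated, so the
+  // bottom-up planes are read top-down. halfheight is recomputed from the
+  // now-positive height for the 2x2-subsampled chroma planes.)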
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + return I420Copy(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane90(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane90(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane270(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane270(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane180(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane180(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_uv || width <= 0 || height == 0 || + !dst_y || !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_uv = src_uv + (halfheight - 1) * src_stride_uv; + src_stride_y = -src_stride_y; + src_stride_uv = -src_stride_uv; + } + + switch (mode) { + case kRotate0: + // copy frame + return NV12ToI420(src_y, src_stride_y, + src_uv, src_stride_uv, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotateUV90(src_uv, src_stride_uv, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotateUV270(src_uv, src_stride_uv, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotateUV180(src_uv, src_stride_uv, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/rotate_any.cc b/libs/libaom/src/third_party/libyuv/source/rotate_any.cc new file mode 100644 index 000000000..4d6eb34e1 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/rotate_any.cc @@ -0,0 +1,55 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate.h" +#include "libyuv/rotate_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK) \ + void NAMEANY(const uint8* src, int src_stride, \ + uint8* dst, int dst_stride, int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ + } \ + TPOS_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ + } + +#ifdef HAS_TRANSPOSEWX8_NEON +TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7) +#endif +#ifdef HAS_TRANSPOSEWX8_SSSE3 +TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7) +#endif +#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 +TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15) +#endif +#ifdef HAS_TRANSPOSEWX8_MIPS_DSPR2 +TANY(TransposeWx8_Any_MIPS_DSPR2, TransposeWx8_MIPS_DSPR2, TransposeWx8_C, 7) +#endif + +#undef TANY + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + + + + + diff --git a/libs/libaom/src/third_party/libyuv/source/rotate_argb.cc b/libs/libaom/src/third_party/libyuv/source/rotate_argb.cc new file mode 100644 index 000000000..787c0ad1b --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/rotate_argb.cc @@ -0,0 +1,205 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate.h" + +#include "libyuv/cpu_id.h" +#include "libyuv/convert.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// ARGBScale has a function to copy pixels to a row, striding each source +// pixel by a constant. 
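+// As a scalar sketch of that row function's contract (the C fallback
+// declared below is assumed to behave this way), copying one source column
+// is an "even downscale" with a step of src_stride / 4 pixels:
+//
+//   for (int j = 0; j < dst_width; ++j) {
+//     ((uint32*)dst_ptr)[j] = ((const uint32*)src_ptr)[j * src_stepx];
+//   }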
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || \
+    (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
+                               int src_stepx, uint8* dst_ptr, int dst_width);
+#endif
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
+                               int src_stepx, uint8* dst_ptr, int dst_width);
+#endif
+
+void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
+                            int src_stepx, uint8* dst_ptr, int dst_width);
+
+static void ARGBTranspose(const uint8* src, int src_stride,
+                          uint8* dst, int dst_stride, int width, int height) {
+  int i;
+  int src_pixel_step = src_stride >> 2;
+  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
+      int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) {  // Width of dest.
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) {  // Width of dest.
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+  }
+#endif
+
+  for (i = 0; i < width; ++i) {  // column of source to row of dest.
+    ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
+    dst += dst_stride;
+    src += 4;
+  }
+}
+
+void ARGBRotate90(const uint8* src, int src_stride,
+                  uint8* dst, int dst_stride, int width, int height) {
+  // Rotate by 90 is an ARGBTranspose with the source read
+  // from bottom to top. So set the source pointer to the end
+  // of the buffer and flip the sign of the source stride.
+  src += src_stride * (height - 1);
+  src_stride = -src_stride;
+  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate270(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride, int width, int height) {
+  // Rotate by 270 is an ARGBTranspose with the destination written
+  // from bottom to top. So set the destination pointer to the end
+  // of the buffer and flip the sign of the destination stride.
+  dst += dst_stride * (width - 1);
+  dst_stride = -dst_stride;
+  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate180(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride, int width, int height) {
+  // Swap first and last row and mirror the content. Uses a temporary row.
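+  // Same top/bottom swap as RotatePlane180, but with 4 bytes per pixel:
+  // ARGBMirrorRow counts pixels while the scratch buffer and the CopyRow
+  // width are in bytes (width * 4).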
+ align_buffer_64(row, width * 4); + const uint8* src_bot = src + src_stride * (height - 1); + uint8* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = + ARGBMirrorRow_C; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_ARGBMIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBMirrorRow = ARGBMirrorRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_NEON; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBMirrorRow = ARGBMirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_AVX2; + } + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Odd height will harmlessly mirror the middle row twice. + for (y = 0; y < half_height; ++y) { + ARGBMirrorRow(src, row, width); // Mirror first row into a buffer + ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row + CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last + src += src_stride; + dst += dst_stride; + src_bot -= src_stride; + dst_bot -= dst_stride; + } + free_aligned_buffer_64(row); +} + +LIBYUV_API +int ARGBRotate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, int width, int height, + enum RotationMode mode) { + if (!src_argb || width <= 0 || height == 0 || !dst_argb) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + switch (mode) { + case kRotate0: + // copy frame + return ARGBCopy(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + case kRotate90: + ARGBRotate90(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + case kRotate270: + ARGBRotate270(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + case kRotate180: + ARGBRotate180(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/rotate_common.cc b/libs/libaom/src/third_party/libyuv/source/rotate_common.cc new file mode 100644 index 000000000..b33a9a0c6 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/rotate_common.cc @@ -0,0 +1,92 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/rotate_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +void TransposeWx8_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + int i; + for (i = 0; i < width; ++i) { + dst[0] = src[0 * src_stride]; + dst[1] = src[1 * src_stride]; + dst[2] = src[2 * src_stride]; + dst[3] = src[3 * src_stride]; + dst[4] = src[4 * src_stride]; + dst[5] = src[5 * src_stride]; + dst[6] = src[6 * src_stride]; + dst[7] = src[7 * src_stride]; + ++src; + dst += dst_stride; + } +} + +void TransposeUVWx8_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width) { + int i; + for (i = 0; i < width; ++i) { + dst_a[0] = src[0 * src_stride + 0]; + dst_b[0] = src[0 * src_stride + 1]; + dst_a[1] = src[1 * src_stride + 0]; + dst_b[1] = src[1 * src_stride + 1]; + dst_a[2] = src[2 * src_stride + 0]; + dst_b[2] = src[2 * src_stride + 1]; + dst_a[3] = src[3 * src_stride + 0]; + dst_b[3] = src[3 * src_stride + 1]; + dst_a[4] = src[4 * src_stride + 0]; + dst_b[4] = src[4 * src_stride + 1]; + dst_a[5] = src[5 * src_stride + 0]; + dst_b[5] = src[5 * src_stride + 1]; + dst_a[6] = src[6 * src_stride + 0]; + dst_b[6] = src[6 * src_stride + 1]; + dst_a[7] = src[7 * src_stride + 0]; + dst_b[7] = src[7 * src_stride + 1]; + src += 2; + dst_a += dst_stride_a; + dst_b += dst_stride_b; + } +} + +void TransposeWxH_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + int i; + for (i = 0; i < width; ++i) { + int j; + for (j = 0; j < height; ++j) { + dst[i * dst_stride + j] = src[j * src_stride + i]; + } + } +} + +void TransposeUVWxH_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i; + for (i = 0; i < width * 2; i += 2) { + int j; + for (j = 0; j < height; ++j) { + dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; + dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; + } + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/rotate_gcc.cc b/libs/libaom/src/third_party/libyuv/source/rotate_gcc.cc new file mode 100644 index 000000000..fd385bcd3 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/rotate_gcc.cc @@ -0,0 +1,493 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/rotate_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. 
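+// The transpose kernels below work on 8x8 byte tiles in three rounds of
+// interleaves: punpcklbw pairs bytes from adjacent rows, punpcklwd pairs the
+// resulting 16-bit units, and punpckldq pairs 32-bit units, doubling the run
+// of consecutive output bytes each round until whole transposed rows emerge;
+// palignr peels the upper half out of each register between rounds.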
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) +void TransposeWx8_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. + ".p2align 2 \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__) +void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); + asm ( + DECLARE_FUNCTION(TransposeUVWx8_SSE2) + "push %ebx \n" + "push %esi \n" + "push %edi \n" + "push %ebp \n" + "mov 0x14(%esp),%eax \n" + "mov 0x18(%esp),%edi \n" + "mov 0x1c(%esp),%edx \n" + "mov 0x20(%esp),%esi \n" + "mov 0x24(%esp),%ebx \n" + "mov 0x28(%esp),%ebp \n" + "mov %esp,%ecx \n" + "sub $0x14,%esp \n" + "and $0xfffffff0,%esp \n" + "mov %ecx,0x10(%esp) \n" + "mov 0x2c(%ecx),%ecx \n" + +"1: \n" + "movdqu (%eax),%xmm0 \n" + "movdqu (%eax,%edi,1),%xmm1 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm0,%xmm7 \n" + "punpcklbw %xmm1,%xmm0 \n" + "punpckhbw %xmm1,%xmm7 \n" + "movdqa %xmm7,%xmm1 \n" + "movdqu (%eax),%xmm2 \n" + "movdqu (%eax,%edi,1),%xmm3 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm2,%xmm7 \n" + "punpcklbw %xmm3,%xmm2 \n" + "punpckhbw 
%xmm3,%xmm7 \n" + "movdqa %xmm7,%xmm3 \n" + "movdqu (%eax),%xmm4 \n" + "movdqu (%eax,%edi,1),%xmm5 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm4,%xmm7 \n" + "punpcklbw %xmm5,%xmm4 \n" + "punpckhbw %xmm5,%xmm7 \n" + "movdqa %xmm7,%xmm5 \n" + "movdqu (%eax),%xmm6 \n" + "movdqu (%eax,%edi,1),%xmm7 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqu %xmm5,(%esp) \n" + "neg %edi \n" + "movdqa %xmm6,%xmm5 \n" + "punpcklbw %xmm7,%xmm6 \n" + "punpckhbw %xmm7,%xmm5 \n" + "movdqa %xmm5,%xmm7 \n" + "lea 0x10(%eax,%edi,8),%eax \n" + "neg %edi \n" + "movdqa %xmm0,%xmm5 \n" + "punpcklwd %xmm2,%xmm0 \n" + "punpckhwd %xmm2,%xmm5 \n" + "movdqa %xmm5,%xmm2 \n" + "movdqa %xmm1,%xmm5 \n" + "punpcklwd %xmm3,%xmm1 \n" + "punpckhwd %xmm3,%xmm5 \n" + "movdqa %xmm5,%xmm3 \n" + "movdqa %xmm4,%xmm5 \n" + "punpcklwd %xmm6,%xmm4 \n" + "punpckhwd %xmm6,%xmm5 \n" + "movdqa %xmm5,%xmm6 \n" + "movdqu (%esp),%xmm5 \n" + "movdqu %xmm6,(%esp) \n" + "movdqa %xmm5,%xmm6 \n" + "punpcklwd %xmm7,%xmm5 \n" + "punpckhwd %xmm7,%xmm6 \n" + "movdqa %xmm6,%xmm7 \n" + "movdqa %xmm0,%xmm6 \n" + "punpckldq %xmm4,%xmm0 \n" + "punpckhdq %xmm4,%xmm6 \n" + "movdqa %xmm6,%xmm4 \n" + "movdqu (%esp),%xmm6 \n" + "movlpd %xmm0,(%edx) \n" + "movhpd %xmm0,(%ebx) \n" + "movlpd %xmm4,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm4,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm2,%xmm0 \n" + "punpckldq %xmm6,%xmm2 \n" + "movlpd %xmm2,(%edx) \n" + "movhpd %xmm2,(%ebx) \n" + "punpckhdq %xmm6,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm1,%xmm0 \n" + "punpckldq %xmm5,%xmm1 \n" + "movlpd %xmm1,(%edx) \n" + "movhpd %xmm1,(%ebx) \n" + "punpckhdq %xmm5,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm3,%xmm0 \n" + "punpckldq %xmm7,%xmm3 \n" + "movlpd %xmm3,(%edx) \n" + "movhpd %xmm3,(%ebx) \n" + "punpckhdq %xmm7,%xmm0 \n" + "sub $0x8,%ecx \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "jg 1b \n" + "mov 0x10(%esp),%esp \n" + "pop %ebp \n" + "pop %edi \n" + "pop %esi \n" + "pop %ebx \n" +#if defined(__native_client__) + "pop %ecx \n" + "and $0xffffffe0,%ecx \n" + "jmp *%ecx \n" +#else + "ret \n" +#endif +); +#endif +#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ + defined(__x86_64__) +// 64 bit version has enough registers to do 16x8 to 8x16 at a time. +void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. 
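+    // With xmm8-xmm15 available on x86-64, each 16-byte load keeps both its
+    // punpcklbw half (xmm0-xmm7) and its punpckhbw half (xmm8-xmm15), so one
+    // pass turns a 16-wide by 8-tall tile into two 8x8 transposed outputs.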
+ ".p2align 2 \n" +"1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqu (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqu (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqu (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqu (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqu (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqu (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" +); +} + +void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. + ".p2align 2 \n" +"1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" + // Second round of bit swap. + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride)), // %4 + "r"((intptr_t)(dst_stride_a)), // %5 + "r"((intptr_t)(dst_stride_b)) // %6 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9" +); +} +#endif +#endif + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/rotate_mips.cc b/libs/libaom/src/third_party/libyuv/source/rotate_mips.cc new file mode 100644 index 000000000..efe6bd909 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/rotate_mips.cc @@ -0,0 +1,484 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/row.h" +#include "libyuv/rotate_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_MIPS) && \ + defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) + +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + "andi $t0, %[dst], 0x3 \n" + "andi $t1, %[dst_stride], 0x3 \n" + "or $t0, $t0, $t1 \n" + "bnez $t0, 11f \n" + " subu $t7, $t9, %[src_stride] \n" +//dst + dst_stride word aligned + "1: \n" + "lbu $t0, 0(%[src]) \n" + "lbux $t1, %[src_stride](%[src]) \n" + "lbux $t8, $t2(%[src]) \n" + "lbux $t9, $t3(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s0, $t8, $t0 \n" + "lbux $t0, $t4(%[src]) \n" + "lbux $t1, $t5(%[src]) \n" + "lbux $t8, $t6(%[src]) \n" + "lbux $t9, $t7(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s1, $t8, $t0 \n" + "sw $s0, 0(%[dst]) \n" + "addiu %[width], -1 \n" + "addiu %[src], 1 \n" + "sw $s1, 4(%[dst]) \n" + "bnez %[width], 1b \n" + " addu %[dst], %[dst], %[dst_stride] \n" + "b 2f \n" +//dst + dst_stride unaligned + "11: \n" + "lbu $t0, 0(%[src]) \n" + "lbux $t1, %[src_stride](%[src]) \n" + "lbux $t8, $t2(%[src]) \n" + "lbux $t9, $t3(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s0, $t8, $t0 \n" + "lbux $t0, $t4(%[src]) \n" + "lbux $t1, $t5(%[src]) \n" + "lbux $t8, $t6(%[src]) \n" + "lbux $t9, $t7(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s1, $t8, $t0 \n" + "swr $s0, 0(%[dst]) \n" + "swl $s0, 3(%[dst]) \n" + "addiu %[width], -1 \n" + "addiu %[src], 1 \n" + "swr $s1, 4(%[dst]) \n" + "swl $s1, 7(%[dst]) \n" + "bnez %[width], 11b \n" + "addu %[dst], %[dst], %[dst_stride] \n" + "2: \n" + ".set pop \n" + :[src] "+r" (src), + [dst] "+r" (dst), + [width] "+r" (width) + :[src_stride] "r" (src_stride), + [dst_stride] "r" (dst_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1" + ); +} + +void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + __asm__ __volatile__ ( + ".set noat \n" + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + + "srl $AT, %[width], 0x2 \n" + "andi $t0, %[dst], 0x3 \n" + "andi $t1, %[dst_stride], 0x3 \n" + "or $t0, $t0, $t1 \n" + "bnez $t0, 11f \n" + " subu $t7, $t9, %[src_stride] \n" +//dst + dst_stride word aligned + "1: \n" + "lw $t0, 0(%[src]) \n" + "lwx $t1, %[src_stride](%[src]) \n" + "lwx $t8, $t2(%[src]) \n" + "lwx $t9, $t3(%[src]) \n" + +// t0 = | 30 | 20 | 10 | 00 | +// t1 = | 31 | 21 | 11 | 01 | +// t8 = | 32 | 22 | 12 | 02 | +// t9 = | 33 | 23 | 13 | 03 | + + "precr.qb.ph $s0, 
$t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 21 | 01 | 20 | 00 | + // s1 = | 23 | 03 | 22 | 02 | + // s2 = | 31 | 11 | 30 | 10 | + // s3 = | 33 | 13 | 32 | 12 | + + "precr.qb.ph $s4, $s1, $s0 \n" + "precrq.qb.ph $s5, $s1, $s0 \n" + "precr.qb.ph $s6, $s3, $s2 \n" + "precrq.qb.ph $s7, $s3, $s2 \n" + + // s4 = | 03 | 02 | 01 | 00 | + // s5 = | 23 | 22 | 21 | 20 | + // s6 = | 13 | 12 | 11 | 10 | + // s7 = | 33 | 32 | 31 | 30 | + + "lwx $t0, $t4(%[src]) \n" + "lwx $t1, $t5(%[src]) \n" + "lwx $t8, $t6(%[src]) \n" + "lwx $t9, $t7(%[src]) \n" + +// t0 = | 34 | 24 | 14 | 04 | +// t1 = | 35 | 25 | 15 | 05 | +// t8 = | 36 | 26 | 16 | 06 | +// t9 = | 37 | 27 | 17 | 07 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 25 | 05 | 24 | 04 | + // s1 = | 27 | 07 | 26 | 06 | + // s2 = | 35 | 15 | 34 | 14 | + // s3 = | 37 | 17 | 36 | 16 | + + "precr.qb.ph $t0, $s1, $s0 \n" + "precrq.qb.ph $t1, $s1, $s0 \n" + "precr.qb.ph $t8, $s3, $s2 \n" + "precrq.qb.ph $t9, $s3, $s2 \n" + + // t0 = | 07 | 06 | 05 | 04 | + // t1 = | 27 | 26 | 25 | 24 | + // t8 = | 17 | 16 | 15 | 14 | + // t9 = | 37 | 36 | 35 | 34 | + + "addu $s0, %[dst], %[dst_stride] \n" + "addu $s1, $s0, %[dst_stride] \n" + "addu $s2, $s1, %[dst_stride] \n" + + "sw $s4, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $s6, 0($s0) \n" + "sw $t8, 4($s0) \n" + "sw $s5, 0($s1) \n" + "sw $t1, 4($s1) \n" + "sw $s7, 0($s2) \n" + "sw $t9, 4($s2) \n" + + "addiu $AT, -1 \n" + "addiu %[src], 4 \n" + + "bnez $AT, 1b \n" + " addu %[dst], $s2, %[dst_stride] \n" + "b 2f \n" +//dst + dst_stride unaligned + "11: \n" + "lw $t0, 0(%[src]) \n" + "lwx $t1, %[src_stride](%[src]) \n" + "lwx $t8, $t2(%[src]) \n" + "lwx $t9, $t3(%[src]) \n" + +// t0 = | 30 | 20 | 10 | 00 | +// t1 = | 31 | 21 | 11 | 01 | +// t8 = | 32 | 22 | 12 | 02 | +// t9 = | 33 | 23 | 13 | 03 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 21 | 01 | 20 | 00 | + // s1 = | 23 | 03 | 22 | 02 | + // s2 = | 31 | 11 | 30 | 10 | + // s3 = | 33 | 13 | 32 | 12 | + + "precr.qb.ph $s4, $s1, $s0 \n" + "precrq.qb.ph $s5, $s1, $s0 \n" + "precr.qb.ph $s6, $s3, $s2 \n" + "precrq.qb.ph $s7, $s3, $s2 \n" + + // s4 = | 03 | 02 | 01 | 00 | + // s5 = | 23 | 22 | 21 | 20 | + // s6 = | 13 | 12 | 11 | 10 | + // s7 = | 33 | 32 | 31 | 30 | + + "lwx $t0, $t4(%[src]) \n" + "lwx $t1, $t5(%[src]) \n" + "lwx $t8, $t6(%[src]) \n" + "lwx $t9, $t7(%[src]) \n" + +// t0 = | 34 | 24 | 14 | 04 | +// t1 = | 35 | 25 | 15 | 05 | +// t8 = | 36 | 26 | 16 | 06 | +// t9 = | 37 | 27 | 17 | 07 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 25 | 05 | 24 | 04 | + // s1 = | 27 | 07 | 26 | 06 | + // s2 = | 35 | 15 | 34 | 14 | + // s3 = | 37 | 17 | 36 | 16 | + + "precr.qb.ph $t0, $s1, $s0 \n" + "precrq.qb.ph $t1, $s1, $s0 \n" + "precr.qb.ph $t8, $s3, $s2 \n" + "precrq.qb.ph $t9, $s3, $s2 \n" + + // t0 = | 07 | 06 | 05 | 04 | + // t1 = | 27 | 26 | 25 | 24 | + // t8 = | 17 | 16 | 15 | 14 | + // t9 = | 37 | 36 | 35 | 34 | + + "addu $s0, %[dst], %[dst_stride] \n" + "addu $s1, $s0, %[dst_stride] \n" + "addu $s2, $s1, %[dst_stride] \n" + + "swr $s4, 0(%[dst]) \n" + "swl $s4, 3(%[dst]) \n" + "swr $t0, 4(%[dst]) \n" + "swl $t0, 7(%[dst]) \n" + "swr $s6, 0($s0) \n" + "swl $s6, 3($s0) \n" 
+ "swr $t8, 4($s0) \n" + "swl $t8, 7($s0) \n" + "swr $s5, 0($s1) \n" + "swl $s5, 3($s1) \n" + "swr $t1, 4($s1) \n" + "swl $t1, 7($s1) \n" + "swr $s7, 0($s2) \n" + "swl $s7, 3($s2) \n" + "swr $t9, 4($s2) \n" + "swl $t9, 7($s2) \n" + + "addiu $AT, -1 \n" + "addiu %[src], 4 \n" + + "bnez $AT, 11b \n" + " addu %[dst], $s2, %[dst_stride] \n" + "2: \n" + ".set pop \n" + ".set at \n" + :[src] "+r" (src), + [dst] "+r" (dst), + [width] "+r" (width) + :[src_stride] "r" (src_stride), + [dst_stride] "r" (dst_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7" + ); +} + +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + "subu $t7, $t9, %[src_stride] \n" + "srl $t1, %[width], 1 \n" + +// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b + "andi $t0, %[dst_a], 0x3 \n" + "andi $t8, %[dst_b], 0x3 \n" + "or $t0, $t0, $t8 \n" + "andi $t8, %[dst_stride_a], 0x3 \n" + "andi $s5, %[dst_stride_b], 0x3 \n" + "or $t8, $t8, $s5 \n" + "or $t0, $t0, $t8 \n" + "bnez $t0, 11f \n" + " nop \n" +// dst + dst_stride word aligned (both, a & b dst addresses) + "1: \n" + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| + "addu $s5, %[dst_a], %[dst_stride_a] \n" + "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| + "addu $s6, %[dst_b], %[dst_stride_b] \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| + + "sw $s3, 0($s5) \n" + "sw $s4, 0($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| + + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| + "sw $s3, 0(%[dst_a]) \n" + "sw $s4, 0(%[dst_b]) \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| + "sw $s3, 4($s5) \n" + "sw $s4, 4($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| + + "addiu %[src], 4 \n" + "addiu $t1, -1 \n" + "sll $t0, %[dst_stride_a], 1 \n" + "sll $t8, %[dst_stride_b], 1 \n" + "sw $s3, 4(%[dst_a]) \n" + "sw $s4, 4(%[dst_b]) \n" + "addu %[dst_a], %[dst_a], $t0 \n" + "bnez $t1, 1b \n" + " addu %[dst_b], %[dst_b], $t8 \n" + "b 2f \n" + " nop \n" + +// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned + "11: \n" + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| + 
"addu $s5, %[dst_a], %[dst_stride_a] \n" + "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| + "addu $s6, %[dst_b], %[dst_stride_b] \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| + + "swr $s3, 0($s5) \n" + "swl $s3, 3($s5) \n" + "swr $s4, 0($s6) \n" + "swl $s4, 3($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| + + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| + "swr $s3, 0(%[dst_a]) \n" + "swl $s3, 3(%[dst_a]) \n" + "swr $s4, 0(%[dst_b]) \n" + "swl $s4, 3(%[dst_b]) \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| + + "swr $s3, 4($s5) \n" + "swl $s3, 7($s5) \n" + "swr $s4, 4($s6) \n" + "swl $s4, 7($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| + + "addiu %[src], 4 \n" + "addiu $t1, -1 \n" + "sll $t0, %[dst_stride_a], 1 \n" + "sll $t8, %[dst_stride_b], 1 \n" + "swr $s3, 4(%[dst_a]) \n" + "swl $s3, 7(%[dst_a]) \n" + "swr $s4, 4(%[dst_b]) \n" + "swl $s4, 7(%[dst_b]) \n" + "addu %[dst_a], %[dst_a], $t0 \n" + "bnez $t1, 11b \n" + " addu %[dst_b], %[dst_b], $t8 \n" + + "2: \n" + ".set pop \n" + : [src] "+r" (src), + [dst_a] "+r" (dst_a), + [dst_b] "+r" (dst_b), + [width] "+r" (width), + [src_stride] "+r" (src_stride) + : [dst_stride_a] "r" (dst_stride_a), + [dst_stride_b] "r" (dst_stride_b) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/rotate_neon.cc b/libs/libaom/src/third_party/libyuv/source/rotate_neon.cc new file mode 100644 index 000000000..76043b3b3 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/rotate_neon.cc @@ -0,0 +1,535 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/row.h" +#include "libyuv/rotate_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +static uvec8 kVTbl4x4Transpose = + { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + const uint8* src_temp = NULL; + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %5, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + ".p2align 2 \n" + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "vld1.8 {d0}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d1}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d3}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d4}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d5}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d6}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d7}, [%0] \n" + + "vtrn.8 d1, d0 \n" + "vtrn.8 d3, d2 \n" + "vtrn.8 d5, d4 \n" + "vtrn.8 d7, d6 \n" + + "vtrn.16 d1, d3 \n" + "vtrn.16 d0, d2 \n" + "vtrn.16 d5, d7 \n" + "vtrn.16 d4, d6 \n" + + "vtrn.32 d1, d5 \n" + "vtrn.32 d0, d4 \n" + "vtrn.32 d3, d7 \n" + "vtrn.32 d2, d6 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.8 {d1}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d3}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d2}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d5}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d4}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d7}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d6}, [%0] \n" + + "add %1, #8 \n" // src += 8 + "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride + "subs %5, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %5, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %5, #2 \n" + "blt 3f \n" + + "cmp %5, #4 \n" + "blt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.32 {d0[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d0[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d1[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d1[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d2[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d2[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d3[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d3[1]}, [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(6) + "vld1.8 {q3}, [%6] \n" + + "vtbl.8 d4, {d0, d1}, d6 \n" + "vtbl.8 d5, {d0, d1}, d7 \n" + "vtbl.8 d0, {d2, d3}, d6 \n" + "vtbl.8 d1, {d2, d3}, d7 \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
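+    // kVTbl4x4Transpose gathers input byte (col * 4 + row) into output byte
+    // (row * 4 + col), so each pair of vtbl.8 lookups above transposes one
+    // 4x4 byte tile of the 4x8 residual block.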
+ MEMACCESS(0) + "vst1.32 {d4[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d4[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d5[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d5[1]}, [%0] \n" + + "add %0, %3, #4 \n" + MEMACCESS(0) + "vst1.32 {d0[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d0[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d1[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d1[1]}, [%0] \n" + + "add %1, #4 \n" // src += 4 + "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride + "subs %5, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %5, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.16 {d0[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[3]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.64 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.64 {d1}, [%0] \n" + + "add %1, #2 \n" // src += 2 + "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride + "subs %5, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "vld1.8 {d0[0]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[1]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[2]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[3]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[4]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[5]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[6]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[7]}, [%1] \n" + + MEMACCESS(3) + "vst1.64 {d0}, [%3] \n" + + "4: \n" + + : "+r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst), // %3 + "+r"(dst_stride), // %4 + "+r"(width) // %5 + : "r"(&kVTbl4x4Transpose) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3" + ); +} + +static uvec8 kVTbl4x4TransposeDi = + { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; + +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + const uint8* src_temp = NULL; + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %7, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + ".p2align 2 \n" + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "vld2.8 {d0, d1}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d2, d3}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d4, d5}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d6, d7}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d16, d17}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d18, d19}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d20, d21}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d22, d23}, [%0] \n" + + "vtrn.8 q1, q0 \n" + "vtrn.8 q3, q2 \n" + "vtrn.8 q9, q8 \n" + "vtrn.8 q11, q10 \n" + + "vtrn.16 q1, q3 \n" + "vtrn.16 q0, q2 \n" + "vtrn.16 q9, q11 \n" + "vtrn.16 q8, q10 \n" + + "vtrn.32 q1, q9 \n" + "vtrn.32 q0, q8 \n" + "vtrn.32 q3, q11 \n" + "vtrn.32 q2, q10 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + "vrev16.8 q8, q8 \n" + "vrev16.8 q9, q9 \n" + "vrev16.8 q10, q10 \n" + "vrev16.8 q11, q11 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.8 {d2}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d6}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d4}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d18}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d16}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d22}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d20}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.8 {d3}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d1}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d7}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d5}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d19}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d17}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d23}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d21}, [%0] \n" + + "add %1, #8*2 \n" // src += 8*2 + "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a + "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b + "subs %7, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %7, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %7, #2 \n" + "blt 3f \n" + + "cmp %7, #4 \n" + "blt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.64 {d0}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d1}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d2}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d3}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d4}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d5}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d6}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d7}, [%0] \n" + + MEMACCESS(8) + "vld1.8 {q15}, [%8] \n" + + "vtrn.8 q0, q1 \n" + "vtrn.8 q2, q3 \n" + + "vtbl.8 d16, {d0, d1}, d30 \n" + "vtbl.8 d17, {d0, d1}, d31 \n" + "vtbl.8 d18, {d2, d3}, d30 \n" + "vtbl.8 d19, {d2, d3}, d31 \n" + "vtbl.8 d20, {d4, d5}, d30 \n" + "vtbl.8 d21, {d4, d5}, d31 \n" + "vtbl.8 d22, {d6, d7}, d30 \n" + "vtbl.8 d23, {d6, d7}, d31 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.32 {d16[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d16[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d17[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d17[1]}, [%0], %4 \n" + + "add %0, %3, #4 \n" + MEMACCESS(0) + "vst1.32 {d20[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d20[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d21[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d21[1]}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.32 {d18[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d18[1]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d19[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d19[1]}, [%0], %6 \n" + + "add %0, %5, #4 \n" + MEMACCESS(0) + "vst1.32 {d22[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d22[1]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d23[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d23[1]}, [%0] \n" + + "add %1, #4*2 \n" // src += 4 * 2 + "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a + "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b + "subs %7, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %7, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[3], d3[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + "vtrn.8 d2, d3 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.64 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.64 {d2}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.64 {d1}, [%0], %6 \n" + MEMACCESS(0) + "vst1.64 {d3}, [%0] \n" + + "add %1, #2*2 \n" // src += 2 * 2 + "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a + "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b + "subs %7, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[7], d1[7]}, [%1] \n" + + MEMACCESS(3) + "vst1.64 {d0}, [%3] 
\n" + MEMACCESS(5) + "vst1.64 {d1}, [%5] \n" + + "4: \n" + + : "+r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst_a), // %3 + "+r"(dst_stride_a), // %4 + "+r"(dst_b), // %5 + "+r"(dst_stride_b), // %6 + "+r"(width) // %7 + : "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", + "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/rotate_neon64.cc b/libs/libaom/src/third_party/libyuv/source/rotate_neon64.cc new file mode 100644 index 000000000..f52c082b3 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/rotate_neon64.cc @@ -0,0 +1,543 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/rotate_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +static uvec8 kVTbl4x4Transpose = + { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + const uint8* src_temp = NULL; + int64 width64 = (int64) width; // Work around clang 3.4 warning. + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %3, %3, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "ld1 {v0.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v3.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v4.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v5.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v6.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v7.8b}, [%0] \n" + + "trn2 v16.8b, v0.8b, v1.8b \n" + "trn1 v17.8b, v0.8b, v1.8b \n" + "trn2 v18.8b, v2.8b, v3.8b \n" + "trn1 v19.8b, v2.8b, v3.8b \n" + "trn2 v20.8b, v4.8b, v5.8b \n" + "trn1 v21.8b, v4.8b, v5.8b \n" + "trn2 v22.8b, v6.8b, v7.8b \n" + "trn1 v23.8b, v6.8b, v7.8b \n" + + "trn2 v3.4h, v17.4h, v19.4h \n" + "trn1 v1.4h, v17.4h, v19.4h \n" + "trn2 v2.4h, v16.4h, v18.4h \n" + "trn1 v0.4h, v16.4h, v18.4h \n" + "trn2 v7.4h, v21.4h, v23.4h \n" + "trn1 v5.4h, v21.4h, v23.4h \n" + "trn2 v6.4h, v20.4h, v22.4h \n" + "trn1 v4.4h, v20.4h, v22.4h \n" + + "trn2 v21.2s, v1.2s, v5.2s \n" + "trn1 v17.2s, v1.2s, v5.2s \n" + "trn2 v20.2s, v0.2s, v4.2s \n" + "trn1 v16.2s, v0.2s, v4.2s \n" + "trn2 v23.2s, v3.2s, v7.2s \n" + "trn1 v19.2s, v3.2s, v7.2s \n" + "trn2 v22.2s, v2.2s, v6.2s \n" + "trn1 v18.2s, v2.2s, v6.2s \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v17.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v19.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v21.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v20.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v23.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v22.8b}, [%0] \n" + + "add %1, %1, #8 \n" // src += 8 + "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride + "subs %3, %3, #8 \n" // w -= 8 + "b.ge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %3, %3, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %3, #2 \n" + "b.lt 3f \n" + + "cmp %3, #4 \n" + "b.lt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "ld1 {v0.s}[0], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.s}[1], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.s}[2], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.s}[3], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.s}[0], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.s}[1], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.s}[2], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.s}[3], [%0] \n" + + "mov %0, %2 \n" + + MEMACCESS(4) + "ld1 {v2.16b}, [%4] \n" + + "tbl v3.16b, {v0.16b}, v2.16b \n" + "tbl v0.16b, {v1.16b}, v2.16b \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
+    MEMACCESS(0)
+    "st1 {v3.s}[0], [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[1], [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[2], [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[3], [%0]                       \n"
+
+    "add         %0, %2, #4                    \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[0], [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[1], [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[2], [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[3], [%0]                       \n"
+
+    "add         %1, %1, #4                    \n"  // src += 4
+    "add         %2, %2, %6, lsl #2            \n"  // dst += 4 * dst_stride
+    "subs        %3, %3, #4                    \n"  // w   -= 4
+    "b.eq        4f                            \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp         %3, #2                        \n"
+    "b.lt        3f                            \n"
+
+    // 2x8 block
+    "2:                                        \n"
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[0], [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[0], [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[1], [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[1], [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[2], [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[2], [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[3], [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[3], [%0]                   \n"
+
+    "trn2    v2.8b, v0.8b, v1.8b               \n"
+    "trn1    v3.8b, v0.8b, v1.8b               \n"
+
+    "mov         %0, %2                        \n"
+
+    MEMACCESS(0)
+    "st1     {v3.8b}, [%0], %6                 \n"
+    MEMACCESS(0)
+    "st1     {v2.8b}, [%0]                     \n"
+
+    "add         %1, %1, #2                    \n"  // src += 2
+    "add         %2, %2, %6, lsl #1            \n"  // dst += 2 * dst_stride
+    "subs        %3, %3, #2                    \n"  // w   -= 2
+    "b.eq        4f                            \n"
+
+    // 1x8 block
+    "3:                                        \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[0], [%1], %5           \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[1], [%1], %5           \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[2], [%1], %5           \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[3], [%1], %5           \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[4], [%1], %5           \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[5], [%1], %5           \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[6], [%1], %5           \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[7], [%1]               \n"
+
+    MEMACCESS(2)
+    "st1         {v0.8b}, [%2]                 \n"
+
+    "4:                                        \n"
+
+    : "+r"(src_temp),                             // %0
+      "+r"(src),                                  // %1
+      "+r"(dst),                                  // %2
+      "+r"(width64)                               // %3
+    : "r"(&kVTbl4x4Transpose),                    // %4
+      "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+      "r"(static_cast<ptrdiff_t>(dst_stride))     // %6
+    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+      "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+  );
+}
+
+static uint8 kVTbl4x4TransposeDi[32] =
+  { 0, 16, 32, 48,  2, 18, 34, 50,  4, 20, 36, 52,  6, 22, 38, 54,
+    1, 17, 33, 49,  3, 19, 35, 51,  5, 21, 37, 53,  7, 23, 39, 55};
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int width) {
+  const uint8* src_temp = NULL;
+  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
+  asm volatile (
+    // loops are on blocks of 8. loop will stop when
+    // counter gets to or below 0. starting the counter
+    // at w-8 allows for this
+    "sub       %4, %4, #8                      \n"
+
+    // handle 8x8 blocks.
this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "ld1 {v0.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v2.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v3.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v4.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v5.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v6.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v7.16b}, [%0] \n" + + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" + + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" + + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v16.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v17.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v19.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.d}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.d}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v17.d}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v19.d}[1], [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "st1 {v20.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v22.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v21.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v23.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v20.d}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v22.d}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v21.d}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v23.d}[1], [%0] \n" + + "add %1, %1, #16 \n" // src += 8*2 + "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a + "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b + "subs %4, %4, #8 \n" // w -= 8 + "b.ge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %4, %4, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %4, #2 \n" + "b.lt 3f \n" + + "cmp %4, #4 \n" + "b.lt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v3.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v4.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v5.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v6.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v7.8b}, [%0] \n" + + MEMACCESS(8) + "ld1 {v30.16b}, [%8], #16 \n" + "ld1 {v31.16b}, [%8] \n" + + "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" + "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" + "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" + "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v16.s}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.s}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.s}[2], [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.s}[3], [%0], %6 \n" + + "add %0, %2, #4 \n" + MEMACCESS(0) + "st1 {v18.s}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.s}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.s}[2], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.s}[3], [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "st1 {v17.s}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v17.s}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v17.s}[2], [%0], %7 \n" + MEMACCESS(0) + "st1 {v17.s}[3], [%0], %7 \n" + + "add %0, %3, #4 \n" + MEMACCESS(0) + "st1 {v19.s}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v19.s}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v19.s}[2], [%0], %7 \n" + MEMACCESS(0) + "st1 {v19.s}[3], [%0] \n" + + "add %1, %1, #8 \n" // src += 4 * 2 + "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a + "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b + "subs %4, %4, #4 \n" // w -= 4 + "b.eq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %4, #2 \n" + "b.lt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "ld2 {v0.h, v1.h}[0], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v2.h, v3.h}[0], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v0.h, v1.h}[1], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v2.h, v3.h}[1], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v0.h, v1.h}[2], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v2.h, v3.h}[2], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v0.h, v1.h}[3], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v2.h, v3.h}[3], [%0] \n" + + "trn1 v4.8b, v0.8b, v2.8b \n" + "trn2 v5.8b, v0.8b, v2.8b \n" + "trn1 v6.8b, v1.8b, v3.8b \n" + "trn2 v7.8b, v1.8b, v3.8b \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v4.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v6.d}[0], [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "st1 {v5.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v7.d}[0], [%0] \n" + + "add %1, %1, #4 \n" // src += 2 * 2 + "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a + "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b + "subs %4, %4, #2 \n" // w -= 2 + "b.eq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[0], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[1], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[2], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[3], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[4], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[5], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[6], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[7], [%1] \n" + + MEMACCESS(2) + "st1 {v0.d}[0], 
[%2]                \n"
+    MEMACCESS(3)
+    "st1 {v1.d}[0], [%3]                       \n"
+
+    "4:                                        \n"
+
+    : "+r"(src_temp),                             // %0
+      "+r"(src),                                  // %1
+      "+r"(dst_a),                                // %2
+      "+r"(dst_b),                                // %3
+      "+r"(width64)                               // %4
+    : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+      "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
+      "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
+      "r"(&kVTbl4x4TransposeDi)                   // %8
+    : "memory", "cc",
+      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+      "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+      "v30", "v31"
+  );
+}
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/rotate_win.cc b/libs/libaom/src/third_party/libyuv/source/rotate_win.cc
new file mode 100644
index 000000000..2760066df
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/rotate_win.cc
@@ -0,0 +1,248 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    defined(_MSC_VER) && !defined(__clang__)
+
+__declspec(naked)
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width) {
+  __asm {
+    push      edi
+    push      esi
+    push      ebp
+    mov       eax, [esp + 12 + 4]   // src
+    mov       edi, [esp + 12 + 8]   // src_stride
+    mov       edx, [esp + 12 + 12]  // dst
+    mov       esi, [esp + 12 + 16]  // dst_stride
+    mov       ecx, [esp + 12 + 20]  // width
+
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    align      4
+ convertloop:
+    movq      xmm0, qword ptr [eax]
+    lea       ebp, [eax + 8]
+    movq      xmm1, qword ptr [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm0, xmm1
+    movq      xmm2, qword ptr [eax]
+    movdqa    xmm1, xmm0
+    palignr   xmm1, xmm1, 8
+    movq      xmm3, qword ptr [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm2, xmm3
+    movdqa    xmm3, xmm2
+    movq      xmm4, qword ptr [eax]
+    palignr   xmm3, xmm3, 8
+    movq      xmm5, qword ptr [eax + edi]
+    punpcklbw xmm4, xmm5
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm5, xmm4
+    movq      xmm6, qword ptr [eax]
+    palignr   xmm5, xmm5, 8
+    movq      xmm7, qword ptr [eax + edi]
+    punpcklbw xmm6, xmm7
+    mov       eax, ebp
+    movdqa    xmm7, xmm6
+    palignr   xmm7, xmm7, 8
+    // Second round of bit swap.
+    punpcklwd xmm0, xmm2
+    punpcklwd xmm1, xmm3
+    movdqa    xmm2, xmm0
+    movdqa    xmm3, xmm1
+    palignr   xmm2, xmm2, 8
+    palignr   xmm3, xmm3, 8
+    punpcklwd xmm4, xmm6
+    punpcklwd xmm5, xmm7
+    movdqa    xmm6, xmm4
+    movdqa    xmm7, xmm5
+    palignr   xmm6, xmm6, 8
+    palignr   xmm7, xmm7, 8
+    // Third round of bit swap.
+    // Write to the destination pointer.
+ punpckldq xmm0, xmm4 + movq qword ptr [edx], xmm0 + movdqa xmm4, xmm0 + palignr xmm4, xmm4, 8 + movq qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + punpckldq xmm2, xmm6 + movdqa xmm6, xmm2 + palignr xmm6, xmm6, 8 + movq qword ptr [edx], xmm2 + punpckldq xmm1, xmm5 + movq qword ptr [edx + esi], xmm6 + lea edx, [edx + 2 * esi] + movdqa xmm5, xmm1 + movq qword ptr [edx], xmm1 + palignr xmm5, xmm5, 8 + punpckldq xmm3, xmm7 + movq qword ptr [edx + esi], xmm5 + lea edx, [edx + 2 * esi] + movq qword ptr [edx], xmm3 + movdqa xmm7, xmm3 + palignr xmm7, xmm7, 8 + sub ecx, 8 + movq qword ptr [edx + esi], xmm7 + lea edx, [edx + 2 * esi] + jg convertloop + + pop ebp + pop esi + pop edi + ret + } +} + +__declspec(naked) +void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w) { + __asm { + push ebx + push esi + push edi + push ebp + mov eax, [esp + 16 + 4] // src + mov edi, [esp + 16 + 8] // src_stride + mov edx, [esp + 16 + 12] // dst_a + mov esi, [esp + 16 + 16] // dst_stride_a + mov ebx, [esp + 16 + 20] // dst_b + mov ebp, [esp + 16 + 24] // dst_stride_b + mov ecx, esp + sub esp, 4 + 16 + and esp, ~15 + mov [esp + 16], ecx + mov ecx, [ecx + 16 + 28] // w + + align 4 + convertloop: + // Read in the data from the source pointer. + // First round of bit swap. + movdqu xmm0, [eax] + movdqu xmm1, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm0 // use xmm7 as temp register. + punpcklbw xmm0, xmm1 + punpckhbw xmm7, xmm1 + movdqa xmm1, xmm7 + movdqu xmm2, [eax] + movdqu xmm3, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm2 + punpcklbw xmm2, xmm3 + punpckhbw xmm7, xmm3 + movdqa xmm3, xmm7 + movdqu xmm4, [eax] + movdqu xmm5, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm4 + punpcklbw xmm4, xmm5 + punpckhbw xmm7, xmm5 + movdqa xmm5, xmm7 + movdqu xmm6, [eax] + movdqu xmm7, [eax + edi] + lea eax, [eax + 2 * edi] + movdqu [esp], xmm5 // backup xmm5 + neg edi + movdqa xmm5, xmm6 // use xmm5 as temp register. + punpcklbw xmm6, xmm7 + punpckhbw xmm5, xmm7 + movdqa xmm7, xmm5 + lea eax, [eax + 8 * edi + 16] + neg edi + // Second round of bit swap. + movdqa xmm5, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm5, xmm2 + movdqa xmm2, xmm5 + movdqa xmm5, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm5, xmm3 + movdqa xmm3, xmm5 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm6 + punpckhwd xmm5, xmm6 + movdqa xmm6, xmm5 + movdqu xmm5, [esp] // restore xmm5 + movdqu [esp], xmm6 // backup xmm6 + movdqa xmm6, xmm5 // use xmm6 as temp register. + punpcklwd xmm5, xmm7 + punpckhwd xmm6, xmm7 + movdqa xmm7, xmm6 + // Third round of bit swap. + // Write to the destination pointer. + movdqa xmm6, xmm0 + punpckldq xmm0, xmm4 + punpckhdq xmm6, xmm4 + movdqa xmm4, xmm6 + movdqu xmm6, [esp] // restore xmm6 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [ebx], xmm0 + movlpd qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm4 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm2 // use xmm0 as the temp register. + punpckldq xmm2, xmm6 + movlpd qword ptr [edx], xmm2 + movhpd qword ptr [ebx], xmm2 + punpckhdq xmm0, xmm6 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm1 // use xmm0 as the temp register. 
+    punpckldq xmm1, xmm5
+    movlpd    qword ptr [edx], xmm1
+    movhpd    qword ptr [ebx], xmm1
+    punpckhdq xmm0, xmm5
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
+    punpckldq xmm3, xmm7
+    movlpd    qword ptr [edx], xmm3
+    movhpd    qword ptr [ebx], xmm3
+    punpckhdq xmm0, xmm7
+    sub       ecx, 8
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    jg        convertloop
+
+    mov       esp, [esp + 16]
+    pop       ebp
+    pop       edi
+    pop       esi
+    pop       ebx
+    ret
+  }
+}
+
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/row_any.cc b/libs/libaom/src/third_party/libyuv/source/row_any.cc
new file mode 100644
index 000000000..1cb1f6b93
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/row_any.cc
@@ -0,0 +1,680 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h>  // For memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Subsampled source needs to be increased by 1 if not even.
+#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
+
+// Any 3 planes to 1.
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                 \
+    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
+                 uint8* dst_ptr, int width) {                                  \
+      SIMD_ALIGNED(uint8 temp[64 * 4]);                                        \
+      memset(temp, 0, 64 * 3);  /* for YUY2 and msan */                        \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n);                             \
+      }                                                                        \
+      memcpy(temp, y_buf + n, r);                                              \
+      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
+      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
+      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1);             \
+      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \
+             SS(r, DUVSHIFT) * BPP);                                           \
+    }
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+ANY31(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I444TOARGBROW_SSSE3
+ANY31(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
+ANY31(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
+ANY31(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
+ANY31(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, 1, 0, 3, 7)
+ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
+ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
+#endif  // HAS_I444TOARGBROW_SSSE3
+#ifdef HAS_I422TORGB24ROW_AVX2
+ANY31(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
+#endif
+#ifdef 
HAS_I422TORAWROW_AVX2 +ANY31(I422ToRAWRow_Any_AVX2, I422ToRAWRow_AVX2, 1, 0, 3, 15) +#endif +#ifdef HAS_J422TOARGBROW_SSSE3 +ANY31(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_J422TOARGBROW_AVX2 +ANY31(J422ToARGBRow_Any_AVX2, J422ToARGBRow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I422TOARGBROW_AVX2 +ANY31(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I422TOBGRAROW_AVX2 +ANY31(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I422TORGBAROW_AVX2 +ANY31(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I422TOABGRROW_AVX2 +ANY31(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I444TOARGBROW_AVX2 +ANY31(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) +#endif +#ifdef HAS_I411TOARGBROW_AVX2 +ANY31(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15) +#endif +#ifdef HAS_I422TOARGB4444ROW_AVX2 +ANY31(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TOARGB1555ROW_AVX2 +ANY31(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TORGB565ROW_AVX2 +ANY31(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TOARGBROW_NEON +ANY31(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) +ANY31(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) +ANY31(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7) +ANY31(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, 1, 0, 4, 7) +ANY31(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, 1, 0, 4, 7) +ANY31(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7) +ANY31(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7) +ANY31(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, 1, 0, 3, 7) +ANY31(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7) +ANY31(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7) +ANY31(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TOYUY2ROW_NEON +ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) +#endif +#ifdef HAS_I422TOUYVYROW_NEON +ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) +#endif +#undef ANY31 + +// Any 2 planes to 1. +#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \ + uint8* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } + +// Biplanar to RGB. 
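+// Illustrative sketch (editor's note, not part of the upstream libyuv
+// source): what the ANY21 macro above expands to for the biplanar NV12
+// converters below, written out by hand for NV12ToARGBRow_Any_SSSE3
+// (UVSHIFT=1, SBPP=1, SBPP2=2, BPP=4, MASK=7). The SIMD kernel only ever
+// sees widths that are a multiple of 8; the odd tail is staged through a
+// zeroed, aligned scratch buffer.
+#if 0  // for illustration only; the real definition comes from ANY21 above
+void NV12ToARGBRow_Any_SSSE3(const uint8* y_buf, const uint8* uv_buf,
+                             uint8* dst_ptr, int width) {
+  SIMD_ALIGNED(uint8 temp[64 * 3]);
+  memset(temp, 0, 64 * 2);               /* zero padding keeps msan happy */
+  int r = width & 7;                     /* leftover pixels: 0..7 */
+  int n = width & ~7;                    /* largest multiple of 8 */
+  if (n > 0) {
+    NV12ToARGBRow_SSSE3(y_buf, uv_buf, dst_ptr, n);       /* fast bulk pass */
+  }
+  memcpy(temp, y_buf + n, r);                             /* stage Y tail */
+  memcpy(temp + 64, uv_buf + (n >> 1) * 2, SS(r, 1) * 2); /* stage UV tail */
+  NV12ToARGBRow_SSSE3(temp, temp + 64, temp + 128, 8);    /* padded group */
+  memcpy(dst_ptr + n * 4, temp + 128, r * 4);             /* keep r pixels */
+}
+#endif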
+#ifdef HAS_NV12TOARGBROW_SSSE3 +ANY21(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7) +ANY21(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TOARGBROW_AVX2 +ANY21(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) +ANY21(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) +#endif +#ifdef HAS_NV12TOARGBROW_NEON +ANY21(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) +ANY21(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_SSSE3 +ANY21(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) +ANY21(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_AVX2 +ANY21(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) +ANY21(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, 1, 1, 2, 2, 15) +#endif +#ifdef HAS_NV12TORGB565ROW_NEON +ANY21(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7) +ANY21(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, 1, 1, 2, 2, 7) +#endif + +// Merge functions. +#ifdef HAS_MERGEUVROW_SSE2 +ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) +#endif +#ifdef HAS_MERGEUVROW_AVX2 +ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) +#endif +#ifdef HAS_MERGEUVROW_NEON +ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) +#endif + +// Math functions. +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBADDROW_SSE2 +ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBADDROW_AVX2 +ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBMULTIPLYROW_NEON +ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBADDROW_NEON +ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_NEON +ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_SOBELROW_SSE2 +ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELROW_NEON +ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) +#endif +#ifdef HAS_SOBELTOPLANEROW_SSE2 +ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) +#endif +#ifdef HAS_SOBELTOPLANEROW_NEON +ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) +#endif +#ifdef HAS_SOBELXYROW_SSE2 +ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELXYROW_NEON +ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) +#endif +#undef ANY21 + +// Any 1 to 1. 
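+// Editor's sketch of how callers are expected to consume these Any wrappers
+// (assumption: modeled on libyuv's planar_functions.cc dispatch; the local
+// function-pointer wiring is illustrative, not copied from this patch).
+// A caller starts from the C row function, upgrades to the Any wrapper when
+// the CPU supports the SIMD path, and upgrades again to the raw kernel only
+// when the width is already a whole number of SIMD groups:
+#if 0
+  void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1,
+                          uint8* dst, int width) = ARGBMultiplyRow_C;
+#if defined(HAS_ARGBMULTIPLYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;  /* any width (MASK = 3) */
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMultiplyRow = ARGBMultiplyRow_SSE2;    /* whole groups only */
+    }
+  }
+#endif
+#endif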
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8 temp[128 * 2]); \ + memset(temp, 0, 128); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } + +#ifdef HAS_COPYROW_AVX +ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) +#endif +#ifdef HAS_COPYROW_SSE2 +ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31) +#endif +#ifdef HAS_COPYROW_NEON +ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31) +#endif +#if defined(HAS_ARGBTORGB24ROW_SSSE3) +ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3) +ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) +ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) +#endif +#if defined(HAS_ARGBTOARGB4444ROW_AVX2) +ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) +#endif +#if defined(HAS_J400TOARGBROW_SSE2) +ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) +#endif +#if defined(HAS_J400TOARGBROW_AVX2) +ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15) +#endif +#if defined(HAS_I400TOARGBROW_SSE2) +ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7) +#endif +#if defined(HAS_I400TOARGBROW_AVX2) +ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15) +#endif +#if defined(HAS_YUY2TOARGBROW_SSSE3) +ANY11(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) +ANY11(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) +ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) +ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) +ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7) +ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7) +ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) +ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) +ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) +ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_YUY2TOARGBROW_AVX2) +ANY11(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31) +ANY11(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) +#endif +#if defined(HAS_ARGBTORGB24ROW_NEON) +ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7) +ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7) +ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) +ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) +ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7) +ANY11(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) +ANY11(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) +#endif +#ifdef HAS_ARGBTOYROW_AVX2 
+ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef HAS_ARGBTOYJROW_AVX2 +ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef HAS_UYVYTOYROW_AVX2 +ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31) +#endif +#ifdef HAS_YUY2TOYROW_AVX2 +ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31) +#endif +#ifdef HAS_ARGBTOYROW_SSSE3 +ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_BGRATOYROW_SSSE3 +ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) +ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) +ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15) +ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15) +ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYJROW_SSSE3 +ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYROW_NEON +ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_ARGBTOYJROW_NEON +ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_BGRATOYROW_NEON +ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_ABGRTOYROW_NEON +ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_RGBATOYROW_NEON +ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_RGB24TOYROW_NEON +ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7) +#endif +#ifdef HAS_RAWTOYROW_NEON +ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) +#endif +#ifdef HAS_RGB565TOYROW_NEON +ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) +#endif +#ifdef HAS_ARGB1555TOYROW_NEON +ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) +#endif +#ifdef HAS_ARGB4444TOYROW_NEON +ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) +#endif +#ifdef HAS_YUY2TOYROW_NEON +ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) +#endif +#ifdef HAS_UYVYTOYROW_NEON +ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15) +#endif +#ifdef HAS_RGB24TOARGBROW_NEON +ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) +#endif +#ifdef HAS_RAWTOARGBROW_NEON +ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7) +#endif +#ifdef HAS_RGB565TOARGBROW_NEON +ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) +#endif +#ifdef HAS_ARGB1555TOARGBROW_NEON +ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) +#endif +#ifdef HAS_ARGB4444TOARGBROW_NEON +ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) +#endif +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) +#endif +#ifdef HAS_ARGBATTENUATEROW_SSE2 +ANY11(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, 0, 4, 4, 3) +#endif +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3) +#endif +#ifdef HAS_ARGBATTENUATEROW_AVX2 +ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBATTENUATEROW_NEON +ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) +#endif +#undef ANY11 + +// Any 1 to 1 with parameter. 
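+// Editor's worked example (not from upstream): how the ANY11 bookkeeping
+// above plays out for YUY2ToYRow_Any_AVX2 (UVSHIFT=1, SBPP=4, BPP=1,
+// MASK=31) at width = 100 pixels:
+//   n = 100 & ~31 = 96  -> YUY2ToYRow_AVX2 converts 96 pixels directly;
+//   r = 100 &  31 =  4  -> the tail copy stages SS(4, 1) * 4 = 2 * 4 = 8
+//                          source bytes (two YUY2 pixel pairs) into temp,
+//   then one padded 32-pixel SIMD pass runs over the zeroed buffer and
+//   exactly r = 4 luma bytes are copied back. The memset at the top is what
+//   makes reading the zero-padded YUY2 bytes safe under msan.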
+#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ + T shuffler, int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + } + +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) +ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2, + const uint32, 4, 2, 3) +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) +ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2, + const uint32, 4, 2, 7) +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) +ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON, + const uint32, 4, 2, 7) +#endif +#ifdef HAS_ARGBSHUFFLEROW_SSE2 +ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3) +#endif +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 +ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7) +#endif +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15) +#endif +#ifdef HAS_ARGBSHUFFLEROW_NEON +ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3) +#endif +#undef ANY11P + +// Any 1 to 1 interpolate. Takes 2 rows of source via stride. +#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ + ptrdiff_t src_stride_ptr, int width, \ + int source_y_fraction) { \ + SIMD_ALIGNED(uint8 temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } + +#ifdef HAS_INTERPOLATEROW_AVX2 +ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) +#endif +#ifdef HAS_INTERPOLATEROW_SSSE3 +ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_SSE2 +ANY11T(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_NEON +ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2 +ANY11T(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, 1, 1, 3) +#endif +#undef ANY11T + +// Any 1 to 1 mirror. 
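+// Editor's sketch (illustration only, not upstream code): the ANY11T
+// interpolator above is the one wrapper that stages *two* source rows,
+// because the kernel reads both src_ptr and src_ptr + src_stride_ptr. Hand
+// expansion of the tail handling for InterpolateRow_SSSE3 (SBPP=1, BPP=1,
+// MASK=15):
+#if 0
+  memcpy(temp, src_ptr + n, r);                        /* row 0 tail */
+  memcpy(temp + 64, src_ptr + src_stride_ptr + n, r);  /* row 1 tail */
+  /* Note the literal stride of 64: inside temp, row 1 starts 64 bytes
+     after row 0, so the kernel blends the right vertical pairs. */
+  InterpolateRow_SSSE3(temp + 128, temp, 64, 16, source_y_fraction);
+  memcpy(dst_ptr + n, temp + 128, r);
+#endif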
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr, r * BPP); \ + ANY_SIMD(temp, temp + 64, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ + } + +#ifdef HAS_MIRRORROW_AVX2 +ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) +#endif +#ifdef HAS_MIRRORROW_SSSE3 +ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) +#endif +#ifdef HAS_MIRRORROW_SSE2 +ANY11M(MirrorRow_Any_SSE2, MirrorRow_SSE2, 1, 15) +#endif +#ifdef HAS_MIRRORROW_NEON +ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) +#endif +#ifdef HAS_ARGBMIRRORROW_AVX2 +ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) +#endif +#ifdef HAS_ARGBMIRRORROW_SSE2 +ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) +#endif +#ifdef HAS_ARGBMIRRORROW_NEON +ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3) +#endif +#undef ANY11M + +// Any 1 plane. (memset) +#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(uint8* dst_ptr, T v32, int width) { \ + SIMD_ALIGNED(uint8 temp[64]); \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, v32, n); \ + } \ + ANY_SIMD(temp, v32, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp, r * BPP); \ + } + +#ifdef HAS_SETROW_X86 +ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3) +#endif +#ifdef HAS_SETROW_NEON +ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15) +#endif +#ifdef HAS_ARGBSETROW_NEON +ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3) +#endif +#undef ANY1 + +// Any 1 to 2. Outputs UV planes. 
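+// Editor's worked example (not from upstream): the ANY11M mirror wrapper
+// above is the only variant that offsets the *source* before the SIMD call,
+// because mirroring reverses pixel order. For MirrorRow_Any_SSE2 (BPP=1,
+// MASK=15) at width = 20:
+//   r = 4, n = 16: the SIMD pass mirrors src[4..20) into dst[0..16);
+//   the first 4 source bytes are copied into temp, mirrored as one full
+//   16-byte group, and only the last r = 4 bytes of that mirrored group,
+//   at temp + 64 + (16 - 4) * 1, are valid output for dst[16..20).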
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\ + SIMD_ALIGNED(uint8 temp[128 * 3]); \ + memset(temp, 0, 128); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \ + memcpy(temp + SS(r, UVSHIFT) * BPP, \ + temp + SS(r, UVSHIFT) * BPP - BPP, 4); \ + } \ + ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ + memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ + memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ + } + +#ifdef HAS_SPLITUVROW_SSE2 +ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15) +#endif +#ifdef HAS_SPLITUVROW_AVX2 +ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31) +#endif +#ifdef HAS_SPLITUVROW_NEON +ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) +#endif +#ifdef HAS_SPLITUVROW_MIPS_DSPR2 +ANY12(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2, 0, 2, 0, 15) +#endif +#ifdef HAS_ARGBTOUV444ROW_SSSE3 +ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) +#endif +#ifdef HAS_YUY2TOUV422ROW_AVX2 +ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31) +ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31) +#endif +#ifdef HAS_ARGBTOUV422ROW_SSSE3 +ANY12(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOUV422ROW_SSE2 +ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15) +ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOUV422ROW_NEON +ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) +ANY12(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, 0, 4, 1, 15) +ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31) +ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) +ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) +#endif +#undef ANY12 + +// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. +// 128 byte row allows for 32 avx ARGB pixels. 
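+// Editor's worked example (illustration, not upstream): the odd-width
+// fix-up in ANY12 above. For ARGBToUV422Row_Any_SSSE3 (UVSHIFT=0, BPP=4,
+// DUVSHIFT=1, MASK=15) at width = 7: n = 0, r = 7, so all 7 * 4 = 28 tail
+// bytes are staged into temp. Because width is odd, the 4-byte memcpy
+// duplicates the last ARGB pixel at temp + 28, so the final U/V sample
+// averages a real horizontal pair instead of zero padding. The outputs then
+// copy back SS(7, 1) = 4 bytes each into dst_u and dst_v.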
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, int src_stride_ptr, \ + uint8* dst_u, uint8* dst_v, int width) { \ + SIMD_ALIGNED(uint8 temp[128 * 4]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \ + memcpy(temp + SS(r, UVSHIFT) * BPP, \ + temp + SS(r, UVSHIFT) * BPP - BPP, 4); \ + memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ + temp + 128 + SS(r, UVSHIFT) * BPP - BPP, 4); \ + } \ + ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ + memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ + memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ + } + +#ifdef HAS_ARGBTOUVROW_AVX2 +ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVROW_SSSE3 +ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) +ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) +ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) +ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) +ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) +#endif +#ifdef HAS_YUY2TOUVROW_AVX2 +ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31) +ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31) +#endif +#ifdef HAS_YUY2TOUVROW_SSE2 +ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15) +ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15) +#endif +#ifdef HAS_ARGBTOUVROW_NEON +ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_ARGBTOUVJROW_NEON +ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_BGRATOUVROW_NEON +ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_ABGRTOUVROW_NEON +ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_RGBATOUVROW_NEON +ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_RGB24TOUVROW_NEON +ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15) +#endif +#ifdef HAS_RAWTOUVROW_NEON +ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15) +#endif +#ifdef HAS_RGB565TOUVROW_NEON +ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_ARGB1555TOUVROW_NEON +ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_ARGB4444TOUVROW_NEON +ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_YUY2TOUVROW_NEON +ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15) +#endif +#ifdef HAS_UYVYTOUVROW_NEON +ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) +#endif +#undef ANY12S + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/row_common.cc b/libs/libaom/src/third_party/libyuv/source/row_common.cc new file mode 100644 index 000000000..49875894f --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/row_common.cc @@ -0,0 +1,2576 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h>  // For memcpy and memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// llvm x86 is poor at ternary operator, so use branchless min/max.
+
+#define USE_BRANCHLESS 1
+#if USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+  return ((-(v) >> 31) & (v));
+}
+
+static __inline int32 clamp255(int32 v) {
+  return (((255 - (v)) >> 31) | (v)) & 255;
+}
+
+static __inline uint32 Clamp(int32 val) {
+  int v = clamp0(val);
+  return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+  int m = v >> 31;
+  return (v + m) ^ m;
+}
+#else  // USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+  return (v < 0) ? 0 : v;
+}
+
+static __inline int32 clamp255(int32 v) {
+  return (v > 255) ? 255 : v;
+}
+
+static __inline uint32 Clamp(int32 val) {
+  int v = clamp0(val);
+  return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+  return (v < 0) ? -v : v;
+}
+#endif  // USE_BRANCHLESS
+
+#ifdef LIBYUV_LITTLE_ENDIAN
+#define WRITEWORD(p, v) *(uint32*)(p) = v
+#else
+static inline void WRITEWORD(uint8* p, uint32 v) {
+  p[0] = (uint8)(v & 255);
+  p[1] = (uint8)((v >> 8) & 255);
+  p[2] = (uint8)((v >> 16) & 255);
+  p[3] = (uint8)((v >> 24) & 255);
+}
+#endif
+
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_rgb24[0];
+    uint8 g = src_rgb24[1];
+    uint8 r = src_rgb24[2];
+    dst_argb[0] = b;
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+    src_rgb24 += 3;
+  }
+}
+
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 r = src_raw[0];
+    uint8 g = src_raw[1];
+    uint8 b = src_raw[2];
+    dst_argb[0] = b;
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+    src_raw += 3;
+  }
+}
+
+void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_rgb565[0] & 0x1f;
+    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r = src_rgb565[1] >> 3;
+    dst_argb[0] = (b << 3) | (b >> 2);
+    dst_argb[1] = (g << 2) | (g >> 4);
+    dst_argb[2] = (r << 3) | (r >> 2);
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+    src_rgb565 += 2;
+  }
+}
+
+void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
+                         int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb1555[0] & 0x1f;
+    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+    uint8 a = src_argb1555[1] >> 7;
+    dst_argb[0] = (b << 3) | (b >> 2);
+    dst_argb[1] = (g << 3) | (g >> 2);
+    dst_argb[2] = (r << 3) | (r >> 2);
+    dst_argb[3] = -a;
+    dst_argb += 4;
+    src_argb1555 += 2;
+  }
+}
+
+void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
+                         int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb4444[0] & 0x0f;
+    uint8 g = src_argb4444[0] >> 4;
+    uint8 r = src_argb4444[1] & 0x0f;
+    uint8 a = src_argb4444[1] >> 4;
+    dst_argb[0] = (b << 4) | b;
+    dst_argb[1] = (g << 4) | g;
+    dst_argb[2] = (r << 4) | r;
+    dst_argb[3] = (a << 4) | a;
+    dst_argb += 4;
+    src_argb4444 += 2;
+  }
+}
+
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb[0];
+    uint8 g = src_argb[1];
+    uint8 r = 
src_argb[2]; + dst_rgb[0] = b; + dst_rgb[1] = g; + dst_rgb[2] = r; + dst_rgb += 3; + src_argb += 4; + } +} + +void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb[0]; + uint8 g = src_argb[1]; + uint8 r = src_argb[2]; + dst_rgb[0] = r; + dst_rgb[1] = g; + dst_rgb[2] = b; + dst_rgb += 3; + src_argb += 4; + } +} + +void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 2; + uint8 r0 = src_argb[2] >> 3; + uint8 b1 = src_argb[4] >> 3; + uint8 g1 = src_argb[5] >> 2; + uint8 r1 = src_argb[6] >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27)); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 2; + uint8 r0 = src_argb[2] >> 3; + *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + } +} + +// dither4 is a row of 4 values from 4x4 dither matrix. +// The 4x4 matrix contains values to increase RGB. When converting to +// fewer bits (565) this provides an ordered dither. +// The order in the 4x4 matrix in first byte is upper left. +// The 4 values are passed as an int, then referenced as an array, so +// endian will not affect order of the original matrix. But the dither4 +// will containing the first pixel in the lower byte for little endian +// or the upper byte for big endian. +void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + int dither0 = ((const unsigned char*)(&dither4))[x & 3]; + int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3]; + uint8 b0 = clamp255(src_argb[0] + dither0) >> 3; + uint8 g0 = clamp255(src_argb[1] + dither0) >> 2; + uint8 r0 = clamp255(src_argb[2] + dither0) >> 3; + uint8 b1 = clamp255(src_argb[4] + dither1) >> 3; + uint8 g1 = clamp255(src_argb[5] + dither1) >> 2; + uint8 r1 = clamp255(src_argb[6] + dither1) >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27)); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3]; + uint8 b0 = clamp255(src_argb[0] + dither0) >> 3; + uint8 g0 = clamp255(src_argb[1] + dither0) >> 2; + uint8 r0 = clamp255(src_argb[2] + dither0) >> 3; + *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 3; + uint8 r0 = src_argb[2] >> 3; + uint8 a0 = src_argb[3] >> 7; + uint8 b1 = src_argb[4] >> 3; + uint8 g1 = src_argb[5] >> 3; + uint8 r1 = src_argb[6] >> 3; + uint8 a1 = src_argb[7] >> 7; + *(uint32*)(dst_rgb) = + b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | + (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 3; + uint8 r0 = src_argb[2] >> 3; + uint8 a0 = src_argb[3] >> 7; + *(uint16*)(dst_rgb) = + b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); + } +} + +void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb[0] >> 4; + uint8 g0 = src_argb[1] >> 4; + uint8 r0 = src_argb[2] >> 4; + uint8 a0 = src_argb[3] >> 4; + uint8 b1 = src_argb[4] >> 4; 
+ uint8 g1 = src_argb[5] >> 4; + uint8 r1 = src_argb[6] >> 4; + uint8 a1 = src_argb[7] >> 4; + *(uint32*)(dst_rgb) = + b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | + (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8 b0 = src_argb[0] >> 4; + uint8 g0 = src_argb[1] >> 4; + uint8 r0 = src_argb[2] >> 4; + uint8 a0 = src_argb[3] >> 4; + *(uint16*)(dst_rgb) = + b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + } +} + +static __inline int RGBToY(uint8 r, uint8 g, uint8 b) { + return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; +} + +static __inline int RGBToU(uint8 r, uint8 g, uint8 b) { + return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; +} +static __inline int RGBToV(uint8 r, uint8 g, uint8 b) { + return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; +} + +#define MAKEROWY(NAME, R, G, B, BPP) \ +void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ +} \ +void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + \ + src_rgb1[B] + src_rgb1[B + BPP]) >> 2; \ + uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + \ + src_rgb1[G] + src_rgb1[G + BPP]) >> 2; \ + uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + \ + src_rgb1[R] + src_rgb1[R + BPP]) >> 2; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ + uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ + uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ +} + +MAKEROWY(ARGB, 2, 1, 0, 4) +MAKEROWY(BGRA, 1, 2, 3, 4) +MAKEROWY(ABGR, 0, 1, 2, 4) +MAKEROWY(RGBA, 3, 2, 1, 4) +MAKEROWY(RGB24, 2, 1, 0, 3) +MAKEROWY(RAW, 0, 1, 2, 3) +#undef MAKEROWY + +// JPeg uses a variation on BT.601-1 full range +// y = 0.29900 * r + 0.58700 * g + 0.11400 * b +// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center +// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center +// BT.601 Mpeg range uses: +// b 0.1016 * 255 = 25.908 = 25 +// g 0.5078 * 255 = 129.489 = 129 +// r 0.2578 * 255 = 65.739 = 66 +// JPeg 8 bit Y (not used): +// b 0.11400 * 256 = 29.184 = 29 +// g 0.58700 * 256 = 150.272 = 150 +// r 0.29900 * 256 = 76.544 = 77 +// JPeg 7 bit Y: +// b 0.11400 * 128 = 14.592 = 15 +// g 0.58700 * 128 = 75.136 = 75 +// r 0.29900 * 128 = 38.272 = 38 +// JPeg 8 bit U: +// b 0.50000 * 255 = 127.5 = 127 +// g -0.33126 * 255 = -84.4713 = -84 +// r -0.16874 * 255 = -43.0287 = -43 +// JPeg 8 bit V: +// b -0.08131 * 255 = -20.73405 = -20 +// g -0.41869 * 255 = -106.76595 = -107 +// r 0.50000 * 255 = 127.5 = 127 + +static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) { + return (38 * r + 75 * g + 15 * b + 64) >> 7; +} + +static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) { + return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; +} +static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) { + return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; +} + +#define AVGB(a, b) (((a) + (b) + 1) >> 1) + +#define MAKEROWYJ(NAME, R, G, B, BPP) \ +void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ + int x; \ + for 
(x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ +} \ +void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ + AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ + uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ + AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ + uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ + AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \ + uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \ + uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ +} + +MAKEROWYJ(ARGB, 2, 1, 0, 4) +#undef MAKEROWYJ + +void ARGBToUVJ422Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 ab = (src_argb[0] + src_argb[4]) >> 1; + uint8 ag = (src_argb[1] + src_argb[5]) >> 1; + uint8 ar = (src_argb[2] + src_argb[6]) >> 1; + dst_u[0] = RGBToUJ(ar, ag, ab); + dst_v[0] = RGBToVJ(ar, ag, ab); + src_argb += 8; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 ab = src_argb[0]; + uint8 ag = src_argb[1]; + uint8 ar = src_argb[2]; + dst_u[0] = RGBToUJ(ar, ag, ab); + dst_v[0] = RGBToVJ(ar, ag, ab); + } +} + +void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_rgb565[0] & 0x1f; + uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8 r = src_rgb565[1] >> 3; + b = (b << 3) | (b >> 2); + g = (g << 2) | (g >> 4); + r = (r << 3) | (r >> 2); + dst_y[0] = RGBToY(r, g, b); + src_rgb565 += 2; + dst_y += 1; + } +} + +void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb1555[0] & 0x1f; + uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8 r = (src_argb1555[1] & 0x7c) >> 2; + b = (b << 3) | (b >> 2); + g = (g << 3) | (g >> 2); + r = (r << 3) | (r >> 2); + dst_y[0] = RGBToY(r, g, b); + src_argb1555 += 2; + dst_y += 1; + } +} + +void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb4444[0] & 0x0f; + uint8 g = src_argb4444[0] >> 4; + uint8 r = src_argb4444[1] & 0x0f; + b = (b << 4) | b; + g = (g << 4) | g; + r = (r << 4) | r; + dst_y[0] = RGBToY(r, g, b); + src_argb4444 += 2; + dst_y += 1; + } +} + +void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_rgb565[0] & 0x1f; + uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8 r0 = src_rgb565[1] >> 3; + uint8 b1 = src_rgb565[2] & 0x1f; + uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); + uint8 r1 = src_rgb565[3] >> 3; + uint8 b2 = next_rgb565[0] & 0x1f; + uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8 r2 = next_rgb565[1] >> 3; + uint8 b3 = next_rgb565[2] & 0x1f; + uint8 g3 = 
(next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); + uint8 r3 = next_rgb565[3] >> 3; + uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. + uint8 g = (g0 + g1 + g2 + g3); + uint8 r = (r0 + r1 + r2 + r3); + b = (b << 1) | (b >> 6); // 787 -> 888. + r = (r << 1) | (r >> 6); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_rgb565 += 4; + next_rgb565 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 b0 = src_rgb565[0] & 0x1f; + uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8 r0 = src_rgb565[1] >> 3; + uint8 b2 = next_rgb565[0] & 0x1f; + uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8 r2 = next_rgb565[1] >> 3; + uint8 b = (b0 + b2); // 565 * 2 = 676. + uint8 g = (g0 + g2); + uint8 r = (r0 + r2); + b = (b << 2) | (b >> 4); // 676 -> 888 + g = (g << 1) | (g >> 6); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb1555[0] & 0x1f; + uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8 b1 = src_argb1555[2] & 0x1f; + uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); + uint8 r1 = (src_argb1555[3] & 0x7c) >> 2; + uint8 b2 = next_argb1555[0] & 0x1f; + uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8 r2 = (next_argb1555[1] & 0x7c) >> 2; + uint8 b3 = next_argb1555[2] & 0x1f; + uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); + uint8 r3 = (next_argb1555[3] & 0x7c) >> 2; + uint8 b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. + uint8 g = (g0 + g1 + g2 + g3); + uint8 r = (r0 + r1 + r2 + r3); + b = (b << 1) | (b >> 6); // 777 -> 888. + g = (g << 1) | (g >> 6); + r = (r << 1) | (r >> 6); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_argb1555 += 4; + next_argb1555 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 b0 = src_argb1555[0] & 0x1f; + uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8 b2 = next_argb1555[0] & 0x1f; + uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8 r2 = (next_argb1555[1] & 0x7c) >> 2; + uint8 b = (b0 + b2); // 555 * 2 = 666. + uint8 g = (g0 + g2); + uint8 r = (r0 + r2); + b = (b << 2) | (b >> 4); // 666 -> 888. + g = (g << 2) | (g >> 4); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb4444[0] & 0x0f; + uint8 g0 = src_argb4444[0] >> 4; + uint8 r0 = src_argb4444[1] & 0x0f; + uint8 b1 = src_argb4444[2] & 0x0f; + uint8 g1 = src_argb4444[2] >> 4; + uint8 r1 = src_argb4444[3] & 0x0f; + uint8 b2 = next_argb4444[0] & 0x0f; + uint8 g2 = next_argb4444[0] >> 4; + uint8 r2 = next_argb4444[1] & 0x0f; + uint8 b3 = next_argb4444[2] & 0x0f; + uint8 g3 = next_argb4444[2] >> 4; + uint8 r3 = next_argb4444[3] & 0x0f; + uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. + uint8 g = (g0 + g1 + g2 + g3); + uint8 r = (r0 + r1 + r2 + r3); + b = (b << 2) | (b >> 4); // 666 -> 888.
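// Every "nnn -> 888" comment above is the same bit-replication trick: shift
// the n-bit value into the high bits and copy its top bits into the vacated
// low bits, which maps 0 to 0 and the n-bit maximum to exactly 255. A
// generic form of the idea (illustrative only, not a libyuv API; n is
// assumed to be 4..7 here):

#include <stdint.h>

static uint8_t ExpandToByte(uint8_t v, int n) {
  // v holds an n-bit channel value in its low bits.
  return (uint8_t)((v << (8 - n)) | (v >> (2 * n - 8)));
}
// ExpandToByte(0x1f, 5) == 0xff and ExpandToByte(0x10, 5) == 0x84,
// matching the (b << 3) | (b >> 2) expansions used in the rows above.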
+ g = (g << 2) | (g >> 4); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_argb4444 += 4; + next_argb4444 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 b0 = src_argb4444[0] & 0x0f; + uint8 g0 = src_argb4444[0] >> 4; + uint8 r0 = src_argb4444[1] & 0x0f; + uint8 b2 = next_argb4444[0] & 0x0f; + uint8 g2 = next_argb4444[0] >> 4; + uint8 r2 = next_argb4444[1] & 0x0f; + uint8 b = (b0 + b2); // 444 * 2 = 555. + uint8 g = (g0 + g2); + uint8 r = (r0 + r2); + b = (b << 3) | (b >> 2); // 555 -> 888. + g = (g << 3) | (g >> 2); + r = (r << 3) | (r >> 2); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGBToUV444Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 ab = src_argb[0]; + uint8 ag = src_argb[1]; + uint8 ar = src_argb[2]; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + src_argb += 4; + dst_u += 1; + dst_v += 1; + } +} + +void ARGBToUV422Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 ab = (src_argb[0] + src_argb[4]) >> 1; + uint8 ag = (src_argb[1] + src_argb[5]) >> 1; + uint8 ar = (src_argb[2] + src_argb[6]) >> 1; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + src_argb += 8; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 ab = src_argb[0]; + uint8 ag = src_argb[1]; + uint8 ar = src_argb[2]; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } +} + +void ARGBToUV411Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width - 3; x += 4) { + uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2; + uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2; + uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + src_argb += 16; + dst_u += 1; + dst_v += 1; + } + if ((width & 3) == 3) { + uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3; + uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3; + uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } else if ((width & 3) == 2) { + uint8 ab = (src_argb[0] + src_argb[4]) >> 1; + uint8 ag = (src_argb[1] + src_argb[5]) >> 1; + uint8 ar = (src_argb[2] + src_argb[6]) >> 1; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } else if ((width & 3) == 1) { + uint8 ab = src_argb[0]; + uint8 ag = src_argb[1]; + uint8 ar = src_argb[2]; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } +} + +void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = src_argb[3]; + dst_argb += 4; + src_argb += 4; + } +} + +// Convert a row of an image to sepia tone. +void ARGBSepiaRow_C(uint8* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int sb = (b * 17 + g * 68 + r * 35) >> 7; + int sg = (b * 22 + g * 88 + r * 45) >> 7; + int sr = (b * 24 + g * 98 + r * 50) >> 7; + // b does not overflow. a is preserved from the original.
+ dst_argb[0] = sb; + dst_argb[1] = clamp255(sg); + dst_argb[2] = clamp255(sr); + dst_argb += 4; + } +} + +// Apply color matrix to a row of image. Matrix is signed. +// TODO(fbarchard): Consider adding rounding (+32). +void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = src_argb[0]; + int g = src_argb[1]; + int r = src_argb[2]; + int a = src_argb[3]; + int sb = (b * matrix_argb[0] + g * matrix_argb[1] + + r * matrix_argb[2] + a * matrix_argb[3]) >> 6; + int sg = (b * matrix_argb[4] + g * matrix_argb[5] + + r * matrix_argb[6] + a * matrix_argb[7]) >> 6; + int sr = (b * matrix_argb[8] + g * matrix_argb[9] + + r * matrix_argb[10] + a * matrix_argb[11]) >> 6; + int sa = (b * matrix_argb[12] + g * matrix_argb[13] + + r * matrix_argb[14] + a * matrix_argb[15]) >> 6; + dst_argb[0] = Clamp(sb); + dst_argb[1] = Clamp(sg); + dst_argb[2] = Clamp(sr); + dst_argb[3] = Clamp(sa); + src_argb += 4; + dst_argb += 4; + } +} + +// Apply color table to a row of image. +void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int a = dst_argb[3]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb[3] = table_argb[a * 4 + 3]; + dst_argb += 4; + } +} + +// Apply color table to a row of image. +void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb += 4; + } +} + +void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; + dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset; + dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset; + dst_argb += 4; + } +} + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v * f >> 24 + +void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + const uint32 b_scale = REPEAT8(value & 0xff); + const uint32 g_scale = REPEAT8((value >> 8) & 0xff); + const uint32 r_scale = REPEAT8((value >> 16) & 0xff); + const uint32 a_scale = REPEAT8(value >> 24); + + int i; + for (i = 0; i < width; ++i) { + const uint32 b = REPEAT8(src_argb[0]); + const uint32 g = REPEAT8(src_argb[1]); + const uint32 r = REPEAT8(src_argb[2]); + const uint32 a = REPEAT8(src_argb[3]); + dst_argb[0] = SHADE(b, b_scale); + dst_argb[1] = SHADE(g, g_scale); + dst_argb[2] = SHADE(r, r_scale); + dst_argb[3] = SHADE(a, a_scale); + src_argb += 4; + dst_argb += 4; + } +} +#undef REPEAT8 +#undef SHADE + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v * f >> 16 + +void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + const uint32 b = REPEAT8(src_argb0[0]); + const uint32 g = REPEAT8(src_argb0[1]); + const uint32 r = REPEAT8(src_argb0[2]); + const uint32 a = REPEAT8(src_argb0[3]); + const uint32 b_scale = src_argb1[0]; + const uint32 g_scale = src_argb1[1]; + const uint32 
r_scale = src_argb1[2]; + const uint32 a_scale = src_argb1[3]; + dst_argb[0] = SHADE(b, b_scale); + dst_argb[1] = SHADE(g, g_scale); + dst_argb[2] = SHADE(r, r_scale); + dst_argb[3] = SHADE(a, a_scale); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef REPEAT8 +#undef SHADE + +#define SHADE(f, v) clamp255(v + f) + +void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + const int b = src_argb0[0]; + const int g = src_argb0[1]; + const int r = src_argb0[2]; + const int a = src_argb0[3]; + const int b_add = src_argb1[0]; + const int g_add = src_argb1[1]; + const int r_add = src_argb1[2]; + const int a_add = src_argb1[3]; + dst_argb[0] = SHADE(b, b_add); + dst_argb[1] = SHADE(g, g_add); + dst_argb[2] = SHADE(r, r_add); + dst_argb[3] = SHADE(a, a_add); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef SHADE + +#define SHADE(f, v) clamp0(f - v) + +void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + const int b = src_argb0[0]; + const int g = src_argb0[1]; + const int r = src_argb0[2]; + const int a = src_argb0[3]; + const int b_sub = src_argb1[0]; + const int g_sub = src_argb1[1]; + const int r_sub = src_argb1[2]; + const int a_sub = src_argb1[3]; + dst_argb[0] = SHADE(b, b_sub); + dst_argb[1] = SHADE(g, g_sub); + dst_argb[2] = SHADE(r, r_sub); + dst_argb[3] = SHADE(a, a_sub); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef SHADE + +// Sobel functions which mimic the SSSE3 versions. +void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, + uint8* dst_sobelx, int width) { + int i; + for (i = 0; i < width; ++i) { + int a = src_y0[i]; + int b = src_y1[i]; + int c = src_y2[i]; + int a_sub = src_y0[i + 2]; + int b_sub = src_y1[i + 2]; + int c_sub = src_y2[i + 2]; + int a_diff = a - a_sub; + int b_diff = b - b_sub; + int c_diff = c - c_sub; + int sobel = Abs(a_diff + b_diff * 2 + c_diff); + dst_sobelx[i] = (uint8)(clamp255(sobel)); + } +} + +void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + int i; + for (i = 0; i < width; ++i) { + int a = src_y0[i + 0]; + int b = src_y0[i + 1]; + int c = src_y0[i + 2]; + int a_sub = src_y1[i + 0]; + int b_sub = src_y1[i + 1]; + int c_sub = src_y1[i + 2]; + int a_diff = a - a_sub; + int b_diff = b - b_sub; + int c_diff = c - c_sub; + int sobel = Abs(a_diff + b_diff * 2 + c_diff); + dst_sobely[i] = (uint8)(clamp255(sobel)); + } +} + +void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int s = clamp255(r + b); + dst_argb[0] = (uint8)(s); + dst_argb[1] = (uint8)(s); + dst_argb[2] = (uint8)(s); + dst_argb[3] = (uint8)(255u); + dst_argb += 4; + } +} + +void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int s = clamp255(r + b); + dst_y[i] = (uint8)(s); + } +} + +void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int g = clamp255(r + b); + dst_argb[0] = (uint8)(b); + dst_argb[1] = (uint8)(g); + dst_argb[2] = (uint8)(r); + dst_argb[3] = (uint8)(255u); + dst_argb += 4; + }
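// The Sobel rows above split the operator into two passes: SobelXRow_C
// differences columns i and i+2 across three rows, SobelYRow_C differences
// two adjacent rows across columns i..i+2, and SobelRow_C approximates the
// gradient magnitude as clamp255(x + y) instead of a square root. The same
// arithmetic for a single output pixel (sketch only; r0..r2 are assumed to
// be the three input rows):

#include <stdint.h>
#include <stdlib.h>

static uint8_t SobelPixelSketch(const uint8_t* r0, const uint8_t* r1,
                                const uint8_t* r2, int i) {
  int gx = (r0[i] - r0[i + 2]) + 2 * (r1[i] - r1[i + 2]) +
           (r2[i] - r2[i + 2]);                        // as SobelXRow_C
  int gy = (r0[i] - r1[i]) + 2 * (r0[i + 1] - r1[i + 1]) +
           (r0[i + 2] - r1[i + 2]);                    // as SobelYRow_C
  int cx = abs(gx); if (cx > 255) cx = 255;            // SobelXRow_C output
  int cy = abs(gy); if (cy > 255) cy = 255;            // SobelYRow_C output
  int mag = cx + cy;                                   // SobelRow_C combine
  return (uint8_t)(mag > 255 ? 255 : mag);
}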
+} + +void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { + // Copy a Y to RGB. + int x; + for (x = 0; x < width; ++x) { + uint8 y = src_y[0]; + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = 255u; + dst_argb += 4; + ++src_y; + } +} + +// BT.601 YUV to RGB reference +// R = (Y - 16) * 1.164 - V * -1.596 +// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 +// B = (Y - 16) * 1.164 - U * -2.018 + +// Y contribution to R,G,B. Scale and bias. +// TODO(fbarchard): Consider moving constants into a common header. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +// U and V contributions to R,G,B. +#define UB -128 /* max(-128, round(-2.018 * 64)) */ +#define UG 25 /* round(0.391 * 64) */ +#define VG 52 /* round(0.813 * 64) */ +#define VR -102 /* round(-1.596 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. +#define BB (UB * 128 + YGB) +#define BG (UG * 128 + VG * 128 + YGB) +#define BR (VR * 128 + YGB) + +// C reference code that mimics the YUV assembly. +static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, + uint8* b, uint8* g, uint8* r) { + uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; + *b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6); + *g = Clamp((int32)(-(v * VG + u * UG) + y1 + BG) >> 6); + *r = Clamp((int32)(-(v * VR)+ y1 + BR) >> 6); +} + +// C reference code that mimics the YUV assembly. +static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) { + uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; + *b = Clamp((int32)(y1 + YGB) >> 6); + *g = Clamp((int32)(y1 + YGB) >> 6); + *r = Clamp((int32)(y1 + YGB) >> 6); +} + +#undef YG +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef BB +#undef BG +#undef BR + +// JPEG YUV to RGB reference +// * R = Y - V * -1.40200 +// * G = Y - U * 0.34414 - V * 0.71414 +// * B = Y - U * -1.77200 + +// Y contribution to R,G,B. Scale and bias. +// TODO(fbarchard): Consider moving constants into a common header. +#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ +#define YGBJ 32 /* 64 / 2 */ + +// U and V contributions to R,G,B. +#define UBJ -113 /* round(-1.77200 * 64) */ +#define UGJ 22 /* round(0.34414 * 64) */ +#define VGJ 46 /* round(0.71414 * 64) */ +#define VRJ -90 /* round(-1.40200 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. +#define BBJ (UBJ * 128 + YGBJ) +#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ) +#define BRJ (VRJ * 128 + YGBJ) + +// C reference code that mimics the YUV assembly. +static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v, + uint8* b, uint8* g, uint8* r) { + uint32 y1 = (uint32)(y * 0x0101 * YGJ) >> 16; + *b = Clamp((int32)(-(u * UBJ) + y1 + BBJ) >> 6); + *g = Clamp((int32)(-(v * VGJ + u * UGJ) + y1 + BGJ) >> 6); + *r = Clamp((int32)(-(v * VRJ) + y1 + BRJ) >> 6); +} + +#undef YGJ +#undef YGBJ +#undef UBJ +#undef UGJ +#undef VGJ +#undef VRJ +#undef BBJ +#undef BGJ +#undef BRJ + +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) +// C mimic assembly. +// TODO(fbarchard): Remove subsampling from Neon. 
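// A floating-point reference for the fixed-point YuvPixel() above, using
// the same BT.601 coefficients quoted in the comments (1.164, 1.596, 0.391,
// 0.813, 2.018). It should agree with the >> 6 fixed-point path to within a
// few LSBs; blue can differ slightly more because UB is clipped to fit a
// signed byte, per the max(-128, ...) comment above. Sketch only, not a
// libyuv API:

#include <stdint.h>

static uint8_t ClampToByte(float v) {
  return (uint8_t)(v < 0.0f ? 0 : (v > 255.0f ? 255 : v + 0.5f));
}

static void YuvPixelFloat(uint8_t y, uint8_t u, uint8_t v,
                          uint8_t* b, uint8_t* g, uint8_t* r) {
  float yf = 1.164f * ((int)y - 16);
  *b = ClampToByte(yf + 2.018f * ((int)u - 128));
  *g = ClampToByte(yf - 0.391f * ((int)u - 128) - 0.813f * ((int)v - 128));
  *r = ClampToByte(yf + 1.596f * ((int)v - 128));
}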
+void I444ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 u = (src_u[0] + src_u[1] + 1) >> 1; + uint8 v = (src_v[0] + src_v[1] + 1) >> 1; + YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + src_u += 2; + src_v += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + } +} +#else +void I444ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} +#endif + +// Also used for 420 +void I422ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void J422ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvJPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvJPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvJPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void I422ToRGB24Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 3, rgb_buf + 4, rgb_buf + 5); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + } +} + +void I422ToRAWRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 5, rgb_buf + 4, rgb_buf + 3); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 6; // Advance 2 pixels. 
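// All of the I422 writers above and below follow one pattern: a single U/V
// pair is shared by two Y samples, so the chroma pointers advance at half
// the luma rate and an odd final pixel reuses the last chroma sample. The
// pattern in isolation (sketch; relies on the YuvPixel() helper and uint8
// typedef defined earlier in this file):

static void I422PairToRGB24Sketch(const uint8* y, const uint8* u,
                                  const uint8* v, uint8* rgb) {
  // Both pixels of the pair take their color from u[0]/v[0]; the
  // destination layout is B,G,R per pixel, as in I422ToRGB24Row_C.
  YuvPixel(y[0], u[0], v[0], rgb + 0, rgb + 1, rgb + 2);
  YuvPixel(y[1], u[0], v[0], rgb + 3, rgb + 4, rgb + 5);
}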
+ } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + } +} + +void I422ToARGB4444Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1); + b0 = b0 >> 4; + g0 = g0 >> 4; + r0 = r0 >> 4; + b1 = b1 >> 4; + g1 = g1 >> 4; + r1 = r1 >> 4; + *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | + (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000; + src_y += 2; + src_u += 1; + src_v += 1; + dst_argb4444 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + b0 = b0 >> 4; + g0 = g0 >> 4; + r0 = r0 >> 4; + *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | + 0xf000; + } +} + +void I422ToARGB1555Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 3; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 3; + r1 = r1 >> 3; + *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | + (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000; + src_y += 2; + src_u += 1; + src_v += 1; + dst_argb1555 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 3; + r0 = r0 >> 3; + *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | + 0x8000; + } +} + +void I422ToRGB565Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27); + src_y += 2; + src_u += 1; + src_v += 1; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void I411ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 3; x += 4) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + YuvPixel(src_y[2], src_u[0], src_v[0], + rgb_buf + 8, rgb_buf + 9, rgb_buf + 10); + rgb_buf[11] = 255; + YuvPixel(src_y[3], src_u[0], src_v[0], + rgb_buf + 12, rgb_buf + 13, rgb_buf + 14); + rgb_buf[15] = 255; + src_y += 4; + src_u += 1; + src_v += 1; + rgb_buf += 16; // Advance 4 pixels. 
+ } + if (width & 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void NV12ToARGBRow_C(const uint8* src_y, + const uint8* src_uv, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_uv[0], src_uv[1], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + src_uv += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void NV21ToARGBRow_C(const uint8* src_y, + const uint8* src_vu, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + + YuvPixel(src_y[1], src_vu[1], src_vu[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + + src_y += 2; + src_vu += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void NV12ToRGB565Row_C(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0); + YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27); + src_y += 2; + src_uv += 2; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void NV21ToRGB565Row_C(const uint8* src_y, + const uint8* vsrc_u, + uint8* dst_rgb565, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0); + YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27); + src_y += 2; + vsrc_u += 2; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void YUY2ToARGBRow_C(const uint8* src_yuy2, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_yuy2 += 4; + rgb_buf += 8; // Advance 2 pixels. 
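// The formats handled above differ only in where each sample lives: NV12
// interleaves chroma as U,V,U,V while NV21 stores V,U,V,U, and the packed
// YUY2/UYVY formats carry luma and chroma in one plane. Tiny accessors that
// spell out the byte offsets (illustrative only, not libyuv APIs):

#include <stdint.h>

// Semi-planar chroma: one U/V (NV12) or V/U (NV21) pair per 2x2 luma block.
static uint8_t Nv12U(const uint8_t* uv, int pair) { return uv[pair * 2 + 0]; }
static uint8_t Nv12V(const uint8_t* uv, int pair) { return uv[pair * 2 + 1]; }

// Packed 4:2:2: YUY2 is Y0,U,Y1,V per pixel pair; UYVY is U,Y0,V,Y1.
static uint8_t Yuy2Y(const uint8_t* p, int x) { return p[(x >> 1) * 4 + (x & 1) * 2]; }
static uint8_t Yuy2U(const uint8_t* p, int x) { return p[(x >> 1) * 4 + 1]; }
static uint8_t Yuy2V(const uint8_t* p, int x) { return p[(x >> 1) * 4 + 3]; }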
+ } + if (width & 1) { + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void UYVYToARGBRow_C(const uint8* src_uyvy, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_uyvy += 4; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void I422ToBGRARow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 3, rgb_buf + 2, rgb_buf + 1); + rgb_buf[0] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 7, rgb_buf + 6, rgb_buf + 5); + rgb_buf[4] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 3, rgb_buf + 2, rgb_buf + 1); + rgb_buf[0] = 255; + } +} + +void I422ToABGRRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 6, rgb_buf + 5, rgb_buf + 4); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + rgb_buf[3] = 255; + } +} + +void I422ToRGBARow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 1, rgb_buf + 2, rgb_buf + 3); + rgb_buf[0] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 5, rgb_buf + 6, rgb_buf + 7); + rgb_buf[4] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 1, rgb_buf + 2, rgb_buf + 3); + rgb_buf[0] = 255; + } +} + +void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void MirrorRow_C(const uint8* src, uint8* dst, int width) { + int x; + src += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst[x] = src[0]; + dst[x + 1] = src[-1]; + src -= 2; + } + if (width & 1) { + dst[width - 1] = src[0]; + } +} + +void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { + int x; + src_uv += (width - 1) << 1; + for (x = 0; x < width - 1; x += 2) { + dst_u[x] = src_uv[0]; + dst_u[x + 1] = src_uv[-2]; + dst_v[x] = src_uv[1]; + dst_v[x + 1] = src_uv[-2 + 1]; + src_uv -= 4; + } + if (width & 1) { + dst_u[width - 1] = src_uv[0]; + dst_v[width - 1] = src_uv[1]; + } +} + +void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) { + int x; + const uint32* src32 = (const uint32*)(src); + uint32* dst32 = (uint32*)(dst); + src32 += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst32[x] = src32[0]; + dst32[x + 1] = src32[-1]; + src32 -= 2; + } + if (width & 1) { + dst32[width - 1] = src32[0]; + } +} + +void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_u[x] = src_uv[0]; + dst_u[x + 1] = src_uv[2]; + dst_v[x] = src_uv[1]; + dst_v[x + 1] = src_uv[3]; + src_uv += 4; + } + if (width & 1) { + dst_u[width - 1] = src_uv[0]; + dst_v[width - 1] = src_uv[1]; + } +} + +void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = src_u[x]; + dst_uv[1] = src_v[x]; + dst_uv[2] = src_u[x + 1]; + dst_uv[3] = src_v[x + 1]; + dst_uv += 4; + } + if (width & 1) { + dst_uv[0] = src_u[width - 1]; + dst_uv[1] = src_v[width - 1]; + } +} + +void CopyRow_C(const uint8* src, uint8* dst, int count) { + memcpy(dst, src, count); +} + +void CopyRow_16_C(const uint16* src, uint16* dst, int count) { + memcpy(dst, src, count * 2); +} + +void SetRow_C(uint8* dst, uint8 v8, int width) { + memset(dst, v8, width); +} + +void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) { + uint32* d = (uint32*)(dst_argb); + int x; + for (x = 0; x < width; ++x) { + d[x] = v32; + } +} + +// Filter 2 rows of YUY2 UV's (422) into U and V (420). +void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values, filtering 2 rows of YUY2. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; + dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of YUY2 UV's (422) into U and V (422). +void YUY2ToUV422Row_C(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = src_yuy2[1]; + dst_v[0] = src_yuy2[3]; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of YUY2 Y's (422) into Y (420/422). +void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { + // Output a row of Y values. + int x; + for (x = 0; x < width - 1; x += 2) { + dst_y[x] = src_yuy2[0]; + dst_y[x + 1] = src_yuy2[2]; + src_yuy2 += 4; + } + if (width & 1) { + dst_y[width - 1] = src_yuy2[0]; + } +} + +// Filter 2 rows of UYVY UV's (422) into U and V (420). +void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values. 
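// YUY2ToUVRow_C above (and UYVYToUVRow_C below) collapse 4:2:2 chroma to
// 4:2:0 by averaging the two source rows with (a + b + 1) >> 1; the +1
// makes the filter round to nearest instead of truncating toward zero.
// The helper spelled out (sketch, not a libyuv API):

#include <stdint.h>

static uint8_t AvgRound(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);  // AvgRound(1, 2) == 2, not 1.
}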
+ int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1; + dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of UYVY UV's (422) into U and V (422). +void UYVYToUV422Row_C(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = src_uyvy[0]; + dst_v[0] = src_uyvy[2]; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of UYVY Y's (422) into Y (420/422). +void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) { + // Output a row of Y values. + int x; + for (x = 0; x < width - 1; x += 2) { + dst_y[x] = src_uyvy[1]; + dst_y[x + 1] = src_uyvy[3]; + src_uyvy += 4; + } + if (width & 1) { + dst_y[width - 1] = src_uyvy[1]; + } +} + +#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f + +// Blend src_argb0 over src_argb1 and store to dst_argb. +// dst_argb may be src_argb0 or src_argb1. +// This code mimics the SSSE3 version for better testability. +void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint32 fb = src_argb0[0]; + uint32 fg = src_argb0[1]; + uint32 fr = src_argb0[2]; + uint32 a = src_argb0[3]; + uint32 bb = src_argb1[0]; + uint32 bg = src_argb1[1]; + uint32 br = src_argb1[2]; + dst_argb[0] = BLEND(fb, bb, a); + dst_argb[1] = BLEND(fg, bg, a); + dst_argb[2] = BLEND(fr, br, a); + dst_argb[3] = 255u; + + fb = src_argb0[4 + 0]; + fg = src_argb0[4 + 1]; + fr = src_argb0[4 + 2]; + a = src_argb0[4 + 3]; + bb = src_argb1[4 + 0]; + bg = src_argb1[4 + 1]; + br = src_argb1[4 + 2]; + dst_argb[4 + 0] = BLEND(fb, bb, a); + dst_argb[4 + 1] = BLEND(fg, bg, a); + dst_argb[4 + 2] = BLEND(fr, br, a); + dst_argb[4 + 3] = 255u; + src_argb0 += 8; + src_argb1 += 8; + dst_argb += 8; + } + + if (width & 1) { + uint32 fb = src_argb0[0]; + uint32 fg = src_argb0[1]; + uint32 fr = src_argb0[2]; + uint32 a = src_argb0[3]; + uint32 bb = src_argb1[0]; + uint32 bg = src_argb1[1]; + uint32 br = src_argb1[2]; + dst_argb[0] = BLEND(fb, bb, a); + dst_argb[1] = BLEND(fg, bg, a); + dst_argb[2] = BLEND(fr, br, a); + dst_argb[3] = 255u; + } +} +#undef BLEND +#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 + +// Multiply source RGB by alpha and store to destination. +// This code mimics the SSSE3 version for better testability. +void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + uint32 b = src_argb[0]; + uint32 g = src_argb[1]; + uint32 r = src_argb[2]; + uint32 a = src_argb[3]; + dst_argb[0] = ATTENUATE(b, a); + dst_argb[1] = ATTENUATE(g, a); + dst_argb[2] = ATTENUATE(r, a); + dst_argb[3] = a; + b = src_argb[4]; + g = src_argb[5]; + r = src_argb[6]; + a = src_argb[7]; + dst_argb[4] = ATTENUATE(b, a); + dst_argb[5] = ATTENUATE(g, a); + dst_argb[6] = ATTENUATE(r, a); + dst_argb[7] = a; + src_argb += 8; + dst_argb += 8; + } + + if (width & 1) { + const uint32 b = src_argb[0]; + const uint32 g = src_argb[1]; + const uint32 r = src_argb[2]; + const uint32 a = src_argb[3]; + dst_argb[0] = ATTENUATE(b, a); + dst_argb[1] = ATTENUATE(g, a); + dst_argb[2] = ATTENUATE(r, a); + dst_argb[3] = a; + } +} +#undef ATTENUATE + +// Divide source RGB by alpha and store to destination. 
+// b = (b * 255 + (a / 2)) / a; +// g = (g * 255 + (a / 2)) / a; +// r = (r * 255 + (a / 2)) / a; +// Reciprocal method is off by 1 on some values. ie 125 +// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. +#define T(a) 0x01000000 + (0x10000 / a) +const uint32 fixed_invtbl8[256] = { + 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), + T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), + T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), + T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f), + T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), + T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), + T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f), + T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47), + T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f), + T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57), + T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), + T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), + T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77), + T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f), + T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87), + T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f), + T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), + T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), + T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf), + T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7), + T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf), + T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7), + T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), + T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), + T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7), + T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef), + T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7), + T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 }; +#undef T + +void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + uint32 b = src_argb[0]; + uint32 g = src_argb[1]; + uint32 r = src_argb[2]; + const uint32 a = src_argb[3]; + const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point + b = (b * ia) >> 8; + g = (g * ia) >> 8; + r = (r * ia) >> 8; + // Clamping should not be necessary but is free in assembly. 
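// fixed_invtbl8 above stores 0x10000 / a in its low 16 bits, so dividing a
// premultiplied channel by alpha becomes a multiply and a shift:
// (c * (0x10000 / a)) >> 8 == c * 256 / a. One channel of that in isolation
// (sketch only; note the table's a == 0 entry has zero low bits, so a fully
// transparent pixel unattenuates to 0):

#include <stdint.h>

static uint8_t UnattenuateSketch(uint8_t c, uint8_t a,
                                 const uint32_t* invtbl) {
  uint32_t ia = invtbl[a] & 0xffff;  // 8.8 fixed-point reciprocal of a.
  uint32_t v = (c * ia) >> 8;        // approximately c * 256 / a.
  return (uint8_t)(v > 255 ? 255 : v);
}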
+ dst_argb[0] = clamp255(b); + dst_argb[1] = clamp255(g); + dst_argb[2] = clamp255(r); + dst_argb[3] = a; + src_argb += 4; + dst_argb += 4; + } +} + +void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) { + int32 row_sum[4] = {0, 0, 0, 0}; + int x; + for (x = 0; x < width; ++x) { + row_sum[0] += row[x * 4 + 0]; + row_sum[1] += row[x * 4 + 1]; + row_sum[2] += row[x * 4 + 2]; + row_sum[3] += row[x * 4 + 3]; + cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; + cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; + cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; + cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; + } +} + +void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl, + int w, int area, uint8* dst, int count) { + float ooa = 1.0f / area; + int i; + for (i = 0; i < count; ++i) { + dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); + dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); + dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); + dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); + dst += 4; + tl += 4; + bl += 4; + } +} + +// Copy pixels from rotated source to destination row with a slope. +LIBYUV_API +void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width) { + int i; + // Render a row of pixels from source into a buffer. + float uv[2]; + uv[0] = uv_dudv[0]; + uv[1] = uv_dudv[1]; + for (i = 0; i < width; ++i) { + int x = (int)(uv[0]); + int y = (int)(uv[1]); + *(uint32*)(dst_argb) = + *(const uint32*)(src_argb + y * src_argb_stride + + x * 4); + dst_argb += 4; + uv[0] += uv_dudv[2]; + uv[1] += uv_dudv[3]; + } +} + +// Blend 2 rows into 1. +static void HalfRow_C(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + int x; + for (x = 0; x < pix; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + +static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride, + uint16* dst_uv, int pix) { + int x; + for (x = 0; x < pix; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + +// C version 2x2 -> 2x1. 
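// ComputeCumulativeSumRow_C / CumulativeSumToAverageRow_C above form a
// summed-area table: each entry holds the sum of all pixels above and to
// the left, so the total of any rectangle costs four reads no matter how
// large the box is. The four-corner lookup those routines rely on, for one
// channel (sketch; tl/bl point into the top and bottom table rows and w is
// the box width in table elements, i.e. pixels * 4):

#include <stdint.h>

static int32_t BoxSumSketch(const int32_t* tl, const int32_t* bl, int w) {
  return bl[w] + tl[0] - bl[0] - tl[w];
}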
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, + int width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + int x; + if (source_y_fraction == 0) { + memcpy(dst_ptr, src_ptr, width); + return; + } + if (source_y_fraction == 128) { + HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width); + return; + } + for (x = 0; x < width - 1; x += 2) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + src_ptr += 2; + src_ptr1 += 2; + dst_ptr += 2; + } + if (width & 1) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + } +} + +void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, + ptrdiff_t src_stride, + int width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16* src_ptr1 = src_ptr + src_stride; + int x; + if (source_y_fraction == 0) { + memcpy(dst_ptr, src_ptr, width * 2); + return; + } + if (source_y_fraction == 128) { + HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width); + return; + } + for (x = 0; x < width - 1; x += 2) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + src_ptr += 2; + src_ptr1 += 2; + dst_ptr += 2; + } + if (width & 1) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + } +} + +// Use first 4 shuffler values to reorder ARGB channels. +void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + int index0 = shuffler[0]; + int index1 = shuffler[1]; + int index2 = shuffler[2]; + int index3 = shuffler[3]; + // Shuffle a row of ARGB. + int x; + for (x = 0; x < pix; ++x) { + // To support in-place conversion. + uint8 b = src_argb[index0]; + uint8 g = src_argb[index1]; + uint8 r = src_argb[index2]; + uint8 a = src_argb[index3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + src_argb += 4; + dst_argb += 4; + } +} + +void I422ToYUY2Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = src_y[1]; + dst_frame[3] = src_v[0]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = 0; + dst_frame[3] = src_v[0]; + } +} + +void I422ToUYVYRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = src_y[1]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = 0; + } +} + +// Maximum temporary width for wrappers to process at a time, in pixels. +#define MAXTWIDTH 2048 + +#if !(defined(_MSC_VER) && !defined(__clang__)) && \ + defined(HAS_I422TORGB565ROW_SSSE3) +// row_win.cc has asm version, but GCC uses 2 step wrapper. 
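// InterpolateRow_C above blends two rows with an 8-bit fraction:
// out = (src0 * (256 - f) + src1 * f) >> 8, short-circuiting f == 0 to a
// copy and f == 128 to the rounded HalfRow_C. The per-sample arithmetic
// (sketch; f is assumed to be in 0..256):

#include <stdint.h>

static uint8_t LerpU8(uint8_t a, uint8_t b, int f) {
  return (uint8_t)((a * (256 - f) + b * f) >> 8);
}
// e.g. LerpU8(0, 255, 64) == 63: one quarter of the way from a to b.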
+void I422ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB1555ROW_SSSE3) +void I422ToARGB1555Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth); + ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb1555 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB4444ROW_SSSE3) +void I422ToARGB4444Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth); + ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb4444 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB565ROW_SSSE3) +void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv, + uint8* dst_rgb565, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_SSSE3(src_y, src_uv, row, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB565ROW_SSSE3) +void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu, + uint8* dst_rgb565, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_SSSE3(src_y, src_vu, row, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_vu += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_YUY2TOARGBROW_SSSE3) +void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) { + // Row buffers for intermediate YUV pixels. + SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]); + SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]); + SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, twidth); + YUY2ToYRow_SSE2(src_yuy2, row_y, twidth); + I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth); + src_yuy2 += twidth * 2; + dst_argb += twidth * 4; + width -= twidth; + } +} +#endif + +#if defined(HAS_UYVYTOARGBROW_SSSE3) +void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) { + // Row buffers for intermediate YUV pixels. 
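// Each wrapper above has the same skeleton: run the fast YUV-to-ARGB kernel
// into an aligned scratch row of at most MAXTWIDTH pixels, run a second
// kernel from ARGB to the destination format, and loop until the row is
// consumed. Generalized with hypothetical function-pointer steps
// (illustrative sketch; the real wrappers also step chroma at half rate):

#include <stdint.h>

enum { kTWidth = 2048 };  // mirrors MAXTWIDTH above

typedef void (*ToArgbFn)(const uint8_t* src, uint8_t* argb, int width);
typedef void (*FromArgbFn)(const uint8_t* argb, uint8_t* dst, int width);

static void TwoStepRowSketch(ToArgbFn first, FromArgbFn second,
                             const uint8_t* src, int src_bpp,
                             uint8_t* dst, int dst_bpp, int width) {
  uint8_t row[kTWidth * 4];  // intermediate ARGB scratch
  while (width > 0) {
    int twidth = width > kTWidth ? kTWidth : width;
    first(src, row, twidth);
    second(row, dst, twidth);
    src += twidth * src_bpp;
    dst += twidth * dst_bpp;
    width -= twidth;
  }
}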
+ SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]); + SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]); + SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, twidth); + UYVYToYRow_SSE2(src_uyvy, row_y, twidth); + I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth); + src_uyvy += twidth * 2; + dst_argb += twidth * 4; + width -= twidth; + } +} +#endif // !defined(LIBYUV_DISABLE_X86) + +#if defined(HAS_I422TORGB565ROW_AVX2) +void I422ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB1555ROW_AVX2) +void I422ToARGB1555Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb1555 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB4444ROW_AVX2) +void I422ToARGB4444Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb4444 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TORGB24ROW_AVX2) +void I422ToRGB24Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + // TODO(fbarchard): ARGBToRGB24Row_AVX2 + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TORAWROW_AVX2) +void I422ToRAWRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + // TODO(fbarchard): ARGBToRAWRow_AVX2 + ARGBToRAWRow_SSSE3(row, dst_raw, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_raw += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB565ROW_AVX2) +void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv, + uint8* dst_rgb565, int width) { + // Row buffer for intermediate ARGB pixels. 
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_AVX2(src_y, src_uv, row, twidth); + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB565ROW_AVX2) +void NV21ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_vu, + uint8* dst_rgb565, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_AVX2(src_y, src_vu, row, twidth); + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); + src_y += twidth; + src_vu += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_YUY2TOARGBROW_AVX2) +void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, int width) { + // Row buffers for intermediate YUV pixels. + SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]); + SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]); + SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + YUY2ToUV422Row_AVX2(src_yuy2, row_u, row_v, twidth); + YUY2ToYRow_AVX2(src_yuy2, row_y, twidth); + I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth); + src_yuy2 += twidth * 2; + dst_argb += twidth * 4; + width -= twidth; + } +} +#endif + +#if defined(HAS_UYVYTOARGBROW_AVX2) +void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width) { + // Row buffers for intermediate YUV pixels. + SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]); + SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]); + SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + UYVYToUV422Row_AVX2(src_uyvy, row_u, row_v, twidth); + UYVYToYRow_AVX2(src_uyvy, row_y, twidth); + I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth); + src_uyvy += twidth * 2; + dst_argb += twidth * 4; + width -= twidth; + } +} +#endif // !defined(LIBYUV_DISABLE_X86) + +void ARGBPolynomialRow_C(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + int i; + for (i = 0; i < width; ++i) { + float b = (float)(src_argb[0]); + float g = (float)(src_argb[1]); + float r = (float)(src_argb[2]); + float a = (float)(src_argb[3]); + float b2 = b * b; + float g2 = g * g; + float r2 = r * r; + float a2 = a * a; + float db = poly[0] + poly[4] * b; + float dg = poly[1] + poly[5] * g; + float dr = poly[2] + poly[6] * r; + float da = poly[3] + poly[7] * a; + float b3 = b2 * b; + float g3 = g2 * g; + float r3 = r2 * r; + float a3 = a2 * a; + db += poly[8] * b2; + dg += poly[9] * g2; + dr += poly[10] * r2; + da += poly[11] * a2; + db += poly[12] * b3; + dg += poly[13] * g3; + dr += poly[14] * r3; + da += poly[15] * a3; + + dst_argb[0] = Clamp((int32)(db)); + dst_argb[1] = Clamp((int32)(dg)); + dst_argb[2] = Clamp((int32)(dr)); + dst_argb[3] = Clamp((int32)(da)); + src_argb += 4; + dst_argb += 4; + } +} + +void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, + const uint8* luma, uint32 lumacoeff) { + uint32 bc = lumacoeff & 0xff; + uint32 gc = (lumacoeff >> 8) & 0xff; + uint32 rc = (lumacoeff >> 16) & 0xff; + + int i; + for (i = 0; i < width - 1; i += 2) { + // Luminance in rows, color values in columns. 
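+    // The luma table holds 128 rows of 256 bytes: the weighted B,G,R sum,
+    // masked to bits 8..14 (0x7F00), selects a row, and each of B, G and R
+    // is then remapped through that row.  Alpha is copied through unchanged.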
+    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
+                           src_argb[2] * rc) & 0x7F00u) + luma;
+    const uint8* luma1;
+    dst_argb[0] = luma0[src_argb[0]];
+    dst_argb[1] = luma0[src_argb[1]];
+    dst_argb[2] = luma0[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+    luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
+              src_argb[6] * rc) & 0x7F00u) + luma;
+    dst_argb[4] = luma1[src_argb[4]];
+    dst_argb[5] = luma1[src_argb[5]];
+    dst_argb[6] = luma1[src_argb[6]];
+    dst_argb[7] = src_argb[7];
+    src_argb += 8;
+    dst_argb += 8;
+  }
+  if (width & 1) {
+    // Luminance in rows, color values in columns.
+    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
+                           src_argb[2] * rc) & 0x7F00u) + luma;
+    dst_argb[0] = luma0[src_argb[0]];
+    dst_argb[1] = luma0[src_argb[1]];
+    dst_argb[2] = luma0[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+  }
+}
+
+void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    dst[3] = src[3];
+    dst[7] = src[7];
+    dst += 8;
+    src += 8;
+  }
+  if (width & 1) {
+    dst[3] = src[3];
+  }
+}
+
+void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    dst[3] = src[0];
+    dst[7] = src[1];
+    dst += 8;
+    src += 2;
+  }
+  if (width & 1) {
+    dst[3] = src[0];
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/row_gcc.cc b/libs/libaom/src/third_party/libyuv/source/row_gcc.cc
new file mode 100644
index 000000000..820de0a1c
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/row_gcc.cc
@@ -0,0 +1,5475 @@
+// VERSION 2
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+// Constants for ARGB
+static vec8 kARGBToY = {
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
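In scalar terms, the two Y conversions driven by these tables work out as
below (a sketch for reference; the helper names are illustrative, not libyuv
APIs; the SIMD kernels later in this file compute the same values 16 pixels
at a time with pmaddubsw/phaddw/psrlw):

// Limited-range BT.601 Y: weights sum to 111/128, then a +16 bias (kAddY16).
static uint8 ScalarARGBToY(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}

// JPEG full-range Y (kARGBToYJ + kAddYJ64): weights sum to 128/128, with a
// rounding bias of 64 (0.5 in 7-bit fixed point) and no +16 offset.
static uint8 ScalarARGBToYJ(uint8 b, uint8 g, uint8 r) {
  return (uint8)((15 * b + 75 * g + 38 * r + 64) >> 7);
}

+// JPEG full range.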
+static vec8 kARGBToYJ = { + 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 +}; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) + +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +static vec8 kARGBToU = { + 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 +}; + +static vec8 kARGBToUJ = { + 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 +}; + +static vec8 kARGBToV = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, +}; + +static vec8 kARGBToVJ = { + -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 +}; + +// Constants for BGRA +static vec8 kBGRAToY = { + 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 +}; + +static vec8 kBGRAToU = { + 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 +}; + +static vec8 kBGRAToV = { + 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 +}; + +// Constants for ABGR +static vec8 kABGRToY = { + 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 +}; + +static vec8 kABGRToU = { + -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 +}; + +static vec8 kABGRToV = { + 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 +}; + +// Constants for RGBA. +static vec8 kRGBAToY = { + 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 +}; + +static vec8 kRGBAToU = { + 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 +}; + +static vec8 kRGBAToV = { + 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 +}; + +static uvec8 kAddY16 = { + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u +}; + +// 7 bit fixed point 0.5. +static vec16 kAddYJ64 = { + 64, 64, 64, 64, 64, 64, 64, 64 +}; + +static uvec8 kAddUV128 = { + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; + +static uvec16 kAddUVJ128 = { + 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u +}; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +#ifdef HAS_RGB24TOARGBROW_SSSE3 + +// Shuffle table for converting RGB24 to ARGB. +static uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u +}; + +// Shuffle table for converting RAW to ARGB. +static uvec8 kShuffleMaskRAWToARGB = { + 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u +}; + +// Shuffle table for converting ARGB to RGB24. +static uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGB to RAW. +static uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 +static uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u +}; + +// Shuffle table for converting ARGB to RAW. 
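These masks (including the kShuffleMaskARGBToRAW_0 variant that follows) are
pshufb control vectors. A scalar model of pshufb's per-byte semantics, as a
reading aid (the helper name is illustrative, not a libyuv API):

// pshufb: a mask byte with the high bit set (here 128u) zeroes the output
// byte; otherwise its low four bits select a source byte.
static void ScalarPshufb(const uint8 mask[16], const uint8 in[16],
                         uint8 out[16]) {
  for (int i = 0; i < 16; ++i) {
    out[i] = (mask[i] & 0x80) ? 0 : in[mask[i] & 0x0f];
  }
}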
+static uvec8 kShuffleMaskARGBToRAW_0 = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u +}; +#endif // HAS_RGB24TOARGBROW_SSSE3 + +#if defined(TESTING) && defined(__x86_64__) +void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + asm volatile ( + ".p2align 5 \n" + "mov %%eax,%%eax \n" + "mov %%ebx,%%ebx \n" + "mov %%ecx,%%ecx \n" + "mov %%edx,%%edx \n" + "mov %%esi,%%esi \n" + "mov %%edi,%%edi \n" + "mov %%ebp,%%ebp \n" + "mov %%esp,%%esp \n" + ".p2align 5 \n" + "mov %%r8d,%%r8d \n" + "mov %%r9d,%%r9d \n" + "mov %%r10d,%%r10d \n" + "mov %%r11d,%%r11d \n" + "mov %%r12d,%%r12d \n" + "mov %%r13d,%%r13d \n" + "mov %%r14d,%%r14d \n" + "mov %%r15d,%%r15d \n" + ".p2align 5 \n" + "lea (%%rax),%%eax \n" + "lea (%%rbx),%%ebx \n" + "lea (%%rcx),%%ecx \n" + "lea (%%rdx),%%edx \n" + "lea (%%rsi),%%esi \n" + "lea (%%rdi),%%edi \n" + "lea (%%rbp),%%ebp \n" + "lea (%%rsp),%%esp \n" + ".p2align 5 \n" + "lea (%%r8),%%r8d \n" + "lea (%%r9),%%r9d \n" + "lea (%%r10),%%r10d \n" + "lea (%%r11),%%r11d \n" + "lea (%%r12),%%r12d \n" + "lea (%%r13),%%r13d \n" + "lea (%%r14),%%r14d \n" + "lea (%%r15),%%r15d \n" + + ".p2align 5 \n" + "lea 0x10(%%rax),%%eax \n" + "lea 0x10(%%rbx),%%ebx \n" + "lea 0x10(%%rcx),%%ecx \n" + "lea 0x10(%%rdx),%%edx \n" + "lea 0x10(%%rsi),%%esi \n" + "lea 0x10(%%rdi),%%edi \n" + "lea 0x10(%%rbp),%%ebp \n" + "lea 0x10(%%rsp),%%esp \n" + ".p2align 5 \n" + "lea 0x10(%%r8),%%r8d \n" + "lea 0x10(%%r9),%%r9d \n" + "lea 0x10(%%r10),%%r10d \n" + "lea 0x10(%%r11),%%r11d \n" + "lea 0x10(%%r12),%%r12d \n" + "lea 0x10(%%r13),%%r13d \n" + "lea 0x10(%%r14),%%r14d \n" + "lea 0x10(%%r15),%%r15d \n" + + ".p2align 5 \n" + "add 0x10,%%eax \n" + "add 0x10,%%ebx \n" + "add 0x10,%%ecx \n" + "add 0x10,%%edx \n" + "add 0x10,%%esi \n" + "add 0x10,%%edi \n" + "add 0x10,%%ebp \n" + "add 0x10,%%esp \n" + ".p2align 5 \n" + "add 0x10,%%r8d \n" + "add 0x10,%%r9d \n" + "add 0x10,%%r10d \n" + "add 0x10,%%r11d \n" + "add 0x10,%%r12d \n" + "add 0x10,%%r13d \n" + "add 0x10,%%r14d \n" + "add 0x10,%%r15d \n" + + ".p2align 2 \n" + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5" + ); +} +#endif // TESTING + +#ifdef HAS_J400TOARGBROW_SSE2 +void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + :: "memory", "cc", "xmm0", "xmm1", "xmm5" + ); +} +#endif // HAS_J400TOARGBROW_SSE2 + +#ifdef HAS_RGB24TOARGBROW_SSSE3 +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" + "lea " MEMLEA(0x30,0) ",%0 \n" + 
"movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskRGB24ToARGB) // %3 + : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" + "lea " MEMLEA(0x30,0) ",%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskRAWToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x20802080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xa,%%xmm4 \n" + "psrlw $0x5,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "pand %%xmm4,%%xmm0 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) + MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "eax", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x42004200,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "movdqa %%xmm3,%%xmm4 \n" + "psrlw 
$0x6,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psllw $0x1,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) + MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "eax", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "mov $0xf0f0f0f,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x4,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "pand %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "psllw $0x4,%%xmm1 \n" + "psrlw $0x4,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2) + MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "eax", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "movdqa %3,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x30,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "movdqa %3,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa 
%%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x30,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1b,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x5,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pslld $0xa,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "pslld $0xf,%%xmm7 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "psrad $0x10,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x6,%%xmm2 \n" + "psrld $0x9,%%xmm3 \n" + "pand %%xmm7,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm6,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + :: "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xc,%%xmm4 \n" + "movdqa %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm3 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm3,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "psrlq $0x4,%%xmm0 \n" + "psrlq $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" + ); +} +#endif // HAS_RGB24TOARGBROW_SSSE3 + +#ifdef HAS_ARGBTOYROW_SSSE3 +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. 
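+// pmaddubsw multiplies the unsigned ARGB bytes by the signed kARGBToY
+// weights and sums adjacent pairs (B*13 + G*65 and R*33 + A*0); phaddw
+// then completes the per-pixel dot product, psrlw $7 scales the result,
+// and paddb applies the +16 bias from kAddY16.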
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBTOYROW_SSSE3 + +#ifdef HAS_ARGBTOYJROW_SSSE3 +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. +// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBTOYJROW_SSSE3 + +#ifdef HAS_ARGBTOYROW_AVX2 +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = { + 0, 4, 1, 5, 2, 6, 3, 7 +}; + +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" + "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea " MEMLEA(0x80,0) ",%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. 
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} +#endif // HAS_ARGBTOYROW_AVX2 + +#ifdef HAS_ARGBTOYJROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" + "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea " MEMLEA(0x80,0) ",%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. + "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} +#endif // HAS_ARGBTOYJROW_AVX2 + +#ifdef HAS_ARGBTOUVROW_SSSE3 +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + 
"m"(kARGBToV), // %5 + "m"(kARGBToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBTOUVROW_SSSE3 + +#ifdef HAS_ARGBTOUVROW_AVX2 +// vpshufb for vphaddw + vpackuswb packed to shorts. +static const lvec8 kShufARGBToUV_AVX = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +}; +void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" + "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" + VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 + VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) + VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) + VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) + "lea " MEMLEA(0x80,0) ",%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUV128), // %5 + "m"(kARGBToV), // %6 + "m"(kARGBToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBTOUVROW_AVX2 + +#ifdef HAS_ARGBTOUVJROW_SSSE3 +void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 
\n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToVJ), // %5 + "m"(kARGBToUJ), // %6 + "m"(kAddUVJ128) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBTOUVJROW_SSSE3 + +#ifdef HAS_ARGBTOUV444ROW_SSSE3 +void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "m"(kARGBToV), // %4 + "m"(kARGBToU), // %5 + "m"(kAddUV128) // %6 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6" + ); +} +#endif // HAS_ARGBTOUV444ROW_SSSE3 + +#ifdef HAS_ARGBTOUV422ROW_SSSE3 +void ARGBToUV422Row_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 
\n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "m"(kARGBToV), // %4 + "m"(kARGBToU), // %5 + "m"(kAddUV128) // %6 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBTOUV422ROW_SSSE3 + +void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_bgra)), // %4 + "m"(kBGRAToV), // %5 + "m"(kBGRAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" + ); +} + +void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 
\n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kRGBAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kABGRToV), // %5 + "m"(kABGRToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" + ); +} + +void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) 
",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_rgba0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_rgba)), // %4 + "m"(kRGBAToV), // %5 + "m"(kRGBAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" + ); +} + +#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) + +struct YuvConstants { + lvec8 kUVToB; // 0 + lvec8 kUVToG; // 32 + lvec8 kUVToR; // 64 + lvec16 kUVBiasB; // 96 + lvec16 kUVBiasG; // 128 + lvec16 kUVBiasR; // 160 + lvec16 kYToRgb; // 192 +}; + +// BT.601 YUV to RGB reference +// R = (Y - 16) * 1.164 - V * -1.596 +// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 +// B = (Y - 16) * 1.164 - U * -2.018 + +// Y contribution to R,G,B. Scale and bias. +// TODO(fbarchard): Consider moving constants into a common header. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +// U and V contributions to R,G,B. +#define UB -128 /* max(-128, round(-2.018 * 64)) */ +#define UG 25 /* round(0.391 * 64) */ +#define VG 52 /* round(0.813 * 64) */ +#define VR -102 /* round(-1.596 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. +#define BB (UB * 128 + YGB) +#define BG (UG * 128 + VG * 128 + YGB) +#define BR (VR * 128 + YGB) + +// BT601 constants for YUV to RGB. +static YuvConstants SIMD_ALIGNED(kYuvConstants) = { + { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, + { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, + { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, + { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, + { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, + { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, + { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } +}; + +// BT601 constants for NV21 where chroma plane is VU instead of UV. 
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// JPEG YUV to RGB reference
+// * R = Y                - V * -1.40200
+// * G = Y - U *  0.34414 - V *  0.71414
+// * B = Y - U * -1.77200
+
+// Y contribution to R,G,B.  Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32   /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22   /* round(0.34414 * 64) */
+#define VGJ 46   /* round(0.71414 * 64) */
+#define VRJ -90  /* round(-1.40200 * 64) */
+
+// Bias values: round, and subtract 128 from U and V (full-range Y has no
+// 16 offset here).
+#define BBJ (UBJ * 128 + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ (VRJ * 128 + YGBJ)
+
+// JPEG constants for YUV to RGB.
+YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
+  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
+    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
+  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
+  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
+    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
+  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
+    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
+  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
+    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
+  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
+    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
+  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
+    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
+};
+
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
+
+// Read 8 UV from 444
+#define READYUV444 \
+    "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
+    "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
+    "punpcklbw %%xmm1,%%xmm0 \n"
+
+// Read 4 UV from 422, upsample to 8 UV
+#define READYUV422 \
+    "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
+    "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
+    "punpcklbw %%xmm1,%%xmm0 \n" \
+    "punpcklwd %%xmm0,%%xmm0 \n"
+
+// Read 2 UV from 411, upsample to 8 UV
+#define READYUV411 \
+    "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
+    "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
+    "punpcklbw %%xmm1,%%xmm0 \n" \
+    "punpcklwd %%xmm0,%%xmm0 \n" \
+    "punpckldq %%xmm0,%%xmm0 \n"
+
+// Read 4 UV from NV12, upsample to 8 UV
+#define READNV12 \
+    "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
+    "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
+    "punpcklwd %%xmm0,%%xmm0 \n"
+
+// Convert 8 pixels: 8 UV and 8 Y
+#define
YUVTORGB(YuvConstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa " MEMACCESS2(96, [YuvConstants]) ",%%xmm0 \n" \ + "pmaddubsw " MEMACCESS([YuvConstants]) ",%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa " MEMACCESS2(128, [YuvConstants]) ",%%xmm1 \n" \ + "pmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa " MEMACCESS2(160, [YuvConstants]) ",%%xmm2 \n" \ + "pmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ + "punpcklbw %%xmm3,%%xmm3 \n" \ + "pmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%xmm3 \n" \ + "paddsw %%xmm3,%%xmm0 \n" \ + "paddsw %%xmm3,%%xmm1 \n" \ + "paddsw %%xmm3,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" + +// Store 8 ARGB values. Assumes XMM5 is zero. +#define STOREARGB \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm5,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ + "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ + "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" + +// Store 8 BGRA values. Assumes XMM5 is zero. +#define STOREBGRA \ + "pcmpeqb %%xmm5,%%xmm5 \n" \ + "punpcklbw %%xmm0,%%xmm1 \n" \ + "punpcklbw %%xmm2,%%xmm5 \n" \ + "movdqa %%xmm5,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm5 \n" \ + "punpckhwd %%xmm1,%%xmm0 \n" \ + "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ + "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \ + "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n" + +// Store 8 ABGR values. Assumes XMM5 is zero. +#define STOREABGR \ + "punpcklbw %%xmm1,%%xmm2 \n" \ + "punpcklbw %%xmm5,%%xmm0 \n" \ + "movdqa %%xmm2,%%xmm1 \n" \ + "punpcklwd %%xmm0,%%xmm2 \n" \ + "punpckhwd %%xmm0,%%xmm1 \n" \ + "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ + "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \ + "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n" + +// Store 8 RGBA values. Assumes XMM5 is zero. +#define STORERGBA \ + "pcmpeqb %%xmm5,%%xmm5 \n" \ + "punpcklbw %%xmm2,%%xmm1 \n" \ + "punpcklbw %%xmm0,%%xmm5 \n" \ + "movdqa %%xmm5,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm5 \n" \ + "punpckhwd %%xmm1,%%xmm0 \n" \ + "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ + "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ + "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" + +void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READYUV444 + YUVTORGB(kYuvConstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +// TODO(fbarchard): Consider putting masks into constants. 
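In scalar terms, the STOREARGB weave above amounts to the following
(an illustrative helper, not a libyuv API; the macro performs the same
interleave with punpcklbw/punpcklwd on 8 pixels at once, with xmm5's
pcmpeqb result supplying the 0xff alpha bytes):

static void ScalarStoreARGB(const uint8 b[8], const uint8 g[8],
                            const uint8 r[8], uint8* dst_argb) {
  for (int i = 0; i < 8; ++i) {
    dst_argb[4 * i + 0] = b[i];
    dst_argb[4 * i + 1] = g[i];
    dst_argb[4 * i + 2] = r[i];
    dst_argb[4 * i + 3] = 255;  // opaque alpha
  }
}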
+void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgb24, + int width) { + asm volatile ( + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" + "sub %[u_buf],%[v_buf] \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(kYuvConstants) + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" + "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] +// TODO(fbarchard): Make width a register for 32 bit. +#if defined(__i386__) && defined(__pic__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [kYuvConstants]"r"(&kYuvConstants.kUVToB), + [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" + ); +} + +void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_raw, + int width) { + asm volatile ( + "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" + "sub %[u_buf],%[v_buf] \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(kYuvConstants) + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "movq %%xmm0," MEMACCESS([dst_raw]) " \n" + "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n" + "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n" + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_raw]"+r"(dst_raw), // %[dst_raw] +// TODO(fbarchard): Make width a register for 32 bit. 
+#if defined(__i386__) && defined(__pic__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [kYuvConstants]"r"(&kYuvConstants.kUVToB), + [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), + [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" + ); +} + +void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(kYuvConstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(kYuvConstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READYUV411 + YUVTORGB(kYuvConstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READNV12 + YUVTORGB(kYuvConstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + // Does not use r14. + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READNV12 + YUVTORGB(kYuvConstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants] + // Does not use r14. 
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_bgra, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(kYuvConstants) + STOREBGRA + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_abgr, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(kYuvConstants) + STOREABGR + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgba, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(kYuvConstants) + STORERGBA + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +#endif // HAS_I422TOARGBROW_SSSE3 + +// Read 8 UV from 422, upsample to 16 UV. +#define READYUV422_AVX2 \ + "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" + +// Convert 16 pixels: 16 UV and 16 Y. 
+#define YUVTORGB_AVX2(YuvConstants) \ + "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ + "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ + "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ + "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ + "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ + "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \ + "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ + "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \ + "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ + "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ + "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \ + "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \ + "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" + +#if defined(HAS_I422TOBGRAROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). +void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_bgra, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into BGRA + "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels + "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels + + "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n" + "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n" + "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_I422TOBGRAROW_AVX2 + +#if defined(HAS_I422TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
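// (Illustrative sketch, not from the original source: YUVTORGB_AVX2 leaves
// packed B, G and R vectors in ymm0/ymm1/ymm2, and the "weave" step in the
// rows below interleaves them with an all-ones alpha into little-endian
// ARGB memory order, i.e. B,G,R,A per pixel.  Scalar equivalent:)
static void WeaveARGBRef(const uint8_t* b, const uint8_t* g, const uint8_t* r,
                         uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[4 * x + 0] = b[x];  // vpunpcklbw pairs B with G ...
    dst_argb[4 * x + 1] = g[x];
    dst_argb[4 * x + 2] = r[x];  // ... vpunpcklwd then pairs BG with RA.
    dst_argb[4 * x + 3] = 255;   // alpha from the vpcmpeqb all-ones vector
  }
}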
+void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into ARGB + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels + "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels + + "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" + "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" + "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_I422TOARGBROW_AVX2 + +#if defined(HAS_J422TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into ARGB + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels + "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels + + "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" + "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" + "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_J422TOARGBROW_AVX2 + +#if defined(HAS_I422TOABGRROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). 
+void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into ABGR + "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels + "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels + "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" + "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_I422TOABGRROW_AVX2 + +#if defined(HAS_I422TORGBAROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). +void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into RGBA + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" + "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" + "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_I422TORGBAROW_AVX2 + +#ifdef HAS_I400TOARGBROW_SSE2 +void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { + asm volatile ( + "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 + "movd %%eax,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 + "movd %%eax,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "psubusw %%xmm3,%%xmm0 \n"
+ "psrlw $6, %%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+
+ // Step 2: Weave into ARGB
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "por %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
+ asm volatile (
+ "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
+ "vmovd %%eax,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
+ "mov $0x4880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
+ "vmovd %%eax,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+ "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
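// (Illustrative sketch, not from the original source: MirrorRow copies a row
// with its byte order reversed; kShuffleMirror below lets pshufb reverse 16
// bytes per instruction.  Scalar equivalent:)
static void MirrorRowRef(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];  // last source byte becomes first dest byte
  }
}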
+static uvec8 kShuffleMirror = { + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; + +void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "movdqa %3,%%xmm5 \n" + LABELALIGN + "1: \n" + MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm5" + ); +} +#endif // HAS_MIRRORROW_SSSE3 + +#ifdef HAS_MIRRORROW_AVX2 +void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "vbroadcastf128 %3,%%ymm5 \n" + LABELALIGN + "1: \n" + MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0 + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm5" + ); +} +#endif // HAS_MIRRORROW_AVX2 + +#ifdef HAS_MIRRORROW_SSE2 +void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + LABELALIGN + "1: \n" + MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 + "movdqa %%xmm0,%%xmm1 \n" + "psllw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "pshuflw $0x1b,%%xmm0,%%xmm0 \n" + "pshufhw $0x1b,%%xmm0,%%xmm0 \n" + "pshufd $0x4e,%%xmm0,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1)",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1" + ); +} +#endif // HAS_MIRRORROW_SSE2 + +#ifdef HAS_MIRRORROW_UV_SSSE3 +// Shuffle table for reversing the bytes of UV channels. 
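// (Illustrative sketch, not from the original source: MirrorUVRow walks an
// interleaved UV row backwards and splits it into reversed U and V planes,
// which kShuffleMirrorUV below arranges 8 pairs at a time.  Scalar form:)
static void MirrorUVRowRef(const uint8_t* src_uv, uint8_t* dst_u,
                           uint8_t* dst_v, int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * (width - 1 - x) + 0];  // U bytes, reversed
    dst_v[x] = src_uv[2 * (width - 1 - x) + 1];  // V bytes, reversed
  }
}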
+static uvec8 kShuffleMirrorUV = { + 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u +}; +void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, + int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "movdqa %4,%%xmm1 \n" + "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movlpd %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $8,%3 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorUV) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1" + ); +} +#endif // HAS_MIRRORROW_UV_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSE2 + +void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc" + , "xmm0" + ); +} +#endif // HAS_ARGBMIRRORROW_SSE2 + +#ifdef HAS_ARGBMIRRORROW_AVX2 +// Shuffle table for reversing the bytes. +static const ulvec32 kARGBShuffleMirror_AVX2 = { + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; +void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "vmovdqu %3,%%ymm5 \n" + LABELALIGN + "1: \n" + VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror_AVX2) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm5" + ); +} +#endif // HAS_ARGBMIRRORROW_AVX2 + +#ifdef HAS_SPLITUVROW_AVX2 +void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm2 \n" + "vpsrlw $0x8,%%ymm1,%%ymm3 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2) + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_SPLITUVROW_AVX2 + +#ifdef HAS_SPLITUVROW_SSE2 +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand 
%%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_SPLITUVROW_SSE2 + +#ifdef HAS_MERGEUVROW_AVX2 +void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + asm volatile ( + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 + "lea " MEMLEA(0x20,0) ",%0 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n" + "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n" + "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n" + "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n" + "lea " MEMLEA(0x40,2) ",%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2" + ); +} +#endif // HAS_MERGEUVROW_AVX2 + +#ifdef HAS_MERGEUVROW_SSE2 +void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + asm volatile ( + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" + "lea " MEMLEA(0x20,2) ",%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2" + ); +} +#endif // HAS_MERGEUVROW_SSE2 + +#ifdef HAS_COPYROW_SSE2 +void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1" + ); +} +#endif // HAS_COPYROW_SSE2 + +#ifdef HAS_COPYROW_AVX +void CopyRow_AVX(const uint8* src, uint8* dst, int count) { + asm volatile ( + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1" + ); +} +#endif // HAS_COPYROW_AVX + +#ifdef HAS_COPYROW_ERMS +// Multiple of 1. 
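// (Illustrative sketch, not from the original source, for the SplitUVRow and
// MergeUVRow functions above: SplitUVRow separates interleaved UV into two
// planes with the pand/psrlw mask split, and MergeUVRow is its
// punpcklbw/punpckhbw inverse.  Scalar equivalents:)
static void SplitUVRowRef(const uint8_t* src_uv, uint8_t* dst_u,
                          uint8_t* dst_v, int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x + 0];  // even bytes (the pand 0x00ff halves)
    dst_v[x] = src_uv[2 * x + 1];  // odd bytes (the psrlw 8 halves)
  }
}

static void MergeUVRowRef(const uint8_t* src_u, const uint8_t* src_v,
                          uint8_t* dst_uv, int width) {
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];
    dst_uv[2 * x + 1] = src_v[x];
  }
}
// ("Multiple of 1" below: CopyRow_ERMS copies any byte count with rep movsb.)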
+void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { + size_t width_tmp = (size_t)(width); + asm volatile ( + "rep movsb " MEMMOVESTRING(0,1) " \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc" + ); +} +#endif // HAS_COPYROW_ERMS + +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqu " MEMACCESS(1) ",%%xmm4 \n" + "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1," MEMACCESS(1) " \n" + "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm2" + ); +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm4 \n" + "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + LABELALIGN + "1: \n" + "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" + "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0," 
MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1," MEMACCESS(1) " \n" + "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm2" + ); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 + +#ifdef HAS_SETROW_X86 +void SetRow_X86(uint8* dst, uint8 v8, int width) { + size_t width_tmp = (size_t)(width >> 2); + const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes. + asm volatile ( + "rep stosl " MEMSTORESTRING(eax,0) " \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); +} + +void SetRow_ERMS(uint8* dst, uint8 v8, int width) { + size_t width_tmp = (size_t)(width); + asm volatile ( + "rep stosb " MEMSTORESTRING(al,0) " \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); +} + +void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { + size_t width_tmp = (size_t)(width); + asm volatile ( + "rep stosl " MEMSTORESTRING(eax,0) " \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); +} +#endif // HAS_SETROW_X86 + +#ifdef HAS_YUY2TOYROW_SSE2 +void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm5" + ); +} + +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq 
%%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} + +void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1" + ); +} + +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} +#endif // HAS_YUY2TOYROW_SSE2 + +#ifdef HAS_YUY2TOYROW_AVX2 +void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm5" + ); +} + +void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + 
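  // (Editor's note: YUY2 stores bytes as Y0,U,Y1,V.  The loop below averages
  // the chroma of two source rows with vpavgb against the stride-offset row,
  // shifts out the luma bytes with the word-wise >>8, then splits the
  // remaining U/V pairs into the separate U and V planes.)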
asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 + VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} + +void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} + +void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm5" + ); +} +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 + VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 
\n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} + +void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} +#endif // HAS_YUY2TOYROW_AVX2 + +#ifdef HAS_ARGBBLENDROW_SSE2 +// Blend 8 pixels at a time. +void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + + // 4 pixel loop. + LABELALIGN + "41: \n" + "movdqu " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "pshufhw $0xf5,%%xmm3,%%xmm3 \n" + "pshuflw $0xf5,%%xmm3,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jge 41b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" + + // 1 pixel loop. 
+ "91: \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "pshufhw $0xf5,%%xmm3,%%xmm3 \n" + "pshuflw $0xf5,%%xmm3,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" + "sub $0x1,%3 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBBLENDROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for isolating alpha. +static uvec8 kShuffleAlpha = { + 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 +}; + +// Blend 8 pixels at a time +// Shuffle table for reversing the bytes. + +// Same as SSE2, but replaces +// psrlw xmm3, 8 // alpha +// pshufhw xmm3, xmm3,0F5h // 8 alpha words +// pshuflw xmm3, xmm3,0F5h +// with.. +// pshufb xmm3, kShuffleAlpha // alpha + +void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + + // 4 pixel loop. + LABELALIGN + "40: \n" + "movdqu " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" + + // 1 pixel loop. + "91: \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" + "sub $0x1,%3 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "m"(kShuffleAlpha) // %4 + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBBLENDROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_SSE2 +// Attenuate 4 pixels at a time. 
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x8,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pshufhw $0xff,%%xmm0,%%xmm2 \n" + "pshuflw $0xff,%%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pshufhw $0xff,%%xmm1,%%xmm2 \n" + "pshuflw $0xff,%%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm4,%%xmm2 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBATTENUATEROW_SSE2 + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +// Shuffle table duplicating alpha +static uvec8 kShuffleAlpha0 = { + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u +}; +static uvec8 kShuffleAlpha1 = { + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u +}; +// Attenuate 4 pixels at a time. +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha0), // %3 + "m"(kShuffleAlpha1) // %4 + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBATTENUATEROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha_AVX2 = { + 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u +}; +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpslld $0x18,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" + + // 8 pixel loop. 
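  // (Editor's note: vpunpcklbw/vpunpckhbw widen each byte v to the word
  // v*0x0101; vpmulhuw by the duplicated alpha word a*0x0101 followed by the
  // vpsrlw 8 yields (v*0x0101 * a*0x0101) >> 24, a close approximation of
  // v * a / 255.  The vpand/vpor with ymm5 re-attaches the original alpha.)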
+ LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpor %%ymm6,%%ymm0,%%ymm0 \n" + MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) + "lea " MEMLEA(0x20,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha_AVX2) // %3 + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} +#endif // HAS_ARGBATTENUATEROW_AVX2 + +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +// Unattenuate 4 pixels at a time. +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, + int width) { + uintptr_t alpha = 0; + asm volatile ( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movzb " MEMACCESS2(0x03,0) ",%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x07,0) ",%3 \n" + MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "movzb " MEMACCESS2(0x0b,0) ",%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x0f,0) ",%3 \n" + MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "+r"(alpha) // %3 + : "r"(fixed_invtbl8) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBUNATTENUATEROW_SSE2 + +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kUnattenShuffleAlpha_AVX2 = { + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u +}; +// Unattenuate 8 pixels at a time. +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, + int width) { + uintptr_t alpha = 0; + asm volatile ( + "sub %0,%1 \n" + "vbroadcastf128 %5,%%ymm5 \n" + + // 8 pixel loop. 
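  // (Editor's note: the movzb/vmovd ladder below emulates VPGATHER, fetching
  // a fixed-point reciprocal of each pixel's alpha from fixed_invtbl8 so
  // that every channel is scaled by roughly 255 / a, undoing
  // ARGBAttenuateRow.)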
+ LABELALIGN + "1: \n" + // replace VPGATHER + "movzb " MEMACCESS2(0x03,0) ",%3 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 + "movzb " MEMACCESS2(0x07,0) ",%3 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 + "movzb " MEMACCESS2(0x0b,0) ",%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x0f,0) ",%3 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 + "movzb " MEMACCESS2(0x13,0) ",%3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 + "movzb " MEMACCESS2(0x17,0) ",%3 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 + "movzb " MEMACCESS2(0x1b,0) ",%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x1f,0) ",%3 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 + "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" + "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" + "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" + "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" + // end of VPGATHER + + "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" + "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) + "lea " MEMLEA(0x20,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "+r"(alpha) // %3 + : "r"(fixed_invtbl8), // %4 + "m"(kUnattenShuffleAlpha_AVX2) // %5 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBUNATTENUATEROW_AVX2 + +#ifdef HAS_ARGBGRAYROW_SSSE3 +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 8 pixel loop. 
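  // (Editor's note: each pass handles 8 pixels, i.e. 32 bytes.  pmaddubsw
  // with the kARGBToYJ weights plus the kAddYJ64 rounding term computes a
  // full-range luma, nominally Y = (B*15 + G*75 + R*38 + 64) >> 7 with the
  // constants defined earlier in this file; Y is then replicated into B, G
  // and R while the source alpha is re-attached.)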
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone
+static vec8 kARGBToSepiaB = {
+ 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static vec8 kARGBToSepiaG = {
+ 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static vec8 kARGBToSepiaR = {
+ 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ asm volatile (
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "m"(kARGBToSepiaB), // %2
+ "m"(kARGBToSepiaG), // %3
+ "m"(kARGBToSepiaR) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
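// (Illustrative sketch, not from the original source: the row below applies
// a signed 4x4 byte matrix, one 4-byte column per output channel, using
// pmaddubsw/phaddsw and an arithmetic >>6.  A scalar reference, assuming
// the same column-per-channel matrix layout:)
static void ARGBColorMatrixRowRef(const uint8_t* src_argb, uint8_t* dst_argb,
                                  const int8_t* m, int width) {
  for (int x = 0; x < width; ++x) {
    const uint8_t* p = src_argb + 4 * x;
    for (int c = 0; c < 4; ++c) {  // c = 0..3 for output B, G, R, A
      int v = (p[0] * m[4 * c + 0] + p[1] * m[4 * c + 1] +
               p[2] * m[4 * c + 2] + p[3] * m[4 * c + 3]) >> 6;
      dst_argb[4 * x + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}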
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + asm volatile ( + "movdqu " MEMACCESS(3) ",%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + +#ifdef HAS_ARGBQUANTIZEROW_SSE2 +// Quantize 4 ARGB pixels (16 bytes). +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + asm volatile ( + "movd %2,%%xmm2 \n" + "movd %3,%%xmm3 \n" + "movd %4,%%xmm4 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshufd $0x44,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "pshufd $0x44,%%xmm3,%%xmm3 \n" + "pshuflw $0x40,%%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "pslld $0x18,%%xmm6 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "pmullw %%xmm3,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm7 \n" + "pmullw %%xmm3,%%xmm1 \n" + "pand %%xmm6,%%xmm7 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "sub $0x4,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBQUANTIZEROW_SSE2 + +#ifdef HAS_ARGBSHADEROW_SSE2 +// Shade 4 pixels at a time by specified value. +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm2 \n" + + // 4 pixel loop. 
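  // (Editor's note: 'value' is an ARGB scale factor, one byte per channel.
  // The punpcklbw above widened each scale byte s to the word s*0x0101, so
  // the pmulhuw plus psrlw 8 in the loop computes roughly v * s / 255 per
  // channel, the same trick ARGBAttenuateRow uses with a constant scale.)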
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__AVX2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
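+// In scalar terms (a sketch of what the saturating vpaddusb below
+// computes), every byte of the row is added independently:
+//   for (int i = 0; i < width * 4; ++i) {
+//     int sum = src_argb0[i] + src_argb1[i];
+//     dst_argb[i] = (uint8)(sum > 255 ? 255 : sum);
+//   }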
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "psubusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
+ MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
+ MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
+ MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
+ MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm2 \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm1 \n"
+ "punpckhwd %%xmm2,%%xmm2 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklwd %%xmm0,%%xmm3 \n"
+ "punpckhwd %%xmm0,%%xmm0 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm1," MEMACCESS(2) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
+ "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
+ );
+}
+#endif // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "paddusb %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "punpckhbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm2,%%xmm4 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "punpcklwd %%xmm0,%%xmm7 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm6," MEMACCESS(2) " \n"
+ "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
+ "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value, inclusive of the value.
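+// A scalar sketch of the recurrence implemented below (four int32
+// channels per ARGB pixel; the running sum of this row is added to the
+// cumulative sum of the row above):
+//   int32 sum[4] = {0, 0, 0, 0};
+//   for (int x = 0; x < width; ++x) {
+//     for (int c = 0; c < 4; ++c) {
+//       sum[c] += row[x * 4 + c];
+//       cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
+//     }
+//   }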
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ asm volatile (
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%1 \n"
+ "jne 49f \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm3 \n"
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"
+ "punpckhwd %%xmm1,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(2) ",%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
+ "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop.
+ LABELALIGN
+ "10: \n"
+ "movd " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(2) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ : "+r"(row), // %0
+ "+r"(cumsum), // %1
+ "+r"(previous_cumsum), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst,
+ int count) {
+ asm volatile (
+ "movd %5,%%xmm5 \n"
+ "cvtdq2ps %%xmm5,%%xmm5 \n"
+ "rcpss %%xmm5,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "cmpl $0x80,%5 \n"
+ "ja 40f \n"
+
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrld $0x10,%%xmm6 \n"
+ "cvtdq2ps %%xmm6,%%xmm6 \n"
+ "addps %%xmm6,%%xmm5 \n"
+ "mulps %%xmm4,%%xmm5 \n"
+ "cvtps2dq %%xmm5,%%xmm5 \n"
+ "packssdw %%xmm5,%%xmm5 \n"
+
+ // 4 pixel small loop.
+ LABELALIGN
+ "4: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
+ MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
+ MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
+ "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
+ "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
+ MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
+ MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,1) ",%1 \n"
\n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" + + // 4 pixel loop \n" + LABELALIGN + "40: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 + MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 + MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 + MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 + "lea " MEMLEA(0x40,0) ",%0 \n" + "psubd " MEMACCESS(1) ",%%xmm0 \n" + "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" + "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" + "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 + MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 + MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 + MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 + "lea " MEMLEA(0x40,1) ",%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq %%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop \n" + LABELALIGN + "10: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 + "lea " MEMLEA(0x10,0) ",%0 \n" + "psubd " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 + "lea " MEMLEA(0x10,1) ",%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + "19: \n" + : "+r"(topleft), // %0 + "+r"(botleft), // %1 + "+r"(dst), // %2 + "+rm"(count) // %3 + : "r"((intptr_t)(width)), // %4 + "rm"(area) // %5 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} +#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 + +#ifdef HAS_ARGBAFFINEROW_SSE2 +// Copy ARGB pixels from source image with slope to a row of destination. 
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* src_dudv, int width) {
+ intptr_t src_argb_stride_temp = src_argb_stride;
+ intptr_t temp = 0;
+ asm volatile (
+ "movq " MEMACCESS(3) ",%%xmm2 \n"
+ "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
+ "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
+ "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
+ "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
+ MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1," MEMACCESS(2) " \n"
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
+ MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%4 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop.
+ LABELALIGN
+ "10: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%k1 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x04,2) ",%2 \n"
+ "sub $0x1,%4 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_stride_temp), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_dudv), // %3
+ "+rm"(width), // %4
+ "+r"(temp) // %5
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "shr %3 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
+ "cmp $0x40,%3 \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm2)
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"
+ "pmaddubsw %%xmm5,%%xmm1 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
+ LABELALIGN + "25: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) + "pavgb %%xmm1,%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + LABELALIGN + "75: \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm0) + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm5" + ); +} +#endif // HAS_INTERPOLATEROW_SSSE3 + +#ifdef HAS_INTERPOLATEROW_AVX2 +// Bilinear filter 32x2 -> 32x1 +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "sub %1,%0 \n" + "cmp $0x20,%3 \n" + "je 75f \n" + "cmp $0x40,%3 \n" + "je 50f \n" + "cmp $0x60,%3 \n" + "je 25f \n" + + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" + "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" + "vpxor %%ymm0,%%ymm0,%%ymm0 \n" + "vpermd %%ymm5,%%ymm0,%%ymm5 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" + MEMOPREG(vmovdqu,0x00,1,4,1,ymm2) + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm5,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 25 / 75. + LABELALIGN + "25: \n" + "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" + MEMOPREG(vmovdqu,0x00,1,4,1,ymm1) + "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" + MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" + VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0 + MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + LABELALIGN + "75: \n" + "vmovdqu " MEMACCESS(1) ",%%ymm1 \n" + MEMOPREG(vmovdqu,0x00,1,4,1,ymm0) + "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" + MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. 
+ LABELALIGN + "100: \n" + "rep movsb " MEMMOVESTRING(1,0) " \n" + "jmp 999f \n" + + "99: \n" + "vzeroupper \n" + "999: \n" + : "+D"(dst_ptr), // %0 + "+S"(src_ptr), // %1 + "+c"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm5" + ); +} +#endif // HAS_INTERPOLATEROW_AVX2 + +#ifdef HAS_INTERPOLATEROW_SSE2 +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "sub %1,%0 \n" + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x20,%3 \n" + "je 75f \n" + "cmp $0x40,%3 \n" + "je 50f \n" + "cmp $0x60,%3 \n" + "je 25f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2 + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm0 \n" + "punpckhbw %%xmm4,%%xmm1 \n" + "psubw %%xmm0,%%xmm2 \n" + "psubw %%xmm1,%%xmm3 \n" + "paddw %%xmm2,%%xmm2 \n" + "paddw %%xmm3,%%xmm3 \n" + "pmulhw %%xmm5,%%xmm2 \n" + "pmulhw %%xmm5,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 25 / 75. + LABELALIGN + "25: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 + "pavgb %%xmm1,%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + LABELALIGN + "75: \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0 + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_INTERPOLATEROW_SSE2 + +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
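+// A scalar sketch of the shuffle these rows perform: shuffler is a
+// 16-byte pshufb mask whose first four entries select, for each output
+// byte of a pixel, the source byte to copy:
+//   for (int x = 0; x < pix; ++x) {
+//     for (int c = 0; c < 4; ++c) {
+//       dst_argb[x * 4 + c] = src_argb[x * 4 + shuffler[c]];
+//     }
+//   }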
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + "movdqu " MEMACCESS(3) ",%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "memory", "cc" + , "xmm0", "xmm1", "xmm5" + ); +} +#endif // HAS_ARGBSHUFFLEROW_SSSE3 + +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "memory", "cc" + , "xmm0", "xmm1", "xmm5" + ); +} +#endif // HAS_ARGBSHUFFLEROW_AVX2 + +#ifdef HAS_ARGBSHUFFLEROW_SSE2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + uintptr_t pixel_temp = 0u; + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" + "mov " MEMACCESS(4) ",%k2 \n" + "cmp $0x3000102,%k2 \n" + "je 3012f \n" + "cmp $0x10203,%k2 \n" + "je 123f \n" + "cmp $0x30201,%k2 \n" + "je 321f \n" + "cmp $0x2010003,%k2 \n" + "je 2103f \n" + + LABELALIGN + "1: \n" + "movzb " MEMACCESS(4) ",%2 \n" + MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS(1) " \n" + "movzb " MEMACCESS2(0x1,4) ",%2 \n" + MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x1,1) " \n" + "movzb " MEMACCESS2(0x2,4) ",%2 \n" + MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x2,1) " \n" + "movzb " MEMACCESS2(0x3,4) ",%2 \n" + MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x3,1) " \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "sub $0x1,%3 \n" + "jg 1b \n" + "jmp 99f \n" + + LABELALIGN + "123: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x1b,%%xmm0,%%xmm0 \n" + "pshuflw $0x1b,%%xmm0,%%xmm0 \n" + "pshufhw $0x1b,%%xmm1,%%xmm1 \n" + "pshuflw $0x1b,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%3 \n" + "jg 123b \n" + "jmp 99f \n" + + LABELALIGN + "321: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x39,%%xmm0,%%xmm0 \n" + "pshuflw $0x39,%%xmm0,%%xmm0 \n" + "pshufhw $0x39,%%xmm1,%%xmm1 \n" + "pshuflw $0x39,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%3 \n" + 
"jg 321b \n" + "jmp 99f \n" + + LABELALIGN + "2103: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x93,%%xmm0,%%xmm0 \n" + "pshuflw $0x93,%%xmm0,%%xmm0 \n" + "pshufhw $0x93,%%xmm1,%%xmm1 \n" + "pshuflw $0x93,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%3 \n" + "jg 2103b \n" + "jmp 99f \n" + + LABELALIGN + "3012: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0xc6,%%xmm0,%%xmm0 \n" + "pshuflw $0xc6,%%xmm0,%%xmm0 \n" + "pshufhw $0xc6,%%xmm1,%%xmm1 \n" + "pshuflw $0xc6,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%3 \n" + "jg 3012b \n" + + "99: \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+d"(pixel_temp), // %2 + "+r"(pix) // %3 + : "r"(shuffler) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} +#endif // HAS_ARGBSHUFFLEROW_SSE2 + +#ifdef HAS_I422TOYUY2ROW_SSE2 +void I422ToYUY2Row_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(1) ",%%xmm2 \n" + MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 + "lea " MEMLEA(0x8,1) ",%1 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(3) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n" + "lea " MEMLEA(0x20,3) ",%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_frame), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3" + ); +} +#endif // HAS_I422TOYUY2ROW_SSE2 + +#ifdef HAS_I422TOUYVYROW_SSE2 +void I422ToUYVYRow_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(1) ",%%xmm2 \n" + MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 + "lea " MEMLEA(0x8,1) ",%1 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1," MEMACCESS(3) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n" + "lea " MEMLEA(0x20,3) ",%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_frame), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3" + ); +} +#endif // HAS_I422TOUYVYROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + asm volatile ( + "pxor %%xmm3,%%xmm3 \n" + + // 2 pixel loop. 
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm3,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
+ "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
+ "addps " MEMACCESS(3) ",%%xmm0 \n"
+ "addps " MEMACCESS(3) ",%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "mulps %%xmm1,%%xmm2 \n"
+ "mulps %%xmm5,%%xmm6 \n"
+ "mulps %%xmm2,%%xmm1 \n"
+ "mulps %%xmm6,%%xmm5 \n"
+ "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
+ "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
+ "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
+ "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
+ "addps %%xmm2,%%xmm0 \n"
+ "addps %%xmm6,%%xmm4 \n"
+ "addps %%xmm1,%%xmm0 \n"
+ "addps %%xmm5,%%xmm4 \n"
+ "cvttps2dq %%xmm0,%%xmm0 \n"
+ "cvttps2dq %%xmm4,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ asm volatile (
+ "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
+ "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
+ "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
+ "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
+ "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
+ "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
+ "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
+ "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
+ "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
+ "vcvttps2dq %%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
+ "vmovq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+ int width) {
+ uintptr_t pixel_temp = 0u;
+ asm volatile (
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb " MEMACCESS(0) ",%1 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x4,0) " \n"
+ "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
+ MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x3,0) " \n"
+ "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
+ MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x2,0) " \n"
+ "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
+ MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x1,0) " \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table.
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+ uintptr_t pixel_temp = 0u;
+ asm volatile (
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb " MEMACCESS(0) ",%1 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x4,0) " \n"
+ "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
+ MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x3,0) " \n"
+ "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
+ MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x2,0) " \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table.
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width,
+ const uint8* luma, uint32 lumacoeff) {
+ uintptr_t pixel_temp = 0u;
+ uintptr_t table_temp = 0u;
+ asm volatile (
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN + "1: \n" + "movdqu " MEMACCESS(2) ",%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS(2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS(3) " \n" + "movzb " MEMACCESS2(0x1,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x1,3) " \n" + "movzb " MEMACCESS2(0x2,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x2,3) " \n" + "movzb " MEMACCESS2(0x3,2) ",%0 \n" + "mov %b0," MEMACCESS2(0x3,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS2(0x4,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x4,3) " \n" + "movzb " MEMACCESS2(0x5,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x5,3) " \n" + "movzb " MEMACCESS2(0x6,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x6,3) " \n" + "movzb " MEMACCESS2(0x7,2) ",%0 \n" + "mov %b0," MEMACCESS2(0x7,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS2(0x8,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x8,3) " \n" + "movzb " MEMACCESS2(0x9,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x9,3) " \n" + "movzb " MEMACCESS2(0xa,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xa,3) " \n" + "movzb " MEMACCESS2(0xb,2) ",%0 \n" + "mov %b0," MEMACCESS2(0xb,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + + "movzb " MEMACCESS2(0xc,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xc,3) " \n" + "movzb " MEMACCESS2(0xd,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xd,3) " \n" + "movzb " MEMACCESS2(0xe,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xe,3) " \n" + "movzb " MEMACCESS2(0xf,2) ",%0 \n" + "mov %b0," MEMACCESS2(0xf,3) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "lea " MEMLEA(0x10,3) ",%3 \n" + "sub $0x4,%4 \n" + "jg 1b \n" + : "+d"(pixel_temp), // %0 + "+a"(table_temp), // %1 + "+r"(src_argb), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : "r"(luma), // %5 + "rm"(lumacoeff) // %6 + : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/row_mips.cc b/libs/libaom/src/third_party/libyuv/source/row_mips.cc new file mode 100644 index 000000000..cfc9ffe03 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/row_mips.cc @@ -0,0 +1,911 @@ +/* + * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+#ifdef HAS_COPYROW_MIPS
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
+ __asm__ __volatile__ (
+ ".set noreorder \n"
+ ".set noat \n"
+ "slti $at, %[count], 8 \n"
+ "bne $at, $zero, $last8 \n"
+ "xor $t8, %[src], %[dst] \n"
+ "andi $t8, $t8, 0x3 \n"
+
+ "bne $t8, $zero, unaligned \n"
+ "negu $a3, %[dst] \n"
+ // make dst/src aligned
+ "andi $a3, $a3, 0x3 \n"
+ "beq $a3, $zero, $chk16w \n"
+ // word-aligned; now count is the remaining byte count
+ "subu %[count], %[count], $a3 \n"
+
+ "lwr $t8, 0(%[src]) \n"
+ "addu %[src], %[src], $a3 \n"
+ "swr $t8, 0(%[dst]) \n"
+ "addu %[dst], %[dst], $a3 \n"
+
+ // Now the dst/src are mutually word-aligned with word-aligned addresses
+ "$chk16w: \n"
+ "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
+ // t8 is the byte count after 64-byte chunks
+ "beq %[count], $t8, chk8w \n"
+ // There will be at most 1 32-byte chunk after it
+ "subu $a3, %[count], $t8 \n" // the remainder
+ // Here a3 counts bytes in 16w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // Now a3 is the final dst after 64-byte chunks
+ "addu $t0, %[dst], %[count] \n"
+ // t0 is the "past the end" address
+
+ // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
+ // the "t0-32" address
+ // This means: for x=128 the last "safe" a1 address is "t0-160"
+ // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
+ // we will use "pref 30,128(a1)", so "t0-160" is the limit
+ "subu $t9, $t0, 160 \n"
+ // t9 is the "last safe pref 30,128(a1)" address
+ "pref 0, 0(%[src]) \n" // first line of src
+ "pref 0, 32(%[src]) \n" // second line of src
+ "pref 0, 64(%[src]) \n"
+ "pref 30, 32(%[dst]) \n"
+ // In case the a1 > t9 don't use "pref 30" at all
+ "sgtu $v1, %[dst], $t9 \n"
+ "bgtz $v1, $loop16w \n"
+ "nop \n"
+ // otherwise, start with using pref30
+ "pref 30, 64(%[dst]) \n"
+ "$loop16w: \n"
+ "pref 0, 96(%[src]) \n"
+ "lw $t0, 0(%[src]) \n"
+ "bgtz $v1, $skip_pref30_96 \n" // skip
+ "lw $t1, 4(%[src]) \n"
+ "pref 30, 96(%[dst]) \n" // continue
+ "$skip_pref30_96: \n"
+ "lw $t2, 8(%[src]) \n"
+ "lw $t3, 12(%[src]) \n"
+ "lw $t4, 16(%[src]) \n"
+ "lw $t5, 20(%[src]) \n"
+ "lw $t6, 24(%[src]) \n"
+ "lw $t7, 28(%[src]) \n"
+ "pref 0, 128(%[src]) \n"
+ // bring the next lines of src, addr 128
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "lw $t0, 32(%[src]) \n"
+ "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
+ "lw $t1, 36(%[src]) \n"
+ "pref 30, 128(%[dst]) \n" // set dest, addr 128
+ "$skip_pref30_128: \n"
+ "lw $t2, 40(%[src]) \n"
+ "lw $t3, 44(%[src]) \n"
+ "lw $t4, 48(%[src]) \n"
+ "lw $t5, 52(%[src]) \n"
+ "lw $t6, 56(%[src]) \n"
+ "lw $t7, 60(%[src]) \n"
+ "pref 0, 160(%[src]) \n"
+ // bring the next lines of src, addr 160
+ "sw $t0, 32(%[dst]) \n"
+ "sw $t1, 36(%[dst]) \n"
+ "sw $t2, 40(%[dst]) \n"
+ "sw $t3, 44(%[dst]) \n"
+ "sw $t4, 48(%[dst]) \n"
+ "sw $t5, 52(%[dst]) \n"
+ "sw $t6, 56(%[dst]) \n"
+ "sw $t7, 60(%[dst]) \n"
+
+ "addiu %[dst], %[dst], 64 \n" // adding 64 to dest
+ "sgtu $v1, %[dst], $t9 \n"
+ "bne %[dst], $a3, $loop16w \n"
+ " addiu %[src], %[src], 64 \n" // adding 64 to src + "move %[count], $t8 \n" + + // Here we have src and dest word-aligned but less than 64-bytes to go + + "chk8w: \n" + "pref 0, 0x0(%[src]) \n" + "andi $t8, %[count], 0x1f \n" // 32-byte chunk? + // the t8 is the reminder count past 32-bytes + "beq %[count], $t8, chk1w \n" + // count=t8,no 32-byte chunk + " nop \n" + + "lw $t0, 0(%[src]) \n" + "lw $t1, 4(%[src]) \n" + "lw $t2, 8(%[src]) \n" + "lw $t3, 12(%[src]) \n" + "lw $t4, 16(%[src]) \n" + "lw $t5, 20(%[src]) \n" + "lw $t6, 24(%[src]) \n" + "lw $t7, 28(%[src]) \n" + "addiu %[src], %[src], 32 \n" + + "sw $t0, 0(%[dst]) \n" + "sw $t1, 4(%[dst]) \n" + "sw $t2, 8(%[dst]) \n" + "sw $t3, 12(%[dst]) \n" + "sw $t4, 16(%[dst]) \n" + "sw $t5, 20(%[dst]) \n" + "sw $t6, 24(%[dst]) \n" + "sw $t7, 28(%[dst]) \n" + "addiu %[dst], %[dst], 32 \n" + + "chk1w: \n" + "andi %[count], $t8, 0x3 \n" + // now count is the reminder past 1w chunks + "beq %[count], $t8, $last8 \n" + " subu $a3, $t8, %[count] \n" + // a3 is count of bytes in 1w chunks + "addu $a3, %[dst], $a3 \n" + // now a3 is the dst address past the 1w chunks + // copying in words (4-byte chunks) + "$wordCopy_loop: \n" + "lw $t3, 0(%[src]) \n" + // the first t3 may be equal t0 ... optimize? + "addiu %[src], %[src],4 \n" + "addiu %[dst], %[dst],4 \n" + "bne %[dst], $a3,$wordCopy_loop \n" + " sw $t3, -4(%[dst]) \n" + + // For the last (<8) bytes + "$last8: \n" + "blez %[count], leave \n" + " addu $a3, %[dst], %[count] \n" // a3 -last dst address + "$last8loop: \n" + "lb $v1, 0(%[src]) \n" + "addiu %[src], %[src], 1 \n" + "addiu %[dst], %[dst], 1 \n" + "bne %[dst], $a3, $last8loop \n" + " sb $v1, -1(%[dst]) \n" + + "leave: \n" + " j $ra \n" + " nop \n" + + // + // UNALIGNED case + // + + "unaligned: \n" + // got here with a3="negu a1" + "andi $a3, $a3, 0x3 \n" // a1 is word aligned? + "beqz $a3, $ua_chk16w \n" + " subu %[count], %[count], $a3 \n" + // bytes left after initial a3 bytes + "lwr $v1, 0(%[src]) \n" + "lwl $v1, 3(%[src]) \n" + "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3 + "swr $v1, 0(%[dst]) \n" + "addu %[dst], %[dst], $a3 \n" + // below the dst will be word aligned (NOTE1) + "$ua_chk16w: \n" + "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? 
+ // t8 is the byte count after 64-byte chunks
+ "beq %[count], $t8, ua_chk8w \n"
+ // if a2==t8, no 64-byte chunks
+ // There will be at most 1 32-byte chunk after it
+ "subu $a3, %[count], $t8 \n" // the remainder
+ // Here a3 counts bytes in 16w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // Now a3 is the final dst after 64-byte chunks
+ "addu $t0, %[dst], %[count] \n" // t0 "past the end"
+ "subu $t9, $t0, 160 \n"
+ // t9 is the "last safe pref 30,128(a1)" address
+ "pref 0, 0(%[src]) \n" // first line of src
+ "pref 0, 32(%[src]) \n" // second line addr 32
+ "pref 0, 64(%[src]) \n"
+ "pref 30, 32(%[dst]) \n"
+ // safe, as we have at least 64 bytes ahead
+ // In case the a1 > t9 don't use "pref 30" at all
+ "sgtu $v1, %[dst], $t9 \n"
+ "bgtz $v1, $ua_loop16w \n"
+ // skip "pref 30,64(a1)" for too short arrays
+ " nop \n"
+ // otherwise, start with using pref30
+ "pref 30, 64(%[dst]) \n"
+ "$ua_loop16w: \n"
+ "pref 0, 96(%[src]) \n"
+ "lwr $t0, 0(%[src]) \n"
+ "lwl $t0, 3(%[src]) \n"
+ "lwr $t1, 4(%[src]) \n"
+ "bgtz $v1, $ua_skip_pref30_96 \n"
+ " lwl $t1, 7(%[src]) \n"
+ "pref 30, 96(%[dst]) \n"
+ // continue setting up the dest, addr 96
+ "$ua_skip_pref30_96: \n"
+ "lwr $t2, 8(%[src]) \n"
+ "lwl $t2, 11(%[src]) \n"
+ "lwr $t3, 12(%[src]) \n"
+ "lwl $t3, 15(%[src]) \n"
+ "lwr $t4, 16(%[src]) \n"
+ "lwl $t4, 19(%[src]) \n"
+ "lwr $t5, 20(%[src]) \n"
+ "lwl $t5, 23(%[src]) \n"
+ "lwr $t6, 24(%[src]) \n"
+ "lwl $t6, 27(%[src]) \n"
+ "lwr $t7, 28(%[src]) \n"
+ "lwl $t7, 31(%[src]) \n"
+ "pref 0, 128(%[src]) \n"
+ // bring the next lines of src, addr 128
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "lwr $t0, 32(%[src]) \n"
+ "lwl $t0, 35(%[src]) \n"
+ "lwr $t1, 36(%[src]) \n"
+ "bgtz $v1, ua_skip_pref30_128 \n"
+ " lwl $t1, 39(%[src]) \n"
+ "pref 30, 128(%[dst]) \n"
+ // continue setting up the dest, addr 128
+ "ua_skip_pref30_128: \n"
+
+ "lwr $t2, 40(%[src]) \n"
+ "lwl $t2, 43(%[src]) \n"
+ "lwr $t3, 44(%[src]) \n"
+ "lwl $t3, 47(%[src]) \n"
+ "lwr $t4, 48(%[src]) \n"
+ "lwl $t4, 51(%[src]) \n"
+ "lwr $t5, 52(%[src]) \n"
+ "lwl $t5, 55(%[src]) \n"
+ "lwr $t6, 56(%[src]) \n"
+ "lwl $t6, 59(%[src]) \n"
+ "lwr $t7, 60(%[src]) \n"
+ "lwl $t7, 63(%[src]) \n"
+ "pref 0, 160(%[src]) \n"
+ // bring the next lines of src, addr 160
+ "sw $t0, 32(%[dst]) \n"
+ "sw $t1, 36(%[dst]) \n"
+ "sw $t2, 40(%[dst]) \n"
+ "sw $t3, 44(%[dst]) \n"
+ "sw $t4, 48(%[dst]) \n"
+ "sw $t5, 52(%[dst]) \n"
+ "sw $t6, 56(%[dst]) \n"
+ "sw $t7, 60(%[dst]) \n"
+
+ "addiu %[dst],%[dst],64 \n" // adding 64 to dest
+ "sgtu $v1,%[dst],$t9 \n"
+ "bne %[dst],$a3,$ua_loop16w \n"
+ " addiu %[src],%[src],64 \n" // adding 64 to src
+ "move %[count],$t8 \n"
+
+ // Here we have src and dest word-aligned but less than 64-bytes to go
+
+ "ua_chk8w: \n"
+ "pref 0, 0x0(%[src]) \n"
+ "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
+ // t8 is the remainder count
+ "beq %[count], $t8, $ua_chk1w \n"
+ // when count==t8, no 32-byte chunk
+
+ "lwr $t0, 0(%[src]) \n"
+ "lwl $t0, 3(%[src]) \n"
+ "lwr $t1, 4(%[src]) \n"
+ "lwl $t1, 7(%[src]) \n"
+ "lwr $t2, 8(%[src]) \n"
+ "lwl $t2, 11(%[src]) \n"
+ "lwr $t3, 12(%[src]) \n"
+ "lwl $t3, 15(%[src]) \n"
+ "lwr $t4, 16(%[src]) \n"
+ "lwl $t4, 19(%[src]) \n"
+ "lwr $t5, 20(%[src]) \n"
+ "lwl $t5, 23(%[src]) \n"
+ "lwr $t6, 24(%[src]) \n"
+ "lwl $t6, 27(%[src]) \n"
+ "lwr $t7, 28(%[src]) \n"
+ "lwl $t7, 31(%[src]) \n"
+ "addiu %[src], %[src], 32 \n"
+
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "addiu %[dst], %[dst], 32 \n"
+
+ "$ua_chk1w: \n"
+ "andi %[count], $t8, 0x3 \n"
+ // now count is the remainder past 1w chunks
+ "beq %[count], $t8, ua_smallCopy \n"
+ "subu $a3, $t8, %[count] \n"
+ // a3 is count of bytes in 1w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // now a3 is the dst address past the 1w chunks
+
+ // copying in words (4-byte chunks)
+ "$ua_wordCopy_loop: \n"
+ "lwr $v1, 0(%[src]) \n"
+ "lwl $v1, 3(%[src]) \n"
+ "addiu %[src], %[src], 4 \n"
+ "addiu %[dst], %[dst], 4 \n"
+ // note: dst=a1 is word aligned here, see NOTE1
+ "bne %[dst], $a3, $ua_wordCopy_loop \n"
+ " sw $v1,-4(%[dst]) \n"
+
+ // Now less than 4 bytes (value in count) left to copy
+ "ua_smallCopy: \n"
+ "beqz %[count], leave \n"
+ " addu $a3, %[dst], %[count] \n" // a3 = last dst address
+ "$ua_smallCopy_loop: \n"
+ "lb $v1, 0(%[src]) \n"
+ "addiu %[src], %[src], 1 \n"
+ "addiu %[dst], %[dst], 1 \n"
+ "bne %[dst],$a3,$ua_smallCopy_loop \n"
+ " sb $v1, -1(%[dst]) \n"
+
+ "j $ra \n"
+ " nop \n"
+ ".set at \n"
+ ".set reorder \n"
+ : [dst] "+r" (dst), [src] "+r" (src)
+ : [count] "r" (count)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
+ "t8", "t9", "a3", "v1", "at"
+ );
+}
+#endif // HAS_COPYROW_MIPS
+
+// MIPS DSPR2 functions
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
+ (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
+
+void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "srl $t4, %[width], 4 \n" // multiples of 16
+ "blez $t4, 2f \n"
+ " andi %[width], %[width], 0xf \n" // residual
+
+ ".p2align 2 \n"
+ "1: \n"
+ "addiu $t4, $t4, -1 \n"
+ "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
+ "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
+ "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
+ "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
+ "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
+ "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10
+ "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12
+ "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14
+ "addiu %[src_uv], %[src_uv], 32 \n"
+ "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
+ "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
+ "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
+ "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
+ "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
+ "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
+ "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
+ "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
+ "sw $t9, 0(%[dst_v]) \n"
+ "sw $t0, 0(%[dst_u]) \n"
+ "sw $t1, 4(%[dst_v]) \n"
+ "sw $t2, 4(%[dst_u]) \n"
+ "sw $t3, 8(%[dst_v]) \n"
+ "sw $t5, 8(%[dst_u]) \n"
+ "sw
$t6, 12(%[dst_v]) \n" + "sw $t7, 12(%[dst_u]) \n" + "addiu %[dst_v], %[dst_v], 16 \n" + "bgtz $t4, 1b \n" + " addiu %[dst_u], %[dst_u], 16 \n" + + "beqz %[width], 3f \n" + " nop \n" + + "2: \n" + "lbu $t0, 0(%[src_uv]) \n" + "lbu $t1, 1(%[src_uv]) \n" + "addiu %[src_uv], %[src_uv], 2 \n" + "addiu %[width], %[width], -1 \n" + "sb $t0, 0(%[dst_u]) \n" + "sb $t1, 0(%[dst_v]) \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "bgtz %[width], 2b \n" + " addiu %[dst_v], %[dst_v], 1 \n" + + "3: \n" + ".set pop \n" + : [src_uv] "+r" (src_uv), + [width] "+r" (width), + [dst_u] "+r" (dst_u), + [dst_v] "+r" (dst_v) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6", "t7", "t8", "t9" + ); +} + +void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t4, %[width], 4 \n" // multiplies of 16 + "andi $t5, %[width], 0xf \n" + "blez $t4, 2f \n" + " addu %[src], %[src], %[width] \n" // src += width + + ".p2align 2 \n" + "1: \n" + "lw $t0, -16(%[src]) \n" // |3|2|1|0| + "lw $t1, -12(%[src]) \n" // |7|6|5|4| + "lw $t2, -8(%[src]) \n" // |11|10|9|8| + "lw $t3, -4(%[src]) \n" // |15|14|13|12| + "wsbh $t0, $t0 \n" // |2|3|0|1| + "wsbh $t1, $t1 \n" // |6|7|4|5| + "wsbh $t2, $t2 \n" // |10|11|8|9| + "wsbh $t3, $t3 \n" // |14|15|12|13| + "rotr $t0, $t0, 16 \n" // |0|1|2|3| + "rotr $t1, $t1, 16 \n" // |4|5|6|7| + "rotr $t2, $t2, 16 \n" // |8|9|10|11| + "rotr $t3, $t3, 16 \n" // |12|13|14|15| + "addiu %[src], %[src], -16 \n" + "addiu $t4, $t4, -1 \n" + "sw $t3, 0(%[dst]) \n" // |15|14|13|12| + "sw $t2, 4(%[dst]) \n" // |11|10|9|8| + "sw $t1, 8(%[dst]) \n" // |7|6|5|4| + "sw $t0, 12(%[dst]) \n" // |3|2|1|0| + "bgtz $t4, 1b \n" + " addiu %[dst], %[dst], 16 \n" + "beqz $t5, 3f \n" + " nop \n" + + "2: \n" + "lbu $t0, -1(%[src]) \n" + "addiu $t5, $t5, -1 \n" + "addiu %[src], %[src], -1 \n" + "sb $t0, 0(%[dst]) \n" + "bgez $t5, 2b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src] "+r" (src), [dst] "+r" (dst) + : [width] "r" (width) + : "t0", "t1", "t2", "t3", "t4", "t5" + ); +} + +void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + int x = 0; + int y = 0; + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "addu $t4, %[width], %[width] \n" + "srl %[x], %[width], 4 \n" + "andi %[y], %[width], 0xf \n" + "blez %[x], 2f \n" + " addu %[src_uv], %[src_uv], $t4 \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| + "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| + "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8| + "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12| + "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16| + "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20| + "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24| + "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28| + + "rotr $t0, $t0, 16 \n" // |1|0|3|2| + "rotr $t1, $t1, 16 \n" // |5|4|7|6| + "rotr $t2, $t2, 16 \n" // |9|8|11|10| + "rotr $t3, $t3, 16 \n" // |13|12|15|14| + "rotr $t4, $t4, 16 \n" // |17|16|19|18| + "rotr $t6, $t6, 16 \n" // |21|20|23|22| + "rotr $t7, $t7, 16 \n" // |25|24|27|26| + "rotr $t8, $t8, 16 \n" // |29|28|31|30| + "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6| + "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7| + "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14| + "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15| + "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22| + "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23| + "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30| + "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31| + "addiu 
%[src_uv], %[src_uv], -32 \n" + "addiu %[x], %[x], -1 \n" + "swr $t4, 0(%[dst_u]) \n" + "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24| + "swr $t6, 0(%[dst_v]) \n" + "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25| + "swr $t2, 4(%[dst_u]) \n" + "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16| + "swr $t3, 4(%[dst_v]) \n" + "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17| + "swr $t0, 8(%[dst_u]) \n" + "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8| + "swr $t1, 8(%[dst_v]) \n" + "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9| + "swr $t9, 12(%[dst_u]) \n" + "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0| + "swr $t5, 12(%[dst_v]) \n" + "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1| + "addiu %[dst_v], %[dst_v], 16 \n" + "bgtz %[x], 1b \n" + " addiu %[dst_u], %[dst_u], 16 \n" + "beqz %[y], 3f \n" + " nop \n" + "b 2f \n" + " nop \n" + + "2: \n" + "lbu $t0, -2(%[src_uv]) \n" + "lbu $t1, -1(%[src_uv]) \n" + "addiu %[src_uv], %[src_uv], -2 \n" + "addiu %[y], %[y], -1 \n" + "sb $t0, 0(%[dst_u]) \n" + "sb $t1, 0(%[dst_v]) \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "bgtz %[y], 2b \n" + " addiu %[dst_v], %[dst_v], 1 \n" + + "3: \n" + ".set pop \n" + : [src_uv] "+r" (src_uv), + [dst_u] "+r" (dst_u), + [dst_v] "+r" (dst_v), + [x] "=&r" (x), + [y] "+r" (y) + : [width] "r" (width) + : "t0", "t1", "t2", "t3", "t4", + "t5", "t6", "t7", "t8", "t9" + ); +} + +// Convert (4 Y, 2 U and 2 V) I422 and arrange RGB values into +// t5 = | 0 | B0 | 0 | b0 | +// t4 = | 0 | B1 | 0 | b1 | +// t9 = | 0 | G0 | 0 | g0 | +// t8 = | 0 | G1 | 0 | g1 | +// t2 = | 0 | R0 | 0 | r0 | +// t1 = | 0 | R1 | 0 | r1 | +#define I422ToTransientMipsRGB \ + "lw $t0, 0(%[y_buf]) \n" \ + "lhu $t1, 0(%[u_buf]) \n" \ + "lhu $t2, 0(%[v_buf]) \n" \ + "preceu.ph.qbr $t1, $t1 \n" \ + "preceu.ph.qbr $t2, $t2 \n" \ + "preceu.ph.qbra $t3, $t0 \n" \ + "preceu.ph.qbla $t0, $t0 \n" \ + "subu.ph $t1, $t1, $s5 \n" \ + "subu.ph $t2, $t2, $s5 \n" \ + "subu.ph $t3, $t3, $s4 \n" \ + "subu.ph $t0, $t0, $s4 \n" \ + "mul.ph $t3, $t3, $s0 \n" \ + "mul.ph $t0, $t0, $s0 \n" \ + "shll.ph $t4, $t1, 0x7 \n" \ + "subu.ph $t4, $t4, $t1 \n" \ + "mul.ph $t6, $t1, $s1 \n" \ + "mul.ph $t1, $t2, $s2 \n" \ + "addq_s.ph $t5, $t4, $t3 \n" \ + "addq_s.ph $t4, $t4, $t0 \n" \ + "shra.ph $t5, $t5, 6 \n" \ + "shra.ph $t4, $t4, 6 \n" \ + "addiu %[u_buf], 2 \n" \ + "addiu %[v_buf], 2 \n" \ + "addu.ph $t6, $t6, $t1 \n" \ + "mul.ph $t1, $t2, $s3 \n" \ + "addu.ph $t9, $t6, $t3 \n" \ + "addu.ph $t8, $t6, $t0 \n" \ + "shra.ph $t9, $t9, 6 \n" \ + "shra.ph $t8, $t8, 6 \n" \ + "addu.ph $t2, $t1, $t3 \n" \ + "addu.ph $t1, $t1, $t0 \n" \ + "shra.ph $t2, $t2, 6 \n" \ + "shra.ph $t1, $t1, 6 \n" \ + "subu.ph $t5, $t5, $s5 \n" \ + "subu.ph $t4, $t4, $s5 \n" \ + "subu.ph $t9, $t9, $s5 \n" \ + "subu.ph $t8, $t8, $s5 \n" \ + "subu.ph $t2, $t2, $s5 \n" \ + "subu.ph $t1, $t1, $s5 \n" \ + "shll_s.ph $t5, $t5, 8 \n" \ + "shll_s.ph $t4, $t4, 8 \n" \ + "shll_s.ph $t9, $t9, 8 \n" \ + "shll_s.ph $t8, $t8, 8 \n" \ + "shll_s.ph $t2, $t2, 8 \n" \ + "shll_s.ph $t1, $t1, 8 \n" \ + "shra.ph $t5, $t5, 8 \n" \ + "shra.ph $t4, $t4, 8 \n" \ + "shra.ph $t9, $t9, 8 \n" \ + "shra.ph $t8, $t8, 8 \n" \ + "shra.ph $t2, $t2, 8 \n" \ + "shra.ph $t1, $t1, 8 \n" \ + "addu.ph $t5, $t5, $s5 \n" \ + "addu.ph $t4, $t4, $s5 \n" \ + "addu.ph $t9, $t9, $s5 \n" \ + "addu.ph $t8, $t8, $s5 \n" \ + "addu.ph $t2, $t2, $s5 \n" \ + "addu.ph $t1, $t1, $s5 \n" + +void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // 
|YG|YG| = |74|74| + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| // clipping + "lui $s6, 0xff00 \n" + "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff| + + ".p2align 2 \n" + "1: \n" + I422ToTransientMipsRGB +// Arranging into argb format + "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1| + "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0| + "addiu %[width], -4 \n" + "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0| + "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0| + "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| + + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| + "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0| + "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0| + "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1| + "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1| + "sll $t9, $t9, 16 \n" + "sll $t8, $t8, 16 \n" + "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0| + "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0| +// Store results. + "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // |YG|YG| = |74|74| + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| + "lui $s6, 0xff00 \n" + "ori $s6, 0xff00 \n" // |ff|00|ff|00| + + ".p2align 2 \n" + "1: \n" + I422ToTransientMipsRGB +// Arranging into abgr format + "precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1| + "precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0| + "precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0| + "precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0| + + "precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0| + "addiu %[width], -4 \n" + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0| + "or $t1, $t1, $s6 \n" // |ff|B1|ff|B0| + "or $t2, $t2, $s6 \n" // |ff|b1|ff|b0| + "precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1| + "precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1| + "sll $t9, $t9, 16 \n" + "sll $t8, $t8, 16 \n" + "packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0| + "packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0| +// Store results. 
+ "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 | + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| + "lui $s6, 0xff \n" + "ori $s6, 0xff \n" // |00|ff|00|ff| + + ".p2align 2 \n" + "1: \n" + I422ToTransientMipsRGB + // Arranging into bgra format + "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1| + "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0| + "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0| + "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0| + + "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| + "addiu %[width], -4 \n" + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| + "sll $t1, $t1, 8 \n" // |R1|0 |R0|0 | + "sll $t2, $t2, 8 \n" // |r1|0 |r0|0 | + "or $t1, $t1, $s6 \n" // |R1|ff|R0|ff| + "or $t2, $t2, $s6 \n" // |r1|ff|r0|ff| + "precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff| + "precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff| + "sll $t1, $t1, 16 \n" + "sll $t2, $t2, 16 \n" + "packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff| + "packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff| +// Store results. 
+ "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +// Bilinear filter 8x2 -> 8x1 +void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + int y0_fraction = 256 - source_y_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "replv.ph $t0, %[y0_fraction] \n" + "replv.ph $t1, %[source_y_fraction] \n" + + ".p2align 2 \n" + "1: \n" + "lw $t2, 0(%[src_ptr]) \n" + "lw $t3, 0(%[src_ptr1]) \n" + "lw $t4, 4(%[src_ptr]) \n" + "lw $t5, 4(%[src_ptr1]) \n" + "muleu_s.ph.qbl $t6, $t2, $t0 \n" + "muleu_s.ph.qbr $t7, $t2, $t0 \n" + "muleu_s.ph.qbl $t8, $t3, $t1 \n" + "muleu_s.ph.qbr $t9, $t3, $t1 \n" + "muleu_s.ph.qbl $t2, $t4, $t0 \n" + "muleu_s.ph.qbr $t3, $t4, $t0 \n" + "muleu_s.ph.qbl $t4, $t5, $t1 \n" + "muleu_s.ph.qbr $t5, $t5, $t1 \n" + "addq.ph $t6, $t6, $t8 \n" + "addq.ph $t7, $t7, $t9 \n" + "addq.ph $t2, $t2, $t4 \n" + "addq.ph $t3, $t3, $t5 \n" + "shra.ph $t6, $t6, 8 \n" + "shra.ph $t7, $t7, 8 \n" + "shra.ph $t2, $t2, 8 \n" + "shra.ph $t3, $t3, 8 \n" + "precr.qb.ph $t6, $t6, $t7 \n" + "precr.qb.ph $t2, $t2, $t3 \n" + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[src_ptr1], %[src_ptr1], 8 \n" + "addiu %[dst_width], %[dst_width], -8 \n" + "sw $t6, 0(%[dst_ptr]) \n" + "sw $t2, 4(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[dst_ptr], %[dst_ptr], 8 \n" + + ".set pop \n" + : [dst_ptr] "+r" (dst_ptr), + [src_ptr1] "+r" (src_ptr1), + [src_ptr] "+r" (src_ptr), + [dst_width] "+r" (dst_width) + : [source_y_fraction] "r" (source_y_fraction), + [y0_fraction] "r" (y0_fraction), + [src_stride] "r" (src_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} +#endif // __mips_dsp_rev >= 2 + +#endif // defined(__mips__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/row_neon.cc b/libs/libaom/src/third_party/libyuv/source/row_neon.cc new file mode 100644 index 000000000..1a72eb903 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/row_neon.cc @@ -0,0 +1,3084 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +// Read 8 Y, 4 U and 4 V from 422 +#define READYUV422 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.32 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ + "vld1.32 {d2[1]}, [%2]! \n" + +// Read 8 Y, 2 U and 2 V from 422 +#define READYUV411 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.16 {d2[0]}, [%1]! 
\n" \ + MEMACCESS(2) \ + "vld1.16 {d2[1]}, [%2]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vzip.u8 d2, d3 \n" + +// Read 8 Y, 8 U and 8 V from 444 +#define READYUV444 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! \n" \ + MEMACCESS(2) \ + "vld1.8 {d3}, [%2]! \n" \ + "vpaddl.u8 q1, q1 \n" \ + "vrshrn.u16 d2, q1, #1 \n" + +// Read 8 Y, and set 4 U and 4 V to 128 +#define READYUV400 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + "vmov.u8 d2, #128 \n" + +// Read 8 Y and 4 UV from NV12 +#define READNV12 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 Y and 4 VU from NV21 +#define READNV21 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ + "vuzp.u8 d3, d2 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 YUY2 +#define READYUY2 \ + MEMACCESS(0) \ + "vld2.8 {d0, d2}, [%0]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 UYVY +#define READUYVY \ + MEMACCESS(0) \ + "vld2.8 {d2, d3}, [%0]! \n" \ + "vmov.u8 d0, d3 \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +#define YUV422TORGB_SETUP_REG \ + MEMACCESS([kUVToRB]) \ + "vld1.8 {d24}, [%[kUVToRB]] \n" \ + MEMACCESS([kUVToG]) \ + "vld1.8 {d25}, [%[kUVToG]] \n" \ + MEMACCESS([kUVBiasBGR]) \ + "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ + MEMACCESS([kUVBiasBGR]) \ + "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \ + MEMACCESS([kUVBiasBGR]) \ + "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ + MEMACCESS([kYToRgb]) \ + "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" + +#define YUV422TORGB \ + "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */\ + "vmull.u8 q9, d2, d25 \n" /* u/v G component */\ + "vmovl.u8 q0, d0 \n" /* Y */\ + "vmovl.s16 q10, d1 \n" \ + "vmovl.s16 q0, d0 \n" \ + "vmul.s32 q10, q10, q15 \n" \ + "vmul.s32 q0, q0, q15 \n" \ + "vqshrun.s32 d0, q0, #16 \n" \ + "vqshrun.s32 d1, q10, #16 \n" /* Y */\ + "vadd.s16 d18, d19 \n" \ + "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */\ + "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */\ + "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/\ + "vaddw.u16 q1, q1, d16 \n" \ + "vaddw.u16 q10, q10, d17 \n" \ + "vaddw.u16 q3, q3, d18 \n" \ + "vqadd.s16 q8, q0, q13 \n" /* B */ \ + "vqadd.s16 q9, q0, q14 \n" /* R */ \ + "vqadd.s16 q0, q0, q4 \n" /* G */ \ + "vqadd.s16 q8, q8, q1 \n" /* B */ \ + "vqadd.s16 q9, q9, q10 \n" /* R */ \ + "vqsub.s16 q0, q0, q3 \n" /* G */ \ + "vqshrun.s16 d20, q8, #6 \n" /* B */ \ + "vqshrun.s16 d22, q9, #6 \n" /* R */ \ + "vqshrun.s16 d21, q0, #6 \n" /* G */ + +// YUV to RGB conversion constants. +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ + +// U and V contributions to R,G,B. +#define UB -128 /* -min(128, round(2.018 * 64)) */ +#define UG 25 /* -round(-0.391 * 64) */ +#define VG 52 /* -round(-0.813 * 64) */ +#define VR -102 /* -round(1.596 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. 
+#define BB (UB * 128 - YGB) +#define BG (UG * 128 + VG * 128 - YGB) +#define BR (VR * 128 - YGB) + +static uvec8 kUVToRB = { 128, 128, 128, 128, 102, 102, 102, 102, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static uvec8 kUVToG = { 25, 25, 25, 25, 52, 52, 52, 52, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 }; +static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 }; + +#undef YG +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef BB +#undef BG +#undef BR + +void I444ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV444 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I411ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV411 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToBGRARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vmov.u8 d19, #255 \n" + MEMACCESS(3) + "vst4.8 {d19, d20, d21, d22}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_bgra), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToABGRRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_abgr), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToRGBARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d19, #255 \n" + MEMACCESS(3) + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToRGB24Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + MEMACCESS(3) + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToRAWRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + MEMACCESS(3) + "vst3.8 {d20, d21, d22}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_raw), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#define ARGBTORGB565 \ + "vshr.u8 d20, d20, #3 \n" /* B */ \ + "vshr.u8 d21, d21, #2 \n" /* G */ \ + "vshr.u8 d22, d22, #3 \n" /* R */ \ + "vmovl.u8 q8, d20 \n" /* B */ \ + "vmovl.u8 q9, d21 \n" /* G */ \ + "vmovl.u8 q10, d22 \n" /* R */ \ + "vshl.u16 q9, q9, #5 \n" /* G */ \ + "vshl.u16 q10, q10, #11 \n" /* R */ \ + "vorr q0, q8, q9 \n" /* BG */ \ + "vorr q0, q0, q10 \n" /* BGR */ + +void I422ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + ARGBTORGB565 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#define ARGBTOARGB1555 \ + "vshr.u8 q10, q10, #3 \n" /* B */ \ + "vshr.u8 d22, d22, #3 \n" /* R */ \ + "vshr.u8 d23, d23, #7 \n" /* A */ \ + "vmovl.u8 q8, d20 \n" /* B */ \ + "vmovl.u8 q9, d21 \n" /* G */ \ + "vmovl.u8 q10, d22 \n" /* R */ \ + "vmovl.u8 q11, d23 \n" /* A */ \ + "vshl.u16 q9, q9, #5 \n" /* G */ \ + "vshl.u16 q10, q10, #10 \n" /* R */ \ + "vshl.u16 q11, q11, #15 \n" /* A */ \ + "vorr q0, q8, q9 \n" /* BG */ \ + "vorr q1, q10, q11 \n" /* RA */ \ + "vorr q0, q0, q1 \n" /* BGRA */ + +void I422ToARGB1555Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + ARGBTOARGB1555 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#define ARGBTOARGB4444 \ + "vshr.u8 d20, d20, #4 \n" /* B */ \ + "vbic.32 d21, d21, d4 \n" /* G */ \ + "vshr.u8 d22, d22, #4 \n" /* R */ \ + "vbic.32 d23, d23, d4 \n" /* A */ \ + "vorr d0, d20, d21 \n" /* BG */ \ + "vorr d1, d22, d23 \n" /* RA */ \ + "vzip.u8 d0, d1 \n" /* BGRA */ + +void I422ToARGB4444Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + ARGBTOARGB4444 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV400 + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB]"r"(&kUVToRB), // %3 + [kUVToG]"r"(&kUVToG), // %4 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void J400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + "vmov.u8 d23, #255 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d20", "d21", "d22", "d23" + ); +} + +void NV12ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READNV12 + YUV422TORGB + "subs %3, %3, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(2) + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&kUVToRB), // %4 + [kUVToG]"r"(&kUVToG), // %5 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void NV21ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READNV21 + YUV422TORGB + "subs %3, %3, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(2) + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&kUVToRB), // %4 + [kUVToG]"r"(&kUVToG), // %5 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void NV12ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READNV12 + YUV422TORGB + "subs %3, %3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&kUVToRB), // %4 + [kUVToG]"r"(&kUVToG), // %5 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void NV21ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READNV21 + YUV422TORGB + "subs %3, %3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&kUVToRB), // %4 + [kUVToG]"r"(&kUVToG), // %5 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void YUY2ToARGBRow_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUY2 + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB]"r"(&kUVToRB), // %3 + [kUVToG]"r"(&kUVToG), // %4 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void UYVYToARGBRow_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READUYVY + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB]"r"(&kUVToRB), // %3 + [kUVToG]"r"(&kUVToG), // %4 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store U + MEMACCESS(2) + "vst1.8 {q1}, [%2]! \n" // store V + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Reads 16 U's and V's and writes out 16 pairs of UV. +void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load U + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(2) + "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV + "bgt 1b \n" + : + "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. 
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + MEMACCESS(1) + "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SetRow writes 'count' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8* dst, uint8 v8, int count) { + asm volatile ( + "vdup.8 q0, %2 \n" // duplicate 16 bytes + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(count) // %1 + : "r"(v8) // %2 + : "cc", "memory", "q0" + ); +} + +// ARGBSetRow writes 'count' pixels using an 32 bit value repeated. +void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { + asm volatile ( + "vdup.u32 q0, %2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #4 \n" // 4 pixels per loop + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(count) // %1 + : "r"(v32) // %2 + : "cc", "memory", "q0" + ); +} + +void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #16 \n" // 16 pixels per loop. + "vrev64.8 q0, q0 \n" + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0" + ); +} + +void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %3, lsl #1 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %3, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // dst += 8 + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "r12", "q0" + ); +} + +void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2, lsl #2 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #4 \n" // 4 pixels per loop. + "vrev64.32 q0, q0 \n" + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0" + ); +} + +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d4, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d4, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +#define RGB565TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ + "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ + "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define ARGB1555TOARGB \ + "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ + "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ + "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ + "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ + "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ + "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ + "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ + "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ + "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ + "vorr.u8 q1, q1, q3 \n" /* R,A */ \ + "vorr.u8 q0, q0, q2 \n" /* B,G */ \ + +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ + "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ + "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define ARGB4444TOARGB \ + "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ + "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ + "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ + "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ + "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ + "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ + "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ + "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ + +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // store 8 U. + MEMACCESS(2) + "vst1.8 {d3}, [%2]! \n" // store 8 V. 
+ "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 U. + MEMACCESS(2) + "vst1.8 {d2}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // stride + src_yuy2 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 U. + MEMACCESS(3) + "vst1.8 {d3}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // stride + src_uyvy + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 U. + MEMACCESS(3) + "vst1.8 {d2}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(stride_uyvy), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + ); +} + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {q2}, [%3] \n" // shuffler + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void I422ToYUY2Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + MEMACCESS(1) + "vld1.8 {d1}, [%1]! \n" // load 8 Us + MEMACCESS(2) + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3" + ); +} + +void I422ToUYVYRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + MEMACCESS(1) + "vld1.8 {d0}, [%1]! \n" // load 8 Us + MEMACCESS(2) + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3" + ); +} + +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} + +void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int width) { + asm volatile ( + ".p2align 2 \n" + "vdup.32 d2, %2 \n" // dither4 + "1: \n" + MEMACCESS(1) + "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d20, d20, d2 \n" + "vqadd.u8 d21, d21, d2 \n" + "vqadd.u8 d22, d22, d2 \n" + ARGBTORGB565 + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11" + ); +} + +void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} + +void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, + int pix) { + asm volatile ( + "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} + +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + ); +} + +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + ); +} + +// 8x1 pixels. +void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient + "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient + "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient + "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient + "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R + "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned + + "vmull.u8 q3, d2, d24 \n" // R + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B + "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned + + "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" + ); +} + +// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + + "subs %3, %3, #16 \n" // 16 processed per loop. 
+ "vmul.s16 q8, q0, q10 \n" // B + "vmls.s16 q8, q1, q11 \n" // G + "vmls.s16 q8, q2, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + + "vmul.s16 q9, q2, q10 \n" // R + "vmls.s16 q9, q1, q14 \n" // G + "vmls.s16 q9, q0, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. +void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(0) + "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. + MEMACCESS(0) + "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. + "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts. + + "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts. + "vpadd.u16 d1, d8, d9 \n" // B + "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts. + "vpadd.u16 d3, d10, d11 \n" // G + "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts. + "vpadd.u16 d5, d12, d13 \n" // R + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %3, %3, #32 \n" // 32 processed per loop. + "vmul.s16 q8, q0, q10 \n" // B + "vmls.s16 q8, q1, q11 \n" // G + "vmls.s16 q8, q2, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q2, q10 \n" // R + "vmls.s16 q9, q1, q14 \n" // G + "vmls.s16 q9, q0, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 
+#define RGBTOUV(QB, QG, QR) \ + "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ + "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ + "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ + "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ + "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ + "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ + "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ + "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ + "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ + "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ + +// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. +void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// TODO(fbarchard): Subsample match C code. +void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. 
+ + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_bgra + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. + "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. + "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q1, q1, #1 \n" // 2x average + "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q3, q3, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q3, q2, q1) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(src_stride_bgra), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_abgr + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. 
+ + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_stride_abgr), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgba + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. + "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. + "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_stride_rgba), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgb24 + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + MEMACCESS(0) + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + MEMACCESS(1) + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. 
+ + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_stride_rgb24), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_raw + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + MEMACCESS(0) + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + MEMACCESS(1) + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_stride_raw), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. 
+ MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_stride_rgb565), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_stride_argb1555), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_stride_argb4444), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} + +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} + +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} + +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // R + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // R + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // B + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + asm volatile ( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! 
\n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" + ); +} + +// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "subs %3, #8 \n" + "blt 89f \n" + // Blend 8 pixels. + "8: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "bge 8b \n" + + "89: \n" + "adds %3, #8-1 \n" + "blt 99f \n" + + // Blend 1 pixels. + "1: \n" + MEMACCESS(0) + "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + MEMACCESS(1) + "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) + "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. + "bge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" + ); +} + +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + // Attenuate 8 pixels. + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a + "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 + "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 + "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q10", "q11", "q12" + ); +} + +// Quantize 8 ARGB pixels (32 bytes). +// dst = (dst * scale >> 16) * interval_size + interval_offset; +void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + asm volatile ( + "vdup.u16 q8, %2 \n" + "vshr.u16 q8, q8, #1 \n" // scale >>= 1 + "vdup.u16 q9, %3 \n" // interval multiply. + "vdup.u16 q10, %4 \n" // interval add + + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" // b (0 .. 255) + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" + "vqdmulh.s16 q0, q0, q8 \n" // b * scale + "vqdmulh.s16 q1, q1, q8 \n" // g + "vqdmulh.s16 q2, q2, q8 \n" // r + "vmul.u16 q0, q0, q9 \n" // b * interval_size + "vmul.u16 q1, q1, q9 \n" // g + "vmul.u16 q2, q2, q9 \n" // r + "vadd.u16 q0, q0, q10 \n" // b + interval_offset + "vadd.u16 q1, q1, q10 \n" // g + "vadd.u16 q2, q2, q10 \n" // r + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d2, q1 \n" + "vqmovn.u16 d4, q2 \n" + MEMACCESS(0) + "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" + ); +} + +// Shade 8 pixels at a time by specified value. +// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. +// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + asm volatile ( + "vdup.u32 q0, %3 \n" // duplicate scale value. + "vzip.u8 d0, d1 \n" // d0 aarrggbb. + "vshr.u16 q0, q0, #1 \n" // scale / 2. + + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q10, d20 \n" // b (0 .. 255) + "vmovl.u8 q11, d22 \n" + "vmovl.u8 q12, d24 \n" + "vmovl.u8 q13, d26 \n" + "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 + "vqrdmulh.s16 q11, q11, d0[1] \n" // g + "vqrdmulh.s16 q12, q12, d0[2] \n" // r + "vqrdmulh.s16 q13, q13, d0[3] \n" // a + "vqmovn.u16 d20, q10 \n" + "vqmovn.u16 d22, q11 \n" + "vqmovn.u16 d24, q12 \n" + "vqmovn.u16 d26, q13 \n" + MEMACCESS(1) + "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "q0", "q10", "q11", "q12", "q13" + ); +} + +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +// Similar to ARGBToYJ but stores ARGB. 
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+    "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+    "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+    ".p2align 2 \n"
+  "1: \n"
+    MEMACCESS(0)
+    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+    "subs %2, %2, #8 \n" // 8 processed per loop.
+    "vmull.u8 q2, d0, d24 \n" // B
+    "vmlal.u8 q2, d1, d25 \n" // G
+    "vmlal.u8 q2, d2, d26 \n" // R
+    "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
+    "vmov d1, d0 \n" // G
+    "vmov d2, d0 \n" // R
+    MEMACCESS(1)
+    "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+    "bgt 1b \n"
+  : "+r"(src_argb), // %0
+    "+r"(dst_argb), // %1
+    "+r"(width) // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  );
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8 d20, #17 \n" // BB coefficient
+    "vmov.u8 d21, #68 \n" // BG coefficient
+    "vmov.u8 d22, #35 \n" // BR coefficient
+    "vmov.u8 d24, #22 \n" // GB coefficient
+    "vmov.u8 d25, #88 \n" // GG coefficient
+    "vmov.u8 d26, #45 \n" // GR coefficient
+    "vmov.u8 d28, #24 \n" // RB coefficient
+    "vmov.u8 d29, #98 \n" // RG coefficient
+    "vmov.u8 d30, #50 \n" // RR coefficient
+    ".p2align 2 \n"
+  "1: \n"
+    MEMACCESS(0)
+    "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+    "subs %1, %1, #8 \n" // 8 processed per loop.
+    "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+    "vmlal.u8 q2, d1, d21 \n" // G
+    "vmlal.u8 q2, d2, d22 \n" // R
+    "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+    "vmlal.u8 q3, d1, d25 \n" // G
+    "vmlal.u8 q3, d2, d26 \n" // R
+    "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+    "vmlal.u8 q8, d1, d29 \n" // G
+    "vmlal.u8 q8, d2, d30 \n" // R
+    "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+    "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+    "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+    MEMACCESS(0)
+    "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+    "bgt 1b \n"
+  : "+r"(dst_argb), // %0
+    "+r"(width) // %1
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3",
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width) {
+  asm volatile (
+    MEMACCESS(3)
+    "vld1.8 {q2}, [%3] \n" // load color matrix (16 x s8).
+    "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+    "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+
+    ".p2align 2 \n"
+  "1: \n"
+    MEMACCESS(0)
+    "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+    "subs %2, %2, #8 \n" // 8 processed per loop.
+    "vmovl.u8 q8, d16 \n" // b (0 ..
255) 16 bit + "vmovl.u8 q9, d18 \n" // g + "vmovl.u8 q10, d20 \n" // r + "vmovl.u8 q11, d22 \n" // a + "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B + "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G + "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A + "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B + "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G + "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B + "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G + "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B + "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G + "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R + "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + MEMACCESS(1) + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. +#ifdef HAS_ARGBMULTIPLYROW_NEON +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) + "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q0, d0, d1 \n" // multiply B + "vmull.u8 q1, d2, d3 \n" // multiply G + "vmull.u8 q2, d4, d5 \n" // multiply R + "vmull.u8 q3, d6, d7 \n" // multiply A + "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B + "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G + "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R + "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} +#endif // HAS_ARGBMULTIPLYROW_NEON + +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! 
\n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 q0, q0, q2 \n" // add B, G + "vqadd.u8 q1, q1, q3 \n" // add R, A + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} + +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + asm volatile ( + // 16 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) + "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
+ "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0],%5 \n" // top + MEMACCESS(0) + "vld1.8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(1) + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + MEMACCESS(1) + "vld1.8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + MEMACCESS(2) + "vld1.8 {d2}, [%2],%5 \n" // bottom + MEMACCESS(2) + "vld1.8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + MEMACCESS(3) + "vst1.8 {d0}, [%3]! \n" // store 8 sobelx + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2), // %5 + "r"(6) // %6 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0],%4 \n" // left + MEMACCESS(1) + "vld1.8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + MEMACCESS(1) + "vld1.8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0],%5 \n" // right + MEMACCESS(1) + "vld1.8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1), // %4 + "r"(6) // %5 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/row_neon64.cc b/libs/libaom/src/third_party/libyuv/source/row_neon64.cc new file mode 100644 index 000000000..5d015454b --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/row_neon64.cc @@ -0,0 +1,3087 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. 
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// Read 8 Y, 4 U and 4 V from 422 +#define READYUV422 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ + MEMACCESS(1) \ + "ld1 {v1.s}[0], [%1], #4 \n" \ + MEMACCESS(2) \ + "ld1 {v1.s}[1], [%2], #4 \n" + +// Read 8 Y, 2 U and 2 V from 422 +#define READYUV411 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ + MEMACCESS(1) \ + "ld1 {v2.h}[0], [%1], #2 \n" \ + MEMACCESS(2) \ + "ld1 {v2.h}[1], [%2], #2 \n" \ + "zip1 v1.8b, v2.8b, v2.8b \n" + +// Read 8 Y, 8 U and 8 V from 444 +#define READYUV444 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ + MEMACCESS(1) \ + "ld1 {v1.d}[0], [%1], #8 \n" \ + MEMACCESS(2) \ + "ld1 {v1.d}[1], [%2], #8 \n" \ + "uaddlp v1.8h, v1.16b \n" \ + "rshrn v1.8b, v1.8h, #1 \n" + +// Read 8 Y, and set 4 U and 4 V to 128 +#define READYUV400 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "movi v1.8b , #128 \n" + +// Read 8 Y and 4 UV from NV12 +#define READNV12 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ + MEMACCESS(1) \ + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +// Read 8 Y and 4 VU from NV21 +#define READNV21 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ + MEMACCESS(1) \ + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v3.8b, v2.8b, v2.8b \n" \ + "uzp2 v1.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +// Read 8 YUY2 +#define READYUY2 \ + MEMACCESS(0) \ + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ + "uzp2 v3.8b, v1.8b, v1.8b \n" \ + "uzp1 v1.8b, v1.8b, v1.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +// Read 8 UYVY +#define READUYVY \ + MEMACCESS(0) \ + "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ + "orr v0.8b, v3.8b, v3.8b \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +#define YUV422TORGB_SETUP_REG \ + "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ + "ld1r {v31.4s}, [%[kYToRgb]] \n" \ + "movi v27.8h, #128 \n" \ + "movi v28.8h, #102 \n" \ + "movi v29.8h, #25 \n" \ + "movi v30.8h, #52 \n" + +#define YUV422TORGB(vR, vG, vB) \ + "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ + "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ + "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ + "ushll v0.4s, v0.4h, #0 \n" \ + "mul v3.4s, v3.4s, v31.4s \n" \ + "mul v0.4s, v0.4s, v31.4s \n" \ + "sqshrun v0.4h, v0.4s, #16 \n" \ + "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ + "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ + "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ + "uxtl v2.8h, v2.8b \n" \ + "uxtl v1.8h, v1.8b \n" /* Extract U */ \ + "mul v3.8h, v1.8h, v27.8h \n" \ + "mul v5.8h, v1.8h, v29.8h \n" \ + "mul v6.8h, v2.8h, v30.8h \n" \ + "mul v7.8h, v2.8h, v28.8h \n" \ + "sqadd v6.8h, v6.8h, v5.8h \n" \ + "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ + "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ + "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ + "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ + "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ + "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ + "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ + "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ + "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ + +// YUV to RGB conversion constants. +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ + +// U and V contributions to R,G,B. 
+#define UB -128 /* -min(128, round(2.018 * 64)) */ +#define UG 25 /* -round(-0.391 * 64) */ +#define VG 52 /* -round(-0.813 * 64) */ +#define VR -102 /* -round(1.596 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. +#define BB (UB * 128 - YGB) +#define BG (UG * 128 + VG * 128 - YGB) +#define BR (VR * 128 - YGB) + +static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 }; +static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 }; + +#undef YG +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef BB +#undef BG +#undef BR + +#define RGBTOUV_SETUP_REG \ + "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ + "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ + "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ + "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ + "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ + "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ + + +#ifdef HAS_I444TOARGBROW_NEON +void I444ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV444 + YUV422TORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I444TOARGBROW_NEON + +#ifdef HAS_I422TOARGBROW_NEON +void I422ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TOARGBROW_NEON + +#ifdef HAS_I411TOARGBROW_NEON +void I411ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV411 + YUV422TORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I411TOARGBROW_NEON + +#ifdef HAS_I422TOBGRAROW_NEON +void I422ToBGRARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v21, v22, v23) + "subs %w4, %w4, #8 \n" + "movi v20.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 
{v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_bgra), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TOBGRAROW_NEON + +#ifdef HAS_I422TOABGRROW_NEON +void I422ToABGRRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v20, v21, v22) + "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_abgr), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TOABGRROW_NEON + +#ifdef HAS_I422TORGBAROW_NEON +void I422ToRGBARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v23, v22, v21) + "subs %w4, %w4, #8 \n" + "movi v20.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TORGBAROW_NEON + +#ifdef HAS_I422TORGB24ROW_NEON +void I422ToRGB24Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + MEMACCESS(3) + "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TORGB24ROW_NEON + +#ifdef HAS_I422TORAWROW_NEON +void I422ToRAWRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v20, v21, v22) + "subs %w4, %w4, #8 \n" + MEMACCESS(3) + "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_raw), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TORAWROW_NEON + +#define ARGBTORGB565 \ + "shll v0.8h, v22.8b, #8 \n" /* R */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "sri v0.8h, v21.8h, #5 \n" /* RG */ \ + "sri v0.8h, v20.8h, #11 \n" /* RGB */ + +#ifdef 
HAS_I422TORGB565ROW_NEON
+void I422ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          int width) {
+  asm volatile (
+    YUV422TORGB_SETUP_REG
+  "1: \n"
+    READYUV422
+    YUV422TORGB(v22, v21, v20)
+    "subs %w4, %w4, #8 \n"
+    ARGBTORGB565
+    MEMACCESS(3)
+    "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+    "b.gt 1b \n"
+  : "+r"(src_y),  // %0
+    "+r"(src_u),  // %1
+    "+r"(src_v),  // %2
+    "+r"(dst_rgb565),  // %3
+    "+r"(width)  // %4
+  : [kUVBiasBGR]"r"(&kUVBiasBGR),
+    [kYToRgb]"r"(&kYToRgb)
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+    "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+#endif  // HAS_I422TORGB565ROW_NEON
+
+#define ARGBTOARGB1555 \
+    "shll v0.8h, v23.8b, #8 \n" /* A */ \
+    "shll v22.8h, v22.8b, #8 \n" /* R */ \
+    "shll v20.8h, v20.8b, #8 \n" /* B */ \
+    "shll v21.8h, v21.8b, #8 \n" /* G */ \
+    "sri v0.8h, v22.8h, #1 \n" /* AR */ \
+    "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
+    "sri v0.8h, v20.8h, #11 \n" /* ARGB */
+
+#ifdef HAS_I422TOARGB1555ROW_NEON
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            int width) {
+  asm volatile (
+    YUV422TORGB_SETUP_REG
+  "1: \n"
+    READYUV422
+    YUV422TORGB(v22, v21, v20)
+    "subs %w4, %w4, #8 \n"
+    "movi v23.8b, #255 \n"
+    ARGBTOARGB1555
+    MEMACCESS(3)
+    "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB1555.
+    "b.gt 1b \n"
+  : "+r"(src_y),  // %0
+    "+r"(src_u),  // %1
+    "+r"(src_v),  // %2
+    "+r"(dst_argb1555),  // %3
+    "+r"(width)  // %4
+  : [kUVBiasBGR]"r"(&kUVBiasBGR),
+    [kYToRgb]"r"(&kYToRgb)
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+    "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+#endif  // HAS_I422TOARGB1555ROW_NEON
+
+#define ARGBTOARGB4444 \
+    /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
+    "ushr v20.8b, v20.8b, #4 \n" /* B */ \
+    "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
+    "ushr v22.8b, v22.8b, #4 \n" /* R */ \
+    "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
+    "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
+    "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
+    "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
+
+#ifdef HAS_I422TOARGB4444ROW_NEON
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            int width) {
+  asm volatile (
+    YUV422TORGB_SETUP_REG
+    "movi v4.16b, #0x0f \n" // bits to clear with bic.
+  "1: \n"
+    READYUV422
+    YUV422TORGB(v22, v21, v20)
+    "subs %w4, %w4, #8 \n"
+    "movi v23.8b, #255 \n"
+    ARGBTOARGB4444
+    MEMACCESS(3)
+    "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
+ "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TOARGB4444ROW_NEON + +#ifdef HAS_I400TOARGBROW_NEON +void I400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + int64 width64 = (int64)(width); + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV400 + YUV422TORGB(v22, v21, v20) + "subs %w2, %w2, #8 \n" + "movi v23.8b, #255 \n" + MEMACCESS(1) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width64) // %2 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I400TOARGBROW_NEON + +#ifdef HAS_J400TOARGBROW_NEON +void J400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + "movi v23.8b, #255 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v20.8b}, [%0], #8 \n" + "orr v21.8b, v20.8b, v20.8b \n" + "orr v22.8b, v20.8b, v20.8b \n" + "subs %w2, %w2, #8 \n" + MEMACCESS(1) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v20", "v21", "v22", "v23" + ); +} +#endif // HAS_J400TOARGBROW_NEON + +#ifdef HAS_NV12TOARGBROW_NEON +void NV12ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READNV12 + YUV422TORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + "movi v23.8b, #255 \n" + MEMACCESS(2) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_NV12TOARGBROW_NEON + +#ifdef HAS_NV21TOARGBROW_NEON +void NV21ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READNV21 + YUV422TORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + "movi v23.8b, #255 \n" + MEMACCESS(2) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_NV21TOARGBROW_NEON + +#ifdef HAS_NV12TORGB565ROW_NEON +void NV12ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READNV12 + YUV422TORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. 
+ "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_NV12TORGB565ROW_NEON + +#ifdef HAS_NV21TORGB565ROW_NEON +void NV21ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READNV21 + YUV422TORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_NV21TORGB565ROW_NEON + +#ifdef HAS_YUY2TOARGBROW_NEON +void YUY2ToARGBRow_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + int64 width64 = (int64)(width); + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUY2 + YUV422TORGB(v22, v21, v20) + "subs %w2, %w2, #8 \n" + "movi v23.8b, #255 \n" + MEMACCESS(1) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width64) // %2 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_YUY2TOARGBROW_NEON + +#ifdef HAS_UYVYTOARGBROW_NEON +void UYVYToARGBRow_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + int64 width64 = (int64)(width); + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READUYVY + YUV422TORGB(v22, v21, v20) + "subs %w2, %w2, #8 \n" + "movi v23.8b, #255 \n" + MEMACCESS(1) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width64) // %2 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_UYVYTOARGBROW_NEON + +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +#ifdef HAS_SPLITUVROW_NEON +void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV + "subs %w3, %w3, #16 \n" // 16 processed per loop + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store U + MEMACCESS(2) + "st1 {v1.16b}, [%2], #16 \n" // store V + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} +#endif // HAS_SPLITUVROW_NEON + +// Reads 16 U's and V's and writes out 16 pairs of UV. 
+#ifdef HAS_MERGEUVROW_NEON
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ MEMACCESS(2)
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
+ :
+ "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+#endif // HAS_MERGEUVROW_NEON
+
+// Copies multiples of 32 bytes. vld4.8 allows unaligned access and is fastest on A15.
+#ifdef HAS_COPYROW_NEON
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
+ "subs %w2, %w2, #32 \n" // 32 processed per loop
+ MEMACCESS(1)
+ "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+#endif // HAS_COPYROW_NEON
+
+// SetRow writes 'count' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
+ asm volatile (
+ "dup v0.16b, %w2 \n" // duplicate 16 bytes
+ "1: \n"
+ "subs %w1, %w1, #16 \n" // 16 bytes per loop
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v8) // %2
+ : "cc", "memory", "v0"
+ );
+}
+
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile (
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %w1, %w1, #4 \n" // 4 ints per loop
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "v0"
+ );
+}
+
+#ifdef HAS_MIRRORROW_NEON
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ int64 width64 = (int64) width;
+ asm volatile (
+ // Start at end of source row.
+ "add %0, %0, %2 \n"
+ "sub %0, %0, #16 \n"
+
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
+ "subs %2, %2, #16 \n" // 16 pixels per loop.
+ "rev64 v0.16b, v0.16b \n"
+ MEMACCESS(1)
+ "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
+ MEMACCESS(1)
+ "st1 {v0.D}[0], [%1], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width64) // %2
+ : "r"((ptrdiff_t)-16) // %3
+ : "cc", "memory", "v0"
+ );
+}
+#endif // HAS_MIRRORROW_NEON
+
+#ifdef HAS_MIRRORUVROW_NEON
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ int64 width64 = (int64) width;
+ asm volatile (
+ // Start at end of source row.
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, %0, #16 \n"
+
+ "1: \n"
+ MEMACCESS(0)
+ "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
+ "subs %3, %3, #8 \n" // 8 pixels per loop.
+ "rev64 v0.8b, v0.8b \n"
+ "rev64 v1.8b, v1.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // dst += 8
+ MEMACCESS(2)
+ "st1 {v1.8b}, [%2], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width64) // %3
+ : "r"((ptrdiff_t)-16) // %4
+ : "cc", "memory", "v0", "v1"
+ );
+}
+#endif // HAS_MIRRORUVROW_NEON
+
+#ifdef HAS_ARGBMIRRORROW_NEON
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ int64 width64 = (int64) width;
+ asm volatile (
+ // Start at end of source row.
+ "add %0, %0, %2, lsl #2 \n" + "sub %0, %0, #16 \n" + + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %2, %2, #4 \n" // 4 pixels per loop. + "rev64 v0.4s, v0.4s \n" + MEMACCESS(1) + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + MEMACCESS(1) + "st1 {v0.D}[0], [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width64) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0" + ); +} +#endif // HAS_ARGBMIRRORROW_NEON + +#ifdef HAS_RGB24TOARGBROW_NEON +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { + asm volatile ( + "movi v4.8b, #255 \n" // Alpha + "1: \n" + MEMACCESS(0) + "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} +#endif // HAS_RGB24TOARGBROW_NEON + +#ifdef HAS_RAWTOARGBROW_NEON +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile ( + "movi v5.8b, #255 \n" // Alpha + "1: \n" + MEMACCESS(0) + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + MEMACCESS(1) + "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} +#endif // HAS_RAWTOARGBROW_NEON + +#define RGB565TOARGB \ + "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ + "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ + "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ + "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ + "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ + "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ + "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ + "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ + "dup v2.2D, v0.D[1] \n" /* R */ + +#ifdef HAS_RGB565TOARGBROW_NEON +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { + asm volatile ( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ RGB565TOARGB + MEMACCESS(1) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List + ); +} +#endif // HAS_RGB565TOARGBROW_NEON + +#define ARGB1555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ + \ + "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ + "xtn2 v3.16b, v2.8h \n" \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ + "dup v1.2D, v0.D[1] \n" \ + "dup v3.2D, v2.D[1] \n" + +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ + "dup v1.2D, v0.D[1] \n" /* G */ \ + +#ifdef HAS_ARGB1555TOARGBROW_NEON +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + asm volatile ( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + MEMACCESS(1) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_ARGB1555TOARGBROW_NEON + +#define ARGB4444TOARGB \ + "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ + "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ + "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ + "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ + "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ + "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ + "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ + "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ + "dup v0.2D, v2.D[1] \n" \ + "dup v1.2D, v3.D[1] \n" + +#ifdef HAS_ARGB4444TOARGBROW_NEON +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ ARGB4444TOARGB + MEMACCESS(1) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} +#endif // HAS_ARGB4444TOARGBROW_NEON + +#ifdef HAS_ARGBTORGB24ROW_NEON +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} +#endif // HAS_ARGBTORGB24ROW_NEON + +#ifdef HAS_ARGBTORAWROW_NEON +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v4.8b, v2.8b, v2.8b \n" // mov g + "orr v5.8b, v1.8b, v1.8b \n" // mov b + MEMACCESS(1) + "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} +#endif // HAS_ARGBTORAWROW_NEON + +#ifdef HAS_YUY2TOYROW_NEON +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} +#endif // HAS_YUY2TOYROW_NEON + +#ifdef HAS_UYVYTOYROW_NEON +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} +#endif // HAS_UYVYTOYROW_NEON + +#ifdef HAS_YUY2TOUV422ROW_NEON +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "st1 {v1.8b}, [%1], #8 \n" // store 8 U. + MEMACCESS(2) + "st1 {v3.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_YUY2TOUV422ROW_NEON + +#ifdef HAS_UYVYTOUV422ROW_NEON +void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 U. + MEMACCESS(2) + "st1 {v2.8b}, [%2], #8 \n" // store 8 V. 
+ "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_UYVYTOUV422ROW_NEON + +#ifdef HAS_YUY2TOUVROW_NEON +void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_yuy2b = src_yuy2 + stride_yuy2; + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 U. + MEMACCESS(3) + "st1 {v3.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(src_yuy2b), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", + "v5", "v6", "v7" // Clobber List + ); +} +#endif // HAS_YUY2TOUVROW_NEON + +#ifdef HAS_UYVYTOUVROW_NEON +void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_uyvyb = src_uyvy + stride_uyvy; + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 U. + MEMACCESS(3) + "st1 {v2.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(src_uyvyb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", + "v5", "v6", "v7" // Clobber List + ); +} +#endif // HAS_UYVYTOUVROW_NEON + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +#ifdef HAS_ARGBSHUFFLEROW_NEON +void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + MEMACCESS(3) + "ld1 {v2.16b}, [%3] \n" // shuffler + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. + "subs %w2, %w2, #4 \n" // 4 processed per loop + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels + MEMACCESS(1) + "st1 {v1.16b}, [%1], #16 \n" // store 4. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} +#endif // HAS_ARGBSHUFFLEROW_NEON + +#ifdef HAS_I422TOYUY2ROW_NEON +void I422ToYUY2Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys + "orr v2.8b, v1.8b, v1.8b \n" + MEMACCESS(1) + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us + MEMACCESS(2) + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + MEMACCESS(3) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 
+ "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3" + ); +} +#endif // HAS_I422TOYUY2ROW_NEON + +#ifdef HAS_I422TOUYVYROW_NEON +void I422ToUYVYRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys + "orr v3.8b, v2.8b, v2.8b \n" + MEMACCESS(1) + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us + MEMACCESS(2) + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + MEMACCESS(3) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3" + ); +} +#endif // HAS_I422TOUYVYROW_NEON + +#ifdef HAS_ARGBTORGB565ROW_NEON +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23" + ); +} +#endif // HAS_ARGBTORGB565ROW_NEON + +#ifdef HAS_ARGBTORGB565DITHERROW_NEON +void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int width) { + asm volatile ( + "dup v1.4s, %w2 \n" // dither4 + "1: \n" + MEMACCESS(1) + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v20.8b, v20.8b, v1.8b \n" + "uqadd v21.8b, v21.8b, v1.8b \n" + "uqadd v22.8b, v22.8b, v1.8b \n" + ARGBTORGB565 + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" + ); +} +#endif // HAS_ARGBTORGB565ROW_NEON + +#ifdef HAS_ARGBTOARGB1555ROW_NEON +void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, + int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23" + ); +} +#endif // HAS_ARGBTOARGB1555ROW_NEON + +#ifdef HAS_ARGBTOARGB4444ROW_NEON +void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, + int pix) { + asm volatile ( + "movi v4.16b, #0x0f \n" // bits to clear with vbic. + "1: \n" + MEMACCESS(0) + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. 
+ "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" + ); +} +#endif // HAS_ARGBTOARGB4444ROW_NEON + +#ifdef HAS_ARGBTOYROW_NEON +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBTOYROW_NEON + +#ifdef HAS_ARGBTOYJROW_NEON +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #15 \n" // B * 0.11400 coefficient + "movi v5.8b, #75 \n" // G * 0.58700 coefficient + "movi v6.8b, #38 \n" // R * 0.29900 coefficient + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" + ); +} +#endif // HAS_ARGBTOYJROW_NEON + +// 8x1 pixels. +#ifdef HAS_ARGBTOUV444ROW_NEON +void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient + "movi v25.8b, #74 \n" // UG -0.5781 coefficient + "movi v26.8b, #38 \n" // UR -0.2969 coefficient + "movi v27.8b, #18 \n" // VB -0.1406 coefficient + "movi v28.8b, #94 \n" // VG -0.7344 coefficient + "movi v29.16b,#0x80 \n" // 128.5 + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlsl v4.8h, v1.8b, v25.8b \n" // G + "umlsl v4.8h, v2.8b, v26.8b \n" // R + "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned + + "umull v3.8h, v2.8b, v24.8b \n" // R + "umlsl v3.8h, v1.8b, v28.8b \n" // G + "umlsl v3.8h, v0.8b, v27.8b \n" // B + "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned + + "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", + "v24", "v25", "v26", "v27", "v28", "v29" + ); +} +#endif // HAS_ARGBTOUV444ROW_NEON + +// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 
+#ifdef HAS_ARGBTOUV422ROW_NEON +void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "mul v3.8h, v0.8h, v20.8h \n" // B + "mls v3.8h, v1.8h, v21.8h \n" // G + "mls v3.8h, v2.8h, v22.8h \n" // R + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + + "mul v4.8h, v2.8h, v20.8h \n" // R + "mls v4.8h, v1.8h, v24.8h \n" // G + "mls v4.8h, v0.8h, v23.8h \n" // B + "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned + + "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBTOUV422ROW_NEON + +// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. +#ifdef HAS_ARGBTOUV411ROW_NEON +void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(0) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16. + "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts. + "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts. + "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w3, %w3, #32 \n" // 32 processed per loop. + "mul v3.8h, v0.8h, v20.8h \n" // B + "mls v3.8h, v1.8h, v21.8h \n" // G + "mls v3.8h, v2.8h, v22.8h \n" // R + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "mul v4.8h, v2.8h, v20.8h \n" // R + "mls v4.8h, v1.8h, v24.8h \n" // G + "mls v4.8h, v0.8h, v23.8h \n" // B + "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBTOUV411ROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 
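+// In the two-row functions below, uaddlp sums horizontal pixel pairs, uadalp
+// accumulates the matching pairs from the next row, and urshr #1 rounds the
+// 2x2 sum back to the 9-bit scale of a 2-pixel sum, so the same halved
+// coefficients apply. A sketch of the equivalent scalar averaging:
+//   b = (b00 + b01 + b10 + b11 + 1) >> 1;  // 2x2 block, 0..510
+// which then feeds the RGBTOUV formula exactly as in the 16x1 case above.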
+#define RGBTOUV(QB, QG, QR) \ + "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ + "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ + "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ + "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ + "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ + "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ + "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ + "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ + "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ + "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ + +// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. +// TODO(fbarchard): consider ptrdiff_t for all strides. + +#ifdef HAS_ARGBTOUVROW_NEON +void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_argb_1 = src_argb + src_stride_argb; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + + MEMACCESS(1) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBTOUVROW_NEON + +// TODO(fbarchard): Subsample match C code. +#ifdef HAS_ARGBTOUVJROW_NEON +void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_argb_1 = src_argb + src_stride_argb; + asm volatile ( + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBTOUVJROW_NEON + +#ifdef HAS_BGRATOUVROW_NEON +void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_bgra_1 = src_bgra + src_stride_bgra; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more + "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v3.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(src_bgra_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_BGRATOUVROW_NEON + +#ifdef HAS_ABGRTOUVROW_NEON +void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_abgr_1 = src_abgr + src_stride_abgr; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v3.8h, #1 \n" // 2x average + "urshr v2.8h, v2.8h, #1 \n" + "urshr v1.8h, v1.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v2.8h, v1.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_abgr_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ABGRTOUVROW_NEON + +#ifdef HAS_RGBATOUVROW_NEON +void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_rgba_1 = src_rgba + src_stride_rgba; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. 
+ "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_rgba_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_RGBATOUVROW_NEON + +#ifdef HAS_RGB24TOUVROW_NEON +void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_rgb24_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_RGB24TOUVROW_NEON + +#ifdef HAS_RAWTOUVROW_NEON +void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_raw_1 = src_raw + src_stride_raw; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. + "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels + "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v2.8h, v2.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v0.8h, v0.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v2.8h, v1.8h, v0.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_raw_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_RAWTOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 
+#ifdef HAS_RGB565TOUVROW_NEON +void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; + asm volatile ( + "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 + "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 + "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 + "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 + "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 + "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v17.D[0] \n" + "ins v18.D[1], v19.D[0] \n" + "ins v20.D[1], v21.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v18.8h, #1 \n" + "urshr v6.8h, v20.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v16.8h, v4.8h, v22.8h \n" // B + "mls v16.8h, v5.8h, v23.8h \n" // G + "mls v16.8h, v6.8h, v24.8h \n" // R + "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned + "mul v17.8h, v6.8h, v22.8h \n" // R + "mls v17.8h, v5.8h, v26.8h \n" // G + "mls v17.8h, v4.8h, v25.8h \n" // B + "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_rgb565_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27" + ); +} +#endif // HAS_RGB565TOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGB1555TOUVROW_NEON +void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_argb1555_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28" + ); +} +#endif // HAS_ARGB1555TOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGB4444TOUVROW_NEON +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. 
+ "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_argb4444_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28" + + ); +} +#endif // HAS_ARGB4444TOUVROW_NEON + +#ifdef HAS_RGB565TOYROW_NEON +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { + asm volatile ( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", + "v24", "v25", "v26", "v27" + ); +} +#endif // HAS_RGB565TOYROW_NEON + +#ifdef HAS_ARGB1555TOYROW_NEON +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGB1555TOYROW_NEON + +#ifdef HAS_ARGB4444TOYROW_NEON +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { + asm volatile ( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" + ); +} +#endif // HAS_ARGB4444TOYROW_NEON + +#ifdef HAS_BGRATOYROW_NEON +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // R + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_BGRATOYROW_NEON + +#ifdef HAS_ABGRTOYROW_NEON +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // R + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_ABGRTOYROW_NEON + +#ifdef HAS_RGBATOYROW_NEON +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // B + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_RGBATOYROW_NEON + +#ifdef HAS_RGB24TOYROW_NEON +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_RGB24TOYROW_NEON + +#ifdef HAS_RAWTOYROW_NEON +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_RAWTOYROW_NEON + +// Bilinear filter 16x2 -> 16x1 +#ifdef HAS_INTERPOLATEROW_NEON +void InterpolateRow_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + asm volatile ( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #64 \n" + "b.eq 75f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + "cmp %w4, #192 \n" + "b.eq 25f \n" + + "dup v5.16b, %w4 \n" + "dup v4.16b, %w5 \n" + // General purpose row blend. + "1: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v2.8h, v0.8b, v4.8b \n" + "umull2 v3.8h, v0.16b, v4.16b \n" + "umlal v2.8h, v1.8b, v5.8b \n" + "umlal2 v3.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v2.8h, #8 \n" + "rshrn2 v0.16b, v3.8h, #8 \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v0.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. 
+ "100: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction), // %4 + "+r"(y0_fraction) // %5 + : + : "cc", "memory", "v0", "v1", "v3", "v4", "v5" + ); +} +#endif // HAS_INTERPOLATEROW_NEON + +// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +#ifdef HAS_ARGBBLENDROW_NEON +void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "subs %w3, %w3, #8 \n" + "b.lt 89f \n" + // Blend 8 pixels. + "8: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.ge 8b \n" + + "89: \n" + "adds %w3, %w3, #8-1 \n" + "b.lt 99f \n" + + // Blend 1 pixels. + "1: \n" + MEMACCESS(0) + "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. + MEMACCESS(1) + "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. + "subs %w3, %w3, #1 \n" // 1 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + MEMACCESS(2) + "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. + "b.ge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18" + ); +} +#endif // HAS_ARGBBLENDROW_NEON + +// Attenuate 8 pixels at a time. +#ifdef HAS_ARGBATTENUATEROW_NEON +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + // Attenuate 8 pixels. + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + MEMACCESS(1) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" + ); +} +#endif // HAS_ARGBATTENUATEROW_NEON + +// Quantize 8 ARGB pixels (32 bytes). +// dst = (dst * scale >> 16) * interval_size + interval_offset; +#ifdef HAS_ARGBQUANTIZEROW_NEON +void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + asm volatile ( + "dup v4.8h, %w2 \n" + "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 + "dup v5.8h, %w3 \n" // interval multiply. + "dup v6.8h, %w4 \n" // interval add + + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 255) + "uxtl v1.8h, v1.8b \n" + "uxtl v2.8h, v2.8b \n" + "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale + "sqdmulh v1.8h, v1.8h, v4.8h \n" // g + "sqdmulh v2.8h, v2.8h, v4.8h \n" // r + "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size + "mul v1.8h, v1.8h, v5.8h \n" // g + "mul v2.8h, v2.8h, v5.8h \n" // r + "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset + "add v1.8h, v1.8h, v6.8h \n" // g + "add v2.8h, v2.8h, v6.8h \n" // r + "uqxtn v0.8b, v0.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v2.8b, v2.8h \n" + MEMACCESS(0) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" + ); +} +#endif // HAS_ARGBQUANTIZEROW_NEON + +// Shade 8 pixels at a time by specified value. +// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. +// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +#ifdef HAS_ARGBSHADEROW_NEON +void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + asm volatile ( + "dup v0.4s, %w3 \n" // duplicate scale value. + "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. + + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g + "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r + "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" + MEMACCESS(1) + "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "v0", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBSHADEROW_NEON + +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +// Similar to ARGBToYJ but stores ARGB. 
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+#ifdef HAS_ARGBGRAYROW_NEON
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "movi v24.8b, #15 \n" // B * 0.11400 coefficient
+    "movi v25.8b, #75 \n" // G * 0.58700 coefficient
+    "movi v26.8b, #38 \n" // R * 0.29900 coefficient
+    "1: \n"
+    MEMACCESS(0)
+    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+    "subs %w2, %w2, #8 \n" // 8 processed per loop.
+    "umull v4.8h, v0.8b, v24.8b \n" // B
+    "umlal v4.8h, v1.8b, v25.8b \n" // G
+    "umlal v4.8h, v2.8b, v26.8b \n" // R
+    "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
+    "orr v1.8b, v0.8b, v0.8b \n" // G
+    "orr v2.8b, v0.8b, v0.8b \n" // R
+    MEMACCESS(1)
+    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+    "b.gt 1b \n"
+  : "+r"(src_argb), // %0
+    "+r"(dst_argb), // %1
+    "+r"(width) // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
+  );
+}
+#endif // HAS_ARGBGRAYROW_NEON
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+
+#ifdef HAS_ARGBSEPIAROW_NEON
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+  asm volatile (
+    "movi v20.8b, #17 \n" // BB coefficient
+    "movi v21.8b, #68 \n" // BG coefficient
+    "movi v22.8b, #35 \n" // BR coefficient
+    "movi v24.8b, #22 \n" // GB coefficient
+    "movi v25.8b, #88 \n" // GG coefficient
+    "movi v26.8b, #45 \n" // GR coefficient
+    "movi v28.8b, #24 \n" // RB coefficient
+    "movi v29.8b, #98 \n" // RG coefficient
+    "movi v30.8b, #50 \n" // RR coefficient
+    "1: \n"
+    MEMACCESS(0)
+    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
+    "subs %w1, %w1, #8 \n" // 8 processed per loop.
+    "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+    "umlal v4.8h, v1.8b, v21.8b \n" // G
+    "umlal v4.8h, v2.8b, v22.8b \n" // R
+    "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+    "umlal v5.8h, v1.8b, v25.8b \n" // G
+    "umlal v5.8h, v2.8b, v26.8b \n" // R
+    "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+    "umlal v6.8h, v1.8b, v29.8b \n" // G
+    "umlal v6.8h, v2.8b, v30.8b \n" // R
+    "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
+    "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+    "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+    MEMACCESS(0)
+    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+    "b.gt 1b \n"
+  : "+r"(dst_argb), // %0
+    "+r"(width) // %1
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
+  );
+}
+#endif // HAS_ARGBSEPIAROW_NEON
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+#ifdef HAS_ARGBCOLORMATRIXROW_NEON
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width) {
+  asm volatile (
+    MEMACCESS(3)
+    "ld1 {v2.16b}, [%3] \n" // load 16 bytes of color matrix.
+    "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+    "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
+
+    "1: \n"
+    MEMACCESS(0)
+    "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
+    "subs %w2, %w2, #8 \n" // 8 processed per loop.
+    "uxtl v16.8h, v16.8b \n" // b (0 ..
255) 16 bit + "uxtl v17.8h, v17.8b \n" // g + "uxtl v18.8h, v18.8b \n" // r + "uxtl v19.8h, v19.8b \n" // a + "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B + "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G + "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R + "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A + "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B + "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G + "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R + "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B + "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G + "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R + "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B + "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G + "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R + "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B + "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G + "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R + "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A + MEMACCESS(1) + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", + "v18", "v19", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBCOLORMATRIXROW_NEON + +// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +#ifdef HAS_ARGBMULTIPLYROW_NEON +void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "umull v2.8h, v2.8b, v6.8b \n" // multiply R + "umull v3.8h, v3.8b, v7.8b \n" // multiply A + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBMULTIPLYROW_NEON + +// Add 2 rows of ARGB pixels together, 8 pixels at a time. 
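+// A plain C sketch of the same saturating add (illustrative only; the names
+// below are hypothetical helpers, not libyuv entry points):
+static uint8 AddClamp255_Sketch(int v) {
+  return (uint8)(v > 255 ? 255 : v);
+}
+static void ARGBAddRow_C_Sketch(const uint8* src_argb0, const uint8* src_argb1,
+                                uint8* dst_argb, int width) {
+  int i;
+  // 4 bytes (B, G, R, A) per pixel; every byte saturates independently,
+  // matching the uqadd used by the NEON loop below.
+  for (i = 0; i < width * 4; ++i) {
+    dst_argb[i] = AddClamp255_Sketch(src_argb0[i] + src_argb1[i]);
+  }
+}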
+#ifdef HAS_ARGBADDROW_NEON +void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v4.8b \n" + "uqadd v1.8b, v1.8b, v5.8b \n" + "uqadd v2.8b, v2.8b, v6.8b \n" + "uqadd v3.8b, v3.8b, v7.8b \n" + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBADDROW_NEON + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +#ifdef HAS_ARGBSUBTRACTROW_NEON +void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqsub v0.8b, v0.8b, v4.8b \n" + "uqsub v1.8b, v1.8b, v5.8b \n" + "uqsub v2.8b, v2.8b, v6.8b \n" + "uqsub v3.8b, v3.8b, v7.8b \n" + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBSUBTRACTROW_NEON + +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +#ifdef HAS_SOBELROW_NEON +void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. + MEMACCESS(1) + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v1.8b \n" // add + "orr v1.8b, v0.8b, v0.8b \n" + "orr v2.8b, v0.8b, v0.8b \n" + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" + ); +} +#endif // HAS_SOBELROW_NEON + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +#ifdef HAS_SOBELTOPLANEROW_NEON +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + asm volatile ( + // 16 pixel loop. + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "uqadd v0.16b, v0.16b, v1.16b \n" // add + MEMACCESS(2) + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1" + ); +} +#endif // HAS_SOBELTOPLANEROW_NEON + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. 
+// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +#ifdef HAS_SOBELXYROW_NEON +void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. + MEMACCESS(1) + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v1.8b, v0.8b, v2.8b \n" // add + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" + ); +} +#endif // HAS_SOBELXYROW_NEON + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +#ifdef HAS_SOBELXROW_NEON +void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0],%5 \n" // top + MEMACCESS(0) + "ld1 {v1.8b}, [%0],%6 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + MEMACCESS(1) + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 + MEMACCESS(1) + "ld1 {v3.8b}, [%1],%6 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + MEMACCESS(2) + "ld1 {v2.8b}, [%2],%5 \n" // bottom + MEMACCESS(2) + "ld1 {v3.8b}, [%2],%6 \n" + "subs %w4, %w4, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + MEMACCESS(3) + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2LL), // %5 + "r"(6LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_SOBELXROW_NEON + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +#ifdef HAS_SOBELYROW_NEON +void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0],%4 \n" // left + MEMACCESS(1) + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 + MEMACCESS(1) + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0],%5 \n" // right + MEMACCESS(1) + "ld1 {v3.8b}, [%1],%5 \n" + "subs %w3, %w3, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1LL), // %4 + "r"(6LL) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_SOBELYROW_NEON +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/row_win.cc b/libs/libaom/src/third_party/libyuv/source/row_win.cc new file mode 100644 index 000000000..71be268b4 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/row_win.cc @@ -0,0 +1,6331 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
+    defined(_MSC_VER) && !defined(__clang__)
+#include <emmintrin.h>
+#include <tmmintrin.h>  // For _mm_maddubs_epi16
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \
+    defined(_MSC_VER) && !defined(__clang__)
+
+struct YuvConstants {
+  lvec8 kUVToB; // 0
+  lvec8 kUVToG; // 32
+  lvec8 kUVToR; // 64
+  lvec16 kUVBiasB; // 96
+  lvec16 kUVBiasG; // 128
+  lvec16 kUVBiasR; // 160
+  lvec16 kYToRgb; // 192
+};
+
+// BT.601 YUV to RGB reference
+// R = (Y - 16) * 1.164 - V * -1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+// BT601 constants for YUV to RGB.
+static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
+  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+// BT601 constants for NV21 where chroma plane is VU instead of UV.
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// JPEG YUV to RGB reference
+// * R = Y - V * -1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y - U * -1.77200
+
+// Y contribution to R,G,B. Scale and bias.
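+// (These constants are applied per pixel roughly as
+//    y1 = (y * 0x0101 * YGJ) >> 16
+//    B  = clamp((BBJ - (u * UBJ) + y1) >> 6), and similarly for G and R;
+//  the same fixed-point scheme as the BT.601 path above. This is a sketch of
+//  the math, not a verbatim copy of the C reference code.)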
+// TODO(fbarchard): Consider moving constants into a common header. +#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ +#define YGBJ 32 /* 64 / 2 */ + +// U and V contributions to R,G,B. +#define UBJ -113 /* round(-1.77200 * 64) */ +#define UGJ 22 /* round(0.34414 * 64) */ +#define VGJ 46 /* round(0.71414 * 64) */ +#define VRJ -90 /* round(-1.40200 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. +#define BBJ (UBJ * 128 + YGBJ) +#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ) +#define BRJ (VRJ * 128 + YGBJ) + +// JPEG constants for YUV to RGB. +static YuvConstants SIMD_ALIGNED(kYuvJConstants) = { + { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, + UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 }, + { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, + UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, + UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, + UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ }, + { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, + 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ }, + { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, + BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ }, + { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, + BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ }, + { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, + BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ }, + { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, + YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ } +}; + +#undef YGJ +#undef YGBJ +#undef UBJ +#undef UGJ +#undef VGJ +#undef VRJ +#undef BBJ +#undef BGJ +#undef BRJ + +// 64 bit +#if defined(_M_X64) +#if defined(HAS_I422TOARGBROW_SSSE3) +void I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __m128i xmm0, xmm1, xmm2, xmm3; + const __m128i xmm5 = _mm_set1_epi8(-1); + const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + + while (width > 0) { + xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); + xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); + xmm1 = _mm_loadu_si128(&xmm0); + xmm2 = _mm_loadu_si128(&xmm0); + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB); + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG); + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR); + xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0); + xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1); + xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2); + xmm3 = _mm_loadl_epi64((__m128i*)y_buf); + xmm3 = _mm_unpacklo_epi8(xmm3, xmm3); + xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb); + xmm0 = _mm_adds_epi16(xmm0, xmm3); + xmm1 = _mm_adds_epi16(xmm1, xmm3); + xmm2 = _mm_adds_epi16(xmm2, xmm3); + xmm0 = _mm_srai_epi16(xmm0, 6); + xmm1 = _mm_srai_epi16(xmm1, 6); + xmm2 = _mm_srai_epi16(xmm2, 6); + xmm0 = _mm_packus_epi16(xmm0, xmm0); + xmm1 = _mm_packus_epi16(xmm1, xmm1); + xmm2 = _mm_packus_epi16(xmm2, xmm2); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); + xmm1 = _mm_loadu_si128(&xmm0); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); + + _mm_storeu_si128((__m128i *)dst_argb, xmm0); + _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); + + y_buf += 8; + u_buf += 4; + dst_argb += 32; + width -= 8; + } +} +#endif +// 32 bit +#else // defined(_M_X64) +#ifdef HAS_ARGBTOYROW_SSSE3 + +// Constants for ARGB. 
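+// (Each vec8 below is a pmaddubsw operand: per-byte weights for one B,G,R,A
+// pixel, repeated four times. For example, kARGBToY computes
+// Y = ((13 * B + 65 * G + 33 * R) >> 7) + 16 once kAddY16 is added.)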
+static const vec8 kARGBToY = { + 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 +}; + +// JPeg full range. +static const vec8 kARGBToYJ = { + 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 +}; + +static const vec8 kARGBToU = { + 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 +}; + +static const vec8 kARGBToUJ = { + 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 +}; + +static const vec8 kARGBToV = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, +}; + +static const vec8 kARGBToVJ = { + -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 +}; + +// vpshufb for vphaddw + vpackuswb packed to shorts. +static const lvec8 kShufARGBToUV_AVX = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +}; + +// Constants for BGRA. +static const vec8 kBGRAToY = { + 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 +}; + +static const vec8 kBGRAToU = { + 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 +}; + +static const vec8 kBGRAToV = { + 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 +}; + +// Constants for ABGR. +static const vec8 kABGRToY = { + 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 +}; + +static const vec8 kABGRToU = { + -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 +}; + +static const vec8 kABGRToV = { + 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 +}; + +// Constants for RGBA. +static const vec8 kRGBAToY = { + 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 +}; + +static const vec8 kRGBAToU = { + 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 +}; + +static const vec8 kRGBAToV = { + 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 +}; + +static const uvec8 kAddY16 = { + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u +}; + +// 7 bit fixed point 0.5. +static const vec16 kAddYJ64 = { + 64, 64, 64, 64, 64, 64, 64, 64 +}; + +static const uvec8 kAddUV128 = { + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; + +static const uvec16 kAddUVJ128 = { + 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u +}; + +// Shuffle table for converting RGB24 to ARGB. +static const uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u +}; + +// Shuffle table for converting RAW to ARGB. +static const uvec8 kShuffleMaskRAWToARGB = { + 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u +}; + +// Shuffle table for converting ARGB to RGB24. +static const uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGB to RAW. +static const uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 +static const uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u +}; + +// Shuffle table for converting ARGB to RAW. +static const uvec8 kShuffleMaskARGBToRAW_0 = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u +}; + +// Duplicates gray value 3 times and fills in alpha opaque. 
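+// A scalar sketch of this conversion (illustrative only;
+// J400ToARGBRow_C_Sketch is a hypothetical name, not a libyuv entry point):
+static void J400ToARGBRow_C_Sketch(const uint8* src_y, uint8* dst_argb,
+                                   int pix) {
+  int i;
+  for (i = 0; i < pix; ++i) {
+    uint8 y = src_y[i];
+    dst_argb[0] = y;     // B
+    dst_argb[1] = y;     // G
+    dst_argb[2] = y;     // R
+    dst_argb[3] = 255u;  // opaque alpha, as in the 0xff000000 mask below
+    dst_argb += 4;
+  }
+}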
+__declspec(naked) +void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + + convertloop: + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 + punpckhwd xmm1, xmm1 + por xmm0, xmm5 + por xmm1, xmm5 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} + +#ifdef HAS_J400TOARGBROW_AVX2 +// Duplicates gray value 3 times and fills in alpha opaque. +__declspec(naked) +void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpslld ymm5, ymm5, 24 + + convertloop: + vmovdqu xmm0, [eax] + lea eax, [eax + 16] + vpermq ymm0, ymm0, 0xd8 + vpunpcklbw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + vpunpckhwd ymm1, ymm0, ymm0 + vpunpcklwd ymm0, ymm0, ymm0 + vpor ymm0, ymm0, ymm5 + vpor ymm1, ymm1, ymm5 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_J400TOARGBROW_AVX2 + +__declspec(naked) +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { + __asm { + mov eax, [esp + 4] // src_rgb24 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, kShuffleMaskRGB24ToARGB + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqu [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqu [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqu [edx + 16], xmm1 + por xmm3, xmm5 + movdqu [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, + int pix) { + __asm { + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, kShuffleMaskRAWToARGB + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqu [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqu [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqu [edx + 16], xmm1 + por xmm3, xmm5 + movdqu [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + ret + } +} + +// pmul method to replicate bits. +// Math to replicate bits: +// (v << 8) | (v << 3) +// v * 256 + v * 8 +// v * (256 + 8) +// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 +// 20 instructions. 
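+// For example, widening the 5-bit blue field of one RGB565 pixel with the
+// multiply trick described above (a sketch; Expand5To8_Sketch is a
+// hypothetical helper, not a function the library exposes):
+static uint8 Expand5To8_Sketch(uint16 rgb565) {
+  uint16 b5 = rgb565 & 0x1f;  // low 5 bits hold blue
+  // (b5 << 8 | b5 << 3) >> 5 == (b5 * (256 + 8)) >> 5 == (b5 << 3) | (b5 >> 2)
+  return (uint8)((b5 * 0x0108) >> 5);
+}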
+__declspec(naked) +void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green + psllw xmm4, 10 + psrlw xmm4, 5 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of bgr565 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + pand xmm1, xmm3 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pmulhuw xmm1, xmm5 // * (256 + 8) + pmulhuw xmm2, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + pand xmm0, xmm4 // G in middle 6 bits + pmulhuw xmm0, xmm6 // << 5 * (256 + 4) + por xmm0, xmm7 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +#ifdef HAS_RGB565TOARGBROW_AVX2 +// pmul method to replicate bits. +// Math to replicate bits: +// (v << 8) | (v << 3) +// v * 256 + v * 8 +// v * (256 + 8) +// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 +__declspec(naked) +void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + vmovd xmm5, eax + vbroadcastss ymm5, xmm5 + mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits + movd xmm6, eax + vbroadcastss ymm6, xmm6 + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + vpsllw ymm3, ymm3, 11 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green + vpsllw ymm4, ymm4, 10 + vpsrlw ymm4, ymm4, 5 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpsllw ymm7, ymm7, 8 + + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 + vpand ymm1, ymm0, ymm3 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vpsllw ymm1, ymm1, 8 + vpor ymm1, ymm1, ymm2 // RB + vpand ymm0, ymm0, ymm4 // G in middle 6 bits + vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) + vpor ymm0, ymm0, ymm7 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm1, ymm1, 0xd8 + vpunpckhbw ymm2, ymm1, ymm0 + vpunpcklbw ymm1, ymm1, ymm0 + vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_RGB565TOARGBROW_AVX2 + +#ifdef HAS_ARGB1555TOARGBROW_AVX2 +__declspec(naked) +void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + vmovd xmm5, eax + vbroadcastss ymm5, xmm5 + mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits + movd xmm6, eax + vbroadcastss ymm6, xmm6 + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + 
vpsllw ymm3, ymm3, 11 + vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpsllw ymm7, ymm7, 8 + + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 + vpsllw ymm1, ymm0, 1 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vpand ymm1, ymm1, ymm3 + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpsllw ymm1, ymm1, 8 + vpor ymm1, ymm1, ymm2 // RB + vpsraw ymm2, ymm0, 8 // A + vpand ymm0, ymm0, ymm4 // G in middle 5 bits + vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) + vpand ymm2, ymm2, ymm7 + vpor ymm0, ymm0, ymm2 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm1, ymm1, 0xd8 + vpunpckhbw ymm2, ymm1, ymm0 + vpunpcklbw ymm1, ymm1, ymm0 + vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGB1555TOARGBROW_AVX2 + +#ifdef HAS_ARGB4444TOARGBROW_AVX2 +__declspec(naked) +void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f + vmovd xmm4, eax + vbroadcastss ymm4, xmm4 + vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 + vpand ymm2, ymm0, ymm5 // mask high nibbles + vpand ymm0, ymm0, ymm4 // mask low nibbles + vpsrlw ymm3, ymm2, 4 + vpsllw ymm1, ymm0, 4 + vpor ymm2, ymm2, ymm3 + vpor ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm2, ymm2, 0xd8 + vpunpckhbw ymm1, ymm0, ymm2 + vpunpcklbw ymm0, ymm0, ymm2 + vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGB4444TOARGBROW_AVX2 + +// 24 instructions +__declspec(naked) +void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green + psrlw xmm4, 6 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of 1555 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + psllw xmm1, 1 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pand xmm1, xmm3 + pmulhuw xmm2, xmm5 // * (256 + 8) + pmulhuw xmm1, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + movdqa xmm2, xmm0 + pand xmm0, xmm4 // G in middle 5 bits + psraw xmm2, 8 // A + pmulhuw xmm0, xmm6 // << 6 * (256 + 8) + pand xmm2, xmm7 + por xmm0, xmm2 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqu [eax * 2 + 
edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +// 18 instructions. +__declspec(naked) +void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f + movd xmm4, eax + pshufd xmm4, xmm4, 0 + movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles + pslld xmm5, 4 + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 + movdqa xmm2, xmm0 + pand xmm0, xmm4 // mask low nibbles + pand xmm2, xmm5 // mask high nibbles + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + psllw xmm1, 4 + psrlw xmm3, 4 + por xmm0, xmm1 + por xmm2, xmm3 + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +__declspec(naked) +void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + movdqa xmm6, kShuffleMaskARGBToRGB24 + + convertloop: + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm6 + pshufb xmm2, xmm6 + pshufb xmm3, xmm6 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqu [edx], xmm0 // store 0 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 + lea edx, [edx + 48] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) +void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + movdqa xmm6, kShuffleMaskARGBToRAW + + convertloop: + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm6 + pshufb xmm2, xmm6 + pshufb xmm3, xmm6 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqu [edx], xmm0 // store 0 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 + lea edx, [edx + 48] + sub ecx, 16 + jg convertloop + ret + } +} + +// 4 pixels +__declspec(naked) +void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + psrld xmm3, 
27 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + psrld xmm4, 26 + pslld xmm4, 5 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pslld xmm5, 11 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +// 8 pixels +__declspec(naked) +void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix) { + __asm { + + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + movd xmm6, [esp + 12] // dither4 + mov ecx, [esp + 16] // pix + punpcklbw xmm6, xmm6 // make dither 16 bytes + movdqa xmm7, xmm6 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + psrld xmm3, 27 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + psrld xmm4, 26 + pslld xmm4, 5 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pslld xmm5, 11 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + paddusb xmm0, xmm6 // add dither + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 +__declspec(naked) +void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + vbroadcastss xmm6, [esp + 12] // dither4 + mov ecx, [esp + 16] // pix + vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes + vpermq ymm6, ymm6, 0xd8 + vpunpcklwd ymm6, ymm6, ymm6 + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpsrld ymm3, ymm3, 27 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpsrld ymm4, ymm4, 26 + vpslld ymm4, ymm4, 5 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpaddusb ymm0, ymm0, ymm6 // add dither + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR + vpackusdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTORGB565DITHERROW_AVX2 + +// TODO(fbarchard): Improve sign extension/packing. 
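+// A scalar sketch of the packing done below (illustrative only;
+// ARGBToARGB1555Pixel_Sketch is a hypothetical helper):
+static uint16 ARGBToARGB1555Pixel_Sketch(const uint8* argb) {
+  return (uint16)(((argb[3] >> 7) << 15) |  // A: 1 bit
+                  ((argb[2] >> 3) << 10) |  // R: 5 bits
+                  ((argb[1] >> 3) << 5) |   // G: 5 bits
+                  (argb[0] >> 3));          // B: 5 bits
+}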
+__declspec(naked) +void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + pcmpeqb xmm4, xmm4 // generate mask 0x0000001f + psrld xmm4, 27 + movdqa xmm5, xmm4 // generate mask 0x000003e0 + pslld xmm5, 5 + movdqa xmm6, xmm4 // generate mask 0x00007c00 + pslld xmm6, 10 + pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 + pslld xmm7, 15 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + movdqa xmm3, xmm0 // R + psrad xmm0, 16 // A + psrld xmm1, 3 // B + psrld xmm2, 6 // G + psrld xmm3, 9 // R + pand xmm0, xmm7 // A + pand xmm1, xmm4 // B + pand xmm2, xmm5 // G + pand xmm3, xmm6 // R + por xmm0, xmm1 // BA + por xmm2, xmm3 // GR + por xmm0, xmm2 // BGRA + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +__declspec(naked) +void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 + psllw xmm4, 12 + movdqa xmm3, xmm4 // generate mask 0x00f000f0 + psrlw xmm3, 8 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 + pand xmm0, xmm3 // low nibble + pand xmm1, xmm4 // high nibble + psrld xmm0, 4 + psrld xmm1, 8 + por xmm0, xmm1 + packuswb xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTORGB565ROW_AVX2 +__declspec(naked) +void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpsrld ymm3, ymm3, 27 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpsrld ymm4, ymm4, 26 + vpslld ymm4, ymm4, 5 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR + vpackusdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTORGB565ROW_AVX2 + +#ifdef HAS_ARGBTOARGB1555ROW_AVX2 +__declspec(naked) +void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + vpcmpeqb ymm4, ymm4, ymm4 + vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f + vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 + vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 + vpslld ymm7, ymm7, 15 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm3, ymm0, 9 // R + vpsrld ymm2, ymm0, 6 // G + vpsrld ymm1, ymm0, 3 // B + vpsrad ymm0, ymm0, 16 // A + vpand ymm3, ymm3, ymm6 // R + vpand ymm2, ymm2, ymm5 // G + vpand ymm1, ymm1, ymm4 // B + vpand ymm0, ymm0, ymm7 // A + vpor ymm0, ymm0, ymm1 // 
BA + vpor ymm2, ymm2, ymm3 // GR + vpor ymm0, ymm0, ymm2 // BGRA + vpackssdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOARGB1555ROW_AVX2 + +#ifdef HAS_ARGBTOARGB4444ROW_AVX2 +__declspec(naked) +void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 + vpsllw ymm4, ymm4, 12 + vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpand ymm1, ymm0, ymm4 // high nibble + vpand ymm0, ymm0, ymm3 // low nibble + vpsrld ymm1, ymm1, 8 + vpsrld ymm0, ymm0, 4 + vpor ymm0, ymm0, ymm1 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOARGB4444ROW_AVX2 + +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. +__declspec(naked) +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm4, kARGBToY + movdqa xmm5, kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. +// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. +__declspec(naked) +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm4, kARGBToYJ + movdqa xmm5, kAddYJ64 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + paddw xmm0, xmm5 // Add .5 for rounding. + paddw xmm2, xmm5 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTOYROW_AVX2 +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = { + 0, 4, 1, 5, 2, 6, 3, 7 +}; + +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +__declspec(naked) +void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + vbroadcastf128 ymm4, kARGBToY + vbroadcastf128 ymm5, kAddY16 + vmovdqu ymm6, kPermdARGBToY_AVX + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpmaddubsw ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + lea eax, [eax + 128] + vphaddw ymm0, ymm0, ymm1 // mutates. 
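+    // (On AVX2, vphaddw and vpackuswb operate within 128-bit lanes, so the
+    // results come out interleaved; the vpermd with kPermdARGBToY_AVX below
+    // puts the 32 Y values back in order.)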
+ vphaddw ymm2, ymm2, ymm3 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm2, ymm2, 7 + vpackuswb ymm0, ymm0, ymm2 // mutates. + vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. + vpaddb ymm0, ymm0, ymm5 // add 16 for Y + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOYROW_AVX2 + +#ifdef HAS_ARGBTOYJROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +__declspec(naked) +void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + vbroadcastf128 ymm4, kARGBToYJ + vbroadcastf128 ymm5, kAddYJ64 + vmovdqu ymm6, kPermdARGBToY_AVX + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpmaddubsw ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + lea eax, [eax + 128] + vphaddw ymm0, ymm0, ymm1 // mutates. + vphaddw ymm2, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. + vpaddw ymm2, ymm2, ymm5 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm2, ymm2, 7 + vpackuswb ymm0, ymm0, ymm2 // mutates. + vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBTOYJROW_AVX2 + +__declspec(naked) +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm4, kBGRAToY + movdqa xmm5, kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm4, kABGRToY + movdqa xmm5, kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) +void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm4, kRGBAToY + movdqa xmm5, kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int 
src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm5, kAddUV128 + movdqa xmm6, kARGBToV + movdqa xmm7, kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm5, kAddUVJ128 + movdqa xmm6, kARGBToVJ + movdqa xmm7, kARGBToUJ + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + paddw xmm0, xmm5 // +.5 rounding -> unsigned + paddw xmm1, xmm5 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_ARGBTOUVROW_AVX2 +__declspec(naked) +void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // 
src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + vbroadcastf128 ymm5, kAddUV128 + vbroadcastf128 ymm6, kARGBToV + vbroadcastf128 ymm7, kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 32x2 argb pixels to 16x1 */ + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + vpavgb ymm2, ymm2, [eax + esi + 64] + vpavgb ymm3, ymm3, [eax + esi + 96] + lea eax, [eax + 128] + vshufps ymm4, ymm0, ymm1, 0x88 + vshufps ymm0, ymm0, ymm1, 0xdd + vpavgb ymm0, ymm0, ymm4 // mutated by vshufps + vshufps ymm4, ymm2, ymm3, 0x88 + vshufps ymm2, ymm2, ymm3, 0xdd + vpavgb ymm2, ymm2, ymm4 // mutated by vshufps + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V + vpmaddubsw ymm1, ymm0, ymm7 // U + vpmaddubsw ymm3, ymm2, ymm7 + vpmaddubsw ymm0, ymm0, ymm6 // V + vpmaddubsw ymm2, ymm2, ymm6 + vphaddw ymm1, ymm1, ymm3 // mutates + vphaddw ymm0, ymm0, ymm2 + vpsraw ymm1, ymm1, 8 + vpsraw ymm0, ymm0, 8 + vpacksswb ymm0, ymm1, ymm0 // mutates + vpermq ymm0, ymm0, 0xd8 // For vpacksswb + vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw + vpaddb ymm0, ymm0, ymm5 // -> unsigned + + // step 3 - store 16 U and 16 V values + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBTOUVROW_AVX2 + +__declspec(naked) +void ARGBToUV444Row_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + movdqa xmm5, kAddUV128 + movdqa xmm6, kARGBToV + movdqa xmm7, kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* convert to U and V */ + movdqu xmm0, [eax] // U + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + pmaddubsw xmm2, xmm7 + pmaddubsw xmm3, xmm7 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + + movdqu xmm0, [eax] // V + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm6 + pmaddubsw xmm1, xmm6 + pmaddubsw xmm2, xmm6 + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + lea eax, [eax + 64] + movdqu [edx + edi], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) +void ARGBToUV422Row_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + movdqa xmm5, kAddUV128 + movdqa xmm6, kARGBToV + movdqa xmm7, kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps 
xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) +void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm5, kAddUV128 + movdqa xmm6, kBGRAToV + movdqa xmm7, kBGRAToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm5, kAddUV128 + movdqa xmm6, kABGRToV + movdqa xmm7, kABGRToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + 
movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm5, kAddUV128 + movdqa xmm6, kRGBAToV + movdqa xmm7, kRGBAToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBTOYROW_SSSE3 + +// Read 16 UV from 444 +#define READYUV444_AVX2 __asm { \ + __asm vmovdqu xmm0, [esi] /* U */ /* NOLINT */ \ + __asm vmovdqu xmm1, [esi + edi] /* V */ /* NOLINT */ \ + __asm lea esi, [esi + 16] \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpermq ymm1, ymm1, 0xd8 \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + } + +// Read 8 UV from 422, upsample to 16 UV. +#define READYUV422_AVX2 __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm lea esi, [esi + 8] \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + } + +// Read 4 UV from 411, upsample to 16 UV. +#define READYUV411_AVX2 __asm { \ + __asm vmovd xmm0, dword ptr [esi] /* U */ /* NOLINT */ \ + __asm vmovd xmm1, dword ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm lea esi, [esi + 4] \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ + } + +// Read 8 UV from NV12, upsample to 16 UV. 
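+// NV12 stores chroma as a single interleaved plane (U0 V0 U1 V1 ...), so one
+// 16-byte load already yields byte-paired UV; only the horizontal 2x upsample
+// (the vpunpcklwd against itself below) is still required. As an illustrative
+// scalar sketch (not code from this file):
+//   for (int x = 0; x < width; x += 2) {
+//     uint8 u = src_uv[x];      // even byte: U for pixels x and x + 1
+//     uint8 v = src_uv[x + 1];  // odd byte: V for pixels x and x + 1
+//   }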
+#define READNV12_AVX2 __asm { \ + __asm vmovdqu xmm0, [esi] /* UV */ \ + __asm lea esi, [esi + 16] \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + } + +// Convert 16 pixels: 16 UV and 16 Y. +#define YUVTORGB_AVX2(YuvConstants) __asm { \ + /* Step 1: Find 8 UV contributions to 16 R,G,B values */ \ + __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR /* scale R UV */ \ + __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG /* scale G UV */ \ + __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB /* scale B UV */ \ + __asm vmovdqu ymm3, YuvConstants.kUVBiasR \ + __asm vpsubw ymm2, ymm3, ymm2 \ + __asm vmovdqu ymm3, YuvConstants.kUVBiasG \ + __asm vpsubw ymm1, ymm3, ymm1 \ + __asm vmovdqu ymm3, YuvConstants.kUVBiasB \ + __asm vpsubw ymm0, ymm3, ymm0 \ + /* Step 2: Find Y contribution to 16 R,G,B values */ \ + __asm vmovdqu xmm3, [eax] /* NOLINT */ \ + __asm lea eax, [eax + 16] \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpunpcklbw ymm3, ymm3, ymm3 \ + __asm vpmulhuw ymm3, ymm3, YuvConstants.kYToRgb \ + __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \ + __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \ + __asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \ + __asm vpsraw ymm0, ymm0, 6 \ + __asm vpsraw ymm1, ymm1, 6 \ + __asm vpsraw ymm2, ymm2, 6 \ + __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ + __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ + __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ + } + +// Store 16 ARGB values. +#define STOREARGB_AVX2 __asm { \ + /* Step 3: Weave into ARGB */ \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ + __asm vpermq ymm2, ymm2, 0xd8 \ + __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ + __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ + __asm vmovdqu 0[edx], ymm1 \ + __asm vmovdqu 32[edx], ymm0 \ + __asm lea edx, [edx + 64] \ + } + +#ifdef HAS_I422TOARGBROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) +void I422ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TOARGBROW_AVX2 + +#ifdef HAS_J422TOARGBROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) +void J422ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvJConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_J422TOARGBROW_AVX2 + +#ifdef HAS_I444TOARGBROW_AVX2 +// 16 pixels +// 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
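+// YUVTORGB_AVX2 above is a vectorized form of the following scalar
+// fixed-point conversion (a sketch; UB..VR, BB..BR and YG name the
+// kYuvConstants coefficients and biases, and Clamp saturates to 0..255):
+//   int y1 = ((uint32)(y * 0x0101) * YG) >> 16;       // the vpmulhuw step
+//   int b = Clamp((BB - (u * UB + v * VB) + y1) >> 6);
+//   int g = Clamp((BG - (u * UG + v * VG) + y1) >> 6);
+//   int r = Clamp((BR - (u * UR + v * VR) + y1) >> 6);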
+__declspec(naked) +void I444ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV444_AVX2 + YUVTORGB_AVX2(kYuvConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I444TOARGBROW_AVX2 + +#ifdef HAS_I411TOARGBROW_AVX2 +// 16 pixels +// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) +void I411ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV411_AVX2 + YUVTORGB_AVX2(kYuvConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I411TOARGBROW_AVX2 + +#ifdef HAS_NV12TOARGBROW_AVX2 +// 16 pixels. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) +void NV12ToARGBRow_AVX2(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READNV12_AVX2 + YUVTORGB_AVX2(kYuvConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_NV12TOARGBROW_AVX2 + +#ifdef HAS_NV21TOARGBROW_AVX2 +// 16 pixels. +// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) +void NV21ToARGBRow_AVX2(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READNV12_AVX2 + YUVTORGB_AVX2(kYvuConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_NV21TOARGBROW_AVX2 + +#ifdef HAS_I422TOBGRAROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). +// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. 
+__declspec(naked) +void I422ToBGRARow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into BGRA + vpunpcklbw ymm1, ymm1, ymm0 // GB + vpermq ymm1, ymm1, 0xd8 + vpunpcklbw ymm2, ymm5, ymm2 // AR + vpermq ymm2, ymm2, 0xd8 + vpunpcklwd ymm0, ymm2, ymm1 // ARGB first 8 pixels + vpunpckhwd ymm2, ymm2, ymm1 // ARGB next 8 pixels + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TOBGRAROW_AVX2 + +#ifdef HAS_I422TORGBAROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). +// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. +__declspec(naked) +void I422ToRGBARow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into RGBA + vpunpcklbw ymm1, ymm1, ymm2 // GR + vpermq ymm1, ymm1, 0xd8 + vpunpcklbw ymm2, ymm5, ymm0 // AB + vpermq ymm2, ymm2, 0xd8 + vpunpcklwd ymm0, ymm2, ymm1 // ABGR first 8 pixels + vpunpckhwd ymm1, ymm2, ymm1 // ABGR next 8 pixels + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TORGBAROW_AVX2 + +#ifdef HAS_I422TOABGRROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). +// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. +__declspec(naked) +void I422ToABGRRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into ABGR + vpunpcklbw ymm1, ymm2, ymm1 // RG + vpermq ymm1, ymm1, 0xd8 + vpunpcklbw ymm2, ymm0, ymm5 // BA + vpermq ymm2, ymm2, 0xd8 + vpunpcklwd ymm0, ymm1, ymm2 // RGBA first 8 pixels + vpunpckhwd ymm1, ymm1, ymm2 // RGBA next 8 pixels + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TOABGRROW_AVX2 + +#if defined(HAS_I422TOARGBROW_SSSE3) +// TODO(fbarchard): Read that does half size on Y and treats 420 as 444. + +// Read 8 UV from 444. 
+#define READYUV444 __asm { \ + __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ + __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + } + +// Read 4 UV from 422, upsample to 8 UV. +#define READYUV422 __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 4] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + } + +// Read 2 UV from 411, upsample to 8 UV. +#define READYUV411 __asm { \ + __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \ + __asm movd xmm0, ebx \ + __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm movd xmm1, ebx \ + __asm lea esi, [esi + 2] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \ + } + +// Read 4 UV from NV12, upsample to 8 UV. +#define READNV12 __asm { \ + __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + } + +// Convert 8 pixels: 8 UV and 8 Y. +#define YUVTORGB(YuvConstants) __asm { \ + /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ + __asm movdqa xmm1, xmm0 \ + __asm movdqa xmm2, xmm0 \ + __asm movdqa xmm3, xmm0 \ + __asm movdqa xmm0, YuvConstants.kUVBiasB /* unbias back to signed */ \ + __asm pmaddubsw xmm1, YuvConstants.kUVToB /* scale B UV */ \ + __asm psubw xmm0, xmm1 \ + __asm movdqa xmm1, YuvConstants.kUVBiasG \ + __asm pmaddubsw xmm2, YuvConstants.kUVToG /* scale G UV */ \ + __asm psubw xmm1, xmm2 \ + __asm movdqa xmm2, YuvConstants.kUVBiasR \ + __asm pmaddubsw xmm3, YuvConstants.kUVToR /* scale R UV */ \ + __asm psubw xmm2, xmm3 \ + /* Step 2: Find Y contribution to 8 R,G,B values */ \ + __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ + __asm lea eax, [eax + 8] \ + __asm punpcklbw xmm3, xmm3 \ + __asm pmulhuw xmm3, YuvConstants.kYToRgb \ + __asm paddsw xmm0, xmm3 /* B += Y */ \ + __asm paddsw xmm1, xmm3 /* G += Y */ \ + __asm paddsw xmm2, xmm3 /* R += Y */ \ + __asm psraw xmm0, 6 \ + __asm psraw xmm1, 6 \ + __asm psraw xmm2, 6 \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ + } + +// Store 8 ARGB values. +#define STOREARGB __asm { \ + /* Step 3: Weave into ARGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm5 /* RA */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ + __asm movdqu 0[edx], xmm0 \ + __asm movdqu 16[edx], xmm1 \ + __asm lea edx, [edx + 32] \ + } + +// Store 8 BGRA values. +#define STOREBGRA __asm { \ + /* Step 3: Weave into BGRA */ \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm0 /* GB */ \ + __asm punpcklbw xmm5, xmm2 /* AR */ \ + __asm movdqa xmm0, xmm5 \ + __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ + __asm movdqu 0[edx], xmm5 \ + __asm movdqu 16[edx], xmm0 \ + __asm lea edx, [edx + 32] \ + } + +// Store 8 ABGR values. 
+#define STOREABGR __asm { \ + /* Step 3: Weave into ABGR */ \ + __asm punpcklbw xmm2, xmm1 /* RG */ \ + __asm punpcklbw xmm0, xmm5 /* BA */ \ + __asm movdqa xmm1, xmm2 \ + __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \ + __asm movdqu 0[edx], xmm2 \ + __asm movdqu 16[edx], xmm1 \ + __asm lea edx, [edx + 32] \ + } + +// Store 8 RGBA values. +#define STORERGBA __asm { \ + /* Step 3: Weave into RGBA */ \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm2 /* GR */ \ + __asm punpcklbw xmm5, xmm0 /* AB */ \ + __asm movdqa xmm0, xmm5 \ + __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ + __asm movdqu 0[edx], xmm5 \ + __asm movdqu 16[edx], xmm0 \ + __asm lea edx, [edx + 32] \ + } + +// Store 8 RGB24 values. +#define STORERGB24 __asm { \ + /* Step 3: Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ + /* Step 4: RRGB -> RGB24 */ \ + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ + __asm lea edx, [edx + 24] \ + } + +// Store 8 RAW values. +#define STORERAW __asm { \ + /* Step 3: Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ + /* Step 4: RRGB -> RAW */ \ + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ + __asm lea edx, [edx + 24] \ + } + +// Store 8 RGB565 values. +#define STORERGB565 __asm { \ + /* Step 3: Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ + /* Step 4: RRGB -> RGB565 */ \ + __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ + __asm movdqa xmm2, xmm0 /* G */ \ + __asm pslld xmm0, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm0, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm0, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm0, xmm3 /* BGR */ \ + __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ + __asm movdqa xmm2, xmm1 /* G */ \ + __asm pslld xmm1, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm1, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm1, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm1, xmm3 /* BGR */ \ + __asm packssdw xmm0, xmm1 \ + __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ + __asm lea edx, [edx + 16] \ + } + +// 8 pixels. +// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). 
+__declspec(naked) +void I444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV444 + YUVTORGB(kYuvConstants) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). +__declspec(naked) +void I422ToRGB24Row_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgb24, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgb24 + mov ecx, [esp + 8 + 20] // width + sub edi, esi + movdqa xmm5, kShuffleMaskARGBToRGB24_0 + movdqa xmm6, kShuffleMaskARGBToRGB24 + + convertloop: + READYUV422 + YUVTORGB(kYuvConstants) + STORERGB24 + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). +__declspec(naked) +void I422ToRAWRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_raw, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // raw + mov ecx, [esp + 8 + 20] // width + sub edi, esi + movdqa xmm5, kShuffleMaskARGBToRAW_0 + movdqa xmm6, kShuffleMaskARGBToRAW + + convertloop: + READYUV422 + YUVTORGB(kYuvConstants) + STORERAW + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). +__declspec(naked) +void I422ToRGB565Row_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb565_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgb565 + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate mask 0x0000001f + psrld xmm5, 27 + pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 + psrld xmm6, 26 + pslld xmm6, 5 + pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 + pslld xmm7, 11 + + convertloop: + READYUV422 + YUVTORGB(kYuvConstants) + STORERGB565 + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) +void I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV422 + YUVTORGB(kYuvConstants) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels. +// JPeg color space version of I422ToARGB +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
+__declspec(naked) +void J422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV422 + YUVTORGB(kYuvJConstants) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +// Similar to I420 but duplicate UV once more. +__declspec(naked) +void I411ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ecx, [esp + 12 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV411 // modifies EBX + YUVTORGB(kYuvConstants) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + pop ebx + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) +void NV12ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READNV12 + YUVTORGB(kYuvConstants) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +// 8 pixels. +// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). 
+__declspec(naked) +void NV21ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READNV12 + YUVTORGB(kYvuConstants) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +__declspec(naked) +void I422ToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_bgra, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // bgra + mov ecx, [esp + 8 + 20] // width + sub edi, esi + + convertloop: + READYUV422 + YUVTORGB(kYuvConstants) + STOREBGRA + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void I422ToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_abgr, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // abgr + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV422 + YUVTORGB(kYuvConstants) + STOREABGR + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void I422ToRGBARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgba, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgba + mov ecx, [esp + 8 + 20] // width + sub edi, esi + + convertloop: + READYUV422 + YUVTORGB(kYuvConstants) + STORERGBA + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +#endif // HAS_I422TOARGBROW_SSSE3 + +#ifdef HAS_I400TOARGBROW_SSE2 +// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). +__declspec(naked) +void I400ToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width) { + __asm { + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) + movd xmm2, eax + pshufd xmm2, xmm2,0 + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + movd xmm3, eax + pshufd xmm3, xmm3, 0 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width + + convertloop: + // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 // Y.Y + pmulhuw xmm0, xmm2 + psubusw xmm0, xmm3 + psrlw xmm0, 6 + packuswb xmm0, xmm0 // G + + // Step 2: Weave into ARGB + punpcklbw xmm0, xmm0 // GG + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 // BGRA first 4 pixels + punpckhwd xmm1, xmm1 // BGRA next 4 pixels + por xmm0, xmm4 + por xmm1, xmm4 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_I400TOARGBROW_SSE2 + +#ifdef HAS_I400TOARGBROW_AVX2 +// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). +// note: vpunpcklbw mutates and vpackuswb unmutates. 
+__declspec(naked)
+void I400ToARGBRow_AVX2(const uint8* y_buf,
+                        uint8* rgb_buf,
+                        int width) {
+  __asm {
+    mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
+    vmovd xmm2, eax
+    vbroadcastss ymm2, xmm2
+    mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
+    vmovd xmm3, eax
+    vbroadcastss ymm3, xmm3
+    vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
+    vpslld ymm4, ymm4, 24
+
+    mov eax, [esp + 4] // Y
+    mov edx, [esp + 8] // rgb
+    mov ecx, [esp + 12] // width
+
+  convertloop:
+    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+    vmovdqu xmm0, [eax]
+    lea eax, [eax + 16]
+    vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
+    vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
+    vpmulhuw ymm0, ymm0, ymm2
+    vpsubusw ymm0, ymm0, ymm3
+    vpsrlw ymm0, ymm0, 6
+    vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
+
+    // TODO(fbarchard): Weave alpha with unpack.
+    // Step 2: Weave into ARGB
+    vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
+    vpermq ymm1, ymm1, 0xd8
+    vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
+    vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
+    vpor ymm0, ymm0, ymm4
+    vpor ymm1, ymm1, ymm4
+    vmovdqu [edx], ymm0
+    vmovdqu [edx + 32], ymm1
+    lea edx, [edx + 64]
+    sub ecx, 16
+    jg convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {
+  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+// TODO(fbarchard): Replace lea with -16 offset.
+__declspec(naked)
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov eax, [esp + 4] // src
+    mov edx, [esp + 8] // dst
+    mov ecx, [esp + 12] // width
+    movdqa xmm5, kShuffleMirror
+
+  convertloop:
+    movdqu xmm0, [eax - 16 + ecx]
+    pshufb xmm0, xmm5
+    movdqu [edx], xmm0
+    lea edx, [edx + 16]
+    sub ecx, 16
+    jg convertloop
+    ret
+  }
+}
+#endif // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+__declspec(naked)
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov eax, [esp + 4] // src
+    mov edx, [esp + 8] // dst
+    mov ecx, [esp + 12] // width
+    vbroadcastf128 ymm5, kShuffleMirror
+
+  convertloop:
+    vmovdqu ymm0, [eax - 32 + ecx]
+    vpshufb ymm0, ymm0, ymm5
+    vpermq ymm0, ymm0, 0x4e // swap high and low halves
+    vmovdqu [edx], ymm0
+    lea edx, [edx + 32]
+    sub ecx, 32
+    jg convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSE2
+__declspec(naked)
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov eax, [esp + 4] // src
+    mov edx, [esp + 8] // dst
+    mov ecx, [esp + 12] // width
+
+  convertloop:
+    movdqu xmm0, [eax - 16 + ecx]
+    movdqa xmm1, xmm0 // swap bytes
+    psllw xmm0, 8
+    psrlw xmm1, 8
+    por xmm0, xmm1
+    pshuflw xmm0, xmm0, 0x1b // swap words
+    pshufhw xmm0, xmm0, 0x1b
+    pshufd xmm0, xmm0, 0x4e // swap qwords
+    movdqu [edx], xmm0
+    lea edx, [edx + 16]
+    sub ecx, 16
+    jg convertloop
+    ret
+  }
+}
+#endif // HAS_MIRRORROW_SSE2
+
+#ifdef HAS_MIRRORROW_UV_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
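+// (One pshufb with this table both mirrors and de-interleaves: the even
+// source bytes 14,12,...,0 - the U samples, reversed - land in the low 8
+// bytes and the odd V samples in the high 8, so movlpd/movhpd can store the
+// U and V rows separately.)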
+static const uvec8 kShuffleMirrorUV = { + 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u +}; + +__declspec(naked) +void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + movdqa xmm1, kShuffleMirrorUV + lea eax, [eax + ecx * 2 - 16] + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + lea eax, [eax - 16] + pshufb xmm0, xmm1 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [edx + edi], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_MIRRORROW_UV_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSE2 +__declspec(naked) +void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + lea eax, [eax - 16 + ecx * 4] // last 4 pixels. + + convertloop: + movdqu xmm0, [eax] + lea eax, [eax - 16] + pshufd xmm0, xmm0, 0x1b + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + ret + } +} +#endif // HAS_ARGBMIRRORROW_SSE2 + +#ifdef HAS_ARGBMIRRORROW_AVX2 +// Shuffle table for reversing the bytes. +static const ulvec32 kARGBShuffleMirror_AVX2 = { + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; + +__declspec(naked) +void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + vmovdqu ymm5, kARGBShuffleMirror_AVX2 + + convertloop: + vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBMIRRORROW_AVX2 + +#ifdef HAS_SPLITUVROW_SSE2 +__declspec(naked) +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pand xmm0, xmm5 // even bytes + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm2, 8 // odd bytes + psrlw xmm3, 8 + packuswb xmm2, xmm3 + movdqu [edx], xmm0 + movdqu [edx + edi], xmm2 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +#endif // HAS_SPLITUVROW_SSE2 + +#ifdef HAS_SPLITUVROW_AVX2 +__declspec(naked) +void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm2, ymm0, 8 // odd bytes + vpsrlw ymm3, ymm1, 8 + vpand ymm0, ymm0, ymm5 // even bytes + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpackuswb ymm2, ymm2, ymm3 + vpermq ymm0, ymm0, 0xd8 + vpermq ymm2, ymm2, 0xd8 + vmovdqu [edx], ymm0 + vmovdqu [edx + edi], ymm2 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_SPLITUVROW_AVX2 + +#ifdef HAS_MERGEUVROW_SSE2 +__declspec(naked) +void MergeUVRow_SSE2(const uint8* src_u, 
const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  __asm {
+    push edi
+    mov eax, [esp + 4 + 4] // src_u
+    mov edx, [esp + 4 + 8] // src_v
+    mov edi, [esp + 4 + 12] // dst_uv
+    mov ecx, [esp + 4 + 16] // width
+    sub edx, eax
+
+  convertloop:
+    movdqu xmm0, [eax] // read 16 U's
+    movdqu xmm1, [eax + edx] // and 16 V's
+    lea eax, [eax + 16]
+    movdqa xmm2, xmm0
+    punpcklbw xmm0, xmm1 // first 8 UV pairs
+    punpckhbw xmm2, xmm1 // next 8 UV pairs
+    movdqu [edi], xmm0
+    movdqu [edi + 16], xmm2
+    lea edi, [edi + 32]
+    sub ecx, 16
+    jg convertloop
+
+    pop edi
+    ret
+  }
+}
+#endif // HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+__declspec(naked)
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  __asm {
+    push edi
+    mov eax, [esp + 4 + 4] // src_u
+    mov edx, [esp + 4 + 8] // src_v
+    mov edi, [esp + 4 + 12] // dst_uv
+    mov ecx, [esp + 4 + 16] // width
+    sub edx, eax
+
+  convertloop:
+    vmovdqu ymm0, [eax] // read 32 U's
+    vmovdqu ymm1, [eax + edx] // and 32 V's
+    lea eax, [eax + 32]
+    vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
+    vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
+    vextractf128 [edi], ymm2, 0 // bytes 0..15
+    vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
+    vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
+    vextractf128 [edi + 48], ymm0, 1 // bytes 48..63
+    lea edi, [edi + 64]
+    sub ecx, 32
+    jg convertloop
+
+    pop edi
+    vzeroupper
+    ret
+  }
+}
+#endif // HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_COPYROW_SSE2
+// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
+__declspec(naked)
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov eax, [esp + 4] // src
+    mov edx, [esp + 8] // dst
+    mov ecx, [esp + 12] // count
+
+  convertloop:
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    lea eax, [eax + 32]
+    movdqu [edx], xmm0
+    movdqu [edx + 16], xmm1
+    lea edx, [edx + 32]
+    sub ecx, 32
+    jg convertloop
+    ret
+  }
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+// CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at a time.
+__declspec(naked)
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov eax, [esp + 4] // src
+    mov edx, [esp + 8] // dst
+    mov ecx, [esp + 12] // count
+
+  convertloop:
+    vmovdqu ymm0, [eax]
+    vmovdqu ymm1, [eax + 32]
+    lea eax, [eax + 64]
+    vmovdqu [edx], ymm0
+    vmovdqu [edx + 32], ymm1
+    lea edx, [edx + 64]
+    sub ecx, 64
+    jg convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif // HAS_COPYROW_AVX
+
+// Multiple of 1.
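+// (ERMS = Enhanced REP MOVSB/STOSB: on CPUs advertising the feature, a bare
+// rep movsb copies at streaming speed with no alignment or size-granularity
+// requirement, hence any byte count works here.)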
+__declspec(naked) +void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, esi + mov edx, edi + mov esi, [esp + 4] // src + mov edi, [esp + 8] // dst + mov ecx, [esp + 12] // count + rep movsb + mov edi, edx + mov esi, eax + ret + } +} + +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +__declspec(naked) +void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + convertloop: + movdqu xmm2, [eax] + movdqu xmm3, [eax + 16] + lea eax, [eax + 32] + movdqu xmm4, [edx] + movdqu xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +__declspec(naked) +void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + convertloop: + vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + 32] + lea eax, [eax + 64] + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +__declspec(naked) +void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + convertloop: + movq xmm2, qword ptr [eax] // 8 Y's + lea eax, [eax + 8] + punpcklbw xmm2, xmm2 + punpckhwd xmm3, xmm2 + punpcklwd xmm2, xmm2 + movdqu xmm4, [edx] + movdqu xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +__declspec(naked) +void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + convertloop: + vpmovzxbd ymm1, qword ptr [eax] + vpmovzxbd ymm2, qword ptr [eax + 8] + lea eax, [eax + 16] + vpslld ymm1, ymm1, 24 + vpslld ymm2, ymm2, 24 + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 + +#ifdef HAS_SETROW_X86 +// Write 'count' bytes using an 8 bit value repeated. +// Count should be multiple of 4. +__declspec(naked) +void SetRow_X86(uint8* dst, uint8 v8, int count) { + __asm { + movzx eax, byte ptr [esp + 8] // v8 + mov edx, 0x01010101 // Duplicate byte to all bytes. 
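+    // The widening multiply below leaves v8 * 0x01010101 in eax, splatting
+    // the byte to all four lanes (e.g. v8 = 0x5a gives eax = 0x5a5a5a5a),
+    // ready for rep stosd.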
+ mul edx // overwrites edx with upper part of result. + mov edx, edi + mov edi, [esp + 4] // dst + mov ecx, [esp + 12] // count + shr ecx, 2 + rep stosd + mov edi, edx + ret + } +} + +// Write 'count' bytes using an 8 bit value repeated. +__declspec(naked) +void SetRow_ERMS(uint8* dst, uint8 v8, int count) { + __asm { + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v8 + mov ecx, [esp + 12] // count + rep stosb + mov edi, edx + ret + } +} + +// Write 'count' 32 bit values. +__declspec(naked) +void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { + __asm { + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v32 + mov ecx, [esp + 12] // count + rep stosd + mov edi, edx + ret + } +} +#endif // HAS_SETROW_X86 + +#ifdef HAS_YUY2TOYROW_AVX2 +__declspec(naked) +void YUY2ToYRow_AVX2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // even bytes are Y + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} + +__declspec(naked) +void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} + +__declspec(naked) +void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. 
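+    // AVX2 pack instructions operate per 128-bit lane, so the packed bytes
+    // come out lane-interleaved; the vpermq 0xd8 below restores linear order
+    // before the two halves are extracted.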
+ vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} + +__declspec(naked) +void UYVYToYRow_AVX2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // odd bytes are Y + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} + +__declspec(naked) +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} + +__declspec(naked) +void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. 
+ vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_YUY2TOYROW_AVX2 + +#ifdef HAS_YUY2TOYROW_SSE2 +__declspec(naked) +void YUY2ToYRow_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // even bytes are Y + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) +void UYVYToYRow_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // odd bytes are Y + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + pand xmm0, xmm5 // UYVY -> UVUV + 
pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_YUY2TOYROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSE2 +// Blend 8 pixels at a time. +__declspec(naked) +void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm7, xmm7 // generate constant 1 + psrlw xmm7, 15 + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + psrlw xmm6, 8 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + sub ecx, 4 + jl convertloop4b // less than 4 pixels? + + // 4 pixel loop. + convertloop4: + movdqu xmm3, [eax] // src argb + lea eax, [eax + 16] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + psrlw xmm3, 8 // alpha + pshufhw xmm3, xmm3, 0F5h // 8 alpha words + pshuflw xmm3, xmm3, 0F5h + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g + lea esi, [esi + 16] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jge convertloop4 + + convertloop4b: + add ecx, 4 - 1 + jl convertloop1b + + // 1 pixel loop. + convertloop1: + movd xmm3, [eax] // src argb + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + psrlw xmm3, 8 // alpha + pshufhw xmm3, xmm3, 0F5h // 8 alpha words + pshuflw xmm3, xmm3, 0F5h + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge convertloop1 + + convertloop1b: + pop esi + ret + } +} +#endif // HAS_ARGBBLENDROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for isolating alpha. 
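+// (Each pshufb index selects a pixel's alpha byte into the low byte of a
+// 16-bit lane; the 0x80 entries zero the high byte, giving 8 alpha words.)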
+static const uvec8 kShuffleAlpha = { + 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 +}; +// Same as SSE2, but replaces: +// psrlw xmm3, 8 // alpha +// pshufhw xmm3, xmm3, 0F5h // 8 alpha words +// pshuflw xmm3, xmm3, 0F5h +// with.. +// pshufb xmm3, kShuffleAlpha // alpha +// Blend 8 pixels at a time. + +__declspec(naked) +void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm7, xmm7 // generate constant 0x0001 + psrlw xmm7, 15 + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + psrlw xmm6, 8 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + sub ecx, 4 + jl convertloop4b // less than 4 pixels? + + // 4 pixel loop. + convertloop4: + movdqu xmm3, [eax] // src argb + lea eax, [eax + 16] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g + lea esi, [esi + 16] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jge convertloop4 + + convertloop4b: + add ecx, 4 - 1 + jl convertloop1b + + // 1 pixel loop. + convertloop1: + movd xmm3, [eax] // src argb + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge convertloop1 + + convertloop1b: + pop esi + ret + } +} +#endif // HAS_ARGBBLENDROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_SSE2 +// Attenuate 4 pixels at a time. 
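+// For reference, a plain-C equivalent of the attenuation below; this is an
+// illustrative sketch only (libyuv's actual C fallback is ARGBAttenuateRow_C
+// in row_common.cc; the _RefC name is hypothetical, and the block is kept
+// under #if 0 so it does not affect the build). The byte-duplicating unpack
+// plus pmulhuw/psrlw amounts to ((f * 0x101) * (a * 0x101)) >> 24, a close
+// approximation of f * a / 255 per color channel.
+#if 0
+static void ARGBAttenuateRow_RefC(const uint8* src_argb, uint8* dst_argb,
+                                  int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    uint32 a = src_argb[3];
+    dst_argb[0] = (uint8)(((src_argb[0] * 0x101) * (a * 0x101)) >> 24);
+    dst_argb[1] = (uint8)(((src_argb[1] * 0x101) * (a * 0x101)) >> 24);
+    dst_argb[2] = (uint8)(((src_argb[2] * 0x101) * (a * 0x101)) >> 24);
+    dst_argb[3] = (uint8)a;  // Alpha itself passes through unchanged.
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+#endif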
+__declspec(naked) +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff + psrld xmm5, 8 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + punpcklbw xmm0, xmm0 // first 2 + pshufhw xmm2, xmm0, 0FFh // 8 alpha words + pshuflw xmm2, xmm2, 0FFh + pmulhuw xmm0, xmm2 // rgb * a + movdqu xmm1, [eax] // read 4 pixels + punpckhbw xmm1, xmm1 // next 2 pixels + pshufhw xmm2, xmm1, 0FFh // 8 alpha words + pshuflw xmm2, xmm2, 0FFh + pmulhuw xmm1, xmm2 // rgb * a + movdqu xmm2, [eax] // alphas + lea eax, [eax + 16] + psrlw xmm0, 8 + pand xmm2, xmm4 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + pand xmm0, xmm5 // keep original alphas + por xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + ret + } +} +#endif // HAS_ARGBATTENUATEROW_SSE2 + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha0 = { + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, +}; +static const uvec8 kShuffleAlpha1 = { + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, +}; +__declspec(naked) +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm3, xmm3 // generate mask 0xff000000 + pslld xmm3, 24 + movdqa xmm4, kShuffleAlpha0 + movdqa xmm5, kShuffleAlpha1 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + pshufb xmm0, xmm4 // isolate first 2 alphas + movdqu xmm1, [eax] // read 4 pixels + punpcklbw xmm1, xmm1 // first 2 pixel rgbs + pmulhuw xmm0, xmm1 // rgb * a + movdqu xmm1, [eax] // read 4 pixels + pshufb xmm1, xmm5 // isolate next 2 alphas + movdqu xmm2, [eax] // read 4 pixels + punpckhbw xmm2, xmm2 // next 2 pixel rgbs + pmulhuw xmm1, xmm2 // rgb * a + movdqu xmm2, [eax] // mask original alpha + lea eax, [eax + 16] + pand xmm2, xmm3 + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + por xmm0, xmm2 // copy original alpha + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + ret + } +} +#endif // HAS_ARGBATTENUATEROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha_AVX2 = { + 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u +}; +__declspec(naked) +void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + vbroadcastf128 ymm4,kShuffleAlpha_AVX2 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpslld ymm5, ymm5, 24 + + convertloop: + vmovdqu ymm6, [eax] // read 8 pixels. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpshufb ymm2, ymm0, ymm4 // low 4 alphas + vpshufb ymm3, ymm1, ymm4 // high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * a + vpmulhuw ymm1, ymm1, ymm3 // rgb * a + vpand ymm6, ymm6, ymm5 // isolate alpha + vpsrlw ymm0, ymm0, 8 + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // unmutated. 
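+ // (Both pack inputs come from in-lane unpacks of the same register, so
+ // per-lane vpackuswb already restores linear order; no vpermq is needed.)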
+ vpor ymm0, ymm0, ymm6 // copy original alpha
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+__declspec(naked)
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb0
+ mov edx, [esp + 8 + 8] // dst_argb
+ mov ecx, [esp + 8 + 12] // width
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 3] // first alpha
+ movzx edi, byte ptr [eax + 7] // second alpha
+ punpcklbw xmm0, xmm0 // first 2
+ movd xmm2, dword ptr fixed_invtbl8[esi * 4]
+ movd xmm3, dword ptr fixed_invtbl8[edi * 4]
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
+ movlhps xmm2, xmm3
+ pmulhuw xmm0, xmm2 // rgb * ia (inverse alpha)
+
+ movdqu xmm1, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 11] // third alpha
+ movzx edi, byte ptr [eax + 15] // fourth alpha
+ punpckhbw xmm1, xmm1 // next 2
+ movd xmm2, dword ptr fixed_invtbl8[esi * 4]
+ movd xmm3, dword ptr fixed_invtbl8[edi * 4]
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
+ movlhps xmm2, xmm3
+ pmulhuw xmm1, xmm2 // rgb * ia
+ lea eax, [eax + 16]
+
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
+};
+// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
+// USE_GATHER is not on by default, due to being a slow instruction.
+#ifdef USE_GATHER
+__declspec(naked)
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
+
+ convertloop:
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
+ vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
+ vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+ vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+ vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
+ vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#else // USE_GATHER
+__declspec(naked)
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2
+
+ push esi
+ push edi
+
+ convertloop:
+ // replace VPGATHER
+ movzx esi, byte ptr [eax + 3] // alpha0
+ movzx edi, byte ptr [eax + 7] // alpha1
+ vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0]
+ vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1]
+ movzx esi, byte ptr [eax + 11] // alpha2
+ movzx edi, byte ptr [eax + 15] // alpha3
+ vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
+ vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2]
+ vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3]
+ movzx esi, byte ptr [eax + 19] // alpha4
+ movzx edi, byte ptr [eax + 23] // alpha5
+ vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
+ vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4]
+ vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5]
+ movzx esi, byte ptr [eax + 27] // alpha6
+ movzx edi, byte ptr [eax + 31] // alpha7
+ vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
+ vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6]
+ vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7]
+ vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
+ vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
+ vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
+ vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+ // end of VPGATHER
+
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+ vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+ vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
+ vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // USE_GATHER
+#endif // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+__declspec(naked)
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, kARGBToYJ
+ movdqa xmm5, kAddYJ64
+
+ convertloop:
+ movdqu xmm0, [eax] // G
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm0, xmm1
+ paddw xmm0, xmm5 // Add .5 for rounding.
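+ // (Assuming the usual libyuv YJ weights, this computes full-range BT.601
+ // luma: (b * 15 + g * 75 + r * 38 + 64) >> 7 per pixel.)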
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 G bytes
+ movdqu xmm2, [eax] // A
+ movdqu xmm3, [eax + 16]
+ lea eax, [eax + 32]
+ psrld xmm2, 24
+ psrld xmm3, 24
+ packuswb xmm2, xmm3
+ packuswb xmm2, xmm2 // 8 A bytes
+ movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
+ punpcklbw xmm0, xmm0 // 8 GG words
+ punpcklbw xmm3, xmm2 // 8 GA words
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm3 // GGGA first 4
+ punpckhwd xmm1, xmm3 // GGGA next 4
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone.
+static const vec8 kARGBToSepiaB = {
+ 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static const vec8 kARGBToSepiaG = {
+ 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static const vec8 kARGBToSepiaR = {
+ 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+__declspec(naked)
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ mov ecx, [esp + 8] /* width */
+ movdqa xmm2, kARGBToSepiaB
+ movdqa xmm3, kARGBToSepiaG
+ movdqa xmm4, kARGBToSepiaR
+
+ convertloop:
+ movdqu xmm0, [eax] // B
+ movdqu xmm6, [eax + 16]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm6, xmm2
+ phaddw xmm0, xmm6
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 B values
+ movdqu xmm5, [eax] // G
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm3
+ pmaddubsw xmm1, xmm3
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 G values
+ punpcklbw xmm0, xmm5 // 8 BG values
+ movdqu xmm5, [eax] // R
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 R values
+ movdqu xmm6, [eax] // A
+ movdqu xmm1, [eax + 16]
+ psrld xmm6, 24
+ psrld xmm1, 24
+ packuswb xmm6, xmm1
+ packuswb xmm6, xmm6 // 8 A values
+ punpcklbw xmm5, xmm6 // 8 RA values
+ movdqa xmm1, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm5 // BGRA first 4
+ punpckhwd xmm1, xmm5 // BGRA next 4
+ movdqu [eax], xmm0
+ movdqu [eax + 16], xmm1
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
+// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
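+// For reference, a plain-C sketch of the per-pixel math; this is
+// illustrative only (libyuv's actual C fallback is ARGBColorMatrixRow_C in
+// row_common.cc; the _RefC name is hypothetical and the block is kept under
+// #if 0 so it does not affect the build). Each output channel is a signed
+// dot product of the four input bytes with one row of matrix_argb, scaled
+// down by 6 bits and clamped to 0..255:
+#if 0
+static void ARGBColorMatrixRow_RefC(const uint8* src_argb, uint8* dst_argb,
+                                    const int8* matrix_argb, int width) {
+  int i, j;
+  for (i = 0; i < width; ++i) {
+    for (j = 0; j < 4; ++j) {  // j selects the B, G, R or A output channel.
+      int v = (src_argb[0] * matrix_argb[j * 4 + 0] +
+               src_argb[1] * matrix_argb[j * 4 + 1] +
+               src_argb[2] * matrix_argb[j * 4 + 2] +
+               src_argb[3] * matrix_argb[j * 4 + 3]) >> 6;
+      dst_argb[j] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
+    }
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+#endif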
+__declspec(naked) +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* matrix_argb */ + movdqu xmm5, [ecx] + pshufd xmm2, xmm5, 0x00 + pshufd xmm3, xmm5, 0x55 + pshufd xmm4, xmm5, 0xaa + pshufd xmm5, xmm5, 0xff + mov ecx, [esp + 16] /* width */ + + convertloop: + movdqu xmm0, [eax] // B + movdqu xmm7, [eax + 16] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm7, xmm2 + movdqu xmm6, [eax] // G + movdqu xmm1, [eax + 16] + pmaddubsw xmm6, xmm3 + pmaddubsw xmm1, xmm3 + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G + packuswb xmm0, xmm0 // 8 B values + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values + movdqu xmm1, [eax] // R + movdqu xmm7, [eax + 16] + pmaddubsw xmm1, xmm4 + pmaddubsw xmm7, xmm4 + phaddsw xmm1, xmm7 // R + movdqu xmm6, [eax] // A + movdqu xmm7, [eax + 16] + pmaddubsw xmm6, xmm5 + pmaddubsw xmm7, xmm5 + phaddsw xmm6, xmm7 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A + packuswb xmm1, xmm1 // 8 R values + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm1, xmm6 // 8 RA values + movdqa xmm6, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm1 // BGRA first 4 + punpckhwd xmm6, xmm1 // BGRA next 4 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm6 + lea eax, [eax + 32] + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + +#ifdef HAS_ARGBQUANTIZEROW_SSE2 +// Quantize 4 ARGB pixels (16 bytes). +__declspec(naked) +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + movd xmm2, [esp + 8] /* scale */ + movd xmm3, [esp + 12] /* interval_size */ + movd xmm4, [esp + 16] /* interval_offset */ + mov ecx, [esp + 20] /* width */ + pshuflw xmm2, xmm2, 040h + pshufd xmm2, xmm2, 044h + pshuflw xmm3, xmm3, 040h + pshufd xmm3, xmm3, 044h + pshuflw xmm4, xmm4, 040h + pshufd xmm4, xmm4, 044h + pxor xmm5, xmm5 // constant 0 + pcmpeqb xmm6, xmm6 // generate mask 0xff000000 + pslld xmm6, 24 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + punpcklbw xmm0, xmm5 // first 2 pixels + pmulhuw xmm0, xmm2 // pixel * scale >> 16 + movdqu xmm1, [eax] // read 4 pixels + punpckhbw xmm1, xmm5 // next 2 pixels + pmulhuw xmm1, xmm2 + pmullw xmm0, xmm3 // * interval_size + movdqu xmm7, [eax] // read 4 pixels + pmullw xmm1, xmm3 + pand xmm7, xmm6 // mask alpha + paddw xmm0, xmm4 // + interval_size / 2 + paddw xmm1, xmm4 + packuswb xmm0, xmm1 + por xmm0, xmm7 + movdqu [eax], xmm0 + lea eax, [eax + 16] + sub ecx, 4 + jg convertloop + ret + } +} +#endif // HAS_ARGBQUANTIZEROW_SSE2 + +#ifdef HAS_ARGBSHADEROW_SSE2 +// Shade 4 pixels at a time by specified value. 
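+// (ARGBShadeRow below, like ARGBMultiplyRow further down, relies on the
+// byte-repeat trick: dst = ((s * 0x101) * (v * 0x101)) >> 24, which closely
+// approximates s * v / 255 for each channel.)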
+__declspec(naked) +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + movd xmm2, [esp + 16] // value + punpcklbw xmm2, xmm2 + punpcklqdq xmm2, xmm2 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + pmulhuw xmm0, xmm2 // argb * value + pmulhuw xmm1, xmm2 // argb * value + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + ret + } +} +#endif // HAS_ARGBSHADEROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. +__declspec(naked) +void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pxor xmm5, xmm5 // constant 0 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm2, [esi] // read 4 pixels from src_argb1 + movdqu xmm1, xmm0 + movdqu xmm3, xmm2 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + punpcklbw xmm2, xmm5 // first 2 + punpckhbw xmm3, xmm5 // next 2 + pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + lea eax, [eax + 16] + lea esi, [esi + 16] + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_ARGBMULTIPLYROW_SSE2 + +#ifdef HAS_ARGBADDROW_SSE2 +// Add 2 rows of ARGB pixels together, 4 pixels at a time. +// TODO(fbarchard): Port this to posix, neon and other math functions. +__declspec(naked) +void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + sub ecx, 4 + jl convertloop49 + + convertloop4: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea esi, [esi + 16] + paddusb xmm0, xmm1 // src_argb0 + src_argb1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jge convertloop4 + + convertloop49: + add ecx, 4 - 1 + jl convertloop19 + + convertloop1: + movd xmm0, [eax] // read 1 pixels from src_argb0 + lea eax, [eax + 4] + movd xmm1, [esi] // read 1 pixels from src_argb1 + lea esi, [esi + 4] + paddusb xmm0, xmm1 // src_argb0 + src_argb1 + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge convertloop1 + + convertloop19: + pop esi + ret + } +} +#endif // HAS_ARGBADDROW_SSE2 + +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +// Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 
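+// (psubusb saturates each byte at 0, so per-channel differences never wrap;
+// alpha is subtracted and saturated the same way.)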
+__declspec(naked) +void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + convertloop: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea esi, [esi + 16] + psubusb xmm0, xmm1 // src_argb0 - src_argb1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_ARGBSUBTRACTROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +__declspec(naked) +void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + vpxor ymm5, ymm5, ymm5 // constant 0 + + convertloop: + vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 + lea esi, [esi + 32] + vpunpcklbw ymm0, ymm1, ymm1 // low 4 + vpunpckhbw ymm1, ymm1, ymm1 // high 4 + vpunpcklbw ymm2, ymm3, ymm5 // low 4 + vpunpckhbw ymm3, ymm3, ymm5 // high 4 + vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 + vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 + vpackuswb ymm0, ymm0, ymm1 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBMULTIPLYROW_AVX2 + +#ifdef HAS_ARGBADDROW_AVX2 +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +__declspec(naked) +void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + convertloop: + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBADDROW_AVX2 + +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +// Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
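+// (Same math as the SSE2 version, 8 pixels per iteration, with the second
+// source row used directly as a memory operand of vpsubusb.)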
+__declspec(naked) +void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + convertloop: + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBSUBTRACTROW_AVX2 + +#ifdef HAS_SOBELXROW_SSE2 +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +__declspec(naked) +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y0 + mov esi, [esp + 8 + 8] // src_y1 + mov edi, [esp + 8 + 12] // src_y2 + mov edx, [esp + 8 + 16] // dst_sobelx + mov ecx, [esp + 8 + 20] // width + sub esi, eax + sub edi, eax + sub edx, eax + pxor xmm5, xmm5 // constant 0 + + convertloop: + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + psubw xmm0, xmm1 + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + psubw xmm1, xmm2 + movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] + movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + psubw xmm2, xmm3 + paddw xmm0, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm1 + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [eax + edx], xmm0 + lea eax, [eax + 8] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_SOBELXROW_SSE2 + +#ifdef HAS_SOBELYROW_SSE2 +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +__declspec(naked) +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_y0 + mov esi, [esp + 4 + 8] // src_y1 + mov edx, [esp + 4 + 12] // dst_sobely + mov ecx, [esp + 4 + 16] // width + sub esi, eax + sub edx, eax + pxor xmm5, xmm5 // constant 0 + + convertloop: + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + psubw xmm0, xmm1 + movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] + movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + psubw xmm1, xmm2 + movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + psubw xmm2, xmm3 + paddw xmm0, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm1 + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [eax + edx], xmm0 + lea eax, [eax + 8] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELYROW_SSE2 + +#ifdef HAS_SOBELROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. 
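+// (Saturating byte add: s = min(sobelx + sobely, 255), splatted to B, G, R:)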
+// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +__declspec(naked) +void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + pcmpeqb xmm5, xmm5 // alpha 255 + pslld xmm5, 24 // 0xff000000 + + convertloop: + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + paddusb xmm0, xmm1 // sobel = sobelx + sobely + movdqa xmm2, xmm0 // GG + punpcklbw xmm2, xmm0 // First 8 + punpckhbw xmm0, xmm0 // Next 8 + movdqa xmm1, xmm2 // GGGG + punpcklwd xmm1, xmm2 // First 4 + punpckhwd xmm2, xmm2 // Next 4 + por xmm1, xmm5 // GGGA + por xmm2, xmm5 + movdqa xmm3, xmm0 // GGGG + punpcklwd xmm3, xmm0 // Next 4 + punpckhwd xmm0, xmm0 // Last 4 + por xmm3, xmm5 // GGGA + por xmm0, xmm5 + movdqu [edx], xmm1 + movdqu [edx + 16], xmm2 + movdqu [edx + 32], xmm3 + movdqu [edx + 48], xmm0 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELROW_SSE2 + +#ifdef HAS_SOBELTOPLANEROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. +__declspec(naked) +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + + convertloop: + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + paddusb xmm0, xmm1 // sobel = sobelx + sobely + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELTOPLANEROW_SSE2 + +#ifdef HAS_SOBELXYROW_SSE2 +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +__declspec(naked) +void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + pcmpeqb xmm5, xmm5 // alpha 255 + + convertloop: + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + movdqa xmm2, xmm0 + paddusb xmm2, xmm1 // sobel = sobelx + sobely + movdqa xmm3, xmm0 // XA + punpcklbw xmm3, xmm5 + punpckhbw xmm0, xmm5 + movdqa xmm4, xmm1 // YS + punpcklbw xmm4, xmm2 + punpckhbw xmm1, xmm2 + movdqa xmm6, xmm4 // YSXA + punpcklwd xmm6, xmm3 // First 4 + punpckhwd xmm4, xmm3 // Next 4 + movdqa xmm7, xmm1 // YSXA + punpcklwd xmm7, xmm0 // Next 4 + punpckhwd xmm1, xmm0 // Last 4 + movdqu [edx], xmm6 + movdqu [edx + 16], xmm4 + movdqu [edx + 32], xmm7 + movdqu [edx + 48], xmm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELXYROW_SSE2 + +#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +// Consider float CumulativeSum. +// Consider calling CumulativeSum one row at time as needed. +// Consider circular CumulativeSum buffer of radius * 2 + 1 height. +// Convert cumulative sum for an area to an average for 1 pixel. +// topleft is pointer to top left of CumulativeSum buffer for area. 
+// botleft is pointer to bottom left of CumulativeSum buffer. +// width is offset from left to right of area in CumulativeSum buffer measured +// in number of ints. +// area is the number of pixels in the area being averaged. +// dst points to pixel to store result to. +// count is number of averaged pixels to produce. +// Does 4 pixels at a time. +void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, + int count) { + __asm { + mov eax, topleft // eax topleft + mov esi, botleft // esi botleft + mov edx, width + movd xmm5, area + mov edi, dst + mov ecx, count + cvtdq2ps xmm5, xmm5 + rcpss xmm4, xmm5 // 1.0f / area + pshufd xmm4, xmm4, 0 + sub ecx, 4 + jl l4b + + cmp area, 128 // 128 pixels will not overflow 15 bits. + ja l4 + + pshufd xmm5, xmm5, 0 // area + pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 + psrld xmm6, 16 + cvtdq2ps xmm6, xmm6 + addps xmm5, xmm6 // (65536.0 + area - 1) + mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area + cvtps2dq xmm5, xmm5 // 0.16 fixed point + packssdw xmm5, xmm5 // 16 bit shorts + + // 4 pixel loop small blocks. + s4: + // top left + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + packssdw xmm0, xmm1 // pack 4 pixels into 2 registers + packssdw xmm2, xmm3 + + pmulhuw xmm0, xmm5 + pmulhuw xmm2, xmm5 + + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge s4 + + jmp l4b + + // 4 pixel loop + l4: + // top left + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area + cvtdq2ps xmm1, xmm1 + mulps xmm0, xmm4 + mulps xmm1, xmm4 + cvtdq2ps xmm2, xmm2 + cvtdq2ps xmm3, xmm3 + mulps xmm2, xmm4 + mulps xmm3, xmm4 + cvtps2dq xmm0, xmm0 + cvtps2dq xmm1, xmm1 + cvtps2dq xmm2, xmm2 + cvtps2dq xmm3, xmm3 + packssdw xmm0, xmm1 + packssdw xmm2, xmm3 + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + l1: + movdqu xmm0, [eax] + psubd xmm0, [eax + edx * 4] + lea eax, [eax + 16] + psubd xmm0, [esi] + paddd xmm0, [esi + edx * 4] + lea esi, [esi + 16] + cvtdq2ps xmm0, xmm0 + mulps xmm0, xmm4 + cvtps2dq xmm0, xmm0 + packssdw xmm0, xmm0 + packuswb xmm0, xmm0 + movd dword ptr [edi], xmm0 + lea edi, [edi + 4] + sub ecx, 1 + jge l1 + l1b: + } +} +#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 + +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 +// Creates a table of cumulative sums where each value is a sum of all values +// above 
and to the left of the value. +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) { + __asm { + mov eax, row + mov edx, cumsum + mov esi, previous_cumsum + mov ecx, width + pxor xmm0, xmm0 + pxor xmm1, xmm1 + + sub ecx, 4 + jl l4b + test edx, 15 + jne l4b + + // 4 pixel loop + l4: + movdqu xmm2, [eax] // 4 argb pixels 16 bytes. + lea eax, [eax + 16] + movdqa xmm4, xmm2 + + punpcklbw xmm2, xmm1 + movdqa xmm3, xmm2 + punpcklwd xmm2, xmm1 + punpckhwd xmm3, xmm1 + + punpckhbw xmm4, xmm1 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm1 + punpckhwd xmm5, xmm1 + + paddd xmm0, xmm2 + movdqu xmm2, [esi] // previous row above. + paddd xmm2, xmm0 + + paddd xmm0, xmm3 + movdqu xmm3, [esi + 16] + paddd xmm3, xmm0 + + paddd xmm0, xmm4 + movdqu xmm4, [esi + 32] + paddd xmm4, xmm0 + + paddd xmm0, xmm5 + movdqu xmm5, [esi + 48] + lea esi, [esi + 64] + paddd xmm5, xmm0 + + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + movdqu [edx + 32], xmm4 + movdqu [edx + 48], xmm5 + + lea edx, [edx + 64] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + l1: + movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. + lea eax, [eax + 4] + punpcklbw xmm2, xmm1 + punpcklwd xmm2, xmm1 + paddd xmm0, xmm2 + movdqu xmm2, [esi] + lea esi, [esi + 16] + paddd xmm2, xmm0 + movdqu [edx], xmm2 + lea edx, [edx + 16] + sub ecx, 1 + jge l1 + + l1b: + } +} +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 + +#ifdef HAS_ARGBAFFINEROW_SSE2 +// Copy ARGB pixels from source image with slope to a row of destination. +__declspec(naked) +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width) { + __asm { + push esi + push edi + mov eax, [esp + 12] // src_argb + mov esi, [esp + 16] // stride + mov edx, [esp + 20] // dst_argb + mov ecx, [esp + 24] // pointer to uv_dudv + movq xmm2, qword ptr [ecx] // uv + movq xmm7, qword ptr [ecx + 8] // dudv + mov ecx, [esp + 28] // width + shl esi, 16 // 4, stride + add esi, 4 + movd xmm5, esi + sub ecx, 4 + jl l4b + + // setup for 4 pixel loop + pshufd xmm7, xmm7, 0x44 // dup dudv + pshufd xmm5, xmm5, 0 // dup 4, stride + movdqa xmm0, xmm2 // x0, y0, x1, y1 + addps xmm0, xmm7 + movlhps xmm2, xmm0 + movdqa xmm4, xmm7 + addps xmm4, xmm4 // dudv *= 2 + movdqa xmm3, xmm2 // x2, y2, x3, y3 + addps xmm3, xmm4 + addps xmm4, xmm4 // dudv *= 4 + + // 4 pixel loop + l4: + cvttps2dq xmm0, xmm2 // x, y float to int first 2 + cvttps2dq xmm1, xmm3 // x, y float to int next 2 + packssdw xmm0, xmm1 // x, y as 8 shorts + pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. 
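+ // Extract the four packed offsets one at a time (movd plus pshufd 0x39
+ // rotates); effectively a manual gather of the four source pixels.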
+ movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd edi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd xmm1, [eax + esi] // read pixel 0 + movd xmm6, [eax + edi] // read pixel 1 + punpckldq xmm1, xmm6 // combine pixel 0 and 1 + addps xmm2, xmm4 // x, y += dx, dy first 2 + movq qword ptr [edx], xmm1 + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd edi, xmm0 + movd xmm6, [eax + esi] // read pixel 2 + movd xmm0, [eax + edi] // read pixel 3 + punpckldq xmm6, xmm0 // combine pixel 2 and 3 + addps xmm3, xmm4 // x, y += dx, dy next 2 + movq qword ptr 8[edx], xmm6 + lea edx, [edx + 16] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + l1: + cvttps2dq xmm0, xmm2 // x, y float to int + packssdw xmm0, xmm0 // x, y as shorts + pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride + addps xmm2, xmm7 // x, y += dx, dy + movd esi, xmm0 + movd xmm0, [eax + esi] // copy a pixel + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge l1 + l1b: + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBAFFINEROW_SSE2 + +#ifdef HAS_INTERPOLATEROW_AVX2 +// Bilinear filter 32x2 -> 32x1 +__declspec(naked) +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + shr eax, 1 + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 128. Blend 100 / 0. + sub edi, esi + cmp eax, 32 + je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. + cmp eax, 64 + je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. + cmp eax, 96 + je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. + + vmovd xmm0, eax // high fraction 0..127 + neg eax + add eax, 128 + vmovd xmm5, eax // low fraction 128..1 + vpunpcklbw xmm5, xmm5, xmm0 + vpunpcklwd xmm5, xmm5, xmm5 + vpxor ymm0, ymm0, ymm0 + vpermd ymm5, ymm0, ymm5 + + xloop: + vmovdqu ymm0, [esi] + vmovdqu ymm2, [esi + edx] + vpunpckhbw ymm1, ymm0, ymm2 // mutates + vpunpcklbw ymm0, ymm0, ymm2 // mutates + vpmaddubsw ymm0, ymm0, ymm5 + vpmaddubsw ymm1, ymm1, ymm5 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm1, ymm1, 7 + vpackuswb ymm0, ymm0, ymm1 // unmutates + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop + jmp xloop99 + + // Blend 25 / 75. + xloop25: + vmovdqu ymm0, [esi] + vmovdqu ymm1, [esi + edx] + vpavgb ymm0, ymm0, ymm1 + vpavgb ymm0, ymm0, ymm1 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + xloop50: + vmovdqu ymm0, [esi] + vpavgb ymm0, ymm0, [esi + edx] + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + xloop75: + vmovdqu ymm1, [esi] + vmovdqu ymm0, [esi + edx] + vpavgb ymm0, ymm0, ymm1 + vpavgb ymm0, ymm0, ymm1 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. 
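+ // (This path branches out before the "sub edi, esi" above, so esi/edi
+ // still hold the real source/destination pointers and ecx the byte count
+ // that rep movsb consumes.)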
+ xloop100: + rep movsb + + xloop99: + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_INTERPOLATEROW_AVX2 + +// Bilinear filter 16x2 -> 16x1 +__declspec(naked) +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + shr eax, 1 + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 128. Blend 100 / 0. + cmp eax, 32 + je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. + cmp eax, 64 + je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. + cmp eax, 96 + je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. + + movd xmm0, eax // high fraction 0..127 + neg eax + add eax, 128 + movd xmm5, eax // low fraction 128..1 + punpcklbw xmm5, xmm0 + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + + xloop: + movdqu xmm0, [esi] + movdqu xmm2, [esi + edx] + movdqu xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm5 + pmaddubsw xmm1, xmm5 + psrlw xmm0, 7 + psrlw xmm1, 7 + packuswb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop + jmp xloop99 + + // Blend 25 / 75. + xloop25: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + xloop50: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + xloop75: + movdqu xmm1, [esi] + movdqu xmm0, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + xloop100: + movdqu xmm0, [esi] + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop100 + + xloop99: + pop edi + pop esi + ret + } +} + +#ifdef HAS_INTERPOLATEROW_SSE2 +// Bilinear filter 16x2 -> 16x1 +__declspec(naked) +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 256. Blend 100 / 0. + cmp eax, 64 + je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. + cmp eax, 128 + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + cmp eax, 192 + je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. 
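+ // General case: dst = row0 + (row1 - row0) * source_y_fraction / 256,
+ // computed in 16-bit lanes: the fraction is expanded to (f << 7) words and
+ // applied to the doubled 9-bit row differences via pmulhw.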
+ + movd xmm5, eax // xmm5 = y fraction + punpcklbw xmm5, xmm5 + psrlw xmm5, 1 + punpcklwd xmm5, xmm5 + punpckldq xmm5, xmm5 + punpcklqdq xmm5, xmm5 + pxor xmm4, xmm4 + + xloop: + movdqu xmm0, [esi] // row0 + movdqu xmm2, [esi + edx] // row1 + movdqu xmm1, xmm0 + movdqu xmm3, xmm2 + punpcklbw xmm2, xmm4 + punpckhbw xmm3, xmm4 + punpcklbw xmm0, xmm4 + punpckhbw xmm1, xmm4 + psubw xmm2, xmm0 // row1 - row0 + psubw xmm3, xmm1 + paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 + paddw xmm3, xmm3 + pmulhw xmm2, xmm5 // scale diff + pmulhw xmm3, xmm5 + paddw xmm0, xmm2 // sum rows + paddw xmm1, xmm3 + packuswb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop + jmp xloop99 + + // Blend 25 / 75. + xloop25: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + xloop50: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + xloop75: + movdqu xmm1, [esi] + movdqu xmm0, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + xloop100: + movdqu xmm0, [esi] + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop100 + + xloop99: + pop edi + pop esi + ret + } +} +#endif // HAS_INTERPOLATEROW_SSE2 + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +__declspec(naked) +void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + movdqu xmm5, [ecx] + mov ecx, [esp + 16] // pix + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pshufb xmm0, xmm5 + pshufb xmm1, xmm5 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg wloop + ret + } +} + +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +__declspec(naked) +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. + mov ecx, [esp + 16] // pix + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpshufb ymm0, ymm0, ymm5 + vpshufb ymm1, ymm1, ymm5 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg wloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBSHUFFLEROW_AVX2 + +__declspec(naked) +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + __asm { + push ebx + push esi + mov eax, [esp + 8 + 4] // src_argb + mov edx, [esp + 8 + 8] // dst_argb + mov esi, [esp + 8 + 12] // shuffler + mov ecx, [esp + 8 + 16] // pix + pxor xmm5, xmm5 + + mov ebx, [esi] // shuffler + cmp ebx, 0x03000102 + je shuf_3012 + cmp ebx, 0x00010203 + je shuf_0123 + cmp ebx, 0x00030201 + je shuf_0321 + cmp ebx, 0x02010003 + je shuf_2103 + + // TODO(fbarchard): Use one source pointer and 3 offsets. 
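+ // Generic fallback: translate one pixel at a time, each output byte taken
+ // from the source pixel at the index given by the 4-byte shuffler.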
+ shuf_any1: + movzx ebx, byte ptr [esi] + movzx ebx, byte ptr [eax + ebx] + mov [edx], bl + movzx ebx, byte ptr [esi + 1] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 1], bl + movzx ebx, byte ptr [esi + 2] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 2], bl + movzx ebx, byte ptr [esi + 3] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 3], bl + lea eax, [eax + 4] + lea edx, [edx + 4] + sub ecx, 1 + jg shuf_any1 + jmp shuf99 + + shuf_0123: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB + pshuflw xmm0, xmm0, 01Bh + pshufhw xmm1, xmm1, 01Bh + pshuflw xmm1, xmm1, 01Bh + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg shuf_0123 + jmp shuf99 + + shuf_0321: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB + pshuflw xmm0, xmm0, 039h + pshufhw xmm1, xmm1, 039h + pshuflw xmm1, xmm1, 039h + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg shuf_0321 + jmp shuf99 + + shuf_2103: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA + pshuflw xmm0, xmm0, 093h + pshufhw xmm1, xmm1, 093h + pshuflw xmm1, xmm1, 093h + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg shuf_2103 + jmp shuf99 + + shuf_3012: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB + pshuflw xmm0, xmm0, 0C6h + pshufhw xmm1, xmm1, 0C6h + pshuflw xmm1, xmm1, 0C6h + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg shuf_3012 + + shuf99: + pop esi + pop ebx + ret + } +} + +// YUY2 - Macro-pixel = 2 image pixels +// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... 
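+// (Each 4-byte macro-pixel carries two luma samples plus one shared U/V
+// pair, i.e. 4:2:2 subsampling.)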
+ +// UYVY - Macro-pixel = 2 image pixels +// U0Y0V0Y1 + +__declspec(naked) +void I422ToYUY2Row_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 // YUYV + punpckhbw xmm1, xmm2 + movdqu [edi], xmm0 + movdqu [edi + 16], xmm1 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void I422ToUYVYRow_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y + movdqa xmm1, xmm2 + lea eax, [eax + 16] + punpcklbw xmm1, xmm0 // UYVY + punpckhbw xmm2, xmm0 + movdqu [edi], xmm1 + movdqu [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +__declspec(naked) +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* src_argb */ + mov edx, [esp + 4 + 8] /* dst_argb */ + mov esi, [esp + 4 + 12] /* poly */ + mov ecx, [esp + 4 + 16] /* width */ + pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. + + // 2 pixel loop. 
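+ // Evaluates dst = clamp(C0 + C1*x + C2*x^2 + C3*x^3) per channel in float,
+ // then saturates back to bytes; the four coefficient vectors live at
+ // poly[0], poly[4], poly[8] and poly[12].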
+ convertloop:
+// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
+// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
+ movq xmm0, qword ptr [eax] // BGRABGRA
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm3
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3 // pixel 0
+ punpckhwd xmm4, xmm3 // pixel 1
+ cvtdq2ps xmm0, xmm0 // 4 floats
+ cvtdq2ps xmm4, xmm4
+ movdqa xmm1, xmm0 // X
+ movdqa xmm5, xmm4
+ mulps xmm0, [esi + 16] // C1 * X
+ mulps xmm4, [esi + 16]
+ addps xmm0, [esi] // result = C0 + C1 * X
+ addps xmm4, [esi]
+ movdqa xmm2, xmm1
+ movdqa xmm6, xmm5
+ mulps xmm2, xmm1 // X * X
+ mulps xmm6, xmm5
+ mulps xmm1, xmm2 // X * X * X
+ mulps xmm5, xmm6
+ mulps xmm2, [esi + 32] // C2 * X * X
+ mulps xmm6, [esi + 32]
+ mulps xmm1, [esi + 48] // C3 * X * X * X
+ mulps xmm5, [esi + 48]
+ addps xmm0, xmm2 // result += C2 * X * X
+ addps xmm4, xmm6
+ addps xmm0, xmm1 // result += C3 * X * X * X
+ addps xmm4, xmm5
+ cvttps2dq xmm0, xmm0
+ cvttps2dq xmm4, xmm4
+ packuswb xmm0, xmm4
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 2
+ jg convertloop
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+__declspec(naked)
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* poly */
+ vbroadcastf128 ymm4, [ecx] // C0
+ vbroadcastf128 ymm5, [ecx + 16] // C1
+ vbroadcastf128 ymm6, [ecx + 32] // C2
+ vbroadcastf128 ymm7, [ecx + 48] // C3
+ mov ecx, [esp + 16] /* width */
+
+ // 2 pixel loop.
+ convertloop:
+ vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
+ lea eax, [eax + 8]
+ vcvtdq2ps ymm0, ymm0 // X 8 floats
+ vmulps ymm2, ymm0, ymm0 // X * X
+ vmulps ymm3, ymm0, ymm7 // C3 * X
+ vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
+ vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
+ vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
+ vcvttps2dq ymm0, ymm0
+ vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
+ vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
+ vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
+ vmovq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 2
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+__declspec(naked)
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
+
+ // 1 pixel loop.
+ convertloop:
+ movzx edx, byte ptr [eax]
+ lea eax, [eax + 4]
+ movzx edx, byte ptr [esi + edx * 4]
+ mov byte ptr [eax - 4], dl
+ movzx edx, byte ptr [eax - 4 + 1]
+ movzx edx, byte ptr [esi + edx * 4 + 1]
+ mov byte ptr [eax - 4 + 1], dl
+ movzx edx, byte ptr [eax - 4 + 2]
+ movzx edx, byte ptr [esi + edx * 4 + 2]
+ mov byte ptr [eax - 4 + 2], dl
+ movzx edx, byte ptr [eax - 4 + 3]
+ movzx edx, byte ptr [esi + edx * 4 + 3]
+ mov byte ptr [eax - 4 + 3], dl
+ dec ecx
+ jg convertloop
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table.
+__declspec(naked)
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
+
+ // 1 pixel loop.
+ convertloop:
+ movzx edx, byte ptr [eax]
+ lea eax, [eax + 4]
+ movzx edx, byte ptr [esi + edx * 4]
+ mov byte ptr [eax - 4], dl
+ movzx edx, byte ptr [eax - 4 + 1]
+ movzx edx, byte ptr [esi + edx * 4 + 1]
+ mov byte ptr [eax - 4 + 1], dl
+ movzx edx, byte ptr [eax - 4 + 2]
+ movzx edx, byte ptr [esi + edx * 4 + 2]
+ mov byte ptr [eax - 4 + 2], dl
+ dec ecx
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table.
+__declspec(naked)
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width,
+ const uint8* luma, uint32 lumacoeff) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] /* src_argb */
+ mov edi, [esp + 8 + 8] /* dst_argb */
+ mov ecx, [esp + 8 + 12] /* width */
+ movd xmm2, dword ptr [esp + 8 + 16] // luma table
+ movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
+ pshufd xmm2, xmm2, 0
+ pshufd xmm3, xmm3, 0
+ pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
+ psllw xmm4, 8
+ pxor xmm5, xmm5
+
+ // 4 pixel loop.
+ convertloop:
+ movdqu xmm0, qword ptr [eax] // generate luma ptr
+ pmaddubsw xmm0, xmm3
+ phaddw xmm0, xmm0
+ pand xmm0, xmm4 // mask out low bits
+ punpcklwd xmm0, xmm5
+ paddd xmm0, xmm2 // add table base
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi], dl
+ movzx edx, byte ptr [eax + 1]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 1], dl
+ movzx edx, byte ptr [eax + 2]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 2], dl
+ movzx edx, byte ptr [eax + 3] // copy alpha.
+ mov byte ptr [edi + 3], dl
+
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax + 4]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 4], dl
+ movzx edx, byte ptr [eax + 5]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 5], dl
+ movzx edx, byte ptr [eax + 6]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 6], dl
+ movzx edx, byte ptr [eax + 7] // copy alpha.
+ mov byte ptr [edi + 7], dl
+
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax + 8]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 8], dl
+ movzx edx, byte ptr [eax + 9]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 9], dl
+ movzx edx, byte ptr [eax + 10]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 10], dl
+ movzx edx, byte ptr [eax + 11] // copy alpha.
+ mov byte ptr [edi + 11], dl
+
+ movd esi, xmm0
+
+ movzx edx, byte ptr [eax + 12]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 12], dl
+ movzx edx, byte ptr [eax + 13]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 13], dl
+ movzx edx, byte ptr [eax + 14]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 14], dl
+ movzx edx, byte ptr [eax + 15] // copy alpha.
+ mov byte ptr [edi + 15], dl + + lea eax, [eax + 16] + lea edi, [edi + 16] + sub ecx, 4 + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + +#endif // defined(_M_X64) +#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/row_x86.asm b/libs/libaom/src/third_party/libyuv/source/row_x86.asm new file mode 100644 index 000000000..0cb326f8e --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/row_x86.asm @@ -0,0 +1,146 @@ +; +; Copyright 2012 The LibYuv Project Authors. All rights reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%ifdef __YASM_VERSION_ID__ +%if __YASM_VERSION_ID__ < 01020000h +%error AVX2 is supported only by yasm 1.2.0 or later. +%endif +%endif +%include "x86inc.asm" + +SECTION .text + +; cglobal numeric constants are parameters, gpr regs, mm regs + +; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) + +%macro YUY2TOYROW 2-3 +cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix +%ifidn %1,YUY2 + pcmpeqb m2, m2, m2 ; generate mask 0x00ff00ff + psrlw m2, m2, 8 +%endif + + ALIGN 4 +.convertloop: + mov%2 m0, [src_yuy2q] + mov%2 m1, [src_yuy2q + mmsize] + lea src_yuy2q, [src_yuy2q + mmsize * 2] +%ifidn %1,YUY2 + pand m0, m0, m2 ; YUY2 even bytes are Y + pand m1, m1, m2 +%else + psrlw m0, m0, 8 ; UYVY odd bytes are Y + psrlw m1, m1, 8 +%endif + packuswb m0, m0, m1 +%if cpuflag(AVX2) + vpermq m0, m0, 0xd8 +%endif + sub pixd, mmsize + mov%2 [dst_yq], m0 + lea dst_yq, [dst_yq + mmsize] + jg .convertloop + REP_RET +%endmacro + +; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version. 
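The YUY2TOYROW macro above relies on the byte layout of the two packed 4:2:2 formats: YUY2 stores luma in the even bytes (Y0 U Y1 V) and UYVY in the odd bytes (U Y0 V Y1), which is why the loop either masks with 0x00ff or shifts right by 8 before packing. A scalar C sketch of the same extraction for comparison (hypothetical helper name; width is the number of Y samples):

/* Extract luma from YUY2 by keeping every second byte, starting at 0. */
static void YUY2ToYRow_Sketch(const unsigned char* src_yuy2,
                              unsigned char* dst_y, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = src_yuy2[x * 2];  /* UYVY would read src_yuy2[x * 2 + 1] */
  }
}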
+INIT_MMX MMX
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_XMM SSE2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_YMM AVX2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW UYVY,a,
+
+; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
+
+%macro SplitUVRow 1-2
+cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
+ pcmpeqb m4, m4, m4 ; generate mask 0x00ff00ff
+ psrlw m4, m4, 8
+ sub dst_vq, dst_uq
+
+ ALIGN 4
+.convertloop:
+ mov%1 m0, [src_uvq]
+ mov%1 m1, [src_uvq + mmsize]
+ lea src_uvq, [src_uvq + mmsize * 2]
+ psrlw m2, m0, 8 ; odd bytes
+ psrlw m3, m1, 8
+ pand m0, m0, m4 ; even bytes
+ pand m1, m1, m4
+ packuswb m0, m0, m1
+ packuswb m2, m2, m3
+%if cpuflag(AVX2)
+ vpermq m0, m0, 0xd8
+ vpermq m2, m2, 0xd8
+%endif
+ mov%1 [dst_uq], m0
+ mov%1 [dst_uq + dst_vq], m2
+ lea dst_uq, [dst_uq + mmsize]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro
+
+INIT_MMX MMX
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_XMM SSE2
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_YMM AVX2
+SplitUVRow a,
+
+; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+; int width);
+
+%macro MergeUVRow_ 1-2
+cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
+ sub src_vq, src_uq
+
+ ALIGN 4
+.convertloop:
+ mov%1 m0, [src_uq]
+ mov%1 m1, [src_vq]
+ lea src_uq, [src_uq + mmsize]
+ punpcklbw m2, m0, m1 ; first 8 UV pairs
+ punpckhbw m0, m0, m1 ; next 8 UV pairs
+%if cpuflag(AVX2)
+ vperm2i128 m1, m2, m0, 0x20 ; low 128 of ymm2 and low 128 of ymm0
+ vperm2i128 m2, m2, m0, 0x31 ; high 128 of ymm2 and high 128 of ymm0
+ mov%1 [dst_uvq], m1
+ mov%1 [dst_uvq + mmsize], m2
+%else
+ mov%1 [dst_uvq], m2
+ mov%1 [dst_uvq + mmsize], m0
+%endif
+ lea dst_uvq, [dst_uvq + mmsize * 2]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro
+
+INIT_MMX MMX
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_XMM SSE2
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_YMM AVX2
+MergeUVRow_ a,
+
diff --git a/libs/libaom/src/third_party/libyuv/source/scale.cc b/libs/libaom/src/third_party/libyuv/source/scale.cc
new file mode 100644
index 000000000..0a01304c4
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/scale.cc
@@ -0,0 +1,1689 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyPlane
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+
+// Scale plane, 1/2
+// This is an optimized version for scaling down a plane to 1/2 of
+// its original size.
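When box filtering is requested, the row functions selected by the dispatcher below each average a 2x2 block of source pixels per output pixel. A scalar sketch of what the ScaleRowDown2Box_C fallback computes, simplified for illustration (rounds to nearest by adding 2 before the shift):

#include <stddef.h>  /* for ptrdiff_t */

/* Average each 2x2 source block into one destination pixel. */
static void ScaleRowDown2Box_Sketch(const unsigned char* src_ptr,
                                    ptrdiff_t src_stride,
                                    unsigned char* dst_ptr, int dst_width) {
  const unsigned char* s = src_ptr;               /* top row */
  const unsigned char* t = src_ptr + src_stride;  /* bottom row */
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (unsigned char)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}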
+ +static void ScalePlaneDown2(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) = + filtering == kFilterNone ? ScaleRowDown2_C : + (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C); + int row_stride = src_stride << 1; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON : + ScaleRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON : + (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON : + ScaleRowDown2Box_NEON); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 : + ScaleRowDown2Box_Any_SSE2); + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : + ScaleRowDown2Box_SSE2); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 : + ScaleRowDown2Box_Any_AVX2); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 : + ScaleRowDown2Box_AVX2); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && + IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown2 = filtering ? + ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + // TODO(fbarchard): Loop through source height to allow odd height. + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +static void ScalePlaneDown2_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width) = + filtering == kFilterNone ? ScaleRowDown2_16_C : + (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C : + ScaleRowDown2Box_16_C); + int row_stride = src_stride << 1; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + +#if defined(HAS_SCALEROWDOWN2_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON : + ScaleRowDown2_16_NEON; + } +#endif +#if defined(HAS_SCALEROWDOWN2_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 : + (filtering == kFilterLinear ? 
ScaleRowDown2Linear_16_SSE2 : + ScaleRowDown2Box_16_SSE2); + } +#endif +#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && + IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown2 = filtering ? + ScaleRowDown2Box_16_MIPS_DSPR2 : ScaleRowDown2_16_MIPS_DSPR2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + // TODO(fbarchard): Loop through source height to allow odd height. + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane, 1/4 +// This is an optimized version for scaling down a plane to 1/4 of +// its original size. + +static void ScalePlaneDown4(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) = + filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; + int row_stride = src_stride << 2; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } +#if defined(HAS_SCALEROWDOWN4_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2; + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +static void ScalePlaneDown4_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width) = + filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; + int row_stride = src_stride << 2; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } +#if defined(HAS_SCALEROWDOWN4_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON : + ScaleRowDown4_16_NEON; + } +#endif +#if defined(HAS_SCALEROWDOWN4_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? 
ScaleRowDown4Box_16_SSE2 : + ScaleRowDown4_16_SSE2; + } +#endif +#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_16_MIPS_DSPR2 : ScaleRowDown4_16_MIPS_DSPR2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane down, 3/4 + +static void ScalePlaneDown34(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_C; + ScaleRowDown34_1 = ScaleRowDown34_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; + } +#if defined(HAS_SCALEROWDOWN34_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_NEON; + ScaleRowDown34_1 = ScaleRowDown34_Any_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON; + } + if (dst_width % 24 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_NEON; + ScaleRowDown34_1 = ScaleRowDown34_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3; + } + if (dst_width % 24 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, + dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + 
src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); + } +} + +static void ScalePlaneDown34_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_C; + ScaleRowDown34_1 = ScaleRowDown34_16_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C; + } +#if defined(HAS_SCALEROWDOWN34_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_NEON; + ScaleRowDown34_1 = ScaleRowDown34_16_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_16_MIPS_DSPR2; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_MIPS_DSPR2; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, + dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); + } +} + + +// Scale plane, 3/8 +// This is an optimized version for scaling down a plane to 3/8 +// of its original size. 
+// +// Uses box filter arranges like this +// aaabbbcc -> abc +// aaabbbcc def +// aaabbbcc ghi +// dddeeeff +// dddeeeff +// dddeeeff +// ggghhhii +// ggghhhii +// Boxes are 3x3, 2x3, 3x2 and 2x2 + +static void ScalePlaneDown38(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_C; + ScaleRowDown38_2 = ScaleRowDown38_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; + } + +#if defined(HAS_SCALEROWDOWN38_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_NEON; + ScaleRowDown38_2 = ScaleRowDown38_Any_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_NEON; + ScaleRowDown38_2 = ScaleRowDown38_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3; + } + if (dst_width % 12 == 0 && !filtering) { + ScaleRowDown38_3 = ScaleRowDown38_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_SSSE3; + } + if (dst_width % 6 == 0 && filtering) { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } +} + +static void ScalePlaneDown38_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int y; + void 
(*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_C; + ScaleRowDown38_2 = ScaleRowDown38_16_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C; + } +#if defined(HAS_SCALEROWDOWN38_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_NEON; + ScaleRowDown38_2 = ScaleRowDown38_16_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_16_MIPS_DSPR2; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_MIPS_DSPR2; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } +} + +#define MIN1(x) ((x) < 1 ? 
1 : (x)) + +static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { + uint32 sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) { + uint32 sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int i; + int scaletbl[2]; + int minboxwidth = dx >> 16; + int* scaleptr = scaletbl - minboxwidth; + int boxwidth; + scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); + scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = MIN1((x >> 16) - ix); + *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; + } +} + +static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, + const uint32* src_ptr, uint16* dst_ptr) { + int i; + int scaletbl[2]; + int minboxwidth = dx >> 16; + int* scaleptr = scaletbl - minboxwidth; + int boxwidth; + scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); + scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = MIN1((x >> 16) - ix); + *dst_ptr++ = + SumPixels_16(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; + } +} + +static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int, + const uint16* src_ptr, uint8* dst_ptr) { + int scaleval = 65536 / boxheight; + int i; + src_ptr += (x >> 16); + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = src_ptr[i] * scaleval >> 16; + } +} + +static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int boxwidth = MIN1(dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int i; + x >>= 16; + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, + const uint32* src_ptr, uint16* dst_ptr) { + int boxwidth = MIN1(dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int i; + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +// Scale plane down to any dimensions, with interpolation. +// (boxfilter). +// +// Same method as SimpleScale, which is fixed point, outputting +// one pixel of destination using fixed point (16.16) to step +// through source, sampling a box of pixel with simple +// averaging. +static void ScalePlaneBox(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int j, k; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height << 16); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + { + // Allocate a row buffer of uint16. + align_buffer_64(row16, src_width * 2); + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, + const uint16* src_ptr, uint8* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_C: + ((dx != 0x10000) ? 
ScaleAddCols1_C : ScaleAddCols0_C); + void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) = + ScaleAddRow_C; +#if defined(HAS_SCALEADDROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleAddRow = ScaleAddRow_Any_SSE2; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_SSE2; + } + } +#endif +#if defined(HAS_SCALEADDROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleAddRow = ScaleAddRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + ScaleAddRow = ScaleAddRow_AVX2; + } + } +#endif +#if defined(HAS_SCALEADDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleAddRow = ScaleAddRow_Any_NEON; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_NEON; + } + } +#endif + + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint8* src = src_ptr + iy * src_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + boxheight = MIN1((y >> 16) - iy); + memset(row16, 0, src_width * 2); + for (k = 0; k < boxheight; ++k) { + ScaleAddRow(src, (uint16 *)(row16), src_width); + src += src_stride; + } + ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr); + dst_ptr += dst_stride; + } + free_aligned_buffer_64(row16); + } +} + +static void ScalePlaneBox_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr) { + int j, k; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height << 16); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + { + // Allocate a row buffer of uint32. + align_buffer_64(row32, src_width * 4); + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, + const uint32* src_ptr, uint16* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C; + void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) = + ScaleAddRow_16_C; + +#if defined(HAS_SCALEADDROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_16_SSE2; + } +#endif + + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint16* src = src_ptr + iy * src_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + boxheight = MIN1((y >> 16) - iy); + memset(row32, 0, src_width * 4); + for (k = 0; k < boxheight; ++k) { + ScaleAddRow(src, (uint32 *)(row32), src_width); + src += src_stride; + } + ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr); + dst_ptr += dst_stride; + } + free_aligned_buffer_64(row32); + } +} + +// Scale plane down with bilinear interpolation. +void ScalePlaneBilinearDown(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row buffer. + align_buffer_64(row, src_width); + + const int max_y = (src_height - 1) << 16; + int j; + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = + (src_width >= 32768) ? 
ScaleFilterCols64_C : ScaleFilterCols_C; + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(src_width, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif + + +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleFilterCols_NEON; + } + } +#endif + if (y > max_y) { + y = max_y; + } + + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8* src = src_ptr + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, row, dst_width, x, dx); + } + dst_ptr += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); +} + +void ScalePlaneBilinearDown_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row buffer. + align_buffer_64(row, src_width * 2); + + const int max_y = (src_height - 1) << 16; + int j; + void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) = + (src_width >= 32768) ? 
ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
+ void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_16_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_16_SSE2;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_16_AVX2;
+ if (IS_ALIGNED(src_width, 32)) {
+ InterpolateRow = InterpolateRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_16_NEON;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+ InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
+ if (IS_ALIGNED(src_width, 4)) {
+ InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
+ }
+ }
+#endif
+
+
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+ }
+#endif
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint16* src = src_ptr + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
+ ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+}
+
+// Scale plane up with bilinear interpolation.
+void ScalePlaneBilinearUp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int j;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ const int max_y = (src_height - 1) << 16;
+ void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) =
+ filtering ?
ScaleFilterCols_C : ScaleCols_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif + + if (filtering && src_width >= 32768) { + ScaleFilterCols = ScaleFilterCols64_C; + } +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleFilterCols_NEON; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + { + int yi = y >> 16; + const uint8* src = src_ptr + yi * src_stride; + + // Allocate 2 row buffers. + const int kRowSize = (dst_width + 31) & ~31; + align_buffer_64(row, kRowSize * 2); + + uint8* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); + } + dst_ptr += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +void ScalePlaneBilinearUp_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. 
+ int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height - 1) << 16; + void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_16_C; + void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) = + filtering ? ScaleFilterCols_16_C : ScaleCols_16_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_16_SSE2; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_16_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_16_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_16_NEON; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { + InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_16_MIPS_DSPR2; + } + } +#endif + + if (filtering && src_width >= 32768) { + ScaleFilterCols = ScaleFilterCols64_16_C; + } +#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_16_SSSE3; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_16_C; +#if defined(HAS_SCALECOLS_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleColsUp2_16_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + { + int yi = y >> 16; + const uint16* src = src_ptr + yi * src_stride; + + // Allocate 2 row buffers. + const int kRowSize = (dst_width + 31) & ~31; + align_buffer_64(row, kRowSize * 4); + + uint16* rowptr = (uint16*)row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); + } + dst_ptr += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +// Scale Plane to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. 
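As a worked example of the 16.16 representation described above: scaling a 640-pixel row down to 320 gives dx = 65536 * 640 / 320 = 131072, a step of exactly 2.0, so each output pixel samples every second source column. A scalar sketch of the nearest-pixel column loop, equivalent in spirit to the ScaleCols_C fallback used by ScalePlaneSimple below:

/* Nearest-pixel horizontal scale: the upper 16 bits of x select the
   source column; dx is the per-output step in 16.16 fixed point. */
static void ScaleCols_Sketch(unsigned char* dst_ptr,
                             const unsigned char* src_ptr,
                             int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst_ptr[j] = src_ptr[x >> 16];
    x += dx;
  }
}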
+ +static void ScalePlaneSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int i; + void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = ScaleCols_C; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleCols = ScaleColsUp2_SSE2; + } +#endif + } + + for (i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); + dst_ptr += dst_stride; + y += dy; + } +} + +static void ScalePlaneSimple_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr) { + int i; + void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) = ScaleCols_16_C; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_16_C; +#if defined(HAS_SCALECOLS_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleCols = ScaleColsUp2_16_SSE2; + } +#endif + } + + for (i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, + dst_width, x, dx); + dst_ptr += dst_stride; + y += dy; + } +} + +// Scale a plane. +// This function dispatches to a specialized scaler based on scale factor. + +LIBYUV_API +void ScalePlane(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, + dst_width, dst_height, filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); + return; + } + if (dst_width == src_width && filtering != kFilterBox) { + int dy = FixedDiv(src_height, dst_height); + // Arbitrary scale vertically, but unscaled horizontally. + ScalePlaneVertical(src_height, + dst_width, dst_height, + src_stride, dst_stride, src, dst, + 0, 0, dy, 1, filtering); + return; + } + if (dst_width <= Abs(src_width) && dst_height <= src_height) { + // Scale down. 
+ if (4 * dst_width == 3 * src_width &&
+ 4 * dst_height == 3 * src_height) {
+ // optimized, 3/4
+ ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // optimized, 1/2
+ ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ // 3/8 rounded up for odd sized chroma height.
+ if (8 * dst_width == 3 * src_width &&
+ dst_height == ((src_height * 3 + 7) / 8)) {
+ // optimized, 3/8
+ ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+ (filtering == kFilterBox || filtering == kFilterNone)) {
+ // optimized, 1/4
+ ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ }
+ if (filtering == kFilterBox && dst_height * 2 < src_height) {
+ ScalePlaneBox(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
+ if (filtering && dst_height > src_height) {
+ ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (filtering) {
+ ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+}
+
+LIBYUV_API
+void ScalePlane_16(const uint16* src, int src_stride,
+ int src_width, int src_height,
+ uint16* dst, int dst_stride,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height,
+ dst_width, dst_height, filtering);
+
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ // Use specialized scales to improve performance for common resolutions.
+ // For example, all the 1/2 scalings will use ScalePlaneDown2()
+ if (dst_width == src_width && dst_height == src_height) {
+ // Straight copy.
+ CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
+ return;
+ }
+ if (dst_width == src_width) {
+ int dy = FixedDiv(src_height, dst_height);
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical_16(src_height,
+ dst_width, dst_height,
+ src_stride, dst_stride, src, dst,
+ 0, 0, dy, 1, filtering);
+ return;
+ }
+ if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+ // Scale down.
+ if (4 * dst_width == 3 * src_width &&
+ 4 * dst_height == 3 * src_height) {
+ // optimized, 3/4
+ ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // optimized, 1/2
+ ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ // 3/8 rounded up for odd sized chroma height.
+ if (8 * dst_width == 3 * src_width && + dst_height == ((src_height * 3 + 7) / 8)) { + // optimized, 3/8 + ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (4 * dst_width == src_width && 4 * dst_height == src_height && + filtering != kFilterBilinear) { + // optimized, 1/4 + ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + } + if (filtering == kFilterBox && dst_height * 2 < src_height) { + ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if (filtering && dst_height > src_height) { + ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering) { + ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); +} + +// Scale an I420 image. +// This function in turn calls a scaling function for each plane. + +LIBYUV_API +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, + dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, + filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, + dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, + filtering); + return 0; +} + +LIBYUV_API +int I420Scale_16(const uint16* src_y, int src_stride_y, + const uint16* src_u, int src_stride_u, + const uint16* src_v, int src_stride_v, + int src_width, int src_height, + uint16* dst_y, int dst_stride_y, + uint16* dst_u, int dst_stride_u, + uint16* dst_v, int dst_stride_v, + int dst_width, int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_16(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering); + ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, + dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, + filtering); + ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, + dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, + filtering); + return 0; +} + +// Deprecated api 
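The legacy Scale() entry point below simply forwards to I420Scale(), mapping the boolean interpolate flag to kFilterBox or kFilterNone. A minimal caller-side sketch, assuming the libyuv headers are included and using illustrative buffer sizes that are not part of the API:

/* Halve a 640x360 I420 frame via the deprecated wrapper. Strides equal
   the plane widths because the example buffers are contiguous. */
static int HalveExample(void) {
  static uint8 src_y[640 * 360], src_u[320 * 180], src_v[320 * 180];
  static uint8 dst_y[320 * 180], dst_u[160 * 90], dst_v[160 * 90];
  return Scale(src_y, src_u, src_v, 640, 320, 320, 640, 360,
               dst_y, dst_u, dst_v, 320, 160, 160, 320, 180,
               LIBYUV_TRUE);  /* interpolate -> kFilterBox */
}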
+LIBYUV_API +int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + LIBYUV_BOOL interpolate) { + return I420Scale(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + src_width, src_height, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + dst_width, dst_height, + interpolate ? kFilterBox : kFilterNone); +} + +// Deprecated api +LIBYUV_API +int ScaleOffset(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int dst_yoffset, + LIBYUV_BOOL interpolate) { + // Chroma requires offset to multiple of 2. + int dst_yoffset_even = dst_yoffset & ~1; + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + int aheight = dst_height - dst_yoffset_even * 2; // actual output height + const uint8* src_y = src; + const uint8* src_u = src + src_width * src_height; + const uint8* src_v = src + src_width * src_height + + src_halfwidth * src_halfheight; + uint8* dst_y = dst + dst_yoffset_even * dst_width; + uint8* dst_u = dst + dst_width * dst_height + + (dst_yoffset_even >> 1) * dst_halfwidth; + uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + + (dst_yoffset_even >> 1) * dst_halfwidth; + if (!src || src_width <= 0 || src_height <= 0 || + !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 || + dst_yoffset_even >= dst_height) { + return -1; + } + return I420Scale(src_y, src_width, + src_u, src_halfwidth, + src_v, src_halfwidth, + src_width, src_height, + dst_y, dst_width, + dst_u, dst_halfwidth, + dst_v, dst_halfwidth, + dst_width, aheight, + interpolate ? kFilterBox : kFilterNone); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/scale_any.cc b/libs/libaom/src/third_party/libyuv/source/scale_any.cc new file mode 100644 index 000000000..2f6a2c8ba --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/scale_any.cc @@ -0,0 +1,200 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/scale.h" +#include "libyuv/scale_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols +#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ + void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ + int dst_width, int x, int dx) { \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ + } \ + TERP_C(dst_ptr + n * BPP, src_ptr, \ + dst_width & MASK, x + n * dx, dx); \ + } + +#ifdef HAS_SCALEFILTERCOLS_NEON +CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) +#endif +#ifdef HAS_SCALEARGBCOLS_NEON +CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_NEON +CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON, + ScaleARGBFilterCols_C, 4, 3) +#endif +#undef CANY + +// Fixed scale down. +#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \ + uint8* dst_ptr, int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r); \ + } + +#ifdef HAS_SCALEROWDOWN2_SSE2 +SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2, + ScaleRowDown2Linear_C, 2, 1, 15) +SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C, + 2, 1, 15) +#endif +#ifdef HAS_SCALEROWDOWN2_AVX2 +SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2, + ScaleRowDown2Linear_C, 2, 1, 31) +SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C, + 2, 1, 31) +#endif +#ifdef HAS_SCALEROWDOWN2_NEON +SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON, + ScaleRowDown2Linear_C, 2, 1, 15) +SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON, + ScaleRowDown2Box_C, 2, 1, 15) +#endif +#ifdef HAS_SCALEROWDOWN4_SSE2 +SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C, + 4, 1, 7) +#endif +#ifdef HAS_SCALEROWDOWN4_AVX2 +SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C, + 4, 1, 15) +#endif +#ifdef HAS_SCALEROWDOWN4_NEON +SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C, + 4, 1, 7) +#endif +#ifdef HAS_SCALEROWDOWN34_SSSE3 +SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3, + ScaleRowDown34_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3, + ScaleRowDown34_0_Box_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3, + ScaleRowDown34_1_Box_C, 4 / 3, 1, 23) +#endif +#ifdef HAS_SCALEROWDOWN34_NEON +SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON, + ScaleRowDown34_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON, + ScaleRowDown34_0_Box_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_1_Box_Any_NEON, 
ScaleRowDown34_1_Box_NEON, + ScaleRowDown34_1_Box_C, 4 / 3, 1, 23) +#endif +#ifdef HAS_SCALEROWDOWN38_SSSE3 +SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3, + ScaleRowDown38_C, 8 / 3, 1, 11) +SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3, + ScaleRowDown38_3_Box_C, 8 / 3, 1, 5) +SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3, + ScaleRowDown38_2_Box_C, 8 / 3, 1, 5) +#endif +#ifdef HAS_SCALEROWDOWN38_NEON +SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON, + ScaleRowDown38_C, 8 / 3, 1, 11) +SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON, + ScaleRowDown38_3_Box_C, 8 / 3, 1, 11) +SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON, + ScaleRowDown38_2_Box_C, 8 / 3, 1, 11) +#endif + +#ifdef HAS_SCALEARGBROWDOWN2_SSE2 +SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2, + ScaleARGBRowDown2_C, 2, 4, 3) +SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2, + ScaleARGBRowDown2Linear_C, 2, 4, 3) +SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2, + ScaleARGBRowDown2Box_C, 2, 4, 3) +#endif +#ifdef HAS_SCALEARGBROWDOWN2_NEON +SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON, + ScaleARGBRowDown2_C, 2, 4, 7) +SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON, + ScaleARGBRowDown2Linear_C, 2, 4, 7) +SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON, + ScaleARGBRowDown2Box_C, 2, 4, 7) +#endif +#undef SDANY + +// Scale down by even scale factor. +#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \ + uint8* dst_ptr, int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \ + src_stepx, dst_ptr + n * BPP, r); \ + } + +#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 +SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2, + ScaleARGBRowDownEven_C, 4, 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2, + ScaleARGBRowDownEvenBox_C, 4, 3) +#endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON +SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON, + ScaleARGBRowDownEven_C, 4, 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON, + ScaleARGBRowDownEvenBox_C, 4, 3) +#endif + +// Add rows box filter scale down. +#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ + void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \ + int n = src_width & ~MASK; \ + if (n > 0) { \ + SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ + } \ + SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ + } + +#ifdef HAS_SCALEADDROW_SSE2 +SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) +#endif +#ifdef HAS_SCALEADDROW_AVX2 +SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) +#endif +#ifdef HAS_SCALEADDROW_NEON +SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) +#endif +#undef SAANY + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + + + + + diff --git a/libs/libaom/src/third_party/libyuv/source/scale_argb.cc b/libs/libaom/src/third_party/libyuv/source/scale_argb.cc new file mode 100644 index 000000000..40a2d1ab2 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/scale_argb.cc @@ -0,0 +1,853 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+// ScaleARGB ARGB, 1/2
+// This is an optimized version for scaling down an ARGB image to 1/2 of
+// its original size.
+static void ScaleARGBDown2(int src_width, int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint8* src_argb, uint8* dst_argb,
+                           int x, int dx, int y, int dy,
+                           enum FilterMode filtering) {
+  int j;
+  int row_stride = src_stride * (dy >> 16);
+  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+      uint8* dst_argb, int dst_width) =
+      filtering == kFilterNone ? ScaleARGBRowDown2_C :
+      (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+      ScaleARGBRowDown2Box_C);
+  assert(dx == 65536 * 2);  // Test scale factor of 2.
+  assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
+  // Advance to odd row, even column.
+  if (filtering == kFilterBilinear) {
+    src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+  } else {
+    src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+  }
+
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
+        ScaleARGBRowDown2Box_Any_SSE2);
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+          ScaleARGBRowDown2Box_SSE2);
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
+        ScaleARGBRowDown2Box_Any_NEON);
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
+          ScaleARGBRowDown2Box_NEON);
+    }
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+}
+
+// ScaleARGB ARGB, 1/4
+// This is an optimized version for scaling down an ARGB image to 1/4 of
+// its original size.
+static void ScaleARGBDown4Box(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_argb, uint8* dst_argb,
+                              int x, int dx, int y, int dy) {
+  int j;
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
+  align_buffer_64(row, kRowSize * 2);
+  int row_stride = src_stride * (dy >> 16);
+  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+      uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+  // Advance to odd row, even column.
+  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+  assert(dx == 65536 * 4);  // Test scale factor of 4.
+  assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+    }
+  }
+#endif
+
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
+    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
+                      row + kRowSize, dst_width * 2);
+    ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+  free_aligned_buffer_64(row);
+}
+
+// ScaleARGB ARGB Even
+// This is an optimized version for scaling down an ARGB image to an even
+// multiple of its original size.
+static void ScaleARGBDownEven(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_argb, uint8* dst_argb,
+                              int x, int dx, int y, int dy,
+                              enum FilterMode filtering) {
+  int j;
+  int col_step = dx >> 16;
+  int row_stride = (dy >> 16) * src_stride;
+  void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
+      int src_step, uint8* dst_argb, int dst_width) =
+      filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+  assert(IS_ALIGNED(src_width, 2));
+  assert(IS_ALIGNED(src_height, 2));
+  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
+        ScaleARGBRowDownEven_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
+          ScaleARGBRowDownEven_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
+        ScaleARGBRowDownEven_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
+          ScaleARGBRowDownEven_NEON;
+    }
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+}
+
+// Scale ARGB down with bilinear interpolation.
+static void ScaleARGBBilinearDown(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  int src_stride, int dst_stride,
+                                  const uint8* src_argb, uint8* dst_argb,
+                                  int x, int dx, int y, int dy,
+                                  enum FilterMode filtering) {
+  int j;
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
+  int64 xlast = x + (int64)(dst_width - 1) * dx;
+  int64 xl = (dx >= 0) ? x : xlast;
+  int64 xr = (dx >= 0) ? xlast : x;
+  int clip_src_width;
+  xl = (xl >> 16) & ~3;  // Left edge aligned.
+  xr = (xr >> 16) + 1;  // Right most pixel used.  Bilinear uses 2 pixels.
+ xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. + if (xr > src_width) { + xr = src_width; + } + clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. + src_argb += xl * 4; + x -= (int)(xl << 16); +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(clip_src_width, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row of ARGB. + { + align_buffer_64(row, clip_src_width * 4); + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8* src = src_argb + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, clip_src_width, yf); + ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx); + } + dst_argb += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); + } +} + +// Scale ARGB up with bilinear interpolation. +static void ScaleARGBBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + filtering ? 
ScaleARGBFilterCols_C : ScaleARGBCols_C; + const int max_y = (src_height - 1) << 16; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } +#endif + if (src_width >= 32768) { + ScaleARGBFilterCols = filtering ? + ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + } +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBCols_NEON; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + + { + int yi = y >> 16; + const uint8* src = src_argb + yi * src_stride; + + // Allocate 2 rows of ARGB. + const int kRowSize = (dst_width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); + + uint8* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_argb + yi * src_stride; + } + if (yi != lasty) { + ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } + dst_argb += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +#ifdef YUVSCALEUP +// Scale YUV to ARGB up with bilinear interpolation. 
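+// Editorial note (not upstream libyuv text): like ScaleARGBBilinearUp
+// above, this routine keeps only two horizontally scaled rows alive and
+// flips between them by negating the row stride.  A minimal sketch of
+// that flip, using the same local names as the code below:
+//
+//   ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);  // refill stale row
+//   rowptr += rowstride;     // rowptr now holds the row for the new yi
+//   rowstride = -rowstride;  // so rowptr + rowstride is the row after it
+//
+// InterpolateRow() then blends rowptr with rowptr + rowstride, so the two
+// row buffers are never copied, only re-pointed.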
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int dst_stride_argb, + const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(src_width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } +#endif + + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; + if (src_width >= 32768) { + ScaleARGBFilterCols = filtering ? 
+        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+  }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  const int max_y = (src_height - 1) << 16;
+  if (y > max_y) {
+    y = max_y;
+  }
+  const int kYShift = 1;  // Shift Y by 1 to convert Y plane to UV coordinate.
+  int yi = y >> 16;
+  int uv_yi = yi >> kYShift;
+  const uint8* src_row_y = src_y + yi * src_stride_y;
+  const uint8* src_row_u = src_u + uv_yi * src_stride_u;
+  const uint8* src_row_v = src_v + uv_yi * src_stride_v;
+
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (dst_width * 4 + 31) & ~31;
+  align_buffer_64(row, kRowSize * 2);
+
+  // Allocate 1 row of ARGB for source conversion.
+  align_buffer_64(argb_row, src_width * 4);
+
+  uint8* rowptr = row;
+  int rowstride = kRowSize;
+  int lasty = yi;
+
+  // TODO(fbarchard): Convert first 2 rows of YUV to ARGB.
+  ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
+  if (src_height > 1) {
+    src_row_y += src_stride_y;
+    if (yi & 1) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+  ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
+  if (src_height > 2) {
+    src_row_y += src_stride_y;
+    if (!(yi & 1)) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    yi = y >> 16;
+    if (yi != lasty) {
+      if (y > max_y) {
+        y = max_y;
+        yi = y >> 16;
+        uv_yi = yi >> kYShift;
+        src_row_y = src_y + yi * src_stride_y;
+        src_row_u = src_u + uv_yi * src_stride_u;
+        src_row_v = src_v + uv_yi * src_stride_v;
+      }
+      if (yi != lasty) {
+        // TODO(fbarchard): Convert the clipped region of row.
+        I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+        ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+        rowptr += rowstride;
+        rowstride = -rowstride;
+        lasty = yi;
+        src_row_y += src_stride_y;
+        if (yi & 1) {
+          src_row_u += src_stride_u;
+          src_row_v += src_stride_v;
+        }
+      }
+    }
+    if (filtering == kFilterLinear) {
+      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+    } else {
+      int yf = (y >> 8) & 255;
+      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+    }
+    dst_argb += dst_stride_argb;
+    y += dy;
+  }
+  free_aligned_buffer_64(row);
+  free_aligned_buffer_64(argb_row);  // Matches align_buffer_64(argb_row) above.
+}
+#endif
+
+// Scale ARGB to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
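+//
+// Editorial sketch (not upstream code): a hypothetical single-channel
+// point sampler shows the stepping pattern used by the helpers below:
+//
+//   void PointSampleRow(const uint8* src, uint8* dst, int dst_width,
+//                       int x, int dx) {
+//     int j;
+//     for (j = 0; j < dst_width; ++j) {
+//       dst[j] = src[x >> 16];  // integer part selects the source pixel
+//       x += dx;                // fraction accumulates in the low 16 bits
+//     }
+//   }
+//
+// e.g. dx = 0x20000 (2.0) halves a row, visiting every other source pixel.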
+
+static void ScaleARGBSimple(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_argb, uint8* dst_argb,
+                            int x, int dx, int y, int dy) {
+  int j;
+  void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
+                  dst_width, x, dx);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+
+// Scale an ARGB image.
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
+static void ScaleARGB(const uint8* src, int src_stride,
+                      int src_width, int src_height,
+                      uint8* dst, int dst_stride,
+                      int dst_width, int dst_height,
+                      int clip_x, int clip_y, int clip_width, int clip_height,
+                      enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // ARGB does not support box filter yet, but allow the user to pass it.
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height,
+                                filtering);
+
+  // Negative src_height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src = src + (src_height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+  if (clip_x) {
+    int64 clipf = (int64)(clip_x) * dx;
+    x += (clipf & 0xffff);
+    src += (clipf >> 16) * 4;
+    dst += clip_x * 4;
+  }
+  if (clip_y) {
+    int64 clipf = (int64)(clip_y) * dy;
+    y += (clipf & 0xffff);
+    src += (clipf >> 16) * src_stride;
+    dst += clip_y * dst_stride;
+  }
+
+  // Special case for integer step values.
+  if (((dx | dy) & 0xffff) == 0) {
+    if (!dx || !dy) {  // 1 pixel wide and/or tall.
+      filtering = kFilterNone;
+    } else {
+      // Optimized even scale down. i.e. 2, 4, 6, 8, 10x.
+      if (!(dx & 0x10000) && !(dy & 0x10000)) {
+        if (dx == 0x20000) {
+          // Optimized 1/2 downsample.
+          ScaleARGBDown2(src_width, src_height,
+                         clip_width, clip_height,
+                         src_stride, dst_stride, src, dst,
+                         x, dx, y, dy, filtering);
+          return;
+        }
+        if (dx == 0x40000 && filtering == kFilterBox) {
+          // Optimized 1/4 box downsample.
+          ScaleARGBDown4Box(src_width, src_height,
+                            clip_width, clip_height,
+                            src_stride, dst_stride, src, dst,
+                            x, dx, y, dy);
+          return;
+        }
+        ScaleARGBDownEven(src_width, src_height,
+                          clip_width, clip_height,
+                          src_stride, dst_stride, src, dst,
+                          x, dx, y, dy, filtering);
+        return;
+      }
+      // Optimized odd scale down. i.e. 3, 5, 7, 9x.
+      if ((dx & 0x10000) && (dy & 0x10000)) {
+        filtering = kFilterNone;
+        if (dx == 0x10000 && dy == 0x10000) {
+          // Straight copy.
+          ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
+                   dst, dst_stride, clip_width, clip_height);
+          return;
+        }
+      }
+    }
+  }
+  if (dx == 0x10000 && (x & 0xffff) == 0) {
+    // Arbitrary scale vertically, but unscaled horizontally.
+    ScalePlaneVertical(src_height,
+                       clip_width, clip_height,
+                       src_stride, dst_stride, src, dst,
+                       x, y, dy, 4, filtering);
+    return;
+  }
+  if (filtering && dy < 65536) {
+    ScaleARGBBilinearUp(src_width, src_height,
+                        clip_width, clip_height,
+                        src_stride, dst_stride, src, dst,
+                        x, dx, y, dy, filtering);
+    return;
+  }
+  if (filtering) {
+    ScaleARGBBilinearDown(src_width, src_height,
+                          clip_width, clip_height,
+                          src_stride, dst_stride, src, dst,
+                          x, dx, y, dy, filtering);
+    return;
+  }
+  ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
+                  src_stride, dst_stride, src, dst,
+                  x, dx, y, dy);
+}
+
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+                  int src_width, int src_height,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int dst_width, int dst_height,
+                  int clip_x, int clip_y, int clip_width, int clip_height,
+                  enum FilterMode filtering) {
+  if (!src_argb || src_width == 0 || src_height == 0 ||
+      !dst_argb || dst_width <= 0 || dst_height <= 0 ||
+      clip_x < 0 || clip_y < 0 ||
+      clip_width > 32768 || clip_height > 32768 ||
+      (clip_x + clip_width) > dst_width ||
+      (clip_y + clip_height) > dst_height) {
+    return -1;
+  }
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+            dst_argb, dst_stride_argb, dst_width, dst_height,
+            clip_x, clip_y, clip_width, clip_height, filtering);
+  return 0;
+}
+
+// Scale an ARGB image.
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+              int src_width, int src_height,
+              uint8* dst_argb, int dst_stride_argb,
+              int dst_width, int dst_height,
+              enum FilterMode filtering) {
+  if (!src_argb || src_width == 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 ||
+      !dst_argb || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+            dst_argb, dst_stride_argb, dst_width, dst_height,
+            0, 0, dst_width, dst_height, filtering);
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/scale_common.cc b/libs/libaom/src/third_party/libyuv/source/scale_common.cc
new file mode 100644
index 000000000..1711f3d54
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/scale_common.cc
@@ -0,0 +1,1137 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+  return v >= 0 ?
v : -v; +} + +// CPU agnostic row functions +void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[1]; + dst[1] = src_ptr[3]; + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = src_ptr[1]; + } +} + +void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[1]; + dst[1] = src_ptr[3]; + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = src_ptr[1]; + } +} + +void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* s = src_ptr; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + 1) >> 1; + dst[1] = (s[2] + s[3] + 1) >> 1; + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + 1) >> 1; + } +} + +void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + const uint16* s = src_ptr; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + 1) >> 1; + dst[1] = (s[2] + s[3] + 1) >> 1; + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + 1) >> 1; + } +} + +void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + const uint16* s = src_ptr; + const uint16* t = src_ptr + src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + intptr_t stride = src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + + src_ptr[stride * 
2 + 6] + src_ptr[stride * 2 + 7] + + src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + + src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + + 8) >> 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + } +} + +void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + intptr_t stride = src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + + src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + + src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + + 8) >> 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + } +} + +void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +// Filter rows 0 and 1 together, 3 : 1 +void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* d, int dst_width) { + const uint16* s = src_ptr; + const uint16* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && 
(dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +// Filter rows 1 and 2 together, 1 : 1 +void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* d, int dst_width) { + const uint16* s = src_ptr; + const uint16* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. 
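+// Editorial example: no blending is needed for an exact 2x widen, so a
+// source row {10, 20, 30} simply becomes {10, 10, 20, 20, 30, 30}; the x
+// and dx arguments are unused.  For even dst_width the loop below is
+// essentially equivalent to this hypothetical scalar form:
+//
+//   for (j = 0; j < dst_width / 2; ++j) {
+//     dst_ptr[2 * j] = dst_ptr[2 * j + 1] = src_ptr[j];
+//   }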
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +// (1-f)a + fb can be replaced with a + f(b-a) +#define BLENDER(a, b, f) (uint8)((int)(a) + \ + ((int)(f) * ((int)(b) - (int)(a)) >> 16)) + +void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x32, int dx) { + int64 x = (int64)(x32); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64 xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int64 xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +#define BLENDER(a, b, f) (uint16)((int)(a) + \ + ((int)(f) * ((int)(b) - (int)(a)) >> 16)) + +void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x32, int dx) { + int64 x = (int64)(x32); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64 xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int64 xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + int x; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + int x; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) 
{ + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +void ScaleRowDown38_3_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 8x2 -> 3x1 +void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2]) * (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5]) * (65536 / 6) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2]) * (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5]) * (65536 / 6) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) { + int x; + assert(src_width > 0); + for (x = 0; x < src_width - 1; x += 2) { + dst_ptr[0] += src_ptr[0]; + dst_ptr[1] += src_ptr[1]; + src_ptr += 2; + dst_ptr += 2; + } + 
if (src_width & 1) { + dst_ptr[0] += src_ptr[0]; + } +} + +void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) { + int x; + assert(src_width > 0); + for (x = 0; x < src_width - 1; x += 2) { + dst_ptr[0] += src_ptr[0]; + dst_ptr[1] += src_ptr[1]; + src_ptr += 2; + dst_ptr += 2; + } + if (src_width & 1) { + dst_ptr[0] += src_ptr[0]; + } +} + +void ScaleARGBRowDown2_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[1]; + dst[1] = src[3]; + src += 4; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[1]; + } +} + +void ScaleARGBRowDown2Linear_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; + dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; + dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1; + dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + + src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + + src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + + src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + + src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src += src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + + src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + + src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + + src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + + src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + src_argb += src_stepx * 4; + dst_argb += 4; + } +} + +// Scales a single row of pixels using point sampling. 
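+// Editorial note: an ARGB pixel is 4 bytes, so the ARGB samplers below
+// reinterpret the byte pointers as uint32 and move one whole pixel per
+// assignment, e.g. (hypothetical fragment):
+//
+//   const uint32* src = (const uint32*)(src_argb);
+//   uint32* dst = (uint32*)(dst_argb);
+//   dst[0] = src[x >> 16];  // copies B, G, R and A in a single store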
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x32, int dx) { + int64 x = (int64)(x32); + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[1] = dst[0] = src[0]; + src += 1; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +// Mimics SSSE3 blender +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 +#define BLENDERC(a, b, f, s) (uint32)( \ + BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) \ + BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \ + BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) + +void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} + +void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x32, int dx) { + int64 x = (int64)(x32); + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64 xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int64 xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} +#undef BLENDER1 +#undef BLENDERC +#undef BLENDER + +// Scale plane vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int y, int dy, + int bpp, enum FilterMode filtering) { + // TODO(fbarchard): Allow higher bpp. + int dst_width_bytes = dst_width * bpp; + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; + int j; + assert(bpp >= 1 && bpp <= 4); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + src_argb += (x >> 16) * bpp; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(dst_width_bytes, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif + for (j = 0; j < dst_height; ++j) { + int yi; + int yf; + if (y > max_y) { + y = max_y; + } + yi = y >> 16; + yf = filtering ? ((y >> 8) & 255) : 0; + InterpolateRow(dst_argb, src_argb + yi * src_stride, + src_stride, dst_width_bytes, yf); + dst_argb += dst_stride; + y += dy; + } +} +void ScalePlaneVertical_16(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_argb, uint16* dst_argb, + int x, int y, int dy, + int wpp, enum FilterMode filtering) { + // TODO(fbarchard): Allow higher wpp. + int dst_width_words = dst_width * wpp; + void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_16_C; + const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0;
+  int j;
+  assert(wpp >= 1 && wpp <= 2);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  src_argb += (x >> 16) * wpp;
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(dst_width_words, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
+    if (IS_ALIGNED(dst_width_words, 4)) {
+      InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
+    }
+  }
+#endif
+  for (j = 0; j < dst_height; ++j) {
+    int yi;
+    int yf;
+    if (y > max_y) {
+      y = max_y;
+    }
+    yi = y >> 16;
+    yf = filtering ? ((y >> 8) & 255) : 0;
+    InterpolateRow(dst_argb, src_argb + yi * src_stride,
+                   src_stride, dst_width_words, yf);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  enum FilterMode filtering) {
+  if (src_width < 0) {
+    src_width = -src_width;
+  }
+  if (src_height < 0) {
+    src_height = -src_height;
+  }
+  if (filtering == kFilterBox) {
+    // If scaling both axes to 0.5 or larger, switch from Box to Bilinear.
+    if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
+      filtering = kFilterBilinear;
+    }
+  }
+  if (filtering == kFilterBilinear) {
+    if (src_height == 1) {
+      filtering = kFilterLinear;
+    }
+    // TODO(fbarchard): Detect any odd scale factor and reduce to Linear.
+    if (dst_height == src_height || dst_height * 3 == src_height) {
+      filtering = kFilterLinear;
+    }
+    // TODO(fbarchard): Remove the 1 pixel wide restriction, which avoids
+    // reading 2 pixels horizontally and faulting past the end of the row.
+    if (src_width == 1) {
+      filtering = kFilterNone;
+    }
+  }
+  if (filtering == kFilterLinear) {
+    if (src_width == 1) {
+      filtering = kFilterNone;
+    }
+    // TODO(fbarchard): Detect any odd scale factor and reduce to None.
+    if (dst_width == src_width || dst_width * 3 == src_width) {
+      filtering = kFilterNone;
+    }
+  }
+  return filtering;
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div) {
+  return (int)(((int64)(num) << 16) / div);
+}
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div) {
+  return (int)((((int64)(num) << 16) - 0x00010001) /
+               (div - 1));
+}
+
+#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
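+
+// --- Illustrative sketch (not part of upstream libyuv): how the 16.16
+// fixed-point steps produced by FixedDiv_C/FixedDiv1_C drive the scaling
+// loops above. The high 16 bits of x are the integer source index and the
+// low 16 bits the sub-pixel fraction, so src[x >> 16] selects a pixel and
+// x += dx advances by the exact scale ratio. For example,
+// FixedDiv_C(6, 3) == 0x20000 (2.0 in 16.16); stepping three times from
+// x = 0 reads source indices 0, 2 and 4. Guarded out of the build; uint8
+// and int64 come from libyuv/basic_types.h.
+#if 0
+static void FixedPointStepExample(uint8* dst, const uint8* src) {
+  int dx = FixedDiv_C(6, 3);  // 16.16 step: (6 << 16) / 3 == 0x20000
+  int x = 0;
+  int j;
+  for (j = 0; j < 3; ++j) {   // writes src[0], src[2], src[4]
+    dst[j] = src[x >> 16];
+    x += dx;
+  }
+}
+#endif
+
+// Compute slope values for stepping.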
+void ScaleSlope(int src_width, int src_height, + int dst_width, int dst_height, + enum FilterMode filtering, + int* x, int* y, int* dx, int* dy) { + assert(x != NULL); + assert(y != NULL); + assert(dx != NULL); + assert(dy != NULL); + assert(src_width != 0); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + // Check for 1 pixel and avoid FixedDiv overflow. + if (dst_width == 1 && src_width >= 32768) { + dst_width = src_width; + } + if (dst_height == 1 && src_height >= 32768) { + dst_height = src_height; + } + if (filtering == kFilterBox) { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = 0; + *y = 0; + } else if (filtering == kFilterBilinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + if (dst_height <= src_height) { + *dy = FixedDiv(src_height, dst_height); + *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_height > 1) { + *dy = FixedDiv1(src_height, dst_height); + *y = 0; + } + } else if (filtering == kFilterLinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + *dy = FixedDiv(src_height, dst_height); + *y = *dy >> 1; + } else { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = CENTERSTART(*dx, 0); + *y = CENTERSTART(*dy, 0); + } + // Negative src_width means horizontally mirror. + if (src_width < 0) { + *x += (dst_width - 1) * *dx; + *dx = -*dx; + // src_width = -src_width; // Caller must do this. + } +} +#undef CENTERSTART + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/scale_gcc.cc b/libs/libaom/src/third_party/libyuv/source/scale_gcc.cc new file mode 100644 index 000000000..8a6ac5459 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/scale_gcc.cc @@ -0,0 +1,1089 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +// Offsets for source bytes 0 to 9 +static uvec8 kShuf0 = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static uvec8 kShuf1 = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
+static uvec8 kShuf2 =
+  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
+static uvec8 kShuf11 =
+  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf21 =
+  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Rounding constant for the 3/4 box filters (round half up before >> 2).
+static vec16 kRound34 =
+  { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =
+  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =
+  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
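+
+// --- Illustrative sketch (not part of upstream libyuv): why kScaleAc33 and
+// kScaleAb2 store 65536 / n. pmulhuw keeps the high 16 bits of an unsigned
+// 16x16 multiply, i.e. (sum * k) >> 16, so multiplying a box sum by
+// k = 65536 / 9 == 7281 divides by ~9 with no integer divide. The worst
+// case 3x3 sum is 9 * 255 == 2295, and 2295 * 7281 >> 16 == 254 versus the
+// exact average of 255, so the result stays within one code value.
+// Guarded out of the build.
+#if 0
+static int BoxAverage9_Example(int sum) {  // sum of a 3x3 box, 0..2295
+  return (sum * (65536 / 9)) >> 16;        // scalar model of the pmulhuw step
+}
+#endif
+
+// GCC versions of row functions are verbatim conversions from Visual C.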
+// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt + +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1" + ); +} + +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1", "xmm5" + ); +} + +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1", "xmm5" + ); +} + +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stridex3 = 0; + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0x8,%%xmm7 \n" + "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" + + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + 
MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3 + MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4 + MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm7,%%xmm2 \n" + "pand %%xmm7,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "pavgw %%xmm2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(stridex3) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" + ); +} + +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "movq %%xmm1," MEMACCESS2(0x8,1) " \n" + "movq %%xmm2," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS(1) " \n" + "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x8,1) " \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" 
+ "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7 + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS(1) " \n" + "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7 + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x8,1) " \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1," MEMACCESS2(0x8,1) " \n" + "lea " MEMLEA(0xc,1) ",%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" + ); +} + +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm1) // 
movdqu (%0,%3,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1," MEMACCESS(1) " \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1," MEMACCESS2(0x2,1) " \n" + "lea " MEMLEA(0x6,1) ",%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6 + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movd %%xmm6," MEMACCESS(1) " \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6," MEMACCESS2(0x2,1) " \n" + "lea " MEMLEA(0x6,1) ",%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +// Reads 16xN bytes and produces 16 shorts at a time. 
+void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + int tmp_height = 0; + intptr_t tmp_src = 0; + asm volatile ( + "mov %0,%3 \n" // row pointer + "mov %5,%2 \n" // height + "pxor %%xmm0,%%xmm0 \n" // clear accumulators + "pxor %%xmm1,%%xmm1 \n" + "pxor %%xmm4,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(3) ",%%xmm2 \n" + "add %6,%3 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "sub $0x1,%2 \n" + "jg 1b \n" + + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16 + "mov %0,%3 \n" // row pointer + "mov %5,%2 \n" // height + "pxor %%xmm0,%%xmm0 \n" // clear accumulators + "pxor %%xmm1,%%xmm1 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_height), // %2 + "+r"(tmp_src), // %3 + "+r"(src_width), // %4 + "+rm"(src_height) // %5 + : "rm"((intptr_t)(src_stride)) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" + ); +} + +// Bilinear column filtering. SSSE3 version. +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0, temp_pixel = 0; + asm volatile ( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2 + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,%k2 \n" + "mov %w2," MEMACCESS(0) " \n" + "lea " MEMLEA(0x2,0) ",%0 \n" + "sub $0x2,%5 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" + MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,%k2 \n" + "mov %b2," MEMACCESS(0) " \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+a"(temp_pixel), // %2 + "+r"(x0), // %3 + "+r"(x1), // %4 + "+rm"(dst_width) // %5 + : "rm"(x), // %6 + "rm"(dx) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1" + ); +} + +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1" + ); +} + +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1" + ); +} + +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: dst_argb 16 byte aligned. 
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, uint8* dst_argb, int dst_width) { + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12 = 0; + asm volatile ( + "lea " MEMLEA3(0x00,1,4) ",%1 \n" + "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" + LABELALIGN + "1: \n" + "movd " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 + "punpckldq %%xmm1,%%xmm0 \n" + MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 + MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 + "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width), // %3 + "+r"(src_stepx_x12) // %4 + :: "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3" + ); +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_argb 16 byte aligned. +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, int src_stepx, + uint8* dst_argb, int dst_width) { + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12 = 0; + intptr_t row1 = (intptr_t)(src_stride); + asm volatile ( + "lea " MEMLEA3(0x00,1,4) ",%1 \n" + "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" + "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" + + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 + MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 + MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 + "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" + "movq " MEMACCESS(5) ",%%xmm2 \n" + MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 + MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 + MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 + "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+rm"(dst_width), // %3 + "+r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + :: "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3" + ); +} + +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0; + asm volatile ( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + LABELALIGN + "40: \n" + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw $0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1 + MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4 + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," 
MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" + + "49: \n" + "test $0x2,%4 \n" + "je 29f \n" + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x8,2) ",%2 \n" + "29: \n" + "test $0x1,%4 \n" + "je 99f \n" + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + "movd %%xmm0," MEMACCESS(2) " \n" + "99: \n" + : "+a"(x0), // %0 + "+d"(x1), // %1 + "+r"(dst_argb), // %2 + "+r"(src_argb), // %3 + "+r"(dst_width) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" + ); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", NACL_R14 + "xmm0", "xmm1" + ); +} + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +// Bilinear row filtering combines 4x2 -> 4x1. 
SSSE3 version +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0; + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" + : + : "m"(kShuffleColARGB), // %0 + "m"(kShuffleFractions) // %1 + ); + + asm volatile ( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 + "psrlw $0x9,%%xmm1 \n" + MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0 + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(0) " \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0," MEMACCESS(0) " \n" + + LABELALIGN + "99: \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+rm"(dst_width), // %2 + "+r"(x0), // %3 + "+r"(x1) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_X86(int num, int div) { + asm volatile ( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx" + ); + return num; +} + +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. +int FixedDiv1_X86(int num, int div) { + asm volatile ( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" + "sub $0x1,%1 \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx" + ); + return num; +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/scale_mips.cc b/libs/libaom/src/third_party/libyuv/source/scale_mips.cc new file mode 100644 index 000000000..3eb4f27c4 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/scale_mips.cc @@ -0,0 +1,654 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC MIPS DSPR2 +#if !defined(LIBYUV_DISABLE_MIPS) && \ + defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) + +void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 4 \n" // iterations -> by 16 + "beqz $t9, 2f \n" + " nop \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| + // TODO(fbarchard): Use odd pixels instead of even. + "precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0| + "precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8| + "precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16| + "precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu $t9, $t9, -1 \n" + "sw $t8, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $t1, 8(%[dst]) \n" + "sw $t2, 12(%[dst]) \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 16 \n" + + "2: \n" + "andi $t9, %[dst_width], 0xf \n" // residue + "beqz $t9, 3f \n" + " nop \n" + + "21: \n" + "lbu $t0, 0(%[src_ptr]) \n" + "addiu %[src_ptr], %[src_ptr], 2 \n" + "addiu $t9, $t9, -1 \n" + "sb $t0, 0(%[dst]) \n" + "bgtz $t9, 21b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst) + : [dst_width] "r" (dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} + +void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* t = src_ptr + src_stride; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 3 \n" // iterations -> step 8 + "bltz $t9, 2f \n" + " nop \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t4, 0(%[t]) \n" // |19|18|17|16| + "lw $t5, 4(%[t]) \n" // |23|22|21|20| + "lw $t6, 8(%[t]) \n" // |27|26|25|24| + "lw $t7, 12(%[t]) \n" // |31|30|29|28| + "addiu $t9, $t9, -1 \n" + "srl $t8, $t0, 16 \n" // |X|X|3|2| + "ins $t0, $t4, 16, 16 \n" // |17|16|1|0| + "ins $t4, $t8, 0, 16 \n" // |19|18|3|2| + "raddu.w.qb $t0, $t0 \n" // |17+16+1+0| + "raddu.w.qb $t4, $t4 \n" // |19+18+3+2| + "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2 + "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2 + "srl $t8, $t1, 16 \n" // |X|X|7|6| + "ins $t1, $t5, 16, 16 \n" // |21|20|5|4| + "ins $t5, $t8, 0, 16 \n" // |22|23|7|6| + "raddu.w.qb $t1, $t1 \n" // |21+20+5+4| + "raddu.w.qb $t5, $t5 \n" // |23+22+7+6| + "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2 + "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2 + "srl $t8, $t2, 16 \n" // |X|X|11|10| + "ins $t2, $t6, 16, 16 \n" // |25|24|9|8| + "ins $t6, $t8, 0, 16 \n" // |27|26|11|10| + "raddu.w.qb $t2, $t2 \n" // |25+24+9+8| + "raddu.w.qb $t6, $t6 \n" // |27+26+11+10| + "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2 + "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2 + "srl $t8, $t3, 16 \n" // |X|X|15|14| + "ins $t3, $t7, 16, 16 \n" // |29|28|13|12| + "ins $t7, $t8, 0, 16 \n" 
// |31|30|15|14| + "raddu.w.qb $t3, $t3 \n" // |29+28+13+12| + "raddu.w.qb $t7, $t7 \n" // |31+30+15+14| + "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2 + "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2 + "addiu %[src_ptr], %[src_ptr], 16 \n" + "addiu %[t], %[t], 16 \n" + "sb $t0, 0(%[dst]) \n" + "sb $t4, 1(%[dst]) \n" + "sb $t1, 2(%[dst]) \n" + "sb $t5, 3(%[dst]) \n" + "sb $t2, 4(%[dst]) \n" + "sb $t6, 5(%[dst]) \n" + "sb $t3, 6(%[dst]) \n" + "sb $t7, 7(%[dst]) \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 8 \n" + + "2: \n" + "andi $t9, %[dst_width], 0x7 \n" // x = residue + "beqz $t9, 3f \n" + " nop \n" + + "21: \n" + "lwr $t1, 0(%[src_ptr]) \n" + "lwl $t1, 3(%[src_ptr]) \n" + "lwr $t2, 0(%[t]) \n" + "lwl $t2, 3(%[t]) \n" + "srl $t8, $t1, 16 \n" + "ins $t1, $t2, 16, 16 \n" + "ins $t2, $t8, 0, 16 \n" + "raddu.w.qb $t1, $t1 \n" + "raddu.w.qb $t2, $t2 \n" + "shra_r.w $t1, $t1, 2 \n" + "shra_r.w $t2, $t2, 2 \n" + "sb $t1, 0(%[dst]) \n" + "sb $t2, 1(%[dst]) \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "addiu $t9, $t9, -2 \n" + "addiu %[t], %[t], 4 \n" + "bgtz $t9, 21b \n" + " addiu %[dst], %[dst], 2 \n" + + "3: \n" + ".set pop \n" + + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), [t] "+r" (t) + : [dst_width] "r" (dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} + +void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 3 \n" + "beqz $t9, 2f \n" + " nop \n" + + ".p2align 2 \n" + "1: \n" + "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| + "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0| + "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8| + "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16| + "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24| + "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0| + "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu $t9, $t9, -1 \n" + "sw $t1, 0(%[dst]) \n" + "sw $t5, 4(%[dst]) \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 8 \n" + + "2: \n" + "andi $t9, %[dst_width], 7 \n" // residue + "beqz $t9, 3f \n" + " nop \n" + + "21: \n" + "lbu $t1, 0(%[src_ptr]) \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "addiu $t9, $t9, -1 \n" + "sb $t1, 0(%[dst]) \n" + "bgtz $t9, 21b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst) + : [dst_width] "r" (dst_width) + : "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} + +void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + intptr_t stride = src_stride; + const uint8* s1 = src_ptr + stride; + const uint8* s2 = s1 + stride; + const uint8* s3 = s2 + stride; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 1 \n" + "andi $t8, %[dst_width], 1 \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 0(%[s1]) \n" // |7|6|5|4| + "lw $t2, 0(%[s2]) \n" // |11|10|9|8| + "lw $t3, 0(%[s3]) \n" // |15|14|13|12| + "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 4(%[s1]) \n" // |23|22|21|20| + "lw $t6, 4(%[s2]) \n" // |27|26|25|24| + "lw $t7, 
4(%[s3]) \n" // |31|30|29|28| + "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| + "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| + "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| + "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| + "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16| + "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20| + "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24| + "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28| + "add $t0, $t0, $t1 \n" + "add $t1, $t2, $t3 \n" + "add $t0, $t0, $t1 \n" + "add $t4, $t4, $t5 \n" + "add $t6, $t6, $t7 \n" + "add $t4, $t4, $t6 \n" + "shra_r.w $t0, $t0, 4 \n" + "shra_r.w $t4, $t4, 4 \n" + "sb $t0, 0(%[dst]) \n" + "sb $t4, 1(%[dst]) \n" + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[s1], %[s1], 8 \n" + "addiu %[s2], %[s2], 8 \n" + "addiu %[s3], %[s3], 8 \n" + "addiu $t9, $t9, -1 \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 2 \n" + "beqz $t8, 2f \n" + " nop \n" + + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 0(%[s1]) \n" // |7|6|5|4| + "lw $t2, 0(%[s2]) \n" // |11|10|9|8| + "lw $t3, 0(%[s3]) \n" // |15|14|13|12| + "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| + "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| + "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| + "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| + "add $t0, $t0, $t1 \n" + "add $t1, $t2, $t3 \n" + "add $t0, $t0, $t1 \n" + "shra_r.w $t0, $t0, 4 \n" + "sb $t0, 0(%[dst]) \n" + + "2: \n" + ".set pop \n" + + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [s1] "+r" (s1), + [s2] "+r" (s2), + [s3] "+r" (s3) + : [dst_width] "r" (dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6","t7", "t8", "t9" + ); +} + +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + ".p2align 2 \n" + "1: \n" + "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| + "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13| + "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30| + "addiu %[dst_width], %[dst_width], -24 \n" + "ins $t1, $t1, 8, 16 \n" // |3|1|0|X| + "ins $t4, $t0, 8, 16 \n" // |X|15|13|12| + "ins $t5, $t5, 8, 16 \n" // |19|17|16|X| + "ins $t8, $t9, 8, 16 \n" // |X|31|29|28| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5| + "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21| + "prepend $t1, $t2, 8 \n" // |4|3|1|0| + "prepend $t3, $t4, 24 \n" // |15|13|12|11| + "prepend $t5, $t6, 8 \n" // |20|19|17|16| + "prepend $t7, $t8, 24 \n" // |31|29|28|27| + "sw $t1, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $t3, 8(%[dst]) \n" + "sw $t5, 12(%[dst]) \n" + "sw $t9, 16(%[dst]) \n" + "sw $t7, 20(%[dst]) \n" + "bnez %[dst_width], 1b \n" + " addiu %[dst], %[dst], 24 \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6","t7", "t8", "t9" + ); +} + +void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "repl.ph $t3, 3 \n" // 0x00030003 + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| + 
"rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1| + "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| + "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3| + "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3| + "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1| + "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t1, $t1 \n" + "shra_r.w $t0, $t0, 1 \n" + "shra_r.w $t1, $t1, 1 \n" + "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1| + "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| + "rotr $t2, $t2, 16 \n" // |0|S1|0|S2| + "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| + "addu.ph $t2, $t2, $t4 \n" + "addu.ph $t6, $t6, $t5 \n" + "sll $t5, $t0, 1 \n" + "add $t0, $t5, $t0 \n" + "shra_r.ph $t2, $t2, 2 \n" + "shra_r.ph $t6, $t6, 2 \n" + "shll.ph $t4, $t2, 1 \n" + "addq.ph $t4, $t4, $t2 \n" + "addu $t0, $t0, $t1 \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "shra_r.w $t0, $t0, 2 \n" + "addu.ph $t6, $t6, $t4 \n" + "shra_r.ph $t6, $t6, 2 \n" + "srl $t1, $t6, 16 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "sb $t1, 0(%[d]) \n" + "sb $t0, 1(%[d]) \n" + "sb $t6, 2(%[d]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[d], %[d], 3 \n" + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [src_stride] "+r" (src_stride), + [d] "+r" (d), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6" + ); +} + +void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "repl.ph $t2, 3 \n" // 0x00030003 + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| + "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1| + "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| + "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3| + "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3| + "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1| + "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t1, $t1 \n" + "shra_r.w $t0, $t0, 1 \n" + "shra_r.w $t1, $t1, 1 \n" + "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1| + "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| + "rotr $t4, $t4, 16 \n" // |0|S1|0|S2| + "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| + "addu.ph $t4, $t4, $t3 \n" + "addu.ph $t6, $t6, $t5 \n" + "shra_r.ph $t6, $t6, 2 \n" + "shra_r.ph $t4, $t4, 2 \n" + "addu.ph $t6, $t6, $t4 \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "shra_r.ph $t6, $t6, 1 \n" + "addu $t0, $t0, $t1 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "shra_r.w $t0, $t0, 1 \n" + "srl $t1, $t6, 16 \n" + "sb $t1, 0(%[d]) \n" + "sb $t0, 1(%[d]) \n" + "sb $t6, 2(%[d]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[d], %[d], 3 \n" + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [src_stride] "+r" (src_stride), + [d] "+r" (d), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6" + ); +} + +void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| + "wsbh $t0, $t0 \n" // |2|3|0|1| + "wsbh $t6, $t6 \n" // |26|27|24|25| + "srl $t0, $t0, 8 \n" // |X|2|3|0| + "srl $t3, 
$t3, 16 \n" // |X|X|15|14| + "srl $t5, $t5, 16 \n" // |X|X|23|22| + "srl $t7, $t7, 16 \n" // |X|X|31|30| + "ins $t1, $t2, 24, 8 \n" // |8|6|5|4| + "ins $t6, $t5, 0, 8 \n" // |26|27|24|22| + "ins $t1, $t0, 0, 16 \n" // |8|6|3|0| + "ins $t6, $t7, 24, 8 \n" // |30|27|24|22| + "prepend $t2, $t3, 24 \n" // |X|15|14|11| + "ins $t4, $t4, 16, 8 \n" // |19|16|17|X| + "ins $t4, $t2, 0, 16 \n" // |19|16|14|11| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu %[dst_width], %[dst_width], -12 \n" + "addiu $t8,%[dst_width], -12 \n" + "sw $t1, 0(%[dst]) \n" + "sw $t4, 4(%[dst]) \n" + "sw $t6, 8(%[dst]) \n" + "bgez $t8, 1b \n" + " addiu %[dst], %[dst], 12 \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", "t4", + "t5", "t6", "t7", "t8" + ); +} + +void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + const uint8* t = src_ptr + stride; + const int c = 0x2AAA; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| + "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0| + "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4| + "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| + "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6| + "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4| + "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6 + "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4 + "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1| + "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3| + "srl $t4, $t4, 2 \n" // t4 / 4 + "srl $t6, $t6, 16 \n" // |0|0|S3|T3| + "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3 + "addu $t6, $t5, $t6 \n" + "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA + "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| + "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| + "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0 + "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0 + "addu $t0, $t0, $t2 \n" + "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[t], %[t], 8 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "addiu %[dst_ptr], %[dst_ptr], 3 \n" + "srl $t6, $t6, 16 \n" + "srl $t0, $t0, 16 \n" + "sb $t4, -1(%[dst_ptr]) \n" + "sb $t6, -2(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " sb $t0, -3(%[dst_ptr]) \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst_ptr] "+r" (dst_ptr), + [t] "+r" (t), + [dst_width] "+r" (dst_width) + : [c] "r" (c) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6" + ); +} + +void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + const uint8* s1 = src_ptr + stride; + stride += stride; + const uint8* s2 = src_ptr + stride; + const int c1 = 0x1C71; + const int c2 = 0x2AAA; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| + "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0| + "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4| + "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0| + "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4| + "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| + "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6| + "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6 + "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4| + "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4 + "sll $t8, $t5, 16 \n" // |R5|R4|0|0| + "raddu.w.qb $t8, $t8 \n" // R5+R4 + "addu $t7, $t7, $t8 \n" + "srl $t8, $t5, 16 \n" // |0|0|R7|R6| + 
"raddu.w.qb $t8, $t8 \n" // R7 + R6 + "addu $t6, $t6, $t8 \n" + "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA + "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1| + "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1| + "srl $t8, $t8, 8 \n" // |0|S3|T3|R3| + "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3 + "addu $t7, $t7, $t8 \n" + "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71 + "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| + "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| + "sll $t4, $t4, 8 \n" // |R2|R1|R0|0| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t2, $t2 \n" + "raddu.w.qb $t4, $t4 \n" + "addu $t0, $t0, $t2 \n" + "addu $t0, $t0, $t4 \n" + "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71 + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[s1], %[s1], 8 \n" + "addiu %[s2], %[s2], 8 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "addiu %[dst_ptr], %[dst_ptr], 3 \n" + "srl $t6, $t6, 16 \n" + "srl $t7, $t7, 16 \n" + "srl $t0, $t0, 16 \n" + "sb $t6, -1(%[dst_ptr]) \n" + "sb $t7, -2(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " sb $t0, -3(%[dst_ptr]) \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst_ptr] "+r" (dst_ptr), + [s1] "+r" (s1), + [s2] "+r" (s2), + [dst_width] "+r" (dst_width) + : [c1] "r" (c1), [c2] "r" (c2) + : "t0", "t1", "t2", "t3", "t4", + "t5", "t6", "t7", "t8" + ); +} + +#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/libs/libaom/src/third_party/libyuv/source/scale_neon.cc b/libs/libaom/src/third_party/libyuv/source/scale_neon.cc new file mode 100644 index 000000000..7825878e9 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/scale_neon.cc @@ -0,0 +1,1037 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +// NEON downscalers with interpolation. +// Provided by Fritz Koenig + +// Read 32x1 throw away even pixels, and write 16x1. +void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + // load even pixels into q0, odd into q1 + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +// Read 32x1 average down and write 16x1. +void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc + "subs %2, %2, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // add adjacent + "vpaddl.u8 q1, q1 \n" + "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #1 \n" + MEMACCESS(1) + "vst1.8 {q0}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +// Read 32x2 average down and write 16x1. +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + MEMACCESS(1) + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc" + ); +} + +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride; + const uint8* src_ptr2 = src_ptr + src_stride * 2; + const uint8* src_ptr3 = src_ptr + src_stride * 3; +asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + MEMACCESS(3) + "vld1.8 {q1}, [%3]! \n" + MEMACCESS(4) + "vld1.8 {q2}, [%4]! \n" + MEMACCESS(5) + "vld1.8 {q3}, [%5]! \n" + "subs %2, %2, #4 \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + MEMACCESS(1) + "vst1.32 {d0[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_ptr1), // %3 + "+r"(src_ptr2), // %4 + "+r"(src_ptr3) // %5 + : + : "q0", "q1", "q2", "q3", "memory", "cc" + ); +} + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + MEMACCESS(1) + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc" + ); +} + +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! 
\n" // src line 1 + "subs %2, %2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + MEMACCESS(1) + "vst3.8 {d0, d1, d2}, [%1]! \n" + + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" + ); +} + +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + MEMACCESS(1) + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" + ); +} + +#define HAS_SCALEROWDOWN38_NEON +static uvec8 kShuf38 = + { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; +static uvec8 kShuf38_2 = + { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; +static vec16 kMult38_Div6 = + { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; +static vec16 kMult38_Div9 = + { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {q3}, [%3] \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + MEMACCESS(1) + "vst1.8 {d4}, [%1]! \n" + MEMACCESS(1) + "vst1.32 {d5[0]}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" + ); +} + +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride * 2; + + asm volatile ( + MEMACCESS(5) + "vld1.16 {q13}, [%5] \n" + MEMACCESS(6) + "vld1.8 {q14}, [%6] \n" + MEMACCESS(7) + "vld1.8 {q15}, [%7] \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + MEMACCESS(4) + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q2, q13 \n" + "vmovn.u16 d4, q2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q15 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + MEMACCESS(1) + "vst1.8 {d3}, [%1]! \n" + MEMACCESS(1) + "vst1.32 {d4[0]}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride), // %3 + "+r"(src_ptr1) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" + ); +} + +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + MEMACCESS(4) + "vld1.16 {q13}, [%4] \n" + MEMACCESS(5) + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q13 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + MEMACCESS(1) + "vst1.8 {d3}, [%1]! \n" + MEMACCESS(1) + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" + ); +} + +void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + const uint8* src_tmp = NULL; + asm volatile ( + ".p2align 2 \n" + "1: \n" + "mov %0, %1 \n" + "mov r12, %5 \n" + "veor q2, q2, q2 \n" + "veor q3, q3, q3 \n" + "2: \n" + // load 16 pixels into q0 + MEMACCESS(0) + "vld1.8 {q0}, [%0], %3 \n" + "vaddw.u8 q3, q3, d1 \n" + "vaddw.u8 q2, q2, d0 \n" + "subs r12, r12, #1 \n" + "bgt 2b \n" + MEMACCESS(2) + "vst1.16 {q2, q3}, [%2]! 
\n" // store pixels + "add %1, %1, #16 \n" + "subs %4, %4, #16 \n" // 16 processed per loop + "bgt 1b \n" + : "+r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" + +void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8* src_tmp = src_ptr; + asm volatile ( + ".p2align 2 \n" + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q3, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "vadd.s32 q1, q1, q0 \n" + // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx + "vadd.s32 q2, q1, q3 \n" + "vshl.i32 q0, q3, #1 \n" // 8 * dx + "1: \n" + LOAD2_DATA8_LANE(0) + LOAD2_DATA8_LANE(1) + LOAD2_DATA8_LANE(2) + LOAD2_DATA8_LANE(3) + LOAD2_DATA8_LANE(4) + LOAD2_DATA8_LANE(5) + LOAD2_DATA8_LANE(6) + LOAD2_DATA8_LANE(7) + "vmov q10, q1 \n" + "vmov q11, q2 \n" + "vuzp.16 q10, q11 \n" + "vmovl.u8 q8, d6 \n" + "vmovl.u8 q9, d7 \n" + "vsubl.s16 q11, d18, d16 \n" + "vsubl.s16 q12, d19, d17 \n" + "vmovl.u16 q13, d20 \n" + "vmovl.u16 q10, d21 \n" + "vmul.s32 q11, q11, q13 \n" + "vmul.s32 q12, q12, q10 \n" + "vshrn.s32 d18, q11, #16 \n" + "vshrn.s32 d19, q12, #16 \n" + "vadd.s16 q8, q8, q9 \n" + "vmovn.s16 d6, q8 \n" + + MEMACCESS(0) + "vst1.8 {d6}, [%0]! \n" // store pixels + "vadd.s32 q1, q1, q0 \n" + "vadd.s32 q2, q2, q0 \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13" + ); +} + +#undef LOAD2_DATA8_LANE + +// 16x2 -> 16x1 +void ScaleFilterRows_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + asm volatile ( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q0}, [%2]! 
\n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + MEMACCESS(0) + "vst1.8 {d1[7]}, [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" + ); +} + +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + // load even pixels into q0, odd into q1 + MEMACCESS(0) + "vld2.32 {q0, q1}, [%0]! \n" + MEMACCESS(0) + "vld2.32 {q2, q3}, [%0]! \n" + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store odd pixels + MEMACCESS(1) + "vst1.8 {q3}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #1 \n" + "vrshrn.u16 d2, q2, #1 \n" + "vrshrn.u16 d3, q3, #1 \n" + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} + +// Reads 4 pixels at a time. 
+// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %3, lsl #2 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.32 {d0[0]}, [%0], r12 \n" + MEMACCESS(0) + "vld1.32 {d0[1]}, [%0], r12 \n" + MEMACCESS(0) + "vld1.32 {d1[0]}, [%0], r12 \n" + MEMACCESS(0) + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx) // %3 + : "memory", "cc", "r12", "q0" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 + MEMACCESS(1) + "vld1.8 {d1}, [%1], r12 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0], r12 \n" + MEMACCESS(1) + "vld1.8 {d3}, [%1], r12 \n" + MEMACCESS(0) + "vld1.8 {d4}, [%0], r12 \n" + MEMACCESS(1) + "vld1.8 {d5}, [%1], r12 \n" + MEMACCESS(0) + "vld1.8 {d6}, [%0], r12 \n" + MEMACCESS(1) + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx) // %4 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" + ); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD1_DATA32_LANE(dn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "vld1.32 {"#dn"["#n"]}, [%6] \n" + +void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + int tmp = 0; + const uint8* src_tmp = src_argb; + asm volatile ( + ".p2align 2 \n" + "1: \n" + LOAD1_DATA32_LANE(d0, 0) + LOAD1_DATA32_LANE(d0, 1) + LOAD1_DATA32_LANE(d1, 0) + LOAD1_DATA32_LANE(d1, 1) + LOAD1_DATA32_LANE(d2, 0) + LOAD1_DATA32_LANE(d2, 1) + LOAD1_DATA32_LANE(d3, 0) + LOAD1_DATA32_LANE(d3, 1) + + MEMACCESS(0) + "vst1.32 {q0, q1}, [%0]! 
\n" // store pixels + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1" + ); +} + +#undef LOAD1_DATA32_LANE + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA32_LANE(dn1, dn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n" + +void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8* src_tmp = src_argb; + asm volatile ( + ".p2align 2 \n" + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q9, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" + "vmov.i8 q3, #0x7f \n" // 0x7F + "vmov.i16 q15, #0x7f \n" // 0x7F + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "vadd.s32 q8, q1, q0 \n" + "1: \n" + // d0, d1: a + // d2, d3: b + LOAD2_DATA32_LANE(d0, d2, 0) + LOAD2_DATA32_LANE(d0, d2, 1) + LOAD2_DATA32_LANE(d1, d3, 0) + LOAD2_DATA32_LANE(d1, d3, 1) + "vshrn.i32 d22, q8, #9 \n" + "vand.16 d22, d22, d30 \n" + "vdup.8 d24, d22[0] \n" + "vdup.8 d25, d22[2] \n" + "vdup.8 d26, d22[4] \n" + "vdup.8 d27, d22[6] \n" + "vext.8 d4, d24, d25, #4 \n" + "vext.8 d5, d26, d27, #4 \n" // f + "veor.8 q10, q2, q3 \n" // 0x7f ^ f + "vmull.u8 q11, d0, d20 \n" + "vmull.u8 q12, d1, d21 \n" + "vmull.u8 q13, d2, d4 \n" + "vmull.u8 q14, d3, d5 \n" + "vadd.i16 q11, q11, q13 \n" + "vadd.i16 q12, q12, q14 \n" + "vshrn.i16 d0, q11, #7 \n" + "vshrn.i16 d1, q12, #7 \n" + + MEMACCESS(0) + "vst1.32 {d0, d1}, [%0]! \n" // store pixels + "vadd.s32 q8, q8, q9 \n" + "subs %2, %2, #4 \n" // 4 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#undef LOAD2_DATA32_LANE + +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/scale_neon64.cc b/libs/libaom/src/third_party/libyuv/source/scale_neon64.cc new file mode 100644 index 000000000..1d5519357 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/scale_neon64.cc @@ -0,0 +1,1042 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// Read 32x1 throw away even pixels, and write 16x1. 
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + // load even pixels into v0, odd into v1 + MEMACCESS(0) + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + MEMACCESS(1) + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); +} + +// Read 32x1 average down and write 16x1. +void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc + "subs %w2, %w2, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // add adjacent + "uaddlp v1.8h, v1.16b \n" + "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack + "rshrn2 v0.16b, v1.8h, #1 \n" + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); +} + +// Read 32x2 average down and write 16x1. +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc + MEMACCESS(1) + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent + "uaddlp v1.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1 + "uadalp v1.8h, v3.16b \n" + "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack + "rshrn2 v0.16b, v1.8h, #2 \n" + MEMACCESS(2) + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "st1 {v2.8b}, [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc" + ); +} + +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride; + const uint8* src_ptr2 = src_ptr + src_stride * 2; + const uint8* src_ptr3 = src_ptr + src_stride * 3; +asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 + MEMACCESS(3) + "ld1 {v1.16b}, [%2], #16 \n" + MEMACCESS(4) + "ld1 {v2.16b}, [%3], #16 \n" + MEMACCESS(5) + "ld1 {v3.16b}, [%4], #16 \n" + "subs %w5, %w5, #4 \n" + "uaddlp v0.8h, v0.16b \n" + "uadalp v0.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" + "uadalp v0.8h, v3.16b \n" + "addp v0.8h, v0.8h, v0.8h \n" + "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding + MEMACCESS(1) + "st1 {v0.s}[0], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(src_ptr2), // %3 + "+r"(src_ptr3), // %4 + "+r"(dst_width) // %5 + : + : "v0", "v1", "v2", "v3", "memory", "cc" + ); +} + +// Down scale from 4 to 3 pixels. 
Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #24 \n" + "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 + MEMACCESS(1) + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc" + ); +} + +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + MEMACCESS(3) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "ushll v16.8h, v4.8b, #0 \n" + "ushll v17.8h, v5.8b, #0 \n" + "ushll v18.8h, v6.8b, #0 \n" + "ushll v19.8h, v7.8b, #0 \n" + + // 3 * line_0 + line_1 + "umlal v16.8h, v0.8b, v20.8b \n" + "umlal v17.8h, v1.8b, v20.8b \n" + "umlal v18.8h, v2.8b, v20.8b \n" + "umlal v19.8h, v3.8b, v20.8b \n" + + // (3 * line_0 + line_1) >> 2 + "uqrshrn v0.8b, v16.8h, #2 \n" + "uqrshrn v1.8b, v17.8h, #2 \n" + "uqrshrn v2.8b, v18.8h, #2 \n" + "uqrshrn v3.8b, v19.8h, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v16.8h, v1.8b, #0 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v16.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v16.8h, v2.8b, #0 \n" + "umlal v16.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v16.8h, #2 \n" + + MEMACCESS(1) + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", + "v20", "memory", "cc" + ); +} + +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + MEMACCESS(3) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + // average src line 0 with src line 1 + "urhadd v0.8b, v0.8b, v4.8b \n" + "urhadd v1.8b, v1.8b, v5.8b \n" + "urhadd v2.8b, v2.8b, v6.8b \n" + "urhadd v3.8b, v3.8b, v7.8b \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v4.8h, v1.8b, #0 \n" + "umlal v4.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v4.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v4.8h, v2.8b, #0 \n" + "umlal v4.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v4.8h, #2 \n" + + MEMACCESS(1) + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc" + ); +} + +static uvec8 kShuf38 = + { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; +static uvec8 kShuf38_2 = + { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 }; +static 
vec16 kMult38_Div6 = + { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; +static vec16 kMult38_Div9 = + { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + MEMACCESS(3) + "ld1 {v3.16b}, [%3] \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #12 \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" + MEMACCESS(1) + "st1 {v2.8b}, [%1], #8 \n" + MEMACCESS(1) + "st1 {v2.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "v0", "v1", "v2", "v3", "memory", "cc" + ); +} + +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride * 2; + ptrdiff_t tmp_src_stride = src_stride; + + asm volatile ( + MEMACCESS(5) + "ld1 {v29.8h}, [%5] \n" + MEMACCESS(6) + "ld1 {v30.16b}, [%6] \n" + MEMACCESS(7) + "ld1 {v31.8h}, [%7] \n" + "add %2, %2, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + MEMACCESS(3) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + MEMACCESS(4) + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" + "subs %w4, %w4, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v20.8b, v0.8b, v1.8b \n" + "trn2 v21.8b, v0.8b, v1.8b \n" + "trn1 v22.8b, v4.8b, v5.8b \n" + "trn2 v23.8b, v4.8b, v5.8b \n" + "trn1 v24.8b, v16.8b, v17.8b \n" + "trn2 v25.8b, v16.8b, v17.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v16.8b, v18.8b, v19.8b \n" + "trn2 v17.8b, v18.8b, v19.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v20.4h, v20.8b \n" + "uaddlp v21.4h, v21.8b \n" + "uaddlp v22.4h, v22.8b \n" + "uaddlp v23.4h, v23.8b \n" + "uaddlp v24.4h, v24.8b \n" + "uaddlp v25.4h, v25.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + "uaddlp v17.4h, v17.8b \n" + + // combine source lines + "add v20.4h, v20.4h, v22.4h \n" + "add v21.4h, v21.4h, v23.4h \n" + "add v20.4h, v20.4h, v24.4h \n" + "add v21.4h, v21.4h, v25.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + "add v2.4h, v2.4h, v17.4h \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "sqrdmulh v2.8h, v2.8h, v29.8h \n" + "xtn v2.8b, v2.8h \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. 
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "ushll v16.8h, v16.8b, #0 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" + + // combine source lines + "add v0.8h, v0.8h, v16.8h \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + + // 0+1+2, 3+4+5 + "add v20.8h, v20.8h, v0.8h \n" + "add v21.8h, v21.8h, v4.8h \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v20.8h, v31.8h \n" + "sqrdmulh v1.8h, v21.8h, v31.8h \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" + + MEMACCESS(1) + "st1 {v3.8b}, [%1], #8 \n" + MEMACCESS(1) + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(src_ptr1), // %3 + "+r"(dst_width) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", + "v30", "v31", "memory", "cc" + ); +} + +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + // TODO(fbarchard): use src_stride directly for clang 3.5+. + ptrdiff_t tmp_src_stride = src_stride; + asm volatile ( + MEMACCESS(4) + "ld1 {v30.8h}, [%4] \n" + MEMACCESS(5) + "ld1 {v31.16b}, [%5] \n" + "add %2, %2, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + MEMACCESS(3) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "subs %w3, %w3, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v16.8b, v0.8b, v1.8b \n" + "trn2 v17.8b, v0.8b, v1.8b \n" + "trn1 v18.8b, v4.8b, v5.8b \n" + "trn2 v19.8b, v4.8b, v5.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v16.4h, v16.8b \n" + "uaddlp v17.4h, v17.8b \n" + "uaddlp v18.4h, v18.8b \n" + "uaddlp v19.4h, v19.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + + // combine source lines + "add v16.4h, v16.4h, v18.4h \n" + "add v17.4h, v17.4h, v19.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "uqrshrn v2.8b, v2.8h, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. 
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + + // combine source lines + "uaddl v0.8h, v0.8b, v4.8b \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + + // 0+1+2, 3+4+5 + "add v16.8h, v16.8h, v0.8h \n" + "add v17.8h, v17.8h, v4.8h \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v16.8h, v30.8h \n" + "sqrdmulh v1.8h, v17.8h, v30.8h \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" + + MEMACCESS(1) + "st1 {v3.8b}, [%1], #8 \n" + MEMACCESS(1) + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(dst_width) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", + "v18", "v19", "v30", "v31", "memory", "cc" + ); +} + +void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + const uint8* src_tmp = NULL; + asm volatile ( + "1: \n" + "mov %0, %1 \n" + "mov w12, %w5 \n" + "eor v2.16b, v2.16b, v2.16b \n" + "eor v3.16b, v3.16b, v3.16b \n" + "2: \n" + // load 16 pixels into q0 + MEMACCESS(0) + "ld1 {v0.16b}, [%0], %3 \n" + "uaddw2 v3.8h, v3.8h, v0.16b \n" + "uaddw v2.8h, v2.8h, v0.8b \n" + "subs w12, w12, #1 \n" + "b.gt 2b \n" + MEMACCESS(2) + "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels + "add %1, %1, #16 \n" + "subs %w4, %w4, #16 \n" // 16 processed per loop + "b.gt 1b \n" + : "+r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "ld2 {v4.b, v5.b}["#n"], [%6] \n" + +void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8* src_tmp = src_ptr; + int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. 
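+  // x and dx are 16.16 fixed point: x >> 16 indexes the source pixel and the
+  // low 16 bits are the blend weight used by the vector code below. A scalar
+  // sketch of one output pixel (illustrative only):
+  //   int a = src_ptr[x >> 16], b = src_ptr[(x >> 16) + 1];
+  //   *dst_ptr++ = (uint8)(a + (((b - a) * (x & 0xffff)) >> 16)); x += dx;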
+ int64 x64 = (int64) x; + int64 dx64 = (int64) dx; + asm volatile ( + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v3.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "add v1.4s, v1.4s, v0.4s \n" + // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx + "add v2.4s, v1.4s, v3.4s \n" + "shl v0.4s, v3.4s, #1 \n" // 8 * dx + "1: \n" + LOAD2_DATA8_LANE(0) + LOAD2_DATA8_LANE(1) + LOAD2_DATA8_LANE(2) + LOAD2_DATA8_LANE(3) + LOAD2_DATA8_LANE(4) + LOAD2_DATA8_LANE(5) + LOAD2_DATA8_LANE(6) + LOAD2_DATA8_LANE(7) + "mov v6.16b, v1.16b \n" + "mov v7.16b, v2.16b \n" + "uzp1 v6.8h, v6.8h, v7.8h \n" + "ushll v4.8h, v4.8b, #0 \n" + "ushll v5.8h, v5.8b, #0 \n" + "ssubl v16.4s, v5.4h, v4.4h \n" + "ssubl2 v17.4s, v5.8h, v4.8h \n" + "ushll v7.4s, v6.4h, #0 \n" + "ushll2 v6.4s, v6.8h, #0 \n" + "mul v16.4s, v16.4s, v7.4s \n" + "mul v17.4s, v17.4s, v6.4s \n" + "shrn v6.4h, v16.4s, #16 \n" + "shrn2 v6.8h, v17.4s, #16 \n" + "add v4.8h, v4.8h, v6.8h \n" + "xtn v4.8b, v4.8h \n" + + MEMACCESS(0) + "st1 {v4.8b}, [%0], #8 \n" // store pixels + "add v1.4s, v1.4s, v0.4s \n" + "add v2.4s, v2.4s, v0.4s \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width64), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", + "v4", "v5", "v6", "v7", "v16", "v17" + ); +} + +#undef LOAD2_DATA8_LANE + +// 16x2 -> 16x1 +void ScaleFilterRows_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + int y_fraction = 256 - source_y_fraction; + asm volatile ( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "add %2, %2, %1 \n" + "cmp %w4, #64 \n" + "b.eq 75f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + "cmp %w4, #192 \n" + "b.eq 25f \n" + + "dup v5.8b, %w4 \n" + "dup v4.8b, %w5 \n" + // General purpose row blend. + "1: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v6.8h, v0.8b, v4.8b \n" + "umull2 v7.8h, v0.16b, v4.16b \n" + "umlal v6.8h, v1.8b, v5.8b \n" + "umlal2 v7.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v6.8h, #8 \n" + "rshrn2 v0.16b, v7.8h, #8 \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v0.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. 
+ "100: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + MEMACCESS(0) + "st1 {v0.b}[15], [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction),// %4 + "+r"(y_fraction) // %5 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc" + ); +} + +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + // load even pixels into q0, odd into q1 + MEMACCESS (0) + "ld2 {v0.4s, v1.4s}, [%0], #32 \n" + MEMACCESS (0) + "ld2 {v2.4s, v3.4s}, [%0], #32 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + MEMACCESS (1) + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + MEMACCESS (1) + "st1 {v3.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r" (src_ptr), // %0 + "+r" (dst), // %1 + "+r" (dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS (0) + // load 8 ARGB pixels. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack + "rshrn v1.8b, v1.8h, #1 \n" + "rshrn v2.8b, v2.8h, #1 \n" + "rshrn v3.8b, v3.8h, #1 \n" + MEMACCESS (1) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + MEMACCESS (0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + MEMACCESS (1) + "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. + "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. + "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack + "rshrn v1.8b, v1.8h, #2 \n" + "rshrn v2.8b, v2.8h, #2 \n" + "rshrn v3.8b, v3.8h, #2 \n" + MEMACCESS (2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r" (src_ptr), // %0 + "+r" (src_stride), // %1 + "+r" (dst), // %2 + "+r" (dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. 
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, uint8* dst_argb, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.s}[0], [%0], %3 \n" + MEMACCESS(0) + "ld1 {v0.s}[1], [%0], %3 \n" + MEMACCESS(0) + "ld1 {v0.s}[2], [%0], %3 \n" + MEMACCESS(0) + "ld1 {v0.s}[3], [%0], %3 \n" + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((int64)(src_stepx * 4)) // %3 + : "memory", "cc", "v0" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +// TODO(Yang Zhang): Might be worth another optimization pass in future. +// It could be upgraded to 8 pixels at a time to start with. +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + asm volatile ( + "add %1, %1, %0 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 + MEMACCESS(1) + "ld1 {v1.8b}, [%1], %4 \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0], %4 \n" + MEMACCESS(1) + "ld1 {v3.8b}, [%1], %4 \n" + MEMACCESS(0) + "ld1 {v4.8b}, [%0], %4 \n" + MEMACCESS(1) + "ld1 {v5.8b}, [%1], %4 \n" + MEMACCESS(0) + "ld1 {v6.8b}, [%0], %4 \n" + MEMACCESS(1) + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. + "subs %w3, %w3, #4 \n" // 4 pixels per loop. + MEMACCESS(2) + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"((int64)(src_stepx * 4)) // %4 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD1_DATA32_LANE(vn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "ld1 {"#vn".s}["#n"], [%6] \n" + +void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint8* src_tmp = src_argb; + int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. 
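+  // Point sampling in 16.16 fixed point: each LOAD1_DATA32_LANE below reads
+  // the 32-bit ARGB pixel at src_argb + (x >> 16) * 4, then steps x += dx.
+  // A scalar sketch of one output pixel (illustrative only):
+  //   ((uint32*)dst_argb)[i] = *(const uint32*)(src_argb + (x >> 16) * 4);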
+ int64 x64 = (int64) x; + int64 dx64 = (int64) dx; + int64 tmp64 = 0; + asm volatile ( + "1: \n" + LOAD1_DATA32_LANE(v0, 0) + LOAD1_DATA32_LANE(v0, 1) + LOAD1_DATA32_LANE(v0, 2) + LOAD1_DATA32_LANE(v0, 3) + LOAD1_DATA32_LANE(v1, 0) + LOAD1_DATA32_LANE(v1, 1) + LOAD1_DATA32_LANE(v1, 2) + LOAD1_DATA32_LANE(v1, 3) + + MEMACCESS(0) + "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width64), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "+r"(tmp64), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1" + ); +} + +#undef LOAD1_DATA32_LANE + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA32_LANE(vn1, vn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n" + +void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8* src_tmp = src_argb; + int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. + int64 x64 = (int64) x; + int64 dx64 = (int64) dx; + asm volatile ( + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v6.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" + "movi v3.16b, #0x7f \n" // 0x7F + "movi v4.8h, #0x7f \n" // 0x7F + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "add v5.4s, v1.4s, v0.4s \n" + "1: \n" + // d0, d1: a + // d2, d3: b + LOAD2_DATA32_LANE(v0, v1, 0) + LOAD2_DATA32_LANE(v0, v1, 1) + LOAD2_DATA32_LANE(v0, v1, 2) + LOAD2_DATA32_LANE(v0, v1, 3) + "shrn v2.4h, v5.4s, #9 \n" + "and v2.8b, v2.8b, v4.8b \n" + "dup v16.8b, v2.b[0] \n" + "dup v17.8b, v2.b[2] \n" + "dup v18.8b, v2.b[4] \n" + "dup v19.8b, v2.b[6] \n" + "ext v2.8b, v16.8b, v17.8b, #4 \n" + "ext v17.8b, v18.8b, v19.8b, #4 \n" + "ins v2.d[1], v17.d[0] \n" // f + "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f + "umull v16.8h, v0.8b, v7.8b \n" + "umull2 v17.8h, v0.16b, v7.16b \n" + "umull v18.8h, v1.8b, v2.8b \n" + "umull2 v19.8h, v1.16b, v2.16b \n" + "add v16.8h, v16.8h, v18.8h \n" + "add v17.8h, v17.8h, v19.8h \n" + "shrn v0.8b, v16.8h, #7 \n" + "shrn2 v0.16b, v17.8h, #7 \n" + + MEMACCESS(0) + "st1 {v0.4s}, [%0], #16 \n" // store pixels + "add v5.4s, v5.4s, v6.4s \n" + "subs %w2, %w2, #4 \n" // 4 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width64), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v16", "v17", "v18", "v19" + ); +} + +#undef LOAD2_DATA32_LANE + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/scale_win.cc b/libs/libaom/src/third_party/libyuv/source/scale_win.cc new file mode 100644 index 000000000..c3896ebad --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/scale_win.cc @@ -0,0 +1,1354 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for Visual C x86. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ + defined(_MSC_VER) && !defined(__clang__) + +// Offsets for source bytes 0 to 9 +static uvec8 kShuf0 = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static uvec8 kShuf1 = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf2 = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +static uvec8 kShuf01 = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static uvec8 kShuf11 = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf21 = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +static uvec8 kMadd01 = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +static uvec8 kMadd11 = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +static uvec8 kMadd21 = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +static vec16 kRound34 = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +static uvec8 kShuf38a = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +static uvec8 kShuf38b = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +static uvec8 kShufAc = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +static uvec8 kShufAc3 = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +static uvec16 kScaleAc33 = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb0 = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb1 = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb2 = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +static uvec16 kScaleAb2 = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; + +// Reads 32 pixels, throws half away and writes 16 pixels. +__declspec(naked) +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // isolate odd pixels. 
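+    // psrlw keeps the odd-numbered byte of each 16-bit pair; packuswb below
+    // re-narrows both registers, so every second pixel survives.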
+ psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + ret + } +} + +// Blends 32x1 rectangle to 16x1. +__declspec(naked) +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 16x1. +__declspec(naked) +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + pop esi + ret + } +} + +#ifdef HAS_SCALEROWDOWN2_AVX2 +// Reads 64 pixels, throws half away and writes 32 pixels. +__declspec(naked) +void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // isolate odd pixels. + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + vzeroupper + ret + } +} + +// Blends 64x1 rectangle to 32x1. +__declspec(naked) +void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpsrlw ymm4, ymm4, 15 + vpackuswb ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 // constant 0 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + + vpmaddubsw ymm0, ymm0, ymm4 // average horizontally + vpmaddubsw ymm1, ymm1, ymm4 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + vzeroupper + ret + } +} + +// Blends 64x2 rectangle to 32x1. 
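+// As in the linear version above, vpmaddubsw against the 0x01 byte vector
+// sums each adjacent pixel pair, and vpavgw against zero then produces the
+// rounded average (a + b + 1) >> 1; here it runs after the row average.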
+__declspec(naked) +void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpsrlw ymm4, ymm4, 15 + vpackuswb ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 // constant 0 + + wloop: + vmovdqu ymm0, [eax] // average rows + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + + vpmaddubsw ymm0, ymm0, ymm4 // average horizontally + vpmaddubsw ymm1, ymm1, ymm4 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_SCALEROWDOWN2_AVX2 + +// Point samples 32 pixels to 8 pixels. +__declspec(naked) +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 + psrld xmm5, 24 + pslld xmm5, 16 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm0, 8 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg wloop + + ret + } +} + +// Blends 32x4 rectangle to 8x1. +__declspec(naked) +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + wloop: + movdqu xmm0, [eax] // average rows + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + movdqu xmm2, [eax + esi * 2] + movdqu xmm3, [eax + esi * 2 + 16] + movdqu xmm4, [eax + edi] + movdqu xmm5, [eax + edi + 16] + lea eax, [eax + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm7 + pand xmm3, xmm7 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa xmm2, xmm0 // average columns (16 to 8 pixels) + psrlw xmm0, 8 + pand xmm2, xmm7 + pavgw xmm0, xmm2 + packuswb xmm0, xmm0 + + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg wloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_SCALEROWDOWN4_AVX2 +// Point samples 64 pixels to 16 pixels. 
+__declspec(naked) +void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 + vpsrld ymm5, ymm5, 24 + vpslld ymm5, ymm5, 16 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpsrlw ymm0, ymm0, 8 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + vzeroupper + ret + } +} + +// Blends 64x4 rectangle to 16x1. +__declspec(naked) +void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff + vpsrlw ymm7, ymm7, 8 + + wloop: + vmovdqu ymm0, [eax] // average rows + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + vmovdqu ymm2, [eax + esi * 2] + vmovdqu ymm3, [eax + esi * 2 + 32] + vpavgb ymm2, ymm2, [eax + edi] + vpavgb ymm3, ymm3, [eax + edi + 32] + lea eax, [eax + 64] + vpavgb ymm0, ymm0, ymm2 + vpavgb ymm1, ymm1, ymm3 + + vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels) + vpand ymm3, ymm1, ymm7 + vpsrlw ymm0, ymm0, 8 + vpsrlw ymm1, ymm1, 8 + vpavgw ymm0, ymm0, ymm2 + vpavgw ymm1, ymm1, ymm3 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + + vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels) + vpsrlw ymm0, ymm0, 8 + vpavgw ymm0, ymm0, ymm2 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + + vmovdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_SCALEROWDOWN4_AVX2 + +// Point samples 32 pixels to 24 pixels. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +__declspec(naked) +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm3, kShuf0 + movdqa xmm4, kShuf1 + movdqa xmm5, kShuf2 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm1 + palignr xmm1, xmm0, 8 + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + 8], xmm1 + movq qword ptr [edx + 16], xmm2 + lea edx, [edx + 24] + sub ecx, 24 + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 24x1 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Register usage: +// xmm0 src_row 0 +// xmm1 src_row 1 +// xmm2 shuf 0 +// xmm3 shuf 1 +// xmm4 shuf 2 +// xmm5 madd 0 +// xmm6 madd 1 +// xmm7 kRound34 + +// Note that movdqa+palign may be better than movdqu. 
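+// With the kShuf*/kMadd* tables above, kRound34 = 2 and the final psrlw 2,
+// each group of four (row-averaged) source pixels s0..s3 yields:
+//   d0 = (3*s0 + 1*s1 + 2) >> 2
+//   d1 = (2*s1 + 2*s2 + 2) >> 2
+//   d2 = (1*s2 + 3*s3 + 2) >> 2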
+__declspec(naked) +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShuf01 + movdqa xmm3, kShuf11 + movdqa xmm4, kShuf21 + movdqa xmm5, kMadd01 + movdqa xmm6, kMadd11 + movdqa xmm7, kRound34 + + wloop: + movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm1, [eax + esi] + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx + 24] + sub ecx, 24 + jg wloop + + pop esi + ret + } +} + +// Note that movdqa+palign may be better than movdqu. +__declspec(naked) +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShuf01 + movdqa xmm3, kShuf11 + movdqa xmm4, kShuf21 + movdqa xmm5, kMadd01 + movdqa xmm6, kMadd11 + movdqa xmm7, kRound34 + + wloop: + movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm1, [eax + esi] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx+24] + sub ecx, 24 + jg wloop + + pop esi + ret + } +} + +// 3/8 point sampler + +// Scale 32 pixels to 12 +__declspec(naked) +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm4, kShuf38a + movdqa xmm5, kShuf38b + + xloop: + movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + lea eax, [eax + 32] + pshufb xmm0, xmm4 + pshufb xmm1, xmm5 + paddusb xmm0, xmm1 + + movq qword ptr [edx], xmm0 // write 12 pixels + movhlps xmm1, xmm0 + movd [edx + 8], xmm1 + lea edx, [edx + 12] + sub ecx, 12 + jg xloop + + ret + } +} + +// Scale 16x3 pixels to 6x1 with interpolation +__declspec(naked) +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, 
[esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShufAc + movdqa xmm3, kShufAc3 + movdqa xmm4, kScaleAc33 + pxor xmm5, xmm5 + + xloop: + movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 + movdqu xmm6, [eax + esi] + movhlps xmm1, xmm0 + movhlps xmm7, xmm6 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + movdqu xmm6, [eax + esi * 2] + lea eax, [eax + 16] + movhlps xmm7, xmm6 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + + movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + pshufb xmm6, xmm2 + + movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + pshufb xmm7, xmm3 + paddusw xmm6, xmm7 + + pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 + packuswb xmm6, xmm6 + + movd [edx], xmm6 // write 6 pixels + psrlq xmm6, 16 + movd [edx + 2], xmm6 + lea edx, [edx + 6] + sub ecx, 6 + jg xloop + + pop esi + ret + } +} + +// Scale 16x2 pixels to 6x1 with interpolation +__declspec(naked) +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShufAb0 + movdqa xmm3, kShufAb1 + movdqa xmm4, kShufAb2 + movdqa xmm5, kScaleAb2 + + xloop: + movdqu xmm0, [eax] // average 2 rows into xmm0 + movdqu xmm1, [eax + esi] + lea eax, [eax + 16] + pavgb xmm0, xmm1 + + movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 + pshufb xmm1, xmm2 + movdqa xmm6, xmm0 + pshufb xmm6, xmm3 + paddusw xmm1, xmm6 + pshufb xmm0, xmm4 + paddusw xmm1, xmm0 + + pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 + packuswb xmm1, xmm1 + + movd [edx], xmm1 // write 6 pixels + psrlq xmm1, 16 + movd [edx + 2], xmm1 + lea edx, [edx + 6] + sub ecx, 6 + jg xloop + + pop esi + ret + } +} + +// Reads 16 bytes and accumulates to 16 shorts at a time. +__declspec(naked) +void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { + __asm { + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr + mov ecx, [esp + 12] // src_width + pxor xmm5, xmm5 + + // sum rows + xloop: + movdqu xmm3, [eax] // read 16 bytes + lea eax, [eax + 16] + movdqu xmm0, [edx] // read 16 words from destination + movdqu xmm1, [edx + 16] + movdqa xmm2, xmm3 + punpcklbw xmm2, xmm5 + punpckhbw xmm3, xmm5 + paddusw xmm0, xmm2 // sum 16 words + paddusw xmm1, xmm3 + movdqu [edx], xmm0 // write 16 words to destination + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 16 + jg xloop + ret + } +} + +#ifdef HAS_SCALEADDROW_AVX2 +// Reads 32 bytes and accumulates to 32 shorts at a time. 
+__declspec(naked) +void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { + __asm { + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr + mov ecx, [esp + 12] // src_width + vpxor ymm5, ymm5, ymm5 + + // sum rows + xloop: + vmovdqu ymm3, [eax] // read 32 bytes + lea eax, [eax + 32] + vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck + vpunpcklbw ymm2, ymm3, ymm5 + vpunpckhbw ymm3, ymm3, ymm5 + vpaddusw ymm0, ymm2, [edx] // sum 16 words + vpaddusw ymm1, ymm3, [edx + 32] + vmovdqu [edx], ymm0 // write 32 words to destination + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 32 + jg xloop + + vzeroupper + ret + } +} +#endif // HAS_SCALEADDROW_AVX2 + +// Bilinear column filtering. SSSE3 version. +__declspec(naked) +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + __asm { + push ebx + push esi + push edi + mov edi, [esp + 12 + 4] // dst_ptr + mov esi, [esp + 12 + 8] // src_ptr + mov ecx, [esp + 12 + 12] // dst_width + movd xmm2, [esp + 12 + 16] // x + movd xmm3, [esp + 12 + 20] // dx + mov eax, 0x04040000 // shuffle to line up fractions with pixel. + movd xmm5, eax + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm1, 9 // 7 bit fractions. + movzx ebx, word ptr [esi + edx] // 2 source x1 pixels + movd xmm4, ebx + pshufb xmm1, xmm5 // 0011 + punpcklwd xmm0, xmm4 + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // 8 bits, 2 pixels. + movd ebx, xmm0 + mov [edi], bx + lea edi, [edi + 2] + sub ecx, 2 // 2 pixels + jge xloop2 + + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm2, 9 // 7 bit fractions. + pshufb xmm2, xmm5 // 0011 + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // 16 bit + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // 8 bits + movd ebx, xmm0 + mov [edi], bl + + xloop99: + + pop edi + pop esi + pop ebx + ret + } +} + +// Reads 16 pixels, duplicates them and writes 32 pixels. 
+__declspec(naked) +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + __asm { + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width + + wloop: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + ret + } +} + +// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) +__declspec(naked) +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + shufps xmm0, xmm1, 0xdd + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + ret + } +} + +// Blends 8x1 rectangle to 4x1. +__declspec(naked) +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + ret + } +} + +// Blends 8x2 rectangle to 4x1. +__declspec(naked) +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + pop esi + ret + } +} + +// Reads 4 pixels at a time. +__declspec(naked) +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + __asm { + push ebx + push edi + mov eax, [esp + 8 + 4] // src_argb + // src_stride ignored + mov ebx, [esp + 8 + 12] // src_stepx + mov edx, [esp + 8 + 16] // dst_argb + mov ecx, [esp + 8 + 20] // dst_width + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + wloop: + movd xmm0, [eax] + movd xmm1, [eax + ebx] + punpckldq xmm0, xmm1 + movd xmm2, [eax + ebx * 2] + movd xmm3, [eax + edi] + lea eax, [eax + ebx * 4] + punpckldq xmm2, xmm3 + punpcklqdq xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + pop edi + pop ebx + ret + } +} + +// Blends four 2x2 to 4x1. 
+__declspec(naked) +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_argb + mov esi, [esp + 12 + 8] // src_stride + mov ebx, [esp + 12 + 12] // src_stepx + mov edx, [esp + 12 + 16] // dst_argb + mov ecx, [esp + 12 + 20] // dst_width + lea esi, [eax + esi] // row1 pointer + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + wloop: + movq xmm0, qword ptr [eax] // row0 4 pairs + movhps xmm0, qword ptr [eax + ebx] + movq xmm1, qword ptr [eax + ebx * 2] + movhps xmm1, qword ptr [eax + edi] + lea eax, [eax + ebx * 4] + movq xmm2, qword ptr [esi] // row1 4 pairs + movhps xmm2, qword ptr [esi + ebx] + movq xmm3, qword ptr [esi + ebx * 2] + movhps xmm3, qword ptr [esi + edi] + lea esi, [esi + ebx * 4] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + pop edi + pop esi + pop ebx + ret + } +} + +// Column scaling unfiltered. SSE2 version. +__declspec(naked) +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + push edi + push esi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + + pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 + pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 + paddd xmm2, xmm0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 2 + pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 + paddd xmm2, xmm0 // x3 x2 x1 x0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 4 + pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 + + pextrw eax, xmm2, 1 // get x0 integer. + pextrw edx, xmm2, 3 // get x1 integer. + + cmp ecx, 0 + jle xloop99 + sub ecx, 4 + jl xloop49 + + // 4 Pixel loop. + xloop4: + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + pextrw edx, xmm2, 7 // get x3 integer. + paddd xmm2, xmm3 // x += dx + punpckldq xmm0, xmm1 // x0 x1 + + movd xmm1, [esi + eax * 4] // 1 source x2 pixels + movd xmm4, [esi + edx * 4] // 1 source x3 pixels + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + punpckldq xmm1, xmm4 // x2 x3 + punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 // 4 pixels + jge xloop4 + + xloop49: + test ecx, 2 + je xloop29 + + // 2 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + punpckldq xmm0, xmm1 // x0 x1 + + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + + xloop29: + test ecx, 1 + je xloop99 + + // 1 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x2 pixels + movd dword ptr [edi], xmm0 + xloop99: + + pop esi + pop edi + ret + } +} + +// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. 
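+// x and dx are 16.16 fixed point: word 1 of each 32-bit lane holds the
+// integer source index, and the top 7 bits of the fraction f (psrlw 9)
+// weight the two source pixels, so each channel is effectively
+//   dst = (src[xi] * (127 - f) + src[xi + 1] * f) >> 7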
+// TODO(fbarchard): Port to Neon + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +__declspec(naked) +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + movdqa xmm4, kShuffleColARGB + movdqa xmm5, kShuffleFractions + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + psrlw xmm1, 9 // 7 bit fractions. + movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels + pshufb xmm1, xmm5 // 0000000011111111 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 2 // 2 pixels + jge xloop2 + + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + psrlw xmm2, 9 // 7 bit fractions. + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + pshufb xmm2, xmm5 // 00000000 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. + psrlw xmm0, 7 + packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. + movd [edi], xmm0 + + xloop99: + + pop edi + pop esi + ret + } +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +__declspec(naked) +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + mov edx, [esp + 4] // dst_argb + mov eax, [esp + 8] // src_argb + mov ecx, [esp + 12] // dst_width + + wloop: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpckldq xmm0, xmm0 + punpckhdq xmm1, xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg wloop + + ret + } +} + +// Divide num by div and return as 16.16 fixed point result. +__declspec(naked) +int FixedDiv_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + idiv dword ptr [esp + 8] + ret + } +} + +// Divide num by div and return as 16.16 fixed point result. 
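+// Equivalent C for FixedDiv1 (a sketch; int64 is libyuv's 64-bit type):
+//   return (int)((((int64)num << 16) - 0x00010001) / (div - 1));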
+__declspec(naked) +int FixedDiv1_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + mov ecx, [esp + 8] // denom + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + sub eax, 0x00010001 + sbb edx, 0 + sub ecx, 1 + idiv ecx + ret + } +} +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/video_common.cc b/libs/libaom/src/third_party/libyuv/source/video_common.cc new file mode 100644 index 000000000..379a0669a --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/video_common.cc @@ -0,0 +1,64 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0])) + +struct FourCCAliasEntry { + uint32 alias; + uint32 canonical; +}; + +static const struct FourCCAliasEntry kFourCCAliases[] = { + {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU16, FOURCC_I422}, + {FOURCC_YU24, FOURCC_I444}, + {FOURCC_YUYV, FOURCC_YUY2}, + {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs + {FOURCC_HDYC, FOURCC_UYVY}, + {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 + {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. + {FOURCC_DMB1, FOURCC_MJPG}, + {FOURCC_BA81, FOURCC_BGGR}, // deprecated. + {FOURCC_RGB3, FOURCC_RAW }, + {FOURCC_BGR3, FOURCC_24BG}, + {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB + {FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB + {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 + {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 + {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 +}; +// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB. +// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA + +LIBYUV_API +uint32 CanonicalFourCC(uint32 fourcc) { + int i; + for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) { + if (kFourCCAliases[i].alias == fourcc) { + return kFourCCAliases[i].canonical; + } + } + // Not an alias, so return it as-is. + return fourcc; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/libs/libaom/src/third_party/libyuv/source/x86inc.asm b/libs/libaom/src/third_party/libyuv/source/x86inc.asm new file mode 100644 index 000000000..cb5c32df3 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/x86inc.asm @@ -0,0 +1,1136 @@ +;***************************************************************************** +;* x86inc.asm: x264asm abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2012 x264 project +;* +;* Authors: Loren Merritt +;* Anton Mitrofanov +;* Jason Garrett-Glaser +;* Henrik Gramner +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. 
+;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. Send patches or ideas +; to x264-devel@videolan.org . + +; Local changes for libyuv: +; remove %define program_name and references in labels +; rename cpus to uppercase + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +; Name of the .rodata section. +; Kludge: Something on OS X fails to align .rodata even given an align attribute, +; so use a different read-only section. +%macro SECTION_RODATA 0-1 16 + %ifidn __OUTPUT_FORMAT__,macho64 + SECTION .text align=%1 + %elifidn __OUTPUT_FORMAT__,macho + SECTION .text align=%1 + fakegot: + %elifidn __OUTPUT_FORMAT__,aout + section .text + %else + SECTION .rodata align=%1 + %endif +%endmacro + +; aout does not support align= +%macro SECTION_TEXT 0-1 16 + %ifidn __OUTPUT_FORMAT__,aout + SECTION .text + %else + SECTION .text align=%1 + %endif +%endmacro + +%if WIN64 + %define PIC +%elif ARCH_X86_64 == 0 +; x86_32 doesn't require PIC. +; Some distros prefer shared objects to be PIC, but nothing breaks if +; the code contains a few textrels, so we'll skip that complexity. + %undef PIC +%endif +%ifdef PIC + default rel +%endif + +; Always use long nops (reduces 0x90 spam in disassembly on x86_32) +CPU amdnop + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,0, dst, src, tmp +; declares a function (foo), taking two args (dst and src) and one local variable (tmp) + +; TODO Some functions can use some args directly from the stack. 
If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons +; which are slow when a normal ret follows a branch. + +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rsp + stack_offset + %3] + %define r%1mp qword r %+ %1m + %else + %define r%1m [esp + stack_offset + %3] + %define r%1mp dword r %+ %1m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 +%if ARCH_X86_64 == 0 + %define r%1 e%1 +%endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro PUSH 1 + push %1 + %assign stack_offset stack_offset+gprsize +%endmacro + +%macro POP 1 + pop %1 + %assign stack_offset stack_offset-gprsize +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rsp + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rsp + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assert failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 + %xdefine %1q r 
%+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + +%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R12, 96 +DECLARE_REG 12, R13, 104 +DECLARE_REG 13, R14, 112 +DECLARE_REG 14, R15, 120 + +%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + %if mmsize == 8 + %assign xmm_regs_used 0 + %else + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS %4 +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + %if xmm_regs_used > 6 + SUB rsp, (xmm_regs_used-6)*16+16 + %assign %%i xmm_regs_used + %rep (xmm_regs_used-6) + %assign %%i %%i-1 + movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i + %endrep + %endif +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 1 + %if xmm_regs_used > 6 + %assign %%i xmm_regs_used + %rep (xmm_regs_used-6) + %assign %%i %%i-1 + movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)] + %endrep + add %1, (xmm_regs_used-6)*16+16 + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 1 + WIN64_RESTORE_XMM_INTERNAL %1 + %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL rsp + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 +%if mmsize == 32 + vzeroupper +%endif + ret +%endmacro + +%elif ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R12, 48 +DECLARE_REG 12, R13, 56 +DECLARE_REG 13, R14, 64 +DECLARE_REG 14, R15, 72 + +%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS %4 +%endmacro + +%define has_epilogue regs_used > 9 || mmsize == 32 + +%macro RET 0 + POP_IF_USED 14, 13, 12, 11, 10, 9 +%if mmsize == 32 + vzeroupper +%endif + ret +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [esp + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + %if regs_used > 7 + %assign regs_used 7 + %endif + ASSERT regs_used >= num_args + PUSH_IF_USED 3, 4, 5, 6 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + DEFINE_ARGS %4 +%endmacro + +%define has_epilogue regs_used > 3 || mmsize == 32 + +%macro RET 0 + POP_IF_USED 6, 5, 4, 3 +%if mmsize == 32 + vzeroupper +%endif + ret +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 +%macro WIN64_SPILL_XMM 1 +%endmacro +%macro WIN64_RESTORE_XMM 1 +%endmacro +%endif + +%macro REP_RET 0 + %if has_epilogue + RET + %else + rep ret + %endif +%endmacro + +%macro TAIL_CALL 2 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. +; Appends cpuflags to the function name if cpuflags has been specified. +%macro cglobal 1-2+ ; name, [PROLOGUE args] +%if %0 == 1 + cglobal_internal %1 %+ SUFFIX +%else + cglobal_internal %1 %+ SUFFIX, %2 +%endif +%endmacro +%macro cglobal_internal 1-2+ + %ifndef cglobaled_%1 + %xdefine %1 mangle(%1) + %xdefine %1.skip_prologue %1 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %1, 1 + %endif + %xdefine current_function %1 + %ifidn __OUTPUT_FORMAT__,elf + global %1:function hidden + %else + global %1 + %endif + align function_align + %1: + RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer + %assign stack_offset 0 + %if %0 > 1 + PROLOGUE %2 + %endif +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(%1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix +%macro cextern_naked 1 + %xdefine %1 mangle(%1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 2+ + %xdefine %1 mangle(%1) + global %1 + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is +; executable by default. 
+%ifidn __OUTPUT_FORMAT__,elf +SECTION .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif + +; cpuflags + +%assign cpuflags_MMX (1<<0) +%assign cpuflags_MMX2 (1<<1) | cpuflags_MMX +%assign cpuflags_3dnow (1<<2) | cpuflags_MMX +%assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow +%assign cpuflags_SSE (1<<4) | cpuflags_MMX2 +%assign cpuflags_SSE2 (1<<5) | cpuflags_SSE +%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2 +%assign cpuflags_SSE3 (1<<7) | cpuflags_SSE2 +%assign cpuflags_SSSE3 (1<<8) | cpuflags_SSE3 +%assign cpuflags_SSE4 (1<<9) | cpuflags_SSSE3 +%assign cpuflags_SSE42 (1<<10)| cpuflags_SSE4 +%assign cpuflags_AVX (1<<11)| cpuflags_SSE42 +%assign cpuflags_xop (1<<12)| cpuflags_AVX +%assign cpuflags_fma4 (1<<13)| cpuflags_AVX +%assign cpuflags_AVX2 (1<<14)| cpuflags_AVX +%assign cpuflags_fma3 (1<<15)| cpuflags_AVX + +%assign cpuflags_cache32 (1<<16) +%assign cpuflags_cache64 (1<<17) +%assign cpuflags_slowctz (1<<18) +%assign cpuflags_lzcnt (1<<19) +%assign cpuflags_misalign (1<<20) +%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<22) +%assign cpuflags_bmi1 (1<<23) +%assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1 +%assign cpuflags_tbm (1<<25)|cpuflags_bmi1 + +%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) +%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) + +; Takes up to 2 cpuflags from the above list. +; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. +; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. 
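+; e.g. "INIT_XMM SSE2" invokes INIT_CPUFLAGS SSE2, which defines
+; SUFFIX = _SSE2 and sets cpuflags so that cpuflag(SSE2) is true (and
+; cpuflag(AVX) false) for the functions that follow.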
+%macro INIT_CPUFLAGS 0-2 + %if %0 >= 1 + %xdefine cpuname %1 + %assign cpuflags cpuflags_%1 + %if %0 >= 2 + %xdefine cpuname %1_%2 + %assign cpuflags cpuflags | cpuflags_%2 + %endif + %xdefine SUFFIX _ %+ cpuname + %if cpuflag(AVX) + %assign AVX_enabled 1 + %endif + %if mmsize == 16 && notcpuflag(SSE2) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elifidn %1, SSE3 + %define movu lddqu + %endif + %else + %xdefine SUFFIX + %undef cpuname + %undef cpuflags + %endif +%endmacro + +; merge MMX and SSE* + +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro INIT_MMX 0-1+ + %assign AVX_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define num_mmregs 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + %assign %%i 0 + %rep 8 + CAT_XDEFINE m, %%i, mm %+ %%i + CAT_XDEFINE nmm, %%i, %%i + %assign %%i %%i+1 + %endrep + %rep 8 + CAT_UNDEF m, %%i + CAT_UNDEF nmm, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_XMM 0-1+ + %assign AVX_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, xmm %+ %%i + CAT_XDEFINE nxmm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_YMM 0-1+ + %assign AVX_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova vmovaps + %define movu vmovups + %undef movh + %define movnta vmovntps + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, ymm %+ %%i + CAT_XDEFINE nymm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +INIT_XMM + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap +%rep %0/2 + %xdefine tmp%2 m%2 + %xdefine ntmp%2 nm%2 + %rotate 2 +%endrep +%rep %0/2 + %xdefine m%1 tmp%2 + %xdefine nm%1 ntmp%2 + %undef tmp%2 + %undef ntmp%2 + %rotate 2 +%endrep +%endmacro + +%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs) +%rep %0-1 +%ifdef m%1 + %xdefine tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 tmp + CAT_XDEFINE n, m%1, %1 + CAT_XDEFINE n, m%2, %2 +%else + ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here. + ; Be careful using this mode in nested macros though, as in some cases there may be + ; other copies of m# that have already been dereferenced and don't get updated correctly. 
+ %xdefine %%n1 n %+ %1 + %xdefine %%n2 n %+ %2 + %xdefine tmp m %+ %%n1 + CAT_XDEFINE m, %%n1, m %+ %%n2 + CAT_XDEFINE m, %%n2, tmp + CAT_XDEFINE n, m %+ %%n1, %%n1 + CAT_XDEFINE n, m %+ %%n2, %%n2 +%endif + %undef tmp + %rotate 1 +%endrep +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. +%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE %%f, %%i, m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 1 ; name to load from + %ifdef %1_m0 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1_m %+ %%i + CAT_XDEFINE n, m %+ %%i, %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + call_internal %1, %1 %+ SUFFIX +%endmacro +%macro call_internal 2 + %xdefine %%i %1 + %ifndef cglobaled_%1 + %ifdef cglobaled_%2 + %xdefine %%i %2 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 16 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 +%assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-AVX emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == 1 if float, 0 if int +;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm) +;%4 == number of operands given +;%5+: operands +%macro RUN_AVX_INSTR 6-7+ + %ifid %6 + %define %%sizeofreg sizeof%6 + %elifid %5 + %define %%sizeofreg sizeof%5 + %else + %define %%sizeofreg mmsize + %endif + %if %%sizeofreg==32 + %if %4>=3 + v%1 %5, %6, %7 + %else + v%1 %5, %6 + %endif + %else + %if %%sizeofreg==8 + %define %%regmov movq + %elif %2 + %define %%regmov movaps + %else + %define %%regmov movdqa + %endif + + %if %4>=3+%3 + %ifnidn %5, %6 + %if AVX_enabled && %%sizeofreg==16 + v%1 %5, %6, %7 + %else + CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7 + %%regmov %5, %6 + %1 %5, %7 + %endif + %else + %1 %5, %7 + %endif + %elif %4>=3 + %1 %5, %6, %7 + %else + %1 %5, %6 + %endif + %endif +%endmacro + +; 3arg AVX ops with a memory arg can only have it in src2, +; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov). +; So, if the op is symmetric and the wrong one is memory, swap them. 
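+; e.g. emulating "paddw m0, m1, [r2]" (symmetric op, memory in src2)
+; becomes "movdqa m0, [r2]" + "paddw m0, m1", letting the mov do the load;
+; under AVX the swap goes the other way, since v-ops only take memory src2.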
+%macro RUN_AVX_INSTR1 8 + %assign %%swap 0 + %if AVX_enabled + %ifnid %6 + %assign %%swap 1 + %endif + %elifnidn %5, %6 + %ifnid %7 + %assign %%swap 1 + %endif + %endif + %if %%swap && %3 == 0 && %8 == 1 + RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6 + %else + RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7 + %endif +%endmacro + +;%1 == instruction +;%2 == 1 if float, 0 if int +;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm) +;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 4 + %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4 + %ifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, 2, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +AVX_INSTR addpd, 1, 0, 1 +AVX_INSTR addps, 1, 0, 1 +AVX_INSTR addsd, 1, 0, 1 +AVX_INSTR addss, 1, 0, 1 +AVX_INSTR addsubpd, 1, 0, 0 +AVX_INSTR addsubps, 1, 0, 0 +AVX_INSTR andpd, 1, 0, 1 +AVX_INSTR andps, 1, 0, 1 +AVX_INSTR andnpd, 1, 0, 0 +AVX_INSTR andnps, 1, 0, 0 +AVX_INSTR blendpd, 1, 0, 0 +AVX_INSTR blendps, 1, 0, 0 +AVX_INSTR blendvpd, 1, 0, 0 +AVX_INSTR blendvps, 1, 0, 0 +AVX_INSTR cmppd, 1, 0, 0 +AVX_INSTR cmpps, 1, 0, 0 +AVX_INSTR cmpsd, 1, 0, 0 +AVX_INSTR cmpss, 1, 0, 0 +AVX_INSTR cvtdq2ps, 1, 0, 0 +AVX_INSTR cvtps2dq, 1, 0, 0 +AVX_INSTR divpd, 1, 0, 0 +AVX_INSTR divps, 1, 0, 0 +AVX_INSTR divsd, 1, 0, 0 +AVX_INSTR divss, 1, 0, 0 +AVX_INSTR dppd, 1, 1, 0 +AVX_INSTR dpps, 1, 1, 0 +AVX_INSTR haddpd, 1, 0, 0 +AVX_INSTR haddps, 1, 0, 0 +AVX_INSTR hsubpd, 1, 0, 0 +AVX_INSTR hsubps, 1, 0, 0 +AVX_INSTR maxpd, 1, 0, 1 +AVX_INSTR maxps, 1, 0, 1 +AVX_INSTR maxsd, 1, 0, 1 +AVX_INSTR maxss, 1, 0, 1 +AVX_INSTR minpd, 1, 0, 1 +AVX_INSTR minps, 1, 0, 1 +AVX_INSTR minsd, 1, 0, 1 +AVX_INSTR minss, 1, 0, 1 +AVX_INSTR movhlps, 1, 0, 0 +AVX_INSTR movlhps, 1, 0, 0 +AVX_INSTR movsd, 1, 0, 0 +AVX_INSTR movss, 1, 0, 0 +AVX_INSTR mpsadbw, 0, 1, 0 +AVX_INSTR mulpd, 1, 0, 1 +AVX_INSTR mulps, 1, 0, 1 +AVX_INSTR mulsd, 1, 0, 1 +AVX_INSTR mulss, 1, 0, 1 +AVX_INSTR orpd, 1, 0, 1 +AVX_INSTR orps, 1, 0, 1 +AVX_INSTR pabsb, 0, 0, 0 +AVX_INSTR pabsw, 0, 0, 0 +AVX_INSTR pabsd, 0, 0, 0 +AVX_INSTR packsswb, 0, 0, 0 +AVX_INSTR packssdw, 0, 0, 0 +AVX_INSTR packuswb, 0, 0, 0 +AVX_INSTR packusdw, 0, 0, 0 +AVX_INSTR paddb, 0, 0, 1 +AVX_INSTR paddw, 0, 0, 1 +AVX_INSTR paddd, 0, 0, 1 +AVX_INSTR paddq, 0, 0, 1 +AVX_INSTR paddsb, 0, 0, 1 +AVX_INSTR paddsw, 0, 0, 1 +AVX_INSTR paddusb, 0, 0, 1 +AVX_INSTR paddusw, 0, 0, 1 +AVX_INSTR palignr, 0, 1, 0 +AVX_INSTR pand, 0, 0, 1 +AVX_INSTR pandn, 0, 0, 0 +AVX_INSTR pavgb, 0, 0, 1 +AVX_INSTR pavgw, 0, 0, 1 +AVX_INSTR pblendvb, 0, 0, 0 +AVX_INSTR pblendw, 0, 1, 0 +AVX_INSTR pcmpestri, 0, 0, 0 +AVX_INSTR pcmpestrm, 0, 0, 0 +AVX_INSTR pcmpistri, 0, 0, 0 +AVX_INSTR pcmpistrm, 0, 0, 0 +AVX_INSTR pcmpeqb, 0, 0, 1 +AVX_INSTR pcmpeqw, 0, 0, 1 +AVX_INSTR pcmpeqd, 0, 0, 1 +AVX_INSTR pcmpeqq, 0, 0, 1 +AVX_INSTR pcmpgtb, 0, 0, 0 +AVX_INSTR pcmpgtw, 0, 0, 0 +AVX_INSTR pcmpgtd, 0, 0, 0 +AVX_INSTR pcmpgtq, 0, 0, 0 +AVX_INSTR phaddw, 0, 0, 0 +AVX_INSTR phaddd, 0, 0, 0 +AVX_INSTR phaddsw, 0, 0, 0 +AVX_INSTR phsubw, 0, 0, 0 +AVX_INSTR phsubd, 0, 0, 0 +AVX_INSTR phsubsw, 0, 0, 0 +AVX_INSTR pmaddwd, 0, 0, 1 +AVX_INSTR pmaddubsw, 0, 0, 0 +AVX_INSTR pmaxsb, 0, 0, 1 +AVX_INSTR pmaxsw, 0, 0, 1 +AVX_INSTR pmaxsd, 0, 0, 1 +AVX_INSTR pmaxub, 0, 0, 1 +AVX_INSTR pmaxuw, 0, 0, 1 +AVX_INSTR pmaxud, 0, 0, 1 +AVX_INSTR pminsb, 0, 0, 1 +AVX_INSTR 
pminsw, 0, 0, 1 +AVX_INSTR pminsd, 0, 0, 1 +AVX_INSTR pminub, 0, 0, 1 +AVX_INSTR pminuw, 0, 0, 1 +AVX_INSTR pminud, 0, 0, 1 +AVX_INSTR pmovmskb, 0, 0, 0 +AVX_INSTR pmulhuw, 0, 0, 1 +AVX_INSTR pmulhrsw, 0, 0, 1 +AVX_INSTR pmulhw, 0, 0, 1 +AVX_INSTR pmullw, 0, 0, 1 +AVX_INSTR pmulld, 0, 0, 1 +AVX_INSTR pmuludq, 0, 0, 1 +AVX_INSTR pmuldq, 0, 0, 1 +AVX_INSTR por, 0, 0, 1 +AVX_INSTR psadbw, 0, 0, 1 +AVX_INSTR pshufb, 0, 0, 0 +AVX_INSTR pshufd, 0, 1, 0 +AVX_INSTR pshufhw, 0, 1, 0 +AVX_INSTR pshuflw, 0, 1, 0 +AVX_INSTR psignb, 0, 0, 0 +AVX_INSTR psignw, 0, 0, 0 +AVX_INSTR psignd, 0, 0, 0 +AVX_INSTR psllw, 0, 0, 0 +AVX_INSTR pslld, 0, 0, 0 +AVX_INSTR psllq, 0, 0, 0 +AVX_INSTR pslldq, 0, 0, 0 +AVX_INSTR psraw, 0, 0, 0 +AVX_INSTR psrad, 0, 0, 0 +AVX_INSTR psrlw, 0, 0, 0 +AVX_INSTR psrld, 0, 0, 0 +AVX_INSTR psrlq, 0, 0, 0 +AVX_INSTR psrldq, 0, 0, 0 +AVX_INSTR psubb, 0, 0, 0 +AVX_INSTR psubw, 0, 0, 0 +AVX_INSTR psubd, 0, 0, 0 +AVX_INSTR psubq, 0, 0, 0 +AVX_INSTR psubsb, 0, 0, 0 +AVX_INSTR psubsw, 0, 0, 0 +AVX_INSTR psubusb, 0, 0, 0 +AVX_INSTR psubusw, 0, 0, 0 +AVX_INSTR ptest, 0, 0, 0 +AVX_INSTR punpckhbw, 0, 0, 0 +AVX_INSTR punpckhwd, 0, 0, 0 +AVX_INSTR punpckhdq, 0, 0, 0 +AVX_INSTR punpckhqdq, 0, 0, 0 +AVX_INSTR punpcklbw, 0, 0, 0 +AVX_INSTR punpcklwd, 0, 0, 0 +AVX_INSTR punpckldq, 0, 0, 0 +AVX_INSTR punpcklqdq, 0, 0, 0 +AVX_INSTR pxor, 0, 0, 1 +AVX_INSTR shufps, 1, 1, 0 +AVX_INSTR subpd, 1, 0, 0 +AVX_INSTR subps, 1, 0, 0 +AVX_INSTR subsd, 1, 0, 0 +AVX_INSTR subss, 1, 0, 0 +AVX_INSTR unpckhpd, 1, 0, 0 +AVX_INSTR unpckhps, 1, 0, 0 +AVX_INSTR unpcklpd, 1, 0, 0 +AVX_INSTR unpcklps, 1, 0, 0 +AVX_INSTR xorpd, 1, 0, 1 +AVX_INSTR xorps, 1, 0, 1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 1, 0, 1 +AVX_INSTR pfsub, 1, 0, 0 +AVX_INSTR pfmul, 1, 0, 1 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif +%assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %else + %6 %1, %2, %3 + %7 %1, %4 + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsdd, pmulld, paddd +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmadcswd, pmaddwd, paddd + +; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf. +; This lets us use tzcnt without bumping the yasm version requirement yet. +%define tzcnt rep bsf diff --git a/libs/libaom/src/third_party/vector/LICENSE b/libs/libaom/src/third_party/vector/LICENSE new file mode 100644 index 000000000..afcb9f00a --- /dev/null +++ b/libs/libaom/src/third_party/vector/LICENSE @@ -0,0 +1,19 @@ +The MIT License (MIT) +Copyright (c) 2016 Peter Goldsborough + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/libs/libaom/src/third_party/vector/README.libaom b/libs/libaom/src/third_party/vector/README.libaom new file mode 100644 index 000000000..729446dbc --- /dev/null +++ b/libs/libaom/src/third_party/vector/README.libaom @@ -0,0 +1,16 @@ +Name: vector +URL: https://github.com/goldsborough/vector +Version: commit-id: 40efe82 +License: MIT +License File: LICENSE + +Description: +A feature-complete, generic and customizable resizable +array implementation in pure C that supports almost +the entire C++ std::vector API, including iterators. + +Local Modifications: +1. Renamed some functions to fit in with the AOMedia +naming convention. +2. Removed non-global functions from vector.h. +3. Made all non-global functions in vector.c static. diff --git a/libs/libaom/src/third_party/vector/vector.c b/libs/libaom/src/third_party/vector/vector.c new file mode 100644 index 000000000..4b8b9c6fd --- /dev/null +++ b/libs/libaom/src/third_party/vector/vector.c @@ -0,0 +1,540 @@ +/* +The MIT License (MIT) +Copyright (c) 2016 Peter Goldsborough + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#define __STDC_WANT_LIB_EXT1__ 1 + +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +#include "third_party/vector/vector.h" + +/***** PRIVATE *****/ +#define MAX(a, b) ((a) > (b) ?
(a) : (b)) + +static bool _vector_should_grow(Vector *vector) { + assert(vector->size <= vector->capacity); + return vector->size == vector->capacity; +} + +static bool _vector_should_shrink(Vector *vector) { + assert(vector->size <= vector->capacity); + return vector->size == vector->capacity * VECTOR_SHRINK_THRESHOLD; +} + +static void *_vector_offset(Vector *vector, size_t index) { + // return vector->data + (index * vector->element_size); + return (unsigned char *)vector->data + (index * vector->element_size); +} + +static const void *_vector_const_offset(const Vector *vector, size_t index) { + // return vector->data + (index * vector->element_size); + return (unsigned char *)vector->data + (index * vector->element_size); +} + +static void _vector_assign(Vector *vector, size_t index, void *element) { + /* Insert the element */ + void *offset = _vector_offset(vector, index); + memcpy(offset, element, vector->element_size); +} + +static int _vector_move_right(Vector *vector, size_t index) { + assert(vector->size < vector->capacity); + + /* The location where to start to move from. */ + void *offset = _vector_offset(vector, index); + + /* How many to move to the right. */ + size_t elements_in_bytes = (vector->size - index) * vector->element_size; + +#ifdef __STDC_LIB_EXT1__ + size_t right_capacity_in_bytes = + (vector->capacity - (index + 1)) * vector->element_size; + + /* clang-format off */ + int return_code = memmove_s( + offset + vector->element_size, + right_capacity_in_bytes, + offset, + elements_in_bytes); + + /* clang-format on */ + + return return_code == 0 ? VECTOR_SUCCESS : VECTOR_ERROR; + +#else + // memmove(offset + vector->element_size, offset, elements_in_bytes); + memmove((unsigned char *)offset + vector->element_size, offset, + elements_in_bytes); + return VECTOR_SUCCESS; +#endif +} + +static void _vector_move_left(Vector *vector, size_t index) { + size_t right_elements_in_bytes; + void *offset; + + /* The offset into the memory */ + offset = _vector_offset(vector, index); + + /* How many to move to the left */ + right_elements_in_bytes = (vector->size - index - 1) * vector->element_size; + + // memmove(offset, offset + vector->element_size, right_elements_in_bytes); + memmove(offset, (unsigned char *)offset + vector->element_size, + right_elements_in_bytes); +} + +static int _vector_reallocate(Vector *vector, size_t new_capacity) { + size_t new_capacity_in_bytes; + void *old; + assert(vector != NULL); + + if (new_capacity < VECTOR_MINIMUM_CAPACITY) { + if (vector->capacity > VECTOR_MINIMUM_CAPACITY) { + new_capacity = VECTOR_MINIMUM_CAPACITY; + } else { + /* NO-OP */ + return VECTOR_SUCCESS; + } + } + + new_capacity_in_bytes = new_capacity * vector->element_size; + old = vector->data; + + if ((vector->data = malloc(new_capacity_in_bytes)) == NULL) { + return VECTOR_ERROR; + } + +#ifdef __STDC_LIB_EXT1__ + /* clang-format off */ + if (memcpy_s(vector->data, + new_capacity_in_bytes, + old, + aom_vector_byte_size(vector)) != 0) { + return VECTOR_ERROR; + } +/* clang-format on */ +#else + memcpy(vector->data, old, aom_vector_byte_size(vector)); +#endif + + vector->capacity = new_capacity; + + free(old); + + return VECTOR_SUCCESS; +} + +static int _vector_adjust_capacity(Vector *vector) { + return _vector_reallocate(vector, + MAX(1, vector->size * VECTOR_GROWTH_FACTOR)); +} + +static void _vector_swap(size_t *first, size_t *second) { + size_t temp = *first; + *first = *second; + *second = temp; +} + +int aom_vector_setup(Vector *vector, size_t capacity, size_t element_size) { 
+ assert(vector != NULL); + + if (vector == NULL) return VECTOR_ERROR; + + vector->size = 0; + vector->capacity = MAX(VECTOR_MINIMUM_CAPACITY, capacity); + vector->element_size = element_size; + vector->data = malloc(vector->capacity * element_size); + + return vector->data == NULL ? VECTOR_ERROR : VECTOR_SUCCESS; +} + +int aom_vector_copy(Vector *destination, Vector *source) { + assert(destination != NULL); + assert(source != NULL); + assert(aom_vector_is_initialized(source)); + assert(!aom_vector_is_initialized(destination)); + + if (destination == NULL) return VECTOR_ERROR; + if (source == NULL) return VECTOR_ERROR; + if (aom_vector_is_initialized(destination)) return VECTOR_ERROR; + if (!aom_vector_is_initialized(source)) return VECTOR_ERROR; + + /* Copy ALL the data */ + destination->size = source->size; + destination->capacity = source->size * 2; + destination->element_size = source->element_size; + + /* Note that we are not necessarily allocating the same capacity */ + destination->data = malloc(destination->capacity * source->element_size); + if (destination->data == NULL) return VECTOR_ERROR; + + memcpy(destination->data, source->data, aom_vector_byte_size(source)); + + return VECTOR_SUCCESS; +} + +int aom_vector_copy_assign(Vector *destination, Vector *source) { + assert(destination != NULL); + assert(source != NULL); + assert(aom_vector_is_initialized(source)); + assert(aom_vector_is_initialized(destination)); + + if (destination == NULL) return VECTOR_ERROR; + if (source == NULL) return VECTOR_ERROR; + if (!aom_vector_is_initialized(destination)) return VECTOR_ERROR; + if (!aom_vector_is_initialized(source)) return VECTOR_ERROR; + + aom_vector_destroy(destination); + + return aom_vector_copy(destination, source); +} + +int aom_vector_move(Vector *destination, Vector *source) { + assert(destination != NULL); + assert(source != NULL); + + if (destination == NULL) return VECTOR_ERROR; + if (source == NULL) return VECTOR_ERROR; + + *destination = *source; + source->data = NULL; + + return VECTOR_SUCCESS; +} + +int aom_vector_move_assign(Vector *destination, Vector *source) { + aom_vector_swap(destination, source); + return aom_vector_destroy(source); +} + +int aom_vector_swap(Vector *destination, Vector *source) { + void *temp; + + assert(destination != NULL); + assert(source != NULL); + assert(aom_vector_is_initialized(source)); + assert(aom_vector_is_initialized(destination)); + + if (destination == NULL) return VECTOR_ERROR; + if (source == NULL) return VECTOR_ERROR; + if (!aom_vector_is_initialized(destination)) return VECTOR_ERROR; + if (!aom_vector_is_initialized(source)) return VECTOR_ERROR; + + _vector_swap(&destination->size, &source->size); + _vector_swap(&destination->capacity, &source->capacity); + _vector_swap(&destination->element_size, &source->element_size); + + temp = destination->data; + destination->data = source->data; + source->data = temp; + + return VECTOR_SUCCESS; +} + +int aom_vector_destroy(Vector *vector) { + assert(vector != NULL); + + if (vector == NULL) return VECTOR_ERROR; + + free(vector->data); + vector->data = NULL; + + return VECTOR_SUCCESS; +} + +/* Insertion */ +int aom_vector_push_back(Vector *vector, void *element) { + assert(vector != NULL); + assert(element != NULL); + + if (_vector_should_grow(vector)) { + if (_vector_adjust_capacity(vector) == VECTOR_ERROR) { + return VECTOR_ERROR; + } + } + + _vector_assign(vector, vector->size, element); + + ++vector->size; + + return VECTOR_SUCCESS; +} + +int aom_vector_push_front(Vector *vector, void 
*element) { + return aom_vector_insert(vector, 0, element); +} + +int aom_vector_insert(Vector *vector, size_t index, void *element) { + void *offset; + + assert(vector != NULL); + assert(element != NULL); + assert(index <= vector->size); + + if (vector == NULL) return VECTOR_ERROR; + if (element == NULL) return VECTOR_ERROR; + if (vector->element_size == 0) return VECTOR_ERROR; + if (index > vector->size) return VECTOR_ERROR; + + if (_vector_should_grow(vector)) { + if (_vector_adjust_capacity(vector) == VECTOR_ERROR) { + return VECTOR_ERROR; + } + } + + /* Move other elements to the right */ + if (_vector_move_right(vector, index) == VECTOR_ERROR) { + return VECTOR_ERROR; + } + + /* Insert the element */ + offset = _vector_offset(vector, index); + memcpy(offset, element, vector->element_size); + ++vector->size; + + return VECTOR_SUCCESS; +} + +int aom_vector_assign(Vector *vector, size_t index, void *element) { + assert(vector != NULL); + assert(element != NULL); + assert(index < vector->size); + + if (vector == NULL) return VECTOR_ERROR; + if (element == NULL) return VECTOR_ERROR; + if (vector->element_size == 0) return VECTOR_ERROR; + if (index >= vector->size) return VECTOR_ERROR; + + _vector_assign(vector, index, element); + + return VECTOR_SUCCESS; +} + +/* Deletion */ +int aom_vector_pop_back(Vector *vector) { + assert(vector != NULL); + assert(vector->size > 0); + + if (vector == NULL) return VECTOR_ERROR; + if (vector->element_size == 0) return VECTOR_ERROR; + + --vector->size; + +#ifndef VECTOR_NO_SHRINK + if (_vector_should_shrink(vector)) { + _vector_adjust_capacity(vector); + } +#endif + + return VECTOR_SUCCESS; +} + +int aom_vector_pop_front(Vector *vector) { return aom_vector_erase(vector, 0); } + +int aom_vector_erase(Vector *vector, size_t index) { + assert(vector != NULL); + assert(index < vector->size); + + if (vector == NULL) return VECTOR_ERROR; + if (vector->element_size == 0) return VECTOR_ERROR; + if (index >= vector->size) return VECTOR_ERROR; + + /* Just overwrite */ + _vector_move_left(vector, index); + +#ifndef VECTOR_NO_SHRINK + if (--vector->size == vector->capacity / 4) { + _vector_adjust_capacity(vector); + } +#endif + + return VECTOR_SUCCESS; +} + +int aom_vector_clear(Vector *vector) { return aom_vector_resize(vector, 0); } + +/* Lookup */ +void *aom_vector_get(Vector *vector, size_t index) { + assert(vector != NULL); + assert(index < vector->size); + + if (vector == NULL) return NULL; + if (vector->element_size == 0) return NULL; + if (index >= vector->size) return NULL; + + return _vector_offset(vector, index); +} + +const void *aom_vector_const_get(const Vector *vector, size_t index) { + assert(vector != NULL); + assert(index < vector->size); + + if (vector == NULL) return NULL; + if (vector->element_size == 0) return NULL; + if (index >= vector->size) return NULL; + + return _vector_const_offset(vector, index); +} + +void *aom_vector_front(Vector *vector) { return aom_vector_get(vector, 0); } + +void *aom_vector_back(Vector *vector) { + return aom_vector_get(vector, vector->size - 1); +} + +/* Information */ + +bool aom_vector_is_initialized(const Vector *vector) { + return vector->data != NULL; +} + +size_t aom_vector_byte_size(const Vector *vector) { + return vector->size * vector->element_size; +} + +size_t aom_vector_free_space(const Vector *vector) { + return vector->capacity - vector->size; +} + +bool aom_vector_is_empty(const Vector *vector) { return vector->size == 0; } + +/* Memory management */ +int aom_vector_resize(Vector *vector, size_t 
new_size) { + if (new_size <= vector->capacity * VECTOR_SHRINK_THRESHOLD) { + vector->size = new_size; + if (_vector_reallocate(vector, new_size * VECTOR_GROWTH_FACTOR) == -1) { + return VECTOR_ERROR; + } + } else if (new_size > vector->capacity) { + if (_vector_reallocate(vector, new_size * VECTOR_GROWTH_FACTOR) == -1) { + return VECTOR_ERROR; + } + } + + vector->size = new_size; + + return VECTOR_SUCCESS; +} + +int aom_vector_reserve(Vector *vector, size_t minimum_capacity) { + if (minimum_capacity > vector->capacity) { + if (_vector_reallocate(vector, minimum_capacity) == VECTOR_ERROR) { + return VECTOR_ERROR; + } + } + + return VECTOR_SUCCESS; +} + +int aom_vector_shrink_to_fit(Vector *vector) { + return _vector_reallocate(vector, vector->size); +} + +/* Iterators */ +Iterator aom_vector_begin(Vector *vector) { return aom_vector_iterator(vector, 0); } + +Iterator aom_vector_end(Vector *vector) { + return aom_vector_iterator(vector, vector->size); +} + +Iterator aom_vector_iterator(Vector *vector, size_t index) { + Iterator iterator = { NULL, 0 }; + + assert(vector != NULL); + assert(index <= vector->size); + + if (vector == NULL) return iterator; + if (index > vector->size) return iterator; + if (vector->element_size == 0) return iterator; + + iterator.pointer = _vector_offset(vector, index); + iterator.element_size = vector->element_size; + + return iterator; +} + +void *aom_iterator_get(Iterator *iterator) { return iterator->pointer; } + +int aom_iterator_erase(Vector *vector, Iterator *iterator) { + size_t index = aom_iterator_index(vector, iterator); + + if (aom_vector_erase(vector, index) == VECTOR_ERROR) { + return VECTOR_ERROR; + } + + *iterator = aom_vector_iterator(vector, index); + + return VECTOR_SUCCESS; +} + +void aom_iterator_increment(Iterator *iterator) { + assert(iterator != NULL); + // iterator->pointer += iterator->element_size; + iterator->pointer = + (unsigned char *)iterator->pointer + iterator->element_size; +} + +void aom_iterator_decrement(Iterator *iterator) { + assert(iterator != NULL); + // iterator->pointer -= iterator->element_size; + iterator->pointer = + (unsigned char *)iterator->pointer - iterator->element_size; +} + +void *aom_iterator_next(Iterator *iterator) { + void *current = iterator->pointer; + aom_iterator_increment(iterator); + + return current; +} + +void *aom_iterator_previous(Iterator *iterator) { + void *current = iterator->pointer; + aom_iterator_decrement(iterator); + + return current; +} + +bool aom_iterator_equals(Iterator *first, Iterator *second) { + assert(first->element_size == second->element_size); + return first->pointer == second->pointer; +} + +bool aom_iterator_is_before(Iterator *first, Iterator *second) { + assert(first->element_size == second->element_size); + return first->pointer < second->pointer; +} + +bool aom_iterator_is_after(Iterator *first, Iterator *second) { + assert(first->element_size == second->element_size); + return first->pointer > second->pointer; +} + +size_t aom_iterator_index(Vector *vector, Iterator *iterator) { + assert(vector != NULL); + assert(iterator != NULL); + // return (iterator->pointer - vector->data) / vector->element_size; + return ((unsigned char *)iterator->pointer - (unsigned char *)vector->data) / + vector->element_size; +} diff --git a/libs/libaom/src/third_party/vector/vector.h b/libs/libaom/src/third_party/vector/vector.h new file mode 100644 index 000000000..d09eb64c9 --- /dev/null +++ b/libs/libaom/src/third_party/vector/vector.h @@ -0,0 +1,138 @@ +/* +The MIT License (MIT)
+Copyright (c) 2016 Peter Goldsborough + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#ifndef VECTOR_H +#define VECTOR_H + +#include <stdbool.h> +#include <stddef.h> + +/***** DEFINITIONS *****/ + +#define VECTOR_MINIMUM_CAPACITY 2 +#define VECTOR_GROWTH_FACTOR 2 +/* Note: (1 / 4) is integer division, i.e. 0, so the macro-based shrink checks in vector.c only fire at size zero. */ +#define VECTOR_SHRINK_THRESHOLD (1 / 4) + +#define VECTOR_ERROR -1 +#define VECTOR_SUCCESS 0 + +#define VECTOR_UNINITIALIZED NULL +#define VECTOR_INITIALIZER \ + { 0, 0, 0, VECTOR_UNINITIALIZED } + +/***** STRUCTURES *****/ + +typedef struct Vector { + size_t size; + size_t capacity; + size_t element_size; + + void *data; +} Vector; + +typedef struct Iterator { + void *pointer; + size_t element_size; +} Iterator; + +/***** METHODS *****/ + +/* Constructor */ +int aom_vector_setup(Vector *vector, size_t capacity, size_t element_size); + +/* Copy Constructor */ +int aom_vector_copy(Vector *destination, Vector *source); + +/* Copy Assignment */ +int aom_vector_copy_assign(Vector *destination, Vector *source); + +/* Move Constructor */ +int aom_vector_move(Vector *destination, Vector *source); + +/* Move Assignment */ +int aom_vector_move_assign(Vector *destination, Vector *source); + +int aom_vector_swap(Vector *destination, Vector *source); + +/* Destructor */ +int aom_vector_destroy(Vector *vector); + +/* Insertion */ +int aom_vector_push_back(Vector *vector, void *element); +int aom_vector_push_front(Vector *vector, void *element); +int aom_vector_insert(Vector *vector, size_t index, void *element); +int aom_vector_assign(Vector *vector, size_t index, void *element); + +/* Deletion */ +int aom_vector_pop_back(Vector *vector); +int aom_vector_pop_front(Vector *vector); +int aom_vector_erase(Vector *vector, size_t index); +int aom_vector_clear(Vector *vector); + +/* Lookup */ +void *aom_vector_get(Vector *vector, size_t index); +const void *aom_vector_const_get(const Vector *vector, size_t index); +void *aom_vector_front(Vector *vector); +void *aom_vector_back(Vector *vector); +#define VECTOR_GET_AS(type, aom_vector_pointer, index) \ + *((type *)aom_vector_get((aom_vector_pointer), (index))) + +/* Information */ +bool aom_vector_is_initialized(const Vector *vector); +size_t aom_vector_byte_size(const Vector *vector); +size_t aom_vector_free_space(const Vector *vector); +bool aom_vector_is_empty(const Vector *vector); + +/* Memory management */ +int aom_vector_resize(Vector *vector, size_t new_size); +int aom_vector_reserve(Vector *vector, size_t minimum_capacity); +int aom_vector_shrink_to_fit(Vector *vector); + +/* Iterators */ +Iterator aom_vector_begin(Vector *vector);
+Iterator aom_vector_end(Vector *vector); +Iterator aom_vector_iterator(Vector *vector, size_t index); + +void *aom_iterator_get(Iterator *iterator); +#define ITERATOR_GET_AS(type, iterator) *((type *)aom_iterator_get((iterator))) + +int aom_iterator_erase(Vector *vector, Iterator *iterator); + +void aom_iterator_increment(Iterator *iterator); +void aom_iterator_decrement(Iterator *iterator); + +void *aom_iterator_next(Iterator *iterator); +void *aom_iterator_previous(Iterator *iterator); + +bool aom_iterator_equals(Iterator *first, Iterator *second); +bool aom_iterator_is_before(Iterator *first, Iterator *second); +bool aom_iterator_is_after(Iterator *first, Iterator *second); + +size_t aom_iterator_index(Vector *vector, Iterator *iterator); + +#define VECTOR_FOR_EACH(aom_vector_pointer, iterator_name) \ + for (Iterator(iterator_name) = aom_vector_begin((aom_vector_pointer)), \ + end = aom_vector_end((aom_vector_pointer)); \ + !aom_iterator_equals(&(iterator_name), &end); \ + aom_iterator_increment(&(iterator_name))) + +#endif /* VECTOR_H */ diff --git a/libs/libaom/src/third_party/x86inc/LICENSE b/libs/libaom/src/third_party/x86inc/LICENSE new file mode 100644 index 000000000..7d07645a1 --- /dev/null +++ b/libs/libaom/src/third_party/x86inc/LICENSE @@ -0,0 +1,18 @@ +Copyright (C) 2005-2012 x264 project + +Authors: Loren Merritt + Anton Mitrofanov + Jason Garrett-Glaser + Henrik Gramner + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/libs/libaom/src/third_party/x86inc/README.libaom b/libs/libaom/src/third_party/x86inc/README.libaom new file mode 100644 index 000000000..07c4dad20 --- /dev/null +++ b/libs/libaom/src/third_party/x86inc/README.libaom @@ -0,0 +1,20 @@ +URL: https://git.videolan.org/git/x264.git +Version: d23d18655249944c1ca894b451e2c82c7a584c62 +License: ISC +License File: LICENSE + +Description: +x264/libav's framework for x86 assembly. Contains a variety of macros and +defines that help automatically allow assembly to work cross-platform. + +Local Modifications: +Get configuration from aom_config.asm. +Prefix functions with aom by default. +Manage name mangling (prefixing with '_') manually because 'PREFIX' does not + exist in libaom. +Expand PIC default to macho64 and respect CONFIG_PIC from libaom +Set 'private_extern' visibility for macho targets. +Copy PIC 'GLOBAL' macros from x86_abi_support.asm +Use .text instead of .rodata on macho to avoid broken tables in PIC mode. 
+Use .text with no alignment for aout +Only use 'hidden' visibility with Chromium diff --git a/libs/libaom/src/third_party/x86inc/x86inc.asm b/libs/libaom/src/third_party/x86inc/x86inc.asm new file mode 100644 index 000000000..adaf2d99e --- /dev/null +++ b/libs/libaom/src/third_party/x86inc/x86inc.asm @@ -0,0 +1,1649 @@ +;***************************************************************************** +;* x86inc.asm: x264asm abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2016 x264 project +;* +;* Authors: Loren Merritt +;* Anton Mitrofanov +;* Fiona Glaser +;* Henrik Gramner +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. Send patches or ideas +; to x264-devel@videolan.org . + +%include "config/aom_config.asm" + +%ifndef private_prefix + %define private_prefix aom +%endif + +%ifndef public_prefix + %define public_prefix private_prefix +%endif + +%ifndef STACK_ALIGNMENT + %if ARCH_X86_64 + %define STACK_ALIGNMENT 16 + %else + %define STACK_ALIGNMENT 4 + %endif +%endif + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,x64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%define FORMAT_ELF 0 +%ifidn __OUTPUT_FORMAT__,elf + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf32 + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf64 + %define FORMAT_ELF 1 +%endif + +%define FORMAT_MACHO 0 +%ifidn __OUTPUT_FORMAT__,macho32 + %define FORMAT_MACHO 1 +%elifidn __OUTPUT_FORMAT__,macho64 + %define FORMAT_MACHO 1 +%endif + +; Set PREFIX for libaom builds. +%if FORMAT_ELF + %undef PREFIX +%elif WIN64 + %undef PREFIX +%else + %define PREFIX +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +; In some instances macho32 tables get misaligned when using .rodata. +; When looking at the disassembly it appears that the offset is either +; correct or consistently off by 90. 
Placing them in the .text section +; works around the issue. It appears to be specific to the way libaom +; handles the tables. +%macro SECTION_RODATA 0-1 16 + %ifidn __OUTPUT_FORMAT__,macho32 + SECTION .text align=%1 + fakegot: + %elifidn __OUTPUT_FORMAT__,aout + SECTION .text + %else + SECTION .rodata align=%1 + %endif +%endmacro + +; PIC macros are copied from aom_ports/x86_abi_support.asm. The "define PIC" +; from original code is added in for 64bit. +%ifidn __OUTPUT_FORMAT__,elf32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,macho32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,win32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,aout +%define ABI_IS_32BIT 1 +%else +%define ABI_IS_32BIT 0 +%endif + +%if ABI_IS_32BIT + %if CONFIG_PIC=1 + %ifidn __OUTPUT_FORMAT__,elf32 + %define GET_GOT_DEFINED 1 + %define WRT_PLT wrt ..plt + %macro GET_GOT 1 + extern _GLOBAL_OFFSET_TABLE_ + push %1 + call %%get_got + %%sub_offset: + jmp %%exitGG + %%get_got: + mov %1, [esp] + add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc + ret + %%exitGG: + %undef GLOBAL + %define GLOBAL(x) x + %1 wrt ..gotoff + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %elifidn __OUTPUT_FORMAT__,macho32 + %define GET_GOT_DEFINED 1 + %macro GET_GOT 1 + push %1 + call %%get_got + %%get_got: + pop %1 + %undef GLOBAL + %define GLOBAL(x) x + %1 - %%get_got + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %else + %define GET_GOT_DEFINED 0 + %endif + %endif + + %if ARCH_X86_64 == 0 + %undef PIC + %endif + +%else + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) rel x + %define WRT_PLT wrt ..plt + + %if WIN64 + %define PIC + %elifidn __OUTPUT_FORMAT__,macho64 + %define PIC + %elif CONFIG_PIC + %define PIC + %endif +%endif + +%ifnmacro GET_GOT + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) x +%endif +%ifndef RESTORE_GOT + %define RESTORE_GOT +%endif +%ifndef WRT_PLT + %define WRT_PLT +%endif + +%ifdef PIC + default rel +%endif + +%ifndef GET_GOT_DEFINED + %define GET_GOT_DEFINED 0 +%endif +; Done with PIC macros + +%ifdef __NASM_VER__ + %use smartalign +%endif + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = (optional) stack size to be allocated. The stack will be aligned before +; allocating the specified stack size. If the required stack alignment is +; larger than the known stack alignment the stack will be manually aligned +; and an extra register will be allocated to hold the original stack +; pointer (to not invalidate r0m etc.). To prevent the use of an extra +; register as stack pointer, request a negative stack size. +; %4+/%5+ = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,7,0x40, dst, src, tmp +; declares a function (foo) that automatically loads two arguments (dst and +; src) into registers, uses one additional register (tmp) plus 7 vector +; registers (m0-m6) and allocates 0x40 bytes of stack space. + +; TODO Some functions can use some args directly from the stack. 
If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Use this instead of RET if it's a branch target. + +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %define %2q %2 + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rstk + stack_offset + %3] + %define r%1mp qword r %+ %1 %+ m + %else + %define r%1m [rstk + stack_offset + %3] + %define r%1mp dword r %+ %1 %+ m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 + %if ARCH_X86_64 == 0 + %define r%1 e%1 + %endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro PUSH 1 + push %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset+gprsize + %endif +%endmacro + +%macro POP 1 + pop %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset-gprsize + %endif +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assertion ``%1'' failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 
+ %xdefine %1q r %+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + +%define required_stack_alignment ((mmsize + 15) & ~15) + +%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) + %ifnum %1 + %if %1 != 0 + %assign %%pad 0 + %assign stack_size %1 + %if stack_size < 0 + %assign stack_size -stack_size + %endif + %if WIN64 + %assign %%pad %%pad + 32 ; shadow space + %if mmsize != 8 + %assign xmm_regs_used %2 + %if xmm_regs_used > 8 + %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers + %endif + %endif + %endif + %if required_stack_alignment <= STACK_ALIGNMENT + ; maintain the current stack alignment + %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %else + %assign %%reg_num (regs_used - 1) + %xdefine rstk r %+ %%reg_num + ; align stack, and save original stack location directly above + ; it, i.e. in [rsp+stack_size_padded], so we can restore the + ; stack in a single instruction (i.e. mov rsp, rstk or mov + ; rsp, [rsp+stack_size_padded]) + %if %1 < 0 ; need to store rsp on stack + %xdefine rstkm [rsp + stack_size + %%pad] + %assign %%pad %%pad + gprsize + %else ; can keep rsp in rstk during whole function + %xdefine rstkm rstk + %endif + %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) + mov rstk, rsp + and rsp, ~(required_stack_alignment-1) + sub rsp, stack_size_padded + movifnidn rstkm, rstk + %endif + WIN64_PUSH_XMM + %endif + %endif +%endmacro + +%macro SETUP_STACK_POINTER 1 + %ifnum %1 + %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT + %if %1 > 0 + %assign regs_used (regs_used + 1) + %endif + %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 + ; Ensure that we don't clobber any registers containing arguments + %assign regs_used 5 + UNIX64 * 3 + %endif + %endif + %endif +%endmacro + +%macro DEFINE_ARGS_INTERNAL 3+ + %ifnum %2 + DEFINE_ARGS %3 + %elif %1 == 4 + DEFINE_ARGS %2 + %elif %1 > 4 + DEFINE_ARGS %2, %3 + %endif +%endmacro + +%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R12, 96 +DECLARE_REG 12, R13, 104 +DECLARE_REG 13, R14, 112 +DECLARE_REG 14, R15, 120 + +%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4, %3 + %if mmsize != 8 && stack_size == 0 + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%macro WIN64_PUSH_XMM 0 + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. 
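+ ; Layout implemented below: xmm6 is saved in the shadow space at + ; [rstk + stack_offset + 8] and xmm7 at [rstk + stack_offset + 24], while + ; xmm8 and up go to 16-byte slots starting at [rsp + stack_size + 32].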
+ %if xmm_regs_used > 6 + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if xmm_regs_used > 7 + movaps [rstk + stack_offset + 24], xmm7 + %endif + %if xmm_regs_used > 8 + %assign %%i 8 + %rep xmm_regs_used-8 + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + %if xmm_regs_used > 8 + ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. + %assign %%pad (xmm_regs_used-8)*16 + 32 + %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %endif + WIN64_PUSH_XMM +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 1 + %assign %%pad_size 0 + %if xmm_regs_used > 8 + %assign %%i xmm_regs_used + %rep xmm_regs_used-8 + %assign %%i %%i-1 + movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] + %endrep + %endif + %if stack_size_padded > 0 + %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add %1, stack_size_padded + %assign %%pad_size stack_size_padded + %endif + %endif + %if xmm_regs_used > 7 + movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + %endif + %if xmm_regs_used > 6 + movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 1 + WIN64_RESTORE_XMM_INTERNAL %1 + %assign stack_offset (stack_offset-stack_size_padded) + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL rsp + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 + %if mmsize == 32 + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%elif ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R12, 48 +DECLARE_REG 12, R13, 56 +DECLARE_REG 13, R14, 64 +DECLARE_REG 14, R15, 72 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 14, 13, 12, 11, 10, 9 + %if mmsize == 32 + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [rstk + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + %if num_args > 7 + %assign num_args 7 + %endif + %if regs_used > 7 + %assign regs_used 7 + %endif + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 7 + PUSH_IF_USED 3, 4, 5, 6 + ALLOC_STACK %4 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 6, 5, 4, 3 + %if mmsize == 32 + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 + %macro WIN64_SPILL_XMM 1 + %endmacro + %macro WIN64_RESTORE_XMM 1 + %endmacro + %macro WIN64_PUSH_XMM 0 + %endmacro +%endif + +; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either +; a branch or a branch target. So switch to a 2-byte form of ret in that case. +; We can automatically detect "follows a branch", but not a branch target. +; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) +%macro REP_RET 0 + %if has_epilogue + RET + %else + rep ret + %endif + annotate_function_size +%endmacro + +%define last_branch_adr $$ +%macro AUTO_REP_RET 0 + %if notcpuflag(ssse3) + times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr. + %endif + ret + annotate_function_size +%endmacro + +%macro BRANCH_INSTR 0-* + %rep %0 + %macro %1 1-2 %1 + %2 %1 + %if notcpuflag(ssse3) + %%branch_instr equ $ + %xdefine last_branch_adr %%branch_instr + %endif + %endmacro + %rotate 1 + %endrep +%endmacro + +BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp + +%macro TAIL_CALL 2 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif + annotate_function_size +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. +; Appends cpuflags to the function name if cpuflags has been specified. +; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX +; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). +%macro cglobal 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 1, %1 %+ SUFFIX, %2 +%endmacro +%macro cvisible 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 0, %1 %+ SUFFIX, %2 +%endmacro +%macro cglobal_internal 2-3+ + annotate_function_size + %if %1 + %xdefine %%FUNCTION_PREFIX private_prefix + ; libaom explicitly sets visibility in shared object builds. Avoid + ; setting visibility to hidden as it may break builds that split + ; sources on e.g., directory boundaries. 
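+ ; (CHROMIUM is a build-time define; per README.libaom above, 'hidden' + ; visibility is only used with Chromium builds.)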
+ %ifdef CHROMIUM + %xdefine %%VISIBILITY hidden + %else + %xdefine %%VISIBILITY + %endif + %else + %xdefine %%FUNCTION_PREFIX public_prefix + %xdefine %%VISIBILITY + %endif + %ifndef cglobaled_%2 + %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) + %xdefine %2.skip_prologue %2 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %2, 1 + %endif + %xdefine current_function %2 + %xdefine current_function_section __SECT__ + %if FORMAT_ELF + global %2:function %%VISIBILITY + %elif FORMAT_MACHO + %ifdef __NASM_VER__ + global %2 + %else + global %2:private_extern + %endif + %else + global %2 + %endif + align function_align + %2: + RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer + %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required + %assign stack_offset 0 ; stack pointer offset relative to the return address + %assign stack_size 0 ; amount of stack space that can be freely used inside a function + %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 + %ifnidn %3, "" + PROLOGUE %3 + %endif +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix +%macro cextern_naked 1 + %ifdef PREFIX + %xdefine %1 mangle(%1) + %endif + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 1-2+ + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + %if FORMAT_ELF + global %1:data hidden + %else + global %1 + %endif + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default. +%if FORMAT_ELF + [SECTION .note.GNU-stack noalloc noexec nowrite progbits] +%endif + +; Tell debuggers how large the function was. +; This may be invoked multiple times per function; we rely on later instances overriding earlier ones. +; This is invoked by RET and similar macros, and also cglobal does it for the previous function, +; but if the last function in a source file doesn't use any of the standard macros for its epilogue, +; then its size might be unspecified. 
+%macro annotate_function_size 0 + %ifdef __YASM_VER__ + %ifdef current_function + %if FORMAT_ELF + current_function_section + %%ecf equ $ + size current_function %%ecf - current_function + __SECT__ + %endif + %endif + %endif +%endmacro + +; cpuflags + +%assign cpuflags_mmx (1<<0) +%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx +%assign cpuflags_3dnow (1<<2) | cpuflags_mmx +%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow +%assign cpuflags_sse (1<<4) | cpuflags_mmx2 +%assign cpuflags_sse2 (1<<5) | cpuflags_sse +%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 +%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 +%assign cpuflags_avx (1<<11)| cpuflags_sse42 +%assign cpuflags_xop (1<<12)| cpuflags_avx +%assign cpuflags_fma4 (1<<13)| cpuflags_avx +%assign cpuflags_fma3 (1<<14)| cpuflags_avx +%assign cpuflags_avx2 (1<<15)| cpuflags_fma3 + +%assign cpuflags_cache32 (1<<16) +%assign cpuflags_cache64 (1<<17) +%assign cpuflags_slowctz (1<<18) +%assign cpuflags_lzcnt (1<<19) +%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<21) +%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 + +; Returns a boolean value expressing whether or not the specified cpuflag is enabled. +%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) +%define notcpuflag(x) (cpuflag(x) ^ 1) + +; Takes an arbitrary number of cpuflags from the above list. +; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. +; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. +%macro INIT_CPUFLAGS 0-* + %xdefine SUFFIX + %undef cpuname + %assign cpuflags 0 + + %if %0 >= 1 + %rep %0 + %ifdef cpuname + %xdefine cpuname cpuname %+ _%1 + %else + %xdefine cpuname %1 + %endif + %assign cpuflags cpuflags | cpuflags_%1 + %rotate 1 + %endrep + %xdefine SUFFIX _ %+ cpuname + + %if cpuflag(avx) + %assign avx_enabled 1 + %endif + %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elif cpuflag(sse3) && notcpuflag(ssse3) + %define movu lddqu + %endif + %endif + + %if ARCH_X86_64 || cpuflag(sse2) + %ifdef __NASM_VER__ + ALIGNMODE k8 + %else + CPU amdnop + %endif + %else + %ifdef __NASM_VER__ + ALIGNMODE nop + %else + CPU basicnop + %endif + %endif +%endmacro + +; Merge mmx and sse* +; m# is a simd register of the currently selected size +; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# +; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# +; (All 3 remain in sync through SWAP.) 
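+; For example (an illustrative sketch, not part of the upstream header): after +; INIT_XMM sse2, m0 names xmm0 and mova expands to movdqa, while after +; INIT_YMM avx2, m0 names ymm0 and the same source emits VEX encodings: +; INIT_XMM sse2 +; mova m0, [r0] ; emits movdqa xmm0, [r0] +; INIT_YMM avx2 +; mova m0, [r0] ; emits vmovdqa ymm0, [r0]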
+ +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro INIT_MMX 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define num_mmregs 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + %assign %%i 0 + %rep 8 + CAT_XDEFINE m, %%i, mm %+ %%i + CAT_XDEFINE nnmm, %%i, %%i + %assign %%i %%i+1 + %endrep + %rep 8 + CAT_UNDEF m, %%i + CAT_UNDEF nnmm, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_XMM 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, xmm %+ %%i + CAT_XDEFINE nnxmm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_YMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, ymm %+ %%i + CAT_XDEFINE nnymm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +INIT_XMM + +%macro DECLARE_MMCAST 1 + %define mmmm%1 mm%1 + %define mmxmm%1 mm%1 + %define mmymm%1 mm%1 + %define xmmmm%1 mm%1 + %define xmmxmm%1 xmm%1 + %define xmmymm%1 xmm%1 + %define ymmmm%1 mm%1 + %define ymmxmm%1 xmm%1 + %define ymmymm%1 ymm%1 + %define xm%1 xmm %+ m%1 + %define ym%1 ymm %+ m%1 +%endmacro + +%assign i 0 +%rep 16 + DECLARE_MMCAST i + %assign i i+1 +%endrep + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap + %rep %0/2 + %xdefine %%tmp%2 m%2 + %rotate 2 + %endrep + %rep %0/2 + %xdefine m%1 %%tmp%2 + CAT_XDEFINE nn, m%1, %1 + %rotate 2 + %endrep +%endmacro + +%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) + %ifnum %1 ; SWAP 0, 1, ... + SWAP_INTERNAL_NUM %1, %2 + %else ; SWAP m0, m1, ... + SWAP_INTERNAL_NAME %1, %2 + %endif +%endmacro + +%macro SWAP_INTERNAL_NUM 2-* + %rep %0-1 + %xdefine %%tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 %%tmp + CAT_XDEFINE nn, m%1, %1 + CAT_XDEFINE nn, m%2, %2 + %rotate 1 + %endrep +%endmacro + +%macro SWAP_INTERNAL_NAME 2-* + %xdefine %%args nn %+ %1 + %rep %0-1 + %xdefine %%args %%args, nn %+ %2 + %rotate 1 + %endrep + SWAP_INTERNAL_NUM %%args +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. 
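+; For example (an illustrative sketch, not from upstream): ending a function with +; SWAP 0, 1 ; the result is now named m0 (physically xmm1) +; SAVE_MM_PERMUTATION +; RET +; records the renaming under the function's name; the `call` macro further down +; then runs LOAD_MM_PERMUTATION, so the caller's m0 maps to the same physical register.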
+%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE %%f, %%i, m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 1 ; name to load from + %ifdef %1_m0 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1_m %+ %%i + CAT_XDEFINE nn, m %+ %%i, %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + call_internal %1 %+ SUFFIX, %1 +%endmacro +%macro call_internal 2 + %xdefine %%i %2 + %ifndef cglobaled_%2 + %ifdef cglobaled_%1 + %xdefine %%i %1 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 16 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 + %assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-avx emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise +;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not +;%6+: operands +%macro RUN_AVX_INSTR 6-9+ + %ifnum sizeof%7 + %assign __sizeofreg sizeof%7 + %elifnum sizeof%6 + %assign __sizeofreg sizeof%6 + %else + %assign __sizeofreg mmsize + %endif + %assign __emulate_avx 0 + %if avx_enabled && __sizeofreg >= 16 + %xdefine __instr v%1 + %else + %xdefine __instr %1 + %if %0 >= 8+%4 + %assign __emulate_avx 1 + %endif + %endif + %ifnidn %2, fnord + %ifdef cpuname + %if notcpuflag(%2) + %error use of ``%1'' %2 instruction in cpuname function: current_function + %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 + %error use of ``%1'' sse2 instruction in cpuname function: current_function + %endif + %endif + %endif + + %if __emulate_avx + %xdefine __src1 %7 + %xdefine __src2 %8 + %ifnidn %6, %7 + %if %0 >= 9 + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9 + %else + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8 + %endif + %if %5 && %4 == 0 + %ifnid %8 + ; 3-operand AVX instructions with a memory arg can only have it in src2, + ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). + ; So, if the instruction is commutative with a memory arg, swap them. 
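+ ; e.g. with AVX disabled, `addps m0, m1, [r0]` is emulated as + ; `movaps m0, [r0]` followed by `addps m0, m1`; the swap below is what + ; makes that possible (addps is declared commutative in the table below).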
+ %xdefine __src1 %8 + %xdefine __src2 %7 + %endif + %endif + %if __sizeofreg == 8 + MOVQ %6, __src1 + %elif %3 + MOVAPS %6, __src1 + %else + MOVDQA %6, __src1 + %endif + %endif + %if %0 >= 9 + %1 %6, __src2, %9 + %else + %1 %6, __src2 + %endif + %elif %0 >= 9 + __instr %6, %7, %8, %9 + %elif %0 == 8 + __instr %6, %7, %8 + %elif %0 == 7 + __instr %6, %7 + %else + __instr %6 + %endif +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise +;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 1-5 fnord, 0, 1, 0 + %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 + %ifidn %2, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 + %elifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +; Instructions with both VEX and non-VEX encodings +; Non-destructive instructions are written without parameters +AVX_INSTR addpd, sse2, 1, 0, 1 +AVX_INSTR addps, sse, 1, 0, 1 +AVX_INSTR addsd, sse2, 1, 0, 1 +AVX_INSTR addss, sse, 1, 0, 1 +AVX_INSTR addsubpd, sse3, 1, 0, 0 +AVX_INSTR addsubps, sse3, 1, 0, 0 +AVX_INSTR aesdec, fnord, 0, 0, 0 +AVX_INSTR aesdeclast, fnord, 0, 0, 0 +AVX_INSTR aesenc, fnord, 0, 0, 0 +AVX_INSTR aesenclast, fnord, 0, 0, 0 +AVX_INSTR aesimc +AVX_INSTR aeskeygenassist +AVX_INSTR andnpd, sse2, 1, 0, 0 +AVX_INSTR andnps, sse, 1, 0, 0 +AVX_INSTR andpd, sse2, 1, 0, 1 +AVX_INSTR andps, sse, 1, 0, 1 +AVX_INSTR blendpd, sse4, 1, 0, 0 +AVX_INSTR blendps, sse4, 1, 0, 0 +AVX_INSTR blendvpd, sse4, 1, 0, 0 +AVX_INSTR blendvps, sse4, 1, 0, 0 +AVX_INSTR cmppd, sse2, 1, 1, 0 +AVX_INSTR cmpps, sse, 1, 1, 0 +AVX_INSTR cmpsd, sse2, 1, 1, 0 +AVX_INSTR cmpss, sse, 1, 1, 0 +AVX_INSTR comisd, sse2 +AVX_INSTR comiss, sse +AVX_INSTR cvtdq2pd, sse2 +AVX_INSTR cvtdq2ps, sse2 +AVX_INSTR cvtpd2dq, sse2 +AVX_INSTR cvtpd2ps, sse2 +AVX_INSTR cvtps2dq, sse2 +AVX_INSTR cvtps2pd, sse2 +AVX_INSTR cvtsd2si, sse2 +AVX_INSTR cvtsd2ss, sse2 +AVX_INSTR cvtsi2sd, sse2 +AVX_INSTR cvtsi2ss, sse +AVX_INSTR cvtss2sd, sse2 +AVX_INSTR cvtss2si, sse +AVX_INSTR cvttpd2dq, sse2 +AVX_INSTR cvttps2dq, sse2 +AVX_INSTR cvttsd2si, sse2 +AVX_INSTR cvttss2si, sse +AVX_INSTR divpd, sse2, 1, 0, 0 +AVX_INSTR divps, sse, 1, 0, 0 +AVX_INSTR divsd, sse2, 1, 0, 0 +AVX_INSTR divss, sse, 1, 0, 0 +AVX_INSTR dppd, sse4, 1, 1, 0 +AVX_INSTR dpps, sse4, 1, 1, 0 +AVX_INSTR extractps, sse4 +AVX_INSTR haddpd, sse3, 1, 0, 0 +AVX_INSTR haddps, sse3, 1, 0, 0 +AVX_INSTR hsubpd, sse3, 1, 0, 0 +AVX_INSTR hsubps, sse3, 1, 0, 0 +AVX_INSTR insertps, sse4, 1, 1, 0 +AVX_INSTR lddqu, sse3 +AVX_INSTR ldmxcsr, sse +AVX_INSTR maskmovdqu, sse2 +AVX_INSTR maxpd, sse2, 1, 0, 1 +AVX_INSTR maxps, sse, 1, 0, 1 +AVX_INSTR maxsd, sse2, 1, 0, 1 +AVX_INSTR maxss, sse, 1, 0, 1 +AVX_INSTR minpd, sse2, 1, 0, 1 +AVX_INSTR minps, sse, 1, 0, 1 +AVX_INSTR minsd, sse2, 1, 0, 1 +AVX_INSTR minss, sse, 1, 0, 1 +AVX_INSTR movapd, sse2 +AVX_INSTR movaps, sse +AVX_INSTR movd, mmx +AVX_INSTR movddup, sse3 +AVX_INSTR movdqa, sse2 +AVX_INSTR movdqu, sse2 +AVX_INSTR movhlps, sse, 1, 0, 0 +AVX_INSTR movhpd, sse2, 1, 0, 0 +AVX_INSTR movhps, sse, 1, 0, 0 +AVX_INSTR movlhps, sse, 1, 0, 0 +AVX_INSTR movlpd, sse2, 1, 0, 0 +AVX_INSTR movlps, sse, 1, 0, 0 +AVX_INSTR movmskpd, sse2 +AVX_INSTR movmskps, sse 
+AVX_INSTR movntdq, sse2 +AVX_INSTR movntdqa, sse4 +AVX_INSTR movntpd, sse2 +AVX_INSTR movntps, sse +AVX_INSTR movq, mmx +AVX_INSTR movsd, sse2, 1, 0, 0 +AVX_INSTR movshdup, sse3 +AVX_INSTR movsldup, sse3 +AVX_INSTR movss, sse, 1, 0, 0 +AVX_INSTR movupd, sse2 +AVX_INSTR movups, sse +AVX_INSTR mpsadbw, sse4 +AVX_INSTR mulpd, sse2, 1, 0, 1 +AVX_INSTR mulps, sse, 1, 0, 1 +AVX_INSTR mulsd, sse2, 1, 0, 1 +AVX_INSTR mulss, sse, 1, 0, 1 +AVX_INSTR orpd, sse2, 1, 0, 1 +AVX_INSTR orps, sse, 1, 0, 1 +AVX_INSTR pabsb, ssse3 +AVX_INSTR pabsd, ssse3 +AVX_INSTR pabsw, ssse3 +AVX_INSTR packsswb, mmx, 0, 0, 0 +AVX_INSTR packssdw, mmx, 0, 0, 0 +AVX_INSTR packuswb, mmx, 0, 0, 0 +AVX_INSTR packusdw, sse4, 0, 0, 0 +AVX_INSTR paddb, mmx, 0, 0, 1 +AVX_INSTR paddw, mmx, 0, 0, 1 +AVX_INSTR paddd, mmx, 0, 0, 1 +AVX_INSTR paddq, sse2, 0, 0, 1 +AVX_INSTR paddsb, mmx, 0, 0, 1 +AVX_INSTR paddsw, mmx, 0, 0, 1 +AVX_INSTR paddusb, mmx, 0, 0, 1 +AVX_INSTR paddusw, mmx, 0, 0, 1 +AVX_INSTR palignr, ssse3 +AVX_INSTR pand, mmx, 0, 0, 1 +AVX_INSTR pandn, mmx, 0, 0, 0 +AVX_INSTR pavgb, mmx2, 0, 0, 1 +AVX_INSTR pavgw, mmx2, 0, 0, 1 +AVX_INSTR pblendvb, sse4, 0, 0, 0 +AVX_INSTR pblendw, sse4 +AVX_INSTR pclmulqdq +AVX_INSTR pcmpestri, sse42 +AVX_INSTR pcmpestrm, sse42 +AVX_INSTR pcmpistri, sse42 +AVX_INSTR pcmpistrm, sse42 +AVX_INSTR pcmpeqb, mmx, 0, 0, 1 +AVX_INSTR pcmpeqw, mmx, 0, 0, 1 +AVX_INSTR pcmpeqd, mmx, 0, 0, 1 +AVX_INSTR pcmpeqq, sse4, 0, 0, 1 +AVX_INSTR pcmpgtb, mmx, 0, 0, 0 +AVX_INSTR pcmpgtw, mmx, 0, 0, 0 +AVX_INSTR pcmpgtd, mmx, 0, 0, 0 +AVX_INSTR pcmpgtq, sse42, 0, 0, 0 +AVX_INSTR pextrb, sse4 +AVX_INSTR pextrd, sse4 +AVX_INSTR pextrq, sse4 +AVX_INSTR pextrw, mmx2 +AVX_INSTR phaddw, ssse3, 0, 0, 0 +AVX_INSTR phaddd, ssse3, 0, 0, 0 +AVX_INSTR phaddsw, ssse3, 0, 0, 0 +AVX_INSTR phminposuw, sse4 +AVX_INSTR phsubw, ssse3, 0, 0, 0 +AVX_INSTR phsubd, ssse3, 0, 0, 0 +AVX_INSTR phsubsw, ssse3, 0, 0, 0 +AVX_INSTR pinsrb, sse4 +AVX_INSTR pinsrd, sse4 +AVX_INSTR pinsrq, sse4 +AVX_INSTR pinsrw, mmx2 +AVX_INSTR pmaddwd, mmx, 0, 0, 1 +AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 +AVX_INSTR pmaxsb, sse4, 0, 0, 1 +AVX_INSTR pmaxsw, mmx2, 0, 0, 1 +AVX_INSTR pmaxsd, sse4, 0, 0, 1 +AVX_INSTR pmaxub, mmx2, 0, 0, 1 +AVX_INSTR pmaxuw, sse4, 0, 0, 1 +AVX_INSTR pmaxud, sse4, 0, 0, 1 +AVX_INSTR pminsb, sse4, 0, 0, 1 +AVX_INSTR pminsw, mmx2, 0, 0, 1 +AVX_INSTR pminsd, sse4, 0, 0, 1 +AVX_INSTR pminub, mmx2, 0, 0, 1 +AVX_INSTR pminuw, sse4, 0, 0, 1 +AVX_INSTR pminud, sse4, 0, 0, 1 +AVX_INSTR pmovmskb, mmx2 +AVX_INSTR pmovsxbw, sse4 +AVX_INSTR pmovsxbd, sse4 +AVX_INSTR pmovsxbq, sse4 +AVX_INSTR pmovsxwd, sse4 +AVX_INSTR pmovsxwq, sse4 +AVX_INSTR pmovsxdq, sse4 +AVX_INSTR pmovzxbw, sse4 +AVX_INSTR pmovzxbd, sse4 +AVX_INSTR pmovzxbq, sse4 +AVX_INSTR pmovzxwd, sse4 +AVX_INSTR pmovzxwq, sse4 +AVX_INSTR pmovzxdq, sse4 +AVX_INSTR pmuldq, sse4, 0, 0, 1 +AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 +AVX_INSTR pmulhuw, mmx2, 0, 0, 1 +AVX_INSTR pmulhw, mmx, 0, 0, 1 +AVX_INSTR pmullw, mmx, 0, 0, 1 +AVX_INSTR pmulld, sse4, 0, 0, 1 +AVX_INSTR pmuludq, sse2, 0, 0, 1 +AVX_INSTR por, mmx, 0, 0, 1 +AVX_INSTR psadbw, mmx2, 0, 0, 1 +AVX_INSTR pshufb, ssse3, 0, 0, 0 +AVX_INSTR pshufd, sse2 +AVX_INSTR pshufhw, sse2 +AVX_INSTR pshuflw, sse2 +AVX_INSTR psignb, ssse3, 0, 0, 0 +AVX_INSTR psignw, ssse3, 0, 0, 0 +AVX_INSTR psignd, ssse3, 0, 0, 0 +AVX_INSTR psllw, mmx, 0, 0, 0 +AVX_INSTR pslld, mmx, 0, 0, 0 +AVX_INSTR psllq, mmx, 0, 0, 0 +AVX_INSTR pslldq, sse2, 0, 0, 0 +AVX_INSTR psraw, mmx, 0, 0, 0 +AVX_INSTR psrad, mmx, 0, 0, 0 +AVX_INSTR psrlw, mmx, 0, 0, 0 +AVX_INSTR psrld, mmx, 0, 0, 
0 +AVX_INSTR psrlq, mmx, 0, 0, 0 +AVX_INSTR psrldq, sse2, 0, 0, 0 +AVX_INSTR psubb, mmx, 0, 0, 0 +AVX_INSTR psubw, mmx, 0, 0, 0 +AVX_INSTR psubd, mmx, 0, 0, 0 +AVX_INSTR psubq, sse2, 0, 0, 0 +AVX_INSTR psubsb, mmx, 0, 0, 0 +AVX_INSTR psubsw, mmx, 0, 0, 0 +AVX_INSTR psubusb, mmx, 0, 0, 0 +AVX_INSTR psubusw, mmx, 0, 0, 0 +AVX_INSTR ptest, sse4 +AVX_INSTR punpckhbw, mmx, 0, 0, 0 +AVX_INSTR punpckhwd, mmx, 0, 0, 0 +AVX_INSTR punpckhdq, mmx, 0, 0, 0 +AVX_INSTR punpckhqdq, sse2, 0, 0, 0 +AVX_INSTR punpcklbw, mmx, 0, 0, 0 +AVX_INSTR punpcklwd, mmx, 0, 0, 0 +AVX_INSTR punpckldq, mmx, 0, 0, 0 +AVX_INSTR punpcklqdq, sse2, 0, 0, 0 +AVX_INSTR pxor, mmx, 0, 0, 1 +AVX_INSTR rcpps, sse, 1, 0, 0 +AVX_INSTR rcpss, sse, 1, 0, 0 +AVX_INSTR roundpd, sse4 +AVX_INSTR roundps, sse4 +AVX_INSTR roundsd, sse4 +AVX_INSTR roundss, sse4 +AVX_INSTR rsqrtps, sse, 1, 0, 0 +AVX_INSTR rsqrtss, sse, 1, 0, 0 +AVX_INSTR shufpd, sse2, 1, 1, 0 +AVX_INSTR shufps, sse, 1, 1, 0 +AVX_INSTR sqrtpd, sse2, 1, 0, 0 +AVX_INSTR sqrtps, sse, 1, 0, 0 +AVX_INSTR sqrtsd, sse2, 1, 0, 0 +AVX_INSTR sqrtss, sse, 1, 0, 0 +AVX_INSTR stmxcsr, sse +AVX_INSTR subpd, sse2, 1, 0, 0 +AVX_INSTR subps, sse, 1, 0, 0 +AVX_INSTR subsd, sse2, 1, 0, 0 +AVX_INSTR subss, sse, 1, 0, 0 +AVX_INSTR ucomisd, sse2 +AVX_INSTR ucomiss, sse +AVX_INSTR unpckhpd, sse2, 1, 0, 0 +AVX_INSTR unpckhps, sse, 1, 0, 0 +AVX_INSTR unpcklpd, sse2, 1, 0, 0 +AVX_INSTR unpcklps, sse, 1, 0, 0 +AVX_INSTR xorpd, sse2, 1, 0, 1 +AVX_INSTR xorps, sse, 1, 0, 1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 3dnow, 1, 0, 1 +AVX_INSTR pfsub, 3dnow, 1, 0, 0 +AVX_INSTR pfmul, 3dnow, 1, 0, 1 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif + %assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %elifnidn %1, %4 + %6 %1, %2, %3 + %7 %1, %4 + %else + %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation +FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation +FMA_INSTR pmadcswd, pmaddwd, paddd + +; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. +; FMA3 is only possible if dst is the same as one of the src registers. +; Either src2 or src3 can be a memory operand. +%macro FMA4_INSTR 2-* + %push fma4_instr + %xdefine %$prefix %1 + %rep %0 - 1 + %macro %$prefix%2 4-6 %$prefix, %2 + %if notcpuflag(fma3) && notcpuflag(fma4) + %error use of ``%5%6'' fma instruction in cpuname function: current_function + %elif cpuflag(fma4) + v%5%6 %1, %2, %3, %4 + %elifidn %1, %2 + ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. 
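+      ; Illustrative expansions of the cases below (a sketch, assuming the
+      ; call "fmaddps m0, m0, m1, m2"):
+      ;   FMA4:                    vfmaddps    m0, m0, m1, m2
+      ;   FMA3, %3 is a register:  vfmadd213ps m0, m1, m2   ; m0 = m1*m0 + m2
+      ;   FMA3, %3 is in memory:   vfmadd132ps m0, m2, mem  ; m0 = m0*mem + m2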
+ %ifid %3 + v%{5}213%6 %2, %3, %4 + %else + v%{5}132%6 %2, %4, %3 + %endif + %elifidn %1, %3 + v%{5}213%6 %3, %2, %4 + %elifidn %1, %4 + v%{5}231%6 %4, %2, %3 + %else + %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported + %endif + %endmacro + %rotate 1 + %endrep + %pop +%endmacro + +FMA4_INSTR fmadd, pd, ps, sd, ss +FMA4_INSTR fmaddsub, pd, ps +FMA4_INSTR fmsub, pd, ps, sd, ss +FMA4_INSTR fmsubadd, pd, ps +FMA4_INSTR fnmadd, pd, ps, sd, ss +FMA4_INSTR fnmsub, pd, ps, sd, ss + +; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) +%ifdef __YASM_VER__ + %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 + %macro vpbroadcastq 2 + %if sizeof%1 == 16 + movddup %1, %2 + %else + vbroadcastsd %1, %2 + %endif + %endmacro + %endif +%endif diff --git a/libs/libaom/src/tools/aggregate_entropy_stats.py b/libs/libaom/src/tools/aggregate_entropy_stats.py new file mode 100644 index 000000000..7cb4d18e1 --- /dev/null +++ b/libs/libaom/src/tools/aggregate_entropy_stats.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +"""Aggregate multiple entropy stats output which is written in 32-bit int. + +python ./aggregate_entropy_stats.py [dir of stats files] [keyword of filenames] + [filename of final stats] +""" + +__author__ = "yuec@google.com" + +import os +import sys +import numpy as np + +def main(): + dir = sys.argv[1] + sum = [] + for fn in os.listdir(dir): + if sys.argv[2] in fn: + stats = np.fromfile(dir + fn, dtype=np.int32) + if len(sum) == 0: + sum = stats + else: + sum = np.add(sum, stats) + if len(sum) == 0: + print("No stats file is found. Double-check directory and keyword?") + else: + sum.tofile(dir+sys.argv[3]) + +if __name__ == '__main__': + main() diff --git a/libs/libaom/src/tools/aom_entropy_optimizer.c b/libs/libaom/src/tools/aom_entropy_optimizer.c new file mode 100644 index 000000000..9f529d9ab --- /dev/null +++ b/libs/libaom/src/tools/aom_entropy_optimizer.c @@ -0,0 +1,761 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// This tool is a gadget for offline probability training. +// A binary executable aom_entropy_optimizer will be generated in tools/. It +// parses a binary file consisting of counts written in the format of +// FRAME_COUNTS in entropymode.h, and computes optimized probability tables +// and CDF tables, which will be written to a new c file optimized_probs.c +// according to format in the codebase. 
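+//
+// As a rough worked example (illustrative numbers, not taken from a real
+// stats file): counts_to_cdf() below applies +1 smoothing to each count and
+// scales the running sum to CDF_PROB_TOP (32768). For a 3-symbol bin with
+// counts { 6, 2, 2 }, the smoothed cumulative sums are { 7, 10, 13 }, so
+//   cdf[0] = (7 * 32768 + 6) / 13 = 17644
+//   cdf[1] = (10 * 32768 + 6) / 13 = 25206
+// and the emitted entry is AOM_CDF3(17644, 25206); the final CDF_PROB_TOP
+// value is implicit in the AOM_CDF macros, and the AOMMIN/AOMMAX clamps only
+// act to keep a gap of at least 4 between neighboring entries.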
+// +// Command line: ./aom_entropy_optimizer [directory of the count file] +// +// The input file can either be generated by encoding a single clip with the +// entropy_stats experiment turned on, or be collected at a larger scale, in +// which case the aggregate_entropy_stats.py script in this directory can be +// used to aggregate multiple stats outputs. + +#include <assert.h> +#include <stdio.h> + +#include "config/aom_config.h" + +#include "av1/encoder/encoder.h" + +#define SPACES_PER_TAB 2 +#define CDF_MAX_SIZE 16 + +typedef unsigned int aom_count_type; +// A log file recording parsed counts +static FILE *logfile; // TODO(yuec): make it a command line option + +static void counts_to_cdf(const aom_count_type *counts, aom_cdf_prob *cdf, + int modes) { + int64_t csum[CDF_MAX_SIZE]; + assert(modes <= CDF_MAX_SIZE); + + csum[0] = counts[0] + 1; + for (int i = 1; i < modes; ++i) csum[i] = counts[i] + 1 + csum[i - 1]; + + for (int i = 0; i < modes; ++i) fprintf(logfile, "%d ", counts[i]); + fprintf(logfile, "\n"); + + int64_t sum = csum[modes - 1]; + const int64_t round_shift = sum >> 1; + for (int i = 0; i < modes; ++i) { + cdf[i] = (csum[i] * CDF_PROB_TOP + round_shift) / sum; + cdf[i] = AOMMIN(cdf[i], CDF_PROB_TOP - (modes - 1 + i) * 4); + cdf[i] = (i == 0) ? AOMMAX(cdf[i], 4) : AOMMAX(cdf[i], cdf[i - 1] + 4); + } +} + +static int parse_counts_for_cdf_opt(aom_count_type **ct_ptr, + FILE *const probsfile, int tabs, + int dim_of_cts, int *cts_each_dim) { + if (dim_of_cts < 1) { + fprintf(stderr, "The dimension of a counts vector should be at least 1!\n"); + return 1; + } + const int total_modes = cts_each_dim[0]; + if (dim_of_cts == 1) { + assert(total_modes <= CDF_MAX_SIZE); + aom_cdf_prob cdfs[CDF_MAX_SIZE]; + aom_count_type *counts1d = *ct_ptr; + + counts_to_cdf(counts1d, cdfs, total_modes); + (*ct_ptr) += total_modes; + + if (tabs > 0) fprintf(probsfile, "%*c", tabs * SPACES_PER_TAB, ' '); + fprintf(probsfile, "AOM_CDF%d(", total_modes); + for (int k = 0; k < total_modes - 1; ++k) { + fprintf(probsfile, "%d", cdfs[k]); + if (k < total_modes - 2) fprintf(probsfile, ", "); + } + fprintf(probsfile, ")"); + } else { + for (int k = 0; k < total_modes; ++k) { + int tabs_next_level; + + if (dim_of_cts == 2) + fprintf(probsfile, "%*c{ ", tabs * SPACES_PER_TAB, ' '); + else + fprintf(probsfile, "%*c{\n", tabs * SPACES_PER_TAB, ' '); + tabs_next_level = dim_of_cts == 2 ?
0 : tabs + 1; + + if (parse_counts_for_cdf_opt(ct_ptr, probsfile, tabs_next_level, + dim_of_cts - 1, cts_each_dim + 1)) { + return 1; + } + + if (dim_of_cts == 2) { + if (k == total_modes - 1) + fprintf(probsfile, " }\n"); + else + fprintf(probsfile, " },\n"); + } else { + if (k == total_modes - 1) + fprintf(probsfile, "%*c}\n", tabs * SPACES_PER_TAB, ' '); + else + fprintf(probsfile, "%*c},\n", tabs * SPACES_PER_TAB, ' '); + } + } + } + return 0; +} + +static void optimize_cdf_table(aom_count_type *counts, FILE *const probsfile, + int dim_of_cts, int *cts_each_dim, + char *prefix) { + aom_count_type *ct_ptr = counts; + + fprintf(probsfile, "%s = {\n", prefix); + fprintf(logfile, "%s\n", prefix); + if (parse_counts_for_cdf_opt(&ct_ptr, probsfile, 1, dim_of_cts, + cts_each_dim)) { + fprintf(probsfile, "Optimizer failed!\n"); + } + fprintf(probsfile, "};\n\n"); + fprintf(logfile, "============================\n"); +} + +static void optimize_uv_mode(aom_count_type *counts, FILE *const probsfile, + int dim_of_cts, int *cts_each_dim, char *prefix) { + aom_count_type *ct_ptr = counts; + + fprintf(probsfile, "%s = {\n", prefix); + fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' '); + fprintf(logfile, "%s\n", prefix); + cts_each_dim[2] = UV_INTRA_MODES - 1; + for (int k = 0; k < cts_each_dim[1]; ++k) { + fprintf(probsfile, "%*c{ ", 2 * SPACES_PER_TAB, ' '); + parse_counts_for_cdf_opt(&ct_ptr, probsfile, 0, dim_of_cts - 2, + cts_each_dim + 2); + if (k + 1 == cts_each_dim[1]) { + fprintf(probsfile, " }\n"); + } else { + fprintf(probsfile, " },\n"); + } + ++ct_ptr; + } + fprintf(probsfile, "%*c},\n", SPACES_PER_TAB, ' '); + fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' '); + cts_each_dim[2] = UV_INTRA_MODES; + parse_counts_for_cdf_opt(&ct_ptr, probsfile, 2, dim_of_cts - 1, + cts_each_dim + 1); + fprintf(probsfile, "%*c}\n", SPACES_PER_TAB, ' '); + fprintf(probsfile, "};\n\n"); + fprintf(logfile, "============================\n"); +} + +static void optimize_cdf_table_var_modes_2d(aom_count_type *counts, + FILE *const probsfile, + int dim_of_cts, int *cts_each_dim, + int *modes_each_ctx, char *prefix) { + aom_count_type *ct_ptr = counts; + + assert(dim_of_cts == 2); + (void)dim_of_cts; + + fprintf(probsfile, "%s = {\n", prefix); + fprintf(logfile, "%s\n", prefix); + + for (int d0_idx = 0; d0_idx < cts_each_dim[0]; ++d0_idx) { + int num_of_modes = modes_each_ctx[d0_idx]; + + if (num_of_modes > 0) { + fprintf(probsfile, "%*c{ ", SPACES_PER_TAB, ' '); + parse_counts_for_cdf_opt(&ct_ptr, probsfile, 0, 1, &num_of_modes); + ct_ptr += cts_each_dim[1] - num_of_modes; + fprintf(probsfile, " },\n"); + } else { + fprintf(probsfile, "%*c{ 0 },\n", SPACES_PER_TAB, ' '); + fprintf(logfile, "dummy cdf, no need to optimize\n"); + ct_ptr += cts_each_dim[1]; + } + } + fprintf(probsfile, "};\n\n"); + fprintf(logfile, "============================\n"); +} + +static void optimize_cdf_table_var_modes_3d(aom_count_type *counts, + FILE *const probsfile, + int dim_of_cts, int *cts_each_dim, + int *modes_each_ctx, char *prefix) { + aom_count_type *ct_ptr = counts; + + assert(dim_of_cts == 3); + (void)dim_of_cts; + + fprintf(probsfile, "%s = {\n", prefix); + fprintf(logfile, "%s\n", prefix); + + for (int d0_idx = 0; d0_idx < cts_each_dim[0]; ++d0_idx) { + fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' '); + for (int d1_idx = 0; d1_idx < cts_each_dim[1]; ++d1_idx) { + int num_of_modes = modes_each_ctx[d0_idx]; + + if (num_of_modes > 0) { + fprintf(probsfile, "%*c{ ", 2 * SPACES_PER_TAB, ' '); + parse_counts_for_cdf_opt(&ct_ptr, 
probsfile, 0, 1, &num_of_modes); + ct_ptr += cts_each_dim[2] - num_of_modes; + fprintf(probsfile, " },\n"); + } else { + fprintf(probsfile, "%*c{ 0 },\n", 2 * SPACES_PER_TAB, ' '); + fprintf(logfile, "dummy cdf, no need to optimize\n"); + ct_ptr += cts_each_dim[2]; + } + } + fprintf(probsfile, "%*c},\n", SPACES_PER_TAB, ' '); + } + fprintf(probsfile, "};\n\n"); + fprintf(logfile, "============================\n"); +} + +static void optimize_cdf_table_var_modes_4d(aom_count_type *counts, + FILE *const probsfile, + int dim_of_cts, int *cts_each_dim, + int *modes_each_ctx, char *prefix) { + aom_count_type *ct_ptr = counts; + + assert(dim_of_cts == 4); + (void)dim_of_cts; + + fprintf(probsfile, "%s = {\n", prefix); + fprintf(logfile, "%s\n", prefix); + + for (int d0_idx = 0; d0_idx < cts_each_dim[0]; ++d0_idx) { + fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' '); + for (int d1_idx = 0; d1_idx < cts_each_dim[1]; ++d1_idx) { + fprintf(probsfile, "%*c{\n", 2 * SPACES_PER_TAB, ' '); + for (int d2_idx = 0; d2_idx < cts_each_dim[2]; ++d2_idx) { + int num_of_modes = modes_each_ctx[d0_idx]; + + if (num_of_modes > 0) { + fprintf(probsfile, "%*c{ ", 3 * SPACES_PER_TAB, ' '); + parse_counts_for_cdf_opt(&ct_ptr, probsfile, 0, 1, &num_of_modes); + ct_ptr += cts_each_dim[3] - num_of_modes; + fprintf(probsfile, " },\n"); + } else { + fprintf(probsfile, "%*c{ 0 },\n", 3 * SPACES_PER_TAB, ' '); + fprintf(logfile, "dummy cdf, no need to optimize\n"); + ct_ptr += cts_each_dim[3]; + } + } + fprintf(probsfile, "%*c},\n", 2 * SPACES_PER_TAB, ' '); + } + fprintf(probsfile, "%*c},\n", SPACES_PER_TAB, ' '); + } + fprintf(probsfile, "};\n\n"); + fprintf(logfile, "============================\n"); +} + +int main(int argc, const char **argv) { + if (argc < 2) { + fprintf(stderr, "Please specify the input stats file!\n"); + exit(EXIT_FAILURE); + } + + FILE *const statsfile = fopen(argv[1], "rb"); + if (statsfile == NULL) { + fprintf(stderr, "Failed to open input file!\n"); + exit(EXIT_FAILURE); + } + + FRAME_COUNTS fc; + const size_t bytes = fread(&fc, sizeof(FRAME_COUNTS), 1, statsfile); + if (!bytes) { + fclose(statsfile); + return 1; + } + + FILE *const probsfile = fopen("optimized_probs.c", "w"); + if (probsfile == NULL) { + fprintf(stderr, + "Failed to create output file for optimized entropy tables!\n"); + exit(EXIT_FAILURE); + } + + logfile = fopen("aom_entropy_optimizer_parsed_counts.log", "w"); + if (logfile == NULL) { + fprintf(stderr, "Failed to create log file for parsed counts!\n"); + exit(EXIT_FAILURE); + } + + int cts_each_dim[10]; + + /* Intra mode (keyframe luma) */ + cts_each_dim[0] = KF_MODE_CONTEXTS; + cts_each_dim[1] = KF_MODE_CONTEXTS; + cts_each_dim[2] = INTRA_MODES; + optimize_cdf_table(&fc.kf_y_mode[0][0][0], probsfile, 3, cts_each_dim, + "const aom_cdf_prob\n" + "default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS]" + "[CDF_SIZE(INTRA_MODES)]"); + + cts_each_dim[0] = DIRECTIONAL_MODES; + cts_each_dim[1] = 2 * MAX_ANGLE_DELTA + 1; + optimize_cdf_table(&fc.angle_delta[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob default_angle_delta_cdf" + "[DIRECTIONAL_MODES][CDF_SIZE(2 * MAX_ANGLE_DELTA + 1)]"); + + /* Intra mode (non-keyframe luma) */ + cts_each_dim[0] = BLOCK_SIZE_GROUPS; + cts_each_dim[1] = INTRA_MODES; + optimize_cdf_table( + &fc.y_mode[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)]"); + + /* Intra mode (chroma) */ + cts_each_dim[0] = CFL_ALLOWED_TYPES; + cts_each_dim[1] = 
INTRA_MODES; + cts_each_dim[2] = UV_INTRA_MODES; + optimize_uv_mode(&fc.uv_mode[0][0][0], probsfile, 3, cts_each_dim, + "static const aom_cdf_prob\n" + "default_uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES]" + "[CDF_SIZE(UV_INTRA_MODES)]"); + + /* block partition */ + cts_each_dim[0] = PARTITION_CONTEXTS; + cts_each_dim[1] = EXT_PARTITION_TYPES; + int part_types_each_ctx[PARTITION_CONTEXTS] = { 4, 4, 4, 4, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, + 10, 10, 8, 8, 8, 8 }; + optimize_cdf_table_var_modes_2d( + &fc.partition[0][0], probsfile, 2, cts_each_dim, part_types_each_ctx, + "static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS]" + "[CDF_SIZE(EXT_PARTITION_TYPES)]"); + + /* tx type */ + cts_each_dim[0] = EXT_TX_SETS_INTRA; + cts_each_dim[1] = EXT_TX_SIZES; + cts_each_dim[2] = INTRA_MODES; + cts_each_dim[3] = TX_TYPES; + int intra_ext_tx_types_each_ctx[EXT_TX_SETS_INTRA] = { 0, 7, 5 }; + optimize_cdf_table_var_modes_4d( + &fc.intra_ext_tx[0][0][0][0], probsfile, 4, cts_each_dim, + intra_ext_tx_types_each_ctx, + "static const aom_cdf_prob default_intra_ext_tx_cdf[EXT_TX_SETS_INTRA]" + "[EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)]"); + + cts_each_dim[0] = EXT_TX_SETS_INTER; + cts_each_dim[1] = EXT_TX_SIZES; + cts_each_dim[2] = TX_TYPES; + int inter_ext_tx_types_each_ctx[EXT_TX_SETS_INTER] = { 0, 16, 12, 2 }; + optimize_cdf_table_var_modes_3d( + &fc.inter_ext_tx[0][0][0], probsfile, 3, cts_each_dim, + inter_ext_tx_types_each_ctx, + "static const aom_cdf_prob default_inter_ext_tx_cdf[EXT_TX_SETS_INTER]" + "[EXT_TX_SIZES][CDF_SIZE(TX_TYPES)]"); + + /* Chroma from Luma */ + cts_each_dim[0] = CFL_JOINT_SIGNS; + optimize_cdf_table(&fc.cfl_sign[0], probsfile, 1, cts_each_dim, + "static const aom_cdf_prob\n" + "default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)]"); + cts_each_dim[0] = CFL_ALPHA_CONTEXTS; + cts_each_dim[1] = CFL_ALPHABET_SIZE; + optimize_cdf_table(&fc.cfl_alpha[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS]" + "[CDF_SIZE(CFL_ALPHABET_SIZE)]"); + + /* Interpolation filter */ + cts_each_dim[0] = SWITCHABLE_FILTER_CONTEXTS; + cts_each_dim[1] = SWITCHABLE_FILTERS; + optimize_cdf_table(&fc.switchable_interp[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS]" + "[CDF_SIZE(SWITCHABLE_FILTERS)]"); + + /* Motion vector referencing */ + cts_each_dim[0] = NEWMV_MODE_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.newmv_mode[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)]"); + + cts_each_dim[0] = GLOBALMV_MODE_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.zeromv_mode[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(2)]"); + + cts_each_dim[0] = REFMV_MODE_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.refmv_mode[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)]"); + + cts_each_dim[0] = DRL_MODE_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.drl_mode[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)]"); + + /* ext_inter experiment */ + /* New compound mode */ + cts_each_dim[0] = INTER_MODE_CONTEXTS; + cts_each_dim[1] = INTER_COMPOUND_MODES; + optimize_cdf_table(&fc.inter_compound_mode[0][0], probsfile, 2, cts_each_dim, + 
"static const aom_cdf_prob\n" + "default_inter_compound_mode_cdf[INTER_MODE_CONTEXTS][CDF_" + "SIZE(INTER_COMPOUND_MODES)]"); + + /* Interintra */ + cts_each_dim[0] = BLOCK_SIZE_GROUPS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.interintra[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)]"); + + cts_each_dim[0] = BLOCK_SIZE_GROUPS; + cts_each_dim[1] = INTERINTRA_MODES; + optimize_cdf_table(&fc.interintra_mode[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(" + "INTERINTRA_MODES)]"); + + cts_each_dim[0] = BLOCK_SIZES_ALL; + cts_each_dim[1] = 2; + optimize_cdf_table( + &fc.wedge_interintra[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]"); + + /* Compound type */ + cts_each_dim[0] = BLOCK_SIZES_ALL; + cts_each_dim[1] = COMPOUND_TYPES - 1; + optimize_cdf_table(&fc.compound_type[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob default_compound_type_cdf" + "[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)]"); + + cts_each_dim[0] = BLOCK_SIZES_ALL; + cts_each_dim[1] = 16; + optimize_cdf_table(&fc.wedge_idx[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)]"); + + /* motion_var and warped_motion experiments */ + cts_each_dim[0] = BLOCK_SIZES_ALL; + cts_each_dim[1] = MOTION_MODES; + optimize_cdf_table( + &fc.motion_mode[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)]"); + cts_each_dim[0] = BLOCK_SIZES_ALL; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.obmc[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]"); + + /* Intra/inter flag */ + cts_each_dim[0] = INTRA_INTER_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table( + &fc.intra_inter[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)]"); + + /* Single/comp ref flag */ + cts_each_dim[0] = COMP_INTER_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table( + &fc.comp_inter[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(2)]"); + + /* ext_comp_refs experiment */ + cts_each_dim[0] = COMP_REF_TYPE_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table( + &fc.comp_ref_type[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS][CDF_SIZE(2)]"); + + cts_each_dim[0] = UNI_COMP_REF_CONTEXTS; + cts_each_dim[1] = UNIDIR_COMP_REFS - 1; + cts_each_dim[2] = 2; + optimize_cdf_table(&fc.uni_comp_ref[0][0][0], probsfile, 3, cts_each_dim, + "static const aom_cdf_prob\n" + "default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_" + "COMP_REFS - 1][CDF_SIZE(2)]"); + + /* Reference frame (single ref) */ + cts_each_dim[0] = REF_CONTEXTS; + cts_each_dim[1] = SINGLE_REFS - 1; + cts_each_dim[2] = 2; + optimize_cdf_table( + &fc.single_ref[0][0][0], probsfile, 3, cts_each_dim, + "static const aom_cdf_prob\n" + "default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)]"); + + /* ext_refs experiment */ + cts_each_dim[0] = REF_CONTEXTS; + cts_each_dim[1] = FWD_REFS - 1; + cts_each_dim[2] = 2; + optimize_cdf_table( + &fc.comp_ref[0][0][0], probsfile, 3, cts_each_dim, + "static const 
aom_cdf_prob\n" + "default_comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)]"); + + cts_each_dim[0] = REF_CONTEXTS; + cts_each_dim[1] = BWD_REFS - 1; + cts_each_dim[2] = 2; + optimize_cdf_table( + &fc.comp_bwdref[0][0][0], probsfile, 3, cts_each_dim, + "static const aom_cdf_prob\n" + "default_comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)]"); + + /* palette */ + cts_each_dim[0] = PALATTE_BSIZE_CTXS; + cts_each_dim[1] = PALETTE_SIZES; + optimize_cdf_table(&fc.palette_y_size[0][0], probsfile, 2, cts_each_dim, + "const aom_cdf_prob default_palette_y_size_cdf" + "[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]"); + + cts_each_dim[0] = PALATTE_BSIZE_CTXS; + cts_each_dim[1] = PALETTE_SIZES; + optimize_cdf_table(&fc.palette_uv_size[0][0], probsfile, 2, cts_each_dim, + "const aom_cdf_prob default_palette_uv_size_cdf" + "[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]"); + + cts_each_dim[0] = PALATTE_BSIZE_CTXS; + cts_each_dim[1] = PALETTE_Y_MODE_CONTEXTS; + cts_each_dim[2] = 2; + optimize_cdf_table(&fc.palette_y_mode[0][0][0], probsfile, 3, cts_each_dim, + "const aom_cdf_prob default_palette_y_mode_cdf" + "[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS]" + "[CDF_SIZE(2)]"); + + cts_each_dim[0] = PALETTE_UV_MODE_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.palette_uv_mode[0][0], probsfile, 2, cts_each_dim, + "const aom_cdf_prob default_palette_uv_mode_cdf" + "[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)]"); + + cts_each_dim[0] = PALETTE_SIZES; + cts_each_dim[1] = PALETTE_COLOR_INDEX_CONTEXTS; + cts_each_dim[2] = PALETTE_COLORS; + int palette_color_indexes_each_ctx[PALETTE_SIZES] = { 2, 3, 4, 5, 6, 7, 8 }; + optimize_cdf_table_var_modes_3d( + &fc.palette_y_color_index[0][0][0], probsfile, 3, cts_each_dim, + palette_color_indexes_each_ctx, + "const aom_cdf_prob default_palette_y_color_index_cdf[PALETTE_SIZES]" + "[PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)]"); + + cts_each_dim[0] = PALETTE_SIZES; + cts_each_dim[1] = PALETTE_COLOR_INDEX_CONTEXTS; + cts_each_dim[2] = PALETTE_COLORS; + optimize_cdf_table_var_modes_3d( + &fc.palette_uv_color_index[0][0][0], probsfile, 3, cts_each_dim, + palette_color_indexes_each_ctx, + "const aom_cdf_prob default_palette_uv_color_index_cdf[PALETTE_SIZES]" + "[PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)]"); + + /* Transform size */ + cts_each_dim[0] = TXFM_PARTITION_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table( + &fc.txfm_partition[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)]"); + + /* Skip flag */ + cts_each_dim[0] = SKIP_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.skip[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]"); + + /* Skip mode flag */ + cts_each_dim[0] = SKIP_MODE_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.skip_mode[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE(2)]"); + + /* joint compound flag */ + cts_each_dim[0] = COMP_INDEX_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.compound_index[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob default_compound_idx_cdfs" + "[COMP_INDEX_CONTEXTS][CDF_SIZE(2)]"); + + cts_each_dim[0] = COMP_GROUP_IDX_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.comp_group_idx[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob default_comp_group_idx_cdfs" + 
"[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)]"); + + /* intrabc */ + cts_each_dim[0] = 2; + optimize_cdf_table( + &fc.intrabc[0], probsfile, 1, cts_each_dim, + "static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)]"); + + /* filter_intra experiment */ + cts_each_dim[0] = FILTER_INTRA_MODES; + optimize_cdf_table( + &fc.filter_intra_mode[0], probsfile, 1, cts_each_dim, + "static const aom_cdf_prob " + "default_filter_intra_mode_cdf[CDF_SIZE(FILTER_INTRA_MODES)]"); + + cts_each_dim[0] = BLOCK_SIZES_ALL; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.filter_intra[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(2)]"); + + /* restoration type */ + cts_each_dim[0] = RESTORE_SWITCHABLE_TYPES; + optimize_cdf_table(&fc.switchable_restore[0], probsfile, 1, cts_each_dim, + "static const aom_cdf_prob default_switchable_restore_cdf" + "[CDF_SIZE(RESTORE_SWITCHABLE_TYPES)]"); + + cts_each_dim[0] = 2; + optimize_cdf_table(&fc.wiener_restore[0], probsfile, 1, cts_each_dim, + "static const aom_cdf_prob default_wiener_restore_cdf" + "[CDF_SIZE(2)]"); + + cts_each_dim[0] = 2; + optimize_cdf_table(&fc.sgrproj_restore[0], probsfile, 1, cts_each_dim, + "static const aom_cdf_prob default_sgrproj_restore_cdf" + "[CDF_SIZE(2)]"); + + /* intra tx size */ + cts_each_dim[0] = MAX_TX_CATS; + cts_each_dim[1] = TX_SIZE_CONTEXTS; + cts_each_dim[2] = MAX_TX_DEPTH + 1; + int intra_tx_sizes_each_ctx[MAX_TX_CATS] = { 2, 3, 3, 3 }; + optimize_cdf_table_var_modes_3d( + &fc.intra_tx_size[0][0][0], probsfile, 3, cts_each_dim, + intra_tx_sizes_each_ctx, + "static const aom_cdf_prob default_tx_size_cdf" + "[MAX_TX_CATS][TX_SIZE_CONTEXTS][CDF_SIZE(MAX_TX_DEPTH + 1)]"); + + /* transform coding */ + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = TX_SIZES; + cts_each_dim[2] = TXB_SKIP_CONTEXTS; + cts_each_dim[3] = 2; + optimize_cdf_table(&fc.txb_skip[0][0][0][0], probsfile, 4, cts_each_dim, + "static const aom_cdf_prob " + "av1_default_txb_skip_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES]" + "[TXB_SKIP_CONTEXTS][CDF_SIZE(2)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = TX_SIZES; + cts_each_dim[2] = PLANE_TYPES; + cts_each_dim[3] = EOB_COEF_CONTEXTS; + cts_each_dim[4] = 2; + optimize_cdf_table( + &fc.eob_extra[0][0][0][0][0], probsfile, 5, cts_each_dim, + "static const aom_cdf_prob av1_default_eob_extra_cdfs " + "[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS]" + "[CDF_SIZE(2)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = PLANE_TYPES; + cts_each_dim[2] = 2; + cts_each_dim[3] = 5; + optimize_cdf_table(&fc.eob_multi16[0][0][0][0], probsfile, 4, cts_each_dim, + "static const aom_cdf_prob av1_default_eob_multi16_cdfs" + "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(5)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = PLANE_TYPES; + cts_each_dim[2] = 2; + cts_each_dim[3] = 6; + optimize_cdf_table(&fc.eob_multi32[0][0][0][0], probsfile, 4, cts_each_dim, + "static const aom_cdf_prob av1_default_eob_multi32_cdfs" + "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(6)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = PLANE_TYPES; + cts_each_dim[2] = 2; + cts_each_dim[3] = 7; + optimize_cdf_table(&fc.eob_multi64[0][0][0][0], probsfile, 4, cts_each_dim, + "static const aom_cdf_prob av1_default_eob_multi64_cdfs" + "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(7)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = PLANE_TYPES; + cts_each_dim[2] = 2; + cts_each_dim[3] = 8; + 
optimize_cdf_table(&fc.eob_multi128[0][0][0][0], probsfile, 4, cts_each_dim, + "static const aom_cdf_prob av1_default_eob_multi128_cdfs" + "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(8)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = PLANE_TYPES; + cts_each_dim[2] = 2; + cts_each_dim[3] = 9; + optimize_cdf_table(&fc.eob_multi256[0][0][0][0], probsfile, 4, cts_each_dim, + "static const aom_cdf_prob av1_default_eob_multi256_cdfs" + "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(9)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = PLANE_TYPES; + cts_each_dim[2] = 2; + cts_each_dim[3] = 10; + optimize_cdf_table(&fc.eob_multi512[0][0][0][0], probsfile, 4, cts_each_dim, + "static const aom_cdf_prob av1_default_eob_multi512_cdfs" + "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(10)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = PLANE_TYPES; + cts_each_dim[2] = 2; + cts_each_dim[3] = 11; + optimize_cdf_table(&fc.eob_multi1024[0][0][0][0], probsfile, 4, cts_each_dim, + "static const aom_cdf_prob av1_default_eob_multi1024_cdfs" + "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(11)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = TX_SIZES; + cts_each_dim[2] = PLANE_TYPES; + cts_each_dim[3] = LEVEL_CONTEXTS; + cts_each_dim[4] = BR_CDF_SIZE; + optimize_cdf_table(&fc.coeff_lps_multi[0][0][0][0][0], probsfile, 5, + cts_each_dim, + "static const aom_cdf_prob " + "av1_default_coeff_lps_multi_cdfs[TOKEN_CDF_Q_CTXS]" + "[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS]" + "[CDF_SIZE(BR_CDF_SIZE)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = TX_SIZES; + cts_each_dim[2] = PLANE_TYPES; + cts_each_dim[3] = SIG_COEF_CONTEXTS; + cts_each_dim[4] = NUM_BASE_LEVELS + 2; + optimize_cdf_table( + &fc.coeff_base_multi[0][0][0][0][0], probsfile, 5, cts_each_dim, + "static const aom_cdf_prob av1_default_coeff_base_multi_cdfs" + "[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]" + "[CDF_SIZE(NUM_BASE_LEVELS + 2)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = TX_SIZES; + cts_each_dim[2] = PLANE_TYPES; + cts_each_dim[3] = SIG_COEF_CONTEXTS_EOB; + cts_each_dim[4] = NUM_BASE_LEVELS + 1; + optimize_cdf_table( + &fc.coeff_base_eob_multi[0][0][0][0][0], probsfile, 5, cts_each_dim, + "static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs" + "[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB]" + "[CDF_SIZE(NUM_BASE_LEVELS + 1)]"); + + fclose(statsfile); + fclose(logfile); + fclose(probsfile); + + return 0; +} diff --git a/libs/libaom/src/tools/cpplint.py b/libs/libaom/src/tools/cpplint.py new file mode 100644 index 000000000..25fbef73d --- /dev/null +++ b/libs/libaom/src/tools/cpplint.py @@ -0,0 +1,4756 @@ +#!/usr/bin/python +# +# Copyright (c) 2009 Google Inc. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""Does google-lint on c++ files. + +The goal of this script is to identify places in the code that *may* +be in non-compliance with google style. It does not attempt to fix +up these problems -- the point is to educate. It also does not +attempt to find all problems, or to ensure that everything it does +find is legitimately a problem. + +In particular, we can get very confused by /* and // inside strings! +We do a small hack, which is to ignore //'s with "'s after them on the +same line, but it is far from perfect (in either direction). +""" + +import codecs +import copy +import getopt +import math # for log +import os +import re +import sre_compile +import string +import sys +import unicodedata + + +_USAGE = """ +Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] + [--counting=total|toplevel|detailed] [--root=subdir] + [--linelength=digits] + [file] ... + + The style guidelines this tries to follow are those in + http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml + + Every problem is given a confidence score from 1-5, with 5 meaning we are + certain of the problem, and 1 meaning it could be a legitimate construct. + This will miss some errors, and is not a substitute for a code review. + + To suppress false-positive errors of a certain category, add a + 'NOLINT(category)' comment to the line. NOLINT or NOLINT(*) + suppresses errors of all categories on that line. + + The files passed in will be linted; at least one file must be provided. + Default linted extensions are .cc, .cpp, .cu, .cuh and .h. Change the + extensions with the --extensions flag. + + Flags: + + output=vs7 + By default, the output is formatted to ease emacs parsing. Visual Studio + compatible output (vs7) may also be used. Other formats are unsupported. + + verbose=# + Specify a number 0-5 to restrict errors to certain verbosity levels. + + filter=-x,+y,... + Specify a comma-separated list of category-filters to apply: only + error messages whose category names pass the filters will be printed. + (Category names are printed with the message and look like + "[whitespace/indent]".) Filters are evaluated left to right. + "-FOO" and "FOO" mean "do not print categories that start with FOO". + "+FOO" means "do print categories that start with FOO". + + Examples: --filter=-whitespace,+whitespace/braces + --filter=whitespace,runtime/printf,+runtime/printf_format + --filter=-,+build/include_what_you_use + + To see a list of all the categories used in cpplint, pass no arg: + --filter= + + counting=total|toplevel|detailed + The total number of errors found is always printed. If + 'toplevel' is provided, then the count of errors in each of + the top-level categories like 'build' and 'whitespace' will + also be printed.
If 'detailed' is provided, then a count + is provided for each category like 'build/class'. + + root=subdir + The root directory used for deriving the header guard CPP variable. + By default, the header guard CPP variable is calculated as the relative + path to the directory that contains .git, .hg, or .svn. When this flag + is specified, the relative path is calculated from the specified + directory. If the specified directory does not exist, this flag is + ignored. + + Examples: + Assuming that src/.git exists, the header guard CPP variables for + src/chrome/browser/ui/browser.h are: + + No flag => CHROME_BROWSER_UI_BROWSER_H_ + --root=chrome => BROWSER_UI_BROWSER_H_ + --root=chrome/browser => UI_BROWSER_H_ + + linelength=digits + This is the allowed line length for the project. The default value is + 80 characters. + + Examples: + --linelength=120 + + extensions=extension,extension,... + The allowed file extensions that cpplint will check. + + Examples: + --extensions=hpp,cpp +""" + +# We categorize each error message we print. Here are the categories. +# We want an explicit list so we can list them all in cpplint --filter=. +# If you add a new error message with a new category, add it to the list +# here! cpplint_unittest.py should tell you if you forget to do this. +_ERROR_CATEGORIES = [ + 'build/class', + 'build/deprecated', + 'build/endif_comment', + 'build/explicit_make_pair', + 'build/forward_decl', + 'build/header_guard', + 'build/include', + 'build/include_alpha', + 'build/include_order', + 'build/include_what_you_use', + 'build/namespaces', + 'build/printf_format', + 'build/storage_class', + 'legal/copyright', + 'readability/alt_tokens', + 'readability/braces', + 'readability/casting', + 'readability/check', + 'readability/constructors', + 'readability/fn_size', + 'readability/function', + 'readability/multiline_comment', + 'readability/multiline_string', + 'readability/namespace', + 'readability/nolint', + 'readability/nul', + 'readability/streams', + 'readability/todo', + 'readability/utf8', + 'runtime/arrays', + 'runtime/casting', + 'runtime/explicit', + 'runtime/int', + 'runtime/init', + 'runtime/invalid_increment', + 'runtime/member_string_references', + 'runtime/memset', + 'runtime/operator', + 'runtime/printf', + 'runtime/printf_format', + 'runtime/references', + 'runtime/sizeof', + 'runtime/string', + 'runtime/threadsafe_fn', + 'runtime/vlog', + 'whitespace/blank_line', + 'whitespace/braces', + 'whitespace/comma', + 'whitespace/comments', + 'whitespace/empty_conditional_body', + 'whitespace/empty_loop_body', + 'whitespace/end_of_line', + 'whitespace/ending_newline', + 'whitespace/forcolon', + 'whitespace/indent', + 'whitespace/line_length', + 'whitespace/newline', + 'whitespace/operators', + 'whitespace/parens', + 'whitespace/semicolon', + 'whitespace/tab', + 'whitespace/todo' + ] + +# The default state of the category filter. This is overridden by the --filter= +# flag. By default all errors are on, so only add here categories that should be +# off by default (i.e., categories that must be enabled by the --filter= flags). +# All entries here should start with a '-' or '+', as in the --filter= flag. +_DEFAULT_FILTERS = ['-build/include_alpha'] + +# We used to check for high-bit characters, but after much discussion we +# decided those were OK, as long as they were in UTF-8 and didn't represent +# hard-coded international strings, which belong in a separate i18n file.
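+
+# A minimal sketch (an illustrative helper, not used by the linter itself) of
+# how the +/- category filters described above are applied, left to right:
+def _example_is_filtered(category, filters):
+  """Returns True if errors in `category` should be suppressed."""
+  is_filtered = False
+  for one_filter in filters:
+    if one_filter.startswith('-') and category.startswith(one_filter[1:]):
+      is_filtered = True   # '-FOO' stops printing categories starting with FOO
+    elif one_filter.startswith('+') and category.startswith(one_filter[1:]):
+      is_filtered = False  # '+FOO' resumes printing them
+  return is_filtered
+
+# For example, _example_is_filtered('whitespace/indent',
+# ['-whitespace', '+whitespace/braces']) returns True.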
+ + +# C++ headers +_CPP_HEADERS = frozenset([ + # Legacy + 'algobase.h', + 'algo.h', + 'alloc.h', + 'builtinbuf.h', + 'bvector.h', + 'complex.h', + 'defalloc.h', + 'deque.h', + 'editbuf.h', + 'fstream.h', + 'function.h', + 'hash_map', + 'hash_map.h', + 'hash_set', + 'hash_set.h', + 'hashtable.h', + 'heap.h', + 'indstream.h', + 'iomanip.h', + 'iostream.h', + 'istream.h', + 'iterator.h', + 'list.h', + 'map.h', + 'multimap.h', + 'multiset.h', + 'ostream.h', + 'pair.h', + 'parsestream.h', + 'pfstream.h', + 'procbuf.h', + 'pthread_alloc', + 'pthread_alloc.h', + 'rope', + 'rope.h', + 'ropeimpl.h', + 'set.h', + 'slist', + 'slist.h', + 'stack.h', + 'stdiostream.h', + 'stl_alloc.h', + 'stl_relops.h', + 'streambuf.h', + 'stream.h', + 'strfile.h', + 'strstream.h', + 'tempbuf.h', + 'tree.h', + 'type_traits.h', + 'vector.h', + # 17.6.1.2 C++ library headers + 'algorithm', + 'array', + 'atomic', + 'bitset', + 'chrono', + 'codecvt', + 'complex', + 'condition_variable', + 'deque', + 'exception', + 'forward_list', + 'fstream', + 'functional', + 'future', + 'initializer_list', + 'iomanip', + 'ios', + 'iosfwd', + 'iostream', + 'istream', + 'iterator', + 'limits', + 'list', + 'locale', + 'map', + 'memory', + 'mutex', + 'new', + 'numeric', + 'ostream', + 'queue', + 'random', + 'ratio', + 'regex', + 'set', + 'sstream', + 'stack', + 'stdexcept', + 'streambuf', + 'string', + 'strstream', + 'system_error', + 'thread', + 'tuple', + 'typeindex', + 'typeinfo', + 'type_traits', + 'unordered_map', + 'unordered_set', + 'utility', + 'valarray', + 'vector', + # 17.6.1.2 C++ headers for C library facilities + 'cassert', + 'ccomplex', + 'cctype', + 'cerrno', + 'cfenv', + 'cfloat', + 'cinttypes', + 'ciso646', + 'climits', + 'clocale', + 'cmath', + 'csetjmp', + 'csignal', + 'cstdalign', + 'cstdarg', + 'cstdbool', + 'cstddef', + 'cstdint', + 'cstdio', + 'cstdlib', + 'cstring', + 'ctgmath', + 'ctime', + 'cuchar', + 'cwchar', + 'cwctype', + ]) + +# Assertion macros. These are defined in base/logging.h and +# testing/base/gunit.h. Note that the _M versions need to come first +# for substring matching to work. +_CHECK_MACROS = [ + 'DCHECK', 'CHECK', + 'EXPECT_TRUE_M', 'EXPECT_TRUE', + 'ASSERT_TRUE_M', 'ASSERT_TRUE', + 'EXPECT_FALSE_M', 'EXPECT_FALSE', + 'ASSERT_FALSE_M', 'ASSERT_FALSE', + ] + +# Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE +_CHECK_REPLACEMENT = dict([(m, {}) for m in _CHECK_MACROS]) + +for op, replacement in [('==', 'EQ'), ('!=', 'NE'), + ('>=', 'GE'), ('>', 'GT'), + ('<=', 'LE'), ('<', 'LT')]: + _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement + _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement + _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement + _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement + _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement + _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement + +for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'), + ('>=', 'LT'), ('>', 'LE'), + ('<=', 'GT'), ('<', 'GE')]: + _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement + _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement + _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement + _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement + +# Alternative tokens and their replacements. For full list, see section 2.5 +# Alternative tokens [lex.digraph] in the C++ standard. 
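+# For example, "if (x and not y)" would be flagged with the suggested
+# replacement "if (x && !y)" (an illustrative case of the table below).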
+# +# Digraphs (such as '%:') are not included here since it's a mess to +# match those on a word boundary. +_ALT_TOKEN_REPLACEMENT = { + 'and': '&&', + 'bitor': '|', + 'or': '||', + 'xor': '^', + 'compl': '~', + 'bitand': '&', + 'and_eq': '&=', + 'or_eq': '|=', + 'xor_eq': '^=', + 'not': '!', + 'not_eq': '!=' + } + +# Compile regular expression that matches all the above keywords. The "[ =()]" +# bit is meant to avoid matching these keywords outside of boolean expressions. +# +# False positives include C-style multi-line comments and multi-line strings +# but those have always been troublesome for cpplint. +_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile( + r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)') + + +# These constants define types of headers for use with +# _IncludeState.CheckNextIncludeOrder(). +_C_SYS_HEADER = 1 +_CPP_SYS_HEADER = 2 +_LIKELY_MY_HEADER = 3 +_POSSIBLE_MY_HEADER = 4 +_OTHER_HEADER = 5 + +# These constants define the current inline assembly state +_NO_ASM = 0 # Outside of inline assembly block +_INSIDE_ASM = 1 # Inside inline assembly block +_END_ASM = 2 # Last line of inline assembly block +_BLOCK_ASM = 3 # The whole block is an inline assembly block + +# Match start of assembly blocks +_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)' + r'(?:\s+(volatile|__volatile__))?' + r'\s*[{(]') + + +_regexp_compile_cache = {} + +# Finds occurrences of NOLINT or NOLINT(...). +_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?') + +# {str, set(int)}: a map from error categories to sets of linenumbers +# on which those errors are expected and should be suppressed. +_error_suppressions = {} + +# The root directory used for deriving header guard CPP variable. +# This is set by --root flag. +_root = None + +# The allowed line length of files. +# This is set by --linelength flag. +_line_length = 80 + +# The allowed extensions for file names +# This is set by --extensions flag. +_valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh']) + +def ParseNolintSuppressions(filename, raw_line, linenum, error): + """Updates the global list of error-suppressions. + + Parses any NOLINT comments on the current line, updating the global + error_suppressions store. Reports an error if the NOLINT comment + was malformed. + + Args: + filename: str, the name of the input file. + raw_line: str, the line of input text, with comments. + linenum: int, the number of the current line. + error: function, an error handler. + """ + # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*). + matched = _RE_SUPPRESSION.search(raw_line) + if matched: + category = matched.group(1) + if category in (None, '(*)'): # => "suppress all" + _error_suppressions.setdefault(None, set()).add(linenum) + else: + if category.startswith('(') and category.endswith(')'): + category = category[1:-1] + if category in _ERROR_CATEGORIES: + _error_suppressions.setdefault(category, set()).add(linenum) + else: + error(filename, linenum, 'readability/nolint', 5, + 'Unknown NOLINT error category: %s' % category) + + +def ResetNolintSuppressions(): + "Resets the set of NOLINT suppressions to empty." + _error_suppressions.clear() + + +def IsErrorSuppressedByNolint(category, linenum): + """Returns true if the specified error category is suppressed on this line. + + Consults the global error_suppressions map populated by + ParseNolintSuppressions/ResetNolintSuppressions. + + Args: + category: str, the category of the error. + linenum: int, the current line number. 
+ Returns: + bool, True iff the error should be suppressed due to a NOLINT comment. + """ + return (linenum in _error_suppressions.get(category, set()) or + linenum in _error_suppressions.get(None, set())) + +def Match(pattern, s): + """Matches the string with the pattern, caching the compiled regexp.""" + # The regexp compilation caching is inlined in both Match and Search for + # performance reasons; factoring it out into a separate function turns out + # to be noticeably expensive. + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].match(s) + + +def ReplaceAll(pattern, rep, s): + """Replaces instances of pattern in a string with a replacement. + + The compiled regex is kept in a cache shared by Match and Search. + + Args: + pattern: regex pattern + rep: replacement text + s: search string + + Returns: + string with replacements made (or original string if no replacements) + """ + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].sub(rep, s) + + +def Search(pattern, s): + """Searches the string for the pattern, caching the compiled regexp.""" + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].search(s) + + +class _IncludeState(dict): + """Tracks line numbers for includes, and the order in which includes appear. + + As a dict, an _IncludeState object serves as a mapping between include + filename and line number on which that file was included. + + Call CheckNextIncludeOrder() once for each header in the file, passing + in the type constants defined above. Calls in an illegal order will + raise an _IncludeError with an appropriate error message. + + """ + # self._section will move monotonically through this set. If it ever + # needs to move backwards, CheckNextIncludeOrder will raise an error. + _INITIAL_SECTION = 0 + _MY_H_SECTION = 1 + _C_SECTION = 2 + _CPP_SECTION = 3 + _OTHER_H_SECTION = 4 + + _TYPE_NAMES = { + _C_SYS_HEADER: 'C system header', + _CPP_SYS_HEADER: 'C++ system header', + _LIKELY_MY_HEADER: 'header this file implements', + _POSSIBLE_MY_HEADER: 'header this file may implement', + _OTHER_HEADER: 'other header', + } + _SECTION_NAMES = { + _INITIAL_SECTION: "... nothing. (This can't be an error.)", + _MY_H_SECTION: 'a header this file implements', + _C_SECTION: 'C system header', + _CPP_SECTION: 'C++ system header', + _OTHER_H_SECTION: 'other header', + } + + def __init__(self): + dict.__init__(self) + self.ResetSection() + + def ResetSection(self): + # The name of the current section. + self._section = self._INITIAL_SECTION + # The path of last found header. + self._last_header = '' + + def SetLastHeader(self, header_path): + self._last_header = header_path + + def CanonicalizeAlphabeticalOrder(self, header_path): + """Returns a path canonicalized for alphabetical comparison. + + - replaces "-" with "_" so they both cmp the same. + - removes '-inl' since we don't require them to be after the main header. + - lowercase everything, just in case. + + Args: + header_path: Path to be canonicalized. + + Returns: + Canonicalized path. + """ + return header_path.replace('-inl.h', '.h').replace('-', '_').lower() + + def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path): + """Check if a header is in alphabetical order with the previous header. 
+ + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + header_path: Canonicalized header to be checked. + + Returns: + Returns true if the header is in alphabetical order. + """ + # If previous section is different from current section, _last_header will + # be reset to empty string, so it's always less than current header. + # + # If previous line was a blank line, assume that the headers are + # intentionally sorted the way they are. + if (self._last_header > header_path and + not Match(r'^\s*$', clean_lines.elided[linenum - 1])): + return False + return True + + def CheckNextIncludeOrder(self, header_type): + """Returns a non-empty error message if the next header is out of order. + + This function also updates the internal state to be ready to check + the next include. + + Args: + header_type: One of the _XXX_HEADER constants defined above. + + Returns: + The empty string if the header is in the right order, or an + error message describing what's wrong. + + """ + error_message = ('Found %s after %s' % + (self._TYPE_NAMES[header_type], + self._SECTION_NAMES[self._section])) + + last_section = self._section + + if header_type == _C_SYS_HEADER: + if self._section <= self._C_SECTION: + self._section = self._C_SECTION + else: + self._last_header = '' + return error_message + elif header_type == _CPP_SYS_HEADER: + if self._section <= self._CPP_SECTION: + self._section = self._CPP_SECTION + else: + self._last_header = '' + return error_message + elif header_type == _LIKELY_MY_HEADER: + if self._section <= self._MY_H_SECTION: + self._section = self._MY_H_SECTION + else: + self._section = self._OTHER_H_SECTION + elif header_type == _POSSIBLE_MY_HEADER: + if self._section <= self._MY_H_SECTION: + self._section = self._MY_H_SECTION + else: + # This will always be the fallback because we're not sure + # enough that the header is associated with this file. + self._section = self._OTHER_H_SECTION + else: + assert header_type == _OTHER_HEADER + self._section = self._OTHER_H_SECTION + + if last_section != self._section: + self._last_header = '' + + return '' + + +class _CppLintState(object): + """Maintains module-wide state..""" + + def __init__(self): + self.verbose_level = 1 # global setting. + self.error_count = 0 # global count of reported errors + # filters to apply when emitting error messages + self.filters = _DEFAULT_FILTERS[:] + self.counting = 'total' # In what way are we counting errors? + self.errors_by_category = {} # string to int dict storing error counts + + # output format: + # "emacs" - format that emacs can parse (default) + # "vs7" - format that Microsoft Visual Studio 7 can parse + self.output_format = 'emacs' + + def SetOutputFormat(self, output_format): + """Sets the output format for errors.""" + self.output_format = output_format + + def SetVerboseLevel(self, level): + """Sets the module's verbosity, and returns the previous setting.""" + last_verbose_level = self.verbose_level + self.verbose_level = level + return last_verbose_level + + def SetCountingStyle(self, counting_style): + """Sets the module's counting options.""" + self.counting = counting_style + + def SetFilters(self, filters): + """Sets the error-message filters. + + These filters are applied when deciding whether to emit a given + error message. + + Args: + filters: A string of comma-separated filters (eg "+whitespace/indent"). + Each filter should start with + or -; else we die. 
+ + Raises: + ValueError: The comma-separated filters did not all start with '+' or '-'. + E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter" + """ + # Default filters always have less priority than the flag ones. + self.filters = _DEFAULT_FILTERS[:] + for filt in filters.split(','): + clean_filt = filt.strip() + if clean_filt: + self.filters.append(clean_filt) + for filt in self.filters: + if not (filt.startswith('+') or filt.startswith('-')): + raise ValueError('Every filter in --filters must start with + or -' + ' (%s does not)' % filt) + + def ResetErrorCounts(self): + """Sets the module's error statistic back to zero.""" + self.error_count = 0 + self.errors_by_category = {} + + def IncrementErrorCount(self, category): + """Bumps the module's error statistic.""" + self.error_count += 1 + if self.counting in ('toplevel', 'detailed'): + if self.counting != 'detailed': + category = category.split('/')[0] + if category not in self.errors_by_category: + self.errors_by_category[category] = 0 + self.errors_by_category[category] += 1 + + def PrintErrorCounts(self): + """Print a summary of errors by category, and the total.""" + for category, count in self.errors_by_category.iteritems(): + sys.stderr.write('Category \'%s\' errors found: %d\n' % + (category, count)) + sys.stderr.write('Total errors found: %d\n' % self.error_count) + +_cpplint_state = _CppLintState() + + +def _OutputFormat(): + """Gets the module's output format.""" + return _cpplint_state.output_format + + +def _SetOutputFormat(output_format): + """Sets the module's output format.""" + _cpplint_state.SetOutputFormat(output_format) + + +def _VerboseLevel(): + """Returns the module's verbosity setting.""" + return _cpplint_state.verbose_level + + +def _SetVerboseLevel(level): + """Sets the module's verbosity, and returns the previous setting.""" + return _cpplint_state.SetVerboseLevel(level) + + +def _SetCountingStyle(level): + """Sets the module's counting options.""" + _cpplint_state.SetCountingStyle(level) + + +def _Filters(): + """Returns the module's list of output filters, as a list.""" + return _cpplint_state.filters + + +def _SetFilters(filters): + """Sets the module's error-message filters. + + These filters are applied when deciding whether to emit a given + error message. + + Args: + filters: A string of comma-separated filters (eg "whitespace/indent"). + Each filter should start with + or -; else we die. + """ + _cpplint_state.SetFilters(filters) + + +class _FunctionState(object): + """Tracks current function name and the number of lines in its body.""" + + _NORMAL_TRIGGER = 250 # for --v=0, 500 for --v=1, etc. + _TEST_TRIGGER = 400 # about 50% more than _NORMAL_TRIGGER. + + def __init__(self): + self.in_a_function = False + self.lines_in_function = 0 + self.current_function = '' + + def Begin(self, function_name): + """Start analyzing function body. + + Args: + function_name: The name of the function being tracked. + """ + self.in_a_function = True + self.lines_in_function = 0 + self.current_function = function_name + + def Count(self): + """Count line in current function body.""" + if self.in_a_function: + self.lines_in_function += 1 + + def Check(self, error, filename, linenum): + """Report if too many lines in function body. + + Args: + error: The function to call with any errors found. + filename: The name of the current file. + linenum: The number of the line to check. 
+ """ + if Match(r'T(EST|est)', self.current_function): + base_trigger = self._TEST_TRIGGER + else: + base_trigger = self._NORMAL_TRIGGER + trigger = base_trigger * 2**_VerboseLevel() + + if self.lines_in_function > trigger: + error_level = int(math.log(self.lines_in_function / base_trigger, 2)) + # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ... + if error_level > 5: + error_level = 5 + error(filename, linenum, 'readability/fn_size', error_level, + 'Small and focused functions are preferred:' + ' %s has %d non-comment lines' + ' (error triggered by exceeding %d lines).' % ( + self.current_function, self.lines_in_function, trigger)) + + def End(self): + """Stop analyzing function body.""" + self.in_a_function = False + + +class _IncludeError(Exception): + """Indicates a problem with the include order in a file.""" + pass + + +class FileInfo: + """Provides utility functions for filenames. + + FileInfo provides easy access to the components of a file's path + relative to the project root. + """ + + def __init__(self, filename): + self._filename = filename + + def FullName(self): + """Make Windows paths like Unix.""" + return os.path.abspath(self._filename).replace('\\', '/') + + def RepositoryName(self): + """FullName after removing the local path to the repository. + + If we have a real absolute path name here we can try to do something smart: + detecting the root of the checkout and truncating /path/to/checkout from + the name so that we get header guards that don't include things like + "C:\Documents and Settings\..." or "/home/username/..." in them and thus + people on different computers who have checked the source out to different + locations won't see bogus errors. + """ + fullname = self.FullName() + + if os.path.exists(fullname): + project_dir = os.path.dirname(fullname) + + if os.path.exists(os.path.join(project_dir, ".svn")): + # If there's a .svn file in the current directory, we recursively look + # up the directory tree for the top of the SVN checkout + root_dir = project_dir + one_up_dir = os.path.dirname(root_dir) + while os.path.exists(os.path.join(one_up_dir, ".svn")): + root_dir = os.path.dirname(root_dir) + one_up_dir = os.path.dirname(one_up_dir) + + prefix = os.path.commonprefix([root_dir, project_dir]) + return fullname[len(prefix) + 1:] + + # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by + # searching up from the current path. + root_dir = os.path.dirname(fullname) + while (root_dir != os.path.dirname(root_dir) and + not os.path.exists(os.path.join(root_dir, ".git")) and + not os.path.exists(os.path.join(root_dir, ".hg")) and + not os.path.exists(os.path.join(root_dir, ".svn"))): + root_dir = os.path.dirname(root_dir) + + if (os.path.exists(os.path.join(root_dir, ".git")) or + os.path.exists(os.path.join(root_dir, ".hg")) or + os.path.exists(os.path.join(root_dir, ".svn"))): + prefix = os.path.commonprefix([root_dir, project_dir]) + return fullname[len(prefix) + 1:] + + # Don't know what to do; header guard warnings may be wrong... + return fullname + + def Split(self): + """Splits the file into the directory, basename, and extension. + + For 'chrome/browser/browser.cc', Split() would + return ('chrome/browser', 'browser', '.cc') + + Returns: + A tuple of (directory, basename, extension). 
+ """ + + googlename = self.RepositoryName() + project, rest = os.path.split(googlename) + return (project,) + os.path.splitext(rest) + + def BaseName(self): + """File base name - text after the final slash, before the final period.""" + return self.Split()[1] + + def Extension(self): + """File extension - text following the final period.""" + return self.Split()[2] + + def NoExtension(self): + """File has no source file extension.""" + return '/'.join(self.Split()[0:2]) + + def IsSource(self): + """File has a source file extension.""" + return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx') + + +def _ShouldPrintError(category, confidence, linenum): + """If confidence >= verbose, category passes filter and is not suppressed.""" + + # There are three ways we might decide not to print an error message: + # a "NOLINT(category)" comment appears in the source, + # the verbosity level isn't high enough, or the filters filter it out. + if IsErrorSuppressedByNolint(category, linenum): + return False + if confidence < _cpplint_state.verbose_level: + return False + + is_filtered = False + for one_filter in _Filters(): + if one_filter.startswith('-'): + if category.startswith(one_filter[1:]): + is_filtered = True + elif one_filter.startswith('+'): + if category.startswith(one_filter[1:]): + is_filtered = False + else: + assert False # should have been checked for in SetFilter. + if is_filtered: + return False + + return True + + +def Error(filename, linenum, category, confidence, message): + """Logs the fact we've found a lint error. + + We log where the error was found, and also our confidence in the error, + that is, how certain we are this is a legitimate style regression, and + not a misidentification or a use that's sometimes justified. + + False positives can be suppressed by the use of + "cpplint(category)" comments on the offending line. These are + parsed into _error_suppressions. + + Args: + filename: The name of the file containing the error. + linenum: The number of the line containing the error. + category: A string used to describe the "category" this bug + falls under: "whitespace", say, or "runtime". Categories + may have a hierarchy separated by slashes: "whitespace/indent". + confidence: A number from 1-5 representing a confidence score for + the error, with 5 meaning that we are certain of the problem, + and 1 meaning that it could be a legitimate construct. + message: The error message. + """ + if _ShouldPrintError(category, confidence, linenum): + _cpplint_state.IncrementErrorCount(category) + if _cpplint_state.output_format == 'vs7': + sys.stderr.write('%s(%s): %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + elif _cpplint_state.output_format == 'eclipse': + sys.stderr.write('%s:%s: warning: %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + else: + sys.stderr.write('%s:%s: %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + + +# Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard. +_RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile( + r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)') +# Matches strings. Escape codes should already be removed by ESCAPES. +_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"') +# Matches characters. Escape codes should already be removed by ESCAPES. +_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'") +# Matches multi-line C++ comments. 
+# This RE is a little bit more complicated than one might expect, because we
+# have to take care of tools that remove spaces, so that we can handle
+# comments inside statements better.
+# The current rule is: We only clear spaces from both sides when we're at the
+# end of the line. Otherwise, we try to remove spaces from the right side,
+# if this doesn't work we try on left side but only if there's a non-character
+# on the right.
+_RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile(
+    r"""(\s*/\*.*\*/\s*$|
+            /\*.*\*/\s+|
+         \s+/\*.*\*/(?=\W)|
+            /\*.*\*/)""", re.VERBOSE)
+
+
+def IsCppString(line):
+  """Does line terminate such that the next symbol is in a string constant?
+
+  This function does not consider single-line nor multi-line comments.
+
+  Args:
+    line: a partial line of code, from position 0 through n.
+
+  Returns:
+    True, if the next character appended to 'line' would be inside a
+    string constant.
+  """
+
+  line = line.replace(r'\\', 'XX')  # after this, \\" does not match to \"
+  return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1
+
+
+def CleanseRawStrings(raw_lines):
+  """Removes C++11 raw strings from lines.
+
+    Before:
+      static const char kData[] = R"(
+          multi-line string
+          )";
+
+    After:
+      static const char kData[] = ""
+          (replaced by blank line)
+          "";
+
+  Args:
+    raw_lines: list of raw lines.
+
+  Returns:
+    list of lines with C++11 raw strings replaced by empty strings.
+  """
+
+  delimiter = None
+  lines_without_raw_strings = []
+  for line in raw_lines:
+    if delimiter:
+      # Inside a raw string, look for the end
+      end = line.find(delimiter)
+      if end >= 0:
+        # Found the end of the string, match leading space for this
+        # line and resume copying the original lines, and also insert
+        # a "" on the last line.
+        leading_space = Match(r'^(\s*)\S', line)
+        line = leading_space.group(1) + '""' + line[end + len(delimiter):]
+        delimiter = None
+      else:
+        # Haven't found the end yet, append a blank line.
+        line = ''
+
+    else:
+      # Look for beginning of a raw string.
+      # See 2.14.15 [lex.string] for syntax.
+      matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
+      if matched:
+        delimiter = ')' + matched.group(2) + '"'
+
+        end = matched.group(3).find(delimiter)
+        if end >= 0:
+          # Raw string ended on same line
+          line = (matched.group(1) + '""' +
+                  matched.group(3)[end + len(delimiter):])
+          delimiter = None
+        else:
+          # Start of a multi-line raw string
+          line = matched.group(1) + '""'
+
+    lines_without_raw_strings.append(line)
+
+  # TODO(unknown): if delimiter is not None here, we might want to
+  # emit a warning for unterminated string.
+  return lines_without_raw_strings
+
+
+def FindNextMultiLineCommentStart(lines, lineix):
+  """Find the beginning marker for a multiline comment."""
+  while lineix < len(lines):
+    if lines[lineix].strip().startswith('/*'):
+      # Only return this marker if the comment goes beyond this line
+      if lines[lineix].strip().find('*/', 2) < 0:
+        return lineix
+    lineix += 1
+  return len(lines)
+
+
+def FindNextMultiLineCommentEnd(lines, lineix):
+  """We are inside a comment, find the end marker."""
+  while lineix < len(lines):
+    if lines[lineix].strip().endswith('*/'):
+      return lineix
+    lineix += 1
+  return len(lines)
+
+
+def RemoveMultiLineCommentsFromRange(lines, begin, end):
+  """Clears a range of lines for multi-line comments."""
+  # Having // dummy comments makes the lines non-empty, so we will not get
+  # unnecessary blank line warnings later in the code.
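+  # For example (illustrative): a comment spanning three input lines,
+  #   /* a
+  #      b
+  #      c */
+  # becomes three '// dummy' lines, so later checks keep the original
+  # line numbering while no longer seeing the comment text.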
+  for i in range(begin, end):
+    lines[i] = '// dummy'
+
+
+def RemoveMultiLineComments(filename, lines, error):
+  """Removes multiline (c-style) comments from lines."""
+  lineix = 0
+  while lineix < len(lines):
+    lineix_begin = FindNextMultiLineCommentStart(lines, lineix)
+    if lineix_begin >= len(lines):
+      return
+    lineix_end = FindNextMultiLineCommentEnd(lines, lineix_begin)
+    if lineix_end >= len(lines):
+      error(filename, lineix_begin + 1, 'readability/multiline_comment', 5,
+            'Could not find end of multi-line comment')
+      return
+    RemoveMultiLineCommentsFromRange(lines, lineix_begin, lineix_end + 1)
+    lineix = lineix_end + 1
+
+
+def CleanseComments(line):
+  """Removes //-comments and single-line C-style /* */ comments.
+
+  Args:
+    line: A line of C++ source.
+
+  Returns:
+    The line with single-line comments removed.
+  """
+  commentpos = line.find('//')
+  if commentpos != -1 and not IsCppString(line[:commentpos]):
+    line = line[:commentpos].rstrip()
+  # get rid of /* ... */
+  return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line)
+
+
+class CleansedLines(object):
+  """Holds 3 copies of all lines with different preprocessing applied to them.
+
+  1) elided member contains lines without strings and comments,
+  2) lines member contains lines without comments, and
+  3) raw_lines member contains all the lines without processing.
+  All these three members are of <type 'list'>, and of the same length.
+  """
+
+  def __init__(self, lines):
+    self.elided = []
+    self.lines = []
+    self.raw_lines = lines
+    self.num_lines = len(lines)
+    self.lines_without_raw_strings = CleanseRawStrings(lines)
+    for linenum in range(len(self.lines_without_raw_strings)):
+      self.lines.append(CleanseComments(
+          self.lines_without_raw_strings[linenum]))
+      elided = self._CollapseStrings(self.lines_without_raw_strings[linenum])
+      self.elided.append(CleanseComments(elided))
+
+  def NumLines(self):
+    """Returns the number of lines represented."""
+    return self.num_lines
+
+  @staticmethod
+  def _CollapseStrings(elided):
+    """Collapses strings and chars on a line to simple "" or '' blocks.
+
+    We nix strings first so we're not fooled by text like '"http://"'
+
+    Args:
+      elided: The line being processed.
+
+    Returns:
+      The line with collapsed strings.
+    """
+    if not _RE_PATTERN_INCLUDE.match(elided):
+      # Remove escaped characters first to make quote/single quote collapsing
+      # basic. Things that look like escaped characters shouldn't occur
+      # outside of strings and chars.
+      elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
+      elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided)
+      elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided)
+    return elided
+
+
+def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar):
+  """Find the position just after the matching endchar.
+
+  Args:
+    line: a CleansedLines line.
+    startpos: start searching at this position.
+    depth: nesting level at startpos.
+    startchar: expression opening character.
+    endchar: expression closing character.
+
+  Returns:
+    On finding matching endchar: (index just after matching endchar, 0)
+    Otherwise: (-1, new depth at end of this line)
+  """
+  for i in xrange(startpos, len(line)):
+    if line[i] == startchar:
+      depth += 1
+    elif line[i] == endchar:
+      depth -= 1
+      if depth == 0:
+        return (i + 1, 0)
+  return (-1, depth)
+
+
+def CloseExpression(clean_lines, linenum, pos):
+  """If input points to ( or { or [ or <, finds the position that closes it.
+ + If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the + linenum/pos that correspond to the closing of the expression. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + pos: A position on the line. + + Returns: + A tuple (line, linenum, pos) pointer *past* the closing brace, or + (line, len(lines), -1) if we never find a close. Note we ignore + strings and comments when matching; and the line we return is the + 'cleansed' line at linenum. + """ + + line = clean_lines.elided[linenum] + startchar = line[pos] + if startchar not in '({[<': + return (line, clean_lines.NumLines(), -1) + if startchar == '(': endchar = ')' + if startchar == '[': endchar = ']' + if startchar == '{': endchar = '}' + if startchar == '<': endchar = '>' + + # Check first line + (end_pos, num_open) = FindEndOfExpressionInLine( + line, pos, 0, startchar, endchar) + if end_pos > -1: + return (line, linenum, end_pos) + + # Continue scanning forward + while linenum < clean_lines.NumLines() - 1: + linenum += 1 + line = clean_lines.elided[linenum] + (end_pos, num_open) = FindEndOfExpressionInLine( + line, 0, num_open, startchar, endchar) + if end_pos > -1: + return (line, linenum, end_pos) + + # Did not find endchar before end of file, give up + return (line, clean_lines.NumLines(), -1) + + +def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar): + """Find position at the matching startchar. + + This is almost the reverse of FindEndOfExpressionInLine, but note + that the input position and returned position differs by 1. + + Args: + line: a CleansedLines line. + endpos: start searching at this position. + depth: nesting level at endpos. + startchar: expression opening character. + endchar: expression closing character. + + Returns: + On finding matching startchar: (index at matching startchar, 0) + Otherwise: (-1, new depth at beginning of this line) + """ + for i in xrange(endpos, -1, -1): + if line[i] == endchar: + depth += 1 + elif line[i] == startchar: + depth -= 1 + if depth == 0: + return (i, 0) + return (-1, depth) + + +def ReverseCloseExpression(clean_lines, linenum, pos): + """If input points to ) or } or ] or >, finds the position that opens it. + + If lines[linenum][pos] points to a ')' or '}' or ']' or '>', finds the + linenum/pos that correspond to the opening of the expression. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + pos: A position on the line. + + Returns: + A tuple (line, linenum, pos) pointer *at* the opening brace, or + (line, 0, -1) if we never find the matching opening brace. Note + we ignore strings and comments when matching; and the line we + return is the 'cleansed' line at linenum. 
+  """
+  line = clean_lines.elided[linenum]
+  endchar = line[pos]
+  if endchar not in ')}]>':
+    return (line, 0, -1)
+  if endchar == ')': startchar = '('
+  if endchar == ']': startchar = '['
+  if endchar == '}': startchar = '{'
+  if endchar == '>': startchar = '<'
+
+  # Check last line
+  (start_pos, num_open) = FindStartOfExpressionInLine(
+      line, pos, 0, startchar, endchar)
+  if start_pos > -1:
+    return (line, linenum, start_pos)
+
+  # Continue scanning backward
+  while linenum > 0:
+    linenum -= 1
+    line = clean_lines.elided[linenum]
+    (start_pos, num_open) = FindStartOfExpressionInLine(
+        line, len(line) - 1, num_open, startchar, endchar)
+    if start_pos > -1:
+      return (line, linenum, start_pos)
+
+  # Did not find startchar before beginning of file, give up
+  return (line, 0, -1)
+
+
+def CheckForCopyright(filename, lines, error):
+  """Logs an error if no Copyright message appears at the top of the file."""
+
+  # We'll say it should occur by line 10. Don't forget there's a
+  # dummy line at the front.
+  for line in xrange(1, min(len(lines), 11)):
+    if re.search(r'Copyright', lines[line], re.I): break
+  else:                       # means no copyright line was found
+    error(filename, 0, 'legal/copyright', 5,
+          'No copyright message found. '
+          'You should have a line: "Copyright [year] <Copyright Owner>"')
+
+
+def GetHeaderGuardCPPVariable(filename):
+  """Returns the CPP variable that should be used as a header guard.
+
+  Args:
+    filename: The name of a C++ header file.
+
+  Returns:
+    The CPP variable that should be used as a header guard in the
+    named file.
+
+  """
+
+  # Restores original filename in case that cpplint is invoked from Emacs's
+  # flymake.
+  filename = re.sub(r'_flymake\.h$', '.h', filename)
+  filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename)
+
+  fileinfo = FileInfo(filename)
+  file_path_from_root = fileinfo.RepositoryName()
+  if _root:
+    file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root)
+  return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
+
+
+def CheckForHeaderGuard(filename, lines, error):
+  """Checks that the file contains a header guard.
+
+  Logs an error if no #ifndef header guard is present. For other
+  headers, checks that the full pathname is used.
+
+  Args:
+    filename: The name of the C++ header file.
+    lines: An array of strings, each representing a line of the file.
+    error: The function to call with any errors found.
+  """
+
+  cppvar = GetHeaderGuardCPPVariable(filename)
+
+  ifndef = None
+  ifndef_linenum = 0
+  define = None
+  endif = None
+  endif_linenum = 0
+  for linenum, line in enumerate(lines):
+    linesplit = line.split()
+    if len(linesplit) >= 2:
+      # find the first occurrence of #ifndef and #define, save arg
+      if not ifndef and linesplit[0] == '#ifndef':
+        # set ifndef to the header guard presented on the #ifndef line.
+        ifndef = linesplit[1]
+        ifndef_linenum = linenum
+      if not define and linesplit[0] == '#define':
+        define = linesplit[1]
+    # find the last occurrence of #endif, save entire line
+    if line.startswith('#endif'):
+      endif = line
+      endif_linenum = linenum
+
+  if not ifndef:
+    error(filename, 0, 'build/header_guard', 5,
+          'No #ifndef header guard found, suggested CPP variable is: %s' %
+          cppvar)
+    return
+
+  if not define:
+    error(filename, 0, 'build/header_guard', 5,
+          'No #define header guard found, suggested CPP variable is: %s' %
+          cppvar)
+    return
+
+  # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__
+  # for backward compatibility.
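+  # Worked example (illustrative, hypothetical path): for "aom/aom_codec.h"
+  # with _root unset, GetHeaderGuardCPPVariable returns "AOM_AOM_CODEC_H_";
+  # that guard passes cleanly, and the legacy "AOM_AOM_CODEC_H__" form is
+  # only reported at error level 0.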
+ if ifndef != cppvar: + error_level = 0 + if ifndef != cppvar + '_': + error_level = 5 + + ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum, + error) + error(filename, ifndef_linenum, 'build/header_guard', error_level, + '#ifndef header guard has wrong style, please use: %s' % cppvar) + + if define != ifndef: + error(filename, 0, 'build/header_guard', 5, + '#ifndef and #define don\'t match, suggested CPP variable is: %s' % + cppvar) + return + + if endif != ('#endif // %s' % cppvar): + error_level = 0 + if endif != ('#endif // %s' % (cppvar + '_')): + error_level = 5 + + ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum, + error) + error(filename, endif_linenum, 'build/header_guard', error_level, + '#endif line should be "#endif // %s"' % cppvar) + + +def CheckForBadCharacters(filename, lines, error): + """Logs an error for each line containing bad characters. + + Two kinds of bad characters: + + 1. Unicode replacement characters: These indicate that either the file + contained invalid UTF-8 (likely) or Unicode replacement characters (which + it shouldn't). Note that it's possible for this to throw off line + numbering if the invalid UTF-8 occurred adjacent to a newline. + + 2. NUL bytes. These are problematic for some tools. + + Args: + filename: The name of the current file. + lines: An array of strings, each representing a line of the file. + error: The function to call with any errors found. + """ + for linenum, line in enumerate(lines): + if u'\ufffd' in line: + error(filename, linenum, 'readability/utf8', 5, + 'Line contains invalid UTF-8 (or Unicode replacement character).') + if '\0' in line: + error(filename, linenum, 'readability/nul', 5, 'Line contains NUL byte.') + + +def CheckForNewlineAtEOF(filename, lines, error): + """Logs an error if there is no newline char at the end of the file. + + Args: + filename: The name of the current file. + lines: An array of strings, each representing a line of the file. + error: The function to call with any errors found. + """ + + # The array lines() was created by adding two newlines to the + # original file (go figure), then splitting on \n. + # To verify that the file ends in \n, we just have to make sure the + # last-but-two element of lines() exists and is empty. + if len(lines) < 3 or lines[-2]: + error(filename, len(lines) - 2, 'whitespace/ending_newline', 5, + 'Could not find a newline character at the end of the file.') + + +def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error): + """Logs an error if we see /* ... */ or "..." that extend past one line. + + /* ... */ comments are legit inside macros, for one line. + Otherwise, we prefer // comments, so it's ok to warn about the + other. Likewise, it's ok for strings to extend across multiple + lines, as long as a line continuation character (backslash) + terminates each line. Although not currently prohibited by the C++ + style guide, it's ugly and unnecessary. We don't do well with either + in this lint program, so we warn about both. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Remove all \\ (escaped backslashes) from the line. They are OK, and the + # second (escaped) slash may trigger later \" detection erroneously. 
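+  # For instance (illustrative): in
+  #   const char *p = "ends with backslash\\";
+  # the escaped backslashes must go first, otherwise the trailing \\ would
+  # be miscounted as an escaped quote \" by the checks below.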
+ line = line.replace('\\\\', '') + + if line.count('/*') > line.count('*/'): + error(filename, linenum, 'readability/multiline_comment', 5, + 'Complex multi-line /*...*/-style comment found. ' + 'Lint may give bogus warnings. ' + 'Consider replacing these with //-style comments, ' + 'with #if 0...#endif, ' + 'or with more clearly structured multi-line comments.') + + if (line.count('"') - line.count('\\"')) % 2: + error(filename, linenum, 'readability/multiline_string', 5, + 'Multi-line string ("...") found. This lint script doesn\'t ' + 'do well with such strings, and may give bogus warnings. ' + 'Use C++11 raw strings or concatenation instead.') + + +threading_list = ( + ('asctime(', 'asctime_r('), + ('ctime(', 'ctime_r('), + ('getgrgid(', 'getgrgid_r('), + ('getgrnam(', 'getgrnam_r('), + ('getlogin(', 'getlogin_r('), + ('getpwnam(', 'getpwnam_r('), + ('getpwuid(', 'getpwuid_r('), + ('gmtime(', 'gmtime_r('), + ('localtime(', 'localtime_r('), + ('rand(', 'rand_r('), + ('strtok(', 'strtok_r('), + ('ttyname(', 'ttyname_r('), + ) + + +def CheckPosixThreading(filename, clean_lines, linenum, error): + """Checks for calls to thread-unsafe functions. + + Much code has been originally written without consideration of + multi-threading. Also, engineers are relying on their old experience; + they have learned posix before threading extensions were added. These + tests guide the engineers to use thread-safe functions (when using + posix directly). + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + for single_thread_function, multithread_safe_function in threading_list: + ix = line.find(single_thread_function) + # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison + if ix >= 0 and (ix == 0 or (not line[ix - 1].isalnum() and + line[ix - 1] not in ('_', '.', '>'))): + error(filename, linenum, 'runtime/threadsafe_fn', 2, + 'Consider using ' + multithread_safe_function + + '...) instead of ' + single_thread_function + + '...) for improved thread safety.') + + +def CheckVlogArguments(filename, clean_lines, linenum, error): + """Checks that VLOG() is only used for defining a logging level. + + For example, VLOG(2) is correct. VLOG(INFO), VLOG(WARNING), VLOG(ERROR), and + VLOG(FATAL) are not. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + if Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)', line): + error(filename, linenum, 'runtime/vlog', 5, + 'VLOG() should be used with numeric verbosity level. ' + 'Use LOG() if you want symbolic severity levels.') + + +# Matches invalid increment: *count++, which moves pointer instead of +# incrementing a value. +_RE_PATTERN_INVALID_INCREMENT = re.compile( + r'^\s*\*\w+(\+\+|--);') + + +def CheckInvalidIncrement(filename, clean_lines, linenum, error): + """Checks for invalid increment *count++. + + For example following function: + void increment_counter(int* count) { + *count++; + } + is invalid, because it effectively does count++, moving pointer, and should + be replaced with ++*count, (*count)++ or *count += 1. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. 
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+  if _RE_PATTERN_INVALID_INCREMENT.match(line):
+    error(filename, linenum, 'runtime/invalid_increment', 5,
+          'Changing pointer instead of value (or unused value of operator*).')
+
+
+class _BlockInfo(object):
+  """Stores information about a generic block of code."""
+
+  def __init__(self, seen_open_brace):
+    self.seen_open_brace = seen_open_brace
+    self.open_parentheses = 0
+    self.inline_asm = _NO_ASM
+
+  def CheckBegin(self, filename, clean_lines, linenum, error):
+    """Run checks that apply to text up to the opening brace.
+
+    This is mostly for checking the text after the class identifier
+    and the "{", usually where the base class is specified. For other
+    blocks, there isn't much to check, so we always pass.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    pass
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    """Run checks that apply to text after the closing brace.
+
+    This is mostly used for checking end of namespace comments.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    pass
+
+
+class _ClassInfo(_BlockInfo):
+  """Stores information about a class."""
+
+  def __init__(self, name, class_or_struct, clean_lines, linenum):
+    _BlockInfo.__init__(self, False)
+    self.name = name
+    self.starting_linenum = linenum
+    self.is_derived = False
+    if class_or_struct == 'struct':
+      self.access = 'public'
+      self.is_struct = True
+    else:
+      self.access = 'private'
+      self.is_struct = False
+
+    # Remember initial indentation level for this class. Using raw_lines here
+    # instead of elided to account for leading comments.
+    initial_indent = Match(r'^( *)\S', clean_lines.raw_lines[linenum])
+    if initial_indent:
+      self.class_indent = len(initial_indent.group(1))
+    else:
+      self.class_indent = 0
+
+    # Try to find the end of the class. This will be confused by things like:
+    #   class A {
+    #   } *x = { ...
+    #
+    # But it's still good enough for CheckSectionSpacing.
+    self.last_line = 0
+    depth = 0
+    for i in range(linenum, clean_lines.NumLines()):
+      line = clean_lines.elided[i]
+      depth += line.count('{') - line.count('}')
+      if not depth:
+        self.last_line = i
+        break
+
+  def CheckBegin(self, filename, clean_lines, linenum, error):
+    # Look for a bare ':'
+    if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]):
+      self.is_derived = True
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    # Check that closing brace is aligned with beginning of the class.
+    # Only do this if the closing brace is indented by only whitespaces.
+    # This means we will not check single-line class definitions.
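+    # Illustrative case: with class_indent == 2, a closing line '   };'
+    # (three leading spaces) triggers the warning below, while '  };' with
+    # two leading spaces passes.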
+    indent = Match(r'^( *)\}', clean_lines.elided[linenum])
+    if indent and len(indent.group(1)) != self.class_indent:
+      if self.is_struct:
+        parent = 'struct ' + self.name
+      else:
+        parent = 'class ' + self.name
+      error(filename, linenum, 'whitespace/indent', 3,
+            'Closing brace should be aligned with beginning of %s' % parent)
+
+
+class _NamespaceInfo(_BlockInfo):
+  """Stores information about a namespace."""
+
+  def __init__(self, name, linenum):
+    _BlockInfo.__init__(self, False)
+    self.name = name or ''
+    self.starting_linenum = linenum
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    """Check end of namespace comments."""
+    line = clean_lines.raw_lines[linenum]
+
+    # Check how many lines are enclosed in this namespace. Don't issue
+    # a warning for missing namespace comments if there aren't enough
+    # lines. However, do apply checks if there is already an end of
+    # namespace comment and it's incorrect.
+    #
+    # TODO(unknown): We always want to check end of namespace comments
+    # if a namespace is large, but sometimes we also want to apply the
+    # check if a short namespace contained nontrivial things (something
+    # other than forward declarations). There is currently no logic on
+    # deciding what these nontrivial things are, so this check is
+    # triggered by namespace size only, which works most of the time.
+    if (linenum - self.starting_linenum < 10
+        and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)):
+      return
+
+    # Look for matching comment at end of namespace.
+    #
+    # Note that we accept C style "/* */" comments for terminating
+    # namespaces, so that code that terminates namespaces inside
+    # preprocessor macros can be cpplint clean.
+    #
+    # We also accept stuff like "// end of namespace <name>." with the
+    # period at the end.
+    #
+    # Besides these, we don't accept anything else, otherwise we might
+    # get false negatives when existing comment is a substring of the
+    # expected namespace.
+    if self.name:
+      # Named namespace
+      if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) +
+                    r'[\*/\.\\\s]*$'),
+                   line):
+        error(filename, linenum, 'readability/namespace', 5,
+              'Namespace should be terminated with "// namespace %s"' %
+              self.name)
+    else:
+      # Anonymous namespace
+      if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
+        error(filename, linenum, 'readability/namespace', 5,
+              'Namespace should be terminated with "// namespace"')
+
+
+class _PreprocessorInfo(object):
+  """Stores checkpoints of nesting stacks when #if/#else is seen."""
+
+  def __init__(self, stack_before_if):
+    # The entire nesting stack before #if
+    self.stack_before_if = stack_before_if
+
+    # The entire nesting stack up to #else
+    self.stack_before_else = []
+
+    # Whether we have already seen #else or #elif
+    self.seen_else = False
+
+
+class _NestingState(object):
+  """Holds states related to parsing braces."""
+
+  def __init__(self):
+    # Stack for tracking all braces. An object is pushed whenever we
+    # see a "{", and popped when we see a "}". Only 3 types of
+    # objects are possible:
+    # - _ClassInfo: a class or struct.
+    # - _NamespaceInfo: a namespace.
+    # - _BlockInfo: some other type of block.
+    self.stack = []
+
+    # Stack of _PreprocessorInfo objects.
+    self.pp_stack = []
+
+  def SeenOpenBrace(self):
+    """Check if we have seen the opening brace for the innermost block.
+
+    Returns:
+      True if we have seen the opening brace, False if the innermost
+      block is still expecting an opening brace.
+ """ + return (not self.stack) or self.stack[-1].seen_open_brace + + def InNamespaceBody(self): + """Check if we are currently one level inside a namespace body. + + Returns: + True if top of the stack is a namespace block, False otherwise. + """ + return self.stack and isinstance(self.stack[-1], _NamespaceInfo) + + def UpdatePreprocessor(self, line): + """Update preprocessor stack. + + We need to handle preprocessors due to classes like this: + #ifdef SWIG + struct ResultDetailsPageElementExtensionPoint { + #else + struct ResultDetailsPageElementExtensionPoint : public Extension { + #endif + + We make the following assumptions (good enough for most files): + - Preprocessor condition evaluates to true from #if up to first + #else/#elif/#endif. + + - Preprocessor condition evaluates to false from #else/#elif up + to #endif. We still perform lint checks on these lines, but + these do not affect nesting stack. + + Args: + line: current line to check. + """ + if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line): + # Beginning of #if block, save the nesting stack here. The saved + # stack will allow us to restore the parsing state in the #else case. + self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack))) + elif Match(r'^\s*#\s*(else|elif)\b', line): + # Beginning of #else block + if self.pp_stack: + if not self.pp_stack[-1].seen_else: + # This is the first #else or #elif block. Remember the + # whole nesting stack up to this point. This is what we + # keep after the #endif. + self.pp_stack[-1].seen_else = True + self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack) + + # Restore the stack to how it was before the #if + self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if) + else: + # TODO(unknown): unexpected #else, issue warning? + pass + elif Match(r'^\s*#\s*endif\b', line): + # End of #if or #else blocks. + if self.pp_stack: + # If we saw an #else, we will need to restore the nesting + # stack to its former state before the #else, otherwise we + # will just continue from where we left off. + if self.pp_stack[-1].seen_else: + # Here we can just use a shallow copy since we are the last + # reference to it. + self.stack = self.pp_stack[-1].stack_before_else + # Drop the corresponding #if + self.pp_stack.pop() + else: + # TODO(unknown): unexpected #endif, issue warning? + pass + + def Update(self, filename, clean_lines, linenum, error): + """Update nesting state with current line. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Update pp_stack first + self.UpdatePreprocessor(line) + + # Count parentheses. This is to avoid adding struct arguments to + # the nesting stack. + if self.stack: + inner_block = self.stack[-1] + depth_change = line.count('(') - line.count(')') + inner_block.open_parentheses += depth_change + + # Also check if we are starting or ending an inline assembly block. + if inner_block.inline_asm in (_NO_ASM, _END_ASM): + if (depth_change != 0 and + inner_block.open_parentheses == 1 and + _MATCH_ASM.match(line)): + # Enter assembly block + inner_block.inline_asm = _INSIDE_ASM + else: + # Not entering assembly block. If previous line was _END_ASM, + # we will now shift to _NO_ASM state. 
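+          # State sketch (illustrative): _NO_ASM -> _INSIDE_ASM while an
+          # asm(...) is left unbalanced, _INSIDE_ASM -> _END_ASM once its
+          # parentheses close, then _END_ASM -> _NO_ASM right here.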
+          inner_block.inline_asm = _NO_ASM
+      elif (inner_block.inline_asm == _INSIDE_ASM and
+            inner_block.open_parentheses == 0):
+        # Exit assembly block
+        inner_block.inline_asm = _END_ASM
+
+    # Consume namespace declaration at the beginning of the line. Do
+    # this in a loop so that we catch same line declarations like this:
+    #   namespace proto2 { namespace bridge { class MessageSet; } }
+    while True:
+      # Match start of namespace. The "\b\s*" below catches namespace
+      # declarations even if it weren't followed by a whitespace, this
+      # is so that we don't confuse our namespace checker. The
+      # missing spaces will be flagged by CheckSpacing.
+      namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line)
+      if not namespace_decl_match:
+        break
+
+      new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum)
+      self.stack.append(new_namespace)
+
+      line = namespace_decl_match.group(2)
+      if line.find('{') != -1:
+        new_namespace.seen_open_brace = True
+        line = line[line.find('{') + 1:]
+
+    # Look for a class declaration in whatever is left of the line
+    # after parsing namespaces. The regexp accounts for decorated classes
+    # such as in:
+    #   class LOCKABLE API Object {
+    #   };
+    #
+    # Templates with class arguments may confuse the parser, for example:
+    #   template <class T, class Comparator = less<string>,
+    #             class Vector = vector<string> >
+    #   class HeapQueue {
+    #
+    # Because this parser has no nesting state about templates, by the
+    # time it saw "class Comparator", it may think that it's a new class.
+    # Nested templates have a similar problem:
+    #   template <
+    #       typename ExportedType,
+    #       typename TupleType,
+    #       template <typename SingleArgType> class ImplTemplate>
+    #
+    # To avoid these cases, we ignore classes that are followed by '=' or '>'
+    class_decl_match = Match(
+        r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
+        r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)'
+        r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line)
+    if (class_decl_match and
+        (not self.stack or self.stack[-1].open_parentheses == 0)):
+      self.stack.append(_ClassInfo(
+          class_decl_match.group(4), class_decl_match.group(2),
+          clean_lines, linenum))
+      line = class_decl_match.group(5)
+
+    # If we have not yet seen the opening brace for the innermost block,
+    # run checks here.
+    if not self.SeenOpenBrace():
+      self.stack[-1].CheckBegin(filename, clean_lines, linenum, error)
+
+    # Update access control if we are inside a class/struct
+    if self.stack and isinstance(self.stack[-1], _ClassInfo):
+      classinfo = self.stack[-1]
+      access_match = Match(
+          r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?'
+          r':(?:[^:]|$)',
+          line)
+      if access_match:
+        classinfo.access = access_match.group(2)
+
+        # Check that access keywords are indented +1 space. Skip this
+        # check if the keywords are not preceded by whitespaces.
+        indent = access_match.group(1)
+        if (len(indent) != classinfo.class_indent + 1 and
+            Match(r'^\s*$', indent)):
+          if classinfo.is_struct:
+            parent = 'struct ' + classinfo.name
+          else:
+            parent = 'class ' + classinfo.name
+          slots = ''
+          if access_match.group(3):
+            slots = access_match.group(3)
+          error(filename, linenum, 'whitespace/indent', 3,
+                '%s%s: should be indented +1 space inside %s' % (
+                    access_match.group(2), slots, parent))
+
+    # Consume braces or semicolons from what's left of the line
+    while True:
+      # Match first brace, semicolon, or closed parenthesis.
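+      # Illustrative pass: for an elided line '} }' the loop runs CheckEnd
+      # and pops twice, then stops because no brace, semicolon, or ')'
+      # remains.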
+      matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line)
+      if not matched:
+        break
+
+      token = matched.group(1)
+      if token == '{':
+        # If namespace or class hasn't seen a opening brace yet, mark
+        # namespace/class head as complete. Push a new block onto the
+        # stack otherwise.
+        if not self.SeenOpenBrace():
+          self.stack[-1].seen_open_brace = True
+        else:
+          self.stack.append(_BlockInfo(True))
+          if _MATCH_ASM.match(line):
+            self.stack[-1].inline_asm = _BLOCK_ASM
+      elif token == ';' or token == ')':
+        # If we haven't seen an opening brace yet, but we already saw
+        # a semicolon, this is probably a forward declaration. Pop
+        # the stack for these.
+        #
+        # Similarly, if we haven't seen an opening brace yet, but we
+        # already saw a closing parenthesis, then these are probably
+        # function arguments with extra "class" or "struct" keywords.
+        # Also pop the stack for these.
+        if not self.SeenOpenBrace():
+          self.stack.pop()
+      else:  # token == '}'
+        # Perform end of block checks and pop the stack.
+        if self.stack:
+          self.stack[-1].CheckEnd(filename, clean_lines, linenum, error)
+          self.stack.pop()
+      line = matched.group(2)
+
+  def InnermostClass(self):
+    """Get class info on the top of the stack.
+
+    Returns:
+      A _ClassInfo object if we are inside a class, or None otherwise.
+    """
+    for i in range(len(self.stack), 0, -1):
+      classinfo = self.stack[i - 1]
+      if isinstance(classinfo, _ClassInfo):
+        return classinfo
+    return None
+
+  def CheckCompletedBlocks(self, filename, error):
+    """Checks that all classes and namespaces have been completely parsed.
+
+    Call this when all lines in a file have been processed.
+    Args:
+      filename: The name of the current file.
+      error: The function to call with any errors found.
+    """
+    # Note: This test can result in false positives if #ifdef constructs
+    # get in the way of brace matching. See the testBuildClass test in
+    # cpplint_unittest.py for an example of this.
+    for obj in self.stack:
+      if isinstance(obj, _ClassInfo):
+        error(filename, obj.starting_linenum, 'build/class', 5,
+              'Failed to find complete declaration of class %s' %
+              obj.name)
+      elif isinstance(obj, _NamespaceInfo):
+        error(filename, obj.starting_linenum, 'build/namespaces', 5,
+              'Failed to find complete declaration of namespace %s' %
+              obj.name)
+
+
+def CheckForNonStandardConstructs(filename, clean_lines, linenum,
+                                  nesting_state, error):
+  r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2.
+
+  Complain about several constructs which gcc-2 accepts, but which are
+  not standard C++. Warning about these in lint is one way to ease the
+  transition to new compilers.
+  - put storage class first (e.g. "static const" instead of "const static").
+  - "%lld" instead of "%qd" in printf-type functions.
+  - "%1$d" is non-standard in printf-type functions.
+  - "\%" is an undefined character escape sequence.
+  - text after #endif is not allowed.
+  - invalid inner-style forward declaration.
+  - >? and <? operators, and their >?= and <?= cousins.
+
+  Additionally, check for constructor/destructor style violations and
+  reference members, as it is very convenient to do so while checking
+  for gcc-2 compliance.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: A callable to which errors are reported, which takes 4 arguments:
+           filename, line number, error level, and message
+  """
+
+  # Remove comments from the line, but leave in strings for now.
+  line = clean_lines.lines[linenum]
+
+  if Search(r'printf\s*\(.*".*%[-+ ]?\d*q', line):
+    error(filename, linenum, 'runtime/printf_format', 3,
+          '%q in format strings is deprecated. Use %ll instead.')
+
+  if Search(r'printf\s*\(.*".*%\d+\$', line):
+    error(filename, linenum, 'runtime/printf_format', 2,
+          '%N$ formats are unconventional. Try rewriting to avoid them.')
+
+  # Remove escaped backslashes before looking for undefined escapes.
+  line = line.replace('\\\\', '')
+
+  if Search(r'("|\').*\\(%|\[|\(|{)', line):
+    error(filename, linenum, 'build/printf_format', 3,
+          '%, [, (, and { are undefined character escapes. Unescape them.')
+
+  # For the rest, work with both comments and strings removed.
+  line = clean_lines.elided[linenum]
+
+  if Search(r'\b(const|volatile|void|char|short|int|long'
+            r'|float|double|signed|unsigned'
+            r'|schar|u?int8|u?int16|u?int32|u?int64)'
+            r'\s+(register|static|extern|typedef)\b',
+            line):
+    error(filename, linenum, 'build/storage_class', 5,
+          'Storage class (static, extern, typedef, etc) should be first.')
+
+  if Match(r'\s*#\s*endif\s*[^/\s]+', line):
+    error(filename, linenum, 'build/endif_comment', 5,
+          'Uncommented text after #endif is non-standard. Use a comment.')
+
+  if Match(r'\s*class\s+(\w+\s*::\s*)+\w+\s*;', line):
+    error(filename, linenum, 'build/forward_decl', 5,
+          'Inner-style forward declarations are invalid. Remove this line.')
+
+  if Search(r'(\w+|[+-]?\d+(\.\d*)?)\s*(<|>)\?=?\s*(\w+|[+-]?\d+)(\.\d*)?',
+            line):
+    error(filename, linenum, 'build/deprecated', 3,
+          '>? and <? (max and min) operators are non-standard and deprecated.')
+
+  if Search(r'^\s*const\s*string\s*&\s*\w+\s*;', line):
+    # TODO(unknown): Could it be expanded safely to arbitrary references,
+    # without triggering too many false positives? The first
+    # attempt triggered 5 warnings for mostly benign code in the regtest, hence
+    # the restriction.
+    # Here's the original regexp, for the reference:
+    # type_name = r'\w+((\s*::\s*\w+)|(\s*<\s*\w+?\s*>))?'
+    # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;'
+    error(filename, linenum, 'runtime/member_string_references', 2,
+          'const string& members are dangerous. It is much better to use '
+          'alternatives, such as pointers or simple constants.')
+
+  # Everything else in this function operates on class declarations.
+  # Return early if the top of the nesting stack is not a class, or if
+  # the class head is not completed yet.
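+  # Illustrative C++ lines for the explicit-constructor check below (each
+  # indented inside its class body):
+  #   Foo(int x);           -> warned: should be marked explicit
+  #   explicit Foo(int x);  -> not matched (pattern is anchored at the
+  #                            start of the line)
+  #   Foo(const Foo& rhs);  -> skipped: reference-to-own-class argument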
+ classinfo = nesting_state.InnermostClass() + if not classinfo or not classinfo.seen_open_brace: + return + + # The class may have been declared with namespace or classname qualifiers. + # The constructor and destructor will not have those qualifiers. + base_classname = classinfo.name.split('::')[-1] + + # Look for single-argument constructors that aren't marked explicit. + # Technically a valid construct, but against style. + args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)' + % re.escape(base_classname), + line) + if (args and + args.group(1) != 'void' and + not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&' + % re.escape(base_classname), args.group(1).strip())): + error(filename, linenum, 'runtime/explicit', 5, + 'Single-argument constructors should be marked explicit.') + + +def CheckSpacingForFunctionCall(filename, line, linenum, error): + """Checks for the correctness of various spacing around function calls. + + Args: + filename: The name of the current file. + line: The text of the line to check. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + # Since function calls often occur inside if/for/while/switch + # expressions - which have their own, more liberal conventions - we + # first see if we should be looking inside such an expression for a + # function call, to which we can apply more strict standards. + fncall = line # if there's no control flow construct, look at whole line + for pattern in (r'\bif\s*\((.*)\)\s*{', + r'\bfor\s*\((.*)\)\s*{', + r'\bwhile\s*\((.*)\)\s*[{;]', + r'\bswitch\s*\((.*)\)\s*{'): + match = Search(pattern, line) + if match: + fncall = match.group(1) # look inside the parens for function calls + break + + # Except in if/for/while/switch, there should never be space + # immediately inside parens (eg "f( 3, 4 )"). We make an exception + # for nested parens ( (a+b) + c ). Likewise, there should never be + # a space before a ( when it's a function argument. I assume it's a + # function argument when the char before the whitespace is legal in + # a function name (alnum + _) and we're not starting a macro. Also ignore + # pointers and references to arrays and functions coz they're too tricky: + # we use a very simple way to recognize these: + # " (something)(maybe-something)" or + # " (something)(maybe-something," or + # " (something)[something]" + # Note that we assume the contents of [] to be short enough that + # they'll never need to wrap. + if ( # Ignore control structures. + not Search(r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b', + fncall) and + # Ignore pointers/references to functions. + not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and + # Ignore pointers/references to arrays. 
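+      # (e.g. declarations like 'void (*fn)(int)' or 'int (&a)[10]', which
+      # would otherwise trip the space-before-paren checks below)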
+ not Search(r' \([^)]+\)\[[^\]]+\]', fncall)): + if Search(r'\w\s*\(\s(?!\s*\\$)', fncall): # a ( used for a fn call + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space after ( in function call') + elif Search(r'\(\s+(?!(\s*\\)|\()', fncall): + error(filename, linenum, 'whitespace/parens', 2, + 'Extra space after (') + if (Search(r'\w\s+\(', fncall) and + not Search(r'#\s*define|typedef', fncall) and + not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)): + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space before ( in function call') + # If the ) is followed only by a newline or a { + newline, assume it's + # part of a control statement (if/while/etc), and don't complain + if Search(r'[^)]\s+\)\s*[^{\s]', fncall): + # If the closing parenthesis is preceded by only whitespaces, + # try to give a more descriptive error message. + if Search(r'^\s+\)', fncall): + error(filename, linenum, 'whitespace/parens', 2, + 'Closing ) should be moved to the previous line') + else: + error(filename, linenum, 'whitespace/parens', 2, + 'Extra space before )') + + +def IsBlankLine(line): + """Returns true if the given line is blank. + + We consider a line to be blank if the line is empty or consists of + only white spaces. + + Args: + line: A line of a string. + + Returns: + True, if the given line is blank. + """ + return not line or line.isspace() + + +def CheckForFunctionLengths(filename, clean_lines, linenum, + function_state, error): + """Reports for long function bodies. + + For an overview why this is done, see: + http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions + + Uses a simplistic algorithm assuming other style guidelines + (especially spacing) are followed. + Only checks unindented functions, so class members are unchecked. + Trivial bodies are unchecked, so constructors with huge initializer lists + may be missed. + Blank/comment lines are not counted so as to avoid encouraging the removal + of vertical space and comments just to get through a lint check. + NOLINT *on the last line of a function* disables this check. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + function_state: Current function name and lines in body so far. + error: The function to call with any errors found. + """ + lines = clean_lines.lines + line = lines[linenum] + raw = clean_lines.raw_lines + raw_line = raw[linenum] + joined_line = '' + + starting_func = False + regexp = r'(\w(\w|::|\*|\&|\s)*)\(' # decls * & space::name( ... + match_result = Match(regexp, line) + if match_result: + # If the name is all caps and underscores, figure it's a macro and + # ignore it, unless it's TEST or TEST_F. + function_name = match_result.group(1).split()[-1] + if function_name == 'TEST' or function_name == 'TEST_F' or ( + not Match(r'[A-Z_]+$', function_name)): + starting_func = True + + if starting_func: + body_found = False + for start_linenum in xrange(linenum, clean_lines.NumLines()): + start_line = lines[start_linenum] + joined_line += ' ' + start_line.lstrip() + if Search(r'(;|})', start_line): # Declarations and trivial functions + body_found = True + break # ... ignore + elif Search(r'{', start_line): + body_found = True + function = Search(r'((\w|:)*)\(', line).group(1) + if Match(r'TEST', function): # Handle TEST... 
macros + parameter_regexp = Search(r'(\(.*\))', joined_line) + if parameter_regexp: # Ignore bad syntax + function += parameter_regexp.group(1) + else: + function += '()' + function_state.Begin(function) + break + if not body_found: + # No body for the function (or evidence of a non-function) was found. + error(filename, linenum, 'readability/fn_size', 5, + 'Lint failed to find start of function body.') + elif Match(r'^\}\s*$', line): # function end + function_state.Check(error, filename, linenum) + function_state.End() + elif not Match(r'^\s*$', line): + function_state.Count() # Count non-blank/non-comment lines. + + +_RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?') + + +def CheckComment(comment, filename, linenum, error): + """Checks for common mistakes in TODO comments. + + Args: + comment: The text of the comment from the line in question. + filename: The name of the current file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + match = _RE_PATTERN_TODO.match(comment) + if match: + # One whitespace is correct; zero whitespace is handled elsewhere. + leading_whitespace = match.group(1) + if len(leading_whitespace) > 1: + error(filename, linenum, 'whitespace/todo', 2, + 'Too many spaces before TODO') + + username = match.group(2) + if not username: + error(filename, linenum, 'readability/todo', 2, + 'Missing username in TODO; it should look like ' + '"// TODO(my_username): Stuff."') + + middle_whitespace = match.group(3) + # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison + if middle_whitespace != ' ' and middle_whitespace != '': + error(filename, linenum, 'whitespace/todo', 2, + 'TODO(my_username) should be followed by a space') + +def CheckAccess(filename, clean_lines, linenum, nesting_state, error): + """Checks for improper use of DISALLOW* macros. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] # get rid of comments and strings + + matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|' + r'DISALLOW_EVIL_CONSTRUCTORS|' + r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line) + if not matched: + return + if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo): + if nesting_state.stack[-1].access != 'private': + error(filename, linenum, 'readability/constructors', 3, + '%s must be in the private: section' % matched.group(1)) + + else: + # Found DISALLOW* macro outside a class declaration, or perhaps it + # was used inside a function when it should have been part of the + # class declaration. We could issue a warning here, but it + # probably resulted in a compiler error already. + pass + + +def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix): + """Find the corresponding > to close a template. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: Current line number. + init_suffix: Remainder of the current line after the initial <. + + Returns: + True if a matching bracket exists. + """ + line = init_suffix + nesting_stack = ['<'] + while True: + # Find the next operator that can tell us whether < is used as an + # opening bracket or as a less-than operator. We only want to + # warn on the latter case. 
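+    #
+    # Illustrative inputs: in 'set<int, less<int> > s;' the ',' seen after
+    # the initial '<' makes us assume a template argument list and return
+    # early; in 'if (a < b) return;' the ')' seen next tells us the '<'
+    # was a less-than operator.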
+    #
+    # We could also check all other operators and terminate the search
+    # early, e.g. if we got something like this "a<b+c", the "<" is
+    # most likely a less-than operator and not an opening bracket.
+
+    # Find the next operator
+    match = Search(r'^[^<>(),;\[\]]*([<>(),;\[\]])(.*)$', line)
+    if match:
+      # Found an operator, update nesting stack
+      operator = match.group(1)
+      line = match.group(2)
+
+      if nesting_stack[-1] == '<':
+        # Expecting closing angle bracket
+        if operator in ('<', '(', '['):
+          nesting_stack.append(operator)
+        elif operator == '>':
+          nesting_stack.pop()
+          if not nesting_stack:
+            # Found matching angle bracket
+            return True
+        elif operator == ',':
+          # Got a comma after a bracket, this is most likely a template
+          # argument. We have not seen a closing angle bracket yet, but
+          # it's probably a few lines later if we look for it, so just
+          # return early here.
+          return True
+        else:
+          # Got some other operator.
+          return False
+
+      else:
+        # Expecting closing parenthesis or closing bracket
+        if operator in ('<', '(', '['):
+          nesting_stack.append(operator)
+        elif operator in (')', ']'):
+          # We don't bother checking for matching () or []. If we got
+          # something like (] or [), it would have been a syntax error.
+          nesting_stack.pop()
+
+    else:
+      # Scan the next line
+      linenum += 1
+      if linenum >= len(clean_lines.elided):
+        break
+      line = clean_lines.elided[linenum]
+
+  # Exhausted all remaining lines and still no matching angle bracket.
+  # Most likely the input was incomplete, otherwise we should have
+  # seen a semicolon and returned early.
+  return True
+
+
+def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix):
+  """Find the corresponding < that started a template.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: Current line number.
+    init_prefix: Part of the current line before the initial >.
+
+  Returns:
+    True if a matching bracket exists.
+  """
+  line = init_prefix
+  nesting_stack = ['>']
+  while True:
+    # Find the previous operator
+    match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line)
+    if match:
+      # Found an operator, update nesting stack
+      operator = match.group(2)
+      line = match.group(1)
+
+      if nesting_stack[-1] == '>':
+        # Expecting opening angle bracket
+        if operator in ('>', ')', ']'):
+          nesting_stack.append(operator)
+        elif operator == '<':
+          nesting_stack.pop()
+          if not nesting_stack:
+            # Found matching angle bracket
+            return True
+        elif operator == ',':
+          # Got a comma before a bracket, this is most likely a
+          # template argument. The opening angle bracket is probably
+          # there if we look for it, so just return early here.
+          return True
+        else:
+          # Got some other operator.
+          return False
+
+      else:
+        # Expecting opening parenthesis or opening bracket
+        if operator in ('>', ')', ']'):
+          nesting_stack.append(operator)
+        elif operator in ('(', '['):
+          nesting_stack.pop()
+
+    else:
+      # Scan the previous line
+      linenum -= 1
+      if linenum < 0:
+        break
+      line = clean_lines.elided[linenum]
+
+  # Exhausted all earlier lines and still no matching angle bracket.
+  return False
+
+
+def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
+  """Checks for the correctness of various spacing issues in the code.
+
+  Things we check for: spaces around operators, spaces after
+  if/for/while/switch, no spaces around parens in function calls, two
+  spaces between code and comment, don't start a block with a blank
+  line, don't end a function with a blank line, don't add a blank line
+  after public/protected/private, don't have too many blank lines in a row.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+
+  # Don't use "elided" lines here, otherwise we can't check commented lines.
+  # Don't want to use "raw" either, because we don't want to check inside C++11
+  # raw strings.
+  raw = clean_lines.lines_without_raw_strings
+  line = raw[linenum]
+
+  # Before nixing comments, check if the line is blank for no good
+  # reason.  This includes the first line after a block is opened, and
+  # blank lines at the end of a function (i.e., right before a line like '}').
+  #
+  # Skip all the blank line checks if we are immediately inside a
+  # namespace body.  In other words, don't issue blank line warnings
+  # for this block:
+  #   namespace {
+  #
+  #   }
+  #
+  # A warning about missing end of namespace comments will be issued instead.
+  if IsBlankLine(line) and not nesting_state.InNamespaceBody():
+    elided = clean_lines.elided
+    prev_line = elided[linenum - 1]
+    prevbrace = prev_line.rfind('{')
+    # TODO(unknown): Don't complain if line before blank line, and line after,
+    #                both start with alnums and are indented the same amount.
+    #                This ignores whitespace at the start of a namespace block
+    #                because those are not usually indented.
+    if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1:
+      # OK, we have a blank line at the start of a code block.  Before we
+      # complain, we check if it is an exception to the rule: The previous
+      # non-empty line has the parameters of a function header that are
+      # indented 4 spaces (because they did not fit in an 80-column line
+      # when placed on the same line as the function name).  We also check
+      # for the case where the previous line is indented 6 spaces, which
+      # may happen when the initializers of a constructor do not fit into
+      # an 80-column line.
+      exception = False
+      if Match(r' {6}\w', prev_line):  # Initializer list?
+        # We are looking for the opening column of initializer list, which
+        # should be indented 4 spaces to cause 6 space indentation afterwards.
+        search_position = linenum-2
+        while (search_position >= 0
+               and Match(r' {6}\w', elided[search_position])):
+          search_position -= 1
+        exception = (search_position >= 0
+                     and elided[search_position][:5] == '    :')
+      else:
+        # Search for the function arguments or an initializer list.  We use a
+        # simple heuristic here: If the line is indented 4 spaces; and we have
+        # a closing paren, without the opening paren, followed by an opening
+        # brace or colon (for initializer lists) we assume that it is the last
+        # line of a function header.  If we have a colon indented 4 spaces, it
+        # is an initializer list.
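+        # For example (hypothetical input), the blank line after this
+        # header is exempt, because the parameter line matches the
+        # 4-space-indent heuristic below:
+        #   void MyFunction(
+        #       int some_long_parameter_name) {
+        #
+        #     ...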
+ exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)', + prev_line) + or Match(r' {4}:', prev_line)) + + if not exception: + error(filename, linenum, 'whitespace/blank_line', 2, + 'Redundant blank line at the start of a code block ' + 'should be deleted.') + # Ignore blank lines at the end of a block in a long if-else + # chain, like this: + # if (condition1) { + # // Something followed by a blank line + # + # } else if (condition2) { + # // Something else + # } + if linenum + 1 < clean_lines.NumLines(): + next_line = raw[linenum + 1] + if (next_line + and Match(r'\s*}', next_line) + and next_line.find('} else ') == -1): + error(filename, linenum, 'whitespace/blank_line', 3, + 'Redundant blank line at the end of a code block ' + 'should be deleted.') + + matched = Match(r'\s*(public|protected|private):', prev_line) + if matched: + error(filename, linenum, 'whitespace/blank_line', 3, + 'Do not leave a blank line after "%s:"' % matched.group(1)) + + # Next, we complain if there's a comment too near the text + commentpos = line.find('//') + if commentpos != -1: + # Check if the // may be in quotes. If so, ignore it + # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison + if (line.count('"', 0, commentpos) - + line.count('\\"', 0, commentpos)) % 2 == 0: # not in quotes + # Allow one space for new scopes, two spaces otherwise: + if (not Match(r'^\s*{ //', line) and + ((commentpos >= 1 and + line[commentpos-1] not in string.whitespace) or + (commentpos >= 2 and + line[commentpos-2] not in string.whitespace))): + error(filename, linenum, 'whitespace/comments', 2, + 'At least two spaces is best between code and comments') + # There should always be a space between the // and the comment + commentend = commentpos + 2 + if commentend < len(line) and not line[commentend] == ' ': + # but some lines are exceptions -- e.g. if they're big + # comment delimiters like: + # //---------------------------------------------------------- + # or are an empty C++ style Doxygen comment, like: + # /// + # or C++ style Doxygen comments placed after the variable: + # ///< Header comment + # //!< Header comment + # or they begin with multiple slashes followed by a space: + # //////// Header comment + match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or + Search(r'^/$', line[commentend:]) or + Search(r'^!< ', line[commentend:]) or + Search(r'^/< ', line[commentend:]) or + Search(r'^/+ ', line[commentend:])) + if not match: + error(filename, linenum, 'whitespace/comments', 4, + 'Should have a space between // and comment') + CheckComment(line[commentpos:], filename, linenum, error) + + line = clean_lines.elided[linenum] # get rid of comments and strings + + # Don't try to do spacing checks for operator methods + line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line) + + # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )". + # Otherwise not. Note we only check for non-spaces on *both* sides; + # sometimes people put non-spaces on one side when aligning ='s among + # many lines (not that this is behavior that I approve of...) + if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line): + error(filename, linenum, 'whitespace/operators', 4, + 'Missing spaces around =') + + # It's ok not to have spaces around binary operators like + - * /, but if + # there's too little whitespace, we get concerned. It's hard to tell, + # though, so we punt on this one for now. TODO. + + # You should always have whitespace around binary operators. 
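+  # For example (hypothetical inputs), "if (a==b)" and "x<=y" are flagged
+  # by the check below, while "a == b" passes.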
+  #
+  # Check <= and >= first to avoid false positives with < and >, then
+  # check non-include lines for spacing around < and >.
+  match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line)
+  if match:
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around %s' % match.group(1))
+  # We allow no-spaces around << when used like this: 10<<20, but
+  # not otherwise (particularly, not when used as streams)
+  # Also ignore using ns::operator<<;
+  match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
+  if (match and
+      not (match.group(1).isdigit() and match.group(2).isdigit()) and
+      not (match.group(1) == 'operator' and match.group(2) == ';')):
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around <<')
+  elif not Match(r'#.*include', line):
+    # Avoid false positives on ->
+    reduced_line = line.replace('->', '')
+
+    # Look for < that is not surrounded by spaces.  This is only
+    # triggered if both sides are missing spaces, even though
+    # technically we should flag if at least one side is missing a
+    # space.  This is done to avoid some false positives with shifts.
+    match = Search(r'[^\s<]<([^\s=<].*)', reduced_line)
+    if (match and
+        not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))):
+      error(filename, linenum, 'whitespace/operators', 3,
+            'Missing spaces around <')
+
+    # Look for > that is not surrounded by spaces.  Similar to the
+    # above, we only trigger if both sides are missing spaces to avoid
+    # false positives with shifts.
+    match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line)
+    if (match and
+        not FindPreviousMatchingAngleBracket(clean_lines, linenum,
+                                             match.group(1))):
+      error(filename, linenum, 'whitespace/operators', 3,
+            'Missing spaces around >')
+
+  # We allow no-spaces around >> for almost anything.  This is because
+  # C++11 allows ">>" to close nested templates, which accounts for
+  # most cases when ">>" is not followed by a space.
+  #
+  # We still warn on ">>" followed by alpha character, because that is
+  # likely due to ">>" being used for right shifts, e.g.:
+  #   value >> alpha
+  #
+  # When ">>" is used to close templates, the alphanumeric letter that
+  # follows would be part of an identifier, and there should still be
+  # a space separating the template type and the identifier.
+  #   type<type<type>> alpha
+  match = Search(r'>>[a-zA-Z_]', line)
+  if match:
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around >>')
+
+  # There shouldn't be space around unary operators
+  match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line)
+  if match:
+    error(filename, linenum, 'whitespace/operators', 4,
+          'Extra space for operator %s' % match.group(1))
+
+  # A pet peeve of mine: no spaces after an if, while, switch, or for
+  match = Search(r' (if\(|for\(|while\(|switch\()', line)
+  if match:
+    error(filename, linenum, 'whitespace/parens', 5,
+          'Missing space before ( in %s' % match.group(1))
+
+  # For if/for/while/switch, the left and right parens should be
+  # consistent about how many spaces are inside the parens, and
+  # there should either be zero or one spaces inside the parens.
+  # We don't want: "if ( foo)" or "if ( foo )".
+  # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed.
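+  # For example (hypothetical inputs), "if ( foo) {" draws the mismatching
+  # warning below and "if (  foo  ) {" the zero-or-one-space warning, while
+  # "if (foo) {" passes.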
+ match = Search(r'\b(if|for|while|switch)\s*' + r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$', + line) + if match: + if len(match.group(2)) != len(match.group(4)): + if not (match.group(3) == ';' and + len(match.group(2)) == 1 + len(match.group(4)) or + not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)): + error(filename, linenum, 'whitespace/parens', 5, + 'Mismatching spaces inside () in %s' % match.group(1)) + if len(match.group(2)) not in [0, 1]: + error(filename, linenum, 'whitespace/parens', 5, + 'Should have zero or one spaces inside ( and ) in %s' % + match.group(1)) + + # You should always have a space after a comma (either as fn arg or operator) + # + # This does not apply when the non-space character following the + # comma is another comma, since the only time when that happens is + # for empty macro arguments. + # + # We run this check in two passes: first pass on elided lines to + # verify that lines contain missing whitespaces, second pass on raw + # lines to confirm that those missing whitespaces are not due to + # elided comments. + if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]): + error(filename, linenum, 'whitespace/comma', 3, + 'Missing space after ,') + + # You should always have a space after a semicolon + # except for few corner cases + # TODO(unknown): clarify if 'if (1) { return 1;}' is requires one more + # space after ; + if Search(r';[^\s};\\)/]', line): + error(filename, linenum, 'whitespace/semicolon', 3, + 'Missing space after ;') + + # Next we will look for issues with function calls. + CheckSpacingForFunctionCall(filename, line, linenum, error) + + # Except after an opening paren, or after another opening brace (in case of + # an initializer list, for instance), you should have spaces before your + # braces. And since you should never have braces at the beginning of a line, + # this is an easy test. + match = Match(r'^(.*[^ ({]){', line) + if match: + # Try a bit harder to check for brace initialization. This + # happens in one of the following forms: + # Constructor() : initializer_list_{} { ... } + # Constructor{}.MemberFunction() + # Type variable{}; + # FunctionCall(type{}, ...); + # LastArgument(..., type{}); + # LOG(INFO) << type{} << " ..."; + # map_of_type[{...}] = ...; + # + # We check for the character following the closing brace, and + # silence the warning if it's one of those listed above, i.e. + # "{.;,)<]". + # + # To account for nested initializer list, we allow any number of + # closing braces up to "{;,)<". We can't simply silence the + # warning on first sight of closing brace, because that would + # cause false negatives for things that are not initializer lists. + # Silence this: But not this: + # Outer{ if (...) { + # Inner{...} if (...){ // Missing space before { + # }; } + # + # There is a false negative with this approach if people inserted + # spurious semicolons, e.g. "if (cond){};", but we will catch the + # spurious semicolon with a separate check. + (endline, endlinenum, endpos) = CloseExpression( + clean_lines, linenum, len(match.group(1))) + trailing_text = '' + if endpos > -1: + trailing_text = endline[endpos:] + for offset in xrange(endlinenum + 1, + min(endlinenum + 3, clean_lines.NumLines() - 1)): + trailing_text += clean_lines.elided[offset] + if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text): + error(filename, linenum, 'whitespace/braces', 5, + 'Missing space before {') + + # Make sure '} else {' has spaces. 
+ if Search(r'}else', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Missing space before else') + + # You shouldn't have spaces before your brackets, except maybe after + # 'delete []' or 'new char * []'. + if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Extra space before [') + + # You shouldn't have a space before a semicolon at the end of the line. + # There's a special case for "for" since the style guide allows space before + # the semicolon there. + if Search(r':\s*;\s*$', line): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Semicolon defining empty statement. Use {} instead.') + elif Search(r'^\s*;\s*$', line): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Line contains only semicolon. If this should be an empty statement, ' + 'use {} instead.') + elif (Search(r'\s+;\s*$', line) and + not Search(r'\bfor\b', line)): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Extra space before last semicolon. If this should be an empty ' + 'statement, use {} instead.') + + # In range-based for, we wanted spaces before and after the colon, but + # not around "::" tokens that might appear. + if (Search('for *\(.*[^:]:[^: ]', line) or + Search('for *\(.*[^: ]:[^:]', line)): + error(filename, linenum, 'whitespace/forcolon', 2, + 'Missing space around colon in range-based for loop') + + +def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error): + """Checks for additional blank line issues related to sections. + + Currently the only thing checked here is blank line before protected/private. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + class_info: A _ClassInfo objects. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + # Skip checks if the class is small, where small means 25 lines or less. + # 25 lines seems like a good cutoff since that's the usual height of + # terminals, and any class that can't fit in one screen can't really + # be considered "small". + # + # Also skip checks if we are on the first line. This accounts for + # classes that look like + # class Foo { public: ... }; + # + # If we didn't find the end of the class, last_line would be zero, + # and the check will be skipped by the first condition. + if (class_info.last_line - class_info.starting_linenum <= 24 or + linenum <= class_info.starting_linenum): + return + + matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum]) + if matched: + # Issue warning if the line before public/protected/private was + # not a blank line, but don't do this if the previous line contains + # "class" or "struct". This can happen two ways: + # - We are at the beginning of the class. + # - We are forward-declaring an inner class that is semantically + # private, but needed to be public for implementation reasons. + # Also ignores cases where the previous line ends with a backslash as can be + # common when defining classes in C macros. + prev_line = clean_lines.lines[linenum - 1] + if (not IsBlankLine(prev_line) and + not Search(r'\b(class|struct)\b', prev_line) and + not Search(r'\\$', prev_line)): + # Try a bit harder to find the beginning of the class. 
This is to + # account for multi-line base-specifier lists, e.g.: + # class Derived + # : public Base { + end_class_head = class_info.starting_linenum + for i in range(class_info.starting_linenum, linenum): + if Search(r'\{\s*$', clean_lines.lines[i]): + end_class_head = i + break + if end_class_head < linenum - 1: + error(filename, linenum, 'whitespace/blank_line', 3, + '"%s:" should be preceded by a blank line' % matched.group(1)) + + +def GetPreviousNonBlankLine(clean_lines, linenum): + """Return the most recent non-blank line and its line number. + + Args: + clean_lines: A CleansedLines instance containing the file contents. + linenum: The number of the line to check. + + Returns: + A tuple with two elements. The first element is the contents of the last + non-blank line before the current line, or the empty string if this is the + first non-blank line. The second is the line number of that line, or -1 + if this is the first non-blank line. + """ + + prevlinenum = linenum - 1 + while prevlinenum >= 0: + prevline = clean_lines.elided[prevlinenum] + if not IsBlankLine(prevline): # if not a blank line... + return (prevline, prevlinenum) + prevlinenum -= 1 + return ('', -1) + + +def CheckBraces(filename, clean_lines, linenum, error): + """Looks for misplaced braces (e.g. at the end of line). + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + line = clean_lines.elided[linenum] # get rid of comments and strings + + if Match(r'\s*{\s*$', line): + # We allow an open brace to start a line in the case where someone is using + # braces in a block to explicitly create a new scope, which is commonly used + # to control the lifetime of stack-allocated variables. Braces are also + # used for brace initializers inside function calls. We don't detect this + # perfectly: we just don't complain if the last non-whitespace character on + # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the + # previous line starts a preprocessor block. + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if (not Search(r'[,;:}{(]\s*$', prevline) and + not Match(r'\s*#', prevline)): + error(filename, linenum, 'whitespace/braces', 4, + '{ should almost always be at the end of the previous line') + + # An else clause should be on the same line as the preceding closing brace. + if Match(r'\s*else\s*', line): + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if Match(r'\s*}\s*$', prevline): + error(filename, linenum, 'whitespace/newline', 4, + 'An else should appear on the same line as the preceding }') + + # If braces come on one side of an else, they should be on both. + # However, we have to worry about "else if" that spans multiple lines! 
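+  # For example (hypothetical input), no warning should be issued for
+  #   } else if (condition_one &&
+  #              condition_two) {
+  # because the brace does appear once the condition closes.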
+ if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): + if Search(r'}\s*else if([^{]*)$', line): # could be multi-line if + # find the ( after the if + pos = line.find('else if') + pos = line.find('(', pos) + if pos > 0: + (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) + if endline[endpos:].find('{') == -1: # must be brace after if + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') + else: # common case: else not followed by a multi-line if + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') + + # Likewise, an else should never have the else clause on the same line + if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line): + error(filename, linenum, 'whitespace/newline', 4, + 'Else clause should never be on same line as else (use 2 lines)') + + # In the same way, a do/while should never be on one line + if Match(r'\s*do [^\s{]', line): + error(filename, linenum, 'whitespace/newline', 4, + 'do/while clauses should not be on a single line') + + # Block bodies should not be followed by a semicolon. Due to C++11 + # brace initialization, there are more places where semicolons are + # required than not, so we use a whitelist approach to check these + # rather than a blacklist. These are the places where "};" should + # be replaced by just "}": + # 1. Some flavor of block following closing parenthesis: + # for (;;) {}; + # while (...) {}; + # switch (...) {}; + # Function(...) {}; + # if (...) {}; + # if (...) else if (...) {}; + # + # 2. else block: + # if (...) else {}; + # + # 3. const member function: + # Function(...) const {}; + # + # 4. Block following some statement: + # x = 42; + # {}; + # + # 5. Block at the beginning of a function: + # Function(...) { + # {}; + # } + # + # Note that naively checking for the preceding "{" will also match + # braces inside multi-dimensional arrays, but this is fine since + # that expression will not contain semicolons. + # + # 6. Block following another block: + # while (true) {} + # {}; + # + # 7. End of namespaces: + # namespace {}; + # + # These semicolons seems far more common than other kinds of + # redundant semicolons, possibly due to people converting classes + # to namespaces. For now we do not warn for this case. + # + # Try matching case 1 first. + match = Match(r'^(.*\)\s*)\{', line) + if match: + # Matched closing parenthesis (case 1). Check the token before the + # matching opening parenthesis, and don't warn if it looks like a + # macro. This avoids these false positives: + # - macro that defines a base class + # - multi-line macro that defines a base class + # - macro that defines the whole class-head + # + # But we still issue warnings for macros that we know are safe to + # warn, specifically: + # - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P + # - TYPED_TEST + # - INTERFACE_DEF + # - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED: + # + # We implement a whitelist of safe macros instead of a blacklist of + # unsafe macros, even though the latter appears less frequently in + # google code and would have been easier to implement. This is because + # the downside for getting the whitelist wrong means some extra + # semicolons, while the downside for getting the blacklist wrong + # would result in compile errors. + # + # In addition to macros, we also don't want to warn on compound + # literals. 
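+    # For example (hypothetical input), the trailing "};" in
+    #   struct Foo foo = (struct Foo) { 1, 2 };
+    # belongs to a compound literal; the "=" before the parenthesized
+    # type suppresses the warning below.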
+ closing_brace_pos = match.group(1).rfind(')') + opening_parenthesis = ReverseCloseExpression( + clean_lines, linenum, closing_brace_pos) + if opening_parenthesis[2] > -1: + line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]] + macro = Search(r'\b([A-Z_]+)\s*$', line_prefix) + if ((macro and + macro.group(1) not in ( + 'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST', + 'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED', + 'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or + Search(r'\s+=\s*$', line_prefix)): + match = None + + else: + # Try matching cases 2-3. + match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line) + if not match: + # Try matching cases 4-6. These are always matched on separate lines. + # + # Note that we can't simply concatenate the previous line to the + # current line and do a single match, otherwise we may output + # duplicate warnings for the blank line case: + # if (cond) { + # // blank line + # } + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if prevline and Search(r'[;{}]\s*$', prevline): + match = Match(r'^(\s*)\{', line) + + # Check matching closing brace + if match: + (endline, endlinenum, endpos) = CloseExpression( + clean_lines, linenum, len(match.group(1))) + if endpos > -1 and Match(r'^\s*;', endline[endpos:]): + # Current {} pair is eligible for semicolon check, and we have found + # the redundant semicolon, output warning here. + # + # Note: because we are scanning forward for opening braces, and + # outputting warnings for the matching closing brace, if there are + # nested blocks with trailing semicolons, we will get the error + # messages in reversed order. + error(filename, endlinenum, 'readability/braces', 4, + "You don't need a ; after a }") + + +def CheckEmptyBlockBody(filename, clean_lines, linenum, error): + """Look for empty loop/conditional body with only a single semicolon. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + # Search for loop keywords at the beginning of the line. Because only + # whitespaces are allowed before the keywords, this will also ignore most + # do-while-loops, since those lines should start with closing brace. + # + # We also check "if" blocks here, since an empty conditional block + # is likely an error. + line = clean_lines.elided[linenum] + matched = Match(r'\s*(for|while|if)\s*\(', line) + if matched: + # Find the end of the conditional expression + (end_line, end_linenum, end_pos) = CloseExpression( + clean_lines, linenum, line.find('(')) + + # Output warning if what follows the condition expression is a semicolon. + # No warning for all other cases, including whitespace or newline, since we + # have a separate check for semicolons preceded by whitespace. + if end_pos >= 0 and Match(r';', end_line[end_pos:]): + if matched.group(1) == 'if': + error(filename, end_linenum, 'whitespace/empty_conditional_body', 5, + 'Empty conditional bodies should use {}') + else: + error(filename, end_linenum, 'whitespace/empty_loop_body', 5, + 'Empty loop bodies should use {} or continue') + + +def CheckCheck(filename, clean_lines, linenum, error): + """Checks the use of CHECK and EXPECT macros. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. 
+ """ + + # Decide the set of replacement macros that should be suggested + lines = clean_lines.elided + check_macro = None + start_pos = -1 + for macro in _CHECK_MACROS: + i = lines[linenum].find(macro) + if i >= 0: + check_macro = macro + + # Find opening parenthesis. Do a regular expression match here + # to make sure that we are matching the expected CHECK macro, as + # opposed to some other macro that happens to contain the CHECK + # substring. + matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum]) + if not matched: + continue + start_pos = len(matched.group(1)) + break + if not check_macro or start_pos < 0: + # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT' + return + + # Find end of the boolean expression by matching parentheses + (last_line, end_line, end_pos) = CloseExpression( + clean_lines, linenum, start_pos) + if end_pos < 0: + return + if linenum == end_line: + expression = lines[linenum][start_pos + 1:end_pos - 1] + else: + expression = lines[linenum][start_pos + 1:] + for i in xrange(linenum + 1, end_line): + expression += lines[i] + expression += last_line[0:end_pos - 1] + + # Parse expression so that we can take parentheses into account. + # This avoids false positives for inputs like "CHECK((a < 4) == b)", + # which is not replaceable by CHECK_LE. + lhs = '' + rhs = '' + operator = None + while expression: + matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||' + r'==|!=|>=|>|<=|<|\()(.*)$', expression) + if matched: + token = matched.group(1) + if token == '(': + # Parenthesized operand + expression = matched.group(2) + (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')') + if end < 0: + return # Unmatched parenthesis + lhs += '(' + expression[0:end] + expression = expression[end:] + elif token in ('&&', '||'): + # Logical and/or operators. This means the expression + # contains more than one term, for example: + # CHECK(42 < a && a < b); + # + # These are not replaceable with CHECK_LE, so bail out early. + return + elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'): + # Non-relational operator + lhs += token + expression = matched.group(2) + else: + # Relational operator + operator = token + rhs = matched.group(2) + break + else: + # Unparenthesized operand. Instead of appending to lhs one character + # at a time, we do another regular expression match to consume several + # characters at once if possible. Trivial benchmark shows that this + # is more efficient when the operands are longer than a single + # character, which is generally the case. + matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression) + if not matched: + matched = Match(r'^(\s*\S)(.*)$', expression) + if not matched: + break + lhs += matched.group(1) + expression = matched.group(2) + + # Only apply checks if we got all parts of the boolean expression + if not (lhs and operator and rhs): + return + + # Check that rhs do not contain logical operators. We already know + # that lhs is fine since the loop above parses out && and ||. + if rhs.find('&&') > -1 or rhs.find('||') > -1: + return + + # At least one of the operands must be a constant literal. This is + # to avoid suggesting replacements for unprintable things like + # CHECK(variable != iterator) + # + # The following pattern matches decimal, hex integers, strings, and + # characters (in that order). 
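+  # For example (hypothetical inputs), CHECK(x == 42) earns a CHECK_EQ
+  # suggestion because "42" matches the constant pattern below, while
+  # CHECK(a == b) is left alone since neither operand is a literal.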
+ lhs = lhs.strip() + rhs = rhs.strip() + match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$' + if Match(match_constant, lhs) or Match(match_constant, rhs): + # Note: since we know both lhs and rhs, we can provide a more + # descriptive error message like: + # Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42) + # Instead of: + # Consider using CHECK_EQ instead of CHECK(a == b) + # + # We are still keeping the less descriptive message because if lhs + # or rhs gets long, the error message might become unreadable. + error(filename, linenum, 'readability/check', 2, + 'Consider using %s instead of %s(a %s b)' % ( + _CHECK_REPLACEMENT[check_macro][operator], + check_macro, operator)) + + +def CheckAltTokens(filename, clean_lines, linenum, error): + """Check alternative keywords being used in boolean expressions. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Avoid preprocessor lines + if Match(r'^\s*#', line): + return + + # Last ditch effort to avoid multi-line comments. This will not help + # if the comment started before the current line or ended after the + # current line, but it catches most of the false positives. At least, + # it provides a way to workaround this warning for people who use + # multi-line comments in preprocessor macros. + # + # TODO(unknown): remove this once cpplint has better support for + # multi-line comments. + if line.find('/*') >= 0 or line.find('*/') >= 0: + return + + for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line): + error(filename, linenum, 'readability/alt_tokens', 2, + 'Use operator %s instead of %s' % ( + _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1))) + + +def GetLineWidth(line): + """Determines the width of the line in column positions. + + Args: + line: A string, which may be a Unicode string. + + Returns: + The width of the line in column positions, accounting for Unicode + combining characters and wide characters. + """ + if isinstance(line, unicode): + width = 0 + for uc in unicodedata.normalize('NFC', line): + if unicodedata.east_asian_width(uc) in ('W', 'F'): + width += 2 + elif not unicodedata.combining(uc): + width += 1 + return width + else: + return len(line) + + +def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, + error): + """Checks rules from the 'C++ style rules' section of cppguide.html. + + Most of these rules are hard to test (naming, comment style), but we + do what we can. In particular we check for 2-space indents, line lengths, + tab usage, spaces inside code, etc. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + file_extension: The extension (without the dot) of the filename. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + + # Don't use "elided" lines here, otherwise we can't check commented lines. 
+ # Don't want to use "raw" either, because we don't want to check inside C++11 + # raw strings, + raw_lines = clean_lines.lines_without_raw_strings + line = raw_lines[linenum] + + if line.find('\t') != -1: + error(filename, linenum, 'whitespace/tab', 1, + 'Tab found; better to use spaces') + + # One or three blank spaces at the beginning of the line is weird; it's + # hard to reconcile that with 2-space indents. + # NOTE: here are the conditions rob pike used for his tests. Mine aren't + # as sophisticated, but it may be worth becoming so: RLENGTH==initial_spaces + # if(RLENGTH > 20) complain = 0; + # if(match($0, " +(error|private|public|protected):")) complain = 0; + # if(match(prev, "&& *$")) complain = 0; + # if(match(prev, "\\|\\| *$")) complain = 0; + # if(match(prev, "[\",=><] *$")) complain = 0; + # if(match($0, " <<")) complain = 0; + # if(match(prev, " +for \\(")) complain = 0; + # if(prevodd && match(prevprev, " +for \\(")) complain = 0; + initial_spaces = 0 + cleansed_line = clean_lines.elided[linenum] + while initial_spaces < len(line) and line[initial_spaces] == ' ': + initial_spaces += 1 + if line and line[-1].isspace(): + error(filename, linenum, 'whitespace/end_of_line', 4, + 'Line ends in whitespace. Consider deleting these extra spaces.') + # There are certain situations we allow one space, notably for section labels + elif ((initial_spaces == 1 or initial_spaces == 3) and + not Match(r'\s*\w+\s*:\s*$', cleansed_line)): + error(filename, linenum, 'whitespace/indent', 3, + 'Weird number of spaces at line-start. ' + 'Are you using a 2-space indent?') + + # Check if the line is a header guard. + is_header_guard = False + if file_extension == 'h': + cppvar = GetHeaderGuardCPPVariable(filename) + if (line.startswith('#ifndef %s' % cppvar) or + line.startswith('#define %s' % cppvar) or + line.startswith('#endif // %s' % cppvar)): + is_header_guard = True + # #include lines and header guards can be long, since there's no clean way to + # split them. + # + # URLs can be long too. It's possible to split these, but it makes them + # harder to cut&paste. + # + # The "$Id:...$" comment may also get very long without it being the + # developers fault. + if (not line.startswith('#include') and not is_header_guard and + not Match(r'^\s*//.*http(s?)://\S*$', line) and + not Match(r'^// \$Id:.*#[0-9]+ \$$', line)): + line_width = GetLineWidth(line) + extended_length = int((_line_length * 1.25)) + if line_width > extended_length: + error(filename, linenum, 'whitespace/line_length', 4, + 'Lines should very rarely be longer than %i characters' % + extended_length) + elif line_width > _line_length: + error(filename, linenum, 'whitespace/line_length', 2, + 'Lines should be <= %i characters long' % _line_length) + + if (cleansed_line.count(';') > 1 and + # for loops are allowed two ;'s (and may run over two lines). 
+ cleansed_line.find('for') == -1 and + (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or + GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and + # It's ok to have many commands in a switch case that fits in 1 line + not ((cleansed_line.find('case ') != -1 or + cleansed_line.find('default:') != -1) and + cleansed_line.find('break;') != -1)): + error(filename, linenum, 'whitespace/newline', 0, + 'More than one command on the same line') + + # Some more style checks + CheckBraces(filename, clean_lines, linenum, error) + CheckEmptyBlockBody(filename, clean_lines, linenum, error) + CheckAccess(filename, clean_lines, linenum, nesting_state, error) + CheckSpacing(filename, clean_lines, linenum, nesting_state, error) + CheckCheck(filename, clean_lines, linenum, error) + CheckAltTokens(filename, clean_lines, linenum, error) + classinfo = nesting_state.InnermostClass() + if classinfo: + CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error) + + +_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"') +_RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$') +# Matches the first component of a filename delimited by -s and _s. That is: +# _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo.cc').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo-bar_baz.cc').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo_bar-baz.cc').group(0) == 'foo' +_RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+') + + +def _DropCommonSuffixes(filename): + """Drops common suffixes like _test.cc or -inl.h from filename. + + For example: + >>> _DropCommonSuffixes('foo/foo-inl.h') + 'foo/foo' + >>> _DropCommonSuffixes('foo/bar/foo.cc') + 'foo/bar/foo' + >>> _DropCommonSuffixes('foo/foo_internal.h') + 'foo/foo' + >>> _DropCommonSuffixes('foo/foo_unusualinternal.h') + 'foo/foo_unusualinternal' + + Args: + filename: The input filename. + + Returns: + The filename with the common suffix removed. + """ + for suffix in ('test.cc', 'regtest.cc', 'unittest.cc', + 'inl.h', 'impl.h', 'internal.h'): + if (filename.endswith(suffix) and len(filename) > len(suffix) and + filename[-len(suffix) - 1] in ('-', '_')): + return filename[:-len(suffix) - 1] + return os.path.splitext(filename)[0] + + +def _IsTestFilename(filename): + """Determines if the given filename has a suffix that identifies it as a test. + + Args: + filename: The input filename. + + Returns: + True if 'filename' looks like a test, False otherwise. + """ + if (filename.endswith('_test.cc') or + filename.endswith('_unittest.cc') or + filename.endswith('_regtest.cc')): + return True + else: + return False + + +def _ClassifyInclude(fileinfo, include, is_system): + """Figures out what kind of header 'include' is. + + Args: + fileinfo: The current file cpplint is running over. A FileInfo instance. + include: The path to a #included file. + is_system: True if the #include used <> rather than "". + + Returns: + One of the _XXX_HEADER constants. + + For example: + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'stdio.h', True) + _C_SYS_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'string', True) + _CPP_SYS_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', False) + _LIKELY_MY_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo_unknown_extension.cc'), + ... 
'bar/foo_other_ext.h', False) + _POSSIBLE_MY_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False) + _OTHER_HEADER + """ + # This is a list of all standard c++ header files, except + # those already checked for above. + is_cpp_h = include in _CPP_HEADERS + + if is_system: + if is_cpp_h: + return _CPP_SYS_HEADER + else: + return _C_SYS_HEADER + + # If the target file and the include we're checking share a + # basename when we drop common extensions, and the include + # lives in . , then it's likely to be owned by the target file. + target_dir, target_base = ( + os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName()))) + include_dir, include_base = os.path.split(_DropCommonSuffixes(include)) + if target_base == include_base and ( + include_dir == target_dir or + include_dir == os.path.normpath(target_dir + '/../public')): + return _LIKELY_MY_HEADER + + # If the target and include share some initial basename + # component, it's possible the target is implementing the + # include, so it's allowed to be first, but we'll never + # complain if it's not there. + target_first_component = _RE_FIRST_COMPONENT.match(target_base) + include_first_component = _RE_FIRST_COMPONENT.match(include_base) + if (target_first_component and include_first_component and + target_first_component.group(0) == + include_first_component.group(0)): + return _POSSIBLE_MY_HEADER + + return _OTHER_HEADER + + + +def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): + """Check rules that are applicable to #include lines. + + Strings on #include lines are NOT removed from elided line, to make + certain tasks easier. However, to prevent false positives, checks + applicable to #include lines in CheckLanguage must be put here. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + include_state: An _IncludeState instance in which the headers are inserted. + error: The function to call with any errors found. + """ + fileinfo = FileInfo(filename) + + line = clean_lines.lines[linenum] + + # "include" should use the new style "foo/bar.h" instead of just "bar.h" + if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line): + error(filename, linenum, 'build/include', 4, + 'Include the directory when naming .h files') + + # we shouldn't include a file more than once. actually, there are a + # handful of instances where doing so is okay, but in general it's + # not. + match = _RE_PATTERN_INCLUDE.search(line) + if match: + include = match.group(2) + is_system = (match.group(1) == '<') + if include in include_state: + error(filename, linenum, 'build/include', 4, + '"%s" already included at %s:%s' % + (include, filename, include_state[include])) + else: + include_state[include] = linenum + + # We want to ensure that headers appear in the right order: + # 1) for foo.cc, foo.h (preferred location) + # 2) c system files + # 3) cpp system files + # 4) for foo.cc, foo.h (deprecated location) + # 5) other google headers + # + # We classify each include statement as one of those 5 types + # using a number of techniques. The include_state object keeps + # track of the highest type seen, and complains if we see a + # lower type after that. + error_message = include_state.CheckNextIncludeOrder( + _ClassifyInclude(fileinfo, include, is_system)) + if error_message: + error(filename, linenum, 'build/include_order', 4, + '%s. Should be: %s.h, c system, c++ system, other.' 
%
+              (error_message, fileinfo.BaseName()))
+    canonical_include = include_state.CanonicalizeAlphabeticalOrder(include)
+    if not include_state.IsInAlphabeticalOrder(
+        clean_lines, linenum, canonical_include):
+      error(filename, linenum, 'build/include_alpha', 4,
+            'Include "%s" not in alphabetical order' % include)
+    include_state.SetLastHeader(canonical_include)
+
+  # Look for any of the stream classes that are part of standard C++.
+  match = _RE_PATTERN_INCLUDE.match(line)
+  if match:
+    include = match.group(2)
+    if Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include):
+      # Many unit tests use cout, so we exempt them.
+      if not _IsTestFilename(filename):
+        error(filename, linenum, 'readability/streams', 3,
+              'Streams are highly discouraged.')
+
+
+def _GetTextInside(text, start_pattern):
+  r"""Retrieves all the text between matching open and close parentheses.
+
+  Given a string of lines and a regular expression string, retrieve all the
+  text following the expression and between opening punctuation symbols like
+  (, [, or {, and the matching close-punctuation symbol.  This properly
+  handles nested occurrences of the punctuations, so for the text like
+    printf(a(), b(c()));
+  a call to _GetTextInside(text, r'printf\(') will return 'a(), b(c())'.
+  start_pattern must match a string having an open punctuation symbol at
+  the end.
+
+  Args:
+    text: The lines to extract text.  Its comments and strings must be elided.
+          It can be a single line or span multiple lines.
+    start_pattern: The regexp string indicating where to start extracting
+                   the text.
+  Returns:
+    The extracted text.
+    None if either the opening string or ending punctuation could not be found.
+  """
+  # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably
+  # rewritten to use _GetTextInside (and use inferior regexp matching today).
+
+  # Map opening punctuations to the matching close-punctuations.
+  matching_punctuation = {'(': ')', '{': '}', '[': ']'}
+  closing_punctuation = set(matching_punctuation.itervalues())
+
+  # Find the position to start extracting text.
+  match = re.search(start_pattern, text, re.M)
+  if not match:  # start_pattern not found in text.
+    return None
+  start_position = match.end(0)
+
+  assert start_position > 0, (
+      'start_pattern must end with an opening punctuation.')
+  assert text[start_position - 1] in matching_punctuation, (
+      'start_pattern must end with an opening punctuation.')
+  # Stack of closing punctuations we expect to have in text after position.
+  punctuation_stack = [matching_punctuation[text[start_position - 1]]]
+  position = start_position
+  while punctuation_stack and position < len(text):
+    if text[position] == punctuation_stack[-1]:
+      punctuation_stack.pop()
+    elif text[position] in closing_punctuation:
+      # A closing punctuation without matching opening punctuations.
+      return None
+    elif text[position] in matching_punctuation:
+      punctuation_stack.append(matching_punctuation[text[position]])
+    position += 1
+  if punctuation_stack:
+    # Opening punctuations left without matching close-punctuations.
+    return None
+  # punctuations match.
+  return text[start_position:position - 1]
+
+
+# Patterns for matching call-by-reference parameters.
+#
+# Supports nested templates up to 2 levels deep using this messy pattern:
+#   < (?: < (?: < [^<>]*
+#             >
+#           | [^<>] )*
+#         >
+#       | [^<>] )*
+#   >
+_RE_PATTERN_IDENT = r'[_a-zA-Z]\w*'  # =~ [[:alpha:]][[:alnum:]]*
+_RE_PATTERN_TYPE = (
+    r'(?:const\s+)?(?:typename\s+|class\s+|struct\s+|union\s+|enum\s+)?'
+ r'(?:\w|' + r'\s*<(?:<(?:<[^<>]*>|[^<>])*>|[^<>])*>|' + r'::)+') +# A call-by-reference parameter ends with '& identifier'. +_RE_PATTERN_REF_PARAM = re.compile( + r'(' + _RE_PATTERN_TYPE + r'(?:\s*(?:\bconst\b|[*]))*\s*' + r'&\s*' + _RE_PATTERN_IDENT + r')\s*(?:=[^,()]+)?[,)]') +# A call-by-const-reference parameter either ends with 'const& identifier' +# or looks like 'const type& identifier' when 'type' is atomic. +_RE_PATTERN_CONST_REF_PARAM = ( + r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT + + r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')') + + +def CheckLanguage(filename, clean_lines, linenum, file_extension, + include_state, nesting_state, error): + """Checks rules from the 'C++ language rules' section of cppguide.html. + + Some of these rules are hard to test (function overloading, using + uint32 inappropriately), but we do the best we can. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + file_extension: The extension (without the dot) of the filename. + include_state: An _IncludeState instance in which the headers are inserted. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + # If the line is empty or consists of entirely a comment, no need to + # check it. + line = clean_lines.elided[linenum] + if not line: + return + + match = _RE_PATTERN_INCLUDE.search(line) + if match: + CheckIncludeLine(filename, clean_lines, linenum, include_state, error) + return + + # Reset include state across preprocessor directives. This is meant + # to silence warnings for conditional includes. + if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line): + include_state.ResetSection() + + # Make Windows paths like Unix. + fullname = os.path.abspath(filename).replace('\\', '/') + + # TODO(unknown): figure out if they're using default arguments in fn proto. + + # Check to see if they're using an conversion function cast. + # I just try to capture the most common basic types, though there are more. + # Parameterless conversion functions, such as bool(), are allowed as they are + # probably a member operator declaration or default constructor. + match = Search( + r'(\bnew\s+)?\b' # Grab 'new' operator, if it's there + r'(int|float|double|bool|char|int32|uint32|int64|uint64)' + r'(\([^)].*)', line) + if match: + matched_new = match.group(1) + matched_type = match.group(2) + matched_funcptr = match.group(3) + + # gMock methods are defined using some variant of MOCK_METHODx(name, type) + # where type may be float(), int(string), etc. Without context they are + # virtually indistinguishable from int(x) casts. Likewise, gMock's + # MockCallback takes a template parameter of the form return_type(arg_type), + # which looks much like the cast we're trying to detect. + # + # std::function<> wrapper has a similar problem. + # + # Return types for function pointers also look like casts if they + # don't have an extra space. 
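+    # For example (hypothetical inputs), "int(*func)(int);" is accepted as
+    # a function-pointer declaration by the pattern below, while a bare
+    # "int(x)" still draws the deprecated-cast warning.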
+ if (matched_new is None and # If new operator, then this isn't a cast + not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or + Search(r'\bMockCallback<.*>', line) or + Search(r'\bstd::function<.*>', line)) and + not (matched_funcptr and + Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(', + matched_funcptr))): + # Try a bit harder to catch gmock lines: the only place where + # something looks like an old-style cast is where we declare the + # return type of the mocked method, and the only time when we + # are missing context is if MOCK_METHOD was split across + # multiple lines. The missing MOCK_METHOD is usually one or two + # lines back, so scan back one or two lines. + # + # It's not possible for gmock macros to appear in the first 2 + # lines, since the class head + section name takes up 2 lines. + if (linenum < 2 or + not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$', + clean_lines.elided[linenum - 1]) or + Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$', + clean_lines.elided[linenum - 2]))): + error(filename, linenum, 'readability/casting', 4, + 'Using deprecated casting style. ' + 'Use static_cast<%s>(...) instead' % + matched_type) + + CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], + 'static_cast', + r'\((int|float|double|bool|char|u?int(16|32|64))\)', error) + + # This doesn't catch all cases. Consider (const char * const)"hello". + # + # (char *) "foo" should always be a const_cast (reinterpret_cast won't + # compile). + if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], + 'const_cast', r'\((char\s?\*+\s?)\)\s*"', error): + pass + else: + # Check pointer casts for other than string constants + CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], + 'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error) + + # In addition, we look for people taking the address of a cast. This + # is dangerous -- casts can assign to temporaries, so the pointer doesn't + # point where you think. + match = Search( + r'(?:&\(([^)]+)\)[\w(])|' + r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line) + if match and match.group(1) != '*': + error(filename, linenum, 'runtime/casting', 4, + ('Are you taking an address of a cast? ' + 'This is dangerous: could be a temp var. ' + 'Take the address before doing the cast, rather than after')) + + # Create an extended_line, which is the concatenation of the current and + # next lines, for more effective checking of code that may span more than one + # line. + if linenum + 1 < clean_lines.NumLines(): + extended_line = line + clean_lines.elided[linenum + 1] + else: + extended_line = line + + # Check for people declaring static/global STL strings at the top level. + # This is dangerous because the C++ language does not guarantee that + # globals with constructors are initialized before the first access. + match = Match( + r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)', + line) + # Make sure it's not a function. + # Function template specialization looks like: "string foo(...". + # Class template definitions look like: "string Foo::Method(...". + # + # Also ignore things that look like operators. These are matched separately + # because operator names cross non-word boundaries. If we change the pattern + # above, we would decrease the accuracy of matching identifiers. 
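+  # For example (hypothetical inputs), 'static const string kName = "x";'
+  # is flagged below, while 'string Foo::Method(' looks like a function
+  # declaration and is skipped.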
+ if (match and + not Search(r'\boperator\W', line) and + not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))): + error(filename, linenum, 'runtime/string', 4, + 'For a static/global string constant, use a C style string instead: ' + '"%schar %s[]".' % + (match.group(1), match.group(2))) + + if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line): + error(filename, linenum, 'runtime/init', 4, + 'You seem to be initializing a member variable with itself.') + + if file_extension == 'h': + # TODO(unknown): check that 1-arg constructors are explicit. + # How to tell it's a constructor? + # (handled in CheckForNonStandardConstructs for now) + # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS + # (level 1 error) + pass + + # Check if people are using the verboten C basic types. The only exception + # we regularly allow is "unsigned short port" for port. + if Search(r'\bshort port\b', line): + if not Search(r'\bunsigned short port\b', line): + error(filename, linenum, 'runtime/int', 4, + 'Use "unsigned short" for ports, not "short"') + else: + match = Search(r'\b(short|long(?! +double)|long long)\b', line) + if match: + error(filename, linenum, 'runtime/int', 4, + 'Use int16/int64/etc, rather than the C type %s' % match.group(1)) + + # When snprintf is used, the second argument shouldn't be a literal. + match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) + if match and match.group(2) != '0': + # If 2nd arg is zero, snprintf is used to calculate size. + error(filename, linenum, 'runtime/printf', 3, + 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' + 'to snprintf.' % (match.group(1), match.group(2))) + + # Check if some verboten C functions are being used. + if Search(r'\bsprintf\b', line): + error(filename, linenum, 'runtime/printf', 5, + 'Never use sprintf. Use snprintf instead.') + match = Search(r'\b(strcpy|strcat)\b', line) + if match: + error(filename, linenum, 'runtime/printf', 4, + 'Almost always, snprintf is better than %s' % match.group(1)) + + # Check if some verboten operator overloading is going on + # TODO(unknown): catch out-of-line unary operator&: + # class X {}; + # int operator&(const X& x) { return 42; } // unary operator& + # The trick is it's hard to tell apart from binary operator&: + # class Y { int operator&(const Y& x) { return 23; } }; // binary operator& + if Search(r'\boperator\s*&\s*\(\s*\)', line): + error(filename, linenum, 'runtime/operator', 4, + 'Unary operator& is dangerous. Do not use it.') + + # Check for suspicious usage of "if" like + # } if (a == b) { + if Search(r'\}\s*if\s*\(', line): + error(filename, linenum, 'readability/braces', 4, + 'Did you mean "else if"? If not, start a new line for "if".') + + # Check for potential format string bugs like printf(foo). + # We constrain the pattern not to pick things like DocidForPrintf(foo). + # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str()) + # TODO(sugawarayu): Catch the following case. Need to change the calling + # convention of the whole function to process multiple line to handle it. + # printf( + # boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line); + printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(') + if printf_args: + match = Match(r'([\w.\->()]+)$', printf_args) + if match and match.group(1) != '__VA_ARGS__': + function_name = re.search(r'\b((?:string)?printf)\s*\(', + line, re.I).group(1) + error(filename, linenum, 'runtime/printf', 4, + 'Potential format string bug. Do %s("%%s", %s) instead.' 
+ % (function_name, match.group(1))) + + # Check for potential memset bugs like memset(buf, sizeof(buf), 0). + match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line) + if match and not Match(r"^''|-?[0-9]+|0x[0-9A-Fa-f]$", match.group(2)): + error(filename, linenum, 'runtime/memset', 4, + 'Did you mean "memset(%s, 0, %s)"?' + % (match.group(1), match.group(2))) + + if Search(r'\busing namespace\b', line): + error(filename, linenum, 'build/namespaces', 5, + 'Do not use namespace using-directives. ' + 'Use using-declarations instead.') + + # Detect variable-length arrays. + match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line) + if (match and match.group(2) != 'return' and match.group(2) != 'delete' and + match.group(3).find(']') == -1): + # Split the size using space and arithmetic operators as delimiters. + # If any of the resulting tokens are not compile time constants then + # report the error. + tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>]', match.group(3)) + is_const = True + skip_next = False + for tok in tokens: + if skip_next: + skip_next = False + continue + + if Search(r'sizeof\(.+\)', tok): continue + if Search(r'arraysize\(\w+\)', tok): continue + + tok = tok.lstrip('(') + tok = tok.rstrip(')') + if not tok: continue + if Match(r'\d+', tok): continue + if Match(r'0[xX][0-9a-fA-F]+', tok): continue + if Match(r'k[A-Z0-9]\w*', tok): continue + if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue + if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue + # A catch all for tricky sizeof cases, including 'sizeof expression', + # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)' + # requires skipping the next token because we split on ' ' and '*'. + if tok.startswith('sizeof'): + skip_next = True + continue + is_const = False + break + if not is_const: + error(filename, linenum, 'runtime/arrays', 1, + 'Do not use variable-length arrays. Use an appropriately named ' + "('k' followed by CamelCase) compile-time constant for the size.") + + # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or + # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing + # in the class declaration. + match = Match( + (r'\s*' + r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))' + r'\(.*\);$'), + line) + if match and linenum + 1 < clean_lines.NumLines(): + next_line = clean_lines.elided[linenum + 1] + # We allow some, but not all, declarations of variables to be present + # in the statement that defines the class. The [\w\*,\s]* fragment of + # the regular expression below allows users to declare instances of + # the class or pointers to instances, but not less common types such + # as function pointers or arrays. It's a tradeoff between allowing + # reasonable code and avoiding trying to parse more C++ using regexps. + if not Search(r'^\s*}[\w\*,\s]*;', next_line): + error(filename, linenum, 'readability/constructors', 3, + match.group(1) + ' should be the last thing in the class') + + # Check for use of unnamed namespaces in header files. Registration + # macros are typically OK, so we allow use of "namespace {" on lines + # that end with backslashes. + if (file_extension == 'h' + and Search(r'\bnamespace\s*{', line) + and line[-1] != '\\'): + error(filename, linenum, 'build/namespaces', 4, + 'Do not use unnamed namespaces in header files. 
See '
+          'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
+          ' for more information.')
+
+
+def CheckForNonConstReference(filename, clean_lines, linenum,
+                              nesting_state, error):
+  """Check for non-const references.
+
+  Separate from CheckLanguage since it scans backwards from current
+  line, instead of scanning forward.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  # Do nothing if there is no '&' on current line.
+  line = clean_lines.elided[linenum]
+  if '&' not in line:
+    return
+
+  # Long type names may be broken across multiple lines, usually in one
+  # of these forms:
+  #   LongType
+  #       ::LongTypeContinued &identifier
+  #   LongType::
+  #       LongTypeContinued &identifier
+  #   LongType<
+  #       ...>::LongTypeContinued &identifier
+  #
+  # If we detected a type split across two lines, join the previous
+  # line to current line so that we can match const references
+  # accordingly.
+  #
+  # Note that this only scans back one line, since scanning back
+  # arbitrary number of lines would be expensive. If you have a type
+  # that spans more than 2 lines, please use a typedef.
+  if linenum > 1:
+    previous = None
+    if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line):
+      # previous_line\n + ::current_line
+      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$',
+                        clean_lines.elided[linenum - 1])
+    elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line):
+      # previous_line::\n + current_line
+      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$',
+                        clean_lines.elided[linenum - 1])
+    if previous:
+      line = previous.group(1) + line.lstrip()
+    else:
+      # Check for templated parameter that is split across multiple lines
+      endpos = line.rfind('>')
+      if endpos > -1:
+        (_, startline, startpos) = ReverseCloseExpression(
+            clean_lines, linenum, endpos)
+        if startpos > -1 and startline < linenum:
+          # Found the matching < on an earlier line, collect all
+          # pieces up to current line.
+          line = ''
+          for i in xrange(startline, linenum + 1):
+            line += clean_lines.elided[i].strip()
+
+  # Check for non-const references in function parameters. A single '&' may
+  # be found in the following places:
+  #   inside expression: binary & for bitwise AND
+  #   inside expression: unary & for taking the address of something
+  #   inside declarators: reference parameter
+  # We will exclude the first two cases by checking that we are not inside a
+  # function body, including one that was just introduced by a trailing '{'.
+  # TODO(unknown): Doesn't account for preprocessor directives.
+  # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare].
+  check_params = False
+  if not nesting_state.stack:
+    check_params = True  # top level
+  elif (isinstance(nesting_state.stack[-1], _ClassInfo) or
+        isinstance(nesting_state.stack[-1], _NamespaceInfo)):
+    check_params = True  # within class or namespace
+  elif Match(r'.*{\s*$', line):
+    if (len(nesting_state.stack) == 1 or
+        isinstance(nesting_state.stack[-2], _ClassInfo) or
+        isinstance(nesting_state.stack[-2], _NamespaceInfo)):
+      check_params = True  # just opened global/class/namespace block
+  # We allow non-const references in a few standard places, like functions
+  # called "swap()" or iostream operators like "<<" or ">>". Do not check
+  # those function parameters.
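In essence, the parameter check below splits out each declarator, keeps the ones containing '&', and flags those not marked const. A deliberately simplified Python sketch of that idea (the real patterns above also handle multi-line and templated types; the function name is hypothetical):

def find_nonconst_refs(decl):
    # Naive split on ',' -- template arguments containing commas would
    # need the real parsing machinery above.
    params = decl[decl.index('(') + 1:decl.rindex(')')]
    flagged = []
    for param in (p.strip() for p in params.split(',')):
        if '&' in param and not param.startswith('const '):
            flagged.append(param)
    return flagged

print(find_nonconst_refs('void f(const string &a, int *b, Buffer &out)'))
# -> ['Buffer &out']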
+ # + # We also accept & in static_assert, which looks like a function but + # it's actually a declaration expression. + whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|' + r'operator\s*[<>][<>]|' + r'static_assert|COMPILE_ASSERT' + r')\s*\(') + if Search(whitelisted_functions, line): + check_params = False + elif not Search(r'\S+\([^)]*$', line): + # Don't see a whitelisted function on this line. Actually we + # didn't see any function name on this line, so this is likely a + # multi-line parameter list. Try a bit harder to catch this case. + for i in xrange(2): + if (linenum > i and + Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])): + check_params = False + break + + if check_params: + decls = ReplaceAll(r'{[^}]*}', ' ', line) # exclude function body + for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls): + if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter): + error(filename, linenum, 'runtime/references', 2, + 'Is this a non-const reference? ' + 'If so, make const or use a pointer: ' + + ReplaceAll(' *<', '<', parameter)) + + +def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern, + error): + """Checks for a C-style cast by looking for the pattern. + + Args: + filename: The name of the current file. + linenum: The number of the line to check. + line: The line of code to check. + raw_line: The raw line of code to check, with comments. + cast_type: The string for the C++ cast to recommend. This is either + reinterpret_cast, static_cast, or const_cast, depending. + pattern: The regular expression used to find C-style casts. + error: The function to call with any errors found. + + Returns: + True if an error was emitted. + False otherwise. + """ + match = Search(pattern, line) + if not match: + return False + + # e.g., sizeof(int) + sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1]) + if sizeof_match: + error(filename, linenum, 'runtime/sizeof', 1, + 'Using sizeof(type). Use sizeof(varname) instead if possible') + return True + + # operator++(int) and operator--(int) + if (line[0:match.start(1) - 1].endswith(' operator++') or + line[0:match.start(1) - 1].endswith(' operator--')): + return False + + # A single unnamed argument for a function tends to look like old + # style cast. If we see those, don't issue warnings for deprecated + # casts, instead issue warnings for unnamed arguments where + # appropriate. + # + # These are things that we want warnings for, since the style guide + # explicitly require all parameters to be named: + # Function(int); + # Function(int) { + # ConstMember(int) const; + # ConstMember(int) const { + # ExceptionMember(int) throw (...); + # ExceptionMember(int) throw (...) { + # PureVirtual(int) = 0; + # + # These are functions of some sort, where the compiler would be fine + # if they had named parameters, but people often omit those + # identifiers to reduce clutter: + # (FunctionPointer)(int); + # (FunctionPointer)(int) = value; + # Function((function_pointer_arg)(int)) + # ; + # <(FunctionPointerTemplateArgument)(int)>; + remainder = line[match.end(0):] + if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder): + # Looks like an unnamed parameter. + + # Don't warn on any kind of template arguments. + if Match(r'^\s*>', remainder): + return False + + # Don't warn on assignments to function pointers, but keep warnings for + # unnamed parameters to pure virtual functions. 
Note that this pattern + # will also pass on assignments of "0" to function pointers, but the + # preferred values for those would be "nullptr" or "NULL". + matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder) + if matched_zero and matched_zero.group(1) != '0': + return False + + # Don't warn on function pointer declarations. For this we need + # to check what came before the "(type)" string. + if Match(r'.*\)\s*$', line[0:match.start(0)]): + return False + + # Don't warn if the parameter is named with block comments, e.g.: + # Function(int /*unused_param*/); + if '/*' in raw_line: + return False + + # Passed all filters, issue warning here. + error(filename, linenum, 'readability/function', 3, + 'All parameters should be named in a function') + return True + + # At this point, all that should be left is actual casts. + error(filename, linenum, 'readability/casting', 4, + 'Using C-style cast. Use %s<%s>(...) instead' % + (cast_type, match.group(1))) + + return True + + +_HEADERS_CONTAINING_TEMPLATES = ( + ('', ('deque',)), + ('', ('unary_function', 'binary_function', + 'plus', 'minus', 'multiplies', 'divides', 'modulus', + 'negate', + 'equal_to', 'not_equal_to', 'greater', 'less', + 'greater_equal', 'less_equal', + 'logical_and', 'logical_or', 'logical_not', + 'unary_negate', 'not1', 'binary_negate', 'not2', + 'bind1st', 'bind2nd', + 'pointer_to_unary_function', + 'pointer_to_binary_function', + 'ptr_fun', + 'mem_fun_t', 'mem_fun', 'mem_fun1_t', 'mem_fun1_ref_t', + 'mem_fun_ref_t', + 'const_mem_fun_t', 'const_mem_fun1_t', + 'const_mem_fun_ref_t', 'const_mem_fun1_ref_t', + 'mem_fun_ref', + )), + ('', ('numeric_limits',)), + ('', ('list',)), + ('', ('map', 'multimap',)), + ('', ('allocator',)), + ('', ('queue', 'priority_queue',)), + ('', ('set', 'multiset',)), + ('', ('stack',)), + ('', ('char_traits', 'basic_string',)), + ('', ('pair',)), + ('', ('vector',)), + + # gcc extensions. + # Note: std::hash is their hash, ::hash is our hash + ('', ('hash_map', 'hash_multimap',)), + ('', ('hash_set', 'hash_multiset',)), + ('', ('slist',)), + ) + +_RE_PATTERN_STRING = re.compile(r'\bstring\b') + +_re_pattern_algorithm_header = [] +for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap', + 'transform'): + # Match max(..., ...), max(..., ...), but not foo->max, foo.max or + # type::max(). + _re_pattern_algorithm_header.append( + (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'), + _template, + '')) + +_re_pattern_templates = [] +for _header, _templates in _HEADERS_CONTAINING_TEMPLATES: + for _template in _templates: + _re_pattern_templates.append( + (re.compile(r'(\<|\b)' + _template + r'\s*\<'), + _template + '<>', + _header)) + + +def FilesBelongToSameModule(filename_cc, filename_h): + """Check if these two filenames belong to the same module. + + The concept of a 'module' here is a as follows: + foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc belong to the + same 'module' if they are in the same directory. + some/path/public/xyzzy and some/path/internal/xyzzy are also considered + to belong to the same module here. + + If the filename_cc contains a longer path than the filename_h, for example, + '/absolute/path/to/base/sysinfo.cc', and this file would include + 'base/sysinfo.h', this function also produces the prefix needed to open the + header. This is used by the caller of this function to more robustly open the + header file. We don't have access to the real include paths in this context, + so we need this guesswork here. 
+
+  Known bugs: tools/base/bar.cc and base/bar.h belong to the same module
+  according to this implementation. Because of this, this function gives
+  some false positives. This should be sufficiently rare in practice.
+
+  Args:
+    filename_cc: is the path for the .cc file
+    filename_h: is the path for the header file
+
+  Returns:
+    Tuple with a bool and a string:
+    bool: True if filename_cc and filename_h belong to the same module.
+    string: the additional prefix needed to open the header file.
+  """
+
+  if not filename_cc.endswith('.cc'):
+    return (False, '')
+  filename_cc = filename_cc[:-len('.cc')]
+  if filename_cc.endswith('_unittest'):
+    filename_cc = filename_cc[:-len('_unittest')]
+  elif filename_cc.endswith('_test'):
+    filename_cc = filename_cc[:-len('_test')]
+  filename_cc = filename_cc.replace('/public/', '/')
+  filename_cc = filename_cc.replace('/internal/', '/')
+
+  if not filename_h.endswith('.h'):
+    return (False, '')
+  filename_h = filename_h[:-len('.h')]
+  if filename_h.endswith('-inl'):
+    filename_h = filename_h[:-len('-inl')]
+  filename_h = filename_h.replace('/public/', '/')
+  filename_h = filename_h.replace('/internal/', '/')
+
+  files_belong_to_same_module = filename_cc.endswith(filename_h)
+  common_path = ''
+  if files_belong_to_same_module:
+    common_path = filename_cc[:-len(filename_h)]
+  return files_belong_to_same_module, common_path
+
+
+def UpdateIncludeState(filename, include_state, io=codecs):
+  """Fill up the include_state with new includes found from the file.
+
+  Args:
+    filename: the name of the header to read.
+    include_state: an _IncludeState instance in which the headers are inserted.
+    io: The io factory to use to read the file. Provided for testability.
+
+  Returns:
+    True if a header was successfully added. False otherwise.
+  """
+  headerfile = None
+  try:
+    headerfile = io.open(filename, 'r', 'utf8', 'replace')
+  except IOError:
+    return False
+  linenum = 0
+  for line in headerfile:
+    linenum += 1
+    clean_line = CleanseComments(line)
+    match = _RE_PATTERN_INCLUDE.search(clean_line)
+    if match:
+      include = match.group(2)
+      # The value formatting is cute, but not really used right now.
+      # What matters here is that the key is in include_state.
+      include_state.setdefault(include, '%s:%d' % (filename, linenum))
+  return True
+
+
+def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
+                              io=codecs):
+  """Reports missing STL includes.
+
+  This function will output warnings to make sure you are including the headers
+  necessary for the stl containers and functions that you use. We only give one
+  reason to include a header. For example, if you use both equal_to<> and
+  less<> in a .h file, only one (the latter in the file) of these will be
+  reported as a reason to include the <functional>.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    include_state: An _IncludeState instance.
+    error: The function to call with any errors found.
+    io: The IO factory to use to read the header file. Provided for unittest
+        injection.
+  """
+  required = {}  # A map of header name to linenumber and the template entity.
+                 # Example of required: { '<functional>': (1219, 'less<>') }
+
+  for linenum in xrange(clean_lines.NumLines()):
+    line = clean_lines.elided[linenum]
+    if not line or line[0] == '#':
+      continue
+
+    # String is special -- it is a non-templatized type in STL.
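For intuition, here is how FilesBelongToSameModule (defined above) behaves on a few hypothetical paths:

print(FilesBelongToSameModule('base/sysinfo_test.cc', 'base/sysinfo.h'))
# -> (True, '')
print(FilesBelongToSameModule('/absolute/path/to/base/sysinfo.cc',
                              'base/sysinfo.h'))
# -> (True, '/absolute/path/to/')
print(FilesBelongToSameModule('foo/bar.cc', 'foo/baz.h'))
# -> (False, '')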
+    matched = _RE_PATTERN_STRING.search(line)
+    if matched:
+      # Don't warn about strings in non-STL namespaces:
+      # (We check only the first match per line; good enough.)
+      prefix = line[:matched.start()]
+      if prefix.endswith('std::') or not prefix.endswith('::'):
+        required['<string>'] = (linenum, 'string')
+
+    for pattern, template, header in _re_pattern_algorithm_header:
+      if pattern.search(line):
+        required[header] = (linenum, template)
+
+    # The following function is just a speed up, no semantics are changed.
+    if not '<' in line:  # Reduces the cpu time usage by skipping lines.
+      continue
+
+    for pattern, template, header in _re_pattern_templates:
+      if pattern.search(line):
+        required[header] = (linenum, template)
+
+  # The policy is that if you #include something in foo.h you don't need to
+  # include it again in foo.cc. Here, we will look at possible includes.
+  # Let's copy the include_state so it is only messed up within this function.
+  include_state = include_state.copy()
+
+  # Did we find the header for this file (if any) and successfully load it?
+  header_found = False
+
+  # Use the absolute path so that matching works properly.
+  abs_filename = FileInfo(filename).FullName()
+
+  # For Emacs's flymake.
+  # If cpplint is invoked from Emacs's flymake, a temporary file is generated
+  # by flymake and that file name might end with '_flymake.cc'. In that case,
+  # restore original file name here so that the corresponding header file can be
+  # found.
+  # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h'
+  # instead of 'foo_flymake.h'
+  abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)
+
+  # include_state is modified during iteration, so we iterate over a copy of
+  # the keys.
+  header_keys = include_state.keys()
+  for header in header_keys:
+    (same_module, common_path) = FilesBelongToSameModule(abs_filename, header)
+    fullpath = common_path + header
+    if same_module and UpdateIncludeState(fullpath, include_state, io):
+      header_found = True
+
+  # If we can't find the header file for a .cc, assume it's because we don't
+  # know where to look. In that case we'll give up as we're not sure they
+  # didn't include it in the .h file.
+  # TODO(unknown): Do a better job of finding .h files so we are confident that
+  # not having the .h file means there isn't one.
+  if filename.endswith('.cc') and not header_found:
+    return
+
+  # All the lines have been processed, report the errors found.
+  for required_header_unstripped in required:
+    template = required[required_header_unstripped][1]
+    if required_header_unstripped.strip('<>"') not in include_state:
+      error(filename, required[required_header_unstripped][0],
+            'build/include_what_you_use', 4,
+            'Add #include ' + required_header_unstripped + ' for ' + template)
+
+
+_RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<')
+
+
+def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
+  """Check that make_pair's template arguments are deduced.
+
+  G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are
+  specified explicitly, and such use isn't intended in any case.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+ """ + line = clean_lines.elided[linenum] + match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line) + if match: + error(filename, linenum, 'build/explicit_make_pair', + 4, # 4 = high confidence + 'For C++11-compatibility, omit template arguments from make_pair' + ' OR use pair directly OR if appropriate, construct a pair directly') + + +def ProcessLine(filename, file_extension, clean_lines, line, + include_state, function_state, nesting_state, error, + extra_check_functions=[]): + """Processes a single line in the file. + + Args: + filename: Filename of the file that is being processed. + file_extension: The extension (dot not included) of the file. + clean_lines: An array of strings, each representing a line of the file, + with comments stripped. + line: Number of line being processed. + include_state: An _IncludeState instance in which the headers are inserted. + function_state: A _FunctionState instance which counts function lines, etc. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: A callable to which errors are reported, which takes 4 arguments: + filename, line number, error level, and message + extra_check_functions: An array of additional check functions that will be + run on each source line. Each function takes 4 + arguments: filename, clean_lines, line, error + """ + raw_lines = clean_lines.raw_lines + ParseNolintSuppressions(filename, raw_lines[line], line, error) + nesting_state.Update(filename, clean_lines, line, error) + if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM: + return + CheckForFunctionLengths(filename, clean_lines, line, function_state, error) + CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error) + CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error) + CheckLanguage(filename, clean_lines, line, file_extension, include_state, + nesting_state, error) + CheckForNonConstReference(filename, clean_lines, line, nesting_state, error) + CheckForNonStandardConstructs(filename, clean_lines, line, + nesting_state, error) + CheckVlogArguments(filename, clean_lines, line, error) + CheckPosixThreading(filename, clean_lines, line, error) + CheckInvalidIncrement(filename, clean_lines, line, error) + CheckMakePairUsesDeduction(filename, clean_lines, line, error) + for check_fn in extra_check_functions: + check_fn(filename, clean_lines, line, error) + +def ProcessFileData(filename, file_extension, lines, error, + extra_check_functions=[]): + """Performs lint checks and reports any errors to the given error function. + + Args: + filename: Filename of the file that is being processed. + file_extension: The extension (dot not included) of the file. + lines: An array of strings, each representing a line of the file, with the + last element being empty if the file is terminated with a newline. + error: A callable to which errors are reported, which takes 4 arguments: + filename, line number, error level, and message + extra_check_functions: An array of additional check functions that will be + run on each source line. 
Each function takes 4 + arguments: filename, clean_lines, line, error + """ + lines = (['// marker so line numbers and indices both start at 1'] + lines + + ['// marker so line numbers end in a known way']) + + include_state = _IncludeState() + function_state = _FunctionState() + nesting_state = _NestingState() + + ResetNolintSuppressions() + + CheckForCopyright(filename, lines, error) + + if file_extension == 'h': + CheckForHeaderGuard(filename, lines, error) + + RemoveMultiLineComments(filename, lines, error) + clean_lines = CleansedLines(lines) + for line in xrange(clean_lines.NumLines()): + ProcessLine(filename, file_extension, clean_lines, line, + include_state, function_state, nesting_state, error, + extra_check_functions) + nesting_state.CheckCompletedBlocks(filename, error) + + CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error) + + # We check here rather than inside ProcessLine so that we see raw + # lines rather than "cleaned" lines. + CheckForBadCharacters(filename, lines, error) + + CheckForNewlineAtEOF(filename, lines, error) + +def ProcessFile(filename, vlevel, extra_check_functions=[]): + """Does google-lint on a single file. + + Args: + filename: The name of the file to parse. + + vlevel: The level of errors to report. Every error of confidence + >= verbose_level will be reported. 0 is a good default. + + extra_check_functions: An array of additional check functions that will be + run on each source line. Each function takes 4 + arguments: filename, clean_lines, line, error + """ + + _SetVerboseLevel(vlevel) + + try: + # Support the UNIX convention of using "-" for stdin. Note that + # we are not opening the file with universal newline support + # (which codecs doesn't support anyway), so the resulting lines do + # contain trailing '\r' characters if we are reading a file that + # has CRLF endings. + # If after the split a trailing '\r' is present, it is removed + # below. If it is not expected to be present (i.e. os.linesep != + # '\r\n' as in Windows), a warning is issued below if this file + # is processed. + + if filename == '-': + lines = codecs.StreamReaderWriter(sys.stdin, + codecs.getreader('utf8'), + codecs.getwriter('utf8'), + 'replace').read().split('\n') + else: + lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n') + + carriage_return_found = False + # Remove trailing '\r'. + for linenum in range(len(lines)): + if lines[linenum].endswith('\r'): + lines[linenum] = lines[linenum].rstrip('\r') + carriage_return_found = True + + except IOError: + sys.stderr.write( + "Skipping input '%s': Can't open for reading\n" % filename) + return + + # Note, if no dot is found, this will give the entire filename as the ext. + file_extension = filename[filename.rfind('.') + 1:] + + # When reading from stdin, the extension is unknown, so no cpplint tests + # should rely on the extension. + if filename != '-' and file_extension not in _valid_extensions: + sys.stderr.write('Ignoring %s; not a valid file name ' + '(%s)\n' % (filename, ', '.join(_valid_extensions))) + else: + ProcessFileData(filename, file_extension, lines, Error, + extra_check_functions) + if carriage_return_found and os.linesep != '\r\n': + # Use 0 for linenum since outputting only one error for potentially + # several lines. 
+    Error(filename, 0, 'whitespace/newline', 1,
+          'One or more unexpected \\r (^M) found; '
+          'better to use only a \\n')
+
+  sys.stderr.write('Done processing %s\n' % filename)
+
+
+def PrintUsage(message):
+  """Prints a brief usage string and exits, optionally with an error message.
+
+  Args:
+    message: The optional error message.
+  """
+  sys.stderr.write(_USAGE)
+  if message:
+    sys.exit('\nFATAL ERROR: ' + message)
+  else:
+    sys.exit(1)
+
+
+def PrintCategories():
+  """Prints a list of all the error-categories used by error messages.
+
+  These are the categories used to filter messages via --filter.
+  """
+  sys.stderr.write(''.join(' %s\n' % cat for cat in _ERROR_CATEGORIES))
+  sys.exit(0)
+
+
+def ParseArguments(args):
+  """Parses the command line arguments.
+
+  This may set the output format and verbosity level as side-effects.
+
+  Args:
+    args: The command line arguments.
+
+  Returns:
+    The list of filenames to lint.
+  """
+  try:
+    (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=',
+                                                 'counting=',
+                                                 'filter=',
+                                                 'root=',
+                                                 'linelength=',
+                                                 'extensions='])
+  except getopt.GetoptError:
+    PrintUsage('Invalid arguments.')
+
+  verbosity = _VerboseLevel()
+  output_format = _OutputFormat()
+  filters = ''
+  counting_style = ''
+
+  for (opt, val) in opts:
+    if opt == '--help':
+      PrintUsage(None)
+    elif opt == '--output':
+      if val not in ('emacs', 'vs7', 'eclipse'):
+        PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.')
+      output_format = val
+    elif opt == '--verbose':
+      verbosity = int(val)
+    elif opt == '--filter':
+      filters = val
+      if not filters:
+        PrintCategories()
+    elif opt == '--counting':
+      if val not in ('total', 'toplevel', 'detailed'):
+        PrintUsage('Valid counting options are total, toplevel, and detailed')
+      counting_style = val
+    elif opt == '--root':
+      global _root
+      _root = val
+    elif opt == '--linelength':
+      global _line_length
+      try:
+        _line_length = int(val)
+      except ValueError:
+        PrintUsage('Line length must be digits.')
+    elif opt == '--extensions':
+      global _valid_extensions
+      try:
+        _valid_extensions = set(val.split(','))
+      except ValueError:
+        PrintUsage('Extensions must be comma separated list.')
+
+  if not filenames:
+    PrintUsage('No files were specified.')
+
+  _SetOutputFormat(output_format)
+  _SetVerboseLevel(verbosity)
+  _SetFilters(filters)
+  _SetCountingStyle(counting_style)
+
+  return filenames
+
+
+def main():
+  filenames = ParseArguments(sys.argv[1:])
+
+  # Change stderr to write with replacement characters so we don't die
+  # if we try to print something containing non-ASCII characters.
+  sys.stderr = codecs.StreamReaderWriter(sys.stderr,
+                                         codecs.getreader('utf8'),
+                                         codecs.getwriter('utf8'),
+                                         'replace')
+
+  _cpplint_state.ResetErrorCounts()
+  for filename in filenames:
+    ProcessFile(filename, _cpplint_state.verbose_level)
+  _cpplint_state.PrintErrorCounts()
+
+  sys.exit(_cpplint_state.error_count > 0)
+
+
+if __name__ == '__main__':
+  main()
diff --git a/libs/libaom/src/tools/diff.py b/libs/libaom/src/tools/diff.py
new file mode 100644
index 000000000..bac6aabdc
--- /dev/null
+++ b/libs/libaom/src/tools/diff.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software.
If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +"""Classes for representing diff pieces.""" + +__author__ = "jkoleszar@google.com" + +import re + + +class DiffLines(object): + """A container for one half of a diff.""" + + def __init__(self, filename, offset, length): + self.filename = filename + self.offset = offset + self.length = length + self.lines = [] + self.delta_line_nums = [] + + def Append(self, line): + l = len(self.lines) + if line[0] != " ": + self.delta_line_nums.append(self.offset + l) + self.lines.append(line[1:]) + assert l+1 <= self.length + + def Complete(self): + return len(self.lines) == self.length + + def __contains__(self, item): + return item >= self.offset and item <= self.offset + self.length - 1 + + +class DiffHunk(object): + """A container for one diff hunk, consisting of two DiffLines.""" + + def __init__(self, header, file_a, file_b, start_a, len_a, start_b, len_b): + self.header = header + self.left = DiffLines(file_a, start_a, len_a) + self.right = DiffLines(file_b, start_b, len_b) + self.lines = [] + + def Append(self, line): + """Adds a line to the DiffHunk and its DiffLines children.""" + if line[0] == "-": + self.left.Append(line) + elif line[0] == "+": + self.right.Append(line) + elif line[0] == " ": + self.left.Append(line) + self.right.Append(line) + elif line[0] == "\\": + # Ignore newline messages from git diff. + pass + else: + assert False, ("Unrecognized character at start of diff line " + "%r" % line[0]) + self.lines.append(line) + + def Complete(self): + return self.left.Complete() and self.right.Complete() + + def __repr__(self): + return "DiffHunk(%s, %s, len %d)" % ( + self.left.filename, self.right.filename, + max(self.left.length, self.right.length)) + + +def ParseDiffHunks(stream): + """Walk a file-like object, yielding DiffHunks as they're parsed.""" + + file_regex = re.compile(r"(\+\+\+|---) (\S+)") + range_regex = re.compile(r"@@ -(\d+)(,(\d+))? \+(\d+)(,(\d+))?") + hunk = None + while True: + line = stream.readline() + if not line: + break + + if hunk is None: + # Parse file names + diff_file = file_regex.match(line) + if diff_file: + if line.startswith("---"): + a_line = line + a = diff_file.group(2) + continue + if line.startswith("+++"): + b_line = line + b = diff_file.group(2) + continue + + # Parse offset/lengths + diffrange = range_regex.match(line) + if diffrange: + if diffrange.group(2): + start_a = int(diffrange.group(1)) + len_a = int(diffrange.group(3)) + else: + start_a = 1 + len_a = int(diffrange.group(1)) + + if diffrange.group(5): + start_b = int(diffrange.group(4)) + len_b = int(diffrange.group(6)) + else: + start_b = 1 + len_b = int(diffrange.group(4)) + + header = [a_line, b_line, line] + hunk = DiffHunk(header, a, b, start_a, len_a, start_b, len_b) + else: + # Add the current line to the hunk + hunk.Append(line) + + # See if the whole hunk has been parsed. If so, yield it and prepare + # for the next hunk. + if hunk.Complete(): + yield hunk + hunk = None + + # Partial hunks are a parse error + assert hunk is None diff --git a/libs/libaom/src/tools/dump_obu.cc b/libs/libaom/src/tools/dump_obu.cc new file mode 100644 index 000000000..30ee5e7a1 --- /dev/null +++ b/libs/libaom/src/tools/dump_obu.cc @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <memory>
+#include <string>
+
+#include "config/aom_config.h"
+
+#include "common/ivfdec.h"
+#include "common/obudec.h"
+#include "common/tools_common.h"
+#include "common/webmdec.h"
+#include "tools/obu_parser.h"
+
+namespace {
+
+const size_t kInitialBufferSize = 100 * 1024;
+
+struct InputContext {
+  InputContext() = default;
+  ~InputContext() { free(unit_buffer); }
+
+  void Init() {
+    memset(avx_ctx, 0, sizeof(*avx_ctx));
+    memset(obu_ctx, 0, sizeof(*obu_ctx));
+    obu_ctx->avx_ctx = avx_ctx;
+#if CONFIG_WEBM_IO
+    memset(webm_ctx, 0, sizeof(*webm_ctx));
+#endif
+  }
+
+  AvxInputContext *avx_ctx = nullptr;
+  ObuDecInputContext *obu_ctx = nullptr;
+#if CONFIG_WEBM_IO
+  WebmInputContext *webm_ctx = nullptr;
+#endif
+  uint8_t *unit_buffer = nullptr;
+  size_t unit_buffer_size = 0;
+};
+
+void PrintUsage() {
+  printf("Libaom OBU dump.\nUsage: dump_obu <input_file>\n");
+}
+
+VideoFileType GetFileType(InputContext *ctx) {
+  if (file_is_ivf(ctx->avx_ctx)) return FILE_TYPE_IVF;
+  if (file_is_obu(ctx->obu_ctx)) return FILE_TYPE_OBU;
+#if CONFIG_WEBM_IO
+  if (file_is_webm(ctx->webm_ctx, ctx->avx_ctx)) return FILE_TYPE_WEBM;
+#endif
+  return FILE_TYPE_RAW;
+}
+
+bool ReadTemporalUnit(InputContext *ctx, size_t *unit_size) {
+  const VideoFileType file_type = ctx->avx_ctx->file_type;
+  switch (file_type) {
+    case FILE_TYPE_IVF: {
+      if (ivf_read_frame(ctx->avx_ctx->file, &ctx->unit_buffer, unit_size,
+                         &ctx->unit_buffer_size, NULL)) {
+        return false;
+      }
+      break;
+    }
+    case FILE_TYPE_OBU: {
+      if (obudec_read_temporal_unit(ctx->obu_ctx, &ctx->unit_buffer, unit_size,
+                                    &ctx->unit_buffer_size)) {
+        return false;
+      }
+      break;
+    }
+#if CONFIG_WEBM_IO
+    case FILE_TYPE_WEBM: {
+      if (webm_read_frame(ctx->webm_ctx, &ctx->unit_buffer, unit_size,
+                          &ctx->unit_buffer_size)) {
+        return false;
+      }
+      break;
+    }
+#endif
+    default:
+      // TODO(tomfinegan): Abuse FILE_TYPE_RAW for AV1/OBU elementary streams?
+      fprintf(stderr, "Error: Unsupported file type.\n");
+      return false;
+  }
+
+  return true;
+}
+
+}  // namespace
+
+int main(int argc, const char *argv[]) {
+  // TODO(tomfinegan): Could do with some params for verbosity.
+  if (argc < 2) {
+    PrintUsage();
+    return EXIT_SUCCESS;
+  }
+
+  const std::string filename = argv[1];
+
+  using FilePtr = std::unique_ptr<FILE, decltype(&fclose)>;
+  FilePtr input_file(fopen(filename.c_str(), "rb"), &fclose);
+  if (input_file.get() == nullptr) {
+    input_file.release();
+    fprintf(stderr, "Error: Cannot open input file.\n");
+    return EXIT_FAILURE;
+  }
+
+  AvxInputContext avx_ctx;
+  InputContext input_ctx;
+  input_ctx.avx_ctx = &avx_ctx;
+  ObuDecInputContext obu_ctx;
+  input_ctx.obu_ctx = &obu_ctx;
+#if CONFIG_WEBM_IO
+  WebmInputContext webm_ctx;
+  input_ctx.webm_ctx = &webm_ctx;
+#endif
+
+  input_ctx.Init();
+  avx_ctx.file = input_file.get();
+  avx_ctx.file_type = GetFileType(&input_ctx);
+
+  // Note: the reader utilities will realloc the buffer using realloc() etc.
+  // Can't have nice things like unique_ptr wrappers with that type of
+  // behavior underneath the function calls.
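For intuition, the FILE_TYPE_IVF branch above walks a stream of length-prefixed frames. A rough Python sketch of the same read loop (layout per the IVF container: a 32-byte file header, then a 12-byte header per frame; function name hypothetical):

import struct

def read_temporal_units(path):
    with open(path, 'rb') as f:
        f.seek(32)  # skip the 32-byte IVF file header
        while True:
            hdr = f.read(12)  # uint32 payload size + uint64 timestamp
            if len(hdr) < 12:
                return
            size, _pts = struct.unpack('<IQ', hdr)
            yield f.read(size)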
+  input_ctx.unit_buffer =
+      reinterpret_cast<uint8_t *>(calloc(kInitialBufferSize, 1));
+  if (!input_ctx.unit_buffer) {
+    fprintf(stderr, "Error: No memory, can't alloc input buffer.\n");
+    return EXIT_FAILURE;
+  }
+  input_ctx.unit_buffer_size = kInitialBufferSize;
+
+  size_t unit_size = 0;
+  int unit_number = 0;
+  int64_t obu_overhead_bytes_total = 0;
+  while (ReadTemporalUnit(&input_ctx, &unit_size)) {
+    printf("Temporal unit %d\n", unit_number);
+
+    int obu_overhead_current_unit = 0;
+    if (!aom_tools::DumpObu(input_ctx.unit_buffer, static_cast<int>(unit_size),
+                            &obu_overhead_current_unit)) {
+      fprintf(stderr, "Error: Temporal Unit parse failed on unit number %d.\n",
+              unit_number);
+      return EXIT_FAILURE;
+    }
+    printf(" OBU overhead: %d\n", obu_overhead_current_unit);
+    ++unit_number;
+    obu_overhead_bytes_total += obu_overhead_current_unit;
+  }
+
+  printf("File total OBU overhead: %" PRId64 "\n", obu_overhead_bytes_total);
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libaom/src/tools/gen_authors.sh b/libs/libaom/src/tools/gen_authors.sh
new file mode 100644
index 000000000..5def8bc89
--- /dev/null
+++ b/libs/libaom/src/tools/gen_authors.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Add organization names manually.
+
+cat <<EOF
+$(git log --pretty=format:"%aN <%aE>" | sort | uniq | grep -v "corp.google\|clang-format")
+EOF
diff --git a/libs/libaom/src/tools/gen_constrained_tokenset.py b/libs/libaom/src/tools/gen_constrained_tokenset.py
new file mode 100644
index 000000000..5d12ee1ef
--- /dev/null
+++ b/libs/libaom/src/tools/gen_constrained_tokenset.py
@@ -0,0 +1,120 @@
+#!/usr/bin/python
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Generate the probability model for the constrained token set.
+
+Model obtained from a 2-sided zero-centered distribution derived
+from a Pareto distribution. The cdf of the distribution is:
+cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
+
+For a given beta and a given probability of the 1-node, the alpha
+is first solved, and then the {alpha, beta} pair is used to generate
+the probabilities for the rest of the nodes.
+""" + +import heapq +import sys +import numpy as np +import scipy.optimize +import scipy.stats + + +def cdf_spareto(x, xm, beta): + p = 1 - (xm / (np.abs(x) + xm))**beta + p = 0.5 + 0.5 * np.sign(x) * p + return p + + +def get_spareto(p, beta): + cdf = cdf_spareto + + def func(x): + return ((cdf(1.5, x, beta) - cdf(0.5, x, beta)) / + (1 - cdf(0.5, x, beta)) - p)**2 + + alpha = scipy.optimize.fminbound(func, 1e-12, 10000, xtol=1e-12) + parray = np.zeros(11) + parray[0] = 2 * (cdf(0.5, alpha, beta) - 0.5) + parray[1] = (2 * (cdf(1.5, alpha, beta) - cdf(0.5, alpha, beta))) + parray[2] = (2 * (cdf(2.5, alpha, beta) - cdf(1.5, alpha, beta))) + parray[3] = (2 * (cdf(3.5, alpha, beta) - cdf(2.5, alpha, beta))) + parray[4] = (2 * (cdf(4.5, alpha, beta) - cdf(3.5, alpha, beta))) + parray[5] = (2 * (cdf(6.5, alpha, beta) - cdf(4.5, alpha, beta))) + parray[6] = (2 * (cdf(10.5, alpha, beta) - cdf(6.5, alpha, beta))) + parray[7] = (2 * (cdf(18.5, alpha, beta) - cdf(10.5, alpha, beta))) + parray[8] = (2 * (cdf(34.5, alpha, beta) - cdf(18.5, alpha, beta))) + parray[9] = (2 * (cdf(66.5, alpha, beta) - cdf(34.5, alpha, beta))) + parray[10] = 2 * (1. - cdf(66.5, alpha, beta)) + return parray + + +def quantize_probs(p, save_first_bin, bits): + """Quantize probability precisely. + + Quantize probabilities minimizing dH (Kullback-Leibler divergence) + approximated by: sum (p_i-q_i)^2/p_i. + References: + https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence + https://github.com/JarekDuda/AsymmetricNumeralSystemsToolkit + """ + num_sym = p.size + p = np.clip(p, 1e-16, 1) + L = 2**bits + pL = p * L + ip = 1. / p # inverse probability + q = np.clip(np.round(pL), 1, L + 1 - num_sym) + quant_err = (pL - q)**2 * ip + sgn = np.sign(L - q.sum()) # direction of correction + if sgn != 0: # correction is needed + v = [] # heap of adjustment results (adjustment err, index) of each symbol + for i in range(1 if save_first_bin else 0, num_sym): + q_adj = q[i] + sgn + if q_adj > 0 and q_adj < L: + adj_err = (pL[i] - q_adj)**2 * ip[i] - quant_err[i] + heapq.heappush(v, (adj_err, i)) + while q.sum() != L: + # apply lowest error adjustment + (adj_err, i) = heapq.heappop(v) + quant_err[i] += adj_err + q[i] += sgn + # calculate the cost of adjusting this symbol again + q_adj = q[i] + sgn + if q_adj > 0 and q_adj < L: + adj_err = (pL[i] - q_adj)**2 * ip[i] - quant_err[i] + heapq.heappush(v, (adj_err, i)) + return q + + +def get_quantized_spareto(p, beta, bits, first_token): + parray = get_spareto(p, beta) + parray = parray[1:] / (1 - parray[0]) + # CONFIG_NEW_TOKENSET + if first_token > 1: + parray = parray[1:] / (1 - parray[0]) + qarray = quantize_probs(parray, first_token == 1, bits) + return qarray.astype(np.int) + + +def main(bits=15, first_token=1): + beta = 8 + for q in range(1, 256): + parray = get_quantized_spareto(q / 256., beta, bits, first_token) + assert parray.sum() == 2**bits + print '{', ', '.join('%d' % i for i in parray), '},' + + +if __name__ == '__main__': + if len(sys.argv) > 2: + main(int(sys.argv[1]), int(sys.argv[2])) + elif len(sys.argv) > 1: + main(int(sys.argv[1])) + else: + main() diff --git a/libs/libaom/src/tools/inspect-cli.js b/libs/libaom/src/tools/inspect-cli.js new file mode 100644 index 000000000..a14c08111 --- /dev/null +++ b/libs/libaom/src/tools/inspect-cli.js @@ -0,0 +1,39 @@ +/** + * This tool lets you test if the compiled Javascript decoder is functioning properly. You'll + * need to download a SpiderMonkey js-shell to run this script. 
+ * https://archive.mozilla.org/pub/firefox/nightly/latest-mozilla-central/ + * + * Example: + * js-shell inspect-cli.js video.ivf + */ +load("inspect.js"); +var buffer = read(scriptArgs[0], "binary"); +var Module = { + noExitRuntime: true, + noInitialRun: true, + preInit: [], + preRun: [], + postRun: [function () { + printErr(`Loaded Javascript Decoder OK`); + }], + memoryInitializerPrefixURL: "bin/", + arguments: ['input.ivf', 'output.raw'], + on_frame_decoded_json: function (jsonString) { + let json = JSON.parse("[" + Module.UTF8ToString(jsonString) + "null]"); + json.forEach(frame => { + if (frame) { + print(frame.frame); + } + }); + } +}; +DecoderModule(Module); +Module.FS.writeFile("/tmp/input.ivf", buffer, { encoding: "binary" }); +Module._open_file(); +Module._set_layers(0xFFFFFFFF); // Set this to zero if you want to benchmark decoding. +while(true) { + printErr("Decoding Frame ..."); + if (Module._read_frame()) { + break; + } +} diff --git a/libs/libaom/src/tools/inspect-post.js b/libs/libaom/src/tools/inspect-post.js new file mode 100644 index 000000000..31c40bb82 --- /dev/null +++ b/libs/libaom/src/tools/inspect-post.js @@ -0,0 +1 @@ +Module["FS"] = FS; diff --git a/libs/libaom/src/tools/intersect-diffs.py b/libs/libaom/src/tools/intersect-diffs.py new file mode 100644 index 000000000..df13c4ef7 --- /dev/null +++ b/libs/libaom/src/tools/intersect-diffs.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +## +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +"""Calculates the "intersection" of two unified diffs. + +Given two diffs, A and B, it finds all hunks in B that had non-context lines +in A and prints them to stdout. This is useful to determine the hunks in B that +are relevant to A. The resulting file can be applied with patch(1) on top of A. +""" + +__author__ = "jkoleszar@google.com" + +import sys + +import diff + + +def FormatDiffHunks(hunks): + """Re-serialize a list of DiffHunks.""" + r = [] + last_header = None + for hunk in hunks: + this_header = hunk.header[0:2] + if last_header != this_header: + r.extend(hunk.header) + last_header = this_header + else: + r.extend(hunk.header[2]) + r.extend(hunk.lines) + r.append("\n") + return "".join(r) + + +def ZipHunks(rhs_hunks, lhs_hunks): + """Join two hunk lists on filename.""" + for rhs_hunk in rhs_hunks: + rhs_file = rhs_hunk.right.filename.split("/")[1:] + + for lhs_hunk in lhs_hunks: + lhs_file = lhs_hunk.left.filename.split("/")[1:] + if lhs_file != rhs_file: + continue + yield (rhs_hunk, lhs_hunk) + + +def main(): + old_hunks = [x for x in diff.ParseDiffHunks(open(sys.argv[1], "r"))] + new_hunks = [x for x in diff.ParseDiffHunks(open(sys.argv[2], "r"))] + out_hunks = [] + + # Join the right hand side of the older diff with the left hand side of the + # newer diff. 
+ for old_hunk, new_hunk in ZipHunks(old_hunks, new_hunks): + if new_hunk in out_hunks: + continue + old_lines = old_hunk.right + new_lines = new_hunk.left + + # Determine if this hunk overlaps any non-context line from the other + for i in old_lines.delta_line_nums: + if i in new_lines: + out_hunks.append(new_hunk) + break + + if out_hunks: + print FormatDiffHunks(out_hunks) + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/libs/libaom/src/tools/lint-hunks.py b/libs/libaom/src/tools/lint-hunks.py new file mode 100644 index 000000000..d02bee16c --- /dev/null +++ b/libs/libaom/src/tools/lint-hunks.py @@ -0,0 +1,146 @@ +#!/usr/bin/python +## +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +"""Performs style checking on each diff hunk.""" +import getopt +import os +import StringIO +import subprocess +import sys + +import diff + + +SHORT_OPTIONS = "h" +LONG_OPTIONS = ["help"] + +TOPLEVEL_CMD = ["git", "rev-parse", "--show-toplevel"] +DIFF_CMD = ["git", "diff"] +DIFF_INDEX_CMD = ["git", "diff-index", "-u", "HEAD", "--"] +SHOW_CMD = ["git", "show"] +CPPLINT_FILTERS = ["-readability/casting"] + + +class Usage(Exception): + pass + + +class SubprocessException(Exception): + def __init__(self, args): + msg = "Failed to execute '%s'"%(" ".join(args)) + super(SubprocessException, self).__init__(msg) + + +class Subprocess(subprocess.Popen): + """Adds the notion of an expected returncode to Popen.""" + + def __init__(self, args, expected_returncode=0, **kwargs): + self._args = args + self._expected_returncode = expected_returncode + super(Subprocess, self).__init__(args, **kwargs) + + def communicate(self, *args, **kwargs): + result = super(Subprocess, self).communicate(*args, **kwargs) + if self._expected_returncode is not None: + try: + ok = self.returncode in self._expected_returncode + except TypeError: + ok = self.returncode == self._expected_returncode + if not ok: + raise SubprocessException(self._args) + return result + + +def main(argv=None): + if argv is None: + argv = sys.argv + try: + try: + opts, args = getopt.getopt(argv[1:], SHORT_OPTIONS, LONG_OPTIONS) + except getopt.error, msg: + raise Usage(msg) + + # process options + for o, _ in opts: + if o in ("-h", "--help"): + print __doc__ + sys.exit(0) + + if args and len(args) > 1: + print __doc__ + sys.exit(0) + + # Find the fully qualified path to the root of the tree + tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE) + tl = tl.communicate()[0].strip() + + # See if we're working on the index or not. 
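Further down, cpplint's stderr is matched back to the diff: a warning is printed only when its line number was actually touched by a hunk. A condensed sketch of that filter (function name hypothetical):

def relevant_warnings(lint_stderr, affected_lines):
    # cpplint was fed the file on stdin, so warnings start with "-:line:".
    for line in lint_stderr.split('\n'):
        fields = line.split(':')
        if len(fields) >= 3 and fields[0] == '-' and fields[1].isdigit():
            if int(fields[1]) in affected_lines:
                yield '%s:%s' % (fields[1], ':'.join(fields[2:]))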
+ if args: + diff_cmd = DIFF_CMD + [args[0] + "^!"] + else: + diff_cmd = DIFF_INDEX_CMD + + # Build the command line to execute cpplint + cpplint_cmd = [os.path.join(tl, "tools", "cpplint.py"), + "--filter=" + ",".join(CPPLINT_FILTERS), + "-"] + + # Get a list of all affected lines + file_affected_line_map = {} + p = Subprocess(diff_cmd, stdout=subprocess.PIPE) + stdout = p.communicate()[0] + for hunk in diff.ParseDiffHunks(StringIO.StringIO(stdout)): + filename = hunk.right.filename[2:] + if filename not in file_affected_line_map: + file_affected_line_map[filename] = set() + file_affected_line_map[filename].update(hunk.right.delta_line_nums) + + # Run each affected file through cpplint + lint_failed = False + for filename, affected_lines in file_affected_line_map.iteritems(): + if filename.split(".")[-1] not in ("c", "h", "cc"): + continue + + if args: + # File contents come from git + show_cmd = SHOW_CMD + [args[0] + ":" + filename] + show = Subprocess(show_cmd, stdout=subprocess.PIPE) + lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1), + stdin=show.stdout, stderr=subprocess.PIPE) + lint_out = lint.communicate()[1] + else: + # File contents come from the working tree + lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1), + stdin=subprocess.PIPE, stderr=subprocess.PIPE) + stdin = open(os.path.join(tl, filename)).read() + lint_out = lint.communicate(stdin)[1] + + for line in lint_out.split("\n"): + fields = line.split(":") + if fields[0] != "-": + continue + warning_line_num = int(fields[1]) + if warning_line_num in affected_lines: + print "%s:%d:%s"%(filename, warning_line_num, + ":".join(fields[2:])) + lint_failed = True + + # Set exit code if any relevant lint errors seen + if lint_failed: + return 1 + + except Usage, err: + print >>sys.stderr, err + print >>sys.stderr, "for help use --help" + return 2 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/libs/libaom/src/tools/obu_parser.cc b/libs/libaom/src/tools/obu_parser.cc new file mode 100644 index 000000000..7d71386ce --- /dev/null +++ b/libs/libaom/src/tools/obu_parser.cc @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#include <stdio.h>
+
+#include <cstdint>
+#include <string>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem_ops.h"
+#include "av1/common/obu_util.h"
+#include "tools/obu_parser.h"
+
+namespace aom_tools {
+
+// Basic OBU syntax
+// 8 bits: Header
+//   7
+//     forbidden bit
+//   6,5,4,3
+//     type bits
+//   2
+//     extension flag bit
+//   1
+//     has size field bit
+//   0
+//     reserved bit
+const uint32_t kObuForbiddenBitMask = 0x1;
+const uint32_t kObuForbiddenBitShift = 7;
+const uint32_t kObuTypeBitsMask = 0xF;
+const uint32_t kObuTypeBitsShift = 3;
+const uint32_t kObuExtensionFlagBitMask = 0x1;
+const uint32_t kObuExtensionFlagBitShift = 2;
+const uint32_t kObuHasSizeFieldBitMask = 0x1;
+const uint32_t kObuHasSizeFieldBitShift = 1;
+
+// When extension flag bit is set:
+// 8 bits: extension header
+//   7,6,5
+//     temporal ID
+//   4,3
+//     spatial ID
+//   2,1,0
+//     reserved bits
+const uint32_t kObuExtTemporalIdBitsMask = 0x7;
+const uint32_t kObuExtTemporalIdBitsShift = 5;
+const uint32_t kObuExtSpatialIdBitsMask = 0x3;
+const uint32_t kObuExtSpatialIdBitsShift = 3;
+
+bool ValidObuType(int obu_type) {
+  switch (obu_type) {
+    case OBU_SEQUENCE_HEADER:
+    case OBU_TEMPORAL_DELIMITER:
+    case OBU_FRAME_HEADER:
+    case OBU_TILE_GROUP:
+    case OBU_METADATA:
+    case OBU_FRAME:
+    case OBU_REDUNDANT_FRAME_HEADER:
+    case OBU_TILE_LIST:
+    case OBU_PADDING: return true;
+  }
+  return false;
+}
+
+bool ParseObuHeader(uint8_t obu_header_byte, ObuHeader *obu_header) {
+  const int forbidden_bit =
+      (obu_header_byte >> kObuForbiddenBitShift) & kObuForbiddenBitMask;
+  if (forbidden_bit) {
+    fprintf(stderr, "Invalid OBU, forbidden bit set.\n");
+    return false;
+  }
+
+  obu_header->type = static_cast<OBU_TYPE>(
+      (obu_header_byte >> kObuTypeBitsShift) & kObuTypeBitsMask);
+  if (!ValidObuType(obu_header->type)) {
+    fprintf(stderr, "Invalid OBU type: %d.\n", obu_header->type);
+    return false;
+  }
+
+  obu_header->has_extension =
+      (obu_header_byte >> kObuExtensionFlagBitShift) & kObuExtensionFlagBitMask;
+  obu_header->has_size_field =
+      (obu_header_byte >> kObuHasSizeFieldBitShift) & kObuHasSizeFieldBitMask;
+  return true;
+}
+
+bool ParseObuExtensionHeader(uint8_t ext_header_byte, ObuHeader *obu_header) {
+  obu_header->temporal_layer_id =
+      (ext_header_byte >> kObuExtTemporalIdBitsShift) &
+      kObuExtTemporalIdBitsMask;
+  obu_header->spatial_layer_id =
+      (ext_header_byte >> kObuExtSpatialIdBitsShift) & kObuExtSpatialIdBitsMask;
+
+  return true;
+}
+
+void PrintObuHeader(const ObuHeader *header) {
+  printf(
+      " OBU type: %s\n"
+      " extension: %s\n",
+      aom_obu_type_to_string(static_cast<OBU_TYPE>(header->type)),
+      header->has_extension ? "yes" : "no");
+  if (header->has_extension) {
+    printf(
+        " temporal_id: %d\n"
+        " spatial_id: %d\n",
+        header->temporal_layer_id, header->spatial_layer_id);
+  }
+}
+
+bool DumpObu(const uint8_t *data, int length, int *obu_overhead_bytes) {
+  const int kObuHeaderSizeBytes = 1;
+  const int kMinimumBytesRequired = 1 + kObuHeaderSizeBytes;
+  int consumed = 0;
+  int obu_overhead = 0;
+  ObuHeader obu_header;
+  while (consumed < length) {
+    const int remaining = length - consumed;
+    if (remaining < kMinimumBytesRequired) {
+      fprintf(stderr,
+              "OBU parse error. Did not consume all data, %d bytes remain.\n",
+              remaining);
+      return false;
+    }
+
+    int obu_header_size = 0;
+
+    memset(&obu_header, 0, sizeof(obu_header));
+    const uint8_t obu_header_byte = *(data + consumed);
+    if (!ParseObuHeader(obu_header_byte, &obu_header)) {
+      fprintf(stderr, "OBU parsing failed at offset %d.\n", consumed);
+      return false;
+    }
+
+    ++obu_overhead;
+    ++obu_header_size;
+
+    if (obu_header.has_extension) {
+      const uint8_t obu_ext_header_byte =
+          *(data + consumed + kObuHeaderSizeBytes);
+      if (!ParseObuExtensionHeader(obu_ext_header_byte, &obu_header)) {
+        fprintf(stderr, "OBU extension parsing failed at offset %d.\n",
+                consumed + kObuHeaderSizeBytes);
+        return false;
+      }
+
+      ++obu_overhead;
+      ++obu_header_size;
+    }
+
+    PrintObuHeader(&obu_header);
+
+    uint64_t obu_size = 0;
+    size_t length_field_size = 0;
+    if (aom_uleb_decode(data + consumed + obu_header_size,
+                        remaining - obu_header_size, &obu_size,
+                        &length_field_size) != 0) {
+      fprintf(stderr, "OBU size parsing failed at offset %d.\n",
+              consumed + obu_header_size);
+      return false;
+    }
+    int current_obu_length = static_cast<int>(obu_size);
+    if (obu_header_size + static_cast<int>(length_field_size) +
+            current_obu_length >
+        remaining) {
+      fprintf(stderr, "OBU parsing failed: not enough OBU data.\n");
+      return false;
+    }
+    consumed += obu_header_size + static_cast<int>(length_field_size) +
+                current_obu_length;
+    printf(" length: %d\n",
+           static_cast<int>(obu_header_size + length_field_size +
+                            current_obu_length));
+  }
+
+  if (obu_overhead_bytes != nullptr) *obu_overhead_bytes = obu_overhead;
+  printf(" TU size: %d\n", consumed);
+
+  return true;
+}
+
+}  // namespace aom_tools
diff --git a/libs/libaom/src/tools/obu_parser.h b/libs/libaom/src/tools/obu_parser.h
new file mode 100644
index 000000000..1d7d2d794
--- /dev/null
+++ b/libs/libaom/src/tools/obu_parser.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TOOLS_OBU_PARSER_H_
+#define AOM_TOOLS_OBU_PARSER_H_
+
+#include <cstdint>
+
+namespace aom_tools {
+
+// Print information obtained from OBU(s) in data until data is exhausted or an
+// error occurs. Returns true when all data is consumed successfully, and
+// optionally reports OBU storage overhead via obu_overhead_bytes when the
+// pointer is non-null.
+bool DumpObu(const uint8_t *data, int length, int *obu_overhead_bytes);
+
+}  // namespace aom_tools
+
+#endif  // AOM_TOOLS_OBU_PARSER_H_
diff --git a/libs/libaom/src/tools/txfm_analyzer/txfm_gen_code.cc b/libs/libaom/src/tools/txfm_analyzer/txfm_gen_code.cc
new file mode 100644
index 000000000..7c5400b91
--- /dev/null
+++ b/libs/libaom/src/tools/txfm_analyzer/txfm_gen_code.cc
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include +#include + +#include "tools/txfm_analyzer/txfm_graph.h" + +typedef enum CODE_TYPE { + CODE_TYPE_C, + CODE_TYPE_SSE2, + CODE_TYPE_SSE4_1 +} CODE_TYPE; + +int get_cos_idx(double value, int mod) { + return round(acos(fabs(value)) / PI * mod); +} + +char *cos_text_arr(double value, int mod, char *text, int size) { + int num = get_cos_idx(value, mod); + if (value < 0) { + snprintf(text, size, "-cospi[%2d]", num); + } else { + snprintf(text, size, " cospi[%2d]", num); + } + + if (num == 0) + printf("v: %f -> %d/%d v==-1 is %d\n", value, num, mod, value == -1); + + return text; +} + +char *cos_text_sse2(double w0, double w1, int mod, char *text, int size) { + int idx0 = get_cos_idx(w0, mod); + int idx1 = get_cos_idx(w1, mod); + char p[] = "p"; + char n[] = "m"; + char *sgn0 = w0 < 0 ? n : p; + char *sgn1 = w1 < 0 ? n : p; + snprintf(text, size, "cospi_%s%02d_%s%02d", sgn0, idx0, sgn1, idx1); + return text; +} + +char *cos_text_sse4_1(double w, int mod, char *text, int size) { + int idx = get_cos_idx(w, mod); + char p[] = "p"; + char n[] = "m"; + char *sgn = w < 0 ? n : p; + snprintf(text, size, "cospi_%s%02d", sgn, idx); + return text; +} + +void node_to_code_c(Node *node, const char *buf0, const char *buf1) { + int cnt = 0; + for (int i = 0; i < 2; i++) { + if (fabs(node->inWeight[i]) == 1 || fabs(node->inWeight[i]) == 0) cnt++; + } + if (cnt == 2) { + int cnt2 = 0; + printf(" %s[%d] =", buf1, node->nodeIdx); + for (int i = 0; i < 2; i++) { + if (fabs(node->inWeight[i]) == 1) { + cnt2++; + } + } + if (cnt2 == 2) { + printf(" apply_value("); + } + int cnt1 = 0; + for (int i = 0; i < 2; i++) { + if (node->inWeight[i] == 1) { + if (cnt1 > 0) + printf(" + %s[%d]", buf0, node->inNodeIdx[i]); + else + printf(" %s[%d]", buf0, node->inNodeIdx[i]); + cnt1++; + } else if (node->inWeight[i] == -1) { + if (cnt1 > 0) + printf(" - %s[%d]", buf0, node->inNodeIdx[i]); + else + printf("-%s[%d]", buf0, node->inNodeIdx[i]); + cnt1++; + } + } + if (cnt2 == 2) { + printf(", stage_range[stage])"); + } + printf(";\n"); + } else { + char w0[100]; + char w1[100]; + printf( + " %s[%d] = half_btf(%s, %s[%d], %s, %s[%d], " + "cos_bit);\n", + buf1, node->nodeIdx, cos_text_arr(node->inWeight[0], COS_MOD, w0, 100), + buf0, node->inNodeIdx[0], + cos_text_arr(node->inWeight[1], COS_MOD, w1, 100), buf0, + node->inNodeIdx[1]); + } +} + +void gen_code_c(Node *node, int stage_num, int node_num, TYPE_TXFM type) { + char *fun_name = new char[100]; + get_fun_name(fun_name, 100, type, node_num); + + printf("\n"); + printf( + "void av1_%s(const int32_t *input, int32_t *output, int8_t cos_bit, " + "const int8_t* stage_range) " + "{\n", + fun_name); + printf(" assert(output != input);\n"); + printf(" const int32_t size = %d;\n", node_num); + printf(" const int32_t *cospi = cospi_arr(cos_bit);\n"); + printf("\n"); + + printf(" int32_t stage = 0;\n"); + printf(" int32_t *bf0, *bf1;\n"); + printf(" int32_t step[%d];\n", node_num); + + const char *buf0 = "bf0"; + const char *buf1 = "bf1"; + const char *input = "input"; + + int si = 0; + printf("\n"); + printf(" // stage %d;\n", si); + printf(" apply_range(stage, input, %s, size, stage_range[stage]);\n", input); + + si = 1; + printf("\n"); + printf(" // stage %d;\n", si); + printf(" stage++;\n"); + if (si % 2 == (stage_num - 1) % 2) { + printf(" %s = output;\n", buf1); + } 
else { + printf(" %s = step;\n", buf1); + } + + for (int ni = 0; ni < node_num; ni++) { + int idx = get_idx(si, ni, node_num); + node_to_code_c(node + idx, input, buf1); + } + + printf(" range_check_buf(stage, input, bf1, size, stage_range[stage]);\n"); + + for (int si = 2; si < stage_num; si++) { + printf("\n"); + printf(" // stage %d\n", si); + printf(" stage++;\n"); + if (si % 2 == (stage_num - 1) % 2) { + printf(" %s = step;\n", buf0); + printf(" %s = output;\n", buf1); + } else { + printf(" %s = output;\n", buf0); + printf(" %s = step;\n", buf1); + } + + // computation code + for (int ni = 0; ni < node_num; ni++) { + int idx = get_idx(si, ni, node_num); + node_to_code_c(node + idx, buf0, buf1); + } + + if (si != stage_num - 1) { + printf( + " range_check_buf(stage, input, bf1, size, stage_range[stage]);\n"); + } + } + printf(" apply_range(stage, input, output, size, stage_range[stage]);\n"); + printf("}\n"); +} + +void single_node_to_code_sse2(Node *node, const char *buf0, const char *buf1) { + printf(" %s[%2d] =", buf1, node->nodeIdx); + if (node->inWeight[0] == 1 && node->inWeight[1] == 1) { + printf(" _mm_adds_epi16(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0, + node->inNodeIdx[1]); + } else if (node->inWeight[0] == 1 && node->inWeight[1] == -1) { + printf(" _mm_subs_epi16(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0, + node->inNodeIdx[1]); + } else if (node->inWeight[0] == -1 && node->inWeight[1] == 1) { + printf(" _mm_subs_epi16(%s[%d], %s[%d])", buf0, node->inNodeIdx[1], buf0, + node->inNodeIdx[0]); + } else if (node->inWeight[0] == 1 && node->inWeight[1] == 0) { + printf(" %s[%d]", buf0, node->inNodeIdx[0]); + } else if (node->inWeight[0] == 0 && node->inWeight[1] == 1) { + printf(" %s[%d]", buf0, node->inNodeIdx[1]); + } else if (node->inWeight[0] == -1 && node->inWeight[1] == 0) { + printf(" _mm_subs_epi16(__zero, %s[%d])", buf0, node->inNodeIdx[0]); + } else if (node->inWeight[0] == 0 && node->inWeight[1] == -1) { + printf(" _mm_subs_epi16(__zero, %s[%d])", buf0, node->inNodeIdx[1]); + } + printf(";\n"); +} + +void pair_node_to_code_sse2(Node *node, Node *partnerNode, const char *buf0, + const char *buf1) { + char temp0[100]; + char temp1[100]; + // btf_16_sse2_type0(w0, w1, in0, in1, out0, out1) + if (node->inNodeIdx[0] != partnerNode->inNodeIdx[0]) + printf(" btf_16_sse2(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d]);\n", + cos_text_sse2(node->inWeight[0], node->inWeight[1], COS_MOD, temp0, + 100), + cos_text_sse2(partnerNode->inWeight[1], partnerNode->inWeight[0], + COS_MOD, temp1, 100), + buf0, node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1, + node->nodeIdx, buf1, partnerNode->nodeIdx); + else + printf(" btf_16_sse2(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d]);\n", + cos_text_sse2(node->inWeight[0], node->inWeight[1], COS_MOD, temp0, + 100), + cos_text_sse2(partnerNode->inWeight[0], partnerNode->inWeight[1], + COS_MOD, temp1, 100), + buf0, node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1, + node->nodeIdx, buf1, partnerNode->nodeIdx); +} + +Node *get_partner_node(Node *node) { + int diff = node->inNode[1]->nodeIdx - node->nodeIdx; + return node + diff; +} + +void node_to_code_sse2(Node *node, const char *buf0, const char *buf1) { + int cnt = 0; + int cnt1 = 0; + if (node->visited == 0) { + node->visited = 1; + for (int i = 0; i < 2; i++) { + if (fabs(node->inWeight[i]) == 1 || fabs(node->inWeight[i]) == 0) cnt++; + if (fabs(node->inWeight[i]) == 1) cnt1++; + } + if (cnt == 2) { + if (cnt1 == 2) { + // has a partner + Node *partnerNode = get_partner_node(node); + 
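/* Both weights are +/-1 here, so no multiply is needed: this node and its partner (the other output fed by the same two inputs) form an add/sub butterfly, and the partner is marked visited so the outer scan does not emit it a second time. */ +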
partnerNode->visited = 1; + single_node_to_code_sse2(node, buf0, buf1); + single_node_to_code_sse2(partnerNode, buf0, buf1); + } else { + single_node_to_code_sse2(node, buf0, buf1); + } + } else { + Node *partnerNode = get_partner_node(node); + partnerNode->visited = 1; + pair_node_to_code_sse2(node, partnerNode, buf0, buf1); + } + } +} + +void gen_cospi_list_sse2(Node *node, int stage_num, int node_num) { + int visited[65][65][2][2]; + memset(visited, 0, sizeof(visited)); + char text[100]; + char text1[100]; + char text2[100]; + int size = 100; + printf("\n"); + for (int si = 1; si < stage_num; si++) { + for (int ni = 0; ni < node_num; ni++) { + int idx = get_idx(si, ni, node_num); + int cnt = 0; + Node *node0 = node + idx; + if (node0->visited == 0) { + node0->visited = 1; + for (int i = 0; i < 2; i++) { + if (fabs(node0->inWeight[i]) == 1 || fabs(node0->inWeight[i]) == 0) + cnt++; + } + if (cnt != 2) { + { + double w0 = node0->inWeight[0]; + double w1 = node0->inWeight[1]; + int idx0 = get_cos_idx(w0, COS_MOD); + int idx1 = get_cos_idx(w1, COS_MOD); + int sgn0 = w0 < 0 ? 1 : 0; + int sgn1 = w1 < 0 ? 1 : 0; + + if (!visited[idx0][idx1][sgn0][sgn1]) { + visited[idx0][idx1][sgn0][sgn1] = 1; + printf(" __m128i %s = pair_set_epi16(%s, %s);\n", + cos_text_sse2(w0, w1, COS_MOD, text, size), + cos_text_arr(w0, COS_MOD, text1, size), + cos_text_arr(w1, COS_MOD, text2, size)); + } + } + Node *node1 = get_partner_node(node0); + node1->visited = 1; + if (node1->inNode[0]->nodeIdx != node0->inNode[0]->nodeIdx) { + double w0 = node1->inWeight[0]; + double w1 = node1->inWeight[1]; + int idx0 = get_cos_idx(w0, COS_MOD); + int idx1 = get_cos_idx(w1, COS_MOD); + int sgn0 = w0 < 0 ? 1 : 0; + int sgn1 = w1 < 0 ? 1 : 0; + + if (!visited[idx1][idx0][sgn1][sgn0]) { + visited[idx1][idx0][sgn1][sgn0] = 1; + printf(" __m128i %s = pair_set_epi16(%s, %s);\n", + cos_text_sse2(w1, w0, COS_MOD, text, size), + cos_text_arr(w1, COS_MOD, text1, size), + cos_text_arr(w0, COS_MOD, text2, size)); + } + } else { + double w0 = node1->inWeight[0]; + double w1 = node1->inWeight[1]; + int idx0 = get_cos_idx(w0, COS_MOD); + int idx1 = get_cos_idx(w1, COS_MOD); + int sgn0 = w0 < 0 ? 1 : 0; + int sgn1 = w1 < 0 ? 
1 : 0; + + if (!visited[idx0][idx1][sgn0][sgn1]) { + visited[idx0][idx1][sgn0][sgn1] = 1; + printf(" __m128i %s = pair_set_epi16(%s, %s);\n", + cos_text_sse2(w0, w1, COS_MOD, text, size), + cos_text_arr(w0, COS_MOD, text1, size), + cos_text_arr(w1, COS_MOD, text2, size)); + } + } + } + } + } + } +} + +void gen_code_sse2(Node *node, int stage_num, int node_num, TYPE_TXFM type) { + char *fun_name = new char[100]; + get_fun_name(fun_name, 100, type, node_num); + + printf("\n"); + printf( + "void %s_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) " + "{\n", + fun_name); + + printf(" const int32_t* cospi = cospi_arr(cos_bit);\n"); + printf(" const __m128i __zero = _mm_setzero_si128();\n"); + printf(" const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));\n"); + + graph_reset_visited(node, stage_num, node_num); + gen_cospi_list_sse2(node, stage_num, node_num); + graph_reset_visited(node, stage_num, node_num); + for (int si = 1; si < stage_num; si++) { + char in[100]; + char out[100]; + printf("\n"); + printf(" // stage %d\n", si); + if (si == 1) + snprintf(in, 100, "%s", "input"); + else + snprintf(in, 100, "x%d", si - 1); + if (si == stage_num - 1) { + snprintf(out, 100, "%s", "output"); + } else { + snprintf(out, 100, "x%d", si); + printf(" __m128i %s[%d];\n", out, node_num); + } + // computation code + for (int ni = 0; ni < node_num; ni++) { + int idx = get_idx(si, ni, node_num); + node_to_code_sse2(node + idx, in, out); + } + } + + printf("}\n"); +} +void gen_cospi_list_sse4_1(Node *node, int stage_num, int node_num) { + int visited[65][2]; + memset(visited, 0, sizeof(visited)); + char text[100]; + char text1[100]; + int size = 100; + printf("\n"); + for (int si = 1; si < stage_num; si++) { + for (int ni = 0; ni < node_num; ni++) { + int idx = get_idx(si, ni, node_num); + Node *node0 = node + idx; + if (node0->visited == 0) { + int cnt = 0; + node0->visited = 1; + for (int i = 0; i < 2; i++) { + if (fabs(node0->inWeight[i]) == 1 || fabs(node0->inWeight[i]) == 0) + cnt++; + } + if (cnt != 2) { + for (int i = 0; i < 2; i++) { + if (fabs(node0->inWeight[i]) != 1 && + fabs(node0->inWeight[i]) != 0) { + double w = node0->inWeight[i]; + int idx = get_cos_idx(w, COS_MOD); + int sgn = w < 0 ? 
1 : 0; + + if (!visited[idx][sgn]) { + visited[idx][sgn] = 1; + printf(" __m128i %s = _mm_set1_epi32(%s);\n", + cos_text_sse4_1(w, COS_MOD, text, size), + cos_text_arr(w, COS_MOD, text1, size)); + } + } + } + Node *node1 = get_partner_node(node0); + node1->visited = 1; + } + } + } + } +} + +void single_node_to_code_sse4_1(Node *node, const char *buf0, + const char *buf1) { + printf(" %s[%2d] =", buf1, node->nodeIdx); + if (node->inWeight[0] == 1 && node->inWeight[1] == 1) { + printf(" _mm_add_epi32(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0, + node->inNodeIdx[1]); + } else if (node->inWeight[0] == 1 && node->inWeight[1] == -1) { + printf(" _mm_sub_epi32(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0, + node->inNodeIdx[1]); + } else if (node->inWeight[0] == -1 && node->inWeight[1] == 1) { + printf(" _mm_sub_epi32(%s[%d], %s[%d])", buf0, node->inNodeIdx[1], buf0, + node->inNodeIdx[0]); + } else if (node->inWeight[0] == 1 && node->inWeight[1] == 0) { + printf(" %s[%d]", buf0, node->inNodeIdx[0]); + } else if (node->inWeight[0] == 0 && node->inWeight[1] == 1) { + printf(" %s[%d]", buf0, node->inNodeIdx[1]); + } else if (node->inWeight[0] == -1 && node->inWeight[1] == 0) { + printf(" _mm_sub_epi32(__zero, %s[%d])", buf0, node->inNodeIdx[0]); + } else if (node->inWeight[0] == 0 && node->inWeight[1] == -1) { + printf(" _mm_sub_epi32(__zero, %s[%d])", buf0, node->inNodeIdx[1]); + } + printf(";\n"); +} + +void pair_node_to_code_sse4_1(Node *node, Node *partnerNode, const char *buf0, + const char *buf1) { + char temp0[100]; + char temp1[100]; + if (node->inWeight[0] * partnerNode->inWeight[0] < 0) { + /* type0 + * cos sin + * sin -cos + */ + // btf_32_sse2_type0(w0, w1, in0, in1, out0, out1) + // out0 = w0*in0 + w1*in1 + // out1 = -w0*in1 + w1*in0 + printf( + " btf_32_type0_sse4_1_new(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d], " + "__rounding, cos_bit);\n", + cos_text_sse4_1(node->inWeight[0], COS_MOD, temp0, 100), + cos_text_sse4_1(node->inWeight[1], COS_MOD, temp1, 100), buf0, + node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1, node->nodeIdx, buf1, + partnerNode->nodeIdx); + } else { + /* type1 + * cos sin + * -sin cos + */ + // btf_32_sse2_type1(w0, w1, in0, in1, out0, out1) + // out0 = w0*in0 + w1*in1 + // out1 = w0*in1 - w1*in0 + printf( + " btf_32_type1_sse4_1_new(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d], " + "__rounding, cos_bit);\n", + cos_text_sse4_1(node->inWeight[0], COS_MOD, temp0, 100), + cos_text_sse4_1(node->inWeight[1], COS_MOD, temp1, 100), buf0, + node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1, node->nodeIdx, buf1, + partnerNode->nodeIdx); + } +} + +void node_to_code_sse4_1(Node *node, const char *buf0, const char *buf1) { + int cnt = 0; + int cnt1 = 0; + if (node->visited == 0) { + node->visited = 1; + for (int i = 0; i < 2; i++) { + if (fabs(node->inWeight[i]) == 1 || fabs(node->inWeight[i]) == 0) cnt++; + if (fabs(node->inWeight[i]) == 1) cnt1++; + } + if (cnt == 2) { + if (cnt1 == 2) { + // has a partner + Node *partnerNode = get_partner_node(node); + partnerNode->visited = 1; + single_node_to_code_sse4_1(node, buf0, buf1); + single_node_to_code_sse4_1(partnerNode, buf0, buf1); + } else { + single_node_to_code_sse2(node, buf0, buf1); + } + } else { + Node *partnerNode = get_partner_node(node); + partnerNode->visited = 1; + pair_node_to_code_sse4_1(node, partnerNode, buf0, buf1); + } + } +} + +void gen_code_sse4_1(Node *node, int stage_num, int node_num, TYPE_TXFM type) { + char *fun_name = new char[100]; + get_fun_name(fun_name, 100, type, node_num); + + printf("\n"); 
+ printf( + "void %s_sse4_1(const __m128i *input, __m128i *output, int8_t cos_bit) " + "{\n", + fun_name); + + printf(" const int32_t* cospi = cospi_arr(cos_bit);\n"); + printf(" const __m128i __zero = _mm_setzero_si128();\n"); + printf(" const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));\n"); + + graph_reset_visited(node, stage_num, node_num); + gen_cospi_list_sse4_1(node, stage_num, node_num); + graph_reset_visited(node, stage_num, node_num); + for (int si = 1; si < stage_num; si++) { + char in[100]; + char out[100]; + printf("\n"); + printf(" // stage %d\n", si); + if (si == 1) + snprintf(in, 100, "%s", "input"); + else + snprintf(in, 100, "x%d", si - 1); + if (si == stage_num - 1) { + snprintf(out, 100, "%s", "output"); + } else { + snprintf(out, 100, "x%d", si); + printf(" __m128i %s[%d];\n", out, node_num); + } + // computation code + for (int ni = 0; ni < node_num; ni++) { + int idx = get_idx(si, ni, node_num); + node_to_code_sse4_1(node + idx, in, out); + } + } + + printf("}\n"); +} + +void gen_hybrid_code(CODE_TYPE code_type, TYPE_TXFM txfm_type, int node_num) { + int stage_num = get_hybrid_stage_num(txfm_type, node_num); + + Node *node = new Node[node_num * stage_num]; + init_graph(node, stage_num, node_num); + + gen_hybrid_graph_1d(node, stage_num, node_num, 0, 0, node_num, txfm_type); + + switch (code_type) { + case CODE_TYPE_C: gen_code_c(node, stage_num, node_num, txfm_type); break; + case CODE_TYPE_SSE2: + gen_code_sse2(node, stage_num, node_num, txfm_type); + break; + case CODE_TYPE_SSE4_1: + gen_code_sse4_1(node, stage_num, node_num, txfm_type); + break; + } + + delete[] node; +} + +int main(int argc, char **argv) { + CODE_TYPE code_type = CODE_TYPE_SSE4_1; + for (int txfm_type = TYPE_DCT; txfm_type < TYPE_LAST; txfm_type++) { + for (int node_num = 4; node_num <= 64; node_num *= 2) { + gen_hybrid_code(code_type, (TYPE_TXFM)txfm_type, node_num); + } + } + return 0; +} diff --git a/libs/libaom/src/tools/txfm_analyzer/txfm_graph.cc b/libs/libaom/src/tools/txfm_analyzer/txfm_graph.cc new file mode 100644 index 000000000..a24906100 --- /dev/null +++ b/libs/libaom/src/tools/txfm_analyzer/txfm_graph.cc @@ -0,0 +1,943 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "tools/txfm_analyzer/txfm_graph.h" + +#include +#include +#include + +typedef struct Node Node; + +void get_fun_name(char *str_fun_name, int str_buf_size, const TYPE_TXFM type, + const int txfm_size) { + if (type == TYPE_DCT) + snprintf(str_fun_name, str_buf_size, "fdct%d_new", txfm_size); + else if (type == TYPE_ADST) + snprintf(str_fun_name, str_buf_size, "fadst%d_new", txfm_size); + else if (type == TYPE_IDCT) + snprintf(str_fun_name, str_buf_size, "idct%d_new", txfm_size); + else if (type == TYPE_IADST) + snprintf(str_fun_name, str_buf_size, "iadst%d_new", txfm_size); +} + +void get_txfm_type_name(char *str_fun_name, int str_buf_size, + const TYPE_TXFM type, const int txfm_size) { + if (type == TYPE_DCT) + snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_DCT%d", txfm_size); + else if (type == TYPE_ADST) + snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_ADST%d", txfm_size); + else if (type == TYPE_IDCT) + snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_DCT%d", txfm_size); + else if (type == TYPE_IADST) + snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_ADST%d", txfm_size); +} + +void get_hybrid_2d_type_name(char *buf, int buf_size, const TYPE_TXFM type0, + const TYPE_TXFM type1, const int txfm_size0, + const int txfm_size1) { + if (type0 == TYPE_DCT && type1 == TYPE_DCT) + snprintf(buf, buf_size, "_dct_dct_%dx%d", txfm_size1, txfm_size0); + else if (type0 == TYPE_DCT && type1 == TYPE_ADST) + snprintf(buf, buf_size, "_dct_adst_%dx%d", txfm_size1, txfm_size0); + else if (type0 == TYPE_ADST && type1 == TYPE_ADST) + snprintf(buf, buf_size, "_adst_adst_%dx%d", txfm_size1, txfm_size0); + else if (type0 == TYPE_ADST && type1 == TYPE_DCT) + snprintf(buf, buf_size, "_adst_dct_%dx%d", txfm_size1, txfm_size0); +} + +TYPE_TXFM get_inv_type(TYPE_TXFM type) { + if (type == TYPE_DCT) + return TYPE_IDCT; + else if (type == TYPE_ADST) + return TYPE_IADST; + else if (type == TYPE_IDCT) + return TYPE_DCT; + else if (type == TYPE_IADST) + return TYPE_ADST; + else + return TYPE_LAST; +} + +void reference_dct_1d(double *in, double *out, int size) { + const double kInvSqrt2 = 0.707106781186547524400844362104; + for (int k = 0; k < size; k++) { + out[k] = 0; // initialize out[k] + for (int n = 0; n < size; n++) { + out[k] += in[n] * cos(PI * (2 * n + 1) * k / (2 * size)); + } + if (k == 0) out[k] = out[k] * kInvSqrt2; + } +} + +void reference_dct_2d(double *in, double *out, int size) { + double *tempOut = new double[size * size]; + // dct each row: in -> out + for (int r = 0; r < size; r++) { + reference_dct_1d(in + r * size, out + r * size, size); + } + + for (int r = 0; r < size; r++) { + // out ->tempOut + for (int c = 0; c < size; c++) { + tempOut[r * size + c] = out[c * size + r]; + } + } + for (int r = 0; r < size; r++) { + reference_dct_1d(tempOut + r * size, out + r * size, size); + } + delete[] tempOut; +} + +void reference_adst_1d(double *in, double *out, int size) { + for (int k = 0; k < size; k++) { + out[k] = 0; // initialize out[k] + for (int n = 0; n < size; n++) { + out[k] += in[n] * sin(PI * (2 * n + 1) * (2 * k + 1) / (4 * size)); + } + } +} + +void reference_hybrid_2d(double *in, double *out, int size, int type0, + int type1) { + double *tempOut = new double[size * size]; + // dct each row: in -> out + for (int r = 0; r < size; r++) { + if (type0 == TYPE_DCT) + reference_dct_1d(in + r * size, out + r * size, size); + else + reference_adst_1d(in + r * size, out + r * size, size); + } + + for (int r = 0; r < size; r++) { + // out ->tempOut + for (int c = 0; c < size; c++) { + 
tempOut[r * size + c] = out[c * size + r]; + } + } + for (int r = 0; r < size; r++) { + if (type1 == TYPE_DCT) + reference_dct_1d(tempOut + r * size, out + r * size, size); + else + reference_adst_1d(tempOut + r * size, out + r * size, size); + } + delete[] tempOut; +} + +void reference_hybrid_2d_new(double *in, double *out, int size0, int size1, + int type0, int type1) { + double *tempOut = new double[size0 * size1]; + // dct each row: in -> out + for (int r = 0; r < size1; r++) { + if (type0 == TYPE_DCT) + reference_dct_1d(in + r * size0, out + r * size0, size0); + else + reference_adst_1d(in + r * size0, out + r * size0, size0); + } + + for (int r = 0; r < size1; r++) { + // out ->tempOut + for (int c = 0; c < size0; c++) { + tempOut[c * size1 + r] = out[r * size0 + c]; + } + } + for (int r = 0; r < size0; r++) { + if (type1 == TYPE_DCT) + reference_dct_1d(tempOut + r * size1, out + r * size1, size1); + else + reference_adst_1d(tempOut + r * size1, out + r * size1, size1); + } + delete[] tempOut; +} + +unsigned int get_max_bit(unsigned int x) { + int max_bit = -1; + while (x) { + x = x >> 1; + max_bit++; + } + return max_bit; +} + +unsigned int bitwise_reverse(unsigned int x, int max_bit) { + x = ((x >> 16) & 0x0000ffff) | ((x & 0x0000ffff) << 16); + x = ((x >> 8) & 0x00ff00ff) | ((x & 0x00ff00ff) << 8); + x = ((x >> 4) & 0x0f0f0f0f) | ((x & 0x0f0f0f0f) << 4); + x = ((x >> 2) & 0x33333333) | ((x & 0x33333333) << 2); + x = ((x >> 1) & 0x55555555) | ((x & 0x55555555) << 1); + x = x >> (31 - max_bit); + return x; +} + +int get_idx(int ri, int ci, int cSize) { return ri * cSize + ci; } + +void add_node(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int in, double w) { + int outIdx = get_idx(stage_idx, node_idx, node_num); + int inIdx = get_idx(stage_idx - 1, in, node_num); + int idx = node[outIdx].inNodeNum; + if (idx < 2) { + node[outIdx].inNode[idx] = &node[inIdx]; + node[outIdx].inNodeIdx[idx] = in; + node[outIdx].inWeight[idx] = w; + idx++; + node[outIdx].inNodeNum = idx; + } else { + printf("Error: inNode is full"); + } +} + +void connect_node(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int in0, double w0, int in1, double w1) { + int outIdx = get_idx(stage_idx, node_idx, node_num); + int inIdx0 = get_idx(stage_idx - 1, in0, node_num); + int inIdx1 = get_idx(stage_idx - 1, in1, node_num); + + int idx = 0; + // if(w0 != 0) { + node[outIdx].inNode[idx] = &node[inIdx0]; + node[outIdx].inNodeIdx[idx] = in0; + node[outIdx].inWeight[idx] = w0; + idx++; + //} + + // if(w1 != 0) { + node[outIdx].inNode[idx] = &node[inIdx1]; + node[outIdx].inNodeIdx[idx] = in1; + node[outIdx].inWeight[idx] = w1; + idx++; + //} + + node[outIdx].inNodeNum = idx; +} + +void propagate(Node *node, int stage_num, int node_num, int stage_idx) { + for (int ni = 0; ni < node_num; ni++) { + int outIdx = get_idx(stage_idx, ni, node_num); + node[outIdx].value = 0; + for (int k = 0; k < node[outIdx].inNodeNum; k++) { + node[outIdx].value += + node[outIdx].inNode[k]->value * node[outIdx].inWeight[k]; + } + } +} + +int64_t round_shift(int64_t value, int bit) { + if (bit > 0) { + if (value < 0) { + return -round_shift(-value, bit); + } else { + return (value + (1 << (bit - 1))) >> bit; + } + } else { + return value << (-bit); + } +} + +void round_shift_array(int32_t *arr, int size, int bit) { + if (bit == 0) { + return; + } else { + for (int i = 0; i < size; i++) { + arr[i] = round_shift(arr[i], bit); + } + } +} + +void graph_reset_visited(Node *node, int stage_num, int 
node_num) { + for (int si = 0; si < stage_num; si++) { + for (int ni = 0; ni < node_num; ni++) { + int idx = get_idx(si, ni, node_num); + node[idx].visited = 0; + } + } +} + +void estimate_value(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int estimate_bit) { + if (stage_idx > 0) { + int outIdx = get_idx(stage_idx, node_idx, node_num); + int64_t out = 0; + node[outIdx].value = 0; + for (int k = 0; k < node[outIdx].inNodeNum; k++) { + int64_t w = round(node[outIdx].inWeight[k] * (1 << estimate_bit)); + int64_t v = round(node[outIdx].inNode[k]->value); + out += v * w; + } + node[outIdx].value = round_shift(out, estimate_bit); + } +} + +void amplify_value(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int amplify_bit) { + int outIdx = get_idx(stage_idx, node_idx, node_num); + node[outIdx].value = round_shift(round(node[outIdx].value), -amplify_bit); +} + +void propagate_estimate_amlify(Node *node, int stage_num, int node_num, + int stage_idx, int amplify_bit, + int estimate_bit) { + for (int ni = 0; ni < node_num; ni++) { + estimate_value(node, stage_num, node_num, stage_idx, ni, estimate_bit); + amplify_value(node, stage_num, node_num, stage_idx, ni, amplify_bit); + } +} + +void init_graph(Node *node, int stage_num, int node_num) { + for (int si = 0; si < stage_num; si++) { + for (int ni = 0; ni < node_num; ni++) { + int outIdx = get_idx(si, ni, node_num); + node[outIdx].stageIdx = si; + node[outIdx].nodeIdx = ni; + node[outIdx].value = 0; + node[outIdx].inNodeNum = 0; + if (si >= 1) { + connect_node(node, stage_num, node_num, si, ni, ni, 1, ni, 0); + } + } + } +} + +void gen_B_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N, int star) { + for (int i = 0; i < N / 2; i++) { + int out = node_idx + i; + int in1 = node_idx + N - 1 - i; + if (star == 1) { + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, -1, in1, + 1); + } else { + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, in1, + 1); + } + } + for (int i = N / 2; i < N; i++) { + int out = node_idx + i; + int in1 = node_idx + N - 1 - i; + if (star == 1) { + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, in1, + 1); + } else { + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, -1, in1, + 1); + } + } +} + +void gen_P_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N) { + int max_bit = get_max_bit(N - 1); + for (int i = 0; i < N; i++) { + int out = node_idx + bitwise_reverse(i, max_bit); + int in = node_idx + i; + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } +} + +void gen_type1_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N) { + int max_bit = get_max_bit(N); + for (int ni = 0; ni < N / 2; ni++) { + int ai = bitwise_reverse(N + ni, max_bit); + int out = node_idx + ni; + int in1 = node_idx + N - ni - 1; + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, + sin(PI * ai / (2 * 2 * N)), in1, cos(PI * ai / (2 * 2 * N))); + } + for (int ni = N / 2; ni < N; ni++) { + int ai = bitwise_reverse(N + ni, max_bit); + int out = node_idx + ni; + int in1 = node_idx + N - ni - 1; + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, + cos(PI * ai / (2 * 2 * N)), in1, -sin(PI * ai / (2 * 2 * N))); + } +} + +void gen_type2_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N) { + for (int ni = 0; ni < N / 4; ni++) { + int out = node_idx + ni; + 
connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, out, 0); + } + + for (int ni = N / 4; ni < N / 2; ni++) { + int out = node_idx + ni; + int in1 = node_idx + N - ni - 1; + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, + -cos(PI / 4), in1, cos(-PI / 4)); + } + + for (int ni = N / 2; ni < N * 3 / 4; ni++) { + int out = node_idx + ni; + int in1 = node_idx + N - ni - 1; + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, + cos(-PI / 4), in1, cos(PI / 4)); + } + + for (int ni = N * 3 / 4; ni < N; ni++) { + int out = node_idx + ni; + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, out, 0); + } +} + +void gen_type3_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int idx, int N) { + // TODO(angiebird): Simplify and clarify this function + + int i = 2 * N / (1 << (idx / 2)); + int max_bit = + get_max_bit(i / 2) - 1; // the max_bit counts on i/2 instead of N here + int N_over_i = 2 << (idx / 2); + + for (int nj = 0; nj < N / 2; nj += N_over_i) { + int j = nj / (N_over_i); + int kj = bitwise_reverse(i / 4 + j, max_bit); + // printf("kj = %d\n", kj); + + // I_N/2i --- 0 + int offset = nj; + for (int ni = 0; ni < N_over_i / 4; ni++) { + int out = node_idx + offset + ni; + int in = out; + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } + + // -C_Kj/i --- S_Kj/i + offset += N_over_i / 4; + for (int ni = 0; ni < N_over_i / 4; ni++) { + int out = node_idx + offset + ni; + int in0 = out; + double w0 = -cos(kj * PI / i); + int in1 = N - (offset + ni) - 1 + node_idx; + double w1 = sin(kj * PI / i); + connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1, + w1); + } + + // S_kj/i --- -C_Kj/i + offset += N_over_i / 4; + for (int ni = 0; ni < N_over_i / 4; ni++) { + int out = node_idx + offset + ni; + int in0 = out; + double w0 = -sin(kj * PI / i); + int in1 = N - (offset + ni) - 1 + node_idx; + double w1 = -cos(kj * PI / i); + connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1, + w1); + } + + // I_N/2i --- 0 + offset += N_over_i / 4; + for (int ni = 0; ni < N_over_i / 4; ni++) { + int out = node_idx + offset + ni; + int in = out; + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } + } + + for (int nj = N / 2; nj < N; nj += N_over_i) { + int j = nj / N_over_i; + int kj = bitwise_reverse(i / 4 + j, max_bit); + + // I_N/2i --- 0 + int offset = nj; + for (int ni = 0; ni < N_over_i / 4; ni++) { + int out = node_idx + offset + ni; + int in = out; + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } + + // C_kj/i --- -S_Kj/i + offset += N_over_i / 4; + for (int ni = 0; ni < N_over_i / 4; ni++) { + int out = node_idx + offset + ni; + int in0 = out; + double w0 = cos(kj * PI / i); + int in1 = N - (offset + ni) - 1 + node_idx; + double w1 = -sin(kj * PI / i); + connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1, + w1); + } + + // S_kj/i --- C_Kj/i + offset += N_over_i / 4; + for (int ni = 0; ni < N_over_i / 4; ni++) { + int out = node_idx + offset + ni; + int in0 = out; + double w0 = sin(kj * PI / i); + int in1 = N - (offset + ni) - 1 + node_idx; + double w1 = cos(kj * PI / i); + connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1, + w1); + } + + // I_N/2i --- 0 + offset += N_over_i / 4; + for (int ni = 0; ni < N_over_i / 4; ni++) { + int out = node_idx + offset + ni; + int in = out; + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, 
in, 0); + } + } +} + +void gen_type4_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int idx, int N) { + int B_size = 1 << ((idx + 1) / 2); + for (int ni = 0; ni < N; ni += B_size) { + gen_B_graph(node, stage_num, node_num, stage_idx, node_idx + ni, B_size, + (ni / B_size) % 2); + } +} + +void gen_R_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N) { + int max_idx = 2 * (get_max_bit(N) + 1) - 3; + for (int idx = 0; idx < max_idx; idx++) { + int s = stage_idx + max_idx - idx - 1; + if (idx == 0) { + // type 1 + gen_type1_graph(node, stage_num, node_num, s, node_idx, N); + } else if (idx == max_idx - 1) { + // type 2 + gen_type2_graph(node, stage_num, node_num, s, node_idx, N); + } else if ((idx + 1) % 2 == 0) { + // type 4 + gen_type4_graph(node, stage_num, node_num, s, node_idx, idx, N); + } else if ((idx + 1) % 2 == 1) { + // type 3 + gen_type3_graph(node, stage_num, node_num, s, node_idx, idx, N); + } else { + printf("check gen_R_graph()\n"); + } + } +} + +void gen_DCT_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N) { + if (N > 2) { + gen_B_graph(node, stage_num, node_num, stage_idx, node_idx, N, 0); + gen_DCT_graph(node, stage_num, node_num, stage_idx + 1, node_idx, N / 2); + gen_R_graph(node, stage_num, node_num, stage_idx + 1, node_idx + N / 2, + N / 2); + } else { + // generate dct_2 + connect_node(node, stage_num, node_num, stage_idx + 1, node_idx, node_idx, + cos(PI / 4), node_idx + 1, cos(PI / 4)); + connect_node(node, stage_num, node_num, stage_idx + 1, node_idx + 1, + node_idx + 1, -cos(PI / 4), node_idx, cos(PI / 4)); + } +} + +int get_dct_stage_num(int size) { return 2 * get_max_bit(size); } + +void gen_DCT_graph_1d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int dct_node_num) { + gen_DCT_graph(node, stage_num, node_num, stage_idx, node_idx, dct_node_num); + int dct_stage_num = get_dct_stage_num(dct_node_num); + gen_P_graph(node, stage_num, node_num, stage_idx + dct_stage_num - 2, + node_idx, dct_node_num); +} + +void gen_adst_B_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_idx) { + int size = 1 << (adst_idx + 1); + for (int ni = 0; ni < size / 2; ni++) { + int nOut = node_idx + ni; + int nIn = nOut + size / 2; + connect_node(node, stage_num, node_num, stage_idx + 1, nOut, nOut, 1, nIn, + 1); + // printf("nOut: %d nIn: %d\n", nOut, nIn); + } + for (int ni = size / 2; ni < size; ni++) { + int nOut = node_idx + ni; + int nIn = nOut - size / 2; + connect_node(node, stage_num, node_num, stage_idx + 1, nOut, nOut, -1, nIn, + 1); + // printf("ndctOut: %d nIn: %d\n", nOut, nIn); + } +} + +void gen_adst_U_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_idx, int adst_node_num) { + int size = 1 << (adst_idx + 1); + for (int ni = 0; ni < adst_node_num; ni += size) { + gen_adst_B_graph(node, stage_num, node_num, stage_idx, node_idx + ni, + adst_idx); + } +} + +void gen_adst_T_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, double freq) { + connect_node(node, stage_num, node_num, stage_idx + 1, node_idx, node_idx, + cos(freq * PI), node_idx + 1, sin(freq * PI)); + connect_node(node, stage_num, node_num, stage_idx + 1, node_idx + 1, + node_idx + 1, -cos(freq * PI), node_idx, sin(freq * PI)); +} + +void gen_adst_E_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_idx) { + int size = 1 << (adst_idx); + for (int i = 0; i 
< size / 2; i++) { + int ni = i * 2; + double fi = (1 + 4 * i) * 1.0 / (1 << (adst_idx + 1)); + gen_adst_T_graph(node, stage_num, node_num, stage_idx, node_idx + ni, fi); + } +} + +void gen_adst_V_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_idx, int adst_node_num) { + int size = 1 << (adst_idx); + for (int i = 0; i < adst_node_num / size; i++) { + if (i % 2 == 1) { + int ni = i * size; + gen_adst_E_graph(node, stage_num, node_num, stage_idx, node_idx + ni, + adst_idx); + } + } +} +void gen_adst_VJ_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num) { + for (int i = 0; i < adst_node_num / 2; i++) { + int ni = i * 2; + double fi = (1 + 4 * i) * 1.0 / (4 * adst_node_num); + gen_adst_T_graph(node, stage_num, node_num, stage_idx, node_idx + ni, fi); + } +} +void gen_adst_Q_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num) { + // reverse order when idx is 1, 3, 5, 7 ... + // example of adst_node_num = 8: + // 0 1 2 3 4 5 6 7 + // --> 0 7 2 5 4 3 6 1 + for (int ni = 0; ni < adst_node_num; ni++) { + if (ni % 2 == 0) { + int out = node_idx + ni; + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, out, + 0); + } else { + int out = node_idx + ni; + int in = node_idx + adst_node_num - ni; + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } + } +} +void gen_adst_Ibar_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num) { + // reverse order + // 0 1 2 3 --> 3 2 1 0 + for (int ni = 0; ni < adst_node_num; ni++) { + int out = node_idx + ni; + int in = node_idx + adst_node_num - ni - 1; + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } +} + +int get_Q_out2in(int adst_node_num, int out) { + int in; + if (out % 2 == 0) { + in = out; + } else { + in = adst_node_num - out; + } + return in; +} + +int get_Ibar_out2in(int adst_node_num, int out) { + return adst_node_num - out - 1; +} + +void gen_adst_IbarQ_graph(Node *node, int stage_num, int node_num, + int stage_idx, int node_idx, int adst_node_num) { + // in -> Ibar -> Q -> out + for (int ni = 0; ni < adst_node_num; ni++) { + int out = node_idx + ni; + int in = node_idx + + get_Ibar_out2in(adst_node_num, get_Q_out2in(adst_node_num, ni)); + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } +} + +void gen_adst_D_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num) { + // reverse order + for (int ni = 0; ni < adst_node_num; ni++) { + int out = node_idx + ni; + int in = out; + if (ni % 2 == 0) { + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } else { + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, -1, in, + 0); + } + } +} + +int get_hadamard_idx(int x, int adst_node_num) { + int max_bit = get_max_bit(adst_node_num - 1); + x = bitwise_reverse(x, max_bit); + + // gray code + int c = x & 1; + int p = x & 1; + int y = c; + + for (int i = 1; i <= max_bit; i++) { + p = c; + c = (x >> i) & 1; + y += (c ^ p) << i; + } + return y; +} + +void gen_adst_Ht_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num) { + for (int ni = 0; ni < adst_node_num; ni++) { + int out = node_idx + ni; + int in = node_idx + get_hadamard_idx(ni, adst_node_num); + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } +} + +void gen_adst_HtD_graph(Node *node, 
int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num) { + for (int ni = 0; ni < adst_node_num; ni++) { + int out = node_idx + ni; + int in = node_idx + get_hadamard_idx(ni, adst_node_num); + double inW; + if (ni % 2 == 0) + inW = 1; + else + inW = -1; + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, inW, in, 0); + } +} + +int get_adst_stage_num(int adst_node_num) { + return 2 * get_max_bit(adst_node_num) + 2; +} + +int gen_iadst_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num) { + int max_bit = get_max_bit(adst_node_num); + int si = 0; + gen_adst_IbarQ_graph(node, stage_num, node_num, stage_idx + si, node_idx, + adst_node_num); + si++; + gen_adst_VJ_graph(node, stage_num, node_num, stage_idx + si, node_idx, + adst_node_num); + si++; + for (int adst_idx = max_bit - 1; adst_idx >= 1; adst_idx--) { + gen_adst_U_graph(node, stage_num, node_num, stage_idx + si, node_idx, + adst_idx, adst_node_num); + si++; + gen_adst_V_graph(node, stage_num, node_num, stage_idx + si, node_idx, + adst_idx, adst_node_num); + si++; + } + gen_adst_HtD_graph(node, stage_num, node_num, stage_idx + si, node_idx, + adst_node_num); + si++; + return si + 1; +} + +int gen_adst_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num) { + int hybrid_stage_num = get_hybrid_stage_num(TYPE_ADST, adst_node_num); + // generate a adst tempNode + Node *tempNode = new Node[hybrid_stage_num * adst_node_num]; + init_graph(tempNode, hybrid_stage_num, adst_node_num); + int si = gen_iadst_graph(tempNode, hybrid_stage_num, adst_node_num, 0, 0, + adst_node_num); + + // tempNode's inverse graph to node[stage_idx][node_idx] + gen_inv_graph(tempNode, hybrid_stage_num, adst_node_num, node, stage_num, + node_num, stage_idx, node_idx); + delete[] tempNode; + return si; +} + +void connect_layer_2d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int dct_node_num) { + for (int first = 0; first < dct_node_num; first++) { + for (int second = 0; second < dct_node_num; second++) { + // int sIn = stage_idx; + int sOut = stage_idx + 1; + int nIn = node_idx + first * dct_node_num + second; + int nOut = node_idx + second * dct_node_num + first; + + // printf("sIn: %d nIn: %d sOut: %d nOut: %d\n", sIn, nIn, sOut, nOut); + + connect_node(node, stage_num, node_num, sOut, nOut, nIn, 1, nIn, 0); + } + } +} + +void connect_layer_2d_new(Node *node, int stage_num, int node_num, + int stage_idx, int node_idx, int dct_node_num0, + int dct_node_num1) { + for (int i = 0; i < dct_node_num1; i++) { + for (int j = 0; j < dct_node_num0; j++) { + // int sIn = stage_idx; + int sOut = stage_idx + 1; + int nIn = node_idx + i * dct_node_num0 + j; + int nOut = node_idx + j * dct_node_num1 + i; + + // printf("sIn: %d nIn: %d sOut: %d nOut: %d\n", sIn, nIn, sOut, nOut); + + connect_node(node, stage_num, node_num, sOut, nOut, nIn, 1, nIn, 0); + } + } +} + +void gen_DCT_graph_2d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int dct_node_num) { + int dct_stage_num = get_dct_stage_num(dct_node_num); + // put 2 layers of dct_node_num DCTs on the graph + for (int ni = 0; ni < dct_node_num; ni++) { + gen_DCT_graph_1d(node, stage_num, node_num, stage_idx, + node_idx + ni * dct_node_num, dct_node_num); + gen_DCT_graph_1d(node, stage_num, node_num, stage_idx + dct_stage_num, + node_idx + ni * dct_node_num, dct_node_num); + } + // connect first layer and second layer + connect_layer_2d(node, stage_num, node_num, 
stage_idx + dct_stage_num - 1, + node_idx, dct_node_num); +} + +int get_hybrid_stage_num(int type, int hybrid_node_num) { + if (type == TYPE_DCT || type == TYPE_IDCT) { + return get_dct_stage_num(hybrid_node_num); + } else if (type == TYPE_ADST || type == TYPE_IADST) { + return get_adst_stage_num(hybrid_node_num); + } + return 0; +} + +int get_hybrid_2d_stage_num(int type0, int type1, int hybrid_node_num) { + int stage_num = 0; + stage_num += get_hybrid_stage_num(type0, hybrid_node_num); + stage_num += get_hybrid_stage_num(type1, hybrid_node_num); + return stage_num; +} + +int get_hybrid_2d_stage_num_new(int type0, int type1, int hybrid_node_num0, + int hybrid_node_num1) { + int stage_num = 0; + stage_num += get_hybrid_stage_num(type0, hybrid_node_num0); + stage_num += get_hybrid_stage_num(type1, hybrid_node_num1); + return stage_num; +} + +int get_hybrid_amplify_factor(int type, int hybrid_node_num) { + return get_max_bit(hybrid_node_num) - 1; +} + +void gen_hybrid_graph_1d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int hybrid_node_num, int type) { + if (type == TYPE_DCT) { + gen_DCT_graph_1d(node, stage_num, node_num, stage_idx, node_idx, + hybrid_node_num); + } else if (type == TYPE_ADST) { + gen_adst_graph(node, stage_num, node_num, stage_idx, node_idx, + hybrid_node_num); + } else if (type == TYPE_IDCT) { + int hybrid_stage_num = get_hybrid_stage_num(type, hybrid_node_num); + // generate a dct tempNode + Node *tempNode = new Node[hybrid_stage_num * hybrid_node_num]; + init_graph(tempNode, hybrid_stage_num, hybrid_node_num); + gen_DCT_graph_1d(tempNode, hybrid_stage_num, hybrid_node_num, 0, 0, + hybrid_node_num); + + // tempNode's inverse graph to node[stage_idx][node_idx] + gen_inv_graph(tempNode, hybrid_stage_num, hybrid_node_num, node, stage_num, + node_num, stage_idx, node_idx); + delete[] tempNode; + } else if (type == TYPE_IADST) { + int hybrid_stage_num = get_hybrid_stage_num(type, hybrid_node_num); + // generate a adst tempNode + Node *tempNode = new Node[hybrid_stage_num * hybrid_node_num]; + init_graph(tempNode, hybrid_stage_num, hybrid_node_num); + gen_adst_graph(tempNode, hybrid_stage_num, hybrid_node_num, 0, 0, + hybrid_node_num); + + // tempNode's inverse graph to node[stage_idx][node_idx] + gen_inv_graph(tempNode, hybrid_stage_num, hybrid_node_num, node, stage_num, + node_num, stage_idx, node_idx); + delete[] tempNode; + } +} + +void gen_hybrid_graph_2d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int hybrid_node_num, int type0, + int type1) { + int hybrid_stage_num = get_hybrid_stage_num(type0, hybrid_node_num); + + for (int ni = 0; ni < hybrid_node_num; ni++) { + gen_hybrid_graph_1d(node, stage_num, node_num, stage_idx, + node_idx + ni * hybrid_node_num, hybrid_node_num, + type0); + gen_hybrid_graph_1d(node, stage_num, node_num, stage_idx + hybrid_stage_num, + node_idx + ni * hybrid_node_num, hybrid_node_num, + type1); + } + + // connect first layer and second layer + connect_layer_2d(node, stage_num, node_num, stage_idx + hybrid_stage_num - 1, + node_idx, hybrid_node_num); +} + +void gen_hybrid_graph_2d_new(Node *node, int stage_num, int node_num, + int stage_idx, int node_idx, int hybrid_node_num0, + int hybrid_node_num1, int type0, int type1) { + int hybrid_stage_num0 = get_hybrid_stage_num(type0, hybrid_node_num0); + + for (int ni = 0; ni < hybrid_node_num1; ni++) { + gen_hybrid_graph_1d(node, stage_num, node_num, stage_idx, + node_idx + ni * hybrid_node_num0, hybrid_node_num0, + type0); + } + for (int ni = 0; 
ni < hybrid_node_num0; ni++) { + gen_hybrid_graph_1d( + node, stage_num, node_num, stage_idx + hybrid_stage_num0, + node_idx + ni * hybrid_node_num1, hybrid_node_num1, type1); + } + + // connect first layer and second layer + connect_layer_2d_new(node, stage_num, node_num, + stage_idx + hybrid_stage_num0 - 1, node_idx, + hybrid_node_num0, hybrid_node_num1); +} + +void gen_inv_graph(Node *node, int stage_num, int node_num, Node *invNode, + int inv_stage_num, int inv_node_num, int inv_stage_idx, + int inv_node_idx) { + // clean up inNodeNum in invNode because of add_node + for (int si = 1 + inv_stage_idx; si < inv_stage_idx + stage_num; si++) { + for (int ni = inv_node_idx; ni < inv_node_idx + node_num; ni++) { + int idx = get_idx(si, ni, inv_node_num); + invNode[idx].inNodeNum = 0; + } + } + // generate inverse graph of node on invNode + for (int si = 1; si < stage_num; si++) { + for (int ni = 0; ni < node_num; ni++) { + int invSi = stage_num - si; + int idx = get_idx(si, ni, node_num); + for (int k = 0; k < node[idx].inNodeNum; k++) { + int invNi = node[idx].inNodeIdx[k]; + add_node(invNode, inv_stage_num, inv_node_num, invSi + inv_stage_idx, + invNi + inv_node_idx, ni + inv_node_idx, + node[idx].inWeight[k]); + } + } + } +} diff --git a/libs/libaom/src/tools/txfm_analyzer/txfm_graph.h b/libs/libaom/src/tools/txfm_analyzer/txfm_graph.h new file mode 100644 index 000000000..8dc36146d --- /dev/null +++ b/libs/libaom/src/tools/txfm_analyzer/txfm_graph.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_TOOLS_TXFM_ANALYZER_TXFM_GRAPH_H_ +#define AOM_TOOLS_TXFM_ANALYZER_TXFM_GRAPH_H_ + +struct Node { + Node *inNode[2]; + int inNodeNum; + int inNodeIdx[2]; + double inWeight[2]; + double value; + int nodeIdx; + int stageIdx; + int visited; +}; + +#define STAGENUM (10) +#define NODENUM (32) +#define COS_MOD (128) + +typedef enum { + TYPE_DCT = 0, + TYPE_ADST, + TYPE_IDCT, + TYPE_IADST, + TYPE_LAST +} TYPE_TXFM; + +TYPE_TXFM get_inv_type(TYPE_TXFM type); +void get_fun_name(char *str_fun_name, int str_buf_size, const TYPE_TXFM type, + const int txfm_size); + +void get_txfm_type_name(char *str_fun_name, int str_buf_size, + const TYPE_TXFM type, const int txfm_size); +void get_hybrid_2d_type_name(char *buf, int buf_size, const TYPE_TXFM type0, + const TYPE_TXFM type1, const int txfm_size0, + const int txfm_size1); +unsigned int get_max_bit(unsigned int x); +unsigned int bitwise_reverse(unsigned int x, int max_bit); +int get_idx(int ri, int ci, int cSize); + +int get_dct_stage_num(int size); +void reference_dct_1d(double *in, double *out, int size); +void reference_dct_2d(double *in, double *out, int size); +void connect_node(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int in0, double w0, int in1, double w1); +void propagate(Node *node, int stage_num, int node_num, int stage); +void init_graph(Node *node, int stage_num, int node_num); +void graph_reset_visited(Node *node, int stage_num, int node_num); +void gen_B_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N, int star); +void gen_P_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N); + +void gen_type1_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N); +void gen_type2_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N); +void gen_type3_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int idx, int N); +void gen_type4_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int idx, int N); + +void gen_R_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N); + +void gen_DCT_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N); + +void gen_DCT_graph_1d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int dct_node_num); +void connect_layer_2d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int dct_node_num); + +void gen_DCT_graph_2d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int dct_node_num); + +void gen_adst_B_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_idx); + +void gen_adst_U_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_idx, int adst_node_num); +void gen_adst_T_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, double freq); + +void gen_adst_E_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_idx); + +void gen_adst_V_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_idx, int adst_node_num); + +void gen_adst_VJ_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num); +void gen_adst_Q_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num); +void gen_adst_Ibar_graph(Node *node, int stage_num, int node_num, int 
stage_idx, + int node_idx, int adst_node_num); + +void gen_adst_D_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num); + +int get_hadamard_idx(int x, int adst_node_num); +void gen_adst_Ht_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num); + +int gen_adst_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num); +int gen_iadst_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num); +void reference_adst_1d(double *in, double *out, int size); + +int get_adst_stage_num(int adst_node_num); +int get_hybrid_stage_num(int type, int hybrid_node_num); +int get_hybrid_2d_stage_num(int type0, int type1, int hybrid_node_num); +int get_hybrid_2d_stage_num_new(int type0, int type1, int hybrid_node_num0, + int hybrid_node_num1); +int get_hybrid_amplify_factor(int type, int hybrid_node_num); +void gen_hybrid_graph_1d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int hybrid_node_num, int type); +void gen_hybrid_graph_2d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int hybrid_node_num, int type0, + int type1); +void gen_hybrid_graph_2d_new(Node *node, int stage_num, int node_num, + int stage_idx, int node_idx, int hybrid_node_num0, + int hybrid_node_num1, int type0, int type1); + +void reference_hybrid_2d(double *in, double *out, int size, int type0, + int type1); + +void reference_hybrid_2d_new(double *in, double *out, int size0, int size1, + int type0, int type1); +void reference_adst_dct_2d(double *in, double *out, int size); + +void gen_code(Node *node, int stage_num, int node_num, TYPE_TXFM type); + +void gen_inv_graph(Node *node, int stage_num, int node_num, Node *invNode, + int inv_stage_num, int inv_node_num, int inv_stage_idx, + int inv_node_idx); + +TYPE_TXFM hybrid_char_to_int(char ctype); + +int64_t round_shift(int64_t value, int bit); +void round_shift_array(int32_t *arr, int size, int bit); +void estimate_value(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int estimate_bit); +void amplify_value(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int estimate_bit); +void propagate_estimate_amlify(Node *node, int stage_num, int node_num, + int stage_idx, int amplify_bit, + int estimate_bit); +#endif // AOM_TOOLS_TXFM_ANALYZER_TXFM_GRAPH_H_ diff --git a/libs/libaom/src/tools/wrap-commit-msg.py b/libs/libaom/src/tools/wrap-commit-msg.py new file mode 100644 index 000000000..1c7882443 --- /dev/null +++ b/libs/libaom/src/tools/wrap-commit-msg.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python +## +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +"""Wraps paragraphs of text, preserving manual formatting + +This is like fold(1), but has the special convention of not modifying lines +that start with whitespace. This allows you to intersperse blocks with +special formatting, like code blocks, with written prose. 
The prose will +be wordwrapped, and the manual formatting will be preserved. + + * This won't handle the case of a bulleted (or ordered) list specially, so + manual wrapping must be done. + +Occasionally it's useful to put something with explicit formatting that +doesn't look at all like a block of text inline. + + indicator = has_leading_whitespace(line); + if (indicator) + preserve_formatting(line); + +The intent is that this docstring would make it through the transform +and still be legible and presented as it is in the source. If additional +cases are handled, update this doc to describe the effect. +""" + +__author__ = "jkoleszar@google.com" +import textwrap +import sys + +def wrap(text): + if text: + return textwrap.fill(text, break_long_words=False) + '\n' + return "" + + +def main(fileobj): + text = "" + output = "" + while True: + line = fileobj.readline() + if not line: + break + + if line.lstrip() == line: + text += line + else: + output += wrap(text) + text="" + output += line + output += wrap(text) + + # Replace the file or write to stdout. + if fileobj == sys.stdin: + fileobj = sys.stdout + else: + fileobj.seek(0) + fileobj.truncate(0) + fileobj.write(output) + +if __name__ == "__main__": + if len(sys.argv) > 1: + main(open(sys.argv[1], "r+")) + else: + main(sys.stdin) diff --git a/libs/libaom/src/usage.dox b/libs/libaom/src/usage.dox new file mode 100644 index 000000000..4004f4af5 --- /dev/null +++ b/libs/libaom/src/usage.dox @@ -0,0 +1,109 @@ +/*!\page usage Usage + + The aom multi-format codec SDK provides a unified interface amongst its + supported codecs. This abstraction allows applications using this SDK to + easily support multiple video formats with minimal code duplication or + "special casing." This section describes the interface common to all codecs. + For codec-specific details, see the \ref codecs page. + + The following sections are common to all codecs: + - \ref usage_types + - \ref usage_features + - \ref usage_init + - \ref usage_errors + + For more information on decoder and encoder specific usage, see the + following pages: + \if decoder + \li \subpage usage_decode + \endif + \if encoder + \li \subpage usage_encode + \endif + + \section usage_types Important Data Types + There are two important data structures to consider in this interface. + + \subsection usage_ctxs Contexts + A context is a storage area allocated by the calling application that the + codec may write into to store details about a single instance of that codec. + Most of the context is implementation specific, and thus opaque to the + application. The context structure as seen by the application is of fixed + size, and thus can be allocated with automatic storage or dynamically + on the heap. + + Most operations require an initialized codec context. Codec context + instances are codec specific. That is, the codec to be used for the encoded + video must be known at initialization time. See #aom_codec_ctx_t for further + information. + + \subsection usage_ifaces Interfaces + A codec interface is an opaque structure that controls how function calls + into the generic interface are dispatched to their codec-specific + implementations. Applications \ref MUSTNOT attempt to examine or override + this storage, as it contains internal implementation details likely to + change from release to release. + + Each supported codec will expose an interface structure to the application + as an extern reference to a structure of the incomplete type + #aom_codec_iface_t. 
+ + \section usage_features Features + Several "features" are defined that are optionally implemented by codec + algorithms. Indeed, the same algorithm may support different features on + different platforms. The purpose of defining these features is that when + they are implemented, they conform to a common interface. The features, or + capabilities, of an algorithm can be queried from its interface by using + the aom_codec_get_caps() method. Attempts to invoke features not supported + by an algorithm will generally result in #AOM_CODEC_INCAPABLE. + + \if decoder + Currently defined decoder features include: + \endif + + \section usage_init Initialization + To initialize a codec instance, the address of the codec context + and interface structures are passed to an initialization function. Depending + on the \ref usage_features that the codec supports, the codec can be + initialized in different modes. + + Because the ABI of the library may change between releases, + the ABI is versioned. The ABI version number must be passed at + initialization time to ensure the application is using a header file that + matches the library. The current ABI version number is stored in the + preprocessor macros #AOM_CODEC_ABI_VERSION, #AOM_ENCODER_ABI_VERSION, and + #AOM_DECODER_ABI_VERSION. For convenience, each initialization function has + a wrapper macro that inserts the correct version number. These macros are + named like the initialization methods, but without the _ver suffix. + + + The available initialization methods are: + \if encoder + \li #aom_codec_enc_init (calls aom_codec_enc_init_ver()) + \endif + \if decoder + \li #aom_codec_dec_init (calls aom_codec_dec_init_ver()) + \endif + + + \section usage_errors Error Handling + Almost all codec functions return an error status of type #aom_codec_err_t. + The semantics of how each error condition should be processed are clearly + defined in the definitions of each enumerated value. Error values can be + converted into ASCII strings with the aom_codec_error() and + aom_codec_err_to_string() methods. The difference between these two methods is + that aom_codec_error() returns the error state from an initialized context, + whereas aom_codec_err_to_string() can be used in cases where an error occurs + outside any context. The enumerated value returned from the last call can also + be retrieved from the err member of the decoder context. + Finally, more detailed error information may be obtained by using + the aom_codec_error_detail() method. Not all errors produce detailed error + information. + + In addition to error information, the codec library's build configuration + is available at runtime on some platforms. This information can be returned + by calling aom_codec_build_config(), and is formatted as a base64-coded string + (comprised of characters in the set [a-zA-Z0-9+/]). This information is not + useful to an application at runtime, but may be of use to aom for support. + +*/ diff --git a/libs/libaom/src/usage_cx.dox b/libs/libaom/src/usage_cx.dox new file mode 100644 index 000000000..51b4e8e3e --- /dev/null +++ b/libs/libaom/src/usage_cx.dox @@ -0,0 +1,9 @@ +/*! \page usage_encode Encoding + + The aom_codec_encode() function is at the core of the encode loop. It + processes raw images passed by the application, producing packets of + compressed data.
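  A hedged sketch of one pass through that loop follows; the helper name
  encode_frame and the file-writing policy are assumptions, not library API.

    #include <stdio.h>
    #include "aom/aom_encoder.h"

    /* Encode one raw frame (or pass raw == NULL at end of stream to flush)
     * and write out every compressed packet the encoder has finished. */
    static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *raw,
                            aom_codec_pts_t pts, FILE *out) {
      if (aom_codec_encode(ctx, raw, pts, /*duration=*/1, /*flags=*/0) !=
          AOM_CODEC_OK) {
        fprintf(stderr, "encode failed: %s\n", aom_codec_error(ctx));
        return -1;
      }
      aom_codec_iter_t iter = NULL;
      const aom_codec_cx_pkt_t *pkt;
      while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
        if (pkt->kind == AOM_CODEC_CX_FRAME_PKT)
          fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, out);
      }
      return 0;
    }

  Calling it once more with raw == NULL at end of stream drains any frames
  the encoder is still holding.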
+ + \ref samples + +*/ diff --git a/libs/libaom/src/usage_dx.dox b/libs/libaom/src/usage_dx.dox new file mode 100644 index 000000000..76dc213bf --- /dev/null +++ b/libs/libaom/src/usage_dx.dox @@ -0,0 +1,22 @@ +/*! \page usage_decode Decoding + + The aom_codec_decode() function is at the core of the decode loop. It + processes packets of compressed data passed by the application, producing + decoded images. The decoder expects packets to comprise exactly one image + frame of data. Packets \ref MUST be passed in decode order. If the + application wishes to associate some data with the frame, the + user_priv member may be set. + + \ref samples + + + \section usage_frame_iter Frame Iterator Based Decoding + Decoded frames are made available to the application + through the aom_codec_get_frame() iterator. The application initializes the + iterator storage (of type #aom_codec_iter_t) to NULL, then calls + aom_codec_get_frame repeatedly until it returns NULL, indicating that all + images have been returned. This process may result in zero, one, or many + frames that are ready for display, depending on the codec. + + +*/ -- cgit v1.2.3
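To make the frame-iterator loop described in usage_dx.dox above concrete, a minimal sketch; the helper name decode_one and the empty display hook are illustrative, not part of the library.

    #include <stdio.h>
    #include "aom/aom_decoder.h"

    /* Feed one unit of compressed data to the decoder, then drain all
     * frames that are ready for display via the frame iterator. */
    static int decode_one(aom_codec_ctx_t *ctx, const uint8_t *data,
                          size_t size) {
      if (aom_codec_decode(ctx, data, size, /*user_priv=*/NULL) !=
          AOM_CODEC_OK) {
        fprintf(stderr, "decode failed: %s\n", aom_codec_error(ctx));
        return -1;
      }
      aom_codec_iter_t iter = NULL; /* must start out NULL */
      aom_image_t *img;
      while ((img = aom_codec_get_frame(ctx, &iter)) != NULL) {
        /* img->planes[] and img->stride[] describe one displayable frame;
         * zero, one, or many frames may come back, depending on the codec. */
      }
      return 0;
    }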